summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-06 01:02:30 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-06 01:02:30 +0000
commit76cb841cb886eef6b3bee341a2266c76578724ad (patch)
treef5892e5ba6cc11949952a6ce4ecbe6d516d6ce58 /arch/x86
parentInitial commit. (diff)
downloadlinux-upstream/4.19.249.tar.xz
linux-upstream/4.19.249.zip
Adding upstream version 4.19.249.upstream/4.19.249upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r--arch/x86/.gitignore7
-rw-r--r--arch/x86/Kbuild24
-rw-r--r--arch/x86/Kconfig2962
-rw-r--r--arch/x86/Kconfig.cpu468
-rw-r--r--arch/x86/Kconfig.debug409
-rw-r--r--arch/x86/Makefile344
-rw-r--r--arch/x86/Makefile.um59
-rw-r--r--arch/x86/Makefile_32.cpu52
-rw-r--r--arch/x86/boot/.gitignore12
-rw-r--r--arch/x86/boot/Makefile156
-rw-r--r--arch/x86/boot/a20.c165
-rw-r--r--arch/x86/boot/apm.c75
-rw-r--r--arch/x86/boot/bioscall.S82
-rw-r--r--arch/x86/boot/bitops.h46
-rw-r--r--arch/x86/boot/boot.h358
-rw-r--r--arch/x86/boot/cmdline.c158
-rw-r--r--arch/x86/boot/code16gcc.h12
-rw-r--r--arch/x86/boot/compressed/.gitignore6
-rw-r--r--arch/x86/boot/compressed/Makefile160
-rw-r--r--arch/x86/boot/compressed/cmdline.c34
-rw-r--r--arch/x86/boot/compressed/cpuflags.c13
-rw-r--r--arch/x86/boot/compressed/early_serial_console.c5
-rw-r--r--arch/x86/boot/compressed/eboot.c934
-rw-r--r--arch/x86/boot/compressed/eboot.h33
-rw-r--r--arch/x86/boot/compressed/efi_stub_32.S87
-rw-r--r--arch/x86/boot/compressed/efi_stub_64.S5
-rw-r--r--arch/x86/boot/compressed/efi_thunk_64.S197
-rw-r--r--arch/x86/boot/compressed/error.c24
-rw-r--r--arch/x86/boot/compressed/error.h10
-rw-r--r--arch/x86/boot/compressed/head_32.S283
-rw-r--r--arch/x86/boot/compressed/head_64.S721
-rw-r--r--arch/x86/boot/compressed/kaslr.c867
-rw-r--r--arch/x86/boot/compressed/kaslr_64.c153
-rw-r--r--arch/x86/boot/compressed/mem_encrypt.S104
-rw-r--r--arch/x86/boot/compressed/misc.c429
-rw-r--r--arch/x86/boot/compressed/misc.h116
-rw-r--r--arch/x86/boot/compressed/mkpiggy.c82
-rw-r--r--arch/x86/boot/compressed/pgtable.h20
-rw-r--r--arch/x86/boot/compressed/pgtable_64.c211
-rw-r--r--arch/x86/boot/compressed/string.c75
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S76
-rw-r--r--arch/x86/boot/copy.S67
-rw-r--r--arch/x86/boot/cpu.c101
-rw-r--r--arch/x86/boot/cpucheck.c229
-rw-r--r--arch/x86/boot/cpuflags.c129
-rw-r--r--arch/x86/boot/cpuflags.h21
-rw-r--r--arch/x86/boot/ctype.h21
-rw-r--r--arch/x86/boot/early_serial_console.c154
-rw-r--r--arch/x86/boot/edd.c182
-rw-r--r--arch/x86/boot/genimage.sh130
-rw-r--r--arch/x86/boot/header.S636
-rw-r--r--arch/x86/boot/install.sh59
-rw-r--r--arch/x86/boot/main.c182
-rw-r--r--arch/x86/boot/memory.c136
-rw-r--r--arch/x86/boot/mkcpustr.c52
-rw-r--r--arch/x86/boot/mtools.conf.in17
-rw-r--r--arch/x86/boot/pm.c126
-rw-r--r--arch/x86/boot/pmjump.S77
-rw-r--r--arch/x86/boot/printf.c309
-rw-r--r--arch/x86/boot/regs.c30
-rw-r--r--arch/x86/boot/setup.ld64
-rw-r--r--arch/x86/boot/string.c197
-rw-r--r--arch/x86/boot/string.h32
-rw-r--r--arch/x86/boot/tools/.gitignore1
-rw-r--r--arch/x86/boot/tools/build.c442
-rw-r--r--arch/x86/boot/tty.c139
-rw-r--r--arch/x86/boot/version.c21
-rw-r--r--arch/x86/boot/vesa.h72
-rw-r--r--arch/x86/boot/video-bios.c128
-rw-r--r--arch/x86/boot/video-mode.c173
-rw-r--r--arch/x86/boot/video-vesa.c281
-rw-r--r--arch/x86/boot/video-vga.c288
-rw-r--r--arch/x86/boot/video.c345
-rw-r--r--arch/x86/boot/video.h120
-rw-r--r--arch/x86/configs/i386_defconfig309
-rw-r--r--arch/x86/configs/tiny.config5
-rw-r--r--arch/x86/configs/x86_64_defconfig307
-rw-r--r--arch/x86/configs/xen.config28
-rw-r--r--arch/x86/crypto/Makefile129
-rw-r--r--arch/x86/crypto/aegis128-aesni-asm.S750
-rw-r--r--arch/x86/crypto/aegis128-aesni-glue.c394
-rw-r--r--arch/x86/crypto/aegis128l-aesni-asm.S826
-rw-r--r--arch/x86/crypto/aegis128l-aesni-glue.c394
-rw-r--r--arch/x86/crypto/aegis256-aesni-asm.S703
-rw-r--r--arch/x86/crypto/aegis256-aesni-glue.c394
-rw-r--r--arch/x86/crypto/aes-i586-asm_32.S362
-rw-r--r--arch/x86/crypto/aes-x86_64-asm_64.S185
-rw-r--r--arch/x86/crypto/aes_ctrby8_avx-x86_64.S569
-rw-r--r--arch/x86/crypto/aes_glue.c70
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S2842
-rw-r--r--arch/x86/crypto/aesni-intel_avx-x86_64.S2946
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c1548
-rw-r--r--arch/x86/crypto/blowfish-x86_64-asm_64.S383
-rw-r--r--arch/x86/crypto/blowfish_glue.c477
-rw-r--r--arch/x86/crypto/camellia-aesni-avx-asm_64.S1289
-rw-r--r--arch/x86/crypto/camellia-aesni-avx2-asm_64.S1408
-rw-r--r--arch/x86/crypto/camellia-x86_64-asm_64.S514
-rw-r--r--arch/x86/crypto/camellia_aesni_avx2_glue.c300
-rw-r--r--arch/x86/crypto/camellia_aesni_avx_glue.c325
-rw-r--r--arch/x86/crypto/camellia_glue.c1527
-rw-r--r--arch/x86/crypto/cast5-avx-x86_64-asm_64.S578
-rw-r--r--arch/x86/crypto/cast5_avx_glue.c392
-rw-r--r--arch/x86/crypto/cast6-avx-x86_64-asm_64.S511
-rw-r--r--arch/x86/crypto/cast6_avx_glue.c320
-rw-r--r--arch/x86/crypto/chacha20-avx2-x86_64.S448
-rw-r--r--arch/x86/crypto/chacha20-ssse3-x86_64.S630
-rw-r--r--arch/x86/crypto/chacha20_glue.c146
-rw-r--r--arch/x86/crypto/crc32-pclmul_asm.S241
-rw-r--r--arch/x86/crypto/crc32-pclmul_glue.c202
-rw-r--r--arch/x86/crypto/crc32c-intel_glue.c270
-rw-r--r--arch/x86/crypto/crc32c-pcl-intel-asm_64.S464
-rw-r--r--arch/x86/crypto/crct10dif-pcl-asm_64.S651
-rw-r--r--arch/x86/crypto/crct10dif-pclmul_glue.c148
-rw-r--r--arch/x86/crypto/des3_ede-asm_64.S808
-rw-r--r--arch/x86/crypto/des3_ede_glue.c506
-rw-r--r--arch/x86/crypto/fpu.c207
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_asm.S136
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c366
-rw-r--r--arch/x86/crypto/glue_helper-asm-avx.S150
-rw-r--r--arch/x86/crypto/glue_helper-asm-avx2.S180
-rw-r--r--arch/x86/crypto/glue_helper.c330
-rw-r--r--arch/x86/crypto/morus1280-avx2-asm.S622
-rw-r--r--arch/x86/crypto/morus1280-avx2-glue.c64
-rw-r--r--arch/x86/crypto/morus1280-sse2-asm.S896
-rw-r--r--arch/x86/crypto/morus1280-sse2-glue.c63
-rw-r--r--arch/x86/crypto/morus1280_glue.c294
-rw-r--r--arch/x86/crypto/morus640-sse2-asm.S615
-rw-r--r--arch/x86/crypto/morus640-sse2-glue.c63
-rw-r--r--arch/x86/crypto/morus640_glue.c289
-rw-r--r--arch/x86/crypto/poly1305-avx2-x86_64.S394
-rw-r--r--arch/x86/crypto/poly1305-sse2-x86_64.S590
-rw-r--r--arch/x86/crypto/poly1305_glue.c205
-rw-r--r--arch/x86/crypto/serpent-avx-x86_64-asm_64.S796
-rw-r--r--arch/x86/crypto/serpent-avx2-asm_64.S818
-rw-r--r--arch/x86/crypto/serpent-sse2-i586-asm_32.S631
-rw-r--r--arch/x86/crypto/serpent-sse2-x86_64-asm_64.S754
-rw-r--r--arch/x86/crypto/serpent_avx2_glue.c281
-rw-r--r--arch/x86/crypto/serpent_avx_glue.c326
-rw-r--r--arch/x86/crypto/serpent_sse2_glue.c240
-rw-r--r--arch/x86/crypto/sha1-mb/Makefile14
-rw-r--r--arch/x86/crypto/sha1-mb/sha1_mb.c1011
-rw-r--r--arch/x86/crypto/sha1-mb/sha1_mb_ctx.h134
-rw-r--r--arch/x86/crypto/sha1-mb/sha1_mb_mgr.h110
-rw-r--r--arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S287
-rw-r--r--arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S304
-rw-r--r--arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c64
-rw-r--r--arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S209
-rw-r--r--arch/x86/crypto/sha1-mb/sha1_x8_avx2.S492
-rw-r--r--arch/x86/crypto/sha1_avx2_x86_64_asm.S711
-rw-r--r--arch/x86/crypto/sha1_ni_asm.S304
-rw-r--r--arch/x86/crypto/sha1_ssse3_asm.S557
-rw-r--r--arch/x86/crypto/sha1_ssse3_glue.c378
-rw-r--r--arch/x86/crypto/sha256-avx-asm.S502
-rw-r--r--arch/x86/crypto/sha256-avx2-asm.S771
-rw-r--r--arch/x86/crypto/sha256-mb/Makefile14
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_mb.c1013
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_mb_ctx.h134
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_mb_mgr.h108
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S304
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S307
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c65
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S214
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_x8_avx2.S598
-rw-r--r--arch/x86/crypto/sha256-ssse3-asm.S511
-rw-r--r--arch/x86/crypto/sha256_ni_asm.S355
-rw-r--r--arch/x86/crypto/sha256_ssse3_glue.c432
-rw-r--r--arch/x86/crypto/sha512-avx-asm.S426
-rw-r--r--arch/x86/crypto/sha512-avx2-asm.S752
-rw-r--r--arch/x86/crypto/sha512-mb/Makefile12
-rw-r--r--arch/x86/crypto/sha512-mb/sha512_mb.c1047
-rw-r--r--arch/x86/crypto/sha512-mb/sha512_mb_ctx.h128
-rw-r--r--arch/x86/crypto/sha512-mb/sha512_mb_mgr.h104
-rw-r--r--arch/x86/crypto/sha512-mb/sha512_mb_mgr_datastruct.S281
-rw-r--r--arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S297
-rw-r--r--arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c69
-rw-r--r--arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S224
-rw-r--r--arch/x86/crypto/sha512-mb/sha512_x4_avx2.S531
-rw-r--r--arch/x86/crypto/sha512-ssse3-asm.S424
-rw-r--r--arch/x86/crypto/sha512_ssse3_glue.c349
-rw-r--r--arch/x86/crypto/twofish-avx-x86_64-asm_64.S471
-rw-r--r--arch/x86/crypto/twofish-i586-asm_32.S334
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64-3way.S320
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64.S321
-rw-r--r--arch/x86/crypto/twofish_avx_glue.c328
-rw-r--r--arch/x86/crypto/twofish_glue.c100
-rw-r--r--arch/x86/crypto/twofish_glue_3way.c290
-rw-r--r--arch/x86/entry/Makefile17
-rw-r--r--arch/x86/entry/calling.h350
-rw-r--r--arch/x86/entry/common.c432
-rw-r--r--arch/x86/entry/entry_32.S1514
-rw-r--r--arch/x86/entry/entry_64.S1751
-rw-r--r--arch/x86/entry/entry_64_compat.S414
-rw-r--r--arch/x86/entry/syscall_32.c34
-rw-r--r--arch/x86/entry/syscall_64.c25
-rw-r--r--arch/x86/entry/syscalls/Makefile70
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl400
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl388
-rw-r--r--arch/x86/entry/syscalls/syscallhdr.sh28
-rw-r--r--arch/x86/entry/syscalls/syscalltbl.sh81
-rw-r--r--arch/x86/entry/thunk_32.S43
-rw-r--r--arch/x86/entry/thunk_64.S73
-rw-r--r--arch/x86/entry/vdso/.gitignore7
-rw-r--r--arch/x86/entry/vdso/Makefile205
-rwxr-xr-xarch/x86/entry/vdso/checkundef.sh10
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c341
-rw-r--r--arch/x86/entry/vdso/vdso-layout.lds.S120
-rw-r--r--arch/x86/entry/vdso/vdso-note.S15
-rw-r--r--arch/x86/entry/vdso/vdso.lds.S30
-rw-r--r--arch/x86/entry/vdso/vdso2c.c260
-rw-r--r--arch/x86/entry/vdso/vdso2c.h175
-rw-r--r--arch/x86/entry/vdso/vdso32-setup.c102
-rw-r--r--arch/x86/entry/vdso/vdso32/.gitignore1
-rw-r--r--arch/x86/entry/vdso/vdso32/note.S48
-rw-r--r--arch/x86/entry/vdso/vdso32/sigreturn.S138
-rw-r--r--arch/x86/entry/vdso/vdso32/system_call.S85
-rw-r--r--arch/x86/entry/vdso/vdso32/vclock_gettime.c31
-rw-r--r--arch/x86/entry/vdso/vdso32/vdso32.lds.S38
-rw-r--r--arch/x86/entry/vdso/vdsox32.lds.S26
-rw-r--r--arch/x86/entry/vdso/vgetcpu.c28
-rw-r--r--arch/x86/entry/vdso/vma.c383
-rw-r--r--arch/x86/entry/vsyscall/Makefile7
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c375
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_emu_64.S37
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_gtod.c78
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_trace.h30
-rw-r--r--arch/x86/events/Kconfig37
-rw-r--r--arch/x86/events/Makefile4
-rw-r--r--arch/x86/events/amd/Makefile8
-rw-r--r--arch/x86/events/amd/core.c1009
-rw-r--r--arch/x86/events/amd/ibs.c1086
-rw-r--r--arch/x86/events/amd/iommu.c490
-rw-r--r--arch/x86/events/amd/iommu.h46
-rw-r--r--arch/x86/events/amd/power.c317
-rw-r--r--arch/x86/events/amd/uncore.c619
-rw-r--r--arch/x86/events/core.c2565
-rw-r--r--arch/x86/events/intel/Makefile10
-rw-r--r--arch/x86/events/intel/bts.c625
-rw-r--r--arch/x86/events/intel/core.c4624
-rw-r--r--arch/x86/events/intel/cstate.c707
-rw-r--r--arch/x86/events/intel/ds.c1687
-rw-r--r--arch/x86/events/intel/knc.c322
-rw-r--r--arch/x86/events/intel/lbr.c1279
-rw-r--r--arch/x86/events/intel/p4.c1376
-rw-r--r--arch/x86/events/intel/p6.c280
-rw-r--r--arch/x86/events/intel/pt.c1541
-rw-r--r--arch/x86/events/intel/pt.h192
-rw-r--r--arch/x86/events/intel/rapl.c833
-rw-r--r--arch/x86/events/intel/uncore.c1492
-rw-r--r--arch/x86/events/intel/uncore.h513
-rw-r--r--arch/x86/events/intel/uncore_nhmex.c1228
-rw-r--r--arch/x86/events/intel/uncore_snb.c877
-rw-r--r--arch/x86/events/intel/uncore_snbep.c3965
-rw-r--r--arch/x86/events/msr.c292
-rw-r--r--arch/x86/events/perf_event.h1047
-rw-r--r--arch/x86/hyperv/Makefile2
-rw-r--r--arch/x86/hyperv/hv_apic.c266
-rw-r--r--arch/x86/hyperv/hv_init.c516
-rw-r--r--arch/x86/hyperv/mmu.c244
-rw-r--r--arch/x86/hyperv/nested.c56
-rw-r--r--arch/x86/ia32/Makefile10
-rw-r--r--arch/x86/ia32/audit.c44
-rw-r--r--arch/x86/ia32/ia32_aout.c489
-rw-r--r--arch/x86/ia32/ia32_signal.c411
-rw-r--r--arch/x86/ia32/sys_ia32.c242
-rw-r--r--arch/x86/include/asm/Kbuild13
-rw-r--r--arch/x86/include/asm/a.out-core.h67
-rw-r--r--arch/x86/include/asm/acenv.h57
-rw-r--r--arch/x86/include/asm/acpi.h188
-rw-r--r--arch/x86/include/asm/agp.h32
-rw-r--r--arch/x86/include/asm/alternative-asm.h103
-rw-r--r--arch/x86/include/asm/alternative.h241
-rw-r--r--arch/x86/include/asm/amd_nb.h124
-rw-r--r--arch/x86/include/asm/apb_timer.h49
-rw-r--r--arch/x86/include/asm/apic.h547
-rw-r--r--arch/x86/include/asm/apic_flat_64.h8
-rw-r--r--arch/x86/include/asm/apicdef.h446
-rw-r--r--arch/x86/include/asm/apm.h74
-rw-r--r--arch/x86/include/asm/arch_hweight.h63
-rw-r--r--arch/x86/include/asm/archrandom.h124
-rw-r--r--arch/x86/include/asm/asm-offsets.h1
-rw-r--r--arch/x86/include/asm/asm-prototypes.h41
-rw-r--r--arch/x86/include/asm/asm.h208
-rw-r--r--arch/x86/include/asm/atomic.h267
-rw-r--r--arch/x86/include/asm/atomic64_32.h335
-rw-r--r--arch/x86/include/asm/atomic64_64.h245
-rw-r--r--arch/x86/include/asm/barrier.h106
-rw-r--r--arch/x86/include/asm/bios_ebda.h40
-rw-r--r--arch/x86/include/asm/bitops.h515
-rw-r--r--arch/x86/include/asm/boot.h58
-rw-r--r--arch/x86/include/asm/bootparam_utils.h91
-rw-r--r--arch/x86/include/asm/bug.h85
-rw-r--r--arch/x86/include/asm/bugs.h21
-rw-r--r--arch/x86/include/asm/cache.h24
-rw-r--r--arch/x86/include/asm/cacheflush.h11
-rw-r--r--arch/x86/include/asm/cacheinfo.h7
-rw-r--r--arch/x86/include/asm/calgary.h70
-rw-r--r--arch/x86/include/asm/ce4100.h7
-rw-r--r--arch/x86/include/asm/checksum.h6
-rw-r--r--arch/x86/include/asm/checksum_32.h199
-rw-r--r--arch/x86/include/asm/checksum_64.h199
-rw-r--r--arch/x86/include/asm/clocksource.h17
-rw-r--r--arch/x86/include/asm/cmdline.h9
-rw-r--r--arch/x86/include/asm/cmpxchg.h259
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h115
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h24
-rw-r--r--arch/x86/include/asm/compat.h243
-rw-r--r--arch/x86/include/asm/cpu.h43
-rw-r--r--arch/x86/include/asm/cpu_device_id.h41
-rw-r--r--arch/x86/include/asm/cpu_entry_area.h81
-rw-r--r--arch/x86/include/asm/cpufeature.h230
-rw-r--r--arch/x86/include/asm/cpufeatures.h399
-rw-r--r--arch/x86/include/asm/cpumask.h15
-rw-r--r--arch/x86/include/asm/crash.h12
-rw-r--r--arch/x86/include/asm/crypto/aes.h12
-rw-r--r--arch/x86/include/asm/crypto/camellia.h96
-rw-r--r--arch/x86/include/asm/crypto/glue_helper.h122
-rw-r--r--arch/x86/include/asm/crypto/serpent-avx.h42
-rw-r--r--arch/x86/include/asm/crypto/serpent-sse2.h64
-rw-r--r--arch/x86/include/asm/crypto/twofish.h28
-rw-r--r--arch/x86/include/asm/current.h22
-rw-r--r--arch/x86/include/asm/debugreg.h124
-rw-r--r--arch/x86/include/asm/delay.h10
-rw-r--r--arch/x86/include/asm/desc.h451
-rw-r--r--arch/x86/include/asm/desc_defs.h135
-rw-r--r--arch/x86/include/asm/device.h27
-rw-r--r--arch/x86/include/asm/disabled-features.h83
-rw-r--r--arch/x86/include/asm/div64.h78
-rw-r--r--arch/x86/include/asm/dma-direct.h9
-rw-r--r--arch/x86/include/asm/dma-mapping.h36
-rw-r--r--arch/x86/include/asm/dma.h318
-rw-r--r--arch/x86/include/asm/dmi.h22
-rw-r--r--arch/x86/include/asm/dwarf2.h85
-rw-r--r--arch/x86/include/asm/e820/api.h54
-rw-r--r--arch/x86/include/asm/e820/types.h105
-rw-r--r--arch/x86/include/asm/edac.h19
-rw-r--r--arch/x86/include/asm/efi.h252
-rw-r--r--arch/x86/include/asm/elf.h385
-rw-r--r--arch/x86/include/asm/emergency-restart.h7
-rw-r--r--arch/x86/include/asm/entry_arch.h56
-rw-r--r--arch/x86/include/asm/error-injection.h13
-rw-r--r--arch/x86/include/asm/espfix.h18
-rw-r--r--arch/x86/include/asm/exec.h1
-rw-r--r--arch/x86/include/asm/extable.h37
-rw-r--r--arch/x86/include/asm/fb.h22
-rw-r--r--arch/x86/include/asm/fixmap.h202
-rw-r--r--arch/x86/include/asm/floppy.h281
-rw-r--r--arch/x86/include/asm/fpu/api.h34
-rw-r--r--arch/x86/include/asm/fpu/internal.h601
-rw-r--r--arch/x86/include/asm/fpu/regset.h22
-rw-r--r--arch/x86/include/asm/fpu/signal.h34
-rw-r--r--arch/x86/include/asm/fpu/types.h321
-rw-r--r--arch/x86/include/asm/fpu/xstate.h60
-rw-r--r--arch/x86/include/asm/frame.h94
-rw-r--r--arch/x86/include/asm/ftrace.h87
-rw-r--r--arch/x86/include/asm/futex.h88
-rw-r--r--arch/x86/include/asm/gart.h114
-rw-r--r--arch/x86/include/asm/genapic.h1
-rw-r--r--arch/x86/include/asm/geode.h36
-rw-r--r--arch/x86/include/asm/hardirq.h83
-rw-r--r--arch/x86/include/asm/highmem.h80
-rw-r--r--arch/x86/include/asm/hpet.h115
-rw-r--r--arch/x86/include/asm/hugetlb.h93
-rw-r--r--arch/x86/include/asm/hw_breakpoint.h80
-rw-r--r--arch/x86/include/asm/hw_irq.h164
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h771
-rw-r--r--arch/x86/include/asm/hypervisor.h70
-rw-r--r--arch/x86/include/asm/i8259.h83
-rw-r--r--arch/x86/include/asm/ia32.h73
-rw-r--r--arch/x86/include/asm/ia32_unistd.h12
-rw-r--r--arch/x86/include/asm/imr.h60
-rw-r--r--arch/x86/include/asm/inat.h244
-rw-r--r--arch/x86/include/asm/inat_types.h29
-rw-r--r--arch/x86/include/asm/init.h17
-rw-r--r--arch/x86/include/asm/insn-eval.h23
-rw-r--r--arch/x86/include/asm/insn.h244
-rw-r--r--arch/x86/include/asm/inst.h311
-rw-r--r--arch/x86/include/asm/intel-family.h117
-rw-r--r--arch/x86/include/asm/intel-mid.h134
-rw-r--r--arch/x86/include/asm/intel_ds.h37
-rw-r--r--arch/x86/include/asm/intel_mid_vrtc.h10
-rw-r--r--arch/x86/include/asm/intel_pconfig.h65
-rw-r--r--arch/x86/include/asm/intel_pmc_ipc.h91
-rw-r--r--arch/x86/include/asm/intel_pt.h11
-rw-r--r--arch/x86/include/asm/intel_punit_ipc.h102
-rw-r--r--arch/x86/include/asm/intel_rdt_sched.h93
-rw-r--r--arch/x86/include/asm/intel_scu_ipc.h82
-rw-r--r--arch/x86/include/asm/intel_telemetry.h147
-rw-r--r--arch/x86/include/asm/invpcid.h53
-rw-r--r--arch/x86/include/asm/io.h413
-rw-r--r--arch/x86/include/asm/io_apic.h240
-rw-r--r--arch/x86/include/asm/iomap.h41
-rw-r--r--arch/x86/include/asm/iommu.h12
-rw-r--r--arch/x86/include/asm/iommu_table.h102
-rw-r--r--arch/x86/include/asm/iosf_mbi.h233
-rw-r--r--arch/x86/include/asm/ipi.h110
-rw-r--r--arch/x86/include/asm/irq.h50
-rw-r--r--arch/x86/include/asm/irq_regs.h32
-rw-r--r--arch/x86/include/asm/irq_remapping.h99
-rw-r--r--arch/x86/include/asm/irq_vectors.h146
-rw-r--r--arch/x86/include/asm/irq_work.h20
-rw-r--r--arch/x86/include/asm/irqdomain.h59
-rw-r--r--arch/x86/include/asm/irqflags.h213
-rw-r--r--arch/x86/include/asm/ist.h23
-rw-r--r--arch/x86/include/asm/jailhouse_para.h26
-rw-r--r--arch/x86/include/asm/jump_label.h100
-rw-r--r--arch/x86/include/asm/kasan.h38
-rw-r--r--arch/x86/include/asm/kaslr.h13
-rw-r--r--arch/x86/include/asm/kbdleds.h18
-rw-r--r--arch/x86/include/asm/kdebug.h43
-rw-r--r--arch/x86/include/asm/kexec-bzimage64.h7
-rw-r--r--arch/x86/include/asm/kexec.h228
-rw-r--r--arch/x86/include/asm/kgdb.h92
-rw-r--r--arch/x86/include/asm/kmap_types.h13
-rw-r--r--arch/x86/include/asm/kprobes.h124
-rw-r--r--arch/x86/include/asm/kvm_emulate.h453
-rw-r--r--arch/x86/include/asm/kvm_host.h1551
-rw-r--r--arch/x86/include/asm/kvm_page_track.h75
-rw-r--r--arch/x86/include/asm/kvm_para.h134
-rw-r--r--arch/x86/include/asm/kvmclock.h21
-rw-r--r--arch/x86/include/asm/linkage.h28
-rw-r--r--arch/x86/include/asm/livepatch.h40
-rw-r--r--arch/x86/include/asm/local.h162
-rw-r--r--arch/x86/include/asm/local64.h1
-rw-r--r--arch/x86/include/asm/mach_timer.h49
-rw-r--r--arch/x86/include/asm/mach_traps.h44
-rw-r--r--arch/x86/include/asm/math_emu.h15
-rw-r--r--arch/x86/include/asm/mc146818rtc.h103
-rw-r--r--arch/x86/include/asm/mce.h344
-rw-r--r--arch/x86/include/asm/mcsafe_test.h75
-rw-r--r--arch/x86/include/asm/mem_encrypt.h100
-rw-r--r--arch/x86/include/asm/microcode.h147
-rw-r--r--arch/x86/include/asm/microcode_amd.h58
-rw-r--r--arch/x86/include/asm/microcode_intel.h85
-rw-r--r--arch/x86/include/asm/misc.h7
-rw-r--r--arch/x86/include/asm/mmconfig.h13
-rw-r--r--arch/x86/include/asm/mmu.h66
-rw-r--r--arch/x86/include/asm/mmu_context.h359
-rw-r--r--arch/x86/include/asm/mmx.h15
-rw-r--r--arch/x86/include/asm/mmzone.h6
-rw-r--r--arch/x86/include/asm/mmzone_32.h56
-rw-r--r--arch/x86/include/asm/mmzone_64.h18
-rw-r--r--arch/x86/include/asm/module.h74
-rw-r--r--arch/x86/include/asm/mpspec.h154
-rw-r--r--arch/x86/include/asm/mpspec_def.h182
-rw-r--r--arch/x86/include/asm/mpx.h109
-rw-r--r--arch/x86/include/asm/mshyperv.h442
-rw-r--r--arch/x86/include/asm/msi.h14
-rw-r--r--arch/x86/include/asm/msidef.h57
-rw-r--r--arch/x86/include/asm/msr-index.h844
-rw-r--r--arch/x86/include/asm/msr-trace.h58
-rw-r--r--arch/x86/include/asm/msr.h398
-rw-r--r--arch/x86/include/asm/mtrr.h133
-rw-r--r--arch/x86/include/asm/mwait.h123
-rw-r--r--arch/x86/include/asm/nmi.h66
-rw-r--r--arch/x86/include/asm/nops.h147
-rw-r--r--arch/x86/include/asm/nospec-branch.h436
-rw-r--r--arch/x86/include/asm/numa.h83
-rw-r--r--arch/x86/include/asm/numa_32.h13
-rw-r--r--arch/x86/include/asm/numachip/numachip.h20
-rw-r--r--arch/x86/include/asm/numachip/numachip_csr.h98
-rw-r--r--arch/x86/include/asm/olpc.h133
-rw-r--r--arch/x86/include/asm/olpc_ofw.h38
-rw-r--r--arch/x86/include/asm/orc_lookup.h46
-rw-r--r--arch/x86/include/asm/orc_types.h109
-rw-r--r--arch/x86/include/asm/page.h82
-rw-r--r--arch/x86/include/asm/page_32.h49
-rw-r--r--arch/x86/include/asm/page_32_types.h65
-rw-r--r--arch/x86/include/asm/page_64.h65
-rw-r--r--arch/x86/include/asm/page_64_types.h80
-rw-r--r--arch/x86/include/asm/page_types.h83
-rw-r--r--arch/x86/include/asm/paravirt.h975
-rw-r--r--arch/x86/include/asm/paravirt_types.h699
-rw-r--r--arch/x86/include/asm/parport.h11
-rw-r--r--arch/x86/include/asm/pat.h27
-rw-r--r--arch/x86/include/asm/pci-direct.h18
-rw-r--r--arch/x86/include/asm/pci-functions.h20
-rw-r--r--arch/x86/include/asm/pci.h161
-rw-r--r--arch/x86/include/asm/pci_64.h28
-rw-r--r--arch/x86/include/asm/pci_x86.h216
-rw-r--r--arch/x86/include/asm/percpu.h613
-rw-r--r--arch/x86/include/asm/perf_event.h315
-rw-r--r--arch/x86/include/asm/perf_event_p4.h877
-rw-r--r--arch/x86/include/asm/pgalloc.h207
-rw-r--r--arch/x86/include/asm/pgtable-2level.h115
-rw-r--r--arch/x86/include/asm/pgtable-2level_types.h41
-rw-r--r--arch/x86/include/asm/pgtable-3level.h337
-rw-r--r--arch/x86/include/asm/pgtable-3level_types.h51
-rw-r--r--arch/x86/include/asm/pgtable-invert.h41
-rw-r--r--arch/x86/include/asm/pgtable.h1448
-rw-r--r--arch/x86/include/asm/pgtable_32.h111
-rw-r--r--arch/x86/include/asm/pgtable_32_types.h73
-rw-r--r--arch/x86/include/asm/pgtable_64.h279
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h162
-rw-r--r--arch/x86/include/asm/pgtable_types.h572
-rw-r--r--arch/x86/include/asm/pkeys.h137
-rw-r--r--arch/x86/include/asm/platform_sst_audio.h142
-rw-r--r--arch/x86/include/asm/pm-trace.h24
-rw-r--r--arch/x86/include/asm/posix_types.h6
-rw-r--r--arch/x86/include/asm/preempt.h115
-rw-r--r--arch/x86/include/asm/probe_roms.h9
-rw-r--r--arch/x86/include/asm/processor-cyrix.h39
-rw-r--r--arch/x86/include/asm/processor-flags.h56
-rw-r--r--arch/x86/include/asm/processor.h1004
-rw-r--r--arch/x86/include/asm/prom.h41
-rw-r--r--arch/x86/include/asm/proto.h43
-rw-r--r--arch/x86/include/asm/pti.h15
-rw-r--r--arch/x86/include/asm/ptrace.h319
-rw-r--r--arch/x86/include/asm/purgatory.h21
-rw-r--r--arch/x86/include/asm/pvclock-abi.h48
-rw-r--r--arch/x86/include/asm/pvclock.h106
-rw-r--r--arch/x86/include/asm/qrwlock.h8
-rw-r--r--arch/x86/include/asm/qspinlock.h100
-rw-r--r--arch/x86/include/asm/qspinlock_paravirt.h69
-rw-r--r--arch/x86/include/asm/realmode.h85
-rw-r--r--arch/x86/include/asm/reboot.h32
-rw-r--r--arch/x86/include/asm/reboot_fixups.h7
-rw-r--r--arch/x86/include/asm/refcount.h110
-rw-r--r--arch/x86/include/asm/required-features.h106
-rw-r--r--arch/x86/include/asm/rio.h64
-rw-r--r--arch/x86/include/asm/rmwcc.h57
-rw-r--r--arch/x86/include/asm/rwsem.h237
-rw-r--r--arch/x86/include/asm/seccomp.h21
-rw-r--r--arch/x86/include/asm/sections.h17
-rw-r--r--arch/x86/include/asm/segment.h340
-rw-r--r--arch/x86/include/asm/serial.h30
-rw-r--r--arch/x86/include/asm/set_memory.h141
-rw-r--r--arch/x86/include/asm/setup.h140
-rw-r--r--arch/x86/include/asm/setup_arch.h3
-rw-r--r--arch/x86/include/asm/shmparam.h7
-rw-r--r--arch/x86/include/asm/sigcontext.h9
-rw-r--r--arch/x86/include/asm/sigframe.h92
-rw-r--r--arch/x86/include/asm/sighandling.h20
-rw-r--r--arch/x86/include/asm/signal.h110
-rw-r--r--arch/x86/include/asm/simd.h12
-rw-r--r--arch/x86/include/asm/smap.h79
-rw-r--r--arch/x86/include/asm/smp.h192
-rw-r--r--arch/x86/include/asm/sparsemem.h35
-rw-r--r--arch/x86/include/asm/spec-ctrl.h88
-rw-r--r--arch/x86/include/asm/special_insns.h245
-rw-r--r--arch/x86/include/asm/spinlock.h45
-rw-r--r--arch/x86/include/asm/spinlock_types.h31
-rw-r--r--arch/x86/include/asm/sta2x11.h13
-rw-r--r--arch/x86/include/asm/stackprotector.h130
-rw-r--r--arch/x86/include/asm/stacktrace.h116
-rw-r--r--arch/x86/include/asm/string.h6
-rw-r--r--arch/x86/include/asm/string_32.h347
-rw-r--r--arch/x86/include/asm/string_64.h142
-rw-r--r--arch/x86/include/asm/suspend.h6
-rw-r--r--arch/x86/include/asm/suspend_32.h35
-rw-r--r--arch/x86/include/asm/suspend_64.h64
-rw-r--r--arch/x86/include/asm/svm.h302
-rw-r--r--arch/x86/include/asm/swiotlb.h30
-rw-r--r--arch/x86/include/asm/switch_to.h109
-rw-r--r--arch/x86/include/asm/sync_bitops.h131
-rw-r--r--arch/x86/include/asm/sync_core.h29
-rw-r--r--arch/x86/include/asm/syscall.h246
-rw-r--r--arch/x86/include/asm/syscall_wrapper.h210
-rw-r--r--arch/x86/include/asm/syscalls.h51
-rw-r--r--arch/x86/include/asm/sysfb.h98
-rw-r--r--arch/x86/include/asm/tce.h48
-rw-r--r--arch/x86/include/asm/text-patching.h72
-rw-r--r--arch/x86/include/asm/thread_info.h279
-rw-r--r--arch/x86/include/asm/time.h13
-rw-r--r--arch/x86/include/asm/timer.h38
-rw-r--r--arch/x86/include/asm/timex.h22
-rw-r--r--arch/x86/include/asm/tlb.h33
-rw-r--r--arch/x86/include/asm/tlbbatch.h15
-rw-r--r--arch/x86/include/asm/tlbflush.h610
-rw-r--r--arch/x86/include/asm/topology.h179
-rw-r--r--arch/x86/include/asm/trace/common.h16
-rw-r--r--arch/x86/include/asm/trace/exceptions.h53
-rw-r--r--arch/x86/include/asm/trace/fpu.h104
-rw-r--r--arch/x86/include/asm/trace/hyperv.h69
-rw-r--r--arch/x86/include/asm/trace/irq_vectors.h397
-rw-r--r--arch/x86/include/asm/trace/mpx.h134
-rw-r--r--arch/x86/include/asm/trace_clock.h21
-rw-r--r--arch/x86/include/asm/traps.h166
-rw-r--r--arch/x86/include/asm/tsc.h74
-rw-r--r--arch/x86/include/asm/uaccess.h745
-rw-r--r--arch/x86/include/asm/uaccess_32.h63
-rw-r--r--arch/x86/include/asm/uaccess_64.h216
-rw-r--r--arch/x86/include/asm/umip.h12
-rw-r--r--arch/x86/include/asm/unaligned.h15
-rw-r--r--arch/x86/include/asm/unistd.h57
-rw-r--r--arch/x86/include/asm/unwind.h126
-rw-r--r--arch/x86/include/asm/unwind_hints.h109
-rw-r--r--arch/x86/include/asm/uprobes.h71
-rw-r--r--arch/x86/include/asm/user.h64
-rw-r--r--arch/x86/include/asm/user32.h71
-rw-r--r--arch/x86/include/asm/user_32.h132
-rw-r--r--arch/x86/include/asm/user_64.h138
-rw-r--r--arch/x86/include/asm/uv/bios.h176
-rw-r--r--arch/x86/include/asm/uv/uv.h43
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h861
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h911
-rw-r--r--arch/x86/include/asm/uv/uv_irq.h38
-rw-r--r--arch/x86/include/asm/uv/uv_mmrs.h4828
-rw-r--r--arch/x86/include/asm/vdso.h50
-rw-r--r--arch/x86/include/asm/vga.h33
-rw-r--r--arch/x86/include/asm/vgtod.h106
-rw-r--r--arch/x86/include/asm/virtext.h128
-rw-r--r--arch/x86/include/asm/vm86.h92
-rw-r--r--arch/x86/include/asm/vmx.h593
-rw-r--r--arch/x86/include/asm/vsyscall.h25
-rw-r--r--arch/x86/include/asm/vvar.h51
-rw-r--r--arch/x86/include/asm/word-at-a-time.h106
-rw-r--r--arch/x86/include/asm/x86_init.h306
-rw-r--r--arch/x86/include/asm/xen/cpuid.h116
-rw-r--r--arch/x86/include/asm/xen/events.h35
-rw-r--r--arch/x86/include/asm/xen/hypercall.h526
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h67
-rw-r--r--arch/x86/include/asm/xen/interface.h390
-rw-r--r--arch/x86/include/asm/xen/interface_32.h103
-rw-r--r--arch/x86/include/asm/xen/interface_64.h149
-rw-r--r--arch/x86/include/asm/xen/page-coherent.h38
-rw-r--r--arch/x86/include/asm/xen/page.h347
-rw-r--r--arch/x86/include/asm/xen/pci.h83
-rw-r--r--arch/x86/include/asm/xen/swiotlb-xen.h17
-rw-r--r--arch/x86/include/asm/xen/trace_types.h19
-rw-r--r--arch/x86/include/asm/xor.h496
-rw-r--r--arch/x86/include/asm/xor_32.h567
-rw-r--r--arch/x86/include/asm/xor_64.h28
-rw-r--r--arch/x86/include/asm/xor_avx.h184
-rw-r--r--arch/x86/include/uapi/asm/Kbuild8
-rw-r--r--arch/x86/include/uapi/asm/a.out.h21
-rw-r--r--arch/x86/include/uapi/asm/auxvec.h20
-rw-r--r--arch/x86/include/uapi/asm/bitsperlong.h14
-rw-r--r--arch/x86/include/uapi/asm/boot.h11
-rw-r--r--arch/x86/include/uapi/asm/bootparam.h249
-rw-r--r--arch/x86/include/uapi/asm/byteorder.h7
-rw-r--r--arch/x86/include/uapi/asm/debugreg.h81
-rw-r--r--arch/x86/include/uapi/asm/e820.h82
-rw-r--r--arch/x86/include/uapi/asm/errno.h1
-rw-r--r--arch/x86/include/uapi/asm/fcntl.h1
-rw-r--r--arch/x86/include/uapi/asm/hw_breakpoint.h2
-rw-r--r--arch/x86/include/uapi/asm/hwcap2.h8
-rw-r--r--arch/x86/include/uapi/asm/ioctl.h1
-rw-r--r--arch/x86/include/uapi/asm/ioctls.h1
-rw-r--r--arch/x86/include/uapi/asm/ipcbuf.h1
-rw-r--r--arch/x86/include/uapi/asm/ist.h30
-rw-r--r--arch/x86/include/uapi/asm/kvm.h420
-rw-r--r--arch/x86/include/uapi/asm/kvm_para.h122
-rw-r--r--arch/x86/include/uapi/asm/kvm_perf.h17
-rw-r--r--arch/x86/include/uapi/asm/ldt.h48
-rw-r--r--arch/x86/include/uapi/asm/mce.h44
-rw-r--r--arch/x86/include/uapi/asm/mman.h31
-rw-r--r--arch/x86/include/uapi/asm/msgbuf.h32
-rw-r--r--arch/x86/include/uapi/asm/msr.h14
-rw-r--r--arch/x86/include/uapi/asm/mtrr.h124
-rw-r--r--arch/x86/include/uapi/asm/param.h1
-rw-r--r--arch/x86/include/uapi/asm/perf_regs.h34
-rw-r--r--arch/x86/include/uapi/asm/posix_types.h10
-rw-r--r--arch/x86/include/uapi/asm/posix_types_32.h26
-rw-r--r--arch/x86/include/uapi/asm/posix_types_64.h20
-rw-r--r--arch/x86/include/uapi/asm/posix_types_x32.h20
-rw-r--r--arch/x86/include/uapi/asm/prctl.h17
-rw-r--r--arch/x86/include/uapi/asm/processor-flags.h166
-rw-r--r--arch/x86/include/uapi/asm/ptrace-abi.h94
-rw-r--r--arch/x86/include/uapi/asm/ptrace.h86
-rw-r--r--arch/x86/include/uapi/asm/resource.h1
-rw-r--r--arch/x86/include/uapi/asm/sembuf.h34
-rw-r--r--arch/x86/include/uapi/asm/setup.h1
-rw-r--r--arch/x86/include/uapi/asm/shmbuf.h43
-rw-r--r--arch/x86/include/uapi/asm/sigcontext.h389
-rw-r--r--arch/x86/include/uapi/asm/sigcontext32.h9
-rw-r--r--arch/x86/include/uapi/asm/siginfo.h17
-rw-r--r--arch/x86/include/uapi/asm/signal.h136
-rw-r--r--arch/x86/include/uapi/asm/socket.h1
-rw-r--r--arch/x86/include/uapi/asm/sockios.h1
-rw-r--r--arch/x86/include/uapi/asm/stat.h138
-rw-r--r--arch/x86/include/uapi/asm/statfs.h13
-rw-r--r--arch/x86/include/uapi/asm/svm.h179
-rw-r--r--arch/x86/include/uapi/asm/swab.h37
-rw-r--r--arch/x86/include/uapi/asm/termbits.h1
-rw-r--r--arch/x86/include/uapi/asm/termios.h1
-rw-r--r--arch/x86/include/uapi/asm/types.h7
-rw-r--r--arch/x86/include/uapi/asm/ucontext.h56
-rw-r--r--arch/x86/include/uapi/asm/unistd.h18
-rw-r--r--arch/x86/include/uapi/asm/vm86.h130
-rw-r--r--arch/x86/include/uapi/asm/vmx.h151
-rw-r--r--arch/x86/include/uapi/asm/vsyscall.h13
-rw-r--r--arch/x86/kernel/.gitignore3
-rw-r--r--arch/x86/kernel/Makefile153
-rw-r--r--arch/x86/kernel/acpi/Makefile12
-rw-r--r--arch/x86/kernel/acpi/apei.c54
-rw-r--r--arch/x86/kernel/acpi/boot.c1796
-rw-r--r--arch/x86/kernel/acpi/cppc_msr.c58
-rw-r--r--arch/x86/kernel/acpi/cstate.c186
-rw-r--r--arch/x86/kernel/acpi/sleep.c150
-rw-r--r--arch/x86/kernel/acpi/sleep.h22
-rw-r--r--arch/x86/kernel/acpi/wakeup_32.S99
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S136
-rw-r--r--arch/x86/kernel/alternative.c845
-rw-r--r--arch/x86/kernel/amd_gart_64.c891
-rw-r--r--arch/x86/kernel/amd_nb.c472
-rw-r--r--arch/x86/kernel/apb_timer.c404
-rw-r--r--arch/x86/kernel/aperture_64.c551
-rw-r--r--arch/x86/kernel/apic/Makefile30
-rw-r--r--arch/x86/kernel/apic/apic.c2858
-rw-r--r--arch/x86/kernel/apic/apic_common.c46
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c296
-rw-r--r--arch/x86/kernel/apic/apic_noop.c160
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c338
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c196
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c58
-rw-r--r--arch/x86/kernel/apic/io_apic.c3085
-rw-r--r--arch/x86/kernel/apic/ipi.c244
-rw-r--r--arch/x86/kernel/apic/msi.c522
-rw-r--r--arch/x86/kernel/apic/probe_32.c249
-rw-r--r--arch/x86/kernel/apic/probe_64.c71
-rw-r--r--arch/x86/kernel/apic/vector.c1272
-rw-r--r--arch/x86/kernel/apic/x2apic.h9
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c230
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c198
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c1609
-rw-r--r--arch/x86/kernel/apm_32.c2447
-rw-r--r--arch/x86/kernel/asm-offsets.c111
-rw-r--r--arch/x86/kernel/asm-offsets_32.c67
-rw-r--r--arch/x86/kernel/asm-offsets_64.c82
-rw-r--r--arch/x86/kernel/audit_64.c83
-rw-r--r--arch/x86/kernel/bootflag.c102
-rw-r--r--arch/x86/kernel/check.c183
-rw-r--r--arch/x86/kernel/cpu/.gitignore1
-rw-r--r--arch/x86/kernel/cpu/Makefile60
-rw-r--r--arch/x86/kernel/cpu/amd.c1177
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c127
-rw-r--r--arch/x86/kernel/cpu/bugs.c1997
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c1006
-rw-r--r--arch/x86/kernel/cpu/centaur.c288
-rw-r--r--arch/x86/kernel/cpu/common.c2065
-rw-r--r--arch/x86/kernel/cpu/cpu.h88
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c124
-rw-r--r--arch/x86/kernel/cpu/cyrix.c466
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c102
-rw-r--r--arch/x86/kernel/cpu/intel.c1036
-rw-r--r--arch/x86/kernel/cpu/intel_pconfig.c82
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.c915
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.h570
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c491
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_monitor.c660
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c1534
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h43
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_rdtgroup.c3041
-rw-r--r--arch/x86/kernel/cpu/match.c55
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile14
-rw-r--r--arch/x86/kernel/cpu/mcheck/dev-mcelog.c363
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-apei.c157
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-genpool.c145
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c739
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h173
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c423
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c2506
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c1478
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c519
-rw-r--r--arch/x86/kernel/cpu/mcheck/p5.c69
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c521
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c30
-rw-r--r--arch/x86/kernel/cpu/mcheck/winchip.c44
-rw-r--r--arch/x86/kernel/cpu/microcode/Makefile4
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c818
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c907
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c1004
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh67
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c335
-rw-r--r--arch/x86/kernel/cpu/mtrr/Makefile3
-rw-r--r--arch/x86/kernel/cpu/mtrr/amd.c125
-rw-r--r--arch/x86/kernel/cpu/mtrr/centaur.c127
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c987
-rw-r--r--arch/x86/kernel/cpu/mtrr/cyrix.c283
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c913
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c443
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.c890
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h80
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c161
-rw-r--r--arch/x86/kernel/cpu/powerflags.c24
-rw-r--r--arch/x86/kernel/cpu/proc.c167
-rw-r--r--arch/x86/kernel/cpu/rdrand.c59
-rw-r--r--arch/x86/kernel/cpu/scattered.c86
-rw-r--r--arch/x86/kernel/cpu/topology.c103
-rw-r--r--arch/x86/kernel/cpu/transmeta.c111
-rw-r--r--arch/x86/kernel/cpu/tsx.c141
-rw-r--r--arch/x86/kernel/cpu/umc.c26
-rw-r--r--arch/x86/kernel/cpu/vmware.c214
-rw-r--r--arch/x86/kernel/cpuid.c196
-rw-r--r--arch/x86/kernel/crash.c483
-rw-r--r--arch/x86/kernel/crash_dump_32.c96
-rw-r--r--arch/x86/kernel/crash_dump_64.c50
-rw-r--r--arch/x86/kernel/devicetree.c323
-rw-r--r--arch/x86/kernel/doublefault.c83
-rw-r--r--arch/x86/kernel/dumpstack.c419
-rw-r--r--arch/x86/kernel/dumpstack_32.c129
-rw-r--r--arch/x86/kernel/dumpstack_64.c151
-rw-r--r--arch/x86/kernel/e820.c1280
-rw-r--r--arch/x86/kernel/early-quirks.c802
-rw-r--r--arch/x86/kernel/early_printk.c405
-rw-r--r--arch/x86/kernel/ebda.c98
-rw-r--r--arch/x86/kernel/eisa.c25
-rw-r--r--arch/x86/kernel/espfix_64.c215
-rw-r--r--arch/x86/kernel/fpu/Makefile5
-rw-r--r--arch/x86/kernel/fpu/bugs.c59
-rw-r--r--arch/x86/kernel/fpu/core.c468
-rw-r--r--arch/x86/kernel/fpu/init.c317
-rw-r--r--arch/x86/kernel/fpu/regset.c387
-rw-r--r--arch/x86/kernel/fpu/signal.c439
-rw-r--r--arch/x86/kernel/fpu/xstate.c1294
-rw-r--r--arch/x86/kernel/ftrace.c1081
-rw-r--r--arch/x86/kernel/ftrace_32.S250
-rw-r--r--arch/x86/kernel/ftrace_64.S342
-rw-r--r--arch/x86/kernel/head32.c119
-rw-r--r--arch/x86/kernel/head64.c491
-rw-r--r--arch/x86/kernel/head_32.S623
-rw-r--r--arch/x86/kernel/head_64.S485
-rw-r--r--arch/x86/kernel/hpet.c1366
-rw-r--r--arch/x86/kernel/hw_breakpoint.c552
-rw-r--r--arch/x86/kernel/i8237.c80
-rw-r--r--arch/x86/kernel/i8253.c44
-rw-r--r--arch/x86/kernel/i8259.c434
-rw-r--r--arch/x86/kernel/idt.c373
-rw-r--r--arch/x86/kernel/io_delay.c132
-rw-r--r--arch/x86/kernel/ioport.c138
-rw-r--r--arch/x86/kernel/irq.c390
-rw-r--r--arch/x86/kernel/irq_32.c164
-rw-r--r--arch/x86/kernel/irq_64.c89
-rw-r--r--arch/x86/kernel/irq_work.c34
-rw-r--r--arch/x86/kernel/irqflags.S26
-rw-r--r--arch/x86/kernel/irqinit.c109
-rw-r--r--arch/x86/kernel/itmt.c212
-rw-r--r--arch/x86/kernel/jailhouse.c219
-rw-r--r--arch/x86/kernel/jump_label.c142
-rw-r--r--arch/x86/kernel/kdebugfs.c202
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c549
-rw-r--r--arch/x86/kernel/kgdb.c816
-rw-r--r--arch/x86/kernel/kprobes/Makefile7
-rw-r--r--arch/x86/kernel/kprobes/common.h108
-rw-r--r--arch/x86/kernel/kprobes/core.c1159
-rw-r--r--arch/x86/kernel/kprobes/ftrace.c76
-rw-r--r--arch/x86/kernel/kprobes/opt.c503
-rw-r--r--arch/x86/kernel/ksysfs.c340
-rw-r--r--arch/x86/kernel/kvm.c868
-rw-r--r--arch/x86/kernel/kvmclock.c360
-rw-r--r--arch/x86/kernel/ldt.c582
-rw-r--r--arch/x86/kernel/livepatch.c65
-rw-r--r--arch/x86/kernel/machine_kexec_32.c266
-rw-r--r--arch/x86/kernel/machine_kexec_64.c576
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c238
-rw-r--r--arch/x86/kernel/module.c281
-rw-r--r--arch/x86/kernel/mpparse.c963
-rw-r--r--arch/x86/kernel/msr.c244
-rw-r--r--arch/x86/kernel/nmi.c555
-rw-r--r--arch/x86/kernel/nmi_selftest.c184
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c45
-rw-r--r--arch/x86/kernel/paravirt.c483
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c82
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c92
-rw-r--r--arch/x86/kernel/pci-calgary_64.c1612
-rw-r--r--arch/x86/kernel/pci-dma.c196
-rw-r--r--arch/x86/kernel/pci-iommu_table.c80
-rw-r--r--arch/x86/kernel/pci-swiotlb.c81
-rw-r--r--arch/x86/kernel/pcspeaker.c14
-rw-r--r--arch/x86/kernel/perf_regs.c179
-rw-r--r--arch/x86/kernel/platform-quirks.c46
-rw-r--r--arch/x86/kernel/pmem.c32
-rw-r--r--arch/x86/kernel/probe_roms.c269
-rw-r--r--arch/x86/kernel/process.c854
-rw-r--r--arch/x86/kernel/process.h39
-rw-r--r--arch/x86/kernel/process_32.c316
-rw-r--r--arch/x86/kernel/process_64.c728
-rw-r--r--arch/x86/kernel/ptrace.c1445
-rw-r--r--arch/x86/kernel/pvclock.c167
-rw-r--r--arch/x86/kernel/quirks.c676
-rw-r--r--arch/x86/kernel/reboot.c917
-rw-r--r--arch/x86/kernel/reboot_fixups_32.c103
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S277
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S290
-rw-r--r--arch/x86/kernel/resource.c51
-rw-r--r--arch/x86/kernel/rtc.c207
-rw-r--r--arch/x86/kernel/setup.c1308
-rw-r--r--arch/x86/kernel/setup_percpu.c301
-rw-r--r--arch/x86/kernel/signal.c878
-rw-r--r--arch/x86/kernel/signal_compat.c182
-rw-r--r--arch/x86/kernel/smp.c336
-rw-r--r--arch/x86/kernel/smpboot.c1707
-rw-r--r--arch/x86/kernel/stacktrace.c229
-rw-r--r--arch/x86/kernel/step.c236
-rw-r--r--arch/x86/kernel/sys_x86_64.c241
-rw-r--r--arch/x86/kernel/sysfb.c74
-rw-r--r--arch/x86/kernel/sysfb_efi.c285
-rw-r--r--arch/x86/kernel/sysfb_simplefb.c115
-rw-r--r--arch/x86/kernel/tboot.c541
-rw-r--r--arch/x86/kernel/tce_64.c190
-rw-r--r--arch/x86/kernel/time.c103
-rw-r--r--arch/x86/kernel/tls.c321
-rw-r--r--arch/x86/kernel/tls.h21
-rw-r--r--arch/x86/kernel/topology.c175
-rw-r--r--arch/x86/kernel/trace_clock.c17
-rw-r--r--arch/x86/kernel/tracepoint.c42
-rw-r--r--arch/x86/kernel/traps.c975
-rw-r--r--arch/x86/kernel/tsc.c1511
-rw-r--r--arch/x86/kernel/tsc_msr.c129
-rw-r--r--arch/x86/kernel/tsc_sync.c494
-rw-r--r--arch/x86/kernel/umip.c429
-rw-r--r--arch/x86/kernel/unwind_frame.c435
-rw-r--r--arch/x86/kernel/unwind_guess.c72
-rw-r--r--arch/x86/kernel/unwind_orc.c676
-rw-r--r--arch/x86/kernel/uprobes.c1107
-rw-r--r--arch/x86/kernel/verify_cpu.S142
-rw-r--r--arch/x86/kernel/vm86_32.c874
-rw-r--r--arch/x86/kernel/vmlinux.lds.S438
-rw-r--r--arch/x86/kernel/vsmp_64.c226
-rw-r--r--arch/x86/kernel/x86_init.c160
-rw-r--r--arch/x86/kvm/Kconfig103
-rw-r--r--arch/x86/kvm/Makefile24
-rw-r--r--arch/x86/kvm/cpuid.c983
-rw-r--r--arch/x86/kvm/cpuid.h182
-rw-r--r--arch/x86/kvm/debugfs.c69
-rw-r--r--arch/x86/kvm/emulate.c5849
-rw-r--r--arch/x86/kvm/hyperv.c1683
-rw-r--r--arch/x86/kvm/hyperv.h99
-rw-r--r--arch/x86/kvm/i8254.c737
-rw-r--r--arch/x86/kvm/i8254.h66
-rw-r--r--arch/x86/kvm/i8259.c655
-rw-r--r--arch/x86/kvm/ioapic.c685
-rw-r--r--arch/x86/kvm/ioapic.h138
-rw-r--r--arch/x86/kvm/irq.c164
-rw-r--r--arch/x86/kvm/irq.h130
-rw-r--r--arch/x86/kvm/irq_comm.c441
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h113
-rw-r--r--arch/x86/kvm/lapic.c2710
-rw-r--r--arch/x86/kvm/lapic.h237
-rw-r--r--arch/x86/kvm/mmu.c6292
-rw-r--r--arch/x86/kvm/mmu.h223
-rw-r--r--arch/x86/kvm/mmu_audit.c306
-rw-r--r--arch/x86/kvm/mmutrace.h395
-rw-r--r--arch/x86/kvm/mtrr.c727
-rw-r--r--arch/x86/kvm/page_track.c267
-rw-r--r--arch/x86/kvm/paging_tmpl.h1083
-rw-r--r--arch/x86/kvm/pmu.c349
-rw-r--r--arch/x86/kvm/pmu.h136
-rw-r--r--arch/x86/kvm/pmu_amd.c317
-rw-r--r--arch/x86/kvm/pmu_intel.c374
-rw-r--r--arch/x86/kvm/svm.c7323
-rw-r--r--arch/x86/kvm/trace.h1429
-rw-r--r--arch/x86/kvm/tss.h60
-rw-r--r--arch/x86/kvm/vmx.c14715
-rw-r--r--arch/x86/kvm/vmx_evmcs.h324
-rw-r--r--arch/x86/kvm/vmx_shadow_fields.h77
-rw-r--r--arch/x86/kvm/x86.c9835
-rw-r--r--arch/x86/kvm/x86.h365
-rw-r--r--arch/x86/lib/.gitignore1
-rw-r--r--arch/x86/lib/Makefile63
-rw-r--r--arch/x86/lib/atomic64_32.c4
-rw-r--r--arch/x86/lib/atomic64_386_32.S191
-rw-r--r--arch/x86/lib/atomic64_cx8_32.S184
-rw-r--r--arch/x86/lib/cache-smp.c20
-rw-r--r--arch/x86/lib/checksum_32.S496
-rw-r--r--arch/x86/lib/clear_page_64.S51
-rw-r--r--arch/x86/lib/cmdline.c215
-rw-r--r--arch/x86/lib/cmpxchg16b_emu.S53
-rw-r--r--arch/x86/lib/cmpxchg8b_emu.S52
-rw-r--r--arch/x86/lib/copy_page_64.S89
-rw-r--r--arch/x86/lib/copy_user_64.S345
-rw-r--r--arch/x86/lib/cpu.c37
-rw-r--r--arch/x86/lib/csum-copy_64.S224
-rw-r--r--arch/x86/lib/csum-partial_64.c150
-rw-r--r--arch/x86/lib/csum-wrappers_64.c158
-rw-r--r--arch/x86/lib/delay.c189
-rw-r--r--arch/x86/lib/error-inject.c20
-rw-r--r--arch/x86/lib/getuser.S143
-rw-r--r--arch/x86/lib/hweight.S83
-rw-r--r--arch/x86/lib/inat.c97
-rw-r--r--arch/x86/lib/insn-eval.c1366
-rw-r--r--arch/x86/lib/insn.c606
-rw-r--r--arch/x86/lib/iomap_copy_64.S27
-rw-r--r--arch/x86/lib/kaslr.c92
-rw-r--r--arch/x86/lib/memcpy_32.c209
-rw-r--r--arch/x86/lib/memcpy_64.S298
-rw-r--r--arch/x86/lib/memmove_64.S213
-rw-r--r--arch/x86/lib/memset_64.S142
-rw-r--r--arch/x86/lib/misc.c22
-rw-r--r--arch/x86/lib/mmx_32.c378
-rw-r--r--arch/x86/lib/msr-reg-export.c6
-rw-r--r--arch/x86/lib/msr-reg.S93
-rw-r--r--arch/x86/lib/msr-smp.c280
-rw-r--r--arch/x86/lib/msr.c138
-rw-r--r--arch/x86/lib/putuser.S103
-rw-r--r--arch/x86/lib/retpoline.S48
-rw-r--r--arch/x86/lib/rwsem.S156
-rw-r--r--arch/x86/lib/string_32.c236
-rw-r--r--arch/x86/lib/strstr_32.c33
-rw-r--r--arch/x86/lib/usercopy.c38
-rw-r--r--arch/x86/lib/usercopy_32.c359
-rw-r--r--arch/x86/lib/usercopy_64.c230
-rw-r--r--arch/x86/lib/x86-opcode-map.txt1078
-rw-r--r--arch/x86/math-emu/Makefile30
-rw-r--r--arch/x86/math-emu/README427
-rw-r--r--arch/x86/math-emu/control_w.h46
-rw-r--r--arch/x86/math-emu/div_Xsig.S367
-rw-r--r--arch/x86/math-emu/div_small.S48
-rw-r--r--arch/x86/math-emu/errors.c685
-rw-r--r--arch/x86/math-emu/exception.h51
-rw-r--r--arch/x86/math-emu/fpu_arith.c153
-rw-r--r--arch/x86/math-emu/fpu_asm.h32
-rw-r--r--arch/x86/math-emu/fpu_aux.c267
-rw-r--r--arch/x86/math-emu/fpu_emu.h218
-rw-r--r--arch/x86/math-emu/fpu_entry.c729
-rw-r--r--arch/x86/math-emu/fpu_etc.c131
-rw-r--r--arch/x86/math-emu/fpu_proto.h157
-rw-r--r--arch/x86/math-emu/fpu_system.h128
-rw-r--r--arch/x86/math-emu/fpu_tags.c116
-rw-r--r--arch/x86/math-emu/fpu_trig.c1644
-rw-r--r--arch/x86/math-emu/get_address.c401
-rw-r--r--arch/x86/math-emu/load_store.c322
-rw-r--r--arch/x86/math-emu/mul_Xsig.S179
-rw-r--r--arch/x86/math-emu/poly.h115
-rw-r--r--arch/x86/math-emu/poly_2xm1.c146
-rw-r--r--arch/x86/math-emu/poly_atan.c209
-rw-r--r--arch/x86/math-emu/poly_l2.c245
-rw-r--r--arch/x86/math-emu/poly_sin.c379
-rw-r--r--arch/x86/math-emu/poly_tan.c213
-rw-r--r--arch/x86/math-emu/polynom_Xsig.S137
-rw-r--r--arch/x86/math-emu/reg_add_sub.c334
-rw-r--r--arch/x86/math-emu/reg_compare.c479
-rw-r--r--arch/x86/math-emu/reg_constant.c118
-rw-r--r--arch/x86/math-emu/reg_constant.h26
-rw-r--r--arch/x86/math-emu/reg_convert.c47
-rw-r--r--arch/x86/math-emu/reg_divide.c183
-rw-r--r--arch/x86/math-emu/reg_ld_str.c1220
-rw-r--r--arch/x86/math-emu/reg_mul.c116
-rw-r--r--arch/x86/math-emu/reg_norm.S150
-rw-r--r--arch/x86/math-emu/reg_round.S711
-rw-r--r--arch/x86/math-emu/reg_u_add.S169
-rw-r--r--arch/x86/math-emu/reg_u_div.S474
-rw-r--r--arch/x86/math-emu/reg_u_mul.S150
-rw-r--r--arch/x86/math-emu/reg_u_sub.S274
-rw-r--r--arch/x86/math-emu/round_Xsig.S142
-rw-r--r--arch/x86/math-emu/shr_Xsig.S89
-rw-r--r--arch/x86/math-emu/status_w.h68
-rw-r--r--arch/x86/math-emu/version.h12
-rw-r--r--arch/x86/math-emu/wm_shrx.S207
-rw-r--r--arch/x86/math-emu/wm_sqrt.S472
-rw-r--r--arch/x86/mm/Makefile55
-rw-r--r--arch/x86/mm/amdtopology.c184
-rw-r--r--arch/x86/mm/cpu_entry_area.c217
-rw-r--r--arch/x86/mm/debug_pagetables.c146
-rw-r--r--arch/x86/mm/dump_pagetables.c640
-rw-r--r--arch/x86/mm/extable.c251
-rw-r--r--arch/x86/mm/fault.c1490
-rw-r--r--arch/x86/mm/highmem_32.c133
-rw-r--r--arch/x86/mm/hugetlbpage.c216
-rw-r--r--arch/x86/mm/ident_map.c147
-rw-r--r--arch/x86/mm/init.c953
-rw-r--r--arch/x86/mm/init_32.c956
-rw-r--r--arch/x86/mm/init_64.c1544
-rw-r--r--arch/x86/mm/iomap_32.c129
-rw-r--r--arch/x86/mm/ioremap.c827
-rw-r--r--arch/x86/mm/kasan_init_64.c392
-rw-r--r--arch/x86/mm/kaslr.c228
-rw-r--r--arch/x86/mm/kmmio.c627
-rw-r--r--arch/x86/mm/mem_encrypt.c400
-rw-r--r--arch/x86/mm/mem_encrypt_boot.S159
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c576
-rw-r--r--arch/x86/mm/mm_internal.h22
-rw-r--r--arch/x86/mm/mmap.c263
-rw-r--r--arch/x86/mm/mmio-mod.c477
-rw-r--r--arch/x86/mm/mpx.c948
-rw-r--r--arch/x86/mm/numa.c901
-rw-r--r--arch/x86/mm/numa_32.c94
-rw-r--r--arch/x86/mm/numa_64.c13
-rw-r--r--arch/x86/mm/numa_emulation.c587
-rw-r--r--arch/x86/mm/numa_internal.h34
-rw-r--r--arch/x86/mm/pageattr-test.c261
-rw-r--r--arch/x86/mm/pageattr.c2154
-rw-r--r--arch/x86/mm/pat.c1184
-rw-r--r--arch/x86/mm/pat_internal.h49
-rw-r--r--arch/x86/mm/pat_rbtree.c281
-rw-r--r--arch/x86/mm/pf_in.c531
-rw-r--r--arch/x86/mm/pf_in.h39
-rw-r--r--arch/x86/mm/pgtable.c891
-rw-r--r--arch/x86/mm/pgtable_32.c105
-rw-r--r--arch/x86/mm/physaddr.c99
-rw-r--r--arch/x86/mm/physaddr.h11
-rw-r--r--arch/x86/mm/pkeys.c226
-rw-r--r--arch/x86/mm/pti.c658
-rw-r--r--arch/x86/mm/setup_nx.c62
-rw-r--r--arch/x86/mm/srat.c114
-rw-r--r--arch/x86/mm/testmmiotrace.c140
-rw-r--r--arch/x86/mm/tlb.c840
-rw-r--r--arch/x86/net/Makefile9
-rw-r--r--arch/x86/net/bpf_jit_comp.c1190
-rw-r--r--arch/x86/net/bpf_jit_comp32.c2330
-rw-r--r--arch/x86/oprofile/Makefile12
-rw-r--r--arch/x86/oprofile/backtrace.c127
-rw-r--r--arch/x86/oprofile/init.c38
-rw-r--r--arch/x86/oprofile/nmi_int.c780
-rw-r--r--arch/x86/oprofile/op_counter.h30
-rw-r--r--arch/x86/oprofile/op_model_amd.c542
-rw-r--r--arch/x86/oprofile/op_model_p4.c723
-rw-r--r--arch/x86/oprofile/op_model_ppro.c245
-rw-r--r--arch/x86/oprofile/op_x86_model.h90
-rw-r--r--arch/x86/pci/Makefile29
-rw-r--r--arch/x86/pci/acpi.c425
-rw-r--r--arch/x86/pci/amd_bus.c402
-rw-r--r--arch/x86/pci/broadcom_bus.c116
-rw-r--r--arch/x86/pci/bus_numa.c146
-rw-r--r--arch/x86/pci/bus_numa.h27
-rw-r--r--arch/x86/pci/ce4100.c340
-rw-r--r--arch/x86/pci/common.c737
-rw-r--r--arch/x86/pci/direct.c315
-rw-r--r--arch/x86/pci/early.c59
-rw-r--r--arch/x86/pci/fixup.c826
-rw-r--r--arch/x86/pci/i386.c409
-rw-r--r--arch/x86/pci/init.c45
-rw-r--r--arch/x86/pci/intel_mid_pci.c392
-rw-r--r--arch/x86/pci/irq.c1288
-rw-r--r--arch/x86/pci/legacy.c76
-rw-r--r--arch/x86/pci/mmconfig-shared.c814
-rw-r--r--arch/x86/pci/mmconfig_32.c157
-rw-r--r--arch/x86/pci/mmconfig_64.c154
-rw-r--r--arch/x86/pci/numachip.c129
-rw-r--r--arch/x86/pci/olpc.c315
-rw-r--r--arch/x86/pci/pcbios.c429
-rw-r--r--arch/x86/pci/sta2x11-fixup.c330
-rw-r--r--arch/x86/pci/xen.c582
-rw-r--r--arch/x86/platform/Makefile16
-rw-r--r--arch/x86/platform/atom/Makefile1
-rw-r--r--arch/x86/platform/atom/punit_atom_debug.c181
-rw-r--r--arch/x86/platform/ce4100/Makefile1
-rw-r--r--arch/x86/platform/ce4100/ce4100.c160
-rw-r--r--arch/x86/platform/ce4100/falconfalls.dts433
-rw-r--r--arch/x86/platform/efi/Makefile7
-rw-r--r--arch/x86/platform/efi/early_printk.c236
-rw-r--r--arch/x86/platform/efi/efi.c1046
-rw-r--r--arch/x86/platform/efi/efi_32.c97
-rw-r--r--arch/x86/platform/efi/efi_64.c1039
-rw-r--r--arch/x86/platform/efi/efi_stub_32.S124
-rw-r--r--arch/x86/platform/efi/efi_stub_64.S58
-rw-r--r--arch/x86/platform/efi/efi_thunk_64.S153
-rw-r--r--arch/x86/platform/efi/quirks.c655
-rw-r--r--arch/x86/platform/geode/Makefile3
-rw-r--r--arch/x86/platform/geode/alix.c200
-rw-r--r--arch/x86/platform/geode/geos.c122
-rw-r--r--arch/x86/platform/geode/net5501.c148
-rw-r--r--arch/x86/platform/goldfish/Makefile1
-rw-r--r--arch/x86/platform/goldfish/goldfish.c63
-rw-r--r--arch/x86/platform/intel-mid/Makefile6
-rw-r--r--arch/x86/platform/intel-mid/device_libs/Makefile33
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c95
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_bma023.c20
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_bt.c108
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_emc1403.c43
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c85
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_lis331.c41
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_max7315.c81
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c36
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c43
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c82
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c48
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c47
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c54
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c86
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic.c87
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic.h19
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c46
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c36
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c47
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c48
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c35
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c36
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c99
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c36
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_tca6416.c57
-rw-r--r--arch/x86/platform/intel-mid/intel-mid.c216
-rw-r--r--arch/x86/platform/intel-mid/intel_mid_vrtc.c177
-rw-r--r--arch/x86/platform/intel-mid/pwr.c488
-rw-r--r--arch/x86/platform/intel-mid/sfi.c547
-rw-r--r--arch/x86/platform/intel-quark/Makefile2
-rw-r--r--arch/x86/platform/intel-quark/imr.c600
-rw-r--r--arch/x86/platform/intel-quark/imr_selftest.c127
-rw-r--r--arch/x86/platform/intel/Makefile1
-rw-r--r--arch/x86/platform/intel/iosf_mbi.c391
-rw-r--r--arch/x86/platform/iris/Makefile1
-rw-r--r--arch/x86/platform/iris/iris.c136
-rw-r--r--arch/x86/platform/olpc/Makefile6
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-pm.c200
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-rtc.c84
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-sci.c642
-rw-r--r--arch/x86/platform/olpc/olpc-xo15-sci.c235
-rw-r--r--arch/x86/platform/olpc/olpc.c408
-rw-r--r--arch/x86/platform/olpc/olpc_dt.c304
-rw-r--r--arch/x86/platform/olpc/olpc_ofw.c120
-rw-r--r--arch/x86/platform/olpc/xo1-wakeup.S125
-rw-r--r--arch/x86/platform/scx200/Makefile2
-rw-r--r--arch/x86/platform/scx200/scx200_32.c129
-rw-r--r--arch/x86/platform/sfi/Makefile1
-rw-r--r--arch/x86/platform/sfi/sfi.c114
-rw-r--r--arch/x86/platform/ts5500/Makefile1
-rw-r--r--arch/x86/platform/ts5500/ts5500.c347
-rw-r--r--arch/x86/platform/uv/Makefile1
-rw-r--r--arch/x86/platform/uv/bios_uv.c240
-rw-r--r--arch/x86/platform/uv/tlb_uv.c2284
-rw-r--r--arch/x86/platform/uv/uv_irq.c217
-rw-r--r--arch/x86/platform/uv/uv_nmi.c1066
-rw-r--r--arch/x86/platform/uv/uv_sysfs.c76
-rw-r--r--arch/x86/platform/uv/uv_time.c416
-rw-r--r--arch/x86/power/Makefile10
-rw-r--r--arch/x86/power/cpu.c550
-rw-r--r--arch/x86/power/hibernate_32.c175
-rw-r--r--arch/x86/power/hibernate_64.c397
-rw-r--r--arch/x86/power/hibernate_asm_32.S85
-rw-r--r--arch/x86/power/hibernate_asm_64.S146
-rw-r--r--arch/x86/purgatory/Makefile70
-rw-r--r--arch/x86/purgatory/entry64.S101
-rw-r--r--arch/x86/purgatory/purgatory.c78
-rw-r--r--arch/x86/purgatory/setup-x86_64.S59
-rw-r--r--arch/x86/purgatory/stack.S19
-rw-r--r--arch/x86/ras/Kconfig13
-rw-r--r--arch/x86/realmode/Makefile20
-rw-r--r--arch/x86/realmode/init.c170
-rw-r--r--arch/x86/realmode/rm/.gitignore3
-rw-r--r--arch/x86/realmode/rm/Makefile76
-rw-r--r--arch/x86/realmode/rm/bioscall.S1
-rw-r--r--arch/x86/realmode/rm/copy.S1
-rw-r--r--arch/x86/realmode/rm/header.S44
-rw-r--r--arch/x86/realmode/rm/realmode.h22
-rw-r--r--arch/x86/realmode/rm/realmode.lds.S77
-rw-r--r--arch/x86/realmode/rm/reboot.S156
-rw-r--r--arch/x86/realmode/rm/regs.c1
-rw-r--r--arch/x86/realmode/rm/stack.S20
-rw-r--r--arch/x86/realmode/rm/trampoline_32.S74
-rw-r--r--arch/x86/realmode/rm/trampoline_64.S177
-rw-r--r--arch/x86/realmode/rm/trampoline_common.S8
-rw-r--r--arch/x86/realmode/rm/video-bios.c1
-rw-r--r--arch/x86/realmode/rm/video-mode.c1
-rw-r--r--arch/x86/realmode/rm/video-vesa.c1
-rw-r--r--arch/x86/realmode/rm/video-vga.c1
-rw-r--r--arch/x86/realmode/rm/wakemain.c83
-rw-r--r--arch/x86/realmode/rm/wakeup.h43
-rw-r--r--arch/x86/realmode/rm/wakeup_asm.S178
-rw-r--r--arch/x86/realmode/rmpiggy.S21
-rw-r--r--arch/x86/tools/.gitignore1
-rw-r--r--arch/x86/tools/Makefile46
-rw-r--r--arch/x86/tools/chkobjdump.awk34
-rw-r--r--arch/x86/tools/gen-insn-attr-x86.awk393
-rw-r--r--arch/x86/tools/insn_decoder_test.c180
-rw-r--r--arch/x86/tools/insn_sanity.c282
-rw-r--r--arch/x86/tools/objdump_reformat.awk48
-rw-r--r--arch/x86/tools/relocs.c1110
-rw-r--r--arch/x86/tools/relocs.h38
-rw-r--r--arch/x86/tools/relocs_32.c18
-rw-r--r--arch/x86/tools/relocs_64.c18
-rw-r--r--arch/x86/tools/relocs_common.c85
-rw-r--r--arch/x86/um/Kconfig59
-rw-r--r--arch/x86/um/Makefile46
-rw-r--r--arch/x86/um/asm/apic.h4
-rw-r--r--arch/x86/um/asm/arch_hweight.h7
-rw-r--r--arch/x86/um/asm/archparam.h20
-rw-r--r--arch/x86/um/asm/barrier.h38
-rw-r--r--arch/x86/um/asm/checksum.h155
-rw-r--r--arch/x86/um/asm/checksum_32.h61
-rw-r--r--arch/x86/um/asm/checksum_64.h19
-rw-r--r--arch/x86/um/asm/desc.h17
-rw-r--r--arch/x86/um/asm/elf.h218
-rw-r--r--arch/x86/um/asm/irq_vectors.h10
-rw-r--r--arch/x86/um/asm/mm_context.h72
-rw-r--r--arch/x86/um/asm/module.h24
-rw-r--r--arch/x86/um/asm/processor.h35
-rw-r--r--arch/x86/um/asm/processor_32.h61
-rw-r--r--arch/x86/um/asm/processor_64.h40
-rw-r--r--arch/x86/um/asm/ptrace.h87
-rw-r--r--arch/x86/um/asm/required-features.h9
-rw-r--r--arch/x86/um/asm/segment.h19
-rw-r--r--arch/x86/um/asm/syscall.h21
-rw-r--r--arch/x86/um/asm/vm-flags.h25
-rw-r--r--arch/x86/um/bugs_32.c74
-rw-r--r--arch/x86/um/bugs_64.c15
-rw-r--r--arch/x86/um/checksum_32.S218
-rw-r--r--arch/x86/um/delay.c60
-rw-r--r--arch/x86/um/elfcore.c77
-rw-r--r--arch/x86/um/fault.c28
-rw-r--r--arch/x86/um/ldt.c380
-rw-r--r--arch/x86/um/mem_32.c53
-rw-r--r--arch/x86/um/mem_64.c11
-rw-r--r--arch/x86/um/os-Linux/Makefile13
-rw-r--r--arch/x86/um/os-Linux/mcontext.c32
-rw-r--r--arch/x86/um/os-Linux/prctl.c12
-rw-r--r--arch/x86/um/os-Linux/registers.c168
-rw-r--r--arch/x86/um/os-Linux/task_size.c151
-rw-r--r--arch/x86/um/os-Linux/tls.c68
-rw-r--r--arch/x86/um/ptrace_32.c277
-rw-r--r--arch/x86/um/ptrace_64.c277
-rw-r--r--arch/x86/um/ptrace_user.c21
-rw-r--r--arch/x86/um/setjmp_32.S59
-rw-r--r--arch/x86/um/setjmp_64.S55
-rw-r--r--arch/x86/um/shared/sysdep/archsetjmp.h6
-rw-r--r--arch/x86/um/shared/sysdep/archsetjmp_32.h23
-rw-r--r--arch/x86/um/shared/sysdep/archsetjmp_64.h25
-rw-r--r--arch/x86/um/shared/sysdep/faultinfo.h6
-rw-r--r--arch/x86/um/shared/sysdep/faultinfo_32.h32
-rw-r--r--arch/x86/um/shared/sysdep/faultinfo_64.h32
-rw-r--r--arch/x86/um/shared/sysdep/kernel-offsets.h12
-rw-r--r--arch/x86/um/shared/sysdep/mcontext.h31
-rw-r--r--arch/x86/um/shared/sysdep/ptrace.h75
-rw-r--r--arch/x86/um/shared/sysdep/ptrace_32.h26
-rw-r--r--arch/x86/um/shared/sysdep/ptrace_64.h62
-rw-r--r--arch/x86/um/shared/sysdep/ptrace_user.h28
-rw-r--r--arch/x86/um/shared/sysdep/stub.h15
-rw-r--r--arch/x86/um/shared/sysdep/stub_32.h93
-rw-r--r--arch/x86/um/shared/sysdep/stub_64.h99
-rw-r--r--arch/x86/um/shared/sysdep/syscalls.h6
-rw-r--r--arch/x86/um/shared/sysdep/syscalls_32.h15
-rw-r--r--arch/x86/um/shared/sysdep/syscalls_64.h31
-rw-r--r--arch/x86/um/shared/sysdep/tls.h40
-rw-r--r--arch/x86/um/signal.c582
-rw-r--r--arch/x86/um/stub_32.S51
-rw-r--r--arch/x86/um/stub_64.S51
-rw-r--r--arch/x86/um/stub_segv.c20
-rw-r--r--arch/x86/um/sys_call_table_32.c46
-rw-r--r--arch/x86/um/sys_call_table_64.c56
-rw-r--r--arch/x86/um/syscalls_32.c8
-rw-r--r--arch/x86/um/syscalls_64.c90
-rw-r--r--arch/x86/um/sysrq_32.c34
-rw-r--r--arch/x86/um/sysrq_64.c36
-rw-r--r--arch/x86/um/tls_32.c398
-rw-r--r--arch/x86/um/tls_64.c19
-rw-r--r--arch/x86/um/user-offsets.c100
-rw-r--r--arch/x86/um/vdso/.gitignore1
-rw-r--r--arch/x86/um/vdso/Makefile78
-rw-r--r--arch/x86/um/vdso/checkundef.sh11
-rw-r--r--arch/x86/um/vdso/um_vdso.c71
-rw-r--r--arch/x86/um/vdso/vdso-layout.lds.S65
-rw-r--r--arch/x86/um/vdso/vdso-note.S12
-rw-r--r--arch/x86/um/vdso/vdso.S11
-rw-r--r--arch/x86/um/vdso/vdso.lds.S33
-rw-r--r--arch/x86/um/vdso/vma.c75
-rw-r--r--arch/x86/video/Makefile1
-rw-r--r--arch/x86/video/fbdev.c40
-rw-r--r--arch/x86/xen/Kconfig79
-rw-r--r--arch/x86/xen/Makefile36
-rw-r--r--arch/x86/xen/apic.c226
-rw-r--r--arch/x86/xen/debugfs.c21
-rw-r--r--arch/x86/xen/debugfs.h7
-rw-r--r--arch/x86/xen/efi.c194
-rw-r--r--arch/x86/xen/enlighten.c366
-rw-r--r--arch/x86/xen/enlighten_hvm.c270
-rw-r--r--arch/x86/xen/enlighten_pv.c1482
-rw-r--r--arch/x86/xen/enlighten_pvh.c112
-rw-r--r--arch/x86/xen/grant-table.c180
-rw-r--r--arch/x86/xen/irq.c133
-rw-r--r--arch/x86/xen/mmu.c237
-rw-r--r--arch/x86/xen/mmu.h28
-rw-r--r--arch/x86/xen/mmu_hvm.c80
-rw-r--r--arch/x86/xen/mmu_pv.c2677
-rw-r--r--arch/x86/xen/multicalls.c209
-rw-r--r--arch/x86/xen/multicalls.h69
-rw-r--r--arch/x86/xen/p2m.c858
-rw-r--r--arch/x86/xen/pci-swiotlb-xen.c94
-rw-r--r--arch/x86/xen/platform-pci-unplug.c221
-rw-r--r--arch/x86/xen/pmu.c572
-rw-r--r--arch/x86/xen/pmu.h22
-rw-r--r--arch/x86/xen/setup.c1016
-rw-r--r--arch/x86/xen/smp.c284
-rw-r--r--arch/x86/xen/smp.h43
-rw-r--r--arch/x86/xen/smp_hvm.c82
-rw-r--r--arch/x86/xen/smp_pv.c506
-rw-r--r--arch/x86/xen/spinlock.c160
-rw-r--r--arch/x86/xen/suspend.c84
-rw-r--r--arch/x86/xen/suspend_hvm.c18
-rw-r--r--arch/x86/xen/suspend_pv.c48
-rw-r--r--arch/x86/xen/time.c594
-rw-r--r--arch/x86/xen/trace.c21
-rw-r--r--arch/x86/xen/vdso.h4
-rw-r--r--arch/x86/xen/vga.c75
-rw-r--r--arch/x86/xen/xen-asm.S137
-rw-r--r--arch/x86/xen/xen-asm_32.S207
-rw-r--r--arch/x86/xen/xen-asm_64.S177
-rw-r--r--arch/x86/xen/xen-head.S104
-rw-r--r--arch/x86/xen/xen-ops.h164
-rw-r--r--arch/x86/xen/xen-pvh.S188
1378 files changed, 420722 insertions, 0 deletions
diff --git a/arch/x86/.gitignore b/arch/x86/.gitignore
new file mode 100644
index 000000000..5a82bac5e
--- /dev/null
+++ b/arch/x86/.gitignore
@@ -0,0 +1,7 @@
+boot/compressed/vmlinux
+tools/test_get_len
+tools/insn_sanity
+tools/insn_decoder_test
+purgatory/kexec-purgatory.c
+purgatory/purgatory.ro
+
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
new file mode 100644
index 000000000..0038a2d10
--- /dev/null
+++ b/arch/x86/Kbuild
@@ -0,0 +1,24 @@
+obj-y += entry/
+
+obj-$(CONFIG_PERF_EVENTS) += events/
+
+obj-$(CONFIG_KVM) += kvm/
+
+# Xen paravirtualization support
+obj-$(CONFIG_XEN) += xen/
+
+# Hyper-V paravirtualization support
+obj-$(subst m,y,$(CONFIG_HYPERV)) += hyperv/
+
+obj-y += realmode/
+obj-y += kernel/
+obj-y += mm/
+
+obj-y += crypto/
+
+obj-$(CONFIG_IA32_EMULATION) += ia32/
+
+obj-y += platform/
+obj-y += net/
+
+obj-$(CONFIG_KEXEC_FILE) += purgatory/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
new file mode 100644
index 000000000..be4403a8e
--- /dev/null
+++ b/arch/x86/Kconfig
@@ -0,0 +1,2962 @@
+# SPDX-License-Identifier: GPL-2.0
+# Select 32 or 64 bit
+config 64BIT
+ bool "64-bit kernel" if "$(ARCH)" = "x86"
+ default "$(ARCH)" != "i386"
+ ---help---
+ Say yes to build a 64-bit kernel - formerly known as x86_64
+ Say no to build a 32-bit kernel - formerly known as i386
+
+config X86_32
+ def_bool y
+ depends on !64BIT
+ # Options that are inherently 32-bit kernel only:
+ select ARCH_WANT_IPC_PARSE_VERSION
+ select CLKSRC_I8253
+ select CLONE_BACKWARDS
+ select HAVE_AOUT
+ select HAVE_GENERIC_DMA_COHERENT
+ select MODULES_USE_ELF_REL
+ select OLD_SIGACTION
+
+config X86_64
+ def_bool y
+ depends on 64BIT
+ # Options that are inherently 64-bit kernel only:
+ select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
+ select ARCH_SUPPORTS_INT128
+ select ARCH_USE_CMPXCHG_LOCKREF
+ select HAVE_ARCH_SOFT_DIRTY
+ select MODULES_USE_ELF_RELA
+ select NEED_DMA_MAP_STATE
+ select SWIOTLB
+ select X86_DEV_DMA_OPS
+ select ARCH_HAS_SYSCALL_WRAPPER
+
+#
+# Arch settings
+#
+# ( Note that options that are marked 'if X86_64' could in principle be
+# ported to 32-bit as well. )
+#
+config X86
+ def_bool y
+ #
+ # Note: keep this list sorted alphabetically
+ #
+ select ACPI_LEGACY_TABLES_LOOKUP if ACPI
+ select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
+ select ANON_INODES
+ select ARCH_CLOCKSOURCE_DATA
+ select ARCH_DISCARD_MEMBLOCK
+ select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
+ select ARCH_HAS_DEBUG_VIRTUAL
+ select ARCH_HAS_DEVMEM_IS_ALLOWED
+ select ARCH_HAS_ELF_RANDOMIZE
+ select ARCH_HAS_FAST_MULTIPLIER
+ select ARCH_HAS_FILTER_PGPROT
+ select ARCH_HAS_FORTIFY_SOURCE
+ select ARCH_HAS_GCOV_PROFILE_ALL
+ select ARCH_HAS_KCOV if X86_64
+ select ARCH_HAS_MEMBARRIER_SYNC_CORE
+ select ARCH_HAS_PMEM_API if X86_64
+ select ARCH_HAS_PTE_SPECIAL
+ select ARCH_HAS_REFCOUNT
+ select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
+ select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE
+ select ARCH_HAS_SET_MEMORY
+ select ARCH_HAS_SG_CHAIN
+ select ARCH_HAS_STRICT_KERNEL_RWX
+ select ARCH_HAS_STRICT_MODULE_RWX
+ select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
+ select ARCH_HAS_UBSAN_SANITIZE_ALL
+ select ARCH_HAS_ZONE_DEVICE if X86_64
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
+ select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
+ select ARCH_MIGHT_HAVE_PC_PARPORT
+ select ARCH_MIGHT_HAVE_PC_SERIO
+ select ARCH_SUPPORTS_ACPI
+ select ARCH_SUPPORTS_ATOMIC_RMW
+ select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
+ select ARCH_USE_BUILTIN_BSWAP
+ select ARCH_USE_QUEUED_RWLOCKS
+ select ARCH_USE_QUEUED_SPINLOCKS
+ select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ select ARCH_WANTS_DYNAMIC_TASK_STRUCT
+ select ARCH_WANTS_THP_SWAP if X86_64
+ select BUILDTIME_EXTABLE_SORT
+ select CLKEVT_I8253
+ select CLOCKSOURCE_VALIDATE_LAST_CYCLE
+ select CLOCKSOURCE_WATCHDOG
+ select DCACHE_WORD_ACCESS
+ select DMA_DIRECT_OPS
+ select EDAC_ATOMIC_SCRUB
+ select EDAC_SUPPORT
+ select GENERIC_CLOCKEVENTS
+ select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
+ select GENERIC_CLOCKEVENTS_MIN_ADJUST
+ select GENERIC_CMOS_UPDATE
+ select GENERIC_CPU_AUTOPROBE
+ select GENERIC_CPU_VULNERABILITIES
+ select GENERIC_EARLY_IOREMAP
+ select GENERIC_FIND_FIRST_BIT
+ select GENERIC_IOMAP
+ select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP
+ select GENERIC_IRQ_MATRIX_ALLOCATOR if X86_LOCAL_APIC
+ select GENERIC_IRQ_MIGRATION if SMP
+ select GENERIC_IRQ_PROBE
+ select GENERIC_IRQ_RESERVATION_MODE
+ select GENERIC_IRQ_SHOW
+ select GENERIC_PENDING_IRQ if SMP
+ select GENERIC_SMP_IDLE_THREAD
+ select GENERIC_STRNCPY_FROM_USER
+ select GENERIC_STRNLEN_USER
+ select GENERIC_TIME_VSYSCALL
+ select HARDLOCKUP_CHECK_TIMESTAMP if X86_64
+ select HAVE_ACPI_APEI if ACPI
+ select HAVE_ACPI_APEI_NMI if ACPI
+ select HAVE_ALIGNED_STRUCT_PAGE if SLUB
+ select HAVE_ARCH_AUDITSYSCALL
+ select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
+ select HAVE_ARCH_JUMP_LABEL
+ select HAVE_ARCH_KASAN if X86_64
+ select HAVE_ARCH_KGDB
+ select HAVE_ARCH_MMAP_RND_BITS if MMU
+ select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
+ select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT
+ select HAVE_ARCH_PREL32_RELOCATIONS
+ select HAVE_ARCH_SECCOMP_FILTER
+ select HAVE_ARCH_THREAD_STRUCT_WHITELIST
+ select HAVE_ARCH_TRACEHOOK
+ select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
+ select HAVE_ARCH_VMAP_STACK if X86_64
+ select HAVE_ARCH_WITHIN_STACK_FRAMES
+ select HAVE_CMPXCHG_DOUBLE
+ select HAVE_CMPXCHG_LOCAL
+ select HAVE_CONTEXT_TRACKING if X86_64
+ select HAVE_COPY_THREAD_TLS
+ select HAVE_C_RECORDMCOUNT
+ select HAVE_DEBUG_KMEMLEAK
+ select HAVE_DEBUG_STACKOVERFLOW
+ select HAVE_DMA_CONTIGUOUS
+ select HAVE_DYNAMIC_FTRACE
+ select HAVE_DYNAMIC_FTRACE_WITH_REGS
+ select HAVE_EBPF_JIT
+ select HAVE_EFFICIENT_UNALIGNED_ACCESS
+ select HAVE_EXIT_THREAD
+ select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
+ select HAVE_FTRACE_MCOUNT_RECORD
+ select HAVE_FUNCTION_GRAPH_TRACER
+ select HAVE_FUNCTION_TRACER
+ select HAVE_GCC_PLUGINS
+ select HAVE_HW_BREAKPOINT
+ select HAVE_IDE
+ select HAVE_IOREMAP_PROT
+ select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64
+ select HAVE_IRQ_TIME_ACCOUNTING
+ select HAVE_KERNEL_BZIP2
+ select HAVE_KERNEL_GZIP
+ select HAVE_KERNEL_LZ4
+ select HAVE_KERNEL_LZMA
+ select HAVE_KERNEL_LZO
+ select HAVE_KERNEL_XZ
+ select HAVE_KPROBES
+ select HAVE_KPROBES_ON_FTRACE
+ select HAVE_FUNCTION_ERROR_INJECTION
+ select HAVE_KRETPROBES
+ select HAVE_KVM
+ select HAVE_LIVEPATCH if X86_64
+ select HAVE_MEMBLOCK
+ select HAVE_MEMBLOCK_NODE_MAP
+ select HAVE_MIXED_BREAKPOINTS_REGS
+ select HAVE_MOD_ARCH_SPECIFIC
+ select HAVE_NMI
+ select HAVE_OPROFILE
+ select HAVE_OPTPROBES
+ select HAVE_PCSPKR_PLATFORM
+ select HAVE_PERF_EVENTS
+ select HAVE_PERF_EVENTS_NMI
+ select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
+ select HAVE_PERF_REGS
+ select HAVE_PERF_USER_STACK_DUMP
+ select HAVE_RCU_TABLE_FREE if PARAVIRT
+ select HAVE_RCU_TABLE_INVALIDATE if HAVE_RCU_TABLE_FREE
+ select HAVE_REGS_AND_STACK_ACCESS_API
+ select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
+ select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR
+ select HAVE_STACK_VALIDATION if X86_64
+ select HAVE_RSEQ
+ select HAVE_SYSCALL_TRACEPOINTS
+ select HAVE_UNSTABLE_SCHED_CLOCK
+ select HAVE_USER_RETURN_NOTIFIER
+ select HOTPLUG_SMT if SMP
+ select IRQ_FORCED_THREADING
+ select NEED_SG_DMA_LENGTH
+ select PCI_LOCKLESS_CONFIG
+ select PERF_EVENTS
+ select RTC_LIB
+ select RTC_MC146818_LIB
+ select SPARSE_IRQ
+ select SRCU
+ select SYSCTL_EXCEPTION_TRACE
+ select THREAD_INFO_IN_TASK
+ select USER_STACKTRACE_SUPPORT
+ select VIRT_TO_BUS
+ select X86_FEATURE_NAMES if PROC_FS
+
+config INSTRUCTION_DECODER
+ def_bool y
+ depends on KPROBES || PERF_EVENTS || UPROBES
+
+config OUTPUT_FORMAT
+ string
+ default "elf32-i386" if X86_32
+ default "elf64-x86-64" if X86_64
+
+config ARCH_DEFCONFIG
+ string
+ default "arch/x86/configs/i386_defconfig" if X86_32
+ default "arch/x86/configs/x86_64_defconfig" if X86_64
+
+config LOCKDEP_SUPPORT
+ def_bool y
+
+config STACKTRACE_SUPPORT
+ def_bool y
+
+config MMU
+ def_bool y
+
+config ARCH_MMAP_RND_BITS_MIN
+ default 28 if 64BIT
+ default 8
+
+config ARCH_MMAP_RND_BITS_MAX
+ default 32 if 64BIT
+ default 16
+
+config ARCH_MMAP_RND_COMPAT_BITS_MIN
+ default 8
+
+config ARCH_MMAP_RND_COMPAT_BITS_MAX
+ default 16
+
+config SBUS
+ bool
+
+config GENERIC_ISA_DMA
+ def_bool y
+ depends on ISA_DMA_API
+
+config GENERIC_BUG
+ def_bool y
+ depends on BUG
+ select GENERIC_BUG_RELATIVE_POINTERS if X86_64
+
+config GENERIC_BUG_RELATIVE_POINTERS
+ bool
+
+config GENERIC_HWEIGHT
+ def_bool y
+
+config ARCH_MAY_HAVE_PC_FDC
+ def_bool y
+ depends on ISA_DMA_API
+
+config RWSEM_XCHGADD_ALGORITHM
+ def_bool y
+
+config GENERIC_CALIBRATE_DELAY
+ def_bool y
+
+config ARCH_HAS_CPU_RELAX
+ def_bool y
+
+config ARCH_HAS_CACHE_LINE_SIZE
+ def_bool y
+
+config ARCH_HAS_FILTER_PGPROT
+ def_bool y
+
+config HAVE_SETUP_PER_CPU_AREA
+ def_bool y
+
+config NEED_PER_CPU_EMBED_FIRST_CHUNK
+ def_bool y
+
+config NEED_PER_CPU_PAGE_FIRST_CHUNK
+ def_bool y
+
+config ARCH_HIBERNATION_POSSIBLE
+ def_bool y
+
+config ARCH_SUSPEND_POSSIBLE
+ def_bool y
+
+config ARCH_WANT_HUGE_PMD_SHARE
+ def_bool y
+
+config ARCH_WANT_GENERAL_HUGETLB
+ def_bool y
+
+config ZONE_DMA32
+ def_bool y if X86_64
+
+config AUDIT_ARCH
+ def_bool y if X86_64
+
+config ARCH_SUPPORTS_OPTIMIZED_INLINING
+ def_bool y
+
+config ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ def_bool y
+
+config KASAN_SHADOW_OFFSET
+ hex
+ depends on KASAN
+ default 0xdffffc0000000000
+
+config HAVE_INTEL_TXT
+ def_bool y
+ depends on INTEL_IOMMU && ACPI
+
+config X86_32_SMP
+ def_bool y
+ depends on X86_32 && SMP
+
+config X86_64_SMP
+ def_bool y
+ depends on X86_64 && SMP
+
+config X86_32_LAZY_GS
+ def_bool y
+ depends on X86_32 && !STACKPROTECTOR
+
+config ARCH_SUPPORTS_UPROBES
+ def_bool y
+
+config FIX_EARLYCON_MEM
+ def_bool y
+
+config DYNAMIC_PHYSICAL_MASK
+ bool
+
+config PGTABLE_LEVELS
+ int
+ default 5 if X86_5LEVEL
+ default 4 if X86_64
+ default 3 if X86_PAE
+ default 2
+
+config CC_HAS_SANE_STACKPROTECTOR
+ bool
+ default $(success,$(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC)) if 64BIT
+ default $(success,$(srctree)/scripts/gcc-x86_32-has-stack-protector.sh $(CC))
+ help
+ We have to make sure stack protector is unconditionally disabled if
+ the compiler produces broken code.
+
+menu "Processor type and features"
+
+config ZONE_DMA
+ bool "DMA memory allocation support" if EXPERT
+ default y
+ help
+ DMA memory allocation support allows devices with less than 32-bit
+ addressing to allocate within the first 16MB of address space.
+ Disable if no such devices will be used.
+
+ If unsure, say Y.
+
+config SMP
+ bool "Symmetric multi-processing support"
+ ---help---
+ This enables support for systems with more than one CPU. If you have
+ a system with only one CPU, say N. If you have a system with more
+ than one CPU, say Y.
+
+ If you say N here, the kernel will run on uni- and multiprocessor
+ machines, but will use only one CPU of a multiprocessor machine. If
+ you say Y here, the kernel will run on many, but not all,
+ uniprocessor machines. On a uniprocessor machine, the kernel
+ will run faster if you say N here.
+
+ Note that if you say Y here and choose architecture "586" or
+ "Pentium" under "Processor family", the kernel will not work on 486
+ architectures. Similarly, multiprocessor kernels for the "PPro"
+ architecture may not work on all Pentium based boards.
+
+ People using multiprocessor machines who say Y here should also say
+ Y to "Enhanced Real Time Clock Support", below. The "Advanced Power
+ Management" code will be disabled if you say Y here.
+
+ See also <file:Documentation/x86/i386/IO-APIC.txt>,
+ <file:Documentation/lockup-watchdogs.txt> and the SMP-HOWTO available at
+ <http://www.tldp.org/docs.html#howto>.
+
+ If you don't know what to do here, say N.
+
+config X86_FEATURE_NAMES
+ bool "Processor feature human-readable names" if EMBEDDED
+ default y
+ ---help---
+ This option compiles in a table of x86 feature bits and corresponding
+ names. This is required to support /proc/cpuinfo and a few kernel
+ messages. You can disable this to save space, at the expense of
+ making those few kernel messages show numeric feature bits instead.
+
+ If in doubt, say Y.
+
+config X86_X2APIC
+ bool "Support x2apic"
+ depends on X86_LOCAL_APIC && X86_64 && (IRQ_REMAP || HYPERVISOR_GUEST)
+ ---help---
+ This enables x2apic support on CPUs that have this feature.
+
+ This allows 32-bit apic IDs (so it can support very large systems),
+ and accesses the local apic via MSRs not via mmio.
+
+ If you don't know what to do here, say N.
+
+config X86_MPPARSE
+ bool "Enable MPS table" if ACPI || SFI
+ default y
+ depends on X86_LOCAL_APIC
+ ---help---
+ For old smp systems that do not have proper acpi support. Newer systems
+ (esp with 64bit cpus) with acpi support, MADT and DSDT will override it
+
+config GOLDFISH
+ def_bool y
+ depends on X86_GOLDFISH
+
+config RETPOLINE
+ bool "Avoid speculative indirect branches in kernel"
+ default y
+ select STACK_VALIDATION if HAVE_STACK_VALIDATION
+ help
+ Compile kernel with the retpoline compiler options to guard against
+ kernel-to-user data leaks by avoiding speculative indirect
+ branches. Requires a compiler with -mindirect-branch=thunk-extern
+ support for full protection. The kernel may run slower.
+
+config INTEL_RDT
+ bool "Intel Resource Director Technology support"
+ default n
+ depends on X86 && CPU_SUP_INTEL
+ select KERNFS
+ help
+ Select to enable resource allocation and monitoring which are
+ sub-features of Intel Resource Director Technology(RDT). More
+ information about RDT can be found in the Intel x86
+ Architecture Software Developer Manual.
+
+ Say N if unsure.
+
+if X86_32
+config X86_BIGSMP
+ bool "Support for big SMP systems with more than 8 CPUs"
+ depends on SMP
+ ---help---
+ This option is needed for the systems that have more than 8 CPUs
+
+config X86_EXTENDED_PLATFORM
+ bool "Support for extended (non-PC) x86 platforms"
+ default y
+ ---help---
+ If you disable this option then the kernel will only support
+ standard PC platforms. (which covers the vast majority of
+ systems out there.)
+
+ If you enable this option then you'll be able to select support
+ for the following (non-PC) 32 bit x86 platforms:
+ Goldfish (Android emulator)
+ AMD Elan
+ RDC R-321x SoC
+ SGI 320/540 (Visual Workstation)
+ STA2X11-based (e.g. Northville)
+ Moorestown MID devices
+
+ If you have one of these systems, or if you want to build a
+ generic distribution kernel, say Y here - otherwise say N.
+endif
+
+if X86_64
+config X86_EXTENDED_PLATFORM
+ bool "Support for extended (non-PC) x86 platforms"
+ default y
+ ---help---
+ If you disable this option then the kernel will only support
+ standard PC platforms. (which covers the vast majority of
+ systems out there.)
+
+ If you enable this option then you'll be able to select support
+ for the following (non-PC) 64 bit x86 platforms:
+ Numascale NumaChip
+ ScaleMP vSMP
+ SGI Ultraviolet
+
+ If you have one of these systems, or if you want to build a
+ generic distribution kernel, say Y here - otherwise say N.
+endif
+# This is an alphabetically sorted list of 64 bit extended platforms
+# Please maintain the alphabetic order if and when there are additions
+config X86_NUMACHIP
+ bool "Numascale NumaChip"
+ depends on X86_64
+ depends on X86_EXTENDED_PLATFORM
+ depends on NUMA
+ depends on SMP
+ depends on X86_X2APIC
+ depends on PCI_MMCONFIG
+ ---help---
+ Adds support for Numascale NumaChip large-SMP systems. Needed to
+ enable more than ~168 cores.
+ If you don't have one of these, you should say N here.
+
+config X86_VSMP
+ bool "ScaleMP vSMP"
+ select HYPERVISOR_GUEST
+ select PARAVIRT
+ depends on X86_64 && PCI
+ depends on X86_EXTENDED_PLATFORM
+ depends on SMP
+ ---help---
+ Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is
+ supposed to run on these EM64T-based machines. Only choose this option
+ if you have one of these machines.
+
+config X86_UV
+ bool "SGI Ultraviolet"
+ depends on X86_64
+ depends on X86_EXTENDED_PLATFORM
+ depends on NUMA
+ depends on EFI
+ depends on KEXEC_CORE
+ depends on X86_X2APIC
+ depends on PCI
+ ---help---
+ This option is needed in order to support SGI Ultraviolet systems.
+ If you don't have one of these, you should say N here.
+
+# Following is an alphabetically sorted list of 32 bit extended platforms
+# Please maintain the alphabetic order if and when there are additions
+
+config X86_GOLDFISH
+ bool "Goldfish (Virtual Platform)"
+ depends on X86_EXTENDED_PLATFORM
+ ---help---
+ Enable support for the Goldfish virtual platform used primarily
+ for Android development. Unless you are building for the Android
+ Goldfish emulator say N here.
+
+config X86_INTEL_CE
+ bool "CE4100 TV platform"
+ depends on PCI
+ depends on PCI_GODIRECT
+ depends on X86_IO_APIC
+ depends on X86_32
+ depends on X86_EXTENDED_PLATFORM
+ select X86_REBOOTFIXUPS
+ select OF
+ select OF_EARLY_FLATTREE
+ ---help---
+ Select for the Intel CE media processor (CE4100) SOC.
+ This option compiles in support for the CE4100 SOC for settop
+ boxes and media devices.
+
+config X86_INTEL_MID
+ bool "Intel MID platform support"
+ depends on X86_EXTENDED_PLATFORM
+ depends on X86_PLATFORM_DEVICES
+ depends on PCI
+ depends on X86_64 || (PCI_GOANY && X86_32)
+ depends on X86_IO_APIC
+ select SFI
+ select I2C
+ select DW_APB_TIMER
+ select APB_TIMER
+ select INTEL_SCU_IPC
+ select MFD_INTEL_MSIC
+ ---help---
+ Select to build a kernel capable of supporting Intel MID (Mobile
+ Internet Device) platform systems which do not have the PCI legacy
+ interfaces. If you are building for a PC class system say N here.
+
+ Intel MID platforms are based on an Intel processor and chipset which
+ consume less power than most of the x86 derivatives.
+
+config X86_INTEL_QUARK
+ bool "Intel Quark platform support"
+ depends on X86_32
+ depends on X86_EXTENDED_PLATFORM
+ depends on X86_PLATFORM_DEVICES
+ depends on X86_TSC
+ depends on PCI
+ depends on PCI_GOANY
+ depends on X86_IO_APIC
+ select IOSF_MBI
+ select INTEL_IMR
+ select COMMON_CLK
+ ---help---
+ Select to include support for Quark X1000 SoC.
+ Say Y here if you have a Quark based system such as the Arduino
+ compatible Intel Galileo.
+
+config X86_INTEL_LPSS
+ bool "Intel Low Power Subsystem Support"
+ depends on X86 && ACPI
+ select COMMON_CLK
+ select PINCTRL
+ select IOSF_MBI
+ ---help---
+ Select to build support for Intel Low Power Subsystem such as
+ found on Intel Lynxpoint PCH. Selecting this option enables
+ things like clock tree (common clock framework) and pincontrol
+ which are needed by the LPSS peripheral drivers.
+
+config X86_AMD_PLATFORM_DEVICE
+ bool "AMD ACPI2Platform devices support"
+ depends on ACPI
+ select COMMON_CLK
+ select PINCTRL
+ ---help---
+ Select to interpret AMD specific ACPI device to platform device
+ such as I2C, UART, GPIO found on AMD Carrizo and later chipsets.
+ I2C and UART depend on COMMON_CLK to set clock. GPIO driver is
+ implemented under PINCTRL subsystem.
+
+config IOSF_MBI
+ tristate "Intel SoC IOSF Sideband support for SoC platforms"
+ depends on PCI
+ ---help---
+ This option enables sideband register access support for Intel SoC
+ platforms. On these platforms the IOSF sideband is used in lieu of
+ MSR's for some register accesses, mostly but not limited to thermal
+ and power. Drivers may query the availability of this device to
+ determine if they need the sideband in order to work on these
+ platforms. The sideband is available on the following SoC products.
+ This list is not meant to be exclusive.
+ - BayTrail
+ - Braswell
+ - Quark
+
+ You should say Y if you are running a kernel on one of these SoC's.
+
+config IOSF_MBI_DEBUG
+ bool "Enable IOSF sideband access through debugfs"
+ depends on IOSF_MBI && DEBUG_FS
+ ---help---
+ Select this option to expose the IOSF sideband access registers (MCR,
+ MDR, MCRX) through debugfs to write and read register information from
+ different units on the SoC. This is most useful for obtaining device
+ state information for debug and analysis. As this is a general access
+ mechanism, users of this option would have specific knowledge of the
+ device they want to access.
+
+ If you don't require the option or are in doubt, say N.
+
+config X86_RDC321X
+ bool "RDC R-321x SoC"
+ depends on X86_32
+ depends on X86_EXTENDED_PLATFORM
+ select M486
+ select X86_REBOOTFIXUPS
+ ---help---
+ This option is needed for RDC R-321x system-on-chip, also known
+ as R-8610-(G).
+ If you don't have one of these chips, you should say N here.
+
+config X86_32_NON_STANDARD
+ bool "Support non-standard 32-bit SMP architectures"
+ depends on X86_32 && SMP
+ depends on X86_EXTENDED_PLATFORM
+ ---help---
+ This option compiles in the bigsmp and STA2X11 default
+ subarchitectures. It is intended for a generic binary
+ kernel. If you select them all, kernel will probe it one by
+ one and will fallback to default.
+
+# Alphabetically sorted list of Non standard 32 bit platforms
+
+config X86_SUPPORTS_MEMORY_FAILURE
+ def_bool y
+ # MCE code calls memory_failure():
+ depends on X86_MCE
+ # On 32-bit this adds too big of NODES_SHIFT and we run out of page flags:
+ # On 32-bit SPARSEMEM adds too big of SECTIONS_WIDTH:
+ depends on X86_64 || !SPARSEMEM
+ select ARCH_SUPPORTS_MEMORY_FAILURE
+
+config STA2X11
+ bool "STA2X11 Companion Chip Support"
+ depends on X86_32_NON_STANDARD && PCI
+ select ARCH_HAS_PHYS_TO_DMA
+ select X86_DEV_DMA_OPS
+ select X86_DMA_REMAP
+ select SWIOTLB
+ select MFD_STA2X11
+ select GPIOLIB
+ default n
+ ---help---
+ This adds support for boards based on the STA2X11 IO-Hub,
+ a.k.a. "ConneXt". The chip is used in place of the standard
+ PC chipset, so all "standard" peripherals are missing. If this
+ option is selected the kernel will still be able to boot on
+ standard PC machines.
+
+config X86_32_IRIS
+ tristate "Eurobraille/Iris poweroff module"
+ depends on X86_32
+ ---help---
+ The Iris machines from EuroBraille do not have APM or ACPI support
+ to shut themselves down properly. A special I/O sequence is
+ needed to do so, which is what this module does at
+ kernel shutdown.
+
+ This is only for Iris machines from EuroBraille.
+
+ If unused, say N.
+
+config SCHED_OMIT_FRAME_POINTER
+ def_bool y
+ prompt "Single-depth WCHAN output"
+ depends on X86
+ ---help---
+ Calculate simpler /proc/<PID>/wchan values. If this option
+ is disabled then wchan values will recurse back to the
+ caller function. This provides more accurate wchan values,
+ at the expense of slightly more scheduling overhead.
+
+ If in doubt, say "Y".
+
+menuconfig HYPERVISOR_GUEST
+ bool "Linux guest support"
+ ---help---
+ Say Y here to enable options for running Linux under various hyper-
+ visors. This option enables basic hypervisor detection and platform
+ setup.
+
+ If you say N, all options in this submenu will be skipped and
+ disabled, and Linux guest support won't be built in.
+
+if HYPERVISOR_GUEST
+
+config PARAVIRT
+ bool "Enable paravirtualization code"
+ ---help---
+ This changes the kernel so it can modify itself when it is run
+ under a hypervisor, potentially improving performance significantly
+ over full virtualization. However, when run without a hypervisor
+ the kernel is theoretically slower and slightly larger.
+
+config PARAVIRT_DEBUG
+ bool "paravirt-ops debugging"
+ depends on PARAVIRT && DEBUG_KERNEL
+ ---help---
+ Enable to debug paravirt_ops internals. Specifically, BUG if
+ a paravirt_op is missing when it is called.
+
+config PARAVIRT_SPINLOCKS
+ bool "Paravirtualization layer for spinlocks"
+ depends on PARAVIRT && SMP
+ ---help---
+ Paravirtualized spinlocks allow a pvops backend to replace the
+ spinlock implementation with something virtualization-friendly
+ (for example, block the virtual CPU rather than spinning).
+
+ It has a minimal impact on native kernels and gives a nice performance
+ benefit on paravirtualized KVM / Xen kernels.
+
+ If you are unsure how to answer this question, answer Y.
+
+config QUEUED_LOCK_STAT
+ bool "Paravirt queued spinlock statistics"
+ depends on PARAVIRT_SPINLOCKS && DEBUG_FS
+ ---help---
+ Enable the collection of statistical data on the slowpath
+ behavior of paravirtualized queued spinlocks and report
+ them on debugfs.
+
+source "arch/x86/xen/Kconfig"
+
+config KVM_GUEST
+ bool "KVM Guest support (including kvmclock)"
+ depends on PARAVIRT
+ select PARAVIRT_CLOCK
+ default y
+ ---help---
+ This option enables various optimizations for running under the KVM
+ hypervisor. It includes a paravirtualized clock, so that instead
+ of relying on a PIT (or probably other) emulation by the
+ underlying device model, the host provides the guest with
+ timing infrastructure such as time of day, and system time
+
+config KVM_DEBUG_FS
+ bool "Enable debug information for KVM Guests in debugfs"
+ depends on KVM_GUEST && DEBUG_FS
+ default n
+ ---help---
+ This option enables collection of various statistics for KVM guest.
+ Statistics are displayed in debugfs filesystem. Enabling this option
+ may incur significant overhead.
+
+config PARAVIRT_TIME_ACCOUNTING
+ bool "Paravirtual steal time accounting"
+ depends on PARAVIRT
+ default n
+ ---help---
+ Select this option to enable fine granularity task steal time
+ accounting. Time spent executing other tasks in parallel with
+ the current vCPU is discounted from the vCPU power. To account for
+ that, there can be a small performance impact.
+
+ If in doubt, say N here.
+
+config PARAVIRT_CLOCK
+ bool
+
+config JAILHOUSE_GUEST
+ bool "Jailhouse non-root cell support"
+ depends on X86_64 && PCI
+ select X86_PM_TIMER
+ ---help---
+ This option allows to run Linux as guest in a Jailhouse non-root
+ cell. You can leave this option disabled if you only want to start
+ Jailhouse and run Linux afterwards in the root cell.
+
+endif #HYPERVISOR_GUEST
+
+config NO_BOOTMEM
+ def_bool y
+
+source "arch/x86/Kconfig.cpu"
+
+config HPET_TIMER
+ def_bool X86_64
+ prompt "HPET Timer Support" if X86_32
+ ---help---
+ Use the IA-PC HPET (High Precision Event Timer) to manage
+ time in preference to the PIT and RTC, if a HPET is
+ present.
+ HPET is the next generation timer replacing legacy 8254s.
+ The HPET provides a stable time base on SMP
+ systems, unlike the TSC, but it is more expensive to access,
+ as it is off-chip. The interface used is documented
+ in the HPET spec, revision 1.
+
+ You can safely choose Y here. However, HPET will only be
+ activated if the platform and the BIOS support this feature.
+ Otherwise the 8254 will be used for timing services.
+
+ Choose N to continue using the legacy 8254 timer.
+
+config HPET_EMULATE_RTC
+ def_bool y
+ depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
+
+config APB_TIMER
+ def_bool y if X86_INTEL_MID
+ prompt "Intel MID APB Timer Support" if X86_INTEL_MID
+ select DW_APB_TIMER
+ depends on X86_INTEL_MID && SFI
+ help
+ APB timer is the replacement for 8254, HPET on X86 MID platforms.
+ The APBT provides a stable time base on SMP
+ systems, unlike the TSC, but it is more expensive to access,
+ as it is off-chip. APB timers are always running regardless of CPU
+ C states, they are used as per CPU clockevent device when possible.
+
+# Mark as expert because too many people got it wrong.
+# The code disables itself when not needed.
+config DMI
+ default y
+ select DMI_SCAN_MACHINE_NON_EFI_FALLBACK
+ bool "Enable DMI scanning" if EXPERT
+ ---help---
+ Enabled scanning of DMI to identify machine quirks. Say Y
+ here unless you have verified that your setup is not
+ affected by entries in the DMI blacklist. Required by PNP
+ BIOS code.
+
+config GART_IOMMU
+ bool "Old AMD GART IOMMU support"
+ select IOMMU_HELPER
+ select SWIOTLB
+ depends on X86_64 && PCI && AMD_NB
+ ---help---
+ Provides a driver for older AMD Athlon64/Opteron/Turion/Sempron
+ GART based hardware IOMMUs.
+
+ The GART supports full DMA access for devices with 32-bit access
+ limitations, on systems with more than 3 GB. This is usually needed
+ for USB, sound, many IDE/SATA chipsets and some other devices.
+
+ Newer systems typically have a modern AMD IOMMU, supported via
+ the CONFIG_AMD_IOMMU=y config option.
+
+ In normal configurations this driver is only active when needed:
+ there's more than 3 GB of memory and the system contains a
+ 32-bit limited device.
+
+ If unsure, say Y.
+
+config CALGARY_IOMMU
+ bool "IBM Calgary IOMMU support"
+ select IOMMU_HELPER
+ select SWIOTLB
+ depends on X86_64 && PCI
+ ---help---
+ Support for hardware IOMMUs in IBM's xSeries x366 and x460
+ systems. Needed to run systems with more than 3GB of memory
+ properly with 32-bit PCI devices that do not support DAC
+ (Double Address Cycle). Calgary also supports bus level
+ isolation, where all DMAs pass through the IOMMU. This
+ prevents them from going anywhere except their intended
+ destination. This catches hard-to-find kernel bugs and
+ mis-behaving drivers and devices that do not use the DMA-API
+ properly to set up their DMA buffers. The IOMMU can be
+ turned off at boot time with the iommu=off parameter.
+ Normally the kernel will make the right choice by itself.
+ If unsure, say Y.
+
+config CALGARY_IOMMU_ENABLED_BY_DEFAULT
+ def_bool y
+ prompt "Should Calgary be enabled by default?"
+ depends on CALGARY_IOMMU
+ ---help---
+ Should Calgary be enabled by default? if you choose 'y', Calgary
+ will be used (if it exists). If you choose 'n', Calgary will not be
+ used even if it exists. If you choose 'n' and would like to use
+ Calgary anyway, pass 'iommu=calgary' on the kernel command line.
+ If unsure, say Y.
+
+config MAXSMP
+ bool "Enable Maximum number of SMP Processors and NUMA Nodes"
+ depends on X86_64 && SMP && DEBUG_KERNEL
+ select CPUMASK_OFFSTACK
+ ---help---
+ Enable maximum number of CPUS and NUMA Nodes for this architecture.
+ If unsure, say N.
+
+#
+# The maximum number of CPUs supported:
+#
+# The main config value is NR_CPUS, which defaults to NR_CPUS_DEFAULT,
+# and which can be configured interactively in the
+# [NR_CPUS_RANGE_BEGIN ... NR_CPUS_RANGE_END] range.
+#
+# The ranges are different on 32-bit and 64-bit kernels, depending on
+# hardware capabilities and scalability features of the kernel.
+#
+# ( If MAXSMP is enabled we just use the highest possible value and disable
+# interactive configuration. )
+#
+
+config NR_CPUS_RANGE_BEGIN
+ int
+ default NR_CPUS_RANGE_END if MAXSMP
+ default 1 if !SMP
+ default 2
+
+config NR_CPUS_RANGE_END
+ int
+ depends on X86_32
+ default 64 if SMP && X86_BIGSMP
+ default 8 if SMP && !X86_BIGSMP
+ default 1 if !SMP
+
+config NR_CPUS_RANGE_END
+ int
+ depends on X86_64
+ default 8192 if SMP && ( MAXSMP || CPUMASK_OFFSTACK)
+ default 512 if SMP && (!MAXSMP && !CPUMASK_OFFSTACK)
+ default 1 if !SMP
+
+config NR_CPUS_DEFAULT
+ int
+ depends on X86_32
+ default 32 if X86_BIGSMP
+ default 8 if SMP
+ default 1 if !SMP
+
+config NR_CPUS_DEFAULT
+ int
+ depends on X86_64
+ default 8192 if MAXSMP
+ default 64 if SMP
+ default 1 if !SMP
+
+config NR_CPUS
+ int "Maximum number of CPUs" if SMP && !MAXSMP
+ range NR_CPUS_RANGE_BEGIN NR_CPUS_RANGE_END
+ default NR_CPUS_DEFAULT
+ ---help---
+ This allows you to specify the maximum number of CPUs which this
+ kernel will support. If CPUMASK_OFFSTACK is enabled, the maximum
+ supported value is 8192, otherwise the maximum value is 512. The
+ minimum value which makes sense is 2.
+
+ This is purely to save memory: each supported CPU adds about 8KB
+ to the kernel image.
+
+config SCHED_SMT
+ def_bool y if SMP
+
+config SCHED_MC
+ def_bool y
+ prompt "Multi-core scheduler support"
+ depends on SMP
+ ---help---
+ Multi-core scheduler support improves the CPU scheduler's decision
+ making when dealing with multi-core CPU chips at a cost of slightly
+ increased overhead in some places. If unsure say N here.
+
+config SCHED_MC_PRIO
+ bool "CPU core priorities scheduler support"
+ depends on SCHED_MC && CPU_SUP_INTEL
+ select X86_INTEL_PSTATE
+ select CPU_FREQ
+ default y
+ ---help---
+ Intel Turbo Boost Max Technology 3.0 enabled CPUs have a
+ core ordering determined at manufacturing time, which allows
+ certain cores to reach higher turbo frequencies (when running
+ single threaded workloads) than others.
+
+ Enabling this kernel feature teaches the scheduler about
+ the TBM3 (aka ITMT) priority order of the CPU cores and adjusts the
+ scheduler's CPU selection logic accordingly, so that higher
+ overall system performance can be achieved.
+
+ This feature will have no effect on CPUs without this feature.
+
+ If unsure say Y here.
+
+config UP_LATE_INIT
+ def_bool y
+ depends on !SMP && X86_LOCAL_APIC
+
+config X86_UP_APIC
+ bool "Local APIC support on uniprocessors" if !PCI_MSI
+ default PCI_MSI
+ depends on X86_32 && !SMP && !X86_32_NON_STANDARD
+ ---help---
+ A local APIC (Advanced Programmable Interrupt Controller) is an
+ integrated interrupt controller in the CPU. If you have a single-CPU
+ system which has a processor with a local APIC, you can say Y here to
+ enable and use it. If you say Y here even though your machine doesn't
+ have a local APIC, then the kernel will still run with no slowdown at
+ all. The local APIC supports CPU-generated self-interrupts (timer,
+ performance counters), and the NMI watchdog which detects hard
+ lockups.
+
+config X86_UP_IOAPIC
+ bool "IO-APIC support on uniprocessors"
+ depends on X86_UP_APIC
+ ---help---
+ An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an
+ SMP-capable replacement for PC-style interrupt controllers. Most
+ SMP systems and many recent uniprocessor systems have one.
+
+ If you have a single-CPU system with an IO-APIC, you can say Y here
+ to use it. If you say Y here even though your machine doesn't have
+ an IO-APIC, then the kernel will still run with no slowdown at all.
+
+config X86_LOCAL_APIC
+ def_bool y
+ depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI
+ select IRQ_DOMAIN_HIERARCHY
+ select PCI_MSI_IRQ_DOMAIN if PCI_MSI
+
+config X86_IO_APIC
+ def_bool y
+ depends on X86_LOCAL_APIC || X86_UP_IOAPIC
+
+config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
+ bool "Reroute for broken boot IRQs"
+ depends on X86_IO_APIC
+ ---help---
+ This option enables a workaround that fixes a source of
+ spurious interrupts. This is recommended when threaded
+ interrupt handling is used on systems where the generation of
+ superfluous "boot interrupts" cannot be disabled.
+
+ Some chipsets generate a legacy INTx "boot IRQ" when the IRQ
+ entry in the chipset's IO-APIC is masked (as, e.g. the RT
+ kernel does during interrupt handling). On chipsets where this
+ boot IRQ generation cannot be disabled, this workaround keeps
+ the original IRQ line masked so that only the equivalent "boot
+ IRQ" is delivered to the CPUs. The workaround also tells the
+ kernel to set up the IRQ handler on the boot IRQ line. In this
+ way only one interrupt is delivered to the kernel. Otherwise
+ the spurious second interrupt may cause the kernel to bring
+ down (vital) interrupt lines.
+
+ Only affects "broken" chipsets. Interrupt sharing may be
+ increased on these systems.
+
+config X86_MCE
+ bool "Machine Check / overheating reporting"
+ select GENERIC_ALLOCATOR
+ default y
+ ---help---
+ Machine Check support allows the processor to notify the
+ kernel if it detects a problem (e.g. overheating, data corruption).
+ The action the kernel takes depends on the severity of the problem,
+ ranging from warning messages to halting the machine.
+
+config X86_MCELOG_LEGACY
+ bool "Support for deprecated /dev/mcelog character device"
+ depends on X86_MCE
+ ---help---
+ Enable support for /dev/mcelog which is needed by the old mcelog
+ userspace logging daemon. Consider switching to the new generation
+ rasdaemon solution.
+
+config X86_MCE_INTEL
+ def_bool y
+ prompt "Intel MCE features"
+ depends on X86_MCE && X86_LOCAL_APIC
+ ---help---
+ Additional support for intel specific MCE features such as
+ the thermal monitor.
+
+config X86_MCE_AMD
+ def_bool y
+ prompt "AMD MCE features"
+ depends on X86_MCE && X86_LOCAL_APIC && AMD_NB
+ ---help---
+ Additional support for AMD specific MCE features such as
+ the DRAM Error Threshold.
+
+config X86_ANCIENT_MCE
+ bool "Support for old Pentium 5 / WinChip machine checks"
+ depends on X86_32 && X86_MCE
+ ---help---
+ Include support for machine check handling on old Pentium 5 or WinChip
+ systems. These typically need to be enabled explicitly on the command
+ line.
+
+config X86_MCE_THRESHOLD
+ depends on X86_MCE_AMD || X86_MCE_INTEL
+ def_bool y
+
+config X86_MCE_INJECT
+ depends on X86_MCE && X86_LOCAL_APIC && DEBUG_FS
+ tristate "Machine check injector support"
+ ---help---
+ Provide support for injecting machine checks for testing purposes.
+ If you don't know what a machine check is and you don't do kernel
+ QA it is safe to say n.
+
+config X86_THERMAL_VECTOR
+ def_bool y
+ depends on X86_MCE_INTEL
+
+source "arch/x86/events/Kconfig"
+
+config X86_LEGACY_VM86
+ bool "Legacy VM86 support"
+ default n
+ depends on X86_32
+ ---help---
+ This option allows user programs to put the CPU into V8086
+ mode, which is an 80286-era approximation of 16-bit real mode.
+
+ Some very old versions of X and/or vbetool require this option
+ for user mode setting. Similarly, DOSEMU will use it if
+ available to accelerate real mode DOS programs. However, any
+ recent version of DOSEMU, X, or vbetool should be fully
+ functional even without kernel VM86 support, as they will all
+ fall back to software emulation. Nevertheless, if you are using
+ a 16-bit DOS program where 16-bit performance matters, vm86
+ mode might be faster than emulation and you might want to
+ enable this option.
+
+ Note that any app that works on a 64-bit kernel is unlikely to
+ need this option, as 64-bit kernels don't, and can't, support
+ V8086 mode. This option is also unrelated to 16-bit protected
+ mode and is not needed to run most 16-bit programs under Wine.
+
+ Enabling this option increases the complexity of the kernel
+ and slows down exception handling a tiny bit.
+
+ If unsure, say N here.
+
+config VM86
+ bool
+ default X86_LEGACY_VM86
+
+config X86_16BIT
+ bool "Enable support for 16-bit segments" if EXPERT
+ default y
+ depends on MODIFY_LDT_SYSCALL
+ ---help---
+ This option is required by programs like Wine to run 16-bit
+ protected mode legacy code on x86 processors. Disabling
+ this option saves about 300 bytes on i386, or around 6K text
+ plus 16K runtime memory on x86-64,
+
+config X86_ESPFIX32
+ def_bool y
+ depends on X86_16BIT && X86_32
+
+config X86_ESPFIX64
+ def_bool y
+ depends on X86_16BIT && X86_64
+
+config X86_VSYSCALL_EMULATION
+ bool "Enable vsyscall emulation" if EXPERT
+ default y
+ depends on X86_64
+ ---help---
+ This enables emulation of the legacy vsyscall page. Disabling
+ it is roughly equivalent to booting with vsyscall=none, except
+ that it will also disable the helpful warning if a program
+ tries to use a vsyscall. With this option set to N, offending
+ programs will just segfault, citing addresses of the form
+ 0xffffffffff600?00.
+
+ This option is required by many programs built before 2013, and
+ care should be used even with newer programs if set to N.
+
+ Disabling this option saves about 7K of kernel size and
+ possibly 4K of additional runtime pagetable memory.
+
+config TOSHIBA
+ tristate "Toshiba Laptop support"
+ depends on X86_32
+ ---help---
+ This adds a driver to safely access the System Management Mode of
+ the CPU on Toshiba portables with a genuine Toshiba BIOS. It does
+ not work on models with a Phoenix BIOS. The System Management Mode
+ is used to set the BIOS and power saving options on Toshiba portables.
+
+ For information on utilities to make use of this driver see the
+ Toshiba Linux utilities web site at:
+ <http://www.buzzard.org.uk/toshiba/>.
+
+ Say Y if you intend to run this kernel on a Toshiba portable.
+ Say N otherwise.
+
+config I8K
+ tristate "Dell i8k legacy laptop support"
+ select HWMON
+ select SENSORS_DELL_SMM
+ ---help---
+ This option enables legacy /proc/i8k userspace interface in hwmon
+ dell-smm-hwmon driver. Character file /proc/i8k reports bios version,
+ temperature and allows controlling fan speeds of Dell laptops via
+ System Management Mode. For old Dell laptops (like Dell Inspiron 8000)
+ it reports also power and hotkey status. For fan speed control is
+ needed userspace package i8kutils.
+
+ Say Y if you intend to run this kernel on old Dell laptops or want to
+ use userspace package i8kutils.
+ Say N otherwise.
+
+config X86_REBOOTFIXUPS
+ bool "Enable X86 board specific fixups for reboot"
+ depends on X86_32
+ ---help---
+ This enables chipset and/or board specific fixups to be done
+ in order to get reboot to work correctly. This is only needed on
+ some combinations of hardware and BIOS. The symptom, for which
+ this config is intended, is when reboot ends with a stalled/hung
+ system.
+
+ Currently, the only fixup is for the Geode machines using
+ CS5530A and CS5536 chipsets and the RDC R-321x SoC.
+
+ Say Y if you want to enable the fixup. Currently, it's safe to
+ enable this option even if you don't need it.
+ Say N otherwise.
+
+config MICROCODE
+ bool "CPU microcode loading support"
+ default y
+ depends on CPU_SUP_AMD || CPU_SUP_INTEL
+ select FW_LOADER
+ ---help---
+ If you say Y here, you will be able to update the microcode on
+ Intel and AMD processors. The Intel support is for the IA32 family,
+ e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, Xeon etc. The
+ AMD support is for families 0x10 and later. You will obviously need
+ the actual microcode binary data itself which is not shipped with
+ the Linux kernel.
+
+ The preferred method to load microcode from a detached initrd is described
+ in Documentation/x86/microcode.txt. For that you need to enable
+ CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the
+ initrd for microcode blobs.
+
+ In addition, you can build the microcode into the kernel. For that you
+ need to add the vendor-supplied microcode to the CONFIG_EXTRA_FIRMWARE
+ config option.
+
+config MICROCODE_INTEL
+ bool "Intel microcode loading support"
+ depends on MICROCODE
+ default MICROCODE
+ select FW_LOADER
+ ---help---
+ This options enables microcode patch loading support for Intel
+ processors.
+
+ For the current Intel microcode data package go to
+ <https://downloadcenter.intel.com> and search for
+ 'Linux Processor Microcode Data File'.
+
+config MICROCODE_AMD
+ bool "AMD microcode loading support"
+ depends on MICROCODE
+ select FW_LOADER
+ ---help---
+ If you select this option, microcode patch loading support for AMD
+ processors will be enabled.
+
+config MICROCODE_OLD_INTERFACE
+ def_bool y
+ depends on MICROCODE
+
+config X86_MSR
+ tristate "/dev/cpu/*/msr - Model-specific register support"
+ ---help---
+ This device gives privileged processes access to the x86
+ Model-Specific Registers (MSRs). It is a character device with
+ major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr.
+ MSR accesses are directed to a specific CPU on multi-processor
+ systems.
+
+config X86_CPUID
+ tristate "/dev/cpu/*/cpuid - CPU information support"
+ ---help---
+ This device gives processes access to the x86 CPUID instruction to
+ be executed on a specific processor. It is a character device
+ with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
+ /dev/cpu/31/cpuid.
+
+choice
+ prompt "High Memory Support"
+ default HIGHMEM4G
+ depends on X86_32
+
+config NOHIGHMEM
+ bool "off"
+ ---help---
+ Linux can use up to 64 Gigabytes of physical memory on x86 systems.
+ However, the address space of 32-bit x86 processors is only 4
+ Gigabytes large. That means that, if you have a large amount of
+ physical memory, not all of it can be "permanently mapped" by the
+ kernel. The physical memory that's not permanently mapped is called
+ "high memory".
+
+ If you are compiling a kernel which will never run on a machine with
+ more than 1 Gigabyte total physical RAM, answer "off" here (default
+ choice and suitable for most users). This will result in a "3GB/1GB"
+ split: 3GB are mapped so that each process sees a 3GB virtual memory
+ space and the remaining part of the 4GB virtual memory space is used
+ by the kernel to permanently map as much physical memory as
+ possible.
+
+ If the machine has between 1 and 4 Gigabytes physical RAM, then
+ answer "4GB" here.
+
+ If more than 4 Gigabytes is used then answer "64GB" here. This
+ selection turns Intel PAE (Physical Address Extension) mode on.
+ PAE implements 3-level paging on IA32 processors. PAE is fully
+ supported by Linux, PAE mode is implemented on all recent Intel
+ processors (Pentium Pro and better). NOTE: If you say "64GB" here,
+ then the kernel will not boot on CPUs that don't support PAE!
+
+ The actual amount of total physical memory will either be
+ auto detected or can be forced by using a kernel command line option
+ such as "mem=256M". (Try "man bootparam" or see the documentation of
+ your boot loader (lilo or loadlin) about how to pass options to the
+ kernel at boot time.)
+
+ If unsure, say "off".
+
+config HIGHMEM4G
+ bool "4GB"
+ ---help---
+ Select this if you have a 32-bit processor and between 1 and 4
+ gigabytes of physical RAM.
+
+config HIGHMEM64G
+ bool "64GB"
+ depends on !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !MWINCHIP3D && !MK6
+ select X86_PAE
+ ---help---
+ Select this if you have a 32-bit processor and more than 4
+ gigabytes of physical RAM.
+
+endchoice
+
+choice
+ prompt "Memory split" if EXPERT
+ default VMSPLIT_3G
+ depends on X86_32
+ ---help---
+ Select the desired split between kernel and user memory.
+
+ If the address range available to the kernel is less than the
+ physical memory installed, the remaining memory will be available
+ as "high memory". Accessing high memory is a little more costly
+ than low memory, as it needs to be mapped into the kernel first.
+ Note that increasing the kernel address space limits the range
+ available to user programs, making the address space there
+ tighter. Selecting anything other than the default 3G/1G split
+ will also likely make your kernel incompatible with binary-only
+ kernel modules.
+
+ If you are not absolutely sure what you are doing, leave this
+ option alone!
+
+ config VMSPLIT_3G
+ bool "3G/1G user/kernel split"
+ config VMSPLIT_3G_OPT
+ depends on !X86_PAE
+ bool "3G/1G user/kernel split (for full 1G low memory)"
+ config VMSPLIT_2G
+ bool "2G/2G user/kernel split"
+ config VMSPLIT_2G_OPT
+ depends on !X86_PAE
+ bool "2G/2G user/kernel split (for full 2G low memory)"
+ config VMSPLIT_1G
+ bool "1G/3G user/kernel split"
+endchoice
+
+config PAGE_OFFSET
+ hex
+ default 0xB0000000 if VMSPLIT_3G_OPT
+ default 0x80000000 if VMSPLIT_2G
+ default 0x78000000 if VMSPLIT_2G_OPT
+ default 0x40000000 if VMSPLIT_1G
+ default 0xC0000000
+ depends on X86_32
+
+config HIGHMEM
+ def_bool y
+ depends on X86_32 && (HIGHMEM64G || HIGHMEM4G)
+
+config X86_PAE
+ bool "PAE (Physical Address Extension) Support"
+ depends on X86_32 && !HIGHMEM4G
+ select PHYS_ADDR_T_64BIT
+ select SWIOTLB
+ ---help---
+ PAE is required for NX support, and furthermore enables
+ larger swapspace support for non-overcommit purposes. It
+ has the cost of more pagetable lookup overhead, and also
+ consumes more pagetable space per process.
+
+config X86_5LEVEL
+ bool "Enable 5-level page tables support"
+ select DYNAMIC_MEMORY_LAYOUT
+ select SPARSEMEM_VMEMMAP
+ depends on X86_64
+ ---help---
+ 5-level paging enables access to larger address space:
+ upto 128 PiB of virtual address space and 4 PiB of
+ physical address space.
+
+ It will be supported by future Intel CPUs.
+
+ A kernel with the option enabled can be booted on machines that
+ support 4- or 5-level paging.
+
+ See Documentation/x86/x86_64/5level-paging.txt for more
+ information.
+
+ Say N if unsure.
+
+config X86_DIRECT_GBPAGES
+ def_bool y
+ depends on X86_64 && !DEBUG_PAGEALLOC
+ ---help---
+ Certain kernel features effectively disable kernel
+ linear 1 GB mappings (even if the CPU otherwise
+ supports them), so don't confuse the user by printing
+ that we have them enabled.
+
+config ARCH_HAS_MEM_ENCRYPT
+ def_bool y
+
+config AMD_MEM_ENCRYPT
+ bool "AMD Secure Memory Encryption (SME) support"
+ depends on X86_64 && CPU_SUP_AMD
+ select DYNAMIC_PHYSICAL_MASK
+ select ARCH_USE_MEMREMAP_PROT
+ ---help---
+ Say yes to enable support for the encryption of system memory.
+ This requires an AMD processor that supports Secure Memory
+ Encryption (SME).
+
+config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT
+ bool "Activate AMD Secure Memory Encryption (SME) by default"
+ depends on AMD_MEM_ENCRYPT
+ ---help---
+ Say yes to have system memory encrypted by default if running on
+ an AMD processor that supports Secure Memory Encryption (SME).
+
+ If set to Y, then the encryption of system memory can be
+ deactivated with the mem_encrypt=off command line option.
+
+ If set to N, then the encryption of system memory can be
+ activated with the mem_encrypt=on command line option.
+
+# Common NUMA Features
+config NUMA
+ bool "Numa Memory Allocation and Scheduler Support"
+ depends on SMP
+ depends on X86_64 || (X86_32 && HIGHMEM64G && X86_BIGSMP)
+ default y if X86_BIGSMP
+ ---help---
+ Enable NUMA (Non Uniform Memory Access) support.
+
+ The kernel will try to allocate memory used by a CPU on the
+ local memory controller of the CPU and add some more
+ NUMA awareness to the kernel.
+
+ For 64-bit this is recommended if the system is Intel Core i7
+ (or later), AMD Opteron, or EM64T NUMA.
+
+ For 32-bit this is only needed if you boot a 32-bit
+ kernel on a 64-bit NUMA platform.
+
+ Otherwise, you should say N.
+
+config AMD_NUMA
+ def_bool y
+ prompt "Old style AMD Opteron NUMA detection"
+ depends on X86_64 && NUMA && PCI
+ ---help---
+ Enable AMD NUMA node topology detection. You should say Y here if
+ you have a multi processor AMD system. This uses an old method to
+ read the NUMA configuration directly from the builtin Northbridge
+ of Opteron. It is recommended to use X86_64_ACPI_NUMA instead,
+ which also takes priority if both are compiled in.
+
+config X86_64_ACPI_NUMA
+ def_bool y
+ prompt "ACPI NUMA detection"
+ depends on X86_64 && NUMA && ACPI && PCI
+ select ACPI_NUMA
+ ---help---
+ Enable ACPI SRAT based node topology detection.
+
+# Some NUMA nodes have memory ranges that span
+# other nodes. Even though a pfn is valid and
+# between a node's start and end pfns, it may not
+# reside on that node. See memmap_init_zone()
+# for details.
+config NODES_SPAN_OTHER_NODES
+ def_bool y
+ depends on X86_64_ACPI_NUMA
+
+config NUMA_EMU
+ bool "NUMA emulation"
+ depends on NUMA
+ ---help---
+ Enable NUMA emulation. A flat machine will be split
+ into virtual nodes when booted with "numa=fake=N", where N is the
+ number of nodes. This is only useful for debugging.
+
+config NODES_SHIFT
+ int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
+ range 1 10
+ default "10" if MAXSMP
+ default "6" if X86_64
+ default "3"
+ depends on NEED_MULTIPLE_NODES
+ ---help---
+ Specify the maximum number of NUMA Nodes available on the target
+ system. Increases memory reserved to accommodate various tables.
+
+config ARCH_HAVE_MEMORY_PRESENT
+ def_bool y
+ depends on X86_32 && DISCONTIGMEM
+
+config ARCH_FLATMEM_ENABLE
+ def_bool y
+ depends on X86_32 && !NUMA
+
+config ARCH_DISCONTIGMEM_ENABLE
+ def_bool y
+ depends on NUMA && X86_32
+
+config ARCH_DISCONTIGMEM_DEFAULT
+ def_bool y
+ depends on NUMA && X86_32
+
+config ARCH_SPARSEMEM_ENABLE
+ def_bool y
+ depends on X86_64 || NUMA || X86_32 || X86_32_NON_STANDARD
+ select SPARSEMEM_STATIC if X86_32
+ select SPARSEMEM_VMEMMAP_ENABLE if X86_64
+
+config ARCH_SPARSEMEM_DEFAULT
+ def_bool y
+ depends on X86_64
+
+config ARCH_SELECT_MEMORY_MODEL
+ def_bool y
+ depends on ARCH_SPARSEMEM_ENABLE
+
+config ARCH_MEMORY_PROBE
+ bool "Enable sysfs memory/probe interface"
+ depends on X86_64 && MEMORY_HOTPLUG
+ help
+ This option enables a sysfs memory/probe interface for testing.
+ See Documentation/memory-hotplug.txt for more information.
+ If you are unsure how to answer this question, answer N.
+
+config ARCH_PROC_KCORE_TEXT
+ def_bool y
+ depends on X86_64 && PROC_KCORE
+
+config ILLEGAL_POINTER_VALUE
+ hex
+ default 0 if X86_32
+ default 0xdead000000000000 if X86_64
+
+config X86_PMEM_LEGACY_DEVICE
+ bool
+
+config X86_PMEM_LEGACY
+ tristate "Support non-standard NVDIMMs and ADR protected memory"
+ depends on PHYS_ADDR_T_64BIT
+ depends on BLK_DEV
+ select X86_PMEM_LEGACY_DEVICE
+ select LIBNVDIMM
+ help
+ Treat memory marked using the non-standard e820 type of 12 as used
+ by the Intel Sandy Bridge-EP reference BIOS as protected memory.
+ The kernel will offer these regions to the 'pmem' driver so
+ they can be used for persistent storage.
+
+ Say Y if unsure.
+
+config HIGHPTE
+ bool "Allocate 3rd-level pagetables from highmem"
+ depends on HIGHMEM
+ ---help---
+ The VM uses one page table entry for each page of physical memory.
+ For systems with a lot of RAM, this can be wasteful of precious
+ low memory. Setting this option will put user-space page table
+ entries in high memory.
+
+config X86_CHECK_BIOS_CORRUPTION
+ bool "Check for low memory corruption"
+ ---help---
+ Periodically check for memory corruption in low memory, which
+ is suspected to be caused by BIOS. Even when enabled in the
+ configuration, it is disabled at runtime. Enable it by
+ setting "memory_corruption_check=1" on the kernel command
+ line. By default it scans the low 64k of memory every 60
+ seconds; see the memory_corruption_check_size and
+ memory_corruption_check_period parameters in
+ Documentation/admin-guide/kernel-parameters.rst to adjust this.
+
+ When enabled with the default parameters, this option has
+ almost no overhead, as it reserves a relatively small amount
+ of memory and scans it infrequently. It both detects corruption
+ and prevents it from affecting the running system.
+
+ It is, however, intended as a diagnostic tool; if repeatable
+ BIOS-originated corruption always affects the same memory,
+ you can use memmap= to prevent the kernel from using that
+ memory.
+
+config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
+ bool "Set the default setting of memory_corruption_check"
+ depends on X86_CHECK_BIOS_CORRUPTION
+ default y
+ ---help---
+ Set whether the default state of memory_corruption_check is
+ on or off.
+
+config X86_RESERVE_LOW
+ int "Amount of low memory, in kilobytes, to reserve for the BIOS"
+ default 64
+ range 4 640
+ ---help---
+ Specify the amount of low memory to reserve for the BIOS.
+
+ The first page contains BIOS data structures that the kernel
+ must not use, so that page must always be reserved.
+
+ By default we reserve the first 64K of physical RAM, as a
+ number of BIOSes are known to corrupt that memory range
+ during events such as suspend/resume or monitor cable
+ insertion, so it must not be used by the kernel.
+
+ You can set this to 4 if you are absolutely sure that you
+ trust the BIOS to get all its memory reservations and usages
+ right. If you know your BIOS have problems beyond the
+ default 64K area, you can set this to 640 to avoid using the
+ entire low memory range.
+
+ If you have doubts about the BIOS (e.g. suspend/resume does
+ not work or there's kernel crashes after certain hardware
+ hotplug events) then you might want to enable
+ X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check
+ typical corruption patterns.
+
+ Leave this to the default value of 64 if you are unsure.
+
+config MATH_EMULATION
+ bool
+ depends on MODIFY_LDT_SYSCALL
+ prompt "Math emulation" if X86_32
+ ---help---
+ Linux can emulate a math coprocessor (used for floating point
+ operations) if you don't have one. 486DX and Pentium processors have
+ a math coprocessor built in, 486SX and 386 do not, unless you added
+ a 487DX or 387, respectively. (The messages during boot time can
+ give you some hints here ["man dmesg"].) Everyone needs either a
+ coprocessor or this emulation.
+
+ If you don't have a math coprocessor, you need to say Y here; if you
+ say Y here even though you have a coprocessor, the coprocessor will
+ be used nevertheless. (This behavior can be changed with the kernel
+ command line option "no387", which comes handy if your coprocessor
+ is broken. Try "man bootparam" or see the documentation of your boot
+ loader (lilo or loadlin) about how to pass options to the kernel at
+ boot time.) This means that it is a good idea to say Y here if you
+ intend to use this kernel on different machines.
+
+ More information about the internals of the Linux math coprocessor
+ emulation can be found in <file:arch/x86/math-emu/README>.
+
+ If you are not sure, say Y; apart from resulting in a 66 KB bigger
+ kernel, it won't hurt.
+
+config MTRR
+ def_bool y
+ prompt "MTRR (Memory Type Range Register) support" if EXPERT
+ ---help---
+ On Intel P6 family processors (Pentium Pro, Pentium II and later)
+ the Memory Type Range Registers (MTRRs) may be used to control
+ processor access to memory ranges. This is most useful if you have
+ a video (VGA) card on a PCI or AGP bus. Enabling write-combining
+ allows bus write transfers to be combined into a larger transfer
+ before bursting over the PCI/AGP bus. This can increase performance
+ of image write operations 2.5 times or more. Saying Y here creates a
+ /proc/mtrr file which may be used to manipulate your processor's
+ MTRRs. Typically the X server should use this.
+
+ This code has a reasonably generic interface so that similar
+ control registers on other processors can be easily supported
+ as well:
+
+ The Cyrix 6x86, 6x86MX and M II processors have Address Range
+ Registers (ARRs) which provide a similar functionality to MTRRs. For
+ these, the ARRs are used to emulate the MTRRs.
+ The AMD K6-2 (stepping 8 and above) and K6-3 processors have two
+ MTRRs. The Centaur C6 (WinChip) has 8 MCRs, allowing
+ write-combining. All of these processors are supported by this code
+ and it makes sense to say Y here if you have one of them.
+
+ Saying Y here also fixes a problem with buggy SMP BIOSes which only
+ set the MTRRs for the boot CPU and not for the secondary CPUs. This
+ can lead to all sorts of problems, so it's good to say Y here.
+
+ You can safely say Y even if your machine doesn't have MTRRs, you'll
+ just add about 9 KB to your kernel.
+
+ See <file:Documentation/x86/mtrr.txt> for more information.
+
+config MTRR_SANITIZER
+ def_bool y
+ prompt "MTRR cleanup support"
+ depends on MTRR
+ ---help---
+ Convert MTRR layout from continuous to discrete, so X drivers can
+ add writeback entries.
+
+ Can be disabled with disable_mtrr_cleanup on the kernel command line.
+ The largest mtrr entry size for a continuous block can be set with
+ mtrr_chunk_size.
+
+ If unsure, say Y.
+
+config MTRR_SANITIZER_ENABLE_DEFAULT
+ int "MTRR cleanup enable value (0-1)"
+ range 0 1
+ default "0"
+ depends on MTRR_SANITIZER
+ ---help---
+ Enable mtrr cleanup default value
+
+config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
+ int "MTRR cleanup spare reg num (0-7)"
+ range 0 7
+ default "1"
+ depends on MTRR_SANITIZER
+ ---help---
+ mtrr cleanup spare entries default, it can be changed via
+ mtrr_spare_reg_nr=N on the kernel command line.
+
+config X86_PAT
+ def_bool y
+ prompt "x86 PAT support" if EXPERT
+ depends on MTRR
+ ---help---
+ Use PAT attributes to setup page level cache control.
+
+ PATs are the modern equivalents of MTRRs and are much more
+ flexible than MTRRs.
+
+ Say N here if you see bootup problems (boot crash, boot hang,
+ spontaneous reboots) or a non-working video driver.
+
+ If unsure, say Y.
+
+config ARCH_USES_PG_UNCACHED
+ def_bool y
+ depends on X86_PAT
+
+config ARCH_RANDOM
+ def_bool y
+ prompt "x86 architectural random number generator" if EXPERT
+ ---help---
+ Enable the x86 architectural RDRAND instruction
+ (Intel Bull Mountain technology) to generate random numbers.
+ If supported, this is a high bandwidth, cryptographically
+ secure hardware random number generator.
+
+config X86_SMAP
+ def_bool y
+ prompt "Supervisor Mode Access Prevention" if EXPERT
+ ---help---
+ Supervisor Mode Access Prevention (SMAP) is a security
+ feature in newer Intel processors. There is a small
+ performance cost if this enabled and turned on; there is
+ also a small increase in the kernel size if this is enabled.
+
+ If unsure, say Y.
+
+config X86_INTEL_UMIP
+ def_bool y
+ depends on CPU_SUP_INTEL
+ prompt "Intel User Mode Instruction Prevention" if EXPERT
+ ---help---
+ The User Mode Instruction Prevention (UMIP) is a security
+ feature in newer Intel processors. If enabled, a general
+ protection fault is issued if the SGDT, SLDT, SIDT, SMSW
+ or STR instructions are executed in user mode. These instructions
+ unnecessarily expose information about the hardware state.
+
+ The vast majority of applications do not use these instructions.
+ For the very few that do, software emulation is provided in
+ specific cases in protected and virtual-8086 modes. Emulated
+ results are dummy.
+
+config X86_INTEL_MPX
+ prompt "Intel MPX (Memory Protection Extensions)"
+ def_bool n
+ # Note: only available in 64-bit mode due to VMA flags shortage
+ depends on CPU_SUP_INTEL && X86_64
+ select ARCH_USES_HIGH_VMA_FLAGS
+ ---help---
+ MPX provides hardware features that can be used in
+ conjunction with compiler-instrumented code to check
+ memory references. It is designed to detect buffer
+ overflow or underflow bugs.
+
+ This option enables running applications which are
+ instrumented or otherwise use MPX. It does not use MPX
+ itself inside the kernel or to protect the kernel
+ against bad memory references.
+
+ Enabling this option will make the kernel larger:
+ ~8k of kernel text and 36 bytes of data on a 64-bit
+ defconfig. It adds a long to the 'mm_struct' which
+ will increase the kernel memory overhead of each
+ process and adds some branches to paths used during
+ exec() and munmap().
+
+ For details, see Documentation/x86/intel_mpx.txt
+
+ If unsure, say N.
+
+config X86_INTEL_MEMORY_PROTECTION_KEYS
+ prompt "Intel Memory Protection Keys"
+ def_bool y
+ # Note: only available in 64-bit mode
+ depends on CPU_SUP_INTEL && X86_64
+ select ARCH_USES_HIGH_VMA_FLAGS
+ select ARCH_HAS_PKEYS
+ ---help---
+ Memory Protection Keys provides a mechanism for enforcing
+ page-based protections, but without requiring modification of the
+ page tables when an application changes protection domains.
+
+ For details, see Documentation/x86/protection-keys.txt
+
+ If unsure, say y.
+
+choice
+ prompt "TSX enable mode"
+ depends on CPU_SUP_INTEL
+ default X86_INTEL_TSX_MODE_OFF
+ help
+ Intel's TSX (Transactional Synchronization Extensions) feature
+ allows to optimize locking protocols through lock elision which
+ can lead to a noticeable performance boost.
+
+ On the other hand it has been shown that TSX can be exploited
+ to form side channel attacks (e.g. TAA) and chances are there
+ will be more of those attacks discovered in the future.
+
+ Therefore TSX is not enabled by default (aka tsx=off). An admin
+ might override this decision by tsx=on the command line parameter.
+ Even with TSX enabled, the kernel will attempt to enable the best
+ possible TAA mitigation setting depending on the microcode available
+ for the particular machine.
+
+ This option allows to set the default tsx mode between tsx=on, =off
+ and =auto. See Documentation/admin-guide/kernel-parameters.txt for more
+ details.
+
+ Say off if not sure, auto if TSX is in use but it should be used on safe
+ platforms or on if TSX is in use and the security aspect of tsx is not
+ relevant.
+
+config X86_INTEL_TSX_MODE_OFF
+ bool "off"
+ help
+ TSX is disabled if possible - equals to tsx=off command line parameter.
+
+config X86_INTEL_TSX_MODE_ON
+ bool "on"
+ help
+ TSX is always enabled on TSX capable HW - equals the tsx=on command
+ line parameter.
+
+config X86_INTEL_TSX_MODE_AUTO
+ bool "auto"
+ help
+ TSX is enabled on TSX capable HW that is believed to be safe against
+ side channel attacks- equals the tsx=auto command line parameter.
+endchoice
+
+config EFI
+ bool "EFI runtime service support"
+ depends on ACPI
+ select UCS2_STRING
+ select EFI_RUNTIME_WRAPPERS
+ select ARCH_USE_MEMREMAP_PROT
+ ---help---
+ This enables the kernel to use EFI runtime services that are
+ available (such as the EFI variable services).
+
+ This option is only useful on systems that have EFI firmware.
+ In addition, you should use the latest ELILO loader available
+ at <http://elilo.sourceforge.net> in order to take advantage
+ of EFI runtime services. However, even with this option, the
+ resultant kernel should continue to boot on existing non-EFI
+ platforms.
+
+config EFI_STUB
+ bool "EFI stub support"
+ depends on EFI && !X86_USE_3DNOW
+ select RELOCATABLE
+ ---help---
+ This kernel feature allows a bzImage to be loaded directly
+ by EFI firmware without the use of a bootloader.
+
+ See Documentation/efi-stub.txt for more information.
+
+config EFI_MIXED
+ bool "EFI mixed-mode support"
+ depends on EFI_STUB && X86_64
+ ---help---
+ Enabling this feature allows a 64-bit kernel to be booted
+ on a 32-bit firmware, provided that your CPU supports 64-bit
+ mode.
+
+ Note that it is not possible to boot a mixed-mode enabled
+ kernel via the EFI boot stub - a bootloader that supports
+ the EFI handover protocol must be used.
+
+ If unsure, say N.
+
+config SECCOMP
+ def_bool y
+ prompt "Enable seccomp to safely compute untrusted bytecode"
+ ---help---
+ This kernel feature is useful for number crunching applications
+ that may need to compute untrusted bytecode during their
+ execution. By using pipes or other transports made available to
+ the process as file descriptors supporting the read/write
+ syscalls, it's possible to isolate those applications in
+ their own address space using seccomp. Once seccomp is
+ enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
+ and the task is only allowed to execute a few safe syscalls
+ defined by each seccomp mode.
+
+ If unsure, say Y. Only embedded should say N here.
+
+source kernel/Kconfig.hz
+
+config KEXEC
+ bool "kexec system call"
+ select KEXEC_CORE
+ ---help---
+ kexec is a system call that implements the ability to shutdown your
+ current kernel, and to start another kernel. It is like a reboot
+ but it is independent of the system firmware. And like a reboot
+ you can start any kernel with it, not just Linux.
+
+ The name comes from the similarity to the exec system call.
+
+ It is an ongoing process to be certain the hardware in a machine
+ is properly shutdown, so do not be surprised if this code does not
+ initially work for you. As of this writing the exact hardware
+ interface is strongly in flux, so no good recommendation can be
+ made.
+
+config KEXEC_FILE
+ bool "kexec file based system call"
+ select KEXEC_CORE
+ select BUILD_BIN2C
+ depends on X86_64
+ depends on CRYPTO=y
+ depends on CRYPTO_SHA256=y
+ ---help---
+ This is new version of kexec system call. This system call is
+ file based and takes file descriptors as system call argument
+ for kernel and initramfs as opposed to list of segments as
+ accepted by previous system call.
+
+config ARCH_HAS_KEXEC_PURGATORY
+ def_bool KEXEC_FILE
+
+config KEXEC_VERIFY_SIG
+ bool "Verify kernel signature during kexec_file_load() syscall"
+ depends on KEXEC_FILE
+ ---help---
+ This option makes kernel signature verification mandatory for
+ the kexec_file_load() syscall.
+
+ In addition to that option, you need to enable signature
+ verification for the corresponding kernel image type being
+ loaded in order for this to work.
+
+config KEXEC_BZIMAGE_VERIFY_SIG
+ bool "Enable bzImage signature verification support"
+ depends on KEXEC_VERIFY_SIG
+ depends on SIGNED_PE_FILE_VERIFICATION
+ select SYSTEM_TRUSTED_KEYRING
+ ---help---
+ Enable bzImage signature verification support.
+
+config CRASH_DUMP
+ bool "kernel crash dumps"
+ depends on X86_64 || (X86_32 && HIGHMEM)
+ ---help---
+ Generate crash dump after being started by kexec.
+ This should be normally only set in special crash dump kernels
+ which are loaded in the main kernel with kexec-tools into
+ a specially reserved region and then later executed after
+ a crash by kdump/kexec. The crash dump kernel must be compiled
+ to a memory address not used by the main kernel or BIOS using
+ PHYSICAL_START, or it must be built as a relocatable image
+ (CONFIG_RELOCATABLE=y).
+ For more details see Documentation/kdump/kdump.txt
+
+config KEXEC_JUMP
+ bool "kexec jump"
+ depends on KEXEC && HIBERNATION
+ ---help---
+ Jump between original kernel and kexeced kernel and invoke
+ code in physical address mode via KEXEC
+
+config PHYSICAL_START
+ hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP)
+ default "0x1000000"
+ ---help---
+ This gives the physical address where the kernel is loaded.
+
+ If kernel is a not relocatable (CONFIG_RELOCATABLE=n) then
+ bzImage will decompress itself to above physical address and
+ run from there. Otherwise, bzImage will run from the address where
+ it has been loaded by the boot loader and will ignore above physical
+ address.
+
+ In normal kdump cases one does not have to set/change this option
+ as now bzImage can be compiled as a completely relocatable image
+ (CONFIG_RELOCATABLE=y) and be used to load and run from a different
+ address. This option is mainly useful for the folks who don't want
+ to use a bzImage for capturing the crash dump and want to use a
+ vmlinux instead. vmlinux is not relocatable hence a kernel needs
+ to be specifically compiled to run from a specific memory area
+ (normally a reserved region) and this option comes handy.
+
+ So if you are using bzImage for capturing the crash dump,
+ leave the value here unchanged to 0x1000000 and set
+ CONFIG_RELOCATABLE=y. Otherwise if you plan to use vmlinux
+ for capturing the crash dump change this value to start of
+ the reserved region. In other words, it can be set based on
+ the "X" value as specified in the "crashkernel=YM@XM"
+ command line boot parameter passed to the panic-ed
+ kernel. Please take a look at Documentation/kdump/kdump.txt
+ for more details about crash dumps.
+
+ Usage of bzImage for capturing the crash dump is recommended as
+ one does not have to build two kernels. Same kernel can be used
+ as production kernel and capture kernel. Above option should have
+ gone away after relocatable bzImage support is introduced. But it
+ is present because there are users out there who continue to use
+ vmlinux for dump capture. This option should go away down the
+ line.
+
+ Don't change this unless you know what you are doing.
+
+config RELOCATABLE
+ bool "Build a relocatable kernel"
+ default y
+ ---help---
+ This builds a kernel image that retains relocation information
+ so it can be loaded someplace besides the default 1MB.
+ The relocations tend to make the kernel binary about 10% larger,
+ but are discarded at runtime.
+
+ One use is for the kexec on panic case where the recovery kernel
+ must live at a different physical address than the primary
+ kernel.
+
+ Note: If CONFIG_RELOCATABLE=y, then the kernel runs from the address
+ it has been loaded at and the compile time physical address
+ (CONFIG_PHYSICAL_START) is used as the minimum location.
+
+config RANDOMIZE_BASE
+ bool "Randomize the address of the kernel image (KASLR)"
+ depends on RELOCATABLE
+ default y
+ ---help---
+ In support of Kernel Address Space Layout Randomization (KASLR),
+ this randomizes the physical address at which the kernel image
+ is decompressed and the virtual address where the kernel
+ image is mapped, as a security feature that deters exploit
+ attempts relying on knowledge of the location of kernel
+ code internals.
+
+ On 64-bit, the kernel physical and virtual addresses are
+ randomized separately. The physical address will be anywhere
+ between 16MB and the top of physical memory (up to 64TB). The
+ virtual address will be randomized from 16MB up to 1GB (9 bits
+ of entropy). Note that this also reduces the memory space
+ available to kernel modules from 1.5GB to 1GB.
+
+ On 32-bit, the kernel physical and virtual addresses are
+ randomized together. They will be randomized from 16MB up to
+ 512MB (8 bits of entropy).
+
+ Entropy is generated using the RDRAND instruction if it is
+ supported. If RDTSC is supported, its value is mixed into
+ the entropy pool as well. If neither RDRAND nor RDTSC are
+ supported, then entropy is read from the i8254 timer. The
+ usable entropy is limited by the kernel being built using
+ 2GB addressing, and that PHYSICAL_ALIGN must be at a
+ minimum of 2MB. As a result, only 10 bits of entropy are
+ theoretically possible, but the implementations are further
+ limited due to memory layouts.
+
+ If unsure, say Y.
+
+# Relocation on x86 needs some additional build support
+config X86_NEED_RELOCS
+ def_bool y
+ depends on RANDOMIZE_BASE || (X86_32 && RELOCATABLE)
+
+config PHYSICAL_ALIGN
+ hex "Alignment value to which kernel should be aligned"
+ default "0x200000"
+ range 0x2000 0x1000000 if X86_32
+ range 0x200000 0x1000000 if X86_64
+ ---help---
+ This value puts the alignment restrictions on physical address
+ where kernel is loaded and run from. Kernel is compiled for an
+ address which meets above alignment restriction.
+
+ If bootloader loads the kernel at a non-aligned address and
+ CONFIG_RELOCATABLE is set, kernel will move itself to nearest
+ address aligned to above value and run from there.
+
+ If bootloader loads the kernel at a non-aligned address and
+ CONFIG_RELOCATABLE is not set, kernel will ignore the run time
+ load address and decompress itself to the address it has been
+ compiled for and run from there. The address for which kernel is
+ compiled already meets above alignment restrictions. Hence the
+ end result is that kernel runs from a physical address meeting
+ above alignment restrictions.
+
+ On 32-bit this value must be a multiple of 0x2000. On 64-bit
+ this value must be a multiple of 0x200000.
+
+ Don't change this unless you know what you are doing.
+
+config DYNAMIC_MEMORY_LAYOUT
+ bool
+ ---help---
+ This option makes base addresses of vmalloc and vmemmap as well as
+ __PAGE_OFFSET movable during boot.
+
+config RANDOMIZE_MEMORY
+ bool "Randomize the kernel memory sections"
+ depends on X86_64
+ depends on RANDOMIZE_BASE
+ select DYNAMIC_MEMORY_LAYOUT
+ default RANDOMIZE_BASE
+ ---help---
+ Randomizes the base virtual address of kernel memory sections
+ (physical memory mapping, vmalloc & vmemmap). This security feature
+ makes exploits relying on predictable memory locations less reliable.
+
+ The order of allocations remains unchanged. Entropy is generated in
+ the same way as RANDOMIZE_BASE. Current implementation in the optimal
+ configuration have in average 30,000 different possible virtual
+ addresses for each memory section.
+
+ If unsure, say Y.
+
+config RANDOMIZE_MEMORY_PHYSICAL_PADDING
+ hex "Physical memory mapping padding" if EXPERT
+ depends on RANDOMIZE_MEMORY
+ default "0xa" if MEMORY_HOTPLUG
+ default "0x0"
+ range 0x1 0x40 if MEMORY_HOTPLUG
+ range 0x0 0x40
+ ---help---
+ Define the padding in terabytes added to the existing physical
+ memory size during kernel memory randomization. It is useful
+ for memory hotplug support but reduces the entropy available for
+ address randomization.
+
+ If unsure, leave at the default value.
+
+config HOTPLUG_CPU
+ def_bool y
+ depends on SMP
+
+config BOOTPARAM_HOTPLUG_CPU0
+ bool "Set default setting of cpu0_hotpluggable"
+ default n
+ depends on HOTPLUG_CPU
+ ---help---
+ Set whether default state of cpu0_hotpluggable is on or off.
+
+ Say Y here to enable CPU0 hotplug by default. If this switch
+ is turned on, there is no need to give cpu0_hotplug kernel
+ parameter and the CPU0 hotplug feature is enabled by default.
+
+ Please note: there are two known CPU0 dependencies if you want
+ to enable the CPU0 hotplug feature either by this switch or by
+ cpu0_hotplug kernel parameter.
+
+ First, resume from hibernate or suspend always starts from CPU0.
+ So hibernate and suspend are prevented if CPU0 is offline.
+
+ Second dependency is PIC interrupts always go to CPU0. CPU0 can not
+ offline if any interrupt can not migrate out of CPU0. There may
+ be other CPU0 dependencies.
+
+ Please make sure the dependencies are under your control before
+ you enable this feature.
+
+ Say N if you don't want to enable CPU0 hotplug feature by default.
+ You still can enable the CPU0 hotplug feature at boot by kernel
+ parameter cpu0_hotplug.
+
+config DEBUG_HOTPLUG_CPU0
+ def_bool n
+ prompt "Debug CPU0 hotplug"
+ depends on HOTPLUG_CPU
+ ---help---
+ Enabling this option offlines CPU0 (if CPU0 can be offlined) as
+ soon as possible and boots up userspace with CPU0 offlined. User
+ can online CPU0 back after boot time.
+
+ To debug CPU0 hotplug, you need to enable CPU0 offline/online
+ feature by either turning on CONFIG_BOOTPARAM_HOTPLUG_CPU0 during
+ compilation or giving cpu0_hotplug kernel parameter at boot.
+
+ If unsure, say N.
+
+config COMPAT_VDSO
+ def_bool n
+ prompt "Disable the 32-bit vDSO (needed for glibc 2.3.3)"
+ depends on COMPAT_32
+ ---help---
+ Certain buggy versions of glibc will crash if they are
+ presented with a 32-bit vDSO that is not mapped at the address
+ indicated in its segment table.
+
+ The bug was introduced by f866314b89d56845f55e6f365e18b31ec978ec3a
+ and fixed by 3b3ddb4f7db98ec9e912ccdf54d35df4aa30e04a and
+ 49ad572a70b8aeb91e57483a11dd1b77e31c4468. Glibc 2.3.3 is
+ the only released version with the bug, but OpenSUSE 9
+ contains a buggy "glibc 2.3.2".
+
+ The symptom of the bug is that everything crashes on startup, saying:
+ dl_main: Assertion `(void *) ph->p_vaddr == _rtld_local._dl_sysinfo_dso' failed!
+
+ Saying Y here changes the default value of the vdso32 boot
+ option from 1 to 0, which turns off the 32-bit vDSO entirely.
+ This works around the glibc bug but hurts performance.
+
+ If unsure, say N: if you are compiling your own kernel, you
+ are unlikely to be using a buggy version of glibc.
+
+choice
+ prompt "vsyscall table for legacy applications"
+ depends on X86_64
+ default LEGACY_VSYSCALL_EMULATE
+ help
+ Legacy user code that does not know how to find the vDSO expects
+ to be able to issue three syscalls by calling fixed addresses in
+ kernel space. Since this location is not randomized with ASLR,
+ it can be used to assist security vulnerability exploitation.
+
+ This setting can be changed at boot time via the kernel command
+ line parameter vsyscall=[emulate|none].
+
+ On a system with recent enough glibc (2.14 or newer) and no
+ static binaries, you can say None without a performance penalty
+ to improve security.
+
+ If unsure, select "Emulate".
+
+ config LEGACY_VSYSCALL_EMULATE
+ bool "Emulate"
+ help
+ The kernel traps and emulates calls into the fixed
+ vsyscall address mapping. This makes the mapping
+ non-executable, but it still contains known contents,
+ which could be used in certain rare security vulnerability
+ exploits. This configuration is recommended when userspace
+ still uses the vsyscall area.
+
+ config LEGACY_VSYSCALL_NONE
+ bool "None"
+ help
+ There will be no vsyscall mapping at all. This will
+ eliminate any risk of ASLR bypass due to the vsyscall
+ fixed address mapping. Attempts to use the vsyscalls
+ will be reported to dmesg, so that either old or
+ malicious userspace programs can be identified.
+
+endchoice
+
+config CMDLINE_BOOL
+ bool "Built-in kernel command line"
+ ---help---
+ Allow for specifying boot arguments to the kernel at
+ build time. On some systems (e.g. embedded ones), it is
+ necessary or convenient to provide some or all of the
+ kernel boot arguments with the kernel itself (that is,
+ to not rely on the boot loader to provide them.)
+
+ To compile command line arguments into the kernel,
+ set this option to 'Y', then fill in the
+ boot arguments in CONFIG_CMDLINE.
+
+ Systems with fully functional boot loaders (i.e. non-embedded)
+ should leave this option set to 'N'.
+
+config CMDLINE
+ string "Built-in kernel command string"
+ depends on CMDLINE_BOOL
+ default ""
+ ---help---
+ Enter arguments here that should be compiled into the kernel
+ image and used at boot time. If the boot loader provides a
+ command line at boot time, it is appended to this string to
+ form the full kernel command line, when the system boots.
+
+ However, you can use the CONFIG_CMDLINE_OVERRIDE option to
+ change this behavior.
+
+ In most cases, the command line (whether built-in or provided
+ by the boot loader) should specify the device for the root
+ file system.
+
+config CMDLINE_OVERRIDE
+ bool "Built-in command line overrides boot loader arguments"
+ depends on CMDLINE_BOOL
+ ---help---
+ Set this option to 'Y' to have the kernel ignore the boot loader
+ command line, and use ONLY the built-in command line.
+
+ This is used to work around broken boot loaders. This should
+ be set to 'N' under normal conditions.
+
+config MODIFY_LDT_SYSCALL
+ bool "Enable the LDT (local descriptor table)" if EXPERT
+ default y
+ ---help---
+ Linux can allow user programs to install a per-process x86
+ Local Descriptor Table (LDT) using the modify_ldt(2) system
+ call. This is required to run 16-bit or segmented code such as
+ DOSEMU or some Wine programs. It is also used by some very old
+ threading libraries.
+
+ Enabling this feature adds a small amount of overhead to
+ context switches and increases the low-level kernel attack
+ surface. Disabling it removes the modify_ldt(2) system call.
+
+ Saying 'N' here may make sense for embedded or server kernels.
+
+source "kernel/livepatch/Kconfig"
+
+endmenu
+
+config ARCH_HAS_ADD_PAGES
+ def_bool y
+ depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG
+
+config ARCH_ENABLE_MEMORY_HOTPLUG
+ def_bool y
+ depends on X86_64 || (X86_32 && HIGHMEM)
+
+config ARCH_ENABLE_MEMORY_HOTREMOVE
+ def_bool y
+ depends on MEMORY_HOTPLUG
+
+config USE_PERCPU_NUMA_NODE_ID
+ def_bool y
+ depends on NUMA
+
+config ARCH_ENABLE_SPLIT_PMD_PTLOCK
+ def_bool y
+ depends on X86_64 || X86_PAE
+
+config ARCH_ENABLE_HUGEPAGE_MIGRATION
+ def_bool y
+ depends on X86_64 && HUGETLB_PAGE && MIGRATION
+
+config ARCH_ENABLE_THP_MIGRATION
+ def_bool y
+ depends on X86_64 && TRANSPARENT_HUGEPAGE
+
+menu "Power management and ACPI options"
+
+config ARCH_HIBERNATION_HEADER
+ def_bool y
+ depends on X86_64 && HIBERNATION
+
+source "kernel/power/Kconfig"
+
+source "drivers/acpi/Kconfig"
+
+source "drivers/sfi/Kconfig"
+
+config X86_APM_BOOT
+ def_bool y
+ depends on APM
+
+menuconfig APM
+ tristate "APM (Advanced Power Management) BIOS support"
+ depends on X86_32 && PM_SLEEP
+ ---help---
+ APM is a BIOS specification for saving power using several different
+ techniques. This is mostly useful for battery powered laptops with
+ APM compliant BIOSes. If you say Y here, the system time will be
+ reset after a RESUME operation, the /proc/apm device will provide
+ battery status information, and user-space programs will receive
+ notification of APM "events" (e.g. battery status change).
+
+ If you select "Y" here, you can disable actual use of the APM
+ BIOS by passing the "apm=off" option to the kernel at boot time.
+
+ Note that the APM support is almost completely disabled for
+ machines with more than one CPU.
+
+ In order to use APM, you will need supporting software. For location
+ and more information, read <file:Documentation/power/apm-acpi.txt>
+ and the Battery Powered Linux mini-HOWTO, available from
+ <http://www.tldp.org/docs.html#howto>.
+
+ This driver does not spin down disk drives (see the hdparm(8)
+ manpage ("man 8 hdparm") for that), and it doesn't turn off
+ VESA-compliant "green" monitors.
+
+ This driver does not support the TI 4000M TravelMate and the ACER
+ 486/DX4/75 because they don't have compliant BIOSes. Many "green"
+ desktop machines also don't have compliant BIOSes, and this driver
+ may cause those machines to panic during the boot phase.
+
+ Generally, if you don't have a battery in your machine, there isn't
+ much point in using this driver and you should say N. If you get
+ random kernel OOPSes or reboots that don't seem to be related to
+ anything, try disabling/enabling this option (or disabling/enabling
+ APM in your BIOS).
+
+ Some other things you should try when experiencing seemingly random,
+ "weird" problems:
+
+ 1) make sure that you have enough swap space and that it is
+ enabled.
+ 2) pass the "no-hlt" option to the kernel
+ 3) switch on floating point emulation in the kernel and pass
+ the "no387" option to the kernel
+ 4) pass the "floppy=nodma" option to the kernel
+ 5) pass the "mem=4M" option to the kernel (thereby disabling
+ all but the first 4 MB of RAM)
+ 6) make sure that the CPU is not over clocked.
+ 7) read the sig11 FAQ at <http://www.bitwizard.nl/sig11/>
+ 8) disable the cache from your BIOS settings
+ 9) install a fan for the video card or exchange video RAM
+ 10) install a better fan for the CPU
+ 11) exchange RAM chips
+ 12) exchange the motherboard.
+
+ To compile this driver as a module, choose M here: the
+ module will be called apm.
+
+if APM
+
+config APM_IGNORE_USER_SUSPEND
+ bool "Ignore USER SUSPEND"
+ ---help---
+ This option will ignore USER SUSPEND requests. On machines with a
+ compliant APM BIOS, you want to say N. However, on the NEC Versa M
+ series notebooks, it is necessary to say Y because of a BIOS bug.
+
+config APM_DO_ENABLE
+ bool "Enable PM at boot time"
+ ---help---
+ Enable APM features at boot time. From page 36 of the APM BIOS
+ specification: "When disabled, the APM BIOS does not automatically
+ power manage devices, enter the Standby State, enter the Suspend
+ State, or take power saving steps in response to CPU Idle calls."
+ This driver will make CPU Idle calls when Linux is idle (unless this
+ feature is turned off -- see "Do CPU IDLE calls", below). This
+ should always save battery power, but more complicated APM features
+ will be dependent on your BIOS implementation. You may need to turn
+ this option off if your computer hangs at boot time when using APM
+ support, or if it beeps continuously instead of suspending. Turn
+ this off if you have a NEC UltraLite Versa 33/C or a Toshiba
+ T400CDT. This is off by default since most machines do fine without
+ this feature.
+
+config APM_CPU_IDLE
+ depends on CPU_IDLE
+ bool "Make CPU Idle calls when idle"
+ ---help---
+ Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop.
+ On some machines, this can activate improved power savings, such as
+ a slowed CPU clock rate, when the machine is idle. These idle calls
+ are made after the idle loop has run for some length of time (e.g.,
+ 333 mS). On some machines, this will cause a hang at boot time or
+ whenever the CPU becomes idle. (On machines with more than one CPU,
+ this option does nothing.)
+
+config APM_DISPLAY_BLANK
+ bool "Enable console blanking using APM"
+ ---help---
+ Enable console blanking using the APM. Some laptops can use this to
+ turn off the LCD backlight when the screen blanker of the Linux
+ virtual console blanks the screen. Note that this is only used by
+ the virtual console screen blanker, and won't turn off the backlight
+ when using the X Window system. This also doesn't have anything to
+ do with your VESA-compliant power-saving monitor. Further, this
+ option doesn't work for all laptops -- it might not turn off your
+ backlight at all, or it might print a lot of errors to the console,
+ especially if you are using gpm.
+
+config APM_ALLOW_INTS
+ bool "Allow interrupts during APM BIOS calls"
+ ---help---
+ Normally we disable external interrupts while we are making calls to
+ the APM BIOS as a measure to lessen the effects of a badly behaving
+ BIOS implementation. The BIOS should reenable interrupts if it
+ needs to. Unfortunately, some BIOSes do not -- especially those in
+ many of the newer IBM Thinkpads. If you experience hangs when you
+ suspend, try setting this to Y. Otherwise, say N.
+
+endif # APM
+
+source "drivers/cpufreq/Kconfig"
+
+source "drivers/cpuidle/Kconfig"
+
+source "drivers/idle/Kconfig"
+
+endmenu
+
+
+menu "Bus options (PCI etc.)"
+
+config PCI
+ bool "PCI support"
+ default y
+ ---help---
+ Find out whether you have a PCI motherboard. PCI is the name of a
+ bus system, i.e. the way the CPU talks to the other stuff inside
+ your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or
+ VESA. If you have PCI, say Y, otherwise N.
+
+choice
+ prompt "PCI access mode"
+ depends on X86_32 && PCI
+ default PCI_GOANY
+ ---help---
+ On PCI systems, the BIOS can be used to detect the PCI devices and
+ determine their configuration. However, some old PCI motherboards
+ have BIOS bugs and may crash if this is done. Also, some embedded
+ PCI-based systems don't have any BIOS at all. Linux can also try to
+ detect the PCI hardware directly without using the BIOS.
+
+ With this option, you can specify how Linux should detect the
+ PCI devices. If you choose "BIOS", the BIOS will be used,
+ if you choose "Direct", the BIOS won't be used, and if you
+ choose "MMConfig", then PCI Express MMCONFIG will be used.
+ If you choose "Any", the kernel will try MMCONFIG, then the
+ direct access method and falls back to the BIOS if that doesn't
+ work. If unsure, go with the default, which is "Any".
+
+config PCI_GOBIOS
+ bool "BIOS"
+
+config PCI_GOMMCONFIG
+ bool "MMConfig"
+
+config PCI_GODIRECT
+ bool "Direct"
+
+config PCI_GOOLPC
+ bool "OLPC XO-1"
+ depends on OLPC
+
+config PCI_GOANY
+ bool "Any"
+
+endchoice
+
+config PCI_BIOS
+ def_bool y
+ depends on X86_32 && PCI && (PCI_GOBIOS || PCI_GOANY)
+
+# x86-64 doesn't support PCI BIOS access from long mode so always go direct.
+config PCI_DIRECT
+ def_bool y
+ depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC || PCI_GOMMCONFIG))
+
+config PCI_MMCONFIG
+ bool "Support mmconfig PCI config space access" if X86_64
+ default y
+ depends on PCI && (ACPI || SFI || JAILHOUSE_GUEST)
+ depends on X86_64 || (PCI_GOANY || PCI_GOMMCONFIG)
+
+config PCI_OLPC
+ def_bool y
+ depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY)
+
+config PCI_XEN
+ def_bool y
+ depends on PCI && XEN
+ select SWIOTLB_XEN
+
+config PCI_DOMAINS
+ def_bool y
+ depends on PCI
+
+config MMCONF_FAM10H
+ def_bool y
+ depends on X86_64 && PCI_MMCONFIG && ACPI
+
+config PCI_CNB20LE_QUIRK
+ bool "Read CNB20LE Host Bridge Windows" if EXPERT
+ depends on PCI
+ help
+ Read the PCI windows out of the CNB20LE host bridge. This allows
+ PCI hotplug to work on systems with the CNB20LE chipset which do
+ not have ACPI.
+
+ There's no public spec for this chipset, and this functionality
+ is known to be incomplete.
+
+ You should say N unless you know you need this.
+
+source "drivers/pci/Kconfig"
+
+config ISA_BUS
+ bool "ISA bus support on modern systems" if EXPERT
+ help
+ Expose ISA bus device drivers and options available for selection and
+ configuration. Enable this option if your target machine has an ISA
+ bus. ISA is an older system, displaced by PCI and newer bus
+ architectures -- if your target machine is modern, it probably does
+ not have an ISA bus.
+
+ If unsure, say N.
+
+# x86_64 have no ISA slots, but can have ISA-style DMA.
+config ISA_DMA_API
+ bool "ISA-style DMA support" if (X86_64 && EXPERT)
+ default y
+ help
+ Enables ISA-style DMA support for devices requiring such controllers.
+ If unsure, say Y.
+
+if X86_32
+
+config ISA
+ bool "ISA support"
+ ---help---
+ Find out whether you have ISA slots on your motherboard. ISA is the
+ name of a bus system, i.e. the way the CPU talks to the other stuff
+ inside your box. Other bus systems are PCI, EISA, MicroChannel
+ (MCA) or VESA. ISA is an older system, now being displaced by PCI;
+ newer boards don't support it. If you have ISA, say Y, otherwise N.
+
+config EISA
+ bool "EISA support"
+ depends on ISA
+ ---help---
+ The Extended Industry Standard Architecture (EISA) bus was
+ developed as an open alternative to the IBM MicroChannel bus.
+
+ The EISA bus provided some of the features of the IBM MicroChannel
+ bus while maintaining backward compatibility with cards made for
+ the older ISA bus. The EISA bus saw limited use between 1988 and
+ 1995 when it was made obsolete by the PCI bus.
+
+ Say Y here if you are building a kernel for an EISA-based machine.
+
+ Otherwise, say N.
+
+source "drivers/eisa/Kconfig"
+
+config SCx200
+ tristate "NatSemi SCx200 support"
+ ---help---
+ This provides basic support for National Semiconductor's
+ (now AMD's) Geode processors. The driver probes for the
+ PCI-IDs of several on-chip devices, so its a good dependency
+ for other scx200_* drivers.
+
+ If compiled as a module, the driver is named scx200.
+
+config SCx200HR_TIMER
+ tristate "NatSemi SCx200 27MHz High-Resolution Timer Support"
+ depends on SCx200
+ default y
+ ---help---
+ This driver provides a clocksource built upon the on-chip
+ 27MHz high-resolution timer. Its also a workaround for
+ NSC Geode SC-1100's buggy TSC, which loses time when the
+ processor goes idle (as is done by the scheduler). The
+ other workaround is idle=poll boot option.
+
+config OLPC
+ bool "One Laptop Per Child support"
+ depends on !X86_PAE
+ select GPIOLIB
+ select OF
+ select OF_PROMTREE
+ select IRQ_DOMAIN
+ ---help---
+ Add support for detecting the unique features of the OLPC
+ XO hardware.
+
+config OLPC_XO1_PM
+ bool "OLPC XO-1 Power Management"
+ depends on OLPC && MFD_CS5535=y && PM_SLEEP
+ ---help---
+ Add support for poweroff and suspend of the OLPC XO-1 laptop.
+
+config OLPC_XO1_RTC
+ bool "OLPC XO-1 Real Time Clock"
+ depends on OLPC_XO1_PM && RTC_DRV_CMOS
+ ---help---
+ Add support for the XO-1 real time clock, which can be used as a
+ programmable wakeup source.
+
+config OLPC_XO1_SCI
+ bool "OLPC XO-1 SCI extras"
+ depends on OLPC && OLPC_XO1_PM && GPIO_CS5535=y
+ depends on INPUT=y
+ select POWER_SUPPLY
+ ---help---
+ Add support for SCI-based features of the OLPC XO-1 laptop:
+ - EC-driven system wakeups
+ - Power button
+ - Ebook switch
+ - Lid switch
+ - AC adapter status updates
+ - Battery status updates
+
+config OLPC_XO15_SCI
+ bool "OLPC XO-1.5 SCI extras"
+ depends on OLPC && ACPI
+ select POWER_SUPPLY
+ ---help---
+ Add support for SCI-based features of the OLPC XO-1.5 laptop:
+ - EC-driven system wakeups
+ - AC adapter status updates
+ - Battery status updates
+
+config ALIX
+ bool "PCEngines ALIX System Support (LED setup)"
+ select GPIOLIB
+ ---help---
+ This option enables system support for the PCEngines ALIX.
+ At present this just sets up LEDs for GPIO control on
+ ALIX2/3/6 boards. However, other system specific setup should
+ get added here.
+
+ Note: You must still enable the drivers for GPIO and LED support
+ (GPIO_CS5535 & LEDS_GPIO) to actually use the LEDs
+
+ Note: You have to set alix.force=1 for boards with Award BIOS.
+
+config NET5501
+ bool "Soekris Engineering net5501 System Support (LEDS, GPIO, etc)"
+ select GPIOLIB
+ ---help---
+ This option enables system support for the Soekris Engineering net5501.
+
+config GEOS
+ bool "Traverse Technologies GEOS System Support (LEDS, GPIO, etc)"
+ select GPIOLIB
+ depends on DMI
+ ---help---
+ This option enables system support for the Traverse Technologies GEOS.
+
+config TS5500
+ bool "Technologic Systems TS-5500 platform support"
+ depends on MELAN
+ select CHECK_SIGNATURE
+ select NEW_LEDS
+ select LEDS_CLASS
+ ---help---
+ This option enables system support for the Technologic Systems TS-5500.
+
+endif # X86_32
+
+config AMD_NB
+ def_bool y
+ depends on CPU_SUP_AMD && PCI
+
+source "drivers/pcmcia/Kconfig"
+
+config RAPIDIO
+ tristate "RapidIO support"
+ depends on PCI
+ default n
+ help
+ If enabled this option will include drivers and the core
+ infrastructure code to support RapidIO interconnect devices.
+
+source "drivers/rapidio/Kconfig"
+
+config X86_SYSFB
+ bool "Mark VGA/VBE/EFI FB as generic system framebuffer"
+ help
+ Firmwares often provide initial graphics framebuffers so the BIOS,
+ bootloader or kernel can show basic video-output during boot for
+ user-guidance and debugging. Historically, x86 used the VESA BIOS
+ Extensions and EFI-framebuffers for this, which are mostly limited
+ to x86.
+ This option, if enabled, marks VGA/VBE/EFI framebuffers as generic
+ framebuffers so the new generic system-framebuffer drivers can be
+ used on x86. If the framebuffer is not compatible with the generic
+ modes, it is advertised as fallback platform framebuffer so legacy
+ drivers like efifb, vesafb and uvesafb can pick it up.
+ If this option is not selected, all system framebuffers are always
+ marked as fallback platform framebuffers as usual.
+
+ Note: Legacy fbdev drivers, including vesafb, efifb, uvesafb, will
+ not be able to pick up generic system framebuffers if this option
+ is selected. You are highly encouraged to enable simplefb as
+ replacement if you select this option. simplefb can correctly deal
+ with generic system framebuffers. But you should still keep vesafb
+ and others enabled as fallback if a system framebuffer is
+ incompatible with simplefb.
+
+ If unsure, say Y.
+
+endmenu
+
+
+menu "Binary Emulations"
+
+config IA32_EMULATION
+ bool "IA32 Emulation"
+ depends on X86_64
+ select ARCH_WANT_OLD_COMPAT_IPC
+ select BINFMT_ELF
+ select COMPAT_BINFMT_ELF
+ select COMPAT_OLD_SIGACTION
+ ---help---
+ Include code to run legacy 32-bit programs under a
+ 64-bit kernel. You should likely turn this on, unless you're
+ 100% sure that you don't have any 32-bit programs left.
+
+config IA32_AOUT
+ tristate "IA32 a.out support"
+ depends on IA32_EMULATION
+ ---help---
+ Support old a.out binaries in the 32bit emulation.
+
+config X86_X32
+ bool "x32 ABI for 64-bit mode"
+ depends on X86_64
+ ---help---
+ Include code to run binaries for the x32 native 32-bit ABI
+ for 64-bit processors. An x32 process gets access to the
+ full 64-bit register file and wide data path while leaving
+ pointers at 32 bits for smaller memory footprint.
+
+ You will need a recent binutils (2.22 or later) with
+ elf32_x86_64 support enabled to compile a kernel with this
+ option set.
+
+config COMPAT_32
+ def_bool y
+ depends on IA32_EMULATION || X86_32
+ select HAVE_UID16
+ select OLD_SIGSUSPEND3
+
+config COMPAT
+ def_bool y
+ depends on IA32_EMULATION || X86_X32
+
+if COMPAT
+config COMPAT_FOR_U64_ALIGNMENT
+ def_bool y
+
+config SYSVIPC_COMPAT
+ def_bool y
+ depends on SYSVIPC
+endif
+
+endmenu
+
+
+config HAVE_ATOMIC_IOMAP
+ def_bool y
+ depends on X86_32
+
+config X86_DEV_DMA_OPS
+ bool
+ depends on X86_64 || STA2X11
+
+config X86_DMA_REMAP
+ bool
+ depends on STA2X11
+
+config HAVE_GENERIC_GUP
+ def_bool y
+
+source "drivers/firmware/Kconfig"
+
+source "arch/x86/kvm/Kconfig"
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
new file mode 100644
index 000000000..638411f22
--- /dev/null
+++ b/arch/x86/Kconfig.cpu
@@ -0,0 +1,468 @@
+# SPDX-License-Identifier: GPL-2.0
+# Put here option for CPU selection and depending optimization
+choice
+ prompt "Processor family"
+ default M686 if X86_32
+ default GENERIC_CPU if X86_64
+ ---help---
+ This is the processor type of your CPU. This information is
+ used for optimizing purposes. In order to compile a kernel
+ that can run on all supported x86 CPU types (albeit not
+ optimally fast), you can specify "486" here.
+
+ Note that the 386 is no longer supported, this includes
+ AMD/Cyrix/Intel 386DX/DXL/SL/SLC/SX, Cyrix/TI 486DLC/DLC2,
+ UMC 486SX-S and the NexGen Nx586.
+
+ The kernel will not necessarily run on earlier architectures than
+ the one you have chosen, e.g. a Pentium optimized kernel will run on
+ a PPro, but not necessarily on a i486.
+
+ Here are the settings recommended for greatest speed:
+ - "486" for the AMD/Cyrix/IBM/Intel 486DX/DX2/DX4 or
+ SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S.
+ - "586" for generic Pentium CPUs lacking the TSC
+ (time stamp counter) register.
+ - "Pentium-Classic" for the Intel Pentium.
+ - "Pentium-MMX" for the Intel Pentium MMX.
+ - "Pentium-Pro" for the Intel Pentium Pro.
+ - "Pentium-II" for the Intel Pentium II or pre-Coppermine Celeron.
+ - "Pentium-III" for the Intel Pentium III or Coppermine Celeron.
+ - "Pentium-4" for the Intel Pentium 4 or P4-based Celeron.
+ - "K6" for the AMD K6, K6-II and K6-III (aka K6-3D).
+ - "Athlon" for the AMD K7 family (Athlon/Duron/Thunderbird).
+ - "Opteron/Athlon64/Hammer/K8" for all K8 and newer AMD CPUs.
+ - "Crusoe" for the Transmeta Crusoe series.
+ - "Efficeon" for the Transmeta Efficeon series.
+ - "Winchip-C6" for original IDT Winchip.
+ - "Winchip-2" for IDT Winchips with 3dNow! capabilities.
+ - "AMD Elan" for the 32-bit AMD Elan embedded CPU.
+ - "GeodeGX1" for Geode GX1 (Cyrix MediaGX).
+ - "Geode GX/LX" For AMD Geode GX and LX processors.
+ - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3.
+ - "VIA C3-2" for VIA C3-2 "Nehemiah" (model 9 and above).
+ - "VIA C7" for VIA C7.
+ - "Intel P4" for the Pentium 4/Netburst microarchitecture.
+ - "Core 2/newer Xeon" for all core2 and newer Intel CPUs.
+ - "Intel Atom" for the Atom-microarchitecture CPUs.
+ - "Generic-x86-64" for a kernel which runs on any x86-64 CPU.
+
+ See each option's help text for additional details. If you don't know
+ what to do, choose "486".
+
+config M486
+ bool "486"
+ depends on X86_32
+ ---help---
+ Select this for an 486-class CPU such as AMD/Cyrix/IBM/Intel
+ 486DX/DX2/DX4 or SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S.
+
+config M586
+ bool "586/K5/5x86/6x86/6x86MX"
+ depends on X86_32
+ ---help---
+ Select this for an 586 or 686 series processor such as the AMD K5,
+ the Cyrix 5x86, 6x86 and 6x86MX. This choice does not
+ assume the RDTSC (Read Time Stamp Counter) instruction.
+
+config M586TSC
+ bool "Pentium-Classic"
+ depends on X86_32
+ ---help---
+ Select this for a Pentium Classic processor with the RDTSC (Read
+ Time Stamp Counter) instruction for benchmarking.
+
+config M586MMX
+ bool "Pentium-MMX"
+ depends on X86_32
+ ---help---
+ Select this for a Pentium with the MMX graphics/multimedia
+ extended instructions.
+
+config M686
+ bool "Pentium-Pro"
+ depends on X86_32
+ ---help---
+ Select this for Intel Pentium Pro chips. This enables the use of
+ Pentium Pro extended instructions, and disables the init-time guard
+ against the f00f bug found in earlier Pentiums.
+
+config MPENTIUMII
+ bool "Pentium-II/Celeron(pre-Coppermine)"
+ depends on X86_32
+ ---help---
+ Select this for Intel chips based on the Pentium-II and
+ pre-Coppermine Celeron core. This option enables an unaligned
+ copy optimization, compiles the kernel with optimization flags
+ tailored for the chip, and applies any applicable Pentium Pro
+ optimizations.
+
+config MPENTIUMIII
+ bool "Pentium-III/Celeron(Coppermine)/Pentium-III Xeon"
+ depends on X86_32
+ ---help---
+ Select this for Intel chips based on the Pentium-III and
+ Celeron-Coppermine core. This option enables use of some
+ extended prefetch instructions in addition to the Pentium II
+ extensions.
+
+config MPENTIUMM
+ bool "Pentium M"
+ depends on X86_32
+ ---help---
+ Select this for Intel Pentium M (not Pentium-4 M)
+ notebook chips.
+
+config MPENTIUM4
+ bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon"
+ depends on X86_32
+ ---help---
+ Select this for Intel Pentium 4 chips. This includes the
+ Pentium 4, Pentium D, P4-based Celeron and Xeon, and
+ Pentium-4 M (not Pentium M) chips. This option enables compile
+ flags optimized for the chip, uses the correct cache line size, and
+ applies any applicable optimizations.
+
+ CPUIDs: F[0-6][1-A] (in /proc/cpuinfo show = cpu family : 15 )
+
+ Select this for:
+ Pentiums (Pentium 4, Pentium D, Celeron, Celeron D) corename:
+ -Willamette
+ -Northwood
+ -Mobile Pentium 4
+ -Mobile Pentium 4 M
+ -Extreme Edition (Gallatin)
+ -Prescott
+ -Prescott 2M
+ -Cedar Mill
+ -Presler
+ -Smithfiled
+ Xeons (Intel Xeon, Xeon MP, Xeon LV, Xeon MV) corename:
+ -Foster
+ -Prestonia
+ -Gallatin
+ -Nocona
+ -Irwindale
+ -Cranford
+ -Potomac
+ -Paxville
+ -Dempsey
+
+
+config MK6
+ bool "K6/K6-II/K6-III"
+ depends on X86_32
+ ---help---
+ Select this for an AMD K6-family processor. Enables use of
+ some extended instructions, and passes appropriate optimization
+ flags to GCC.
+
+config MK7
+ bool "Athlon/Duron/K7"
+ depends on X86_32
+ ---help---
+ Select this for an AMD Athlon K7-family processor. Enables use of
+ some extended instructions, and passes appropriate optimization
+ flags to GCC.
+
+config MK8
+ bool "Opteron/Athlon64/Hammer/K8"
+ ---help---
+ Select this for an AMD Opteron or Athlon64 Hammer-family processor.
+ Enables use of some extended instructions, and passes appropriate
+ optimization flags to GCC.
+
+config MCRUSOE
+ bool "Crusoe"
+ depends on X86_32
+ ---help---
+ Select this for a Transmeta Crusoe processor. Treats the processor
+ like a 586 with TSC, and sets some GCC optimization flags (like a
+ Pentium Pro with no alignment requirements).
+
+config MEFFICEON
+ bool "Efficeon"
+ depends on X86_32
+ ---help---
+ Select this for a Transmeta Efficeon processor.
+
+config MWINCHIPC6
+ bool "Winchip-C6"
+ depends on X86_32
+ ---help---
+ Select this for an IDT Winchip C6 chip. Linux and GCC
+ treat this chip as a 586TSC with some extended instructions
+ and alignment requirements.
+
+config MWINCHIP3D
+ bool "Winchip-2/Winchip-2A/Winchip-3"
+ depends on X86_32
+ ---help---
+ Select this for an IDT Winchip-2, 2A or 3. Linux and GCC
+ treat this chip as a 586TSC with some extended instructions
+ and alignment requirements. Also enable out of order memory
+ stores for this CPU, which can increase performance of some
+ operations.
+
+config MELAN
+ bool "AMD Elan"
+ depends on X86_32
+ ---help---
+ Select this for an AMD Elan processor.
+
+ Do not use this option for K6/Athlon/Opteron processors!
+
+config MGEODEGX1
+ bool "GeodeGX1"
+ depends on X86_32
+ ---help---
+ Select this for a Geode GX1 (Cyrix MediaGX) chip.
+
+config MGEODE_LX
+ bool "Geode GX/LX"
+ depends on X86_32
+ ---help---
+ Select this for AMD Geode GX and LX processors.
+
+config MCYRIXIII
+ bool "CyrixIII/VIA-C3"
+ depends on X86_32
+ ---help---
+ Select this for a Cyrix III or C3 chip. Presently Linux and GCC
+ treat this chip as a generic 586. Whilst the CPU is 686 class,
+ it lacks the cmov extension which gcc assumes is present when
+ generating 686 code.
+ Note that Nehemiah (Model 9) and above will not boot with this
+ kernel due to them lacking the 3DNow! instructions used in earlier
+ incarnations of the CPU.
+
+config MVIAC3_2
+ bool "VIA C3-2 (Nehemiah)"
+ depends on X86_32
+ ---help---
+ Select this for a VIA C3 "Nehemiah". Selecting this enables usage
+ of SSE and tells gcc to treat the CPU as a 686.
+ Note, this kernel will not boot on older (pre model 9) C3s.
+
+config MVIAC7
+ bool "VIA C7"
+ depends on X86_32
+ ---help---
+ Select this for a VIA C7. Selecting this uses the correct cache
+ shift and tells gcc to treat the CPU as a 686.
+
+config MPSC
+ bool "Intel P4 / older Netburst based Xeon"
+ depends on X86_64
+ ---help---
+ Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey
+ Xeon CPUs with Intel 64bit which is compatible with x86-64.
+ Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the
+ Netburst core and shouldn't use this option. You can distinguish them
+ using the cpu family field
+ in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one.
+
+config MCORE2
+ bool "Core 2/newer Xeon"
+ ---help---
+
+ Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and
+ 53xx) CPUs. You can distinguish newer from older Xeons by the CPU
+ family in /proc/cpuinfo. Newer ones have 6 and older ones 15
+ (not a typo)
+
+config MATOM
+ bool "Intel Atom"
+ ---help---
+
+ Select this for the Intel Atom platform. Intel Atom CPUs have an
+ in-order pipelining architecture and thus can benefit from
+ accordingly optimized code. Use a recent GCC with specific Atom
+ support in order to fully benefit from selecting this option.
+
+config GENERIC_CPU
+ bool "Generic-x86-64"
+ depends on X86_64
+ ---help---
+ Generic x86-64 CPU.
+ Run equally well on all x86-64 CPUs.
+
+endchoice
+
+config X86_GENERIC
+ bool "Generic x86 support"
+ depends on X86_32
+ ---help---
+ Instead of just including optimizations for the selected
+ x86 variant (e.g. PII, Crusoe or Athlon), include some more
+ generic optimizations as well. This will make the kernel
+ perform better on x86 CPUs other than that selected.
+
+ This is really intended for distributors who need more
+ generic optimizations.
+
+#
+# Define implied options from the CPU selection here
+config X86_INTERNODE_CACHE_SHIFT
+ int
+ default "12" if X86_VSMP
+ default X86_L1_CACHE_SHIFT
+
+config X86_L1_CACHE_SHIFT
+ int
+ default "7" if MPENTIUM4 || MPSC
+ default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
+ default "4" if MELAN || M486 || MGEODEGX1
+ default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
+
+config X86_F00F_BUG
+ def_bool y
+ depends on M586MMX || M586TSC || M586 || M486
+
+config X86_INVD_BUG
+ def_bool y
+ depends on M486
+
+config X86_ALIGNMENT_16
+ def_bool y
+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
+
+config X86_INTEL_USERCOPY
+ def_bool y
+ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
+
+config X86_USE_PPRO_CHECKSUM
+ def_bool y
+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
+
+config X86_USE_3DNOW
+ def_bool y
+ depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML
+
+#
+# P6_NOPs are a relatively minor optimization that require a family >=
+# 6 processor, except that it is broken on certain VIA chips.
+# Furthermore, AMD chips prefer a totally different sequence of NOPs
+# (which work on all CPUs). In addition, it looks like Virtual PC
+# does not understand them.
+#
+# As a result, disallow these if we're not compiling for X86_64 (these
+# NOPs do work on all x86-64 capable chips); the list of processors in
+# the right-hand clause are the cores that benefit from this optimization.
+#
+config X86_P6_NOP
+ def_bool y
+ depends on X86_64
+ depends on (MCORE2 || MPENTIUM4 || MPSC)
+
+config X86_TSC
+ def_bool y
+ depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64
+
+config X86_CMPXCHG64
+ def_bool y
+ depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8
+
+# this should be set for all -march=.. options where the compiler
+# generates cmov.
+config X86_CMOV
+ def_bool y
+ depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX)
+
+config X86_MINIMUM_CPU_FAMILY
+ int
+ default "64" if X86_64
+ default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8)
+ default "5" if X86_32 && X86_CMPXCHG64
+ default "4"
+
+config X86_DEBUGCTLMSR
+ def_bool y
+ depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486) && !UML
+
+menuconfig PROCESSOR_SELECT
+ bool "Supported processor vendors" if EXPERT
+ ---help---
+ This lets you choose what x86 vendor support code your kernel
+ will include.
+
+config CPU_SUP_INTEL
+ default y
+ bool "Support Intel processors" if PROCESSOR_SELECT
+ ---help---
+ This enables detection, tunings and quirks for Intel processors
+
+ You need this enabled if you want your kernel to run on an
+ Intel CPU. Disabling this option on other types of CPUs
+ makes the kernel a tiny bit smaller. Disabling it on an Intel
+ CPU might render the kernel unbootable.
+
+ If unsure, say N.
+
+config CPU_SUP_CYRIX_32
+ default y
+ bool "Support Cyrix processors" if PROCESSOR_SELECT
+ depends on M486 || M586 || M586TSC || M586MMX || (EXPERT && !64BIT)
+ ---help---
+ This enables detection, tunings and quirks for Cyrix processors
+
+ You need this enabled if you want your kernel to run on a
+ Cyrix CPU. Disabling this option on other types of CPUs
+ makes the kernel a tiny bit smaller. Disabling it on a Cyrix
+ CPU might render the kernel unbootable.
+
+ If unsure, say N.
+
+config CPU_SUP_AMD
+ default y
+ bool "Support AMD processors" if PROCESSOR_SELECT
+ ---help---
+ This enables detection, tunings and quirks for AMD processors
+
+ You need this enabled if you want your kernel to run on an
+ AMD CPU. Disabling this option on other types of CPUs
+ makes the kernel a tiny bit smaller. Disabling it on an AMD
+ CPU might render the kernel unbootable.
+
+ If unsure, say N.
+
+config CPU_SUP_CENTAUR
+ default y
+ bool "Support Centaur processors" if PROCESSOR_SELECT
+ ---help---
+ This enables detection, tunings and quirks for Centaur processors
+
+ You need this enabled if you want your kernel to run on a
+ Centaur CPU. Disabling this option on other types of CPUs
+ makes the kernel a tiny bit smaller. Disabling it on a Centaur
+ CPU might render the kernel unbootable.
+
+ If unsure, say N.
+
+config CPU_SUP_TRANSMETA_32
+ default y
+ bool "Support Transmeta processors" if PROCESSOR_SELECT
+ depends on !64BIT
+ ---help---
+ This enables detection, tunings and quirks for Transmeta processors
+
+ You need this enabled if you want your kernel to run on a
+ Transmeta CPU. Disabling this option on other types of CPUs
+ makes the kernel a tiny bit smaller. Disabling it on a Transmeta
+ CPU might render the kernel unbootable.
+
+ If unsure, say N.
+
+config CPU_SUP_UMC_32
+ default y
+ bool "Support UMC processors" if PROCESSOR_SELECT
+ depends on M486 || (EXPERT && !64BIT)
+ ---help---
+ This enables detection, tunings and quirks for UMC processors
+
+ You need this enabled if you want your kernel to run on a
+ UMC CPU. Disabling this option on other types of CPUs
+ makes the kernel a tiny bit smaller. Disabling it on a UMC
+ CPU might render the kernel unbootable.
+
+ If unsure, say N.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
new file mode 100644
index 000000000..687cd1a21
--- /dev/null
+++ b/arch/x86/Kconfig.debug
@@ -0,0 +1,409 @@
+# SPDX-License-Identifier: GPL-2.0
+
+config TRACE_IRQFLAGS_SUPPORT
+ def_bool y
+
+config EARLY_PRINTK_USB
+ bool
+
+config X86_VERBOSE_BOOTUP
+ bool "Enable verbose x86 bootup info messages"
+ default y
+ ---help---
+ Enables the informational output from the decompression stage
+ (e.g. bzImage) of the boot. If you disable this you will still
+ see errors. Disable this if you want silent bootup.
+
+config EARLY_PRINTK
+ bool "Early printk" if EXPERT
+ default y
+ ---help---
+ Write kernel log output directly into the VGA buffer or to a serial
+ port.
+
+ This is useful for kernel debugging when your machine crashes very
+ early before the console code is initialized. For normal operation
+ it is not recommended because it looks ugly and doesn't cooperate
+ with klogd/syslogd or the X server. You should normally say N here,
+ unless you want to debug such a crash.
+
+config EARLY_PRINTK_DBGP
+ bool "Early printk via EHCI debug port"
+ depends on EARLY_PRINTK && PCI
+ select EARLY_PRINTK_USB
+ ---help---
+ Write kernel log output directly into the EHCI debug port.
+
+ This is useful for kernel debugging when your machine crashes very
+ early before the console code is initialized. For normal operation
+ it is not recommended because it looks ugly and doesn't cooperate
+ with klogd/syslogd or the X server. You should normally say N here,
+ unless you want to debug such a crash. You need usb debug device.
+
+config EARLY_PRINTK_EFI
+ bool "Early printk via the EFI framebuffer"
+ depends on EFI && EARLY_PRINTK
+ select FONT_SUPPORT
+ ---help---
+ Write kernel log output directly into the EFI framebuffer.
+
+ This is useful for kernel debugging when your machine crashes very
+ early before the console code is initialized.
+
+config EARLY_PRINTK_USB_XDBC
+ bool "Early printk via the xHCI debug port"
+ depends on EARLY_PRINTK && PCI
+ select EARLY_PRINTK_USB
+ ---help---
+ Write kernel log output directly into the xHCI debug port.
+
+ One use for this feature is kernel debugging, for example when your
+ machine crashes very early before the regular console code is
+ initialized. Other uses include simpler, lockless logging instead of
+ a full-blown printk console driver + klogd.
+
+ For normal production environments this is normally not recommended,
+ because it doesn't feed events into klogd/syslogd and doesn't try to
+ print anything on the screen.
+
+ You should normally say N here, unless you want to debug early
+ crashes or need a very simple printk logging facility.
+
+config MCSAFE_TEST
+ def_bool n
+
+config X86_PTDUMP_CORE
+ def_bool n
+
+config X86_PTDUMP
+ tristate "Export kernel pagetable layout to userspace via debugfs"
+ depends on DEBUG_KERNEL
+ select DEBUG_FS
+ select X86_PTDUMP_CORE
+ ---help---
+ Say Y here if you want to show the kernel pagetable layout in a
+ debugfs file. This information is only useful for kernel developers
+ who are working in architecture specific areas of the kernel.
+ It is probably not a good idea to enable this feature in a production
+ kernel.
+ If in doubt, say "N"
+
+config EFI_PGT_DUMP
+ bool "Dump the EFI pagetable"
+ depends on EFI
+ select X86_PTDUMP_CORE
+ ---help---
+ Enable this if you want to dump the EFI page table before
+ enabling virtual mode. This can be used to debug miscellaneous
+ issues with the mapping of the EFI runtime regions into that
+ table.
+
+config DEBUG_WX
+ bool "Warn on W+X mappings at boot"
+ select X86_PTDUMP_CORE
+ ---help---
+ Generate a warning if any W+X mappings are found at boot.
+
+ This is useful for discovering cases where the kernel is leaving
+ W+X mappings after applying NX, as such mappings are a security risk.
+
+ Look for a message in dmesg output like this:
+
+ x86/mm: Checked W+X mappings: passed, no W+X pages found.
+
+ or like this, if the check failed:
+
+ x86/mm: Checked W+X mappings: FAILED, <N> W+X pages found.
+
+ Note that even if the check fails, your kernel is possibly
+ still fine, as W+X mappings are not a security hole in
+ themselves, what they do is that they make the exploitation
+ of other unfixed kernel bugs easier.
+
+ There is no runtime or memory usage effect of this option
+ once the kernel has booted up - it's a one time check.
+
+ If in doubt, say "Y".
+
+config DOUBLEFAULT
+ default y
+ bool "Enable doublefault exception handler" if EXPERT
+ ---help---
+ This option allows trapping of rare doublefault exceptions that
+ would otherwise cause a system to silently reboot. Disabling this
+ option saves about 4k and might cause you much additional grey
+ hair.
+
+config DEBUG_TLBFLUSH
+ bool "Set upper limit of TLB entries to flush one-by-one"
+ depends on DEBUG_KERNEL
+ ---help---
+
+ X86-only for now.
+
+ This option allows the user to tune the amount of TLB entries the
+ kernel flushes one-by-one instead of doing a full TLB flush. In
+ certain situations, the former is cheaper. This is controlled by the
+ tlb_flushall_shift knob under /sys/kernel/debug/x86. If you set it
+ to -1, the code flushes the whole TLB unconditionally. Otherwise,
+ for positive values of it, the kernel will use single TLB entry
+ invalidating instructions according to the following formula:
+
+ flush_entries <= active_tlb_entries / 2^tlb_flushall_shift
+
+ If in doubt, say "N".
+
+config IOMMU_DEBUG
+ bool "Enable IOMMU debugging"
+ depends on GART_IOMMU && DEBUG_KERNEL
+ depends on X86_64
+ ---help---
+ Force the IOMMU to on even when you have less than 4GB of
+ memory and add debugging code. On overflow always panic. And
+ allow to enable IOMMU leak tracing. Can be disabled at boot
+ time with iommu=noforce. This will also enable scatter gather
+ list merging. Currently not recommended for production
+ code. When you use it make sure you have a big enough
+ IOMMU/AGP aperture. Most of the options enabled by this can
+ be set more finegrained using the iommu= command line
+ options. See Documentation/x86/x86_64/boot-options.txt for more
+ details.
+
+config IOMMU_LEAK
+ bool "IOMMU leak tracing"
+ depends on IOMMU_DEBUG && DMA_API_DEBUG
+ ---help---
+ Add a simple leak tracer to the IOMMU code. This is useful when you
+ are debugging a buggy device driver that leaks IOMMU mappings.
+
+config HAVE_MMIOTRACE_SUPPORT
+ def_bool y
+
+config X86_DECODER_SELFTEST
+ bool "x86 instruction decoder selftest"
+ depends on DEBUG_KERNEL && INSTRUCTION_DECODER
+ depends on !COMPILE_TEST
+ ---help---
+ Perform x86 instruction decoder selftests at build time.
+ This option is useful for checking the sanity of x86 instruction
+ decoder code.
+ If unsure, say "N".
+
+#
+# IO delay types:
+#
+
+config IO_DELAY_TYPE_0X80
+ int
+ default "0"
+
+config IO_DELAY_TYPE_0XED
+ int
+ default "1"
+
+config IO_DELAY_TYPE_UDELAY
+ int
+ default "2"
+
+config IO_DELAY_TYPE_NONE
+ int
+ default "3"
+
+choice
+ prompt "IO delay type"
+ default IO_DELAY_0X80
+
+config IO_DELAY_0X80
+ bool "port 0x80 based port-IO delay [recommended]"
+ ---help---
+ This is the traditional Linux IO delay used for in/out_p.
+ It is the most tested hence safest selection here.
+
+config IO_DELAY_0XED
+ bool "port 0xed based port-IO delay"
+ ---help---
+ Use port 0xed as the IO delay. This frees up port 0x80 which is
+ often used as a hardware-debug port.
+
+config IO_DELAY_UDELAY
+ bool "udelay based port-IO delay"
+ ---help---
+ Use udelay(2) as the IO delay method. This provides the delay
+ while not having any side-effect on the IO port space.
+
+config IO_DELAY_NONE
+ bool "no port-IO delay"
+ ---help---
+ No port-IO delay. Will break on old boxes that require port-IO
+ delay for certain operations. Should work on most new machines.
+
+endchoice
+
+if IO_DELAY_0X80
+config DEFAULT_IO_DELAY_TYPE
+ int
+ default IO_DELAY_TYPE_0X80
+endif
+
+if IO_DELAY_0XED
+config DEFAULT_IO_DELAY_TYPE
+ int
+ default IO_DELAY_TYPE_0XED
+endif
+
+if IO_DELAY_UDELAY
+config DEFAULT_IO_DELAY_TYPE
+ int
+ default IO_DELAY_TYPE_UDELAY
+endif
+
+if IO_DELAY_NONE
+config DEFAULT_IO_DELAY_TYPE
+ int
+ default IO_DELAY_TYPE_NONE
+endif
+
+config DEBUG_BOOT_PARAMS
+ bool "Debug boot parameters"
+ depends on DEBUG_KERNEL
+ depends on DEBUG_FS
+ ---help---
+ This option will cause struct boot_params to be exported via debugfs.
+
+config CPA_DEBUG
+ bool "CPA self-test code"
+ depends on DEBUG_KERNEL
+ ---help---
+ Do change_page_attr() self-tests every 30 seconds.
+
+config OPTIMIZE_INLINING
+ bool "Allow gcc to uninline functions marked 'inline'"
+ ---help---
+ This option determines if the kernel forces gcc to inline the functions
+ developers have marked 'inline'. Doing so takes away freedom from gcc to
+ do what it thinks is best, which is desirable for the gcc 3.x series of
+ compilers. The gcc 4.x series have a rewritten inlining algorithm and
+ enabling this option will generate a smaller kernel there. Hopefully
+ this algorithm is so good that allowing gcc 4.x and above to make the
+ decision will become the default in the future. Until then this option
+ is there to test gcc for this.
+
+ If unsure, say N.
+
+config DEBUG_ENTRY
+ bool "Debug low-level entry code"
+ depends on DEBUG_KERNEL
+ ---help---
+ This option enables sanity checks in x86's low-level entry code.
+ Some of these sanity checks may slow down kernel entries and
+ exits or otherwise impact performance.
+
+ If unsure, say N.
+
+config DEBUG_NMI_SELFTEST
+ bool "NMI Selftest"
+ depends on DEBUG_KERNEL && X86_LOCAL_APIC
+ ---help---
+ Enabling this option turns on a quick NMI selftest to verify
+ that the NMI behaves correctly.
+
+ This might help diagnose strange hangs that rely on NMI to
+ function properly.
+
+ If unsure, say N.
+
+config DEBUG_IMR_SELFTEST
+ bool "Isolated Memory Region self test"
+ default n
+ depends on INTEL_IMR
+ ---help---
+ This option enables automated sanity testing of the IMR code.
+ Some simple tests are run to verify IMR bounds checking, alignment
+ and overlapping. This option is really only useful if you are
+ debugging an IMR memory map or are modifying the IMR code and want to
+ test your changes.
+
+ If unsure say N here.
+
+config X86_DEBUG_FPU
+ bool "Debug the x86 FPU code"
+ depends on DEBUG_KERNEL
+ default y
+ ---help---
+ If this option is enabled then there will be extra sanity
+ checks and (boot time) debug printouts added to the kernel.
+ This debugging adds some small amount of runtime overhead
+ to the kernel.
+
+ If unsure, say N.
+
+config PUNIT_ATOM_DEBUG
+ tristate "ATOM Punit debug driver"
+ depends on PCI
+ select DEBUG_FS
+ select IOSF_MBI
+ ---help---
+ This is a debug driver, which gets the power states
+ of all Punit North Complex devices. The power states of
+ each device is exposed as part of the debugfs interface.
+ The current power state can be read from
+ /sys/kernel/debug/punit_atom/dev_power_state
+
+choice
+ prompt "Choose kernel unwinder"
+ default UNWINDER_ORC if X86_64
+ default UNWINDER_FRAME_POINTER if X86_32
+ ---help---
+ This determines which method will be used for unwinding kernel stack
+ traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack,
+ livepatch, lockdep, and more.
+
+config UNWINDER_ORC
+ bool "ORC unwinder"
+ depends on X86_64
+ select STACK_VALIDATION
+ ---help---
+ This option enables the ORC (Oops Rewind Capability) unwinder for
+ unwinding kernel stack traces. It uses a custom data format which is
+ a simplified version of the DWARF Call Frame Information standard.
+
+ This unwinder is more accurate across interrupt entry frames than the
+ frame pointer unwinder. It also enables a 5-10% performance
+ improvement across the entire kernel compared to frame pointers.
+
+ Enabling this option will increase the kernel's runtime memory usage
+ by roughly 2-4MB, depending on your kernel config.
+
+config UNWINDER_FRAME_POINTER
+ bool "Frame pointer unwinder"
+ select FRAME_POINTER
+ ---help---
+ This option enables the frame pointer unwinder for unwinding kernel
+ stack traces.
+
+ The unwinder itself is fast and it uses less RAM than the ORC
+ unwinder, but the kernel text size will grow by ~3% and the kernel's
+ overall performance will degrade by roughly 5-10%.
+
+ This option is recommended if you want to use the livepatch
+ consistency model, as this is currently the only way to get a
+ reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
+
+config UNWINDER_GUESS
+ bool "Guess unwinder"
+ depends on EXPERT
+ depends on !STACKDEPOT
+ ---help---
+ This option enables the "guess" unwinder for unwinding kernel stack
+ traces. It scans the stack and reports every kernel text address it
+ finds. Some of the addresses it reports may be incorrect.
+
+ While this option often produces false positives, it can still be
+ useful in many cases. Unlike the other unwinders, it has no runtime
+ overhead.
+
+endchoice
+
+config FRAME_POINTER
+ depends on !UNWINDER_ORC && !UNWINDER_GUESS
+ bool
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
new file mode 100644
index 000000000..65a8722e7
--- /dev/null
+++ b/arch/x86/Makefile
@@ -0,0 +1,344 @@
+# SPDX-License-Identifier: GPL-2.0
+# Unified Makefile for i386 and x86_64
+
+# select defconfig based on actual architecture
+ifeq ($(ARCH),x86)
+ ifeq ($(shell uname -m),x86_64)
+ KBUILD_DEFCONFIG := x86_64_defconfig
+ else
+ KBUILD_DEFCONFIG := i386_defconfig
+ endif
+else
+ KBUILD_DEFCONFIG := $(ARCH)_defconfig
+endif
+
+# For gcc stack alignment is specified with -mpreferred-stack-boundary,
+# clang has the option -mstack-alignment for that purpose.
+ifneq ($(call cc-option, -mpreferred-stack-boundary=4),)
+ cc_stack_align4 := -mpreferred-stack-boundary=2
+ cc_stack_align8 := -mpreferred-stack-boundary=3
+else ifneq ($(call cc-option, -mstack-alignment=16),)
+ cc_stack_align4 := -mstack-alignment=4
+ cc_stack_align8 := -mstack-alignment=8
+endif
+
+# How to compile the 16-bit code. Note we always compile for -march=i386;
+# that way we can complain to the user if the CPU is insufficient.
+#
+# The -m16 option is supported by GCC >= 4.9 and clang >= 3.5. For
+# older versions of GCC, include an *assembly* header to make sure that
+# gcc doesn't play any games behind our back.
+CODE16GCC_CFLAGS := -m32 -Wa,$(srctree)/arch/x86/boot/code16gcc.h
+M16_CFLAGS := $(call cc-option, -m16, $(CODE16GCC_CFLAGS))
+
+REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -DDISABLE_BRANCH_PROFILING \
+ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
+ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \
+ -mno-mmx -mno-sse $(call cc-option,-fcf-protection=none)
+
+REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding)
+REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector)
+REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -Wno-address-of-packed-member)
+REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4))
+REALMODE_CFLAGS += $(CLANG_FLAGS)
+export REALMODE_CFLAGS
+
+# BITS is used as extension for files which are available in a 32 bit
+# and a 64 bit version to simplify shared Makefiles.
+# e.g.: obj-y += foo_$(BITS).o
+export BITS
+
+ifdef CONFIG_X86_NEED_RELOCS
+ LDFLAGS_vmlinux := --emit-relocs --discard-none
+endif
+
+#
+# Prevent GCC from generating any FP code by mistake.
+#
+# This must happen before we try the -mpreferred-stack-boundary, see:
+#
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
+#
+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
+KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
+
+# Intel CET isn't enabled in the kernel
+KBUILD_CFLAGS += $(call cc-option,-fcf-protection=none)
+
+ifeq ($(CONFIG_X86_32),y)
+ BITS := 32
+ UTS_MACHINE := i386
+ CHECKFLAGS += -D__i386__
+
+ biarch := $(call cc-option,-m32)
+ KBUILD_AFLAGS += $(biarch)
+ KBUILD_CFLAGS += $(biarch)
+
+ KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return
+
+ # Never want PIC in a 32-bit kernel, prevent breakage with GCC built
+ # with nonstandard options
+ KBUILD_CFLAGS += -fno-pic
+
+ # Align the stack to the register width instead of using the default
+ # alignment of 16 bytes. This reduces stack usage and the number of
+ # alignment instructions.
+ KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align4))
+
+ # CPU-specific tuning. Anything which can be shared with UML should go here.
+ include arch/x86/Makefile_32.cpu
+ KBUILD_CFLAGS += $(cflags-y)
+
+ # temporary until string.h is fixed
+ KBUILD_CFLAGS += -ffreestanding
+else
+ BITS := 64
+ UTS_MACHINE := x86_64
+ CHECKFLAGS += -D__x86_64__
+
+ biarch := -m64
+ KBUILD_AFLAGS += -m64
+ KBUILD_CFLAGS += -m64
+
+ # Align jump targets to 1 byte, not the default 16 bytes:
+ KBUILD_CFLAGS += $(call cc-option,-falign-jumps=1)
+
+ # Pack loops tightly as well:
+ KBUILD_CFLAGS += $(call cc-option,-falign-loops=1)
+
+ # Don't autogenerate traditional x87 instructions
+ KBUILD_CFLAGS += $(call cc-option,-mno-80387)
+ KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387)
+
+ # By default gcc and clang use a stack alignment of 16 bytes for x86.
+ # However the standard kernel entry on x86-64 leaves the stack on an
+ # 8-byte boundary. If the compiler isn't informed about the actual
+ # alignment it will generate extra alignment instructions for the
+ # default alignment which keep the stack *mis*aligned.
+ # Furthermore an alignment to the register width reduces stack usage
+ # and the number of alignment instructions.
+ KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align8))
+
+ # Use -mskip-rax-setup if supported.
+ KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup)
+
+ # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
+ cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
+ cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
+
+ cflags-$(CONFIG_MCORE2) += \
+ $(call cc-option,-march=core2,$(call cc-option,-mtune=generic))
+ cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \
+ $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
+ cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
+ KBUILD_CFLAGS += $(cflags-y)
+
+ KBUILD_CFLAGS += -mno-red-zone
+ KBUILD_CFLAGS += -mcmodel=kernel
+
+ # -funit-at-a-time shrinks the kernel .text considerably
+ # unfortunately it makes reading oopses harder.
+ KBUILD_CFLAGS += $(call cc-option,-funit-at-a-time)
+endif
+
+ifdef CONFIG_X86_X32
+ x32_ld_ok := $(call try-run,\
+ /bin/echo -e '1: .quad 1b' | \
+ $(CC) $(KBUILD_AFLAGS) -c -x assembler -o "$$TMP" - && \
+ $(OBJCOPY) -O elf32-x86-64 "$$TMP" "$$TMPO" && \
+ $(LD) -m elf32_x86_64 "$$TMPO" -o "$$TMP",y,n)
+ ifeq ($(x32_ld_ok),y)
+ CONFIG_X86_X32_ABI := y
+ KBUILD_AFLAGS += -DCONFIG_X86_X32_ABI
+ KBUILD_CFLAGS += -DCONFIG_X86_X32_ABI
+ else
+ $(warning CONFIG_X86_X32 enabled but no binutils support)
+ endif
+endif
+export CONFIG_X86_X32_ABI
+
+#
+# If the function graph tracer is used with mcount instead of fentry,
+# '-maccumulate-outgoing-args' is needed to prevent a GCC bug
+# (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=42109)
+#
+ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ ifndef CONFIG_HAVE_FENTRY
+ ACCUMULATE_OUTGOING_ARGS := 1
+ else
+ ifeq ($(call cc-option-yn, -mfentry), n)
+ ACCUMULATE_OUTGOING_ARGS := 1
+
+ # GCC ignores '-maccumulate-outgoing-args' when used with '-Os'.
+ # If '-Os' is enabled, disable it and print a warning.
+ ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
+ undefine CONFIG_CC_OPTIMIZE_FOR_SIZE
+ $(warning Disabling CONFIG_CC_OPTIMIZE_FOR_SIZE. Your compiler does not have -mfentry so you cannot optimize for size with CONFIG_FUNCTION_GRAPH_TRACER.)
+ endif
+
+ endif
+ endif
+endif
+
+ifeq ($(ACCUMULATE_OUTGOING_ARGS), 1)
+ # This compiler flag is not supported by Clang:
+ KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args,)
+endif
+
+# Stackpointer is addressed different for 32 bit and 64 bit x86
+sp-$(CONFIG_X86_32) := esp
+sp-$(CONFIG_X86_64) := rsp
+
+# do binutils support CFI?
+cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1)
+# is .cfi_signal_frame supported too?
+cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
+cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
+
+# does binutils support specific instructions?
+asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
+asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
+avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
+avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
+avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
+sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
+sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
+
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
+
+KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE)
+
+#
+# The 64-bit kernel must be aligned to 2MB. Pass -z max-page-size=0x200000 to
+# the linker to force 2MB page size regardless of the default page size used
+# by the linker.
+#
+ifdef CONFIG_X86_64
+KBUILD_LDFLAGS += $(call ld-option, -z max-page-size=0x200000)
+endif
+
+# Speed up the build
+KBUILD_CFLAGS += -pipe
+# Workaround for a gcc prelease that unfortunately was shipped in a suse release
+KBUILD_CFLAGS += -Wno-sign-compare
+#
+KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
+
+# Avoid indirect branches in kernel to deal with Spectre
+ifdef CONFIG_RETPOLINE
+ KBUILD_CFLAGS += $(RETPOLINE_CFLAGS)
+ # Additionally, avoid generating expensive indirect jumps which
+ # are subject to retpolines for small number of switch cases.
+ # clang turns off jump table generation by default when under
+ # retpoline builds, however, gcc does not for x86. This has
+ # only been fixed starting from gcc stable version 8.4.0 and
+ # onwards, but not for older ones. See gcc bug #86952.
+ ifndef CONFIG_CC_IS_CLANG
+ KBUILD_CFLAGS += $(call cc-option,-fno-jump-tables)
+ endif
+endif
+
+archscripts: scripts_basic
+ $(Q)$(MAKE) $(build)=arch/x86/tools relocs
+
+###
+# Syscall table generation
+
+archheaders:
+ $(Q)$(MAKE) $(build)=arch/x86/entry/syscalls all
+
+###
+# Kernel objects
+
+head-y := arch/x86/kernel/head_$(BITS).o
+head-y += arch/x86/kernel/head$(BITS).o
+head-y += arch/x86/kernel/ebda.o
+head-y += arch/x86/kernel/platform-quirks.o
+
+libs-y += arch/x86/lib/
+
+# See arch/x86/Kbuild for content of core part of the kernel
+core-y += arch/x86/
+
+# drivers-y are linked after core-y
+drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
+drivers-$(CONFIG_PCI) += arch/x86/pci/
+
+# must be linked after kernel/
+drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/
+
+# suspend and hibernation support
+drivers-$(CONFIG_PM) += arch/x86/power/
+
+drivers-$(CONFIG_FB) += arch/x86/video/
+
+####
+# boot loader support. Several targets are kept for legacy purposes
+
+boot := arch/x86/boot
+
+BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage
+
+PHONY += bzImage $(BOOT_TARGETS)
+
+# Default kernel to build
+all: bzImage
+
+# KBUILD_IMAGE specify target image being built
+KBUILD_IMAGE := $(boot)/bzImage
+
+bzImage: vmlinux
+ifeq ($(CONFIG_X86_DECODER_SELFTEST),y)
+ $(Q)$(MAKE) $(build)=arch/x86/tools posttest
+endif
+ $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
+ $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
+ $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
+
+$(BOOT_TARGETS): vmlinux
+ $(Q)$(MAKE) $(build)=$(boot) $@
+
+PHONY += install
+install:
+ $(Q)$(MAKE) $(build)=$(boot) $@
+
+PHONY += vdso_install
+vdso_install:
+ $(Q)$(MAKE) $(build)=arch/x86/entry/vdso $@
+
+archprepare: checkbin
+checkbin:
+ifndef CONFIG_CC_HAS_ASM_GOTO
+ @echo Compiler lacks asm-goto support.
+ @exit 1
+endif
+ifdef CONFIG_RETPOLINE
+ifeq ($(RETPOLINE_CFLAGS),)
+ @echo "You are building kernel with non-retpoline compiler." >&2
+ @echo "Please update your compiler." >&2
+ @false
+endif
+endif
+
+archclean:
+ $(Q)rm -rf $(objtree)/arch/i386
+ $(Q)rm -rf $(objtree)/arch/x86_64
+ $(Q)$(MAKE) $(clean)=$(boot)
+ $(Q)$(MAKE) $(clean)=arch/x86/tools
+
+define archhelp
+ echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
+ echo ' install - Install kernel using'
+ echo ' (your) ~/bin/$(INSTALLKERNEL) or'
+ echo ' (distribution) /sbin/$(INSTALLKERNEL) or'
+ echo ' install to $$(INSTALL_PATH) and run lilo'
+ echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
+ echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
+ echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)'
+ echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)'
+ echo ' bzdisk/fdimage*/isoimage also accept:'
+ echo ' FDARGS="..." arguments for the booted kernel'
+ echo ' FDINITRD=file initrd for the booted kernel'
+endef
diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um
new file mode 100644
index 000000000..91085a08d
--- /dev/null
+++ b/arch/x86/Makefile.um
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: GPL-2.0
+core-y += arch/x86/crypto/
+
+ifeq ($(CONFIG_X86_32),y)
+START := 0x8048000
+
+KBUILD_LDFLAGS += -m elf_i386
+ELF_ARCH := i386
+ELF_FORMAT := elf32-i386
+CHECKFLAGS += -D__i386__
+
+KBUILD_CFLAGS += $(call cc-option,-m32)
+KBUILD_AFLAGS += $(call cc-option,-m32)
+LINK-y += $(call cc-option,-m32)
+
+LDS_EXTRA := -Ui386
+export LDS_EXTRA
+
+# First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y.
+include arch/x86/Makefile_32.cpu
+
+# prevent gcc from keeping the stack 16 byte aligned. Taken from i386.
+cflags-y += $(call cc-option,-mpreferred-stack-boundary=2)
+
+# Prevent sprintf in nfsd from being converted to strcpy and resulting in
+# an unresolved reference.
+cflags-y += -ffreestanding
+
+# Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
+# a lot more stack due to the lack of sharing of stacklots. Also, gcc
+# 4.3.0 needs -funit-at-a-time for extern inline functions.
+KBUILD_CFLAGS += $(shell if [ $(cc-version) -lt 0400 ] ; then \
+ echo $(call cc-option,-fno-unit-at-a-time); \
+ else echo $(call cc-option,-funit-at-a-time); fi ;)
+
+KBUILD_CFLAGS += $(cflags-y)
+
+else
+
+START := 0x60000000
+
+KBUILD_CFLAGS += -fno-builtin -m64
+
+CHECKFLAGS += -m64 -D__x86_64__
+KBUILD_AFLAGS += -m64
+KBUILD_LDFLAGS += -m elf_x86_64
+KBUILD_CPPFLAGS += -m64
+
+ELF_ARCH := i386:x86-64
+ELF_FORMAT := elf64-x86-64
+
+# Not on all 64-bit distros /lib is a symlink to /lib64. PLD is an example.
+
+LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib64
+LINK-y += -m64
+
+# Do unit-at-a-time unconditionally on x86_64, following the host
+KBUILD_CFLAGS += $(call cc-option,-funit-at-a-time)
+endif
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
new file mode 100644
index 000000000..1f5faf860
--- /dev/null
+++ b/arch/x86/Makefile_32.cpu
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: GPL-2.0
+# CPU tuning section - shared with UML.
+# Must change only cflags-y (or [yn]), not CFLAGS! That makes a difference for UML.
+
+#-mtune exists since gcc 3.4
+HAS_MTUNE := $(call cc-option-yn, -mtune=i386)
+ifeq ($(HAS_MTUNE),y)
+tune = $(call cc-option,-mtune=$(1),$(2))
+else
+tune = $(call cc-option,-mcpu=$(1),$(2))
+endif
+
+cflags-$(CONFIG_M486) += -march=i486
+cflags-$(CONFIG_M586) += -march=i586
+cflags-$(CONFIG_M586TSC) += -march=i586
+cflags-$(CONFIG_M586MMX) += -march=pentium-mmx
+cflags-$(CONFIG_M686) += -march=i686
+cflags-$(CONFIG_MPENTIUMII) += -march=i686 $(call tune,pentium2)
+cflags-$(CONFIG_MPENTIUMIII) += -march=i686 $(call tune,pentium3)
+cflags-$(CONFIG_MPENTIUMM) += -march=i686 $(call tune,pentium3)
+cflags-$(CONFIG_MPENTIUM4) += -march=i686 $(call tune,pentium4)
+cflags-$(CONFIG_MK6) += -march=k6
+# Please note, that patches that add -march=athlon-xp and friends are pointless.
+# They make zero difference whatsosever to performance at this time.
+cflags-$(CONFIG_MK7) += -march=athlon
+cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon)
+cflags-$(CONFIG_MCRUSOE) += -march=i686 -falign-functions=0 -falign-jumps=0 -falign-loops=0
+cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) -falign-functions=0 -falign-jumps=0 -falign-loops=0
+cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586)
+cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586)
+cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) -falign-functions=0 -falign-jumps=0 -falign-loops=0
+cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686)
+cflags-$(CONFIG_MVIAC7) += -march=i686
+cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2)
+cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \
+ $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
+
+# AMD Elan support
+cflags-$(CONFIG_MELAN) += -march=i486
+
+# Geode GX1 support
+cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
+cflags-$(CONFIG_MGEODE_LX) += $(call cc-option,-march=geode,-march=pentium-mmx)
+# add at the end to overwrite eventual tuning options from earlier
+# cpu entries
+cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686))
+
+# Bug fix for binutils: this option is required in order to keep
+# binutils from generating NOPL instructions against our will.
+ifneq ($(CONFIG_X86_P6_NOP),y)
+cflags-y += $(call cc-option,-Wa$(comma)-mtune=generic32,)
+endif
diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore
new file mode 100644
index 000000000..09d25dd09
--- /dev/null
+++ b/arch/x86/boot/.gitignore
@@ -0,0 +1,12 @@
+bootsect
+bzImage
+cpustr.h
+mkcpustr
+voffset.h
+zoffset.h
+setup
+setup.bin
+setup.elf
+fdimage
+mtools.conf
+image.iso
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
new file mode 100644
index 000000000..6539c50fb
--- /dev/null
+++ b/arch/x86/boot/Makefile
@@ -0,0 +1,156 @@
+#
+# arch/x86/boot/Makefile
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License. See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+# Copyright (C) 1994 by Linus Torvalds
+# Changed by many, many contributors over the years.
+#
+
+KASAN_SANITIZE := n
+OBJECT_FILES_NON_STANDARD := y
+
+# Kernel does not boot with kcov instrumentation here.
+# One of the problems observed was insertion of __sanitizer_cov_trace_pc()
+# callback into middle of per-cpu data enabling code. Thus the callback observed
+# inconsistent state and crashed. We are interested mostly in syscall coverage,
+# so boot code is not interesting anyway.
+KCOV_INSTRUMENT := n
+
+# If you want to preset the SVGA mode, uncomment the next line and
+# set SVGA_MODE to whatever number you want.
+# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
+# The number is the same as you would ordinarily press at bootup.
+
+SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
+
+targets := vmlinux.bin setup.bin setup.elf bzImage
+targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
+subdir- := compressed
+
+setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpuflags.o cpucheck.o
+setup-y += early_serial_console.o edd.o header.o main.o memory.o
+setup-y += pm.o pmjump.o printf.o regs.o string.o tty.o video.o
+setup-y += video-mode.o version.o
+setup-$(CONFIG_X86_APM_BOOT) += apm.o
+
+# The link order of the video-*.o modules can matter. In particular,
+# video-vga.o *must* be listed first, followed by video-vesa.o.
+# Hardware-specific drivers should follow in the order they should be
+# probed, and video-bios.o should typically be last.
+setup-y += video-vga.o
+setup-y += video-vesa.o
+setup-y += video-bios.o
+
+targets += $(setup-y)
+hostprogs-y := tools/build
+hostprogs-$(CONFIG_X86_FEATURE_NAMES) += mkcpustr
+
+HOST_EXTRACFLAGS += -I$(srctree)/tools/include \
+ -include include/generated/autoconf.h \
+ -D__EXPORTED_HEADERS__
+
+ifdef CONFIG_X86_FEATURE_NAMES
+$(obj)/cpu.o: $(obj)/cpustr.h
+
+quiet_cmd_cpustr = CPUSTR $@
+ cmd_cpustr = $(obj)/mkcpustr > $@
+targets += cpustr.h
+$(obj)/cpustr.h: $(obj)/mkcpustr FORCE
+ $(call if_changed,cpustr)
+endif
+clean-files += cpustr.h
+
+# ---------------------------------------------------------------------------
+
+KBUILD_CFLAGS := $(REALMODE_CFLAGS) -D_SETUP
+KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
+GCOV_PROFILE := n
+UBSAN_SANITIZE := n
+
+$(obj)/bzImage: asflags-y := $(SVGA_MODE)
+
+quiet_cmd_image = BUILD $@
+silent_redirect_image = >/dev/null
+cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \
+ $(obj)/zoffset.h $@ $($(quiet)redirect_image)
+
+$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE
+ $(call if_changed,image)
+ @$(kecho) 'Kernel: $@ is ready' ' (#'`cat .version`')'
+
+OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S
+$(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
+ $(call if_changed,objcopy)
+
+SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
+
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p'
+
+quiet_cmd_zoffset = ZOFFSET $@
+ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
+
+targets += zoffset.h
+$(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
+ $(call if_changed,zoffset)
+
+
+AFLAGS_header.o += -I$(objtree)/$(obj)
+$(obj)/header.o: $(obj)/zoffset.h
+
+LDFLAGS_setup.elf := -m elf_i386 -T
+$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
+ $(call if_changed,ld)
+
+OBJCOPYFLAGS_setup.bin := -O binary
+$(obj)/setup.bin: $(obj)/setup.elf FORCE
+ $(call if_changed,objcopy)
+
+$(obj)/compressed/vmlinux: FORCE
+ $(Q)$(MAKE) $(build)=$(obj)/compressed $@
+
+# Set this if you want to pass append arguments to the
+# bzdisk/fdimage/isoimage kernel
+FDARGS =
+# Set this if you want an initrd included with the
+# bzdisk/fdimage/isoimage kernel
+FDINITRD =
+
+image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,)
+
+$(obj)/mtools.conf: $(src)/mtools.conf.in
+ sed -e 's|@OBJ@|$(obj)|g' < $< > $@
+
+quiet_cmd_genimage = GENIMAGE $3
+cmd_genimage = sh $(srctree)/$(src)/genimage.sh $2 $3 $(obj)/bzImage \
+ $(obj)/mtools.conf '$(image_cmdline)' $(FDINITRD)
+
+# This requires write access to /dev/fd0
+bzdisk: $(obj)/bzImage $(obj)/mtools.conf
+ $(call cmd,genimage,bzdisk,/dev/fd0)
+
+# These require being root or having syslinux 2.02 or higher installed
+fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf
+ $(call cmd,genimage,fdimage144,$(obj)/fdimage)
+ @$(kecho) 'Kernel: $(obj)/fdimage is ready'
+
+fdimage288: $(obj)/bzImage $(obj)/mtools.conf
+ $(call cmd,genimage,fdimage288,$(obj)/fdimage)
+ @$(kecho) 'Kernel: $(obj)/fdimage is ready'
+
+isoimage: $(obj)/bzImage
+ $(call cmd,genimage,isoimage,$(obj)/image.iso)
+ @$(kecho) 'Kernel: $(obj)/image.iso is ready'
+
+bzlilo: $(obj)/bzImage
+ if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
+ if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi
+ cat $(obj)/bzImage > $(INSTALL_PATH)/vmlinuz
+ cp System.map $(INSTALL_PATH)/
+ if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
+
+install:
+ sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
+ System.map "$(INSTALL_PATH)"
diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
new file mode 100644
index 000000000..64a31a6d7
--- /dev/null
+++ b/arch/x86/boot/a20.c
@@ -0,0 +1,165 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007-2008 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Enable A20 gate (return -1 on failure)
+ */
+
+#include "boot.h"
+
+#define MAX_8042_LOOPS 100000
+#define MAX_8042_FF 32
+
+static int empty_8042(void)
+{
+ u8 status;
+ int loops = MAX_8042_LOOPS;
+ int ffs = MAX_8042_FF;
+
+ while (loops--) {
+ io_delay();
+
+ status = inb(0x64);
+ if (status == 0xff) {
+ /* FF is a plausible, but very unlikely status */
+ if (!--ffs)
+ return -1; /* Assume no KBC present */
+ }
+ if (status & 1) {
+ /* Read and discard input data */
+ io_delay();
+ (void)inb(0x60);
+ } else if (!(status & 2)) {
+ /* Buffers empty, finished! */
+ return 0;
+ }
+ }
+
+ return -1;
+}
+
+/* Returns nonzero if the A20 line is enabled. The memory address
+ used as a test is the int $0x80 vector, which should be safe. */
+
+#define A20_TEST_ADDR (4*0x80)
+#define A20_TEST_SHORT 32
+#define A20_TEST_LONG 2097152 /* 2^21 */
+
+static int a20_test(int loops)
+{
+ int ok = 0;
+ int saved, ctr;
+
+ set_fs(0x0000);
+ set_gs(0xffff);
+
+ saved = ctr = rdfs32(A20_TEST_ADDR);
+
+ while (loops--) {
+ wrfs32(++ctr, A20_TEST_ADDR);
+ io_delay(); /* Serialize and make delay constant */
+ ok = rdgs32(A20_TEST_ADDR+0x10) ^ ctr;
+ if (ok)
+ break;
+ }
+
+ wrfs32(saved, A20_TEST_ADDR);
+ return ok;
+}
+
+/* Quick test to see if A20 is already enabled */
+static int a20_test_short(void)
+{
+ return a20_test(A20_TEST_SHORT);
+}
+
+/* Longer test that actually waits for A20 to come on line; this
+ is useful when dealing with the KBC or other slow external circuitry. */
+static int a20_test_long(void)
+{
+ return a20_test(A20_TEST_LONG);
+}
+
+static void enable_a20_bios(void)
+{
+ struct biosregs ireg;
+
+ initregs(&ireg);
+ ireg.ax = 0x2401;
+ intcall(0x15, &ireg, NULL);
+}
+
+static void enable_a20_kbc(void)
+{
+ empty_8042();
+
+ outb(0xd1, 0x64); /* Command write */
+ empty_8042();
+
+ outb(0xdf, 0x60); /* A20 on */
+ empty_8042();
+
+ outb(0xff, 0x64); /* Null command, but UHCI wants it */
+ empty_8042();
+}
+
+static void enable_a20_fast(void)
+{
+ u8 port_a;
+
+ port_a = inb(0x92); /* Configuration port A */
+ port_a |= 0x02; /* Enable A20 */
+ port_a &= ~0x01; /* Do not reset machine */
+ outb(port_a, 0x92);
+}
+
+/*
+ * Actual routine to enable A20; return 0 on ok, -1 on failure
+ */
+
+#define A20_ENABLE_LOOPS 255 /* Number of times to try */
+
+int enable_a20(void)
+{
+ int loops = A20_ENABLE_LOOPS;
+ int kbc_err;
+
+ while (loops--) {
+ /* First, check to see if A20 is already enabled
+ (legacy free, etc.) */
+ if (a20_test_short())
+ return 0;
+
+ /* Next, try the BIOS (INT 0x15, AX=0x2401) */
+ enable_a20_bios();
+ if (a20_test_short())
+ return 0;
+
+ /* Try enabling A20 through the keyboard controller */
+ kbc_err = empty_8042();
+
+ if (a20_test_short())
+ return 0; /* BIOS worked, but with delayed reaction */
+
+ if (!kbc_err) {
+ enable_a20_kbc();
+ if (a20_test_long())
+ return 0;
+ }
+
+ /* Finally, try enabling the "fast A20 gate" */
+ enable_a20_fast();
+ if (a20_test_long())
+ return 0;
+ }
+
+ return -1;
+}
diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c
new file mode 100644
index 000000000..ee274834e
--- /dev/null
+++ b/arch/x86/boot/apm.c
@@ -0,0 +1,75 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ * Original APM BIOS checking by Stephen Rothwell, May 1994
+ * (sfr@canb.auug.org.au)
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Get APM BIOS information
+ */
+
+#include "boot.h"
+
+int query_apm_bios(void)
+{
+ struct biosregs ireg, oreg;
+
+ /* APM BIOS installation check */
+ initregs(&ireg);
+ ireg.ah = 0x53;
+ intcall(0x15, &ireg, &oreg);
+
+ if (oreg.flags & X86_EFLAGS_CF)
+ return -1; /* No APM BIOS */
+
+ if (oreg.bx != 0x504d) /* "PM" signature */
+ return -1;
+
+ if (!(oreg.cx & 0x02)) /* 32 bits supported? */
+ return -1;
+
+ /* Disconnect first, just in case */
+ ireg.al = 0x04;
+ intcall(0x15, &ireg, NULL);
+
+ /* 32-bit connect */
+ ireg.al = 0x03;
+ intcall(0x15, &ireg, &oreg);
+
+ boot_params.apm_bios_info.cseg = oreg.ax;
+ boot_params.apm_bios_info.offset = oreg.ebx;
+ boot_params.apm_bios_info.cseg_16 = oreg.cx;
+ boot_params.apm_bios_info.dseg = oreg.dx;
+ boot_params.apm_bios_info.cseg_len = oreg.si;
+ boot_params.apm_bios_info.cseg_16_len = oreg.hsi;
+ boot_params.apm_bios_info.dseg_len = oreg.di;
+
+ if (oreg.flags & X86_EFLAGS_CF)
+ return -1;
+
+ /* Redo the installation check as the 32-bit connect;
+ some BIOSes return different flags this way... */
+
+ ireg.al = 0x00;
+ intcall(0x15, &ireg, &oreg);
+
+ if ((oreg.eflags & X86_EFLAGS_CF) || oreg.bx != 0x504d) {
+ /* Failure with 32-bit connect, try to disconect and ignore */
+ ireg.al = 0x04;
+ intcall(0x15, &ireg, NULL);
+ return -1;
+ }
+
+ boot_params.apm_bios_info.version = oreg.ax;
+ boot_params.apm_bios_info.flags = oreg.cx;
+ return 0;
+}
+
diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
new file mode 100644
index 000000000..d401b4a26
--- /dev/null
+++ b/arch/x86/boot/bioscall.S
@@ -0,0 +1,82 @@
+/* -----------------------------------------------------------------------
+ *
+ * Copyright 2009-2014 Intel Corporation; author H. Peter Anvin
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2 or (at your
+ * option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * "Glove box" for BIOS calls. Avoids the constant problems with BIOSes
+ * touching registers they shouldn't be.
+ */
+
+ .code16
+ .section ".inittext","ax"
+ .globl intcall
+ .type intcall, @function
+intcall:
+ /* Self-modify the INT instruction. Ugly, but works. */
+ cmpb %al, 3f
+ je 1f
+ movb %al, 3f
+ jmp 1f /* Synchronize pipeline */
+1:
+ /* Save state */
+ pushfl
+ pushw %fs
+ pushw %gs
+ pushal
+
+ /* Copy input state to stack frame */
+ subw $44, %sp
+ movw %dx, %si
+ movw %sp, %di
+ movw $11, %cx
+ rep; movsd
+
+ /* Pop full state from the stack */
+ popal
+ popw %gs
+ popw %fs
+ popw %es
+ popw %ds
+ popfl
+
+ /* Actual INT */
+ .byte 0xcd /* INT opcode */
+3: .byte 0
+
+ /* Push full state to the stack */
+ pushfl
+ pushw %ds
+ pushw %es
+ pushw %fs
+ pushw %gs
+ pushal
+
+ /* Re-establish C environment invariants */
+ cld
+ movzwl %sp, %esp
+ movw %cs, %ax
+ movw %ax, %ds
+ movw %ax, %es
+
+ /* Copy output state from stack frame */
+ movw 68(%esp), %di /* Original %cx == 3rd argument */
+ andw %di, %di
+ jz 4f
+ movw %sp, %si
+ movw $11, %cx
+ rep; movsd
+4: addw $44, %sp
+
+ /* Restore state and return */
+ popal
+ popw %gs
+ popw %fs
+ popfl
+ retl
+ .size intcall, .-intcall
diff --git a/arch/x86/boot/bitops.h b/arch/x86/boot/bitops.h
new file mode 100644
index 000000000..2e1382486
--- /dev/null
+++ b/arch/x86/boot/bitops.h
@@ -0,0 +1,46 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Very simple bitops for the boot code.
+ */
+
+#ifndef BOOT_BITOPS_H
+#define BOOT_BITOPS_H
+#define _LINUX_BITOPS_H /* Inhibit inclusion of <linux/bitops.h> */
+
+#include <linux/types.h>
+#include <asm/asm.h>
+
+static inline bool constant_test_bit(int nr, const void *addr)
+{
+ const u32 *p = (const u32 *)addr;
+ return ((1UL << (nr & 31)) & (p[nr >> 5])) != 0;
+}
+static inline bool variable_test_bit(int nr, const void *addr)
+{
+ bool v;
+ const u32 *p = (const u32 *)addr;
+
+ asm("btl %2,%1" CC_SET(c) : CC_OUT(c) (v) : "m" (*p), "Ir" (nr));
+ return v;
+}
+
+#define test_bit(nr,addr) \
+(__builtin_constant_p(nr) ? \
+ constant_test_bit((nr),(addr)) : \
+ variable_test_bit((nr),(addr)))
+
+static inline void set_bit(int nr, void *addr)
+{
+ asm("btsl %1,%0" : "+m" (*(u32 *)addr) : "Ir" (nr));
+}
+
+#endif /* BOOT_BITOPS_H */
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
new file mode 100644
index 000000000..ef5a9cc66
--- /dev/null
+++ b/arch/x86/boot/boot.h
@@ -0,0 +1,358 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Header file for the real-mode kernel code
+ */
+
+#ifndef BOOT_BOOT_H
+#define BOOT_BOOT_H
+
+#define STACK_SIZE 1024 /* Minimum number of bytes for stack */
+
+#ifndef __ASSEMBLY__
+
+#include <stdarg.h>
+#include <linux/types.h>
+#include <linux/edd.h>
+#include <asm/setup.h>
+#include <asm/asm.h>
+#include "bitops.h"
+#include "ctype.h"
+#include "cpuflags.h"
+
+/* Useful macros */
+#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+extern struct setup_header hdr;
+extern struct boot_params boot_params;
+
+#define cpu_relax() asm volatile("rep; nop")
+
+/* Basic port I/O */
+static inline void outb(u8 v, u16 port)
+{
+ asm volatile("outb %0,%1" : : "a" (v), "dN" (port));
+}
+static inline u8 inb(u16 port)
+{
+ u8 v;
+ asm volatile("inb %1,%0" : "=a" (v) : "dN" (port));
+ return v;
+}
+
+static inline void outw(u16 v, u16 port)
+{
+ asm volatile("outw %0,%1" : : "a" (v), "dN" (port));
+}
+static inline u16 inw(u16 port)
+{
+ u16 v;
+ asm volatile("inw %1,%0" : "=a" (v) : "dN" (port));
+ return v;
+}
+
+static inline void outl(u32 v, u16 port)
+{
+ asm volatile("outl %0,%1" : : "a" (v), "dN" (port));
+}
+static inline u32 inl(u16 port)
+{
+ u32 v;
+ asm volatile("inl %1,%0" : "=a" (v) : "dN" (port));
+ return v;
+}
+
+static inline void io_delay(void)
+{
+ const u16 DELAY_PORT = 0x80;
+ asm volatile("outb %%al,%0" : : "dN" (DELAY_PORT));
+}
+
+/* These functions are used to reference data in other segments. */
+
+static inline u16 ds(void)
+{
+ u16 seg;
+ asm("movw %%ds,%0" : "=rm" (seg));
+ return seg;
+}
+
+static inline void set_fs(u16 seg)
+{
+ asm volatile("movw %0,%%fs" : : "rm" (seg));
+}
+static inline u16 fs(void)
+{
+ u16 seg;
+ asm volatile("movw %%fs,%0" : "=rm" (seg));
+ return seg;
+}
+
+static inline void set_gs(u16 seg)
+{
+ asm volatile("movw %0,%%gs" : : "rm" (seg));
+}
+static inline u16 gs(void)
+{
+ u16 seg;
+ asm volatile("movw %%gs,%0" : "=rm" (seg));
+ return seg;
+}
+
+typedef unsigned int addr_t;
+
+static inline u8 rdfs8(addr_t addr)
+{
+ u8 v;
+ asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr));
+ return v;
+}
+static inline u16 rdfs16(addr_t addr)
+{
+ u16 v;
+ asm volatile("movw %%fs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr));
+ return v;
+}
+static inline u32 rdfs32(addr_t addr)
+{
+ u32 v;
+ asm volatile("movl %%fs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr));
+ return v;
+}
+
+static inline void wrfs8(u8 v, addr_t addr)
+{
+ asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "qi" (v));
+}
+static inline void wrfs16(u16 v, addr_t addr)
+{
+ asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "ri" (v));
+}
+static inline void wrfs32(u32 v, addr_t addr)
+{
+ asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "ri" (v));
+}
+
+static inline u8 rdgs8(addr_t addr)
+{
+ u8 v;
+ asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr));
+ return v;
+}
+static inline u16 rdgs16(addr_t addr)
+{
+ u16 v;
+ asm volatile("movw %%gs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr));
+ return v;
+}
+static inline u32 rdgs32(addr_t addr)
+{
+ u32 v;
+ asm volatile("movl %%gs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr));
+ return v;
+}
+
+static inline void wrgs8(u8 v, addr_t addr)
+{
+ asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "qi" (v));
+}
+static inline void wrgs16(u16 v, addr_t addr)
+{
+ asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "ri" (v));
+}
+static inline void wrgs32(u32 v, addr_t addr)
+{
+ asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "ri" (v));
+}
+
+/* Note: these only return true/false, not a signed return value! */
+static inline bool memcmp_fs(const void *s1, addr_t s2, size_t len)
+{
+ bool diff;
+ asm volatile("fs; repe; cmpsb" CC_SET(nz)
+ : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len));
+ return diff;
+}
+static inline bool memcmp_gs(const void *s1, addr_t s2, size_t len)
+{
+ bool diff;
+ asm volatile("gs; repe; cmpsb" CC_SET(nz)
+ : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len));
+ return diff;
+}
+
+/* Heap -- available for dynamic lists. */
+extern char _end[];
+extern char *HEAP;
+extern char *heap_end;
+#define RESET_HEAP() ((void *)( HEAP = _end ))
+static inline char *__get_heap(size_t s, size_t a, size_t n)
+{
+ char *tmp;
+
+ HEAP = (char *)(((size_t)HEAP+(a-1)) & ~(a-1));
+ tmp = HEAP;
+ HEAP += s*n;
+ return tmp;
+}
+#define GET_HEAP(type, n) \
+ ((type *)__get_heap(sizeof(type),__alignof__(type),(n)))
+
+static inline bool heap_free(size_t n)
+{
+ return (int)(heap_end-HEAP) >= (int)n;
+}
+
+/* copy.S */
+
+void copy_to_fs(addr_t dst, void *src, size_t len);
+void *copy_from_fs(void *dst, addr_t src, size_t len);
+void copy_to_gs(addr_t dst, void *src, size_t len);
+void *copy_from_gs(void *dst, addr_t src, size_t len);
+
+/* a20.c */
+int enable_a20(void);
+
+/* apm.c */
+int query_apm_bios(void);
+
+/* bioscall.c */
+struct biosregs {
+ union {
+ struct {
+ u32 edi;
+ u32 esi;
+ u32 ebp;
+ u32 _esp;
+ u32 ebx;
+ u32 edx;
+ u32 ecx;
+ u32 eax;
+ u32 _fsgs;
+ u32 _dses;
+ u32 eflags;
+ };
+ struct {
+ u16 di, hdi;
+ u16 si, hsi;
+ u16 bp, hbp;
+ u16 _sp, _hsp;
+ u16 bx, hbx;
+ u16 dx, hdx;
+ u16 cx, hcx;
+ u16 ax, hax;
+ u16 gs, fs;
+ u16 es, ds;
+ u16 flags, hflags;
+ };
+ struct {
+ u8 dil, dih, edi2, edi3;
+ u8 sil, sih, esi2, esi3;
+ u8 bpl, bph, ebp2, ebp3;
+ u8 _spl, _sph, _esp2, _esp3;
+ u8 bl, bh, ebx2, ebx3;
+ u8 dl, dh, edx2, edx3;
+ u8 cl, ch, ecx2, ecx3;
+ u8 al, ah, eax2, eax3;
+ };
+ };
+};
+void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
+
+/* cmdline.c */
+int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *buffer, int bufsize);
+int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option);
+static inline int cmdline_find_option(const char *option, char *buffer, int bufsize)
+{
+ unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+ if (cmd_line_ptr >= 0x100000)
+ return -1; /* inaccessible */
+
+ return __cmdline_find_option(cmd_line_ptr, option, buffer, bufsize);
+}
+
+static inline int cmdline_find_option_bool(const char *option)
+{
+ unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+ if (cmd_line_ptr >= 0x100000)
+ return -1; /* inaccessible */
+
+ return __cmdline_find_option_bool(cmd_line_ptr, option);
+}
+
+/* cpu.c, cpucheck.c */
+int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
+int check_knl_erratum(void);
+int validate_cpu(void);
+
+/* early_serial_console.c */
+extern int early_serial_base;
+void console_init(void);
+
+/* edd.c */
+void query_edd(void);
+
+/* header.S */
+void __attribute__((noreturn)) die(void);
+
+/* memory.c */
+int detect_memory(void);
+
+/* pm.c */
+void __attribute__((noreturn)) go_to_protected_mode(void);
+
+/* pmjump.S */
+void __attribute__((noreturn))
+ protected_mode_jump(u32 entrypoint, u32 bootparams);
+
+/* printf.c */
+int sprintf(char *buf, const char *fmt, ...);
+int vsprintf(char *buf, const char *fmt, va_list args);
+int printf(const char *fmt, ...);
+
+/* regs.c */
+void initregs(struct biosregs *regs);
+
+/* string.c */
+int strcmp(const char *str1, const char *str2);
+int strncmp(const char *cs, const char *ct, size_t count);
+size_t strnlen(const char *s, size_t maxlen);
+unsigned int atou(const char *s);
+unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base);
+size_t strlen(const char *s);
+char *strchr(const char *s, int c);
+
+/* tty.c */
+void puts(const char *);
+void putchar(int);
+int getchar(void);
+void kbd_flush(void);
+int getchar_timeout(void);
+
+/* video.c */
+void set_video(void);
+
+/* video-mode.c */
+int set_mode(u16 mode);
+int mode_defined(u16 mode);
+void probe_cards(int unsafe);
+
+/* video-vesa.c */
+void vesa_store_edid(void);
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* BOOT_BOOT_H */
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c
new file mode 100644
index 000000000..625d21b0c
--- /dev/null
+++ b/arch/x86/boot/cmdline.c
@@ -0,0 +1,158 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Simple command-line parser for early boot.
+ */
+
+#include "boot.h"
+
+static inline int myisspace(u8 c)
+{
+ return c <= ' '; /* Close enough approximation */
+}
+
+/*
+ * Find a non-boolean option, that is, "option=argument". In accordance
+ * with standard Linux practice, if this option is repeated, this returns
+ * the last instance on the command line.
+ *
+ * Returns the length of the argument (regardless of if it was
+ * truncated to fit in the buffer), or -1 on not found.
+ */
+int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char *buffer, int bufsize)
+{
+ addr_t cptr;
+ char c;
+ int len = -1;
+ const char *opptr = NULL;
+ char *bufptr = buffer;
+ enum {
+ st_wordstart, /* Start of word/after whitespace */
+ st_wordcmp, /* Comparing this word */
+ st_wordskip, /* Miscompare, skip */
+ st_bufcpy /* Copying this to buffer */
+ } state = st_wordstart;
+
+ if (!cmdline_ptr)
+ return -1; /* No command line */
+
+ cptr = cmdline_ptr & 0xf;
+ set_fs(cmdline_ptr >> 4);
+
+ while (cptr < 0x10000 && (c = rdfs8(cptr++))) {
+ switch (state) {
+ case st_wordstart:
+ if (myisspace(c))
+ break;
+
+ /* else */
+ state = st_wordcmp;
+ opptr = option;
+ /* fall through */
+
+ case st_wordcmp:
+ if (c == '=' && !*opptr) {
+ len = 0;
+ bufptr = buffer;
+ state = st_bufcpy;
+ } else if (myisspace(c)) {
+ state = st_wordstart;
+ } else if (c != *opptr++) {
+ state = st_wordskip;
+ }
+ break;
+
+ case st_wordskip:
+ if (myisspace(c))
+ state = st_wordstart;
+ break;
+
+ case st_bufcpy:
+ if (myisspace(c)) {
+ state = st_wordstart;
+ } else {
+ if (len < bufsize-1)
+ *bufptr++ = c;
+ len++;
+ }
+ break;
+ }
+ }
+
+ if (bufsize)
+ *bufptr = '\0';
+
+ return len;
+}
+
+/*
+ * Find a boolean option (like quiet,noapic,nosmp....)
+ *
+ * Returns the position of that option (starts counting with 1)
+ * or 0 on not found
+ */
+int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option)
+{
+ addr_t cptr;
+ char c;
+ int pos = 0, wstart = 0;
+ const char *opptr = NULL;
+ enum {
+ st_wordstart, /* Start of word/after whitespace */
+ st_wordcmp, /* Comparing this word */
+ st_wordskip, /* Miscompare, skip */
+ } state = st_wordstart;
+
+ if (!cmdline_ptr)
+ return -1; /* No command line */
+
+ cptr = cmdline_ptr & 0xf;
+ set_fs(cmdline_ptr >> 4);
+
+ while (cptr < 0x10000) {
+ c = rdfs8(cptr++);
+ pos++;
+
+ switch (state) {
+ case st_wordstart:
+ if (!c)
+ return 0;
+ else if (myisspace(c))
+ break;
+
+ state = st_wordcmp;
+ opptr = option;
+ wstart = pos;
+ /* fall through */
+
+ case st_wordcmp:
+ if (!*opptr)
+ if (!c || myisspace(c))
+ return wstart;
+ else
+ state = st_wordskip;
+ else if (!c)
+ return 0;
+ else if (c != *opptr++)
+ state = st_wordskip;
+ break;
+
+ case st_wordskip:
+ if (!c)
+ return 0;
+ else if (myisspace(c))
+ state = st_wordstart;
+ break;
+ }
+ }
+
+ return 0; /* Buffer overrun */
+}
diff --git a/arch/x86/boot/code16gcc.h b/arch/x86/boot/code16gcc.h
new file mode 100644
index 000000000..e19fd7536
--- /dev/null
+++ b/arch/x86/boot/code16gcc.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#
+# code16gcc.h
+#
+# This file is added to the assembler via -Wa when compiling 16-bit C code.
+# This is done this way instead via asm() to make sure gcc does not reorder
+# things around us.
+#
+# gcc 4.9+ has a real -m16 option so we can drop this hack long term.
+#
+
+ .code16gcc
diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore
new file mode 100644
index 000000000..4a46fab71
--- /dev/null
+++ b/arch/x86/boot/compressed/.gitignore
@@ -0,0 +1,6 @@
+relocs
+vmlinux.bin.all
+vmlinux.relocs
+vmlinux.lds
+mkpiggy
+piggy.S
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
new file mode 100644
index 000000000..5642f025b
--- /dev/null
+++ b/arch/x86/boot/compressed/Makefile
@@ -0,0 +1,160 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# linux/arch/x86/boot/compressed/Makefile
+#
+# create a compressed vmlinux image from the original vmlinux
+#
+# vmlinuz is:
+# decompression code (*.o)
+# asm globals (piggy.S), including:
+# vmlinux.bin.(gz|bz2|lzma|...)
+#
+# vmlinux.bin is:
+# vmlinux stripped of debugging and comments
+# vmlinux.bin.all is:
+# vmlinux.bin + vmlinux.relocs
+# vmlinux.bin.(gz|bz2|lzma|...) is:
+# (see scripts/Makefile.lib size_append)
+# compressed vmlinux.bin.all + u32 size of vmlinux.bin.all
+
+KASAN_SANITIZE := n
+OBJECT_FILES_NON_STANDARD := y
+
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT := n
+
+targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
+ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4
+
+KBUILD_CFLAGS := -m$(BITS) -O2
+KBUILD_CFLAGS += -fno-strict-aliasing $(call cc-option, -fPIE, -fPIC)
+KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
+cflags-$(CONFIG_X86_32) := -march=i386
+cflags-$(CONFIG_X86_64) := -mcmodel=small
+KBUILD_CFLAGS += $(cflags-y)
+KBUILD_CFLAGS += -mno-mmx -mno-sse
+KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
+KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
+KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
+KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
+KBUILD_CFLAGS += -Wno-pointer-sign
+# Disable relocation relaxation in case the link is not PIE.
+KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no)
+
+KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
+GCOV_PROFILE := n
+UBSAN_SANITIZE :=n
+
+KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE)
+# Compressed kernel should be built as PIE since it may be loaded at any
+# address by the bootloader.
+ifeq ($(CONFIG_X86_32),y)
+KBUILD_LDFLAGS += $(call ld-option, -pie) $(call ld-option, --no-dynamic-linker)
+else
+# To build 64-bit compressed kernel as PIE, we disable relocation
+# overflow check to avoid relocation overflow error with a new linker
+# command-line option, -z noreloc-overflow.
+KBUILD_LDFLAGS += $(shell $(LD) --help 2>&1 | grep -q "\-z noreloc-overflow" \
+ && echo "-z noreloc-overflow -pie --no-dynamic-linker")
+endif
+LDFLAGS_vmlinux := -T
+
+hostprogs-y := mkpiggy
+HOST_EXTRACFLAGS += -I$(srctree)/tools/include
+
+sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p'
+
+quiet_cmd_voffset = VOFFSET $@
+ cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
+
+targets += ../voffset.h
+
+$(obj)/../voffset.h: vmlinux FORCE
+ $(call if_changed,voffset)
+
+$(obj)/misc.o: $(obj)/../voffset.h
+
+vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
+ $(obj)/string.o $(obj)/cmdline.o $(obj)/error.o \
+ $(obj)/piggy.o $(obj)/cpuflags.o
+
+vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o
+vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o
+ifdef CONFIG_X86_64
+ vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr_64.o
+ vmlinux-objs-y += $(obj)/mem_encrypt.o
+ vmlinux-objs-y += $(obj)/pgtable_64.o
+endif
+
+$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
+
+vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \
+ $(objtree)/drivers/firmware/efi/libstub/lib.a
+vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o
+
+# The compressed kernel is built with -fPIC/-fPIE so that a boot loader
+# can place it anywhere in memory and it will still run. However, since
+# it is executed as-is without any ELF relocation processing performed
+# (and has already had all relocation sections stripped from the binary),
+# none of the code can use data relocations (e.g. static assignments of
+# pointer values), since they will be meaningless at runtime. This check
+# will refuse to link the vmlinux if any of these relocations are found.
+quiet_cmd_check_data_rel = DATAREL $@
+define cmd_check_data_rel
+ for obj in $(filter %.o,$^); do \
+ $(READELF) -S $$obj | grep -qF .rel.local && { \
+ echo "error: $$obj has data relocations!" >&2; \
+ exit 1; \
+ } || true; \
+ done
+endef
+
+# We need to run two commands under "if_changed", so merge them into a
+# single invocation.
+quiet_cmd_check-and-link-vmlinux = LD $@
+ cmd_check-and-link-vmlinux = $(cmd_check_data_rel); $(cmd_ld)
+
+$(obj)/vmlinux: $(vmlinux-objs-y) FORCE
+ $(call if_changed,check-and-link-vmlinux)
+
+OBJCOPYFLAGS_vmlinux.bin := -R .comment -S
+$(obj)/vmlinux.bin: vmlinux FORCE
+ $(call if_changed,objcopy)
+
+targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs
+
+CMD_RELOCS = arch/x86/tools/relocs
+quiet_cmd_relocs = RELOCS $@
+ cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $<
+$(obj)/vmlinux.relocs: vmlinux FORCE
+ $(call if_changed,relocs)
+
+vmlinux.bin.all-y := $(obj)/vmlinux.bin
+vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
+
+$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
+ $(call if_changed,gzip)
+$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
+ $(call if_changed,bzip2)
+$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
+ $(call if_changed,lzma)
+$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE
+ $(call if_changed,xzkern)
+$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
+ $(call if_changed,lzo)
+$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE
+ $(call if_changed,lz4)
+
+suffix-$(CONFIG_KERNEL_GZIP) := gz
+suffix-$(CONFIG_KERNEL_BZIP2) := bz2
+suffix-$(CONFIG_KERNEL_LZMA) := lzma
+suffix-$(CONFIG_KERNEL_XZ) := xz
+suffix-$(CONFIG_KERNEL_LZO) := lzo
+suffix-$(CONFIG_KERNEL_LZ4) := lz4
+
+quiet_cmd_mkpiggy = MKPIGGY $@
+ cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
+
+targets += piggy.S
+$(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE
+ $(call if_changed,mkpiggy)
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
new file mode 100644
index 000000000..af6cda0b7
--- /dev/null
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "misc.h"
+
+#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE || CONFIG_X86_5LEVEL
+
+static unsigned long fs;
+static inline void set_fs(unsigned long seg)
+{
+ fs = seg << 4; /* shift it back */
+}
+typedef unsigned long addr_t;
+static inline char rdfs8(addr_t addr)
+{
+ return *((char *)(fs + addr));
+}
+#include "../cmdline.c"
+unsigned long get_cmd_line_ptr(void)
+{
+ unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr;
+
+ cmd_line_ptr |= (u64)boot_params->ext_cmd_line_ptr << 32;
+
+ return cmd_line_ptr;
+}
+int cmdline_find_option(const char *option, char *buffer, int bufsize)
+{
+ return __cmdline_find_option(get_cmd_line_ptr(), option, buffer, bufsize);
+}
+int cmdline_find_option_bool(const char *option)
+{
+ return __cmdline_find_option_bool(get_cmd_line_ptr(), option);
+}
+
+#endif
diff --git a/arch/x86/boot/compressed/cpuflags.c b/arch/x86/boot/compressed/cpuflags.c
new file mode 100644
index 000000000..6448a8196
--- /dev/null
+++ b/arch/x86/boot/compressed/cpuflags.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifdef CONFIG_RANDOMIZE_BASE
+
+#include "../cpuflags.c"
+
+bool has_cpuflag(int flag)
+{
+ get_cpuflags();
+
+ return test_bit(flag, cpu.flags);
+}
+
+#endif
diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c
new file mode 100644
index 000000000..261e81fb9
--- /dev/null
+++ b/arch/x86/boot/compressed/early_serial_console.c
@@ -0,0 +1,5 @@
+#include "misc.h"
+
+int early_serial_base;
+
+#include "../early_serial_console.c"
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
new file mode 100644
index 000000000..544ac4faf
--- /dev/null
+++ b/arch/x86/boot/compressed/eboot.c
@@ -0,0 +1,934 @@
+
+/* -----------------------------------------------------------------------
+ *
+ * Copyright 2011 Intel Corporation; author Matt Fleming
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+#include <linux/efi.h>
+#include <linux/pci.h>
+
+#include <asm/efi.h>
+#include <asm/e820/types.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+
+#include "../string.h"
+#include "eboot.h"
+
+static efi_system_table_t *sys_table;
+
+static struct efi_config *efi_early;
+
+__pure const struct efi_config *__efi_early(void)
+{
+ return efi_early;
+}
+
+#define BOOT_SERVICES(bits) \
+static void setup_boot_services##bits(struct efi_config *c) \
+{ \
+ efi_system_table_##bits##_t *table; \
+ \
+ table = (typeof(table))sys_table; \
+ \
+ c->runtime_services = table->runtime; \
+ c->boot_services = table->boottime; \
+ c->text_output = table->con_out; \
+}
+BOOT_SERVICES(32);
+BOOT_SERVICES(64);
+
+void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
+{
+ efi_call_proto(efi_simple_text_output_protocol, output_string,
+ efi_early->text_output, str);
+}
+
+static efi_status_t
+preserve_pci_rom_image(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom)
+{
+ struct pci_setup_rom *rom = NULL;
+ efi_status_t status;
+ unsigned long size;
+ uint64_t romsize;
+ void *romimage;
+
+ /*
+ * Some firmware images contain EFI function pointers at the place where
+ * the romimage and romsize fields are supposed to be. Typically the EFI
+ * code is mapped at high addresses, translating to an unrealistically
+ * large romsize. The UEFI spec limits the size of option ROMs to 16
+ * MiB so we reject any ROMs over 16 MiB in size to catch this.
+ */
+ romimage = (void *)(unsigned long)efi_table_attr(efi_pci_io_protocol,
+ romimage, pci);
+ romsize = efi_table_attr(efi_pci_io_protocol, romsize, pci);
+ if (!romimage || !romsize || romsize > SZ_16M)
+ return EFI_INVALID_PARAMETER;
+
+ size = romsize + sizeof(*rom);
+
+ status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom);
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to allocate memory for 'rom'\n");
+ return status;
+ }
+
+ memset(rom, 0, sizeof(*rom));
+
+ rom->data.type = SETUP_PCI;
+ rom->data.len = size - sizeof(struct setup_data);
+ rom->data.next = 0;
+ rom->pcilen = pci->romsize;
+ *__rom = rom;
+
+ status = efi_call_proto(efi_pci_io_protocol, pci.read, pci,
+ EfiPciIoWidthUint16, PCI_VENDOR_ID, 1,
+ &rom->vendor);
+
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to read rom->vendor\n");
+ goto free_struct;
+ }
+
+ status = efi_call_proto(efi_pci_io_protocol, pci.read, pci,
+ EfiPciIoWidthUint16, PCI_DEVICE_ID, 1,
+ &rom->devid);
+
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to read rom->devid\n");
+ goto free_struct;
+ }
+
+ status = efi_call_proto(efi_pci_io_protocol, get_location, pci,
+ &rom->segment, &rom->bus, &rom->device,
+ &rom->function);
+
+ if (status != EFI_SUCCESS)
+ goto free_struct;
+
+ memcpy(rom->romdata, romimage, romsize);
+ return status;
+
+free_struct:
+ efi_call_early(free_pool, rom);
+ return status;
+}
+
+/*
+ * There's no way to return an informative status from this function,
+ * because any analysis (and printing of error messages) needs to be
+ * done directly at the EFI function call-site.
+ *
+ * For example, EFI_INVALID_PARAMETER could indicate a bug or maybe we
+ * just didn't find any PCI devices, but there's no way to tell outside
+ * the context of the call.
+ */
+static void setup_efi_pci(struct boot_params *params)
+{
+ efi_status_t status;
+ void **pci_handle = NULL;
+ efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID;
+ unsigned long size = 0;
+ unsigned long nr_pci;
+ struct setup_data *data;
+ int i;
+
+ status = efi_call_early(locate_handle,
+ EFI_LOCATE_BY_PROTOCOL,
+ &pci_proto, NULL, &size, pci_handle);
+
+ if (status == EFI_BUFFER_TOO_SMALL) {
+ status = efi_call_early(allocate_pool,
+ EFI_LOADER_DATA,
+ size, (void **)&pci_handle);
+
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to allocate memory for 'pci_handle'\n");
+ return;
+ }
+
+ status = efi_call_early(locate_handle,
+ EFI_LOCATE_BY_PROTOCOL, &pci_proto,
+ NULL, &size, pci_handle);
+ }
+
+ if (status != EFI_SUCCESS)
+ goto free_handle;
+
+ data = (struct setup_data *)(unsigned long)params->hdr.setup_data;
+
+ while (data && data->next)
+ data = (struct setup_data *)(unsigned long)data->next;
+
+ nr_pci = size / (efi_is_64bit() ? sizeof(u64) : sizeof(u32));
+ for (i = 0; i < nr_pci; i++) {
+ efi_pci_io_protocol_t *pci = NULL;
+ struct pci_setup_rom *rom;
+
+ status = efi_call_early(handle_protocol,
+ efi_is_64bit() ? ((u64 *)pci_handle)[i]
+ : ((u32 *)pci_handle)[i],
+ &pci_proto, (void **)&pci);
+ if (status != EFI_SUCCESS || !pci)
+ continue;
+
+ status = preserve_pci_rom_image(pci, &rom);
+ if (status != EFI_SUCCESS)
+ continue;
+
+ if (data)
+ data->next = (unsigned long)rom;
+ else
+ params->hdr.setup_data = (unsigned long)rom;
+
+ data = (struct setup_data *)rom;
+ }
+
+free_handle:
+ efi_call_early(free_pool, pci_handle);
+}
+
+static void retrieve_apple_device_properties(struct boot_params *boot_params)
+{
+ efi_guid_t guid = APPLE_PROPERTIES_PROTOCOL_GUID;
+ struct setup_data *data, *new;
+ efi_status_t status;
+ u32 size = 0;
+ void *p;
+
+ status = efi_call_early(locate_protocol, &guid, NULL, &p);
+ if (status != EFI_SUCCESS)
+ return;
+
+ if (efi_table_attr(apple_properties_protocol, version, p) != 0x10000) {
+ efi_printk(sys_table, "Unsupported properties proto version\n");
+ return;
+ }
+
+ efi_call_proto(apple_properties_protocol, get_all, p, NULL, &size);
+ if (!size)
+ return;
+
+ do {
+ status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
+ size + sizeof(struct setup_data), &new);
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to allocate memory for 'properties'\n");
+ return;
+ }
+
+ status = efi_call_proto(apple_properties_protocol, get_all, p,
+ new->data, &size);
+
+ if (status == EFI_BUFFER_TOO_SMALL)
+ efi_call_early(free_pool, new);
+ } while (status == EFI_BUFFER_TOO_SMALL);
+
+ new->type = SETUP_APPLE_PROPERTIES;
+ new->len = size;
+ new->next = 0;
+
+ data = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data;
+ if (!data) {
+ boot_params->hdr.setup_data = (unsigned long)new;
+ } else {
+ while (data->next)
+ data = (struct setup_data *)(unsigned long)data->next;
+ data->next = (unsigned long)new;
+ }
+}
+
+static const efi_char16_t apple[] = L"Apple";
+
+static void setup_quirks(struct boot_params *boot_params)
+{
+ efi_char16_t *fw_vendor = (efi_char16_t *)(unsigned long)
+ efi_table_attr(efi_system_table, fw_vendor, sys_table);
+
+ if (!memcmp(fw_vendor, apple, sizeof(apple))) {
+ if (IS_ENABLED(CONFIG_APPLE_PROPERTIES))
+ retrieve_apple_device_properties(boot_params);
+ }
+}
+
+/*
+ * See if we have Universal Graphics Adapter (UGA) protocol
+ */
+static efi_status_t
+setup_uga(struct screen_info *si, efi_guid_t *uga_proto, unsigned long size)
+{
+ efi_status_t status;
+ u32 width, height;
+ void **uga_handle = NULL;
+ efi_uga_draw_protocol_t *uga = NULL, *first_uga;
+ unsigned long nr_ugas;
+ int i;
+
+ status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
+ size, (void **)&uga_handle);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ status = efi_call_early(locate_handle,
+ EFI_LOCATE_BY_PROTOCOL,
+ uga_proto, NULL, &size, uga_handle);
+ if (status != EFI_SUCCESS)
+ goto free_handle;
+
+ height = 0;
+ width = 0;
+
+ first_uga = NULL;
+ nr_ugas = size / (efi_is_64bit() ? sizeof(u64) : sizeof(u32));
+ for (i = 0; i < nr_ugas; i++) {
+ efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
+ u32 w, h, depth, refresh;
+ void *pciio;
+ unsigned long handle = efi_is_64bit() ? ((u64 *)uga_handle)[i]
+ : ((u32 *)uga_handle)[i];
+
+ status = efi_call_early(handle_protocol, handle,
+ uga_proto, (void **)&uga);
+ if (status != EFI_SUCCESS)
+ continue;
+
+ pciio = NULL;
+ efi_call_early(handle_protocol, handle, &pciio_proto, &pciio);
+
+ status = efi_call_proto(efi_uga_draw_protocol, get_mode, uga,
+ &w, &h, &depth, &refresh);
+ if (status == EFI_SUCCESS && (!first_uga || pciio)) {
+ width = w;
+ height = h;
+
+ /*
+ * Once we've found a UGA supporting PCIIO,
+ * don't bother looking any further.
+ */
+ if (pciio)
+ break;
+
+ first_uga = uga;
+ }
+ }
+
+ if (!width && !height)
+ goto free_handle;
+
+ /* EFI framebuffer */
+ si->orig_video_isVGA = VIDEO_TYPE_EFI;
+
+ si->lfb_depth = 32;
+ si->lfb_width = width;
+ si->lfb_height = height;
+
+ si->red_size = 8;
+ si->red_pos = 16;
+ si->green_size = 8;
+ si->green_pos = 8;
+ si->blue_size = 8;
+ si->blue_pos = 0;
+ si->rsvd_size = 8;
+ si->rsvd_pos = 24;
+
+free_handle:
+ efi_call_early(free_pool, uga_handle);
+
+ return status;
+}
+
+void setup_graphics(struct boot_params *boot_params)
+{
+ efi_guid_t graphics_proto = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID;
+ struct screen_info *si;
+ efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID;
+ efi_status_t status;
+ unsigned long size;
+ void **gop_handle = NULL;
+ void **uga_handle = NULL;
+
+ si = &boot_params->screen_info;
+ memset(si, 0, sizeof(*si));
+
+ size = 0;
+ status = efi_call_early(locate_handle,
+ EFI_LOCATE_BY_PROTOCOL,
+ &graphics_proto, NULL, &size, gop_handle);
+ if (status == EFI_BUFFER_TOO_SMALL)
+ status = efi_setup_gop(NULL, si, &graphics_proto, size);
+
+ if (status != EFI_SUCCESS) {
+ size = 0;
+ status = efi_call_early(locate_handle,
+ EFI_LOCATE_BY_PROTOCOL,
+ &uga_proto, NULL, &size, uga_handle);
+ if (status == EFI_BUFFER_TOO_SMALL)
+ setup_uga(si, &uga_proto, size);
+ }
+}
+
+/*
+ * Because the x86 boot code expects to be passed a boot_params we
+ * need to create one ourselves (usually the bootloader would create
+ * one for us).
+ *
+ * The caller is responsible for filling out ->code32_start in the
+ * returned boot_params.
+ */
+struct boot_params *make_boot_params(struct efi_config *c)
+{
+ struct boot_params *boot_params;
+ struct apm_bios_info *bi;
+ struct setup_header *hdr;
+ efi_loaded_image_t *image;
+ void *options, *handle;
+ efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
+ int options_size = 0;
+ efi_status_t status;
+ char *cmdline_ptr;
+ u16 *s2;
+ u8 *s1;
+ int i;
+ unsigned long ramdisk_addr;
+ unsigned long ramdisk_size;
+
+ efi_early = c;
+ sys_table = (efi_system_table_t *)(unsigned long)efi_early->table;
+ handle = (void *)(unsigned long)efi_early->image_handle;
+
+ /* Check if we were booted by the EFI firmware */
+ if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
+ return NULL;
+
+ if (efi_is_64bit())
+ setup_boot_services64(efi_early);
+ else
+ setup_boot_services32(efi_early);
+
+ status = efi_call_early(handle_protocol, handle,
+ &proto, (void *)&image);
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to get handle for LOADED_IMAGE_PROTOCOL\n");
+ return NULL;
+ }
+
+ status = efi_low_alloc(sys_table, 0x4000, 1,
+ (unsigned long *)&boot_params);
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to allocate lowmem for boot params\n");
+ return NULL;
+ }
+
+ memset(boot_params, 0x0, 0x4000);
+
+ hdr = &boot_params->hdr;
+ bi = &boot_params->apm_bios_info;
+
+ /* Copy the second sector to boot_params */
+ memcpy(&hdr->jump, image->image_base + 512, 512);
+
+ /*
+ * Fill out some of the header fields ourselves because the
+ * EFI firmware loader doesn't load the first sector.
+ */
+ hdr->root_flags = 1;
+ hdr->vid_mode = 0xffff;
+ hdr->boot_flag = 0xAA55;
+
+ hdr->type_of_loader = 0x21;
+
+ /* Convert unicode cmdline to ascii */
+ cmdline_ptr = efi_convert_cmdline(sys_table, image, &options_size);
+ if (!cmdline_ptr)
+ goto fail;
+
+ hdr->cmd_line_ptr = (unsigned long)cmdline_ptr;
+ /* Fill in upper bits of command line address, NOP on 32 bit */
+ boot_params->ext_cmd_line_ptr = (u64)(unsigned long)cmdline_ptr >> 32;
+
+ hdr->ramdisk_image = 0;
+ hdr->ramdisk_size = 0;
+
+ /* Clear APM BIOS info */
+ memset(bi, 0, sizeof(*bi));
+
+ status = efi_parse_options(cmdline_ptr);
+ if (status != EFI_SUCCESS)
+ goto fail2;
+
+ status = handle_cmdline_files(sys_table, image,
+ (char *)(unsigned long)hdr->cmd_line_ptr,
+ "initrd=", hdr->initrd_addr_max,
+ &ramdisk_addr, &ramdisk_size);
+
+ if (status != EFI_SUCCESS &&
+ hdr->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G) {
+ efi_printk(sys_table, "Trying to load files to higher address\n");
+ status = handle_cmdline_files(sys_table, image,
+ (char *)(unsigned long)hdr->cmd_line_ptr,
+ "initrd=", -1UL,
+ &ramdisk_addr, &ramdisk_size);
+ }
+
+ if (status != EFI_SUCCESS)
+ goto fail2;
+ hdr->ramdisk_image = ramdisk_addr & 0xffffffff;
+ hdr->ramdisk_size = ramdisk_size & 0xffffffff;
+ boot_params->ext_ramdisk_image = (u64)ramdisk_addr >> 32;
+ boot_params->ext_ramdisk_size = (u64)ramdisk_size >> 32;
+
+ return boot_params;
+
+fail2:
+ efi_free(sys_table, options_size, hdr->cmd_line_ptr);
+fail:
+ efi_free(sys_table, 0x4000, (unsigned long)boot_params);
+
+ return NULL;
+}
+
+static void add_e820ext(struct boot_params *params,
+ struct setup_data *e820ext, u32 nr_entries)
+{
+ struct setup_data *data;
+ efi_status_t status;
+ unsigned long size;
+
+ e820ext->type = SETUP_E820_EXT;
+ e820ext->len = nr_entries * sizeof(struct boot_e820_entry);
+ e820ext->next = 0;
+
+ data = (struct setup_data *)(unsigned long)params->hdr.setup_data;
+
+ while (data && data->next)
+ data = (struct setup_data *)(unsigned long)data->next;
+
+ if (data)
+ data->next = (unsigned long)e820ext;
+ else
+ params->hdr.setup_data = (unsigned long)e820ext;
+}
+
+static efi_status_t
+setup_e820(struct boot_params *params, struct setup_data *e820ext, u32 e820ext_size)
+{
+ struct boot_e820_entry *entry = params->e820_table;
+ struct efi_info *efi = &params->efi_info;
+ struct boot_e820_entry *prev = NULL;
+ u32 nr_entries;
+ u32 nr_desc;
+ int i;
+
+ nr_entries = 0;
+ nr_desc = efi->efi_memmap_size / efi->efi_memdesc_size;
+
+ for (i = 0; i < nr_desc; i++) {
+ efi_memory_desc_t *d;
+ unsigned int e820_type = 0;
+ unsigned long m = efi->efi_memmap;
+
+#ifdef CONFIG_X86_64
+ m |= (u64)efi->efi_memmap_hi << 32;
+#endif
+
+ d = efi_early_memdesc_ptr(m, efi->efi_memdesc_size, i);
+ switch (d->type) {
+ case EFI_RESERVED_TYPE:
+ case EFI_RUNTIME_SERVICES_CODE:
+ case EFI_RUNTIME_SERVICES_DATA:
+ case EFI_MEMORY_MAPPED_IO:
+ case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
+ case EFI_PAL_CODE:
+ e820_type = E820_TYPE_RESERVED;
+ break;
+
+ case EFI_UNUSABLE_MEMORY:
+ e820_type = E820_TYPE_UNUSABLE;
+ break;
+
+ case EFI_ACPI_RECLAIM_MEMORY:
+ e820_type = E820_TYPE_ACPI;
+ break;
+
+ case EFI_LOADER_CODE:
+ case EFI_LOADER_DATA:
+ case EFI_BOOT_SERVICES_CODE:
+ case EFI_BOOT_SERVICES_DATA:
+ case EFI_CONVENTIONAL_MEMORY:
+ e820_type = E820_TYPE_RAM;
+ break;
+
+ case EFI_ACPI_MEMORY_NVS:
+ e820_type = E820_TYPE_NVS;
+ break;
+
+ case EFI_PERSISTENT_MEMORY:
+ e820_type = E820_TYPE_PMEM;
+ break;
+
+ default:
+ continue;
+ }
+
+ /* Merge adjacent mappings */
+ if (prev && prev->type == e820_type &&
+ (prev->addr + prev->size) == d->phys_addr) {
+ prev->size += d->num_pages << 12;
+ continue;
+ }
+
+ if (nr_entries == ARRAY_SIZE(params->e820_table)) {
+ u32 need = (nr_desc - i) * sizeof(struct e820_entry) +
+ sizeof(struct setup_data);
+
+ if (!e820ext || e820ext_size < need)
+ return EFI_BUFFER_TOO_SMALL;
+
+ /* boot_params map full, switch to e820 extended */
+ entry = (struct boot_e820_entry *)e820ext->data;
+ }
+
+ entry->addr = d->phys_addr;
+ entry->size = d->num_pages << PAGE_SHIFT;
+ entry->type = e820_type;
+ prev = entry++;
+ nr_entries++;
+ }
+
+ if (nr_entries > ARRAY_SIZE(params->e820_table)) {
+ u32 nr_e820ext = nr_entries - ARRAY_SIZE(params->e820_table);
+
+ add_e820ext(params, e820ext, nr_e820ext);
+ nr_entries -= nr_e820ext;
+ }
+
+ params->e820_entries = (u8)nr_entries;
+
+ return EFI_SUCCESS;
+}
+
+static efi_status_t alloc_e820ext(u32 nr_desc, struct setup_data **e820ext,
+ u32 *e820ext_size)
+{
+ efi_status_t status;
+ unsigned long size;
+
+ size = sizeof(struct setup_data) +
+ sizeof(struct e820_entry) * nr_desc;
+
+ if (*e820ext) {
+ efi_call_early(free_pool, *e820ext);
+ *e820ext = NULL;
+ *e820ext_size = 0;
+ }
+
+ status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
+ size, (void **)e820ext);
+ if (status == EFI_SUCCESS)
+ *e820ext_size = size;
+
+ return status;
+}
+
+static efi_status_t allocate_e820(struct boot_params *params,
+ struct setup_data **e820ext,
+ u32 *e820ext_size)
+{
+ unsigned long map_size, desc_size, buff_size;
+ struct efi_boot_memmap boot_map;
+ efi_memory_desc_t *map;
+ efi_status_t status;
+ __u32 nr_desc;
+
+ boot_map.map = &map;
+ boot_map.map_size = &map_size;
+ boot_map.desc_size = &desc_size;
+ boot_map.desc_ver = NULL;
+ boot_map.key_ptr = NULL;
+ boot_map.buff_size = &buff_size;
+
+ status = efi_get_memory_map(sys_table, &boot_map);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ nr_desc = buff_size / desc_size;
+
+ if (nr_desc > ARRAY_SIZE(params->e820_table)) {
+ u32 nr_e820ext = nr_desc - ARRAY_SIZE(params->e820_table);
+
+ status = alloc_e820ext(nr_e820ext, e820ext, e820ext_size);
+ if (status != EFI_SUCCESS)
+ return status;
+ }
+
+ return EFI_SUCCESS;
+}
+
+struct exit_boot_struct {
+ struct boot_params *boot_params;
+ struct efi_info *efi;
+};
+
+static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg,
+ struct efi_boot_memmap *map,
+ void *priv)
+{
+ const char *signature;
+ __u32 nr_desc;
+ efi_status_t status;
+ struct exit_boot_struct *p = priv;
+
+ signature = efi_is_64bit() ? EFI64_LOADER_SIGNATURE
+ : EFI32_LOADER_SIGNATURE;
+ memcpy(&p->efi->efi_loader_signature, signature, sizeof(__u32));
+
+ p->efi->efi_systab = (unsigned long)sys_table_arg;
+ p->efi->efi_memdesc_size = *map->desc_size;
+ p->efi->efi_memdesc_version = *map->desc_ver;
+ p->efi->efi_memmap = (unsigned long)*map->map;
+ p->efi->efi_memmap_size = *map->map_size;
+
+#ifdef CONFIG_X86_64
+ p->efi->efi_systab_hi = (unsigned long)sys_table_arg >> 32;
+ p->efi->efi_memmap_hi = (unsigned long)*map->map >> 32;
+#endif
+
+ return EFI_SUCCESS;
+}
+
+static efi_status_t exit_boot(struct boot_params *boot_params, void *handle)
+{
+ unsigned long map_sz, key, desc_size, buff_size;
+ efi_memory_desc_t *mem_map;
+ struct setup_data *e820ext = NULL;
+ __u32 e820ext_size = 0;
+ efi_status_t status;
+ __u32 desc_version;
+ struct efi_boot_memmap map;
+ struct exit_boot_struct priv;
+
+ map.map = &mem_map;
+ map.map_size = &map_sz;
+ map.desc_size = &desc_size;
+ map.desc_ver = &desc_version;
+ map.key_ptr = &key;
+ map.buff_size = &buff_size;
+ priv.boot_params = boot_params;
+ priv.efi = &boot_params->efi_info;
+
+ status = allocate_e820(boot_params, &e820ext, &e820ext_size);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ /* Might as well exit boot services now */
+ status = efi_exit_boot_services(sys_table, handle, &map, &priv,
+ exit_boot_func);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ /* Historic? */
+ boot_params->alt_mem_k = 32 * 1024;
+
+ status = setup_e820(boot_params, e820ext, e820ext_size);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ return EFI_SUCCESS;
+}
+
+/*
+ * On success we return a pointer to a boot_params structure, and NULL
+ * on failure.
+ */
+struct boot_params *
+efi_main(struct efi_config *c, struct boot_params *boot_params)
+{
+ struct desc_ptr *gdt = NULL;
+ efi_loaded_image_t *image;
+ struct setup_header *hdr = &boot_params->hdr;
+ efi_status_t status;
+ struct desc_struct *desc;
+ void *handle;
+ efi_system_table_t *_table;
+ unsigned long cmdline_paddr;
+
+ efi_early = c;
+
+ _table = (efi_system_table_t *)(unsigned long)efi_early->table;
+ handle = (void *)(unsigned long)efi_early->image_handle;
+
+ sys_table = _table;
+
+ /* Check if we were booted by the EFI firmware */
+ if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
+ goto fail;
+
+ if (efi_is_64bit())
+ setup_boot_services64(efi_early);
+ else
+ setup_boot_services32(efi_early);
+
+ /*
+ * make_boot_params() may have been called before efi_main(), in which
+ * case this is the second time we parse the cmdline. This is ok,
+ * parsing the cmdline multiple times does not have side-effects.
+ */
+ cmdline_paddr = ((u64)hdr->cmd_line_ptr |
+ ((u64)boot_params->ext_cmd_line_ptr << 32));
+ efi_parse_options((char *)cmdline_paddr);
+
+ /*
+ * If the boot loader gave us a value for secure_boot then we use that,
+ * otherwise we ask the BIOS.
+ */
+ if (boot_params->secure_boot == efi_secureboot_mode_unset)
+ boot_params->secure_boot = efi_get_secureboot(sys_table);
+
+ /* Ask the firmware to clear memory on unclean shutdown */
+ efi_enable_reset_attack_mitigation(sys_table);
+ efi_retrieve_tpm2_eventlog(sys_table);
+
+ setup_graphics(boot_params);
+
+ setup_efi_pci(boot_params);
+
+ setup_quirks(boot_params);
+
+ status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
+ sizeof(*gdt), (void **)&gdt);
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to allocate memory for 'gdt' structure\n");
+ goto fail;
+ }
+
+ gdt->size = 0x800;
+ status = efi_low_alloc(sys_table, gdt->size, 8,
+ (unsigned long *)&gdt->address);
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to allocate memory for 'gdt'\n");
+ goto fail;
+ }
+
+ /*
+ * If the kernel isn't already loaded at the preferred load
+ * address, relocate it.
+ */
+ if (hdr->pref_address != hdr->code32_start) {
+ unsigned long bzimage_addr = hdr->code32_start;
+ status = efi_relocate_kernel(sys_table, &bzimage_addr,
+ hdr->init_size, hdr->init_size,
+ hdr->pref_address,
+ hdr->kernel_alignment);
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "efi_relocate_kernel() failed!\n");
+ goto fail;
+ }
+
+ hdr->pref_address = hdr->code32_start;
+ hdr->code32_start = bzimage_addr;
+ }
+
+ status = exit_boot(boot_params, handle);
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "exit_boot() failed!\n");
+ goto fail;
+ }
+
+ memset((char *)gdt->address, 0x0, gdt->size);
+ desc = (struct desc_struct *)gdt->address;
+
+ /* The first GDT is a dummy. */
+ desc++;
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /* __KERNEL32_CS */
+ desc->limit0 = 0xffff;
+ desc->base0 = 0x0000;
+ desc->base1 = 0x0000;
+ desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
+ desc->s = DESC_TYPE_CODE_DATA;
+ desc->dpl = 0;
+ desc->p = 1;
+ desc->limit1 = 0xf;
+ desc->avl = 0;
+ desc->l = 0;
+ desc->d = SEG_OP_SIZE_32BIT;
+ desc->g = SEG_GRANULARITY_4KB;
+ desc->base2 = 0x00;
+
+ desc++;
+ } else {
+ /* Second entry is unused on 32-bit */
+ desc++;
+ }
+
+ /* __KERNEL_CS */
+ desc->limit0 = 0xffff;
+ desc->base0 = 0x0000;
+ desc->base1 = 0x0000;
+ desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
+ desc->s = DESC_TYPE_CODE_DATA;
+ desc->dpl = 0;
+ desc->p = 1;
+ desc->limit1 = 0xf;
+ desc->avl = 0;
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ desc->l = 1;
+ desc->d = 0;
+ } else {
+ desc->l = 0;
+ desc->d = SEG_OP_SIZE_32BIT;
+ }
+ desc->g = SEG_GRANULARITY_4KB;
+ desc->base2 = 0x00;
+ desc++;
+
+ /* __KERNEL_DS */
+ desc->limit0 = 0xffff;
+ desc->base0 = 0x0000;
+ desc->base1 = 0x0000;
+ desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE;
+ desc->s = DESC_TYPE_CODE_DATA;
+ desc->dpl = 0;
+ desc->p = 1;
+ desc->limit1 = 0xf;
+ desc->avl = 0;
+ desc->l = 0;
+ desc->d = SEG_OP_SIZE_32BIT;
+ desc->g = SEG_GRANULARITY_4KB;
+ desc->base2 = 0x00;
+ desc++;
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /* Task segment value */
+ desc->limit0 = 0x0000;
+ desc->base0 = 0x0000;
+ desc->base1 = 0x0000;
+ desc->type = SEG_TYPE_TSS;
+ desc->s = 0;
+ desc->dpl = 0;
+ desc->p = 1;
+ desc->limit1 = 0x0;
+ desc->avl = 0;
+ desc->l = 0;
+ desc->d = 0;
+ desc->g = SEG_GRANULARITY_4KB;
+ desc->base2 = 0x00;
+ desc++;
+ }
+
+ asm volatile("cli");
+ asm volatile ("lgdt %0" : : "m" (*gdt));
+
+ return boot_params;
+fail:
+ efi_printk(sys_table, "efi_main() failed!\n");
+
+ return NULL;
+}
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
new file mode 100644
index 000000000..8297387c4
--- /dev/null
+++ b/arch/x86/boot/compressed/eboot.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_COMPRESSED_EBOOT_H
+#define BOOT_COMPRESSED_EBOOT_H
+
+#define SEG_TYPE_DATA (0 << 3)
+#define SEG_TYPE_READ_WRITE (1 << 1)
+#define SEG_TYPE_CODE (1 << 3)
+#define SEG_TYPE_EXEC_READ (1 << 1)
+#define SEG_TYPE_TSS ((1 << 3) | (1 << 0))
+#define SEG_OP_SIZE_32BIT (1 << 0)
+#define SEG_GRANULARITY_4KB (1 << 0)
+
+#define DESC_TYPE_CODE_DATA (1 << 0)
+
+typedef struct {
+ u32 get_mode;
+ u32 set_mode;
+ u32 blt;
+} efi_uga_draw_protocol_32_t;
+
+typedef struct {
+ u64 get_mode;
+ u64 set_mode;
+ u64 blt;
+} efi_uga_draw_protocol_64_t;
+
+typedef struct {
+ void *get_mode;
+ void *set_mode;
+ void *blt;
+} efi_uga_draw_protocol_t;
+
+#endif /* BOOT_COMPRESSED_EBOOT_H */
diff --git a/arch/x86/boot/compressed/efi_stub_32.S b/arch/x86/boot/compressed/efi_stub_32.S
new file mode 100644
index 000000000..257e341fd
--- /dev/null
+++ b/arch/x86/boot/compressed/efi_stub_32.S
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * EFI call stub for IA32.
+ *
+ * This stub allows us to make EFI calls in physical mode with interrupts
+ * turned off. Note that this implementation is different from the one in
+ * arch/x86/platform/efi/efi_stub_32.S because we're _already_ in physical
+ * mode at this point.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+
+/*
+ * efi_call_phys(void *, ...) is a function with variable parameters.
+ * All the callers of this function assure that all the parameters are 4-bytes.
+ */
+
+/*
+ * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save.
+ * So we'd better save all of them at the beginning of this function and restore
+ * at the end no matter how many we use, because we can not assure EFI runtime
+ * service functions will comply with gcc calling convention, too.
+ */
+
+.text
+ENTRY(efi_call_phys)
+ /*
+ * 0. The function can only be called in Linux kernel. So CS has been
+ * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found
+ * the values of these registers are the same. And, the corresponding
+ * GDT entries are identical. So I will do nothing about segment reg
+ * and GDT, but change GDT base register in prelog and epilog.
+ */
+
+ /*
+ * 1. Because we haven't been relocated by this point we need to
+ * use relative addressing.
+ */
+ call 1f
+1: popl %edx
+ subl $1b, %edx
+
+ /*
+ * 2. Now on the top of stack is the return
+ * address in the caller of efi_call_phys(), then parameter 1,
+ * parameter 2, ..., param n. To make things easy, we save the return
+ * address of efi_call_phys in a global variable.
+ */
+ popl %ecx
+ movl %ecx, saved_return_addr(%edx)
+ /* get the function pointer into ECX*/
+ popl %ecx
+ movl %ecx, efi_rt_function_ptr(%edx)
+
+ /*
+ * 3. Call the physical function.
+ */
+ call *%ecx
+
+ /*
+ * 4. Balance the stack. And because EAX contain the return value,
+ * we'd better not clobber it. We need to calculate our address
+ * again because %ecx and %edx are not preserved across EFI function
+ * calls.
+ */
+ call 1f
+1: popl %edx
+ subl $1b, %edx
+
+ movl efi_rt_function_ptr(%edx), %ecx
+ pushl %ecx
+
+ /*
+ * 10. Push the saved return address onto the stack and return.
+ */
+ movl saved_return_addr(%edx), %ecx
+ pushl %ecx
+ ret
+ENDPROC(efi_call_phys)
+.previous
+
+.data
+saved_return_addr:
+ .long 0
+efi_rt_function_ptr:
+ .long 0
diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S
new file mode 100644
index 000000000..99494dff2
--- /dev/null
+++ b/arch/x86/boot/compressed/efi_stub_64.S
@@ -0,0 +1,5 @@
+#include <asm/segment.h>
+#include <asm/msr.h>
+#include <asm/processor-flags.h>
+
+#include "../../platform/efi/efi_stub_64.S"
diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S
new file mode 100644
index 000000000..bff9ab7c6
--- /dev/null
+++ b/arch/x86/boot/compressed/efi_thunk_64.S
@@ -0,0 +1,197 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming
+ *
+ * Early support for invoking 32-bit EFI services from a 64-bit kernel.
+ *
+ * Because this thunking occurs before ExitBootServices() we have to
+ * restore the firmware's 32-bit GDT before we make EFI serivce calls,
+ * since the firmware's 32-bit IDT is still currently installed and it
+ * needs to be able to service interrupts.
+ *
+ * On the plus side, we don't have to worry about mangling 64-bit
+ * addresses into 32-bits because we're executing with an identify
+ * mapped pagetable and haven't transitioned to 64-bit virtual addresses
+ * yet.
+ */
+
+#include <linux/linkage.h>
+#include <asm/msr.h>
+#include <asm/page_types.h>
+#include <asm/processor-flags.h>
+#include <asm/segment.h>
+
+ .code64
+ .text
+ENTRY(efi64_thunk)
+ push %rbp
+ push %rbx
+
+ subq $8, %rsp
+ leaq efi_exit32(%rip), %rax
+ movl %eax, 4(%rsp)
+ leaq efi_gdt64(%rip), %rax
+ movl %eax, (%rsp)
+ movl %eax, 2(%rax) /* Fixup the gdt base address */
+
+ movl %ds, %eax
+ push %rax
+ movl %es, %eax
+ push %rax
+ movl %ss, %eax
+ push %rax
+
+ /*
+ * Convert x86-64 ABI params to i386 ABI
+ */
+ subq $32, %rsp
+ movl %esi, 0x0(%rsp)
+ movl %edx, 0x4(%rsp)
+ movl %ecx, 0x8(%rsp)
+ movq %r8, %rsi
+ movl %esi, 0xc(%rsp)
+ movq %r9, %rsi
+ movl %esi, 0x10(%rsp)
+
+ sgdt save_gdt(%rip)
+
+ leaq 1f(%rip), %rbx
+ movq %rbx, func_rt_ptr(%rip)
+
+ /*
+ * Switch to gdt with 32-bit segments. This is the firmware GDT
+ * that was installed when the kernel started executing. This
+ * pointer was saved at the EFI stub entry point in head_64.S.
+ */
+ leaq efi32_boot_gdt(%rip), %rax
+ lgdt (%rax)
+
+ pushq $__KERNEL_CS
+ leaq efi_enter32(%rip), %rax
+ pushq %rax
+ lretq
+
+1: addq $32, %rsp
+
+ lgdt save_gdt(%rip)
+
+ pop %rbx
+ movl %ebx, %ss
+ pop %rbx
+ movl %ebx, %es
+ pop %rbx
+ movl %ebx, %ds
+
+ /*
+ * Convert 32-bit status code into 64-bit.
+ */
+ test %rax, %rax
+ jz 1f
+ movl %eax, %ecx
+ andl $0x0fffffff, %ecx
+ andl $0xf0000000, %eax
+ shl $32, %rax
+ or %rcx, %rax
+1:
+ addq $8, %rsp
+ pop %rbx
+ pop %rbp
+ ret
+ENDPROC(efi64_thunk)
+
+ENTRY(efi_exit32)
+ movq func_rt_ptr(%rip), %rax
+ push %rax
+ mov %rdi, %rax
+ ret
+ENDPROC(efi_exit32)
+
+ .code32
+/*
+ * EFI service pointer must be in %edi.
+ *
+ * The stack should represent the 32-bit calling convention.
+ */
+ENTRY(efi_enter32)
+ movl $__KERNEL_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %ss
+
+ /* Reload pgtables */
+ movl %cr3, %eax
+ movl %eax, %cr3
+
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /* Disable long mode via EFER */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btrl $_EFER_LME, %eax
+ wrmsr
+
+ call *%edi
+
+ /* We must preserve return value */
+ movl %eax, %edi
+
+ /*
+ * Some firmware will return with interrupts enabled. Be sure to
+ * disable them before we switch GDTs.
+ */
+ cli
+
+ movl 56(%esp), %eax
+ movl %eax, 2(%eax)
+ lgdtl (%eax)
+
+ movl %cr4, %eax
+ btsl $(X86_CR4_PAE_BIT), %eax
+ movl %eax, %cr4
+
+ movl %cr3, %eax
+ movl %eax, %cr3
+
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ wrmsr
+
+ xorl %eax, %eax
+ lldt %ax
+
+ movl 60(%esp), %eax
+ pushl $__KERNEL_CS
+ pushl %eax
+
+ /* Enable paging */
+ movl %cr0, %eax
+ btsl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+ lret
+ENDPROC(efi_enter32)
+
+ .data
+ .balign 8
+ .global efi32_boot_gdt
+efi32_boot_gdt: .word 0
+ .quad 0
+
+save_gdt: .word 0
+ .quad 0
+func_rt_ptr: .quad 0
+
+ .global efi_gdt64
+efi_gdt64:
+ .word efi_gdt64_end - efi_gdt64
+ .long 0 /* Filled out by user */
+ .word 0
+ .quad 0x0000000000000000 /* NULL descriptor */
+ .quad 0x00af9a000000ffff /* __KERNEL_CS */
+ .quad 0x00cf92000000ffff /* __KERNEL_DS */
+ .quad 0x0080890000000000 /* TS descriptor */
+ .quad 0x0000000000000000 /* TS continued */
+efi_gdt64_end:
diff --git a/arch/x86/boot/compressed/error.c b/arch/x86/boot/compressed/error.c
new file mode 100644
index 000000000..c881878e5
--- /dev/null
+++ b/arch/x86/boot/compressed/error.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Callers outside of misc.c need access to the error reporting routines,
+ * but the *_putstr() functions need to stay in misc.c because of how
+ * memcpy() and memmove() are defined for the compressed boot environment.
+ */
+#include "misc.h"
+#include "error.h"
+
+void warn(char *m)
+{
+ error_putstr("\n\n");
+ error_putstr(m);
+ error_putstr("\n\n");
+}
+
+void error(char *m)
+{
+ warn(m);
+ error_putstr(" -- System halted");
+
+ while (1)
+ asm("hlt");
+}
diff --git a/arch/x86/boot/compressed/error.h b/arch/x86/boot/compressed/error.h
new file mode 100644
index 000000000..1de582118
--- /dev/null
+++ b/arch/x86/boot/compressed/error.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_COMPRESSED_ERROR_H
+#define BOOT_COMPRESSED_ERROR_H
+
+#include <linux/compiler.h>
+
+void warn(char *m);
+void error(char *m) __noreturn;
+
+#endif /* BOOT_COMPRESSED_ERROR_H */
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
new file mode 100644
index 000000000..c6c4b877f
--- /dev/null
+++ b/arch/x86/boot/compressed/head_32.S
@@ -0,0 +1,283 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * linux/boot/head.S
+ *
+ * Copyright (C) 1991, 1992, 1993 Linus Torvalds
+ */
+
+/*
+ * head.S contains the 32-bit startup code.
+ *
+ * NOTE!!! Startup happens at absolute address 0x00001000, which is also where
+ * the page directory will exist. The startup code will be overwritten by
+ * the page directory. [According to comments etc elsewhere on a compressed
+ * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC]
+ *
+ * Page 0 is deliberately kept safe, since System Management Mode code in
+ * laptops may need to access the BIOS data stored there. This is also
+ * useful for future device drivers that either access the BIOS via VM86
+ * mode.
+ */
+
+/*
+ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
+ */
+ .text
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page_types.h>
+#include <asm/boot.h>
+#include <asm/asm-offsets.h>
+#include <asm/bootparam.h>
+
+/*
+ * The 32-bit x86 assembler in binutils 2.26 will generate R_386_GOT32X
+ * relocation to get the symbol address in PIC. When the compressed x86
+ * kernel isn't built as PIC, the linker optimizes R_386_GOT32X
+ * relocations to their fixed symbol addresses. However, when the
+ * compressed x86 kernel is loaded at a different address, it leads
+ * to the following load failure:
+ *
+ * Failed to allocate space for phdrs
+ *
+ * during the decompression stage.
+ *
+ * If the compressed x86 kernel is relocatable at run-time, it should be
+ * compiled with -fPIE, instead of -fPIC, if possible and should be built as
+ * Position Independent Executable (PIE) so that linker won't optimize
+ * R_386_GOT32X relocation to its fixed symbol address. Older
+ * linkers generate R_386_32 relocations against locally defined symbols,
+ * _bss, _ebss, _got, _egot and _end, in PIE. It isn't wrong, just less
+ * optimal than R_386_RELATIVE. But the x86 kernel fails to properly handle
+ * R_386_32 relocations when relocating the kernel. To generate
+ * R_386_RELATIVE relocations, we mark _bss, _ebss, _got, _egot and _end as
+ * hidden:
+ */
+ .hidden _bss
+ .hidden _ebss
+ .hidden _got
+ .hidden _egot
+ .hidden _end
+
+ __HEAD
+ENTRY(startup_32)
+ cld
+ /*
+ * Test KEEP_SEGMENTS flag to see if the bootloader is asking
+ * us to not reload segments
+ */
+ testb $KEEP_SEGMENTS, BP_loadflags(%esi)
+ jnz 1f
+
+ cli
+ movl $__BOOT_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %fs
+ movl %eax, %gs
+ movl %eax, %ss
+1:
+
+/*
+ * Calculate the delta between where we were compiled to run
+ * at and where we were actually loaded at. This can only be done
+ * with a short local call on x86. Nothing else will tell us what
+ * address we are running at. The reserved chunk of the real-mode
+ * data at 0x1e4 (defined as a scratch field) are used as the stack
+ * for this calculation. Only 4 bytes are needed.
+ */
+ leal (BP_scratch+4)(%esi), %esp
+ call 1f
+1: popl %ebp
+ subl $1b, %ebp
+
+/*
+ * %ebp contains the address we are loaded at by the boot loader and %ebx
+ * contains the address where we should move the kernel image temporarily
+ * for safe in-place decompression.
+ */
+
+#ifdef CONFIG_RELOCATABLE
+ movl %ebp, %ebx
+ movl BP_kernel_alignment(%esi), %eax
+ decl %eax
+ addl %eax, %ebx
+ notl %eax
+ andl %eax, %ebx
+ cmpl $LOAD_PHYSICAL_ADDR, %ebx
+ jae 1f
+#endif
+ movl $LOAD_PHYSICAL_ADDR, %ebx
+1:
+
+ /* Target address to relocate to for decompression */
+ movl BP_init_size(%esi), %eax
+ subl $_end, %eax
+ addl %eax, %ebx
+
+ /* Set up the stack */
+ leal boot_stack_end(%ebx), %esp
+
+ /* Zero EFLAGS */
+ pushl $0
+ popfl
+
+/*
+ * Copy the compressed kernel to the end of our buffer
+ * where decompression in place becomes safe.
+ */
+ pushl %esi
+ leal (_bss-4)(%ebp), %esi
+ leal (_bss-4)(%ebx), %edi
+ movl $(_bss - startup_32), %ecx
+ shrl $2, %ecx
+ std
+ rep movsl
+ cld
+ popl %esi
+
+/*
+ * Jump to the relocated address.
+ */
+ leal relocated(%ebx), %eax
+ jmp *%eax
+ENDPROC(startup_32)
+
+#ifdef CONFIG_EFI_STUB
+/*
+ * We don't need the return address, so set up the stack so efi_main() can find
+ * its arguments.
+ */
+ENTRY(efi_pe_entry)
+ add $0x4, %esp
+
+ call 1f
+1: popl %esi
+ subl $1b, %esi
+
+ popl %ecx
+ movl %ecx, efi32_config(%esi) /* Handle */
+ popl %ecx
+ movl %ecx, efi32_config+8(%esi) /* EFI System table pointer */
+
+ /* Relocate efi_config->call() */
+ leal efi32_config(%esi), %eax
+ add %esi, 40(%eax)
+ pushl %eax
+
+ call make_boot_params
+ cmpl $0, %eax
+ je fail
+ movl %esi, BP_code32_start(%eax)
+ popl %ecx
+ pushl %eax
+ pushl %ecx
+ jmp 2f /* Skip efi_config initialization */
+ENDPROC(efi_pe_entry)
+
+ENTRY(efi32_stub_entry)
+ add $0x4, %esp
+ popl %ecx
+ popl %edx
+
+ call 1f
+1: popl %esi
+ subl $1b, %esi
+
+ movl %ecx, efi32_config(%esi) /* Handle */
+ movl %edx, efi32_config+8(%esi) /* EFI System table pointer */
+
+ /* Relocate efi_config->call() */
+ leal efi32_config(%esi), %eax
+ add %esi, 40(%eax)
+ pushl %eax
+2:
+ call efi_main
+ cmpl $0, %eax
+ movl %eax, %esi
+ jne 2f
+fail:
+ /* EFI init failed, so hang. */
+ hlt
+ jmp fail
+2:
+ movl BP_code32_start(%esi), %eax
+ leal startup_32(%eax), %eax
+ jmp *%eax
+ENDPROC(efi32_stub_entry)
+#endif
+
+ .text
+relocated:
+
+/*
+ * Clear BSS (stack is currently empty)
+ */
+ xorl %eax, %eax
+ leal _bss(%ebx), %edi
+ leal _ebss(%ebx), %ecx
+ subl %edi, %ecx
+ shrl $2, %ecx
+ rep stosl
+
+/*
+ * Adjust our own GOT
+ */
+ leal _got(%ebx), %edx
+ leal _egot(%ebx), %ecx
+1:
+ cmpl %ecx, %edx
+ jae 2f
+ addl %ebx, (%edx)
+ addl $4, %edx
+ jmp 1b
+2:
+
+/*
+ * Do the extraction, and jump to the new kernel..
+ */
+ /* push arguments for extract_kernel: */
+ pushl $z_output_len /* decompressed length, end of relocs */
+
+ movl BP_init_size(%esi), %eax
+ subl $_end, %eax
+ movl %ebx, %ebp
+ subl %eax, %ebp
+ pushl %ebp /* output address */
+
+ pushl $z_input_len /* input_len */
+ leal input_data(%ebx), %eax
+ pushl %eax /* input_data */
+ leal boot_heap(%ebx), %eax
+ pushl %eax /* heap area */
+ pushl %esi /* real mode pointer */
+ call extract_kernel /* returns kernel location in %eax */
+ addl $24, %esp
+
+/*
+ * Jump to the extracted kernel.
+ */
+ xorl %ebx, %ebx
+ jmp *%eax
+
+#ifdef CONFIG_EFI_STUB
+ .data
+efi32_config:
+ .fill 5,8,0
+ .long efi_call_phys
+ .long 0
+ .byte 0
+#endif
+
+/*
+ * Stack and heap for uncompression
+ */
+ .bss
+ .balign 4
+boot_heap:
+ .fill BOOT_HEAP_SIZE, 1, 0
+boot_stack:
+ .fill BOOT_STACK_SIZE, 1, 0
+boot_stack_end:
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
new file mode 100644
index 000000000..474733f8b
--- /dev/null
+++ b/arch/x86/boot/compressed/head_64.S
@@ -0,0 +1,721 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * linux/boot/head.S
+ *
+ * Copyright (C) 1991, 1992, 1993 Linus Torvalds
+ */
+
+/*
+ * head.S contains the 32-bit startup code.
+ *
+ * NOTE!!! Startup happens at absolute address 0x00001000, which is also where
+ * the page directory will exist. The startup code will be overwritten by
+ * the page directory. [According to comments etc elsewhere on a compressed
+ * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC]
+ *
+ * Page 0 is deliberately kept safe, since System Management Mode code in
+ * laptops may need to access the BIOS data stored there. This is also
+ * useful for future device drivers that either access the BIOS via VM86
+ * mode.
+ */
+
+/*
+ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
+ */
+ .code32
+ .text
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/boot.h>
+#include <asm/msr.h>
+#include <asm/processor-flags.h>
+#include <asm/asm-offsets.h>
+#include <asm/bootparam.h>
+#include "pgtable.h"
+
+/*
+ * Locally defined symbols should be marked hidden:
+ */
+ .hidden _bss
+ .hidden _ebss
+ .hidden _got
+ .hidden _egot
+ .hidden _end
+
+ __HEAD
+ .code32
+ENTRY(startup_32)
+ /*
+ * 32bit entry is 0 and it is ABI so immutable!
+ * If we come here directly from a bootloader,
+ * kernel(text+data+bss+brk) ramdisk, zero_page, command line
+ * all need to be under the 4G limit.
+ */
+ cld
+ /*
+ * Test KEEP_SEGMENTS flag to see if the bootloader is asking
+ * us to not reload segments
+ */
+ testb $KEEP_SEGMENTS, BP_loadflags(%esi)
+ jnz 1f
+
+ cli
+ movl $(__BOOT_DS), %eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %ss
+1:
+
+/*
+ * Calculate the delta between where we were compiled to run
+ * at and where we were actually loaded at. This can only be done
+ * with a short local call on x86. Nothing else will tell us what
+ * address we are running at. The reserved chunk of the real-mode
+ * data at 0x1e4 (defined as a scratch field) are used as the stack
+ * for this calculation. Only 4 bytes are needed.
+ */
+ leal (BP_scratch+4)(%esi), %esp
+ call 1f
+1: popl %ebp
+ subl $1b, %ebp
+
+/* setup a stack and make sure cpu supports long mode. */
+ movl $boot_stack_end, %eax
+ addl %ebp, %eax
+ movl %eax, %esp
+
+ call verify_cpu
+ testl %eax, %eax
+ jnz no_longmode
+
+/*
+ * Compute the delta between where we were compiled to run at
+ * and where the code will actually run at.
+ *
+ * %ebp contains the address we are loaded at by the boot loader and %ebx
+ * contains the address where we should move the kernel image temporarily
+ * for safe in-place decompression.
+ */
+
+#ifdef CONFIG_RELOCATABLE
+ movl %ebp, %ebx
+ movl BP_kernel_alignment(%esi), %eax
+ decl %eax
+ addl %eax, %ebx
+ notl %eax
+ andl %eax, %ebx
+ cmpl $LOAD_PHYSICAL_ADDR, %ebx
+ jae 1f
+#endif
+ movl $LOAD_PHYSICAL_ADDR, %ebx
+1:
+
+ /* Target address to relocate to for decompression */
+ movl BP_init_size(%esi), %eax
+ subl $_end, %eax
+ addl %eax, %ebx
+
+/*
+ * Prepare for entering 64 bit mode
+ */
+
+ /* Load new GDT with the 64bit segments using 32bit descriptor */
+ addl %ebp, gdt+2(%ebp)
+ lgdt gdt(%ebp)
+
+ /* Enable PAE mode */
+ movl %cr4, %eax
+ orl $X86_CR4_PAE, %eax
+ movl %eax, %cr4
+
+ /*
+ * Build early 4G boot pagetable
+ */
+ /*
+ * If SEV is active then set the encryption mask in the page tables.
+ * This will insure that when the kernel is copied and decompressed
+ * it will be done so encrypted.
+ */
+ call get_sev_encryption_bit
+ xorl %edx, %edx
+ testl %eax, %eax
+ jz 1f
+ subl $32, %eax /* Encryption bit is always above bit 31 */
+ bts %eax, %edx /* Set encryption mask for page tables */
+1:
+
+ /* Initialize Page tables to 0 */
+ leal pgtable(%ebx), %edi
+ xorl %eax, %eax
+ movl $(BOOT_INIT_PGT_SIZE/4), %ecx
+ rep stosl
+
+ /* Build Level 4 */
+ leal pgtable + 0(%ebx), %edi
+ leal 0x1007 (%edi), %eax
+ movl %eax, 0(%edi)
+ addl %edx, 4(%edi)
+
+ /* Build Level 3 */
+ leal pgtable + 0x1000(%ebx), %edi
+ leal 0x1007(%edi), %eax
+ movl $4, %ecx
+1: movl %eax, 0x00(%edi)
+ addl %edx, 0x04(%edi)
+ addl $0x00001000, %eax
+ addl $8, %edi
+ decl %ecx
+ jnz 1b
+
+ /* Build Level 2 */
+ leal pgtable + 0x2000(%ebx), %edi
+ movl $0x00000183, %eax
+ movl $2048, %ecx
+1: movl %eax, 0(%edi)
+ addl %edx, 4(%edi)
+ addl $0x00200000, %eax
+ addl $8, %edi
+ decl %ecx
+ jnz 1b
+
+ /* Enable the boot page tables */
+ leal pgtable(%ebx), %eax
+ movl %eax, %cr3
+
+ /* Enable Long mode in EFER (Extended Feature Enable Register) */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ wrmsr
+
+ /* After gdt is loaded */
+ xorl %eax, %eax
+ lldt %ax
+ movl $__BOOT_TSS, %eax
+ ltr %ax
+
+ /*
+ * Setup for the jump to 64bit mode
+ *
+ * When the jump is performend we will be in long mode but
+ * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
+ * (and in turn EFER.LMA = 1). To jump into 64bit mode we use
+ * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
+ * We place all of the values on our mini stack so lret can
+ * used to perform that far jump.
+ */
+ pushl $__KERNEL_CS
+ leal startup_64(%ebp), %eax
+#ifdef CONFIG_EFI_MIXED
+ movl efi32_config(%ebp), %ebx
+ cmp $0, %ebx
+ jz 1f
+ leal handover_entry(%ebp), %eax
+1:
+#endif
+ pushl %eax
+
+ /* Enter paged protected Mode, activating Long Mode */
+ movl $(X86_CR0_PG | X86_CR0_PE), %eax /* Enable Paging and Protected mode */
+ movl %eax, %cr0
+
+ /* Jump from 32bit compatibility mode into 64bit mode. */
+ lret
+ENDPROC(startup_32)
+
+#ifdef CONFIG_EFI_MIXED
+ .org 0x190
+ENTRY(efi32_stub_entry)
+ add $0x4, %esp /* Discard return address */
+ popl %ecx
+ popl %edx
+ popl %esi
+
+ leal (BP_scratch+4)(%esi), %esp
+ call 1f
+1: pop %ebp
+ subl $1b, %ebp
+
+ movl %ecx, efi32_config(%ebp)
+ movl %edx, efi32_config+8(%ebp)
+ sgdtl efi32_boot_gdt(%ebp)
+
+ leal efi32_config(%ebp), %eax
+ movl %eax, efi_config(%ebp)
+
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ jmp startup_32
+ENDPROC(efi32_stub_entry)
+#endif
+
+ .code64
+ .org 0x200
+ENTRY(startup_64)
+ /*
+ * 64bit entry is 0x200 and it is ABI so immutable!
+ * We come here either from startup_32 or directly from a
+ * 64bit bootloader.
+ * If we come here from a bootloader, kernel(text+data+bss+brk),
+ * ramdisk, zero_page, command line could be above 4G.
+ * We depend on an identity mapped page table being provided
+ * that maps our entire kernel(text+data+bss+brk), zero page
+ * and command line.
+ */
+
+ /* Setup data segments. */
+ xorl %eax, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %ss
+ movl %eax, %fs
+ movl %eax, %gs
+
+ /*
+ * Compute the decompressed kernel start address. It is where
+ * we were loaded at aligned to a 2M boundary. %rbp contains the
+ * decompressed kernel start address.
+ *
+ * If it is a relocatable kernel then decompress and run the kernel
+ * from load address aligned to 2MB addr, otherwise decompress and
+ * run the kernel from LOAD_PHYSICAL_ADDR
+ *
+ * We cannot rely on the calculation done in 32-bit mode, since we
+ * may have been invoked via the 64-bit entry point.
+ */
+
+ /* Start with the delta to where the kernel will run at. */
+#ifdef CONFIG_RELOCATABLE
+ leaq startup_32(%rip) /* - $startup_32 */, %rbp
+ movl BP_kernel_alignment(%rsi), %eax
+ decl %eax
+ addq %rax, %rbp
+ notq %rax
+ andq %rax, %rbp
+ cmpq $LOAD_PHYSICAL_ADDR, %rbp
+ jae 1f
+#endif
+ movq $LOAD_PHYSICAL_ADDR, %rbp
+1:
+
+ /* Target address to relocate to for decompression */
+ movl BP_init_size(%rsi), %ebx
+ subl $_end, %ebx
+ addq %rbp, %rbx
+
+ /* Set up the stack */
+ leaq boot_stack_end(%rbx), %rsp
+
+ /*
+ * paging_prepare() and cleanup_trampoline() below can have GOT
+ * references. Adjust the table with address we are running at.
+ *
+ * Zero RAX for adjust_got: the GOT was not adjusted before;
+ * there's no adjustment to undo.
+ */
+ xorq %rax, %rax
+
+ /*
+ * Calculate the address the binary is loaded at and use it as
+ * a GOT adjustment.
+ */
+ call 1f
+1: popq %rdi
+ subq $1b, %rdi
+
+ call adjust_got
+
+ /*
+ * At this point we are in long mode with 4-level paging enabled,
+ * but we might want to enable 5-level paging or vice versa.
+ *
+ * The problem is that we cannot do it directly. Setting or clearing
+ * CR4.LA57 in long mode would trigger #GP. So we need to switch off
+ * long mode and paging first.
+ *
+ * We also need a trampoline in lower memory to switch over from
+ * 4- to 5-level paging for cases when the bootloader puts the kernel
+ * above 4G, but didn't enable 5-level paging for us.
+ *
+ * The same trampoline can be used to switch from 5- to 4-level paging
+ * mode, like when starting 4-level paging kernel via kexec() when
+ * original kernel worked in 5-level paging mode.
+ *
+ * For the trampoline, we need the top page table to reside in lower
+ * memory as we don't have a way to load 64-bit values into CR3 in
+ * 32-bit mode.
+ *
+ * We go though the trampoline even if we don't have to: if we're
+ * already in a desired paging mode. This way the trampoline code gets
+ * tested on every boot.
+ */
+
+ /* Make sure we have GDT with 32-bit code segment */
+ leaq gdt(%rip), %rax
+ movq %rax, gdt64+2(%rip)
+ lgdt gdt64(%rip)
+
+ /*
+ * paging_prepare() sets up the trampoline and checks if we need to
+ * enable 5-level paging.
+ *
+ * Address of the trampoline is returned in RAX.
+ * Non zero RDX on return means we need to enable 5-level paging.
+ *
+ * RSI holds real mode data and needs to be preserved across
+ * this function call.
+ */
+ pushq %rsi
+ movq %rsi, %rdi /* real mode address */
+ call paging_prepare
+ popq %rsi
+
+ /* Save the trampoline address in RCX */
+ movq %rax, %rcx
+
+ /*
+ * Load the address of trampoline_return() into RDI.
+ * It will be used by the trampoline to return to the main code.
+ */
+ leaq trampoline_return(%rip), %rdi
+
+ /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
+ pushq $__KERNEL32_CS
+ leaq TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax
+ pushq %rax
+ lretq
+trampoline_return:
+ /* Restore the stack, the 32-bit trampoline uses its own stack */
+ leaq boot_stack_end(%rbx), %rsp
+
+ /*
+ * cleanup_trampoline() would restore trampoline memory.
+ *
+ * RDI is address of the page table to use instead of page table
+ * in trampoline memory (if required).
+ *
+ * RSI holds real mode data and needs to be preserved across
+ * this function call.
+ */
+ pushq %rsi
+ leaq top_pgtable(%rbx), %rdi
+ call cleanup_trampoline
+ popq %rsi
+
+ /* Zero EFLAGS */
+ pushq $0
+ popfq
+
+ /*
+ * Previously we've adjusted the GOT with address the binary was
+ * loaded at. Now we need to re-adjust for relocation address.
+ *
+ * Calculate the address the binary is loaded at, so that we can
+ * undo the previous GOT adjustment.
+ */
+ call 1f
+1: popq %rax
+ subq $1b, %rax
+
+ /* The new adjustment is the relocation address */
+ movq %rbx, %rdi
+ call adjust_got
+
+/*
+ * Copy the compressed kernel to the end of our buffer
+ * where decompression in place becomes safe.
+ */
+ pushq %rsi
+ leaq (_bss-8)(%rip), %rsi
+ leaq (_bss-8)(%rbx), %rdi
+ movq $_bss /* - $startup_32 */, %rcx
+ shrq $3, %rcx
+ std
+ rep movsq
+ cld
+ popq %rsi
+
+/*
+ * Jump to the relocated address.
+ */
+ leaq relocated(%rbx), %rax
+ jmp *%rax
+
+#ifdef CONFIG_EFI_STUB
+
+/* The entry point for the PE/COFF executable is efi_pe_entry. */
+ENTRY(efi_pe_entry)
+ movq %rcx, efi64_config(%rip) /* Handle */
+ movq %rdx, efi64_config+8(%rip) /* EFI System table pointer */
+
+ leaq efi64_config(%rip), %rax
+ movq %rax, efi_config(%rip)
+
+ call 1f
+1: popq %rbp
+ subq $1b, %rbp
+
+ /*
+ * Relocate efi_config->call().
+ */
+ addq %rbp, efi64_config+40(%rip)
+
+ movq %rax, %rdi
+ call make_boot_params
+ cmpq $0,%rax
+ je fail
+ mov %rax, %rsi
+ leaq startup_32(%rip), %rax
+ movl %eax, BP_code32_start(%rsi)
+ jmp 2f /* Skip the relocation */
+
+handover_entry:
+ call 1f
+1: popq %rbp
+ subq $1b, %rbp
+
+ /*
+ * Relocate efi_config->call().
+ */
+ movq efi_config(%rip), %rax
+ addq %rbp, 40(%rax)
+2:
+ movq efi_config(%rip), %rdi
+ call efi_main
+ movq %rax,%rsi
+ cmpq $0,%rax
+ jne 2f
+fail:
+ /* EFI init failed, so hang. */
+ hlt
+ jmp fail
+2:
+ movl BP_code32_start(%esi), %eax
+ leaq startup_64(%rax), %rax
+ jmp *%rax
+ENDPROC(efi_pe_entry)
+
+ .org 0x390
+ENTRY(efi64_stub_entry)
+ movq %rdi, efi64_config(%rip) /* Handle */
+ movq %rsi, efi64_config+8(%rip) /* EFI System table pointer */
+
+ leaq efi64_config(%rip), %rax
+ movq %rax, efi_config(%rip)
+
+ movq %rdx, %rsi
+ jmp handover_entry
+ENDPROC(efi64_stub_entry)
+#endif
+
+ .text
+relocated:
+
+/*
+ * Clear BSS (stack is currently empty)
+ */
+ xorl %eax, %eax
+ leaq _bss(%rip), %rdi
+ leaq _ebss(%rip), %rcx
+ subq %rdi, %rcx
+ shrq $3, %rcx
+ rep stosq
+
+/*
+ * Do the extraction, and jump to the new kernel..
+ */
+ pushq %rsi /* Save the real mode argument */
+ movq %rsi, %rdi /* real mode address */
+ leaq boot_heap(%rip), %rsi /* malloc area for uncompression */
+ leaq input_data(%rip), %rdx /* input_data */
+ movl $z_input_len, %ecx /* input_len */
+ movq %rbp, %r8 /* output target address */
+ movq $z_output_len, %r9 /* decompressed length, end of relocs */
+ call extract_kernel /* returns kernel location in %rax */
+ popq %rsi
+
+/*
+ * Jump to the decompressed kernel.
+ */
+ jmp *%rax
+
+/*
+ * Adjust the global offset table
+ *
+ * RAX is the previous adjustment of the table to undo (use 0 if it's the
+ * first time we touch GOT).
+ * RDI is the new adjustment to apply.
+ */
+adjust_got:
+ /* Walk through the GOT adding the address to the entries */
+ leaq _got(%rip), %rdx
+ leaq _egot(%rip), %rcx
+1:
+ cmpq %rcx, %rdx
+ jae 2f
+ subq %rax, (%rdx) /* Undo previous adjustment */
+ addq %rdi, (%rdx) /* Apply the new adjustment */
+ addq $8, %rdx
+ jmp 1b
+2:
+ ret
+
+ .code32
+/*
+ * This is the 32-bit trampoline that will be copied over to low memory.
+ *
+ * RDI contains the return address (might be above 4G).
+ * ECX contains the base address of the trampoline memory.
+ * Non zero RDX on return means we need to enable 5-level paging.
+ */
+ENTRY(trampoline_32bit_src)
+ /* Set up data and stack segments */
+ movl $__KERNEL_DS, %eax
+ movl %eax, %ds
+ movl %eax, %ss
+
+ /* Set up new stack */
+ leal TRAMPOLINE_32BIT_STACK_END(%ecx), %esp
+
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /* Check what paging mode we want to be in after the trampoline */
+ cmpl $0, %edx
+ jz 1f
+
+ /* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */
+ movl %cr4, %eax
+ testl $X86_CR4_LA57, %eax
+ jnz 3f
+ jmp 2f
+1:
+ /* We want 4-level paging: don't touch CR3 if it already points to 4-level page tables */
+ movl %cr4, %eax
+ testl $X86_CR4_LA57, %eax
+ jz 3f
+2:
+ /* Point CR3 to the trampoline's new top level page table */
+ leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax
+ movl %eax, %cr3
+3:
+ /* Set EFER.LME=1 as a precaution in case hypervsior pulls the rug */
+ pushl %ecx
+ pushl %edx
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ wrmsr
+ popl %edx
+ popl %ecx
+
+ /* Enable PAE and LA57 (if required) paging modes */
+ movl $X86_CR4_PAE, %eax
+ cmpl $0, %edx
+ jz 1f
+ orl $X86_CR4_LA57, %eax
+1:
+ movl %eax, %cr4
+
+ /* Calculate address of paging_enabled() once we are executing in the trampoline */
+ leal paging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax
+
+ /* Prepare the stack for far return to Long Mode */
+ pushl $__KERNEL_CS
+ pushl %eax
+
+ /* Enable paging again */
+ movl $(X86_CR0_PG | X86_CR0_PE), %eax
+ movl %eax, %cr0
+
+ lret
+
+ .code64
+paging_enabled:
+ /* Return from the trampoline */
+ jmp *%rdi
+
+ /*
+ * The trampoline code has a size limit.
+ * Make sure we fail to compile if the trampoline code grows
+ * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes.
+ */
+ .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE
+
+ .code32
+no_longmode:
+ /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */
+1:
+ hlt
+ jmp 1b
+
+#include "../../kernel/verify_cpu.S"
+
+ .data
+gdt64:
+ .word gdt_end - gdt
+ .long 0
+ .word 0
+ .quad 0
+gdt:
+ .word gdt_end - gdt
+ .long gdt
+ .word 0
+ .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
+ .quad 0x00af9a000000ffff /* __KERNEL_CS */
+ .quad 0x00cf92000000ffff /* __KERNEL_DS */
+ .quad 0x0080890000000000 /* TS descriptor */
+ .quad 0x0000000000000000 /* TS continued */
+gdt_end:
+
+#ifdef CONFIG_EFI_STUB
+efi_config:
+ .quad 0
+
+#ifdef CONFIG_EFI_MIXED
+ .global efi32_config
+efi32_config:
+ .fill 5,8,0
+ .quad efi64_thunk
+ .byte 0
+#endif
+
+ .global efi64_config
+efi64_config:
+ .fill 5,8,0
+ .quad efi_call
+ .byte 1
+#endif /* CONFIG_EFI_STUB */
+
+/*
+ * Stack and heap for uncompression
+ */
+ .bss
+ .balign 4
+boot_heap:
+ .fill BOOT_HEAP_SIZE, 1, 0
+boot_stack:
+ .fill BOOT_STACK_SIZE, 1, 0
+boot_stack_end:
+
+/*
+ * Space for page tables (not in .bss so not zeroed)
+ */
+ .section ".pgtable","a",@nobits
+ .balign 4096
+pgtable:
+ .fill BOOT_PGT_SIZE, 1, 0
+
+/*
+ * The page table is going to be used instead of page table in the trampoline
+ * memory.
+ */
+top_pgtable:
+ .fill PAGE_SIZE, 1, 0
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
new file mode 100644
index 000000000..d1e19f358
--- /dev/null
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -0,0 +1,867 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * kaslr.c
+ *
+ * This contains the routines needed to generate a reasonable level of
+ * entropy to choose a randomized kernel base address offset in support
+ * of Kernel Address Space Layout Randomization (KASLR). Additionally
+ * handles walking the physical memory maps (and tracking memory regions
+ * to avoid) in order to select a physical memory location that can
+ * contain the entire properly aligned running kernel image.
+ *
+ */
+
+/*
+ * isspace() in linux/ctype.h is expected by next_args() to filter
+ * out "space/lf/tab". While boot/ctype.h conflicts with linux/ctype.h,
+ * since isdigit() is implemented in both of them. Hence disable it
+ * here.
+ */
+#define BOOT_CTYPE_H
+
+/*
+ * _ctype[] in lib/ctype.c is needed by isspace() of linux/ctype.h.
+ * While both lib/ctype.c and lib/cmdline.c will bring EXPORT_SYMBOL
+ * which is meaningless and will cause compiling error in some cases.
+ */
+#define __DISABLE_EXPORTS
+
+#include "misc.h"
+#include "error.h"
+#include "../string.h"
+
+#include <generated/compile.h>
+#include <linux/module.h>
+#include <linux/uts.h>
+#include <linux/utsname.h>
+#include <linux/ctype.h>
+#include <linux/efi.h>
+#include <generated/utsrelease.h>
+#include <asm/efi.h>
+
+/* Macros used by the included decompressor code below. */
+#define STATIC
+#include <linux/decompress/mm.h>
+
+#ifdef CONFIG_X86_5LEVEL
+unsigned int __pgtable_l5_enabled;
+unsigned int pgdir_shift __ro_after_init = 39;
+unsigned int ptrs_per_p4d __ro_after_init = 1;
+#endif
+
+extern unsigned long get_cmd_line_ptr(void);
+
+/* Used by PAGE_KERN* macros: */
+pteval_t __default_kernel_pte_mask __read_mostly = ~0;
+
+/* Simplified build-specific string for starting entropy. */
+static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
+ LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
+
+static unsigned long rotate_xor(unsigned long hash, const void *area,
+ size_t size)
+{
+ size_t i;
+ unsigned long *ptr = (unsigned long *)area;
+
+ for (i = 0; i < size / sizeof(hash); i++) {
+ /* Rotate by odd number of bits and XOR. */
+ hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
+ hash ^= ptr[i];
+ }
+
+ return hash;
+}
+
+/* Attempt to create a simple but unpredictable starting entropy. */
+static unsigned long get_boot_seed(void)
+{
+ unsigned long hash = 0;
+
+ hash = rotate_xor(hash, build_str, sizeof(build_str));
+ hash = rotate_xor(hash, boot_params, sizeof(*boot_params));
+
+ return hash;
+}
+
+#define KASLR_COMPRESSED_BOOT
+#include "../../lib/kaslr.c"
+
+struct mem_vector {
+ unsigned long long start;
+ unsigned long long size;
+};
+
+/* Only supporting at most 4 unusable memmap regions with kaslr */
+#define MAX_MEMMAP_REGIONS 4
+
+static bool memmap_too_large;
+
+
+/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */
+static unsigned long long mem_limit = ULLONG_MAX;
+
+
+enum mem_avoid_index {
+ MEM_AVOID_ZO_RANGE = 0,
+ MEM_AVOID_INITRD,
+ MEM_AVOID_CMDLINE,
+ MEM_AVOID_BOOTPARAMS,
+ MEM_AVOID_MEMMAP_BEGIN,
+ MEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1,
+ MEM_AVOID_MAX,
+};
+
+static struct mem_vector mem_avoid[MEM_AVOID_MAX];
+
+static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
+{
+ /* Item one is entirely before item two. */
+ if (one->start + one->size <= two->start)
+ return false;
+ /* Item one is entirely after item two. */
+ if (one->start >= two->start + two->size)
+ return false;
+ return true;
+}
+
+char *skip_spaces(const char *str)
+{
+ while (isspace(*str))
+ ++str;
+ return (char *)str;
+}
+#include "../../../../lib/ctype.c"
+#include "../../../../lib/cmdline.c"
+
+static int
+parse_memmap(char *p, unsigned long long *start, unsigned long long *size)
+{
+ char *oldp;
+
+ if (!p)
+ return -EINVAL;
+
+ /* We don't care about this option here */
+ if (!strncmp(p, "exactmap", 8))
+ return -EINVAL;
+
+ oldp = p;
+ *size = memparse(p, &p);
+ if (p == oldp)
+ return -EINVAL;
+
+ switch (*p) {
+ case '#':
+ case '$':
+ case '!':
+ *start = memparse(p + 1, &p);
+ return 0;
+ case '@':
+ /* memmap=nn@ss specifies usable region, should be skipped */
+ *size = 0;
+ /* Fall through */
+ default:
+ /*
+ * If w/o offset, only size specified, memmap=nn[KMG] has the
+ * same behaviour as mem=nn[KMG]. It limits the max address
+ * system can use. Region above the limit should be avoided.
+ */
+ *start = 0;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static void mem_avoid_memmap(char *str)
+{
+ static int i;
+
+ if (i >= MAX_MEMMAP_REGIONS)
+ return;
+
+ while (str && (i < MAX_MEMMAP_REGIONS)) {
+ int rc;
+ unsigned long long start, size;
+ char *k = strchr(str, ',');
+
+ if (k)
+ *k++ = 0;
+
+ rc = parse_memmap(str, &start, &size);
+ if (rc < 0)
+ break;
+ str = k;
+
+ if (start == 0) {
+ /* Store the specified memory limit if size > 0 */
+ if (size > 0)
+ mem_limit = size;
+
+ continue;
+ }
+
+ mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start;
+ mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size;
+ i++;
+ }
+
+ /* More than 4 memmaps, fail kaslr */
+ if ((i >= MAX_MEMMAP_REGIONS) && str)
+ memmap_too_large = true;
+}
+
+/* Store the number of 1GB huge pages which users specified: */
+static unsigned long max_gb_huge_pages;
+
+static void parse_gb_huge_pages(char *param, char *val)
+{
+ static bool gbpage_sz;
+ char *p;
+
+ if (!strcmp(param, "hugepagesz")) {
+ p = val;
+ if (memparse(p, &p) != PUD_SIZE) {
+ gbpage_sz = false;
+ return;
+ }
+
+ if (gbpage_sz)
+ warn("Repeatedly set hugeTLB page size of 1G!\n");
+ gbpage_sz = true;
+ return;
+ }
+
+ if (!strcmp(param, "hugepages") && gbpage_sz) {
+ p = val;
+ max_gb_huge_pages = simple_strtoull(p, &p, 0);
+ return;
+ }
+}
+
+
+static int handle_mem_options(void)
+{
+ char *args = (char *)get_cmd_line_ptr();
+ size_t len = strlen((char *)args);
+ char *tmp_cmdline;
+ char *param, *val;
+ u64 mem_size;
+
+ if (!strstr(args, "memmap=") && !strstr(args, "mem=") &&
+ !strstr(args, "hugepages"))
+ return 0;
+
+ tmp_cmdline = malloc(len + 1);
+ if (!tmp_cmdline)
+ error("Failed to allocate space for tmp_cmdline");
+
+ memcpy(tmp_cmdline, args, len);
+ tmp_cmdline[len] = 0;
+ args = tmp_cmdline;
+
+ /* Chew leading spaces */
+ args = skip_spaces(args);
+
+ while (*args) {
+ args = next_arg(args, &param, &val);
+ /* Stop at -- */
+ if (!val && strcmp(param, "--") == 0) {
+ warn("Only '--' specified in cmdline");
+ free(tmp_cmdline);
+ return -1;
+ }
+
+ if (!strcmp(param, "memmap")) {
+ mem_avoid_memmap(val);
+ } else if (strstr(param, "hugepages")) {
+ parse_gb_huge_pages(param, val);
+ } else if (!strcmp(param, "mem")) {
+ char *p = val;
+
+ if (!strcmp(p, "nopentium"))
+ continue;
+ mem_size = memparse(p, &p);
+ if (mem_size == 0) {
+ free(tmp_cmdline);
+ return -EINVAL;
+ }
+ mem_limit = mem_size;
+ }
+ }
+
+ free(tmp_cmdline);
+ return 0;
+}
+
+/*
+ * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T).
+ * The mem_avoid array is used to store the ranges that need to be avoided
+ * when KASLR searches for an appropriate random address. We must avoid any
+ * regions that are unsafe to overlap with during decompression, and other
+ * things like the initrd, cmdline and boot_params. This comment seeks to
+ * explain mem_avoid as clearly as possible since incorrect mem_avoid
+ * memory ranges lead to really hard to debug boot failures.
+ *
+ * The initrd, cmdline, and boot_params are trivial to identify for
+ * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and
+ * MEM_AVOID_BOOTPARAMS respectively below.
+ *
+ * What is not obvious how to avoid is the range of memory that is used
+ * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover
+ * the compressed kernel (ZO) and its run space, which is used to extract
+ * the uncompressed kernel (VO) and relocs.
+ *
+ * ZO's full run size sits against the end of the decompression buffer, so
+ * we can calculate where text, data, bss, etc of ZO are positioned more
+ * easily.
+ *
+ * For additional background, the decompression calculations can be found
+ * in header.S, and the memory diagram is based on the one found in misc.c.
+ *
+ * The following conditions are already enforced by the image layouts and
+ * associated code:
+ * - input + input_size >= output + output_size
+ * - kernel_total_size <= init_size
+ * - kernel_total_size <= output_size (see Note below)
+ * - output + init_size >= output + output_size
+ *
+ * (Note that kernel_total_size and output_size have no fundamental
+ * relationship, but output_size is passed to choose_random_location
+ * as a maximum of the two. The diagram is showing a case where
+ * kernel_total_size is larger than output_size, but this case is
+ * handled by bumping output_size.)
+ *
+ * The above conditions can be illustrated by a diagram:
+ *
+ * 0 output input input+input_size output+init_size
+ * | | | | |
+ * | | | | |
+ * |-----|--------|--------|--------------|-----------|--|-------------|
+ * | | |
+ * | | |
+ * output+init_size-ZO_INIT_SIZE output+output_size output+kernel_total_size
+ *
+ * [output, output+init_size) is the entire memory range used for
+ * extracting the compressed image.
+ *
+ * [output, output+kernel_total_size) is the range needed for the
+ * uncompressed kernel (VO) and its run size (bss, brk, etc).
+ *
+ * [output, output+output_size) is VO plus relocs (i.e. the entire
+ * uncompressed payload contained by ZO). This is the area of the buffer
+ * written to during decompression.
+ *
+ * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case
+ * range of the copied ZO and decompression code. (i.e. the range
+ * covered backwards of size ZO_INIT_SIZE, starting from output+init_size.)
+ *
+ * [input, input+input_size) is the original copied compressed image (ZO)
+ * (i.e. it does not include its run size). This range must be avoided
+ * because it contains the data used for decompression.
+ *
+ * [input+input_size, output+init_size) is [_text, _end) for ZO. This
+ * range includes ZO's heap and stack, and must be avoided since it
+ * performs the decompression.
+ *
+ * Since the above two ranges need to be avoided and they are adjacent,
+ * they can be merged, resulting in: [input, output+init_size) which
+ * becomes the MEM_AVOID_ZO_RANGE below.
+ */
+static void mem_avoid_init(unsigned long input, unsigned long input_size,
+ unsigned long output)
+{
+ unsigned long init_size = boot_params->hdr.init_size;
+ u64 initrd_start, initrd_size;
+ u64 cmd_line, cmd_line_size;
+ char *ptr;
+
+ /*
+ * Avoid the region that is unsafe to overlap during
+ * decompression.
+ */
+ mem_avoid[MEM_AVOID_ZO_RANGE].start = input;
+ mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input;
+ add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start,
+ mem_avoid[MEM_AVOID_ZO_RANGE].size);
+
+ /* Avoid initrd. */
+ initrd_start = (u64)boot_params->ext_ramdisk_image << 32;
+ initrd_start |= boot_params->hdr.ramdisk_image;
+ initrd_size = (u64)boot_params->ext_ramdisk_size << 32;
+ initrd_size |= boot_params->hdr.ramdisk_size;
+ mem_avoid[MEM_AVOID_INITRD].start = initrd_start;
+ mem_avoid[MEM_AVOID_INITRD].size = initrd_size;
+ /* No need to set mapping for initrd, it will be handled in VO. */
+
+ /* Avoid kernel command line. */
+ cmd_line = (u64)boot_params->ext_cmd_line_ptr << 32;
+ cmd_line |= boot_params->hdr.cmd_line_ptr;
+ /* Calculate size of cmd_line. */
+ ptr = (char *)(unsigned long)cmd_line;
+ for (cmd_line_size = 0; ptr[cmd_line_size++];)
+ ;
+ mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
+ mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
+ add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start,
+ mem_avoid[MEM_AVOID_CMDLINE].size);
+
+ /* Avoid boot parameters. */
+ mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params;
+ mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params);
+ add_identity_map(mem_avoid[MEM_AVOID_BOOTPARAMS].start,
+ mem_avoid[MEM_AVOID_BOOTPARAMS].size);
+
+ /* We don't need to set a mapping for setup_data. */
+
+ /* Mark the memmap regions we need to avoid */
+ handle_mem_options();
+
+#ifdef CONFIG_X86_VERBOSE_BOOTUP
+ /* Make sure video RAM can be used. */
+ add_identity_map(0, PMD_SIZE);
+#endif
+}
+
+/*
+ * Does this memory vector overlap a known avoided area? If so, record the
+ * overlap region with the lowest address.
+ */
+static bool mem_avoid_overlap(struct mem_vector *img,
+ struct mem_vector *overlap)
+{
+ int i;
+ struct setup_data *ptr;
+ unsigned long earliest = img->start + img->size;
+ bool is_overlapping = false;
+
+ for (i = 0; i < MEM_AVOID_MAX; i++) {
+ if (mem_overlaps(img, &mem_avoid[i]) &&
+ mem_avoid[i].start < earliest) {
+ *overlap = mem_avoid[i];
+ earliest = overlap->start;
+ is_overlapping = true;
+ }
+ }
+
+ /* Avoid all entries in the setup_data linked list. */
+ ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data;
+ while (ptr) {
+ struct mem_vector avoid;
+
+ avoid.start = (unsigned long)ptr;
+ avoid.size = sizeof(*ptr) + ptr->len;
+
+ if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) {
+ *overlap = avoid;
+ earliest = overlap->start;
+ is_overlapping = true;
+ }
+
+ ptr = (struct setup_data *)(unsigned long)ptr->next;
+ }
+
+ return is_overlapping;
+}
+
+struct slot_area {
+ unsigned long addr;
+ int num;
+};
+
+#define MAX_SLOT_AREA 100
+
+static struct slot_area slot_areas[MAX_SLOT_AREA];
+
+static unsigned long slot_max;
+
+static unsigned long slot_area_index;
+
+static void store_slot_info(struct mem_vector *region, unsigned long image_size)
+{
+ struct slot_area slot_area;
+
+ if (slot_area_index == MAX_SLOT_AREA)
+ return;
+
+ slot_area.addr = region->start;
+ slot_area.num = (region->size - image_size) /
+ CONFIG_PHYSICAL_ALIGN + 1;
+
+ if (slot_area.num > 0) {
+ slot_areas[slot_area_index++] = slot_area;
+ slot_max += slot_area.num;
+ }
+}
+
+/*
+ * Skip as many 1GB huge pages as possible in the passed region
+ * according to the number which users specified:
+ */
+static void
+process_gb_huge_pages(struct mem_vector *region, unsigned long image_size)
+{
+ unsigned long addr, size = 0;
+ struct mem_vector tmp;
+ int i = 0;
+
+ if (!max_gb_huge_pages) {
+ store_slot_info(region, image_size);
+ return;
+ }
+
+ addr = ALIGN(region->start, PUD_SIZE);
+ /* Did we raise the address above the passed in memory entry? */
+ if (addr < region->start + region->size)
+ size = region->size - (addr - region->start);
+
+ /* Check how many 1GB huge pages can be filtered out: */
+ while (size > PUD_SIZE && max_gb_huge_pages) {
+ size -= PUD_SIZE;
+ max_gb_huge_pages--;
+ i++;
+ }
+
+ /* No good 1GB huge pages found: */
+ if (!i) {
+ store_slot_info(region, image_size);
+ return;
+ }
+
+ /*
+ * Skip those 'i'*1GB good huge pages, and continue checking and
+ * processing the remaining head or tail part of the passed region
+ * if available.
+ */
+
+ if (addr >= region->start + image_size) {
+ tmp.start = region->start;
+ tmp.size = addr - region->start;
+ store_slot_info(&tmp, image_size);
+ }
+
+ size = region->size - (addr - region->start) - i * PUD_SIZE;
+ if (size >= image_size) {
+ tmp.start = addr + i * PUD_SIZE;
+ tmp.size = size;
+ store_slot_info(&tmp, image_size);
+ }
+}
+
+static unsigned long slots_fetch_random(void)
+{
+ unsigned long slot;
+ int i;
+
+ /* Handle case of no slots stored. */
+ if (slot_max == 0)
+ return 0;
+
+ slot = kaslr_get_random_long("Physical") % slot_max;
+
+ for (i = 0; i < slot_area_index; i++) {
+ if (slot >= slot_areas[i].num) {
+ slot -= slot_areas[i].num;
+ continue;
+ }
+ return slot_areas[i].addr + slot * CONFIG_PHYSICAL_ALIGN;
+ }
+
+ if (i == slot_area_index)
+ debug_putstr("slots_fetch_random() failed!?\n");
+ return 0;
+}
+
+static void process_mem_region(struct mem_vector *entry,
+ unsigned long minimum,
+ unsigned long image_size)
+{
+ struct mem_vector region, overlap;
+ struct slot_area slot_area;
+ unsigned long start_orig, end;
+ struct mem_vector cur_entry;
+
+ /* On 32-bit, ignore entries entirely above our maximum. */
+ if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE)
+ return;
+
+ /* Ignore entries entirely below our minimum. */
+ if (entry->start + entry->size < minimum)
+ return;
+
+ /* Ignore entries above memory limit */
+ end = min(entry->size + entry->start, mem_limit);
+ if (entry->start >= end)
+ return;
+ cur_entry.start = entry->start;
+ cur_entry.size = end - entry->start;
+
+ region.start = cur_entry.start;
+ region.size = cur_entry.size;
+
+ /* Give up if slot area array is full. */
+ while (slot_area_index < MAX_SLOT_AREA) {
+ start_orig = region.start;
+
+ /* Potentially raise address to minimum location. */
+ if (region.start < minimum)
+ region.start = minimum;
+
+ /* Potentially raise address to meet alignment needs. */
+ region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
+
+ /* Did we raise the address above the passed in memory entry? */
+ if (region.start > cur_entry.start + cur_entry.size)
+ return;
+
+ /* Reduce size by any delta from the original address. */
+ region.size -= region.start - start_orig;
+
+ /* On 32-bit, reduce region size to fit within max size. */
+ if (IS_ENABLED(CONFIG_X86_32) &&
+ region.start + region.size > KERNEL_IMAGE_SIZE)
+ region.size = KERNEL_IMAGE_SIZE - region.start;
+
+ /* Return if region can't contain decompressed kernel */
+ if (region.size < image_size)
+ return;
+
+ /* If nothing overlaps, store the region and return. */
+ if (!mem_avoid_overlap(&region, &overlap)) {
+ process_gb_huge_pages(&region, image_size);
+ return;
+ }
+
+ /* Store beginning of region if holds at least image_size. */
+ if (overlap.start > region.start + image_size) {
+ struct mem_vector beginning;
+
+ beginning.start = region.start;
+ beginning.size = overlap.start - region.start;
+ process_gb_huge_pages(&beginning, image_size);
+ }
+
+ /* Return if overlap extends to or past end of region. */
+ if (overlap.start + overlap.size >= region.start + region.size)
+ return;
+
+ /* Clip off the overlapping region and start over. */
+ region.size -= overlap.start - region.start + overlap.size;
+ region.start = overlap.start + overlap.size;
+ }
+}
+
+#ifdef CONFIG_EFI
+/*
+ * Returns true if mirror region found (and must have been processed
+ * for slots adding)
+ */
+static bool
+process_efi_entries(unsigned long minimum, unsigned long image_size)
+{
+ struct efi_info *e = &boot_params->efi_info;
+ bool efi_mirror_found = false;
+ struct mem_vector region;
+ efi_memory_desc_t *md;
+ unsigned long pmap;
+ char *signature;
+ u32 nr_desc;
+ int i;
+
+ signature = (char *)&e->efi_loader_signature;
+ if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) &&
+ strncmp(signature, EFI64_LOADER_SIGNATURE, 4))
+ return false;
+
+#ifdef CONFIG_X86_32
+ /* Can't handle data above 4GB at this time */
+ if (e->efi_memmap_hi) {
+ warn("EFI memmap is above 4GB, can't be handled now on x86_32. EFI should be disabled.\n");
+ return false;
+ }
+ pmap = e->efi_memmap;
+#else
+ pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32));
+#endif
+
+ nr_desc = e->efi_memmap_size / e->efi_memdesc_size;
+ for (i = 0; i < nr_desc; i++) {
+ md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
+ if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
+ efi_mirror_found = true;
+ break;
+ }
+ }
+
+ for (i = 0; i < nr_desc; i++) {
+ md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
+
+ /*
+ * Here we are more conservative in picking free memory than
+ * the EFI spec allows:
+ *
+ * According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also
+ * free memory and thus available to place the kernel image into,
+ * but in practice there's firmware where using that memory leads
+ * to crashes.
+ *
+ * Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free.
+ */
+ if (md->type != EFI_CONVENTIONAL_MEMORY)
+ continue;
+
+ if (efi_mirror_found &&
+ !(md->attribute & EFI_MEMORY_MORE_RELIABLE))
+ continue;
+
+ region.start = md->phys_addr;
+ region.size = md->num_pages << EFI_PAGE_SHIFT;
+ process_mem_region(&region, minimum, image_size);
+ if (slot_area_index == MAX_SLOT_AREA) {
+ debug_putstr("Aborted EFI scan (slot_areas full)!\n");
+ break;
+ }
+ }
+ return true;
+}
+#else
+static inline bool
+process_efi_entries(unsigned long minimum, unsigned long image_size)
+{
+ return false;
+}
+#endif
+
+static void process_e820_entries(unsigned long minimum,
+ unsigned long image_size)
+{
+ int i;
+ struct mem_vector region;
+ struct boot_e820_entry *entry;
+
+ /* Verify potential e820 positions, appending to slots list. */
+ for (i = 0; i < boot_params->e820_entries; i++) {
+ entry = &boot_params->e820_table[i];
+ /* Skip non-RAM entries. */
+ if (entry->type != E820_TYPE_RAM)
+ continue;
+ region.start = entry->addr;
+ region.size = entry->size;
+ process_mem_region(&region, minimum, image_size);
+ if (slot_area_index == MAX_SLOT_AREA) {
+ debug_putstr("Aborted e820 scan (slot_areas full)!\n");
+ break;
+ }
+ }
+}
+
+static unsigned long find_random_phys_addr(unsigned long minimum,
+ unsigned long image_size)
+{
+ /* Check if we had too many memmaps. */
+ if (memmap_too_large) {
+ debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n");
+ return 0;
+ }
+
+ /* Make sure minimum is aligned. */
+ minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
+
+ if (process_efi_entries(minimum, image_size))
+ return slots_fetch_random();
+
+ process_e820_entries(minimum, image_size);
+ return slots_fetch_random();
+}
+
+static unsigned long find_random_virt_addr(unsigned long minimum,
+ unsigned long image_size)
+{
+ unsigned long slots, random_addr;
+
+ /* Make sure minimum is aligned. */
+ minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
+ /* Align image_size for easy slot calculations. */
+ image_size = ALIGN(image_size, CONFIG_PHYSICAL_ALIGN);
+
+ /*
+ * There are how many CONFIG_PHYSICAL_ALIGN-sized slots
+ * that can hold image_size within the range of minimum to
+ * KERNEL_IMAGE_SIZE?
+ */
+ slots = (KERNEL_IMAGE_SIZE - minimum - image_size) /
+ CONFIG_PHYSICAL_ALIGN + 1;
+
+ random_addr = kaslr_get_random_long("Virtual") % slots;
+
+ return random_addr * CONFIG_PHYSICAL_ALIGN + minimum;
+}
+
+/*
+ * Since this function examines addresses much more numerically,
+ * it takes the input and output pointers as 'unsigned long'.
+ */
+void choose_random_location(unsigned long input,
+ unsigned long input_size,
+ unsigned long *output,
+ unsigned long output_size,
+ unsigned long *virt_addr)
+{
+ unsigned long random_addr, min_addr;
+
+ if (cmdline_find_option_bool("nokaslr")) {
+ warn("KASLR disabled: 'nokaslr' on cmdline.");
+ return;
+ }
+
+#ifdef CONFIG_X86_5LEVEL
+ if (__read_cr4() & X86_CR4_LA57) {
+ __pgtable_l5_enabled = 1;
+ pgdir_shift = 48;
+ ptrs_per_p4d = 512;
+ }
+#endif
+
+ boot_params->hdr.loadflags |= KASLR_FLAG;
+
+ /* Prepare to add new identity pagetables on demand. */
+ initialize_identity_maps();
+
+ /* Record the various known unsafe memory ranges. */
+ mem_avoid_init(input, input_size, *output);
+
+ /*
+ * Low end of the randomization range should be the
+ * smaller of 512M or the initial kernel image
+ * location:
+ */
+ min_addr = min(*output, 512UL << 20);
+
+ /* Walk available memory entries to find a random address. */
+ random_addr = find_random_phys_addr(min_addr, output_size);
+ if (!random_addr) {
+ warn("Physical KASLR disabled: no suitable memory region!");
+ } else {
+ /* Update the new physical address location. */
+ if (*output != random_addr) {
+ add_identity_map(random_addr, output_size);
+ *output = random_addr;
+ }
+
+ /*
+ * This loads the identity mapping page table.
+ * This should only be done if a new physical address
+ * is found for the kernel, otherwise we should keep
+ * the old page table to make it be like the "nokaslr"
+ * case.
+ */
+ finalize_identity_maps();
+ }
+
+
+ /* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */
+ if (IS_ENABLED(CONFIG_X86_64))
+ random_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size);
+ *virt_addr = random_addr;
+}
diff --git a/arch/x86/boot/compressed/kaslr_64.c b/arch/x86/boot/compressed/kaslr_64.c
new file mode 100644
index 000000000..9557c5a15
--- /dev/null
+++ b/arch/x86/boot/compressed/kaslr_64.c
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This code is used on x86_64 to create page table identity mappings on
+ * demand by building up a new set of page tables (or appending to the
+ * existing ones), and then switching over to them when ready.
+ *
+ * Copyright (C) 2015-2016 Yinghai Lu
+ * Copyright (C) 2016 Kees Cook
+ */
+
+/*
+ * Since we're dealing with identity mappings, physical and virtual
+ * addresses are the same, so override these defines which are ultimately
+ * used by the headers in misc.h.
+ */
+#define __pa(x) ((unsigned long)(x))
+#define __va(x) ((void *)((unsigned long)(x)))
+
+/* No PAGE_TABLE_ISOLATION support needed either: */
+#undef CONFIG_PAGE_TABLE_ISOLATION
+
+#include "misc.h"
+
+/* These actually do the work of building the kernel identity maps. */
+#include <asm/init.h>
+#include <asm/pgtable.h>
+/* Use the static base for this part of the boot process */
+#undef __PAGE_OFFSET
+#define __PAGE_OFFSET __PAGE_OFFSET_BASE
+#include "../../mm/ident_map.c"
+
+/* Used to track our page table allocation area. */
+struct alloc_pgt_data {
+ unsigned char *pgt_buf;
+ unsigned long pgt_buf_size;
+ unsigned long pgt_buf_offset;
+};
+
+/*
+ * Allocates space for a page table entry, using struct alloc_pgt_data
+ * above. Besides the local callers, this is used as the allocation
+ * callback in mapping_info below.
+ */
+static void *alloc_pgt_page(void *context)
+{
+ struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context;
+ unsigned char *entry;
+
+ /* Validate there is space available for a new page. */
+ if (pages->pgt_buf_offset >= pages->pgt_buf_size) {
+ debug_putstr("out of pgt_buf in " __FILE__ "!?\n");
+ debug_putaddr(pages->pgt_buf_offset);
+ debug_putaddr(pages->pgt_buf_size);
+ return NULL;
+ }
+
+ entry = pages->pgt_buf + pages->pgt_buf_offset;
+ pages->pgt_buf_offset += PAGE_SIZE;
+
+ return entry;
+}
+
+/* Used to track our allocated page tables. */
+static struct alloc_pgt_data pgt_data;
+
+/* The top level page table entry pointer. */
+static unsigned long top_level_pgt;
+
+phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
+
+/*
+ * Mapping information structure passed to kernel_ident_mapping_init().
+ * Due to relocation, pointers must be assigned at run time not build time.
+ */
+static struct x86_mapping_info mapping_info;
+
+/* Locates and clears a region for a new top level page table. */
+void initialize_identity_maps(void)
+{
+ /* If running as an SEV guest, the encryption mask is required. */
+ set_sev_encryption_mask();
+
+ /* Exclude the encryption mask from __PHYSICAL_MASK */
+ physical_mask &= ~sme_me_mask;
+
+ /* Init mapping_info with run-time function/buffer pointers. */
+ mapping_info.alloc_pgt_page = alloc_pgt_page;
+ mapping_info.context = &pgt_data;
+ mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;
+ mapping_info.kernpg_flag = _KERNPG_TABLE;
+
+ /*
+ * It should be impossible for this not to already be true,
+ * but since calling this a second time would rewind the other
+ * counters, let's just make sure this is reset too.
+ */
+ pgt_data.pgt_buf_offset = 0;
+
+ /*
+ * If we came here via startup_32(), cr3 will be _pgtable already
+ * and we must append to the existing area instead of entirely
+ * overwriting it.
+ *
+ * With 5-level paging, we use '_pgtable' to allocate the p4d page table,
+ * the top-level page table is allocated separately.
+ *
+ * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
+ * cases. On 4-level paging it's equal to 'top_level_pgt'.
+ */
+ top_level_pgt = read_cr3_pa();
+ if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
+ debug_putstr("booted via startup_32()\n");
+ pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
+ pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
+ memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
+ } else {
+ debug_putstr("booted via startup_64()\n");
+ pgt_data.pgt_buf = _pgtable;
+ pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
+ memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
+ top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
+ }
+}
+
+/*
+ * Adds the specified range to what will become the new identity mappings.
+ * Once all ranges have been added, the new mapping is activated by calling
+ * finalize_identity_maps() below.
+ */
+void add_identity_map(unsigned long start, unsigned long size)
+{
+ unsigned long end = start + size;
+
+ /* Align boundary to 2M. */
+ start = round_down(start, PMD_SIZE);
+ end = round_up(end, PMD_SIZE);
+ if (start >= end)
+ return;
+
+ /* Build the mapping. */
+ kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt,
+ start, end);
+}
+
+/*
+ * This switches the page tables to the new level4 that has been built
+ * via calls to add_identity_map() above. If booted via startup_32(),
+ * this is effectively a no-op.
+ */
+void finalize_identity_maps(void)
+{
+ write_cr3(top_level_pgt);
+}
diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S
new file mode 100644
index 000000000..a480356e0
--- /dev/null
+++ b/arch/x86/boot/compressed/mem_encrypt.S
@@ -0,0 +1,104 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2017 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <asm/asm-offsets.h>
+
+ .text
+ .code32
+ENTRY(get_sev_encryption_bit)
+ xor %eax, %eax
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ push %ebx
+ push %ecx
+ push %edx
+
+ /* Check if running under a hypervisor */
+ movl $1, %eax
+ cpuid
+ bt $31, %ecx /* Check the hypervisor bit */
+ jnc .Lno_sev
+
+ movl $0x80000000, %eax /* CPUID to check the highest leaf */
+ cpuid
+ cmpl $0x8000001f, %eax /* See if 0x8000001f is available */
+ jb .Lno_sev
+
+ /*
+ * Check for the SEV feature:
+ * CPUID Fn8000_001F[EAX] - Bit 1
+ * CPUID Fn8000_001F[EBX] - Bits 5:0
+ * Pagetable bit position used to indicate encryption
+ */
+ movl $0x8000001f, %eax
+ cpuid
+ bt $1, %eax /* Check if SEV is available */
+ jnc .Lno_sev
+
+ movl $MSR_AMD64_SEV, %ecx /* Read the SEV MSR */
+ rdmsr
+ bt $MSR_AMD64_SEV_ENABLED_BIT, %eax /* Check if SEV is active */
+ jnc .Lno_sev
+
+ movl %ebx, %eax
+ andl $0x3f, %eax /* Return the encryption bit location */
+ jmp .Lsev_exit
+
+.Lno_sev:
+ xor %eax, %eax
+
+.Lsev_exit:
+ pop %edx
+ pop %ecx
+ pop %ebx
+
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
+ ret
+ENDPROC(get_sev_encryption_bit)
+
+ .code64
+ENTRY(set_sev_encryption_mask)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ push %rbp
+ push %rdx
+
+ movq %rsp, %rbp /* Save current stack pointer */
+
+ call get_sev_encryption_bit /* Get the encryption bit position */
+ testl %eax, %eax
+ jz .Lno_sev_mask
+
+ bts %rax, sme_me_mask(%rip) /* Create the encryption mask */
+
+.Lno_sev_mask:
+ movq %rbp, %rsp /* Restore original stack pointer */
+
+ pop %rdx
+ pop %rbp
+#endif
+
+ xor %rax, %rax
+ ret
+ENDPROC(set_sev_encryption_mask)
+
+ .data
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ .balign 8
+GLOBAL(sme_me_mask)
+ .quad 0
+#endif
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
new file mode 100644
index 000000000..0387d7a96
--- /dev/null
+++ b/arch/x86/boot/compressed/misc.c
@@ -0,0 +1,429 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * misc.c
+ *
+ * This is a collection of several routines used to extract the kernel
+ * which includes KASLR relocation, decompression, ELF parsing, and
+ * relocation processing. Additionally included are the screen and serial
+ * output functions and related debugging support functions.
+ *
+ * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
+ * puts by Nick Holloway 1993, better puts by Martin Mares 1995
+ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
+ */
+
+#include "misc.h"
+#include "error.h"
+#include "pgtable.h"
+#include "../string.h"
+#include "../voffset.h"
+#include <asm/bootparam_utils.h>
+
+/*
+ * WARNING!!
+ * This code is compiled with -fPIC and it is relocated dynamically at
+ * run time, but no relocation processing is performed. This means that
+ * it is not safe to place pointers in static structures.
+ */
+
+/* Macros used by the included decompressor code below. */
+#define STATIC static
+
+/*
+ * Use normal definitions of mem*() from string.c. There are already
+ * included header files which expect a definition of memset() and by
+ * the time we define memset macro, it is too late.
+ */
+#undef memcpy
+#undef memset
+#define memzero(s, n) memset((s), 0, (n))
+#define memmove memmove
+
+/* Functions used by the included decompressor code below. */
+void *memmove(void *dest, const void *src, size_t n);
+
+/*
+ * This is set up by the setup-routine at boot-time
+ */
+struct boot_params *boot_params;
+
+memptr free_mem_ptr;
+memptr free_mem_end_ptr;
+
+static char *vidmem;
+static int vidport;
+static int lines, cols;
+
+#ifdef CONFIG_KERNEL_GZIP
+#include "../../../../lib/decompress_inflate.c"
+#endif
+
+#ifdef CONFIG_KERNEL_BZIP2
+#include "../../../../lib/decompress_bunzip2.c"
+#endif
+
+#ifdef CONFIG_KERNEL_LZMA
+#include "../../../../lib/decompress_unlzma.c"
+#endif
+
+#ifdef CONFIG_KERNEL_XZ
+#include "../../../../lib/decompress_unxz.c"
+#endif
+
+#ifdef CONFIG_KERNEL_LZO
+#include "../../../../lib/decompress_unlzo.c"
+#endif
+
+#ifdef CONFIG_KERNEL_LZ4
+#include "../../../../lib/decompress_unlz4.c"
+#endif
+/*
+ * NOTE: When adding a new decompressor, please update the analysis in
+ * ../header.S.
+ */
+
+static void scroll(void)
+{
+ int i;
+
+ memmove(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2);
+ for (i = (lines - 1) * cols * 2; i < lines * cols * 2; i += 2)
+ vidmem[i] = ' ';
+}
+
+#define XMTRDY 0x20
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define LSR 5 /* Line Status */
+static void serial_putchar(int ch)
+{
+ unsigned timeout = 0xffff;
+
+ while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
+ cpu_relax();
+
+ outb(ch, early_serial_base + TXR);
+}
+
+void __putstr(const char *s)
+{
+ int x, y, pos;
+ char c;
+
+ if (early_serial_base) {
+ const char *str = s;
+ while (*str) {
+ if (*str == '\n')
+ serial_putchar('\r');
+ serial_putchar(*str++);
+ }
+ }
+
+ if (lines == 0 || cols == 0)
+ return;
+
+ x = boot_params->screen_info.orig_x;
+ y = boot_params->screen_info.orig_y;
+
+ while ((c = *s++) != '\0') {
+ if (c == '\n') {
+ x = 0;
+ if (++y >= lines) {
+ scroll();
+ y--;
+ }
+ } else {
+ vidmem[(x + cols * y) * 2] = c;
+ if (++x >= cols) {
+ x = 0;
+ if (++y >= lines) {
+ scroll();
+ y--;
+ }
+ }
+ }
+ }
+
+ boot_params->screen_info.orig_x = x;
+ boot_params->screen_info.orig_y = y;
+
+ pos = (x + cols * y) * 2; /* Update cursor position */
+ outb(14, vidport);
+ outb(0xff & (pos >> 9), vidport+1);
+ outb(15, vidport);
+ outb(0xff & (pos >> 1), vidport+1);
+}
+
+void __puthex(unsigned long value)
+{
+ char alpha[2] = "0";
+ int bits;
+
+ for (bits = sizeof(value) * 8 - 4; bits >= 0; bits -= 4) {
+ unsigned long digit = (value >> bits) & 0xf;
+
+ if (digit < 0xA)
+ alpha[0] = '0' + digit;
+ else
+ alpha[0] = 'a' + (digit - 0xA);
+
+ __putstr(alpha);
+ }
+}
+
+#if CONFIG_X86_NEED_RELOCS
+static void handle_relocations(void *output, unsigned long output_len,
+ unsigned long virt_addr)
+{
+ int *reloc;
+ unsigned long delta, map, ptr;
+ unsigned long min_addr = (unsigned long)output;
+ unsigned long max_addr = min_addr + (VO___bss_start - VO__text);
+
+ /*
+ * Calculate the delta between where vmlinux was linked to load
+ * and where it was actually loaded.
+ */
+ delta = min_addr - LOAD_PHYSICAL_ADDR;
+
+ /*
+ * The kernel contains a table of relocation addresses. Those
+ * addresses have the final load address of the kernel in virtual
+ * memory. We are currently working in the self map. So we need to
+ * create an adjustment for kernel memory addresses to the self map.
+ * This will involve subtracting out the base address of the kernel.
+ */
+ map = delta - __START_KERNEL_map;
+
+ /*
+ * 32-bit always performs relocations. 64-bit relocations are only
+ * needed if KASLR has chosen a different starting address offset
+ * from __START_KERNEL_map.
+ */
+ if (IS_ENABLED(CONFIG_X86_64))
+ delta = virt_addr - LOAD_PHYSICAL_ADDR;
+
+ if (!delta) {
+ debug_putstr("No relocation needed... ");
+ return;
+ }
+ debug_putstr("Performing relocations... ");
+
+ /*
+ * Process relocations: 32 bit relocations first then 64 bit after.
+ * Three sets of binary relocations are added to the end of the kernel
+ * before compression. Each relocation table entry is the kernel
+ * address of the location which needs to be updated stored as a
+ * 32-bit value which is sign extended to 64 bits.
+ *
+ * Format is:
+ *
+ * kernel bits...
+ * 0 - zero terminator for 64 bit relocations
+ * 64 bit relocation repeated
+ * 0 - zero terminator for inverse 32 bit relocations
+ * 32 bit inverse relocation repeated
+ * 0 - zero terminator for 32 bit relocations
+ * 32 bit relocation repeated
+ *
+ * So we work backwards from the end of the decompressed image.
+ */
+ for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) {
+ long extended = *reloc;
+ extended += map;
+
+ ptr = (unsigned long)extended;
+ if (ptr < min_addr || ptr > max_addr)
+ error("32-bit relocation outside of kernel!\n");
+
+ *(uint32_t *)ptr += delta;
+ }
+#ifdef CONFIG_X86_64
+ while (*--reloc) {
+ long extended = *reloc;
+ extended += map;
+
+ ptr = (unsigned long)extended;
+ if (ptr < min_addr || ptr > max_addr)
+ error("inverse 32-bit relocation outside of kernel!\n");
+
+ *(int32_t *)ptr -= delta;
+ }
+ for (reloc--; *reloc; reloc--) {
+ long extended = *reloc;
+ extended += map;
+
+ ptr = (unsigned long)extended;
+ if (ptr < min_addr || ptr > max_addr)
+ error("64-bit relocation outside of kernel!\n");
+
+ *(uint64_t *)ptr += delta;
+ }
+#endif
+}
+#else
+static inline void handle_relocations(void *output, unsigned long output_len,
+ unsigned long virt_addr)
+{ }
+#endif
+
+static void parse_elf(void *output)
+{
+#ifdef CONFIG_X86_64
+ Elf64_Ehdr ehdr;
+ Elf64_Phdr *phdrs, *phdr;
+#else
+ Elf32_Ehdr ehdr;
+ Elf32_Phdr *phdrs, *phdr;
+#endif
+ void *dest;
+ int i;
+
+ memcpy(&ehdr, output, sizeof(ehdr));
+ if (ehdr.e_ident[EI_MAG0] != ELFMAG0 ||
+ ehdr.e_ident[EI_MAG1] != ELFMAG1 ||
+ ehdr.e_ident[EI_MAG2] != ELFMAG2 ||
+ ehdr.e_ident[EI_MAG3] != ELFMAG3) {
+ error("Kernel is not a valid ELF file");
+ return;
+ }
+
+ debug_putstr("Parsing ELF... ");
+
+ phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum);
+ if (!phdrs)
+ error("Failed to allocate space for phdrs");
+
+ memcpy(phdrs, output + ehdr.e_phoff, sizeof(*phdrs) * ehdr.e_phnum);
+
+ for (i = 0; i < ehdr.e_phnum; i++) {
+ phdr = &phdrs[i];
+
+ switch (phdr->p_type) {
+ case PT_LOAD:
+#ifdef CONFIG_X86_64
+ if ((phdr->p_align % 0x200000) != 0)
+ error("Alignment of LOAD segment isn't multiple of 2MB");
+#endif
+#ifdef CONFIG_RELOCATABLE
+ dest = output;
+ dest += (phdr->p_paddr - LOAD_PHYSICAL_ADDR);
+#else
+ dest = (void *)(phdr->p_paddr);
+#endif
+ memmove(dest, output + phdr->p_offset, phdr->p_filesz);
+ break;
+ default: /* Ignore other PT_* */ break;
+ }
+ }
+
+ free(phdrs);
+}
+
+/*
+ * The compressed kernel image (ZO), has been moved so that its position
+ * is against the end of the buffer used to hold the uncompressed kernel
+ * image (VO) and the execution environment (.bss, .brk), which makes sure
+ * there is room to do the in-place decompression. (See header.S for the
+ * calculations.)
+ *
+ * |-----compressed kernel image------|
+ * V V
+ * 0 extract_offset +INIT_SIZE
+ * |-----------|---------------|-------------------------|--------|
+ * | | | |
+ * VO__text startup_32 of ZO VO__end ZO__end
+ * ^ ^
+ * |-------uncompressed kernel image---------|
+ *
+ */
+asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
+ unsigned char *input_data,
+ unsigned long input_len,
+ unsigned char *output,
+ unsigned long output_len)
+{
+ const unsigned long kernel_total_size = VO__end - VO__text;
+ unsigned long virt_addr = LOAD_PHYSICAL_ADDR;
+
+ /* Retain x86 boot parameters pointer passed from startup_32/64. */
+ boot_params = rmode;
+
+ /* Clear flags intended for solely in-kernel use. */
+ boot_params->hdr.loadflags &= ~KASLR_FLAG;
+
+ sanitize_boot_params(boot_params);
+
+ if (boot_params->screen_info.orig_video_mode == 7) {
+ vidmem = (char *) 0xb0000;
+ vidport = 0x3b4;
+ } else {
+ vidmem = (char *) 0xb8000;
+ vidport = 0x3d4;
+ }
+
+ lines = boot_params->screen_info.orig_video_lines;
+ cols = boot_params->screen_info.orig_video_cols;
+
+ console_init();
+ debug_putstr("early console in extract_kernel\n");
+
+ free_mem_ptr = heap; /* Heap */
+ free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
+
+ /* Report initial kernel position details. */
+ debug_putaddr(input_data);
+ debug_putaddr(input_len);
+ debug_putaddr(output);
+ debug_putaddr(output_len);
+ debug_putaddr(kernel_total_size);
+
+#ifdef CONFIG_X86_64
+ /* Report address of 32-bit trampoline */
+ debug_putaddr(trampoline_32bit);
+#endif
+
+ /*
+ * The memory hole needed for the kernel is the larger of either
+ * the entire decompressed kernel plus relocation table, or the
+ * entire decompressed kernel plus .bss and .brk sections.
+ */
+ choose_random_location((unsigned long)input_data, input_len,
+ (unsigned long *)&output,
+ max(output_len, kernel_total_size),
+ &virt_addr);
+
+ /* Validate memory location choices. */
+ if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
+ error("Destination physical address inappropriately aligned");
+ if (virt_addr & (MIN_KERNEL_ALIGN - 1))
+ error("Destination virtual address inappropriately aligned");
+#ifdef CONFIG_X86_64
+ if (heap > 0x3fffffffffffUL)
+ error("Destination address too large");
+ if (virt_addr + max(output_len, kernel_total_size) > KERNEL_IMAGE_SIZE)
+ error("Destination virtual address is beyond the kernel mapping area");
+#else
+ if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff))
+ error("Destination address too large");
+#endif
+#ifndef CONFIG_RELOCATABLE
+ if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
+ error("Destination address does not match LOAD_PHYSICAL_ADDR");
+ if (virt_addr != LOAD_PHYSICAL_ADDR)
+ error("Destination virtual address changed when not relocatable");
+#endif
+
+ debug_putstr("\nDecompressing Linux... ");
+ __decompress(input_data, input_len, NULL, NULL, output, output_len,
+ NULL, error);
+ parse_elf(output);
+ handle_relocations(output, output_len, virt_addr);
+ debug_putstr("done.\nBooting the kernel.\n");
+ return output;
+}
+
+void fortify_panic(const char *name)
+{
+ error("detected buffer overflow");
+}
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
new file mode 100644
index 000000000..47fd18db6
--- /dev/null
+++ b/arch/x86/boot/compressed/misc.h
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_COMPRESSED_MISC_H
+#define BOOT_COMPRESSED_MISC_H
+
+/*
+ * Special hack: we have to be careful, because no indirections are allowed here,
+ * and paravirt_ops is a kind of one. As it will only run in baremetal anyway,
+ * we just keep it from happening. (This list needs to be extended when new
+ * paravirt and debugging variants are added.)
+ */
+#undef CONFIG_PARAVIRT
+#undef CONFIG_PARAVIRT_SPINLOCKS
+#undef CONFIG_KASAN
+
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
+
+#include <linux/linkage.h>
+#include <linux/screen_info.h>
+#include <linux/elf.h>
+#include <linux/io.h>
+#include <asm/page.h>
+#include <asm/boot.h>
+#include <asm/bootparam.h>
+
+#define BOOT_BOOT_H
+#include "../ctype.h"
+
+#ifdef CONFIG_X86_64
+#define memptr long
+#else
+#define memptr unsigned
+#endif
+
+/* misc.c */
+extern memptr free_mem_ptr;
+extern memptr free_mem_end_ptr;
+extern struct boot_params *boot_params;
+void __putstr(const char *s);
+void __puthex(unsigned long value);
+#define error_putstr(__x) __putstr(__x)
+#define error_puthex(__x) __puthex(__x)
+
+#ifdef CONFIG_X86_VERBOSE_BOOTUP
+
+#define debug_putstr(__x) __putstr(__x)
+#define debug_puthex(__x) __puthex(__x)
+#define debug_putaddr(__x) { \
+ debug_putstr(#__x ": 0x"); \
+ debug_puthex((unsigned long)(__x)); \
+ debug_putstr("\n"); \
+ }
+
+#else
+
+static inline void debug_putstr(const char *s)
+{ }
+static inline void debug_puthex(const char *s)
+{ }
+#define debug_putaddr(x) /* */
+
+#endif
+
+#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE
+/* cmdline.c */
+int cmdline_find_option(const char *option, char *buffer, int bufsize);
+int cmdline_find_option_bool(const char *option);
+#endif
+
+
+#if CONFIG_RANDOMIZE_BASE
+/* kaslr.c */
+void choose_random_location(unsigned long input,
+ unsigned long input_size,
+ unsigned long *output,
+ unsigned long output_size,
+ unsigned long *virt_addr);
+/* cpuflags.c */
+bool has_cpuflag(int flag);
+#else
+static inline void choose_random_location(unsigned long input,
+ unsigned long input_size,
+ unsigned long *output,
+ unsigned long output_size,
+ unsigned long *virt_addr)
+{
+}
+#endif
+
+#ifdef CONFIG_X86_64
+void initialize_identity_maps(void);
+void add_identity_map(unsigned long start, unsigned long size);
+void finalize_identity_maps(void);
+extern unsigned char _pgtable[];
+#else
+static inline void initialize_identity_maps(void)
+{ }
+static inline void add_identity_map(unsigned long start, unsigned long size)
+{ }
+static inline void finalize_identity_maps(void)
+{ }
+#endif
+
+#ifdef CONFIG_EARLY_PRINTK
+/* early_serial_console.c */
+extern int early_serial_base;
+void console_init(void);
+#else
+static const int early_serial_base;
+static inline void console_init(void)
+{ }
+#endif
+
+void set_sev_encryption_mask(void);
+
+#endif
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
new file mode 100644
index 000000000..72bad2c8d
--- /dev/null
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -0,0 +1,82 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright (C) 2009 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * H. Peter Anvin <hpa@linux.intel.com>
+ *
+ * -----------------------------------------------------------------------
+ *
+ * Outputs a small assembly wrapper with the appropriate symbols defined.
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <tools/le_byteshift.h>
+
+int main(int argc, char *argv[])
+{
+ uint32_t olen;
+ long ilen;
+ FILE *f = NULL;
+ int retval = 1;
+
+ if (argc < 2) {
+ fprintf(stderr, "Usage: %s compressed_file\n", argv[0]);
+ goto bail;
+ }
+
+ /* Get the information for the compressed kernel image first */
+
+ f = fopen(argv[1], "r");
+ if (!f) {
+ perror(argv[1]);
+ goto bail;
+ }
+
+
+ if (fseek(f, -4L, SEEK_END)) {
+ perror(argv[1]);
+ }
+
+ if (fread(&olen, sizeof(olen), 1, f) != 1) {
+ perror(argv[1]);
+ goto bail;
+ }
+
+ ilen = ftell(f);
+ olen = get_unaligned_le32(&olen);
+
+ printf(".section \".rodata..compressed\",\"a\",@progbits\n");
+ printf(".globl z_input_len\n");
+ printf("z_input_len = %lu\n", ilen);
+ printf(".globl z_output_len\n");
+ printf("z_output_len = %lu\n", (unsigned long)olen);
+
+ printf(".globl input_data, input_data_end\n");
+ printf("input_data:\n");
+ printf(".incbin \"%s\"\n", argv[1]);
+ printf("input_data_end:\n");
+
+ retval = 0;
+bail:
+ if (f)
+ fclose(f);
+ return retval;
+}
diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h
new file mode 100644
index 000000000..6ff7e81b5
--- /dev/null
+++ b/arch/x86/boot/compressed/pgtable.h
@@ -0,0 +1,20 @@
+#ifndef BOOT_COMPRESSED_PAGETABLE_H
+#define BOOT_COMPRESSED_PAGETABLE_H
+
+#define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE)
+
+#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0
+
+#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE
+#define TRAMPOLINE_32BIT_CODE_SIZE 0x70
+
+#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE
+
+#ifndef __ASSEMBLER__
+
+extern unsigned long *trampoline_32bit;
+
+extern void trampoline_32bit_src(void *return_ptr);
+
+#endif /* __ASSEMBLER__ */
+#endif /* BOOT_COMPRESSED_PAGETABLE_H */
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
new file mode 100644
index 000000000..76e1edf5b
--- /dev/null
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -0,0 +1,211 @@
+#include <linux/efi.h>
+#include <asm/e820/types.h>
+#include <asm/processor.h>
+#include <asm/efi.h>
+#include "pgtable.h"
+#include "../string.h"
+
+/*
+ * __force_order is used by special_insns.h asm code to force instruction
+ * serialization.
+ *
+ * It is not referenced from the code, but GCC < 5 with -fPIE would fail
+ * due to an undefined symbol. Define it to make these ancient GCCs work.
+ */
+unsigned long __force_order;
+
+#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */
+#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */
+
+struct paging_config {
+ unsigned long trampoline_start;
+ unsigned long l5_required;
+};
+
+/* Buffer to preserve trampoline memory */
+static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
+
+/*
+ * Trampoline address will be printed by extract_kernel() for debugging
+ * purposes.
+ *
+ * Avoid putting the pointer into .bss as it will be cleared between
+ * paging_prepare() and extract_kernel().
+ */
+unsigned long *trampoline_32bit __section(.data);
+
+extern struct boot_params *boot_params;
+int cmdline_find_option_bool(const char *option);
+
+static unsigned long find_trampoline_placement(void)
+{
+ unsigned long bios_start = 0, ebda_start = 0;
+ unsigned long trampoline_start;
+ struct boot_e820_entry *entry;
+ char *signature;
+ int i;
+
+ /*
+ * Find a suitable spot for the trampoline.
+ * This code is based on reserve_bios_regions().
+ */
+
+ /*
+ * EFI systems may not provide legacy ROM. The memory may not be mapped
+ * at all.
+ *
+ * Only look for values in the legacy ROM for non-EFI system.
+ */
+ signature = (char *)&boot_params->efi_info.efi_loader_signature;
+ if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) &&
+ strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) {
+ ebda_start = *(unsigned short *)0x40e << 4;
+ bios_start = *(unsigned short *)0x413 << 10;
+ }
+
+ if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
+ bios_start = BIOS_START_MAX;
+
+ if (ebda_start > BIOS_START_MIN && ebda_start < bios_start)
+ bios_start = ebda_start;
+
+ bios_start = round_down(bios_start, PAGE_SIZE);
+
+ /* Find the first usable memory region under bios_start. */
+ for (i = boot_params->e820_entries - 1; i >= 0; i--) {
+ unsigned long new = bios_start;
+
+ entry = &boot_params->e820_table[i];
+
+ /* Skip all entries above bios_start. */
+ if (bios_start <= entry->addr)
+ continue;
+
+ /* Skip non-RAM entries. */
+ if (entry->type != E820_TYPE_RAM)
+ continue;
+
+ /* Adjust bios_start to the end of the entry if needed. */
+ if (bios_start > entry->addr + entry->size)
+ new = entry->addr + entry->size;
+
+ /* Keep bios_start page-aligned. */
+ new = round_down(new, PAGE_SIZE);
+
+ /* Skip the entry if it's too small. */
+ if (new - TRAMPOLINE_32BIT_SIZE < entry->addr)
+ continue;
+
+ /* Protect against underflow. */
+ if (new - TRAMPOLINE_32BIT_SIZE > bios_start)
+ break;
+
+ bios_start = new;
+ break;
+ }
+
+ /* Place the trampoline just below the end of low memory */
+ return bios_start - TRAMPOLINE_32BIT_SIZE;
+}
+
+struct paging_config paging_prepare(void *rmode)
+{
+ struct paging_config paging_config = {};
+
+ /* Initialize boot_params. Required for cmdline_find_option_bool(). */
+ boot_params = rmode;
+
+ /*
+ * Check if LA57 is desired and supported.
+ *
+ * There are several parts to the check:
+ * - if the kernel supports 5-level paging: CONFIG_X86_5LEVEL=y
+ * - if user asked to disable 5-level paging: no5lvl in cmdline
+ * - if the machine supports 5-level paging:
+ * + CPUID leaf 7 is supported
+ * + the leaf has the feature bit set
+ *
+ * That's substitute for boot_cpu_has() in early boot code.
+ */
+ if (IS_ENABLED(CONFIG_X86_5LEVEL) &&
+ !cmdline_find_option_bool("no5lvl") &&
+ native_cpuid_eax(0) >= 7 &&
+ (native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) {
+ paging_config.l5_required = 1;
+ }
+
+ paging_config.trampoline_start = find_trampoline_placement();
+
+ trampoline_32bit = (unsigned long *)paging_config.trampoline_start;
+
+ /* Preserve trampoline memory */
+ memcpy(trampoline_save, trampoline_32bit, TRAMPOLINE_32BIT_SIZE);
+
+ /* Clear trampoline memory first */
+ memset(trampoline_32bit, 0, TRAMPOLINE_32BIT_SIZE);
+
+ /* Copy trampoline code in place */
+ memcpy(trampoline_32bit + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long),
+ &trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE);
+
+ /*
+ * The code below prepares page table in trampoline memory.
+ *
+ * The new page table will be used by trampoline code for switching
+ * from 4- to 5-level paging or vice versa.
+ *
+ * If switching is not required, the page table is unused: trampoline
+ * code wouldn't touch CR3.
+ */
+
+ /*
+ * We are not going to use the page table in trampoline memory if we
+ * are already in the desired paging mode.
+ */
+ if (paging_config.l5_required == !!(native_read_cr4() & X86_CR4_LA57))
+ goto out;
+
+ if (paging_config.l5_required) {
+ /*
+ * For 4- to 5-level paging transition, set up current CR3 as
+ * the first and the only entry in a new top-level page table.
+ */
+ trampoline_32bit[TRAMPOLINE_32BIT_PGTABLE_OFFSET] = __native_read_cr3() | _PAGE_TABLE_NOENC;
+ } else {
+ unsigned long src;
+
+ /*
+ * For 5- to 4-level paging transition, copy page table pointed
+ * by first entry in the current top-level page table as our
+ * new top-level page table.
+ *
+ * We cannot just point to the page table from trampoline as it
+ * may be above 4G.
+ */
+ src = *(unsigned long *)__native_read_cr3() & PAGE_MASK;
+ memcpy(trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long),
+ (void *)src, PAGE_SIZE);
+ }
+
+out:
+ return paging_config;
+}
+
+void cleanup_trampoline(void *pgtable)
+{
+ void *trampoline_pgtable;
+
+ trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long);
+
+ /*
+ * Move the top level page table out of trampoline memory,
+ * if it's there.
+ */
+ if ((void *)__native_read_cr3() == trampoline_pgtable) {
+ memcpy(pgtable, trampoline_pgtable, PAGE_SIZE);
+ native_write_cr3((unsigned long)pgtable);
+ }
+
+ /* Restore trampoline memory */
+ memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE);
+}
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
new file mode 100644
index 000000000..19dbbcdd1
--- /dev/null
+++ b/arch/x86/boot/compressed/string.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This provides an optimized implementation of memcpy, and a simplified
+ * implementation of memset and memmove. These are used here because the
+ * standard kernel runtime versions are not yet available and we don't
+ * trust the gcc built-in implementations as they may do unexpected things
+ * (e.g. FPU ops) in the minimal decompression stub execution environment.
+ */
+#include "error.h"
+
+#include "../string.c"
+
+#ifdef CONFIG_X86_32
+static void *__memcpy(void *dest, const void *src, size_t n)
+{
+ int d0, d1, d2;
+ asm volatile(
+ "rep ; movsl\n\t"
+ "movl %4,%%ecx\n\t"
+ "rep ; movsb\n\t"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src)
+ : "memory");
+
+ return dest;
+}
+#else
+static void *__memcpy(void *dest, const void *src, size_t n)
+{
+ long d0, d1, d2;
+ asm volatile(
+ "rep ; movsq\n\t"
+ "movq %4,%%rcx\n\t"
+ "rep ; movsb\n\t"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src)
+ : "memory");
+
+ return dest;
+}
+#endif
+
+void *memset(void *s, int c, size_t n)
+{
+ int i;
+ char *ss = s;
+
+ for (i = 0; i < n; i++)
+ ss[i] = c;
+ return s;
+}
+
+void *memmove(void *dest, const void *src, size_t n)
+{
+ unsigned char *d = dest;
+ const unsigned char *s = src;
+
+ if (d <= s || d - s >= n)
+ return __memcpy(dest, src, n);
+
+ while (n-- > 0)
+ d[n] = s[n];
+
+ return dest;
+}
+
+/* Detect and warn about potential overlaps, but handle them with memmove. */
+void *memcpy(void *dest, const void *src, size_t n)
+{
+ if (dest > src && dest - src < n) {
+ warn("Avoiding potentially unsafe overlapping memcpy()!");
+ return memmove(dest, src, n);
+ }
+ return __memcpy(dest, src, n);
+}
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
new file mode 100644
index 000000000..f491bbde8
--- /dev/null
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <asm-generic/vmlinux.lds.h>
+
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+
+#undef i386
+
+#include <asm/cache.h>
+#include <asm/page_types.h>
+
+#ifdef CONFIG_X86_64
+OUTPUT_ARCH(i386:x86-64)
+ENTRY(startup_64)
+#else
+OUTPUT_ARCH(i386)
+ENTRY(startup_32)
+#endif
+
+SECTIONS
+{
+ /* Be careful parts of head_64.S assume startup_32 is at
+ * address 0.
+ */
+ . = 0;
+ .head.text : {
+ _head = . ;
+ HEAD_TEXT
+ _ehead = . ;
+ }
+ .rodata..compressed : {
+ *(.rodata..compressed)
+ }
+ .text : {
+ _text = .; /* Text */
+ *(.text)
+ *(.text.*)
+ _etext = . ;
+ }
+ .rodata : {
+ _rodata = . ;
+ *(.rodata) /* read-only data */
+ *(.rodata.*)
+ _erodata = . ;
+ }
+ .got : {
+ _got = .;
+ KEEP(*(.got.plt))
+ KEEP(*(.got))
+ _egot = .;
+ }
+ .data : {
+ _data = . ;
+ *(.data)
+ *(.data.*)
+ _edata = . ;
+ }
+ . = ALIGN(L1_CACHE_BYTES);
+ .bss : {
+ _bss = . ;
+ *(.bss)
+ *(.bss.*)
+ *(COMMON)
+ . = ALIGN(8); /* For convenience during zeroing */
+ _ebss = .;
+ }
+#ifdef CONFIG_X86_64
+ . = ALIGN(PAGE_SIZE);
+ .pgtable : {
+ _pgtable = . ;
+ *(.pgtable)
+ _epgtable = . ;
+ }
+#endif
+ . = ALIGN(PAGE_SIZE); /* keep ZO size page aligned */
+ _end = .;
+}
diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S
new file mode 100644
index 000000000..15d9f74b0
--- /dev/null
+++ b/arch/x86/boot/copy.S
@@ -0,0 +1,67 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+#include <linux/linkage.h>
+
+/*
+ * Memory copy routines
+ */
+
+ .code16
+ .text
+
+GLOBAL(memcpy)
+ pushw %si
+ pushw %di
+ movw %ax, %di
+ movw %dx, %si
+ pushw %cx
+ shrw $2, %cx
+ rep; movsl
+ popw %cx
+ andw $3, %cx
+ rep; movsb
+ popw %di
+ popw %si
+ retl
+ENDPROC(memcpy)
+
+GLOBAL(memset)
+ pushw %di
+ movw %ax, %di
+ movzbl %dl, %eax
+ imull $0x01010101,%eax
+ pushw %cx
+ shrw $2, %cx
+ rep; stosl
+ popw %cx
+ andw $3, %cx
+ rep; stosb
+ popw %di
+ retl
+ENDPROC(memset)
+
+GLOBAL(copy_from_fs)
+ pushw %ds
+ pushw %fs
+ popw %ds
+ calll memcpy
+ popw %ds
+ retl
+ENDPROC(copy_from_fs)
+
+GLOBAL(copy_to_fs)
+ pushw %es
+ pushw %fs
+ popw %es
+ calll memcpy
+ popw %es
+ retl
+ENDPROC(copy_to_fs)
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
new file mode 100644
index 000000000..26240dde0
--- /dev/null
+++ b/arch/x86/boot/cpu.c
@@ -0,0 +1,101 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007-2008 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/x86/boot/cpu.c
+ *
+ * Check for obligatory CPU features and abort if the features are not
+ * present.
+ */
+
+#include "boot.h"
+#ifdef CONFIG_X86_FEATURE_NAMES
+#include "cpustr.h"
+#endif
+
+static char *cpu_name(int level)
+{
+ static char buf[6];
+
+ if (level == 64) {
+ return "x86-64";
+ } else {
+ if (level == 15)
+ level = 6;
+ sprintf(buf, "i%d86", level);
+ return buf;
+ }
+}
+
+static void show_cap_strs(u32 *err_flags)
+{
+ int i, j;
+#ifdef CONFIG_X86_FEATURE_NAMES
+ const unsigned char *msg_strs = (const unsigned char *)x86_cap_strs;
+ for (i = 0; i < NCAPINTS; i++) {
+ u32 e = err_flags[i];
+ for (j = 0; j < 32; j++) {
+ if (msg_strs[0] < i ||
+ (msg_strs[0] == i && msg_strs[1] < j)) {
+ /* Skip to the next string */
+ msg_strs += 2;
+ while (*msg_strs++)
+ ;
+ }
+ if (e & 1) {
+ if (msg_strs[0] == i &&
+ msg_strs[1] == j &&
+ msg_strs[2])
+ printf("%s ", msg_strs+2);
+ else
+ printf("%d:%d ", i, j);
+ }
+ e >>= 1;
+ }
+ }
+#else
+ for (i = 0; i < NCAPINTS; i++) {
+ u32 e = err_flags[i];
+ for (j = 0; j < 32; j++) {
+ if (e & 1)
+ printf("%d:%d ", i, j);
+ e >>= 1;
+ }
+ }
+#endif
+}
+
+int validate_cpu(void)
+{
+ u32 *err_flags;
+ int cpu_level, req_level;
+
+ check_cpu(&cpu_level, &req_level, &err_flags);
+
+ if (cpu_level < req_level) {
+ printf("This kernel requires an %s CPU, ",
+ cpu_name(req_level));
+ printf("but only detected an %s CPU.\n",
+ cpu_name(cpu_level));
+ return -1;
+ }
+
+ if (err_flags) {
+ puts("This kernel requires the following features "
+ "not present on the CPU:\n");
+ show_cap_strs(err_flags);
+ putchar('\n');
+ return -1;
+ } else if (check_knl_erratum()) {
+ return -1;
+ } else {
+ return 0;
+ }
+}
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
new file mode 100644
index 000000000..8f0c4c9fc
--- /dev/null
+++ b/arch/x86/boot/cpucheck.c
@@ -0,0 +1,229 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Check for obligatory CPU features and abort if the features are not
+ * present. This code should be compilable as 16-, 32- or 64-bit
+ * code, so be very careful with types and inline assembly.
+ *
+ * This code should not contain any messages; that requires an
+ * additional wrapper.
+ *
+ * As written, this code is not safe for inclusion into the kernel
+ * proper (after FPU initialization, in particular).
+ */
+
+#ifdef _SETUP
+# include "boot.h"
+#endif
+#include <linux/types.h>
+#include <asm/intel-family.h>
+#include <asm/processor-flags.h>
+#include <asm/required-features.h>
+#include <asm/msr-index.h>
+#include "string.h"
+
+static u32 err_flags[NCAPINTS];
+
+static const int req_level = CONFIG_X86_MINIMUM_CPU_FAMILY;
+
+static const u32 req_flags[NCAPINTS] =
+{
+ REQUIRED_MASK0,
+ REQUIRED_MASK1,
+ 0, /* REQUIRED_MASK2 not implemented in this file */
+ 0, /* REQUIRED_MASK3 not implemented in this file */
+ REQUIRED_MASK4,
+ 0, /* REQUIRED_MASK5 not implemented in this file */
+ REQUIRED_MASK6,
+ 0, /* REQUIRED_MASK7 not implemented in this file */
+ 0, /* REQUIRED_MASK8 not implemented in this file */
+ 0, /* REQUIRED_MASK9 not implemented in this file */
+ 0, /* REQUIRED_MASK10 not implemented in this file */
+ 0, /* REQUIRED_MASK11 not implemented in this file */
+ 0, /* REQUIRED_MASK12 not implemented in this file */
+ 0, /* REQUIRED_MASK13 not implemented in this file */
+ 0, /* REQUIRED_MASK14 not implemented in this file */
+ 0, /* REQUIRED_MASK15 not implemented in this file */
+ REQUIRED_MASK16,
+};
+
+#define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a))
+
+static int is_amd(void)
+{
+ return cpu_vendor[0] == A32('A', 'u', 't', 'h') &&
+ cpu_vendor[1] == A32('e', 'n', 't', 'i') &&
+ cpu_vendor[2] == A32('c', 'A', 'M', 'D');
+}
+
+static int is_centaur(void)
+{
+ return cpu_vendor[0] == A32('C', 'e', 'n', 't') &&
+ cpu_vendor[1] == A32('a', 'u', 'r', 'H') &&
+ cpu_vendor[2] == A32('a', 'u', 'l', 's');
+}
+
+static int is_transmeta(void)
+{
+ return cpu_vendor[0] == A32('G', 'e', 'n', 'u') &&
+ cpu_vendor[1] == A32('i', 'n', 'e', 'T') &&
+ cpu_vendor[2] == A32('M', 'x', '8', '6');
+}
+
+static int is_intel(void)
+{
+ return cpu_vendor[0] == A32('G', 'e', 'n', 'u') &&
+ cpu_vendor[1] == A32('i', 'n', 'e', 'I') &&
+ cpu_vendor[2] == A32('n', 't', 'e', 'l');
+}
+
+/* Returns a bitmask of which words we have error bits in */
+static int check_cpuflags(void)
+{
+ u32 err;
+ int i;
+
+ err = 0;
+ for (i = 0; i < NCAPINTS; i++) {
+ err_flags[i] = req_flags[i] & ~cpu.flags[i];
+ if (err_flags[i])
+ err |= 1 << i;
+ }
+
+ return err;
+}
+
+/*
+ * Returns -1 on error.
+ *
+ * *cpu_level is set to the current CPU level; *req_level to the required
+ * level. x86-64 is considered level 64 for this purpose.
+ *
+ * *err_flags_ptr is set to the flags error array if there are flags missing.
+ */
+int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
+{
+ int err;
+
+ memset(&cpu.flags, 0, sizeof cpu.flags);
+ cpu.level = 3;
+
+ if (has_eflag(X86_EFLAGS_AC))
+ cpu.level = 4;
+
+ get_cpuflags();
+ err = check_cpuflags();
+
+ if (test_bit(X86_FEATURE_LM, cpu.flags))
+ cpu.level = 64;
+
+ if (err == 0x01 &&
+ !(err_flags[0] &
+ ~((1 << X86_FEATURE_XMM)|(1 << X86_FEATURE_XMM2))) &&
+ is_amd()) {
+ /* If this is an AMD and we're only missing SSE+SSE2, try to
+ turn them on */
+
+ u32 ecx = MSR_K7_HWCR;
+ u32 eax, edx;
+
+ asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
+ eax &= ~(1 << 15);
+ asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
+
+ get_cpuflags(); /* Make sure it really did something */
+ err = check_cpuflags();
+ } else if (err == 0x01 &&
+ !(err_flags[0] & ~(1 << X86_FEATURE_CX8)) &&
+ is_centaur() && cpu.model >= 6) {
+ /* If this is a VIA C3, we might have to enable CX8
+ explicitly */
+
+ u32 ecx = MSR_VIA_FCR;
+ u32 eax, edx;
+
+ asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
+ eax |= (1<<1)|(1<<7);
+ asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
+
+ set_bit(X86_FEATURE_CX8, cpu.flags);
+ err = check_cpuflags();
+ } else if (err == 0x01 && is_transmeta()) {
+ /* Transmeta might have masked feature bits in word 0 */
+
+ u32 ecx = 0x80860004;
+ u32 eax, edx;
+ u32 level = 1;
+
+ asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
+ asm("wrmsr" : : "a" (~0), "d" (edx), "c" (ecx));
+ asm("cpuid"
+ : "+a" (level), "=d" (cpu.flags[0])
+ : : "ecx", "ebx");
+ asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
+
+ err = check_cpuflags();
+ } else if (err == 0x01 &&
+ !(err_flags[0] & ~(1 << X86_FEATURE_PAE)) &&
+ is_intel() && cpu.level == 6 &&
+ (cpu.model == 9 || cpu.model == 13)) {
+ /* PAE is disabled on this Pentium M but can be forced */
+ if (cmdline_find_option_bool("forcepae")) {
+ puts("WARNING: Forcing PAE in CPU flags\n");
+ set_bit(X86_FEATURE_PAE, cpu.flags);
+ err = check_cpuflags();
+ }
+ else {
+ puts("WARNING: PAE disabled. Use parameter 'forcepae' to enable at your own risk!\n");
+ }
+ }
+ if (!err)
+ err = check_knl_erratum();
+
+ if (err_flags_ptr)
+ *err_flags_ptr = err ? err_flags : NULL;
+ if (cpu_level_ptr)
+ *cpu_level_ptr = cpu.level;
+ if (req_level_ptr)
+ *req_level_ptr = req_level;
+
+ return (cpu.level < req_level || err) ? -1 : 0;
+}
+
+int check_knl_erratum(void)
+{
+ /*
+ * First check for the affected model/family:
+ */
+ if (!is_intel() ||
+ cpu.family != 6 ||
+ cpu.model != INTEL_FAM6_XEON_PHI_KNL)
+ return 0;
+
+ /*
+ * This erratum affects the Accessed/Dirty bits, and can
+ * cause stray bits to be set in !Present PTEs. We have
+ * enough bits in our 64-bit PTEs (which we have on real
+ * 64-bit mode or PAE) to avoid using these troublesome
+ * bits. But, we do not have enough space in our 32-bit
+ * PTEs. So, refuse to run on 32-bit non-PAE kernels.
+ */
+ if (IS_ENABLED(CONFIG_X86_64) || IS_ENABLED(CONFIG_X86_PAE))
+ return 0;
+
+ puts("This 32-bit kernel can not run on this Xeon Phi x200\n"
+ "processor due to a processor erratum. Use a 64-bit\n"
+ "kernel, or enable PAE in this 32-bit kernel.\n\n");
+
+ return -1;
+}
+
+
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
new file mode 100644
index 000000000..a0b75f73d
--- /dev/null
+++ b/arch/x86/boot/cpuflags.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include "bitops.h"
+
+#include <asm/processor-flags.h>
+#include <asm/required-features.h>
+#include <asm/msr-index.h>
+#include "cpuflags.h"
+
+struct cpu_features cpu;
+u32 cpu_vendor[3];
+
+static bool loaded_flags;
+
+static int has_fpu(void)
+{
+ u16 fcw = -1, fsw = -1;
+ unsigned long cr0;
+
+ asm volatile("mov %%cr0,%0" : "=r" (cr0));
+ if (cr0 & (X86_CR0_EM|X86_CR0_TS)) {
+ cr0 &= ~(X86_CR0_EM|X86_CR0_TS);
+ asm volatile("mov %0,%%cr0" : : "r" (cr0));
+ }
+
+ asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
+ : "+m" (fsw), "+m" (fcw));
+
+ return fsw == 0 && (fcw & 0x103f) == 0x003f;
+}
+
+/*
+ * For building the 16-bit code we want to explicitly specify 32-bit
+ * push/pop operations, rather than just saying 'pushf' or 'popf' and
+ * letting the compiler choose. But this is also included from the
+ * compressed/ directory where it may be 64-bit code, and thus needs
+ * to be 'pushfq' or 'popfq' in that case.
+ */
+#ifdef __x86_64__
+#define PUSHF "pushfq"
+#define POPF "popfq"
+#else
+#define PUSHF "pushfl"
+#define POPF "popfl"
+#endif
+
+int has_eflag(unsigned long mask)
+{
+ unsigned long f0, f1;
+
+ asm volatile(PUSHF " \n\t"
+ PUSHF " \n\t"
+ "pop %0 \n\t"
+ "mov %0,%1 \n\t"
+ "xor %2,%1 \n\t"
+ "push %1 \n\t"
+ POPF " \n\t"
+ PUSHF " \n\t"
+ "pop %1 \n\t"
+ POPF
+ : "=&r" (f0), "=&r" (f1)
+ : "ri" (mask));
+
+ return !!((f0^f1) & mask);
+}
+
+/* Handle x86_32 PIC using ebx. */
+#if defined(__i386__) && defined(__PIC__)
+# define EBX_REG "=r"
+#else
+# define EBX_REG "=b"
+#endif
+
+static inline void cpuid_count(u32 id, u32 count,
+ u32 *a, u32 *b, u32 *c, u32 *d)
+{
+ asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t"
+ "cpuid \n\t"
+ ".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif \n\t"
+ : "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b)
+ : "a" (id), "c" (count)
+ );
+}
+
+#define cpuid(id, a, b, c, d) cpuid_count(id, 0, a, b, c, d)
+
+void get_cpuflags(void)
+{
+ u32 max_intel_level, max_amd_level;
+ u32 tfms;
+ u32 ignored;
+
+ if (loaded_flags)
+ return;
+ loaded_flags = true;
+
+ if (has_fpu())
+ set_bit(X86_FEATURE_FPU, cpu.flags);
+
+ if (has_eflag(X86_EFLAGS_ID)) {
+ cpuid(0x0, &max_intel_level, &cpu_vendor[0], &cpu_vendor[2],
+ &cpu_vendor[1]);
+
+ if (max_intel_level >= 0x00000001 &&
+ max_intel_level <= 0x0000ffff) {
+ cpuid(0x1, &tfms, &ignored, &cpu.flags[4],
+ &cpu.flags[0]);
+ cpu.level = (tfms >> 8) & 15;
+ cpu.family = cpu.level;
+ cpu.model = (tfms >> 4) & 15;
+ if (cpu.level >= 6)
+ cpu.model += ((tfms >> 16) & 0xf) << 4;
+ }
+
+ if (max_intel_level >= 0x00000007) {
+ cpuid_count(0x00000007, 0, &ignored, &ignored,
+ &cpu.flags[16], &ignored);
+ }
+
+ cpuid(0x80000000, &max_amd_level, &ignored, &ignored,
+ &ignored);
+
+ if (max_amd_level >= 0x80000001 &&
+ max_amd_level <= 0x8000ffff) {
+ cpuid(0x80000001, &ignored, &ignored, &cpu.flags[6],
+ &cpu.flags[1]);
+ }
+ }
+}
diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h
new file mode 100644
index 000000000..2e20814d3
--- /dev/null
+++ b/arch/x86/boot/cpuflags.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_CPUFLAGS_H
+#define BOOT_CPUFLAGS_H
+
+#include <asm/cpufeatures.h>
+#include <asm/processor-flags.h>
+
+struct cpu_features {
+ int level; /* Family, or 64 for x86-64 */
+ int family; /* Family, always */
+ int model;
+ u32 flags[NCAPINTS];
+};
+
+extern struct cpu_features cpu;
+extern u32 cpu_vendor[3];
+
+int has_eflag(unsigned long mask);
+void get_cpuflags(void);
+
+#endif
diff --git a/arch/x86/boot/ctype.h b/arch/x86/boot/ctype.h
new file mode 100644
index 000000000..8f5ef2994
--- /dev/null
+++ b/arch/x86/boot/ctype.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_CTYPE_H
+#define BOOT_CTYPE_H
+
+static inline int isdigit(int ch)
+{
+ return (ch >= '0') && (ch <= '9');
+}
+
+static inline int isxdigit(int ch)
+{
+ if (isdigit(ch))
+ return true;
+
+ if ((ch >= 'a') && (ch <= 'f'))
+ return true;
+
+ return (ch >= 'A') && (ch <= 'F');
+}
+
+#endif
diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c
new file mode 100644
index 000000000..b25c53527
--- /dev/null
+++ b/arch/x86/boot/early_serial_console.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Serial port routines for use during early boot reporting. This code is
+ * included from both the compressed kernel and the regular kernel.
+ */
+#include "boot.h"
+
+#define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */
+
+#define DLAB 0x80
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define RXR 0 /* Receive register (READ) */
+#define IER 1 /* Interrupt Enable */
+#define IIR 2 /* Interrupt ID */
+#define FCR 2 /* FIFO control */
+#define LCR 3 /* Line control */
+#define MCR 4 /* Modem control */
+#define LSR 5 /* Line Status */
+#define MSR 6 /* Modem Status */
+#define DLL 0 /* Divisor Latch Low */
+#define DLH 1 /* Divisor latch High */
+
+#define DEFAULT_BAUD 9600
+
+static void early_serial_init(int port, int baud)
+{
+ unsigned char c;
+ unsigned divisor;
+
+ outb(0x3, port + LCR); /* 8n1 */
+ outb(0, port + IER); /* no interrupt */
+ outb(0, port + FCR); /* no fifo */
+ outb(0x3, port + MCR); /* DTR + RTS */
+
+ divisor = 115200 / baud;
+ c = inb(port + LCR);
+ outb(c | DLAB, port + LCR);
+ outb(divisor & 0xff, port + DLL);
+ outb((divisor >> 8) & 0xff, port + DLH);
+ outb(c & ~DLAB, port + LCR);
+
+ early_serial_base = port;
+}
+
+static void parse_earlyprintk(void)
+{
+ int baud = DEFAULT_BAUD;
+ char arg[32];
+ int pos = 0;
+ int port = 0;
+
+ if (cmdline_find_option("earlyprintk", arg, sizeof arg) > 0) {
+ char *e;
+
+ if (!strncmp(arg, "serial", 6)) {
+ port = DEFAULT_SERIAL_PORT;
+ pos += 6;
+ }
+
+ if (arg[pos] == ',')
+ pos++;
+
+ /*
+ * make sure we have
+ * "serial,0x3f8,115200"
+ * "serial,ttyS0,115200"
+ * "ttyS0,115200"
+ */
+ if (pos == 7 && !strncmp(arg + pos, "0x", 2)) {
+ port = simple_strtoull(arg + pos, &e, 16);
+ if (port == 0 || arg + pos == e)
+ port = DEFAULT_SERIAL_PORT;
+ else
+ pos = e - arg;
+ } else if (!strncmp(arg + pos, "ttyS", 4)) {
+ static const int bases[] = { 0x3f8, 0x2f8 };
+ int idx = 0;
+
+ /* += strlen("ttyS"); */
+ pos += 4;
+
+ if (arg[pos++] == '1')
+ idx = 1;
+
+ port = bases[idx];
+ }
+
+ if (arg[pos] == ',')
+ pos++;
+
+ baud = simple_strtoull(arg + pos, &e, 0);
+ if (baud == 0 || arg + pos == e)
+ baud = DEFAULT_BAUD;
+ }
+
+ if (port)
+ early_serial_init(port, baud);
+}
+
+#define BASE_BAUD (1843200/16)
+static unsigned int probe_baud(int port)
+{
+ unsigned char lcr, dll, dlh;
+ unsigned int quot;
+
+ lcr = inb(port + LCR);
+ outb(lcr | DLAB, port + LCR);
+ dll = inb(port + DLL);
+ dlh = inb(port + DLH);
+ outb(lcr, port + LCR);
+ quot = (dlh << 8) | dll;
+
+ return BASE_BAUD / quot;
+}
+
+static void parse_console_uart8250(void)
+{
+ char optstr[64], *options;
+ int baud = DEFAULT_BAUD;
+ int port = 0;
+
+ /*
+ * console=uart8250,io,0x3f8,115200n8
+ * need to make sure it is last one console !
+ */
+ if (cmdline_find_option("console", optstr, sizeof optstr) <= 0)
+ return;
+
+ options = optstr;
+
+ if (!strncmp(options, "uart8250,io,", 12))
+ port = simple_strtoull(options + 12, &options, 0);
+ else if (!strncmp(options, "uart,io,", 8))
+ port = simple_strtoull(options + 8, &options, 0);
+ else
+ return;
+
+ if (options && (options[0] == ','))
+ baud = simple_strtoull(options + 1, &options, 0);
+ else
+ baud = probe_baud(port);
+
+ if (port)
+ early_serial_init(port, baud);
+}
+
+void console_init(void)
+{
+ parse_earlyprintk();
+
+ if (!early_serial_base)
+ parse_console_uart8250();
+}
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
new file mode 100644
index 000000000..223e42527
--- /dev/null
+++ b/arch/x86/boot/edd.c
@@ -0,0 +1,182 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Get EDD BIOS disk information
+ */
+
+#include "boot.h"
+#include <linux/edd.h>
+#include "string.h"
+
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+
+/*
+ * Read the MBR (first sector) from a specific device.
+ */
+static int read_mbr(u8 devno, void *buf)
+{
+ struct biosregs ireg, oreg;
+
+ initregs(&ireg);
+ ireg.ax = 0x0201; /* Legacy Read, one sector */
+ ireg.cx = 0x0001; /* Sector 0-0-1 */
+ ireg.dl = devno;
+ ireg.bx = (size_t)buf;
+
+ intcall(0x13, &ireg, &oreg);
+
+ return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
+}
+
+static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
+{
+ int sector_size;
+ char *mbrbuf_ptr, *mbrbuf_end;
+ u32 buf_base, mbr_base;
+ extern char _end[];
+ u16 mbr_magic;
+
+ sector_size = ei->params.bytes_per_sector;
+ if (!sector_size)
+ sector_size = 512; /* Best available guess */
+
+ /* Produce a naturally aligned buffer on the heap */
+ buf_base = (ds() << 4) + (u32)&_end;
+ mbr_base = (buf_base+sector_size-1) & ~(sector_size-1);
+ mbrbuf_ptr = _end + (mbr_base-buf_base);
+ mbrbuf_end = mbrbuf_ptr + sector_size;
+
+ /* Make sure we actually have space on the heap... */
+ if (!(boot_params.hdr.loadflags & CAN_USE_HEAP))
+ return -1;
+ if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr)
+ return -1;
+
+ memset(mbrbuf_ptr, 0, sector_size);
+ if (read_mbr(devno, mbrbuf_ptr))
+ return -1;
+
+ *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET];
+ mbr_magic = *(u16 *)&mbrbuf_ptr[510];
+
+ /* check for valid MBR magic */
+ return mbr_magic == 0xAA55 ? 0 : -1;
+}
+
+static int get_edd_info(u8 devno, struct edd_info *ei)
+{
+ struct biosregs ireg, oreg;
+
+ memset(ei, 0, sizeof *ei);
+
+ /* Check Extensions Present */
+
+ initregs(&ireg);
+ ireg.ah = 0x41;
+ ireg.bx = EDDMAGIC1;
+ ireg.dl = devno;
+ intcall(0x13, &ireg, &oreg);
+
+ if (oreg.eflags & X86_EFLAGS_CF)
+ return -1; /* No extended information */
+
+ if (oreg.bx != EDDMAGIC2)
+ return -1;
+
+ ei->device = devno;
+ ei->version = oreg.ah; /* EDD version number */
+ ei->interface_support = oreg.cx; /* EDD functionality subsets */
+
+ /* Extended Get Device Parameters */
+
+ ei->params.length = sizeof(ei->params);
+ ireg.ah = 0x48;
+ ireg.si = (size_t)&ei->params;
+ intcall(0x13, &ireg, &oreg);
+
+ /* Get legacy CHS parameters */
+
+ /* Ralf Brown recommends setting ES:DI to 0:0 */
+ ireg.ah = 0x08;
+ ireg.es = 0;
+ intcall(0x13, &ireg, &oreg);
+
+ if (!(oreg.eflags & X86_EFLAGS_CF)) {
+ ei->legacy_max_cylinder = oreg.ch + ((oreg.cl & 0xc0) << 2);
+ ei->legacy_max_head = oreg.dh;
+ ei->legacy_sectors_per_track = oreg.cl & 0x3f;
+ }
+
+ return 0;
+}
+
+void query_edd(void)
+{
+ char eddarg[8];
+ int do_mbr = 1;
+#ifdef CONFIG_EDD_OFF
+ int do_edd = 0;
+#else
+ int do_edd = 1;
+#endif
+ int be_quiet;
+ int devno;
+ struct edd_info ei, *edp;
+ u32 *mbrptr;
+
+ if (cmdline_find_option("edd", eddarg, sizeof eddarg) > 0) {
+ if (!strcmp(eddarg, "skipmbr") || !strcmp(eddarg, "skip")) {
+ do_edd = 1;
+ do_mbr = 0;
+ }
+ else if (!strcmp(eddarg, "off"))
+ do_edd = 0;
+ else if (!strcmp(eddarg, "on"))
+ do_edd = 1;
+ }
+
+ be_quiet = cmdline_find_option_bool("quiet");
+
+ edp = boot_params.eddbuf;
+ mbrptr = boot_params.edd_mbr_sig_buffer;
+
+ if (!do_edd)
+ return;
+
+ /* Bugs in OnBoard or AddOnCards Bios may hang the EDD probe,
+ * so give a hint if this happens.
+ */
+
+ if (!be_quiet)
+ printf("Probing EDD (edd=off to disable)... ");
+
+ for (devno = 0x80; devno < 0x80+EDD_MBR_SIG_MAX; devno++) {
+ /*
+ * Scan the BIOS-supported hard disks and query EDD
+ * information...
+ */
+ if (!get_edd_info(devno, &ei)
+ && boot_params.eddbuf_entries < EDDMAXNR) {
+ memcpy(edp, &ei, sizeof ei);
+ edp++;
+ boot_params.eddbuf_entries++;
+ }
+
+ if (do_mbr && !read_mbr_sig(devno, &ei, mbrptr++))
+ boot_params.edd_mbr_sig_buf_entries = devno-0x80+1;
+ }
+
+ if (!be_quiet)
+ printf("ok\n");
+}
+
+#endif
diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh
new file mode 100644
index 000000000..6a10d52a4
--- /dev/null
+++ b/arch/x86/boot/genimage.sh
@@ -0,0 +1,130 @@
+#!/bin/sh
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License. See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+# Copyright (C) 2017 by Changbin Du <changbin.du@intel.com>
+#
+# Adapted from code in arch/x86/boot/Makefile by H. Peter Anvin and others
+#
+# "make fdimage/fdimage144/fdimage288/isoimage" script for x86 architecture
+#
+# Arguments:
+# $1 - fdimage format
+# $2 - target image file
+# $3 - kernel bzImage file
+# $4 - mtool configuration file
+# $5 - kernel cmdline
+# $6 - inird image file
+#
+
+# Use "make V=1" to debug this script
+case "${KBUILD_VERBOSE}" in
+*1*)
+ set -x
+ ;;
+esac
+
+verify () {
+ if [ ! -f "$1" ]; then
+ echo "" 1>&2
+ echo " *** Missing file: $1" 1>&2
+ echo "" 1>&2
+ exit 1
+ fi
+}
+
+
+export MTOOLSRC=$4
+FIMAGE=$2
+FBZIMAGE=$3
+KCMDLINE=$5
+FDINITRD=$6
+
+# Make sure the files actually exist
+verify "$FBZIMAGE"
+
+genbzdisk() {
+ verify "$MTOOLSRC"
+ mformat a:
+ syslinux $FIMAGE
+ echo "$KCMDLINE" | mcopy - a:syslinux.cfg
+ if [ -f "$FDINITRD" ] ; then
+ mcopy "$FDINITRD" a:initrd.img
+ fi
+ mcopy $FBZIMAGE a:linux
+}
+
+genfdimage144() {
+ verify "$MTOOLSRC"
+ dd if=/dev/zero of=$FIMAGE bs=1024 count=1440 2> /dev/null
+ mformat v:
+ syslinux $FIMAGE
+ echo "$KCMDLINE" | mcopy - v:syslinux.cfg
+ if [ -f "$FDINITRD" ] ; then
+ mcopy "$FDINITRD" v:initrd.img
+ fi
+ mcopy $FBZIMAGE v:linux
+}
+
+genfdimage288() {
+ verify "$MTOOLSRC"
+ dd if=/dev/zero of=$FIMAGE bs=1024 count=2880 2> /dev/null
+ mformat w:
+ syslinux $FIMAGE
+ echo "$KCMDLINE" | mcopy - W:syslinux.cfg
+ if [ -f "$FDINITRD" ] ; then
+ mcopy "$FDINITRD" w:initrd.img
+ fi
+ mcopy $FBZIMAGE w:linux
+}
+
+geniso() {
+ tmp_dir=`dirname $FIMAGE`/isoimage
+ rm -rf $tmp_dir
+ mkdir $tmp_dir
+ for i in lib lib64 share ; do
+ for j in syslinux ISOLINUX ; do
+ if [ -f /usr/$i/$j/isolinux.bin ] ; then
+ isolinux=/usr/$i/$j/isolinux.bin
+ fi
+ done
+ for j in syslinux syslinux/modules/bios ; do
+ if [ -f /usr/$i/$j/ldlinux.c32 ]; then
+ ldlinux=/usr/$i/$j/ldlinux.c32
+ fi
+ done
+ if [ -n "$isolinux" -a -n "$ldlinux" ] ; then
+ break
+ fi
+ done
+ if [ -z "$isolinux" ] ; then
+ echo 'Need an isolinux.bin file, please install syslinux/isolinux.'
+ exit 1
+ fi
+ if [ -z "$ldlinux" ] ; then
+ echo 'Need an ldlinux.c32 file, please install syslinux/isolinux.'
+ exit 1
+ fi
+ cp $isolinux $tmp_dir
+ cp $ldlinux $tmp_dir
+ cp $FBZIMAGE $tmp_dir/linux
+ echo "$KCMDLINE" > $tmp_dir/isolinux.cfg
+ if [ -f "$FDINITRD" ] ; then
+ cp "$FDINITRD" $tmp_dir/initrd.img
+ fi
+ genisoimage -J -r -input-charset=utf-8 -quiet -o $FIMAGE \
+ -b isolinux.bin -c boot.cat -no-emul-boot -boot-load-size 4 \
+ -boot-info-table $tmp_dir
+ isohybrid $FIMAGE 2>/dev/null || true
+ rm -rf $tmp_dir
+}
+
+case $1 in
+ bzdisk) genbzdisk;;
+ fdimage144) genfdimage144;;
+ fdimage288) genfdimage288;;
+ isoimage) geniso;;
+ *) echo 'Unknown image format'; exit 1;
+esac
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
new file mode 100644
index 000000000..850b8762e
--- /dev/null
+++ b/arch/x86/boot/header.S
@@ -0,0 +1,636 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * header.S
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Based on bootsect.S and setup.S
+ * modified by more people than can be counted
+ *
+ * Rewritten as a common file by H. Peter Anvin (Apr 2007)
+ *
+ * BIG FAT NOTE: We're in real mode using 64k segments. Therefore segment
+ * addresses must be multiplied by 16 to obtain their respective linear
+ * addresses. To avoid confusion, linear addresses are written using leading
+ * hex while segment addresses are written as segment:offset.
+ *
+ */
+
+#include <asm/segment.h>
+#include <asm/boot.h>
+#include <asm/page_types.h>
+#include <asm/setup.h>
+#include <asm/bootparam.h>
+#include "boot.h"
+#include "voffset.h"
+#include "zoffset.h"
+
+BOOTSEG = 0x07C0 /* original address of boot-sector */
+SYSSEG = 0x1000 /* historical load address >> 4 */
+
+#ifndef SVGA_MODE
+#define SVGA_MODE ASK_VGA
+#endif
+
+#ifndef ROOT_RDONLY
+#define ROOT_RDONLY 1
+#endif
+
+ .code16
+ .section ".bstext", "ax"
+
+ .global bootsect_start
+bootsect_start:
+#ifdef CONFIG_EFI_STUB
+ # "MZ", MS-DOS header
+ .byte 0x4d
+ .byte 0x5a
+#endif
+
+ # Normalize the start address
+ ljmp $BOOTSEG, $start2
+
+start2:
+ movw %cs, %ax
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %ss
+ xorw %sp, %sp
+ sti
+ cld
+
+ movw $bugger_off_msg, %si
+
+msg_loop:
+ lodsb
+ andb %al, %al
+ jz bs_die
+ movb $0xe, %ah
+ movw $7, %bx
+ int $0x10
+ jmp msg_loop
+
+bs_die:
+ # Allow the user to press a key, then reboot
+ xorw %ax, %ax
+ int $0x16
+ int $0x19
+
+ # int 0x19 should never return. In case it does anyway,
+ # invoke the BIOS reset code...
+ ljmp $0xf000,$0xfff0
+
+#ifdef CONFIG_EFI_STUB
+ .org 0x3c
+ #
+ # Offset to the PE header.
+ #
+ .long pe_header
+#endif /* CONFIG_EFI_STUB */
+
+ .section ".bsdata", "a"
+bugger_off_msg:
+ .ascii "Use a boot loader.\r\n"
+ .ascii "\n"
+ .ascii "Remove disk and press any key to reboot...\r\n"
+ .byte 0
+
+#ifdef CONFIG_EFI_STUB
+pe_header:
+ .ascii "PE"
+ .word 0
+
+coff_header:
+#ifdef CONFIG_X86_32
+ .word 0x14c # i386
+#else
+ .word 0x8664 # x86-64
+#endif
+ .word 4 # nr_sections
+ .long 0 # TimeDateStamp
+ .long 0 # PointerToSymbolTable
+ .long 1 # NumberOfSymbols
+ .word section_table - optional_header # SizeOfOptionalHeader
+#ifdef CONFIG_X86_32
+ .word 0x306 # Characteristics.
+ # IMAGE_FILE_32BIT_MACHINE |
+ # IMAGE_FILE_DEBUG_STRIPPED |
+ # IMAGE_FILE_EXECUTABLE_IMAGE |
+ # IMAGE_FILE_LINE_NUMS_STRIPPED
+#else
+ .word 0x206 # Characteristics
+ # IMAGE_FILE_DEBUG_STRIPPED |
+ # IMAGE_FILE_EXECUTABLE_IMAGE |
+ # IMAGE_FILE_LINE_NUMS_STRIPPED
+#endif
+
+optional_header:
+#ifdef CONFIG_X86_32
+ .word 0x10b # PE32 format
+#else
+ .word 0x20b # PE32+ format
+#endif
+ .byte 0x02 # MajorLinkerVersion
+ .byte 0x14 # MinorLinkerVersion
+
+ # Filled in by build.c
+ .long 0 # SizeOfCode
+
+ .long 0 # SizeOfInitializedData
+ .long 0 # SizeOfUninitializedData
+
+ # Filled in by build.c
+ .long 0x0000 # AddressOfEntryPoint
+
+ .long 0x0200 # BaseOfCode
+#ifdef CONFIG_X86_32
+ .long 0 # data
+#endif
+
+extra_header_fields:
+#ifdef CONFIG_X86_32
+ .long 0 # ImageBase
+#else
+ .quad 0 # ImageBase
+#endif
+ .long 0x20 # SectionAlignment
+ .long 0x20 # FileAlignment
+ .word 0 # MajorOperatingSystemVersion
+ .word 0 # MinorOperatingSystemVersion
+ .word 0 # MajorImageVersion
+ .word 0 # MinorImageVersion
+ .word 0 # MajorSubsystemVersion
+ .word 0 # MinorSubsystemVersion
+ .long 0 # Win32VersionValue
+
+ #
+ # The size of the bzImage is written in tools/build.c
+ #
+ .long 0 # SizeOfImage
+
+ .long 0x200 # SizeOfHeaders
+ .long 0 # CheckSum
+ .word 0xa # Subsystem (EFI application)
+ .word 0 # DllCharacteristics
+#ifdef CONFIG_X86_32
+ .long 0 # SizeOfStackReserve
+ .long 0 # SizeOfStackCommit
+ .long 0 # SizeOfHeapReserve
+ .long 0 # SizeOfHeapCommit
+#else
+ .quad 0 # SizeOfStackReserve
+ .quad 0 # SizeOfStackCommit
+ .quad 0 # SizeOfHeapReserve
+ .quad 0 # SizeOfHeapCommit
+#endif
+ .long 0 # LoaderFlags
+ .long 0x6 # NumberOfRvaAndSizes
+
+ .quad 0 # ExportTable
+ .quad 0 # ImportTable
+ .quad 0 # ResourceTable
+ .quad 0 # ExceptionTable
+ .quad 0 # CertificationTable
+ .quad 0 # BaseRelocationTable
+
+ # Section table
+section_table:
+ #
+ # The offset & size fields are filled in by build.c.
+ #
+ .ascii ".setup"
+ .byte 0
+ .byte 0
+ .long 0
+ .long 0x0 # startup_{32,64}
+ .long 0 # Size of initialized data
+ # on disk
+ .long 0x0 # startup_{32,64}
+ .long 0 # PointerToRelocations
+ .long 0 # PointerToLineNumbers
+ .word 0 # NumberOfRelocations
+ .word 0 # NumberOfLineNumbers
+ .long 0x60500020 # Characteristics (section flags)
+
+ #
+ # The EFI application loader requires a relocation section
+ # because EFI applications must be relocatable. The .reloc
+ # offset & size fields are filled in by build.c.
+ #
+ .ascii ".reloc"
+ .byte 0
+ .byte 0
+ .long 0
+ .long 0
+ .long 0 # SizeOfRawData
+ .long 0 # PointerToRawData
+ .long 0 # PointerToRelocations
+ .long 0 # PointerToLineNumbers
+ .word 0 # NumberOfRelocations
+ .word 0 # NumberOfLineNumbers
+ .long 0x42100040 # Characteristics (section flags)
+
+ #
+ # The offset & size fields are filled in by build.c.
+ #
+ .ascii ".text"
+ .byte 0
+ .byte 0
+ .byte 0
+ .long 0
+ .long 0x0 # startup_{32,64}
+ .long 0 # Size of initialized data
+ # on disk
+ .long 0x0 # startup_{32,64}
+ .long 0 # PointerToRelocations
+ .long 0 # PointerToLineNumbers
+ .word 0 # NumberOfRelocations
+ .word 0 # NumberOfLineNumbers
+ .long 0x60500020 # Characteristics (section flags)
+
+ #
+ # The offset & size fields are filled in by build.c.
+ #
+ .ascii ".bss"
+ .byte 0
+ .byte 0
+ .byte 0
+ .byte 0
+ .long 0
+ .long 0x0
+ .long 0 # Size of initialized data
+ # on disk
+ .long 0x0
+ .long 0 # PointerToRelocations
+ .long 0 # PointerToLineNumbers
+ .word 0 # NumberOfRelocations
+ .word 0 # NumberOfLineNumbers
+ .long 0xc8000080 # Characteristics (section flags)
+
+#endif /* CONFIG_EFI_STUB */
+
+ # Kernel attributes; used by setup. This is part 1 of the
+ # header, from the old boot sector.
+
+ .section ".header", "a"
+ .globl sentinel
+sentinel: .byte 0xff, 0xff /* Used to detect broken loaders */
+
+ .globl hdr
+hdr:
+setup_sects: .byte 0 /* Filled in by build.c */
+root_flags: .word ROOT_RDONLY
+syssize: .long 0 /* Filled in by build.c */
+ram_size: .word 0 /* Obsolete */
+vid_mode: .word SVGA_MODE
+root_dev: .word 0 /* Filled in by build.c */
+boot_flag: .word 0xAA55
+
+ # offset 512, entry point
+
+ .globl _start
+_start:
+ # Explicitly enter this as bytes, or the assembler
+ # tries to generate a 3-byte jump here, which causes
+ # everything else to push off to the wrong offset.
+ .byte 0xeb # short (2-byte) jump
+ .byte start_of_setup-1f
+1:
+
+ # Part 2 of the header, from the old setup.S
+
+ .ascii "HdrS" # header signature
+ .word 0x020d # header version number (>= 0x0105)
+ # or else old loadlin-1.5 will fail)
+ .globl realmode_swtch
+realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
+start_sys_seg: .word SYSSEG # obsolete and meaningless, but just
+ # in case something decided to "use" it
+ .word kernel_version-512 # pointing to kernel version string
+ # above section of header is compatible
+ # with loadlin-1.5 (header v1.5). Don't
+ # change it.
+
+type_of_loader: .byte 0 # 0 means ancient bootloader, newer
+ # bootloaders know to change this.
+ # See Documentation/x86/boot.txt for
+ # assigned ids
+
+# flags, unused bits must be zero (RFU) bit within loadflags
+loadflags:
+ .byte LOADED_HIGH # The kernel is to be loaded high
+
+setup_move_size: .word 0x8000 # size to move, when setup is not
+ # loaded at 0x90000. We will move setup
+ # to 0x90000 then just before jumping
+ # into the kernel. However, only the
+ # loader knows how much data behind
+ # us also needs to be loaded.
+
+code32_start: # here loaders can put a different
+ # start address for 32-bit code.
+ .long 0x100000 # 0x100000 = default for big kernel
+
+ramdisk_image: .long 0 # address of loaded ramdisk image
+ # Here the loader puts the 32-bit
+ # address where it loaded the image.
+ # This only will be read by the kernel.
+
+ramdisk_size: .long 0 # its size in bytes
+
+bootsect_kludge:
+ .long 0 # obsolete
+
+heap_end_ptr: .word _end+STACK_SIZE-512
+ # (Header version 0x0201 or later)
+ # space from here (exclusive) down to
+ # end of setup code can be used by setup
+ # for local heap purposes.
+
+ext_loader_ver:
+ .byte 0 # Extended boot loader version
+ext_loader_type:
+ .byte 0 # Extended boot loader type
+
+cmd_line_ptr: .long 0 # (Header version 0x0202 or later)
+ # If nonzero, a 32-bit pointer
+ # to the kernel command line.
+ # The command line should be
+ # located between the start of
+ # setup and the end of low
+ # memory (0xa0000), or it may
+ # get overwritten before it
+ # gets read. If this field is
+ # used, there is no longer
+ # anything magical about the
+ # 0x90000 segment; the setup
+ # can be located anywhere in
+ # low memory 0x10000 or higher.
+
+initrd_addr_max: .long 0x7fffffff
+ # (Header version 0x0203 or later)
+ # The highest safe address for
+ # the contents of an initrd
+ # The current kernel allows up to 4 GB,
+ # but leave it at 2 GB to avoid
+ # possible bootloader bugs.
+
+kernel_alignment: .long CONFIG_PHYSICAL_ALIGN #physical addr alignment
+ #required for protected mode
+ #kernel
+#ifdef CONFIG_RELOCATABLE
+relocatable_kernel: .byte 1
+#else
+relocatable_kernel: .byte 0
+#endif
+min_alignment: .byte MIN_KERNEL_ALIGN_LG2 # minimum alignment
+
+xloadflags:
+#ifdef CONFIG_X86_64
+# define XLF0 XLF_KERNEL_64 /* 64-bit kernel */
+#else
+# define XLF0 0
+#endif
+
+#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_X86_64)
+ /* kernel/boot_param/ramdisk could be loaded above 4g */
+# define XLF1 XLF_CAN_BE_LOADED_ABOVE_4G
+#else
+# define XLF1 0
+#endif
+
+#ifdef CONFIG_EFI_STUB
+# ifdef CONFIG_EFI_MIXED
+# define XLF23 (XLF_EFI_HANDOVER_32|XLF_EFI_HANDOVER_64)
+# else
+# ifdef CONFIG_X86_64
+# define XLF23 XLF_EFI_HANDOVER_64 /* 64-bit EFI handover ok */
+# else
+# define XLF23 XLF_EFI_HANDOVER_32 /* 32-bit EFI handover ok */
+# endif
+# endif
+#else
+# define XLF23 0
+#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC_CORE)
+# define XLF4 XLF_EFI_KEXEC
+#else
+# define XLF4 0
+#endif
+
+ .word XLF0 | XLF1 | XLF23 | XLF4
+
+cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
+ #added with boot protocol
+ #version 2.06
+
+hardware_subarch: .long 0 # subarchitecture, added with 2.07
+ # default to 0 for normal x86 PC
+
+hardware_subarch_data: .quad 0
+
+payload_offset: .long ZO_input_data
+payload_length: .long ZO_z_input_len
+
+setup_data: .quad 0 # 64-bit physical pointer to
+ # single linked list of
+ # struct setup_data
+
+pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
+
+#
+# Getting to provably safe in-place decompression is hard. Worst case
+# behaviours need to be analyzed. Here let's take the decompression of
+# a gzip-compressed kernel as example, to illustrate it:
+#
+# The file layout of gzip compressed kernel is:
+#
+# magic[2]
+# method[1]
+# flags[1]
+# timestamp[4]
+# extraflags[1]
+# os[1]
+# compressed data blocks[N]
+# crc[4] orig_len[4]
+#
+# ... resulting in +18 bytes overhead of uncompressed data.
+#
+# (For more information, please refer to RFC 1951 and RFC 1952.)
+#
+# Files divided into blocks
+# 1 bit (last block flag)
+# 2 bits (block type)
+#
+# 1 block occurs every 32K -1 bytes or when there 50% compression
+# has been achieved. The smallest block type encoding is always used.
+#
+# stored:
+# 32 bits length in bytes.
+#
+# fixed:
+# magic fixed tree.
+# symbols.
+#
+# dynamic:
+# dynamic tree encoding.
+# symbols.
+#
+#
+# The buffer for decompression in place is the length of the uncompressed
+# data, plus a small amount extra to keep the algorithm safe. The
+# compressed data is placed at the end of the buffer. The output pointer
+# is placed at the start of the buffer and the input pointer is placed
+# where the compressed data starts. Problems will occur when the output
+# pointer overruns the input pointer.
+#
+# The output pointer can only overrun the input pointer if the input
+# pointer is moving faster than the output pointer. A condition only
+# triggered by data whose compressed form is larger than the uncompressed
+# form.
+#
+# The worst case at the block level is a growth of the compressed data
+# of 5 bytes per 32767 bytes.
+#
+# The worst case internal to a compressed block is very hard to figure.
+# The worst case can at least be bounded by having one bit that represents
+# 32764 bytes and then all of the rest of the bytes representing the very
+# very last byte.
+#
+# All of which is enough to compute an amount of extra data that is required
+# to be safe. To avoid problems at the block level allocating 5 extra bytes
+# per 32767 bytes of data is sufficient. To avoid problems internal to a
+# block adding an extra 32767 bytes (the worst case uncompressed block size)
+# is sufficient, to ensure that in the worst case the decompressed data for
+# block will stop the byte before the compressed data for a block begins.
+# To avoid problems with the compressed data's meta information an extra 18
+# bytes are needed. Leading to the formula:
+#
+# extra_bytes = (uncompressed_size >> 12) + 32768 + 18
+#
+# Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
+# Adding 32768 instead of 32767 just makes for round numbers.
+#
+# Above analysis is for decompressing gzip compressed kernel only. Up to
+# now 6 different decompressor are supported all together. And among them
+# xz stores data in chunks and has maximum chunk of 64K. Hence safety
+# margin should be updated to cover all decompressors so that we don't
+# need to deal with each of them separately. Please check
+# the description in lib/decompressor_xxx.c for specific information.
+#
+# extra_bytes = (uncompressed_size >> 12) + 65536 + 128
+#
+# LZ4 is even worse: data that cannot be further compressed grows by 0.4%,
+# or one byte per 256 bytes. OTOH, we can safely get rid of the +128 as
+# the size-dependent part now grows so fast.
+#
+# extra_bytes = (uncompressed_size >> 8) + 65536
+
+#define ZO_z_extra_bytes ((ZO_z_output_len >> 8) + 65536)
+#if ZO_z_output_len > ZO_z_input_len
+# define ZO_z_extract_offset (ZO_z_output_len + ZO_z_extra_bytes - \
+ ZO_z_input_len)
+#else
+# define ZO_z_extract_offset ZO_z_extra_bytes
+#endif
+
+/*
+ * The extract_offset has to be bigger than ZO head section. Otherwise when
+ * the head code is running to move ZO to the end of the buffer, it will
+ * overwrite the head code itself.
+ */
+#if (ZO__ehead - ZO_startup_32) > ZO_z_extract_offset
+# define ZO_z_min_extract_offset ((ZO__ehead - ZO_startup_32 + 4095) & ~4095)
+#else
+# define ZO_z_min_extract_offset ((ZO_z_extract_offset + 4095) & ~4095)
+#endif
+
+#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_min_extract_offset)
+
+#define VO_INIT_SIZE (VO__end - VO__text)
+#if ZO_INIT_SIZE > VO_INIT_SIZE
+# define INIT_SIZE ZO_INIT_SIZE
+#else
+# define INIT_SIZE VO_INIT_SIZE
+#endif
+
+init_size: .long INIT_SIZE # kernel initialization size
+handover_offset: .long 0 # Filled in by build.c
+
+# End of setup header #####################################################
+
+ .section ".entrytext", "ax"
+start_of_setup:
+# Force %es = %ds
+ movw %ds, %ax
+ movw %ax, %es
+ cld
+
+# Apparently some ancient versions of LILO invoked the kernel with %ss != %ds,
+# which happened to work by accident for the old code. Recalculate the stack
+# pointer if %ss is invalid. Otherwise leave it alone, LOADLIN sets up the
+# stack behind its own code, so we can't blindly put it directly past the heap.
+
+ movw %ss, %dx
+ cmpw %ax, %dx # %ds == %ss?
+ movw %sp, %dx
+ je 2f # -> assume %sp is reasonably set
+
+ # Invalid %ss, make up a new stack
+ movw $_end, %dx
+ testb $CAN_USE_HEAP, loadflags
+ jz 1f
+ movw heap_end_ptr, %dx
+1: addw $STACK_SIZE, %dx
+ jnc 2f
+ xorw %dx, %dx # Prevent wraparound
+
+2: # Now %dx should point to the end of our stack space
+ andw $~3, %dx # dword align (might as well...)
+ jnz 3f
+ movw $0xfffc, %dx # Make sure we're not zero
+3: movw %ax, %ss
+ movzwl %dx, %esp # Clear upper half of %esp
+ sti # Now we should have a working stack
+
+# We will have entered with %cs = %ds+0x20, normalize %cs so
+# it is on par with the other segments.
+ pushw %ds
+ pushw $6f
+ lretw
+6:
+
+# Check signature at end of setup
+ cmpl $0x5a5aaa55, setup_sig
+ jne setup_bad
+
+# Zero the bss
+ movw $__bss_start, %di
+ movw $_end+3, %cx
+ xorl %eax, %eax
+ subw %di, %cx
+ shrw $2, %cx
+ rep; stosl
+
+# Jump to C code (should not return)
+ calll main
+
+# Setup corrupt somehow...
+setup_bad:
+ movl $setup_corrupt, %eax
+ calll puts
+ # Fall through...
+
+ .globl die
+ .type die, @function
+die:
+ hlt
+ jmp die
+
+ .size die, .-die
+
+ .section ".initdata", "a"
+setup_corrupt:
+ .byte 7
+ .string "No setup signature found...\n"
diff --git a/arch/x86/boot/install.sh b/arch/x86/boot/install.sh
new file mode 100644
index 000000000..d13ec1c38
--- /dev/null
+++ b/arch/x86/boot/install.sh
@@ -0,0 +1,59 @@
+#!/bin/sh
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License. See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+# Copyright (C) 1995 by Linus Torvalds
+#
+# Adapted from code in arch/i386/boot/Makefile by H. Peter Anvin
+#
+# "make install" script for i386 architecture
+#
+# Arguments:
+# $1 - kernel version
+# $2 - kernel image file
+# $3 - kernel map file
+# $4 - default install path (blank if root directory)
+#
+
+verify () {
+ if [ ! -f "$1" ]; then
+ echo "" 1>&2
+ echo " *** Missing file: $1" 1>&2
+ echo ' *** You need to run "make" before "make install".' 1>&2
+ echo "" 1>&2
+ exit 1
+ fi
+}
+
+# Make sure the files actually exist
+verify "$2"
+verify "$3"
+
+# User may have a custom install script
+
+if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi
+if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi
+
+# Default install - same as make zlilo
+
+if [ -f $4/vmlinuz ]; then
+ mv $4/vmlinuz $4/vmlinuz.old
+fi
+
+if [ -f $4/System.map ]; then
+ mv $4/System.map $4/System.old
+fi
+
+cat $2 > $4/vmlinuz
+cp $3 $4/System.map
+
+if [ -x /sbin/lilo ]; then
+ /sbin/lilo
+elif [ -x /etc/lilo/install ]; then
+ /etc/lilo/install
+else
+ sync
+ echo "Cannot find LILO."
+fi
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
new file mode 100644
index 000000000..9bcea386d
--- /dev/null
+++ b/arch/x86/boot/main.c
@@ -0,0 +1,182 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Main module for the real-mode kernel code
+ */
+
+#include "boot.h"
+#include "string.h"
+
+struct boot_params boot_params __attribute__((aligned(16)));
+
+char *HEAP = _end;
+char *heap_end = _end; /* Default end of heap = no heap */
+
+/*
+ * Copy the header into the boot parameter block. Since this
+ * screws up the old-style command line protocol, adjust by
+ * filling in the new-style command line pointer instead.
+ */
+
+static void copy_boot_params(void)
+{
+ struct old_cmdline {
+ u16 cl_magic;
+ u16 cl_offset;
+ };
+ const struct old_cmdline * const oldcmd =
+ (const struct old_cmdline *)OLD_CL_ADDRESS;
+
+ BUILD_BUG_ON(sizeof boot_params != 4096);
+ memcpy(&boot_params.hdr, &hdr, sizeof hdr);
+
+ if (!boot_params.hdr.cmd_line_ptr &&
+ oldcmd->cl_magic == OLD_CL_MAGIC) {
+ /* Old-style command line protocol. */
+ u16 cmdline_seg;
+
+ /* Figure out if the command line falls in the region
+ of memory that an old kernel would have copied up
+ to 0x90000... */
+ if (oldcmd->cl_offset < boot_params.hdr.setup_move_size)
+ cmdline_seg = ds();
+ else
+ cmdline_seg = 0x9000;
+
+ boot_params.hdr.cmd_line_ptr =
+ (cmdline_seg << 4) + oldcmd->cl_offset;
+ }
+}
+
+/*
+ * Query the keyboard lock status as given by the BIOS, and
+ * set the keyboard repeat rate to maximum. Unclear why the latter
+ * is done here; this might be possible to kill off as stale code.
+ */
+static void keyboard_init(void)
+{
+ struct biosregs ireg, oreg;
+ initregs(&ireg);
+
+ ireg.ah = 0x02; /* Get keyboard status */
+ intcall(0x16, &ireg, &oreg);
+ boot_params.kbd_status = oreg.al;
+
+ ireg.ax = 0x0305; /* Set keyboard repeat rate */
+ intcall(0x16, &ireg, NULL);
+}
+
+/*
+ * Get Intel SpeedStep (IST) information.
+ */
+static void query_ist(void)
+{
+ struct biosregs ireg, oreg;
+
+ /* Some older BIOSes apparently crash on this call, so filter
+ it from machines too old to have SpeedStep at all. */
+ if (cpu.level < 6)
+ return;
+
+ initregs(&ireg);
+ ireg.ax = 0xe980; /* IST Support */
+ ireg.edx = 0x47534943; /* Request value */
+ intcall(0x15, &ireg, &oreg);
+
+ boot_params.ist_info.signature = oreg.eax;
+ boot_params.ist_info.command = oreg.ebx;
+ boot_params.ist_info.event = oreg.ecx;
+ boot_params.ist_info.perf_level = oreg.edx;
+}
+
+/*
+ * Tell the BIOS what CPU mode we intend to run in.
+ */
+static void set_bios_mode(void)
+{
+#ifdef CONFIG_X86_64
+ struct biosregs ireg;
+
+ initregs(&ireg);
+ ireg.ax = 0xec00;
+ ireg.bx = 2;
+ intcall(0x15, &ireg, NULL);
+#endif
+}
+
+static void init_heap(void)
+{
+ char *stack_end;
+
+ if (boot_params.hdr.loadflags & CAN_USE_HEAP) {
+ asm("leal %P1(%%esp),%0"
+ : "=r" (stack_end) : "i" (-STACK_SIZE));
+
+ heap_end = (char *)
+ ((size_t)boot_params.hdr.heap_end_ptr + 0x200);
+ if (heap_end > stack_end)
+ heap_end = stack_end;
+ } else {
+ /* Boot protocol 2.00 only, no heap available */
+ puts("WARNING: Ancient bootloader, some functionality "
+ "may be limited!\n");
+ }
+}
+
+void main(void)
+{
+ /* First, copy the boot header into the "zeropage" */
+ copy_boot_params();
+
+ /* Initialize the early-boot console */
+ console_init();
+ if (cmdline_find_option_bool("debug"))
+ puts("early console in setup code\n");
+
+ /* End of heap check */
+ init_heap();
+
+ /* Make sure we have all the proper CPU support */
+ if (validate_cpu()) {
+ puts("Unable to boot - please use a kernel appropriate "
+ "for your CPU.\n");
+ die();
+ }
+
+ /* Tell the BIOS what CPU mode we intend to run in. */
+ set_bios_mode();
+
+ /* Detect memory layout */
+ detect_memory();
+
+ /* Set keyboard repeat rate (why?) and query the lock flags */
+ keyboard_init();
+
+ /* Query Intel SpeedStep (IST) information */
+ query_ist();
+
+ /* Query APM information */
+#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE)
+ query_apm_bios();
+#endif
+
+ /* Query EDD information */
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+ query_edd();
+#endif
+
+ /* Set the video mode */
+ set_video();
+
+ /* Do the last things and invoke protected mode */
+ go_to_protected_mode();
+}
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
new file mode 100644
index 000000000..d9c28c87e
--- /dev/null
+++ b/arch/x86/boot/memory.c
@@ -0,0 +1,136 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Memory detection code
+ */
+
+#include "boot.h"
+
+#define SMAP 0x534d4150 /* ASCII "SMAP" */
+
+static int detect_memory_e820(void)
+{
+ int count = 0;
+ struct biosregs ireg, oreg;
+ struct boot_e820_entry *desc = boot_params.e820_table;
+ static struct boot_e820_entry buf; /* static so it is zeroed */
+
+ initregs(&ireg);
+ ireg.ax = 0xe820;
+ ireg.cx = sizeof buf;
+ ireg.edx = SMAP;
+ ireg.di = (size_t)&buf;
+
+ /*
+ * Note: at least one BIOS is known which assumes that the
+ * buffer pointed to by one e820 call is the same one as
+ * the previous call, and only changes modified fields. Therefore,
+ * we use a temporary buffer and copy the results entry by entry.
+ *
+ * This routine deliberately does not try to account for
+ * ACPI 3+ extended attributes. This is because there are
+ * BIOSes in the field which report zero for the valid bit for
+ * all ranges, and we don't currently make any use of the
+ * other attribute bits. Revisit this if we see the extended
+ * attribute bits deployed in a meaningful way in the future.
+ */
+
+ do {
+ intcall(0x15, &ireg, &oreg);
+ ireg.ebx = oreg.ebx; /* for next iteration... */
+
+ /* BIOSes which terminate the chain with CF = 1 as opposed
+ to %ebx = 0 don't always report the SMAP signature on
+ the final, failing, probe. */
+ if (oreg.eflags & X86_EFLAGS_CF)
+ break;
+
+ /* Some BIOSes stop returning SMAP in the middle of
+ the search loop. We don't know exactly how the BIOS
+ screwed up the map at that point, we might have a
+ partial map, the full map, or complete garbage, so
+ just return failure. */
+ if (oreg.eax != SMAP) {
+ count = 0;
+ break;
+ }
+
+ *desc++ = buf;
+ count++;
+ } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_table));
+
+ return boot_params.e820_entries = count;
+}
+
+static int detect_memory_e801(void)
+{
+ struct biosregs ireg, oreg;
+
+ initregs(&ireg);
+ ireg.ax = 0xe801;
+ intcall(0x15, &ireg, &oreg);
+
+ if (oreg.eflags & X86_EFLAGS_CF)
+ return -1;
+
+ /* Do we really need to do this? */
+ if (oreg.cx || oreg.dx) {
+ oreg.ax = oreg.cx;
+ oreg.bx = oreg.dx;
+ }
+
+ if (oreg.ax > 15*1024) {
+ return -1; /* Bogus! */
+ } else if (oreg.ax == 15*1024) {
+ boot_params.alt_mem_k = (oreg.bx << 6) + oreg.ax;
+ } else {
+ /*
+ * This ignores memory above 16MB if we have a memory
+ * hole there. If someone actually finds a machine
+ * with a memory hole at 16MB and no support for
+ * 0E820h they should probably generate a fake e820
+ * map.
+ */
+ boot_params.alt_mem_k = oreg.ax;
+ }
+
+ return 0;
+}
+
+static int detect_memory_88(void)
+{
+ struct biosregs ireg, oreg;
+
+ initregs(&ireg);
+ ireg.ah = 0x88;
+ intcall(0x15, &ireg, &oreg);
+
+ boot_params.screen_info.ext_mem_k = oreg.ax;
+
+ return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
+}
+
+int detect_memory(void)
+{
+ int err = -1;
+
+ if (detect_memory_e820() > 0)
+ err = 0;
+
+ if (!detect_memory_e801())
+ err = 0;
+
+ if (!detect_memory_88())
+ err = 0;
+
+ return err;
+}
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c
new file mode 100644
index 000000000..f72498dc9
--- /dev/null
+++ b/arch/x86/boot/mkcpustr.c
@@ -0,0 +1,52 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2008 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2 or (at your
+ * option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * This is a host program to preprocess the CPU strings into a
+ * compact format suitable for the setup code.
+ */
+
+#include <stdio.h>
+
+#include "../include/asm/required-features.h"
+#include "../include/asm/disabled-features.h"
+#include "../include/asm/cpufeatures.h"
+#include "../kernel/cpu/capflags.c"
+
+int main(void)
+{
+ int i, j;
+ const char *str;
+
+ printf("static const char x86_cap_strs[] =\n");
+
+ for (i = 0; i < NCAPINTS; i++) {
+ for (j = 0; j < 32; j++) {
+ str = x86_cap_flags[i*32+j];
+
+ if (i == NCAPINTS-1 && j == 31) {
+ /* The last entry must be unconditional; this
+ also consumes the compiler-added null
+ character */
+ if (!str)
+ str = "";
+ printf("\t\"\\x%02x\\x%02x\"\"%s\"\n",
+ i, j, str);
+ } else if (str) {
+ printf("#if REQUIRED_MASK%d & (1 << %d)\n"
+ "\t\"\\x%02x\\x%02x\"\"%s\\0\"\n"
+ "#endif\n",
+ i, j, i, j, str);
+ }
+ }
+ }
+ printf("\t;\n");
+ return 0;
+}
diff --git a/arch/x86/boot/mtools.conf.in b/arch/x86/boot/mtools.conf.in
new file mode 100644
index 000000000..efd6d2490
--- /dev/null
+++ b/arch/x86/boot/mtools.conf.in
@@ -0,0 +1,17 @@
+#
+# mtools configuration file for "make (b)zdisk"
+#
+
+# Actual floppy drive
+drive a:
+ file="/dev/fd0"
+
+# 1.44 MB floppy disk image
+drive v:
+ file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=18 filter
+
+# 2.88 MB floppy disk image (mostly for virtual uses)
+drive w:
+ file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=36 filter
+
+
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
new file mode 100644
index 000000000..8062f8915
--- /dev/null
+++ b/arch/x86/boot/pm.c
@@ -0,0 +1,126 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Prepare the machine for transition to protected mode.
+ */
+
+#include "boot.h"
+#include <asm/segment.h>
+
+/*
+ * Invoke the realmode switch hook if present; otherwise
+ * disable all interrupts.
+ */
+static void realmode_switch_hook(void)
+{
+ if (boot_params.hdr.realmode_swtch) {
+ asm volatile("lcallw *%0"
+ : : "m" (boot_params.hdr.realmode_swtch)
+ : "eax", "ebx", "ecx", "edx");
+ } else {
+ asm volatile("cli");
+ outb(0x80, 0x70); /* Disable NMI */
+ io_delay();
+ }
+}
+
+/*
+ * Disable all interrupts at the legacy PIC.
+ */
+static void mask_all_interrupts(void)
+{
+ outb(0xff, 0xa1); /* Mask all interrupts on the secondary PIC */
+ io_delay();
+ outb(0xfb, 0x21); /* Mask all but cascade on the primary PIC */
+ io_delay();
+}
+
+/*
+ * Reset IGNNE# if asserted in the FPU.
+ */
+static void reset_coprocessor(void)
+{
+ outb(0, 0xf0);
+ io_delay();
+ outb(0, 0xf1);
+ io_delay();
+}
+
+/*
+ * Set up the GDT
+ */
+
+struct gdt_ptr {
+ u16 len;
+ u32 ptr;
+} __attribute__((packed));
+
+static void setup_gdt(void)
+{
+ /* There are machines which are known to not boot with the GDT
+ being 8-byte unaligned. Intel recommends 16 byte alignment. */
+ static const u64 boot_gdt[] __attribute__((aligned(16))) = {
+ /* CS: code, read/execute, 4 GB, base 0 */
+ [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff),
+ /* DS: data, read/write, 4 GB, base 0 */
+ [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff),
+ /* TSS: 32-bit tss, 104 bytes, base 4096 */
+ /* We only have a TSS here to keep Intel VT happy;
+ we don't actually use it for anything. */
+ [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(0x0089, 4096, 103),
+ };
+ /* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead
+ of the gdt_ptr contents. Thus, make it static so it will
+ stay in memory, at least long enough that we switch to the
+ proper kernel GDT. */
+ static struct gdt_ptr gdt;
+
+ gdt.len = sizeof(boot_gdt)-1;
+ gdt.ptr = (u32)&boot_gdt + (ds() << 4);
+
+ asm volatile("lgdtl %0" : : "m" (gdt));
+}
+
+/*
+ * Set up the IDT
+ */
+static void setup_idt(void)
+{
+ static const struct gdt_ptr null_idt = {0, 0};
+ asm volatile("lidtl %0" : : "m" (null_idt));
+}
+
+/*
+ * Actual invocation sequence
+ */
+void go_to_protected_mode(void)
+{
+ /* Hook before leaving real mode, also disables interrupts */
+ realmode_switch_hook();
+
+ /* Enable the A20 gate */
+ if (enable_a20()) {
+ puts("A20 gate not responding, unable to boot...\n");
+ die();
+ }
+
+ /* Reset coprocessor (IGNNE#) */
+ reset_coprocessor();
+
+ /* Mask all interrupts in the PIC */
+ mask_all_interrupts();
+
+ /* Actual transition to protected mode... */
+ setup_idt();
+ setup_gdt();
+ protected_mode_jump(boot_params.hdr.code32_start,
+ (u32)&boot_params + (ds() << 4));
+}
diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S
new file mode 100644
index 000000000..3e0edc6d2
--- /dev/null
+++ b/arch/x86/boot/pmjump.S
@@ -0,0 +1,77 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * The actual transition into protected mode
+ */
+
+#include <asm/boot.h>
+#include <asm/processor-flags.h>
+#include <asm/segment.h>
+#include <linux/linkage.h>
+
+ .text
+ .code16
+
+/*
+ * void protected_mode_jump(u32 entrypoint, u32 bootparams);
+ */
+GLOBAL(protected_mode_jump)
+ movl %edx, %esi # Pointer to boot_params table
+
+ xorl %ebx, %ebx
+ movw %cs, %bx
+ shll $4, %ebx
+ addl %ebx, 2f
+ jmp 1f # Short jump to serialize on 386/486
+1:
+
+ movw $__BOOT_DS, %cx
+ movw $__BOOT_TSS, %di
+
+ movl %cr0, %edx
+ orb $X86_CR0_PE, %dl # Protected mode
+ movl %edx, %cr0
+
+ # Transition to 32-bit mode
+ .byte 0x66, 0xea # ljmpl opcode
+2: .long in_pm32 # offset
+ .word __BOOT_CS # segment
+ENDPROC(protected_mode_jump)
+
+ .code32
+ .section ".text32","ax"
+GLOBAL(in_pm32)
+ # Set up data segments for flat 32-bit mode
+ movl %ecx, %ds
+ movl %ecx, %es
+ movl %ecx, %fs
+ movl %ecx, %gs
+ movl %ecx, %ss
+ # The 32-bit code sets up its own stack, but this way we do have
+ # a valid stack if some debugging hack wants to use it.
+ addl %ebx, %esp
+
+ # Set up TR to make Intel VT happy
+ ltr %di
+
+ # Clear registers to allow for future extensions to the
+ # 32-bit boot protocol
+ xorl %ecx, %ecx
+ xorl %edx, %edx
+ xorl %ebx, %ebx
+ xorl %ebp, %ebp
+ xorl %edi, %edi
+
+ # Set up LDTR to make Intel VT happy
+ lldt %cx
+
+ jmpl *%eax # Jump to the 32-bit entrypoint
+ENDPROC(in_pm32)
diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c
new file mode 100644
index 000000000..565083c16
--- /dev/null
+++ b/arch/x86/boot/printf.c
@@ -0,0 +1,309 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Oh, it's a waste of space, but oh-so-yummy for debugging. This
+ * version of printf() does not include 64-bit support. "Live with
+ * it."
+ *
+ */
+
+#include "boot.h"
+
+static int skip_atoi(const char **s)
+{
+ int i = 0;
+
+ while (isdigit(**s))
+ i = i * 10 + *((*s)++) - '0';
+ return i;
+}
+
+#define ZEROPAD 1 /* pad with zero */
+#define SIGN 2 /* unsigned/signed long */
+#define PLUS 4 /* show plus */
+#define SPACE 8 /* space if plus */
+#define LEFT 16 /* left justified */
+#define SMALL 32 /* Must be 32 == 0x20 */
+#define SPECIAL 64 /* 0x */
+
+#define __do_div(n, base) ({ \
+int __res; \
+__res = ((unsigned long) n) % (unsigned) base; \
+n = ((unsigned long) n) / (unsigned) base; \
+__res; })
+
+static char *number(char *str, long num, int base, int size, int precision,
+ int type)
+{
+ /* we are called with base 8, 10 or 16, only, thus don't need "G..." */
+ static const char digits[16] = "0123456789ABCDEF"; /* "GHIJKLMNOPQRSTUVWXYZ"; */
+
+ char tmp[66];
+ char c, sign, locase;
+ int i;
+
+ /* locase = 0 or 0x20. ORing digits or letters with 'locase'
+ * produces same digits or (maybe lowercased) letters */
+ locase = (type & SMALL);
+ if (type & LEFT)
+ type &= ~ZEROPAD;
+ if (base < 2 || base > 16)
+ return NULL;
+ c = (type & ZEROPAD) ? '0' : ' ';
+ sign = 0;
+ if (type & SIGN) {
+ if (num < 0) {
+ sign = '-';
+ num = -num;
+ size--;
+ } else if (type & PLUS) {
+ sign = '+';
+ size--;
+ } else if (type & SPACE) {
+ sign = ' ';
+ size--;
+ }
+ }
+ if (type & SPECIAL) {
+ if (base == 16)
+ size -= 2;
+ else if (base == 8)
+ size--;
+ }
+ i = 0;
+ if (num == 0)
+ tmp[i++] = '0';
+ else
+ while (num != 0)
+ tmp[i++] = (digits[__do_div(num, base)] | locase);
+ if (i > precision)
+ precision = i;
+ size -= precision;
+ if (!(type & (ZEROPAD + LEFT)))
+ while (size-- > 0)
+ *str++ = ' ';
+ if (sign)
+ *str++ = sign;
+ if (type & SPECIAL) {
+ if (base == 8)
+ *str++ = '0';
+ else if (base == 16) {
+ *str++ = '0';
+ *str++ = ('X' | locase);
+ }
+ }
+ if (!(type & LEFT))
+ while (size-- > 0)
+ *str++ = c;
+ while (i < precision--)
+ *str++ = '0';
+ while (i-- > 0)
+ *str++ = tmp[i];
+ while (size-- > 0)
+ *str++ = ' ';
+ return str;
+}
+
+int vsprintf(char *buf, const char *fmt, va_list args)
+{
+ int len;
+ unsigned long num;
+ int i, base;
+ char *str;
+ const char *s;
+
+ int flags; /* flags to number() */
+
+ int field_width; /* width of output field */
+ int precision; /* min. # of digits for integers; max
+ number of chars for from string */
+ int qualifier; /* 'h', 'l', or 'L' for integer fields */
+
+ for (str = buf; *fmt; ++fmt) {
+ if (*fmt != '%') {
+ *str++ = *fmt;
+ continue;
+ }
+
+ /* process flags */
+ flags = 0;
+ repeat:
+ ++fmt; /* this also skips first '%' */
+ switch (*fmt) {
+ case '-':
+ flags |= LEFT;
+ goto repeat;
+ case '+':
+ flags |= PLUS;
+ goto repeat;
+ case ' ':
+ flags |= SPACE;
+ goto repeat;
+ case '#':
+ flags |= SPECIAL;
+ goto repeat;
+ case '0':
+ flags |= ZEROPAD;
+ goto repeat;
+ }
+
+ /* get field width */
+ field_width = -1;
+ if (isdigit(*fmt))
+ field_width = skip_atoi(&fmt);
+ else if (*fmt == '*') {
+ ++fmt;
+ /* it's the next argument */
+ field_width = va_arg(args, int);
+ if (field_width < 0) {
+ field_width = -field_width;
+ flags |= LEFT;
+ }
+ }
+
+ /* get the precision */
+ precision = -1;
+ if (*fmt == '.') {
+ ++fmt;
+ if (isdigit(*fmt))
+ precision = skip_atoi(&fmt);
+ else if (*fmt == '*') {
+ ++fmt;
+ /* it's the next argument */
+ precision = va_arg(args, int);
+ }
+ if (precision < 0)
+ precision = 0;
+ }
+
+ /* get the conversion qualifier */
+ qualifier = -1;
+ if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L') {
+ qualifier = *fmt;
+ ++fmt;
+ }
+
+ /* default base */
+ base = 10;
+
+ switch (*fmt) {
+ case 'c':
+ if (!(flags & LEFT))
+ while (--field_width > 0)
+ *str++ = ' ';
+ *str++ = (unsigned char)va_arg(args, int);
+ while (--field_width > 0)
+ *str++ = ' ';
+ continue;
+
+ case 's':
+ s = va_arg(args, char *);
+ len = strnlen(s, precision);
+
+ if (!(flags & LEFT))
+ while (len < field_width--)
+ *str++ = ' ';
+ for (i = 0; i < len; ++i)
+ *str++ = *s++;
+ while (len < field_width--)
+ *str++ = ' ';
+ continue;
+
+ case 'p':
+ if (field_width == -1) {
+ field_width = 2 * sizeof(void *);
+ flags |= ZEROPAD;
+ }
+ str = number(str,
+ (unsigned long)va_arg(args, void *), 16,
+ field_width, precision, flags);
+ continue;
+
+ case 'n':
+ if (qualifier == 'l') {
+ long *ip = va_arg(args, long *);
+ *ip = (str - buf);
+ } else {
+ int *ip = va_arg(args, int *);
+ *ip = (str - buf);
+ }
+ continue;
+
+ case '%':
+ *str++ = '%';
+ continue;
+
+ /* integer number formats - set up the flags and "break" */
+ case 'o':
+ base = 8;
+ break;
+
+ case 'x':
+ flags |= SMALL;
+ case 'X':
+ base = 16;
+ break;
+
+ case 'd':
+ case 'i':
+ flags |= SIGN;
+ case 'u':
+ break;
+
+ default:
+ *str++ = '%';
+ if (*fmt)
+ *str++ = *fmt;
+ else
+ --fmt;
+ continue;
+ }
+ if (qualifier == 'l')
+ num = va_arg(args, unsigned long);
+ else if (qualifier == 'h') {
+ num = (unsigned short)va_arg(args, int);
+ if (flags & SIGN)
+ num = (short)num;
+ } else if (flags & SIGN)
+ num = va_arg(args, int);
+ else
+ num = va_arg(args, unsigned int);
+ str = number(str, num, base, field_width, precision, flags);
+ }
+ *str = '\0';
+ return str - buf;
+}
+
+int sprintf(char *buf, const char *fmt, ...)
+{
+ va_list args;
+ int i;
+
+ va_start(args, fmt);
+ i = vsprintf(buf, fmt, args);
+ va_end(args);
+ return i;
+}
+
+int printf(const char *fmt, ...)
+{
+ char printf_buf[1024];
+ va_list args;
+ int printed;
+
+ va_start(args, fmt);
+ printed = vsprintf(printf_buf, fmt, args);
+ va_end(args);
+
+ puts(printf_buf);
+
+ return printed;
+}
diff --git a/arch/x86/boot/regs.c b/arch/x86/boot/regs.c
new file mode 100644
index 000000000..c0fb356a3
--- /dev/null
+++ b/arch/x86/boot/regs.c
@@ -0,0 +1,30 @@
+/* -----------------------------------------------------------------------
+ *
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2 or (at your
+ * option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Simple helper function for initializing a register set.
+ *
+ * Note that this sets EFLAGS_CF in the input register set; this
+ * makes it easier to catch functions which do nothing but don't
+ * explicitly set CF.
+ */
+
+#include "boot.h"
+#include "string.h"
+
+void initregs(struct biosregs *reg)
+{
+ memset(reg, 0, sizeof *reg);
+ reg->eflags |= X86_EFLAGS_CF;
+ reg->ds = ds();
+ reg->es = ds();
+ reg->fs = fs();
+ reg->gs = gs();
+}
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
new file mode 100644
index 000000000..96a6c7563
--- /dev/null
+++ b/arch/x86/boot/setup.ld
@@ -0,0 +1,64 @@
+/*
+ * setup.ld
+ *
+ * Linker script for the i386 setup code
+ */
+OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
+OUTPUT_ARCH(i386)
+ENTRY(_start)
+
+SECTIONS
+{
+ . = 0;
+ .bstext : { *(.bstext) }
+ .bsdata : { *(.bsdata) }
+
+ . = 495;
+ .header : { *(.header) }
+ .entrytext : { *(.entrytext) }
+ .inittext : { *(.inittext) }
+ .initdata : { *(.initdata) }
+ __end_init = .;
+
+ .text : { *(.text) }
+ .text32 : { *(.text32) }
+
+ . = ALIGN(16);
+ .rodata : { *(.rodata*) }
+
+ .videocards : {
+ video_cards = .;
+ *(.videocards)
+ video_cards_end = .;
+ }
+
+ . = ALIGN(16);
+ .data : { *(.data*) }
+
+ .signature : {
+ setup_sig = .;
+ LONG(0x5a5aaa55)
+ }
+
+
+ . = ALIGN(16);
+ .bss :
+ {
+ __bss_start = .;
+ *(.bss)
+ __bss_end = .;
+ }
+ . = ALIGN(16);
+ _end = .;
+
+ /DISCARD/ : { *(.note*) }
+
+ /*
+ * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
+ */
+ . = ASSERT(_end <= 0x8000, "Setup too big!");
+ . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
+ /* Necessary for the very-old-loader check to work... */
+ . = ASSERT(__end_init <= 5*512, "init sections too big!");
+
+}
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
new file mode 100644
index 000000000..2622c0742
--- /dev/null
+++ b/arch/x86/boot/string.c
@@ -0,0 +1,197 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Very basic string functions
+ */
+
+#include <linux/types.h>
+#include <asm/asm.h>
+#include "ctype.h"
+#include "string.h"
+
+/*
+ * Undef these macros so that the functions that we provide
+ * here will have the correct names regardless of how string.h
+ * may have chosen to #define them.
+ */
+#undef memcpy
+#undef memset
+#undef memcmp
+
+int memcmp(const void *s1, const void *s2, size_t len)
+{
+ bool diff;
+ asm("repe; cmpsb" CC_SET(nz)
+ : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len));
+ return diff;
+}
+
+/*
+ * Clang may lower `memcmp == 0` to `bcmp == 0`.
+ */
+int bcmp(const void *s1, const void *s2, size_t len)
+{
+ return memcmp(s1, s2, len);
+}
+
+int strcmp(const char *str1, const char *str2)
+{
+ const unsigned char *s1 = (const unsigned char *)str1;
+ const unsigned char *s2 = (const unsigned char *)str2;
+ int delta = 0;
+
+ while (*s1 || *s2) {
+ delta = *s1 - *s2;
+ if (delta)
+ return delta;
+ s1++;
+ s2++;
+ }
+ return 0;
+}
+
+int strncmp(const char *cs, const char *ct, size_t count)
+{
+ unsigned char c1, c2;
+
+ while (count) {
+ c1 = *cs++;
+ c2 = *ct++;
+ if (c1 != c2)
+ return c1 < c2 ? -1 : 1;
+ if (!c1)
+ break;
+ count--;
+ }
+ return 0;
+}
+
+size_t strnlen(const char *s, size_t maxlen)
+{
+ const char *es = s;
+ while (*es && maxlen) {
+ es++;
+ maxlen--;
+ }
+
+ return (es - s);
+}
+
+unsigned int atou(const char *s)
+{
+ unsigned int i = 0;
+ while (isdigit(*s))
+ i = i * 10 + (*s++ - '0');
+ return i;
+}
+
+/* Works only for digits and letters, but small and fast */
+#define TOLOWER(x) ((x) | 0x20)
+
+static unsigned int simple_guess_base(const char *cp)
+{
+ if (cp[0] == '0') {
+ if (TOLOWER(cp[1]) == 'x' && isxdigit(cp[2]))
+ return 16;
+ else
+ return 8;
+ } else {
+ return 10;
+ }
+}
+
+/**
+ * simple_strtoull - convert a string to an unsigned long long
+ * @cp: The start of the string
+ * @endp: A pointer to the end of the parsed string will be placed here
+ * @base: The number base to use
+ */
+
+unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)
+{
+ unsigned long long result = 0;
+
+ if (!base)
+ base = simple_guess_base(cp);
+
+ if (base == 16 && cp[0] == '0' && TOLOWER(cp[1]) == 'x')
+ cp += 2;
+
+ while (isxdigit(*cp)) {
+ unsigned int value;
+
+ value = isdigit(*cp) ? *cp - '0' : TOLOWER(*cp) - 'a' + 10;
+ if (value >= base)
+ break;
+ result = result * base + value;
+ cp++;
+ }
+ if (endp)
+ *endp = (char *)cp;
+
+ return result;
+}
+
+long simple_strtol(const char *cp, char **endp, unsigned int base)
+{
+ if (*cp == '-')
+ return -simple_strtoull(cp + 1, endp, base);
+
+ return simple_strtoull(cp, endp, base);
+}
+
+/**
+ * strlen - Find the length of a string
+ * @s: The string to be sized
+ */
+size_t strlen(const char *s)
+{
+ const char *sc;
+
+ for (sc = s; *sc != '\0'; ++sc)
+ /* nothing */;
+ return sc - s;
+}
+
+/**
+ * strstr - Find the first substring in a %NUL terminated string
+ * @s1: The string to be searched
+ * @s2: The string to search for
+ */
+char *strstr(const char *s1, const char *s2)
+{
+ size_t l1, l2;
+
+ l2 = strlen(s2);
+ if (!l2)
+ return (char *)s1;
+ l1 = strlen(s1);
+ while (l1 >= l2) {
+ l1--;
+ if (!memcmp(s1, s2, l2))
+ return (char *)s1;
+ s1++;
+ }
+ return NULL;
+}
+
+/**
+ * strchr - Find the first occurrence of the character c in the string s.
+ * @s: the string to be searched
+ * @c: the character to search for
+ */
+char *strchr(const char *s, int c)
+{
+ while (*s != (char)c)
+ if (*s++ == '\0')
+ return NULL;
+ return (char *)s;
+}
diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h
new file mode 100644
index 000000000..3d78e2707
--- /dev/null
+++ b/arch/x86/boot/string.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_STRING_H
+#define BOOT_STRING_H
+
+/* Undef any of these macros coming from string_32.h. */
+#undef memcpy
+#undef memset
+#undef memcmp
+
+void *memcpy(void *dst, const void *src, size_t len);
+void *memset(void *dst, int c, size_t len);
+int memcmp(const void *s1, const void *s2, size_t len);
+
+/*
+ * Access builtin version by default. If one needs to use optimized version,
+ * do "undef memcpy" in .c file and link against right string.c
+ */
+#define memcpy(d,s,l) __builtin_memcpy(d,s,l)
+#define memset(d,c,l) __builtin_memset(d,c,l)
+#define memcmp __builtin_memcmp
+
+extern int strcmp(const char *str1, const char *str2);
+extern int strncmp(const char *cs, const char *ct, size_t count);
+extern size_t strlen(const char *s);
+extern char *strstr(const char *s1, const char *s2);
+extern char *strchr(const char *s, int c);
+extern size_t strnlen(const char *s, size_t maxlen);
+extern unsigned int atou(const char *s);
+extern unsigned long long simple_strtoull(const char *cp, char **endp,
+ unsigned int base);
+
+#endif /* BOOT_STRING_H */
diff --git a/arch/x86/boot/tools/.gitignore b/arch/x86/boot/tools/.gitignore
new file mode 100644
index 000000000..378eac25d
--- /dev/null
+++ b/arch/x86/boot/tools/.gitignore
@@ -0,0 +1 @@
+build
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
new file mode 100644
index 000000000..bf0e82400
--- /dev/null
+++ b/arch/x86/boot/tools/build.c
@@ -0,0 +1,442 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 1997 Martin Mares
+ * Copyright (C) 2007 H. Peter Anvin
+ */
+
+/*
+ * This file builds a disk-image from three different files:
+ *
+ * - setup: 8086 machine code, sets up system parm
+ * - system: 80386 code for actual system
+ * - zoffset.h: header with ZO_* defines
+ *
+ * It does some checking that all files are of the correct type, and writes
+ * the result to the specified destination, removing headers and padding to
+ * the right amount. It also writes some system data to stdout.
+ */
+
+/*
+ * Changes by tytso to allow root device specification
+ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
+ * Cross compiling fixes by Gertjan van Wingerde, July 1996
+ * Rewritten by Martin Mares, April 1997
+ * Substantially overhauled by H. Peter Anvin, April 2007
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <tools/le_byteshift.h>
+
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned int u32;
+
+#define DEFAULT_MAJOR_ROOT 0
+#define DEFAULT_MINOR_ROOT 0
+#define DEFAULT_ROOT_DEV (DEFAULT_MAJOR_ROOT << 8 | DEFAULT_MINOR_ROOT)
+
+/* Minimal number of setup sectors */
+#define SETUP_SECT_MIN 5
+#define SETUP_SECT_MAX 64
+
+/* This must be large enough to hold the entire setup */
+u8 buf[SETUP_SECT_MAX*512];
+
+#define PECOFF_RELOC_RESERVE 0x20
+
+unsigned long efi32_stub_entry;
+unsigned long efi64_stub_entry;
+unsigned long efi_pe_entry;
+unsigned long startup_64;
+
+/*----------------------------------------------------------------------*/
+
+static const u32 crctab32[] = {
+ 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419,
+ 0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4,
+ 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07,
+ 0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
+ 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856,
+ 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
+ 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4,
+ 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
+ 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3,
+ 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a,
+ 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599,
+ 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+ 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190,
+ 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
+ 0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e,
+ 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
+ 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed,
+ 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
+ 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3,
+ 0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
+ 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a,
+ 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5,
+ 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010,
+ 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17,
+ 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6,
+ 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615,
+ 0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
+ 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344,
+ 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
+ 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a,
+ 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
+ 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1,
+ 0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c,
+ 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef,
+ 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+ 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe,
+ 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31,
+ 0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c,
+ 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
+ 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b,
+ 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
+ 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1,
+ 0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
+ 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278,
+ 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7,
+ 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66,
+ 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605,
+ 0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8,
+ 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b,
+ 0x2d02ef8d
+};
+
+static u32 partial_crc32_one(u8 c, u32 crc)
+{
+ return crctab32[(crc ^ c) & 0xff] ^ (crc >> 8);
+}
+
+static u32 partial_crc32(const u8 *s, int len, u32 crc)
+{
+ while (len--)
+ crc = partial_crc32_one(*s++, crc);
+ return crc;
+}
+
+static void die(const char * str, ...)
+{
+ va_list args;
+ va_start(args, str);
+ vfprintf(stderr, str, args);
+ fputc('\n', stderr);
+ exit(1);
+}
+
+static void usage(void)
+{
+ die("Usage: build setup system zoffset.h image");
+}
+
+#ifdef CONFIG_EFI_STUB
+
+static void update_pecoff_section_header_fields(char *section_name, u32 vma, u32 size, u32 datasz, u32 offset)
+{
+ unsigned int pe_header;
+ unsigned short num_sections;
+ u8 *section;
+
+ pe_header = get_unaligned_le32(&buf[0x3c]);
+ num_sections = get_unaligned_le16(&buf[pe_header + 6]);
+
+#ifdef CONFIG_X86_32
+ section = &buf[pe_header + 0xa8];
+#else
+ section = &buf[pe_header + 0xb8];
+#endif
+
+ while (num_sections > 0) {
+ if (strncmp((char*)section, section_name, 8) == 0) {
+ /* section header size field */
+ put_unaligned_le32(size, section + 0x8);
+
+ /* section header vma field */
+ put_unaligned_le32(vma, section + 0xc);
+
+ /* section header 'size of initialised data' field */
+ put_unaligned_le32(datasz, section + 0x10);
+
+ /* section header 'file offset' field */
+ put_unaligned_le32(offset, section + 0x14);
+
+ break;
+ }
+ section += 0x28;
+ num_sections--;
+ }
+}
+
+static void update_pecoff_section_header(char *section_name, u32 offset, u32 size)
+{
+ update_pecoff_section_header_fields(section_name, offset, size, size, offset);
+}
+
+static void update_pecoff_setup_and_reloc(unsigned int size)
+{
+ u32 setup_offset = 0x200;
+ u32 reloc_offset = size - PECOFF_RELOC_RESERVE;
+ u32 setup_size = reloc_offset - setup_offset;
+
+ update_pecoff_section_header(".setup", setup_offset, setup_size);
+ update_pecoff_section_header(".reloc", reloc_offset, PECOFF_RELOC_RESERVE);
+
+ /*
+ * Modify .reloc section contents with a single entry. The
+ * relocation is applied to offset 10 of the relocation section.
+ */
+ put_unaligned_le32(reloc_offset + 10, &buf[reloc_offset]);
+ put_unaligned_le32(10, &buf[reloc_offset + 4]);
+}
+
+static void update_pecoff_text(unsigned int text_start, unsigned int file_sz)
+{
+ unsigned int pe_header;
+ unsigned int text_sz = file_sz - text_start;
+
+ pe_header = get_unaligned_le32(&buf[0x3c]);
+
+ /*
+ * Size of code: Subtract the size of the first sector (512 bytes)
+ * which includes the header.
+ */
+ put_unaligned_le32(file_sz - 512, &buf[pe_header + 0x1c]);
+
+ /*
+ * Address of entry point for PE/COFF executable
+ */
+ put_unaligned_le32(text_start + efi_pe_entry, &buf[pe_header + 0x28]);
+
+ update_pecoff_section_header(".text", text_start, text_sz);
+}
+
+static void update_pecoff_bss(unsigned int file_sz, unsigned int init_sz)
+{
+ unsigned int pe_header;
+ unsigned int bss_sz = init_sz - file_sz;
+
+ pe_header = get_unaligned_le32(&buf[0x3c]);
+
+ /* Size of uninitialized data */
+ put_unaligned_le32(bss_sz, &buf[pe_header + 0x24]);
+
+ /* Size of image */
+ put_unaligned_le32(init_sz, &buf[pe_header + 0x50]);
+
+ update_pecoff_section_header_fields(".bss", file_sz, bss_sz, 0, 0);
+}
+
+static int reserve_pecoff_reloc_section(int c)
+{
+ /* Reserve 0x20 bytes for .reloc section */
+ memset(buf+c, 0, PECOFF_RELOC_RESERVE);
+ return PECOFF_RELOC_RESERVE;
+}
+
+static void efi_stub_defaults(void)
+{
+ /* Defaults for old kernel */
+#ifdef CONFIG_X86_32
+ efi_pe_entry = 0x10;
+#else
+ efi_pe_entry = 0x210;
+ startup_64 = 0x200;
+#endif
+}
+
+static void efi_stub_entry_update(void)
+{
+ unsigned long addr = efi32_stub_entry;
+
+#ifdef CONFIG_X86_64
+ /* Yes, this is really how we defined it :( */
+ addr = efi64_stub_entry - 0x200;
+#endif
+
+#ifdef CONFIG_EFI_MIXED
+ if (efi32_stub_entry != addr)
+ die("32-bit and 64-bit EFI entry points do not match\n");
+#endif
+ put_unaligned_le32(addr, &buf[0x264]);
+}
+
+#else
+
+static inline void update_pecoff_setup_and_reloc(unsigned int size) {}
+static inline void update_pecoff_text(unsigned int text_start,
+ unsigned int file_sz) {}
+static inline void update_pecoff_bss(unsigned int file_sz,
+ unsigned int init_sz) {}
+static inline void efi_stub_defaults(void) {}
+static inline void efi_stub_entry_update(void) {}
+
+static inline int reserve_pecoff_reloc_section(int c)
+{
+ return 0;
+}
+#endif /* CONFIG_EFI_STUB */
+
+
+/*
+ * Parse zoffset.h and find the entry points. We could just #include zoffset.h
+ * but that would mean tools/build would have to be rebuilt every time. It's
+ * not as if parsing it is hard...
+ */
+#define PARSE_ZOFS(p, sym) do { \
+ if (!strncmp(p, "#define ZO_" #sym " ", 11+sizeof(#sym))) \
+ sym = strtoul(p + 11 + sizeof(#sym), NULL, 16); \
+} while (0)
+
+static void parse_zoffset(char *fname)
+{
+ FILE *file;
+ char *p;
+ int c;
+
+ file = fopen(fname, "r");
+ if (!file)
+ die("Unable to open `%s': %m", fname);
+ c = fread(buf, 1, sizeof(buf) - 1, file);
+ if (ferror(file))
+ die("read-error on `zoffset.h'");
+ fclose(file);
+ buf[c] = 0;
+
+ p = (char *)buf;
+
+ while (p && *p) {
+ PARSE_ZOFS(p, efi32_stub_entry);
+ PARSE_ZOFS(p, efi64_stub_entry);
+ PARSE_ZOFS(p, efi_pe_entry);
+ PARSE_ZOFS(p, startup_64);
+
+ p = strchr(p, '\n');
+ while (p && (*p == '\r' || *p == '\n'))
+ p++;
+ }
+}
+
+int main(int argc, char ** argv)
+{
+ unsigned int i, sz, setup_sectors, init_sz;
+ int c;
+ u32 sys_size;
+ struct stat sb;
+ FILE *file, *dest;
+ int fd;
+ void *kernel;
+ u32 crc = 0xffffffffUL;
+
+ efi_stub_defaults();
+
+ if (argc != 5)
+ usage();
+ parse_zoffset(argv[3]);
+
+ dest = fopen(argv[4], "w");
+ if (!dest)
+ die("Unable to write `%s': %m", argv[4]);
+
+ /* Copy the setup code */
+ file = fopen(argv[1], "r");
+ if (!file)
+ die("Unable to open `%s': %m", argv[1]);
+ c = fread(buf, 1, sizeof(buf), file);
+ if (ferror(file))
+ die("read-error on `setup'");
+ if (c < 1024)
+ die("The setup must be at least 1024 bytes");
+ if (get_unaligned_le16(&buf[510]) != 0xAA55)
+ die("Boot block hasn't got boot flag (0xAA55)");
+ fclose(file);
+
+ c += reserve_pecoff_reloc_section(c);
+
+ /* Pad unused space with zeros */
+ setup_sectors = (c + 511) / 512;
+ if (setup_sectors < SETUP_SECT_MIN)
+ setup_sectors = SETUP_SECT_MIN;
+ i = setup_sectors*512;
+ memset(buf+c, 0, i-c);
+
+ update_pecoff_setup_and_reloc(i);
+
+ /* Set the default root device */
+ put_unaligned_le16(DEFAULT_ROOT_DEV, &buf[508]);
+
+ printf("Setup is %d bytes (padded to %d bytes).\n", c, i);
+
+ /* Open and stat the kernel file */
+ fd = open(argv[2], O_RDONLY);
+ if (fd < 0)
+ die("Unable to open `%s': %m", argv[2]);
+ if (fstat(fd, &sb))
+ die("Unable to stat `%s': %m", argv[2]);
+ sz = sb.st_size;
+ printf("System is %d kB\n", (sz+1023)/1024);
+ kernel = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0);
+ if (kernel == MAP_FAILED)
+ die("Unable to mmap '%s': %m", argv[2]);
+ /* Number of 16-byte paragraphs, including space for a 4-byte CRC */
+ sys_size = (sz + 15 + 4) / 16;
+#ifdef CONFIG_EFI_STUB
+ /*
+ * COFF requires minimum 32-byte alignment of sections, and
+ * adding a signature is problematic without that alignment.
+ */
+ sys_size = (sys_size + 1) & ~1;
+#endif
+
+ /* Patch the setup code with the appropriate size parameters */
+ buf[0x1f1] = setup_sectors-1;
+ put_unaligned_le32(sys_size, &buf[0x1f4]);
+
+ update_pecoff_text(setup_sectors * 512, i + (sys_size * 16));
+ init_sz = get_unaligned_le32(&buf[0x260]);
+ update_pecoff_bss(i + (sys_size * 16), init_sz);
+
+ efi_stub_entry_update();
+
+ crc = partial_crc32(buf, i, crc);
+ if (fwrite(buf, 1, i, dest) != i)
+ die("Writing setup failed");
+
+ /* Copy the kernel code */
+ crc = partial_crc32(kernel, sz, crc);
+ if (fwrite(kernel, 1, sz, dest) != sz)
+ die("Writing kernel failed");
+
+ /* Add padding leaving 4 bytes for the checksum */
+ while (sz++ < (sys_size*16) - 4) {
+ crc = partial_crc32_one('\0', crc);
+ if (fwrite("\0", 1, 1, dest) != 1)
+ die("Writing padding failed");
+ }
+
+ /* Write the CRC */
+ printf("CRC %x\n", crc);
+ put_unaligned_le32(crc, buf);
+ if (fwrite(buf, 1, 4, dest) != 4)
+ die("Writing CRC failed");
+
+ /* Catch any delayed write failures */
+ if (fclose(dest))
+ die("Writing image failed");
+
+ close(fd);
+
+ /* Everything is OK */
+ return 0;
+}
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c
new file mode 100644
index 000000000..def2451f4
--- /dev/null
+++ b/arch/x86/boot/tty.c
@@ -0,0 +1,139 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Very simple screen and serial I/O
+ */
+
+#include "boot.h"
+
+int early_serial_base;
+
+#define XMTRDY 0x20
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define LSR 5 /* Line Status */
+
+/*
+ * These functions are in .inittext so they can be used to signal
+ * error during initialization.
+ */
+
+static void __attribute__((section(".inittext"))) serial_putchar(int ch)
+{
+ unsigned timeout = 0xffff;
+
+ while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
+ cpu_relax();
+
+ outb(ch, early_serial_base + TXR);
+}
+
+static void __attribute__((section(".inittext"))) bios_putchar(int ch)
+{
+ struct biosregs ireg;
+
+ initregs(&ireg);
+ ireg.bx = 0x0007;
+ ireg.cx = 0x0001;
+ ireg.ah = 0x0e;
+ ireg.al = ch;
+ intcall(0x10, &ireg, NULL);
+}
+
+void __attribute__((section(".inittext"))) putchar(int ch)
+{
+ if (ch == '\n')
+ putchar('\r'); /* \n -> \r\n */
+
+ bios_putchar(ch);
+
+ if (early_serial_base != 0)
+ serial_putchar(ch);
+}
+
+void __attribute__((section(".inittext"))) puts(const char *str)
+{
+ while (*str)
+ putchar(*str++);
+}
+
+/*
+ * Read the CMOS clock through the BIOS, and return the
+ * seconds in BCD.
+ */
+
+static u8 gettime(void)
+{
+ struct biosregs ireg, oreg;
+
+ initregs(&ireg);
+ ireg.ah = 0x02;
+ intcall(0x1a, &ireg, &oreg);
+
+ return oreg.dh;
+}
+
+/*
+ * Read from the keyboard
+ */
+int getchar(void)
+{
+ struct biosregs ireg, oreg;
+
+ initregs(&ireg);
+ /* ireg.ah = 0x00; */
+ intcall(0x16, &ireg, &oreg);
+
+ return oreg.al;
+}
+
+static int kbd_pending(void)
+{
+ struct biosregs ireg, oreg;
+
+ initregs(&ireg);
+ ireg.ah = 0x01;
+ intcall(0x16, &ireg, &oreg);
+
+ return !(oreg.eflags & X86_EFLAGS_ZF);
+}
+
+void kbd_flush(void)
+{
+ for (;;) {
+ if (!kbd_pending())
+ break;
+ getchar();
+ }
+}
+
+int getchar_timeout(void)
+{
+ int cnt = 30;
+ int t0, t1;
+
+ t0 = gettime();
+
+ while (cnt) {
+ if (kbd_pending())
+ return getchar();
+
+ t1 = gettime();
+ if (t0 != t1) {
+ cnt--;
+ t0 = t1;
+ }
+ }
+
+ return 0; /* Timeout! */
+}
+
diff --git a/arch/x86/boot/version.c b/arch/x86/boot/version.c
new file mode 100644
index 000000000..2b15aa488
--- /dev/null
+++ b/arch/x86/boot/version.c
@@ -0,0 +1,21 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Kernel version string
+ */
+
+#include "boot.h"
+#include <generated/utsrelease.h>
+#include <generated/compile.h>
+
+const char kernel_version[] =
+ UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") "
+ UTS_VERSION;
diff --git a/arch/x86/boot/vesa.h b/arch/x86/boot/vesa.h
new file mode 100644
index 000000000..468e44462
--- /dev/null
+++ b/arch/x86/boot/vesa.h
@@ -0,0 +1,72 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 1999-2007 H. Peter Anvin - All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
+ * (at your option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+#ifndef BOOT_VESA_H
+#define BOOT_VESA_H
+
+typedef struct {
+ u16 off, seg;
+} far_ptr;
+
+/* VESA General Information table */
+struct vesa_general_info {
+ u32 signature; /* 0 Magic number = "VESA" */
+ u16 version; /* 4 */
+ far_ptr vendor_string; /* 6 */
+ u32 capabilities; /* 10 */
+ far_ptr video_mode_ptr; /* 14 */
+ u16 total_memory; /* 18 */
+
+ u8 reserved[236]; /* 20 */
+} __attribute__ ((packed));
+
+#define VESA_MAGIC ('V' + ('E' << 8) + ('S' << 16) + ('A' << 24))
+
+struct vesa_mode_info {
+ u16 mode_attr; /* 0 */
+ u8 win_attr[2]; /* 2 */
+ u16 win_grain; /* 4 */
+ u16 win_size; /* 6 */
+ u16 win_seg[2]; /* 8 */
+ far_ptr win_scheme; /* 12 */
+ u16 logical_scan; /* 16 */
+
+ u16 h_res; /* 18 */
+ u16 v_res; /* 20 */
+ u8 char_width; /* 22 */
+ u8 char_height; /* 23 */
+ u8 memory_planes; /* 24 */
+ u8 bpp; /* 25 */
+ u8 banks; /* 26 */
+ u8 memory_layout; /* 27 */
+ u8 bank_size; /* 28 */
+ u8 image_planes; /* 29 */
+ u8 page_function; /* 30 */
+
+ u8 rmask; /* 31 */
+ u8 rpos; /* 32 */
+ u8 gmask; /* 33 */
+ u8 gpos; /* 34 */
+ u8 bmask; /* 35 */
+ u8 bpos; /* 36 */
+ u8 resv_mask; /* 37 */
+ u8 resv_pos; /* 38 */
+ u8 dcm_info; /* 39 */
+
+ u32 lfb_ptr; /* 40 Linear frame buffer address */
+ u32 offscreen_ptr; /* 44 Offscreen memory address */
+ u16 offscreen_size; /* 48 */
+
+ u8 reserved[206]; /* 50 */
+} __attribute__ ((packed));
+
+#endif /* LIB_SYS_VESA_H */
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
new file mode 100644
index 000000000..49e0c1883
--- /dev/null
+++ b/arch/x86/boot/video-bios.c
@@ -0,0 +1,128 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Standard video BIOS modes
+ *
+ * We have two options for this; silent and scanned.
+ */
+
+#include "boot.h"
+#include "video.h"
+
+static __videocard video_bios;
+
+/* Set a conventional BIOS mode */
+static int set_bios_mode(u8 mode);
+
+static int bios_set_mode(struct mode_info *mi)
+{
+ return set_bios_mode(mi->mode - VIDEO_FIRST_BIOS);
+}
+
+static int set_bios_mode(u8 mode)
+{
+ struct biosregs ireg, oreg;
+ u8 new_mode;
+
+ initregs(&ireg);
+ ireg.al = mode; /* AH=0x00 Set Video Mode */
+ intcall(0x10, &ireg, NULL);
+
+ ireg.ah = 0x0f; /* Get Current Video Mode */
+ intcall(0x10, &ireg, &oreg);
+
+ do_restore = 1; /* Assume video contents were lost */
+
+ /* Not all BIOSes are clean with the top bit */
+ new_mode = oreg.al & 0x7f;
+
+ if (new_mode == mode)
+ return 0; /* Mode change OK */
+
+#ifndef _WAKEUP
+ if (new_mode != boot_params.screen_info.orig_video_mode) {
+ /* Mode setting failed, but we didn't end up where we
+ started. That's bad. Try to revert to the original
+ video mode. */
+ ireg.ax = boot_params.screen_info.orig_video_mode;
+ intcall(0x10, &ireg, NULL);
+ }
+#endif
+ return -1;
+}
+
+static int bios_probe(void)
+{
+ u8 mode;
+#ifdef _WAKEUP
+ u8 saved_mode = 0x03;
+#else
+ u8 saved_mode = boot_params.screen_info.orig_video_mode;
+#endif
+ u16 crtc;
+ struct mode_info *mi;
+ int nmodes = 0;
+
+ if (adapter != ADAPTER_EGA && adapter != ADAPTER_VGA)
+ return 0;
+
+ set_fs(0);
+ crtc = vga_crtc();
+
+ video_bios.modes = GET_HEAP(struct mode_info, 0);
+
+ for (mode = 0x14; mode <= 0x7f; mode++) {
+ if (!heap_free(sizeof(struct mode_info)))
+ break;
+
+ if (mode_defined(VIDEO_FIRST_BIOS+mode))
+ continue;
+
+ if (set_bios_mode(mode))
+ continue;
+
+ /* Try to verify that it's a text mode. */
+
+ /* Attribute Controller: make graphics controller disabled */
+ if (in_idx(0x3c0, 0x10) & 0x01)
+ continue;
+
+ /* Graphics Controller: verify Alpha addressing enabled */
+ if (in_idx(0x3ce, 0x06) & 0x01)
+ continue;
+
+ /* CRTC cursor location low should be zero(?) */
+ if (in_idx(crtc, 0x0f))
+ continue;
+
+ mi = GET_HEAP(struct mode_info, 1);
+ mi->mode = VIDEO_FIRST_BIOS+mode;
+ mi->depth = 0; /* text */
+ mi->x = rdfs16(0x44a);
+ mi->y = rdfs8(0x484)+1;
+ nmodes++;
+ }
+
+ set_bios_mode(saved_mode);
+
+ return nmodes;
+}
+
+static __videocard video_bios =
+{
+ .card_name = "BIOS",
+ .probe = bios_probe,
+ .set_mode = bios_set_mode,
+ .unsafe = 1,
+ .xmode_first = VIDEO_FIRST_BIOS,
+ .xmode_n = 0x80,
+};
diff --git a/arch/x86/boot/video-mode.c b/arch/x86/boot/video-mode.c
new file mode 100644
index 000000000..95c7a818c
--- /dev/null
+++ b/arch/x86/boot/video-mode.c
@@ -0,0 +1,173 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007-2008 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/video-mode.c
+ *
+ * Set the video mode. This is separated out into a different
+ * file in order to be shared with the ACPI wakeup code.
+ */
+
+#include "boot.h"
+#include "video.h"
+#include "vesa.h"
+
+#include <uapi/asm/boot.h>
+
+/*
+ * Common variables
+ */
+int adapter; /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */
+int force_x, force_y; /* Don't query the BIOS for cols/rows */
+int do_restore; /* Screen contents changed during mode flip */
+int graphic_mode; /* Graphic mode with linear frame buffer */
+
+/* Probe the video drivers and have them generate their mode lists. */
+void probe_cards(int unsafe)
+{
+ struct card_info *card;
+ static u8 probed[2];
+
+ if (probed[unsafe])
+ return;
+
+ probed[unsafe] = 1;
+
+ for (card = video_cards; card < video_cards_end; card++) {
+ if (card->unsafe == unsafe) {
+ if (card->probe)
+ card->nmodes = card->probe();
+ else
+ card->nmodes = 0;
+ }
+ }
+}
+
+/* Test if a mode is defined */
+int mode_defined(u16 mode)
+{
+ struct card_info *card;
+ struct mode_info *mi;
+ int i;
+
+ for (card = video_cards; card < video_cards_end; card++) {
+ mi = card->modes;
+ for (i = 0; i < card->nmodes; i++, mi++) {
+ if (mi->mode == mode)
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/* Set mode (without recalc) */
+static int raw_set_mode(u16 mode, u16 *real_mode)
+{
+ int nmode, i;
+ struct card_info *card;
+ struct mode_info *mi;
+
+ /* Drop the recalc bit if set */
+ mode &= ~VIDEO_RECALC;
+
+ /* Scan for mode based on fixed ID, position, or resolution */
+ nmode = 0;
+ for (card = video_cards; card < video_cards_end; card++) {
+ mi = card->modes;
+ for (i = 0; i < card->nmodes; i++, mi++) {
+ int visible = mi->x || mi->y;
+
+ if ((mode == nmode && visible) ||
+ mode == mi->mode ||
+ mode == (mi->y << 8)+mi->x) {
+ *real_mode = mi->mode;
+ return card->set_mode(mi);
+ }
+
+ if (visible)
+ nmode++;
+ }
+ }
+
+ /* Nothing found? Is it an "exceptional" (unprobed) mode? */
+ for (card = video_cards; card < video_cards_end; card++) {
+ if (mode >= card->xmode_first &&
+ mode < card->xmode_first+card->xmode_n) {
+ struct mode_info mix;
+ *real_mode = mix.mode = mode;
+ mix.x = mix.y = 0;
+ return card->set_mode(&mix);
+ }
+ }
+
+ /* Otherwise, failure... */
+ return -1;
+}
+
+/*
+ * Recalculate the vertical video cutoff (hack!)
+ */
+static void vga_recalc_vertical(void)
+{
+ unsigned int font_size, rows;
+ u16 crtc;
+ u8 pt, ov;
+
+ set_fs(0);
+ font_size = rdfs8(0x485); /* BIOS: font size (pixels) */
+ rows = force_y ? force_y : rdfs8(0x484)+1; /* Text rows */
+
+ rows *= font_size; /* Visible scan lines */
+ rows--; /* ... minus one */
+
+ crtc = vga_crtc();
+
+ pt = in_idx(crtc, 0x11);
+ pt &= ~0x80; /* Unlock CR0-7 */
+ out_idx(pt, crtc, 0x11);
+
+ out_idx((u8)rows, crtc, 0x12); /* Lower height register */
+
+ ov = in_idx(crtc, 0x07); /* Overflow register */
+ ov &= 0xbd;
+ ov |= (rows >> (8-1)) & 0x02;
+ ov |= (rows >> (9-6)) & 0x40;
+ out_idx(ov, crtc, 0x07);
+}
+
+/* Set mode (with recalc if specified) */
+int set_mode(u16 mode)
+{
+ int rv;
+ u16 real_mode;
+
+ /* Very special mode numbers... */
+ if (mode == VIDEO_CURRENT_MODE)
+ return 0; /* Nothing to do... */
+ else if (mode == NORMAL_VGA)
+ mode = VIDEO_80x25;
+ else if (mode == EXTENDED_VGA)
+ mode = VIDEO_8POINT;
+
+ rv = raw_set_mode(mode, &real_mode);
+ if (rv)
+ return rv;
+
+ if (mode & VIDEO_RECALC)
+ vga_recalc_vertical();
+
+ /* Save the canonical mode number for the kernel, not
+ an alias, size specification or menu position */
+#ifndef _WAKEUP
+ boot_params.hdr.vid_mode = real_mode;
+#endif
+ return 0;
+}
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
new file mode 100644
index 000000000..ba3e10065
--- /dev/null
+++ b/arch/x86/boot/video-vesa.c
@@ -0,0 +1,281 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * VESA text modes
+ */
+
+#include "boot.h"
+#include "video.h"
+#include "vesa.h"
+#include "string.h"
+
+/* VESA information */
+static struct vesa_general_info vginfo;
+static struct vesa_mode_info vminfo;
+
+static __videocard video_vesa;
+
+#ifndef _WAKEUP
+static void vesa_store_mode_params_graphics(void);
+#else /* _WAKEUP */
+static inline void vesa_store_mode_params_graphics(void) {}
+#endif /* _WAKEUP */
+
+static int vesa_probe(void)
+{
+ struct biosregs ireg, oreg;
+ u16 mode;
+ addr_t mode_ptr;
+ struct mode_info *mi;
+ int nmodes = 0;
+
+ video_vesa.modes = GET_HEAP(struct mode_info, 0);
+
+ initregs(&ireg);
+ ireg.ax = 0x4f00;
+ ireg.di = (size_t)&vginfo;
+ intcall(0x10, &ireg, &oreg);
+
+ if (oreg.ax != 0x004f ||
+ vginfo.signature != VESA_MAGIC ||
+ vginfo.version < 0x0102)
+ return 0; /* Not present */
+
+ set_fs(vginfo.video_mode_ptr.seg);
+ mode_ptr = vginfo.video_mode_ptr.off;
+
+ while ((mode = rdfs16(mode_ptr)) != 0xffff) {
+ mode_ptr += 2;
+
+ if (!heap_free(sizeof(struct mode_info)))
+ break; /* Heap full, can't save mode info */
+
+ if (mode & ~0x1ff)
+ continue;
+
+ memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
+
+ ireg.ax = 0x4f01;
+ ireg.cx = mode;
+ ireg.di = (size_t)&vminfo;
+ intcall(0x10, &ireg, &oreg);
+
+ if (oreg.ax != 0x004f)
+ continue;
+
+ if ((vminfo.mode_attr & 0x15) == 0x05) {
+ /* Text Mode, TTY BIOS supported,
+ supported by hardware */
+ mi = GET_HEAP(struct mode_info, 1);
+ mi->mode = mode + VIDEO_FIRST_VESA;
+ mi->depth = 0; /* text */
+ mi->x = vminfo.h_res;
+ mi->y = vminfo.v_res;
+ nmodes++;
+ } else if ((vminfo.mode_attr & 0x99) == 0x99 &&
+ (vminfo.memory_layout == 4 ||
+ vminfo.memory_layout == 6) &&
+ vminfo.memory_planes == 1) {
+#ifdef CONFIG_FB_BOOT_VESA_SUPPORT
+ /* Graphics mode, color, linear frame buffer
+ supported. Only register the mode if
+ if framebuffer is configured, however,
+ otherwise the user will be left without a screen. */
+ mi = GET_HEAP(struct mode_info, 1);
+ mi->mode = mode + VIDEO_FIRST_VESA;
+ mi->depth = vminfo.bpp;
+ mi->x = vminfo.h_res;
+ mi->y = vminfo.v_res;
+ nmodes++;
+#endif
+ }
+ }
+
+ return nmodes;
+}
+
+static int vesa_set_mode(struct mode_info *mode)
+{
+ struct biosregs ireg, oreg;
+ int is_graphic;
+ u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA;
+
+ memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
+
+ initregs(&ireg);
+ ireg.ax = 0x4f01;
+ ireg.cx = vesa_mode;
+ ireg.di = (size_t)&vminfo;
+ intcall(0x10, &ireg, &oreg);
+
+ if (oreg.ax != 0x004f)
+ return -1;
+
+ if ((vminfo.mode_attr & 0x15) == 0x05) {
+ /* It's a supported text mode */
+ is_graphic = 0;
+#ifdef CONFIG_FB_BOOT_VESA_SUPPORT
+ } else if ((vminfo.mode_attr & 0x99) == 0x99) {
+ /* It's a graphics mode with linear frame buffer */
+ is_graphic = 1;
+ vesa_mode |= 0x4000; /* Request linear frame buffer */
+#endif
+ } else {
+ return -1; /* Invalid mode */
+ }
+
+
+ initregs(&ireg);
+ ireg.ax = 0x4f02;
+ ireg.bx = vesa_mode;
+ intcall(0x10, &ireg, &oreg);
+
+ if (oreg.ax != 0x004f)
+ return -1;
+
+ graphic_mode = is_graphic;
+ if (!is_graphic) {
+ /* Text mode */
+ force_x = mode->x;
+ force_y = mode->y;
+ do_restore = 1;
+ } else {
+ /* Graphics mode */
+ vesa_store_mode_params_graphics();
+ }
+
+ return 0;
+}
+
+
+#ifndef _WAKEUP
+
+/* Switch DAC to 8-bit mode */
+static void vesa_dac_set_8bits(void)
+{
+ struct biosregs ireg, oreg;
+ u8 dac_size = 6;
+
+ /* If possible, switch the DAC to 8-bit mode */
+ if (vginfo.capabilities & 1) {
+ initregs(&ireg);
+ ireg.ax = 0x4f08;
+ ireg.bh = 0x08;
+ intcall(0x10, &ireg, &oreg);
+ if (oreg.ax == 0x004f)
+ dac_size = oreg.bh;
+ }
+
+ /* Set the color sizes to the DAC size, and offsets to 0 */
+ boot_params.screen_info.red_size = dac_size;
+ boot_params.screen_info.green_size = dac_size;
+ boot_params.screen_info.blue_size = dac_size;
+ boot_params.screen_info.rsvd_size = dac_size;
+
+ boot_params.screen_info.red_pos = 0;
+ boot_params.screen_info.green_pos = 0;
+ boot_params.screen_info.blue_pos = 0;
+ boot_params.screen_info.rsvd_pos = 0;
+}
+
+/* Save the VESA protected mode info */
+static void vesa_store_pm_info(void)
+{
+ struct biosregs ireg, oreg;
+
+ initregs(&ireg);
+ ireg.ax = 0x4f0a;
+ intcall(0x10, &ireg, &oreg);
+
+ if (oreg.ax != 0x004f)
+ return;
+
+ boot_params.screen_info.vesapm_seg = oreg.es;
+ boot_params.screen_info.vesapm_off = oreg.di;
+}
+
+/*
+ * Save video mode parameters for graphics mode
+ */
+static void vesa_store_mode_params_graphics(void)
+{
+ /* Tell the kernel we're in VESA graphics mode */
+ boot_params.screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB;
+
+ /* Mode parameters */
+ boot_params.screen_info.vesa_attributes = vminfo.mode_attr;
+ boot_params.screen_info.lfb_linelength = vminfo.logical_scan;
+ boot_params.screen_info.lfb_width = vminfo.h_res;
+ boot_params.screen_info.lfb_height = vminfo.v_res;
+ boot_params.screen_info.lfb_depth = vminfo.bpp;
+ boot_params.screen_info.pages = vminfo.image_planes;
+ boot_params.screen_info.lfb_base = vminfo.lfb_ptr;
+ memcpy(&boot_params.screen_info.red_size,
+ &vminfo.rmask, 8);
+
+ /* General parameters */
+ boot_params.screen_info.lfb_size = vginfo.total_memory;
+
+ if (vminfo.bpp <= 8)
+ vesa_dac_set_8bits();
+
+ vesa_store_pm_info();
+}
+
+/*
+ * Save EDID information for the kernel; this is invoked, separately,
+ * after mode-setting.
+ */
+void vesa_store_edid(void)
+{
+#ifdef CONFIG_FIRMWARE_EDID
+ struct biosregs ireg, oreg;
+
+ /* Apparently used as a nonsense token... */
+ memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info);
+
+ if (vginfo.version < 0x0200)
+ return; /* EDID requires VBE 2.0+ */
+
+ initregs(&ireg);
+ ireg.ax = 0x4f15; /* VBE DDC */
+ /* ireg.bx = 0x0000; */ /* Report DDC capabilities */
+ /* ireg.cx = 0; */ /* Controller 0 */
+ ireg.es = 0; /* ES:DI must be 0 by spec */
+ intcall(0x10, &ireg, &oreg);
+
+ if (oreg.ax != 0x004f)
+ return; /* No EDID */
+
+ /* BH = time in seconds to transfer EDD information */
+ /* BL = DDC level supported */
+
+ ireg.ax = 0x4f15; /* VBE DDC */
+ ireg.bx = 0x0001; /* Read EDID */
+ /* ireg.cx = 0; */ /* Controller 0 */
+ /* ireg.dx = 0; */ /* EDID block number */
+ ireg.es = ds();
+ ireg.di =(size_t)&boot_params.edid_info; /* (ES:)Pointer to block */
+ intcall(0x10, &ireg, &oreg);
+#endif /* CONFIG_FIRMWARE_EDID */
+}
+
+#endif /* not _WAKEUP */
+
+static __videocard video_vesa =
+{
+ .card_name = "VESA",
+ .probe = vesa_probe,
+ .set_mode = vesa_set_mode,
+ .xmode_first = VIDEO_FIRST_VESA,
+ .xmode_n = 0x200,
+};
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
new file mode 100644
index 000000000..a14c5178d
--- /dev/null
+++ b/arch/x86/boot/video-vga.c
@@ -0,0 +1,288 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Common all-VGA modes
+ */
+
+#include "boot.h"
+#include "video.h"
+
+static struct mode_info vga_modes[] = {
+ { VIDEO_80x25, 80, 25, 0 },
+ { VIDEO_8POINT, 80, 50, 0 },
+ { VIDEO_80x43, 80, 43, 0 },
+ { VIDEO_80x28, 80, 28, 0 },
+ { VIDEO_80x30, 80, 30, 0 },
+ { VIDEO_80x34, 80, 34, 0 },
+ { VIDEO_80x60, 80, 60, 0 },
+};
+
+static struct mode_info ega_modes[] = {
+ { VIDEO_80x25, 80, 25, 0 },
+ { VIDEO_8POINT, 80, 43, 0 },
+};
+
+static struct mode_info cga_modes[] = {
+ { VIDEO_80x25, 80, 25, 0 },
+};
+
+static __videocard video_vga;
+
+/* Set basic 80x25 mode */
+static u8 vga_set_basic_mode(void)
+{
+ struct biosregs ireg, oreg;
+ u8 mode;
+
+ initregs(&ireg);
+
+ /* Query current mode */
+ ireg.ax = 0x0f00;
+ intcall(0x10, &ireg, &oreg);
+ mode = oreg.al;
+
+ if (mode != 3 && mode != 7)
+ mode = 3;
+
+ /* Set the mode */
+ ireg.ax = mode; /* AH=0: set mode */
+ intcall(0x10, &ireg, NULL);
+ do_restore = 1;
+ return mode;
+}
+
+static void vga_set_8font(void)
+{
+ /* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */
+ struct biosregs ireg;
+
+ initregs(&ireg);
+
+ /* Set 8x8 font */
+ ireg.ax = 0x1112;
+ /* ireg.bl = 0; */
+ intcall(0x10, &ireg, NULL);
+
+ /* Use alternate print screen */
+ ireg.ax = 0x1200;
+ ireg.bl = 0x20;
+ intcall(0x10, &ireg, NULL);
+
+ /* Turn off cursor emulation */
+ ireg.ax = 0x1201;
+ ireg.bl = 0x34;
+ intcall(0x10, &ireg, NULL);
+
+ /* Cursor is scan lines 6-7 */
+ ireg.ax = 0x0100;
+ ireg.cx = 0x0607;
+ intcall(0x10, &ireg, NULL);
+}
+
+static void vga_set_14font(void)
+{
+ /* Set 9x14 font - 80x28 on VGA */
+ struct biosregs ireg;
+
+ initregs(&ireg);
+
+ /* Set 9x14 font */
+ ireg.ax = 0x1111;
+ /* ireg.bl = 0; */
+ intcall(0x10, &ireg, NULL);
+
+ /* Turn off cursor emulation */
+ ireg.ax = 0x1201;
+ ireg.bl = 0x34;
+ intcall(0x10, &ireg, NULL);
+
+ /* Cursor is scan lines 11-12 */
+ ireg.ax = 0x0100;
+ ireg.cx = 0x0b0c;
+ intcall(0x10, &ireg, NULL);
+}
+
+static void vga_set_80x43(void)
+{
+ /* Set 80x43 mode on VGA (not EGA) */
+ struct biosregs ireg;
+
+ initregs(&ireg);
+
+ /* Set 350 scans */
+ ireg.ax = 0x1201;
+ ireg.bl = 0x30;
+ intcall(0x10, &ireg, NULL);
+
+ /* Reset video mode */
+ ireg.ax = 0x0003;
+ intcall(0x10, &ireg, NULL);
+
+ vga_set_8font();
+}
+
+/* I/O address of the VGA CRTC */
+u16 vga_crtc(void)
+{
+ return (inb(0x3cc) & 1) ? 0x3d4 : 0x3b4;
+}
+
+static void vga_set_480_scanlines(void)
+{
+ u16 crtc; /* CRTC base address */
+ u8 csel; /* CRTC miscellaneous output register */
+
+ crtc = vga_crtc();
+
+ out_idx(0x0c, crtc, 0x11); /* Vertical sync end, unlock CR0-7 */
+ out_idx(0x0b, crtc, 0x06); /* Vertical total */
+ out_idx(0x3e, crtc, 0x07); /* Vertical overflow */
+ out_idx(0xea, crtc, 0x10); /* Vertical sync start */
+ out_idx(0xdf, crtc, 0x12); /* Vertical display end */
+ out_idx(0xe7, crtc, 0x15); /* Vertical blank start */
+ out_idx(0x04, crtc, 0x16); /* Vertical blank end */
+ csel = inb(0x3cc);
+ csel &= 0x0d;
+ csel |= 0xe2;
+ outb(csel, 0x3c2);
+}
+
+static void vga_set_vertical_end(int lines)
+{
+ u16 crtc; /* CRTC base address */
+ u8 ovfw; /* CRTC overflow register */
+ int end = lines-1;
+
+ crtc = vga_crtc();
+
+ ovfw = 0x3c | ((end >> (8-1)) & 0x02) | ((end >> (9-6)) & 0x40);
+
+ out_idx(ovfw, crtc, 0x07); /* Vertical overflow */
+ out_idx(end, crtc, 0x12); /* Vertical display end */
+}
+
+static void vga_set_80x30(void)
+{
+ vga_set_480_scanlines();
+ vga_set_vertical_end(30*16);
+}
+
+static void vga_set_80x34(void)
+{
+ vga_set_480_scanlines();
+ vga_set_14font();
+ vga_set_vertical_end(34*14);
+}
+
+static void vga_set_80x60(void)
+{
+ vga_set_480_scanlines();
+ vga_set_8font();
+ vga_set_vertical_end(60*8);
+}
+
+static int vga_set_mode(struct mode_info *mode)
+{
+ /* Set the basic mode */
+ vga_set_basic_mode();
+
+ /* Override a possibly broken BIOS */
+ force_x = mode->x;
+ force_y = mode->y;
+
+ switch (mode->mode) {
+ case VIDEO_80x25:
+ break;
+ case VIDEO_8POINT:
+ vga_set_8font();
+ break;
+ case VIDEO_80x43:
+ vga_set_80x43();
+ break;
+ case VIDEO_80x28:
+ vga_set_14font();
+ break;
+ case VIDEO_80x30:
+ vga_set_80x30();
+ break;
+ case VIDEO_80x34:
+ vga_set_80x34();
+ break;
+ case VIDEO_80x60:
+ vga_set_80x60();
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * Note: this probe includes basic information required by all
+ * systems. It should be executed first, by making sure
+ * video-vga.c is listed first in the Makefile.
+ */
+static int vga_probe(void)
+{
+ static const char *card_name[] = {
+ "CGA/MDA/HGC", "EGA", "VGA"
+ };
+ static struct mode_info *mode_lists[] = {
+ cga_modes,
+ ega_modes,
+ vga_modes,
+ };
+ static int mode_count[] = {
+ ARRAY_SIZE(cga_modes),
+ ARRAY_SIZE(ega_modes),
+ ARRAY_SIZE(vga_modes),
+ };
+
+ struct biosregs ireg, oreg;
+
+ initregs(&ireg);
+
+ ireg.ax = 0x1200;
+ ireg.bl = 0x10; /* Check EGA/VGA */
+ intcall(0x10, &ireg, &oreg);
+
+#ifndef _WAKEUP
+ boot_params.screen_info.orig_video_ega_bx = oreg.bx;
+#endif
+
+ /* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */
+ if (oreg.bl != 0x10) {
+ /* EGA/VGA */
+ ireg.ax = 0x1a00;
+ intcall(0x10, &ireg, &oreg);
+
+ if (oreg.al == 0x1a) {
+ adapter = ADAPTER_VGA;
+#ifndef _WAKEUP
+ boot_params.screen_info.orig_video_isVGA = 1;
+#endif
+ } else {
+ adapter = ADAPTER_EGA;
+ }
+ } else {
+ adapter = ADAPTER_CGA;
+ }
+
+ video_vga.modes = mode_lists[adapter];
+ video_vga.card_name = card_name[adapter];
+ return mode_count[adapter];
+}
+
+static __videocard video_vga = {
+ .card_name = "VGA",
+ .probe = vga_probe,
+ .set_mode = vga_set_mode,
+};
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
new file mode 100644
index 000000000..77780e386
--- /dev/null
+++ b/arch/x86/boot/video.c
@@ -0,0 +1,345 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Select video mode
+ */
+
+#include <uapi/asm/boot.h>
+
+#include "boot.h"
+#include "video.h"
+#include "vesa.h"
+
+static u16 video_segment;
+
+static void store_cursor_position(void)
+{
+ struct biosregs ireg, oreg;
+
+ initregs(&ireg);
+ ireg.ah = 0x03;
+ intcall(0x10, &ireg, &oreg);
+
+ boot_params.screen_info.orig_x = oreg.dl;
+ boot_params.screen_info.orig_y = oreg.dh;
+
+ if (oreg.ch & 0x20)
+ boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
+
+ if ((oreg.ch & 0x1f) > (oreg.cl & 0x1f))
+ boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
+}
+
+static void store_video_mode(void)
+{
+ struct biosregs ireg, oreg;
+
+ /* N.B.: the saving of the video page here is a bit silly,
+ since we pretty much assume page 0 everywhere. */
+ initregs(&ireg);
+ ireg.ah = 0x0f;
+ intcall(0x10, &ireg, &oreg);
+
+ /* Not all BIOSes are clean with respect to the top bit */
+ boot_params.screen_info.orig_video_mode = oreg.al & 0x7f;
+ boot_params.screen_info.orig_video_page = oreg.bh;
+}
+
+/*
+ * Store the video mode parameters for later usage by the kernel.
+ * This is done by asking the BIOS except for the rows/columns
+ * parameters in the default 80x25 mode -- these are set directly,
+ * because some very obscure BIOSes supply insane values.
+ */
+static void store_mode_params(void)
+{
+ u16 font_size;
+ int x, y;
+
+ /* For graphics mode, it is up to the mode-setting driver
+ (currently only video-vesa.c) to store the parameters */
+ if (graphic_mode)
+ return;
+
+ store_cursor_position();
+ store_video_mode();
+
+ if (boot_params.screen_info.orig_video_mode == 0x07) {
+ /* MDA, HGC, or VGA in monochrome mode */
+ video_segment = 0xb000;
+ } else {
+ /* CGA, EGA, VGA and so forth */
+ video_segment = 0xb800;
+ }
+
+ set_fs(0);
+ font_size = rdfs16(0x485); /* Font size, BIOS area */
+ boot_params.screen_info.orig_video_points = font_size;
+
+ x = rdfs16(0x44a);
+ y = (adapter == ADAPTER_CGA) ? 25 : rdfs8(0x484)+1;
+
+ if (force_x)
+ x = force_x;
+ if (force_y)
+ y = force_y;
+
+ boot_params.screen_info.orig_video_cols = x;
+ boot_params.screen_info.orig_video_lines = y;
+}
+
+static unsigned int get_entry(void)
+{
+ char entry_buf[4];
+ int i, len = 0;
+ int key;
+ unsigned int v;
+
+ do {
+ key = getchar();
+
+ if (key == '\b') {
+ if (len > 0) {
+ puts("\b \b");
+ len--;
+ }
+ } else if ((key >= '0' && key <= '9') ||
+ (key >= 'A' && key <= 'Z') ||
+ (key >= 'a' && key <= 'z')) {
+ if (len < sizeof entry_buf) {
+ entry_buf[len++] = key;
+ putchar(key);
+ }
+ }
+ } while (key != '\r');
+ putchar('\n');
+
+ if (len == 0)
+ return VIDEO_CURRENT_MODE; /* Default */
+
+ v = 0;
+ for (i = 0; i < len; i++) {
+ v <<= 4;
+ key = entry_buf[i] | 0x20;
+ v += (key > '9') ? key-'a'+10 : key-'0';
+ }
+
+ return v;
+}
+
+static void display_menu(void)
+{
+ struct card_info *card;
+ struct mode_info *mi;
+ char ch;
+ int i;
+ int nmodes;
+ int modes_per_line;
+ int col;
+
+ nmodes = 0;
+ for (card = video_cards; card < video_cards_end; card++)
+ nmodes += card->nmodes;
+
+ modes_per_line = 1;
+ if (nmodes >= 20)
+ modes_per_line = 3;
+
+ for (col = 0; col < modes_per_line; col++)
+ puts("Mode: Resolution: Type: ");
+ putchar('\n');
+
+ col = 0;
+ ch = '0';
+ for (card = video_cards; card < video_cards_end; card++) {
+ mi = card->modes;
+ for (i = 0; i < card->nmodes; i++, mi++) {
+ char resbuf[32];
+ int visible = mi->x && mi->y;
+ u16 mode_id = mi->mode ? mi->mode :
+ (mi->y << 8)+mi->x;
+
+ if (!visible)
+ continue; /* Hidden mode */
+
+ if (mi->depth)
+ sprintf(resbuf, "%dx%d", mi->y, mi->depth);
+ else
+ sprintf(resbuf, "%d", mi->y);
+
+ printf("%c %03X %4dx%-7s %-6s",
+ ch, mode_id, mi->x, resbuf, card->card_name);
+ col++;
+ if (col >= modes_per_line) {
+ putchar('\n');
+ col = 0;
+ }
+
+ if (ch == '9')
+ ch = 'a';
+ else if (ch == 'z' || ch == ' ')
+ ch = ' '; /* Out of keys... */
+ else
+ ch++;
+ }
+ }
+ if (col)
+ putchar('\n');
+}
+
+#define H(x) ((x)-'a'+10)
+#define SCAN ((H('s')<<12)+(H('c')<<8)+(H('a')<<4)+H('n'))
+
+static unsigned int mode_menu(void)
+{
+ int key;
+ unsigned int sel;
+
+ puts("Press <ENTER> to see video modes available, "
+ "<SPACE> to continue, or wait 30 sec\n");
+
+ kbd_flush();
+ while (1) {
+ key = getchar_timeout();
+ if (key == ' ' || key == 0)
+ return VIDEO_CURRENT_MODE; /* Default */
+ if (key == '\r')
+ break;
+ putchar('\a'); /* Beep! */
+ }
+
+
+ for (;;) {
+ display_menu();
+
+ puts("Enter a video mode or \"scan\" to scan for "
+ "additional modes: ");
+ sel = get_entry();
+ if (sel != SCAN)
+ return sel;
+
+ probe_cards(1);
+ }
+}
+
+/* Save screen content to the heap */
+static struct saved_screen {
+ int x, y;
+ int curx, cury;
+ u16 *data;
+} saved;
+
+static void save_screen(void)
+{
+ /* Should be called after store_mode_params() */
+ saved.x = boot_params.screen_info.orig_video_cols;
+ saved.y = boot_params.screen_info.orig_video_lines;
+ saved.curx = boot_params.screen_info.orig_x;
+ saved.cury = boot_params.screen_info.orig_y;
+
+ if (!heap_free(saved.x*saved.y*sizeof(u16)+512))
+ return; /* Not enough heap to save the screen */
+
+ saved.data = GET_HEAP(u16, saved.x*saved.y);
+
+ set_fs(video_segment);
+ copy_from_fs(saved.data, 0, saved.x*saved.y*sizeof(u16));
+}
+
+static void restore_screen(void)
+{
+ /* Should be called after store_mode_params() */
+ int xs = boot_params.screen_info.orig_video_cols;
+ int ys = boot_params.screen_info.orig_video_lines;
+ int y;
+ addr_t dst = 0;
+ u16 *src = saved.data;
+ struct biosregs ireg;
+
+ if (graphic_mode)
+ return; /* Can't restore onto a graphic mode */
+
+ if (!src)
+ return; /* No saved screen contents */
+
+ /* Restore screen contents */
+
+ set_fs(video_segment);
+ for (y = 0; y < ys; y++) {
+ int npad;
+
+ if (y < saved.y) {
+ int copy = (xs < saved.x) ? xs : saved.x;
+ copy_to_fs(dst, src, copy*sizeof(u16));
+ dst += copy*sizeof(u16);
+ src += saved.x;
+ npad = (xs < saved.x) ? 0 : xs-saved.x;
+ } else {
+ npad = xs;
+ }
+
+ /* Writes "npad" blank characters to
+ video_segment:dst and advances dst */
+ asm volatile("pushw %%es ; "
+ "movw %2,%%es ; "
+ "shrw %%cx ; "
+ "jnc 1f ; "
+ "stosw \n\t"
+ "1: rep;stosl ; "
+ "popw %%es"
+ : "+D" (dst), "+c" (npad)
+ : "bdS" (video_segment),
+ "a" (0x07200720));
+ }
+
+ /* Restore cursor position */
+ if (saved.curx >= xs)
+ saved.curx = xs-1;
+ if (saved.cury >= ys)
+ saved.cury = ys-1;
+
+ initregs(&ireg);
+ ireg.ah = 0x02; /* Set cursor position */
+ ireg.dh = saved.cury;
+ ireg.dl = saved.curx;
+ intcall(0x10, &ireg, NULL);
+
+ store_cursor_position();
+}
+
+void set_video(void)
+{
+ u16 mode = boot_params.hdr.vid_mode;
+
+ RESET_HEAP();
+
+ store_mode_params();
+ save_screen();
+ probe_cards(0);
+
+ for (;;) {
+ if (mode == ASK_VGA)
+ mode = mode_menu();
+
+ if (!set_mode(mode))
+ break;
+
+ printf("Undefined video mode number: %x\n", mode);
+ mode = ASK_VGA;
+ }
+ boot_params.hdr.vid_mode = mode;
+ vesa_store_edid();
+ store_mode_params();
+
+ if (do_restore)
+ restore_screen();
+}
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
new file mode 100644
index 000000000..b54e0328c
--- /dev/null
+++ b/arch/x86/boot/video.h
@@ -0,0 +1,120 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Header file for the real-mode video probing code
+ */
+
+#ifndef BOOT_VIDEO_H
+#define BOOT_VIDEO_H
+
+#include <linux/types.h>
+
+/*
+ * This code uses an extended set of video mode numbers. These include:
+ * Aliases for standard modes
+ * NORMAL_VGA (-1)
+ * EXTENDED_VGA (-2)
+ * ASK_VGA (-3)
+ * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
+ * of compatibility when extending the table. These are between 0x00 and 0xff.
+ */
+#define VIDEO_FIRST_MENU 0x0000
+
+/* Standard BIOS video modes (BIOS number + 0x0100) */
+#define VIDEO_FIRST_BIOS 0x0100
+
+/* VESA BIOS video modes (VESA number + 0x0200) */
+#define VIDEO_FIRST_VESA 0x0200
+
+/* Video7 special modes (BIOS number + 0x0900) */
+#define VIDEO_FIRST_V7 0x0900
+
+/* Special video modes */
+#define VIDEO_FIRST_SPECIAL 0x0f00
+#define VIDEO_80x25 0x0f00
+#define VIDEO_8POINT 0x0f01
+#define VIDEO_80x43 0x0f02
+#define VIDEO_80x28 0x0f03
+#define VIDEO_CURRENT_MODE 0x0f04
+#define VIDEO_80x30 0x0f05
+#define VIDEO_80x34 0x0f06
+#define VIDEO_80x60 0x0f07
+#define VIDEO_GFX_HACK 0x0f08
+#define VIDEO_LAST_SPECIAL 0x0f09
+
+/* Video modes given by resolution */
+#define VIDEO_FIRST_RESOLUTION 0x1000
+
+/* The "recalculate timings" flag */
+#define VIDEO_RECALC 0x8000
+
+void store_screen(void);
+#define DO_STORE() store_screen()
+
+/*
+ * Mode table structures
+ */
+
+struct mode_info {
+ u16 mode; /* Mode number (vga= style) */
+ u16 x, y; /* Width, height */
+ u16 depth; /* Bits per pixel, 0 for text mode */
+};
+
+struct card_info {
+ const char *card_name;
+ int (*set_mode)(struct mode_info *mode);
+ int (*probe)(void);
+ struct mode_info *modes;
+ int nmodes; /* Number of probed modes so far */
+ int unsafe; /* Probing is unsafe, only do after "scan" */
+ u16 xmode_first; /* Unprobed modes to try to call anyway */
+ u16 xmode_n; /* Size of unprobed mode range */
+};
+
+#define __videocard struct card_info __attribute__((used,section(".videocards")))
+extern struct card_info video_cards[], video_cards_end[];
+
+int mode_defined(u16 mode); /* video.c */
+
+/* Basic video information */
+#define ADAPTER_CGA 0 /* CGA/MDA/HGC */
+#define ADAPTER_EGA 1
+#define ADAPTER_VGA 2
+
+extern int adapter;
+extern int force_x, force_y; /* Don't query the BIOS for cols/rows */
+extern int do_restore; /* Restore screen contents */
+extern int graphic_mode; /* Graphics mode with linear frame buffer */
+
+/* Accessing VGA indexed registers */
+static inline u8 in_idx(u16 port, u8 index)
+{
+ outb(index, port);
+ return inb(port+1);
+}
+
+static inline void out_idx(u8 v, u16 port, u8 index)
+{
+ outw(index+(v << 8), port);
+}
+
+/* Writes a value to an indexed port and then reads the port again */
+static inline u8 tst_idx(u8 v, u16 port, u8 index)
+{
+ out_idx(port, index, v);
+ return in_idx(port, index);
+}
+
+/* Get the I/O port of the VGA CRTC */
+u16 vga_crtc(void); /* video-vga.c */
+
+#endif /* BOOT_VIDEO_H */
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
new file mode 100644
index 000000000..9218cb128
--- /dev/null
+++ b/arch/x86/configs/i386_defconfig
@@ -0,0 +1,309 @@
+# CONFIG_64BIT is not set
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_FHANDLE=y
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_LOG_BUF_SHIFT=18
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CPUSETS=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_BLK_DEV_INITRD=y
+# CONFIG_COMPAT_BRK is not set
+CONFIG_PROFILING=y
+CONFIG_KPROBES=y
+CONFIG_JUMP_LABEL=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODULE_FORCE_UNLOAD=y
+CONFIG_PARTITION_ADVANCED=y
+CONFIG_OSF_PARTITION=y
+CONFIG_AMIGA_PARTITION=y
+CONFIG_MAC_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+CONFIG_SGI_PARTITION=y
+CONFIG_SUN_PARTITION=y
+CONFIG_KARMA_PARTITION=y
+CONFIG_EFI_PARTITION=y
+CONFIG_SMP=y
+CONFIG_X86_GENERIC=y
+CONFIG_HPET_TIMER=y
+CONFIG_SCHED_SMT=y
+CONFIG_PREEMPT_VOLUNTARY=y
+CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
+CONFIG_X86_MCE=y
+CONFIG_X86_REBOOTFIXUPS=y
+CONFIG_MICROCODE=y
+CONFIG_MICROCODE_AMD=y
+CONFIG_X86_MSR=y
+CONFIG_X86_CPUID=y
+CONFIG_HIGHPTE=y
+CONFIG_X86_CHECK_BIOS_CORRUPTION=y
+# CONFIG_MTRR_SANITIZER is not set
+CONFIG_EFI=y
+CONFIG_HZ_1000=y
+CONFIG_KEXEC=y
+CONFIG_CRASH_DUMP=y
+CONFIG_RANDOMIZE_BASE=y
+CONFIG_RANDOMIZE_MEMORY=y
+# CONFIG_COMPAT_VDSO is not set
+CONFIG_HIBERNATION=y
+CONFIG_PM_DEBUG=y
+CONFIG_PM_TRACE_RTC=y
+CONFIG_ACPI_DOCK=y
+CONFIG_CPU_FREQ=y
+# CONFIG_CPU_FREQ_STAT is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
+CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
+CONFIG_CPU_FREQ_GOV_ONDEMAND=y
+CONFIG_X86_ACPI_CPUFREQ=y
+CONFIG_PCIEPORTBUS=y
+CONFIG_PCI_MSI=y
+CONFIG_PCCARD=y
+CONFIG_YENTA=y
+CONFIG_HOTPLUG_PCI=y
+CONFIG_BINFMT_MISC=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_VERBOSE=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_IP_PNP_RARP=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+CONFIG_SYN_COOKIES=y
+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
+# CONFIG_INET_XFRM_MODE_TUNNEL is not set
+# CONFIG_INET_XFRM_MODE_BEET is not set
+# CONFIG_INET_DIAG is not set
+CONFIG_TCP_CONG_ADVANCED=y
+# CONFIG_TCP_CONG_BIC is not set
+# CONFIG_TCP_CONG_WESTWOOD is not set
+# CONFIG_TCP_CONG_HTCP is not set
+CONFIG_TCP_MD5SIG=y
+CONFIG_IPV6=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_NETLABEL=y
+CONFIG_NETFILTER=y
+# CONFIG_NETFILTER_ADVANCED is not set
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_SIP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_NF_NAT=y
+CONFIG_IP_NF_TARGET_MASQUERADE=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_MATCH_IPV6HEADER=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_CLS_ACT=y
+CONFIG_HAMRADIO=y
+CONFIG_CFG80211=y
+CONFIG_MAC80211=y
+CONFIG_MAC80211_LEDS=y
+CONFIG_RFKILL=y
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_MOUNT=y
+CONFIG_DEBUG_DEVRES=y
+CONFIG_CONNECTOR=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_SD=y
+CONFIG_BLK_DEV_SR=y
+CONFIG_CHR_DEV_SG=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_SPI_ATTRS=y
+# CONFIG_SCSI_LOWLEVEL is not set
+CONFIG_ATA=y
+CONFIG_SATA_AHCI=y
+CONFIG_ATA_PIIX=y
+CONFIG_PATA_AMD=y
+CONFIG_PATA_OLDPIIX=y
+CONFIG_PATA_SCH=y
+CONFIG_PATA_MPIIX=y
+CONFIG_ATA_GENERIC=y
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_DM_MIRROR=y
+CONFIG_DM_ZERO=y
+CONFIG_MACINTOSH_DRIVERS=y
+CONFIG_MAC_EMUMOUSEBTN=y
+CONFIG_NETDEVICES=y
+CONFIG_NETCONSOLE=y
+CONFIG_BNX2=y
+CONFIG_TIGON3=y
+CONFIG_NET_TULIP=y
+CONFIG_E100=y
+CONFIG_E1000=y
+CONFIG_E1000E=y
+CONFIG_SKY2=y
+CONFIG_NE2K_PCI=y
+CONFIG_FORCEDETH=y
+CONFIG_8139TOO=y
+# CONFIG_8139TOO_PIO is not set
+CONFIG_R8169=y
+CONFIG_FDDI=y
+CONFIG_INPUT_POLLDEV=y
+# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_INPUT_TABLET=y
+CONFIG_INPUT_TOUCHSCREEN=y
+CONFIG_INPUT_MISC=y
+CONFIG_VT_HW_CONSOLE_BINDING=y
+# CONFIG_LEGACY_PTYS is not set
+CONFIG_SERIAL_NONSTANDARD=y
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_8250_NR_UARTS=32
+CONFIG_SERIAL_8250_EXTENDED=y
+CONFIG_SERIAL_8250_MANY_PORTS=y
+CONFIG_SERIAL_8250_SHARE_IRQ=y
+CONFIG_SERIAL_8250_DETECT_IRQ=y
+CONFIG_SERIAL_8250_RSA=y
+CONFIG_HW_RANDOM=y
+CONFIG_NVRAM=y
+CONFIG_HPET=y
+# CONFIG_HPET_MMAP is not set
+CONFIG_I2C_I801=y
+CONFIG_WATCHDOG=y
+CONFIG_AGP=y
+CONFIG_AGP_AMD64=y
+CONFIG_AGP_INTEL=y
+CONFIG_DRM=y
+CONFIG_DRM_I915=y
+CONFIG_FB_MODE_HELPERS=y
+CONFIG_FB_TILEBLITTING=y
+CONFIG_FB_EFI=y
+# CONFIG_LCD_CLASS_DEVICE is not set
+CONFIG_LOGO=y
+# CONFIG_LOGO_LINUX_MONO is not set
+# CONFIG_LOGO_LINUX_VGA16 is not set
+CONFIG_SOUND=y
+CONFIG_SND=y
+CONFIG_SND_SEQUENCER=y
+CONFIG_SND_SEQ_DUMMY=y
+CONFIG_SND_MIXER_OSS=y
+CONFIG_SND_PCM_OSS=y
+CONFIG_SND_SEQUENCER_OSS=y
+CONFIG_SND_HRTIMER=y
+CONFIG_SND_HDA_INTEL=y
+CONFIG_SND_HDA_HWDEP=y
+CONFIG_HIDRAW=y
+CONFIG_HID_GYRATION=y
+CONFIG_LOGITECH_FF=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_PID=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_MON=y
+CONFIG_USB_XHCI_HCD=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_EHCI_TT_NEWSCHED=y
+CONFIG_USB_OHCI_HCD=y
+CONFIG_USB_UHCI_HCD=y
+CONFIG_USB_PRINTER=y
+CONFIG_USB_STORAGE=y
+CONFIG_EDAC=y
+CONFIG_RTC_CLASS=y
+# CONFIG_RTC_HCTOSYS is not set
+CONFIG_DMADEVICES=y
+CONFIG_EEEPC_LAPTOP=y
+CONFIG_EFI_VARS=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_POSIX_ACL=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_QUOTA=y
+CONFIG_QUOTA_NETLINK_INTERFACE=y
+# CONFIG_PRINT_QUOTA_WARNING is not set
+CONFIG_QFMT_V2=y
+CONFIG_AUTOFS4_FS=y
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_HUGETLBFS=y
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3_ACL=y
+CONFIG_NFS_V4=y
+CONFIG_ROOT_NFS=y
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ASCII=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_NLS_UTF8=y
+CONFIG_PRINTK_TIME=y
+# CONFIG_ENABLE_WARN_DEPRECATED is not set
+CONFIG_FRAME_WARN=1024
+CONFIG_MAGIC_SYSRQ=y
+# CONFIG_UNUSED_SYMBOLS is not set
+CONFIG_DEBUG_KERNEL=y
+# CONFIG_SCHED_DEBUG is not set
+CONFIG_SCHEDSTATS=y
+CONFIG_TIMER_STATS=y
+CONFIG_DEBUG_STACK_USAGE=y
+CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
+CONFIG_EARLY_PRINTK_DBGP=y
+CONFIG_DEBUG_STACKOVERFLOW=y
+# CONFIG_DEBUG_RODATA_TEST is not set
+CONFIG_DEBUG_BOOT_PARAMS=y
+CONFIG_OPTIMIZE_INLINING=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SECURITY_SELINUX_BOOTPARAM=y
+CONFIG_SECURITY_SELINUX_DISABLE=y
+CONFIG_CRYPTO_AES_586=y
+# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config
new file mode 100644
index 000000000..66c9e2aab
--- /dev/null
+++ b/arch/x86/configs/tiny.config
@@ -0,0 +1,5 @@
+CONFIG_NOHIGHMEM=y
+# CONFIG_HIGHMEM4G is not set
+# CONFIG_HIGHMEM64G is not set
+CONFIG_UNWINDER_GUESS=y
+# CONFIG_UNWINDER_FRAME_POINTER is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
new file mode 100644
index 000000000..146a12293
--- /dev/null
+++ b/arch/x86/configs/x86_64_defconfig
@@ -0,0 +1,307 @@
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_FHANDLE=y
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_LOG_BUF_SHIFT=18
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CPUSETS=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_BLK_DEV_INITRD=y
+# CONFIG_COMPAT_BRK is not set
+CONFIG_PROFILING=y
+CONFIG_KPROBES=y
+CONFIG_JUMP_LABEL=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODULE_FORCE_UNLOAD=y
+CONFIG_PARTITION_ADVANCED=y
+CONFIG_OSF_PARTITION=y
+CONFIG_AMIGA_PARTITION=y
+CONFIG_MAC_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+CONFIG_SGI_PARTITION=y
+CONFIG_SUN_PARTITION=y
+CONFIG_KARMA_PARTITION=y
+CONFIG_EFI_PARTITION=y
+CONFIG_SMP=y
+CONFIG_CALGARY_IOMMU=y
+CONFIG_NR_CPUS=64
+CONFIG_SCHED_SMT=y
+CONFIG_PREEMPT_VOLUNTARY=y
+CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
+CONFIG_X86_MCE=y
+CONFIG_MICROCODE=y
+CONFIG_MICROCODE_AMD=y
+CONFIG_X86_MSR=y
+CONFIG_X86_CPUID=y
+CONFIG_NUMA=y
+CONFIG_X86_CHECK_BIOS_CORRUPTION=y
+# CONFIG_MTRR_SANITIZER is not set
+CONFIG_EFI=y
+CONFIG_HZ_1000=y
+CONFIG_KEXEC=y
+CONFIG_CRASH_DUMP=y
+CONFIG_RANDOMIZE_BASE=y
+CONFIG_RANDOMIZE_MEMORY=y
+# CONFIG_COMPAT_VDSO is not set
+CONFIG_HIBERNATION=y
+CONFIG_PM_DEBUG=y
+CONFIG_PM_TRACE_RTC=y
+CONFIG_ACPI_DOCK=y
+CONFIG_CPU_FREQ=y
+# CONFIG_CPU_FREQ_STAT is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
+CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
+CONFIG_CPU_FREQ_GOV_ONDEMAND=y
+CONFIG_X86_ACPI_CPUFREQ=y
+CONFIG_PCI_MMCONFIG=y
+CONFIG_PCIEPORTBUS=y
+CONFIG_PCCARD=y
+CONFIG_YENTA=y
+CONFIG_HOTPLUG_PCI=y
+CONFIG_BINFMT_MISC=y
+CONFIG_IA32_EMULATION=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_VERBOSE=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_IP_PNP_RARP=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+CONFIG_SYN_COOKIES=y
+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
+# CONFIG_INET_XFRM_MODE_TUNNEL is not set
+# CONFIG_INET_XFRM_MODE_BEET is not set
+# CONFIG_INET_DIAG is not set
+CONFIG_TCP_CONG_ADVANCED=y
+# CONFIG_TCP_CONG_BIC is not set
+# CONFIG_TCP_CONG_WESTWOOD is not set
+# CONFIG_TCP_CONG_HTCP is not set
+CONFIG_TCP_MD5SIG=y
+CONFIG_IPV6=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_NETLABEL=y
+CONFIG_NETFILTER=y
+# CONFIG_NETFILTER_ADVANCED is not set
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_SIP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_NF_NAT=y
+CONFIG_IP_NF_TARGET_MASQUERADE=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_MATCH_IPV6HEADER=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_CLS_ACT=y
+CONFIG_HAMRADIO=y
+CONFIG_CFG80211=y
+CONFIG_MAC80211=y
+CONFIG_MAC80211_LEDS=y
+CONFIG_RFKILL=y
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_MOUNT=y
+CONFIG_DEBUG_DEVRES=y
+CONFIG_CONNECTOR=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_SD=y
+CONFIG_BLK_DEV_SR=y
+CONFIG_CHR_DEV_SG=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_SPI_ATTRS=y
+# CONFIG_SCSI_LOWLEVEL is not set
+CONFIG_ATA=y
+CONFIG_SATA_AHCI=y
+CONFIG_ATA_PIIX=y
+CONFIG_PATA_AMD=y
+CONFIG_PATA_OLDPIIX=y
+CONFIG_PATA_SCH=y
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_DM_MIRROR=y
+CONFIG_DM_ZERO=y
+CONFIG_MACINTOSH_DRIVERS=y
+CONFIG_MAC_EMUMOUSEBTN=y
+CONFIG_NETDEVICES=y
+CONFIG_NETCONSOLE=y
+CONFIG_TIGON3=y
+CONFIG_NET_TULIP=y
+CONFIG_E100=y
+CONFIG_E1000=y
+CONFIG_E1000E=y
+CONFIG_SKY2=y
+CONFIG_FORCEDETH=y
+CONFIG_8139TOO=y
+CONFIG_R8169=y
+CONFIG_FDDI=y
+CONFIG_INPUT_POLLDEV=y
+# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_INPUT_TABLET=y
+CONFIG_INPUT_TOUCHSCREEN=y
+CONFIG_INPUT_MISC=y
+CONFIG_VT_HW_CONSOLE_BINDING=y
+# CONFIG_LEGACY_PTYS is not set
+CONFIG_SERIAL_NONSTANDARD=y
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_8250_NR_UARTS=32
+CONFIG_SERIAL_8250_EXTENDED=y
+CONFIG_SERIAL_8250_MANY_PORTS=y
+CONFIG_SERIAL_8250_SHARE_IRQ=y
+CONFIG_SERIAL_8250_DETECT_IRQ=y
+CONFIG_SERIAL_8250_RSA=y
+CONFIG_HW_RANDOM=y
+# CONFIG_HW_RANDOM_INTEL is not set
+# CONFIG_HW_RANDOM_AMD is not set
+CONFIG_NVRAM=y
+CONFIG_HPET=y
+# CONFIG_HPET_MMAP is not set
+CONFIG_I2C_I801=y
+CONFIG_WATCHDOG=y
+CONFIG_AGP=y
+CONFIG_AGP_AMD64=y
+CONFIG_AGP_INTEL=y
+CONFIG_DRM=y
+CONFIG_DRM_I915=y
+CONFIG_FB_MODE_HELPERS=y
+CONFIG_FB_TILEBLITTING=y
+CONFIG_FB_EFI=y
+# CONFIG_LCD_CLASS_DEVICE is not set
+CONFIG_LOGO=y
+# CONFIG_LOGO_LINUX_MONO is not set
+# CONFIG_LOGO_LINUX_VGA16 is not set
+CONFIG_SOUND=y
+CONFIG_SND=y
+CONFIG_SND_SEQUENCER=y
+CONFIG_SND_SEQ_DUMMY=y
+CONFIG_SND_MIXER_OSS=y
+CONFIG_SND_PCM_OSS=y
+CONFIG_SND_SEQUENCER_OSS=y
+CONFIG_SND_HRTIMER=y
+CONFIG_SND_HDA_INTEL=y
+CONFIG_SND_HDA_HWDEP=y
+CONFIG_HIDRAW=y
+CONFIG_HID_GYRATION=y
+CONFIG_LOGITECH_FF=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_PID=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_MON=y
+CONFIG_USB_XHCI_HCD=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_EHCI_TT_NEWSCHED=y
+CONFIG_USB_OHCI_HCD=y
+CONFIG_USB_UHCI_HCD=y
+CONFIG_USB_PRINTER=y
+CONFIG_USB_STORAGE=y
+CONFIG_EDAC=y
+CONFIG_RTC_CLASS=y
+# CONFIG_RTC_HCTOSYS is not set
+CONFIG_DMADEVICES=y
+CONFIG_EEEPC_LAPTOP=y
+CONFIG_AMD_IOMMU=y
+CONFIG_INTEL_IOMMU=y
+# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set
+CONFIG_EFI_VARS=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_POSIX_ACL=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_QUOTA=y
+CONFIG_QUOTA_NETLINK_INTERFACE=y
+# CONFIG_PRINT_QUOTA_WARNING is not set
+CONFIG_QFMT_V2=y
+CONFIG_AUTOFS4_FS=y
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_HUGETLBFS=y
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3_ACL=y
+CONFIG_NFS_V4=y
+CONFIG_ROOT_NFS=y
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ASCII=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_NLS_UTF8=y
+CONFIG_PRINTK_TIME=y
+# CONFIG_ENABLE_WARN_DEPRECATED is not set
+CONFIG_MAGIC_SYSRQ=y
+# CONFIG_UNUSED_SYMBOLS is not set
+CONFIG_DEBUG_KERNEL=y
+# CONFIG_SCHED_DEBUG is not set
+CONFIG_SCHEDSTATS=y
+CONFIG_TIMER_STATS=y
+CONFIG_DEBUG_STACK_USAGE=y
+CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
+CONFIG_EARLY_PRINTK_DBGP=y
+CONFIG_DEBUG_STACKOVERFLOW=y
+# CONFIG_DEBUG_RODATA_TEST is not set
+CONFIG_DEBUG_BOOT_PARAMS=y
+CONFIG_OPTIMIZE_INLINING=y
+CONFIG_UNWINDER_ORC=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SECURITY_SELINUX_BOOTPARAM=y
+CONFIG_SECURITY_SELINUX_DISABLE=y
+# CONFIG_CRYPTO_ANSI_CPRNG is not set
diff --git a/arch/x86/configs/xen.config b/arch/x86/configs/xen.config
new file mode 100644
index 000000000..d9fc7139f
--- /dev/null
+++ b/arch/x86/configs/xen.config
@@ -0,0 +1,28 @@
+# global x86 required specific stuff
+# On 32-bit HIGHMEM4G is not allowed
+CONFIG_HIGHMEM64G=y
+CONFIG_64BIT=y
+
+# These enable us to allow some of the
+# not so generic stuff below
+CONFIG_HYPERVISOR_GUEST=y
+CONFIG_PCI=y
+CONFIG_PCI_MSI=y
+CONFIG_X86_MCE=y
+CONFIG_ACPI_PROCESSOR=y
+CONFIG_CPU_FREQ=y
+
+# x86 xen specific config options
+CONFIG_XEN_PVH=y
+CONFIG_XEN_MAX_DOMAIN_MEMORY=500
+CONFIG_XEN_SAVE_RESTORE=y
+# CONFIG_XEN_DEBUG_FS is not set
+CONFIG_XEN_MCE_LOG=y
+CONFIG_XEN_ACPI_PROCESSOR=m
+# x86 specific backend drivers
+CONFIG_XEN_PCIDEV_BACKEND=m
+# x86 specific frontend drivers
+CONFIG_XEN_PCIDEV_FRONTEND=m
+# depends on MEMORY_HOTPLUG, arm64 doesn't enable this yet,
+# move to generic config if it ever does.
+CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
new file mode 100644
index 000000000..a450ad573
--- /dev/null
+++ b/arch/x86/crypto/Makefile
@@ -0,0 +1,129 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Arch-specific CryptoAPI modules.
+#
+
+OBJECT_FILES_NON_STANDARD := y
+
+avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
+avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
+ $(comma)4)$(comma)%ymm2,yes,no)
+sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
+sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
+
+obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
+
+obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
+obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
+obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
+
+obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
+obj-$(CONFIG_CRYPTO_DES3_EDE_X86_64) += des3_ede-x86_64.o
+obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
+obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
+obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
+obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
+obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
+obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
+obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
+obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
+
+obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
+obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
+obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
+obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
+obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
+obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
+obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
+
+obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o
+obj-$(CONFIG_CRYPTO_AEGIS128L_AESNI_SSE2) += aegis128l-aesni.o
+obj-$(CONFIG_CRYPTO_AEGIS256_AESNI_SSE2) += aegis256-aesni.o
+
+obj-$(CONFIG_CRYPTO_MORUS640_GLUE) += morus640_glue.o
+obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o
+
+obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
+obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o
+
+# These modules require assembler to support AVX.
+ifeq ($(avx_supported),yes)
+ obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
+ camellia-aesni-avx-x86_64.o
+ obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
+ obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
+ obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
+ obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
+endif
+
+# These modules require assembler to support AVX2.
+ifeq ($(avx2_supported),yes)
+ obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
+ obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
+ obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb/
+ obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb/
+ obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb/
+
+ obj-$(CONFIG_CRYPTO_MORUS1280_AVX2) += morus1280-avx2.o
+endif
+
+aes-i586-y := aes-i586-asm_32.o aes_glue.o
+twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
+serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
+
+aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
+des3_ede-x86_64-y := des3_ede-asm_64.o des3_ede_glue.o
+camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
+blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
+twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
+twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
+chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
+serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
+
+aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
+aegis128l-aesni-y := aegis128l-aesni-asm.o aegis128l-aesni-glue.o
+aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o
+
+morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o
+morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o
+
+ifeq ($(avx_supported),yes)
+ camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
+ camellia_aesni_avx_glue.o
+ cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
+ cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
+ twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o \
+ twofish_avx_glue.o
+ serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o \
+ serpent_avx_glue.o
+endif
+
+ifeq ($(avx2_supported),yes)
+ camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
+ chacha20-x86_64-y += chacha20-avx2-x86_64.o
+ serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
+
+ morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
+endif
+
+aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
+aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
+ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
+sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
+poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
+ifeq ($(avx2_supported),yes)
+sha1-ssse3-y += sha1_avx2_x86_64_asm.o
+poly1305-x86_64-y += poly1305-avx2-x86_64.o
+endif
+ifeq ($(sha1_ni_supported),yes)
+sha1-ssse3-y += sha1_ni_asm.o
+endif
+crc32c-intel-y := crc32c-intel_glue.o
+crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
+crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
+sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
+ifeq ($(sha256_ni_supported),yes)
+sha256-ssse3-y += sha256_ni_asm.o
+endif
+sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
+crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
new file mode 100644
index 000000000..5f7e43d4f
--- /dev/null
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -0,0 +1,750 @@
+/*
+ * AES-NI + SSE2 implementation of AEGIS-128
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define STATE0 %xmm0
+#define STATE1 %xmm1
+#define STATE2 %xmm2
+#define STATE3 %xmm3
+#define STATE4 %xmm4
+#define KEY %xmm5
+#define MSG %xmm5
+#define T0 %xmm6
+#define T1 %xmm7
+
+#define STATEP %rdi
+#define LEN %rsi
+#define SRC %rdx
+#define DST %rcx
+
+.section .rodata.cst16.aegis128_const, "aM", @progbits, 32
+.align 16
+.Laegis128_const_0:
+ .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+ .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Laegis128_const_1:
+ .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+ .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16
+.align 16
+.Laegis128_counter:
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+
+.text
+
+/*
+ * aegis128_update
+ * input:
+ * STATE[0-4] - input state
+ * output:
+ * STATE[0-4] - output state (shifted positions)
+ * changed:
+ * T0
+ */
+.macro aegis128_update
+ movdqa STATE4, T0
+ aesenc STATE0, STATE4
+ aesenc STATE1, STATE0
+ aesenc STATE2, STATE1
+ aesenc STATE3, STATE2
+ aesenc T0, STATE3
+.endm
+
+/*
+ * __load_partial: internal ABI
+ * input:
+ * LEN - bytes
+ * SRC - src
+ * output:
+ * MSG - message block
+ * changed:
+ * T0
+ * %r8
+ * %r9
+ */
+__load_partial:
+ xor %r9d, %r9d
+ pxor MSG, MSG
+
+ mov LEN, %r8
+ and $0x1, %r8
+ jz .Lld_partial_1
+
+ mov LEN, %r8
+ and $0x1E, %r8
+ add SRC, %r8
+ mov (%r8), %r9b
+
+.Lld_partial_1:
+ mov LEN, %r8
+ and $0x2, %r8
+ jz .Lld_partial_2
+
+ mov LEN, %r8
+ and $0x1C, %r8
+ add SRC, %r8
+ shl $0x10, %r9
+ mov (%r8), %r9w
+
+.Lld_partial_2:
+ mov LEN, %r8
+ and $0x4, %r8
+ jz .Lld_partial_4
+
+ mov LEN, %r8
+ and $0x18, %r8
+ add SRC, %r8
+ shl $32, %r9
+ mov (%r8), %r8d
+ xor %r8, %r9
+
+.Lld_partial_4:
+ movq %r9, MSG
+
+ mov LEN, %r8
+ and $0x8, %r8
+ jz .Lld_partial_8
+
+ mov LEN, %r8
+ and $0x10, %r8
+ add SRC, %r8
+ pslldq $8, MSG
+ movq (%r8), T0
+ pxor T0, MSG
+
+.Lld_partial_8:
+ ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ * input:
+ * LEN - bytes
+ * DST - dst
+ * output:
+ * T0 - message block
+ * changed:
+ * %r8
+ * %r9
+ * %r10
+ */
+__store_partial:
+ mov LEN, %r8
+ mov DST, %r9
+
+ movq T0, %r10
+
+ cmp $8, %r8
+ jl .Lst_partial_8
+
+ mov %r10, (%r9)
+ psrldq $8, T0
+ movq T0, %r10
+
+ sub $8, %r8
+ add $8, %r9
+
+.Lst_partial_8:
+ cmp $4, %r8
+ jl .Lst_partial_4
+
+ mov %r10d, (%r9)
+ shr $32, %r10
+
+ sub $4, %r8
+ add $4, %r9
+
+.Lst_partial_4:
+ cmp $2, %r8
+ jl .Lst_partial_2
+
+ mov %r10w, (%r9)
+ shr $0x10, %r10
+
+ sub $2, %r8
+ add $2, %r9
+
+.Lst_partial_2:
+ cmp $1, %r8
+ jl .Lst_partial_1
+
+ mov %r10b, (%r9)
+
+.Lst_partial_1:
+ ret
+ENDPROC(__store_partial)
+
+/*
+ * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv);
+ */
+ENTRY(crypto_aegis128_aesni_init)
+ FRAME_BEGIN
+
+ /* load IV: */
+ movdqu (%rdx), T1
+
+ /* load key: */
+ movdqa (%rsi), KEY
+ pxor KEY, T1
+ movdqa T1, STATE0
+ movdqa KEY, STATE3
+ movdqa KEY, STATE4
+
+ /* load the constants: */
+ movdqa .Laegis128_const_0, STATE2
+ movdqa .Laegis128_const_1, STATE1
+ pxor STATE2, STATE3
+ pxor STATE1, STATE4
+
+ /* update 10 times with KEY / KEY xor IV: */
+ aegis128_update; pxor KEY, STATE4
+ aegis128_update; pxor T1, STATE3
+ aegis128_update; pxor KEY, STATE2
+ aegis128_update; pxor T1, STATE1
+ aegis128_update; pxor KEY, STATE0
+ aegis128_update; pxor T1, STATE4
+ aegis128_update; pxor KEY, STATE3
+ aegis128_update; pxor T1, STATE2
+ aegis128_update; pxor KEY, STATE1
+ aegis128_update; pxor T1, STATE0
+
+ /* store the state: */
+ movdqu STATE0, 0x00(STATEP)
+ movdqu STATE1, 0x10(STATEP)
+ movdqu STATE2, 0x20(STATEP)
+ movdqu STATE3, 0x30(STATEP)
+ movdqu STATE4, 0x40(STATEP)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128_aesni_init)
+
+/*
+ * void crypto_aegis128_aesni_ad(void *state, unsigned int length,
+ * const void *data);
+ */
+ENTRY(crypto_aegis128_aesni_ad)
+ FRAME_BEGIN
+
+ cmp $0x10, LEN
+ jb .Lad_out
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ mov SRC, %r8
+ and $0xF, %r8
+ jnz .Lad_u_loop
+
+.align 8
+.Lad_a_loop:
+ movdqa 0x00(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE4
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_1
+
+ movdqa 0x10(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE3
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_2
+
+ movdqa 0x20(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE2
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_3
+
+ movdqa 0x30(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE1
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_4
+
+ movdqa 0x40(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE0
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_0
+
+ add $0x50, SRC
+ jmp .Lad_a_loop
+
+.align 8
+.Lad_u_loop:
+ movdqu 0x00(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE4
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_1
+
+ movdqu 0x10(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE3
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_2
+
+ movdqu 0x20(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE2
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_3
+
+ movdqu 0x30(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE1
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_4
+
+ movdqu 0x40(SRC), MSG
+ aegis128_update
+ pxor MSG, STATE0
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_0
+
+ add $0x50, SRC
+ jmp .Lad_u_loop
+
+ /* store the state: */
+.Lad_out_0:
+ movdqu STATE0, 0x00(STATEP)
+ movdqu STATE1, 0x10(STATEP)
+ movdqu STATE2, 0x20(STATEP)
+ movdqu STATE3, 0x30(STATEP)
+ movdqu STATE4, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lad_out_1:
+ movdqu STATE4, 0x00(STATEP)
+ movdqu STATE0, 0x10(STATEP)
+ movdqu STATE1, 0x20(STATEP)
+ movdqu STATE2, 0x30(STATEP)
+ movdqu STATE3, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lad_out_2:
+ movdqu STATE3, 0x00(STATEP)
+ movdqu STATE4, 0x10(STATEP)
+ movdqu STATE0, 0x20(STATEP)
+ movdqu STATE1, 0x30(STATEP)
+ movdqu STATE2, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lad_out_3:
+ movdqu STATE2, 0x00(STATEP)
+ movdqu STATE3, 0x10(STATEP)
+ movdqu STATE4, 0x20(STATEP)
+ movdqu STATE0, 0x30(STATEP)
+ movdqu STATE1, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lad_out_4:
+ movdqu STATE1, 0x00(STATEP)
+ movdqu STATE2, 0x10(STATEP)
+ movdqu STATE3, 0x20(STATEP)
+ movdqu STATE4, 0x30(STATEP)
+ movdqu STATE0, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lad_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128_aesni_ad)
+
+.macro encrypt_block a s0 s1 s2 s3 s4 i
+ movdq\a (\i * 0x10)(SRC), MSG
+ movdqa MSG, T0
+ pxor \s1, T0
+ pxor \s4, T0
+ movdqa \s2, T1
+ pand \s3, T1
+ pxor T1, T0
+ movdq\a T0, (\i * 0x10)(DST)
+
+ aegis128_update
+ pxor MSG, \s4
+
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lenc_out_\i
+.endm
+
+/*
+ * void crypto_aegis128_aesni_enc(void *state, unsigned int length,
+ * const void *src, void *dst);
+ */
+ENTRY(crypto_aegis128_aesni_enc)
+ FRAME_BEGIN
+
+ cmp $0x10, LEN
+ jb .Lenc_out
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ mov SRC, %r8
+ or DST, %r8
+ and $0xF, %r8
+ jnz .Lenc_u_loop
+
+.align 8
+.Lenc_a_loop:
+ encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
+ encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
+ encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
+ encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
+ encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+ add $0x50, SRC
+ add $0x50, DST
+ jmp .Lenc_a_loop
+
+.align 8
+.Lenc_u_loop:
+ encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
+ encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
+ encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
+ encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
+ encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+ add $0x50, SRC
+ add $0x50, DST
+ jmp .Lenc_u_loop
+
+ /* store the state: */
+.Lenc_out_0:
+ movdqu STATE4, 0x00(STATEP)
+ movdqu STATE0, 0x10(STATEP)
+ movdqu STATE1, 0x20(STATEP)
+ movdqu STATE2, 0x30(STATEP)
+ movdqu STATE3, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lenc_out_1:
+ movdqu STATE3, 0x00(STATEP)
+ movdqu STATE4, 0x10(STATEP)
+ movdqu STATE0, 0x20(STATEP)
+ movdqu STATE1, 0x30(STATEP)
+ movdqu STATE2, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lenc_out_2:
+ movdqu STATE2, 0x00(STATEP)
+ movdqu STATE3, 0x10(STATEP)
+ movdqu STATE4, 0x20(STATEP)
+ movdqu STATE0, 0x30(STATEP)
+ movdqu STATE1, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lenc_out_3:
+ movdqu STATE1, 0x00(STATEP)
+ movdqu STATE2, 0x10(STATEP)
+ movdqu STATE3, 0x20(STATEP)
+ movdqu STATE4, 0x30(STATEP)
+ movdqu STATE0, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lenc_out_4:
+ movdqu STATE0, 0x00(STATEP)
+ movdqu STATE1, 0x10(STATEP)
+ movdqu STATE2, 0x20(STATEP)
+ movdqu STATE3, 0x30(STATEP)
+ movdqu STATE4, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Lenc_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128_aesni_enc)
+
+/*
+ * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
+ * const void *src, void *dst);
+ */
+ENTRY(crypto_aegis128_aesni_enc_tail)
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ /* encrypt message: */
+ call __load_partial
+
+ movdqa MSG, T0
+ pxor STATE1, T0
+ pxor STATE4, T0
+ movdqa STATE2, T1
+ pand STATE3, T1
+ pxor T1, T0
+
+ call __store_partial
+
+ aegis128_update
+ pxor MSG, STATE4
+
+ /* store the state: */
+ movdqu STATE4, 0x00(STATEP)
+ movdqu STATE0, 0x10(STATEP)
+ movdqu STATE1, 0x20(STATEP)
+ movdqu STATE2, 0x30(STATEP)
+ movdqu STATE3, 0x40(STATEP)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128_aesni_enc_tail)
+
+.macro decrypt_block a s0 s1 s2 s3 s4 i
+ movdq\a (\i * 0x10)(SRC), MSG
+ pxor \s1, MSG
+ pxor \s4, MSG
+ movdqa \s2, T1
+ pand \s3, T1
+ pxor T1, MSG
+ movdq\a MSG, (\i * 0x10)(DST)
+
+ aegis128_update
+ pxor MSG, \s4
+
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Ldec_out_\i
+.endm
+
+/*
+ * void crypto_aegis128_aesni_dec(void *state, unsigned int length,
+ * const void *src, void *dst);
+ */
+ENTRY(crypto_aegis128_aesni_dec)
+ FRAME_BEGIN
+
+ cmp $0x10, LEN
+ jb .Ldec_out
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ mov SRC, %r8
+ or DST, %r8
+ and $0xF, %r8
+ jnz .Ldec_u_loop
+
+.align 8
+.Ldec_a_loop:
+ decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
+ decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
+ decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
+ decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
+ decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+ add $0x50, SRC
+ add $0x50, DST
+ jmp .Ldec_a_loop
+
+.align 8
+.Ldec_u_loop:
+ decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
+ decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
+ decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
+ decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
+ decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4
+
+ add $0x50, SRC
+ add $0x50, DST
+ jmp .Ldec_u_loop
+
+ /* store the state: */
+.Ldec_out_0:
+ movdqu STATE4, 0x00(STATEP)
+ movdqu STATE0, 0x10(STATEP)
+ movdqu STATE1, 0x20(STATEP)
+ movdqu STATE2, 0x30(STATEP)
+ movdqu STATE3, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Ldec_out_1:
+ movdqu STATE3, 0x00(STATEP)
+ movdqu STATE4, 0x10(STATEP)
+ movdqu STATE0, 0x20(STATEP)
+ movdqu STATE1, 0x30(STATEP)
+ movdqu STATE2, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Ldec_out_2:
+ movdqu STATE2, 0x00(STATEP)
+ movdqu STATE3, 0x10(STATEP)
+ movdqu STATE4, 0x20(STATEP)
+ movdqu STATE0, 0x30(STATEP)
+ movdqu STATE1, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Ldec_out_3:
+ movdqu STATE1, 0x00(STATEP)
+ movdqu STATE2, 0x10(STATEP)
+ movdqu STATE3, 0x20(STATEP)
+ movdqu STATE4, 0x30(STATEP)
+ movdqu STATE0, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Ldec_out_4:
+ movdqu STATE0, 0x00(STATEP)
+ movdqu STATE1, 0x10(STATEP)
+ movdqu STATE2, 0x20(STATEP)
+ movdqu STATE3, 0x30(STATEP)
+ movdqu STATE4, 0x40(STATEP)
+ FRAME_END
+ ret
+
+.Ldec_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128_aesni_dec)
+
+/*
+ * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
+ * const void *src, void *dst);
+ */
+ENTRY(crypto_aegis128_aesni_dec_tail)
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ /* decrypt message: */
+ call __load_partial
+
+ pxor STATE1, MSG
+ pxor STATE4, MSG
+ movdqa STATE2, T1
+ pand STATE3, T1
+ pxor T1, MSG
+
+ movdqa MSG, T0
+ call __store_partial
+
+ /* mask with byte count: */
+ movq LEN, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ movdqa .Laegis128_counter, T1
+ pcmpgtb T1, T0
+ pand T0, MSG
+
+ aegis128_update
+ pxor MSG, STATE4
+
+ /* store the state: */
+ movdqu STATE4, 0x00(STATEP)
+ movdqu STATE0, 0x10(STATEP)
+ movdqu STATE1, 0x20(STATEP)
+ movdqu STATE2, 0x30(STATEP)
+ movdqu STATE3, 0x40(STATEP)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128_aesni_dec_tail)
+
+/*
+ * void crypto_aegis128_aesni_final(void *state, void *tag_xor,
+ * u64 assoclen, u64 cryptlen);
+ */
+ENTRY(crypto_aegis128_aesni_final)
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+
+ /* prepare length block: */
+ movq %rdx, MSG
+ movq %rcx, T0
+ pslldq $8, T0
+ pxor T0, MSG
+ psllq $3, MSG /* multiply by 8 (to get bit count) */
+
+ pxor STATE3, MSG
+
+ /* update state: */
+ aegis128_update; pxor MSG, STATE4
+ aegis128_update; pxor MSG, STATE3
+ aegis128_update; pxor MSG, STATE2
+ aegis128_update; pxor MSG, STATE1
+ aegis128_update; pxor MSG, STATE0
+ aegis128_update; pxor MSG, STATE4
+ aegis128_update; pxor MSG, STATE3
+
+ /* xor tag: */
+ movdqu (%rsi), MSG
+
+ pxor STATE0, MSG
+ pxor STATE1, MSG
+ pxor STATE2, MSG
+ pxor STATE3, MSG
+ pxor STATE4, MSG
+
+ movdqu MSG, (%rsi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128_aesni_final)
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
new file mode 100644
index 000000000..3ea71b871
--- /dev/null
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -0,0 +1,394 @@
+/*
+ * The AEGIS-128 Authenticated-Encryption Algorithm
+ * Glue for AES-NI + SSE2 implementation
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/cryptd.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+#define AEGIS128_BLOCK_ALIGN 16
+#define AEGIS128_BLOCK_SIZE 16
+#define AEGIS128_NONCE_SIZE 16
+#define AEGIS128_STATE_BLOCKS 5
+#define AEGIS128_KEY_SIZE 16
+#define AEGIS128_MIN_AUTH_SIZE 8
+#define AEGIS128_MAX_AUTH_SIZE 16
+
+asmlinkage void crypto_aegis128_aesni_init(void *state, void *key, void *iv);
+
+asmlinkage void crypto_aegis128_aesni_ad(
+ void *state, unsigned int length, const void *data);
+
+asmlinkage void crypto_aegis128_aesni_enc(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128_aesni_dec(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128_aesni_enc_tail(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128_aesni_dec_tail(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128_aesni_final(
+ void *state, void *tag_xor, unsigned int cryptlen,
+ unsigned int assoclen);
+
+struct aegis_block {
+ u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN);
+};
+
+struct aegis_state {
+ struct aegis_block blocks[AEGIS128_STATE_BLOCKS];
+};
+
+struct aegis_ctx {
+ struct aegis_block key;
+};
+
+struct aegis_crypt_ops {
+ int (*skcipher_walk_init)(struct skcipher_walk *walk,
+ struct aead_request *req, bool atomic);
+
+ void (*crypt_blocks)(void *state, unsigned int length, const void *src,
+ void *dst);
+ void (*crypt_tail)(void *state, unsigned int length, const void *src,
+ void *dst);
+};
+
+static void crypto_aegis128_aesni_process_ad(
+ struct aegis_state *state, struct scatterlist *sg_src,
+ unsigned int assoclen)
+{
+ struct scatter_walk walk;
+ struct aegis_block buf;
+ unsigned int pos = 0;
+
+ scatterwalk_start(&walk, sg_src);
+ while (assoclen != 0) {
+ unsigned int size = scatterwalk_clamp(&walk, assoclen);
+ unsigned int left = size;
+ void *mapped = scatterwalk_map(&walk);
+ const u8 *src = (const u8 *)mapped;
+
+ if (pos + size >= AEGIS128_BLOCK_SIZE) {
+ if (pos > 0) {
+ unsigned int fill = AEGIS128_BLOCK_SIZE - pos;
+ memcpy(buf.bytes + pos, src, fill);
+ crypto_aegis128_aesni_ad(state,
+ AEGIS128_BLOCK_SIZE,
+ buf.bytes);
+ pos = 0;
+ left -= fill;
+ src += fill;
+ }
+
+ crypto_aegis128_aesni_ad(state, left, src);
+
+ src += left & ~(AEGIS128_BLOCK_SIZE - 1);
+ left &= AEGIS128_BLOCK_SIZE - 1;
+ }
+
+ memcpy(buf.bytes + pos, src, left);
+ pos += left;
+ assoclen -= size;
+
+ scatterwalk_unmap(mapped);
+ scatterwalk_advance(&walk, size);
+ scatterwalk_done(&walk, 0, assoclen);
+ }
+
+ if (pos > 0) {
+ memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos);
+ crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes);
+ }
+}
+
+static void crypto_aegis128_aesni_process_crypt(
+ struct aegis_state *state, struct skcipher_walk *walk,
+ const struct aegis_crypt_ops *ops)
+{
+ while (walk->nbytes >= AEGIS128_BLOCK_SIZE) {
+ ops->crypt_blocks(state,
+ round_down(walk->nbytes, AEGIS128_BLOCK_SIZE),
+ walk->src.virt.addr, walk->dst.virt.addr);
+ skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE);
+ }
+
+ if (walk->nbytes) {
+ ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr,
+ walk->dst.virt.addr);
+ skcipher_walk_done(walk, 0);
+ }
+}
+
+static struct aegis_ctx *crypto_aegis128_aesni_ctx(struct crypto_aead *aead)
+{
+ u8 *ctx = crypto_aead_ctx(aead);
+ ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx));
+ return (void *)ctx;
+}
+
+static int crypto_aegis128_aesni_setkey(struct crypto_aead *aead, const u8 *key,
+ unsigned int keylen)
+{
+ struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(aead);
+
+ if (keylen != AEGIS128_KEY_SIZE) {
+ crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+
+ memcpy(ctx->key.bytes, key, AEGIS128_KEY_SIZE);
+
+ return 0;
+}
+
+static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm,
+ unsigned int authsize)
+{
+ if (authsize > AEGIS128_MAX_AUTH_SIZE)
+ return -EINVAL;
+ if (authsize < AEGIS128_MIN_AUTH_SIZE)
+ return -EINVAL;
+ return 0;
+}
+
+static void crypto_aegis128_aesni_crypt(struct aead_request *req,
+ struct aegis_block *tag_xor,
+ unsigned int cryptlen,
+ const struct aegis_crypt_ops *ops)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm);
+ struct skcipher_walk walk;
+ struct aegis_state state;
+
+ ops->skcipher_walk_init(&walk, req, true);
+
+ kernel_fpu_begin();
+
+ crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv);
+ crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen);
+ crypto_aegis128_aesni_process_crypt(&state, &walk, ops);
+ crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
+
+ kernel_fpu_end();
+}
+
+static int crypto_aegis128_aesni_encrypt(struct aead_request *req)
+{
+ static const struct aegis_crypt_ops OPS = {
+ .skcipher_walk_init = skcipher_walk_aead_encrypt,
+ .crypt_blocks = crypto_aegis128_aesni_enc,
+ .crypt_tail = crypto_aegis128_aesni_enc_tail,
+ };
+
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aegis_block tag = {};
+ unsigned int authsize = crypto_aead_authsize(tfm);
+ unsigned int cryptlen = req->cryptlen;
+
+ crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS);
+
+ scatterwalk_map_and_copy(tag.bytes, req->dst,
+ req->assoclen + cryptlen, authsize, 1);
+ return 0;
+}
+
+static int crypto_aegis128_aesni_decrypt(struct aead_request *req)
+{
+ static const struct aegis_block zeros = {};
+
+ static const struct aegis_crypt_ops OPS = {
+ .skcipher_walk_init = skcipher_walk_aead_decrypt,
+ .crypt_blocks = crypto_aegis128_aesni_dec,
+ .crypt_tail = crypto_aegis128_aesni_dec_tail,
+ };
+
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aegis_block tag;
+ unsigned int authsize = crypto_aead_authsize(tfm);
+ unsigned int cryptlen = req->cryptlen - authsize;
+
+ scatterwalk_map_and_copy(tag.bytes, req->src,
+ req->assoclen + cryptlen, authsize, 0);
+
+ crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS);
+
+ return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0;
+}
+
+static int crypto_aegis128_aesni_init_tfm(struct crypto_aead *aead)
+{
+ return 0;
+}
+
+static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead)
+{
+}
+
+static int cryptd_aegis128_aesni_setkey(struct crypto_aead *aead,
+ const u8 *key, unsigned int keylen)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ return crypto_aead_setkey(&cryptd_tfm->base, key, keylen);
+}
+
+static int cryptd_aegis128_aesni_setauthsize(struct crypto_aead *aead,
+ unsigned int authsize)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ return crypto_aead_setauthsize(&cryptd_tfm->base, authsize);
+}
+
+static int cryptd_aegis128_aesni_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ aead = &cryptd_tfm->base;
+ if (irq_fpu_usable() && (!in_atomic() ||
+ !cryptd_aead_queued(cryptd_tfm)))
+ aead = cryptd_aead_child(cryptd_tfm);
+
+ aead_request_set_tfm(req, aead);
+
+ return crypto_aead_encrypt(req);
+}
+
+static int cryptd_aegis128_aesni_decrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ aead = &cryptd_tfm->base;
+ if (irq_fpu_usable() && (!in_atomic() ||
+ !cryptd_aead_queued(cryptd_tfm)))
+ aead = cryptd_aead_child(cryptd_tfm);
+
+ aead_request_set_tfm(req, aead);
+
+ return crypto_aead_decrypt(req);
+}
+
+static int cryptd_aegis128_aesni_init_tfm(struct crypto_aead *aead)
+{
+ struct cryptd_aead *cryptd_tfm;
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+ cryptd_tfm = cryptd_alloc_aead("__aegis128-aesni", CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+
+ *ctx = cryptd_tfm;
+ crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
+ return 0;
+}
+
+static void cryptd_aegis128_aesni_exit_tfm(struct crypto_aead *aead)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+ cryptd_free_aead(*ctx);
+}
+
+static struct aead_alg crypto_aegis128_aesni_alg[] = {
+ {
+ .setkey = crypto_aegis128_aesni_setkey,
+ .setauthsize = crypto_aegis128_aesni_setauthsize,
+ .encrypt = crypto_aegis128_aesni_encrypt,
+ .decrypt = crypto_aegis128_aesni_decrypt,
+ .init = crypto_aegis128_aesni_init_tfm,
+ .exit = crypto_aegis128_aesni_exit_tfm,
+
+ .ivsize = AEGIS128_NONCE_SIZE,
+ .maxauthsize = AEGIS128_MAX_AUTH_SIZE,
+ .chunksize = AEGIS128_BLOCK_SIZE,
+
+ .base = {
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct aegis_ctx) +
+ __alignof__(struct aegis_ctx),
+ .cra_alignmask = 0,
+
+ .cra_name = "__aegis128",
+ .cra_driver_name = "__aegis128-aesni",
+
+ .cra_module = THIS_MODULE,
+ }
+ }, {
+ .setkey = cryptd_aegis128_aesni_setkey,
+ .setauthsize = cryptd_aegis128_aesni_setauthsize,
+ .encrypt = cryptd_aegis128_aesni_encrypt,
+ .decrypt = cryptd_aegis128_aesni_decrypt,
+ .init = cryptd_aegis128_aesni_init_tfm,
+ .exit = cryptd_aegis128_aesni_exit_tfm,
+
+ .ivsize = AEGIS128_NONCE_SIZE,
+ .maxauthsize = AEGIS128_MAX_AUTH_SIZE,
+ .chunksize = AEGIS128_BLOCK_SIZE,
+
+ .base = {
+ .cra_flags = CRYPTO_ALG_ASYNC,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct cryptd_aead *),
+ .cra_alignmask = 0,
+
+ .cra_priority = 400,
+
+ .cra_name = "aegis128",
+ .cra_driver_name = "aegis128-aesni",
+
+ .cra_module = THIS_MODULE,
+ }
+ }
+};
+
+static int __init crypto_aegis128_aesni_module_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XMM2) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
+ return -ENODEV;
+
+ return crypto_register_aeads(crypto_aegis128_aesni_alg,
+ ARRAY_SIZE(crypto_aegis128_aesni_alg));
+}
+
+static void __exit crypto_aegis128_aesni_module_exit(void)
+{
+ crypto_unregister_aeads(crypto_aegis128_aesni_alg,
+ ARRAY_SIZE(crypto_aegis128_aesni_alg));
+}
+
+module_init(crypto_aegis128_aesni_module_init);
+module_exit(crypto_aegis128_aesni_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE2 implementation");
+MODULE_ALIAS_CRYPTO("aegis128");
+MODULE_ALIAS_CRYPTO("aegis128-aesni");
diff --git a/arch/x86/crypto/aegis128l-aesni-asm.S b/arch/x86/crypto/aegis128l-aesni-asm.S
new file mode 100644
index 000000000..491dd61c8
--- /dev/null
+++ b/arch/x86/crypto/aegis128l-aesni-asm.S
@@ -0,0 +1,826 @@
+/*
+ * AES-NI + SSE2 implementation of AEGIS-128L
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define STATE0 %xmm0
+#define STATE1 %xmm1
+#define STATE2 %xmm2
+#define STATE3 %xmm3
+#define STATE4 %xmm4
+#define STATE5 %xmm5
+#define STATE6 %xmm6
+#define STATE7 %xmm7
+#define MSG0 %xmm8
+#define MSG1 %xmm9
+#define T0 %xmm10
+#define T1 %xmm11
+#define T2 %xmm12
+#define T3 %xmm13
+
+#define STATEP %rdi
+#define LEN %rsi
+#define SRC %rdx
+#define DST %rcx
+
+.section .rodata.cst16.aegis128l_const, "aM", @progbits, 32
+.align 16
+.Laegis128l_const_0:
+ .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+ .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Laegis128l_const_1:
+ .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+ .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+.section .rodata.cst16.aegis128l_counter, "aM", @progbits, 16
+.align 16
+.Laegis128l_counter0:
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+.Laegis128l_counter1:
+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+
+.text
+
+/*
+ * __load_partial: internal ABI
+ * input:
+ * LEN - bytes
+ * SRC - src
+ * output:
+ * MSG0 - first message block
+ * MSG1 - second message block
+ * changed:
+ * T0
+ * %r8
+ * %r9
+ */
+__load_partial:
+ xor %r9d, %r9d
+ pxor MSG0, MSG0
+ pxor MSG1, MSG1
+
+ mov LEN, %r8
+ and $0x1, %r8
+ jz .Lld_partial_1
+
+ mov LEN, %r8
+ and $0x1E, %r8
+ add SRC, %r8
+ mov (%r8), %r9b
+
+.Lld_partial_1:
+ mov LEN, %r8
+ and $0x2, %r8
+ jz .Lld_partial_2
+
+ mov LEN, %r8
+ and $0x1C, %r8
+ add SRC, %r8
+ shl $0x10, %r9
+ mov (%r8), %r9w
+
+.Lld_partial_2:
+ mov LEN, %r8
+ and $0x4, %r8
+ jz .Lld_partial_4
+
+ mov LEN, %r8
+ and $0x18, %r8
+ add SRC, %r8
+ shl $32, %r9
+ mov (%r8), %r8d
+ xor %r8, %r9
+
+.Lld_partial_4:
+ movq %r9, MSG0
+
+ mov LEN, %r8
+ and $0x8, %r8
+ jz .Lld_partial_8
+
+ mov LEN, %r8
+ and $0x10, %r8
+ add SRC, %r8
+ pslldq $8, MSG0
+ movq (%r8), T0
+ pxor T0, MSG0
+
+.Lld_partial_8:
+ mov LEN, %r8
+ and $0x10, %r8
+ jz .Lld_partial_16
+
+ movdqa MSG0, MSG1
+ movdqu (SRC), MSG0
+
+.Lld_partial_16:
+ ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ * input:
+ * LEN - bytes
+ * DST - dst
+ * output:
+ * T0 - first message block
+ * T1 - second message block
+ * changed:
+ * %r8
+ * %r9
+ * %r10
+ */
+__store_partial:
+ mov LEN, %r8
+ mov DST, %r9
+
+ cmp $16, %r8
+ jl .Lst_partial_16
+
+ movdqu T0, (%r9)
+ movdqa T1, T0
+
+ sub $16, %r8
+ add $16, %r9
+
+.Lst_partial_16:
+ movq T0, %r10
+
+ cmp $8, %r8
+ jl .Lst_partial_8
+
+ mov %r10, (%r9)
+ psrldq $8, T0
+ movq T0, %r10
+
+ sub $8, %r8
+ add $8, %r9
+
+.Lst_partial_8:
+ cmp $4, %r8
+ jl .Lst_partial_4
+
+ mov %r10d, (%r9)
+ shr $32, %r10
+
+ sub $4, %r8
+ add $4, %r9
+
+.Lst_partial_4:
+ cmp $2, %r8
+ jl .Lst_partial_2
+
+ mov %r10w, (%r9)
+ shr $0x10, %r10
+
+ sub $2, %r8
+ add $2, %r9
+
+.Lst_partial_2:
+ cmp $1, %r8
+ jl .Lst_partial_1
+
+ mov %r10b, (%r9)
+
+.Lst_partial_1:
+ ret
+ENDPROC(__store_partial)
+
+.macro update
+ movdqa STATE7, T0
+ aesenc STATE0, STATE7
+ aesenc STATE1, STATE0
+ aesenc STATE2, STATE1
+ aesenc STATE3, STATE2
+ aesenc STATE4, STATE3
+ aesenc STATE5, STATE4
+ aesenc STATE6, STATE5
+ aesenc T0, STATE6
+.endm
+
+.macro update0
+ update
+ pxor MSG0, STATE7
+ pxor MSG1, STATE3
+.endm
+
+.macro update1
+ update
+ pxor MSG0, STATE6
+ pxor MSG1, STATE2
+.endm
+
+.macro update2
+ update
+ pxor MSG0, STATE5
+ pxor MSG1, STATE1
+.endm
+
+.macro update3
+ update
+ pxor MSG0, STATE4
+ pxor MSG1, STATE0
+.endm
+
+.macro update4
+ update
+ pxor MSG0, STATE3
+ pxor MSG1, STATE7
+.endm
+
+.macro update5
+ update
+ pxor MSG0, STATE2
+ pxor MSG1, STATE6
+.endm
+
+.macro update6
+ update
+ pxor MSG0, STATE1
+ pxor MSG1, STATE5
+.endm
+
+.macro update7
+ update
+ pxor MSG0, STATE0
+ pxor MSG1, STATE4
+.endm
+
+.macro state_load
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+ movdqu 0x50(STATEP), STATE5
+ movdqu 0x60(STATEP), STATE6
+ movdqu 0x70(STATEP), STATE7
+.endm
+
+.macro state_store s0 s1 s2 s3 s4 s5 s6 s7
+ movdqu \s7, 0x00(STATEP)
+ movdqu \s0, 0x10(STATEP)
+ movdqu \s1, 0x20(STATEP)
+ movdqu \s2, 0x30(STATEP)
+ movdqu \s3, 0x40(STATEP)
+ movdqu \s4, 0x50(STATEP)
+ movdqu \s5, 0x60(STATEP)
+ movdqu \s6, 0x70(STATEP)
+.endm
+
+.macro state_store0
+ state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7
+.endm
+
+.macro state_store1
+ state_store STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6
+.endm
+
+.macro state_store2
+ state_store STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
+.endm
+
+.macro state_store3
+ state_store STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4
+.endm
+
+.macro state_store4
+ state_store STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3
+.endm
+
+.macro state_store5
+ state_store STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2
+.endm
+
+.macro state_store6
+ state_store STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1
+.endm
+
+.macro state_store7
+ state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0
+.endm
+
+/*
+ * void crypto_aegis128l_aesni_init(void *state, const void *key, const void *iv);
+ */
+ENTRY(crypto_aegis128l_aesni_init)
+ FRAME_BEGIN
+
+ /* load key: */
+ movdqa (%rsi), MSG1
+ movdqa MSG1, STATE0
+ movdqa MSG1, STATE4
+ movdqa MSG1, STATE5
+ movdqa MSG1, STATE6
+ movdqa MSG1, STATE7
+
+ /* load IV: */
+ movdqu (%rdx), MSG0
+ pxor MSG0, STATE0
+ pxor MSG0, STATE4
+
+ /* load the constants: */
+ movdqa .Laegis128l_const_0, STATE2
+ movdqa .Laegis128l_const_1, STATE1
+ movdqa STATE1, STATE3
+ pxor STATE2, STATE5
+ pxor STATE1, STATE6
+ pxor STATE2, STATE7
+
+ /* update 10 times with IV and KEY: */
+ update0
+ update1
+ update2
+ update3
+ update4
+ update5
+ update6
+ update7
+ update0
+ update1
+
+ state_store1
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128l_aesni_init)
+
+.macro ad_block a i
+ movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
+ movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
+ update\i
+ sub $0x20, LEN
+ cmp $0x20, LEN
+ jl .Lad_out_\i
+.endm
+
+/*
+ * void crypto_aegis128l_aesni_ad(void *state, unsigned int length,
+ * const void *data);
+ */
+ENTRY(crypto_aegis128l_aesni_ad)
+ FRAME_BEGIN
+
+ cmp $0x20, LEN
+ jb .Lad_out
+
+ state_load
+
+ mov SRC, %r8
+ and $0xf, %r8
+ jnz .Lad_u_loop
+
+.align 8
+.Lad_a_loop:
+ ad_block a 0
+ ad_block a 1
+ ad_block a 2
+ ad_block a 3
+ ad_block a 4
+ ad_block a 5
+ ad_block a 6
+ ad_block a 7
+
+ add $0x100, SRC
+ jmp .Lad_a_loop
+
+.align 8
+.Lad_u_loop:
+ ad_block u 0
+ ad_block u 1
+ ad_block u 2
+ ad_block u 3
+ ad_block u 4
+ ad_block u 5
+ ad_block u 6
+ ad_block u 7
+
+ add $0x100, SRC
+ jmp .Lad_u_loop
+
+.Lad_out_0:
+ state_store0
+ FRAME_END
+ ret
+
+.Lad_out_1:
+ state_store1
+ FRAME_END
+ ret
+
+.Lad_out_2:
+ state_store2
+ FRAME_END
+ ret
+
+.Lad_out_3:
+ state_store3
+ FRAME_END
+ ret
+
+.Lad_out_4:
+ state_store4
+ FRAME_END
+ ret
+
+.Lad_out_5:
+ state_store5
+ FRAME_END
+ ret
+
+.Lad_out_6:
+ state_store6
+ FRAME_END
+ ret
+
+.Lad_out_7:
+ state_store7
+ FRAME_END
+ ret
+
+.Lad_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128l_aesni_ad)
+
+.macro crypt m0 m1 s0 s1 s2 s3 s4 s5 s6 s7
+ pxor \s1, \m0
+ pxor \s6, \m0
+ movdqa \s2, T3
+ pand \s3, T3
+ pxor T3, \m0
+
+ pxor \s2, \m1
+ pxor \s5, \m1
+ movdqa \s6, T3
+ pand \s7, T3
+ pxor T3, \m1
+.endm
+
+.macro crypt0 m0 m1
+ crypt \m0 \m1 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7
+.endm
+
+.macro crypt1 m0 m1
+ crypt \m0 \m1 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6
+.endm
+
+.macro crypt2 m0 m1
+ crypt \m0 \m1 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
+.endm
+
+.macro crypt3 m0 m1
+ crypt \m0 \m1 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4
+.endm
+
+.macro crypt4 m0 m1
+ crypt \m0 \m1 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3
+.endm
+
+.macro crypt5 m0 m1
+ crypt \m0 \m1 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2
+.endm
+
+.macro crypt6 m0 m1
+ crypt \m0 \m1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1
+.endm
+
+.macro crypt7 m0 m1
+ crypt \m0 \m1 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0
+.endm
+
+.macro encrypt_block a i
+ movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
+ movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
+ movdqa MSG0, T0
+ movdqa MSG1, T1
+ crypt\i T0, T1
+ movdq\a T0, (\i * 0x20 + 0x00)(DST)
+ movdq\a T1, (\i * 0x20 + 0x10)(DST)
+
+ update\i
+
+ sub $0x20, LEN
+ cmp $0x20, LEN
+ jl .Lenc_out_\i
+.endm
+
+.macro decrypt_block a i
+ movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
+ movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
+ crypt\i MSG0, MSG1
+ movdq\a MSG0, (\i * 0x20 + 0x00)(DST)
+ movdq\a MSG1, (\i * 0x20 + 0x10)(DST)
+
+ update\i
+
+ sub $0x20, LEN
+ cmp $0x20, LEN
+ jl .Ldec_out_\i
+.endm
+
+/*
+ * void crypto_aegis128l_aesni_enc(void *state, unsigned int length,
+ * const void *src, void *dst);
+ */
+ENTRY(crypto_aegis128l_aesni_enc)
+ FRAME_BEGIN
+
+ cmp $0x20, LEN
+ jb .Lenc_out
+
+ state_load
+
+ mov SRC, %r8
+ or DST, %r8
+ and $0xf, %r8
+ jnz .Lenc_u_loop
+
+.align 8
+.Lenc_a_loop:
+ encrypt_block a 0
+ encrypt_block a 1
+ encrypt_block a 2
+ encrypt_block a 3
+ encrypt_block a 4
+ encrypt_block a 5
+ encrypt_block a 6
+ encrypt_block a 7
+
+ add $0x100, SRC
+ add $0x100, DST
+ jmp .Lenc_a_loop
+
+.align 8
+.Lenc_u_loop:
+ encrypt_block u 0
+ encrypt_block u 1
+ encrypt_block u 2
+ encrypt_block u 3
+ encrypt_block u 4
+ encrypt_block u 5
+ encrypt_block u 6
+ encrypt_block u 7
+
+ add $0x100, SRC
+ add $0x100, DST
+ jmp .Lenc_u_loop
+
+.Lenc_out_0:
+ state_store0
+ FRAME_END
+ ret
+
+.Lenc_out_1:
+ state_store1
+ FRAME_END
+ ret
+
+.Lenc_out_2:
+ state_store2
+ FRAME_END
+ ret
+
+.Lenc_out_3:
+ state_store3
+ FRAME_END
+ ret
+
+.Lenc_out_4:
+ state_store4
+ FRAME_END
+ ret
+
+.Lenc_out_5:
+ state_store5
+ FRAME_END
+ ret
+
+.Lenc_out_6:
+ state_store6
+ FRAME_END
+ ret
+
+.Lenc_out_7:
+ state_store7
+ FRAME_END
+ ret
+
+.Lenc_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128l_aesni_enc)
+
+/*
+ * void crypto_aegis128l_aesni_enc_tail(void *state, unsigned int length,
+ * const void *src, void *dst);
+ */
+ENTRY(crypto_aegis128l_aesni_enc_tail)
+ FRAME_BEGIN
+
+ state_load
+
+ /* encrypt message: */
+ call __load_partial
+
+ movdqa MSG0, T0
+ movdqa MSG1, T1
+ crypt0 T0, T1
+
+ call __store_partial
+
+ update0
+
+ state_store0
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128l_aesni_enc_tail)
+
+/*
+ * void crypto_aegis128l_aesni_dec(void *state, unsigned int length,
+ * const void *src, void *dst);
+ */
+ENTRY(crypto_aegis128l_aesni_dec)
+ FRAME_BEGIN
+
+ cmp $0x20, LEN
+ jb .Ldec_out
+
+ state_load
+
+ mov SRC, %r8
+ or DST, %r8
+ and $0xF, %r8
+ jnz .Ldec_u_loop
+
+.align 8
+.Ldec_a_loop:
+ decrypt_block a 0
+ decrypt_block a 1
+ decrypt_block a 2
+ decrypt_block a 3
+ decrypt_block a 4
+ decrypt_block a 5
+ decrypt_block a 6
+ decrypt_block a 7
+
+ add $0x100, SRC
+ add $0x100, DST
+ jmp .Ldec_a_loop
+
+.align 8
+.Ldec_u_loop:
+ decrypt_block u 0
+ decrypt_block u 1
+ decrypt_block u 2
+ decrypt_block u 3
+ decrypt_block u 4
+ decrypt_block u 5
+ decrypt_block u 6
+ decrypt_block u 7
+
+ add $0x100, SRC
+ add $0x100, DST
+ jmp .Ldec_u_loop
+
+.Ldec_out_0:
+ state_store0
+ FRAME_END
+ ret
+
+.Ldec_out_1:
+ state_store1
+ FRAME_END
+ ret
+
+.Ldec_out_2:
+ state_store2
+ FRAME_END
+ ret
+
+.Ldec_out_3:
+ state_store3
+ FRAME_END
+ ret
+
+.Ldec_out_4:
+ state_store4
+ FRAME_END
+ ret
+
+.Ldec_out_5:
+ state_store5
+ FRAME_END
+ ret
+
+.Ldec_out_6:
+ state_store6
+ FRAME_END
+ ret
+
+.Ldec_out_7:
+ state_store7
+ FRAME_END
+ ret
+
+.Ldec_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128l_aesni_dec)
+
+/*
+ * void crypto_aegis128l_aesni_dec_tail(void *state, unsigned int length,
+ * const void *src, void *dst);
+ */
+ENTRY(crypto_aegis128l_aesni_dec_tail)
+ FRAME_BEGIN
+
+ state_load
+
+ /* decrypt message: */
+ call __load_partial
+
+ crypt0 MSG0, MSG1
+
+ movdqa MSG0, T0
+ movdqa MSG1, T1
+ call __store_partial
+
+ /* mask with byte count: */
+ movq LEN, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ movdqa T0, T1
+ movdqa .Laegis128l_counter0, T2
+ movdqa .Laegis128l_counter1, T3
+ pcmpgtb T2, T0
+ pcmpgtb T3, T1
+ pand T0, MSG0
+ pand T1, MSG1
+
+ update0
+
+ state_store0
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128l_aesni_dec_tail)
+
+/*
+ * void crypto_aegis128l_aesni_final(void *state, void *tag_xor,
+ * u64 assoclen, u64 cryptlen);
+ */
+ENTRY(crypto_aegis128l_aesni_final)
+ FRAME_BEGIN
+
+ state_load
+
+ /* prepare length block: */
+ movq %rdx, MSG0
+ movq %rcx, T0
+ pslldq $8, T0
+ pxor T0, MSG0
+ psllq $3, MSG0 /* multiply by 8 (to get bit count) */
+
+ pxor STATE2, MSG0
+ movdqa MSG0, MSG1
+
+ /* update state: */
+ update0
+ update1
+ update2
+ update3
+ update4
+ update5
+ update6
+
+ /* xor tag: */
+ movdqu (%rsi), T0
+
+ pxor STATE1, T0
+ pxor STATE2, T0
+ pxor STATE3, T0
+ pxor STATE4, T0
+ pxor STATE5, T0
+ pxor STATE6, T0
+ pxor STATE7, T0
+
+ movdqu T0, (%rsi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis128l_aesni_final)
diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c
new file mode 100644
index 000000000..1b1b39c66
--- /dev/null
+++ b/arch/x86/crypto/aegis128l-aesni-glue.c
@@ -0,0 +1,394 @@
+/*
+ * The AEGIS-128L Authenticated-Encryption Algorithm
+ * Glue for AES-NI + SSE2 implementation
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/cryptd.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+#define AEGIS128L_BLOCK_ALIGN 16
+#define AEGIS128L_BLOCK_SIZE 32
+#define AEGIS128L_NONCE_SIZE 16
+#define AEGIS128L_STATE_BLOCKS 8
+#define AEGIS128L_KEY_SIZE 16
+#define AEGIS128L_MIN_AUTH_SIZE 8
+#define AEGIS128L_MAX_AUTH_SIZE 16
+
+asmlinkage void crypto_aegis128l_aesni_init(void *state, void *key, void *iv);
+
+asmlinkage void crypto_aegis128l_aesni_ad(
+ void *state, unsigned int length, const void *data);
+
+asmlinkage void crypto_aegis128l_aesni_enc(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128l_aesni_dec(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128l_aesni_enc_tail(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128l_aesni_dec_tail(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis128l_aesni_final(
+ void *state, void *tag_xor, unsigned int cryptlen,
+ unsigned int assoclen);
+
+struct aegis_block {
+ u8 bytes[AEGIS128L_BLOCK_SIZE] __aligned(AEGIS128L_BLOCK_ALIGN);
+};
+
+struct aegis_state {
+ struct aegis_block blocks[AEGIS128L_STATE_BLOCKS];
+};
+
+struct aegis_ctx {
+ struct aegis_block key;
+};
+
+struct aegis_crypt_ops {
+ int (*skcipher_walk_init)(struct skcipher_walk *walk,
+ struct aead_request *req, bool atomic);
+
+ void (*crypt_blocks)(void *state, unsigned int length, const void *src,
+ void *dst);
+ void (*crypt_tail)(void *state, unsigned int length, const void *src,
+ void *dst);
+};
+
+static void crypto_aegis128l_aesni_process_ad(
+ struct aegis_state *state, struct scatterlist *sg_src,
+ unsigned int assoclen)
+{
+ struct scatter_walk walk;
+ struct aegis_block buf;
+ unsigned int pos = 0;
+
+ scatterwalk_start(&walk, sg_src);
+ while (assoclen != 0) {
+ unsigned int size = scatterwalk_clamp(&walk, assoclen);
+ unsigned int left = size;
+ void *mapped = scatterwalk_map(&walk);
+ const u8 *src = (const u8 *)mapped;
+
+ if (pos + size >= AEGIS128L_BLOCK_SIZE) {
+ if (pos > 0) {
+ unsigned int fill = AEGIS128L_BLOCK_SIZE - pos;
+ memcpy(buf.bytes + pos, src, fill);
+ crypto_aegis128l_aesni_ad(state,
+ AEGIS128L_BLOCK_SIZE,
+ buf.bytes);
+ pos = 0;
+ left -= fill;
+ src += fill;
+ }
+
+ crypto_aegis128l_aesni_ad(state, left, src);
+
+ src += left & ~(AEGIS128L_BLOCK_SIZE - 1);
+ left &= AEGIS128L_BLOCK_SIZE - 1;
+ }
+
+ memcpy(buf.bytes + pos, src, left);
+ pos += left;
+ assoclen -= size;
+
+ scatterwalk_unmap(mapped);
+ scatterwalk_advance(&walk, size);
+ scatterwalk_done(&walk, 0, assoclen);
+ }
+
+ if (pos > 0) {
+ memset(buf.bytes + pos, 0, AEGIS128L_BLOCK_SIZE - pos);
+ crypto_aegis128l_aesni_ad(state, AEGIS128L_BLOCK_SIZE, buf.bytes);
+ }
+}
+
+static void crypto_aegis128l_aesni_process_crypt(
+ struct aegis_state *state, struct skcipher_walk *walk,
+ const struct aegis_crypt_ops *ops)
+{
+ while (walk->nbytes >= AEGIS128L_BLOCK_SIZE) {
+ ops->crypt_blocks(state, round_down(walk->nbytes,
+ AEGIS128L_BLOCK_SIZE),
+ walk->src.virt.addr, walk->dst.virt.addr);
+ skcipher_walk_done(walk, walk->nbytes % AEGIS128L_BLOCK_SIZE);
+ }
+
+ if (walk->nbytes) {
+ ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr,
+ walk->dst.virt.addr);
+ skcipher_walk_done(walk, 0);
+ }
+}
+
+static struct aegis_ctx *crypto_aegis128l_aesni_ctx(struct crypto_aead *aead)
+{
+ u8 *ctx = crypto_aead_ctx(aead);
+ ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx));
+ return (void *)ctx;
+}
+
+static int crypto_aegis128l_aesni_setkey(struct crypto_aead *aead,
+ const u8 *key, unsigned int keylen)
+{
+ struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(aead);
+
+ if (keylen != AEGIS128L_KEY_SIZE) {
+ crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+
+ memcpy(ctx->key.bytes, key, AEGIS128L_KEY_SIZE);
+
+ return 0;
+}
+
+static int crypto_aegis128l_aesni_setauthsize(struct crypto_aead *tfm,
+ unsigned int authsize)
+{
+ if (authsize > AEGIS128L_MAX_AUTH_SIZE)
+ return -EINVAL;
+ if (authsize < AEGIS128L_MIN_AUTH_SIZE)
+ return -EINVAL;
+ return 0;
+}
+
+static void crypto_aegis128l_aesni_crypt(struct aead_request *req,
+ struct aegis_block *tag_xor,
+ unsigned int cryptlen,
+ const struct aegis_crypt_ops *ops)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(tfm);
+ struct skcipher_walk walk;
+ struct aegis_state state;
+
+ ops->skcipher_walk_init(&walk, req, true);
+
+ kernel_fpu_begin();
+
+ crypto_aegis128l_aesni_init(&state, ctx->key.bytes, req->iv);
+ crypto_aegis128l_aesni_process_ad(&state, req->src, req->assoclen);
+ crypto_aegis128l_aesni_process_crypt(&state, &walk, ops);
+ crypto_aegis128l_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
+
+ kernel_fpu_end();
+}
+
+static int crypto_aegis128l_aesni_encrypt(struct aead_request *req)
+{
+ static const struct aegis_crypt_ops OPS = {
+ .skcipher_walk_init = skcipher_walk_aead_encrypt,
+ .crypt_blocks = crypto_aegis128l_aesni_enc,
+ .crypt_tail = crypto_aegis128l_aesni_enc_tail,
+ };
+
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aegis_block tag = {};
+ unsigned int authsize = crypto_aead_authsize(tfm);
+ unsigned int cryptlen = req->cryptlen;
+
+ crypto_aegis128l_aesni_crypt(req, &tag, cryptlen, &OPS);
+
+ scatterwalk_map_and_copy(tag.bytes, req->dst,
+ req->assoclen + cryptlen, authsize, 1);
+ return 0;
+}
+
+static int crypto_aegis128l_aesni_decrypt(struct aead_request *req)
+{
+ static const struct aegis_block zeros = {};
+
+ static const struct aegis_crypt_ops OPS = {
+ .skcipher_walk_init = skcipher_walk_aead_decrypt,
+ .crypt_blocks = crypto_aegis128l_aesni_dec,
+ .crypt_tail = crypto_aegis128l_aesni_dec_tail,
+ };
+
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aegis_block tag;
+ unsigned int authsize = crypto_aead_authsize(tfm);
+ unsigned int cryptlen = req->cryptlen - authsize;
+
+ scatterwalk_map_and_copy(tag.bytes, req->src,
+ req->assoclen + cryptlen, authsize, 0);
+
+ crypto_aegis128l_aesni_crypt(req, &tag, cryptlen, &OPS);
+
+ return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0;
+}
+
+static int crypto_aegis128l_aesni_init_tfm(struct crypto_aead *aead)
+{
+ return 0;
+}
+
+static void crypto_aegis128l_aesni_exit_tfm(struct crypto_aead *aead)
+{
+}
+
+static int cryptd_aegis128l_aesni_setkey(struct crypto_aead *aead,
+ const u8 *key, unsigned int keylen)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ return crypto_aead_setkey(&cryptd_tfm->base, key, keylen);
+}
+
+static int cryptd_aegis128l_aesni_setauthsize(struct crypto_aead *aead,
+ unsigned int authsize)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ return crypto_aead_setauthsize(&cryptd_tfm->base, authsize);
+}
+
+static int cryptd_aegis128l_aesni_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ aead = &cryptd_tfm->base;
+ if (irq_fpu_usable() && (!in_atomic() ||
+ !cryptd_aead_queued(cryptd_tfm)))
+ aead = cryptd_aead_child(cryptd_tfm);
+
+ aead_request_set_tfm(req, aead);
+
+ return crypto_aead_encrypt(req);
+}
+
+static int cryptd_aegis128l_aesni_decrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ aead = &cryptd_tfm->base;
+ if (irq_fpu_usable() && (!in_atomic() ||
+ !cryptd_aead_queued(cryptd_tfm)))
+ aead = cryptd_aead_child(cryptd_tfm);
+
+ aead_request_set_tfm(req, aead);
+
+ return crypto_aead_decrypt(req);
+}
+
+static int cryptd_aegis128l_aesni_init_tfm(struct crypto_aead *aead)
+{
+ struct cryptd_aead *cryptd_tfm;
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+ cryptd_tfm = cryptd_alloc_aead("__aegis128l-aesni", CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+
+ *ctx = cryptd_tfm;
+ crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
+ return 0;
+}
+
+static void cryptd_aegis128l_aesni_exit_tfm(struct crypto_aead *aead)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+ cryptd_free_aead(*ctx);
+}
+
+static struct aead_alg crypto_aegis128l_aesni_alg[] = {
+ {
+ .setkey = crypto_aegis128l_aesni_setkey,
+ .setauthsize = crypto_aegis128l_aesni_setauthsize,
+ .encrypt = crypto_aegis128l_aesni_encrypt,
+ .decrypt = crypto_aegis128l_aesni_decrypt,
+ .init = crypto_aegis128l_aesni_init_tfm,
+ .exit = crypto_aegis128l_aesni_exit_tfm,
+
+ .ivsize = AEGIS128L_NONCE_SIZE,
+ .maxauthsize = AEGIS128L_MAX_AUTH_SIZE,
+ .chunksize = AEGIS128L_BLOCK_SIZE,
+
+ .base = {
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct aegis_ctx) +
+ __alignof__(struct aegis_ctx),
+ .cra_alignmask = 0,
+
+ .cra_name = "__aegis128l",
+ .cra_driver_name = "__aegis128l-aesni",
+
+ .cra_module = THIS_MODULE,
+ }
+ }, {
+ .setkey = cryptd_aegis128l_aesni_setkey,
+ .setauthsize = cryptd_aegis128l_aesni_setauthsize,
+ .encrypt = cryptd_aegis128l_aesni_encrypt,
+ .decrypt = cryptd_aegis128l_aesni_decrypt,
+ .init = cryptd_aegis128l_aesni_init_tfm,
+ .exit = cryptd_aegis128l_aesni_exit_tfm,
+
+ .ivsize = AEGIS128L_NONCE_SIZE,
+ .maxauthsize = AEGIS128L_MAX_AUTH_SIZE,
+ .chunksize = AEGIS128L_BLOCK_SIZE,
+
+ .base = {
+ .cra_flags = CRYPTO_ALG_ASYNC,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct cryptd_aead *),
+ .cra_alignmask = 0,
+
+ .cra_priority = 400,
+
+ .cra_name = "aegis128l",
+ .cra_driver_name = "aegis128l-aesni",
+
+ .cra_module = THIS_MODULE,
+ }
+ }
+};
+
+static int __init crypto_aegis128l_aesni_module_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XMM2) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
+ return -ENODEV;
+
+ return crypto_register_aeads(crypto_aegis128l_aesni_alg,
+ ARRAY_SIZE(crypto_aegis128l_aesni_alg));
+}
+
+static void __exit crypto_aegis128l_aesni_module_exit(void)
+{
+ crypto_unregister_aeads(crypto_aegis128l_aesni_alg,
+ ARRAY_SIZE(crypto_aegis128l_aesni_alg));
+}
+
+module_init(crypto_aegis128l_aesni_module_init);
+module_exit(crypto_aegis128l_aesni_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("AEGIS-128L AEAD algorithm -- AESNI+SSE2 implementation");
+MODULE_ALIAS_CRYPTO("aegis128l");
+MODULE_ALIAS_CRYPTO("aegis128l-aesni");
diff --git a/arch/x86/crypto/aegis256-aesni-asm.S b/arch/x86/crypto/aegis256-aesni-asm.S
new file mode 100644
index 000000000..8870c7c5d
--- /dev/null
+++ b/arch/x86/crypto/aegis256-aesni-asm.S
@@ -0,0 +1,703 @@
+/*
+ * AES-NI + SSE2 implementation of AEGIS-128L
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define STATE0 %xmm0
+#define STATE1 %xmm1
+#define STATE2 %xmm2
+#define STATE3 %xmm3
+#define STATE4 %xmm4
+#define STATE5 %xmm5
+#define MSG %xmm6
+#define T0 %xmm7
+#define T1 %xmm8
+#define T2 %xmm9
+#define T3 %xmm10
+
+#define STATEP %rdi
+#define LEN %rsi
+#define SRC %rdx
+#define DST %rcx
+
+.section .rodata.cst16.aegis256_const, "aM", @progbits, 32
+.align 16
+.Laegis256_const_0:
+ .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+ .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Laegis256_const_1:
+ .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+ .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+.section .rodata.cst16.aegis256_counter, "aM", @progbits, 16
+.align 16
+.Laegis256_counter:
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+
+.text
+
+/*
+ * __load_partial: internal ABI
+ * input:
+ * LEN - bytes
+ * SRC - src
+ * output:
+ * MSG - message block
+ * changed:
+ * T0
+ * %r8
+ * %r9
+ */
+__load_partial:
+ xor %r9d, %r9d
+ pxor MSG, MSG
+
+ mov LEN, %r8
+ and $0x1, %r8
+ jz .Lld_partial_1
+
+ mov LEN, %r8
+ and $0x1E, %r8
+ add SRC, %r8
+ mov (%r8), %r9b
+
+.Lld_partial_1:
+ mov LEN, %r8
+ and $0x2, %r8
+ jz .Lld_partial_2
+
+ mov LEN, %r8
+ and $0x1C, %r8
+ add SRC, %r8
+ shl $0x10, %r9
+ mov (%r8), %r9w
+
+.Lld_partial_2:
+ mov LEN, %r8
+ and $0x4, %r8
+ jz .Lld_partial_4
+
+ mov LEN, %r8
+ and $0x18, %r8
+ add SRC, %r8
+ shl $32, %r9
+ mov (%r8), %r8d
+ xor %r8, %r9
+
+.Lld_partial_4:
+ movq %r9, MSG
+
+ mov LEN, %r8
+ and $0x8, %r8
+ jz .Lld_partial_8
+
+ mov LEN, %r8
+ and $0x10, %r8
+ add SRC, %r8
+ pslldq $8, MSG
+ movq (%r8), T0
+ pxor T0, MSG
+
+.Lld_partial_8:
+ ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ * input:
+ * LEN - bytes
+ * DST - dst
+ * output:
+ * T0 - message block
+ * changed:
+ * %r8
+ * %r9
+ * %r10
+ */
+__store_partial:
+ mov LEN, %r8
+ mov DST, %r9
+
+ movq T0, %r10
+
+ cmp $8, %r8
+ jl .Lst_partial_8
+
+ mov %r10, (%r9)
+ psrldq $8, T0
+ movq T0, %r10
+
+ sub $8, %r8
+ add $8, %r9
+
+.Lst_partial_8:
+ cmp $4, %r8
+ jl .Lst_partial_4
+
+ mov %r10d, (%r9)
+ shr $32, %r10
+
+ sub $4, %r8
+ add $4, %r9
+
+.Lst_partial_4:
+ cmp $2, %r8
+ jl .Lst_partial_2
+
+ mov %r10w, (%r9)
+ shr $0x10, %r10
+
+ sub $2, %r8
+ add $2, %r9
+
+.Lst_partial_2:
+ cmp $1, %r8
+ jl .Lst_partial_1
+
+ mov %r10b, (%r9)
+
+.Lst_partial_1:
+ ret
+ENDPROC(__store_partial)
+
+.macro update
+ movdqa STATE5, T0
+ aesenc STATE0, STATE5
+ aesenc STATE1, STATE0
+ aesenc STATE2, STATE1
+ aesenc STATE3, STATE2
+ aesenc STATE4, STATE3
+ aesenc T0, STATE4
+.endm
+
+.macro update0 m
+ update
+ pxor \m, STATE5
+.endm
+
+.macro update1 m
+ update
+ pxor \m, STATE4
+.endm
+
+.macro update2 m
+ update
+ pxor \m, STATE3
+.endm
+
+.macro update3 m
+ update
+ pxor \m, STATE2
+.endm
+
+.macro update4 m
+ update
+ pxor \m, STATE1
+.endm
+
+.macro update5 m
+ update
+ pxor \m, STATE0
+.endm
+
+.macro state_load
+ movdqu 0x00(STATEP), STATE0
+ movdqu 0x10(STATEP), STATE1
+ movdqu 0x20(STATEP), STATE2
+ movdqu 0x30(STATEP), STATE3
+ movdqu 0x40(STATEP), STATE4
+ movdqu 0x50(STATEP), STATE5
+.endm
+
+.macro state_store s0 s1 s2 s3 s4 s5
+ movdqu \s5, 0x00(STATEP)
+ movdqu \s0, 0x10(STATEP)
+ movdqu \s1, 0x20(STATEP)
+ movdqu \s2, 0x30(STATEP)
+ movdqu \s3, 0x40(STATEP)
+ movdqu \s4, 0x50(STATEP)
+.endm
+
+.macro state_store0
+ state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
+.endm
+
+.macro state_store1
+ state_store STATE5 STATE0 STATE1 STATE2 STATE3 STATE4
+.endm
+
+.macro state_store2
+ state_store STATE4 STATE5 STATE0 STATE1 STATE2 STATE3
+.endm
+
+.macro state_store3
+ state_store STATE3 STATE4 STATE5 STATE0 STATE1 STATE2
+.endm
+
+.macro state_store4
+ state_store STATE2 STATE3 STATE4 STATE5 STATE0 STATE1
+.endm
+
+.macro state_store5
+ state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE0
+.endm
+
+/*
+ * void crypto_aegis256_aesni_init(void *state, const void *key, const void *iv);
+ */
+ENTRY(crypto_aegis256_aesni_init)
+ FRAME_BEGIN
+
+ /* load key: */
+ movdqa 0x00(%rsi), MSG
+ movdqa 0x10(%rsi), T1
+ movdqa MSG, STATE4
+ movdqa T1, STATE5
+
+ /* load IV: */
+ movdqu 0x00(%rdx), T2
+ movdqu 0x10(%rdx), T3
+ pxor MSG, T2
+ pxor T1, T3
+ movdqa T2, STATE0
+ movdqa T3, STATE1
+
+ /* load the constants: */
+ movdqa .Laegis256_const_0, STATE3
+ movdqa .Laegis256_const_1, STATE2
+ pxor STATE3, STATE4
+ pxor STATE2, STATE5
+
+ /* update 10 times with IV and KEY: */
+ update0 MSG
+ update1 T1
+ update2 T2
+ update3 T3
+ update4 MSG
+ update5 T1
+ update0 T2
+ update1 T3
+ update2 MSG
+ update3 T1
+ update4 T2
+ update5 T3
+ update0 MSG
+ update1 T1
+ update2 T2
+ update3 T3
+
+ state_store3
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis256_aesni_init)
+
+.macro ad_block a i
+ movdq\a (\i * 0x10)(SRC), MSG
+ update\i MSG
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lad_out_\i
+.endm
+
+/*
+ * void crypto_aegis256_aesni_ad(void *state, unsigned int length,
+ * const void *data);
+ */
+ENTRY(crypto_aegis256_aesni_ad)
+ FRAME_BEGIN
+
+ cmp $0x10, LEN
+ jb .Lad_out
+
+ state_load
+
+ mov SRC, %r8
+ and $0xf, %r8
+ jnz .Lad_u_loop
+
+.align 8
+.Lad_a_loop:
+ ad_block a 0
+ ad_block a 1
+ ad_block a 2
+ ad_block a 3
+ ad_block a 4
+ ad_block a 5
+
+ add $0x60, SRC
+ jmp .Lad_a_loop
+
+.align 8
+.Lad_u_loop:
+ ad_block u 0
+ ad_block u 1
+ ad_block u 2
+ ad_block u 3
+ ad_block u 4
+ ad_block u 5
+
+ add $0x60, SRC
+ jmp .Lad_u_loop
+
+.Lad_out_0:
+ state_store0
+ FRAME_END
+ ret
+
+.Lad_out_1:
+ state_store1
+ FRAME_END
+ ret
+
+.Lad_out_2:
+ state_store2
+ FRAME_END
+ ret
+
+.Lad_out_3:
+ state_store3
+ FRAME_END
+ ret
+
+.Lad_out_4:
+ state_store4
+ FRAME_END
+ ret
+
+.Lad_out_5:
+ state_store5
+ FRAME_END
+ ret
+
+.Lad_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis256_aesni_ad)
+
+.macro crypt m s0 s1 s2 s3 s4 s5
+ pxor \s1, \m
+ pxor \s4, \m
+ pxor \s5, \m
+ movdqa \s2, T3
+ pand \s3, T3
+ pxor T3, \m
+.endm
+
+.macro crypt0 m
+ crypt \m STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
+.endm
+
+.macro crypt1 m
+ crypt \m STATE5 STATE0 STATE1 STATE2 STATE3 STATE4
+.endm
+
+.macro crypt2 m
+ crypt \m STATE4 STATE5 STATE0 STATE1 STATE2 STATE3
+.endm
+
+.macro crypt3 m
+ crypt \m STATE3 STATE4 STATE5 STATE0 STATE1 STATE2
+.endm
+
+.macro crypt4 m
+ crypt \m STATE2 STATE3 STATE4 STATE5 STATE0 STATE1
+.endm
+
+.macro crypt5 m
+ crypt \m STATE1 STATE2 STATE3 STATE4 STATE5 STATE0
+.endm
+
+.macro encrypt_block a i
+ movdq\a (\i * 0x10)(SRC), MSG
+ movdqa MSG, T0
+ crypt\i T0
+ movdq\a T0, (\i * 0x10)(DST)
+
+ update\i MSG
+
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Lenc_out_\i
+.endm
+
+.macro decrypt_block a i
+ movdq\a (\i * 0x10)(SRC), MSG
+ crypt\i MSG
+ movdq\a MSG, (\i * 0x10)(DST)
+
+ update\i MSG
+
+ sub $0x10, LEN
+ cmp $0x10, LEN
+ jl .Ldec_out_\i
+.endm
+
+/*
+ * void crypto_aegis256_aesni_enc(void *state, unsigned int length,
+ * const void *src, void *dst);
+ */
+ENTRY(crypto_aegis256_aesni_enc)
+ FRAME_BEGIN
+
+ cmp $0x10, LEN
+ jb .Lenc_out
+
+ state_load
+
+ mov SRC, %r8
+ or DST, %r8
+ and $0xf, %r8
+ jnz .Lenc_u_loop
+
+.align 8
+.Lenc_a_loop:
+ encrypt_block a 0
+ encrypt_block a 1
+ encrypt_block a 2
+ encrypt_block a 3
+ encrypt_block a 4
+ encrypt_block a 5
+
+ add $0x60, SRC
+ add $0x60, DST
+ jmp .Lenc_a_loop
+
+.align 8
+.Lenc_u_loop:
+ encrypt_block u 0
+ encrypt_block u 1
+ encrypt_block u 2
+ encrypt_block u 3
+ encrypt_block u 4
+ encrypt_block u 5
+
+ add $0x60, SRC
+ add $0x60, DST
+ jmp .Lenc_u_loop
+
+.Lenc_out_0:
+ state_store0
+ FRAME_END
+ ret
+
+.Lenc_out_1:
+ state_store1
+ FRAME_END
+ ret
+
+.Lenc_out_2:
+ state_store2
+ FRAME_END
+ ret
+
+.Lenc_out_3:
+ state_store3
+ FRAME_END
+ ret
+
+.Lenc_out_4:
+ state_store4
+ FRAME_END
+ ret
+
+.Lenc_out_5:
+ state_store5
+ FRAME_END
+ ret
+
+.Lenc_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis256_aesni_enc)
+
+/*
+ * void crypto_aegis256_aesni_enc_tail(void *state, unsigned int length,
+ * const void *src, void *dst);
+ */
+ENTRY(crypto_aegis256_aesni_enc_tail)
+ FRAME_BEGIN
+
+ state_load
+
+ /* encrypt message: */
+ call __load_partial
+
+ movdqa MSG, T0
+ crypt0 T0
+
+ call __store_partial
+
+ update0 MSG
+
+ state_store0
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis256_aesni_enc_tail)
+
+/*
+ * void crypto_aegis256_aesni_dec(void *state, unsigned int length,
+ * const void *src, void *dst);
+ */
+ENTRY(crypto_aegis256_aesni_dec)
+ FRAME_BEGIN
+
+ cmp $0x10, LEN
+ jb .Ldec_out
+
+ state_load
+
+ mov SRC, %r8
+ or DST, %r8
+ and $0xF, %r8
+ jnz .Ldec_u_loop
+
+.align 8
+.Ldec_a_loop:
+ decrypt_block a 0
+ decrypt_block a 1
+ decrypt_block a 2
+ decrypt_block a 3
+ decrypt_block a 4
+ decrypt_block a 5
+
+ add $0x60, SRC
+ add $0x60, DST
+ jmp .Ldec_a_loop
+
+.align 8
+.Ldec_u_loop:
+ decrypt_block u 0
+ decrypt_block u 1
+ decrypt_block u 2
+ decrypt_block u 3
+ decrypt_block u 4
+ decrypt_block u 5
+
+ add $0x60, SRC
+ add $0x60, DST
+ jmp .Ldec_u_loop
+
+.Ldec_out_0:
+ state_store0
+ FRAME_END
+ ret
+
+.Ldec_out_1:
+ state_store1
+ FRAME_END
+ ret
+
+.Ldec_out_2:
+ state_store2
+ FRAME_END
+ ret
+
+.Ldec_out_3:
+ state_store3
+ FRAME_END
+ ret
+
+.Ldec_out_4:
+ state_store4
+ FRAME_END
+ ret
+
+.Ldec_out_5:
+ state_store5
+ FRAME_END
+ ret
+
+.Ldec_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis256_aesni_dec)
+
+/*
+ * void crypto_aegis256_aesni_dec_tail(void *state, unsigned int length,
+ * const void *src, void *dst);
+ */
+ENTRY(crypto_aegis256_aesni_dec_tail)
+ FRAME_BEGIN
+
+ state_load
+
+ /* decrypt message: */
+ call __load_partial
+
+ crypt0 MSG
+
+ movdqa MSG, T0
+ call __store_partial
+
+ /* mask with byte count: */
+ movq LEN, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ movdqa .Laegis256_counter, T1
+ pcmpgtb T1, T0
+ pand T0, MSG
+
+ update0 MSG
+
+ state_store0
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis256_aesni_dec_tail)
+
+/*
+ * void crypto_aegis256_aesni_final(void *state, void *tag_xor,
+ * u64 assoclen, u64 cryptlen);
+ */
+ENTRY(crypto_aegis256_aesni_final)
+ FRAME_BEGIN
+
+ state_load
+
+ /* prepare length block: */
+ movq %rdx, MSG
+ movq %rcx, T0
+ pslldq $8, T0
+ pxor T0, MSG
+ psllq $3, MSG /* multiply by 8 (to get bit count) */
+
+ pxor STATE3, MSG
+
+ /* update state: */
+ update0 MSG
+ update1 MSG
+ update2 MSG
+ update3 MSG
+ update4 MSG
+ update5 MSG
+ update0 MSG
+
+ /* xor tag: */
+ movdqu (%rsi), MSG
+
+ pxor STATE0, MSG
+ pxor STATE1, MSG
+ pxor STATE2, MSG
+ pxor STATE3, MSG
+ pxor STATE4, MSG
+ pxor STATE5, MSG
+
+ movdqu MSG, (%rsi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_aegis256_aesni_final)
diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c
new file mode 100644
index 000000000..6227ca322
--- /dev/null
+++ b/arch/x86/crypto/aegis256-aesni-glue.c
@@ -0,0 +1,394 @@
+/*
+ * The AEGIS-256 Authenticated-Encryption Algorithm
+ * Glue for AES-NI + SSE2 implementation
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/cryptd.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+#define AEGIS256_BLOCK_ALIGN 16
+#define AEGIS256_BLOCK_SIZE 16
+#define AEGIS256_NONCE_SIZE 32
+#define AEGIS256_STATE_BLOCKS 6
+#define AEGIS256_KEY_SIZE 32
+#define AEGIS256_MIN_AUTH_SIZE 8
+#define AEGIS256_MAX_AUTH_SIZE 16
+
+asmlinkage void crypto_aegis256_aesni_init(void *state, void *key, void *iv);
+
+asmlinkage void crypto_aegis256_aesni_ad(
+ void *state, unsigned int length, const void *data);
+
+asmlinkage void crypto_aegis256_aesni_enc(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis256_aesni_dec(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis256_aesni_enc_tail(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis256_aesni_dec_tail(
+ void *state, unsigned int length, const void *src, void *dst);
+
+asmlinkage void crypto_aegis256_aesni_final(
+ void *state, void *tag_xor, unsigned int cryptlen,
+ unsigned int assoclen);
+
+struct aegis_block {
+ u8 bytes[AEGIS256_BLOCK_SIZE] __aligned(AEGIS256_BLOCK_ALIGN);
+};
+
+struct aegis_state {
+ struct aegis_block blocks[AEGIS256_STATE_BLOCKS];
+};
+
+struct aegis_ctx {
+ struct aegis_block key[AEGIS256_KEY_SIZE / AEGIS256_BLOCK_SIZE];
+};
+
+struct aegis_crypt_ops {
+ int (*skcipher_walk_init)(struct skcipher_walk *walk,
+ struct aead_request *req, bool atomic);
+
+ void (*crypt_blocks)(void *state, unsigned int length, const void *src,
+ void *dst);
+ void (*crypt_tail)(void *state, unsigned int length, const void *src,
+ void *dst);
+};
+
+static void crypto_aegis256_aesni_process_ad(
+ struct aegis_state *state, struct scatterlist *sg_src,
+ unsigned int assoclen)
+{
+ struct scatter_walk walk;
+ struct aegis_block buf;
+ unsigned int pos = 0;
+
+ scatterwalk_start(&walk, sg_src);
+ while (assoclen != 0) {
+ unsigned int size = scatterwalk_clamp(&walk, assoclen);
+ unsigned int left = size;
+ void *mapped = scatterwalk_map(&walk);
+ const u8 *src = (const u8 *)mapped;
+
+ if (pos + size >= AEGIS256_BLOCK_SIZE) {
+ if (pos > 0) {
+ unsigned int fill = AEGIS256_BLOCK_SIZE - pos;
+ memcpy(buf.bytes + pos, src, fill);
+ crypto_aegis256_aesni_ad(state,
+ AEGIS256_BLOCK_SIZE,
+ buf.bytes);
+ pos = 0;
+ left -= fill;
+ src += fill;
+ }
+
+ crypto_aegis256_aesni_ad(state, left, src);
+
+ src += left & ~(AEGIS256_BLOCK_SIZE - 1);
+ left &= AEGIS256_BLOCK_SIZE - 1;
+ }
+
+ memcpy(buf.bytes + pos, src, left);
+ pos += left;
+ assoclen -= size;
+
+ scatterwalk_unmap(mapped);
+ scatterwalk_advance(&walk, size);
+ scatterwalk_done(&walk, 0, assoclen);
+ }
+
+ if (pos > 0) {
+ memset(buf.bytes + pos, 0, AEGIS256_BLOCK_SIZE - pos);
+ crypto_aegis256_aesni_ad(state, AEGIS256_BLOCK_SIZE, buf.bytes);
+ }
+}
+
+static void crypto_aegis256_aesni_process_crypt(
+ struct aegis_state *state, struct skcipher_walk *walk,
+ const struct aegis_crypt_ops *ops)
+{
+ while (walk->nbytes >= AEGIS256_BLOCK_SIZE) {
+ ops->crypt_blocks(state,
+ round_down(walk->nbytes, AEGIS256_BLOCK_SIZE),
+ walk->src.virt.addr, walk->dst.virt.addr);
+ skcipher_walk_done(walk, walk->nbytes % AEGIS256_BLOCK_SIZE);
+ }
+
+ if (walk->nbytes) {
+ ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr,
+ walk->dst.virt.addr);
+ skcipher_walk_done(walk, 0);
+ }
+}
+
+static struct aegis_ctx *crypto_aegis256_aesni_ctx(struct crypto_aead *aead)
+{
+ u8 *ctx = crypto_aead_ctx(aead);
+ ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx));
+ return (void *)ctx;
+}
+
+static int crypto_aegis256_aesni_setkey(struct crypto_aead *aead, const u8 *key,
+ unsigned int keylen)
+{
+ struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(aead);
+
+ if (keylen != AEGIS256_KEY_SIZE) {
+ crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+
+ memcpy(ctx->key, key, AEGIS256_KEY_SIZE);
+
+ return 0;
+}
+
+static int crypto_aegis256_aesni_setauthsize(struct crypto_aead *tfm,
+ unsigned int authsize)
+{
+ if (authsize > AEGIS256_MAX_AUTH_SIZE)
+ return -EINVAL;
+ if (authsize < AEGIS256_MIN_AUTH_SIZE)
+ return -EINVAL;
+ return 0;
+}
+
+static void crypto_aegis256_aesni_crypt(struct aead_request *req,
+ struct aegis_block *tag_xor,
+ unsigned int cryptlen,
+ const struct aegis_crypt_ops *ops)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(tfm);
+ struct skcipher_walk walk;
+ struct aegis_state state;
+
+ ops->skcipher_walk_init(&walk, req, true);
+
+ kernel_fpu_begin();
+
+ crypto_aegis256_aesni_init(&state, ctx->key, req->iv);
+ crypto_aegis256_aesni_process_ad(&state, req->src, req->assoclen);
+ crypto_aegis256_aesni_process_crypt(&state, &walk, ops);
+ crypto_aegis256_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
+
+ kernel_fpu_end();
+}
+
+static int crypto_aegis256_aesni_encrypt(struct aead_request *req)
+{
+ static const struct aegis_crypt_ops OPS = {
+ .skcipher_walk_init = skcipher_walk_aead_encrypt,
+ .crypt_blocks = crypto_aegis256_aesni_enc,
+ .crypt_tail = crypto_aegis256_aesni_enc_tail,
+ };
+
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aegis_block tag = {};
+ unsigned int authsize = crypto_aead_authsize(tfm);
+ unsigned int cryptlen = req->cryptlen;
+
+ crypto_aegis256_aesni_crypt(req, &tag, cryptlen, &OPS);
+
+ scatterwalk_map_and_copy(tag.bytes, req->dst,
+ req->assoclen + cryptlen, authsize, 1);
+ return 0;
+}
+
+static int crypto_aegis256_aesni_decrypt(struct aead_request *req)
+{
+ static const struct aegis_block zeros = {};
+
+ static const struct aegis_crypt_ops OPS = {
+ .skcipher_walk_init = skcipher_walk_aead_decrypt,
+ .crypt_blocks = crypto_aegis256_aesni_dec,
+ .crypt_tail = crypto_aegis256_aesni_dec_tail,
+ };
+
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aegis_block tag;
+ unsigned int authsize = crypto_aead_authsize(tfm);
+ unsigned int cryptlen = req->cryptlen - authsize;
+
+ scatterwalk_map_and_copy(tag.bytes, req->src,
+ req->assoclen + cryptlen, authsize, 0);
+
+ crypto_aegis256_aesni_crypt(req, &tag, cryptlen, &OPS);
+
+ return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0;
+}
+
+static int crypto_aegis256_aesni_init_tfm(struct crypto_aead *aead)
+{
+ return 0;
+}
+
+static void crypto_aegis256_aesni_exit_tfm(struct crypto_aead *aead)
+{
+}
+
+static int cryptd_aegis256_aesni_setkey(struct crypto_aead *aead,
+ const u8 *key, unsigned int keylen)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ return crypto_aead_setkey(&cryptd_tfm->base, key, keylen);
+}
+
+static int cryptd_aegis256_aesni_setauthsize(struct crypto_aead *aead,
+ unsigned int authsize)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ return crypto_aead_setauthsize(&cryptd_tfm->base, authsize);
+}
+
+static int cryptd_aegis256_aesni_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ aead = &cryptd_tfm->base;
+ if (irq_fpu_usable() && (!in_atomic() ||
+ !cryptd_aead_queued(cryptd_tfm)))
+ aead = cryptd_aead_child(cryptd_tfm);
+
+ aead_request_set_tfm(req, aead);
+
+ return crypto_aead_encrypt(req);
+}
+
+static int cryptd_aegis256_aesni_decrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ aead = &cryptd_tfm->base;
+ if (irq_fpu_usable() && (!in_atomic() ||
+ !cryptd_aead_queued(cryptd_tfm)))
+ aead = cryptd_aead_child(cryptd_tfm);
+
+ aead_request_set_tfm(req, aead);
+
+ return crypto_aead_decrypt(req);
+}
+
+static int cryptd_aegis256_aesni_init_tfm(struct crypto_aead *aead)
+{
+ struct cryptd_aead *cryptd_tfm;
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+ cryptd_tfm = cryptd_alloc_aead("__aegis256-aesni", CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+
+ *ctx = cryptd_tfm;
+ crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
+ return 0;
+}
+
+static void cryptd_aegis256_aesni_exit_tfm(struct crypto_aead *aead)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+ cryptd_free_aead(*ctx);
+}
+
+static struct aead_alg crypto_aegis256_aesni_alg[] = {
+ {
+ .setkey = crypto_aegis256_aesni_setkey,
+ .setauthsize = crypto_aegis256_aesni_setauthsize,
+ .encrypt = crypto_aegis256_aesni_encrypt,
+ .decrypt = crypto_aegis256_aesni_decrypt,
+ .init = crypto_aegis256_aesni_init_tfm,
+ .exit = crypto_aegis256_aesni_exit_tfm,
+
+ .ivsize = AEGIS256_NONCE_SIZE,
+ .maxauthsize = AEGIS256_MAX_AUTH_SIZE,
+ .chunksize = AEGIS256_BLOCK_SIZE,
+
+ .base = {
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct aegis_ctx) +
+ __alignof__(struct aegis_ctx),
+ .cra_alignmask = 0,
+
+ .cra_name = "__aegis256",
+ .cra_driver_name = "__aegis256-aesni",
+
+ .cra_module = THIS_MODULE,
+ }
+ }, {
+ .setkey = cryptd_aegis256_aesni_setkey,
+ .setauthsize = cryptd_aegis256_aesni_setauthsize,
+ .encrypt = cryptd_aegis256_aesni_encrypt,
+ .decrypt = cryptd_aegis256_aesni_decrypt,
+ .init = cryptd_aegis256_aesni_init_tfm,
+ .exit = cryptd_aegis256_aesni_exit_tfm,
+
+ .ivsize = AEGIS256_NONCE_SIZE,
+ .maxauthsize = AEGIS256_MAX_AUTH_SIZE,
+ .chunksize = AEGIS256_BLOCK_SIZE,
+
+ .base = {
+ .cra_flags = CRYPTO_ALG_ASYNC,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct cryptd_aead *),
+ .cra_alignmask = 0,
+
+ .cra_priority = 400,
+
+ .cra_name = "aegis256",
+ .cra_driver_name = "aegis256-aesni",
+
+ .cra_module = THIS_MODULE,
+ }
+ }
+};
+
+static int __init crypto_aegis256_aesni_module_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XMM2) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
+ return -ENODEV;
+
+ return crypto_register_aeads(crypto_aegis256_aesni_alg,
+ ARRAY_SIZE(crypto_aegis256_aesni_alg));
+}
+
+static void __exit crypto_aegis256_aesni_module_exit(void)
+{
+ crypto_unregister_aeads(crypto_aegis256_aesni_alg,
+ ARRAY_SIZE(crypto_aegis256_aesni_alg));
+}
+
+module_init(crypto_aegis256_aesni_module_init);
+module_exit(crypto_aegis256_aesni_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("AEGIS-256 AEAD algorithm -- AESNI+SSE2 implementation");
+MODULE_ALIAS_CRYPTO("aegis256");
+MODULE_ALIAS_CRYPTO("aegis256-aesni");
diff --git a/arch/x86/crypto/aes-i586-asm_32.S b/arch/x86/crypto/aes-i586-asm_32.S
new file mode 100644
index 000000000..2849dbc59
--- /dev/null
+++ b/arch/x86/crypto/aes-i586-asm_32.S
@@ -0,0 +1,362 @@
+// -------------------------------------------------------------------------
+// Copyright (c) 2001, Dr Brian Gladman < >, Worcester, UK.
+// All rights reserved.
+//
+// LICENSE TERMS
+//
+// The free distribution and use of this software in both source and binary
+// form is allowed (with or without changes) provided that:
+//
+// 1. distributions of this source code include the above copyright
+// notice, this list of conditions and the following disclaimer//
+//
+// 2. distributions in binary form include the above copyright
+// notice, this list of conditions and the following disclaimer
+// in the documentation and/or other associated materials//
+//
+// 3. the copyright holder's name is not used to endorse products
+// built using this software without specific written permission.
+//
+//
+// ALTERNATIVELY, provided that this notice is retained in full, this product
+// may be distributed under the terms of the GNU General Public License (GPL),
+// in which case the provisions of the GPL apply INSTEAD OF those given above.
+//
+// Copyright (c) 2004 Linus Torvalds <torvalds@osdl.org>
+// Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
+
+// DISCLAIMER
+//
+// This software is provided 'as is' with no explicit or implied warranties
+// in respect of its properties including, but not limited to, correctness
+// and fitness for purpose.
+// -------------------------------------------------------------------------
+// Issue Date: 29/07/2002
+
+.file "aes-i586-asm.S"
+.text
+
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+
+#define tlen 1024 // length of each of 4 'xor' arrays (256 32-bit words)
+
+/* offsets to parameters with one register pushed onto stack */
+#define ctx 8
+#define out_blk 12
+#define in_blk 16
+
+/* offsets in crypto_aes_ctx structure */
+#define klen (480)
+#define ekey (0)
+#define dkey (240)
+
+// register mapping for encrypt and decrypt subroutines
+
+#define r0 eax
+#define r1 ebx
+#define r2 ecx
+#define r3 edx
+#define r4 esi
+#define r5 edi
+
+#define eaxl al
+#define eaxh ah
+#define ebxl bl
+#define ebxh bh
+#define ecxl cl
+#define ecxh ch
+#define edxl dl
+#define edxh dh
+
+#define _h(reg) reg##h
+#define h(reg) _h(reg)
+
+#define _l(reg) reg##l
+#define l(reg) _l(reg)
+
+// This macro takes a 32-bit word representing a column and uses
+// each of its four bytes to index into four tables of 256 32-bit
+// words to obtain values that are then xored into the appropriate
+// output registers r0, r1, r4 or r5.
+
+// Parameters:
+// table table base address
+// %1 out_state[0]
+// %2 out_state[1]
+// %3 out_state[2]
+// %4 out_state[3]
+// idx input register for the round (destroyed)
+// tmp scratch register for the round
+// sched key schedule
+
+#define do_col(table, a1,a2,a3,a4, idx, tmp) \
+ movzx %l(idx),%tmp; \
+ xor table(,%tmp,4),%a1; \
+ movzx %h(idx),%tmp; \
+ shr $16,%idx; \
+ xor table+tlen(,%tmp,4),%a2; \
+ movzx %l(idx),%tmp; \
+ movzx %h(idx),%idx; \
+ xor table+2*tlen(,%tmp,4),%a3; \
+ xor table+3*tlen(,%idx,4),%a4;
+
+// initialise output registers from the key schedule
+// NB1: original value of a3 is in idx on exit
+// NB2: original values of a1,a2,a4 aren't used
+#define do_fcol(table, a1,a2,a3,a4, idx, tmp, sched) \
+ mov 0 sched,%a1; \
+ movzx %l(idx),%tmp; \
+ mov 12 sched,%a2; \
+ xor table(,%tmp,4),%a1; \
+ mov 4 sched,%a4; \
+ movzx %h(idx),%tmp; \
+ shr $16,%idx; \
+ xor table+tlen(,%tmp,4),%a2; \
+ movzx %l(idx),%tmp; \
+ movzx %h(idx),%idx; \
+ xor table+3*tlen(,%idx,4),%a4; \
+ mov %a3,%idx; \
+ mov 8 sched,%a3; \
+ xor table+2*tlen(,%tmp,4),%a3;
+
+// initialise output registers from the key schedule
+// NB1: original value of a3 is in idx on exit
+// NB2: original values of a1,a2,a4 aren't used
+#define do_icol(table, a1,a2,a3,a4, idx, tmp, sched) \
+ mov 0 sched,%a1; \
+ movzx %l(idx),%tmp; \
+ mov 4 sched,%a2; \
+ xor table(,%tmp,4),%a1; \
+ mov 12 sched,%a4; \
+ movzx %h(idx),%tmp; \
+ shr $16,%idx; \
+ xor table+tlen(,%tmp,4),%a2; \
+ movzx %l(idx),%tmp; \
+ movzx %h(idx),%idx; \
+ xor table+3*tlen(,%idx,4),%a4; \
+ mov %a3,%idx; \
+ mov 8 sched,%a3; \
+ xor table+2*tlen(,%tmp,4),%a3;
+
+
+// original Gladman had conditional saves to MMX regs.
+#define save(a1, a2) \
+ mov %a2,4*a1(%esp)
+
+#define restore(a1, a2) \
+ mov 4*a2(%esp),%a1
+
+// These macros perform a forward encryption cycle. They are entered with
+// the first previous round column values in r0,r1,r4,r5 and
+// exit with the final values in the same registers, using stack
+// for temporary storage.
+
+// round column values
+// on entry: r0,r1,r4,r5
+// on exit: r2,r1,r4,r5
+#define fwd_rnd1(arg, table) \
+ save (0,r1); \
+ save (1,r5); \
+ \
+ /* compute new column values */ \
+ do_fcol(table, r2,r5,r4,r1, r0,r3, arg); /* idx=r0 */ \
+ do_col (table, r4,r1,r2,r5, r0,r3); /* idx=r4 */ \
+ restore(r0,0); \
+ do_col (table, r1,r2,r5,r4, r0,r3); /* idx=r1 */ \
+ restore(r0,1); \
+ do_col (table, r5,r4,r1,r2, r0,r3); /* idx=r5 */
+
+// round column values
+// on entry: r2,r1,r4,r5
+// on exit: r0,r1,r4,r5
+#define fwd_rnd2(arg, table) \
+ save (0,r1); \
+ save (1,r5); \
+ \
+ /* compute new column values */ \
+ do_fcol(table, r0,r5,r4,r1, r2,r3, arg); /* idx=r2 */ \
+ do_col (table, r4,r1,r0,r5, r2,r3); /* idx=r4 */ \
+ restore(r2,0); \
+ do_col (table, r1,r0,r5,r4, r2,r3); /* idx=r1 */ \
+ restore(r2,1); \
+ do_col (table, r5,r4,r1,r0, r2,r3); /* idx=r5 */
+
+// These macros performs an inverse encryption cycle. They are entered with
+// the first previous round column values in r0,r1,r4,r5 and
+// exit with the final values in the same registers, using stack
+// for temporary storage
+
+// round column values
+// on entry: r0,r1,r4,r5
+// on exit: r2,r1,r4,r5
+#define inv_rnd1(arg, table) \
+ save (0,r1); \
+ save (1,r5); \
+ \
+ /* compute new column values */ \
+ do_icol(table, r2,r1,r4,r5, r0,r3, arg); /* idx=r0 */ \
+ do_col (table, r4,r5,r2,r1, r0,r3); /* idx=r4 */ \
+ restore(r0,0); \
+ do_col (table, r1,r4,r5,r2, r0,r3); /* idx=r1 */ \
+ restore(r0,1); \
+ do_col (table, r5,r2,r1,r4, r0,r3); /* idx=r5 */
+
+// round column values
+// on entry: r2,r1,r4,r5
+// on exit: r0,r1,r4,r5
+#define inv_rnd2(arg, table) \
+ save (0,r1); \
+ save (1,r5); \
+ \
+ /* compute new column values */ \
+ do_icol(table, r0,r1,r4,r5, r2,r3, arg); /* idx=r2 */ \
+ do_col (table, r4,r5,r0,r1, r2,r3); /* idx=r4 */ \
+ restore(r2,0); \
+ do_col (table, r1,r4,r5,r0, r2,r3); /* idx=r1 */ \
+ restore(r2,1); \
+ do_col (table, r5,r0,r1,r4, r2,r3); /* idx=r5 */
+
+// AES (Rijndael) Encryption Subroutine
+/* void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */
+
+.extern crypto_ft_tab
+.extern crypto_fl_tab
+
+ENTRY(aes_enc_blk)
+ push %ebp
+ mov ctx(%esp),%ebp
+
+// CAUTION: the order and the values used in these assigns
+// rely on the register mappings
+
+1: push %ebx
+ mov in_blk+4(%esp),%r2
+ push %esi
+ mov klen(%ebp),%r3 // key size
+ push %edi
+#if ekey != 0
+ lea ekey(%ebp),%ebp // key pointer
+#endif
+
+// input four columns and xor in first round key
+
+ mov (%r2),%r0
+ mov 4(%r2),%r1
+ mov 8(%r2),%r4
+ mov 12(%r2),%r5
+ xor (%ebp),%r0
+ xor 4(%ebp),%r1
+ xor 8(%ebp),%r4
+ xor 12(%ebp),%r5
+
+ sub $8,%esp // space for register saves on stack
+ add $16,%ebp // increment to next round key
+ cmp $24,%r3
+ jb 4f // 10 rounds for 128-bit key
+ lea 32(%ebp),%ebp
+ je 3f // 12 rounds for 192-bit key
+ lea 32(%ebp),%ebp
+
+2: fwd_rnd1( -64(%ebp), crypto_ft_tab) // 14 rounds for 256-bit key
+ fwd_rnd2( -48(%ebp), crypto_ft_tab)
+3: fwd_rnd1( -32(%ebp), crypto_ft_tab) // 12 rounds for 192-bit key
+ fwd_rnd2( -16(%ebp), crypto_ft_tab)
+4: fwd_rnd1( (%ebp), crypto_ft_tab) // 10 rounds for 128-bit key
+ fwd_rnd2( +16(%ebp), crypto_ft_tab)
+ fwd_rnd1( +32(%ebp), crypto_ft_tab)
+ fwd_rnd2( +48(%ebp), crypto_ft_tab)
+ fwd_rnd1( +64(%ebp), crypto_ft_tab)
+ fwd_rnd2( +80(%ebp), crypto_ft_tab)
+ fwd_rnd1( +96(%ebp), crypto_ft_tab)
+ fwd_rnd2(+112(%ebp), crypto_ft_tab)
+ fwd_rnd1(+128(%ebp), crypto_ft_tab)
+ fwd_rnd2(+144(%ebp), crypto_fl_tab) // last round uses a different table
+
+// move final values to the output array. CAUTION: the
+// order of these assigns rely on the register mappings
+
+ add $8,%esp
+ mov out_blk+12(%esp),%ebp
+ mov %r5,12(%ebp)
+ pop %edi
+ mov %r4,8(%ebp)
+ pop %esi
+ mov %r1,4(%ebp)
+ pop %ebx
+ mov %r0,(%ebp)
+ pop %ebp
+ ret
+ENDPROC(aes_enc_blk)
+
+// AES (Rijndael) Decryption Subroutine
+/* void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */
+
+.extern crypto_it_tab
+.extern crypto_il_tab
+
+ENTRY(aes_dec_blk)
+ push %ebp
+ mov ctx(%esp),%ebp
+
+// CAUTION: the order and the values used in these assigns
+// rely on the register mappings
+
+1: push %ebx
+ mov in_blk+4(%esp),%r2
+ push %esi
+ mov klen(%ebp),%r3 // key size
+ push %edi
+#if dkey != 0
+ lea dkey(%ebp),%ebp // key pointer
+#endif
+
+// input four columns and xor in first round key
+
+ mov (%r2),%r0
+ mov 4(%r2),%r1
+ mov 8(%r2),%r4
+ mov 12(%r2),%r5
+ xor (%ebp),%r0
+ xor 4(%ebp),%r1
+ xor 8(%ebp),%r4
+ xor 12(%ebp),%r5
+
+ sub $8,%esp // space for register saves on stack
+ add $16,%ebp // increment to next round key
+ cmp $24,%r3
+ jb 4f // 10 rounds for 128-bit key
+ lea 32(%ebp),%ebp
+ je 3f // 12 rounds for 192-bit key
+ lea 32(%ebp),%ebp
+
+2: inv_rnd1( -64(%ebp), crypto_it_tab) // 14 rounds for 256-bit key
+ inv_rnd2( -48(%ebp), crypto_it_tab)
+3: inv_rnd1( -32(%ebp), crypto_it_tab) // 12 rounds for 192-bit key
+ inv_rnd2( -16(%ebp), crypto_it_tab)
+4: inv_rnd1( (%ebp), crypto_it_tab) // 10 rounds for 128-bit key
+ inv_rnd2( +16(%ebp), crypto_it_tab)
+ inv_rnd1( +32(%ebp), crypto_it_tab)
+ inv_rnd2( +48(%ebp), crypto_it_tab)
+ inv_rnd1( +64(%ebp), crypto_it_tab)
+ inv_rnd2( +80(%ebp), crypto_it_tab)
+ inv_rnd1( +96(%ebp), crypto_it_tab)
+ inv_rnd2(+112(%ebp), crypto_it_tab)
+ inv_rnd1(+128(%ebp), crypto_it_tab)
+ inv_rnd2(+144(%ebp), crypto_il_tab) // last round uses a different table
+
+// move final values to the output array. CAUTION: the
+// order of these assigns rely on the register mappings
+
+ add $8,%esp
+ mov out_blk+12(%esp),%ebp
+ mov %r5,12(%ebp)
+ pop %edi
+ mov %r4,8(%ebp)
+ pop %esi
+ mov %r1,4(%ebp)
+ pop %ebx
+ mov %r0,(%ebp)
+ pop %ebp
+ ret
+ENDPROC(aes_dec_blk)
diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S
new file mode 100644
index 000000000..8739cf779
--- /dev/null
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -0,0 +1,185 @@
+/* AES (Rijndael) implementation (FIPS PUB 197) for x86_64
+ *
+ * Copyright (C) 2005 Andreas Steinmetz, <ast@domdv.de>
+ *
+ * License:
+ * This code can be distributed under the terms of the GNU General Public
+ * License (GPL) Version 2 provided that the above header down to and
+ * including this sentence is retained in full.
+ */
+
+.extern crypto_ft_tab
+.extern crypto_it_tab
+.extern crypto_fl_tab
+.extern crypto_il_tab
+
+.text
+
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+
+#define R1 %rax
+#define R1E %eax
+#define R1X %ax
+#define R1H %ah
+#define R1L %al
+#define R2 %rbx
+#define R2E %ebx
+#define R2X %bx
+#define R2H %bh
+#define R2L %bl
+#define R3 %rcx
+#define R3E %ecx
+#define R3X %cx
+#define R3H %ch
+#define R3L %cl
+#define R4 %rdx
+#define R4E %edx
+#define R4X %dx
+#define R4H %dh
+#define R4L %dl
+#define R5 %rsi
+#define R5E %esi
+#define R6 %rdi
+#define R6E %edi
+#define R7 %r9 /* don't use %rbp; it breaks stack traces */
+#define R7E %r9d
+#define R8 %r8
+#define R10 %r10
+#define R11 %r11
+
+#define prologue(FUNC,KEY,B128,B192,r1,r2,r5,r6,r7,r8,r9,r10,r11) \
+ ENTRY(FUNC); \
+ movq r1,r2; \
+ leaq KEY+48(r8),r9; \
+ movq r10,r11; \
+ movl (r7),r5 ## E; \
+ movl 4(r7),r1 ## E; \
+ movl 8(r7),r6 ## E; \
+ movl 12(r7),r7 ## E; \
+ movl 480(r8),r10 ## E; \
+ xorl -48(r9),r5 ## E; \
+ xorl -44(r9),r1 ## E; \
+ xorl -40(r9),r6 ## E; \
+ xorl -36(r9),r7 ## E; \
+ cmpl $24,r10 ## E; \
+ jb B128; \
+ leaq 32(r9),r9; \
+ je B192; \
+ leaq 32(r9),r9;
+
+#define epilogue(FUNC,r1,r2,r5,r6,r7,r8,r9) \
+ movq r1,r2; \
+ movl r5 ## E,(r9); \
+ movl r6 ## E,4(r9); \
+ movl r7 ## E,8(r9); \
+ movl r8 ## E,12(r9); \
+ ret; \
+ ENDPROC(FUNC);
+
+#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \
+ movzbl r2 ## H,r5 ## E; \
+ movzbl r2 ## L,r6 ## E; \
+ movl TAB+1024(,r5,4),r5 ## E;\
+ movw r4 ## X,r2 ## X; \
+ movl TAB(,r6,4),r6 ## E; \
+ roll $16,r2 ## E; \
+ shrl $16,r4 ## E; \
+ movzbl r4 ## L,r7 ## E; \
+ movzbl r4 ## H,r4 ## E; \
+ xorl OFFSET(r8),ra ## E; \
+ xorl OFFSET+4(r8),rb ## E; \
+ xorl TAB+3072(,r4,4),r5 ## E;\
+ xorl TAB+2048(,r7,4),r6 ## E;\
+ movzbl r1 ## L,r7 ## E; \
+ movzbl r1 ## H,r4 ## E; \
+ movl TAB+1024(,r4,4),r4 ## E;\
+ movw r3 ## X,r1 ## X; \
+ roll $16,r1 ## E; \
+ shrl $16,r3 ## E; \
+ xorl TAB(,r7,4),r5 ## E; \
+ movzbl r3 ## L,r7 ## E; \
+ movzbl r3 ## H,r3 ## E; \
+ xorl TAB+3072(,r3,4),r4 ## E;\
+ xorl TAB+2048(,r7,4),r5 ## E;\
+ movzbl r1 ## L,r7 ## E; \
+ movzbl r1 ## H,r3 ## E; \
+ shrl $16,r1 ## E; \
+ xorl TAB+3072(,r3,4),r6 ## E;\
+ movl TAB+2048(,r7,4),r3 ## E;\
+ movzbl r1 ## L,r7 ## E; \
+ movzbl r1 ## H,r1 ## E; \
+ xorl TAB+1024(,r1,4),r6 ## E;\
+ xorl TAB(,r7,4),r3 ## E; \
+ movzbl r2 ## H,r1 ## E; \
+ movzbl r2 ## L,r7 ## E; \
+ shrl $16,r2 ## E; \
+ xorl TAB+3072(,r1,4),r3 ## E;\
+ xorl TAB+2048(,r7,4),r4 ## E;\
+ movzbl r2 ## H,r1 ## E; \
+ movzbl r2 ## L,r2 ## E; \
+ xorl OFFSET+8(r8),rc ## E; \
+ xorl OFFSET+12(r8),rd ## E; \
+ xorl TAB+1024(,r1,4),r3 ## E;\
+ xorl TAB(,r2,4),r4 ## E;
+
+#define move_regs(r1,r2,r3,r4) \
+ movl r3 ## E,r1 ## E; \
+ movl r4 ## E,r2 ## E;
+
+#define entry(FUNC,KEY,B128,B192) \
+ prologue(FUNC,KEY,B128,B192,R2,R8,R1,R3,R4,R6,R10,R5,R11)
+
+#define return(FUNC) epilogue(FUNC,R8,R2,R5,R6,R3,R4,R11)
+
+#define encrypt_round(TAB,OFFSET) \
+ round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \
+ move_regs(R1,R2,R5,R6)
+
+#define encrypt_final(TAB,OFFSET) \
+ round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4)
+
+#define decrypt_round(TAB,OFFSET) \
+ round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) \
+ move_regs(R1,R2,R5,R6)
+
+#define decrypt_final(TAB,OFFSET) \
+ round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4)
+
+/* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */
+
+ entry(aes_enc_blk,0,.Le128,.Le192)
+ encrypt_round(crypto_ft_tab,-96)
+ encrypt_round(crypto_ft_tab,-80)
+.Le192: encrypt_round(crypto_ft_tab,-64)
+ encrypt_round(crypto_ft_tab,-48)
+.Le128: encrypt_round(crypto_ft_tab,-32)
+ encrypt_round(crypto_ft_tab,-16)
+ encrypt_round(crypto_ft_tab, 0)
+ encrypt_round(crypto_ft_tab, 16)
+ encrypt_round(crypto_ft_tab, 32)
+ encrypt_round(crypto_ft_tab, 48)
+ encrypt_round(crypto_ft_tab, 64)
+ encrypt_round(crypto_ft_tab, 80)
+ encrypt_round(crypto_ft_tab, 96)
+ encrypt_final(crypto_fl_tab,112)
+ return(aes_enc_blk)
+
+/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */
+
+ entry(aes_dec_blk,240,.Ld128,.Ld192)
+ decrypt_round(crypto_it_tab,-96)
+ decrypt_round(crypto_it_tab,-80)
+.Ld192: decrypt_round(crypto_it_tab,-64)
+ decrypt_round(crypto_it_tab,-48)
+.Ld128: decrypt_round(crypto_it_tab,-32)
+ decrypt_round(crypto_it_tab,-16)
+ decrypt_round(crypto_it_tab, 0)
+ decrypt_round(crypto_it_tab, 16)
+ decrypt_round(crypto_it_tab, 32)
+ decrypt_round(crypto_it_tab, 48)
+ decrypt_round(crypto_it_tab, 64)
+ decrypt_round(crypto_it_tab, 80)
+ decrypt_round(crypto_it_tab, 96)
+ decrypt_final(crypto_il_tab,112)
+ return(aes_dec_blk)
diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
new file mode 100644
index 000000000..77043a82d
--- /dev/null
+++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
@@ -0,0 +1,569 @@
+/*
+ * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
+ *
+ * This is AES128/192/256 CTR mode optimization implementation. It requires
+ * the support of Intel(R) AESNI and AVX instructions.
+ *
+ * This work was inspired by the AES CTR mode optimization published
+ * in Intel Optimized IPSEC Cryptograhpic library.
+ * Additional information on it can be found at:
+ * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Chandramouli Narayanan <mouli@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+
+#define VMOVDQ vmovdqu
+
+#define xdata0 %xmm0
+#define xdata1 %xmm1
+#define xdata2 %xmm2
+#define xdata3 %xmm3
+#define xdata4 %xmm4
+#define xdata5 %xmm5
+#define xdata6 %xmm6
+#define xdata7 %xmm7
+#define xcounter %xmm8
+#define xbyteswap %xmm9
+#define xkey0 %xmm10
+#define xkey4 %xmm11
+#define xkey8 %xmm12
+#define xkey12 %xmm13
+#define xkeyA %xmm14
+#define xkeyB %xmm15
+
+#define p_in %rdi
+#define p_iv %rsi
+#define p_keys %rdx
+#define p_out %rcx
+#define num_bytes %r8
+
+#define tmp %r10
+#define DDQ_DATA 0
+#define XDATA 1
+#define KEY_128 1
+#define KEY_192 2
+#define KEY_256 3
+
+.section .rodata
+.align 16
+
+byteswap_const:
+ .octa 0x000102030405060708090A0B0C0D0E0F
+ddq_low_msk:
+ .octa 0x0000000000000000FFFFFFFFFFFFFFFF
+ddq_high_add_1:
+ .octa 0x00000000000000010000000000000000
+ddq_add_1:
+ .octa 0x00000000000000000000000000000001
+ddq_add_2:
+ .octa 0x00000000000000000000000000000002
+ddq_add_3:
+ .octa 0x00000000000000000000000000000003
+ddq_add_4:
+ .octa 0x00000000000000000000000000000004
+ddq_add_5:
+ .octa 0x00000000000000000000000000000005
+ddq_add_6:
+ .octa 0x00000000000000000000000000000006
+ddq_add_7:
+ .octa 0x00000000000000000000000000000007
+ddq_add_8:
+ .octa 0x00000000000000000000000000000008
+
+.text
+
+/* generate a unique variable for ddq_add_x */
+
+/* generate a unique variable for xmm register */
+.macro setxdata n
+ var_xdata = %xmm\n
+.endm
+
+/* club the numeric 'id' to the symbol 'name' */
+
+.macro club name, id
+.altmacro
+ .if \name == XDATA
+ setxdata %\id
+ .endif
+.noaltmacro
+.endm
+
+/*
+ * do_aes num_in_par load_keys key_len
+ * This increments p_in, but not p_out
+ */
+.macro do_aes b, k, key_len
+ .set by, \b
+ .set load_keys, \k
+ .set klen, \key_len
+
+ .if (load_keys)
+ vmovdqa 0*16(p_keys), xkey0
+ .endif
+
+ vpshufb xbyteswap, xcounter, xdata0
+
+ .set i, 1
+ .rept (by - 1)
+ club XDATA, i
+ vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
+ vptest ddq_low_msk(%rip), var_xdata
+ jnz 1f
+ vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
+ vpaddq ddq_high_add_1(%rip), xcounter, xcounter
+ 1:
+ vpshufb xbyteswap, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 1*16(p_keys), xkeyA
+
+ vpxor xkey0, xdata0, xdata0
+ vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
+ vptest ddq_low_msk(%rip), xcounter
+ jnz 1f
+ vpaddq ddq_high_add_1(%rip), xcounter, xcounter
+ 1:
+
+ .set i, 1
+ .rept (by - 1)
+ club XDATA, i
+ vpxor xkey0, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 2*16(p_keys), xkeyB
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 1 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_128)
+ .if (load_keys)
+ vmovdqa 3*16(p_keys), xkey4
+ .endif
+ .else
+ vmovdqa 3*16(p_keys), xkeyA
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyB, var_xdata, var_xdata /* key 2 */
+ .set i, (i +1)
+ .endr
+
+ add $(16*by), p_in
+
+ .if (klen == KEY_128)
+ vmovdqa 4*16(p_keys), xkeyB
+ .else
+ .if (load_keys)
+ vmovdqa 4*16(p_keys), xkey4
+ .endif
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 3 */
+ .if (klen == KEY_128)
+ vaesenc xkey4, var_xdata, var_xdata
+ .else
+ vaesenc xkeyA, var_xdata, var_xdata
+ .endif
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 5*16(p_keys), xkeyA
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 4 */
+ .if (klen == KEY_128)
+ vaesenc xkeyB, var_xdata, var_xdata
+ .else
+ vaesenc xkey4, var_xdata, var_xdata
+ .endif
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_128)
+ .if (load_keys)
+ vmovdqa 6*16(p_keys), xkey8
+ .endif
+ .else
+ vmovdqa 6*16(p_keys), xkeyB
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 5 */
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 7*16(p_keys), xkeyA
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 6 */
+ .if (klen == KEY_128)
+ vaesenc xkey8, var_xdata, var_xdata
+ .else
+ vaesenc xkeyB, var_xdata, var_xdata
+ .endif
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_128)
+ vmovdqa 8*16(p_keys), xkeyB
+ .else
+ .if (load_keys)
+ vmovdqa 8*16(p_keys), xkey8
+ .endif
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 7 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_128)
+ .if (load_keys)
+ vmovdqa 9*16(p_keys), xkey12
+ .endif
+ .else
+ vmovdqa 9*16(p_keys), xkeyA
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 8 */
+ .if (klen == KEY_128)
+ vaesenc xkeyB, var_xdata, var_xdata
+ .else
+ vaesenc xkey8, var_xdata, var_xdata
+ .endif
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 10*16(p_keys), xkeyB
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 9 */
+ .if (klen == KEY_128)
+ vaesenc xkey12, var_xdata, var_xdata
+ .else
+ vaesenc xkeyA, var_xdata, var_xdata
+ .endif
+ .set i, (i +1)
+ .endr
+
+ .if (klen != KEY_128)
+ vmovdqa 11*16(p_keys), xkeyA
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 10 */
+ .if (klen == KEY_128)
+ vaesenclast xkeyB, var_xdata, var_xdata
+ .else
+ vaesenc xkeyB, var_xdata, var_xdata
+ .endif
+ .set i, (i +1)
+ .endr
+
+ .if (klen != KEY_128)
+ .if (load_keys)
+ vmovdqa 12*16(p_keys), xkey12
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 11 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_256)
+ vmovdqa 13*16(p_keys), xkeyA
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ .if (klen == KEY_256)
+ /* key 12 */
+ vaesenc xkey12, var_xdata, var_xdata
+ .else
+ vaesenclast xkey12, var_xdata, var_xdata
+ .endif
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_256)
+ vmovdqa 14*16(p_keys), xkeyB
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 13 */
+ vaesenc xkeyA, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 14 */
+ vaesenclast xkeyB, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+ .endif
+ .endif
+
+ .set i, 0
+ .rept (by / 2)
+ .set j, (i+1)
+ VMOVDQ (i*16 - 16*by)(p_in), xkeyA
+ VMOVDQ (j*16 - 16*by)(p_in), xkeyB
+ club XDATA, i
+ vpxor xkeyA, var_xdata, var_xdata
+ club XDATA, j
+ vpxor xkeyB, var_xdata, var_xdata
+ .set i, (i+2)
+ .endr
+
+ .if (i < by)
+ VMOVDQ (i*16 - 16*by)(p_in), xkeyA
+ club XDATA, i
+ vpxor xkeyA, var_xdata, var_xdata
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ VMOVDQ var_xdata, i*16(p_out)
+ .set i, (i+1)
+ .endr
+.endm
+
+.macro do_aes_load val, key_len
+ do_aes \val, 1, \key_len
+.endm
+
+.macro do_aes_noload val, key_len
+ do_aes \val, 0, \key_len
+.endm
+
+/* main body of aes ctr load */
+
+.macro do_aes_ctrmain key_len
+ cmp $16, num_bytes
+ jb .Ldo_return2\key_len
+
+ vmovdqa byteswap_const(%rip), xbyteswap
+ vmovdqu (p_iv), xcounter
+ vpshufb xbyteswap, xcounter, xcounter
+
+ mov num_bytes, tmp
+ and $(7*16), tmp
+ jz .Lmult_of_8_blks\key_len
+
+ /* 1 <= tmp <= 7 */
+ cmp $(4*16), tmp
+ jg .Lgt4\key_len
+ je .Leq4\key_len
+
+.Llt4\key_len:
+ cmp $(2*16), tmp
+ jg .Leq3\key_len
+ je .Leq2\key_len
+
+.Leq1\key_len:
+ do_aes_load 1, \key_len
+ add $(1*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Leq2\key_len:
+ do_aes_load 2, \key_len
+ add $(2*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+
+.Leq3\key_len:
+ do_aes_load 3, \key_len
+ add $(3*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Leq4\key_len:
+ do_aes_load 4, \key_len
+ add $(4*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Lgt4\key_len:
+ cmp $(6*16), tmp
+ jg .Leq7\key_len
+ je .Leq6\key_len
+
+.Leq5\key_len:
+ do_aes_load 5, \key_len
+ add $(5*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Leq6\key_len:
+ do_aes_load 6, \key_len
+ add $(6*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Leq7\key_len:
+ do_aes_load 7, \key_len
+ add $(7*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Lmult_of_8_blks\key_len:
+ .if (\key_len != KEY_128)
+ vmovdqa 0*16(p_keys), xkey0
+ vmovdqa 4*16(p_keys), xkey4
+ vmovdqa 8*16(p_keys), xkey8
+ vmovdqa 12*16(p_keys), xkey12
+ .else
+ vmovdqa 0*16(p_keys), xkey0
+ vmovdqa 3*16(p_keys), xkey4
+ vmovdqa 6*16(p_keys), xkey8
+ vmovdqa 9*16(p_keys), xkey12
+ .endif
+.align 16
+.Lmain_loop2\key_len:
+ /* num_bytes is a multiple of 8 and >0 */
+ do_aes_noload 8, \key_len
+ add $(8*16), p_out
+ sub $(8*16), num_bytes
+ jne .Lmain_loop2\key_len
+
+.Ldo_return2\key_len:
+ /* return updated IV */
+ vpshufb xbyteswap, xcounter, xcounter
+ vmovdqu xcounter, (p_iv)
+ ret
+.endm
+
+/*
+ * routine to do AES128 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
+ * unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_128_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_128
+
+ENDPROC(aes_ctr_enc_128_avx_by8)
+
+/*
+ * routine to do AES192 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
+ * unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_192_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_192
+
+ENDPROC(aes_ctr_enc_192_avx_by8)
+
+/*
+ * routine to do AES256 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
+ * unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_256_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_256
+
+ENDPROC(aes_ctr_enc_256_avx_by8)
diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c
new file mode 100644
index 000000000..e26984f7a
--- /dev/null
+++ b/arch/x86/crypto/aes_glue.c
@@ -0,0 +1,70 @@
+/*
+ * Glue Code for the asm optimized version of the AES Cipher Algorithm
+ *
+ */
+
+#include <linux/module.h>
+#include <crypto/aes.h>
+#include <asm/crypto/aes.h>
+
+asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
+asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
+
+void crypto_aes_encrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+{
+ aes_enc_blk(ctx, dst, src);
+}
+EXPORT_SYMBOL_GPL(crypto_aes_encrypt_x86);
+
+void crypto_aes_decrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+{
+ aes_dec_blk(ctx, dst, src);
+}
+EXPORT_SYMBOL_GPL(crypto_aes_decrypt_x86);
+
+static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ aes_enc_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ aes_dec_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static struct crypto_alg aes_alg = {
+ .cra_name = "aes",
+ .cra_driver_name = "aes-asm",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct crypto_aes_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = AES_MIN_KEY_SIZE,
+ .cia_max_keysize = AES_MAX_KEY_SIZE,
+ .cia_setkey = crypto_aes_set_key,
+ .cia_encrypt = aes_encrypt,
+ .cia_decrypt = aes_decrypt
+ }
+ }
+};
+
+static int __init aes_init(void)
+{
+ return crypto_register_alg(&aes_alg);
+}
+
+static void __exit aes_fini(void)
+{
+ crypto_unregister_alg(&aes_alg);
+}
+
+module_init(aes_init);
+module_exit(aes_fini);
+
+MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, asm optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("aes");
+MODULE_ALIAS_CRYPTO("aes-asm");
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
new file mode 100644
index 000000000..29b27f9a6
--- /dev/null
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -0,0 +1,2842 @@
+/*
+ * Implement AES algorithm in Intel AES-NI instructions.
+ *
+ * The white paper of AES-NI instructions can be downloaded from:
+ * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
+ *
+ * Copyright (C) 2008, Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ * Vinodh Gopal <vinodh.gopal@intel.com>
+ * Kahraman Akdemir
+ *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
+ * Aidan O'Mahony (aidan.o.mahony@intel.com)
+ * Adrian Hoban <adrian.hoban@intel.com>
+ * James Guilford (james.guilford@intel.com)
+ * Gabriele Paoloni <gabriele.paoloni@intel.com>
+ * Tadeusz Struk (tadeusz.struk@intel.com)
+ * Wajdi Feghali (wajdi.k.feghali@intel.com)
+ * Copyright (c) 2010, Intel Corporation.
+ *
+ * Ported x86_64 version to x86:
+ * Author: Mathias Krause <minipli@googlemail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+#include <asm/frame.h>
+#include <asm/nospec-branch.h>
+
+/*
+ * The following macros are used to move an (un)aligned 16 byte value to/from
+ * an XMM register. This can done for either FP or integer values, for FP use
+ * movaps (move aligned packed single) or integer use movdqa (move double quad
+ * aligned). It doesn't make a performance difference which instruction is used
+ * since Nehalem (original Core i7) was released. However, the movaps is a byte
+ * shorter, so that is the one we'll use for now. (same for unaligned).
+ */
+#define MOVADQ movaps
+#define MOVUDQ movups
+
+#ifdef __x86_64__
+
+# constants in mergeable sections, linker can reorder and merge
+.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
+.align 16
+.Lgf128mul_x_ble_mask:
+ .octa 0x00000000000000010000000000000087
+.section .rodata.cst16.POLY, "aM", @progbits, 16
+.align 16
+POLY: .octa 0xC2000000000000000000000000000001
+.section .rodata.cst16.TWOONE, "aM", @progbits, 16
+.align 16
+TWOONE: .octa 0x00000001000000000000000000000001
+
+.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
+.align 16
+SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
+.section .rodata.cst16.MASK1, "aM", @progbits, 16
+.align 16
+MASK1: .octa 0x0000000000000000ffffffffffffffff
+.section .rodata.cst16.MASK2, "aM", @progbits, 16
+.align 16
+MASK2: .octa 0xffffffffffffffff0000000000000000
+.section .rodata.cst16.ONE, "aM", @progbits, 16
+.align 16
+ONE: .octa 0x00000000000000000000000000000001
+.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
+.align 16
+F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
+.section .rodata.cst16.dec, "aM", @progbits, 16
+.align 16
+dec: .octa 0x1
+.section .rodata.cst16.enc, "aM", @progbits, 16
+.align 16
+enc: .octa 0x2
+
+# order of these constants should not change.
+# more specifically, ALL_F should follow SHIFT_MASK,
+# and zero should follow ALL_F
+.section .rodata, "a", @progbits
+.align 16
+SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
+ALL_F: .octa 0xffffffffffffffffffffffffffffffff
+ .octa 0x00000000000000000000000000000000
+
+.text
+
+
+#define STACK_OFFSET 8*3
+
+#define AadHash 16*0
+#define AadLen 16*1
+#define InLen (16*1)+8
+#define PBlockEncKey 16*2
+#define OrigIV 16*3
+#define CurCount 16*4
+#define PBlockLen 16*5
+#define HashKey 16*6 // store HashKey <<1 mod poly here
+#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here
+#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here
+#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here
+#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64
+ // bits of HashKey <<1 mod poly here
+ //(for Karatsuba purposes)
+#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64
+ // bits of HashKey^2 <<1 mod poly here
+ // (for Karatsuba purposes)
+#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64
+ // bits of HashKey^3 <<1 mod poly here
+ // (for Karatsuba purposes)
+#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64
+ // bits of HashKey^4 <<1 mod poly here
+ // (for Karatsuba purposes)
+
+#define arg1 rdi
+#define arg2 rsi
+#define arg3 rdx
+#define arg4 rcx
+#define arg5 r8
+#define arg6 r9
+#define arg7 STACK_OFFSET+8(%rsp)
+#define arg8 STACK_OFFSET+16(%rsp)
+#define arg9 STACK_OFFSET+24(%rsp)
+#define arg10 STACK_OFFSET+32(%rsp)
+#define arg11 STACK_OFFSET+40(%rsp)
+#define keysize 2*15*16(%arg1)
+#endif
+
+
+#define STATE1 %xmm0
+#define STATE2 %xmm4
+#define STATE3 %xmm5
+#define STATE4 %xmm6
+#define STATE STATE1
+#define IN1 %xmm1
+#define IN2 %xmm7
+#define IN3 %xmm8
+#define IN4 %xmm9
+#define IN IN1
+#define KEY %xmm2
+#define IV %xmm3
+
+#define BSWAP_MASK %xmm10
+#define CTR %xmm11
+#define INC %xmm12
+
+#define GF128MUL_MASK %xmm10
+
+#ifdef __x86_64__
+#define AREG %rax
+#define KEYP %rdi
+#define OUTP %rsi
+#define UKEYP OUTP
+#define INP %rdx
+#define LEN %rcx
+#define IVP %r8
+#define KLEN %r9d
+#define T1 %r10
+#define TKEYP T1
+#define T2 %r11
+#define TCTR_LOW T2
+#else
+#define AREG %eax
+#define KEYP %edi
+#define OUTP AREG
+#define UKEYP OUTP
+#define INP %edx
+#define LEN %esi
+#define IVP %ebp
+#define KLEN %ebx
+#define T1 %ecx
+#define TKEYP T1
+#endif
+
+.macro FUNC_SAVE
+ push %r12
+ push %r13
+ push %r14
+#
+# states of %xmm registers %xmm6:%xmm15 not saved
+# all %xmm registers are clobbered
+#
+.endm
+
+
+.macro FUNC_RESTORE
+ pop %r14
+ pop %r13
+ pop %r12
+.endm
+
+# Precompute hashkeys.
+# Input: Hash subkey.
+# Output: HashKeys stored in gcm_context_data. Only needs to be called
+# once per key.
+# clobbers r12, and tmp xmm registers.
+.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
+ mov \SUBKEY, %r12
+ movdqu (%r12), \TMP3
+ movdqa SHUF_MASK(%rip), \TMP2
+ PSHUFB_XMM \TMP2, \TMP3
+
+ # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
+
+ movdqa \TMP3, \TMP2
+ psllq $1, \TMP3
+ psrlq $63, \TMP2
+ movdqa \TMP2, \TMP1
+ pslldq $8, \TMP2
+ psrldq $8, \TMP1
+ por \TMP2, \TMP3
+
+ # reduce HashKey<<1
+
+ pshufd $0x24, \TMP1, \TMP2
+ pcmpeqd TWOONE(%rip), \TMP2
+ pand POLY(%rip), \TMP2
+ pxor \TMP2, \TMP3
+ movdqu \TMP3, HashKey(%arg2)
+
+ movdqa \TMP3, \TMP5
+ pshufd $78, \TMP3, \TMP1
+ pxor \TMP3, \TMP1
+ movdqu \TMP1, HashKey_k(%arg2)
+
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^2<<1 (mod poly)
+ movdqu \TMP5, HashKey_2(%arg2)
+# HashKey_2 = HashKey^2<<1 (mod poly)
+ pshufd $78, \TMP5, \TMP1
+ pxor \TMP5, \TMP1
+ movdqu \TMP1, HashKey_2_k(%arg2)
+
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+ movdqu \TMP5, HashKey_3(%arg2)
+ pshufd $78, \TMP5, \TMP1
+ pxor \TMP5, \TMP1
+ movdqu \TMP1, HashKey_3_k(%arg2)
+
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+ movdqu \TMP5, HashKey_4(%arg2)
+ pshufd $78, \TMP5, \TMP1
+ pxor \TMP5, \TMP1
+ movdqu \TMP1, HashKey_4_k(%arg2)
+.endm
+
+# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
+# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
+.macro GCM_INIT Iv SUBKEY AAD AADLEN
+ mov \AADLEN, %r11
+ mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
+ xor %r11d, %r11d
+ mov %r11, InLen(%arg2) # ctx_data.in_length = 0
+ mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
+ mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
+ mov \Iv, %rax
+ movdqu (%rax), %xmm0
+ movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
+
+ movdqa SHUF_MASK(%rip), %xmm2
+ PSHUFB_XMM %xmm2, %xmm0
+ movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
+
+ PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
+ movdqu HashKey(%arg2), %xmm13
+
+ CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
+ %xmm4, %xmm5, %xmm6
+.endm
+
+# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
+# struct has been initialized by GCM_INIT.
+# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
+# Clobbers rax, r10-r13, and xmm0-xmm15
+.macro GCM_ENC_DEC operation
+ movdqu AadHash(%arg2), %xmm8
+ movdqu HashKey(%arg2), %xmm13
+ add %arg5, InLen(%arg2)
+
+ xor %r11d, %r11d # initialise the data pointer offset as zero
+ PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
+
+ sub %r11, %arg5 # sub partial block data used
+ mov %arg5, %r13 # save the number of bytes
+
+ and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
+ mov %r13, %r12
+ # Encrypt/Decrypt first few blocks
+
+ and $(3<<4), %r12
+ jz _initial_num_blocks_is_0_\@
+ cmp $(2<<4), %r12
+ jb _initial_num_blocks_is_1_\@
+ je _initial_num_blocks_is_2_\@
+_initial_num_blocks_is_3_\@:
+ INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
+ sub $48, %r13
+ jmp _initial_blocks_\@
+_initial_num_blocks_is_2_\@:
+ INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
+ sub $32, %r13
+ jmp _initial_blocks_\@
+_initial_num_blocks_is_1_\@:
+ INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
+ sub $16, %r13
+ jmp _initial_blocks_\@
+_initial_num_blocks_is_0_\@:
+ INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
+_initial_blocks_\@:
+
+ # Main loop - Encrypt/Decrypt remaining blocks
+
+ cmp $0, %r13
+ je _zero_cipher_left_\@
+ sub $64, %r13
+ je _four_cipher_left_\@
+_crypt_by_4_\@:
+ GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
+ %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
+ %xmm7, %xmm8, enc
+ add $64, %r11
+ sub $64, %r13
+ jne _crypt_by_4_\@
+_four_cipher_left_\@:
+ GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_\@:
+ movdqu %xmm8, AadHash(%arg2)
+ movdqu %xmm0, CurCount(%arg2)
+
+ mov %arg5, %r13
+ and $15, %r13 # %r13 = arg5 (mod 16)
+ je _multiple_of_16_bytes_\@
+
+ mov %r13, PBlockLen(%arg2)
+
+ # Handle the last <16 Byte block separately
+ paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
+ movdqu %xmm0, CurCount(%arg2)
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10, %xmm0
+
+ ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
+ movdqu %xmm0, PBlockEncKey(%arg2)
+
+ cmp $16, %arg5
+ jge _large_enough_update_\@
+
+ lea (%arg4,%r11,1), %r10
+ mov %r13, %r12
+ READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
+ jmp _data_read_\@
+
+_large_enough_update_\@:
+ sub $16, %r11
+ add %r13, %r11
+
+ # receive the last <16 Byte block
+ movdqu (%arg4, %r11, 1), %xmm1
+
+ sub %r13, %r11
+ add $16, %r11
+
+ lea SHIFT_MASK+16(%rip), %r12
+ # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+ # (r13 is the number of bytes in plaintext mod 16)
+ sub %r13, %r12
+ # get the appropriate shuffle mask
+ movdqu (%r12), %xmm2
+ # shift right 16-r13 bytes
+ PSHUFB_XMM %xmm2, %xmm1
+
+_data_read_\@:
+ lea ALL_F+16(%rip), %r12
+ sub %r13, %r12
+
+.ifc \operation, dec
+ movdqa %xmm1, %xmm2
+.endif
+ pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn)
+ movdqu (%r12), %xmm1
+ # get the appropriate mask to mask out top 16-r13 bytes of xmm0
+ pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
+.ifc \operation, dec
+ pand %xmm1, %xmm2
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10 ,%xmm2
+
+ pxor %xmm2, %xmm8
+.else
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10,%xmm0
+
+ pxor %xmm0, %xmm8
+.endif
+
+ movdqu %xmm8, AadHash(%arg2)
+.ifc \operation, enc
+ # GHASH computation for the last <16 byte block
+ movdqa SHUF_MASK(%rip), %xmm10
+ # shuffle xmm0 back to output as ciphertext
+ PSHUFB_XMM %xmm10, %xmm0
+.endif
+
+ # Output %r13 bytes
+ MOVQ_R64_XMM %xmm0, %rax
+ cmp $8, %r13
+ jle _less_than_8_bytes_left_\@
+ mov %rax, (%arg3 , %r11, 1)
+ add $8, %r11
+ psrldq $8, %xmm0
+ MOVQ_R64_XMM %xmm0, %rax
+ sub $8, %r13
+_less_than_8_bytes_left_\@:
+ mov %al, (%arg3, %r11, 1)
+ add $1, %r11
+ shr $8, %rax
+ sub $1, %r13
+ jne _less_than_8_bytes_left_\@
+_multiple_of_16_bytes_\@:
+.endm
+
+# GCM_COMPLETE Finishes update of tag of last partial block
+# Output: Authorization Tag (AUTH_TAG)
+# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
+.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
+ movdqu AadHash(%arg2), %xmm8
+ movdqu HashKey(%arg2), %xmm13
+
+ mov PBlockLen(%arg2), %r12
+
+ cmp $0, %r12
+ je _partial_done\@
+
+ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+
+_partial_done\@:
+ mov AadLen(%arg2), %r12 # %r13 = aadLen (number of bytes)
+ shl $3, %r12 # convert into number of bits
+ movd %r12d, %xmm15 # len(A) in %xmm15
+ mov InLen(%arg2), %r12
+ shl $3, %r12 # len(C) in bits (*128)
+ MOVQ_R64_XMM %r12, %xmm1
+
+ pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
+ pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
+ pxor %xmm15, %xmm8
+ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+ # final GHASH computation
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10, %xmm8
+
+ movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
+ ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
+ pxor %xmm8, %xmm0
+_return_T_\@:
+ mov \AUTHTAG, %r10 # %r10 = authTag
+ mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len
+ cmp $16, %r11
+ je _T_16_\@
+ cmp $8, %r11
+ jl _T_4_\@
+_T_8_\@:
+ MOVQ_R64_XMM %xmm0, %rax
+ mov %rax, (%r10)
+ add $8, %r10
+ sub $8, %r11
+ psrldq $8, %xmm0
+ cmp $0, %r11
+ je _return_T_done_\@
+_T_4_\@:
+ movd %xmm0, %eax
+ mov %eax, (%r10)
+ add $4, %r10
+ sub $4, %r11
+ psrldq $4, %xmm0
+ cmp $0, %r11
+ je _return_T_done_\@
+_T_123_\@:
+ movd %xmm0, %eax
+ cmp $2, %r11
+ jl _T_1_\@
+ mov %ax, (%r10)
+ cmp $2, %r11
+ je _return_T_done_\@
+ add $2, %r10
+ sar $16, %eax
+_T_1_\@:
+ mov %al, (%r10)
+ jmp _return_T_done_\@
+_T_16_\@:
+ movdqu %xmm0, (%r10)
+_return_T_done_\@:
+.endm
+
+#ifdef __x86_64__
+/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+*
+*
+* Input: A and B (128-bits each, bit-reflected)
+* Output: C = A*B*x mod poly, (i.e. >>1 )
+* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+*
+*/
+.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
+ movdqa \GH, \TMP1
+ pshufd $78, \GH, \TMP2
+ pshufd $78, \HK, \TMP3
+ pxor \GH, \TMP2 # TMP2 = a1+a0
+ pxor \HK, \TMP3 # TMP3 = b1+b0
+ PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
+ PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
+ PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
+ pxor \GH, \TMP2
+ pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
+ movdqa \TMP2, \TMP3
+ pslldq $8, \TMP3 # left shift TMP3 2 DWs
+ psrldq $8, \TMP2 # right shift TMP2 2 DWs
+ pxor \TMP3, \GH
+ pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK
+
+ # first phase of the reduction
+
+ movdqa \GH, \TMP2
+ movdqa \GH, \TMP3
+ movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
+ # in in order to perform
+ # independent shifts
+ pslld $31, \TMP2 # packed right shift <<31
+ pslld $30, \TMP3 # packed right shift <<30
+ pslld $25, \TMP4 # packed right shift <<25
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ movdqa \TMP2, \TMP5
+ psrldq $4, \TMP5 # right shift TMP5 1 DW
+ pslldq $12, \TMP2 # left shift TMP2 3 DWs
+ pxor \TMP2, \GH
+
+ # second phase of the reduction
+
+ movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
+ # in in order to perform
+ # independent shifts
+ movdqa \GH,\TMP3
+ movdqa \GH,\TMP4
+ psrld $1,\TMP2 # packed left shift >>1
+ psrld $2,\TMP3 # packed left shift >>2
+ psrld $7,\TMP4 # packed left shift >>7
+ pxor \TMP3,\TMP2 # xor the shifted versions
+ pxor \TMP4,\TMP2
+ pxor \TMP5, \TMP2
+ pxor \TMP2, \GH
+ pxor \TMP1, \GH # result is in TMP1
+.endm
+
+# Reads DLEN bytes starting at DPTR and stores in XMMDst
+# where 0 < DLEN < 16
+# Clobbers %rax, DLEN and XMM1
+.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
+ cmp $8, \DLEN
+ jl _read_lt8_\@
+ mov (\DPTR), %rax
+ MOVQ_R64_XMM %rax, \XMMDst
+ sub $8, \DLEN
+ jz _done_read_partial_block_\@
+ xor %eax, %eax
+_read_next_byte_\@:
+ shl $8, %rax
+ mov 7(\DPTR, \DLEN, 1), %al
+ dec \DLEN
+ jnz _read_next_byte_\@
+ MOVQ_R64_XMM %rax, \XMM1
+ pslldq $8, \XMM1
+ por \XMM1, \XMMDst
+ jmp _done_read_partial_block_\@
+_read_lt8_\@:
+ xor %eax, %eax
+_read_next_byte_lt8_\@:
+ shl $8, %rax
+ mov -1(\DPTR, \DLEN, 1), %al
+ dec \DLEN
+ jnz _read_next_byte_lt8_\@
+ MOVQ_R64_XMM %rax, \XMMDst
+_done_read_partial_block_\@:
+.endm
+
+# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+# clobbers r10-11, xmm14
+.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
+ TMP6 TMP7
+ MOVADQ SHUF_MASK(%rip), %xmm14
+ mov \AAD, %r10 # %r10 = AAD
+ mov \AADLEN, %r11 # %r11 = aadLen
+ pxor \TMP7, \TMP7
+ pxor \TMP6, \TMP6
+
+ cmp $16, %r11
+ jl _get_AAD_rest\@
+_get_AAD_blocks\@:
+ movdqu (%r10), \TMP7
+ PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
+ pxor \TMP7, \TMP6
+ GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
+ add $16, %r10
+ sub $16, %r11
+ cmp $16, %r11
+ jge _get_AAD_blocks\@
+
+ movdqu \TMP6, \TMP7
+
+ /* read the last <16B of AAD */
+_get_AAD_rest\@:
+ cmp $0, %r11
+ je _get_AAD_done\@
+
+ READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
+ PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
+ pxor \TMP6, \TMP7
+ GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
+ movdqu \TMP7, \TMP6
+
+_get_AAD_done\@:
+ movdqu \TMP6, AadHash(%arg2)
+.endm
+
+# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
+# between update calls.
+# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
+# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
+# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
+.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
+ AAD_HASH operation
+ mov PBlockLen(%arg2), %r13
+ cmp $0, %r13
+ je _partial_block_done_\@ # Leave Macro if no partial blocks
+ # Read in input data without over reading
+ cmp $16, \PLAIN_CYPH_LEN
+ jl _fewer_than_16_bytes_\@
+ movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
+ jmp _data_read_\@
+
+_fewer_than_16_bytes_\@:
+ lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
+ mov \PLAIN_CYPH_LEN, %r12
+ READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
+
+ mov PBlockLen(%arg2), %r13
+
+_data_read_\@: # Finished reading in data
+
+ movdqu PBlockEncKey(%arg2), %xmm9
+ movdqu HashKey(%arg2), %xmm13
+
+ lea SHIFT_MASK(%rip), %r12
+
+ # adjust the shuffle mask pointer to be able to shift r13 bytes
+ # r16-r13 is the number of bytes in plaintext mod 16)
+ add %r13, %r12
+ movdqu (%r12), %xmm2 # get the appropriate shuffle mask
+ PSHUFB_XMM %xmm2, %xmm9 # shift right r13 bytes
+
+.ifc \operation, dec
+ movdqa %xmm1, %xmm3
+ pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
+
+ mov \PLAIN_CYPH_LEN, %r10
+ add %r13, %r10
+ # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
+ sub $16, %r10
+ # Determine if if partial block is not being filled and
+ # shift mask accordingly
+ jge _no_extra_mask_1_\@
+ sub %r10, %r12
+_no_extra_mask_1_\@:
+
+ movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+ # get the appropriate mask to mask out bottom r13 bytes of xmm9
+ pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9
+
+ pand %xmm1, %xmm3
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10, %xmm3
+ PSHUFB_XMM %xmm2, %xmm3
+ pxor %xmm3, \AAD_HASH
+
+ cmp $0, %r10
+ jl _partial_incomplete_1_\@
+
+ # GHASH computation for the last <16 Byte block
+ GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+ xor %eax, %eax
+
+ mov %rax, PBlockLen(%arg2)
+ jmp _dec_done_\@
+_partial_incomplete_1_\@:
+ add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
+_dec_done_\@:
+ movdqu \AAD_HASH, AadHash(%arg2)
+.else
+ pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn)
+
+ mov \PLAIN_CYPH_LEN, %r10
+ add %r13, %r10
+ # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
+ sub $16, %r10
+ # Determine if if partial block is not being filled and
+ # shift mask accordingly
+ jge _no_extra_mask_2_\@
+ sub %r10, %r12
+_no_extra_mask_2_\@:
+
+ movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+ # get the appropriate mask to mask out bottom r13 bytes of xmm9
+ pand %xmm1, %xmm9
+
+ movdqa SHUF_MASK(%rip), %xmm1
+ PSHUFB_XMM %xmm1, %xmm9
+ PSHUFB_XMM %xmm2, %xmm9
+ pxor %xmm9, \AAD_HASH
+
+ cmp $0, %r10
+ jl _partial_incomplete_2_\@
+
+ # GHASH computation for the last <16 Byte block
+ GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+ xor %eax, %eax
+
+ mov %rax, PBlockLen(%arg2)
+ jmp _encode_done_\@
+_partial_incomplete_2_\@:
+ add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
+_encode_done_\@:
+ movdqu \AAD_HASH, AadHash(%arg2)
+
+ movdqa SHUF_MASK(%rip), %xmm10
+ # shuffle xmm9 back to output as ciphertext
+ PSHUFB_XMM %xmm10, %xmm9
+ PSHUFB_XMM %xmm2, %xmm9
+.endif
+ # output encrypted Bytes
+ cmp $0, %r10
+ jl _partial_fill_\@
+ mov %r13, %r12
+ mov $16, %r13
+ # Set r13 to be the number of bytes to write out
+ sub %r12, %r13
+ jmp _count_set_\@
+_partial_fill_\@:
+ mov \PLAIN_CYPH_LEN, %r13
+_count_set_\@:
+ movdqa %xmm9, %xmm0
+ MOVQ_R64_XMM %xmm0, %rax
+ cmp $8, %r13
+ jle _less_than_8_bytes_left_\@
+
+ mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
+ add $8, \DATA_OFFSET
+ psrldq $8, %xmm0
+ MOVQ_R64_XMM %xmm0, %rax
+ sub $8, %r13
+_less_than_8_bytes_left_\@:
+ movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
+ add $1, \DATA_OFFSET
+ shr $8, %rax
+ sub $1, %r13
+ jne _less_than_8_bytes_left_\@
+_partial_block_done_\@:
+.endm # PARTIAL_BLOCK
+
+/*
+* if a = number of total plaintext bytes
+* b = floor(a/16)
+* num_initial_blocks = b mod 4
+* encrypt the initial num_initial_blocks blocks and apply ghash on
+* the ciphertext
+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
+* are clobbered
+* arg1, %arg2, %arg3 are used as a pointer only, not modified
+*/
+
+
+.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
+ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+ MOVADQ SHUF_MASK(%rip), %xmm14
+
+ movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0
+
+ # start AES for num_initial_blocks blocks
+
+ movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
+
+.if (\i == 5) || (\i == 6) || (\i == 7)
+
+ MOVADQ ONE(%RIP),\TMP1
+ MOVADQ 0(%arg1),\TMP2
+.irpc index, \i_seq
+ paddd \TMP1, \XMM0 # INCR Y0
+.ifc \operation, dec
+ movdqa \XMM0, %xmm\index
+.else
+ MOVADQ \XMM0, %xmm\index
+.endif
+ PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
+ pxor \TMP2, %xmm\index
+.endr
+ lea 0x10(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ add $5,%eax # 128->9, 192->11, 256->13
+
+aes_loop_initial_\@:
+ MOVADQ (%r10),\TMP1
+.irpc index, \i_seq
+ AESENC \TMP1, %xmm\index
+.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_initial_\@
+
+ MOVADQ (%r10), \TMP1
+.irpc index, \i_seq
+ AESENCLAST \TMP1, %xmm\index # Last Round
+.endr
+.irpc index, \i_seq
+ movdqu (%arg4 , %r11, 1), \TMP1
+ pxor \TMP1, %xmm\index
+ movdqu %xmm\index, (%arg3 , %r11, 1)
+ # write back plaintext/ciphertext for num_initial_blocks
+ add $16, %r11
+
+.ifc \operation, dec
+ movdqa \TMP1, %xmm\index
+.endif
+ PSHUFB_XMM %xmm14, %xmm\index
+
+ # prepare plaintext/ciphertext for GHASH computation
+.endr
+.endif
+
+ # apply GHASH on num_initial_blocks blocks
+
+.if \i == 5
+ pxor %xmm5, %xmm6
+ GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ pxor %xmm6, %xmm7
+ GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ pxor %xmm7, %xmm8
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 6
+ pxor %xmm6, %xmm7
+ GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ pxor %xmm7, %xmm8
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 7
+ pxor %xmm7, %xmm8
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.endif
+ cmp $64, %r13
+ jl _initial_blocks_done\@
+ # no need for precomputed values
+/*
+*
+* Precomputations for HashKey parallel with encryption of first 4 blocks.
+* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+*/
+ MOVADQ ONE(%RIP),\TMP1
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM1
+ PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM2
+ PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
+
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM3
+ PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
+
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM4
+ PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+
+ MOVADQ 0(%arg1),\TMP1
+ pxor \TMP1, \XMM1
+ pxor \TMP1, \XMM2
+ pxor \TMP1, \XMM3
+ pxor \TMP1, \XMM4
+.irpc index, 1234 # do 4 rounds
+ movaps 0x10*\index(%arg1), \TMP1
+ AESENC \TMP1, \XMM1
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+.endr
+.irpc index, 56789 # do next 5 rounds
+ movaps 0x10*\index(%arg1), \TMP1
+ AESENC \TMP1, \XMM1
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+.endr
+ lea 0xa0(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ sub $4,%eax # 128->0, 192->2, 256->4
+ jz aes_loop_pre_done\@
+
+aes_loop_pre_\@:
+ MOVADQ (%r10),\TMP2
+.irpc index, 1234
+ AESENC \TMP2, %xmm\index
+.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_pre_\@
+
+aes_loop_pre_done\@:
+ MOVADQ (%r10), \TMP2
+ AESENCLAST \TMP2, \XMM1
+ AESENCLAST \TMP2, \XMM2
+ AESENCLAST \TMP2, \XMM3
+ AESENCLAST \TMP2, \XMM4
+ movdqu 16*0(%arg4 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM1
+.ifc \operation, dec
+ movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
+ movdqa \TMP1, \XMM1
+.endif
+ movdqu 16*1(%arg4 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM2
+.ifc \operation, dec
+ movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
+ movdqa \TMP1, \XMM2
+.endif
+ movdqu 16*2(%arg4 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM3
+.ifc \operation, dec
+ movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
+ movdqa \TMP1, \XMM3
+.endif
+ movdqu 16*3(%arg4 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM4
+.ifc \operation, dec
+ movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
+ movdqa \TMP1, \XMM4
+.else
+ movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
+ movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
+ movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
+ movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
+.endif
+
+ add $64, %r11
+ PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+ pxor \XMMDst, \XMM1
+# combine GHASHed value with the corresponding ciphertext
+ PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
+ PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
+ PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+
+_initial_blocks_done\@:
+
+.endm
+
+/*
+* encrypt 4 blocks at a time
+* ghash the 4 previously encrypted ciphertext blocks
+* arg1, %arg3, %arg4 are used as pointers only, not modified
+* %r11 is the data offset value
+*/
+.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
+TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
+
+ movdqa \XMM1, \XMM5
+ movdqa \XMM2, \XMM6
+ movdqa \XMM3, \XMM7
+ movdqa \XMM4, \XMM8
+
+ movdqa SHUF_MASK(%rip), %xmm15
+ # multiply TMP5 * HashKey using karatsuba
+
+ movdqa \XMM5, \TMP4
+ pshufd $78, \XMM5, \TMP6
+ pxor \XMM5, \TMP6
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqu HashKey_4(%arg2), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
+ movdqa \XMM0, \XMM1
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM2
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM3
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM4
+ PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
+ PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
+ PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+
+ pxor (%arg1), \XMM1
+ pxor (%arg1), \XMM2
+ pxor (%arg1), \XMM3
+ pxor (%arg1), \XMM4
+ movdqu HashKey_4_k(%arg2), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
+ movaps 0x10(%arg1), \TMP1
+ AESENC \TMP1, \XMM1 # Round 1
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+ movaps 0x20(%arg1), \TMP1
+ AESENC \TMP1, \XMM1 # Round 2
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+ movdqa \XMM6, \TMP1
+ pshufd $78, \XMM6, \TMP2
+ pxor \XMM6, \TMP2
+ movdqu HashKey_3(%arg2), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
+ movaps 0x30(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 3
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
+ movaps 0x40(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 4
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ movdqu HashKey_3_k(%arg2), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movaps 0x50(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 5
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ pxor \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+ pxor \XMM6, \XMM5
+ pxor \TMP2, \TMP6
+ movdqa \XMM7, \TMP1
+ pshufd $78, \XMM7, \TMP2
+ pxor \XMM7, \TMP2
+ movdqu HashKey_2(%arg2), \TMP5
+
+ # Multiply TMP5 * HashKey using karatsuba
+
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ movaps 0x60(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 6
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
+ movaps 0x70(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 7
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ movdqu HashKey_2_k(%arg2), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movaps 0x80(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 8
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ pxor \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+ pxor \XMM7, \XMM5
+ pxor \TMP2, \TMP6
+
+ # Multiply XMM8 * HashKey
+ # XMM8 and TMP5 hold the values for the two operands
+
+ movdqa \XMM8, \TMP1
+ pshufd $78, \XMM8, \TMP2
+ pxor \XMM8, \TMP2
+ movdqu HashKey(%arg2), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ movaps 0x90(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 9
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
+ lea 0xa0(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ sub $4,%eax # 128->0, 192->2, 256->4
+ jz aes_loop_par_enc_done\@
+
+aes_loop_par_enc\@:
+ MOVADQ (%r10),\TMP3
+.irpc index, 1234
+ AESENC \TMP3, %xmm\index
+.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_par_enc\@
+
+aes_loop_par_enc_done\@:
+ MOVADQ (%r10), \TMP3
+ AESENCLAST \TMP3, \XMM1 # Round 10
+ AESENCLAST \TMP3, \XMM2
+ AESENCLAST \TMP3, \XMM3
+ AESENCLAST \TMP3, \XMM4
+ movdqu HashKey_k(%arg2), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movdqu (%arg4,%r11,1), \TMP3
+ pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
+ movdqu 16(%arg4,%r11,1), \TMP3
+ pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
+ movdqu 32(%arg4,%r11,1), \TMP3
+ pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
+ movdqu 48(%arg4,%r11,1), \TMP3
+ pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
+ movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer
+ movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
+ movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
+ movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
+ PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+
+ pxor \TMP4, \TMP1
+ pxor \XMM8, \XMM5
+ pxor \TMP6, \TMP2
+ pxor \TMP1, \TMP2
+ pxor \XMM5, \TMP2
+ movdqa \TMP2, \TMP3
+ pslldq $8, \TMP3 # left shift TMP3 2 DWs
+ psrldq $8, \TMP2 # right shift TMP2 2 DWs
+ pxor \TMP3, \XMM5
+ pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
+
+ # first phase of reduction
+
+ movdqa \XMM5, \TMP2
+ movdqa \XMM5, \TMP3
+ movdqa \XMM5, \TMP4
+# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
+ pslld $31, \TMP2 # packed right shift << 31
+ pslld $30, \TMP3 # packed right shift << 30
+ pslld $25, \TMP4 # packed right shift << 25
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ movdqa \TMP2, \TMP5
+ psrldq $4, \TMP5 # right shift T5 1 DW
+ pslldq $12, \TMP2 # left shift T2 3 DWs
+ pxor \TMP2, \XMM5
+
+ # second phase of reduction
+
+ movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
+ movdqa \XMM5,\TMP3
+ movdqa \XMM5,\TMP4
+ psrld $1, \TMP2 # packed left shift >>1
+ psrld $2, \TMP3 # packed left shift >>2
+ psrld $7, \TMP4 # packed left shift >>7
+ pxor \TMP3,\TMP2 # xor the shifted versions
+ pxor \TMP4,\TMP2
+ pxor \TMP5, \TMP2
+ pxor \TMP2, \XMM5
+ pxor \TMP1, \XMM5 # result is in TMP1
+
+ pxor \XMM5, \XMM1
+.endm
+
+/*
+* decrypt 4 blocks at a time
+* ghash the 4 previously decrypted ciphertext blocks
+* arg1, %arg3, %arg4 are used as pointers only, not modified
+* %r11 is the data offset value
+*/
+.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
+TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
+
+ movdqa \XMM1, \XMM5
+ movdqa \XMM2, \XMM6
+ movdqa \XMM3, \XMM7
+ movdqa \XMM4, \XMM8
+
+ movdqa SHUF_MASK(%rip), %xmm15
+ # multiply TMP5 * HashKey using karatsuba
+
+ movdqa \XMM5, \TMP4
+ pshufd $78, \XMM5, \TMP6
+ pxor \XMM5, \TMP6
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqu HashKey_4(%arg2), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
+ movdqa \XMM0, \XMM1
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM2
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM3
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM4
+ PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
+ PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
+ PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+
+ pxor (%arg1), \XMM1
+ pxor (%arg1), \XMM2
+ pxor (%arg1), \XMM3
+ pxor (%arg1), \XMM4
+ movdqu HashKey_4_k(%arg2), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
+ movaps 0x10(%arg1), \TMP1
+ AESENC \TMP1, \XMM1 # Round 1
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+ movaps 0x20(%arg1), \TMP1
+ AESENC \TMP1, \XMM1 # Round 2
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+ movdqa \XMM6, \TMP1
+ pshufd $78, \XMM6, \TMP2
+ pxor \XMM6, \TMP2
+ movdqu HashKey_3(%arg2), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
+ movaps 0x30(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 3
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
+ movaps 0x40(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 4
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ movdqu HashKey_3_k(%arg2), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movaps 0x50(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 5
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ pxor \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+ pxor \XMM6, \XMM5
+ pxor \TMP2, \TMP6
+ movdqa \XMM7, \TMP1
+ pshufd $78, \XMM7, \TMP2
+ pxor \XMM7, \TMP2
+ movdqu HashKey_2(%arg2), \TMP5
+
+ # Multiply TMP5 * HashKey using karatsuba
+
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ movaps 0x60(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 6
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
+ movaps 0x70(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 7
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ movdqu HashKey_2_k(%arg2), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movaps 0x80(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 8
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ pxor \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+ pxor \XMM7, \XMM5
+ pxor \TMP2, \TMP6
+
+ # Multiply XMM8 * HashKey
+ # XMM8 and TMP5 hold the values for the two operands
+
+ movdqa \XMM8, \TMP1
+ pshufd $78, \XMM8, \TMP2
+ pxor \XMM8, \TMP2
+ movdqu HashKey(%arg2), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ movaps 0x90(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 9
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
+ lea 0xa0(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ sub $4,%eax # 128->0, 192->2, 256->4
+ jz aes_loop_par_dec_done\@
+
+aes_loop_par_dec\@:
+ MOVADQ (%r10),\TMP3
+.irpc index, 1234
+ AESENC \TMP3, %xmm\index
+.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_par_dec\@
+
+aes_loop_par_dec_done\@:
+ MOVADQ (%r10), \TMP3
+ AESENCLAST \TMP3, \XMM1 # last round
+ AESENCLAST \TMP3, \XMM2
+ AESENCLAST \TMP3, \XMM3
+ AESENCLAST \TMP3, \XMM4
+ movdqu HashKey_k(%arg2), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movdqu (%arg4,%r11,1), \TMP3
+ pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
+ movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
+ movdqa \TMP3, \XMM1
+ movdqu 16(%arg4,%r11,1), \TMP3
+ pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
+ movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
+ movdqa \TMP3, \XMM2
+ movdqu 32(%arg4,%r11,1), \TMP3
+ pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
+ movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
+ movdqa \TMP3, \XMM3
+ movdqu 48(%arg4,%r11,1), \TMP3
+ pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
+ movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
+ movdqa \TMP3, \XMM4
+ PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+
+ pxor \TMP4, \TMP1
+ pxor \XMM8, \XMM5
+ pxor \TMP6, \TMP2
+ pxor \TMP1, \TMP2
+ pxor \XMM5, \TMP2
+ movdqa \TMP2, \TMP3
+ pslldq $8, \TMP3 # left shift TMP3 2 DWs
+ psrldq $8, \TMP2 # right shift TMP2 2 DWs
+ pxor \TMP3, \XMM5
+ pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
+
+ # first phase of reduction
+
+ movdqa \XMM5, \TMP2
+ movdqa \XMM5, \TMP3
+ movdqa \XMM5, \TMP4
+# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
+ pslld $31, \TMP2 # packed right shift << 31
+ pslld $30, \TMP3 # packed right shift << 30
+ pslld $25, \TMP4 # packed right shift << 25
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ movdqa \TMP2, \TMP5
+ psrldq $4, \TMP5 # right shift T5 1 DW
+ pslldq $12, \TMP2 # left shift T2 3 DWs
+ pxor \TMP2, \XMM5
+
+ # second phase of reduction
+
+ movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
+ movdqa \XMM5,\TMP3
+ movdqa \XMM5,\TMP4
+ psrld $1, \TMP2 # packed left shift >>1
+ psrld $2, \TMP3 # packed left shift >>2
+ psrld $7, \TMP4 # packed left shift >>7
+ pxor \TMP3,\TMP2 # xor the shifted versions
+ pxor \TMP4,\TMP2
+ pxor \TMP5, \TMP2
+ pxor \TMP2, \XMM5
+ pxor \TMP1, \XMM5 # result is in TMP1
+
+ pxor \XMM5, \XMM1
+.endm
+
+/* GHASH the last 4 ciphertext blocks. */
+.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
+TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
+
+ # Multiply TMP6 * HashKey (using Karatsuba)
+
+ movdqa \XMM1, \TMP6
+ pshufd $78, \XMM1, \TMP2
+ pxor \XMM1, \TMP2
+ movdqu HashKey_4(%arg2), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
+ PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
+ movdqu HashKey_4_k(%arg2), \TMP4
+ PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movdqa \XMM1, \XMMDst
+ movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
+
+ # Multiply TMP1 * HashKey (using Karatsuba)
+
+ movdqa \XMM2, \TMP1
+ pshufd $78, \XMM2, \TMP2
+ pxor \XMM2, \TMP2
+ movdqu HashKey_3(%arg2), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
+ movdqu HashKey_3_k(%arg2), \TMP4
+ PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pxor \TMP1, \TMP6
+ pxor \XMM2, \XMMDst
+ pxor \TMP2, \XMM1
+# results accumulated in TMP6, XMMDst, XMM1
+
+ # Multiply TMP1 * HashKey (using Karatsuba)
+
+ movdqa \XMM3, \TMP1
+ pshufd $78, \XMM3, \TMP2
+ pxor \XMM3, \TMP2
+ movdqu HashKey_2(%arg2), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
+ movdqu HashKey_2_k(%arg2), \TMP4
+ PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pxor \TMP1, \TMP6
+ pxor \XMM3, \XMMDst
+ pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
+
+ # Multiply TMP1 * HashKey (using Karatsuba)
+ movdqa \XMM4, \TMP1
+ pshufd $78, \XMM4, \TMP2
+ pxor \XMM4, \TMP2
+ movdqu HashKey(%arg2), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
+ movdqu HashKey_k(%arg2), \TMP4
+ PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pxor \TMP1, \TMP6
+ pxor \XMM4, \XMMDst
+ pxor \XMM1, \TMP2
+ pxor \TMP6, \TMP2
+ pxor \XMMDst, \TMP2
+ # middle section of the temp results combined as in karatsuba algorithm
+ movdqa \TMP2, \TMP4
+ pslldq $8, \TMP4 # left shift TMP4 2 DWs
+ psrldq $8, \TMP2 # right shift TMP2 2 DWs
+ pxor \TMP4, \XMMDst
+ pxor \TMP2, \TMP6
+# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
+ # first phase of the reduction
+ movdqa \XMMDst, \TMP2
+ movdqa \XMMDst, \TMP3
+ movdqa \XMMDst, \TMP4
+# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
+ pslld $31, \TMP2 # packed right shifting << 31
+ pslld $30, \TMP3 # packed right shifting << 30
+ pslld $25, \TMP4 # packed right shifting << 25
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ movdqa \TMP2, \TMP7
+ psrldq $4, \TMP7 # right shift TMP7 1 DW
+ pslldq $12, \TMP2 # left shift TMP2 3 DWs
+ pxor \TMP2, \XMMDst
+
+ # second phase of the reduction
+ movdqa \XMMDst, \TMP2
+ # make 3 copies of XMMDst for doing 3 shift operations
+ movdqa \XMMDst, \TMP3
+ movdqa \XMMDst, \TMP4
+ psrld $1, \TMP2 # packed left shift >> 1
+ psrld $2, \TMP3 # packed left shift >> 2
+ psrld $7, \TMP4 # packed left shift >> 7
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ pxor \TMP7, \TMP2
+ pxor \TMP2, \XMMDst
+ pxor \TMP6, \XMMDst # reduced result is in XMMDst
+.endm
+
+
+/* Encryption of a single block
+* uses eax & r10
+*/
+
+.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
+
+ pxor (%arg1), \XMM0
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ add $5,%eax # 128->9, 192->11, 256->13
+ lea 16(%arg1), %r10 # get first expanded key address
+
+_esb_loop_\@:
+ MOVADQ (%r10),\TMP1
+ AESENC \TMP1,\XMM0
+ add $16,%r10
+ sub $1,%eax
+ jnz _esb_loop_\@
+
+ MOVADQ (%r10),\TMP1
+ AESENCLAST \TMP1,\XMM0
+.endm
+/*****************************************************************************
+* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
+* struct gcm_context_data *data
+* // Context data
+* u8 *out, // Plaintext output. Encrypt in-place is allowed.
+* const u8 *in, // Ciphertext input
+* u64 plaintext_len, // Length of data in bytes for decryption.
+* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
+* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+* // concatenated with 0x00000001. 16-byte aligned pointer.
+* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
+* const u8 *aad, // Additional Authentication Data (AAD)
+* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
+* // given authentication tag and only return the plaintext if they match.
+* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
+* // (most likely), 12 or 8.
+*
+* Assumptions:
+*
+* keys:
+* keys are pre-expanded and aligned to 16 bytes. we are using the first
+* set of 11 keys in the data structure void *aes_ctx
+*
+* iv:
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | Salt (From the SA) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | Initialization Vector |
+* | (This is the sequence number from IPSec header) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x1 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+* AAD padded to 128 bits with 0
+* for example, assume AAD is a u32 vector
+*
+* if AAD is 8 bytes:
+* AAD[3] = {A0, A1};
+* padded AAD in xmm register = {A1 A0 0 0}
+*
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | SPI (A1) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 32-bit Sequence Number (A0) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x0 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+* AAD Format with 32-bit Sequence Number
+*
+* if AAD is 12 bytes:
+* AAD[3] = {A0, A1, A2};
+* padded AAD in xmm register = {A2 A1 A0 0}
+*
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | SPI (A2) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 64-bit Extended Sequence Number {A1,A0} |
+* | |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x0 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+* AAD Format with 64-bit Extended Sequence Number
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+*
+*****************************************************************************/
+ENTRY(aesni_gcm_dec)
+ FUNC_SAVE
+
+ GCM_INIT %arg6, arg7, arg8, arg9
+ GCM_ENC_DEC dec
+ GCM_COMPLETE arg10, arg11
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_dec)
+
+
+/*****************************************************************************
+* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
+* struct gcm_context_data *data
+* // Context data
+* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
+* const u8 *in, // Plaintext input
+* u64 plaintext_len, // Length of data in bytes for encryption.
+* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
+* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+* // concatenated with 0x00000001. 16-byte aligned pointer.
+* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
+* const u8 *aad, // Additional Authentication Data (AAD)
+* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+* u8 *auth_tag, // Authenticated Tag output.
+* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
+* // 12 or 8.
+*
+* Assumptions:
+*
+* keys:
+* keys are pre-expanded and aligned to 16 bytes. we are using the
+* first set of 11 keys in the data structure void *aes_ctx
+*
+*
+* iv:
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | Salt (From the SA) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | Initialization Vector |
+* | (This is the sequence number from IPSec header) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x1 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+* AAD padded to 128 bits with 0
+* for example, assume AAD is a u32 vector
+*
+* if AAD is 8 bytes:
+* AAD[3] = {A0, A1};
+* padded AAD in xmm register = {A1 A0 0 0}
+*
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | SPI (A1) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 32-bit Sequence Number (A0) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x0 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+* AAD Format with 32-bit Sequence Number
+*
+* if AAD is 12 bytes:
+* AAD[3] = {A0, A1, A2};
+* padded AAD in xmm register = {A2 A1 A0 0}
+*
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | SPI (A2) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 64-bit Extended Sequence Number {A1,A0} |
+* | |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x0 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+* AAD Format with 64-bit Extended Sequence Number
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+***************************************************************************/
+ENTRY(aesni_gcm_enc)
+ FUNC_SAVE
+
+ GCM_INIT %arg6, arg7, arg8, arg9
+ GCM_ENC_DEC enc
+
+ GCM_COMPLETE arg10, arg11
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_enc)
+
+/*****************************************************************************
+* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
+* struct gcm_context_data *data,
+* // context data
+* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
+* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+* // concatenated with 0x00000001. 16-byte aligned pointer.
+* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
+* const u8 *aad, // Additional Authentication Data (AAD)
+* u64 aad_len) // Length of AAD in bytes.
+*/
+ENTRY(aesni_gcm_init)
+ FUNC_SAVE
+ GCM_INIT %arg3, %arg4,%arg5, %arg6
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_init)
+
+/*****************************************************************************
+* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
+* struct gcm_context_data *data,
+* // context data
+* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
+* const u8 *in, // Plaintext input
+* u64 plaintext_len, // Length of data in bytes for encryption.
+*/
+ENTRY(aesni_gcm_enc_update)
+ FUNC_SAVE
+ GCM_ENC_DEC enc
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_enc_update)
+
+/*****************************************************************************
+* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
+* struct gcm_context_data *data,
+* // context data
+* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
+* const u8 *in, // Plaintext input
+* u64 plaintext_len, // Length of data in bytes for encryption.
+*/
+ENTRY(aesni_gcm_dec_update)
+ FUNC_SAVE
+ GCM_ENC_DEC dec
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_dec_update)
+
+/*****************************************************************************
+* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
+* struct gcm_context_data *data,
+* // context data
+* u8 *auth_tag, // Authenticated Tag output.
+* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
+* // 12 or 8.
+*/
+ENTRY(aesni_gcm_finalize)
+ FUNC_SAVE
+ GCM_COMPLETE %arg3 %arg4
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_finalize)
+
+#endif
+
+
+.align 4
+_key_expansion_128:
+_key_expansion_256a:
+ pshufd $0b11111111, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+ movaps %xmm0, (TKEYP)
+ add $0x10, TKEYP
+ ret
+ENDPROC(_key_expansion_128)
+ENDPROC(_key_expansion_256a)
+
+.align 4
+_key_expansion_192a:
+ pshufd $0b01010101, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+
+ movaps %xmm2, %xmm5
+ movaps %xmm2, %xmm6
+ pslldq $4, %xmm5
+ pshufd $0b11111111, %xmm0, %xmm3
+ pxor %xmm3, %xmm2
+ pxor %xmm5, %xmm2
+
+ movaps %xmm0, %xmm1
+ shufps $0b01000100, %xmm0, %xmm6
+ movaps %xmm6, (TKEYP)
+ shufps $0b01001110, %xmm2, %xmm1
+ movaps %xmm1, 0x10(TKEYP)
+ add $0x20, TKEYP
+ ret
+ENDPROC(_key_expansion_192a)
+
+.align 4
+_key_expansion_192b:
+ pshufd $0b01010101, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+
+ movaps %xmm2, %xmm5
+ pslldq $4, %xmm5
+ pshufd $0b11111111, %xmm0, %xmm3
+ pxor %xmm3, %xmm2
+ pxor %xmm5, %xmm2
+
+ movaps %xmm0, (TKEYP)
+ add $0x10, TKEYP
+ ret
+ENDPROC(_key_expansion_192b)
+
+.align 4
+_key_expansion_256b:
+ pshufd $0b10101010, %xmm1, %xmm1
+ shufps $0b00010000, %xmm2, %xmm4
+ pxor %xmm4, %xmm2
+ shufps $0b10001100, %xmm2, %xmm4
+ pxor %xmm4, %xmm2
+ pxor %xmm1, %xmm2
+ movaps %xmm2, (TKEYP)
+ add $0x10, TKEYP
+ ret
+ENDPROC(_key_expansion_256b)
+
+/*
+ * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+ * unsigned int key_len)
+ */
+ENTRY(aesni_set_key)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl KEYP
+ movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
+ movl (FRAME_OFFSET+16)(%esp), %edx # key_len
+#endif
+ movups (UKEYP), %xmm0 # user key (first 16 bytes)
+ movaps %xmm0, (KEYP)
+ lea 0x10(KEYP), TKEYP # key addr
+ movl %edx, 480(KEYP)
+ pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
+ cmp $24, %dl
+ jb .Lenc_key128
+ je .Lenc_key192
+ movups 0x10(UKEYP), %xmm2 # other user key
+ movaps %xmm2, (TKEYP)
+ add $0x10, TKEYP
+ AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x1 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x2 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x4 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x8 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x10 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
+ call _key_expansion_256a
+ AESKEYGENASSIST 0x20 %xmm0 %xmm1
+ call _key_expansion_256b
+ AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
+ call _key_expansion_256a
+ jmp .Ldec_key
+.Lenc_key192:
+ movq 0x10(UKEYP), %xmm2 # other user key
+ AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
+ call _key_expansion_192a
+ AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
+ call _key_expansion_192b
+ AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
+ call _key_expansion_192a
+ AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
+ call _key_expansion_192b
+ AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
+ call _key_expansion_192a
+ AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
+ call _key_expansion_192b
+ AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
+ call _key_expansion_192a
+ AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
+ call _key_expansion_192b
+ jmp .Ldec_key
+.Lenc_key128:
+ AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
+ call _key_expansion_128
+ AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
+ call _key_expansion_128
+ AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
+ call _key_expansion_128
+ AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
+ call _key_expansion_128
+ AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
+ call _key_expansion_128
+ AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
+ call _key_expansion_128
+ AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
+ call _key_expansion_128
+ AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
+ call _key_expansion_128
+ AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
+ call _key_expansion_128
+ AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
+ call _key_expansion_128
+.Ldec_key:
+ sub $0x10, TKEYP
+ movaps (KEYP), %xmm0
+ movaps (TKEYP), %xmm1
+ movaps %xmm0, 240(TKEYP)
+ movaps %xmm1, 240(KEYP)
+ add $0x10, KEYP
+ lea 240-16(TKEYP), UKEYP
+.align 4
+.Ldec_key_loop:
+ movaps (KEYP), %xmm0
+ AESIMC %xmm0 %xmm1
+ movaps %xmm1, (UKEYP)
+ add $0x10, KEYP
+ sub $0x10, UKEYP
+ cmp TKEYP, KEYP
+ jb .Ldec_key_loop
+ xor AREG, AREG
+#ifndef __x86_64__
+ popl KEYP
+#endif
+ FRAME_END
+ ret
+ENDPROC(aesni_set_key)
+
+/*
+ * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+ */
+ENTRY(aesni_enc)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+16)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+20)(%esp), INP # src
+#endif
+ movl 480(KEYP), KLEN # key length
+ movups (INP), STATE # input
+ call _aesni_enc1
+ movups STATE, (OUTP) # output
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+#endif
+ FRAME_END
+ ret
+ENDPROC(aesni_enc)
+
+/*
+ * _aesni_enc1: internal ABI
+ * input:
+ * KEYP: key struct pointer
+ * KLEN: round count
+ * STATE: initial state (input)
+ * output:
+ * STATE: finial state (output)
+ * changed:
+ * KEY
+ * TKEYP (T1)
+ */
+.align 4
+_aesni_enc1:
+ movaps (KEYP), KEY # key
+ mov KEYP, TKEYP
+ pxor KEY, STATE # round 0
+ add $0x30, TKEYP
+ cmp $24, KLEN
+ jb .Lenc128
+ lea 0x20(TKEYP), TKEYP
+ je .Lenc192
+ add $0x20, TKEYP
+ movaps -0x60(TKEYP), KEY
+ AESENC KEY STATE
+ movaps -0x50(TKEYP), KEY
+ AESENC KEY STATE
+.align 4
+.Lenc192:
+ movaps -0x40(TKEYP), KEY
+ AESENC KEY STATE
+ movaps -0x30(TKEYP), KEY
+ AESENC KEY STATE
+.align 4
+.Lenc128:
+ movaps -0x20(TKEYP), KEY
+ AESENC KEY STATE
+ movaps -0x10(TKEYP), KEY
+ AESENC KEY STATE
+ movaps (TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x10(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x20(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x30(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x40(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x50(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x60(TKEYP), KEY
+ AESENC KEY STATE
+ movaps 0x70(TKEYP), KEY
+ AESENCLAST KEY STATE
+ ret
+ENDPROC(_aesni_enc1)
+
+/*
+ * _aesni_enc4: internal ABI
+ * input:
+ * KEYP: key struct pointer
+ * KLEN: round count
+ * STATE1: initial state (input)
+ * STATE2
+ * STATE3
+ * STATE4
+ * output:
+ * STATE1: finial state (output)
+ * STATE2
+ * STATE3
+ * STATE4
+ * changed:
+ * KEY
+ * TKEYP (T1)
+ */
+.align 4
+_aesni_enc4:
+ movaps (KEYP), KEY # key
+ mov KEYP, TKEYP
+ pxor KEY, STATE1 # round 0
+ pxor KEY, STATE2
+ pxor KEY, STATE3
+ pxor KEY, STATE4
+ add $0x30, TKEYP
+ cmp $24, KLEN
+ jb .L4enc128
+ lea 0x20(TKEYP), TKEYP
+ je .L4enc192
+ add $0x20, TKEYP
+ movaps -0x60(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps -0x50(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+#.align 4
+.L4enc192:
+ movaps -0x40(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps -0x30(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+#.align 4
+.L4enc128:
+ movaps -0x20(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps -0x10(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps (TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x10(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x20(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x30(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x40(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x50(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x60(TKEYP), KEY
+ AESENC KEY STATE1
+ AESENC KEY STATE2
+ AESENC KEY STATE3
+ AESENC KEY STATE4
+ movaps 0x70(TKEYP), KEY
+ AESENCLAST KEY STATE1 # last round
+ AESENCLAST KEY STATE2
+ AESENCLAST KEY STATE3
+ AESENCLAST KEY STATE4
+ ret
+ENDPROC(_aesni_enc4)
+
+/*
+ * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+ */
+ENTRY(aesni_dec)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+16)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+20)(%esp), INP # src
+#endif
+ mov 480(KEYP), KLEN # key length
+ add $240, KEYP
+ movups (INP), STATE # input
+ call _aesni_dec1
+ movups STATE, (OUTP) #output
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+#endif
+ FRAME_END
+ ret
+ENDPROC(aesni_dec)
+
+/*
+ * _aesni_dec1: internal ABI
+ * input:
+ * KEYP: key struct pointer
+ * KLEN: key length
+ * STATE: initial state (input)
+ * output:
+ * STATE: finial state (output)
+ * changed:
+ * KEY
+ * TKEYP (T1)
+ */
+.align 4
+_aesni_dec1:
+ movaps (KEYP), KEY # key
+ mov KEYP, TKEYP
+ pxor KEY, STATE # round 0
+ add $0x30, TKEYP
+ cmp $24, KLEN
+ jb .Ldec128
+ lea 0x20(TKEYP), TKEYP
+ je .Ldec192
+ add $0x20, TKEYP
+ movaps -0x60(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps -0x50(TKEYP), KEY
+ AESDEC KEY STATE
+.align 4
+.Ldec192:
+ movaps -0x40(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps -0x30(TKEYP), KEY
+ AESDEC KEY STATE
+.align 4
+.Ldec128:
+ movaps -0x20(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps -0x10(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps (TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x10(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x20(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x30(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x40(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x50(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x60(TKEYP), KEY
+ AESDEC KEY STATE
+ movaps 0x70(TKEYP), KEY
+ AESDECLAST KEY STATE
+ ret
+ENDPROC(_aesni_dec1)
+
+/*
+ * _aesni_dec4: internal ABI
+ * input:
+ * KEYP: key struct pointer
+ * KLEN: key length
+ * STATE1: initial state (input)
+ * STATE2
+ * STATE3
+ * STATE4
+ * output:
+ * STATE1: finial state (output)
+ * STATE2
+ * STATE3
+ * STATE4
+ * changed:
+ * KEY
+ * TKEYP (T1)
+ */
+.align 4
+_aesni_dec4:
+ movaps (KEYP), KEY # key
+ mov KEYP, TKEYP
+ pxor KEY, STATE1 # round 0
+ pxor KEY, STATE2
+ pxor KEY, STATE3
+ pxor KEY, STATE4
+ add $0x30, TKEYP
+ cmp $24, KLEN
+ jb .L4dec128
+ lea 0x20(TKEYP), TKEYP
+ je .L4dec192
+ add $0x20, TKEYP
+ movaps -0x60(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps -0x50(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+.align 4
+.L4dec192:
+ movaps -0x40(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps -0x30(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+.align 4
+.L4dec128:
+ movaps -0x20(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps -0x10(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps (TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x10(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x20(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x30(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x40(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x50(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x60(TKEYP), KEY
+ AESDEC KEY STATE1
+ AESDEC KEY STATE2
+ AESDEC KEY STATE3
+ AESDEC KEY STATE4
+ movaps 0x70(TKEYP), KEY
+ AESDECLAST KEY STATE1 # last round
+ AESDECLAST KEY STATE2
+ AESDECLAST KEY STATE3
+ AESDECLAST KEY STATE4
+ ret
+ENDPROC(_aesni_dec4)
+
+/*
+ * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len)
+ */
+ENTRY(aesni_ecb_enc)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+20)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+24)(%esp), INP # src
+ movl (FRAME_OFFSET+28)(%esp), LEN # len
+#endif
+ test LEN, LEN # check length
+ jz .Lecb_enc_ret
+ mov 480(KEYP), KLEN
+ cmp $16, LEN
+ jb .Lecb_enc_ret
+ cmp $64, LEN
+ jb .Lecb_enc_loop1
+.align 4
+.Lecb_enc_loop4:
+ movups (INP), STATE1
+ movups 0x10(INP), STATE2
+ movups 0x20(INP), STATE3
+ movups 0x30(INP), STATE4
+ call _aesni_enc4
+ movups STATE1, (OUTP)
+ movups STATE2, 0x10(OUTP)
+ movups STATE3, 0x20(OUTP)
+ movups STATE4, 0x30(OUTP)
+ sub $64, LEN
+ add $64, INP
+ add $64, OUTP
+ cmp $64, LEN
+ jge .Lecb_enc_loop4
+ cmp $16, LEN
+ jb .Lecb_enc_ret
+.align 4
+.Lecb_enc_loop1:
+ movups (INP), STATE1
+ call _aesni_enc1
+ movups STATE1, (OUTP)
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lecb_enc_loop1
+.Lecb_enc_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+#endif
+ FRAME_END
+ ret
+ENDPROC(aesni_ecb_enc)
+
+/*
+ * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len);
+ */
+ENTRY(aesni_ecb_dec)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+20)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+24)(%esp), INP # src
+ movl (FRAME_OFFSET+28)(%esp), LEN # len
+#endif
+ test LEN, LEN
+ jz .Lecb_dec_ret
+ mov 480(KEYP), KLEN
+ add $240, KEYP
+ cmp $16, LEN
+ jb .Lecb_dec_ret
+ cmp $64, LEN
+ jb .Lecb_dec_loop1
+.align 4
+.Lecb_dec_loop4:
+ movups (INP), STATE1
+ movups 0x10(INP), STATE2
+ movups 0x20(INP), STATE3
+ movups 0x30(INP), STATE4
+ call _aesni_dec4
+ movups STATE1, (OUTP)
+ movups STATE2, 0x10(OUTP)
+ movups STATE3, 0x20(OUTP)
+ movups STATE4, 0x30(OUTP)
+ sub $64, LEN
+ add $64, INP
+ add $64, OUTP
+ cmp $64, LEN
+ jge .Lecb_dec_loop4
+ cmp $16, LEN
+ jb .Lecb_dec_ret
+.align 4
+.Lecb_dec_loop1:
+ movups (INP), STATE1
+ call _aesni_dec1
+ movups STATE1, (OUTP)
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lecb_dec_loop1
+.Lecb_dec_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+#endif
+ FRAME_END
+ ret
+ENDPROC(aesni_ecb_dec)
+
+/*
+ * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+ENTRY(aesni_cbc_enc)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
+#endif
+ cmp $16, LEN
+ jb .Lcbc_enc_ret
+ mov 480(KEYP), KLEN
+ movups (IVP), STATE # load iv as initial state
+.align 4
+.Lcbc_enc_loop:
+ movups (INP), IN # load input
+ pxor IN, STATE
+ call _aesni_enc1
+ movups STATE, (OUTP) # store output
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lcbc_enc_loop
+ movups STATE, (IVP)
+.Lcbc_enc_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
+ FRAME_END
+ ret
+ENDPROC(aesni_cbc_enc)
+
+/*
+ * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+ENTRY(aesni_cbc_dec)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
+#endif
+ cmp $16, LEN
+ jb .Lcbc_dec_just_ret
+ mov 480(KEYP), KLEN
+ add $240, KEYP
+ movups (IVP), IV
+ cmp $64, LEN
+ jb .Lcbc_dec_loop1
+.align 4
+.Lcbc_dec_loop4:
+ movups (INP), IN1
+ movaps IN1, STATE1
+ movups 0x10(INP), IN2
+ movaps IN2, STATE2
+#ifdef __x86_64__
+ movups 0x20(INP), IN3
+ movaps IN3, STATE3
+ movups 0x30(INP), IN4
+ movaps IN4, STATE4
+#else
+ movups 0x20(INP), IN1
+ movaps IN1, STATE3
+ movups 0x30(INP), IN2
+ movaps IN2, STATE4
+#endif
+ call _aesni_dec4
+ pxor IV, STATE1
+#ifdef __x86_64__
+ pxor IN1, STATE2
+ pxor IN2, STATE3
+ pxor IN3, STATE4
+ movaps IN4, IV
+#else
+ pxor IN1, STATE4
+ movaps IN2, IV
+ movups (INP), IN1
+ pxor IN1, STATE2
+ movups 0x10(INP), IN2
+ pxor IN2, STATE3
+#endif
+ movups STATE1, (OUTP)
+ movups STATE2, 0x10(OUTP)
+ movups STATE3, 0x20(OUTP)
+ movups STATE4, 0x30(OUTP)
+ sub $64, LEN
+ add $64, INP
+ add $64, OUTP
+ cmp $64, LEN
+ jge .Lcbc_dec_loop4
+ cmp $16, LEN
+ jb .Lcbc_dec_ret
+.align 4
+.Lcbc_dec_loop1:
+ movups (INP), IN
+ movaps IN, STATE
+ call _aesni_dec1
+ pxor IV, STATE
+ movups STATE, (OUTP)
+ movaps IN, IV
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lcbc_dec_loop1
+.Lcbc_dec_ret:
+ movups IV, (IVP)
+.Lcbc_dec_just_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
+ FRAME_END
+ ret
+ENDPROC(aesni_cbc_dec)
+
+#ifdef __x86_64__
+.pushsection .rodata
+.align 16
+.Lbswap_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.popsection
+
+/*
+ * _aesni_inc_init: internal ABI
+ * setup registers used by _aesni_inc
+ * input:
+ * IV
+ * output:
+ * CTR: == IV, in little endian
+ * TCTR_LOW: == lower qword of CTR
+ * INC: == 1, in little endian
+ * BSWAP_MASK == endian swapping mask
+ */
+.align 4
+_aesni_inc_init:
+ movaps .Lbswap_mask, BSWAP_MASK
+ movaps IV, CTR
+ PSHUFB_XMM BSWAP_MASK CTR
+ mov $1, TCTR_LOW
+ MOVQ_R64_XMM TCTR_LOW INC
+ MOVQ_R64_XMM CTR TCTR_LOW
+ ret
+ENDPROC(_aesni_inc_init)
+
+/*
+ * _aesni_inc: internal ABI
+ * Increase IV by 1, IV is in big endian
+ * input:
+ * IV
+ * CTR: == IV, in little endian
+ * TCTR_LOW: == lower qword of CTR
+ * INC: == 1, in little endian
+ * BSWAP_MASK == endian swapping mask
+ * output:
+ * IV: Increase by 1
+ * changed:
+ * CTR: == output IV, in little endian
+ * TCTR_LOW: == lower qword of CTR
+ */
+.align 4
+_aesni_inc:
+ paddq INC, CTR
+ add $1, TCTR_LOW
+ jnc .Linc_low
+ pslldq $8, INC
+ paddq INC, CTR
+ psrldq $8, INC
+.Linc_low:
+ movaps CTR, IV
+ PSHUFB_XMM BSWAP_MASK IV
+ ret
+ENDPROC(_aesni_inc)
+
+/*
+ * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+ENTRY(aesni_ctr_enc)
+ FRAME_BEGIN
+ cmp $16, LEN
+ jb .Lctr_enc_just_ret
+ mov 480(KEYP), KLEN
+ movups (IVP), IV
+ call _aesni_inc_init
+ cmp $64, LEN
+ jb .Lctr_enc_loop1
+.align 4
+.Lctr_enc_loop4:
+ movaps IV, STATE1
+ call _aesni_inc
+ movups (INP), IN1
+ movaps IV, STATE2
+ call _aesni_inc
+ movups 0x10(INP), IN2
+ movaps IV, STATE3
+ call _aesni_inc
+ movups 0x20(INP), IN3
+ movaps IV, STATE4
+ call _aesni_inc
+ movups 0x30(INP), IN4
+ call _aesni_enc4
+ pxor IN1, STATE1
+ movups STATE1, (OUTP)
+ pxor IN2, STATE2
+ movups STATE2, 0x10(OUTP)
+ pxor IN3, STATE3
+ movups STATE3, 0x20(OUTP)
+ pxor IN4, STATE4
+ movups STATE4, 0x30(OUTP)
+ sub $64, LEN
+ add $64, INP
+ add $64, OUTP
+ cmp $64, LEN
+ jge .Lctr_enc_loop4
+ cmp $16, LEN
+ jb .Lctr_enc_ret
+.align 4
+.Lctr_enc_loop1:
+ movaps IV, STATE
+ call _aesni_inc
+ movups (INP), IN
+ call _aesni_enc1
+ pxor IN, STATE
+ movups STATE, (OUTP)
+ sub $16, LEN
+ add $16, INP
+ add $16, OUTP
+ cmp $16, LEN
+ jge .Lctr_enc_loop1
+.Lctr_enc_ret:
+ movups IV, (IVP)
+.Lctr_enc_just_ret:
+ FRAME_END
+ ret
+ENDPROC(aesni_ctr_enc)
+
+/*
+ * _aesni_gf128mul_x_ble: internal ABI
+ * Multiply in GF(2^128) for XTS IVs
+ * input:
+ * IV: current IV
+ * GF128MUL_MASK == mask with 0x87 and 0x01
+ * output:
+ * IV: next IV
+ * changed:
+ * CTR: == temporary value
+ */
+#define _aesni_gf128mul_x_ble() \
+ pshufd $0x13, IV, CTR; \
+ paddq IV, IV; \
+ psrad $31, CTR; \
+ pand GF128MUL_MASK, CTR; \
+ pxor CTR, IV;
+
+/*
+ * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * bool enc, u8 *iv)
+ */
+ENTRY(aesni_xts_crypt8)
+ FRAME_BEGIN
+ cmpb $0, %cl
+ movl $0, %ecx
+ movl $240, %r10d
+ leaq _aesni_enc4, %r11
+ leaq _aesni_dec4, %rax
+ cmovel %r10d, %ecx
+ cmoveq %rax, %r11
+
+ movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+ movups (IVP), IV
+
+ mov 480(KEYP), KLEN
+ addq %rcx, KEYP
+
+ movdqa IV, STATE1
+ movdqu 0x00(INP), INC
+ pxor INC, STATE1
+ movdqu IV, 0x00(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE2
+ movdqu 0x10(INP), INC
+ pxor INC, STATE2
+ movdqu IV, 0x10(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE3
+ movdqu 0x20(INP), INC
+ pxor INC, STATE3
+ movdqu IV, 0x20(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE4
+ movdqu 0x30(INP), INC
+ pxor INC, STATE4
+ movdqu IV, 0x30(OUTP)
+
+ CALL_NOSPEC %r11
+
+ movdqu 0x00(OUTP), INC
+ pxor INC, STATE1
+ movdqu STATE1, 0x00(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE1
+ movdqu 0x40(INP), INC
+ pxor INC, STATE1
+ movdqu IV, 0x40(OUTP)
+
+ movdqu 0x10(OUTP), INC
+ pxor INC, STATE2
+ movdqu STATE2, 0x10(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE2
+ movdqu 0x50(INP), INC
+ pxor INC, STATE2
+ movdqu IV, 0x50(OUTP)
+
+ movdqu 0x20(OUTP), INC
+ pxor INC, STATE3
+ movdqu STATE3, 0x20(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE3
+ movdqu 0x60(INP), INC
+ pxor INC, STATE3
+ movdqu IV, 0x60(OUTP)
+
+ movdqu 0x30(OUTP), INC
+ pxor INC, STATE4
+ movdqu STATE4, 0x30(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE4
+ movdqu 0x70(INP), INC
+ pxor INC, STATE4
+ movdqu IV, 0x70(OUTP)
+
+ _aesni_gf128mul_x_ble()
+ movups IV, (IVP)
+
+ CALL_NOSPEC %r11
+
+ movdqu 0x40(OUTP), INC
+ pxor INC, STATE1
+ movdqu STATE1, 0x40(OUTP)
+
+ movdqu 0x50(OUTP), INC
+ pxor INC, STATE2
+ movdqu STATE2, 0x50(OUTP)
+
+ movdqu 0x60(OUTP), INC
+ pxor INC, STATE3
+ movdqu STATE3, 0x60(OUTP)
+
+ movdqu 0x70(OUTP), INC
+ pxor INC, STATE4
+ movdqu STATE4, 0x70(OUTP)
+
+ FRAME_END
+ ret
+ENDPROC(aesni_xts_crypt8)
+
+#endif
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
new file mode 100644
index 000000000..1985ea0b5
--- /dev/null
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -0,0 +1,2946 @@
+########################################################################
+# Copyright (c) 2013, Intel Corporation
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the
+# distribution.
+#
+# * Neither the name of the Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+#
+# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
+# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+##
+## Authors:
+## Erdinc Ozturk <erdinc.ozturk@intel.com>
+## Vinodh Gopal <vinodh.gopal@intel.com>
+## James Guilford <james.guilford@intel.com>
+## Tim Chen <tim.c.chen@linux.intel.com>
+##
+## References:
+## This code was derived and highly optimized from the code described in paper:
+## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
+## on Intel Architecture Processors. August, 2010
+## The details of the implementation is explained in:
+## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
+## on Intel Architecture Processors. October, 2012.
+##
+## Assumptions:
+##
+##
+##
+## iv:
+## 0 1 2 3
+## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | Salt (From the SA) |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | Initialization Vector |
+## | (This is the sequence number from IPSec header) |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | 0x1 |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##
+##
+##
+## AAD:
+## AAD padded to 128 bits with 0
+## for example, assume AAD is a u32 vector
+##
+## if AAD is 8 bytes:
+## AAD[3] = {A0, A1}#
+## padded AAD in xmm register = {A1 A0 0 0}
+##
+## 0 1 2 3
+## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | SPI (A1) |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | 32-bit Sequence Number (A0) |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | 0x0 |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##
+## AAD Format with 32-bit Sequence Number
+##
+## if AAD is 12 bytes:
+## AAD[3] = {A0, A1, A2}#
+## padded AAD in xmm register = {A2 A1 A0 0}
+##
+## 0 1 2 3
+## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | SPI (A2) |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | 64-bit Extended Sequence Number {A1,A0} |
+## | |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+## | 0x0 |
+## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+##
+## AAD Format with 64-bit Extended Sequence Number
+##
+##
+## aadLen:
+## from the definition of the spec, aadLen can only be 8 or 12 bytes.
+## The code additionally supports aadLen of length 16 bytes.
+##
+## TLen:
+## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+##
+## poly = x^128 + x^127 + x^126 + x^121 + 1
+## throughout the code, one tab and two tab indentations are used. one tab is
+## for GHASH part, two tabs is for AES part.
+##
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+
+# constants in mergeable sections, linker can reorder and merge
+.section .rodata.cst16.POLY, "aM", @progbits, 16
+.align 16
+POLY: .octa 0xC2000000000000000000000000000001
+
+.section .rodata.cst16.POLY2, "aM", @progbits, 16
+.align 16
+POLY2: .octa 0xC20000000000000000000001C2000000
+
+.section .rodata.cst16.TWOONE, "aM", @progbits, 16
+.align 16
+TWOONE: .octa 0x00000001000000000000000000000001
+
+.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
+.align 16
+SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
+
+.section .rodata.cst16.ONE, "aM", @progbits, 16
+.align 16
+ONE: .octa 0x00000000000000000000000000000001
+
+.section .rodata.cst16.ONEf, "aM", @progbits, 16
+.align 16
+ONEf: .octa 0x01000000000000000000000000000000
+
+# order of these constants should not change.
+# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
+.section .rodata, "a", @progbits
+.align 16
+SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
+ALL_F: .octa 0xffffffffffffffffffffffffffffffff
+ .octa 0x00000000000000000000000000000000
+
+.section .rodata
+.align 16
+.type aad_shift_arr, @object
+.size aad_shift_arr, 272
+aad_shift_arr:
+ .octa 0xffffffffffffffffffffffffffffffff
+ .octa 0xffffffffffffffffffffffffffffff0C
+ .octa 0xffffffffffffffffffffffffffff0D0C
+ .octa 0xffffffffffffffffffffffffff0E0D0C
+ .octa 0xffffffffffffffffffffffff0F0E0D0C
+ .octa 0xffffffffffffffffffffff0C0B0A0908
+ .octa 0xffffffffffffffffffff0D0C0B0A0908
+ .octa 0xffffffffffffffffff0E0D0C0B0A0908
+ .octa 0xffffffffffffffff0F0E0D0C0B0A0908
+ .octa 0xffffffffffffff0C0B0A090807060504
+ .octa 0xffffffffffff0D0C0B0A090807060504
+ .octa 0xffffffffff0E0D0C0B0A090807060504
+ .octa 0xffffffff0F0E0D0C0B0A090807060504
+ .octa 0xffffff0C0B0A09080706050403020100
+ .octa 0xffff0D0C0B0A09080706050403020100
+ .octa 0xff0E0D0C0B0A09080706050403020100
+ .octa 0x0F0E0D0C0B0A09080706050403020100
+
+
+.text
+
+
+##define the fields of the gcm aes context
+#{
+# u8 expanded_keys[16*11] store expanded keys
+# u8 shifted_hkey_1[16] store HashKey <<1 mod poly here
+# u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here
+# u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here
+# u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here
+# u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here
+# u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here
+# u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here
+# u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here
+# u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+# u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+#} gcm_ctx#
+
+HashKey = 16*11 # store HashKey <<1 mod poly here
+HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here
+HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here
+HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here
+HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here
+HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here
+HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here
+HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here
+HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
+HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+
+#define arg1 %rdi
+#define arg2 %rsi
+#define arg3 %rdx
+#define arg4 %rcx
+#define arg5 %r8
+#define arg6 %r9
+#define arg7 STACK_OFFSET+8*1(%r14)
+#define arg8 STACK_OFFSET+8*2(%r14)
+#define arg9 STACK_OFFSET+8*3(%r14)
+
+i = 0
+j = 0
+
+out_order = 0
+in_order = 1
+DEC = 0
+ENC = 1
+
+.macro define_reg r n
+reg_\r = %xmm\n
+.endm
+
+.macro setreg
+.altmacro
+define_reg i %i
+define_reg j %j
+.noaltmacro
+.endm
+
+# need to push 4 registers into stack to maintain
+STACK_OFFSET = 8*4
+
+TMP1 = 16*0 # Temporary storage for AAD
+TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+TMP3 = 16*2 # Temporary storage for AES State 3
+TMP4 = 16*3 # Temporary storage for AES State 4
+TMP5 = 16*4 # Temporary storage for AES State 5
+TMP6 = 16*5 # Temporary storage for AES State 6
+TMP7 = 16*6 # Temporary storage for AES State 7
+TMP8 = 16*7 # Temporary storage for AES State 8
+
+VARIABLE_OFFSET = 16*8
+
+################################
+# Utility Macros
+################################
+
+# Encryption of a single block
+.macro ENCRYPT_SINGLE_BLOCK XMM0
+ vpxor (arg1), \XMM0, \XMM0
+ i = 1
+ setreg
+.rep 9
+ vaesenc 16*i(arg1), \XMM0, \XMM0
+ i = (i+1)
+ setreg
+.endr
+ vaesenclast 16*10(arg1), \XMM0, \XMM0
+.endm
+
+#ifdef CONFIG_AS_AVX
+###############################################################################
+# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+# Input: A and B (128-bits each, bit-reflected)
+# Output: C = A*B*x mod poly, (i.e. >>1 )
+# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+###############################################################################
+.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
+
+ vpshufd $0b01001110, \GH, \T2
+ vpshufd $0b01001110, \HK, \T3
+ vpxor \GH , \T2, \T2 # T2 = (a1+a0)
+ vpxor \HK , \T3, \T3 # T3 = (b1+b0)
+
+ vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
+ vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
+ vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
+ vpxor \GH, \T2,\T2
+ vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
+
+ vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
+ vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
+ vpxor \T3, \GH, \GH
+ vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
+
+ #first phase of the reduction
+ vpslld $31, \GH, \T2 # packed right shifting << 31
+ vpslld $30, \GH, \T3 # packed right shifting shift << 30
+ vpslld $25, \GH, \T4 # packed right shifting shift << 25
+
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
+
+ vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
+ vpxor \T2, \GH, \GH # first phase of the reduction complete
+
+ #second phase of the reduction
+
+ vpsrld $1,\GH, \T2 # packed left shifting >> 1
+ vpsrld $2,\GH, \T3 # packed left shifting >> 2
+ vpsrld $7,\GH, \T4 # packed left shifting >> 7
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpxor \T5, \T2, \T2
+ vpxor \T2, \GH, \GH
+ vpxor \T1, \GH, \GH # the result is in GH
+
+
+.endm
+
+.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
+
+ # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+ vmovdqa \HK, \T5
+
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
+ vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_2_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
+ vmovdqa \T5, HashKey_3(arg1)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_3_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
+ vmovdqa \T5, HashKey_4(arg1)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_4_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
+ vmovdqa \T5, HashKey_5(arg1)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_5_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
+ vmovdqa \T5, HashKey_6(arg1)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_6_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
+ vmovdqa \T5, HashKey_7(arg1)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_7_k(arg1)
+
+ GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
+ vmovdqa \T5, HashKey_8(arg1)
+ vpshufd $0b01001110, \T5, \T1
+ vpxor \T5, \T1, \T1
+ vmovdqa \T1, HashKey_8_k(arg1)
+
+.endm
+
+## if a = number of total plaintext bytes
+## b = floor(a/16)
+## num_initial_blocks = b mod 4#
+## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
+## r10, r11, r12, rax are clobbered
+## arg1, arg2, arg3, r14 are used as a pointer only, not modified
+
+.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
+ i = (8-\num_initial_blocks)
+ j = 0
+ setreg
+
+ mov arg6, %r10 # r10 = AAD
+ mov arg7, %r12 # r12 = aadLen
+
+
+ mov %r12, %r11
+
+ vpxor reg_j, reg_j, reg_j
+ vpxor reg_i, reg_i, reg_i
+ cmp $16, %r11
+ jl _get_AAD_rest8\@
+_get_AAD_blocks\@:
+ vmovdqu (%r10), reg_i
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i
+ vpxor reg_i, reg_j, reg_j
+ GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6
+ add $16, %r10
+ sub $16, %r12
+ sub $16, %r11
+ cmp $16, %r11
+ jge _get_AAD_blocks\@
+ vmovdqu reg_j, reg_i
+ cmp $0, %r11
+ je _get_AAD_done\@
+
+ vpxor reg_i, reg_i, reg_i
+
+ /* read the last <16B of AAD. since we have at least 4B of
+ data right after the AAD (the ICV, and maybe some CT), we can
+ read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\@:
+ cmp $4, %r11
+ jle _get_AAD_rest4\@
+ movq (%r10), \T1
+ add $8, %r10
+ sub $8, %r11
+ vpslldq $8, \T1, \T1
+ vpsrldq $8, reg_i, reg_i
+ vpxor \T1, reg_i, reg_i
+ jmp _get_AAD_rest8\@
+_get_AAD_rest4\@:
+ cmp $0, %r11
+ jle _get_AAD_rest0\@
+ mov (%r10), %eax
+ movq %rax, \T1
+ add $4, %r10
+ sub $4, %r11
+ vpslldq $12, \T1, \T1
+ vpsrldq $4, reg_i, reg_i
+ vpxor \T1, reg_i, reg_i
+_get_AAD_rest0\@:
+ /* finalize: shift out the extra bytes we read, and align
+ left. since pslldq can only shift by an immediate, we use
+ vpshufb and an array of shuffle masks */
+ movq %r12, %r11
+ salq $4, %r11
+ movdqu aad_shift_arr(%r11), \T1
+ vpshufb \T1, reg_i, reg_i
+_get_AAD_rest_final\@:
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i
+ vpxor reg_j, reg_i, reg_i
+ GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
+
+_get_AAD_done\@:
+ # initialize the data pointer offset as zero
+ xor %r11d, %r11d
+
+ # start AES for num_initial_blocks blocks
+ mov arg5, %rax # rax = *Y0
+ vmovdqu (%rax), \CTR # CTR = Y0
+ vpshufb SHUF_MASK(%rip), \CTR, \CTR
+
+
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, reg_i
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
+ i = (i+1)
+ setreg
+.endr
+
+ vmovdqa (arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vpxor \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ j = 1
+ setreg
+.rep 9
+ vmovdqa 16*j(arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vaesenc \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ j = (j+1)
+ setreg
+.endr
+
+
+ vmovdqa 16*10(arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vaesenclast \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vmovdqu (arg3, %r11), \T1
+ vpxor \T1, reg_i, reg_i
+ vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks
+ add $16, %r11
+.if \ENC_DEC == DEC
+ vmovdqa \T1, reg_i
+.endif
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
+ i = (i+1)
+ setreg
+.endr
+
+
+ i = (8-\num_initial_blocks)
+ j = (9-\num_initial_blocks)
+ setreg
+
+.rep \num_initial_blocks
+ vpxor reg_i, reg_j, reg_j
+ GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
+ i = (i+1)
+ j = (j+1)
+ setreg
+.endr
+ # XMM8 has the combined result here
+
+ vmovdqa \XMM8, TMP1(%rsp)
+ vmovdqa \XMM8, \T3
+
+ cmp $128, %r13
+ jl _initial_blocks_done\@ # no need for precomputed constants
+
+###############################################################################
+# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM1
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM2
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM3
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM4
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM5
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM6
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM7
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM8
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+
+ vmovdqa (arg1), \T_key
+ vpxor \T_key, \XMM1, \XMM1
+ vpxor \T_key, \XMM2, \XMM2
+ vpxor \T_key, \XMM3, \XMM3
+ vpxor \T_key, \XMM4, \XMM4
+ vpxor \T_key, \XMM5, \XMM5
+ vpxor \T_key, \XMM6, \XMM6
+ vpxor \T_key, \XMM7, \XMM7
+ vpxor \T_key, \XMM8, \XMM8
+
+ i = 1
+ setreg
+.rep 9 # do 9 rounds
+ vmovdqa 16*i(arg1), \T_key
+ vaesenc \T_key, \XMM1, \XMM1
+ vaesenc \T_key, \XMM2, \XMM2
+ vaesenc \T_key, \XMM3, \XMM3
+ vaesenc \T_key, \XMM4, \XMM4
+ vaesenc \T_key, \XMM5, \XMM5
+ vaesenc \T_key, \XMM6, \XMM6
+ vaesenc \T_key, \XMM7, \XMM7
+ vaesenc \T_key, \XMM8, \XMM8
+ i = (i+1)
+ setreg
+.endr
+
+
+ vmovdqa 16*i(arg1), \T_key
+ vaesenclast \T_key, \XMM1, \XMM1
+ vaesenclast \T_key, \XMM2, \XMM2
+ vaesenclast \T_key, \XMM3, \XMM3
+ vaesenclast \T_key, \XMM4, \XMM4
+ vaesenclast \T_key, \XMM5, \XMM5
+ vaesenclast \T_key, \XMM6, \XMM6
+ vaesenclast \T_key, \XMM7, \XMM7
+ vaesenclast \T_key, \XMM8, \XMM8
+
+ vmovdqu (arg3, %r11), \T1
+ vpxor \T1, \XMM1, \XMM1
+ vmovdqu \XMM1, (arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM1
+ .endif
+
+ vmovdqu 16*1(arg3, %r11), \T1
+ vpxor \T1, \XMM2, \XMM2
+ vmovdqu \XMM2, 16*1(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM2
+ .endif
+
+ vmovdqu 16*2(arg3, %r11), \T1
+ vpxor \T1, \XMM3, \XMM3
+ vmovdqu \XMM3, 16*2(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM3
+ .endif
+
+ vmovdqu 16*3(arg3, %r11), \T1
+ vpxor \T1, \XMM4, \XMM4
+ vmovdqu \XMM4, 16*3(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM4
+ .endif
+
+ vmovdqu 16*4(arg3, %r11), \T1
+ vpxor \T1, \XMM5, \XMM5
+ vmovdqu \XMM5, 16*4(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM5
+ .endif
+
+ vmovdqu 16*5(arg3, %r11), \T1
+ vpxor \T1, \XMM6, \XMM6
+ vmovdqu \XMM6, 16*5(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM6
+ .endif
+
+ vmovdqu 16*6(arg3, %r11), \T1
+ vpxor \T1, \XMM7, \XMM7
+ vmovdqu \XMM7, 16*6(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM7
+ .endif
+
+ vmovdqu 16*7(arg3, %r11), \T1
+ vpxor \T1, \XMM8, \XMM8
+ vmovdqu \XMM8, 16*7(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM8
+ .endif
+
+ add $128, %r11
+
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+ vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+
+###############################################################################
+
+_initial_blocks_done\@:
+
+.endm
+
+# encrypt 8 blocks at a time
+# ghash the 8 previously encrypted ciphertext blocks
+# arg1, arg2, arg3 are used as pointers only, not modified
+# r11 is the data offset value
+.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
+
+ vmovdqa \XMM1, \T2
+ vmovdqa \XMM2, TMP2(%rsp)
+ vmovdqa \XMM3, TMP3(%rsp)
+ vmovdqa \XMM4, TMP4(%rsp)
+ vmovdqa \XMM5, TMP5(%rsp)
+ vmovdqa \XMM6, TMP6(%rsp)
+ vmovdqa \XMM7, TMP7(%rsp)
+ vmovdqa \XMM8, TMP8(%rsp)
+
+.if \loop_idx == in_order
+ vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
+ vpaddd ONE(%rip), \XMM1, \XMM2
+ vpaddd ONE(%rip), \XMM2, \XMM3
+ vpaddd ONE(%rip), \XMM3, \XMM4
+ vpaddd ONE(%rip), \XMM4, \XMM5
+ vpaddd ONE(%rip), \XMM5, \XMM6
+ vpaddd ONE(%rip), \XMM6, \XMM7
+ vpaddd ONE(%rip), \XMM7, \XMM8
+ vmovdqa \XMM8, \CTR
+
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+.else
+ vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
+ vpaddd ONEf(%rip), \XMM1, \XMM2
+ vpaddd ONEf(%rip), \XMM2, \XMM3
+ vpaddd ONEf(%rip), \XMM3, \XMM4
+ vpaddd ONEf(%rip), \XMM4, \XMM5
+ vpaddd ONEf(%rip), \XMM5, \XMM6
+ vpaddd ONEf(%rip), \XMM6, \XMM7
+ vpaddd ONEf(%rip), \XMM7, \XMM8
+ vmovdqa \XMM8, \CTR
+.endif
+
+
+ #######################################################################
+
+ vmovdqu (arg1), \T1
+ vpxor \T1, \XMM1, \XMM1
+ vpxor \T1, \XMM2, \XMM2
+ vpxor \T1, \XMM3, \XMM3
+ vpxor \T1, \XMM4, \XMM4
+ vpxor \T1, \XMM5, \XMM5
+ vpxor \T1, \XMM6, \XMM6
+ vpxor \T1, \XMM7, \XMM7
+ vpxor \T1, \XMM8, \XMM8
+
+ #######################################################################
+
+
+
+
+
+ vmovdqu 16*1(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqu 16*2(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+
+ #######################################################################
+
+ vmovdqa HashKey_8(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
+ vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
+
+ vpshufd $0b01001110, \T2, \T6
+ vpxor \T2, \T6, \T6
+
+ vmovdqa HashKey_8_k(arg1), \T5
+ vpclmulqdq $0x00, \T5, \T6, \T6
+
+ vmovdqu 16*3(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP2(%rsp), \T1
+ vmovdqa HashKey_7(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_7_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*4(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ #######################################################################
+
+ vmovdqa TMP3(%rsp), \T1
+ vmovdqa HashKey_6(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_6_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*5(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP4(%rsp), \T1
+ vmovdqa HashKey_5(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_5_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*6(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+
+ vmovdqa TMP5(%rsp), \T1
+ vmovdqa HashKey_4(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_4_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*7(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP6(%rsp), \T1
+ vmovdqa HashKey_3(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_3_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+
+ vmovdqu 16*8(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP7(%rsp), \T1
+ vmovdqa HashKey_2(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_2_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ #######################################################################
+
+ vmovdqu 16*9(arg1), \T5
+ vaesenc \T5, \XMM1, \XMM1
+ vaesenc \T5, \XMM2, \XMM2
+ vaesenc \T5, \XMM3, \XMM3
+ vaesenc \T5, \XMM4, \XMM4
+ vaesenc \T5, \XMM5, \XMM5
+ vaesenc \T5, \XMM6, \XMM6
+ vaesenc \T5, \XMM7, \XMM7
+ vaesenc \T5, \XMM8, \XMM8
+
+ vmovdqa TMP8(%rsp), \T1
+ vmovdqa HashKey(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpshufd $0b01001110, \T1, \T3
+ vpxor \T1, \T3, \T3
+ vmovdqa HashKey_k(arg1), \T5
+ vpclmulqdq $0x10, \T5, \T3, \T3
+ vpxor \T3, \T6, \T6
+
+ vpxor \T4, \T6, \T6
+ vpxor \T7, \T6, \T6
+
+ vmovdqu 16*10(arg1), \T5
+
+ i = 0
+ j = 1
+ setreg
+.rep 8
+ vpxor 16*i(arg3, %r11), \T5, \T2
+ .if \ENC_DEC == ENC
+ vaesenclast \T2, reg_j, reg_j
+ .else
+ vaesenclast \T2, reg_j, \T3
+ vmovdqu 16*i(arg3, %r11), reg_j
+ vmovdqu \T3, 16*i(arg2, %r11)
+ .endif
+ i = (i+1)
+ j = (j+1)
+ setreg
+.endr
+ #######################################################################
+
+
+ vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
+ vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs
+ vpxor \T3, \T7, \T7
+ vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
+
+
+
+ #######################################################################
+ #first phase of the reduction
+ #######################################################################
+ vpslld $31, \T7, \T2 # packed right shifting << 31
+ vpslld $30, \T7, \T3 # packed right shifting shift << 30
+ vpslld $25, \T7, \T4 # packed right shifting shift << 25
+
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
+
+ vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
+ vpxor \T2, \T7, \T7 # first phase of the reduction complete
+ #######################################################################
+ .if \ENC_DEC == ENC
+ vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
+ .endif
+
+ #######################################################################
+ #second phase of the reduction
+ vpsrld $1, \T7, \T2 # packed left shifting >> 1
+ vpsrld $2, \T7, \T3 # packed left shifting >> 2
+ vpsrld $7, \T7, \T4 # packed left shifting >> 7
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpxor \T1, \T2, \T2
+ vpxor \T2, \T7, \T7
+ vpxor \T7, \T6, \T6 # the result is in T6
+ #######################################################################
+
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+
+
+ vpxor \T6, \XMM1, \XMM1
+
+
+
+.endm
+
+
+# GHASH the last 4 ciphertext blocks.
+.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
+
+ ## Karatsuba Method
+
+
+ vpshufd $0b01001110, \XMM1, \T2
+ vpxor \XMM1, \T2, \T2
+ vmovdqa HashKey_8(arg1), \T5
+ vpclmulqdq $0x11, \T5, \XMM1, \T6
+ vpclmulqdq $0x00, \T5, \XMM1, \T7
+
+ vmovdqa HashKey_8_k(arg1), \T3
+ vpclmulqdq $0x00, \T3, \T2, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM2, \T2
+ vpxor \XMM2, \T2, \T2
+ vmovdqa HashKey_7(arg1), \T5
+ vpclmulqdq $0x11, \T5, \XMM2, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM2, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqa HashKey_7_k(arg1), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM3, \T2
+ vpxor \XMM3, \T2, \T2
+ vmovdqa HashKey_6(arg1), \T5
+ vpclmulqdq $0x11, \T5, \XMM3, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM3, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqa HashKey_6_k(arg1), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM4, \T2
+ vpxor \XMM4, \T2, \T2
+ vmovdqa HashKey_5(arg1), \T5
+ vpclmulqdq $0x11, \T5, \XMM4, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM4, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqa HashKey_5_k(arg1), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM5, \T2
+ vpxor \XMM5, \T2, \T2
+ vmovdqa HashKey_4(arg1), \T5
+ vpclmulqdq $0x11, \T5, \XMM5, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM5, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqa HashKey_4_k(arg1), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM6, \T2
+ vpxor \XMM6, \T2, \T2
+ vmovdqa HashKey_3(arg1), \T5
+ vpclmulqdq $0x11, \T5, \XMM6, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM6, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqa HashKey_3_k(arg1), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM7, \T2
+ vpxor \XMM7, \T2, \T2
+ vmovdqa HashKey_2(arg1), \T5
+ vpclmulqdq $0x11, \T5, \XMM7, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM7, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqa HashKey_2_k(arg1), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vpshufd $0b01001110, \XMM8, \T2
+ vpxor \XMM8, \T2, \T2
+ vmovdqa HashKey(arg1), \T5
+ vpclmulqdq $0x11, \T5, \XMM8, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM8, \T4
+ vpxor \T4, \T7, \T7
+
+ vmovdqa HashKey_k(arg1), \T3
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+ vpxor \T6, \XMM1, \XMM1
+ vpxor \T7, \XMM1, \T2
+
+
+
+
+ vpslldq $8, \T2, \T4
+ vpsrldq $8, \T2, \T2
+
+ vpxor \T4, \T7, \T7
+ vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
+ # the accumulated carry-less multiplications
+
+ #######################################################################
+ #first phase of the reduction
+ vpslld $31, \T7, \T2 # packed right shifting << 31
+ vpslld $30, \T7, \T3 # packed right shifting shift << 30
+ vpslld $25, \T7, \T4 # packed right shifting shift << 25
+
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
+
+ vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
+ vpxor \T2, \T7, \T7 # first phase of the reduction complete
+ #######################################################################
+
+
+ #second phase of the reduction
+ vpsrld $1, \T7, \T2 # packed left shifting >> 1
+ vpsrld $2, \T7, \T3 # packed left shifting >> 2
+ vpsrld $7, \T7, \T4 # packed left shifting >> 7
+ vpxor \T3, \T2, \T2 # xor the shifted versions
+ vpxor \T4, \T2, \T2
+
+ vpxor \T1, \T2, \T2
+ vpxor \T2, \T7, \T7
+ vpxor \T7, \T6, \T6 # the result is in T6
+
+.endm
+
+
+# combined for GCM encrypt and decrypt functions
+# clobbering all xmm registers
+# clobbering r10, r11, r12, r13, r14, r15
+.macro GCM_ENC_DEC_AVX ENC_DEC
+
+ #the number of pushes must equal STACK_OFFSET
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov %rsp, %r14
+
+
+
+
+ sub $VARIABLE_OFFSET, %rsp
+ and $~63, %rsp # align rsp to 64 bytes
+
+
+ vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
+
+ mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
+ and $-16, %r13 # r13 = r13 - (r13 mod 16)
+
+ mov %r13, %r12
+ shr $4, %r12
+ and $7, %r12
+ jz _initial_num_blocks_is_0\@
+
+ cmp $7, %r12
+ je _initial_num_blocks_is_7\@
+ cmp $6, %r12
+ je _initial_num_blocks_is_6\@
+ cmp $5, %r12
+ je _initial_num_blocks_is_5\@
+ cmp $4, %r12
+ je _initial_num_blocks_is_4\@
+ cmp $3, %r12
+ je _initial_num_blocks_is_3\@
+ cmp $2, %r12
+ je _initial_num_blocks_is_2\@
+
+ jmp _initial_num_blocks_is_1\@
+
+_initial_num_blocks_is_7\@:
+ INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*7, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_6\@:
+ INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*6, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_5\@:
+ INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*5, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_4\@:
+ INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*4, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_3\@:
+ INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*3, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_2\@:
+ INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*2, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_1\@:
+ INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*1, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_0\@:
+ INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+
+
+_initial_blocks_encrypted\@:
+ cmp $0, %r13
+ je _zero_cipher_left\@
+
+ sub $128, %r13
+ je _eight_cipher_left\@
+
+
+
+
+ vmovd %xmm9, %r15d
+ and $255, %r15d
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+
+_encrypt_by_8_new\@:
+ cmp $(255-8), %r15d
+ jg _encrypt_by_8\@
+
+
+
+ add $8, %r15b
+ GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
+ add $128, %r11
+ sub $128, %r13
+ jne _encrypt_by_8_new\@
+
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ jmp _eight_cipher_left\@
+
+_encrypt_by_8\@:
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ add $8, %r15b
+ GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ add $128, %r11
+ sub $128, %r13
+ jne _encrypt_by_8_new\@
+
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+
+
+
+_eight_cipher_left\@:
+ GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
+
+
+_zero_cipher_left\@:
+ cmp $16, arg4
+ jl _only_less_than_16\@
+
+ mov arg4, %r13
+ and $15, %r13 # r13 = (arg4 mod 16)
+
+ je _multiple_of_16_bytes\@
+
+ # handle the last <16 Byte block seperately
+
+
+ vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
+
+ sub $16, %r11
+ add %r13, %r11
+ vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
+
+ lea SHIFT_MASK+16(%rip), %r12
+ sub %r13, %r12 # adjust the shuffle mask pointer to be
+ # able to shift 16-r13 bytes (r13 is the
+ # number of bytes in plaintext mod 16)
+ vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
+ vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
+ jmp _final_ghash_mul\@
+
+_only_less_than_16\@:
+ # check for 0 length
+ mov arg4, %r13
+ and $15, %r13 # r13 = (arg4 mod 16)
+
+ je _multiple_of_16_bytes\@
+
+ # handle the last <16 Byte block seperately
+
+
+ vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
+
+
+ lea SHIFT_MASK+16(%rip), %r12
+ sub %r13, %r12 # adjust the shuffle mask pointer to be
+ # able to shift 16-r13 bytes (r13 is the
+ # number of bytes in plaintext mod 16)
+
+_get_last_16_byte_loop\@:
+ movb (arg3, %r11), %al
+ movb %al, TMP1 (%rsp , %r11)
+ add $1, %r11
+ cmp %r13, %r11
+ jne _get_last_16_byte_loop\@
+
+ vmovdqu TMP1(%rsp), %xmm1
+
+ sub $16, %r11
+
+_final_ghash_mul\@:
+ .if \ENC_DEC == DEC
+ vmovdqa %xmm1, %xmm2
+ vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
+ # mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm2, %xmm2
+ vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
+ vpxor %xmm2, %xmm14, %xmm14
+ #GHASH computation for the last <16 Byte block
+ GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+ sub %r13, %r11
+ add $16, %r11
+ .else
+ vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
+ # mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ vpxor %xmm9, %xmm14, %xmm14
+ #GHASH computation for the last <16 Byte block
+ GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+ sub %r13, %r11
+ add $16, %r11
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
+ .endif
+
+
+ #############################
+ # output r13 Bytes
+ vmovq %xmm9, %rax
+ cmp $8, %r13
+ jle _less_than_8_bytes_left\@
+
+ mov %rax, (arg2 , %r11)
+ add $8, %r11
+ vpsrldq $8, %xmm9, %xmm9
+ vmovq %xmm9, %rax
+ sub $8, %r13
+
+_less_than_8_bytes_left\@:
+ movb %al, (arg2 , %r11)
+ add $1, %r11
+ shr $8, %rax
+ sub $1, %r13
+ jne _less_than_8_bytes_left\@
+ #############################
+
+_multiple_of_16_bytes\@:
+ mov arg7, %r12 # r12 = aadLen (number of bytes)
+ shl $3, %r12 # convert into number of bits
+ vmovd %r12d, %xmm15 # len(A) in xmm15
+
+ shl $3, arg4 # len(C) in bits (*128)
+ vmovq arg4, %xmm1
+ vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
+ vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
+
+ vpxor %xmm15, %xmm14, %xmm14
+ GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
+ vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
+
+ mov arg5, %rax # rax = *Y0
+ vmovdqu (%rax), %xmm9 # xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
+
+ vpxor %xmm14, %xmm9, %xmm9
+
+
+
+_return_T\@:
+ mov arg8, %r10 # r10 = authTag
+ mov arg9, %r11 # r11 = auth_tag_len
+
+ cmp $16, %r11
+ je _T_16\@
+
+ cmp $8, %r11
+ jl _T_4\@
+
+_T_8\@:
+ vmovq %xmm9, %rax
+ mov %rax, (%r10)
+ add $8, %r10
+ sub $8, %r11
+ vpsrldq $8, %xmm9, %xmm9
+ cmp $0, %r11
+ je _return_T_done\@
+_T_4\@:
+ vmovd %xmm9, %eax
+ mov %eax, (%r10)
+ add $4, %r10
+ sub $4, %r11
+ vpsrldq $4, %xmm9, %xmm9
+ cmp $0, %r11
+ je _return_T_done\@
+_T_123\@:
+ vmovd %xmm9, %eax
+ cmp $2, %r11
+ jl _T_1\@
+ mov %ax, (%r10)
+ cmp $2, %r11
+ je _return_T_done\@
+ add $2, %r10
+ sar $16, %eax
+_T_1\@:
+ mov %al, (%r10)
+ jmp _return_T_done\@
+
+_T_16\@:
+ vmovdqu %xmm9, (%r10)
+
+_return_T_done\@:
+ mov %r14, %rsp
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+.endm
+
+
+#############################################################
+#void aesni_gcm_precomp_avx_gen2
+# (gcm_data *my_ctx_data,
+# u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
+#############################################################
+ENTRY(aesni_gcm_precomp_avx_gen2)
+ #the number of pushes must equal STACK_OFFSET
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov %rsp, %r14
+
+
+
+ sub $VARIABLE_OFFSET, %rsp
+ and $~63, %rsp # align rsp to 64 bytes
+
+ vmovdqu (arg2), %xmm6 # xmm6 = HashKey
+
+ vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
+ ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
+ vmovdqa %xmm6, %xmm2
+ vpsllq $1, %xmm6, %xmm6
+ vpsrlq $63, %xmm2, %xmm2
+ vmovdqa %xmm2, %xmm1
+ vpslldq $8, %xmm2, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpor %xmm2, %xmm6, %xmm6
+ #reduction
+ vpshufd $0b00100100, %xmm1, %xmm2
+ vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
+ vpand POLY(%rip), %xmm2, %xmm2
+ vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
+ #######################################################################
+ vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
+
+
+ PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
+
+ mov %r14, %rsp
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ ret
+ENDPROC(aesni_gcm_precomp_avx_gen2)
+
+###############################################################################
+#void aesni_gcm_enc_avx_gen2(
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+# const u8 *in, /* Plaintext input */
+# u64 plaintext_len, /* Length of data in Bytes for encryption. */
+# u8 *iv, /* Pre-counter block j0: 4 byte salt
+# (from Security Association) concatenated with 8 byte
+# Initialisation Vector (from IPSec ESP Payload)
+# concatenated with 0x00000001. 16-byte aligned pointer. */
+# const u8 *aad, /* Additional Authentication Data (AAD)*/
+# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+# u8 *auth_tag, /* Authenticated Tag output. */
+# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
+# Valid values are 16 (most likely), 12 or 8. */
+###############################################################################
+ENTRY(aesni_gcm_enc_avx_gen2)
+ GCM_ENC_DEC_AVX ENC
+ ret
+ENDPROC(aesni_gcm_enc_avx_gen2)
+
+###############################################################################
+#void aesni_gcm_dec_avx_gen2(
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+# const u8 *in, /* Ciphertext input */
+# u64 plaintext_len, /* Length of data in Bytes for encryption. */
+# u8 *iv, /* Pre-counter block j0: 4 byte salt
+# (from Security Association) concatenated with 8 byte
+# Initialisation Vector (from IPSec ESP Payload)
+# concatenated with 0x00000001. 16-byte aligned pointer. */
+# const u8 *aad, /* Additional Authentication Data (AAD)*/
+# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+# u8 *auth_tag, /* Authenticated Tag output. */
+# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
+# Valid values are 16 (most likely), 12 or 8. */
+###############################################################################
+ENTRY(aesni_gcm_dec_avx_gen2)
+ GCM_ENC_DEC_AVX DEC
+ ret
+ENDPROC(aesni_gcm_dec_avx_gen2)
+#endif /* CONFIG_AS_AVX */
+
+#ifdef CONFIG_AS_AVX2
+###############################################################################
+# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+# Input: A and B (128-bits each, bit-reflected)
+# Output: C = A*B*x mod poly, (i.e. >>1 )
+# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+###############################################################################
+.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
+
+ vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
+ vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
+ vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
+ vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
+ vpxor \T3, \GH, \GH
+
+
+ vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
+ vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
+
+ vpxor \T3, \T1, \T1
+ vpxor \T2, \GH, \GH
+
+ #######################################################################
+ #first phase of the reduction
+ vmovdqa POLY2(%rip), \T3
+
+ vpclmulqdq $0x01, \GH, \T3, \T2
+ vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
+
+ vpxor \T2, \GH, \GH # first phase of the reduction complete
+ #######################################################################
+ #second phase of the reduction
+ vpclmulqdq $0x00, \GH, \T3, \T2
+ vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq $0x10, \GH, \T3, \GH
+ vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor \T2, \GH, \GH # second phase of the reduction complete
+ #######################################################################
+ vpxor \T1, \GH, \GH # the result is in GH
+
+
+.endm
+
+.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
+
+ # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+ vmovdqa \HK, \T5
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
+ vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
+
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
+ vmovdqa \T5, HashKey_3(arg1)
+
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
+ vmovdqa \T5, HashKey_4(arg1)
+
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
+ vmovdqa \T5, HashKey_5(arg1)
+
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
+ vmovdqa \T5, HashKey_6(arg1)
+
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
+ vmovdqa \T5, HashKey_7(arg1)
+
+ GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
+ vmovdqa \T5, HashKey_8(arg1)
+
+.endm
+
+
+## if a = number of total plaintext bytes
+## b = floor(a/16)
+## num_initial_blocks = b mod 4#
+## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
+## r10, r11, r12, rax are clobbered
+## arg1, arg2, arg3, r14 are used as a pointer only, not modified
+
+.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
+ i = (8-\num_initial_blocks)
+ j = 0
+ setreg
+
+ mov arg6, %r10 # r10 = AAD
+ mov arg7, %r12 # r12 = aadLen
+
+
+ mov %r12, %r11
+
+ vpxor reg_j, reg_j, reg_j
+ vpxor reg_i, reg_i, reg_i
+
+ cmp $16, %r11
+ jl _get_AAD_rest8\@
+_get_AAD_blocks\@:
+ vmovdqu (%r10), reg_i
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i
+ vpxor reg_i, reg_j, reg_j
+ GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6
+ add $16, %r10
+ sub $16, %r12
+ sub $16, %r11
+ cmp $16, %r11
+ jge _get_AAD_blocks\@
+ vmovdqu reg_j, reg_i
+ cmp $0, %r11
+ je _get_AAD_done\@
+
+ vpxor reg_i, reg_i, reg_i
+
+ /* read the last <16B of AAD. since we have at least 4B of
+ data right after the AAD (the ICV, and maybe some CT), we can
+ read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\@:
+ cmp $4, %r11
+ jle _get_AAD_rest4\@
+ movq (%r10), \T1
+ add $8, %r10
+ sub $8, %r11
+ vpslldq $8, \T1, \T1
+ vpsrldq $8, reg_i, reg_i
+ vpxor \T1, reg_i, reg_i
+ jmp _get_AAD_rest8\@
+_get_AAD_rest4\@:
+ cmp $0, %r11
+ jle _get_AAD_rest0\@
+ mov (%r10), %eax
+ movq %rax, \T1
+ add $4, %r10
+ sub $4, %r11
+ vpslldq $12, \T1, \T1
+ vpsrldq $4, reg_i, reg_i
+ vpxor \T1, reg_i, reg_i
+_get_AAD_rest0\@:
+ /* finalize: shift out the extra bytes we read, and align
+ left. since pslldq can only shift by an immediate, we use
+ vpshufb and an array of shuffle masks */
+ movq %r12, %r11
+ salq $4, %r11
+ movdqu aad_shift_arr(%r11), \T1
+ vpshufb \T1, reg_i, reg_i
+_get_AAD_rest_final\@:
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i
+ vpxor reg_j, reg_i, reg_i
+ GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
+
+_get_AAD_done\@:
+ # initialize the data pointer offset as zero
+ xor %r11d, %r11d
+
+ # start AES for num_initial_blocks blocks
+ mov arg5, %rax # rax = *Y0
+ vmovdqu (%rax), \CTR # CTR = Y0
+ vpshufb SHUF_MASK(%rip), \CTR, \CTR
+
+
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, reg_i
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
+ i = (i+1)
+ setreg
+.endr
+
+ vmovdqa (arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vpxor \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ j = 1
+ setreg
+.rep 9
+ vmovdqa 16*j(arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vaesenc \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ j = (j+1)
+ setreg
+.endr
+
+
+ vmovdqa 16*10(arg1), \T_key
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vaesenclast \T_key, reg_i, reg_i
+ i = (i+1)
+ setreg
+.endr
+
+ i = (9-\num_initial_blocks)
+ setreg
+.rep \num_initial_blocks
+ vmovdqu (arg3, %r11), \T1
+ vpxor \T1, reg_i, reg_i
+ vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for
+ # num_initial_blocks blocks
+ add $16, %r11
+.if \ENC_DEC == DEC
+ vmovdqa \T1, reg_i
+.endif
+ vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
+ i = (i+1)
+ setreg
+.endr
+
+
+ i = (8-\num_initial_blocks)
+ j = (9-\num_initial_blocks)
+ setreg
+
+.rep \num_initial_blocks
+ vpxor reg_i, reg_j, reg_j
+ GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
+ i = (i+1)
+ j = (j+1)
+ setreg
+.endr
+ # XMM8 has the combined result here
+
+ vmovdqa \XMM8, TMP1(%rsp)
+ vmovdqa \XMM8, \T3
+
+ cmp $128, %r13
+ jl _initial_blocks_done\@ # no need for precomputed constants
+
+###############################################################################
+# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM1
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM2
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM3
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM4
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM5
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM6
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM7
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+
+ vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
+ vmovdqa \CTR, \XMM8
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+
+ vmovdqa (arg1), \T_key
+ vpxor \T_key, \XMM1, \XMM1
+ vpxor \T_key, \XMM2, \XMM2
+ vpxor \T_key, \XMM3, \XMM3
+ vpxor \T_key, \XMM4, \XMM4
+ vpxor \T_key, \XMM5, \XMM5
+ vpxor \T_key, \XMM6, \XMM6
+ vpxor \T_key, \XMM7, \XMM7
+ vpxor \T_key, \XMM8, \XMM8
+
+ i = 1
+ setreg
+.rep 9 # do 9 rounds
+ vmovdqa 16*i(arg1), \T_key
+ vaesenc \T_key, \XMM1, \XMM1
+ vaesenc \T_key, \XMM2, \XMM2
+ vaesenc \T_key, \XMM3, \XMM3
+ vaesenc \T_key, \XMM4, \XMM4
+ vaesenc \T_key, \XMM5, \XMM5
+ vaesenc \T_key, \XMM6, \XMM6
+ vaesenc \T_key, \XMM7, \XMM7
+ vaesenc \T_key, \XMM8, \XMM8
+ i = (i+1)
+ setreg
+.endr
+
+
+ vmovdqa 16*i(arg1), \T_key
+ vaesenclast \T_key, \XMM1, \XMM1
+ vaesenclast \T_key, \XMM2, \XMM2
+ vaesenclast \T_key, \XMM3, \XMM3
+ vaesenclast \T_key, \XMM4, \XMM4
+ vaesenclast \T_key, \XMM5, \XMM5
+ vaesenclast \T_key, \XMM6, \XMM6
+ vaesenclast \T_key, \XMM7, \XMM7
+ vaesenclast \T_key, \XMM8, \XMM8
+
+ vmovdqu (arg3, %r11), \T1
+ vpxor \T1, \XMM1, \XMM1
+ vmovdqu \XMM1, (arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM1
+ .endif
+
+ vmovdqu 16*1(arg3, %r11), \T1
+ vpxor \T1, \XMM2, \XMM2
+ vmovdqu \XMM2, 16*1(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM2
+ .endif
+
+ vmovdqu 16*2(arg3, %r11), \T1
+ vpxor \T1, \XMM3, \XMM3
+ vmovdqu \XMM3, 16*2(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM3
+ .endif
+
+ vmovdqu 16*3(arg3, %r11), \T1
+ vpxor \T1, \XMM4, \XMM4
+ vmovdqu \XMM4, 16*3(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM4
+ .endif
+
+ vmovdqu 16*4(arg3, %r11), \T1
+ vpxor \T1, \XMM5, \XMM5
+ vmovdqu \XMM5, 16*4(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM5
+ .endif
+
+ vmovdqu 16*5(arg3, %r11), \T1
+ vpxor \T1, \XMM6, \XMM6
+ vmovdqu \XMM6, 16*5(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM6
+ .endif
+
+ vmovdqu 16*6(arg3, %r11), \T1
+ vpxor \T1, \XMM7, \XMM7
+ vmovdqu \XMM7, 16*6(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM7
+ .endif
+
+ vmovdqu 16*7(arg3, %r11), \T1
+ vpxor \T1, \XMM8, \XMM8
+ vmovdqu \XMM8, 16*7(arg2 , %r11)
+ .if \ENC_DEC == DEC
+ vmovdqa \T1, \XMM8
+ .endif
+
+ add $128, %r11
+
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+ vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
+ # the corresponding ciphertext
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+
+###############################################################################
+
+_initial_blocks_done\@:
+
+
+.endm
+
+
+
+# encrypt 8 blocks at a time
+# ghash the 8 previously encrypted ciphertext blocks
+# arg1, arg2, arg3 are used as pointers only, not modified
+# r11 is the data offset value
+.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
+
+ vmovdqa \XMM1, \T2
+ vmovdqa \XMM2, TMP2(%rsp)
+ vmovdqa \XMM3, TMP3(%rsp)
+ vmovdqa \XMM4, TMP4(%rsp)
+ vmovdqa \XMM5, TMP5(%rsp)
+ vmovdqa \XMM6, TMP6(%rsp)
+ vmovdqa \XMM7, TMP7(%rsp)
+ vmovdqa \XMM8, TMP8(%rsp)
+
+.if \loop_idx == in_order
+ vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
+ vpaddd ONE(%rip), \XMM1, \XMM2
+ vpaddd ONE(%rip), \XMM2, \XMM3
+ vpaddd ONE(%rip), \XMM3, \XMM4
+ vpaddd ONE(%rip), \XMM4, \XMM5
+ vpaddd ONE(%rip), \XMM5, \XMM6
+ vpaddd ONE(%rip), \XMM6, \XMM7
+ vpaddd ONE(%rip), \XMM7, \XMM8
+ vmovdqa \XMM8, \CTR
+
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+.else
+ vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
+ vpaddd ONEf(%rip), \XMM1, \XMM2
+ vpaddd ONEf(%rip), \XMM2, \XMM3
+ vpaddd ONEf(%rip), \XMM3, \XMM4
+ vpaddd ONEf(%rip), \XMM4, \XMM5
+ vpaddd ONEf(%rip), \XMM5, \XMM6
+ vpaddd ONEf(%rip), \XMM6, \XMM7
+ vpaddd ONEf(%rip), \XMM7, \XMM8
+ vmovdqa \XMM8, \CTR
+.endif
+
+
+ #######################################################################
+
+ vmovdqu (arg1), \T1
+ vpxor \T1, \XMM1, \XMM1
+ vpxor \T1, \XMM2, \XMM2
+ vpxor \T1, \XMM3, \XMM3
+ vpxor \T1, \XMM4, \XMM4
+ vpxor \T1, \XMM5, \XMM5
+ vpxor \T1, \XMM6, \XMM6
+ vpxor \T1, \XMM7, \XMM7
+ vpxor \T1, \XMM8, \XMM8
+
+ #######################################################################
+
+
+
+
+
+ vmovdqu 16*1(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqu 16*2(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+
+ #######################################################################
+
+ vmovdqa HashKey_8(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
+ vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
+ vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
+ vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
+ vpxor \T5, \T6, \T6
+
+ vmovdqu 16*3(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP2(%rsp), \T1
+ vmovdqa HashKey_7(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*4(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ #######################################################################
+
+ vmovdqa TMP3(%rsp), \T1
+ vmovdqa HashKey_6(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*5(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP4(%rsp), \T1
+ vmovdqa HashKey_5(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*6(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+
+ vmovdqa TMP5(%rsp), \T1
+ vmovdqa HashKey_4(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*7(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP6(%rsp), \T1
+ vmovdqa HashKey_3(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vmovdqu 16*8(arg1), \T1
+ vaesenc \T1, \XMM1, \XMM1
+ vaesenc \T1, \XMM2, \XMM2
+ vaesenc \T1, \XMM3, \XMM3
+ vaesenc \T1, \XMM4, \XMM4
+ vaesenc \T1, \XMM5, \XMM5
+ vaesenc \T1, \XMM6, \XMM6
+ vaesenc \T1, \XMM7, \XMM7
+ vaesenc \T1, \XMM8, \XMM8
+
+ vmovdqa TMP7(%rsp), \T1
+ vmovdqa HashKey_2(arg1), \T5
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T4
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+
+ #######################################################################
+
+ vmovdqu 16*9(arg1), \T5
+ vaesenc \T5, \XMM1, \XMM1
+ vaesenc \T5, \XMM2, \XMM2
+ vaesenc \T5, \XMM3, \XMM3
+ vaesenc \T5, \XMM4, \XMM4
+ vaesenc \T5, \XMM5, \XMM5
+ vaesenc \T5, \XMM6, \XMM6
+ vaesenc \T5, \XMM7, \XMM7
+ vaesenc \T5, \XMM8, \XMM8
+
+ vmovdqa TMP8(%rsp), \T1
+ vmovdqa HashKey(arg1), \T5
+
+ vpclmulqdq $0x00, \T5, \T1, \T3
+ vpxor \T3, \T7, \T7
+
+ vpclmulqdq $0x01, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x10, \T5, \T1, \T3
+ vpxor \T3, \T6, \T6
+
+ vpclmulqdq $0x11, \T5, \T1, \T3
+ vpxor \T3, \T4, \T1
+
+
+ vmovdqu 16*10(arg1), \T5
+
+ i = 0
+ j = 1
+ setreg
+.rep 8
+ vpxor 16*i(arg3, %r11), \T5, \T2
+ .if \ENC_DEC == ENC
+ vaesenclast \T2, reg_j, reg_j
+ .else
+ vaesenclast \T2, reg_j, \T3
+ vmovdqu 16*i(arg3, %r11), reg_j
+ vmovdqu \T3, 16*i(arg2, %r11)
+ .endif
+ i = (i+1)
+ j = (j+1)
+ setreg
+.endr
+ #######################################################################
+
+
+ vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
+ vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs
+ vpxor \T3, \T7, \T7
+ vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
+
+
+
+ #######################################################################
+ #first phase of the reduction
+ vmovdqa POLY2(%rip), \T3
+
+ vpclmulqdq $0x01, \T7, \T3, \T2
+ vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
+
+ vpxor \T2, \T7, \T7 # first phase of the reduction complete
+ #######################################################################
+ .if \ENC_DEC == ENC
+ vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
+ .endif
+
+ #######################################################################
+ #second phase of the reduction
+ vpclmulqdq $0x00, \T7, \T3, \T2
+ vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq $0x10, \T7, \T3, \T4
+ vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor \T2, \T4, \T4 # second phase of the reduction complete
+ #######################################################################
+ vpxor \T4, \T1, \T1 # the result is in T1
+
+ vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
+ vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
+
+
+ vpxor \T1, \XMM1, \XMM1
+
+
+
+.endm
+
+
+# GHASH the last 4 ciphertext blocks.
+.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
+
+ ## Karatsuba Method
+
+ vmovdqa HashKey_8(arg1), \T5
+
+ vpshufd $0b01001110, \XMM1, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM1, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM1, \T6
+ vpclmulqdq $0x00, \T5, \XMM1, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \XMM1
+
+ ######################
+
+ vmovdqa HashKey_7(arg1), \T5
+ vpshufd $0b01001110, \XMM2, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM2, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM2, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM2, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vmovdqa HashKey_6(arg1), \T5
+ vpshufd $0b01001110, \XMM3, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM3, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM3, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM3, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vmovdqa HashKey_5(arg1), \T5
+ vpshufd $0b01001110, \XMM4, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM4, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM4, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM4, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vmovdqa HashKey_4(arg1), \T5
+ vpshufd $0b01001110, \XMM5, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM5, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM5, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM5, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vmovdqa HashKey_3(arg1), \T5
+ vpshufd $0b01001110, \XMM6, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM6, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM6, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM6, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vmovdqa HashKey_2(arg1), \T5
+ vpshufd $0b01001110, \XMM7, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM7, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM7, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM7, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+
+ ######################
+
+ vmovdqa HashKey(arg1), \T5
+ vpshufd $0b01001110, \XMM8, \T2
+ vpshufd $0b01001110, \T5, \T3
+ vpxor \XMM8, \T2, \T2
+ vpxor \T5, \T3, \T3
+
+ vpclmulqdq $0x11, \T5, \XMM8, \T4
+ vpxor \T4, \T6, \T6
+
+ vpclmulqdq $0x00, \T5, \XMM8, \T4
+ vpxor \T4, \T7, \T7
+
+ vpclmulqdq $0x00, \T3, \T2, \T2
+
+ vpxor \T2, \XMM1, \XMM1
+ vpxor \T6, \XMM1, \XMM1
+ vpxor \T7, \XMM1, \T2
+
+
+
+
+ vpslldq $8, \T2, \T4
+ vpsrldq $8, \T2, \T2
+
+ vpxor \T4, \T7, \T7
+ vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
+ # accumulated carry-less multiplications
+
+ #######################################################################
+ #first phase of the reduction
+ vmovdqa POLY2(%rip), \T3
+
+ vpclmulqdq $0x01, \T7, \T3, \T2
+ vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
+
+ vpxor \T2, \T7, \T7 # first phase of the reduction complete
+ #######################################################################
+
+
+ #second phase of the reduction
+ vpclmulqdq $0x00, \T7, \T3, \T2
+ vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq $0x10, \T7, \T3, \T4
+ vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor \T2, \T4, \T4 # second phase of the reduction complete
+ #######################################################################
+ vpxor \T4, \T6, \T6 # the result is in T6
+.endm
+
+
+
+# combined for GCM encrypt and decrypt functions
+# clobbering all xmm registers
+# clobbering r10, r11, r12, r13, r14, r15
+.macro GCM_ENC_DEC_AVX2 ENC_DEC
+
+ #the number of pushes must equal STACK_OFFSET
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov %rsp, %r14
+
+
+
+
+ sub $VARIABLE_OFFSET, %rsp
+ and $~63, %rsp # align rsp to 64 bytes
+
+
+ vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
+
+ mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
+ and $-16, %r13 # r13 = r13 - (r13 mod 16)
+
+ mov %r13, %r12
+ shr $4, %r12
+ and $7, %r12
+ jz _initial_num_blocks_is_0\@
+
+ cmp $7, %r12
+ je _initial_num_blocks_is_7\@
+ cmp $6, %r12
+ je _initial_num_blocks_is_6\@
+ cmp $5, %r12
+ je _initial_num_blocks_is_5\@
+ cmp $4, %r12
+ je _initial_num_blocks_is_4\@
+ cmp $3, %r12
+ je _initial_num_blocks_is_3\@
+ cmp $2, %r12
+ je _initial_num_blocks_is_2\@
+
+ jmp _initial_num_blocks_is_1\@
+
+_initial_num_blocks_is_7\@:
+ INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*7, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_6\@:
+ INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*6, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_5\@:
+ INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*5, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_4\@:
+ INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*4, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_3\@:
+ INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*3, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_2\@:
+ INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*2, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_1\@:
+ INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*1, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_0\@:
+ INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+
+
+_initial_blocks_encrypted\@:
+ cmp $0, %r13
+ je _zero_cipher_left\@
+
+ sub $128, %r13
+ je _eight_cipher_left\@
+
+
+
+
+ vmovd %xmm9, %r15d
+ and $255, %r15d
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+
+_encrypt_by_8_new\@:
+ cmp $(255-8), %r15d
+ jg _encrypt_by_8\@
+
+
+
+ add $8, %r15b
+ GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
+ add $128, %r11
+ sub $128, %r13
+ jne _encrypt_by_8_new\@
+
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ jmp _eight_cipher_left\@
+
+_encrypt_by_8\@:
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ add $8, %r15b
+ GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ add $128, %r11
+ sub $128, %r13
+ jne _encrypt_by_8_new\@
+
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+
+
+
+_eight_cipher_left\@:
+ GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
+
+
+_zero_cipher_left\@:
+ cmp $16, arg4
+ jl _only_less_than_16\@
+
+ mov arg4, %r13
+ and $15, %r13 # r13 = (arg4 mod 16)
+
+ je _multiple_of_16_bytes\@
+
+ # handle the last <16 Byte block seperately
+
+
+ vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
+
+ sub $16, %r11
+ add %r13, %r11
+ vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
+
+ lea SHIFT_MASK+16(%rip), %r12
+ sub %r13, %r12 # adjust the shuffle mask pointer
+ # to be able to shift 16-r13 bytes
+ # (r13 is the number of bytes in plaintext mod 16)
+ vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
+ vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
+ jmp _final_ghash_mul\@
+
+_only_less_than_16\@:
+ # check for 0 length
+ mov arg4, %r13
+ and $15, %r13 # r13 = (arg4 mod 16)
+
+ je _multiple_of_16_bytes\@
+
+ # handle the last <16 Byte block seperately
+
+
+ vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
+
+
+ lea SHIFT_MASK+16(%rip), %r12
+ sub %r13, %r12 # adjust the shuffle mask pointer to be
+ # able to shift 16-r13 bytes (r13 is the
+ # number of bytes in plaintext mod 16)
+
+_get_last_16_byte_loop\@:
+ movb (arg3, %r11), %al
+ movb %al, TMP1 (%rsp , %r11)
+ add $1, %r11
+ cmp %r13, %r11
+ jne _get_last_16_byte_loop\@
+
+ vmovdqu TMP1(%rsp), %xmm1
+
+ sub $16, %r11
+
+_final_ghash_mul\@:
+ .if \ENC_DEC == DEC
+ vmovdqa %xmm1, %xmm2
+ vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm2, %xmm2
+ vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
+ vpxor %xmm2, %xmm14, %xmm14
+ #GHASH computation for the last <16 Byte block
+ GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+ sub %r13, %r11
+ add $16, %r11
+ .else
+ vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ vpxor %xmm9, %xmm14, %xmm14
+ #GHASH computation for the last <16 Byte block
+ GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+ sub %r13, %r11
+ add $16, %r11
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
+ .endif
+
+
+ #############################
+ # output r13 Bytes
+ vmovq %xmm9, %rax
+ cmp $8, %r13
+ jle _less_than_8_bytes_left\@
+
+ mov %rax, (arg2 , %r11)
+ add $8, %r11
+ vpsrldq $8, %xmm9, %xmm9
+ vmovq %xmm9, %rax
+ sub $8, %r13
+
+_less_than_8_bytes_left\@:
+ movb %al, (arg2 , %r11)
+ add $1, %r11
+ shr $8, %rax
+ sub $1, %r13
+ jne _less_than_8_bytes_left\@
+ #############################
+
+_multiple_of_16_bytes\@:
+ mov arg7, %r12 # r12 = aadLen (number of bytes)
+ shl $3, %r12 # convert into number of bits
+ vmovd %r12d, %xmm15 # len(A) in xmm15
+
+ shl $3, arg4 # len(C) in bits (*128)
+ vmovq arg4, %xmm1
+ vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
+ vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
+
+ vpxor %xmm15, %xmm14, %xmm14
+ GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
+ vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
+
+ mov arg5, %rax # rax = *Y0
+ vmovdqu (%rax), %xmm9 # xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
+
+ vpxor %xmm14, %xmm9, %xmm9
+
+
+
+_return_T\@:
+ mov arg8, %r10 # r10 = authTag
+ mov arg9, %r11 # r11 = auth_tag_len
+
+ cmp $16, %r11
+ je _T_16\@
+
+ cmp $8, %r11
+ jl _T_4\@
+
+_T_8\@:
+ vmovq %xmm9, %rax
+ mov %rax, (%r10)
+ add $8, %r10
+ sub $8, %r11
+ vpsrldq $8, %xmm9, %xmm9
+ cmp $0, %r11
+ je _return_T_done\@
+_T_4\@:
+ vmovd %xmm9, %eax
+ mov %eax, (%r10)
+ add $4, %r10
+ sub $4, %r11
+ vpsrldq $4, %xmm9, %xmm9
+ cmp $0, %r11
+ je _return_T_done\@
+_T_123\@:
+ vmovd %xmm9, %eax
+ cmp $2, %r11
+ jl _T_1\@
+ mov %ax, (%r10)
+ cmp $2, %r11
+ je _return_T_done\@
+ add $2, %r10
+ sar $16, %eax
+_T_1\@:
+ mov %al, (%r10)
+ jmp _return_T_done\@
+
+_T_16\@:
+ vmovdqu %xmm9, (%r10)
+
+_return_T_done\@:
+ mov %r14, %rsp
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+.endm
+
+
+#############################################################
+#void aesni_gcm_precomp_avx_gen4
+# (gcm_data *my_ctx_data,
+# u8 *hash_subkey)# /* H, the Hash sub key input.
+# Data starts on a 16-byte boundary. */
+#############################################################
+ENTRY(aesni_gcm_precomp_avx_gen4)
+ #the number of pushes must equal STACK_OFFSET
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov %rsp, %r14
+
+
+
+ sub $VARIABLE_OFFSET, %rsp
+ and $~63, %rsp # align rsp to 64 bytes
+
+ vmovdqu (arg2), %xmm6 # xmm6 = HashKey
+
+ vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
+ ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
+ vmovdqa %xmm6, %xmm2
+ vpsllq $1, %xmm6, %xmm6
+ vpsrlq $63, %xmm2, %xmm2
+ vmovdqa %xmm2, %xmm1
+ vpslldq $8, %xmm2, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpor %xmm2, %xmm6, %xmm6
+ #reduction
+ vpshufd $0b00100100, %xmm1, %xmm2
+ vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
+ vpand POLY(%rip), %xmm2, %xmm2
+ vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
+ #######################################################################
+ vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
+
+
+ PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
+
+ mov %r14, %rsp
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ ret
+ENDPROC(aesni_gcm_precomp_avx_gen4)
+
+
+###############################################################################
+#void aesni_gcm_enc_avx_gen4(
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+# const u8 *in, /* Plaintext input */
+# u64 plaintext_len, /* Length of data in Bytes for encryption. */
+# u8 *iv, /* Pre-counter block j0: 4 byte salt
+# (from Security Association) concatenated with 8 byte
+# Initialisation Vector (from IPSec ESP Payload)
+# concatenated with 0x00000001. 16-byte aligned pointer. */
+# const u8 *aad, /* Additional Authentication Data (AAD)*/
+# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+# u8 *auth_tag, /* Authenticated Tag output. */
+# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
+# Valid values are 16 (most likely), 12 or 8. */
+###############################################################################
+ENTRY(aesni_gcm_enc_avx_gen4)
+ GCM_ENC_DEC_AVX2 ENC
+ ret
+ENDPROC(aesni_gcm_enc_avx_gen4)
+
+###############################################################################
+#void aesni_gcm_dec_avx_gen4(
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+# const u8 *in, /* Ciphertext input */
+# u64 plaintext_len, /* Length of data in Bytes for encryption. */
+# u8 *iv, /* Pre-counter block j0: 4 byte salt
+# (from Security Association) concatenated with 8 byte
+# Initialisation Vector (from IPSec ESP Payload)
+# concatenated with 0x00000001. 16-byte aligned pointer. */
+# const u8 *aad, /* Additional Authentication Data (AAD)*/
+# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+# u8 *auth_tag, /* Authenticated Tag output. */
+# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
+# Valid values are 16 (most likely), 12 or 8. */
+###############################################################################
+ENTRY(aesni_gcm_dec_avx_gen4)
+ GCM_ENC_DEC_AVX2 DEC
+ ret
+ENDPROC(aesni_gcm_dec_avx_gen4)
+
+#endif /* CONFIG_AS_AVX2 */
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
new file mode 100644
index 000000000..917f25e4d
--- /dev/null
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -0,0 +1,1548 @@
+/*
+ * Support for Intel AES-NI instructions. This file contains glue
+ * code, the real AES implementation is in intel-aes_asm.S.
+ *
+ * Copyright (C) 2008, Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ * Authors: Adrian Hoban <adrian.hoban@intel.com>
+ * Gabriele Paoloni <gabriele.paoloni@intel.com>
+ * Tadeusz Struk (tadeusz.struk@intel.com)
+ * Aidan O'Mahony (aidan.o.mahony@intel.com)
+ * Copyright (c) 2010, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/aes.h>
+#include <crypto/cryptd.h>
+#include <crypto/ctr.h>
+#include <crypto/b128ops.h>
+#include <crypto/gcm.h>
+#include <crypto/xts.h>
+#include <asm/cpu_device_id.h>
+#include <asm/fpu/api.h>
+#include <asm/crypto/aes.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#ifdef CONFIG_X86_64
+#include <asm/crypto/glue_helper.h>
+#endif
+
+
+#define AESNI_ALIGN 16
+#define AESNI_ALIGN_ATTR __attribute__ ((__aligned__(AESNI_ALIGN)))
+#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE - 1))
+#define RFC4106_HASH_SUBKEY_SIZE 16
+#define AESNI_ALIGN_EXTRA ((AESNI_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
+#define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA)
+#define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA)
+
+/* This data is stored at the end of the crypto_tfm struct.
+ * It's a type of per "session" data storage location.
+ * This needs to be 16 byte aligned.
+ */
+struct aesni_rfc4106_gcm_ctx {
+ u8 hash_subkey[16] AESNI_ALIGN_ATTR;
+ struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR;
+ u8 nonce[4];
+};
+
+struct generic_gcmaes_ctx {
+ u8 hash_subkey[16] AESNI_ALIGN_ATTR;
+ struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR;
+};
+
+struct aesni_xts_ctx {
+ u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR;
+ u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR;
+};
+
+#define GCM_BLOCK_LEN 16
+
+struct gcm_context_data {
+ /* init, update and finalize context data */
+ u8 aad_hash[GCM_BLOCK_LEN];
+ u64 aad_length;
+ u64 in_length;
+ u8 partial_block_enc_key[GCM_BLOCK_LEN];
+ u8 orig_IV[GCM_BLOCK_LEN];
+ u8 current_counter[GCM_BLOCK_LEN];
+ u64 partial_block_len;
+ u64 unused;
+ u8 hash_keys[GCM_BLOCK_LEN * 8];
+};
+
+asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+ unsigned int key_len);
+asmlinkage void aesni_enc(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in);
+asmlinkage void aesni_dec(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in);
+asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len);
+asmlinkage void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len);
+asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+
+int crypto_fpu_init(void);
+void crypto_fpu_exit(void);
+
+#define AVX_GEN2_OPTSIZE 640
+#define AVX_GEN4_OPTSIZE 4096
+
+#ifdef CONFIG_X86_64
+
+static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+
+asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, bool enc, u8 *iv);
+
+/* asmlinkage void aesni_gcm_enc()
+ * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
+ * struct gcm_context_data. May be uninitialized.
+ * u8 *out, Ciphertext output. Encrypt in-place is allowed.
+ * const u8 *in, Plaintext input
+ * unsigned long plaintext_len, Length of data in bytes for encryption.
+ * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001.
+ * 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes.
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len), Authenticated Tag Length in bytes.
+ * Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_enc(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long plaintext_len, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+/* asmlinkage void aesni_gcm_dec()
+ * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
+ * struct gcm_context_data. May be uninitialized.
+ * u8 *out, Plaintext output. Decrypt in-place is allowed.
+ * const u8 *in, Ciphertext input
+ * unsigned long ciphertext_len, Length of data in bytes for decryption.
+ * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001.
+ * 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going
+ * to be 8 or 12 bytes
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len) Authenticated Tag Length in bytes.
+ * Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_dec(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long ciphertext_len, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+/* Scatter / Gather routines, with args similar to above */
+asmlinkage void aesni_gcm_init(void *ctx,
+ struct gcm_context_data *gdata,
+ u8 *iv,
+ u8 *hash_subkey, const u8 *aad,
+ unsigned long aad_len);
+asmlinkage void aesni_gcm_enc_update(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long plaintext_len);
+asmlinkage void aesni_gcm_dec_update(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in,
+ unsigned long ciphertext_len);
+asmlinkage void aesni_gcm_finalize(void *ctx,
+ struct gcm_context_data *gdata,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+#ifdef CONFIG_AS_AVX
+asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
+ void *keys, u8 *out, unsigned int num_bytes);
+asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
+ void *keys, u8 *out, unsigned int num_bytes);
+asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
+ void *keys, u8 *out, unsigned int num_bytes);
+/*
+ * asmlinkage void aesni_gcm_precomp_avx_gen2()
+ * gcm_data *my_ctx_data, context data
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ */
+asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey);
+
+asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out,
+ const u8 *in, unsigned long plaintext_len, u8 *iv,
+ const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out,
+ const u8 *in, unsigned long ciphertext_len, u8 *iv,
+ const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+static void aesni_gcm_enc_avx(void *ctx,
+ struct gcm_context_data *data, u8 *out,
+ const u8 *in, unsigned long plaintext_len, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len)
+{
+ struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
+ if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)){
+ aesni_gcm_enc(ctx, data, out, in,
+ plaintext_len, iv, hash_subkey, aad,
+ aad_len, auth_tag, auth_tag_len);
+ } else {
+ aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
+ aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
+ aad_len, auth_tag, auth_tag_len);
+ }
+}
+
+static void aesni_gcm_dec_avx(void *ctx,
+ struct gcm_context_data *data, u8 *out,
+ const u8 *in, unsigned long ciphertext_len, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len)
+{
+ struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
+ if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
+ aesni_gcm_dec(ctx, data, out, in,
+ ciphertext_len, iv, hash_subkey, aad,
+ aad_len, auth_tag, auth_tag_len);
+ } else {
+ aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
+ aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
+ aad_len, auth_tag, auth_tag_len);
+ }
+}
+#endif
+
+#ifdef CONFIG_AS_AVX2
+/*
+ * asmlinkage void aesni_gcm_precomp_avx_gen4()
+ * gcm_data *my_ctx_data, context data
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ */
+asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey);
+
+asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out,
+ const u8 *in, unsigned long plaintext_len, u8 *iv,
+ const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out,
+ const u8 *in, unsigned long ciphertext_len, u8 *iv,
+ const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+static void aesni_gcm_enc_avx2(void *ctx,
+ struct gcm_context_data *data, u8 *out,
+ const u8 *in, unsigned long plaintext_len, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len)
+{
+ struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
+ if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
+ aesni_gcm_enc(ctx, data, out, in,
+ plaintext_len, iv, hash_subkey, aad,
+ aad_len, auth_tag, auth_tag_len);
+ } else if (plaintext_len < AVX_GEN4_OPTSIZE) {
+ aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
+ aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
+ aad_len, auth_tag, auth_tag_len);
+ } else {
+ aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
+ aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad,
+ aad_len, auth_tag, auth_tag_len);
+ }
+}
+
+static void aesni_gcm_dec_avx2(void *ctx,
+ struct gcm_context_data *data, u8 *out,
+ const u8 *in, unsigned long ciphertext_len, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len)
+{
+ struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
+ if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
+ aesni_gcm_dec(ctx, data, out, in,
+ ciphertext_len, iv, hash_subkey,
+ aad, aad_len, auth_tag, auth_tag_len);
+ } else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
+ aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
+ aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
+ aad_len, auth_tag, auth_tag_len);
+ } else {
+ aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
+ aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad,
+ aad_len, auth_tag, auth_tag_len);
+ }
+}
+#endif
+
+static void (*aesni_gcm_enc_tfm)(void *ctx,
+ struct gcm_context_data *data, u8 *out,
+ const u8 *in, unsigned long plaintext_len,
+ u8 *iv, u8 *hash_subkey, const u8 *aad,
+ unsigned long aad_len, u8 *auth_tag,
+ unsigned long auth_tag_len);
+
+static void (*aesni_gcm_dec_tfm)(void *ctx,
+ struct gcm_context_data *data, u8 *out,
+ const u8 *in, unsigned long ciphertext_len,
+ u8 *iv, u8 *hash_subkey, const u8 *aad,
+ unsigned long aad_len, u8 *auth_tag,
+ unsigned long auth_tag_len);
+
+static inline struct
+aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
+{
+ unsigned long align = AESNI_ALIGN;
+
+ if (align <= crypto_tfm_ctx_alignment())
+ align = 1;
+ return PTR_ALIGN(crypto_aead_ctx(tfm), align);
+}
+
+static inline struct
+generic_gcmaes_ctx *generic_gcmaes_ctx_get(struct crypto_aead *tfm)
+{
+ unsigned long align = AESNI_ALIGN;
+
+ if (align <= crypto_tfm_ctx_alignment())
+ align = 1;
+ return PTR_ALIGN(crypto_aead_ctx(tfm), align);
+}
+#endif
+
+static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
+{
+ unsigned long addr = (unsigned long)raw_ctx;
+ unsigned long align = AESNI_ALIGN;
+
+ if (align <= crypto_tfm_ctx_alignment())
+ align = 1;
+ return (struct crypto_aes_ctx *)ALIGN(addr, align);
+}
+
+static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx,
+ const u8 *in_key, unsigned int key_len)
+{
+ struct crypto_aes_ctx *ctx = aes_ctx(raw_ctx);
+ u32 *flags = &tfm->crt_flags;
+ int err;
+
+ if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
+ key_len != AES_KEYSIZE_256) {
+ *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+
+ if (!irq_fpu_usable())
+ err = crypto_aes_expand_key(ctx, in_key, key_len);
+ else {
+ kernel_fpu_begin();
+ err = aesni_set_key(ctx, in_key, key_len);
+ kernel_fpu_end();
+ }
+
+ return err;
+}
+
+static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ return aes_set_key_common(tfm, crypto_tfm_ctx(tfm), in_key, key_len);
+}
+
+static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
+
+ if (!irq_fpu_usable())
+ crypto_aes_encrypt_x86(ctx, dst, src);
+ else {
+ kernel_fpu_begin();
+ aesni_enc(ctx, dst, src);
+ kernel_fpu_end();
+ }
+}
+
+static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
+
+ if (!irq_fpu_usable())
+ crypto_aes_decrypt_x86(ctx, dst, src);
+ else {
+ kernel_fpu_begin();
+ aesni_dec(ctx, dst, src);
+ kernel_fpu_end();
+ }
+}
+
+static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
+
+ aesni_enc(ctx, dst, src);
+}
+
+static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
+
+ aesni_dec(ctx, dst, src);
+}
+
+static int aesni_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int len)
+{
+ return aes_set_key_common(crypto_skcipher_tfm(tfm),
+ crypto_skcipher_ctx(tfm), key, len);
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, true);
+
+ kernel_fpu_begin();
+ while ((nbytes = walk.nbytes)) {
+ aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes & AES_BLOCK_MASK);
+ nbytes &= AES_BLOCK_SIZE - 1;
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+ kernel_fpu_end();
+
+ return err;
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, true);
+
+ kernel_fpu_begin();
+ while ((nbytes = walk.nbytes)) {
+ aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes & AES_BLOCK_MASK);
+ nbytes &= AES_BLOCK_SIZE - 1;
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+ kernel_fpu_end();
+
+ return err;
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, true);
+
+ kernel_fpu_begin();
+ while ((nbytes = walk.nbytes)) {
+ aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes & AES_BLOCK_MASK, walk.iv);
+ nbytes &= AES_BLOCK_SIZE - 1;
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+ kernel_fpu_end();
+
+ return err;
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, true);
+
+ kernel_fpu_begin();
+ while ((nbytes = walk.nbytes)) {
+ aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes & AES_BLOCK_MASK, walk.iv);
+ nbytes &= AES_BLOCK_SIZE - 1;
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+ kernel_fpu_end();
+
+ return err;
+}
+
+#ifdef CONFIG_X86_64
+static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
+ struct skcipher_walk *walk)
+{
+ u8 *ctrblk = walk->iv;
+ u8 keystream[AES_BLOCK_SIZE];
+ u8 *src = walk->src.virt.addr;
+ u8 *dst = walk->dst.virt.addr;
+ unsigned int nbytes = walk->nbytes;
+
+ aesni_enc(ctx, keystream, ctrblk);
+ crypto_xor_cpy(dst, keystream, src, nbytes);
+
+ crypto_inc(ctrblk, AES_BLOCK_SIZE);
+}
+
+#ifdef CONFIG_AS_AVX
+static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv)
+{
+ /*
+ * based on key length, override with the by8 version
+ * of ctr mode encryption/decryption for improved performance
+ * aes_set_key_common() ensures that key length is one of
+ * {128,192,256}
+ */
+ if (ctx->key_length == AES_KEYSIZE_128)
+ aes_ctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len);
+ else if (ctx->key_length == AES_KEYSIZE_192)
+ aes_ctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len);
+ else
+ aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len);
+}
+#endif
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, true);
+
+ kernel_fpu_begin();
+ while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
+ aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes & AES_BLOCK_MASK, walk.iv);
+ nbytes &= AES_BLOCK_SIZE - 1;
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+ if (walk.nbytes) {
+ ctr_crypt_final(ctx, &walk);
+ err = skcipher_walk_done(&walk, 0);
+ }
+ kernel_fpu_end();
+
+ return err;
+}
+
+static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int err;
+
+ err = xts_verify_key(tfm, key, keylen);
+ if (err)
+ return err;
+
+ keylen /= 2;
+
+ /* first half of xts-key is for crypt */
+ err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx,
+ key, keylen);
+ if (err)
+ return err;
+
+ /* second half of xts-key is for tweak */
+ return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx,
+ key + keylen, keylen);
+}
+
+
+static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in)
+{
+ aesni_enc(ctx, out, in);
+}
+
+static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc));
+}
+
+static void aesni_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_dec));
+}
+
+static void aesni_xts_enc8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, true, (u8 *)iv);
+}
+
+static void aesni_xts_dec8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, false, (u8 *)iv);
+}
+
+static const struct common_glue_ctx aesni_enc_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = 1,
+
+ .funcs = { {
+ .num_blocks = 8,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc8) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc) }
+ } }
+};
+
+static const struct common_glue_ctx aesni_dec_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = 1,
+
+ .funcs = { {
+ .num_blocks = 8,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec8) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec) }
+ } }
+};
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&aesni_enc_xts, req,
+ XTS_TWEAK_CAST(aesni_xts_tweak),
+ aes_ctx(ctx->raw_tweak_ctx),
+ aes_ctx(ctx->raw_crypt_ctx));
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&aesni_dec_xts, req,
+ XTS_TWEAK_CAST(aesni_xts_tweak),
+ aes_ctx(ctx->raw_tweak_ctx),
+ aes_ctx(ctx->raw_crypt_ctx));
+}
+
+static int rfc4106_init(struct crypto_aead *aead)
+{
+ struct cryptd_aead *cryptd_tfm;
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+ cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni",
+ CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+
+ *ctx = cryptd_tfm;
+ crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
+ return 0;
+}
+
+static void rfc4106_exit(struct crypto_aead *aead)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+ cryptd_free_aead(*ctx);
+}
+
+static int
+rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
+{
+ struct crypto_cipher *tfm;
+ int ret;
+
+ tfm = crypto_alloc_cipher("aes", 0, 0);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ ret = crypto_cipher_setkey(tfm, key, key_len);
+ if (ret)
+ goto out_free_cipher;
+
+ /* Clear the data in the hash sub key container to zero.*/
+ /* We want to cipher all zeros to create the hash sub key. */
+ memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
+
+ crypto_cipher_encrypt_one(tfm, hash_subkey, hash_subkey);
+
+out_free_cipher:
+ crypto_free_cipher(tfm);
+ return ret;
+}
+
+static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key,
+ unsigned int key_len)
+{
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead);
+
+ if (key_len < 4) {
+ crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+ /*Account for 4 byte nonce at the end.*/
+ key_len -= 4;
+
+ memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
+
+ return aes_set_key_common(crypto_aead_tfm(aead),
+ &ctx->aes_key_expanded, key, key_len) ?:
+ rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
+}
+
+static int gcmaes_wrapper_set_key(struct crypto_aead *parent, const u8 *key,
+ unsigned int key_len)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(parent);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ return crypto_aead_setkey(&cryptd_tfm->base, key, key_len);
+}
+
+static int common_rfc4106_set_authsize(struct crypto_aead *aead,
+ unsigned int authsize)
+{
+ switch (authsize) {
+ case 8:
+ case 12:
+ case 16:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* This is the Integrity Check Value (aka the authentication tag length and can
+ * be 8, 12 or 16 bytes long. */
+static int gcmaes_wrapper_set_authsize(struct crypto_aead *parent,
+ unsigned int authsize)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(parent);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ return crypto_aead_setauthsize(&cryptd_tfm->base, authsize);
+}
+
+static int generic_gcmaes_set_authsize(struct crypto_aead *tfm,
+ unsigned int authsize)
+{
+ switch (authsize) {
+ case 4:
+ case 8:
+ case 12:
+ case 13:
+ case 14:
+ case 15:
+ case 16:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
+ unsigned int assoclen, u8 *hash_subkey,
+ u8 *iv, void *aes_ctx)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+ struct gcm_context_data data AESNI_ALIGN_ATTR;
+ struct scatter_walk dst_sg_walk = {};
+ unsigned long left = req->cryptlen;
+ unsigned long len, srclen, dstlen;
+ struct scatter_walk assoc_sg_walk;
+ struct scatter_walk src_sg_walk;
+ struct scatterlist src_start[2];
+ struct scatterlist dst_start[2];
+ struct scatterlist *src_sg;
+ struct scatterlist *dst_sg;
+ u8 *src, *dst, *assoc;
+ u8 *assocmem = NULL;
+ u8 authTag[16];
+
+ if (!enc)
+ left -= auth_tag_len;
+
+ /* Linearize assoc, if not already linear */
+ if (req->src->length >= assoclen && req->src->length &&
+ (!PageHighMem(sg_page(req->src)) ||
+ req->src->offset + req->src->length <= PAGE_SIZE)) {
+ scatterwalk_start(&assoc_sg_walk, req->src);
+ assoc = scatterwalk_map(&assoc_sg_walk);
+ } else {
+ /* assoc can be any length, so must be on heap */
+ assocmem = kmalloc(assoclen, GFP_ATOMIC);
+ if (unlikely(!assocmem))
+ return -ENOMEM;
+ assoc = assocmem;
+
+ scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0);
+ }
+
+ if (left) {
+ src_sg = scatterwalk_ffwd(src_start, req->src, req->assoclen);
+ scatterwalk_start(&src_sg_walk, src_sg);
+ if (req->src != req->dst) {
+ dst_sg = scatterwalk_ffwd(dst_start, req->dst,
+ req->assoclen);
+ scatterwalk_start(&dst_sg_walk, dst_sg);
+ }
+ }
+
+ kernel_fpu_begin();
+ aesni_gcm_init(aes_ctx, &data, iv,
+ hash_subkey, assoc, assoclen);
+ if (req->src != req->dst) {
+ while (left) {
+ src = scatterwalk_map(&src_sg_walk);
+ dst = scatterwalk_map(&dst_sg_walk);
+ srclen = scatterwalk_clamp(&src_sg_walk, left);
+ dstlen = scatterwalk_clamp(&dst_sg_walk, left);
+ len = min(srclen, dstlen);
+ if (len) {
+ if (enc)
+ aesni_gcm_enc_update(aes_ctx, &data,
+ dst, src, len);
+ else
+ aesni_gcm_dec_update(aes_ctx, &data,
+ dst, src, len);
+ }
+ left -= len;
+
+ scatterwalk_unmap(src);
+ scatterwalk_unmap(dst);
+ scatterwalk_advance(&src_sg_walk, len);
+ scatterwalk_advance(&dst_sg_walk, len);
+ scatterwalk_done(&src_sg_walk, 0, left);
+ scatterwalk_done(&dst_sg_walk, 1, left);
+ }
+ } else {
+ while (left) {
+ dst = src = scatterwalk_map(&src_sg_walk);
+ len = scatterwalk_clamp(&src_sg_walk, left);
+ if (len) {
+ if (enc)
+ aesni_gcm_enc_update(aes_ctx, &data,
+ src, src, len);
+ else
+ aesni_gcm_dec_update(aes_ctx, &data,
+ src, src, len);
+ }
+ left -= len;
+ scatterwalk_unmap(src);
+ scatterwalk_advance(&src_sg_walk, len);
+ scatterwalk_done(&src_sg_walk, 1, left);
+ }
+ }
+ aesni_gcm_finalize(aes_ctx, &data, authTag, auth_tag_len);
+ kernel_fpu_end();
+
+ if (!assocmem)
+ scatterwalk_unmap(assoc);
+ else
+ kfree(assocmem);
+
+ if (!enc) {
+ u8 authTagMsg[16];
+
+ /* Copy out original authTag */
+ scatterwalk_map_and_copy(authTagMsg, req->src,
+ req->assoclen + req->cryptlen -
+ auth_tag_len,
+ auth_tag_len, 0);
+
+ /* Compare generated tag with passed in tag. */
+ return crypto_memneq(authTagMsg, authTag, auth_tag_len) ?
+ -EBADMSG : 0;
+ }
+
+ /* Copy in the authTag */
+ scatterwalk_map_and_copy(authTag, req->dst,
+ req->assoclen + req->cryptlen,
+ auth_tag_len, 1);
+
+ return 0;
+}
+
+static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen,
+ u8 *hash_subkey, u8 *iv, void *aes_ctx)
+{
+ u8 one_entry_in_sg = 0;
+ u8 *src, *dst, *assoc;
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+ struct scatter_walk src_sg_walk;
+ struct scatter_walk dst_sg_walk = {};
+ struct gcm_context_data data AESNI_ALIGN_ATTR;
+
+ if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 ||
+ aesni_gcm_enc_tfm == aesni_gcm_enc ||
+ req->cryptlen < AVX_GEN2_OPTSIZE) {
+ return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv,
+ aes_ctx);
+ }
+ if (sg_is_last(req->src) &&
+ (!PageHighMem(sg_page(req->src)) ||
+ req->src->offset + req->src->length <= PAGE_SIZE) &&
+ sg_is_last(req->dst) &&
+ (!PageHighMem(sg_page(req->dst)) ||
+ req->dst->offset + req->dst->length <= PAGE_SIZE)) {
+ one_entry_in_sg = 1;
+ scatterwalk_start(&src_sg_walk, req->src);
+ assoc = scatterwalk_map(&src_sg_walk);
+ src = assoc + req->assoclen;
+ dst = src;
+ if (unlikely(req->src != req->dst)) {
+ scatterwalk_start(&dst_sg_walk, req->dst);
+ dst = scatterwalk_map(&dst_sg_walk) + req->assoclen;
+ }
+ } else {
+ /* Allocate memory for src, dst, assoc */
+ assoc = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
+ GFP_ATOMIC);
+ if (unlikely(!assoc))
+ return -ENOMEM;
+ scatterwalk_map_and_copy(assoc, req->src, 0,
+ req->assoclen + req->cryptlen, 0);
+ src = assoc + req->assoclen;
+ dst = src;
+ }
+
+ kernel_fpu_begin();
+ aesni_gcm_enc_tfm(aes_ctx, &data, dst, src, req->cryptlen, iv,
+ hash_subkey, assoc, assoclen,
+ dst + req->cryptlen, auth_tag_len);
+ kernel_fpu_end();
+
+ /* The authTag (aka the Integrity Check Value) needs to be written
+ * back to the packet. */
+ if (one_entry_in_sg) {
+ if (unlikely(req->src != req->dst)) {
+ scatterwalk_unmap(dst - req->assoclen);
+ scatterwalk_advance(&dst_sg_walk, req->dst->length);
+ scatterwalk_done(&dst_sg_walk, 1, 0);
+ }
+ scatterwalk_unmap(assoc);
+ scatterwalk_advance(&src_sg_walk, req->src->length);
+ scatterwalk_done(&src_sg_walk, req->src == req->dst, 0);
+ } else {
+ scatterwalk_map_and_copy(dst, req->dst, req->assoclen,
+ req->cryptlen + auth_tag_len, 1);
+ kfree(assoc);
+ }
+ return 0;
+}
+
+static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen,
+ u8 *hash_subkey, u8 *iv, void *aes_ctx)
+{
+ u8 one_entry_in_sg = 0;
+ u8 *src, *dst, *assoc;
+ unsigned long tempCipherLen = 0;
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+ u8 authTag[16];
+ struct scatter_walk src_sg_walk;
+ struct scatter_walk dst_sg_walk = {};
+ struct gcm_context_data data AESNI_ALIGN_ATTR;
+ int retval = 0;
+
+ if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 ||
+ aesni_gcm_enc_tfm == aesni_gcm_enc ||
+ req->cryptlen < AVX_GEN2_OPTSIZE) {
+ return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv,
+ aes_ctx);
+ }
+ tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
+
+ if (sg_is_last(req->src) &&
+ (!PageHighMem(sg_page(req->src)) ||
+ req->src->offset + req->src->length <= PAGE_SIZE) &&
+ sg_is_last(req->dst) && req->dst->length &&
+ (!PageHighMem(sg_page(req->dst)) ||
+ req->dst->offset + req->dst->length <= PAGE_SIZE)) {
+ one_entry_in_sg = 1;
+ scatterwalk_start(&src_sg_walk, req->src);
+ assoc = scatterwalk_map(&src_sg_walk);
+ src = assoc + req->assoclen;
+ dst = src;
+ if (unlikely(req->src != req->dst)) {
+ scatterwalk_start(&dst_sg_walk, req->dst);
+ dst = scatterwalk_map(&dst_sg_walk) + req->assoclen;
+ }
+ } else {
+ /* Allocate memory for src, dst, assoc */
+ assoc = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
+ if (!assoc)
+ return -ENOMEM;
+ scatterwalk_map_and_copy(assoc, req->src, 0,
+ req->assoclen + req->cryptlen, 0);
+ src = assoc + req->assoclen;
+ dst = src;
+ }
+
+
+ kernel_fpu_begin();
+ aesni_gcm_dec_tfm(aes_ctx, &data, dst, src, tempCipherLen, iv,
+ hash_subkey, assoc, assoclen,
+ authTag, auth_tag_len);
+ kernel_fpu_end();
+
+ /* Compare generated tag with passed in tag. */
+ retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ?
+ -EBADMSG : 0;
+
+ if (one_entry_in_sg) {
+ if (unlikely(req->src != req->dst)) {
+ scatterwalk_unmap(dst - req->assoclen);
+ scatterwalk_advance(&dst_sg_walk, req->dst->length);
+ scatterwalk_done(&dst_sg_walk, 1, 0);
+ }
+ scatterwalk_unmap(assoc);
+ scatterwalk_advance(&src_sg_walk, req->src->length);
+ scatterwalk_done(&src_sg_walk, req->src == req->dst, 0);
+ } else {
+ scatterwalk_map_and_copy(dst, req->dst, req->assoclen,
+ tempCipherLen, 1);
+ kfree(assoc);
+ }
+ return retval;
+
+}
+
+static int helper_rfc4106_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ void *aes_ctx = &(ctx->aes_key_expanded);
+ u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
+ unsigned int i;
+ __be32 counter = cpu_to_be32(1);
+
+ /* Assuming we are supporting rfc4106 64-bit extended */
+ /* sequence numbers We need to have the AAD length equal */
+ /* to 16 or 20 bytes */
+ if (unlikely(req->assoclen != 16 && req->assoclen != 20))
+ return -EINVAL;
+
+ /* IV below built */
+ for (i = 0; i < 4; i++)
+ *(iv+i) = ctx->nonce[i];
+ for (i = 0; i < 8; i++)
+ *(iv+4+i) = req->iv[i];
+ *((__be32 *)(iv+12)) = counter;
+
+ return gcmaes_encrypt(req, req->assoclen - 8, ctx->hash_subkey, iv,
+ aes_ctx);
+}
+
+static int helper_rfc4106_decrypt(struct aead_request *req)
+{
+ __be32 counter = cpu_to_be32(1);
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ void *aes_ctx = &(ctx->aes_key_expanded);
+ u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
+ unsigned int i;
+
+ if (unlikely(req->assoclen != 16 && req->assoclen != 20))
+ return -EINVAL;
+
+ /* Assuming we are supporting rfc4106 64-bit extended */
+ /* sequence numbers We need to have the AAD length */
+ /* equal to 16 or 20 bytes */
+
+ /* IV below built */
+ for (i = 0; i < 4; i++)
+ *(iv+i) = ctx->nonce[i];
+ for (i = 0; i < 8; i++)
+ *(iv+4+i) = req->iv[i];
+ *((__be32 *)(iv+12)) = counter;
+
+ return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv,
+ aes_ctx);
+}
+
+static int gcmaes_wrapper_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct cryptd_aead **ctx = crypto_aead_ctx(tfm);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ tfm = &cryptd_tfm->base;
+ if (irq_fpu_usable() && (!in_atomic() ||
+ !cryptd_aead_queued(cryptd_tfm)))
+ tfm = cryptd_aead_child(cryptd_tfm);
+
+ aead_request_set_tfm(req, tfm);
+
+ return crypto_aead_encrypt(req);
+}
+
+static int gcmaes_wrapper_decrypt(struct aead_request *req)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct cryptd_aead **ctx = crypto_aead_ctx(tfm);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ tfm = &cryptd_tfm->base;
+ if (irq_fpu_usable() && (!in_atomic() ||
+ !cryptd_aead_queued(cryptd_tfm)))
+ tfm = cryptd_aead_child(cryptd_tfm);
+
+ aead_request_set_tfm(req, tfm);
+
+ return crypto_aead_decrypt(req);
+}
+#endif
+
+static struct crypto_alg aesni_algs[] = { {
+ .cra_name = "aes",
+ .cra_driver_name = "aes-aesni",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = AES_MIN_KEY_SIZE,
+ .cia_max_keysize = AES_MAX_KEY_SIZE,
+ .cia_setkey = aes_set_key,
+ .cia_encrypt = aes_encrypt,
+ .cia_decrypt = aes_decrypt
+ }
+ }
+}, {
+ .cra_name = "__aes",
+ .cra_driver_name = "__aes-aesni",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = AES_MIN_KEY_SIZE,
+ .cia_max_keysize = AES_MAX_KEY_SIZE,
+ .cia_setkey = aes_set_key,
+ .cia_encrypt = __aes_encrypt,
+ .cia_decrypt = __aes_decrypt
+ }
+ }
+} };
+
+static struct skcipher_alg aesni_skciphers[] = {
+ {
+ .base = {
+ .cra_name = "__ecb(aes)",
+ .cra_driver_name = "__ecb-aes-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .setkey = aesni_skcipher_setkey,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "__cbc(aes)",
+ .cra_driver_name = "__cbc-aes-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesni_skcipher_setkey,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+#ifdef CONFIG_X86_64
+ }, {
+ .base = {
+ .cra_name = "__ctr(aes)",
+ .cra_driver_name = "__ctr-aes-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .chunksize = AES_BLOCK_SIZE,
+ .setkey = aesni_skcipher_setkey,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }, {
+ .base = {
+ .cra_name = "__xts(aes)",
+ .cra_driver_name = "__xts-aes-aesni",
+ .cra_priority = 401,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = XTS_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = 2 * AES_MIN_KEY_SIZE,
+ .max_keysize = 2 * AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = xts_aesni_setkey,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
+#endif
+ }
+};
+
+static
+struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)];
+
+static struct {
+ const char *algname;
+ const char *drvname;
+ const char *basename;
+ struct simd_skcipher_alg *simd;
+} aesni_simd_skciphers2[] = {
+#if (defined(MODULE) && IS_ENABLED(CONFIG_CRYPTO_PCBC)) || \
+ IS_BUILTIN(CONFIG_CRYPTO_PCBC)
+ {
+ .algname = "pcbc(aes)",
+ .drvname = "pcbc-aes-aesni",
+ .basename = "fpu(pcbc(__aes-aesni))",
+ },
+#endif
+};
+
+#ifdef CONFIG_X86_64
+static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key,
+ unsigned int key_len)
+{
+ struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead);
+
+ return aes_set_key_common(crypto_aead_tfm(aead),
+ &ctx->aes_key_expanded, key, key_len) ?:
+ rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
+}
+
+static int generic_gcmaes_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm);
+ void *aes_ctx = &(ctx->aes_key_expanded);
+ u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
+ __be32 counter = cpu_to_be32(1);
+
+ memcpy(iv, req->iv, 12);
+ *((__be32 *)(iv+12)) = counter;
+
+ return gcmaes_encrypt(req, req->assoclen, ctx->hash_subkey, iv,
+ aes_ctx);
+}
+
+static int generic_gcmaes_decrypt(struct aead_request *req)
+{
+ __be32 counter = cpu_to_be32(1);
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm);
+ void *aes_ctx = &(ctx->aes_key_expanded);
+ u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
+
+ memcpy(iv, req->iv, 12);
+ *((__be32 *)(iv+12)) = counter;
+
+ return gcmaes_decrypt(req, req->assoclen, ctx->hash_subkey, iv,
+ aes_ctx);
+}
+
+static int generic_gcmaes_init(struct crypto_aead *aead)
+{
+ struct cryptd_aead *cryptd_tfm;
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+ cryptd_tfm = cryptd_alloc_aead("__driver-generic-gcm-aes-aesni",
+ CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+
+ *ctx = cryptd_tfm;
+ crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
+
+ return 0;
+}
+
+static void generic_gcmaes_exit(struct crypto_aead *aead)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+ cryptd_free_aead(*ctx);
+}
+
+static struct aead_alg aesni_aead_algs[] = { {
+ .setkey = common_rfc4106_set_key,
+ .setauthsize = common_rfc4106_set_authsize,
+ .encrypt = helper_rfc4106_encrypt,
+ .decrypt = helper_rfc4106_decrypt,
+ .ivsize = GCM_RFC4106_IV_SIZE,
+ .maxauthsize = 16,
+ .base = {
+ .cra_name = "__gcm-aes-aesni",
+ .cra_driver_name = "__driver-gcm-aes-aesni",
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx),
+ .cra_alignmask = AESNI_ALIGN - 1,
+ .cra_module = THIS_MODULE,
+ },
+}, {
+ .init = rfc4106_init,
+ .exit = rfc4106_exit,
+ .setkey = gcmaes_wrapper_set_key,
+ .setauthsize = gcmaes_wrapper_set_authsize,
+ .encrypt = gcmaes_wrapper_encrypt,
+ .decrypt = gcmaes_wrapper_decrypt,
+ .ivsize = GCM_RFC4106_IV_SIZE,
+ .maxauthsize = 16,
+ .base = {
+ .cra_name = "rfc4106(gcm(aes))",
+ .cra_driver_name = "rfc4106-gcm-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_ASYNC,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct cryptd_aead *),
+ .cra_module = THIS_MODULE,
+ },
+}, {
+ .setkey = generic_gcmaes_set_key,
+ .setauthsize = generic_gcmaes_set_authsize,
+ .encrypt = generic_gcmaes_encrypt,
+ .decrypt = generic_gcmaes_decrypt,
+ .ivsize = GCM_AES_IV_SIZE,
+ .maxauthsize = 16,
+ .base = {
+ .cra_name = "__generic-gcm-aes-aesni",
+ .cra_driver_name = "__driver-generic-gcm-aes-aesni",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct generic_gcmaes_ctx),
+ .cra_alignmask = AESNI_ALIGN - 1,
+ .cra_module = THIS_MODULE,
+ },
+}, {
+ .init = generic_gcmaes_init,
+ .exit = generic_gcmaes_exit,
+ .setkey = gcmaes_wrapper_set_key,
+ .setauthsize = gcmaes_wrapper_set_authsize,
+ .encrypt = gcmaes_wrapper_encrypt,
+ .decrypt = gcmaes_wrapper_decrypt,
+ .ivsize = GCM_AES_IV_SIZE,
+ .maxauthsize = 16,
+ .base = {
+ .cra_name = "gcm(aes)",
+ .cra_driver_name = "generic-gcm-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_ASYNC,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct cryptd_aead *),
+ .cra_module = THIS_MODULE,
+ },
+} };
+#else
+static struct aead_alg aesni_aead_algs[0];
+#endif
+
+
+static const struct x86_cpu_id aesni_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_AES),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
+
+static void aesni_free_simds(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers) &&
+ aesni_simd_skciphers[i]; i++)
+ simd_skcipher_free(aesni_simd_skciphers[i]);
+
+ for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers2); i++)
+ if (aesni_simd_skciphers2[i].simd)
+ simd_skcipher_free(aesni_simd_skciphers2[i].simd);
+}
+
+static int __init aesni_init(void)
+{
+ struct simd_skcipher_alg *simd;
+ const char *basename;
+ const char *algname;
+ const char *drvname;
+ int err;
+ int i;
+
+ if (!x86_match_cpu(aesni_cpu_id))
+ return -ENODEV;
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_AS_AVX2
+ if (boot_cpu_has(X86_FEATURE_AVX2)) {
+ pr_info("AVX2 version of gcm_enc/dec engaged.\n");
+ aesni_gcm_enc_tfm = aesni_gcm_enc_avx2;
+ aesni_gcm_dec_tfm = aesni_gcm_dec_avx2;
+ } else
+#endif
+#ifdef CONFIG_AS_AVX
+ if (boot_cpu_has(X86_FEATURE_AVX)) {
+ pr_info("AVX version of gcm_enc/dec engaged.\n");
+ aesni_gcm_enc_tfm = aesni_gcm_enc_avx;
+ aesni_gcm_dec_tfm = aesni_gcm_dec_avx;
+ } else
+#endif
+ {
+ pr_info("SSE version of gcm_enc/dec engaged.\n");
+ aesni_gcm_enc_tfm = aesni_gcm_enc;
+ aesni_gcm_dec_tfm = aesni_gcm_dec;
+ }
+ aesni_ctr_enc_tfm = aesni_ctr_enc;
+#ifdef CONFIG_AS_AVX
+ if (boot_cpu_has(X86_FEATURE_AVX)) {
+ /* optimize performance of ctr mode encryption transform */
+ aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm;
+ pr_info("AES CTR mode by8 optimization enabled\n");
+ }
+#endif
+#endif
+
+ err = crypto_fpu_init();
+ if (err)
+ return err;
+
+ err = crypto_register_algs(aesni_algs, ARRAY_SIZE(aesni_algs));
+ if (err)
+ goto fpu_exit;
+
+ err = crypto_register_skciphers(aesni_skciphers,
+ ARRAY_SIZE(aesni_skciphers));
+ if (err)
+ goto unregister_algs;
+
+ err = crypto_register_aeads(aesni_aead_algs,
+ ARRAY_SIZE(aesni_aead_algs));
+ if (err)
+ goto unregister_skciphers;
+
+ for (i = 0; i < ARRAY_SIZE(aesni_skciphers); i++) {
+ algname = aesni_skciphers[i].base.cra_name + 2;
+ drvname = aesni_skciphers[i].base.cra_driver_name + 2;
+ basename = aesni_skciphers[i].base.cra_driver_name;
+ simd = simd_skcipher_create_compat(algname, drvname, basename);
+ err = PTR_ERR(simd);
+ if (IS_ERR(simd))
+ goto unregister_simds;
+
+ aesni_simd_skciphers[i] = simd;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers2); i++) {
+ algname = aesni_simd_skciphers2[i].algname;
+ drvname = aesni_simd_skciphers2[i].drvname;
+ basename = aesni_simd_skciphers2[i].basename;
+ simd = simd_skcipher_create_compat(algname, drvname, basename);
+ err = PTR_ERR(simd);
+ if (IS_ERR(simd))
+ continue;
+
+ aesni_simd_skciphers2[i].simd = simd;
+ }
+
+ return 0;
+
+unregister_simds:
+ aesni_free_simds();
+ crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs));
+unregister_skciphers:
+ crypto_unregister_skciphers(aesni_skciphers,
+ ARRAY_SIZE(aesni_skciphers));
+unregister_algs:
+ crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs));
+fpu_exit:
+ crypto_fpu_exit();
+ return err;
+}
+
+static void __exit aesni_exit(void)
+{
+ aesni_free_simds();
+ crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs));
+ crypto_unregister_skciphers(aesni_skciphers,
+ ARRAY_SIZE(aesni_skciphers));
+ crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs));
+
+ crypto_fpu_exit();
+}
+
+late_initcall(aesni_init);
+module_exit(aesni_exit);
+
+MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, Intel AES-NI instructions optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("aes");
diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
new file mode 100644
index 000000000..8c1fcb6ba
--- /dev/null
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -0,0 +1,383 @@
+/*
+ * Blowfish Cipher Algorithm (x86_64)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/linkage.h>
+
+.file "blowfish-x86_64-asm.S"
+.text
+
+/* structure of crypto context */
+#define p 0
+#define s0 ((16 + 2) * 4)
+#define s1 ((16 + 2 + (1 * 256)) * 4)
+#define s2 ((16 + 2 + (2 * 256)) * 4)
+#define s3 ((16 + 2 + (3 * 256)) * 4)
+
+/* register macros */
+#define CTX %r12
+#define RIO %rsi
+
+#define RX0 %rax
+#define RX1 %rbx
+#define RX2 %rcx
+#define RX3 %rdx
+
+#define RX0d %eax
+#define RX1d %ebx
+#define RX2d %ecx
+#define RX3d %edx
+
+#define RX0bl %al
+#define RX1bl %bl
+#define RX2bl %cl
+#define RX3bl %dl
+
+#define RX0bh %ah
+#define RX1bh %bh
+#define RX2bh %ch
+#define RX3bh %dh
+
+#define RT0 %rdi
+#define RT1 %rsi
+#define RT2 %r8
+#define RT3 %r9
+
+#define RT0d %edi
+#define RT1d %esi
+#define RT2d %r8d
+#define RT3d %r9d
+
+#define RKEY %r10
+
+/***********************************************************************
+ * 1-way blowfish
+ ***********************************************************************/
+#define F() \
+ rorq $16, RX0; \
+ movzbl RX0bh, RT0d; \
+ movzbl RX0bl, RT1d; \
+ rolq $16, RX0; \
+ movl s0(CTX,RT0,4), RT0d; \
+ addl s1(CTX,RT1,4), RT0d; \
+ movzbl RX0bh, RT1d; \
+ movzbl RX0bl, RT2d; \
+ rolq $32, RX0; \
+ xorl s2(CTX,RT1,4), RT0d; \
+ addl s3(CTX,RT2,4), RT0d; \
+ xorq RT0, RX0;
+
+#define add_roundkey_enc(n) \
+ xorq p+4*(n)(CTX), RX0;
+
+#define round_enc(n) \
+ add_roundkey_enc(n); \
+ \
+ F(); \
+ F();
+
+#define add_roundkey_dec(n) \
+ movq p+4*(n-1)(CTX), RT0; \
+ rorq $32, RT0; \
+ xorq RT0, RX0;
+
+#define round_dec(n) \
+ add_roundkey_dec(n); \
+ \
+ F(); \
+ F(); \
+
+#define read_block() \
+ movq (RIO), RX0; \
+ rorq $32, RX0; \
+ bswapq RX0;
+
+#define write_block() \
+ bswapq RX0; \
+ movq RX0, (RIO);
+
+#define xor_block() \
+ bswapq RX0; \
+ xorq RX0, (RIO);
+
+ENTRY(__blowfish_enc_blk)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: bool, if true: xor output
+ */
+ movq %r12, %r11;
+
+ movq %rdi, CTX;
+ movq %rsi, %r10;
+ movq %rdx, RIO;
+
+ read_block();
+
+ round_enc(0);
+ round_enc(2);
+ round_enc(4);
+ round_enc(6);
+ round_enc(8);
+ round_enc(10);
+ round_enc(12);
+ round_enc(14);
+ add_roundkey_enc(16);
+
+ movq %r11, %r12;
+
+ movq %r10, RIO;
+ test %cl, %cl;
+ jnz .L__enc_xor;
+
+ write_block();
+ ret;
+.L__enc_xor:
+ xor_block();
+ ret;
+ENDPROC(__blowfish_enc_blk)
+
+ENTRY(blowfish_dec_blk)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ movq %r12, %r11;
+
+ movq %rdi, CTX;
+ movq %rsi, %r10;
+ movq %rdx, RIO;
+
+ read_block();
+
+ round_dec(17);
+ round_dec(15);
+ round_dec(13);
+ round_dec(11);
+ round_dec(9);
+ round_dec(7);
+ round_dec(5);
+ round_dec(3);
+ add_roundkey_dec(1);
+
+ movq %r10, RIO;
+ write_block();
+
+ movq %r11, %r12;
+
+ ret;
+ENDPROC(blowfish_dec_blk)
+
+/**********************************************************************
+ 4-way blowfish, four blocks parallel
+ **********************************************************************/
+
+/* F() for 4-way. Slower when used alone/1-way, but faster when used
+ * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
+ */
+#define F4(x) \
+ movzbl x ## bh, RT1d; \
+ movzbl x ## bl, RT3d; \
+ rorq $16, x; \
+ movzbl x ## bh, RT0d; \
+ movzbl x ## bl, RT2d; \
+ rorq $16, x; \
+ movl s0(CTX,RT0,4), RT0d; \
+ addl s1(CTX,RT2,4), RT0d; \
+ xorl s2(CTX,RT1,4), RT0d; \
+ addl s3(CTX,RT3,4), RT0d; \
+ xorq RT0, x;
+
+#define add_preloaded_roundkey4() \
+ xorq RKEY, RX0; \
+ xorq RKEY, RX1; \
+ xorq RKEY, RX2; \
+ xorq RKEY, RX3;
+
+#define preload_roundkey_enc(n) \
+ movq p+4*(n)(CTX), RKEY;
+
+#define add_roundkey_enc4(n) \
+ add_preloaded_roundkey4(); \
+ preload_roundkey_enc(n + 2);
+
+#define round_enc4(n) \
+ add_roundkey_enc4(n); \
+ \
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3); \
+ \
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3);
+
+#define preload_roundkey_dec(n) \
+ movq p+4*((n)-1)(CTX), RKEY; \
+ rorq $32, RKEY;
+
+#define add_roundkey_dec4(n) \
+ add_preloaded_roundkey4(); \
+ preload_roundkey_dec(n - 2);
+
+#define round_dec4(n) \
+ add_roundkey_dec4(n); \
+ \
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3); \
+ \
+ F4(RX0); \
+ F4(RX1); \
+ F4(RX2); \
+ F4(RX3);
+
+#define read_block4() \
+ movq (RIO), RX0; \
+ rorq $32, RX0; \
+ bswapq RX0; \
+ \
+ movq 8(RIO), RX1; \
+ rorq $32, RX1; \
+ bswapq RX1; \
+ \
+ movq 16(RIO), RX2; \
+ rorq $32, RX2; \
+ bswapq RX2; \
+ \
+ movq 24(RIO), RX3; \
+ rorq $32, RX3; \
+ bswapq RX3;
+
+#define write_block4() \
+ bswapq RX0; \
+ movq RX0, (RIO); \
+ \
+ bswapq RX1; \
+ movq RX1, 8(RIO); \
+ \
+ bswapq RX2; \
+ movq RX2, 16(RIO); \
+ \
+ bswapq RX3; \
+ movq RX3, 24(RIO);
+
+#define xor_block4() \
+ bswapq RX0; \
+ xorq RX0, (RIO); \
+ \
+ bswapq RX1; \
+ xorq RX1, 8(RIO); \
+ \
+ bswapq RX2; \
+ xorq RX2, 16(RIO); \
+ \
+ bswapq RX3; \
+ xorq RX3, 24(RIO);
+
+ENTRY(__blowfish_enc_blk_4way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: bool, if true: xor output
+ */
+ pushq %r12;
+ pushq %rbx;
+ pushq %rcx;
+
+ movq %rdi, CTX
+ movq %rsi, %r11;
+ movq %rdx, RIO;
+
+ preload_roundkey_enc(0);
+
+ read_block4();
+
+ round_enc4(0);
+ round_enc4(2);
+ round_enc4(4);
+ round_enc4(6);
+ round_enc4(8);
+ round_enc4(10);
+ round_enc4(12);
+ round_enc4(14);
+ add_preloaded_roundkey4();
+
+ popq %r12;
+ movq %r11, RIO;
+
+ test %r12b, %r12b;
+ jnz .L__enc_xor4;
+
+ write_block4();
+
+ popq %rbx;
+ popq %r12;
+ ret;
+
+.L__enc_xor4:
+ xor_block4();
+
+ popq %rbx;
+ popq %r12;
+ ret;
+ENDPROC(__blowfish_enc_blk_4way)
+
+ENTRY(blowfish_dec_blk_4way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ pushq %r12;
+ pushq %rbx;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11
+ movq %rdx, RIO;
+
+ preload_roundkey_dec(17);
+ read_block4();
+
+ round_dec4(17);
+ round_dec4(15);
+ round_dec4(13);
+ round_dec4(11);
+ round_dec4(9);
+ round_dec4(7);
+ round_dec4(5);
+ round_dec4(3);
+ add_preloaded_roundkey4();
+
+ movq %r11, RIO;
+ write_block4();
+
+ popq %rbx;
+ popq %r12;
+
+ ret;
+ENDPROC(blowfish_dec_blk_4way)
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
new file mode 100644
index 000000000..3e0c07cc9
--- /dev/null
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -0,0 +1,477 @@
+/*
+ * Glue Code for assembler optimized version of Blowfish
+ *
+ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/blowfish.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+/* regular block cipher functions */
+asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
+ bool xor);
+asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
+
+/* 4-way parallel cipher functions */
+asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
+ const u8 *src, bool xor);
+asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
+{
+ __blowfish_enc_blk(ctx, dst, src, false);
+}
+
+static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __blowfish_enc_blk(ctx, dst, src, true);
+}
+
+static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __blowfish_enc_blk_4way(ctx, dst, src, false);
+}
+
+static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __blowfish_enc_blk_4way(ctx, dst, src, true);
+}
+
+static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ blowfish_enc_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static void blowfish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ blowfish_dec_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static int blowfish_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return blowfish_setkey(&tfm->base, key, keylen);
+}
+
+static int ecb_crypt(struct skcipher_request *req,
+ void (*fn)(struct bf_ctx *, u8 *, const u8 *),
+ void (*fn_4way)(struct bf_ctx *, u8 *, const u8 *))
+{
+ unsigned int bsize = BF_BLOCK_SIZE;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct bf_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ u8 *wsrc = walk.src.virt.addr;
+ u8 *wdst = walk.dst.virt.addr;
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 4) {
+ do {
+ fn_4way(ctx, wdst, wsrc);
+
+ wsrc += bsize * 4;
+ wdst += bsize * 4;
+ nbytes -= bsize * 4;
+ } while (nbytes >= bsize * 4);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ fn(ctx, wdst, wsrc);
+
+ wsrc += bsize;
+ wdst += bsize;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+done:
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return ecb_crypt(req, blowfish_enc_blk, blowfish_enc_blk_4way);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return ecb_crypt(req, blowfish_dec_blk, blowfish_dec_blk_4way);
+}
+
+static unsigned int __cbc_encrypt(struct bf_ctx *ctx,
+ struct skcipher_walk *walk)
+{
+ unsigned int bsize = BF_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+ u64 *iv = (u64 *)walk->iv;
+
+ do {
+ *dst = *src ^ *iv;
+ blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
+ iv = dst;
+
+ src += 1;
+ dst += 1;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+ *(u64 *)walk->iv = *iv;
+ return nbytes;
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct bf_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ nbytes = __cbc_encrypt(ctx, &walk);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static unsigned int __cbc_decrypt(struct bf_ctx *ctx,
+ struct skcipher_walk *walk)
+{
+ unsigned int bsize = BF_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+ u64 ivs[4 - 1];
+ u64 last_iv;
+
+ /* Start of the last block. */
+ src += nbytes / bsize - 1;
+ dst += nbytes / bsize - 1;
+
+ last_iv = *src;
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 4) {
+ do {
+ nbytes -= bsize * 4 - bsize;
+ src -= 4 - 1;
+ dst -= 4 - 1;
+
+ ivs[0] = src[0];
+ ivs[1] = src[1];
+ ivs[2] = src[2];
+
+ blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src);
+
+ dst[1] ^= ivs[0];
+ dst[2] ^= ivs[1];
+ dst[3] ^= ivs[2];
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ goto done;
+
+ *dst ^= *(src - 1);
+ src -= 1;
+ dst -= 1;
+ } while (nbytes >= bsize * 4);
+ }
+
+ /* Handle leftovers */
+ for (;;) {
+ blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src);
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ break;
+
+ *dst ^= *(src - 1);
+ src -= 1;
+ dst -= 1;
+ }
+
+done:
+ *dst ^= *(u64 *)walk->iv;
+ *(u64 *)walk->iv = last_iv;
+
+ return nbytes;
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct bf_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ nbytes = __cbc_decrypt(ctx, &walk);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static void ctr_crypt_final(struct bf_ctx *ctx, struct skcipher_walk *walk)
+{
+ u8 *ctrblk = walk->iv;
+ u8 keystream[BF_BLOCK_SIZE];
+ u8 *src = walk->src.virt.addr;
+ u8 *dst = walk->dst.virt.addr;
+ unsigned int nbytes = walk->nbytes;
+
+ blowfish_enc_blk(ctx, keystream, ctrblk);
+ crypto_xor_cpy(dst, keystream, src, nbytes);
+
+ crypto_inc(ctrblk, BF_BLOCK_SIZE);
+}
+
+static unsigned int __ctr_crypt(struct bf_ctx *ctx, struct skcipher_walk *walk)
+{
+ unsigned int bsize = BF_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+ u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
+ __be64 ctrblocks[4];
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 4) {
+ do {
+ if (dst != src) {
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ dst[3] = src[3];
+ }
+
+ /* create ctrblks for parallel encrypt */
+ ctrblocks[0] = cpu_to_be64(ctrblk++);
+ ctrblocks[1] = cpu_to_be64(ctrblk++);
+ ctrblocks[2] = cpu_to_be64(ctrblk++);
+ ctrblocks[3] = cpu_to_be64(ctrblk++);
+
+ blowfish_enc_blk_xor_4way(ctx, (u8 *)dst,
+ (u8 *)ctrblocks);
+
+ src += 4;
+ dst += 4;
+ } while ((nbytes -= bsize * 4) >= bsize * 4);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ if (dst != src)
+ *dst = *src;
+
+ ctrblocks[0] = cpu_to_be64(ctrblk++);
+
+ blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)ctrblocks);
+
+ src += 1;
+ dst += 1;
+ } while ((nbytes -= bsize) >= bsize);
+
+done:
+ *(__be64 *)walk->iv = cpu_to_be64(ctrblk);
+ return nbytes;
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct bf_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) {
+ nbytes = __ctr_crypt(ctx, &walk);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ if (nbytes) {
+ ctr_crypt_final(ctx, &walk);
+ err = skcipher_walk_done(&walk, 0);
+ }
+
+ return err;
+}
+
+static struct crypto_alg bf_cipher_alg = {
+ .cra_name = "blowfish",
+ .cra_driver_name = "blowfish-asm",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = BF_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct bf_ctx),
+ .cra_alignmask = 0,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = BF_MIN_KEY_SIZE,
+ .cia_max_keysize = BF_MAX_KEY_SIZE,
+ .cia_setkey = blowfish_setkey,
+ .cia_encrypt = blowfish_encrypt,
+ .cia_decrypt = blowfish_decrypt,
+ }
+ }
+};
+
+static struct skcipher_alg bf_skcipher_algs[] = {
+ {
+ .base.cra_name = "ecb(blowfish)",
+ .base.cra_driver_name = "ecb-blowfish-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = BF_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct bf_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = BF_MIN_KEY_SIZE,
+ .max_keysize = BF_MAX_KEY_SIZE,
+ .setkey = blowfish_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "cbc(blowfish)",
+ .base.cra_driver_name = "cbc-blowfish-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = BF_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct bf_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = BF_MIN_KEY_SIZE,
+ .max_keysize = BF_MAX_KEY_SIZE,
+ .ivsize = BF_BLOCK_SIZE,
+ .setkey = blowfish_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "ctr(blowfish)",
+ .base.cra_driver_name = "ctr-blowfish-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct bf_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = BF_MIN_KEY_SIZE,
+ .max_keysize = BF_MAX_KEY_SIZE,
+ .ivsize = BF_BLOCK_SIZE,
+ .chunksize = BF_BLOCK_SIZE,
+ .setkey = blowfish_setkey_skcipher,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ },
+};
+
+static bool is_blacklisted_cpu(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ if (boot_cpu_data.x86 == 0x0f) {
+ /*
+ * On Pentium 4, blowfish-x86_64 is slower than generic C
+ * implementation because use of 64bit rotates (which are really
+ * slow on P4). Therefore blacklist P4s.
+ */
+ return true;
+ }
+
+ return false;
+}
+
+static int force;
+module_param(force, int, 0);
+MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
+
+static int __init init(void)
+{
+ int err;
+
+ if (!force && is_blacklisted_cpu()) {
+ printk(KERN_INFO
+ "blowfish-x86_64: performance on this CPU "
+ "would be suboptimal: disabling "
+ "blowfish-x86_64.\n");
+ return -ENODEV;
+ }
+
+ err = crypto_register_alg(&bf_cipher_alg);
+ if (err)
+ return err;
+
+ err = crypto_register_skciphers(bf_skcipher_algs,
+ ARRAY_SIZE(bf_skcipher_algs));
+ if (err)
+ crypto_unregister_alg(&bf_cipher_alg);
+
+ return err;
+}
+
+static void __exit fini(void)
+{
+ crypto_unregister_alg(&bf_cipher_alg);
+ crypto_unregister_skciphers(bf_skcipher_algs,
+ ARRAY_SIZE(bf_skcipher_algs));
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Blowfish Cipher Algorithm, asm optimized");
+MODULE_ALIAS_CRYPTO("blowfish");
+MODULE_ALIAS_CRYPTO("blowfish-asm");
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
new file mode 100644
index 000000000..a14af6eb0
--- /dev/null
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -0,0 +1,1289 @@
+/*
+ * x86_64/AVX/AES-NI assembler implementation of Camellia
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/*
+ * Version licensed under 2-clause BSD License is available at:
+ * http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include <asm/nospec-branch.h>
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct camellia_ctx: */
+#define key_table 0
+#define key_length CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %rdi
+
+/**********************************************************************
+ 16-way camellia
+ **********************************************************************/
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+/*
+ * IN:
+ * x0..x7: byte-sliced AB state
+ * mem_cd: register pointer storing CD state
+ * key: index for key material
+ * OUT:
+ * x0..x7: new byte-sliced CD state
+ */
+#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
+ t7, mem_cd, key) \
+ /* \
+ * S-function with AES subbytes \
+ */ \
+ vmovdqa .Linv_shift_row, t4; \
+ vbroadcastss .L0f0f0f0f, t7; \
+ vmovdqa .Lpre_tf_lo_s1, t0; \
+ vmovdqa .Lpre_tf_hi_s1, t1; \
+ \
+ /* AES inverse shift rows */ \
+ vpshufb t4, x0, x0; \
+ vpshufb t4, x7, x7; \
+ vpshufb t4, x1, x1; \
+ vpshufb t4, x4, x4; \
+ vpshufb t4, x2, x2; \
+ vpshufb t4, x5, x5; \
+ vpshufb t4, x3, x3; \
+ vpshufb t4, x6, x6; \
+ \
+ /* prefilter sboxes 1, 2 and 3 */ \
+ vmovdqa .Lpre_tf_lo_s4, t2; \
+ vmovdqa .Lpre_tf_hi_s4, t3; \
+ filter_8bit(x0, t0, t1, t7, t6); \
+ filter_8bit(x7, t0, t1, t7, t6); \
+ filter_8bit(x1, t0, t1, t7, t6); \
+ filter_8bit(x4, t0, t1, t7, t6); \
+ filter_8bit(x2, t0, t1, t7, t6); \
+ filter_8bit(x5, t0, t1, t7, t6); \
+ \
+ /* prefilter sbox 4 */ \
+ vpxor t4, t4, t4; \
+ filter_8bit(x3, t2, t3, t7, t6); \
+ filter_8bit(x6, t2, t3, t7, t6); \
+ \
+ /* AES subbytes + AES shift rows */ \
+ vmovdqa .Lpost_tf_lo_s1, t0; \
+ vmovdqa .Lpost_tf_hi_s1, t1; \
+ vaesenclast t4, x0, x0; \
+ vaesenclast t4, x7, x7; \
+ vaesenclast t4, x1, x1; \
+ vaesenclast t4, x4, x4; \
+ vaesenclast t4, x2, x2; \
+ vaesenclast t4, x5, x5; \
+ vaesenclast t4, x3, x3; \
+ vaesenclast t4, x6, x6; \
+ \
+ /* postfilter sboxes 1 and 4 */ \
+ vmovdqa .Lpost_tf_lo_s3, t2; \
+ vmovdqa .Lpost_tf_hi_s3, t3; \
+ filter_8bit(x0, t0, t1, t7, t6); \
+ filter_8bit(x7, t0, t1, t7, t6); \
+ filter_8bit(x3, t0, t1, t7, t6); \
+ filter_8bit(x6, t0, t1, t7, t6); \
+ \
+ /* postfilter sbox 3 */ \
+ vmovdqa .Lpost_tf_lo_s2, t4; \
+ vmovdqa .Lpost_tf_hi_s2, t5; \
+ filter_8bit(x2, t2, t3, t7, t6); \
+ filter_8bit(x5, t2, t3, t7, t6); \
+ \
+ vpxor t6, t6, t6; \
+ vmovq key, t0; \
+ \
+ /* postfilter sbox 2 */ \
+ filter_8bit(x1, t4, t5, t7, t2); \
+ filter_8bit(x4, t4, t5, t7, t2); \
+ \
+ vpsrldq $5, t0, t5; \
+ vpsrldq $1, t0, t1; \
+ vpsrldq $2, t0, t2; \
+ vpsrldq $3, t0, t3; \
+ vpsrldq $4, t0, t4; \
+ vpshufb t6, t0, t0; \
+ vpshufb t6, t1, t1; \
+ vpshufb t6, t2, t2; \
+ vpshufb t6, t3, t3; \
+ vpshufb t6, t4, t4; \
+ vpsrldq $2, t5, t7; \
+ vpshufb t6, t7, t7; \
+ \
+ /* \
+ * P-function \
+ */ \
+ vpxor x5, x0, x0; \
+ vpxor x6, x1, x1; \
+ vpxor x7, x2, x2; \
+ vpxor x4, x3, x3; \
+ \
+ vpxor x2, x4, x4; \
+ vpxor x3, x5, x5; \
+ vpxor x0, x6, x6; \
+ vpxor x1, x7, x7; \
+ \
+ vpxor x7, x0, x0; \
+ vpxor x4, x1, x1; \
+ vpxor x5, x2, x2; \
+ vpxor x6, x3, x3; \
+ \
+ vpxor x3, x4, x4; \
+ vpxor x0, x5, x5; \
+ vpxor x1, x6, x6; \
+ vpxor x2, x7, x7; /* note: high and low parts swapped */ \
+ \
+ /* \
+ * Add key material and result to CD (x becomes new CD) \
+ */ \
+ \
+ vpxor t3, x4, x4; \
+ vpxor 0 * 16(mem_cd), x4, x4; \
+ \
+ vpxor t2, x5, x5; \
+ vpxor 1 * 16(mem_cd), x5, x5; \
+ \
+ vpsrldq $1, t5, t3; \
+ vpshufb t6, t5, t5; \
+ vpshufb t6, t3, t6; \
+ \
+ vpxor t1, x6, x6; \
+ vpxor 2 * 16(mem_cd), x6, x6; \
+ \
+ vpxor t0, x7, x7; \
+ vpxor 3 * 16(mem_cd), x7, x7; \
+ \
+ vpxor t7, x0, x0; \
+ vpxor 4 * 16(mem_cd), x0, x0; \
+ \
+ vpxor t6, x1, x1; \
+ vpxor 5 * 16(mem_cd), x1, x1; \
+ \
+ vpxor t5, x2, x2; \
+ vpxor 6 * 16(mem_cd), x2, x2; \
+ \
+ vpxor t4, x3, x3; \
+ vpxor 7 * 16(mem_cd), x3, x3;
+
+/*
+ * Size optimization... with inlined roundsm16, binary would be over 5 times
+ * larger and would only be 0.5% faster (on sandy-bridge).
+ */
+.align 8
+roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
+ roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
+ %rcx, (%r9));
+ ret;
+ENDPROC(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
+
+.align 8
+roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
+ roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
+ %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
+ %rax, (%r9));
+ ret;
+ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
+
+/*
+ * IN/OUT:
+ * x0..x7: byte-sliced AB state preloaded
+ * mem_ab: byte-sliced AB state in memory
+ * mem_cb: byte-sliced CD state in memory
+ */
+#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
+ leaq (key_table + (i) * 8)(CTX), %r9; \
+ call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
+ \
+ vmovdqu x4, 0 * 16(mem_cd); \
+ vmovdqu x5, 1 * 16(mem_cd); \
+ vmovdqu x6, 2 * 16(mem_cd); \
+ vmovdqu x7, 3 * 16(mem_cd); \
+ vmovdqu x0, 4 * 16(mem_cd); \
+ vmovdqu x1, 5 * 16(mem_cd); \
+ vmovdqu x2, 6 * 16(mem_cd); \
+ vmovdqu x3, 7 * 16(mem_cd); \
+ \
+ leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
+ call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
+ \
+ store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
+
+#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
+
+#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
+ /* Store new AB state */ \
+ vmovdqu x0, 0 * 16(mem_ab); \
+ vmovdqu x1, 1 * 16(mem_ab); \
+ vmovdqu x2, 2 * 16(mem_ab); \
+ vmovdqu x3, 3 * 16(mem_ab); \
+ vmovdqu x4, 4 * 16(mem_ab); \
+ vmovdqu x5, 5 * 16(mem_ab); \
+ vmovdqu x6, 6 * 16(mem_ab); \
+ vmovdqu x7, 7 * 16(mem_ab);
+
+#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i) \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
+
+#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i) \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
+ two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
+
+/*
+ * IN:
+ * v0..3: byte-sliced 32-bit integers
+ * OUT:
+ * v0..3: (IN <<< 1)
+ */
+#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
+ vpcmpgtb v0, zero, t0; \
+ vpaddb v0, v0, v0; \
+ vpabsb t0, t0; \
+ \
+ vpcmpgtb v1, zero, t1; \
+ vpaddb v1, v1, v1; \
+ vpabsb t1, t1; \
+ \
+ vpcmpgtb v2, zero, t2; \
+ vpaddb v2, v2, v2; \
+ vpabsb t2, t2; \
+ \
+ vpor t0, v1, v1; \
+ \
+ vpcmpgtb v3, zero, t0; \
+ vpaddb v3, v3, v3; \
+ vpabsb t0, t0; \
+ \
+ vpor t1, v2, v2; \
+ vpor t2, v3, v3; \
+ vpor t0, v0, v0;
+
+/*
+ * IN:
+ * r: byte-sliced AB state in memory
+ * l: byte-sliced CD state in memory
+ * OUT:
+ * x0..x7: new byte-sliced CD state
+ */
+#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
+ tt1, tt2, tt3, kll, klr, krl, krr) \
+ /* \
+ * t0 = kll; \
+ * t0 &= ll; \
+ * lr ^= rol32(t0, 1); \
+ */ \
+ vpxor tt0, tt0, tt0; \
+ vmovd kll, t0; \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpand l0, t0, t0; \
+ vpand l1, t1, t1; \
+ vpand l2, t2, t2; \
+ vpand l3, t3, t3; \
+ \
+ rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+ \
+ vpxor l4, t0, l4; \
+ vmovdqu l4, 4 * 16(l); \
+ vpxor l5, t1, l5; \
+ vmovdqu l5, 5 * 16(l); \
+ vpxor l6, t2, l6; \
+ vmovdqu l6, 6 * 16(l); \
+ vpxor l7, t3, l7; \
+ vmovdqu l7, 7 * 16(l); \
+ \
+ /* \
+ * t2 = krr; \
+ * t2 |= rr; \
+ * rl ^= t2; \
+ */ \
+ \
+ vmovd krr, t0; \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpor 4 * 16(r), t0, t0; \
+ vpor 5 * 16(r), t1, t1; \
+ vpor 6 * 16(r), t2, t2; \
+ vpor 7 * 16(r), t3, t3; \
+ \
+ vpxor 0 * 16(r), t0, t0; \
+ vpxor 1 * 16(r), t1, t1; \
+ vpxor 2 * 16(r), t2, t2; \
+ vpxor 3 * 16(r), t3, t3; \
+ vmovdqu t0, 0 * 16(r); \
+ vmovdqu t1, 1 * 16(r); \
+ vmovdqu t2, 2 * 16(r); \
+ vmovdqu t3, 3 * 16(r); \
+ \
+ /* \
+ * t2 = krl; \
+ * t2 &= rl; \
+ * rr ^= rol32(t2, 1); \
+ */ \
+ vmovd krl, t0; \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpand 0 * 16(r), t0, t0; \
+ vpand 1 * 16(r), t1, t1; \
+ vpand 2 * 16(r), t2, t2; \
+ vpand 3 * 16(r), t3, t3; \
+ \
+ rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+ \
+ vpxor 4 * 16(r), t0, t0; \
+ vpxor 5 * 16(r), t1, t1; \
+ vpxor 6 * 16(r), t2, t2; \
+ vpxor 7 * 16(r), t3, t3; \
+ vmovdqu t0, 4 * 16(r); \
+ vmovdqu t1, 5 * 16(r); \
+ vmovdqu t2, 6 * 16(r); \
+ vmovdqu t3, 7 * 16(r); \
+ \
+ /* \
+ * t0 = klr; \
+ * t0 |= lr; \
+ * ll ^= t0; \
+ */ \
+ \
+ vmovd klr, t0; \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpor l4, t0, t0; \
+ vpor l5, t1, t1; \
+ vpor l6, t2, t2; \
+ vpor l7, t3, t3; \
+ \
+ vpxor l0, t0, l0; \
+ vmovdqu l0, 0 * 16(l); \
+ vpxor l1, t1, l1; \
+ vmovdqu l1, 1 * 16(l); \
+ vpxor l2, t2, l2; \
+ vmovdqu l2, 2 * 16(l); \
+ vpxor l3, t3, l3; \
+ vmovdqu l3, 3 * 16(l);
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
+ b3, c3, d3, st0, st1) \
+ vmovdqu d2, st0; \
+ vmovdqu d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu a0, st0; \
+ vmovdqu a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vmovdqu .Lshufb_16x16b, a0; \
+ vmovdqu st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu d3, st1; \
+ vmovdqu st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu d2, st0; \
+ \
+ transpose_4x4(a0, b0, c0, d0, d2, d3); \
+ transpose_4x4(a1, b1, c1, d1, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu b0, st0; \
+ vmovdqu b1, st1; \
+ transpose_4x4(a2, b2, c2, d2, b0, b1); \
+ transpose_4x4(a3, b3, c3, d3, b0, b1); \
+ vmovdqu st0, b0; \
+ vmovdqu st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, rio, key) \
+ vmovq key, x0; \
+ vpshufb .Lpack_bswap, x0, x0; \
+ \
+ vpxor 0 * 16(rio), x0, y7; \
+ vpxor 1 * 16(rio), x0, y6; \
+ vpxor 2 * 16(rio), x0, y5; \
+ vpxor 3 * 16(rio), x0, y4; \
+ vpxor 4 * 16(rio), x0, y3; \
+ vpxor 5 * 16(rio), x0, y2; \
+ vpxor 6 * 16(rio), x0, y1; \
+ vpxor 7 * 16(rio), x0, y0; \
+ vpxor 8 * 16(rio), x0, x7; \
+ vpxor 9 * 16(rio), x0, x6; \
+ vpxor 10 * 16(rio), x0, x5; \
+ vpxor 11 * 16(rio), x0, x4; \
+ vpxor 12 * 16(rio), x0, x3; \
+ vpxor 13 * 16(rio), x0, x2; \
+ vpxor 14 * 16(rio), x0, x1; \
+ vpxor 15 * 16(rio), x0, x0;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd) \
+ byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+ y5, y6, y7, (mem_ab), (mem_cd)); \
+ \
+ vmovdqu x0, 0 * 16(mem_ab); \
+ vmovdqu x1, 1 * 16(mem_ab); \
+ vmovdqu x2, 2 * 16(mem_ab); \
+ vmovdqu x3, 3 * 16(mem_ab); \
+ vmovdqu x4, 4 * 16(mem_ab); \
+ vmovdqu x5, 5 * 16(mem_ab); \
+ vmovdqu x6, 6 * 16(mem_ab); \
+ vmovdqu x7, 7 * 16(mem_ab); \
+ vmovdqu y0, 0 * 16(mem_cd); \
+ vmovdqu y1, 1 * 16(mem_cd); \
+ vmovdqu y2, 2 * 16(mem_cd); \
+ vmovdqu y3, 3 * 16(mem_cd); \
+ vmovdqu y4, 4 * 16(mem_cd); \
+ vmovdqu y5, 5 * 16(mem_cd); \
+ vmovdqu y6, 6 * 16(mem_cd); \
+ vmovdqu y7, 7 * 16(mem_cd);
+
+/* de-byteslice, apply post-whitening and store blocks */
+#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+ y5, y6, y7, key, stack_tmp0, stack_tmp1) \
+ byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
+ y7, x3, x7, stack_tmp0, stack_tmp1); \
+ \
+ vmovdqu x0, stack_tmp0; \
+ \
+ vmovq key, x0; \
+ vpshufb .Lpack_bswap, x0, x0; \
+ \
+ vpxor x0, y7, y7; \
+ vpxor x0, y6, y6; \
+ vpxor x0, y5, y5; \
+ vpxor x0, y4, y4; \
+ vpxor x0, y3, y3; \
+ vpxor x0, y2, y2; \
+ vpxor x0, y1, y1; \
+ vpxor x0, y0, y0; \
+ vpxor x0, x7, x7; \
+ vpxor x0, x6, x6; \
+ vpxor x0, x5, x5; \
+ vpxor x0, x4, x4; \
+ vpxor x0, x3, x3; \
+ vpxor x0, x2, x2; \
+ vpxor x0, x1, x1; \
+ vpxor stack_tmp0, x0, x0;
+
+#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, rio) \
+ vmovdqu x0, 0 * 16(rio); \
+ vmovdqu x1, 1 * 16(rio); \
+ vmovdqu x2, 2 * 16(rio); \
+ vmovdqu x3, 3 * 16(rio); \
+ vmovdqu x4, 4 * 16(rio); \
+ vmovdqu x5, 5 * 16(rio); \
+ vmovdqu x6, 6 * 16(rio); \
+ vmovdqu x7, 7 * 16(rio); \
+ vmovdqu y0, 8 * 16(rio); \
+ vmovdqu y1, 9 * 16(rio); \
+ vmovdqu y2, 10 * 16(rio); \
+ vmovdqu y3, 11 * 16(rio); \
+ vmovdqu y4, 12 * 16(rio); \
+ vmovdqu y5, 13 * 16(rio); \
+ vmovdqu y6, 14 * 16(rio); \
+ vmovdqu y7, 15 * 16(rio);
+
+
+/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+#define SHUFB_BYTES(idx) \
+ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+
+.Lshufb_16x16b:
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
+
+.Lpack_bswap:
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x80808080
+ .long 0x80808080
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For XTS mode IV generation */
+.Lxts_gf128mul_and_shl1_mask:
+ .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox1, sbox2, sbox3:
+ * swap_bitendianness(
+ * isom_map_camellia_to_aes(
+ * camellia_f(
+ * swap_bitendianess(in)
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s1:
+ .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
+ .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
+.Lpre_tf_hi_s1:
+ .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
+ .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox4:
+ * swap_bitendianness(
+ * isom_map_camellia_to_aes(
+ * camellia_f(
+ * swap_bitendianess(in <<< 1)
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s4:
+ .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
+ .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
+.Lpre_tf_hi_s4:
+ .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
+ .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox1, sbox4:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s1:
+ .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
+ .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
+.Lpost_tf_hi_s1:
+ .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
+ .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox2:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * ) <<< 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s2:
+ .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
+ .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
+.Lpost_tf_hi_s2:
+ .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
+ .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox3:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * ) >>> 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s3:
+ .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
+ .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
+.Lpost_tf_hi_s3:
+ .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
+ .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+/* 4-bit mask */
+.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
+.align 4
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+.text
+
+.align 8
+__camellia_enc_blk16:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rax: temporary storage, 256 bytes
+ * %xmm0..%xmm15: 16 plaintext blocks
+ * output:
+ * %xmm0..%xmm15: 16 encrypted blocks, order swapped:
+ * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ */
+ FRAME_BEGIN
+
+ leaq 8 * 16(%rax), %rcx;
+
+ inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx);
+
+ enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 0);
+
+ fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15,
+ ((key_table + (8) * 8) + 0)(CTX),
+ ((key_table + (8) * 8) + 4)(CTX),
+ ((key_table + (8) * 8) + 8)(CTX),
+ ((key_table + (8) * 8) + 12)(CTX));
+
+ enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 8);
+
+ fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15,
+ ((key_table + (16) * 8) + 0)(CTX),
+ ((key_table + (16) * 8) + 4)(CTX),
+ ((key_table + (16) * 8) + 8)(CTX),
+ ((key_table + (16) * 8) + 12)(CTX));
+
+ enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 16);
+
+ movl $24, %r8d;
+ cmpl $16, key_length(CTX);
+ jne .Lenc_max32;
+
+.Lenc_done:
+ /* load CD for output */
+ vmovdqu 0 * 16(%rcx), %xmm8;
+ vmovdqu 1 * 16(%rcx), %xmm9;
+ vmovdqu 2 * 16(%rcx), %xmm10;
+ vmovdqu 3 * 16(%rcx), %xmm11;
+ vmovdqu 4 * 16(%rcx), %xmm12;
+ vmovdqu 5 * 16(%rcx), %xmm13;
+ vmovdqu 6 * 16(%rcx), %xmm14;
+ vmovdqu 7 * 16(%rcx), %xmm15;
+
+ outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
+
+ FRAME_END
+ ret;
+
+.align 8
+.Lenc_max32:
+ movl $32, %r8d;
+
+ fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15,
+ ((key_table + (24) * 8) + 0)(CTX),
+ ((key_table + (24) * 8) + 4)(CTX),
+ ((key_table + (24) * 8) + 8)(CTX),
+ ((key_table + (24) * 8) + 12)(CTX));
+
+ enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 24);
+
+ jmp .Lenc_done;
+ENDPROC(__camellia_enc_blk16)
+
+.align 8
+__camellia_dec_blk16:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rax: temporary storage, 256 bytes
+ * %r8d: 24 for 16 byte key, 32 for larger
+ * %xmm0..%xmm15: 16 encrypted blocks
+ * output:
+ * %xmm0..%xmm15: 16 plaintext blocks, order swapped:
+ * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ */
+ FRAME_BEGIN
+
+ leaq 8 * 16(%rax), %rcx;
+
+ inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx);
+
+ cmpl $32, %r8d;
+ je .Ldec_max32;
+
+.Ldec_max24:
+ dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 16);
+
+ fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15,
+ ((key_table + (16) * 8) + 8)(CTX),
+ ((key_table + (16) * 8) + 12)(CTX),
+ ((key_table + (16) * 8) + 0)(CTX),
+ ((key_table + (16) * 8) + 4)(CTX));
+
+ dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 8);
+
+ fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15,
+ ((key_table + (8) * 8) + 8)(CTX),
+ ((key_table + (8) * 8) + 12)(CTX),
+ ((key_table + (8) * 8) + 0)(CTX),
+ ((key_table + (8) * 8) + 4)(CTX));
+
+ dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 0);
+
+ /* load CD for output */
+ vmovdqu 0 * 16(%rcx), %xmm8;
+ vmovdqu 1 * 16(%rcx), %xmm9;
+ vmovdqu 2 * 16(%rcx), %xmm10;
+ vmovdqu 3 * 16(%rcx), %xmm11;
+ vmovdqu 4 * 16(%rcx), %xmm12;
+ vmovdqu 5 * 16(%rcx), %xmm13;
+ vmovdqu 6 * 16(%rcx), %xmm14;
+ vmovdqu 7 * 16(%rcx), %xmm15;
+
+ outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
+
+ FRAME_END
+ ret;
+
+.align 8
+.Ldec_max32:
+ dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rax, %rcx, 24);
+
+ fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15,
+ ((key_table + (24) * 8) + 8)(CTX),
+ ((key_table + (24) * 8) + 12)(CTX),
+ ((key_table + (24) * 8) + 0)(CTX),
+ ((key_table + (24) * 8) + 4)(CTX));
+
+ jmp .Ldec_max24;
+ENDPROC(__camellia_dec_blk16)
+
+ENTRY(camellia_ecb_enc_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ */
+ FRAME_BEGIN
+
+ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rdx, (key_table)(CTX));
+
+ /* now dst can be used as temporary buffer (even in src == dst case) */
+ movq %rsi, %rax;
+
+ call __camellia_enc_blk16;
+
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ FRAME_END
+ ret;
+ENDPROC(camellia_ecb_enc_16way)
+
+ENTRY(camellia_ecb_dec_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ */
+ FRAME_BEGIN
+
+ cmpl $16, key_length(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rdx, (key_table)(CTX, %r8, 8));
+
+ /* now dst can be used as temporary buffer (even in src == dst case) */
+ movq %rsi, %rax;
+
+ call __camellia_dec_blk16;
+
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ FRAME_END
+ ret;
+ENDPROC(camellia_ecb_dec_16way)
+
+ENTRY(camellia_cbc_dec_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ */
+ FRAME_BEGIN
+
+ cmpl $16, key_length(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+ %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+ %xmm15, %rdx, (key_table)(CTX, %r8, 8));
+
+ /*
+ * dst might still be in-use (in case dst == src), so use stack for
+ * temporary storage.
+ */
+ subq $(16 * 16), %rsp;
+ movq %rsp, %rax;
+
+ call __camellia_dec_blk16;
+
+ addq $(16 * 16), %rsp;
+
+ vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
+ vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
+ vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
+ vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
+ vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
+ vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
+ vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
+ vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
+ vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
+ vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
+ vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
+ vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
+ vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
+ vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
+ vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ FRAME_END
+ ret;
+ENDPROC(camellia_cbc_dec_16way)
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+ENTRY(camellia_ctr_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (little endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ subq $(16 * 16), %rsp;
+ movq %rsp, %rax;
+
+ vmovdqa .Lbswap128_mask, %xmm14;
+
+ /* load IV and byteswap */
+ vmovdqu (%rcx), %xmm0;
+ vpshufb %xmm14, %xmm0, %xmm15;
+ vmovdqu %xmm15, 15 * 16(%rax);
+
+ vpcmpeqd %xmm15, %xmm15, %xmm15;
+ vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */
+
+ /* construct IVs */
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm13;
+ vmovdqu %xmm13, 14 * 16(%rax);
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm13;
+ vmovdqu %xmm13, 13 * 16(%rax);
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm12;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm11;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm10;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm9;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm8;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm7;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm6;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm5;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm4;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm3;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm2;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vpshufb %xmm14, %xmm0, %xmm1;
+ inc_le128(%xmm0, %xmm15, %xmm13);
+ vmovdqa %xmm0, %xmm13;
+ vpshufb %xmm14, %xmm0, %xmm0;
+ inc_le128(%xmm13, %xmm15, %xmm14);
+ vmovdqu %xmm13, (%rcx);
+
+ /* inpack16_pre: */
+ vmovq (key_table)(CTX), %xmm15;
+ vpshufb .Lpack_bswap, %xmm15, %xmm15;
+ vpxor %xmm0, %xmm15, %xmm0;
+ vpxor %xmm1, %xmm15, %xmm1;
+ vpxor %xmm2, %xmm15, %xmm2;
+ vpxor %xmm3, %xmm15, %xmm3;
+ vpxor %xmm4, %xmm15, %xmm4;
+ vpxor %xmm5, %xmm15, %xmm5;
+ vpxor %xmm6, %xmm15, %xmm6;
+ vpxor %xmm7, %xmm15, %xmm7;
+ vpxor %xmm8, %xmm15, %xmm8;
+ vpxor %xmm9, %xmm15, %xmm9;
+ vpxor %xmm10, %xmm15, %xmm10;
+ vpxor %xmm11, %xmm15, %xmm11;
+ vpxor %xmm12, %xmm15, %xmm12;
+ vpxor 13 * 16(%rax), %xmm15, %xmm13;
+ vpxor 14 * 16(%rax), %xmm15, %xmm14;
+ vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+ call __camellia_enc_blk16;
+
+ addq $(16 * 16), %rsp;
+
+ vpxor 0 * 16(%rdx), %xmm7, %xmm7;
+ vpxor 1 * 16(%rdx), %xmm6, %xmm6;
+ vpxor 2 * 16(%rdx), %xmm5, %xmm5;
+ vpxor 3 * 16(%rdx), %xmm4, %xmm4;
+ vpxor 4 * 16(%rdx), %xmm3, %xmm3;
+ vpxor 5 * 16(%rdx), %xmm2, %xmm2;
+ vpxor 6 * 16(%rdx), %xmm1, %xmm1;
+ vpxor 7 * 16(%rdx), %xmm0, %xmm0;
+ vpxor 8 * 16(%rdx), %xmm15, %xmm15;
+ vpxor 9 * 16(%rdx), %xmm14, %xmm14;
+ vpxor 10 * 16(%rdx), %xmm13, %xmm13;
+ vpxor 11 * 16(%rdx), %xmm12, %xmm12;
+ vpxor 12 * 16(%rdx), %xmm11, %xmm11;
+ vpxor 13 * 16(%rdx), %xmm10, %xmm10;
+ vpxor 14 * 16(%rdx), %xmm9, %xmm9;
+ vpxor 15 * 16(%rdx), %xmm8, %xmm8;
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ FRAME_END
+ ret;
+ENDPROC(camellia_ctr_16way)
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+ vpsrad $31, iv, tmp; \
+ vpaddq iv, iv, iv; \
+ vpshufd $0x13, tmp, tmp; \
+ vpand mask, tmp, tmp; \
+ vpxor tmp, iv, iv;
+
+.align 8
+camellia_xts_crypt_16way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ * %r8: index for input whitening key
+ * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16
+ */
+ FRAME_BEGIN
+
+ subq $(16 * 16), %rsp;
+ movq %rsp, %rax;
+
+ vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;
+
+ /* load IV */
+ vmovdqu (%rcx), %xmm0;
+ vpxor 0 * 16(%rdx), %xmm0, %xmm15;
+ vmovdqu %xmm15, 15 * 16(%rax);
+ vmovdqu %xmm0, 0 * 16(%rsi);
+
+ /* construct IVs */
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 1 * 16(%rdx), %xmm0, %xmm15;
+ vmovdqu %xmm15, 14 * 16(%rax);
+ vmovdqu %xmm0, 1 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 2 * 16(%rdx), %xmm0, %xmm13;
+ vmovdqu %xmm0, 2 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 3 * 16(%rdx), %xmm0, %xmm12;
+ vmovdqu %xmm0, 3 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 4 * 16(%rdx), %xmm0, %xmm11;
+ vmovdqu %xmm0, 4 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 5 * 16(%rdx), %xmm0, %xmm10;
+ vmovdqu %xmm0, 5 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 6 * 16(%rdx), %xmm0, %xmm9;
+ vmovdqu %xmm0, 6 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 7 * 16(%rdx), %xmm0, %xmm8;
+ vmovdqu %xmm0, 7 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 8 * 16(%rdx), %xmm0, %xmm7;
+ vmovdqu %xmm0, 8 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 9 * 16(%rdx), %xmm0, %xmm6;
+ vmovdqu %xmm0, 9 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 10 * 16(%rdx), %xmm0, %xmm5;
+ vmovdqu %xmm0, 10 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 11 * 16(%rdx), %xmm0, %xmm4;
+ vmovdqu %xmm0, 11 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 12 * 16(%rdx), %xmm0, %xmm3;
+ vmovdqu %xmm0, 12 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 13 * 16(%rdx), %xmm0, %xmm2;
+ vmovdqu %xmm0, 13 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 14 * 16(%rdx), %xmm0, %xmm1;
+ vmovdqu %xmm0, 14 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vpxor 15 * 16(%rdx), %xmm0, %xmm15;
+ vmovdqu %xmm15, 0 * 16(%rax);
+ vmovdqu %xmm0, 15 * 16(%rsi);
+
+ gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+ vmovdqu %xmm0, (%rcx);
+
+ /* inpack16_pre: */
+ vmovq (key_table)(CTX, %r8, 8), %xmm15;
+ vpshufb .Lpack_bswap, %xmm15, %xmm15;
+ vpxor 0 * 16(%rax), %xmm15, %xmm0;
+ vpxor %xmm1, %xmm15, %xmm1;
+ vpxor %xmm2, %xmm15, %xmm2;
+ vpxor %xmm3, %xmm15, %xmm3;
+ vpxor %xmm4, %xmm15, %xmm4;
+ vpxor %xmm5, %xmm15, %xmm5;
+ vpxor %xmm6, %xmm15, %xmm6;
+ vpxor %xmm7, %xmm15, %xmm7;
+ vpxor %xmm8, %xmm15, %xmm8;
+ vpxor %xmm9, %xmm15, %xmm9;
+ vpxor %xmm10, %xmm15, %xmm10;
+ vpxor %xmm11, %xmm15, %xmm11;
+ vpxor %xmm12, %xmm15, %xmm12;
+ vpxor %xmm13, %xmm15, %xmm13;
+ vpxor 14 * 16(%rax), %xmm15, %xmm14;
+ vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+ CALL_NOSPEC %r9;
+
+ addq $(16 * 16), %rsp;
+
+ vpxor 0 * 16(%rsi), %xmm7, %xmm7;
+ vpxor 1 * 16(%rsi), %xmm6, %xmm6;
+ vpxor 2 * 16(%rsi), %xmm5, %xmm5;
+ vpxor 3 * 16(%rsi), %xmm4, %xmm4;
+ vpxor 4 * 16(%rsi), %xmm3, %xmm3;
+ vpxor 5 * 16(%rsi), %xmm2, %xmm2;
+ vpxor 6 * 16(%rsi), %xmm1, %xmm1;
+ vpxor 7 * 16(%rsi), %xmm0, %xmm0;
+ vpxor 8 * 16(%rsi), %xmm15, %xmm15;
+ vpxor 9 * 16(%rsi), %xmm14, %xmm14;
+ vpxor 10 * 16(%rsi), %xmm13, %xmm13;
+ vpxor 11 * 16(%rsi), %xmm12, %xmm12;
+ vpxor 12 * 16(%rsi), %xmm11, %xmm11;
+ vpxor 13 * 16(%rsi), %xmm10, %xmm10;
+ vpxor 14 * 16(%rsi), %xmm9, %xmm9;
+ vpxor 15 * 16(%rsi), %xmm8, %xmm8;
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ FRAME_END
+ ret;
+ENDPROC(camellia_xts_crypt_16way)
+
+ENTRY(camellia_xts_enc_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+ xorl %r8d, %r8d; /* input whitening key, 0 for enc */
+
+ leaq __camellia_enc_blk16, %r9;
+
+ jmp camellia_xts_crypt_16way;
+ENDPROC(camellia_xts_enc_16way)
+
+ENTRY(camellia_xts_dec_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+
+ cmpl $16, key_length(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* input whitening key, last for dec */
+
+ leaq __camellia_dec_blk16, %r9;
+
+ jmp camellia_xts_crypt_16way;
+ENDPROC(camellia_xts_dec_16way)
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
new file mode 100644
index 000000000..b66bbfa62
--- /dev/null
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -0,0 +1,1408 @@
+/*
+ * x86_64/AVX2/AES-NI assembler implementation of Camellia
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include <asm/nospec-branch.h>
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct camellia_ctx: */
+#define key_table 0
+#define key_length CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %rdi
+#define RIO %r8
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+#define ymm0_x xmm0
+#define ymm1_x xmm1
+#define ymm2_x xmm2
+#define ymm3_x xmm3
+#define ymm4_x xmm4
+#define ymm5_x xmm5
+#define ymm6_x xmm6
+#define ymm7_x xmm7
+#define ymm8_x xmm8
+#define ymm9_x xmm9
+#define ymm10_x xmm10
+#define ymm11_x xmm11
+#define ymm12_x xmm12
+#define ymm13_x xmm13
+#define ymm14_x xmm14
+#define ymm15_x xmm15
+
+/**********************************************************************
+ 32-way camellia
+ **********************************************************************/
+
+/*
+ * IN:
+ * x0..x7: byte-sliced AB state
+ * mem_cd: register pointer storing CD state
+ * key: index for key material
+ * OUT:
+ * x0..x7: new byte-sliced CD state
+ */
+#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
+ t7, mem_cd, key) \
+ /* \
+ * S-function with AES subbytes \
+ */ \
+ vbroadcasti128 .Linv_shift_row, t4; \
+ vpbroadcastd .L0f0f0f0f, t7; \
+ vbroadcasti128 .Lpre_tf_lo_s1, t5; \
+ vbroadcasti128 .Lpre_tf_hi_s1, t6; \
+ vbroadcasti128 .Lpre_tf_lo_s4, t2; \
+ vbroadcasti128 .Lpre_tf_hi_s4, t3; \
+ \
+ /* AES inverse shift rows */ \
+ vpshufb t4, x0, x0; \
+ vpshufb t4, x7, x7; \
+ vpshufb t4, x3, x3; \
+ vpshufb t4, x6, x6; \
+ vpshufb t4, x2, x2; \
+ vpshufb t4, x5, x5; \
+ vpshufb t4, x1, x1; \
+ vpshufb t4, x4, x4; \
+ \
+ /* prefilter sboxes 1, 2 and 3 */ \
+ /* prefilter sbox 4 */ \
+ filter_8bit(x0, t5, t6, t7, t4); \
+ filter_8bit(x7, t5, t6, t7, t4); \
+ vextracti128 $1, x0, t0##_x; \
+ vextracti128 $1, x7, t1##_x; \
+ filter_8bit(x3, t2, t3, t7, t4); \
+ filter_8bit(x6, t2, t3, t7, t4); \
+ vextracti128 $1, x3, t3##_x; \
+ vextracti128 $1, x6, t2##_x; \
+ filter_8bit(x2, t5, t6, t7, t4); \
+ filter_8bit(x5, t5, t6, t7, t4); \
+ filter_8bit(x1, t5, t6, t7, t4); \
+ filter_8bit(x4, t5, t6, t7, t4); \
+ \
+ vpxor t4##_x, t4##_x, t4##_x; \
+ \
+ /* AES subbytes + AES shift rows */ \
+ vextracti128 $1, x2, t6##_x; \
+ vextracti128 $1, x5, t5##_x; \
+ vaesenclast t4##_x, x0##_x, x0##_x; \
+ vaesenclast t4##_x, t0##_x, t0##_x; \
+ vinserti128 $1, t0##_x, x0, x0; \
+ vaesenclast t4##_x, x7##_x, x7##_x; \
+ vaesenclast t4##_x, t1##_x, t1##_x; \
+ vinserti128 $1, t1##_x, x7, x7; \
+ vaesenclast t4##_x, x3##_x, x3##_x; \
+ vaesenclast t4##_x, t3##_x, t3##_x; \
+ vinserti128 $1, t3##_x, x3, x3; \
+ vaesenclast t4##_x, x6##_x, x6##_x; \
+ vaesenclast t4##_x, t2##_x, t2##_x; \
+ vinserti128 $1, t2##_x, x6, x6; \
+ vextracti128 $1, x1, t3##_x; \
+ vextracti128 $1, x4, t2##_x; \
+ vbroadcasti128 .Lpost_tf_lo_s1, t0; \
+ vbroadcasti128 .Lpost_tf_hi_s1, t1; \
+ vaesenclast t4##_x, x2##_x, x2##_x; \
+ vaesenclast t4##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x2, x2; \
+ vaesenclast t4##_x, x5##_x, x5##_x; \
+ vaesenclast t4##_x, t5##_x, t5##_x; \
+ vinserti128 $1, t5##_x, x5, x5; \
+ vaesenclast t4##_x, x1##_x, x1##_x; \
+ vaesenclast t4##_x, t3##_x, t3##_x; \
+ vinserti128 $1, t3##_x, x1, x1; \
+ vaesenclast t4##_x, x4##_x, x4##_x; \
+ vaesenclast t4##_x, t2##_x, t2##_x; \
+ vinserti128 $1, t2##_x, x4, x4; \
+ \
+ /* postfilter sboxes 1 and 4 */ \
+ vbroadcasti128 .Lpost_tf_lo_s3, t2; \
+ vbroadcasti128 .Lpost_tf_hi_s3, t3; \
+ filter_8bit(x0, t0, t1, t7, t6); \
+ filter_8bit(x7, t0, t1, t7, t6); \
+ filter_8bit(x3, t0, t1, t7, t6); \
+ filter_8bit(x6, t0, t1, t7, t6); \
+ \
+ /* postfilter sbox 3 */ \
+ vbroadcasti128 .Lpost_tf_lo_s2, t4; \
+ vbroadcasti128 .Lpost_tf_hi_s2, t5; \
+ filter_8bit(x2, t2, t3, t7, t6); \
+ filter_8bit(x5, t2, t3, t7, t6); \
+ \
+ vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
+ \
+ /* postfilter sbox 2 */ \
+ filter_8bit(x1, t4, t5, t7, t2); \
+ filter_8bit(x4, t4, t5, t7, t2); \
+ vpxor t7, t7, t7; \
+ \
+ vpsrldq $1, t0, t1; \
+ vpsrldq $2, t0, t2; \
+ vpshufb t7, t1, t1; \
+ vpsrldq $3, t0, t3; \
+ \
+ /* P-function */ \
+ vpxor x5, x0, x0; \
+ vpxor x6, x1, x1; \
+ vpxor x7, x2, x2; \
+ vpxor x4, x3, x3; \
+ \
+ vpshufb t7, t2, t2; \
+ vpsrldq $4, t0, t4; \
+ vpshufb t7, t3, t3; \
+ vpsrldq $5, t0, t5; \
+ vpshufb t7, t4, t4; \
+ \
+ vpxor x2, x4, x4; \
+ vpxor x3, x5, x5; \
+ vpxor x0, x6, x6; \
+ vpxor x1, x7, x7; \
+ \
+ vpsrldq $6, t0, t6; \
+ vpshufb t7, t5, t5; \
+ vpshufb t7, t6, t6; \
+ \
+ vpxor x7, x0, x0; \
+ vpxor x4, x1, x1; \
+ vpxor x5, x2, x2; \
+ vpxor x6, x3, x3; \
+ \
+ vpxor x3, x4, x4; \
+ vpxor x0, x5, x5; \
+ vpxor x1, x6, x6; \
+ vpxor x2, x7, x7; /* note: high and low parts swapped */ \
+ \
+ /* Add key material and result to CD (x becomes new CD) */ \
+ \
+ vpxor t6, x1, x1; \
+ vpxor 5 * 32(mem_cd), x1, x1; \
+ \
+ vpsrldq $7, t0, t6; \
+ vpshufb t7, t0, t0; \
+ vpshufb t7, t6, t7; \
+ \
+ vpxor t7, x0, x0; \
+ vpxor 4 * 32(mem_cd), x0, x0; \
+ \
+ vpxor t5, x2, x2; \
+ vpxor 6 * 32(mem_cd), x2, x2; \
+ \
+ vpxor t4, x3, x3; \
+ vpxor 7 * 32(mem_cd), x3, x3; \
+ \
+ vpxor t3, x4, x4; \
+ vpxor 0 * 32(mem_cd), x4, x4; \
+ \
+ vpxor t2, x5, x5; \
+ vpxor 1 * 32(mem_cd), x5, x5; \
+ \
+ vpxor t1, x6, x6; \
+ vpxor 2 * 32(mem_cd), x6, x6; \
+ \
+ vpxor t0, x7, x7; \
+ vpxor 3 * 32(mem_cd), x7, x7;
+
+/*
+ * Size optimization... with inlined roundsm32 binary would be over 5 times
+ * larger and would only marginally faster.
+ */
+.align 8
+roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
+ roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
+ %rcx, (%r9));
+ ret;
+ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
+
+.align 8
+roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
+ roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
+ %rax, (%r9));
+ ret;
+ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
+
+/*
+ * IN/OUT:
+ * x0..x7: byte-sliced AB state preloaded
+ * mem_ab: byte-sliced AB state in memory
+ * mem_cb: byte-sliced CD state in memory
+ */
+#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
+ leaq (key_table + (i) * 8)(CTX), %r9; \
+ call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
+ \
+ vmovdqu x0, 4 * 32(mem_cd); \
+ vmovdqu x1, 5 * 32(mem_cd); \
+ vmovdqu x2, 6 * 32(mem_cd); \
+ vmovdqu x3, 7 * 32(mem_cd); \
+ vmovdqu x4, 0 * 32(mem_cd); \
+ vmovdqu x5, 1 * 32(mem_cd); \
+ vmovdqu x6, 2 * 32(mem_cd); \
+ vmovdqu x7, 3 * 32(mem_cd); \
+ \
+ leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
+ call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
+ \
+ store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
+
+#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
+
+#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
+ /* Store new AB state */ \
+ vmovdqu x4, 4 * 32(mem_ab); \
+ vmovdqu x5, 5 * 32(mem_ab); \
+ vmovdqu x6, 6 * 32(mem_ab); \
+ vmovdqu x7, 7 * 32(mem_ab); \
+ vmovdqu x0, 0 * 32(mem_ab); \
+ vmovdqu x1, 1 * 32(mem_ab); \
+ vmovdqu x2, 2 * 32(mem_ab); \
+ vmovdqu x3, 3 * 32(mem_ab);
+
+#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i) \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
+
+#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i) \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
+ two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
+
+/*
+ * IN:
+ * v0..3: byte-sliced 32-bit integers
+ * OUT:
+ * v0..3: (IN <<< 1)
+ */
+#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
+ vpcmpgtb v0, zero, t0; \
+ vpaddb v0, v0, v0; \
+ vpabsb t0, t0; \
+ \
+ vpcmpgtb v1, zero, t1; \
+ vpaddb v1, v1, v1; \
+ vpabsb t1, t1; \
+ \
+ vpcmpgtb v2, zero, t2; \
+ vpaddb v2, v2, v2; \
+ vpabsb t2, t2; \
+ \
+ vpor t0, v1, v1; \
+ \
+ vpcmpgtb v3, zero, t0; \
+ vpaddb v3, v3, v3; \
+ vpabsb t0, t0; \
+ \
+ vpor t1, v2, v2; \
+ vpor t2, v3, v3; \
+ vpor t0, v0, v0;
+
+/*
+ * IN:
+ * r: byte-sliced AB state in memory
+ * l: byte-sliced CD state in memory
+ * OUT:
+ * x0..x7: new byte-sliced CD state
+ */
+#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
+ tt1, tt2, tt3, kll, klr, krl, krr) \
+ /* \
+ * t0 = kll; \
+ * t0 &= ll; \
+ * lr ^= rol32(t0, 1); \
+ */ \
+ vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
+ vpxor tt0, tt0, tt0; \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpand l0, t0, t0; \
+ vpand l1, t1, t1; \
+ vpand l2, t2, t2; \
+ vpand l3, t3, t3; \
+ \
+ rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+ \
+ vpxor l4, t0, l4; \
+ vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
+ vmovdqu l4, 4 * 32(l); \
+ vpxor l5, t1, l5; \
+ vmovdqu l5, 5 * 32(l); \
+ vpxor l6, t2, l6; \
+ vmovdqu l6, 6 * 32(l); \
+ vpxor l7, t3, l7; \
+ vmovdqu l7, 7 * 32(l); \
+ \
+ /* \
+ * t2 = krr; \
+ * t2 |= rr; \
+ * rl ^= t2; \
+ */ \
+ \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpor 4 * 32(r), t0, t0; \
+ vpor 5 * 32(r), t1, t1; \
+ vpor 6 * 32(r), t2, t2; \
+ vpor 7 * 32(r), t3, t3; \
+ \
+ vpxor 0 * 32(r), t0, t0; \
+ vpxor 1 * 32(r), t1, t1; \
+ vpxor 2 * 32(r), t2, t2; \
+ vpxor 3 * 32(r), t3, t3; \
+ vmovdqu t0, 0 * 32(r); \
+ vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
+ vmovdqu t1, 1 * 32(r); \
+ vmovdqu t2, 2 * 32(r); \
+ vmovdqu t3, 3 * 32(r); \
+ \
+ /* \
+ * t2 = krl; \
+ * t2 &= rl; \
+ * rr ^= rol32(t2, 1); \
+ */ \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpand 0 * 32(r), t0, t0; \
+ vpand 1 * 32(r), t1, t1; \
+ vpand 2 * 32(r), t2, t2; \
+ vpand 3 * 32(r), t3, t3; \
+ \
+ rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+ \
+ vpxor 4 * 32(r), t0, t0; \
+ vpxor 5 * 32(r), t1, t1; \
+ vpxor 6 * 32(r), t2, t2; \
+ vpxor 7 * 32(r), t3, t3; \
+ vmovdqu t0, 4 * 32(r); \
+ vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
+ vmovdqu t1, 5 * 32(r); \
+ vmovdqu t2, 6 * 32(r); \
+ vmovdqu t3, 7 * 32(r); \
+ \
+ /* \
+ * t0 = klr; \
+ * t0 |= lr; \
+ * ll ^= t0; \
+ */ \
+ \
+ vpshufb tt0, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt0, t0, t0; \
+ \
+ vpor l4, t0, t0; \
+ vpor l5, t1, t1; \
+ vpor l6, t2, t2; \
+ vpor l7, t3, t3; \
+ \
+ vpxor l0, t0, l0; \
+ vmovdqu l0, 0 * 32(l); \
+ vpxor l1, t1, l1; \
+ vmovdqu l1, 1 * 32(l); \
+ vpxor l2, t2, l2; \
+ vmovdqu l2, 2 * 32(l); \
+ vpxor l3, t3, l3; \
+ vmovdqu l3, 3 * 32(l);
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
+ a3, b3, c3, d3, st0, st1) \
+ vmovdqu d2, st0; \
+ vmovdqu d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu a0, st0; \
+ vmovdqu a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vbroadcasti128 .Lshufb_16x16b, a0; \
+ vmovdqu st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu d3, st1; \
+ vmovdqu st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu d2, st0; \
+ \
+ transpose_4x4(a0, b0, c0, d0, d2, d3); \
+ transpose_4x4(a1, b1, c1, d1, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu b0, st0; \
+ vmovdqu b1, st1; \
+ transpose_4x4(a2, b2, c2, d2, b0, b1); \
+ transpose_4x4(a3, b3, c3, d3, b0, b1); \
+ vmovdqu st0, b0; \
+ vmovdqu st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, rio, key) \
+ vpbroadcastq key, x0; \
+ vpshufb .Lpack_bswap, x0, x0; \
+ \
+ vpxor 0 * 32(rio), x0, y7; \
+ vpxor 1 * 32(rio), x0, y6; \
+ vpxor 2 * 32(rio), x0, y5; \
+ vpxor 3 * 32(rio), x0, y4; \
+ vpxor 4 * 32(rio), x0, y3; \
+ vpxor 5 * 32(rio), x0, y2; \
+ vpxor 6 * 32(rio), x0, y1; \
+ vpxor 7 * 32(rio), x0, y0; \
+ vpxor 8 * 32(rio), x0, x7; \
+ vpxor 9 * 32(rio), x0, x6; \
+ vpxor 10 * 32(rio), x0, x5; \
+ vpxor 11 * 32(rio), x0, x4; \
+ vpxor 12 * 32(rio), x0, x3; \
+ vpxor 13 * 32(rio), x0, x2; \
+ vpxor 14 * 32(rio), x0, x1; \
+ vpxor 15 * 32(rio), x0, x0;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd) \
+ byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
+ y4, y5, y6, y7, (mem_ab), (mem_cd)); \
+ \
+ vmovdqu x0, 0 * 32(mem_ab); \
+ vmovdqu x1, 1 * 32(mem_ab); \
+ vmovdqu x2, 2 * 32(mem_ab); \
+ vmovdqu x3, 3 * 32(mem_ab); \
+ vmovdqu x4, 4 * 32(mem_ab); \
+ vmovdqu x5, 5 * 32(mem_ab); \
+ vmovdqu x6, 6 * 32(mem_ab); \
+ vmovdqu x7, 7 * 32(mem_ab); \
+ vmovdqu y0, 0 * 32(mem_cd); \
+ vmovdqu y1, 1 * 32(mem_cd); \
+ vmovdqu y2, 2 * 32(mem_cd); \
+ vmovdqu y3, 3 * 32(mem_cd); \
+ vmovdqu y4, 4 * 32(mem_cd); \
+ vmovdqu y5, 5 * 32(mem_cd); \
+ vmovdqu y6, 6 * 32(mem_cd); \
+ vmovdqu y7, 7 * 32(mem_cd);
+
+/* de-byteslice, apply post-whitening and store blocks */
+#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+ y5, y6, y7, key, stack_tmp0, stack_tmp1) \
+ byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
+ y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
+ \
+ vmovdqu x0, stack_tmp0; \
+ \
+ vpbroadcastq key, x0; \
+ vpshufb .Lpack_bswap, x0, x0; \
+ \
+ vpxor x0, y7, y7; \
+ vpxor x0, y6, y6; \
+ vpxor x0, y5, y5; \
+ vpxor x0, y4, y4; \
+ vpxor x0, y3, y3; \
+ vpxor x0, y2, y2; \
+ vpxor x0, y1, y1; \
+ vpxor x0, y0, y0; \
+ vpxor x0, x7, x7; \
+ vpxor x0, x6, x6; \
+ vpxor x0, x5, x5; \
+ vpxor x0, x4, x4; \
+ vpxor x0, x3, x3; \
+ vpxor x0, x2, x2; \
+ vpxor x0, x1, x1; \
+ vpxor stack_tmp0, x0, x0;
+
+#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, rio) \
+ vmovdqu x0, 0 * 32(rio); \
+ vmovdqu x1, 1 * 32(rio); \
+ vmovdqu x2, 2 * 32(rio); \
+ vmovdqu x3, 3 * 32(rio); \
+ vmovdqu x4, 4 * 32(rio); \
+ vmovdqu x5, 5 * 32(rio); \
+ vmovdqu x6, 6 * 32(rio); \
+ vmovdqu x7, 7 * 32(rio); \
+ vmovdqu y0, 8 * 32(rio); \
+ vmovdqu y1, 9 * 32(rio); \
+ vmovdqu y2, 10 * 32(rio); \
+ vmovdqu y3, 11 * 32(rio); \
+ vmovdqu y4, 12 * 32(rio); \
+ vmovdqu y5, 13 * 32(rio); \
+ vmovdqu y6, 14 * 32(rio); \
+ vmovdqu y7, 15 * 32(rio);
+
+
+.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
+.align 32
+#define SHUFB_BYTES(idx) \
+ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+.Lshufb_16x16b:
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+.section .rodata.cst32.pack_bswap, "aM", @progbits, 32
+.align 32
+.Lpack_bswap:
+ .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+ .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+
+/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For XTS mode */
+.Lxts_gf128mul_and_shl1_mask_0:
+ .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+.Lxts_gf128mul_and_shl1_mask_1:
+ .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox1, sbox2, sbox3:
+ * swap_bitendianness(
+ * isom_map_camellia_to_aes(
+ * camellia_f(
+ * swap_bitendianess(in)
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s1:
+ .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
+ .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
+.Lpre_tf_hi_s1:
+ .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
+ .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox4:
+ * swap_bitendianness(
+ * isom_map_camellia_to_aes(
+ * camellia_f(
+ * swap_bitendianess(in <<< 1)
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s4:
+ .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
+ .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
+.Lpre_tf_hi_s4:
+ .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
+ .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox1, sbox4:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * )
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s1:
+ .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
+ .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
+.Lpost_tf_hi_s1:
+ .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
+ .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox2:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * ) <<< 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s2:
+ .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
+ .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
+.Lpost_tf_hi_s2:
+ .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
+ .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox3:
+ * swap_bitendianness(
+ * camellia_h(
+ * isom_map_aes_to_camellia(
+ * swap_bitendianness(
+ * aes_inverse_affine_transform(in)
+ * )
+ * )
+ * )
+ * ) >>> 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s3:
+ .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
+ .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
+.Lpost_tf_hi_s3:
+ .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
+ .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
+.align 4
+/* 4-bit mask */
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+.text
+
+.align 8
+__camellia_enc_blk32:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rax: temporary storage, 512 bytes
+ * %ymm0..%ymm15: 32 plaintext blocks
+ * output:
+ * %ymm0..%ymm15: 32 encrypted blocks, order swapped:
+ * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ */
+ FRAME_BEGIN
+
+ leaq 8 * 32(%rax), %rcx;
+
+ inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx);
+
+ enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 0);
+
+ fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15,
+ ((key_table + (8) * 8) + 0)(CTX),
+ ((key_table + (8) * 8) + 4)(CTX),
+ ((key_table + (8) * 8) + 8)(CTX),
+ ((key_table + (8) * 8) + 12)(CTX));
+
+ enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 8);
+
+ fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15,
+ ((key_table + (16) * 8) + 0)(CTX),
+ ((key_table + (16) * 8) + 4)(CTX),
+ ((key_table + (16) * 8) + 8)(CTX),
+ ((key_table + (16) * 8) + 12)(CTX));
+
+ enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 16);
+
+ movl $24, %r8d;
+ cmpl $16, key_length(CTX);
+ jne .Lenc_max32;
+
+.Lenc_done:
+ /* load CD for output */
+ vmovdqu 0 * 32(%rcx), %ymm8;
+ vmovdqu 1 * 32(%rcx), %ymm9;
+ vmovdqu 2 * 32(%rcx), %ymm10;
+ vmovdqu 3 * 32(%rcx), %ymm11;
+ vmovdqu 4 * 32(%rcx), %ymm12;
+ vmovdqu 5 * 32(%rcx), %ymm13;
+ vmovdqu 6 * 32(%rcx), %ymm14;
+ vmovdqu 7 * 32(%rcx), %ymm15;
+
+ outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
+
+ FRAME_END
+ ret;
+
+.align 8
+.Lenc_max32:
+ movl $32, %r8d;
+
+ fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15,
+ ((key_table + (24) * 8) + 0)(CTX),
+ ((key_table + (24) * 8) + 4)(CTX),
+ ((key_table + (24) * 8) + 8)(CTX),
+ ((key_table + (24) * 8) + 12)(CTX));
+
+ enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 24);
+
+ jmp .Lenc_done;
+ENDPROC(__camellia_enc_blk32)
+
+.align 8
+__camellia_dec_blk32:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rax: temporary storage, 512 bytes
+ * %r8d: 24 for 16 byte key, 32 for larger
+ * %ymm0..%ymm15: 16 encrypted blocks
+ * output:
+ * %ymm0..%ymm15: 16 plaintext blocks, order swapped:
+ * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ */
+ FRAME_BEGIN
+
+ leaq 8 * 32(%rax), %rcx;
+
+ inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx);
+
+ cmpl $32, %r8d;
+ je .Ldec_max32;
+
+.Ldec_max24:
+ dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 16);
+
+ fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15,
+ ((key_table + (16) * 8) + 8)(CTX),
+ ((key_table + (16) * 8) + 12)(CTX),
+ ((key_table + (16) * 8) + 0)(CTX),
+ ((key_table + (16) * 8) + 4)(CTX));
+
+ dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 8);
+
+ fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15,
+ ((key_table + (8) * 8) + 8)(CTX),
+ ((key_table + (8) * 8) + 12)(CTX),
+ ((key_table + (8) * 8) + 0)(CTX),
+ ((key_table + (8) * 8) + 4)(CTX));
+
+ dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 0);
+
+ /* load CD for output */
+ vmovdqu 0 * 32(%rcx), %ymm8;
+ vmovdqu 1 * 32(%rcx), %ymm9;
+ vmovdqu 2 * 32(%rcx), %ymm10;
+ vmovdqu 3 * 32(%rcx), %ymm11;
+ vmovdqu 4 * 32(%rcx), %ymm12;
+ vmovdqu 5 * 32(%rcx), %ymm13;
+ vmovdqu 6 * 32(%rcx), %ymm14;
+ vmovdqu 7 * 32(%rcx), %ymm15;
+
+ outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
+
+ FRAME_END
+ ret;
+
+.align 8
+.Ldec_max32:
+ dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %rcx, 24);
+
+ fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15,
+ ((key_table + (24) * 8) + 8)(CTX),
+ ((key_table + (24) * 8) + 12)(CTX),
+ ((key_table + (24) * 8) + 0)(CTX),
+ ((key_table + (24) * 8) + 4)(CTX));
+
+ jmp .Ldec_max24;
+ENDPROC(__camellia_dec_blk32)
+
+ENTRY(camellia_ecb_enc_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx, (key_table)(CTX));
+
+ /* now dst can be used as temporary buffer (even in src == dst case) */
+ movq %rsi, %rax;
+
+ call __camellia_enc_blk32;
+
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(camellia_ecb_enc_32way)
+
+ENTRY(camellia_ecb_dec_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ cmpl $16, key_length(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx, (key_table)(CTX, %r8, 8));
+
+ /* now dst can be used as temporary buffer (even in src == dst case) */
+ movq %rsi, %rax;
+
+ call __camellia_dec_blk32;
+
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(camellia_ecb_dec_32way)
+
+ENTRY(camellia_cbc_dec_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ cmpl $16, key_length(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx, (key_table)(CTX, %r8, 8));
+
+ movq %rsp, %r10;
+ cmpq %rsi, %rdx;
+ je .Lcbc_dec_use_stack;
+
+ /* dst can be used as temporary storage, src is not overwritten. */
+ movq %rsi, %rax;
+ jmp .Lcbc_dec_continue;
+
+.Lcbc_dec_use_stack:
+ /*
+ * dst still in-use (because dst == src), so use stack for temporary
+ * storage.
+ */
+ subq $(16 * 32), %rsp;
+ movq %rsp, %rax;
+
+.Lcbc_dec_continue:
+ call __camellia_dec_blk32;
+
+ vmovdqu %ymm7, (%rax);
+ vpxor %ymm7, %ymm7, %ymm7;
+ vinserti128 $1, (%rdx), %ymm7, %ymm7;
+ vpxor (%rax), %ymm7, %ymm7;
+ movq %r10, %rsp;
+ vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
+ vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
+ vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
+ vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
+ vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
+ vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
+ vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
+ vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
+ vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
+ vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
+ vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
+ vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
+ vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
+ vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
+ vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(camellia_cbc_dec_32way)
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
+ vpcmpeqq minus_one, x, tmp1; \
+ vpcmpeqq minus_two, x, tmp2; \
+ vpsubq minus_two, x, x; \
+ vpor tmp2, tmp1, tmp1; \
+ vpslldq $8, tmp1, tmp1; \
+ vpsubq tmp1, x, x;
+
+ENTRY(camellia_ctr_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ * %rcx: iv (little endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ movq %rsp, %r10;
+ cmpq %rsi, %rdx;
+ je .Lctr_use_stack;
+
+ /* dst can be used as temporary storage, src is not overwritten. */
+ movq %rsi, %rax;
+ jmp .Lctr_continue;
+
+.Lctr_use_stack:
+ subq $(16 * 32), %rsp;
+ movq %rsp, %rax;
+
+.Lctr_continue:
+ vpcmpeqd %ymm15, %ymm15, %ymm15;
+ vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
+ vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */
+
+ /* load IV and byteswap */
+ vmovdqu (%rcx), %xmm0;
+ vmovdqa %xmm0, %xmm1;
+ inc_le128(%xmm0, %xmm15, %xmm14);
+ vbroadcasti128 .Lbswap128_mask, %ymm14;
+ vinserti128 $1, %xmm0, %ymm1, %ymm0;
+ vpshufb %ymm14, %ymm0, %ymm13;
+ vmovdqu %ymm13, 15 * 32(%rax);
+
+ /* construct IVs */
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
+ vpshufb %ymm14, %ymm0, %ymm13;
+ vmovdqu %ymm13, 14 * 32(%rax);
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm13;
+ vmovdqu %ymm13, 13 * 32(%rax);
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm13;
+ vmovdqu %ymm13, 12 * 32(%rax);
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm13;
+ vmovdqu %ymm13, 11 * 32(%rax);
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm10;
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm9;
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm8;
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm7;
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm6;
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm5;
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm4;
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm3;
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm2;
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vpshufb %ymm14, %ymm0, %ymm1;
+ add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+ vextracti128 $1, %ymm0, %xmm13;
+ vpshufb %ymm14, %ymm0, %ymm0;
+ inc_le128(%xmm13, %xmm15, %xmm14);
+ vmovdqu %xmm13, (%rcx);
+
+ /* inpack32_pre: */
+ vpbroadcastq (key_table)(CTX), %ymm15;
+ vpshufb .Lpack_bswap, %ymm15, %ymm15;
+ vpxor %ymm0, %ymm15, %ymm0;
+ vpxor %ymm1, %ymm15, %ymm1;
+ vpxor %ymm2, %ymm15, %ymm2;
+ vpxor %ymm3, %ymm15, %ymm3;
+ vpxor %ymm4, %ymm15, %ymm4;
+ vpxor %ymm5, %ymm15, %ymm5;
+ vpxor %ymm6, %ymm15, %ymm6;
+ vpxor %ymm7, %ymm15, %ymm7;
+ vpxor %ymm8, %ymm15, %ymm8;
+ vpxor %ymm9, %ymm15, %ymm9;
+ vpxor %ymm10, %ymm15, %ymm10;
+ vpxor 11 * 32(%rax), %ymm15, %ymm11;
+ vpxor 12 * 32(%rax), %ymm15, %ymm12;
+ vpxor 13 * 32(%rax), %ymm15, %ymm13;
+ vpxor 14 * 32(%rax), %ymm15, %ymm14;
+ vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+ call __camellia_enc_blk32;
+
+ movq %r10, %rsp;
+
+ vpxor 0 * 32(%rdx), %ymm7, %ymm7;
+ vpxor 1 * 32(%rdx), %ymm6, %ymm6;
+ vpxor 2 * 32(%rdx), %ymm5, %ymm5;
+ vpxor 3 * 32(%rdx), %ymm4, %ymm4;
+ vpxor 4 * 32(%rdx), %ymm3, %ymm3;
+ vpxor 5 * 32(%rdx), %ymm2, %ymm2;
+ vpxor 6 * 32(%rdx), %ymm1, %ymm1;
+ vpxor 7 * 32(%rdx), %ymm0, %ymm0;
+ vpxor 8 * 32(%rdx), %ymm15, %ymm15;
+ vpxor 9 * 32(%rdx), %ymm14, %ymm14;
+ vpxor 10 * 32(%rdx), %ymm13, %ymm13;
+ vpxor 11 * 32(%rdx), %ymm12, %ymm12;
+ vpxor 12 * 32(%rdx), %ymm11, %ymm11;
+ vpxor 13 * 32(%rdx), %ymm10, %ymm10;
+ vpxor 14 * 32(%rdx), %ymm9, %ymm9;
+ vpxor 15 * 32(%rdx), %ymm8, %ymm8;
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(camellia_ctr_32way)
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+ vpsrad $31, iv, tmp; \
+ vpaddq iv, iv, iv; \
+ vpshufd $0x13, tmp, tmp; \
+ vpand mask, tmp, tmp; \
+ vpxor tmp, iv, iv;
+
+#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
+ vpsrad $31, iv, tmp0; \
+ vpaddq iv, iv, tmp1; \
+ vpsllq $2, iv, iv; \
+ vpshufd $0x13, tmp0, tmp0; \
+ vpsrad $31, tmp1, tmp1; \
+ vpand mask2, tmp0, tmp0; \
+ vpshufd $0x13, tmp1, tmp1; \
+ vpxor tmp0, iv, iv; \
+ vpand mask1, tmp1, tmp1; \
+ vpxor tmp1, iv, iv;
+
+.align 8
+camellia_xts_crypt_32way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ * %r8: index for input whitening key
+ * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ subq $(16 * 32), %rsp;
+ movq %rsp, %rax;
+
+ vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;
+
+ /* load IV and construct second IV */
+ vmovdqu (%rcx), %xmm0;
+ vmovdqa %xmm0, %xmm15;
+ gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
+ vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
+ vinserti128 $1, %xmm0, %ymm15, %ymm0;
+ vpxor 0 * 32(%rdx), %ymm0, %ymm15;
+ vmovdqu %ymm15, 15 * 32(%rax);
+ vmovdqu %ymm0, 0 * 32(%rsi);
+
+ /* construct IVs */
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 1 * 32(%rdx), %ymm0, %ymm15;
+ vmovdqu %ymm15, 14 * 32(%rax);
+ vmovdqu %ymm0, 1 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 2 * 32(%rdx), %ymm0, %ymm15;
+ vmovdqu %ymm15, 13 * 32(%rax);
+ vmovdqu %ymm0, 2 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 3 * 32(%rdx), %ymm0, %ymm15;
+ vmovdqu %ymm15, 12 * 32(%rax);
+ vmovdqu %ymm0, 3 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 4 * 32(%rdx), %ymm0, %ymm11;
+ vmovdqu %ymm0, 4 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 5 * 32(%rdx), %ymm0, %ymm10;
+ vmovdqu %ymm0, 5 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 6 * 32(%rdx), %ymm0, %ymm9;
+ vmovdqu %ymm0, 6 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 7 * 32(%rdx), %ymm0, %ymm8;
+ vmovdqu %ymm0, 7 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 8 * 32(%rdx), %ymm0, %ymm7;
+ vmovdqu %ymm0, 8 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 9 * 32(%rdx), %ymm0, %ymm6;
+ vmovdqu %ymm0, 9 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 10 * 32(%rdx), %ymm0, %ymm5;
+ vmovdqu %ymm0, 10 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 11 * 32(%rdx), %ymm0, %ymm4;
+ vmovdqu %ymm0, 11 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 12 * 32(%rdx), %ymm0, %ymm3;
+ vmovdqu %ymm0, 12 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 13 * 32(%rdx), %ymm0, %ymm2;
+ vmovdqu %ymm0, 13 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 14 * 32(%rdx), %ymm0, %ymm1;
+ vmovdqu %ymm0, 14 * 32(%rsi);
+
+ gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+ vpxor 15 * 32(%rdx), %ymm0, %ymm15;
+ vmovdqu %ymm15, 0 * 32(%rax);
+ vmovdqu %ymm0, 15 * 32(%rsi);
+
+ vextracti128 $1, %ymm0, %xmm0;
+ gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
+ vmovdqu %xmm0, (%rcx);
+
+ /* inpack32_pre: */
+ vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
+ vpshufb .Lpack_bswap, %ymm15, %ymm15;
+ vpxor 0 * 32(%rax), %ymm15, %ymm0;
+ vpxor %ymm1, %ymm15, %ymm1;
+ vpxor %ymm2, %ymm15, %ymm2;
+ vpxor %ymm3, %ymm15, %ymm3;
+ vpxor %ymm4, %ymm15, %ymm4;
+ vpxor %ymm5, %ymm15, %ymm5;
+ vpxor %ymm6, %ymm15, %ymm6;
+ vpxor %ymm7, %ymm15, %ymm7;
+ vpxor %ymm8, %ymm15, %ymm8;
+ vpxor %ymm9, %ymm15, %ymm9;
+ vpxor %ymm10, %ymm15, %ymm10;
+ vpxor %ymm11, %ymm15, %ymm11;
+ vpxor 12 * 32(%rax), %ymm15, %ymm12;
+ vpxor 13 * 32(%rax), %ymm15, %ymm13;
+ vpxor 14 * 32(%rax), %ymm15, %ymm14;
+ vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+ CALL_NOSPEC %r9;
+
+ addq $(16 * 32), %rsp;
+
+ vpxor 0 * 32(%rsi), %ymm7, %ymm7;
+ vpxor 1 * 32(%rsi), %ymm6, %ymm6;
+ vpxor 2 * 32(%rsi), %ymm5, %ymm5;
+ vpxor 3 * 32(%rsi), %ymm4, %ymm4;
+ vpxor 4 * 32(%rsi), %ymm3, %ymm3;
+ vpxor 5 * 32(%rsi), %ymm2, %ymm2;
+ vpxor 6 * 32(%rsi), %ymm1, %ymm1;
+ vpxor 7 * 32(%rsi), %ymm0, %ymm0;
+ vpxor 8 * 32(%rsi), %ymm15, %ymm15;
+ vpxor 9 * 32(%rsi), %ymm14, %ymm14;
+ vpxor 10 * 32(%rsi), %ymm13, %ymm13;
+ vpxor 11 * 32(%rsi), %ymm12, %ymm12;
+ vpxor 12 * 32(%rsi), %ymm11, %ymm11;
+ vpxor 13 * 32(%rsi), %ymm10, %ymm10;
+ vpxor 14 * 32(%rsi), %ymm9, %ymm9;
+ vpxor 15 * 32(%rsi), %ymm8, %ymm8;
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(camellia_xts_crypt_32way)
+
+ENTRY(camellia_xts_enc_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+
+ xorl %r8d, %r8d; /* input whitening key, 0 for enc */
+
+ leaq __camellia_enc_blk32, %r9;
+
+ jmp camellia_xts_crypt_32way;
+ENDPROC(camellia_xts_enc_32way)
+
+ENTRY(camellia_xts_dec_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+
+ cmpl $16, key_length(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* input whitening key, last for dec */
+
+ leaq __camellia_dec_blk32, %r9;
+
+ jmp camellia_xts_crypt_32way;
+ENDPROC(camellia_xts_dec_32way)
diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
new file mode 100644
index 000000000..95ba6956a
--- /dev/null
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -0,0 +1,514 @@
+/*
+ * Camellia Cipher Algorithm (x86_64)
+ *
+ * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/linkage.h>
+
+.file "camellia-x86_64-asm_64.S"
+.text
+
+.extern camellia_sp10011110;
+.extern camellia_sp22000222;
+.extern camellia_sp03303033;
+.extern camellia_sp00444404;
+.extern camellia_sp02220222;
+.extern camellia_sp30333033;
+.extern camellia_sp44044404;
+.extern camellia_sp11101110;
+
+#define sp10011110 camellia_sp10011110
+#define sp22000222 camellia_sp22000222
+#define sp03303033 camellia_sp03303033
+#define sp00444404 camellia_sp00444404
+#define sp02220222 camellia_sp02220222
+#define sp30333033 camellia_sp30333033
+#define sp44044404 camellia_sp44044404
+#define sp11101110 camellia_sp11101110
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct camellia_ctx: */
+#define key_table 0
+#define key_length CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %rdi
+#define RIO %rsi
+#define RIOd %esi
+
+#define RAB0 %rax
+#define RCD0 %rcx
+#define RAB1 %rbx
+#define RCD1 %rdx
+
+#define RAB0d %eax
+#define RCD0d %ecx
+#define RAB1d %ebx
+#define RCD1d %edx
+
+#define RAB0bl %al
+#define RCD0bl %cl
+#define RAB1bl %bl
+#define RCD1bl %dl
+
+#define RAB0bh %ah
+#define RCD0bh %ch
+#define RAB1bh %bh
+#define RCD1bh %dh
+
+#define RT0 %rsi
+#define RT1 %r12
+#define RT2 %r8
+
+#define RT0d %esi
+#define RT1d %r12d
+#define RT2d %r8d
+
+#define RT2bl %r8b
+
+#define RXOR %r9
+#define RR12 %r10
+#define RDST %r11
+
+#define RXORd %r9d
+#define RXORbl %r9b
+
+#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
+ movzbl ab ## bl, tmp2 ## d; \
+ movzbl ab ## bh, tmp1 ## d; \
+ rorq $16, ab; \
+ xorq T0(, tmp2, 8), dst; \
+ xorq T1(, tmp1, 8), dst;
+
+/**********************************************************************
+ 1-way camellia
+ **********************************************************************/
+#define roundsm(ab, subkey, cd) \
+ movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
+ \
+ xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
+ xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
+ xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
+ xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
+ \
+ xorq RT2, cd ## 0;
+
+#define fls(l, r, kl, kr) \
+ movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
+ andl l ## 0d, RT0d; \
+ roll $1, RT0d; \
+ shlq $32, RT0; \
+ xorq RT0, l ## 0; \
+ movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
+ orq r ## 0, RT1; \
+ shrq $32, RT1; \
+ xorq RT1, r ## 0; \
+ \
+ movq (key_table + ((kl) * 2) * 4)(CTX), RT2; \
+ orq l ## 0, RT2; \
+ shrq $32, RT2; \
+ xorq RT2, l ## 0; \
+ movl (key_table + ((kr) * 2) * 4)(CTX), RT0d; \
+ andl r ## 0d, RT0d; \
+ roll $1, RT0d; \
+ shlq $32, RT0; \
+ xorq RT0, r ## 0;
+
+#define enc_rounds(i) \
+ roundsm(RAB, i + 2, RCD); \
+ roundsm(RCD, i + 3, RAB); \
+ roundsm(RAB, i + 4, RCD); \
+ roundsm(RCD, i + 5, RAB); \
+ roundsm(RAB, i + 6, RCD); \
+ roundsm(RCD, i + 7, RAB);
+
+#define enc_fls(i) \
+ fls(RAB, RCD, i + 0, i + 1);
+
+#define enc_inpack() \
+ movq (RIO), RAB0; \
+ bswapq RAB0; \
+ rolq $32, RAB0; \
+ movq 4*2(RIO), RCD0; \
+ bswapq RCD0; \
+ rorq $32, RCD0; \
+ xorq key_table(CTX), RAB0;
+
+#define enc_outunpack(op, max) \
+ xorq key_table(CTX, max, 8), RCD0; \
+ rorq $32, RCD0; \
+ bswapq RCD0; \
+ op ## q RCD0, (RIO); \
+ rolq $32, RAB0; \
+ bswapq RAB0; \
+ op ## q RAB0, 4*2(RIO);
+
+#define dec_rounds(i) \
+ roundsm(RAB, i + 7, RCD); \
+ roundsm(RCD, i + 6, RAB); \
+ roundsm(RAB, i + 5, RCD); \
+ roundsm(RCD, i + 4, RAB); \
+ roundsm(RAB, i + 3, RCD); \
+ roundsm(RCD, i + 2, RAB);
+
+#define dec_fls(i) \
+ fls(RAB, RCD, i + 1, i + 0);
+
+#define dec_inpack(max) \
+ movq (RIO), RAB0; \
+ bswapq RAB0; \
+ rolq $32, RAB0; \
+ movq 4*2(RIO), RCD0; \
+ bswapq RCD0; \
+ rorq $32, RCD0; \
+ xorq key_table(CTX, max, 8), RAB0;
+
+#define dec_outunpack() \
+ xorq key_table(CTX), RCD0; \
+ rorq $32, RCD0; \
+ bswapq RCD0; \
+ movq RCD0, (RIO); \
+ rolq $32, RAB0; \
+ bswapq RAB0; \
+ movq RAB0, 4*2(RIO);
+
+ENTRY(__camellia_enc_blk)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: bool xor
+ */
+ movq %r12, RR12;
+
+ movq %rcx, RXOR;
+ movq %rsi, RDST;
+ movq %rdx, RIO;
+
+ enc_inpack();
+
+ enc_rounds(0);
+ enc_fls(8);
+ enc_rounds(8);
+ enc_fls(16);
+ enc_rounds(16);
+ movl $24, RT1d; /* max */
+
+ cmpb $16, key_length(CTX);
+ je .L__enc_done;
+
+ enc_fls(24);
+ enc_rounds(24);
+ movl $32, RT1d; /* max */
+
+.L__enc_done:
+ testb RXORbl, RXORbl;
+ movq RDST, RIO;
+
+ jnz .L__enc_xor;
+
+ enc_outunpack(mov, RT1);
+
+ movq RR12, %r12;
+ ret;
+
+.L__enc_xor:
+ enc_outunpack(xor, RT1);
+
+ movq RR12, %r12;
+ ret;
+ENDPROC(__camellia_enc_blk)
+
+ENTRY(camellia_dec_blk)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ cmpl $16, key_length(CTX);
+ movl $32, RT2d;
+ movl $24, RXORd;
+ cmovel RXORd, RT2d; /* max */
+
+ movq %r12, RR12;
+ movq %rsi, RDST;
+ movq %rdx, RIO;
+
+ dec_inpack(RT2);
+
+ cmpb $24, RT2bl;
+ je .L__dec_rounds16;
+
+ dec_rounds(24);
+ dec_fls(24);
+
+.L__dec_rounds16:
+ dec_rounds(16);
+ dec_fls(16);
+ dec_rounds(8);
+ dec_fls(8);
+ dec_rounds(0);
+
+ movq RDST, RIO;
+
+ dec_outunpack();
+
+ movq RR12, %r12;
+ ret;
+ENDPROC(camellia_dec_blk)
+
+/**********************************************************************
+ 2-way camellia
+ **********************************************************************/
+#define roundsm2(ab, subkey, cd) \
+ movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
+ xorq RT2, cd ## 1; \
+ \
+ xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
+ xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
+ xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
+ xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
+ \
+ xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
+ xorq RT2, cd ## 0; \
+ xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
+ xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
+ xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);
+
+#define fls2(l, r, kl, kr) \
+ movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
+ andl l ## 0d, RT0d; \
+ roll $1, RT0d; \
+ shlq $32, RT0; \
+ xorq RT0, l ## 0; \
+ movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
+ orq r ## 0, RT1; \
+ shrq $32, RT1; \
+ xorq RT1, r ## 0; \
+ \
+ movl (key_table + ((kl) * 2) * 4)(CTX), RT2d; \
+ andl l ## 1d, RT2d; \
+ roll $1, RT2d; \
+ shlq $32, RT2; \
+ xorq RT2, l ## 1; \
+ movq (key_table + ((kr) * 2) * 4)(CTX), RT0; \
+ orq r ## 1, RT0; \
+ shrq $32, RT0; \
+ xorq RT0, r ## 1; \
+ \
+ movq (key_table + ((kl) * 2) * 4)(CTX), RT1; \
+ orq l ## 0, RT1; \
+ shrq $32, RT1; \
+ xorq RT1, l ## 0; \
+ movl (key_table + ((kr) * 2) * 4)(CTX), RT2d; \
+ andl r ## 0d, RT2d; \
+ roll $1, RT2d; \
+ shlq $32, RT2; \
+ xorq RT2, r ## 0; \
+ \
+ movq (key_table + ((kl) * 2) * 4)(CTX), RT0; \
+ orq l ## 1, RT0; \
+ shrq $32, RT0; \
+ xorq RT0, l ## 1; \
+ movl (key_table + ((kr) * 2) * 4)(CTX), RT1d; \
+ andl r ## 1d, RT1d; \
+ roll $1, RT1d; \
+ shlq $32, RT1; \
+ xorq RT1, r ## 1;
+
+#define enc_rounds2(i) \
+ roundsm2(RAB, i + 2, RCD); \
+ roundsm2(RCD, i + 3, RAB); \
+ roundsm2(RAB, i + 4, RCD); \
+ roundsm2(RCD, i + 5, RAB); \
+ roundsm2(RAB, i + 6, RCD); \
+ roundsm2(RCD, i + 7, RAB);
+
+#define enc_fls2(i) \
+ fls2(RAB, RCD, i + 0, i + 1);
+
+#define enc_inpack2() \
+ movq (RIO), RAB0; \
+ bswapq RAB0; \
+ rorq $32, RAB0; \
+ movq 4*2(RIO), RCD0; \
+ bswapq RCD0; \
+ rolq $32, RCD0; \
+ xorq key_table(CTX), RAB0; \
+ \
+ movq 8*2(RIO), RAB1; \
+ bswapq RAB1; \
+ rorq $32, RAB1; \
+ movq 12*2(RIO), RCD1; \
+ bswapq RCD1; \
+ rolq $32, RCD1; \
+ xorq key_table(CTX), RAB1;
+
+#define enc_outunpack2(op, max) \
+ xorq key_table(CTX, max, 8), RCD0; \
+ rolq $32, RCD0; \
+ bswapq RCD0; \
+ op ## q RCD0, (RIO); \
+ rorq $32, RAB0; \
+ bswapq RAB0; \
+ op ## q RAB0, 4*2(RIO); \
+ \
+ xorq key_table(CTX, max, 8), RCD1; \
+ rolq $32, RCD1; \
+ bswapq RCD1; \
+ op ## q RCD1, 8*2(RIO); \
+ rorq $32, RAB1; \
+ bswapq RAB1; \
+ op ## q RAB1, 12*2(RIO);
+
+#define dec_rounds2(i) \
+ roundsm2(RAB, i + 7, RCD); \
+ roundsm2(RCD, i + 6, RAB); \
+ roundsm2(RAB, i + 5, RCD); \
+ roundsm2(RCD, i + 4, RAB); \
+ roundsm2(RAB, i + 3, RCD); \
+ roundsm2(RCD, i + 2, RAB);
+
+#define dec_fls2(i) \
+ fls2(RAB, RCD, i + 1, i + 0);
+
+#define dec_inpack2(max) \
+ movq (RIO), RAB0; \
+ bswapq RAB0; \
+ rorq $32, RAB0; \
+ movq 4*2(RIO), RCD0; \
+ bswapq RCD0; \
+ rolq $32, RCD0; \
+ xorq key_table(CTX, max, 8), RAB0; \
+ \
+ movq 8*2(RIO), RAB1; \
+ bswapq RAB1; \
+ rorq $32, RAB1; \
+ movq 12*2(RIO), RCD1; \
+ bswapq RCD1; \
+ rolq $32, RCD1; \
+ xorq key_table(CTX, max, 8), RAB1;
+
+#define dec_outunpack2() \
+ xorq key_table(CTX), RCD0; \
+ rolq $32, RCD0; \
+ bswapq RCD0; \
+ movq RCD0, (RIO); \
+ rorq $32, RAB0; \
+ bswapq RAB0; \
+ movq RAB0, 4*2(RIO); \
+ \
+ xorq key_table(CTX), RCD1; \
+ rolq $32, RCD1; \
+ bswapq RCD1; \
+ movq RCD1, 8*2(RIO); \
+ rorq $32, RAB1; \
+ bswapq RAB1; \
+ movq RAB1, 12*2(RIO);
+
+ENTRY(__camellia_enc_blk_2way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: bool xor
+ */
+ pushq %rbx;
+
+ movq %r12, RR12;
+ movq %rcx, RXOR;
+ movq %rsi, RDST;
+ movq %rdx, RIO;
+
+ enc_inpack2();
+
+ enc_rounds2(0);
+ enc_fls2(8);
+ enc_rounds2(8);
+ enc_fls2(16);
+ enc_rounds2(16);
+ movl $24, RT2d; /* max */
+
+ cmpb $16, key_length(CTX);
+ je .L__enc2_done;
+
+ enc_fls2(24);
+ enc_rounds2(24);
+ movl $32, RT2d; /* max */
+
+.L__enc2_done:
+ test RXORbl, RXORbl;
+ movq RDST, RIO;
+ jnz .L__enc2_xor;
+
+ enc_outunpack2(mov, RT2);
+
+ movq RR12, %r12;
+ popq %rbx;
+ ret;
+
+.L__enc2_xor:
+ enc_outunpack2(xor, RT2);
+
+ movq RR12, %r12;
+ popq %rbx;
+ ret;
+ENDPROC(__camellia_enc_blk_2way)
+
+ENTRY(camellia_dec_blk_2way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ cmpl $16, key_length(CTX);
+ movl $32, RT2d;
+ movl $24, RXORd;
+ cmovel RXORd, RT2d; /* max */
+
+ movq %rbx, RXOR;
+ movq %r12, RR12;
+ movq %rsi, RDST;
+ movq %rdx, RIO;
+
+ dec_inpack2(RT2);
+
+ cmpb $24, RT2bl;
+ je .L__dec2_rounds16;
+
+ dec_rounds2(24);
+ dec_fls2(24);
+
+.L__dec2_rounds16:
+ dec_rounds2(16);
+ dec_fls2(16);
+ dec_rounds2(8);
+ dec_fls2(8);
+ dec_rounds2(0);
+
+ movq RDST, RIO;
+
+ dec_outunpack2();
+
+ movq RR12, %r12;
+ movq RXOR, %rbx;
+ ret;
+ENDPROC(camellia_dec_blk_2way)
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
new file mode 100644
index 000000000..d4992e458
--- /dev/null
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -0,0 +1,300 @@
+/*
+ * Glue Code for x86_64/AVX2/AES-NI assembler optimized version of Camellia
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <asm/crypto/camellia.h>
+#include <asm/crypto/glue_helper.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/simd.h>
+#include <crypto/xts.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
+#define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32
+
+/* 32-way AVX2/AES-NI parallel cipher functions */
+asmlinkage void camellia_ecb_enc_32way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void camellia_ecb_dec_32way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+asmlinkage void camellia_cbc_dec_32way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void camellia_ctr_32way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+
+asmlinkage void camellia_xts_enc_32way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+asmlinkage void camellia_xts_dec_32way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+
+static const struct common_glue_ctx camellia_enc = {
+ .num_funcs = 4,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_32way) }
+ }, {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) }
+ }, {
+ .num_blocks = 2,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
+ } }
+};
+
+static const struct common_glue_ctx camellia_ctr = {
+ .num_funcs = 4,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_32way) }
+ }, {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) }
+ }, {
+ .num_blocks = 2,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
+ } }
+};
+
+static const struct common_glue_ctx camellia_enc_xts = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_32way) }
+ }, {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) }
+ } }
+};
+
+static const struct common_glue_ctx camellia_dec = {
+ .num_funcs = 4,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_32way) }
+ }, {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) }
+ }, {
+ .num_blocks = 2,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
+ } }
+};
+
+static const struct common_glue_ctx camellia_dec_cbc = {
+ .num_funcs = 4,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_32way) }
+ }, {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) }
+ }, {
+ .num_blocks = 2,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
+ } }
+};
+
+static const struct common_glue_ctx camellia_dec_xts = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_32way) }
+ }, {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) }
+ } }
+};
+
+static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen,
+ &tfm->base.crt_flags);
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&camellia_enc, req);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&camellia_dec, req);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(camellia_enc_blk),
+ req);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return glue_ctr_req_128bit(&camellia_ctr, req);
+}
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&camellia_enc_xts, req,
+ XTS_TWEAK_CAST(camellia_enc_blk),
+ &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&camellia_dec_xts, req,
+ XTS_TWEAK_CAST(camellia_enc_blk),
+ &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static struct skcipher_alg camellia_algs[] = {
+ {
+ .base.cra_name = "__ecb(camellia)",
+ .base.cra_driver_name = "__ecb-camellia-aesni-avx2",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .setkey = camellia_setkey,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "__cbc(camellia)",
+ .base.cra_driver_name = "__cbc-camellia-aesni-avx2",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .ivsize = CAMELLIA_BLOCK_SIZE,
+ .setkey = camellia_setkey,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "__ctr(camellia)",
+ .base.cra_driver_name = "__ctr-camellia-aesni-avx2",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .ivsize = CAMELLIA_BLOCK_SIZE,
+ .chunksize = CAMELLIA_BLOCK_SIZE,
+ .setkey = camellia_setkey,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }, {
+ .base.cra_name = "__xts(camellia)",
+ .base.cra_driver_name = "__xts-camellia-aesni-avx2",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_xts_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = 2 * CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = 2 * CAMELLIA_MAX_KEY_SIZE,
+ .ivsize = CAMELLIA_BLOCK_SIZE,
+ .setkey = xts_camellia_setkey,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
+ },
+};
+
+static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)];
+
+static int __init camellia_aesni_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX2 or AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(camellia_algs,
+ ARRAY_SIZE(camellia_algs),
+ camellia_simd_algs);
+}
+
+static void __exit camellia_aesni_fini(void)
+{
+ simd_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs),
+ camellia_simd_algs);
+}
+
+module_init(camellia_aesni_init);
+module_exit(camellia_aesni_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX2 optimized");
+MODULE_ALIAS_CRYPTO("camellia");
+MODULE_ALIAS_CRYPTO("camellia-asm");
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
new file mode 100644
index 000000000..d09f65214
--- /dev/null
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -0,0 +1,325 @@
+/*
+ * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <asm/crypto/camellia.h>
+#include <asm/crypto/glue_helper.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/simd.h>
+#include <crypto/xts.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
+
+/* 16-way parallel cipher functions (avx/aes-ni) */
+asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_ecb_enc_16way);
+
+asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way);
+
+asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way);
+
+asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(camellia_ctr_16way);
+
+asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(camellia_xts_enc_16way);
+
+asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(camellia_xts_dec_16way);
+
+void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+ GLUE_FUNC_CAST(camellia_enc_blk));
+}
+EXPORT_SYMBOL_GPL(camellia_xts_enc);
+
+void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+ GLUE_FUNC_CAST(camellia_dec_blk));
+}
+EXPORT_SYMBOL_GPL(camellia_xts_dec);
+
+static const struct common_glue_ctx camellia_enc = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) }
+ }, {
+ .num_blocks = 2,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
+ } }
+};
+
+static const struct common_glue_ctx camellia_ctr = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) }
+ }, {
+ .num_blocks = 2,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
+ } }
+};
+
+static const struct common_glue_ctx camellia_enc_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) }
+ } }
+};
+
+static const struct common_glue_ctx camellia_dec = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) }
+ }, {
+ .num_blocks = 2,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
+ } }
+};
+
+static const struct common_glue_ctx camellia_dec_cbc = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) }
+ }, {
+ .num_blocks = 2,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
+ } }
+};
+
+static const struct common_glue_ctx camellia_dec_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) }
+ } }
+};
+
+static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen,
+ &tfm->base.crt_flags);
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&camellia_enc, req);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&camellia_dec, req);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(camellia_enc_blk),
+ req);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return glue_ctr_req_128bit(&camellia_ctr, req);
+}
+
+int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ u32 *flags = &tfm->base.crt_flags;
+ int err;
+
+ err = xts_verify_key(tfm, key, keylen);
+ if (err)
+ return err;
+
+ /* first half of xts-key is for crypt */
+ err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
+ if (err)
+ return err;
+
+ /* second half of xts-key is for tweak */
+ return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
+ flags);
+}
+EXPORT_SYMBOL_GPL(xts_camellia_setkey);
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&camellia_enc_xts, req,
+ XTS_TWEAK_CAST(camellia_enc_blk),
+ &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&camellia_dec_xts, req,
+ XTS_TWEAK_CAST(camellia_enc_blk),
+ &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static struct skcipher_alg camellia_algs[] = {
+ {
+ .base.cra_name = "__ecb(camellia)",
+ .base.cra_driver_name = "__ecb-camellia-aesni",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .setkey = camellia_setkey,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "__cbc(camellia)",
+ .base.cra_driver_name = "__cbc-camellia-aesni",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .ivsize = CAMELLIA_BLOCK_SIZE,
+ .setkey = camellia_setkey,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "__ctr(camellia)",
+ .base.cra_driver_name = "__ctr-camellia-aesni",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .ivsize = CAMELLIA_BLOCK_SIZE,
+ .chunksize = CAMELLIA_BLOCK_SIZE,
+ .setkey = camellia_setkey,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }, {
+ .base.cra_name = "__xts(camellia)",
+ .base.cra_driver_name = "__xts-camellia-aesni",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_xts_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = 2 * CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = 2 * CAMELLIA_MAX_KEY_SIZE,
+ .ivsize = CAMELLIA_BLOCK_SIZE,
+ .setkey = xts_camellia_setkey,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
+ },
+};
+
+static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)];
+
+static int __init camellia_aesni_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX or AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(camellia_algs,
+ ARRAY_SIZE(camellia_algs),
+ camellia_simd_algs);
+}
+
+static void __exit camellia_aesni_fini(void)
+{
+ simd_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs),
+ camellia_simd_algs);
+}
+
+module_init(camellia_aesni_init);
+module_exit(camellia_aesni_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX optimized");
+MODULE_ALIAS_CRYPTO("camellia");
+MODULE_ALIAS_CRYPTO("camellia-asm");
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
new file mode 100644
index 000000000..dcd5e0f71
--- /dev/null
+++ b/arch/x86/crypto/camellia_glue.c
@@ -0,0 +1,1527 @@
+/*
+ * Glue Code for assembler optimized version of Camellia
+ *
+ * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Camellia parts based on code by:
+ * Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <asm/unaligned.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <crypto/algapi.h>
+#include <asm/crypto/camellia.h>
+#include <asm/crypto/glue_helper.h>
+
+/* regular block cipher functions */
+asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src, bool xor);
+EXPORT_SYMBOL_GPL(__camellia_enc_blk);
+asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_dec_blk);
+
+/* 2-way parallel cipher functions */
+asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src, bool xor);
+EXPORT_SYMBOL_GPL(__camellia_enc_blk_2way);
+asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_dec_blk_2way);
+
+static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ camellia_enc_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static void camellia_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ camellia_dec_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+/* camellia sboxes */
+__visible const u64 camellia_sp10011110[256] = {
+ 0x7000007070707000ULL, 0x8200008282828200ULL, 0x2c00002c2c2c2c00ULL,
+ 0xec0000ecececec00ULL, 0xb30000b3b3b3b300ULL, 0x2700002727272700ULL,
+ 0xc00000c0c0c0c000ULL, 0xe50000e5e5e5e500ULL, 0xe40000e4e4e4e400ULL,
+ 0x8500008585858500ULL, 0x5700005757575700ULL, 0x3500003535353500ULL,
+ 0xea0000eaeaeaea00ULL, 0x0c00000c0c0c0c00ULL, 0xae0000aeaeaeae00ULL,
+ 0x4100004141414100ULL, 0x2300002323232300ULL, 0xef0000efefefef00ULL,
+ 0x6b00006b6b6b6b00ULL, 0x9300009393939300ULL, 0x4500004545454500ULL,
+ 0x1900001919191900ULL, 0xa50000a5a5a5a500ULL, 0x2100002121212100ULL,
+ 0xed0000edededed00ULL, 0x0e00000e0e0e0e00ULL, 0x4f00004f4f4f4f00ULL,
+ 0x4e00004e4e4e4e00ULL, 0x1d00001d1d1d1d00ULL, 0x6500006565656500ULL,
+ 0x9200009292929200ULL, 0xbd0000bdbdbdbd00ULL, 0x8600008686868600ULL,
+ 0xb80000b8b8b8b800ULL, 0xaf0000afafafaf00ULL, 0x8f00008f8f8f8f00ULL,
+ 0x7c00007c7c7c7c00ULL, 0xeb0000ebebebeb00ULL, 0x1f00001f1f1f1f00ULL,
+ 0xce0000cececece00ULL, 0x3e00003e3e3e3e00ULL, 0x3000003030303000ULL,
+ 0xdc0000dcdcdcdc00ULL, 0x5f00005f5f5f5f00ULL, 0x5e00005e5e5e5e00ULL,
+ 0xc50000c5c5c5c500ULL, 0x0b00000b0b0b0b00ULL, 0x1a00001a1a1a1a00ULL,
+ 0xa60000a6a6a6a600ULL, 0xe10000e1e1e1e100ULL, 0x3900003939393900ULL,
+ 0xca0000cacacaca00ULL, 0xd50000d5d5d5d500ULL, 0x4700004747474700ULL,
+ 0x5d00005d5d5d5d00ULL, 0x3d00003d3d3d3d00ULL, 0xd90000d9d9d9d900ULL,
+ 0x0100000101010100ULL, 0x5a00005a5a5a5a00ULL, 0xd60000d6d6d6d600ULL,
+ 0x5100005151515100ULL, 0x5600005656565600ULL, 0x6c00006c6c6c6c00ULL,
+ 0x4d00004d4d4d4d00ULL, 0x8b00008b8b8b8b00ULL, 0x0d00000d0d0d0d00ULL,
+ 0x9a00009a9a9a9a00ULL, 0x6600006666666600ULL, 0xfb0000fbfbfbfb00ULL,
+ 0xcc0000cccccccc00ULL, 0xb00000b0b0b0b000ULL, 0x2d00002d2d2d2d00ULL,
+ 0x7400007474747400ULL, 0x1200001212121200ULL, 0x2b00002b2b2b2b00ULL,
+ 0x2000002020202000ULL, 0xf00000f0f0f0f000ULL, 0xb10000b1b1b1b100ULL,
+ 0x8400008484848400ULL, 0x9900009999999900ULL, 0xdf0000dfdfdfdf00ULL,
+ 0x4c00004c4c4c4c00ULL, 0xcb0000cbcbcbcb00ULL, 0xc20000c2c2c2c200ULL,
+ 0x3400003434343400ULL, 0x7e00007e7e7e7e00ULL, 0x7600007676767600ULL,
+ 0x0500000505050500ULL, 0x6d00006d6d6d6d00ULL, 0xb70000b7b7b7b700ULL,
+ 0xa90000a9a9a9a900ULL, 0x3100003131313100ULL, 0xd10000d1d1d1d100ULL,
+ 0x1700001717171700ULL, 0x0400000404040400ULL, 0xd70000d7d7d7d700ULL,
+ 0x1400001414141400ULL, 0x5800005858585800ULL, 0x3a00003a3a3a3a00ULL,
+ 0x6100006161616100ULL, 0xde0000dededede00ULL, 0x1b00001b1b1b1b00ULL,
+ 0x1100001111111100ULL, 0x1c00001c1c1c1c00ULL, 0x3200003232323200ULL,
+ 0x0f00000f0f0f0f00ULL, 0x9c00009c9c9c9c00ULL, 0x1600001616161600ULL,
+ 0x5300005353535300ULL, 0x1800001818181800ULL, 0xf20000f2f2f2f200ULL,
+ 0x2200002222222200ULL, 0xfe0000fefefefe00ULL, 0x4400004444444400ULL,
+ 0xcf0000cfcfcfcf00ULL, 0xb20000b2b2b2b200ULL, 0xc30000c3c3c3c300ULL,
+ 0xb50000b5b5b5b500ULL, 0x7a00007a7a7a7a00ULL, 0x9100009191919100ULL,
+ 0x2400002424242400ULL, 0x0800000808080800ULL, 0xe80000e8e8e8e800ULL,
+ 0xa80000a8a8a8a800ULL, 0x6000006060606000ULL, 0xfc0000fcfcfcfc00ULL,
+ 0x6900006969696900ULL, 0x5000005050505000ULL, 0xaa0000aaaaaaaa00ULL,
+ 0xd00000d0d0d0d000ULL, 0xa00000a0a0a0a000ULL, 0x7d00007d7d7d7d00ULL,
+ 0xa10000a1a1a1a100ULL, 0x8900008989898900ULL, 0x6200006262626200ULL,
+ 0x9700009797979700ULL, 0x5400005454545400ULL, 0x5b00005b5b5b5b00ULL,
+ 0x1e00001e1e1e1e00ULL, 0x9500009595959500ULL, 0xe00000e0e0e0e000ULL,
+ 0xff0000ffffffff00ULL, 0x6400006464646400ULL, 0xd20000d2d2d2d200ULL,
+ 0x1000001010101000ULL, 0xc40000c4c4c4c400ULL, 0x0000000000000000ULL,
+ 0x4800004848484800ULL, 0xa30000a3a3a3a300ULL, 0xf70000f7f7f7f700ULL,
+ 0x7500007575757500ULL, 0xdb0000dbdbdbdb00ULL, 0x8a00008a8a8a8a00ULL,
+ 0x0300000303030300ULL, 0xe60000e6e6e6e600ULL, 0xda0000dadadada00ULL,
+ 0x0900000909090900ULL, 0x3f00003f3f3f3f00ULL, 0xdd0000dddddddd00ULL,
+ 0x9400009494949400ULL, 0x8700008787878700ULL, 0x5c00005c5c5c5c00ULL,
+ 0x8300008383838300ULL, 0x0200000202020200ULL, 0xcd0000cdcdcdcd00ULL,
+ 0x4a00004a4a4a4a00ULL, 0x9000009090909000ULL, 0x3300003333333300ULL,
+ 0x7300007373737300ULL, 0x6700006767676700ULL, 0xf60000f6f6f6f600ULL,
+ 0xf30000f3f3f3f300ULL, 0x9d00009d9d9d9d00ULL, 0x7f00007f7f7f7f00ULL,
+ 0xbf0000bfbfbfbf00ULL, 0xe20000e2e2e2e200ULL, 0x5200005252525200ULL,
+ 0x9b00009b9b9b9b00ULL, 0xd80000d8d8d8d800ULL, 0x2600002626262600ULL,
+ 0xc80000c8c8c8c800ULL, 0x3700003737373700ULL, 0xc60000c6c6c6c600ULL,
+ 0x3b00003b3b3b3b00ULL, 0x8100008181818100ULL, 0x9600009696969600ULL,
+ 0x6f00006f6f6f6f00ULL, 0x4b00004b4b4b4b00ULL, 0x1300001313131300ULL,
+ 0xbe0000bebebebe00ULL, 0x6300006363636300ULL, 0x2e00002e2e2e2e00ULL,
+ 0xe90000e9e9e9e900ULL, 0x7900007979797900ULL, 0xa70000a7a7a7a700ULL,
+ 0x8c00008c8c8c8c00ULL, 0x9f00009f9f9f9f00ULL, 0x6e00006e6e6e6e00ULL,
+ 0xbc0000bcbcbcbc00ULL, 0x8e00008e8e8e8e00ULL, 0x2900002929292900ULL,
+ 0xf50000f5f5f5f500ULL, 0xf90000f9f9f9f900ULL, 0xb60000b6b6b6b600ULL,
+ 0x2f00002f2f2f2f00ULL, 0xfd0000fdfdfdfd00ULL, 0xb40000b4b4b4b400ULL,
+ 0x5900005959595900ULL, 0x7800007878787800ULL, 0x9800009898989800ULL,
+ 0x0600000606060600ULL, 0x6a00006a6a6a6a00ULL, 0xe70000e7e7e7e700ULL,
+ 0x4600004646464600ULL, 0x7100007171717100ULL, 0xba0000babababa00ULL,
+ 0xd40000d4d4d4d400ULL, 0x2500002525252500ULL, 0xab0000abababab00ULL,
+ 0x4200004242424200ULL, 0x8800008888888800ULL, 0xa20000a2a2a2a200ULL,
+ 0x8d00008d8d8d8d00ULL, 0xfa0000fafafafa00ULL, 0x7200007272727200ULL,
+ 0x0700000707070700ULL, 0xb90000b9b9b9b900ULL, 0x5500005555555500ULL,
+ 0xf80000f8f8f8f800ULL, 0xee0000eeeeeeee00ULL, 0xac0000acacacac00ULL,
+ 0x0a00000a0a0a0a00ULL, 0x3600003636363600ULL, 0x4900004949494900ULL,
+ 0x2a00002a2a2a2a00ULL, 0x6800006868686800ULL, 0x3c00003c3c3c3c00ULL,
+ 0x3800003838383800ULL, 0xf10000f1f1f1f100ULL, 0xa40000a4a4a4a400ULL,
+ 0x4000004040404000ULL, 0x2800002828282800ULL, 0xd30000d3d3d3d300ULL,
+ 0x7b00007b7b7b7b00ULL, 0xbb0000bbbbbbbb00ULL, 0xc90000c9c9c9c900ULL,
+ 0x4300004343434300ULL, 0xc10000c1c1c1c100ULL, 0x1500001515151500ULL,
+ 0xe30000e3e3e3e300ULL, 0xad0000adadadad00ULL, 0xf40000f4f4f4f400ULL,
+ 0x7700007777777700ULL, 0xc70000c7c7c7c700ULL, 0x8000008080808000ULL,
+ 0x9e00009e9e9e9e00ULL,
+};
+
+__visible const u64 camellia_sp22000222[256] = {
+ 0xe0e0000000e0e0e0ULL, 0x0505000000050505ULL, 0x5858000000585858ULL,
+ 0xd9d9000000d9d9d9ULL, 0x6767000000676767ULL, 0x4e4e0000004e4e4eULL,
+ 0x8181000000818181ULL, 0xcbcb000000cbcbcbULL, 0xc9c9000000c9c9c9ULL,
+ 0x0b0b0000000b0b0bULL, 0xaeae000000aeaeaeULL, 0x6a6a0000006a6a6aULL,
+ 0xd5d5000000d5d5d5ULL, 0x1818000000181818ULL, 0x5d5d0000005d5d5dULL,
+ 0x8282000000828282ULL, 0x4646000000464646ULL, 0xdfdf000000dfdfdfULL,
+ 0xd6d6000000d6d6d6ULL, 0x2727000000272727ULL, 0x8a8a0000008a8a8aULL,
+ 0x3232000000323232ULL, 0x4b4b0000004b4b4bULL, 0x4242000000424242ULL,
+ 0xdbdb000000dbdbdbULL, 0x1c1c0000001c1c1cULL, 0x9e9e0000009e9e9eULL,
+ 0x9c9c0000009c9c9cULL, 0x3a3a0000003a3a3aULL, 0xcaca000000cacacaULL,
+ 0x2525000000252525ULL, 0x7b7b0000007b7b7bULL, 0x0d0d0000000d0d0dULL,
+ 0x7171000000717171ULL, 0x5f5f0000005f5f5fULL, 0x1f1f0000001f1f1fULL,
+ 0xf8f8000000f8f8f8ULL, 0xd7d7000000d7d7d7ULL, 0x3e3e0000003e3e3eULL,
+ 0x9d9d0000009d9d9dULL, 0x7c7c0000007c7c7cULL, 0x6060000000606060ULL,
+ 0xb9b9000000b9b9b9ULL, 0xbebe000000bebebeULL, 0xbcbc000000bcbcbcULL,
+ 0x8b8b0000008b8b8bULL, 0x1616000000161616ULL, 0x3434000000343434ULL,
+ 0x4d4d0000004d4d4dULL, 0xc3c3000000c3c3c3ULL, 0x7272000000727272ULL,
+ 0x9595000000959595ULL, 0xabab000000abababULL, 0x8e8e0000008e8e8eULL,
+ 0xbaba000000bababaULL, 0x7a7a0000007a7a7aULL, 0xb3b3000000b3b3b3ULL,
+ 0x0202000000020202ULL, 0xb4b4000000b4b4b4ULL, 0xadad000000adadadULL,
+ 0xa2a2000000a2a2a2ULL, 0xacac000000acacacULL, 0xd8d8000000d8d8d8ULL,
+ 0x9a9a0000009a9a9aULL, 0x1717000000171717ULL, 0x1a1a0000001a1a1aULL,
+ 0x3535000000353535ULL, 0xcccc000000ccccccULL, 0xf7f7000000f7f7f7ULL,
+ 0x9999000000999999ULL, 0x6161000000616161ULL, 0x5a5a0000005a5a5aULL,
+ 0xe8e8000000e8e8e8ULL, 0x2424000000242424ULL, 0x5656000000565656ULL,
+ 0x4040000000404040ULL, 0xe1e1000000e1e1e1ULL, 0x6363000000636363ULL,
+ 0x0909000000090909ULL, 0x3333000000333333ULL, 0xbfbf000000bfbfbfULL,
+ 0x9898000000989898ULL, 0x9797000000979797ULL, 0x8585000000858585ULL,
+ 0x6868000000686868ULL, 0xfcfc000000fcfcfcULL, 0xecec000000ecececULL,
+ 0x0a0a0000000a0a0aULL, 0xdada000000dadadaULL, 0x6f6f0000006f6f6fULL,
+ 0x5353000000535353ULL, 0x6262000000626262ULL, 0xa3a3000000a3a3a3ULL,
+ 0x2e2e0000002e2e2eULL, 0x0808000000080808ULL, 0xafaf000000afafafULL,
+ 0x2828000000282828ULL, 0xb0b0000000b0b0b0ULL, 0x7474000000747474ULL,
+ 0xc2c2000000c2c2c2ULL, 0xbdbd000000bdbdbdULL, 0x3636000000363636ULL,
+ 0x2222000000222222ULL, 0x3838000000383838ULL, 0x6464000000646464ULL,
+ 0x1e1e0000001e1e1eULL, 0x3939000000393939ULL, 0x2c2c0000002c2c2cULL,
+ 0xa6a6000000a6a6a6ULL, 0x3030000000303030ULL, 0xe5e5000000e5e5e5ULL,
+ 0x4444000000444444ULL, 0xfdfd000000fdfdfdULL, 0x8888000000888888ULL,
+ 0x9f9f0000009f9f9fULL, 0x6565000000656565ULL, 0x8787000000878787ULL,
+ 0x6b6b0000006b6b6bULL, 0xf4f4000000f4f4f4ULL, 0x2323000000232323ULL,
+ 0x4848000000484848ULL, 0x1010000000101010ULL, 0xd1d1000000d1d1d1ULL,
+ 0x5151000000515151ULL, 0xc0c0000000c0c0c0ULL, 0xf9f9000000f9f9f9ULL,
+ 0xd2d2000000d2d2d2ULL, 0xa0a0000000a0a0a0ULL, 0x5555000000555555ULL,
+ 0xa1a1000000a1a1a1ULL, 0x4141000000414141ULL, 0xfafa000000fafafaULL,
+ 0x4343000000434343ULL, 0x1313000000131313ULL, 0xc4c4000000c4c4c4ULL,
+ 0x2f2f0000002f2f2fULL, 0xa8a8000000a8a8a8ULL, 0xb6b6000000b6b6b6ULL,
+ 0x3c3c0000003c3c3cULL, 0x2b2b0000002b2b2bULL, 0xc1c1000000c1c1c1ULL,
+ 0xffff000000ffffffULL, 0xc8c8000000c8c8c8ULL, 0xa5a5000000a5a5a5ULL,
+ 0x2020000000202020ULL, 0x8989000000898989ULL, 0x0000000000000000ULL,
+ 0x9090000000909090ULL, 0x4747000000474747ULL, 0xefef000000efefefULL,
+ 0xeaea000000eaeaeaULL, 0xb7b7000000b7b7b7ULL, 0x1515000000151515ULL,
+ 0x0606000000060606ULL, 0xcdcd000000cdcdcdULL, 0xb5b5000000b5b5b5ULL,
+ 0x1212000000121212ULL, 0x7e7e0000007e7e7eULL, 0xbbbb000000bbbbbbULL,
+ 0x2929000000292929ULL, 0x0f0f0000000f0f0fULL, 0xb8b8000000b8b8b8ULL,
+ 0x0707000000070707ULL, 0x0404000000040404ULL, 0x9b9b0000009b9b9bULL,
+ 0x9494000000949494ULL, 0x2121000000212121ULL, 0x6666000000666666ULL,
+ 0xe6e6000000e6e6e6ULL, 0xcece000000cececeULL, 0xeded000000edededULL,
+ 0xe7e7000000e7e7e7ULL, 0x3b3b0000003b3b3bULL, 0xfefe000000fefefeULL,
+ 0x7f7f0000007f7f7fULL, 0xc5c5000000c5c5c5ULL, 0xa4a4000000a4a4a4ULL,
+ 0x3737000000373737ULL, 0xb1b1000000b1b1b1ULL, 0x4c4c0000004c4c4cULL,
+ 0x9191000000919191ULL, 0x6e6e0000006e6e6eULL, 0x8d8d0000008d8d8dULL,
+ 0x7676000000767676ULL, 0x0303000000030303ULL, 0x2d2d0000002d2d2dULL,
+ 0xdede000000dededeULL, 0x9696000000969696ULL, 0x2626000000262626ULL,
+ 0x7d7d0000007d7d7dULL, 0xc6c6000000c6c6c6ULL, 0x5c5c0000005c5c5cULL,
+ 0xd3d3000000d3d3d3ULL, 0xf2f2000000f2f2f2ULL, 0x4f4f0000004f4f4fULL,
+ 0x1919000000191919ULL, 0x3f3f0000003f3f3fULL, 0xdcdc000000dcdcdcULL,
+ 0x7979000000797979ULL, 0x1d1d0000001d1d1dULL, 0x5252000000525252ULL,
+ 0xebeb000000ebebebULL, 0xf3f3000000f3f3f3ULL, 0x6d6d0000006d6d6dULL,
+ 0x5e5e0000005e5e5eULL, 0xfbfb000000fbfbfbULL, 0x6969000000696969ULL,
+ 0xb2b2000000b2b2b2ULL, 0xf0f0000000f0f0f0ULL, 0x3131000000313131ULL,
+ 0x0c0c0000000c0c0cULL, 0xd4d4000000d4d4d4ULL, 0xcfcf000000cfcfcfULL,
+ 0x8c8c0000008c8c8cULL, 0xe2e2000000e2e2e2ULL, 0x7575000000757575ULL,
+ 0xa9a9000000a9a9a9ULL, 0x4a4a0000004a4a4aULL, 0x5757000000575757ULL,
+ 0x8484000000848484ULL, 0x1111000000111111ULL, 0x4545000000454545ULL,
+ 0x1b1b0000001b1b1bULL, 0xf5f5000000f5f5f5ULL, 0xe4e4000000e4e4e4ULL,
+ 0x0e0e0000000e0e0eULL, 0x7373000000737373ULL, 0xaaaa000000aaaaaaULL,
+ 0xf1f1000000f1f1f1ULL, 0xdddd000000ddddddULL, 0x5959000000595959ULL,
+ 0x1414000000141414ULL, 0x6c6c0000006c6c6cULL, 0x9292000000929292ULL,
+ 0x5454000000545454ULL, 0xd0d0000000d0d0d0ULL, 0x7878000000787878ULL,
+ 0x7070000000707070ULL, 0xe3e3000000e3e3e3ULL, 0x4949000000494949ULL,
+ 0x8080000000808080ULL, 0x5050000000505050ULL, 0xa7a7000000a7a7a7ULL,
+ 0xf6f6000000f6f6f6ULL, 0x7777000000777777ULL, 0x9393000000939393ULL,
+ 0x8686000000868686ULL, 0x8383000000838383ULL, 0x2a2a0000002a2a2aULL,
+ 0xc7c7000000c7c7c7ULL, 0x5b5b0000005b5b5bULL, 0xe9e9000000e9e9e9ULL,
+ 0xeeee000000eeeeeeULL, 0x8f8f0000008f8f8fULL, 0x0101000000010101ULL,
+ 0x3d3d0000003d3d3dULL,
+};
+
+__visible const u64 camellia_sp03303033[256] = {
+ 0x0038380038003838ULL, 0x0041410041004141ULL, 0x0016160016001616ULL,
+ 0x0076760076007676ULL, 0x00d9d900d900d9d9ULL, 0x0093930093009393ULL,
+ 0x0060600060006060ULL, 0x00f2f200f200f2f2ULL, 0x0072720072007272ULL,
+ 0x00c2c200c200c2c2ULL, 0x00abab00ab00ababULL, 0x009a9a009a009a9aULL,
+ 0x0075750075007575ULL, 0x0006060006000606ULL, 0x0057570057005757ULL,
+ 0x00a0a000a000a0a0ULL, 0x0091910091009191ULL, 0x00f7f700f700f7f7ULL,
+ 0x00b5b500b500b5b5ULL, 0x00c9c900c900c9c9ULL, 0x00a2a200a200a2a2ULL,
+ 0x008c8c008c008c8cULL, 0x00d2d200d200d2d2ULL, 0x0090900090009090ULL,
+ 0x00f6f600f600f6f6ULL, 0x0007070007000707ULL, 0x00a7a700a700a7a7ULL,
+ 0x0027270027002727ULL, 0x008e8e008e008e8eULL, 0x00b2b200b200b2b2ULL,
+ 0x0049490049004949ULL, 0x00dede00de00dedeULL, 0x0043430043004343ULL,
+ 0x005c5c005c005c5cULL, 0x00d7d700d700d7d7ULL, 0x00c7c700c700c7c7ULL,
+ 0x003e3e003e003e3eULL, 0x00f5f500f500f5f5ULL, 0x008f8f008f008f8fULL,
+ 0x0067670067006767ULL, 0x001f1f001f001f1fULL, 0x0018180018001818ULL,
+ 0x006e6e006e006e6eULL, 0x00afaf00af00afafULL, 0x002f2f002f002f2fULL,
+ 0x00e2e200e200e2e2ULL, 0x0085850085008585ULL, 0x000d0d000d000d0dULL,
+ 0x0053530053005353ULL, 0x00f0f000f000f0f0ULL, 0x009c9c009c009c9cULL,
+ 0x0065650065006565ULL, 0x00eaea00ea00eaeaULL, 0x00a3a300a300a3a3ULL,
+ 0x00aeae00ae00aeaeULL, 0x009e9e009e009e9eULL, 0x00ecec00ec00ececULL,
+ 0x0080800080008080ULL, 0x002d2d002d002d2dULL, 0x006b6b006b006b6bULL,
+ 0x00a8a800a800a8a8ULL, 0x002b2b002b002b2bULL, 0x0036360036003636ULL,
+ 0x00a6a600a600a6a6ULL, 0x00c5c500c500c5c5ULL, 0x0086860086008686ULL,
+ 0x004d4d004d004d4dULL, 0x0033330033003333ULL, 0x00fdfd00fd00fdfdULL,
+ 0x0066660066006666ULL, 0x0058580058005858ULL, 0x0096960096009696ULL,
+ 0x003a3a003a003a3aULL, 0x0009090009000909ULL, 0x0095950095009595ULL,
+ 0x0010100010001010ULL, 0x0078780078007878ULL, 0x00d8d800d800d8d8ULL,
+ 0x0042420042004242ULL, 0x00cccc00cc00ccccULL, 0x00efef00ef00efefULL,
+ 0x0026260026002626ULL, 0x00e5e500e500e5e5ULL, 0x0061610061006161ULL,
+ 0x001a1a001a001a1aULL, 0x003f3f003f003f3fULL, 0x003b3b003b003b3bULL,
+ 0x0082820082008282ULL, 0x00b6b600b600b6b6ULL, 0x00dbdb00db00dbdbULL,
+ 0x00d4d400d400d4d4ULL, 0x0098980098009898ULL, 0x00e8e800e800e8e8ULL,
+ 0x008b8b008b008b8bULL, 0x0002020002000202ULL, 0x00ebeb00eb00ebebULL,
+ 0x000a0a000a000a0aULL, 0x002c2c002c002c2cULL, 0x001d1d001d001d1dULL,
+ 0x00b0b000b000b0b0ULL, 0x006f6f006f006f6fULL, 0x008d8d008d008d8dULL,
+ 0x0088880088008888ULL, 0x000e0e000e000e0eULL, 0x0019190019001919ULL,
+ 0x0087870087008787ULL, 0x004e4e004e004e4eULL, 0x000b0b000b000b0bULL,
+ 0x00a9a900a900a9a9ULL, 0x000c0c000c000c0cULL, 0x0079790079007979ULL,
+ 0x0011110011001111ULL, 0x007f7f007f007f7fULL, 0x0022220022002222ULL,
+ 0x00e7e700e700e7e7ULL, 0x0059590059005959ULL, 0x00e1e100e100e1e1ULL,
+ 0x00dada00da00dadaULL, 0x003d3d003d003d3dULL, 0x00c8c800c800c8c8ULL,
+ 0x0012120012001212ULL, 0x0004040004000404ULL, 0x0074740074007474ULL,
+ 0x0054540054005454ULL, 0x0030300030003030ULL, 0x007e7e007e007e7eULL,
+ 0x00b4b400b400b4b4ULL, 0x0028280028002828ULL, 0x0055550055005555ULL,
+ 0x0068680068006868ULL, 0x0050500050005050ULL, 0x00bebe00be00bebeULL,
+ 0x00d0d000d000d0d0ULL, 0x00c4c400c400c4c4ULL, 0x0031310031003131ULL,
+ 0x00cbcb00cb00cbcbULL, 0x002a2a002a002a2aULL, 0x00adad00ad00adadULL,
+ 0x000f0f000f000f0fULL, 0x00caca00ca00cacaULL, 0x0070700070007070ULL,
+ 0x00ffff00ff00ffffULL, 0x0032320032003232ULL, 0x0069690069006969ULL,
+ 0x0008080008000808ULL, 0x0062620062006262ULL, 0x0000000000000000ULL,
+ 0x0024240024002424ULL, 0x00d1d100d100d1d1ULL, 0x00fbfb00fb00fbfbULL,
+ 0x00baba00ba00babaULL, 0x00eded00ed00ededULL, 0x0045450045004545ULL,
+ 0x0081810081008181ULL, 0x0073730073007373ULL, 0x006d6d006d006d6dULL,
+ 0x0084840084008484ULL, 0x009f9f009f009f9fULL, 0x00eeee00ee00eeeeULL,
+ 0x004a4a004a004a4aULL, 0x00c3c300c300c3c3ULL, 0x002e2e002e002e2eULL,
+ 0x00c1c100c100c1c1ULL, 0x0001010001000101ULL, 0x00e6e600e600e6e6ULL,
+ 0x0025250025002525ULL, 0x0048480048004848ULL, 0x0099990099009999ULL,
+ 0x00b9b900b900b9b9ULL, 0x00b3b300b300b3b3ULL, 0x007b7b007b007b7bULL,
+ 0x00f9f900f900f9f9ULL, 0x00cece00ce00ceceULL, 0x00bfbf00bf00bfbfULL,
+ 0x00dfdf00df00dfdfULL, 0x0071710071007171ULL, 0x0029290029002929ULL,
+ 0x00cdcd00cd00cdcdULL, 0x006c6c006c006c6cULL, 0x0013130013001313ULL,
+ 0x0064640064006464ULL, 0x009b9b009b009b9bULL, 0x0063630063006363ULL,
+ 0x009d9d009d009d9dULL, 0x00c0c000c000c0c0ULL, 0x004b4b004b004b4bULL,
+ 0x00b7b700b700b7b7ULL, 0x00a5a500a500a5a5ULL, 0x0089890089008989ULL,
+ 0x005f5f005f005f5fULL, 0x00b1b100b100b1b1ULL, 0x0017170017001717ULL,
+ 0x00f4f400f400f4f4ULL, 0x00bcbc00bc00bcbcULL, 0x00d3d300d300d3d3ULL,
+ 0x0046460046004646ULL, 0x00cfcf00cf00cfcfULL, 0x0037370037003737ULL,
+ 0x005e5e005e005e5eULL, 0x0047470047004747ULL, 0x0094940094009494ULL,
+ 0x00fafa00fa00fafaULL, 0x00fcfc00fc00fcfcULL, 0x005b5b005b005b5bULL,
+ 0x0097970097009797ULL, 0x00fefe00fe00fefeULL, 0x005a5a005a005a5aULL,
+ 0x00acac00ac00acacULL, 0x003c3c003c003c3cULL, 0x004c4c004c004c4cULL,
+ 0x0003030003000303ULL, 0x0035350035003535ULL, 0x00f3f300f300f3f3ULL,
+ 0x0023230023002323ULL, 0x00b8b800b800b8b8ULL, 0x005d5d005d005d5dULL,
+ 0x006a6a006a006a6aULL, 0x0092920092009292ULL, 0x00d5d500d500d5d5ULL,
+ 0x0021210021002121ULL, 0x0044440044004444ULL, 0x0051510051005151ULL,
+ 0x00c6c600c600c6c6ULL, 0x007d7d007d007d7dULL, 0x0039390039003939ULL,
+ 0x0083830083008383ULL, 0x00dcdc00dc00dcdcULL, 0x00aaaa00aa00aaaaULL,
+ 0x007c7c007c007c7cULL, 0x0077770077007777ULL, 0x0056560056005656ULL,
+ 0x0005050005000505ULL, 0x001b1b001b001b1bULL, 0x00a4a400a400a4a4ULL,
+ 0x0015150015001515ULL, 0x0034340034003434ULL, 0x001e1e001e001e1eULL,
+ 0x001c1c001c001c1cULL, 0x00f8f800f800f8f8ULL, 0x0052520052005252ULL,
+ 0x0020200020002020ULL, 0x0014140014001414ULL, 0x00e9e900e900e9e9ULL,
+ 0x00bdbd00bd00bdbdULL, 0x00dddd00dd00ddddULL, 0x00e4e400e400e4e4ULL,
+ 0x00a1a100a100a1a1ULL, 0x00e0e000e000e0e0ULL, 0x008a8a008a008a8aULL,
+ 0x00f1f100f100f1f1ULL, 0x00d6d600d600d6d6ULL, 0x007a7a007a007a7aULL,
+ 0x00bbbb00bb00bbbbULL, 0x00e3e300e300e3e3ULL, 0x0040400040004040ULL,
+ 0x004f4f004f004f4fULL,
+};
+
+__visible const u64 camellia_sp00444404[256] = {
+ 0x0000707070700070ULL, 0x00002c2c2c2c002cULL, 0x0000b3b3b3b300b3ULL,
+ 0x0000c0c0c0c000c0ULL, 0x0000e4e4e4e400e4ULL, 0x0000575757570057ULL,
+ 0x0000eaeaeaea00eaULL, 0x0000aeaeaeae00aeULL, 0x0000232323230023ULL,
+ 0x00006b6b6b6b006bULL, 0x0000454545450045ULL, 0x0000a5a5a5a500a5ULL,
+ 0x0000edededed00edULL, 0x00004f4f4f4f004fULL, 0x00001d1d1d1d001dULL,
+ 0x0000929292920092ULL, 0x0000868686860086ULL, 0x0000afafafaf00afULL,
+ 0x00007c7c7c7c007cULL, 0x00001f1f1f1f001fULL, 0x00003e3e3e3e003eULL,
+ 0x0000dcdcdcdc00dcULL, 0x00005e5e5e5e005eULL, 0x00000b0b0b0b000bULL,
+ 0x0000a6a6a6a600a6ULL, 0x0000393939390039ULL, 0x0000d5d5d5d500d5ULL,
+ 0x00005d5d5d5d005dULL, 0x0000d9d9d9d900d9ULL, 0x00005a5a5a5a005aULL,
+ 0x0000515151510051ULL, 0x00006c6c6c6c006cULL, 0x00008b8b8b8b008bULL,
+ 0x00009a9a9a9a009aULL, 0x0000fbfbfbfb00fbULL, 0x0000b0b0b0b000b0ULL,
+ 0x0000747474740074ULL, 0x00002b2b2b2b002bULL, 0x0000f0f0f0f000f0ULL,
+ 0x0000848484840084ULL, 0x0000dfdfdfdf00dfULL, 0x0000cbcbcbcb00cbULL,
+ 0x0000343434340034ULL, 0x0000767676760076ULL, 0x00006d6d6d6d006dULL,
+ 0x0000a9a9a9a900a9ULL, 0x0000d1d1d1d100d1ULL, 0x0000040404040004ULL,
+ 0x0000141414140014ULL, 0x00003a3a3a3a003aULL, 0x0000dededede00deULL,
+ 0x0000111111110011ULL, 0x0000323232320032ULL, 0x00009c9c9c9c009cULL,
+ 0x0000535353530053ULL, 0x0000f2f2f2f200f2ULL, 0x0000fefefefe00feULL,
+ 0x0000cfcfcfcf00cfULL, 0x0000c3c3c3c300c3ULL, 0x00007a7a7a7a007aULL,
+ 0x0000242424240024ULL, 0x0000e8e8e8e800e8ULL, 0x0000606060600060ULL,
+ 0x0000696969690069ULL, 0x0000aaaaaaaa00aaULL, 0x0000a0a0a0a000a0ULL,
+ 0x0000a1a1a1a100a1ULL, 0x0000626262620062ULL, 0x0000545454540054ULL,
+ 0x00001e1e1e1e001eULL, 0x0000e0e0e0e000e0ULL, 0x0000646464640064ULL,
+ 0x0000101010100010ULL, 0x0000000000000000ULL, 0x0000a3a3a3a300a3ULL,
+ 0x0000757575750075ULL, 0x00008a8a8a8a008aULL, 0x0000e6e6e6e600e6ULL,
+ 0x0000090909090009ULL, 0x0000dddddddd00ddULL, 0x0000878787870087ULL,
+ 0x0000838383830083ULL, 0x0000cdcdcdcd00cdULL, 0x0000909090900090ULL,
+ 0x0000737373730073ULL, 0x0000f6f6f6f600f6ULL, 0x00009d9d9d9d009dULL,
+ 0x0000bfbfbfbf00bfULL, 0x0000525252520052ULL, 0x0000d8d8d8d800d8ULL,
+ 0x0000c8c8c8c800c8ULL, 0x0000c6c6c6c600c6ULL, 0x0000818181810081ULL,
+ 0x00006f6f6f6f006fULL, 0x0000131313130013ULL, 0x0000636363630063ULL,
+ 0x0000e9e9e9e900e9ULL, 0x0000a7a7a7a700a7ULL, 0x00009f9f9f9f009fULL,
+ 0x0000bcbcbcbc00bcULL, 0x0000292929290029ULL, 0x0000f9f9f9f900f9ULL,
+ 0x00002f2f2f2f002fULL, 0x0000b4b4b4b400b4ULL, 0x0000787878780078ULL,
+ 0x0000060606060006ULL, 0x0000e7e7e7e700e7ULL, 0x0000717171710071ULL,
+ 0x0000d4d4d4d400d4ULL, 0x0000abababab00abULL, 0x0000888888880088ULL,
+ 0x00008d8d8d8d008dULL, 0x0000727272720072ULL, 0x0000b9b9b9b900b9ULL,
+ 0x0000f8f8f8f800f8ULL, 0x0000acacacac00acULL, 0x0000363636360036ULL,
+ 0x00002a2a2a2a002aULL, 0x00003c3c3c3c003cULL, 0x0000f1f1f1f100f1ULL,
+ 0x0000404040400040ULL, 0x0000d3d3d3d300d3ULL, 0x0000bbbbbbbb00bbULL,
+ 0x0000434343430043ULL, 0x0000151515150015ULL, 0x0000adadadad00adULL,
+ 0x0000777777770077ULL, 0x0000808080800080ULL, 0x0000828282820082ULL,
+ 0x0000ecececec00ecULL, 0x0000272727270027ULL, 0x0000e5e5e5e500e5ULL,
+ 0x0000858585850085ULL, 0x0000353535350035ULL, 0x00000c0c0c0c000cULL,
+ 0x0000414141410041ULL, 0x0000efefefef00efULL, 0x0000939393930093ULL,
+ 0x0000191919190019ULL, 0x0000212121210021ULL, 0x00000e0e0e0e000eULL,
+ 0x00004e4e4e4e004eULL, 0x0000656565650065ULL, 0x0000bdbdbdbd00bdULL,
+ 0x0000b8b8b8b800b8ULL, 0x00008f8f8f8f008fULL, 0x0000ebebebeb00ebULL,
+ 0x0000cececece00ceULL, 0x0000303030300030ULL, 0x00005f5f5f5f005fULL,
+ 0x0000c5c5c5c500c5ULL, 0x00001a1a1a1a001aULL, 0x0000e1e1e1e100e1ULL,
+ 0x0000cacacaca00caULL, 0x0000474747470047ULL, 0x00003d3d3d3d003dULL,
+ 0x0000010101010001ULL, 0x0000d6d6d6d600d6ULL, 0x0000565656560056ULL,
+ 0x00004d4d4d4d004dULL, 0x00000d0d0d0d000dULL, 0x0000666666660066ULL,
+ 0x0000cccccccc00ccULL, 0x00002d2d2d2d002dULL, 0x0000121212120012ULL,
+ 0x0000202020200020ULL, 0x0000b1b1b1b100b1ULL, 0x0000999999990099ULL,
+ 0x00004c4c4c4c004cULL, 0x0000c2c2c2c200c2ULL, 0x00007e7e7e7e007eULL,
+ 0x0000050505050005ULL, 0x0000b7b7b7b700b7ULL, 0x0000313131310031ULL,
+ 0x0000171717170017ULL, 0x0000d7d7d7d700d7ULL, 0x0000585858580058ULL,
+ 0x0000616161610061ULL, 0x00001b1b1b1b001bULL, 0x00001c1c1c1c001cULL,
+ 0x00000f0f0f0f000fULL, 0x0000161616160016ULL, 0x0000181818180018ULL,
+ 0x0000222222220022ULL, 0x0000444444440044ULL, 0x0000b2b2b2b200b2ULL,
+ 0x0000b5b5b5b500b5ULL, 0x0000919191910091ULL, 0x0000080808080008ULL,
+ 0x0000a8a8a8a800a8ULL, 0x0000fcfcfcfc00fcULL, 0x0000505050500050ULL,
+ 0x0000d0d0d0d000d0ULL, 0x00007d7d7d7d007dULL, 0x0000898989890089ULL,
+ 0x0000979797970097ULL, 0x00005b5b5b5b005bULL, 0x0000959595950095ULL,
+ 0x0000ffffffff00ffULL, 0x0000d2d2d2d200d2ULL, 0x0000c4c4c4c400c4ULL,
+ 0x0000484848480048ULL, 0x0000f7f7f7f700f7ULL, 0x0000dbdbdbdb00dbULL,
+ 0x0000030303030003ULL, 0x0000dadadada00daULL, 0x00003f3f3f3f003fULL,
+ 0x0000949494940094ULL, 0x00005c5c5c5c005cULL, 0x0000020202020002ULL,
+ 0x00004a4a4a4a004aULL, 0x0000333333330033ULL, 0x0000676767670067ULL,
+ 0x0000f3f3f3f300f3ULL, 0x00007f7f7f7f007fULL, 0x0000e2e2e2e200e2ULL,
+ 0x00009b9b9b9b009bULL, 0x0000262626260026ULL, 0x0000373737370037ULL,
+ 0x00003b3b3b3b003bULL, 0x0000969696960096ULL, 0x00004b4b4b4b004bULL,
+ 0x0000bebebebe00beULL, 0x00002e2e2e2e002eULL, 0x0000797979790079ULL,
+ 0x00008c8c8c8c008cULL, 0x00006e6e6e6e006eULL, 0x00008e8e8e8e008eULL,
+ 0x0000f5f5f5f500f5ULL, 0x0000b6b6b6b600b6ULL, 0x0000fdfdfdfd00fdULL,
+ 0x0000595959590059ULL, 0x0000989898980098ULL, 0x00006a6a6a6a006aULL,
+ 0x0000464646460046ULL, 0x0000babababa00baULL, 0x0000252525250025ULL,
+ 0x0000424242420042ULL, 0x0000a2a2a2a200a2ULL, 0x0000fafafafa00faULL,
+ 0x0000070707070007ULL, 0x0000555555550055ULL, 0x0000eeeeeeee00eeULL,
+ 0x00000a0a0a0a000aULL, 0x0000494949490049ULL, 0x0000686868680068ULL,
+ 0x0000383838380038ULL, 0x0000a4a4a4a400a4ULL, 0x0000282828280028ULL,
+ 0x00007b7b7b7b007bULL, 0x0000c9c9c9c900c9ULL, 0x0000c1c1c1c100c1ULL,
+ 0x0000e3e3e3e300e3ULL, 0x0000f4f4f4f400f4ULL, 0x0000c7c7c7c700c7ULL,
+ 0x00009e9e9e9e009eULL,
+};
+
+__visible const u64 camellia_sp02220222[256] = {
+ 0x00e0e0e000e0e0e0ULL, 0x0005050500050505ULL, 0x0058585800585858ULL,
+ 0x00d9d9d900d9d9d9ULL, 0x0067676700676767ULL, 0x004e4e4e004e4e4eULL,
+ 0x0081818100818181ULL, 0x00cbcbcb00cbcbcbULL, 0x00c9c9c900c9c9c9ULL,
+ 0x000b0b0b000b0b0bULL, 0x00aeaeae00aeaeaeULL, 0x006a6a6a006a6a6aULL,
+ 0x00d5d5d500d5d5d5ULL, 0x0018181800181818ULL, 0x005d5d5d005d5d5dULL,
+ 0x0082828200828282ULL, 0x0046464600464646ULL, 0x00dfdfdf00dfdfdfULL,
+ 0x00d6d6d600d6d6d6ULL, 0x0027272700272727ULL, 0x008a8a8a008a8a8aULL,
+ 0x0032323200323232ULL, 0x004b4b4b004b4b4bULL, 0x0042424200424242ULL,
+ 0x00dbdbdb00dbdbdbULL, 0x001c1c1c001c1c1cULL, 0x009e9e9e009e9e9eULL,
+ 0x009c9c9c009c9c9cULL, 0x003a3a3a003a3a3aULL, 0x00cacaca00cacacaULL,
+ 0x0025252500252525ULL, 0x007b7b7b007b7b7bULL, 0x000d0d0d000d0d0dULL,
+ 0x0071717100717171ULL, 0x005f5f5f005f5f5fULL, 0x001f1f1f001f1f1fULL,
+ 0x00f8f8f800f8f8f8ULL, 0x00d7d7d700d7d7d7ULL, 0x003e3e3e003e3e3eULL,
+ 0x009d9d9d009d9d9dULL, 0x007c7c7c007c7c7cULL, 0x0060606000606060ULL,
+ 0x00b9b9b900b9b9b9ULL, 0x00bebebe00bebebeULL, 0x00bcbcbc00bcbcbcULL,
+ 0x008b8b8b008b8b8bULL, 0x0016161600161616ULL, 0x0034343400343434ULL,
+ 0x004d4d4d004d4d4dULL, 0x00c3c3c300c3c3c3ULL, 0x0072727200727272ULL,
+ 0x0095959500959595ULL, 0x00ababab00abababULL, 0x008e8e8e008e8e8eULL,
+ 0x00bababa00bababaULL, 0x007a7a7a007a7a7aULL, 0x00b3b3b300b3b3b3ULL,
+ 0x0002020200020202ULL, 0x00b4b4b400b4b4b4ULL, 0x00adadad00adadadULL,
+ 0x00a2a2a200a2a2a2ULL, 0x00acacac00acacacULL, 0x00d8d8d800d8d8d8ULL,
+ 0x009a9a9a009a9a9aULL, 0x0017171700171717ULL, 0x001a1a1a001a1a1aULL,
+ 0x0035353500353535ULL, 0x00cccccc00ccccccULL, 0x00f7f7f700f7f7f7ULL,
+ 0x0099999900999999ULL, 0x0061616100616161ULL, 0x005a5a5a005a5a5aULL,
+ 0x00e8e8e800e8e8e8ULL, 0x0024242400242424ULL, 0x0056565600565656ULL,
+ 0x0040404000404040ULL, 0x00e1e1e100e1e1e1ULL, 0x0063636300636363ULL,
+ 0x0009090900090909ULL, 0x0033333300333333ULL, 0x00bfbfbf00bfbfbfULL,
+ 0x0098989800989898ULL, 0x0097979700979797ULL, 0x0085858500858585ULL,
+ 0x0068686800686868ULL, 0x00fcfcfc00fcfcfcULL, 0x00ececec00ecececULL,
+ 0x000a0a0a000a0a0aULL, 0x00dadada00dadadaULL, 0x006f6f6f006f6f6fULL,
+ 0x0053535300535353ULL, 0x0062626200626262ULL, 0x00a3a3a300a3a3a3ULL,
+ 0x002e2e2e002e2e2eULL, 0x0008080800080808ULL, 0x00afafaf00afafafULL,
+ 0x0028282800282828ULL, 0x00b0b0b000b0b0b0ULL, 0x0074747400747474ULL,
+ 0x00c2c2c200c2c2c2ULL, 0x00bdbdbd00bdbdbdULL, 0x0036363600363636ULL,
+ 0x0022222200222222ULL, 0x0038383800383838ULL, 0x0064646400646464ULL,
+ 0x001e1e1e001e1e1eULL, 0x0039393900393939ULL, 0x002c2c2c002c2c2cULL,
+ 0x00a6a6a600a6a6a6ULL, 0x0030303000303030ULL, 0x00e5e5e500e5e5e5ULL,
+ 0x0044444400444444ULL, 0x00fdfdfd00fdfdfdULL, 0x0088888800888888ULL,
+ 0x009f9f9f009f9f9fULL, 0x0065656500656565ULL, 0x0087878700878787ULL,
+ 0x006b6b6b006b6b6bULL, 0x00f4f4f400f4f4f4ULL, 0x0023232300232323ULL,
+ 0x0048484800484848ULL, 0x0010101000101010ULL, 0x00d1d1d100d1d1d1ULL,
+ 0x0051515100515151ULL, 0x00c0c0c000c0c0c0ULL, 0x00f9f9f900f9f9f9ULL,
+ 0x00d2d2d200d2d2d2ULL, 0x00a0a0a000a0a0a0ULL, 0x0055555500555555ULL,
+ 0x00a1a1a100a1a1a1ULL, 0x0041414100414141ULL, 0x00fafafa00fafafaULL,
+ 0x0043434300434343ULL, 0x0013131300131313ULL, 0x00c4c4c400c4c4c4ULL,
+ 0x002f2f2f002f2f2fULL, 0x00a8a8a800a8a8a8ULL, 0x00b6b6b600b6b6b6ULL,
+ 0x003c3c3c003c3c3cULL, 0x002b2b2b002b2b2bULL, 0x00c1c1c100c1c1c1ULL,
+ 0x00ffffff00ffffffULL, 0x00c8c8c800c8c8c8ULL, 0x00a5a5a500a5a5a5ULL,
+ 0x0020202000202020ULL, 0x0089898900898989ULL, 0x0000000000000000ULL,
+ 0x0090909000909090ULL, 0x0047474700474747ULL, 0x00efefef00efefefULL,
+ 0x00eaeaea00eaeaeaULL, 0x00b7b7b700b7b7b7ULL, 0x0015151500151515ULL,
+ 0x0006060600060606ULL, 0x00cdcdcd00cdcdcdULL, 0x00b5b5b500b5b5b5ULL,
+ 0x0012121200121212ULL, 0x007e7e7e007e7e7eULL, 0x00bbbbbb00bbbbbbULL,
+ 0x0029292900292929ULL, 0x000f0f0f000f0f0fULL, 0x00b8b8b800b8b8b8ULL,
+ 0x0007070700070707ULL, 0x0004040400040404ULL, 0x009b9b9b009b9b9bULL,
+ 0x0094949400949494ULL, 0x0021212100212121ULL, 0x0066666600666666ULL,
+ 0x00e6e6e600e6e6e6ULL, 0x00cecece00cececeULL, 0x00ededed00edededULL,
+ 0x00e7e7e700e7e7e7ULL, 0x003b3b3b003b3b3bULL, 0x00fefefe00fefefeULL,
+ 0x007f7f7f007f7f7fULL, 0x00c5c5c500c5c5c5ULL, 0x00a4a4a400a4a4a4ULL,
+ 0x0037373700373737ULL, 0x00b1b1b100b1b1b1ULL, 0x004c4c4c004c4c4cULL,
+ 0x0091919100919191ULL, 0x006e6e6e006e6e6eULL, 0x008d8d8d008d8d8dULL,
+ 0x0076767600767676ULL, 0x0003030300030303ULL, 0x002d2d2d002d2d2dULL,
+ 0x00dedede00dededeULL, 0x0096969600969696ULL, 0x0026262600262626ULL,
+ 0x007d7d7d007d7d7dULL, 0x00c6c6c600c6c6c6ULL, 0x005c5c5c005c5c5cULL,
+ 0x00d3d3d300d3d3d3ULL, 0x00f2f2f200f2f2f2ULL, 0x004f4f4f004f4f4fULL,
+ 0x0019191900191919ULL, 0x003f3f3f003f3f3fULL, 0x00dcdcdc00dcdcdcULL,
+ 0x0079797900797979ULL, 0x001d1d1d001d1d1dULL, 0x0052525200525252ULL,
+ 0x00ebebeb00ebebebULL, 0x00f3f3f300f3f3f3ULL, 0x006d6d6d006d6d6dULL,
+ 0x005e5e5e005e5e5eULL, 0x00fbfbfb00fbfbfbULL, 0x0069696900696969ULL,
+ 0x00b2b2b200b2b2b2ULL, 0x00f0f0f000f0f0f0ULL, 0x0031313100313131ULL,
+ 0x000c0c0c000c0c0cULL, 0x00d4d4d400d4d4d4ULL, 0x00cfcfcf00cfcfcfULL,
+ 0x008c8c8c008c8c8cULL, 0x00e2e2e200e2e2e2ULL, 0x0075757500757575ULL,
+ 0x00a9a9a900a9a9a9ULL, 0x004a4a4a004a4a4aULL, 0x0057575700575757ULL,
+ 0x0084848400848484ULL, 0x0011111100111111ULL, 0x0045454500454545ULL,
+ 0x001b1b1b001b1b1bULL, 0x00f5f5f500f5f5f5ULL, 0x00e4e4e400e4e4e4ULL,
+ 0x000e0e0e000e0e0eULL, 0x0073737300737373ULL, 0x00aaaaaa00aaaaaaULL,
+ 0x00f1f1f100f1f1f1ULL, 0x00dddddd00ddddddULL, 0x0059595900595959ULL,
+ 0x0014141400141414ULL, 0x006c6c6c006c6c6cULL, 0x0092929200929292ULL,
+ 0x0054545400545454ULL, 0x00d0d0d000d0d0d0ULL, 0x0078787800787878ULL,
+ 0x0070707000707070ULL, 0x00e3e3e300e3e3e3ULL, 0x0049494900494949ULL,
+ 0x0080808000808080ULL, 0x0050505000505050ULL, 0x00a7a7a700a7a7a7ULL,
+ 0x00f6f6f600f6f6f6ULL, 0x0077777700777777ULL, 0x0093939300939393ULL,
+ 0x0086868600868686ULL, 0x0083838300838383ULL, 0x002a2a2a002a2a2aULL,
+ 0x00c7c7c700c7c7c7ULL, 0x005b5b5b005b5b5bULL, 0x00e9e9e900e9e9e9ULL,
+ 0x00eeeeee00eeeeeeULL, 0x008f8f8f008f8f8fULL, 0x0001010100010101ULL,
+ 0x003d3d3d003d3d3dULL,
+};
+
+__visible const u64 camellia_sp30333033[256] = {
+ 0x3800383838003838ULL, 0x4100414141004141ULL, 0x1600161616001616ULL,
+ 0x7600767676007676ULL, 0xd900d9d9d900d9d9ULL, 0x9300939393009393ULL,
+ 0x6000606060006060ULL, 0xf200f2f2f200f2f2ULL, 0x7200727272007272ULL,
+ 0xc200c2c2c200c2c2ULL, 0xab00ababab00ababULL, 0x9a009a9a9a009a9aULL,
+ 0x7500757575007575ULL, 0x0600060606000606ULL, 0x5700575757005757ULL,
+ 0xa000a0a0a000a0a0ULL, 0x9100919191009191ULL, 0xf700f7f7f700f7f7ULL,
+ 0xb500b5b5b500b5b5ULL, 0xc900c9c9c900c9c9ULL, 0xa200a2a2a200a2a2ULL,
+ 0x8c008c8c8c008c8cULL, 0xd200d2d2d200d2d2ULL, 0x9000909090009090ULL,
+ 0xf600f6f6f600f6f6ULL, 0x0700070707000707ULL, 0xa700a7a7a700a7a7ULL,
+ 0x2700272727002727ULL, 0x8e008e8e8e008e8eULL, 0xb200b2b2b200b2b2ULL,
+ 0x4900494949004949ULL, 0xde00dedede00dedeULL, 0x4300434343004343ULL,
+ 0x5c005c5c5c005c5cULL, 0xd700d7d7d700d7d7ULL, 0xc700c7c7c700c7c7ULL,
+ 0x3e003e3e3e003e3eULL, 0xf500f5f5f500f5f5ULL, 0x8f008f8f8f008f8fULL,
+ 0x6700676767006767ULL, 0x1f001f1f1f001f1fULL, 0x1800181818001818ULL,
+ 0x6e006e6e6e006e6eULL, 0xaf00afafaf00afafULL, 0x2f002f2f2f002f2fULL,
+ 0xe200e2e2e200e2e2ULL, 0x8500858585008585ULL, 0x0d000d0d0d000d0dULL,
+ 0x5300535353005353ULL, 0xf000f0f0f000f0f0ULL, 0x9c009c9c9c009c9cULL,
+ 0x6500656565006565ULL, 0xea00eaeaea00eaeaULL, 0xa300a3a3a300a3a3ULL,
+ 0xae00aeaeae00aeaeULL, 0x9e009e9e9e009e9eULL, 0xec00ececec00ececULL,
+ 0x8000808080008080ULL, 0x2d002d2d2d002d2dULL, 0x6b006b6b6b006b6bULL,
+ 0xa800a8a8a800a8a8ULL, 0x2b002b2b2b002b2bULL, 0x3600363636003636ULL,
+ 0xa600a6a6a600a6a6ULL, 0xc500c5c5c500c5c5ULL, 0x8600868686008686ULL,
+ 0x4d004d4d4d004d4dULL, 0x3300333333003333ULL, 0xfd00fdfdfd00fdfdULL,
+ 0x6600666666006666ULL, 0x5800585858005858ULL, 0x9600969696009696ULL,
+ 0x3a003a3a3a003a3aULL, 0x0900090909000909ULL, 0x9500959595009595ULL,
+ 0x1000101010001010ULL, 0x7800787878007878ULL, 0xd800d8d8d800d8d8ULL,
+ 0x4200424242004242ULL, 0xcc00cccccc00ccccULL, 0xef00efefef00efefULL,
+ 0x2600262626002626ULL, 0xe500e5e5e500e5e5ULL, 0x6100616161006161ULL,
+ 0x1a001a1a1a001a1aULL, 0x3f003f3f3f003f3fULL, 0x3b003b3b3b003b3bULL,
+ 0x8200828282008282ULL, 0xb600b6b6b600b6b6ULL, 0xdb00dbdbdb00dbdbULL,
+ 0xd400d4d4d400d4d4ULL, 0x9800989898009898ULL, 0xe800e8e8e800e8e8ULL,
+ 0x8b008b8b8b008b8bULL, 0x0200020202000202ULL, 0xeb00ebebeb00ebebULL,
+ 0x0a000a0a0a000a0aULL, 0x2c002c2c2c002c2cULL, 0x1d001d1d1d001d1dULL,
+ 0xb000b0b0b000b0b0ULL, 0x6f006f6f6f006f6fULL, 0x8d008d8d8d008d8dULL,
+ 0x8800888888008888ULL, 0x0e000e0e0e000e0eULL, 0x1900191919001919ULL,
+ 0x8700878787008787ULL, 0x4e004e4e4e004e4eULL, 0x0b000b0b0b000b0bULL,
+ 0xa900a9a9a900a9a9ULL, 0x0c000c0c0c000c0cULL, 0x7900797979007979ULL,
+ 0x1100111111001111ULL, 0x7f007f7f7f007f7fULL, 0x2200222222002222ULL,
+ 0xe700e7e7e700e7e7ULL, 0x5900595959005959ULL, 0xe100e1e1e100e1e1ULL,
+ 0xda00dadada00dadaULL, 0x3d003d3d3d003d3dULL, 0xc800c8c8c800c8c8ULL,
+ 0x1200121212001212ULL, 0x0400040404000404ULL, 0x7400747474007474ULL,
+ 0x5400545454005454ULL, 0x3000303030003030ULL, 0x7e007e7e7e007e7eULL,
+ 0xb400b4b4b400b4b4ULL, 0x2800282828002828ULL, 0x5500555555005555ULL,
+ 0x6800686868006868ULL, 0x5000505050005050ULL, 0xbe00bebebe00bebeULL,
+ 0xd000d0d0d000d0d0ULL, 0xc400c4c4c400c4c4ULL, 0x3100313131003131ULL,
+ 0xcb00cbcbcb00cbcbULL, 0x2a002a2a2a002a2aULL, 0xad00adadad00adadULL,
+ 0x0f000f0f0f000f0fULL, 0xca00cacaca00cacaULL, 0x7000707070007070ULL,
+ 0xff00ffffff00ffffULL, 0x3200323232003232ULL, 0x6900696969006969ULL,
+ 0x0800080808000808ULL, 0x6200626262006262ULL, 0x0000000000000000ULL,
+ 0x2400242424002424ULL, 0xd100d1d1d100d1d1ULL, 0xfb00fbfbfb00fbfbULL,
+ 0xba00bababa00babaULL, 0xed00ededed00ededULL, 0x4500454545004545ULL,
+ 0x8100818181008181ULL, 0x7300737373007373ULL, 0x6d006d6d6d006d6dULL,
+ 0x8400848484008484ULL, 0x9f009f9f9f009f9fULL, 0xee00eeeeee00eeeeULL,
+ 0x4a004a4a4a004a4aULL, 0xc300c3c3c300c3c3ULL, 0x2e002e2e2e002e2eULL,
+ 0xc100c1c1c100c1c1ULL, 0x0100010101000101ULL, 0xe600e6e6e600e6e6ULL,
+ 0x2500252525002525ULL, 0x4800484848004848ULL, 0x9900999999009999ULL,
+ 0xb900b9b9b900b9b9ULL, 0xb300b3b3b300b3b3ULL, 0x7b007b7b7b007b7bULL,
+ 0xf900f9f9f900f9f9ULL, 0xce00cecece00ceceULL, 0xbf00bfbfbf00bfbfULL,
+ 0xdf00dfdfdf00dfdfULL, 0x7100717171007171ULL, 0x2900292929002929ULL,
+ 0xcd00cdcdcd00cdcdULL, 0x6c006c6c6c006c6cULL, 0x1300131313001313ULL,
+ 0x6400646464006464ULL, 0x9b009b9b9b009b9bULL, 0x6300636363006363ULL,
+ 0x9d009d9d9d009d9dULL, 0xc000c0c0c000c0c0ULL, 0x4b004b4b4b004b4bULL,
+ 0xb700b7b7b700b7b7ULL, 0xa500a5a5a500a5a5ULL, 0x8900898989008989ULL,
+ 0x5f005f5f5f005f5fULL, 0xb100b1b1b100b1b1ULL, 0x1700171717001717ULL,
+ 0xf400f4f4f400f4f4ULL, 0xbc00bcbcbc00bcbcULL, 0xd300d3d3d300d3d3ULL,
+ 0x4600464646004646ULL, 0xcf00cfcfcf00cfcfULL, 0x3700373737003737ULL,
+ 0x5e005e5e5e005e5eULL, 0x4700474747004747ULL, 0x9400949494009494ULL,
+ 0xfa00fafafa00fafaULL, 0xfc00fcfcfc00fcfcULL, 0x5b005b5b5b005b5bULL,
+ 0x9700979797009797ULL, 0xfe00fefefe00fefeULL, 0x5a005a5a5a005a5aULL,
+ 0xac00acacac00acacULL, 0x3c003c3c3c003c3cULL, 0x4c004c4c4c004c4cULL,
+ 0x0300030303000303ULL, 0x3500353535003535ULL, 0xf300f3f3f300f3f3ULL,
+ 0x2300232323002323ULL, 0xb800b8b8b800b8b8ULL, 0x5d005d5d5d005d5dULL,
+ 0x6a006a6a6a006a6aULL, 0x9200929292009292ULL, 0xd500d5d5d500d5d5ULL,
+ 0x2100212121002121ULL, 0x4400444444004444ULL, 0x5100515151005151ULL,
+ 0xc600c6c6c600c6c6ULL, 0x7d007d7d7d007d7dULL, 0x3900393939003939ULL,
+ 0x8300838383008383ULL, 0xdc00dcdcdc00dcdcULL, 0xaa00aaaaaa00aaaaULL,
+ 0x7c007c7c7c007c7cULL, 0x7700777777007777ULL, 0x5600565656005656ULL,
+ 0x0500050505000505ULL, 0x1b001b1b1b001b1bULL, 0xa400a4a4a400a4a4ULL,
+ 0x1500151515001515ULL, 0x3400343434003434ULL, 0x1e001e1e1e001e1eULL,
+ 0x1c001c1c1c001c1cULL, 0xf800f8f8f800f8f8ULL, 0x5200525252005252ULL,
+ 0x2000202020002020ULL, 0x1400141414001414ULL, 0xe900e9e9e900e9e9ULL,
+ 0xbd00bdbdbd00bdbdULL, 0xdd00dddddd00ddddULL, 0xe400e4e4e400e4e4ULL,
+ 0xa100a1a1a100a1a1ULL, 0xe000e0e0e000e0e0ULL, 0x8a008a8a8a008a8aULL,
+ 0xf100f1f1f100f1f1ULL, 0xd600d6d6d600d6d6ULL, 0x7a007a7a7a007a7aULL,
+ 0xbb00bbbbbb00bbbbULL, 0xe300e3e3e300e3e3ULL, 0x4000404040004040ULL,
+ 0x4f004f4f4f004f4fULL,
+};
+
+__visible const u64 camellia_sp44044404[256] = {
+ 0x7070007070700070ULL, 0x2c2c002c2c2c002cULL, 0xb3b300b3b3b300b3ULL,
+ 0xc0c000c0c0c000c0ULL, 0xe4e400e4e4e400e4ULL, 0x5757005757570057ULL,
+ 0xeaea00eaeaea00eaULL, 0xaeae00aeaeae00aeULL, 0x2323002323230023ULL,
+ 0x6b6b006b6b6b006bULL, 0x4545004545450045ULL, 0xa5a500a5a5a500a5ULL,
+ 0xeded00ededed00edULL, 0x4f4f004f4f4f004fULL, 0x1d1d001d1d1d001dULL,
+ 0x9292009292920092ULL, 0x8686008686860086ULL, 0xafaf00afafaf00afULL,
+ 0x7c7c007c7c7c007cULL, 0x1f1f001f1f1f001fULL, 0x3e3e003e3e3e003eULL,
+ 0xdcdc00dcdcdc00dcULL, 0x5e5e005e5e5e005eULL, 0x0b0b000b0b0b000bULL,
+ 0xa6a600a6a6a600a6ULL, 0x3939003939390039ULL, 0xd5d500d5d5d500d5ULL,
+ 0x5d5d005d5d5d005dULL, 0xd9d900d9d9d900d9ULL, 0x5a5a005a5a5a005aULL,
+ 0x5151005151510051ULL, 0x6c6c006c6c6c006cULL, 0x8b8b008b8b8b008bULL,
+ 0x9a9a009a9a9a009aULL, 0xfbfb00fbfbfb00fbULL, 0xb0b000b0b0b000b0ULL,
+ 0x7474007474740074ULL, 0x2b2b002b2b2b002bULL, 0xf0f000f0f0f000f0ULL,
+ 0x8484008484840084ULL, 0xdfdf00dfdfdf00dfULL, 0xcbcb00cbcbcb00cbULL,
+ 0x3434003434340034ULL, 0x7676007676760076ULL, 0x6d6d006d6d6d006dULL,
+ 0xa9a900a9a9a900a9ULL, 0xd1d100d1d1d100d1ULL, 0x0404000404040004ULL,
+ 0x1414001414140014ULL, 0x3a3a003a3a3a003aULL, 0xdede00dedede00deULL,
+ 0x1111001111110011ULL, 0x3232003232320032ULL, 0x9c9c009c9c9c009cULL,
+ 0x5353005353530053ULL, 0xf2f200f2f2f200f2ULL, 0xfefe00fefefe00feULL,
+ 0xcfcf00cfcfcf00cfULL, 0xc3c300c3c3c300c3ULL, 0x7a7a007a7a7a007aULL,
+ 0x2424002424240024ULL, 0xe8e800e8e8e800e8ULL, 0x6060006060600060ULL,
+ 0x6969006969690069ULL, 0xaaaa00aaaaaa00aaULL, 0xa0a000a0a0a000a0ULL,
+ 0xa1a100a1a1a100a1ULL, 0x6262006262620062ULL, 0x5454005454540054ULL,
+ 0x1e1e001e1e1e001eULL, 0xe0e000e0e0e000e0ULL, 0x6464006464640064ULL,
+ 0x1010001010100010ULL, 0x0000000000000000ULL, 0xa3a300a3a3a300a3ULL,
+ 0x7575007575750075ULL, 0x8a8a008a8a8a008aULL, 0xe6e600e6e6e600e6ULL,
+ 0x0909000909090009ULL, 0xdddd00dddddd00ddULL, 0x8787008787870087ULL,
+ 0x8383008383830083ULL, 0xcdcd00cdcdcd00cdULL, 0x9090009090900090ULL,
+ 0x7373007373730073ULL, 0xf6f600f6f6f600f6ULL, 0x9d9d009d9d9d009dULL,
+ 0xbfbf00bfbfbf00bfULL, 0x5252005252520052ULL, 0xd8d800d8d8d800d8ULL,
+ 0xc8c800c8c8c800c8ULL, 0xc6c600c6c6c600c6ULL, 0x8181008181810081ULL,
+ 0x6f6f006f6f6f006fULL, 0x1313001313130013ULL, 0x6363006363630063ULL,
+ 0xe9e900e9e9e900e9ULL, 0xa7a700a7a7a700a7ULL, 0x9f9f009f9f9f009fULL,
+ 0xbcbc00bcbcbc00bcULL, 0x2929002929290029ULL, 0xf9f900f9f9f900f9ULL,
+ 0x2f2f002f2f2f002fULL, 0xb4b400b4b4b400b4ULL, 0x7878007878780078ULL,
+ 0x0606000606060006ULL, 0xe7e700e7e7e700e7ULL, 0x7171007171710071ULL,
+ 0xd4d400d4d4d400d4ULL, 0xabab00ababab00abULL, 0x8888008888880088ULL,
+ 0x8d8d008d8d8d008dULL, 0x7272007272720072ULL, 0xb9b900b9b9b900b9ULL,
+ 0xf8f800f8f8f800f8ULL, 0xacac00acacac00acULL, 0x3636003636360036ULL,
+ 0x2a2a002a2a2a002aULL, 0x3c3c003c3c3c003cULL, 0xf1f100f1f1f100f1ULL,
+ 0x4040004040400040ULL, 0xd3d300d3d3d300d3ULL, 0xbbbb00bbbbbb00bbULL,
+ 0x4343004343430043ULL, 0x1515001515150015ULL, 0xadad00adadad00adULL,
+ 0x7777007777770077ULL, 0x8080008080800080ULL, 0x8282008282820082ULL,
+ 0xecec00ececec00ecULL, 0x2727002727270027ULL, 0xe5e500e5e5e500e5ULL,
+ 0x8585008585850085ULL, 0x3535003535350035ULL, 0x0c0c000c0c0c000cULL,
+ 0x4141004141410041ULL, 0xefef00efefef00efULL, 0x9393009393930093ULL,
+ 0x1919001919190019ULL, 0x2121002121210021ULL, 0x0e0e000e0e0e000eULL,
+ 0x4e4e004e4e4e004eULL, 0x6565006565650065ULL, 0xbdbd00bdbdbd00bdULL,
+ 0xb8b800b8b8b800b8ULL, 0x8f8f008f8f8f008fULL, 0xebeb00ebebeb00ebULL,
+ 0xcece00cecece00ceULL, 0x3030003030300030ULL, 0x5f5f005f5f5f005fULL,
+ 0xc5c500c5c5c500c5ULL, 0x1a1a001a1a1a001aULL, 0xe1e100e1e1e100e1ULL,
+ 0xcaca00cacaca00caULL, 0x4747004747470047ULL, 0x3d3d003d3d3d003dULL,
+ 0x0101000101010001ULL, 0xd6d600d6d6d600d6ULL, 0x5656005656560056ULL,
+ 0x4d4d004d4d4d004dULL, 0x0d0d000d0d0d000dULL, 0x6666006666660066ULL,
+ 0xcccc00cccccc00ccULL, 0x2d2d002d2d2d002dULL, 0x1212001212120012ULL,
+ 0x2020002020200020ULL, 0xb1b100b1b1b100b1ULL, 0x9999009999990099ULL,
+ 0x4c4c004c4c4c004cULL, 0xc2c200c2c2c200c2ULL, 0x7e7e007e7e7e007eULL,
+ 0x0505000505050005ULL, 0xb7b700b7b7b700b7ULL, 0x3131003131310031ULL,
+ 0x1717001717170017ULL, 0xd7d700d7d7d700d7ULL, 0x5858005858580058ULL,
+ 0x6161006161610061ULL, 0x1b1b001b1b1b001bULL, 0x1c1c001c1c1c001cULL,
+ 0x0f0f000f0f0f000fULL, 0x1616001616160016ULL, 0x1818001818180018ULL,
+ 0x2222002222220022ULL, 0x4444004444440044ULL, 0xb2b200b2b2b200b2ULL,
+ 0xb5b500b5b5b500b5ULL, 0x9191009191910091ULL, 0x0808000808080008ULL,
+ 0xa8a800a8a8a800a8ULL, 0xfcfc00fcfcfc00fcULL, 0x5050005050500050ULL,
+ 0xd0d000d0d0d000d0ULL, 0x7d7d007d7d7d007dULL, 0x8989008989890089ULL,
+ 0x9797009797970097ULL, 0x5b5b005b5b5b005bULL, 0x9595009595950095ULL,
+ 0xffff00ffffff00ffULL, 0xd2d200d2d2d200d2ULL, 0xc4c400c4c4c400c4ULL,
+ 0x4848004848480048ULL, 0xf7f700f7f7f700f7ULL, 0xdbdb00dbdbdb00dbULL,
+ 0x0303000303030003ULL, 0xdada00dadada00daULL, 0x3f3f003f3f3f003fULL,
+ 0x9494009494940094ULL, 0x5c5c005c5c5c005cULL, 0x0202000202020002ULL,
+ 0x4a4a004a4a4a004aULL, 0x3333003333330033ULL, 0x6767006767670067ULL,
+ 0xf3f300f3f3f300f3ULL, 0x7f7f007f7f7f007fULL, 0xe2e200e2e2e200e2ULL,
+ 0x9b9b009b9b9b009bULL, 0x2626002626260026ULL, 0x3737003737370037ULL,
+ 0x3b3b003b3b3b003bULL, 0x9696009696960096ULL, 0x4b4b004b4b4b004bULL,
+ 0xbebe00bebebe00beULL, 0x2e2e002e2e2e002eULL, 0x7979007979790079ULL,
+ 0x8c8c008c8c8c008cULL, 0x6e6e006e6e6e006eULL, 0x8e8e008e8e8e008eULL,
+ 0xf5f500f5f5f500f5ULL, 0xb6b600b6b6b600b6ULL, 0xfdfd00fdfdfd00fdULL,
+ 0x5959005959590059ULL, 0x9898009898980098ULL, 0x6a6a006a6a6a006aULL,
+ 0x4646004646460046ULL, 0xbaba00bababa00baULL, 0x2525002525250025ULL,
+ 0x4242004242420042ULL, 0xa2a200a2a2a200a2ULL, 0xfafa00fafafa00faULL,
+ 0x0707000707070007ULL, 0x5555005555550055ULL, 0xeeee00eeeeee00eeULL,
+ 0x0a0a000a0a0a000aULL, 0x4949004949490049ULL, 0x6868006868680068ULL,
+ 0x3838003838380038ULL, 0xa4a400a4a4a400a4ULL, 0x2828002828280028ULL,
+ 0x7b7b007b7b7b007bULL, 0xc9c900c9c9c900c9ULL, 0xc1c100c1c1c100c1ULL,
+ 0xe3e300e3e3e300e3ULL, 0xf4f400f4f4f400f4ULL, 0xc7c700c7c7c700c7ULL,
+ 0x9e9e009e9e9e009eULL,
+};
+
+__visible const u64 camellia_sp11101110[256] = {
+ 0x7070700070707000ULL, 0x8282820082828200ULL, 0x2c2c2c002c2c2c00ULL,
+ 0xececec00ececec00ULL, 0xb3b3b300b3b3b300ULL, 0x2727270027272700ULL,
+ 0xc0c0c000c0c0c000ULL, 0xe5e5e500e5e5e500ULL, 0xe4e4e400e4e4e400ULL,
+ 0x8585850085858500ULL, 0x5757570057575700ULL, 0x3535350035353500ULL,
+ 0xeaeaea00eaeaea00ULL, 0x0c0c0c000c0c0c00ULL, 0xaeaeae00aeaeae00ULL,
+ 0x4141410041414100ULL, 0x2323230023232300ULL, 0xefefef00efefef00ULL,
+ 0x6b6b6b006b6b6b00ULL, 0x9393930093939300ULL, 0x4545450045454500ULL,
+ 0x1919190019191900ULL, 0xa5a5a500a5a5a500ULL, 0x2121210021212100ULL,
+ 0xededed00ededed00ULL, 0x0e0e0e000e0e0e00ULL, 0x4f4f4f004f4f4f00ULL,
+ 0x4e4e4e004e4e4e00ULL, 0x1d1d1d001d1d1d00ULL, 0x6565650065656500ULL,
+ 0x9292920092929200ULL, 0xbdbdbd00bdbdbd00ULL, 0x8686860086868600ULL,
+ 0xb8b8b800b8b8b800ULL, 0xafafaf00afafaf00ULL, 0x8f8f8f008f8f8f00ULL,
+ 0x7c7c7c007c7c7c00ULL, 0xebebeb00ebebeb00ULL, 0x1f1f1f001f1f1f00ULL,
+ 0xcecece00cecece00ULL, 0x3e3e3e003e3e3e00ULL, 0x3030300030303000ULL,
+ 0xdcdcdc00dcdcdc00ULL, 0x5f5f5f005f5f5f00ULL, 0x5e5e5e005e5e5e00ULL,
+ 0xc5c5c500c5c5c500ULL, 0x0b0b0b000b0b0b00ULL, 0x1a1a1a001a1a1a00ULL,
+ 0xa6a6a600a6a6a600ULL, 0xe1e1e100e1e1e100ULL, 0x3939390039393900ULL,
+ 0xcacaca00cacaca00ULL, 0xd5d5d500d5d5d500ULL, 0x4747470047474700ULL,
+ 0x5d5d5d005d5d5d00ULL, 0x3d3d3d003d3d3d00ULL, 0xd9d9d900d9d9d900ULL,
+ 0x0101010001010100ULL, 0x5a5a5a005a5a5a00ULL, 0xd6d6d600d6d6d600ULL,
+ 0x5151510051515100ULL, 0x5656560056565600ULL, 0x6c6c6c006c6c6c00ULL,
+ 0x4d4d4d004d4d4d00ULL, 0x8b8b8b008b8b8b00ULL, 0x0d0d0d000d0d0d00ULL,
+ 0x9a9a9a009a9a9a00ULL, 0x6666660066666600ULL, 0xfbfbfb00fbfbfb00ULL,
+ 0xcccccc00cccccc00ULL, 0xb0b0b000b0b0b000ULL, 0x2d2d2d002d2d2d00ULL,
+ 0x7474740074747400ULL, 0x1212120012121200ULL, 0x2b2b2b002b2b2b00ULL,
+ 0x2020200020202000ULL, 0xf0f0f000f0f0f000ULL, 0xb1b1b100b1b1b100ULL,
+ 0x8484840084848400ULL, 0x9999990099999900ULL, 0xdfdfdf00dfdfdf00ULL,
+ 0x4c4c4c004c4c4c00ULL, 0xcbcbcb00cbcbcb00ULL, 0xc2c2c200c2c2c200ULL,
+ 0x3434340034343400ULL, 0x7e7e7e007e7e7e00ULL, 0x7676760076767600ULL,
+ 0x0505050005050500ULL, 0x6d6d6d006d6d6d00ULL, 0xb7b7b700b7b7b700ULL,
+ 0xa9a9a900a9a9a900ULL, 0x3131310031313100ULL, 0xd1d1d100d1d1d100ULL,
+ 0x1717170017171700ULL, 0x0404040004040400ULL, 0xd7d7d700d7d7d700ULL,
+ 0x1414140014141400ULL, 0x5858580058585800ULL, 0x3a3a3a003a3a3a00ULL,
+ 0x6161610061616100ULL, 0xdedede00dedede00ULL, 0x1b1b1b001b1b1b00ULL,
+ 0x1111110011111100ULL, 0x1c1c1c001c1c1c00ULL, 0x3232320032323200ULL,
+ 0x0f0f0f000f0f0f00ULL, 0x9c9c9c009c9c9c00ULL, 0x1616160016161600ULL,
+ 0x5353530053535300ULL, 0x1818180018181800ULL, 0xf2f2f200f2f2f200ULL,
+ 0x2222220022222200ULL, 0xfefefe00fefefe00ULL, 0x4444440044444400ULL,
+ 0xcfcfcf00cfcfcf00ULL, 0xb2b2b200b2b2b200ULL, 0xc3c3c300c3c3c300ULL,
+ 0xb5b5b500b5b5b500ULL, 0x7a7a7a007a7a7a00ULL, 0x9191910091919100ULL,
+ 0x2424240024242400ULL, 0x0808080008080800ULL, 0xe8e8e800e8e8e800ULL,
+ 0xa8a8a800a8a8a800ULL, 0x6060600060606000ULL, 0xfcfcfc00fcfcfc00ULL,
+ 0x6969690069696900ULL, 0x5050500050505000ULL, 0xaaaaaa00aaaaaa00ULL,
+ 0xd0d0d000d0d0d000ULL, 0xa0a0a000a0a0a000ULL, 0x7d7d7d007d7d7d00ULL,
+ 0xa1a1a100a1a1a100ULL, 0x8989890089898900ULL, 0x6262620062626200ULL,
+ 0x9797970097979700ULL, 0x5454540054545400ULL, 0x5b5b5b005b5b5b00ULL,
+ 0x1e1e1e001e1e1e00ULL, 0x9595950095959500ULL, 0xe0e0e000e0e0e000ULL,
+ 0xffffff00ffffff00ULL, 0x6464640064646400ULL, 0xd2d2d200d2d2d200ULL,
+ 0x1010100010101000ULL, 0xc4c4c400c4c4c400ULL, 0x0000000000000000ULL,
+ 0x4848480048484800ULL, 0xa3a3a300a3a3a300ULL, 0xf7f7f700f7f7f700ULL,
+ 0x7575750075757500ULL, 0xdbdbdb00dbdbdb00ULL, 0x8a8a8a008a8a8a00ULL,
+ 0x0303030003030300ULL, 0xe6e6e600e6e6e600ULL, 0xdadada00dadada00ULL,
+ 0x0909090009090900ULL, 0x3f3f3f003f3f3f00ULL, 0xdddddd00dddddd00ULL,
+ 0x9494940094949400ULL, 0x8787870087878700ULL, 0x5c5c5c005c5c5c00ULL,
+ 0x8383830083838300ULL, 0x0202020002020200ULL, 0xcdcdcd00cdcdcd00ULL,
+ 0x4a4a4a004a4a4a00ULL, 0x9090900090909000ULL, 0x3333330033333300ULL,
+ 0x7373730073737300ULL, 0x6767670067676700ULL, 0xf6f6f600f6f6f600ULL,
+ 0xf3f3f300f3f3f300ULL, 0x9d9d9d009d9d9d00ULL, 0x7f7f7f007f7f7f00ULL,
+ 0xbfbfbf00bfbfbf00ULL, 0xe2e2e200e2e2e200ULL, 0x5252520052525200ULL,
+ 0x9b9b9b009b9b9b00ULL, 0xd8d8d800d8d8d800ULL, 0x2626260026262600ULL,
+ 0xc8c8c800c8c8c800ULL, 0x3737370037373700ULL, 0xc6c6c600c6c6c600ULL,
+ 0x3b3b3b003b3b3b00ULL, 0x8181810081818100ULL, 0x9696960096969600ULL,
+ 0x6f6f6f006f6f6f00ULL, 0x4b4b4b004b4b4b00ULL, 0x1313130013131300ULL,
+ 0xbebebe00bebebe00ULL, 0x6363630063636300ULL, 0x2e2e2e002e2e2e00ULL,
+ 0xe9e9e900e9e9e900ULL, 0x7979790079797900ULL, 0xa7a7a700a7a7a700ULL,
+ 0x8c8c8c008c8c8c00ULL, 0x9f9f9f009f9f9f00ULL, 0x6e6e6e006e6e6e00ULL,
+ 0xbcbcbc00bcbcbc00ULL, 0x8e8e8e008e8e8e00ULL, 0x2929290029292900ULL,
+ 0xf5f5f500f5f5f500ULL, 0xf9f9f900f9f9f900ULL, 0xb6b6b600b6b6b600ULL,
+ 0x2f2f2f002f2f2f00ULL, 0xfdfdfd00fdfdfd00ULL, 0xb4b4b400b4b4b400ULL,
+ 0x5959590059595900ULL, 0x7878780078787800ULL, 0x9898980098989800ULL,
+ 0x0606060006060600ULL, 0x6a6a6a006a6a6a00ULL, 0xe7e7e700e7e7e700ULL,
+ 0x4646460046464600ULL, 0x7171710071717100ULL, 0xbababa00bababa00ULL,
+ 0xd4d4d400d4d4d400ULL, 0x2525250025252500ULL, 0xababab00ababab00ULL,
+ 0x4242420042424200ULL, 0x8888880088888800ULL, 0xa2a2a200a2a2a200ULL,
+ 0x8d8d8d008d8d8d00ULL, 0xfafafa00fafafa00ULL, 0x7272720072727200ULL,
+ 0x0707070007070700ULL, 0xb9b9b900b9b9b900ULL, 0x5555550055555500ULL,
+ 0xf8f8f800f8f8f800ULL, 0xeeeeee00eeeeee00ULL, 0xacacac00acacac00ULL,
+ 0x0a0a0a000a0a0a00ULL, 0x3636360036363600ULL, 0x4949490049494900ULL,
+ 0x2a2a2a002a2a2a00ULL, 0x6868680068686800ULL, 0x3c3c3c003c3c3c00ULL,
+ 0x3838380038383800ULL, 0xf1f1f100f1f1f100ULL, 0xa4a4a400a4a4a400ULL,
+ 0x4040400040404000ULL, 0x2828280028282800ULL, 0xd3d3d300d3d3d300ULL,
+ 0x7b7b7b007b7b7b00ULL, 0xbbbbbb00bbbbbb00ULL, 0xc9c9c900c9c9c900ULL,
+ 0x4343430043434300ULL, 0xc1c1c100c1c1c100ULL, 0x1515150015151500ULL,
+ 0xe3e3e300e3e3e300ULL, 0xadadad00adadad00ULL, 0xf4f4f400f4f4f400ULL,
+ 0x7777770077777700ULL, 0xc7c7c700c7c7c700ULL, 0x8080800080808000ULL,
+ 0x9e9e9e009e9e9e00ULL,
+};
+
+/* key constants */
+#define CAMELLIA_SIGMA1L (0xA09E667FL)
+#define CAMELLIA_SIGMA1R (0x3BCC908BL)
+#define CAMELLIA_SIGMA2L (0xB67AE858L)
+#define CAMELLIA_SIGMA2R (0x4CAA73B2L)
+#define CAMELLIA_SIGMA3L (0xC6EF372FL)
+#define CAMELLIA_SIGMA3R (0xE94F82BEL)
+#define CAMELLIA_SIGMA4L (0x54FF53A5L)
+#define CAMELLIA_SIGMA4R (0xF1D36F1CL)
+#define CAMELLIA_SIGMA5L (0x10E527FAL)
+#define CAMELLIA_SIGMA5R (0xDE682D1DL)
+#define CAMELLIA_SIGMA6L (0xB05688C2L)
+#define CAMELLIA_SIGMA6R (0xB3E6C1FDL)
+
+/* macros */
+#define ROLDQ(l, r, bits) ({ \
+ u64 t = l; \
+ l = (l << bits) | (r >> (64 - bits)); \
+ r = (r << bits) | (t >> (64 - bits)); \
+})
+
+#define CAMELLIA_F(x, kl, kr, y) ({ \
+ u64 ii = x ^ (((u64)kl << 32) | kr); \
+ y = camellia_sp11101110[(uint8_t)ii]; \
+ y ^= camellia_sp44044404[(uint8_t)(ii >> 8)]; \
+ ii >>= 16; \
+ y ^= camellia_sp30333033[(uint8_t)ii]; \
+ y ^= camellia_sp02220222[(uint8_t)(ii >> 8)]; \
+ ii >>= 16; \
+ y ^= camellia_sp00444404[(uint8_t)ii]; \
+ y ^= camellia_sp03303033[(uint8_t)(ii >> 8)]; \
+ ii >>= 16; \
+ y ^= camellia_sp22000222[(uint8_t)ii]; \
+ y ^= camellia_sp10011110[(uint8_t)(ii >> 8)]; \
+ y = ror64(y, 32); \
+})
+
+#define SET_SUBKEY_LR(INDEX, sRL) (subkey[(INDEX)] = ror64((sRL), 32))
+
+static void camellia_setup_tail(u64 *subkey, u64 *subRL, int max)
+{
+ u64 kw4, tt;
+ u32 dw, tl, tr;
+
+ /* absorb kw2 to other subkeys */
+ /* round 2 */
+ subRL[3] ^= subRL[1];
+ /* round 4 */
+ subRL[5] ^= subRL[1];
+ /* round 6 */
+ subRL[7] ^= subRL[1];
+
+ subRL[1] ^= (subRL[1] & ~subRL[9]) << 32;
+ /* modified for FLinv(kl2) */
+ dw = (subRL[1] & subRL[9]) >> 32;
+ subRL[1] ^= rol32(dw, 1);
+
+ /* round 8 */
+ subRL[11] ^= subRL[1];
+ /* round 10 */
+ subRL[13] ^= subRL[1];
+ /* round 12 */
+ subRL[15] ^= subRL[1];
+
+ subRL[1] ^= (subRL[1] & ~subRL[17]) << 32;
+ /* modified for FLinv(kl4) */
+ dw = (subRL[1] & subRL[17]) >> 32;
+ subRL[1] ^= rol32(dw, 1);
+
+ /* round 14 */
+ subRL[19] ^= subRL[1];
+ /* round 16 */
+ subRL[21] ^= subRL[1];
+ /* round 18 */
+ subRL[23] ^= subRL[1];
+
+ if (max == 24) {
+ /* kw3 */
+ subRL[24] ^= subRL[1];
+
+ /* absorb kw4 to other subkeys */
+ kw4 = subRL[25];
+ } else {
+ subRL[1] ^= (subRL[1] & ~subRL[25]) << 32;
+ /* modified for FLinv(kl6) */
+ dw = (subRL[1] & subRL[25]) >> 32;
+ subRL[1] ^= rol32(dw, 1);
+
+ /* round 20 */
+ subRL[27] ^= subRL[1];
+ /* round 22 */
+ subRL[29] ^= subRL[1];
+ /* round 24 */
+ subRL[31] ^= subRL[1];
+ /* kw3 */
+ subRL[32] ^= subRL[1];
+
+ /* absorb kw4 to other subkeys */
+ kw4 = subRL[33];
+ /* round 23 */
+ subRL[30] ^= kw4;
+ /* round 21 */
+ subRL[28] ^= kw4;
+ /* round 19 */
+ subRL[26] ^= kw4;
+
+ kw4 ^= (kw4 & ~subRL[24]) << 32;
+ /* modified for FL(kl5) */
+ dw = (kw4 & subRL[24]) >> 32;
+ kw4 ^= rol32(dw, 1);
+ }
+
+ /* round 17 */
+ subRL[22] ^= kw4;
+ /* round 15 */
+ subRL[20] ^= kw4;
+ /* round 13 */
+ subRL[18] ^= kw4;
+
+ kw4 ^= (kw4 & ~subRL[16]) << 32;
+ /* modified for FL(kl3) */
+ dw = (kw4 & subRL[16]) >> 32;
+ kw4 ^= rol32(dw, 1);
+
+ /* round 11 */
+ subRL[14] ^= kw4;
+ /* round 9 */
+ subRL[12] ^= kw4;
+ /* round 7 */
+ subRL[10] ^= kw4;
+
+ kw4 ^= (kw4 & ~subRL[8]) << 32;
+ /* modified for FL(kl1) */
+ dw = (kw4 & subRL[8]) >> 32;
+ kw4 ^= rol32(dw, 1);
+
+ /* round 5 */
+ subRL[6] ^= kw4;
+ /* round 3 */
+ subRL[4] ^= kw4;
+ /* round 1 */
+ subRL[2] ^= kw4;
+ /* kw1 */
+ subRL[0] ^= kw4;
+
+ /* key XOR is end of F-function */
+ SET_SUBKEY_LR(0, subRL[0] ^ subRL[2]); /* kw1 */
+ SET_SUBKEY_LR(2, subRL[3]); /* round 1 */
+ SET_SUBKEY_LR(3, subRL[2] ^ subRL[4]); /* round 2 */
+ SET_SUBKEY_LR(4, subRL[3] ^ subRL[5]); /* round 3 */
+ SET_SUBKEY_LR(5, subRL[4] ^ subRL[6]); /* round 4 */
+ SET_SUBKEY_LR(6, subRL[5] ^ subRL[7]); /* round 5 */
+
+ tl = (subRL[10] >> 32) ^ (subRL[10] & ~subRL[8]);
+ dw = tl & (subRL[8] >> 32); /* FL(kl1) */
+ tr = subRL[10] ^ rol32(dw, 1);
+ tt = (tr | ((u64)tl << 32));
+
+ SET_SUBKEY_LR(7, subRL[6] ^ tt); /* round 6 */
+ SET_SUBKEY_LR(8, subRL[8]); /* FL(kl1) */
+ SET_SUBKEY_LR(9, subRL[9]); /* FLinv(kl2) */
+
+ tl = (subRL[7] >> 32) ^ (subRL[7] & ~subRL[9]);
+ dw = tl & (subRL[9] >> 32); /* FLinv(kl2) */
+ tr = subRL[7] ^ rol32(dw, 1);
+ tt = (tr | ((u64)tl << 32));
+
+ SET_SUBKEY_LR(10, subRL[11] ^ tt); /* round 7 */
+ SET_SUBKEY_LR(11, subRL[10] ^ subRL[12]); /* round 8 */
+ SET_SUBKEY_LR(12, subRL[11] ^ subRL[13]); /* round 9 */
+ SET_SUBKEY_LR(13, subRL[12] ^ subRL[14]); /* round 10 */
+ SET_SUBKEY_LR(14, subRL[13] ^ subRL[15]); /* round 11 */
+
+ tl = (subRL[18] >> 32) ^ (subRL[18] & ~subRL[16]);
+ dw = tl & (subRL[16] >> 32); /* FL(kl3) */
+ tr = subRL[18] ^ rol32(dw, 1);
+ tt = (tr | ((u64)tl << 32));
+
+ SET_SUBKEY_LR(15, subRL[14] ^ tt); /* round 12 */
+ SET_SUBKEY_LR(16, subRL[16]); /* FL(kl3) */
+ SET_SUBKEY_LR(17, subRL[17]); /* FLinv(kl4) */
+
+ tl = (subRL[15] >> 32) ^ (subRL[15] & ~subRL[17]);
+ dw = tl & (subRL[17] >> 32); /* FLinv(kl4) */
+ tr = subRL[15] ^ rol32(dw, 1);
+ tt = (tr | ((u64)tl << 32));
+
+ SET_SUBKEY_LR(18, subRL[19] ^ tt); /* round 13 */
+ SET_SUBKEY_LR(19, subRL[18] ^ subRL[20]); /* round 14 */
+ SET_SUBKEY_LR(20, subRL[19] ^ subRL[21]); /* round 15 */
+ SET_SUBKEY_LR(21, subRL[20] ^ subRL[22]); /* round 16 */
+ SET_SUBKEY_LR(22, subRL[21] ^ subRL[23]); /* round 17 */
+
+ if (max == 24) {
+ SET_SUBKEY_LR(23, subRL[22]); /* round 18 */
+ SET_SUBKEY_LR(24, subRL[24] ^ subRL[23]); /* kw3 */
+ } else {
+ tl = (subRL[26] >> 32) ^ (subRL[26] & ~subRL[24]);
+ dw = tl & (subRL[24] >> 32); /* FL(kl5) */
+ tr = subRL[26] ^ rol32(dw, 1);
+ tt = (tr | ((u64)tl << 32));
+
+ SET_SUBKEY_LR(23, subRL[22] ^ tt); /* round 18 */
+ SET_SUBKEY_LR(24, subRL[24]); /* FL(kl5) */
+ SET_SUBKEY_LR(25, subRL[25]); /* FLinv(kl6) */
+
+ tl = (subRL[23] >> 32) ^ (subRL[23] & ~subRL[25]);
+ dw = tl & (subRL[25] >> 32); /* FLinv(kl6) */
+ tr = subRL[23] ^ rol32(dw, 1);
+ tt = (tr | ((u64)tl << 32));
+
+ SET_SUBKEY_LR(26, subRL[27] ^ tt); /* round 19 */
+ SET_SUBKEY_LR(27, subRL[26] ^ subRL[28]); /* round 20 */
+ SET_SUBKEY_LR(28, subRL[27] ^ subRL[29]); /* round 21 */
+ SET_SUBKEY_LR(29, subRL[28] ^ subRL[30]); /* round 22 */
+ SET_SUBKEY_LR(30, subRL[29] ^ subRL[31]); /* round 23 */
+ SET_SUBKEY_LR(31, subRL[30]); /* round 24 */
+ SET_SUBKEY_LR(32, subRL[32] ^ subRL[31]); /* kw3 */
+ }
+}
+
+static void camellia_setup128(const unsigned char *key, u64 *subkey)
+{
+ u64 kl, kr, ww;
+ u64 subRL[26];
+
+ /**
+ * k == kl || kr (|| is concatenation)
+ */
+ kl = get_unaligned_be64(key);
+ kr = get_unaligned_be64(key + 8);
+
+ /* generate KL dependent subkeys */
+ /* kw1 */
+ subRL[0] = kl;
+ /* kw2 */
+ subRL[1] = kr;
+
+ /* rotation left shift 15bit */
+ ROLDQ(kl, kr, 15);
+
+ /* k3 */
+ subRL[4] = kl;
+ /* k4 */
+ subRL[5] = kr;
+
+ /* rotation left shift 15+30bit */
+ ROLDQ(kl, kr, 30);
+
+ /* k7 */
+ subRL[10] = kl;
+ /* k8 */
+ subRL[11] = kr;
+
+ /* rotation left shift 15+30+15bit */
+ ROLDQ(kl, kr, 15);
+
+ /* k10 */
+ subRL[13] = kr;
+ /* rotation left shift 15+30+15+17 bit */
+ ROLDQ(kl, kr, 17);
+
+ /* kl3 */
+ subRL[16] = kl;
+ /* kl4 */
+ subRL[17] = kr;
+
+ /* rotation left shift 15+30+15+17+17 bit */
+ ROLDQ(kl, kr, 17);
+
+ /* k13 */
+ subRL[18] = kl;
+ /* k14 */
+ subRL[19] = kr;
+
+ /* rotation left shift 15+30+15+17+17+17 bit */
+ ROLDQ(kl, kr, 17);
+
+ /* k17 */
+ subRL[22] = kl;
+ /* k18 */
+ subRL[23] = kr;
+
+ /* generate KA */
+ kl = subRL[0];
+ kr = subRL[1];
+ CAMELLIA_F(kl, CAMELLIA_SIGMA1L, CAMELLIA_SIGMA1R, ww);
+ kr ^= ww;
+ CAMELLIA_F(kr, CAMELLIA_SIGMA2L, CAMELLIA_SIGMA2R, kl);
+
+ /* current status == (kll, klr, w0, w1) */
+ CAMELLIA_F(kl, CAMELLIA_SIGMA3L, CAMELLIA_SIGMA3R, kr);
+ kr ^= ww;
+ CAMELLIA_F(kr, CAMELLIA_SIGMA4L, CAMELLIA_SIGMA4R, ww);
+ kl ^= ww;
+
+ /* generate KA dependent subkeys */
+ /* k1, k2 */
+ subRL[2] = kl;
+ subRL[3] = kr;
+ ROLDQ(kl, kr, 15);
+ /* k5,k6 */
+ subRL[6] = kl;
+ subRL[7] = kr;
+ ROLDQ(kl, kr, 15);
+ /* kl1, kl2 */
+ subRL[8] = kl;
+ subRL[9] = kr;
+ ROLDQ(kl, kr, 15);
+ /* k9 */
+ subRL[12] = kl;
+ ROLDQ(kl, kr, 15);
+ /* k11, k12 */
+ subRL[14] = kl;
+ subRL[15] = kr;
+ ROLDQ(kl, kr, 34);
+ /* k15, k16 */
+ subRL[20] = kl;
+ subRL[21] = kr;
+ ROLDQ(kl, kr, 17);
+ /* kw3, kw4 */
+ subRL[24] = kl;
+ subRL[25] = kr;
+
+ camellia_setup_tail(subkey, subRL, 24);
+}
+
+static void camellia_setup256(const unsigned char *key, u64 *subkey)
+{
+ u64 kl, kr; /* left half of key */
+ u64 krl, krr; /* right half of key */
+ u64 ww; /* temporary variables */
+ u64 subRL[34];
+
+ /**
+ * key = (kl || kr || krl || krr) (|| is concatenation)
+ */
+ kl = get_unaligned_be64(key);
+ kr = get_unaligned_be64(key + 8);
+ krl = get_unaligned_be64(key + 16);
+ krr = get_unaligned_be64(key + 24);
+
+ /* generate KL dependent subkeys */
+ /* kw1 */
+ subRL[0] = kl;
+ /* kw2 */
+ subRL[1] = kr;
+ ROLDQ(kl, kr, 45);
+ /* k9 */
+ subRL[12] = kl;
+ /* k10 */
+ subRL[13] = kr;
+ ROLDQ(kl, kr, 15);
+ /* kl3 */
+ subRL[16] = kl;
+ /* kl4 */
+ subRL[17] = kr;
+ ROLDQ(kl, kr, 17);
+ /* k17 */
+ subRL[22] = kl;
+ /* k18 */
+ subRL[23] = kr;
+ ROLDQ(kl, kr, 34);
+ /* k23 */
+ subRL[30] = kl;
+ /* k24 */
+ subRL[31] = kr;
+
+ /* generate KR dependent subkeys */
+ ROLDQ(krl, krr, 15);
+ /* k3 */
+ subRL[4] = krl;
+ /* k4 */
+ subRL[5] = krr;
+ ROLDQ(krl, krr, 15);
+ /* kl1 */
+ subRL[8] = krl;
+ /* kl2 */
+ subRL[9] = krr;
+ ROLDQ(krl, krr, 30);
+ /* k13 */
+ subRL[18] = krl;
+ /* k14 */
+ subRL[19] = krr;
+ ROLDQ(krl, krr, 34);
+ /* k19 */
+ subRL[26] = krl;
+ /* k20 */
+ subRL[27] = krr;
+ ROLDQ(krl, krr, 34);
+
+ /* generate KA */
+ kl = subRL[0] ^ krl;
+ kr = subRL[1] ^ krr;
+
+ CAMELLIA_F(kl, CAMELLIA_SIGMA1L, CAMELLIA_SIGMA1R, ww);
+ kr ^= ww;
+ CAMELLIA_F(kr, CAMELLIA_SIGMA2L, CAMELLIA_SIGMA2R, kl);
+ kl ^= krl;
+ CAMELLIA_F(kl, CAMELLIA_SIGMA3L, CAMELLIA_SIGMA3R, kr);
+ kr ^= ww ^ krr;
+ CAMELLIA_F(kr, CAMELLIA_SIGMA4L, CAMELLIA_SIGMA4R, ww);
+ kl ^= ww;
+
+ /* generate KB */
+ krl ^= kl;
+ krr ^= kr;
+ CAMELLIA_F(krl, CAMELLIA_SIGMA5L, CAMELLIA_SIGMA5R, ww);
+ krr ^= ww;
+ CAMELLIA_F(krr, CAMELLIA_SIGMA6L, CAMELLIA_SIGMA6R, ww);
+ krl ^= ww;
+
+ /* generate KA dependent subkeys */
+ ROLDQ(kl, kr, 15);
+ /* k5 */
+ subRL[6] = kl;
+ /* k6 */
+ subRL[7] = kr;
+ ROLDQ(kl, kr, 30);
+ /* k11 */
+ subRL[14] = kl;
+ /* k12 */
+ subRL[15] = kr;
+ /* rotation left shift 32bit */
+ ROLDQ(kl, kr, 32);
+ /* kl5 */
+ subRL[24] = kl;
+ /* kl6 */
+ subRL[25] = kr;
+ /* rotation left shift 17 from k11,k12 -> k21,k22 */
+ ROLDQ(kl, kr, 17);
+ /* k21 */
+ subRL[28] = kl;
+ /* k22 */
+ subRL[29] = kr;
+
+ /* generate KB dependent subkeys */
+ /* k1 */
+ subRL[2] = krl;
+ /* k2 */
+ subRL[3] = krr;
+ ROLDQ(krl, krr, 30);
+ /* k7 */
+ subRL[10] = krl;
+ /* k8 */
+ subRL[11] = krr;
+ ROLDQ(krl, krr, 30);
+ /* k15 */
+ subRL[20] = krl;
+ /* k16 */
+ subRL[21] = krr;
+ ROLDQ(krl, krr, 51);
+ /* kw3 */
+ subRL[32] = krl;
+ /* kw4 */
+ subRL[33] = krr;
+
+ camellia_setup_tail(subkey, subRL, 32);
+}
+
+static void camellia_setup192(const unsigned char *key, u64 *subkey)
+{
+ unsigned char kk[32];
+ u64 krl, krr;
+
+ memcpy(kk, key, 24);
+ memcpy((unsigned char *)&krl, key+16, 8);
+ krr = ~krl;
+ memcpy(kk+24, (unsigned char *)&krr, 8);
+ camellia_setup256(kk, subkey);
+}
+
+int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key,
+ unsigned int key_len, u32 *flags)
+{
+ if (key_len != 16 && key_len != 24 && key_len != 32) {
+ *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+
+ cctx->key_length = key_len;
+
+ switch (key_len) {
+ case 16:
+ camellia_setup128(key, cctx->key_table);
+ break;
+ case 24:
+ camellia_setup192(key, cctx->key_table);
+ break;
+ case 32:
+ camellia_setup256(key, cctx->key_table);
+ break;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__camellia_setkey);
+
+static int camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ return __camellia_setkey(crypto_tfm_ctx(tfm), key, key_len,
+ &tfm->crt_flags);
+}
+
+static int camellia_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ return camellia_setkey(&tfm->base, key, key_len);
+}
+
+void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
+{
+ u128 iv = *src;
+
+ camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src);
+
+ u128_xor(&dst[1], &dst[1], &iv);
+}
+EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way);
+
+void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ be128 ctrblk;
+
+ if (dst != src)
+ *dst = *src;
+
+ le128_to_be128(&ctrblk, iv);
+ le128_inc(iv);
+
+ camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
+}
+EXPORT_SYMBOL_GPL(camellia_crypt_ctr);
+
+void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ be128 ctrblks[2];
+
+ if (dst != src) {
+ dst[0] = src[0];
+ dst[1] = src[1];
+ }
+
+ le128_to_be128(&ctrblks[0], iv);
+ le128_inc(iv);
+ le128_to_be128(&ctrblks[1], iv);
+ le128_inc(iv);
+
+ camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);
+}
+EXPORT_SYMBOL_GPL(camellia_crypt_ctr_2way);
+
+static const struct common_glue_ctx camellia_enc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = -1,
+
+ .funcs = { {
+ .num_blocks = 2,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
+ } }
+};
+
+static const struct common_glue_ctx camellia_ctr = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = -1,
+
+ .funcs = { {
+ .num_blocks = 2,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
+ } }
+};
+
+static const struct common_glue_ctx camellia_dec = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = -1,
+
+ .funcs = { {
+ .num_blocks = 2,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
+ } }
+};
+
+static const struct common_glue_ctx camellia_dec_cbc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = -1,
+
+ .funcs = { {
+ .num_blocks = 2,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
+ } }
+};
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&camellia_enc, req);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&camellia_dec, req);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(camellia_enc_blk),
+ req);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return glue_ctr_req_128bit(&camellia_ctr, req);
+}
+
+static struct crypto_alg camellia_cipher_alg = {
+ .cra_name = "camellia",
+ .cra_driver_name = "camellia-asm",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct camellia_ctx),
+ .cra_alignmask = 0,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .cia_max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .cia_setkey = camellia_setkey,
+ .cia_encrypt = camellia_encrypt,
+ .cia_decrypt = camellia_decrypt
+ }
+ }
+};
+
+static struct skcipher_alg camellia_skcipher_algs[] = {
+ {
+ .base.cra_name = "ecb(camellia)",
+ .base.cra_driver_name = "ecb-camellia-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .setkey = camellia_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "cbc(camellia)",
+ .base.cra_driver_name = "cbc-camellia-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .ivsize = CAMELLIA_BLOCK_SIZE,
+ .setkey = camellia_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "ctr(camellia)",
+ .base.cra_driver_name = "ctr-camellia-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct camellia_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAMELLIA_MIN_KEY_SIZE,
+ .max_keysize = CAMELLIA_MAX_KEY_SIZE,
+ .ivsize = CAMELLIA_BLOCK_SIZE,
+ .chunksize = CAMELLIA_BLOCK_SIZE,
+ .setkey = camellia_setkey_skcipher,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }
+};
+
+static bool is_blacklisted_cpu(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ if (boot_cpu_data.x86 == 0x0f) {
+ /*
+ * On Pentium 4, camellia-asm is slower than original assembler
+ * implementation because excessive uses of 64bit rotate and
+ * left-shifts (which are really slow on P4) needed to store and
+ * handle 128bit block in two 64bit registers.
+ */
+ return true;
+ }
+
+ return false;
+}
+
+static int force;
+module_param(force, int, 0);
+MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
+
+static int __init init(void)
+{
+ int err;
+
+ if (!force && is_blacklisted_cpu()) {
+ printk(KERN_INFO
+ "camellia-x86_64: performance on this CPU "
+ "would be suboptimal: disabling "
+ "camellia-x86_64.\n");
+ return -ENODEV;
+ }
+
+ err = crypto_register_alg(&camellia_cipher_alg);
+ if (err)
+ return err;
+
+ err = crypto_register_skciphers(camellia_skcipher_algs,
+ ARRAY_SIZE(camellia_skcipher_algs));
+ if (err)
+ crypto_unregister_alg(&camellia_cipher_alg);
+
+ return err;
+}
+
+static void __exit fini(void)
+{
+ crypto_unregister_alg(&camellia_cipher_alg);
+ crypto_unregister_skciphers(camellia_skcipher_algs,
+ ARRAY_SIZE(camellia_skcipher_algs));
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Camellia Cipher Algorithm, asm optimized");
+MODULE_ALIAS_CRYPTO("camellia");
+MODULE_ALIAS_CRYPTO("camellia-asm");
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
new file mode 100644
index 000000000..86107c961
--- /dev/null
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -0,0 +1,578 @@
+/*
+ * Cast5 Cipher 16-way parallel algorithm (AVX/x86_64)
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+.file "cast5-avx-x86_64-asm_64.S"
+
+.extern cast_s1
+.extern cast_s2
+.extern cast_s3
+.extern cast_s4
+
+/* structure of crypto context */
+#define km 0
+#define kr (16*4)
+#define rr ((16*4)+16)
+
+/* s-boxes */
+#define s1 cast_s1
+#define s2 cast_s2
+#define s3 cast_s3
+#define s4 cast_s4
+
+/**********************************************************************
+ 16-way AVX cast5
+ **********************************************************************/
+#define CTX %r15
+
+#define RL1 %xmm0
+#define RR1 %xmm1
+#define RL2 %xmm2
+#define RR2 %xmm3
+#define RL3 %xmm4
+#define RR3 %xmm5
+#define RL4 %xmm6
+#define RR4 %xmm7
+
+#define RX %xmm8
+
+#define RKM %xmm9
+#define RKR %xmm10
+#define RKRF %xmm11
+#define RKRR %xmm12
+
+#define R32 %xmm13
+#define R1ST %xmm14
+
+#define RTMP %xmm15
+
+#define RID1 %rdi
+#define RID1d %edi
+#define RID2 %rsi
+#define RID2d %esi
+
+#define RGI1 %rdx
+#define RGI1bl %dl
+#define RGI1bh %dh
+#define RGI2 %rcx
+#define RGI2bl %cl
+#define RGI2bh %ch
+
+#define RGI3 %rax
+#define RGI3bl %al
+#define RGI3bh %ah
+#define RGI4 %rbx
+#define RGI4bl %bl
+#define RGI4bh %bh
+
+#define RFS1 %r8
+#define RFS1d %r8d
+#define RFS2 %r9
+#define RFS2d %r9d
+#define RFS3 %r10
+#define RFS3d %r10d
+
+
+#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
+ movzbl src ## bh, RID1d; \
+ movzbl src ## bl, RID2d; \
+ shrq $16, src; \
+ movl s1(, RID1, 4), dst ## d; \
+ op1 s2(, RID2, 4), dst ## d; \
+ movzbl src ## bh, RID1d; \
+ movzbl src ## bl, RID2d; \
+ interleave_op(il_reg); \
+ op2 s3(, RID1, 4), dst ## d; \
+ op3 s4(, RID2, 4), dst ## d;
+
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+ shrq $16, reg;
+
+#define F_head(a, x, gi1, gi2, op0) \
+ op0 a, RKM, x; \
+ vpslld RKRF, x, RTMP; \
+ vpsrld RKRR, x, x; \
+ vpor RTMP, x, x; \
+ \
+ vmovq x, gi1; \
+ vpextrq $1, x, gi2;
+
+#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
+ lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
+ lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
+ \
+ lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
+ shlq $32, RFS2; \
+ orq RFS1, RFS2; \
+ lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
+ shlq $32, RFS1; \
+ orq RFS1, RFS3; \
+ \
+ vmovq RFS2, x; \
+ vpinsrq $1, RFS3, x, x;
+
+#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
+ F_head(b1, RX, RGI1, RGI2, op0); \
+ F_head(b2, RX, RGI3, RGI4, op0); \
+ \
+ F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
+ F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
+ \
+ vpxor a1, RX, a1; \
+ vpxor a2, RTMP, a2;
+
+#define F1_2(a1, b1, a2, b2) \
+ F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
+#define F2_2(a1, b1, a2, b2) \
+ F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
+#define F3_2(a1, b1, a2, b2) \
+ F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
+
+#define subround(a1, b1, a2, b2, f) \
+ F ## f ## _2(a1, b1, a2, b2);
+
+#define round(l, r, n, f) \
+ vbroadcastss (km+(4*n))(CTX), RKM; \
+ vpand R1ST, RKR, RKRF; \
+ vpsubq RKRF, R32, RKRR; \
+ vpsrldq $1, RKR, RKR; \
+ subround(l ## 1, r ## 1, l ## 2, r ## 2, f); \
+ subround(l ## 3, r ## 3, l ## 4, r ## 4, f);
+
+#define enc_preload_rkr() \
+ vbroadcastss .L16_mask, RKR; \
+ /* add 16-bit rotation to key rotations (mod 32) */ \
+ vpxor kr(CTX), RKR, RKR;
+
+#define dec_preload_rkr() \
+ vbroadcastss .L16_mask, RKR; \
+ /* add 16-bit rotation to key rotations (mod 32) */ \
+ vpxor kr(CTX), RKR, RKR; \
+ vpshufb .Lbswap128_mask, RKR, RKR;
+
+#define transpose_2x4(x0, x1, t0, t1) \
+ vpunpckldq x1, x0, t0; \
+ vpunpckhdq x1, x0, t1; \
+ \
+ vpunpcklqdq t1, t0, x0; \
+ vpunpckhqdq t1, t0, x1;
+
+#define inpack_blocks(x0, x1, t0, t1, rmask) \
+ vpshufb rmask, x0, x0; \
+ vpshufb rmask, x1, x1; \
+ \
+ transpose_2x4(x0, x1, t0, t1)
+
+#define outunpack_blocks(x0, x1, t0, t1, rmask) \
+ transpose_2x4(x0, x1, t0, t1) \
+ \
+ vpshufb rmask, x0, x0; \
+ vpshufb rmask, x1, x1;
+
+.section .rodata.cst16.bswap_mask, "aM", @progbits, 16
+.align 16
+.Lbswap_mask:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
+.align 16
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.section .rodata.cst16.bswap_iv_mask, "aM", @progbits, 16
+.align 16
+.Lbswap_iv_mask:
+ .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
+
+.section .rodata.cst4.16_mask, "aM", @progbits, 4
+.align 4
+.L16_mask:
+ .byte 16, 16, 16, 16
+.section .rodata.cst4.32_mask, "aM", @progbits, 4
+.align 4
+.L32_mask:
+ .byte 32, 0, 0, 0
+.section .rodata.cst4.first_mask, "aM", @progbits, 4
+.align 4
+.Lfirst_mask:
+ .byte 0x1f, 0, 0, 0
+
+.text
+
+.align 16
+__cast5_enc_blk16:
+ /* input:
+ * %rdi: ctx
+ * RL1: blocks 1 and 2
+ * RR1: blocks 3 and 4
+ * RL2: blocks 5 and 6
+ * RR2: blocks 7 and 8
+ * RL3: blocks 9 and 10
+ * RR3: blocks 11 and 12
+ * RL4: blocks 13 and 14
+ * RR4: blocks 15 and 16
+ * output:
+ * RL1: encrypted blocks 1 and 2
+ * RR1: encrypted blocks 3 and 4
+ * RL2: encrypted blocks 5 and 6
+ * RR2: encrypted blocks 7 and 8
+ * RL3: encrypted blocks 9 and 10
+ * RR3: encrypted blocks 11 and 12
+ * RL4: encrypted blocks 13 and 14
+ * RR4: encrypted blocks 15 and 16
+ */
+
+ pushq %r15;
+ pushq %rbx;
+
+ movq %rdi, CTX;
+
+ vmovdqa .Lbswap_mask, RKM;
+ vmovd .Lfirst_mask, R1ST;
+ vmovd .L32_mask, R32;
+ enc_preload_rkr();
+
+ inpack_blocks(RL1, RR1, RTMP, RX, RKM);
+ inpack_blocks(RL2, RR2, RTMP, RX, RKM);
+ inpack_blocks(RL3, RR3, RTMP, RX, RKM);
+ inpack_blocks(RL4, RR4, RTMP, RX, RKM);
+
+ round(RL, RR, 0, 1);
+ round(RR, RL, 1, 2);
+ round(RL, RR, 2, 3);
+ round(RR, RL, 3, 1);
+ round(RL, RR, 4, 2);
+ round(RR, RL, 5, 3);
+ round(RL, RR, 6, 1);
+ round(RR, RL, 7, 2);
+ round(RL, RR, 8, 3);
+ round(RR, RL, 9, 1);
+ round(RL, RR, 10, 2);
+ round(RR, RL, 11, 3);
+
+ movzbl rr(CTX), %eax;
+ testl %eax, %eax;
+ jnz .L__skip_enc;
+
+ round(RL, RR, 12, 1);
+ round(RR, RL, 13, 2);
+ round(RL, RR, 14, 3);
+ round(RR, RL, 15, 1);
+
+.L__skip_enc:
+ popq %rbx;
+ popq %r15;
+
+ vmovdqa .Lbswap_mask, RKM;
+
+ outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
+ outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
+ outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
+ outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
+
+ ret;
+ENDPROC(__cast5_enc_blk16)
+
+.align 16
+__cast5_dec_blk16:
+ /* input:
+ * %rdi: ctx
+ * RL1: encrypted blocks 1 and 2
+ * RR1: encrypted blocks 3 and 4
+ * RL2: encrypted blocks 5 and 6
+ * RR2: encrypted blocks 7 and 8
+ * RL3: encrypted blocks 9 and 10
+ * RR3: encrypted blocks 11 and 12
+ * RL4: encrypted blocks 13 and 14
+ * RR4: encrypted blocks 15 and 16
+ * output:
+ * RL1: decrypted blocks 1 and 2
+ * RR1: decrypted blocks 3 and 4
+ * RL2: decrypted blocks 5 and 6
+ * RR2: decrypted blocks 7 and 8
+ * RL3: decrypted blocks 9 and 10
+ * RR3: decrypted blocks 11 and 12
+ * RL4: decrypted blocks 13 and 14
+ * RR4: decrypted blocks 15 and 16
+ */
+
+ pushq %r15;
+ pushq %rbx;
+
+ movq %rdi, CTX;
+
+ vmovdqa .Lbswap_mask, RKM;
+ vmovd .Lfirst_mask, R1ST;
+ vmovd .L32_mask, R32;
+ dec_preload_rkr();
+
+ inpack_blocks(RL1, RR1, RTMP, RX, RKM);
+ inpack_blocks(RL2, RR2, RTMP, RX, RKM);
+ inpack_blocks(RL3, RR3, RTMP, RX, RKM);
+ inpack_blocks(RL4, RR4, RTMP, RX, RKM);
+
+ movzbl rr(CTX), %eax;
+ testl %eax, %eax;
+ jnz .L__skip_dec;
+
+ round(RL, RR, 15, 1);
+ round(RR, RL, 14, 3);
+ round(RL, RR, 13, 2);
+ round(RR, RL, 12, 1);
+
+.L__dec_tail:
+ round(RL, RR, 11, 3);
+ round(RR, RL, 10, 2);
+ round(RL, RR, 9, 1);
+ round(RR, RL, 8, 3);
+ round(RL, RR, 7, 2);
+ round(RR, RL, 6, 1);
+ round(RL, RR, 5, 3);
+ round(RR, RL, 4, 2);
+ round(RL, RR, 3, 1);
+ round(RR, RL, 2, 3);
+ round(RL, RR, 1, 2);
+ round(RR, RL, 0, 1);
+
+ vmovdqa .Lbswap_mask, RKM;
+ popq %rbx;
+ popq %r15;
+
+ outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
+ outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
+ outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
+ outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
+
+ ret;
+
+.L__skip_dec:
+ vpsrldq $4, RKR, RKR;
+ jmp .L__dec_tail;
+ENDPROC(__cast5_dec_blk16)
+
+ENTRY(cast5_ecb_enc_16way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+ pushq %r15;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+
+ vmovdqu (0*4*4)(%rdx), RL1;
+ vmovdqu (1*4*4)(%rdx), RR1;
+ vmovdqu (2*4*4)(%rdx), RL2;
+ vmovdqu (3*4*4)(%rdx), RR2;
+ vmovdqu (4*4*4)(%rdx), RL3;
+ vmovdqu (5*4*4)(%rdx), RR3;
+ vmovdqu (6*4*4)(%rdx), RL4;
+ vmovdqu (7*4*4)(%rdx), RR4;
+
+ call __cast5_enc_blk16;
+
+ vmovdqu RR1, (0*4*4)(%r11);
+ vmovdqu RL1, (1*4*4)(%r11);
+ vmovdqu RR2, (2*4*4)(%r11);
+ vmovdqu RL2, (3*4*4)(%r11);
+ vmovdqu RR3, (4*4*4)(%r11);
+ vmovdqu RL3, (5*4*4)(%r11);
+ vmovdqu RR4, (6*4*4)(%r11);
+ vmovdqu RL4, (7*4*4)(%r11);
+
+ popq %r15;
+ FRAME_END
+ ret;
+ENDPROC(cast5_ecb_enc_16way)
+
+ENTRY(cast5_ecb_dec_16way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+ pushq %r15;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+
+ vmovdqu (0*4*4)(%rdx), RL1;
+ vmovdqu (1*4*4)(%rdx), RR1;
+ vmovdqu (2*4*4)(%rdx), RL2;
+ vmovdqu (3*4*4)(%rdx), RR2;
+ vmovdqu (4*4*4)(%rdx), RL3;
+ vmovdqu (5*4*4)(%rdx), RR3;
+ vmovdqu (6*4*4)(%rdx), RL4;
+ vmovdqu (7*4*4)(%rdx), RR4;
+
+ call __cast5_dec_blk16;
+
+ vmovdqu RR1, (0*4*4)(%r11);
+ vmovdqu RL1, (1*4*4)(%r11);
+ vmovdqu RR2, (2*4*4)(%r11);
+ vmovdqu RL2, (3*4*4)(%r11);
+ vmovdqu RR3, (4*4*4)(%r11);
+ vmovdqu RL3, (5*4*4)(%r11);
+ vmovdqu RR4, (6*4*4)(%r11);
+ vmovdqu RL4, (7*4*4)(%r11);
+
+ popq %r15;
+ FRAME_END
+ ret;
+ENDPROC(cast5_ecb_dec_16way)
+
+ENTRY(cast5_cbc_dec_16way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+ pushq %r12;
+ pushq %r15;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ vmovdqu (0*16)(%rdx), RL1;
+ vmovdqu (1*16)(%rdx), RR1;
+ vmovdqu (2*16)(%rdx), RL2;
+ vmovdqu (3*16)(%rdx), RR2;
+ vmovdqu (4*16)(%rdx), RL3;
+ vmovdqu (5*16)(%rdx), RR3;
+ vmovdqu (6*16)(%rdx), RL4;
+ vmovdqu (7*16)(%rdx), RR4;
+
+ call __cast5_dec_blk16;
+
+ /* xor with src */
+ vmovq (%r12), RX;
+ vpshufd $0x4f, RX, RX;
+ vpxor RX, RR1, RR1;
+ vpxor 0*16+8(%r12), RL1, RL1;
+ vpxor 1*16+8(%r12), RR2, RR2;
+ vpxor 2*16+8(%r12), RL2, RL2;
+ vpxor 3*16+8(%r12), RR3, RR3;
+ vpxor 4*16+8(%r12), RL3, RL3;
+ vpxor 5*16+8(%r12), RR4, RR4;
+ vpxor 6*16+8(%r12), RL4, RL4;
+
+ vmovdqu RR1, (0*16)(%r11);
+ vmovdqu RL1, (1*16)(%r11);
+ vmovdqu RR2, (2*16)(%r11);
+ vmovdqu RL2, (3*16)(%r11);
+ vmovdqu RR3, (4*16)(%r11);
+ vmovdqu RL3, (5*16)(%r11);
+ vmovdqu RR4, (6*16)(%r11);
+ vmovdqu RL4, (7*16)(%r11);
+
+ popq %r15;
+ popq %r12;
+ FRAME_END
+ ret;
+ENDPROC(cast5_cbc_dec_16way)
+
+ENTRY(cast5_ctr_16way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (big endian, 64bit)
+ */
+ FRAME_BEGIN
+ pushq %r12;
+ pushq %r15;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ vpcmpeqd RTMP, RTMP, RTMP;
+ vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */
+
+ vpcmpeqd RKR, RKR, RKR;
+ vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
+ vmovdqa .Lbswap_iv_mask, R1ST;
+ vmovdqa .Lbswap128_mask, RKM;
+
+ /* load IV and byteswap */
+ vmovq (%rcx), RX;
+ vpshufb R1ST, RX, RX;
+
+ /* construct IVs */
+ vpsubq RTMP, RX, RX; /* le: IV1, IV0 */
+ vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
+ vpsubq RKR, RX, RX;
+ vpshufb RKM, RX, RR4; /* be: IV14, IV15 */
+
+ /* store last IV */
+ vpsubq RTMP, RX, RX; /* le: IV16, IV14 */
+ vpshufb R1ST, RX, RX; /* be: IV16, IV16 */
+ vmovq RX, (%rcx);
+
+ call __cast5_enc_blk16;
+
+ /* dst = src ^ iv */
+ vpxor (0*16)(%r12), RR1, RR1;
+ vpxor (1*16)(%r12), RL1, RL1;
+ vpxor (2*16)(%r12), RR2, RR2;
+ vpxor (3*16)(%r12), RL2, RL2;
+ vpxor (4*16)(%r12), RR3, RR3;
+ vpxor (5*16)(%r12), RL3, RL3;
+ vpxor (6*16)(%r12), RR4, RR4;
+ vpxor (7*16)(%r12), RL4, RL4;
+ vmovdqu RR1, (0*16)(%r11);
+ vmovdqu RL1, (1*16)(%r11);
+ vmovdqu RR2, (2*16)(%r11);
+ vmovdqu RL2, (3*16)(%r11);
+ vmovdqu RR3, (4*16)(%r11);
+ vmovdqu RL3, (5*16)(%r11);
+ vmovdqu RR4, (6*16)(%r11);
+ vmovdqu RL4, (7*16)(%r11);
+
+ popq %r15;
+ popq %r12;
+ FRAME_END
+ ret;
+ENDPROC(cast5_ctr_16way)
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
new file mode 100644
index 000000000..41034745d
--- /dev/null
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -0,0 +1,392 @@
+/*
+ * Glue Code for the AVX assembler implemention of the Cast5 Cipher
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <asm/crypto/glue_helper.h>
+#include <crypto/algapi.h>
+#include <crypto/cast5.h>
+#include <crypto/internal/simd.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#define CAST5_PARALLEL_BLOCKS 16
+
+asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src,
+ __be64 *iv);
+
+static int cast5_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ return cast5_setkey(&tfm->base, key, keylen);
+}
+
+static inline bool cast5_fpu_begin(bool fpu_enabled, struct skcipher_walk *walk,
+ unsigned int nbytes)
+{
+ return glue_fpu_begin(CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS,
+ walk, fpu_enabled, nbytes);
+}
+
+static inline void cast5_fpu_end(bool fpu_enabled)
+{
+ return glue_fpu_end(fpu_enabled);
+}
+
+static int ecb_crypt(struct skcipher_request *req, bool enc)
+{
+ bool fpu_enabled = false;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ const unsigned int bsize = CAST5_BLOCK_SIZE;
+ unsigned int nbytes;
+ void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src);
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ u8 *wsrc = walk.src.virt.addr;
+ u8 *wdst = walk.dst.virt.addr;
+
+ fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes);
+
+ /* Process multi-block batch */
+ if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
+ fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way;
+ do {
+ fn(ctx, wdst, wsrc);
+
+ wsrc += bsize * CAST5_PARALLEL_BLOCKS;
+ wdst += bsize * CAST5_PARALLEL_BLOCKS;
+ nbytes -= bsize * CAST5_PARALLEL_BLOCKS;
+ } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ fn = (enc) ? __cast5_encrypt : __cast5_decrypt;
+
+ /* Handle leftovers */
+ do {
+ fn(ctx, wdst, wsrc);
+
+ wsrc += bsize;
+ wdst += bsize;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+done:
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ cast5_fpu_end(fpu_enabled);
+ return err;
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return ecb_crypt(req, true);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return ecb_crypt(req, false);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ const unsigned int bsize = CAST5_BLOCK_SIZE;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ u64 *src = (u64 *)walk.src.virt.addr;
+ u64 *dst = (u64 *)walk.dst.virt.addr;
+ u64 *iv = (u64 *)walk.iv;
+
+ do {
+ *dst = *src ^ *iv;
+ __cast5_encrypt(ctx, (u8 *)dst, (u8 *)dst);
+ iv = dst;
+ src++;
+ dst++;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+ *(u64 *)walk.iv = *iv;
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static unsigned int __cbc_decrypt(struct cast5_ctx *ctx,
+ struct skcipher_walk *walk)
+{
+ const unsigned int bsize = CAST5_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+ u64 last_iv;
+
+ /* Start of the last block. */
+ src += nbytes / bsize - 1;
+ dst += nbytes / bsize - 1;
+
+ last_iv = *src;
+
+ /* Process multi-block batch */
+ if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
+ do {
+ nbytes -= bsize * (CAST5_PARALLEL_BLOCKS - 1);
+ src -= CAST5_PARALLEL_BLOCKS - 1;
+ dst -= CAST5_PARALLEL_BLOCKS - 1;
+
+ cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src);
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ goto done;
+
+ *dst ^= *(src - 1);
+ src -= 1;
+ dst -= 1;
+ } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
+ }
+
+ /* Handle leftovers */
+ for (;;) {
+ __cast5_decrypt(ctx, (u8 *)dst, (u8 *)src);
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ break;
+
+ *dst ^= *(src - 1);
+ src -= 1;
+ dst -= 1;
+ }
+
+done:
+ *dst ^= *(u64 *)walk->iv;
+ *(u64 *)walk->iv = last_iv;
+
+ return nbytes;
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm);
+ bool fpu_enabled = false;
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes);
+ nbytes = __cbc_decrypt(ctx, &walk);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ cast5_fpu_end(fpu_enabled);
+ return err;
+}
+
+static void ctr_crypt_final(struct skcipher_walk *walk, struct cast5_ctx *ctx)
+{
+ u8 *ctrblk = walk->iv;
+ u8 keystream[CAST5_BLOCK_SIZE];
+ u8 *src = walk->src.virt.addr;
+ u8 *dst = walk->dst.virt.addr;
+ unsigned int nbytes = walk->nbytes;
+
+ __cast5_encrypt(ctx, keystream, ctrblk);
+ crypto_xor_cpy(dst, keystream, src, nbytes);
+
+ crypto_inc(ctrblk, CAST5_BLOCK_SIZE);
+}
+
+static unsigned int __ctr_crypt(struct skcipher_walk *walk,
+ struct cast5_ctx *ctx)
+{
+ const unsigned int bsize = CAST5_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+
+ /* Process multi-block batch */
+ if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
+ do {
+ cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src,
+ (__be64 *)walk->iv);
+
+ src += CAST5_PARALLEL_BLOCKS;
+ dst += CAST5_PARALLEL_BLOCKS;
+ nbytes -= bsize * CAST5_PARALLEL_BLOCKS;
+ } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ u64 ctrblk;
+
+ if (dst != src)
+ *dst = *src;
+
+ ctrblk = *(u64 *)walk->iv;
+ be64_add_cpu((__be64 *)walk->iv, 1);
+
+ __cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+ *dst ^= ctrblk;
+
+ src += 1;
+ dst += 1;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+done:
+ return nbytes;
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm);
+ bool fpu_enabled = false;
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
+ fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes);
+ nbytes = __ctr_crypt(&walk, ctx);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ cast5_fpu_end(fpu_enabled);
+
+ if (walk.nbytes) {
+ ctr_crypt_final(&walk, ctx);
+ err = skcipher_walk_done(&walk, 0);
+ }
+
+ return err;
+}
+
+static struct skcipher_alg cast5_algs[] = {
+ {
+ .base.cra_name = "__ecb(cast5)",
+ .base.cra_driver_name = "__ecb-cast5-avx",
+ .base.cra_priority = 200,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAST5_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct cast5_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAST5_MIN_KEY_SIZE,
+ .max_keysize = CAST5_MAX_KEY_SIZE,
+ .setkey = cast5_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "__cbc(cast5)",
+ .base.cra_driver_name = "__cbc-cast5-avx",
+ .base.cra_priority = 200,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAST5_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct cast5_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAST5_MIN_KEY_SIZE,
+ .max_keysize = CAST5_MAX_KEY_SIZE,
+ .ivsize = CAST5_BLOCK_SIZE,
+ .setkey = cast5_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "__ctr(cast5)",
+ .base.cra_driver_name = "__ctr-cast5-avx",
+ .base.cra_priority = 200,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct cast5_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAST5_MIN_KEY_SIZE,
+ .max_keysize = CAST5_MAX_KEY_SIZE,
+ .ivsize = CAST5_BLOCK_SIZE,
+ .chunksize = CAST5_BLOCK_SIZE,
+ .setkey = cast5_setkey_skcipher,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }
+};
+
+static struct simd_skcipher_alg *cast5_simd_algs[ARRAY_SIZE(cast5_algs)];
+
+static int __init cast5_init(void)
+{
+ const char *feature_name;
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(cast5_algs,
+ ARRAY_SIZE(cast5_algs),
+ cast5_simd_algs);
+}
+
+static void __exit cast5_exit(void)
+{
+ simd_unregister_skciphers(cast5_algs, ARRAY_SIZE(cast5_algs),
+ cast5_simd_algs);
+}
+
+module_init(cast5_init);
+module_exit(cast5_exit);
+
+MODULE_DESCRIPTION("Cast5 Cipher Algorithm, AVX optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("cast5");
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
new file mode 100644
index 000000000..7f30b6f0d
--- /dev/null
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -0,0 +1,511 @@
+/*
+ * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "glue_helper-asm-avx.S"
+
+.file "cast6-avx-x86_64-asm_64.S"
+
+.extern cast_s1
+.extern cast_s2
+.extern cast_s3
+.extern cast_s4
+
+/* structure of crypto context */
+#define km 0
+#define kr (12*4*4)
+
+/* s-boxes */
+#define s1 cast_s1
+#define s2 cast_s2
+#define s3 cast_s3
+#define s4 cast_s4
+
+/**********************************************************************
+ 8-way AVX cast6
+ **********************************************************************/
+#define CTX %r15
+
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+
+#define RA2 %xmm4
+#define RB2 %xmm5
+#define RC2 %xmm6
+#define RD2 %xmm7
+
+#define RX %xmm8
+
+#define RKM %xmm9
+#define RKR %xmm10
+#define RKRF %xmm11
+#define RKRR %xmm12
+#define R32 %xmm13
+#define R1ST %xmm14
+
+#define RTMP %xmm15
+
+#define RID1 %rdi
+#define RID1d %edi
+#define RID2 %rsi
+#define RID2d %esi
+
+#define RGI1 %rdx
+#define RGI1bl %dl
+#define RGI1bh %dh
+#define RGI2 %rcx
+#define RGI2bl %cl
+#define RGI2bh %ch
+
+#define RGI3 %rax
+#define RGI3bl %al
+#define RGI3bh %ah
+#define RGI4 %rbx
+#define RGI4bl %bl
+#define RGI4bh %bh
+
+#define RFS1 %r8
+#define RFS1d %r8d
+#define RFS2 %r9
+#define RFS2d %r9d
+#define RFS3 %r10
+#define RFS3d %r10d
+
+
+#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
+ movzbl src ## bh, RID1d; \
+ movzbl src ## bl, RID2d; \
+ shrq $16, src; \
+ movl s1(, RID1, 4), dst ## d; \
+ op1 s2(, RID2, 4), dst ## d; \
+ movzbl src ## bh, RID1d; \
+ movzbl src ## bl, RID2d; \
+ interleave_op(il_reg); \
+ op2 s3(, RID1, 4), dst ## d; \
+ op3 s4(, RID2, 4), dst ## d;
+
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+ shrq $16, reg;
+
+#define F_head(a, x, gi1, gi2, op0) \
+ op0 a, RKM, x; \
+ vpslld RKRF, x, RTMP; \
+ vpsrld RKRR, x, x; \
+ vpor RTMP, x, x; \
+ \
+ vmovq x, gi1; \
+ vpextrq $1, x, gi2;
+
+#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
+ lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
+ lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
+ \
+ lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
+ shlq $32, RFS2; \
+ orq RFS1, RFS2; \
+ lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
+ shlq $32, RFS1; \
+ orq RFS1, RFS3; \
+ \
+ vmovq RFS2, x; \
+ vpinsrq $1, RFS3, x, x;
+
+#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
+ F_head(b1, RX, RGI1, RGI2, op0); \
+ F_head(b2, RX, RGI3, RGI4, op0); \
+ \
+ F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
+ F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
+ \
+ vpxor a1, RX, a1; \
+ vpxor a2, RTMP, a2;
+
+#define F1_2(a1, b1, a2, b2) \
+ F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
+#define F2_2(a1, b1, a2, b2) \
+ F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
+#define F3_2(a1, b1, a2, b2) \
+ F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
+
+#define qop(in, out, f) \
+ F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);
+
+#define get_round_keys(nn) \
+ vbroadcastss (km+(4*(nn)))(CTX), RKM; \
+ vpand R1ST, RKR, RKRF; \
+ vpsubq RKRF, R32, RKRR; \
+ vpsrldq $1, RKR, RKR;
+
+#define Q(n) \
+ get_round_keys(4*n+0); \
+ qop(RD, RC, 1); \
+ \
+ get_round_keys(4*n+1); \
+ qop(RC, RB, 2); \
+ \
+ get_round_keys(4*n+2); \
+ qop(RB, RA, 3); \
+ \
+ get_round_keys(4*n+3); \
+ qop(RA, RD, 1);
+
+#define QBAR(n) \
+ get_round_keys(4*n+3); \
+ qop(RA, RD, 1); \
+ \
+ get_round_keys(4*n+2); \
+ qop(RB, RA, 3); \
+ \
+ get_round_keys(4*n+1); \
+ qop(RC, RB, 2); \
+ \
+ get_round_keys(4*n+0); \
+ qop(RD, RC, 1);
+
+#define shuffle(mask) \
+ vpshufb mask, RKR, RKR;
+
+#define preload_rkr(n, do_mask, mask) \
+ vbroadcastss .L16_mask, RKR; \
+ /* add 16-bit rotation to key rotations (mod 32) */ \
+ vpxor (kr+n*16)(CTX), RKR, RKR; \
+ do_mask(mask);
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ vpunpckldq x1, x0, t0; \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x3; \
+ \
+ vpunpcklqdq t1, t0, x0; \
+ vpunpckhqdq t1, t0, x1; \
+ vpunpcklqdq x3, t2, x2; \
+ vpunpckhqdq x3, t2, x3;
+
+#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
+ vpshufb rmask, x0, x0; \
+ vpshufb rmask, x1, x1; \
+ vpshufb rmask, x2, x2; \
+ vpshufb rmask, x3, x3; \
+ \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ vpshufb rmask, x0, x0; \
+ vpshufb rmask, x1, x1; \
+ vpshufb rmask, x2, x2; \
+ vpshufb rmask, x3, x3;
+
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+.Lxts_gf128mul_and_shl1_mask:
+ .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+.Lbswap_mask:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lrkr_enc_Q_Q_QBAR_QBAR:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
+.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.Lrkr_dec_Q_Q_Q_Q:
+ .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
+.Lrkr_dec_Q_Q_QBAR_QBAR:
+ .byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
+.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.section .rodata.cst4.L16_mask, "aM", @progbits, 4
+.align 4
+.L16_mask:
+ .byte 16, 16, 16, 16
+
+.section .rodata.cst4.L32_mask, "aM", @progbits, 4
+.align 4
+.L32_mask:
+ .byte 32, 0, 0, 0
+
+.section .rodata.cst4.first_mask, "aM", @progbits, 4
+.align 4
+.Lfirst_mask:
+ .byte 0x1f, 0, 0, 0
+
+.text
+
+.align 8
+__cast6_enc_blk8:
+ /* input:
+ * %rdi: ctx
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+ * output:
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+ */
+
+ pushq %r15;
+ pushq %rbx;
+
+ movq %rdi, CTX;
+
+ vmovdqa .Lbswap_mask, RKM;
+ vmovd .Lfirst_mask, R1ST;
+ vmovd .L32_mask, R32;
+
+ inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+ inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+
+ preload_rkr(0, dummy, none);
+ Q(0);
+ Q(1);
+ Q(2);
+ Q(3);
+ preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
+ Q(4);
+ Q(5);
+ QBAR(6);
+ QBAR(7);
+ preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
+ QBAR(8);
+ QBAR(9);
+ QBAR(10);
+ QBAR(11);
+
+ popq %rbx;
+ popq %r15;
+
+ vmovdqa .Lbswap_mask, RKM;
+
+ outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+ outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+
+ ret;
+ENDPROC(__cast6_enc_blk8)
+
+.align 8
+__cast6_dec_blk8:
+ /* input:
+ * %rdi: ctx
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+ * output:
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
+ */
+
+ pushq %r15;
+ pushq %rbx;
+
+ movq %rdi, CTX;
+
+ vmovdqa .Lbswap_mask, RKM;
+ vmovd .Lfirst_mask, R1ST;
+ vmovd .L32_mask, R32;
+
+ inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+ inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+
+ preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
+ Q(11);
+ Q(10);
+ Q(9);
+ Q(8);
+ preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
+ Q(7);
+ Q(6);
+ QBAR(5);
+ QBAR(4);
+ preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
+ QBAR(3);
+ QBAR(2);
+ QBAR(1);
+ QBAR(0);
+
+ popq %rbx;
+ popq %r15;
+
+ vmovdqa .Lbswap_mask, RKM;
+ outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+ outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+
+ ret;
+ENDPROC(__cast6_dec_blk8)
+
+ENTRY(cast6_ecb_enc_8way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+ pushq %r15;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __cast6_enc_blk8;
+
+ store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r15;
+ FRAME_END
+ ret;
+ENDPROC(cast6_ecb_enc_8way)
+
+ENTRY(cast6_ecb_dec_8way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+ pushq %r15;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __cast6_dec_blk8;
+
+ store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r15;
+ FRAME_END
+ ret;
+ENDPROC(cast6_ecb_dec_8way)
+
+ENTRY(cast6_cbc_dec_8way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+ pushq %r12;
+ pushq %r15;
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __cast6_dec_blk8;
+
+ store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r15;
+ popq %r12;
+ FRAME_END
+ ret;
+ENDPROC(cast6_cbc_dec_8way)
+
+ENTRY(cast6_ctr_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (little endian, 128bit)
+ */
+ FRAME_BEGIN
+ pushq %r12;
+ pushq %r15
+
+ movq %rdi, CTX;
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+ RD2, RX, RKR, RKM);
+
+ call __cast6_enc_blk8;
+
+ store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r15;
+ popq %r12;
+ FRAME_END
+ ret;
+ENDPROC(cast6_ctr_8way)
+
+ENTRY(cast6_xts_enc_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+ FRAME_BEGIN
+ pushq %r15;
+
+ movq %rdi, CTX
+ movq %rsi, %r11;
+
+ /* regs <= src, dst <= IVs, regs <= regs xor IVs */
+ load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+ RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
+
+ call __cast6_enc_blk8;
+
+ /* dst <= regs xor IVs(in dst) */
+ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r15;
+ FRAME_END
+ ret;
+ENDPROC(cast6_xts_enc_8way)
+
+ENTRY(cast6_xts_dec_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+ FRAME_BEGIN
+ pushq %r15;
+
+ movq %rdi, CTX
+ movq %rsi, %r11;
+
+ /* regs <= src, dst <= IVs, regs <= regs xor IVs */
+ load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+ RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
+
+ call __cast6_dec_blk8;
+
+ /* dst <= regs xor IVs(in dst) */
+ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r15;
+ FRAME_END
+ ret;
+ENDPROC(cast6_xts_dec_8way)
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
new file mode 100644
index 000000000..9fb66b5e9
--- /dev/null
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -0,0 +1,320 @@
+/*
+ * Glue Code for the AVX assembler implemention of the Cast6 Cipher
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/cast6.h>
+#include <crypto/internal/simd.h>
+#include <crypto/xts.h>
+#include <asm/crypto/glue_helper.h>
+
+#define CAST6_PARALLEL_BLOCKS 8
+
+asmlinkage void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void cast6_ecb_dec_8way(struct cast6_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+
+asmlinkage void cast6_xts_enc_8way(struct cast6_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+asmlinkage void cast6_xts_dec_8way(struct cast6_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+
+static int cast6_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return cast6_setkey(&tfm->base, key, keylen);
+}
+
+static void cast6_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+ GLUE_FUNC_CAST(__cast6_encrypt));
+}
+
+static void cast6_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+ GLUE_FUNC_CAST(__cast6_decrypt));
+}
+
+static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ be128 ctrblk;
+
+ le128_to_be128(&ctrblk, iv);
+ le128_inc(iv);
+
+ __cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+ u128_xor(dst, src, (u128 *)&ctrblk);
+}
+
+static const struct common_glue_ctx cast6_enc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAST6_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_enc_8way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) }
+ } }
+};
+
+static const struct common_glue_ctx cast6_ctr = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAST6_PARALLEL_BLOCKS,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_ctr_8way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) }
+ } }
+};
+
+static const struct common_glue_ctx cast6_enc_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAST6_PARALLEL_BLOCKS,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc_8way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc) }
+ } }
+};
+
+static const struct common_glue_ctx cast6_dec = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAST6_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_dec_8way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) }
+ } }
+};
+
+static const struct common_glue_ctx cast6_dec_cbc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAST6_PARALLEL_BLOCKS,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_cbc_dec_8way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) }
+ } }
+};
+
+static const struct common_glue_ctx cast6_dec_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = CAST6_PARALLEL_BLOCKS,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec_8way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec) }
+ } }
+};
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&cast6_enc, req);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&cast6_dec, req);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__cast6_encrypt),
+ req);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return glue_cbc_decrypt_req_128bit(&cast6_dec_cbc, req);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return glue_ctr_req_128bit(&cast6_ctr, req);
+}
+
+struct cast6_xts_ctx {
+ struct cast6_ctx tweak_ctx;
+ struct cast6_ctx crypt_ctx;
+};
+
+static int xts_cast6_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ u32 *flags = &tfm->base.crt_flags;
+ int err;
+
+ err = xts_verify_key(tfm, key, keylen);
+ if (err)
+ return err;
+
+ /* first half of xts-key is for crypt */
+ err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
+ if (err)
+ return err;
+
+ /* second half of xts-key is for tweak */
+ return __cast6_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
+ flags);
+}
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&cast6_enc_xts, req,
+ XTS_TWEAK_CAST(__cast6_encrypt),
+ &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&cast6_dec_xts, req,
+ XTS_TWEAK_CAST(__cast6_encrypt),
+ &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static struct skcipher_alg cast6_algs[] = {
+ {
+ .base.cra_name = "__ecb(cast6)",
+ .base.cra_driver_name = "__ecb-cast6-avx",
+ .base.cra_priority = 200,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAST6_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct cast6_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAST6_MIN_KEY_SIZE,
+ .max_keysize = CAST6_MAX_KEY_SIZE,
+ .setkey = cast6_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "__cbc(cast6)",
+ .base.cra_driver_name = "__cbc-cast6-avx",
+ .base.cra_priority = 200,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAST6_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct cast6_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAST6_MIN_KEY_SIZE,
+ .max_keysize = CAST6_MAX_KEY_SIZE,
+ .ivsize = CAST6_BLOCK_SIZE,
+ .setkey = cast6_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "__ctr(cast6)",
+ .base.cra_driver_name = "__ctr-cast6-avx",
+ .base.cra_priority = 200,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct cast6_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = CAST6_MIN_KEY_SIZE,
+ .max_keysize = CAST6_MAX_KEY_SIZE,
+ .ivsize = CAST6_BLOCK_SIZE,
+ .chunksize = CAST6_BLOCK_SIZE,
+ .setkey = cast6_setkey_skcipher,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }, {
+ .base.cra_name = "__xts(cast6)",
+ .base.cra_driver_name = "__xts-cast6-avx",
+ .base.cra_priority = 200,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = CAST6_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct cast6_xts_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = 2 * CAST6_MIN_KEY_SIZE,
+ .max_keysize = 2 * CAST6_MAX_KEY_SIZE,
+ .ivsize = CAST6_BLOCK_SIZE,
+ .setkey = xts_cast6_setkey,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
+ },
+};
+
+static struct simd_skcipher_alg *cast6_simd_algs[ARRAY_SIZE(cast6_algs)];
+
+static int __init cast6_init(void)
+{
+ const char *feature_name;
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(cast6_algs,
+ ARRAY_SIZE(cast6_algs),
+ cast6_simd_algs);
+}
+
+static void __exit cast6_exit(void)
+{
+ simd_unregister_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs),
+ cast6_simd_algs);
+}
+
+module_init(cast6_init);
+module_exit(cast6_exit);
+
+MODULE_DESCRIPTION("Cast6 Cipher Algorithm, AVX optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("cast6");
diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S
new file mode 100644
index 000000000..f3cd26f48
--- /dev/null
+++ b/arch/x86/crypto/chacha20-avx2-x86_64.S
@@ -0,0 +1,448 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.ROT8, "aM", @progbits, 32
+.align 32
+ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
+ .octa 0x0e0d0c0f0a09080b0605040702010003
+
+.section .rodata.cst32.ROT16, "aM", @progbits, 32
+.align 32
+ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
+ .octa 0x0d0c0f0e09080b0a0504070601000302
+
+.section .rodata.cst32.CTRINC, "aM", @progbits, 32
+.align 32
+CTRINC: .octa 0x00000003000000020000000100000000
+ .octa 0x00000007000000060000000500000004
+
+.text
+
+ENTRY(chacha20_8block_xor_avx2)
+ # %rdi: Input state matrix, s
+ # %rsi: 8 data blocks output, o
+ # %rdx: 8 data blocks input, i
+
+ # This function encrypts eight consecutive ChaCha20 blocks by loading
+ # the state matrix in AVX registers eight times. As we need some
+ # scratch registers, we save the first four registers on the stack. The
+ # algorithm performs each operation on the corresponding word of each
+ # state matrix, hence requires no word shuffling. For final XORing step
+ # we transpose the matrix by interleaving 32-, 64- and then 128-bit
+ # words, which allows us to do XOR in AVX registers. 8/16-bit word
+ # rotation is done with the slightly better performing byte shuffling,
+ # 7/12-bit word rotation uses traditional shift+OR.
+
+ vzeroupper
+ # 4 * 32 byte stack, 32-byte aligned
+ lea 8(%rsp),%r10
+ and $~31, %rsp
+ sub $0x80, %rsp
+
+ # x0..15[0-7] = s[0..15]
+ vpbroadcastd 0x00(%rdi),%ymm0
+ vpbroadcastd 0x04(%rdi),%ymm1
+ vpbroadcastd 0x08(%rdi),%ymm2
+ vpbroadcastd 0x0c(%rdi),%ymm3
+ vpbroadcastd 0x10(%rdi),%ymm4
+ vpbroadcastd 0x14(%rdi),%ymm5
+ vpbroadcastd 0x18(%rdi),%ymm6
+ vpbroadcastd 0x1c(%rdi),%ymm7
+ vpbroadcastd 0x20(%rdi),%ymm8
+ vpbroadcastd 0x24(%rdi),%ymm9
+ vpbroadcastd 0x28(%rdi),%ymm10
+ vpbroadcastd 0x2c(%rdi),%ymm11
+ vpbroadcastd 0x30(%rdi),%ymm12
+ vpbroadcastd 0x34(%rdi),%ymm13
+ vpbroadcastd 0x38(%rdi),%ymm14
+ vpbroadcastd 0x3c(%rdi),%ymm15
+ # x0..3 on stack
+ vmovdqa %ymm0,0x00(%rsp)
+ vmovdqa %ymm1,0x20(%rsp)
+ vmovdqa %ymm2,0x40(%rsp)
+ vmovdqa %ymm3,0x60(%rsp)
+
+ vmovdqa CTRINC(%rip),%ymm1
+ vmovdqa ROT8(%rip),%ymm2
+ vmovdqa ROT16(%rip),%ymm3
+
+ # x12 += counter values 0-3
+ vpaddd %ymm1,%ymm12,%ymm12
+
+ mov $10,%ecx
+
+.Ldoubleround8:
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ vpaddd 0x00(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm3,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ vpaddd 0x20(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm3,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ vpaddd 0x40(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm3,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ vpaddd 0x60(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm3,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $12,%ymm4,%ymm0
+ vpsrld $20,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $12,%ymm5,%ymm0
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $12,%ymm6,%ymm0
+ vpsrld $20,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm11,%ymm7,%ymm7
+ vpslld $12,%ymm7,%ymm0
+ vpsrld $20,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ vpaddd 0x00(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm2,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ vpaddd 0x20(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm2,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ vpaddd 0x40(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm2,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ vpaddd 0x60(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm2,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm0
+ vpsrld $25,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm0
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm0
+ vpsrld $25,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm11,%ymm7,%ymm7
+ vpslld $7,%ymm7,%ymm0
+ vpsrld $25,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ vpaddd 0x00(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm3,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0
+ vpaddd 0x20(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm3,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ vpaddd 0x40(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm3,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ vpaddd 0x60(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm3,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxor %ymm10,%ymm5,%ymm5
+ vpslld $12,%ymm5,%ymm0
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxor %ymm11,%ymm6,%ymm6
+ vpslld $12,%ymm6,%ymm0
+ vpsrld $20,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxor %ymm8,%ymm7,%ymm7
+ vpslld $12,%ymm7,%ymm0
+ vpsrld $20,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm9,%ymm4,%ymm4
+ vpslld $12,%ymm4,%ymm0
+ vpsrld $20,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ vpaddd 0x00(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm2,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ vpaddd 0x20(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm2,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ vpaddd 0x40(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm2,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ vpaddd 0x60(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm2,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxor %ymm10,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm0
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxor %ymm11,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm0
+ vpsrld $25,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxor %ymm8,%ymm7,%ymm7
+ vpslld $7,%ymm7,%ymm0
+ vpsrld $25,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm9,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm0
+ vpsrld $25,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+
+ dec %ecx
+ jnz .Ldoubleround8
+
+ # x0..15[0-3] += s[0..15]
+ vpbroadcastd 0x00(%rdi),%ymm0
+ vpaddd 0x00(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpbroadcastd 0x04(%rdi),%ymm0
+ vpaddd 0x20(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpbroadcastd 0x08(%rdi),%ymm0
+ vpaddd 0x40(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpbroadcastd 0x0c(%rdi),%ymm0
+ vpaddd 0x60(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpbroadcastd 0x10(%rdi),%ymm0
+ vpaddd %ymm0,%ymm4,%ymm4
+ vpbroadcastd 0x14(%rdi),%ymm0
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpbroadcastd 0x18(%rdi),%ymm0
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpbroadcastd 0x1c(%rdi),%ymm0
+ vpaddd %ymm0,%ymm7,%ymm7
+ vpbroadcastd 0x20(%rdi),%ymm0
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpbroadcastd 0x24(%rdi),%ymm0
+ vpaddd %ymm0,%ymm9,%ymm9
+ vpbroadcastd 0x28(%rdi),%ymm0
+ vpaddd %ymm0,%ymm10,%ymm10
+ vpbroadcastd 0x2c(%rdi),%ymm0
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpbroadcastd 0x30(%rdi),%ymm0
+ vpaddd %ymm0,%ymm12,%ymm12
+ vpbroadcastd 0x34(%rdi),%ymm0
+ vpaddd %ymm0,%ymm13,%ymm13
+ vpbroadcastd 0x38(%rdi),%ymm0
+ vpaddd %ymm0,%ymm14,%ymm14
+ vpbroadcastd 0x3c(%rdi),%ymm0
+ vpaddd %ymm0,%ymm15,%ymm15
+
+ # x12 += counter values 0-3
+ vpaddd %ymm1,%ymm12,%ymm12
+
+ # interleave 32-bit words in state n, n+1
+ vmovdqa 0x00(%rsp),%ymm0
+ vmovdqa 0x20(%rsp),%ymm1
+ vpunpckldq %ymm1,%ymm0,%ymm2
+ vpunpckhdq %ymm1,%ymm0,%ymm1
+ vmovdqa %ymm2,0x00(%rsp)
+ vmovdqa %ymm1,0x20(%rsp)
+ vmovdqa 0x40(%rsp),%ymm0
+ vmovdqa 0x60(%rsp),%ymm1
+ vpunpckldq %ymm1,%ymm0,%ymm2
+ vpunpckhdq %ymm1,%ymm0,%ymm1
+ vmovdqa %ymm2,0x40(%rsp)
+ vmovdqa %ymm1,0x60(%rsp)
+ vmovdqa %ymm4,%ymm0
+ vpunpckldq %ymm5,%ymm0,%ymm4
+ vpunpckhdq %ymm5,%ymm0,%ymm5
+ vmovdqa %ymm6,%ymm0
+ vpunpckldq %ymm7,%ymm0,%ymm6
+ vpunpckhdq %ymm7,%ymm0,%ymm7
+ vmovdqa %ymm8,%ymm0
+ vpunpckldq %ymm9,%ymm0,%ymm8
+ vpunpckhdq %ymm9,%ymm0,%ymm9
+ vmovdqa %ymm10,%ymm0
+ vpunpckldq %ymm11,%ymm0,%ymm10
+ vpunpckhdq %ymm11,%ymm0,%ymm11
+ vmovdqa %ymm12,%ymm0
+ vpunpckldq %ymm13,%ymm0,%ymm12
+ vpunpckhdq %ymm13,%ymm0,%ymm13
+ vmovdqa %ymm14,%ymm0
+ vpunpckldq %ymm15,%ymm0,%ymm14
+ vpunpckhdq %ymm15,%ymm0,%ymm15
+
+ # interleave 64-bit words in state n, n+2
+ vmovdqa 0x00(%rsp),%ymm0
+ vmovdqa 0x40(%rsp),%ymm2
+ vpunpcklqdq %ymm2,%ymm0,%ymm1
+ vpunpckhqdq %ymm2,%ymm0,%ymm2
+ vmovdqa %ymm1,0x00(%rsp)
+ vmovdqa %ymm2,0x40(%rsp)
+ vmovdqa 0x20(%rsp),%ymm0
+ vmovdqa 0x60(%rsp),%ymm2
+ vpunpcklqdq %ymm2,%ymm0,%ymm1
+ vpunpckhqdq %ymm2,%ymm0,%ymm2
+ vmovdqa %ymm1,0x20(%rsp)
+ vmovdqa %ymm2,0x60(%rsp)
+ vmovdqa %ymm4,%ymm0
+ vpunpcklqdq %ymm6,%ymm0,%ymm4
+ vpunpckhqdq %ymm6,%ymm0,%ymm6
+ vmovdqa %ymm5,%ymm0
+ vpunpcklqdq %ymm7,%ymm0,%ymm5
+ vpunpckhqdq %ymm7,%ymm0,%ymm7
+ vmovdqa %ymm8,%ymm0
+ vpunpcklqdq %ymm10,%ymm0,%ymm8
+ vpunpckhqdq %ymm10,%ymm0,%ymm10
+ vmovdqa %ymm9,%ymm0
+ vpunpcklqdq %ymm11,%ymm0,%ymm9
+ vpunpckhqdq %ymm11,%ymm0,%ymm11
+ vmovdqa %ymm12,%ymm0
+ vpunpcklqdq %ymm14,%ymm0,%ymm12
+ vpunpckhqdq %ymm14,%ymm0,%ymm14
+ vmovdqa %ymm13,%ymm0
+ vpunpcklqdq %ymm15,%ymm0,%ymm13
+ vpunpckhqdq %ymm15,%ymm0,%ymm15
+
+ # interleave 128-bit words in state n, n+4
+ vmovdqa 0x00(%rsp),%ymm0
+ vperm2i128 $0x20,%ymm4,%ymm0,%ymm1
+ vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
+ vmovdqa %ymm1,0x00(%rsp)
+ vmovdqa 0x20(%rsp),%ymm0
+ vperm2i128 $0x20,%ymm5,%ymm0,%ymm1
+ vperm2i128 $0x31,%ymm5,%ymm0,%ymm5
+ vmovdqa %ymm1,0x20(%rsp)
+ vmovdqa 0x40(%rsp),%ymm0
+ vperm2i128 $0x20,%ymm6,%ymm0,%ymm1
+ vperm2i128 $0x31,%ymm6,%ymm0,%ymm6
+ vmovdqa %ymm1,0x40(%rsp)
+ vmovdqa 0x60(%rsp),%ymm0
+ vperm2i128 $0x20,%ymm7,%ymm0,%ymm1
+ vperm2i128 $0x31,%ymm7,%ymm0,%ymm7
+ vmovdqa %ymm1,0x60(%rsp)
+ vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
+ vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
+ vmovdqa %ymm0,%ymm8
+ vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
+ vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
+ vmovdqa %ymm0,%ymm9
+ vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
+ vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
+ vmovdqa %ymm0,%ymm10
+ vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
+ vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
+ vmovdqa %ymm0,%ymm11
+
+ # xor with corresponding input, write to output
+ vmovdqa 0x00(%rsp),%ymm0
+ vpxor 0x0000(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0000(%rsi)
+ vmovdqa 0x20(%rsp),%ymm0
+ vpxor 0x0080(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0080(%rsi)
+ vmovdqa 0x40(%rsp),%ymm0
+ vpxor 0x0040(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0040(%rsi)
+ vmovdqa 0x60(%rsp),%ymm0
+ vpxor 0x00c0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x00c0(%rsi)
+ vpxor 0x0100(%rdx),%ymm4,%ymm4
+ vmovdqu %ymm4,0x0100(%rsi)
+ vpxor 0x0180(%rdx),%ymm5,%ymm5
+ vmovdqu %ymm5,0x00180(%rsi)
+ vpxor 0x0140(%rdx),%ymm6,%ymm6
+ vmovdqu %ymm6,0x0140(%rsi)
+ vpxor 0x01c0(%rdx),%ymm7,%ymm7
+ vmovdqu %ymm7,0x01c0(%rsi)
+ vpxor 0x0020(%rdx),%ymm8,%ymm8
+ vmovdqu %ymm8,0x0020(%rsi)
+ vpxor 0x00a0(%rdx),%ymm9,%ymm9
+ vmovdqu %ymm9,0x00a0(%rsi)
+ vpxor 0x0060(%rdx),%ymm10,%ymm10
+ vmovdqu %ymm10,0x0060(%rsi)
+ vpxor 0x00e0(%rdx),%ymm11,%ymm11
+ vmovdqu %ymm11,0x00e0(%rsi)
+ vpxor 0x0120(%rdx),%ymm12,%ymm12
+ vmovdqu %ymm12,0x0120(%rsi)
+ vpxor 0x01a0(%rdx),%ymm13,%ymm13
+ vmovdqu %ymm13,0x01a0(%rsi)
+ vpxor 0x0160(%rdx),%ymm14,%ymm14
+ vmovdqu %ymm14,0x0160(%rsi)
+ vpxor 0x01e0(%rdx),%ymm15,%ymm15
+ vmovdqu %ymm15,0x01e0(%rsi)
+
+ vzeroupper
+ lea -8(%r10),%rsp
+ ret
+ENDPROC(chacha20_8block_xor_avx2)
diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S
new file mode 100644
index 000000000..512a2b500
--- /dev/null
+++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S
@@ -0,0 +1,630 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst16.ROT8, "aM", @progbits, 16
+.align 16
+ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
+.section .rodata.cst16.ROT16, "aM", @progbits, 16
+.align 16
+ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
+.section .rodata.cst16.CTRINC, "aM", @progbits, 16
+.align 16
+CTRINC: .octa 0x00000003000000020000000100000000
+
+.text
+
+ENTRY(chacha20_block_xor_ssse3)
+ # %rdi: Input state matrix, s
+ # %rsi: 1 data block output, o
+ # %rdx: 1 data block input, i
+
+ # This function encrypts one ChaCha20 block by loading the state matrix
+ # in four SSE registers. It performs matrix operation on four words in
+ # parallel, but requireds shuffling to rearrange the words after each
+ # round. 8/16-bit word rotation is done with the slightly better
+ # performing SSSE3 byte shuffling, 7/12-bit word rotation uses
+ # traditional shift+OR.
+
+ # x0..3 = s0..3
+ movdqa 0x00(%rdi),%xmm0
+ movdqa 0x10(%rdi),%xmm1
+ movdqa 0x20(%rdi),%xmm2
+ movdqa 0x30(%rdi),%xmm3
+ movdqa %xmm0,%xmm8
+ movdqa %xmm1,%xmm9
+ movdqa %xmm2,%xmm10
+ movdqa %xmm3,%xmm11
+
+ movdqa ROT8(%rip),%xmm4
+ movdqa ROT16(%rip),%xmm5
+
+ mov $10,%ecx
+
+.Ldoubleround:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm5,%xmm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm6
+ pslld $12,%xmm6
+ psrld $20,%xmm1
+ por %xmm6,%xmm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm4,%xmm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm7
+ pslld $7,%xmm7
+ psrld $25,%xmm1
+ por %xmm7,%xmm1
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ pshufd $0x39,%xmm1,%xmm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ pshufd $0x4e,%xmm2,%xmm2
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ pshufd $0x93,%xmm3,%xmm3
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm5,%xmm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm6
+ pslld $12,%xmm6
+ psrld $20,%xmm1
+ por %xmm6,%xmm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm4,%xmm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm7
+ pslld $7,%xmm7
+ psrld $25,%xmm1
+ por %xmm7,%xmm1
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ pshufd $0x93,%xmm1,%xmm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ pshufd $0x4e,%xmm2,%xmm2
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ pshufd $0x39,%xmm3,%xmm3
+
+ dec %ecx
+ jnz .Ldoubleround
+
+ # o0 = i0 ^ (x0 + s0)
+ movdqu 0x00(%rdx),%xmm4
+ paddd %xmm8,%xmm0
+ pxor %xmm4,%xmm0
+ movdqu %xmm0,0x00(%rsi)
+ # o1 = i1 ^ (x1 + s1)
+ movdqu 0x10(%rdx),%xmm5
+ paddd %xmm9,%xmm1
+ pxor %xmm5,%xmm1
+ movdqu %xmm1,0x10(%rsi)
+ # o2 = i2 ^ (x2 + s2)
+ movdqu 0x20(%rdx),%xmm6
+ paddd %xmm10,%xmm2
+ pxor %xmm6,%xmm2
+ movdqu %xmm2,0x20(%rsi)
+ # o3 = i3 ^ (x3 + s3)
+ movdqu 0x30(%rdx),%xmm7
+ paddd %xmm11,%xmm3
+ pxor %xmm7,%xmm3
+ movdqu %xmm3,0x30(%rsi)
+
+ ret
+ENDPROC(chacha20_block_xor_ssse3)
+
+ENTRY(chacha20_4block_xor_ssse3)
+ # %rdi: Input state matrix, s
+ # %rsi: 4 data blocks output, o
+ # %rdx: 4 data blocks input, i
+
+ # This function encrypts four consecutive ChaCha20 blocks by loading the
+ # the state matrix in SSE registers four times. As we need some scratch
+ # registers, we save the first four registers on the stack. The
+ # algorithm performs each operation on the corresponding word of each
+ # state matrix, hence requires no word shuffling. For final XORing step
+ # we transpose the matrix by interleaving 32- and then 64-bit words,
+ # which allows us to do XOR in SSE registers. 8/16-bit word rotation is
+ # done with the slightly better performing SSSE3 byte shuffling,
+ # 7/12-bit word rotation uses traditional shift+OR.
+
+ lea 8(%rsp),%r10
+ sub $0x80,%rsp
+ and $~63,%rsp
+
+ # x0..15[0-3] = s0..3[0..3]
+ movq 0x00(%rdi),%xmm1
+ pshufd $0x00,%xmm1,%xmm0
+ pshufd $0x55,%xmm1,%xmm1
+ movq 0x08(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ movq 0x10(%rdi),%xmm5
+ pshufd $0x00,%xmm5,%xmm4
+ pshufd $0x55,%xmm5,%xmm5
+ movq 0x18(%rdi),%xmm7
+ pshufd $0x00,%xmm7,%xmm6
+ pshufd $0x55,%xmm7,%xmm7
+ movq 0x20(%rdi),%xmm9
+ pshufd $0x00,%xmm9,%xmm8
+ pshufd $0x55,%xmm9,%xmm9
+ movq 0x28(%rdi),%xmm11
+ pshufd $0x00,%xmm11,%xmm10
+ pshufd $0x55,%xmm11,%xmm11
+ movq 0x30(%rdi),%xmm13
+ pshufd $0x00,%xmm13,%xmm12
+ pshufd $0x55,%xmm13,%xmm13
+ movq 0x38(%rdi),%xmm15
+ pshufd $0x00,%xmm15,%xmm14
+ pshufd $0x55,%xmm15,%xmm15
+ # x0..3 on stack
+ movdqa %xmm0,0x00(%rsp)
+ movdqa %xmm1,0x10(%rsp)
+ movdqa %xmm2,0x20(%rsp)
+ movdqa %xmm3,0x30(%rsp)
+
+ movdqa CTRINC(%rip),%xmm1
+ movdqa ROT8(%rip),%xmm2
+ movdqa ROT16(%rip),%xmm3
+
+ # x12 += counter values 0-3
+ paddd %xmm1,%xmm12
+
+ mov $10,%ecx
+
+.Ldoubleround4:
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ movdqa 0x00(%rsp),%xmm0
+ paddd %xmm4,%xmm0
+ movdqa %xmm0,0x00(%rsp)
+ pxor %xmm0,%xmm12
+ pshufb %xmm3,%xmm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ movdqa 0x10(%rsp),%xmm0
+ paddd %xmm5,%xmm0
+ movdqa %xmm0,0x10(%rsp)
+ pxor %xmm0,%xmm13
+ pshufb %xmm3,%xmm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ movdqa 0x20(%rsp),%xmm0
+ paddd %xmm6,%xmm0
+ movdqa %xmm0,0x20(%rsp)
+ pxor %xmm0,%xmm14
+ pshufb %xmm3,%xmm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ movdqa 0x30(%rsp),%xmm0
+ paddd %xmm7,%xmm0
+ movdqa %xmm0,0x30(%rsp)
+ pxor %xmm0,%xmm15
+ pshufb %xmm3,%xmm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm4
+ por %xmm0,%xmm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm5
+ por %xmm0,%xmm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm6
+ por %xmm0,%xmm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm7
+ por %xmm0,%xmm7
+
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ movdqa 0x00(%rsp),%xmm0
+ paddd %xmm4,%xmm0
+ movdqa %xmm0,0x00(%rsp)
+ pxor %xmm0,%xmm12
+ pshufb %xmm2,%xmm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ movdqa 0x10(%rsp),%xmm0
+ paddd %xmm5,%xmm0
+ movdqa %xmm0,0x10(%rsp)
+ pxor %xmm0,%xmm13
+ pshufb %xmm2,%xmm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ movdqa 0x20(%rsp),%xmm0
+ paddd %xmm6,%xmm0
+ movdqa %xmm0,0x20(%rsp)
+ pxor %xmm0,%xmm14
+ pshufb %xmm2,%xmm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ movdqa 0x30(%rsp),%xmm0
+ paddd %xmm7,%xmm0
+ movdqa %xmm0,0x30(%rsp)
+ pxor %xmm0,%xmm15
+ pshufb %xmm2,%xmm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm4
+ por %xmm0,%xmm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm5
+ por %xmm0,%xmm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm6
+ por %xmm0,%xmm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm7
+ por %xmm0,%xmm7
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ movdqa 0x00(%rsp),%xmm0
+ paddd %xmm5,%xmm0
+ movdqa %xmm0,0x00(%rsp)
+ pxor %xmm0,%xmm15
+ pshufb %xmm3,%xmm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ movdqa 0x10(%rsp),%xmm0
+ paddd %xmm6,%xmm0
+ movdqa %xmm0,0x10(%rsp)
+ pxor %xmm0,%xmm12
+ pshufb %xmm3,%xmm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ movdqa 0x20(%rsp),%xmm0
+ paddd %xmm7,%xmm0
+ movdqa %xmm0,0x20(%rsp)
+ pxor %xmm0,%xmm13
+ pshufb %xmm3,%xmm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ movdqa 0x30(%rsp),%xmm0
+ paddd %xmm4,%xmm0
+ movdqa %xmm0,0x30(%rsp)
+ pxor %xmm0,%xmm14
+ pshufb %xmm3,%xmm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ paddd %xmm15,%xmm10
+ pxor %xmm10,%xmm5
+ movdqa %xmm5,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm5
+ por %xmm0,%xmm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ paddd %xmm12,%xmm11
+ pxor %xmm11,%xmm6
+ movdqa %xmm6,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm6
+ por %xmm0,%xmm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ paddd %xmm13,%xmm8
+ pxor %xmm8,%xmm7
+ movdqa %xmm7,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm7
+ por %xmm0,%xmm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ paddd %xmm14,%xmm9
+ pxor %xmm9,%xmm4
+ movdqa %xmm4,%xmm0
+ pslld $12,%xmm0
+ psrld $20,%xmm4
+ por %xmm0,%xmm4
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ movdqa 0x00(%rsp),%xmm0
+ paddd %xmm5,%xmm0
+ movdqa %xmm0,0x00(%rsp)
+ pxor %xmm0,%xmm15
+ pshufb %xmm2,%xmm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ movdqa 0x10(%rsp),%xmm0
+ paddd %xmm6,%xmm0
+ movdqa %xmm0,0x10(%rsp)
+ pxor %xmm0,%xmm12
+ pshufb %xmm2,%xmm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ movdqa 0x20(%rsp),%xmm0
+ paddd %xmm7,%xmm0
+ movdqa %xmm0,0x20(%rsp)
+ pxor %xmm0,%xmm13
+ pshufb %xmm2,%xmm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ movdqa 0x30(%rsp),%xmm0
+ paddd %xmm4,%xmm0
+ movdqa %xmm0,0x30(%rsp)
+ pxor %xmm0,%xmm14
+ pshufb %xmm2,%xmm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ paddd %xmm15,%xmm10
+ pxor %xmm10,%xmm5
+ movdqa %xmm5,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm5
+ por %xmm0,%xmm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ paddd %xmm12,%xmm11
+ pxor %xmm11,%xmm6
+ movdqa %xmm6,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm6
+ por %xmm0,%xmm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ paddd %xmm13,%xmm8
+ pxor %xmm8,%xmm7
+ movdqa %xmm7,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm7
+ por %xmm0,%xmm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ paddd %xmm14,%xmm9
+ pxor %xmm9,%xmm4
+ movdqa %xmm4,%xmm0
+ pslld $7,%xmm0
+ psrld $25,%xmm4
+ por %xmm0,%xmm4
+
+ dec %ecx
+ jnz .Ldoubleround4
+
+ # x0[0-3] += s0[0]
+ # x1[0-3] += s0[1]
+ movq 0x00(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd 0x00(%rsp),%xmm2
+ movdqa %xmm2,0x00(%rsp)
+ paddd 0x10(%rsp),%xmm3
+ movdqa %xmm3,0x10(%rsp)
+ # x2[0-3] += s0[2]
+ # x3[0-3] += s0[3]
+ movq 0x08(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd 0x20(%rsp),%xmm2
+ movdqa %xmm2,0x20(%rsp)
+ paddd 0x30(%rsp),%xmm3
+ movdqa %xmm3,0x30(%rsp)
+
+ # x4[0-3] += s1[0]
+ # x5[0-3] += s1[1]
+ movq 0x10(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ # x6[0-3] += s1[2]
+ # x7[0-3] += s1[3]
+ movq 0x18(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm6
+ paddd %xmm3,%xmm7
+
+ # x8[0-3] += s2[0]
+ # x9[0-3] += s2[1]
+ movq 0x20(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm8
+ paddd %xmm3,%xmm9
+ # x10[0-3] += s2[2]
+ # x11[0-3] += s2[3]
+ movq 0x28(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm10
+ paddd %xmm3,%xmm11
+
+ # x12[0-3] += s3[0]
+ # x13[0-3] += s3[1]
+ movq 0x30(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm12
+ paddd %xmm3,%xmm13
+ # x14[0-3] += s3[2]
+ # x15[0-3] += s3[3]
+ movq 0x38(%rdi),%xmm3
+ pshufd $0x00,%xmm3,%xmm2
+ pshufd $0x55,%xmm3,%xmm3
+ paddd %xmm2,%xmm14
+ paddd %xmm3,%xmm15
+
+ # x12 += counter values 0-3
+ paddd %xmm1,%xmm12
+
+ # interleave 32-bit words in state n, n+1
+ movdqa 0x00(%rsp),%xmm0
+ movdqa 0x10(%rsp),%xmm1
+ movdqa %xmm0,%xmm2
+ punpckldq %xmm1,%xmm2
+ punpckhdq %xmm1,%xmm0
+ movdqa %xmm2,0x00(%rsp)
+ movdqa %xmm0,0x10(%rsp)
+ movdqa 0x20(%rsp),%xmm0
+ movdqa 0x30(%rsp),%xmm1
+ movdqa %xmm0,%xmm2
+ punpckldq %xmm1,%xmm2
+ punpckhdq %xmm1,%xmm0
+ movdqa %xmm2,0x20(%rsp)
+ movdqa %xmm0,0x30(%rsp)
+ movdqa %xmm4,%xmm0
+ punpckldq %xmm5,%xmm4
+ punpckhdq %xmm5,%xmm0
+ movdqa %xmm0,%xmm5
+ movdqa %xmm6,%xmm0
+ punpckldq %xmm7,%xmm6
+ punpckhdq %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ movdqa %xmm8,%xmm0
+ punpckldq %xmm9,%xmm8
+ punpckhdq %xmm9,%xmm0
+ movdqa %xmm0,%xmm9
+ movdqa %xmm10,%xmm0
+ punpckldq %xmm11,%xmm10
+ punpckhdq %xmm11,%xmm0
+ movdqa %xmm0,%xmm11
+ movdqa %xmm12,%xmm0
+ punpckldq %xmm13,%xmm12
+ punpckhdq %xmm13,%xmm0
+ movdqa %xmm0,%xmm13
+ movdqa %xmm14,%xmm0
+ punpckldq %xmm15,%xmm14
+ punpckhdq %xmm15,%xmm0
+ movdqa %xmm0,%xmm15
+
+ # interleave 64-bit words in state n, n+2
+ movdqa 0x00(%rsp),%xmm0
+ movdqa 0x20(%rsp),%xmm1
+ movdqa %xmm0,%xmm2
+ punpcklqdq %xmm1,%xmm2
+ punpckhqdq %xmm1,%xmm0
+ movdqa %xmm2,0x00(%rsp)
+ movdqa %xmm0,0x20(%rsp)
+ movdqa 0x10(%rsp),%xmm0
+ movdqa 0x30(%rsp),%xmm1
+ movdqa %xmm0,%xmm2
+ punpcklqdq %xmm1,%xmm2
+ punpckhqdq %xmm1,%xmm0
+ movdqa %xmm2,0x10(%rsp)
+ movdqa %xmm0,0x30(%rsp)
+ movdqa %xmm4,%xmm0
+ punpcklqdq %xmm6,%xmm4
+ punpckhqdq %xmm6,%xmm0
+ movdqa %xmm0,%xmm6
+ movdqa %xmm5,%xmm0
+ punpcklqdq %xmm7,%xmm5
+ punpckhqdq %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ movdqa %xmm8,%xmm0
+ punpcklqdq %xmm10,%xmm8
+ punpckhqdq %xmm10,%xmm0
+ movdqa %xmm0,%xmm10
+ movdqa %xmm9,%xmm0
+ punpcklqdq %xmm11,%xmm9
+ punpckhqdq %xmm11,%xmm0
+ movdqa %xmm0,%xmm11
+ movdqa %xmm12,%xmm0
+ punpcklqdq %xmm14,%xmm12
+ punpckhqdq %xmm14,%xmm0
+ movdqa %xmm0,%xmm14
+ movdqa %xmm13,%xmm0
+ punpcklqdq %xmm15,%xmm13
+ punpckhqdq %xmm15,%xmm0
+ movdqa %xmm0,%xmm15
+
+ # xor with corresponding input, write to output
+ movdqa 0x00(%rsp),%xmm0
+ movdqu 0x00(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x00(%rsi)
+ movdqa 0x10(%rsp),%xmm0
+ movdqu 0x80(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x80(%rsi)
+ movdqa 0x20(%rsp),%xmm0
+ movdqu 0x40(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x40(%rsi)
+ movdqa 0x30(%rsp),%xmm0
+ movdqu 0xc0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xc0(%rsi)
+ movdqu 0x10(%rdx),%xmm1
+ pxor %xmm1,%xmm4
+ movdqu %xmm4,0x10(%rsi)
+ movdqu 0x90(%rdx),%xmm1
+ pxor %xmm1,%xmm5
+ movdqu %xmm5,0x90(%rsi)
+ movdqu 0x50(%rdx),%xmm1
+ pxor %xmm1,%xmm6
+ movdqu %xmm6,0x50(%rsi)
+ movdqu 0xd0(%rdx),%xmm1
+ pxor %xmm1,%xmm7
+ movdqu %xmm7,0xd0(%rsi)
+ movdqu 0x20(%rdx),%xmm1
+ pxor %xmm1,%xmm8
+ movdqu %xmm8,0x20(%rsi)
+ movdqu 0xa0(%rdx),%xmm1
+ pxor %xmm1,%xmm9
+ movdqu %xmm9,0xa0(%rsi)
+ movdqu 0x60(%rdx),%xmm1
+ pxor %xmm1,%xmm10
+ movdqu %xmm10,0x60(%rsi)
+ movdqu 0xe0(%rdx),%xmm1
+ pxor %xmm1,%xmm11
+ movdqu %xmm11,0xe0(%rsi)
+ movdqu 0x30(%rdx),%xmm1
+ pxor %xmm1,%xmm12
+ movdqu %xmm12,0x30(%rsi)
+ movdqu 0xb0(%rdx),%xmm1
+ pxor %xmm1,%xmm13
+ movdqu %xmm13,0xb0(%rsi)
+ movdqu 0x70(%rdx),%xmm1
+ pxor %xmm1,%xmm14
+ movdqu %xmm14,0x70(%rsi)
+ movdqu 0xf0(%rdx),%xmm1
+ pxor %xmm1,%xmm15
+ movdqu %xmm15,0xf0(%rsi)
+
+ lea -8(%r10),%rsp
+ ret
+ENDPROC(chacha20_4block_xor_ssse3)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
new file mode 100644
index 000000000..dce7c5d39
--- /dev/null
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -0,0 +1,146 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha20.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/simd.h>
+
+#define CHACHA20_STATE_ALIGN 16
+
+asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
+#ifdef CONFIG_AS_AVX2
+asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
+static bool chacha20_use_avx2;
+#endif
+
+static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes)
+{
+ u8 buf[CHACHA20_BLOCK_SIZE];
+
+#ifdef CONFIG_AS_AVX2
+ if (chacha20_use_avx2) {
+ while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
+ chacha20_8block_xor_avx2(state, dst, src);
+ bytes -= CHACHA20_BLOCK_SIZE * 8;
+ src += CHACHA20_BLOCK_SIZE * 8;
+ dst += CHACHA20_BLOCK_SIZE * 8;
+ state[12] += 8;
+ }
+ }
+#endif
+ while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
+ chacha20_4block_xor_ssse3(state, dst, src);
+ bytes -= CHACHA20_BLOCK_SIZE * 4;
+ src += CHACHA20_BLOCK_SIZE * 4;
+ dst += CHACHA20_BLOCK_SIZE * 4;
+ state[12] += 4;
+ }
+ while (bytes >= CHACHA20_BLOCK_SIZE) {
+ chacha20_block_xor_ssse3(state, dst, src);
+ bytes -= CHACHA20_BLOCK_SIZE;
+ src += CHACHA20_BLOCK_SIZE;
+ dst += CHACHA20_BLOCK_SIZE;
+ state[12]++;
+ }
+ if (bytes) {
+ memcpy(buf, src, bytes);
+ chacha20_block_xor_ssse3(state, buf, buf);
+ memcpy(dst, buf, bytes);
+ }
+}
+
+static int chacha20_simd(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
+ u32 *state, state_buf[16 + 2] __aligned(8);
+ struct skcipher_walk walk;
+ int err;
+
+ BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
+ state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
+
+ if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd())
+ return crypto_chacha20_crypt(req);
+
+ err = skcipher_walk_virt(&walk, req, true);
+
+ crypto_chacha20_init(state, ctx, walk.iv);
+
+ kernel_fpu_begin();
+
+ while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
+ chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
+ rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
+ err = skcipher_walk_done(&walk,
+ walk.nbytes % CHACHA20_BLOCK_SIZE);
+ }
+
+ if (walk.nbytes) {
+ chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
+ walk.nbytes);
+ err = skcipher_walk_done(&walk, 0);
+ }
+
+ kernel_fpu_end();
+
+ return err;
+}
+
+static struct skcipher_alg alg = {
+ .base.cra_name = "chacha20",
+ .base.cra_driver_name = "chacha20-simd",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha20_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA20_KEY_SIZE,
+ .max_keysize = CHACHA20_KEY_SIZE,
+ .ivsize = CHACHA20_IV_SIZE,
+ .chunksize = CHACHA20_BLOCK_SIZE,
+ .setkey = crypto_chacha20_setkey,
+ .encrypt = chacha20_simd,
+ .decrypt = chacha20_simd,
+};
+
+static int __init chacha20_simd_mod_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_SSSE3))
+ return -ENODEV;
+
+#ifdef CONFIG_AS_AVX2
+ chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+#endif
+ return crypto_register_skcipher(&alg);
+}
+
+static void __exit chacha20_simd_mod_fini(void)
+{
+ crypto_unregister_skcipher(&alg);
+}
+
+module_init(chacha20_simd_mod_init);
+module_exit(chacha20_simd_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-simd");
diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S
new file mode 100644
index 000000000..1c099dc08
--- /dev/null
+++ b/arch/x86/crypto/crc32-pclmul_asm.S
@@ -0,0 +1,241 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
+ * calculation.
+ * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
+ * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
+ * at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2B: Instruction Set Reference, N-Z
+ *
+ * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
+ * Alexander Boyko <Alexander_Boyko@xyratex.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+
+
+.section .rodata
+.align 16
+/*
+ * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
+ * #define CONSTANT_R1 0x154442bd4LL
+ *
+ * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
+ * #define CONSTANT_R2 0x1c6e41596LL
+ */
+.Lconstant_R2R1:
+ .octa 0x00000001c6e415960000000154442bd4
+/*
+ * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
+ * #define CONSTANT_R3 0x1751997d0LL
+ *
+ * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
+ * #define CONSTANT_R4 0x0ccaa009eLL
+ */
+.Lconstant_R4R3:
+ .octa 0x00000000ccaa009e00000001751997d0
+/*
+ * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
+ * #define CONSTANT_R5 0x163cd6124LL
+ */
+.Lconstant_R5:
+ .octa 0x00000000000000000000000163cd6124
+.Lconstant_mask32:
+ .octa 0x000000000000000000000000FFFFFFFF
+/*
+ * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
+ *
+ * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
+ * #define CONSTANT_RU 0x1F7011641LL
+ */
+.Lconstant_RUpoly:
+ .octa 0x00000001F701164100000001DB710641
+
+#define CONSTANT %xmm0
+
+#ifdef __x86_64__
+#define BUF %rdi
+#define LEN %rsi
+#define CRC %edx
+#else
+#define BUF %eax
+#define LEN %edx
+#define CRC %ecx
+#endif
+
+
+
+.text
+/**
+ * Calculate crc32
+ * BUF - buffer (16 bytes aligned)
+ * LEN - sizeof buffer (16 bytes aligned), LEN should be grater than 63
+ * CRC - initial crc32
+ * return %eax crc32
+ * uint crc32_pclmul_le_16(unsigned char const *buffer,
+ * size_t len, uint crc32)
+ */
+
+ENTRY(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
+ movdqa (BUF), %xmm1
+ movdqa 0x10(BUF), %xmm2
+ movdqa 0x20(BUF), %xmm3
+ movdqa 0x30(BUF), %xmm4
+ movd CRC, CONSTANT
+ pxor CONSTANT, %xmm1
+ sub $0x40, LEN
+ add $0x40, BUF
+ cmp $0x40, LEN
+ jb less_64
+
+#ifdef __x86_64__
+ movdqa .Lconstant_R2R1(%rip), CONSTANT
+#else
+ movdqa .Lconstant_R2R1, CONSTANT
+#endif
+
+loop_64:/* 64 bytes Full cache line folding */
+ prefetchnta 0x40(BUF)
+ movdqa %xmm1, %xmm5
+ movdqa %xmm2, %xmm6
+ movdqa %xmm3, %xmm7
+#ifdef __x86_64__
+ movdqa %xmm4, %xmm8
+#endif
+ PCLMULQDQ 00, CONSTANT, %xmm1
+ PCLMULQDQ 00, CONSTANT, %xmm2
+ PCLMULQDQ 00, CONSTANT, %xmm3
+#ifdef __x86_64__
+ PCLMULQDQ 00, CONSTANT, %xmm4
+#endif
+ PCLMULQDQ 0x11, CONSTANT, %xmm5
+ PCLMULQDQ 0x11, CONSTANT, %xmm6
+ PCLMULQDQ 0x11, CONSTANT, %xmm7
+#ifdef __x86_64__
+ PCLMULQDQ 0x11, CONSTANT, %xmm8
+#endif
+ pxor %xmm5, %xmm1
+ pxor %xmm6, %xmm2
+ pxor %xmm7, %xmm3
+#ifdef __x86_64__
+ pxor %xmm8, %xmm4
+#else
+ /* xmm8 unsupported for x32 */
+ movdqa %xmm4, %xmm5
+ PCLMULQDQ 00, CONSTANT, %xmm4
+ PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pxor %xmm5, %xmm4
+#endif
+
+ pxor (BUF), %xmm1
+ pxor 0x10(BUF), %xmm2
+ pxor 0x20(BUF), %xmm3
+ pxor 0x30(BUF), %xmm4
+
+ sub $0x40, LEN
+ add $0x40, BUF
+ cmp $0x40, LEN
+ jge loop_64
+less_64:/* Folding cache line into 128bit */
+#ifdef __x86_64__
+ movdqa .Lconstant_R4R3(%rip), CONSTANT
+#else
+ movdqa .Lconstant_R4R3, CONSTANT
+#endif
+ prefetchnta (BUF)
+
+ movdqa %xmm1, %xmm5
+ PCLMULQDQ 0x00, CONSTANT, %xmm1
+ PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pxor %xmm5, %xmm1
+ pxor %xmm2, %xmm1
+
+ movdqa %xmm1, %xmm5
+ PCLMULQDQ 0x00, CONSTANT, %xmm1
+ PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pxor %xmm5, %xmm1
+ pxor %xmm3, %xmm1
+
+ movdqa %xmm1, %xmm5
+ PCLMULQDQ 0x00, CONSTANT, %xmm1
+ PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pxor %xmm5, %xmm1
+ pxor %xmm4, %xmm1
+
+ cmp $0x10, LEN
+ jb fold_64
+loop_16:/* Folding rest buffer into 128bit */
+ movdqa %xmm1, %xmm5
+ PCLMULQDQ 0x00, CONSTANT, %xmm1
+ PCLMULQDQ 0x11, CONSTANT, %xmm5
+ pxor %xmm5, %xmm1
+ pxor (BUF), %xmm1
+ sub $0x10, LEN
+ add $0x10, BUF
+ cmp $0x10, LEN
+ jge loop_16
+
+fold_64:
+ /* perform the last 64 bit fold, also adds 32 zeroes
+ * to the input stream */
+ PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
+ psrldq $0x08, %xmm1
+ pxor CONSTANT, %xmm1
+
+ /* final 32-bit fold */
+ movdqa %xmm1, %xmm2
+#ifdef __x86_64__
+ movdqa .Lconstant_R5(%rip), CONSTANT
+ movdqa .Lconstant_mask32(%rip), %xmm3
+#else
+ movdqa .Lconstant_R5, CONSTANT
+ movdqa .Lconstant_mask32, %xmm3
+#endif
+ psrldq $0x04, %xmm2
+ pand %xmm3, %xmm1
+ PCLMULQDQ 0x00, CONSTANT, %xmm1
+ pxor %xmm2, %xmm1
+
+ /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
+#ifdef __x86_64__
+ movdqa .Lconstant_RUpoly(%rip), CONSTANT
+#else
+ movdqa .Lconstant_RUpoly, CONSTANT
+#endif
+ movdqa %xmm1, %xmm2
+ pand %xmm3, %xmm1
+ PCLMULQDQ 0x10, CONSTANT, %xmm1
+ pand %xmm3, %xmm1
+ PCLMULQDQ 0x00, CONSTANT, %xmm1
+ pxor %xmm2, %xmm1
+ PEXTRD 0x01, %xmm1, %eax
+
+ ret
+ENDPROC(crc32_pclmul_le_16)
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
new file mode 100644
index 000000000..c8d9cdacb
--- /dev/null
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -0,0 +1,202 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see http://www.gnu.org/licenses
+ *
+ * Please visit http://www.xyratex.com/contact if you need additional
+ * information or have any questions.
+ *
+ * GPL HEADER END
+ */
+
+/*
+ * Copyright 2012 Xyratex Technology Limited
+ *
+ * Wrappers for kernel crypto shash api to pclmulqdq crc32 imlementation.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/crc32.h>
+#include <crypto/internal/hash.h>
+
+#include <asm/cpufeatures.h>
+#include <asm/cpu_device_id.h>
+#include <asm/fpu/api.h>
+
+#define CHKSUM_BLOCK_SIZE 1
+#define CHKSUM_DIGEST_SIZE 4
+
+#define PCLMUL_MIN_LEN 64L /* minimum size of buffer
+ * for crc32_pclmul_le_16 */
+#define SCALE_F 16L /* size of xmm register */
+#define SCALE_F_MASK (SCALE_F - 1)
+
+u32 crc32_pclmul_le_16(unsigned char const *buffer, size_t len, u32 crc32);
+
+static u32 __attribute__((pure))
+ crc32_pclmul_le(u32 crc, unsigned char const *p, size_t len)
+{
+ unsigned int iquotient;
+ unsigned int iremainder;
+ unsigned int prealign;
+
+ if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !irq_fpu_usable())
+ return crc32_le(crc, p, len);
+
+ if ((long)p & SCALE_F_MASK) {
+ /* align p to 16 byte */
+ prealign = SCALE_F - ((long)p & SCALE_F_MASK);
+
+ crc = crc32_le(crc, p, prealign);
+ len -= prealign;
+ p = (unsigned char *)(((unsigned long)p + SCALE_F_MASK) &
+ ~SCALE_F_MASK);
+ }
+ iquotient = len & (~SCALE_F_MASK);
+ iremainder = len & SCALE_F_MASK;
+
+ kernel_fpu_begin();
+ crc = crc32_pclmul_le_16(p, iquotient, crc);
+ kernel_fpu_end();
+
+ if (iremainder)
+ crc = crc32_le(crc, p + iquotient, iremainder);
+
+ return crc;
+}
+
+static int crc32_pclmul_cra_init(struct crypto_tfm *tfm)
+{
+ u32 *key = crypto_tfm_ctx(tfm);
+
+ *key = 0;
+
+ return 0;
+}
+
+static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key,
+ unsigned int keylen)
+{
+ u32 *mctx = crypto_shash_ctx(hash);
+
+ if (keylen != sizeof(u32)) {
+ crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+ *mctx = le32_to_cpup((__le32 *)key);
+ return 0;
+}
+
+static int crc32_pclmul_init(struct shash_desc *desc)
+{
+ u32 *mctx = crypto_shash_ctx(desc->tfm);
+ u32 *crcp = shash_desc_ctx(desc);
+
+ *crcp = *mctx;
+
+ return 0;
+}
+
+static int crc32_pclmul_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ u32 *crcp = shash_desc_ctx(desc);
+
+ *crcp = crc32_pclmul_le(*crcp, data, len);
+ return 0;
+}
+
+/* No final XOR 0xFFFFFFFF, like crc32_le */
+static int __crc32_pclmul_finup(u32 *crcp, const u8 *data, unsigned int len,
+ u8 *out)
+{
+ *(__le32 *)out = cpu_to_le32(crc32_pclmul_le(*crcp, data, len));
+ return 0;
+}
+
+static int crc32_pclmul_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __crc32_pclmul_finup(shash_desc_ctx(desc), data, len, out);
+}
+
+static int crc32_pclmul_final(struct shash_desc *desc, u8 *out)
+{
+ u32 *crcp = shash_desc_ctx(desc);
+
+ *(__le32 *)out = cpu_to_le32p(crcp);
+ return 0;
+}
+
+static int crc32_pclmul_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __crc32_pclmul_finup(crypto_shash_ctx(desc->tfm), data, len,
+ out);
+}
+
+static struct shash_alg alg = {
+ .setkey = crc32_pclmul_setkey,
+ .init = crc32_pclmul_init,
+ .update = crc32_pclmul_update,
+ .final = crc32_pclmul_final,
+ .finup = crc32_pclmul_finup,
+ .digest = crc32_pclmul_digest,
+ .descsize = sizeof(u32),
+ .digestsize = CHKSUM_DIGEST_SIZE,
+ .base = {
+ .cra_name = "crc32",
+ .cra_driver_name = "crc32-pclmul",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .cra_blocksize = CHKSUM_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(u32),
+ .cra_module = THIS_MODULE,
+ .cra_init = crc32_pclmul_cra_init,
+ }
+};
+
+static const struct x86_cpu_id crc32pclmul_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, crc32pclmul_cpu_id);
+
+
+static int __init crc32_pclmul_mod_init(void)
+{
+
+ if (!x86_match_cpu(crc32pclmul_cpu_id)) {
+ pr_info("PCLMULQDQ-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+ return crypto_register_shash(&alg);
+}
+
+static void __exit crc32_pclmul_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(crc32_pclmul_mod_init);
+module_exit(crc32_pclmul_mod_fini);
+
+MODULE_AUTHOR("Alexander Boyko <alexander_boyko@xyratex.com>");
+MODULE_LICENSE("GPL");
+
+MODULE_ALIAS_CRYPTO("crc32");
+MODULE_ALIAS_CRYPTO("crc32-pclmul");
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
new file mode 100644
index 000000000..5773e1161
--- /dev/null
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -0,0 +1,270 @@
+/*
+ * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal.
+ * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
+ * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2A: Instruction Set Reference, A-M
+ *
+ * Copyright (C) 2008 Intel Corporation
+ * Authors: Austin Zhang <austin_zhang@linux.intel.com>
+ * Kent Liu <kent.liu@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <crypto/internal/hash.h>
+
+#include <asm/cpufeatures.h>
+#include <asm/cpu_device_id.h>
+#include <asm/fpu/internal.h>
+
+#define CHKSUM_BLOCK_SIZE 1
+#define CHKSUM_DIGEST_SIZE 4
+
+#define SCALE_F sizeof(unsigned long)
+
+#ifdef CONFIG_X86_64
+#define REX_PRE "0x48, "
+#else
+#define REX_PRE
+#endif
+
+#ifdef CONFIG_X86_64
+/*
+ * use carryless multiply version of crc32c when buffer
+ * size is >= 512 to account
+ * for fpu state save/restore overhead.
+ */
+#define CRC32C_PCL_BREAKEVEN 512
+
+asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
+ unsigned int crc_init);
+#endif /* CONFIG_X86_64 */
+
+static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
+{
+ while (length--) {
+ __asm__ __volatile__(
+ ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
+ :"=S"(crc)
+ :"0"(crc), "c"(*data)
+ );
+ data++;
+ }
+
+ return crc;
+}
+
+static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len)
+{
+ unsigned int iquotient = len / SCALE_F;
+ unsigned int iremainder = len % SCALE_F;
+ unsigned long *ptmp = (unsigned long *)p;
+
+ while (iquotient--) {
+ __asm__ __volatile__(
+ ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
+ :"=S"(crc)
+ :"0"(crc), "c"(*ptmp)
+ );
+ ptmp++;
+ }
+
+ if (iremainder)
+ crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp,
+ iremainder);
+
+ return crc;
+}
+
+/*
+ * Setting the seed allows arbitrary accumulators and flexible XOR policy
+ * If your algorithm starts with ~0, then XOR with ~0 before you set
+ * the seed.
+ */
+static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key,
+ unsigned int keylen)
+{
+ u32 *mctx = crypto_shash_ctx(hash);
+
+ if (keylen != sizeof(u32)) {
+ crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+ *mctx = le32_to_cpup((__le32 *)key);
+ return 0;
+}
+
+static int crc32c_intel_init(struct shash_desc *desc)
+{
+ u32 *mctx = crypto_shash_ctx(desc->tfm);
+ u32 *crcp = shash_desc_ctx(desc);
+
+ *crcp = *mctx;
+
+ return 0;
+}
+
+static int crc32c_intel_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ u32 *crcp = shash_desc_ctx(desc);
+
+ *crcp = crc32c_intel_le_hw(*crcp, data, len);
+ return 0;
+}
+
+static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
+ u8 *out)
+{
+ *(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
+ return 0;
+}
+
+static int crc32c_intel_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __crc32c_intel_finup(shash_desc_ctx(desc), data, len, out);
+}
+
+static int crc32c_intel_final(struct shash_desc *desc, u8 *out)
+{
+ u32 *crcp = shash_desc_ctx(desc);
+
+ *(__le32 *)out = ~cpu_to_le32p(crcp);
+ return 0;
+}
+
+static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __crc32c_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
+ out);
+}
+
+static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
+{
+ u32 *key = crypto_tfm_ctx(tfm);
+
+ *key = ~0;
+
+ return 0;
+}
+
+#ifdef CONFIG_X86_64
+static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ u32 *crcp = shash_desc_ctx(desc);
+
+ /*
+ * use faster PCL version if datasize is large enough to
+ * overcome kernel fpu state save/restore overhead
+ */
+ if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) {
+ kernel_fpu_begin();
+ *crcp = crc_pcl(data, len, *crcp);
+ kernel_fpu_end();
+ } else
+ *crcp = crc32c_intel_le_hw(*crcp, data, len);
+ return 0;
+}
+
+static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
+ u8 *out)
+{
+ if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) {
+ kernel_fpu_begin();
+ *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
+ kernel_fpu_end();
+ } else
+ *(__le32 *)out =
+ ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
+ return 0;
+}
+
+static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __crc32c_pcl_intel_finup(shash_desc_ctx(desc), data, len, out);
+}
+
+static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __crc32c_pcl_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
+ out);
+}
+#endif /* CONFIG_X86_64 */
+
+static struct shash_alg alg = {
+ .setkey = crc32c_intel_setkey,
+ .init = crc32c_intel_init,
+ .update = crc32c_intel_update,
+ .final = crc32c_intel_final,
+ .finup = crc32c_intel_finup,
+ .digest = crc32c_intel_digest,
+ .descsize = sizeof(u32),
+ .digestsize = CHKSUM_DIGEST_SIZE,
+ .base = {
+ .cra_name = "crc32c",
+ .cra_driver_name = "crc32c-intel",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .cra_blocksize = CHKSUM_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(u32),
+ .cra_module = THIS_MODULE,
+ .cra_init = crc32c_intel_cra_init,
+ }
+};
+
+static const struct x86_cpu_id crc32c_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_XMM4_2),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, crc32c_cpu_id);
+
+static int __init crc32c_intel_mod_init(void)
+{
+ if (!x86_match_cpu(crc32c_cpu_id))
+ return -ENODEV;
+#ifdef CONFIG_X86_64
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
+ alg.update = crc32c_pcl_intel_update;
+ alg.finup = crc32c_pcl_intel_finup;
+ alg.digest = crc32c_pcl_intel_digest;
+ }
+#endif
+ return crypto_register_shash(&alg);
+}
+
+static void __exit crc32c_intel_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(crc32c_intel_mod_init);
+module_exit(crc32c_intel_mod_fini);
+
+MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.com>");
+MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware.");
+MODULE_LICENSE("GPL");
+
+MODULE_ALIAS_CRYPTO("crc32c");
+MODULE_ALIAS_CRYPTO("crc32c-intel");
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
new file mode 100644
index 000000000..3c6e01520
--- /dev/null
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -0,0 +1,464 @@
+/*
+ * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
+ *
+ * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
+ * downloaded from:
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
+ *
+ * Copyright (C) 2012 Intel Corporation.
+ *
+ * Authors:
+ * Wajdi Feghali <wajdi.k.feghali@intel.com>
+ * James Guilford <james.guilford@intel.com>
+ * David Cote <david.m.cote@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm/inst.h>
+#include <linux/linkage.h>
+#include <asm/nospec-branch.h>
+
+## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
+
+.macro LABEL prefix n
+\prefix\n\():
+.endm
+
+.macro JMPTBL_ENTRY i
+.word crc_\i - crc_array
+.endm
+
+.macro JNC_LESS_THAN j
+ jnc less_than_\j
+.endm
+
+# Define threshold where buffers are considered "small" and routed to more
+# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
+# SMALL_SIZE can be no larger than 255.
+
+#define SMALL_SIZE 200
+
+.if (SMALL_SIZE > 255)
+.error "SMALL_ SIZE must be < 256"
+.endif
+
+# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
+
+.text
+ENTRY(crc_pcl)
+#define bufp %rdi
+#define bufp_dw %edi
+#define bufp_w %di
+#define bufp_b %dil
+#define bufptmp %rcx
+#define block_0 %rcx
+#define block_1 %rdx
+#define block_2 %r11
+#define len %rsi
+#define len_dw %esi
+#define len_w %si
+#define len_b %sil
+#define crc_init_arg %rdx
+#define tmp %rbx
+#define crc_init %r8
+#define crc_init_dw %r8d
+#define crc1 %r9
+#define crc2 %r10
+
+ pushq %rbx
+ pushq %rdi
+ pushq %rsi
+
+ ## Move crc_init for Linux to a different
+ mov crc_init_arg, crc_init
+
+ ################################################################
+ ## 1) ALIGN:
+ ################################################################
+
+ mov bufp, bufptmp # rdi = *buf
+ neg bufp
+ and $7, bufp # calculate the unalignment amount of
+ # the address
+ je proc_block # Skip if aligned
+
+ ## If len is less than 8 and we're unaligned, we need to jump
+ ## to special code to avoid reading beyond the end of the buffer
+ cmp $8, len
+ jae do_align
+ # less_than_8 expects length in upper 3 bits of len_dw
+ # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
+ shl $32-3+1, len_dw
+ jmp less_than_8_post_shl1
+
+do_align:
+ #### Calculate CRC of unaligned bytes of the buffer (if any)
+ movq (bufptmp), tmp # load a quadward from the buffer
+ add bufp, bufptmp # align buffer pointer for quadword
+ # processing
+ sub bufp, len # update buffer length
+align_loop:
+ crc32b %bl, crc_init_dw # compute crc32 of 1-byte
+ shr $8, tmp # get next byte
+ dec bufp
+ jne align_loop
+
+proc_block:
+
+ ################################################################
+ ## 2) PROCESS BLOCKS:
+ ################################################################
+
+ ## compute num of bytes to be processed
+ movq len, tmp # save num bytes in tmp
+
+ cmpq $128*24, len
+ jae full_block
+
+continue_block:
+ cmpq $SMALL_SIZE, len
+ jb small
+
+ ## len < 128*24
+ movq $2731, %rax # 2731 = ceil(2^16 / 24)
+ mul len_dw
+ shrq $16, %rax
+
+ ## eax contains floor(bytes / 24) = num 24-byte chunks to do
+
+ ## process rax 24-byte chunks (128 >= rax >= 0)
+
+ ## compute end address of each block
+ ## block 0 (base addr + RAX * 8)
+ ## block 1 (base addr + RAX * 16)
+ ## block 2 (base addr + RAX * 24)
+ lea (bufptmp, %rax, 8), block_0
+ lea (block_0, %rax, 8), block_1
+ lea (block_1, %rax, 8), block_2
+
+ xor crc1, crc1
+ xor crc2, crc2
+
+ ## branch into array
+ lea jump_table(%rip), bufp
+ movzwq (bufp, %rax, 2), len
+ lea crc_array(%rip), bufp
+ lea (bufp, len, 1), bufp
+ JMP_NOSPEC bufp
+
+ ################################################################
+ ## 2a) PROCESS FULL BLOCKS:
+ ################################################################
+full_block:
+ movl $128,%eax
+ lea 128*8*2(block_0), block_1
+ lea 128*8*3(block_0), block_2
+ add $128*8*1, block_0
+
+ xor crc1,crc1
+ xor crc2,crc2
+
+ # Fall thruogh into top of crc array (crc_128)
+
+ ################################################################
+ ## 3) CRC Array:
+ ################################################################
+
+crc_array:
+ i=128
+.rept 128-1
+.altmacro
+LABEL crc_ %i
+.noaltmacro
+ crc32q -i*8(block_0), crc_init
+ crc32q -i*8(block_1), crc1
+ crc32q -i*8(block_2), crc2
+ i=(i-1)
+.endr
+
+.altmacro
+LABEL crc_ %i
+.noaltmacro
+ crc32q -i*8(block_0), crc_init
+ crc32q -i*8(block_1), crc1
+# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
+
+ mov block_2, block_0
+
+ ################################################################
+ ## 4) Combine three results:
+ ################################################################
+
+ lea (K_table-8)(%rip), bufp # first entry is for idx 1
+ shlq $3, %rax # rax *= 8
+ pmovzxdq (bufp,%rax), %xmm0 # 2 consts: K1:K2
+ leal (%eax,%eax,2), %eax # rax *= 3 (total *24)
+ subq %rax, tmp # tmp -= rax*24
+
+ movq crc_init, %xmm1 # CRC for block 1
+ PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2
+
+ movq crc1, %xmm2 # CRC for block 2
+ PCLMULQDQ 0x10, %xmm0, %xmm2 # Multiply by K1
+
+ pxor %xmm2,%xmm1
+ movq %xmm1, %rax
+ xor -i*8(block_2), %rax
+ mov crc2, crc_init
+ crc32 %rax, crc_init
+
+ ################################################################
+ ## 5) Check for end:
+ ################################################################
+
+LABEL crc_ 0
+ mov tmp, len
+ cmp $128*24, tmp
+ jae full_block
+ cmp $24, tmp
+ jae continue_block
+
+less_than_24:
+ shl $32-4, len_dw # less_than_16 expects length
+ # in upper 4 bits of len_dw
+ jnc less_than_16
+ crc32q (bufptmp), crc_init
+ crc32q 8(bufptmp), crc_init
+ jz do_return
+ add $16, bufptmp
+ # len is less than 8 if we got here
+ # less_than_8 expects length in upper 3 bits of len_dw
+ # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
+ shl $2, len_dw
+ jmp less_than_8_post_shl1
+
+ #######################################################################
+ ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
+ #######################################################################
+small:
+ shl $32-8, len_dw # Prepare len_dw for less_than_256
+ j=256
+.rept 5 # j = {256, 128, 64, 32, 16}
+.altmacro
+LABEL less_than_ %j # less_than_j: Length should be in
+ # upper lg(j) bits of len_dw
+ j=(j/2)
+ shl $1, len_dw # Get next MSB
+ JNC_LESS_THAN %j
+.noaltmacro
+ i=0
+.rept (j/8)
+ crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data
+ i=i+8
+.endr
+ jz do_return # Return if remaining length is zero
+ add $j, bufptmp # Advance buf
+.endr
+
+less_than_8: # Length should be stored in
+ # upper 3 bits of len_dw
+ shl $1, len_dw
+less_than_8_post_shl1:
+ jnc less_than_4
+ crc32l (bufptmp), crc_init_dw # CRC of 4 bytes
+ jz do_return # return if remaining data is zero
+ add $4, bufptmp
+less_than_4: # Length should be stored in
+ # upper 2 bits of len_dw
+ shl $1, len_dw
+ jnc less_than_2
+ crc32w (bufptmp), crc_init_dw # CRC of 2 bytes
+ jz do_return # return if remaining data is zero
+ add $2, bufptmp
+less_than_2: # Length should be stored in the MSB
+ # of len_dw
+ shl $1, len_dw
+ jnc less_than_1
+ crc32b (bufptmp), crc_init_dw # CRC of 1 byte
+less_than_1: # Length should be zero
+do_return:
+ movq crc_init, %rax
+ popq %rsi
+ popq %rdi
+ popq %rbx
+ ret
+ENDPROC(crc_pcl)
+
+.section .rodata, "a", @progbits
+ ################################################################
+ ## jump table Table is 129 entries x 2 bytes each
+ ################################################################
+.align 4
+jump_table:
+ i=0
+.rept 129
+.altmacro
+JMPTBL_ENTRY %i
+.noaltmacro
+ i=i+1
+.endr
+
+
+ ################################################################
+ ## PCLMULQDQ tables
+ ## Table is 128 entries x 2 words (8 bytes) each
+ ################################################################
+.align 8
+K_table:
+ .long 0x493c7d27, 0x00000001
+ .long 0xba4fc28e, 0x493c7d27
+ .long 0xddc0152b, 0xf20c0dfe
+ .long 0x9e4addf8, 0xba4fc28e
+ .long 0x39d3b296, 0x3da6d0cb
+ .long 0x0715ce53, 0xddc0152b
+ .long 0x47db8317, 0x1c291d04
+ .long 0x0d3b6092, 0x9e4addf8
+ .long 0xc96cfdc0, 0x740eef02
+ .long 0x878a92a7, 0x39d3b296
+ .long 0xdaece73e, 0x083a6eec
+ .long 0xab7aff2a, 0x0715ce53
+ .long 0x2162d385, 0xc49f4f67
+ .long 0x83348832, 0x47db8317
+ .long 0x299847d5, 0x2ad91c30
+ .long 0xb9e02b86, 0x0d3b6092
+ .long 0x18b33a4e, 0x6992cea2
+ .long 0xb6dd949b, 0xc96cfdc0
+ .long 0x78d9ccb7, 0x7e908048
+ .long 0xbac2fd7b, 0x878a92a7
+ .long 0xa60ce07b, 0x1b3d8f29
+ .long 0xce7f39f4, 0xdaece73e
+ .long 0x61d82e56, 0xf1d0f55e
+ .long 0xd270f1a2, 0xab7aff2a
+ .long 0xc619809d, 0xa87ab8a8
+ .long 0x2b3cac5d, 0x2162d385
+ .long 0x65863b64, 0x8462d800
+ .long 0x1b03397f, 0x83348832
+ .long 0xebb883bd, 0x71d111a8
+ .long 0xb3e32c28, 0x299847d5
+ .long 0x064f7f26, 0xffd852c6
+ .long 0xdd7e3b0c, 0xb9e02b86
+ .long 0xf285651c, 0xdcb17aa4
+ .long 0x10746f3c, 0x18b33a4e
+ .long 0xc7a68855, 0xf37c5aee
+ .long 0x271d9844, 0xb6dd949b
+ .long 0x8e766a0c, 0x6051d5a2
+ .long 0x93a5f730, 0x78d9ccb7
+ .long 0x6cb08e5c, 0x18b0d4ff
+ .long 0x6b749fb2, 0xbac2fd7b
+ .long 0x1393e203, 0x21f3d99c
+ .long 0xcec3662e, 0xa60ce07b
+ .long 0x96c515bb, 0x8f158014
+ .long 0xe6fc4e6a, 0xce7f39f4
+ .long 0x8227bb8a, 0xa00457f7
+ .long 0xb0cd4768, 0x61d82e56
+ .long 0x39c7ff35, 0x8d6d2c43
+ .long 0xd7a4825c, 0xd270f1a2
+ .long 0x0ab3844b, 0x00ac29cf
+ .long 0x0167d312, 0xc619809d
+ .long 0xf6076544, 0xe9adf796
+ .long 0x26f6a60a, 0x2b3cac5d
+ .long 0xa741c1bf, 0x96638b34
+ .long 0x98d8d9cb, 0x65863b64
+ .long 0x49c3cc9c, 0xe0e9f351
+ .long 0x68bce87a, 0x1b03397f
+ .long 0x57a3d037, 0x9af01f2d
+ .long 0x6956fc3b, 0xebb883bd
+ .long 0x42d98888, 0x2cff42cf
+ .long 0x3771e98f, 0xb3e32c28
+ .long 0xb42ae3d9, 0x88f25a3a
+ .long 0x2178513a, 0x064f7f26
+ .long 0xe0ac139e, 0x4e36f0b0
+ .long 0x170076fa, 0xdd7e3b0c
+ .long 0x444dd413, 0xbd6f81f8
+ .long 0x6f345e45, 0xf285651c
+ .long 0x41d17b64, 0x91c9bd4b
+ .long 0xff0dba97, 0x10746f3c
+ .long 0xa2b73df1, 0x885f087b
+ .long 0xf872e54c, 0xc7a68855
+ .long 0x1e41e9fc, 0x4c144932
+ .long 0x86d8e4d2, 0x271d9844
+ .long 0x651bd98b, 0x52148f02
+ .long 0x5bb8f1bc, 0x8e766a0c
+ .long 0xa90fd27a, 0xa3c6f37a
+ .long 0xb3af077a, 0x93a5f730
+ .long 0x4984d782, 0xd7c0557f
+ .long 0xca6ef3ac, 0x6cb08e5c
+ .long 0x234e0b26, 0x63ded06a
+ .long 0xdd66cbbb, 0x6b749fb2
+ .long 0x4597456a, 0x4d56973c
+ .long 0xe9e28eb4, 0x1393e203
+ .long 0x7b3ff57a, 0x9669c9df
+ .long 0xc9c8b782, 0xcec3662e
+ .long 0x3f70cc6f, 0xe417f38a
+ .long 0x93e106a4, 0x96c515bb
+ .long 0x62ec6c6d, 0x4b9e0f71
+ .long 0xd813b325, 0xe6fc4e6a
+ .long 0x0df04680, 0xd104b8fc
+ .long 0x2342001e, 0x8227bb8a
+ .long 0x0a2a8d7e, 0x5b397730
+ .long 0x6d9a4957, 0xb0cd4768
+ .long 0xe8b6368b, 0xe78eb416
+ .long 0xd2c3ed1a, 0x39c7ff35
+ .long 0x995a5724, 0x61ff0e01
+ .long 0x9ef68d35, 0xd7a4825c
+ .long 0x0c139b31, 0x8d96551c
+ .long 0xf2271e60, 0x0ab3844b
+ .long 0x0b0bf8ca, 0x0bf80dd2
+ .long 0x2664fd8b, 0x0167d312
+ .long 0xed64812d, 0x8821abed
+ .long 0x02ee03b2, 0xf6076544
+ .long 0x8604ae0f, 0x6a45d2b2
+ .long 0x363bd6b3, 0x26f6a60a
+ .long 0x135c83fd, 0xd8d26619
+ .long 0x5fabe670, 0xa741c1bf
+ .long 0x35ec3279, 0xde87806c
+ .long 0x00bcf5f6, 0x98d8d9cb
+ .long 0x8ae00689, 0x14338754
+ .long 0x17f27698, 0x49c3cc9c
+ .long 0x58ca5f00, 0x5bd2011f
+ .long 0xaa7c7ad5, 0x68bce87a
+ .long 0xb5cfca28, 0xdd07448e
+ .long 0xded288f8, 0x57a3d037
+ .long 0x59f229bc, 0xdde8f5b9
+ .long 0x6d390dec, 0x6956fc3b
+ .long 0x37170390, 0xa3e3e02c
+ .long 0x6353c1cc, 0x42d98888
+ .long 0xc4584f5c, 0xd73c7bea
+ .long 0xf48642e9, 0x3771e98f
+ .long 0x531377e2, 0x80ff0093
+ .long 0xdd35bc8d, 0xb42ae3d9
+ .long 0xb25b29f2, 0x8fe4c34d
+ .long 0x9a5ede41, 0x2178513a
+ .long 0xa563905d, 0xdf99fc11
+ .long 0x45cddf4e, 0xe0ac139e
+ .long 0xacfa3103, 0x6c23e841
+ .long 0xa51b6135, 0x170076fa
diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S
new file mode 100644
index 000000000..de04d3e98
--- /dev/null
+++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S
@@ -0,0 +1,651 @@
+########################################################################
+# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
+#
+# Copyright (c) 2013, Intel Corporation
+#
+# Authors:
+# Erdinc Ozturk <erdinc.ozturk@intel.com>
+# Vinodh Gopal <vinodh.gopal@intel.com>
+# James Guilford <james.guilford@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the
+# distribution.
+#
+# * Neither the name of the Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+#
+# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+# Function API:
+# UINT16 crc_t10dif_pcl(
+# UINT16 init_crc, //initial CRC value, 16 bits
+# const unsigned char *buf, //buffer pointer to calculate CRC on
+# UINT64 len //buffer length in bytes (64-bit data)
+# );
+#
+# Reference paper titled "Fast CRC Computation for Generic
+# Polynomials Using PCLMULQDQ Instruction"
+# URL: http://www.intel.com/content/dam/www/public/us/en/documents
+# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+#
+#
+
+#include <linux/linkage.h>
+
+.text
+
+#define arg1 %rdi
+#define arg2 %rsi
+#define arg3 %rdx
+
+#define arg1_low32 %edi
+
+ENTRY(crc_t10dif_pcl)
+.align 16
+
+ # adjust the 16-bit initial_crc value, scale it to 32 bits
+ shl $16, arg1_low32
+
+ # Allocate Stack Space
+ mov %rsp, %rcx
+ sub $16*2, %rsp
+ # align stack to 16 byte boundary
+ and $~(0x10 - 1), %rsp
+
+ # check if smaller than 256
+ cmp $256, arg3
+
+ # for sizes less than 128, we can't fold 64B at a time...
+ jl _less_than_128
+
+
+ # load the initial crc value
+ movd arg1_low32, %xmm10 # initial crc
+
+ # crc value does not need to be byte-reflected, but it needs
+ # to be moved to the high part of the register.
+ # because data will be byte-reflected and will align with
+ # initial crc at correct place.
+ pslldq $12, %xmm10
+
+ movdqa SHUF_MASK(%rip), %xmm11
+ # receive the initial 64B data, xor the initial crc value
+ movdqu 16*0(arg2), %xmm0
+ movdqu 16*1(arg2), %xmm1
+ movdqu 16*2(arg2), %xmm2
+ movdqu 16*3(arg2), %xmm3
+ movdqu 16*4(arg2), %xmm4
+ movdqu 16*5(arg2), %xmm5
+ movdqu 16*6(arg2), %xmm6
+ movdqu 16*7(arg2), %xmm7
+
+ pshufb %xmm11, %xmm0
+ # XOR the initial_crc value
+ pxor %xmm10, %xmm0
+ pshufb %xmm11, %xmm1
+ pshufb %xmm11, %xmm2
+ pshufb %xmm11, %xmm3
+ pshufb %xmm11, %xmm4
+ pshufb %xmm11, %xmm5
+ pshufb %xmm11, %xmm6
+ pshufb %xmm11, %xmm7
+
+ movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4
+ #imm value of pclmulqdq instruction
+ #will determine which constant to use
+
+ #################################################################
+ # we subtract 256 instead of 128 to save one instruction from the loop
+ sub $256, arg3
+
+ # at this section of the code, there is 64*x+y (0<=y<64) bytes of
+ # buffer. The _fold_64_B_loop will fold 64B at a time
+ # until we have 64+y Bytes of buffer
+
+
+ # fold 64B at a time. This section of the code folds 4 xmm
+ # registers in parallel
+_fold_64_B_loop:
+
+ # update the buffer pointer
+ add $128, arg2 # buf += 64#
+
+ movdqu 16*0(arg2), %xmm9
+ movdqu 16*1(arg2), %xmm12
+ pshufb %xmm11, %xmm9
+ pshufb %xmm11, %xmm12
+ movdqa %xmm0, %xmm8
+ movdqa %xmm1, %xmm13
+ pclmulqdq $0x0 , %xmm10, %xmm0
+ pclmulqdq $0x11, %xmm10, %xmm8
+ pclmulqdq $0x0 , %xmm10, %xmm1
+ pclmulqdq $0x11, %xmm10, %xmm13
+ pxor %xmm9 , %xmm0
+ xorps %xmm8 , %xmm0
+ pxor %xmm12, %xmm1
+ xorps %xmm13, %xmm1
+
+ movdqu 16*2(arg2), %xmm9
+ movdqu 16*3(arg2), %xmm12
+ pshufb %xmm11, %xmm9
+ pshufb %xmm11, %xmm12
+ movdqa %xmm2, %xmm8
+ movdqa %xmm3, %xmm13
+ pclmulqdq $0x0, %xmm10, %xmm2
+ pclmulqdq $0x11, %xmm10, %xmm8
+ pclmulqdq $0x0, %xmm10, %xmm3
+ pclmulqdq $0x11, %xmm10, %xmm13
+ pxor %xmm9 , %xmm2
+ xorps %xmm8 , %xmm2
+ pxor %xmm12, %xmm3
+ xorps %xmm13, %xmm3
+
+ movdqu 16*4(arg2), %xmm9
+ movdqu 16*5(arg2), %xmm12
+ pshufb %xmm11, %xmm9
+ pshufb %xmm11, %xmm12
+ movdqa %xmm4, %xmm8
+ movdqa %xmm5, %xmm13
+ pclmulqdq $0x0, %xmm10, %xmm4
+ pclmulqdq $0x11, %xmm10, %xmm8
+ pclmulqdq $0x0, %xmm10, %xmm5
+ pclmulqdq $0x11, %xmm10, %xmm13
+ pxor %xmm9 , %xmm4
+ xorps %xmm8 , %xmm4
+ pxor %xmm12, %xmm5
+ xorps %xmm13, %xmm5
+
+ movdqu 16*6(arg2), %xmm9
+ movdqu 16*7(arg2), %xmm12
+ pshufb %xmm11, %xmm9
+ pshufb %xmm11, %xmm12
+ movdqa %xmm6 , %xmm8
+ movdqa %xmm7 , %xmm13
+ pclmulqdq $0x0 , %xmm10, %xmm6
+ pclmulqdq $0x11, %xmm10, %xmm8
+ pclmulqdq $0x0 , %xmm10, %xmm7
+ pclmulqdq $0x11, %xmm10, %xmm13
+ pxor %xmm9 , %xmm6
+ xorps %xmm8 , %xmm6
+ pxor %xmm12, %xmm7
+ xorps %xmm13, %xmm7
+
+ sub $128, arg3
+
+ # check if there is another 64B in the buffer to be able to fold
+ jge _fold_64_B_loop
+ ##################################################################
+
+
+ add $128, arg2
+ # at this point, the buffer pointer is pointing at the last y Bytes
+ # of the buffer the 64B of folded data is in 4 of the xmm
+ # registers: xmm0, xmm1, xmm2, xmm3
+
+
+ # fold the 8 xmm registers to 1 xmm register with different constants
+
+ movdqa rk9(%rip), %xmm10
+ movdqa %xmm0, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm0
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ xorps %xmm0, %xmm7
+
+ movdqa rk11(%rip), %xmm10
+ movdqa %xmm1, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm1
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ xorps %xmm1, %xmm7
+
+ movdqa rk13(%rip), %xmm10
+ movdqa %xmm2, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm2
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ pxor %xmm2, %xmm7
+
+ movdqa rk15(%rip), %xmm10
+ movdqa %xmm3, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm3
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ xorps %xmm3, %xmm7
+
+ movdqa rk17(%rip), %xmm10
+ movdqa %xmm4, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm4
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ pxor %xmm4, %xmm7
+
+ movdqa rk19(%rip), %xmm10
+ movdqa %xmm5, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm5
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ xorps %xmm5, %xmm7
+
+ movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2
+ #imm value of pclmulqdq instruction
+ #will determine which constant to use
+ movdqa %xmm6, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm6
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ pxor %xmm6, %xmm7
+
+
+ # instead of 64, we add 48 to the loop counter to save 1 instruction
+ # from the loop instead of a cmp instruction, we use the negative
+ # flag with the jl instruction
+ add $128-16, arg3
+ jl _final_reduction_for_128
+
+ # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7
+ # and the rest is in memory. We can fold 16 bytes at a time if y>=16
+ # continue folding 16B at a time
+
+_16B_reduction_loop:
+ movdqa %xmm7, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm7
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ movdqu (arg2), %xmm0
+ pshufb %xmm11, %xmm0
+ pxor %xmm0 , %xmm7
+ add $16, arg2
+ sub $16, arg3
+ # instead of a cmp instruction, we utilize the flags with the
+ # jge instruction equivalent of: cmp arg3, 16-16
+ # check if there is any more 16B in the buffer to be able to fold
+ jge _16B_reduction_loop
+
+ #now we have 16+z bytes left to reduce, where 0<= z < 16.
+ #first, we reduce the data in the xmm7 register
+
+
+_final_reduction_for_128:
+ # check if any more data to fold. If not, compute the CRC of
+ # the final 128 bits
+ add $16, arg3
+ je _128_done
+
+ # here we are getting data that is less than 16 bytes.
+ # since we know that there was data before the pointer, we can
+ # offset the input pointer before the actual point, to receive
+ # exactly 16 bytes. after that the registers need to be adjusted.
+_get_last_two_xmms:
+ movdqa %xmm7, %xmm2
+
+ movdqu -16(arg2, arg3), %xmm1
+ pshufb %xmm11, %xmm1
+
+ # get rid of the extra data that was loaded before
+ # load the shift constant
+ lea pshufb_shf_table+16(%rip), %rax
+ sub arg3, %rax
+ movdqu (%rax), %xmm0
+
+ # shift xmm2 to the left by arg3 bytes
+ pshufb %xmm0, %xmm2
+
+ # shift xmm7 to the right by 16-arg3 bytes
+ pxor mask1(%rip), %xmm0
+ pshufb %xmm0, %xmm7
+ pblendvb %xmm2, %xmm1 #xmm0 is implicit
+
+ # fold 16 Bytes
+ movdqa %xmm1, %xmm2
+ movdqa %xmm7, %xmm8
+ pclmulqdq $0x11, %xmm10, %xmm7
+ pclmulqdq $0x0 , %xmm10, %xmm8
+ pxor %xmm8, %xmm7
+ pxor %xmm2, %xmm7
+
+_128_done:
+ # compute crc of a 128-bit value
+ movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10
+ movdqa %xmm7, %xmm0
+
+ #64b fold
+ pclmulqdq $0x1, %xmm10, %xmm7
+ pslldq $8 , %xmm0
+ pxor %xmm0, %xmm7
+
+ #32b fold
+ movdqa %xmm7, %xmm0
+
+ pand mask2(%rip), %xmm0
+
+ psrldq $12, %xmm7
+ pclmulqdq $0x10, %xmm10, %xmm7
+ pxor %xmm0, %xmm7
+
+ #barrett reduction
+_barrett:
+ movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10
+ movdqa %xmm7, %xmm0
+ pclmulqdq $0x01, %xmm10, %xmm7
+ pslldq $4, %xmm7
+ pclmulqdq $0x11, %xmm10, %xmm7
+
+ pslldq $4, %xmm7
+ pxor %xmm0, %xmm7
+ pextrd $1, %xmm7, %eax
+
+_cleanup:
+ # scale the result back to 16 bits
+ shr $16, %eax
+ mov %rcx, %rsp
+ ret
+
+########################################################################
+
+.align 16
+_less_than_128:
+
+ # check if there is enough buffer to be able to fold 16B at a time
+ cmp $32, arg3
+ jl _less_than_32
+ movdqa SHUF_MASK(%rip), %xmm11
+
+ # now if there is, load the constants
+ movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
+
+ movd arg1_low32, %xmm0 # get the initial crc value
+ pslldq $12, %xmm0 # align it to its correct place
+ movdqu (arg2), %xmm7 # load the plaintext
+ pshufb %xmm11, %xmm7 # byte-reflect the plaintext
+ pxor %xmm0, %xmm7
+
+
+ # update the buffer pointer
+ add $16, arg2
+
+ # update the counter. subtract 32 instead of 16 to save one
+ # instruction from the loop
+ sub $32, arg3
+
+ jmp _16B_reduction_loop
+
+
+.align 16
+_less_than_32:
+ # mov initial crc to the return value. this is necessary for
+ # zero-length buffers.
+ mov arg1_low32, %eax
+ test arg3, arg3
+ je _cleanup
+
+ movdqa SHUF_MASK(%rip), %xmm11
+
+ movd arg1_low32, %xmm0 # get the initial crc value
+ pslldq $12, %xmm0 # align it to its correct place
+
+ cmp $16, arg3
+ je _exact_16_left
+ jl _less_than_16_left
+
+ movdqu (arg2), %xmm7 # load the plaintext
+ pshufb %xmm11, %xmm7 # byte-reflect the plaintext
+ pxor %xmm0 , %xmm7 # xor the initial crc value
+ add $16, arg2
+ sub $16, arg3
+ movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
+ jmp _get_last_two_xmms
+
+
+.align 16
+_less_than_16_left:
+ # use stack space to load data less than 16 bytes, zero-out
+ # the 16B in memory first.
+
+ pxor %xmm1, %xmm1
+ mov %rsp, %r11
+ movdqa %xmm1, (%r11)
+
+ cmp $4, arg3
+ jl _only_less_than_4
+
+ # backup the counter value
+ mov arg3, %r9
+ cmp $8, arg3
+ jl _less_than_8_left
+
+ # load 8 Bytes
+ mov (arg2), %rax
+ mov %rax, (%r11)
+ add $8, %r11
+ sub $8, arg3
+ add $8, arg2
+_less_than_8_left:
+
+ cmp $4, arg3
+ jl _less_than_4_left
+
+ # load 4 Bytes
+ mov (arg2), %eax
+ mov %eax, (%r11)
+ add $4, %r11
+ sub $4, arg3
+ add $4, arg2
+_less_than_4_left:
+
+ cmp $2, arg3
+ jl _less_than_2_left
+
+ # load 2 Bytes
+ mov (arg2), %ax
+ mov %ax, (%r11)
+ add $2, %r11
+ sub $2, arg3
+ add $2, arg2
+_less_than_2_left:
+ cmp $1, arg3
+ jl _zero_left
+
+ # load 1 Byte
+ mov (arg2), %al
+ mov %al, (%r11)
+_zero_left:
+ movdqa (%rsp), %xmm7
+ pshufb %xmm11, %xmm7
+ pxor %xmm0 , %xmm7 # xor the initial crc value
+
+ # shl r9, 4
+ lea pshufb_shf_table+16(%rip), %rax
+ sub %r9, %rax
+ movdqu (%rax), %xmm0
+ pxor mask1(%rip), %xmm0
+
+ pshufb %xmm0, %xmm7
+ jmp _128_done
+
+.align 16
+_exact_16_left:
+ movdqu (arg2), %xmm7
+ pshufb %xmm11, %xmm7
+ pxor %xmm0 , %xmm7 # xor the initial crc value
+
+ jmp _128_done
+
+_only_less_than_4:
+ cmp $3, arg3
+ jl _only_less_than_3
+
+ # load 3 Bytes
+ mov (arg2), %al
+ mov %al, (%r11)
+
+ mov 1(arg2), %al
+ mov %al, 1(%r11)
+
+ mov 2(arg2), %al
+ mov %al, 2(%r11)
+
+ movdqa (%rsp), %xmm7
+ pshufb %xmm11, %xmm7
+ pxor %xmm0 , %xmm7 # xor the initial crc value
+
+ psrldq $5, %xmm7
+
+ jmp _barrett
+_only_less_than_3:
+ cmp $2, arg3
+ jl _only_less_than_2
+
+ # load 2 Bytes
+ mov (arg2), %al
+ mov %al, (%r11)
+
+ mov 1(arg2), %al
+ mov %al, 1(%r11)
+
+ movdqa (%rsp), %xmm7
+ pshufb %xmm11, %xmm7
+ pxor %xmm0 , %xmm7 # xor the initial crc value
+
+ psrldq $6, %xmm7
+
+ jmp _barrett
+_only_less_than_2:
+
+ # load 1 Byte
+ mov (arg2), %al
+ mov %al, (%r11)
+
+ movdqa (%rsp), %xmm7
+ pshufb %xmm11, %xmm7
+ pxor %xmm0 , %xmm7 # xor the initial crc value
+
+ psrldq $7, %xmm7
+
+ jmp _barrett
+
+ENDPROC(crc_t10dif_pcl)
+
+.section .rodata, "a", @progbits
+.align 16
+# precomputed constants
+# these constants are precomputed from the poly:
+# 0x8bb70000 (0x8bb7 scaled to 32 bits)
+# Q = 0x18BB70000
+# rk1 = 2^(32*3) mod Q << 32
+# rk2 = 2^(32*5) mod Q << 32
+# rk3 = 2^(32*15) mod Q << 32
+# rk4 = 2^(32*17) mod Q << 32
+# rk5 = 2^(32*3) mod Q << 32
+# rk6 = 2^(32*2) mod Q << 32
+# rk7 = floor(2^64/Q)
+# rk8 = Q
+rk1:
+.quad 0x2d56000000000000
+rk2:
+.quad 0x06df000000000000
+rk3:
+.quad 0x9d9d000000000000
+rk4:
+.quad 0x7cf5000000000000
+rk5:
+.quad 0x2d56000000000000
+rk6:
+.quad 0x1368000000000000
+rk7:
+.quad 0x00000001f65a57f8
+rk8:
+.quad 0x000000018bb70000
+
+rk9:
+.quad 0xceae000000000000
+rk10:
+.quad 0xbfd6000000000000
+rk11:
+.quad 0x1e16000000000000
+rk12:
+.quad 0x713c000000000000
+rk13:
+.quad 0xf7f9000000000000
+rk14:
+.quad 0x80a6000000000000
+rk15:
+.quad 0x044c000000000000
+rk16:
+.quad 0xe658000000000000
+rk17:
+.quad 0xad18000000000000
+rk18:
+.quad 0xa497000000000000
+rk19:
+.quad 0x6ee3000000000000
+rk20:
+.quad 0xe7b5000000000000
+
+
+
+.section .rodata.cst16.mask1, "aM", @progbits, 16
+.align 16
+mask1:
+.octa 0x80808080808080808080808080808080
+
+.section .rodata.cst16.mask2, "aM", @progbits, 16
+.align 16
+mask2:
+.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
+
+.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
+.align 16
+SHUF_MASK:
+.octa 0x000102030405060708090A0B0C0D0E0F
+
+.section .rodata.cst32.pshufb_shf_table, "aM", @progbits, 32
+.align 32
+pshufb_shf_table:
+# use these values for shift constants for the pshufb instruction
+# different alignments result in values as shown:
+# DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
+# DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
+# DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
+# DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
+# DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
+# DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
+# DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
+# DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
+# DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
+# DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
+# DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
+# DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
+# DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
+# DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
+# DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
+.octa 0x8f8e8d8c8b8a89888786858483828100
+.octa 0x000e0d0c0b0a09080706050403020100
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
new file mode 100644
index 000000000..7bbfe7d35
--- /dev/null
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -0,0 +1,148 @@
+/*
+ * Cryptographic API.
+ *
+ * T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions
+ *
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/crc-t10dif.h>
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <asm/fpu/api.h>
+#include <asm/cpufeatures.h>
+#include <asm/cpu_device_id.h>
+
+asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf,
+ size_t len);
+
+struct chksum_desc_ctx {
+ __u16 crc;
+};
+
+/*
+ * Steps through buffer one byte at at time, calculates reflected
+ * crc using table.
+ */
+
+static int chksum_init(struct shash_desc *desc)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ ctx->crc = 0;
+
+ return 0;
+}
+
+static int chksum_update(struct shash_desc *desc, const u8 *data,
+ unsigned int length)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ if (irq_fpu_usable()) {
+ kernel_fpu_begin();
+ ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);
+ kernel_fpu_end();
+ } else
+ ctx->crc = crc_t10dif_generic(ctx->crc, data, length);
+ return 0;
+}
+
+static int chksum_final(struct shash_desc *desc, u8 *out)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ *(__u16 *)out = ctx->crc;
+ return 0;
+}
+
+static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, u8 *out)
+{
+ if (irq_fpu_usable()) {
+ kernel_fpu_begin();
+ *(__u16 *)out = crc_t10dif_pcl(crc, data, len);
+ kernel_fpu_end();
+ } else
+ *(__u16 *)out = crc_t10dif_generic(crc, data, len);
+ return 0;
+}
+
+static int chksum_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ return __chksum_finup(ctx->crc, data, len, out);
+}
+
+static int chksum_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int length, u8 *out)
+{
+ return __chksum_finup(0, data, length, out);
+}
+
+static struct shash_alg alg = {
+ .digestsize = CRC_T10DIF_DIGEST_SIZE,
+ .init = chksum_init,
+ .update = chksum_update,
+ .final = chksum_final,
+ .finup = chksum_finup,
+ .digest = chksum_digest,
+ .descsize = sizeof(struct chksum_desc_ctx),
+ .base = {
+ .cra_name = "crct10dif",
+ .cra_driver_name = "crct10dif-pclmul",
+ .cra_priority = 200,
+ .cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static const struct x86_cpu_id crct10dif_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id);
+
+static int __init crct10dif_intel_mod_init(void)
+{
+ if (!x86_match_cpu(crct10dif_cpu_id))
+ return -ENODEV;
+
+ return crypto_register_shash(&alg);
+}
+
+static void __exit crct10dif_intel_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(crct10dif_intel_mod_init);
+module_exit(crct10dif_intel_mod_fini);
+
+MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>");
+MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ.");
+MODULE_LICENSE("GPL");
+
+MODULE_ALIAS_CRYPTO("crct10dif");
+MODULE_ALIAS_CRYPTO("crct10dif-pclmul");
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
new file mode 100644
index 000000000..8e49ce117
--- /dev/null
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -0,0 +1,808 @@
+/*
+ * des3_ede-asm_64.S - x86-64 assembly implementation of 3DES cipher
+ *
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/linkage.h>
+
+.file "des3_ede-asm_64.S"
+.text
+
+#define s1 .L_s1
+#define s2 ((s1) + (64*8))
+#define s3 ((s2) + (64*8))
+#define s4 ((s3) + (64*8))
+#define s5 ((s4) + (64*8))
+#define s6 ((s5) + (64*8))
+#define s7 ((s6) + (64*8))
+#define s8 ((s7) + (64*8))
+
+/* register macros */
+#define CTX %rdi
+
+#define RL0 %r8
+#define RL1 %r9
+#define RL2 %r10
+
+#define RL0d %r8d
+#define RL1d %r9d
+#define RL2d %r10d
+
+#define RR0 %r11
+#define RR1 %r12
+#define RR2 %r13
+
+#define RR0d %r11d
+#define RR1d %r12d
+#define RR2d %r13d
+
+#define RW0 %rax
+#define RW1 %rbx
+#define RW2 %rcx
+
+#define RW0d %eax
+#define RW1d %ebx
+#define RW2d %ecx
+
+#define RW0bl %al
+#define RW1bl %bl
+#define RW2bl %cl
+
+#define RW0bh %ah
+#define RW1bh %bh
+#define RW2bh %ch
+
+#define RT0 %r15
+#define RT1 %rsi
+#define RT2 %r14
+#define RT3 %rdx
+
+#define RT0d %r15d
+#define RT1d %esi
+#define RT2d %r14d
+#define RT3d %edx
+
+/***********************************************************************
+ * 1-way 3DES
+ ***********************************************************************/
+#define do_permutation(a, b, offset, mask) \
+ movl a, RT0d; \
+ shrl $(offset), RT0d; \
+ xorl b, RT0d; \
+ andl $(mask), RT0d; \
+ xorl RT0d, b; \
+ shll $(offset), RT0d; \
+ xorl RT0d, a;
+
+#define expand_to_64bits(val, mask) \
+ movl val##d, RT0d; \
+ rorl $4, RT0d; \
+ shlq $32, RT0; \
+ orq RT0, val; \
+ andq mask, val;
+
+#define compress_to_64bits(val) \
+ movq val, RT0; \
+ shrq $32, RT0; \
+ roll $4, RT0d; \
+ orl RT0d, val##d;
+
+#define initial_permutation(left, right) \
+ do_permutation(left##d, right##d, 4, 0x0f0f0f0f); \
+ do_permutation(left##d, right##d, 16, 0x0000ffff); \
+ do_permutation(right##d, left##d, 2, 0x33333333); \
+ do_permutation(right##d, left##d, 8, 0x00ff00ff); \
+ movabs $0x3f3f3f3f3f3f3f3f, RT3; \
+ movl left##d, RW0d; \
+ roll $1, right##d; \
+ xorl right##d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, left##d; \
+ xorl RW0d, right##d; \
+ roll $1, left##d; \
+ expand_to_64bits(right, RT3); \
+ expand_to_64bits(left, RT3);
+
+#define final_permutation(left, right) \
+ compress_to_64bits(right); \
+ compress_to_64bits(left); \
+ movl right##d, RW0d; \
+ rorl $1, left##d; \
+ xorl left##d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, right##d; \
+ xorl RW0d, left##d; \
+ rorl $1, right##d; \
+ do_permutation(right##d, left##d, 8, 0x00ff00ff); \
+ do_permutation(right##d, left##d, 2, 0x33333333); \
+ do_permutation(left##d, right##d, 16, 0x0000ffff); \
+ do_permutation(left##d, right##d, 4, 0x0f0f0f0f);
+
+#define round1(n, from, to, load_next_key) \
+ xorq from, RW0; \
+ \
+ movzbl RW0bl, RT0d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ movzbl RW0bl, RT2d; \
+ movzbl RW0bh, RT3d; \
+ shrq $16, RW0; \
+ movq s8(, RT0, 8), RT0; \
+ xorq s6(, RT1, 8), to; \
+ movzbl RW0bl, RL1d; \
+ movzbl RW0bh, RT1d; \
+ shrl $16, RW0d; \
+ xorq s4(, RT2, 8), RT0; \
+ xorq s2(, RT3, 8), to; \
+ movzbl RW0bl, RT2d; \
+ movzbl RW0bh, RT3d; \
+ xorq s7(, RL1, 8), RT0; \
+ xorq s5(, RT1, 8), to; \
+ xorq s3(, RT2, 8), RT0; \
+ load_next_key(n, RW0); \
+ xorq RT0, to; \
+ xorq s1(, RT3, 8), to; \
+
+#define load_next_key(n, RWx) \
+ movq (((n) + 1) * 8)(CTX), RWx;
+
+#define dummy2(a, b) /*_*/
+
+#define read_block(io, left, right) \
+ movl (io), left##d; \
+ movl 4(io), right##d; \
+ bswapl left##d; \
+ bswapl right##d;
+
+#define write_block(io, left, right) \
+ bswapl left##d; \
+ bswapl right##d; \
+ movl left##d, (io); \
+ movl right##d, 4(io);
+
+ENTRY(des3_ede_x86_64_crypt_blk)
+ /* input:
+ * %rdi: round keys, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ pushq %rbx;
+ pushq %r12;
+ pushq %r13;
+ pushq %r14;
+ pushq %r15;
+
+ pushq %rsi; /* dst */
+
+ read_block(%rdx, RL0, RR0);
+ initial_permutation(RL0, RR0);
+
+ movq (CTX), RW0;
+
+ round1(0, RR0, RL0, load_next_key);
+ round1(1, RL0, RR0, load_next_key);
+ round1(2, RR0, RL0, load_next_key);
+ round1(3, RL0, RR0, load_next_key);
+ round1(4, RR0, RL0, load_next_key);
+ round1(5, RL0, RR0, load_next_key);
+ round1(6, RR0, RL0, load_next_key);
+ round1(7, RL0, RR0, load_next_key);
+ round1(8, RR0, RL0, load_next_key);
+ round1(9, RL0, RR0, load_next_key);
+ round1(10, RR0, RL0, load_next_key);
+ round1(11, RL0, RR0, load_next_key);
+ round1(12, RR0, RL0, load_next_key);
+ round1(13, RL0, RR0, load_next_key);
+ round1(14, RR0, RL0, load_next_key);
+ round1(15, RL0, RR0, load_next_key);
+
+ round1(16+0, RL0, RR0, load_next_key);
+ round1(16+1, RR0, RL0, load_next_key);
+ round1(16+2, RL0, RR0, load_next_key);
+ round1(16+3, RR0, RL0, load_next_key);
+ round1(16+4, RL0, RR0, load_next_key);
+ round1(16+5, RR0, RL0, load_next_key);
+ round1(16+6, RL0, RR0, load_next_key);
+ round1(16+7, RR0, RL0, load_next_key);
+ round1(16+8, RL0, RR0, load_next_key);
+ round1(16+9, RR0, RL0, load_next_key);
+ round1(16+10, RL0, RR0, load_next_key);
+ round1(16+11, RR0, RL0, load_next_key);
+ round1(16+12, RL0, RR0, load_next_key);
+ round1(16+13, RR0, RL0, load_next_key);
+ round1(16+14, RL0, RR0, load_next_key);
+ round1(16+15, RR0, RL0, load_next_key);
+
+ round1(32+0, RR0, RL0, load_next_key);
+ round1(32+1, RL0, RR0, load_next_key);
+ round1(32+2, RR0, RL0, load_next_key);
+ round1(32+3, RL0, RR0, load_next_key);
+ round1(32+4, RR0, RL0, load_next_key);
+ round1(32+5, RL0, RR0, load_next_key);
+ round1(32+6, RR0, RL0, load_next_key);
+ round1(32+7, RL0, RR0, load_next_key);
+ round1(32+8, RR0, RL0, load_next_key);
+ round1(32+9, RL0, RR0, load_next_key);
+ round1(32+10, RR0, RL0, load_next_key);
+ round1(32+11, RL0, RR0, load_next_key);
+ round1(32+12, RR0, RL0, load_next_key);
+ round1(32+13, RL0, RR0, load_next_key);
+ round1(32+14, RR0, RL0, load_next_key);
+ round1(32+15, RL0, RR0, dummy2);
+
+ final_permutation(RR0, RL0);
+
+ popq %rsi /* dst */
+ write_block(%rsi, RR0, RL0);
+
+ popq %r15;
+ popq %r14;
+ popq %r13;
+ popq %r12;
+ popq %rbx;
+
+ ret;
+ENDPROC(des3_ede_x86_64_crypt_blk)
+
+/***********************************************************************
+ * 3-way 3DES
+ ***********************************************************************/
+#define expand_to_64bits(val, mask) \
+ movl val##d, RT0d; \
+ rorl $4, RT0d; \
+ shlq $32, RT0; \
+ orq RT0, val; \
+ andq mask, val;
+
+#define compress_to_64bits(val) \
+ movq val, RT0; \
+ shrq $32, RT0; \
+ roll $4, RT0d; \
+ orl RT0d, val##d;
+
+#define initial_permutation3(left, right) \
+ do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \
+ do_permutation(left##0d, right##0d, 16, 0x0000ffff); \
+ do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \
+ do_permutation(left##1d, right##1d, 16, 0x0000ffff); \
+ do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f); \
+ do_permutation(left##2d, right##2d, 16, 0x0000ffff); \
+ \
+ do_permutation(right##0d, left##0d, 2, 0x33333333); \
+ do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \
+ do_permutation(right##1d, left##1d, 2, 0x33333333); \
+ do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \
+ do_permutation(right##2d, left##2d, 2, 0x33333333); \
+ do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \
+ \
+ movabs $0x3f3f3f3f3f3f3f3f, RT3; \
+ \
+ movl left##0d, RW0d; \
+ roll $1, right##0d; \
+ xorl right##0d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, left##0d; \
+ xorl RW0d, right##0d; \
+ roll $1, left##0d; \
+ expand_to_64bits(right##0, RT3); \
+ expand_to_64bits(left##0, RT3); \
+ movl left##1d, RW1d; \
+ roll $1, right##1d; \
+ xorl right##1d, RW1d; \
+ andl $0xaaaaaaaa, RW1d; \
+ xorl RW1d, left##1d; \
+ xorl RW1d, right##1d; \
+ roll $1, left##1d; \
+ expand_to_64bits(right##1, RT3); \
+ expand_to_64bits(left##1, RT3); \
+ movl left##2d, RW2d; \
+ roll $1, right##2d; \
+ xorl right##2d, RW2d; \
+ andl $0xaaaaaaaa, RW2d; \
+ xorl RW2d, left##2d; \
+ xorl RW2d, right##2d; \
+ roll $1, left##2d; \
+ expand_to_64bits(right##2, RT3); \
+ expand_to_64bits(left##2, RT3);
+
+#define final_permutation3(left, right) \
+ compress_to_64bits(right##0); \
+ compress_to_64bits(left##0); \
+ movl right##0d, RW0d; \
+ rorl $1, left##0d; \
+ xorl left##0d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, right##0d; \
+ xorl RW0d, left##0d; \
+ rorl $1, right##0d; \
+ compress_to_64bits(right##1); \
+ compress_to_64bits(left##1); \
+ movl right##1d, RW1d; \
+ rorl $1, left##1d; \
+ xorl left##1d, RW1d; \
+ andl $0xaaaaaaaa, RW1d; \
+ xorl RW1d, right##1d; \
+ xorl RW1d, left##1d; \
+ rorl $1, right##1d; \
+ compress_to_64bits(right##2); \
+ compress_to_64bits(left##2); \
+ movl right##2d, RW2d; \
+ rorl $1, left##2d; \
+ xorl left##2d, RW2d; \
+ andl $0xaaaaaaaa, RW2d; \
+ xorl RW2d, right##2d; \
+ xorl RW2d, left##2d; \
+ rorl $1, right##2d; \
+ \
+ do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \
+ do_permutation(right##0d, left##0d, 2, 0x33333333); \
+ do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \
+ do_permutation(right##1d, left##1d, 2, 0x33333333); \
+ do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \
+ do_permutation(right##2d, left##2d, 2, 0x33333333); \
+ \
+ do_permutation(left##0d, right##0d, 16, 0x0000ffff); \
+ do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \
+ do_permutation(left##1d, right##1d, 16, 0x0000ffff); \
+ do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \
+ do_permutation(left##2d, right##2d, 16, 0x0000ffff); \
+ do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f);
+
+#define round3(n, from, to, load_next_key, do_movq) \
+ xorq from##0, RW0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ xorq s8(, RT3, 8), to##0; \
+ xorq s6(, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ xorq s4(, RT3, 8), to##0; \
+ xorq s2(, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrl $16, RW0d; \
+ xorq s7(, RT3, 8), to##0; \
+ xorq s5(, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ load_next_key(n, RW0); \
+ xorq s3(, RT3, 8), to##0; \
+ xorq s1(, RT1, 8), to##0; \
+ xorq from##1, RW1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrq $16, RW1; \
+ xorq s8(, RT3, 8), to##1; \
+ xorq s6(, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrq $16, RW1; \
+ xorq s4(, RT3, 8), to##1; \
+ xorq s2(, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrl $16, RW1d; \
+ xorq s7(, RT3, 8), to##1; \
+ xorq s5(, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ do_movq(RW0, RW1); \
+ xorq s3(, RT3, 8), to##1; \
+ xorq s1(, RT1, 8), to##1; \
+ xorq from##2, RW2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrq $16, RW2; \
+ xorq s8(, RT3, 8), to##2; \
+ xorq s6(, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrq $16, RW2; \
+ xorq s4(, RT3, 8), to##2; \
+ xorq s2(, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrl $16, RW2d; \
+ xorq s7(, RT3, 8), to##2; \
+ xorq s5(, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ do_movq(RW0, RW2); \
+ xorq s3(, RT3, 8), to##2; \
+ xorq s1(, RT1, 8), to##2;
+
+#define __movq(src, dst) \
+ movq src, dst;
+
+ENTRY(des3_ede_x86_64_crypt_blk_3way)
+ /* input:
+ * %rdi: ctx, round keys
+ * %rsi: dst (3 blocks)
+ * %rdx: src (3 blocks)
+ */
+
+ pushq %rbx;
+ pushq %r12;
+ pushq %r13;
+ pushq %r14;
+ pushq %r15;
+
+ pushq %rsi /* dst */
+
+ /* load input */
+ movl 0 * 4(%rdx), RL0d;
+ movl 1 * 4(%rdx), RR0d;
+ movl 2 * 4(%rdx), RL1d;
+ movl 3 * 4(%rdx), RR1d;
+ movl 4 * 4(%rdx), RL2d;
+ movl 5 * 4(%rdx), RR2d;
+
+ bswapl RL0d;
+ bswapl RR0d;
+ bswapl RL1d;
+ bswapl RR1d;
+ bswapl RL2d;
+ bswapl RR2d;
+
+ initial_permutation3(RL, RR);
+
+ movq 0(CTX), RW0;
+ movq RW0, RW1;
+ movq RW0, RW2;
+
+ round3(0, RR, RL, load_next_key, __movq);
+ round3(1, RL, RR, load_next_key, __movq);
+ round3(2, RR, RL, load_next_key, __movq);
+ round3(3, RL, RR, load_next_key, __movq);
+ round3(4, RR, RL, load_next_key, __movq);
+ round3(5, RL, RR, load_next_key, __movq);
+ round3(6, RR, RL, load_next_key, __movq);
+ round3(7, RL, RR, load_next_key, __movq);
+ round3(8, RR, RL, load_next_key, __movq);
+ round3(9, RL, RR, load_next_key, __movq);
+ round3(10, RR, RL, load_next_key, __movq);
+ round3(11, RL, RR, load_next_key, __movq);
+ round3(12, RR, RL, load_next_key, __movq);
+ round3(13, RL, RR, load_next_key, __movq);
+ round3(14, RR, RL, load_next_key, __movq);
+ round3(15, RL, RR, load_next_key, __movq);
+
+ round3(16+0, RL, RR, load_next_key, __movq);
+ round3(16+1, RR, RL, load_next_key, __movq);
+ round3(16+2, RL, RR, load_next_key, __movq);
+ round3(16+3, RR, RL, load_next_key, __movq);
+ round3(16+4, RL, RR, load_next_key, __movq);
+ round3(16+5, RR, RL, load_next_key, __movq);
+ round3(16+6, RL, RR, load_next_key, __movq);
+ round3(16+7, RR, RL, load_next_key, __movq);
+ round3(16+8, RL, RR, load_next_key, __movq);
+ round3(16+9, RR, RL, load_next_key, __movq);
+ round3(16+10, RL, RR, load_next_key, __movq);
+ round3(16+11, RR, RL, load_next_key, __movq);
+ round3(16+12, RL, RR, load_next_key, __movq);
+ round3(16+13, RR, RL, load_next_key, __movq);
+ round3(16+14, RL, RR, load_next_key, __movq);
+ round3(16+15, RR, RL, load_next_key, __movq);
+
+ round3(32+0, RR, RL, load_next_key, __movq);
+ round3(32+1, RL, RR, load_next_key, __movq);
+ round3(32+2, RR, RL, load_next_key, __movq);
+ round3(32+3, RL, RR, load_next_key, __movq);
+ round3(32+4, RR, RL, load_next_key, __movq);
+ round3(32+5, RL, RR, load_next_key, __movq);
+ round3(32+6, RR, RL, load_next_key, __movq);
+ round3(32+7, RL, RR, load_next_key, __movq);
+ round3(32+8, RR, RL, load_next_key, __movq);
+ round3(32+9, RL, RR, load_next_key, __movq);
+ round3(32+10, RR, RL, load_next_key, __movq);
+ round3(32+11, RL, RR, load_next_key, __movq);
+ round3(32+12, RR, RL, load_next_key, __movq);
+ round3(32+13, RL, RR, load_next_key, __movq);
+ round3(32+14, RR, RL, load_next_key, __movq);
+ round3(32+15, RL, RR, dummy2, dummy2);
+
+ final_permutation3(RR, RL);
+
+ bswapl RR0d;
+ bswapl RL0d;
+ bswapl RR1d;
+ bswapl RL1d;
+ bswapl RR2d;
+ bswapl RL2d;
+
+ popq %rsi /* dst */
+ movl RR0d, 0 * 4(%rsi);
+ movl RL0d, 1 * 4(%rsi);
+ movl RR1d, 2 * 4(%rsi);
+ movl RL1d, 3 * 4(%rsi);
+ movl RR2d, 4 * 4(%rsi);
+ movl RL2d, 5 * 4(%rsi);
+
+ popq %r15;
+ popq %r14;
+ popq %r13;
+ popq %r12;
+ popq %rbx;
+
+ ret;
+ENDPROC(des3_ede_x86_64_crypt_blk_3way)
+
+.section .rodata, "a", @progbits
+.align 16
+.L_s1:
+ .quad 0x0010100001010400, 0x0000000000000000
+ .quad 0x0000100000010000, 0x0010100001010404
+ .quad 0x0010100001010004, 0x0000100000010404
+ .quad 0x0000000000000004, 0x0000100000010000
+ .quad 0x0000000000000400, 0x0010100001010400
+ .quad 0x0010100001010404, 0x0000000000000400
+ .quad 0x0010000001000404, 0x0010100001010004
+ .quad 0x0010000001000000, 0x0000000000000004
+ .quad 0x0000000000000404, 0x0010000001000400
+ .quad 0x0010000001000400, 0x0000100000010400
+ .quad 0x0000100000010400, 0x0010100001010000
+ .quad 0x0010100001010000, 0x0010000001000404
+ .quad 0x0000100000010004, 0x0010000001000004
+ .quad 0x0010000001000004, 0x0000100000010004
+ .quad 0x0000000000000000, 0x0000000000000404
+ .quad 0x0000100000010404, 0x0010000001000000
+ .quad 0x0000100000010000, 0x0010100001010404
+ .quad 0x0000000000000004, 0x0010100001010000
+ .quad 0x0010100001010400, 0x0010000001000000
+ .quad 0x0010000001000000, 0x0000000000000400
+ .quad 0x0010100001010004, 0x0000100000010000
+ .quad 0x0000100000010400, 0x0010000001000004
+ .quad 0x0000000000000400, 0x0000000000000004
+ .quad 0x0010000001000404, 0x0000100000010404
+ .quad 0x0010100001010404, 0x0000100000010004
+ .quad 0x0010100001010000, 0x0010000001000404
+ .quad 0x0010000001000004, 0x0000000000000404
+ .quad 0x0000100000010404, 0x0010100001010400
+ .quad 0x0000000000000404, 0x0010000001000400
+ .quad 0x0010000001000400, 0x0000000000000000
+ .quad 0x0000100000010004, 0x0000100000010400
+ .quad 0x0000000000000000, 0x0010100001010004
+.L_s2:
+ .quad 0x0801080200100020, 0x0800080000000000
+ .quad 0x0000080000000000, 0x0001080200100020
+ .quad 0x0001000000100000, 0x0000000200000020
+ .quad 0x0801000200100020, 0x0800080200000020
+ .quad 0x0800000200000020, 0x0801080200100020
+ .quad 0x0801080000100000, 0x0800000000000000
+ .quad 0x0800080000000000, 0x0001000000100000
+ .quad 0x0000000200000020, 0x0801000200100020
+ .quad 0x0001080000100000, 0x0001000200100020
+ .quad 0x0800080200000020, 0x0000000000000000
+ .quad 0x0800000000000000, 0x0000080000000000
+ .quad 0x0001080200100020, 0x0801000000100000
+ .quad 0x0001000200100020, 0x0800000200000020
+ .quad 0x0000000000000000, 0x0001080000100000
+ .quad 0x0000080200000020, 0x0801080000100000
+ .quad 0x0801000000100000, 0x0000080200000020
+ .quad 0x0000000000000000, 0x0001080200100020
+ .quad 0x0801000200100020, 0x0001000000100000
+ .quad 0x0800080200000020, 0x0801000000100000
+ .quad 0x0801080000100000, 0x0000080000000000
+ .quad 0x0801000000100000, 0x0800080000000000
+ .quad 0x0000000200000020, 0x0801080200100020
+ .quad 0x0001080200100020, 0x0000000200000020
+ .quad 0x0000080000000000, 0x0800000000000000
+ .quad 0x0000080200000020, 0x0801080000100000
+ .quad 0x0001000000100000, 0x0800000200000020
+ .quad 0x0001000200100020, 0x0800080200000020
+ .quad 0x0800000200000020, 0x0001000200100020
+ .quad 0x0001080000100000, 0x0000000000000000
+ .quad 0x0800080000000000, 0x0000080200000020
+ .quad 0x0800000000000000, 0x0801000200100020
+ .quad 0x0801080200100020, 0x0001080000100000
+.L_s3:
+ .quad 0x0000002000000208, 0x0000202008020200
+ .quad 0x0000000000000000, 0x0000200008020008
+ .quad 0x0000002008000200, 0x0000000000000000
+ .quad 0x0000202000020208, 0x0000002008000200
+ .quad 0x0000200000020008, 0x0000000008000008
+ .quad 0x0000000008000008, 0x0000200000020000
+ .quad 0x0000202008020208, 0x0000200000020008
+ .quad 0x0000200008020000, 0x0000002000000208
+ .quad 0x0000000008000000, 0x0000000000000008
+ .quad 0x0000202008020200, 0x0000002000000200
+ .quad 0x0000202000020200, 0x0000200008020000
+ .quad 0x0000200008020008, 0x0000202000020208
+ .quad 0x0000002008000208, 0x0000202000020200
+ .quad 0x0000200000020000, 0x0000002008000208
+ .quad 0x0000000000000008, 0x0000202008020208
+ .quad 0x0000002000000200, 0x0000000008000000
+ .quad 0x0000202008020200, 0x0000000008000000
+ .quad 0x0000200000020008, 0x0000002000000208
+ .quad 0x0000200000020000, 0x0000202008020200
+ .quad 0x0000002008000200, 0x0000000000000000
+ .quad 0x0000002000000200, 0x0000200000020008
+ .quad 0x0000202008020208, 0x0000002008000200
+ .quad 0x0000000008000008, 0x0000002000000200
+ .quad 0x0000000000000000, 0x0000200008020008
+ .quad 0x0000002008000208, 0x0000200000020000
+ .quad 0x0000000008000000, 0x0000202008020208
+ .quad 0x0000000000000008, 0x0000202000020208
+ .quad 0x0000202000020200, 0x0000000008000008
+ .quad 0x0000200008020000, 0x0000002008000208
+ .quad 0x0000002000000208, 0x0000200008020000
+ .quad 0x0000202000020208, 0x0000000000000008
+ .quad 0x0000200008020008, 0x0000202000020200
+.L_s4:
+ .quad 0x1008020000002001, 0x1000020800002001
+ .quad 0x1000020800002001, 0x0000000800000000
+ .quad 0x0008020800002000, 0x1008000800000001
+ .quad 0x1008000000000001, 0x1000020000002001
+ .quad 0x0000000000000000, 0x0008020000002000
+ .quad 0x0008020000002000, 0x1008020800002001
+ .quad 0x1000000800000001, 0x0000000000000000
+ .quad 0x0008000800000000, 0x1008000000000001
+ .quad 0x1000000000000001, 0x0000020000002000
+ .quad 0x0008000000000000, 0x1008020000002001
+ .quad 0x0000000800000000, 0x0008000000000000
+ .quad 0x1000020000002001, 0x0000020800002000
+ .quad 0x1008000800000001, 0x1000000000000001
+ .quad 0x0000020800002000, 0x0008000800000000
+ .quad 0x0000020000002000, 0x0008020800002000
+ .quad 0x1008020800002001, 0x1000000800000001
+ .quad 0x0008000800000000, 0x1008000000000001
+ .quad 0x0008020000002000, 0x1008020800002001
+ .quad 0x1000000800000001, 0x0000000000000000
+ .quad 0x0000000000000000, 0x0008020000002000
+ .quad 0x0000020800002000, 0x0008000800000000
+ .quad 0x1008000800000001, 0x1000000000000001
+ .quad 0x1008020000002001, 0x1000020800002001
+ .quad 0x1000020800002001, 0x0000000800000000
+ .quad 0x1008020800002001, 0x1000000800000001
+ .quad 0x1000000000000001, 0x0000020000002000
+ .quad 0x1008000000000001, 0x1000020000002001
+ .quad 0x0008020800002000, 0x1008000800000001
+ .quad 0x1000020000002001, 0x0000020800002000
+ .quad 0x0008000000000000, 0x1008020000002001
+ .quad 0x0000000800000000, 0x0008000000000000
+ .quad 0x0000020000002000, 0x0008020800002000
+.L_s5:
+ .quad 0x0000001000000100, 0x0020001002080100
+ .quad 0x0020000002080000, 0x0420001002000100
+ .quad 0x0000000000080000, 0x0000001000000100
+ .quad 0x0400000000000000, 0x0020000002080000
+ .quad 0x0400001000080100, 0x0000000000080000
+ .quad 0x0020001002000100, 0x0400001000080100
+ .quad 0x0420001002000100, 0x0420000002080000
+ .quad 0x0000001000080100, 0x0400000000000000
+ .quad 0x0020000002000000, 0x0400000000080000
+ .quad 0x0400000000080000, 0x0000000000000000
+ .quad 0x0400001000000100, 0x0420001002080100
+ .quad 0x0420001002080100, 0x0020001002000100
+ .quad 0x0420000002080000, 0x0400001000000100
+ .quad 0x0000000000000000, 0x0420000002000000
+ .quad 0x0020001002080100, 0x0020000002000000
+ .quad 0x0420000002000000, 0x0000001000080100
+ .quad 0x0000000000080000, 0x0420001002000100
+ .quad 0x0000001000000100, 0x0020000002000000
+ .quad 0x0400000000000000, 0x0020000002080000
+ .quad 0x0420001002000100, 0x0400001000080100
+ .quad 0x0020001002000100, 0x0400000000000000
+ .quad 0x0420000002080000, 0x0020001002080100
+ .quad 0x0400001000080100, 0x0000001000000100
+ .quad 0x0020000002000000, 0x0420000002080000
+ .quad 0x0420001002080100, 0x0000001000080100
+ .quad 0x0420000002000000, 0x0420001002080100
+ .quad 0x0020000002080000, 0x0000000000000000
+ .quad 0x0400000000080000, 0x0420000002000000
+ .quad 0x0000001000080100, 0x0020001002000100
+ .quad 0x0400001000000100, 0x0000000000080000
+ .quad 0x0000000000000000, 0x0400000000080000
+ .quad 0x0020001002080100, 0x0400001000000100
+.L_s6:
+ .quad 0x0200000120000010, 0x0204000020000000
+ .quad 0x0000040000000000, 0x0204040120000010
+ .quad 0x0204000020000000, 0x0000000100000010
+ .quad 0x0204040120000010, 0x0004000000000000
+ .quad 0x0200040020000000, 0x0004040100000010
+ .quad 0x0004000000000000, 0x0200000120000010
+ .quad 0x0004000100000010, 0x0200040020000000
+ .quad 0x0200000020000000, 0x0000040100000010
+ .quad 0x0000000000000000, 0x0004000100000010
+ .quad 0x0200040120000010, 0x0000040000000000
+ .quad 0x0004040000000000, 0x0200040120000010
+ .quad 0x0000000100000010, 0x0204000120000010
+ .quad 0x0204000120000010, 0x0000000000000000
+ .quad 0x0004040100000010, 0x0204040020000000
+ .quad 0x0000040100000010, 0x0004040000000000
+ .quad 0x0204040020000000, 0x0200000020000000
+ .quad 0x0200040020000000, 0x0000000100000010
+ .quad 0x0204000120000010, 0x0004040000000000
+ .quad 0x0204040120000010, 0x0004000000000000
+ .quad 0x0000040100000010, 0x0200000120000010
+ .quad 0x0004000000000000, 0x0200040020000000
+ .quad 0x0200000020000000, 0x0000040100000010
+ .quad 0x0200000120000010, 0x0204040120000010
+ .quad 0x0004040000000000, 0x0204000020000000
+ .quad 0x0004040100000010, 0x0204040020000000
+ .quad 0x0000000000000000, 0x0204000120000010
+ .quad 0x0000000100000010, 0x0000040000000000
+ .quad 0x0204000020000000, 0x0004040100000010
+ .quad 0x0000040000000000, 0x0004000100000010
+ .quad 0x0200040120000010, 0x0000000000000000
+ .quad 0x0204040020000000, 0x0200000020000000
+ .quad 0x0004000100000010, 0x0200040120000010
+.L_s7:
+ .quad 0x0002000000200000, 0x2002000004200002
+ .quad 0x2000000004000802, 0x0000000000000000
+ .quad 0x0000000000000800, 0x2000000004000802
+ .quad 0x2002000000200802, 0x0002000004200800
+ .quad 0x2002000004200802, 0x0002000000200000
+ .quad 0x0000000000000000, 0x2000000004000002
+ .quad 0x2000000000000002, 0x0000000004000000
+ .quad 0x2002000004200002, 0x2000000000000802
+ .quad 0x0000000004000800, 0x2002000000200802
+ .quad 0x2002000000200002, 0x0000000004000800
+ .quad 0x2000000004000002, 0x0002000004200000
+ .quad 0x0002000004200800, 0x2002000000200002
+ .quad 0x0002000004200000, 0x0000000000000800
+ .quad 0x2000000000000802, 0x2002000004200802
+ .quad 0x0002000000200800, 0x2000000000000002
+ .quad 0x0000000004000000, 0x0002000000200800
+ .quad 0x0000000004000000, 0x0002000000200800
+ .quad 0x0002000000200000, 0x2000000004000802
+ .quad 0x2000000004000802, 0x2002000004200002
+ .quad 0x2002000004200002, 0x2000000000000002
+ .quad 0x2002000000200002, 0x0000000004000000
+ .quad 0x0000000004000800, 0x0002000000200000
+ .quad 0x0002000004200800, 0x2000000000000802
+ .quad 0x2002000000200802, 0x0002000004200800
+ .quad 0x2000000000000802, 0x2000000004000002
+ .quad 0x2002000004200802, 0x0002000004200000
+ .quad 0x0002000000200800, 0x0000000000000000
+ .quad 0x2000000000000002, 0x2002000004200802
+ .quad 0x0000000000000000, 0x2002000000200802
+ .quad 0x0002000004200000, 0x0000000000000800
+ .quad 0x2000000004000002, 0x0000000004000800
+ .quad 0x0000000000000800, 0x2002000000200002
+.L_s8:
+ .quad 0x0100010410001000, 0x0000010000001000
+ .quad 0x0000000000040000, 0x0100010410041000
+ .quad 0x0100000010000000, 0x0100010410001000
+ .quad 0x0000000400000000, 0x0100000010000000
+ .quad 0x0000000400040000, 0x0100000010040000
+ .quad 0x0100010410041000, 0x0000010000041000
+ .quad 0x0100010010041000, 0x0000010400041000
+ .quad 0x0000010000001000, 0x0000000400000000
+ .quad 0x0100000010040000, 0x0100000410000000
+ .quad 0x0100010010001000, 0x0000010400001000
+ .quad 0x0000010000041000, 0x0000000400040000
+ .quad 0x0100000410040000, 0x0100010010041000
+ .quad 0x0000010400001000, 0x0000000000000000
+ .quad 0x0000000000000000, 0x0100000410040000
+ .quad 0x0100000410000000, 0x0100010010001000
+ .quad 0x0000010400041000, 0x0000000000040000
+ .quad 0x0000010400041000, 0x0000000000040000
+ .quad 0x0100010010041000, 0x0000010000001000
+ .quad 0x0000000400000000, 0x0100000410040000
+ .quad 0x0000010000001000, 0x0000010400041000
+ .quad 0x0100010010001000, 0x0000000400000000
+ .quad 0x0100000410000000, 0x0100000010040000
+ .quad 0x0100000410040000, 0x0100000010000000
+ .quad 0x0000000000040000, 0x0100010410001000
+ .quad 0x0000000000000000, 0x0100010410041000
+ .quad 0x0000000400040000, 0x0100000410000000
+ .quad 0x0100000010040000, 0x0100010010001000
+ .quad 0x0100010410001000, 0x0000000000000000
+ .quad 0x0100010410041000, 0x0000010000041000
+ .quad 0x0000010000041000, 0x0000010400001000
+ .quad 0x0000010400001000, 0x0000000400040000
+ .quad 0x0100000010000000, 0x0100010010041000
diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
new file mode 100644
index 000000000..5c610d4ef
--- /dev/null
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -0,0 +1,506 @@
+/*
+ * Glue Code for assembler optimized version of 3DES
+ *
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/des.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+struct des3_ede_x86_ctx {
+ u32 enc_expkey[DES3_EDE_EXPKEY_WORDS];
+ u32 dec_expkey[DES3_EDE_EXPKEY_WORDS];
+};
+
+/* regular block cipher functions */
+asmlinkage void des3_ede_x86_64_crypt_blk(const u32 *expkey, u8 *dst,
+ const u8 *src);
+
+/* 3-way parallel cipher functions */
+asmlinkage void des3_ede_x86_64_crypt_blk_3way(const u32 *expkey, u8 *dst,
+ const u8 *src);
+
+static inline void des3_ede_enc_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *enc_ctx = ctx->enc_expkey;
+
+ des3_ede_x86_64_crypt_blk(enc_ctx, dst, src);
+}
+
+static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *dec_ctx = ctx->dec_expkey;
+
+ des3_ede_x86_64_crypt_blk(dec_ctx, dst, src);
+}
+
+static inline void des3_ede_enc_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *enc_ctx = ctx->enc_expkey;
+
+ des3_ede_x86_64_crypt_blk_3way(enc_ctx, dst, src);
+}
+
+static inline void des3_ede_dec_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *dec_ctx = ctx->dec_expkey;
+
+ des3_ede_x86_64_crypt_blk_3way(dec_ctx, dst, src);
+}
+
+static void des3_ede_x86_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ des3_ede_enc_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static void des3_ede_x86_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ des3_ede_dec_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static int ecb_crypt(struct skcipher_request *req, const u32 *expkey)
+{
+ const unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ u8 *wsrc = walk.src.virt.addr;
+ u8 *wdst = walk.dst.virt.addr;
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 3) {
+ do {
+ des3_ede_x86_64_crypt_blk_3way(expkey, wdst,
+ wsrc);
+
+ wsrc += bsize * 3;
+ wdst += bsize * 3;
+ nbytes -= bsize * 3;
+ } while (nbytes >= bsize * 3);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ des3_ede_x86_64_crypt_blk(expkey, wdst, wsrc);
+
+ wsrc += bsize;
+ wdst += bsize;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+done:
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_crypt(req, ctx->enc_expkey);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_crypt(req, ctx->dec_expkey);
+}
+
+static unsigned int __cbc_encrypt(struct des3_ede_x86_ctx *ctx,
+ struct skcipher_walk *walk)
+{
+ unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+ u64 *iv = (u64 *)walk->iv;
+
+ do {
+ *dst = *src ^ *iv;
+ des3_ede_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
+ iv = dst;
+
+ src += 1;
+ dst += 1;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+ *(u64 *)walk->iv = *iv;
+ return nbytes;
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ nbytes = __cbc_encrypt(ctx, &walk);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static unsigned int __cbc_decrypt(struct des3_ede_x86_ctx *ctx,
+ struct skcipher_walk *walk)
+{
+ unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+ u64 ivs[3 - 1];
+ u64 last_iv;
+
+ /* Start of the last block. */
+ src += nbytes / bsize - 1;
+ dst += nbytes / bsize - 1;
+
+ last_iv = *src;
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 3) {
+ do {
+ nbytes -= bsize * 3 - bsize;
+ src -= 3 - 1;
+ dst -= 3 - 1;
+
+ ivs[0] = src[0];
+ ivs[1] = src[1];
+
+ des3_ede_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
+
+ dst[1] ^= ivs[0];
+ dst[2] ^= ivs[1];
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ goto done;
+
+ *dst ^= *(src - 1);
+ src -= 1;
+ dst -= 1;
+ } while (nbytes >= bsize * 3);
+ }
+
+ /* Handle leftovers */
+ for (;;) {
+ des3_ede_dec_blk(ctx, (u8 *)dst, (u8 *)src);
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ break;
+
+ *dst ^= *(src - 1);
+ src -= 1;
+ dst -= 1;
+ }
+
+done:
+ *dst ^= *(u64 *)walk->iv;
+ *(u64 *)walk->iv = last_iv;
+
+ return nbytes;
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ nbytes = __cbc_decrypt(ctx, &walk);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx,
+ struct skcipher_walk *walk)
+{
+ u8 *ctrblk = walk->iv;
+ u8 keystream[DES3_EDE_BLOCK_SIZE];
+ u8 *src = walk->src.virt.addr;
+ u8 *dst = walk->dst.virt.addr;
+ unsigned int nbytes = walk->nbytes;
+
+ des3_ede_enc_blk(ctx, keystream, ctrblk);
+ crypto_xor_cpy(dst, keystream, src, nbytes);
+
+ crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE);
+}
+
+static unsigned int __ctr_crypt(struct des3_ede_x86_ctx *ctx,
+ struct skcipher_walk *walk)
+{
+ unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ __be64 *src = (__be64 *)walk->src.virt.addr;
+ __be64 *dst = (__be64 *)walk->dst.virt.addr;
+ u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
+ __be64 ctrblocks[3];
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 3) {
+ do {
+ /* create ctrblks for parallel encrypt */
+ ctrblocks[0] = cpu_to_be64(ctrblk++);
+ ctrblocks[1] = cpu_to_be64(ctrblk++);
+ ctrblocks[2] = cpu_to_be64(ctrblk++);
+
+ des3_ede_enc_blk_3way(ctx, (u8 *)ctrblocks,
+ (u8 *)ctrblocks);
+
+ dst[0] = src[0] ^ ctrblocks[0];
+ dst[1] = src[1] ^ ctrblocks[1];
+ dst[2] = src[2] ^ ctrblocks[2];
+
+ src += 3;
+ dst += 3;
+ } while ((nbytes -= bsize * 3) >= bsize * 3);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ ctrblocks[0] = cpu_to_be64(ctrblk++);
+
+ des3_ede_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
+
+ dst[0] = src[0] ^ ctrblocks[0];
+
+ src += 1;
+ dst += 1;
+ } while ((nbytes -= bsize) >= bsize);
+
+done:
+ *(__be64 *)walk->iv = cpu_to_be64(ctrblk);
+ return nbytes;
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) >= DES3_EDE_BLOCK_SIZE) {
+ nbytes = __ctr_crypt(ctx, &walk);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ if (nbytes) {
+ ctr_crypt_final(ctx, &walk);
+ err = skcipher_walk_done(&walk, 0);
+ }
+
+ return err;
+}
+
+static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_tfm_ctx(tfm);
+ u32 i, j, tmp;
+ int err;
+
+ /* Generate encryption context using generic implementation. */
+ err = __des3_ede_setkey(ctx->enc_expkey, &tfm->crt_flags, key, keylen);
+ if (err < 0)
+ return err;
+
+ /* Fix encryption context for this implementation and form decryption
+ * context. */
+ j = DES3_EDE_EXPKEY_WORDS - 2;
+ for (i = 0; i < DES3_EDE_EXPKEY_WORDS; i += 2, j -= 2) {
+ tmp = ror32(ctx->enc_expkey[i + 1], 4);
+ ctx->enc_expkey[i + 1] = tmp;
+
+ ctx->dec_expkey[j + 0] = ctx->enc_expkey[i + 0];
+ ctx->dec_expkey[j + 1] = tmp;
+ }
+
+ return 0;
+}
+
+static int des3_ede_x86_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key,
+ unsigned int keylen)
+{
+ return des3_ede_x86_setkey(&tfm->base, key, keylen);
+}
+
+static struct crypto_alg des3_ede_cipher = {
+ .cra_name = "des3_ede",
+ .cra_driver_name = "des3_ede-asm",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = DES3_EDE_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .cra_alignmask = 0,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = DES3_EDE_KEY_SIZE,
+ .cia_max_keysize = DES3_EDE_KEY_SIZE,
+ .cia_setkey = des3_ede_x86_setkey,
+ .cia_encrypt = des3_ede_x86_encrypt,
+ .cia_decrypt = des3_ede_x86_decrypt,
+ }
+ }
+};
+
+static struct skcipher_alg des3_ede_skciphers[] = {
+ {
+ .base.cra_name = "ecb(des3_ede)",
+ .base.cra_driver_name = "ecb-des3_ede-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = DES3_EDE_KEY_SIZE,
+ .max_keysize = DES3_EDE_KEY_SIZE,
+ .setkey = des3_ede_x86_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "cbc(des3_ede)",
+ .base.cra_driver_name = "cbc-des3_ede-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = DES3_EDE_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = DES3_EDE_KEY_SIZE,
+ .max_keysize = DES3_EDE_KEY_SIZE,
+ .ivsize = DES3_EDE_BLOCK_SIZE,
+ .setkey = des3_ede_x86_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "ctr(des3_ede)",
+ .base.cra_driver_name = "ctr-des3_ede-asm",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = DES3_EDE_KEY_SIZE,
+ .max_keysize = DES3_EDE_KEY_SIZE,
+ .ivsize = DES3_EDE_BLOCK_SIZE,
+ .chunksize = DES3_EDE_BLOCK_SIZE,
+ .setkey = des3_ede_x86_setkey_skcipher,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }
+};
+
+static bool is_blacklisted_cpu(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ if (boot_cpu_data.x86 == 0x0f) {
+ /*
+ * On Pentium 4, des3_ede-x86_64 is slower than generic C
+ * implementation because use of 64bit rotates (which are really
+ * slow on P4). Therefore blacklist P4s.
+ */
+ return true;
+ }
+
+ return false;
+}
+
+static int force;
+module_param(force, int, 0);
+MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
+
+static int __init des3_ede_x86_init(void)
+{
+ int err;
+
+ if (!force && is_blacklisted_cpu()) {
+ pr_info("des3_ede-x86_64: performance on this CPU would be suboptimal: disabling des3_ede-x86_64.\n");
+ return -ENODEV;
+ }
+
+ err = crypto_register_alg(&des3_ede_cipher);
+ if (err)
+ return err;
+
+ err = crypto_register_skciphers(des3_ede_skciphers,
+ ARRAY_SIZE(des3_ede_skciphers));
+ if (err)
+ crypto_unregister_alg(&des3_ede_cipher);
+
+ return err;
+}
+
+static void __exit des3_ede_x86_fini(void)
+{
+ crypto_unregister_alg(&des3_ede_cipher);
+ crypto_unregister_skciphers(des3_ede_skciphers,
+ ARRAY_SIZE(des3_ede_skciphers));
+}
+
+module_init(des3_ede_x86_init);
+module_exit(des3_ede_x86_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Triple DES EDE Cipher Algorithm, asm optimized");
+MODULE_ALIAS_CRYPTO("des3_ede");
+MODULE_ALIAS_CRYPTO("des3_ede-asm");
+MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>");
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
new file mode 100644
index 000000000..406680476
--- /dev/null
+++ b/arch/x86/crypto/fpu.c
@@ -0,0 +1,207 @@
+/*
+ * FPU: Wrapper for blkcipher touching fpu
+ *
+ * Copyright (c) Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/skcipher.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/fpu/api.h>
+
+struct crypto_fpu_ctx {
+ struct crypto_skcipher *child;
+};
+
+static int crypto_fpu_setkey(struct crypto_skcipher *parent, const u8 *key,
+ unsigned int keylen)
+{
+ struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(parent);
+ struct crypto_skcipher *child = ctx->child;
+ int err;
+
+ crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+ crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(parent) &
+ CRYPTO_TFM_REQ_MASK);
+ err = crypto_skcipher_setkey(child, key, keylen);
+ crypto_skcipher_set_flags(parent, crypto_skcipher_get_flags(child) &
+ CRYPTO_TFM_RES_MASK);
+ return err;
+}
+
+static int crypto_fpu_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct crypto_skcipher *child = ctx->child;
+ SKCIPHER_REQUEST_ON_STACK(subreq, child);
+ int err;
+
+ skcipher_request_set_tfm(subreq, child);
+ skcipher_request_set_callback(subreq, 0, NULL, NULL);
+ skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen,
+ req->iv);
+
+ kernel_fpu_begin();
+ err = crypto_skcipher_encrypt(subreq);
+ kernel_fpu_end();
+
+ skcipher_request_zero(subreq);
+ return err;
+}
+
+static int crypto_fpu_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct crypto_skcipher *child = ctx->child;
+ SKCIPHER_REQUEST_ON_STACK(subreq, child);
+ int err;
+
+ skcipher_request_set_tfm(subreq, child);
+ skcipher_request_set_callback(subreq, 0, NULL, NULL);
+ skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen,
+ req->iv);
+
+ kernel_fpu_begin();
+ err = crypto_skcipher_decrypt(subreq);
+ kernel_fpu_end();
+
+ skcipher_request_zero(subreq);
+ return err;
+}
+
+static int crypto_fpu_init_tfm(struct crypto_skcipher *tfm)
+{
+ struct skcipher_instance *inst = skcipher_alg_instance(tfm);
+ struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct crypto_skcipher_spawn *spawn;
+ struct crypto_skcipher *cipher;
+
+ spawn = skcipher_instance_ctx(inst);
+ cipher = crypto_spawn_skcipher(spawn);
+ if (IS_ERR(cipher))
+ return PTR_ERR(cipher);
+
+ ctx->child = cipher;
+
+ return 0;
+}
+
+static void crypto_fpu_exit_tfm(struct crypto_skcipher *tfm)
+{
+ struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ crypto_free_skcipher(ctx->child);
+}
+
+static void crypto_fpu_free(struct skcipher_instance *inst)
+{
+ crypto_drop_skcipher(skcipher_instance_ctx(inst));
+ kfree(inst);
+}
+
+static int crypto_fpu_create(struct crypto_template *tmpl, struct rtattr **tb)
+{
+ struct crypto_skcipher_spawn *spawn;
+ struct skcipher_instance *inst;
+ struct crypto_attr_type *algt;
+ struct skcipher_alg *alg;
+ const char *cipher_name;
+ int err;
+
+ algt = crypto_get_attr_type(tb);
+ if (IS_ERR(algt))
+ return PTR_ERR(algt);
+
+ if ((algt->type ^ (CRYPTO_ALG_INTERNAL | CRYPTO_ALG_TYPE_SKCIPHER)) &
+ algt->mask)
+ return -EINVAL;
+
+ if (!(algt->mask & CRYPTO_ALG_INTERNAL))
+ return -EINVAL;
+
+ cipher_name = crypto_attr_alg_name(tb[1]);
+ if (IS_ERR(cipher_name))
+ return PTR_ERR(cipher_name);
+
+ inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
+ if (!inst)
+ return -ENOMEM;
+
+ spawn = skcipher_instance_ctx(inst);
+
+ crypto_set_skcipher_spawn(spawn, skcipher_crypto_instance(inst));
+ err = crypto_grab_skcipher(spawn, cipher_name, CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL | CRYPTO_ALG_ASYNC);
+ if (err)
+ goto out_free_inst;
+
+ alg = crypto_skcipher_spawn_alg(spawn);
+
+ err = crypto_inst_setname(skcipher_crypto_instance(inst), "fpu",
+ &alg->base);
+ if (err)
+ goto out_drop_skcipher;
+
+ inst->alg.base.cra_flags = CRYPTO_ALG_INTERNAL;
+ inst->alg.base.cra_priority = alg->base.cra_priority;
+ inst->alg.base.cra_blocksize = alg->base.cra_blocksize;
+ inst->alg.base.cra_alignmask = alg->base.cra_alignmask;
+
+ inst->alg.ivsize = crypto_skcipher_alg_ivsize(alg);
+ inst->alg.min_keysize = crypto_skcipher_alg_min_keysize(alg);
+ inst->alg.max_keysize = crypto_skcipher_alg_max_keysize(alg);
+
+ inst->alg.base.cra_ctxsize = sizeof(struct crypto_fpu_ctx);
+
+ inst->alg.init = crypto_fpu_init_tfm;
+ inst->alg.exit = crypto_fpu_exit_tfm;
+
+ inst->alg.setkey = crypto_fpu_setkey;
+ inst->alg.encrypt = crypto_fpu_encrypt;
+ inst->alg.decrypt = crypto_fpu_decrypt;
+
+ inst->free = crypto_fpu_free;
+
+ err = skcipher_register_instance(tmpl, inst);
+ if (err)
+ goto out_drop_skcipher;
+
+out:
+ return err;
+
+out_drop_skcipher:
+ crypto_drop_skcipher(spawn);
+out_free_inst:
+ kfree(inst);
+ goto out;
+}
+
+static struct crypto_template crypto_fpu_tmpl = {
+ .name = "fpu",
+ .create = crypto_fpu_create,
+ .module = THIS_MODULE,
+};
+
+int __init crypto_fpu_init(void)
+{
+ return crypto_register_template(&crypto_fpu_tmpl);
+}
+
+void crypto_fpu_exit(void)
+{
+ crypto_unregister_template(&crypto_fpu_tmpl);
+}
+
+MODULE_ALIAS_CRYPTO("fpu");
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
new file mode 100644
index 000000000..f94375a8d
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -0,0 +1,136 @@
+/*
+ * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
+ * instructions. This file contains accelerated part of ghash
+ * implementation. More information about PCLMULQDQ can be found at:
+ *
+ * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
+ *
+ * Copyright (c) 2009 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ * Vinodh Gopal
+ * Erdinc Ozturk
+ * Deniz Karakoyunlu
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+#include <asm/frame.h>
+
+.section .rodata.cst16.bswap_mask, "aM", @progbits, 16
+.align 16
+.Lbswap_mask:
+ .octa 0x000102030405060708090a0b0c0d0e0f
+
+#define DATA %xmm0
+#define SHASH %xmm1
+#define T1 %xmm2
+#define T2 %xmm3
+#define T3 %xmm4
+#define BSWAP %xmm5
+#define IN1 %xmm6
+
+.text
+
+/*
+ * __clmul_gf128mul_ble: internal ABI
+ * input:
+ * DATA: operand1
+ * SHASH: operand2, hash_key << 1 mod poly
+ * output:
+ * DATA: operand1 * operand2 mod poly
+ * changed:
+ * T1
+ * T2
+ * T3
+ */
+__clmul_gf128mul_ble:
+ movaps DATA, T1
+ pshufd $0b01001110, DATA, T2
+ pshufd $0b01001110, SHASH, T3
+ pxor DATA, T2
+ pxor SHASH, T3
+
+ PCLMULQDQ 0x00 SHASH DATA # DATA = a0 * b0
+ PCLMULQDQ 0x11 SHASH T1 # T1 = a1 * b1
+ PCLMULQDQ 0x00 T3 T2 # T2 = (a1 + a0) * (b1 + b0)
+ pxor DATA, T2
+ pxor T1, T2 # T2 = a0 * b1 + a1 * b0
+
+ movaps T2, T3
+ pslldq $8, T3
+ psrldq $8, T2
+ pxor T3, DATA
+ pxor T2, T1 # <T1:DATA> is result of
+ # carry-less multiplication
+
+ # first phase of the reduction
+ movaps DATA, T3
+ psllq $1, T3
+ pxor DATA, T3
+ psllq $5, T3
+ pxor DATA, T3
+ psllq $57, T3
+ movaps T3, T2
+ pslldq $8, T2
+ psrldq $8, T3
+ pxor T2, DATA
+ pxor T3, T1
+
+ # second phase of the reduction
+ movaps DATA, T2
+ psrlq $5, T2
+ pxor DATA, T2
+ psrlq $1, T2
+ pxor DATA, T2
+ psrlq $1, T2
+ pxor T2, T1
+ pxor T1, DATA
+ ret
+ENDPROC(__clmul_gf128mul_ble)
+
+/* void clmul_ghash_mul(char *dst, const u128 *shash) */
+ENTRY(clmul_ghash_mul)
+ FRAME_BEGIN
+ movups (%rdi), DATA
+ movups (%rsi), SHASH
+ movaps .Lbswap_mask, BSWAP
+ PSHUFB_XMM BSWAP DATA
+ call __clmul_gf128mul_ble
+ PSHUFB_XMM BSWAP DATA
+ movups DATA, (%rdi)
+ FRAME_END
+ ret
+ENDPROC(clmul_ghash_mul)
+
+/*
+ * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
+ * const u128 *shash);
+ */
+ENTRY(clmul_ghash_update)
+ FRAME_BEGIN
+ cmp $16, %rdx
+ jb .Lupdate_just_ret # check length
+ movaps .Lbswap_mask, BSWAP
+ movups (%rdi), DATA
+ movups (%rcx), SHASH
+ PSHUFB_XMM BSWAP DATA
+.align 4
+.Lupdate_loop:
+ movups (%rsi), IN1
+ PSHUFB_XMM BSWAP IN1
+ pxor IN1, DATA
+ call __clmul_gf128mul_ble
+ sub $16, %rdx
+ add $16, %rsi
+ cmp $16, %rdx
+ jge .Lupdate_loop
+ PSHUFB_XMM BSWAP DATA
+ movups DATA, (%rdi)
+.Lupdate_just_ret:
+ FRAME_END
+ ret
+ENDPROC(clmul_ghash_update)
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
new file mode 100644
index 000000000..3582ae885
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -0,0 +1,366 @@
+/*
+ * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
+ * instructions. This file contains glue code.
+ *
+ * Copyright (c) 2009 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/crypto.h>
+#include <crypto/algapi.h>
+#include <crypto/cryptd.h>
+#include <crypto/gf128mul.h>
+#include <crypto/internal/hash.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+#define GHASH_BLOCK_SIZE 16
+#define GHASH_DIGEST_SIZE 16
+
+void clmul_ghash_mul(char *dst, const u128 *shash);
+
+void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
+ const u128 *shash);
+
+struct ghash_async_ctx {
+ struct cryptd_ahash *cryptd_tfm;
+};
+
+struct ghash_ctx {
+ u128 shash;
+};
+
+struct ghash_desc_ctx {
+ u8 buffer[GHASH_BLOCK_SIZE];
+ u32 bytes;
+};
+
+static int ghash_init(struct shash_desc *desc)
+{
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ memset(dctx, 0, sizeof(*dctx));
+
+ return 0;
+}
+
+static int ghash_setkey(struct crypto_shash *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ struct ghash_ctx *ctx = crypto_shash_ctx(tfm);
+ be128 *x = (be128 *)key;
+ u64 a, b;
+
+ if (keylen != GHASH_BLOCK_SIZE) {
+ crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+
+ /* perform multiplication by 'x' in GF(2^128) */
+ a = be64_to_cpu(x->a);
+ b = be64_to_cpu(x->b);
+
+ ctx->shash.a = (b << 1) | (a >> 63);
+ ctx->shash.b = (a << 1) | (b >> 63);
+
+ if (a >> 63)
+ ctx->shash.b ^= ((u64)0xc2) << 56;
+
+ return 0;
+}
+
+static int ghash_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+ struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+ u8 *dst = dctx->buffer;
+
+ kernel_fpu_begin();
+ if (dctx->bytes) {
+ int n = min(srclen, dctx->bytes);
+ u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
+
+ dctx->bytes -= n;
+ srclen -= n;
+
+ while (n--)
+ *pos++ ^= *src++;
+
+ if (!dctx->bytes)
+ clmul_ghash_mul(dst, &ctx->shash);
+ }
+
+ clmul_ghash_update(dst, src, srclen, &ctx->shash);
+ kernel_fpu_end();
+
+ if (srclen & 0xf) {
+ src += srclen - (srclen & 0xf);
+ srclen &= 0xf;
+ dctx->bytes = GHASH_BLOCK_SIZE - srclen;
+ while (srclen--)
+ *dst++ ^= *src++;
+ }
+
+ return 0;
+}
+
+static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx)
+{
+ u8 *dst = dctx->buffer;
+
+ if (dctx->bytes) {
+ u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
+
+ while (dctx->bytes--)
+ *tmp++ ^= 0;
+
+ kernel_fpu_begin();
+ clmul_ghash_mul(dst, &ctx->shash);
+ kernel_fpu_end();
+ }
+
+ dctx->bytes = 0;
+}
+
+static int ghash_final(struct shash_desc *desc, u8 *dst)
+{
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+ struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
+ u8 *buf = dctx->buffer;
+
+ ghash_flush(ctx, dctx);
+ memcpy(dst, buf, GHASH_BLOCK_SIZE);
+
+ return 0;
+}
+
+static struct shash_alg ghash_alg = {
+ .digestsize = GHASH_DIGEST_SIZE,
+ .init = ghash_init,
+ .update = ghash_update,
+ .final = ghash_final,
+ .setkey = ghash_setkey,
+ .descsize = sizeof(struct ghash_desc_ctx),
+ .base = {
+ .cra_name = "__ghash",
+ .cra_driver_name = "__ghash-pclmulqdqni",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = GHASH_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct ghash_ctx),
+ .cra_module = THIS_MODULE,
+ },
+};
+
+static int ghash_async_init(struct ahash_request *req)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
+
+ desc->tfm = child;
+ desc->flags = req->base.flags;
+ return crypto_shash_init(desc);
+}
+
+static int ghash_async_update(struct ahash_request *req)
+{
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+ if (!irq_fpu_usable() ||
+ (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
+ memcpy(cryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+ return crypto_ahash_update(cryptd_req);
+ } else {
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ return shash_ahash_update(req, desc);
+ }
+}
+
+static int ghash_async_final(struct ahash_request *req)
+{
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+ if (!irq_fpu_usable() ||
+ (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
+ memcpy(cryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+ return crypto_ahash_final(cryptd_req);
+ } else {
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ return crypto_shash_final(desc, req->result);
+ }
+}
+
+static int ghash_async_import(struct ahash_request *req, const void *in)
+{
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ ghash_async_init(req);
+ memcpy(dctx, in, sizeof(*dctx));
+ return 0;
+
+}
+
+static int ghash_async_export(struct ahash_request *req, void *out)
+{
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ memcpy(out, dctx, sizeof(*dctx));
+ return 0;
+
+}
+
+static int ghash_async_digest(struct ahash_request *req)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
+
+ if (!irq_fpu_usable() ||
+ (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
+ memcpy(cryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
+ return crypto_ahash_digest(cryptd_req);
+ } else {
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
+
+ desc->tfm = child;
+ desc->flags = req->base.flags;
+ return shash_ahash_digest(req, desc);
+ }
+}
+
+static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct crypto_ahash *child = &ctx->cryptd_tfm->base;
+ int err;
+
+ crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+ crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm)
+ & CRYPTO_TFM_REQ_MASK);
+ err = crypto_ahash_setkey(child, key, keylen);
+ crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child)
+ & CRYPTO_TFM_RES_MASK);
+
+ return err;
+}
+
+static int ghash_async_init_tfm(struct crypto_tfm *tfm)
+{
+ struct cryptd_ahash *cryptd_tfm;
+ struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni",
+ CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+ ctx->cryptd_tfm = cryptd_tfm;
+ crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+ sizeof(struct ahash_request) +
+ crypto_ahash_reqsize(&cryptd_tfm->base));
+
+ return 0;
+}
+
+static void ghash_async_exit_tfm(struct crypto_tfm *tfm)
+{
+ struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ cryptd_free_ahash(ctx->cryptd_tfm);
+}
+
+static struct ahash_alg ghash_async_alg = {
+ .init = ghash_async_init,
+ .update = ghash_async_update,
+ .final = ghash_async_final,
+ .setkey = ghash_async_setkey,
+ .digest = ghash_async_digest,
+ .export = ghash_async_export,
+ .import = ghash_async_import,
+ .halg = {
+ .digestsize = GHASH_DIGEST_SIZE,
+ .statesize = sizeof(struct ghash_desc_ctx),
+ .base = {
+ .cra_name = "ghash",
+ .cra_driver_name = "ghash-clmulni",
+ .cra_priority = 400,
+ .cra_ctxsize = sizeof(struct ghash_async_ctx),
+ .cra_flags = CRYPTO_ALG_ASYNC,
+ .cra_blocksize = GHASH_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ .cra_init = ghash_async_init_tfm,
+ .cra_exit = ghash_async_exit_tfm,
+ },
+ },
+};
+
+static const struct x86_cpu_id pcmul_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), /* Pickle-Mickle-Duck */
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
+
+static int __init ghash_pclmulqdqni_mod_init(void)
+{
+ int err;
+
+ if (!x86_match_cpu(pcmul_cpu_id))
+ return -ENODEV;
+
+ err = crypto_register_shash(&ghash_alg);
+ if (err)
+ goto err_out;
+ err = crypto_register_ahash(&ghash_async_alg);
+ if (err)
+ goto err_shash;
+
+ return 0;
+
+err_shash:
+ crypto_unregister_shash(&ghash_alg);
+err_out:
+ return err;
+}
+
+static void __exit ghash_pclmulqdqni_mod_exit(void)
+{
+ crypto_unregister_ahash(&ghash_async_alg);
+ crypto_unregister_shash(&ghash_alg);
+}
+
+module_init(ghash_pclmulqdqni_mod_init);
+module_exit(ghash_pclmulqdqni_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("GHASH Message Digest Algorithm, "
+ "accelerated by PCLMULQDQ-NI");
+MODULE_ALIAS_CRYPTO("ghash");
diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S
new file mode 100644
index 000000000..02ee2308f
--- /dev/null
+++ b/arch/x86/crypto/glue_helper-asm-avx.S
@@ -0,0 +1,150 @@
+/*
+ * Shared glue code for 128bit block ciphers, AVX assembler macros
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vmovdqu (0*16)(src), x0; \
+ vmovdqu (1*16)(src), x1; \
+ vmovdqu (2*16)(src), x2; \
+ vmovdqu (3*16)(src), x3; \
+ vmovdqu (4*16)(src), x4; \
+ vmovdqu (5*16)(src), x5; \
+ vmovdqu (6*16)(src), x6; \
+ vmovdqu (7*16)(src), x7;
+
+#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vmovdqu x0, (0*16)(dst); \
+ vmovdqu x1, (1*16)(dst); \
+ vmovdqu x2, (2*16)(dst); \
+ vmovdqu x3, (3*16)(dst); \
+ vmovdqu x4, (4*16)(dst); \
+ vmovdqu x5, (5*16)(dst); \
+ vmovdqu x6, (6*16)(dst); \
+ vmovdqu x7, (7*16)(dst);
+
+#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vpxor (0*16)(src), x1, x1; \
+ vpxor (1*16)(src), x2, x2; \
+ vpxor (2*16)(src), x3, x3; \
+ vpxor (3*16)(src), x4, x4; \
+ vpxor (4*16)(src), x5, x5; \
+ vpxor (5*16)(src), x6, x6; \
+ vpxor (6*16)(src), x7, x7; \
+ store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \
+ vpcmpeqd t0, t0, t0; \
+ vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \
+ vmovdqa bswap, t1; \
+ \
+ /* load IV and byteswap */ \
+ vmovdqu (iv), x7; \
+ vpshufb t1, x7, x0; \
+ \
+ /* construct IVs */ \
+ inc_le128(x7, t0, t2); \
+ vpshufb t1, x7, x1; \
+ inc_le128(x7, t0, t2); \
+ vpshufb t1, x7, x2; \
+ inc_le128(x7, t0, t2); \
+ vpshufb t1, x7, x3; \
+ inc_le128(x7, t0, t2); \
+ vpshufb t1, x7, x4; \
+ inc_le128(x7, t0, t2); \
+ vpshufb t1, x7, x5; \
+ inc_le128(x7, t0, t2); \
+ vpshufb t1, x7, x6; \
+ inc_le128(x7, t0, t2); \
+ vmovdqa x7, t2; \
+ vpshufb t1, x7, x7; \
+ inc_le128(t2, t0, t1); \
+ vmovdqu t2, (iv);
+
+#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vpxor (0*16)(src), x0, x0; \
+ vpxor (1*16)(src), x1, x1; \
+ vpxor (2*16)(src), x2, x2; \
+ vpxor (3*16)(src), x3, x3; \
+ vpxor (4*16)(src), x4, x4; \
+ vpxor (5*16)(src), x5, x5; \
+ vpxor (6*16)(src), x6, x6; \
+ vpxor (7*16)(src), x7, x7; \
+ store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+ vpsrad $31, iv, tmp; \
+ vpaddq iv, iv, iv; \
+ vpshufd $0x13, tmp, tmp; \
+ vpand mask, tmp, tmp; \
+ vpxor tmp, iv, iv;
+
+#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \
+ t1, xts_gf128mul_and_shl1_mask) \
+ vmovdqa xts_gf128mul_and_shl1_mask, t0; \
+ \
+ /* load IV */ \
+ vmovdqu (iv), tiv; \
+ vpxor (0*16)(src), tiv, x0; \
+ vmovdqu tiv, (0*16)(dst); \
+ \
+ /* construct and store IVs, also xor with source */ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (1*16)(src), tiv, x1; \
+ vmovdqu tiv, (1*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (2*16)(src), tiv, x2; \
+ vmovdqu tiv, (2*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (3*16)(src), tiv, x3; \
+ vmovdqu tiv, (3*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (4*16)(src), tiv, x4; \
+ vmovdqu tiv, (4*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (5*16)(src), tiv, x5; \
+ vmovdqu tiv, (5*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (6*16)(src), tiv, x6; \
+ vmovdqu tiv, (6*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vpxor (7*16)(src), tiv, x7; \
+ vmovdqu tiv, (7*16)(dst); \
+ \
+ gf128mul_x_ble(tiv, t0, t1); \
+ vmovdqu tiv, (iv);
+
+#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vpxor (0*16)(dst), x0, x0; \
+ vpxor (1*16)(dst), x1, x1; \
+ vpxor (2*16)(dst), x2, x2; \
+ vpxor (3*16)(dst), x3, x3; \
+ vpxor (4*16)(dst), x4, x4; \
+ vpxor (5*16)(dst), x5, x5; \
+ vpxor (6*16)(dst), x6, x6; \
+ vpxor (7*16)(dst), x7, x7; \
+ store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
diff --git a/arch/x86/crypto/glue_helper-asm-avx2.S b/arch/x86/crypto/glue_helper-asm-avx2.S
new file mode 100644
index 000000000..a53ac11dd
--- /dev/null
+++ b/arch/x86/crypto/glue_helper-asm-avx2.S
@@ -0,0 +1,180 @@
+/*
+ * Shared glue code for 128bit block ciphers, AVX2 assembler macros
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vmovdqu (0*32)(src), x0; \
+ vmovdqu (1*32)(src), x1; \
+ vmovdqu (2*32)(src), x2; \
+ vmovdqu (3*32)(src), x3; \
+ vmovdqu (4*32)(src), x4; \
+ vmovdqu (5*32)(src), x5; \
+ vmovdqu (6*32)(src), x6; \
+ vmovdqu (7*32)(src), x7;
+
+#define store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vmovdqu x0, (0*32)(dst); \
+ vmovdqu x1, (1*32)(dst); \
+ vmovdqu x2, (2*32)(dst); \
+ vmovdqu x3, (3*32)(dst); \
+ vmovdqu x4, (4*32)(dst); \
+ vmovdqu x5, (5*32)(dst); \
+ vmovdqu x6, (6*32)(dst); \
+ vmovdqu x7, (7*32)(dst);
+
+#define store_cbc_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7, t0) \
+ vpxor t0, t0, t0; \
+ vinserti128 $1, (src), t0, t0; \
+ vpxor t0, x0, x0; \
+ vpxor (0*32+16)(src), x1, x1; \
+ vpxor (1*32+16)(src), x2, x2; \
+ vpxor (2*32+16)(src), x3, x3; \
+ vpxor (3*32+16)(src), x4, x4; \
+ vpxor (4*32+16)(src), x5, x5; \
+ vpxor (5*32+16)(src), x6, x6; \
+ vpxor (6*32+16)(src), x7, x7; \
+ store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
+ vpcmpeqq minus_one, x, tmp1; \
+ vpcmpeqq minus_two, x, tmp2; \
+ vpsubq minus_two, x, x; \
+ vpor tmp2, tmp1, tmp1; \
+ vpslldq $8, tmp1, tmp1; \
+ vpsubq tmp1, x, x;
+
+#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \
+ t1x, t2, t2x, t3, t3x, t4, t5) \
+ vpcmpeqd t0, t0, t0; \
+ vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \
+ vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */\
+ \
+ /* load IV and byteswap */ \
+ vmovdqu (iv), t2x; \
+ vmovdqa t2x, t3x; \
+ inc_le128(t2x, t0x, t1x); \
+ vbroadcasti128 bswap, t1; \
+ vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \
+ vpshufb t1, t2, x0; \
+ \
+ /* construct IVs */ \
+ add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \
+ vpshufb t1, t2, x1; \
+ add2_le128(t2, t0, t4, t3, t5); \
+ vpshufb t1, t2, x2; \
+ add2_le128(t2, t0, t4, t3, t5); \
+ vpshufb t1, t2, x3; \
+ add2_le128(t2, t0, t4, t3, t5); \
+ vpshufb t1, t2, x4; \
+ add2_le128(t2, t0, t4, t3, t5); \
+ vpshufb t1, t2, x5; \
+ add2_le128(t2, t0, t4, t3, t5); \
+ vpshufb t1, t2, x6; \
+ add2_le128(t2, t0, t4, t3, t5); \
+ vpshufb t1, t2, x7; \
+ vextracti128 $1, t2, t2x; \
+ inc_le128(t2x, t0x, t3x); \
+ vmovdqu t2x, (iv);
+
+#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vpxor (0*32)(src), x0, x0; \
+ vpxor (1*32)(src), x1, x1; \
+ vpxor (2*32)(src), x2, x2; \
+ vpxor (3*32)(src), x3, x3; \
+ vpxor (4*32)(src), x4, x4; \
+ vpxor (5*32)(src), x5, x5; \
+ vpxor (6*32)(src), x6, x6; \
+ vpxor (7*32)(src), x7, x7; \
+ store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+ vpsrad $31, iv, tmp; \
+ vpaddq iv, iv, iv; \
+ vpshufd $0x13, tmp, tmp; \
+ vpand mask, tmp, tmp; \
+ vpxor tmp, iv, iv;
+
+#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
+ vpsrad $31, iv, tmp0; \
+ vpaddq iv, iv, tmp1; \
+ vpsllq $2, iv, iv; \
+ vpshufd $0x13, tmp0, tmp0; \
+ vpsrad $31, tmp1, tmp1; \
+ vpand mask2, tmp0, tmp0; \
+ vpshufd $0x13, tmp1, tmp1; \
+ vpxor tmp0, iv, iv; \
+ vpand mask1, tmp1, tmp1; \
+ vpxor tmp1, iv, iv;
+
+#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \
+ tivx, t0, t0x, t1, t1x, t2, t2x, t3, \
+ xts_gf128mul_and_shl1_mask_0, \
+ xts_gf128mul_and_shl1_mask_1) \
+ vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \
+ \
+ /* load IV and construct second IV */ \
+ vmovdqu (iv), tivx; \
+ vmovdqa tivx, t0x; \
+ gf128mul_x_ble(tivx, t1x, t2x); \
+ vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \
+ vinserti128 $1, tivx, t0, tiv; \
+ vpxor (0*32)(src), tiv, x0; \
+ vmovdqu tiv, (0*32)(dst); \
+ \
+ /* construct and store IVs, also xor with source */ \
+ gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+ vpxor (1*32)(src), tiv, x1; \
+ vmovdqu tiv, (1*32)(dst); \
+ \
+ gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+ vpxor (2*32)(src), tiv, x2; \
+ vmovdqu tiv, (2*32)(dst); \
+ \
+ gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+ vpxor (3*32)(src), tiv, x3; \
+ vmovdqu tiv, (3*32)(dst); \
+ \
+ gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+ vpxor (4*32)(src), tiv, x4; \
+ vmovdqu tiv, (4*32)(dst); \
+ \
+ gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+ vpxor (5*32)(src), tiv, x5; \
+ vmovdqu tiv, (5*32)(dst); \
+ \
+ gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+ vpxor (6*32)(src), tiv, x6; \
+ vmovdqu tiv, (6*32)(dst); \
+ \
+ gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+ vpxor (7*32)(src), tiv, x7; \
+ vmovdqu tiv, (7*32)(dst); \
+ \
+ vextracti128 $1, tiv, tivx; \
+ gf128mul_x_ble(tivx, t1x, t2x); \
+ vmovdqu tivx, (iv);
+
+#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+ vpxor (0*32)(dst), x0, x0; \
+ vpxor (1*32)(dst), x1, x1; \
+ vpxor (2*32)(dst), x2, x2; \
+ vpxor (3*32)(dst), x3, x3; \
+ vpxor (4*32)(dst), x4, x4; \
+ vpxor (5*32)(dst), x5, x5; \
+ vpxor (6*32)(dst), x6, x6; \
+ vpxor (7*32)(dst), x7, x7; \
+ store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
new file mode 100644
index 000000000..a78ef99a9
--- /dev/null
+++ b/arch/x86/crypto/glue_helper.c
@@ -0,0 +1,330 @@
+/*
+ * Shared glue code for 128bit block ciphers
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/module.h>
+#include <crypto/b128ops.h>
+#include <crypto/gf128mul.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/xts.h>
+#include <asm/crypto/glue_helper.h>
+
+int glue_ecb_req_128bit(const struct common_glue_ctx *gctx,
+ struct skcipher_request *req)
+{
+ void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
+ const unsigned int bsize = 128 / 8;
+ struct skcipher_walk walk;
+ bool fpu_enabled = false;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ unsigned int func_bytes;
+ unsigned int i;
+
+ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+ &walk, fpu_enabled, nbytes);
+ for (i = 0; i < gctx->num_funcs; i++) {
+ func_bytes = bsize * gctx->funcs[i].num_blocks;
+
+ if (nbytes < func_bytes)
+ continue;
+
+ /* Process multi-block batch */
+ do {
+ gctx->funcs[i].fn_u.ecb(ctx, dst, src);
+ src += func_bytes;
+ dst += func_bytes;
+ nbytes -= func_bytes;
+ } while (nbytes >= func_bytes);
+
+ if (nbytes < bsize)
+ break;
+ }
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ glue_fpu_end(fpu_enabled);
+ return err;
+}
+EXPORT_SYMBOL_GPL(glue_ecb_req_128bit);
+
+int glue_cbc_encrypt_req_128bit(const common_glue_func_t fn,
+ struct skcipher_request *req)
+{
+ void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
+ const unsigned int bsize = 128 / 8;
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ const u128 *src = (u128 *)walk.src.virt.addr;
+ u128 *dst = (u128 *)walk.dst.virt.addr;
+ u128 *iv = (u128 *)walk.iv;
+
+ do {
+ u128_xor(dst, src, iv);
+ fn(ctx, (u8 *)dst, (u8 *)dst);
+ iv = dst;
+ src++;
+ dst++;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+ *(u128 *)walk.iv = *iv;
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+ return err;
+}
+EXPORT_SYMBOL_GPL(glue_cbc_encrypt_req_128bit);
+
+int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx,
+ struct skcipher_request *req)
+{
+ void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
+ const unsigned int bsize = 128 / 8;
+ struct skcipher_walk walk;
+ bool fpu_enabled = false;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes)) {
+ const u128 *src = walk.src.virt.addr;
+ u128 *dst = walk.dst.virt.addr;
+ unsigned int func_bytes, num_blocks;
+ unsigned int i;
+ u128 last_iv;
+
+ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+ &walk, fpu_enabled, nbytes);
+ /* Start of the last block. */
+ src += nbytes / bsize - 1;
+ dst += nbytes / bsize - 1;
+
+ last_iv = *src;
+
+ for (i = 0; i < gctx->num_funcs; i++) {
+ num_blocks = gctx->funcs[i].num_blocks;
+ func_bytes = bsize * num_blocks;
+
+ if (nbytes < func_bytes)
+ continue;
+
+ /* Process multi-block batch */
+ do {
+ src -= num_blocks - 1;
+ dst -= num_blocks - 1;
+
+ gctx->funcs[i].fn_u.cbc(ctx, dst, src);
+
+ nbytes -= func_bytes;
+ if (nbytes < bsize)
+ goto done;
+
+ u128_xor(dst, dst, --src);
+ dst--;
+ } while (nbytes >= func_bytes);
+ }
+done:
+ u128_xor(dst, dst, (u128 *)walk.iv);
+ *(u128 *)walk.iv = last_iv;
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ glue_fpu_end(fpu_enabled);
+ return err;
+}
+EXPORT_SYMBOL_GPL(glue_cbc_decrypt_req_128bit);
+
+int glue_ctr_req_128bit(const struct common_glue_ctx *gctx,
+ struct skcipher_request *req)
+{
+ void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
+ const unsigned int bsize = 128 / 8;
+ struct skcipher_walk walk;
+ bool fpu_enabled = false;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) >= bsize) {
+ const u128 *src = walk.src.virt.addr;
+ u128 *dst = walk.dst.virt.addr;
+ unsigned int func_bytes, num_blocks;
+ unsigned int i;
+ le128 ctrblk;
+
+ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+ &walk, fpu_enabled, nbytes);
+
+ be128_to_le128(&ctrblk, (be128 *)walk.iv);
+
+ for (i = 0; i < gctx->num_funcs; i++) {
+ num_blocks = gctx->funcs[i].num_blocks;
+ func_bytes = bsize * num_blocks;
+
+ if (nbytes < func_bytes)
+ continue;
+
+ /* Process multi-block batch */
+ do {
+ gctx->funcs[i].fn_u.ctr(ctx, dst, src, &ctrblk);
+ src += num_blocks;
+ dst += num_blocks;
+ nbytes -= func_bytes;
+ } while (nbytes >= func_bytes);
+
+ if (nbytes < bsize)
+ break;
+ }
+
+ le128_to_be128((be128 *)walk.iv, &ctrblk);
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ glue_fpu_end(fpu_enabled);
+
+ if (nbytes) {
+ le128 ctrblk;
+ u128 tmp;
+
+ be128_to_le128(&ctrblk, (be128 *)walk.iv);
+ memcpy(&tmp, walk.src.virt.addr, nbytes);
+ gctx->funcs[gctx->num_funcs - 1].fn_u.ctr(ctx, &tmp, &tmp,
+ &ctrblk);
+ memcpy(walk.dst.virt.addr, &tmp, nbytes);
+ le128_to_be128((be128 *)walk.iv, &ctrblk);
+
+ err = skcipher_walk_done(&walk, 0);
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(glue_ctr_req_128bit);
+
+static unsigned int __glue_xts_req_128bit(const struct common_glue_ctx *gctx,
+ void *ctx,
+ struct skcipher_walk *walk)
+{
+ const unsigned int bsize = 128 / 8;
+ unsigned int nbytes = walk->nbytes;
+ u128 *src = walk->src.virt.addr;
+ u128 *dst = walk->dst.virt.addr;
+ unsigned int num_blocks, func_bytes;
+ unsigned int i;
+
+ /* Process multi-block batch */
+ for (i = 0; i < gctx->num_funcs; i++) {
+ num_blocks = gctx->funcs[i].num_blocks;
+ func_bytes = bsize * num_blocks;
+
+ if (nbytes >= func_bytes) {
+ do {
+ gctx->funcs[i].fn_u.xts(ctx, dst, src,
+ walk->iv);
+
+ src += num_blocks;
+ dst += num_blocks;
+ nbytes -= func_bytes;
+ } while (nbytes >= func_bytes);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+ }
+
+done:
+ return nbytes;
+}
+
+int glue_xts_req_128bit(const struct common_glue_ctx *gctx,
+ struct skcipher_request *req,
+ common_glue_func_t tweak_fn, void *tweak_ctx,
+ void *crypt_ctx)
+{
+ const unsigned int bsize = 128 / 8;
+ struct skcipher_walk walk;
+ bool fpu_enabled = false;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+ nbytes = walk.nbytes;
+ if (!nbytes)
+ return err;
+
+ /* set minimum length to bsize, for tweak_fn */
+ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+ &walk, fpu_enabled,
+ nbytes < bsize ? bsize : nbytes);
+
+ /* calculate first value of T */
+ tweak_fn(tweak_ctx, walk.iv, walk.iv);
+
+ while (nbytes) {
+ nbytes = __glue_xts_req_128bit(gctx, crypt_ctx, &walk);
+
+ err = skcipher_walk_done(&walk, nbytes);
+ nbytes = walk.nbytes;
+ }
+
+ glue_fpu_end(fpu_enabled);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(glue_xts_req_128bit);
+
+void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv,
+ common_glue_func_t fn)
+{
+ le128 ivblk = *iv;
+
+ /* generate next IV */
+ gf128mul_x_ble(iv, &ivblk);
+
+ /* CC <- T xor C */
+ u128_xor(dst, src, (u128 *)&ivblk);
+
+ /* PP <- D(Key2,CC) */
+ fn(ctx, (u8 *)dst, (u8 *)dst);
+
+ /* P <- T xor PP */
+ u128_xor(dst, dst, (u128 *)&ivblk);
+}
+EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit_one);
+
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/crypto/morus1280-avx2-asm.S b/arch/x86/crypto/morus1280-avx2-asm.S
new file mode 100644
index 000000000..de182c460
--- /dev/null
+++ b/arch/x86/crypto/morus1280-avx2-asm.S
@@ -0,0 +1,622 @@
+/*
+ * AVX2 implementation of MORUS-1280
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define SHUFFLE_MASK(i0, i1, i2, i3) \
+ (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
+
+#define MASK1 SHUFFLE_MASK(3, 0, 1, 2)
+#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
+#define MASK3 SHUFFLE_MASK(1, 2, 3, 0)
+
+#define STATE0 %ymm0
+#define STATE0_LOW %xmm0
+#define STATE1 %ymm1
+#define STATE2 %ymm2
+#define STATE3 %ymm3
+#define STATE4 %ymm4
+#define KEY %ymm5
+#define MSG %ymm5
+#define MSG_LOW %xmm5
+#define T0 %ymm6
+#define T0_LOW %xmm6
+#define T1 %ymm7
+
+.section .rodata.cst32.morus1280_const, "aM", @progbits, 32
+.align 32
+.Lmorus1280_const:
+ .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+ .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+ .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+ .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+.section .rodata.cst32.morus1280_counter, "aM", @progbits, 32
+.align 32
+.Lmorus1280_counter:
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+
+.text
+
+.macro morus1280_round s0, s1, s2, s3, s4, b, w
+ vpand \s1, \s2, T0
+ vpxor T0, \s0, \s0
+ vpxor \s3, \s0, \s0
+ vpsllq $\b, \s0, T0
+ vpsrlq $(64 - \b), \s0, \s0
+ vpxor T0, \s0, \s0
+ vpermq $\w, \s3, \s3
+.endm
+
+/*
+ * __morus1280_update: internal ABI
+ * input:
+ * STATE[0-4] - input state
+ * MSG - message block
+ * output:
+ * STATE[0-4] - output state
+ * changed:
+ * T0
+ */
+__morus1280_update:
+ morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1
+ vpxor MSG, STATE1, STATE1
+ morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2
+ vpxor MSG, STATE2, STATE2
+ morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3
+ vpxor MSG, STATE3, STATE3
+ morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2
+ vpxor MSG, STATE4, STATE4
+ morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1
+ ret
+ENDPROC(__morus1280_update)
+
+/*
+ * __morus1280_update_zero: internal ABI
+ * input:
+ * STATE[0-4] - input state
+ * output:
+ * STATE[0-4] - output state
+ * changed:
+ * T0
+ */
+__morus1280_update_zero:
+ morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1
+ morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2
+ morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3
+ morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2
+ morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1
+ ret
+ENDPROC(__morus1280_update_zero)
+
+/*
+ * __load_partial: internal ABI
+ * input:
+ * %rsi - src
+ * %rcx - bytes
+ * output:
+ * MSG - message block
+ * changed:
+ * %r8
+ * %r9
+ */
+__load_partial:
+ xor %r9d, %r9d
+ vpxor MSG, MSG, MSG
+
+ mov %rcx, %r8
+ and $0x1, %r8
+ jz .Lld_partial_1
+
+ mov %rcx, %r8
+ and $0x1E, %r8
+ add %rsi, %r8
+ mov (%r8), %r9b
+
+.Lld_partial_1:
+ mov %rcx, %r8
+ and $0x2, %r8
+ jz .Lld_partial_2
+
+ mov %rcx, %r8
+ and $0x1C, %r8
+ add %rsi, %r8
+ shl $16, %r9
+ mov (%r8), %r9w
+
+.Lld_partial_2:
+ mov %rcx, %r8
+ and $0x4, %r8
+ jz .Lld_partial_4
+
+ mov %rcx, %r8
+ and $0x18, %r8
+ add %rsi, %r8
+ shl $32, %r9
+ mov (%r8), %r8d
+ xor %r8, %r9
+
+.Lld_partial_4:
+ movq %r9, MSG_LOW
+
+ mov %rcx, %r8
+ and $0x8, %r8
+ jz .Lld_partial_8
+
+ mov %rcx, %r8
+ and $0x10, %r8
+ add %rsi, %r8
+ pshufd $MASK2, MSG_LOW, MSG_LOW
+ pinsrq $0, (%r8), MSG_LOW
+
+.Lld_partial_8:
+ mov %rcx, %r8
+ and $0x10, %r8
+ jz .Lld_partial_16
+
+ vpermq $MASK2, MSG, MSG
+ movdqu (%rsi), MSG_LOW
+
+.Lld_partial_16:
+ ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ * input:
+ * %rdx - dst
+ * %rcx - bytes
+ * output:
+ * T0 - message block
+ * changed:
+ * %r8
+ * %r9
+ * %r10
+ */
+__store_partial:
+ mov %rcx, %r8
+ mov %rdx, %r9
+
+ cmp $16, %r8
+ jl .Lst_partial_16
+
+ movdqu T0_LOW, (%r9)
+ vpermq $MASK2, T0, T0
+
+ sub $16, %r8
+ add $16, %r9
+
+.Lst_partial_16:
+ movq T0_LOW, %r10
+
+ cmp $8, %r8
+ jl .Lst_partial_8
+
+ mov %r10, (%r9)
+ pextrq $1, T0_LOW, %r10
+
+ sub $8, %r8
+ add $8, %r9
+
+.Lst_partial_8:
+ cmp $4, %r8
+ jl .Lst_partial_4
+
+ mov %r10d, (%r9)
+ shr $32, %r10
+
+ sub $4, %r8
+ add $4, %r9
+
+.Lst_partial_4:
+ cmp $2, %r8
+ jl .Lst_partial_2
+
+ mov %r10w, (%r9)
+ shr $16, %r10
+
+ sub $2, %r8
+ add $2, %r9
+
+.Lst_partial_2:
+ cmp $1, %r8
+ jl .Lst_partial_1
+
+ mov %r10b, (%r9)
+
+.Lst_partial_1:
+ ret
+ENDPROC(__store_partial)
+
+/*
+ * void crypto_morus1280_avx2_init(void *state, const void *key,
+ * const void *iv);
+ */
+ENTRY(crypto_morus1280_avx2_init)
+ FRAME_BEGIN
+
+ /* load IV: */
+ vpxor STATE0, STATE0, STATE0
+ movdqu (%rdx), STATE0_LOW
+ /* load key: */
+ vmovdqu (%rsi), KEY
+ vmovdqa KEY, STATE1
+ /* load all ones: */
+ vpcmpeqd STATE2, STATE2, STATE2
+ /* load all zeros: */
+ vpxor STATE3, STATE3, STATE3
+ /* load the constant: */
+ vmovdqa .Lmorus1280_const, STATE4
+
+ /* update 16 times with zero: */
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+
+ /* xor-in the key again after updates: */
+ vpxor KEY, STATE1, STATE1
+
+ /* store the state: */
+ vmovdqu STATE0, (0 * 32)(%rdi)
+ vmovdqu STATE1, (1 * 32)(%rdi)
+ vmovdqu STATE2, (2 * 32)(%rdi)
+ vmovdqu STATE3, (3 * 32)(%rdi)
+ vmovdqu STATE4, (4 * 32)(%rdi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_avx2_init)
+
+/*
+ * void crypto_morus1280_avx2_ad(void *state, const void *data,
+ * unsigned int length);
+ */
+ENTRY(crypto_morus1280_avx2_ad)
+ FRAME_BEGIN
+
+ cmp $32, %rdx
+ jb .Lad_out
+
+ /* load the state: */
+ vmovdqu (0 * 32)(%rdi), STATE0
+ vmovdqu (1 * 32)(%rdi), STATE1
+ vmovdqu (2 * 32)(%rdi), STATE2
+ vmovdqu (3 * 32)(%rdi), STATE3
+ vmovdqu (4 * 32)(%rdi), STATE4
+
+ mov %rsi, %r8
+ and $0x1F, %r8
+ jnz .Lad_u_loop
+
+.align 4
+.Lad_a_loop:
+ vmovdqa (%rsi), MSG
+ call __morus1280_update
+ sub $32, %rdx
+ add $32, %rsi
+ cmp $32, %rdx
+ jge .Lad_a_loop
+
+ jmp .Lad_cont
+.align 4
+.Lad_u_loop:
+ vmovdqu (%rsi), MSG
+ call __morus1280_update
+ sub $32, %rdx
+ add $32, %rsi
+ cmp $32, %rdx
+ jge .Lad_u_loop
+
+.Lad_cont:
+ /* store the state: */
+ vmovdqu STATE0, (0 * 32)(%rdi)
+ vmovdqu STATE1, (1 * 32)(%rdi)
+ vmovdqu STATE2, (2 * 32)(%rdi)
+ vmovdqu STATE3, (3 * 32)(%rdi)
+ vmovdqu STATE4, (4 * 32)(%rdi)
+
+.Lad_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_avx2_ad)
+
+/*
+ * void crypto_morus1280_avx2_enc(void *state, const void *src, void *dst,
+ * unsigned int length);
+ */
+ENTRY(crypto_morus1280_avx2_enc)
+ FRAME_BEGIN
+
+ cmp $32, %rcx
+ jb .Lenc_out
+
+ /* load the state: */
+ vmovdqu (0 * 32)(%rdi), STATE0
+ vmovdqu (1 * 32)(%rdi), STATE1
+ vmovdqu (2 * 32)(%rdi), STATE2
+ vmovdqu (3 * 32)(%rdi), STATE3
+ vmovdqu (4 * 32)(%rdi), STATE4
+
+ mov %rsi, %r8
+ or %rdx, %r8
+ and $0x1F, %r8
+ jnz .Lenc_u_loop
+
+.align 4
+.Lenc_a_loop:
+ vmovdqa (%rsi), MSG
+ vmovdqa MSG, T0
+ vpxor STATE0, T0, T0
+ vpermq $MASK3, STATE1, T1
+ vpxor T1, T0, T0
+ vpand STATE2, STATE3, T1
+ vpxor T1, T0, T0
+ vmovdqa T0, (%rdx)
+
+ call __morus1280_update
+ sub $32, %rcx
+ add $32, %rsi
+ add $32, %rdx
+ cmp $32, %rcx
+ jge .Lenc_a_loop
+
+ jmp .Lenc_cont
+.align 4
+.Lenc_u_loop:
+ vmovdqu (%rsi), MSG
+ vmovdqa MSG, T0
+ vpxor STATE0, T0, T0
+ vpermq $MASK3, STATE1, T1
+ vpxor T1, T0, T0
+ vpand STATE2, STATE3, T1
+ vpxor T1, T0, T0
+ vmovdqu T0, (%rdx)
+
+ call __morus1280_update
+ sub $32, %rcx
+ add $32, %rsi
+ add $32, %rdx
+ cmp $32, %rcx
+ jge .Lenc_u_loop
+
+.Lenc_cont:
+ /* store the state: */
+ vmovdqu STATE0, (0 * 32)(%rdi)
+ vmovdqu STATE1, (1 * 32)(%rdi)
+ vmovdqu STATE2, (2 * 32)(%rdi)
+ vmovdqu STATE3, (3 * 32)(%rdi)
+ vmovdqu STATE4, (4 * 32)(%rdi)
+
+.Lenc_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_avx2_enc)
+
+/*
+ * void crypto_morus1280_avx2_enc_tail(void *state, const void *src, void *dst,
+ * unsigned int length);
+ */
+ENTRY(crypto_morus1280_avx2_enc_tail)
+ FRAME_BEGIN
+
+ /* load the state: */
+ vmovdqu (0 * 32)(%rdi), STATE0
+ vmovdqu (1 * 32)(%rdi), STATE1
+ vmovdqu (2 * 32)(%rdi), STATE2
+ vmovdqu (3 * 32)(%rdi), STATE3
+ vmovdqu (4 * 32)(%rdi), STATE4
+
+ /* encrypt message: */
+ call __load_partial
+
+ vmovdqa MSG, T0
+ vpxor STATE0, T0, T0
+ vpermq $MASK3, STATE1, T1
+ vpxor T1, T0, T0
+ vpand STATE2, STATE3, T1
+ vpxor T1, T0, T0
+
+ call __store_partial
+
+ call __morus1280_update
+
+ /* store the state: */
+ vmovdqu STATE0, (0 * 32)(%rdi)
+ vmovdqu STATE1, (1 * 32)(%rdi)
+ vmovdqu STATE2, (2 * 32)(%rdi)
+ vmovdqu STATE3, (3 * 32)(%rdi)
+ vmovdqu STATE4, (4 * 32)(%rdi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_avx2_enc_tail)
+
+/*
+ * void crypto_morus1280_avx2_dec(void *state, const void *src, void *dst,
+ * unsigned int length);
+ */
+ENTRY(crypto_morus1280_avx2_dec)
+ FRAME_BEGIN
+
+ cmp $32, %rcx
+ jb .Ldec_out
+
+ /* load the state: */
+ vmovdqu (0 * 32)(%rdi), STATE0
+ vmovdqu (1 * 32)(%rdi), STATE1
+ vmovdqu (2 * 32)(%rdi), STATE2
+ vmovdqu (3 * 32)(%rdi), STATE3
+ vmovdqu (4 * 32)(%rdi), STATE4
+
+ mov %rsi, %r8
+ or %rdx, %r8
+ and $0x1F, %r8
+ jnz .Ldec_u_loop
+
+.align 4
+.Ldec_a_loop:
+ vmovdqa (%rsi), MSG
+ vpxor STATE0, MSG, MSG
+ vpermq $MASK3, STATE1, T0
+ vpxor T0, MSG, MSG
+ vpand STATE2, STATE3, T0
+ vpxor T0, MSG, MSG
+ vmovdqa MSG, (%rdx)
+
+ call __morus1280_update
+ sub $32, %rcx
+ add $32, %rsi
+ add $32, %rdx
+ cmp $32, %rcx
+ jge .Ldec_a_loop
+
+ jmp .Ldec_cont
+.align 4
+.Ldec_u_loop:
+ vmovdqu (%rsi), MSG
+ vpxor STATE0, MSG, MSG
+ vpermq $MASK3, STATE1, T0
+ vpxor T0, MSG, MSG
+ vpand STATE2, STATE3, T0
+ vpxor T0, MSG, MSG
+ vmovdqu MSG, (%rdx)
+
+ call __morus1280_update
+ sub $32, %rcx
+ add $32, %rsi
+ add $32, %rdx
+ cmp $32, %rcx
+ jge .Ldec_u_loop
+
+.Ldec_cont:
+ /* store the state: */
+ vmovdqu STATE0, (0 * 32)(%rdi)
+ vmovdqu STATE1, (1 * 32)(%rdi)
+ vmovdqu STATE2, (2 * 32)(%rdi)
+ vmovdqu STATE3, (3 * 32)(%rdi)
+ vmovdqu STATE4, (4 * 32)(%rdi)
+
+.Ldec_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_avx2_dec)
+
+/*
+ * void crypto_morus1280_avx2_dec_tail(void *state, const void *src, void *dst,
+ * unsigned int length);
+ */
+ENTRY(crypto_morus1280_avx2_dec_tail)
+ FRAME_BEGIN
+
+ /* load the state: */
+ vmovdqu (0 * 32)(%rdi), STATE0
+ vmovdqu (1 * 32)(%rdi), STATE1
+ vmovdqu (2 * 32)(%rdi), STATE2
+ vmovdqu (3 * 32)(%rdi), STATE3
+ vmovdqu (4 * 32)(%rdi), STATE4
+
+ /* decrypt message: */
+ call __load_partial
+
+ vpxor STATE0, MSG, MSG
+ vpermq $MASK3, STATE1, T0
+ vpxor T0, MSG, MSG
+ vpand STATE2, STATE3, T0
+ vpxor T0, MSG, MSG
+ vmovdqa MSG, T0
+
+ call __store_partial
+
+ /* mask with byte count: */
+ movq %rcx, T0_LOW
+ vpbroadcastb T0_LOW, T0
+ vmovdqa .Lmorus1280_counter, T1
+ vpcmpgtb T1, T0, T0
+ vpand T0, MSG, MSG
+
+ call __morus1280_update
+
+ /* store the state: */
+ vmovdqu STATE0, (0 * 32)(%rdi)
+ vmovdqu STATE1, (1 * 32)(%rdi)
+ vmovdqu STATE2, (2 * 32)(%rdi)
+ vmovdqu STATE3, (3 * 32)(%rdi)
+ vmovdqu STATE4, (4 * 32)(%rdi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_avx2_dec_tail)
+
+/*
+ * void crypto_morus1280_avx2_final(void *state, void *tag_xor,
+ * u64 assoclen, u64 cryptlen);
+ */
+ENTRY(crypto_morus1280_avx2_final)
+ FRAME_BEGIN
+
+ /* load the state: */
+ vmovdqu (0 * 32)(%rdi), STATE0
+ vmovdqu (1 * 32)(%rdi), STATE1
+ vmovdqu (2 * 32)(%rdi), STATE2
+ vmovdqu (3 * 32)(%rdi), STATE3
+ vmovdqu (4 * 32)(%rdi), STATE4
+
+ /* xor state[0] into state[4]: */
+ vpxor STATE0, STATE4, STATE4
+
+ /* prepare length block: */
+ vpxor MSG, MSG, MSG
+ vpinsrq $0, %rdx, MSG_LOW, MSG_LOW
+ vpinsrq $1, %rcx, MSG_LOW, MSG_LOW
+ vpsllq $3, MSG, MSG /* multiply by 8 (to get bit count) */
+
+ /* update state: */
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+
+ /* xor tag: */
+ vmovdqu (%rsi), MSG
+
+ vpxor STATE0, MSG, MSG
+ vpermq $MASK3, STATE1, T0
+ vpxor T0, MSG, MSG
+ vpand STATE2, STATE3, T0
+ vpxor T0, MSG, MSG
+ vmovdqu MSG, (%rsi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_avx2_final)
diff --git a/arch/x86/crypto/morus1280-avx2-glue.c b/arch/x86/crypto/morus1280-avx2-glue.c
new file mode 100644
index 000000000..6634907d6
--- /dev/null
+++ b/arch/x86/crypto/morus1280-avx2-glue.c
@@ -0,0 +1,64 @@
+/*
+ * The MORUS-1280 Authenticated-Encryption Algorithm
+ * Glue for AVX2 implementation
+ *
+ * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/internal/aead.h>
+#include <crypto/morus1280_glue.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+asmlinkage void crypto_morus1280_avx2_init(void *state, const void *key,
+ const void *iv);
+asmlinkage void crypto_morus1280_avx2_ad(void *state, const void *data,
+ unsigned int length);
+
+asmlinkage void crypto_morus1280_avx2_enc(void *state, const void *src,
+ void *dst, unsigned int length);
+asmlinkage void crypto_morus1280_avx2_dec(void *state, const void *src,
+ void *dst, unsigned int length);
+
+asmlinkage void crypto_morus1280_avx2_enc_tail(void *state, const void *src,
+ void *dst, unsigned int length);
+asmlinkage void crypto_morus1280_avx2_dec_tail(void *state, const void *src,
+ void *dst, unsigned int length);
+
+asmlinkage void crypto_morus1280_avx2_final(void *state, void *tag_xor,
+ u64 assoclen, u64 cryptlen);
+
+MORUS1280_DECLARE_ALGS(avx2, "morus1280-avx2", 400);
+
+static int __init crypto_morus1280_avx2_module_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+ return -ENODEV;
+
+ return crypto_register_aeads(crypto_morus1280_avx2_algs,
+ ARRAY_SIZE(crypto_morus1280_avx2_algs));
+}
+
+static void __exit crypto_morus1280_avx2_module_exit(void)
+{
+ crypto_unregister_aeads(crypto_morus1280_avx2_algs,
+ ARRAY_SIZE(crypto_morus1280_avx2_algs));
+}
+
+module_init(crypto_morus1280_avx2_module_init);
+module_exit(crypto_morus1280_avx2_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- AVX2 implementation");
+MODULE_ALIAS_CRYPTO("morus1280");
+MODULE_ALIAS_CRYPTO("morus1280-avx2");
diff --git a/arch/x86/crypto/morus1280-sse2-asm.S b/arch/x86/crypto/morus1280-sse2-asm.S
new file mode 100644
index 000000000..da5d2905d
--- /dev/null
+++ b/arch/x86/crypto/morus1280-sse2-asm.S
@@ -0,0 +1,896 @@
+/*
+ * SSE2 implementation of MORUS-1280
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define SHUFFLE_MASK(i0, i1, i2, i3) \
+ (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
+
+#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
+
+#define STATE0_LO %xmm0
+#define STATE0_HI %xmm1
+#define STATE1_LO %xmm2
+#define STATE1_HI %xmm3
+#define STATE2_LO %xmm4
+#define STATE2_HI %xmm5
+#define STATE3_LO %xmm6
+#define STATE3_HI %xmm7
+#define STATE4_LO %xmm8
+#define STATE4_HI %xmm9
+#define KEY_LO %xmm10
+#define KEY_HI %xmm11
+#define MSG_LO %xmm10
+#define MSG_HI %xmm11
+#define T0_LO %xmm12
+#define T0_HI %xmm13
+#define T1_LO %xmm14
+#define T1_HI %xmm15
+
+.section .rodata.cst16.morus640_const, "aM", @progbits, 16
+.align 16
+.Lmorus640_const_0:
+ .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+ .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Lmorus640_const_1:
+ .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+ .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+.section .rodata.cst16.morus640_counter, "aM", @progbits, 16
+.align 16
+.Lmorus640_counter_0:
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+.Lmorus640_counter_1:
+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+
+.text
+
+.macro rol1 hi, lo
+ /*
+ * HI_1 | HI_0 || LO_1 | LO_0
+ * ==>
+ * HI_0 | HI_1 || LO_1 | LO_0
+ * ==>
+ * HI_0 | LO_1 || LO_0 | HI_1
+ */
+ pshufd $MASK2, \hi, \hi
+ movdqa \hi, T0_LO
+ punpcklqdq \lo, T0_LO
+ punpckhqdq \hi, \lo
+ movdqa \lo, \hi
+ movdqa T0_LO, \lo
+.endm
+
+.macro rol2 hi, lo
+ movdqa \lo, T0_LO
+ movdqa \hi, \lo
+ movdqa T0_LO, \hi
+.endm
+
+.macro rol3 hi, lo
+ /*
+ * HI_1 | HI_0 || LO_1 | LO_0
+ * ==>
+ * HI_0 | HI_1 || LO_1 | LO_0
+ * ==>
+ * LO_0 | HI_1 || HI_0 | LO_1
+ */
+ pshufd $MASK2, \hi, \hi
+ movdqa \lo, T0_LO
+ punpckhqdq \hi, T0_LO
+ punpcklqdq \lo, \hi
+ movdqa T0_LO, \lo
+.endm
+
+.macro morus1280_round s0_l, s0_h, s1_l, s1_h, s2_l, s2_h, s3_l, s3_h, s4_l, s4_h, b, w
+ movdqa \s1_l, T0_LO
+ pand \s2_l, T0_LO
+ pxor T0_LO, \s0_l
+
+ movdqa \s1_h, T0_LO
+ pand \s2_h, T0_LO
+ pxor T0_LO, \s0_h
+
+ pxor \s3_l, \s0_l
+ pxor \s3_h, \s0_h
+
+ movdqa \s0_l, T0_LO
+ psllq $\b, T0_LO
+ psrlq $(64 - \b), \s0_l
+ pxor T0_LO, \s0_l
+
+ movdqa \s0_h, T0_LO
+ psllq $\b, T0_LO
+ psrlq $(64 - \b), \s0_h
+ pxor T0_LO, \s0_h
+
+ \w \s3_h, \s3_l
+.endm
+
+/*
+ * __morus1280_update: internal ABI
+ * input:
+ * STATE[0-4] - input state
+ * MSG - message block
+ * output:
+ * STATE[0-4] - output state
+ * changed:
+ * T0
+ */
+__morus1280_update:
+ morus1280_round \
+ STATE0_LO, STATE0_HI, \
+ STATE1_LO, STATE1_HI, \
+ STATE2_LO, STATE2_HI, \
+ STATE3_LO, STATE3_HI, \
+ STATE4_LO, STATE4_HI, \
+ 13, rol1
+ pxor MSG_LO, STATE1_LO
+ pxor MSG_HI, STATE1_HI
+ morus1280_round \
+ STATE1_LO, STATE1_HI, \
+ STATE2_LO, STATE2_HI, \
+ STATE3_LO, STATE3_HI, \
+ STATE4_LO, STATE4_HI, \
+ STATE0_LO, STATE0_HI, \
+ 46, rol2
+ pxor MSG_LO, STATE2_LO
+ pxor MSG_HI, STATE2_HI
+ morus1280_round \
+ STATE2_LO, STATE2_HI, \
+ STATE3_LO, STATE3_HI, \
+ STATE4_LO, STATE4_HI, \
+ STATE0_LO, STATE0_HI, \
+ STATE1_LO, STATE1_HI, \
+ 38, rol3
+ pxor MSG_LO, STATE3_LO
+ pxor MSG_HI, STATE3_HI
+ morus1280_round \
+ STATE3_LO, STATE3_HI, \
+ STATE4_LO, STATE4_HI, \
+ STATE0_LO, STATE0_HI, \
+ STATE1_LO, STATE1_HI, \
+ STATE2_LO, STATE2_HI, \
+ 7, rol2
+ pxor MSG_LO, STATE4_LO
+ pxor MSG_HI, STATE4_HI
+ morus1280_round \
+ STATE4_LO, STATE4_HI, \
+ STATE0_LO, STATE0_HI, \
+ STATE1_LO, STATE1_HI, \
+ STATE2_LO, STATE2_HI, \
+ STATE3_LO, STATE3_HI, \
+ 4, rol1
+ ret
+ENDPROC(__morus1280_update)
+
+/*
+ * __morus1280_update_zero: internal ABI
+ * input:
+ * STATE[0-4] - input state
+ * output:
+ * STATE[0-4] - output state
+ * changed:
+ * T0
+ */
+__morus1280_update_zero:
+ morus1280_round \
+ STATE0_LO, STATE0_HI, \
+ STATE1_LO, STATE1_HI, \
+ STATE2_LO, STATE2_HI, \
+ STATE3_LO, STATE3_HI, \
+ STATE4_LO, STATE4_HI, \
+ 13, rol1
+ morus1280_round \
+ STATE1_LO, STATE1_HI, \
+ STATE2_LO, STATE2_HI, \
+ STATE3_LO, STATE3_HI, \
+ STATE4_LO, STATE4_HI, \
+ STATE0_LO, STATE0_HI, \
+ 46, rol2
+ morus1280_round \
+ STATE2_LO, STATE2_HI, \
+ STATE3_LO, STATE3_HI, \
+ STATE4_LO, STATE4_HI, \
+ STATE0_LO, STATE0_HI, \
+ STATE1_LO, STATE1_HI, \
+ 38, rol3
+ morus1280_round \
+ STATE3_LO, STATE3_HI, \
+ STATE4_LO, STATE4_HI, \
+ STATE0_LO, STATE0_HI, \
+ STATE1_LO, STATE1_HI, \
+ STATE2_LO, STATE2_HI, \
+ 7, rol2
+ morus1280_round \
+ STATE4_LO, STATE4_HI, \
+ STATE0_LO, STATE0_HI, \
+ STATE1_LO, STATE1_HI, \
+ STATE2_LO, STATE2_HI, \
+ STATE3_LO, STATE3_HI, \
+ 4, rol1
+ ret
+ENDPROC(__morus1280_update_zero)
+
+/*
+ * __load_partial: internal ABI
+ * input:
+ * %rsi - src
+ * %rcx - bytes
+ * output:
+ * MSG - message block
+ * changed:
+ * %r8
+ * %r9
+ */
+__load_partial:
+ xor %r9d, %r9d
+ pxor MSG_LO, MSG_LO
+ pxor MSG_HI, MSG_HI
+
+ mov %rcx, %r8
+ and $0x1, %r8
+ jz .Lld_partial_1
+
+ mov %rcx, %r8
+ and $0x1E, %r8
+ add %rsi, %r8
+ mov (%r8), %r9b
+
+.Lld_partial_1:
+ mov %rcx, %r8
+ and $0x2, %r8
+ jz .Lld_partial_2
+
+ mov %rcx, %r8
+ and $0x1C, %r8
+ add %rsi, %r8
+ shl $16, %r9
+ mov (%r8), %r9w
+
+.Lld_partial_2:
+ mov %rcx, %r8
+ and $0x4, %r8
+ jz .Lld_partial_4
+
+ mov %rcx, %r8
+ and $0x18, %r8
+ add %rsi, %r8
+ shl $32, %r9
+ mov (%r8), %r8d
+ xor %r8, %r9
+
+.Lld_partial_4:
+ movq %r9, MSG_LO
+
+ mov %rcx, %r8
+ and $0x8, %r8
+ jz .Lld_partial_8
+
+ mov %rcx, %r8
+ and $0x10, %r8
+ add %rsi, %r8
+ pslldq $8, MSG_LO
+ movq (%r8), T0_LO
+ pxor T0_LO, MSG_LO
+
+.Lld_partial_8:
+ mov %rcx, %r8
+ and $0x10, %r8
+ jz .Lld_partial_16
+
+ movdqa MSG_LO, MSG_HI
+ movdqu (%rsi), MSG_LO
+
+.Lld_partial_16:
+ ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ * input:
+ * %rdx - dst
+ * %rcx - bytes
+ * output:
+ * T0 - message block
+ * changed:
+ * %r8
+ * %r9
+ * %r10
+ */
+__store_partial:
+ mov %rcx, %r8
+ mov %rdx, %r9
+
+ cmp $16, %r8
+ jl .Lst_partial_16
+
+ movdqu T0_LO, (%r9)
+ movdqa T0_HI, T0_LO
+
+ sub $16, %r8
+ add $16, %r9
+
+.Lst_partial_16:
+ movq T0_LO, %r10
+
+ cmp $8, %r8
+ jl .Lst_partial_8
+
+ mov %r10, (%r9)
+ psrldq $8, T0_LO
+ movq T0_LO, %r10
+
+ sub $8, %r8
+ add $8, %r9
+
+.Lst_partial_8:
+ cmp $4, %r8
+ jl .Lst_partial_4
+
+ mov %r10d, (%r9)
+ shr $32, %r10
+
+ sub $4, %r8
+ add $4, %r9
+
+.Lst_partial_4:
+ cmp $2, %r8
+ jl .Lst_partial_2
+
+ mov %r10w, (%r9)
+ shr $16, %r10
+
+ sub $2, %r8
+ add $2, %r9
+
+.Lst_partial_2:
+ cmp $1, %r8
+ jl .Lst_partial_1
+
+ mov %r10b, (%r9)
+
+.Lst_partial_1:
+ ret
+ENDPROC(__store_partial)
+
+/*
+ * void crypto_morus1280_sse2_init(void *state, const void *key,
+ * const void *iv);
+ */
+ENTRY(crypto_morus1280_sse2_init)
+ FRAME_BEGIN
+
+ /* load IV: */
+ pxor STATE0_HI, STATE0_HI
+ movdqu (%rdx), STATE0_LO
+ /* load key: */
+ movdqu 0(%rsi), KEY_LO
+ movdqu 16(%rsi), KEY_HI
+ movdqa KEY_LO, STATE1_LO
+ movdqa KEY_HI, STATE1_HI
+ /* load all ones: */
+ pcmpeqd STATE2_LO, STATE2_LO
+ pcmpeqd STATE2_HI, STATE2_HI
+ /* load all zeros: */
+ pxor STATE3_LO, STATE3_LO
+ pxor STATE3_HI, STATE3_HI
+ /* load the constant: */
+ movdqa .Lmorus640_const_0, STATE4_LO
+ movdqa .Lmorus640_const_1, STATE4_HI
+
+ /* update 16 times with zero: */
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+ call __morus1280_update_zero
+
+ /* xor-in the key again after updates: */
+ pxor KEY_LO, STATE1_LO
+ pxor KEY_HI, STATE1_HI
+
+ /* store the state: */
+ movdqu STATE0_LO, (0 * 16)(%rdi)
+ movdqu STATE0_HI, (1 * 16)(%rdi)
+ movdqu STATE1_LO, (2 * 16)(%rdi)
+ movdqu STATE1_HI, (3 * 16)(%rdi)
+ movdqu STATE2_LO, (4 * 16)(%rdi)
+ movdqu STATE2_HI, (5 * 16)(%rdi)
+ movdqu STATE3_LO, (6 * 16)(%rdi)
+ movdqu STATE3_HI, (7 * 16)(%rdi)
+ movdqu STATE4_LO, (8 * 16)(%rdi)
+ movdqu STATE4_HI, (9 * 16)(%rdi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_sse2_init)
+
+/*
+ * void crypto_morus1280_sse2_ad(void *state, const void *data,
+ * unsigned int length);
+ */
+ENTRY(crypto_morus1280_sse2_ad)
+ FRAME_BEGIN
+
+ cmp $32, %rdx
+ jb .Lad_out
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0_LO
+ movdqu (1 * 16)(%rdi), STATE0_HI
+ movdqu (2 * 16)(%rdi), STATE1_LO
+ movdqu (3 * 16)(%rdi), STATE1_HI
+ movdqu (4 * 16)(%rdi), STATE2_LO
+ movdqu (5 * 16)(%rdi), STATE2_HI
+ movdqu (6 * 16)(%rdi), STATE3_LO
+ movdqu (7 * 16)(%rdi), STATE3_HI
+ movdqu (8 * 16)(%rdi), STATE4_LO
+ movdqu (9 * 16)(%rdi), STATE4_HI
+
+ mov %rsi, %r8
+ and $0xF, %r8
+ jnz .Lad_u_loop
+
+.align 4
+.Lad_a_loop:
+ movdqa 0(%rsi), MSG_LO
+ movdqa 16(%rsi), MSG_HI
+ call __morus1280_update
+ sub $32, %rdx
+ add $32, %rsi
+ cmp $32, %rdx
+ jge .Lad_a_loop
+
+ jmp .Lad_cont
+.align 4
+.Lad_u_loop:
+ movdqu 0(%rsi), MSG_LO
+ movdqu 16(%rsi), MSG_HI
+ call __morus1280_update
+ sub $32, %rdx
+ add $32, %rsi
+ cmp $32, %rdx
+ jge .Lad_u_loop
+
+.Lad_cont:
+ /* store the state: */
+ movdqu STATE0_LO, (0 * 16)(%rdi)
+ movdqu STATE0_HI, (1 * 16)(%rdi)
+ movdqu STATE1_LO, (2 * 16)(%rdi)
+ movdqu STATE1_HI, (3 * 16)(%rdi)
+ movdqu STATE2_LO, (4 * 16)(%rdi)
+ movdqu STATE2_HI, (5 * 16)(%rdi)
+ movdqu STATE3_LO, (6 * 16)(%rdi)
+ movdqu STATE3_HI, (7 * 16)(%rdi)
+ movdqu STATE4_LO, (8 * 16)(%rdi)
+ movdqu STATE4_HI, (9 * 16)(%rdi)
+
+.Lad_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_sse2_ad)
+
+/*
+ * void crypto_morus1280_sse2_enc(void *state, const void *src, void *dst,
+ * unsigned int length);
+ */
+ENTRY(crypto_morus1280_sse2_enc)
+ FRAME_BEGIN
+
+ cmp $32, %rcx
+ jb .Lenc_out
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0_LO
+ movdqu (1 * 16)(%rdi), STATE0_HI
+ movdqu (2 * 16)(%rdi), STATE1_LO
+ movdqu (3 * 16)(%rdi), STATE1_HI
+ movdqu (4 * 16)(%rdi), STATE2_LO
+ movdqu (5 * 16)(%rdi), STATE2_HI
+ movdqu (6 * 16)(%rdi), STATE3_LO
+ movdqu (7 * 16)(%rdi), STATE3_HI
+ movdqu (8 * 16)(%rdi), STATE4_LO
+ movdqu (9 * 16)(%rdi), STATE4_HI
+
+ mov %rsi, %r8
+ or %rdx, %r8
+ and $0xF, %r8
+ jnz .Lenc_u_loop
+
+.align 4
+.Lenc_a_loop:
+ movdqa 0(%rsi), MSG_LO
+ movdqa 16(%rsi), MSG_HI
+ movdqa STATE1_LO, T1_LO
+ movdqa STATE1_HI, T1_HI
+ rol3 T1_HI, T1_LO
+ movdqa MSG_LO, T0_LO
+ movdqa MSG_HI, T0_HI
+ pxor T1_LO, T0_LO
+ pxor T1_HI, T0_HI
+ pxor STATE0_LO, T0_LO
+ pxor STATE0_HI, T0_HI
+ movdqa STATE2_LO, T1_LO
+ movdqa STATE2_HI, T1_HI
+ pand STATE3_LO, T1_LO
+ pand STATE3_HI, T1_HI
+ pxor T1_LO, T0_LO
+ pxor T1_HI, T0_HI
+ movdqa T0_LO, 0(%rdx)
+ movdqa T0_HI, 16(%rdx)
+
+ call __morus1280_update
+ sub $32, %rcx
+ add $32, %rsi
+ add $32, %rdx
+ cmp $32, %rcx
+ jge .Lenc_a_loop
+
+ jmp .Lenc_cont
+.align 4
+.Lenc_u_loop:
+ movdqu 0(%rsi), MSG_LO
+ movdqu 16(%rsi), MSG_HI
+ movdqa STATE1_LO, T1_LO
+ movdqa STATE1_HI, T1_HI
+ rol3 T1_HI, T1_LO
+ movdqa MSG_LO, T0_LO
+ movdqa MSG_HI, T0_HI
+ pxor T1_LO, T0_LO
+ pxor T1_HI, T0_HI
+ pxor STATE0_LO, T0_LO
+ pxor STATE0_HI, T0_HI
+ movdqa STATE2_LO, T1_LO
+ movdqa STATE2_HI, T1_HI
+ pand STATE3_LO, T1_LO
+ pand STATE3_HI, T1_HI
+ pxor T1_LO, T0_LO
+ pxor T1_HI, T0_HI
+ movdqu T0_LO, 0(%rdx)
+ movdqu T0_HI, 16(%rdx)
+
+ call __morus1280_update
+ sub $32, %rcx
+ add $32, %rsi
+ add $32, %rdx
+ cmp $32, %rcx
+ jge .Lenc_u_loop
+
+.Lenc_cont:
+ /* store the state: */
+ movdqu STATE0_LO, (0 * 16)(%rdi)
+ movdqu STATE0_HI, (1 * 16)(%rdi)
+ movdqu STATE1_LO, (2 * 16)(%rdi)
+ movdqu STATE1_HI, (3 * 16)(%rdi)
+ movdqu STATE2_LO, (4 * 16)(%rdi)
+ movdqu STATE2_HI, (5 * 16)(%rdi)
+ movdqu STATE3_LO, (6 * 16)(%rdi)
+ movdqu STATE3_HI, (7 * 16)(%rdi)
+ movdqu STATE4_LO, (8 * 16)(%rdi)
+ movdqu STATE4_HI, (9 * 16)(%rdi)
+
+.Lenc_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_sse2_enc)
+
+/*
+ * void crypto_morus1280_sse2_enc_tail(void *state, const void *src, void *dst,
+ * unsigned int length);
+ */
+ENTRY(crypto_morus1280_sse2_enc_tail)
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0_LO
+ movdqu (1 * 16)(%rdi), STATE0_HI
+ movdqu (2 * 16)(%rdi), STATE1_LO
+ movdqu (3 * 16)(%rdi), STATE1_HI
+ movdqu (4 * 16)(%rdi), STATE2_LO
+ movdqu (5 * 16)(%rdi), STATE2_HI
+ movdqu (6 * 16)(%rdi), STATE3_LO
+ movdqu (7 * 16)(%rdi), STATE3_HI
+ movdqu (8 * 16)(%rdi), STATE4_LO
+ movdqu (9 * 16)(%rdi), STATE4_HI
+
+ /* encrypt message: */
+ call __load_partial
+
+ movdqa STATE1_LO, T1_LO
+ movdqa STATE1_HI, T1_HI
+ rol3 T1_HI, T1_LO
+ movdqa MSG_LO, T0_LO
+ movdqa MSG_HI, T0_HI
+ pxor T1_LO, T0_LO
+ pxor T1_HI, T0_HI
+ pxor STATE0_LO, T0_LO
+ pxor STATE0_HI, T0_HI
+ movdqa STATE2_LO, T1_LO
+ movdqa STATE2_HI, T1_HI
+ pand STATE3_LO, T1_LO
+ pand STATE3_HI, T1_HI
+ pxor T1_LO, T0_LO
+ pxor T1_HI, T0_HI
+
+ call __store_partial
+
+ call __morus1280_update
+
+ /* store the state: */
+ movdqu STATE0_LO, (0 * 16)(%rdi)
+ movdqu STATE0_HI, (1 * 16)(%rdi)
+ movdqu STATE1_LO, (2 * 16)(%rdi)
+ movdqu STATE1_HI, (3 * 16)(%rdi)
+ movdqu STATE2_LO, (4 * 16)(%rdi)
+ movdqu STATE2_HI, (5 * 16)(%rdi)
+ movdqu STATE3_LO, (6 * 16)(%rdi)
+ movdqu STATE3_HI, (7 * 16)(%rdi)
+ movdqu STATE4_LO, (8 * 16)(%rdi)
+ movdqu STATE4_HI, (9 * 16)(%rdi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_sse2_enc_tail)
+
+/*
+ * void crypto_morus1280_sse2_dec(void *state, const void *src, void *dst,
+ * unsigned int length);
+ */
+ENTRY(crypto_morus1280_sse2_dec)
+ FRAME_BEGIN
+
+ cmp $32, %rcx
+ jb .Ldec_out
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0_LO
+ movdqu (1 * 16)(%rdi), STATE0_HI
+ movdqu (2 * 16)(%rdi), STATE1_LO
+ movdqu (3 * 16)(%rdi), STATE1_HI
+ movdqu (4 * 16)(%rdi), STATE2_LO
+ movdqu (5 * 16)(%rdi), STATE2_HI
+ movdqu (6 * 16)(%rdi), STATE3_LO
+ movdqu (7 * 16)(%rdi), STATE3_HI
+ movdqu (8 * 16)(%rdi), STATE4_LO
+ movdqu (9 * 16)(%rdi), STATE4_HI
+
+ mov %rsi, %r8
+ or %rdx, %r8
+ and $0xF, %r8
+ jnz .Ldec_u_loop
+
+.align 4
+.Ldec_a_loop:
+ movdqa 0(%rsi), MSG_LO
+ movdqa 16(%rsi), MSG_HI
+ pxor STATE0_LO, MSG_LO
+ pxor STATE0_HI, MSG_HI
+ movdqa STATE1_LO, T1_LO
+ movdqa STATE1_HI, T1_HI
+ rol3 T1_HI, T1_LO
+ pxor T1_LO, MSG_LO
+ pxor T1_HI, MSG_HI
+ movdqa STATE2_LO, T1_LO
+ movdqa STATE2_HI, T1_HI
+ pand STATE3_LO, T1_LO
+ pand STATE3_HI, T1_HI
+ pxor T1_LO, MSG_LO
+ pxor T1_HI, MSG_HI
+ movdqa MSG_LO, 0(%rdx)
+ movdqa MSG_HI, 16(%rdx)
+
+ call __morus1280_update
+ sub $32, %rcx
+ add $32, %rsi
+ add $32, %rdx
+ cmp $32, %rcx
+ jge .Ldec_a_loop
+
+ jmp .Ldec_cont
+.align 4
+.Ldec_u_loop:
+ movdqu 0(%rsi), MSG_LO
+ movdqu 16(%rsi), MSG_HI
+ pxor STATE0_LO, MSG_LO
+ pxor STATE0_HI, MSG_HI
+ movdqa STATE1_LO, T1_LO
+ movdqa STATE1_HI, T1_HI
+ rol3 T1_HI, T1_LO
+ pxor T1_LO, MSG_LO
+ pxor T1_HI, MSG_HI
+ movdqa STATE2_LO, T1_LO
+ movdqa STATE2_HI, T1_HI
+ pand STATE3_LO, T1_LO
+ pand STATE3_HI, T1_HI
+ pxor T1_LO, MSG_LO
+ pxor T1_HI, MSG_HI
+ movdqu MSG_LO, 0(%rdx)
+ movdqu MSG_HI, 16(%rdx)
+
+ call __morus1280_update
+ sub $32, %rcx
+ add $32, %rsi
+ add $32, %rdx
+ cmp $32, %rcx
+ jge .Ldec_u_loop
+
+.Ldec_cont:
+ /* store the state: */
+ movdqu STATE0_LO, (0 * 16)(%rdi)
+ movdqu STATE0_HI, (1 * 16)(%rdi)
+ movdqu STATE1_LO, (2 * 16)(%rdi)
+ movdqu STATE1_HI, (3 * 16)(%rdi)
+ movdqu STATE2_LO, (4 * 16)(%rdi)
+ movdqu STATE2_HI, (5 * 16)(%rdi)
+ movdqu STATE3_LO, (6 * 16)(%rdi)
+ movdqu STATE3_HI, (7 * 16)(%rdi)
+ movdqu STATE4_LO, (8 * 16)(%rdi)
+ movdqu STATE4_HI, (9 * 16)(%rdi)
+
+.Ldec_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_sse2_dec)
+
+/*
+ * void crypto_morus1280_sse2_dec_tail(void *state, const void *src, void *dst,
+ * unsigned int length);
+ */
+ENTRY(crypto_morus1280_sse2_dec_tail)
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0_LO
+ movdqu (1 * 16)(%rdi), STATE0_HI
+ movdqu (2 * 16)(%rdi), STATE1_LO
+ movdqu (3 * 16)(%rdi), STATE1_HI
+ movdqu (4 * 16)(%rdi), STATE2_LO
+ movdqu (5 * 16)(%rdi), STATE2_HI
+ movdqu (6 * 16)(%rdi), STATE3_LO
+ movdqu (7 * 16)(%rdi), STATE3_HI
+ movdqu (8 * 16)(%rdi), STATE4_LO
+ movdqu (9 * 16)(%rdi), STATE4_HI
+
+ /* decrypt message: */
+ call __load_partial
+
+ pxor STATE0_LO, MSG_LO
+ pxor STATE0_HI, MSG_HI
+ movdqa STATE1_LO, T1_LO
+ movdqa STATE1_HI, T1_HI
+ rol3 T1_HI, T1_LO
+ pxor T1_LO, MSG_LO
+ pxor T1_HI, MSG_HI
+ movdqa STATE2_LO, T1_LO
+ movdqa STATE2_HI, T1_HI
+ pand STATE3_LO, T1_LO
+ pand STATE3_HI, T1_HI
+ pxor T1_LO, MSG_LO
+ pxor T1_HI, MSG_HI
+ movdqa MSG_LO, T0_LO
+ movdqa MSG_HI, T0_HI
+
+ call __store_partial
+
+ /* mask with byte count: */
+ movq %rcx, T0_LO
+ punpcklbw T0_LO, T0_LO
+ punpcklbw T0_LO, T0_LO
+ punpcklbw T0_LO, T0_LO
+ punpcklbw T0_LO, T0_LO
+ movdqa T0_LO, T0_HI
+ movdqa .Lmorus640_counter_0, T1_LO
+ movdqa .Lmorus640_counter_1, T1_HI
+ pcmpgtb T1_LO, T0_LO
+ pcmpgtb T1_HI, T0_HI
+ pand T0_LO, MSG_LO
+ pand T0_HI, MSG_HI
+
+ call __morus1280_update
+
+ /* store the state: */
+ movdqu STATE0_LO, (0 * 16)(%rdi)
+ movdqu STATE0_HI, (1 * 16)(%rdi)
+ movdqu STATE1_LO, (2 * 16)(%rdi)
+ movdqu STATE1_HI, (3 * 16)(%rdi)
+ movdqu STATE2_LO, (4 * 16)(%rdi)
+ movdqu STATE2_HI, (5 * 16)(%rdi)
+ movdqu STATE3_LO, (6 * 16)(%rdi)
+ movdqu STATE3_HI, (7 * 16)(%rdi)
+ movdqu STATE4_LO, (8 * 16)(%rdi)
+ movdqu STATE4_HI, (9 * 16)(%rdi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_sse2_dec_tail)
+
+/*
+ * void crypto_morus1280_sse2_final(void *state, void *tag_xor,
+ * u64 assoclen, u64 cryptlen);
+ */
+ENTRY(crypto_morus1280_sse2_final)
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0_LO
+ movdqu (1 * 16)(%rdi), STATE0_HI
+ movdqu (2 * 16)(%rdi), STATE1_LO
+ movdqu (3 * 16)(%rdi), STATE1_HI
+ movdqu (4 * 16)(%rdi), STATE2_LO
+ movdqu (5 * 16)(%rdi), STATE2_HI
+ movdqu (6 * 16)(%rdi), STATE3_LO
+ movdqu (7 * 16)(%rdi), STATE3_HI
+ movdqu (8 * 16)(%rdi), STATE4_LO
+ movdqu (9 * 16)(%rdi), STATE4_HI
+
+ /* xor state[0] into state[4]: */
+ pxor STATE0_LO, STATE4_LO
+ pxor STATE0_HI, STATE4_HI
+
+ /* prepare length block: */
+ movq %rdx, MSG_LO
+ movq %rcx, T0_LO
+ pslldq $8, T0_LO
+ pxor T0_LO, MSG_LO
+ psllq $3, MSG_LO /* multiply by 8 (to get bit count) */
+ pxor MSG_HI, MSG_HI
+
+ /* update state: */
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+ call __morus1280_update
+
+ /* xor tag: */
+ movdqu 0(%rsi), MSG_LO
+ movdqu 16(%rsi), MSG_HI
+
+ pxor STATE0_LO, MSG_LO
+ pxor STATE0_HI, MSG_HI
+ movdqa STATE1_LO, T0_LO
+ movdqa STATE1_HI, T0_HI
+ rol3 T0_HI, T0_LO
+ pxor T0_LO, MSG_LO
+ pxor T0_HI, MSG_HI
+ movdqa STATE2_LO, T0_LO
+ movdqa STATE2_HI, T0_HI
+ pand STATE3_LO, T0_LO
+ pand STATE3_HI, T0_HI
+ pxor T0_LO, MSG_LO
+ pxor T0_HI, MSG_HI
+
+ movdqu MSG_LO, 0(%rsi)
+ movdqu MSG_HI, 16(%rsi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus1280_sse2_final)
diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c
new file mode 100644
index 000000000..f40244eaf
--- /dev/null
+++ b/arch/x86/crypto/morus1280-sse2-glue.c
@@ -0,0 +1,63 @@
+/*
+ * The MORUS-1280 Authenticated-Encryption Algorithm
+ * Glue for SSE2 implementation
+ *
+ * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/internal/aead.h>
+#include <crypto/morus1280_glue.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+asmlinkage void crypto_morus1280_sse2_init(void *state, const void *key,
+ const void *iv);
+asmlinkage void crypto_morus1280_sse2_ad(void *state, const void *data,
+ unsigned int length);
+
+asmlinkage void crypto_morus1280_sse2_enc(void *state, const void *src,
+ void *dst, unsigned int length);
+asmlinkage void crypto_morus1280_sse2_dec(void *state, const void *src,
+ void *dst, unsigned int length);
+
+asmlinkage void crypto_morus1280_sse2_enc_tail(void *state, const void *src,
+ void *dst, unsigned int length);
+asmlinkage void crypto_morus1280_sse2_dec_tail(void *state, const void *src,
+ void *dst, unsigned int length);
+
+asmlinkage void crypto_morus1280_sse2_final(void *state, void *tag_xor,
+ u64 assoclen, u64 cryptlen);
+
+MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350);
+
+static int __init crypto_morus1280_sse2_module_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XMM2) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
+ return -ENODEV;
+
+ return crypto_register_aeads(crypto_morus1280_sse2_algs,
+ ARRAY_SIZE(crypto_morus1280_sse2_algs));
+}
+
+static void __exit crypto_morus1280_sse2_module_exit(void)
+{
+ crypto_unregister_aeads(crypto_morus1280_sse2_algs,
+ ARRAY_SIZE(crypto_morus1280_sse2_algs));
+}
+
+module_init(crypto_morus1280_sse2_module_init);
+module_exit(crypto_morus1280_sse2_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- SSE2 implementation");
+MODULE_ALIAS_CRYPTO("morus1280");
+MODULE_ALIAS_CRYPTO("morus1280-sse2");
diff --git a/arch/x86/crypto/morus1280_glue.c b/arch/x86/crypto/morus1280_glue.c
new file mode 100644
index 000000000..7e600f8bc
--- /dev/null
+++ b/arch/x86/crypto/morus1280_glue.c
@@ -0,0 +1,294 @@
+/*
+ * The MORUS-1280 Authenticated-Encryption Algorithm
+ * Common x86 SIMD glue skeleton
+ *
+ * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/cryptd.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/morus1280_glue.h>
+#include <crypto/scatterwalk.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <asm/fpu/api.h>
+
+struct morus1280_state {
+ struct morus1280_block s[MORUS_STATE_BLOCKS];
+};
+
+struct morus1280_ops {
+ int (*skcipher_walk_init)(struct skcipher_walk *walk,
+ struct aead_request *req, bool atomic);
+
+ void (*crypt_blocks)(void *state, const void *src, void *dst,
+ unsigned int length);
+ void (*crypt_tail)(void *state, const void *src, void *dst,
+ unsigned int length);
+};
+
+static void crypto_morus1280_glue_process_ad(
+ struct morus1280_state *state,
+ const struct morus1280_glue_ops *ops,
+ struct scatterlist *sg_src, unsigned int assoclen)
+{
+ struct scatter_walk walk;
+ struct morus1280_block buf;
+ unsigned int pos = 0;
+
+ scatterwalk_start(&walk, sg_src);
+ while (assoclen != 0) {
+ unsigned int size = scatterwalk_clamp(&walk, assoclen);
+ unsigned int left = size;
+ void *mapped = scatterwalk_map(&walk);
+ const u8 *src = (const u8 *)mapped;
+
+ if (pos + size >= MORUS1280_BLOCK_SIZE) {
+ if (pos > 0) {
+ unsigned int fill = MORUS1280_BLOCK_SIZE - pos;
+ memcpy(buf.bytes + pos, src, fill);
+ ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE);
+ pos = 0;
+ left -= fill;
+ src += fill;
+ }
+
+ ops->ad(state, src, left);
+ src += left & ~(MORUS1280_BLOCK_SIZE - 1);
+ left &= MORUS1280_BLOCK_SIZE - 1;
+ }
+
+ memcpy(buf.bytes + pos, src, left);
+
+ pos += left;
+ assoclen -= size;
+ scatterwalk_unmap(mapped);
+ scatterwalk_advance(&walk, size);
+ scatterwalk_done(&walk, 0, assoclen);
+ }
+
+ if (pos > 0) {
+ memset(buf.bytes + pos, 0, MORUS1280_BLOCK_SIZE - pos);
+ ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE);
+ }
+}
+
+static void crypto_morus1280_glue_process_crypt(struct morus1280_state *state,
+ struct morus1280_ops ops,
+ struct skcipher_walk *walk)
+{
+ while (walk->nbytes >= MORUS1280_BLOCK_SIZE) {
+ ops.crypt_blocks(state, walk->src.virt.addr,
+ walk->dst.virt.addr,
+ round_down(walk->nbytes,
+ MORUS1280_BLOCK_SIZE));
+ skcipher_walk_done(walk, walk->nbytes % MORUS1280_BLOCK_SIZE);
+ }
+
+ if (walk->nbytes) {
+ ops.crypt_tail(state, walk->src.virt.addr, walk->dst.virt.addr,
+ walk->nbytes);
+ skcipher_walk_done(walk, 0);
+ }
+}
+
+int crypto_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key,
+ unsigned int keylen)
+{
+ struct morus1280_ctx *ctx = crypto_aead_ctx(aead);
+
+ if (keylen == MORUS1280_BLOCK_SIZE) {
+ memcpy(ctx->key.bytes, key, MORUS1280_BLOCK_SIZE);
+ } else if (keylen == MORUS1280_BLOCK_SIZE / 2) {
+ memcpy(ctx->key.bytes, key, keylen);
+ memcpy(ctx->key.bytes + keylen, key, keylen);
+ } else {
+ crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setkey);
+
+int crypto_morus1280_glue_setauthsize(struct crypto_aead *tfm,
+ unsigned int authsize)
+{
+ return (authsize <= MORUS_MAX_AUTH_SIZE) ? 0 : -EINVAL;
+}
+EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setauthsize);
+
+static void crypto_morus1280_glue_crypt(struct aead_request *req,
+ struct morus1280_ops ops,
+ unsigned int cryptlen,
+ struct morus1280_block *tag_xor)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct morus1280_ctx *ctx = crypto_aead_ctx(tfm);
+ struct morus1280_state state;
+ struct skcipher_walk walk;
+
+ ops.skcipher_walk_init(&walk, req, true);
+
+ kernel_fpu_begin();
+
+ ctx->ops->init(&state, &ctx->key, req->iv);
+ crypto_morus1280_glue_process_ad(&state, ctx->ops, req->src, req->assoclen);
+ crypto_morus1280_glue_process_crypt(&state, ops, &walk);
+ ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen);
+
+ kernel_fpu_end();
+}
+
+int crypto_morus1280_glue_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct morus1280_ctx *ctx = crypto_aead_ctx(tfm);
+ struct morus1280_ops OPS = {
+ .skcipher_walk_init = skcipher_walk_aead_encrypt,
+ .crypt_blocks = ctx->ops->enc,
+ .crypt_tail = ctx->ops->enc_tail,
+ };
+
+ struct morus1280_block tag = {};
+ unsigned int authsize = crypto_aead_authsize(tfm);
+ unsigned int cryptlen = req->cryptlen;
+
+ crypto_morus1280_glue_crypt(req, OPS, cryptlen, &tag);
+
+ scatterwalk_map_and_copy(tag.bytes, req->dst,
+ req->assoclen + cryptlen, authsize, 1);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(crypto_morus1280_glue_encrypt);
+
+int crypto_morus1280_glue_decrypt(struct aead_request *req)
+{
+ static const u8 zeros[MORUS1280_BLOCK_SIZE] = {};
+
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct morus1280_ctx *ctx = crypto_aead_ctx(tfm);
+ struct morus1280_ops OPS = {
+ .skcipher_walk_init = skcipher_walk_aead_decrypt,
+ .crypt_blocks = ctx->ops->dec,
+ .crypt_tail = ctx->ops->dec_tail,
+ };
+
+ struct morus1280_block tag;
+ unsigned int authsize = crypto_aead_authsize(tfm);
+ unsigned int cryptlen = req->cryptlen - authsize;
+
+ scatterwalk_map_and_copy(tag.bytes, req->src,
+ req->assoclen + cryptlen, authsize, 0);
+
+ crypto_morus1280_glue_crypt(req, OPS, cryptlen, &tag);
+
+ return crypto_memneq(tag.bytes, zeros, authsize) ? -EBADMSG : 0;
+}
+EXPORT_SYMBOL_GPL(crypto_morus1280_glue_decrypt);
+
+void crypto_morus1280_glue_init_ops(struct crypto_aead *aead,
+ const struct morus1280_glue_ops *ops)
+{
+ struct morus1280_ctx *ctx = crypto_aead_ctx(aead);
+ ctx->ops = ops;
+}
+EXPORT_SYMBOL_GPL(crypto_morus1280_glue_init_ops);
+
+int cryptd_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key,
+ unsigned int keylen)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ return crypto_aead_setkey(&cryptd_tfm->base, key, keylen);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setkey);
+
+int cryptd_morus1280_glue_setauthsize(struct crypto_aead *aead,
+ unsigned int authsize)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ return crypto_aead_setauthsize(&cryptd_tfm->base, authsize);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setauthsize);
+
+int cryptd_morus1280_glue_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ aead = &cryptd_tfm->base;
+ if (irq_fpu_usable() && (!in_atomic() ||
+ !cryptd_aead_queued(cryptd_tfm)))
+ aead = cryptd_aead_child(cryptd_tfm);
+
+ aead_request_set_tfm(req, aead);
+
+ return crypto_aead_encrypt(req);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_encrypt);
+
+int cryptd_morus1280_glue_decrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ aead = &cryptd_tfm->base;
+ if (irq_fpu_usable() && (!in_atomic() ||
+ !cryptd_aead_queued(cryptd_tfm)))
+ aead = cryptd_aead_child(cryptd_tfm);
+
+ aead_request_set_tfm(req, aead);
+
+ return crypto_aead_decrypt(req);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_decrypt);
+
+int cryptd_morus1280_glue_init_tfm(struct crypto_aead *aead)
+{
+ struct cryptd_aead *cryptd_tfm;
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ const char *name = crypto_aead_alg(aead)->base.cra_driver_name;
+ char internal_name[CRYPTO_MAX_ALG_NAME];
+
+ if (snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s", name)
+ >= CRYPTO_MAX_ALG_NAME)
+ return -ENAMETOOLONG;
+
+ cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+
+ *ctx = cryptd_tfm;
+ crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
+ return 0;
+}
+EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_init_tfm);
+
+void cryptd_morus1280_glue_exit_tfm(struct crypto_aead *aead)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+ cryptd_free_aead(*ctx);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_exit_tfm);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("MORUS-1280 AEAD mode -- glue for x86 optimizations");
diff --git a/arch/x86/crypto/morus640-sse2-asm.S b/arch/x86/crypto/morus640-sse2-asm.S
new file mode 100644
index 000000000..414db4802
--- /dev/null
+++ b/arch/x86/crypto/morus640-sse2-asm.S
@@ -0,0 +1,615 @@
+/*
+ * SSE2 implementation of MORUS-640
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define SHUFFLE_MASK(i0, i1, i2, i3) \
+ (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
+
+#define MASK1 SHUFFLE_MASK(3, 0, 1, 2)
+#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
+#define MASK3 SHUFFLE_MASK(1, 2, 3, 0)
+
+#define STATE0 %xmm0
+#define STATE1 %xmm1
+#define STATE2 %xmm2
+#define STATE3 %xmm3
+#define STATE4 %xmm4
+#define KEY %xmm5
+#define MSG %xmm5
+#define T0 %xmm6
+#define T1 %xmm7
+
+.section .rodata.cst16.morus640_const, "aM", @progbits, 32
+.align 16
+.Lmorus640_const_0:
+ .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+ .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Lmorus640_const_1:
+ .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+ .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+.section .rodata.cst16.morus640_counter, "aM", @progbits, 16
+.align 16
+.Lmorus640_counter:
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+
+.text
+
+.macro morus640_round s0, s1, s2, s3, s4, b, w
+ movdqa \s1, T0
+ pand \s2, T0
+ pxor T0, \s0
+ pxor \s3, \s0
+ movdqa \s0, T0
+ pslld $\b, T0
+ psrld $(32 - \b), \s0
+ pxor T0, \s0
+ pshufd $\w, \s3, \s3
+.endm
+
+/*
+ * __morus640_update: internal ABI
+ * input:
+ * STATE[0-4] - input state
+ * MSG - message block
+ * output:
+ * STATE[0-4] - output state
+ * changed:
+ * T0
+ */
+__morus640_update:
+ morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1
+ pxor MSG, STATE1
+ morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2
+ pxor MSG, STATE2
+ morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3
+ pxor MSG, STATE3
+ morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2
+ pxor MSG, STATE4
+ morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1
+ ret
+ENDPROC(__morus640_update)
+
+
+/*
+ * __morus640_update_zero: internal ABI
+ * input:
+ * STATE[0-4] - input state
+ * output:
+ * STATE[0-4] - output state
+ * changed:
+ * T0
+ */
+__morus640_update_zero:
+ morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1
+ morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2
+ morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3
+ morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2
+ morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1
+ ret
+ENDPROC(__morus640_update_zero)
+
+/*
+ * __load_partial: internal ABI
+ * input:
+ * %rsi - src
+ * %rcx - bytes
+ * output:
+ * MSG - message block
+ * changed:
+ * T0
+ * %r8
+ * %r9
+ */
+__load_partial:
+ xor %r9d, %r9d
+ pxor MSG, MSG
+
+ mov %rcx, %r8
+ and $0x1, %r8
+ jz .Lld_partial_1
+
+ mov %rcx, %r8
+ and $0x1E, %r8
+ add %rsi, %r8
+ mov (%r8), %r9b
+
+.Lld_partial_1:
+ mov %rcx, %r8
+ and $0x2, %r8
+ jz .Lld_partial_2
+
+ mov %rcx, %r8
+ and $0x1C, %r8
+ add %rsi, %r8
+ shl $16, %r9
+ mov (%r8), %r9w
+
+.Lld_partial_2:
+ mov %rcx, %r8
+ and $0x4, %r8
+ jz .Lld_partial_4
+
+ mov %rcx, %r8
+ and $0x18, %r8
+ add %rsi, %r8
+ shl $32, %r9
+ mov (%r8), %r8d
+ xor %r8, %r9
+
+.Lld_partial_4:
+ movq %r9, MSG
+
+ mov %rcx, %r8
+ and $0x8, %r8
+ jz .Lld_partial_8
+
+ mov %rcx, %r8
+ and $0x10, %r8
+ add %rsi, %r8
+ pslldq $8, MSG
+ movq (%r8), T0
+ pxor T0, MSG
+
+.Lld_partial_8:
+ ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ * input:
+ * %rdx - dst
+ * %rcx - bytes
+ * output:
+ * T0 - message block
+ * changed:
+ * %r8
+ * %r9
+ * %r10
+ */
+__store_partial:
+ mov %rcx, %r8
+ mov %rdx, %r9
+
+ movq T0, %r10
+
+ cmp $8, %r8
+ jl .Lst_partial_8
+
+ mov %r10, (%r9)
+ psrldq $8, T0
+ movq T0, %r10
+
+ sub $8, %r8
+ add $8, %r9
+
+.Lst_partial_8:
+ cmp $4, %r8
+ jl .Lst_partial_4
+
+ mov %r10d, (%r9)
+ shr $32, %r10
+
+ sub $4, %r8
+ add $4, %r9
+
+.Lst_partial_4:
+ cmp $2, %r8
+ jl .Lst_partial_2
+
+ mov %r10w, (%r9)
+ shr $16, %r10
+
+ sub $2, %r8
+ add $2, %r9
+
+.Lst_partial_2:
+ cmp $1, %r8
+ jl .Lst_partial_1
+
+ mov %r10b, (%r9)
+
+.Lst_partial_1:
+ ret
+ENDPROC(__store_partial)
+
+/*
+ * void crypto_morus640_sse2_init(void *state, const void *key, const void *iv);
+ */
+ENTRY(crypto_morus640_sse2_init)
+ FRAME_BEGIN
+
+ /* load IV: */
+ movdqu (%rdx), STATE0
+ /* load key: */
+ movdqu (%rsi), KEY
+ movdqa KEY, STATE1
+ /* load all ones: */
+ pcmpeqd STATE2, STATE2
+ /* load the constants: */
+ movdqa .Lmorus640_const_0, STATE3
+ movdqa .Lmorus640_const_1, STATE4
+
+ /* update 16 times with zero: */
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+ call __morus640_update_zero
+
+ /* xor-in the key again after updates: */
+ pxor KEY, STATE1
+
+ /* store the state: */
+ movdqu STATE0, (0 * 16)(%rdi)
+ movdqu STATE1, (1 * 16)(%rdi)
+ movdqu STATE2, (2 * 16)(%rdi)
+ movdqu STATE3, (3 * 16)(%rdi)
+ movdqu STATE4, (4 * 16)(%rdi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus640_sse2_init)
+
+/*
+ * void crypto_morus640_sse2_ad(void *state, const void *data,
+ * unsigned int length);
+ */
+ENTRY(crypto_morus640_sse2_ad)
+ FRAME_BEGIN
+
+ cmp $16, %rdx
+ jb .Lad_out
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0
+ movdqu (1 * 16)(%rdi), STATE1
+ movdqu (2 * 16)(%rdi), STATE2
+ movdqu (3 * 16)(%rdi), STATE3
+ movdqu (4 * 16)(%rdi), STATE4
+
+ mov %rsi, %r8
+ and $0xF, %r8
+ jnz .Lad_u_loop
+
+.align 4
+.Lad_a_loop:
+ movdqa (%rsi), MSG
+ call __morus640_update
+ sub $16, %rdx
+ add $16, %rsi
+ cmp $16, %rdx
+ jge .Lad_a_loop
+
+ jmp .Lad_cont
+.align 4
+.Lad_u_loop:
+ movdqu (%rsi), MSG
+ call __morus640_update
+ sub $16, %rdx
+ add $16, %rsi
+ cmp $16, %rdx
+ jge .Lad_u_loop
+
+.Lad_cont:
+ /* store the state: */
+ movdqu STATE0, (0 * 16)(%rdi)
+ movdqu STATE1, (1 * 16)(%rdi)
+ movdqu STATE2, (2 * 16)(%rdi)
+ movdqu STATE3, (3 * 16)(%rdi)
+ movdqu STATE4, (4 * 16)(%rdi)
+
+.Lad_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_morus640_sse2_ad)
+
+/*
+ * void crypto_morus640_sse2_enc(void *state, const void *src, void *dst,
+ * unsigned int length);
+ */
+ENTRY(crypto_morus640_sse2_enc)
+ FRAME_BEGIN
+
+ cmp $16, %rcx
+ jb .Lenc_out
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0
+ movdqu (1 * 16)(%rdi), STATE1
+ movdqu (2 * 16)(%rdi), STATE2
+ movdqu (3 * 16)(%rdi), STATE3
+ movdqu (4 * 16)(%rdi), STATE4
+
+ mov %rsi, %r8
+ or %rdx, %r8
+ and $0xF, %r8
+ jnz .Lenc_u_loop
+
+.align 4
+.Lenc_a_loop:
+ movdqa (%rsi), MSG
+ movdqa MSG, T0
+ pxor STATE0, T0
+ pshufd $MASK3, STATE1, T1
+ pxor T1, T0
+ movdqa STATE2, T1
+ pand STATE3, T1
+ pxor T1, T0
+ movdqa T0, (%rdx)
+
+ call __morus640_update
+ sub $16, %rcx
+ add $16, %rsi
+ add $16, %rdx
+ cmp $16, %rcx
+ jge .Lenc_a_loop
+
+ jmp .Lenc_cont
+.align 4
+.Lenc_u_loop:
+ movdqu (%rsi), MSG
+ movdqa MSG, T0
+ pxor STATE0, T0
+ pshufd $MASK3, STATE1, T1
+ pxor T1, T0
+ movdqa STATE2, T1
+ pand STATE3, T1
+ pxor T1, T0
+ movdqu T0, (%rdx)
+
+ call __morus640_update
+ sub $16, %rcx
+ add $16, %rsi
+ add $16, %rdx
+ cmp $16, %rcx
+ jge .Lenc_u_loop
+
+.Lenc_cont:
+ /* store the state: */
+ movdqu STATE0, (0 * 16)(%rdi)
+ movdqu STATE1, (1 * 16)(%rdi)
+ movdqu STATE2, (2 * 16)(%rdi)
+ movdqu STATE3, (3 * 16)(%rdi)
+ movdqu STATE4, (4 * 16)(%rdi)
+
+.Lenc_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_morus640_sse2_enc)
+
+/*
+ * void crypto_morus640_sse2_enc_tail(void *state, const void *src, void *dst,
+ * unsigned int length);
+ */
+ENTRY(crypto_morus640_sse2_enc_tail)
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0
+ movdqu (1 * 16)(%rdi), STATE1
+ movdqu (2 * 16)(%rdi), STATE2
+ movdqu (3 * 16)(%rdi), STATE3
+ movdqu (4 * 16)(%rdi), STATE4
+
+ /* encrypt message: */
+ call __load_partial
+
+ movdqa MSG, T0
+ pxor STATE0, T0
+ pshufd $MASK3, STATE1, T1
+ pxor T1, T0
+ movdqa STATE2, T1
+ pand STATE3, T1
+ pxor T1, T0
+
+ call __store_partial
+
+ call __morus640_update
+
+ /* store the state: */
+ movdqu STATE0, (0 * 16)(%rdi)
+ movdqu STATE1, (1 * 16)(%rdi)
+ movdqu STATE2, (2 * 16)(%rdi)
+ movdqu STATE3, (3 * 16)(%rdi)
+ movdqu STATE4, (4 * 16)(%rdi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus640_sse2_enc_tail)
+
+/*
+ * void crypto_morus640_sse2_dec(void *state, const void *src, void *dst,
+ * unsigned int length);
+ */
+ENTRY(crypto_morus640_sse2_dec)
+ FRAME_BEGIN
+
+ cmp $16, %rcx
+ jb .Ldec_out
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0
+ movdqu (1 * 16)(%rdi), STATE1
+ movdqu (2 * 16)(%rdi), STATE2
+ movdqu (3 * 16)(%rdi), STATE3
+ movdqu (4 * 16)(%rdi), STATE4
+
+ mov %rsi, %r8
+ or %rdx, %r8
+ and $0xF, %r8
+ jnz .Ldec_u_loop
+
+.align 4
+.Ldec_a_loop:
+ movdqa (%rsi), MSG
+ pxor STATE0, MSG
+ pshufd $MASK3, STATE1, T0
+ pxor T0, MSG
+ movdqa STATE2, T0
+ pand STATE3, T0
+ pxor T0, MSG
+ movdqa MSG, (%rdx)
+
+ call __morus640_update
+ sub $16, %rcx
+ add $16, %rsi
+ add $16, %rdx
+ cmp $16, %rcx
+ jge .Ldec_a_loop
+
+ jmp .Ldec_cont
+.align 4
+.Ldec_u_loop:
+ movdqu (%rsi), MSG
+ pxor STATE0, MSG
+ pshufd $MASK3, STATE1, T0
+ pxor T0, MSG
+ movdqa STATE2, T0
+ pand STATE3, T0
+ pxor T0, MSG
+ movdqu MSG, (%rdx)
+
+ call __morus640_update
+ sub $16, %rcx
+ add $16, %rsi
+ add $16, %rdx
+ cmp $16, %rcx
+ jge .Ldec_u_loop
+
+.Ldec_cont:
+ /* store the state: */
+ movdqu STATE0, (0 * 16)(%rdi)
+ movdqu STATE1, (1 * 16)(%rdi)
+ movdqu STATE2, (2 * 16)(%rdi)
+ movdqu STATE3, (3 * 16)(%rdi)
+ movdqu STATE4, (4 * 16)(%rdi)
+
+.Ldec_out:
+ FRAME_END
+ ret
+ENDPROC(crypto_morus640_sse2_dec)
+
+/*
+ * void crypto_morus640_sse2_dec_tail(void *state, const void *src, void *dst,
+ * unsigned int length);
+ */
+ENTRY(crypto_morus640_sse2_dec_tail)
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0
+ movdqu (1 * 16)(%rdi), STATE1
+ movdqu (2 * 16)(%rdi), STATE2
+ movdqu (3 * 16)(%rdi), STATE3
+ movdqu (4 * 16)(%rdi), STATE4
+
+ /* decrypt message: */
+ call __load_partial
+
+ pxor STATE0, MSG
+ pshufd $MASK3, STATE1, T0
+ pxor T0, MSG
+ movdqa STATE2, T0
+ pand STATE3, T0
+ pxor T0, MSG
+ movdqa MSG, T0
+
+ call __store_partial
+
+ /* mask with byte count: */
+ movq %rcx, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ punpcklbw T0, T0
+ movdqa .Lmorus640_counter, T1
+ pcmpgtb T1, T0
+ pand T0, MSG
+
+ call __morus640_update
+
+ /* store the state: */
+ movdqu STATE0, (0 * 16)(%rdi)
+ movdqu STATE1, (1 * 16)(%rdi)
+ movdqu STATE2, (2 * 16)(%rdi)
+ movdqu STATE3, (3 * 16)(%rdi)
+ movdqu STATE4, (4 * 16)(%rdi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus640_sse2_dec_tail)
+
+/*
+ * void crypto_morus640_sse2_final(void *state, void *tag_xor,
+ * u64 assoclen, u64 cryptlen);
+ */
+ENTRY(crypto_morus640_sse2_final)
+ FRAME_BEGIN
+
+ /* load the state: */
+ movdqu (0 * 16)(%rdi), STATE0
+ movdqu (1 * 16)(%rdi), STATE1
+ movdqu (2 * 16)(%rdi), STATE2
+ movdqu (3 * 16)(%rdi), STATE3
+ movdqu (4 * 16)(%rdi), STATE4
+
+ /* xor state[0] into state[4]: */
+ pxor STATE0, STATE4
+
+ /* prepare length block: */
+ movq %rdx, MSG
+ movq %rcx, T0
+ pslldq $8, T0
+ pxor T0, MSG
+ psllq $3, MSG /* multiply by 8 (to get bit count) */
+
+ /* update state: */
+ call __morus640_update
+ call __morus640_update
+ call __morus640_update
+ call __morus640_update
+ call __morus640_update
+ call __morus640_update
+ call __morus640_update
+ call __morus640_update
+ call __morus640_update
+ call __morus640_update
+
+ /* xor tag: */
+ movdqu (%rsi), MSG
+
+ pxor STATE0, MSG
+ pshufd $MASK3, STATE1, T0
+ pxor T0, MSG
+ movdqa STATE2, T0
+ pand STATE3, T0
+ pxor T0, MSG
+
+ movdqu MSG, (%rsi)
+
+ FRAME_END
+ ret
+ENDPROC(crypto_morus640_sse2_final)
diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c
new file mode 100644
index 000000000..9afaf8f85
--- /dev/null
+++ b/arch/x86/crypto/morus640-sse2-glue.c
@@ -0,0 +1,63 @@
+/*
+ * The MORUS-640 Authenticated-Encryption Algorithm
+ * Glue for SSE2 implementation
+ *
+ * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/internal/aead.h>
+#include <crypto/morus640_glue.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+asmlinkage void crypto_morus640_sse2_init(void *state, const void *key,
+ const void *iv);
+asmlinkage void crypto_morus640_sse2_ad(void *state, const void *data,
+ unsigned int length);
+
+asmlinkage void crypto_morus640_sse2_enc(void *state, const void *src,
+ void *dst, unsigned int length);
+asmlinkage void crypto_morus640_sse2_dec(void *state, const void *src,
+ void *dst, unsigned int length);
+
+asmlinkage void crypto_morus640_sse2_enc_tail(void *state, const void *src,
+ void *dst, unsigned int length);
+asmlinkage void crypto_morus640_sse2_dec_tail(void *state, const void *src,
+ void *dst, unsigned int length);
+
+asmlinkage void crypto_morus640_sse2_final(void *state, void *tag_xor,
+ u64 assoclen, u64 cryptlen);
+
+MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400);
+
+static int __init crypto_morus640_sse2_module_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XMM2) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
+ return -ENODEV;
+
+ return crypto_register_aeads(crypto_morus640_sse2_algs,
+ ARRAY_SIZE(crypto_morus640_sse2_algs));
+}
+
+static void __exit crypto_morus640_sse2_module_exit(void)
+{
+ crypto_unregister_aeads(crypto_morus640_sse2_algs,
+ ARRAY_SIZE(crypto_morus640_sse2_algs));
+}
+
+module_init(crypto_morus640_sse2_module_init);
+module_exit(crypto_morus640_sse2_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("MORUS-640 AEAD algorithm -- SSE2 implementation");
+MODULE_ALIAS_CRYPTO("morus640");
+MODULE_ALIAS_CRYPTO("morus640-sse2");
diff --git a/arch/x86/crypto/morus640_glue.c b/arch/x86/crypto/morus640_glue.c
new file mode 100644
index 000000000..cb3a81732
--- /dev/null
+++ b/arch/x86/crypto/morus640_glue.c
@@ -0,0 +1,289 @@
+/*
+ * The MORUS-640 Authenticated-Encryption Algorithm
+ * Common x86 SIMD glue skeleton
+ *
+ * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/cryptd.h>
+#include <crypto/internal/aead.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/morus640_glue.h>
+#include <crypto/scatterwalk.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <asm/fpu/api.h>
+
+struct morus640_state {
+ struct morus640_block s[MORUS_STATE_BLOCKS];
+};
+
+struct morus640_ops {
+ int (*skcipher_walk_init)(struct skcipher_walk *walk,
+ struct aead_request *req, bool atomic);
+
+ void (*crypt_blocks)(void *state, const void *src, void *dst,
+ unsigned int length);
+ void (*crypt_tail)(void *state, const void *src, void *dst,
+ unsigned int length);
+};
+
+static void crypto_morus640_glue_process_ad(
+ struct morus640_state *state,
+ const struct morus640_glue_ops *ops,
+ struct scatterlist *sg_src, unsigned int assoclen)
+{
+ struct scatter_walk walk;
+ struct morus640_block buf;
+ unsigned int pos = 0;
+
+ scatterwalk_start(&walk, sg_src);
+ while (assoclen != 0) {
+ unsigned int size = scatterwalk_clamp(&walk, assoclen);
+ unsigned int left = size;
+ void *mapped = scatterwalk_map(&walk);
+ const u8 *src = (const u8 *)mapped;
+
+ if (pos + size >= MORUS640_BLOCK_SIZE) {
+ if (pos > 0) {
+ unsigned int fill = MORUS640_BLOCK_SIZE - pos;
+ memcpy(buf.bytes + pos, src, fill);
+ ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE);
+ pos = 0;
+ left -= fill;
+ src += fill;
+ }
+
+ ops->ad(state, src, left);
+ src += left & ~(MORUS640_BLOCK_SIZE - 1);
+ left &= MORUS640_BLOCK_SIZE - 1;
+ }
+
+ memcpy(buf.bytes + pos, src, left);
+
+ pos += left;
+ assoclen -= size;
+ scatterwalk_unmap(mapped);
+ scatterwalk_advance(&walk, size);
+ scatterwalk_done(&walk, 0, assoclen);
+ }
+
+ if (pos > 0) {
+ memset(buf.bytes + pos, 0, MORUS640_BLOCK_SIZE - pos);
+ ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE);
+ }
+}
+
+static void crypto_morus640_glue_process_crypt(struct morus640_state *state,
+ struct morus640_ops ops,
+ struct skcipher_walk *walk)
+{
+ while (walk->nbytes >= MORUS640_BLOCK_SIZE) {
+ ops.crypt_blocks(state, walk->src.virt.addr,
+ walk->dst.virt.addr,
+ round_down(walk->nbytes, MORUS640_BLOCK_SIZE));
+ skcipher_walk_done(walk, walk->nbytes % MORUS640_BLOCK_SIZE);
+ }
+
+ if (walk->nbytes) {
+ ops.crypt_tail(state, walk->src.virt.addr, walk->dst.virt.addr,
+ walk->nbytes);
+ skcipher_walk_done(walk, 0);
+ }
+}
+
+int crypto_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key,
+ unsigned int keylen)
+{
+ struct morus640_ctx *ctx = crypto_aead_ctx(aead);
+
+ if (keylen != MORUS640_BLOCK_SIZE) {
+ crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+
+ memcpy(ctx->key.bytes, key, MORUS640_BLOCK_SIZE);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(crypto_morus640_glue_setkey);
+
+int crypto_morus640_glue_setauthsize(struct crypto_aead *tfm,
+ unsigned int authsize)
+{
+ return (authsize <= MORUS_MAX_AUTH_SIZE) ? 0 : -EINVAL;
+}
+EXPORT_SYMBOL_GPL(crypto_morus640_glue_setauthsize);
+
+static void crypto_morus640_glue_crypt(struct aead_request *req,
+ struct morus640_ops ops,
+ unsigned int cryptlen,
+ struct morus640_block *tag_xor)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct morus640_ctx *ctx = crypto_aead_ctx(tfm);
+ struct morus640_state state;
+ struct skcipher_walk walk;
+
+ ops.skcipher_walk_init(&walk, req, true);
+
+ kernel_fpu_begin();
+
+ ctx->ops->init(&state, &ctx->key, req->iv);
+ crypto_morus640_glue_process_ad(&state, ctx->ops, req->src, req->assoclen);
+ crypto_morus640_glue_process_crypt(&state, ops, &walk);
+ ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen);
+
+ kernel_fpu_end();
+}
+
+int crypto_morus640_glue_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct morus640_ctx *ctx = crypto_aead_ctx(tfm);
+ struct morus640_ops OPS = {
+ .skcipher_walk_init = skcipher_walk_aead_encrypt,
+ .crypt_blocks = ctx->ops->enc,
+ .crypt_tail = ctx->ops->enc_tail,
+ };
+
+ struct morus640_block tag = {};
+ unsigned int authsize = crypto_aead_authsize(tfm);
+ unsigned int cryptlen = req->cryptlen;
+
+ crypto_morus640_glue_crypt(req, OPS, cryptlen, &tag);
+
+ scatterwalk_map_and_copy(tag.bytes, req->dst,
+ req->assoclen + cryptlen, authsize, 1);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(crypto_morus640_glue_encrypt);
+
+int crypto_morus640_glue_decrypt(struct aead_request *req)
+{
+ static const u8 zeros[MORUS640_BLOCK_SIZE] = {};
+
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct morus640_ctx *ctx = crypto_aead_ctx(tfm);
+ struct morus640_ops OPS = {
+ .skcipher_walk_init = skcipher_walk_aead_decrypt,
+ .crypt_blocks = ctx->ops->dec,
+ .crypt_tail = ctx->ops->dec_tail,
+ };
+
+ struct morus640_block tag;
+ unsigned int authsize = crypto_aead_authsize(tfm);
+ unsigned int cryptlen = req->cryptlen - authsize;
+
+ scatterwalk_map_and_copy(tag.bytes, req->src,
+ req->assoclen + cryptlen, authsize, 0);
+
+ crypto_morus640_glue_crypt(req, OPS, cryptlen, &tag);
+
+ return crypto_memneq(tag.bytes, zeros, authsize) ? -EBADMSG : 0;
+}
+EXPORT_SYMBOL_GPL(crypto_morus640_glue_decrypt);
+
+void crypto_morus640_glue_init_ops(struct crypto_aead *aead,
+ const struct morus640_glue_ops *ops)
+{
+ struct morus640_ctx *ctx = crypto_aead_ctx(aead);
+ ctx->ops = ops;
+}
+EXPORT_SYMBOL_GPL(crypto_morus640_glue_init_ops);
+
+int cryptd_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key,
+ unsigned int keylen)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ return crypto_aead_setkey(&cryptd_tfm->base, key, keylen);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setkey);
+
+int cryptd_morus640_glue_setauthsize(struct crypto_aead *aead,
+ unsigned int authsize)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ return crypto_aead_setauthsize(&cryptd_tfm->base, authsize);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setauthsize);
+
+int cryptd_morus640_glue_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ aead = &cryptd_tfm->base;
+ if (irq_fpu_usable() && (!in_atomic() ||
+ !cryptd_aead_queued(cryptd_tfm)))
+ aead = cryptd_aead_child(cryptd_tfm);
+
+ aead_request_set_tfm(req, aead);
+
+ return crypto_aead_encrypt(req);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus640_glue_encrypt);
+
+int cryptd_morus640_glue_decrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ struct cryptd_aead *cryptd_tfm = *ctx;
+
+ aead = &cryptd_tfm->base;
+ if (irq_fpu_usable() && (!in_atomic() ||
+ !cryptd_aead_queued(cryptd_tfm)))
+ aead = cryptd_aead_child(cryptd_tfm);
+
+ aead_request_set_tfm(req, aead);
+
+ return crypto_aead_decrypt(req);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus640_glue_decrypt);
+
+int cryptd_morus640_glue_init_tfm(struct crypto_aead *aead)
+{
+ struct cryptd_aead *cryptd_tfm;
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+ const char *name = crypto_aead_alg(aead)->base.cra_driver_name;
+ char internal_name[CRYPTO_MAX_ALG_NAME];
+
+ if (snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s", name)
+ >= CRYPTO_MAX_ALG_NAME)
+ return -ENAMETOOLONG;
+
+ cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+
+ *ctx = cryptd_tfm;
+ crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
+ return 0;
+}
+EXPORT_SYMBOL_GPL(cryptd_morus640_glue_init_tfm);
+
+void cryptd_morus640_glue_exit_tfm(struct crypto_aead *aead)
+{
+ struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+ cryptd_free_aead(*ctx);
+}
+EXPORT_SYMBOL_GPL(cryptd_morus640_glue_exit_tfm);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("MORUS-640 AEAD mode -- glue for x86 optimizations");
diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S
new file mode 100644
index 000000000..8457cdd47
--- /dev/null
+++ b/arch/x86/crypto/poly1305-avx2-x86_64.S
@@ -0,0 +1,394 @@
+/*
+ * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.ANMASK, "aM", @progbits, 32
+.align 32
+ANMASK: .octa 0x0000000003ffffff0000000003ffffff
+ .octa 0x0000000003ffffff0000000003ffffff
+
+.section .rodata.cst32.ORMASK, "aM", @progbits, 32
+.align 32
+ORMASK: .octa 0x00000000010000000000000001000000
+ .octa 0x00000000010000000000000001000000
+
+.text
+
+#define h0 0x00(%rdi)
+#define h1 0x04(%rdi)
+#define h2 0x08(%rdi)
+#define h3 0x0c(%rdi)
+#define h4 0x10(%rdi)
+#define r0 0x00(%rdx)
+#define r1 0x04(%rdx)
+#define r2 0x08(%rdx)
+#define r3 0x0c(%rdx)
+#define r4 0x10(%rdx)
+#define u0 0x00(%r8)
+#define u1 0x04(%r8)
+#define u2 0x08(%r8)
+#define u3 0x0c(%r8)
+#define u4 0x10(%r8)
+#define w0 0x14(%r8)
+#define w1 0x18(%r8)
+#define w2 0x1c(%r8)
+#define w3 0x20(%r8)
+#define w4 0x24(%r8)
+#define y0 0x28(%r8)
+#define y1 0x2c(%r8)
+#define y2 0x30(%r8)
+#define y3 0x34(%r8)
+#define y4 0x38(%r8)
+#define m %rsi
+#define hc0 %ymm0
+#define hc1 %ymm1
+#define hc2 %ymm2
+#define hc3 %ymm3
+#define hc4 %ymm4
+#define hc0x %xmm0
+#define hc1x %xmm1
+#define hc2x %xmm2
+#define hc3x %xmm3
+#define hc4x %xmm4
+#define t1 %ymm5
+#define t2 %ymm6
+#define t1x %xmm5
+#define t2x %xmm6
+#define ruwy0 %ymm7
+#define ruwy1 %ymm8
+#define ruwy2 %ymm9
+#define ruwy3 %ymm10
+#define ruwy4 %ymm11
+#define ruwy0x %xmm7
+#define ruwy1x %xmm8
+#define ruwy2x %xmm9
+#define ruwy3x %xmm10
+#define ruwy4x %xmm11
+#define svxz1 %ymm12
+#define svxz2 %ymm13
+#define svxz3 %ymm14
+#define svxz4 %ymm15
+#define d0 %r9
+#define d1 %r10
+#define d2 %r11
+#define d3 %r12
+#define d4 %r13
+
+ENTRY(poly1305_4block_avx2)
+ # %rdi: Accumulator h[5]
+ # %rsi: 64 byte input block m
+ # %rdx: Poly1305 key r[5]
+ # %rcx: Quadblock count
+ # %r8: Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5],
+
+ # This four-block variant uses loop unrolled block processing. It
+ # requires 4 Poly1305 keys: r, r^2, r^3 and r^4:
+ # h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
+
+ vzeroupper
+ push %rbx
+ push %r12
+ push %r13
+
+ # combine r0,u0,w0,y0
+ vmovd y0,ruwy0x
+ vmovd w0,t1x
+ vpunpcklqdq t1,ruwy0,ruwy0
+ vmovd u0,t1x
+ vmovd r0,t2x
+ vpunpcklqdq t2,t1,t1
+ vperm2i128 $0x20,t1,ruwy0,ruwy0
+
+ # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5
+ vmovd y1,ruwy1x
+ vmovd w1,t1x
+ vpunpcklqdq t1,ruwy1,ruwy1
+ vmovd u1,t1x
+ vmovd r1,t2x
+ vpunpcklqdq t2,t1,t1
+ vperm2i128 $0x20,t1,ruwy1,ruwy1
+ vpslld $2,ruwy1,svxz1
+ vpaddd ruwy1,svxz1,svxz1
+
+ # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5
+ vmovd y2,ruwy2x
+ vmovd w2,t1x
+ vpunpcklqdq t1,ruwy2,ruwy2
+ vmovd u2,t1x
+ vmovd r2,t2x
+ vpunpcklqdq t2,t1,t1
+ vperm2i128 $0x20,t1,ruwy2,ruwy2
+ vpslld $2,ruwy2,svxz2
+ vpaddd ruwy2,svxz2,svxz2
+
+ # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5
+ vmovd y3,ruwy3x
+ vmovd w3,t1x
+ vpunpcklqdq t1,ruwy3,ruwy3
+ vmovd u3,t1x
+ vmovd r3,t2x
+ vpunpcklqdq t2,t1,t1
+ vperm2i128 $0x20,t1,ruwy3,ruwy3
+ vpslld $2,ruwy3,svxz3
+ vpaddd ruwy3,svxz3,svxz3
+
+ # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5
+ vmovd y4,ruwy4x
+ vmovd w4,t1x
+ vpunpcklqdq t1,ruwy4,ruwy4
+ vmovd u4,t1x
+ vmovd r4,t2x
+ vpunpcklqdq t2,t1,t1
+ vperm2i128 $0x20,t1,ruwy4,ruwy4
+ vpslld $2,ruwy4,svxz4
+ vpaddd ruwy4,svxz4,svxz4
+
+.Ldoblock4:
+ # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff,
+ # m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0]
+ vmovd 0x00(m),hc0x
+ vmovd 0x10(m),t1x
+ vpunpcklqdq t1,hc0,hc0
+ vmovd 0x20(m),t1x
+ vmovd 0x30(m),t2x
+ vpunpcklqdq t2,t1,t1
+ vperm2i128 $0x20,t1,hc0,hc0
+ vpand ANMASK(%rip),hc0,hc0
+ vmovd h0,t1x
+ vpaddd t1,hc0,hc0
+ # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff,
+ # (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1]
+ vmovd 0x03(m),hc1x
+ vmovd 0x13(m),t1x
+ vpunpcklqdq t1,hc1,hc1
+ vmovd 0x23(m),t1x
+ vmovd 0x33(m),t2x
+ vpunpcklqdq t2,t1,t1
+ vperm2i128 $0x20,t1,hc1,hc1
+ vpsrld $2,hc1,hc1
+ vpand ANMASK(%rip),hc1,hc1
+ vmovd h1,t1x
+ vpaddd t1,hc1,hc1
+ # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff,
+ # (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2]
+ vmovd 0x06(m),hc2x
+ vmovd 0x16(m),t1x
+ vpunpcklqdq t1,hc2,hc2
+ vmovd 0x26(m),t1x
+ vmovd 0x36(m),t2x
+ vpunpcklqdq t2,t1,t1
+ vperm2i128 $0x20,t1,hc2,hc2
+ vpsrld $4,hc2,hc2
+ vpand ANMASK(%rip),hc2,hc2
+ vmovd h2,t1x
+ vpaddd t1,hc2,hc2
+ # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff,
+ # (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3]
+ vmovd 0x09(m),hc3x
+ vmovd 0x19(m),t1x
+ vpunpcklqdq t1,hc3,hc3
+ vmovd 0x29(m),t1x
+ vmovd 0x39(m),t2x
+ vpunpcklqdq t2,t1,t1
+ vperm2i128 $0x20,t1,hc3,hc3
+ vpsrld $6,hc3,hc3
+ vpand ANMASK(%rip),hc3,hc3
+ vmovd h3,t1x
+ vpaddd t1,hc3,hc3
+ # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24),
+ # (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4]
+ vmovd 0x0c(m),hc4x
+ vmovd 0x1c(m),t1x
+ vpunpcklqdq t1,hc4,hc4
+ vmovd 0x2c(m),t1x
+ vmovd 0x3c(m),t2x
+ vpunpcklqdq t2,t1,t1
+ vperm2i128 $0x20,t1,hc4,hc4
+ vpsrld $8,hc4,hc4
+ vpor ORMASK(%rip),hc4,hc4
+ vmovd h4,t1x
+ vpaddd t1,hc4,hc4
+
+ # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ]
+ vpmuludq hc0,ruwy0,t1
+ # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ]
+ vpmuludq hc1,svxz4,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ]
+ vpmuludq hc2,svxz3,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ]
+ vpmuludq hc3,svxz2,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ]
+ vpmuludq hc4,svxz1,t2
+ vpaddq t2,t1,t1
+ # d0 = t1[0] + t1[1] + t[2] + t[3]
+ vpermq $0xee,t1,t2
+ vpaddq t2,t1,t1
+ vpsrldq $8,t1,t2
+ vpaddq t2,t1,t1
+ vmovq t1x,d0
+
+ # t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ]
+ vpmuludq hc0,ruwy1,t1
+ # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ]
+ vpmuludq hc1,ruwy0,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ]
+ vpmuludq hc2,svxz4,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ]
+ vpmuludq hc3,svxz3,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ]
+ vpmuludq hc4,svxz2,t2
+ vpaddq t2,t1,t1
+ # d1 = t1[0] + t1[1] + t1[3] + t1[4]
+ vpermq $0xee,t1,t2
+ vpaddq t2,t1,t1
+ vpsrldq $8,t1,t2
+ vpaddq t2,t1,t1
+ vmovq t1x,d1
+
+ # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ]
+ vpmuludq hc0,ruwy2,t1
+ # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ]
+ vpmuludq hc1,ruwy1,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ]
+ vpmuludq hc2,ruwy0,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ]
+ vpmuludq hc3,svxz4,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ]
+ vpmuludq hc4,svxz3,t2
+ vpaddq t2,t1,t1
+ # d2 = t1[0] + t1[1] + t1[2] + t1[3]
+ vpermq $0xee,t1,t2
+ vpaddq t2,t1,t1
+ vpsrldq $8,t1,t2
+ vpaddq t2,t1,t1
+ vmovq t1x,d2
+
+ # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ]
+ vpmuludq hc0,ruwy3,t1
+ # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ]
+ vpmuludq hc1,ruwy2,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ]
+ vpmuludq hc2,ruwy1,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ]
+ vpmuludq hc3,ruwy0,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ]
+ vpmuludq hc4,svxz4,t2
+ vpaddq t2,t1,t1
+ # d3 = t1[0] + t1[1] + t1[2] + t1[3]
+ vpermq $0xee,t1,t2
+ vpaddq t2,t1,t1
+ vpsrldq $8,t1,t2
+ vpaddq t2,t1,t1
+ vmovq t1x,d3
+
+ # t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ]
+ vpmuludq hc0,ruwy4,t1
+ # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ]
+ vpmuludq hc1,ruwy3,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ]
+ vpmuludq hc2,ruwy2,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ]
+ vpmuludq hc3,ruwy1,t2
+ vpaddq t2,t1,t1
+ # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ]
+ vpmuludq hc4,ruwy0,t2
+ vpaddq t2,t1,t1
+ # d4 = t1[0] + t1[1] + t1[2] + t1[3]
+ vpermq $0xee,t1,t2
+ vpaddq t2,t1,t1
+ vpsrldq $8,t1,t2
+ vpaddq t2,t1,t1
+ vmovq t1x,d4
+
+ # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+ # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+ # amount. Careful: we must not assume the carry bits 'd0 >> 26',
+ # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+ # integers. It's true in a single-block implementation, but not here.
+
+ # d1 += d0 >> 26
+ mov d0,%rax
+ shr $26,%rax
+ add %rax,d1
+ # h0 = d0 & 0x3ffffff
+ mov d0,%rbx
+ and $0x3ffffff,%ebx
+
+ # d2 += d1 >> 26
+ mov d1,%rax
+ shr $26,%rax
+ add %rax,d2
+ # h1 = d1 & 0x3ffffff
+ mov d1,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h1
+
+ # d3 += d2 >> 26
+ mov d2,%rax
+ shr $26,%rax
+ add %rax,d3
+ # h2 = d2 & 0x3ffffff
+ mov d2,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h2
+
+ # d4 += d3 >> 26
+ mov d3,%rax
+ shr $26,%rax
+ add %rax,d4
+ # h3 = d3 & 0x3ffffff
+ mov d3,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h3
+
+ # h0 += (d4 >> 26) * 5
+ mov d4,%rax
+ shr $26,%rax
+ lea (%rax,%rax,4),%rax
+ add %rax,%rbx
+ # h4 = d4 & 0x3ffffff
+ mov d4,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h4
+
+ # h1 += h0 >> 26
+ mov %rbx,%rax
+ shr $26,%rax
+ add %eax,h1
+ # h0 = h0 & 0x3ffffff
+ andl $0x3ffffff,%ebx
+ mov %ebx,h0
+
+ add $0x40,m
+ dec %rcx
+ jnz .Ldoblock4
+
+ vzeroupper
+ pop %r13
+ pop %r12
+ pop %rbx
+ ret
+ENDPROC(poly1305_4block_avx2)
diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S
new file mode 100644
index 000000000..5851c7418
--- /dev/null
+++ b/arch/x86/crypto/poly1305-sse2-x86_64.S
@@ -0,0 +1,590 @@
+/*
+ * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst16.ANMASK, "aM", @progbits, 16
+.align 16
+ANMASK: .octa 0x0000000003ffffff0000000003ffffff
+
+.section .rodata.cst16.ORMASK, "aM", @progbits, 16
+.align 16
+ORMASK: .octa 0x00000000010000000000000001000000
+
+.text
+
+#define h0 0x00(%rdi)
+#define h1 0x04(%rdi)
+#define h2 0x08(%rdi)
+#define h3 0x0c(%rdi)
+#define h4 0x10(%rdi)
+#define r0 0x00(%rdx)
+#define r1 0x04(%rdx)
+#define r2 0x08(%rdx)
+#define r3 0x0c(%rdx)
+#define r4 0x10(%rdx)
+#define s1 0x00(%rsp)
+#define s2 0x04(%rsp)
+#define s3 0x08(%rsp)
+#define s4 0x0c(%rsp)
+#define m %rsi
+#define h01 %xmm0
+#define h23 %xmm1
+#define h44 %xmm2
+#define t1 %xmm3
+#define t2 %xmm4
+#define t3 %xmm5
+#define t4 %xmm6
+#define mask %xmm7
+#define d0 %r8
+#define d1 %r9
+#define d2 %r10
+#define d3 %r11
+#define d4 %r12
+
+ENTRY(poly1305_block_sse2)
+ # %rdi: Accumulator h[5]
+ # %rsi: 16 byte input block m
+ # %rdx: Poly1305 key r[5]
+ # %rcx: Block count
+
+ # This single block variant tries to improve performance by doing two
+ # multiplications in parallel using SSE instructions. There is quite
+ # some quardword packing involved, hence the speedup is marginal.
+
+ push %rbx
+ push %r12
+ sub $0x10,%rsp
+
+ # s1..s4 = r1..r4 * 5
+ mov r1,%eax
+ lea (%eax,%eax,4),%eax
+ mov %eax,s1
+ mov r2,%eax
+ lea (%eax,%eax,4),%eax
+ mov %eax,s2
+ mov r3,%eax
+ lea (%eax,%eax,4),%eax
+ mov %eax,s3
+ mov r4,%eax
+ lea (%eax,%eax,4),%eax
+ mov %eax,s4
+
+ movdqa ANMASK(%rip),mask
+
+.Ldoblock:
+ # h01 = [0, h1, 0, h0]
+ # h23 = [0, h3, 0, h2]
+ # h44 = [0, h4, 0, h4]
+ movd h0,h01
+ movd h1,t1
+ movd h2,h23
+ movd h3,t2
+ movd h4,h44
+ punpcklqdq t1,h01
+ punpcklqdq t2,h23
+ punpcklqdq h44,h44
+
+ # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
+ movd 0x00(m),t1
+ movd 0x03(m),t2
+ psrld $2,t2
+ punpcklqdq t2,t1
+ pand mask,t1
+ paddd t1,h01
+ # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
+ movd 0x06(m),t1
+ movd 0x09(m),t2
+ psrld $4,t1
+ psrld $6,t2
+ punpcklqdq t2,t1
+ pand mask,t1
+ paddd t1,h23
+ # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
+ mov 0x0c(m),%eax
+ shr $8,%eax
+ or $0x01000000,%eax
+ movd %eax,t1
+ pshufd $0xc4,t1,t1
+ paddd t1,h44
+
+ # t1[0] = h0 * r0 + h2 * s3
+ # t1[1] = h1 * s4 + h3 * s2
+ movd r0,t1
+ movd s4,t2
+ punpcklqdq t2,t1
+ pmuludq h01,t1
+ movd s3,t2
+ movd s2,t3
+ punpcklqdq t3,t2
+ pmuludq h23,t2
+ paddq t2,t1
+ # t2[0] = h0 * r1 + h2 * s4
+ # t2[1] = h1 * r0 + h3 * s3
+ movd r1,t2
+ movd r0,t3
+ punpcklqdq t3,t2
+ pmuludq h01,t2
+ movd s4,t3
+ movd s3,t4
+ punpcklqdq t4,t3
+ pmuludq h23,t3
+ paddq t3,t2
+ # t3[0] = h4 * s1
+ # t3[1] = h4 * s2
+ movd s1,t3
+ movd s2,t4
+ punpcklqdq t4,t3
+ pmuludq h44,t3
+ # d0 = t1[0] + t1[1] + t3[0]
+ # d1 = t2[0] + t2[1] + t3[1]
+ movdqa t1,t4
+ punpcklqdq t2,t4
+ punpckhqdq t2,t1
+ paddq t4,t1
+ paddq t3,t1
+ movq t1,d0
+ psrldq $8,t1
+ movq t1,d1
+
+ # t1[0] = h0 * r2 + h2 * r0
+ # t1[1] = h1 * r1 + h3 * s4
+ movd r2,t1
+ movd r1,t2
+ punpcklqdq t2,t1
+ pmuludq h01,t1
+ movd r0,t2
+ movd s4,t3
+ punpcklqdq t3,t2
+ pmuludq h23,t2
+ paddq t2,t1
+ # t2[0] = h0 * r3 + h2 * r1
+ # t2[1] = h1 * r2 + h3 * r0
+ movd r3,t2
+ movd r2,t3
+ punpcklqdq t3,t2
+ pmuludq h01,t2
+ movd r1,t3
+ movd r0,t4
+ punpcklqdq t4,t3
+ pmuludq h23,t3
+ paddq t3,t2
+ # t3[0] = h4 * s3
+ # t3[1] = h4 * s4
+ movd s3,t3
+ movd s4,t4
+ punpcklqdq t4,t3
+ pmuludq h44,t3
+ # d2 = t1[0] + t1[1] + t3[0]
+ # d3 = t2[0] + t2[1] + t3[1]
+ movdqa t1,t4
+ punpcklqdq t2,t4
+ punpckhqdq t2,t1
+ paddq t4,t1
+ paddq t3,t1
+ movq t1,d2
+ psrldq $8,t1
+ movq t1,d3
+
+ # t1[0] = h0 * r4 + h2 * r2
+ # t1[1] = h1 * r3 + h3 * r1
+ movd r4,t1
+ movd r3,t2
+ punpcklqdq t2,t1
+ pmuludq h01,t1
+ movd r2,t2
+ movd r1,t3
+ punpcklqdq t3,t2
+ pmuludq h23,t2
+ paddq t2,t1
+ # t3[0] = h4 * r0
+ movd r0,t3
+ pmuludq h44,t3
+ # d4 = t1[0] + t1[1] + t3[0]
+ movdqa t1,t4
+ psrldq $8,t4
+ paddq t4,t1
+ paddq t3,t1
+ movq t1,d4
+
+ # d1 += d0 >> 26
+ mov d0,%rax
+ shr $26,%rax
+ add %rax,d1
+ # h0 = d0 & 0x3ffffff
+ mov d0,%rbx
+ and $0x3ffffff,%ebx
+
+ # d2 += d1 >> 26
+ mov d1,%rax
+ shr $26,%rax
+ add %rax,d2
+ # h1 = d1 & 0x3ffffff
+ mov d1,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h1
+
+ # d3 += d2 >> 26
+ mov d2,%rax
+ shr $26,%rax
+ add %rax,d3
+ # h2 = d2 & 0x3ffffff
+ mov d2,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h2
+
+ # d4 += d3 >> 26
+ mov d3,%rax
+ shr $26,%rax
+ add %rax,d4
+ # h3 = d3 & 0x3ffffff
+ mov d3,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h3
+
+ # h0 += (d4 >> 26) * 5
+ mov d4,%rax
+ shr $26,%rax
+ lea (%rax,%rax,4),%rax
+ add %rax,%rbx
+ # h4 = d4 & 0x3ffffff
+ mov d4,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h4
+
+ # h1 += h0 >> 26
+ mov %rbx,%rax
+ shr $26,%rax
+ add %eax,h1
+ # h0 = h0 & 0x3ffffff
+ andl $0x3ffffff,%ebx
+ mov %ebx,h0
+
+ add $0x10,m
+ dec %rcx
+ jnz .Ldoblock
+
+ add $0x10,%rsp
+ pop %r12
+ pop %rbx
+ ret
+ENDPROC(poly1305_block_sse2)
+
+
+#define u0 0x00(%r8)
+#define u1 0x04(%r8)
+#define u2 0x08(%r8)
+#define u3 0x0c(%r8)
+#define u4 0x10(%r8)
+#define hc0 %xmm0
+#define hc1 %xmm1
+#define hc2 %xmm2
+#define hc3 %xmm5
+#define hc4 %xmm6
+#define ru0 %xmm7
+#define ru1 %xmm8
+#define ru2 %xmm9
+#define ru3 %xmm10
+#define ru4 %xmm11
+#define sv1 %xmm12
+#define sv2 %xmm13
+#define sv3 %xmm14
+#define sv4 %xmm15
+#undef d0
+#define d0 %r13
+
+ENTRY(poly1305_2block_sse2)
+ # %rdi: Accumulator h[5]
+ # %rsi: 16 byte input block m
+ # %rdx: Poly1305 key r[5]
+ # %rcx: Doubleblock count
+ # %r8: Poly1305 derived key r^2 u[5]
+
+ # This two-block variant further improves performance by using loop
+ # unrolled block processing. This is more straight forward and does
+ # less byte shuffling, but requires a second Poly1305 key r^2:
+ # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r
+
+ push %rbx
+ push %r12
+ push %r13
+
+ # combine r0,u0
+ movd u0,ru0
+ movd r0,t1
+ punpcklqdq t1,ru0
+
+ # combine r1,u1 and s1=r1*5,v1=u1*5
+ movd u1,ru1
+ movd r1,t1
+ punpcklqdq t1,ru1
+ movdqa ru1,sv1
+ pslld $2,sv1
+ paddd ru1,sv1
+
+ # combine r2,u2 and s2=r2*5,v2=u2*5
+ movd u2,ru2
+ movd r2,t1
+ punpcklqdq t1,ru2
+ movdqa ru2,sv2
+ pslld $2,sv2
+ paddd ru2,sv2
+
+ # combine r3,u3 and s3=r3*5,v3=u3*5
+ movd u3,ru3
+ movd r3,t1
+ punpcklqdq t1,ru3
+ movdqa ru3,sv3
+ pslld $2,sv3
+ paddd ru3,sv3
+
+ # combine r4,u4 and s4=r4*5,v4=u4*5
+ movd u4,ru4
+ movd r4,t1
+ punpcklqdq t1,ru4
+ movdqa ru4,sv4
+ pslld $2,sv4
+ paddd ru4,sv4
+
+.Ldoblock2:
+ # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
+ movd 0x00(m),hc0
+ movd 0x10(m),t1
+ punpcklqdq t1,hc0
+ pand ANMASK(%rip),hc0
+ movd h0,t1
+ paddd t1,hc0
+ # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
+ movd 0x03(m),hc1
+ movd 0x13(m),t1
+ punpcklqdq t1,hc1
+ psrld $2,hc1
+ pand ANMASK(%rip),hc1
+ movd h1,t1
+ paddd t1,hc1
+ # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
+ movd 0x06(m),hc2
+ movd 0x16(m),t1
+ punpcklqdq t1,hc2
+ psrld $4,hc2
+ pand ANMASK(%rip),hc2
+ movd h2,t1
+ paddd t1,hc2
+ # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
+ movd 0x09(m),hc3
+ movd 0x19(m),t1
+ punpcklqdq t1,hc3
+ psrld $6,hc3
+ pand ANMASK(%rip),hc3
+ movd h3,t1
+ paddd t1,hc3
+ # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
+ movd 0x0c(m),hc4
+ movd 0x1c(m),t1
+ punpcklqdq t1,hc4
+ psrld $8,hc4
+ por ORMASK(%rip),hc4
+ movd h4,t1
+ paddd t1,hc4
+
+ # t1 = [ hc0[1] * r0, hc0[0] * u0 ]
+ movdqa ru0,t1
+ pmuludq hc0,t1
+ # t1 += [ hc1[1] * s4, hc1[0] * v4 ]
+ movdqa sv4,t2
+ pmuludq hc1,t2
+ paddq t2,t1
+ # t1 += [ hc2[1] * s3, hc2[0] * v3 ]
+ movdqa sv3,t2
+ pmuludq hc2,t2
+ paddq t2,t1
+ # t1 += [ hc3[1] * s2, hc3[0] * v2 ]
+ movdqa sv2,t2
+ pmuludq hc3,t2
+ paddq t2,t1
+ # t1 += [ hc4[1] * s1, hc4[0] * v1 ]
+ movdqa sv1,t2
+ pmuludq hc4,t2
+ paddq t2,t1
+ # d0 = t1[0] + t1[1]
+ movdqa t1,t2
+ psrldq $8,t2
+ paddq t2,t1
+ movq t1,d0
+
+ # t1 = [ hc0[1] * r1, hc0[0] * u1 ]
+ movdqa ru1,t1
+ pmuludq hc0,t1
+ # t1 += [ hc1[1] * r0, hc1[0] * u0 ]
+ movdqa ru0,t2
+ pmuludq hc1,t2
+ paddq t2,t1
+ # t1 += [ hc2[1] * s4, hc2[0] * v4 ]
+ movdqa sv4,t2
+ pmuludq hc2,t2
+ paddq t2,t1
+ # t1 += [ hc3[1] * s3, hc3[0] * v3 ]
+ movdqa sv3,t2
+ pmuludq hc3,t2
+ paddq t2,t1
+ # t1 += [ hc4[1] * s2, hc4[0] * v2 ]
+ movdqa sv2,t2
+ pmuludq hc4,t2
+ paddq t2,t1
+ # d1 = t1[0] + t1[1]
+ movdqa t1,t2
+ psrldq $8,t2
+ paddq t2,t1
+ movq t1,d1
+
+ # t1 = [ hc0[1] * r2, hc0[0] * u2 ]
+ movdqa ru2,t1
+ pmuludq hc0,t1
+ # t1 += [ hc1[1] * r1, hc1[0] * u1 ]
+ movdqa ru1,t2
+ pmuludq hc1,t2
+ paddq t2,t1
+ # t1 += [ hc2[1] * r0, hc2[0] * u0 ]
+ movdqa ru0,t2
+ pmuludq hc2,t2
+ paddq t2,t1
+ # t1 += [ hc3[1] * s4, hc3[0] * v4 ]
+ movdqa sv4,t2
+ pmuludq hc3,t2
+ paddq t2,t1
+ # t1 += [ hc4[1] * s3, hc4[0] * v3 ]
+ movdqa sv3,t2
+ pmuludq hc4,t2
+ paddq t2,t1
+ # d2 = t1[0] + t1[1]
+ movdqa t1,t2
+ psrldq $8,t2
+ paddq t2,t1
+ movq t1,d2
+
+ # t1 = [ hc0[1] * r3, hc0[0] * u3 ]
+ movdqa ru3,t1
+ pmuludq hc0,t1
+ # t1 += [ hc1[1] * r2, hc1[0] * u2 ]
+ movdqa ru2,t2
+ pmuludq hc1,t2
+ paddq t2,t1
+ # t1 += [ hc2[1] * r1, hc2[0] * u1 ]
+ movdqa ru1,t2
+ pmuludq hc2,t2
+ paddq t2,t1
+ # t1 += [ hc3[1] * r0, hc3[0] * u0 ]
+ movdqa ru0,t2
+ pmuludq hc3,t2
+ paddq t2,t1
+ # t1 += [ hc4[1] * s4, hc4[0] * v4 ]
+ movdqa sv4,t2
+ pmuludq hc4,t2
+ paddq t2,t1
+ # d3 = t1[0] + t1[1]
+ movdqa t1,t2
+ psrldq $8,t2
+ paddq t2,t1
+ movq t1,d3
+
+ # t1 = [ hc0[1] * r4, hc0[0] * u4 ]
+ movdqa ru4,t1
+ pmuludq hc0,t1
+ # t1 += [ hc1[1] * r3, hc1[0] * u3 ]
+ movdqa ru3,t2
+ pmuludq hc1,t2
+ paddq t2,t1
+ # t1 += [ hc2[1] * r2, hc2[0] * u2 ]
+ movdqa ru2,t2
+ pmuludq hc2,t2
+ paddq t2,t1
+ # t1 += [ hc3[1] * r1, hc3[0] * u1 ]
+ movdqa ru1,t2
+ pmuludq hc3,t2
+ paddq t2,t1
+ # t1 += [ hc4[1] * r0, hc4[0] * u0 ]
+ movdqa ru0,t2
+ pmuludq hc4,t2
+ paddq t2,t1
+ # d4 = t1[0] + t1[1]
+ movdqa t1,t2
+ psrldq $8,t2
+ paddq t2,t1
+ movq t1,d4
+
+ # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+ # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+ # amount. Careful: we must not assume the carry bits 'd0 >> 26',
+ # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+ # integers. It's true in a single-block implementation, but not here.
+
+ # d1 += d0 >> 26
+ mov d0,%rax
+ shr $26,%rax
+ add %rax,d1
+ # h0 = d0 & 0x3ffffff
+ mov d0,%rbx
+ and $0x3ffffff,%ebx
+
+ # d2 += d1 >> 26
+ mov d1,%rax
+ shr $26,%rax
+ add %rax,d2
+ # h1 = d1 & 0x3ffffff
+ mov d1,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h1
+
+ # d3 += d2 >> 26
+ mov d2,%rax
+ shr $26,%rax
+ add %rax,d3
+ # h2 = d2 & 0x3ffffff
+ mov d2,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h2
+
+ # d4 += d3 >> 26
+ mov d3,%rax
+ shr $26,%rax
+ add %rax,d4
+ # h3 = d3 & 0x3ffffff
+ mov d3,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h3
+
+ # h0 += (d4 >> 26) * 5
+ mov d4,%rax
+ shr $26,%rax
+ lea (%rax,%rax,4),%rax
+ add %rax,%rbx
+ # h4 = d4 & 0x3ffffff
+ mov d4,%rax
+ and $0x3ffffff,%eax
+ mov %eax,h4
+
+ # h1 += h0 >> 26
+ mov %rbx,%rax
+ shr $26,%rax
+ add %eax,h1
+ # h0 = h0 & 0x3ffffff
+ andl $0x3ffffff,%ebx
+ mov %ebx,h0
+
+ add $0x20,m
+ dec %rcx
+ jnz .Ldoblock2
+
+ pop %r13
+ pop %r12
+ pop %rbx
+ ret
+ENDPROC(poly1305_2block_sse2)
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
new file mode 100644
index 000000000..f012b7e28
--- /dev/null
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -0,0 +1,205 @@
+/*
+ * Poly1305 authenticator algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
+#include <crypto/poly1305.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/simd.h>
+
+struct poly1305_simd_desc_ctx {
+ struct poly1305_desc_ctx base;
+ /* derived key u set? */
+ bool uset;
+#ifdef CONFIG_AS_AVX2
+ /* derived keys r^3, r^4 set? */
+ bool wset;
+#endif
+ /* derived Poly1305 key r^2 */
+ u32 u[5];
+ /* ... silently appended r^3 and r^4 when using AVX2 */
+};
+
+asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src,
+ const u32 *r, unsigned int blocks);
+asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r,
+ unsigned int blocks, const u32 *u);
+#ifdef CONFIG_AS_AVX2
+asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r,
+ unsigned int blocks, const u32 *u);
+static bool poly1305_use_avx2;
+#endif
+
+static int poly1305_simd_init(struct shash_desc *desc)
+{
+ struct poly1305_simd_desc_ctx *sctx = shash_desc_ctx(desc);
+
+ sctx->uset = false;
+#ifdef CONFIG_AS_AVX2
+ sctx->wset = false;
+#endif
+
+ return crypto_poly1305_init(desc);
+}
+
+static void poly1305_simd_mult(u32 *a, const u32 *b)
+{
+ u8 m[POLY1305_BLOCK_SIZE];
+
+ memset(m, 0, sizeof(m));
+ /* The poly1305 block function adds a hi-bit to the accumulator which
+ * we don't need for key multiplication; compensate for it. */
+ a[4] -= 1 << 24;
+ poly1305_block_sse2(a, m, b, 1);
+}
+
+static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
+ const u8 *src, unsigned int srclen)
+{
+ struct poly1305_simd_desc_ctx *sctx;
+ unsigned int blocks, datalen;
+
+ BUILD_BUG_ON(offsetof(struct poly1305_simd_desc_ctx, base));
+ sctx = container_of(dctx, struct poly1305_simd_desc_ctx, base);
+
+ if (unlikely(!dctx->sset)) {
+ datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
+ src += srclen - datalen;
+ srclen = datalen;
+ }
+
+#ifdef CONFIG_AS_AVX2
+ if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) {
+ if (unlikely(!sctx->wset)) {
+ if (!sctx->uset) {
+ memcpy(sctx->u, dctx->r, sizeof(sctx->u));
+ poly1305_simd_mult(sctx->u, dctx->r);
+ sctx->uset = true;
+ }
+ memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u));
+ poly1305_simd_mult(sctx->u + 5, dctx->r);
+ memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u));
+ poly1305_simd_mult(sctx->u + 10, dctx->r);
+ sctx->wset = true;
+ }
+ blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
+ poly1305_4block_avx2(dctx->h, src, dctx->r, blocks, sctx->u);
+ src += POLY1305_BLOCK_SIZE * 4 * blocks;
+ srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
+ }
+#endif
+ if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
+ if (unlikely(!sctx->uset)) {
+ memcpy(sctx->u, dctx->r, sizeof(sctx->u));
+ poly1305_simd_mult(sctx->u, dctx->r);
+ sctx->uset = true;
+ }
+ blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
+ poly1305_2block_sse2(dctx->h, src, dctx->r, blocks, sctx->u);
+ src += POLY1305_BLOCK_SIZE * 2 * blocks;
+ srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
+ }
+ if (srclen >= POLY1305_BLOCK_SIZE) {
+ poly1305_block_sse2(dctx->h, src, dctx->r, 1);
+ srclen -= POLY1305_BLOCK_SIZE;
+ }
+ return srclen;
+}
+
+static int poly1305_simd_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+ unsigned int bytes;
+
+ /* kernel_fpu_begin/end is costly, use fallback for small updates */
+ if (srclen <= 288 || !may_use_simd())
+ return crypto_poly1305_update(desc, src, srclen);
+
+ kernel_fpu_begin();
+
+ if (unlikely(dctx->buflen)) {
+ bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
+ memcpy(dctx->buf + dctx->buflen, src, bytes);
+ src += bytes;
+ srclen -= bytes;
+ dctx->buflen += bytes;
+
+ if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+ poly1305_simd_blocks(dctx, dctx->buf,
+ POLY1305_BLOCK_SIZE);
+ dctx->buflen = 0;
+ }
+ }
+
+ if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
+ bytes = poly1305_simd_blocks(dctx, src, srclen);
+ src += srclen - bytes;
+ srclen = bytes;
+ }
+
+ kernel_fpu_end();
+
+ if (unlikely(srclen)) {
+ dctx->buflen = srclen;
+ memcpy(dctx->buf, src, srclen);
+ }
+
+ return 0;
+}
+
+static struct shash_alg alg = {
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .init = poly1305_simd_init,
+ .update = poly1305_simd_update,
+ .final = crypto_poly1305_final,
+ .descsize = sizeof(struct poly1305_simd_desc_ctx),
+ .base = {
+ .cra_name = "poly1305",
+ .cra_driver_name = "poly1305-simd",
+ .cra_priority = 300,
+ .cra_blocksize = POLY1305_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+};
+
+static int __init poly1305_simd_mod_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XMM2))
+ return -ENODEV;
+
+#ifdef CONFIG_AS_AVX2
+ poly1305_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+ alg.descsize = sizeof(struct poly1305_simd_desc_ctx);
+ if (poly1305_use_avx2)
+ alg.descsize += 10 * sizeof(u32);
+#endif
+ return crypto_register_shash(&alg);
+}
+
+static void __exit poly1305_simd_mod_exit(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(poly1305_simd_mod_init);
+module_exit(poly1305_simd_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_DESCRIPTION("Poly1305 authenticator");
+MODULE_ALIAS_CRYPTO("poly1305");
+MODULE_ALIAS_CRYPTO("poly1305-simd");
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
new file mode 100644
index 000000000..2925077f8
--- /dev/null
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -0,0 +1,796 @@
+/*
+ * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "glue_helper-asm-avx.S"
+
+.file "serpent-avx-x86_64-asm_64.S"
+
+.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
+.align 16
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
+.align 16
+.Lxts_gf128mul_and_shl1_mask:
+ .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+
+.text
+
+#define CTX %rdi
+
+/**********************************************************************
+ 8-way AVX serpent
+ **********************************************************************/
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+#define RE1 %xmm4
+
+#define tp %xmm5
+
+#define RA2 %xmm6
+#define RB2 %xmm7
+#define RC2 %xmm8
+#define RD2 %xmm9
+#define RE2 %xmm10
+
+#define RNOT %xmm11
+
+#define RK0 %xmm12
+#define RK1 %xmm13
+#define RK2 %xmm14
+#define RK3 %xmm15
+
+
+#define S0_1(x0, x1, x2, x3, x4) \
+ vpor x0, x3, tp; \
+ vpxor x3, x0, x0; \
+ vpxor x2, x3, x4; \
+ vpxor RNOT, x4, x4; \
+ vpxor x1, tp, x3; \
+ vpand x0, x1, x1; \
+ vpxor x4, x1, x1; \
+ vpxor x0, x2, x2;
+#define S0_2(x0, x1, x2, x3, x4) \
+ vpxor x3, x0, x0; \
+ vpor x0, x4, x4; \
+ vpxor x2, x0, x0; \
+ vpand x1, x2, x2; \
+ vpxor x2, x3, x3; \
+ vpxor RNOT, x1, x1; \
+ vpxor x4, x2, x2; \
+ vpxor x2, x1, x1;
+
+#define S1_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, tp; \
+ vpxor x3, x0, x0; \
+ vpxor RNOT, x3, x3; \
+ vpand tp, x1, x4; \
+ vpor tp, x0, x0; \
+ vpxor x2, x3, x3; \
+ vpxor x3, x0, x0; \
+ vpxor x3, tp, x1;
+#define S1_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x3, x3; \
+ vpor x4, x1, x1; \
+ vpxor x2, x4, x4; \
+ vpand x0, x2, x2; \
+ vpxor x1, x2, x2; \
+ vpor x0, x1, x1; \
+ vpxor RNOT, x0, x0; \
+ vpxor x2, x0, x0; \
+ vpxor x1, x4, x4;
+
+#define S2_1(x0, x1, x2, x3, x4) \
+ vpxor RNOT, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpand x2, x0, tp; \
+ vpxor x3, tp, tp; \
+ vpor x0, x3, x3; \
+ vpxor x1, x2, x2; \
+ vpxor x1, x3, x3; \
+ vpand tp, x1, x1;
+#define S2_2(x0, x1, x2, x3, x4) \
+ vpxor x2, tp, tp; \
+ vpand x3, x2, x2; \
+ vpor x1, x3, x3; \
+ vpxor RNOT, tp, tp; \
+ vpxor tp, x3, x3; \
+ vpxor tp, x0, x4; \
+ vpxor x2, tp, x0; \
+ vpor x2, x1, x1;
+
+#define S3_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, tp; \
+ vpor x0, x3, x3; \
+ vpand x0, x1, x4; \
+ vpxor x2, x0, x0; \
+ vpxor tp, x2, x2; \
+ vpand x3, tp, x1; \
+ vpxor x3, x2, x2; \
+ vpor x4, x0, x0; \
+ vpxor x3, x4, x4;
+#define S3_2(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, x1; \
+ vpand x3, x0, x0; \
+ vpand x4, x3, x3; \
+ vpxor x2, x3, x3; \
+ vpor x1, x4, x4; \
+ vpand x1, x2, x2; \
+ vpxor x3, x4, x4; \
+ vpxor x3, x0, x0; \
+ vpxor x2, x3, x3;
+
+#define S4_1(x0, x1, x2, x3, x4) \
+ vpand x0, x3, tp; \
+ vpxor x3, x0, x0; \
+ vpxor x2, tp, tp; \
+ vpor x3, x2, x2; \
+ vpxor x1, x0, x0; \
+ vpxor tp, x3, x4; \
+ vpor x0, x2, x2; \
+ vpxor x1, x2, x2;
+#define S4_2(x0, x1, x2, x3, x4) \
+ vpand x0, x1, x1; \
+ vpxor x4, x1, x1; \
+ vpand x2, x4, x4; \
+ vpxor tp, x2, x2; \
+ vpxor x0, x4, x4; \
+ vpor x1, tp, x3; \
+ vpxor RNOT, x1, x1; \
+ vpxor x0, x3, x3;
+
+#define S5_1(x0, x1, x2, x3, x4) \
+ vpor x0, x1, tp; \
+ vpxor tp, x2, x2; \
+ vpxor RNOT, x3, x3; \
+ vpxor x0, x1, x4; \
+ vpxor x2, x0, x0; \
+ vpand x4, tp, x1; \
+ vpor x3, x4, x4; \
+ vpxor x0, x4, x4;
+#define S5_2(x0, x1, x2, x3, x4) \
+ vpand x3, x0, x0; \
+ vpxor x3, x1, x1; \
+ vpxor x2, x3, x3; \
+ vpxor x1, x0, x0; \
+ vpand x4, x2, x2; \
+ vpxor x2, x1, x1; \
+ vpand x0, x2, x2; \
+ vpxor x2, x3, x3;
+
+#define S6_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x3, x3; \
+ vpxor x2, x1, tp; \
+ vpxor x0, x2, x2; \
+ vpand x3, x0, x0; \
+ vpor x3, tp, tp; \
+ vpxor RNOT, x1, x4; \
+ vpxor tp, x0, x0; \
+ vpxor x2, tp, x1;
+#define S6_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x3, x3; \
+ vpxor x0, x4, x4; \
+ vpand x0, x2, x2; \
+ vpxor x1, x4, x4; \
+ vpxor x3, x2, x2; \
+ vpand x1, x3, x3; \
+ vpxor x0, x3, x3; \
+ vpxor x2, x1, x1;
+
+#define S7_1(x0, x1, x2, x3, x4) \
+ vpxor RNOT, x1, tp; \
+ vpxor RNOT, x0, x0; \
+ vpand x2, tp, x1; \
+ vpxor x3, x1, x1; \
+ vpor tp, x3, x3; \
+ vpxor x2, tp, x4; \
+ vpxor x3, x2, x2; \
+ vpxor x0, x3, x3; \
+ vpor x1, x0, x0;
+#define S7_2(x0, x1, x2, x3, x4) \
+ vpand x0, x2, x2; \
+ vpxor x4, x0, x0; \
+ vpxor x3, x4, x4; \
+ vpand x0, x3, x3; \
+ vpxor x1, x4, x4; \
+ vpxor x4, x2, x2; \
+ vpxor x1, x3, x3; \
+ vpor x0, x4, x4; \
+ vpxor x1, x4, x4;
+
+#define SI0_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, x1; \
+ vpor x1, x3, tp; \
+ vpxor x1, x3, x4; \
+ vpxor RNOT, x0, x0; \
+ vpxor tp, x2, x2; \
+ vpxor x0, tp, x3; \
+ vpand x1, x0, x0; \
+ vpxor x2, x0, x0;
+#define SI0_2(x0, x1, x2, x3, x4) \
+ vpand x3, x2, x2; \
+ vpxor x4, x3, x3; \
+ vpxor x3, x2, x2; \
+ vpxor x3, x1, x1; \
+ vpand x0, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpxor x2, x0, x0; \
+ vpxor x3, x4, x4;
+
+#define SI1_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, x1; \
+ vpxor x2, x0, tp; \
+ vpxor RNOT, x2, x2; \
+ vpor x1, x0, x4; \
+ vpxor x3, x4, x4; \
+ vpand x1, x3, x3; \
+ vpxor x2, x1, x1; \
+ vpand x4, x2, x2;
+#define SI1_2(x0, x1, x2, x3, x4) \
+ vpxor x1, x4, x4; \
+ vpor x3, x1, x1; \
+ vpxor tp, x3, x3; \
+ vpxor tp, x2, x2; \
+ vpor x4, tp, x0; \
+ vpxor x4, x2, x2; \
+ vpxor x0, x1, x1; \
+ vpxor x1, x4, x4;
+
+#define SI2_1(x0, x1, x2, x3, x4) \
+ vpxor x1, x2, x2; \
+ vpxor RNOT, x3, tp; \
+ vpor x2, tp, tp; \
+ vpxor x3, x2, x2; \
+ vpxor x0, x3, x4; \
+ vpxor x1, tp, x3; \
+ vpor x2, x1, x1; \
+ vpxor x0, x2, x2;
+#define SI2_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x1, x1; \
+ vpor x3, x4, x4; \
+ vpxor x3, x2, x2; \
+ vpxor x2, x4, x4; \
+ vpand x1, x2, x2; \
+ vpxor x3, x2, x2; \
+ vpxor x4, x3, x3; \
+ vpxor x0, x4, x4;
+
+#define SI3_1(x0, x1, x2, x3, x4) \
+ vpxor x1, x2, x2; \
+ vpand x2, x1, tp; \
+ vpxor x0, tp, tp; \
+ vpor x1, x0, x0; \
+ vpxor x3, x1, x4; \
+ vpxor x3, x0, x0; \
+ vpor tp, x3, x3; \
+ vpxor x2, tp, x1;
+#define SI3_2(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, x1; \
+ vpxor x2, x0, x0; \
+ vpxor x3, x2, x2; \
+ vpand x1, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpand x2, x0, x0; \
+ vpxor x3, x4, x4; \
+ vpxor x0, x3, x3; \
+ vpxor x1, x0, x0;
+
+#define SI4_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x2, x2; \
+ vpand x1, x0, tp; \
+ vpxor x2, tp, tp; \
+ vpor x3, x2, x2; \
+ vpxor RNOT, x0, x4; \
+ vpxor tp, x1, x1; \
+ vpxor x2, tp, x0; \
+ vpand x4, x2, x2;
+#define SI4_2(x0, x1, x2, x3, x4) \
+ vpxor x0, x2, x2; \
+ vpor x4, x0, x0; \
+ vpxor x3, x0, x0; \
+ vpand x2, x3, x3; \
+ vpxor x3, x4, x4; \
+ vpxor x1, x3, x3; \
+ vpand x0, x1, x1; \
+ vpxor x1, x4, x4; \
+ vpxor x3, x0, x0;
+
+#define SI5_1(x0, x1, x2, x3, x4) \
+ vpor x2, x1, tp; \
+ vpxor x1, x2, x2; \
+ vpxor x3, tp, tp; \
+ vpand x1, x3, x3; \
+ vpxor x3, x2, x2; \
+ vpor x0, x3, x3; \
+ vpxor RNOT, x0, x0; \
+ vpxor x2, x3, x3; \
+ vpor x0, x2, x2;
+#define SI5_2(x0, x1, x2, x3, x4) \
+ vpxor tp, x1, x4; \
+ vpxor x4, x2, x2; \
+ vpand x0, x4, x4; \
+ vpxor tp, x0, x0; \
+ vpxor x3, tp, x1; \
+ vpand x2, x0, x0; \
+ vpxor x3, x2, x2; \
+ vpxor x2, x0, x0; \
+ vpxor x4, x2, x2; \
+ vpxor x3, x4, x4;
+
+#define SI6_1(x0, x1, x2, x3, x4) \
+ vpxor x2, x0, x0; \
+ vpand x3, x0, tp; \
+ vpxor x3, x2, x2; \
+ vpxor x2, tp, tp; \
+ vpxor x1, x3, x3; \
+ vpor x0, x2, x2; \
+ vpxor x3, x2, x2; \
+ vpand tp, x3, x3;
+#define SI6_2(x0, x1, x2, x3, x4) \
+ vpxor RNOT, tp, tp; \
+ vpxor x1, x3, x3; \
+ vpand x2, x1, x1; \
+ vpxor tp, x0, x4; \
+ vpxor x4, x3, x3; \
+ vpxor x2, x4, x4; \
+ vpxor x1, tp, x0; \
+ vpxor x0, x2, x2;
+
+#define SI7_1(x0, x1, x2, x3, x4) \
+ vpand x0, x3, tp; \
+ vpxor x2, x0, x0; \
+ vpor x3, x2, x2; \
+ vpxor x1, x3, x4; \
+ vpxor RNOT, x0, x0; \
+ vpor tp, x1, x1; \
+ vpxor x0, x4, x4; \
+ vpand x2, x0, x0; \
+ vpxor x1, x0, x0;
+#define SI7_2(x0, x1, x2, x3, x4) \
+ vpand x2, x1, x1; \
+ vpxor x2, tp, x3; \
+ vpxor x3, x4, x4; \
+ vpand x3, x2, x2; \
+ vpor x0, x3, x3; \
+ vpxor x4, x1, x1; \
+ vpxor x4, x3, x3; \
+ vpand x0, x4, x4; \
+ vpxor x2, x4, x4;
+
+#define get_key(i, j, t) \
+ vbroadcastss (4*(i)+(j))*4(CTX), t;
+
+#define K2(x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ get_key(i, 1, RK1); \
+ get_key(i, 2, RK2); \
+ get_key(i, 3, RK3); \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2;
+
+#define LK2(x0, x1, x2, x3, x4, i) \
+ vpslld $13, x0 ## 1, x4 ## 1; \
+ vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x2 ## 1, x4 ## 1; \
+ vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $13, x0 ## 2, x4 ## 2; \
+ vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x2 ## 2, x4 ## 2; \
+ vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $1, x1 ## 1, x4 ## 1; \
+ vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
+ vpor x4 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x0 ## 1, x4 ## 1; \
+ vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
+ vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
+ get_key(i, 1, RK1); \
+ vpslld $1, x1 ## 2, x4 ## 2; \
+ vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
+ vpor x4 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x0 ## 2, x4 ## 2; \
+ vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
+ vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
+ get_key(i, 3, RK3); \
+ vpslld $7, x3 ## 1, x4 ## 1; \
+ vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
+ vpor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpslld $7, x1 ## 1, x4 ## 1; \
+ vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
+ get_key(i, 0, RK0); \
+ vpslld $7, x3 ## 2, x4 ## 2; \
+ vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
+ vpor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpslld $7, x1 ## 2, x4 ## 2; \
+ vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
+ get_key(i, 2, RK2); \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpslld $5, x0 ## 1, x4 ## 1; \
+ vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpslld $22, x2 ## 1, x4 ## 1; \
+ vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2; \
+ vpslld $5, x0 ## 2, x4 ## 2; \
+ vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpslld $22, x2 ## 2, x4 ## 2; \
+ vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2;
+
+#define KL2(x0, x1, x2, x3, x4, i) \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpsrld $5, x0 ## 1, x4 ## 1; \
+ vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpsrld $22, x2 ## 1, x4 ## 1; \
+ vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2; \
+ vpsrld $5, x0 ## 2, x4 ## 2; \
+ vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpsrld $22, x2 ## 2, x4 ## 2; \
+ vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
+ vpslld $7, x1 ## 1, x4 ## 1; \
+ vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpsrld $1, x1 ## 1, x4 ## 1; \
+ vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
+ vpor x4 ## 1, x1 ## 1, x1 ## 1; \
+ vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
+ vpslld $7, x1 ## 2, x4 ## 2; \
+ vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpsrld $1, x1 ## 2, x4 ## 2; \
+ vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
+ vpor x4 ## 2, x1 ## 2, x1 ## 2; \
+ vpsrld $7, x3 ## 1, x4 ## 1; \
+ vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
+ vpor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x0 ## 1, x4 ## 1; \
+ vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpsrld $7, x3 ## 2, x4 ## 2; \
+ vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
+ vpor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x0 ## 2, x4 ## 2; \
+ vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpsrld $13, x0 ## 1, x4 ## 1; \
+ vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
+ vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
+ vpsrld $3, x2 ## 1, x4 ## 1; \
+ vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpsrld $13, x0 ## 2, x4 ## 2; \
+ vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
+ vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
+ vpsrld $3, x2 ## 2, x4 ## 2; \
+ vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2;
+
+#define S(SBOX, x0, x1, x2, x3, x4) \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
+#define SP(SBOX, x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 2, RK2); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 3, RK3); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ get_key(i, 1, RK1); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ vpunpckldq x1, x0, t0; \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x3; \
+ \
+ vpunpcklqdq t1, t0, x0; \
+ vpunpckhqdq t1, t0, x1; \
+ vpunpcklqdq x3, t2, x2; \
+ vpunpckhqdq x3, t2, x3;
+
+#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+.align 8
+__serpent_enc_blk8_avx:
+ /* input:
+ * %rdi: ctx, CTX
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+ * output:
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+ */
+
+ vpcmpeqd RNOT, RNOT, RNOT;
+
+ read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 0);
+ S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
+ S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
+ S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
+ S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
+ S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
+ S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
+ S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
+ S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
+ S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
+ S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
+ S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
+ S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
+ S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
+ S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
+ S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
+ S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
+ S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
+ S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
+ S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
+ S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
+ S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
+ S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
+ S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
+ S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
+ S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
+ S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
+ S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
+ S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
+ S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
+ S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
+ S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
+ S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
+
+ write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ ret;
+ENDPROC(__serpent_enc_blk8_avx)
+
+.align 8
+__serpent_dec_blk8_avx:
+ /* input:
+ * %rdi: ctx, CTX
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+ * output:
+ * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
+ */
+
+ vpcmpeqd RNOT, RNOT, RNOT;
+
+ read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 32);
+ SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
+ SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
+ SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
+ SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
+ SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
+ SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
+ SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
+ SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
+ SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
+ SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
+ SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
+ SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
+ SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
+ SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
+ SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
+ SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
+ SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
+ SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
+ SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
+ SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
+ SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
+ SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
+ SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
+ SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
+ SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
+ SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
+ SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
+ SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
+ SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
+ SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
+ SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
+ S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
+
+ write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+ write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+ ret;
+ENDPROC(__serpent_dec_blk8_avx)
+
+ENTRY(serpent_ecb_enc_8way_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_enc_blk8_avx;
+
+ store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_ecb_enc_8way_avx)
+
+ENTRY(serpent_ecb_dec_8way_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_dec_blk8_avx;
+
+ store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_ecb_dec_8way_avx)
+
+ENTRY(serpent_cbc_dec_8way_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_dec_blk8_avx;
+
+ store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_cbc_dec_8way_avx)
+
+ENTRY(serpent_ctr_8way_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (little endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+ RD2, RK0, RK1, RK2);
+
+ call __serpent_enc_blk8_avx;
+
+ store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_ctr_8way_avx)
+
+ENTRY(serpent_xts_enc_8way_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+ FRAME_BEGIN
+
+ /* regs <= src, dst <= IVs, regs <= regs xor IVs */
+ load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+ RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
+
+ call __serpent_enc_blk8_avx;
+
+ /* dst <= regs xor IVs(in dst) */
+ store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_xts_enc_8way_avx)
+
+ENTRY(serpent_xts_dec_8way_avx)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+ FRAME_BEGIN
+
+ /* regs <= src, dst <= IVs, regs <= regs xor IVs */
+ load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+ RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
+
+ call __serpent_dec_blk8_avx;
+
+ /* dst <= regs xor IVs(in dst) */
+ store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_xts_dec_8way_avx)
diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S
new file mode 100644
index 000000000..d67888f2a
--- /dev/null
+++ b/arch/x86/crypto/serpent-avx2-asm_64.S
@@ -0,0 +1,818 @@
+/*
+ * x86_64/AVX2 assembler optimized version of Serpent
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on AVX assembler implementation of Serpent by:
+ * Copyright © 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "glue_helper-asm-avx2.S"
+
+.file "serpent-avx2-asm_64.S"
+
+.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
+.align 16
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.section .rodata.cst16.xts_gf128mul_and_shl1_mask_0, "aM", @progbits, 16
+.align 16
+.Lxts_gf128mul_and_shl1_mask_0:
+ .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+
+.section .rodata.cst16.xts_gf128mul_and_shl1_mask_1, "aM", @progbits, 16
+.align 16
+.Lxts_gf128mul_and_shl1_mask_1:
+ .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
+
+.text
+
+#define CTX %rdi
+
+#define RNOT %ymm0
+#define tp %ymm1
+
+#define RA1 %ymm2
+#define RA2 %ymm3
+#define RB1 %ymm4
+#define RB2 %ymm5
+#define RC1 %ymm6
+#define RC2 %ymm7
+#define RD1 %ymm8
+#define RD2 %ymm9
+#define RE1 %ymm10
+#define RE2 %ymm11
+
+#define RK0 %ymm12
+#define RK1 %ymm13
+#define RK2 %ymm14
+#define RK3 %ymm15
+
+#define RK0x %xmm12
+#define RK1x %xmm13
+#define RK2x %xmm14
+#define RK3x %xmm15
+
+#define S0_1(x0, x1, x2, x3, x4) \
+ vpor x0, x3, tp; \
+ vpxor x3, x0, x0; \
+ vpxor x2, x3, x4; \
+ vpxor RNOT, x4, x4; \
+ vpxor x1, tp, x3; \
+ vpand x0, x1, x1; \
+ vpxor x4, x1, x1; \
+ vpxor x0, x2, x2;
+#define S0_2(x0, x1, x2, x3, x4) \
+ vpxor x3, x0, x0; \
+ vpor x0, x4, x4; \
+ vpxor x2, x0, x0; \
+ vpand x1, x2, x2; \
+ vpxor x2, x3, x3; \
+ vpxor RNOT, x1, x1; \
+ vpxor x4, x2, x2; \
+ vpxor x2, x1, x1;
+
+#define S1_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, tp; \
+ vpxor x3, x0, x0; \
+ vpxor RNOT, x3, x3; \
+ vpand tp, x1, x4; \
+ vpor tp, x0, x0; \
+ vpxor x2, x3, x3; \
+ vpxor x3, x0, x0; \
+ vpxor x3, tp, x1;
+#define S1_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x3, x3; \
+ vpor x4, x1, x1; \
+ vpxor x2, x4, x4; \
+ vpand x0, x2, x2; \
+ vpxor x1, x2, x2; \
+ vpor x0, x1, x1; \
+ vpxor RNOT, x0, x0; \
+ vpxor x2, x0, x0; \
+ vpxor x1, x4, x4;
+
+#define S2_1(x0, x1, x2, x3, x4) \
+ vpxor RNOT, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpand x2, x0, tp; \
+ vpxor x3, tp, tp; \
+ vpor x0, x3, x3; \
+ vpxor x1, x2, x2; \
+ vpxor x1, x3, x3; \
+ vpand tp, x1, x1;
+#define S2_2(x0, x1, x2, x3, x4) \
+ vpxor x2, tp, tp; \
+ vpand x3, x2, x2; \
+ vpor x1, x3, x3; \
+ vpxor RNOT, tp, tp; \
+ vpxor tp, x3, x3; \
+ vpxor tp, x0, x4; \
+ vpxor x2, tp, x0; \
+ vpor x2, x1, x1;
+
+#define S3_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, tp; \
+ vpor x0, x3, x3; \
+ vpand x0, x1, x4; \
+ vpxor x2, x0, x0; \
+ vpxor tp, x2, x2; \
+ vpand x3, tp, x1; \
+ vpxor x3, x2, x2; \
+ vpor x4, x0, x0; \
+ vpxor x3, x4, x4;
+#define S3_2(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, x1; \
+ vpand x3, x0, x0; \
+ vpand x4, x3, x3; \
+ vpxor x2, x3, x3; \
+ vpor x1, x4, x4; \
+ vpand x1, x2, x2; \
+ vpxor x3, x4, x4; \
+ vpxor x3, x0, x0; \
+ vpxor x2, x3, x3;
+
+#define S4_1(x0, x1, x2, x3, x4) \
+ vpand x0, x3, tp; \
+ vpxor x3, x0, x0; \
+ vpxor x2, tp, tp; \
+ vpor x3, x2, x2; \
+ vpxor x1, x0, x0; \
+ vpxor tp, x3, x4; \
+ vpor x0, x2, x2; \
+ vpxor x1, x2, x2;
+#define S4_2(x0, x1, x2, x3, x4) \
+ vpand x0, x1, x1; \
+ vpxor x4, x1, x1; \
+ vpand x2, x4, x4; \
+ vpxor tp, x2, x2; \
+ vpxor x0, x4, x4; \
+ vpor x1, tp, x3; \
+ vpxor RNOT, x1, x1; \
+ vpxor x0, x3, x3;
+
+#define S5_1(x0, x1, x2, x3, x4) \
+ vpor x0, x1, tp; \
+ vpxor tp, x2, x2; \
+ vpxor RNOT, x3, x3; \
+ vpxor x0, x1, x4; \
+ vpxor x2, x0, x0; \
+ vpand x4, tp, x1; \
+ vpor x3, x4, x4; \
+ vpxor x0, x4, x4;
+#define S5_2(x0, x1, x2, x3, x4) \
+ vpand x3, x0, x0; \
+ vpxor x3, x1, x1; \
+ vpxor x2, x3, x3; \
+ vpxor x1, x0, x0; \
+ vpand x4, x2, x2; \
+ vpxor x2, x1, x1; \
+ vpand x0, x2, x2; \
+ vpxor x2, x3, x3;
+
+#define S6_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x3, x3; \
+ vpxor x2, x1, tp; \
+ vpxor x0, x2, x2; \
+ vpand x3, x0, x0; \
+ vpor x3, tp, tp; \
+ vpxor RNOT, x1, x4; \
+ vpxor tp, x0, x0; \
+ vpxor x2, tp, x1;
+#define S6_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x3, x3; \
+ vpxor x0, x4, x4; \
+ vpand x0, x2, x2; \
+ vpxor x1, x4, x4; \
+ vpxor x3, x2, x2; \
+ vpand x1, x3, x3; \
+ vpxor x0, x3, x3; \
+ vpxor x2, x1, x1;
+
+#define S7_1(x0, x1, x2, x3, x4) \
+ vpxor RNOT, x1, tp; \
+ vpxor RNOT, x0, x0; \
+ vpand x2, tp, x1; \
+ vpxor x3, x1, x1; \
+ vpor tp, x3, x3; \
+ vpxor x2, tp, x4; \
+ vpxor x3, x2, x2; \
+ vpxor x0, x3, x3; \
+ vpor x1, x0, x0;
+#define S7_2(x0, x1, x2, x3, x4) \
+ vpand x0, x2, x2; \
+ vpxor x4, x0, x0; \
+ vpxor x3, x4, x4; \
+ vpand x0, x3, x3; \
+ vpxor x1, x4, x4; \
+ vpxor x4, x2, x2; \
+ vpxor x1, x3, x3; \
+ vpor x0, x4, x4; \
+ vpxor x1, x4, x4;
+
+#define SI0_1(x0, x1, x2, x3, x4) \
+ vpxor x0, x1, x1; \
+ vpor x1, x3, tp; \
+ vpxor x1, x3, x4; \
+ vpxor RNOT, x0, x0; \
+ vpxor tp, x2, x2; \
+ vpxor x0, tp, x3; \
+ vpand x1, x0, x0; \
+ vpxor x2, x0, x0;
+#define SI0_2(x0, x1, x2, x3, x4) \
+ vpand x3, x2, x2; \
+ vpxor x4, x3, x3; \
+ vpxor x3, x2, x2; \
+ vpxor x3, x1, x1; \
+ vpand x0, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpxor x2, x0, x0; \
+ vpxor x3, x4, x4;
+
+#define SI1_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, x1; \
+ vpxor x2, x0, tp; \
+ vpxor RNOT, x2, x2; \
+ vpor x1, x0, x4; \
+ vpxor x3, x4, x4; \
+ vpand x1, x3, x3; \
+ vpxor x2, x1, x1; \
+ vpand x4, x2, x2;
+#define SI1_2(x0, x1, x2, x3, x4) \
+ vpxor x1, x4, x4; \
+ vpor x3, x1, x1; \
+ vpxor tp, x3, x3; \
+ vpxor tp, x2, x2; \
+ vpor x4, tp, x0; \
+ vpxor x4, x2, x2; \
+ vpxor x0, x1, x1; \
+ vpxor x1, x4, x4;
+
+#define SI2_1(x0, x1, x2, x3, x4) \
+ vpxor x1, x2, x2; \
+ vpxor RNOT, x3, tp; \
+ vpor x2, tp, tp; \
+ vpxor x3, x2, x2; \
+ vpxor x0, x3, x4; \
+ vpxor x1, tp, x3; \
+ vpor x2, x1, x1; \
+ vpxor x0, x2, x2;
+#define SI2_2(x0, x1, x2, x3, x4) \
+ vpxor x4, x1, x1; \
+ vpor x3, x4, x4; \
+ vpxor x3, x2, x2; \
+ vpxor x2, x4, x4; \
+ vpand x1, x2, x2; \
+ vpxor x3, x2, x2; \
+ vpxor x4, x3, x3; \
+ vpxor x0, x4, x4;
+
+#define SI3_1(x0, x1, x2, x3, x4) \
+ vpxor x1, x2, x2; \
+ vpand x2, x1, tp; \
+ vpxor x0, tp, tp; \
+ vpor x1, x0, x0; \
+ vpxor x3, x1, x4; \
+ vpxor x3, x0, x0; \
+ vpor tp, x3, x3; \
+ vpxor x2, tp, x1;
+#define SI3_2(x0, x1, x2, x3, x4) \
+ vpxor x3, x1, x1; \
+ vpxor x2, x0, x0; \
+ vpxor x3, x2, x2; \
+ vpand x1, x3, x3; \
+ vpxor x0, x1, x1; \
+ vpand x2, x0, x0; \
+ vpxor x3, x4, x4; \
+ vpxor x0, x3, x3; \
+ vpxor x1, x0, x0;
+
+#define SI4_1(x0, x1, x2, x3, x4) \
+ vpxor x3, x2, x2; \
+ vpand x1, x0, tp; \
+ vpxor x2, tp, tp; \
+ vpor x3, x2, x2; \
+ vpxor RNOT, x0, x4; \
+ vpxor tp, x1, x1; \
+ vpxor x2, tp, x0; \
+ vpand x4, x2, x2;
+#define SI4_2(x0, x1, x2, x3, x4) \
+ vpxor x0, x2, x2; \
+ vpor x4, x0, x0; \
+ vpxor x3, x0, x0; \
+ vpand x2, x3, x3; \
+ vpxor x3, x4, x4; \
+ vpxor x1, x3, x3; \
+ vpand x0, x1, x1; \
+ vpxor x1, x4, x4; \
+ vpxor x3, x0, x0;
+
+#define SI5_1(x0, x1, x2, x3, x4) \
+ vpor x2, x1, tp; \
+ vpxor x1, x2, x2; \
+ vpxor x3, tp, tp; \
+ vpand x1, x3, x3; \
+ vpxor x3, x2, x2; \
+ vpor x0, x3, x3; \
+ vpxor RNOT, x0, x0; \
+ vpxor x2, x3, x3; \
+ vpor x0, x2, x2;
+#define SI5_2(x0, x1, x2, x3, x4) \
+ vpxor tp, x1, x4; \
+ vpxor x4, x2, x2; \
+ vpand x0, x4, x4; \
+ vpxor tp, x0, x0; \
+ vpxor x3, tp, x1; \
+ vpand x2, x0, x0; \
+ vpxor x3, x2, x2; \
+ vpxor x2, x0, x0; \
+ vpxor x4, x2, x2; \
+ vpxor x3, x4, x4;
+
+#define SI6_1(x0, x1, x2, x3, x4) \
+ vpxor x2, x0, x0; \
+ vpand x3, x0, tp; \
+ vpxor x3, x2, x2; \
+ vpxor x2, tp, tp; \
+ vpxor x1, x3, x3; \
+ vpor x0, x2, x2; \
+ vpxor x3, x2, x2; \
+ vpand tp, x3, x3;
+#define SI6_2(x0, x1, x2, x3, x4) \
+ vpxor RNOT, tp, tp; \
+ vpxor x1, x3, x3; \
+ vpand x2, x1, x1; \
+ vpxor tp, x0, x4; \
+ vpxor x4, x3, x3; \
+ vpxor x2, x4, x4; \
+ vpxor x1, tp, x0; \
+ vpxor x0, x2, x2;
+
+#define SI7_1(x0, x1, x2, x3, x4) \
+ vpand x0, x3, tp; \
+ vpxor x2, x0, x0; \
+ vpor x3, x2, x2; \
+ vpxor x1, x3, x4; \
+ vpxor RNOT, x0, x0; \
+ vpor tp, x1, x1; \
+ vpxor x0, x4, x4; \
+ vpand x2, x0, x0; \
+ vpxor x1, x0, x0;
+#define SI7_2(x0, x1, x2, x3, x4) \
+ vpand x2, x1, x1; \
+ vpxor x2, tp, x3; \
+ vpxor x3, x4, x4; \
+ vpand x3, x2, x2; \
+ vpor x0, x3, x3; \
+ vpxor x4, x1, x1; \
+ vpxor x4, x3, x3; \
+ vpand x0, x4, x4; \
+ vpxor x2, x4, x4;
+
+#define get_key(i,j,t) \
+ vpbroadcastd (4*(i)+(j))*4(CTX), t;
+
+#define K2(x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ get_key(i, 1, RK1); \
+ get_key(i, 2, RK2); \
+ get_key(i, 3, RK3); \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2;
+
+#define LK2(x0, x1, x2, x3, x4, i) \
+ vpslld $13, x0 ## 1, x4 ## 1; \
+ vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x2 ## 1, x4 ## 1; \
+ vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $13, x0 ## 2, x4 ## 2; \
+ vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x2 ## 2, x4 ## 2; \
+ vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $1, x1 ## 1, x4 ## 1; \
+ vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
+ vpor x4 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x0 ## 1, x4 ## 1; \
+ vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
+ vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
+ get_key(i, 1, RK1); \
+ vpslld $1, x1 ## 2, x4 ## 2; \
+ vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
+ vpor x4 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x0 ## 2, x4 ## 2; \
+ vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
+ vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
+ get_key(i, 3, RK3); \
+ vpslld $7, x3 ## 1, x4 ## 1; \
+ vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
+ vpor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpslld $7, x1 ## 1, x4 ## 1; \
+ vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
+ get_key(i, 0, RK0); \
+ vpslld $7, x3 ## 2, x4 ## 2; \
+ vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
+ vpor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpslld $7, x1 ## 2, x4 ## 2; \
+ vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
+ get_key(i, 2, RK2); \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpslld $5, x0 ## 1, x4 ## 1; \
+ vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpslld $22, x2 ## 1, x4 ## 1; \
+ vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2; \
+ vpslld $5, x0 ## 2, x4 ## 2; \
+ vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpslld $22, x2 ## 2, x4 ## 2; \
+ vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2;
+
+#define KL2(x0, x1, x2, x3, x4, i) \
+ vpxor RK0, x0 ## 1, x0 ## 1; \
+ vpxor RK2, x2 ## 1, x2 ## 1; \
+ vpsrld $5, x0 ## 1, x4 ## 1; \
+ vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor RK3, x3 ## 1, x3 ## 1; \
+ vpxor RK1, x1 ## 1, x1 ## 1; \
+ vpsrld $22, x2 ## 1, x4 ## 1; \
+ vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
+ vpxor RK0, x0 ## 2, x0 ## 2; \
+ vpxor RK2, x2 ## 2, x2 ## 2; \
+ vpsrld $5, x0 ## 2, x4 ## 2; \
+ vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor RK3, x3 ## 2, x3 ## 2; \
+ vpxor RK1, x1 ## 2, x1 ## 2; \
+ vpsrld $22, x2 ## 2, x4 ## 2; \
+ vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
+ vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
+ vpslld $7, x1 ## 1, x4 ## 1; \
+ vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpsrld $1, x1 ## 1, x4 ## 1; \
+ vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
+ vpor x4 ## 1, x1 ## 1, x1 ## 1; \
+ vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
+ vpslld $7, x1 ## 2, x4 ## 2; \
+ vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
+ vpsrld $1, x1 ## 2, x4 ## 2; \
+ vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
+ vpor x4 ## 2, x1 ## 2, x1 ## 2; \
+ vpsrld $7, x3 ## 1, x4 ## 1; \
+ vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
+ vpor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
+ vpslld $3, x0 ## 1, x4 ## 1; \
+ vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
+ vpsrld $7, x3 ## 2, x4 ## 2; \
+ vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
+ vpor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
+ vpslld $3, x0 ## 2, x4 ## 2; \
+ vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
+ vpsrld $13, x0 ## 1, x4 ## 1; \
+ vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
+ vpor x4 ## 1, x0 ## 1, x0 ## 1; \
+ vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
+ vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
+ vpsrld $3, x2 ## 1, x4 ## 1; \
+ vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
+ vpor x4 ## 1, x2 ## 1, x2 ## 1; \
+ vpsrld $13, x0 ## 2, x4 ## 2; \
+ vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
+ vpor x4 ## 2, x0 ## 2, x0 ## 2; \
+ vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
+ vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
+ vpsrld $3, x2 ## 2, x4 ## 2; \
+ vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
+ vpor x4 ## 2, x2 ## 2, x2 ## 2;
+
+#define S(SBOX, x0, x1, x2, x3, x4) \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
+#define SP(SBOX, x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 2, RK2); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 3, RK3); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ get_key(i, 1, RK1); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ vpunpckldq x1, x0, t0; \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x3; \
+ \
+ vpunpcklqdq t1, t0, x0; \
+ vpunpckhqdq t1, t0, x1; \
+ vpunpcklqdq x3, t2, x2; \
+ vpunpckhqdq x3, t2, x3;
+
+#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+.align 8
+__serpent_enc_blk16:
+ /* input:
+ * %rdi: ctx, CTX
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
+ * output:
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
+ */
+
+ vpcmpeqd RNOT, RNOT, RNOT;
+
+ read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 0);
+ S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
+ S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
+ S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
+ S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
+ S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
+ S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
+ S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
+ S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
+ S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
+ S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
+ S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
+ S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
+ S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
+ S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
+ S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
+ S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
+ S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
+ S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
+ S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
+ S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
+ S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
+ S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
+ S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
+ S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
+ S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
+ S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
+ S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
+ S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
+ S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
+ S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
+ S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
+ S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
+
+ write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ ret;
+ENDPROC(__serpent_enc_blk16)
+
+.align 8
+__serpent_dec_blk16:
+ /* input:
+ * %rdi: ctx, CTX
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
+ * output:
+ * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
+ */
+
+ vpcmpeqd RNOT, RNOT, RNOT;
+
+ read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 32);
+ SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
+ SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
+ SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
+ SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
+ SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
+ SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
+ SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
+ SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
+ SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
+ SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
+ SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
+ SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
+ SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
+ SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
+ SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
+ SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
+ SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
+ SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
+ SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
+ SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
+ SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
+ SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
+ SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
+ SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
+ SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
+ SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
+ SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
+ SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
+ SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
+ SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
+ SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
+ S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
+
+ write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+ write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+ ret;
+ENDPROC(__serpent_dec_blk16)
+
+ENTRY(serpent_ecb_enc_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_enc_blk16;
+
+ store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_ecb_enc_16way)
+
+ENTRY(serpent_ecb_dec_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_dec_blk16;
+
+ store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_ecb_dec_16way)
+
+ENTRY(serpent_cbc_dec_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __serpent_dec_blk16;
+
+ store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
+ RK0);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_cbc_dec_16way)
+
+ENTRY(serpent_ctr_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (little endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+ RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
+ tp);
+
+ call __serpent_enc_blk16;
+
+ store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_ctr_16way)
+
+ENTRY(serpent_xts_enc_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+ RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
+ .Lxts_gf128mul_and_shl1_mask_0,
+ .Lxts_gf128mul_and_shl1_mask_1);
+
+ call __serpent_enc_blk16;
+
+ store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_xts_enc_16way)
+
+ENTRY(serpent_xts_dec_16way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+ FRAME_BEGIN
+
+ vzeroupper;
+
+ load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+ RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
+ .Lxts_gf128mul_and_shl1_mask_0,
+ .Lxts_gf128mul_and_shl1_mask_1);
+
+ call __serpent_dec_blk16;
+
+ store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+ vzeroupper;
+
+ FRAME_END
+ ret;
+ENDPROC(serpent_xts_dec_16way)
diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
new file mode 100644
index 000000000..d348f1553
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
@@ -0,0 +1,631 @@
+/*
+ * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on crypto/serpent.c by
+ * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
+ * 2003 Herbert Valerio Riedel <hvr@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/linkage.h>
+
+.file "serpent-sse2-i586-asm_32.S"
+.text
+
+#define arg_ctx 4
+#define arg_dst 8
+#define arg_src 12
+#define arg_xor 16
+
+/**********************************************************************
+ 4-way SSE2 serpent
+ **********************************************************************/
+#define CTX %edx
+
+#define RA %xmm0
+#define RB %xmm1
+#define RC %xmm2
+#define RD %xmm3
+#define RE %xmm4
+
+#define RT0 %xmm5
+#define RT1 %xmm6
+
+#define RNOT %xmm7
+
+#define get_key(i, j, t) \
+ movd (4*(i)+(j))*4(CTX), t; \
+ pshufd $0, t, t;
+
+#define K(x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, x4); \
+ get_key(i, 1, RT0); \
+ get_key(i, 2, RT1); \
+ pxor x4, x0; \
+ pxor RT0, x1; \
+ pxor RT1, x2; \
+ get_key(i, 3, x4); \
+ pxor x4, x3;
+
+#define LK(x0, x1, x2, x3, x4, i) \
+ movdqa x0, x4; \
+ pslld $13, x0; \
+ psrld $(32 - 13), x4; \
+ por x4, x0; \
+ pxor x0, x1; \
+ movdqa x2, x4; \
+ pslld $3, x2; \
+ psrld $(32 - 3), x4; \
+ por x4, x2; \
+ pxor x2, x1; \
+ movdqa x1, x4; \
+ pslld $1, x1; \
+ psrld $(32 - 1), x4; \
+ por x4, x1; \
+ movdqa x0, x4; \
+ pslld $3, x4; \
+ pxor x2, x3; \
+ pxor x4, x3; \
+ movdqa x3, x4; \
+ pslld $7, x3; \
+ psrld $(32 - 7), x4; \
+ por x4, x3; \
+ movdqa x1, x4; \
+ pslld $7, x4; \
+ pxor x1, x0; \
+ pxor x3, x0; \
+ pxor x3, x2; \
+ pxor x4, x2; \
+ movdqa x0, x4; \
+ get_key(i, 1, RT0); \
+ pxor RT0, x1; \
+ get_key(i, 3, RT0); \
+ pxor RT0, x3; \
+ pslld $5, x0; \
+ psrld $(32 - 5), x4; \
+ por x4, x0; \
+ movdqa x2, x4; \
+ pslld $22, x2; \
+ psrld $(32 - 22), x4; \
+ por x4, x2; \
+ get_key(i, 0, RT0); \
+ pxor RT0, x0; \
+ get_key(i, 2, RT0); \
+ pxor RT0, x2;
+
+#define KL(x0, x1, x2, x3, x4, i) \
+ K(x0, x1, x2, x3, x4, i); \
+ movdqa x0, x4; \
+ psrld $5, x0; \
+ pslld $(32 - 5), x4; \
+ por x4, x0; \
+ movdqa x2, x4; \
+ psrld $22, x2; \
+ pslld $(32 - 22), x4; \
+ por x4, x2; \
+ pxor x3, x2; \
+ pxor x3, x0; \
+ movdqa x1, x4; \
+ pslld $7, x4; \
+ pxor x1, x0; \
+ pxor x4, x2; \
+ movdqa x1, x4; \
+ psrld $1, x1; \
+ pslld $(32 - 1), x4; \
+ por x4, x1; \
+ movdqa x3, x4; \
+ psrld $7, x3; \
+ pslld $(32 - 7), x4; \
+ por x4, x3; \
+ pxor x0, x1; \
+ movdqa x0, x4; \
+ pslld $3, x4; \
+ pxor x4, x3; \
+ movdqa x0, x4; \
+ psrld $13, x0; \
+ pslld $(32 - 13), x4; \
+ por x4, x0; \
+ pxor x2, x1; \
+ pxor x2, x3; \
+ movdqa x2, x4; \
+ psrld $3, x2; \
+ pslld $(32 - 3), x4; \
+ por x4, x2;
+
+#define S0(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ por x0, x3; \
+ pxor x4, x0; \
+ pxor x2, x4; \
+ pxor RNOT, x4; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x4, x1; \
+ pxor x0, x2; \
+ pxor x3, x0; \
+ por x0, x4; \
+ pxor x2, x0; \
+ pand x1, x2; \
+ pxor x2, x3; \
+ pxor RNOT, x1; \
+ pxor x4, x2; \
+ pxor x2, x1;
+
+#define S1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x0, x1; \
+ pxor x3, x0; \
+ pxor RNOT, x3; \
+ pand x1, x4; \
+ por x1, x0; \
+ pxor x2, x3; \
+ pxor x3, x0; \
+ pxor x3, x1; \
+ pxor x4, x3; \
+ por x4, x1; \
+ pxor x2, x4; \
+ pand x0, x2; \
+ pxor x1, x2; \
+ por x0, x1; \
+ pxor RNOT, x0; \
+ pxor x2, x0; \
+ pxor x1, x4;
+
+#define S2(x0, x1, x2, x3, x4) \
+ pxor RNOT, x3; \
+ pxor x0, x1; \
+ movdqa x0, x4; \
+ pand x2, x0; \
+ pxor x3, x0; \
+ por x4, x3; \
+ pxor x1, x2; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x2, x0; \
+ pand x3, x2; \
+ por x1, x3; \
+ pxor RNOT, x0; \
+ pxor x0, x3; \
+ pxor x0, x4; \
+ pxor x2, x0; \
+ por x2, x1;
+
+#define S3(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x3, x1; \
+ por x0, x3; \
+ pand x0, x4; \
+ pxor x2, x0; \
+ pxor x1, x2; \
+ pand x3, x1; \
+ pxor x3, x2; \
+ por x4, x0; \
+ pxor x3, x4; \
+ pxor x0, x1; \
+ pand x3, x0; \
+ pand x4, x3; \
+ pxor x2, x3; \
+ por x1, x4; \
+ pand x1, x2; \
+ pxor x3, x4; \
+ pxor x3, x0; \
+ pxor x2, x3;
+
+#define S4(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pand x0, x3; \
+ pxor x4, x0; \
+ pxor x2, x3; \
+ por x4, x2; \
+ pxor x1, x0; \
+ pxor x3, x4; \
+ por x0, x2; \
+ pxor x1, x2; \
+ pand x0, x1; \
+ pxor x4, x1; \
+ pand x2, x4; \
+ pxor x3, x2; \
+ pxor x0, x4; \
+ por x1, x3; \
+ pxor RNOT, x1; \
+ pxor x0, x3;
+
+#define S5(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ por x0, x1; \
+ pxor x1, x2; \
+ pxor RNOT, x3; \
+ pxor x0, x4; \
+ pxor x2, x0; \
+ pand x4, x1; \
+ por x3, x4; \
+ pxor x0, x4; \
+ pand x3, x0; \
+ pxor x3, x1; \
+ pxor x2, x3; \
+ pxor x1, x0; \
+ pand x4, x2; \
+ pxor x2, x1; \
+ pand x0, x2; \
+ pxor x2, x3;
+
+#define S6(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x0, x3; \
+ pxor x2, x1; \
+ pxor x0, x2; \
+ pand x3, x0; \
+ por x3, x1; \
+ pxor RNOT, x4; \
+ pxor x1, x0; \
+ pxor x2, x1; \
+ pxor x4, x3; \
+ pxor x0, x4; \
+ pand x0, x2; \
+ pxor x1, x4; \
+ pxor x3, x2; \
+ pand x1, x3; \
+ pxor x0, x3; \
+ pxor x2, x1;
+
+#define S7(x0, x1, x2, x3, x4) \
+ pxor RNOT, x1; \
+ movdqa x1, x4; \
+ pxor RNOT, x0; \
+ pand x2, x1; \
+ pxor x3, x1; \
+ por x4, x3; \
+ pxor x2, x4; \
+ pxor x3, x2; \
+ pxor x0, x3; \
+ por x1, x0; \
+ pand x0, x2; \
+ pxor x4, x0; \
+ pxor x3, x4; \
+ pand x0, x3; \
+ pxor x1, x4; \
+ pxor x4, x2; \
+ pxor x1, x3; \
+ por x0, x4; \
+ pxor x1, x4;
+
+#define SI0(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pxor x0, x1; \
+ por x1, x3; \
+ pxor x1, x4; \
+ pxor RNOT, x0; \
+ pxor x3, x2; \
+ pxor x0, x3; \
+ pand x1, x0; \
+ pxor x2, x0; \
+ pand x3, x2; \
+ pxor x4, x3; \
+ pxor x3, x2; \
+ pxor x3, x1; \
+ pand x0, x3; \
+ pxor x0, x1; \
+ pxor x2, x0; \
+ pxor x3, x4;
+
+#define SI1(x0, x1, x2, x3, x4) \
+ pxor x3, x1; \
+ movdqa x0, x4; \
+ pxor x2, x0; \
+ pxor RNOT, x2; \
+ por x1, x4; \
+ pxor x3, x4; \
+ pand x1, x3; \
+ pxor x2, x1; \
+ pand x4, x2; \
+ pxor x1, x4; \
+ por x3, x1; \
+ pxor x0, x3; \
+ pxor x0, x2; \
+ por x4, x0; \
+ pxor x4, x2; \
+ pxor x0, x1; \
+ pxor x1, x4;
+
+#define SI2(x0, x1, x2, x3, x4) \
+ pxor x1, x2; \
+ movdqa x3, x4; \
+ pxor RNOT, x3; \
+ por x2, x3; \
+ pxor x4, x2; \
+ pxor x0, x4; \
+ pxor x1, x3; \
+ por x2, x1; \
+ pxor x0, x2; \
+ pxor x4, x1; \
+ por x3, x4; \
+ pxor x3, x2; \
+ pxor x2, x4; \
+ pand x1, x2; \
+ pxor x3, x2; \
+ pxor x4, x3; \
+ pxor x0, x4;
+
+#define SI3(x0, x1, x2, x3, x4) \
+ pxor x1, x2; \
+ movdqa x1, x4; \
+ pand x2, x1; \
+ pxor x0, x1; \
+ por x4, x0; \
+ pxor x3, x4; \
+ pxor x3, x0; \
+ por x1, x3; \
+ pxor x2, x1; \
+ pxor x3, x1; \
+ pxor x2, x0; \
+ pxor x3, x2; \
+ pand x1, x3; \
+ pxor x0, x1; \
+ pand x2, x0; \
+ pxor x3, x4; \
+ pxor x0, x3; \
+ pxor x1, x0;
+
+#define SI4(x0, x1, x2, x3, x4) \
+ pxor x3, x2; \
+ movdqa x0, x4; \
+ pand x1, x0; \
+ pxor x2, x0; \
+ por x3, x2; \
+ pxor RNOT, x4; \
+ pxor x0, x1; \
+ pxor x2, x0; \
+ pand x4, x2; \
+ pxor x0, x2; \
+ por x4, x0; \
+ pxor x3, x0; \
+ pand x2, x3; \
+ pxor x3, x4; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x1, x4; \
+ pxor x3, x0;
+
+#define SI5(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ por x2, x1; \
+ pxor x4, x2; \
+ pxor x3, x1; \
+ pand x4, x3; \
+ pxor x3, x2; \
+ por x0, x3; \
+ pxor RNOT, x0; \
+ pxor x2, x3; \
+ por x0, x2; \
+ pxor x1, x4; \
+ pxor x4, x2; \
+ pand x0, x4; \
+ pxor x1, x0; \
+ pxor x3, x1; \
+ pand x2, x0; \
+ pxor x3, x2; \
+ pxor x2, x0; \
+ pxor x4, x2; \
+ pxor x3, x4;
+
+#define SI6(x0, x1, x2, x3, x4) \
+ pxor x2, x0; \
+ movdqa x0, x4; \
+ pand x3, x0; \
+ pxor x3, x2; \
+ pxor x2, x0; \
+ pxor x1, x3; \
+ por x4, x2; \
+ pxor x3, x2; \
+ pand x0, x3; \
+ pxor RNOT, x0; \
+ pxor x1, x3; \
+ pand x2, x1; \
+ pxor x0, x4; \
+ pxor x4, x3; \
+ pxor x2, x4; \
+ pxor x1, x0; \
+ pxor x0, x2;
+
+#define SI7(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pand x0, x3; \
+ pxor x2, x0; \
+ por x4, x2; \
+ pxor x1, x4; \
+ pxor RNOT, x0; \
+ por x3, x1; \
+ pxor x0, x4; \
+ pand x2, x0; \
+ pxor x1, x0; \
+ pand x2, x1; \
+ pxor x2, x3; \
+ pxor x3, x4; \
+ pand x3, x2; \
+ por x0, x3; \
+ pxor x4, x1; \
+ pxor x4, x3; \
+ pand x0, x4; \
+ pxor x2, x4;
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ movdqa x0, t2; \
+ punpckldq x1, x0; \
+ punpckhdq x1, t2; \
+ movdqa x2, t1; \
+ punpckhdq x3, x2; \
+ punpckldq x3, t1; \
+ movdqa x0, x1; \
+ punpcklqdq t1, x0; \
+ punpckhqdq t1, x1; \
+ movdqa t2, x3; \
+ punpcklqdq x2, t2; \
+ punpckhqdq x2, x3; \
+ movdqa t2, x2;
+
+#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
+ movdqu (0*4*4)(in), x0; \
+ movdqu (1*4*4)(in), x1; \
+ movdqu (2*4*4)(in), x2; \
+ movdqu (3*4*4)(in), x3; \
+ \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ movdqu x0, (0*4*4)(out); \
+ movdqu x1, (1*4*4)(out); \
+ movdqu x2, (2*4*4)(out); \
+ movdqu x3, (3*4*4)(out);
+
+#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ movdqu (0*4*4)(out), t0; \
+ pxor t0, x0; \
+ movdqu x0, (0*4*4)(out); \
+ movdqu (1*4*4)(out), t0; \
+ pxor t0, x1; \
+ movdqu x1, (1*4*4)(out); \
+ movdqu (2*4*4)(out), t0; \
+ pxor t0, x2; \
+ movdqu x2, (2*4*4)(out); \
+ movdqu (3*4*4)(out), t0; \
+ pxor t0, x3; \
+ movdqu x3, (3*4*4)(out);
+
+ENTRY(__serpent_enc_blk_4way)
+ /* input:
+ * arg_ctx(%esp): ctx, CTX
+ * arg_dst(%esp): dst
+ * arg_src(%esp): src
+ * arg_xor(%esp): bool, if true: xor output
+ */
+
+ pcmpeqd RNOT, RNOT;
+
+ movl arg_ctx(%esp), CTX;
+
+ movl arg_src(%esp), %eax;
+ read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+ K(RA, RB, RC, RD, RE, 0);
+ S0(RA, RB, RC, RD, RE); LK(RC, RB, RD, RA, RE, 1);
+ S1(RC, RB, RD, RA, RE); LK(RE, RD, RA, RC, RB, 2);
+ S2(RE, RD, RA, RC, RB); LK(RB, RD, RE, RC, RA, 3);
+ S3(RB, RD, RE, RC, RA); LK(RC, RA, RD, RB, RE, 4);
+ S4(RC, RA, RD, RB, RE); LK(RA, RD, RB, RE, RC, 5);
+ S5(RA, RD, RB, RE, RC); LK(RC, RA, RD, RE, RB, 6);
+ S6(RC, RA, RD, RE, RB); LK(RD, RB, RA, RE, RC, 7);
+ S7(RD, RB, RA, RE, RC); LK(RC, RA, RE, RD, RB, 8);
+ S0(RC, RA, RE, RD, RB); LK(RE, RA, RD, RC, RB, 9);
+ S1(RE, RA, RD, RC, RB); LK(RB, RD, RC, RE, RA, 10);
+ S2(RB, RD, RC, RE, RA); LK(RA, RD, RB, RE, RC, 11);
+ S3(RA, RD, RB, RE, RC); LK(RE, RC, RD, RA, RB, 12);
+ S4(RE, RC, RD, RA, RB); LK(RC, RD, RA, RB, RE, 13);
+ S5(RC, RD, RA, RB, RE); LK(RE, RC, RD, RB, RA, 14);
+ S6(RE, RC, RD, RB, RA); LK(RD, RA, RC, RB, RE, 15);
+ S7(RD, RA, RC, RB, RE); LK(RE, RC, RB, RD, RA, 16);
+ S0(RE, RC, RB, RD, RA); LK(RB, RC, RD, RE, RA, 17);
+ S1(RB, RC, RD, RE, RA); LK(RA, RD, RE, RB, RC, 18);
+ S2(RA, RD, RE, RB, RC); LK(RC, RD, RA, RB, RE, 19);
+ S3(RC, RD, RA, RB, RE); LK(RB, RE, RD, RC, RA, 20);
+ S4(RB, RE, RD, RC, RA); LK(RE, RD, RC, RA, RB, 21);
+ S5(RE, RD, RC, RA, RB); LK(RB, RE, RD, RA, RC, 22);
+ S6(RB, RE, RD, RA, RC); LK(RD, RC, RE, RA, RB, 23);
+ S7(RD, RC, RE, RA, RB); LK(RB, RE, RA, RD, RC, 24);
+ S0(RB, RE, RA, RD, RC); LK(RA, RE, RD, RB, RC, 25);
+ S1(RA, RE, RD, RB, RC); LK(RC, RD, RB, RA, RE, 26);
+ S2(RC, RD, RB, RA, RE); LK(RE, RD, RC, RA, RB, 27);
+ S3(RE, RD, RC, RA, RB); LK(RA, RB, RD, RE, RC, 28);
+ S4(RA, RB, RD, RE, RC); LK(RB, RD, RE, RC, RA, 29);
+ S5(RB, RD, RE, RC, RA); LK(RA, RB, RD, RC, RE, 30);
+ S6(RA, RB, RD, RC, RE); LK(RD, RE, RB, RC, RA, 31);
+ S7(RD, RE, RB, RC, RA); K(RA, RB, RC, RD, RE, 32);
+
+ movl arg_dst(%esp), %eax;
+
+ cmpb $0, arg_xor(%esp);
+ jnz .L__enc_xor4;
+
+ write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+ ret;
+
+.L__enc_xor4:
+ xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+ ret;
+ENDPROC(__serpent_enc_blk_4way)
+
+ENTRY(serpent_dec_blk_4way)
+ /* input:
+ * arg_ctx(%esp): ctx, CTX
+ * arg_dst(%esp): dst
+ * arg_src(%esp): src
+ */
+
+ pcmpeqd RNOT, RNOT;
+
+ movl arg_ctx(%esp), CTX;
+
+ movl arg_src(%esp), %eax;
+ read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+ K(RA, RB, RC, RD, RE, 32);
+ SI7(RA, RB, RC, RD, RE); KL(RB, RD, RA, RE, RC, 31);
+ SI6(RB, RD, RA, RE, RC); KL(RA, RC, RE, RB, RD, 30);
+ SI5(RA, RC, RE, RB, RD); KL(RC, RD, RA, RE, RB, 29);
+ SI4(RC, RD, RA, RE, RB); KL(RC, RA, RB, RE, RD, 28);
+ SI3(RC, RA, RB, RE, RD); KL(RB, RC, RD, RE, RA, 27);
+ SI2(RB, RC, RD, RE, RA); KL(RC, RA, RE, RD, RB, 26);
+ SI1(RC, RA, RE, RD, RB); KL(RB, RA, RE, RD, RC, 25);
+ SI0(RB, RA, RE, RD, RC); KL(RE, RC, RA, RB, RD, 24);
+ SI7(RE, RC, RA, RB, RD); KL(RC, RB, RE, RD, RA, 23);
+ SI6(RC, RB, RE, RD, RA); KL(RE, RA, RD, RC, RB, 22);
+ SI5(RE, RA, RD, RC, RB); KL(RA, RB, RE, RD, RC, 21);
+ SI4(RA, RB, RE, RD, RC); KL(RA, RE, RC, RD, RB, 20);
+ SI3(RA, RE, RC, RD, RB); KL(RC, RA, RB, RD, RE, 19);
+ SI2(RC, RA, RB, RD, RE); KL(RA, RE, RD, RB, RC, 18);
+ SI1(RA, RE, RD, RB, RC); KL(RC, RE, RD, RB, RA, 17);
+ SI0(RC, RE, RD, RB, RA); KL(RD, RA, RE, RC, RB, 16);
+ SI7(RD, RA, RE, RC, RB); KL(RA, RC, RD, RB, RE, 15);
+ SI6(RA, RC, RD, RB, RE); KL(RD, RE, RB, RA, RC, 14);
+ SI5(RD, RE, RB, RA, RC); KL(RE, RC, RD, RB, RA, 13);
+ SI4(RE, RC, RD, RB, RA); KL(RE, RD, RA, RB, RC, 12);
+ SI3(RE, RD, RA, RB, RC); KL(RA, RE, RC, RB, RD, 11);
+ SI2(RA, RE, RC, RB, RD); KL(RE, RD, RB, RC, RA, 10);
+ SI1(RE, RD, RB, RC, RA); KL(RA, RD, RB, RC, RE, 9);
+ SI0(RA, RD, RB, RC, RE); KL(RB, RE, RD, RA, RC, 8);
+ SI7(RB, RE, RD, RA, RC); KL(RE, RA, RB, RC, RD, 7);
+ SI6(RE, RA, RB, RC, RD); KL(RB, RD, RC, RE, RA, 6);
+ SI5(RB, RD, RC, RE, RA); KL(RD, RA, RB, RC, RE, 5);
+ SI4(RD, RA, RB, RC, RE); KL(RD, RB, RE, RC, RA, 4);
+ SI3(RD, RB, RE, RC, RA); KL(RE, RD, RA, RC, RB, 3);
+ SI2(RE, RD, RA, RC, RB); KL(RD, RB, RC, RA, RE, 2);
+ SI1(RD, RB, RC, RA, RE); KL(RE, RB, RC, RA, RD, 1);
+ SI0(RE, RB, RC, RA, RD); K(RC, RD, RB, RE, RA, 0);
+
+ movl arg_dst(%esp), %eax;
+ write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
+
+ ret;
+ENDPROC(serpent_dec_blk_4way)
diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
new file mode 100644
index 000000000..acc066c7c
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
@@ -0,0 +1,754 @@
+/*
+ * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on crypto/serpent.c by
+ * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
+ * 2003 Herbert Valerio Riedel <hvr@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/linkage.h>
+
+.file "serpent-sse2-x86_64-asm_64.S"
+.text
+
+#define CTX %rdi
+
+/**********************************************************************
+ 8-way SSE2 serpent
+ **********************************************************************/
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+#define RE1 %xmm4
+
+#define RA2 %xmm5
+#define RB2 %xmm6
+#define RC2 %xmm7
+#define RD2 %xmm8
+#define RE2 %xmm9
+
+#define RNOT %xmm10
+
+#define RK0 %xmm11
+#define RK1 %xmm12
+#define RK2 %xmm13
+#define RK3 %xmm14
+
+#define S0_1(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ por x0, x3; \
+ pxor x4, x0; \
+ pxor x2, x4; \
+ pxor RNOT, x4; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x4, x1; \
+ pxor x0, x2;
+#define S0_2(x0, x1, x2, x3, x4) \
+ pxor x3, x0; \
+ por x0, x4; \
+ pxor x2, x0; \
+ pand x1, x2; \
+ pxor x2, x3; \
+ pxor RNOT, x1; \
+ pxor x4, x2; \
+ pxor x2, x1;
+
+#define S1_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x0, x1; \
+ pxor x3, x0; \
+ pxor RNOT, x3; \
+ pand x1, x4; \
+ por x1, x0; \
+ pxor x2, x3; \
+ pxor x3, x0; \
+ pxor x3, x1;
+#define S1_2(x0, x1, x2, x3, x4) \
+ pxor x4, x3; \
+ por x4, x1; \
+ pxor x2, x4; \
+ pand x0, x2; \
+ pxor x1, x2; \
+ por x0, x1; \
+ pxor RNOT, x0; \
+ pxor x2, x0; \
+ pxor x1, x4;
+
+#define S2_1(x0, x1, x2, x3, x4) \
+ pxor RNOT, x3; \
+ pxor x0, x1; \
+ movdqa x0, x4; \
+ pand x2, x0; \
+ pxor x3, x0; \
+ por x4, x3; \
+ pxor x1, x2; \
+ pxor x1, x3; \
+ pand x0, x1;
+#define S2_2(x0, x1, x2, x3, x4) \
+ pxor x2, x0; \
+ pand x3, x2; \
+ por x1, x3; \
+ pxor RNOT, x0; \
+ pxor x0, x3; \
+ pxor x0, x4; \
+ pxor x2, x0; \
+ por x2, x1;
+
+#define S3_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x3, x1; \
+ por x0, x3; \
+ pand x0, x4; \
+ pxor x2, x0; \
+ pxor x1, x2; \
+ pand x3, x1; \
+ pxor x3, x2; \
+ por x4, x0; \
+ pxor x3, x4;
+#define S3_2(x0, x1, x2, x3, x4) \
+ pxor x0, x1; \
+ pand x3, x0; \
+ pand x4, x3; \
+ pxor x2, x3; \
+ por x1, x4; \
+ pand x1, x2; \
+ pxor x3, x4; \
+ pxor x3, x0; \
+ pxor x2, x3;
+
+#define S4_1(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pand x0, x3; \
+ pxor x4, x0; \
+ pxor x2, x3; \
+ por x4, x2; \
+ pxor x1, x0; \
+ pxor x3, x4; \
+ por x0, x2; \
+ pxor x1, x2;
+#define S4_2(x0, x1, x2, x3, x4) \
+ pand x0, x1; \
+ pxor x4, x1; \
+ pand x2, x4; \
+ pxor x3, x2; \
+ pxor x0, x4; \
+ por x1, x3; \
+ pxor RNOT, x1; \
+ pxor x0, x3;
+
+#define S5_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ por x0, x1; \
+ pxor x1, x2; \
+ pxor RNOT, x3; \
+ pxor x0, x4; \
+ pxor x2, x0; \
+ pand x4, x1; \
+ por x3, x4; \
+ pxor x0, x4;
+#define S5_2(x0, x1, x2, x3, x4) \
+ pand x3, x0; \
+ pxor x3, x1; \
+ pxor x2, x3; \
+ pxor x1, x0; \
+ pand x4, x2; \
+ pxor x2, x1; \
+ pand x0, x2; \
+ pxor x2, x3;
+
+#define S6_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x0, x3; \
+ pxor x2, x1; \
+ pxor x0, x2; \
+ pand x3, x0; \
+ por x3, x1; \
+ pxor RNOT, x4; \
+ pxor x1, x0; \
+ pxor x2, x1;
+#define S6_2(x0, x1, x2, x3, x4) \
+ pxor x4, x3; \
+ pxor x0, x4; \
+ pand x0, x2; \
+ pxor x1, x4; \
+ pxor x3, x2; \
+ pand x1, x3; \
+ pxor x0, x3; \
+ pxor x2, x1;
+
+#define S7_1(x0, x1, x2, x3, x4) \
+ pxor RNOT, x1; \
+ movdqa x1, x4; \
+ pxor RNOT, x0; \
+ pand x2, x1; \
+ pxor x3, x1; \
+ por x4, x3; \
+ pxor x2, x4; \
+ pxor x3, x2; \
+ pxor x0, x3; \
+ por x1, x0;
+#define S7_2(x0, x1, x2, x3, x4) \
+ pand x0, x2; \
+ pxor x4, x0; \
+ pxor x3, x4; \
+ pand x0, x3; \
+ pxor x1, x4; \
+ pxor x4, x2; \
+ pxor x1, x3; \
+ por x0, x4; \
+ pxor x1, x4;
+
+#define SI0_1(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pxor x0, x1; \
+ por x1, x3; \
+ pxor x1, x4; \
+ pxor RNOT, x0; \
+ pxor x3, x2; \
+ pxor x0, x3; \
+ pand x1, x0; \
+ pxor x2, x0;
+#define SI0_2(x0, x1, x2, x3, x4) \
+ pand x3, x2; \
+ pxor x4, x3; \
+ pxor x3, x2; \
+ pxor x3, x1; \
+ pand x0, x3; \
+ pxor x0, x1; \
+ pxor x2, x0; \
+ pxor x3, x4;
+
+#define SI1_1(x0, x1, x2, x3, x4) \
+ pxor x3, x1; \
+ movdqa x0, x4; \
+ pxor x2, x0; \
+ pxor RNOT, x2; \
+ por x1, x4; \
+ pxor x3, x4; \
+ pand x1, x3; \
+ pxor x2, x1; \
+ pand x4, x2;
+#define SI1_2(x0, x1, x2, x3, x4) \
+ pxor x1, x4; \
+ por x3, x1; \
+ pxor x0, x3; \
+ pxor x0, x2; \
+ por x4, x0; \
+ pxor x4, x2; \
+ pxor x0, x1; \
+ pxor x1, x4;
+
+#define SI2_1(x0, x1, x2, x3, x4) \
+ pxor x1, x2; \
+ movdqa x3, x4; \
+ pxor RNOT, x3; \
+ por x2, x3; \
+ pxor x4, x2; \
+ pxor x0, x4; \
+ pxor x1, x3; \
+ por x2, x1; \
+ pxor x0, x2;
+#define SI2_2(x0, x1, x2, x3, x4) \
+ pxor x4, x1; \
+ por x3, x4; \
+ pxor x3, x2; \
+ pxor x2, x4; \
+ pand x1, x2; \
+ pxor x3, x2; \
+ pxor x4, x3; \
+ pxor x0, x4;
+
+#define SI3_1(x0, x1, x2, x3, x4) \
+ pxor x1, x2; \
+ movdqa x1, x4; \
+ pand x2, x1; \
+ pxor x0, x1; \
+ por x4, x0; \
+ pxor x3, x4; \
+ pxor x3, x0; \
+ por x1, x3; \
+ pxor x2, x1;
+#define SI3_2(x0, x1, x2, x3, x4) \
+ pxor x3, x1; \
+ pxor x2, x0; \
+ pxor x3, x2; \
+ pand x1, x3; \
+ pxor x0, x1; \
+ pand x2, x0; \
+ pxor x3, x4; \
+ pxor x0, x3; \
+ pxor x1, x0;
+
+#define SI4_1(x0, x1, x2, x3, x4) \
+ pxor x3, x2; \
+ movdqa x0, x4; \
+ pand x1, x0; \
+ pxor x2, x0; \
+ por x3, x2; \
+ pxor RNOT, x4; \
+ pxor x0, x1; \
+ pxor x2, x0; \
+ pand x4, x2;
+#define SI4_2(x0, x1, x2, x3, x4) \
+ pxor x0, x2; \
+ por x4, x0; \
+ pxor x3, x0; \
+ pand x2, x3; \
+ pxor x3, x4; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x1, x4; \
+ pxor x3, x0;
+
+#define SI5_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ por x2, x1; \
+ pxor x4, x2; \
+ pxor x3, x1; \
+ pand x4, x3; \
+ pxor x3, x2; \
+ por x0, x3; \
+ pxor RNOT, x0; \
+ pxor x2, x3; \
+ por x0, x2;
+#define SI5_2(x0, x1, x2, x3, x4) \
+ pxor x1, x4; \
+ pxor x4, x2; \
+ pand x0, x4; \
+ pxor x1, x0; \
+ pxor x3, x1; \
+ pand x2, x0; \
+ pxor x3, x2; \
+ pxor x2, x0; \
+ pxor x4, x2; \
+ pxor x3, x4;
+
+#define SI6_1(x0, x1, x2, x3, x4) \
+ pxor x2, x0; \
+ movdqa x0, x4; \
+ pand x3, x0; \
+ pxor x3, x2; \
+ pxor x2, x0; \
+ pxor x1, x3; \
+ por x4, x2; \
+ pxor x3, x2; \
+ pand x0, x3;
+#define SI6_2(x0, x1, x2, x3, x4) \
+ pxor RNOT, x0; \
+ pxor x1, x3; \
+ pand x2, x1; \
+ pxor x0, x4; \
+ pxor x4, x3; \
+ pxor x2, x4; \
+ pxor x1, x0; \
+ pxor x0, x2;
+
+#define SI7_1(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pand x0, x3; \
+ pxor x2, x0; \
+ por x4, x2; \
+ pxor x1, x4; \
+ pxor RNOT, x0; \
+ por x3, x1; \
+ pxor x0, x4; \
+ pand x2, x0; \
+ pxor x1, x0;
+#define SI7_2(x0, x1, x2, x3, x4) \
+ pand x2, x1; \
+ pxor x2, x3; \
+ pxor x3, x4; \
+ pand x3, x2; \
+ por x0, x3; \
+ pxor x4, x1; \
+ pxor x4, x3; \
+ pand x0, x4; \
+ pxor x2, x4;
+
+#define get_key(i, j, t) \
+ movd (4*(i)+(j))*4(CTX), t; \
+ pshufd $0, t, t;
+
+#define K2(x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ get_key(i, 1, RK1); \
+ get_key(i, 2, RK2); \
+ get_key(i, 3, RK3); \
+ pxor RK0, x0 ## 1; \
+ pxor RK1, x1 ## 1; \
+ pxor RK2, x2 ## 1; \
+ pxor RK3, x3 ## 1; \
+ pxor RK0, x0 ## 2; \
+ pxor RK1, x1 ## 2; \
+ pxor RK2, x2 ## 2; \
+ pxor RK3, x3 ## 2;
+
+#define LK2(x0, x1, x2, x3, x4, i) \
+ movdqa x0 ## 1, x4 ## 1; \
+ pslld $13, x0 ## 1; \
+ psrld $(32 - 13), x4 ## 1; \
+ por x4 ## 1, x0 ## 1; \
+ pxor x0 ## 1, x1 ## 1; \
+ movdqa x2 ## 1, x4 ## 1; \
+ pslld $3, x2 ## 1; \
+ psrld $(32 - 3), x4 ## 1; \
+ por x4 ## 1, x2 ## 1; \
+ pxor x2 ## 1, x1 ## 1; \
+ movdqa x0 ## 2, x4 ## 2; \
+ pslld $13, x0 ## 2; \
+ psrld $(32 - 13), x4 ## 2; \
+ por x4 ## 2, x0 ## 2; \
+ pxor x0 ## 2, x1 ## 2; \
+ movdqa x2 ## 2, x4 ## 2; \
+ pslld $3, x2 ## 2; \
+ psrld $(32 - 3), x4 ## 2; \
+ por x4 ## 2, x2 ## 2; \
+ pxor x2 ## 2, x1 ## 2; \
+ movdqa x1 ## 1, x4 ## 1; \
+ pslld $1, x1 ## 1; \
+ psrld $(32 - 1), x4 ## 1; \
+ por x4 ## 1, x1 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ pslld $3, x4 ## 1; \
+ pxor x2 ## 1, x3 ## 1; \
+ pxor x4 ## 1, x3 ## 1; \
+ movdqa x3 ## 1, x4 ## 1; \
+ get_key(i, 1, RK1); \
+ movdqa x1 ## 2, x4 ## 2; \
+ pslld $1, x1 ## 2; \
+ psrld $(32 - 1), x4 ## 2; \
+ por x4 ## 2, x1 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ pslld $3, x4 ## 2; \
+ pxor x2 ## 2, x3 ## 2; \
+ pxor x4 ## 2, x3 ## 2; \
+ movdqa x3 ## 2, x4 ## 2; \
+ get_key(i, 3, RK3); \
+ pslld $7, x3 ## 1; \
+ psrld $(32 - 7), x4 ## 1; \
+ por x4 ## 1, x3 ## 1; \
+ movdqa x1 ## 1, x4 ## 1; \
+ pslld $7, x4 ## 1; \
+ pxor x1 ## 1, x0 ## 1; \
+ pxor x3 ## 1, x0 ## 1; \
+ pxor x3 ## 1, x2 ## 1; \
+ pxor x4 ## 1, x2 ## 1; \
+ get_key(i, 0, RK0); \
+ pslld $7, x3 ## 2; \
+ psrld $(32 - 7), x4 ## 2; \
+ por x4 ## 2, x3 ## 2; \
+ movdqa x1 ## 2, x4 ## 2; \
+ pslld $7, x4 ## 2; \
+ pxor x1 ## 2, x0 ## 2; \
+ pxor x3 ## 2, x0 ## 2; \
+ pxor x3 ## 2, x2 ## 2; \
+ pxor x4 ## 2, x2 ## 2; \
+ get_key(i, 2, RK2); \
+ pxor RK1, x1 ## 1; \
+ pxor RK3, x3 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ pslld $5, x0 ## 1; \
+ psrld $(32 - 5), x4 ## 1; \
+ por x4 ## 1, x0 ## 1; \
+ movdqa x2 ## 1, x4 ## 1; \
+ pslld $22, x2 ## 1; \
+ psrld $(32 - 22), x4 ## 1; \
+ por x4 ## 1, x2 ## 1; \
+ pxor RK0, x0 ## 1; \
+ pxor RK2, x2 ## 1; \
+ pxor RK1, x1 ## 2; \
+ pxor RK3, x3 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ pslld $5, x0 ## 2; \
+ psrld $(32 - 5), x4 ## 2; \
+ por x4 ## 2, x0 ## 2; \
+ movdqa x2 ## 2, x4 ## 2; \
+ pslld $22, x2 ## 2; \
+ psrld $(32 - 22), x4 ## 2; \
+ por x4 ## 2, x2 ## 2; \
+ pxor RK0, x0 ## 2; \
+ pxor RK2, x2 ## 2;
+
+#define KL2(x0, x1, x2, x3, x4, i) \
+ pxor RK0, x0 ## 1; \
+ pxor RK2, x2 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ psrld $5, x0 ## 1; \
+ pslld $(32 - 5), x4 ## 1; \
+ por x4 ## 1, x0 ## 1; \
+ pxor RK3, x3 ## 1; \
+ pxor RK1, x1 ## 1; \
+ movdqa x2 ## 1, x4 ## 1; \
+ psrld $22, x2 ## 1; \
+ pslld $(32 - 22), x4 ## 1; \
+ por x4 ## 1, x2 ## 1; \
+ pxor x3 ## 1, x2 ## 1; \
+ pxor RK0, x0 ## 2; \
+ pxor RK2, x2 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ psrld $5, x0 ## 2; \
+ pslld $(32 - 5), x4 ## 2; \
+ por x4 ## 2, x0 ## 2; \
+ pxor RK3, x3 ## 2; \
+ pxor RK1, x1 ## 2; \
+ movdqa x2 ## 2, x4 ## 2; \
+ psrld $22, x2 ## 2; \
+ pslld $(32 - 22), x4 ## 2; \
+ por x4 ## 2, x2 ## 2; \
+ pxor x3 ## 2, x2 ## 2; \
+ pxor x3 ## 1, x0 ## 1; \
+ movdqa x1 ## 1, x4 ## 1; \
+ pslld $7, x4 ## 1; \
+ pxor x1 ## 1, x0 ## 1; \
+ pxor x4 ## 1, x2 ## 1; \
+ movdqa x1 ## 1, x4 ## 1; \
+ psrld $1, x1 ## 1; \
+ pslld $(32 - 1), x4 ## 1; \
+ por x4 ## 1, x1 ## 1; \
+ pxor x3 ## 2, x0 ## 2; \
+ movdqa x1 ## 2, x4 ## 2; \
+ pslld $7, x4 ## 2; \
+ pxor x1 ## 2, x0 ## 2; \
+ pxor x4 ## 2, x2 ## 2; \
+ movdqa x1 ## 2, x4 ## 2; \
+ psrld $1, x1 ## 2; \
+ pslld $(32 - 1), x4 ## 2; \
+ por x4 ## 2, x1 ## 2; \
+ movdqa x3 ## 1, x4 ## 1; \
+ psrld $7, x3 ## 1; \
+ pslld $(32 - 7), x4 ## 1; \
+ por x4 ## 1, x3 ## 1; \
+ pxor x0 ## 1, x1 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ pslld $3, x4 ## 1; \
+ pxor x4 ## 1, x3 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ movdqa x3 ## 2, x4 ## 2; \
+ psrld $7, x3 ## 2; \
+ pslld $(32 - 7), x4 ## 2; \
+ por x4 ## 2, x3 ## 2; \
+ pxor x0 ## 2, x1 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ pslld $3, x4 ## 2; \
+ pxor x4 ## 2, x3 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ psrld $13, x0 ## 1; \
+ pslld $(32 - 13), x4 ## 1; \
+ por x4 ## 1, x0 ## 1; \
+ pxor x2 ## 1, x1 ## 1; \
+ pxor x2 ## 1, x3 ## 1; \
+ movdqa x2 ## 1, x4 ## 1; \
+ psrld $3, x2 ## 1; \
+ pslld $(32 - 3), x4 ## 1; \
+ por x4 ## 1, x2 ## 1; \
+ psrld $13, x0 ## 2; \
+ pslld $(32 - 13), x4 ## 2; \
+ por x4 ## 2, x0 ## 2; \
+ pxor x2 ## 2, x1 ## 2; \
+ pxor x2 ## 2, x3 ## 2; \
+ movdqa x2 ## 2, x4 ## 2; \
+ psrld $3, x2 ## 2; \
+ pslld $(32 - 3), x4 ## 2; \
+ por x4 ## 2, x2 ## 2;
+
+#define S(SBOX, x0, x1, x2, x3, x4) \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
+#define SP(SBOX, x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 2, RK2); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ get_key(i, 3, RK3); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 1, RK1); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ movdqa x0, t2; \
+ punpckldq x1, x0; \
+ punpckhdq x1, t2; \
+ movdqa x2, t1; \
+ punpckhdq x3, x2; \
+ punpckldq x3, t1; \
+ movdqa x0, x1; \
+ punpcklqdq t1, x0; \
+ punpckhqdq t1, x1; \
+ movdqa t2, x3; \
+ punpcklqdq x2, t2; \
+ punpckhqdq x2, x3; \
+ movdqa t2, x2;
+
+#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
+ movdqu (0*4*4)(in), x0; \
+ movdqu (1*4*4)(in), x1; \
+ movdqu (2*4*4)(in), x2; \
+ movdqu (3*4*4)(in), x3; \
+ \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ movdqu x0, (0*4*4)(out); \
+ movdqu x1, (1*4*4)(out); \
+ movdqu x2, (2*4*4)(out); \
+ movdqu x3, (3*4*4)(out);
+
+#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ movdqu (0*4*4)(out), t0; \
+ pxor t0, x0; \
+ movdqu x0, (0*4*4)(out); \
+ movdqu (1*4*4)(out), t0; \
+ pxor t0, x1; \
+ movdqu x1, (1*4*4)(out); \
+ movdqu (2*4*4)(out), t0; \
+ pxor t0, x2; \
+ movdqu x2, (2*4*4)(out); \
+ movdqu (3*4*4)(out), t0; \
+ pxor t0, x3; \
+ movdqu x3, (3*4*4)(out);
+
+ENTRY(__serpent_enc_blk_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: bool, if true: xor output
+ */
+
+ pcmpeqd RNOT, RNOT;
+
+ leaq (4*4*4)(%rdx), %rax;
+ read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 0);
+ S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
+ S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
+ S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
+ S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
+ S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
+ S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
+ S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
+ S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
+ S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
+ S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
+ S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
+ S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
+ S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
+ S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
+ S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
+ S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
+ S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
+ S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
+ S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
+ S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
+ S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
+ S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
+ S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
+ S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
+ S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
+ S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
+ S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
+ S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
+ S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
+ S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
+ S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
+ S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
+
+ leaq (4*4*4)(%rsi), %rax;
+
+ testb %cl, %cl;
+ jnz .L__enc_xor8;
+
+ write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ ret;
+
+.L__enc_xor8:
+ xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ ret;
+ENDPROC(__serpent_enc_blk_8way)
+
+ENTRY(serpent_dec_blk_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ pcmpeqd RNOT, RNOT;
+
+ leaq (4*4*4)(%rdx), %rax;
+ read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 32);
+ SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
+ SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
+ SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
+ SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
+ SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
+ SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
+ SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
+ SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
+ SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
+ SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
+ SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
+ SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
+ SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
+ SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
+ SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
+ SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
+ SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
+ SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
+ SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
+ SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
+ SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
+ SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
+ SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
+ SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
+ SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
+ SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
+ SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
+ SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
+ SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
+ SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
+ SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
+ S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
+
+ leaq (4*4*4)(%rsi), %rax;
+ write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+ write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+ ret;
+ENDPROC(serpent_dec_blk_8way)
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
new file mode 100644
index 000000000..03347b16a
--- /dev/null
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -0,0 +1,281 @@
+/*
+ * Glue Code for x86_64/AVX2 assembler optimized version of Serpent
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/simd.h>
+#include <crypto/serpent.h>
+#include <crypto/xts.h>
+#include <asm/crypto/glue_helper.h>
+#include <asm/crypto/serpent-avx.h>
+
+#define SERPENT_AVX2_PARALLEL_BLOCKS 16
+
+/* 16-way AVX2 parallel cipher functions */
+asmlinkage void serpent_ecb_enc_16way(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void serpent_ecb_dec_16way(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void serpent_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src);
+
+asmlinkage void serpent_ctr_16way(void *ctx, u128 *dst, const u128 *src,
+ le128 *iv);
+asmlinkage void serpent_xts_enc_16way(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+asmlinkage void serpent_xts_dec_16way(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+
+static int serpent_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen);
+}
+
+static const struct common_glue_ctx serpent_enc = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = 8,
+
+ .funcs = { {
+ .num_blocks = 16,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_16way) }
+ }, {
+ .num_blocks = 8,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
+ } }
+};
+
+static const struct common_glue_ctx serpent_ctr = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = 8,
+
+ .funcs = { {
+ .num_blocks = 16,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_16way) }
+ }, {
+ .num_blocks = 8,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) }
+ } }
+};
+
+static const struct common_glue_ctx serpent_enc_xts = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = 8,
+
+ .funcs = { {
+ .num_blocks = 16,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_16way) }
+ }, {
+ .num_blocks = 8,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) }
+ } }
+};
+
+static const struct common_glue_ctx serpent_dec = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = 8,
+
+ .funcs = { {
+ .num_blocks = 16,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_16way) }
+ }, {
+ .num_blocks = 8,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
+ } }
+};
+
+static const struct common_glue_ctx serpent_dec_cbc = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = 8,
+
+ .funcs = { {
+ .num_blocks = 16,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_16way) }
+ }, {
+ .num_blocks = 8,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
+ } }
+};
+
+static const struct common_glue_ctx serpent_dec_xts = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = 8,
+
+ .funcs = { {
+ .num_blocks = 16,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_16way) }
+ }, {
+ .num_blocks = 8,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) }
+ } }
+};
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&serpent_enc, req);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&serpent_dec, req);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__serpent_encrypt),
+ req);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return glue_ctr_req_128bit(&serpent_ctr, req);
+}
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&serpent_enc_xts, req,
+ XTS_TWEAK_CAST(__serpent_encrypt),
+ &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&serpent_dec_xts, req,
+ XTS_TWEAK_CAST(__serpent_encrypt),
+ &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static struct skcipher_alg serpent_algs[] = {
+ {
+ .base.cra_name = "__ecb(serpent)",
+ .base.cra_driver_name = "__ecb-serpent-avx2",
+ .base.cra_priority = 600,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "__cbc(serpent)",
+ .base.cra_driver_name = "__cbc-serpent-avx2",
+ .base.cra_priority = 600,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "__ctr(serpent)",
+ .base.cra_driver_name = "__ctr-serpent-avx2",
+ .base.cra_priority = 600,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .chunksize = SERPENT_BLOCK_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }, {
+ .base.cra_name = "__xts(serpent)",
+ .base.cra_driver_name = "__xts-serpent-avx2",
+ .base.cra_priority = 600,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_xts_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = 2 * SERPENT_MIN_KEY_SIZE,
+ .max_keysize = 2 * SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = xts_serpent_setkey,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
+ },
+};
+
+static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)];
+
+static int __init init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX2 instructions are not detected.\n");
+ return -ENODEV;
+ }
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(serpent_algs,
+ ARRAY_SIZE(serpent_algs),
+ serpent_simd_algs);
+}
+
+static void __exit fini(void)
+{
+ simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs),
+ serpent_simd_algs);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized");
+MODULE_ALIAS_CRYPTO("serpent");
+MODULE_ALIAS_CRYPTO("serpent-asm");
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
new file mode 100644
index 000000000..458567ecf
--- /dev/null
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -0,0 +1,326 @@
+/*
+ * Glue Code for AVX assembler versions of Serpent Cipher
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/simd.h>
+#include <crypto/serpent.h>
+#include <crypto/xts.h>
+#include <asm/crypto/glue_helper.h>
+#include <asm/crypto/serpent-avx.h>
+
+/* 8-way parallel cipher functions */
+asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(serpent_ecb_enc_8way_avx);
+
+asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(serpent_ecb_dec_8way_avx);
+
+asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx);
+
+asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(serpent_ctr_8way_avx);
+
+asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(serpent_xts_enc_8way_avx);
+
+asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(serpent_xts_dec_8way_avx);
+
+void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ be128 ctrblk;
+
+ le128_to_be128(&ctrblk, iv);
+ le128_inc(iv);
+
+ __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+ u128_xor(dst, src, (u128 *)&ctrblk);
+}
+EXPORT_SYMBOL_GPL(__serpent_crypt_ctr);
+
+void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+ GLUE_FUNC_CAST(__serpent_encrypt));
+}
+EXPORT_SYMBOL_GPL(serpent_xts_enc);
+
+void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+ GLUE_FUNC_CAST(__serpent_decrypt));
+}
+EXPORT_SYMBOL_GPL(serpent_xts_dec);
+
+static int serpent_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen);
+}
+
+int xts_serpent_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int err;
+
+ err = xts_verify_key(tfm, key, keylen);
+ if (err)
+ return err;
+
+ /* first half of xts-key is for crypt */
+ err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);
+ if (err)
+ return err;
+
+ /* second half of xts-key is for tweak */
+ return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
+}
+EXPORT_SYMBOL_GPL(xts_serpent_setkey);
+
+static const struct common_glue_ctx serpent_enc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
+ } }
+};
+
+static const struct common_glue_ctx serpent_ctr = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) }
+ } }
+};
+
+static const struct common_glue_ctx serpent_enc_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) }
+ } }
+};
+
+static const struct common_glue_ctx serpent_dec = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
+ } }
+};
+
+static const struct common_glue_ctx serpent_dec_cbc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
+ } }
+};
+
+static const struct common_glue_ctx serpent_dec_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) }
+ } }
+};
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&serpent_enc, req);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&serpent_dec, req);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__serpent_encrypt),
+ req);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return glue_ctr_req_128bit(&serpent_ctr, req);
+}
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&serpent_enc_xts, req,
+ XTS_TWEAK_CAST(__serpent_encrypt),
+ &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&serpent_dec_xts, req,
+ XTS_TWEAK_CAST(__serpent_encrypt),
+ &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static struct skcipher_alg serpent_algs[] = {
+ {
+ .base.cra_name = "__ecb(serpent)",
+ .base.cra_driver_name = "__ecb-serpent-avx",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "__cbc(serpent)",
+ .base.cra_driver_name = "__cbc-serpent-avx",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "__ctr(serpent)",
+ .base.cra_driver_name = "__ctr-serpent-avx",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .chunksize = SERPENT_BLOCK_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }, {
+ .base.cra_name = "__xts(serpent)",
+ .base.cra_driver_name = "__xts-serpent-avx",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_xts_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = 2 * SERPENT_MIN_KEY_SIZE,
+ .max_keysize = 2 * SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = xts_serpent_setkey,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
+ },
+};
+
+static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)];
+
+static int __init serpent_init(void)
+{
+ const char *feature_name;
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(serpent_algs,
+ ARRAY_SIZE(serpent_algs),
+ serpent_simd_algs);
+}
+
+static void __exit serpent_exit(void)
+{
+ simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs),
+ serpent_simd_algs);
+}
+
+module_init(serpent_init);
+module_exit(serpent_exit);
+
+MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("serpent");
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
new file mode 100644
index 000000000..3dafe1375
--- /dev/null
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -0,0 +1,240 @@
+/*
+ * Glue Code for SSE2 assembler versions of Serpent Cipher
+ *
+ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Glue code based on aesni-intel_glue.c by:
+ * Copyright (C) 2008, Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/b128ops.h>
+#include <crypto/internal/simd.h>
+#include <crypto/serpent.h>
+#include <asm/crypto/serpent-sse2.h>
+#include <asm/crypto/glue_helper.h>
+
+static int serpent_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen);
+}
+
+static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
+{
+ u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
+ unsigned int j;
+
+ for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
+ ivs[j] = src[j];
+
+ serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
+
+ for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
+ u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
+}
+
+static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ be128 ctrblk;
+
+ le128_to_be128(&ctrblk, iv);
+ le128_inc(iv);
+
+ __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+ u128_xor(dst, src, (u128 *)&ctrblk);
+}
+
+static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
+ le128 *iv)
+{
+ be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
+ unsigned int i;
+
+ for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
+ if (dst != src)
+ dst[i] = src[i];
+
+ le128_to_be128(&ctrblks[i], iv);
+ le128_inc(iv);
+ }
+
+ serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
+}
+
+static const struct common_glue_ctx serpent_enc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
+ } }
+};
+
+static const struct common_glue_ctx serpent_ctr = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
+ } }
+};
+
+static const struct common_glue_ctx serpent_dec = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
+ } }
+};
+
+static const struct common_glue_ctx serpent_dec_cbc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = SERPENT_PARALLEL_BLOCKS,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
+ } }
+};
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&serpent_enc, req);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&serpent_dec, req);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__serpent_encrypt),
+ req);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return glue_ctr_req_128bit(&serpent_ctr, req);
+}
+
+static struct skcipher_alg serpent_algs[] = {
+ {
+ .base.cra_name = "__ecb(serpent)",
+ .base.cra_driver_name = "__ecb-serpent-sse2",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "__cbc(serpent)",
+ .base.cra_driver_name = "__cbc-serpent-sse2",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = SERPENT_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "__ctr(serpent)",
+ .base.cra_driver_name = "__ctr-serpent-sse2",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct serpent_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .chunksize = SERPENT_BLOCK_SIZE,
+ .setkey = serpent_setkey_skcipher,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ },
+};
+
+static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)];
+
+static int __init serpent_sse2_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XMM2)) {
+ printk(KERN_INFO "SSE2 instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(serpent_algs,
+ ARRAY_SIZE(serpent_algs),
+ serpent_simd_algs);
+}
+
+static void __exit serpent_sse2_exit(void)
+{
+ simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs),
+ serpent_simd_algs);
+}
+
+module_init(serpent_sse2_init);
+module_exit(serpent_sse2_exit);
+
+MODULE_DESCRIPTION("Serpent Cipher Algorithm, SSE2 optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("serpent");
diff --git a/arch/x86/crypto/sha1-mb/Makefile b/arch/x86/crypto/sha1-mb/Makefile
new file mode 100644
index 000000000..815ded3ba
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Arch-specific CryptoAPI modules.
+#
+
+OBJECT_FILES_NON_STANDARD := y
+
+avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
+ $(comma)4)$(comma)%ymm2,yes,no)
+ifeq ($(avx2_supported),yes)
+ obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb.o
+ sha1-mb-y := sha1_mb.o sha1_mb_mgr_flush_avx2.o \
+ sha1_mb_mgr_init_avx2.o sha1_mb_mgr_submit_avx2.o sha1_x8_avx2.o
+endif
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb.c b/arch/x86/crypto/sha1-mb/sha1_mb.c
new file mode 100644
index 000000000..b93805664
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb.c
@@ -0,0 +1,1011 @@
+/*
+ * Multi buffer SHA1 algorithm Glue Code
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/sha.h>
+#include <crypto/mcryptd.h>
+#include <crypto/crypto_wq.h>
+#include <asm/byteorder.h>
+#include <linux/hardirq.h>
+#include <asm/fpu/api.h>
+#include "sha1_mb_ctx.h"
+
+#define FLUSH_INTERVAL 1000 /* in usec */
+
+static struct mcryptd_alg_state sha1_mb_alg_state;
+
+struct sha1_mb_ctx {
+ struct mcryptd_ahash *mcryptd_tfm;
+};
+
+static inline struct mcryptd_hash_request_ctx
+ *cast_hash_to_mcryptd_ctx(struct sha1_hash_ctx *hash_ctx)
+{
+ struct ahash_request *areq;
+
+ areq = container_of((void *) hash_ctx, struct ahash_request, __ctx);
+ return container_of(areq, struct mcryptd_hash_request_ctx, areq);
+}
+
+static inline struct ahash_request
+ *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx)
+{
+ return container_of((void *) ctx, struct ahash_request, __ctx);
+}
+
+static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx,
+ struct ahash_request *areq)
+{
+ rctx->flag = HASH_UPDATE;
+}
+
+static asmlinkage void (*sha1_job_mgr_init)(struct sha1_mb_mgr *state);
+static asmlinkage struct job_sha1* (*sha1_job_mgr_submit)
+ (struct sha1_mb_mgr *state, struct job_sha1 *job);
+static asmlinkage struct job_sha1* (*sha1_job_mgr_flush)
+ (struct sha1_mb_mgr *state);
+static asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job)
+ (struct sha1_mb_mgr *state);
+
+static inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2],
+ uint64_t total_len)
+{
+ uint32_t i = total_len & (SHA1_BLOCK_SIZE - 1);
+
+ memset(&padblock[i], 0, SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ i += ((SHA1_BLOCK_SIZE - 1) &
+ (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1)))
+ + 1 + SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) &padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) &padblock[i - 8]) = cpu_to_be64(total_len << 3);
+
+ /* Number of extra blocks to hash */
+ return i >> SHA1_LOG2_BLOCK_SIZE;
+}
+
+static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr,
+ struct sha1_hash_ctx *ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ /* Clear PROCESSING bit */
+ ctx->status = HASH_CTX_STS_COMPLETE;
+ return ctx;
+ }
+
+ /*
+ * If the extra blocks are empty, begin hashing what remains
+ * in the user's buffer.
+ */
+ if (ctx->partial_block_buffer_length == 0 &&
+ ctx->incoming_buffer_length) {
+
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+ uint32_t copy_len;
+
+ /*
+ * Only entire blocks can be hashed.
+ * Copy remainder to extra blocks buffer.
+ */
+ copy_len = len & (SHA1_BLOCK_SIZE-1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy(ctx->partial_block_buffer,
+ ((const char *) buffer + len),
+ copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ /* len should be a multiple of the block size now */
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ /* Set len to the number of blocks to be hashed */
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (struct sha1_hash_ctx *)sha1_job_mgr_submit(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+
+ /*
+ * If the extra blocks are not empty, then we are
+ * either on the last block(s) or we need more
+ * user input before continuing.
+ */
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks =
+ sha1_pad(buf, ctx->total_length);
+
+ ctx->status = (HASH_CTX_STS_PROCESSING |
+ HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (struct sha1_hash_ctx *)
+ sha1_job_mgr_submit(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static struct sha1_hash_ctx
+ *sha1_ctx_mgr_get_comp_ctx(struct sha1_ctx_mgr *mgr)
+{
+ /*
+ * If get_comp_job returns NULL, there are no jobs complete.
+ * If get_comp_job returns a job, verify that it is safe to return to
+ * the user.
+ * If it is not ready, resubmit the job to finish processing.
+ * If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ * Otherwise, all jobs currently being managed by the hash_ctx_mgr
+ * still need processing.
+ */
+ struct sha1_hash_ctx *ctx;
+
+ ctx = (struct sha1_hash_ctx *) sha1_job_mgr_get_comp_job(&mgr->mgr);
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static void sha1_ctx_mgr_init(struct sha1_ctx_mgr *mgr)
+{
+ sha1_job_mgr_init(&mgr->mgr);
+}
+
+static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr,
+ struct sha1_hash_ctx *ctx,
+ const void *buffer,
+ uint32_t len,
+ int flags)
+{
+ if (flags & ~(HASH_UPDATE | HASH_LAST)) {
+ /* User should not pass anything other than UPDATE or LAST */
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ /* Cannot submit to a currently processing job. */
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ /* Cannot update a finished job. */
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ /*
+ * If we made it here, there were no errors during this call to
+ * submit
+ */
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ /* Store buffer ptr info from user */
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ /*
+ * Store the user's request flags and mark this ctx as currently
+ * being processed.
+ */
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ /* Advance byte counter */
+ ctx->total_length += len;
+
+ /*
+ * If there is anything currently buffered in the extra blocks,
+ * append to it until it contains a whole block.
+ * Or if the user's buffer contains less than a whole block,
+ * append as much as possible to the extra block.
+ */
+ if (ctx->partial_block_buffer_length || len < SHA1_BLOCK_SIZE) {
+ /*
+ * Compute how many bytes to copy from user buffer into
+ * extra block
+ */
+ uint32_t copy_len = SHA1_BLOCK_SIZE -
+ ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ /* Copy and update relevant pointers and counters */
+ memcpy(&ctx->partial_block_buffer[ctx->partial_block_buffer_length],
+ buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)
+ ((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+
+ /*
+ * The extra block should never contain more than 1 block
+ * here
+ */
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ /*
+ * If the extra block buffer contains exactly 1 block, it can
+ * be hashed.
+ */
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (struct sha1_hash_ctx *)
+ sha1_job_mgr_submit(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static struct sha1_hash_ctx *sha1_ctx_mgr_flush(struct sha1_ctx_mgr *mgr)
+{
+ struct sha1_hash_ctx *ctx;
+
+ while (1) {
+ ctx = (struct sha1_hash_ctx *) sha1_job_mgr_flush(&mgr->mgr);
+
+ /* If flush returned 0, there are no more jobs in flight. */
+ if (!ctx)
+ return NULL;
+
+ /*
+ * If flush returned a job, resubmit the job to finish
+ * processing.
+ */
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ /*
+ * If sha1_ctx_mgr_resubmit returned a job, it is ready to be
+ * returned. Otherwise, all jobs currently being managed by the
+ * sha1_ctx_mgr still need processing. Loop.
+ */
+ if (ctx)
+ return ctx;
+ }
+}
+
+static int sha1_mb_init(struct ahash_request *areq)
+{
+ struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
+
+ hash_ctx_init(sctx);
+ sctx->job.result_digest[0] = SHA1_H0;
+ sctx->job.result_digest[1] = SHA1_H1;
+ sctx->job.result_digest[2] = SHA1_H2;
+ sctx->job.result_digest[3] = SHA1_H3;
+ sctx->job.result_digest[4] = SHA1_H4;
+ sctx->total_length = 0;
+ sctx->partial_block_buffer_length = 0;
+ sctx->status = HASH_CTX_STS_IDLE;
+
+ return 0;
+}
+
+static int sha1_mb_set_results(struct mcryptd_hash_request_ctx *rctx)
+{
+ int i;
+ struct sha1_hash_ctx *sctx = ahash_request_ctx(&rctx->areq);
+ __be32 *dst = (__be32 *) rctx->out;
+
+ for (i = 0; i < 5; ++i)
+ dst[i] = cpu_to_be32(sctx->job.result_digest[i]);
+
+ return 0;
+}
+
+static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
+ struct mcryptd_alg_cstate *cstate, bool flush)
+{
+ int flag = HASH_UPDATE;
+ int nbytes, err = 0;
+ struct mcryptd_hash_request_ctx *rctx = *ret_rctx;
+ struct sha1_hash_ctx *sha_ctx;
+
+ /* more work ? */
+ while (!(rctx->flag & HASH_DONE)) {
+ nbytes = crypto_ahash_walk_done(&rctx->walk, 0);
+ if (nbytes < 0) {
+ err = nbytes;
+ goto out;
+ }
+ /* check if the walk is done */
+ if (crypto_ahash_walk_last(&rctx->walk)) {
+ rctx->flag |= HASH_DONE;
+ if (rctx->flag & HASH_FINAL)
+ flag |= HASH_LAST;
+
+ }
+ sha_ctx = (struct sha1_hash_ctx *)
+ ahash_request_ctx(&rctx->areq);
+ kernel_fpu_begin();
+ sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx,
+ rctx->walk.data, nbytes, flag);
+ if (!sha_ctx) {
+ if (flush)
+ sha_ctx = sha1_ctx_mgr_flush(cstate->mgr);
+ }
+ kernel_fpu_end();
+ if (sha_ctx)
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ else {
+ rctx = NULL;
+ goto out;
+ }
+ }
+
+ /* copy the results */
+ if (rctx->flag & HASH_FINAL)
+ sha1_mb_set_results(rctx);
+
+out:
+ *ret_rctx = rctx;
+ return err;
+}
+
+static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
+ struct mcryptd_alg_cstate *cstate,
+ int err)
+{
+ struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+ struct sha1_hash_ctx *sha_ctx;
+ struct mcryptd_hash_request_ctx *req_ctx;
+ int ret;
+
+ /* remove from work list */
+ spin_lock(&cstate->work_lock);
+ list_del(&rctx->waiter);
+ spin_unlock(&cstate->work_lock);
+
+ if (irqs_disabled())
+ rctx->complete(&req->base, err);
+ else {
+ local_bh_disable();
+ rctx->complete(&req->base, err);
+ local_bh_enable();
+ }
+
+ /* check to see if there are other jobs that are done */
+ sha_ctx = sha1_ctx_mgr_get_comp_ctx(cstate->mgr);
+ while (sha_ctx) {
+ req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&req_ctx, cstate, false);
+ if (req_ctx) {
+ spin_lock(&cstate->work_lock);
+ list_del(&req_ctx->waiter);
+ spin_unlock(&cstate->work_lock);
+
+ req = cast_mcryptd_ctx_to_req(req_ctx);
+ if (irqs_disabled())
+ req_ctx->complete(&req->base, ret);
+ else {
+ local_bh_disable();
+ req_ctx->complete(&req->base, ret);
+ local_bh_enable();
+ }
+ }
+ sha_ctx = sha1_ctx_mgr_get_comp_ctx(cstate->mgr);
+ }
+
+ return 0;
+}
+
+static void sha1_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
+ struct mcryptd_alg_cstate *cstate)
+{
+ unsigned long next_flush;
+ unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL);
+
+ /* initialize tag */
+ rctx->tag.arrival = jiffies; /* tag the arrival time */
+ rctx->tag.seq_num = cstate->next_seq_num++;
+ next_flush = rctx->tag.arrival + delay;
+ rctx->tag.expire = next_flush;
+
+ spin_lock(&cstate->work_lock);
+ list_add_tail(&rctx->waiter, &cstate->work_list);
+ spin_unlock(&cstate->work_lock);
+
+ mcryptd_arm_flusher(cstate, delay);
+}
+
+static int sha1_mb_update(struct ahash_request *areq)
+{
+ struct mcryptd_hash_request_ctx *rctx =
+ container_of(areq, struct mcryptd_hash_request_ctx, areq);
+ struct mcryptd_alg_cstate *cstate =
+ this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
+
+ struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+ struct sha1_hash_ctx *sha_ctx;
+ int ret = 0, nbytes;
+
+
+ /* sanity check */
+ if (rctx->tag.cpu != smp_processor_id()) {
+ pr_err("mcryptd error: cpu clash\n");
+ goto done;
+ }
+
+ /* need to init context */
+ req_ctx_init(rctx, areq);
+
+ nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+ if (nbytes < 0) {
+ ret = nbytes;
+ goto done;
+ }
+
+ if (crypto_ahash_walk_last(&rctx->walk))
+ rctx->flag |= HASH_DONE;
+
+ /* submit */
+ sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
+ sha1_mb_add_list(rctx, cstate);
+ kernel_fpu_begin();
+ sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+ nbytes, HASH_UPDATE);
+ kernel_fpu_end();
+
+ /* check if anything is returned */
+ if (!sha_ctx)
+ return -EINPROGRESS;
+
+ if (sha_ctx->error) {
+ ret = sha_ctx->error;
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ goto done;
+ }
+
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&rctx, cstate, false);
+
+ if (!rctx)
+ return -EINPROGRESS;
+done:
+ sha_complete_job(rctx, cstate, ret);
+ return ret;
+}
+
+static int sha1_mb_finup(struct ahash_request *areq)
+{
+ struct mcryptd_hash_request_ctx *rctx =
+ container_of(areq, struct mcryptd_hash_request_ctx, areq);
+ struct mcryptd_alg_cstate *cstate =
+ this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
+
+ struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+ struct sha1_hash_ctx *sha_ctx;
+ int ret = 0, flag = HASH_UPDATE, nbytes;
+
+ /* sanity check */
+ if (rctx->tag.cpu != smp_processor_id()) {
+ pr_err("mcryptd error: cpu clash\n");
+ goto done;
+ }
+
+ /* need to init context */
+ req_ctx_init(rctx, areq);
+
+ nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+ if (nbytes < 0) {
+ ret = nbytes;
+ goto done;
+ }
+
+ if (crypto_ahash_walk_last(&rctx->walk)) {
+ rctx->flag |= HASH_DONE;
+ flag = HASH_LAST;
+ }
+
+ /* submit */
+ rctx->flag |= HASH_FINAL;
+ sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
+ sha1_mb_add_list(rctx, cstate);
+
+ kernel_fpu_begin();
+ sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+ nbytes, flag);
+ kernel_fpu_end();
+
+ /* check if anything is returned */
+ if (!sha_ctx)
+ return -EINPROGRESS;
+
+ if (sha_ctx->error) {
+ ret = sha_ctx->error;
+ goto done;
+ }
+
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&rctx, cstate, false);
+ if (!rctx)
+ return -EINPROGRESS;
+done:
+ sha_complete_job(rctx, cstate, ret);
+ return ret;
+}
+
+static int sha1_mb_final(struct ahash_request *areq)
+{
+ struct mcryptd_hash_request_ctx *rctx =
+ container_of(areq, struct mcryptd_hash_request_ctx, areq);
+ struct mcryptd_alg_cstate *cstate =
+ this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
+
+ struct sha1_hash_ctx *sha_ctx;
+ int ret = 0;
+ u8 data;
+
+ /* sanity check */
+ if (rctx->tag.cpu != smp_processor_id()) {
+ pr_err("mcryptd error: cpu clash\n");
+ goto done;
+ }
+
+ /* need to init context */
+ req_ctx_init(rctx, areq);
+
+ rctx->flag |= HASH_DONE | HASH_FINAL;
+
+ sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
+ /* flag HASH_FINAL and 0 data size */
+ sha1_mb_add_list(rctx, cstate);
+ kernel_fpu_begin();
+ sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0,
+ HASH_LAST);
+ kernel_fpu_end();
+
+ /* check if anything is returned */
+ if (!sha_ctx)
+ return -EINPROGRESS;
+
+ if (sha_ctx->error) {
+ ret = sha_ctx->error;
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ goto done;
+ }
+
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&rctx, cstate, false);
+ if (!rctx)
+ return -EINPROGRESS;
+done:
+ sha_complete_job(rctx, cstate, ret);
+ return ret;
+}
+
+static int sha1_mb_export(struct ahash_request *areq, void *out)
+{
+ struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
+
+ memcpy(out, sctx, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha1_mb_import(struct ahash_request *areq, const void *in)
+{
+ struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
+
+ memcpy(sctx, in, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm)
+{
+ struct mcryptd_ahash *mcryptd_tfm;
+ struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+ struct mcryptd_hash_ctx *mctx;
+
+ mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb",
+ CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL);
+ if (IS_ERR(mcryptd_tfm))
+ return PTR_ERR(mcryptd_tfm);
+ mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
+ mctx->alg_state = &sha1_mb_alg_state;
+ ctx->mcryptd_tfm = mcryptd_tfm;
+ crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+ sizeof(struct ahash_request) +
+ crypto_ahash_reqsize(&mcryptd_tfm->base));
+
+ return 0;
+}
+
+static void sha1_mb_async_exit_tfm(struct crypto_tfm *tfm)
+{
+ struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static int sha1_mb_areq_init_tfm(struct crypto_tfm *tfm)
+{
+ crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+ sizeof(struct ahash_request) +
+ sizeof(struct sha1_hash_ctx));
+
+ return 0;
+}
+
+static void sha1_mb_areq_exit_tfm(struct crypto_tfm *tfm)
+{
+ struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static struct ahash_alg sha1_mb_areq_alg = {
+ .init = sha1_mb_init,
+ .update = sha1_mb_update,
+ .final = sha1_mb_final,
+ .finup = sha1_mb_finup,
+ .export = sha1_mb_export,
+ .import = sha1_mb_import,
+ .halg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .statesize = sizeof(struct sha1_hash_ctx),
+ .base = {
+ .cra_name = "__sha1-mb",
+ .cra_driver_name = "__intel_sha1-mb",
+ .cra_priority = 100,
+ /*
+ * use ASYNC flag as some buffers in multi-buffer
+ * algo may not have completed before hashing thread
+ * sleep
+ */
+ .cra_flags = CRYPTO_ALG_ASYNC |
+ CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT
+ (sha1_mb_areq_alg.halg.base.cra_list),
+ .cra_init = sha1_mb_areq_init_tfm,
+ .cra_exit = sha1_mb_areq_exit_tfm,
+ .cra_ctxsize = sizeof(struct sha1_hash_ctx),
+ }
+ }
+};
+
+static int sha1_mb_async_init(struct ahash_request *req)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_init(mcryptd_req);
+}
+
+static int sha1_mb_async_update(struct ahash_request *req)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_update(mcryptd_req);
+}
+
+static int sha1_mb_async_finup(struct ahash_request *req)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_finup(mcryptd_req);
+}
+
+static int sha1_mb_async_final(struct ahash_request *req)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_final(mcryptd_req);
+}
+
+static int sha1_mb_async_digest(struct ahash_request *req)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_digest(mcryptd_req);
+}
+
+static int sha1_mb_async_export(struct ahash_request *req, void *out)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_export(mcryptd_req, out);
+}
+
+static int sha1_mb_async_import(struct ahash_request *req, const void *in)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+ struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm);
+ struct mcryptd_hash_request_ctx *rctx;
+ struct ahash_request *areq;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ rctx = ahash_request_ctx(mcryptd_req);
+ areq = &rctx->areq;
+
+ ahash_request_set_tfm(areq, child);
+ ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP,
+ rctx->complete, req);
+
+ return crypto_ahash_import(mcryptd_req, in);
+}
+
+static struct ahash_alg sha1_mb_async_alg = {
+ .init = sha1_mb_async_init,
+ .update = sha1_mb_async_update,
+ .final = sha1_mb_async_final,
+ .finup = sha1_mb_async_finup,
+ .digest = sha1_mb_async_digest,
+ .export = sha1_mb_async_export,
+ .import = sha1_mb_async_import,
+ .halg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .statesize = sizeof(struct sha1_hash_ctx),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name = "sha1_mb",
+ /*
+ * Low priority, since with few concurrent hash requests
+ * this is extremely slow due to the flush delay. Users
+ * whose workloads would benefit from this can request
+ * it explicitly by driver name, or can increase its
+ * priority at runtime using NETLINK_CRYPTO.
+ */
+ .cra_priority = 50,
+ .cra_flags = CRYPTO_ALG_ASYNC,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(sha1_mb_async_alg.halg.base.cra_list),
+ .cra_init = sha1_mb_async_init_tfm,
+ .cra_exit = sha1_mb_async_exit_tfm,
+ .cra_ctxsize = sizeof(struct sha1_mb_ctx),
+ .cra_alignmask = 0,
+ },
+ },
+};
+
+static unsigned long sha1_mb_flusher(struct mcryptd_alg_cstate *cstate)
+{
+ struct mcryptd_hash_request_ctx *rctx;
+ unsigned long cur_time;
+ unsigned long next_flush = 0;
+ struct sha1_hash_ctx *sha_ctx;
+
+
+ cur_time = jiffies;
+
+ while (!list_empty(&cstate->work_list)) {
+ rctx = list_entry(cstate->work_list.next,
+ struct mcryptd_hash_request_ctx, waiter);
+ if (time_before(cur_time, rctx->tag.expire))
+ break;
+ kernel_fpu_begin();
+ sha_ctx = (struct sha1_hash_ctx *)
+ sha1_ctx_mgr_flush(cstate->mgr);
+ kernel_fpu_end();
+ if (!sha_ctx) {
+ pr_err("sha1_mb error: nothing got flushed for non-empty list\n");
+ break;
+ }
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ sha_finish_walk(&rctx, cstate, true);
+ sha_complete_job(rctx, cstate, 0);
+ }
+
+ if (!list_empty(&cstate->work_list)) {
+ rctx = list_entry(cstate->work_list.next,
+ struct mcryptd_hash_request_ctx, waiter);
+ /* get the hash context and then flush time */
+ next_flush = rctx->tag.expire;
+ mcryptd_arm_flusher(cstate, get_delay(next_flush));
+ }
+ return next_flush;
+}
+
+static int __init sha1_mb_mod_init(void)
+{
+
+ int cpu;
+ int err;
+ struct mcryptd_alg_cstate *cpu_state;
+
+ /* check for dependent cpu features */
+ if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_BMI2))
+ return -ENODEV;
+
+ /* initialize multibuffer structures */
+ sha1_mb_alg_state.alg_cstate = alloc_percpu(struct mcryptd_alg_cstate);
+
+ sha1_job_mgr_init = sha1_mb_mgr_init_avx2;
+ sha1_job_mgr_submit = sha1_mb_mgr_submit_avx2;
+ sha1_job_mgr_flush = sha1_mb_mgr_flush_avx2;
+ sha1_job_mgr_get_comp_job = sha1_mb_mgr_get_comp_job_avx2;
+
+ if (!sha1_mb_alg_state.alg_cstate)
+ return -ENOMEM;
+ for_each_possible_cpu(cpu) {
+ cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
+ cpu_state->next_flush = 0;
+ cpu_state->next_seq_num = 0;
+ cpu_state->flusher_engaged = false;
+ INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher);
+ cpu_state->cpu = cpu;
+ cpu_state->alg_state = &sha1_mb_alg_state;
+ cpu_state->mgr = kzalloc(sizeof(struct sha1_ctx_mgr),
+ GFP_KERNEL);
+ if (!cpu_state->mgr)
+ goto err2;
+ sha1_ctx_mgr_init(cpu_state->mgr);
+ INIT_LIST_HEAD(&cpu_state->work_list);
+ spin_lock_init(&cpu_state->work_lock);
+ }
+ sha1_mb_alg_state.flusher = &sha1_mb_flusher;
+
+ err = crypto_register_ahash(&sha1_mb_areq_alg);
+ if (err)
+ goto err2;
+ err = crypto_register_ahash(&sha1_mb_async_alg);
+ if (err)
+ goto err1;
+
+
+ return 0;
+err1:
+ crypto_unregister_ahash(&sha1_mb_areq_alg);
+err2:
+ for_each_possible_cpu(cpu) {
+ cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
+ kfree(cpu_state->mgr);
+ }
+ free_percpu(sha1_mb_alg_state.alg_cstate);
+ return -ENODEV;
+}
+
+static void __exit sha1_mb_mod_fini(void)
+{
+ int cpu;
+ struct mcryptd_alg_cstate *cpu_state;
+
+ crypto_unregister_ahash(&sha1_mb_async_alg);
+ crypto_unregister_ahash(&sha1_mb_areq_alg);
+ for_each_possible_cpu(cpu) {
+ cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
+ kfree(cpu_state->mgr);
+ }
+ free_percpu(sha1_mb_alg_state.alg_cstate);
+}
+
+module_init(sha1_mb_mod_init);
+module_exit(sha1_mb_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, multi buffer accelerated");
+
+MODULE_ALIAS_CRYPTO("sha1");
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h
new file mode 100644
index 000000000..9454bd16f
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h
@@ -0,0 +1,134 @@
+/*
+ * Header file for multi buffer SHA context
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _SHA_MB_CTX_INTERNAL_H
+#define _SHA_MB_CTX_INTERNAL_H
+
+#include "sha1_mb_mgr.h"
+
+#define HASH_UPDATE 0x00
+#define HASH_LAST 0x01
+#define HASH_DONE 0x02
+#define HASH_FINAL 0x04
+
+#define HASH_CTX_STS_IDLE 0x00
+#define HASH_CTX_STS_PROCESSING 0x01
+#define HASH_CTX_STS_LAST 0x02
+#define HASH_CTX_STS_COMPLETE 0x04
+
+enum hash_ctx_error {
+ HASH_CTX_ERROR_NONE = 0,
+ HASH_CTX_ERROR_INVALID_FLAGS = -1,
+ HASH_CTX_ERROR_ALREADY_PROCESSING = -2,
+ HASH_CTX_ERROR_ALREADY_COMPLETED = -3,
+
+#ifdef HASH_CTX_DEBUG
+ HASH_CTX_ERROR_DEBUG_DIGEST_MISMATCH = -4,
+#endif
+};
+
+
+#define hash_ctx_user_data(ctx) ((ctx)->user_data)
+#define hash_ctx_digest(ctx) ((ctx)->job.result_digest)
+#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING)
+#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE)
+#define hash_ctx_status(ctx) ((ctx)->status)
+#define hash_ctx_error(ctx) ((ctx)->error)
+#define hash_ctx_init(ctx) \
+ do { \
+ (ctx)->error = HASH_CTX_ERROR_NONE; \
+ (ctx)->status = HASH_CTX_STS_COMPLETE; \
+ } while (0)
+
+
+/* Hash Constants and Typedefs */
+#define SHA1_DIGEST_LENGTH 5
+#define SHA1_LOG2_BLOCK_SIZE 6
+
+#define SHA1_PADLENGTHFIELD_SIZE 8
+
+#ifdef SHA_MB_DEBUG
+#define assert(expr) \
+do { \
+ if (unlikely(!(expr))) { \
+ printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
+ #expr, __FILE__, __func__, __LINE__); \
+ } \
+} while (0)
+#else
+#define assert(expr) do {} while (0)
+#endif
+
+struct sha1_ctx_mgr {
+ struct sha1_mb_mgr mgr;
+};
+
+/* typedef struct sha1_ctx_mgr sha1_ctx_mgr; */
+
+struct sha1_hash_ctx {
+ /* Must be at struct offset 0 */
+ struct job_sha1 job;
+ /* status flag */
+ int status;
+ /* error flag */
+ int error;
+
+ uint64_t total_length;
+ const void *incoming_buffer;
+ uint32_t incoming_buffer_length;
+ uint8_t partial_block_buffer[SHA1_BLOCK_SIZE * 2];
+ uint32_t partial_block_buffer_length;
+ void *user_data;
+};
+
+#endif
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h b/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h
new file mode 100644
index 000000000..08ad1a9ac
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h
@@ -0,0 +1,110 @@
+/*
+ * Header file for multi buffer SHA1 algorithm manager
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __SHA_MB_MGR_H
+#define __SHA_MB_MGR_H
+
+
+#include <linux/types.h>
+
+#define NUM_SHA1_DIGEST_WORDS 5
+
+enum job_sts { STS_UNKNOWN = 0,
+ STS_BEING_PROCESSED = 1,
+ STS_COMPLETED = 2,
+ STS_INTERNAL_ERROR = 3,
+ STS_ERROR = 4
+};
+
+struct job_sha1 {
+ u8 *buffer;
+ u32 len;
+ u32 result_digest[NUM_SHA1_DIGEST_WORDS] __aligned(32);
+ enum job_sts status;
+ void *user_data;
+};
+
+/* SHA1 out-of-order scheduler */
+
+/* typedef uint32_t sha1_digest_array[5][8]; */
+
+struct sha1_args_x8 {
+ uint32_t digest[5][8];
+ uint8_t *data_ptr[8];
+};
+
+struct sha1_lane_data {
+ struct job_sha1 *job_in_lane;
+};
+
+struct sha1_mb_mgr {
+ struct sha1_args_x8 args;
+
+ uint32_t lens[8];
+
+ /* each byte is index (0...7) of unused lanes */
+ uint64_t unused_lanes;
+ /* byte 4 is set to FF as a flag */
+ struct sha1_lane_data ldata[8];
+};
+
+
+#define SHA1_MB_MGR_NUM_LANES_AVX2 8
+
+void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state);
+struct job_sha1 *sha1_mb_mgr_submit_avx2(struct sha1_mb_mgr *state,
+ struct job_sha1 *job);
+struct job_sha1 *sha1_mb_mgr_flush_avx2(struct sha1_mb_mgr *state);
+struct job_sha1 *sha1_mb_mgr_get_comp_job_avx2(struct sha1_mb_mgr *state);
+
+#endif
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S
new file mode 100644
index 000000000..86688c6e7
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S
@@ -0,0 +1,287 @@
+/*
+ * Header file for multi buffer SHA1 algorithm data structure
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+# Macros for defining data structures
+
+# Usage example
+
+#START_FIELDS # JOB_AES
+### name size align
+#FIELD _plaintext, 8, 8 # pointer to plaintext
+#FIELD _ciphertext, 8, 8 # pointer to ciphertext
+#FIELD _IV, 16, 8 # IV
+#FIELD _keys, 8, 8 # pointer to keys
+#FIELD _len, 4, 4 # length in bytes
+#FIELD _status, 4, 4 # status enumeration
+#FIELD _user_data, 8, 8 # pointer to user data
+#UNION _union, size1, align1, \
+# size2, align2, \
+# size3, align3, \
+# ...
+#END_FIELDS
+#%assign _JOB_AES_size _FIELD_OFFSET
+#%assign _JOB_AES_align _STRUCT_ALIGN
+
+#########################################################################
+
+# Alternate "struc-like" syntax:
+# STRUCT job_aes2
+# RES_Q .plaintext, 1
+# RES_Q .ciphertext, 1
+# RES_DQ .IV, 1
+# RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN
+# RES_U .union, size1, align1, \
+# size2, align2, \
+# ...
+# ENDSTRUCT
+# # Following only needed if nesting
+# %assign job_aes2_size _FIELD_OFFSET
+# %assign job_aes2_align _STRUCT_ALIGN
+#
+# RES_* macros take a name, a count and an optional alignment.
+# The count in in terms of the base size of the macro, and the
+# default alignment is the base size.
+# The macros are:
+# Macro Base size
+# RES_B 1
+# RES_W 2
+# RES_D 4
+# RES_Q 8
+# RES_DQ 16
+# RES_Y 32
+# RES_Z 64
+#
+# RES_U defines a union. It's arguments are a name and two or more
+# pairs of "size, alignment"
+#
+# The two assigns are only needed if this structure is being nested
+# within another. Even if the assigns are not done, one can still use
+# STRUCT_NAME_size as the size of the structure.
+#
+# Note that for nesting, you still need to assign to STRUCT_NAME_size.
+#
+# The differences between this and using "struc" directly are that each
+# type is implicitly aligned to its natural length (although this can be
+# over-ridden with an explicit third parameter), and that the structure
+# is padded at the end to its overall alignment.
+#
+
+#########################################################################
+
+#ifndef _SHA1_MB_MGR_DATASTRUCT_ASM_
+#define _SHA1_MB_MGR_DATASTRUCT_ASM_
+
+## START_FIELDS
+.macro START_FIELDS
+ _FIELD_OFFSET = 0
+ _STRUCT_ALIGN = 0
+.endm
+
+## FIELD name size align
+.macro FIELD name size align
+ _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1))
+ \name = _FIELD_OFFSET
+ _FIELD_OFFSET = _FIELD_OFFSET + (\size)
+.if (\align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = \align
+.endif
+.endm
+
+## END_FIELDS
+.macro END_FIELDS
+ _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
+.endm
+
+########################################################################
+
+.macro STRUCT p1
+START_FIELDS
+.struc \p1
+.endm
+
+.macro ENDSTRUCT
+ tmp = _FIELD_OFFSET
+ END_FIELDS
+ tmp = (_FIELD_OFFSET - %%tmp)
+.if (tmp > 0)
+ .lcomm tmp
+.endif
+.endstruc
+.endm
+
+## RES_int name size align
+.macro RES_int p1 p2 p3
+ name = \p1
+ size = \p2
+ align = .\p3
+
+ _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1))
+.align align
+.lcomm name size
+ _FIELD_OFFSET = _FIELD_OFFSET + (size)
+.if (align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = align
+.endif
+.endm
+
+
+
+# macro RES_B name, size [, align]
+.macro RES_B _name, _size, _align=1
+RES_int _name _size _align
+.endm
+
+# macro RES_W name, size [, align]
+.macro RES_W _name, _size, _align=2
+RES_int _name 2*(_size) _align
+.endm
+
+# macro RES_D name, size [, align]
+.macro RES_D _name, _size, _align=4
+RES_int _name 4*(_size) _align
+.endm
+
+# macro RES_Q name, size [, align]
+.macro RES_Q _name, _size, _align=8
+RES_int _name 8*(_size) _align
+.endm
+
+# macro RES_DQ name, size [, align]
+.macro RES_DQ _name, _size, _align=16
+RES_int _name 16*(_size) _align
+.endm
+
+# macro RES_Y name, size [, align]
+.macro RES_Y _name, _size, _align=32
+RES_int _name 32*(_size) _align
+.endm
+
+# macro RES_Z name, size [, align]
+.macro RES_Z _name, _size, _align=64
+RES_int _name 64*(_size) _align
+.endm
+
+
+#endif
+
+########################################################################
+#### Define constants
+########################################################################
+
+########################################################################
+#### Define SHA1 Out Of Order Data Structures
+########################################################################
+
+START_FIELDS # LANE_DATA
+### name size align
+FIELD _job_in_lane, 8, 8 # pointer to job object
+END_FIELDS
+
+_LANE_DATA_size = _FIELD_OFFSET
+_LANE_DATA_align = _STRUCT_ALIGN
+
+########################################################################
+
+START_FIELDS # SHA1_ARGS_X8
+### name size align
+FIELD _digest, 4*5*8, 16 # transposed digest
+FIELD _data_ptr, 8*8, 8 # array of pointers to data
+END_FIELDS
+
+_SHA1_ARGS_X4_size = _FIELD_OFFSET
+_SHA1_ARGS_X4_align = _STRUCT_ALIGN
+_SHA1_ARGS_X8_size = _FIELD_OFFSET
+_SHA1_ARGS_X8_align = _STRUCT_ALIGN
+
+########################################################################
+
+START_FIELDS # MB_MGR
+### name size align
+FIELD _args, _SHA1_ARGS_X4_size, _SHA1_ARGS_X4_align
+FIELD _lens, 4*8, 8
+FIELD _unused_lanes, 8, 8
+FIELD _ldata, _LANE_DATA_size*8, _LANE_DATA_align
+END_FIELDS
+
+_MB_MGR_size = _FIELD_OFFSET
+_MB_MGR_align = _STRUCT_ALIGN
+
+_args_digest = _args + _digest
+_args_data_ptr = _args + _data_ptr
+
+
+########################################################################
+#### Define constants
+########################################################################
+
+#define STS_UNKNOWN 0
+#define STS_BEING_PROCESSED 1
+#define STS_COMPLETED 2
+
+########################################################################
+#### Define JOB_SHA1 structure
+########################################################################
+
+START_FIELDS # JOB_SHA1
+
+### name size align
+FIELD _buffer, 8, 8 # pointer to buffer
+FIELD _len, 4, 4 # length in bytes
+FIELD _result_digest, 5*4, 32 # Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+END_FIELDS
+
+_JOB_SHA1_size = _FIELD_OFFSET
+_JOB_SHA1_align = _STRUCT_ALIGN
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S
new file mode 100644
index 000000000..7cfba738f
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S
@@ -0,0 +1,304 @@
+/*
+ * Flush routine for SHA1 multibuffer
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha1_mb_mgr_datastruct.S"
+
+
+.extern sha1_x8_avx2
+
+# LINUX register definitions
+#define arg1 %rdi
+#define arg2 %rsi
+
+# Common definitions
+#define state arg1
+#define job arg2
+#define len2 arg2
+
+# idx must be a register not clobbered by sha1_x8_avx2
+#define idx %r8
+#define DWORD_idx %r8d
+
+#define unused_lanes %rbx
+#define lane_data %rbx
+#define tmp2 %rbx
+#define tmp2_w %ebx
+
+#define job_rax %rax
+#define tmp1 %rax
+#define size_offset %rax
+#define tmp %rax
+#define start_offset %rax
+
+#define tmp3 %arg1
+
+#define extra_blocks %arg2
+#define p %arg2
+
+.macro LABEL prefix n
+\prefix\n\():
+.endm
+
+.macro JNE_SKIP i
+jne skip_\i
+.endm
+
+.altmacro
+.macro SET_OFFSET _offset
+offset = \_offset
+.endm
+.noaltmacro
+
+# JOB* sha1_mb_mgr_flush_avx2(MB_MGR *state)
+# arg 1 : rcx : state
+ENTRY(sha1_mb_mgr_flush_avx2)
+ FRAME_BEGIN
+ push %rbx
+
+ # If bit (32+3) is set, then all lanes are empty
+ mov _unused_lanes(state), unused_lanes
+ bt $32+3, unused_lanes
+ jc return_null
+
+ # find a lane with a non-null job
+ xor idx, idx
+ offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne one(%rip), idx
+ offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne two(%rip), idx
+ offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne three(%rip), idx
+ offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne four(%rip), idx
+ offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne five(%rip), idx
+ offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne six(%rip), idx
+ offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne seven(%rip), idx
+
+ # copy idx to empty lanes
+copy_lane_data:
+ offset = (_args + _data_ptr)
+ mov offset(state,idx,8), tmp
+
+ I = 0
+.rep 8
+ offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+.altmacro
+ JNE_SKIP %I
+ offset = (_args + _data_ptr + 8*I)
+ mov tmp, offset(state)
+ offset = (_lens + 4*I)
+ movl $0xFFFFFFFF, offset(state)
+LABEL skip_ %I
+ I = (I+1)
+.noaltmacro
+.endr
+
+ # Find min length
+ vmovdqu _lens+0*16(state), %xmm0
+ vmovdqu _lens+1*16(state), %xmm1
+
+ vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
+ vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
+ vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword
+
+ vmovd %xmm2, DWORD_idx
+ mov idx, len2
+ and $0xF, idx
+ shr $4, len2
+ jz len_is_0
+
+ vpand clear_low_nibble(%rip), %xmm2, %xmm2
+ vpshufd $0, %xmm2, %xmm2
+
+ vpsubd %xmm2, %xmm0, %xmm0
+ vpsubd %xmm2, %xmm1, %xmm1
+
+ vmovdqu %xmm0, _lens+0*16(state)
+ vmovdqu %xmm1, _lens+1*16(state)
+
+ # "state" and "args" are the same address, arg1
+ # len is arg2
+ call sha1_x8_avx2
+ # state and idx are intact
+
+
+len_is_0:
+ # process completed job "idx"
+ imul $_LANE_DATA_size, idx, lane_data
+ lea _ldata(state, lane_data), lane_data
+
+ mov _job_in_lane(lane_data), job_rax
+ movq $0, _job_in_lane(lane_data)
+ movl $STS_COMPLETED, _status(job_rax)
+ mov _unused_lanes(state), unused_lanes
+ shl $4, unused_lanes
+ or idx, unused_lanes
+ mov unused_lanes, _unused_lanes(state)
+
+ movl $0xFFFFFFFF, _lens(state, idx, 4)
+
+ vmovd _args_digest(state , idx, 4) , %xmm0
+ vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
+ vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
+ vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
+ movl _args_digest+4*32(state, idx, 4), tmp2_w
+
+ vmovdqu %xmm0, _result_digest(job_rax)
+ offset = (_result_digest + 1*16)
+ mov tmp2_w, offset(job_rax)
+
+return:
+ pop %rbx
+ FRAME_END
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+ENDPROC(sha1_mb_mgr_flush_avx2)
+
+
+#################################################################
+
+.align 16
+ENTRY(sha1_mb_mgr_get_comp_job_avx2)
+ push %rbx
+
+ ## if bit 32+3 is set, then all lanes are empty
+ mov _unused_lanes(state), unused_lanes
+ bt $(32+3), unused_lanes
+ jc .return_null
+
+ # Find min length
+ vmovdqu _lens(state), %xmm0
+ vmovdqu _lens+1*16(state), %xmm1
+
+ vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
+ vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
+ vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword
+
+ vmovd %xmm2, DWORD_idx
+ test $~0xF, idx
+ jnz .return_null
+
+ # process completed job "idx"
+ imul $_LANE_DATA_size, idx, lane_data
+ lea _ldata(state, lane_data), lane_data
+
+ mov _job_in_lane(lane_data), job_rax
+ movq $0, _job_in_lane(lane_data)
+ movl $STS_COMPLETED, _status(job_rax)
+ mov _unused_lanes(state), unused_lanes
+ shl $4, unused_lanes
+ or idx, unused_lanes
+ mov unused_lanes, _unused_lanes(state)
+
+ movl $0xFFFFFFFF, _lens(state, idx, 4)
+
+ vmovd _args_digest(state, idx, 4), %xmm0
+ vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
+ vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
+ vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
+ movl _args_digest+4*32(state, idx, 4), tmp2_w
+
+ vmovdqu %xmm0, _result_digest(job_rax)
+ movl tmp2_w, _result_digest+1*16(job_rax)
+
+ pop %rbx
+
+ ret
+
+.return_null:
+ xor job_rax, job_rax
+ pop %rbx
+ ret
+ENDPROC(sha1_mb_mgr_get_comp_job_avx2)
+
+.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
+.align 16
+clear_low_nibble:
+.octa 0x000000000000000000000000FFFFFFF0
+
+.section .rodata.cst8, "aM", @progbits, 8
+.align 8
+one:
+.quad 1
+two:
+.quad 2
+three:
+.quad 3
+four:
+.quad 4
+five:
+.quad 5
+six:
+.quad 6
+seven:
+.quad 7
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..d2add0d35
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c
@@ -0,0 +1,64 @@
+/*
+ * Initialization code for multi buffer SHA1 algorithm for AVX2
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "sha1_mb_mgr.h"
+
+void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF76543210ULL;
+ for (j = 0; j < 8; j++) {
+ state->lens[j] = 0xFFFFFFFF;
+ state->ldata[j].job_in_lane = NULL;
+ }
+}
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S
new file mode 100644
index 000000000..7a93b1c0d
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S
@@ -0,0 +1,209 @@
+/*
+ * Buffer submit code for multi buffer SHA1 algorithm
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha1_mb_mgr_datastruct.S"
+
+
+.extern sha1_x8_avx
+
+# LINUX register definitions
+arg1 = %rdi
+arg2 = %rsi
+size_offset = %rcx
+tmp2 = %rcx
+extra_blocks = %rdx
+
+# Common definitions
+#define state arg1
+#define job %rsi
+#define len2 arg2
+#define p2 arg2
+
+# idx must be a register not clobberred by sha1_x8_avx2
+idx = %r8
+DWORD_idx = %r8d
+last_len = %r8
+
+p = %r11
+start_offset = %r11
+
+unused_lanes = %rbx
+BYTE_unused_lanes = %bl
+
+job_rax = %rax
+len = %rax
+DWORD_len = %eax
+
+lane = %r12
+tmp3 = %r12
+
+tmp = %r9
+DWORD_tmp = %r9d
+
+lane_data = %r10
+
+# JOB* submit_mb_mgr_submit_avx2(MB_MGR *state, job_sha1 *job)
+# arg 1 : rcx : state
+# arg 2 : rdx : job
+ENTRY(sha1_mb_mgr_submit_avx2)
+ FRAME_BEGIN
+ push %rbx
+ push %r12
+
+ mov _unused_lanes(state), unused_lanes
+ mov unused_lanes, lane
+ and $0xF, lane
+ shr $4, unused_lanes
+ imul $_LANE_DATA_size, lane, lane_data
+ movl $STS_BEING_PROCESSED, _status(job)
+ lea _ldata(state, lane_data), lane_data
+ mov unused_lanes, _unused_lanes(state)
+ movl _len(job), DWORD_len
+
+ mov job, _job_in_lane(lane_data)
+ shl $4, len
+ or lane, len
+
+ movl DWORD_len, _lens(state , lane, 4)
+
+ # Load digest words from result_digest
+ vmovdqu _result_digest(job), %xmm0
+ mov _result_digest+1*16(job), DWORD_tmp
+ vmovd %xmm0, _args_digest(state, lane, 4)
+ vpextrd $1, %xmm0, _args_digest+1*32(state , lane, 4)
+ vpextrd $2, %xmm0, _args_digest+2*32(state , lane, 4)
+ vpextrd $3, %xmm0, _args_digest+3*32(state , lane, 4)
+ movl DWORD_tmp, _args_digest+4*32(state , lane, 4)
+
+ mov _buffer(job), p
+ mov p, _args_data_ptr(state, lane, 8)
+
+ cmp $0xF, unused_lanes
+ jne return_null
+
+start_loop:
+ # Find min length
+ vmovdqa _lens(state), %xmm0
+ vmovdqa _lens+1*16(state), %xmm1
+
+ vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
+ vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
+ vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword
+
+ vmovd %xmm2, DWORD_idx
+ mov idx, len2
+ and $0xF, idx
+ shr $4, len2
+ jz len_is_0
+
+ vpand clear_low_nibble(%rip), %xmm2, %xmm2
+ vpshufd $0, %xmm2, %xmm2
+
+ vpsubd %xmm2, %xmm0, %xmm0
+ vpsubd %xmm2, %xmm1, %xmm1
+
+ vmovdqa %xmm0, _lens + 0*16(state)
+ vmovdqa %xmm1, _lens + 1*16(state)
+
+
+ # "state" and "args" are the same address, arg1
+ # len is arg2
+ call sha1_x8_avx2
+
+ # state and idx are intact
+
+len_is_0:
+ # process completed job "idx"
+ imul $_LANE_DATA_size, idx, lane_data
+ lea _ldata(state, lane_data), lane_data
+
+ mov _job_in_lane(lane_data), job_rax
+ mov _unused_lanes(state), unused_lanes
+ movq $0, _job_in_lane(lane_data)
+ movl $STS_COMPLETED, _status(job_rax)
+ shl $4, unused_lanes
+ or idx, unused_lanes
+ mov unused_lanes, _unused_lanes(state)
+
+ movl $0xFFFFFFFF, _lens(state, idx, 4)
+
+ vmovd _args_digest(state, idx, 4), %xmm0
+ vpinsrd $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0
+ vpinsrd $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0
+ vpinsrd $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0
+ movl _args_digest+4*32(state, idx, 4), DWORD_tmp
+
+ vmovdqu %xmm0, _result_digest(job_rax)
+ movl DWORD_tmp, _result_digest+1*16(job_rax)
+
+return:
+ pop %r12
+ pop %rbx
+ FRAME_END
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ENDPROC(sha1_mb_mgr_submit_avx2)
+
+.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
+.align 16
+clear_low_nibble:
+ .octa 0x000000000000000000000000FFFFFFF0
diff --git a/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S
new file mode 100644
index 000000000..20f77aa63
--- /dev/null
+++ b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S
@@ -0,0 +1,492 @@
+/*
+ * Multi-buffer SHA1 algorithm hash compute routine
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include "sha1_mb_mgr_datastruct.S"
+
+## code to compute oct SHA1 using SSE-256
+## outer calling routine takes care of save and restore of XMM registers
+
+## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15# ymm0-15
+##
+## Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+## Linux preserves: rdi rbp r8
+##
+## clobbers ymm0-15
+
+
+# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+# "transpose" data in {r0...r7} using temps {t0...t1}
+# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+# r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+# r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+# r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+# r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+# r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+# r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+# r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+# r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+#
+# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+# r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+# r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+# r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+# r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+# r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+# r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+# r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+# r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+#
+
+.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
+ # process top half (r0..r3) {a...d}
+ vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+ vshufps $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
+
+ # use r2 in place of t0
+ # process bottom half (r4..r7) {e...h}
+ vshufps $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
+ vshufps $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
+
+ vperm2f128 $0x13, \r1, \r5, \r6 # h6...a6
+ vperm2f128 $0x02, \r1, \r5, \r2 # h2...a2
+ vperm2f128 $0x13, \r3, \r7, \r5 # h5...a5
+ vperm2f128 $0x02, \r3, \r7, \r1 # h1...a1
+ vperm2f128 $0x13, \r0, \r4, \r7 # h7...a7
+ vperm2f128 $0x02, \r0, \r4, \r3 # h3...a3
+ vperm2f128 $0x13, \t0, \t1, \r4 # h4...a4
+ vperm2f128 $0x02, \t0, \t1, \r0 # h0...a0
+
+.endm
+##
+## Magic functions defined in FIPS 180-1
+##
+# macro MAGIC_F0 F,B,C,D,T ## F = (D ^ (B & (C ^ D)))
+.macro MAGIC_F0 regF regB regC regD regT
+ vpxor \regD, \regC, \regF
+ vpand \regB, \regF, \regF
+ vpxor \regD, \regF, \regF
+.endm
+
+# macro MAGIC_F1 F,B,C,D,T ## F = (B ^ C ^ D)
+.macro MAGIC_F1 regF regB regC regD regT
+ vpxor \regC, \regD, \regF
+ vpxor \regB, \regF, \regF
+.endm
+
+# macro MAGIC_F2 F,B,C,D,T ## F = ((B & C) | (B & D) | (C & D))
+.macro MAGIC_F2 regF regB regC regD regT
+ vpor \regC, \regB, \regF
+ vpand \regC, \regB, \regT
+ vpand \regD, \regF, \regF
+ vpor \regT, \regF, \regF
+.endm
+
+# macro MAGIC_F3 F,B,C,D,T ## F = (B ^ C ^ D)
+.macro MAGIC_F3 regF regB regC regD regT
+ MAGIC_F1 \regF,\regB,\regC,\regD,\regT
+.endm
+
+# PROLD reg, imm, tmp
+.macro PROLD reg imm tmp
+ vpsrld $(32-\imm), \reg, \tmp
+ vpslld $\imm, \reg, \reg
+ vpor \tmp, \reg, \reg
+.endm
+
+.macro PROLD_nd reg imm tmp src
+ vpsrld $(32-\imm), \src, \tmp
+ vpslld $\imm, \src, \reg
+ vpor \tmp, \reg, \reg
+.endm
+
+.macro SHA1_STEP_00_15 regA regB regC regD regE regT regF memW immCNT MAGIC
+ vpaddd \immCNT, \regE, \regE
+ vpaddd \memW*32(%rsp), \regE, \regE
+ PROLD_nd \regT, 5, \regF, \regA
+ vpaddd \regT, \regE, \regE
+ \MAGIC \regF, \regB, \regC, \regD, \regT
+ PROLD \regB, 30, \regT
+ vpaddd \regF, \regE, \regE
+.endm
+
+.macro SHA1_STEP_16_79 regA regB regC regD regE regT regF memW immCNT MAGIC
+ vpaddd \immCNT, \regE, \regE
+ offset = ((\memW - 14) & 15) * 32
+ vmovdqu offset(%rsp), W14
+ vpxor W14, W16, W16
+ offset = ((\memW - 8) & 15) * 32
+ vpxor offset(%rsp), W16, W16
+ offset = ((\memW - 3) & 15) * 32
+ vpxor offset(%rsp), W16, W16
+ vpsrld $(32-1), W16, \regF
+ vpslld $1, W16, W16
+ vpor W16, \regF, \regF
+
+ ROTATE_W
+
+ offset = ((\memW - 0) & 15) * 32
+ vmovdqu \regF, offset(%rsp)
+ vpaddd \regF, \regE, \regE
+ PROLD_nd \regT, 5, \regF, \regA
+ vpaddd \regT, \regE, \regE
+ \MAGIC \regF,\regB,\regC,\regD,\regT ## FUN = MAGIC_Fi(B,C,D)
+ PROLD \regB,30, \regT
+ vpaddd \regF, \regE, \regE
+.endm
+
+########################################################################
+########################################################################
+########################################################################
+
+## FRAMESZ plus pushes must be an odd multiple of 8
+YMM_SAVE = (15-15)*32
+FRAMESZ = 32*16 + YMM_SAVE
+_YMM = FRAMESZ - YMM_SAVE
+
+#define VMOVPS vmovups
+
+IDX = %rax
+inp0 = %r9
+inp1 = %r10
+inp2 = %r11
+inp3 = %r12
+inp4 = %r13
+inp5 = %r14
+inp6 = %r15
+inp7 = %rcx
+arg1 = %rdi
+arg2 = %rsi
+RSP_SAVE = %rdx
+
+# ymm0 A
+# ymm1 B
+# ymm2 C
+# ymm3 D
+# ymm4 E
+# ymm5 F AA
+# ymm6 T0 BB
+# ymm7 T1 CC
+# ymm8 T2 DD
+# ymm9 T3 EE
+# ymm10 T4 TMP
+# ymm11 T5 FUN
+# ymm12 T6 K
+# ymm13 T7 W14
+# ymm14 T8 W15
+# ymm15 T9 W16
+
+
+A = %ymm0
+B = %ymm1
+C = %ymm2
+D = %ymm3
+E = %ymm4
+F = %ymm5
+T0 = %ymm6
+T1 = %ymm7
+T2 = %ymm8
+T3 = %ymm9
+T4 = %ymm10
+T5 = %ymm11
+T6 = %ymm12
+T7 = %ymm13
+T8 = %ymm14
+T9 = %ymm15
+
+AA = %ymm5
+BB = %ymm6
+CC = %ymm7
+DD = %ymm8
+EE = %ymm9
+TMP = %ymm10
+FUN = %ymm11
+K = %ymm12
+W14 = %ymm13
+W15 = %ymm14
+W16 = %ymm15
+
+.macro ROTATE_ARGS
+ TMP_ = E
+ E = D
+ D = C
+ C = B
+ B = A
+ A = TMP_
+.endm
+
+.macro ROTATE_W
+TMP_ = W16
+W16 = W15
+W15 = W14
+W14 = TMP_
+.endm
+
+# 8 streams x 5 32bit words per digest x 4 bytes per word
+#define DIGEST_SIZE (8*5*4)
+
+.align 32
+
+# void sha1_x8_avx2(void **input_data, UINT128 *digest, UINT32 size)
+# arg 1 : pointer to array[4] of pointer to input data
+# arg 2 : size (in blocks) ;; assumed to be >= 1
+#
+ENTRY(sha1_x8_avx2)
+
+ # save callee-saved clobbered registers to comply with C function ABI
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ #save rsp
+ mov %rsp, RSP_SAVE
+ sub $FRAMESZ, %rsp
+
+ #align rsp to 32 Bytes
+ and $~0x1F, %rsp
+
+ ## Initialize digests
+ vmovdqu 0*32(arg1), A
+ vmovdqu 1*32(arg1), B
+ vmovdqu 2*32(arg1), C
+ vmovdqu 3*32(arg1), D
+ vmovdqu 4*32(arg1), E
+
+ ## transpose input onto stack
+ mov _data_ptr+0*8(arg1),inp0
+ mov _data_ptr+1*8(arg1),inp1
+ mov _data_ptr+2*8(arg1),inp2
+ mov _data_ptr+3*8(arg1),inp3
+ mov _data_ptr+4*8(arg1),inp4
+ mov _data_ptr+5*8(arg1),inp5
+ mov _data_ptr+6*8(arg1),inp6
+ mov _data_ptr+7*8(arg1),inp7
+
+ xor IDX, IDX
+lloop:
+ vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), F
+ I=0
+.rep 2
+ VMOVPS (inp0, IDX), T0
+ VMOVPS (inp1, IDX), T1
+ VMOVPS (inp2, IDX), T2
+ VMOVPS (inp3, IDX), T3
+ VMOVPS (inp4, IDX), T4
+ VMOVPS (inp5, IDX), T5
+ VMOVPS (inp6, IDX), T6
+ VMOVPS (inp7, IDX), T7
+
+ TRANSPOSE8 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
+ vpshufb F, T0, T0
+ vmovdqu T0, (I*8)*32(%rsp)
+ vpshufb F, T1, T1
+ vmovdqu T1, (I*8+1)*32(%rsp)
+ vpshufb F, T2, T2
+ vmovdqu T2, (I*8+2)*32(%rsp)
+ vpshufb F, T3, T3
+ vmovdqu T3, (I*8+3)*32(%rsp)
+ vpshufb F, T4, T4
+ vmovdqu T4, (I*8+4)*32(%rsp)
+ vpshufb F, T5, T5
+ vmovdqu T5, (I*8+5)*32(%rsp)
+ vpshufb F, T6, T6
+ vmovdqu T6, (I*8+6)*32(%rsp)
+ vpshufb F, T7, T7
+ vmovdqu T7, (I*8+7)*32(%rsp)
+ add $32, IDX
+ I = (I+1)
+.endr
+ # save old digests
+ vmovdqu A,AA
+ vmovdqu B,BB
+ vmovdqu C,CC
+ vmovdqu D,DD
+ vmovdqu E,EE
+
+##
+## perform 0-79 steps
+##
+ vmovdqu K00_19(%rip), K
+## do rounds 0...15
+ I = 0
+.rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+ I = (I+1)
+.endr
+
+## do rounds 16...19
+ vmovdqu ((16 - 16) & 15) * 32 (%rsp), W16
+ vmovdqu ((16 - 15) & 15) * 32 (%rsp), W15
+.rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+ I = (I+1)
+.endr
+
+## do rounds 20...39
+ vmovdqu K20_39(%rip), K
+.rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+ ROTATE_ARGS
+ I = (I+1)
+.endr
+
+## do rounds 40...59
+ vmovdqu K40_59(%rip), K
+.rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+ ROTATE_ARGS
+ I = (I+1)
+.endr
+
+## do rounds 60...79
+ vmovdqu K60_79(%rip), K
+.rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+ ROTATE_ARGS
+ I = (I+1)
+.endr
+
+ vpaddd AA,A,A
+ vpaddd BB,B,B
+ vpaddd CC,C,C
+ vpaddd DD,D,D
+ vpaddd EE,E,E
+
+ sub $1, arg2
+ jne lloop
+
+ # write out digests
+ vmovdqu A, 0*32(arg1)
+ vmovdqu B, 1*32(arg1)
+ vmovdqu C, 2*32(arg1)
+ vmovdqu D, 3*32(arg1)
+ vmovdqu E, 4*32(arg1)
+
+ # update input pointers
+ add IDX, inp0
+ add IDX, inp1
+ add IDX, inp2
+ add IDX, inp3
+ add IDX, inp4
+ add IDX, inp5
+ add IDX, inp6
+ add IDX, inp7
+ mov inp0, _data_ptr (arg1)
+ mov inp1, _data_ptr + 1*8(arg1)
+ mov inp2, _data_ptr + 2*8(arg1)
+ mov inp3, _data_ptr + 3*8(arg1)
+ mov inp4, _data_ptr + 4*8(arg1)
+ mov inp5, _data_ptr + 5*8(arg1)
+ mov inp6, _data_ptr + 6*8(arg1)
+ mov inp7, _data_ptr + 7*8(arg1)
+
+ ################
+ ## Postamble
+
+ mov RSP_SAVE, %rsp
+
+ # restore callee-saved clobbered registers
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+
+ ret
+ENDPROC(sha1_x8_avx2)
+
+
+.section .rodata.cst32.K00_19, "aM", @progbits, 32
+.align 32
+K00_19:
+.octa 0x5A8279995A8279995A8279995A827999
+.octa 0x5A8279995A8279995A8279995A827999
+
+.section .rodata.cst32.K20_39, "aM", @progbits, 32
+.align 32
+K20_39:
+.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+
+.section .rodata.cst32.K40_59, "aM", @progbits, 32
+.align 32
+K40_59:
+.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+
+.section .rodata.cst32.K60_79, "aM", @progbits, 32
+.align 32
+K60_79:
+.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+
+.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
+PSHUFFLE_BYTE_FLIP_MASK:
+.octa 0x0c0d0e0f08090a0b0405060700010203
+.octa 0x0c0d0e0f08090a0b0405060700010203
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
new file mode 100644
index 000000000..9f712a7df
--- /dev/null
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -0,0 +1,711 @@
+/*
+ * Implement fast SHA-1 with AVX2 instructions. (x86_64)
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Ilya Albrekht <ilya.albrekht@intel.com>
+ * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
+ * Ronen Zohar <ronen.zohar@intel.com>
+ * Chandramouli Narayanan <mouli@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
+ *
+ *This implementation is based on the previous SSSE3 release:
+ *Visit http://software.intel.com/en-us/articles/
+ *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
+ *
+ *Updates 20-byte SHA-1 record in 'hash' for even number of
+ *'num_blocks' consecutive 64-byte blocks
+ *
+ *extern "C" void sha1_transform_avx2(
+ * int *hash, const char* input, size_t num_blocks );
+ */
+
+#include <linux/linkage.h>
+
+#define CTX %rdi /* arg1 */
+#define BUF %rsi /* arg2 */
+#define CNT %rdx /* arg3 */
+
+#define REG_A %ecx
+#define REG_B %esi
+#define REG_C %edi
+#define REG_D %eax
+#define REG_E %edx
+#define REG_TB %ebx
+#define REG_TA %r12d
+#define REG_RA %rcx
+#define REG_RB %rsi
+#define REG_RC %rdi
+#define REG_RD %rax
+#define REG_RE %rdx
+#define REG_RTA %r12
+#define REG_RTB %rbx
+#define REG_T1 %r11d
+#define xmm_mov vmovups
+#define avx2_zeroupper vzeroupper
+#define RND_F1 1
+#define RND_F2 2
+#define RND_F3 3
+
+.macro REGALLOC
+ .set A, REG_A
+ .set B, REG_B
+ .set C, REG_C
+ .set D, REG_D
+ .set E, REG_E
+ .set TB, REG_TB
+ .set TA, REG_TA
+
+ .set RA, REG_RA
+ .set RB, REG_RB
+ .set RC, REG_RC
+ .set RD, REG_RD
+ .set RE, REG_RE
+
+ .set RTA, REG_RTA
+ .set RTB, REG_RTB
+
+ .set T1, REG_T1
+.endm
+
+#define HASH_PTR %r9
+#define BLOCKS_CTR %r8
+#define BUFFER_PTR %r10
+#define BUFFER_PTR2 %r13
+
+#define PRECALC_BUF %r14
+#define WK_BUF %r15
+
+#define W_TMP %xmm0
+#define WY_TMP %ymm0
+#define WY_TMP2 %ymm9
+
+# AVX2 variables
+#define WY0 %ymm3
+#define WY4 %ymm5
+#define WY08 %ymm7
+#define WY12 %ymm8
+#define WY16 %ymm12
+#define WY20 %ymm13
+#define WY24 %ymm14
+#define WY28 %ymm15
+
+#define YMM_SHUFB_BSWAP %ymm10
+
+/*
+ * Keep 2 iterations precalculated at a time:
+ * - 80 DWORDs per iteration * 2
+ */
+#define W_SIZE (80*2*2 +16)
+
+#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
+#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF)
+
+
+.macro UPDATE_HASH hash, val
+ add \hash, \val
+ mov \val, \hash
+.endm
+
+.macro PRECALC_RESET_WY
+ .set WY_00, WY0
+ .set WY_04, WY4
+ .set WY_08, WY08
+ .set WY_12, WY12
+ .set WY_16, WY16
+ .set WY_20, WY20
+ .set WY_24, WY24
+ .set WY_28, WY28
+ .set WY_32, WY_00
+.endm
+
+.macro PRECALC_ROTATE_WY
+ /* Rotate macros */
+ .set WY_32, WY_28
+ .set WY_28, WY_24
+ .set WY_24, WY_20
+ .set WY_20, WY_16
+ .set WY_16, WY_12
+ .set WY_12, WY_08
+ .set WY_08, WY_04
+ .set WY_04, WY_00
+ .set WY_00, WY_32
+
+ /* Define register aliases */
+ .set WY, WY_00
+ .set WY_minus_04, WY_04
+ .set WY_minus_08, WY_08
+ .set WY_minus_12, WY_12
+ .set WY_minus_16, WY_16
+ .set WY_minus_20, WY_20
+ .set WY_minus_24, WY_24
+ .set WY_minus_28, WY_28
+ .set WY_minus_32, WY
+.endm
+
+.macro PRECALC_00_15
+ .if (i == 0) # Initialize and rotate registers
+ PRECALC_RESET_WY
+ PRECALC_ROTATE_WY
+ .endif
+
+ /* message scheduling pre-compute for rounds 0-15 */
+ .if ((i & 7) == 0)
+ /*
+ * blended AVX2 and ALU instruction scheduling
+ * 1 vector iteration per 8 rounds
+ */
+ vmovdqu (i * 2)(BUFFER_PTR), W_TMP
+ .elseif ((i & 7) == 1)
+ vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
+ WY_TMP, WY_TMP
+ .elseif ((i & 7) == 2)
+ vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
+ .elseif ((i & 7) == 4)
+ vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
+ .elseif ((i & 7) == 7)
+ vmovdqu WY_TMP, PRECALC_WK(i&~7)
+
+ PRECALC_ROTATE_WY
+ .endif
+.endm
+
+.macro PRECALC_16_31
+ /*
+ * message scheduling pre-compute for rounds 16-31
+ * calculating last 32 w[i] values in 8 XMM registers
+ * pre-calculate K+w[i] values and store to mem
+ * for later load by ALU add instruction
+ *
+ * "brute force" vectorization for rounds 16-31 only
+ * due to w[i]->w[i-3] dependency
+ */
+ .if ((i & 7) == 0)
+ /*
+ * blended AVX2 and ALU instruction scheduling
+ * 1 vector iteration per 8 rounds
+ */
+ /* w[i-14] */
+ vpalignr $8, WY_minus_16, WY_minus_12, WY
+ vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */
+ .elseif ((i & 7) == 1)
+ vpxor WY_minus_08, WY, WY
+ vpxor WY_minus_16, WY_TMP, WY_TMP
+ .elseif ((i & 7) == 2)
+ vpxor WY_TMP, WY, WY
+ vpslldq $12, WY, WY_TMP2
+ .elseif ((i & 7) == 3)
+ vpslld $1, WY, WY_TMP
+ vpsrld $31, WY, WY
+ .elseif ((i & 7) == 4)
+ vpor WY, WY_TMP, WY_TMP
+ vpslld $2, WY_TMP2, WY
+ .elseif ((i & 7) == 5)
+ vpsrld $30, WY_TMP2, WY_TMP2
+ vpxor WY, WY_TMP, WY_TMP
+ .elseif ((i & 7) == 7)
+ vpxor WY_TMP2, WY_TMP, WY
+ vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
+ vmovdqu WY_TMP, PRECALC_WK(i&~7)
+
+ PRECALC_ROTATE_WY
+ .endif
+.endm
+
+.macro PRECALC_32_79
+ /*
+ * in SHA-1 specification:
+ * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
+ * instead we do equal:
+ * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+ * allows more efficient vectorization
+ * since w[i]=>w[i-3] dependency is broken
+ */
+
+ .if ((i & 7) == 0)
+ /*
+ * blended AVX2 and ALU instruction scheduling
+ * 1 vector iteration per 8 rounds
+ */
+ vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
+ .elseif ((i & 7) == 1)
+ /* W is W_minus_32 before xor */
+ vpxor WY_minus_28, WY, WY
+ .elseif ((i & 7) == 2)
+ vpxor WY_minus_16, WY_TMP, WY_TMP
+ .elseif ((i & 7) == 3)
+ vpxor WY_TMP, WY, WY
+ .elseif ((i & 7) == 4)
+ vpslld $2, WY, WY_TMP
+ .elseif ((i & 7) == 5)
+ vpsrld $30, WY, WY
+ vpor WY, WY_TMP, WY
+ .elseif ((i & 7) == 7)
+ vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
+ vmovdqu WY_TMP, PRECALC_WK(i&~7)
+
+ PRECALC_ROTATE_WY
+ .endif
+.endm
+
+.macro PRECALC r, s
+ .set i, \r
+
+ .if (i < 40)
+ .set K_XMM, 32*0
+ .elseif (i < 80)
+ .set K_XMM, 32*1
+ .elseif (i < 120)
+ .set K_XMM, 32*2
+ .else
+ .set K_XMM, 32*3
+ .endif
+
+ .if (i<32)
+ PRECALC_00_15 \s
+ .elseif (i<64)
+ PRECALC_16_31 \s
+ .elseif (i < 160)
+ PRECALC_32_79 \s
+ .endif
+.endm
+
+.macro ROTATE_STATE
+ .set T_REG, E
+ .set E, D
+ .set D, C
+ .set C, B
+ .set B, TB
+ .set TB, A
+ .set A, T_REG
+
+ .set T_REG, RE
+ .set RE, RD
+ .set RD, RC
+ .set RC, RB
+ .set RB, RTB
+ .set RTB, RA
+ .set RA, T_REG
+.endm
+
+/* Macro relies on saved ROUND_Fx */
+
+.macro RND_FUN f, r
+ .if (\f == RND_F1)
+ ROUND_F1 \r
+ .elseif (\f == RND_F2)
+ ROUND_F2 \r
+ .elseif (\f == RND_F3)
+ ROUND_F3 \r
+ .endif
+.endm
+
+.macro RR r
+ .set round_id, (\r % 80)
+
+ .if (round_id == 0) /* Precalculate F for first round */
+ .set ROUND_FUNC, RND_F1
+ mov B, TB
+
+ rorx $(32-30), B, B /* b>>>2 */
+ andn D, TB, T1
+ and C, TB
+ xor T1, TB
+ .endif
+
+ RND_FUN ROUND_FUNC, \r
+ ROTATE_STATE
+
+ .if (round_id == 18)
+ .set ROUND_FUNC, RND_F2
+ .elseif (round_id == 38)
+ .set ROUND_FUNC, RND_F3
+ .elseif (round_id == 58)
+ .set ROUND_FUNC, RND_F2
+ .endif
+
+ .set round_id, ( (\r+1) % 80)
+
+ RND_FUN ROUND_FUNC, (\r+1)
+ ROTATE_STATE
+.endm
+
+.macro ROUND_F1 r
+ add WK(\r), E
+
+ andn C, A, T1 /* ~b&d */
+ lea (RE,RTB), E /* Add F from the previous round */
+
+ rorx $(32-5), A, TA /* T2 = A >>> 5 */
+ rorx $(32-30),A, TB /* b>>>2 for next round */
+
+ PRECALC (\r) /* msg scheduling for next 2 blocks */
+
+ /*
+ * Calculate F for the next round
+ * (b & c) ^ andn[b, d]
+ */
+ and B, A /* b&c */
+ xor T1, A /* F1 = (b&c) ^ (~b&d) */
+
+ lea (RE,RTA), E /* E += A >>> 5 */
+.endm
+
+.macro ROUND_F2 r
+ add WK(\r), E
+ lea (RE,RTB), E /* Add F from the previous round */
+
+ /* Calculate F for the next round */
+ rorx $(32-5), A, TA /* T2 = A >>> 5 */
+ .if ((round_id) < 79)
+ rorx $(32-30), A, TB /* b>>>2 for next round */
+ .endif
+ PRECALC (\r) /* msg scheduling for next 2 blocks */
+
+ .if ((round_id) < 79)
+ xor B, A
+ .endif
+
+ add TA, E /* E += A >>> 5 */
+
+ .if ((round_id) < 79)
+ xor C, A
+ .endif
+.endm
+
+.macro ROUND_F3 r
+ add WK(\r), E
+ PRECALC (\r) /* msg scheduling for next 2 blocks */
+
+ lea (RE,RTB), E /* Add F from the previous round */
+
+ mov B, T1
+ or A, T1
+
+ rorx $(32-5), A, TA /* T2 = A >>> 5 */
+ rorx $(32-30), A, TB /* b>>>2 for next round */
+
+ /* Calculate F for the next round
+ * (b and c) or (d and (b or c))
+ */
+ and C, T1
+ and B, A
+ or T1, A
+
+ add TA, E /* E += A >>> 5 */
+
+.endm
+
+/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
+ * %1 + %2 >= %3 ? %4 : 0
+ */
+.macro ADD_IF_GE a, b, c, d
+ mov \a, RTA
+ add $\d, RTA
+ cmp $\c, \b
+ cmovge RTA, \a
+.endm
+
+/*
+ * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
+ */
+.macro SHA1_PIPELINED_MAIN_BODY
+
+ REGALLOC
+
+ mov (HASH_PTR), A
+ mov 4(HASH_PTR), B
+ mov 8(HASH_PTR), C
+ mov 12(HASH_PTR), D
+ mov 16(HASH_PTR), E
+
+ mov %rsp, PRECALC_BUF
+ lea (2*4*80+32)(%rsp), WK_BUF
+
+ # Precalc WK for first 2 blocks
+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
+ .set i, 0
+ .rept 160
+ PRECALC i
+ .set i, i + 1
+ .endr
+
+ /* Go to next block if needed */
+ ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
+ xchg WK_BUF, PRECALC_BUF
+
+ .align 32
+_loop:
+ /*
+ * code loops through more than one block
+ * we use K_BASE value as a signal of a last block,
+ * it is set below by: cmovae BUFFER_PTR, K_BASE
+ */
+ test BLOCKS_CTR, BLOCKS_CTR
+ jnz _begin
+ .align 32
+ jmp _end
+ .align 32
+_begin:
+
+ /*
+ * Do first block
+ * rounds: 0,2,4,6,8
+ */
+ .set j, 0
+ .rept 5
+ RR j
+ .set j, j+2
+ .endr
+
+ jmp _loop0
+_loop0:
+
+ /*
+ * rounds:
+ * 10,12,14,16,18
+ * 20,22,24,26,28
+ * 30,32,34,36,38
+ * 40,42,44,46,48
+ * 50,52,54,56,58
+ */
+ .rept 25
+ RR j
+ .set j, j+2
+ .endr
+
+ /* Update Counter */
+ sub $1, BLOCKS_CTR
+ /* Move to the next block only if needed*/
+ ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
+ /*
+ * rounds
+ * 60,62,64,66,68
+ * 70,72,74,76,78
+ */
+ .rept 10
+ RR j
+ .set j, j+2
+ .endr
+
+ UPDATE_HASH (HASH_PTR), A
+ UPDATE_HASH 4(HASH_PTR), TB
+ UPDATE_HASH 8(HASH_PTR), C
+ UPDATE_HASH 12(HASH_PTR), D
+ UPDATE_HASH 16(HASH_PTR), E
+
+ test BLOCKS_CTR, BLOCKS_CTR
+ jz _loop
+
+ mov TB, B
+
+ /* Process second block */
+ /*
+ * rounds
+ * 0+80, 2+80, 4+80, 6+80, 8+80
+ * 10+80,12+80,14+80,16+80,18+80
+ */
+
+ .set j, 0
+ .rept 10
+ RR j+80
+ .set j, j+2
+ .endr
+
+ jmp _loop1
+_loop1:
+ /*
+ * rounds
+ * 20+80,22+80,24+80,26+80,28+80
+ * 30+80,32+80,34+80,36+80,38+80
+ */
+ .rept 10
+ RR j+80
+ .set j, j+2
+ .endr
+
+ jmp _loop2
+_loop2:
+
+ /*
+ * rounds
+ * 40+80,42+80,44+80,46+80,48+80
+ * 50+80,52+80,54+80,56+80,58+80
+ */
+ .rept 10
+ RR j+80
+ .set j, j+2
+ .endr
+
+ /* update counter */
+ sub $1, BLOCKS_CTR
+ /* Move to the next block only if needed*/
+ ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
+
+ jmp _loop3
+_loop3:
+
+ /*
+ * rounds
+ * 60+80,62+80,64+80,66+80,68+80
+ * 70+80,72+80,74+80,76+80,78+80
+ */
+ .rept 10
+ RR j+80
+ .set j, j+2
+ .endr
+
+ UPDATE_HASH (HASH_PTR), A
+ UPDATE_HASH 4(HASH_PTR), TB
+ UPDATE_HASH 8(HASH_PTR), C
+ UPDATE_HASH 12(HASH_PTR), D
+ UPDATE_HASH 16(HASH_PTR), E
+
+ /* Reset state for AVX2 reg permutation */
+ mov A, TA
+ mov TB, A
+ mov C, TB
+ mov E, C
+ mov D, B
+ mov TA, D
+
+ REGALLOC
+
+ xchg WK_BUF, PRECALC_BUF
+
+ jmp _loop
+
+ .align 32
+ _end:
+
+.endm
+/*
+ * macro implements SHA-1 function's body for several 64-byte blocks
+ * param: function's name
+ */
+.macro SHA1_VECTOR_ASM name
+ ENTRY(\name)
+
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ RESERVE_STACK = (W_SIZE*4 + 8+24)
+
+ /* Align stack */
+ mov %rsp, %rbx
+ and $~(0x20-1), %rsp
+ push %rbx
+ sub $RESERVE_STACK, %rsp
+
+ avx2_zeroupper
+
+ /* Setup initial values */
+ mov CTX, HASH_PTR
+ mov BUF, BUFFER_PTR
+
+ mov BUF, BUFFER_PTR2
+ mov CNT, BLOCKS_CTR
+
+ xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
+
+ SHA1_PIPELINED_MAIN_BODY
+
+ avx2_zeroupper
+
+ add $RESERVE_STACK, %rsp
+ pop %rsp
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+
+ ret
+
+ ENDPROC(\name)
+.endm
+
+.section .rodata
+
+#define K1 0x5a827999
+#define K2 0x6ed9eba1
+#define K3 0x8f1bbcdc
+#define K4 0xca62c1d6
+
+.align 128
+K_XMM_AR:
+ .long K1, K1, K1, K1
+ .long K1, K1, K1, K1
+ .long K2, K2, K2, K2
+ .long K2, K2, K2, K2
+ .long K3, K3, K3, K3
+ .long K3, K3, K3, K3
+ .long K4, K4, K4, K4
+ .long K4, K4, K4, K4
+
+BSWAP_SHUFB_CTL:
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x08090a0b
+ .long 0x0c0d0e0f
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x08090a0b
+ .long 0x0c0d0e0f
+.text
+
+SHA1_VECTOR_ASM sha1_transform_avx2
diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S
new file mode 100644
index 000000000..ebbdba72a
--- /dev/null
+++ b/arch/x86/crypto/sha1_ni_asm.S
@@ -0,0 +1,304 @@
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-1 update function
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define DIGEST_PTR %rdi /* 1st arg */
+#define DATA_PTR %rsi /* 2nd arg */
+#define NUM_BLKS %rdx /* 3rd arg */
+
+#define RSPSAVE %rax
+
+/* gcc conversion */
+#define FRAME_SIZE 32 /* space for 2x16 bytes */
+
+#define ABCD %xmm0
+#define E0 %xmm1 /* Need two E's b/c they ping pong */
+#define E1 %xmm2
+#define MSG0 %xmm3
+#define MSG1 %xmm4
+#define MSG2 %xmm5
+#define MSG3 %xmm6
+#define SHUF_MASK %xmm7
+
+
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-1 update function
+ *
+ * The function takes a pointer to the current hash values, a pointer to the
+ * input data, and a number of 64 byte blocks to process. Once all blocks have
+ * been processed, the digest pointer is updated with the resulting hash value.
+ * The function only processes complete blocks, there is no functionality to
+ * store partial blocks. All message padding and hash value initialization must
+ * be done outside the update function.
+ *
+ * The indented lines in the loop are instructions related to rounds processing.
+ * The non-indented lines are instructions related to the message schedule.
+ *
+ * void sha1_ni_transform(uint32_t *digest, const void *data,
+ uint32_t numBlocks)
+ * digest : pointer to digest
+ * data: pointer to input data
+ * numBlocks: Number of blocks to process
+ */
+.text
+.align 32
+ENTRY(sha1_ni_transform)
+ mov %rsp, RSPSAVE
+ sub $FRAME_SIZE, %rsp
+ and $~0xF, %rsp
+
+ shl $6, NUM_BLKS /* convert to bytes */
+ jz .Ldone_hash
+ add DATA_PTR, NUM_BLKS /* pointer to end of data */
+
+ /* load initial hash values */
+ pinsrd $3, 1*16(DIGEST_PTR), E0
+ movdqu 0*16(DIGEST_PTR), ABCD
+ pand UPPER_WORD_MASK(%rip), E0
+ pshufd $0x1B, ABCD, ABCD
+
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+
+.Lloop0:
+ /* Save hash values for addition after rounds */
+ movdqa E0, (0*16)(%rsp)
+ movdqa ABCD, (1*16)(%rsp)
+
+ /* Rounds 0-3 */
+ movdqu 0*16(DATA_PTR), MSG0
+ pshufb SHUF_MASK, MSG0
+ paddd MSG0, E0
+ movdqa ABCD, E1
+ sha1rnds4 $0, E0, ABCD
+
+ /* Rounds 4-7 */
+ movdqu 1*16(DATA_PTR), MSG1
+ pshufb SHUF_MASK, MSG1
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1rnds4 $0, E1, ABCD
+ sha1msg1 MSG1, MSG0
+
+ /* Rounds 8-11 */
+ movdqu 2*16(DATA_PTR), MSG2
+ pshufb SHUF_MASK, MSG2
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1rnds4 $0, E0, ABCD
+ sha1msg1 MSG2, MSG1
+ pxor MSG2, MSG0
+
+ /* Rounds 12-15 */
+ movdqu 3*16(DATA_PTR), MSG3
+ pshufb SHUF_MASK, MSG3
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG3, MSG0
+ sha1rnds4 $0, E1, ABCD
+ sha1msg1 MSG3, MSG2
+ pxor MSG3, MSG1
+
+ /* Rounds 16-19 */
+ sha1nexte MSG0, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG0, MSG1
+ sha1rnds4 $0, E0, ABCD
+ sha1msg1 MSG0, MSG3
+ pxor MSG0, MSG2
+
+ /* Rounds 20-23 */
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG1, MSG2
+ sha1rnds4 $1, E1, ABCD
+ sha1msg1 MSG1, MSG0
+ pxor MSG1, MSG3
+
+ /* Rounds 24-27 */
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG2, MSG3
+ sha1rnds4 $1, E0, ABCD
+ sha1msg1 MSG2, MSG1
+ pxor MSG2, MSG0
+
+ /* Rounds 28-31 */
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG3, MSG0
+ sha1rnds4 $1, E1, ABCD
+ sha1msg1 MSG3, MSG2
+ pxor MSG3, MSG1
+
+ /* Rounds 32-35 */
+ sha1nexte MSG0, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG0, MSG1
+ sha1rnds4 $1, E0, ABCD
+ sha1msg1 MSG0, MSG3
+ pxor MSG0, MSG2
+
+ /* Rounds 36-39 */
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG1, MSG2
+ sha1rnds4 $1, E1, ABCD
+ sha1msg1 MSG1, MSG0
+ pxor MSG1, MSG3
+
+ /* Rounds 40-43 */
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG2, MSG3
+ sha1rnds4 $2, E0, ABCD
+ sha1msg1 MSG2, MSG1
+ pxor MSG2, MSG0
+
+ /* Rounds 44-47 */
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG3, MSG0
+ sha1rnds4 $2, E1, ABCD
+ sha1msg1 MSG3, MSG2
+ pxor MSG3, MSG1
+
+ /* Rounds 48-51 */
+ sha1nexte MSG0, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG0, MSG1
+ sha1rnds4 $2, E0, ABCD
+ sha1msg1 MSG0, MSG3
+ pxor MSG0, MSG2
+
+ /* Rounds 52-55 */
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG1, MSG2
+ sha1rnds4 $2, E1, ABCD
+ sha1msg1 MSG1, MSG0
+ pxor MSG1, MSG3
+
+ /* Rounds 56-59 */
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG2, MSG3
+ sha1rnds4 $2, E0, ABCD
+ sha1msg1 MSG2, MSG1
+ pxor MSG2, MSG0
+
+ /* Rounds 60-63 */
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG3, MSG0
+ sha1rnds4 $3, E1, ABCD
+ sha1msg1 MSG3, MSG2
+ pxor MSG3, MSG1
+
+ /* Rounds 64-67 */
+ sha1nexte MSG0, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG0, MSG1
+ sha1rnds4 $3, E0, ABCD
+ sha1msg1 MSG0, MSG3
+ pxor MSG0, MSG2
+
+ /* Rounds 68-71 */
+ sha1nexte MSG1, E1
+ movdqa ABCD, E0
+ sha1msg2 MSG1, MSG2
+ sha1rnds4 $3, E1, ABCD
+ pxor MSG1, MSG3
+
+ /* Rounds 72-75 */
+ sha1nexte MSG2, E0
+ movdqa ABCD, E1
+ sha1msg2 MSG2, MSG3
+ sha1rnds4 $3, E0, ABCD
+
+ /* Rounds 76-79 */
+ sha1nexte MSG3, E1
+ movdqa ABCD, E0
+ sha1rnds4 $3, E1, ABCD
+
+ /* Add current hash values with previously saved */
+ sha1nexte (0*16)(%rsp), E0
+ paddd (1*16)(%rsp), ABCD
+
+ /* Increment data pointer and loop if more to process */
+ add $64, DATA_PTR
+ cmp NUM_BLKS, DATA_PTR
+ jne .Lloop0
+
+ /* Write hash values back in the correct order */
+ pshufd $0x1B, ABCD, ABCD
+ movdqu ABCD, 0*16(DIGEST_PTR)
+ pextrd $3, E0, 1*16(DIGEST_PTR)
+
+.Ldone_hash:
+ mov RSPSAVE, %rsp
+
+ ret
+ENDPROC(sha1_ni_transform)
+
+.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x000102030405060708090a0b0c0d0e0f
+
+.section .rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16
+.align 16
+UPPER_WORD_MASK:
+ .octa 0xFFFFFFFF000000000000000000000000
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
new file mode 100644
index 000000000..613d0bfc3
--- /dev/null
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -0,0 +1,557 @@
+/*
+ * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
+ * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
+ * processors. CPUs supporting Intel(R) AVX extensions will get an additional
+ * boost.
+ *
+ * This work was inspired by the vectorized implementation of Dean Gaudet.
+ * Additional information on it can be found at:
+ * http://www.arctic.org/~dean/crypto/sha1.html
+ *
+ * It was improved upon with more efficient vectorization of the message
+ * scheduling. This implementation has also been optimized for all current and
+ * several future generations of Intel CPUs.
+ *
+ * See this article for more information about the implementation details:
+ * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
+ *
+ * Copyright (C) 2010, Intel Corp.
+ * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
+ * Ronen Zohar <ronen.zohar@intel.com>
+ *
+ * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
+ * Author: Mathias Krause <minipli@googlemail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+#define CTX %rdi // arg1
+#define BUF %rsi // arg2
+#define CNT %rdx // arg3
+
+#define REG_A %ecx
+#define REG_B %esi
+#define REG_C %edi
+#define REG_D %r12d
+#define REG_E %edx
+
+#define REG_T1 %eax
+#define REG_T2 %ebx
+
+#define K_BASE %r8
+#define HASH_PTR %r9
+#define BUFFER_PTR %r10
+#define BUFFER_END %r11
+
+#define W_TMP1 %xmm0
+#define W_TMP2 %xmm9
+
+#define W0 %xmm1
+#define W4 %xmm2
+#define W8 %xmm3
+#define W12 %xmm4
+#define W16 %xmm5
+#define W20 %xmm6
+#define W24 %xmm7
+#define W28 %xmm8
+
+#define XMM_SHUFB_BSWAP %xmm10
+
+/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */
+#define WK(t) (((t) & 15) * 4)(%rsp)
+#define W_PRECALC_AHEAD 16
+
+/*
+ * This macro implements the SHA-1 function's body for single 64-byte block
+ * param: function's name
+ */
+.macro SHA1_VECTOR_ASM name
+ ENTRY(\name)
+
+ push %rbx
+ push %r12
+ push %rbp
+ mov %rsp, %rbp
+
+ sub $64, %rsp # allocate workspace
+ and $~15, %rsp # align stack
+
+ mov CTX, HASH_PTR
+ mov BUF, BUFFER_PTR
+
+ shl $6, CNT # multiply by 64
+ add BUF, CNT
+ mov CNT, BUFFER_END
+
+ lea K_XMM_AR(%rip), K_BASE
+ xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP
+
+ SHA1_PIPELINED_MAIN_BODY
+
+ # cleanup workspace
+ mov $8, %ecx
+ mov %rsp, %rdi
+ xor %eax, %eax
+ rep stosq
+
+ mov %rbp, %rsp # deallocate workspace
+ pop %rbp
+ pop %r12
+ pop %rbx
+ ret
+
+ ENDPROC(\name)
+.endm
+
+/*
+ * This macro implements 80 rounds of SHA-1 for one 64-byte block
+ */
+.macro SHA1_PIPELINED_MAIN_BODY
+ INIT_REGALLOC
+
+ mov (HASH_PTR), A
+ mov 4(HASH_PTR), B
+ mov 8(HASH_PTR), C
+ mov 12(HASH_PTR), D
+ mov 16(HASH_PTR), E
+
+ .set i, 0
+ .rept W_PRECALC_AHEAD
+ W_PRECALC i
+ .set i, (i+1)
+ .endr
+
+.align 4
+1:
+ RR F1,A,B,C,D,E,0
+ RR F1,D,E,A,B,C,2
+ RR F1,B,C,D,E,A,4
+ RR F1,E,A,B,C,D,6
+ RR F1,C,D,E,A,B,8
+
+ RR F1,A,B,C,D,E,10
+ RR F1,D,E,A,B,C,12
+ RR F1,B,C,D,E,A,14
+ RR F1,E,A,B,C,D,16
+ RR F1,C,D,E,A,B,18
+
+ RR F2,A,B,C,D,E,20
+ RR F2,D,E,A,B,C,22
+ RR F2,B,C,D,E,A,24
+ RR F2,E,A,B,C,D,26
+ RR F2,C,D,E,A,B,28
+
+ RR F2,A,B,C,D,E,30
+ RR F2,D,E,A,B,C,32
+ RR F2,B,C,D,E,A,34
+ RR F2,E,A,B,C,D,36
+ RR F2,C,D,E,A,B,38
+
+ RR F3,A,B,C,D,E,40
+ RR F3,D,E,A,B,C,42
+ RR F3,B,C,D,E,A,44
+ RR F3,E,A,B,C,D,46
+ RR F3,C,D,E,A,B,48
+
+ RR F3,A,B,C,D,E,50
+ RR F3,D,E,A,B,C,52
+ RR F3,B,C,D,E,A,54
+ RR F3,E,A,B,C,D,56
+ RR F3,C,D,E,A,B,58
+
+ add $64, BUFFER_PTR # move to the next 64-byte block
+ cmp BUFFER_END, BUFFER_PTR # if the current is the last one use
+ cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun
+
+ RR F4,A,B,C,D,E,60
+ RR F4,D,E,A,B,C,62
+ RR F4,B,C,D,E,A,64
+ RR F4,E,A,B,C,D,66
+ RR F4,C,D,E,A,B,68
+
+ RR F4,A,B,C,D,E,70
+ RR F4,D,E,A,B,C,72
+ RR F4,B,C,D,E,A,74
+ RR F4,E,A,B,C,D,76
+ RR F4,C,D,E,A,B,78
+
+ UPDATE_HASH (HASH_PTR), A
+ UPDATE_HASH 4(HASH_PTR), B
+ UPDATE_HASH 8(HASH_PTR), C
+ UPDATE_HASH 12(HASH_PTR), D
+ UPDATE_HASH 16(HASH_PTR), E
+
+ RESTORE_RENAMED_REGS
+ cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end
+ jne 1b
+.endm
+
+.macro INIT_REGALLOC
+ .set A, REG_A
+ .set B, REG_B
+ .set C, REG_C
+ .set D, REG_D
+ .set E, REG_E
+ .set T1, REG_T1
+ .set T2, REG_T2
+.endm
+
+.macro RESTORE_RENAMED_REGS
+ # order is important (REG_C is where it should be)
+ mov B, REG_B
+ mov D, REG_D
+ mov A, REG_A
+ mov E, REG_E
+.endm
+
+.macro SWAP_REG_NAMES a, b
+ .set _T, \a
+ .set \a, \b
+ .set \b, _T
+.endm
+
+.macro F1 b, c, d
+ mov \c, T1
+ SWAP_REG_NAMES \c, T1
+ xor \d, T1
+ and \b, T1
+ xor \d, T1
+.endm
+
+.macro F2 b, c, d
+ mov \d, T1
+ SWAP_REG_NAMES \d, T1
+ xor \c, T1
+ xor \b, T1
+.endm
+
+.macro F3 b, c ,d
+ mov \c, T1
+ SWAP_REG_NAMES \c, T1
+ mov \b, T2
+ or \b, T1
+ and \c, T2
+ and \d, T1
+ or T2, T1
+.endm
+
+.macro F4 b, c, d
+ F2 \b, \c, \d
+.endm
+
+.macro UPDATE_HASH hash, val
+ add \hash, \val
+ mov \val, \hash
+.endm
+
+/*
+ * RR does two rounds of SHA-1 back to back with W[] pre-calc
+ * t1 = F(b, c, d); e += w(i)
+ * e += t1; b <<= 30; d += w(i+1);
+ * t1 = F(a, b, c);
+ * d += t1; a <<= 5;
+ * e += a;
+ * t1 = e; a >>= 7;
+ * t1 <<= 5;
+ * d += t1;
+ */
+.macro RR F, a, b, c, d, e, round
+ add WK(\round), \e
+ \F \b, \c, \d # t1 = F(b, c, d);
+ W_PRECALC (\round + W_PRECALC_AHEAD)
+ rol $30, \b
+ add T1, \e
+ add WK(\round + 1), \d
+
+ \F \a, \b, \c
+ W_PRECALC (\round + W_PRECALC_AHEAD + 1)
+ rol $5, \a
+ add \a, \e
+ add T1, \d
+ ror $7, \a # (a <<r 5) >>r 7) => a <<r 30)
+
+ mov \e, T1
+ SWAP_REG_NAMES \e, T1
+
+ rol $5, T1
+ add T1, \d
+
+ # write: \a, \b
+ # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
+.endm
+
+.macro W_PRECALC r
+ .set i, \r
+
+ .if (i < 20)
+ .set K_XMM, 0
+ .elseif (i < 40)
+ .set K_XMM, 16
+ .elseif (i < 60)
+ .set K_XMM, 32
+ .elseif (i < 80)
+ .set K_XMM, 48
+ .endif
+
+ .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
+ .set i, ((\r) % 80) # pre-compute for the next iteration
+ .if (i == 0)
+ W_PRECALC_RESET
+ .endif
+ W_PRECALC_00_15
+ .elseif (i<32)
+ W_PRECALC_16_31
+ .elseif (i < 80) // rounds 32-79
+ W_PRECALC_32_79
+ .endif
+.endm
+
+.macro W_PRECALC_RESET
+ .set W, W0
+ .set W_minus_04, W4
+ .set W_minus_08, W8
+ .set W_minus_12, W12
+ .set W_minus_16, W16
+ .set W_minus_20, W20
+ .set W_minus_24, W24
+ .set W_minus_28, W28
+ .set W_minus_32, W
+.endm
+
+.macro W_PRECALC_ROTATE
+ .set W_minus_32, W_minus_28
+ .set W_minus_28, W_minus_24
+ .set W_minus_24, W_minus_20
+ .set W_minus_20, W_minus_16
+ .set W_minus_16, W_minus_12
+ .set W_minus_12, W_minus_08
+ .set W_minus_08, W_minus_04
+ .set W_minus_04, W
+ .set W, W_minus_32
+.endm
+
+.macro W_PRECALC_SSSE3
+
+.macro W_PRECALC_00_15
+ W_PRECALC_00_15_SSSE3
+.endm
+.macro W_PRECALC_16_31
+ W_PRECALC_16_31_SSSE3
+.endm
+.macro W_PRECALC_32_79
+ W_PRECALC_32_79_SSSE3
+.endm
+
+/* message scheduling pre-compute for rounds 0-15 */
+.macro W_PRECALC_00_15_SSSE3
+ .if ((i & 3) == 0)
+ movdqu (i*4)(BUFFER_PTR), W_TMP1
+ .elseif ((i & 3) == 1)
+ pshufb XMM_SHUFB_BSWAP, W_TMP1
+ movdqa W_TMP1, W
+ .elseif ((i & 3) == 2)
+ paddd (K_BASE), W_TMP1
+ .elseif ((i & 3) == 3)
+ movdqa W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+/* message scheduling pre-compute for rounds 16-31
+ *
+ * - calculating last 32 w[i] values in 8 XMM registers
+ * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
+ * instruction
+ *
+ * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
+ * dependency, but improves for 32-79
+ */
+.macro W_PRECALC_16_31_SSSE3
+ # blended scheduling of vector and scalar instruction streams, one 4-wide
+ # vector iteration / 4 scalar rounds
+ .if ((i & 3) == 0)
+ movdqa W_minus_12, W
+ palignr $8, W_minus_16, W # w[i-14]
+ movdqa W_minus_04, W_TMP1
+ psrldq $4, W_TMP1 # w[i-3]
+ pxor W_minus_08, W
+ .elseif ((i & 3) == 1)
+ pxor W_minus_16, W_TMP1
+ pxor W_TMP1, W
+ movdqa W, W_TMP2
+ movdqa W, W_TMP1
+ pslldq $12, W_TMP2
+ .elseif ((i & 3) == 2)
+ psrld $31, W
+ pslld $1, W_TMP1
+ por W, W_TMP1
+ movdqa W_TMP2, W
+ psrld $30, W_TMP2
+ pslld $2, W
+ .elseif ((i & 3) == 3)
+ pxor W, W_TMP1
+ pxor W_TMP2, W_TMP1
+ movdqa W_TMP1, W
+ paddd K_XMM(K_BASE), W_TMP1
+ movdqa W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+/* message scheduling pre-compute for rounds 32-79
+ *
+ * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
+ * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+ * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
+ */
+.macro W_PRECALC_32_79_SSSE3
+ .if ((i & 3) == 0)
+ movdqa W_minus_04, W_TMP1
+ pxor W_minus_28, W # W is W_minus_32 before xor
+ palignr $8, W_minus_08, W_TMP1
+ .elseif ((i & 3) == 1)
+ pxor W_minus_16, W
+ pxor W_TMP1, W
+ movdqa W, W_TMP1
+ .elseif ((i & 3) == 2)
+ psrld $30, W
+ pslld $2, W_TMP1
+ por W, W_TMP1
+ .elseif ((i & 3) == 3)
+ movdqa W_TMP1, W
+ paddd K_XMM(K_BASE), W_TMP1
+ movdqa W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+.endm // W_PRECALC_SSSE3
+
+
+#define K1 0x5a827999
+#define K2 0x6ed9eba1
+#define K3 0x8f1bbcdc
+#define K4 0xca62c1d6
+
+.section .rodata
+.align 16
+
+K_XMM_AR:
+ .long K1, K1, K1, K1
+ .long K2, K2, K2, K2
+ .long K3, K3, K3, K3
+ .long K4, K4, K4, K4
+
+BSWAP_SHUFB_CTL:
+ .long 0x00010203
+ .long 0x04050607
+ .long 0x08090a0b
+ .long 0x0c0d0e0f
+
+
+.section .text
+
+W_PRECALC_SSSE3
+.macro xmm_mov a, b
+ movdqu \a,\b
+.endm
+
+/* SSSE3 optimized implementation:
+ * extern "C" void sha1_transform_ssse3(u32 *digest, const char *data, u32 *ws,
+ * unsigned int rounds);
+ */
+SHA1_VECTOR_ASM sha1_transform_ssse3
+
+#ifdef CONFIG_AS_AVX
+
+.macro W_PRECALC_AVX
+
+.purgem W_PRECALC_00_15
+.macro W_PRECALC_00_15
+ W_PRECALC_00_15_AVX
+.endm
+.purgem W_PRECALC_16_31
+.macro W_PRECALC_16_31
+ W_PRECALC_16_31_AVX
+.endm
+.purgem W_PRECALC_32_79
+.macro W_PRECALC_32_79
+ W_PRECALC_32_79_AVX
+.endm
+
+.macro W_PRECALC_00_15_AVX
+ .if ((i & 3) == 0)
+ vmovdqu (i*4)(BUFFER_PTR), W_TMP1
+ .elseif ((i & 3) == 1)
+ vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
+ .elseif ((i & 3) == 2)
+ vpaddd (K_BASE), W, W_TMP1
+ .elseif ((i & 3) == 3)
+ vmovdqa W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+.macro W_PRECALC_16_31_AVX
+ .if ((i & 3) == 0)
+ vpalignr $8, W_minus_16, W_minus_12, W # w[i-14]
+ vpsrldq $4, W_minus_04, W_TMP1 # w[i-3]
+ vpxor W_minus_08, W, W
+ vpxor W_minus_16, W_TMP1, W_TMP1
+ .elseif ((i & 3) == 1)
+ vpxor W_TMP1, W, W
+ vpslldq $12, W, W_TMP2
+ vpslld $1, W, W_TMP1
+ .elseif ((i & 3) == 2)
+ vpsrld $31, W, W
+ vpor W, W_TMP1, W_TMP1
+ vpslld $2, W_TMP2, W
+ vpsrld $30, W_TMP2, W_TMP2
+ .elseif ((i & 3) == 3)
+ vpxor W, W_TMP1, W_TMP1
+ vpxor W_TMP2, W_TMP1, W
+ vpaddd K_XMM(K_BASE), W, W_TMP1
+ vmovdqu W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+.macro W_PRECALC_32_79_AVX
+ .if ((i & 3) == 0)
+ vpalignr $8, W_minus_08, W_minus_04, W_TMP1
+ vpxor W_minus_28, W, W # W is W_minus_32 before xor
+ .elseif ((i & 3) == 1)
+ vpxor W_minus_16, W_TMP1, W_TMP1
+ vpxor W_TMP1, W, W
+ .elseif ((i & 3) == 2)
+ vpslld $2, W, W_TMP1
+ vpsrld $30, W, W
+ vpor W, W_TMP1, W
+ .elseif ((i & 3) == 3)
+ vpaddd K_XMM(K_BASE), W, W_TMP1
+ vmovdqu W_TMP1, WK(i&~3)
+ W_PRECALC_ROTATE
+ .endif
+.endm
+
+.endm // W_PRECALC_AVX
+
+W_PRECALC_AVX
+.purgem xmm_mov
+.macro xmm_mov a, b
+ vmovdqu \a,\b
+.endm
+
+
+/* AVX optimized implementation:
+ * extern "C" void sha1_transform_avx(u32 *digest, const char *data, u32 *ws,
+ * unsigned int rounds);
+ */
+SHA1_VECTOR_ASM sha1_transform_avx
+
+#endif
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
new file mode 100644
index 000000000..7391c7de7
--- /dev/null
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -0,0 +1,378 @@
+/*
+ * Cryptographic API.
+ *
+ * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using
+ * Supplemental SSE3 instructions.
+ *
+ * This file is based on sha1_generic.c
+ *
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ * Copyright (c) Mathias Krause <minipli@googlemail.com>
+ * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <crypto/sha1_base.h>
+#include <asm/fpu/api.h>
+
+typedef void (sha1_transform_fn)(u32 *digest, const char *data,
+ unsigned int rounds);
+
+static int sha1_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, sha1_transform_fn *sha1_xform)
+{
+ struct sha1_state *sctx = shash_desc_ctx(desc);
+
+ if (!irq_fpu_usable() ||
+ (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE)
+ return crypto_sha1_update(desc, data, len);
+
+ /* make sure casting to sha1_block_fn() is safe */
+ BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
+
+ kernel_fpu_begin();
+ sha1_base_do_update(desc, data, len,
+ (sha1_block_fn *)sha1_xform);
+ kernel_fpu_end();
+
+ return 0;
+}
+
+static int sha1_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out, sha1_transform_fn *sha1_xform)
+{
+ if (!irq_fpu_usable())
+ return crypto_sha1_finup(desc, data, len, out);
+
+ kernel_fpu_begin();
+ if (len)
+ sha1_base_do_update(desc, data, len,
+ (sha1_block_fn *)sha1_xform);
+ sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_xform);
+ kernel_fpu_end();
+
+ return sha1_base_finish(desc, out);
+}
+
+asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
+ unsigned int rounds);
+
+static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha1_update(desc, data, len,
+ (sha1_transform_fn *) sha1_transform_ssse3);
+}
+
+static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha1_finup(desc, data, len, out,
+ (sha1_transform_fn *) sha1_transform_ssse3);
+}
+
+/* Add padding and return the message digest. */
+static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
+{
+ return sha1_ssse3_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha1_ssse3_alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = sha1_base_init,
+ .update = sha1_ssse3_update,
+ .final = sha1_ssse3_final,
+ .finup = sha1_ssse3_finup,
+ .descsize = sizeof(struct sha1_state),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name = "sha1-ssse3",
+ .cra_priority = 150,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int register_sha1_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ return crypto_register_shash(&sha1_ssse3_alg);
+ return 0;
+}
+
+static void unregister_sha1_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ crypto_unregister_shash(&sha1_ssse3_alg);
+}
+
+#ifdef CONFIG_AS_AVX
+asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
+ unsigned int rounds);
+
+static int sha1_avx_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha1_update(desc, data, len,
+ (sha1_transform_fn *) sha1_transform_avx);
+}
+
+static int sha1_avx_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha1_finup(desc, data, len, out,
+ (sha1_transform_fn *) sha1_transform_avx);
+}
+
+static int sha1_avx_final(struct shash_desc *desc, u8 *out)
+{
+ return sha1_avx_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha1_avx_alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = sha1_base_init,
+ .update = sha1_avx_update,
+ .final = sha1_avx_final,
+ .finup = sha1_avx_finup,
+ .descsize = sizeof(struct sha1_state),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name = "sha1-avx",
+ .cra_priority = 160,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static bool avx_usable(void)
+{
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
+ if (boot_cpu_has(X86_FEATURE_AVX))
+ pr_info("AVX detected but unusable.\n");
+ return false;
+ }
+
+ return true;
+}
+
+static int register_sha1_avx(void)
+{
+ if (avx_usable())
+ return crypto_register_shash(&sha1_avx_alg);
+ return 0;
+}
+
+static void unregister_sha1_avx(void)
+{
+ if (avx_usable())
+ crypto_unregister_shash(&sha1_avx_alg);
+}
+
+#else /* CONFIG_AS_AVX */
+static inline int register_sha1_avx(void) { return 0; }
+static inline void unregister_sha1_avx(void) { }
+#endif /* CONFIG_AS_AVX */
+
+
+#if defined(CONFIG_AS_AVX2) && (CONFIG_AS_AVX)
+#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
+
+asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
+ unsigned int rounds);
+
+static bool avx2_usable(void)
+{
+ if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
+ && boot_cpu_has(X86_FEATURE_BMI1)
+ && boot_cpu_has(X86_FEATURE_BMI2))
+ return true;
+
+ return false;
+}
+
+static void sha1_apply_transform_avx2(u32 *digest, const char *data,
+ unsigned int rounds)
+{
+ /* Select the optimal transform based on data block size */
+ if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE)
+ sha1_transform_avx2(digest, data, rounds);
+ else
+ sha1_transform_avx(digest, data, rounds);
+}
+
+static int sha1_avx2_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha1_update(desc, data, len,
+ (sha1_transform_fn *) sha1_apply_transform_avx2);
+}
+
+static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha1_finup(desc, data, len, out,
+ (sha1_transform_fn *) sha1_apply_transform_avx2);
+}
+
+static int sha1_avx2_final(struct shash_desc *desc, u8 *out)
+{
+ return sha1_avx2_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha1_avx2_alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = sha1_base_init,
+ .update = sha1_avx2_update,
+ .final = sha1_avx2_final,
+ .finup = sha1_avx2_finup,
+ .descsize = sizeof(struct sha1_state),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name = "sha1-avx2",
+ .cra_priority = 170,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int register_sha1_avx2(void)
+{
+ if (avx2_usable())
+ return crypto_register_shash(&sha1_avx2_alg);
+ return 0;
+}
+
+static void unregister_sha1_avx2(void)
+{
+ if (avx2_usable())
+ crypto_unregister_shash(&sha1_avx2_alg);
+}
+
+#else
+static inline int register_sha1_avx2(void) { return 0; }
+static inline void unregister_sha1_avx2(void) { }
+#endif
+
+#ifdef CONFIG_AS_SHA1_NI
+asmlinkage void sha1_ni_transform(u32 *digest, const char *data,
+ unsigned int rounds);
+
+static int sha1_ni_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha1_update(desc, data, len,
+ (sha1_transform_fn *) sha1_ni_transform);
+}
+
+static int sha1_ni_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha1_finup(desc, data, len, out,
+ (sha1_transform_fn *) sha1_ni_transform);
+}
+
+static int sha1_ni_final(struct shash_desc *desc, u8 *out)
+{
+ return sha1_ni_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha1_ni_alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = sha1_base_init,
+ .update = sha1_ni_update,
+ .final = sha1_ni_final,
+ .finup = sha1_ni_finup,
+ .descsize = sizeof(struct sha1_state),
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name = "sha1-ni",
+ .cra_priority = 250,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static int register_sha1_ni(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SHA_NI))
+ return crypto_register_shash(&sha1_ni_alg);
+ return 0;
+}
+
+static void unregister_sha1_ni(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SHA_NI))
+ crypto_unregister_shash(&sha1_ni_alg);
+}
+
+#else
+static inline int register_sha1_ni(void) { return 0; }
+static inline void unregister_sha1_ni(void) { }
+#endif
+
+static int __init sha1_ssse3_mod_init(void)
+{
+ if (register_sha1_ssse3())
+ goto fail;
+
+ if (register_sha1_avx()) {
+ unregister_sha1_ssse3();
+ goto fail;
+ }
+
+ if (register_sha1_avx2()) {
+ unregister_sha1_avx();
+ unregister_sha1_ssse3();
+ goto fail;
+ }
+
+ if (register_sha1_ni()) {
+ unregister_sha1_avx2();
+ unregister_sha1_avx();
+ unregister_sha1_ssse3();
+ goto fail;
+ }
+
+ return 0;
+fail:
+ return -ENODEV;
+}
+
+static void __exit sha1_ssse3_mod_fini(void)
+{
+ unregister_sha1_ni();
+ unregister_sha1_avx2();
+ unregister_sha1_avx();
+ unregister_sha1_ssse3();
+}
+
+module_init(sha1_ssse3_mod_init);
+module_exit(sha1_ssse3_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated");
+
+MODULE_ALIAS_CRYPTO("sha1");
+MODULE_ALIAS_CRYPTO("sha1-ssse3");
+MODULE_ALIAS_CRYPTO("sha1-avx");
+MODULE_ALIAS_CRYPTO("sha1-avx2");
+#ifdef CONFIG_AS_SHA1_NI
+MODULE_ALIAS_CRYPTO("sha1-ni");
+#endif
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
new file mode 100644
index 000000000..001bbcf93
--- /dev/null
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -0,0 +1,502 @@
+########################################################################
+# Implement fast SHA-256 with AVX1 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 1 block at a time, with 4 lanes per block
+########################################################################
+
+#ifdef CONFIG_AS_AVX
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define VMOVDQ vmovdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+ add \p1, \p2
+ mov \p2, \p1
+.endm
+
+
+.macro MY_ROR p1 p2
+ shld $(32-(\p1)), \p2, \p2
+.endm
+
+################################
+
+# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+# Load xmm with mem and byte swap each dword
+.macro COPY_XMM_AND_BSWAP p1 p2 p3
+ VMOVDQ \p2, \p1
+ vpshufb \p3, \p1, \p1
+.endm
+
+################################
+
+X0 = %xmm4
+X1 = %xmm5
+X2 = %xmm6
+X3 = %xmm7
+
+XTMP0 = %xmm0
+XTMP1 = %xmm1
+XTMP2 = %xmm2
+XTMP3 = %xmm3
+XTMP4 = %xmm8
+XFER = %xmm9
+XTMP5 = %xmm11
+
+SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
+SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %xmm13
+
+NUM_BLKS = %rdx # 3rd arg
+INP = %rsi # 2nd arg
+CTX = %rdi # 1st arg
+
+SRND = %rsi # clobbers INP
+c = %ecx
+d = %r8d
+e = %edx
+TBL = %r12
+a = %eax
+b = %ebx
+
+f = %r9d
+g = %r10d
+h = %r11d
+
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_XFER_SIZE = 16
+_XMM_SAVE_SIZE = 0
+
+_INP_END = 0
+_INP = _INP_END + _INP_END_SIZE
+_XFER = _INP + _INP_SIZE
+_XMM_SAVE = _XFER + _XFER_SIZE
+STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+X_ = X0
+X0 = X1
+X1 = X2
+X2 = X3
+X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+ ## compute s0 four at a time and s1 two at a time
+ ## compute W[-16] + W[-7] 4 at a time
+
+ mov e, y0 # y0 = e
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ xor g, y2 # y2 = f^g
+ vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ ## compute s0
+ vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y0, y2 # y2 = S1 + CH
+ add _XFER(%rsp), y2 # y2 = k + w + S1 + CH
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ vpsrld $7, XTMP1, XTMP2
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ vpslld $(32-7), XTMP1, XTMP3
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ vpor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+ mov e, y0 # y0 = e
+ mov a, y1 # y1 = a
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ vpsrld $18, XTMP1, XTMP2 #
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor g, y2 # y2 = f^g
+ vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ vpslld $(32-18), XTMP1, XTMP1
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ vpxor XTMP1, XTMP3, XTMP3 #
+ add y0, y2 # y2 = S1 + CH
+ add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ vpxor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ ## compute low s1
+ vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+ mov e, y0 # y0 = e
+ mov a, y1 # y1 = a
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ mov f, y2 # y2 = f
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
+ xor g, y2 # y2 = f^g
+ vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA}
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA}
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ vpxor XTMP3, XTMP2, XTMP2 #
+ add y0, y2 # y2 = S1 + CH
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ ## compute high s1
+ vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+ mov e, y0 # y0 = e
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ xor g, y2 # y2 = f^g
+ vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC}
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC}
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ vpxor XTMP3, XTMP2, XTMP2
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y0, y2 # y2 = S1 + CH
+ add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+ rotate_Xs
+.endm
+
+## input is [rsp + _XFER + %1 * 4]
+.macro DO_ROUND round
+ mov e, y0 # y0 = e
+ MY_ROR (25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ MY_ROR (22-13), y1 # y1 = a >> (22-13)
+ mov f, y2 # y2 = f
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor g, y2 # y2 = f^g
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ and e, y2 # y2 = (f^g)&e
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ add y0, y2 # y2 = S1 + CH
+ MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ offset = \round * 4 + _XFER #
+ add offset(%rsp), y2 # y2 = k + w + S1 + CH
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+.endm
+
+########################################################################
+## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
+## arg 1 : pointer to digest
+## arg 2 : pointer to input data
+## arg 3 : Num blocks
+########################################################################
+.text
+ENTRY(sha256_transform_avx)
+.align 32
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbp
+ movq %rsp, %rbp
+
+ subq $STACK_SIZE, %rsp # allocate stack space
+ and $~15, %rsp # align stack pointer
+
+ shl $6, NUM_BLKS # convert to bytes
+ jz done_hash
+ add INP, NUM_BLKS # pointer to end of data
+ mov NUM_BLKS, _INP_END(%rsp)
+
+ ## load initial digest
+ mov 4*0(CTX), a
+ mov 4*1(CTX), b
+ mov 4*2(CTX), c
+ mov 4*3(CTX), d
+ mov 4*4(CTX), e
+ mov 4*5(CTX), f
+ mov 4*6(CTX), g
+ mov 4*7(CTX), h
+
+ vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+ vmovdqa _SHUF_00BA(%rip), SHUF_00BA
+ vmovdqa _SHUF_DC00(%rip), SHUF_DC00
+loop0:
+ lea K256(%rip), TBL
+
+ ## byte swap first 16 dwords
+ COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
+
+ mov INP, _INP(%rsp)
+
+ ## schedule 48 input dwords, by doing 3 rounds of 16 each
+ mov $3, SRND
+.align 16
+loop1:
+ vpaddd (TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd 1*16(TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd 2*16(TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd 3*16(TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ add $4*16, TBL
+ FOUR_ROUNDS_AND_SCHED
+
+ sub $1, SRND
+ jne loop1
+
+ mov $2, SRND
+loop2:
+ vpaddd (TBL), X0, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ vpaddd 1*16(TBL), X1, XFER
+ vmovdqa XFER, _XFER(%rsp)
+ add $2*16, TBL
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ vmovdqa X2, X0
+ vmovdqa X3, X1
+
+ sub $1, SRND
+ jne loop2
+
+ addm (4*0)(CTX),a
+ addm (4*1)(CTX),b
+ addm (4*2)(CTX),c
+ addm (4*3)(CTX),d
+ addm (4*4)(CTX),e
+ addm (4*5)(CTX),f
+ addm (4*6)(CTX),g
+ addm (4*7)(CTX),h
+
+ mov _INP(%rsp), INP
+ add $64, INP
+ cmp _INP_END(%rsp), INP
+ jne loop0
+
+done_hash:
+
+ mov %rbp, %rsp
+ popq %rbp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ ret
+ENDPROC(sha256_transform_avx)
+
+.section .rodata.cst256.K256, "aM", @progbits, 256
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203
+
+.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
+.align 16
+# shuffle xBxA -> 00BA
+_SHUF_00BA:
+ .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
+.align 16
+# shuffle xDxC -> DC00
+_SHUF_DC00:
+ .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
+
+#endif
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
new file mode 100644
index 000000000..1420db15d
--- /dev/null
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -0,0 +1,771 @@
+########################################################################
+# Implement fast SHA-256 with AVX2 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 2 blocks at a time, with 4 lanes per block
+########################################################################
+
+#ifdef CONFIG_AS_AVX2
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define VMOVDQ vmovdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+ add \p1, \p2
+ mov \p2, \p1
+.endm
+
+################################
+
+X0 = %ymm4
+X1 = %ymm5
+X2 = %ymm6
+X3 = %ymm7
+
+# XMM versions of above
+XWORD0 = %xmm4
+XWORD1 = %xmm5
+XWORD2 = %xmm6
+XWORD3 = %xmm7
+
+XTMP0 = %ymm0
+XTMP1 = %ymm1
+XTMP2 = %ymm2
+XTMP3 = %ymm3
+XTMP4 = %ymm8
+XFER = %ymm9
+XTMP5 = %ymm11
+
+SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
+SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %ymm13
+
+X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
+
+NUM_BLKS = %rdx # 3rd arg
+INP = %rsi # 2nd arg
+CTX = %rdi # 1st arg
+c = %ecx
+d = %r8d
+e = %edx # clobbers NUM_BLKS
+y3 = %esi # clobbers INP
+
+SRND = CTX # SRND is same register as CTX
+
+a = %eax
+b = %ebx
+f = %r9d
+g = %r10d
+h = %r11d
+old_h = %r11d
+
+T1 = %r12d
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
+_XMM_SAVE_SIZE = 0
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_CTX_SIZE = 8
+_RSP_SIZE = 8
+
+_XFER = 0
+_XMM_SAVE = _XFER + _XFER_SIZE
+_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
+_INP = _INP_END + _INP_END_SIZE
+_CTX = _INP + _INP_SIZE
+_RSP = _CTX + _CTX_SIZE
+STACK_SIZE = _RSP + _RSP_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+ X_ = X0
+ X0 = X1
+ X1 = X2
+ X2 = X3
+ X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+ old_h = h
+ TMP_ = h
+ h = g
+ g = f
+ f = e
+ e = d
+ d = c
+ c = b
+ b = a
+ a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED disp
+################################### RND N + 0 ############################
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+
+ addl \disp(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+ vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
+ mov f, y2 # y2 = f # CH
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ xor g, y2 # y2 = f^g # CH
+ vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+
+ and e, y2 # y2 = (f^g)&e # CH
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ add h, d # d = k + w + h + d # --
+
+ and b, y3 # y3 = (a|c)&b # MAJA
+ vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ vpsrld $7, XTMP1, XTMP2
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+
+ add y0, y2 # y2 = S1 + CH # --
+ vpslld $(32-7), XTMP1, XTMP3
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+ vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
+
+ vpsrld $18, XTMP1, XTMP2
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+
+ ROTATE_ARGS
+
+################################### RND N + 1 ############################
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ offset = \disp + 1*4
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+
+ vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
+ mov f, y2 # y2 = f # CH
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ xor g, y2 # y2 = f^g # CH
+
+
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ and e, y2 # y2 = (f^g)&e # CH
+ add h, d # d = k + w + h + d # --
+
+ vpslld $(32-18), XTMP1, XTMP1
+ and b, y3 # y3 = (a|c)&b # MAJA
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+
+ vpxor XTMP1, XTMP3, XTMP3
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
+ vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
+
+
+ ROTATE_ARGS
+
+################################### RND N + 2 ############################
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ offset = \disp + 2*4
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+
+ vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ or c, y3 # y3 = a|c # MAJA
+ mov f, y2 # y2 = f # CH
+ xor g, y2 # y2 = f^g # CH
+
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
+ and e, y2 # y2 = (f^g)&e # CH
+
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ vpxor XTMP3, XTMP2, XTMP2
+ add h, d # d = k + w + h + d # --
+ and b, y3 # y3 = (a|c)&b # MAJA
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a ,T1 # T1 = (a >> 2) # S0
+ vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+ vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
+
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1,h # h = k + w + h + S0 # --
+ add y2,d # d = k + w + h + d + S1 + CH = d + t1 # --
+ add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+ add y3,h # h = t1 + S0 + MAJ # --
+
+
+ ROTATE_ARGS
+
+################################### RND N + 3 ############################
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ offset = \disp + 3*4
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+
+ vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
+ mov f, y2 # y2 = f # CH
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ xor g, y2 # y2 = f^g # CH
+
+
+ vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add h, d # d = k + w + h + d # --
+ and b, y3 # y3 = (a|c)&b # MAJA
+
+ vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ vpxor XTMP3, XTMP2, XTMP2
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ add y0, y2 # y2 = S1 + CH # --
+
+ vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
+
+ vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+
+ add y1, h # h = k + w + h + S0 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ ROTATE_ARGS
+ rotate_Xs
+.endm
+
+.macro DO_4ROUNDS disp
+################################### RND N + 0 ###########################
+
+ mov f, y2 # y2 = f # CH
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ addl \disp(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ ROTATE_ARGS
+
+################################### RND N + 1 ###########################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ offset = 4*1 + \disp
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ ROTATE_ARGS
+
+################################### RND N + 2 ##############################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ offset = 4*2 + \disp
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ ROTATE_ARGS
+
+################################### RND N + 3 ###########################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $25, e, y0 # y0 = e >> 25 # S1A
+ rorx $11, e, y1 # y1 = e >> 11 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
+ rorx $6, e, y1 # y1 = (e >> 6) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
+ rorx $13, a, T1 # T1 = a >> 13 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $22, a, y1 # y1 = a >> 22 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
+ rorx $2, a, T1 # T1 = (a >> 2) # S0
+ offset = 4*3 + \disp
+ addl offset(%rsp, SRND), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ ROTATE_ARGS
+
+.endm
+
+########################################################################
+## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
+## arg 1 : pointer to digest
+## arg 2 : pointer to input data
+## arg 3 : Num blocks
+########################################################################
+.text
+ENTRY(sha256_transform_rorx)
+.align 32
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ mov %rsp, %rax
+ subq $STACK_SIZE, %rsp
+ and $-32, %rsp # align rsp to 32 byte boundary
+ mov %rax, _RSP(%rsp)
+
+
+ shl $6, NUM_BLKS # convert to bytes
+ jz done_hash
+ lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
+ mov NUM_BLKS, _INP_END(%rsp)
+
+ cmp NUM_BLKS, INP
+ je only_one_block
+
+ ## load initial digest
+ mov (CTX), a
+ mov 4*1(CTX), b
+ mov 4*2(CTX), c
+ mov 4*3(CTX), d
+ mov 4*4(CTX), e
+ mov 4*5(CTX), f
+ mov 4*6(CTX), g
+ mov 4*7(CTX), h
+
+ vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+ vmovdqa _SHUF_00BA(%rip), SHUF_00BA
+ vmovdqa _SHUF_DC00(%rip), SHUF_DC00
+
+ mov CTX, _CTX(%rsp)
+
+loop0:
+ ## Load first 16 dwords from two blocks
+ VMOVDQ 0*32(INP),XTMP0
+ VMOVDQ 1*32(INP),XTMP1
+ VMOVDQ 2*32(INP),XTMP2
+ VMOVDQ 3*32(INP),XTMP3
+
+ ## byte swap data
+ vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
+ vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
+ vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
+ vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3
+
+ ## transpose data into high/low halves
+ vperm2i128 $0x20, XTMP2, XTMP0, X0
+ vperm2i128 $0x31, XTMP2, XTMP0, X1
+ vperm2i128 $0x20, XTMP3, XTMP1, X2
+ vperm2i128 $0x31, XTMP3, XTMP1, X3
+
+last_block_enter:
+ add $64, INP
+ mov INP, _INP(%rsp)
+
+ ## schedule 48 input dwords, by doing 3 rounds of 12 each
+ xor SRND, SRND
+
+.align 16
+loop1:
+ vpaddd K256+0*32(SRND), X0, XFER
+ vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
+ FOUR_ROUNDS_AND_SCHED _XFER + 0*32
+
+ vpaddd K256+1*32(SRND), X0, XFER
+ vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
+ FOUR_ROUNDS_AND_SCHED _XFER + 1*32
+
+ vpaddd K256+2*32(SRND), X0, XFER
+ vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
+ FOUR_ROUNDS_AND_SCHED _XFER + 2*32
+
+ vpaddd K256+3*32(SRND), X0, XFER
+ vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
+ FOUR_ROUNDS_AND_SCHED _XFER + 3*32
+
+ add $4*32, SRND
+ cmp $3*4*32, SRND
+ jb loop1
+
+loop2:
+ ## Do last 16 rounds with no scheduling
+ vpaddd K256+0*32(SRND), X0, XFER
+ vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
+ DO_4ROUNDS _XFER + 0*32
+
+ vpaddd K256+1*32(SRND), X1, XFER
+ vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
+ DO_4ROUNDS _XFER + 1*32
+ add $2*32, SRND
+
+ vmovdqa X2, X0
+ vmovdqa X3, X1
+
+ cmp $4*4*32, SRND
+ jb loop2
+
+ mov _CTX(%rsp), CTX
+ mov _INP(%rsp), INP
+
+ addm (4*0)(CTX),a
+ addm (4*1)(CTX),b
+ addm (4*2)(CTX),c
+ addm (4*3)(CTX),d
+ addm (4*4)(CTX),e
+ addm (4*5)(CTX),f
+ addm (4*6)(CTX),g
+ addm (4*7)(CTX),h
+
+ cmp _INP_END(%rsp), INP
+ ja done_hash
+
+ #### Do second block using previously scheduled results
+ xor SRND, SRND
+.align 16
+loop3:
+ DO_4ROUNDS _XFER + 0*32 + 16
+ DO_4ROUNDS _XFER + 1*32 + 16
+ add $2*32, SRND
+ cmp $4*4*32, SRND
+ jb loop3
+
+ mov _CTX(%rsp), CTX
+ mov _INP(%rsp), INP
+ add $64, INP
+
+ addm (4*0)(CTX),a
+ addm (4*1)(CTX),b
+ addm (4*2)(CTX),c
+ addm (4*3)(CTX),d
+ addm (4*4)(CTX),e
+ addm (4*5)(CTX),f
+ addm (4*6)(CTX),g
+ addm (4*7)(CTX),h
+
+ cmp _INP_END(%rsp), INP
+ jb loop0
+ ja done_hash
+
+do_last_block:
+ VMOVDQ 0*16(INP),XWORD0
+ VMOVDQ 1*16(INP),XWORD1
+ VMOVDQ 2*16(INP),XWORD2
+ VMOVDQ 3*16(INP),XWORD3
+
+ vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
+ vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
+ vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
+ vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3
+
+ jmp last_block_enter
+
+only_one_block:
+
+ ## load initial digest
+ mov (4*0)(CTX),a
+ mov (4*1)(CTX),b
+ mov (4*2)(CTX),c
+ mov (4*3)(CTX),d
+ mov (4*4)(CTX),e
+ mov (4*5)(CTX),f
+ mov (4*6)(CTX),g
+ mov (4*7)(CTX),h
+
+ vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+ vmovdqa _SHUF_00BA(%rip), SHUF_00BA
+ vmovdqa _SHUF_DC00(%rip), SHUF_DC00
+
+ mov CTX, _CTX(%rsp)
+ jmp do_last_block
+
+done_hash:
+
+ mov _RSP(%rsp), %rsp
+
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ ret
+ENDPROC(sha256_transform_rorx)
+
+.section .rodata.cst512.K256, "aM", @progbits, 512
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
+
+# shuffle xBxA -> 00BA
+.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
+.align 32
+_SHUF_00BA:
+ .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+# shuffle xDxC -> DC00
+.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
+.align 32
+_SHUF_DC00:
+ .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
+
+#endif
diff --git a/arch/x86/crypto/sha256-mb/Makefile b/arch/x86/crypto/sha256-mb/Makefile
new file mode 100644
index 000000000..53ad6e7db
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Arch-specific CryptoAPI modules.
+#
+
+OBJECT_FILES_NON_STANDARD := y
+
+avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
+ $(comma)4)$(comma)%ymm2,yes,no)
+ifeq ($(avx2_supported),yes)
+ obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb.o
+ sha256-mb-y := sha256_mb.o sha256_mb_mgr_flush_avx2.o \
+ sha256_mb_mgr_init_avx2.o sha256_mb_mgr_submit_avx2.o sha256_x8_avx2.o
+endif
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb.c b/arch/x86/crypto/sha256-mb/sha256_mb.c
new file mode 100644
index 000000000..97c5fc43e
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb.c
@@ -0,0 +1,1013 @@
+/*
+ * Multi buffer SHA256 algorithm Glue Code
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/sha.h>
+#include <crypto/mcryptd.h>
+#include <crypto/crypto_wq.h>
+#include <asm/byteorder.h>
+#include <linux/hardirq.h>
+#include <asm/fpu/api.h>
+#include "sha256_mb_ctx.h"
+
+#define FLUSH_INTERVAL 1000 /* in usec */
+
+static struct mcryptd_alg_state sha256_mb_alg_state;
+
+struct sha256_mb_ctx {
+ struct mcryptd_ahash *mcryptd_tfm;
+};
+
+static inline struct mcryptd_hash_request_ctx
+ *cast_hash_to_mcryptd_ctx(struct sha256_hash_ctx *hash_ctx)
+{
+ struct ahash_request *areq;
+
+ areq = container_of((void *) hash_ctx, struct ahash_request, __ctx);
+ return container_of(areq, struct mcryptd_hash_request_ctx, areq);
+}
+
+static inline struct ahash_request
+ *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx)
+{
+ return container_of((void *) ctx, struct ahash_request, __ctx);
+}
+
+static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx,
+ struct ahash_request *areq)
+{
+ rctx->flag = HASH_UPDATE;
+}
+
+static asmlinkage void (*sha256_job_mgr_init)(struct sha256_mb_mgr *state);
+static asmlinkage struct job_sha256* (*sha256_job_mgr_submit)
+ (struct sha256_mb_mgr *state, struct job_sha256 *job);
+static asmlinkage struct job_sha256* (*sha256_job_mgr_flush)
+ (struct sha256_mb_mgr *state);
+static asmlinkage struct job_sha256* (*sha256_job_mgr_get_comp_job)
+ (struct sha256_mb_mgr *state);
+
+inline uint32_t sha256_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2],
+ uint64_t total_len)
+{
+ uint32_t i = total_len & (SHA256_BLOCK_SIZE - 1);
+
+ memset(&padblock[i], 0, SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ i += ((SHA256_BLOCK_SIZE - 1) &
+ (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1)))
+ + 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) &padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) &padblock[i - 8]) = cpu_to_be64(total_len << 3);
+
+ /* Number of extra blocks to hash */
+ return i >> SHA256_LOG2_BLOCK_SIZE;
+}
+
+static struct sha256_hash_ctx
+ *sha256_ctx_mgr_resubmit(struct sha256_ctx_mgr *mgr,
+ struct sha256_hash_ctx *ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ /* Clear PROCESSING bit */
+ ctx->status = HASH_CTX_STS_COMPLETE;
+ return ctx;
+ }
+
+ /*
+ * If the extra blocks are empty, begin hashing what remains
+ * in the user's buffer.
+ */
+ if (ctx->partial_block_buffer_length == 0 &&
+ ctx->incoming_buffer_length) {
+
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+ uint32_t copy_len;
+
+ /*
+ * Only entire blocks can be hashed.
+ * Copy remainder to extra blocks buffer.
+ */
+ copy_len = len & (SHA256_BLOCK_SIZE-1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy(ctx->partial_block_buffer,
+ ((const char *) buffer + len),
+ copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ /* len should be a multiple of the block size now */
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ /* Set len to the number of blocks to be hashed */
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (struct sha256_hash_ctx *)
+ sha256_job_mgr_submit(&mgr->mgr, &ctx->job);
+ continue;
+ }
+ }
+
+ /*
+ * If the extra blocks are not empty, then we are
+ * either on the last block(s) or we need more
+ * user input before continuing.
+ */
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks =
+ sha256_pad(buf, ctx->total_length);
+
+ ctx->status = (HASH_CTX_STS_PROCESSING |
+ HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (struct sha256_hash_ctx *)
+ sha256_job_mgr_submit(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static struct sha256_hash_ctx
+ *sha256_ctx_mgr_get_comp_ctx(struct sha256_ctx_mgr *mgr)
+{
+ /*
+ * If get_comp_job returns NULL, there are no jobs complete.
+ * If get_comp_job returns a job, verify that it is safe to return to
+ * the user. If it is not ready, resubmit the job to finish processing.
+ * If sha256_ctx_mgr_resubmit returned a job, it is ready to be
+ * returned. Otherwise, all jobs currently being managed by the
+ * hash_ctx_mgr still need processing.
+ */
+ struct sha256_hash_ctx *ctx;
+
+ ctx = (struct sha256_hash_ctx *) sha256_job_mgr_get_comp_job(&mgr->mgr);
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static void sha256_ctx_mgr_init(struct sha256_ctx_mgr *mgr)
+{
+ sha256_job_mgr_init(&mgr->mgr);
+}
+
+static struct sha256_hash_ctx *sha256_ctx_mgr_submit(struct sha256_ctx_mgr *mgr,
+ struct sha256_hash_ctx *ctx,
+ const void *buffer,
+ uint32_t len,
+ int flags)
+{
+ if (flags & ~(HASH_UPDATE | HASH_LAST)) {
+ /* User should not pass anything other than UPDATE or LAST */
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ /* Cannot submit to a currently processing job. */
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ /* Cannot update a finished job. */
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ /* If we made it here, there was no error during this call to submit */
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ /* Store buffer ptr info from user */
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ /*
+ * Store the user's request flags and mark this ctx as currently
+ * being processed.
+ */
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ /* Advance byte counter */
+ ctx->total_length += len;
+
+ /*
+ * If there is anything currently buffered in the extra blocks,
+ * append to it until it contains a whole block.
+ * Or if the user's buffer contains less than a whole block,
+ * append as much as possible to the extra block.
+ */
+ if (ctx->partial_block_buffer_length || len < SHA256_BLOCK_SIZE) {
+ /*
+ * Compute how many bytes to copy from user buffer into
+ * extra block
+ */
+ uint32_t copy_len = SHA256_BLOCK_SIZE -
+ ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ /* Copy and update relevant pointers and counters */
+ memcpy(
+ &ctx->partial_block_buffer[ctx->partial_block_buffer_length],
+ buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)
+ ((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+
+ /* The extra block should never contain more than 1 block */
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ /*
+ * If the extra block buffer contains exactly 1 block,
+ * it can be hashed.
+ */
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (struct sha256_hash_ctx *)
+ sha256_job_mgr_submit(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static struct sha256_hash_ctx *sha256_ctx_mgr_flush(struct sha256_ctx_mgr *mgr)
+{
+ struct sha256_hash_ctx *ctx;
+
+ while (1) {
+ ctx = (struct sha256_hash_ctx *)
+ sha256_job_mgr_flush(&mgr->mgr);
+
+ /* If flush returned 0, there are no more jobs in flight. */
+ if (!ctx)
+ return NULL;
+
+ /*
+ * If flush returned a job, resubmit the job to finish
+ * processing.
+ */
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ /*
+ * If sha256_ctx_mgr_resubmit returned a job, it is ready to
+ * be returned. Otherwise, all jobs currently being managed by
+ * the sha256_ctx_mgr still need processing. Loop.
+ */
+ if (ctx)
+ return ctx;
+ }
+}
+
+static int sha256_mb_init(struct ahash_request *areq)
+{
+ struct sha256_hash_ctx *sctx = ahash_request_ctx(areq);
+
+ hash_ctx_init(sctx);
+ sctx->job.result_digest[0] = SHA256_H0;
+ sctx->job.result_digest[1] = SHA256_H1;
+ sctx->job.result_digest[2] = SHA256_H2;
+ sctx->job.result_digest[3] = SHA256_H3;
+ sctx->job.result_digest[4] = SHA256_H4;
+ sctx->job.result_digest[5] = SHA256_H5;
+ sctx->job.result_digest[6] = SHA256_H6;
+ sctx->job.result_digest[7] = SHA256_H7;
+ sctx->total_length = 0;
+ sctx->partial_block_buffer_length = 0;
+ sctx->status = HASH_CTX_STS_IDLE;
+
+ return 0;
+}
+
+static int sha256_mb_set_results(struct mcryptd_hash_request_ctx *rctx)
+{
+ int i;
+ struct sha256_hash_ctx *sctx = ahash_request_ctx(&rctx->areq);
+ __be32 *dst = (__be32 *) rctx->out;
+
+ for (i = 0; i < 8; ++i)
+ dst[i] = cpu_to_be32(sctx->job.result_digest[i]);
+
+ return 0;
+}
+
+static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
+ struct mcryptd_alg_cstate *cstate, bool flush)
+{
+ int flag = HASH_UPDATE;
+ int nbytes, err = 0;
+ struct mcryptd_hash_request_ctx *rctx = *ret_rctx;
+ struct sha256_hash_ctx *sha_ctx;
+
+ /* more work ? */
+ while (!(rctx->flag & HASH_DONE)) {
+ nbytes = crypto_ahash_walk_done(&rctx->walk, 0);
+ if (nbytes < 0) {
+ err = nbytes;
+ goto out;
+ }
+ /* check if the walk is done */
+ if (crypto_ahash_walk_last(&rctx->walk)) {
+ rctx->flag |= HASH_DONE;
+ if (rctx->flag & HASH_FINAL)
+ flag |= HASH_LAST;
+
+ }
+ sha_ctx = (struct sha256_hash_ctx *)
+ ahash_request_ctx(&rctx->areq);
+ kernel_fpu_begin();
+ sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx,
+ rctx->walk.data, nbytes, flag);
+ if (!sha_ctx) {
+ if (flush)
+ sha_ctx = sha256_ctx_mgr_flush(cstate->mgr);
+ }
+ kernel_fpu_end();
+ if (sha_ctx)
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ else {
+ rctx = NULL;
+ goto out;
+ }
+ }
+
+ /* copy the results */
+ if (rctx->flag & HASH_FINAL)
+ sha256_mb_set_results(rctx);
+
+out:
+ *ret_rctx = rctx;
+ return err;
+}
+
+static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
+ struct mcryptd_alg_cstate *cstate,
+ int err)
+{
+ struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+ struct sha256_hash_ctx *sha_ctx;
+ struct mcryptd_hash_request_ctx *req_ctx;
+ int ret;
+
+ /* remove from work list */
+ spin_lock(&cstate->work_lock);
+ list_del(&rctx->waiter);
+ spin_unlock(&cstate->work_lock);
+
+ if (irqs_disabled())
+ rctx->complete(&req->base, err);
+ else {
+ local_bh_disable();
+ rctx->complete(&req->base, err);
+ local_bh_enable();
+ }
+
+ /* check to see if there are other jobs that are done */
+ sha_ctx = sha256_ctx_mgr_get_comp_ctx(cstate->mgr);
+ while (sha_ctx) {
+ req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&req_ctx, cstate, false);
+ if (req_ctx) {
+ spin_lock(&cstate->work_lock);
+ list_del(&req_ctx->waiter);
+ spin_unlock(&cstate->work_lock);
+
+ req = cast_mcryptd_ctx_to_req(req_ctx);
+ if (irqs_disabled())
+ req_ctx->complete(&req->base, ret);
+ else {
+ local_bh_disable();
+ req_ctx->complete(&req->base, ret);
+ local_bh_enable();
+ }
+ }
+ sha_ctx = sha256_ctx_mgr_get_comp_ctx(cstate->mgr);
+ }
+
+ return 0;
+}
+
+static void sha256_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
+ struct mcryptd_alg_cstate *cstate)
+{
+ unsigned long next_flush;
+ unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL);
+
+ /* initialize tag */
+ rctx->tag.arrival = jiffies; /* tag the arrival time */
+ rctx->tag.seq_num = cstate->next_seq_num++;
+ next_flush = rctx->tag.arrival + delay;
+ rctx->tag.expire = next_flush;
+
+ spin_lock(&cstate->work_lock);
+ list_add_tail(&rctx->waiter, &cstate->work_list);
+ spin_unlock(&cstate->work_lock);
+
+ mcryptd_arm_flusher(cstate, delay);
+}
+
+static int sha256_mb_update(struct ahash_request *areq)
+{
+ struct mcryptd_hash_request_ctx *rctx =
+ container_of(areq, struct mcryptd_hash_request_ctx, areq);
+ struct mcryptd_alg_cstate *cstate =
+ this_cpu_ptr(sha256_mb_alg_state.alg_cstate);
+
+ struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+ struct sha256_hash_ctx *sha_ctx;
+ int ret = 0, nbytes;
+
+ /* sanity check */
+ if (rctx->tag.cpu != smp_processor_id()) {
+ pr_err("mcryptd error: cpu clash\n");
+ goto done;
+ }
+
+ /* need to init context */
+ req_ctx_init(rctx, areq);
+
+ nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+ if (nbytes < 0) {
+ ret = nbytes;
+ goto done;
+ }
+
+ if (crypto_ahash_walk_last(&rctx->walk))
+ rctx->flag |= HASH_DONE;
+
+ /* submit */
+ sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq);
+ sha256_mb_add_list(rctx, cstate);
+ kernel_fpu_begin();
+ sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+ nbytes, HASH_UPDATE);
+ kernel_fpu_end();
+
+ /* check if anything is returned */
+ if (!sha_ctx)
+ return -EINPROGRESS;
+
+ if (sha_ctx->error) {
+ ret = sha_ctx->error;
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ goto done;
+ }
+
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&rctx, cstate, false);
+
+ if (!rctx)
+ return -EINPROGRESS;
+done:
+ sha_complete_job(rctx, cstate, ret);
+ return ret;
+}
+
+static int sha256_mb_finup(struct ahash_request *areq)
+{
+ struct mcryptd_hash_request_ctx *rctx =
+ container_of(areq, struct mcryptd_hash_request_ctx, areq);
+ struct mcryptd_alg_cstate *cstate =
+ this_cpu_ptr(sha256_mb_alg_state.alg_cstate);
+
+ struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+ struct sha256_hash_ctx *sha_ctx;
+ int ret = 0, flag = HASH_UPDATE, nbytes;
+
+ /* sanity check */
+ if (rctx->tag.cpu != smp_processor_id()) {
+ pr_err("mcryptd error: cpu clash\n");
+ goto done;
+ }
+
+ /* need to init context */
+ req_ctx_init(rctx, areq);
+
+ nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+ if (nbytes < 0) {
+ ret = nbytes;
+ goto done;
+ }
+
+ if (crypto_ahash_walk_last(&rctx->walk)) {
+ rctx->flag |= HASH_DONE;
+ flag = HASH_LAST;
+ }
+
+ /* submit */
+ rctx->flag |= HASH_FINAL;
+ sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq);
+ sha256_mb_add_list(rctx, cstate);
+
+ kernel_fpu_begin();
+ sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+ nbytes, flag);
+ kernel_fpu_end();
+
+ /* check if anything is returned */
+ if (!sha_ctx)
+ return -EINPROGRESS;
+
+ if (sha_ctx->error) {
+ ret = sha_ctx->error;
+ goto done;
+ }
+
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&rctx, cstate, false);
+ if (!rctx)
+ return -EINPROGRESS;
+done:
+ sha_complete_job(rctx, cstate, ret);
+ return ret;
+}
+
+static int sha256_mb_final(struct ahash_request *areq)
+{
+ struct mcryptd_hash_request_ctx *rctx =
+ container_of(areq, struct mcryptd_hash_request_ctx,
+ areq);
+ struct mcryptd_alg_cstate *cstate =
+ this_cpu_ptr(sha256_mb_alg_state.alg_cstate);
+
+ struct sha256_hash_ctx *sha_ctx;
+ int ret = 0;
+ u8 data;
+
+ /* sanity check */
+ if (rctx->tag.cpu != smp_processor_id()) {
+ pr_err("mcryptd error: cpu clash\n");
+ goto done;
+ }
+
+ /* need to init context */
+ req_ctx_init(rctx, areq);
+
+ rctx->flag |= HASH_DONE | HASH_FINAL;
+
+ sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq);
+ /* flag HASH_FINAL and 0 data size */
+ sha256_mb_add_list(rctx, cstate);
+ kernel_fpu_begin();
+ sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0,
+ HASH_LAST);
+ kernel_fpu_end();
+
+ /* check if anything is returned */
+ if (!sha_ctx)
+ return -EINPROGRESS;
+
+ if (sha_ctx->error) {
+ ret = sha_ctx->error;
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ goto done;
+ }
+
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&rctx, cstate, false);
+ if (!rctx)
+ return -EINPROGRESS;
+done:
+ sha_complete_job(rctx, cstate, ret);
+ return ret;
+}
+
+static int sha256_mb_export(struct ahash_request *areq, void *out)
+{
+ struct sha256_hash_ctx *sctx = ahash_request_ctx(areq);
+
+ memcpy(out, sctx, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha256_mb_import(struct ahash_request *areq, const void *in)
+{
+ struct sha256_hash_ctx *sctx = ahash_request_ctx(areq);
+
+ memcpy(sctx, in, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha256_mb_async_init_tfm(struct crypto_tfm *tfm)
+{
+ struct mcryptd_ahash *mcryptd_tfm;
+ struct sha256_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+ struct mcryptd_hash_ctx *mctx;
+
+ mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha256-mb",
+ CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL);
+ if (IS_ERR(mcryptd_tfm))
+ return PTR_ERR(mcryptd_tfm);
+ mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
+ mctx->alg_state = &sha256_mb_alg_state;
+ ctx->mcryptd_tfm = mcryptd_tfm;
+ crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+ sizeof(struct ahash_request) +
+ crypto_ahash_reqsize(&mcryptd_tfm->base));
+
+ return 0;
+}
+
+static void sha256_mb_async_exit_tfm(struct crypto_tfm *tfm)
+{
+ struct sha256_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static int sha256_mb_areq_init_tfm(struct crypto_tfm *tfm)
+{
+ crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+ sizeof(struct ahash_request) +
+ sizeof(struct sha256_hash_ctx));
+
+ return 0;
+}
+
+static void sha256_mb_areq_exit_tfm(struct crypto_tfm *tfm)
+{
+ struct sha256_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static struct ahash_alg sha256_mb_areq_alg = {
+ .init = sha256_mb_init,
+ .update = sha256_mb_update,
+ .final = sha256_mb_final,
+ .finup = sha256_mb_finup,
+ .export = sha256_mb_export,
+ .import = sha256_mb_import,
+ .halg = {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .statesize = sizeof(struct sha256_hash_ctx),
+ .base = {
+ .cra_name = "__sha256-mb",
+ .cra_driver_name = "__intel_sha256-mb",
+ .cra_priority = 100,
+ /*
+ * use ASYNC flag as some buffers in multi-buffer
+ * algo may not have completed before hashing thread
+ * sleep
+ */
+ .cra_flags = CRYPTO_ALG_ASYNC |
+ CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT
+ (sha256_mb_areq_alg.halg.base.cra_list),
+ .cra_init = sha256_mb_areq_init_tfm,
+ .cra_exit = sha256_mb_areq_exit_tfm,
+ .cra_ctxsize = sizeof(struct sha256_hash_ctx),
+ }
+ }
+};
+
+static int sha256_mb_async_init(struct ahash_request *req)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_init(mcryptd_req);
+}
+
+static int sha256_mb_async_update(struct ahash_request *req)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_update(mcryptd_req);
+}
+
+static int sha256_mb_async_finup(struct ahash_request *req)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_finup(mcryptd_req);
+}
+
+static int sha256_mb_async_final(struct ahash_request *req)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_final(mcryptd_req);
+}
+
+static int sha256_mb_async_digest(struct ahash_request *req)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_digest(mcryptd_req);
+}
+
+static int sha256_mb_async_export(struct ahash_request *req, void *out)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_export(mcryptd_req, out);
+}
+
+static int sha256_mb_async_import(struct ahash_request *req, const void *in)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+ struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm);
+ struct mcryptd_hash_request_ctx *rctx;
+ struct ahash_request *areq;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ rctx = ahash_request_ctx(mcryptd_req);
+ areq = &rctx->areq;
+
+ ahash_request_set_tfm(areq, child);
+ ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP,
+ rctx->complete, req);
+
+ return crypto_ahash_import(mcryptd_req, in);
+}
+
+static struct ahash_alg sha256_mb_async_alg = {
+ .init = sha256_mb_async_init,
+ .update = sha256_mb_async_update,
+ .final = sha256_mb_async_final,
+ .finup = sha256_mb_async_finup,
+ .export = sha256_mb_async_export,
+ .import = sha256_mb_async_import,
+ .digest = sha256_mb_async_digest,
+ .halg = {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .statesize = sizeof(struct sha256_hash_ctx),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256_mb",
+ /*
+ * Low priority, since with few concurrent hash requests
+ * this is extremely slow due to the flush delay. Users
+ * whose workloads would benefit from this can request
+ * it explicitly by driver name, or can increase its
+ * priority at runtime using NETLINK_CRYPTO.
+ */
+ .cra_priority = 50,
+ .cra_flags = CRYPTO_ALG_ASYNC,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT
+ (sha256_mb_async_alg.halg.base.cra_list),
+ .cra_init = sha256_mb_async_init_tfm,
+ .cra_exit = sha256_mb_async_exit_tfm,
+ .cra_ctxsize = sizeof(struct sha256_mb_ctx),
+ .cra_alignmask = 0,
+ },
+ },
+};
+
+static unsigned long sha256_mb_flusher(struct mcryptd_alg_cstate *cstate)
+{
+ struct mcryptd_hash_request_ctx *rctx;
+ unsigned long cur_time;
+ unsigned long next_flush = 0;
+ struct sha256_hash_ctx *sha_ctx;
+
+
+ cur_time = jiffies;
+
+ while (!list_empty(&cstate->work_list)) {
+ rctx = list_entry(cstate->work_list.next,
+ struct mcryptd_hash_request_ctx, waiter);
+ if (time_before(cur_time, rctx->tag.expire))
+ break;
+ kernel_fpu_begin();
+ sha_ctx = (struct sha256_hash_ctx *)
+ sha256_ctx_mgr_flush(cstate->mgr);
+ kernel_fpu_end();
+ if (!sha_ctx) {
+ pr_err("sha256_mb error: nothing got"
+ " flushed for non-empty list\n");
+ break;
+ }
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ sha_finish_walk(&rctx, cstate, true);
+ sha_complete_job(rctx, cstate, 0);
+ }
+
+ if (!list_empty(&cstate->work_list)) {
+ rctx = list_entry(cstate->work_list.next,
+ struct mcryptd_hash_request_ctx, waiter);
+ /* get the hash context and then flush time */
+ next_flush = rctx->tag.expire;
+ mcryptd_arm_flusher(cstate, get_delay(next_flush));
+ }
+ return next_flush;
+}
+
+static int __init sha256_mb_mod_init(void)
+{
+
+ int cpu;
+ int err;
+ struct mcryptd_alg_cstate *cpu_state;
+
+ /* check for dependent cpu features */
+ if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_BMI2))
+ return -ENODEV;
+
+ /* initialize multibuffer structures */
+ sha256_mb_alg_state.alg_cstate = alloc_percpu
+ (struct mcryptd_alg_cstate);
+
+ sha256_job_mgr_init = sha256_mb_mgr_init_avx2;
+ sha256_job_mgr_submit = sha256_mb_mgr_submit_avx2;
+ sha256_job_mgr_flush = sha256_mb_mgr_flush_avx2;
+ sha256_job_mgr_get_comp_job = sha256_mb_mgr_get_comp_job_avx2;
+
+ if (!sha256_mb_alg_state.alg_cstate)
+ return -ENOMEM;
+ for_each_possible_cpu(cpu) {
+ cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu);
+ cpu_state->next_flush = 0;
+ cpu_state->next_seq_num = 0;
+ cpu_state->flusher_engaged = false;
+ INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher);
+ cpu_state->cpu = cpu;
+ cpu_state->alg_state = &sha256_mb_alg_state;
+ cpu_state->mgr = kzalloc(sizeof(struct sha256_ctx_mgr),
+ GFP_KERNEL);
+ if (!cpu_state->mgr)
+ goto err2;
+ sha256_ctx_mgr_init(cpu_state->mgr);
+ INIT_LIST_HEAD(&cpu_state->work_list);
+ spin_lock_init(&cpu_state->work_lock);
+ }
+ sha256_mb_alg_state.flusher = &sha256_mb_flusher;
+
+ err = crypto_register_ahash(&sha256_mb_areq_alg);
+ if (err)
+ goto err2;
+ err = crypto_register_ahash(&sha256_mb_async_alg);
+ if (err)
+ goto err1;
+
+
+ return 0;
+err1:
+ crypto_unregister_ahash(&sha256_mb_areq_alg);
+err2:
+ for_each_possible_cpu(cpu) {
+ cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu);
+ kfree(cpu_state->mgr);
+ }
+ free_percpu(sha256_mb_alg_state.alg_cstate);
+ return -ENODEV;
+}
+
+static void __exit sha256_mb_mod_fini(void)
+{
+ int cpu;
+ struct mcryptd_alg_cstate *cpu_state;
+
+ crypto_unregister_ahash(&sha256_mb_async_alg);
+ crypto_unregister_ahash(&sha256_mb_areq_alg);
+ for_each_possible_cpu(cpu) {
+ cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu);
+ kfree(cpu_state->mgr);
+ }
+ free_percpu(sha256_mb_alg_state.alg_cstate);
+}
+
+module_init(sha256_mb_mod_init);
+module_exit(sha256_mb_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, multi buffer accelerated");
+
+MODULE_ALIAS_CRYPTO("sha256");
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h
new file mode 100644
index 000000000..7c432543d
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h
@@ -0,0 +1,134 @@
+/*
+ * Header file for multi buffer SHA256 context
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _SHA_MB_CTX_INTERNAL_H
+#define _SHA_MB_CTX_INTERNAL_H
+
+#include "sha256_mb_mgr.h"
+
+#define HASH_UPDATE 0x00
+#define HASH_LAST 0x01
+#define HASH_DONE 0x02
+#define HASH_FINAL 0x04
+
+#define HASH_CTX_STS_IDLE 0x00
+#define HASH_CTX_STS_PROCESSING 0x01
+#define HASH_CTX_STS_LAST 0x02
+#define HASH_CTX_STS_COMPLETE 0x04
+
+enum hash_ctx_error {
+ HASH_CTX_ERROR_NONE = 0,
+ HASH_CTX_ERROR_INVALID_FLAGS = -1,
+ HASH_CTX_ERROR_ALREADY_PROCESSING = -2,
+ HASH_CTX_ERROR_ALREADY_COMPLETED = -3,
+
+#ifdef HASH_CTX_DEBUG
+ HASH_CTX_ERROR_DEBUG_DIGEST_MISMATCH = -4,
+#endif
+};
+
+
+#define hash_ctx_user_data(ctx) ((ctx)->user_data)
+#define hash_ctx_digest(ctx) ((ctx)->job.result_digest)
+#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING)
+#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE)
+#define hash_ctx_status(ctx) ((ctx)->status)
+#define hash_ctx_error(ctx) ((ctx)->error)
+#define hash_ctx_init(ctx) \
+ do { \
+ (ctx)->error = HASH_CTX_ERROR_NONE; \
+ (ctx)->status = HASH_CTX_STS_COMPLETE; \
+ } while (0)
+
+
+/* Hash Constants and Typedefs */
+#define SHA256_DIGEST_LENGTH 8
+#define SHA256_LOG2_BLOCK_SIZE 6
+
+#define SHA256_PADLENGTHFIELD_SIZE 8
+
+#ifdef SHA_MB_DEBUG
+#define assert(expr) \
+do { \
+ if (unlikely(!(expr))) { \
+ printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
+ #expr, __FILE__, __func__, __LINE__); \
+ } \
+} while (0)
+#else
+#define assert(expr) do {} while (0)
+#endif
+
+struct sha256_ctx_mgr {
+ struct sha256_mb_mgr mgr;
+};
+
+/* typedef struct sha256_ctx_mgr sha256_ctx_mgr; */
+
+struct sha256_hash_ctx {
+ /* Must be at struct offset 0 */
+ struct job_sha256 job;
+ /* status flag */
+ int status;
+ /* error flag */
+ int error;
+
+ uint64_t total_length;
+ const void *incoming_buffer;
+ uint32_t incoming_buffer_length;
+ uint8_t partial_block_buffer[SHA256_BLOCK_SIZE * 2];
+ uint32_t partial_block_buffer_length;
+ void *user_data;
+};
+
+#endif
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h b/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h
new file mode 100644
index 000000000..b01ae408c
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h
@@ -0,0 +1,108 @@
+/*
+ * Header file for multi buffer SHA256 algorithm manager
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __SHA_MB_MGR_H
+#define __SHA_MB_MGR_H
+
+#include <linux/types.h>
+
+#define NUM_SHA256_DIGEST_WORDS 8
+
+enum job_sts { STS_UNKNOWN = 0,
+ STS_BEING_PROCESSED = 1,
+ STS_COMPLETED = 2,
+ STS_INTERNAL_ERROR = 3,
+ STS_ERROR = 4
+};
+
+struct job_sha256 {
+ u8 *buffer;
+ u32 len;
+ u32 result_digest[NUM_SHA256_DIGEST_WORDS] __aligned(32);
+ enum job_sts status;
+ void *user_data;
+};
+
+/* SHA256 out-of-order scheduler */
+
+/* typedef uint32_t sha8_digest_array[8][8]; */
+
+struct sha256_args_x8 {
+ uint32_t digest[8][8];
+ uint8_t *data_ptr[8];
+};
+
+struct sha256_lane_data {
+ struct job_sha256 *job_in_lane;
+};
+
+struct sha256_mb_mgr {
+ struct sha256_args_x8 args;
+
+ uint32_t lens[8];
+
+ /* each byte is index (0...7) of unused lanes */
+ uint64_t unused_lanes;
+ /* byte 4 is set to FF as a flag */
+ struct sha256_lane_data ldata[8];
+};
+
+
+#define SHA256_MB_MGR_NUM_LANES_AVX2 8
+
+void sha256_mb_mgr_init_avx2(struct sha256_mb_mgr *state);
+struct job_sha256 *sha256_mb_mgr_submit_avx2(struct sha256_mb_mgr *state,
+ struct job_sha256 *job);
+struct job_sha256 *sha256_mb_mgr_flush_avx2(struct sha256_mb_mgr *state);
+struct job_sha256 *sha256_mb_mgr_get_comp_job_avx2(struct sha256_mb_mgr *state);
+
+#endif
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S
new file mode 100644
index 000000000..5c377bac2
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S
@@ -0,0 +1,304 @@
+/*
+ * Header file for multi buffer SHA256 algorithm data structure
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+# Macros for defining data structures
+
+# Usage example
+
+#START_FIELDS # JOB_AES
+### name size align
+#FIELD _plaintext, 8, 8 # pointer to plaintext
+#FIELD _ciphertext, 8, 8 # pointer to ciphertext
+#FIELD _IV, 16, 8 # IV
+#FIELD _keys, 8, 8 # pointer to keys
+#FIELD _len, 4, 4 # length in bytes
+#FIELD _status, 4, 4 # status enumeration
+#FIELD _user_data, 8, 8 # pointer to user data
+#UNION _union, size1, align1, \
+# size2, align2, \
+# size3, align3, \
+# ...
+#END_FIELDS
+#%assign _JOB_AES_size _FIELD_OFFSET
+#%assign _JOB_AES_align _STRUCT_ALIGN
+
+#########################################################################
+
+# Alternate "struc-like" syntax:
+# STRUCT job_aes2
+# RES_Q .plaintext, 1
+# RES_Q .ciphertext, 1
+# RES_DQ .IV, 1
+# RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN
+# RES_U .union, size1, align1, \
+# size2, align2, \
+# ...
+# ENDSTRUCT
+# # Following only needed if nesting
+# %assign job_aes2_size _FIELD_OFFSET
+# %assign job_aes2_align _STRUCT_ALIGN
+#
+# RES_* macros take a name, a count and an optional alignment.
+# The count in in terms of the base size of the macro, and the
+# default alignment is the base size.
+# The macros are:
+# Macro Base size
+# RES_B 1
+# RES_W 2
+# RES_D 4
+# RES_Q 8
+# RES_DQ 16
+# RES_Y 32
+# RES_Z 64
+#
+# RES_U defines a union. It's arguments are a name and two or more
+# pairs of "size, alignment"
+#
+# The two assigns are only needed if this structure is being nested
+# within another. Even if the assigns are not done, one can still use
+# STRUCT_NAME_size as the size of the structure.
+#
+# Note that for nesting, you still need to assign to STRUCT_NAME_size.
+#
+# The differences between this and using "struc" directly are that each
+# type is implicitly aligned to its natural length (although this can be
+# over-ridden with an explicit third parameter), and that the structure
+# is padded at the end to its overall alignment.
+#
+
+#########################################################################
+
+#ifndef _DATASTRUCT_ASM_
+#define _DATASTRUCT_ASM_
+
+#define SZ8 8*SHA256_DIGEST_WORD_SIZE
+#define ROUNDS 64*SZ8
+#define PTR_SZ 8
+#define SHA256_DIGEST_WORD_SIZE 4
+#define MAX_SHA256_LANES 8
+#define SHA256_DIGEST_WORDS 8
+#define SHA256_DIGEST_ROW_SIZE (MAX_SHA256_LANES * SHA256_DIGEST_WORD_SIZE)
+#define SHA256_DIGEST_SIZE (SHA256_DIGEST_ROW_SIZE * SHA256_DIGEST_WORDS)
+#define SHA256_BLK_SZ 64
+
+# START_FIELDS
+.macro START_FIELDS
+ _FIELD_OFFSET = 0
+ _STRUCT_ALIGN = 0
+.endm
+
+# FIELD name size align
+.macro FIELD name size align
+ _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1))
+ \name = _FIELD_OFFSET
+ _FIELD_OFFSET = _FIELD_OFFSET + (\size)
+.if (\align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = \align
+.endif
+.endm
+
+# END_FIELDS
+.macro END_FIELDS
+ _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
+.endm
+
+########################################################################
+
+.macro STRUCT p1
+START_FIELDS
+.struc \p1
+.endm
+
+.macro ENDSTRUCT
+ tmp = _FIELD_OFFSET
+ END_FIELDS
+ tmp = (_FIELD_OFFSET - %%tmp)
+.if (tmp > 0)
+ .lcomm tmp
+.endif
+.endstruc
+.endm
+
+## RES_int name size align
+.macro RES_int p1 p2 p3
+ name = \p1
+ size = \p2
+ align = .\p3
+
+ _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1))
+.align align
+.lcomm name size
+ _FIELD_OFFSET = _FIELD_OFFSET + (size)
+.if (align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = align
+.endif
+.endm
+
+# macro RES_B name, size [, align]
+.macro RES_B _name, _size, _align=1
+RES_int _name _size _align
+.endm
+
+# macro RES_W name, size [, align]
+.macro RES_W _name, _size, _align=2
+RES_int _name 2*(_size) _align
+.endm
+
+# macro RES_D name, size [, align]
+.macro RES_D _name, _size, _align=4
+RES_int _name 4*(_size) _align
+.endm
+
+# macro RES_Q name, size [, align]
+.macro RES_Q _name, _size, _align=8
+RES_int _name 8*(_size) _align
+.endm
+
+# macro RES_DQ name, size [, align]
+.macro RES_DQ _name, _size, _align=16
+RES_int _name 16*(_size) _align
+.endm
+
+# macro RES_Y name, size [, align]
+.macro RES_Y _name, _size, _align=32
+RES_int _name 32*(_size) _align
+.endm
+
+# macro RES_Z name, size [, align]
+.macro RES_Z _name, _size, _align=64
+RES_int _name 64*(_size) _align
+.endm
+
+#endif
+
+
+########################################################################
+#### Define SHA256 Out Of Order Data Structures
+########################################################################
+
+START_FIELDS # LANE_DATA
+### name size align
+FIELD _job_in_lane, 8, 8 # pointer to job object
+END_FIELDS
+
+ _LANE_DATA_size = _FIELD_OFFSET
+ _LANE_DATA_align = _STRUCT_ALIGN
+
+########################################################################
+
+START_FIELDS # SHA256_ARGS_X4
+### name size align
+FIELD _digest, 4*8*8, 4 # transposed digest
+FIELD _data_ptr, 8*8, 8 # array of pointers to data
+END_FIELDS
+
+ _SHA256_ARGS_X4_size = _FIELD_OFFSET
+ _SHA256_ARGS_X4_align = _STRUCT_ALIGN
+ _SHA256_ARGS_X8_size = _FIELD_OFFSET
+ _SHA256_ARGS_X8_align = _STRUCT_ALIGN
+
+#######################################################################
+
+START_FIELDS # MB_MGR
+### name size align
+FIELD _args, _SHA256_ARGS_X4_size, _SHA256_ARGS_X4_align
+FIELD _lens, 4*8, 8
+FIELD _unused_lanes, 8, 8
+FIELD _ldata, _LANE_DATA_size*8, _LANE_DATA_align
+END_FIELDS
+
+ _MB_MGR_size = _FIELD_OFFSET
+ _MB_MGR_align = _STRUCT_ALIGN
+
+_args_digest = _args + _digest
+_args_data_ptr = _args + _data_ptr
+
+#######################################################################
+
+START_FIELDS #STACK_FRAME
+### name size align
+FIELD _data, 16*SZ8, 1 # transposed digest
+FIELD _digest, 8*SZ8, 1 # array of pointers to data
+FIELD _ytmp, 4*SZ8, 1
+FIELD _rsp, 8, 1
+END_FIELDS
+
+ _STACK_FRAME_size = _FIELD_OFFSET
+ _STACK_FRAME_align = _STRUCT_ALIGN
+
+#######################################################################
+
+########################################################################
+#### Define constants
+########################################################################
+
+#define STS_UNKNOWN 0
+#define STS_BEING_PROCESSED 1
+#define STS_COMPLETED 2
+
+########################################################################
+#### Define JOB_SHA256 structure
+########################################################################
+
+START_FIELDS # JOB_SHA256
+
+### name size align
+FIELD _buffer, 8, 8 # pointer to buffer
+FIELD _len, 8, 8 # length in bytes
+FIELD _result_digest, 8*4, 32 # Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+END_FIELDS
+
+ _JOB_SHA256_size = _FIELD_OFFSET
+ _JOB_SHA256_align = _STRUCT_ALIGN
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
new file mode 100644
index 000000000..d2364c55b
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
@@ -0,0 +1,307 @@
+/*
+ * Flush routine for SHA256 multibuffer
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha256_mb_mgr_datastruct.S"
+
+.extern sha256_x8_avx2
+
+#LINUX register definitions
+#define arg1 %rdi
+#define arg2 %rsi
+
+# Common register definitions
+#define state arg1
+#define job arg2
+#define len2 arg2
+
+# idx must be a register not clobberred by sha1_mult
+#define idx %r8
+#define DWORD_idx %r8d
+
+#define unused_lanes %rbx
+#define lane_data %rbx
+#define tmp2 %rbx
+#define tmp2_w %ebx
+
+#define job_rax %rax
+#define tmp1 %rax
+#define size_offset %rax
+#define tmp %rax
+#define start_offset %rax
+
+#define tmp3 %arg1
+
+#define extra_blocks %arg2
+#define p %arg2
+
+.macro LABEL prefix n
+\prefix\n\():
+.endm
+
+.macro JNE_SKIP i
+jne skip_\i
+.endm
+
+.altmacro
+.macro SET_OFFSET _offset
+offset = \_offset
+.endm
+.noaltmacro
+
+# JOB_SHA256* sha256_mb_mgr_flush_avx2(MB_MGR *state)
+# arg 1 : rcx : state
+ENTRY(sha256_mb_mgr_flush_avx2)
+ FRAME_BEGIN
+ push %rbx
+
+ # If bit (32+3) is set, then all lanes are empty
+ mov _unused_lanes(state), unused_lanes
+ bt $32+3, unused_lanes
+ jc return_null
+
+ # find a lane with a non-null job
+ xor idx, idx
+ offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne one(%rip), idx
+ offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne two(%rip), idx
+ offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne three(%rip), idx
+ offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne four(%rip), idx
+ offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne five(%rip), idx
+ offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne six(%rip), idx
+ offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne seven(%rip), idx
+
+ # copy idx to empty lanes
+copy_lane_data:
+ offset = (_args + _data_ptr)
+ mov offset(state,idx,8), tmp
+
+ I = 0
+.rep 8
+ offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+.altmacro
+ JNE_SKIP %I
+ offset = (_args + _data_ptr + 8*I)
+ mov tmp, offset(state)
+ offset = (_lens + 4*I)
+ movl $0xFFFFFFFF, offset(state)
+LABEL skip_ %I
+ I = (I+1)
+.noaltmacro
+.endr
+
+ # Find min length
+ vmovdqu _lens+0*16(state), %xmm0
+ vmovdqu _lens+1*16(state), %xmm1
+
+ vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
+ vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
+ vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min val in low dword
+
+ vmovd %xmm2, DWORD_idx
+ mov idx, len2
+ and $0xF, idx
+ shr $4, len2
+ jz len_is_0
+
+ vpand clear_low_nibble(%rip), %xmm2, %xmm2
+ vpshufd $0, %xmm2, %xmm2
+
+ vpsubd %xmm2, %xmm0, %xmm0
+ vpsubd %xmm2, %xmm1, %xmm1
+
+ vmovdqu %xmm0, _lens+0*16(state)
+ vmovdqu %xmm1, _lens+1*16(state)
+
+ # "state" and "args" are the same address, arg1
+ # len is arg2
+ call sha256_x8_avx2
+ # state and idx are intact
+
+len_is_0:
+ # process completed job "idx"
+ imul $_LANE_DATA_size, idx, lane_data
+ lea _ldata(state, lane_data), lane_data
+
+ mov _job_in_lane(lane_data), job_rax
+ movq $0, _job_in_lane(lane_data)
+ movl $STS_COMPLETED, _status(job_rax)
+ mov _unused_lanes(state), unused_lanes
+ shl $4, unused_lanes
+ or idx, unused_lanes
+
+ mov unused_lanes, _unused_lanes(state)
+ movl $0xFFFFFFFF, _lens(state,idx,4)
+
+ vmovd _args_digest(state , idx, 4) , %xmm0
+ vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
+ vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
+ vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
+ vmovd _args_digest+4*32(state, idx, 4), %xmm1
+ vpinsrd $1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
+ vpinsrd $2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
+ vpinsrd $3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1
+
+ vmovdqu %xmm0, _result_digest(job_rax)
+ offset = (_result_digest + 1*16)
+ vmovdqu %xmm1, offset(job_rax)
+
+return:
+ pop %rbx
+ FRAME_END
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+ENDPROC(sha256_mb_mgr_flush_avx2)
+
+##############################################################################
+
+.align 16
+ENTRY(sha256_mb_mgr_get_comp_job_avx2)
+ push %rbx
+
+ ## if bit 32+3 is set, then all lanes are empty
+ mov _unused_lanes(state), unused_lanes
+ bt $(32+3), unused_lanes
+ jc .return_null
+
+ # Find min length
+ vmovdqu _lens(state), %xmm0
+ vmovdqu _lens+1*16(state), %xmm1
+
+ vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
+ vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
+ vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min val in low dword
+
+ vmovd %xmm2, DWORD_idx
+ test $~0xF, idx
+ jnz .return_null
+
+ # process completed job "idx"
+ imul $_LANE_DATA_size, idx, lane_data
+ lea _ldata(state, lane_data), lane_data
+
+ mov _job_in_lane(lane_data), job_rax
+ movq $0, _job_in_lane(lane_data)
+ movl $STS_COMPLETED, _status(job_rax)
+ mov _unused_lanes(state), unused_lanes
+ shl $4, unused_lanes
+ or idx, unused_lanes
+ mov unused_lanes, _unused_lanes(state)
+
+ movl $0xFFFFFFFF, _lens(state, idx, 4)
+
+ vmovd _args_digest(state, idx, 4), %xmm0
+ vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
+ vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
+ vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
+ vmovd _args_digest+4*32(state, idx, 4), %xmm1
+ vpinsrd $1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
+ vpinsrd $2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
+ vpinsrd $3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1
+
+ vmovdqu %xmm0, _result_digest(job_rax)
+ offset = (_result_digest + 1*16)
+ vmovdqu %xmm1, offset(job_rax)
+
+ pop %rbx
+
+ ret
+
+.return_null:
+ xor job_rax, job_rax
+ pop %rbx
+ ret
+ENDPROC(sha256_mb_mgr_get_comp_job_avx2)
+
+.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
+.align 16
+clear_low_nibble:
+.octa 0x000000000000000000000000FFFFFFF0
+
+.section .rodata.cst8, "aM", @progbits, 8
+.align 8
+one:
+.quad 1
+two:
+.quad 2
+three:
+.quad 3
+four:
+.quad 4
+five:
+.quad 5
+six:
+.quad 6
+seven:
+.quad 7
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..b0c498371
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c
@@ -0,0 +1,65 @@
+/*
+ * Initialization code for multi buffer SHA256 algorithm for AVX2
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "sha256_mb_mgr.h"
+
+void sha256_mb_mgr_init_avx2(struct sha256_mb_mgr *state)
+{
+ unsigned int j;
+
+ state->unused_lanes = 0xF76543210ULL;
+ for (j = 0; j < 8; j++) {
+ state->lens[j] = 0xFFFFFFFF;
+ state->ldata[j].job_in_lane = NULL;
+ }
+}
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S
new file mode 100644
index 000000000..b36ae7454
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S
@@ -0,0 +1,214 @@
+/*
+ * Buffer submit code for multi buffer SHA256 algorithm
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha256_mb_mgr_datastruct.S"
+
+.extern sha256_x8_avx2
+
+# LINUX register definitions
+arg1 = %rdi
+arg2 = %rsi
+size_offset = %rcx
+tmp2 = %rcx
+extra_blocks = %rdx
+
+# Common definitions
+#define state arg1
+#define job %rsi
+#define len2 arg2
+#define p2 arg2
+
+# idx must be a register not clobberred by sha1_x8_avx2
+idx = %r8
+DWORD_idx = %r8d
+last_len = %r8
+
+p = %r11
+start_offset = %r11
+
+unused_lanes = %rbx
+BYTE_unused_lanes = %bl
+
+job_rax = %rax
+len = %rax
+DWORD_len = %eax
+
+lane = %r12
+tmp3 = %r12
+
+tmp = %r9
+DWORD_tmp = %r9d
+
+lane_data = %r10
+
+# JOB* sha256_mb_mgr_submit_avx2(MB_MGR *state, JOB_SHA256 *job)
+# arg 1 : rcx : state
+# arg 2 : rdx : job
+ENTRY(sha256_mb_mgr_submit_avx2)
+ FRAME_BEGIN
+ push %rbx
+ push %r12
+
+ mov _unused_lanes(state), unused_lanes
+ mov unused_lanes, lane
+ and $0xF, lane
+ shr $4, unused_lanes
+ imul $_LANE_DATA_size, lane, lane_data
+ movl $STS_BEING_PROCESSED, _status(job)
+ lea _ldata(state, lane_data), lane_data
+ mov unused_lanes, _unused_lanes(state)
+ movl _len(job), DWORD_len
+
+ mov job, _job_in_lane(lane_data)
+ shl $4, len
+ or lane, len
+
+ movl DWORD_len, _lens(state , lane, 4)
+
+ # Load digest words from result_digest
+ vmovdqu _result_digest(job), %xmm0
+ vmovdqu _result_digest+1*16(job), %xmm1
+ vmovd %xmm0, _args_digest(state, lane, 4)
+ vpextrd $1, %xmm0, _args_digest+1*32(state , lane, 4)
+ vpextrd $2, %xmm0, _args_digest+2*32(state , lane, 4)
+ vpextrd $3, %xmm0, _args_digest+3*32(state , lane, 4)
+ vmovd %xmm1, _args_digest+4*32(state , lane, 4)
+
+ vpextrd $1, %xmm1, _args_digest+5*32(state , lane, 4)
+ vpextrd $2, %xmm1, _args_digest+6*32(state , lane, 4)
+ vpextrd $3, %xmm1, _args_digest+7*32(state , lane, 4)
+
+ mov _buffer(job), p
+ mov p, _args_data_ptr(state, lane, 8)
+
+ cmp $0xF, unused_lanes
+ jne return_null
+
+start_loop:
+ # Find min length
+ vmovdqa _lens(state), %xmm0
+ vmovdqa _lens+1*16(state), %xmm1
+
+ vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
+ vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
+ vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min val in low dword
+
+ vmovd %xmm2, DWORD_idx
+ mov idx, len2
+ and $0xF, idx
+ shr $4, len2
+ jz len_is_0
+
+ vpand clear_low_nibble(%rip), %xmm2, %xmm2
+ vpshufd $0, %xmm2, %xmm2
+
+ vpsubd %xmm2, %xmm0, %xmm0
+ vpsubd %xmm2, %xmm1, %xmm1
+
+ vmovdqa %xmm0, _lens + 0*16(state)
+ vmovdqa %xmm1, _lens + 1*16(state)
+
+ # "state" and "args" are the same address, arg1
+ # len is arg2
+ call sha256_x8_avx2
+
+ # state and idx are intact
+
+len_is_0:
+ # process completed job "idx"
+ imul $_LANE_DATA_size, idx, lane_data
+ lea _ldata(state, lane_data), lane_data
+
+ mov _job_in_lane(lane_data), job_rax
+ mov _unused_lanes(state), unused_lanes
+ movq $0, _job_in_lane(lane_data)
+ movl $STS_COMPLETED, _status(job_rax)
+ shl $4, unused_lanes
+ or idx, unused_lanes
+ mov unused_lanes, _unused_lanes(state)
+
+ movl $0xFFFFFFFF, _lens(state,idx,4)
+
+ vmovd _args_digest(state, idx, 4), %xmm0
+ vpinsrd $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0
+ vpinsrd $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0
+ vpinsrd $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0
+ vmovd _args_digest+4*32(state, idx, 4), %xmm1
+
+ vpinsrd $1, _args_digest+5*32(state , idx, 4), %xmm1, %xmm1
+ vpinsrd $2, _args_digest+6*32(state , idx, 4), %xmm1, %xmm1
+ vpinsrd $3, _args_digest+7*32(state , idx, 4), %xmm1, %xmm1
+
+ vmovdqu %xmm0, _result_digest(job_rax)
+ vmovdqu %xmm1, _result_digest+1*16(job_rax)
+
+return:
+ pop %r12
+ pop %rbx
+ FRAME_END
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ENDPROC(sha256_mb_mgr_submit_avx2)
+
+.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
+.align 16
+clear_low_nibble:
+ .octa 0x000000000000000000000000FFFFFFF0
diff --git a/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S b/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S
new file mode 100644
index 000000000..1687c80c5
--- /dev/null
+++ b/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S
@@ -0,0 +1,598 @@
+/*
+ * Multi-buffer SHA256 algorithm hash compute routine
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include "sha256_mb_mgr_datastruct.S"
+
+## code to compute oct SHA256 using SSE-256
+## outer calling routine takes care of save and restore of XMM registers
+## Logic designed/laid out by JDG
+
+## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; %ymm0-15
+## Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+## Linux preserves: rdi rbp r8
+##
+## clobbers %ymm0-15
+
+arg1 = %rdi
+arg2 = %rsi
+reg3 = %rcx
+reg4 = %rdx
+
+# Common definitions
+STATE = arg1
+INP_SIZE = arg2
+
+IDX = %rax
+ROUND = %rbx
+TBL = reg3
+
+inp0 = %r9
+inp1 = %r10
+inp2 = %r11
+inp3 = %r12
+inp4 = %r13
+inp5 = %r14
+inp6 = %r15
+inp7 = reg4
+
+a = %ymm0
+b = %ymm1
+c = %ymm2
+d = %ymm3
+e = %ymm4
+f = %ymm5
+g = %ymm6
+h = %ymm7
+
+T1 = %ymm8
+
+a0 = %ymm12
+a1 = %ymm13
+a2 = %ymm14
+TMP = %ymm15
+TMP0 = %ymm6
+TMP1 = %ymm7
+
+TT0 = %ymm8
+TT1 = %ymm9
+TT2 = %ymm10
+TT3 = %ymm11
+TT4 = %ymm12
+TT5 = %ymm13
+TT6 = %ymm14
+TT7 = %ymm15
+
+# Define stack usage
+
+# Assume stack aligned to 32 bytes before call
+# Therefore FRAMESZ mod 32 must be 32-8 = 24
+
+#define FRAMESZ 0x388
+
+#define VMOVPS vmovups
+
+# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+# "transpose" data in {r0...r7} using temps {t0...t1}
+# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+# r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+# r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+# r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+# r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+# r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+# r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+# r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+# r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+#
+# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+# r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+# r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+# r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+# r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+# r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+# r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+# r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+# r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+#
+
+.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
+ # process top half (r0..r3) {a...d}
+ vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+ vshufps $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
+
+ # use r2 in place of t0
+ # process bottom half (r4..r7) {e...h}
+ vshufps $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
+ vshufps $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
+
+ vperm2f128 $0x13, \r1, \r5, \r6 # h6...a6
+ vperm2f128 $0x02, \r1, \r5, \r2 # h2...a2
+ vperm2f128 $0x13, \r3, \r7, \r5 # h5...a5
+ vperm2f128 $0x02, \r3, \r7, \r1 # h1...a1
+ vperm2f128 $0x13, \r0, \r4, \r7 # h7...a7
+ vperm2f128 $0x02, \r0, \r4, \r3 # h3...a3
+ vperm2f128 $0x13, \t0, \t1, \r4 # h4...a4
+ vperm2f128 $0x02, \t0, \t1, \r0 # h0...a0
+
+.endm
+
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+.macro _PRORD reg imm tmp
+ vpslld $(32-\imm),\reg,\tmp
+ vpsrld $\imm,\reg, \reg
+ vpor \tmp,\reg, \reg
+.endm
+
+# PRORD_nd reg, imm, tmp, src
+.macro _PRORD_nd reg imm tmp src
+ vpslld $(32-\imm), \src, \tmp
+ vpsrld $\imm, \src, \reg
+ vpor \tmp, \reg, \reg
+.endm
+
+# PRORD dst/src, amt
+.macro PRORD reg imm
+ _PRORD \reg,\imm,TMP
+.endm
+
+# PRORD_nd dst, src, amt
+.macro PRORD_nd reg tmp imm
+ _PRORD_nd \reg, \imm, TMP, \tmp
+.endm
+
+# arguments passed implicitly in preprocessor symbols i, a...h
+.macro ROUND_00_15 _T1 i
+ PRORD_nd a0,e,5 # sig1: a0 = (e >> 5)
+
+ vpxor g, f, a2 # ch: a2 = f^g
+ vpand e,a2, a2 # ch: a2 = (f^g)&e
+ vpxor g, a2, a2 # a2 = ch
+
+ PRORD_nd a1,e,25 # sig1: a1 = (e >> 25)
+
+ vmovdqu \_T1,(SZ8*(\i & 0xf))(%rsp)
+ vpaddd (TBL,ROUND,1), \_T1, \_T1 # T1 = W + K
+ vpxor e,a0, a0 # sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 # sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd a2, h, h # h = h + ch
+ PRORD_nd a2,a,11 # sig0: a2 = (a >> 11)
+ vpaddd \_T1,h, h # h = h + ch + W + K
+ vpxor a1, a0, a0 # a0 = sigma1
+ PRORD_nd a1,a,22 # sig0: a1 = (a >> 22)
+ vpxor c, a, \_T1 # maj: T1 = a^c
+ add $SZ8, ROUND # ROUND++
+ vpand b, \_T1, \_T1 # maj: T1 = (a^c)&b
+ vpaddd a0, h, h
+ vpaddd h, d, d
+ vpxor a, a2, a2 # sig0: a2 = a ^ (a >> 11)
+ PRORD a2,2 # sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a1, a2, a2 # a2 = sig0
+ vpand c, a, a1 # maj: a1 = a&c
+ vpor \_T1, a1, a1 # a1 = maj
+ vpaddd a1, h, h # h = h + ch + W + K + maj
+ vpaddd a2, h, h # h = h + ch + W + K + maj + sigma0
+ ROTATE_ARGS
+.endm
+
+# arguments passed implicitly in preprocessor symbols i, a...h
+.macro ROUND_16_XX _T1 i
+ vmovdqu (SZ8*((\i-15)&0xf))(%rsp), \_T1
+ vmovdqu (SZ8*((\i-2)&0xf))(%rsp), a1
+ vmovdqu \_T1, a0
+ PRORD \_T1,11
+ vmovdqu a1, a2
+ PRORD a1,2
+ vpxor a0, \_T1, \_T1
+ PRORD \_T1, 7
+ vpxor a2, a1, a1
+ PRORD a1, 17
+ vpsrld $3, a0, a0
+ vpxor a0, \_T1, \_T1
+ vpsrld $10, a2, a2
+ vpxor a2, a1, a1
+ vpaddd (SZ8*((\i-16)&0xf))(%rsp), \_T1, \_T1
+ vpaddd (SZ8*((\i-7)&0xf))(%rsp), a1, a1
+ vpaddd a1, \_T1, \_T1
+
+ ROUND_00_15 \_T1,\i
+.endm
+
+# SHA256_ARGS:
+# UINT128 digest[8]; // transposed digests
+# UINT8 *data_ptr[4];
+
+# void sha256_x8_avx2(SHA256_ARGS *args, UINT64 bytes);
+# arg 1 : STATE : pointer to array of pointers to input data
+# arg 2 : INP_SIZE : size of input in blocks
+ # general registers preserved in outer calling routine
+ # outer calling routine saves all the XMM registers
+ # save rsp, allocate 32-byte aligned for local variables
+ENTRY(sha256_x8_avx2)
+
+ # save callee-saved clobbered registers to comply with C function ABI
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov %rsp, IDX
+ sub $FRAMESZ, %rsp
+ and $~0x1F, %rsp
+ mov IDX, _rsp(%rsp)
+
+ # Load the pre-transposed incoming digest.
+ vmovdqu 0*SHA256_DIGEST_ROW_SIZE(STATE),a
+ vmovdqu 1*SHA256_DIGEST_ROW_SIZE(STATE),b
+ vmovdqu 2*SHA256_DIGEST_ROW_SIZE(STATE),c
+ vmovdqu 3*SHA256_DIGEST_ROW_SIZE(STATE),d
+ vmovdqu 4*SHA256_DIGEST_ROW_SIZE(STATE),e
+ vmovdqu 5*SHA256_DIGEST_ROW_SIZE(STATE),f
+ vmovdqu 6*SHA256_DIGEST_ROW_SIZE(STATE),g
+ vmovdqu 7*SHA256_DIGEST_ROW_SIZE(STATE),h
+
+ lea K256_8(%rip),TBL
+
+ # load the address of each of the 4 message lanes
+ # getting ready to transpose input onto stack
+ mov _args_data_ptr+0*PTR_SZ(STATE),inp0
+ mov _args_data_ptr+1*PTR_SZ(STATE),inp1
+ mov _args_data_ptr+2*PTR_SZ(STATE),inp2
+ mov _args_data_ptr+3*PTR_SZ(STATE),inp3
+ mov _args_data_ptr+4*PTR_SZ(STATE),inp4
+ mov _args_data_ptr+5*PTR_SZ(STATE),inp5
+ mov _args_data_ptr+6*PTR_SZ(STATE),inp6
+ mov _args_data_ptr+7*PTR_SZ(STATE),inp7
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ # save old digest
+ vmovdqu a, _digest(%rsp)
+ vmovdqu b, _digest+1*SZ8(%rsp)
+ vmovdqu c, _digest+2*SZ8(%rsp)
+ vmovdqu d, _digest+3*SZ8(%rsp)
+ vmovdqu e, _digest+4*SZ8(%rsp)
+ vmovdqu f, _digest+5*SZ8(%rsp)
+ vmovdqu g, _digest+6*SZ8(%rsp)
+ vmovdqu h, _digest+7*SZ8(%rsp)
+ i = 0
+.rep 2
+ VMOVPS i*32(inp0, IDX), TT0
+ VMOVPS i*32(inp1, IDX), TT1
+ VMOVPS i*32(inp2, IDX), TT2
+ VMOVPS i*32(inp3, IDX), TT3
+ VMOVPS i*32(inp4, IDX), TT4
+ VMOVPS i*32(inp5, IDX), TT5
+ VMOVPS i*32(inp6, IDX), TT6
+ VMOVPS i*32(inp7, IDX), TT7
+ vmovdqu g, _ytmp(%rsp)
+ vmovdqu h, _ytmp+1*SZ8(%rsp)
+ TRANSPOSE8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TMP0, TMP1
+ vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), TMP1
+ vmovdqu _ytmp(%rsp), g
+ vpshufb TMP1, TT0, TT0
+ vpshufb TMP1, TT1, TT1
+ vpshufb TMP1, TT2, TT2
+ vpshufb TMP1, TT3, TT3
+ vpshufb TMP1, TT4, TT4
+ vpshufb TMP1, TT5, TT5
+ vpshufb TMP1, TT6, TT6
+ vpshufb TMP1, TT7, TT7
+ vmovdqu _ytmp+1*SZ8(%rsp), h
+ vmovdqu TT4, _ytmp(%rsp)
+ vmovdqu TT5, _ytmp+1*SZ8(%rsp)
+ vmovdqu TT6, _ytmp+2*SZ8(%rsp)
+ vmovdqu TT7, _ytmp+3*SZ8(%rsp)
+ ROUND_00_15 TT0,(i*8+0)
+ vmovdqu _ytmp(%rsp), TT0
+ ROUND_00_15 TT1,(i*8+1)
+ vmovdqu _ytmp+1*SZ8(%rsp), TT1
+ ROUND_00_15 TT2,(i*8+2)
+ vmovdqu _ytmp+2*SZ8(%rsp), TT2
+ ROUND_00_15 TT3,(i*8+3)
+ vmovdqu _ytmp+3*SZ8(%rsp), TT3
+ ROUND_00_15 TT0,(i*8+4)
+ ROUND_00_15 TT1,(i*8+5)
+ ROUND_00_15 TT2,(i*8+6)
+ ROUND_00_15 TT3,(i*8+7)
+ i = (i+1)
+.endr
+ add $64, IDX
+ i = (i*8)
+
+ jmp Lrounds_16_xx
+.align 16
+Lrounds_16_xx:
+.rep 16
+ ROUND_16_XX T1, i
+ i = (i+1)
+.endr
+
+ cmp $ROUNDS,ROUND
+ jb Lrounds_16_xx
+
+ # add old digest
+ vpaddd _digest+0*SZ8(%rsp), a, a
+ vpaddd _digest+1*SZ8(%rsp), b, b
+ vpaddd _digest+2*SZ8(%rsp), c, c
+ vpaddd _digest+3*SZ8(%rsp), d, d
+ vpaddd _digest+4*SZ8(%rsp), e, e
+ vpaddd _digest+5*SZ8(%rsp), f, f
+ vpaddd _digest+6*SZ8(%rsp), g, g
+ vpaddd _digest+7*SZ8(%rsp), h, h
+
+ sub $1, INP_SIZE # unit is blocks
+ jne lloop
+
+ # write back to memory (state object) the transposed digest
+ vmovdqu a, 0*SHA256_DIGEST_ROW_SIZE(STATE)
+ vmovdqu b, 1*SHA256_DIGEST_ROW_SIZE(STATE)
+ vmovdqu c, 2*SHA256_DIGEST_ROW_SIZE(STATE)
+ vmovdqu d, 3*SHA256_DIGEST_ROW_SIZE(STATE)
+ vmovdqu e, 4*SHA256_DIGEST_ROW_SIZE(STATE)
+ vmovdqu f, 5*SHA256_DIGEST_ROW_SIZE(STATE)
+ vmovdqu g, 6*SHA256_DIGEST_ROW_SIZE(STATE)
+ vmovdqu h, 7*SHA256_DIGEST_ROW_SIZE(STATE)
+
+ # update input pointers
+ add IDX, inp0
+ mov inp0, _args_data_ptr+0*8(STATE)
+ add IDX, inp1
+ mov inp1, _args_data_ptr+1*8(STATE)
+ add IDX, inp2
+ mov inp2, _args_data_ptr+2*8(STATE)
+ add IDX, inp3
+ mov inp3, _args_data_ptr+3*8(STATE)
+ add IDX, inp4
+ mov inp4, _args_data_ptr+4*8(STATE)
+ add IDX, inp5
+ mov inp5, _args_data_ptr+5*8(STATE)
+ add IDX, inp6
+ mov inp6, _args_data_ptr+6*8(STATE)
+ add IDX, inp7
+ mov inp7, _args_data_ptr+7*8(STATE)
+
+ # Postamble
+ mov _rsp(%rsp), %rsp
+
+ # restore callee-saved clobbered registers
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+
+ ret
+ENDPROC(sha256_x8_avx2)
+
+.section .rodata.K256_8, "a", @progbits
+.align 64
+K256_8:
+ .octa 0x428a2f98428a2f98428a2f98428a2f98
+ .octa 0x428a2f98428a2f98428a2f98428a2f98
+ .octa 0x71374491713744917137449171374491
+ .octa 0x71374491713744917137449171374491
+ .octa 0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
+ .octa 0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
+ .octa 0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
+ .octa 0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
+ .octa 0x3956c25b3956c25b3956c25b3956c25b
+ .octa 0x3956c25b3956c25b3956c25b3956c25b
+ .octa 0x59f111f159f111f159f111f159f111f1
+ .octa 0x59f111f159f111f159f111f159f111f1
+ .octa 0x923f82a4923f82a4923f82a4923f82a4
+ .octa 0x923f82a4923f82a4923f82a4923f82a4
+ .octa 0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
+ .octa 0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
+ .octa 0xd807aa98d807aa98d807aa98d807aa98
+ .octa 0xd807aa98d807aa98d807aa98d807aa98
+ .octa 0x12835b0112835b0112835b0112835b01
+ .octa 0x12835b0112835b0112835b0112835b01
+ .octa 0x243185be243185be243185be243185be
+ .octa 0x243185be243185be243185be243185be
+ .octa 0x550c7dc3550c7dc3550c7dc3550c7dc3
+ .octa 0x550c7dc3550c7dc3550c7dc3550c7dc3
+ .octa 0x72be5d7472be5d7472be5d7472be5d74
+ .octa 0x72be5d7472be5d7472be5d7472be5d74
+ .octa 0x80deb1fe80deb1fe80deb1fe80deb1fe
+ .octa 0x80deb1fe80deb1fe80deb1fe80deb1fe
+ .octa 0x9bdc06a79bdc06a79bdc06a79bdc06a7
+ .octa 0x9bdc06a79bdc06a79bdc06a79bdc06a7
+ .octa 0xc19bf174c19bf174c19bf174c19bf174
+ .octa 0xc19bf174c19bf174c19bf174c19bf174
+ .octa 0xe49b69c1e49b69c1e49b69c1e49b69c1
+ .octa 0xe49b69c1e49b69c1e49b69c1e49b69c1
+ .octa 0xefbe4786efbe4786efbe4786efbe4786
+ .octa 0xefbe4786efbe4786efbe4786efbe4786
+ .octa 0x0fc19dc60fc19dc60fc19dc60fc19dc6
+ .octa 0x0fc19dc60fc19dc60fc19dc60fc19dc6
+ .octa 0x240ca1cc240ca1cc240ca1cc240ca1cc
+ .octa 0x240ca1cc240ca1cc240ca1cc240ca1cc
+ .octa 0x2de92c6f2de92c6f2de92c6f2de92c6f
+ .octa 0x2de92c6f2de92c6f2de92c6f2de92c6f
+ .octa 0x4a7484aa4a7484aa4a7484aa4a7484aa
+ .octa 0x4a7484aa4a7484aa4a7484aa4a7484aa
+ .octa 0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc
+ .octa 0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc
+ .octa 0x76f988da76f988da76f988da76f988da
+ .octa 0x76f988da76f988da76f988da76f988da
+ .octa 0x983e5152983e5152983e5152983e5152
+ .octa 0x983e5152983e5152983e5152983e5152
+ .octa 0xa831c66da831c66da831c66da831c66d
+ .octa 0xa831c66da831c66da831c66da831c66d
+ .octa 0xb00327c8b00327c8b00327c8b00327c8
+ .octa 0xb00327c8b00327c8b00327c8b00327c8
+ .octa 0xbf597fc7bf597fc7bf597fc7bf597fc7
+ .octa 0xbf597fc7bf597fc7bf597fc7bf597fc7
+ .octa 0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3
+ .octa 0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3
+ .octa 0xd5a79147d5a79147d5a79147d5a79147
+ .octa 0xd5a79147d5a79147d5a79147d5a79147
+ .octa 0x06ca635106ca635106ca635106ca6351
+ .octa 0x06ca635106ca635106ca635106ca6351
+ .octa 0x14292967142929671429296714292967
+ .octa 0x14292967142929671429296714292967
+ .octa 0x27b70a8527b70a8527b70a8527b70a85
+ .octa 0x27b70a8527b70a8527b70a8527b70a85
+ .octa 0x2e1b21382e1b21382e1b21382e1b2138
+ .octa 0x2e1b21382e1b21382e1b21382e1b2138
+ .octa 0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc
+ .octa 0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc
+ .octa 0x53380d1353380d1353380d1353380d13
+ .octa 0x53380d1353380d1353380d1353380d13
+ .octa 0x650a7354650a7354650a7354650a7354
+ .octa 0x650a7354650a7354650a7354650a7354
+ .octa 0x766a0abb766a0abb766a0abb766a0abb
+ .octa 0x766a0abb766a0abb766a0abb766a0abb
+ .octa 0x81c2c92e81c2c92e81c2c92e81c2c92e
+ .octa 0x81c2c92e81c2c92e81c2c92e81c2c92e
+ .octa 0x92722c8592722c8592722c8592722c85
+ .octa 0x92722c8592722c8592722c8592722c85
+ .octa 0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1
+ .octa 0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1
+ .octa 0xa81a664ba81a664ba81a664ba81a664b
+ .octa 0xa81a664ba81a664ba81a664ba81a664b
+ .octa 0xc24b8b70c24b8b70c24b8b70c24b8b70
+ .octa 0xc24b8b70c24b8b70c24b8b70c24b8b70
+ .octa 0xc76c51a3c76c51a3c76c51a3c76c51a3
+ .octa 0xc76c51a3c76c51a3c76c51a3c76c51a3
+ .octa 0xd192e819d192e819d192e819d192e819
+ .octa 0xd192e819d192e819d192e819d192e819
+ .octa 0xd6990624d6990624d6990624d6990624
+ .octa 0xd6990624d6990624d6990624d6990624
+ .octa 0xf40e3585f40e3585f40e3585f40e3585
+ .octa 0xf40e3585f40e3585f40e3585f40e3585
+ .octa 0x106aa070106aa070106aa070106aa070
+ .octa 0x106aa070106aa070106aa070106aa070
+ .octa 0x19a4c11619a4c11619a4c11619a4c116
+ .octa 0x19a4c11619a4c11619a4c11619a4c116
+ .octa 0x1e376c081e376c081e376c081e376c08
+ .octa 0x1e376c081e376c081e376c081e376c08
+ .octa 0x2748774c2748774c2748774c2748774c
+ .octa 0x2748774c2748774c2748774c2748774c
+ .octa 0x34b0bcb534b0bcb534b0bcb534b0bcb5
+ .octa 0x34b0bcb534b0bcb534b0bcb534b0bcb5
+ .octa 0x391c0cb3391c0cb3391c0cb3391c0cb3
+ .octa 0x391c0cb3391c0cb3391c0cb3391c0cb3
+ .octa 0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a
+ .octa 0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a
+ .octa 0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f
+ .octa 0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f
+ .octa 0x682e6ff3682e6ff3682e6ff3682e6ff3
+ .octa 0x682e6ff3682e6ff3682e6ff3682e6ff3
+ .octa 0x748f82ee748f82ee748f82ee748f82ee
+ .octa 0x748f82ee748f82ee748f82ee748f82ee
+ .octa 0x78a5636f78a5636f78a5636f78a5636f
+ .octa 0x78a5636f78a5636f78a5636f78a5636f
+ .octa 0x84c8781484c8781484c8781484c87814
+ .octa 0x84c8781484c8781484c8781484c87814
+ .octa 0x8cc702088cc702088cc702088cc70208
+ .octa 0x8cc702088cc702088cc702088cc70208
+ .octa 0x90befffa90befffa90befffa90befffa
+ .octa 0x90befffa90befffa90befffa90befffa
+ .octa 0xa4506ceba4506ceba4506ceba4506ceb
+ .octa 0xa4506ceba4506ceba4506ceba4506ceb
+ .octa 0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
+ .octa 0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
+ .octa 0xc67178f2c67178f2c67178f2c67178f2
+ .octa 0xc67178f2c67178f2c67178f2c67178f2
+
+.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
+PSHUFFLE_BYTE_FLIP_MASK:
+.octa 0x0c0d0e0f08090a0b0405060700010203
+.octa 0x0c0d0e0f08090a0b0405060700010203
+
+.section .rodata.cst256.K256, "aM", @progbits, 256
+.align 64
+.global K256
+K256:
+ .int 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .int 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .int 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .int 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .int 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .int 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .int 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .int 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .int 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .int 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .int 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .int 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .int 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .int 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .int 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .int 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
new file mode 100644
index 000000000..c6c05ed2c
--- /dev/null
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -0,0 +1,511 @@
+########################################################################
+# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define MOVDQ movdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+ add \p1, \p2
+ mov \p2, \p1
+.endm
+
+################################
+
+# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+# Load xmm with mem and byte swap each dword
+.macro COPY_XMM_AND_BSWAP p1 p2 p3
+ MOVDQ \p2, \p1
+ pshufb \p3, \p1
+.endm
+
+################################
+
+X0 = %xmm4
+X1 = %xmm5
+X2 = %xmm6
+X3 = %xmm7
+
+XTMP0 = %xmm0
+XTMP1 = %xmm1
+XTMP2 = %xmm2
+XTMP3 = %xmm3
+XTMP4 = %xmm8
+XFER = %xmm9
+
+SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
+SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %xmm12
+
+NUM_BLKS = %rdx # 3rd arg
+INP = %rsi # 2nd arg
+CTX = %rdi # 1st arg
+
+SRND = %rsi # clobbers INP
+c = %ecx
+d = %r8d
+e = %edx
+TBL = %r12
+a = %eax
+b = %ebx
+
+f = %r9d
+g = %r10d
+h = %r11d
+
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_XFER_SIZE = 16
+_XMM_SAVE_SIZE = 0
+
+_INP_END = 0
+_INP = _INP_END + _INP_END_SIZE
+_XFER = _INP + _INP_SIZE
+_XMM_SAVE = _XFER + _XFER_SIZE
+STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+X_ = X0
+X0 = X1
+X1 = X2
+X2 = X3
+X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+ ## compute s0 four at a time and s1 two at a time
+ ## compute W[-16] + W[-7] 4 at a time
+ movdqa X3, XTMP0
+ mov e, y0 # y0 = e
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ palignr $4, X2, XTMP0 # XTMP0 = W[-7]
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ movdqa X1, XTMP1
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ xor g, y2 # y2 = f^g
+ paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16]
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ ## compute s0
+ palignr $4, X0, XTMP1 # XTMP1 = W[-15]
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ movdqa XTMP1, XTMP2 # XTMP2 = W[-15]
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y0, y2 # y2 = S1 + CH
+ add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH
+ movdqa XTMP1, XTMP3 # XTMP3 = W[-15]
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ pslld $(32-7), XTMP1 #
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ psrld $7, XTMP2 #
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ #
+ ROTATE_ARGS #
+ movdqa XTMP3, XTMP2 # XTMP2 = W[-15]
+ mov e, y0 # y0 = e
+ mov a, y1 # y1 = a
+ movdqa XTMP3, XTMP4 # XTMP4 = W[-15]
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ pslld $(32-18), XTMP3 #
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor g, y2 # y2 = f^g
+ psrld $18, XTMP2 #
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ pxor XTMP3, XTMP1
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ psrld $3, XTMP4 # XTMP4 = W[-15] >> 3
+ add y0, y2 # y2 = S1 + CH
+ add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ pxor XTMP4, XTMP1 # XTMP1 = s0
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ ## compute low s1
+ pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA}
+ mov e, y0 # y0 = e
+ mov a, y1 # y1 = a
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA}
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ mov f, y2 # y2 = f
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
+ xor g, y2 # y2 = f^g
+ psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and e, y2 # y2 = (f^g)&e
+ psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ pxor XTMP3, XTMP2
+ add y0, y2 # y2 = S1 + CH
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA}
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA}
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ ## compute high s1
+ pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {BBAA}
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ #
+ ROTATE_ARGS #
+ movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC}
+ mov e, y0 # y0 = e
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ movdqa XTMP2, X0 # X0 = W[-2] {DDCC}
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ mov f, y2 # y2 = f
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ xor g, y2 # y2 = f^g
+ psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25
+ and e, y2 # y2 = (f^g)&e
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ psrld $10, X0 # X0 = W[-2] >> 10 {DDCC}
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22
+ ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>2
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ pxor XTMP3, XTMP2 #
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>2
+ add y0, y2 # y2 = S1 + CH
+ add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+ pxor XTMP2, X0 # X0 = s1 {xDxC}
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ pshufb SHUF_DC00, X0 # X0 = s1 {DC00}
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]}
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ rotate_Xs
+.endm
+
+## input is [rsp + _XFER + %1 * 4]
+.macro DO_ROUND round
+ mov e, y0 # y0 = e
+ ror $(25-11), y0 # y0 = e >> (25-11)
+ mov a, y1 # y1 = a
+ xor e, y0 # y0 = e ^ (e >> (25-11))
+ ror $(22-13), y1 # y1 = a >> (22-13)
+ mov f, y2 # y2 = f
+ xor a, y1 # y1 = a ^ (a >> (22-13)
+ ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor g, y2 # y2 = f^g
+ xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
+ and e, y2 # y2 = (f^g)&e
+ xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor g, y2 # y2 = CH = ((f^g)&e)^g
+ add y0, y2 # y2 = S1 + CH
+ ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ offset = \round * 4 + _XFER
+ add offset(%rsp), y2 # y2 = k + w + S1 + CH
+ mov a, y0 # y0 = a
+ add y2, h # h = h + S1 + CH + k + w
+ mov a, y2 # y2 = a
+ or c, y0 # y0 = a|c
+ add h, d # d = d + h + S1 + CH + k + w
+ and c, y2 # y2 = a&c
+ and b, y0 # y0 = (a|c)&b
+ add y1, h # h = h + S1 + CH + k + w + S0
+ or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
+ add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+.endm
+
+########################################################################
+## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks)
+## arg 1 : pointer to digest
+## arg 2 : pointer to input data
+## arg 3 : Num blocks
+########################################################################
+.text
+ENTRY(sha256_transform_ssse3)
+.align 32
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbp
+ mov %rsp, %rbp
+
+ subq $STACK_SIZE, %rsp
+ and $~15, %rsp
+
+ shl $6, NUM_BLKS # convert to bytes
+ jz done_hash
+ add INP, NUM_BLKS
+ mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data
+
+ ## load initial digest
+ mov 4*0(CTX), a
+ mov 4*1(CTX), b
+ mov 4*2(CTX), c
+ mov 4*3(CTX), d
+ mov 4*4(CTX), e
+ mov 4*5(CTX), f
+ mov 4*6(CTX), g
+ mov 4*7(CTX), h
+
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+ movdqa _SHUF_00BA(%rip), SHUF_00BA
+ movdqa _SHUF_DC00(%rip), SHUF_DC00
+
+loop0:
+ lea K256(%rip), TBL
+
+ ## byte swap first 16 dwords
+ COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
+
+ mov INP, _INP(%rsp)
+
+ ## schedule 48 input dwords, by doing 3 rounds of 16 each
+ mov $3, SRND
+.align 16
+loop1:
+ movdqa (TBL), XFER
+ paddd X0, XFER
+ movdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa 1*16(TBL), XFER
+ paddd X0, XFER
+ movdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa 2*16(TBL), XFER
+ paddd X0, XFER
+ movdqa XFER, _XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa 3*16(TBL), XFER
+ paddd X0, XFER
+ movdqa XFER, _XFER(%rsp)
+ add $4*16, TBL
+ FOUR_ROUNDS_AND_SCHED
+
+ sub $1, SRND
+ jne loop1
+
+ mov $2, SRND
+loop2:
+ paddd (TBL), X0
+ movdqa X0, _XFER(%rsp)
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+ paddd 1*16(TBL), X1
+ movdqa X1, _XFER(%rsp)
+ add $2*16, TBL
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ movdqa X2, X0
+ movdqa X3, X1
+
+ sub $1, SRND
+ jne loop2
+
+ addm (4*0)(CTX),a
+ addm (4*1)(CTX),b
+ addm (4*2)(CTX),c
+ addm (4*3)(CTX),d
+ addm (4*4)(CTX),e
+ addm (4*5)(CTX),f
+ addm (4*6)(CTX),g
+ addm (4*7)(CTX),h
+
+ mov _INP(%rsp), INP
+ add $64, INP
+ cmp _INP_END(%rsp), INP
+ jne loop0
+
+done_hash:
+
+ mov %rbp, %rsp
+ popq %rbp
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+
+ ret
+ENDPROC(sha256_transform_ssse3)
+
+.section .rodata.cst256.K256, "aM", @progbits, 256
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203
+
+.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
+.align 16
+# shuffle xBxA -> 00BA
+_SHUF_00BA:
+ .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
+.align 16
+# shuffle xDxC -> DC00
+_SHUF_DC00:
+ .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
new file mode 100644
index 000000000..fb58f58ec
--- /dev/null
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -0,0 +1,355 @@
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-256 update function
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define DIGEST_PTR %rdi /* 1st arg */
+#define DATA_PTR %rsi /* 2nd arg */
+#define NUM_BLKS %rdx /* 3rd arg */
+
+#define SHA256CONSTANTS %rax
+
+#define MSG %xmm0
+#define STATE0 %xmm1
+#define STATE1 %xmm2
+#define MSGTMP0 %xmm3
+#define MSGTMP1 %xmm4
+#define MSGTMP2 %xmm5
+#define MSGTMP3 %xmm6
+#define MSGTMP4 %xmm7
+
+#define SHUF_MASK %xmm8
+
+#define ABEF_SAVE %xmm9
+#define CDGH_SAVE %xmm10
+
+/*
+ * Intel SHA Extensions optimized implementation of a SHA-256 update function
+ *
+ * The function takes a pointer to the current hash values, a pointer to the
+ * input data, and a number of 64 byte blocks to process. Once all blocks have
+ * been processed, the digest pointer is updated with the resulting hash value.
+ * The function only processes complete blocks, there is no functionality to
+ * store partial blocks. All message padding and hash value initialization must
+ * be done outside the update function.
+ *
+ * The indented lines in the loop are instructions related to rounds processing.
+ * The non-indented lines are instructions related to the message schedule.
+ *
+ * void sha256_ni_transform(uint32_t *digest, const void *data,
+ uint32_t numBlocks);
+ * digest : pointer to digest
+ * data: pointer to input data
+ * numBlocks: Number of blocks to process
+ */
+
+.text
+.align 32
+ENTRY(sha256_ni_transform)
+
+ shl $6, NUM_BLKS /* convert to bytes */
+ jz .Ldone_hash
+ add DATA_PTR, NUM_BLKS /* pointer to end of data */
+
+ /*
+ * load initial hash values
+ * Need to reorder these appropriately
+ * DCBA, HGFE -> ABEF, CDGH
+ */
+ movdqu 0*16(DIGEST_PTR), STATE0
+ movdqu 1*16(DIGEST_PTR), STATE1
+
+ pshufd $0xB1, STATE0, STATE0 /* CDAB */
+ pshufd $0x1B, STATE1, STATE1 /* EFGH */
+ movdqa STATE0, MSGTMP4
+ palignr $8, STATE1, STATE0 /* ABEF */
+ pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */
+
+ movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+ lea K256(%rip), SHA256CONSTANTS
+
+.Lloop0:
+ /* Save hash values for addition after rounds */
+ movdqa STATE0, ABEF_SAVE
+ movdqa STATE1, CDGH_SAVE
+
+ /* Rounds 0-3 */
+ movdqu 0*16(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, MSGTMP0
+ paddd 0*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+
+ /* Rounds 4-7 */
+ movdqu 1*16(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, MSGTMP1
+ paddd 1*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP1, MSGTMP0
+
+ /* Rounds 8-11 */
+ movdqu 2*16(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, MSGTMP2
+ paddd 2*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP2, MSGTMP1
+
+ /* Rounds 12-15 */
+ movdqu 3*16(DATA_PTR), MSG
+ pshufb SHUF_MASK, MSG
+ movdqa MSG, MSGTMP3
+ paddd 3*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP3, MSGTMP4
+ palignr $4, MSGTMP2, MSGTMP4
+ paddd MSGTMP4, MSGTMP0
+ sha256msg2 MSGTMP3, MSGTMP0
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP3, MSGTMP2
+
+ /* Rounds 16-19 */
+ movdqa MSGTMP0, MSG
+ paddd 4*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP0, MSGTMP4
+ palignr $4, MSGTMP3, MSGTMP4
+ paddd MSGTMP4, MSGTMP1
+ sha256msg2 MSGTMP0, MSGTMP1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP0, MSGTMP3
+
+ /* Rounds 20-23 */
+ movdqa MSGTMP1, MSG
+ paddd 5*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP1, MSGTMP4
+ palignr $4, MSGTMP0, MSGTMP4
+ paddd MSGTMP4, MSGTMP2
+ sha256msg2 MSGTMP1, MSGTMP2
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP1, MSGTMP0
+
+ /* Rounds 24-27 */
+ movdqa MSGTMP2, MSG
+ paddd 6*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP2, MSGTMP4
+ palignr $4, MSGTMP1, MSGTMP4
+ paddd MSGTMP4, MSGTMP3
+ sha256msg2 MSGTMP2, MSGTMP3
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP2, MSGTMP1
+
+ /* Rounds 28-31 */
+ movdqa MSGTMP3, MSG
+ paddd 7*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP3, MSGTMP4
+ palignr $4, MSGTMP2, MSGTMP4
+ paddd MSGTMP4, MSGTMP0
+ sha256msg2 MSGTMP3, MSGTMP0
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP3, MSGTMP2
+
+ /* Rounds 32-35 */
+ movdqa MSGTMP0, MSG
+ paddd 8*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP0, MSGTMP4
+ palignr $4, MSGTMP3, MSGTMP4
+ paddd MSGTMP4, MSGTMP1
+ sha256msg2 MSGTMP0, MSGTMP1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP0, MSGTMP3
+
+ /* Rounds 36-39 */
+ movdqa MSGTMP1, MSG
+ paddd 9*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP1, MSGTMP4
+ palignr $4, MSGTMP0, MSGTMP4
+ paddd MSGTMP4, MSGTMP2
+ sha256msg2 MSGTMP1, MSGTMP2
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP1, MSGTMP0
+
+ /* Rounds 40-43 */
+ movdqa MSGTMP2, MSG
+ paddd 10*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP2, MSGTMP4
+ palignr $4, MSGTMP1, MSGTMP4
+ paddd MSGTMP4, MSGTMP3
+ sha256msg2 MSGTMP2, MSGTMP3
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP2, MSGTMP1
+
+ /* Rounds 44-47 */
+ movdqa MSGTMP3, MSG
+ paddd 11*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP3, MSGTMP4
+ palignr $4, MSGTMP2, MSGTMP4
+ paddd MSGTMP4, MSGTMP0
+ sha256msg2 MSGTMP3, MSGTMP0
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP3, MSGTMP2
+
+ /* Rounds 48-51 */
+ movdqa MSGTMP0, MSG
+ paddd 12*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP0, MSGTMP4
+ palignr $4, MSGTMP3, MSGTMP4
+ paddd MSGTMP4, MSGTMP1
+ sha256msg2 MSGTMP0, MSGTMP1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+ sha256msg1 MSGTMP0, MSGTMP3
+
+ /* Rounds 52-55 */
+ movdqa MSGTMP1, MSG
+ paddd 13*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP1, MSGTMP4
+ palignr $4, MSGTMP0, MSGTMP4
+ paddd MSGTMP4, MSGTMP2
+ sha256msg2 MSGTMP1, MSGTMP2
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+
+ /* Rounds 56-59 */
+ movdqa MSGTMP2, MSG
+ paddd 14*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ movdqa MSGTMP2, MSGTMP4
+ palignr $4, MSGTMP1, MSGTMP4
+ paddd MSGTMP4, MSGTMP3
+ sha256msg2 MSGTMP2, MSGTMP3
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+
+ /* Rounds 60-63 */
+ movdqa MSGTMP3, MSG
+ paddd 15*16(SHA256CONSTANTS), MSG
+ sha256rnds2 STATE0, STATE1
+ pshufd $0x0E, MSG, MSG
+ sha256rnds2 STATE1, STATE0
+
+ /* Add current hash values with previously saved */
+ paddd ABEF_SAVE, STATE0
+ paddd CDGH_SAVE, STATE1
+
+ /* Increment data pointer and loop if more to process */
+ add $64, DATA_PTR
+ cmp NUM_BLKS, DATA_PTR
+ jne .Lloop0
+
+ /* Write hash values back in the correct order */
+ pshufd $0x1B, STATE0, STATE0 /* FEBA */
+ pshufd $0xB1, STATE1, STATE1 /* DCHG */
+ movdqa STATE0, MSGTMP4
+ pblendw $0xF0, STATE1, STATE0 /* DCBA */
+ palignr $8, MSGTMP4, STATE1 /* HGFE */
+
+ movdqu STATE0, 0*16(DIGEST_PTR)
+ movdqu STATE1, 1*16(DIGEST_PTR)
+
+.Ldone_hash:
+
+ ret
+ENDPROC(sha256_ni_transform)
+
+.section .rodata.cst256.K256, "aM", @progbits, 256
+.align 64
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x0c0d0e0f08090a0b0405060700010203
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
new file mode 100644
index 000000000..773a873d2
--- /dev/null
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -0,0 +1,432 @@
+/*
+ * Cryptographic API.
+ *
+ * Glue code for the SHA256 Secure Hash Algorithm assembler
+ * implementation using supplemental SSE3 / AVX / AVX2 instructions.
+ *
+ * This file is based on sha256_generic.c
+ *
+ * Copyright (C) 2013 Intel Corporation.
+ *
+ * Author:
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <crypto/sha256_base.h>
+#include <asm/fpu/api.h>
+#include <linux/string.h>
+
+asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data,
+ u64 rounds);
+typedef void (sha256_transform_fn)(u32 *digest, const char *data, u64 rounds);
+
+static int sha256_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, sha256_transform_fn *sha256_xform)
+{
+ struct sha256_state *sctx = shash_desc_ctx(desc);
+
+ if (!irq_fpu_usable() ||
+ (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE)
+ return crypto_sha256_update(desc, data, len);
+
+ /* make sure casting to sha256_block_fn() is safe */
+ BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);
+
+ kernel_fpu_begin();
+ sha256_base_do_update(desc, data, len,
+ (sha256_block_fn *)sha256_xform);
+ kernel_fpu_end();
+
+ return 0;
+}
+
+static int sha256_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out, sha256_transform_fn *sha256_xform)
+{
+ if (!irq_fpu_usable())
+ return crypto_sha256_finup(desc, data, len, out);
+
+ kernel_fpu_begin();
+ if (len)
+ sha256_base_do_update(desc, data, len,
+ (sha256_block_fn *)sha256_xform);
+ sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_xform);
+ kernel_fpu_end();
+
+ return sha256_base_finish(desc, out);
+}
+
+static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha256_update(desc, data, len, sha256_transform_ssse3);
+}
+
+static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha256_finup(desc, data, len, out, sha256_transform_ssse3);
+}
+
+/* Add padding and return the message digest. */
+static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
+{
+ return sha256_ssse3_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha256_ssse3_algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_base_init,
+ .update = sha256_ssse3_update,
+ .final = sha256_ssse3_final,
+ .finup = sha256_ssse3_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-ssse3",
+ .cra_priority = 150,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_base_init,
+ .update = sha256_ssse3_update,
+ .final = sha256_ssse3_final,
+ .finup = sha256_ssse3_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-ssse3",
+ .cra_priority = 150,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static int register_sha256_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ return crypto_register_shashes(sha256_ssse3_algs,
+ ARRAY_SIZE(sha256_ssse3_algs));
+ return 0;
+}
+
+static void unregister_sha256_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ crypto_unregister_shashes(sha256_ssse3_algs,
+ ARRAY_SIZE(sha256_ssse3_algs));
+}
+
+#ifdef CONFIG_AS_AVX
+asmlinkage void sha256_transform_avx(u32 *digest, const char *data,
+ u64 rounds);
+
+static int sha256_avx_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha256_update(desc, data, len, sha256_transform_avx);
+}
+
+static int sha256_avx_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha256_finup(desc, data, len, out, sha256_transform_avx);
+}
+
+static int sha256_avx_final(struct shash_desc *desc, u8 *out)
+{
+ return sha256_avx_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha256_avx_algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_base_init,
+ .update = sha256_avx_update,
+ .final = sha256_avx_final,
+ .finup = sha256_avx_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-avx",
+ .cra_priority = 160,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_base_init,
+ .update = sha256_avx_update,
+ .final = sha256_avx_final,
+ .finup = sha256_avx_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-avx",
+ .cra_priority = 160,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static bool avx_usable(void)
+{
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
+ if (boot_cpu_has(X86_FEATURE_AVX))
+ pr_info("AVX detected but unusable.\n");
+ return false;
+ }
+
+ return true;
+}
+
+static int register_sha256_avx(void)
+{
+ if (avx_usable())
+ return crypto_register_shashes(sha256_avx_algs,
+ ARRAY_SIZE(sha256_avx_algs));
+ return 0;
+}
+
+static void unregister_sha256_avx(void)
+{
+ if (avx_usable())
+ crypto_unregister_shashes(sha256_avx_algs,
+ ARRAY_SIZE(sha256_avx_algs));
+}
+
+#else
+static inline int register_sha256_avx(void) { return 0; }
+static inline void unregister_sha256_avx(void) { }
+#endif
+
+#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX)
+asmlinkage void sha256_transform_rorx(u32 *digest, const char *data,
+ u64 rounds);
+
+static int sha256_avx2_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha256_update(desc, data, len, sha256_transform_rorx);
+}
+
+static int sha256_avx2_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha256_finup(desc, data, len, out, sha256_transform_rorx);
+}
+
+static int sha256_avx2_final(struct shash_desc *desc, u8 *out)
+{
+ return sha256_avx2_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha256_avx2_algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_base_init,
+ .update = sha256_avx2_update,
+ .final = sha256_avx2_final,
+ .finup = sha256_avx2_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-avx2",
+ .cra_priority = 170,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_base_init,
+ .update = sha256_avx2_update,
+ .final = sha256_avx2_final,
+ .finup = sha256_avx2_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-avx2",
+ .cra_priority = 170,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static bool avx2_usable(void)
+{
+ if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_BMI2))
+ return true;
+
+ return false;
+}
+
+static int register_sha256_avx2(void)
+{
+ if (avx2_usable())
+ return crypto_register_shashes(sha256_avx2_algs,
+ ARRAY_SIZE(sha256_avx2_algs));
+ return 0;
+}
+
+static void unregister_sha256_avx2(void)
+{
+ if (avx2_usable())
+ crypto_unregister_shashes(sha256_avx2_algs,
+ ARRAY_SIZE(sha256_avx2_algs));
+}
+
+#else
+static inline int register_sha256_avx2(void) { return 0; }
+static inline void unregister_sha256_avx2(void) { }
+#endif
+
+#ifdef CONFIG_AS_SHA256_NI
+asmlinkage void sha256_ni_transform(u32 *digest, const char *data,
+ u64 rounds); /*unsigned int rounds);*/
+
+static int sha256_ni_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha256_update(desc, data, len, sha256_ni_transform);
+}
+
+static int sha256_ni_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha256_finup(desc, data, len, out, sha256_ni_transform);
+}
+
+static int sha256_ni_final(struct shash_desc *desc, u8 *out)
+{
+ return sha256_ni_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha256_ni_algs[] = { {
+ .digestsize = SHA256_DIGEST_SIZE,
+ .init = sha256_base_init,
+ .update = sha256_ni_update,
+ .final = sha256_ni_final,
+ .finup = sha256_ni_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha256",
+ .cra_driver_name = "sha256-ni",
+ .cra_priority = 250,
+ .cra_blocksize = SHA256_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA224_DIGEST_SIZE,
+ .init = sha224_base_init,
+ .update = sha256_ni_update,
+ .final = sha256_ni_final,
+ .finup = sha256_ni_finup,
+ .descsize = sizeof(struct sha256_state),
+ .base = {
+ .cra_name = "sha224",
+ .cra_driver_name = "sha224-ni",
+ .cra_priority = 250,
+ .cra_blocksize = SHA224_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static int register_sha256_ni(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SHA_NI))
+ return crypto_register_shashes(sha256_ni_algs,
+ ARRAY_SIZE(sha256_ni_algs));
+ return 0;
+}
+
+static void unregister_sha256_ni(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SHA_NI))
+ crypto_unregister_shashes(sha256_ni_algs,
+ ARRAY_SIZE(sha256_ni_algs));
+}
+
+#else
+static inline int register_sha256_ni(void) { return 0; }
+static inline void unregister_sha256_ni(void) { }
+#endif
+
+static int __init sha256_ssse3_mod_init(void)
+{
+ if (register_sha256_ssse3())
+ goto fail;
+
+ if (register_sha256_avx()) {
+ unregister_sha256_ssse3();
+ goto fail;
+ }
+
+ if (register_sha256_avx2()) {
+ unregister_sha256_avx();
+ unregister_sha256_ssse3();
+ goto fail;
+ }
+
+ if (register_sha256_ni()) {
+ unregister_sha256_avx2();
+ unregister_sha256_avx();
+ unregister_sha256_ssse3();
+ goto fail;
+ }
+
+ return 0;
+fail:
+ return -ENODEV;
+}
+
+static void __exit sha256_ssse3_mod_fini(void)
+{
+ unregister_sha256_ni();
+ unregister_sha256_avx2();
+ unregister_sha256_avx();
+ unregister_sha256_ssse3();
+}
+
+module_init(sha256_ssse3_mod_init);
+module_exit(sha256_ssse3_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated");
+
+MODULE_ALIAS_CRYPTO("sha256");
+MODULE_ALIAS_CRYPTO("sha256-ssse3");
+MODULE_ALIAS_CRYPTO("sha256-avx");
+MODULE_ALIAS_CRYPTO("sha256-avx2");
+MODULE_ALIAS_CRYPTO("sha224");
+MODULE_ALIAS_CRYPTO("sha224-ssse3");
+MODULE_ALIAS_CRYPTO("sha224-avx");
+MODULE_ALIAS_CRYPTO("sha224-avx2");
+#ifdef CONFIG_AS_SHA256_NI
+MODULE_ALIAS_CRYPTO("sha256-ni");
+MODULE_ALIAS_CRYPTO("sha224-ni");
+#endif
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
new file mode 100644
index 000000000..39235fefe
--- /dev/null
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -0,0 +1,426 @@
+########################################################################
+# Implement fast SHA-512 with AVX instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# David Cote <david.m.cote@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#ifdef CONFIG_AS_AVX
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+# ARG1
+digest = %rdi
+# ARG2
+msg = %rsi
+# ARG3
+msglen = %rdx
+T1 = %rcx
+T2 = %r8
+a_64 = %r9
+b_64 = %r10
+c_64 = %r11
+d_64 = %r12
+e_64 = %r13
+f_64 = %r14
+g_64 = %r15
+h_64 = %rbx
+tmp0 = %rax
+
+# Local variables (stack frame)
+
+# Message Schedule
+W_SIZE = 80*8
+# W[t] + K[t] | W[t+1] + K[t+1]
+WK_SIZE = 2*8
+RSPSAVE_SIZE = 1*8
+GPRSAVE_SIZE = 5*8
+
+frame_W = 0
+frame_WK = frame_W + W_SIZE
+frame_RSPSAVE = frame_WK + WK_SIZE
+frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
+frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+
+# Useful QWORD "arrays" for simpler memory references
+# MSG, DIGEST, K_t, W_t are arrays
+# WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even
+
+# Input message (arg1)
+#define MSG(i) 8*i(msg)
+
+# Output Digest (arg2)
+#define DIGEST(i) 8*i(digest)
+
+# SHA Constants (static mem)
+#define K_t(i) 8*i+K512(%rip)
+
+# Message Schedule (stack frame)
+#define W_t(i) 8*i+frame_W(%rsp)
+
+# W[t]+K[t] (stack frame)
+#define WK_2(i) 8*((i%2))+frame_WK(%rsp)
+
+.macro RotateState
+ # Rotate symbols a..h right
+ TMP = h_64
+ h_64 = g_64
+ g_64 = f_64
+ f_64 = e_64
+ e_64 = d_64
+ d_64 = c_64
+ c_64 = b_64
+ b_64 = a_64
+ a_64 = TMP
+.endm
+
+.macro RORQ p1 p2
+ # shld is faster than ror on Sandybridge
+ shld $(64-\p2), \p1, \p1
+.endm
+
+.macro SHA512_Round rnd
+ # Compute Round %%t
+ mov f_64, T1 # T1 = f
+ mov e_64, tmp0 # tmp = e
+ xor g_64, T1 # T1 = f ^ g
+ RORQ tmp0, 23 # 41 # tmp = e ror 23
+ and e_64, T1 # T1 = (f ^ g) & e
+ xor e_64, tmp0 # tmp = (e ror 23) ^ e
+ xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+ idx = \rnd
+ add WK_2(idx), T1 # W[t] + K[t] from message scheduler
+ RORQ tmp0, 4 # 18 # tmp = ((e ror 23) ^ e) ror 4
+ xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
+ mov a_64, T2 # T2 = a
+ add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
+ RORQ tmp0, 14 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+ add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+ mov a_64, tmp0 # tmp = a
+ xor c_64, T2 # T2 = a ^ c
+ and c_64, tmp0 # tmp = a & c
+ and b_64, T2 # T2 = (a ^ c) & b
+ xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+ mov a_64, tmp0 # tmp = a
+ RORQ tmp0, 5 # 39 # tmp = a ror 5
+ xor a_64, tmp0 # tmp = (a ror 5) ^ a
+ add T1, d_64 # e(next_state) = d + T1
+ RORQ tmp0, 6 # 34 # tmp = ((a ror 5) ^ a) ror 6
+ xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
+ lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
+ RORQ tmp0, 28 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+ add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a)
+ RotateState
+.endm
+
+.macro SHA512_2Sched_2Round_avx rnd
+ # Compute rounds t-2 and t-1
+ # Compute message schedule QWORDS t and t+1
+
+ # Two rounds are computed based on the values for K[t-2]+W[t-2] and
+ # K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+ # scheduler.
+ # The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
+ # They are then added to their respective SHA512 constants at
+ # [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)]
+ # For brievity, the comments following vectored instructions only refer to
+ # the first of a pair of QWORDS.
+ # Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
+ # The computation of the message schedule and the rounds are tightly
+ # stitched to take advantage of instruction-level parallelism.
+
+ idx = \rnd - 2
+ vmovdqa W_t(idx), %xmm4 # XMM4 = W[t-2]
+ idx = \rnd - 15
+ vmovdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
+ mov f_64, T1
+ vpsrlq $61, %xmm4, %xmm0 # XMM0 = W[t-2]>>61
+ mov e_64, tmp0
+ vpsrlq $1, %xmm5, %xmm6 # XMM6 = W[t-15]>>1
+ xor g_64, T1
+ RORQ tmp0, 23 # 41
+ vpsrlq $19, %xmm4, %xmm1 # XMM1 = W[t-2]>>19
+ and e_64, T1
+ xor e_64, tmp0
+ vpxor %xmm1, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19
+ xor g_64, T1
+ idx = \rnd
+ add WK_2(idx), T1#
+ vpsrlq $8, %xmm5, %xmm7 # XMM7 = W[t-15]>>8
+ RORQ tmp0, 4 # 18
+ vpsrlq $6, %xmm4, %xmm2 # XMM2 = W[t-2]>>6
+ xor e_64, tmp0
+ mov a_64, T2
+ add h_64, T1
+ vpxor %xmm7, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8
+ RORQ tmp0, 14 # 14
+ add tmp0, T1
+ vpsrlq $7, %xmm5, %xmm8 # XMM8 = W[t-15]>>7
+ mov a_64, tmp0
+ xor c_64, T2
+ vpsllq $(64-61), %xmm4, %xmm3 # XMM3 = W[t-2]<<3
+ and c_64, tmp0
+ and b_64, T2
+ vpxor %xmm3, %xmm2, %xmm2 # XMM2 = W[t-2]>>6 ^ W[t-2]<<3
+ xor tmp0, T2
+ mov a_64, tmp0
+ vpsllq $(64-1), %xmm5, %xmm9 # XMM9 = W[t-15]<<63
+ RORQ tmp0, 5 # 39
+ vpxor %xmm9, %xmm8, %xmm8 # XMM8 = W[t-15]>>7 ^ W[t-15]<<63
+ xor a_64, tmp0
+ add T1, d_64
+ RORQ tmp0, 6 # 34
+ xor a_64, tmp0
+ vpxor %xmm8, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^
+ # W[t-15]>>7 ^ W[t-15]<<63
+ lea (T1, T2), h_64
+ RORQ tmp0, 28 # 28
+ vpsllq $(64-19), %xmm4, %xmm4 # XMM4 = W[t-2]<<25
+ add tmp0, h_64
+ RotateState
+ vpxor %xmm4, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^
+ # W[t-2]<<25
+ mov f_64, T1
+ vpxor %xmm2, %xmm0, %xmm0 # XMM0 = s1(W[t-2])
+ mov e_64, tmp0
+ xor g_64, T1
+ idx = \rnd - 16
+ vpaddq W_t(idx), %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16]
+ idx = \rnd - 7
+ vmovdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
+ RORQ tmp0, 23 # 41
+ and e_64, T1
+ xor e_64, tmp0
+ xor g_64, T1
+ vpsllq $(64-8), %xmm5, %xmm5 # XMM5 = W[t-15]<<56
+ idx = \rnd + 1
+ add WK_2(idx), T1
+ vpxor %xmm5, %xmm6, %xmm6 # XMM6 = s0(W[t-15])
+ RORQ tmp0, 4 # 18
+ vpaddq %xmm6, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
+ xor e_64, tmp0
+ vpaddq %xmm1, %xmm0, %xmm0 # XMM0 = W[t] = s1(W[t-2]) + W[t-7] +
+ # s0(W[t-15]) + W[t-16]
+ mov a_64, T2
+ add h_64, T1
+ RORQ tmp0, 14 # 14
+ add tmp0, T1
+ idx = \rnd
+ vmovdqa %xmm0, W_t(idx) # Store W[t]
+ vpaddq K_t(idx), %xmm0, %xmm0 # Compute W[t]+K[t]
+ vmovdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds
+ mov a_64, tmp0
+ xor c_64, T2
+ and c_64, tmp0
+ and b_64, T2
+ xor tmp0, T2
+ mov a_64, tmp0
+ RORQ tmp0, 5 # 39
+ xor a_64, tmp0
+ add T1, d_64
+ RORQ tmp0, 6 # 34
+ xor a_64, tmp0
+ lea (T1, T2), h_64
+ RORQ tmp0, 28 # 28
+ add tmp0, h_64
+ RotateState
+.endm
+
+########################################################################
+# void sha512_transform_avx(void* D, const void* M, u64 L)
+# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+# The size of the message pointed to by M must be an integer multiple of SHA512
+# message blocks.
+# L is the message length in SHA512 blocks
+########################################################################
+ENTRY(sha512_transform_avx)
+ cmp $0, msglen
+ je nowork
+
+ # Allocate Stack Space
+ mov %rsp, %rax
+ sub $frame_size, %rsp
+ and $~(0x20 - 1), %rsp
+ mov %rax, frame_RSPSAVE(%rsp)
+
+ # Save GPRs
+ mov %rbx, frame_GPRSAVE(%rsp)
+ mov %r12, frame_GPRSAVE +8*1(%rsp)
+ mov %r13, frame_GPRSAVE +8*2(%rsp)
+ mov %r14, frame_GPRSAVE +8*3(%rsp)
+ mov %r15, frame_GPRSAVE +8*4(%rsp)
+
+updateblock:
+
+ # Load state variables
+ mov DIGEST(0), a_64
+ mov DIGEST(1), b_64
+ mov DIGEST(2), c_64
+ mov DIGEST(3), d_64
+ mov DIGEST(4), e_64
+ mov DIGEST(5), f_64
+ mov DIGEST(6), g_64
+ mov DIGEST(7), h_64
+
+ t = 0
+ .rept 80/2 + 1
+ # (80 rounds) / (2 rounds/iteration) + (1 iteration)
+ # +1 iteration because the scheduler leads hashing by 1 iteration
+ .if t < 2
+ # BSWAP 2 QWORDS
+ vmovdqa XMM_QWORD_BSWAP(%rip), %xmm1
+ vmovdqu MSG(t), %xmm0
+ vpshufb %xmm1, %xmm0, %xmm0 # BSWAP
+ vmovdqa %xmm0, W_t(t) # Store Scheduled Pair
+ vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
+ vmovdqa %xmm0, WK_2(t) # Store into WK for rounds
+ .elseif t < 16
+ # BSWAP 2 QWORDS# Compute 2 Rounds
+ vmovdqu MSG(t), %xmm0
+ vpshufb %xmm1, %xmm0, %xmm0 # BSWAP
+ SHA512_Round t-2 # Round t-2
+ vmovdqa %xmm0, W_t(t) # Store Scheduled Pair
+ vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
+ SHA512_Round t-1 # Round t-1
+ vmovdqa %xmm0, WK_2(t)# Store W[t]+K[t] into WK
+ .elseif t < 79
+ # Schedule 2 QWORDS# Compute 2 Rounds
+ SHA512_2Sched_2Round_avx t
+ .else
+ # Compute 2 Rounds
+ SHA512_Round t-2
+ SHA512_Round t-1
+ .endif
+ t = t+2
+ .endr
+
+ # Update digest
+ add a_64, DIGEST(0)
+ add b_64, DIGEST(1)
+ add c_64, DIGEST(2)
+ add d_64, DIGEST(3)
+ add e_64, DIGEST(4)
+ add f_64, DIGEST(5)
+ add g_64, DIGEST(6)
+ add h_64, DIGEST(7)
+
+ # Advance to next message block
+ add $16*8, msg
+ dec msglen
+ jnz updateblock
+
+ # Restore GPRs
+ mov frame_GPRSAVE(%rsp), %rbx
+ mov frame_GPRSAVE +8*1(%rsp), %r12
+ mov frame_GPRSAVE +8*2(%rsp), %r13
+ mov frame_GPRSAVE +8*3(%rsp), %r14
+ mov frame_GPRSAVE +8*4(%rsp), %r15
+
+ # Restore Stack Pointer
+ mov frame_RSPSAVE(%rsp), %rsp
+
+nowork:
+ ret
+ENDPROC(sha512_transform_avx)
+
+########################################################################
+### Binary Data
+
+.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
+.align 16
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+ .octa 0x08090a0b0c0d0e0f0001020304050607
+
+# Mergeable 640-byte rodata section. This allows linker to merge the table
+# with other, exactly the same 640-byte fragment of another rodata section
+# (if such section exists).
+.section .rodata.cst640.K512, "aM", @progbits, 640
+.align 64
+# K[t] used in SHA512 hashing
+K512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+#endif
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
new file mode 100644
index 000000000..b16d56005
--- /dev/null
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -0,0 +1,752 @@
+########################################################################
+# Implement fast SHA-512 with AVX2 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# David Cote <david.m.cote@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 1 blocks at a time, with 4 lanes per block
+########################################################################
+
+#ifdef CONFIG_AS_AVX2
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+Y_0 = %ymm4
+Y_1 = %ymm5
+Y_2 = %ymm6
+Y_3 = %ymm7
+
+YTMP0 = %ymm0
+YTMP1 = %ymm1
+YTMP2 = %ymm2
+YTMP3 = %ymm3
+YTMP4 = %ymm8
+XFER = YTMP0
+
+BYTE_FLIP_MASK = %ymm9
+
+# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
+CTX1 = %rdi
+CTX2 = %r12
+# 2nd arg
+INP = %rsi
+# 3rd arg
+NUM_BLKS = %rdx
+
+c = %rcx
+d = %r8
+e = %rdx
+y3 = %rsi
+
+TBL = %rdi # clobbers CTX1
+
+a = %rax
+b = %rbx
+
+f = %r9
+g = %r10
+h = %r11
+old_h = %r11
+
+T1 = %r12 # clobbers CTX2
+y0 = %r13
+y1 = %r14
+y2 = %r15
+
+# Local variables (stack frame)
+XFER_SIZE = 4*8
+SRND_SIZE = 1*8
+INP_SIZE = 1*8
+INPEND_SIZE = 1*8
+CTX_SIZE = 1*8
+RSPSAVE_SIZE = 1*8
+GPRSAVE_SIZE = 5*8
+
+frame_XFER = 0
+frame_SRND = frame_XFER + XFER_SIZE
+frame_INP = frame_SRND + SRND_SIZE
+frame_INPEND = frame_INP + INP_SIZE
+frame_CTX = frame_INPEND + INPEND_SIZE
+frame_RSPSAVE = frame_CTX + CTX_SIZE
+frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
+frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+
+## assume buffers not aligned
+#define VMOVDQ vmovdqu
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+ add \p1, \p2
+ mov \p2, \p1
+.endm
+
+
+# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
+# Load ymm with mem and byte swap each dword
+.macro COPY_YMM_AND_BSWAP p1 p2 p3
+ VMOVDQ \p2, \p1
+ vpshufb \p3, \p1, \p1
+.endm
+# rotate_Ys
+# Rotate values of symbols Y0...Y3
+.macro rotate_Ys
+ Y_ = Y_0
+ Y_0 = Y_1
+ Y_1 = Y_2
+ Y_2 = Y_3
+ Y_3 = Y_
+.endm
+
+# RotateState
+.macro RotateState
+ # Rotate symbols a..h right
+ old_h = h
+ TMP_ = h
+ h = g
+ g = f
+ f = e
+ e = d
+ d = c
+ c = b
+ b = a
+ a = TMP_
+.endm
+
+# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL
+# YDST = {YSRC1, YSRC2} >> RVAL*8
+.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
+ vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI}
+ vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+################################### RND N + 0 #########################################
+
+ # Extract w[t-7]
+ MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7]
+ # Calculate w[t-16] + w[t-7]
+ vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16]
+ # Extract w[t-15]
+ MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15]
+
+ # Calculate sigma0
+
+ # Calculate w[t-15] ror 1
+ vpsrlq $1, YTMP1, YTMP2
+ vpsllq $(64-1), YTMP1, YTMP3
+ vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1
+ # Calculate w[t-15] shr 7
+ vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ add frame_XFER(%rsp),h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+ mov f, y2 # y2 = f # CH
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ xor g, y2 # y2 = f^g # CH
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+
+ and e, y2 # y2 = (f^g)&e # CH
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ add h, d # d = k + w + h + d # --
+
+ and b, y3 # y3 = (a|c)&b # MAJA
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+
+ add y0, y2 # y2 = S1 + CH # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+################################### RND N + 1 #########################################
+
+ # Calculate w[t-15] ror 8
+ vpsrlq $8, YTMP1, YTMP2
+ vpsllq $(64-8), YTMP1, YTMP1
+ vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8
+ # XOR the three components
+ vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
+ vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0
+
+
+ # Add three components, w[t-16], w[t-7] and sigma0
+ vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
+ # Move to appropriate lanes for calculating w[16] and w[17]
+ vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
+ # Move to appropriate lanes for calculating w[18] and w[19]
+ vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}
+
+ # Calculate w[16] and w[17] in both 128 bit lanes
+
+ # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
+ vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
+ vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}
+
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+
+ mov f, y2 # y2 = f # CH
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ xor g, y2 # y2 = f^g # CH
+
+
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ and e, y2 # y2 = (f^g)&e # CH
+ add h, d # d = k + w + h + d # --
+
+ and b, y3 # y3 = (a|c)&b # MAJA
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+
+################################### RND N + 2 #########################################
+
+ vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA}
+ vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA}
+ vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA}
+ vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
+ vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA}
+ vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA}
+ vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA}
+ vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
+ # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
+
+ # Add sigma1 to the other compunents to get w[16] and w[17]
+ vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]}
+
+ # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
+ vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--}
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ add 2*8+frame_XFER(%rsp), h # h = k + w + h # --
+
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ or c, y3 # y3 = a|c # MAJA
+ mov f, y2 # y2 = f # CH
+ xor g, y2 # y2 = f^g # CH
+
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ add h, d # d = k + w + h + d # --
+ and b, y3 # y3 = (a|c)&b # MAJA
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+################################### RND N + 3 #########################################
+
+ vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
+ vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--}
+ vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
+ vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
+ vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
+ vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--}
+ vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
+ vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
+ # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
+
+ # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
+ # to newly calculated sigma1 to get w[18] and w[19]
+ vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}
+
+ # Form w[19, w[18], w17], w[16]
+ vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]}
+
+ mov a, y3 # y3 = a # MAJA
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+
+ mov f, y2 # y2 = f # CH
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ xor g, y2 # y2 = f^g # CH
+
+
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add h, d # d = k + w + h + d # --
+ and b, y3 # y3 = (a|c)&b # MAJA
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ add y0, y2 # y2 = S1 + CH # --
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and c, T1 # T1 = a&c # MAJB
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+
+ add y1, h # h = k + w + h + S0 # --
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+ rotate_Ys
+.endm
+
+.macro DO_4ROUNDS
+
+################################### RND N + 0 #########################################
+
+ mov f, y2 # y2 = f # CH
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ add frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ RotateState
+
+################################### RND N + 1 #########################################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ add 8*1+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ RotateState
+
+################################### RND N + 2 #########################################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ add 8*2+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ RotateState
+
+################################### RND N + 3 #########################################
+
+ add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+ mov f, y2 # y2 = f # CH
+ rorx $41, e, y0 # y0 = e >> 41 # S1A
+ rorx $18, e, y1 # y1 = e >> 18 # S1B
+ xor g, y2 # y2 = f^g # CH
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1
+ rorx $14, e, y1 # y1 = (e >> 14) # S1
+ and e, y2 # y2 = (f^g)&e # CH
+ add y3, old_h # h = t1 + S0 + MAJ # --
+
+ xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1
+ rorx $34, a, T1 # T1 = a >> 34 # S0B
+ xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
+ rorx $39, a, y1 # y1 = a >> 39 # S0A
+ mov a, y3 # y3 = a # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
+ rorx $28, a, T1 # T1 = (a >> 28) # S0
+ add 8*3+frame_XFER(%rsp), h # h = k + w + h # --
+ or c, y3 # y3 = a|c # MAJA
+
+ xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
+ mov a, T1 # T1 = a # MAJB
+ and b, y3 # y3 = (a|c)&b # MAJA
+ and c, T1 # T1 = a&c # MAJB
+ add y0, y2 # y2 = S1 + CH # --
+
+
+ add h, d # d = k + w + h + d # --
+ or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
+ add y1, h # h = k + w + h + S0 # --
+
+ add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
+
+ add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+ add y3, h # h = t1 + S0 + MAJ # --
+
+ RotateState
+
+.endm
+
+########################################################################
+# void sha512_transform_rorx(void* D, const void* M, uint64_t L)#
+# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+# The size of the message pointed to by M must be an integer multiple of SHA512
+# message blocks.
+# L is the message length in SHA512 blocks
+########################################################################
+ENTRY(sha512_transform_rorx)
+ # Allocate Stack Space
+ mov %rsp, %rax
+ sub $frame_size, %rsp
+ and $~(0x20 - 1), %rsp
+ mov %rax, frame_RSPSAVE(%rsp)
+
+ # Save GPRs
+ mov %rbx, 8*0+frame_GPRSAVE(%rsp)
+ mov %r12, 8*1+frame_GPRSAVE(%rsp)
+ mov %r13, 8*2+frame_GPRSAVE(%rsp)
+ mov %r14, 8*3+frame_GPRSAVE(%rsp)
+ mov %r15, 8*4+frame_GPRSAVE(%rsp)
+
+ shl $7, NUM_BLKS # convert to bytes
+ jz done_hash
+ add INP, NUM_BLKS # pointer to end of data
+ mov NUM_BLKS, frame_INPEND(%rsp)
+
+ ## load initial digest
+ mov 8*0(CTX1), a
+ mov 8*1(CTX1), b
+ mov 8*2(CTX1), c
+ mov 8*3(CTX1), d
+ mov 8*4(CTX1), e
+ mov 8*5(CTX1), f
+ mov 8*6(CTX1), g
+ mov 8*7(CTX1), h
+
+ # save %rdi (CTX) before it gets clobbered
+ mov %rdi, frame_CTX(%rsp)
+
+ vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+
+loop0:
+ lea K512(%rip), TBL
+
+ ## byte swap first 16 dwords
+ COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK
+ COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK
+ COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK
+ COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK
+
+ mov INP, frame_INP(%rsp)
+
+ ## schedule 64 input dwords, by doing 12 rounds of 4 each
+ movq $4, frame_SRND(%rsp)
+
+.align 16
+loop1:
+ vpaddq (TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddq 1*32(TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddq 2*32(TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddq 3*32(TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ add $(4*32), TBL
+ FOUR_ROUNDS_AND_SCHED
+
+ subq $1, frame_SRND(%rsp)
+ jne loop1
+
+ movq $2, frame_SRND(%rsp)
+loop2:
+ vpaddq (TBL), Y_0, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ DO_4ROUNDS
+ vpaddq 1*32(TBL), Y_1, XFER
+ vmovdqa XFER, frame_XFER(%rsp)
+ add $(2*32), TBL
+ DO_4ROUNDS
+
+ vmovdqa Y_2, Y_0
+ vmovdqa Y_3, Y_1
+
+ subq $1, frame_SRND(%rsp)
+ jne loop2
+
+ mov frame_CTX(%rsp), CTX2
+ addm 8*0(CTX2), a
+ addm 8*1(CTX2), b
+ addm 8*2(CTX2), c
+ addm 8*3(CTX2), d
+ addm 8*4(CTX2), e
+ addm 8*5(CTX2), f
+ addm 8*6(CTX2), g
+ addm 8*7(CTX2), h
+
+ mov frame_INP(%rsp), INP
+ add $128, INP
+ cmp frame_INPEND(%rsp), INP
+ jne loop0
+
+done_hash:
+
+# Restore GPRs
+ mov 8*0+frame_GPRSAVE(%rsp), %rbx
+ mov 8*1+frame_GPRSAVE(%rsp), %r12
+ mov 8*2+frame_GPRSAVE(%rsp), %r13
+ mov 8*3+frame_GPRSAVE(%rsp), %r14
+ mov 8*4+frame_GPRSAVE(%rsp), %r15
+
+ # Restore Stack Pointer
+ mov frame_RSPSAVE(%rsp), %rsp
+ ret
+ENDPROC(sha512_transform_rorx)
+
+########################################################################
+### Binary Data
+
+
+# Mergeable 640-byte rodata section. This allows linker to merge the table
+# with other, exactly the same 640-byte fragment of another rodata section
+# (if such section exists).
+.section .rodata.cst640.K512, "aM", @progbits, 640
+.align 64
+# K[t] used in SHA512 hashing
+K512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+PSHUFFLE_BYTE_FLIP_MASK:
+ .octa 0x08090a0b0c0d0e0f0001020304050607
+ .octa 0x18191a1b1c1d1e1f1011121314151617
+
+.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
+.align 32
+MASK_YMM_LO:
+ .octa 0x00000000000000000000000000000000
+ .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
+
+#endif
diff --git a/arch/x86/crypto/sha512-mb/Makefile b/arch/x86/crypto/sha512-mb/Makefile
new file mode 100644
index 000000000..90f1ef691
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Arch-specific CryptoAPI modules.
+#
+
+avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
+ $(comma)4)$(comma)%ymm2,yes,no)
+ifeq ($(avx2_supported),yes)
+ obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb.o
+ sha512-mb-y := sha512_mb.o sha512_mb_mgr_flush_avx2.o \
+ sha512_mb_mgr_init_avx2.o sha512_mb_mgr_submit_avx2.o sha512_x4_avx2.o
+endif
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c
new file mode 100644
index 000000000..26b856780
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb.c
@@ -0,0 +1,1047 @@
+/*
+ * Multi buffer SHA512 algorithm Glue Code
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/sha.h>
+#include <crypto/mcryptd.h>
+#include <crypto/crypto_wq.h>
+#include <asm/byteorder.h>
+#include <linux/hardirq.h>
+#include <asm/fpu/api.h>
+#include "sha512_mb_ctx.h"
+
+#define FLUSH_INTERVAL 1000 /* in usec */
+
+static struct mcryptd_alg_state sha512_mb_alg_state;
+
+struct sha512_mb_ctx {
+ struct mcryptd_ahash *mcryptd_tfm;
+};
+
+static inline struct mcryptd_hash_request_ctx
+ *cast_hash_to_mcryptd_ctx(struct sha512_hash_ctx *hash_ctx)
+{
+ struct ahash_request *areq;
+
+ areq = container_of((void *) hash_ctx, struct ahash_request, __ctx);
+ return container_of(areq, struct mcryptd_hash_request_ctx, areq);
+}
+
+static inline struct ahash_request
+ *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx)
+{
+ return container_of((void *) ctx, struct ahash_request, __ctx);
+}
+
+static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx,
+ struct ahash_request *areq)
+{
+ rctx->flag = HASH_UPDATE;
+}
+
+static asmlinkage void (*sha512_job_mgr_init)(struct sha512_mb_mgr *state);
+static asmlinkage struct job_sha512* (*sha512_job_mgr_submit)
+ (struct sha512_mb_mgr *state,
+ struct job_sha512 *job);
+static asmlinkage struct job_sha512* (*sha512_job_mgr_flush)
+ (struct sha512_mb_mgr *state);
+static asmlinkage struct job_sha512* (*sha512_job_mgr_get_comp_job)
+ (struct sha512_mb_mgr *state);
+
+inline uint32_t sha512_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2],
+ uint64_t total_len)
+{
+ uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1);
+
+ memset(&padblock[i], 0, SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ i += ((SHA512_BLOCK_SIZE - 1) &
+ (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1)))
+ + 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) &padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) &padblock[i - 8]) = cpu_to_be64(total_len << 3);
+
+ /* Number of extra blocks to hash */
+ return i >> SHA512_LOG2_BLOCK_SIZE;
+}
+
+static struct sha512_hash_ctx *sha512_ctx_mgr_resubmit
+ (struct sha512_ctx_mgr *mgr, struct sha512_hash_ctx *ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ /* Clear PROCESSING bit */
+ ctx->status = HASH_CTX_STS_COMPLETE;
+ return ctx;
+ }
+
+ /*
+ * If the extra blocks are empty, begin hashing what remains
+ * in the user's buffer.
+ */
+ if (ctx->partial_block_buffer_length == 0 &&
+ ctx->incoming_buffer_length) {
+
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+ uint32_t copy_len;
+
+ /*
+ * Only entire blocks can be hashed.
+ * Copy remainder to extra blocks buffer.
+ */
+ copy_len = len & (SHA512_BLOCK_SIZE-1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy(ctx->partial_block_buffer,
+ ((const char *) buffer + len),
+ copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ /* len should be a multiple of the block size now */
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ /* Set len to the number of blocks to be hashed */
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (struct sha512_hash_ctx *)
+ sha512_job_mgr_submit(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+
+ /*
+ * If the extra blocks are not empty, then we are
+ * either on the last block(s) or we need more
+ * user input before continuing.
+ */
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks =
+ sha512_pad(buf, ctx->total_length);
+
+ ctx->status = (HASH_CTX_STS_PROCESSING |
+ HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (struct sha512_hash_ctx *)
+ sha512_job_mgr_submit(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static struct sha512_hash_ctx
+ *sha512_ctx_mgr_get_comp_ctx(struct mcryptd_alg_cstate *cstate)
+{
+ /*
+ * If get_comp_job returns NULL, there are no jobs complete.
+ * If get_comp_job returns a job, verify that it is safe to return to
+ * the user.
+ * If it is not ready, resubmit the job to finish processing.
+ * If sha512_ctx_mgr_resubmit returned a job, it is ready to be
+ * returned.
+ * Otherwise, all jobs currently being managed by the hash_ctx_mgr
+ * still need processing.
+ */
+ struct sha512_ctx_mgr *mgr;
+ struct sha512_hash_ctx *ctx;
+ unsigned long flags;
+
+ mgr = cstate->mgr;
+ spin_lock_irqsave(&cstate->work_lock, flags);
+ ctx = (struct sha512_hash_ctx *)
+ sha512_job_mgr_get_comp_job(&mgr->mgr);
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+ spin_unlock_irqrestore(&cstate->work_lock, flags);
+ return ctx;
+}
+
+static void sha512_ctx_mgr_init(struct sha512_ctx_mgr *mgr)
+{
+ sha512_job_mgr_init(&mgr->mgr);
+}
+
+static struct sha512_hash_ctx
+ *sha512_ctx_mgr_submit(struct mcryptd_alg_cstate *cstate,
+ struct sha512_hash_ctx *ctx,
+ const void *buffer,
+ uint32_t len,
+ int flags)
+{
+ struct sha512_ctx_mgr *mgr;
+ unsigned long irqflags;
+
+ mgr = cstate->mgr;
+ spin_lock_irqsave(&cstate->work_lock, irqflags);
+ if (flags & ~(HASH_UPDATE | HASH_LAST)) {
+ /* User should not pass anything other than UPDATE or LAST */
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ goto unlock;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ /* Cannot submit to a currently processing job. */
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ goto unlock;
+ }
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ /* Cannot update a finished job. */
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ goto unlock;
+ }
+
+ /*
+ * If we made it here, there were no errors during this call to
+ * submit
+ */
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ /* Store buffer ptr info from user */
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ /*
+ * Store the user's request flags and mark this ctx as currently being
+ * processed.
+ */
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ /* Advance byte counter */
+ ctx->total_length += len;
+
+ /*
+ * If there is anything currently buffered in the extra blocks,
+ * append to it until it contains a whole block.
+ * Or if the user's buffer contains less than a whole block,
+ * append as much as possible to the extra block.
+ */
+ if (ctx->partial_block_buffer_length || len < SHA512_BLOCK_SIZE) {
+ /* Compute how many bytes to copy from user buffer into extra
+ * block
+ */
+ uint32_t copy_len = SHA512_BLOCK_SIZE -
+ ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ /* Copy and update relevant pointers and counters */
+ memcpy
+ (&ctx->partial_block_buffer[ctx->partial_block_buffer_length],
+ buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)
+ ((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+
+ /* The extra block should never contain more than 1 block
+ * here
+ */
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ /* If the extra block buffer contains exactly 1 block, it can
+ * be hashed.
+ */
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (struct sha512_hash_ctx *)
+ sha512_job_mgr_submit(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+unlock:
+ spin_unlock_irqrestore(&cstate->work_lock, irqflags);
+ return ctx;
+}
+
+static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct mcryptd_alg_cstate *cstate)
+{
+ struct sha512_ctx_mgr *mgr;
+ struct sha512_hash_ctx *ctx;
+ unsigned long flags;
+
+ mgr = cstate->mgr;
+ spin_lock_irqsave(&cstate->work_lock, flags);
+ while (1) {
+ ctx = (struct sha512_hash_ctx *)
+ sha512_job_mgr_flush(&mgr->mgr);
+
+ /* If flush returned 0, there are no more jobs in flight. */
+ if (!ctx)
+ break;
+
+ /*
+ * If flush returned a job, resubmit the job to finish
+ * processing.
+ */
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ /*
+ * If sha512_ctx_mgr_resubmit returned a job, it is ready to
+ * be returned. Otherwise, all jobs currently being managed by
+ * the sha512_ctx_mgr still need processing. Loop.
+ */
+ if (ctx)
+ break;
+ }
+ spin_unlock_irqrestore(&cstate->work_lock, flags);
+ return ctx;
+}
+
+static int sha512_mb_init(struct ahash_request *areq)
+{
+ struct sha512_hash_ctx *sctx = ahash_request_ctx(areq);
+
+ hash_ctx_init(sctx);
+ sctx->job.result_digest[0] = SHA512_H0;
+ sctx->job.result_digest[1] = SHA512_H1;
+ sctx->job.result_digest[2] = SHA512_H2;
+ sctx->job.result_digest[3] = SHA512_H3;
+ sctx->job.result_digest[4] = SHA512_H4;
+ sctx->job.result_digest[5] = SHA512_H5;
+ sctx->job.result_digest[6] = SHA512_H6;
+ sctx->job.result_digest[7] = SHA512_H7;
+ sctx->total_length = 0;
+ sctx->partial_block_buffer_length = 0;
+ sctx->status = HASH_CTX_STS_IDLE;
+
+ return 0;
+}
+
+static int sha512_mb_set_results(struct mcryptd_hash_request_ctx *rctx)
+{
+ int i;
+ struct sha512_hash_ctx *sctx = ahash_request_ctx(&rctx->areq);
+ __be64 *dst = (__be64 *) rctx->out;
+
+ for (i = 0; i < 8; ++i)
+ dst[i] = cpu_to_be64(sctx->job.result_digest[i]);
+
+ return 0;
+}
+
+static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
+ struct mcryptd_alg_cstate *cstate, bool flush)
+{
+ int flag = HASH_UPDATE;
+ int nbytes, err = 0;
+ struct mcryptd_hash_request_ctx *rctx = *ret_rctx;
+ struct sha512_hash_ctx *sha_ctx;
+
+ /* more work ? */
+ while (!(rctx->flag & HASH_DONE)) {
+ nbytes = crypto_ahash_walk_done(&rctx->walk, 0);
+ if (nbytes < 0) {
+ err = nbytes;
+ goto out;
+ }
+ /* check if the walk is done */
+ if (crypto_ahash_walk_last(&rctx->walk)) {
+ rctx->flag |= HASH_DONE;
+ if (rctx->flag & HASH_FINAL)
+ flag |= HASH_LAST;
+
+ }
+ sha_ctx = (struct sha512_hash_ctx *)
+ ahash_request_ctx(&rctx->areq);
+ kernel_fpu_begin();
+ sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx,
+ rctx->walk.data, nbytes, flag);
+ if (!sha_ctx) {
+ if (flush)
+ sha_ctx = sha512_ctx_mgr_flush(cstate);
+ }
+ kernel_fpu_end();
+ if (sha_ctx)
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ else {
+ rctx = NULL;
+ goto out;
+ }
+ }
+
+ /* copy the results */
+ if (rctx->flag & HASH_FINAL)
+ sha512_mb_set_results(rctx);
+
+out:
+ *ret_rctx = rctx;
+ return err;
+}
+
+static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
+ struct mcryptd_alg_cstate *cstate,
+ int err)
+{
+ struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+ struct sha512_hash_ctx *sha_ctx;
+ struct mcryptd_hash_request_ctx *req_ctx;
+ int ret;
+ unsigned long flags;
+
+ /* remove from work list */
+ spin_lock_irqsave(&cstate->work_lock, flags);
+ list_del(&rctx->waiter);
+ spin_unlock_irqrestore(&cstate->work_lock, flags);
+
+ if (irqs_disabled())
+ rctx->complete(&req->base, err);
+ else {
+ local_bh_disable();
+ rctx->complete(&req->base, err);
+ local_bh_enable();
+ }
+
+ /* check to see if there are other jobs that are done */
+ sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate);
+ while (sha_ctx) {
+ req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&req_ctx, cstate, false);
+ if (req_ctx) {
+ spin_lock_irqsave(&cstate->work_lock, flags);
+ list_del(&req_ctx->waiter);
+ spin_unlock_irqrestore(&cstate->work_lock, flags);
+
+ req = cast_mcryptd_ctx_to_req(req_ctx);
+ if (irqs_disabled())
+ req_ctx->complete(&req->base, ret);
+ else {
+ local_bh_disable();
+ req_ctx->complete(&req->base, ret);
+ local_bh_enable();
+ }
+ }
+ sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate);
+ }
+
+ return 0;
+}
+
+static void sha512_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
+ struct mcryptd_alg_cstate *cstate)
+{
+ unsigned long next_flush;
+ unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL);
+ unsigned long flags;
+
+ /* initialize tag */
+ rctx->tag.arrival = jiffies; /* tag the arrival time */
+ rctx->tag.seq_num = cstate->next_seq_num++;
+ next_flush = rctx->tag.arrival + delay;
+ rctx->tag.expire = next_flush;
+
+ spin_lock_irqsave(&cstate->work_lock, flags);
+ list_add_tail(&rctx->waiter, &cstate->work_list);
+ spin_unlock_irqrestore(&cstate->work_lock, flags);
+
+ mcryptd_arm_flusher(cstate, delay);
+}
+
+static int sha512_mb_update(struct ahash_request *areq)
+{
+ struct mcryptd_hash_request_ctx *rctx =
+ container_of(areq, struct mcryptd_hash_request_ctx,
+ areq);
+ struct mcryptd_alg_cstate *cstate =
+ this_cpu_ptr(sha512_mb_alg_state.alg_cstate);
+
+ struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+ struct sha512_hash_ctx *sha_ctx;
+ int ret = 0, nbytes;
+
+
+ /* sanity check */
+ if (rctx->tag.cpu != smp_processor_id()) {
+ pr_err("mcryptd error: cpu clash\n");
+ goto done;
+ }
+
+ /* need to init context */
+ req_ctx_init(rctx, areq);
+
+ nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+ if (nbytes < 0) {
+ ret = nbytes;
+ goto done;
+ }
+
+ if (crypto_ahash_walk_last(&rctx->walk))
+ rctx->flag |= HASH_DONE;
+
+ /* submit */
+ sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq);
+ sha512_mb_add_list(rctx, cstate);
+ kernel_fpu_begin();
+ sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, rctx->walk.data,
+ nbytes, HASH_UPDATE);
+ kernel_fpu_end();
+
+ /* check if anything is returned */
+ if (!sha_ctx)
+ return -EINPROGRESS;
+
+ if (sha_ctx->error) {
+ ret = sha_ctx->error;
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ goto done;
+ }
+
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&rctx, cstate, false);
+
+ if (!rctx)
+ return -EINPROGRESS;
+done:
+ sha_complete_job(rctx, cstate, ret);
+ return ret;
+}
+
+static int sha512_mb_finup(struct ahash_request *areq)
+{
+ struct mcryptd_hash_request_ctx *rctx =
+ container_of(areq, struct mcryptd_hash_request_ctx,
+ areq);
+ struct mcryptd_alg_cstate *cstate =
+ this_cpu_ptr(sha512_mb_alg_state.alg_cstate);
+
+ struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+ struct sha512_hash_ctx *sha_ctx;
+ int ret = 0, flag = HASH_UPDATE, nbytes;
+
+ /* sanity check */
+ if (rctx->tag.cpu != smp_processor_id()) {
+ pr_err("mcryptd error: cpu clash\n");
+ goto done;
+ }
+
+ /* need to init context */
+ req_ctx_init(rctx, areq);
+
+ nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+ if (nbytes < 0) {
+ ret = nbytes;
+ goto done;
+ }
+
+ if (crypto_ahash_walk_last(&rctx->walk)) {
+ rctx->flag |= HASH_DONE;
+ flag = HASH_LAST;
+ }
+
+ /* submit */
+ rctx->flag |= HASH_FINAL;
+ sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq);
+ sha512_mb_add_list(rctx, cstate);
+
+ kernel_fpu_begin();
+ sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, rctx->walk.data,
+ nbytes, flag);
+ kernel_fpu_end();
+
+ /* check if anything is returned */
+ if (!sha_ctx)
+ return -EINPROGRESS;
+
+ if (sha_ctx->error) {
+ ret = sha_ctx->error;
+ goto done;
+ }
+
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&rctx, cstate, false);
+ if (!rctx)
+ return -EINPROGRESS;
+done:
+ sha_complete_job(rctx, cstate, ret);
+ return ret;
+}
+
+static int sha512_mb_final(struct ahash_request *areq)
+{
+ struct mcryptd_hash_request_ctx *rctx =
+ container_of(areq, struct mcryptd_hash_request_ctx,
+ areq);
+ struct mcryptd_alg_cstate *cstate =
+ this_cpu_ptr(sha512_mb_alg_state.alg_cstate);
+
+ struct sha512_hash_ctx *sha_ctx;
+ int ret = 0;
+ u8 data;
+
+ /* sanity check */
+ if (rctx->tag.cpu != smp_processor_id()) {
+ pr_err("mcryptd error: cpu clash\n");
+ goto done;
+ }
+
+ /* need to init context */
+ req_ctx_init(rctx, areq);
+
+ rctx->flag |= HASH_DONE | HASH_FINAL;
+
+ sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq);
+ /* flag HASH_FINAL and 0 data size */
+ sha512_mb_add_list(rctx, cstate);
+ kernel_fpu_begin();
+ sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, &data, 0, HASH_LAST);
+ kernel_fpu_end();
+
+ /* check if anything is returned */
+ if (!sha_ctx)
+ return -EINPROGRESS;
+
+ if (sha_ctx->error) {
+ ret = sha_ctx->error;
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ goto done;
+ }
+
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&rctx, cstate, false);
+ if (!rctx)
+ return -EINPROGRESS;
+done:
+ sha_complete_job(rctx, cstate, ret);
+ return ret;
+}
+
+static int sha512_mb_export(struct ahash_request *areq, void *out)
+{
+ struct sha512_hash_ctx *sctx = ahash_request_ctx(areq);
+
+ memcpy(out, sctx, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha512_mb_import(struct ahash_request *areq, const void *in)
+{
+ struct sha512_hash_ctx *sctx = ahash_request_ctx(areq);
+
+ memcpy(sctx, in, sizeof(*sctx));
+
+ return 0;
+}
+
+static int sha512_mb_async_init_tfm(struct crypto_tfm *tfm)
+{
+ struct mcryptd_ahash *mcryptd_tfm;
+ struct sha512_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+ struct mcryptd_hash_ctx *mctx;
+
+ mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha512-mb",
+ CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL);
+ if (IS_ERR(mcryptd_tfm))
+ return PTR_ERR(mcryptd_tfm);
+ mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
+ mctx->alg_state = &sha512_mb_alg_state;
+ ctx->mcryptd_tfm = mcryptd_tfm;
+ crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+ sizeof(struct ahash_request) +
+ crypto_ahash_reqsize(&mcryptd_tfm->base));
+
+ return 0;
+}
+
+static void sha512_mb_async_exit_tfm(struct crypto_tfm *tfm)
+{
+ struct sha512_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static int sha512_mb_areq_init_tfm(struct crypto_tfm *tfm)
+{
+ crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+ sizeof(struct ahash_request) +
+ sizeof(struct sha512_hash_ctx));
+
+ return 0;
+}
+
+static void sha512_mb_areq_exit_tfm(struct crypto_tfm *tfm)
+{
+ struct sha512_mb_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ mcryptd_free_ahash(ctx->mcryptd_tfm);
+}
+
+static struct ahash_alg sha512_mb_areq_alg = {
+ .init = sha512_mb_init,
+ .update = sha512_mb_update,
+ .final = sha512_mb_final,
+ .finup = sha512_mb_finup,
+ .export = sha512_mb_export,
+ .import = sha512_mb_import,
+ .halg = {
+ .digestsize = SHA512_DIGEST_SIZE,
+ .statesize = sizeof(struct sha512_hash_ctx),
+ .base = {
+ .cra_name = "__sha512-mb",
+ .cra_driver_name = "__intel_sha512-mb",
+ .cra_priority = 100,
+ /*
+ * use ASYNC flag as some buffers in multi-buffer
+ * algo may not have completed before hashing thread
+ * sleep
+ */
+ .cra_flags = CRYPTO_ALG_ASYNC |
+ CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = SHA512_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT
+ (sha512_mb_areq_alg.halg.base.cra_list),
+ .cra_init = sha512_mb_areq_init_tfm,
+ .cra_exit = sha512_mb_areq_exit_tfm,
+ .cra_ctxsize = sizeof(struct sha512_hash_ctx),
+ }
+ }
+};
+
+static int sha512_mb_async_init(struct ahash_request *req)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_init(mcryptd_req);
+}
+
+static int sha512_mb_async_update(struct ahash_request *req)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_update(mcryptd_req);
+}
+
+static int sha512_mb_async_finup(struct ahash_request *req)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_finup(mcryptd_req);
+}
+
+static int sha512_mb_async_final(struct ahash_request *req)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_final(mcryptd_req);
+}
+
+static int sha512_mb_async_digest(struct ahash_request *req)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_digest(mcryptd_req);
+}
+
+static int sha512_mb_async_export(struct ahash_request *req, void *out)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_export(mcryptd_req, out);
+}
+
+static int sha512_mb_async_import(struct ahash_request *req, const void *in)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+ struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm);
+ struct mcryptd_hash_request_ctx *rctx;
+ struct ahash_request *areq;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ rctx = ahash_request_ctx(mcryptd_req);
+
+ areq = &rctx->areq;
+
+ ahash_request_set_tfm(areq, child);
+ ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP,
+ rctx->complete, req);
+
+ return crypto_ahash_import(mcryptd_req, in);
+}
+
+static struct ahash_alg sha512_mb_async_alg = {
+ .init = sha512_mb_async_init,
+ .update = sha512_mb_async_update,
+ .final = sha512_mb_async_final,
+ .finup = sha512_mb_async_finup,
+ .digest = sha512_mb_async_digest,
+ .export = sha512_mb_async_export,
+ .import = sha512_mb_async_import,
+ .halg = {
+ .digestsize = SHA512_DIGEST_SIZE,
+ .statesize = sizeof(struct sha512_hash_ctx),
+ .base = {
+ .cra_name = "sha512",
+ .cra_driver_name = "sha512_mb",
+ /*
+ * Low priority, since with few concurrent hash requests
+ * this is extremely slow due to the flush delay. Users
+ * whose workloads would benefit from this can request
+ * it explicitly by driver name, or can increase its
+ * priority at runtime using NETLINK_CRYPTO.
+ */
+ .cra_priority = 50,
+ .cra_flags = CRYPTO_ALG_ASYNC,
+ .cra_blocksize = SHA512_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT
+ (sha512_mb_async_alg.halg.base.cra_list),
+ .cra_init = sha512_mb_async_init_tfm,
+ .cra_exit = sha512_mb_async_exit_tfm,
+ .cra_ctxsize = sizeof(struct sha512_mb_ctx),
+ .cra_alignmask = 0,
+ },
+ },
+};
+
+static unsigned long sha512_mb_flusher(struct mcryptd_alg_cstate *cstate)
+{
+ struct mcryptd_hash_request_ctx *rctx;
+ unsigned long cur_time;
+ unsigned long next_flush = 0;
+ struct sha512_hash_ctx *sha_ctx;
+
+
+ cur_time = jiffies;
+
+ while (!list_empty(&cstate->work_list)) {
+ rctx = list_entry(cstate->work_list.next,
+ struct mcryptd_hash_request_ctx, waiter);
+ if time_before(cur_time, rctx->tag.expire)
+ break;
+ kernel_fpu_begin();
+ sha_ctx = (struct sha512_hash_ctx *)
+ sha512_ctx_mgr_flush(cstate);
+ kernel_fpu_end();
+ if (!sha_ctx) {
+ pr_err("sha512_mb error: nothing got flushed for"
+ " non-empty list\n");
+ break;
+ }
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ sha_finish_walk(&rctx, cstate, true);
+ sha_complete_job(rctx, cstate, 0);
+ }
+
+ if (!list_empty(&cstate->work_list)) {
+ rctx = list_entry(cstate->work_list.next,
+ struct mcryptd_hash_request_ctx, waiter);
+ /* get the hash context and then flush time */
+ next_flush = rctx->tag.expire;
+ mcryptd_arm_flusher(cstate, get_delay(next_flush));
+ }
+ return next_flush;
+}
+
+static int __init sha512_mb_mod_init(void)
+{
+
+ int cpu;
+ int err;
+ struct mcryptd_alg_cstate *cpu_state;
+
+ /* check for dependent cpu features */
+ if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_BMI2))
+ return -ENODEV;
+
+ /* initialize multibuffer structures */
+ sha512_mb_alg_state.alg_cstate =
+ alloc_percpu(struct mcryptd_alg_cstate);
+
+ sha512_job_mgr_init = sha512_mb_mgr_init_avx2;
+ sha512_job_mgr_submit = sha512_mb_mgr_submit_avx2;
+ sha512_job_mgr_flush = sha512_mb_mgr_flush_avx2;
+ sha512_job_mgr_get_comp_job = sha512_mb_mgr_get_comp_job_avx2;
+
+ if (!sha512_mb_alg_state.alg_cstate)
+ return -ENOMEM;
+ for_each_possible_cpu(cpu) {
+ cpu_state = per_cpu_ptr(sha512_mb_alg_state.alg_cstate, cpu);
+ cpu_state->next_flush = 0;
+ cpu_state->next_seq_num = 0;
+ cpu_state->flusher_engaged = false;
+ INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher);
+ cpu_state->cpu = cpu;
+ cpu_state->alg_state = &sha512_mb_alg_state;
+ cpu_state->mgr = kzalloc(sizeof(struct sha512_ctx_mgr),
+ GFP_KERNEL);
+ if (!cpu_state->mgr)
+ goto err2;
+ sha512_ctx_mgr_init(cpu_state->mgr);
+ INIT_LIST_HEAD(&cpu_state->work_list);
+ spin_lock_init(&cpu_state->work_lock);
+ }
+ sha512_mb_alg_state.flusher = &sha512_mb_flusher;
+
+ err = crypto_register_ahash(&sha512_mb_areq_alg);
+ if (err)
+ goto err2;
+ err = crypto_register_ahash(&sha512_mb_async_alg);
+ if (err)
+ goto err1;
+
+
+ return 0;
+err1:
+ crypto_unregister_ahash(&sha512_mb_areq_alg);
+err2:
+ for_each_possible_cpu(cpu) {
+ cpu_state = per_cpu_ptr(sha512_mb_alg_state.alg_cstate, cpu);
+ kfree(cpu_state->mgr);
+ }
+ free_percpu(sha512_mb_alg_state.alg_cstate);
+ return -ENODEV;
+}
+
+static void __exit sha512_mb_mod_fini(void)
+{
+ int cpu;
+ struct mcryptd_alg_cstate *cpu_state;
+
+ crypto_unregister_ahash(&sha512_mb_async_alg);
+ crypto_unregister_ahash(&sha512_mb_areq_alg);
+ for_each_possible_cpu(cpu) {
+ cpu_state = per_cpu_ptr(sha512_mb_alg_state.alg_cstate, cpu);
+ kfree(cpu_state->mgr);
+ }
+ free_percpu(sha512_mb_alg_state.alg_cstate);
+}
+
+module_init(sha512_mb_mod_init);
+module_exit(sha512_mb_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, multi buffer accelerated");
+
+MODULE_ALIAS("sha512");
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h
new file mode 100644
index 000000000..e5c465bd8
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h
@@ -0,0 +1,128 @@
+/*
+ * Header file for multi buffer SHA512 context
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _SHA_MB_CTX_INTERNAL_H
+#define _SHA_MB_CTX_INTERNAL_H
+
+#include "sha512_mb_mgr.h"
+
+#define HASH_UPDATE 0x00
+#define HASH_LAST 0x01
+#define HASH_DONE 0x02
+#define HASH_FINAL 0x04
+
+#define HASH_CTX_STS_IDLE 0x00
+#define HASH_CTX_STS_PROCESSING 0x01
+#define HASH_CTX_STS_LAST 0x02
+#define HASH_CTX_STS_COMPLETE 0x04
+
+enum hash_ctx_error {
+ HASH_CTX_ERROR_NONE = 0,
+ HASH_CTX_ERROR_INVALID_FLAGS = -1,
+ HASH_CTX_ERROR_ALREADY_PROCESSING = -2,
+ HASH_CTX_ERROR_ALREADY_COMPLETED = -3,
+};
+
+#define hash_ctx_user_data(ctx) ((ctx)->user_data)
+#define hash_ctx_digest(ctx) ((ctx)->job.result_digest)
+#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING)
+#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE)
+#define hash_ctx_status(ctx) ((ctx)->status)
+#define hash_ctx_error(ctx) ((ctx)->error)
+#define hash_ctx_init(ctx) \
+ do { \
+ (ctx)->error = HASH_CTX_ERROR_NONE; \
+ (ctx)->status = HASH_CTX_STS_COMPLETE; \
+ } while (0)
+
+/* Hash Constants and Typedefs */
+#define SHA512_DIGEST_LENGTH 8
+#define SHA512_LOG2_BLOCK_SIZE 7
+
+#define SHA512_PADLENGTHFIELD_SIZE 16
+
+#ifdef SHA_MB_DEBUG
+#define assert(expr) \
+do { \
+ if (unlikely(!(expr))) { \
+ printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
+ #expr, __FILE__, __func__, __LINE__); \
+ } \
+} while (0)
+#else
+#define assert(expr) do {} while (0)
+#endif
+
+struct sha512_ctx_mgr {
+ struct sha512_mb_mgr mgr;
+};
+
+/* typedef struct sha512_ctx_mgr sha512_ctx_mgr; */
+
+struct sha512_hash_ctx {
+ /* Must be at struct offset 0 */
+ struct job_sha512 job;
+ /* status flag */
+ int status;
+ /* error flag */
+ int error;
+
+ uint64_t total_length;
+ const void *incoming_buffer;
+ uint32_t incoming_buffer_length;
+ uint8_t partial_block_buffer[SHA512_BLOCK_SIZE * 2];
+ uint32_t partial_block_buffer_length;
+ void *user_data;
+};
+
+#endif
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr.h b/arch/x86/crypto/sha512-mb/sha512_mb_mgr.h
new file mode 100644
index 000000000..178f17eef
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr.h
@@ -0,0 +1,104 @@
+/*
+ * Header file for multi buffer SHA512 algorithm manager
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __SHA_MB_MGR_H
+#define __SHA_MB_MGR_H
+
+#include <linux/types.h>
+
+#define NUM_SHA512_DIGEST_WORDS 8
+
+enum job_sts {STS_UNKNOWN = 0,
+ STS_BEING_PROCESSED = 1,
+ STS_COMPLETED = 2,
+ STS_INTERNAL_ERROR = 3,
+ STS_ERROR = 4
+};
+
+struct job_sha512 {
+ u8 *buffer;
+ u64 len;
+ u64 result_digest[NUM_SHA512_DIGEST_WORDS] __aligned(32);
+ enum job_sts status;
+ void *user_data;
+};
+
+struct sha512_args_x4 {
+ uint64_t digest[8][4];
+ uint8_t *data_ptr[4];
+};
+
+struct sha512_lane_data {
+ struct job_sha512 *job_in_lane;
+};
+
+struct sha512_mb_mgr {
+ struct sha512_args_x4 args;
+
+ uint64_t lens[4];
+
+ /* each byte is index (0...7) of unused lanes */
+ uint64_t unused_lanes;
+ /* byte 4 is set to FF as a flag */
+ struct sha512_lane_data ldata[4];
+};
+
+#define SHA512_MB_MGR_NUM_LANES_AVX2 4
+
+void sha512_mb_mgr_init_avx2(struct sha512_mb_mgr *state);
+struct job_sha512 *sha512_mb_mgr_submit_avx2(struct sha512_mb_mgr *state,
+ struct job_sha512 *job);
+struct job_sha512 *sha512_mb_mgr_flush_avx2(struct sha512_mb_mgr *state);
+struct job_sha512 *sha512_mb_mgr_get_comp_job_avx2(struct sha512_mb_mgr *state);
+
+#endif
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_datastruct.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_datastruct.S
new file mode 100644
index 000000000..cf2636d4c
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_datastruct.S
@@ -0,0 +1,281 @@
+/*
+ * Header file for multi buffer SHA256 algorithm data structure
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+# Macros for defining data structures
+
+# Usage example
+
+#START_FIELDS # JOB_AES
+### name size align
+#FIELD _plaintext, 8, 8 # pointer to plaintext
+#FIELD _ciphertext, 8, 8 # pointer to ciphertext
+#FIELD _IV, 16, 8 # IV
+#FIELD _keys, 8, 8 # pointer to keys
+#FIELD _len, 4, 4 # length in bytes
+#FIELD _status, 4, 4 # status enumeration
+#FIELD _user_data, 8, 8 # pointer to user data
+#UNION _union, size1, align1, \
+# size2, align2, \
+# size3, align3, \
+# ...
+#END_FIELDS
+#%assign _JOB_AES_size _FIELD_OFFSET
+#%assign _JOB_AES_align _STRUCT_ALIGN
+
+#########################################################################
+
+# Alternate "struc-like" syntax:
+# STRUCT job_aes2
+# RES_Q .plaintext, 1
+# RES_Q .ciphertext, 1
+# RES_DQ .IV, 1
+# RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN
+# RES_U .union, size1, align1, \
+# size2, align2, \
+# ...
+# ENDSTRUCT
+# # Following only needed if nesting
+# %assign job_aes2_size _FIELD_OFFSET
+# %assign job_aes2_align _STRUCT_ALIGN
+#
+# RES_* macros take a name, a count and an optional alignment.
+# The count in in terms of the base size of the macro, and the
+# default alignment is the base size.
+# The macros are:
+# Macro Base size
+# RES_B 1
+# RES_W 2
+# RES_D 4
+# RES_Q 8
+# RES_DQ 16
+# RES_Y 32
+# RES_Z 64
+#
+# RES_U defines a union. It's arguments are a name and two or more
+# pairs of "size, alignment"
+#
+# The two assigns are only needed if this structure is being nested
+# within another. Even if the assigns are not done, one can still use
+# STRUCT_NAME_size as the size of the structure.
+#
+# Note that for nesting, you still need to assign to STRUCT_NAME_size.
+#
+# The differences between this and using "struc" directly are that each
+# type is implicitly aligned to its natural length (although this can be
+# over-ridden with an explicit third parameter), and that the structure
+# is padded at the end to its overall alignment.
+#
+
+#########################################################################
+
+#ifndef _DATASTRUCT_ASM_
+#define _DATASTRUCT_ASM_
+
+#define PTR_SZ 8
+#define SHA512_DIGEST_WORD_SIZE 8
+#define SHA512_MB_MGR_NUM_LANES_AVX2 4
+#define NUM_SHA512_DIGEST_WORDS 8
+#define SZ4 4*SHA512_DIGEST_WORD_SIZE
+#define ROUNDS 80*SZ4
+#define SHA512_DIGEST_ROW_SIZE (SHA512_MB_MGR_NUM_LANES_AVX2 * 8)
+
+# START_FIELDS
+.macro START_FIELDS
+ _FIELD_OFFSET = 0
+ _STRUCT_ALIGN = 0
+.endm
+
+# FIELD name size align
+.macro FIELD name size align
+ _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1))
+ \name = _FIELD_OFFSET
+ _FIELD_OFFSET = _FIELD_OFFSET + (\size)
+.if (\align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = \align
+.endif
+.endm
+
+# END_FIELDS
+.macro END_FIELDS
+ _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
+.endm
+
+.macro STRUCT p1
+START_FIELDS
+.struc \p1
+.endm
+
+.macro ENDSTRUCT
+ tmp = _FIELD_OFFSET
+ END_FIELDS
+ tmp = (_FIELD_OFFSET - ##tmp)
+.if (tmp > 0)
+ .lcomm tmp
+.endm
+
+## RES_int name size align
+.macro RES_int p1 p2 p3
+ name = \p1
+ size = \p2
+ align = .\p3
+
+ _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1))
+.align align
+.lcomm name size
+ _FIELD_OFFSET = _FIELD_OFFSET + (size)
+.if (align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = align
+.endif
+.endm
+
+# macro RES_B name, size [, align]
+.macro RES_B _name, _size, _align=1
+RES_int _name _size _align
+.endm
+
+# macro RES_W name, size [, align]
+.macro RES_W _name, _size, _align=2
+RES_int _name 2*(_size) _align
+.endm
+
+# macro RES_D name, size [, align]
+.macro RES_D _name, _size, _align=4
+RES_int _name 4*(_size) _align
+.endm
+
+# macro RES_Q name, size [, align]
+.macro RES_Q _name, _size, _align=8
+RES_int _name 8*(_size) _align
+.endm
+
+# macro RES_DQ name, size [, align]
+.macro RES_DQ _name, _size, _align=16
+RES_int _name 16*(_size) _align
+.endm
+
+# macro RES_Y name, size [, align]
+.macro RES_Y _name, _size, _align=32
+RES_int _name 32*(_size) _align
+.endm
+
+# macro RES_Z name, size [, align]
+.macro RES_Z _name, _size, _align=64
+RES_int _name 64*(_size) _align
+.endm
+
+#endif
+
+###################################################################
+### Define SHA512 Out Of Order Data Structures
+###################################################################
+
+START_FIELDS # LANE_DATA
+### name size align
+FIELD _job_in_lane, 8, 8 # pointer to job object
+END_FIELDS
+
+ _LANE_DATA_size = _FIELD_OFFSET
+ _LANE_DATA_align = _STRUCT_ALIGN
+
+####################################################################
+
+START_FIELDS # SHA512_ARGS_X4
+### name size align
+FIELD _digest, 8*8*4, 4 # transposed digest
+FIELD _data_ptr, 8*4, 8 # array of pointers to data
+END_FIELDS
+
+ _SHA512_ARGS_X4_size = _FIELD_OFFSET
+ _SHA512_ARGS_X4_align = _STRUCT_ALIGN
+
+#####################################################################
+
+START_FIELDS # MB_MGR
+### name size align
+FIELD _args, _SHA512_ARGS_X4_size, _SHA512_ARGS_X4_align
+FIELD _lens, 8*4, 8
+FIELD _unused_lanes, 8, 8
+FIELD _ldata, _LANE_DATA_size*4, _LANE_DATA_align
+END_FIELDS
+
+ _MB_MGR_size = _FIELD_OFFSET
+ _MB_MGR_align = _STRUCT_ALIGN
+
+_args_digest = _args + _digest
+_args_data_ptr = _args + _data_ptr
+
+#######################################################################
+
+#######################################################################
+#### Define constants
+#######################################################################
+
+#define STS_UNKNOWN 0
+#define STS_BEING_PROCESSED 1
+#define STS_COMPLETED 2
+
+#######################################################################
+#### Define JOB_SHA512 structure
+#######################################################################
+
+START_FIELDS # JOB_SHA512
+### name size align
+FIELD _buffer, 8, 8 # pointer to buffer
+FIELD _len, 8, 8 # length in bytes
+FIELD _result_digest, 8*8, 32 # Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+END_FIELDS
+
+ _JOB_SHA512_size = _FIELD_OFFSET
+ _JOB_SHA512_align = _STRUCT_ALIGN
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S
new file mode 100644
index 000000000..7c629caeb
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S
@@ -0,0 +1,297 @@
+/*
+ * Flush routine for SHA512 multibuffer
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha512_mb_mgr_datastruct.S"
+
+.extern sha512_x4_avx2
+
+# LINUX register definitions
+#define arg1 %rdi
+#define arg2 %rsi
+
+# idx needs to be other than arg1, arg2, rbx, r12
+#define idx %rdx
+
+# Common definitions
+#define state arg1
+#define job arg2
+#define len2 arg2
+
+#define unused_lanes %rbx
+#define lane_data %rbx
+#define tmp2 %rbx
+
+#define job_rax %rax
+#define tmp1 %rax
+#define size_offset %rax
+#define tmp %rax
+#define start_offset %rax
+
+#define tmp3 arg1
+
+#define extra_blocks arg2
+#define p arg2
+
+#define tmp4 %r8
+#define lens0 %r8
+
+#define lens1 %r9
+#define lens2 %r10
+#define lens3 %r11
+
+.macro LABEL prefix n
+\prefix\n\():
+.endm
+
+.macro JNE_SKIP i
+jne skip_\i
+.endm
+
+.altmacro
+.macro SET_OFFSET _offset
+offset = \_offset
+.endm
+.noaltmacro
+
+# JOB* sha512_mb_mgr_flush_avx2(MB_MGR *state)
+# arg 1 : rcx : state
+ENTRY(sha512_mb_mgr_flush_avx2)
+ FRAME_BEGIN
+ push %rbx
+
+ # If bit (32+3) is set, then all lanes are empty
+ mov _unused_lanes(state), unused_lanes
+ bt $32+7, unused_lanes
+ jc return_null
+
+ # find a lane with a non-null job
+ xor idx, idx
+ offset = (_ldata + 1*_LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne one(%rip), idx
+ offset = (_ldata + 2*_LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne two(%rip), idx
+ offset = (_ldata + 3*_LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne three(%rip), idx
+
+ # copy idx to empty lanes
+copy_lane_data:
+ offset = (_args + _data_ptr)
+ mov offset(state,idx,8), tmp
+
+ I = 0
+.rep 4
+ offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+.altmacro
+ JNE_SKIP %I
+ offset = (_args + _data_ptr + 8*I)
+ mov tmp, offset(state)
+ offset = (_lens + 8*I +4)
+ movl $0xFFFFFFFF, offset(state)
+LABEL skip_ %I
+ I = (I+1)
+.noaltmacro
+.endr
+
+ # Find min length
+ mov _lens + 0*8(state),lens0
+ mov lens0,idx
+ mov _lens + 1*8(state),lens1
+ cmp idx,lens1
+ cmovb lens1,idx
+ mov _lens + 2*8(state),lens2
+ cmp idx,lens2
+ cmovb lens2,idx
+ mov _lens + 3*8(state),lens3
+ cmp idx,lens3
+ cmovb lens3,idx
+ mov idx,len2
+ and $0xF,idx
+ and $~0xFF,len2
+ jz len_is_0
+
+ sub len2, lens0
+ sub len2, lens1
+ sub len2, lens2
+ sub len2, lens3
+ shr $32,len2
+ mov lens0, _lens + 0*8(state)
+ mov lens1, _lens + 1*8(state)
+ mov lens2, _lens + 2*8(state)
+ mov lens3, _lens + 3*8(state)
+
+ # "state" and "args" are the same address, arg1
+ # len is arg2
+ call sha512_x4_avx2
+ # state and idx are intact
+
+len_is_0:
+ # process completed job "idx"
+ imul $_LANE_DATA_size, idx, lane_data
+ lea _ldata(state, lane_data), lane_data
+
+ mov _job_in_lane(lane_data), job_rax
+ movq $0, _job_in_lane(lane_data)
+ movl $STS_COMPLETED, _status(job_rax)
+ mov _unused_lanes(state), unused_lanes
+ shl $8, unused_lanes
+ or idx, unused_lanes
+ mov unused_lanes, _unused_lanes(state)
+
+ movl $0xFFFFFFFF, _lens+4(state, idx, 8)
+
+ vmovq _args_digest+0*32(state, idx, 8), %xmm0
+ vpinsrq $1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0
+ vmovq _args_digest+2*32(state, idx, 8), %xmm1
+ vpinsrq $1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1
+ vmovq _args_digest+4*32(state, idx, 8), %xmm2
+ vpinsrq $1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2
+ vmovq _args_digest+6*32(state, idx, 8), %xmm3
+ vpinsrq $1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3
+
+ vmovdqu %xmm0, _result_digest(job_rax)
+ vmovdqu %xmm1, _result_digest+1*16(job_rax)
+ vmovdqu %xmm2, _result_digest+2*16(job_rax)
+ vmovdqu %xmm3, _result_digest+3*16(job_rax)
+
+return:
+ pop %rbx
+ FRAME_END
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+ENDPROC(sha512_mb_mgr_flush_avx2)
+.align 16
+
+ENTRY(sha512_mb_mgr_get_comp_job_avx2)
+ push %rbx
+
+ mov _unused_lanes(state), unused_lanes
+ bt $(32+7), unused_lanes
+ jc .return_null
+
+ # Find min length
+ mov _lens(state),lens0
+ mov lens0,idx
+ mov _lens+1*8(state),lens1
+ cmp idx,lens1
+ cmovb lens1,idx
+ mov _lens+2*8(state),lens2
+ cmp idx,lens2
+ cmovb lens2,idx
+ mov _lens+3*8(state),lens3
+ cmp idx,lens3
+ cmovb lens3,idx
+ test $~0xF,idx
+ jnz .return_null
+ and $0xF,idx
+
+ #process completed job "idx"
+ imul $_LANE_DATA_size, idx, lane_data
+ lea _ldata(state, lane_data), lane_data
+
+ mov _job_in_lane(lane_data), job_rax
+ movq $0, _job_in_lane(lane_data)
+ movl $STS_COMPLETED, _status(job_rax)
+ mov _unused_lanes(state), unused_lanes
+ shl $8, unused_lanes
+ or idx, unused_lanes
+ mov unused_lanes, _unused_lanes(state)
+
+ movl $0xFFFFFFFF, _lens+4(state, idx, 8)
+
+ vmovq _args_digest(state, idx, 8), %xmm0
+ vpinsrq $1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0
+ vmovq _args_digest+2*32(state, idx, 8), %xmm1
+ vpinsrq $1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1
+ vmovq _args_digest+4*32(state, idx, 8), %xmm2
+ vpinsrq $1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2
+ vmovq _args_digest+6*32(state, idx, 8), %xmm3
+ vpinsrq $1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3
+
+ vmovdqu %xmm0, _result_digest+0*16(job_rax)
+ vmovdqu %xmm1, _result_digest+1*16(job_rax)
+ vmovdqu %xmm2, _result_digest+2*16(job_rax)
+ vmovdqu %xmm3, _result_digest+3*16(job_rax)
+
+ pop %rbx
+
+ ret
+
+.return_null:
+ xor job_rax, job_rax
+ pop %rbx
+ ret
+ENDPROC(sha512_mb_mgr_get_comp_job_avx2)
+
+.section .rodata.cst8.one, "aM", @progbits, 8
+.align 8
+one:
+.quad 1
+
+.section .rodata.cst8.two, "aM", @progbits, 8
+.align 8
+two:
+.quad 2
+
+.section .rodata.cst8.three, "aM", @progbits, 8
+.align 8
+three:
+.quad 3
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..d08805032
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c
@@ -0,0 +1,69 @@
+/*
+ * Initialization code for multi buffer SHA256 algorithm for AVX2
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "sha512_mb_mgr.h"
+
+void sha512_mb_mgr_init_avx2(struct sha512_mb_mgr *state)
+{
+ unsigned int j;
+
+ /* initially all lanes are unused */
+ state->lens[0] = 0xFFFFFFFF00000000;
+ state->lens[1] = 0xFFFFFFFF00000001;
+ state->lens[2] = 0xFFFFFFFF00000002;
+ state->lens[3] = 0xFFFFFFFF00000003;
+
+ state->unused_lanes = 0xFF03020100;
+ for (j = 0; j < 4; j++)
+ state->ldata[j].job_in_lane = NULL;
+}
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S
new file mode 100644
index 000000000..4ba709ba7
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S
@@ -0,0 +1,224 @@
+/*
+ * Buffer submit code for multi buffer SHA512 algorithm
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "sha512_mb_mgr_datastruct.S"
+
+.extern sha512_x4_avx2
+
+#define arg1 %rdi
+#define arg2 %rsi
+
+#define idx %rdx
+#define last_len %rdx
+
+#define size_offset %rcx
+#define tmp2 %rcx
+
+# Common definitions
+#define state arg1
+#define job arg2
+#define len2 arg2
+#define p2 arg2
+
+#define p %r11
+#define start_offset %r11
+
+#define unused_lanes %rbx
+
+#define job_rax %rax
+#define len %rax
+
+#define lane %r12
+#define tmp3 %r12
+#define lens3 %r12
+
+#define extra_blocks %r8
+#define lens0 %r8
+
+#define tmp %r9
+#define lens1 %r9
+
+#define lane_data %r10
+#define lens2 %r10
+
+#define DWORD_len %eax
+
+# JOB* sha512_mb_mgr_submit_avx2(MB_MGR *state, JOB *job)
+# arg 1 : rcx : state
+# arg 2 : rdx : job
+ENTRY(sha512_mb_mgr_submit_avx2)
+ FRAME_BEGIN
+ push %rbx
+ push %r12
+
+ mov _unused_lanes(state), unused_lanes
+ movzb %bl,lane
+ shr $8, unused_lanes
+ imul $_LANE_DATA_size, lane,lane_data
+ movl $STS_BEING_PROCESSED, _status(job)
+ lea _ldata(state, lane_data), lane_data
+ mov unused_lanes, _unused_lanes(state)
+ movl _len(job), DWORD_len
+
+ mov job, _job_in_lane(lane_data)
+ movl DWORD_len,_lens+4(state , lane, 8)
+
+ # Load digest words from result_digest
+ vmovdqu _result_digest+0*16(job), %xmm0
+ vmovdqu _result_digest+1*16(job), %xmm1
+ vmovdqu _result_digest+2*16(job), %xmm2
+ vmovdqu _result_digest+3*16(job), %xmm3
+
+ vmovq %xmm0, _args_digest(state, lane, 8)
+ vpextrq $1, %xmm0, _args_digest+1*32(state , lane, 8)
+ vmovq %xmm1, _args_digest+2*32(state , lane, 8)
+ vpextrq $1, %xmm1, _args_digest+3*32(state , lane, 8)
+ vmovq %xmm2, _args_digest+4*32(state , lane, 8)
+ vpextrq $1, %xmm2, _args_digest+5*32(state , lane, 8)
+ vmovq %xmm3, _args_digest+6*32(state , lane, 8)
+ vpextrq $1, %xmm3, _args_digest+7*32(state , lane, 8)
+
+ mov _buffer(job), p
+ mov p, _args_data_ptr(state, lane, 8)
+
+ cmp $0xFF, unused_lanes
+ jne return_null
+
+start_loop:
+
+ # Find min length
+ mov _lens+0*8(state),lens0
+ mov lens0,idx
+ mov _lens+1*8(state),lens1
+ cmp idx,lens1
+ cmovb lens1, idx
+ mov _lens+2*8(state),lens2
+ cmp idx,lens2
+ cmovb lens2,idx
+ mov _lens+3*8(state),lens3
+ cmp idx,lens3
+ cmovb lens3,idx
+ mov idx,len2
+ and $0xF,idx
+ and $~0xFF,len2
+ jz len_is_0
+
+ sub len2,lens0
+ sub len2,lens1
+ sub len2,lens2
+ sub len2,lens3
+ shr $32,len2
+ mov lens0, _lens + 0*8(state)
+ mov lens1, _lens + 1*8(state)
+ mov lens2, _lens + 2*8(state)
+ mov lens3, _lens + 3*8(state)
+
+ # "state" and "args" are the same address, arg1
+ # len is arg2
+ call sha512_x4_avx2
+ # state and idx are intact
+
+len_is_0:
+
+ # process completed job "idx"
+ imul $_LANE_DATA_size, idx, lane_data
+ lea _ldata(state, lane_data), lane_data
+
+ mov _job_in_lane(lane_data), job_rax
+ mov _unused_lanes(state), unused_lanes
+ movq $0, _job_in_lane(lane_data)
+ movl $STS_COMPLETED, _status(job_rax)
+ shl $8, unused_lanes
+ or idx, unused_lanes
+ mov unused_lanes, _unused_lanes(state)
+
+ movl $0xFFFFFFFF,_lens+4(state,idx,8)
+ vmovq _args_digest+0*32(state , idx, 8), %xmm0
+ vpinsrq $1, _args_digest+1*32(state , idx, 8), %xmm0, %xmm0
+ vmovq _args_digest+2*32(state , idx, 8), %xmm1
+ vpinsrq $1, _args_digest+3*32(state , idx, 8), %xmm1, %xmm1
+ vmovq _args_digest+4*32(state , idx, 8), %xmm2
+ vpinsrq $1, _args_digest+5*32(state , idx, 8), %xmm2, %xmm2
+ vmovq _args_digest+6*32(state , idx, 8), %xmm3
+ vpinsrq $1, _args_digest+7*32(state , idx, 8), %xmm3, %xmm3
+
+ vmovdqu %xmm0, _result_digest + 0*16(job_rax)
+ vmovdqu %xmm1, _result_digest + 1*16(job_rax)
+ vmovdqu %xmm2, _result_digest + 2*16(job_rax)
+ vmovdqu %xmm3, _result_digest + 3*16(job_rax)
+
+return:
+ pop %r12
+ pop %rbx
+ FRAME_END
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+ENDPROC(sha512_mb_mgr_submit_avx2)
+
+/* UNUSED?
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+H0: .int 0x6a09e667
+H1: .int 0xbb67ae85
+H2: .int 0x3c6ef372
+H3: .int 0xa54ff53a
+H4: .int 0x510e527f
+H5: .int 0x9b05688c
+H6: .int 0x1f83d9ab
+H7: .int 0x5be0cd19
+*/
diff --git a/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S b/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S
new file mode 100644
index 000000000..e22e90764
--- /dev/null
+++ b/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S
@@ -0,0 +1,531 @@
+/*
+ * Multi-buffer SHA512 algorithm hash compute routine
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Megha Dey <megha.dey@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2016 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+# code to compute quad SHA512 using AVX2
+# use YMMs to tackle the larger digest size
+# outer calling routine takes care of save and restore of XMM registers
+# Logic designed/laid out by JDG
+
+# Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+# Stack must be aligned to 32 bytes before call
+# Linux clobbers: rax rbx rcx rsi r8 r9 r10 r11 r12
+# Linux preserves: rcx rdx rdi rbp r13 r14 r15
+# clobbers ymm0-15
+
+#include <linux/linkage.h>
+#include "sha512_mb_mgr_datastruct.S"
+
+arg1 = %rdi
+arg2 = %rsi
+
+# Common definitions
+STATE = arg1
+INP_SIZE = arg2
+
+IDX = %rax
+ROUND = %rbx
+TBL = %r8
+
+inp0 = %r9
+inp1 = %r10
+inp2 = %r11
+inp3 = %r12
+
+a = %ymm0
+b = %ymm1
+c = %ymm2
+d = %ymm3
+e = %ymm4
+f = %ymm5
+g = %ymm6
+h = %ymm7
+
+a0 = %ymm8
+a1 = %ymm9
+a2 = %ymm10
+
+TT0 = %ymm14
+TT1 = %ymm13
+TT2 = %ymm12
+TT3 = %ymm11
+TT4 = %ymm10
+TT5 = %ymm9
+
+T1 = %ymm14
+TMP = %ymm15
+
+# Define stack usage
+STACK_SPACE1 = SZ4*16 + NUM_SHA512_DIGEST_WORDS*SZ4 + 24
+
+#define VMOVPD vmovupd
+_digest = SZ4*16
+
+# transpose r0, r1, r2, r3, t0, t1
+# "transpose" data in {r0..r3} using temps {t0..t3}
+# Input looks like: {r0 r1 r2 r3}
+# r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+# r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+# r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+# r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+#
+# output looks like: {t0 r1 r0 r3}
+# t0 = {d1 d0 c1 c0 b1 b0 a1 a0}
+# r1 = {d3 d2 c3 c2 b3 b2 a3 a2}
+# r0 = {d5 d4 c5 c4 b5 b4 a5 a4}
+# r3 = {d7 d6 c7 c6 b7 b6 a7 a6}
+
+.macro TRANSPOSE r0 r1 r2 r3 t0 t1
+ vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vperm2f128 $0x20, \r2, \r0, \r1 # h6...a6
+ vperm2f128 $0x31, \r2, \r0, \r3 # h2...a2
+ vperm2f128 $0x31, \t1, \t0, \r0 # h5...a5
+ vperm2f128 $0x20, \t1, \t0, \t0 # h1...a1
+.endm
+
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+# PRORQ reg, imm, tmp
+# packed-rotate-right-double
+# does a rotate by doing two shifts and an or
+.macro _PRORQ reg imm tmp
+ vpsllq $(64-\imm),\reg,\tmp
+ vpsrlq $\imm,\reg, \reg
+ vpor \tmp,\reg, \reg
+.endm
+
+# non-destructive
+# PRORQ_nd reg, imm, tmp, src
+.macro _PRORQ_nd reg imm tmp src
+ vpsllq $(64-\imm), \src, \tmp
+ vpsrlq $\imm, \src, \reg
+ vpor \tmp, \reg, \reg
+.endm
+
+# PRORQ dst/src, amt
+.macro PRORQ reg imm
+ _PRORQ \reg, \imm, TMP
+.endm
+
+# PRORQ_nd dst, src, amt
+.macro PRORQ_nd reg tmp imm
+ _PRORQ_nd \reg, \imm, TMP, \tmp
+.endm
+
+#; arguments passed implicitly in preprocessor symbols i, a...h
+.macro ROUND_00_15 _T1 i
+ PRORQ_nd a0, e, (18-14) # sig1: a0 = (e >> 4)
+
+ vpxor g, f, a2 # ch: a2 = f^g
+ vpand e,a2, a2 # ch: a2 = (f^g)&e
+ vpxor g, a2, a2 # a2 = ch
+
+ PRORQ_nd a1,e,41 # sig1: a1 = (e >> 25)
+
+ offset = SZ4*(\i & 0xf)
+ vmovdqu \_T1,offset(%rsp)
+ vpaddq (TBL,ROUND,1), \_T1, \_T1 # T1 = W + K
+ vpxor e,a0, a0 # sig1: a0 = e ^ (e >> 5)
+ PRORQ a0, 14 # sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddq a2, h, h # h = h + ch
+ PRORQ_nd a2,a,6 # sig0: a2 = (a >> 11)
+ vpaddq \_T1,h, h # h = h + ch + W + K
+ vpxor a1, a0, a0 # a0 = sigma1
+ vmovdqu a,\_T1
+ PRORQ_nd a1,a,39 # sig0: a1 = (a >> 22)
+ vpxor c, \_T1, \_T1 # maj: T1 = a^c
+ add $SZ4, ROUND # ROUND++
+ vpand b, \_T1, \_T1 # maj: T1 = (a^c)&b
+ vpaddq a0, h, h
+ vpaddq h, d, d
+ vpxor a, a2, a2 # sig0: a2 = a ^ (a >> 11)
+ PRORQ a2,28 # sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a1, a2, a2 # a2 = sig0
+ vpand c, a, a1 # maj: a1 = a&c
+ vpor \_T1, a1, a1 # a1 = maj
+ vpaddq a1, h, h # h = h + ch + W + K + maj
+ vpaddq a2, h, h # h = h + ch + W + K + maj + sigma0
+ ROTATE_ARGS
+.endm
+
+
+#; arguments passed implicitly in preprocessor symbols i, a...h
+.macro ROUND_16_XX _T1 i
+ vmovdqu SZ4*((\i-15)&0xf)(%rsp), \_T1
+ vmovdqu SZ4*((\i-2)&0xf)(%rsp), a1
+ vmovdqu \_T1, a0
+ PRORQ \_T1,7
+ vmovdqu a1, a2
+ PRORQ a1,42
+ vpxor a0, \_T1, \_T1
+ PRORQ \_T1, 1
+ vpxor a2, a1, a1
+ PRORQ a1, 19
+ vpsrlq $7, a0, a0
+ vpxor a0, \_T1, \_T1
+ vpsrlq $6, a2, a2
+ vpxor a2, a1, a1
+ vpaddq SZ4*((\i-16)&0xf)(%rsp), \_T1, \_T1
+ vpaddq SZ4*((\i-7)&0xf)(%rsp), a1, a1
+ vpaddq a1, \_T1, \_T1
+
+ ROUND_00_15 \_T1,\i
+.endm
+
+
+# void sha512_x4_avx2(void *STATE, const int INP_SIZE)
+# arg 1 : STATE : pointer to input data
+# arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+ENTRY(sha512_x4_avx2)
+ # general registers preserved in outer calling routine
+ # outer calling routine saves all the XMM registers
+ # save callee-saved clobbered registers to comply with C function ABI
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ sub $STACK_SPACE1, %rsp
+
+ # Load the pre-transposed incoming digest.
+ vmovdqu 0*SHA512_DIGEST_ROW_SIZE(STATE),a
+ vmovdqu 1*SHA512_DIGEST_ROW_SIZE(STATE),b
+ vmovdqu 2*SHA512_DIGEST_ROW_SIZE(STATE),c
+ vmovdqu 3*SHA512_DIGEST_ROW_SIZE(STATE),d
+ vmovdqu 4*SHA512_DIGEST_ROW_SIZE(STATE),e
+ vmovdqu 5*SHA512_DIGEST_ROW_SIZE(STATE),f
+ vmovdqu 6*SHA512_DIGEST_ROW_SIZE(STATE),g
+ vmovdqu 7*SHA512_DIGEST_ROW_SIZE(STATE),h
+
+ lea K512_4(%rip),TBL
+
+ # load the address of each of the 4 message lanes
+ # getting ready to transpose input onto stack
+ mov _data_ptr+0*PTR_SZ(STATE),inp0
+ mov _data_ptr+1*PTR_SZ(STATE),inp1
+ mov _data_ptr+2*PTR_SZ(STATE),inp2
+ mov _data_ptr+3*PTR_SZ(STATE),inp3
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ # save old digest
+ vmovdqu a, _digest(%rsp)
+ vmovdqu b, _digest+1*SZ4(%rsp)
+ vmovdqu c, _digest+2*SZ4(%rsp)
+ vmovdqu d, _digest+3*SZ4(%rsp)
+ vmovdqu e, _digest+4*SZ4(%rsp)
+ vmovdqu f, _digest+5*SZ4(%rsp)
+ vmovdqu g, _digest+6*SZ4(%rsp)
+ vmovdqu h, _digest+7*SZ4(%rsp)
+ i = 0
+.rep 4
+ vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), TMP
+ VMOVPD i*32(inp0, IDX), TT2
+ VMOVPD i*32(inp1, IDX), TT1
+ VMOVPD i*32(inp2, IDX), TT4
+ VMOVPD i*32(inp3, IDX), TT3
+ TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
+ vpshufb TMP, TT0, TT0
+ vpshufb TMP, TT1, TT1
+ vpshufb TMP, TT2, TT2
+ vpshufb TMP, TT3, TT3
+ ROUND_00_15 TT0,(i*4+0)
+ ROUND_00_15 TT1,(i*4+1)
+ ROUND_00_15 TT2,(i*4+2)
+ ROUND_00_15 TT3,(i*4+3)
+ i = (i+1)
+.endr
+ add $128, IDX
+
+ i = (i*4)
+
+ jmp Lrounds_16_xx
+.align 16
+Lrounds_16_xx:
+.rep 16
+ ROUND_16_XX T1, i
+ i = (i+1)
+.endr
+ cmp $0xa00,ROUND
+ jb Lrounds_16_xx
+
+ # add old digest
+ vpaddq _digest(%rsp), a, a
+ vpaddq _digest+1*SZ4(%rsp), b, b
+ vpaddq _digest+2*SZ4(%rsp), c, c
+ vpaddq _digest+3*SZ4(%rsp), d, d
+ vpaddq _digest+4*SZ4(%rsp), e, e
+ vpaddq _digest+5*SZ4(%rsp), f, f
+ vpaddq _digest+6*SZ4(%rsp), g, g
+ vpaddq _digest+7*SZ4(%rsp), h, h
+
+ sub $1, INP_SIZE # unit is blocks
+ jne lloop
+
+ # write back to memory (state object) the transposed digest
+ vmovdqu a, 0*SHA512_DIGEST_ROW_SIZE(STATE)
+ vmovdqu b, 1*SHA512_DIGEST_ROW_SIZE(STATE)
+ vmovdqu c, 2*SHA512_DIGEST_ROW_SIZE(STATE)
+ vmovdqu d, 3*SHA512_DIGEST_ROW_SIZE(STATE)
+ vmovdqu e, 4*SHA512_DIGEST_ROW_SIZE(STATE)
+ vmovdqu f, 5*SHA512_DIGEST_ROW_SIZE(STATE)
+ vmovdqu g, 6*SHA512_DIGEST_ROW_SIZE(STATE)
+ vmovdqu h, 7*SHA512_DIGEST_ROW_SIZE(STATE)
+
+ # update input data pointers
+ add IDX, inp0
+ mov inp0, _data_ptr+0*PTR_SZ(STATE)
+ add IDX, inp1
+ mov inp1, _data_ptr+1*PTR_SZ(STATE)
+ add IDX, inp2
+ mov inp2, _data_ptr+2*PTR_SZ(STATE)
+ add IDX, inp3
+ mov inp3, _data_ptr+3*PTR_SZ(STATE)
+
+ #;;;;;;;;;;;;;;;
+ #; Postamble
+ add $STACK_SPACE1, %rsp
+ # restore callee-saved clobbered registers
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+
+ # outer calling routine restores XMM and other GP registers
+ ret
+ENDPROC(sha512_x4_avx2)
+
+.section .rodata.K512_4, "a", @progbits
+.align 64
+K512_4:
+ .octa 0x428a2f98d728ae22428a2f98d728ae22,\
+ 0x428a2f98d728ae22428a2f98d728ae22
+ .octa 0x7137449123ef65cd7137449123ef65cd,\
+ 0x7137449123ef65cd7137449123ef65cd
+ .octa 0xb5c0fbcfec4d3b2fb5c0fbcfec4d3b2f,\
+ 0xb5c0fbcfec4d3b2fb5c0fbcfec4d3b2f
+ .octa 0xe9b5dba58189dbbce9b5dba58189dbbc,\
+ 0xe9b5dba58189dbbce9b5dba58189dbbc
+ .octa 0x3956c25bf348b5383956c25bf348b538,\
+ 0x3956c25bf348b5383956c25bf348b538
+ .octa 0x59f111f1b605d01959f111f1b605d019,\
+ 0x59f111f1b605d01959f111f1b605d019
+ .octa 0x923f82a4af194f9b923f82a4af194f9b,\
+ 0x923f82a4af194f9b923f82a4af194f9b
+ .octa 0xab1c5ed5da6d8118ab1c5ed5da6d8118,\
+ 0xab1c5ed5da6d8118ab1c5ed5da6d8118
+ .octa 0xd807aa98a3030242d807aa98a3030242,\
+ 0xd807aa98a3030242d807aa98a3030242
+ .octa 0x12835b0145706fbe12835b0145706fbe,\
+ 0x12835b0145706fbe12835b0145706fbe
+ .octa 0x243185be4ee4b28c243185be4ee4b28c,\
+ 0x243185be4ee4b28c243185be4ee4b28c
+ .octa 0x550c7dc3d5ffb4e2550c7dc3d5ffb4e2,\
+ 0x550c7dc3d5ffb4e2550c7dc3d5ffb4e2
+ .octa 0x72be5d74f27b896f72be5d74f27b896f,\
+ 0x72be5d74f27b896f72be5d74f27b896f
+ .octa 0x80deb1fe3b1696b180deb1fe3b1696b1,\
+ 0x80deb1fe3b1696b180deb1fe3b1696b1
+ .octa 0x9bdc06a725c712359bdc06a725c71235,\
+ 0x9bdc06a725c712359bdc06a725c71235
+ .octa 0xc19bf174cf692694c19bf174cf692694,\
+ 0xc19bf174cf692694c19bf174cf692694
+ .octa 0xe49b69c19ef14ad2e49b69c19ef14ad2,\
+ 0xe49b69c19ef14ad2e49b69c19ef14ad2
+ .octa 0xefbe4786384f25e3efbe4786384f25e3,\
+ 0xefbe4786384f25e3efbe4786384f25e3
+ .octa 0x0fc19dc68b8cd5b50fc19dc68b8cd5b5,\
+ 0x0fc19dc68b8cd5b50fc19dc68b8cd5b5
+ .octa 0x240ca1cc77ac9c65240ca1cc77ac9c65,\
+ 0x240ca1cc77ac9c65240ca1cc77ac9c65
+ .octa 0x2de92c6f592b02752de92c6f592b0275,\
+ 0x2de92c6f592b02752de92c6f592b0275
+ .octa 0x4a7484aa6ea6e4834a7484aa6ea6e483,\
+ 0x4a7484aa6ea6e4834a7484aa6ea6e483
+ .octa 0x5cb0a9dcbd41fbd45cb0a9dcbd41fbd4,\
+ 0x5cb0a9dcbd41fbd45cb0a9dcbd41fbd4
+ .octa 0x76f988da831153b576f988da831153b5,\
+ 0x76f988da831153b576f988da831153b5
+ .octa 0x983e5152ee66dfab983e5152ee66dfab,\
+ 0x983e5152ee66dfab983e5152ee66dfab
+ .octa 0xa831c66d2db43210a831c66d2db43210,\
+ 0xa831c66d2db43210a831c66d2db43210
+ .octa 0xb00327c898fb213fb00327c898fb213f,\
+ 0xb00327c898fb213fb00327c898fb213f
+ .octa 0xbf597fc7beef0ee4bf597fc7beef0ee4,\
+ 0xbf597fc7beef0ee4bf597fc7beef0ee4
+ .octa 0xc6e00bf33da88fc2c6e00bf33da88fc2,\
+ 0xc6e00bf33da88fc2c6e00bf33da88fc2
+ .octa 0xd5a79147930aa725d5a79147930aa725,\
+ 0xd5a79147930aa725d5a79147930aa725
+ .octa 0x06ca6351e003826f06ca6351e003826f,\
+ 0x06ca6351e003826f06ca6351e003826f
+ .octa 0x142929670a0e6e70142929670a0e6e70,\
+ 0x142929670a0e6e70142929670a0e6e70
+ .octa 0x27b70a8546d22ffc27b70a8546d22ffc,\
+ 0x27b70a8546d22ffc27b70a8546d22ffc
+ .octa 0x2e1b21385c26c9262e1b21385c26c926,\
+ 0x2e1b21385c26c9262e1b21385c26c926
+ .octa 0x4d2c6dfc5ac42aed4d2c6dfc5ac42aed,\
+ 0x4d2c6dfc5ac42aed4d2c6dfc5ac42aed
+ .octa 0x53380d139d95b3df53380d139d95b3df,\
+ 0x53380d139d95b3df53380d139d95b3df
+ .octa 0x650a73548baf63de650a73548baf63de,\
+ 0x650a73548baf63de650a73548baf63de
+ .octa 0x766a0abb3c77b2a8766a0abb3c77b2a8,\
+ 0x766a0abb3c77b2a8766a0abb3c77b2a8
+ .octa 0x81c2c92e47edaee681c2c92e47edaee6,\
+ 0x81c2c92e47edaee681c2c92e47edaee6
+ .octa 0x92722c851482353b92722c851482353b,\
+ 0x92722c851482353b92722c851482353b
+ .octa 0xa2bfe8a14cf10364a2bfe8a14cf10364,\
+ 0xa2bfe8a14cf10364a2bfe8a14cf10364
+ .octa 0xa81a664bbc423001a81a664bbc423001,\
+ 0xa81a664bbc423001a81a664bbc423001
+ .octa 0xc24b8b70d0f89791c24b8b70d0f89791,\
+ 0xc24b8b70d0f89791c24b8b70d0f89791
+ .octa 0xc76c51a30654be30c76c51a30654be30,\
+ 0xc76c51a30654be30c76c51a30654be30
+ .octa 0xd192e819d6ef5218d192e819d6ef5218,\
+ 0xd192e819d6ef5218d192e819d6ef5218
+ .octa 0xd69906245565a910d69906245565a910,\
+ 0xd69906245565a910d69906245565a910
+ .octa 0xf40e35855771202af40e35855771202a,\
+ 0xf40e35855771202af40e35855771202a
+ .octa 0x106aa07032bbd1b8106aa07032bbd1b8,\
+ 0x106aa07032bbd1b8106aa07032bbd1b8
+ .octa 0x19a4c116b8d2d0c819a4c116b8d2d0c8,\
+ 0x19a4c116b8d2d0c819a4c116b8d2d0c8
+ .octa 0x1e376c085141ab531e376c085141ab53,\
+ 0x1e376c085141ab531e376c085141ab53
+ .octa 0x2748774cdf8eeb992748774cdf8eeb99,\
+ 0x2748774cdf8eeb992748774cdf8eeb99
+ .octa 0x34b0bcb5e19b48a834b0bcb5e19b48a8,\
+ 0x34b0bcb5e19b48a834b0bcb5e19b48a8
+ .octa 0x391c0cb3c5c95a63391c0cb3c5c95a63,\
+ 0x391c0cb3c5c95a63391c0cb3c5c95a63
+ .octa 0x4ed8aa4ae3418acb4ed8aa4ae3418acb,\
+ 0x4ed8aa4ae3418acb4ed8aa4ae3418acb
+ .octa 0x5b9cca4f7763e3735b9cca4f7763e373,\
+ 0x5b9cca4f7763e3735b9cca4f7763e373
+ .octa 0x682e6ff3d6b2b8a3682e6ff3d6b2b8a3,\
+ 0x682e6ff3d6b2b8a3682e6ff3d6b2b8a3
+ .octa 0x748f82ee5defb2fc748f82ee5defb2fc,\
+ 0x748f82ee5defb2fc748f82ee5defb2fc
+ .octa 0x78a5636f43172f6078a5636f43172f60,\
+ 0x78a5636f43172f6078a5636f43172f60
+ .octa 0x84c87814a1f0ab7284c87814a1f0ab72,\
+ 0x84c87814a1f0ab7284c87814a1f0ab72
+ .octa 0x8cc702081a6439ec8cc702081a6439ec,\
+ 0x8cc702081a6439ec8cc702081a6439ec
+ .octa 0x90befffa23631e2890befffa23631e28,\
+ 0x90befffa23631e2890befffa23631e28
+ .octa 0xa4506cebde82bde9a4506cebde82bde9,\
+ 0xa4506cebde82bde9a4506cebde82bde9
+ .octa 0xbef9a3f7b2c67915bef9a3f7b2c67915,\
+ 0xbef9a3f7b2c67915bef9a3f7b2c67915
+ .octa 0xc67178f2e372532bc67178f2e372532b,\
+ 0xc67178f2e372532bc67178f2e372532b
+ .octa 0xca273eceea26619cca273eceea26619c,\
+ 0xca273eceea26619cca273eceea26619c
+ .octa 0xd186b8c721c0c207d186b8c721c0c207,\
+ 0xd186b8c721c0c207d186b8c721c0c207
+ .octa 0xeada7dd6cde0eb1eeada7dd6cde0eb1e,\
+ 0xeada7dd6cde0eb1eeada7dd6cde0eb1e
+ .octa 0xf57d4f7fee6ed178f57d4f7fee6ed178,\
+ 0xf57d4f7fee6ed178f57d4f7fee6ed178
+ .octa 0x06f067aa72176fba06f067aa72176fba,\
+ 0x06f067aa72176fba06f067aa72176fba
+ .octa 0x0a637dc5a2c898a60a637dc5a2c898a6,\
+ 0x0a637dc5a2c898a60a637dc5a2c898a6
+ .octa 0x113f9804bef90dae113f9804bef90dae,\
+ 0x113f9804bef90dae113f9804bef90dae
+ .octa 0x1b710b35131c471b1b710b35131c471b,\
+ 0x1b710b35131c471b1b710b35131c471b
+ .octa 0x28db77f523047d8428db77f523047d84,\
+ 0x28db77f523047d8428db77f523047d84
+ .octa 0x32caab7b40c7249332caab7b40c72493,\
+ 0x32caab7b40c7249332caab7b40c72493
+ .octa 0x3c9ebe0a15c9bebc3c9ebe0a15c9bebc,\
+ 0x3c9ebe0a15c9bebc3c9ebe0a15c9bebc
+ .octa 0x431d67c49c100d4c431d67c49c100d4c,\
+ 0x431d67c49c100d4c431d67c49c100d4c
+ .octa 0x4cc5d4becb3e42b64cc5d4becb3e42b6,\
+ 0x4cc5d4becb3e42b64cc5d4becb3e42b6
+ .octa 0x597f299cfc657e2a597f299cfc657e2a,\
+ 0x597f299cfc657e2a597f299cfc657e2a
+ .octa 0x5fcb6fab3ad6faec5fcb6fab3ad6faec,\
+ 0x5fcb6fab3ad6faec5fcb6fab3ad6faec
+ .octa 0x6c44198c4a4758176c44198c4a475817,\
+ 0x6c44198c4a4758176c44198c4a475817
+
+.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
+PSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607
+ .octa 0x18191a1b1c1d1e1f1011121314151617
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
new file mode 100644
index 000000000..66bbd9058
--- /dev/null
+++ b/arch/x86/crypto/sha512-ssse3-asm.S
@@ -0,0 +1,424 @@
+########################################################################
+# Implement fast SHA-512 with SSSE3 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+# James Guilford <james.guilford@intel.com>
+# Kirk Yap <kirk.s.yap@intel.com>
+# David Cote <david.m.cote@intel.com>
+# Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+# ARG1
+digest = %rdi
+# ARG2
+msg = %rsi
+# ARG3
+msglen = %rdx
+T1 = %rcx
+T2 = %r8
+a_64 = %r9
+b_64 = %r10
+c_64 = %r11
+d_64 = %r12
+e_64 = %r13
+f_64 = %r14
+g_64 = %r15
+h_64 = %rbx
+tmp0 = %rax
+
+# Local variables (stack frame)
+
+W_SIZE = 80*8
+WK_SIZE = 2*8
+RSPSAVE_SIZE = 1*8
+GPRSAVE_SIZE = 5*8
+
+frame_W = 0
+frame_WK = frame_W + W_SIZE
+frame_RSPSAVE = frame_WK + WK_SIZE
+frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
+frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+
+# Useful QWORD "arrays" for simpler memory references
+# MSG, DIGEST, K_t, W_t are arrays
+# WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even
+
+# Input message (arg1)
+#define MSG(i) 8*i(msg)
+
+# Output Digest (arg2)
+#define DIGEST(i) 8*i(digest)
+
+# SHA Constants (static mem)
+#define K_t(i) 8*i+K512(%rip)
+
+# Message Schedule (stack frame)
+#define W_t(i) 8*i+frame_W(%rsp)
+
+# W[t]+K[t] (stack frame)
+#define WK_2(i) 8*((i%2))+frame_WK(%rsp)
+
+.macro RotateState
+ # Rotate symbols a..h right
+ TMP = h_64
+ h_64 = g_64
+ g_64 = f_64
+ f_64 = e_64
+ e_64 = d_64
+ d_64 = c_64
+ c_64 = b_64
+ b_64 = a_64
+ a_64 = TMP
+.endm
+
+.macro SHA512_Round rnd
+
+ # Compute Round %%t
+ mov f_64, T1 # T1 = f
+ mov e_64, tmp0 # tmp = e
+ xor g_64, T1 # T1 = f ^ g
+ ror $23, tmp0 # 41 # tmp = e ror 23
+ and e_64, T1 # T1 = (f ^ g) & e
+ xor e_64, tmp0 # tmp = (e ror 23) ^ e
+ xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+ idx = \rnd
+ add WK_2(idx), T1 # W[t] + K[t] from message scheduler
+ ror $4, tmp0 # 18 # tmp = ((e ror 23) ^ e) ror 4
+ xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e
+ mov a_64, T2 # T2 = a
+ add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h
+ ror $14, tmp0 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+ add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+ mov a_64, tmp0 # tmp = a
+ xor c_64, T2 # T2 = a ^ c
+ and c_64, tmp0 # tmp = a & c
+ and b_64, T2 # T2 = (a ^ c) & b
+ xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+ mov a_64, tmp0 # tmp = a
+ ror $5, tmp0 # 39 # tmp = a ror 5
+ xor a_64, tmp0 # tmp = (a ror 5) ^ a
+ add T1, d_64 # e(next_state) = d + T1
+ ror $6, tmp0 # 34 # tmp = ((a ror 5) ^ a) ror 6
+ xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a
+ lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c)
+ ror $28, tmp0 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+ add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a)
+ RotateState
+.endm
+
+.macro SHA512_2Sched_2Round_sse rnd
+
+ # Compute rounds t-2 and t-1
+ # Compute message schedule QWORDS t and t+1
+
+ # Two rounds are computed based on the values for K[t-2]+W[t-2] and
+ # K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+ # scheduler.
+ # The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
+ # They are then added to their respective SHA512 constants at
+ # [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
+ # For brievity, the comments following vectored instructions only refer to
+ # the first of a pair of QWORDS.
+ # Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
+ # The computation of the message schedule and the rounds are tightly
+ # stitched to take advantage of instruction-level parallelism.
+ # For clarity, integer instructions (for the rounds calculation) are indented
+ # by one tab. Vectored instructions (for the message scheduler) are indented
+ # by two tabs.
+
+ mov f_64, T1
+ idx = \rnd -2
+ movdqa W_t(idx), %xmm2 # XMM2 = W[t-2]
+ xor g_64, T1
+ and e_64, T1
+ movdqa %xmm2, %xmm0 # XMM0 = W[t-2]
+ xor g_64, T1
+ idx = \rnd
+ add WK_2(idx), T1
+ idx = \rnd - 15
+ movdqu W_t(idx), %xmm5 # XMM5 = W[t-15]
+ mov e_64, tmp0
+ ror $23, tmp0 # 41
+ movdqa %xmm5, %xmm3 # XMM3 = W[t-15]
+ xor e_64, tmp0
+ ror $4, tmp0 # 18
+ psrlq $61-19, %xmm0 # XMM0 = W[t-2] >> 42
+ xor e_64, tmp0
+ ror $14, tmp0 # 14
+ psrlq $(8-7), %xmm3 # XMM3 = W[t-15] >> 1
+ add tmp0, T1
+ add h_64, T1
+ pxor %xmm2, %xmm0 # XMM0 = (W[t-2] >> 42) ^ W[t-2]
+ mov a_64, T2
+ xor c_64, T2
+ pxor %xmm5, %xmm3 # XMM3 = (W[t-15] >> 1) ^ W[t-15]
+ and b_64, T2
+ mov a_64, tmp0
+ psrlq $(19-6), %xmm0 # XMM0 = ((W[t-2]>>42)^W[t-2])>>13
+ and c_64, tmp0
+ xor tmp0, T2
+ psrlq $(7-1), %xmm3 # XMM3 = ((W[t-15]>>1)^W[t-15])>>6
+ mov a_64, tmp0
+ ror $5, tmp0 # 39
+ pxor %xmm2, %xmm0 # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
+ xor a_64, tmp0
+ ror $6, tmp0 # 34
+ pxor %xmm5, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
+ xor a_64, tmp0
+ ror $28, tmp0 # 28
+ psrlq $6, %xmm0 # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
+ add tmp0, T2
+ add T1, d_64
+ psrlq $1, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
+ lea (T1, T2), h_64
+ RotateState
+ movdqa %xmm2, %xmm1 # XMM1 = W[t-2]
+ mov f_64, T1
+ xor g_64, T1
+ movdqa %xmm5, %xmm4 # XMM4 = W[t-15]
+ and e_64, T1
+ xor g_64, T1
+ psllq $(64-19)-(64-61) , %xmm1 # XMM1 = W[t-2] << 42
+ idx = \rnd + 1
+ add WK_2(idx), T1
+ mov e_64, tmp0
+ psllq $(64-1)-(64-8), %xmm4 # XMM4 = W[t-15] << 7
+ ror $23, tmp0 # 41
+ xor e_64, tmp0
+ pxor %xmm2, %xmm1 # XMM1 = (W[t-2] << 42)^W[t-2]
+ ror $4, tmp0 # 18
+ xor e_64, tmp0
+ pxor %xmm5, %xmm4 # XMM4 = (W[t-15]<<7)^W[t-15]
+ ror $14, tmp0 # 14
+ add tmp0, T1
+ psllq $(64-61), %xmm1 # XMM1 = ((W[t-2] << 42)^W[t-2])<<3
+ add h_64, T1
+ mov a_64, T2
+ psllq $(64-8), %xmm4 # XMM4 = ((W[t-15]<<7)^W[t-15])<<56
+ xor c_64, T2
+ and b_64, T2
+ pxor %xmm1, %xmm0 # XMM0 = s1(W[t-2])
+ mov a_64, tmp0
+ and c_64, tmp0
+ idx = \rnd - 7
+ movdqu W_t(idx), %xmm1 # XMM1 = W[t-7]
+ xor tmp0, T2
+ pxor %xmm4, %xmm3 # XMM3 = s0(W[t-15])
+ mov a_64, tmp0
+ paddq %xmm3, %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15])
+ ror $5, tmp0 # 39
+ idx =\rnd-16
+ paddq W_t(idx), %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
+ xor a_64, tmp0
+ paddq %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
+ ror $6, tmp0 # 34
+ movdqa %xmm0, W_t(\rnd) # Store scheduled qwords
+ xor a_64, tmp0
+ paddq K_t(\rnd), %xmm0 # Compute W[t]+K[t]
+ ror $28, tmp0 # 28
+ idx = \rnd
+ movdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds
+ add tmp0, T2
+ add T1, d_64
+ lea (T1, T2), h_64
+ RotateState
+.endm
+
+########################################################################
+# void sha512_transform_ssse3(void* D, const void* M, u64 L)#
+# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+# The size of the message pointed to by M must be an integer multiple of SHA512
+# message blocks.
+# L is the message length in SHA512 blocks.
+########################################################################
+ENTRY(sha512_transform_ssse3)
+
+ cmp $0, msglen
+ je nowork
+
+ # Allocate Stack Space
+ mov %rsp, %rax
+ sub $frame_size, %rsp
+ and $~(0x20 - 1), %rsp
+ mov %rax, frame_RSPSAVE(%rsp)
+
+ # Save GPRs
+ mov %rbx, frame_GPRSAVE(%rsp)
+ mov %r12, frame_GPRSAVE +8*1(%rsp)
+ mov %r13, frame_GPRSAVE +8*2(%rsp)
+ mov %r14, frame_GPRSAVE +8*3(%rsp)
+ mov %r15, frame_GPRSAVE +8*4(%rsp)
+
+updateblock:
+
+# Load state variables
+ mov DIGEST(0), a_64
+ mov DIGEST(1), b_64
+ mov DIGEST(2), c_64
+ mov DIGEST(3), d_64
+ mov DIGEST(4), e_64
+ mov DIGEST(5), f_64
+ mov DIGEST(6), g_64
+ mov DIGEST(7), h_64
+
+ t = 0
+ .rept 80/2 + 1
+ # (80 rounds) / (2 rounds/iteration) + (1 iteration)
+ # +1 iteration because the scheduler leads hashing by 1 iteration
+ .if t < 2
+ # BSWAP 2 QWORDS
+ movdqa XMM_QWORD_BSWAP(%rip), %xmm1
+ movdqu MSG(t), %xmm0
+ pshufb %xmm1, %xmm0 # BSWAP
+ movdqa %xmm0, W_t(t) # Store Scheduled Pair
+ paddq K_t(t), %xmm0 # Compute W[t]+K[t]
+ movdqa %xmm0, WK_2(t) # Store into WK for rounds
+ .elseif t < 16
+ # BSWAP 2 QWORDS# Compute 2 Rounds
+ movdqu MSG(t), %xmm0
+ pshufb %xmm1, %xmm0 # BSWAP
+ SHA512_Round t-2 # Round t-2
+ movdqa %xmm0, W_t(t) # Store Scheduled Pair
+ paddq K_t(t), %xmm0 # Compute W[t]+K[t]
+ SHA512_Round t-1 # Round t-1
+ movdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK
+ .elseif t < 79
+ # Schedule 2 QWORDS# Compute 2 Rounds
+ SHA512_2Sched_2Round_sse t
+ .else
+ # Compute 2 Rounds
+ SHA512_Round t-2
+ SHA512_Round t-1
+ .endif
+ t = t+2
+ .endr
+
+ # Update digest
+ add a_64, DIGEST(0)
+ add b_64, DIGEST(1)
+ add c_64, DIGEST(2)
+ add d_64, DIGEST(3)
+ add e_64, DIGEST(4)
+ add f_64, DIGEST(5)
+ add g_64, DIGEST(6)
+ add h_64, DIGEST(7)
+
+ # Advance to next message block
+ add $16*8, msg
+ dec msglen
+ jnz updateblock
+
+ # Restore GPRs
+ mov frame_GPRSAVE(%rsp), %rbx
+ mov frame_GPRSAVE +8*1(%rsp), %r12
+ mov frame_GPRSAVE +8*2(%rsp), %r13
+ mov frame_GPRSAVE +8*3(%rsp), %r14
+ mov frame_GPRSAVE +8*4(%rsp), %r15
+
+ # Restore Stack Pointer
+ mov frame_RSPSAVE(%rsp), %rsp
+
+nowork:
+ ret
+ENDPROC(sha512_transform_ssse3)
+
+########################################################################
+### Binary Data
+
+.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
+.align 16
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+ .octa 0x08090a0b0c0d0e0f0001020304050607
+
+# Mergeable 640-byte rodata section. This allows linker to merge the table
+# with other, exactly the same 640-byte fragment of another rodata section
+# (if such section exists).
+.section .rodata.cst640.K512, "aM", @progbits, 640
+.align 64
+# K[t] used in SHA512 hashing
+K512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
new file mode 100644
index 000000000..f1b811b60
--- /dev/null
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -0,0 +1,349 @@
+/*
+ * Cryptographic API.
+ *
+ * Glue code for the SHA512 Secure Hash Algorithm assembler
+ * implementation using supplemental SSE3 / AVX / AVX2 instructions.
+ *
+ * This file is based on sha512_generic.c
+ *
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <crypto/sha512_base.h>
+#include <asm/fpu/api.h>
+
+#include <linux/string.h>
+
+asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data,
+ u64 rounds);
+
+typedef void (sha512_transform_fn)(u64 *digest, const char *data, u64 rounds);
+
+static int sha512_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, sha512_transform_fn *sha512_xform)
+{
+ struct sha512_state *sctx = shash_desc_ctx(desc);
+
+ if (!irq_fpu_usable() ||
+ (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE)
+ return crypto_sha512_update(desc, data, len);
+
+ /* make sure casting to sha512_block_fn() is safe */
+ BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0);
+
+ kernel_fpu_begin();
+ sha512_base_do_update(desc, data, len,
+ (sha512_block_fn *)sha512_xform);
+ kernel_fpu_end();
+
+ return 0;
+}
+
+static int sha512_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out, sha512_transform_fn *sha512_xform)
+{
+ if (!irq_fpu_usable())
+ return crypto_sha512_finup(desc, data, len, out);
+
+ kernel_fpu_begin();
+ if (len)
+ sha512_base_do_update(desc, data, len,
+ (sha512_block_fn *)sha512_xform);
+ sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_xform);
+ kernel_fpu_end();
+
+ return sha512_base_finish(desc, out);
+}
+
+static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha512_update(desc, data, len, sha512_transform_ssse3);
+}
+
+static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha512_finup(desc, data, len, out, sha512_transform_ssse3);
+}
+
+/* Add padding and return the message digest. */
+static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
+{
+ return sha512_ssse3_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha512_ssse3_algs[] = { {
+ .digestsize = SHA512_DIGEST_SIZE,
+ .init = sha512_base_init,
+ .update = sha512_ssse3_update,
+ .final = sha512_ssse3_final,
+ .finup = sha512_ssse3_finup,
+ .descsize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha512",
+ .cra_driver_name = "sha512-ssse3",
+ .cra_priority = 150,
+ .cra_blocksize = SHA512_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA384_DIGEST_SIZE,
+ .init = sha384_base_init,
+ .update = sha512_ssse3_update,
+ .final = sha512_ssse3_final,
+ .finup = sha512_ssse3_finup,
+ .descsize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha384",
+ .cra_driver_name = "sha384-ssse3",
+ .cra_priority = 150,
+ .cra_blocksize = SHA384_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static int register_sha512_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ return crypto_register_shashes(sha512_ssse3_algs,
+ ARRAY_SIZE(sha512_ssse3_algs));
+ return 0;
+}
+
+static void unregister_sha512_ssse3(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ crypto_unregister_shashes(sha512_ssse3_algs,
+ ARRAY_SIZE(sha512_ssse3_algs));
+}
+
+#ifdef CONFIG_AS_AVX
+asmlinkage void sha512_transform_avx(u64 *digest, const char *data,
+ u64 rounds);
+static bool avx_usable(void)
+{
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
+ if (boot_cpu_has(X86_FEATURE_AVX))
+ pr_info("AVX detected but unusable.\n");
+ return false;
+ }
+
+ return true;
+}
+
+static int sha512_avx_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha512_update(desc, data, len, sha512_transform_avx);
+}
+
+static int sha512_avx_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha512_finup(desc, data, len, out, sha512_transform_avx);
+}
+
+/* Add padding and return the message digest. */
+static int sha512_avx_final(struct shash_desc *desc, u8 *out)
+{
+ return sha512_avx_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha512_avx_algs[] = { {
+ .digestsize = SHA512_DIGEST_SIZE,
+ .init = sha512_base_init,
+ .update = sha512_avx_update,
+ .final = sha512_avx_final,
+ .finup = sha512_avx_finup,
+ .descsize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha512",
+ .cra_driver_name = "sha512-avx",
+ .cra_priority = 160,
+ .cra_blocksize = SHA512_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA384_DIGEST_SIZE,
+ .init = sha384_base_init,
+ .update = sha512_avx_update,
+ .final = sha512_avx_final,
+ .finup = sha512_avx_finup,
+ .descsize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha384",
+ .cra_driver_name = "sha384-avx",
+ .cra_priority = 160,
+ .cra_blocksize = SHA384_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static int register_sha512_avx(void)
+{
+ if (avx_usable())
+ return crypto_register_shashes(sha512_avx_algs,
+ ARRAY_SIZE(sha512_avx_algs));
+ return 0;
+}
+
+static void unregister_sha512_avx(void)
+{
+ if (avx_usable())
+ crypto_unregister_shashes(sha512_avx_algs,
+ ARRAY_SIZE(sha512_avx_algs));
+}
+#else
+static inline int register_sha512_avx(void) { return 0; }
+static inline void unregister_sha512_avx(void) { }
+#endif
+
+#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX)
+asmlinkage void sha512_transform_rorx(u64 *digest, const char *data,
+ u64 rounds);
+
+static int sha512_avx2_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ return sha512_update(desc, data, len, sha512_transform_rorx);
+}
+
+static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return sha512_finup(desc, data, len, out, sha512_transform_rorx);
+}
+
+/* Add padding and return the message digest. */
+static int sha512_avx2_final(struct shash_desc *desc, u8 *out)
+{
+ return sha512_avx2_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg sha512_avx2_algs[] = { {
+ .digestsize = SHA512_DIGEST_SIZE,
+ .init = sha512_base_init,
+ .update = sha512_avx2_update,
+ .final = sha512_avx2_final,
+ .finup = sha512_avx2_finup,
+ .descsize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha512",
+ .cra_driver_name = "sha512-avx2",
+ .cra_priority = 170,
+ .cra_blocksize = SHA512_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+}, {
+ .digestsize = SHA384_DIGEST_SIZE,
+ .init = sha384_base_init,
+ .update = sha512_avx2_update,
+ .final = sha512_avx2_final,
+ .finup = sha512_avx2_finup,
+ .descsize = sizeof(struct sha512_state),
+ .base = {
+ .cra_name = "sha384",
+ .cra_driver_name = "sha384-avx2",
+ .cra_priority = 170,
+ .cra_blocksize = SHA384_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ }
+} };
+
+static bool avx2_usable(void)
+{
+ if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_BMI2))
+ return true;
+
+ return false;
+}
+
+static int register_sha512_avx2(void)
+{
+ if (avx2_usable())
+ return crypto_register_shashes(sha512_avx2_algs,
+ ARRAY_SIZE(sha512_avx2_algs));
+ return 0;
+}
+
+static void unregister_sha512_avx2(void)
+{
+ if (avx2_usable())
+ crypto_unregister_shashes(sha512_avx2_algs,
+ ARRAY_SIZE(sha512_avx2_algs));
+}
+#else
+static inline int register_sha512_avx2(void) { return 0; }
+static inline void unregister_sha512_avx2(void) { }
+#endif
+
+static int __init sha512_ssse3_mod_init(void)
+{
+
+ if (register_sha512_ssse3())
+ goto fail;
+
+ if (register_sha512_avx()) {
+ unregister_sha512_ssse3();
+ goto fail;
+ }
+
+ if (register_sha512_avx2()) {
+ unregister_sha512_avx();
+ unregister_sha512_ssse3();
+ goto fail;
+ }
+
+ return 0;
+fail:
+ return -ENODEV;
+}
+
+static void __exit sha512_ssse3_mod_fini(void)
+{
+ unregister_sha512_avx2();
+ unregister_sha512_avx();
+ unregister_sha512_ssse3();
+}
+
+module_init(sha512_ssse3_mod_init);
+module_exit(sha512_ssse3_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated");
+
+MODULE_ALIAS_CRYPTO("sha512");
+MODULE_ALIAS_CRYPTO("sha512-ssse3");
+MODULE_ALIAS_CRYPTO("sha512-avx");
+MODULE_ALIAS_CRYPTO("sha512-avx2");
+MODULE_ALIAS_CRYPTO("sha384");
+MODULE_ALIAS_CRYPTO("sha384-ssse3");
+MODULE_ALIAS_CRYPTO("sha384-avx");
+MODULE_ALIAS_CRYPTO("sha384-avx2");
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
new file mode 100644
index 000000000..73b471da3
--- /dev/null
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -0,0 +1,471 @@
+/*
+ * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include "glue_helper-asm-avx.S"
+
+.file "twofish-avx-x86_64-asm_64.S"
+
+.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
+.align 16
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
+.align 16
+.Lxts_gf128mul_and_shl1_mask:
+ .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+
+.text
+
+/* structure of crypto context */
+#define s0 0
+#define s1 1024
+#define s2 2048
+#define s3 3072
+#define w 4096
+#define k 4128
+
+/**********************************************************************
+ 8-way AVX twofish
+ **********************************************************************/
+#define CTX %rdi
+
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+
+#define RA2 %xmm4
+#define RB2 %xmm5
+#define RC2 %xmm6
+#define RD2 %xmm7
+
+#define RX0 %xmm8
+#define RY0 %xmm9
+
+#define RX1 %xmm10
+#define RY1 %xmm11
+
+#define RK1 %xmm12
+#define RK2 %xmm13
+
+#define RT %xmm14
+#define RR %xmm15
+
+#define RID1 %r13
+#define RID1d %r13d
+#define RID2 %rsi
+#define RID2d %esi
+
+#define RGI1 %rdx
+#define RGI1bl %dl
+#define RGI1bh %dh
+#define RGI2 %rcx
+#define RGI2bl %cl
+#define RGI2bh %ch
+
+#define RGI3 %rax
+#define RGI3bl %al
+#define RGI3bh %ah
+#define RGI4 %rbx
+#define RGI4bl %bl
+#define RGI4bh %bh
+
+#define RGS1 %r8
+#define RGS1d %r8d
+#define RGS2 %r9
+#define RGS2d %r9d
+#define RGS3 %r10
+#define RGS3d %r10d
+
+
+#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
+ movzbl src ## bl, RID1d; \
+ movzbl src ## bh, RID2d; \
+ shrq $16, src; \
+ movl t0(CTX, RID1, 4), dst ## d; \
+ movl t1(CTX, RID2, 4), RID2d; \
+ movzbl src ## bl, RID1d; \
+ xorl RID2d, dst ## d; \
+ movzbl src ## bh, RID2d; \
+ interleave_op(il_reg); \
+ xorl t2(CTX, RID1, 4), dst ## d; \
+ xorl t3(CTX, RID2, 4), dst ## d;
+
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+ shrq $16, reg;
+
+#define G(gi1, gi2, x, t0, t1, t2, t3) \
+ lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1); \
+ lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2); \
+ \
+ lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none); \
+ shlq $32, RGS2; \
+ orq RGS1, RGS2; \
+ lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none); \
+ shlq $32, RGS1; \
+ orq RGS1, RGS3;
+
+#define round_head_2(a, b, x1, y1, x2, y2) \
+ vmovq b ## 1, RGI3; \
+ vpextrq $1, b ## 1, RGI4; \
+ \
+ G(RGI1, RGI2, x1, s0, s1, s2, s3); \
+ vmovq a ## 2, RGI1; \
+ vpextrq $1, a ## 2, RGI2; \
+ vmovq RGS2, x1; \
+ vpinsrq $1, RGS3, x1, x1; \
+ \
+ G(RGI3, RGI4, y1, s1, s2, s3, s0); \
+ vmovq b ## 2, RGI3; \
+ vpextrq $1, b ## 2, RGI4; \
+ vmovq RGS2, y1; \
+ vpinsrq $1, RGS3, y1, y1; \
+ \
+ G(RGI1, RGI2, x2, s0, s1, s2, s3); \
+ vmovq RGS2, x2; \
+ vpinsrq $1, RGS3, x2, x2; \
+ \
+ G(RGI3, RGI4, y2, s1, s2, s3, s0); \
+ vmovq RGS2, y2; \
+ vpinsrq $1, RGS3, y2, y2;
+
+#define encround_tail(a, b, c, d, x, y, prerotate) \
+ vpaddd x, y, x; \
+ vpaddd x, RK1, RT;\
+ prerotate(b); \
+ vpxor RT, c, c; \
+ vpaddd y, x, y; \
+ vpaddd y, RK2, y; \
+ vpsrld $1, c, RT; \
+ vpslld $(32 - 1), c, c; \
+ vpor c, RT, c; \
+ vpxor d, y, d; \
+
+#define decround_tail(a, b, c, d, x, y, prerotate) \
+ vpaddd x, y, x; \
+ vpaddd x, RK1, RT;\
+ prerotate(a); \
+ vpxor RT, c, c; \
+ vpaddd y, x, y; \
+ vpaddd y, RK2, y; \
+ vpxor d, y, d; \
+ vpsrld $1, d, y; \
+ vpslld $(32 - 1), d, d; \
+ vpor d, y, d; \
+
+#define rotate_1l(x) \
+ vpslld $1, x, RR; \
+ vpsrld $(32 - 1), x, x; \
+ vpor x, RR, x;
+
+#define preload_rgi(c) \
+ vmovq c, RGI1; \
+ vpextrq $1, c, RGI2;
+
+#define encrypt_round(n, a, b, c, d, preload, prerotate) \
+ vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
+ vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
+ round_head_2(a, b, RX0, RY0, RX1, RY1); \
+ encround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
+ preload(c ## 1); \
+ encround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);
+
+#define decrypt_round(n, a, b, c, d, preload, prerotate) \
+ vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
+ vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
+ round_head_2(a, b, RX0, RY0, RX1, RY1); \
+ decround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate); \
+ preload(c ## 1); \
+ decround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);
+
+#define encrypt_cycle(n) \
+ encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
+ encrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l);
+
+#define encrypt_cycle_last(n) \
+ encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
+ encrypt_round(((2*n) + 1), RC, RD, RA, RB, dummy, dummy);
+
+#define decrypt_cycle(n) \
+ decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
+ decrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l);
+
+#define decrypt_cycle_last(n) \
+ decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
+ decrypt_round((2*n), RA, RB, RC, RD, dummy, dummy);
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ vpunpckldq x1, x0, t0; \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x3; \
+ \
+ vpunpcklqdq t1, t0, x0; \
+ vpunpckhqdq t1, t0, x1; \
+ vpunpcklqdq x3, t2, x2; \
+ vpunpckhqdq x3, t2, x3;
+
+#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
+ vpxor x0, wkey, x0; \
+ vpxor x1, wkey, x1; \
+ vpxor x2, wkey, x2; \
+ vpxor x3, wkey, x3; \
+ \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ vpxor x0, wkey, x0; \
+ vpxor x1, wkey, x1; \
+ vpxor x2, wkey, x2; \
+ vpxor x3, wkey, x3;
+
+.align 8
+__twofish_enc_blk8:
+ /* input:
+ * %rdi: ctx, CTX
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+ * output:
+ * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
+ */
+
+ vmovdqu w(CTX), RK1;
+
+ pushq %r13;
+ pushq %rbx;
+ pushq %rcx;
+
+ inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+ preload_rgi(RA1);
+ rotate_1l(RD1);
+ inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+ rotate_1l(RD2);
+
+ encrypt_cycle(0);
+ encrypt_cycle(1);
+ encrypt_cycle(2);
+ encrypt_cycle(3);
+ encrypt_cycle(4);
+ encrypt_cycle(5);
+ encrypt_cycle(6);
+ encrypt_cycle_last(7);
+
+ vmovdqu (w+4*4)(CTX), RK1;
+
+ popq %rcx;
+ popq %rbx;
+ popq %r13;
+
+ outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+ outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
+
+ ret;
+ENDPROC(__twofish_enc_blk8)
+
+.align 8
+__twofish_dec_blk8:
+ /* input:
+ * %rdi: ctx, CTX
+ * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
+ * output:
+ * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
+ */
+
+ vmovdqu (w+4*4)(CTX), RK1;
+
+ pushq %r13;
+ pushq %rbx;
+
+ inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+ preload_rgi(RC1);
+ rotate_1l(RA1);
+ inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
+ rotate_1l(RA2);
+
+ decrypt_cycle(7);
+ decrypt_cycle(6);
+ decrypt_cycle(5);
+ decrypt_cycle(4);
+ decrypt_cycle(3);
+ decrypt_cycle(2);
+ decrypt_cycle(1);
+ decrypt_cycle_last(0);
+
+ vmovdqu (w)(CTX), RK1;
+
+ popq %rbx;
+ popq %r13;
+
+ outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+ outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+
+ ret;
+ENDPROC(__twofish_dec_blk8)
+
+ENTRY(twofish_ecb_enc_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ movq %rsi, %r11;
+
+ load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ call __twofish_enc_blk8;
+
+ store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+ FRAME_END
+ ret;
+ENDPROC(twofish_ecb_enc_8way)
+
+ENTRY(twofish_ecb_dec_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ movq %rsi, %r11;
+
+ load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+ call __twofish_dec_blk8;
+
+ store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ FRAME_END
+ ret;
+ENDPROC(twofish_ecb_dec_8way)
+
+ENTRY(twofish_cbc_dec_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ FRAME_BEGIN
+
+ pushq %r12;
+
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+ call __twofish_dec_blk8;
+
+ store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ popq %r12;
+
+ FRAME_END
+ ret;
+ENDPROC(twofish_cbc_dec_8way)
+
+ENTRY(twofish_ctr_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (little endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ pushq %r12;
+
+ movq %rsi, %r11;
+ movq %rdx, %r12;
+
+ load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+ RD2, RX0, RX1, RY0);
+
+ call __twofish_enc_blk8;
+
+ store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+ popq %r12;
+
+ FRAME_END
+ ret;
+ENDPROC(twofish_ctr_8way)
+
+ENTRY(twofish_xts_enc_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+ FRAME_BEGIN
+
+ movq %rsi, %r11;
+
+ /* regs <= src, dst <= IVs, regs <= regs xor IVs */
+ load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+ RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);
+
+ call __twofish_enc_blk8;
+
+ /* dst <= regs xor IVs(in dst) */
+ store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+ FRAME_END
+ ret;
+ENDPROC(twofish_xts_enc_8way)
+
+ENTRY(twofish_xts_dec_8way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+ */
+ FRAME_BEGIN
+
+ movq %rsi, %r11;
+
+ /* regs <= src, dst <= IVs, regs <= regs xor IVs */
+ load_xts_8way(%rcx, %rdx, %rsi, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2,
+ RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);
+
+ call __twofish_dec_blk8;
+
+ /* dst <= regs xor IVs(in dst) */
+ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+ FRAME_END
+ ret;
+ENDPROC(twofish_xts_dec_8way)
diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S
new file mode 100644
index 000000000..694ea4587
--- /dev/null
+++ b/arch/x86/crypto/twofish-i586-asm_32.S
@@ -0,0 +1,334 @@
+/***************************************************************************
+* Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> *
+* *
+* This program is free software; you can redistribute it and/or modify *
+* it under the terms of the GNU General Public License as published by *
+* the Free Software Foundation; either version 2 of the License, or *
+* (at your option) any later version. *
+* *
+* This program is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+* GNU General Public License for more details. *
+* *
+* You should have received a copy of the GNU General Public License *
+* along with this program; if not, write to the *
+* Free Software Foundation, Inc., *
+* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
+***************************************************************************/
+
+.file "twofish-i586-asm.S"
+.text
+
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+
+/* return address at 0 */
+
+#define in_blk 12 /* input byte array address parameter*/
+#define out_blk 8 /* output byte array address parameter*/
+#define ctx 4 /* Twofish context structure */
+
+#define a_offset 0
+#define b_offset 4
+#define c_offset 8
+#define d_offset 12
+
+/* Structure of the crypto context struct*/
+
+#define s0 0 /* S0 Array 256 Words each */
+#define s1 1024 /* S1 Array */
+#define s2 2048 /* S2 Array */
+#define s3 3072 /* S3 Array */
+#define w 4096 /* 8 whitening keys (word) */
+#define k 4128 /* key 1-32 ( word ) */
+
+/* define a few register aliases to allow macro substitution */
+
+#define R0D %eax
+#define R0B %al
+#define R0H %ah
+
+#define R1D %ebx
+#define R1B %bl
+#define R1H %bh
+
+#define R2D %ecx
+#define R2B %cl
+#define R2H %ch
+
+#define R3D %edx
+#define R3B %dl
+#define R3H %dh
+
+
+/* performs input whitening */
+#define input_whitening(src,context,offset)\
+ xor w+offset(context), src;
+
+/* performs input whitening */
+#define output_whitening(src,context,offset)\
+ xor w+16+offset(context), src;
+
+/*
+ * a input register containing a (rotated 16)
+ * b input register containing b
+ * c input register containing c
+ * d input register containing d (already rol $1)
+ * operations on a and b are interleaved to increase performance
+ */
+#define encrypt_round(a,b,c,d,round)\
+ push d ## D;\
+ movzx b ## B, %edi;\
+ mov s1(%ebp,%edi,4),d ## D;\
+ movzx a ## B, %edi;\
+ mov s2(%ebp,%edi,4),%esi;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor s2(%ebp,%edi,4),d ## D;\
+ movzx a ## H, %edi;\
+ ror $16, a ## D;\
+ xor s3(%ebp,%edi,4),%esi;\
+ movzx b ## B, %edi;\
+ xor s3(%ebp,%edi,4),d ## D;\
+ movzx a ## B, %edi;\
+ xor (%ebp,%edi,4), %esi;\
+ movzx b ## H, %edi;\
+ ror $15, b ## D;\
+ xor (%ebp,%edi,4), d ## D;\
+ movzx a ## H, %edi;\
+ xor s1(%ebp,%edi,4),%esi;\
+ pop %edi;\
+ add d ## D, %esi;\
+ add %esi, d ## D;\
+ add k+round(%ebp), %esi;\
+ xor %esi, c ## D;\
+ rol $15, c ## D;\
+ add k+4+round(%ebp),d ## D;\
+ xor %edi, d ## D;
+
+/*
+ * a input register containing a (rotated 16)
+ * b input register containing b
+ * c input register containing c
+ * d input register containing d (already rol $1)
+ * operations on a and b are interleaved to increase performance
+ * last round has different rotations for the output preparation
+ */
+#define encrypt_last_round(a,b,c,d,round)\
+ push d ## D;\
+ movzx b ## B, %edi;\
+ mov s1(%ebp,%edi,4),d ## D;\
+ movzx a ## B, %edi;\
+ mov s2(%ebp,%edi,4),%esi;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor s2(%ebp,%edi,4),d ## D;\
+ movzx a ## H, %edi;\
+ ror $16, a ## D;\
+ xor s3(%ebp,%edi,4),%esi;\
+ movzx b ## B, %edi;\
+ xor s3(%ebp,%edi,4),d ## D;\
+ movzx a ## B, %edi;\
+ xor (%ebp,%edi,4), %esi;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor (%ebp,%edi,4), d ## D;\
+ movzx a ## H, %edi;\
+ xor s1(%ebp,%edi,4),%esi;\
+ pop %edi;\
+ add d ## D, %esi;\
+ add %esi, d ## D;\
+ add k+round(%ebp), %esi;\
+ xor %esi, c ## D;\
+ ror $1, c ## D;\
+ add k+4+round(%ebp),d ## D;\
+ xor %edi, d ## D;
+
+/*
+ * a input register containing a
+ * b input register containing b (rotated 16)
+ * c input register containing c
+ * d input register containing d (already rol $1)
+ * operations on a and b are interleaved to increase performance
+ */
+#define decrypt_round(a,b,c,d,round)\
+ push c ## D;\
+ movzx a ## B, %edi;\
+ mov (%ebp,%edi,4), c ## D;\
+ movzx b ## B, %edi;\
+ mov s3(%ebp,%edi,4),%esi;\
+ movzx a ## H, %edi;\
+ ror $16, a ## D;\
+ xor s1(%ebp,%edi,4),c ## D;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor (%ebp,%edi,4), %esi;\
+ movzx a ## B, %edi;\
+ xor s2(%ebp,%edi,4),c ## D;\
+ movzx b ## B, %edi;\
+ xor s1(%ebp,%edi,4),%esi;\
+ movzx a ## H, %edi;\
+ ror $15, a ## D;\
+ xor s3(%ebp,%edi,4),c ## D;\
+ movzx b ## H, %edi;\
+ xor s2(%ebp,%edi,4),%esi;\
+ pop %edi;\
+ add %esi, c ## D;\
+ add c ## D, %esi;\
+ add k+round(%ebp), c ## D;\
+ xor %edi, c ## D;\
+ add k+4+round(%ebp),%esi;\
+ xor %esi, d ## D;\
+ rol $15, d ## D;
+
+/*
+ * a input register containing a
+ * b input register containing b (rotated 16)
+ * c input register containing c
+ * d input register containing d (already rol $1)
+ * operations on a and b are interleaved to increase performance
+ * last round has different rotations for the output preparation
+ */
+#define decrypt_last_round(a,b,c,d,round)\
+ push c ## D;\
+ movzx a ## B, %edi;\
+ mov (%ebp,%edi,4), c ## D;\
+ movzx b ## B, %edi;\
+ mov s3(%ebp,%edi,4),%esi;\
+ movzx a ## H, %edi;\
+ ror $16, a ## D;\
+ xor s1(%ebp,%edi,4),c ## D;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor (%ebp,%edi,4), %esi;\
+ movzx a ## B, %edi;\
+ xor s2(%ebp,%edi,4),c ## D;\
+ movzx b ## B, %edi;\
+ xor s1(%ebp,%edi,4),%esi;\
+ movzx a ## H, %edi;\
+ ror $16, a ## D;\
+ xor s3(%ebp,%edi,4),c ## D;\
+ movzx b ## H, %edi;\
+ xor s2(%ebp,%edi,4),%esi;\
+ pop %edi;\
+ add %esi, c ## D;\
+ add c ## D, %esi;\
+ add k+round(%ebp), c ## D;\
+ xor %edi, c ## D;\
+ add k+4+round(%ebp),%esi;\
+ xor %esi, d ## D;\
+ ror $1, d ## D;
+
+ENTRY(twofish_enc_blk)
+ push %ebp /* save registers according to calling convention*/
+ push %ebx
+ push %esi
+ push %edi
+
+ mov ctx + 16(%esp), %ebp /* abuse the base pointer: set new base
+ * pointer to the ctx address */
+ mov in_blk+16(%esp),%edi /* input address in edi */
+
+ mov (%edi), %eax
+ mov b_offset(%edi), %ebx
+ mov c_offset(%edi), %ecx
+ mov d_offset(%edi), %edx
+ input_whitening(%eax,%ebp,a_offset)
+ ror $16, %eax
+ input_whitening(%ebx,%ebp,b_offset)
+ input_whitening(%ecx,%ebp,c_offset)
+ input_whitening(%edx,%ebp,d_offset)
+ rol $1, %edx
+
+ encrypt_round(R0,R1,R2,R3,0);
+ encrypt_round(R2,R3,R0,R1,8);
+ encrypt_round(R0,R1,R2,R3,2*8);
+ encrypt_round(R2,R3,R0,R1,3*8);
+ encrypt_round(R0,R1,R2,R3,4*8);
+ encrypt_round(R2,R3,R0,R1,5*8);
+ encrypt_round(R0,R1,R2,R3,6*8);
+ encrypt_round(R2,R3,R0,R1,7*8);
+ encrypt_round(R0,R1,R2,R3,8*8);
+ encrypt_round(R2,R3,R0,R1,9*8);
+ encrypt_round(R0,R1,R2,R3,10*8);
+ encrypt_round(R2,R3,R0,R1,11*8);
+ encrypt_round(R0,R1,R2,R3,12*8);
+ encrypt_round(R2,R3,R0,R1,13*8);
+ encrypt_round(R0,R1,R2,R3,14*8);
+ encrypt_last_round(R2,R3,R0,R1,15*8);
+
+ output_whitening(%eax,%ebp,c_offset)
+ output_whitening(%ebx,%ebp,d_offset)
+ output_whitening(%ecx,%ebp,a_offset)
+ output_whitening(%edx,%ebp,b_offset)
+ mov out_blk+16(%esp),%edi;
+ mov %eax, c_offset(%edi)
+ mov %ebx, d_offset(%edi)
+ mov %ecx, (%edi)
+ mov %edx, b_offset(%edi)
+
+ pop %edi
+ pop %esi
+ pop %ebx
+ pop %ebp
+ mov $1, %eax
+ ret
+ENDPROC(twofish_enc_blk)
+
+ENTRY(twofish_dec_blk)
+ push %ebp /* save registers according to calling convention*/
+ push %ebx
+ push %esi
+ push %edi
+
+
+ mov ctx + 16(%esp), %ebp /* abuse the base pointer: set new base
+ * pointer to the ctx address */
+ mov in_blk+16(%esp),%edi /* input address in edi */
+
+ mov (%edi), %eax
+ mov b_offset(%edi), %ebx
+ mov c_offset(%edi), %ecx
+ mov d_offset(%edi), %edx
+ output_whitening(%eax,%ebp,a_offset)
+ output_whitening(%ebx,%ebp,b_offset)
+ ror $16, %ebx
+ output_whitening(%ecx,%ebp,c_offset)
+ output_whitening(%edx,%ebp,d_offset)
+ rol $1, %ecx
+
+ decrypt_round(R0,R1,R2,R3,15*8);
+ decrypt_round(R2,R3,R0,R1,14*8);
+ decrypt_round(R0,R1,R2,R3,13*8);
+ decrypt_round(R2,R3,R0,R1,12*8);
+ decrypt_round(R0,R1,R2,R3,11*8);
+ decrypt_round(R2,R3,R0,R1,10*8);
+ decrypt_round(R0,R1,R2,R3,9*8);
+ decrypt_round(R2,R3,R0,R1,8*8);
+ decrypt_round(R0,R1,R2,R3,7*8);
+ decrypt_round(R2,R3,R0,R1,6*8);
+ decrypt_round(R0,R1,R2,R3,5*8);
+ decrypt_round(R2,R3,R0,R1,4*8);
+ decrypt_round(R0,R1,R2,R3,3*8);
+ decrypt_round(R2,R3,R0,R1,2*8);
+ decrypt_round(R0,R1,R2,R3,1*8);
+ decrypt_last_round(R2,R3,R0,R1,0);
+
+ input_whitening(%eax,%ebp,c_offset)
+ input_whitening(%ebx,%ebp,d_offset)
+ input_whitening(%ecx,%ebp,a_offset)
+ input_whitening(%edx,%ebp,b_offset)
+ mov out_blk+16(%esp),%edi;
+ mov %eax, c_offset(%edi)
+ mov %ebx, d_offset(%edi)
+ mov %ecx, (%edi)
+ mov %edx, b_offset(%edi)
+
+ pop %edi
+ pop %esi
+ pop %ebx
+ pop %ebp
+ mov $1, %eax
+ ret
+ENDPROC(twofish_dec_blk)
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
new file mode 100644
index 000000000..e7273a606
--- /dev/null
+++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
@@ -0,0 +1,320 @@
+/*
+ * Twofish Cipher 3-way parallel algorithm (x86_64)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/linkage.h>
+
+.file "twofish-x86_64-asm-3way.S"
+.text
+
+/* structure of crypto context */
+#define s0 0
+#define s1 1024
+#define s2 2048
+#define s3 3072
+#define w 4096
+#define k 4128
+
+/**********************************************************************
+ 3-way twofish
+ **********************************************************************/
+#define CTX %rdi
+#define RIO %rdx
+
+#define RAB0 %rax
+#define RAB1 %rbx
+#define RAB2 %rcx
+
+#define RAB0d %eax
+#define RAB1d %ebx
+#define RAB2d %ecx
+
+#define RAB0bh %ah
+#define RAB1bh %bh
+#define RAB2bh %ch
+
+#define RAB0bl %al
+#define RAB1bl %bl
+#define RAB2bl %cl
+
+#define CD0 0x0(%rsp)
+#define CD1 0x8(%rsp)
+#define CD2 0x10(%rsp)
+
+# used only before/after all rounds
+#define RCD0 %r8
+#define RCD1 %r9
+#define RCD2 %r10
+
+# used only during rounds
+#define RX0 %r8
+#define RX1 %r9
+#define RX2 %r10
+
+#define RX0d %r8d
+#define RX1d %r9d
+#define RX2d %r10d
+
+#define RY0 %r11
+#define RY1 %r12
+#define RY2 %r13
+
+#define RY0d %r11d
+#define RY1d %r12d
+#define RY2d %r13d
+
+#define RT0 %rdx
+#define RT1 %rsi
+
+#define RT0d %edx
+#define RT1d %esi
+
+#define RT1bl %sil
+
+#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
+ movzbl ab ## bl, tmp2 ## d; \
+ movzbl ab ## bh, tmp1 ## d; \
+ rorq $(rot), ab; \
+ op1##l T0(CTX, tmp2, 4), dst ## d; \
+ op2##l T1(CTX, tmp1, 4), dst ## d;
+
+#define swap_ab_with_cd(ab, cd, tmp) \
+ movq cd, tmp; \
+ movq ab, cd; \
+ movq tmp, ab;
+
+/*
+ * Combined G1 & G2 function. Reordered with help of rotates to have moves
+ * at begining.
+ */
+#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
+ /* G1,1 && G2,1 */ \
+ do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \
+ do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \
+ \
+ do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \
+ do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \
+ \
+ do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \
+ do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \
+ \
+ /* G1,2 && G2,2 */ \
+ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
+ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
+ swap_ab_with_cd(ab ## 0, cd ## 0, RT0); \
+ \
+ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
+ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
+ swap_ab_with_cd(ab ## 1, cd ## 1, RT0); \
+ \
+ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
+ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
+ swap_ab_with_cd(ab ## 2, cd ## 2, RT0);
+
+#define enc_round_end(ab, x, y, n) \
+ addl y ## d, x ## d; \
+ addl x ## d, y ## d; \
+ addl k+4*(2*(n))(CTX), x ## d; \
+ xorl ab ## d, x ## d; \
+ addl k+4*(2*(n)+1)(CTX), y ## d; \
+ shrq $32, ab; \
+ roll $1, ab ## d; \
+ xorl y ## d, ab ## d; \
+ shlq $32, ab; \
+ rorl $1, x ## d; \
+ orq x, ab;
+
+#define dec_round_end(ba, x, y, n) \
+ addl y ## d, x ## d; \
+ addl x ## d, y ## d; \
+ addl k+4*(2*(n))(CTX), x ## d; \
+ addl k+4*(2*(n)+1)(CTX), y ## d; \
+ xorl ba ## d, y ## d; \
+ shrq $32, ba; \
+ roll $1, ba ## d; \
+ xorl x ## d, ba ## d; \
+ shlq $32, ba; \
+ rorl $1, y ## d; \
+ orq y, ba;
+
+#define encrypt_round3(ab, cd, n) \
+ g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
+ \
+ enc_round_end(ab ## 0, RX0, RY0, n); \
+ enc_round_end(ab ## 1, RX1, RY1, n); \
+ enc_round_end(ab ## 2, RX2, RY2, n);
+
+#define decrypt_round3(ba, dc, n) \
+ g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
+ \
+ dec_round_end(ba ## 0, RX0, RY0, n); \
+ dec_round_end(ba ## 1, RX1, RY1, n); \
+ dec_round_end(ba ## 2, RX2, RY2, n);
+
+#define encrypt_cycle3(ab, cd, n) \
+ encrypt_round3(ab, cd, n*2); \
+ encrypt_round3(ab, cd, (n*2)+1);
+
+#define decrypt_cycle3(ba, dc, n) \
+ decrypt_round3(ba, dc, (n*2)+1); \
+ decrypt_round3(ba, dc, (n*2));
+
+#define push_cd() \
+ pushq RCD2; \
+ pushq RCD1; \
+ pushq RCD0;
+
+#define pop_cd() \
+ popq RCD0; \
+ popq RCD1; \
+ popq RCD2;
+
+#define inpack3(in, n, xy, m) \
+ movq 4*(n)(in), xy ## 0; \
+ xorq w+4*m(CTX), xy ## 0; \
+ \
+ movq 4*(4+(n))(in), xy ## 1; \
+ xorq w+4*m(CTX), xy ## 1; \
+ \
+ movq 4*(8+(n))(in), xy ## 2; \
+ xorq w+4*m(CTX), xy ## 2;
+
+#define outunpack3(op, out, n, xy, m) \
+ xorq w+4*m(CTX), xy ## 0; \
+ op ## q xy ## 0, 4*(n)(out); \
+ \
+ xorq w+4*m(CTX), xy ## 1; \
+ op ## q xy ## 1, 4*(4+(n))(out); \
+ \
+ xorq w+4*m(CTX), xy ## 2; \
+ op ## q xy ## 2, 4*(8+(n))(out);
+
+#define inpack_enc3() \
+ inpack3(RIO, 0, RAB, 0); \
+ inpack3(RIO, 2, RCD, 2);
+
+#define outunpack_enc3(op) \
+ outunpack3(op, RIO, 2, RAB, 6); \
+ outunpack3(op, RIO, 0, RCD, 4);
+
+#define inpack_dec3() \
+ inpack3(RIO, 0, RAB, 4); \
+ rorq $32, RAB0; \
+ rorq $32, RAB1; \
+ rorq $32, RAB2; \
+ inpack3(RIO, 2, RCD, 6); \
+ rorq $32, RCD0; \
+ rorq $32, RCD1; \
+ rorq $32, RCD2;
+
+#define outunpack_dec3() \
+ rorq $32, RCD0; \
+ rorq $32, RCD1; \
+ rorq $32, RCD2; \
+ outunpack3(mov, RIO, 0, RCD, 0); \
+ rorq $32, RAB0; \
+ rorq $32, RAB1; \
+ rorq $32, RAB2; \
+ outunpack3(mov, RIO, 2, RAB, 2);
+
+ENTRY(__twofish_enc_blk_3way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src, RIO
+ * %rcx: bool, if true: xor output
+ */
+ pushq %r13;
+ pushq %r12;
+ pushq %rbx;
+
+ pushq %rcx; /* bool xor */
+ pushq %rsi; /* dst */
+
+ inpack_enc3();
+
+ push_cd();
+ encrypt_cycle3(RAB, CD, 0);
+ encrypt_cycle3(RAB, CD, 1);
+ encrypt_cycle3(RAB, CD, 2);
+ encrypt_cycle3(RAB, CD, 3);
+ encrypt_cycle3(RAB, CD, 4);
+ encrypt_cycle3(RAB, CD, 5);
+ encrypt_cycle3(RAB, CD, 6);
+ encrypt_cycle3(RAB, CD, 7);
+ pop_cd();
+
+ popq RIO; /* dst */
+ popq RT1; /* bool xor */
+
+ testb RT1bl, RT1bl;
+ jnz .L__enc_xor3;
+
+ outunpack_enc3(mov);
+
+ popq %rbx;
+ popq %r12;
+ popq %r13;
+ ret;
+
+.L__enc_xor3:
+ outunpack_enc3(xor);
+
+ popq %rbx;
+ popq %r12;
+ popq %r13;
+ ret;
+ENDPROC(__twofish_enc_blk_3way)
+
+ENTRY(twofish_dec_blk_3way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src, RIO
+ */
+ pushq %r13;
+ pushq %r12;
+ pushq %rbx;
+
+ pushq %rsi; /* dst */
+
+ inpack_dec3();
+
+ push_cd();
+ decrypt_cycle3(RAB, CD, 7);
+ decrypt_cycle3(RAB, CD, 6);
+ decrypt_cycle3(RAB, CD, 5);
+ decrypt_cycle3(RAB, CD, 4);
+ decrypt_cycle3(RAB, CD, 3);
+ decrypt_cycle3(RAB, CD, 2);
+ decrypt_cycle3(RAB, CD, 1);
+ decrypt_cycle3(RAB, CD, 0);
+ pop_cd();
+
+ popq RIO; /* dst */
+
+ outunpack_dec3();
+
+ popq %rbx;
+ popq %r12;
+ popq %r13;
+ ret;
+ENDPROC(twofish_dec_blk_3way)
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
new file mode 100644
index 000000000..a350c990d
--- /dev/null
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -0,0 +1,321 @@
+/***************************************************************************
+* Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> *
+* *
+* This program is free software; you can redistribute it and/or modify *
+* it under the terms of the GNU General Public License as published by *
+* the Free Software Foundation; either version 2 of the License, or *
+* (at your option) any later version. *
+* *
+* This program is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+* GNU General Public License for more details. *
+* *
+* You should have received a copy of the GNU General Public License *
+* along with this program; if not, write to the *
+* Free Software Foundation, Inc., *
+* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
+***************************************************************************/
+
+.file "twofish-x86_64-asm.S"
+.text
+
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+
+#define a_offset 0
+#define b_offset 4
+#define c_offset 8
+#define d_offset 12
+
+/* Structure of the crypto context struct*/
+
+#define s0 0 /* S0 Array 256 Words each */
+#define s1 1024 /* S1 Array */
+#define s2 2048 /* S2 Array */
+#define s3 3072 /* S3 Array */
+#define w 4096 /* 8 whitening keys (word) */
+#define k 4128 /* key 1-32 ( word ) */
+
+/* define a few register aliases to allow macro substitution */
+
+#define R0 %rax
+#define R0D %eax
+#define R0B %al
+#define R0H %ah
+
+#define R1 %rbx
+#define R1D %ebx
+#define R1B %bl
+#define R1H %bh
+
+#define R2 %rcx
+#define R2D %ecx
+#define R2B %cl
+#define R2H %ch
+
+#define R3 %rdx
+#define R3D %edx
+#define R3B %dl
+#define R3H %dh
+
+
+/* performs input whitening */
+#define input_whitening(src,context,offset)\
+ xor w+offset(context), src;
+
+/* performs input whitening */
+#define output_whitening(src,context,offset)\
+ xor w+16+offset(context), src;
+
+
+/*
+ * a input register containing a (rotated 16)
+ * b input register containing b
+ * c input register containing c
+ * d input register containing d (already rol $1)
+ * operations on a and b are interleaved to increase performance
+ */
+#define encrypt_round(a,b,c,d,round)\
+ movzx b ## B, %edi;\
+ mov s1(%r11,%rdi,4),%r8d;\
+ movzx a ## B, %edi;\
+ mov s2(%r11,%rdi,4),%r9d;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor s2(%r11,%rdi,4),%r8d;\
+ movzx a ## H, %edi;\
+ ror $16, a ## D;\
+ xor s3(%r11,%rdi,4),%r9d;\
+ movzx b ## B, %edi;\
+ xor s3(%r11,%rdi,4),%r8d;\
+ movzx a ## B, %edi;\
+ xor (%r11,%rdi,4), %r9d;\
+ movzx b ## H, %edi;\
+ ror $15, b ## D;\
+ xor (%r11,%rdi,4), %r8d;\
+ movzx a ## H, %edi;\
+ xor s1(%r11,%rdi,4),%r9d;\
+ add %r8d, %r9d;\
+ add %r9d, %r8d;\
+ add k+round(%r11), %r9d;\
+ xor %r9d, c ## D;\
+ rol $15, c ## D;\
+ add k+4+round(%r11),%r8d;\
+ xor %r8d, d ## D;
+
+/*
+ * a input register containing a(rotated 16)
+ * b input register containing b
+ * c input register containing c
+ * d input register containing d (already rol $1)
+ * operations on a and b are interleaved to increase performance
+ * during the round a and b are prepared for the output whitening
+ */
+#define encrypt_last_round(a,b,c,d,round)\
+ mov b ## D, %r10d;\
+ shl $32, %r10;\
+ movzx b ## B, %edi;\
+ mov s1(%r11,%rdi,4),%r8d;\
+ movzx a ## B, %edi;\
+ mov s2(%r11,%rdi,4),%r9d;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor s2(%r11,%rdi,4),%r8d;\
+ movzx a ## H, %edi;\
+ ror $16, a ## D;\
+ xor s3(%r11,%rdi,4),%r9d;\
+ movzx b ## B, %edi;\
+ xor s3(%r11,%rdi,4),%r8d;\
+ movzx a ## B, %edi;\
+ xor (%r11,%rdi,4), %r9d;\
+ xor a, %r10;\
+ movzx b ## H, %edi;\
+ xor (%r11,%rdi,4), %r8d;\
+ movzx a ## H, %edi;\
+ xor s1(%r11,%rdi,4),%r9d;\
+ add %r8d, %r9d;\
+ add %r9d, %r8d;\
+ add k+round(%r11), %r9d;\
+ xor %r9d, c ## D;\
+ ror $1, c ## D;\
+ add k+4+round(%r11),%r8d;\
+ xor %r8d, d ## D
+
+/*
+ * a input register containing a
+ * b input register containing b (rotated 16)
+ * c input register containing c (already rol $1)
+ * d input register containing d
+ * operations on a and b are interleaved to increase performance
+ */
+#define decrypt_round(a,b,c,d,round)\
+ movzx a ## B, %edi;\
+ mov (%r11,%rdi,4), %r9d;\
+ movzx b ## B, %edi;\
+ mov s3(%r11,%rdi,4),%r8d;\
+ movzx a ## H, %edi;\
+ ror $16, a ## D;\
+ xor s1(%r11,%rdi,4),%r9d;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor (%r11,%rdi,4), %r8d;\
+ movzx a ## B, %edi;\
+ xor s2(%r11,%rdi,4),%r9d;\
+ movzx b ## B, %edi;\
+ xor s1(%r11,%rdi,4),%r8d;\
+ movzx a ## H, %edi;\
+ ror $15, a ## D;\
+ xor s3(%r11,%rdi,4),%r9d;\
+ movzx b ## H, %edi;\
+ xor s2(%r11,%rdi,4),%r8d;\
+ add %r8d, %r9d;\
+ add %r9d, %r8d;\
+ add k+round(%r11), %r9d;\
+ xor %r9d, c ## D;\
+ add k+4+round(%r11),%r8d;\
+ xor %r8d, d ## D;\
+ rol $15, d ## D;
+
+/*
+ * a input register containing a
+ * b input register containing b
+ * c input register containing c (already rol $1)
+ * d input register containing d
+ * operations on a and b are interleaved to increase performance
+ * during the round a and b are prepared for the output whitening
+ */
+#define decrypt_last_round(a,b,c,d,round)\
+ movzx a ## B, %edi;\
+ mov (%r11,%rdi,4), %r9d;\
+ movzx b ## B, %edi;\
+ mov s3(%r11,%rdi,4),%r8d;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor (%r11,%rdi,4), %r8d;\
+ movzx a ## H, %edi;\
+ mov b ## D, %r10d;\
+ shl $32, %r10;\
+ xor a, %r10;\
+ ror $16, a ## D;\
+ xor s1(%r11,%rdi,4),%r9d;\
+ movzx b ## B, %edi;\
+ xor s1(%r11,%rdi,4),%r8d;\
+ movzx a ## B, %edi;\
+ xor s2(%r11,%rdi,4),%r9d;\
+ movzx b ## H, %edi;\
+ xor s2(%r11,%rdi,4),%r8d;\
+ movzx a ## H, %edi;\
+ xor s3(%r11,%rdi,4),%r9d;\
+ add %r8d, %r9d;\
+ add %r9d, %r8d;\
+ add k+round(%r11), %r9d;\
+ xor %r9d, c ## D;\
+ add k+4+round(%r11),%r8d;\
+ xor %r8d, d ## D;\
+ ror $1, d ## D;
+
+ENTRY(twofish_enc_blk)
+ pushq R1
+
+ /* %rdi contains the ctx address */
+ /* %rsi contains the output address */
+ /* %rdx contains the input address */
+ /* ctx address is moved to free one non-rex register
+ as target for the 8bit high operations */
+ mov %rdi, %r11
+
+ movq (R3), R1
+ movq 8(R3), R3
+ input_whitening(R1,%r11,a_offset)
+ input_whitening(R3,%r11,c_offset)
+ mov R1D, R0D
+ rol $16, R0D
+ shr $32, R1
+ mov R3D, R2D
+ shr $32, R3
+ rol $1, R3D
+
+ encrypt_round(R0,R1,R2,R3,0);
+ encrypt_round(R2,R3,R0,R1,8);
+ encrypt_round(R0,R1,R2,R3,2*8);
+ encrypt_round(R2,R3,R0,R1,3*8);
+ encrypt_round(R0,R1,R2,R3,4*8);
+ encrypt_round(R2,R3,R0,R1,5*8);
+ encrypt_round(R0,R1,R2,R3,6*8);
+ encrypt_round(R2,R3,R0,R1,7*8);
+ encrypt_round(R0,R1,R2,R3,8*8);
+ encrypt_round(R2,R3,R0,R1,9*8);
+ encrypt_round(R0,R1,R2,R3,10*8);
+ encrypt_round(R2,R3,R0,R1,11*8);
+ encrypt_round(R0,R1,R2,R3,12*8);
+ encrypt_round(R2,R3,R0,R1,13*8);
+ encrypt_round(R0,R1,R2,R3,14*8);
+ encrypt_last_round(R2,R3,R0,R1,15*8);
+
+
+ output_whitening(%r10,%r11,a_offset)
+ movq %r10, (%rsi)
+
+ shl $32, R1
+ xor R0, R1
+
+ output_whitening(R1,%r11,c_offset)
+ movq R1, 8(%rsi)
+
+ popq R1
+ movl $1,%eax
+ ret
+ENDPROC(twofish_enc_blk)
+
+ENTRY(twofish_dec_blk)
+ pushq R1
+
+ /* %rdi contains the ctx address */
+ /* %rsi contains the output address */
+ /* %rdx contains the input address */
+ /* ctx address is moved to free one non-rex register
+ as target for the 8bit high operations */
+ mov %rdi, %r11
+
+ movq (R3), R1
+ movq 8(R3), R3
+ output_whitening(R1,%r11,a_offset)
+ output_whitening(R3,%r11,c_offset)
+ mov R1D, R0D
+ shr $32, R1
+ rol $16, R1D
+ mov R3D, R2D
+ shr $32, R3
+ rol $1, R2D
+
+ decrypt_round(R0,R1,R2,R3,15*8);
+ decrypt_round(R2,R3,R0,R1,14*8);
+ decrypt_round(R0,R1,R2,R3,13*8);
+ decrypt_round(R2,R3,R0,R1,12*8);
+ decrypt_round(R0,R1,R2,R3,11*8);
+ decrypt_round(R2,R3,R0,R1,10*8);
+ decrypt_round(R0,R1,R2,R3,9*8);
+ decrypt_round(R2,R3,R0,R1,8*8);
+ decrypt_round(R0,R1,R2,R3,7*8);
+ decrypt_round(R2,R3,R0,R1,6*8);
+ decrypt_round(R0,R1,R2,R3,5*8);
+ decrypt_round(R2,R3,R0,R1,4*8);
+ decrypt_round(R0,R1,R2,R3,3*8);
+ decrypt_round(R2,R3,R0,R1,2*8);
+ decrypt_round(R0,R1,R2,R3,1*8);
+ decrypt_last_round(R2,R3,R0,R1,0);
+
+ input_whitening(%r10,%r11,a_offset)
+ movq %r10, (%rsi)
+
+ shl $32, R1
+ xor R0, R1
+
+ input_whitening(R1,%r11,c_offset)
+ movq R1, 8(%rsi)
+
+ popq R1
+ movl $1,%eax
+ ret
+ENDPROC(twofish_dec_blk)
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
new file mode 100644
index 000000000..66d989230
--- /dev/null
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -0,0 +1,328 @@
+/*
+ * Glue Code for AVX assembler version of Twofish Cipher
+ *
+ * Copyright (C) 2012 Johannes Goetzfried
+ * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/simd.h>
+#include <crypto/twofish.h>
+#include <crypto/xts.h>
+#include <asm/crypto/glue_helper.h>
+#include <asm/crypto/twofish.h>
+
+#define TWOFISH_PARALLEL_BLOCKS 8
+
+/* 8-way parallel cipher functions */
+asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+
+asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+
+static int twofish_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return twofish_setkey(&tfm->base, key, keylen);
+}
+
+static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __twofish_enc_blk_3way(ctx, dst, src, false);
+}
+
+static void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+ GLUE_FUNC_CAST(twofish_enc_blk));
+}
+
+static void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+ GLUE_FUNC_CAST(twofish_dec_blk));
+}
+
+struct twofish_xts_ctx {
+ struct twofish_ctx tweak_ctx;
+ struct twofish_ctx crypt_ctx;
+};
+
+static int xts_twofish_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ u32 *flags = &tfm->base.crt_flags;
+ int err;
+
+ err = xts_verify_key(tfm, key, keylen);
+ if (err)
+ return err;
+
+ /* first half of xts-key is for crypt */
+ err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
+ if (err)
+ return err;
+
+ /* second half of xts-key is for tweak */
+ return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
+ flags);
+}
+
+static const struct common_glue_ctx twofish_enc = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = TWOFISH_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
+ }, {
+ .num_blocks = 3,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
+ } }
+};
+
+static const struct common_glue_ctx twofish_ctr = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = TWOFISH_PARALLEL_BLOCKS,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
+ }, {
+ .num_blocks = 3,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) }
+ } }
+};
+
+static const struct common_glue_ctx twofish_enc_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = TWOFISH_PARALLEL_BLOCKS,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) }
+ } }
+};
+
+static const struct common_glue_ctx twofish_dec = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = TWOFISH_PARALLEL_BLOCKS,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
+ }, {
+ .num_blocks = 3,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
+ } }
+};
+
+static const struct common_glue_ctx twofish_dec_cbc = {
+ .num_funcs = 3,
+ .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = TWOFISH_PARALLEL_BLOCKS,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
+ }, {
+ .num_blocks = 3,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
+ } }
+};
+
+static const struct common_glue_ctx twofish_dec_xts = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+ .funcs = { {
+ .num_blocks = TWOFISH_PARALLEL_BLOCKS,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) }
+ } }
+};
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&twofish_enc, req);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&twofish_dec, req);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(twofish_enc_blk),
+ req);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return glue_cbc_decrypt_req_128bit(&twofish_dec_cbc, req);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return glue_ctr_req_128bit(&twofish_ctr, req);
+}
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&twofish_enc_xts, req,
+ XTS_TWEAK_CAST(twofish_enc_blk),
+ &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return glue_xts_req_128bit(&twofish_dec_xts, req,
+ XTS_TWEAK_CAST(twofish_enc_blk),
+ &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static struct skcipher_alg twofish_algs[] = {
+ {
+ .base.cra_name = "__ecb(twofish)",
+ .base.cra_driver_name = "__ecb-twofish-avx",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = TF_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct twofish_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = TF_MIN_KEY_SIZE,
+ .max_keysize = TF_MAX_KEY_SIZE,
+ .setkey = twofish_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "__cbc(twofish)",
+ .base.cra_driver_name = "__cbc-twofish-avx",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = TF_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct twofish_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = TF_MIN_KEY_SIZE,
+ .max_keysize = TF_MAX_KEY_SIZE,
+ .ivsize = TF_BLOCK_SIZE,
+ .setkey = twofish_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "__ctr(twofish)",
+ .base.cra_driver_name = "__ctr-twofish-avx",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct twofish_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = TF_MIN_KEY_SIZE,
+ .max_keysize = TF_MAX_KEY_SIZE,
+ .ivsize = TF_BLOCK_SIZE,
+ .chunksize = TF_BLOCK_SIZE,
+ .setkey = twofish_setkey_skcipher,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }, {
+ .base.cra_name = "__xts(twofish)",
+ .base.cra_driver_name = "__xts-twofish-avx",
+ .base.cra_priority = 400,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = TF_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct twofish_xts_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = 2 * TF_MIN_KEY_SIZE,
+ .max_keysize = 2 * TF_MAX_KEY_SIZE,
+ .ivsize = TF_BLOCK_SIZE,
+ .setkey = xts_twofish_setkey,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
+ },
+};
+
+static struct simd_skcipher_alg *twofish_simd_algs[ARRAY_SIZE(twofish_algs)];
+
+static int __init twofish_init(void)
+{
+ const char *feature_name;
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ return simd_register_skciphers_compat(twofish_algs,
+ ARRAY_SIZE(twofish_algs),
+ twofish_simd_algs);
+}
+
+static void __exit twofish_exit(void)
+{
+ simd_unregister_skciphers(twofish_algs, ARRAY_SIZE(twofish_algs),
+ twofish_simd_algs);
+}
+
+module_init(twofish_init);
+module_exit(twofish_exit);
+
+MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("twofish");
diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c
new file mode 100644
index 000000000..77e06c2da
--- /dev/null
+++ b/arch/x86/crypto/twofish_glue.c
@@ -0,0 +1,100 @@
+/*
+ * Glue Code for assembler optimized version of TWOFISH
+ *
+ * Originally Twofish for GPG
+ * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
+ * 256-bit key length added March 20, 1999
+ * Some modifications to reduce the text size by Werner Koch, April, 1998
+ * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
+ * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
+ *
+ * The original author has disclaimed all copyright interest in this
+ * code and thus put it in the public domain. The subsequent authors
+ * have put this under the GNU General Public License.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ * This code is a "clean room" implementation, written from the paper
+ * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
+ * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
+ * through http://www.counterpane.com/twofish.html
+ *
+ * For background information on multiplication in finite fields, used for
+ * the matrix operations in the key schedule, see the book _Contemporary
+ * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
+ * Third Edition.
+ */
+
+#include <crypto/twofish.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(twofish_enc_blk);
+asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(twofish_dec_blk);
+
+static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ twofish_enc_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ twofish_dec_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static struct crypto_alg alg = {
+ .cra_name = "twofish",
+ .cra_driver_name = "twofish-asm",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = TF_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct twofish_ctx),
+ .cra_alignmask = 0,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = TF_MIN_KEY_SIZE,
+ .cia_max_keysize = TF_MAX_KEY_SIZE,
+ .cia_setkey = twofish_setkey,
+ .cia_encrypt = twofish_encrypt,
+ .cia_decrypt = twofish_decrypt
+ }
+ }
+};
+
+static int __init init(void)
+{
+ return crypto_register_alg(&alg);
+}
+
+static void __exit fini(void)
+{
+ crypto_unregister_alg(&alg);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION ("Twofish Cipher Algorithm, asm optimized");
+MODULE_ALIAS_CRYPTO("twofish");
+MODULE_ALIAS_CRYPTO("twofish-asm");
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
new file mode 100644
index 000000000..571485502
--- /dev/null
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -0,0 +1,290 @@
+/*
+ * Glue Code for 3-way parallel assembler optimized version of Twofish
+ *
+ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <asm/crypto/glue_helper.h>
+#include <asm/crypto/twofish.h>
+#include <crypto/algapi.h>
+#include <crypto/b128ops.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/twofish.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+EXPORT_SYMBOL_GPL(__twofish_enc_blk_3way);
+EXPORT_SYMBOL_GPL(twofish_dec_blk_3way);
+
+static int twofish_setkey_skcipher(struct crypto_skcipher *tfm,
+ const u8 *key, unsigned int keylen)
+{
+ return twofish_setkey(&tfm->base, key, keylen);
+}
+
+static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __twofish_enc_blk_3way(ctx, dst, src, false);
+}
+
+static inline void twofish_enc_blk_xor_3way(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __twofish_enc_blk_3way(ctx, dst, src, true);
+}
+
+void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src)
+{
+ u128 ivs[2];
+
+ ivs[0] = src[0];
+ ivs[1] = src[1];
+
+ twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
+
+ u128_xor(&dst[1], &dst[1], &ivs[0]);
+ u128_xor(&dst[2], &dst[2], &ivs[1]);
+}
+EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way);
+
+void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+ be128 ctrblk;
+
+ if (dst != src)
+ *dst = *src;
+
+ le128_to_be128(&ctrblk, iv);
+ le128_inc(iv);
+
+ twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+ u128_xor(dst, dst, (u128 *)&ctrblk);
+}
+EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr);
+
+void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
+ le128 *iv)
+{
+ be128 ctrblks[3];
+
+ if (dst != src) {
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ }
+
+ le128_to_be128(&ctrblks[0], iv);
+ le128_inc(iv);
+ le128_to_be128(&ctrblks[1], iv);
+ le128_inc(iv);
+ le128_to_be128(&ctrblks[2], iv);
+ le128_inc(iv);
+
+ twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks);
+}
+EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr_3way);
+
+static const struct common_glue_ctx twofish_enc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = -1,
+
+ .funcs = { {
+ .num_blocks = 3,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
+ } }
+};
+
+static const struct common_glue_ctx twofish_ctr = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = -1,
+
+ .funcs = { {
+ .num_blocks = 3,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr_3way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr) }
+ } }
+};
+
+static const struct common_glue_ctx twofish_dec = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = -1,
+
+ .funcs = { {
+ .num_blocks = 3,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
+ } }
+};
+
+static const struct common_glue_ctx twofish_dec_cbc = {
+ .num_funcs = 2,
+ .fpu_blocks_limit = -1,
+
+ .funcs = { {
+ .num_blocks = 3,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
+ } }
+};
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&twofish_enc, req);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+ return glue_ecb_req_128bit(&twofish_dec, req);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+ return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(twofish_enc_blk),
+ req);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+ return glue_cbc_decrypt_req_128bit(&twofish_dec_cbc, req);
+}
+
+static int ctr_crypt(struct skcipher_request *req)
+{
+ return glue_ctr_req_128bit(&twofish_ctr, req);
+}
+
+static struct skcipher_alg tf_skciphers[] = {
+ {
+ .base.cra_name = "ecb(twofish)",
+ .base.cra_driver_name = "ecb-twofish-3way",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = TF_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct twofish_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = TF_MIN_KEY_SIZE,
+ .max_keysize = TF_MAX_KEY_SIZE,
+ .setkey = twofish_setkey_skcipher,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base.cra_name = "cbc(twofish)",
+ .base.cra_driver_name = "cbc-twofish-3way",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = TF_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct twofish_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = TF_MIN_KEY_SIZE,
+ .max_keysize = TF_MAX_KEY_SIZE,
+ .ivsize = TF_BLOCK_SIZE,
+ .setkey = twofish_setkey_skcipher,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ }, {
+ .base.cra_name = "ctr(twofish)",
+ .base.cra_driver_name = "ctr-twofish-3way",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct twofish_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = TF_MIN_KEY_SIZE,
+ .max_keysize = TF_MAX_KEY_SIZE,
+ .ivsize = TF_BLOCK_SIZE,
+ .chunksize = TF_BLOCK_SIZE,
+ .setkey = twofish_setkey_skcipher,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ },
+};
+
+static bool is_blacklisted_cpu(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ if (boot_cpu_data.x86 == 0x06 &&
+ (boot_cpu_data.x86_model == 0x1c ||
+ boot_cpu_data.x86_model == 0x26 ||
+ boot_cpu_data.x86_model == 0x36)) {
+ /*
+ * On Atom, twofish-3way is slower than original assembler
+ * implementation. Twofish-3way trades off some performance in
+ * storing blocks in 64bit registers to allow three blocks to
+ * be processed parallel. Parallel operation then allows gaining
+ * more performance than was trade off, on out-of-order CPUs.
+ * However Atom does not benefit from this parallellism and
+ * should be blacklisted.
+ */
+ return true;
+ }
+
+ if (boot_cpu_data.x86 == 0x0f) {
+ /*
+ * On Pentium 4, twofish-3way is slower than original assembler
+ * implementation because excessive uses of 64bit rotate and
+ * left-shifts (which are really slow on P4) needed to store and
+ * handle 128bit block in two 64bit registers.
+ */
+ return true;
+ }
+
+ return false;
+}
+
+static int force;
+module_param(force, int, 0);
+MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
+
+static int __init init(void)
+{
+ if (!force && is_blacklisted_cpu()) {
+ printk(KERN_INFO
+ "twofish-x86_64-3way: performance on this CPU "
+ "would be suboptimal: disabling "
+ "twofish-x86_64-3way.\n");
+ return -ENODEV;
+ }
+
+ return crypto_register_skciphers(tf_skciphers,
+ ARRAY_SIZE(tf_skciphers));
+}
+
+static void __exit fini(void)
+{
+ crypto_unregister_skciphers(tf_skciphers, ARRAY_SIZE(tf_skciphers));
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Twofish Cipher Algorithm, 3-way parallel asm optimized");
+MODULE_ALIAS_CRYPTO("twofish");
+MODULE_ALIAS_CRYPTO("twofish-asm");
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
new file mode 100644
index 000000000..06fc70cf5
--- /dev/null
+++ b/arch/x86/entry/Makefile
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the x86 low level entry code
+#
+
+OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y
+
+CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,)
+CFLAGS_syscall_32.o += $(call cc-option,-Wno-override-init,)
+obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
+obj-y += common.o
+
+obj-y += vdso/
+obj-y += vsyscall/
+
+obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o
+
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
new file mode 100644
index 000000000..993dd06c8
--- /dev/null
+++ b/arch/x86/entry/calling.h
@@ -0,0 +1,350 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/jump_label.h>
+#include <asm/unwind_hints.h>
+#include <asm/cpufeatures.h>
+#include <asm/page_types.h>
+#include <asm/percpu.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
+
+/*
+
+ x86 function call convention, 64-bit:
+ -------------------------------------
+ arguments | callee-saved | extra caller-saved | return
+ [callee-clobbered] | | [callee-clobbered] |
+ ---------------------------------------------------------------------------
+ rdi rsi rdx rcx r8-9 | rbx rbp [*] r12-15 | r10-11 | rax, rdx [**]
+
+ ( rsp is obviously invariant across normal function calls. (gcc can 'merge'
+ functions when it sees tail-call optimization possibilities) rflags is
+ clobbered. Leftover arguments are passed over the stack frame.)
+
+ [*] In the frame-pointers case rbp is fixed to the stack frame.
+
+ [**] for struct return values wider than 64 bits the return convention is a
+ bit more complex: up to 128 bits width we return small structures
+ straight in rax, rdx. For structures larger than that (3 words or
+ larger) the caller puts a pointer to an on-stack return struct
+ [allocated in the caller's stack frame] into the first argument - i.e.
+ into rdi. All other arguments shift up by one in this case.
+ Fortunately this case is rare in the kernel.
+
+For 32-bit we have the following conventions - kernel is built with
+-mregparm=3 and -freg-struct-return:
+
+ x86 function calling convention, 32-bit:
+ ----------------------------------------
+ arguments | callee-saved | extra caller-saved | return
+ [callee-clobbered] | | [callee-clobbered] |
+ -------------------------------------------------------------------------
+ eax edx ecx | ebx edi esi ebp [*] | <none> | eax, edx [**]
+
+ ( here too esp is obviously invariant across normal function calls. eflags
+ is clobbered. Leftover arguments are passed over the stack frame. )
+
+ [*] In the frame-pointers case ebp is fixed to the stack frame.
+
+ [**] We build with -freg-struct-return, which on 32-bit means similar
+ semantics as on 64-bit: edx can be used for a second return value
+ (i.e. covering integer and structure sizes up to 64 bits) - after that
+ it gets more complex and more expensive: 3-word or larger struct returns
+ get done in the caller's frame and the pointer to the return struct goes
+ into regparm0, i.e. eax - the other arguments shift up and the
+ function's register parameters degenerate to regparm=2 in essence.
+
+*/
+
+#ifdef CONFIG_X86_64
+
+/*
+ * 64-bit system call stack frame layout defines and helpers,
+ * for assembly code:
+ */
+
+/* The layout forms the "struct pt_regs" on the stack: */
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
+#define R15 0*8
+#define R14 1*8
+#define R13 2*8
+#define R12 3*8
+#define RBP 4*8
+#define RBX 5*8
+/* These regs are callee-clobbered. Always saved on kernel entry. */
+#define R11 6*8
+#define R10 7*8
+#define R9 8*8
+#define R8 9*8
+#define RAX 10*8
+#define RCX 11*8
+#define RDX 12*8
+#define RSI 13*8
+#define RDI 14*8
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+#define ORIG_RAX 15*8
+/* Return frame for iretq */
+#define RIP 16*8
+#define CS 17*8
+#define EFLAGS 18*8
+#define RSP 19*8
+#define SS 20*8
+
+#define SIZEOF_PTREGS 21*8
+
+.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0
+ .if \save_ret
+ pushq %rsi /* pt_regs->si */
+ movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */
+ movq %rdi, 8(%rsp) /* pt_regs->di (overwriting original return address) */
+ .else
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ .endif
+ pushq \rdx /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq \rax /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 */
+ pushq %r9 /* pt_regs->r9 */
+ pushq %r10 /* pt_regs->r10 */
+ pushq %r11 /* pt_regs->r11 */
+ pushq %rbx /* pt_regs->rbx */
+ pushq %rbp /* pt_regs->rbp */
+ pushq %r12 /* pt_regs->r12 */
+ pushq %r13 /* pt_regs->r13 */
+ pushq %r14 /* pt_regs->r14 */
+ pushq %r15 /* pt_regs->r15 */
+ UNWIND_HINT_REGS
+
+ .if \save_ret
+ pushq %rsi /* return address on top of stack */
+ .endif
+
+ /*
+ * Sanitize registers of values that a speculation attack might
+ * otherwise want to exploit. The lower registers are likely clobbered
+ * well before they could be put to use in a speculative execution
+ * gadget.
+ */
+ xorl %edx, %edx /* nospec dx */
+ xorl %ecx, %ecx /* nospec cx */
+ xorl %r8d, %r8d /* nospec r8 */
+ xorl %r9d, %r9d /* nospec r9 */
+ xorl %r10d, %r10d /* nospec r10 */
+ xorl %r11d, %r11d /* nospec r11 */
+ xorl %ebx, %ebx /* nospec rbx */
+ xorl %ebp, %ebp /* nospec rbp */
+ xorl %r12d, %r12d /* nospec r12 */
+ xorl %r13d, %r13d /* nospec r13 */
+ xorl %r14d, %r14d /* nospec r14 */
+ xorl %r15d, %r15d /* nospec r15 */
+
+.endm
+
+.macro POP_REGS pop_rdi=1 skip_r11rcx=0
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+ .if \skip_r11rcx
+ popq %rsi
+ .else
+ popq %r11
+ .endif
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ .if \skip_r11rcx
+ popq %rsi
+ .else
+ popq %rcx
+ .endif
+ popq %rdx
+ popq %rsi
+ .if \pop_rdi
+ popq %rdi
+ .endif
+.endm
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+
+/*
+ * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
+ * halves:
+ */
+#define PTI_USER_PGTABLE_BIT PAGE_SHIFT
+#define PTI_USER_PGTABLE_MASK (1 << PTI_USER_PGTABLE_BIT)
+#define PTI_USER_PCID_BIT X86_CR3_PTI_PCID_USER_BIT
+#define PTI_USER_PCID_MASK (1 << PTI_USER_PCID_BIT)
+#define PTI_USER_PGTABLE_AND_PCID_MASK (PTI_USER_PCID_MASK | PTI_USER_PGTABLE_MASK)
+
+.macro SET_NOFLUSH_BIT reg:req
+ bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
+.endm
+
+.macro ADJUST_KERNEL_CR3 reg:req
+ ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
+ /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+ andq $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg
+.endm
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ mov %cr3, \scratch_reg
+ ADJUST_KERNEL_CR3 \scratch_reg
+ mov \scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+#define THIS_CPU_user_pcid_flush_mask \
+ PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
+
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ mov %cr3, \scratch_reg
+
+ ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+ /*
+ * Test if the ASID needs a flush.
+ */
+ movq \scratch_reg, \scratch_reg2
+ andq $(0x7FF), \scratch_reg /* mask ASID */
+ bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
+ jnc .Lnoflush_\@
+
+ /* Flush needed, clear the bit */
+ btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
+ movq \scratch_reg2, \scratch_reg
+ jmp .Lwrcr3_pcid_\@
+
+.Lnoflush_\@:
+ movq \scratch_reg2, \scratch_reg
+ SET_NOFLUSH_BIT \scratch_reg
+
+.Lwrcr3_pcid_\@:
+ /* Flip the ASID to the user version */
+ orq $(PTI_USER_PCID_MASK), \scratch_reg
+
+.Lwrcr3_\@:
+ /* Flip the PGD to the user version */
+ orq $(PTI_USER_PGTABLE_MASK), \scratch_reg
+ mov \scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
+ pushq %rax
+ SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
+ popq %rax
+.endm
+
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+ ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
+ movq %cr3, \scratch_reg
+ movq \scratch_reg, \save_reg
+ /*
+ * Test the user pagetable bit. If set, then the user page tables
+ * are active. If clear CR3 already has the kernel page table
+ * active.
+ */
+ bt $PTI_USER_PGTABLE_BIT, \scratch_reg
+ jnc .Ldone_\@
+
+ ADJUST_KERNEL_CR3 \scratch_reg
+ movq \scratch_reg, %cr3
+
+.Ldone_\@:
+.endm
+
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+ ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+ /*
+ * KERNEL pages can always resume with NOFLUSH as we do
+ * explicit flushes.
+ */
+ bt $PTI_USER_PGTABLE_BIT, \save_reg
+ jnc .Lnoflush_\@
+
+ /*
+ * Check if there's a pending flush for the user ASID we're
+ * about to set.
+ */
+ movq \save_reg, \scratch_reg
+ andq $(0x7FF), \scratch_reg
+ bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
+ jnc .Lnoflush_\@
+
+ btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
+ jmp .Lwrcr3_\@
+
+.Lnoflush_\@:
+ SET_NOFLUSH_BIT \save_reg
+
+.Lwrcr3_\@:
+ /*
+ * The CR3 write could be avoided when not changing its value,
+ * but would require a CR3 read *and* a scratch register.
+ */
+ movq \save_reg, %cr3
+.Lend_\@:
+.endm
+
+#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+.endm
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+.endm
+.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
+.endm
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+.endm
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
+.endm
+
+#endif
+
+/*
+ * Mitigate Spectre v1 for conditional swapgs code paths.
+ *
+ * FENCE_SWAPGS_USER_ENTRY is used in the user entry swapgs code path, to
+ * prevent a speculative swapgs when coming from kernel space.
+ *
+ * FENCE_SWAPGS_KERNEL_ENTRY is used in the kernel entry non-swapgs code path,
+ * to prevent the swapgs from getting speculatively skipped when coming from
+ * user space.
+ */
+.macro FENCE_SWAPGS_USER_ENTRY
+ ALTERNATIVE "", "lfence", X86_FEATURE_FENCE_SWAPGS_USER
+.endm
+.macro FENCE_SWAPGS_KERNEL_ENTRY
+ ALTERNATIVE "", "lfence", X86_FEATURE_FENCE_SWAPGS_KERNEL
+.endm
+
+#endif /* CONFIG_X86_64 */
+
+/*
+ * This does 'call enter_from_user_mode' unless we can avoid it based on
+ * kernel config or using the static jump infrastructure.
+ */
+.macro CALL_enter_from_user_mode
+#ifdef CONFIG_CONTEXT_TRACKING
+#ifdef CONFIG_JUMP_LABEL
+ STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0
+#endif
+ call enter_from_user_mode
+.Lafter_call_\@:
+#endif
+.endm
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
new file mode 100644
index 000000000..8353348dd
--- /dev/null
+++ b/arch/x86/entry/common.c
@@ -0,0 +1,432 @@
+/*
+ * common.c - C code for kernel entry and exit
+ * Copyright (c) 2015 Andrew Lutomirski
+ * GPL v2
+ *
+ * Based on asm and ptrace code by many authors. The code here originated
+ * in ptrace.c and signal.c.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/tracehook.h>
+#include <linux/audit.h>
+#include <linux/seccomp.h>
+#include <linux/signal.h>
+#include <linux/export.h>
+#include <linux/context_tracking.h>
+#include <linux/user-return-notifier.h>
+#include <linux/nospec.h>
+#include <linux/uprobes.h>
+#include <linux/livepatch.h>
+#include <linux/syscalls.h>
+
+#include <asm/desc.h>
+#include <asm/traps.h>
+#include <asm/vdso.h>
+#include <linux/uaccess.h>
+#include <asm/cpufeature.h>
+#include <asm/nospec-branch.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
+#ifdef CONFIG_CONTEXT_TRACKING
+/* Called on entry from user mode with IRQs off. */
+__visible inline void enter_from_user_mode(void)
+{
+ CT_WARN_ON(ct_state() != CONTEXT_USER);
+ user_exit_irqoff();
+}
+#else
+static inline void enter_from_user_mode(void) {}
+#endif
+
+static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
+{
+#ifdef CONFIG_X86_64
+ if (arch == AUDIT_ARCH_X86_64) {
+ audit_syscall_entry(regs->orig_ax, regs->di,
+ regs->si, regs->dx, regs->r10);
+ } else
+#endif
+ {
+ audit_syscall_entry(regs->orig_ax, regs->bx,
+ regs->cx, regs->dx, regs->si);
+ }
+}
+
+/*
+ * Returns the syscall nr to run (which should match regs->orig_ax) or -1
+ * to skip the syscall.
+ */
+static long syscall_trace_enter(struct pt_regs *regs)
+{
+ u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
+
+ struct thread_info *ti = current_thread_info();
+ unsigned long ret = 0;
+ bool emulated = false;
+ u32 work;
+
+ if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
+ BUG_ON(regs != task_pt_regs(current));
+
+ work = READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
+
+ if (unlikely(work & _TIF_SYSCALL_EMU))
+ emulated = true;
+
+ if ((emulated || (work & _TIF_SYSCALL_TRACE)) &&
+ tracehook_report_syscall_entry(regs))
+ return -1L;
+
+ if (emulated)
+ return -1L;
+
+#ifdef CONFIG_SECCOMP
+ /*
+ * Do seccomp after ptrace, to catch any tracer changes.
+ */
+ if (work & _TIF_SECCOMP) {
+ struct seccomp_data sd;
+
+ sd.arch = arch;
+ sd.nr = regs->orig_ax;
+ sd.instruction_pointer = regs->ip;
+#ifdef CONFIG_X86_64
+ if (arch == AUDIT_ARCH_X86_64) {
+ sd.args[0] = regs->di;
+ sd.args[1] = regs->si;
+ sd.args[2] = regs->dx;
+ sd.args[3] = regs->r10;
+ sd.args[4] = regs->r8;
+ sd.args[5] = regs->r9;
+ } else
+#endif
+ {
+ sd.args[0] = regs->bx;
+ sd.args[1] = regs->cx;
+ sd.args[2] = regs->dx;
+ sd.args[3] = regs->si;
+ sd.args[4] = regs->di;
+ sd.args[5] = regs->bp;
+ }
+
+ ret = __secure_computing(&sd);
+ if (ret == -1)
+ return ret;
+ }
+#endif
+
+ if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+ trace_sys_enter(regs, regs->orig_ax);
+
+ do_audit_syscall_entry(regs, arch);
+
+ return ret ?: regs->orig_ax;
+}
+
+#define EXIT_TO_USERMODE_LOOP_FLAGS \
+ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
+ _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING)
+
+static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
+{
+ /*
+ * In order to return to user mode, we need to have IRQs off with
+ * none of EXIT_TO_USERMODE_LOOP_FLAGS set. Several of these flags
+ * can be set at any time on preemptable kernels if we have IRQs on,
+ * so we need to loop. Disabling preemption wouldn't help: doing the
+ * work to clear some of the flags can sleep.
+ */
+ while (true) {
+ /* We have work to do. */
+ local_irq_enable();
+
+ if (cached_flags & _TIF_NEED_RESCHED)
+ schedule();
+
+ if (cached_flags & _TIF_UPROBE)
+ uprobe_notify_resume(regs);
+
+ if (cached_flags & _TIF_PATCH_PENDING)
+ klp_update_patch_state(current);
+
+ /* deal with pending signal delivery */
+ if (cached_flags & _TIF_SIGPENDING)
+ do_signal(regs);
+
+ if (cached_flags & _TIF_NOTIFY_RESUME) {
+ clear_thread_flag(TIF_NOTIFY_RESUME);
+ tracehook_notify_resume(regs);
+ rseq_handle_notify_resume(NULL, regs);
+ }
+
+ if (cached_flags & _TIF_USER_RETURN_NOTIFY)
+ fire_user_return_notifiers();
+
+ /* Disable IRQs and retry */
+ local_irq_disable();
+
+ cached_flags = READ_ONCE(current_thread_info()->flags);
+
+ if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
+ break;
+ }
+}
+
+/* Called with IRQs disabled. */
+__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
+{
+ struct thread_info *ti = current_thread_info();
+ u32 cached_flags;
+
+ addr_limit_user_check();
+
+ lockdep_assert_irqs_disabled();
+ lockdep_sys_exit();
+
+ cached_flags = READ_ONCE(ti->flags);
+
+ if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
+ exit_to_usermode_loop(regs, cached_flags);
+
+#ifdef CONFIG_COMPAT
+ /*
+ * Compat syscalls set TS_COMPAT. Make sure we clear it before
+ * returning to user mode. We need to clear it *after* signal
+ * handling, because syscall restart has a fixup for compat
+ * syscalls. The fixup is exercised by the ptrace_syscall_32
+ * selftest.
+ *
+ * We also need to clear TS_REGS_POKED_I386: the 32-bit tracer
+ * special case only applies after poking regs and before the
+ * very next return to user mode.
+ */
+ ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
+#endif
+
+ user_enter_irqoff();
+
+ mds_user_clear_cpu_buffers();
+}
+
+#define SYSCALL_EXIT_WORK_FLAGS \
+ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
+ _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
+
+static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
+{
+ bool step;
+
+ audit_syscall_exit(regs);
+
+ if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
+ trace_sys_exit(regs, regs->ax);
+
+ /*
+ * If TIF_SYSCALL_EMU is set, we only get here because of
+ * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
+ * We already reported this syscall instruction in
+ * syscall_trace_enter().
+ */
+ step = unlikely(
+ (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
+ == _TIF_SINGLESTEP);
+ if (step || cached_flags & _TIF_SYSCALL_TRACE)
+ tracehook_report_syscall_exit(regs, step);
+}
+
+/*
+ * Called with IRQs on and fully valid regs. Returns with IRQs off in a
+ * state such that we can immediately switch to user mode.
+ */
+__visible inline void syscall_return_slowpath(struct pt_regs *regs)
+{
+ struct thread_info *ti = current_thread_info();
+ u32 cached_flags = READ_ONCE(ti->flags);
+
+ CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
+ WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax))
+ local_irq_enable();
+
+ rseq_syscall(regs);
+
+ /*
+ * First do one-time work. If these work items are enabled, we
+ * want to run them exactly once per syscall exit with IRQs on.
+ */
+ if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
+ syscall_slow_exit_work(regs, cached_flags);
+
+ local_irq_disable();
+ prepare_exit_to_usermode(regs);
+}
+
+#ifdef CONFIG_X86_64
+__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
+{
+ struct thread_info *ti;
+
+ enter_from_user_mode();
+ local_irq_enable();
+ ti = current_thread_info();
+ if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
+ nr = syscall_trace_enter(regs);
+
+ /*
+ * NB: Native and x32 syscalls are dispatched from the same
+ * table. The only functional difference is the x32 bit in
+ * regs->orig_ax, which changes the behavior of some syscalls.
+ */
+ nr &= __SYSCALL_MASK;
+ if (likely(nr < NR_syscalls)) {
+ nr = array_index_nospec(nr, NR_syscalls);
+ regs->ax = sys_call_table[nr](regs);
+ }
+
+ syscall_return_slowpath(regs);
+}
+#endif
+
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+/*
+ * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does
+ * all entry and exit work and returns with IRQs off. This function is
+ * extremely hot in workloads that use it, and it's usually called from
+ * do_fast_syscall_32, so forcibly inline it to improve performance.
+ */
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
+{
+ struct thread_info *ti = current_thread_info();
+ unsigned int nr = (unsigned int)regs->orig_ax;
+
+#ifdef CONFIG_IA32_EMULATION
+ ti->status |= TS_COMPAT;
+#endif
+
+ if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
+ /*
+ * Subtlety here: if ptrace pokes something larger than
+ * 2^32-1 into orig_ax, this truncates it. This may or
+ * may not be necessary, but it matches the old asm
+ * behavior.
+ */
+ nr = syscall_trace_enter(regs);
+ }
+
+ if (likely(nr < IA32_NR_syscalls)) {
+ nr = array_index_nospec(nr, IA32_NR_syscalls);
+#ifdef CONFIG_IA32_EMULATION
+ regs->ax = ia32_sys_call_table[nr](regs);
+#else
+ /*
+ * It's possible that a 32-bit syscall implementation
+ * takes a 64-bit parameter but nonetheless assumes that
+ * the high bits are zero. Make sure we zero-extend all
+ * of the args.
+ */
+ regs->ax = ia32_sys_call_table[nr](
+ (unsigned int)regs->bx, (unsigned int)regs->cx,
+ (unsigned int)regs->dx, (unsigned int)regs->si,
+ (unsigned int)regs->di, (unsigned int)regs->bp);
+#endif /* CONFIG_IA32_EMULATION */
+ }
+
+ syscall_return_slowpath(regs);
+}
+
+/* Handles int $0x80 */
+__visible void do_int80_syscall_32(struct pt_regs *regs)
+{
+ enter_from_user_mode();
+ local_irq_enable();
+ do_syscall_32_irqs_on(regs);
+}
+
+/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
+__visible long do_fast_syscall_32(struct pt_regs *regs)
+{
+ /*
+ * Called using the internal vDSO SYSENTER/SYSCALL32 calling
+ * convention. Adjust regs so it looks like we entered using int80.
+ */
+
+ unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
+ vdso_image_32.sym_int80_landing_pad;
+
+ /*
+ * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
+ * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
+ * Fix it up.
+ */
+ regs->ip = landing_pad;
+
+ enter_from_user_mode();
+
+ local_irq_enable();
+
+ /* Fetch EBP from where the vDSO stashed it. */
+ if (
+#ifdef CONFIG_X86_64
+ /*
+ * Micro-optimization: the pointer we're following is explicitly
+ * 32 bits, so it can't be out of range.
+ */
+ __get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp)
+#else
+ get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp)
+#endif
+ ) {
+
+ /* User code screwed up. */
+ local_irq_disable();
+ regs->ax = -EFAULT;
+ prepare_exit_to_usermode(regs);
+ return 0; /* Keep it simple: use IRET. */
+ }
+
+ /* Now this is just like a normal syscall. */
+ do_syscall_32_irqs_on(regs);
+
+#ifdef CONFIG_X86_64
+ /*
+ * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
+ * SYSRETL is available on all 64-bit CPUs, so we don't need to
+ * bother with SYSEXIT.
+ *
+ * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
+ * because the ECX fixup above will ensure that this is essentially
+ * never the case.
+ */
+ return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
+ regs->ip == landing_pad &&
+ (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
+#else
+ /*
+ * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
+ *
+ * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
+ * because the ECX fixup above will ensure that this is essentially
+ * never the case.
+ *
+ * We don't allow syscalls at all from VM86 mode, but we still
+ * need to check VM, because we might be returning from sys_vm86.
+ */
+ return static_cpu_has(X86_FEATURE_SEP) &&
+ regs->cs == __USER_CS && regs->ss == __USER_DS &&
+ regs->ip == landing_pad &&
+ (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
+#endif
+}
+#endif
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
new file mode 100644
index 000000000..37d9016d4
--- /dev/null
+++ b/arch/x86/entry/entry_32.S
@@ -0,0 +1,1514 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 1991,1992 Linus Torvalds
+ *
+ * entry_32.S contains the system-call and low-level fault and trap handling routines.
+ *
+ * Stack layout while running C code:
+ * ptrace needs to have all registers on the stack.
+ * If the order here is changed, it needs to be
+ * updated in fork.c:copy_process(), signal.c:do_signal(),
+ * ptrace.c and ptrace.h
+ *
+ * 0(%esp) - %ebx
+ * 4(%esp) - %ecx
+ * 8(%esp) - %edx
+ * C(%esp) - %esi
+ * 10(%esp) - %edi
+ * 14(%esp) - %ebp
+ * 18(%esp) - %eax
+ * 1C(%esp) - %ds
+ * 20(%esp) - %es
+ * 24(%esp) - %fs
+ * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
+ * 2C(%esp) - orig_eax
+ * 30(%esp) - %eip
+ * 34(%esp) - %cs
+ * 38(%esp) - %eflags
+ * 3C(%esp) - %oldesp
+ * 40(%esp) - %oldss
+ */
+
+#include <linux/linkage.h>
+#include <linux/err.h>
+#include <asm/thread_info.h>
+#include <asm/irqflags.h>
+#include <asm/errno.h>
+#include <asm/segment.h>
+#include <asm/smp.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+#include <asm/irq_vectors.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative-asm.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+#include <asm/frame.h>
+#include <asm/nospec-branch.h>
+
+ .section .entry.text, "ax"
+
+/*
+ * We use macros for low-level operations which need to be overridden
+ * for paravirtualization. The following will never clobber any registers:
+ * INTERRUPT_RETURN (aka. "iret")
+ * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
+ * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
+ *
+ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
+ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
+ * Allowing a register to be clobbered can shrink the paravirt replacement
+ * enough to patch inline, increasing performance.
+ */
+
+#ifdef CONFIG_PREEMPT
+# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
+#else
+# define preempt_stop(clobbers)
+# define resume_kernel restore_all_kernel
+#endif
+
+.macro TRACE_IRQS_IRET
+#ifdef CONFIG_TRACE_IRQFLAGS
+ testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off?
+ jz 1f
+ TRACE_IRQS_ON
+1:
+#endif
+.endm
+
+#define PTI_SWITCH_MASK (1 << PAGE_SHIFT)
+
+/*
+ * User gs save/restore
+ *
+ * %gs is used for userland TLS and kernel only uses it for stack
+ * canary which is required to be at %gs:20 by gcc. Read the comment
+ * at the top of stackprotector.h for more info.
+ *
+ * Local labels 98 and 99 are used.
+ */
+#ifdef CONFIG_X86_32_LAZY_GS
+
+ /* unfortunately push/pop can't be no-op */
+.macro PUSH_GS
+ pushl $0
+.endm
+.macro POP_GS pop=0
+ addl $(4 + \pop), %esp
+.endm
+.macro POP_GS_EX
+.endm
+
+ /* all the rest are no-op */
+.macro PTGS_TO_GS
+.endm
+.macro PTGS_TO_GS_EX
+.endm
+.macro GS_TO_REG reg
+.endm
+.macro REG_TO_PTGS reg
+.endm
+.macro SET_KERNEL_GS reg
+.endm
+
+#else /* CONFIG_X86_32_LAZY_GS */
+
+.macro PUSH_GS
+ pushl %gs
+.endm
+
+.macro POP_GS pop=0
+98: popl %gs
+ .if \pop <> 0
+ add $\pop, %esp
+ .endif
+.endm
+.macro POP_GS_EX
+.pushsection .fixup, "ax"
+99: movl $0, (%esp)
+ jmp 98b
+.popsection
+ _ASM_EXTABLE(98b, 99b)
+.endm
+
+.macro PTGS_TO_GS
+98: mov PT_GS(%esp), %gs
+.endm
+.macro PTGS_TO_GS_EX
+.pushsection .fixup, "ax"
+99: movl $0, PT_GS(%esp)
+ jmp 98b
+.popsection
+ _ASM_EXTABLE(98b, 99b)
+.endm
+
+.macro GS_TO_REG reg
+ movl %gs, \reg
+.endm
+.macro REG_TO_PTGS reg
+ movl \reg, PT_GS(%esp)
+.endm
+.macro SET_KERNEL_GS reg
+ movl $(__KERNEL_STACK_CANARY), \reg
+ movl \reg, %gs
+.endm
+
+#endif /* CONFIG_X86_32_LAZY_GS */
+
+/* Unconditionally switch to user cr3 */
+.macro SWITCH_TO_USER_CR3 scratch_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+ movl %cr3, \scratch_reg
+ orl $PTI_SWITCH_MASK, \scratch_reg
+ movl \scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+.macro BUG_IF_WRONG_CR3 no_user_check=0
+#ifdef CONFIG_DEBUG_ENTRY
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ .if \no_user_check == 0
+ /* coming from usermode? */
+ testl $SEGMENT_RPL_MASK, PT_CS(%esp)
+ jz .Lend_\@
+ .endif
+ /* On user-cr3? */
+ movl %cr3, %eax
+ testl $PTI_SWITCH_MASK, %eax
+ jnz .Lend_\@
+ /* From userspace with kernel cr3 - BUG */
+ ud2
+.Lend_\@:
+#endif
+.endm
+
+/*
+ * Switch to kernel cr3 if not already loaded and return current cr3 in
+ * \scratch_reg
+ */
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ movl %cr3, \scratch_reg
+ /* Test if we are already on kernel CR3 */
+ testl $PTI_SWITCH_MASK, \scratch_reg
+ jz .Lend_\@
+ andl $(~PTI_SWITCH_MASK), \scratch_reg
+ movl \scratch_reg, %cr3
+ /* Return original CR3 in \scratch_reg */
+ orl $PTI_SWITCH_MASK, \scratch_reg
+.Lend_\@:
+.endm
+
+.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0
+ cld
+ PUSH_GS
+ pushl %fs
+ pushl %es
+ pushl %ds
+ pushl \pt_regs_ax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+ movl $(__USER_DS), %edx
+ movl %edx, %ds
+ movl %edx, %es
+ movl $(__KERNEL_PERCPU), %edx
+ movl %edx, %fs
+ SET_KERNEL_GS %edx
+
+ /* Switch to kernel stack if necessary */
+.if \switch_stacks > 0
+ SWITCH_TO_KERNEL_STACK
+.endif
+
+.endm
+
+.macro SAVE_ALL_NMI cr3_reg:req
+ SAVE_ALL
+
+ BUG_IF_WRONG_CR3
+
+ /*
+ * Now switch the CR3 when PTI is enabled.
+ *
+ * We can enter with either user or kernel cr3, the code will
+ * store the old cr3 in \cr3_reg and switches to the kernel cr3
+ * if necessary.
+ */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=\cr3_reg
+
+.Lend_\@:
+.endm
+
+.macro RESTORE_INT_REGS
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %eax
+.endm
+
+.macro RESTORE_REGS pop=0
+ RESTORE_INT_REGS
+1: popl %ds
+2: popl %es
+3: popl %fs
+ POP_GS \pop
+.pushsection .fixup, "ax"
+4: movl $0, (%esp)
+ jmp 1b
+5: movl $0, (%esp)
+ jmp 2b
+6: movl $0, (%esp)
+ jmp 3b
+.popsection
+ _ASM_EXTABLE(1b, 4b)
+ _ASM_EXTABLE(2b, 5b)
+ _ASM_EXTABLE(3b, 6b)
+ POP_GS_EX
+.endm
+
+.macro RESTORE_ALL_NMI cr3_reg:req pop=0
+ /*
+ * Now switch the CR3 when PTI is enabled.
+ *
+ * We enter with kernel cr3 and switch the cr3 to the value
+ * stored on \cr3_reg, which is either a user or a kernel cr3.
+ */
+ ALTERNATIVE "jmp .Lswitched_\@", "", X86_FEATURE_PTI
+
+ testl $PTI_SWITCH_MASK, \cr3_reg
+ jz .Lswitched_\@
+
+ /* User cr3 in \cr3_reg - write it to hardware cr3 */
+ movl \cr3_reg, %cr3
+
+.Lswitched_\@:
+
+ BUG_IF_WRONG_CR3
+
+ RESTORE_REGS pop=\pop
+.endm
+
+.macro CHECK_AND_APPLY_ESPFIX
+#ifdef CONFIG_X86_ESPFIX32
+#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
+
+ ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX
+
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
+ /*
+ * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
+ * are returning to the kernel.
+ * See comments in process.c:copy_thread() for details.
+ */
+ movb PT_OLDSS(%esp), %ah
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+ cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
+ jne .Lend_\@ # returning to user-space with LDT SS
+
+ /*
+ * Setup and switch to ESPFIX stack
+ *
+ * We're returning to userspace with a 16 bit stack. The CPU will not
+ * restore the high word of ESP for us on executing iret... This is an
+ * "official" bug of all the x86-compatible CPUs, which we can work
+ * around to make dosemu and wine happy. We do this by preloading the
+ * high word of ESP with the high word of the userspace ESP while
+ * compensating for the offset by changing to the ESPFIX segment with
+ * a base address that matches for the difference.
+ */
+ mov %esp, %edx /* load kernel esp */
+ mov PT_OLDESP(%esp), %eax /* load userspace esp */
+ mov %dx, %ax /* eax: new kernel esp */
+ sub %eax, %edx /* offset (low word is 0) */
+ shr $16, %edx
+ mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
+ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
+ pushl $__ESPFIX_SS
+ pushl %eax /* new kernel esp */
+ /*
+ * Disable interrupts, but do not irqtrace this section: we
+ * will soon execute iret and the tracer was already set to
+ * the irqstate after the IRET:
+ */
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ lss (%esp), %esp /* switch to espfix segment */
+.Lend_\@:
+#endif /* CONFIG_X86_ESPFIX32 */
+.endm
+
+/*
+ * Called with pt_regs fully populated and kernel segments loaded,
+ * so we can access PER_CPU and use the integer registers.
+ *
+ * We need to be very careful here with the %esp switch, because an NMI
+ * can happen everywhere. If the NMI handler finds itself on the
+ * entry-stack, it will overwrite the task-stack and everything we
+ * copied there. So allocate the stack-frame on the task-stack and
+ * switch to it before we do any copying.
+ */
+
+#define CS_FROM_ENTRY_STACK (1 << 31)
+#define CS_FROM_USER_CR3 (1 << 30)
+
+.macro SWITCH_TO_KERNEL_STACK
+
+ ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV
+
+ BUG_IF_WRONG_CR3
+
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
+
+ /*
+ * %eax now contains the entry cr3 and we carry it forward in
+ * that register for the time this macro runs
+ */
+
+ /*
+ * The high bits of the CS dword (__csh) are used for
+ * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case
+ * hardware didn't do this for us.
+ */
+ andl $(0x0000ffff), PT_CS(%esp)
+
+ /* Are we on the entry stack? Bail out if not! */
+ movl PER_CPU_VAR(cpu_entry_area), %ecx
+ addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+ subl %esp, %ecx /* ecx = (end of entry_stack) - esp */
+ cmpl $SIZEOF_entry_stack, %ecx
+ jae .Lend_\@
+
+ /* Load stack pointer into %esi and %edi */
+ movl %esp, %esi
+ movl %esi, %edi
+
+ /* Move %edi to the top of the entry stack */
+ andl $(MASK_entry_stack), %edi
+ addl $(SIZEOF_entry_stack), %edi
+
+ /* Load top of task-stack into %edi */
+ movl TSS_entry2task_stack(%edi), %edi
+
+ /* Special case - entry from kernel mode via entry stack */
+#ifdef CONFIG_VM86
+ movl PT_EFLAGS(%esp), %ecx # mix EFLAGS and CS
+ movb PT_CS(%esp), %cl
+ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %ecx
+#else
+ movl PT_CS(%esp), %ecx
+ andl $SEGMENT_RPL_MASK, %ecx
+#endif
+ cmpl $USER_RPL, %ecx
+ jb .Lentry_from_kernel_\@
+
+ /* Bytes to copy */
+ movl $PTREGS_SIZE, %ecx
+
+#ifdef CONFIG_VM86
+ testl $X86_EFLAGS_VM, PT_EFLAGS(%esi)
+ jz .Lcopy_pt_regs_\@
+
+ /*
+ * Stack-frame contains 4 additional segment registers when
+ * coming from VM86 mode
+ */
+ addl $(4 * 4), %ecx
+
+#endif
+.Lcopy_pt_regs_\@:
+
+ /* Allocate frame on task-stack */
+ subl %ecx, %edi
+
+ /* Switch to task-stack */
+ movl %edi, %esp
+
+ /*
+ * We are now on the task-stack and can safely copy over the
+ * stack-frame
+ */
+ shrl $2, %ecx
+ cld
+ rep movsl
+
+ jmp .Lend_\@
+
+.Lentry_from_kernel_\@:
+
+ /*
+ * This handles the case when we enter the kernel from
+ * kernel-mode and %esp points to the entry-stack. When this
+ * happens we need to switch to the task-stack to run C code,
+ * but switch back to the entry-stack again when we approach
+ * iret and return to the interrupted code-path. This usually
+ * happens when we hit an exception while restoring user-space
+ * segment registers on the way back to user-space or when the
+ * sysenter handler runs with eflags.tf set.
+ *
+ * When we switch to the task-stack here, we can't trust the
+ * contents of the entry-stack anymore, as the exception handler
+ * might be scheduled out or moved to another CPU. Therefore we
+ * copy the complete entry-stack to the task-stack and set a
+ * marker in the iret-frame (bit 31 of the CS dword) to detect
+ * what we've done on the iret path.
+ *
+ * On the iret path we copy everything back and switch to the
+ * entry-stack, so that the interrupted kernel code-path
+ * continues on the same stack it was interrupted with.
+ *
+ * Be aware that an NMI can happen anytime in this code.
+ *
+ * %esi: Entry-Stack pointer (same as %esp)
+ * %edi: Top of the task stack
+ * %eax: CR3 on kernel entry
+ */
+
+ /* Calculate number of bytes on the entry stack in %ecx */
+ movl %esi, %ecx
+
+ /* %ecx to the top of entry-stack */
+ andl $(MASK_entry_stack), %ecx
+ addl $(SIZEOF_entry_stack), %ecx
+
+ /* Number of bytes on the entry stack to %ecx */
+ sub %esi, %ecx
+
+ /* Mark stackframe as coming from entry stack */
+ orl $CS_FROM_ENTRY_STACK, PT_CS(%esp)
+
+ /*
+ * Test the cr3 used to enter the kernel and add a marker
+ * so that we can switch back to it before iret.
+ */
+ testl $PTI_SWITCH_MASK, %eax
+ jz .Lcopy_pt_regs_\@
+ orl $CS_FROM_USER_CR3, PT_CS(%esp)
+
+ /*
+ * %esi and %edi are unchanged, %ecx contains the number of
+ * bytes to copy. The code at .Lcopy_pt_regs_\@ will allocate
+ * the stack-frame on task-stack and copy everything over
+ */
+ jmp .Lcopy_pt_regs_\@
+
+.Lend_\@:
+.endm
+
+/*
+ * Switch back from the kernel stack to the entry stack.
+ *
+ * The %esp register must point to pt_regs on the task stack. It will
+ * first calculate the size of the stack-frame to copy, depending on
+ * whether we return to VM86 mode or not. With that it uses 'rep movsl'
+ * to copy the contents of the stack over to the entry stack.
+ *
+ * We must be very careful here, as we can't trust the contents of the
+ * task-stack once we switched to the entry-stack. When an NMI happens
+ * while on the entry-stack, the NMI handler will switch back to the top
+ * of the task stack, overwriting our stack-frame we are about to copy.
+ * Therefore we switch the stack only after everything is copied over.
+ */
+.macro SWITCH_TO_ENTRY_STACK
+
+ ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV
+
+ /* Bytes to copy */
+ movl $PTREGS_SIZE, %ecx
+
+#ifdef CONFIG_VM86
+ testl $(X86_EFLAGS_VM), PT_EFLAGS(%esp)
+ jz .Lcopy_pt_regs_\@
+
+ /* Additional 4 registers to copy when returning to VM86 mode */
+ addl $(4 * 4), %ecx
+
+.Lcopy_pt_regs_\@:
+#endif
+
+ /* Initialize source and destination for movsl */
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
+ subl %ecx, %edi
+ movl %esp, %esi
+
+ /* Save future stack pointer in %ebx */
+ movl %edi, %ebx
+
+ /* Copy over the stack-frame */
+ shrl $2, %ecx
+ cld
+ rep movsl
+
+ /*
+ * Switch to entry-stack - needs to happen after everything is
+ * copied because the NMI handler will overwrite the task-stack
+ * when on entry-stack
+ */
+ movl %ebx, %esp
+
+.Lend_\@:
+.endm
+
+/*
+ * This macro handles the case when we return to kernel-mode on the iret
+ * path and have to switch back to the entry stack and/or user-cr3
+ *
+ * See the comments below the .Lentry_from_kernel_\@ label in the
+ * SWITCH_TO_KERNEL_STACK macro for more details.
+ */
+.macro PARANOID_EXIT_TO_KERNEL_MODE
+
+ /*
+ * Test if we entered the kernel with the entry-stack. Most
+ * likely we did not, because this code only runs on the
+ * return-to-kernel path.
+ */
+ testl $CS_FROM_ENTRY_STACK, PT_CS(%esp)
+ jz .Lend_\@
+
+ /* Unlikely slow-path */
+
+ /* Clear marker from stack-frame */
+ andl $(~CS_FROM_ENTRY_STACK), PT_CS(%esp)
+
+ /* Copy the remaining task-stack contents to entry-stack */
+ movl %esp, %esi
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
+
+ /* Bytes on the task-stack to ecx */
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx
+ subl %esi, %ecx
+
+ /* Allocate stack-frame on entry-stack */
+ subl %ecx, %edi
+
+ /*
+ * Save future stack-pointer, we must not switch until the
+ * copy is done, otherwise the NMI handler could destroy the
+ * contents of the task-stack we are about to copy.
+ */
+ movl %edi, %ebx
+
+ /* Do the copy */
+ shrl $2, %ecx
+ cld
+ rep movsl
+
+ /* Safe to switch to entry-stack now */
+ movl %ebx, %esp
+
+ /*
+ * We came from entry-stack and need to check if we also need to
+ * switch back to user cr3.
+ */
+ testl $CS_FROM_USER_CR3, PT_CS(%esp)
+ jz .Lend_\@
+
+ /* Clear marker from stack-frame */
+ andl $(~CS_FROM_USER_CR3), PT_CS(%esp)
+
+ SWITCH_TO_USER_CR3 scratch_reg=%eax
+
+.Lend_\@:
+.endm
+/*
+ * %eax: prev task
+ * %edx: next task
+ */
+ENTRY(__switch_to_asm)
+ /*
+ * Save callee-saved registers
+ * This must match the order in struct inactive_task_frame
+ */
+ pushl %ebp
+ pushl %ebx
+ pushl %edi
+ pushl %esi
+ pushfl
+
+ /* switch stack */
+ movl %esp, TASK_threadsp(%eax)
+ movl TASK_threadsp(%edx), %esp
+
+#ifdef CONFIG_STACKPROTECTOR
+ movl TASK_stack_canary(%edx), %ebx
+ movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
+#endif
+
+#ifdef CONFIG_RETPOLINE
+ /*
+ * When switching from a shallower to a deeper call stack
+ * the RSB may either underflow or use entries populated
+ * with userspace addresses. On CPUs where those concerns
+ * exist, overwrite the RSB with entries which capture
+ * speculative execution to prevent attack.
+ */
+ FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+#endif
+
+ /* restore callee-saved registers */
+ popfl
+ popl %esi
+ popl %edi
+ popl %ebx
+ popl %ebp
+
+ jmp __switch_to
+END(__switch_to_asm)
+
+/*
+ * The unwinder expects the last frame on the stack to always be at the same
+ * offset from the end of the page, which allows it to validate the stack.
+ * Calling schedule_tail() directly would break that convention because its an
+ * asmlinkage function so its argument has to be pushed on the stack. This
+ * wrapper creates a proper "end of stack" frame header before the call.
+ */
+ENTRY(schedule_tail_wrapper)
+ FRAME_BEGIN
+
+ pushl %eax
+ call schedule_tail
+ popl %eax
+
+ FRAME_END
+ ret
+ENDPROC(schedule_tail_wrapper)
+/*
+ * A newly forked process directly context switches into this address.
+ *
+ * eax: prev task we switched from
+ * ebx: kernel thread func (NULL for user thread)
+ * edi: kernel thread arg
+ */
+ENTRY(ret_from_fork)
+ call schedule_tail_wrapper
+
+ testl %ebx, %ebx
+ jnz 1f /* kernel threads are uncommon */
+
+2:
+ /* When we fork, we trace the syscall return in the child, too. */
+ movl %esp, %eax
+ call syscall_return_slowpath
+ jmp restore_all
+
+ /* kernel thread */
+1: movl %edi, %eax
+ CALL_NOSPEC %ebx
+ /*
+ * A kernel thread is allowed to return here after successfully
+ * calling do_execve(). Exit to userspace to complete the execve()
+ * syscall.
+ */
+ movl $0, PT_EAX(%esp)
+ jmp 2b
+END(ret_from_fork)
+
+/*
+ * Return to user mode is not as complex as all this looks,
+ * but we want the default path for a system call return to
+ * go as quickly as possible which is why some of this is
+ * less clear than it otherwise should be.
+ */
+
+ # userspace resumption stub bypassing syscall exit tracing
+ ALIGN
+ret_from_exception:
+ preempt_stop(CLBR_ANY)
+ret_from_intr:
+#ifdef CONFIG_VM86
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
+#else
+ /*
+ * We can be coming here from child spawned by kernel_thread().
+ */
+ movl PT_CS(%esp), %eax
+ andl $SEGMENT_RPL_MASK, %eax
+#endif
+ cmpl $USER_RPL, %eax
+ jb resume_kernel # not returning to v8086 or userspace
+
+ENTRY(resume_userspace)
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_OFF
+ movl %esp, %eax
+ call prepare_exit_to_usermode
+ jmp restore_all
+END(ret_from_exception)
+
+#ifdef CONFIG_PREEMPT
+ENTRY(resume_kernel)
+ DISABLE_INTERRUPTS(CLBR_ANY)
+.Lneed_resched:
+ cmpl $0, PER_CPU_VAR(__preempt_count)
+ jnz restore_all_kernel
+ testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
+ jz restore_all_kernel
+ call preempt_schedule_irq
+ jmp .Lneed_resched
+END(resume_kernel)
+#endif
+
+GLOBAL(__begin_SYSENTER_singlestep_region)
+/*
+ * All code from here through __end_SYSENTER_singlestep_region is subject
+ * to being single-stepped if a user program sets TF and executes SYSENTER.
+ * There is absolutely nothing that we can do to prevent this from happening
+ * (thanks Intel!). To keep our handling of this situation as simple as
+ * possible, we handle TF just like AC and NT, except that our #DB handler
+ * will ignore all of the single-step traps generated in this range.
+ */
+
+#ifdef CONFIG_XEN
+/*
+ * Xen doesn't set %esp to be precisely what the normal SYSENTER
+ * entry point expects, so fix it up before using the normal path.
+ */
+ENTRY(xen_sysenter_target)
+ addl $5*4, %esp /* remove xen-provided frame */
+ jmp .Lsysenter_past_esp
+#endif
+
+/*
+ * 32-bit SYSENTER entry.
+ *
+ * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
+ * if X86_FEATURE_SEP is available. This is the preferred system call
+ * entry on 32-bit systems.
+ *
+ * The SYSENTER instruction, in principle, should *only* occur in the
+ * vDSO. In practice, a small number of Android devices were shipped
+ * with a copy of Bionic that inlined a SYSENTER instruction. This
+ * never happened in any of Google's Bionic versions -- it only happened
+ * in a narrow range of Intel-provided versions.
+ *
+ * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs.
+ * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
+ * SYSENTER does not save anything on the stack,
+ * and does not save old EIP (!!!), ESP, or EFLAGS.
+ *
+ * To avoid losing track of EFLAGS.VM (and thus potentially corrupting
+ * user and/or vm86 state), we explicitly disable the SYSENTER
+ * instruction in vm86 mode by reprogramming the MSRs.
+ *
+ * Arguments:
+ * eax system call number
+ * ebx arg1
+ * ecx arg2
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * ebp user stack
+ * 0(%ebp) arg6
+ */
+ENTRY(entry_SYSENTER_32)
+ /*
+ * On entry-stack with all userspace-regs live - save and
+ * restore eflags and %eax to use it as scratch-reg for the cr3
+ * switch.
+ */
+ pushfl
+ pushl %eax
+ BUG_IF_WRONG_CR3 no_user_check=1
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
+ popl %eax
+ popfl
+
+ /* Stack empty again, switch to task stack */
+ movl TSS_entry2task_stack(%esp), %esp
+
+.Lsysenter_past_esp:
+ pushl $__USER_DS /* pt_regs->ss */
+ pushl %ebp /* pt_regs->sp (stashed in bp) */
+ pushfl /* pt_regs->flags (except IF = 0) */
+ orl $X86_EFLAGS_IF, (%esp) /* Fix IF */
+ pushl $__USER_CS /* pt_regs->cs */
+ pushl $0 /* pt_regs->ip = 0 (placeholder) */
+ pushl %eax /* pt_regs->orig_ax */
+ SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest, stack already switched */
+
+ /*
+ * SYSENTER doesn't filter flags, so we need to clear NT, AC
+ * and TF ourselves. To save a few cycles, we can check whether
+ * either was set instead of doing an unconditional popfq.
+ * This needs to happen before enabling interrupts so that
+ * we don't get preempted with NT set.
+ *
+ * If TF is set, we will single-step all the way to here -- do_debug
+ * will ignore all the traps. (Yes, this is slow, but so is
+ * single-stepping in general. This allows us to avoid having
+ * a more complicated code to handle the case where a user program
+ * forces us to single-step through the SYSENTER entry code.)
+ *
+ * NB.: .Lsysenter_fix_flags is a label with the code under it moved
+ * out-of-line as an optimization: NT is unlikely to be set in the
+ * majority of the cases and instead of polluting the I$ unnecessarily,
+ * we're keeping that code behind a branch which will predict as
+ * not-taken and therefore its instructions won't be fetched.
+ */
+ testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp)
+ jnz .Lsysenter_fix_flags
+.Lsysenter_flags_fixed:
+
+ /*
+ * User mode is traced as though IRQs are on, and SYSENTER
+ * turned them off.
+ */
+ TRACE_IRQS_OFF
+
+ movl %esp, %eax
+ call do_fast_syscall_32
+ /* XEN PV guests always use IRET path */
+ ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
+ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
+
+/* Opportunistic SYSEXIT */
+ TRACE_IRQS_ON /* User mode traces as IRQs on. */
+
+ /*
+ * Setup entry stack - we keep the pointer in %eax and do the
+ * switch after almost all user-state is restored.
+ */
+
+ /* Load entry stack pointer and allocate frame for eflags/eax */
+ movl PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %eax
+ subl $(2*4), %eax
+
+ /* Copy eflags and eax to entry stack */
+ movl PT_EFLAGS(%esp), %edi
+ movl PT_EAX(%esp), %esi
+ movl %edi, (%eax)
+ movl %esi, 4(%eax)
+
+ /* Restore user registers and segments */
+ movl PT_EIP(%esp), %edx /* pt_regs->ip */
+ movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */
+1: mov PT_FS(%esp), %fs
+ PTGS_TO_GS
+
+ popl %ebx /* pt_regs->bx */
+ addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */
+ popl %esi /* pt_regs->si */
+ popl %edi /* pt_regs->di */
+ popl %ebp /* pt_regs->bp */
+
+ /* Switch to entry stack */
+ movl %eax, %esp
+
+ /* Now ready to switch the cr3 */
+ SWITCH_TO_USER_CR3 scratch_reg=%eax
+
+ /*
+ * Restore all flags except IF. (We restore IF separately because
+ * STI gives a one-instruction window in which we won't be interrupted,
+ * whereas POPF does not.)
+ */
+ btrl $X86_EFLAGS_IF_BIT, (%esp)
+ BUG_IF_WRONG_CR3 no_user_check=1
+ popfl
+ popl %eax
+
+ /*
+ * Return back to the vDSO, which will pop ecx and edx.
+ * Don't bother with DS and ES (they already contain __USER_DS).
+ */
+ sti
+ sysexit
+
+.pushsection .fixup, "ax"
+2: movl $0, PT_FS(%esp)
+ jmp 1b
+.popsection
+ _ASM_EXTABLE(1b, 2b)
+ PTGS_TO_GS_EX
+
+.Lsysenter_fix_flags:
+ pushl $X86_EFLAGS_FIXED
+ popfl
+ jmp .Lsysenter_flags_fixed
+GLOBAL(__end_SYSENTER_singlestep_region)
+ENDPROC(entry_SYSENTER_32)
+
+/*
+ * 32-bit legacy system call entry.
+ *
+ * 32-bit x86 Linux system calls traditionally used the INT $0x80
+ * instruction. INT $0x80 lands here.
+ *
+ * This entry point can be used by any 32-bit perform system calls.
+ * Instances of INT $0x80 can be found inline in various programs and
+ * libraries. It is also used by the vDSO's __kernel_vsyscall
+ * fallback for hardware that doesn't support a faster entry method.
+ * Restarted 32-bit system calls also fall back to INT $0x80
+ * regardless of what instruction was originally used to do the system
+ * call. (64-bit programs can use INT $0x80 as well, but they can
+ * only run on 64-bit kernels and therefore land in
+ * entry_INT80_compat.)
+ *
+ * This is considered a slow path. It is not used by most libc
+ * implementations on modern hardware except during process startup.
+ *
+ * Arguments:
+ * eax system call number
+ * ebx arg1
+ * ecx arg2
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * ebp arg6
+ */
+ENTRY(entry_INT80_32)
+ ASM_CLAC
+ pushl %eax /* pt_regs->orig_ax */
+
+ SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1 /* save rest */
+
+ /*
+ * User mode is traced as though IRQs are on, and the interrupt gate
+ * turned them off.
+ */
+ TRACE_IRQS_OFF
+
+ movl %esp, %eax
+ call do_int80_syscall_32
+.Lsyscall_32_done:
+
+restore_all:
+ TRACE_IRQS_IRET
+ SWITCH_TO_ENTRY_STACK
+.Lrestore_all_notrace:
+ CHECK_AND_APPLY_ESPFIX
+.Lrestore_nocheck:
+ /* Switch back to user CR3 */
+ SWITCH_TO_USER_CR3 scratch_reg=%eax
+
+ BUG_IF_WRONG_CR3
+
+ /* Restore user state */
+ RESTORE_REGS pop=4 # skip orig_eax/error_code
+.Lirq_return:
+ /*
+ * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
+ * when returning from IPI handler and when returning from
+ * scheduler to user-space.
+ */
+ INTERRUPT_RETURN
+
+restore_all_kernel:
+ TRACE_IRQS_IRET
+ PARANOID_EXIT_TO_KERNEL_MODE
+ BUG_IF_WRONG_CR3
+ RESTORE_REGS 4
+ jmp .Lirq_return
+
+.section .fixup, "ax"
+ENTRY(iret_exc )
+ pushl $0 # no error code
+ pushl $do_iret_error
+
+#ifdef CONFIG_DEBUG_ENTRY
+ /*
+ * The stack-frame here is the one that iret faulted on, so its a
+ * return-to-user frame. We are on kernel-cr3 because we come here from
+ * the fixup code. This confuses the CR3 checker, so switch to user-cr3
+ * as the checker expects it.
+ */
+ pushl %eax
+ SWITCH_TO_USER_CR3 scratch_reg=%eax
+ popl %eax
+#endif
+
+ jmp common_exception
+.previous
+ _ASM_EXTABLE(.Lirq_return, iret_exc)
+ENDPROC(entry_INT80_32)
+
+.macro FIXUP_ESPFIX_STACK
+/*
+ * Switch back for ESPFIX stack to the normal zerobased stack
+ *
+ * We can't call C functions using the ESPFIX stack. This code reads
+ * the high word of the segment base from the GDT and swiches to the
+ * normal stack and adjusts ESP with the matching offset.
+ */
+#ifdef CONFIG_X86_ESPFIX32
+ /* fixup the stack */
+ mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
+ mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
+ shl $16, %eax
+ addl %esp, %eax /* the adjusted stack pointer */
+ pushl $__KERNEL_DS
+ pushl %eax
+ lss (%esp), %esp /* switch to the normal stack segment */
+#endif
+.endm
+.macro UNWIND_ESPFIX_STACK
+#ifdef CONFIG_X86_ESPFIX32
+ movl %ss, %eax
+ /* see if on espfix stack */
+ cmpw $__ESPFIX_SS, %ax
+ jne 27f
+ movl $__KERNEL_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ /* switch to normal stack */
+ FIXUP_ESPFIX_STACK
+27:
+#endif
+.endm
+
+/*
+ * Build the entry stubs with some assembler magic.
+ * We pack 1 stub into every 8-byte block.
+ */
+ .align 8
+ENTRY(irq_entries_start)
+ vector=FIRST_EXTERNAL_VECTOR
+ .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+ pushl $(~vector+0x80) /* Note: always in signed byte range */
+ vector=vector+1
+ jmp common_interrupt
+ .align 8
+ .endr
+END(irq_entries_start)
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ .align 8
+ENTRY(spurious_entries_start)
+ vector=FIRST_SYSTEM_VECTOR
+ .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
+ pushl $(~vector+0x80) /* Note: always in signed byte range */
+ vector=vector+1
+ jmp common_spurious
+ .align 8
+ .endr
+END(spurious_entries_start)
+
+common_spurious:
+ ASM_CLAC
+ addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
+ SAVE_ALL switch_stacks=1
+ ENCODE_FRAME_POINTER
+ TRACE_IRQS_OFF
+ movl %esp, %eax
+ call smp_spurious_interrupt
+ jmp ret_from_intr
+ENDPROC(common_spurious)
+#endif
+
+/*
+ * the CPU automatically disables interrupts when executing an IRQ vector,
+ * so IRQ-flags tracing has to follow that:
+ */
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
+common_interrupt:
+ ASM_CLAC
+ addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
+
+ SAVE_ALL switch_stacks=1
+ ENCODE_FRAME_POINTER
+ TRACE_IRQS_OFF
+ movl %esp, %eax
+ call do_IRQ
+ jmp ret_from_intr
+ENDPROC(common_interrupt)
+
+#define BUILD_INTERRUPT3(name, nr, fn) \
+ENTRY(name) \
+ ASM_CLAC; \
+ pushl $~(nr); \
+ SAVE_ALL switch_stacks=1; \
+ ENCODE_FRAME_POINTER; \
+ TRACE_IRQS_OFF \
+ movl %esp, %eax; \
+ call fn; \
+ jmp ret_from_intr; \
+ENDPROC(name)
+
+#define BUILD_INTERRUPT(name, nr) \
+ BUILD_INTERRUPT3(name, nr, smp_##name); \
+
+/* The include is where all of the SMP etc. interrupts come from */
+#include <asm/entry_arch.h>
+
+ENTRY(coprocessor_error)
+ ASM_CLAC
+ pushl $0
+ pushl $do_coprocessor_error
+ jmp common_exception
+END(coprocessor_error)
+
+ENTRY(simd_coprocessor_error)
+ ASM_CLAC
+ pushl $0
+#ifdef CONFIG_X86_INVD_BUG
+ /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
+ ALTERNATIVE "pushl $do_general_protection", \
+ "pushl $do_simd_coprocessor_error", \
+ X86_FEATURE_XMM
+#else
+ pushl $do_simd_coprocessor_error
+#endif
+ jmp common_exception
+END(simd_coprocessor_error)
+
+ENTRY(device_not_available)
+ ASM_CLAC
+ pushl $-1 # mark this as an int
+ pushl $do_device_not_available
+ jmp common_exception
+END(device_not_available)
+
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_iret)
+ iret
+ _ASM_EXTABLE(native_iret, iret_exc)
+END(native_iret)
+#endif
+
+ENTRY(overflow)
+ ASM_CLAC
+ pushl $0
+ pushl $do_overflow
+ jmp common_exception
+END(overflow)
+
+ENTRY(bounds)
+ ASM_CLAC
+ pushl $0
+ pushl $do_bounds
+ jmp common_exception
+END(bounds)
+
+ENTRY(invalid_op)
+ ASM_CLAC
+ pushl $0
+ pushl $do_invalid_op
+ jmp common_exception
+END(invalid_op)
+
+ENTRY(coprocessor_segment_overrun)
+ ASM_CLAC
+ pushl $0
+ pushl $do_coprocessor_segment_overrun
+ jmp common_exception
+END(coprocessor_segment_overrun)
+
+ENTRY(invalid_TSS)
+ ASM_CLAC
+ pushl $do_invalid_TSS
+ jmp common_exception
+END(invalid_TSS)
+
+ENTRY(segment_not_present)
+ ASM_CLAC
+ pushl $do_segment_not_present
+ jmp common_exception
+END(segment_not_present)
+
+ENTRY(stack_segment)
+ ASM_CLAC
+ pushl $do_stack_segment
+ jmp common_exception
+END(stack_segment)
+
+ENTRY(alignment_check)
+ ASM_CLAC
+ pushl $do_alignment_check
+ jmp common_exception
+END(alignment_check)
+
+ENTRY(divide_error)
+ ASM_CLAC
+ pushl $0 # no error code
+ pushl $do_divide_error
+ jmp common_exception
+END(divide_error)
+
+#ifdef CONFIG_X86_MCE
+ENTRY(machine_check)
+ ASM_CLAC
+ pushl $0
+ pushl machine_check_vector
+ jmp common_exception
+END(machine_check)
+#endif
+
+ENTRY(spurious_interrupt_bug)
+ ASM_CLAC
+ pushl $0
+ pushl $do_spurious_interrupt_bug
+ jmp common_exception
+END(spurious_interrupt_bug)
+
+#ifdef CONFIG_XEN
+ENTRY(xen_hypervisor_callback)
+ pushl $-1 /* orig_ax = -1 => not a system call */
+ SAVE_ALL
+ ENCODE_FRAME_POINTER
+ TRACE_IRQS_OFF
+
+ /*
+ * Check to see if we got the event in the critical
+ * region in xen_iret_direct, after we've reenabled
+ * events and checked for pending events. This simulates
+ * iret instruction's behaviour where it delivers a
+ * pending interrupt when enabling interrupts:
+ */
+ movl PT_EIP(%esp), %eax
+ cmpl $xen_iret_start_crit, %eax
+ jb 1f
+ cmpl $xen_iret_end_crit, %eax
+ jae 1f
+
+ jmp xen_iret_crit_fixup
+
+ENTRY(xen_do_upcall)
+1: mov %esp, %eax
+ call xen_evtchn_do_upcall
+#ifndef CONFIG_PREEMPT
+ call xen_maybe_preempt_hcall
+#endif
+ jmp ret_from_intr
+ENDPROC(xen_hypervisor_callback)
+
+/*
+ * Hypervisor uses this for application faults while it executes.
+ * We get here for two reasons:
+ * 1. Fault while reloading DS, ES, FS or GS
+ * 2. Fault while executing IRET
+ * Category 1 we fix up by reattempting the load, and zeroing the segment
+ * register if the load fails.
+ * Category 2 we fix up by jumping to do_iret_error. We cannot use the
+ * normal Linux return path in this case because if we use the IRET hypercall
+ * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+ * We distinguish between categories by maintaining a status value in EAX.
+ */
+ENTRY(xen_failsafe_callback)
+ pushl %eax
+ movl $1, %eax
+1: mov 4(%esp), %ds
+2: mov 8(%esp), %es
+3: mov 12(%esp), %fs
+4: mov 16(%esp), %gs
+ /* EAX == 0 => Category 1 (Bad segment)
+ EAX != 0 => Category 2 (Bad IRET) */
+ testl %eax, %eax
+ popl %eax
+ lea 16(%esp), %esp
+ jz 5f
+ jmp iret_exc
+5: pushl $-1 /* orig_ax = -1 => not a system call */
+ SAVE_ALL
+ ENCODE_FRAME_POINTER
+ jmp ret_from_exception
+
+.section .fixup, "ax"
+6: xorl %eax, %eax
+ movl %eax, 4(%esp)
+ jmp 1b
+7: xorl %eax, %eax
+ movl %eax, 8(%esp)
+ jmp 2b
+8: xorl %eax, %eax
+ movl %eax, 12(%esp)
+ jmp 3b
+9: xorl %eax, %eax
+ movl %eax, 16(%esp)
+ jmp 4b
+.previous
+ _ASM_EXTABLE(1b, 6b)
+ _ASM_EXTABLE(2b, 7b)
+ _ASM_EXTABLE(3b, 8b)
+ _ASM_EXTABLE(4b, 9b)
+ENDPROC(xen_failsafe_callback)
+
+BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
+ xen_evtchn_do_upcall)
+
+#endif /* CONFIG_XEN */
+
+#if IS_ENABLED(CONFIG_HYPERV)
+
+BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
+ hyperv_vector_handler)
+
+BUILD_INTERRUPT3(hyperv_reenlightenment_vector, HYPERV_REENLIGHTENMENT_VECTOR,
+ hyperv_reenlightenment_intr)
+
+BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR,
+ hv_stimer0_vector_handler)
+
+#endif /* CONFIG_HYPERV */
+
+ENTRY(page_fault)
+ ASM_CLAC
+ pushl $do_page_fault
+ ALIGN
+ jmp common_exception
+END(page_fault)
+
+common_exception:
+ /* the function address is in %gs's slot on the stack */
+ pushl %fs
+ pushl %es
+ pushl %ds
+ pushl %eax
+ movl $(__USER_DS), %eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl $(__KERNEL_PERCPU), %eax
+ movl %eax, %fs
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+ SWITCH_TO_KERNEL_STACK
+ ENCODE_FRAME_POINTER
+ cld
+ UNWIND_ESPFIX_STACK
+ GS_TO_REG %ecx
+ movl PT_GS(%esp), %edi # get the function address
+ movl PT_ORIG_EAX(%esp), %edx # get the error code
+ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
+ REG_TO_PTGS %ecx
+ SET_KERNEL_GS %ecx
+ TRACE_IRQS_OFF
+ movl %esp, %eax # pt_regs pointer
+ CALL_NOSPEC %edi
+ jmp ret_from_exception
+END(common_exception)
+
+ENTRY(debug)
+ /*
+ * Entry from sysenter is now handled in common_exception
+ */
+ ASM_CLAC
+ pushl $-1 # mark this as an int
+ pushl $do_debug
+ jmp common_exception
+END(debug)
+
+/*
+ * NMI is doubly nasty. It can happen on the first instruction of
+ * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
+ * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32
+ * switched stacks. We handle both conditions by simply checking whether we
+ * interrupted kernel code running on the SYSENTER stack.
+ */
+ENTRY(nmi)
+ ASM_CLAC
+
+#ifdef CONFIG_X86_ESPFIX32
+ pushl %eax
+ movl %ss, %eax
+ cmpw $__ESPFIX_SS, %ax
+ popl %eax
+ je .Lnmi_espfix_stack
+#endif
+
+ pushl %eax # pt_regs->orig_ax
+ SAVE_ALL_NMI cr3_reg=%edi
+ ENCODE_FRAME_POINTER
+ xorl %edx, %edx # zero error code
+ movl %esp, %eax # pt_regs pointer
+
+ /* Are we currently on the SYSENTER stack? */
+ movl PER_CPU_VAR(cpu_entry_area), %ecx
+ addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+ subl %eax, %ecx /* ecx = (end of entry_stack) - esp */
+ cmpl $SIZEOF_entry_stack, %ecx
+ jb .Lnmi_from_sysenter_stack
+
+ /* Not on SYSENTER stack. */
+ call do_nmi
+ jmp .Lnmi_return
+
+.Lnmi_from_sysenter_stack:
+ /*
+ * We're on the SYSENTER stack. Switch off. No one (not even debug)
+ * is using the thread stack right now, so it's safe for us to use it.
+ */
+ movl %esp, %ebx
+ movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
+ call do_nmi
+ movl %ebx, %esp
+
+.Lnmi_return:
+ CHECK_AND_APPLY_ESPFIX
+ RESTORE_ALL_NMI cr3_reg=%edi pop=4
+ jmp .Lirq_return
+
+#ifdef CONFIG_X86_ESPFIX32
+.Lnmi_espfix_stack:
+ /*
+ * create the pointer to lss back
+ */
+ pushl %ss
+ pushl %esp
+ addl $4, (%esp)
+ /* copy the iret frame of 12 bytes */
+ .rept 3
+ pushl 16(%esp)
+ .endr
+ pushl %eax
+ SAVE_ALL_NMI cr3_reg=%edi
+ ENCODE_FRAME_POINTER
+ FIXUP_ESPFIX_STACK # %eax == %esp
+ xorl %edx, %edx # zero error code
+ call do_nmi
+ RESTORE_ALL_NMI cr3_reg=%edi
+ lss 12+4(%esp), %esp # back to espfix stack
+ jmp .Lirq_return
+#endif
+END(nmi)
+
+ENTRY(int3)
+ ASM_CLAC
+ pushl $-1 # mark this as an int
+
+ SAVE_ALL switch_stacks=1
+ ENCODE_FRAME_POINTER
+ TRACE_IRQS_OFF
+ xorl %edx, %edx # zero error code
+ movl %esp, %eax # pt_regs pointer
+ call do_int3
+ jmp ret_from_exception
+END(int3)
+
+ENTRY(general_protection)
+ ASM_CLAC
+ pushl $do_general_protection
+ jmp common_exception
+END(general_protection)
+
+#ifdef CONFIG_KVM_GUEST
+ENTRY(async_page_fault)
+ ASM_CLAC
+ pushl $do_async_page_fault
+ jmp common_exception
+END(async_page_fault)
+#endif
+
+ENTRY(rewind_stack_do_exit)
+ /* Prevent any naive code from trying to unwind to our caller. */
+ xorl %ebp, %ebp
+
+ movl PER_CPU_VAR(cpu_current_top_of_stack), %esi
+ leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp
+
+ call do_exit
+1: jmp 1b
+END(rewind_stack_do_exit)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
new file mode 100644
index 000000000..dfe26f3cf
--- /dev/null
+++ b/arch/x86/entry/entry_64.S
@@ -0,0 +1,1751 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * linux/arch/x86_64/entry.S
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
+ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
+ *
+ * entry.S contains the system-call and fault low-level handling routines.
+ *
+ * Some of this is documented in Documentation/x86/entry_64.txt
+ *
+ * A note on terminology:
+ * - iret frame: Architecture defined interrupt frame from SS to RIP
+ * at the top of the kernel process stack.
+ *
+ * Some macro usage:
+ * - ENTRY/END: Define functions in the symbol table.
+ * - TRACE_IRQ_*: Trace hardirq state for lock debugging.
+ * - idtentry: Define exception entry points.
+ */
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/cache.h>
+#include <asm/errno.h>
+#include <asm/asm-offsets.h>
+#include <asm/msr.h>
+#include <asm/unistd.h>
+#include <asm/thread_info.h>
+#include <asm/hw_irq.h>
+#include <asm/page_types.h>
+#include <asm/irqflags.h>
+#include <asm/paravirt.h>
+#include <asm/percpu.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+#include <asm/pgtable_types.h>
+#include <asm/export.h>
+#include <asm/frame.h>
+#include <asm/nospec-branch.h>
+#include <linux/err.h>
+
+#include "calling.h"
+
+.code64
+.section .entry.text, "ax"
+
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_usergs_sysret64)
+ UNWIND_HINT_EMPTY
+ swapgs
+ sysretq
+END(native_usergs_sysret64)
+#endif /* CONFIG_PARAVIRT */
+
+.macro TRACE_IRQS_FLAGS flags:req
+#ifdef CONFIG_TRACE_IRQFLAGS
+ btl $9, \flags /* interrupts off? */
+ jnc 1f
+ TRACE_IRQS_ON
+1:
+#endif
+.endm
+
+.macro TRACE_IRQS_IRETQ
+ TRACE_IRQS_FLAGS EFLAGS(%rsp)
+.endm
+
+/*
+ * When dynamic function tracer is enabled it will add a breakpoint
+ * to all locations that it is about to modify, sync CPUs, update
+ * all the code, sync CPUs, then remove the breakpoints. In this time
+ * if lockdep is enabled, it might jump back into the debug handler
+ * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
+ *
+ * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
+ * make sure the stack pointer does not get reset back to the top
+ * of the debug stack, and instead just reuses the current stack.
+ */
+#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
+
+.macro TRACE_IRQS_OFF_DEBUG
+ call debug_stack_set_zero
+ TRACE_IRQS_OFF
+ call debug_stack_reset
+.endm
+
+.macro TRACE_IRQS_ON_DEBUG
+ call debug_stack_set_zero
+ TRACE_IRQS_ON
+ call debug_stack_reset
+.endm
+
+.macro TRACE_IRQS_IRETQ_DEBUG
+ btl $9, EFLAGS(%rsp) /* interrupts off? */
+ jnc 1f
+ TRACE_IRQS_ON_DEBUG
+1:
+.endm
+
+#else
+# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
+# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
+# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
+#endif
+
+/*
+ * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
+ *
+ * This is the only entry point used for 64-bit system calls. The
+ * hardware interface is reasonably well designed and the register to
+ * argument mapping Linux uses fits well with the registers that are
+ * available when SYSCALL is used.
+ *
+ * SYSCALL instructions can be found inlined in libc implementations as
+ * well as some other programs and libraries. There are also a handful
+ * of SYSCALL instructions in the vDSO used, for example, as a
+ * clock_gettimeofday fallback.
+ *
+ * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * then loads new ss, cs, and rip from previously programmed MSRs.
+ * rflags gets masked by a value from another MSR (so CLD and CLAC
+ * are not needed). SYSCALL does not save anything on the stack
+ * and does not change rsp.
+ *
+ * Registers on entry:
+ * rax system call number
+ * rcx return address
+ * r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
+ * rdi arg0
+ * rsi arg1
+ * rdx arg2
+ * r10 arg3 (needs to be moved to rcx to conform to C ABI)
+ * r8 arg4
+ * r9 arg5
+ * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
+ *
+ * Only called from user space.
+ *
+ * When user can change pt_regs->foo always force IRET. That is because
+ * it deals with uncanonical addresses better. SYSRET has trouble
+ * with them due to bugs in both AMD and Intel CPUs.
+ */
+
+ .pushsection .entry_trampoline, "ax"
+
+/*
+ * The code in here gets remapped into cpu_entry_area's trampoline. This means
+ * that the assembler and linker have the wrong idea as to where this code
+ * lives (and, in fact, it's mapped more than once, so it's not even at a
+ * fixed address). So we can't reference any symbols outside the entry
+ * trampoline and expect it to work.
+ *
+ * Instead, we carefully abuse %rip-relative addressing.
+ * _entry_trampoline(%rip) refers to the start of the remapped) entry
+ * trampoline. We can thus find cpu_entry_area with this macro:
+ */
+
+#define CPU_ENTRY_AREA \
+ _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
+
+/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
+#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \
+ SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
+
+ENTRY(entry_SYSCALL_64_trampoline)
+ UNWIND_HINT_EMPTY
+ swapgs
+
+ /* Stash the user RSP. */
+ movq %rsp, RSP_SCRATCH
+
+ /* Note: using %rsp as a scratch reg. */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
+ /* Load the top of the task stack into RSP */
+ movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
+
+ /* Start building the simulated IRET frame. */
+ pushq $__USER_DS /* pt_regs->ss */
+ pushq RSP_SCRATCH /* pt_regs->sp */
+ pushq %r11 /* pt_regs->flags */
+ pushq $__USER_CS /* pt_regs->cs */
+ pushq %rcx /* pt_regs->ip */
+
+ /*
+ * x86 lacks a near absolute jump, and we can't jump to the real
+ * entry text with a relative jump. We could push the target
+ * address and then use retq, but this destroys the pipeline on
+ * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
+ * spill RDI and restore it in a second-stage trampoline.
+ */
+ pushq %rdi
+ movq $entry_SYSCALL_64_stage2, %rdi
+ JMP_NOSPEC %rdi
+END(entry_SYSCALL_64_trampoline)
+
+ .popsection
+
+ENTRY(entry_SYSCALL_64_stage2)
+ UNWIND_HINT_EMPTY
+ popq %rdi
+ jmp entry_SYSCALL_64_after_hwframe
+END(entry_SYSCALL_64_stage2)
+
+ENTRY(entry_SYSCALL_64)
+ UNWIND_HINT_EMPTY
+ /*
+ * Interrupts are off on entry.
+ * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+ * it is too small to ever cause noticeable irq latency.
+ */
+
+ swapgs
+ /*
+ * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
+ * is not required to switch CR3.
+ */
+ movq %rsp, PER_CPU_VAR(rsp_scratch)
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+
+ /* Construct struct pt_regs on stack */
+ pushq $__USER_DS /* pt_regs->ss */
+ pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
+ pushq %r11 /* pt_regs->flags */
+ pushq $__USER_CS /* pt_regs->cs */
+ pushq %rcx /* pt_regs->ip */
+GLOBAL(entry_SYSCALL_64_after_hwframe)
+ pushq %rax /* pt_regs->orig_ax */
+
+ PUSH_AND_CLEAR_REGS rax=$-ENOSYS
+
+ TRACE_IRQS_OFF
+
+ /* IRQs are off. */
+ movq %rax, %rdi
+ movq %rsp, %rsi
+ call do_syscall_64 /* returns with IRQs disabled */
+
+ TRACE_IRQS_IRETQ /* we're about to change IF */
+
+ /*
+ * Try to use SYSRET instead of IRET if we're returning to
+ * a completely clean 64-bit userspace context. If we're not,
+ * go to the slow exit path.
+ */
+ movq RCX(%rsp), %rcx
+ movq RIP(%rsp), %r11
+
+ cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
+ jne swapgs_restore_regs_and_return_to_usermode
+
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP.
+ *
+ * If width of "canonical tail" ever becomes variable, this will need
+ * to be updated to remain correct on both old and new CPUs.
+ *
+ * Change top bits to match most significant bit (47th or 56th bit
+ * depending on paging mode) in the address.
+ */
+#ifdef CONFIG_X86_5LEVEL
+ ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
+ "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
+#else
+ shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+ sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+#endif
+
+ /* If this changed %rcx, it was not canonical */
+ cmpq %rcx, %r11
+ jne swapgs_restore_regs_and_return_to_usermode
+
+ cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
+ jne swapgs_restore_regs_and_return_to_usermode
+
+ movq R11(%rsp), %r11
+ cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
+ jne swapgs_restore_regs_and_return_to_usermode
+
+ /*
+ * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
+ * restore RF properly. If the slowpath sets it for whatever reason, we
+ * need to restore it correctly.
+ *
+ * SYSRET can restore TF, but unlike IRET, restoring TF results in a
+ * trap from userspace immediately after SYSRET. This would cause an
+ * infinite loop whenever #DB happens with register state that satisfies
+ * the opportunistic SYSRET conditions. For example, single-stepping
+ * this user code:
+ *
+ * movq $stuck_here, %rcx
+ * pushfq
+ * popq %r11
+ * stuck_here:
+ *
+ * would never get past 'stuck_here'.
+ */
+ testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
+ jnz swapgs_restore_regs_and_return_to_usermode
+
+ /* nothing to check for RSP */
+
+ cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
+ jne swapgs_restore_regs_and_return_to_usermode
+
+ /*
+ * We win! This label is here just for ease of understanding
+ * perf profiles. Nothing jumps here.
+ */
+syscall_return_via_sysret:
+ /* rcx and r11 are already restored (see code above) */
+ POP_REGS pop_rdi=0 skip_r11rcx=1
+
+ /*
+ * Now all regs are restored except RSP and RDI.
+ * Save old stack pointer and switch to trampoline stack.
+ */
+ movq %rsp, %rdi
+ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+ UNWIND_HINT_EMPTY
+
+ pushq RSP-RDI(%rdi) /* RSP */
+ pushq (%rdi) /* RDI */
+
+ /*
+ * We are on the trampoline stack. All regs except RDI are live.
+ * We can do future final exit work right here.
+ */
+ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+
+ popq %rdi
+ popq %rsp
+ USERGS_SYSRET64
+END(entry_SYSCALL_64)
+
+/*
+ * %rdi: prev task
+ * %rsi: next task
+ */
+ENTRY(__switch_to_asm)
+ UNWIND_HINT_FUNC
+ /*
+ * Save callee-saved registers
+ * This must match the order in inactive_task_frame
+ */
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushfq
+
+ /* switch stack */
+ movq %rsp, TASK_threadsp(%rdi)
+ movq TASK_threadsp(%rsi), %rsp
+
+#ifdef CONFIG_STACKPROTECTOR
+ movq TASK_stack_canary(%rsi), %rbx
+ movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset
+#endif
+
+#ifdef CONFIG_RETPOLINE
+ /*
+ * When switching from a shallower to a deeper call stack
+ * the RSB may either underflow or use entries populated
+ * with userspace addresses. On CPUs where those concerns
+ * exist, overwrite the RSB with entries which capture
+ * speculative execution to prevent attack.
+ */
+ FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+#endif
+
+ /* restore callee-saved registers */
+ popfq
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ popq %rbp
+
+ jmp __switch_to
+END(__switch_to_asm)
+
+/*
+ * A newly forked process directly context switches into this address.
+ *
+ * rax: prev task we switched from
+ * rbx: kernel thread func (NULL for user thread)
+ * r12: kernel thread arg
+ */
+ENTRY(ret_from_fork)
+ UNWIND_HINT_EMPTY
+ movq %rax, %rdi
+ call schedule_tail /* rdi: 'prev' task parameter */
+
+ testq %rbx, %rbx /* from kernel_thread? */
+ jnz 1f /* kernel threads are uncommon */
+
+2:
+ UNWIND_HINT_REGS
+ movq %rsp, %rdi
+ call syscall_return_slowpath /* returns with IRQs disabled */
+ TRACE_IRQS_ON /* user mode is traced as IRQS on */
+ jmp swapgs_restore_regs_and_return_to_usermode
+
+1:
+ /* kernel thread */
+ UNWIND_HINT_EMPTY
+ movq %r12, %rdi
+ CALL_NOSPEC %rbx
+ /*
+ * A kernel thread is allowed to return here after successfully
+ * calling do_execve(). Exit to userspace to complete the execve()
+ * syscall.
+ */
+ movq $0, RAX(%rsp)
+ jmp 2b
+END(ret_from_fork)
+
+/*
+ * Build the entry stubs with some assembler magic.
+ * We pack 1 stub into every 8-byte block.
+ */
+ .align 8
+ENTRY(irq_entries_start)
+ vector=FIRST_EXTERNAL_VECTOR
+ .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+ UNWIND_HINT_IRET_REGS
+ pushq $(~vector+0x80) /* Note: always in signed byte range */
+ jmp common_interrupt
+ .align 8
+ vector=vector+1
+ .endr
+END(irq_entries_start)
+
+ .align 8
+ENTRY(spurious_entries_start)
+ vector=FIRST_SYSTEM_VECTOR
+ .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
+ UNWIND_HINT_IRET_REGS
+ pushq $(~vector+0x80) /* Note: always in signed byte range */
+ jmp common_spurious
+ .align 8
+ vector=vector+1
+ .endr
+END(spurious_entries_start)
+
+.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
+#ifdef CONFIG_DEBUG_ENTRY
+ pushq %rax
+ SAVE_FLAGS(CLBR_RAX)
+ testl $X86_EFLAGS_IF, %eax
+ jz .Lokay_\@
+ ud2
+.Lokay_\@:
+ popq %rax
+#endif
+.endm
+
+/*
+ * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers
+ * flags and puts old RSP into old_rsp, and leaves all other GPRs alone.
+ * Requires kernel GSBASE.
+ *
+ * The invariant is that, if irq_count != -1, then the IRQ stack is in use.
+ */
+.macro ENTER_IRQ_STACK regs=1 old_rsp save_ret=0
+ DEBUG_ENTRY_ASSERT_IRQS_OFF
+
+ .if \save_ret
+ /*
+ * If save_ret is set, the original stack contains one additional
+ * entry -- the return address. Therefore, move the address one
+ * entry below %rsp to \old_rsp.
+ */
+ leaq 8(%rsp), \old_rsp
+ .else
+ movq %rsp, \old_rsp
+ .endif
+
+ .if \regs
+ UNWIND_HINT_REGS base=\old_rsp
+ .endif
+
+ incl PER_CPU_VAR(irq_count)
+ jnz .Lirq_stack_push_old_rsp_\@
+
+ /*
+ * Right now, if we just incremented irq_count to zero, we've
+ * claimed the IRQ stack but we haven't switched to it yet.
+ *
+ * If anything is added that can interrupt us here without using IST,
+ * it must be *extremely* careful to limit its stack usage. This
+ * could include kprobes and a hypothetical future IST-less #DB
+ * handler.
+ *
+ * The OOPS unwinder relies on the word at the top of the IRQ
+ * stack linking back to the previous RSP for the entire time we're
+ * on the IRQ stack. For this to work reliably, we need to write
+ * it before we actually move ourselves to the IRQ stack.
+ */
+
+ movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8)
+ movq PER_CPU_VAR(irq_stack_ptr), %rsp
+
+#ifdef CONFIG_DEBUG_ENTRY
+ /*
+ * If the first movq above becomes wrong due to IRQ stack layout
+ * changes, the only way we'll notice is if we try to unwind right
+ * here. Assert that we set up the stack right to catch this type
+ * of bug quickly.
+ */
+ cmpq -8(%rsp), \old_rsp
+ je .Lirq_stack_okay\@
+ ud2
+ .Lirq_stack_okay\@:
+#endif
+
+.Lirq_stack_push_old_rsp_\@:
+ pushq \old_rsp
+
+ .if \regs
+ UNWIND_HINT_REGS indirect=1
+ .endif
+
+ .if \save_ret
+ /*
+ * Push the return address to the stack. This return address can
+ * be found at the "real" original RSP, which was offset by 8 at
+ * the beginning of this macro.
+ */
+ pushq -8(\old_rsp)
+ .endif
+.endm
+
+/*
+ * Undoes ENTER_IRQ_STACK.
+ */
+.macro LEAVE_IRQ_STACK regs=1
+ DEBUG_ENTRY_ASSERT_IRQS_OFF
+ /* We need to be off the IRQ stack before decrementing irq_count. */
+ popq %rsp
+
+ .if \regs
+ UNWIND_HINT_REGS
+ .endif
+
+ /*
+ * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming
+ * the irq stack but we're not on it.
+ */
+
+ decl PER_CPU_VAR(irq_count)
+.endm
+
+/*
+ * Interrupt entry helper function.
+ *
+ * Entry runs with interrupts off. Stack layout at entry:
+ * +----------------------------------------------------+
+ * | regs->ss |
+ * | regs->rsp |
+ * | regs->eflags |
+ * | regs->cs |
+ * | regs->ip |
+ * +----------------------------------------------------+
+ * | regs->orig_ax = ~(interrupt number) |
+ * +----------------------------------------------------+
+ * | return address |
+ * +----------------------------------------------------+
+ */
+ENTRY(interrupt_entry)
+ UNWIND_HINT_IRET_REGS offset=16
+ ASM_CLAC
+ cld
+
+ testb $3, CS-ORIG_RAX+8(%rsp)
+ jz 1f
+ SWAPGS
+ FENCE_SWAPGS_USER_ENTRY
+ /*
+ * Switch to the thread stack. The IRET frame and orig_ax are
+ * on the stack, as well as the return address. RDI..R12 are
+ * not (yet) on the stack and space has not (yet) been
+ * allocated for them.
+ */
+ pushq %rdi
+
+ /* Need to switch before accessing the thread stack. */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+ movq %rsp, %rdi
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+
+ /*
+ * We have RDI, return address, and orig_ax on the stack on
+ * top of the IRET frame. That means offset=24
+ */
+ UNWIND_HINT_IRET_REGS base=%rdi offset=24
+
+ pushq 7*8(%rdi) /* regs->ss */
+ pushq 6*8(%rdi) /* regs->rsp */
+ pushq 5*8(%rdi) /* regs->eflags */
+ pushq 4*8(%rdi) /* regs->cs */
+ pushq 3*8(%rdi) /* regs->ip */
+ UNWIND_HINT_IRET_REGS
+ pushq 2*8(%rdi) /* regs->orig_ax */
+ pushq 8(%rdi) /* return address */
+
+ movq (%rdi), %rdi
+ jmp 2f
+1:
+ FENCE_SWAPGS_KERNEL_ENTRY
+2:
+ PUSH_AND_CLEAR_REGS save_ret=1
+ ENCODE_FRAME_POINTER 8
+
+ testb $3, CS+8(%rsp)
+ jz 1f
+
+ /*
+ * IRQ from user mode.
+ *
+ * We need to tell lockdep that IRQs are off. We can't do this until
+ * we fix gsbase, and we should do it before enter_from_user_mode
+ * (which can take locks). Since TRACE_IRQS_OFF is idempotent,
+ * the simplest way to handle it is to just call it twice if
+ * we enter from user mode. There's no reason to optimize this since
+ * TRACE_IRQS_OFF is a no-op if lockdep is off.
+ */
+ TRACE_IRQS_OFF
+
+ CALL_enter_from_user_mode
+
+1:
+ ENTER_IRQ_STACK old_rsp=%rdi save_ret=1
+ /* We entered an interrupt context - irqs are off: */
+ TRACE_IRQS_OFF
+
+ ret
+END(interrupt_entry)
+_ASM_NOKPROBE(interrupt_entry)
+
+
+/* Interrupt entry/exit. */
+
+/*
+ * The interrupt stubs push (~vector+0x80) onto the stack and
+ * then jump to common_spurious/interrupt.
+ */
+common_spurious:
+ addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
+ call interrupt_entry
+ UNWIND_HINT_REGS indirect=1
+ call smp_spurious_interrupt /* rdi points to pt_regs */
+ jmp ret_from_intr
+END(common_spurious)
+_ASM_NOKPROBE(common_spurious)
+
+/* common_interrupt is a hotpath. Align it */
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
+common_interrupt:
+ addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
+ call interrupt_entry
+ UNWIND_HINT_REGS indirect=1
+ call do_IRQ /* rdi points to pt_regs */
+ /* 0(%rsp): old RSP */
+ret_from_intr:
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_OFF
+
+ LEAVE_IRQ_STACK
+
+ testb $3, CS(%rsp)
+ jz retint_kernel
+
+ /* Interrupt came from user space */
+GLOBAL(retint_user)
+ mov %rsp,%rdi
+ call prepare_exit_to_usermode
+ TRACE_IRQS_IRETQ
+
+GLOBAL(swapgs_restore_regs_and_return_to_usermode)
+#ifdef CONFIG_DEBUG_ENTRY
+ /* Assert that pt_regs indicates user mode. */
+ testb $3, CS(%rsp)
+ jnz 1f
+ ud2
+1:
+#endif
+ POP_REGS pop_rdi=0
+
+ /*
+ * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
+ * Save old stack pointer and switch to trampoline stack.
+ */
+ movq %rsp, %rdi
+ movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+ UNWIND_HINT_EMPTY
+
+ /* Copy the IRET frame to the trampoline stack. */
+ pushq 6*8(%rdi) /* SS */
+ pushq 5*8(%rdi) /* RSP */
+ pushq 4*8(%rdi) /* EFLAGS */
+ pushq 3*8(%rdi) /* CS */
+ pushq 2*8(%rdi) /* RIP */
+
+ /* Push user RDI on the trampoline stack. */
+ pushq (%rdi)
+
+ /*
+ * We are on the trampoline stack. All regs except RDI are live.
+ * We can do future final exit work right here.
+ */
+
+ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+
+ /* Restore RDI. */
+ popq %rdi
+ SWAPGS
+ INTERRUPT_RETURN
+
+
+/* Returning to kernel space */
+retint_kernel:
+#ifdef CONFIG_PREEMPT
+ /* Interrupts are off */
+ /* Check if we need preemption */
+ btl $9, EFLAGS(%rsp) /* were interrupts off? */
+ jnc 1f
+0: cmpl $0, PER_CPU_VAR(__preempt_count)
+ jnz 1f
+ call preempt_schedule_irq
+ jmp 0b
+1:
+#endif
+ /*
+ * The iretq could re-enable interrupts:
+ */
+ TRACE_IRQS_IRETQ
+
+GLOBAL(restore_regs_and_return_to_kernel)
+#ifdef CONFIG_DEBUG_ENTRY
+ /* Assert that pt_regs indicates kernel mode. */
+ testb $3, CS(%rsp)
+ jz 1f
+ ud2
+1:
+#endif
+ POP_REGS
+ addq $8, %rsp /* skip regs->orig_ax */
+ /*
+ * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
+ * when returning from IPI handler.
+ */
+ INTERRUPT_RETURN
+
+ENTRY(native_iret)
+ UNWIND_HINT_IRET_REGS
+ /*
+ * Are we returning to a stack segment from the LDT? Note: in
+ * 64-bit mode SS:RSP on the exception stack is always valid.
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ testb $4, (SS-RIP)(%rsp)
+ jnz native_irq_return_ldt
+#endif
+
+.global native_irq_return_iret
+native_irq_return_iret:
+ /*
+ * This may fault. Non-paranoid faults on return to userspace are
+ * handled by fixup_bad_iret. These include #SS, #GP, and #NP.
+ * Double-faults due to espfix64 are handled in do_double_fault.
+ * Other faults here are fatal.
+ */
+ iretq
+
+#ifdef CONFIG_X86_ESPFIX64
+native_irq_return_ldt:
+ /*
+ * We are running with user GSBASE. All GPRs contain their user
+ * values. We have a percpu ESPFIX stack that is eight slots
+ * long (see ESPFIX_STACK_SIZE). espfix_waddr points to the bottom
+ * of the ESPFIX stack.
+ *
+ * We clobber RAX and RDI in this code. We stash RDI on the
+ * normal stack and RAX on the ESPFIX stack.
+ *
+ * The ESPFIX stack layout we set up looks like this:
+ *
+ * --- top of ESPFIX stack ---
+ * SS
+ * RSP
+ * RFLAGS
+ * CS
+ * RIP <-- RSP points here when we're done
+ * RAX <-- espfix_waddr points here
+ * --- bottom of ESPFIX stack ---
+ */
+
+ pushq %rdi /* Stash user RDI */
+ SWAPGS /* to kernel GS */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
+
+ movq PER_CPU_VAR(espfix_waddr), %rdi
+ movq %rax, (0*8)(%rdi) /* user RAX */
+ movq (1*8)(%rsp), %rax /* user RIP */
+ movq %rax, (1*8)(%rdi)
+ movq (2*8)(%rsp), %rax /* user CS */
+ movq %rax, (2*8)(%rdi)
+ movq (3*8)(%rsp), %rax /* user RFLAGS */
+ movq %rax, (3*8)(%rdi)
+ movq (5*8)(%rsp), %rax /* user SS */
+ movq %rax, (5*8)(%rdi)
+ movq (4*8)(%rsp), %rax /* user RSP */
+ movq %rax, (4*8)(%rdi)
+ /* Now RAX == RSP. */
+
+ andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */
+
+ /*
+ * espfix_stack[31:16] == 0. The page tables are set up such that
+ * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of
+ * espfix_waddr for any X. That is, there are 65536 RO aliases of
+ * the same page. Set up RSP so that RSP[31:16] contains the
+ * respective 16 bits of the /userspace/ RSP and RSP nonetheless
+ * still points to an RO alias of the ESPFIX stack.
+ */
+ orq PER_CPU_VAR(espfix_stack), %rax
+
+ SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+ SWAPGS /* to user GS */
+ popq %rdi /* Restore user RDI */
+
+ movq %rax, %rsp
+ UNWIND_HINT_IRET_REGS offset=8
+
+ /*
+ * At this point, we cannot write to the stack any more, but we can
+ * still read.
+ */
+ popq %rax /* Restore user RAX */
+
+ /*
+ * RSP now points to an ordinary IRET frame, except that the page
+ * is read-only and RSP[31:16] are preloaded with the userspace
+ * values. We can now IRET back to userspace.
+ */
+ jmp native_irq_return_iret
+#endif
+END(common_interrupt)
+_ASM_NOKPROBE(common_interrupt)
+
+/*
+ * APIC interrupts.
+ */
+.macro apicinterrupt3 num sym do_sym
+ENTRY(\sym)
+ UNWIND_HINT_IRET_REGS
+ pushq $~(\num)
+.Lcommon_\sym:
+ call interrupt_entry
+ UNWIND_HINT_REGS indirect=1
+ call \do_sym /* rdi points to pt_regs */
+ jmp ret_from_intr
+END(\sym)
+_ASM_NOKPROBE(\sym)
+.endm
+
+/* Make sure APIC interrupt handlers end up in the irqentry section: */
+#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
+#define POP_SECTION_IRQENTRY .popsection
+
+.macro apicinterrupt num sym do_sym
+PUSH_SECTION_IRQENTRY
+apicinterrupt3 \num \sym \do_sym
+POP_SECTION_IRQENTRY
+.endm
+
+#ifdef CONFIG_SMP
+apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
+apicinterrupt3 REBOOT_VECTOR reboot_interrupt smp_reboot_interrupt
+#endif
+
+#ifdef CONFIG_X86_UV
+apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt
+#endif
+
+apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt smp_apic_timer_interrupt
+apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi
+
+#ifdef CONFIG_HAVE_KVM
+apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
+apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi
+apicinterrupt3 POSTED_INTR_NESTED_VECTOR kvm_posted_intr_nested_ipi smp_kvm_posted_intr_nested_ipi
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+apicinterrupt THRESHOLD_APIC_VECTOR threshold_interrupt smp_threshold_interrupt
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+apicinterrupt DEFERRED_ERROR_VECTOR deferred_error_interrupt smp_deferred_error_interrupt
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt
+#endif
+
+#ifdef CONFIG_SMP
+apicinterrupt CALL_FUNCTION_SINGLE_VECTOR call_function_single_interrupt smp_call_function_single_interrupt
+apicinterrupt CALL_FUNCTION_VECTOR call_function_interrupt smp_call_function_interrupt
+apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt
+#endif
+
+apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt
+apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt
+
+#ifdef CONFIG_IRQ_WORK
+apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
+#endif
+
+/*
+ * Exception entry points.
+ */
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
+
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 create_gap=0
+ENTRY(\sym)
+ UNWIND_HINT_IRET_REGS offset=\has_error_code*8
+
+ /* Sanity check */
+ .if \shift_ist != -1 && \paranoid == 0
+ .error "using shift_ist requires paranoid=1"
+ .endif
+
+ ASM_CLAC
+
+ .if \has_error_code == 0
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
+ .endif
+
+ .if \paranoid == 1
+ testb $3, CS-ORIG_RAX(%rsp) /* If coming from userspace, switch stacks */
+ jnz .Lfrom_usermode_switch_stack_\@
+ .endif
+
+ .if \create_gap == 1
+ /*
+ * If coming from kernel space, create a 6-word gap to allow the
+ * int3 handler to emulate a call instruction.
+ */
+ testb $3, CS-ORIG_RAX(%rsp)
+ jnz .Lfrom_usermode_no_gap_\@
+ .rept 6
+ pushq 5*8(%rsp)
+ .endr
+ UNWIND_HINT_IRET_REGS offset=8
+.Lfrom_usermode_no_gap_\@:
+ .endif
+
+ .if \paranoid
+ call paranoid_entry
+ .else
+ call error_entry
+ .endif
+ UNWIND_HINT_REGS
+ /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
+
+ .if \paranoid
+ .if \shift_ist != -1
+ TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
+ .else
+ TRACE_IRQS_OFF
+ .endif
+ .endif
+
+ movq %rsp, %rdi /* pt_regs pointer */
+
+ .if \has_error_code
+ movq ORIG_RAX(%rsp), %rsi /* get error code */
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
+ .else
+ xorl %esi, %esi /* no error code */
+ .endif
+
+ .if \shift_ist != -1
+ subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
+ .endif
+
+ call \do_sym
+
+ .if \shift_ist != -1
+ addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
+ .endif
+
+ /* these procedures expect "no swapgs" flag in ebx */
+ .if \paranoid
+ jmp paranoid_exit
+ .else
+ jmp error_exit
+ .endif
+
+ .if \paranoid == 1
+ /*
+ * Entry from userspace. Switch stacks and treat it
+ * as a normal entry. This means that paranoid handlers
+ * run in real process context if user_mode(regs).
+ */
+.Lfrom_usermode_switch_stack_\@:
+ call error_entry
+
+ movq %rsp, %rdi /* pt_regs pointer */
+
+ .if \has_error_code
+ movq ORIG_RAX(%rsp), %rsi /* get error code */
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
+ .else
+ xorl %esi, %esi /* no error code */
+ .endif
+
+ call \do_sym
+
+ jmp error_exit
+ .endif
+_ASM_NOKPROBE(\sym)
+END(\sym)
+.endm
+
+idtentry divide_error do_divide_error has_error_code=0
+idtentry overflow do_overflow has_error_code=0
+idtentry bounds do_bounds has_error_code=0
+idtentry invalid_op do_invalid_op has_error_code=0
+idtentry device_not_available do_device_not_available has_error_code=0
+idtentry double_fault do_double_fault has_error_code=1 paranoid=2
+idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
+idtentry invalid_TSS do_invalid_TSS has_error_code=1
+idtentry segment_not_present do_segment_not_present has_error_code=1
+idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
+idtentry coprocessor_error do_coprocessor_error has_error_code=0
+idtentry alignment_check do_alignment_check has_error_code=1
+idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
+
+
+ /*
+ * Reload gs selector with exception handling
+ * edi: new selector
+ */
+ENTRY(native_load_gs_index)
+ FRAME_BEGIN
+ pushfq
+ DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
+ TRACE_IRQS_OFF
+ SWAPGS
+.Lgs_change:
+ movl %edi, %gs
+2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
+ SWAPGS
+ TRACE_IRQS_FLAGS (%rsp)
+ popfq
+ FRAME_END
+ ret
+ENDPROC(native_load_gs_index)
+EXPORT_SYMBOL(native_load_gs_index)
+
+ _ASM_EXTABLE(.Lgs_change, bad_gs)
+ .section .fixup, "ax"
+ /* running with kernelgs */
+bad_gs:
+ SWAPGS /* switch back to user gs */
+.macro ZAP_GS
+ /* This can't be a string because the preprocessor needs to see it. */
+ movl $__USER_DS, %eax
+ movl %eax, %gs
+.endm
+ ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG
+ xorl %eax, %eax
+ movl %eax, %gs
+ jmp 2b
+ .previous
+
+/* Call softirq on interrupt stack. Interrupts are off. */
+ENTRY(do_softirq_own_stack)
+ pushq %rbp
+ mov %rsp, %rbp
+ ENTER_IRQ_STACK regs=0 old_rsp=%r11
+ call __do_softirq
+ LEAVE_IRQ_STACK regs=0
+ leaveq
+ ret
+ENDPROC(do_softirq_own_stack)
+
+#ifdef CONFIG_XEN
+idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
+
+/*
+ * A note on the "critical region" in our callback handler.
+ * We want to avoid stacking callback handlers due to events occurring
+ * during handling of the last event. To do this, we keep events disabled
+ * until we've done all processing. HOWEVER, we must enable events before
+ * popping the stack frame (can't be done atomically) and so it would still
+ * be possible to get enough handler activations to overflow the stack.
+ * Although unlikely, bugs of that kind are hard to track down, so we'd
+ * like to avoid the possibility.
+ * So, on entry to the handler we detect whether we interrupted an
+ * existing activation in its critical region -- if so, we pop the current
+ * activation and restart the handler using the previous one.
+ */
+ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */
+
+/*
+ * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
+ * see the correct pointer to the pt_regs
+ */
+ UNWIND_HINT_FUNC
+ movq %rdi, %rsp /* we don't return, adjust the stack frame */
+ UNWIND_HINT_REGS
+
+ ENTER_IRQ_STACK old_rsp=%r10
+ call xen_evtchn_do_upcall
+ LEAVE_IRQ_STACK
+
+#ifndef CONFIG_PREEMPT
+ call xen_maybe_preempt_hcall
+#endif
+ jmp error_exit
+END(xen_do_hypervisor_callback)
+
+/*
+ * Hypervisor uses this for application faults while it executes.
+ * We get here for two reasons:
+ * 1. Fault while reloading DS, ES, FS or GS
+ * 2. Fault while executing IRET
+ * Category 1 we do not need to fix up as Xen has already reloaded all segment
+ * registers that could be reloaded and zeroed the others.
+ * Category 2 we fix up by killing the current process. We cannot use the
+ * normal Linux return path in this case because if we use the IRET hypercall
+ * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+ * We distinguish between categories by comparing each saved segment register
+ * with its current contents: any discrepancy means we in category 1.
+ */
+ENTRY(xen_failsafe_callback)
+ UNWIND_HINT_EMPTY
+ movl %ds, %ecx
+ cmpw %cx, 0x10(%rsp)
+ jne 1f
+ movl %es, %ecx
+ cmpw %cx, 0x18(%rsp)
+ jne 1f
+ movl %fs, %ecx
+ cmpw %cx, 0x20(%rsp)
+ jne 1f
+ movl %gs, %ecx
+ cmpw %cx, 0x28(%rsp)
+ jne 1f
+ /* All segments match their saved values => Category 2 (Bad IRET). */
+ movq (%rsp), %rcx
+ movq 8(%rsp), %r11
+ addq $0x30, %rsp
+ pushq $0 /* RIP */
+ UNWIND_HINT_IRET_REGS offset=8
+ jmp general_protection
+1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
+ movq (%rsp), %rcx
+ movq 8(%rsp), %r11
+ addq $0x30, %rsp
+ UNWIND_HINT_IRET_REGS
+ pushq $-1 /* orig_ax = -1 => not a system call */
+ PUSH_AND_CLEAR_REGS
+ ENCODE_FRAME_POINTER
+ jmp error_exit
+END(xen_failsafe_callback)
+
+apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
+ xen_hvm_callback_vector xen_evtchn_do_upcall
+
+#endif /* CONFIG_XEN */
+
+#if IS_ENABLED(CONFIG_HYPERV)
+apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
+ hyperv_callback_vector hyperv_vector_handler
+
+apicinterrupt3 HYPERV_REENLIGHTENMENT_VECTOR \
+ hyperv_reenlightenment_vector hyperv_reenlightenment_intr
+
+apicinterrupt3 HYPERV_STIMER0_VECTOR \
+ hv_stimer0_callback_vector hv_stimer0_vector_handler
+#endif /* CONFIG_HYPERV */
+
+idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
+idtentry int3 do_int3 has_error_code=0 create_gap=1
+idtentry stack_segment do_stack_segment has_error_code=1
+
+#ifdef CONFIG_XEN
+idtentry xennmi do_nmi has_error_code=0
+idtentry xendebug do_debug has_error_code=0
+#endif
+
+idtentry general_protection do_general_protection has_error_code=1
+idtentry page_fault do_page_fault has_error_code=1
+
+#ifdef CONFIG_KVM_GUEST
+idtentry async_page_fault do_async_page_fault has_error_code=1
+#endif
+
+#ifdef CONFIG_X86_MCE
+idtentry machine_check do_mce has_error_code=0 paranoid=1
+#endif
+
+/*
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Use slow, but surefire "are we in kernel?" check.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ */
+ENTRY(paranoid_entry)
+ UNWIND_HINT_FUNC
+ cld
+ PUSH_AND_CLEAR_REGS save_ret=1
+ ENCODE_FRAME_POINTER 8
+ movl $1, %ebx
+ movl $MSR_GS_BASE, %ecx
+ rdmsr
+ testl %edx, %edx
+ js 1f /* negative -> in kernel */
+ SWAPGS
+ xorl %ebx, %ebx
+
+1:
+ /*
+ * Always stash CR3 in %r14. This value will be restored,
+ * verbatim, at exit. Needed if paranoid_entry interrupted
+ * another entry that already switched to the user CR3 value
+ * but has not yet returned to userspace.
+ *
+ * This is also why CS (stashed in the "iret frame" by the
+ * hardware at entry) can not be used: this may be a return
+ * to kernel code, but with a user CR3 value.
+ */
+ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
+
+ /*
+ * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
+ * unconditional CR3 write, even in the PTI case. So do an lfence
+ * to prevent GS speculation, regardless of whether PTI is enabled.
+ */
+ FENCE_SWAPGS_KERNEL_ENTRY
+
+ ret
+END(paranoid_entry)
+
+/*
+ * "Paranoid" exit path from exception stack. This is invoked
+ * only on return from non-NMI IST interrupts that came
+ * from kernel space.
+ *
+ * We may be returning to very strange contexts (e.g. very early
+ * in syscall entry), so checking for preemption here would
+ * be complicated. Fortunately, we there's no good reason
+ * to try to handle preemption here.
+ *
+ * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
+ */
+ENTRY(paranoid_exit)
+ UNWIND_HINT_REGS
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_OFF_DEBUG
+ testl %ebx, %ebx /* swapgs needed? */
+ jnz .Lparanoid_exit_no_swapgs
+ TRACE_IRQS_IRETQ
+ /* Always restore stashed CR3 value (see paranoid_entry) */
+ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
+ SWAPGS_UNSAFE_STACK
+ jmp .Lparanoid_exit_restore
+.Lparanoid_exit_no_swapgs:
+ TRACE_IRQS_IRETQ_DEBUG
+ /* Always restore stashed CR3 value (see paranoid_entry) */
+ RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
+.Lparanoid_exit_restore:
+ jmp restore_regs_and_return_to_kernel
+END(paranoid_exit)
+
+/*
+ * Save all registers in pt_regs, and switch GS if needed.
+ */
+ENTRY(error_entry)
+ UNWIND_HINT_FUNC
+ cld
+ PUSH_AND_CLEAR_REGS save_ret=1
+ ENCODE_FRAME_POINTER 8
+ testb $3, CS+8(%rsp)
+ jz .Lerror_kernelspace
+
+ /*
+ * We entered from user mode or we're pretending to have entered
+ * from user mode due to an IRET fault.
+ */
+ SWAPGS
+ FENCE_SWAPGS_USER_ENTRY
+ /* We have user CR3. Change to kernel CR3. */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+
+.Lerror_entry_from_usermode_after_swapgs:
+ /* Put us onto the real thread stack. */
+ popq %r12 /* save return addr in %12 */
+ movq %rsp, %rdi /* arg0 = pt_regs pointer */
+ call sync_regs
+ movq %rax, %rsp /* switch stack */
+ ENCODE_FRAME_POINTER
+ pushq %r12
+
+ /*
+ * We need to tell lockdep that IRQs are off. We can't do this until
+ * we fix gsbase, and we should do it before enter_from_user_mode
+ * (which can take locks).
+ */
+ TRACE_IRQS_OFF
+ CALL_enter_from_user_mode
+ ret
+
+.Lerror_entry_done_lfence:
+ FENCE_SWAPGS_KERNEL_ENTRY
+.Lerror_entry_done:
+ TRACE_IRQS_OFF
+ ret
+
+ /*
+ * There are two places in the kernel that can potentially fault with
+ * usergs. Handle them here. B stepping K8s sometimes report a
+ * truncated RIP for IRET exceptions returning to compat mode. Check
+ * for these here too.
+ */
+.Lerror_kernelspace:
+ leaq native_irq_return_iret(%rip), %rcx
+ cmpq %rcx, RIP+8(%rsp)
+ je .Lerror_bad_iret
+ movl %ecx, %eax /* zero extend */
+ cmpq %rax, RIP+8(%rsp)
+ je .Lbstep_iret
+ cmpq $.Lgs_change, RIP+8(%rsp)
+ jne .Lerror_entry_done_lfence
+
+ /*
+ * hack: .Lgs_change can fail with user gsbase. If this happens, fix up
+ * gsbase and proceed. We'll fix up the exception and land in
+ * .Lgs_change's error handler with kernel gsbase.
+ */
+ SWAPGS
+ FENCE_SWAPGS_USER_ENTRY
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+ jmp .Lerror_entry_done
+
+.Lbstep_iret:
+ /* Fix truncated RIP */
+ movq %rcx, RIP+8(%rsp)
+ /* fall through */
+
+.Lerror_bad_iret:
+ /*
+ * We came from an IRET to user mode, so we have user
+ * gsbase and CR3. Switch to kernel gsbase and CR3:
+ */
+ SWAPGS
+ FENCE_SWAPGS_USER_ENTRY
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+
+ /*
+ * Pretend that the exception came from user mode: set up pt_regs
+ * as if we faulted immediately after IRET.
+ */
+ mov %rsp, %rdi
+ call fixup_bad_iret
+ mov %rax, %rsp
+ jmp .Lerror_entry_from_usermode_after_swapgs
+END(error_entry)
+
+ENTRY(error_exit)
+ UNWIND_HINT_REGS
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_OFF
+ testb $3, CS(%rsp)
+ jz retint_kernel
+ jmp retint_user
+END(error_exit)
+
+/*
+ * Runs on exception stack. Xen PV does not go through this path at all,
+ * so we can use real assembly here.
+ *
+ * Registers:
+ * %r14: Used to save/restore the CR3 of the interrupted context
+ * when PAGE_TABLE_ISOLATION is in use. Do not clobber.
+ */
+ENTRY(nmi)
+ UNWIND_HINT_IRET_REGS
+
+ /*
+ * We allow breakpoints in NMIs. If a breakpoint occurs, then
+ * the iretq it performs will take us out of NMI context.
+ * This means that we can have nested NMIs where the next
+ * NMI is using the top of the stack of the previous NMI. We
+ * can't let it execute because the nested NMI will corrupt the
+ * stack of the previous NMI. NMI handlers are not re-entrant
+ * anyway.
+ *
+ * To handle this case we do the following:
+ * Check the a special location on the stack that contains
+ * a variable that is set when NMIs are executing.
+ * The interrupted task's stack is also checked to see if it
+ * is an NMI stack.
+ * If the variable is not set and the stack is not the NMI
+ * stack then:
+ * o Set the special variable on the stack
+ * o Copy the interrupt frame into an "outermost" location on the
+ * stack
+ * o Copy the interrupt frame into an "iret" location on the stack
+ * o Continue processing the NMI
+ * If the variable is set or the previous stack is the NMI stack:
+ * o Modify the "iret" location to jump to the repeat_nmi
+ * o return back to the first NMI
+ *
+ * Now on exit of the first NMI, we first clear the stack variable
+ * The NMI stack will tell any nested NMIs at that point that it is
+ * nested. Then we pop the stack normally with iret, and if there was
+ * a nested NMI that updated the copy interrupt stack frame, a
+ * jump will be made to the repeat_nmi code that will handle the second
+ * NMI.
+ *
+ * However, espfix prevents us from directly returning to userspace
+ * with a single IRET instruction. Similarly, IRET to user mode
+ * can fault. We therefore handle NMIs from user space like
+ * other IST entries.
+ */
+
+ ASM_CLAC
+
+ /* Use %rdx as our temp variable throughout */
+ pushq %rdx
+
+ testb $3, CS-RIP+8(%rsp)
+ jz .Lnmi_from_kernel
+
+ /*
+ * NMI from user mode. We need to run on the thread stack, but we
+ * can't go through the normal entry paths: NMIs are masked, and
+ * we don't want to enable interrupts, because then we'll end
+ * up in an awkward situation in which IRQs are on but NMIs
+ * are off.
+ *
+ * We also must not push anything to the stack before switching
+ * stacks lest we corrupt the "NMI executing" variable.
+ */
+
+ swapgs
+ cld
+ FENCE_SWAPGS_USER_ENTRY
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
+ movq %rsp, %rdx
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ UNWIND_HINT_IRET_REGS base=%rdx offset=8
+ pushq 5*8(%rdx) /* pt_regs->ss */
+ pushq 4*8(%rdx) /* pt_regs->rsp */
+ pushq 3*8(%rdx) /* pt_regs->flags */
+ pushq 2*8(%rdx) /* pt_regs->cs */
+ pushq 1*8(%rdx) /* pt_regs->rip */
+ UNWIND_HINT_IRET_REGS
+ pushq $-1 /* pt_regs->orig_ax */
+ PUSH_AND_CLEAR_REGS rdx=(%rdx)
+ ENCODE_FRAME_POINTER
+
+ /*
+ * At this point we no longer need to worry about stack damage
+ * due to nesting -- we're on the normal thread stack and we're
+ * done with the NMI stack.
+ */
+
+ movq %rsp, %rdi
+ movq $-1, %rsi
+ call do_nmi
+
+ /*
+ * Return back to user mode. We must *not* do the normal exit
+ * work, because we don't want to enable interrupts.
+ */
+ jmp swapgs_restore_regs_and_return_to_usermode
+
+.Lnmi_from_kernel:
+ /*
+ * Here's what our stack frame will look like:
+ * +---------------------------------------------------------+
+ * | original SS |
+ * | original Return RSP |
+ * | original RFLAGS |
+ * | original CS |
+ * | original RIP |
+ * +---------------------------------------------------------+
+ * | temp storage for rdx |
+ * +---------------------------------------------------------+
+ * | "NMI executing" variable |
+ * +---------------------------------------------------------+
+ * | iret SS } Copied from "outermost" frame |
+ * | iret Return RSP } on each loop iteration; overwritten |
+ * | iret RFLAGS } by a nested NMI to force another |
+ * | iret CS } iteration if needed. |
+ * | iret RIP } |
+ * +---------------------------------------------------------+
+ * | outermost SS } initialized in first_nmi; |
+ * | outermost Return RSP } will not be changed before |
+ * | outermost RFLAGS } NMI processing is done. |
+ * | outermost CS } Copied to "iret" frame on each |
+ * | outermost RIP } iteration. |
+ * +---------------------------------------------------------+
+ * | pt_regs |
+ * +---------------------------------------------------------+
+ *
+ * The "original" frame is used by hardware. Before re-enabling
+ * NMIs, we need to be done with it, and we need to leave enough
+ * space for the asm code here.
+ *
+ * We return by executing IRET while RSP points to the "iret" frame.
+ * That will either return for real or it will loop back into NMI
+ * processing.
+ *
+ * The "outermost" frame is copied to the "iret" frame on each
+ * iteration of the loop, so each iteration starts with the "iret"
+ * frame pointing to the final return target.
+ */
+
+ /*
+ * Determine whether we're a nested NMI.
+ *
+ * If we interrupted kernel code between repeat_nmi and
+ * end_repeat_nmi, then we are a nested NMI. We must not
+ * modify the "iret" frame because it's being written by
+ * the outer NMI. That's okay; the outer NMI handler is
+ * about to about to call do_nmi anyway, so we can just
+ * resume the outer NMI.
+ */
+
+ movq $repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja 1f
+ movq $end_repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja nested_nmi_out
+1:
+
+ /*
+ * Now check "NMI executing". If it's set, then we're nested.
+ * This will not detect if we interrupted an outer NMI just
+ * before IRET.
+ */
+ cmpl $1, -8(%rsp)
+ je nested_nmi
+
+ /*
+ * Now test if the previous stack was an NMI stack. This covers
+ * the case where we interrupt an outer NMI after it clears
+ * "NMI executing" but before IRET. We need to be careful, though:
+ * there is one case in which RSP could point to the NMI stack
+ * despite there being no NMI active: naughty userspace controls
+ * RSP at the very beginning of the SYSCALL targets. We can
+ * pull a fast one on naughty userspace, though: we program
+ * SYSCALL to mask DF, so userspace cannot cause DF to be set
+ * if it controls the kernel's RSP. We set DF before we clear
+ * "NMI executing".
+ */
+ lea 6*8(%rsp), %rdx
+ /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
+ cmpq %rdx, 4*8(%rsp)
+ /* If the stack pointer is above the NMI stack, this is a normal NMI */
+ ja first_nmi
+
+ subq $EXCEPTION_STKSZ, %rdx
+ cmpq %rdx, 4*8(%rsp)
+ /* If it is below the NMI stack, it is a normal NMI */
+ jb first_nmi
+
+ /* Ah, it is within the NMI stack. */
+
+ testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
+ jz first_nmi /* RSP was user controlled. */
+
+ /* This is a nested NMI. */
+
+nested_nmi:
+ /*
+ * Modify the "iret" frame to point to repeat_nmi, forcing another
+ * iteration of NMI handling.
+ */
+ subq $8, %rsp
+ leaq -10*8(%rsp), %rdx
+ pushq $__KERNEL_DS
+ pushq %rdx
+ pushfq
+ pushq $__KERNEL_CS
+ pushq $repeat_nmi
+
+ /* Put stack back */
+ addq $(6*8), %rsp
+
+nested_nmi_out:
+ popq %rdx
+
+ /* We are returning to kernel mode, so this cannot result in a fault. */
+ iretq
+
+first_nmi:
+ /* Restore rdx. */
+ movq (%rsp), %rdx
+
+ /* Make room for "NMI executing". */
+ pushq $0
+
+ /* Leave room for the "iret" frame */
+ subq $(5*8), %rsp
+
+ /* Copy the "original" frame to the "outermost" frame */
+ .rept 5
+ pushq 11*8(%rsp)
+ .endr
+ UNWIND_HINT_IRET_REGS
+
+ /* Everything up to here is safe from nested NMIs */
+
+#ifdef CONFIG_DEBUG_ENTRY
+ /*
+ * For ease of testing, unmask NMIs right away. Disabled by
+ * default because IRET is very expensive.
+ */
+ pushq $0 /* SS */
+ pushq %rsp /* RSP (minus 8 because of the previous push) */
+ addq $8, (%rsp) /* Fix up RSP */
+ pushfq /* RFLAGS */
+ pushq $__KERNEL_CS /* CS */
+ pushq $1f /* RIP */
+ iretq /* continues at repeat_nmi below */
+ UNWIND_HINT_IRET_REGS
+1:
+#endif
+
+repeat_nmi:
+ /*
+ * If there was a nested NMI, the first NMI's iret will return
+ * here. But NMIs are still enabled and we can take another
+ * nested NMI. The nested NMI checks the interrupted RIP to see
+ * if it is between repeat_nmi and end_repeat_nmi, and if so
+ * it will just return, as we are about to repeat an NMI anyway.
+ * This makes it safe to copy to the stack frame that a nested
+ * NMI will update.
+ *
+ * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
+ * we're repeating an NMI, gsbase has the same value that it had on
+ * the first iteration. paranoid_entry will load the kernel
+ * gsbase if needed before we call do_nmi. "NMI executing"
+ * is zero.
+ */
+ movq $1, 10*8(%rsp) /* Set "NMI executing". */
+
+ /*
+ * Copy the "outermost" frame to the "iret" frame. NMIs that nest
+ * here must not modify the "iret" frame while we're writing to
+ * it or it will end up containing garbage.
+ */
+ addq $(10*8), %rsp
+ .rept 5
+ pushq -6*8(%rsp)
+ .endr
+ subq $(5*8), %rsp
+end_repeat_nmi:
+
+ /*
+ * Everything below this point can be preempted by a nested NMI.
+ * If this happens, then the inner NMI will change the "iret"
+ * frame to point back to repeat_nmi.
+ */
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
+
+ /*
+ * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
+ * as we should not be calling schedule in NMI context.
+ * Even with normal interrupts enabled. An NMI should not be
+ * setting NEED_RESCHED or anything that normal interrupts and
+ * exceptions might do.
+ */
+ call paranoid_entry
+ UNWIND_HINT_REGS
+
+ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
+ movq %rsp, %rdi
+ movq $-1, %rsi
+ call do_nmi
+
+ /* Always restore stashed CR3 value (see paranoid_entry) */
+ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
+
+ testl %ebx, %ebx /* swapgs needed? */
+ jnz nmi_restore
+nmi_swapgs:
+ SWAPGS_UNSAFE_STACK
+nmi_restore:
+ POP_REGS
+
+ /*
+ * Skip orig_ax and the "outermost" frame to point RSP at the "iret"
+ * at the "iret" frame.
+ */
+ addq $6*8, %rsp
+
+ /*
+ * Clear "NMI executing". Set DF first so that we can easily
+ * distinguish the remaining code between here and IRET from
+ * the SYSCALL entry and exit paths.
+ *
+ * We arguably should just inspect RIP instead, but I (Andy) wrote
+ * this code when I had the misapprehension that Xen PV supported
+ * NMIs, and Xen PV would break that approach.
+ */
+ std
+ movq $0, 5*8(%rsp) /* clear "NMI executing" */
+
+ /*
+ * iretq reads the "iret" frame and exits the NMI stack in a
+ * single instruction. We are returning to kernel mode, so this
+ * cannot result in a fault. Similarly, we don't need to worry
+ * about espfix64 on the way back to kernel mode.
+ */
+ iretq
+END(nmi)
+
+ENTRY(ignore_sysret)
+ UNWIND_HINT_EMPTY
+ mov $-ENOSYS, %eax
+ sysret
+END(ignore_sysret)
+
+ENTRY(rewind_stack_do_exit)
+ UNWIND_HINT_FUNC
+ /* Prevent any naive code from trying to unwind to our caller. */
+ xorl %ebp, %ebp
+
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
+ leaq -PTREGS_SIZE(%rax), %rsp
+ UNWIND_HINT_REGS
+
+ call do_exit
+END(rewind_stack_do_exit)
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
new file mode 100644
index 000000000..40d2834a8
--- /dev/null
+++ b/arch/x86/entry/entry_64_compat.S
@@ -0,0 +1,414 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Compatibility mode system call entry point for x86-64.
+ *
+ * Copyright 2000-2002 Andi Kleen, SuSE Labs.
+ */
+#include "calling.h"
+#include <asm/asm-offsets.h>
+#include <asm/current.h>
+#include <asm/errno.h>
+#include <asm/ia32_unistd.h>
+#include <asm/thread_info.h>
+#include <asm/segment.h>
+#include <asm/irqflags.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+#include <linux/linkage.h>
+#include <linux/err.h>
+
+ .section .entry.text, "ax"
+
+/*
+ * 32-bit SYSENTER entry.
+ *
+ * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
+ * on 64-bit kernels running on Intel CPUs.
+ *
+ * The SYSENTER instruction, in principle, should *only* occur in the
+ * vDSO. In practice, a small number of Android devices were shipped
+ * with a copy of Bionic that inlined a SYSENTER instruction. This
+ * never happened in any of Google's Bionic versions -- it only happened
+ * in a narrow range of Intel-provided versions.
+ *
+ * SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs.
+ * IF and VM in RFLAGS are cleared (IOW: interrupts are off).
+ * SYSENTER does not save anything on the stack,
+ * and does not save old RIP (!!!), RSP, or RFLAGS.
+ *
+ * Arguments:
+ * eax system call number
+ * ebx arg1
+ * ecx arg2
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * ebp user stack
+ * 0(%ebp) arg6
+ */
+ENTRY(entry_SYSENTER_compat)
+ /* Interrupts are off on entry. */
+ SWAPGS
+
+ /* We are about to clobber %rsp anyway, clobbering here is OK */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+
+ /*
+ * User tracing code (ptrace or signal handlers) might assume that
+ * the saved RAX contains a 32-bit number when we're invoking a 32-bit
+ * syscall. Just in case the high bits are nonzero, zero-extend
+ * the syscall number. (This could almost certainly be deleted
+ * with no ill effects.)
+ */
+ movl %eax, %eax
+
+ /* Construct struct pt_regs on stack */
+ pushq $__USER32_DS /* pt_regs->ss */
+ pushq %rbp /* pt_regs->sp (stashed in bp) */
+
+ /*
+ * Push flags. This is nasty. First, interrupts are currently
+ * off, but we need pt_regs->flags to have IF set. Second, even
+ * if TF was set when SYSENTER started, it's clear by now. We fix
+ * that later using TIF_SINGLESTEP.
+ */
+ pushfq /* pt_regs->flags (except IF = 0) */
+ orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */
+ pushq $__USER32_CS /* pt_regs->cs */
+ pushq $0 /* pt_regs->ip = 0 (placeholder) */
+ pushq %rax /* pt_regs->orig_ax */
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ pushq %rdx /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq $-ENOSYS /* pt_regs->ax */
+ pushq $0 /* pt_regs->r8 = 0 */
+ xorl %r8d, %r8d /* nospec r8 */
+ pushq $0 /* pt_regs->r9 = 0 */
+ xorl %r9d, %r9d /* nospec r9 */
+ pushq $0 /* pt_regs->r10 = 0 */
+ xorl %r10d, %r10d /* nospec r10 */
+ pushq $0 /* pt_regs->r11 = 0 */
+ xorl %r11d, %r11d /* nospec r11 */
+ pushq %rbx /* pt_regs->rbx */
+ xorl %ebx, %ebx /* nospec rbx */
+ pushq %rbp /* pt_regs->rbp (will be overwritten) */
+ xorl %ebp, %ebp /* nospec rbp */
+ pushq $0 /* pt_regs->r12 = 0 */
+ xorl %r12d, %r12d /* nospec r12 */
+ pushq $0 /* pt_regs->r13 = 0 */
+ xorl %r13d, %r13d /* nospec r13 */
+ pushq $0 /* pt_regs->r14 = 0 */
+ xorl %r14d, %r14d /* nospec r14 */
+ pushq $0 /* pt_regs->r15 = 0 */
+ xorl %r15d, %r15d /* nospec r15 */
+ cld
+
+ /*
+ * SYSENTER doesn't filter flags, so we need to clear NT and AC
+ * ourselves. To save a few cycles, we can check whether
+ * either was set instead of doing an unconditional popfq.
+ * This needs to happen before enabling interrupts so that
+ * we don't get preempted with NT set.
+ *
+ * If TF is set, we will single-step all the way to here -- do_debug
+ * will ignore all the traps. (Yes, this is slow, but so is
+ * single-stepping in general. This allows us to avoid having
+ * a more complicated code to handle the case where a user program
+ * forces us to single-step through the SYSENTER entry code.)
+ *
+ * NB.: .Lsysenter_fix_flags is a label with the code under it moved
+ * out-of-line as an optimization: NT is unlikely to be set in the
+ * majority of the cases and instead of polluting the I$ unnecessarily,
+ * we're keeping that code behind a branch which will predict as
+ * not-taken and therefore its instructions won't be fetched.
+ */
+ testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp)
+ jnz .Lsysenter_fix_flags
+.Lsysenter_flags_fixed:
+
+ /*
+ * User mode is traced as though IRQs are on, and SYSENTER
+ * turned them off.
+ */
+ TRACE_IRQS_OFF
+
+ movq %rsp, %rdi
+ call do_fast_syscall_32
+ /* XEN PV guests always use IRET path */
+ ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
+ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
+ jmp sysret32_from_system_call
+
+.Lsysenter_fix_flags:
+ pushq $X86_EFLAGS_FIXED
+ popfq
+ jmp .Lsysenter_flags_fixed
+GLOBAL(__end_entry_SYSENTER_compat)
+ENDPROC(entry_SYSENTER_compat)
+
+/*
+ * 32-bit SYSCALL entry.
+ *
+ * 32-bit system calls through the vDSO's __kernel_vsyscall enter here
+ * on 64-bit kernels running on AMD CPUs.
+ *
+ * The SYSCALL instruction, in principle, should *only* occur in the
+ * vDSO. In practice, it appears that this really is the case.
+ * As evidence:
+ *
+ * - The calling convention for SYSCALL has changed several times without
+ * anyone noticing.
+ *
+ * - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, anything
+ * user task that did SYSCALL without immediately reloading SS
+ * would randomly crash.
+ *
+ * - Most programmers do not directly target AMD CPUs, and the 32-bit
+ * SYSCALL instruction does not exist on Intel CPUs. Even on AMD
+ * CPUs, Linux disables the SYSCALL instruction on 32-bit kernels
+ * because the SYSCALL instruction in legacy/native 32-bit mode (as
+ * opposed to compat mode) is sufficiently poorly designed as to be
+ * essentially unusable.
+ *
+ * 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves
+ * RFLAGS to R11, then loads new SS, CS, and RIP from previously
+ * programmed MSRs. RFLAGS gets masked by a value from another MSR
+ * (so CLD and CLAC are not needed). SYSCALL does not save anything on
+ * the stack and does not change RSP.
+ *
+ * Note: RFLAGS saving+masking-with-MSR happens only in Long mode
+ * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
+ * Don't get confused: RFLAGS saving+masking depends on Long Mode Active bit
+ * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
+ * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
+ *
+ * Arguments:
+ * eax system call number
+ * ecx return address
+ * ebx arg1
+ * ebp arg2 (note: not saved in the stack frame, should not be touched)
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * esp user stack
+ * 0(%esp) arg6
+ */
+ENTRY(entry_SYSCALL_compat)
+ /* Interrupts are off on entry. */
+ swapgs
+
+ /* Stash user ESP */
+ movl %esp, %r8d
+
+ /* Use %rsp as scratch reg. User ESP is stashed in r8 */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
+ /* Switch to the kernel stack */
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+
+ /* Construct struct pt_regs on stack */
+ pushq $__USER32_DS /* pt_regs->ss */
+ pushq %r8 /* pt_regs->sp */
+ pushq %r11 /* pt_regs->flags */
+ pushq $__USER32_CS /* pt_regs->cs */
+ pushq %rcx /* pt_regs->ip */
+GLOBAL(entry_SYSCALL_compat_after_hwframe)
+ movl %eax, %eax /* discard orig_ax high bits */
+ pushq %rax /* pt_regs->orig_ax */
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ xorl %esi, %esi /* nospec si */
+ pushq %rdx /* pt_regs->dx */
+ xorl %edx, %edx /* nospec dx */
+ pushq %rbp /* pt_regs->cx (stashed in bp) */
+ xorl %ecx, %ecx /* nospec cx */
+ pushq $-ENOSYS /* pt_regs->ax */
+ pushq $0 /* pt_regs->r8 = 0 */
+ xorl %r8d, %r8d /* nospec r8 */
+ pushq $0 /* pt_regs->r9 = 0 */
+ xorl %r9d, %r9d /* nospec r9 */
+ pushq $0 /* pt_regs->r10 = 0 */
+ xorl %r10d, %r10d /* nospec r10 */
+ pushq $0 /* pt_regs->r11 = 0 */
+ xorl %r11d, %r11d /* nospec r11 */
+ pushq %rbx /* pt_regs->rbx */
+ xorl %ebx, %ebx /* nospec rbx */
+ pushq %rbp /* pt_regs->rbp (will be overwritten) */
+ xorl %ebp, %ebp /* nospec rbp */
+ pushq $0 /* pt_regs->r12 = 0 */
+ xorl %r12d, %r12d /* nospec r12 */
+ pushq $0 /* pt_regs->r13 = 0 */
+ xorl %r13d, %r13d /* nospec r13 */
+ pushq $0 /* pt_regs->r14 = 0 */
+ xorl %r14d, %r14d /* nospec r14 */
+ pushq $0 /* pt_regs->r15 = 0 */
+ xorl %r15d, %r15d /* nospec r15 */
+
+ /*
+ * User mode is traced as though IRQs are on, and SYSENTER
+ * turned them off.
+ */
+ TRACE_IRQS_OFF
+
+ movq %rsp, %rdi
+ call do_fast_syscall_32
+ /* XEN PV guests always use IRET path */
+ ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
+ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
+
+ /* Opportunistic SYSRET */
+sysret32_from_system_call:
+ TRACE_IRQS_ON /* User mode traces as IRQs on. */
+ movq RBX(%rsp), %rbx /* pt_regs->rbx */
+ movq RBP(%rsp), %rbp /* pt_regs->rbp */
+ movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */
+ movq RIP(%rsp), %rcx /* pt_regs->ip (in rcx) */
+ addq $RAX, %rsp /* Skip r8-r15 */
+ popq %rax /* pt_regs->rax */
+ popq %rdx /* Skip pt_regs->cx */
+ popq %rdx /* pt_regs->dx */
+ popq %rsi /* pt_regs->si */
+ popq %rdi /* pt_regs->di */
+
+ /*
+ * USERGS_SYSRET32 does:
+ * GSBASE = user's GS base
+ * EIP = ECX
+ * RFLAGS = R11
+ * CS = __USER32_CS
+ * SS = __USER_DS
+ *
+ * ECX will not match pt_regs->cx, but we're returning to a vDSO
+ * trampoline that will fix up RCX, so this is okay.
+ *
+ * R12-R15 are callee-saved, so they contain whatever was in them
+ * when the system call started, which is already known to user
+ * code. We zero R8-R10 to avoid info leaks.
+ */
+ movq RSP-ORIG_RAX(%rsp), %rsp
+
+ /*
+ * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
+ * on the process stack which is not mapped to userspace and
+ * not readable after we SWITCH_TO_USER_CR3. Delay the CR3
+ * switch until after after the last reference to the process
+ * stack.
+ *
+ * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
+ */
+ SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
+
+ xorl %r8d, %r8d
+ xorl %r9d, %r9d
+ xorl %r10d, %r10d
+ swapgs
+ sysretl
+END(entry_SYSCALL_compat)
+
+/*
+ * 32-bit legacy system call entry.
+ *
+ * 32-bit x86 Linux system calls traditionally used the INT $0x80
+ * instruction. INT $0x80 lands here.
+ *
+ * This entry point can be used by 32-bit and 64-bit programs to perform
+ * 32-bit system calls. Instances of INT $0x80 can be found inline in
+ * various programs and libraries. It is also used by the vDSO's
+ * __kernel_vsyscall fallback for hardware that doesn't support a faster
+ * entry method. Restarted 32-bit system calls also fall back to INT
+ * $0x80 regardless of what instruction was originally used to do the
+ * system call.
+ *
+ * This is considered a slow path. It is not used by most libc
+ * implementations on modern hardware except during process startup.
+ *
+ * Arguments:
+ * eax system call number
+ * ebx arg1
+ * ecx arg2
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * ebp arg6
+ */
+ENTRY(entry_INT80_compat)
+ /*
+ * Interrupts are off on entry.
+ */
+ ASM_CLAC /* Do this early to minimize exposure */
+ SWAPGS
+
+ /*
+ * User tracing code (ptrace or signal handlers) might assume that
+ * the saved RAX contains a 32-bit number when we're invoking a 32-bit
+ * syscall. Just in case the high bits are nonzero, zero-extend
+ * the syscall number. (This could almost certainly be deleted
+ * with no ill effects.)
+ */
+ movl %eax, %eax
+
+ /* switch to thread stack expects orig_ax and rdi to be pushed */
+ pushq %rax /* pt_regs->orig_ax */
+ pushq %rdi /* pt_regs->di */
+
+ /* Need to switch before accessing the thread stack. */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+ /* In the Xen PV case we already run on the thread stack. */
+ ALTERNATIVE "movq %rsp, %rdi", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+
+ pushq 6*8(%rdi) /* regs->ss */
+ pushq 5*8(%rdi) /* regs->rsp */
+ pushq 4*8(%rdi) /* regs->eflags */
+ pushq 3*8(%rdi) /* regs->cs */
+ pushq 2*8(%rdi) /* regs->ip */
+ pushq 1*8(%rdi) /* regs->orig_ax */
+ pushq (%rdi) /* pt_regs->di */
+.Lint80_keep_stack:
+
+ pushq %rsi /* pt_regs->si */
+ xorl %esi, %esi /* nospec si */
+ pushq %rdx /* pt_regs->dx */
+ xorl %edx, %edx /* nospec dx */
+ pushq %rcx /* pt_regs->cx */
+ xorl %ecx, %ecx /* nospec cx */
+ pushq $-ENOSYS /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 */
+ xorl %r8d, %r8d /* nospec r8 */
+ pushq %r9 /* pt_regs->r9 */
+ xorl %r9d, %r9d /* nospec r9 */
+ pushq %r10 /* pt_regs->r10*/
+ xorl %r10d, %r10d /* nospec r10 */
+ pushq %r11 /* pt_regs->r11 */
+ xorl %r11d, %r11d /* nospec r11 */
+ pushq %rbx /* pt_regs->rbx */
+ xorl %ebx, %ebx /* nospec rbx */
+ pushq %rbp /* pt_regs->rbp */
+ xorl %ebp, %ebp /* nospec rbp */
+ pushq %r12 /* pt_regs->r12 */
+ xorl %r12d, %r12d /* nospec r12 */
+ pushq %r13 /* pt_regs->r13 */
+ xorl %r13d, %r13d /* nospec r13 */
+ pushq %r14 /* pt_regs->r14 */
+ xorl %r14d, %r14d /* nospec r14 */
+ pushq %r15 /* pt_regs->r15 */
+ xorl %r15d, %r15d /* nospec r15 */
+ cld
+
+ /*
+ * User mode is traced as though IRQs are on, and the interrupt
+ * gate turned them off.
+ */
+ TRACE_IRQS_OFF
+
+ movq %rsp, %rdi
+ call do_int80_syscall_32
+.Lsyscall_32_done:
+
+ /* Go back to user mode. */
+ TRACE_IRQS_ON
+ jmp swapgs_restore_regs_and_return_to_usermode
+END(entry_INT80_compat)
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
new file mode 100644
index 000000000..aa3336a7c
--- /dev/null
+++ b/arch/x86/entry/syscall_32.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+/* System call table for i386. */
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <asm/asm-offsets.h>
+#include <asm/syscall.h>
+
+#ifdef CONFIG_IA32_EMULATION
+/* On X86_64, we use struct pt_regs * to pass parameters to syscalls */
+#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *);
+
+/* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */
+extern asmlinkage long sys_ni_syscall(const struct pt_regs *);
+
+#else /* CONFIG_IA32_EMULATION */
+#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
+extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
+#endif /* CONFIG_IA32_EMULATION */
+
+#include <asm/syscalls_32.h>
+#undef __SYSCALL_I386
+
+#define __SYSCALL_I386(nr, sym, qual) [nr] = sym,
+
+__visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = {
+ /*
+ * Smells like a compiler bug -- it doesn't work
+ * when the & below is removed.
+ */
+ [0 ... __NR_syscall_compat_max] = &sys_ni_syscall,
+#include <asm/syscalls_32.h>
+};
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
new file mode 100644
index 000000000..d5252bc1e
--- /dev/null
+++ b/arch/x86/entry/syscall_64.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+/* System call table for x86-64. */
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <asm/asm-offsets.h>
+#include <asm/syscall.h>
+
+/* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */
+extern asmlinkage long sys_ni_syscall(const struct pt_regs *);
+#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *);
+#include <asm/syscalls_64.h>
+#undef __SYSCALL_64
+
+#define __SYSCALL_64(nr, sym, qual) [nr] = sym,
+
+asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
+ /*
+ * Smells like a compiler bug -- it doesn't work
+ * when the & below is removed.
+ */
+ [0 ... __NR_syscall_max] = &sys_ni_syscall,
+#include <asm/syscalls_64.h>
+};
diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile
new file mode 100644
index 000000000..6fb9b57ed
--- /dev/null
+++ b/arch/x86/entry/syscalls/Makefile
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: GPL-2.0
+out := arch/$(SRCARCH)/include/generated/asm
+uapi := arch/$(SRCARCH)/include/generated/uapi/asm
+
+# Create output directory if not already present
+_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \
+ $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)')
+
+syscall32 := $(srctree)/$(src)/syscall_32.tbl
+syscall64 := $(srctree)/$(src)/syscall_64.tbl
+
+syshdr := $(srctree)/$(src)/syscallhdr.sh
+systbl := $(srctree)/$(src)/syscalltbl.sh
+
+quiet_cmd_syshdr = SYSHDR $@
+ cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \
+ '$(syshdr_abi_$(basetarget))' \
+ '$(syshdr_pfx_$(basetarget))' \
+ '$(syshdr_offset_$(basetarget))'
+quiet_cmd_systbl = SYSTBL $@
+ cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@
+
+quiet_cmd_hypercalls = HYPERCALLS $@
+ cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<,$^)
+
+syshdr_abi_unistd_32 := i386
+$(uapi)/unistd_32.h: $(syscall32) $(syshdr)
+ $(call if_changed,syshdr)
+
+syshdr_abi_unistd_32_ia32 := i386
+syshdr_pfx_unistd_32_ia32 := ia32_
+$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr)
+ $(call if_changed,syshdr)
+
+syshdr_abi_unistd_x32 := common,x32
+syshdr_offset_unistd_x32 := __X32_SYSCALL_BIT
+$(uapi)/unistd_x32.h: $(syscall64) $(syshdr)
+ $(call if_changed,syshdr)
+
+syshdr_abi_unistd_64 := common,64
+$(uapi)/unistd_64.h: $(syscall64) $(syshdr)
+ $(call if_changed,syshdr)
+
+syshdr_abi_unistd_64_x32 := x32
+syshdr_pfx_unistd_64_x32 := x32_
+$(out)/unistd_64_x32.h: $(syscall64) $(syshdr)
+ $(call if_changed,syshdr)
+
+$(out)/syscalls_32.h: $(syscall32) $(systbl)
+ $(call if_changed,systbl)
+$(out)/syscalls_64.h: $(syscall64) $(systbl)
+ $(call if_changed,systbl)
+
+$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh
+ $(call if_changed,hypercalls)
+
+$(out)/xen-hypercalls.h: $(srctree)/include/xen/interface/xen*.h
+
+uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h
+syshdr-y += syscalls_32.h
+syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h
+syshdr-$(CONFIG_X86_64) += syscalls_64.h
+syshdr-$(CONFIG_XEN) += xen-hypercalls.h
+
+targets += $(uapisyshdr-y) $(syshdr-y)
+
+PHONY += all
+all: $(addprefix $(uapi)/,$(uapisyshdr-y))
+all: $(addprefix $(out)/,$(syshdr-y))
+ @:
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
new file mode 100644
index 000000000..3cf7b533b
--- /dev/null
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -0,0 +1,400 @@
+#
+# 32-bit system call numbers and entry vectors
+#
+# The format is:
+# <number> <abi> <name> <entry point> <compat entry point>
+#
+# The __ia32_sys and __ia32_compat_sys stubs are created on-the-fly for
+# sys_*() system calls and compat_sys_*() compat system calls if
+# IA32_EMULATION is defined, and expect struct pt_regs *regs as their only
+# parameter.
+#
+# The abi is always "i386" for this file.
+#
+0 i386 restart_syscall sys_restart_syscall __ia32_sys_restart_syscall
+1 i386 exit sys_exit __ia32_sys_exit
+2 i386 fork sys_fork __ia32_sys_fork
+3 i386 read sys_read __ia32_sys_read
+4 i386 write sys_write __ia32_sys_write
+5 i386 open sys_open __ia32_compat_sys_open
+6 i386 close sys_close __ia32_sys_close
+7 i386 waitpid sys_waitpid __ia32_sys_waitpid
+8 i386 creat sys_creat __ia32_sys_creat
+9 i386 link sys_link __ia32_sys_link
+10 i386 unlink sys_unlink __ia32_sys_unlink
+11 i386 execve sys_execve __ia32_compat_sys_execve
+12 i386 chdir sys_chdir __ia32_sys_chdir
+13 i386 time sys_time __ia32_compat_sys_time
+14 i386 mknod sys_mknod __ia32_sys_mknod
+15 i386 chmod sys_chmod __ia32_sys_chmod
+16 i386 lchown sys_lchown16 __ia32_sys_lchown16
+17 i386 break
+18 i386 oldstat sys_stat __ia32_sys_stat
+19 i386 lseek sys_lseek __ia32_compat_sys_lseek
+20 i386 getpid sys_getpid __ia32_sys_getpid
+21 i386 mount sys_mount __ia32_compat_sys_mount
+22 i386 umount sys_oldumount __ia32_sys_oldumount
+23 i386 setuid sys_setuid16 __ia32_sys_setuid16
+24 i386 getuid sys_getuid16 __ia32_sys_getuid16
+25 i386 stime sys_stime __ia32_compat_sys_stime
+26 i386 ptrace sys_ptrace __ia32_compat_sys_ptrace
+27 i386 alarm sys_alarm __ia32_sys_alarm
+28 i386 oldfstat sys_fstat __ia32_sys_fstat
+29 i386 pause sys_pause __ia32_sys_pause
+30 i386 utime sys_utime __ia32_compat_sys_utime
+31 i386 stty
+32 i386 gtty
+33 i386 access sys_access __ia32_sys_access
+34 i386 nice sys_nice __ia32_sys_nice
+35 i386 ftime
+36 i386 sync sys_sync __ia32_sys_sync
+37 i386 kill sys_kill __ia32_sys_kill
+38 i386 rename sys_rename __ia32_sys_rename
+39 i386 mkdir sys_mkdir __ia32_sys_mkdir
+40 i386 rmdir sys_rmdir __ia32_sys_rmdir
+41 i386 dup sys_dup __ia32_sys_dup
+42 i386 pipe sys_pipe __ia32_sys_pipe
+43 i386 times sys_times __ia32_compat_sys_times
+44 i386 prof
+45 i386 brk sys_brk __ia32_sys_brk
+46 i386 setgid sys_setgid16 __ia32_sys_setgid16
+47 i386 getgid sys_getgid16 __ia32_sys_getgid16
+48 i386 signal sys_signal __ia32_sys_signal
+49 i386 geteuid sys_geteuid16 __ia32_sys_geteuid16
+50 i386 getegid sys_getegid16 __ia32_sys_getegid16
+51 i386 acct sys_acct __ia32_sys_acct
+52 i386 umount2 sys_umount __ia32_sys_umount
+53 i386 lock
+54 i386 ioctl sys_ioctl __ia32_compat_sys_ioctl
+55 i386 fcntl sys_fcntl __ia32_compat_sys_fcntl64
+56 i386 mpx
+57 i386 setpgid sys_setpgid __ia32_sys_setpgid
+58 i386 ulimit
+59 i386 oldolduname sys_olduname __ia32_sys_olduname
+60 i386 umask sys_umask __ia32_sys_umask
+61 i386 chroot sys_chroot __ia32_sys_chroot
+62 i386 ustat sys_ustat __ia32_compat_sys_ustat
+63 i386 dup2 sys_dup2 __ia32_sys_dup2
+64 i386 getppid sys_getppid __ia32_sys_getppid
+65 i386 getpgrp sys_getpgrp __ia32_sys_getpgrp
+66 i386 setsid sys_setsid __ia32_sys_setsid
+67 i386 sigaction sys_sigaction __ia32_compat_sys_sigaction
+68 i386 sgetmask sys_sgetmask __ia32_sys_sgetmask
+69 i386 ssetmask sys_ssetmask __ia32_sys_ssetmask
+70 i386 setreuid sys_setreuid16 __ia32_sys_setreuid16
+71 i386 setregid sys_setregid16 __ia32_sys_setregid16
+72 i386 sigsuspend sys_sigsuspend __ia32_sys_sigsuspend
+73 i386 sigpending sys_sigpending __ia32_compat_sys_sigpending
+74 i386 sethostname sys_sethostname __ia32_sys_sethostname
+75 i386 setrlimit sys_setrlimit __ia32_compat_sys_setrlimit
+76 i386 getrlimit sys_old_getrlimit __ia32_compat_sys_old_getrlimit
+77 i386 getrusage sys_getrusage __ia32_compat_sys_getrusage
+78 i386 gettimeofday sys_gettimeofday __ia32_compat_sys_gettimeofday
+79 i386 settimeofday sys_settimeofday __ia32_compat_sys_settimeofday
+80 i386 getgroups sys_getgroups16 __ia32_sys_getgroups16
+81 i386 setgroups sys_setgroups16 __ia32_sys_setgroups16
+82 i386 select sys_old_select __ia32_compat_sys_old_select
+83 i386 symlink sys_symlink __ia32_sys_symlink
+84 i386 oldlstat sys_lstat __ia32_sys_lstat
+85 i386 readlink sys_readlink __ia32_sys_readlink
+86 i386 uselib sys_uselib __ia32_sys_uselib
+87 i386 swapon sys_swapon __ia32_sys_swapon
+88 i386 reboot sys_reboot __ia32_sys_reboot
+89 i386 readdir sys_old_readdir __ia32_compat_sys_old_readdir
+90 i386 mmap sys_old_mmap __ia32_compat_sys_x86_mmap
+91 i386 munmap sys_munmap __ia32_sys_munmap
+92 i386 truncate sys_truncate __ia32_compat_sys_truncate
+93 i386 ftruncate sys_ftruncate __ia32_compat_sys_ftruncate
+94 i386 fchmod sys_fchmod __ia32_sys_fchmod
+95 i386 fchown sys_fchown16 __ia32_sys_fchown16
+96 i386 getpriority sys_getpriority __ia32_sys_getpriority
+97 i386 setpriority sys_setpriority __ia32_sys_setpriority
+98 i386 profil
+99 i386 statfs sys_statfs __ia32_compat_sys_statfs
+100 i386 fstatfs sys_fstatfs __ia32_compat_sys_fstatfs
+101 i386 ioperm sys_ioperm __ia32_sys_ioperm
+102 i386 socketcall sys_socketcall __ia32_compat_sys_socketcall
+103 i386 syslog sys_syslog __ia32_sys_syslog
+104 i386 setitimer sys_setitimer __ia32_compat_sys_setitimer
+105 i386 getitimer sys_getitimer __ia32_compat_sys_getitimer
+106 i386 stat sys_newstat __ia32_compat_sys_newstat
+107 i386 lstat sys_newlstat __ia32_compat_sys_newlstat
+108 i386 fstat sys_newfstat __ia32_compat_sys_newfstat
+109 i386 olduname sys_uname __ia32_sys_uname
+110 i386 iopl sys_iopl __ia32_sys_iopl
+111 i386 vhangup sys_vhangup __ia32_sys_vhangup
+112 i386 idle
+113 i386 vm86old sys_vm86old sys_ni_syscall
+114 i386 wait4 sys_wait4 __ia32_compat_sys_wait4
+115 i386 swapoff sys_swapoff __ia32_sys_swapoff
+116 i386 sysinfo sys_sysinfo __ia32_compat_sys_sysinfo
+117 i386 ipc sys_ipc __ia32_compat_sys_ipc
+118 i386 fsync sys_fsync __ia32_sys_fsync
+119 i386 sigreturn sys_sigreturn sys32_sigreturn
+120 i386 clone sys_clone __ia32_compat_sys_x86_clone
+121 i386 setdomainname sys_setdomainname __ia32_sys_setdomainname
+122 i386 uname sys_newuname __ia32_sys_newuname
+123 i386 modify_ldt sys_modify_ldt __ia32_sys_modify_ldt
+124 i386 adjtimex sys_adjtimex __ia32_compat_sys_adjtimex
+125 i386 mprotect sys_mprotect __ia32_sys_mprotect
+126 i386 sigprocmask sys_sigprocmask __ia32_compat_sys_sigprocmask
+127 i386 create_module
+128 i386 init_module sys_init_module __ia32_sys_init_module
+129 i386 delete_module sys_delete_module __ia32_sys_delete_module
+130 i386 get_kernel_syms
+131 i386 quotactl sys_quotactl __ia32_compat_sys_quotactl32
+132 i386 getpgid sys_getpgid __ia32_sys_getpgid
+133 i386 fchdir sys_fchdir __ia32_sys_fchdir
+134 i386 bdflush sys_bdflush __ia32_sys_bdflush
+135 i386 sysfs sys_sysfs __ia32_sys_sysfs
+136 i386 personality sys_personality __ia32_sys_personality
+137 i386 afs_syscall
+138 i386 setfsuid sys_setfsuid16 __ia32_sys_setfsuid16
+139 i386 setfsgid sys_setfsgid16 __ia32_sys_setfsgid16
+140 i386 _llseek sys_llseek __ia32_sys_llseek
+141 i386 getdents sys_getdents __ia32_compat_sys_getdents
+142 i386 _newselect sys_select __ia32_compat_sys_select
+143 i386 flock sys_flock __ia32_sys_flock
+144 i386 msync sys_msync __ia32_sys_msync
+145 i386 readv sys_readv __ia32_compat_sys_readv
+146 i386 writev sys_writev __ia32_compat_sys_writev
+147 i386 getsid sys_getsid __ia32_sys_getsid
+148 i386 fdatasync sys_fdatasync __ia32_sys_fdatasync
+149 i386 _sysctl sys_sysctl __ia32_compat_sys_sysctl
+150 i386 mlock sys_mlock __ia32_sys_mlock
+151 i386 munlock sys_munlock __ia32_sys_munlock
+152 i386 mlockall sys_mlockall __ia32_sys_mlockall
+153 i386 munlockall sys_munlockall __ia32_sys_munlockall
+154 i386 sched_setparam sys_sched_setparam __ia32_sys_sched_setparam
+155 i386 sched_getparam sys_sched_getparam __ia32_sys_sched_getparam
+156 i386 sched_setscheduler sys_sched_setscheduler __ia32_sys_sched_setscheduler
+157 i386 sched_getscheduler sys_sched_getscheduler __ia32_sys_sched_getscheduler
+158 i386 sched_yield sys_sched_yield __ia32_sys_sched_yield
+159 i386 sched_get_priority_max sys_sched_get_priority_max __ia32_sys_sched_get_priority_max
+160 i386 sched_get_priority_min sys_sched_get_priority_min __ia32_sys_sched_get_priority_min
+161 i386 sched_rr_get_interval sys_sched_rr_get_interval __ia32_compat_sys_sched_rr_get_interval
+162 i386 nanosleep sys_nanosleep __ia32_compat_sys_nanosleep
+163 i386 mremap sys_mremap __ia32_sys_mremap
+164 i386 setresuid sys_setresuid16 __ia32_sys_setresuid16
+165 i386 getresuid sys_getresuid16 __ia32_sys_getresuid16
+166 i386 vm86 sys_vm86 sys_ni_syscall
+167 i386 query_module
+168 i386 poll sys_poll __ia32_sys_poll
+169 i386 nfsservctl
+170 i386 setresgid sys_setresgid16 __ia32_sys_setresgid16
+171 i386 getresgid sys_getresgid16 __ia32_sys_getresgid16
+172 i386 prctl sys_prctl __ia32_sys_prctl
+173 i386 rt_sigreturn sys_rt_sigreturn sys32_rt_sigreturn
+174 i386 rt_sigaction sys_rt_sigaction __ia32_compat_sys_rt_sigaction
+175 i386 rt_sigprocmask sys_rt_sigprocmask __ia32_sys_rt_sigprocmask
+176 i386 rt_sigpending sys_rt_sigpending __ia32_compat_sys_rt_sigpending
+177 i386 rt_sigtimedwait sys_rt_sigtimedwait __ia32_compat_sys_rt_sigtimedwait
+178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo __ia32_compat_sys_rt_sigqueueinfo
+179 i386 rt_sigsuspend sys_rt_sigsuspend __ia32_sys_rt_sigsuspend
+180 i386 pread64 sys_pread64 __ia32_compat_sys_x86_pread
+181 i386 pwrite64 sys_pwrite64 __ia32_compat_sys_x86_pwrite
+182 i386 chown sys_chown16 __ia32_sys_chown16
+183 i386 getcwd sys_getcwd __ia32_sys_getcwd
+184 i386 capget sys_capget __ia32_sys_capget
+185 i386 capset sys_capset __ia32_sys_capset
+186 i386 sigaltstack sys_sigaltstack __ia32_compat_sys_sigaltstack
+187 i386 sendfile sys_sendfile __ia32_compat_sys_sendfile
+188 i386 getpmsg
+189 i386 putpmsg
+190 i386 vfork sys_vfork __ia32_sys_vfork
+191 i386 ugetrlimit sys_getrlimit __ia32_compat_sys_getrlimit
+192 i386 mmap2 sys_mmap_pgoff __ia32_sys_mmap_pgoff
+193 i386 truncate64 sys_truncate64 __ia32_compat_sys_x86_truncate64
+194 i386 ftruncate64 sys_ftruncate64 __ia32_compat_sys_x86_ftruncate64
+195 i386 stat64 sys_stat64 __ia32_compat_sys_x86_stat64
+196 i386 lstat64 sys_lstat64 __ia32_compat_sys_x86_lstat64
+197 i386 fstat64 sys_fstat64 __ia32_compat_sys_x86_fstat64
+198 i386 lchown32 sys_lchown __ia32_sys_lchown
+199 i386 getuid32 sys_getuid __ia32_sys_getuid
+200 i386 getgid32 sys_getgid __ia32_sys_getgid
+201 i386 geteuid32 sys_geteuid __ia32_sys_geteuid
+202 i386 getegid32 sys_getegid __ia32_sys_getegid
+203 i386 setreuid32 sys_setreuid __ia32_sys_setreuid
+204 i386 setregid32 sys_setregid __ia32_sys_setregid
+205 i386 getgroups32 sys_getgroups __ia32_sys_getgroups
+206 i386 setgroups32 sys_setgroups __ia32_sys_setgroups
+207 i386 fchown32 sys_fchown __ia32_sys_fchown
+208 i386 setresuid32 sys_setresuid __ia32_sys_setresuid
+209 i386 getresuid32 sys_getresuid __ia32_sys_getresuid
+210 i386 setresgid32 sys_setresgid __ia32_sys_setresgid
+211 i386 getresgid32 sys_getresgid __ia32_sys_getresgid
+212 i386 chown32 sys_chown __ia32_sys_chown
+213 i386 setuid32 sys_setuid __ia32_sys_setuid
+214 i386 setgid32 sys_setgid __ia32_sys_setgid
+215 i386 setfsuid32 sys_setfsuid __ia32_sys_setfsuid
+216 i386 setfsgid32 sys_setfsgid __ia32_sys_setfsgid
+217 i386 pivot_root sys_pivot_root __ia32_sys_pivot_root
+218 i386 mincore sys_mincore __ia32_sys_mincore
+219 i386 madvise sys_madvise __ia32_sys_madvise
+220 i386 getdents64 sys_getdents64 __ia32_sys_getdents64
+221 i386 fcntl64 sys_fcntl64 __ia32_compat_sys_fcntl64
+# 222 is unused
+# 223 is unused
+224 i386 gettid sys_gettid __ia32_sys_gettid
+225 i386 readahead sys_readahead __ia32_compat_sys_x86_readahead
+226 i386 setxattr sys_setxattr __ia32_sys_setxattr
+227 i386 lsetxattr sys_lsetxattr __ia32_sys_lsetxattr
+228 i386 fsetxattr sys_fsetxattr __ia32_sys_fsetxattr
+229 i386 getxattr sys_getxattr __ia32_sys_getxattr
+230 i386 lgetxattr sys_lgetxattr __ia32_sys_lgetxattr
+231 i386 fgetxattr sys_fgetxattr __ia32_sys_fgetxattr
+232 i386 listxattr sys_listxattr __ia32_sys_listxattr
+233 i386 llistxattr sys_llistxattr __ia32_sys_llistxattr
+234 i386 flistxattr sys_flistxattr __ia32_sys_flistxattr
+235 i386 removexattr sys_removexattr __ia32_sys_removexattr
+236 i386 lremovexattr sys_lremovexattr __ia32_sys_lremovexattr
+237 i386 fremovexattr sys_fremovexattr __ia32_sys_fremovexattr
+238 i386 tkill sys_tkill __ia32_sys_tkill
+239 i386 sendfile64 sys_sendfile64 __ia32_sys_sendfile64
+240 i386 futex sys_futex __ia32_compat_sys_futex
+241 i386 sched_setaffinity sys_sched_setaffinity __ia32_compat_sys_sched_setaffinity
+242 i386 sched_getaffinity sys_sched_getaffinity __ia32_compat_sys_sched_getaffinity
+243 i386 set_thread_area sys_set_thread_area __ia32_sys_set_thread_area
+244 i386 get_thread_area sys_get_thread_area __ia32_sys_get_thread_area
+245 i386 io_setup sys_io_setup __ia32_compat_sys_io_setup
+246 i386 io_destroy sys_io_destroy __ia32_sys_io_destroy
+247 i386 io_getevents sys_io_getevents __ia32_compat_sys_io_getevents
+248 i386 io_submit sys_io_submit __ia32_compat_sys_io_submit
+249 i386 io_cancel sys_io_cancel __ia32_sys_io_cancel
+250 i386 fadvise64 sys_fadvise64 __ia32_compat_sys_x86_fadvise64
+# 251 is available for reuse (was briefly sys_set_zone_reclaim)
+252 i386 exit_group sys_exit_group __ia32_sys_exit_group
+253 i386 lookup_dcookie sys_lookup_dcookie __ia32_compat_sys_lookup_dcookie
+254 i386 epoll_create sys_epoll_create __ia32_sys_epoll_create
+255 i386 epoll_ctl sys_epoll_ctl __ia32_sys_epoll_ctl
+256 i386 epoll_wait sys_epoll_wait __ia32_sys_epoll_wait
+257 i386 remap_file_pages sys_remap_file_pages __ia32_sys_remap_file_pages
+258 i386 set_tid_address sys_set_tid_address __ia32_sys_set_tid_address
+259 i386 timer_create sys_timer_create __ia32_compat_sys_timer_create
+260 i386 timer_settime sys_timer_settime __ia32_compat_sys_timer_settime
+261 i386 timer_gettime sys_timer_gettime __ia32_compat_sys_timer_gettime
+262 i386 timer_getoverrun sys_timer_getoverrun __ia32_sys_timer_getoverrun
+263 i386 timer_delete sys_timer_delete __ia32_sys_timer_delete
+264 i386 clock_settime sys_clock_settime __ia32_compat_sys_clock_settime
+265 i386 clock_gettime sys_clock_gettime __ia32_compat_sys_clock_gettime
+266 i386 clock_getres sys_clock_getres __ia32_compat_sys_clock_getres
+267 i386 clock_nanosleep sys_clock_nanosleep __ia32_compat_sys_clock_nanosleep
+268 i386 statfs64 sys_statfs64 __ia32_compat_sys_statfs64
+269 i386 fstatfs64 sys_fstatfs64 __ia32_compat_sys_fstatfs64
+270 i386 tgkill sys_tgkill __ia32_sys_tgkill
+271 i386 utimes sys_utimes __ia32_compat_sys_utimes
+272 i386 fadvise64_64 sys_fadvise64_64 __ia32_compat_sys_x86_fadvise64_64
+273 i386 vserver
+274 i386 mbind sys_mbind __ia32_sys_mbind
+275 i386 get_mempolicy sys_get_mempolicy __ia32_compat_sys_get_mempolicy
+276 i386 set_mempolicy sys_set_mempolicy __ia32_sys_set_mempolicy
+277 i386 mq_open sys_mq_open __ia32_compat_sys_mq_open
+278 i386 mq_unlink sys_mq_unlink __ia32_sys_mq_unlink
+279 i386 mq_timedsend sys_mq_timedsend __ia32_compat_sys_mq_timedsend
+280 i386 mq_timedreceive sys_mq_timedreceive __ia32_compat_sys_mq_timedreceive
+281 i386 mq_notify sys_mq_notify __ia32_compat_sys_mq_notify
+282 i386 mq_getsetattr sys_mq_getsetattr __ia32_compat_sys_mq_getsetattr
+283 i386 kexec_load sys_kexec_load __ia32_compat_sys_kexec_load
+284 i386 waitid sys_waitid __ia32_compat_sys_waitid
+# 285 sys_setaltroot
+286 i386 add_key sys_add_key __ia32_sys_add_key
+287 i386 request_key sys_request_key __ia32_sys_request_key
+288 i386 keyctl sys_keyctl __ia32_compat_sys_keyctl
+289 i386 ioprio_set sys_ioprio_set __ia32_sys_ioprio_set
+290 i386 ioprio_get sys_ioprio_get __ia32_sys_ioprio_get
+291 i386 inotify_init sys_inotify_init __ia32_sys_inotify_init
+292 i386 inotify_add_watch sys_inotify_add_watch __ia32_sys_inotify_add_watch
+293 i386 inotify_rm_watch sys_inotify_rm_watch __ia32_sys_inotify_rm_watch
+294 i386 migrate_pages sys_migrate_pages __ia32_sys_migrate_pages
+295 i386 openat sys_openat __ia32_compat_sys_openat
+296 i386 mkdirat sys_mkdirat __ia32_sys_mkdirat
+297 i386 mknodat sys_mknodat __ia32_sys_mknodat
+298 i386 fchownat sys_fchownat __ia32_sys_fchownat
+299 i386 futimesat sys_futimesat __ia32_compat_sys_futimesat
+300 i386 fstatat64 sys_fstatat64 __ia32_compat_sys_x86_fstatat
+301 i386 unlinkat sys_unlinkat __ia32_sys_unlinkat
+302 i386 renameat sys_renameat __ia32_sys_renameat
+303 i386 linkat sys_linkat __ia32_sys_linkat
+304 i386 symlinkat sys_symlinkat __ia32_sys_symlinkat
+305 i386 readlinkat sys_readlinkat __ia32_sys_readlinkat
+306 i386 fchmodat sys_fchmodat __ia32_sys_fchmodat
+307 i386 faccessat sys_faccessat __ia32_sys_faccessat
+308 i386 pselect6 sys_pselect6 __ia32_compat_sys_pselect6
+309 i386 ppoll sys_ppoll __ia32_compat_sys_ppoll
+310 i386 unshare sys_unshare __ia32_sys_unshare
+311 i386 set_robust_list sys_set_robust_list __ia32_compat_sys_set_robust_list
+312 i386 get_robust_list sys_get_robust_list __ia32_compat_sys_get_robust_list
+313 i386 splice sys_splice __ia32_sys_splice
+314 i386 sync_file_range sys_sync_file_range __ia32_compat_sys_x86_sync_file_range
+315 i386 tee sys_tee __ia32_sys_tee
+316 i386 vmsplice sys_vmsplice __ia32_compat_sys_vmsplice
+317 i386 move_pages sys_move_pages __ia32_compat_sys_move_pages
+318 i386 getcpu sys_getcpu __ia32_sys_getcpu
+319 i386 epoll_pwait sys_epoll_pwait __ia32_sys_epoll_pwait
+320 i386 utimensat sys_utimensat __ia32_compat_sys_utimensat
+321 i386 signalfd sys_signalfd __ia32_compat_sys_signalfd
+322 i386 timerfd_create sys_timerfd_create __ia32_sys_timerfd_create
+323 i386 eventfd sys_eventfd __ia32_sys_eventfd
+324 i386 fallocate sys_fallocate __ia32_compat_sys_x86_fallocate
+325 i386 timerfd_settime sys_timerfd_settime __ia32_compat_sys_timerfd_settime
+326 i386 timerfd_gettime sys_timerfd_gettime __ia32_compat_sys_timerfd_gettime
+327 i386 signalfd4 sys_signalfd4 __ia32_compat_sys_signalfd4
+328 i386 eventfd2 sys_eventfd2 __ia32_sys_eventfd2
+329 i386 epoll_create1 sys_epoll_create1 __ia32_sys_epoll_create1
+330 i386 dup3 sys_dup3 __ia32_sys_dup3
+331 i386 pipe2 sys_pipe2 __ia32_sys_pipe2
+332 i386 inotify_init1 sys_inotify_init1 __ia32_sys_inotify_init1
+333 i386 preadv sys_preadv __ia32_compat_sys_preadv
+334 i386 pwritev sys_pwritev __ia32_compat_sys_pwritev
+335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo __ia32_compat_sys_rt_tgsigqueueinfo
+336 i386 perf_event_open sys_perf_event_open __ia32_sys_perf_event_open
+337 i386 recvmmsg sys_recvmmsg __ia32_compat_sys_recvmmsg
+338 i386 fanotify_init sys_fanotify_init __ia32_sys_fanotify_init
+339 i386 fanotify_mark sys_fanotify_mark __ia32_compat_sys_fanotify_mark
+340 i386 prlimit64 sys_prlimit64 __ia32_sys_prlimit64
+341 i386 name_to_handle_at sys_name_to_handle_at __ia32_sys_name_to_handle_at
+342 i386 open_by_handle_at sys_open_by_handle_at __ia32_compat_sys_open_by_handle_at
+343 i386 clock_adjtime sys_clock_adjtime __ia32_compat_sys_clock_adjtime
+344 i386 syncfs sys_syncfs __ia32_sys_syncfs
+345 i386 sendmmsg sys_sendmmsg __ia32_compat_sys_sendmmsg
+346 i386 setns sys_setns __ia32_sys_setns
+347 i386 process_vm_readv sys_process_vm_readv __ia32_compat_sys_process_vm_readv
+348 i386 process_vm_writev sys_process_vm_writev __ia32_compat_sys_process_vm_writev
+349 i386 kcmp sys_kcmp __ia32_sys_kcmp
+350 i386 finit_module sys_finit_module __ia32_sys_finit_module
+351 i386 sched_setattr sys_sched_setattr __ia32_sys_sched_setattr
+352 i386 sched_getattr sys_sched_getattr __ia32_sys_sched_getattr
+353 i386 renameat2 sys_renameat2 __ia32_sys_renameat2
+354 i386 seccomp sys_seccomp __ia32_sys_seccomp
+355 i386 getrandom sys_getrandom __ia32_sys_getrandom
+356 i386 memfd_create sys_memfd_create __ia32_sys_memfd_create
+357 i386 bpf sys_bpf __ia32_sys_bpf
+358 i386 execveat sys_execveat __ia32_compat_sys_execveat
+359 i386 socket sys_socket __ia32_sys_socket
+360 i386 socketpair sys_socketpair __ia32_sys_socketpair
+361 i386 bind sys_bind __ia32_sys_bind
+362 i386 connect sys_connect __ia32_sys_connect
+363 i386 listen sys_listen __ia32_sys_listen
+364 i386 accept4 sys_accept4 __ia32_sys_accept4
+365 i386 getsockopt sys_getsockopt __ia32_compat_sys_getsockopt
+366 i386 setsockopt sys_setsockopt __ia32_compat_sys_setsockopt
+367 i386 getsockname sys_getsockname __ia32_sys_getsockname
+368 i386 getpeername sys_getpeername __ia32_sys_getpeername
+369 i386 sendto sys_sendto __ia32_sys_sendto
+370 i386 sendmsg sys_sendmsg __ia32_compat_sys_sendmsg
+371 i386 recvfrom sys_recvfrom __ia32_compat_sys_recvfrom
+372 i386 recvmsg sys_recvmsg __ia32_compat_sys_recvmsg
+373 i386 shutdown sys_shutdown __ia32_sys_shutdown
+374 i386 userfaultfd sys_userfaultfd __ia32_sys_userfaultfd
+375 i386 membarrier sys_membarrier __ia32_sys_membarrier
+376 i386 mlock2 sys_mlock2 __ia32_sys_mlock2
+377 i386 copy_file_range sys_copy_file_range __ia32_sys_copy_file_range
+378 i386 preadv2 sys_preadv2 __ia32_compat_sys_preadv2
+379 i386 pwritev2 sys_pwritev2 __ia32_compat_sys_pwritev2
+380 i386 pkey_mprotect sys_pkey_mprotect __ia32_sys_pkey_mprotect
+381 i386 pkey_alloc sys_pkey_alloc __ia32_sys_pkey_alloc
+382 i386 pkey_free sys_pkey_free __ia32_sys_pkey_free
+383 i386 statx sys_statx __ia32_sys_statx
+384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl
+385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents
+386 i386 rseq sys_rseq __ia32_sys_rseq
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
new file mode 100644
index 000000000..f0b1709a5
--- /dev/null
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -0,0 +1,388 @@
+#
+# 64-bit system call numbers and entry vectors
+#
+# The format is:
+# <number> <abi> <name> <entry point>
+#
+# The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls
+#
+# The abi is "common", "64" or "x32" for this file.
+#
+0 common read __x64_sys_read
+1 common write __x64_sys_write
+2 common open __x64_sys_open
+3 common close __x64_sys_close
+4 common stat __x64_sys_newstat
+5 common fstat __x64_sys_newfstat
+6 common lstat __x64_sys_newlstat
+7 common poll __x64_sys_poll
+8 common lseek __x64_sys_lseek
+9 common mmap __x64_sys_mmap
+10 common mprotect __x64_sys_mprotect
+11 common munmap __x64_sys_munmap
+12 common brk __x64_sys_brk
+13 64 rt_sigaction __x64_sys_rt_sigaction
+14 common rt_sigprocmask __x64_sys_rt_sigprocmask
+15 64 rt_sigreturn __x64_sys_rt_sigreturn/ptregs
+16 64 ioctl __x64_sys_ioctl
+17 common pread64 __x64_sys_pread64
+18 common pwrite64 __x64_sys_pwrite64
+19 64 readv __x64_sys_readv
+20 64 writev __x64_sys_writev
+21 common access __x64_sys_access
+22 common pipe __x64_sys_pipe
+23 common select __x64_sys_select
+24 common sched_yield __x64_sys_sched_yield
+25 common mremap __x64_sys_mremap
+26 common msync __x64_sys_msync
+27 common mincore __x64_sys_mincore
+28 common madvise __x64_sys_madvise
+29 common shmget __x64_sys_shmget
+30 common shmat __x64_sys_shmat
+31 common shmctl __x64_sys_shmctl
+32 common dup __x64_sys_dup
+33 common dup2 __x64_sys_dup2
+34 common pause __x64_sys_pause
+35 common nanosleep __x64_sys_nanosleep
+36 common getitimer __x64_sys_getitimer
+37 common alarm __x64_sys_alarm
+38 common setitimer __x64_sys_setitimer
+39 common getpid __x64_sys_getpid
+40 common sendfile __x64_sys_sendfile64
+41 common socket __x64_sys_socket
+42 common connect __x64_sys_connect
+43 common accept __x64_sys_accept
+44 common sendto __x64_sys_sendto
+45 64 recvfrom __x64_sys_recvfrom
+46 64 sendmsg __x64_sys_sendmsg
+47 64 recvmsg __x64_sys_recvmsg
+48 common shutdown __x64_sys_shutdown
+49 common bind __x64_sys_bind
+50 common listen __x64_sys_listen
+51 common getsockname __x64_sys_getsockname
+52 common getpeername __x64_sys_getpeername
+53 common socketpair __x64_sys_socketpair
+54 64 setsockopt __x64_sys_setsockopt
+55 64 getsockopt __x64_sys_getsockopt
+56 common clone __x64_sys_clone/ptregs
+57 common fork __x64_sys_fork/ptregs
+58 common vfork __x64_sys_vfork/ptregs
+59 64 execve __x64_sys_execve/ptregs
+60 common exit __x64_sys_exit
+61 common wait4 __x64_sys_wait4
+62 common kill __x64_sys_kill
+63 common uname __x64_sys_newuname
+64 common semget __x64_sys_semget
+65 common semop __x64_sys_semop
+66 common semctl __x64_sys_semctl
+67 common shmdt __x64_sys_shmdt
+68 common msgget __x64_sys_msgget
+69 common msgsnd __x64_sys_msgsnd
+70 common msgrcv __x64_sys_msgrcv
+71 common msgctl __x64_sys_msgctl
+72 common fcntl __x64_sys_fcntl
+73 common flock __x64_sys_flock
+74 common fsync __x64_sys_fsync
+75 common fdatasync __x64_sys_fdatasync
+76 common truncate __x64_sys_truncate
+77 common ftruncate __x64_sys_ftruncate
+78 common getdents __x64_sys_getdents
+79 common getcwd __x64_sys_getcwd
+80 common chdir __x64_sys_chdir
+81 common fchdir __x64_sys_fchdir
+82 common rename __x64_sys_rename
+83 common mkdir __x64_sys_mkdir
+84 common rmdir __x64_sys_rmdir
+85 common creat __x64_sys_creat
+86 common link __x64_sys_link
+87 common unlink __x64_sys_unlink
+88 common symlink __x64_sys_symlink
+89 common readlink __x64_sys_readlink
+90 common chmod __x64_sys_chmod
+91 common fchmod __x64_sys_fchmod
+92 common chown __x64_sys_chown
+93 common fchown __x64_sys_fchown
+94 common lchown __x64_sys_lchown
+95 common umask __x64_sys_umask
+96 common gettimeofday __x64_sys_gettimeofday
+97 common getrlimit __x64_sys_getrlimit
+98 common getrusage __x64_sys_getrusage
+99 common sysinfo __x64_sys_sysinfo
+100 common times __x64_sys_times
+101 64 ptrace __x64_sys_ptrace
+102 common getuid __x64_sys_getuid
+103 common syslog __x64_sys_syslog
+104 common getgid __x64_sys_getgid
+105 common setuid __x64_sys_setuid
+106 common setgid __x64_sys_setgid
+107 common geteuid __x64_sys_geteuid
+108 common getegid __x64_sys_getegid
+109 common setpgid __x64_sys_setpgid
+110 common getppid __x64_sys_getppid
+111 common getpgrp __x64_sys_getpgrp
+112 common setsid __x64_sys_setsid
+113 common setreuid __x64_sys_setreuid
+114 common setregid __x64_sys_setregid
+115 common getgroups __x64_sys_getgroups
+116 common setgroups __x64_sys_setgroups
+117 common setresuid __x64_sys_setresuid
+118 common getresuid __x64_sys_getresuid
+119 common setresgid __x64_sys_setresgid
+120 common getresgid __x64_sys_getresgid
+121 common getpgid __x64_sys_getpgid
+122 common setfsuid __x64_sys_setfsuid
+123 common setfsgid __x64_sys_setfsgid
+124 common getsid __x64_sys_getsid
+125 common capget __x64_sys_capget
+126 common capset __x64_sys_capset
+127 64 rt_sigpending __x64_sys_rt_sigpending
+128 64 rt_sigtimedwait __x64_sys_rt_sigtimedwait
+129 64 rt_sigqueueinfo __x64_sys_rt_sigqueueinfo
+130 common rt_sigsuspend __x64_sys_rt_sigsuspend
+131 64 sigaltstack __x64_sys_sigaltstack
+132 common utime __x64_sys_utime
+133 common mknod __x64_sys_mknod
+134 64 uselib
+135 common personality __x64_sys_personality
+136 common ustat __x64_sys_ustat
+137 common statfs __x64_sys_statfs
+138 common fstatfs __x64_sys_fstatfs
+139 common sysfs __x64_sys_sysfs
+140 common getpriority __x64_sys_getpriority
+141 common setpriority __x64_sys_setpriority
+142 common sched_setparam __x64_sys_sched_setparam
+143 common sched_getparam __x64_sys_sched_getparam
+144 common sched_setscheduler __x64_sys_sched_setscheduler
+145 common sched_getscheduler __x64_sys_sched_getscheduler
+146 common sched_get_priority_max __x64_sys_sched_get_priority_max
+147 common sched_get_priority_min __x64_sys_sched_get_priority_min
+148 common sched_rr_get_interval __x64_sys_sched_rr_get_interval
+149 common mlock __x64_sys_mlock
+150 common munlock __x64_sys_munlock
+151 common mlockall __x64_sys_mlockall
+152 common munlockall __x64_sys_munlockall
+153 common vhangup __x64_sys_vhangup
+154 common modify_ldt __x64_sys_modify_ldt
+155 common pivot_root __x64_sys_pivot_root
+156 64 _sysctl __x64_sys_sysctl
+157 common prctl __x64_sys_prctl
+158 common arch_prctl __x64_sys_arch_prctl
+159 common adjtimex __x64_sys_adjtimex
+160 common setrlimit __x64_sys_setrlimit
+161 common chroot __x64_sys_chroot
+162 common sync __x64_sys_sync
+163 common acct __x64_sys_acct
+164 common settimeofday __x64_sys_settimeofday
+165 common mount __x64_sys_mount
+166 common umount2 __x64_sys_umount
+167 common swapon __x64_sys_swapon
+168 common swapoff __x64_sys_swapoff
+169 common reboot __x64_sys_reboot
+170 common sethostname __x64_sys_sethostname
+171 common setdomainname __x64_sys_setdomainname
+172 common iopl __x64_sys_iopl/ptregs
+173 common ioperm __x64_sys_ioperm
+174 64 create_module
+175 common init_module __x64_sys_init_module
+176 common delete_module __x64_sys_delete_module
+177 64 get_kernel_syms
+178 64 query_module
+179 common quotactl __x64_sys_quotactl
+180 64 nfsservctl
+181 common getpmsg
+182 common putpmsg
+183 common afs_syscall
+184 common tuxcall
+185 common security
+186 common gettid __x64_sys_gettid
+187 common readahead __x64_sys_readahead
+188 common setxattr __x64_sys_setxattr
+189 common lsetxattr __x64_sys_lsetxattr
+190 common fsetxattr __x64_sys_fsetxattr
+191 common getxattr __x64_sys_getxattr
+192 common lgetxattr __x64_sys_lgetxattr
+193 common fgetxattr __x64_sys_fgetxattr
+194 common listxattr __x64_sys_listxattr
+195 common llistxattr __x64_sys_llistxattr
+196 common flistxattr __x64_sys_flistxattr
+197 common removexattr __x64_sys_removexattr
+198 common lremovexattr __x64_sys_lremovexattr
+199 common fremovexattr __x64_sys_fremovexattr
+200 common tkill __x64_sys_tkill
+201 common time __x64_sys_time
+202 common futex __x64_sys_futex
+203 common sched_setaffinity __x64_sys_sched_setaffinity
+204 common sched_getaffinity __x64_sys_sched_getaffinity
+205 64 set_thread_area
+206 64 io_setup __x64_sys_io_setup
+207 common io_destroy __x64_sys_io_destroy
+208 common io_getevents __x64_sys_io_getevents
+209 64 io_submit __x64_sys_io_submit
+210 common io_cancel __x64_sys_io_cancel
+211 64 get_thread_area
+212 common lookup_dcookie __x64_sys_lookup_dcookie
+213 common epoll_create __x64_sys_epoll_create
+214 64 epoll_ctl_old
+215 64 epoll_wait_old
+216 common remap_file_pages __x64_sys_remap_file_pages
+217 common getdents64 __x64_sys_getdents64
+218 common set_tid_address __x64_sys_set_tid_address
+219 common restart_syscall __x64_sys_restart_syscall
+220 common semtimedop __x64_sys_semtimedop
+221 common fadvise64 __x64_sys_fadvise64
+222 64 timer_create __x64_sys_timer_create
+223 common timer_settime __x64_sys_timer_settime
+224 common timer_gettime __x64_sys_timer_gettime
+225 common timer_getoverrun __x64_sys_timer_getoverrun
+226 common timer_delete __x64_sys_timer_delete
+227 common clock_settime __x64_sys_clock_settime
+228 common clock_gettime __x64_sys_clock_gettime
+229 common clock_getres __x64_sys_clock_getres
+230 common clock_nanosleep __x64_sys_clock_nanosleep
+231 common exit_group __x64_sys_exit_group
+232 common epoll_wait __x64_sys_epoll_wait
+233 common epoll_ctl __x64_sys_epoll_ctl
+234 common tgkill __x64_sys_tgkill
+235 common utimes __x64_sys_utimes
+236 64 vserver
+237 common mbind __x64_sys_mbind
+238 common set_mempolicy __x64_sys_set_mempolicy
+239 common get_mempolicy __x64_sys_get_mempolicy
+240 common mq_open __x64_sys_mq_open
+241 common mq_unlink __x64_sys_mq_unlink
+242 common mq_timedsend __x64_sys_mq_timedsend
+243 common mq_timedreceive __x64_sys_mq_timedreceive
+244 64 mq_notify __x64_sys_mq_notify
+245 common mq_getsetattr __x64_sys_mq_getsetattr
+246 64 kexec_load __x64_sys_kexec_load
+247 64 waitid __x64_sys_waitid
+248 common add_key __x64_sys_add_key
+249 common request_key __x64_sys_request_key
+250 common keyctl __x64_sys_keyctl
+251 common ioprio_set __x64_sys_ioprio_set
+252 common ioprio_get __x64_sys_ioprio_get
+253 common inotify_init __x64_sys_inotify_init
+254 common inotify_add_watch __x64_sys_inotify_add_watch
+255 common inotify_rm_watch __x64_sys_inotify_rm_watch
+256 common migrate_pages __x64_sys_migrate_pages
+257 common openat __x64_sys_openat
+258 common mkdirat __x64_sys_mkdirat
+259 common mknodat __x64_sys_mknodat
+260 common fchownat __x64_sys_fchownat
+261 common futimesat __x64_sys_futimesat
+262 common newfstatat __x64_sys_newfstatat
+263 common unlinkat __x64_sys_unlinkat
+264 common renameat __x64_sys_renameat
+265 common linkat __x64_sys_linkat
+266 common symlinkat __x64_sys_symlinkat
+267 common readlinkat __x64_sys_readlinkat
+268 common fchmodat __x64_sys_fchmodat
+269 common faccessat __x64_sys_faccessat
+270 common pselect6 __x64_sys_pselect6
+271 common ppoll __x64_sys_ppoll
+272 common unshare __x64_sys_unshare
+273 64 set_robust_list __x64_sys_set_robust_list
+274 64 get_robust_list __x64_sys_get_robust_list
+275 common splice __x64_sys_splice
+276 common tee __x64_sys_tee
+277 common sync_file_range __x64_sys_sync_file_range
+278 64 vmsplice __x64_sys_vmsplice
+279 64 move_pages __x64_sys_move_pages
+280 common utimensat __x64_sys_utimensat
+281 common epoll_pwait __x64_sys_epoll_pwait
+282 common signalfd __x64_sys_signalfd
+283 common timerfd_create __x64_sys_timerfd_create
+284 common eventfd __x64_sys_eventfd
+285 common fallocate __x64_sys_fallocate
+286 common timerfd_settime __x64_sys_timerfd_settime
+287 common timerfd_gettime __x64_sys_timerfd_gettime
+288 common accept4 __x64_sys_accept4
+289 common signalfd4 __x64_sys_signalfd4
+290 common eventfd2 __x64_sys_eventfd2
+291 common epoll_create1 __x64_sys_epoll_create1
+292 common dup3 __x64_sys_dup3
+293 common pipe2 __x64_sys_pipe2
+294 common inotify_init1 __x64_sys_inotify_init1
+295 64 preadv __x64_sys_preadv
+296 64 pwritev __x64_sys_pwritev
+297 64 rt_tgsigqueueinfo __x64_sys_rt_tgsigqueueinfo
+298 common perf_event_open __x64_sys_perf_event_open
+299 64 recvmmsg __x64_sys_recvmmsg
+300 common fanotify_init __x64_sys_fanotify_init
+301 common fanotify_mark __x64_sys_fanotify_mark
+302 common prlimit64 __x64_sys_prlimit64
+303 common name_to_handle_at __x64_sys_name_to_handle_at
+304 common open_by_handle_at __x64_sys_open_by_handle_at
+305 common clock_adjtime __x64_sys_clock_adjtime
+306 common syncfs __x64_sys_syncfs
+307 64 sendmmsg __x64_sys_sendmmsg
+308 common setns __x64_sys_setns
+309 common getcpu __x64_sys_getcpu
+310 64 process_vm_readv __x64_sys_process_vm_readv
+311 64 process_vm_writev __x64_sys_process_vm_writev
+312 common kcmp __x64_sys_kcmp
+313 common finit_module __x64_sys_finit_module
+314 common sched_setattr __x64_sys_sched_setattr
+315 common sched_getattr __x64_sys_sched_getattr
+316 common renameat2 __x64_sys_renameat2
+317 common seccomp __x64_sys_seccomp
+318 common getrandom __x64_sys_getrandom
+319 common memfd_create __x64_sys_memfd_create
+320 common kexec_file_load __x64_sys_kexec_file_load
+321 common bpf __x64_sys_bpf
+322 64 execveat __x64_sys_execveat/ptregs
+323 common userfaultfd __x64_sys_userfaultfd
+324 common membarrier __x64_sys_membarrier
+325 common mlock2 __x64_sys_mlock2
+326 common copy_file_range __x64_sys_copy_file_range
+327 64 preadv2 __x64_sys_preadv2
+328 64 pwritev2 __x64_sys_pwritev2
+329 common pkey_mprotect __x64_sys_pkey_mprotect
+330 common pkey_alloc __x64_sys_pkey_alloc
+331 common pkey_free __x64_sys_pkey_free
+332 common statx __x64_sys_statx
+333 common io_pgetevents __x64_sys_io_pgetevents
+334 common rseq __x64_sys_rseq
+
+#
+# x32-specific system call numbers start at 512 to avoid cache impact
+# for native 64-bit operation. The __x32_compat_sys stubs are created
+# on-the-fly for compat_sys_*() compatibility system calls if X86_X32
+# is defined.
+#
+512 x32 rt_sigaction __x32_compat_sys_rt_sigaction
+513 x32 rt_sigreturn sys32_x32_rt_sigreturn
+514 x32 ioctl __x32_compat_sys_ioctl
+515 x32 readv __x32_compat_sys_readv
+516 x32 writev __x32_compat_sys_writev
+517 x32 recvfrom __x32_compat_sys_recvfrom
+518 x32 sendmsg __x32_compat_sys_sendmsg
+519 x32 recvmsg __x32_compat_sys_recvmsg
+520 x32 execve __x32_compat_sys_execve/ptregs
+521 x32 ptrace __x32_compat_sys_ptrace
+522 x32 rt_sigpending __x32_compat_sys_rt_sigpending
+523 x32 rt_sigtimedwait __x32_compat_sys_rt_sigtimedwait
+524 x32 rt_sigqueueinfo __x32_compat_sys_rt_sigqueueinfo
+525 x32 sigaltstack __x32_compat_sys_sigaltstack
+526 x32 timer_create __x32_compat_sys_timer_create
+527 x32 mq_notify __x32_compat_sys_mq_notify
+528 x32 kexec_load __x32_compat_sys_kexec_load
+529 x32 waitid __x32_compat_sys_waitid
+530 x32 set_robust_list __x32_compat_sys_set_robust_list
+531 x32 get_robust_list __x32_compat_sys_get_robust_list
+532 x32 vmsplice __x32_compat_sys_vmsplice
+533 x32 move_pages __x32_compat_sys_move_pages
+534 x32 preadv __x32_compat_sys_preadv64
+535 x32 pwritev __x32_compat_sys_pwritev64
+536 x32 rt_tgsigqueueinfo __x32_compat_sys_rt_tgsigqueueinfo
+537 x32 recvmmsg __x32_compat_sys_recvmmsg
+538 x32 sendmmsg __x32_compat_sys_sendmmsg
+539 x32 process_vm_readv __x32_compat_sys_process_vm_readv
+540 x32 process_vm_writev __x32_compat_sys_process_vm_writev
+541 x32 setsockopt __x32_compat_sys_setsockopt
+542 x32 getsockopt __x32_compat_sys_getsockopt
+543 x32 io_setup __x32_compat_sys_io_setup
+544 x32 io_submit __x32_compat_sys_io_submit
+545 x32 execveat __x32_compat_sys_execveat/ptregs
+546 x32 preadv2 __x32_compat_sys_preadv64v2
+547 x32 pwritev2 __x32_compat_sys_pwritev64v2
diff --git a/arch/x86/entry/syscalls/syscallhdr.sh b/arch/x86/entry/syscalls/syscallhdr.sh
new file mode 100644
index 000000000..12fbbcfe7
--- /dev/null
+++ b/arch/x86/entry/syscalls/syscallhdr.sh
@@ -0,0 +1,28 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+in="$1"
+out="$2"
+my_abis=`echo "($3)" | tr ',' '|'`
+prefix="$4"
+offset="$5"
+
+fileguard=_ASM_X86_`basename "$out" | sed \
+ -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \
+ -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'`
+grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
+ echo "#ifndef ${fileguard}"
+ echo "#define ${fileguard} 1"
+ echo ""
+
+ while read nr abi name entry ; do
+ if [ -z "$offset" ]; then
+ echo "#define __NR_${prefix}${name} $nr"
+ else
+ echo "#define __NR_${prefix}${name} ($offset + $nr)"
+ fi
+ done
+
+ echo ""
+ echo "#endif /* ${fileguard} */"
+) > "$out"
diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh
new file mode 100644
index 000000000..94fcd1951
--- /dev/null
+++ b/arch/x86/entry/syscalls/syscalltbl.sh
@@ -0,0 +1,81 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+in="$1"
+out="$2"
+
+syscall_macro() {
+ abi="$1"
+ nr="$2"
+ entry="$3"
+
+ # Entry can be either just a function name or "function/qualifier"
+ real_entry="${entry%%/*}"
+ if [ "$entry" = "$real_entry" ]; then
+ qualifier=
+ else
+ qualifier=${entry#*/}
+ fi
+
+ echo "__SYSCALL_${abi}($nr, $real_entry, $qualifier)"
+}
+
+emit() {
+ abi="$1"
+ nr="$2"
+ entry="$3"
+ compat="$4"
+ umlentry=""
+
+ if [ "$abi" = "64" -a -n "$compat" ]; then
+ echo "a compat entry for a 64-bit syscall makes no sense" >&2
+ exit 1
+ fi
+
+ # For CONFIG_UML, we need to strip the __x64_sys prefix
+ if [ "$abi" = "64" -a "${entry}" != "${entry#__x64_sys}" ]; then
+ umlentry="sys${entry#__x64_sys}"
+ fi
+
+ if [ -z "$compat" ]; then
+ if [ -n "$entry" -a -z "$umlentry" ]; then
+ syscall_macro "$abi" "$nr" "$entry"
+ elif [ -n "$umlentry" ]; then # implies -n "$entry"
+ echo "#ifdef CONFIG_X86"
+ syscall_macro "$abi" "$nr" "$entry"
+ echo "#else /* CONFIG_UML */"
+ syscall_macro "$abi" "$nr" "$umlentry"
+ echo "#endif"
+ fi
+ else
+ echo "#ifdef CONFIG_X86_32"
+ if [ -n "$entry" ]; then
+ syscall_macro "$abi" "$nr" "$entry"
+ fi
+ echo "#else"
+ syscall_macro "$abi" "$nr" "$compat"
+ echo "#endif"
+ fi
+}
+
+grep '^[0-9]' "$in" | sort -n | (
+ while read nr abi name entry compat; do
+ abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
+ if [ "$abi" = "COMMON" -o "$abi" = "64" ]; then
+ # COMMON is the same as 64, except that we don't expect X32
+ # programs to use it. Our expectation has nothing to do with
+ # any generated code, so treat them the same.
+ emit 64 "$nr" "$entry" "$compat"
+ elif [ "$abi" = "X32" ]; then
+ # X32 is equivalent to 64 on an X32-compatible kernel.
+ echo "#ifdef CONFIG_X86_X32_ABI"
+ emit 64 "$nr" "$entry" "$compat"
+ echo "#endif"
+ elif [ "$abi" = "I386" ]; then
+ emit "$abi" "$nr" "$entry" "$compat"
+ else
+ echo "Unknown abi $abi" >&2
+ exit 1
+ fi
+ done
+) > "$out"
diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S
new file mode 100644
index 000000000..fee6bc79b
--- /dev/null
+++ b/arch/x86/entry/thunk_32.S
@@ -0,0 +1,43 @@
+/*
+ * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash)
+ * Copyright 2008 by Steven Rostedt, Red Hat, Inc
+ * (inspired by Andi Kleen's thunk_64.S)
+ * Subject to the GNU public license, v.2. No warranty of any kind.
+ */
+ #include <linux/linkage.h>
+ #include <asm/asm.h>
+ #include <asm/export.h>
+
+ /* put return address in eax (arg1) */
+ .macro THUNK name, func, put_ret_addr_in_eax=0
+ .globl \name
+\name:
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+
+ .if \put_ret_addr_in_eax
+ /* Place EIP in the arg1 */
+ movl 3*4(%esp), %eax
+ .endif
+
+ call \func
+ popl %edx
+ popl %ecx
+ popl %eax
+ ret
+ _ASM_NOKPROBE(\name)
+ .endm
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1
+ THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
+#endif
+
+#ifdef CONFIG_PREEMPT
+ THUNK ___preempt_schedule, preempt_schedule
+ THUNK ___preempt_schedule_notrace, preempt_schedule_notrace
+ EXPORT_SYMBOL(___preempt_schedule)
+ EXPORT_SYMBOL(___preempt_schedule_notrace)
+#endif
+
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
new file mode 100644
index 000000000..be36bf4e0
--- /dev/null
+++ b/arch/x86/entry/thunk_64.S
@@ -0,0 +1,73 @@
+/*
+ * Save registers before calling assembly functions. This avoids
+ * disturbance of register allocation in some inline assembly constructs.
+ * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
+ * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
+ * Subject to the GNU public license, v.2. No warranty of any kind.
+ */
+#include <linux/linkage.h>
+#include "calling.h"
+#include <asm/asm.h>
+#include <asm/export.h>
+
+ /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
+ .macro THUNK name, func, put_ret_addr_in_rdi=0
+ .globl \name
+ .type \name, @function
+\name:
+ pushq %rbp
+ movq %rsp, %rbp
+
+ pushq %rdi
+ pushq %rsi
+ pushq %rdx
+ pushq %rcx
+ pushq %rax
+ pushq %r8
+ pushq %r9
+ pushq %r10
+ pushq %r11
+
+ .if \put_ret_addr_in_rdi
+ /* 8(%rbp) is return addr on stack */
+ movq 8(%rbp), %rdi
+ .endif
+
+ call \func
+ jmp .L_restore
+ _ASM_NOKPROBE(\name)
+ .endm
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1
+ THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
+#endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ THUNK lockdep_sys_exit_thunk,lockdep_sys_exit
+#endif
+
+#ifdef CONFIG_PREEMPT
+ THUNK ___preempt_schedule, preempt_schedule
+ THUNK ___preempt_schedule_notrace, preempt_schedule_notrace
+ EXPORT_SYMBOL(___preempt_schedule)
+ EXPORT_SYMBOL(___preempt_schedule_notrace)
+#endif
+
+#if defined(CONFIG_TRACE_IRQFLAGS) \
+ || defined(CONFIG_DEBUG_LOCK_ALLOC) \
+ || defined(CONFIG_PREEMPT)
+.L_restore:
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rcx
+ popq %rdx
+ popq %rsi
+ popq %rdi
+ popq %rbp
+ ret
+ _ASM_NOKPROBE(.L_restore)
+#endif
diff --git a/arch/x86/entry/vdso/.gitignore b/arch/x86/entry/vdso/.gitignore
new file mode 100644
index 000000000..aae8ffdd5
--- /dev/null
+++ b/arch/x86/entry/vdso/.gitignore
@@ -0,0 +1,7 @@
+vdso.lds
+vdsox32.lds
+vdso32-syscall-syms.lds
+vdso32-sysenter-syms.lds
+vdso32-int80-syms.lds
+vdso-image-*.c
+vdso2c
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
new file mode 100644
index 000000000..5bfe2243a
--- /dev/null
+++ b/arch/x86/entry/vdso/Makefile
@@ -0,0 +1,205 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Building vDSO images for x86.
+#
+
+KBUILD_CFLAGS += $(DISABLE_LTO)
+KASAN_SANITIZE := n
+UBSAN_SANITIZE := n
+OBJECT_FILES_NON_STANDARD := y
+
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT := n
+
+VDSO64-$(CONFIG_X86_64) := y
+VDSOX32-$(CONFIG_X86_X32_ABI) := y
+VDSO32-$(CONFIG_X86_32) := y
+VDSO32-$(CONFIG_IA32_EMULATION) := y
+
+# files to link into the vdso
+vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
+
+# files to link into kernel
+obj-y += vma.o
+OBJECT_FILES_NON_STANDARD_vma.o := n
+
+# vDSO images to build
+vdso_img-$(VDSO64-y) += 64
+vdso_img-$(VDSOX32-y) += x32
+vdso_img-$(VDSO32-y) += 32
+
+obj-$(VDSO32-y) += vdso32-setup.o
+
+vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
+
+$(obj)/vdso.o: $(obj)/vdso.so
+
+targets += vdso.lds $(vobjs-y)
+
+# Build the vDSO image C files and link them in.
+vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o)
+vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c)
+vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg)
+obj-y += $(vdso_img_objs)
+targets += $(vdso_img_cfiles)
+targets += $(vdso_img_sodbg) $(vdso_img-y:%=vdso%.so)
+
+CPPFLAGS_vdso.lds += -P -C
+
+VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 --no-undefined \
+ -z max-page-size=4096
+
+$(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE
+ $(call if_changed,vdso)
+
+HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/$(SUBARCH)/include/uapi
+hostprogs-y += vdso2c
+
+quiet_cmd_vdso2c = VDSO2C $@
+ cmd_vdso2c = $(obj)/vdso2c $< $(<:%.dbg=%) $@
+
+$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE
+ $(call if_changed,vdso2c)
+
+#
+# Don't omit frame pointers for ease of userspace debugging, but do
+# optimize sibling calls.
+#
+CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
+ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
+ -fno-omit-frame-pointer -foptimize-sibling-calls \
+ -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
+
+ifdef CONFIG_RETPOLINE
+ifneq ($(RETPOLINE_VDSO_CFLAGS),)
+ CFL += $(RETPOLINE_VDSO_CFLAGS)
+endif
+endif
+
+$(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
+
+#
+# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
+#
+CFLAGS_REMOVE_vdso-note.o = -pg
+CFLAGS_REMOVE_vclock_gettime.o = -pg
+CFLAGS_REMOVE_vgetcpu.o = -pg
+CFLAGS_REMOVE_vvar.o = -pg
+
+#
+# X32 processes use x32 vDSO to access 64bit kernel data.
+#
+# Build x32 vDSO image:
+# 1. Compile x32 vDSO as 64bit.
+# 2. Convert object files to x32.
+# 3. Build x32 VDSO image with x32 objects, which contains 64bit codes
+# so that it can reach 64bit address space with 64bit pointers.
+#
+
+CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds)
+VDSO_LDFLAGS_vdsox32.lds = -m elf32_x86_64 -soname linux-vdso.so.1 \
+ -z max-page-size=4096
+
+# x32-rebranded versions
+vobjx32s-y := $(vobjs-y:.o=-x32.o)
+
+# same thing, but in the output directory
+vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F)
+
+# Convert 64bit object file to x32 for x32 vDSO.
+quiet_cmd_x32 = X32 $@
+ cmd_x32 = $(OBJCOPY) -O elf32-x86-64 $< $@
+
+$(obj)/%-x32.o: $(obj)/%.o FORCE
+ $(call if_changed,x32)
+
+targets += vdsox32.lds $(vobjx32s-y)
+
+$(obj)/%.so: OBJCOPYFLAGS := -S
+$(obj)/%.so: $(obj)/%.so.dbg
+ $(call if_changed,objcopy)
+
+$(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE
+ $(call if_changed,vdso)
+
+CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
+VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -soname linux-gate.so.1
+
+targets += vdso32/vdso32.lds
+targets += vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
+targets += vdso32/vclock_gettime.o
+
+KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO
+$(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
+$(obj)/vdso32.so.dbg: asflags-$(CONFIG_X86_64) += -m32
+
+KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
+KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
+KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector)
+KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
+KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
+KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING
+
+ifdef CONFIG_RETPOLINE
+ifneq ($(RETPOLINE_VDSO_CFLAGS),)
+ KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS)
+endif
+endif
+
+$(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
+
+$(obj)/vdso32.so.dbg: FORCE \
+ $(obj)/vdso32/vdso32.lds \
+ $(obj)/vdso32/vclock_gettime.o \
+ $(obj)/vdso32/note.o \
+ $(obj)/vdso32/system_call.o \
+ $(obj)/vdso32/sigreturn.o
+ $(call if_changed,vdso)
+
+#
+# The DSO images are built using a special linker script.
+#
+quiet_cmd_vdso = VDSO $@
+ cmd_vdso = $(LD) -nostdlib -o $@ \
+ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
+ -T $(filter %.lds,$^) $(filter %.o,$^) && \
+ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
+
+VDSO_LDFLAGS = -shared $(call ld-option, --hash-style=both) \
+ $(call ld-option, --build-id) $(call ld-option, --eh-frame-hdr) \
+ -Bsymbolic
+GCOV_PROFILE := n
+
+#
+# Install the unstripped copies of vdso*.so. If our toolchain supports
+# build-id, install .build-id links as well.
+#
+quiet_cmd_vdso_install = INSTALL $(@:install_%=%)
+define cmd_vdso_install
+ cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \
+ if readelf -n $< |grep -q 'Build ID'; then \
+ buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \
+ first=`echo $$buildid | cut -b-2`; \
+ last=`echo $$buildid | cut -b3-`; \
+ mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \
+ ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \
+ fi
+endef
+
+vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%)
+
+$(MODLIB)/vdso: FORCE
+ @mkdir -p $(MODLIB)/vdso
+
+$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso
+ $(call cmd,vdso_install)
+
+PHONY += vdso_install $(vdso_img_insttargets)
+vdso_install: $(vdso_img_insttargets)
+
+clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so*
diff --git a/arch/x86/entry/vdso/checkundef.sh b/arch/x86/entry/vdso/checkundef.sh
new file mode 100755
index 000000000..7ee90a9b5
--- /dev/null
+++ b/arch/x86/entry/vdso/checkundef.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+nm="$1"
+file="$2"
+$nm "$file" | grep '^ *U' > /dev/null 2>&1
+if [ $? -eq 1 ]; then
+ exit 0
+else
+ echo "$file: undefined symbols found" >&2
+ exit 1
+fi
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
new file mode 100644
index 000000000..8a88e738f
--- /dev/null
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright 2006 Andi Kleen, SUSE Labs.
+ * Subject to the GNU Public License, v.2
+ *
+ * Fast user context implementation of clock_gettime, gettimeofday, and time.
+ *
+ * 32 Bit compat layer by Stefani Seibold <stefani@seibold.net>
+ * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany
+ *
+ * The code should have no internal unresolved relocations.
+ * Check with readelf after changing.
+ */
+
+#include <uapi/linux/time.h>
+#include <asm/vgtod.h>
+#include <asm/vvar.h>
+#include <asm/unistd.h>
+#include <asm/msr.h>
+#include <asm/pvclock.h>
+#include <asm/mshyperv.h>
+#include <linux/math64.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+
+#define gtod (&VVAR(vsyscall_gtod_data))
+
+extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts);
+extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz);
+extern time_t __vdso_time(time_t *t);
+
+#ifdef CONFIG_PARAVIRT_CLOCK
+extern u8 pvclock_page[PAGE_SIZE]
+ __attribute__((visibility("hidden")));
+#endif
+
+#ifdef CONFIG_HYPERV_TSCPAGE
+extern u8 hvclock_page[PAGE_SIZE]
+ __attribute__((visibility("hidden")));
+#endif
+
+#ifndef BUILD_VDSO32
+
+notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
+{
+ long ret;
+ asm ("syscall" : "=a" (ret), "=m" (*ts) :
+ "0" (__NR_clock_gettime), "D" (clock), "S" (ts) :
+ "memory", "rcx", "r11");
+ return ret;
+}
+
+notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
+{
+ long ret;
+
+ asm ("syscall" : "=a" (ret), "=m" (*tv), "=m" (*tz) :
+ "0" (__NR_gettimeofday), "D" (tv), "S" (tz) :
+ "memory", "rcx", "r11");
+ return ret;
+}
+
+
+#else
+
+notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
+{
+ long ret;
+
+ asm (
+ "mov %%ebx, %%edx \n"
+ "mov %[clock], %%ebx \n"
+ "call __kernel_vsyscall \n"
+ "mov %%edx, %%ebx \n"
+ : "=a" (ret), "=m" (*ts)
+ : "0" (__NR_clock_gettime), [clock] "g" (clock), "c" (ts)
+ : "memory", "edx");
+ return ret;
+}
+
+notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
+{
+ long ret;
+
+ asm (
+ "mov %%ebx, %%edx \n"
+ "mov %[tv], %%ebx \n"
+ "call __kernel_vsyscall \n"
+ "mov %%edx, %%ebx \n"
+ : "=a" (ret), "=m" (*tv), "=m" (*tz)
+ : "0" (__NR_gettimeofday), [tv] "g" (tv), "c" (tz)
+ : "memory", "edx");
+ return ret;
+}
+
+#endif
+
+#ifdef CONFIG_PARAVIRT_CLOCK
+static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void)
+{
+ return (const struct pvclock_vsyscall_time_info *)&pvclock_page;
+}
+
+static notrace u64 vread_pvclock(int *mode)
+{
+ const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti;
+ u64 ret;
+ u64 last;
+ u32 version;
+
+ /*
+ * Note: The kernel and hypervisor must guarantee that cpu ID
+ * number maps 1:1 to per-CPU pvclock time info.
+ *
+ * Because the hypervisor is entirely unaware of guest userspace
+ * preemption, it cannot guarantee that per-CPU pvclock time
+ * info is updated if the underlying CPU changes or that that
+ * version is increased whenever underlying CPU changes.
+ *
+ * On KVM, we are guaranteed that pvti updates for any vCPU are
+ * atomic as seen by *all* vCPUs. This is an even stronger
+ * guarantee than we get with a normal seqlock.
+ *
+ * On Xen, we don't appear to have that guarantee, but Xen still
+ * supplies a valid seqlock using the version field.
+ *
+ * We only do pvclock vdso timing at all if
+ * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
+ * mean that all vCPUs have matching pvti and that the TSC is
+ * synced, so we can just look at vCPU 0's pvti.
+ */
+
+ do {
+ version = pvclock_read_begin(pvti);
+
+ if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
+ *mode = VCLOCK_NONE;
+ return 0;
+ }
+
+ ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
+ } while (pvclock_read_retry(pvti, version));
+
+ /* refer to vread_tsc() comment for rationale */
+ last = gtod->cycle_last;
+
+ if (likely(ret >= last))
+ return ret;
+
+ return last;
+}
+#endif
+#ifdef CONFIG_HYPERV_TSCPAGE
+static notrace u64 vread_hvclock(int *mode)
+{
+ const struct ms_hyperv_tsc_page *tsc_pg =
+ (const struct ms_hyperv_tsc_page *)&hvclock_page;
+ u64 current_tick = hv_read_tsc_page(tsc_pg);
+
+ if (current_tick != U64_MAX)
+ return current_tick;
+
+ *mode = VCLOCK_NONE;
+ return 0;
+}
+#endif
+
+notrace static u64 vread_tsc(void)
+{
+ u64 ret = (u64)rdtsc_ordered();
+ u64 last = gtod->cycle_last;
+
+ if (likely(ret >= last))
+ return ret;
+
+ /*
+ * GCC likes to generate cmov here, but this branch is extremely
+ * predictable (it's just a function of time and the likely is
+ * very likely) and there's a data dependence, so force GCC
+ * to generate a branch instead. I don't barrier() because
+ * we don't actually need a barrier, and if this function
+ * ever gets inlined it will generate worse code.
+ */
+ asm volatile ("");
+ return last;
+}
+
+notrace static inline u64 vgetsns(int *mode)
+{
+ u64 v;
+ cycles_t cycles;
+
+ if (gtod->vclock_mode == VCLOCK_TSC)
+ cycles = vread_tsc();
+
+ /*
+ * For any memory-mapped vclock type, we need to make sure that gcc
+ * doesn't cleverly hoist a load before the mode check. Otherwise we
+ * might end up touching the memory-mapped page even if the vclock in
+ * question isn't enabled, which will segfault. Hence the barriers.
+ */
+#ifdef CONFIG_PARAVIRT_CLOCK
+ else if (gtod->vclock_mode == VCLOCK_PVCLOCK) {
+ barrier();
+ cycles = vread_pvclock(mode);
+ }
+#endif
+#ifdef CONFIG_HYPERV_TSCPAGE
+ else if (gtod->vclock_mode == VCLOCK_HVCLOCK) {
+ barrier();
+ cycles = vread_hvclock(mode);
+ }
+#endif
+ else
+ return 0;
+ v = (cycles - gtod->cycle_last) & gtod->mask;
+ return v * gtod->mult;
+}
+
+/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */
+notrace static int __always_inline do_realtime(struct timespec *ts)
+{
+ unsigned long seq;
+ u64 ns;
+ int mode;
+
+ do {
+ seq = gtod_read_begin(gtod);
+ mode = gtod->vclock_mode;
+ ts->tv_sec = gtod->wall_time_sec;
+ ns = gtod->wall_time_snsec;
+ ns += vgetsns(&mode);
+ ns >>= gtod->shift;
+ } while (unlikely(gtod_read_retry(gtod, seq)));
+
+ ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+ ts->tv_nsec = ns;
+
+ return mode;
+}
+
+notrace static int __always_inline do_monotonic(struct timespec *ts)
+{
+ unsigned long seq;
+ u64 ns;
+ int mode;
+
+ do {
+ seq = gtod_read_begin(gtod);
+ mode = gtod->vclock_mode;
+ ts->tv_sec = gtod->monotonic_time_sec;
+ ns = gtod->monotonic_time_snsec;
+ ns += vgetsns(&mode);
+ ns >>= gtod->shift;
+ } while (unlikely(gtod_read_retry(gtod, seq)));
+
+ ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+ ts->tv_nsec = ns;
+
+ return mode;
+}
+
+notrace static void do_realtime_coarse(struct timespec *ts)
+{
+ unsigned long seq;
+ do {
+ seq = gtod_read_begin(gtod);
+ ts->tv_sec = gtod->wall_time_coarse_sec;
+ ts->tv_nsec = gtod->wall_time_coarse_nsec;
+ } while (unlikely(gtod_read_retry(gtod, seq)));
+}
+
+notrace static void do_monotonic_coarse(struct timespec *ts)
+{
+ unsigned long seq;
+ do {
+ seq = gtod_read_begin(gtod);
+ ts->tv_sec = gtod->monotonic_time_coarse_sec;
+ ts->tv_nsec = gtod->monotonic_time_coarse_nsec;
+ } while (unlikely(gtod_read_retry(gtod, seq)));
+}
+
+notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
+{
+ switch (clock) {
+ case CLOCK_REALTIME:
+ if (do_realtime(ts) == VCLOCK_NONE)
+ goto fallback;
+ break;
+ case CLOCK_MONOTONIC:
+ if (do_monotonic(ts) == VCLOCK_NONE)
+ goto fallback;
+ break;
+ case CLOCK_REALTIME_COARSE:
+ do_realtime_coarse(ts);
+ break;
+ case CLOCK_MONOTONIC_COARSE:
+ do_monotonic_coarse(ts);
+ break;
+ default:
+ goto fallback;
+ }
+
+ return 0;
+fallback:
+ return vdso_fallback_gettime(clock, ts);
+}
+int clock_gettime(clockid_t, struct timespec *)
+ __attribute__((weak, alias("__vdso_clock_gettime")));
+
+notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
+{
+ if (likely(tv != NULL)) {
+ if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE))
+ return vdso_fallback_gtod(tv, tz);
+ tv->tv_usec /= 1000;
+ }
+ if (unlikely(tz != NULL)) {
+ tz->tz_minuteswest = gtod->tz_minuteswest;
+ tz->tz_dsttime = gtod->tz_dsttime;
+ }
+
+ return 0;
+}
+int gettimeofday(struct timeval *, struct timezone *)
+ __attribute__((weak, alias("__vdso_gettimeofday")));
+
+/*
+ * This will break when the xtime seconds get inaccurate, but that is
+ * unlikely
+ */
+notrace time_t __vdso_time(time_t *t)
+{
+ /* This is atomic on x86 so we don't need any locks. */
+ time_t result = READ_ONCE(gtod->wall_time_sec);
+
+ if (t)
+ *t = result;
+ return result;
+}
+time_t time(time_t *t)
+ __attribute__((weak, alias("__vdso_time")));
diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
new file mode 100644
index 000000000..acfd5ba7d
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <asm/vdso.h>
+
+/*
+ * Linker script for vDSO. This is an ELF shared object prelinked to
+ * its virtual address, and with only one read-only segment.
+ * This script controls its layout.
+ */
+
+#if defined(BUILD_VDSO64)
+# define SHDR_SIZE 64
+#elif defined(BUILD_VDSO32) || defined(BUILD_VDSOX32)
+# define SHDR_SIZE 40
+#else
+# error unknown VDSO target
+#endif
+
+#define NUM_FAKE_SHDRS 13
+
+SECTIONS
+{
+ /*
+ * User/kernel shared data is before the vDSO. This may be a little
+ * uglier than putting it after the vDSO, but it avoids issues with
+ * non-allocatable things that dangle past the end of the PT_LOAD
+ * segment.
+ */
+
+ vvar_start = . - 3 * PAGE_SIZE;
+ vvar_page = vvar_start;
+
+ /* Place all vvars at the offsets in asm/vvar.h. */
+#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset;
+#define __VVAR_KERNEL_LDS
+#include <asm/vvar.h>
+#undef __VVAR_KERNEL_LDS
+#undef EMIT_VVAR
+
+ pvclock_page = vvar_start + PAGE_SIZE;
+ hvclock_page = vvar_start + 2 * PAGE_SIZE;
+
+ . = SIZEOF_HEADERS;
+
+ .hash : { *(.hash) } :text
+ .gnu.hash : { *(.gnu.hash) }
+ .dynsym : { *(.dynsym) }
+ .dynstr : { *(.dynstr) }
+ .gnu.version : { *(.gnu.version) }
+ .gnu.version_d : { *(.gnu.version_d) }
+ .gnu.version_r : { *(.gnu.version_r) }
+
+ .dynamic : { *(.dynamic) } :text :dynamic
+
+ .rodata : {
+ *(.rodata*)
+ *(.data*)
+ *(.sdata*)
+ *(.got.plt) *(.got)
+ *(.gnu.linkonce.d.*)
+ *(.bss*)
+ *(.dynbss*)
+ *(.gnu.linkonce.b.*)
+
+ /*
+ * Ideally this would live in a C file, but that won't
+ * work cleanly for x32 until we start building the x32
+ * C code using an x32 toolchain.
+ */
+ VDSO_FAKE_SECTION_TABLE_START = .;
+ . = . + NUM_FAKE_SHDRS * SHDR_SIZE;
+ VDSO_FAKE_SECTION_TABLE_END = .;
+ } :text
+
+ .fake_shstrtab : { *(.fake_shstrtab) } :text
+
+
+ .note : { *(.note.*) } :text :note
+
+ .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
+ .eh_frame : { KEEP (*(.eh_frame)) } :text
+
+
+ /*
+ * Text is well-separated from actual data: there's plenty of
+ * stuff that isn't used at runtime in between.
+ */
+
+ .text : { *(.text*) } :text =0x90909090,
+
+ /*
+ * At the end so that eu-elflint stays happy when vdso2c strips
+ * these. A better implementation would avoid allocating space
+ * for these.
+ */
+ .altinstructions : { *(.altinstructions) } :text
+ .altinstr_replacement : { *(.altinstr_replacement) } :text
+
+ /DISCARD/ : {
+ *(.discard)
+ *(.discard.*)
+ *(__bug_table)
+ }
+}
+
+/*
+ * Very old versions of ld do not recognize this name token; use the constant.
+ */
+#define PT_GNU_EH_FRAME 0x6474e550
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+ text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */
+ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+ note PT_NOTE FLAGS(4); /* PF_R */
+ eh_frame_hdr PT_GNU_EH_FRAME;
+}
diff --git a/arch/x86/entry/vdso/vdso-note.S b/arch/x86/entry/vdso/vdso-note.S
new file mode 100644
index 000000000..794231701
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso-note.S
@@ -0,0 +1,15 @@
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/build-salt.h>
+#include <linux/uts.h>
+#include <linux/version.h>
+#include <linux/elfnote.h>
+
+ELFNOTE_START(Linux, 0, "a")
+ .long LINUX_VERSION_CODE
+ELFNOTE_END
+
+BUILD_SALT
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
new file mode 100644
index 000000000..d3a2dce4c
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linker script for 64-bit vDSO.
+ * We #include the file to define the layout details.
+ *
+ * This file defines the version script giving the user-exported symbols in
+ * the DSO.
+ */
+
+#define BUILD_VDSO64
+
+#include "vdso-layout.lds.S"
+
+/*
+ * This controls what userland symbols we export from the vDSO.
+ */
+VERSION {
+ LINUX_2.6 {
+ global:
+ clock_gettime;
+ __vdso_clock_gettime;
+ gettimeofday;
+ __vdso_gettimeofday;
+ getcpu;
+ __vdso_getcpu;
+ time;
+ __vdso_time;
+ local: *;
+ };
+}
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
new file mode 100644
index 000000000..4674f5858
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -0,0 +1,260 @@
+/*
+ * vdso2c - A vdso image preparation tool
+ * Copyright (c) 2014 Andy Lutomirski and others
+ * Licensed under the GPL v2
+ *
+ * vdso2c requires stripped and unstripped input. It would be trivial
+ * to fully strip the input in here, but, for reasons described below,
+ * we need to write a section table. Doing this is more or less
+ * equivalent to dropping all non-allocatable sections, but it's
+ * easier to let objcopy handle that instead of doing it ourselves.
+ * If we ever need to do something fancier than what objcopy provides,
+ * it would be straightforward to add here.
+ *
+ * We're keep a section table for a few reasons:
+ *
+ * The Go runtime had a couple of bugs: it would read the section
+ * table to try to figure out how many dynamic symbols there were (it
+ * shouldn't have looked at the section table at all) and, if there
+ * were no SHT_SYNDYM section table entry, it would use an
+ * uninitialized value for the number of symbols. An empty DYNSYM
+ * table would work, but I see no reason not to write a valid one (and
+ * keep full performance for old Go programs). This hack is only
+ * needed on x86_64.
+ *
+ * The bug was introduced on 2012-08-31 by:
+ * https://code.google.com/p/go/source/detail?r=56ea40aac72b
+ * and was fixed on 2014-06-13 by:
+ * https://code.google.com/p/go/source/detail?r=fc1cd5e12595
+ *
+ * Binutils has issues debugging the vDSO: it reads the section table to
+ * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which
+ * would break build-id if we removed the section table. Binutils
+ * also requires that shstrndx != 0. See:
+ * https://sourceware.org/bugzilla/show_bug.cgi?id=17064
+ *
+ * elfutils might not look for PT_NOTE if there is a section table at
+ * all. I don't know whether this matters for any practical purpose.
+ *
+ * For simplicity, rather than hacking up a partial section table, we
+ * just write a mostly complete one. We omit non-dynamic symbols,
+ * though, since they're rather large.
+ *
+ * Once binutils gets fixed, we might be able to drop this for all but
+ * the 64-bit vdso, since build-id only works in kernel RPMs, and
+ * systems that update to new enough kernel RPMs will likely update
+ * binutils in sync. build-id has never worked for home-built kernel
+ * RPMs without manual symlinking, and I suspect that no one ever does
+ * that.
+ */
+
+#include <inttypes.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <fcntl.h>
+#include <err.h>
+
+#include <sys/mman.h>
+#include <sys/types.h>
+
+#include <tools/le_byteshift.h>
+
+#include <linux/elf.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+
+const char *outfilename;
+
+/* Symbols that we need in vdso2c. */
+enum {
+ sym_vvar_start,
+ sym_vvar_page,
+ sym_hpet_page,
+ sym_pvclock_page,
+ sym_hvclock_page,
+ sym_VDSO_FAKE_SECTION_TABLE_START,
+ sym_VDSO_FAKE_SECTION_TABLE_END,
+};
+
+const int special_pages[] = {
+ sym_vvar_page,
+ sym_hpet_page,
+ sym_pvclock_page,
+ sym_hvclock_page,
+};
+
+struct vdso_sym {
+ const char *name;
+ bool export;
+};
+
+struct vdso_sym required_syms[] = {
+ [sym_vvar_start] = {"vvar_start", true},
+ [sym_vvar_page] = {"vvar_page", true},
+ [sym_hpet_page] = {"hpet_page", true},
+ [sym_pvclock_page] = {"pvclock_page", true},
+ [sym_hvclock_page] = {"hvclock_page", true},
+ [sym_VDSO_FAKE_SECTION_TABLE_START] = {
+ "VDSO_FAKE_SECTION_TABLE_START", false
+ },
+ [sym_VDSO_FAKE_SECTION_TABLE_END] = {
+ "VDSO_FAKE_SECTION_TABLE_END", false
+ },
+ {"VDSO32_NOTE_MASK", true},
+ {"__kernel_vsyscall", true},
+ {"__kernel_sigreturn", true},
+ {"__kernel_rt_sigreturn", true},
+ {"int80_landing_pad", true},
+};
+
+__attribute__((format(printf, 1, 2))) __attribute__((noreturn))
+static void fail(const char *format, ...)
+{
+ va_list ap;
+ va_start(ap, format);
+ fprintf(stderr, "Error: ");
+ vfprintf(stderr, format, ap);
+ if (outfilename)
+ unlink(outfilename);
+ exit(1);
+ va_end(ap);
+}
+
+/*
+ * Evil macros for little-endian reads and writes
+ */
+#define GLE(x, bits, ifnot) \
+ __builtin_choose_expr( \
+ (sizeof(*(x)) == bits/8), \
+ (__typeof__(*(x)))get_unaligned_le##bits(x), ifnot)
+
+extern void bad_get_le(void);
+#define LAST_GLE(x) \
+ __builtin_choose_expr(sizeof(*(x)) == 1, *(x), bad_get_le())
+
+#define GET_LE(x) \
+ GLE(x, 64, GLE(x, 32, GLE(x, 16, LAST_GLE(x))))
+
+#define PLE(x, val, bits, ifnot) \
+ __builtin_choose_expr( \
+ (sizeof(*(x)) == bits/8), \
+ put_unaligned_le##bits((val), (x)), ifnot)
+
+extern void bad_put_le(void);
+#define LAST_PLE(x, val) \
+ __builtin_choose_expr(sizeof(*(x)) == 1, *(x) = (val), bad_put_le())
+
+#define PUT_LE(x, val) \
+ PLE(x, val, 64, PLE(x, val, 32, PLE(x, val, 16, LAST_PLE(x, val))))
+
+
+#define NSYMS ARRAY_SIZE(required_syms)
+
+#define BITSFUNC3(name, bits, suffix) name##bits##suffix
+#define BITSFUNC2(name, bits, suffix) BITSFUNC3(name, bits, suffix)
+#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS, )
+
+#define INT_BITS BITSFUNC2(int, ELF_BITS, _t)
+
+#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x
+#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x)
+#define ELF(x) ELF_BITS_XFORM(ELF_BITS, x)
+
+#define ELF_BITS 64
+#include "vdso2c.h"
+#undef ELF_BITS
+
+#define ELF_BITS 32
+#include "vdso2c.h"
+#undef ELF_BITS
+
+static void go(void *raw_addr, size_t raw_len,
+ void *stripped_addr, size_t stripped_len,
+ FILE *outfile, const char *name)
+{
+ Elf64_Ehdr *hdr = (Elf64_Ehdr *)raw_addr;
+
+ if (hdr->e_ident[EI_CLASS] == ELFCLASS64) {
+ go64(raw_addr, raw_len, stripped_addr, stripped_len,
+ outfile, name);
+ } else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) {
+ go32(raw_addr, raw_len, stripped_addr, stripped_len,
+ outfile, name);
+ } else {
+ fail("unknown ELF class\n");
+ }
+}
+
+static void map_input(const char *name, void **addr, size_t *len, int prot)
+{
+ off_t tmp_len;
+
+ int fd = open(name, O_RDONLY);
+ if (fd == -1)
+ err(1, "%s", name);
+
+ tmp_len = lseek(fd, 0, SEEK_END);
+ if (tmp_len == (off_t)-1)
+ err(1, "lseek");
+ *len = (size_t)tmp_len;
+
+ *addr = mmap(NULL, tmp_len, prot, MAP_PRIVATE, fd, 0);
+ if (*addr == MAP_FAILED)
+ err(1, "mmap");
+
+ close(fd);
+}
+
+int main(int argc, char **argv)
+{
+ size_t raw_len, stripped_len;
+ void *raw_addr, *stripped_addr;
+ FILE *outfile;
+ char *name, *tmp;
+ int namelen;
+
+ if (argc != 4) {
+ printf("Usage: vdso2c RAW_INPUT STRIPPED_INPUT OUTPUT\n");
+ return 1;
+ }
+
+ /*
+ * Figure out the struct name. If we're writing to a .so file,
+ * generate raw output insted.
+ */
+ name = strdup(argv[3]);
+ namelen = strlen(name);
+ if (namelen >= 3 && !strcmp(name + namelen - 3, ".so")) {
+ name = NULL;
+ } else {
+ tmp = strrchr(name, '/');
+ if (tmp)
+ name = tmp + 1;
+ tmp = strchr(name, '.');
+ if (tmp)
+ *tmp = '\0';
+ for (tmp = name; *tmp; tmp++)
+ if (*tmp == '-')
+ *tmp = '_';
+ }
+
+ map_input(argv[1], &raw_addr, &raw_len, PROT_READ);
+ map_input(argv[2], &stripped_addr, &stripped_len, PROT_READ);
+
+ outfilename = argv[3];
+ outfile = fopen(outfilename, "w");
+ if (!outfile)
+ err(1, "%s", argv[2]);
+
+ go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name);
+
+ munmap(raw_addr, raw_len);
+ munmap(stripped_addr, stripped_len);
+ fclose(outfile);
+
+ return 0;
+}
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
new file mode 100644
index 000000000..fa847a620
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -0,0 +1,175 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This file is included twice from vdso2c.c. It generates code for 32-bit
+ * and 64-bit vDSOs. We need both for 64-bit builds, since 32-bit vDSOs
+ * are built for 32-bit userspace.
+ */
+
+static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
+ void *stripped_addr, size_t stripped_len,
+ FILE *outfile, const char *name)
+{
+ int found_load = 0;
+ unsigned long load_size = -1; /* Work around bogus warning */
+ unsigned long mapping_size;
+ ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr;
+ int i;
+ unsigned long j;
+ ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr,
+ *alt_sec = NULL;
+ ELF(Dyn) *dyn = 0, *dyn_end = 0;
+ const char *secstrings;
+ INT_BITS syms[NSYMS] = {};
+
+ ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff));
+
+ if (GET_LE(&hdr->e_type) != ET_DYN)
+ fail("input is not a shared object\n");
+
+ /* Walk the segment table. */
+ for (i = 0; i < GET_LE(&hdr->e_phnum); i++) {
+ if (GET_LE(&pt[i].p_type) == PT_LOAD) {
+ if (found_load)
+ fail("multiple PT_LOAD segs\n");
+
+ if (GET_LE(&pt[i].p_offset) != 0 ||
+ GET_LE(&pt[i].p_vaddr) != 0)
+ fail("PT_LOAD in wrong place\n");
+
+ if (GET_LE(&pt[i].p_memsz) != GET_LE(&pt[i].p_filesz))
+ fail("cannot handle memsz != filesz\n");
+
+ load_size = GET_LE(&pt[i].p_memsz);
+ found_load = 1;
+ } else if (GET_LE(&pt[i].p_type) == PT_DYNAMIC) {
+ dyn = raw_addr + GET_LE(&pt[i].p_offset);
+ dyn_end = raw_addr + GET_LE(&pt[i].p_offset) +
+ GET_LE(&pt[i].p_memsz);
+ }
+ }
+ if (!found_load)
+ fail("no PT_LOAD seg\n");
+
+ if (stripped_len < load_size)
+ fail("stripped input is too short\n");
+
+ if (!dyn)
+ fail("input has no PT_DYNAMIC section -- your toolchain is buggy\n");
+
+ /* Walk the dynamic table */
+ for (i = 0; dyn + i < dyn_end &&
+ GET_LE(&dyn[i].d_tag) != DT_NULL; i++) {
+ typeof(dyn[i].d_tag) tag = GET_LE(&dyn[i].d_tag);
+ if (tag == DT_REL || tag == DT_RELSZ || tag == DT_RELA ||
+ tag == DT_RELENT || tag == DT_TEXTREL)
+ fail("vdso image contains dynamic relocations\n");
+ }
+
+ /* Walk the section table */
+ secstrings_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
+ GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx);
+ secstrings = raw_addr + GET_LE(&secstrings_hdr->sh_offset);
+ for (i = 0; i < GET_LE(&hdr->e_shnum); i++) {
+ ELF(Shdr) *sh = raw_addr + GET_LE(&hdr->e_shoff) +
+ GET_LE(&hdr->e_shentsize) * i;
+ if (GET_LE(&sh->sh_type) == SHT_SYMTAB)
+ symtab_hdr = sh;
+
+ if (!strcmp(secstrings + GET_LE(&sh->sh_name),
+ ".altinstructions"))
+ alt_sec = sh;
+ }
+
+ if (!symtab_hdr)
+ fail("no symbol table\n");
+
+ strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
+ GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link);
+
+ /* Walk the symbol table */
+ for (i = 0;
+ i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize);
+ i++) {
+ int k;
+ ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) +
+ GET_LE(&symtab_hdr->sh_entsize) * i;
+ const char *name = raw_addr + GET_LE(&strtab_hdr->sh_offset) +
+ GET_LE(&sym->st_name);
+
+ for (k = 0; k < NSYMS; k++) {
+ if (!strcmp(name, required_syms[k].name)) {
+ if (syms[k]) {
+ fail("duplicate symbol %s\n",
+ required_syms[k].name);
+ }
+
+ /*
+ * Careful: we use negative addresses, but
+ * st_value is unsigned, so we rely
+ * on syms[k] being a signed type of the
+ * correct width.
+ */
+ syms[k] = GET_LE(&sym->st_value);
+ }
+ }
+ }
+
+ /* Validate mapping addresses. */
+ for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) {
+ INT_BITS symval = syms[special_pages[i]];
+
+ if (!symval)
+ continue; /* The mapping isn't used; ignore it. */
+
+ if (symval % 4096)
+ fail("%s must be a multiple of 4096\n",
+ required_syms[i].name);
+ if (symval + 4096 < syms[sym_vvar_start])
+ fail("%s underruns vvar_start\n",
+ required_syms[i].name);
+ if (symval + 4096 > 0)
+ fail("%s is on the wrong side of the vdso text\n",
+ required_syms[i].name);
+ }
+ if (syms[sym_vvar_start] % 4096)
+ fail("vvar_begin must be a multiple of 4096\n");
+
+ if (!name) {
+ fwrite(stripped_addr, stripped_len, 1, outfile);
+ return;
+ }
+
+ mapping_size = (stripped_len + 4095) / 4096 * 4096;
+
+ fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n");
+ fprintf(outfile, "#include <linux/linkage.h>\n");
+ fprintf(outfile, "#include <asm/page_types.h>\n");
+ fprintf(outfile, "#include <asm/vdso.h>\n");
+ fprintf(outfile, "\n");
+ fprintf(outfile,
+ "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {",
+ mapping_size);
+ for (j = 0; j < stripped_len; j++) {
+ if (j % 10 == 0)
+ fprintf(outfile, "\n\t");
+ fprintf(outfile, "0x%02X, ",
+ (int)((unsigned char *)stripped_addr)[j]);
+ }
+ fprintf(outfile, "\n};\n\n");
+
+ fprintf(outfile, "const struct vdso_image %s = {\n", name);
+ fprintf(outfile, "\t.data = raw_data,\n");
+ fprintf(outfile, "\t.size = %lu,\n", mapping_size);
+ if (alt_sec) {
+ fprintf(outfile, "\t.alt = %lu,\n",
+ (unsigned long)GET_LE(&alt_sec->sh_offset));
+ fprintf(outfile, "\t.alt_len = %lu,\n",
+ (unsigned long)GET_LE(&alt_sec->sh_size));
+ }
+ for (i = 0; i < NSYMS; i++) {
+ if (required_syms[i].export && syms[i])
+ fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n",
+ required_syms[i].name, (int64_t)syms[i]);
+ }
+ fprintf(outfile, "};\n");
+}
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
new file mode 100644
index 000000000..ddff0ca6f
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * (C) Copyright 2002 Linus Torvalds
+ * Portions based on the vdso-randomization code from exec-shield:
+ * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
+ *
+ * This file contains the needed initializations to support sysenter.
+ */
+
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/kernel.h>
+#include <linux/mm_types.h>
+#include <linux/elf.h>
+
+#include <asm/processor.h>
+#include <asm/vdso.h>
+
+#ifdef CONFIG_COMPAT_VDSO
+#define VDSO_DEFAULT 0
+#else
+#define VDSO_DEFAULT 1
+#endif
+
+/*
+ * Should the kernel map a VDSO page into processes and pass its
+ * address down to glibc upon exec()?
+ */
+unsigned int __read_mostly vdso32_enabled = VDSO_DEFAULT;
+
+static int __init vdso32_setup(char *s)
+{
+ vdso32_enabled = simple_strtoul(s, NULL, 0);
+
+ if (vdso32_enabled > 1) {
+ pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n");
+ vdso32_enabled = 0;
+ }
+
+ return 1;
+}
+
+/*
+ * For consistency, the argument vdso32=[012] affects the 32-bit vDSO
+ * behavior on both 64-bit and 32-bit kernels.
+ * On 32-bit kernels, vdso=[012] means the same thing.
+ */
+__setup("vdso32=", vdso32_setup);
+
+#ifdef CONFIG_X86_32
+__setup_param("vdso=", vdso_setup, vdso32_setup, 0);
+#endif
+
+int __init sysenter_setup(void)
+{
+ init_vdso_image(&vdso_image_32);
+
+ return 0;
+}
+
+#ifdef CONFIG_X86_64
+
+subsys_initcall(sysenter_setup);
+
+#ifdef CONFIG_SYSCTL
+/* Register vsyscall32 into the ABI table */
+#include <linux/sysctl.h>
+
+static const int zero;
+static const int one = 1;
+
+static struct ctl_table abi_table2[] = {
+ {
+ .procname = "vsyscall32",
+ .data = &vdso32_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (int *)&zero,
+ .extra2 = (int *)&one,
+ },
+ {}
+};
+
+static struct ctl_table abi_root_table2[] = {
+ {
+ .procname = "abi",
+ .mode = 0555,
+ .child = abi_table2
+ },
+ {}
+};
+
+static __init int ia32_binfmt_init(void)
+{
+ register_sysctl_table(abi_root_table2);
+ return 0;
+}
+__initcall(ia32_binfmt_init);
+#endif /* CONFIG_SYSCTL */
+
+#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/entry/vdso/vdso32/.gitignore b/arch/x86/entry/vdso/vdso32/.gitignore
new file mode 100644
index 000000000..e45fba9d0
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/.gitignore
@@ -0,0 +1 @@
+vdso32.lds
diff --git a/arch/x86/entry/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S
new file mode 100644
index 000000000..e78047d11
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/note.S
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/build-salt.h>
+#include <linux/version.h>
+#include <linux/elfnote.h>
+
+/* Ideally this would use UTS_NAME, but using a quoted string here
+ doesn't work. Remember to change this when changing the
+ kernel's name. */
+ELFNOTE_START(Linux, 0, "a")
+ .long LINUX_VERSION_CODE
+ELFNOTE_END
+
+BUILD_SALT
+
+#ifdef CONFIG_XEN
+/*
+ * Add a special note telling glibc's dynamic linker a fake hardware
+ * flavor that it will use to choose the search path for libraries in the
+ * same way it uses real hardware capabilities like "mmx".
+ * We supply "nosegneg" as the fake capability, to indicate that we
+ * do not like negative offsets in instructions using segment overrides,
+ * since we implement those inefficiently. This makes it possible to
+ * install libraries optimized to avoid those access patterns in someplace
+ * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file
+ * corresponding to the bits here is needed to make ldconfig work right.
+ * It should contain:
+ * hwcap 1 nosegneg
+ * to match the mapping of bit to name that we give here.
+ *
+ * At runtime, the fake hardware feature will be considered to be present
+ * if its bit is set in the mask word. So, we start with the mask 0, and
+ * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen.
+ */
+
+#include "../../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */
+
+ELFNOTE_START(GNU, 2, "a")
+ .long 1 /* ncaps */
+VDSO32_NOTE_MASK: /* Symbol used by arch/x86/xen/setup.c */
+ .long 0 /* mask */
+ .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
+ELFNOTE_END
+#endif
diff --git a/arch/x86/entry/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S
new file mode 100644
index 000000000..c3233ee98
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/sigreturn.S
@@ -0,0 +1,138 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/unistd_32.h>
+#include <asm/asm-offsets.h>
+
+#ifndef SYSCALL_ENTER_KERNEL
+#define SYSCALL_ENTER_KERNEL int $0x80
+#endif
+
+ .text
+ .globl __kernel_sigreturn
+ .type __kernel_sigreturn,@function
+ nop /* this guy is needed for .LSTARTFDEDLSI1 below (watch for HACK) */
+ ALIGN
+__kernel_sigreturn:
+.LSTART_sigreturn:
+ popl %eax /* XXX does this mean it needs unwind info? */
+ movl $__NR_sigreturn, %eax
+ SYSCALL_ENTER_KERNEL
+.LEND_sigreturn:
+ nop
+ .size __kernel_sigreturn,.-.LSTART_sigreturn
+
+ .globl __kernel_rt_sigreturn
+ .type __kernel_rt_sigreturn,@function
+ ALIGN
+__kernel_rt_sigreturn:
+.LSTART_rt_sigreturn:
+ movl $__NR_rt_sigreturn, %eax
+ SYSCALL_ENTER_KERNEL
+.LEND_rt_sigreturn:
+ nop
+ .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
+ .previous
+
+ .section .eh_frame,"a",@progbits
+.LSTARTFRAMEDLSI1:
+ .long .LENDCIEDLSI1-.LSTARTCIEDLSI1
+.LSTARTCIEDLSI1:
+ .long 0 /* CIE ID */
+ .byte 1 /* Version number */
+ .string "zRS" /* NUL-terminated augmentation string */
+ .uleb128 1 /* Code alignment factor */
+ .sleb128 -4 /* Data alignment factor */
+ .byte 8 /* Return address register column */
+ .uleb128 1 /* Augmentation value length */
+ .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
+ .byte 0 /* DW_CFA_nop */
+ .align 4
+.LENDCIEDLSI1:
+ .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */
+.LSTARTFDEDLSI1:
+ .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */
+ /* HACK: The dwarf2 unwind routines will subtract 1 from the
+ return address to get an address in the middle of the
+ presumed call instruction. Since we didn't get here via
+ a call, we need to include the nop before the real start
+ to make up for it. */
+ .long .LSTART_sigreturn-1-. /* PC-relative start address */
+ .long .LEND_sigreturn-.LSTART_sigreturn+1
+ .uleb128 0 /* Augmentation */
+ /* What follows are the instructions for the table generation.
+ We record the locations of each register saved. This is
+ complicated by the fact that the "CFA" is always assumed to
+ be the value of the stack pointer in the caller. This means
+ that we must define the CFA of this body of code to be the
+ saved value of the stack pointer in the sigcontext. Which
+ also means that there is no fixed relation to the other
+ saved registers, which means that we must use DW_CFA_expression
+ to compute their addresses. It also means that when we
+ adjust the stack with the popl, we have to do it all over again. */
+
+#define do_cfa_expr(offset) \
+ .byte 0x0f; /* DW_CFA_def_cfa_expression */ \
+ .uleb128 1f-0f; /* length */ \
+0: .byte 0x74; /* DW_OP_breg4 */ \
+ .sleb128 offset; /* offset */ \
+ .byte 0x06; /* DW_OP_deref */ \
+1:
+
+#define do_expr(regno, offset) \
+ .byte 0x10; /* DW_CFA_expression */ \
+ .uleb128 regno; /* regno */ \
+ .uleb128 1f-0f; /* length */ \
+0: .byte 0x74; /* DW_OP_breg4 */ \
+ .sleb128 offset; /* offset */ \
+1:
+
+ do_cfa_expr(IA32_SIGCONTEXT_sp+4)
+ do_expr(0, IA32_SIGCONTEXT_ax+4)
+ do_expr(1, IA32_SIGCONTEXT_cx+4)
+ do_expr(2, IA32_SIGCONTEXT_dx+4)
+ do_expr(3, IA32_SIGCONTEXT_bx+4)
+ do_expr(5, IA32_SIGCONTEXT_bp+4)
+ do_expr(6, IA32_SIGCONTEXT_si+4)
+ do_expr(7, IA32_SIGCONTEXT_di+4)
+ do_expr(8, IA32_SIGCONTEXT_ip+4)
+
+ .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
+
+ do_cfa_expr(IA32_SIGCONTEXT_sp)
+ do_expr(0, IA32_SIGCONTEXT_ax)
+ do_expr(1, IA32_SIGCONTEXT_cx)
+ do_expr(2, IA32_SIGCONTEXT_dx)
+ do_expr(3, IA32_SIGCONTEXT_bx)
+ do_expr(5, IA32_SIGCONTEXT_bp)
+ do_expr(6, IA32_SIGCONTEXT_si)
+ do_expr(7, IA32_SIGCONTEXT_di)
+ do_expr(8, IA32_SIGCONTEXT_ip)
+
+ .align 4
+.LENDFDEDLSI1:
+
+ .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */
+.LSTARTFDEDLSI2:
+ .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */
+ /* HACK: See above wrt unwind library assumptions. */
+ .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */
+ .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
+ .uleb128 0 /* Augmentation */
+ /* What follows are the instructions for the table generation.
+ We record the locations of each register saved. This is
+ slightly less complicated than the above, since we don't
+ modify the stack pointer in the process. */
+
+ do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_sp)
+ do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ax)
+ do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_cx)
+ do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_dx)
+ do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bx)
+ do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bp)
+ do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_si)
+ do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_di)
+ do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ip)
+
+ .align 4
+.LENDFDEDLSI2:
+ .previous
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
new file mode 100644
index 000000000..263d7433d
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * AT_SYSINFO entry point
+*/
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative-asm.h>
+
+ .text
+ .globl __kernel_vsyscall
+ .type __kernel_vsyscall,@function
+ ALIGN
+__kernel_vsyscall:
+ CFI_STARTPROC
+ /*
+ * Reshuffle regs so that all of any of the entry instructions
+ * will preserve enough state.
+ *
+ * A really nice entry sequence would be:
+ * pushl %edx
+ * pushl %ecx
+ * movl %esp, %ecx
+ *
+ * Unfortunately, naughty Android versions between July and December
+ * 2015 actually hardcode the traditional Linux SYSENTER entry
+ * sequence. That is severely broken for a number of reasons (ask
+ * anyone with an AMD CPU, for example). Nonetheless, we try to keep
+ * it working approximately as well as it ever worked.
+ *
+ * This link may eludicate some of the history:
+ * https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7
+ * personally, I find it hard to understand what's going on there.
+ *
+ * Note to future user developers: DO NOT USE SYSENTER IN YOUR CODE.
+ * Execute an indirect call to the address in the AT_SYSINFO auxv
+ * entry. That is the ONLY correct way to make a fast 32-bit system
+ * call on Linux. (Open-coding int $0x80 is also fine, but it's
+ * slow.)
+ */
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
+ pushl %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx, 0
+ pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebp, 0
+
+ #define SYSENTER_SEQUENCE "movl %esp, %ebp; sysenter"
+ #define SYSCALL_SEQUENCE "movl %ecx, %ebp; syscall"
+
+#ifdef CONFIG_X86_64
+ /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */
+ ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSENTER32, \
+ SYSCALL_SEQUENCE, X86_FEATURE_SYSCALL32
+#else
+ ALTERNATIVE "", SYSENTER_SEQUENCE, X86_FEATURE_SEP
+#endif
+
+ /* Enter using int $0x80 */
+ int $0x80
+GLOBAL(int80_landing_pad)
+
+ /*
+ * Restore EDX and ECX in case they were clobbered. EBP is not
+ * clobbered (the kernel restores it), but it's cleaner and
+ * probably faster to pop it than to adjust ESP using addl.
+ */
+ popl %ebp
+ CFI_RESTORE ebp
+ CFI_ADJUST_CFA_OFFSET -4
+ popl %edx
+ CFI_RESTORE edx
+ CFI_ADJUST_CFA_OFFSET -4
+ popl %ecx
+ CFI_RESTORE ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ ret
+ CFI_ENDPROC
+
+ .size __kernel_vsyscall,.-__kernel_vsyscall
+ .previous
diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
new file mode 100644
index 000000000..9242b2841
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+#define BUILD_VDSO32
+
+#ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE
+#undef CONFIG_OPTIMIZE_INLINING
+#endif
+
+#ifdef CONFIG_X86_64
+
+/*
+ * in case of a 32 bit VDSO for a 64 bit kernel fake a 32 bit kernel
+ * configuration
+ */
+#undef CONFIG_64BIT
+#undef CONFIG_X86_64
+#undef CONFIG_PGTABLE_LEVELS
+#undef CONFIG_ILLEGAL_POINTER_VALUE
+#undef CONFIG_SPARSEMEM_VMEMMAP
+#undef CONFIG_NR_CPUS
+
+#define CONFIG_X86_32 1
+#define CONFIG_PGTABLE_LEVELS 2
+#define CONFIG_PAGE_OFFSET 0
+#define CONFIG_ILLEGAL_POINTER_VALUE 0
+#define CONFIG_NR_CPUS 1
+
+#define BUILD_VDSO32_64
+
+#endif
+
+#include "../vclock_gettime.c"
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
new file mode 100644
index 000000000..422764a81
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linker script for 32-bit vDSO.
+ * We #include the file to define the layout details.
+ *
+ * This file defines the version script giving the user-exported symbols in
+ * the DSO.
+ */
+
+#include <asm/page.h>
+
+#define BUILD_VDSO32
+
+#include "../vdso-layout.lds.S"
+
+/* The ELF entry point can be used to set the AT_SYSINFO value. */
+ENTRY(__kernel_vsyscall);
+
+/*
+ * This controls what userland symbols we export from the vDSO.
+ */
+VERSION
+{
+ LINUX_2.6 {
+ global:
+ __vdso_clock_gettime;
+ __vdso_gettimeofday;
+ __vdso_time;
+ };
+
+ LINUX_2.5 {
+ global:
+ __kernel_vsyscall;
+ __kernel_sigreturn;
+ __kernel_rt_sigreturn;
+ local: *;
+ };
+}
diff --git a/arch/x86/entry/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S
new file mode 100644
index 000000000..05cd1c5c4
--- /dev/null
+++ b/arch/x86/entry/vdso/vdsox32.lds.S
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linker script for x32 vDSO.
+ * We #include the file to define the layout details.
+ *
+ * This file defines the version script giving the user-exported symbols in
+ * the DSO.
+ */
+
+#define BUILD_VDSOX32
+
+#include "vdso-layout.lds.S"
+
+/*
+ * This controls what userland symbols we export from the vDSO.
+ */
+VERSION {
+ LINUX_2.6 {
+ global:
+ __vdso_clock_gettime;
+ __vdso_gettimeofday;
+ __vdso_getcpu;
+ __vdso_time;
+ local: *;
+ };
+}
diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c
new file mode 100644
index 000000000..8ec3d1f4c
--- /dev/null
+++ b/arch/x86/entry/vdso/vgetcpu.c
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2006 Andi Kleen, SUSE Labs.
+ * Subject to the GNU Public License, v.2
+ *
+ * Fast user context implementation of getcpu()
+ */
+
+#include <linux/kernel.h>
+#include <linux/getcpu.h>
+#include <linux/time.h>
+#include <asm/vgtod.h>
+
+notrace long
+__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+{
+ unsigned int p;
+
+ p = __getcpu();
+
+ if (cpu)
+ *cpu = p & VGETCPU_CPU_MASK;
+ if (node)
+ *node = p >> 12;
+ return 0;
+}
+
+long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
+ __attribute__((weak, alias("__vdso_getcpu")));
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
new file mode 100644
index 000000000..a1c31bb23
--- /dev/null
+++ b/arch/x86/entry/vdso/vma.c
@@ -0,0 +1,383 @@
+/*
+ * Copyright 2007 Andi Kleen, SUSE Labs.
+ * Subject to the GPL, v.2
+ *
+ * This contains most of the x86 vDSO kernel-side code.
+ */
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/random.h>
+#include <linux/elf.h>
+#include <linux/cpu.h>
+#include <linux/ptrace.h>
+#include <asm/pvclock.h>
+#include <asm/vgtod.h>
+#include <asm/proto.h>
+#include <asm/vdso.h>
+#include <asm/vvar.h>
+#include <asm/page.h>
+#include <asm/desc.h>
+#include <asm/cpufeature.h>
+#include <asm/mshyperv.h>
+
+#if defined(CONFIG_X86_64)
+unsigned int __read_mostly vdso64_enabled = 1;
+#endif
+
+void __init init_vdso_image(const struct vdso_image *image)
+{
+ BUG_ON(image->size % PAGE_SIZE != 0);
+
+ apply_alternatives((struct alt_instr *)(image->data + image->alt),
+ (struct alt_instr *)(image->data + image->alt +
+ image->alt_len));
+}
+
+struct linux_binprm;
+
+static int vdso_fault(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ const struct vdso_image *image = vma->vm_mm->context.vdso_image;
+
+ if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size)
+ return VM_FAULT_SIGBUS;
+
+ vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT));
+ get_page(vmf->page);
+ return 0;
+}
+
+static void vdso_fix_landing(const struct vdso_image *image,
+ struct vm_area_struct *new_vma)
+{
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+ if (in_ia32_syscall() && image == &vdso_image_32) {
+ struct pt_regs *regs = current_pt_regs();
+ unsigned long vdso_land = image->sym_int80_landing_pad;
+ unsigned long old_land_addr = vdso_land +
+ (unsigned long)current->mm->context.vdso;
+
+ /* Fixing userspace landing - look at do_fast_syscall_32 */
+ if (regs->ip == old_land_addr)
+ regs->ip = new_vma->vm_start + vdso_land;
+ }
+#endif
+}
+
+static int vdso_mremap(const struct vm_special_mapping *sm,
+ struct vm_area_struct *new_vma)
+{
+ unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
+ const struct vdso_image *image = current->mm->context.vdso_image;
+
+ if (image->size != new_size)
+ return -EINVAL;
+
+ vdso_fix_landing(image, new_vma);
+ current->mm->context.vdso = (void __user *)new_vma->vm_start;
+
+ return 0;
+}
+
+static int vvar_fault(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ const struct vdso_image *image = vma->vm_mm->context.vdso_image;
+ long sym_offset;
+ int ret = -EFAULT;
+
+ if (!image)
+ return VM_FAULT_SIGBUS;
+
+ sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) +
+ image->sym_vvar_start;
+
+ /*
+ * Sanity check: a symbol offset of zero means that the page
+ * does not exist for this vdso image, not that the page is at
+ * offset zero relative to the text mapping. This should be
+ * impossible here, because sym_offset should only be zero for
+ * the page past the end of the vvar mapping.
+ */
+ if (sym_offset == 0)
+ return VM_FAULT_SIGBUS;
+
+ if (sym_offset == image->sym_vvar_page) {
+ ret = vm_insert_pfn(vma, vmf->address,
+ __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
+ } else if (sym_offset == image->sym_pvclock_page) {
+ struct pvclock_vsyscall_time_info *pvti =
+ pvclock_get_pvti_cpu0_va();
+ if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
+ ret = vm_insert_pfn_prot(
+ vma,
+ vmf->address,
+ __pa(pvti) >> PAGE_SHIFT,
+ pgprot_decrypted(vma->vm_page_prot));
+ }
+ } else if (sym_offset == image->sym_hvclock_page) {
+ struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page();
+
+ if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK))
+ ret = vm_insert_pfn(vma, vmf->address,
+ vmalloc_to_pfn(tsc_pg));
+ }
+
+ if (ret == 0 || ret == -EBUSY)
+ return VM_FAULT_NOPAGE;
+
+ return VM_FAULT_SIGBUS;
+}
+
+static const struct vm_special_mapping vdso_mapping = {
+ .name = "[vdso]",
+ .fault = vdso_fault,
+ .mremap = vdso_mremap,
+};
+static const struct vm_special_mapping vvar_mapping = {
+ .name = "[vvar]",
+ .fault = vvar_fault,
+};
+
+/*
+ * Add vdso and vvar mappings to current process.
+ * @image - blob to map
+ * @addr - request a specific address (zero to map at free addr)
+ */
+static int map_vdso(const struct vdso_image *image, unsigned long addr)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long text_start;
+ int ret = 0;
+
+ if (down_write_killable(&mm->mmap_sem))
+ return -EINTR;
+
+ addr = get_unmapped_area(NULL, addr,
+ image->size - image->sym_vvar_start, 0, 0);
+ if (IS_ERR_VALUE(addr)) {
+ ret = addr;
+ goto up_fail;
+ }
+
+ text_start = addr - image->sym_vvar_start;
+
+ /*
+ * MAYWRITE to allow gdb to COW and set breakpoints
+ */
+ vma = _install_special_mapping(mm,
+ text_start,
+ image->size,
+ VM_READ|VM_EXEC|
+ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
+ &vdso_mapping);
+
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ goto up_fail;
+ }
+
+ vma = _install_special_mapping(mm,
+ addr,
+ -image->sym_vvar_start,
+ VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
+ VM_PFNMAP,
+ &vvar_mapping);
+
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ do_munmap(mm, text_start, image->size, NULL);
+ } else {
+ current->mm->context.vdso = (void __user *)text_start;
+ current->mm->context.vdso_image = image;
+ }
+
+up_fail:
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * Put the vdso above the (randomized) stack with another randomized
+ * offset. This way there is no hole in the middle of address space.
+ * To save memory make sure it is still in the same PTE as the stack
+ * top. This doesn't give that many random bits.
+ *
+ * Note that this algorithm is imperfect: the distribution of the vdso
+ * start address within a PMD is biased toward the end.
+ *
+ * Only used for the 64-bit and x32 vdsos.
+ */
+static unsigned long vdso_addr(unsigned long start, unsigned len)
+{
+ unsigned long addr, end;
+ unsigned offset;
+
+ /*
+ * Round up the start address. It can start out unaligned as a result
+ * of stack start randomization.
+ */
+ start = PAGE_ALIGN(start);
+
+ /* Round the lowest possible end address up to a PMD boundary. */
+ end = (start + len + PMD_SIZE - 1) & PMD_MASK;
+ if (end >= TASK_SIZE_MAX)
+ end = TASK_SIZE_MAX;
+ end -= len;
+
+ if (end > start) {
+ offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1);
+ addr = start + (offset << PAGE_SHIFT);
+ } else {
+ addr = start;
+ }
+
+ /*
+ * Forcibly align the final address in case we have a hardware
+ * issue that requires alignment for performance reasons.
+ */
+ addr = align_vdso_addr(addr);
+
+ return addr;
+}
+
+static int map_vdso_randomized(const struct vdso_image *image)
+{
+ unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start);
+
+ return map_vdso(image, addr);
+}
+#endif
+
+int map_vdso_once(const struct vdso_image *image, unsigned long addr)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+
+ down_write(&mm->mmap_sem);
+ /*
+ * Check if we have already mapped vdso blob - fail to prevent
+ * abusing from userspace install_speciall_mapping, which may
+ * not do accounting and rlimit right.
+ * We could search vma near context.vdso, but it's a slowpath,
+ * so let's explicitely check all VMAs to be completely sure.
+ */
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (vma_is_special_mapping(vma, &vdso_mapping) ||
+ vma_is_special_mapping(vma, &vvar_mapping)) {
+ up_write(&mm->mmap_sem);
+ return -EEXIST;
+ }
+ }
+ up_write(&mm->mmap_sem);
+
+ return map_vdso(image, addr);
+}
+
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+static int load_vdso32(void)
+{
+ if (vdso32_enabled != 1) /* Other values all mean "disabled" */
+ return 0;
+
+ return map_vdso(&vdso_image_32, 0);
+}
+#endif
+
+#ifdef CONFIG_X86_64
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+{
+ if (!vdso64_enabled)
+ return 0;
+
+ return map_vdso_randomized(&vdso_image_64);
+}
+
+#ifdef CONFIG_COMPAT
+int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
+ int uses_interp)
+{
+#ifdef CONFIG_X86_X32_ABI
+ if (test_thread_flag(TIF_X32)) {
+ if (!vdso64_enabled)
+ return 0;
+ return map_vdso_randomized(&vdso_image_x32);
+ }
+#endif
+#ifdef CONFIG_IA32_EMULATION
+ return load_vdso32();
+#else
+ return 0;
+#endif
+}
+#endif
+#else
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+{
+ return load_vdso32();
+}
+#endif
+
+#ifdef CONFIG_X86_64
+static __init int vdso_setup(char *s)
+{
+ vdso64_enabled = simple_strtoul(s, NULL, 0);
+ return 1;
+}
+__setup("vdso=", vdso_setup);
+#endif
+
+#ifdef CONFIG_X86_64
+static void vgetcpu_cpu_init(void *arg)
+{
+ int cpu = smp_processor_id();
+ struct desc_struct d = { };
+ unsigned long node = 0;
+#ifdef CONFIG_NUMA
+ node = cpu_to_node(cpu);
+#endif
+ if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID))
+ write_rdtscp_aux((node << 12) | cpu);
+
+ /*
+ * Store cpu number in limit so that it can be loaded
+ * quickly in user space in vgetcpu. (12 bits for the CPU
+ * and 8 bits for the node)
+ */
+ d.limit0 = cpu | ((node & 0xf) << 12);
+ d.limit1 = node >> 4;
+ d.type = 5; /* RO data, expand down, accessed */
+ d.dpl = 3; /* Visible to user code */
+ d.s = 1; /* Not a system segment */
+ d.p = 1; /* Present */
+ d.d = 1; /* 32-bit */
+
+ write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
+}
+
+static int vgetcpu_online(unsigned int cpu)
+{
+ return smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1);
+}
+
+static int __init init_vdso(void)
+{
+ init_vdso_image(&vdso_image_64);
+
+#ifdef CONFIG_X86_X32_ABI
+ init_vdso_image(&vdso_image_x32);
+#endif
+
+ /* notifier priority > KVM */
+ return cpuhp_setup_state(CPUHP_AP_X86_VDSO_VMA_ONLINE,
+ "x86/vdso/vma:online", vgetcpu_online, NULL);
+}
+subsys_initcall(init_vdso);
+#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/entry/vsyscall/Makefile b/arch/x86/entry/vsyscall/Makefile
new file mode 100644
index 000000000..a9f4856f6
--- /dev/null
+++ b/arch/x86/entry/vsyscall/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the x86 low level vsyscall code
+#
+obj-y := vsyscall_gtod.o
+
+obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o
+
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
new file mode 100644
index 000000000..82ed001e8
--- /dev/null
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -0,0 +1,375 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net>
+ *
+ * Based on the original implementation which is:
+ * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright 2003 Andi Kleen, SuSE Labs.
+ *
+ * Parts of the original code have been moved to arch/x86/vdso/vma.c
+ *
+ * This file implements vsyscall emulation. vsyscalls are a legacy ABI:
+ * Userspace can request certain kernel services by calling fixed
+ * addresses. This concept is problematic:
+ *
+ * - It interferes with ASLR.
+ * - It's awkward to write code that lives in kernel addresses but is
+ * callable by userspace at fixed addresses.
+ * - The whole concept is impossible for 32-bit compat userspace.
+ * - UML cannot easily virtualize a vsyscall.
+ *
+ * As of mid-2014, I believe that there is no new userspace code that
+ * will use a vsyscall if the vDSO is present. I hope that there will
+ * soon be no new userspace code that will ever use a vsyscall.
+ *
+ * The code in this file emulates vsyscalls when notified of a page
+ * fault to a vsyscall address.
+ */
+
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/sched/signal.h>
+#include <linux/mm_types.h>
+#include <linux/syscalls.h>
+#include <linux/ratelimit.h>
+
+#include <asm/vsyscall.h>
+#include <asm/unistd.h>
+#include <asm/fixmap.h>
+#include <asm/traps.h>
+#include <asm/paravirt.h>
+
+#define CREATE_TRACE_POINTS
+#include "vsyscall_trace.h"
+
+static enum { EMULATE, NONE } vsyscall_mode =
+#ifdef CONFIG_LEGACY_VSYSCALL_NONE
+ NONE;
+#else
+ EMULATE;
+#endif
+
+static int __init vsyscall_setup(char *str)
+{
+ if (str) {
+ if (!strcmp("emulate", str))
+ vsyscall_mode = EMULATE;
+ else if (!strcmp("none", str))
+ vsyscall_mode = NONE;
+ else
+ return -EINVAL;
+
+ return 0;
+ }
+
+ return -EINVAL;
+}
+early_param("vsyscall", vsyscall_setup);
+
+static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
+ const char *message)
+{
+ if (!show_unhandled_signals)
+ return;
+
+ printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+ level, current->comm, task_pid_nr(current),
+ message, regs->ip, regs->cs,
+ regs->sp, regs->ax, regs->si, regs->di);
+}
+
+static int addr_to_vsyscall_nr(unsigned long addr)
+{
+ int nr;
+
+ if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
+ return -EINVAL;
+
+ nr = (addr & 0xC00UL) >> 10;
+ if (nr >= 3)
+ return -EINVAL;
+
+ return nr;
+}
+
+static bool write_ok_or_segv(unsigned long ptr, size_t size)
+{
+ /*
+ * XXX: if access_ok, get_user, and put_user handled
+ * sig_on_uaccess_err, this could go away.
+ */
+
+ if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
+ siginfo_t info;
+ struct thread_struct *thread = &current->thread;
+
+ thread->error_code = 6; /* user fault, no page, write */
+ thread->cr2 = ptr;
+ thread->trap_nr = X86_TRAP_PF;
+
+ clear_siginfo(&info);
+ info.si_signo = SIGSEGV;
+ info.si_errno = 0;
+ info.si_code = SEGV_MAPERR;
+ info.si_addr = (void __user *)ptr;
+
+ force_sig_info(SIGSEGV, &info, current);
+ return false;
+ } else {
+ return true;
+ }
+}
+
+bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
+{
+ struct task_struct *tsk;
+ unsigned long caller;
+ int vsyscall_nr, syscall_nr, tmp;
+ int prev_sig_on_uaccess_err;
+ long ret;
+ unsigned long orig_dx;
+
+ /*
+ * No point in checking CS -- the only way to get here is a user mode
+ * trap to a high address, which means that we're in 64-bit user code.
+ */
+
+ WARN_ON_ONCE(address != regs->ip);
+
+ if (vsyscall_mode == NONE) {
+ warn_bad_vsyscall(KERN_INFO, regs,
+ "vsyscall attempted with vsyscall=none");
+ return false;
+ }
+
+ vsyscall_nr = addr_to_vsyscall_nr(address);
+
+ trace_emulate_vsyscall(vsyscall_nr);
+
+ if (vsyscall_nr < 0) {
+ warn_bad_vsyscall(KERN_WARNING, regs,
+ "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
+ goto sigsegv;
+ }
+
+ if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
+ warn_bad_vsyscall(KERN_WARNING, regs,
+ "vsyscall with bad stack (exploit attempt?)");
+ goto sigsegv;
+ }
+
+ tsk = current;
+
+ /*
+ * Check for access_ok violations and find the syscall nr.
+ *
+ * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
+ * 64-bit, so we don't need to special-case it here. For all the
+ * vsyscalls, NULL means "don't write anything" not "write it at
+ * address 0".
+ */
+ switch (vsyscall_nr) {
+ case 0:
+ if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
+ !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
+
+ syscall_nr = __NR_gettimeofday;
+ break;
+
+ case 1:
+ if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
+
+ syscall_nr = __NR_time;
+ break;
+
+ case 2:
+ if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+ !write_ok_or_segv(regs->si, sizeof(unsigned))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
+
+ syscall_nr = __NR_getcpu;
+ break;
+ }
+
+ /*
+ * Handle seccomp. regs->ip must be the original value.
+ * See seccomp_send_sigsys and Documentation/userspace-api/seccomp_filter.rst.
+ *
+ * We could optimize the seccomp disabled case, but performance
+ * here doesn't matter.
+ */
+ regs->orig_ax = syscall_nr;
+ regs->ax = -ENOSYS;
+ tmp = secure_computing(NULL);
+ if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
+ warn_bad_vsyscall(KERN_DEBUG, regs,
+ "seccomp tried to change syscall nr or ip");
+ do_exit(SIGSYS);
+ }
+ regs->orig_ax = -1;
+ if (tmp)
+ goto do_ret; /* skip requested */
+
+ /*
+ * With a real vsyscall, page faults cause SIGSEGV. We want to
+ * preserve that behavior to make writing exploits harder.
+ */
+ prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err;
+ current->thread.sig_on_uaccess_err = 1;
+
+ ret = -EFAULT;
+ switch (vsyscall_nr) {
+ case 0:
+ /* this decodes regs->di and regs->si on its own */
+ ret = __x64_sys_gettimeofday(regs);
+ break;
+
+ case 1:
+ /* this decodes regs->di on its own */
+ ret = __x64_sys_time(regs);
+ break;
+
+ case 2:
+ /* while we could clobber regs->dx, we didn't in the past... */
+ orig_dx = regs->dx;
+ regs->dx = 0;
+ /* this decodes regs->di, regs->si and regs->dx on its own */
+ ret = __x64_sys_getcpu(regs);
+ regs->dx = orig_dx;
+ break;
+ }
+
+ current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err;
+
+check_fault:
+ if (ret == -EFAULT) {
+ /* Bad news -- userspace fed a bad pointer to a vsyscall. */
+ warn_bad_vsyscall(KERN_INFO, regs,
+ "vsyscall fault (exploit attempt?)");
+
+ /*
+ * If we failed to generate a signal for any reason,
+ * generate one here. (This should be impossible.)
+ */
+ if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
+ !sigismember(&tsk->pending.signal, SIGSEGV)))
+ goto sigsegv;
+
+ return true; /* Don't emulate the ret. */
+ }
+
+ regs->ax = ret;
+
+do_ret:
+ /* Emulate a ret instruction. */
+ regs->ip = caller;
+ regs->sp += 8;
+ return true;
+
+sigsegv:
+ force_sig(SIGSEGV, current);
+ return true;
+}
+
+/*
+ * A pseudo VMA to allow ptrace access for the vsyscall page. This only
+ * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
+ * not need special handling anymore:
+ */
+static const char *gate_vma_name(struct vm_area_struct *vma)
+{
+ return "[vsyscall]";
+}
+static const struct vm_operations_struct gate_vma_ops = {
+ .name = gate_vma_name,
+};
+static struct vm_area_struct gate_vma = {
+ .vm_start = VSYSCALL_ADDR,
+ .vm_end = VSYSCALL_ADDR + PAGE_SIZE,
+ .vm_page_prot = PAGE_READONLY_EXEC,
+ .vm_flags = VM_READ | VM_EXEC,
+ .vm_ops = &gate_vma_ops,
+};
+
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
+{
+#ifdef CONFIG_COMPAT
+ if (!mm || mm->context.ia32_compat)
+ return NULL;
+#endif
+ if (vsyscall_mode == NONE)
+ return NULL;
+ return &gate_vma;
+}
+
+int in_gate_area(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma = get_gate_vma(mm);
+
+ if (!vma)
+ return 0;
+
+ return (addr >= vma->vm_start) && (addr < vma->vm_end);
+}
+
+/*
+ * Use this when you have no reliable mm, typically from interrupt
+ * context. It is less reliable than using a task's mm and may give
+ * false positives.
+ */
+int in_gate_area_no_mm(unsigned long addr)
+{
+ return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
+}
+
+/*
+ * The VSYSCALL page is the only user-accessible page in the kernel address
+ * range. Normally, the kernel page tables can have _PAGE_USER clear, but
+ * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls
+ * are enabled.
+ *
+ * Some day we may create a "minimal" vsyscall mode in which we emulate
+ * vsyscalls but leave the page not present. If so, we skip calling
+ * this.
+ */
+void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
+ set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
+ p4d = p4d_offset(pgd, VSYSCALL_ADDR);
+#if CONFIG_PGTABLE_LEVELS >= 5
+ set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER));
+#endif
+ pud = pud_offset(p4d, VSYSCALL_ADDR);
+ set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
+ pmd = pmd_offset(pud, VSYSCALL_ADDR);
+ set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));
+}
+
+void __init map_vsyscall(void)
+{
+ extern char __vsyscall_page;
+ unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
+
+ if (vsyscall_mode != NONE) {
+ __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
+ PAGE_KERNEL_VVAR);
+ set_vsyscall_pgtable_user_bits(swapper_pg_dir);
+ }
+
+ BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
+ (unsigned long)VSYSCALL_ADDR);
+}
diff --git a/arch/x86/entry/vsyscall/vsyscall_emu_64.S b/arch/x86/entry/vsyscall/vsyscall_emu_64.S
new file mode 100644
index 000000000..c9596a9af
--- /dev/null
+++ b/arch/x86/entry/vsyscall/vsyscall_emu_64.S
@@ -0,0 +1,37 @@
+/*
+ * vsyscall_emu_64.S: Vsyscall emulation page
+ *
+ * Copyright (c) 2011 Andy Lutomirski
+ *
+ * Subject to the GNU General Public License, version 2
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/irq_vectors.h>
+#include <asm/page_types.h>
+#include <asm/unistd_64.h>
+
+__PAGE_ALIGNED_DATA
+ .globl __vsyscall_page
+ .balign PAGE_SIZE, 0xcc
+ .type __vsyscall_page, @object
+__vsyscall_page:
+
+ mov $__NR_gettimeofday, %rax
+ syscall
+ ret
+
+ .balign 1024, 0xcc
+ mov $__NR_time, %rax
+ syscall
+ ret
+
+ .balign 1024, 0xcc
+ mov $__NR_getcpu, %rax
+ syscall
+ ret
+
+ .balign 4096, 0xcc
+
+ .size __vsyscall_page, 4096
diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c
new file mode 100644
index 000000000..e1216dd95
--- /dev/null
+++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright 2003 Andi Kleen, SuSE Labs.
+ *
+ * Modified for x86 32 bit architecture by
+ * Stefani Seibold <stefani@seibold.net>
+ * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany
+ *
+ * Thanks to hpa@transmeta.com for some useful hint.
+ * Special thanks to Ingo Molnar for his early experience with
+ * a different vsyscall implementation for Linux/IA32 and for the name.
+ *
+ */
+
+#include <linux/timekeeper_internal.h>
+#include <asm/vgtod.h>
+#include <asm/vvar.h>
+
+int vclocks_used __read_mostly;
+
+DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
+
+void update_vsyscall_tz(void)
+{
+ vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest;
+ vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime;
+}
+
+void update_vsyscall(struct timekeeper *tk)
+{
+ int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
+ struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
+
+ /* Mark the new vclock used. */
+ BUILD_BUG_ON(VCLOCK_MAX >= 32);
+ WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode));
+
+ gtod_write_begin(vdata);
+
+ /* copy vsyscall data */
+ vdata->vclock_mode = vclock_mode;
+ vdata->cycle_last = tk->tkr_mono.cycle_last;
+ vdata->mask = tk->tkr_mono.mask;
+ vdata->mult = tk->tkr_mono.mult;
+ vdata->shift = tk->tkr_mono.shift;
+
+ vdata->wall_time_sec = tk->xtime_sec;
+ vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec;
+
+ vdata->monotonic_time_sec = tk->xtime_sec
+ + tk->wall_to_monotonic.tv_sec;
+ vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec
+ + ((u64)tk->wall_to_monotonic.tv_nsec
+ << tk->tkr_mono.shift);
+ while (vdata->monotonic_time_snsec >=
+ (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
+ vdata->monotonic_time_snsec -=
+ ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
+ vdata->monotonic_time_sec++;
+ }
+
+ vdata->wall_time_coarse_sec = tk->xtime_sec;
+ vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >>
+ tk->tkr_mono.shift);
+
+ vdata->monotonic_time_coarse_sec =
+ vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
+ vdata->monotonic_time_coarse_nsec =
+ vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec;
+
+ while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) {
+ vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC;
+ vdata->monotonic_time_coarse_sec++;
+ }
+
+ gtod_write_end(vdata);
+}
diff --git a/arch/x86/entry/vsyscall/vsyscall_trace.h b/arch/x86/entry/vsyscall/vsyscall_trace.h
new file mode 100644
index 000000000..3c3f9765a
--- /dev/null
+++ b/arch/x86/entry/vsyscall/vsyscall_trace.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM vsyscall
+
+#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __VSYSCALL_TRACE_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(emulate_vsyscall,
+
+ TP_PROTO(int nr),
+
+ TP_ARGS(nr),
+
+ TP_STRUCT__entry(__field(int, nr)),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ ),
+
+ TP_printk("nr = %d", __entry->nr)
+);
+
+#endif
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/
+#define TRACE_INCLUDE_FILE vsyscall_trace
+#include <trace/define_trace.h>
diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig
new file mode 100644
index 000000000..9a7a1446c
--- /dev/null
+++ b/arch/x86/events/Kconfig
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: GPL-2.0
+menu "Performance monitoring"
+
+config PERF_EVENTS_INTEL_UNCORE
+ tristate "Intel uncore performance events"
+ depends on PERF_EVENTS && CPU_SUP_INTEL && PCI
+ default y
+ ---help---
+ Include support for Intel uncore performance events. These are
+ available on NehalemEX and more modern processors.
+
+config PERF_EVENTS_INTEL_RAPL
+ tristate "Intel rapl performance events"
+ depends on PERF_EVENTS && CPU_SUP_INTEL && PCI
+ default y
+ ---help---
+ Include support for Intel rapl performance events for power
+ monitoring on modern processors.
+
+config PERF_EVENTS_INTEL_CSTATE
+ tristate "Intel cstate performance events"
+ depends on PERF_EVENTS && CPU_SUP_INTEL && PCI
+ default y
+ ---help---
+ Include support for Intel cstate performance events for power
+ monitoring on modern processors.
+
+config PERF_EVENTS_AMD_POWER
+ depends on PERF_EVENTS && CPU_SUP_AMD
+ tristate "AMD Processor Power Reporting Mechanism"
+ ---help---
+ Provide power reporting mechanism support for AMD processors.
+ Currently, it leverages X86_FEATURE_ACC_POWER
+ (CPUID Fn8000_0007_EDX[12]) interface to calculate the
+ average power consumption on Family 15h processors.
+
+endmenu
diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile
new file mode 100644
index 000000000..b8ccdb5c9
--- /dev/null
+++ b/arch/x86/events/Makefile
@@ -0,0 +1,4 @@
+obj-y += core.o
+obj-y += amd/
+obj-$(CONFIG_X86_LOCAL_APIC) += msr.o
+obj-$(CONFIG_CPU_SUP_INTEL) += intel/
diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile
new file mode 100644
index 000000000..fe8795a67
--- /dev/null
+++ b/arch/x86/events/amd/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_CPU_SUP_AMD) += core.o uncore.o
+obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o
+obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o
+ifdef CONFIG_AMD_IOMMU
+obj-$(CONFIG_CPU_SUP_AMD) += iommu.o
+endif
+
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
new file mode 100644
index 000000000..c3ec535fd
--- /dev/null
+++ b/arch/x86/events/amd/core.c
@@ -0,0 +1,1009 @@
+#include <linux/perf_event.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/jiffies.h>
+#include <asm/apicdef.h>
+#include <asm/nmi.h>
+
+#include "../perf_event.h"
+
+static DEFINE_PER_CPU(unsigned long, perf_nmi_tstamp);
+static unsigned long perf_nmi_window;
+
+static __initconst const u64 amd_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
+ [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
+ [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
+ [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
+ [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
+ [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
+ [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
+ [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
+ [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static __initconst const u64 amd_hw_cache_event_ids_f17h
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+[C(L1D)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0040, /* Data Cache Accesses */
+ [C(RESULT_MISS)] = 0xc860, /* L2$ access from DC Miss */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0xff5a, /* h/w prefetch DC Fills */
+ [C(RESULT_MISS)] = 0,
+ },
+},
+[C(L1I)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0080, /* Instruction cache fetches */
+ [C(RESULT_MISS)] = 0x0081, /* Instruction cache misses */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+},
+[C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+},
+[C(DTLB)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0xff45, /* All L2 DTLB accesses */
+ [C(RESULT_MISS)] = 0xf045, /* L2 DTLB misses (PT walks) */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+},
+[C(ITLB)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0084, /* L1 ITLB misses, L2 ITLB hits */
+ [C(RESULT_MISS)] = 0xff85, /* L1 ITLB misses, L2 misses */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+},
+[C(BPU)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x00c2, /* Retired Branch Instr. */
+ [C(RESULT_MISS)] = 0x00c3, /* Retired Mispredicted BI */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+},
+[C(NODE)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+},
+};
+
+/*
+ * AMD Performance Monitor K7 and later, up to and including Family 16h:
+ */
+static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x077e,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */
+ [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */
+};
+
+/*
+ * AMD Performance Monitor Family 17h and later:
+ */
+static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0964,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x0287,
+ [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x0187,
+};
+
+static u64 amd_pmu_event_map(int hw_event)
+{
+ if (boot_cpu_data.x86 >= 0x17)
+ return amd_f17h_perfmon_event_map[hw_event];
+
+ return amd_perfmon_event_map[hw_event];
+}
+
+/*
+ * Previously calculated offsets
+ */
+static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly;
+static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly;
+
+/*
+ * Legacy CPUs:
+ * 4 counters starting at 0xc0010000 each offset by 1
+ *
+ * CPUs with core performance counter extensions:
+ * 6 counters starting at 0xc0010200 each offset by 2
+ */
+static inline int amd_pmu_addr_offset(int index, bool eventsel)
+{
+ int offset;
+
+ if (!index)
+ return index;
+
+ if (eventsel)
+ offset = event_offsets[index];
+ else
+ offset = count_offsets[index];
+
+ if (offset)
+ return offset;
+
+ if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
+ offset = index;
+ else
+ offset = index << 1;
+
+ if (eventsel)
+ event_offsets[index] = offset;
+ else
+ count_offsets[index] = offset;
+
+ return offset;
+}
+
+static int amd_core_hw_config(struct perf_event *event)
+{
+ if (event->attr.exclude_host && event->attr.exclude_guest)
+ /*
+ * When HO == GO == 1 the hardware treats that as GO == HO == 0
+ * and will count in both modes. We don't want to count in that
+ * case so we emulate no-counting by setting US = OS = 0.
+ */
+ event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
+ ARCH_PERFMON_EVENTSEL_OS);
+ else if (event->attr.exclude_host)
+ event->hw.config |= AMD64_EVENTSEL_GUESTONLY;
+ else if (event->attr.exclude_guest)
+ event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
+
+ return 0;
+}
+
+/*
+ * AMD64 events are detected based on their event codes.
+ */
+static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
+{
+ return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
+}
+
+static inline int amd_is_nb_event(struct hw_perf_event *hwc)
+{
+ return (hwc->config & 0xe0) == 0xe0;
+}
+
+static inline int amd_has_nb(struct cpu_hw_events *cpuc)
+{
+ struct amd_nb *nb = cpuc->amd_nb;
+
+ return nb && nb->nb_id != -1;
+}
+
+static int amd_pmu_hw_config(struct perf_event *event)
+{
+ int ret;
+
+ /* pass precise event sampling to ibs: */
+ if (event->attr.precise_ip && get_ibs_caps())
+ return -ENOENT;
+
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ ret = x86_pmu_hw_config(event);
+ if (ret)
+ return ret;
+
+ if (event->attr.type == PERF_TYPE_RAW)
+ event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
+
+ return amd_core_hw_config(event);
+}
+
+static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct amd_nb *nb = cpuc->amd_nb;
+ int i;
+
+ /*
+ * need to scan whole list because event may not have
+ * been assigned during scheduling
+ *
+ * no race condition possible because event can only
+ * be removed on one CPU at a time AND PMU is disabled
+ * when we come here
+ */
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ if (cmpxchg(nb->owners + i, event, NULL) == event)
+ break;
+ }
+}
+
+ /*
+ * AMD64 NorthBridge events need special treatment because
+ * counter access needs to be synchronized across all cores
+ * of a package. Refer to BKDG section 3.12
+ *
+ * NB events are events measuring L3 cache, Hypertransport
+ * traffic. They are identified by an event code >= 0xe00.
+ * They measure events on the NorthBride which is shared
+ * by all cores on a package. NB events are counted on a
+ * shared set of counters. When a NB event is programmed
+ * in a counter, the data actually comes from a shared
+ * counter. Thus, access to those counters needs to be
+ * synchronized.
+ *
+ * We implement the synchronization such that no two cores
+ * can be measuring NB events using the same counters. Thus,
+ * we maintain a per-NB allocation table. The available slot
+ * is propagated using the event_constraint structure.
+ *
+ * We provide only one choice for each NB event based on
+ * the fact that only NB events have restrictions. Consequently,
+ * if a counter is available, there is a guarantee the NB event
+ * will be assigned to it. If no slot is available, an empty
+ * constraint is returned and scheduling will eventually fail
+ * for this event.
+ *
+ * Note that all cores attached the same NB compete for the same
+ * counters to host NB events, this is why we use atomic ops. Some
+ * multi-chip CPUs may have more than one NB.
+ *
+ * Given that resources are allocated (cmpxchg), they must be
+ * eventually freed for others to use. This is accomplished by
+ * calling __amd_put_nb_event_constraints()
+ *
+ * Non NB events are not impacted by this restriction.
+ */
+static struct event_constraint *
+__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+ struct event_constraint *c)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct amd_nb *nb = cpuc->amd_nb;
+ struct perf_event *old;
+ int idx, new = -1;
+
+ if (!c)
+ c = &unconstrained;
+
+ if (cpuc->is_fake)
+ return c;
+
+ /*
+ * detect if already present, if so reuse
+ *
+ * cannot merge with actual allocation
+ * because of possible holes
+ *
+ * event can already be present yet not assigned (in hwc->idx)
+ * because of successive calls to x86_schedule_events() from
+ * hw_perf_group_sched_in() without hw_perf_enable()
+ */
+ for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) {
+ if (new == -1 || hwc->idx == idx)
+ /* assign free slot, prefer hwc->idx */
+ old = cmpxchg(nb->owners + idx, NULL, event);
+ else if (nb->owners[idx] == event)
+ /* event already present */
+ old = event;
+ else
+ continue;
+
+ if (old && old != event)
+ continue;
+
+ /* reassign to this slot */
+ if (new != -1)
+ cmpxchg(nb->owners + new, event, NULL);
+ new = idx;
+
+ /* already present, reuse */
+ if (old == event)
+ break;
+ }
+
+ if (new == -1)
+ return &emptyconstraint;
+
+ return &nb->event_constraints[new];
+}
+
+static struct amd_nb *amd_alloc_nb(int cpu)
+{
+ struct amd_nb *nb;
+ int i;
+
+ nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu));
+ if (!nb)
+ return NULL;
+
+ nb->nb_id = -1;
+
+ /*
+ * initialize all possible NB constraints
+ */
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ __set_bit(i, nb->event_constraints[i].idxmsk);
+ nb->event_constraints[i].weight = 1;
+ }
+ return nb;
+}
+
+static int amd_pmu_cpu_prepare(int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+ WARN_ON_ONCE(cpuc->amd_nb);
+
+ if (!x86_pmu.amd_nb_constraints)
+ return 0;
+
+ cpuc->amd_nb = amd_alloc_nb(cpu);
+ if (!cpuc->amd_nb)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void amd_pmu_cpu_starting(int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
+ struct amd_nb *nb;
+ int i, nb_id;
+
+ cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
+
+ if (!x86_pmu.amd_nb_constraints)
+ return;
+
+ nb_id = amd_get_nb_id(cpu);
+ WARN_ON_ONCE(nb_id == BAD_APICID);
+
+ for_each_online_cpu(i) {
+ nb = per_cpu(cpu_hw_events, i).amd_nb;
+ if (WARN_ON_ONCE(!nb))
+ continue;
+
+ if (nb->nb_id == nb_id) {
+ *onln = cpuc->amd_nb;
+ cpuc->amd_nb = nb;
+ break;
+ }
+ }
+
+ cpuc->amd_nb->nb_id = nb_id;
+ cpuc->amd_nb->refcnt++;
+}
+
+static void amd_pmu_cpu_dead(int cpu)
+{
+ struct cpu_hw_events *cpuhw;
+
+ if (!x86_pmu.amd_nb_constraints)
+ return;
+
+ cpuhw = &per_cpu(cpu_hw_events, cpu);
+
+ if (cpuhw->amd_nb) {
+ struct amd_nb *nb = cpuhw->amd_nb;
+
+ if (nb->nb_id == -1 || --nb->refcnt == 0)
+ kfree(nb);
+
+ cpuhw->amd_nb = NULL;
+ }
+}
+
+/*
+ * When a PMC counter overflows, an NMI is used to process the event and
+ * reset the counter. NMI latency can result in the counter being updated
+ * before the NMI can run, which can result in what appear to be spurious
+ * NMIs. This function is intended to wait for the NMI to run and reset
+ * the counter to avoid possible unhandled NMI messages.
+ */
+#define OVERFLOW_WAIT_COUNT 50
+
+static void amd_pmu_wait_on_overflow(int idx)
+{
+ unsigned int i;
+ u64 counter;
+
+ /*
+ * Wait for the counter to be reset if it has overflowed. This loop
+ * should exit very, very quickly, but just in case, don't wait
+ * forever...
+ */
+ for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) {
+ rdmsrl(x86_pmu_event_addr(idx), counter);
+ if (counter & (1ULL << (x86_pmu.cntval_bits - 1)))
+ break;
+
+ /* Might be in IRQ context, so can't sleep */
+ udelay(1);
+ }
+}
+
+static void amd_pmu_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int idx;
+
+ x86_pmu_disable_all();
+
+ /*
+ * This shouldn't be called from NMI context, but add a safeguard here
+ * to return, since if we're in NMI context we can't wait for an NMI
+ * to reset an overflowed counter value.
+ */
+ if (in_nmi())
+ return;
+
+ /*
+ * Check each counter for overflow and wait for it to be reset by the
+ * NMI if it has overflowed. This relies on the fact that all active
+ * counters are always enabled when this function is caled and
+ * ARCH_PERFMON_EVENTSEL_INT is always set.
+ */
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ amd_pmu_wait_on_overflow(idx);
+ }
+}
+
+static void amd_pmu_disable_event(struct perf_event *event)
+{
+ x86_pmu_disable_event(event);
+
+ /*
+ * This can be called from NMI context (via x86_pmu_stop). The counter
+ * may have overflowed, but either way, we'll never see it get reset
+ * by the NMI if we're already in the NMI. And the NMI latency support
+ * below will take care of any pending NMI that might have been
+ * generated by the overflow.
+ */
+ if (in_nmi())
+ return;
+
+ amd_pmu_wait_on_overflow(event->hw.idx);
+}
+
+/*
+ * Because of NMI latency, if multiple PMC counters are active or other sources
+ * of NMIs are received, the perf NMI handler can handle one or more overflowed
+ * PMC counters outside of the NMI associated with the PMC overflow. If the NMI
+ * doesn't arrive at the LAPIC in time to become a pending NMI, then the kernel
+ * back-to-back NMI support won't be active. This PMC handler needs to take into
+ * account that this can occur, otherwise this could result in unknown NMI
+ * messages being issued. Examples of this is PMC overflow while in the NMI
+ * handler when multiple PMCs are active or PMC overflow while handling some
+ * other source of an NMI.
+ *
+ * Attempt to mitigate this by creating an NMI window in which un-handled NMIs
+ * received during this window will be claimed. This prevents extending the
+ * window past when it is possible that latent NMIs should be received. The
+ * per-CPU perf_nmi_tstamp will be set to the window end time whenever perf has
+ * handled a counter. When an un-handled NMI is received, it will be claimed
+ * only if arriving within that window.
+ */
+static int amd_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int active, handled;
+
+ /*
+ * Obtain the active count before calling x86_pmu_handle_irq() since
+ * it is possible that x86_pmu_handle_irq() may make a counter
+ * inactive (through x86_pmu_stop).
+ */
+ active = __bitmap_weight(cpuc->active_mask, X86_PMC_IDX_MAX);
+
+ /* Process any counter overflows */
+ handled = x86_pmu_handle_irq(regs);
+
+ /*
+ * If a counter was handled, record a timestamp such that un-handled
+ * NMIs will be claimed if arriving within that window.
+ */
+ if (handled) {
+ this_cpu_write(perf_nmi_tstamp,
+ jiffies + perf_nmi_window);
+
+ return handled;
+ }
+
+ if (time_after(jiffies, this_cpu_read(perf_nmi_tstamp)))
+ return NMI_DONE;
+
+ return NMI_HANDLED;
+}
+
+static struct event_constraint *
+amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ /*
+ * if not NB event or no NB, then no constraints
+ */
+ if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
+ return &unconstrained;
+
+ return __amd_get_nb_event_constraints(cpuc, event, NULL);
+}
+
+static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))
+ __amd_put_nb_event_constraints(cpuc, event);
+}
+
+PMU_FORMAT_ATTR(event, "config:0-7,32-35");
+PMU_FORMAT_ATTR(umask, "config:8-15" );
+PMU_FORMAT_ATTR(edge, "config:18" );
+PMU_FORMAT_ATTR(inv, "config:23" );
+PMU_FORMAT_ATTR(cmask, "config:24-31" );
+
+static struct attribute *amd_format_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+ NULL,
+};
+
+/* AMD Family 15h */
+
+#define AMD_EVENT_TYPE_MASK 0x000000F0ULL
+
+#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL
+#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL
+#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL
+#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL
+#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL
+#define AMD_EVENT_EX_LS 0x000000C0ULL
+#define AMD_EVENT_DE 0x000000D0ULL
+#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL
+
+/*
+ * AMD family 15h event code/PMC mappings:
+ *
+ * type = event_code & 0x0F0:
+ *
+ * 0x000 FP PERF_CTL[5:3]
+ * 0x010 FP PERF_CTL[5:3]
+ * 0x020 LS PERF_CTL[5:0]
+ * 0x030 LS PERF_CTL[5:0]
+ * 0x040 DC PERF_CTL[5:0]
+ * 0x050 DC PERF_CTL[5:0]
+ * 0x060 CU PERF_CTL[2:0]
+ * 0x070 CU PERF_CTL[2:0]
+ * 0x080 IC/DE PERF_CTL[2:0]
+ * 0x090 IC/DE PERF_CTL[2:0]
+ * 0x0A0 ---
+ * 0x0B0 ---
+ * 0x0C0 EX/LS PERF_CTL[5:0]
+ * 0x0D0 DE PERF_CTL[2:0]
+ * 0x0E0 NB NB_PERF_CTL[3:0]
+ * 0x0F0 NB NB_PERF_CTL[3:0]
+ *
+ * Exceptions:
+ *
+ * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*)
+ * 0x003 FP PERF_CTL[3]
+ * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*)
+ * 0x00B FP PERF_CTL[3]
+ * 0x00D FP PERF_CTL[3]
+ * 0x023 DE PERF_CTL[2:0]
+ * 0x02D LS PERF_CTL[3]
+ * 0x02E LS PERF_CTL[3,0]
+ * 0x031 LS PERF_CTL[2:0] (**)
+ * 0x043 CU PERF_CTL[2:0]
+ * 0x045 CU PERF_CTL[2:0]
+ * 0x046 CU PERF_CTL[2:0]
+ * 0x054 CU PERF_CTL[2:0]
+ * 0x055 CU PERF_CTL[2:0]
+ * 0x08F IC PERF_CTL[0]
+ * 0x187 DE PERF_CTL[0]
+ * 0x188 DE PERF_CTL[0]
+ * 0x0DB EX PERF_CTL[5:0]
+ * 0x0DC LS PERF_CTL[5:0]
+ * 0x0DD LS PERF_CTL[5:0]
+ * 0x0DE LS PERF_CTL[5:0]
+ * 0x0DF LS PERF_CTL[5:0]
+ * 0x1C0 EX PERF_CTL[5:3]
+ * 0x1D6 EX PERF_CTL[5:0]
+ * 0x1D8 EX PERF_CTL[5:0]
+ *
+ * (*) depending on the umask all FPU counters may be used
+ * (**) only one unitmask enabled at a time
+ */
+
+static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
+static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
+static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0);
+static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
+static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
+static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
+
+static struct event_constraint *
+amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned int event_code = amd_get_event_code(hwc);
+
+ switch (event_code & AMD_EVENT_TYPE_MASK) {
+ case AMD_EVENT_FP:
+ switch (event_code) {
+ case 0x000:
+ if (!(hwc->config & 0x0000F000ULL))
+ break;
+ if (!(hwc->config & 0x00000F00ULL))
+ break;
+ return &amd_f15_PMC3;
+ case 0x004:
+ if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
+ break;
+ return &amd_f15_PMC3;
+ case 0x003:
+ case 0x00B:
+ case 0x00D:
+ return &amd_f15_PMC3;
+ }
+ return &amd_f15_PMC53;
+ case AMD_EVENT_LS:
+ case AMD_EVENT_DC:
+ case AMD_EVENT_EX_LS:
+ switch (event_code) {
+ case 0x023:
+ case 0x043:
+ case 0x045:
+ case 0x046:
+ case 0x054:
+ case 0x055:
+ return &amd_f15_PMC20;
+ case 0x02D:
+ return &amd_f15_PMC3;
+ case 0x02E:
+ return &amd_f15_PMC30;
+ case 0x031:
+ if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
+ return &amd_f15_PMC20;
+ return &emptyconstraint;
+ case 0x1C0:
+ return &amd_f15_PMC53;
+ default:
+ return &amd_f15_PMC50;
+ }
+ case AMD_EVENT_CU:
+ case AMD_EVENT_IC_DE:
+ case AMD_EVENT_DE:
+ switch (event_code) {
+ case 0x08F:
+ case 0x187:
+ case 0x188:
+ return &amd_f15_PMC0;
+ case 0x0DB ... 0x0DF:
+ case 0x1D6:
+ case 0x1D8:
+ return &amd_f15_PMC50;
+ default:
+ return &amd_f15_PMC20;
+ }
+ case AMD_EVENT_NB:
+ /* moved to uncore.c */
+ return &emptyconstraint;
+ default:
+ return &emptyconstraint;
+ }
+}
+
+static ssize_t amd_event_sysfs_show(char *page, u64 config)
+{
+ u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
+ (config & AMD64_EVENTSEL_EVENT) >> 24;
+
+ return x86_event_sysfs_show(page, config, event);
+}
+
+static __initconst const struct x86_pmu amd_pmu = {
+ .name = "AMD",
+ .handle_irq = amd_pmu_handle_irq,
+ .disable_all = amd_pmu_disable_all,
+ .enable_all = x86_pmu_enable_all,
+ .enable = x86_pmu_enable_event,
+ .disable = amd_pmu_disable_event,
+ .hw_config = amd_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
+ .eventsel = MSR_K7_EVNTSEL0,
+ .perfctr = MSR_K7_PERFCTR0,
+ .addr_offset = amd_pmu_addr_offset,
+ .event_map = amd_pmu_event_map,
+ .max_events = ARRAY_SIZE(amd_perfmon_event_map),
+ .num_counters = AMD64_NUM_COUNTERS,
+ .cntval_bits = 48,
+ .cntval_mask = (1ULL << 48) - 1,
+ .apic = 1,
+ /* use highest bit to detect overflow */
+ .max_period = (1ULL << 47) - 1,
+ .get_event_constraints = amd_get_event_constraints,
+ .put_event_constraints = amd_put_event_constraints,
+
+ .format_attrs = amd_format_attr,
+ .events_sysfs_show = amd_event_sysfs_show,
+
+ .cpu_prepare = amd_pmu_cpu_prepare,
+ .cpu_starting = amd_pmu_cpu_starting,
+ .cpu_dead = amd_pmu_cpu_dead,
+
+ .amd_nb_constraints = 1,
+};
+
+static int __init amd_core_pmu_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
+ return 0;
+
+ /* Avoid calulating the value each time in the NMI handler */
+ perf_nmi_window = msecs_to_jiffies(100);
+
+ switch (boot_cpu_data.x86) {
+ case 0x15:
+ pr_cont("Fam15h ");
+ x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
+ break;
+ case 0x17:
+ pr_cont("Fam17h ");
+ /*
+ * In family 17h, there are no event constraints in the PMC hardware.
+ * We fallback to using default amd_get_event_constraints.
+ */
+ break;
+ default:
+ pr_err("core perfctr but no constraints; unknown hardware!\n");
+ return -ENODEV;
+ }
+
+ /*
+ * If core performance counter extensions exists, we must use
+ * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also
+ * amd_pmu_addr_offset().
+ */
+ x86_pmu.eventsel = MSR_F15H_PERF_CTL;
+ x86_pmu.perfctr = MSR_F15H_PERF_CTR;
+ x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE;
+ /*
+ * AMD Core perfctr has separate MSRs for the NB events, see
+ * the amd/uncore.c driver.
+ */
+ x86_pmu.amd_nb_constraints = 0;
+
+ pr_cont("core perfctr, ");
+ return 0;
+}
+
+__init int amd_pmu_init(void)
+{
+ int ret;
+
+ /* Performance-monitoring supported from K7 and later: */
+ if (boot_cpu_data.x86 < 6)
+ return -ENODEV;
+
+ x86_pmu = amd_pmu;
+
+ ret = amd_core_pmu_init();
+ if (ret)
+ return ret;
+
+ if (num_possible_cpus() == 1) {
+ /*
+ * No point in allocating data structures to serialize
+ * against other CPUs, when there is only the one CPU.
+ */
+ x86_pmu.amd_nb_constraints = 0;
+ }
+
+ if (boot_cpu_data.x86 >= 0x17)
+ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids_f17h, sizeof(hw_cache_event_ids));
+ else
+ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+
+ return 0;
+}
+
+void amd_pmu_enable_virt(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ cpuc->perf_ctr_virt_mask = 0;
+
+ /* Reload all events */
+ amd_pmu_disable_all();
+ x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
+
+void amd_pmu_disable_virt(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ /*
+ * We only mask out the Host-only bit so that host-only counting works
+ * when SVM is disabled. If someone sets up a guest-only counter when
+ * SVM is disabled the Guest-only bits still gets set and the counter
+ * will not count anything.
+ */
+ cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
+
+ /* Reload all events */
+ amd_pmu_disable_all();
+ x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
new file mode 100644
index 000000000..d157d0ade
--- /dev/null
+++ b/arch/x86/events/amd/ibs.c
@@ -0,0 +1,1086 @@
+/*
+ * Performance events - AMD IBS
+ *
+ * Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/pci.h>
+#include <linux/ptrace.h>
+#include <linux/syscore_ops.h>
+#include <linux/sched/clock.h>
+
+#include <asm/apic.h>
+
+#include "../perf_event.h"
+
+static u32 ibs_caps;
+
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
+
+#include <linux/kprobes.h>
+#include <linux/hardirq.h>
+
+#include <asm/nmi.h>
+
+#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
+#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
+
+
+/*
+ * IBS states:
+ *
+ * ENABLED; tracks the pmu::add(), pmu::del() state, when set the counter is taken
+ * and any further add()s must fail.
+ *
+ * STARTED/STOPPING/STOPPED; deal with pmu::start(), pmu::stop() state but are
+ * complicated by the fact that the IBS hardware can send late NMIs (ie. after
+ * we've cleared the EN bit).
+ *
+ * In order to consume these late NMIs we have the STOPPED state, any NMI that
+ * happens after we've cleared the EN state will clear this bit and report the
+ * NMI handled (this is fundamentally racy in the face or multiple NMI sources,
+ * someone else can consume our BIT and our NMI will go unhandled).
+ *
+ * And since we cannot set/clear this separate bit together with the EN bit,
+ * there are races; if we cleared STARTED early, an NMI could land in
+ * between clearing STARTED and clearing the EN bit (in fact multiple NMIs
+ * could happen if the period is small enough), and consume our STOPPED bit
+ * and trigger streams of unhandled NMIs.
+ *
+ * If, however, we clear STARTED late, an NMI can hit between clearing the
+ * EN bit and clearing STARTED, still see STARTED set and process the event.
+ * If this event will have the VALID bit clear, we bail properly, but this
+ * is not a given. With VALID set we can end up calling pmu::stop() again
+ * (the throttle logic) and trigger the WARNs in there.
+ *
+ * So what we do is set STOPPING before clearing EN to avoid the pmu::stop()
+ * nesting, and clear STARTED late, so that we have a well defined state over
+ * the clearing of the EN bit.
+ *
+ * XXX: we could probably be using !atomic bitops for all this.
+ */
+
+enum ibs_states {
+ IBS_ENABLED = 0,
+ IBS_STARTED = 1,
+ IBS_STOPPING = 2,
+ IBS_STOPPED = 3,
+
+ IBS_MAX_STATES,
+};
+
+struct cpu_perf_ibs {
+ struct perf_event *event;
+ unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)];
+};
+
+struct perf_ibs {
+ struct pmu pmu;
+ unsigned int msr;
+ u64 config_mask;
+ u64 cnt_mask;
+ u64 enable_mask;
+ u64 valid_mask;
+ u64 max_period;
+ unsigned long offset_mask[1];
+ int offset_max;
+ unsigned int fetch_count_reset_broken : 1;
+ unsigned int fetch_ignore_if_zero_rip : 1;
+ struct cpu_perf_ibs __percpu *pcpu;
+
+ struct attribute **format_attrs;
+ struct attribute_group format_group;
+ const struct attribute_group *attr_groups[2];
+
+ u64 (*get_count)(u64 config);
+};
+
+struct perf_ibs_data {
+ u32 size;
+ union {
+ u32 data[0]; /* data buffer starts here */
+ u32 caps;
+ };
+ u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX];
+};
+
+static int
+perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
+{
+ s64 left = local64_read(&hwc->period_left);
+ s64 period = hwc->sample_period;
+ int overflow = 0;
+
+ /*
+ * If we are way outside a reasonable range then just skip forward:
+ */
+ if (unlikely(left <= -period)) {
+ left = period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ overflow = 1;
+ }
+
+ if (unlikely(left < (s64)min)) {
+ left += period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ overflow = 1;
+ }
+
+ /*
+ * If the hw period that triggers the sw overflow is too short
+ * we might hit the irq handler. This biases the results.
+ * Thus we shorten the next-to-last period and set the last
+ * period to the max period.
+ */
+ if (left > max) {
+ left -= max;
+ if (left > max)
+ left = max;
+ else if (left < min)
+ left = min;
+ }
+
+ *hw_period = (u64)left;
+
+ return overflow;
+}
+
+static int
+perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int shift = 64 - width;
+ u64 prev_raw_count;
+ u64 delta;
+
+ /*
+ * Careful: an NMI might modify the previous event value.
+ *
+ * Our tactic to handle this is to first atomically read and
+ * exchange a new raw count - then add that new-prev delta
+ * count to the generic event atomically:
+ */
+ prev_raw_count = local64_read(&hwc->prev_count);
+ if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ new_raw_count) != prev_raw_count)
+ return 0;
+
+ /*
+ * Now we have the new raw value and have updated the prev
+ * timestamp already. We can now calculate the elapsed delta
+ * (event-)time and add that to the generic event.
+ *
+ * Careful, not all hw sign-extends above the physical width
+ * of the count.
+ */
+ delta = (new_raw_count << shift) - (prev_raw_count << shift);
+ delta >>= shift;
+
+ local64_add(delta, &event->count);
+ local64_sub(delta, &hwc->period_left);
+
+ return 1;
+}
+
+static struct perf_ibs perf_ibs_fetch;
+static struct perf_ibs perf_ibs_op;
+
+static struct perf_ibs *get_ibs_pmu(int type)
+{
+ if (perf_ibs_fetch.pmu.type == type)
+ return &perf_ibs_fetch;
+ if (perf_ibs_op.pmu.type == type)
+ return &perf_ibs_op;
+ return NULL;
+}
+
+/*
+ * Use IBS for precise event sampling:
+ *
+ * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count
+ * perf record -a -e r076:p ... # same as -e cpu-cycles:p
+ * perf record -a -e r0C1:p ... # use ibs op counting micro-ops
+ *
+ * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
+ * MSRC001_1033) is used to select either cycle or micro-ops counting
+ * mode.
+ *
+ * The rip of IBS samples has skid 0. Thus, IBS supports precise
+ * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
+ * rip is invalid when IBS was not able to record the rip correctly.
+ * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
+ *
+ */
+static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
+{
+ switch (event->attr.precise_ip) {
+ case 0:
+ return -ENOENT;
+ case 1:
+ case 2:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ switch (event->attr.type) {
+ case PERF_TYPE_HARDWARE:
+ switch (event->attr.config) {
+ case PERF_COUNT_HW_CPU_CYCLES:
+ *config = 0;
+ return 0;
+ }
+ break;
+ case PERF_TYPE_RAW:
+ switch (event->attr.config) {
+ case 0x0076:
+ *config = 0;
+ return 0;
+ case 0x00C1:
+ *config = IBS_OP_CNT_CTL;
+ return 0;
+ }
+ break;
+ default:
+ return -ENOENT;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static const struct perf_event_attr ibs_notsupp = {
+ .exclude_user = 1,
+ .exclude_kernel = 1,
+ .exclude_hv = 1,
+ .exclude_idle = 1,
+ .exclude_host = 1,
+ .exclude_guest = 1,
+};
+
+static int perf_ibs_init(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_ibs *perf_ibs;
+ u64 max_cnt, config;
+ int ret;
+
+ perf_ibs = get_ibs_pmu(event->attr.type);
+ if (perf_ibs) {
+ config = event->attr.config;
+ } else {
+ perf_ibs = &perf_ibs_op;
+ ret = perf_ibs_precise_event(event, &config);
+ if (ret)
+ return ret;
+ }
+
+ if (event->pmu != &perf_ibs->pmu)
+ return -ENOENT;
+
+ if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp))
+ return -EINVAL;
+
+ if (config & ~perf_ibs->config_mask)
+ return -EINVAL;
+
+ if (hwc->sample_period) {
+ if (config & perf_ibs->cnt_mask)
+ /* raw max_cnt may not be set */
+ return -EINVAL;
+ if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
+ /*
+ * lower 4 bits can not be set in ibs max cnt,
+ * but allowing it in case we adjust the
+ * sample period to set a frequency.
+ */
+ return -EINVAL;
+ hwc->sample_period &= ~0x0FULL;
+ if (!hwc->sample_period)
+ hwc->sample_period = 0x10;
+ } else {
+ max_cnt = config & perf_ibs->cnt_mask;
+ config &= ~perf_ibs->cnt_mask;
+ event->attr.sample_period = max_cnt << 4;
+ hwc->sample_period = event->attr.sample_period;
+ }
+
+ if (!hwc->sample_period)
+ return -EINVAL;
+
+ /*
+ * If we modify hwc->sample_period, we also need to update
+ * hwc->last_period and hwc->period_left.
+ */
+ hwc->last_period = hwc->sample_period;
+ local64_set(&hwc->period_left, hwc->sample_period);
+
+ hwc->config_base = perf_ibs->msr;
+ hwc->config = config;
+
+ /*
+ * rip recorded by IbsOpRip will not be consistent with rsp and rbp
+ * recorded as part of interrupt regs. Thus we need to use rip from
+ * interrupt regs while unwinding call stack. Setting _EARLY flag
+ * makes sure we unwind call-stack before perf sample rip is set to
+ * IbsOpRip.
+ */
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY;
+
+ return 0;
+}
+
+static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 *period)
+{
+ int overflow;
+
+ /* ignore lower 4 bits in min count: */
+ overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
+ local64_set(&hwc->prev_count, 0);
+
+ return overflow;
+}
+
+static u64 get_ibs_fetch_count(u64 config)
+{
+ return (config & IBS_FETCH_CNT) >> 12;
+}
+
+static u64 get_ibs_op_count(u64 config)
+{
+ u64 count = 0;
+
+ /*
+ * If the internal 27-bit counter rolled over, the count is MaxCnt
+ * and the lower 7 bits of CurCnt are randomized.
+ * Otherwise CurCnt has the full 27-bit current counter value.
+ */
+ if (config & IBS_OP_VAL)
+ count = (config & IBS_OP_MAX_CNT) << 4;
+ else if (ibs_caps & IBS_CAPS_RDWROPCNT)
+ count = (config & IBS_OP_CUR_CNT) >> 32;
+
+ return count;
+}
+
+static void
+perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
+ u64 *config)
+{
+ u64 count = perf_ibs->get_count(*config);
+
+ /*
+ * Set width to 64 since we do not overflow on max width but
+ * instead on max count. In perf_ibs_set_period() we clear
+ * prev count manually on overflow.
+ */
+ while (!perf_event_try_update(event, count, 64)) {
+ rdmsrl(event->hw.config_base, *config);
+ count = perf_ibs->get_count(*config);
+ }
+}
+
+static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 config)
+{
+ u64 tmp = hwc->config | config;
+
+ if (perf_ibs->fetch_count_reset_broken)
+ wrmsrl(hwc->config_base, tmp & ~perf_ibs->enable_mask);
+
+ wrmsrl(hwc->config_base, tmp | perf_ibs->enable_mask);
+}
+
+/*
+ * Erratum #420 Instruction-Based Sampling Engine May Generate
+ * Interrupt that Cannot Be Cleared:
+ *
+ * Must clear counter mask first, then clear the enable bit. See
+ * Revision Guide for AMD Family 10h Processors, Publication #41322.
+ */
+static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 config)
+{
+ config &= ~perf_ibs->cnt_mask;
+ if (boot_cpu_data.x86 == 0x10)
+ wrmsrl(hwc->config_base, config);
+ config &= ~perf_ibs->enable_mask;
+ wrmsrl(hwc->config_base, config);
+}
+
+/*
+ * We cannot restore the ibs pmu state, so we always needs to update
+ * the event while stopping it and then reset the state when starting
+ * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in
+ * perf_ibs_start()/perf_ibs_stop() and instead always do it.
+ */
+static void perf_ibs_start(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+ u64 period;
+
+ if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+ return;
+
+ WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+ hwc->state = 0;
+
+ perf_ibs_set_period(perf_ibs, hwc, &period);
+ /*
+ * Set STARTED before enabling the hardware, such that a subsequent NMI
+ * must observe it.
+ */
+ set_bit(IBS_STARTED, pcpu->state);
+ clear_bit(IBS_STOPPING, pcpu->state);
+ perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
+
+ perf_event_update_userpage(event);
+}
+
+static void perf_ibs_stop(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+ u64 config;
+ int stopping;
+
+ if (test_and_set_bit(IBS_STOPPING, pcpu->state))
+ return;
+
+ stopping = test_bit(IBS_STARTED, pcpu->state);
+
+ if (!stopping && (hwc->state & PERF_HES_UPTODATE))
+ return;
+
+ rdmsrl(hwc->config_base, config);
+
+ if (stopping) {
+ /*
+ * Set STOPPED before disabling the hardware, such that it
+ * must be visible to NMIs the moment we clear the EN bit,
+ * at which point we can generate an !VALID sample which
+ * we need to consume.
+ */
+ set_bit(IBS_STOPPED, pcpu->state);
+ perf_ibs_disable_event(perf_ibs, hwc, config);
+ /*
+ * Clear STARTED after disabling the hardware; if it were
+ * cleared before an NMI hitting after the clear but before
+ * clearing the EN bit might think it a spurious NMI and not
+ * handle it.
+ *
+ * Clearing it after, however, creates the problem of the NMI
+ * handler seeing STARTED but not having a valid sample.
+ */
+ clear_bit(IBS_STARTED, pcpu->state);
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+ }
+
+ if (hwc->state & PERF_HES_UPTODATE)
+ return;
+
+ /*
+ * Clear valid bit to not count rollovers on update, rollovers
+ * are only updated in the irq handler.
+ */
+ config &= ~perf_ibs->valid_mask;
+
+ perf_ibs_event_update(perf_ibs, event, &config);
+ hwc->state |= PERF_HES_UPTODATE;
+}
+
+static int perf_ibs_add(struct perf_event *event, int flags)
+{
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+ if (test_and_set_bit(IBS_ENABLED, pcpu->state))
+ return -ENOSPC;
+
+ event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ pcpu->event = event;
+
+ if (flags & PERF_EF_START)
+ perf_ibs_start(event, PERF_EF_RELOAD);
+
+ return 0;
+}
+
+static void perf_ibs_del(struct perf_event *event, int flags)
+{
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+ if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
+ return;
+
+ perf_ibs_stop(event, PERF_EF_UPDATE);
+
+ pcpu->event = NULL;
+
+ perf_event_update_userpage(event);
+}
+
+static void perf_ibs_read(struct perf_event *event) { }
+
+PMU_FORMAT_ATTR(rand_en, "config:57");
+PMU_FORMAT_ATTR(cnt_ctl, "config:19");
+
+static struct attribute *ibs_fetch_format_attrs[] = {
+ &format_attr_rand_en.attr,
+ NULL,
+};
+
+static struct attribute *ibs_op_format_attrs[] = {
+ NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */
+ NULL,
+};
+
+static struct perf_ibs perf_ibs_fetch = {
+ .pmu = {
+ .task_ctx_nr = perf_invalid_context,
+
+ .event_init = perf_ibs_init,
+ .add = perf_ibs_add,
+ .del = perf_ibs_del,
+ .start = perf_ibs_start,
+ .stop = perf_ibs_stop,
+ .read = perf_ibs_read,
+ },
+ .msr = MSR_AMD64_IBSFETCHCTL,
+ .config_mask = IBS_FETCH_CONFIG_MASK,
+ .cnt_mask = IBS_FETCH_MAX_CNT,
+ .enable_mask = IBS_FETCH_ENABLE,
+ .valid_mask = IBS_FETCH_VAL,
+ .max_period = IBS_FETCH_MAX_CNT << 4,
+ .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK },
+ .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT,
+ .format_attrs = ibs_fetch_format_attrs,
+
+ .get_count = get_ibs_fetch_count,
+};
+
+static struct perf_ibs perf_ibs_op = {
+ .pmu = {
+ .task_ctx_nr = perf_invalid_context,
+
+ .event_init = perf_ibs_init,
+ .add = perf_ibs_add,
+ .del = perf_ibs_del,
+ .start = perf_ibs_start,
+ .stop = perf_ibs_stop,
+ .read = perf_ibs_read,
+ },
+ .msr = MSR_AMD64_IBSOPCTL,
+ .config_mask = IBS_OP_CONFIG_MASK,
+ .cnt_mask = IBS_OP_MAX_CNT | IBS_OP_CUR_CNT |
+ IBS_OP_CUR_CNT_RAND,
+ .enable_mask = IBS_OP_ENABLE,
+ .valid_mask = IBS_OP_VAL,
+ .max_period = IBS_OP_MAX_CNT << 4,
+ .offset_mask = { MSR_AMD64_IBSOP_REG_MASK },
+ .offset_max = MSR_AMD64_IBSOP_REG_COUNT,
+ .format_attrs = ibs_op_format_attrs,
+
+ .get_count = get_ibs_op_count,
+};
+
+static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
+{
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+ struct perf_event *event = pcpu->event;
+ struct hw_perf_event *hwc;
+ struct perf_sample_data data;
+ struct perf_raw_record raw;
+ struct pt_regs regs;
+ struct perf_ibs_data ibs_data;
+ int offset, size, check_rip, offset_max, throttle = 0;
+ unsigned int msr;
+ u64 *buf, *config, period;
+
+ if (!test_bit(IBS_STARTED, pcpu->state)) {
+fail:
+ /*
+ * Catch spurious interrupts after stopping IBS: After
+ * disabling IBS there could be still incoming NMIs
+ * with samples that even have the valid bit cleared.
+ * Mark all this NMIs as handled.
+ */
+ if (test_and_clear_bit(IBS_STOPPED, pcpu->state))
+ return 1;
+
+ return 0;
+ }
+
+ if (WARN_ON_ONCE(!event))
+ goto fail;
+
+ hwc = &event->hw;
+ msr = hwc->config_base;
+ buf = ibs_data.regs;
+ rdmsrl(msr, *buf);
+ if (!(*buf++ & perf_ibs->valid_mask))
+ goto fail;
+
+ config = &ibs_data.regs[0];
+ perf_ibs_event_update(perf_ibs, event, config);
+ perf_sample_data_init(&data, 0, hwc->last_period);
+ if (!perf_ibs_set_period(perf_ibs, hwc, &period))
+ goto out; /* no sw counter overflow */
+
+ ibs_data.caps = ibs_caps;
+ size = 1;
+ offset = 1;
+ check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
+ if (event->attr.sample_type & PERF_SAMPLE_RAW)
+ offset_max = perf_ibs->offset_max;
+ else if (check_rip)
+ offset_max = 3;
+ else
+ offset_max = 1;
+ do {
+ rdmsrl(msr + offset, *buf++);
+ size++;
+ offset = find_next_bit(perf_ibs->offset_mask,
+ perf_ibs->offset_max,
+ offset + 1);
+ } while (offset < offset_max);
+ /*
+ * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately
+ * depending on their availability.
+ * Can't add to offset_max as they are staggered
+ */
+ if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+ if (perf_ibs == &perf_ibs_op) {
+ if (ibs_caps & IBS_CAPS_BRNTRGT) {
+ rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++);
+ size++;
+ }
+ if (ibs_caps & IBS_CAPS_OPDATA4) {
+ rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++);
+ size++;
+ }
+ }
+ if (perf_ibs == &perf_ibs_fetch && (ibs_caps & IBS_CAPS_FETCHCTLEXTD)) {
+ rdmsrl(MSR_AMD64_ICIBSEXTDCTL, *buf++);
+ size++;
+ }
+ }
+ ibs_data.size = sizeof(u64) * size;
+
+ regs = *iregs;
+ if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
+ regs.flags &= ~PERF_EFLAGS_EXACT;
+ } else {
+ /* Workaround for erratum #1197 */
+ if (perf_ibs->fetch_ignore_if_zero_rip && !(ibs_data.regs[1]))
+ goto out;
+
+ set_linear_ip(&regs, ibs_data.regs[1]);
+ regs.flags |= PERF_EFLAGS_EXACT;
+ }
+
+ if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+ raw = (struct perf_raw_record){
+ .frag = {
+ .size = sizeof(u32) + ibs_data.size,
+ .data = ibs_data.data,
+ },
+ };
+ data.raw = &raw;
+ }
+
+ /*
+ * rip recorded by IbsOpRip will not be consistent with rsp and rbp
+ * recorded as part of interrupt regs. Thus we need to use rip from
+ * interrupt regs while unwinding call stack.
+ */
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ data.callchain = perf_callchain(event, iregs);
+
+ throttle = perf_event_overflow(event, &data, &regs);
+out:
+ if (throttle) {
+ perf_ibs_stop(event, 0);
+ } else {
+ period >>= 4;
+
+ if ((ibs_caps & IBS_CAPS_RDWROPCNT) &&
+ (*config & IBS_OP_CNT_CTL))
+ period |= *config & IBS_OP_CUR_CNT_RAND;
+
+ perf_ibs_enable_event(perf_ibs, hwc, period);
+ }
+
+ perf_event_update_userpage(event);
+
+ return 1;
+}
+
+static int
+perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+{
+ u64 stamp = sched_clock();
+ int handled = 0;
+
+ handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
+ handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
+
+ if (handled)
+ inc_irq_stat(apic_perf_irqs);
+
+ perf_sample_event_took(sched_clock() - stamp);
+
+ return handled;
+}
+NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
+
+static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
+{
+ struct cpu_perf_ibs __percpu *pcpu;
+ int ret;
+
+ pcpu = alloc_percpu(struct cpu_perf_ibs);
+ if (!pcpu)
+ return -ENOMEM;
+
+ perf_ibs->pcpu = pcpu;
+
+ /* register attributes */
+ if (perf_ibs->format_attrs[0]) {
+ memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group));
+ perf_ibs->format_group.name = "format";
+ perf_ibs->format_group.attrs = perf_ibs->format_attrs;
+
+ memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups));
+ perf_ibs->attr_groups[0] = &perf_ibs->format_group;
+ perf_ibs->pmu.attr_groups = perf_ibs->attr_groups;
+ }
+
+ ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
+ if (ret) {
+ perf_ibs->pcpu = NULL;
+ free_percpu(pcpu);
+ }
+
+ return ret;
+}
+
+static __init void perf_event_ibs_init(void)
+{
+ struct attribute **attr = ibs_op_format_attrs;
+
+ /*
+ * Some chips fail to reset the fetch count when it is written; instead
+ * they need a 0-1 transition of IbsFetchEn.
+ */
+ if (boot_cpu_data.x86 >= 0x16 && boot_cpu_data.x86 <= 0x18)
+ perf_ibs_fetch.fetch_count_reset_broken = 1;
+
+ if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10)
+ perf_ibs_fetch.fetch_ignore_if_zero_rip = 1;
+
+ perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
+
+ if (ibs_caps & IBS_CAPS_OPCNT) {
+ perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
+ *attr++ = &format_attr_cnt_ctl.attr;
+ }
+ perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
+
+ register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
+ pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps);
+}
+
+#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
+
+static __init void perf_event_ibs_init(void) { }
+
+#endif
+
+/* IBS - apic initialization, for perf and oprofile */
+
+static __init u32 __get_ibs_caps(void)
+{
+ u32 caps;
+ unsigned int max_level;
+
+ if (!boot_cpu_has(X86_FEATURE_IBS))
+ return 0;
+
+ /* check IBS cpuid feature flags */
+ max_level = cpuid_eax(0x80000000);
+ if (max_level < IBS_CPUID_FEATURES)
+ return IBS_CAPS_DEFAULT;
+
+ caps = cpuid_eax(IBS_CPUID_FEATURES);
+ if (!(caps & IBS_CAPS_AVAIL))
+ /* cpuid flags not valid */
+ return IBS_CAPS_DEFAULT;
+
+ return caps;
+}
+
+u32 get_ibs_caps(void)
+{
+ return ibs_caps;
+}
+
+EXPORT_SYMBOL(get_ibs_caps);
+
+static inline int get_eilvt(int offset)
+{
+ return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
+}
+
+static inline int put_eilvt(int offset)
+{
+ return !setup_APIC_eilvt(offset, 0, 0, 1);
+}
+
+/*
+ * Check and reserve APIC extended interrupt LVT offset for IBS if available.
+ */
+static inline int ibs_eilvt_valid(void)
+{
+ int offset;
+ u64 val;
+ int valid = 0;
+
+ preempt_disable();
+
+ rdmsrl(MSR_AMD64_IBSCTL, val);
+ offset = val & IBSCTL_LVT_OFFSET_MASK;
+
+ if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
+ pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
+ smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
+ goto out;
+ }
+
+ if (!get_eilvt(offset)) {
+ pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
+ smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
+ goto out;
+ }
+
+ valid = 1;
+out:
+ preempt_enable();
+
+ return valid;
+}
+
+static int setup_ibs_ctl(int ibs_eilvt_off)
+{
+ struct pci_dev *cpu_cfg;
+ int nodes;
+ u32 value = 0;
+
+ nodes = 0;
+ cpu_cfg = NULL;
+ do {
+ cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
+ PCI_DEVICE_ID_AMD_10H_NB_MISC,
+ cpu_cfg);
+ if (!cpu_cfg)
+ break;
+ ++nodes;
+ pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
+ | IBSCTL_LVT_OFFSET_VALID);
+ pci_read_config_dword(cpu_cfg, IBSCTL, &value);
+ if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
+ pci_dev_put(cpu_cfg);
+ pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n",
+ value);
+ return -EINVAL;
+ }
+ } while (1);
+
+ if (!nodes) {
+ pr_debug("No CPU node configured for IBS\n");
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+/*
+ * This runs only on the current cpu. We try to find an LVT offset and
+ * setup the local APIC. For this we must disable preemption. On
+ * success we initialize all nodes with this offset. This updates then
+ * the offset in the IBS_CTL per-node msr. The per-core APIC setup of
+ * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that
+ * is using the new offset.
+ */
+static void force_ibs_eilvt_setup(void)
+{
+ int offset;
+ int ret;
+
+ preempt_disable();
+ /* find the next free available EILVT entry, skip offset 0 */
+ for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
+ if (get_eilvt(offset))
+ break;
+ }
+ preempt_enable();
+
+ if (offset == APIC_EILVT_NR_MAX) {
+ pr_debug("No EILVT entry available\n");
+ return;
+ }
+
+ ret = setup_ibs_ctl(offset);
+ if (ret)
+ goto out;
+
+ if (!ibs_eilvt_valid())
+ goto out;
+
+ pr_info("LVT offset %d assigned\n", offset);
+
+ return;
+out:
+ preempt_disable();
+ put_eilvt(offset);
+ preempt_enable();
+ return;
+}
+
+static void ibs_eilvt_setup(void)
+{
+ /*
+ * Force LVT offset assignment for family 10h: The offsets are
+ * not assigned by the BIOS for this family, so the OS is
+ * responsible for doing it. If the OS assignment fails, fall
+ * back to BIOS settings and try to setup this.
+ */
+ if (boot_cpu_data.x86 == 0x10)
+ force_ibs_eilvt_setup();
+}
+
+static inline int get_ibs_lvt_offset(void)
+{
+ u64 val;
+
+ rdmsrl(MSR_AMD64_IBSCTL, val);
+ if (!(val & IBSCTL_LVT_OFFSET_VALID))
+ return -EINVAL;
+
+ return val & IBSCTL_LVT_OFFSET_MASK;
+}
+
+static void setup_APIC_ibs(void)
+{
+ int offset;
+
+ offset = get_ibs_lvt_offset();
+ if (offset < 0)
+ goto failed;
+
+ if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
+ return;
+failed:
+ pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
+ smp_processor_id());
+}
+
+static void clear_APIC_ibs(void)
+{
+ int offset;
+
+ offset = get_ibs_lvt_offset();
+ if (offset >= 0)
+ setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
+}
+
+static int x86_pmu_amd_ibs_starting_cpu(unsigned int cpu)
+{
+ setup_APIC_ibs();
+ return 0;
+}
+
+#ifdef CONFIG_PM
+
+static int perf_ibs_suspend(void)
+{
+ clear_APIC_ibs();
+ return 0;
+}
+
+static void perf_ibs_resume(void)
+{
+ ibs_eilvt_setup();
+ setup_APIC_ibs();
+}
+
+static struct syscore_ops perf_ibs_syscore_ops = {
+ .resume = perf_ibs_resume,
+ .suspend = perf_ibs_suspend,
+};
+
+static void perf_ibs_pm_init(void)
+{
+ register_syscore_ops(&perf_ibs_syscore_ops);
+}
+
+#else
+
+static inline void perf_ibs_pm_init(void) { }
+
+#endif
+
+static int x86_pmu_amd_ibs_dying_cpu(unsigned int cpu)
+{
+ clear_APIC_ibs();
+ return 0;
+}
+
+static __init int amd_ibs_init(void)
+{
+ u32 caps;
+
+ caps = __get_ibs_caps();
+ if (!caps)
+ return -ENODEV; /* ibs not supported by the cpu */
+
+ ibs_eilvt_setup();
+
+ if (!ibs_eilvt_valid())
+ return -EINVAL;
+
+ perf_ibs_pm_init();
+
+ ibs_caps = caps;
+ /* make ibs_caps visible to other cpus: */
+ smp_mb();
+ /*
+ * x86_pmu_amd_ibs_starting_cpu will be called from core on
+ * all online cpus.
+ */
+ cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
+ "perf/x86/amd/ibs:starting",
+ x86_pmu_amd_ibs_starting_cpu,
+ x86_pmu_amd_ibs_dying_cpu);
+
+ perf_event_ibs_init();
+
+ return 0;
+}
+
+/* Since we need the pci subsystem to init ibs we can't do this earlier: */
+device_initcall(amd_ibs_init);
diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c
new file mode 100644
index 000000000..c08bcba5c
--- /dev/null
+++ b/arch/x86/events/amd/iommu.c
@@ -0,0 +1,490 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Steven Kinney <Steven.Kinney@amd.com>
+ * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
+ *
+ * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) "perf/amd_iommu: " fmt
+
+#include <linux/perf_event.h>
+#include <linux/init.h>
+#include <linux/cpumask.h>
+#include <linux/slab.h>
+
+#include "../perf_event.h"
+#include "iommu.h"
+
+#define COUNTER_SHIFT 16
+
+/* iommu pmu conf masks */
+#define GET_CSOURCE(x) ((x)->conf & 0xFFULL)
+#define GET_DEVID(x) (((x)->conf >> 8) & 0xFFFFULL)
+#define GET_DOMID(x) (((x)->conf >> 24) & 0xFFFFULL)
+#define GET_PASID(x) (((x)->conf >> 40) & 0xFFFFFULL)
+
+/* iommu pmu conf1 masks */
+#define GET_DEVID_MASK(x) ((x)->conf1 & 0xFFFFULL)
+#define GET_DOMID_MASK(x) (((x)->conf1 >> 16) & 0xFFFFULL)
+#define GET_PASID_MASK(x) (((x)->conf1 >> 32) & 0xFFFFFULL)
+
+#define IOMMU_NAME_SIZE 16
+
+struct perf_amd_iommu {
+ struct list_head list;
+ struct pmu pmu;
+ struct amd_iommu *iommu;
+ char name[IOMMU_NAME_SIZE];
+ u8 max_banks;
+ u8 max_counters;
+ u64 cntr_assign_mask;
+ raw_spinlock_t lock;
+};
+
+static LIST_HEAD(perf_amd_iommu_list);
+
+/*---------------------------------------------
+ * sysfs format attributes
+ *---------------------------------------------*/
+PMU_FORMAT_ATTR(csource, "config:0-7");
+PMU_FORMAT_ATTR(devid, "config:8-23");
+PMU_FORMAT_ATTR(domid, "config:24-39");
+PMU_FORMAT_ATTR(pasid, "config:40-59");
+PMU_FORMAT_ATTR(devid_mask, "config1:0-15");
+PMU_FORMAT_ATTR(domid_mask, "config1:16-31");
+PMU_FORMAT_ATTR(pasid_mask, "config1:32-51");
+
+static struct attribute *iommu_format_attrs[] = {
+ &format_attr_csource.attr,
+ &format_attr_devid.attr,
+ &format_attr_pasid.attr,
+ &format_attr_domid.attr,
+ &format_attr_devid_mask.attr,
+ &format_attr_pasid_mask.attr,
+ &format_attr_domid_mask.attr,
+ NULL,
+};
+
+static struct attribute_group amd_iommu_format_group = {
+ .name = "format",
+ .attrs = iommu_format_attrs,
+};
+
+/*---------------------------------------------
+ * sysfs events attributes
+ *---------------------------------------------*/
+static struct attribute_group amd_iommu_events_group = {
+ .name = "events",
+};
+
+struct amd_iommu_event_desc {
+ struct device_attribute attr;
+ const char *event;
+};
+
+static ssize_t _iommu_event_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct amd_iommu_event_desc *event =
+ container_of(attr, struct amd_iommu_event_desc, attr);
+ return sprintf(buf, "%s\n", event->event);
+}
+
+#define AMD_IOMMU_EVENT_DESC(_name, _event) \
+{ \
+ .attr = __ATTR(_name, 0444, _iommu_event_show, NULL), \
+ .event = _event, \
+}
+
+static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = {
+ AMD_IOMMU_EVENT_DESC(mem_pass_untrans, "csource=0x01"),
+ AMD_IOMMU_EVENT_DESC(mem_pass_pretrans, "csource=0x02"),
+ AMD_IOMMU_EVENT_DESC(mem_pass_excl, "csource=0x03"),
+ AMD_IOMMU_EVENT_DESC(mem_target_abort, "csource=0x04"),
+ AMD_IOMMU_EVENT_DESC(mem_trans_total, "csource=0x05"),
+ AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit, "csource=0x06"),
+ AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis, "csource=0x07"),
+ AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit, "csource=0x08"),
+ AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis, "csource=0x09"),
+ AMD_IOMMU_EVENT_DESC(mem_dte_hit, "csource=0x0a"),
+ AMD_IOMMU_EVENT_DESC(mem_dte_mis, "csource=0x0b"),
+ AMD_IOMMU_EVENT_DESC(page_tbl_read_tot, "csource=0x0c"),
+ AMD_IOMMU_EVENT_DESC(page_tbl_read_nst, "csource=0x0d"),
+ AMD_IOMMU_EVENT_DESC(page_tbl_read_gst, "csource=0x0e"),
+ AMD_IOMMU_EVENT_DESC(int_dte_hit, "csource=0x0f"),
+ AMD_IOMMU_EVENT_DESC(int_dte_mis, "csource=0x10"),
+ AMD_IOMMU_EVENT_DESC(cmd_processed, "csource=0x11"),
+ AMD_IOMMU_EVENT_DESC(cmd_processed_inv, "csource=0x12"),
+ AMD_IOMMU_EVENT_DESC(tlb_inv, "csource=0x13"),
+ AMD_IOMMU_EVENT_DESC(ign_rd_wr_mmio_1ff8h, "csource=0x14"),
+ AMD_IOMMU_EVENT_DESC(vapic_int_non_guest, "csource=0x15"),
+ AMD_IOMMU_EVENT_DESC(vapic_int_guest, "csource=0x16"),
+ AMD_IOMMU_EVENT_DESC(smi_recv, "csource=0x17"),
+ AMD_IOMMU_EVENT_DESC(smi_blk, "csource=0x18"),
+ { /* end: all zeroes */ },
+};
+
+/*---------------------------------------------
+ * sysfs cpumask attributes
+ *---------------------------------------------*/
+static cpumask_t iommu_cpumask;
+
+static ssize_t _iommu_cpumask_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask);
+}
+static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL);
+
+static struct attribute *iommu_cpumask_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group amd_iommu_cpumask_group = {
+ .attrs = iommu_cpumask_attrs,
+};
+
+/*---------------------------------------------*/
+
+static int get_next_avail_iommu_bnk_cntr(struct perf_event *event)
+{
+ struct perf_amd_iommu *piommu = container_of(event->pmu, struct perf_amd_iommu, pmu);
+ int max_cntrs = piommu->max_counters;
+ int max_banks = piommu->max_banks;
+ u32 shift, bank, cntr;
+ unsigned long flags;
+ int retval;
+
+ raw_spin_lock_irqsave(&piommu->lock, flags);
+
+ for (bank = 0, shift = 0; bank < max_banks; bank++) {
+ for (cntr = 0; cntr < max_cntrs; cntr++) {
+ shift = bank + (bank*3) + cntr;
+ if (piommu->cntr_assign_mask & BIT_ULL(shift)) {
+ continue;
+ } else {
+ piommu->cntr_assign_mask |= BIT_ULL(shift);
+ event->hw.iommu_bank = bank;
+ event->hw.iommu_cntr = cntr;
+ retval = 0;
+ goto out;
+ }
+ }
+ }
+ retval = -ENOSPC;
+out:
+ raw_spin_unlock_irqrestore(&piommu->lock, flags);
+ return retval;
+}
+
+static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu,
+ u8 bank, u8 cntr)
+{
+ unsigned long flags;
+ int max_banks, max_cntrs;
+ int shift = 0;
+
+ max_banks = perf_iommu->max_banks;
+ max_cntrs = perf_iommu->max_counters;
+
+ if ((bank > max_banks) || (cntr > max_cntrs))
+ return -EINVAL;
+
+ shift = bank + cntr + (bank*3);
+
+ raw_spin_lock_irqsave(&perf_iommu->lock, flags);
+ perf_iommu->cntr_assign_mask &= ~(1ULL<<shift);
+ raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
+
+ return 0;
+}
+
+static int perf_iommu_event_init(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ /* test the event attr type check for PMU enumeration */
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ /*
+ * IOMMU counters are shared across all cores.
+ * Therefore, it does not support per-process mode.
+ * Also, it does not support event sampling mode.
+ */
+ if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+ return -EINVAL;
+
+ /* IOMMU counters do not have usr/os/guest/host bits */
+ if (event->attr.exclude_user || event->attr.exclude_kernel ||
+ event->attr.exclude_host || event->attr.exclude_guest)
+ return -EINVAL;
+
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ /* update the hw_perf_event struct with the iommu config data */
+ hwc->conf = event->attr.config;
+ hwc->conf1 = event->attr.config1;
+
+ return 0;
+}
+
+static inline struct amd_iommu *perf_event_2_iommu(struct perf_event *ev)
+{
+ return (container_of(ev->pmu, struct perf_amd_iommu, pmu))->iommu;
+}
+
+static void perf_iommu_enable_event(struct perf_event *ev)
+{
+ struct amd_iommu *iommu = perf_event_2_iommu(ev);
+ struct hw_perf_event *hwc = &ev->hw;
+ u8 bank = hwc->iommu_bank;
+ u8 cntr = hwc->iommu_cntr;
+ u64 reg = 0ULL;
+
+ reg = GET_CSOURCE(hwc);
+ amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_COUNTER_SRC_REG, &reg);
+
+ reg = GET_DEVID_MASK(hwc);
+ reg = GET_DEVID(hwc) | (reg << 32);
+ if (reg)
+ reg |= BIT(31);
+ amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DEVID_MATCH_REG, &reg);
+
+ reg = GET_PASID_MASK(hwc);
+ reg = GET_PASID(hwc) | (reg << 32);
+ if (reg)
+ reg |= BIT(31);
+ amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_PASID_MATCH_REG, &reg);
+
+ reg = GET_DOMID_MASK(hwc);
+ reg = GET_DOMID(hwc) | (reg << 32);
+ if (reg)
+ reg |= BIT(31);
+ amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DOMID_MATCH_REG, &reg);
+}
+
+static void perf_iommu_disable_event(struct perf_event *event)
+{
+ struct amd_iommu *iommu = perf_event_2_iommu(event);
+ struct hw_perf_event *hwc = &event->hw;
+ u64 reg = 0ULL;
+
+ amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
+ IOMMU_PC_COUNTER_SRC_REG, &reg);
+}
+
+static void perf_iommu_start(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+ return;
+
+ WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+ hwc->state = 0;
+
+ if (flags & PERF_EF_RELOAD) {
+ u64 prev_raw_count = local64_read(&hwc->prev_count);
+ struct amd_iommu *iommu = perf_event_2_iommu(event);
+
+ amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
+ IOMMU_PC_COUNTER_REG, &prev_raw_count);
+ }
+
+ perf_iommu_enable_event(event);
+ perf_event_update_userpage(event);
+
+}
+
+static void perf_iommu_read(struct perf_event *event)
+{
+ u64 count, prev, delta;
+ struct hw_perf_event *hwc = &event->hw;
+ struct amd_iommu *iommu = perf_event_2_iommu(event);
+
+ if (amd_iommu_pc_get_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
+ IOMMU_PC_COUNTER_REG, &count))
+ return;
+
+ /* IOMMU pc counter register is only 48 bits */
+ count &= GENMASK_ULL(47, 0);
+
+ prev = local64_read(&hwc->prev_count);
+ if (local64_cmpxchg(&hwc->prev_count, prev, count) != prev)
+ return;
+
+ /* Handle 48-bit counter overflow */
+ delta = (count << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
+ delta >>= COUNTER_SHIFT;
+ local64_add(delta, &event->count);
+}
+
+static void perf_iommu_stop(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->state & PERF_HES_UPTODATE)
+ return;
+
+ perf_iommu_disable_event(event);
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+
+ if (hwc->state & PERF_HES_UPTODATE)
+ return;
+
+ perf_iommu_read(event);
+ hwc->state |= PERF_HES_UPTODATE;
+}
+
+static int perf_iommu_add(struct perf_event *event, int flags)
+{
+ int retval;
+
+ event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ /* request an iommu bank/counter */
+ retval = get_next_avail_iommu_bnk_cntr(event);
+ if (retval)
+ return retval;
+
+ if (flags & PERF_EF_START)
+ perf_iommu_start(event, PERF_EF_RELOAD);
+
+ return 0;
+}
+
+static void perf_iommu_del(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_amd_iommu *perf_iommu =
+ container_of(event->pmu, struct perf_amd_iommu, pmu);
+
+ perf_iommu_stop(event, PERF_EF_UPDATE);
+
+ /* clear the assigned iommu bank/counter */
+ clear_avail_iommu_bnk_cntr(perf_iommu,
+ hwc->iommu_bank, hwc->iommu_cntr);
+
+ perf_event_update_userpage(event);
+}
+
+static __init int _init_events_attrs(void)
+{
+ int i = 0, j;
+ struct attribute **attrs;
+
+ while (amd_iommu_v2_event_descs[i].attr.attr.name)
+ i++;
+
+ attrs = kcalloc(i + 1, sizeof(*attrs), GFP_KERNEL);
+ if (!attrs)
+ return -ENOMEM;
+
+ for (j = 0; j < i; j++)
+ attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr;
+
+ amd_iommu_events_group.attrs = attrs;
+ return 0;
+}
+
+const struct attribute_group *amd_iommu_attr_groups[] = {
+ &amd_iommu_format_group,
+ &amd_iommu_cpumask_group,
+ &amd_iommu_events_group,
+ NULL,
+};
+
+static const struct pmu iommu_pmu __initconst = {
+ .event_init = perf_iommu_event_init,
+ .add = perf_iommu_add,
+ .del = perf_iommu_del,
+ .start = perf_iommu_start,
+ .stop = perf_iommu_stop,
+ .read = perf_iommu_read,
+ .task_ctx_nr = perf_invalid_context,
+ .attr_groups = amd_iommu_attr_groups,
+};
+
+static __init int init_one_iommu(unsigned int idx)
+{
+ struct perf_amd_iommu *perf_iommu;
+ int ret;
+
+ perf_iommu = kzalloc(sizeof(struct perf_amd_iommu), GFP_KERNEL);
+ if (!perf_iommu)
+ return -ENOMEM;
+
+ raw_spin_lock_init(&perf_iommu->lock);
+
+ perf_iommu->pmu = iommu_pmu;
+ perf_iommu->iommu = get_amd_iommu(idx);
+ perf_iommu->max_banks = amd_iommu_pc_get_max_banks(idx);
+ perf_iommu->max_counters = amd_iommu_pc_get_max_counters(idx);
+
+ if (!perf_iommu->iommu ||
+ !perf_iommu->max_banks ||
+ !perf_iommu->max_counters) {
+ kfree(perf_iommu);
+ return -EINVAL;
+ }
+
+ snprintf(perf_iommu->name, IOMMU_NAME_SIZE, "amd_iommu_%u", idx);
+
+ ret = perf_pmu_register(&perf_iommu->pmu, perf_iommu->name, -1);
+ if (!ret) {
+ pr_info("Detected AMD IOMMU #%d (%d banks, %d counters/bank).\n",
+ idx, perf_iommu->max_banks, perf_iommu->max_counters);
+ list_add_tail(&perf_iommu->list, &perf_amd_iommu_list);
+ } else {
+ pr_warn("Error initializing IOMMU %d.\n", idx);
+ kfree(perf_iommu);
+ }
+ return ret;
+}
+
+static __init int amd_iommu_pc_init(void)
+{
+ unsigned int i, cnt = 0;
+ int ret;
+
+ /* Make sure the IOMMU PC resource is available */
+ if (!amd_iommu_pc_supported())
+ return -ENODEV;
+
+ ret = _init_events_attrs();
+ if (ret)
+ return ret;
+
+ /*
+ * An IOMMU PMU is specific to an IOMMU, and can function independently.
+ * So we go through all IOMMUs and ignore the one that fails init
+ * unless all IOMMU are failing.
+ */
+ for (i = 0; i < amd_iommu_get_num_iommus(); i++) {
+ ret = init_one_iommu(i);
+ if (!ret)
+ cnt++;
+ }
+
+ if (!cnt) {
+ kfree(amd_iommu_events_group.attrs);
+ return -ENODEV;
+ }
+
+ /* Init cpumask attributes to only core 0 */
+ cpumask_set_cpu(0, &iommu_cpumask);
+ return 0;
+}
+
+device_initcall(amd_iommu_pc_init);
diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h
new file mode 100644
index 000000000..62e0702c4
--- /dev/null
+++ b/arch/x86/events/amd/iommu.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Steven Kinney <Steven.Kinney@amd.com>
+ * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _PERF_EVENT_AMD_IOMMU_H_
+#define _PERF_EVENT_AMD_IOMMU_H_
+
+/* iommu pc mmio region register indexes */
+#define IOMMU_PC_COUNTER_REG 0x00
+#define IOMMU_PC_COUNTER_SRC_REG 0x08
+#define IOMMU_PC_PASID_MATCH_REG 0x10
+#define IOMMU_PC_DOMID_MATCH_REG 0x18
+#define IOMMU_PC_DEVID_MATCH_REG 0x20
+#define IOMMU_PC_COUNTER_REPORT_REG 0x28
+
+/* maximun specified bank/counters */
+#define PC_MAX_SPEC_BNKS 64
+#define PC_MAX_SPEC_CNTRS 16
+
+struct amd_iommu;
+
+/* amd_iommu_init.c external support functions */
+extern int amd_iommu_get_num_iommus(void);
+
+extern bool amd_iommu_pc_supported(void);
+
+extern u8 amd_iommu_pc_get_max_banks(unsigned int idx);
+
+extern u8 amd_iommu_pc_get_max_counters(unsigned int idx);
+
+extern int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
+ u8 fxn, u64 *value);
+
+extern int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
+ u8 fxn, u64 *value);
+
+extern struct amd_iommu *get_amd_iommu(int idx);
+
+#endif /*_PERF_EVENT_AMD_IOMMU_H_*/
diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c
new file mode 100644
index 000000000..2aefacf5c
--- /dev/null
+++ b/arch/x86/events/amd/power.c
@@ -0,0 +1,317 @@
+/*
+ * Performance events - AMD Processor Power Reporting Mechanism
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Huang Rui <ray.huang@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include "../perf_event.h"
+
+#define MSR_F15H_CU_PWR_ACCUMULATOR 0xc001007a
+#define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b
+#define MSR_F15H_PTSC 0xc0010280
+
+/* Event code: LSB 8 bits, passed in attr->config any other bit is reserved. */
+#define AMD_POWER_EVENT_MASK 0xFFULL
+
+/*
+ * Accumulated power status counters.
+ */
+#define AMD_POWER_EVENTSEL_PKG 1
+
+/*
+ * The ratio of compute unit power accumulator sample period to the
+ * PTSC period.
+ */
+static unsigned int cpu_pwr_sample_ratio;
+
+/* Maximum accumulated power of a compute unit. */
+static u64 max_cu_acc_power;
+
+static struct pmu pmu_class;
+
+/*
+ * Accumulated power represents the sum of each compute unit's (CU) power
+ * consumption. On any core of each CU we read the total accumulated power from
+ * MSR_F15H_CU_PWR_ACCUMULATOR. cpu_mask represents CPU bit map of all cores
+ * which are picked to measure the power for the CUs they belong to.
+ */
+static cpumask_t cpu_mask;
+
+static void event_update(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev_pwr_acc, new_pwr_acc, prev_ptsc, new_ptsc;
+ u64 delta, tdelta;
+
+ prev_pwr_acc = hwc->pwr_acc;
+ prev_ptsc = hwc->ptsc;
+ rdmsrl(MSR_F15H_CU_PWR_ACCUMULATOR, new_pwr_acc);
+ rdmsrl(MSR_F15H_PTSC, new_ptsc);
+
+ /*
+ * Calculate the CU power consumption over a time period, the unit of
+ * final value (delta) is micro-Watts. Then add it to the event count.
+ */
+ if (new_pwr_acc < prev_pwr_acc) {
+ delta = max_cu_acc_power + new_pwr_acc;
+ delta -= prev_pwr_acc;
+ } else
+ delta = new_pwr_acc - prev_pwr_acc;
+
+ delta *= cpu_pwr_sample_ratio * 1000;
+ tdelta = new_ptsc - prev_ptsc;
+
+ do_div(delta, tdelta);
+ local64_add(delta, &event->count);
+}
+
+static void __pmu_event_start(struct perf_event *event)
+{
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ event->hw.state = 0;
+
+ rdmsrl(MSR_F15H_PTSC, event->hw.ptsc);
+ rdmsrl(MSR_F15H_CU_PWR_ACCUMULATOR, event->hw.pwr_acc);
+}
+
+static void pmu_event_start(struct perf_event *event, int mode)
+{
+ __pmu_event_start(event);
+}
+
+static void pmu_event_stop(struct perf_event *event, int mode)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ /* Mark event as deactivated and stopped. */
+ if (!(hwc->state & PERF_HES_STOPPED))
+ hwc->state |= PERF_HES_STOPPED;
+
+ /* Check if software counter update is necessary. */
+ if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of an event
+ * that we are disabling:
+ */
+ event_update(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+}
+
+static int pmu_event_add(struct perf_event *event, int mode)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ if (mode & PERF_EF_START)
+ __pmu_event_start(event);
+
+ return 0;
+}
+
+static void pmu_event_del(struct perf_event *event, int flags)
+{
+ pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int pmu_event_init(struct perf_event *event)
+{
+ u64 cfg = event->attr.config & AMD_POWER_EVENT_MASK;
+
+ /* Only look at AMD power events. */
+ if (event->attr.type != pmu_class.type)
+ return -ENOENT;
+
+ /* Unsupported modes and filters. */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ /* no sampling */
+ event->attr.sample_period)
+ return -EINVAL;
+
+ if (cfg != AMD_POWER_EVENTSEL_PKG)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void pmu_event_read(struct perf_event *event)
+{
+ event_update(event);
+}
+
+static ssize_t
+get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpumap_print_to_pagebuf(true, buf, &cpu_mask);
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, get_attr_cpumask, NULL);
+
+static struct attribute *pmu_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group pmu_attr_group = {
+ .attrs = pmu_attrs,
+};
+
+/*
+ * Currently it only supports to report the power of each
+ * processor/package.
+ */
+EVENT_ATTR_STR(power-pkg, power_pkg, "event=0x01");
+
+EVENT_ATTR_STR(power-pkg.unit, power_pkg_unit, "mWatts");
+
+/* Convert the count from micro-Watts to milli-Watts. */
+EVENT_ATTR_STR(power-pkg.scale, power_pkg_scale, "1.000000e-3");
+
+static struct attribute *events_attr[] = {
+ EVENT_PTR(power_pkg),
+ EVENT_PTR(power_pkg_unit),
+ EVENT_PTR(power_pkg_scale),
+ NULL,
+};
+
+static struct attribute_group pmu_events_group = {
+ .name = "events",
+ .attrs = events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+
+static struct attribute *formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group pmu_format_group = {
+ .name = "format",
+ .attrs = formats_attr,
+};
+
+static const struct attribute_group *attr_groups[] = {
+ &pmu_attr_group,
+ &pmu_format_group,
+ &pmu_events_group,
+ NULL,
+};
+
+static struct pmu pmu_class = {
+ .attr_groups = attr_groups,
+ /* system-wide only */
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = pmu_event_init,
+ .add = pmu_event_add,
+ .del = pmu_event_del,
+ .start = pmu_event_start,
+ .stop = pmu_event_stop,
+ .read = pmu_event_read,
+};
+
+static int power_cpu_exit(unsigned int cpu)
+{
+ int target;
+
+ if (!cpumask_test_and_clear_cpu(cpu, &cpu_mask))
+ return 0;
+
+ /*
+ * Find a new CPU on the same compute unit, if was set in cpumask
+ * and still some CPUs on compute unit. Then migrate event and
+ * context to new CPU.
+ */
+ target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
+ if (target < nr_cpumask_bits) {
+ cpumask_set_cpu(target, &cpu_mask);
+ perf_pmu_migrate_context(&pmu_class, cpu, target);
+ }
+ return 0;
+}
+
+static int power_cpu_init(unsigned int cpu)
+{
+ int target;
+
+ /*
+ * 1) If any CPU is set at cpu_mask in the same compute unit, do
+ * nothing.
+ * 2) If no CPU is set at cpu_mask in the same compute unit,
+ * set current ONLINE CPU.
+ *
+ * Note: if there is a CPU aside of the new one already in the
+ * sibling mask, then it is also in cpu_mask.
+ */
+ target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
+ if (target >= nr_cpumask_bits)
+ cpumask_set_cpu(cpu, &cpu_mask);
+ return 0;
+}
+
+static const struct x86_cpu_id cpu_match[] = {
+ { .vendor = X86_VENDOR_AMD, .family = 0x15 },
+ {},
+};
+
+static int __init amd_power_pmu_init(void)
+{
+ int ret;
+
+ if (!x86_match_cpu(cpu_match))
+ return -ENODEV;
+
+ if (!boot_cpu_has(X86_FEATURE_ACC_POWER))
+ return -ENODEV;
+
+ cpu_pwr_sample_ratio = cpuid_ecx(0x80000007);
+
+ if (rdmsrl_safe(MSR_F15H_CU_MAX_PWR_ACCUMULATOR, &max_cu_acc_power)) {
+ pr_err("Failed to read max compute unit power accumulator MSR\n");
+ return -ENODEV;
+ }
+
+
+ cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_POWER_ONLINE,
+ "perf/x86/amd/power:online",
+ power_cpu_init, power_cpu_exit);
+
+ ret = perf_pmu_register(&pmu_class, "power", -1);
+ if (WARN_ON(ret)) {
+ pr_warn("AMD Power PMU registration failed\n");
+ return ret;
+ }
+
+ pr_info("AMD Power PMU detected\n");
+ return ret;
+}
+module_init(amd_power_pmu_init);
+
+static void __exit amd_power_pmu_exit(void)
+{
+ cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_AMD_POWER_ONLINE);
+ perf_pmu_unregister(&pmu_class);
+}
+module_exit(amd_power_pmu_exit);
+
+MODULE_AUTHOR("Huang Rui <ray.huang@amd.com>");
+MODULE_DESCRIPTION("AMD Processor Power Reporting Mechanism");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
new file mode 100644
index 000000000..604a85587
--- /dev/null
+++ b/arch/x86/events/amd/uncore.c
@@ -0,0 +1,619 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Jacob Shin <jacob.shin@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/percpu.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+
+#include <asm/cpufeature.h>
+#include <asm/perf_event.h>
+#include <asm/msr.h>
+#include <asm/smp.h>
+
+#define NUM_COUNTERS_NB 4
+#define NUM_COUNTERS_L2 4
+#define NUM_COUNTERS_L3 6
+#define MAX_COUNTERS 6
+
+#define RDPMC_BASE_NB 6
+#define RDPMC_BASE_LLC 10
+
+#define COUNTER_SHIFT 16
+
+#undef pr_fmt
+#define pr_fmt(fmt) "amd_uncore: " fmt
+
+static int num_counters_llc;
+static int num_counters_nb;
+static bool l3_mask;
+
+static HLIST_HEAD(uncore_unused_list);
+
+struct amd_uncore {
+ int id;
+ int refcnt;
+ int cpu;
+ int num_counters;
+ int rdpmc_base;
+ u32 msr_base;
+ cpumask_t *active_mask;
+ struct pmu *pmu;
+ struct perf_event *events[MAX_COUNTERS];
+ struct hlist_node node;
+};
+
+static struct amd_uncore * __percpu *amd_uncore_nb;
+static struct amd_uncore * __percpu *amd_uncore_llc;
+
+static struct pmu amd_nb_pmu;
+static struct pmu amd_llc_pmu;
+
+static cpumask_t amd_nb_active_mask;
+static cpumask_t amd_llc_active_mask;
+
+static bool is_nb_event(struct perf_event *event)
+{
+ return event->pmu->type == amd_nb_pmu.type;
+}
+
+static bool is_llc_event(struct perf_event *event)
+{
+ return event->pmu->type == amd_llc_pmu.type;
+}
+
+static struct amd_uncore *event_to_amd_uncore(struct perf_event *event)
+{
+ if (is_nb_event(event) && amd_uncore_nb)
+ return *per_cpu_ptr(amd_uncore_nb, event->cpu);
+ else if (is_llc_event(event) && amd_uncore_llc)
+ return *per_cpu_ptr(amd_uncore_llc, event->cpu);
+
+ return NULL;
+}
+
+static void amd_uncore_read(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev, new;
+ s64 delta;
+
+ /*
+ * since we do not enable counter overflow interrupts,
+ * we do not have to worry about prev_count changing on us
+ */
+
+ prev = local64_read(&hwc->prev_count);
+ rdpmcl(hwc->event_base_rdpmc, new);
+ local64_set(&hwc->prev_count, new);
+ delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
+ delta >>= COUNTER_SHIFT;
+ local64_add(delta, &event->count);
+}
+
+static void amd_uncore_start(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (flags & PERF_EF_RELOAD)
+ wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count));
+
+ hwc->state = 0;
+ wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE));
+ perf_event_update_userpage(event);
+}
+
+static void amd_uncore_stop(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrl(hwc->config_base, hwc->config);
+ hwc->state |= PERF_HES_STOPPED;
+
+ if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ amd_uncore_read(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+}
+
+static int amd_uncore_add(struct perf_event *event, int flags)
+{
+ int i;
+ struct amd_uncore *uncore = event_to_amd_uncore(event);
+ struct hw_perf_event *hwc = &event->hw;
+
+ /* are we already assigned? */
+ if (hwc->idx != -1 && uncore->events[hwc->idx] == event)
+ goto out;
+
+ for (i = 0; i < uncore->num_counters; i++) {
+ if (uncore->events[i] == event) {
+ hwc->idx = i;
+ goto out;
+ }
+ }
+
+ /* if not, take the first available counter */
+ hwc->idx = -1;
+ for (i = 0; i < uncore->num_counters; i++) {
+ if (cmpxchg(&uncore->events[i], NULL, event) == NULL) {
+ hwc->idx = i;
+ break;
+ }
+ }
+
+out:
+ if (hwc->idx == -1)
+ return -EBUSY;
+
+ hwc->config_base = uncore->msr_base + (2 * hwc->idx);
+ hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx);
+ hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx;
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ if (flags & PERF_EF_START)
+ amd_uncore_start(event, PERF_EF_RELOAD);
+
+ return 0;
+}
+
+static void amd_uncore_del(struct perf_event *event, int flags)
+{
+ int i;
+ struct amd_uncore *uncore = event_to_amd_uncore(event);
+ struct hw_perf_event *hwc = &event->hw;
+
+ amd_uncore_stop(event, PERF_EF_UPDATE);
+
+ for (i = 0; i < uncore->num_counters; i++) {
+ if (cmpxchg(&uncore->events[i], event, NULL) == event)
+ break;
+ }
+
+ hwc->idx = -1;
+}
+
+static int amd_uncore_event_init(struct perf_event *event)
+{
+ struct amd_uncore *uncore;
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ /*
+ * NB and Last level cache counters (MSRs) are shared across all cores
+ * that share the same NB / Last level cache. On family 16h and below,
+ * Interrupts can be directed to a single target core, however, event
+ * counts generated by processes running on other cores cannot be masked
+ * out. So we do not support sampling and per-thread events via
+ * CAP_NO_INTERRUPT, and we do not enable counter overflow interrupts:
+ */
+
+ /* NB and Last level cache counters do not have usr/os/guest/host bits */
+ if (event->attr.exclude_user || event->attr.exclude_kernel ||
+ event->attr.exclude_host || event->attr.exclude_guest)
+ return -EINVAL;
+
+ hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB;
+ hwc->idx = -1;
+
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ /*
+ * SliceMask and ThreadMask need to be set for certain L3 events in
+ * Family 17h. For other events, the two fields do not affect the count.
+ */
+ if (l3_mask && is_llc_event(event)) {
+ int thread = 2 * (cpu_data(event->cpu).cpu_core_id % 4);
+
+ if (smp_num_siblings > 1)
+ thread += cpu_data(event->cpu).apicid & 1;
+
+ hwc->config |= (1ULL << (AMD64_L3_THREAD_SHIFT + thread) &
+ AMD64_L3_THREAD_MASK) | AMD64_L3_SLICE_MASK;
+ }
+
+ uncore = event_to_amd_uncore(event);
+ if (!uncore)
+ return -ENODEV;
+
+ /*
+ * since request can come in to any of the shared cores, we will remap
+ * to a single common cpu.
+ */
+ event->cpu = uncore->cpu;
+
+ return 0;
+}
+
+static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ cpumask_t *active_mask;
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ if (pmu->type == amd_nb_pmu.type)
+ active_mask = &amd_nb_active_mask;
+ else if (pmu->type == amd_llc_pmu.type)
+ active_mask = &amd_llc_active_mask;
+ else
+ return 0;
+
+ return cpumap_print_to_pagebuf(true, buf, active_mask);
+}
+static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL);
+
+static struct attribute *amd_uncore_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group amd_uncore_attr_group = {
+ .attrs = amd_uncore_attrs,
+};
+
+/*
+ * Similar to PMU_FORMAT_ATTR but allowing for format_attr to be assigned based
+ * on family
+ */
+#define AMD_FORMAT_ATTR(_dev, _name, _format) \
+static ssize_t \
+_dev##_show##_name(struct device *dev, \
+ struct device_attribute *attr, \
+ char *page) \
+{ \
+ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
+ return sprintf(page, _format "\n"); \
+} \
+static struct device_attribute format_attr_##_dev##_name = __ATTR_RO(_dev);
+
+/* Used for each uncore counter type */
+#define AMD_ATTRIBUTE(_name) \
+static struct attribute *amd_uncore_format_attr_##_name[] = { \
+ &format_attr_event_##_name.attr, \
+ &format_attr_umask.attr, \
+ NULL, \
+}; \
+static struct attribute_group amd_uncore_format_group_##_name = { \
+ .name = "format", \
+ .attrs = amd_uncore_format_attr_##_name, \
+}; \
+static const struct attribute_group *amd_uncore_attr_groups_##_name[] = { \
+ &amd_uncore_attr_group, \
+ &amd_uncore_format_group_##_name, \
+ NULL, \
+};
+
+AMD_FORMAT_ATTR(event, , "config:0-7,32-35");
+AMD_FORMAT_ATTR(umask, , "config:8-15");
+AMD_FORMAT_ATTR(event, _df, "config:0-7,32-35,59-60");
+AMD_FORMAT_ATTR(event, _l3, "config:0-7");
+AMD_ATTRIBUTE(df);
+AMD_ATTRIBUTE(l3);
+
+static struct pmu amd_nb_pmu = {
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = amd_uncore_event_init,
+ .add = amd_uncore_add,
+ .del = amd_uncore_del,
+ .start = amd_uncore_start,
+ .stop = amd_uncore_stop,
+ .read = amd_uncore_read,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static struct pmu amd_llc_pmu = {
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = amd_uncore_event_init,
+ .add = amd_uncore_add,
+ .del = amd_uncore_del,
+ .start = amd_uncore_start,
+ .stop = amd_uncore_stop,
+ .read = amd_uncore_read,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
+{
+ return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL,
+ cpu_to_node(cpu));
+}
+
+static int amd_uncore_cpu_up_prepare(unsigned int cpu)
+{
+ struct amd_uncore *uncore_nb = NULL, *uncore_llc;
+
+ if (amd_uncore_nb) {
+ uncore_nb = amd_uncore_alloc(cpu);
+ if (!uncore_nb)
+ goto fail;
+ uncore_nb->cpu = cpu;
+ uncore_nb->num_counters = num_counters_nb;
+ uncore_nb->rdpmc_base = RDPMC_BASE_NB;
+ uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
+ uncore_nb->active_mask = &amd_nb_active_mask;
+ uncore_nb->pmu = &amd_nb_pmu;
+ uncore_nb->id = -1;
+ *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
+ }
+
+ if (amd_uncore_llc) {
+ uncore_llc = amd_uncore_alloc(cpu);
+ if (!uncore_llc)
+ goto fail;
+ uncore_llc->cpu = cpu;
+ uncore_llc->num_counters = num_counters_llc;
+ uncore_llc->rdpmc_base = RDPMC_BASE_LLC;
+ uncore_llc->msr_base = MSR_F16H_L2I_PERF_CTL;
+ uncore_llc->active_mask = &amd_llc_active_mask;
+ uncore_llc->pmu = &amd_llc_pmu;
+ uncore_llc->id = -1;
+ *per_cpu_ptr(amd_uncore_llc, cpu) = uncore_llc;
+ }
+
+ return 0;
+
+fail:
+ if (amd_uncore_nb)
+ *per_cpu_ptr(amd_uncore_nb, cpu) = NULL;
+ kfree(uncore_nb);
+ return -ENOMEM;
+}
+
+static struct amd_uncore *
+amd_uncore_find_online_sibling(struct amd_uncore *this,
+ struct amd_uncore * __percpu *uncores)
+{
+ unsigned int cpu;
+ struct amd_uncore *that;
+
+ for_each_online_cpu(cpu) {
+ that = *per_cpu_ptr(uncores, cpu);
+
+ if (!that)
+ continue;
+
+ if (this == that)
+ continue;
+
+ if (this->id == that->id) {
+ hlist_add_head(&this->node, &uncore_unused_list);
+ this = that;
+ break;
+ }
+ }
+
+ this->refcnt++;
+ return this;
+}
+
+static int amd_uncore_cpu_starting(unsigned int cpu)
+{
+ unsigned int eax, ebx, ecx, edx;
+ struct amd_uncore *uncore;
+
+ if (amd_uncore_nb) {
+ uncore = *per_cpu_ptr(amd_uncore_nb, cpu);
+ cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
+ uncore->id = ecx & 0xff;
+
+ uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_nb);
+ *per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
+ }
+
+ if (amd_uncore_llc) {
+ uncore = *per_cpu_ptr(amd_uncore_llc, cpu);
+ uncore->id = per_cpu(cpu_llc_id, cpu);
+
+ uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc);
+ *per_cpu_ptr(amd_uncore_llc, cpu) = uncore;
+ }
+
+ return 0;
+}
+
+static void uncore_clean_online(void)
+{
+ struct amd_uncore *uncore;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(uncore, n, &uncore_unused_list, node) {
+ hlist_del(&uncore->node);
+ kfree(uncore);
+ }
+}
+
+static void uncore_online(unsigned int cpu,
+ struct amd_uncore * __percpu *uncores)
+{
+ struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
+
+ uncore_clean_online();
+
+ if (cpu == uncore->cpu)
+ cpumask_set_cpu(cpu, uncore->active_mask);
+}
+
+static int amd_uncore_cpu_online(unsigned int cpu)
+{
+ if (amd_uncore_nb)
+ uncore_online(cpu, amd_uncore_nb);
+
+ if (amd_uncore_llc)
+ uncore_online(cpu, amd_uncore_llc);
+
+ return 0;
+}
+
+static void uncore_down_prepare(unsigned int cpu,
+ struct amd_uncore * __percpu *uncores)
+{
+ unsigned int i;
+ struct amd_uncore *this = *per_cpu_ptr(uncores, cpu);
+
+ if (this->cpu != cpu)
+ return;
+
+ /* this cpu is going down, migrate to a shared sibling if possible */
+ for_each_online_cpu(i) {
+ struct amd_uncore *that = *per_cpu_ptr(uncores, i);
+
+ if (cpu == i)
+ continue;
+
+ if (this == that) {
+ perf_pmu_migrate_context(this->pmu, cpu, i);
+ cpumask_clear_cpu(cpu, that->active_mask);
+ cpumask_set_cpu(i, that->active_mask);
+ that->cpu = i;
+ break;
+ }
+ }
+}
+
+static int amd_uncore_cpu_down_prepare(unsigned int cpu)
+{
+ if (amd_uncore_nb)
+ uncore_down_prepare(cpu, amd_uncore_nb);
+
+ if (amd_uncore_llc)
+ uncore_down_prepare(cpu, amd_uncore_llc);
+
+ return 0;
+}
+
+static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores)
+{
+ struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
+
+ if (cpu == uncore->cpu)
+ cpumask_clear_cpu(cpu, uncore->active_mask);
+
+ if (!--uncore->refcnt)
+ kfree(uncore);
+ *per_cpu_ptr(uncores, cpu) = NULL;
+}
+
+static int amd_uncore_cpu_dead(unsigned int cpu)
+{
+ if (amd_uncore_nb)
+ uncore_dead(cpu, amd_uncore_nb);
+
+ if (amd_uncore_llc)
+ uncore_dead(cpu, amd_uncore_llc);
+
+ return 0;
+}
+
+static int __init amd_uncore_init(void)
+{
+ int ret = -ENODEV;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return -ENODEV;
+
+ if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
+ return -ENODEV;
+
+ if (boot_cpu_data.x86 == 0x17) {
+ /*
+ * For F17h, the Northbridge counters are repurposed as Data
+ * Fabric counters. Also, L3 counters are supported too. The PMUs
+ * are exported based on family as either L2 or L3 and NB or DF.
+ */
+ num_counters_nb = NUM_COUNTERS_NB;
+ num_counters_llc = NUM_COUNTERS_L3;
+ amd_nb_pmu.name = "amd_df";
+ amd_llc_pmu.name = "amd_l3";
+ format_attr_event_df.show = &event_show_df;
+ format_attr_event_l3.show = &event_show_l3;
+ l3_mask = true;
+ } else {
+ num_counters_nb = NUM_COUNTERS_NB;
+ num_counters_llc = NUM_COUNTERS_L2;
+ amd_nb_pmu.name = "amd_nb";
+ amd_llc_pmu.name = "amd_l2";
+ format_attr_event_df = format_attr_event;
+ format_attr_event_l3 = format_attr_event;
+ l3_mask = false;
+ }
+
+ amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df;
+ amd_llc_pmu.attr_groups = amd_uncore_attr_groups_l3;
+
+ if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) {
+ amd_uncore_nb = alloc_percpu(struct amd_uncore *);
+ if (!amd_uncore_nb) {
+ ret = -ENOMEM;
+ goto fail_nb;
+ }
+ ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
+ if (ret)
+ goto fail_nb;
+
+ pr_info("AMD NB counters detected\n");
+ ret = 0;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) {
+ amd_uncore_llc = alloc_percpu(struct amd_uncore *);
+ if (!amd_uncore_llc) {
+ ret = -ENOMEM;
+ goto fail_llc;
+ }
+ ret = perf_pmu_register(&amd_llc_pmu, amd_llc_pmu.name, -1);
+ if (ret)
+ goto fail_llc;
+
+ pr_info("AMD LLC counters detected\n");
+ ret = 0;
+ }
+
+ /*
+ * Install callbacks. Core will call them for each online cpu.
+ */
+ if (cpuhp_setup_state(CPUHP_PERF_X86_AMD_UNCORE_PREP,
+ "perf/x86/amd/uncore:prepare",
+ amd_uncore_cpu_up_prepare, amd_uncore_cpu_dead))
+ goto fail_llc;
+
+ if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING,
+ "perf/x86/amd/uncore:starting",
+ amd_uncore_cpu_starting, NULL))
+ goto fail_prep;
+ if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE,
+ "perf/x86/amd/uncore:online",
+ amd_uncore_cpu_online,
+ amd_uncore_cpu_down_prepare))
+ goto fail_start;
+ return 0;
+
+fail_start:
+ cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING);
+fail_prep:
+ cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP);
+fail_llc:
+ if (boot_cpu_has(X86_FEATURE_PERFCTR_NB))
+ perf_pmu_unregister(&amd_nb_pmu);
+ if (amd_uncore_llc)
+ free_percpu(amd_uncore_llc);
+fail_nb:
+ if (amd_uncore_nb)
+ free_percpu(amd_uncore_nb);
+
+ return ret;
+}
+device_initcall(amd_uncore_init);
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
new file mode 100644
index 000000000..f612eb1cc
--- /dev/null
+++ b/arch/x86/events/core.c
@@ -0,0 +1,2565 @@
+/*
+ * Performance events x86 architecture code
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2009 Jaswinder Singh Rajput
+ * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
+ * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
+ * Copyright (C) 2009 Google, Inc., Stephane Eranian
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/kdebug.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/clock.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/bitops.h>
+#include <linux/device.h>
+#include <linux/nospec.h>
+
+#include <asm/apic.h>
+#include <asm/stacktrace.h>
+#include <asm/nmi.h>
+#include <asm/smp.h>
+#include <asm/alternative.h>
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
+#include <asm/timer.h>
+#include <asm/desc.h>
+#include <asm/ldt.h>
+#include <asm/unwind.h>
+
+#include "perf_event.h"
+
+struct x86_pmu x86_pmu __read_mostly;
+
+DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
+ .enabled = 1,
+};
+
+DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key);
+
+u64 __read_mostly hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX];
+u64 __read_mostly hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+/*
+ * Propagate event elapsed time into the generic event.
+ * Can only be executed on the CPU where the event is active.
+ * Returns the delta events processed.
+ */
+u64 x86_perf_event_update(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int shift = 64 - x86_pmu.cntval_bits;
+ u64 prev_raw_count, new_raw_count;
+ int idx = hwc->idx;
+ u64 delta;
+
+ if (idx == INTEL_PMC_IDX_FIXED_BTS)
+ return 0;
+
+ /*
+ * Careful: an NMI might modify the previous event value.
+ *
+ * Our tactic to handle this is to first atomically read and
+ * exchange a new raw count - then add that new-prev delta
+ * count to the generic event atomically:
+ */
+again:
+ prev_raw_count = local64_read(&hwc->prev_count);
+ rdpmcl(hwc->event_base_rdpmc, new_raw_count);
+
+ if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ new_raw_count) != prev_raw_count)
+ goto again;
+
+ /*
+ * Now we have the new raw value and have updated the prev
+ * timestamp already. We can now calculate the elapsed delta
+ * (event-)time and add that to the generic event.
+ *
+ * Careful, not all hw sign-extends above the physical width
+ * of the count.
+ */
+ delta = (new_raw_count << shift) - (prev_raw_count << shift);
+ delta >>= shift;
+
+ local64_add(delta, &event->count);
+ local64_sub(delta, &hwc->period_left);
+
+ return new_raw_count;
+}
+
+/*
+ * Find and validate any extra registers to set up.
+ */
+static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg;
+ struct extra_reg *er;
+
+ reg = &event->hw.extra_reg;
+
+ if (!x86_pmu.extra_regs)
+ return 0;
+
+ for (er = x86_pmu.extra_regs; er->msr; er++) {
+ if (er->event != (config & er->config_mask))
+ continue;
+ if (event->attr.config1 & ~er->valid_mask)
+ return -EINVAL;
+ /* Check if the extra msrs can be safely accessed*/
+ if (!er->extra_msr_access)
+ return -ENXIO;
+
+ reg->idx = er->idx;
+ reg->config = event->attr.config1;
+ reg->reg = er->msr;
+ break;
+ }
+ return 0;
+}
+
+static atomic_t active_events;
+static atomic_t pmc_refcount;
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+static bool reserve_pmc_hardware(void)
+{
+ int i;
+
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
+ goto perfctr_fail;
+ }
+
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
+ goto eventsel_fail;
+ }
+
+ return true;
+
+eventsel_fail:
+ for (i--; i >= 0; i--)
+ release_evntsel_nmi(x86_pmu_config_addr(i));
+
+ i = x86_pmu.num_counters;
+
+perfctr_fail:
+ for (i--; i >= 0; i--)
+ release_perfctr_nmi(x86_pmu_event_addr(i));
+
+ return false;
+}
+
+static void release_pmc_hardware(void)
+{
+ int i;
+
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ release_perfctr_nmi(x86_pmu_event_addr(i));
+ release_evntsel_nmi(x86_pmu_config_addr(i));
+ }
+}
+
+#else
+
+static bool reserve_pmc_hardware(void) { return true; }
+static void release_pmc_hardware(void) {}
+
+#endif
+
+static bool check_hw_exists(void)
+{
+ u64 val, val_fail = -1, val_new= ~0;
+ int i, reg, reg_fail = -1, ret = 0;
+ int bios_fail = 0;
+ int reg_safe = -1;
+
+ /*
+ * Check to see if the BIOS enabled any of the counters, if so
+ * complain and bail.
+ */
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ reg = x86_pmu_config_addr(i);
+ ret = rdmsrl_safe(reg, &val);
+ if (ret)
+ goto msr_fail;
+ if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
+ bios_fail = 1;
+ val_fail = val;
+ reg_fail = reg;
+ } else {
+ reg_safe = i;
+ }
+ }
+
+ if (x86_pmu.num_counters_fixed) {
+ reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+ ret = rdmsrl_safe(reg, &val);
+ if (ret)
+ goto msr_fail;
+ for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
+ if (val & (0x03 << i*4)) {
+ bios_fail = 1;
+ val_fail = val;
+ reg_fail = reg;
+ }
+ }
+ }
+
+ /*
+ * If all the counters are enabled, the below test will always
+ * fail. The tools will also become useless in this scenario.
+ * Just fail and disable the hardware counters.
+ */
+
+ if (reg_safe == -1) {
+ reg = reg_safe;
+ goto msr_fail;
+ }
+
+ /*
+ * Read the current value, change it and read it back to see if it
+ * matches, this is needed to detect certain hardware emulators
+ * (qemu/kvm) that don't trap on the MSR access and always return 0s.
+ */
+ reg = x86_pmu_event_addr(reg_safe);
+ if (rdmsrl_safe(reg, &val))
+ goto msr_fail;
+ val ^= 0xffffUL;
+ ret = wrmsrl_safe(reg, val);
+ ret |= rdmsrl_safe(reg, &val_new);
+ if (ret || val != val_new)
+ goto msr_fail;
+
+ /*
+ * We still allow the PMU driver to operate:
+ */
+ if (bios_fail) {
+ pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
+ pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
+ reg_fail, val_fail);
+ }
+
+ return true;
+
+msr_fail:
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ pr_cont("PMU not available due to virtualization, using software events only.\n");
+ } else {
+ pr_cont("Broken PMU hardware detected, using software events only.\n");
+ pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n",
+ reg, val_new);
+ }
+
+ return false;
+}
+
+static void hw_perf_event_destroy(struct perf_event *event)
+{
+ x86_release_hardware();
+ atomic_dec(&active_events);
+}
+
+void hw_perf_lbr_event_destroy(struct perf_event *event)
+{
+ hw_perf_event_destroy(event);
+
+ /* undo the lbr/bts event accounting */
+ x86_del_exclusive(x86_lbr_exclusive_lbr);
+}
+
+static inline int x86_pmu_initialized(void)
+{
+ return x86_pmu.handle_irq != NULL;
+}
+
+static inline int
+set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+ unsigned int cache_type, cache_op, cache_result;
+ u64 config, val;
+
+ config = attr->config;
+
+ cache_type = (config >> 0) & 0xff;
+ if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
+ return -EINVAL;
+ cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX);
+
+ cache_op = (config >> 8) & 0xff;
+ if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
+ return -EINVAL;
+ cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX);
+
+ cache_result = (config >> 16) & 0xff;
+ if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+ return -EINVAL;
+ cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX);
+
+ val = hw_cache_event_ids[cache_type][cache_op][cache_result];
+
+ if (val == 0)
+ return -ENOENT;
+
+ if (val == -1)
+ return -EINVAL;
+
+ hwc->config |= val;
+ attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
+ return x86_pmu_extra_regs(val, event);
+}
+
+int x86_reserve_hardware(void)
+{
+ int err = 0;
+
+ if (!atomic_inc_not_zero(&pmc_refcount)) {
+ mutex_lock(&pmc_reserve_mutex);
+ if (atomic_read(&pmc_refcount) == 0) {
+ if (!reserve_pmc_hardware())
+ err = -EBUSY;
+ else
+ reserve_ds_buffers();
+ }
+ if (!err)
+ atomic_inc(&pmc_refcount);
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+
+ return err;
+}
+
+void x86_release_hardware(void)
+{
+ if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
+ release_pmc_hardware();
+ release_ds_buffers();
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+}
+
+/*
+ * Check if we can create event of a certain type (that no conflicting events
+ * are present).
+ */
+int x86_add_exclusive(unsigned int what)
+{
+ int i;
+
+ /*
+ * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS.
+ * LBR and BTS are still mutually exclusive.
+ */
+ if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
+ goto out;
+
+ if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
+ mutex_lock(&pmc_reserve_mutex);
+ for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
+ if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
+ goto fail_unlock;
+ }
+ atomic_inc(&x86_pmu.lbr_exclusive[what]);
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+
+out:
+ atomic_inc(&active_events);
+ return 0;
+
+fail_unlock:
+ mutex_unlock(&pmc_reserve_mutex);
+ return -EBUSY;
+}
+
+void x86_del_exclusive(unsigned int what)
+{
+ atomic_dec(&active_events);
+
+ /*
+ * See the comment in x86_add_exclusive().
+ */
+ if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
+ return;
+
+ atomic_dec(&x86_pmu.lbr_exclusive[what]);
+}
+
+int x86_setup_perfctr(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 config;
+
+ if (!is_sampling_event(event)) {
+ hwc->sample_period = x86_pmu.max_period;
+ hwc->last_period = hwc->sample_period;
+ local64_set(&hwc->period_left, hwc->sample_period);
+ }
+
+ if (attr->type == PERF_TYPE_RAW)
+ return x86_pmu_extra_regs(event->attr.config, event);
+
+ if (attr->type == PERF_TYPE_HW_CACHE)
+ return set_ext_hw_attr(hwc, event);
+
+ if (attr->config >= x86_pmu.max_events)
+ return -EINVAL;
+
+ attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events);
+
+ /*
+ * The generic map:
+ */
+ config = x86_pmu.event_map(attr->config);
+
+ if (config == 0)
+ return -ENOENT;
+
+ if (config == -1LL)
+ return -EINVAL;
+
+ hwc->config |= config;
+
+ return 0;
+}
+
+/*
+ * check that branch_sample_type is compatible with
+ * settings needed for precise_ip > 1 which implies
+ * using the LBR to capture ALL taken branches at the
+ * priv levels of the measurement
+ */
+static inline int precise_br_compat(struct perf_event *event)
+{
+ u64 m = event->attr.branch_sample_type;
+ u64 b = 0;
+
+ /* must capture all branches */
+ if (!(m & PERF_SAMPLE_BRANCH_ANY))
+ return 0;
+
+ m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
+
+ if (!event->attr.exclude_user)
+ b |= PERF_SAMPLE_BRANCH_USER;
+
+ if (!event->attr.exclude_kernel)
+ b |= PERF_SAMPLE_BRANCH_KERNEL;
+
+ /*
+ * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
+ */
+
+ return m == b;
+}
+
+int x86_pmu_max_precise(void)
+{
+ int precise = 0;
+
+ /* Support for constant skid */
+ if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
+ precise++;
+
+ /* Support for IP fixup */
+ if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
+ precise++;
+
+ if (x86_pmu.pebs_prec_dist)
+ precise++;
+ }
+ return precise;
+}
+
+int x86_pmu_hw_config(struct perf_event *event)
+{
+ if (event->attr.precise_ip) {
+ int precise = x86_pmu_max_precise();
+
+ if (event->attr.precise_ip > precise)
+ return -EOPNOTSUPP;
+
+ /* There's no sense in having PEBS for non sampling events: */
+ if (!is_sampling_event(event))
+ return -EINVAL;
+ }
+ /*
+ * check that PEBS LBR correction does not conflict with
+ * whatever the user is asking with attr->branch_sample_type
+ */
+ if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
+ u64 *br_type = &event->attr.branch_sample_type;
+
+ if (has_branch_stack(event)) {
+ if (!precise_br_compat(event))
+ return -EOPNOTSUPP;
+
+ /* branch_sample_type is compatible */
+
+ } else {
+ /*
+ * user did not specify branch_sample_type
+ *
+ * For PEBS fixups, we capture all
+ * the branches at the priv level of the
+ * event.
+ */
+ *br_type = PERF_SAMPLE_BRANCH_ANY;
+
+ if (!event->attr.exclude_user)
+ *br_type |= PERF_SAMPLE_BRANCH_USER;
+
+ if (!event->attr.exclude_kernel)
+ *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
+ }
+ }
+
+ if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
+ event->attach_state |= PERF_ATTACH_TASK_DATA;
+
+ /*
+ * Generate PMC IRQs:
+ * (keep 'enabled' bit clear for now)
+ */
+ event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
+
+ /*
+ * Count user and OS events unless requested not to
+ */
+ if (!event->attr.exclude_user)
+ event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
+ if (!event->attr.exclude_kernel)
+ event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
+
+ if (event->attr.type == PERF_TYPE_RAW)
+ event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
+
+ if (event->attr.sample_period && x86_pmu.limit_period) {
+ if (x86_pmu.limit_period(event, event->attr.sample_period) >
+ event->attr.sample_period)
+ return -EINVAL;
+ }
+
+ return x86_setup_perfctr(event);
+}
+
+/*
+ * Setup the hardware configuration for a given attr_type
+ */
+static int __x86_pmu_event_init(struct perf_event *event)
+{
+ int err;
+
+ if (!x86_pmu_initialized())
+ return -ENODEV;
+
+ err = x86_reserve_hardware();
+ if (err)
+ return err;
+
+ atomic_inc(&active_events);
+ event->destroy = hw_perf_event_destroy;
+
+ event->hw.idx = -1;
+ event->hw.last_cpu = -1;
+ event->hw.last_tag = ~0ULL;
+
+ /* mark unused */
+ event->hw.extra_reg.idx = EXTRA_REG_NONE;
+ event->hw.branch_reg.idx = EXTRA_REG_NONE;
+
+ return x86_pmu.hw_config(event);
+}
+
+void x86_pmu_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int idx;
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ u64 val;
+
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+ rdmsrl(x86_pmu_config_addr(idx), val);
+ if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
+ continue;
+ val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+ wrmsrl(x86_pmu_config_addr(idx), val);
+ }
+}
+
+/*
+ * There may be PMI landing after enabled=0. The PMI hitting could be before or
+ * after disable_all.
+ *
+ * If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
+ * It will not be re-enabled in the NMI handler again, because enabled=0. After
+ * handling the NMI, disable_all will be called, which will not change the
+ * state either. If PMI hits after disable_all, the PMU is already disabled
+ * before entering NMI handler. The NMI handler will not change the state
+ * either.
+ *
+ * So either situation is harmless.
+ */
+static void x86_pmu_disable(struct pmu *pmu)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (!x86_pmu_initialized())
+ return;
+
+ if (!cpuc->enabled)
+ return;
+
+ cpuc->n_added = 0;
+ cpuc->enabled = 0;
+ barrier();
+
+ x86_pmu.disable_all();
+}
+
+void x86_pmu_enable_all(int added)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int idx;
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
+
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+ }
+}
+
+static struct pmu pmu;
+
+static inline int is_x86_event(struct perf_event *event)
+{
+ return event->pmu == &pmu;
+}
+
+/*
+ * Event scheduler state:
+ *
+ * Assign events iterating over all events and counters, beginning
+ * with events with least weights first. Keep the current iterator
+ * state in struct sched_state.
+ */
+struct sched_state {
+ int weight;
+ int event; /* event index */
+ int counter; /* counter index */
+ int unassigned; /* number of events to be assigned left */
+ int nr_gp; /* number of GP counters used */
+ unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+};
+
+/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
+#define SCHED_STATES_MAX 2
+
+struct perf_sched {
+ int max_weight;
+ int max_events;
+ int max_gp;
+ int saved_states;
+ struct event_constraint **constraints;
+ struct sched_state state;
+ struct sched_state saved[SCHED_STATES_MAX];
+};
+
+/*
+ * Initialize interator that runs through all events and counters.
+ */
+static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
+ int num, int wmin, int wmax, int gpmax)
+{
+ int idx;
+
+ memset(sched, 0, sizeof(*sched));
+ sched->max_events = num;
+ sched->max_weight = wmax;
+ sched->max_gp = gpmax;
+ sched->constraints = constraints;
+
+ for (idx = 0; idx < num; idx++) {
+ if (constraints[idx]->weight == wmin)
+ break;
+ }
+
+ sched->state.event = idx; /* start with min weight */
+ sched->state.weight = wmin;
+ sched->state.unassigned = num;
+}
+
+static void perf_sched_save_state(struct perf_sched *sched)
+{
+ if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
+ return;
+
+ sched->saved[sched->saved_states] = sched->state;
+ sched->saved_states++;
+}
+
+static bool perf_sched_restore_state(struct perf_sched *sched)
+{
+ if (!sched->saved_states)
+ return false;
+
+ sched->saved_states--;
+ sched->state = sched->saved[sched->saved_states];
+
+ /* continue with next counter: */
+ clear_bit(sched->state.counter++, sched->state.used);
+
+ return true;
+}
+
+/*
+ * Select a counter for the current event to schedule. Return true on
+ * success.
+ */
+static bool __perf_sched_find_counter(struct perf_sched *sched)
+{
+ struct event_constraint *c;
+ int idx;
+
+ if (!sched->state.unassigned)
+ return false;
+
+ if (sched->state.event >= sched->max_events)
+ return false;
+
+ c = sched->constraints[sched->state.event];
+ /* Prefer fixed purpose counters */
+ if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
+ idx = INTEL_PMC_IDX_FIXED;
+ for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
+ if (!__test_and_set_bit(idx, sched->state.used))
+ goto done;
+ }
+ }
+
+ /* Grab the first unused counter starting with idx */
+ idx = sched->state.counter;
+ for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
+ if (!__test_and_set_bit(idx, sched->state.used)) {
+ if (sched->state.nr_gp++ >= sched->max_gp)
+ return false;
+
+ goto done;
+ }
+ }
+
+ return false;
+
+done:
+ sched->state.counter = idx;
+
+ if (c->overlap)
+ perf_sched_save_state(sched);
+
+ return true;
+}
+
+static bool perf_sched_find_counter(struct perf_sched *sched)
+{
+ while (!__perf_sched_find_counter(sched)) {
+ if (!perf_sched_restore_state(sched))
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Go through all unassigned events and find the next one to schedule.
+ * Take events with the least weight first. Return true on success.
+ */
+static bool perf_sched_next_event(struct perf_sched *sched)
+{
+ struct event_constraint *c;
+
+ if (!sched->state.unassigned || !--sched->state.unassigned)
+ return false;
+
+ do {
+ /* next event */
+ sched->state.event++;
+ if (sched->state.event >= sched->max_events) {
+ /* next weight */
+ sched->state.event = 0;
+ sched->state.weight++;
+ if (sched->state.weight > sched->max_weight)
+ return false;
+ }
+ c = sched->constraints[sched->state.event];
+ } while (c->weight != sched->state.weight);
+
+ sched->state.counter = 0; /* start with first counter */
+
+ return true;
+}
+
+/*
+ * Assign a counter for each event.
+ */
+int perf_assign_events(struct event_constraint **constraints, int n,
+ int wmin, int wmax, int gpmax, int *assign)
+{
+ struct perf_sched sched;
+
+ perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
+
+ do {
+ if (!perf_sched_find_counter(&sched))
+ break; /* failed */
+ if (assign)
+ assign[sched.state.event] = sched.state.counter;
+ } while (perf_sched_next_event(&sched));
+
+ return sched.state.unassigned;
+}
+EXPORT_SYMBOL_GPL(perf_assign_events);
+
+int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+{
+ struct event_constraint *c;
+ unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ struct perf_event *e;
+ int i, wmin, wmax, unsched = 0;
+ struct hw_perf_event *hwc;
+
+ bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+
+ if (x86_pmu.start_scheduling)
+ x86_pmu.start_scheduling(cpuc);
+
+ for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
+ cpuc->event_constraint[i] = NULL;
+ c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
+ cpuc->event_constraint[i] = c;
+
+ wmin = min(wmin, c->weight);
+ wmax = max(wmax, c->weight);
+ }
+
+ /*
+ * fastpath, try to reuse previous register
+ */
+ for (i = 0; i < n; i++) {
+ hwc = &cpuc->event_list[i]->hw;
+ c = cpuc->event_constraint[i];
+
+ /* never assigned */
+ if (hwc->idx == -1)
+ break;
+
+ /* constraint still honored */
+ if (!test_bit(hwc->idx, c->idxmsk))
+ break;
+
+ /* not already used */
+ if (test_bit(hwc->idx, used_mask))
+ break;
+
+ __set_bit(hwc->idx, used_mask);
+ if (assign)
+ assign[i] = hwc->idx;
+ }
+
+ /* slow path */
+ if (i != n) {
+ int gpmax = x86_pmu.num_counters;
+
+ /*
+ * Do not allow scheduling of more than half the available
+ * generic counters.
+ *
+ * This helps avoid counter starvation of sibling thread by
+ * ensuring at most half the counters cannot be in exclusive
+ * mode. There is no designated counters for the limits. Any
+ * N/2 counters can be used. This helps with events with
+ * specific counter constraints.
+ */
+ if (is_ht_workaround_enabled() && !cpuc->is_fake &&
+ READ_ONCE(cpuc->excl_cntrs->exclusive_present))
+ gpmax /= 2;
+
+ unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
+ wmax, gpmax, assign);
+ }
+
+ /*
+ * In case of success (unsched = 0), mark events as committed,
+ * so we do not put_constraint() in case new events are added
+ * and fail to be scheduled
+ *
+ * We invoke the lower level commit callback to lock the resource
+ *
+ * We do not need to do all of this in case we are called to
+ * validate an event group (assign == NULL)
+ */
+ if (!unsched && assign) {
+ for (i = 0; i < n; i++) {
+ e = cpuc->event_list[i];
+ e->hw.flags |= PERF_X86_EVENT_COMMITTED;
+ if (x86_pmu.commit_scheduling)
+ x86_pmu.commit_scheduling(cpuc, i, assign[i]);
+ }
+ } else {
+ for (i = 0; i < n; i++) {
+ e = cpuc->event_list[i];
+ /*
+ * do not put_constraint() on comitted events,
+ * because they are good to go
+ */
+ if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
+ continue;
+
+ /*
+ * release events that failed scheduling
+ */
+ if (x86_pmu.put_event_constraints)
+ x86_pmu.put_event_constraints(cpuc, e);
+ }
+ }
+
+ if (x86_pmu.stop_scheduling)
+ x86_pmu.stop_scheduling(cpuc);
+
+ return unsched ? -EINVAL : 0;
+}
+
+/*
+ * dogrp: true if must collect siblings events (group)
+ * returns total number of events and error code
+ */
+static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
+{
+ struct perf_event *event;
+ int n, max_count;
+
+ max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
+
+ /* current number of events already accepted */
+ n = cpuc->n_events;
+
+ if (is_x86_event(leader)) {
+ if (n >= max_count)
+ return -EINVAL;
+ cpuc->event_list[n] = leader;
+ n++;
+ }
+ if (!dogrp)
+ return n;
+
+ for_each_sibling_event(event, leader) {
+ if (!is_x86_event(event) ||
+ event->state <= PERF_EVENT_STATE_OFF)
+ continue;
+
+ if (n >= max_count)
+ return -EINVAL;
+
+ cpuc->event_list[n] = event;
+ n++;
+ }
+ return n;
+}
+
+static inline void x86_assign_hw_event(struct perf_event *event,
+ struct cpu_hw_events *cpuc, int i)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ hwc->idx = cpuc->assign[i];
+ hwc->last_cpu = smp_processor_id();
+ hwc->last_tag = ++cpuc->tags[i];
+
+ if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
+ hwc->config_base = 0;
+ hwc->event_base = 0;
+ } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
+ hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+ hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
+ hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
+ } else {
+ hwc->config_base = x86_pmu_config_addr(hwc->idx);
+ hwc->event_base = x86_pmu_event_addr(hwc->idx);
+ hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
+ }
+}
+
+static inline int match_prev_assignment(struct hw_perf_event *hwc,
+ struct cpu_hw_events *cpuc,
+ int i)
+{
+ return hwc->idx == cpuc->assign[i] &&
+ hwc->last_cpu == smp_processor_id() &&
+ hwc->last_tag == cpuc->tags[i];
+}
+
+static void x86_pmu_start(struct perf_event *event, int flags);
+
+static void x86_pmu_enable(struct pmu *pmu)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_event *event;
+ struct hw_perf_event *hwc;
+ int i, added = cpuc->n_added;
+
+ if (!x86_pmu_initialized())
+ return;
+
+ if (cpuc->enabled)
+ return;
+
+ if (cpuc->n_added) {
+ int n_running = cpuc->n_events - cpuc->n_added;
+ /*
+ * apply assignment obtained either from
+ * hw_perf_group_sched_in() or x86_pmu_enable()
+ *
+ * step1: save events moving to new counters
+ */
+ for (i = 0; i < n_running; i++) {
+ event = cpuc->event_list[i];
+ hwc = &event->hw;
+
+ /*
+ * we can avoid reprogramming counter if:
+ * - assigned same counter as last time
+ * - running on same CPU as last time
+ * - no other event has used the counter since
+ */
+ if (hwc->idx == -1 ||
+ match_prev_assignment(hwc, cpuc, i))
+ continue;
+
+ /*
+ * Ensure we don't accidentally enable a stopped
+ * counter simply because we rescheduled.
+ */
+ if (hwc->state & PERF_HES_STOPPED)
+ hwc->state |= PERF_HES_ARCH;
+
+ x86_pmu_stop(event, PERF_EF_UPDATE);
+ }
+
+ /*
+ * step2: reprogram moved events into new counters
+ */
+ for (i = 0; i < cpuc->n_events; i++) {
+ event = cpuc->event_list[i];
+ hwc = &event->hw;
+
+ if (!match_prev_assignment(hwc, cpuc, i))
+ x86_assign_hw_event(event, cpuc, i);
+ else if (i < n_running)
+ continue;
+
+ if (hwc->state & PERF_HES_ARCH)
+ continue;
+
+ x86_pmu_start(event, PERF_EF_RELOAD);
+ }
+ cpuc->n_added = 0;
+ perf_events_lapic_init();
+ }
+
+ cpuc->enabled = 1;
+ barrier();
+
+ x86_pmu.enable_all(added);
+}
+
+static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
+
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the event disabled in hw:
+ */
+int x86_perf_event_set_period(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ s64 left = local64_read(&hwc->period_left);
+ s64 period = hwc->sample_period;
+ int ret = 0, idx = hwc->idx;
+
+ if (idx == INTEL_PMC_IDX_FIXED_BTS)
+ return 0;
+
+ /*
+ * If we are way outside a reasonable range then just skip forward:
+ */
+ if (unlikely(left <= -period)) {
+ left = period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ ret = 1;
+ }
+
+ if (unlikely(left <= 0)) {
+ left += period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ ret = 1;
+ }
+ /*
+ * Quirk: certain CPUs dont like it if just 1 hw_event is left:
+ */
+ if (unlikely(left < 2))
+ left = 2;
+
+ if (left > x86_pmu.max_period)
+ left = x86_pmu.max_period;
+
+ if (x86_pmu.limit_period)
+ left = x86_pmu.limit_period(event, left);
+
+ per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
+
+ /*
+ * The hw event starts counting from this event offset,
+ * mark it to be able to extra future deltas:
+ */
+ local64_set(&hwc->prev_count, (u64)-left);
+
+ wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
+
+ /*
+ * Due to erratum on certan cpu we need
+ * a second write to be sure the register
+ * is updated properly
+ */
+ if (x86_pmu.perfctr_second_write) {
+ wrmsrl(hwc->event_base,
+ (u64)(-left) & x86_pmu.cntval_mask);
+ }
+
+ perf_event_update_userpage(event);
+
+ return ret;
+}
+
+void x86_pmu_enable_event(struct perf_event *event)
+{
+ if (__this_cpu_read(cpu_hw_events.enabled))
+ __x86_pmu_enable_event(&event->hw,
+ ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+/*
+ * Add a single event to the PMU.
+ *
+ * The event is added to the group of enabled events
+ * but only if it can be scehduled with existing events.
+ */
+static int x86_pmu_add(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc;
+ int assign[X86_PMC_IDX_MAX];
+ int n, n0, ret;
+
+ hwc = &event->hw;
+
+ n0 = cpuc->n_events;
+ ret = n = collect_events(cpuc, event, false);
+ if (ret < 0)
+ goto out;
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+ if (!(flags & PERF_EF_START))
+ hwc->state |= PERF_HES_ARCH;
+
+ /*
+ * If group events scheduling transaction was started,
+ * skip the schedulability test here, it will be performed
+ * at commit time (->commit_txn) as a whole.
+ *
+ * If commit fails, we'll call ->del() on all events
+ * for which ->add() was called.
+ */
+ if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
+ goto done_collect;
+
+ ret = x86_pmu.schedule_events(cpuc, n, assign);
+ if (ret)
+ goto out;
+ /*
+ * copy new assignment, now we know it is possible
+ * will be used by hw_perf_enable()
+ */
+ memcpy(cpuc->assign, assign, n*sizeof(int));
+
+done_collect:
+ /*
+ * Commit the collect_events() state. See x86_pmu_del() and
+ * x86_pmu_*_txn().
+ */
+ cpuc->n_events = n;
+ cpuc->n_added += n - n0;
+ cpuc->n_txn += n - n0;
+
+ if (x86_pmu.add) {
+ /*
+ * This is before x86_pmu_enable() will call x86_pmu_start(),
+ * so we enable LBRs before an event needs them etc..
+ */
+ x86_pmu.add(event);
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static void x86_pmu_start(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int idx = event->hw.idx;
+
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ if (WARN_ON_ONCE(idx == -1))
+ return;
+
+ if (flags & PERF_EF_RELOAD) {
+ WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+ x86_perf_event_set_period(event);
+ }
+
+ event->hw.state = 0;
+
+ cpuc->events[idx] = event;
+ __set_bit(idx, cpuc->active_mask);
+ __set_bit(idx, cpuc->running);
+ x86_pmu.enable(event);
+ perf_event_update_userpage(event);
+}
+
+void perf_event_print_debug(void)
+{
+ u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+ u64 pebs, debugctl;
+ struct cpu_hw_events *cpuc;
+ unsigned long flags;
+ int cpu, idx;
+
+ if (!x86_pmu.num_counters)
+ return;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+ cpuc = &per_cpu(cpu_hw_events, cpu);
+
+ if (x86_pmu.version >= 2) {
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+ rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
+ rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+
+ pr_info("\n");
+ pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
+ pr_info("CPU#%d: status: %016llx\n", cpu, status);
+ pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
+ pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
+ if (x86_pmu.pebs_constraints) {
+ rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
+ pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
+ }
+ if (x86_pmu.lbr_nr) {
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl);
+ }
+ }
+ pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
+ rdmsrl(x86_pmu_event_addr(idx), pmc_count);
+
+ prev_left = per_cpu(pmc_prev_left[idx], cpu);
+
+ pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
+ cpu, idx, pmc_ctrl);
+ pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
+ cpu, idx, pmc_count);
+ pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
+ cpu, idx, prev_left);
+ }
+ for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+ rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
+
+ pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
+ cpu, idx, pmc_count);
+ }
+ local_irq_restore(flags);
+}
+
+void x86_pmu_stop(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (test_bit(hwc->idx, cpuc->active_mask)) {
+ x86_pmu.disable(event);
+ __clear_bit(hwc->idx, cpuc->active_mask);
+ cpuc->events[hwc->idx] = NULL;
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+ }
+
+ if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of a event
+ * that we are disabling:
+ */
+ x86_perf_event_update(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+}
+
+static void x86_pmu_del(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int i;
+
+ /*
+ * event is descheduled
+ */
+ event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;
+
+ /*
+ * If we're called during a txn, we only need to undo x86_pmu.add.
+ * The events never got scheduled and ->cancel_txn will truncate
+ * the event_list.
+ *
+ * XXX assumes any ->del() called during a TXN will only be on
+ * an event added during that same TXN.
+ */
+ if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
+ goto do_del;
+
+ /*
+ * Not a TXN, therefore cleanup properly.
+ */
+ x86_pmu_stop(event, PERF_EF_UPDATE);
+
+ for (i = 0; i < cpuc->n_events; i++) {
+ if (event == cpuc->event_list[i])
+ break;
+ }
+
+ if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
+ return;
+
+ /* If we have a newly added event; make sure to decrease n_added. */
+ if (i >= cpuc->n_events - cpuc->n_added)
+ --cpuc->n_added;
+
+ if (x86_pmu.put_event_constraints)
+ x86_pmu.put_event_constraints(cpuc, event);
+
+ /* Delete the array entry. */
+ while (++i < cpuc->n_events) {
+ cpuc->event_list[i-1] = cpuc->event_list[i];
+ cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
+ }
+ --cpuc->n_events;
+
+ perf_event_update_userpage(event);
+
+do_del:
+ if (x86_pmu.del) {
+ /*
+ * This is after x86_pmu_stop(); so we disable LBRs after any
+ * event can need them etc..
+ */
+ x86_pmu.del(event);
+ }
+}
+
+int x86_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_events *cpuc;
+ struct perf_event *event;
+ int idx, handled = 0;
+ u64 val;
+
+ cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ /*
+ * Some chipsets need to unmask the LVTPC in a particular spot
+ * inside the nmi handler. As a result, the unmasking was pushed
+ * into all the nmi handlers.
+ *
+ * This generic handler doesn't seem to have any issues where the
+ * unmasking occurs so it was left at the top.
+ */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ event = cpuc->events[idx];
+
+ val = x86_perf_event_update(event);
+ if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
+ continue;
+
+ /*
+ * event overflow
+ */
+ handled++;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+
+ if (!x86_perf_event_set_period(event))
+ continue;
+
+ if (perf_event_overflow(event, &data, regs))
+ x86_pmu_stop(event, 0);
+ }
+
+ if (handled)
+ inc_irq_stat(apic_perf_irqs);
+
+ return handled;
+}
+
+void perf_events_lapic_init(void)
+{
+ if (!x86_pmu.apic || !x86_pmu_initialized())
+ return;
+
+ /*
+ * Always use NMI for PMU
+ */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+}
+
+static int
+perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+{
+ u64 start_clock;
+ u64 finish_clock;
+ int ret;
+
+ /*
+ * All PMUs/events that share this PMI handler should make sure to
+ * increment active_events for their events.
+ */
+ if (!atomic_read(&active_events))
+ return NMI_DONE;
+
+ start_clock = sched_clock();
+ ret = x86_pmu.handle_irq(regs);
+ finish_clock = sched_clock();
+
+ perf_sample_event_took(finish_clock - start_clock);
+
+ return ret;
+}
+NOKPROBE_SYMBOL(perf_event_nmi_handler);
+
+struct event_constraint emptyconstraint;
+struct event_constraint unconstrained;
+
+static int x86_pmu_prepare_cpu(unsigned int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ int i;
+
+ for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
+ cpuc->kfree_on_online[i] = NULL;
+ if (x86_pmu.cpu_prepare)
+ return x86_pmu.cpu_prepare(cpu);
+ return 0;
+}
+
+static int x86_pmu_dead_cpu(unsigned int cpu)
+{
+ if (x86_pmu.cpu_dead)
+ x86_pmu.cpu_dead(cpu);
+ return 0;
+}
+
+static int x86_pmu_online_cpu(unsigned int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ int i;
+
+ for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
+ kfree(cpuc->kfree_on_online[i]);
+ cpuc->kfree_on_online[i] = NULL;
+ }
+ return 0;
+}
+
+static int x86_pmu_starting_cpu(unsigned int cpu)
+{
+ if (x86_pmu.cpu_starting)
+ x86_pmu.cpu_starting(cpu);
+ return 0;
+}
+
+static int x86_pmu_dying_cpu(unsigned int cpu)
+{
+ if (x86_pmu.cpu_dying)
+ x86_pmu.cpu_dying(cpu);
+ return 0;
+}
+
+static void __init pmu_check_apic(void)
+{
+ if (boot_cpu_has(X86_FEATURE_APIC))
+ return;
+
+ x86_pmu.apic = 0;
+ pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
+ pr_info("no hardware sampling interrupt available.\n");
+
+ /*
+ * If we have a PMU initialized but no APIC
+ * interrupts, we cannot sample hardware
+ * events (user-space has to fall back and
+ * sample via a hrtimer based software event):
+ */
+ pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
+
+}
+
+static struct attribute_group x86_pmu_format_group = {
+ .name = "format",
+ .attrs = NULL,
+};
+
+/*
+ * Remove all undefined events (x86_pmu.event_map(id) == 0)
+ * out of events_attr attributes.
+ */
+static void __init filter_events(struct attribute **attrs)
+{
+ struct device_attribute *d;
+ struct perf_pmu_events_attr *pmu_attr;
+ int offset = 0;
+ int i, j;
+
+ for (i = 0; attrs[i]; i++) {
+ d = (struct device_attribute *)attrs[i];
+ pmu_attr = container_of(d, struct perf_pmu_events_attr, attr);
+ /* str trumps id */
+ if (pmu_attr->event_str)
+ continue;
+ if (x86_pmu.event_map(i + offset))
+ continue;
+
+ for (j = i; attrs[j]; j++)
+ attrs[j] = attrs[j + 1];
+
+ /* Check the shifted attr. */
+ i--;
+
+ /*
+ * event_map() is index based, the attrs array is organized
+ * by increasing event index. If we shift the events, then
+ * we need to compensate for the event_map(), otherwise
+ * we are looking up the wrong event in the map
+ */
+ offset++;
+ }
+}
+
+/* Merge two pointer arrays */
+__init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
+{
+ struct attribute **new;
+ int j, i;
+
+ for (j = 0; a[j]; j++)
+ ;
+ for (i = 0; b[i]; i++)
+ j++;
+ j++;
+
+ new = kmalloc_array(j, sizeof(struct attribute *), GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ j = 0;
+ for (i = 0; a[i]; i++)
+ new[j++] = a[i];
+ for (i = 0; b[i]; i++)
+ new[j++] = b[i];
+ new[j] = NULL;
+
+ return new;
+}
+
+ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page)
+{
+ struct perf_pmu_events_attr *pmu_attr = \
+ container_of(attr, struct perf_pmu_events_attr, attr);
+ u64 config = x86_pmu.event_map(pmu_attr->id);
+
+ /* string trumps id */
+ if (pmu_attr->event_str)
+ return sprintf(page, "%s", pmu_attr->event_str);
+
+ return x86_pmu.events_sysfs_show(page, config);
+}
+EXPORT_SYMBOL_GPL(events_sysfs_show);
+
+ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
+ char *page)
+{
+ struct perf_pmu_events_ht_attr *pmu_attr =
+ container_of(attr, struct perf_pmu_events_ht_attr, attr);
+
+ /*
+ * Report conditional events depending on Hyper-Threading.
+ *
+ * This is overly conservative as usually the HT special
+ * handling is not needed if the other CPU thread is idle.
+ *
+ * Note this does not (and cannot) handle the case when thread
+ * siblings are invisible, for example with virtualization
+ * if they are owned by some other guest. The user tool
+ * has to re-read when a thread sibling gets onlined later.
+ */
+ return sprintf(page, "%s",
+ topology_max_smt_threads() > 1 ?
+ pmu_attr->event_str_ht :
+ pmu_attr->event_str_noht);
+}
+
+EVENT_ATTR(cpu-cycles, CPU_CYCLES );
+EVENT_ATTR(instructions, INSTRUCTIONS );
+EVENT_ATTR(cache-references, CACHE_REFERENCES );
+EVENT_ATTR(cache-misses, CACHE_MISSES );
+EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS );
+EVENT_ATTR(branch-misses, BRANCH_MISSES );
+EVENT_ATTR(bus-cycles, BUS_CYCLES );
+EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND );
+EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND );
+EVENT_ATTR(ref-cycles, REF_CPU_CYCLES );
+
+static struct attribute *empty_attrs;
+
+static struct attribute *events_attr[] = {
+ EVENT_PTR(CPU_CYCLES),
+ EVENT_PTR(INSTRUCTIONS),
+ EVENT_PTR(CACHE_REFERENCES),
+ EVENT_PTR(CACHE_MISSES),
+ EVENT_PTR(BRANCH_INSTRUCTIONS),
+ EVENT_PTR(BRANCH_MISSES),
+ EVENT_PTR(BUS_CYCLES),
+ EVENT_PTR(STALLED_CYCLES_FRONTEND),
+ EVENT_PTR(STALLED_CYCLES_BACKEND),
+ EVENT_PTR(REF_CPU_CYCLES),
+ NULL,
+};
+
+static struct attribute_group x86_pmu_events_group = {
+ .name = "events",
+ .attrs = events_attr,
+};
+
+ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
+{
+ u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
+ u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
+ bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE);
+ bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
+ bool any = (config & ARCH_PERFMON_EVENTSEL_ANY);
+ bool inv = (config & ARCH_PERFMON_EVENTSEL_INV);
+ ssize_t ret;
+
+ /*
+ * We have whole page size to spend and just little data
+ * to write, so we can safely use sprintf.
+ */
+ ret = sprintf(page, "event=0x%02llx", event);
+
+ if (umask)
+ ret += sprintf(page + ret, ",umask=0x%02llx", umask);
+
+ if (edge)
+ ret += sprintf(page + ret, ",edge");
+
+ if (pc)
+ ret += sprintf(page + ret, ",pc");
+
+ if (any)
+ ret += sprintf(page + ret, ",any");
+
+ if (inv)
+ ret += sprintf(page + ret, ",inv");
+
+ if (cmask)
+ ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
+
+ ret += sprintf(page + ret, "\n");
+
+ return ret;
+}
+
+static struct attribute_group x86_pmu_attr_group;
+static struct attribute_group x86_pmu_caps_group;
+
+static int __init init_hw_perf_events(void)
+{
+ struct x86_pmu_quirk *quirk;
+ int err;
+
+ pr_info("Performance Events: ");
+
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ err = intel_pmu_init();
+ break;
+ case X86_VENDOR_AMD:
+ err = amd_pmu_init();
+ break;
+ default:
+ err = -ENOTSUPP;
+ }
+ if (err != 0) {
+ pr_cont("no PMU driver, software events only.\n");
+ return 0;
+ }
+
+ pmu_check_apic();
+
+ /* sanity check that the hardware exists or is emulated */
+ if (!check_hw_exists())
+ return 0;
+
+ pr_cont("%s PMU driver.\n", x86_pmu.name);
+
+ x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
+
+ for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
+ quirk->func();
+
+ if (!x86_pmu.intel_ctrl)
+ x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
+
+ perf_events_lapic_init();
+ register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
+
+ unconstrained = (struct event_constraint)
+ __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
+ 0, x86_pmu.num_counters, 0, 0);
+
+ x86_pmu_format_group.attrs = x86_pmu.format_attrs;
+
+ if (x86_pmu.caps_attrs) {
+ struct attribute **tmp;
+
+ tmp = merge_attr(x86_pmu_caps_group.attrs, x86_pmu.caps_attrs);
+ if (!WARN_ON(!tmp))
+ x86_pmu_caps_group.attrs = tmp;
+ }
+
+ if (x86_pmu.event_attrs)
+ x86_pmu_events_group.attrs = x86_pmu.event_attrs;
+
+ if (!x86_pmu.events_sysfs_show)
+ x86_pmu_events_group.attrs = &empty_attrs;
+ else
+ filter_events(x86_pmu_events_group.attrs);
+
+ if (x86_pmu.cpu_events) {
+ struct attribute **tmp;
+
+ tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events);
+ if (!WARN_ON(!tmp))
+ x86_pmu_events_group.attrs = tmp;
+ }
+
+ if (x86_pmu.attrs) {
+ struct attribute **tmp;
+
+ tmp = merge_attr(x86_pmu_attr_group.attrs, x86_pmu.attrs);
+ if (!WARN_ON(!tmp))
+ x86_pmu_attr_group.attrs = tmp;
+ }
+
+ pr_info("... version: %d\n", x86_pmu.version);
+ pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
+ pr_info("... generic registers: %d\n", x86_pmu.num_counters);
+ pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
+ pr_info("... max period: %016Lx\n", x86_pmu.max_period);
+ pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
+ pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
+
+ /*
+ * Install callbacks. Core will call them for each online
+ * cpu.
+ */
+ err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare",
+ x86_pmu_prepare_cpu, x86_pmu_dead_cpu);
+ if (err)
+ return err;
+
+ err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING,
+ "perf/x86:starting", x86_pmu_starting_cpu,
+ x86_pmu_dying_cpu);
+ if (err)
+ goto out;
+
+ err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online",
+ x86_pmu_online_cpu, NULL);
+ if (err)
+ goto out1;
+
+ err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
+ if (err)
+ goto out2;
+
+ return 0;
+
+out2:
+ cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE);
+out1:
+ cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING);
+out:
+ cpuhp_remove_state(CPUHP_PERF_X86_PREPARE);
+ return err;
+}
+early_initcall(init_hw_perf_events);
+
+static inline void x86_pmu_read(struct perf_event *event)
+{
+ if (x86_pmu.read)
+ return x86_pmu.read(event);
+ x86_perf_event_update(event);
+}
+
+/*
+ * Start group events scheduling transaction
+ * Set the flag to make pmu::enable() not perform the
+ * schedulability test, it will be performed at commit time
+ *
+ * We only support PERF_PMU_TXN_ADD transactions. Save the
+ * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
+ * transactions.
+ */
+static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */
+
+ cpuc->txn_flags = txn_flags;
+ if (txn_flags & ~PERF_PMU_TXN_ADD)
+ return;
+
+ perf_pmu_disable(pmu);
+ __this_cpu_write(cpu_hw_events.n_txn, 0);
+}
+
+/*
+ * Stop group events scheduling transaction
+ * Clear the flag and pmu::enable() will perform the
+ * schedulability test.
+ */
+static void x86_pmu_cancel_txn(struct pmu *pmu)
+{
+ unsigned int txn_flags;
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
+
+ txn_flags = cpuc->txn_flags;
+ cpuc->txn_flags = 0;
+ if (txn_flags & ~PERF_PMU_TXN_ADD)
+ return;
+
+ /*
+ * Truncate collected array by the number of events added in this
+ * transaction. See x86_pmu_add() and x86_pmu_*_txn().
+ */
+ __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
+ __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
+ perf_pmu_enable(pmu);
+}
+
+/*
+ * Commit group events scheduling transaction
+ * Perform the group schedulability test as a whole
+ * Return 0 if success
+ *
+ * Does not cancel the transaction on failure; expects the caller to do this.
+ */
+static int x86_pmu_commit_txn(struct pmu *pmu)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int assign[X86_PMC_IDX_MAX];
+ int n, ret;
+
+ WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
+
+ if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
+ cpuc->txn_flags = 0;
+ return 0;
+ }
+
+ n = cpuc->n_events;
+
+ if (!x86_pmu_initialized())
+ return -EAGAIN;
+
+ ret = x86_pmu.schedule_events(cpuc, n, assign);
+ if (ret)
+ return ret;
+
+ /*
+ * copy new assignment, now we know it is possible
+ * will be used by hw_perf_enable()
+ */
+ memcpy(cpuc->assign, assign, n*sizeof(int));
+
+ cpuc->txn_flags = 0;
+ perf_pmu_enable(pmu);
+ return 0;
+}
+/*
+ * a fake_cpuc is used to validate event groups. Due to
+ * the extra reg logic, we need to also allocate a fake
+ * per_core and per_cpu structure. Otherwise, group events
+ * using extra reg may conflict without the kernel being
+ * able to catch this when the last event gets added to
+ * the group.
+ */
+static void free_fake_cpuc(struct cpu_hw_events *cpuc)
+{
+ intel_cpuc_finish(cpuc);
+ kfree(cpuc);
+}
+
+static struct cpu_hw_events *allocate_fake_cpuc(void)
+{
+ struct cpu_hw_events *cpuc;
+ int cpu = raw_smp_processor_id();
+
+ cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
+ if (!cpuc)
+ return ERR_PTR(-ENOMEM);
+ cpuc->is_fake = 1;
+
+ if (intel_cpuc_prepare(cpuc, cpu))
+ goto error;
+
+ return cpuc;
+error:
+ free_fake_cpuc(cpuc);
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * validate that we can schedule this event
+ */
+static int validate_event(struct perf_event *event)
+{
+ struct cpu_hw_events *fake_cpuc;
+ struct event_constraint *c;
+ int ret = 0;
+
+ fake_cpuc = allocate_fake_cpuc();
+ if (IS_ERR(fake_cpuc))
+ return PTR_ERR(fake_cpuc);
+
+ c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);
+
+ if (!c || !c->weight)
+ ret = -EINVAL;
+
+ if (x86_pmu.put_event_constraints)
+ x86_pmu.put_event_constraints(fake_cpuc, event);
+
+ free_fake_cpuc(fake_cpuc);
+
+ return ret;
+}
+
+/*
+ * validate a single event group
+ *
+ * validation include:
+ * - check events are compatible which each other
+ * - events do not compete for the same counter
+ * - number of events <= number of counters
+ *
+ * validation ensures the group can be loaded onto the
+ * PMU if it was the only group available.
+ */
+static int validate_group(struct perf_event *event)
+{
+ struct perf_event *leader = event->group_leader;
+ struct cpu_hw_events *fake_cpuc;
+ int ret = -EINVAL, n;
+
+ fake_cpuc = allocate_fake_cpuc();
+ if (IS_ERR(fake_cpuc))
+ return PTR_ERR(fake_cpuc);
+ /*
+ * the event is not yet connected with its
+ * siblings therefore we must first collect
+ * existing siblings, then add the new event
+ * before we can simulate the scheduling
+ */
+ n = collect_events(fake_cpuc, leader, true);
+ if (n < 0)
+ goto out;
+
+ fake_cpuc->n_events = n;
+ n = collect_events(fake_cpuc, event, false);
+ if (n < 0)
+ goto out;
+
+ fake_cpuc->n_events = n;
+
+ ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
+
+out:
+ free_fake_cpuc(fake_cpuc);
+ return ret;
+}
+
+static int x86_pmu_event_init(struct perf_event *event)
+{
+ struct pmu *tmp;
+ int err;
+
+ switch (event->attr.type) {
+ case PERF_TYPE_RAW:
+ case PERF_TYPE_HARDWARE:
+ case PERF_TYPE_HW_CACHE:
+ break;
+
+ default:
+ return -ENOENT;
+ }
+
+ err = __x86_pmu_event_init(event);
+ if (!err) {
+ /*
+ * we temporarily connect event to its pmu
+ * such that validate_group() can classify
+ * it as an x86 event using is_x86_event()
+ */
+ tmp = event->pmu;
+ event->pmu = &pmu;
+
+ if (event->group_leader != event)
+ err = validate_group(event);
+ else
+ err = validate_event(event);
+
+ event->pmu = tmp;
+ }
+ if (err) {
+ if (event->destroy)
+ event->destroy(event);
+ event->destroy = NULL;
+ }
+
+ if (READ_ONCE(x86_pmu.attr_rdpmc) &&
+ !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
+ event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
+
+ return err;
+}
+
+static void refresh_pce(void *ignored)
+{
+ load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
+}
+
+static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
+{
+ if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ return;
+
+ /*
+ * This function relies on not being called concurrently in two
+ * tasks in the same mm. Otherwise one task could observe
+ * perf_rdpmc_allowed > 1 and return all the way back to
+ * userspace with CR4.PCE clear while another task is still
+ * doing on_each_cpu_mask() to propagate CR4.PCE.
+ *
+ * For now, this can't happen because all callers hold mmap_sem
+ * for write. If this changes, we'll need a different solution.
+ */
+ lockdep_assert_held_exclusive(&mm->mmap_sem);
+
+ if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1)
+ on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1);
+}
+
+static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
+{
+
+ if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ return;
+
+ if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
+ on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1);
+}
+
+static int x86_pmu_event_idx(struct perf_event *event)
+{
+ int idx = event->hw.idx;
+
+ if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ return 0;
+
+ if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
+ idx -= INTEL_PMC_IDX_FIXED;
+ idx |= 1 << 30;
+ }
+
+ return idx + 1;
+}
+
+static ssize_t get_attr_rdpmc(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
+}
+
+static ssize_t set_attr_rdpmc(struct device *cdev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long val;
+ ssize_t ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val > 2)
+ return -EINVAL;
+
+ if (x86_pmu.attr_rdpmc_broken)
+ return -ENOTSUPP;
+
+ if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) {
+ /*
+ * Changing into or out of always available, aka
+ * perf-event-bypassing mode. This path is extremely slow,
+ * but only root can trigger it, so it's okay.
+ */
+ if (val == 2)
+ static_branch_inc(&rdpmc_always_available_key);
+ else
+ static_branch_dec(&rdpmc_always_available_key);
+ on_each_cpu(refresh_pce, NULL, 1);
+ }
+
+ x86_pmu.attr_rdpmc = val;
+
+ return count;
+}
+
+static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
+
+static struct attribute *x86_pmu_attrs[] = {
+ &dev_attr_rdpmc.attr,
+ NULL,
+};
+
+static struct attribute_group x86_pmu_attr_group = {
+ .attrs = x86_pmu_attrs,
+};
+
+static ssize_t max_precise_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise());
+}
+
+static DEVICE_ATTR_RO(max_precise);
+
+static struct attribute *x86_pmu_caps_attrs[] = {
+ &dev_attr_max_precise.attr,
+ NULL
+};
+
+static struct attribute_group x86_pmu_caps_group = {
+ .name = "caps",
+ .attrs = x86_pmu_caps_attrs,
+};
+
+static const struct attribute_group *x86_pmu_attr_groups[] = {
+ &x86_pmu_attr_group,
+ &x86_pmu_format_group,
+ &x86_pmu_events_group,
+ &x86_pmu_caps_group,
+ NULL,
+};
+
+static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+ if (x86_pmu.sched_task)
+ x86_pmu.sched_task(ctx, sched_in);
+}
+
+void perf_check_microcode(void)
+{
+ if (x86_pmu.check_microcode)
+ x86_pmu.check_microcode();
+}
+
+static int x86_pmu_check_period(struct perf_event *event, u64 value)
+{
+ if (x86_pmu.check_period && x86_pmu.check_period(event, value))
+ return -EINVAL;
+
+ if (value && x86_pmu.limit_period) {
+ if (x86_pmu.limit_period(event, value) > value)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct pmu pmu = {
+ .pmu_enable = x86_pmu_enable,
+ .pmu_disable = x86_pmu_disable,
+
+ .attr_groups = x86_pmu_attr_groups,
+
+ .event_init = x86_pmu_event_init,
+
+ .event_mapped = x86_pmu_event_mapped,
+ .event_unmapped = x86_pmu_event_unmapped,
+
+ .add = x86_pmu_add,
+ .del = x86_pmu_del,
+ .start = x86_pmu_start,
+ .stop = x86_pmu_stop,
+ .read = x86_pmu_read,
+
+ .start_txn = x86_pmu_start_txn,
+ .cancel_txn = x86_pmu_cancel_txn,
+ .commit_txn = x86_pmu_commit_txn,
+
+ .event_idx = x86_pmu_event_idx,
+ .sched_task = x86_pmu_sched_task,
+ .task_ctx_size = sizeof(struct x86_perf_task_context),
+ .check_period = x86_pmu_check_period,
+};
+
+void arch_perf_update_userpage(struct perf_event *event,
+ struct perf_event_mmap_page *userpg, u64 now)
+{
+ struct cyc2ns_data data;
+ u64 offset;
+
+ userpg->cap_user_time = 0;
+ userpg->cap_user_time_zero = 0;
+ userpg->cap_user_rdpmc =
+ !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
+ userpg->pmc_width = x86_pmu.cntval_bits;
+
+ if (!using_native_sched_clock() || !sched_clock_stable())
+ return;
+
+ cyc2ns_read_begin(&data);
+
+ offset = data.cyc2ns_offset + __sched_clock_offset;
+
+ /*
+ * Internal timekeeping for enabled/running/stopped times
+ * is always in the local_clock domain.
+ */
+ userpg->cap_user_time = 1;
+ userpg->time_mult = data.cyc2ns_mul;
+ userpg->time_shift = data.cyc2ns_shift;
+ userpg->time_offset = offset - now;
+
+ /*
+ * cap_user_time_zero doesn't make sense when we're using a different
+ * time base for the records.
+ */
+ if (!event->attr.use_clockid) {
+ userpg->cap_user_time_zero = 1;
+ userpg->time_zero = offset;
+ }
+
+ cyc2ns_read_end();
+}
+
+void
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
+{
+ struct unwind_state state;
+ unsigned long addr;
+
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ /* TODO: We don't support guest os callchain now */
+ return;
+ }
+
+ if (perf_callchain_store(entry, regs->ip))
+ return;
+
+ for (unwind_start(&state, current, regs, NULL); !unwind_done(&state);
+ unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr || perf_callchain_store(entry, addr))
+ return;
+ }
+}
+
+static inline int
+valid_user_frame(const void __user *fp, unsigned long size)
+{
+ return (__range_not_ok(fp, size, TASK_SIZE) == 0);
+}
+
+static unsigned long get_segment_base(unsigned int segment)
+{
+ struct desc_struct *desc;
+ unsigned int idx = segment >> 3;
+
+ if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ struct ldt_struct *ldt;
+
+ /* IRQs are off, so this synchronizes with smp_store_release */
+ ldt = READ_ONCE(current->active_mm->context.ldt);
+ if (!ldt || idx >= ldt->nr_entries)
+ return 0;
+
+ desc = &ldt->entries[idx];
+#else
+ return 0;
+#endif
+ } else {
+ if (idx >= GDT_ENTRIES)
+ return 0;
+
+ desc = raw_cpu_ptr(gdt_page.gdt) + idx;
+ }
+
+ return get_desc_base(desc);
+}
+
+#ifdef CONFIG_IA32_EMULATION
+
+#include <linux/compat.h>
+
+static inline int
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
+{
+ /* 32-bit process in 64-bit kernel. */
+ unsigned long ss_base, cs_base;
+ struct stack_frame_ia32 frame;
+ const void __user *fp;
+
+ if (!test_thread_flag(TIF_IA32))
+ return 0;
+
+ cs_base = get_segment_base(regs->cs);
+ ss_base = get_segment_base(regs->ss);
+
+ fp = compat_ptr(ss_base + regs->bp);
+ pagefault_disable();
+ while (entry->nr < entry->max_stack) {
+ unsigned long bytes;
+ frame.next_frame = 0;
+ frame.return_address = 0;
+
+ if (!valid_user_frame(fp, sizeof(frame)))
+ break;
+
+ bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4);
+ if (bytes != 0)
+ break;
+ bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4);
+ if (bytes != 0)
+ break;
+
+ perf_callchain_store(entry, cs_base + frame.return_address);
+ fp = compat_ptr(ss_base + frame.next_frame);
+ }
+ pagefault_enable();
+ return 1;
+}
+#else
+static inline int
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
+{
+ return 0;
+}
+#endif
+
+void
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
+{
+ struct stack_frame frame;
+ const unsigned long __user *fp;
+
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ /* TODO: We don't support guest os callchain now */
+ return;
+ }
+
+ /*
+ * We don't know what to do with VM86 stacks.. ignore them for now.
+ */
+ if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
+ return;
+
+ fp = (unsigned long __user *)regs->bp;
+
+ perf_callchain_store(entry, regs->ip);
+
+ if (!nmi_uaccess_okay())
+ return;
+
+ if (perf_callchain_user32(regs, entry))
+ return;
+
+ pagefault_disable();
+ while (entry->nr < entry->max_stack) {
+ unsigned long bytes;
+
+ frame.next_frame = NULL;
+ frame.return_address = 0;
+
+ if (!valid_user_frame(fp, sizeof(frame)))
+ break;
+
+ bytes = __copy_from_user_nmi(&frame.next_frame, fp, sizeof(*fp));
+ if (bytes != 0)
+ break;
+ bytes = __copy_from_user_nmi(&frame.return_address, fp + 1, sizeof(*fp));
+ if (bytes != 0)
+ break;
+
+ perf_callchain_store(entry, frame.return_address);
+ fp = (void __user *)frame.next_frame;
+ }
+ pagefault_enable();
+}
+
+/*
+ * Deal with code segment offsets for the various execution modes:
+ *
+ * VM86 - the good olde 16 bit days, where the linear address is
+ * 20 bits and we use regs->ip + 0x10 * regs->cs.
+ *
+ * IA32 - Where we need to look at GDT/LDT segment descriptor tables
+ * to figure out what the 32bit base address is.
+ *
+ * X32 - has TIF_X32 set, but is running in x86_64
+ *
+ * X86_64 - CS,DS,SS,ES are all zero based.
+ */
+static unsigned long code_segment_base(struct pt_regs *regs)
+{
+ /*
+ * For IA32 we look at the GDT/LDT segment base to convert the
+ * effective IP to a linear address.
+ */
+
+#ifdef CONFIG_X86_32
+ /*
+ * If we are in VM86 mode, add the segment offset to convert to a
+ * linear address.
+ */
+ if (regs->flags & X86_VM_MASK)
+ return 0x10 * regs->cs;
+
+ if (user_mode(regs) && regs->cs != __USER_CS)
+ return get_segment_base(regs->cs);
+#else
+ if (user_mode(regs) && !user_64bit_mode(regs) &&
+ regs->cs != __USER32_CS)
+ return get_segment_base(regs->cs);
+#endif
+ return 0;
+}
+
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
+ return perf_guest_cbs->get_guest_ip();
+
+ return regs->ip + code_segment_base(regs);
+}
+
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+ int misc = 0;
+
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ if (perf_guest_cbs->is_user_mode())
+ misc |= PERF_RECORD_MISC_GUEST_USER;
+ else
+ misc |= PERF_RECORD_MISC_GUEST_KERNEL;
+ } else {
+ if (user_mode(regs))
+ misc |= PERF_RECORD_MISC_USER;
+ else
+ misc |= PERF_RECORD_MISC_KERNEL;
+ }
+
+ if (regs->flags & PERF_EFLAGS_EXACT)
+ misc |= PERF_RECORD_MISC_EXACT_IP;
+
+ return misc;
+}
+
+void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
+{
+ cap->version = x86_pmu.version;
+ cap->num_counters_gp = x86_pmu.num_counters;
+ cap->num_counters_fixed = x86_pmu.num_counters_fixed;
+ cap->bit_width_gp = x86_pmu.cntval_bits;
+ cap->bit_width_fixed = x86_pmu.cntval_bits;
+ cap->events_mask = (unsigned int)x86_pmu.events_maskl;
+ cap->events_mask_len = x86_pmu.events_mask_len;
+}
+EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile
new file mode 100644
index 000000000..3468b0c1d
--- /dev/null
+++ b/arch/x86/events/intel/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o
+obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o
+obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o
+obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl-perf.o
+intel-rapl-perf-objs := rapl.o
+obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o
+intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o
+obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o
+intel-cstate-objs := cstate.o
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
new file mode 100644
index 000000000..510f94614
--- /dev/null
+++ b/arch/x86/events/intel/bts.c
@@ -0,0 +1,625 @@
+/*
+ * BTS PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/coredump.h>
+
+#include <asm-generic/sizes.h>
+#include <asm/perf_event.h>
+
+#include "../perf_event.h"
+
+struct bts_ctx {
+ struct perf_output_handle handle;
+ struct debug_store ds_back;
+ int state;
+};
+
+/* BTS context states: */
+enum {
+ /* no ongoing AUX transactions */
+ BTS_STATE_STOPPED = 0,
+ /* AUX transaction is on, BTS tracing is disabled */
+ BTS_STATE_INACTIVE,
+ /* AUX transaction is on, BTS tracing is running */
+ BTS_STATE_ACTIVE,
+};
+
+static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);
+
+#define BTS_RECORD_SIZE 24
+#define BTS_SAFETY_MARGIN 4080
+
+struct bts_phys {
+ struct page *page;
+ unsigned long size;
+ unsigned long offset;
+ unsigned long displacement;
+};
+
+struct bts_buffer {
+ size_t real_size; /* multiple of BTS_RECORD_SIZE */
+ unsigned int nr_pages;
+ unsigned int nr_bufs;
+ unsigned int cur_buf;
+ bool snapshot;
+ local_t data_size;
+ local_t head;
+ unsigned long end;
+ void **data_pages;
+ struct bts_phys buf[0];
+};
+
+static struct pmu bts_pmu;
+
+static int buf_nr_pages(struct page *page)
+{
+ if (!PagePrivate(page))
+ return 1;
+
+ return 1 << page_private(page);
+}
+
+static size_t buf_size(struct page *page)
+{
+ return buf_nr_pages(page) * PAGE_SIZE;
+}
+
+static void *
+bts_buffer_setup_aux(struct perf_event *event, void **pages,
+ int nr_pages, bool overwrite)
+{
+ struct bts_buffer *buf;
+ struct page *page;
+ int cpu = event->cpu;
+ int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+ unsigned long offset;
+ size_t size = nr_pages << PAGE_SHIFT;
+ int pg, nbuf, pad;
+
+ /* count all the high order buffers */
+ for (pg = 0, nbuf = 0; pg < nr_pages;) {
+ page = virt_to_page(pages[pg]);
+ pg += buf_nr_pages(page);
+ nbuf++;
+ }
+
+ /*
+ * to avoid interrupts in overwrite mode, only allow one physical
+ */
+ if (overwrite && nbuf > 1)
+ return NULL;
+
+ buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node);
+ if (!buf)
+ return NULL;
+
+ buf->nr_pages = nr_pages;
+ buf->nr_bufs = nbuf;
+ buf->snapshot = overwrite;
+ buf->data_pages = pages;
+ buf->real_size = size - size % BTS_RECORD_SIZE;
+
+ for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
+ unsigned int __nr_pages;
+
+ page = virt_to_page(pages[pg]);
+ __nr_pages = buf_nr_pages(page);
+ buf->buf[nbuf].page = page;
+ buf->buf[nbuf].offset = offset;
+ buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
+ buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
+ pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
+ buf->buf[nbuf].size -= pad;
+
+ pg += __nr_pages;
+ offset += __nr_pages << PAGE_SHIFT;
+ }
+
+ return buf;
+}
+
+static void bts_buffer_free_aux(void *data)
+{
+ kfree(data);
+}
+
+static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
+{
+ return buf->buf[idx].offset + buf->buf[idx].displacement;
+}
+
+static void
+bts_config_buffer(struct bts_buffer *buf)
+{
+ int cpu = raw_smp_processor_id();
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ struct bts_phys *phys = &buf->buf[buf->cur_buf];
+ unsigned long index, thresh = 0, end = phys->size;
+ struct page *page = phys->page;
+
+ index = local_read(&buf->head);
+
+ if (!buf->snapshot) {
+ if (buf->end < phys->offset + buf_size(page))
+ end = buf->end - phys->offset - phys->displacement;
+
+ index -= phys->offset + phys->displacement;
+
+ if (end - index > BTS_SAFETY_MARGIN)
+ thresh = end - BTS_SAFETY_MARGIN;
+ else if (end - index > BTS_RECORD_SIZE)
+ thresh = end - BTS_RECORD_SIZE;
+ else
+ thresh = end;
+ }
+
+ ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
+ ds->bts_index = ds->bts_buffer_base + index;
+ ds->bts_absolute_maximum = ds->bts_buffer_base + end;
+ ds->bts_interrupt_threshold = !buf->snapshot
+ ? ds->bts_buffer_base + thresh
+ : ds->bts_absolute_maximum + BTS_RECORD_SIZE;
+}
+
+static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
+{
+ unsigned long index = head - phys->offset;
+
+ memset(page_address(phys->page) + index, 0, phys->size - index);
+}
+
+static void bts_update(struct bts_ctx *bts)
+{
+ int cpu = raw_smp_processor_id();
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ struct bts_buffer *buf = perf_get_aux(&bts->handle);
+ unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;
+
+ if (!buf)
+ return;
+
+ head = index + bts_buffer_offset(buf, buf->cur_buf);
+ old = local_xchg(&buf->head, head);
+
+ if (!buf->snapshot) {
+ if (old == head)
+ return;
+
+ if (ds->bts_index >= ds->bts_absolute_maximum)
+ perf_aux_output_flag(&bts->handle,
+ PERF_AUX_FLAG_TRUNCATED);
+
+ /*
+ * old and head are always in the same physical buffer, so we
+ * can subtract them to get the data size.
+ */
+ local_add(head - old, &buf->data_size);
+ } else {
+ local_set(&buf->data_size, head);
+ }
+}
+
+static int
+bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle);
+
+/*
+ * Ordering PMU callbacks wrt themselves and the PMI is done by means
+ * of bts::state, which:
+ * - is set when bts::handle::event is valid, that is, between
+ * perf_aux_output_begin() and perf_aux_output_end();
+ * - is zero otherwise;
+ * - is ordered against bts::handle::event with a compiler barrier.
+ */
+
+static void __bts_event_start(struct perf_event *event)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct bts_buffer *buf = perf_get_aux(&bts->handle);
+ u64 config = 0;
+
+ if (!buf->snapshot)
+ config |= ARCH_PERFMON_EVENTSEL_INT;
+ if (!event->attr.exclude_kernel)
+ config |= ARCH_PERFMON_EVENTSEL_OS;
+ if (!event->attr.exclude_user)
+ config |= ARCH_PERFMON_EVENTSEL_USR;
+
+ bts_config_buffer(buf);
+
+ /*
+ * local barrier to make sure that ds configuration made it
+ * before we enable BTS and bts::state goes ACTIVE
+ */
+ wmb();
+
+ /* INACTIVE/STOPPED -> ACTIVE */
+ WRITE_ONCE(bts->state, BTS_STATE_ACTIVE);
+
+ intel_pmu_enable_bts(config);
+
+}
+
+static void bts_event_start(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct bts_buffer *buf;
+
+ buf = perf_aux_output_begin(&bts->handle, event);
+ if (!buf)
+ goto fail_stop;
+
+ if (bts_buffer_reset(buf, &bts->handle))
+ goto fail_end_stop;
+
+ bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
+ bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
+ bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
+
+ perf_event_itrace_started(event);
+ event->hw.state = 0;
+
+ __bts_event_start(event);
+
+ return;
+
+fail_end_stop:
+ perf_aux_output_end(&bts->handle, 0);
+
+fail_stop:
+ event->hw.state = PERF_HES_STOPPED;
+}
+
+static void __bts_event_stop(struct perf_event *event, int state)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+ /* ACTIVE -> INACTIVE(PMI)/STOPPED(->stop()) */
+ WRITE_ONCE(bts->state, state);
+
+ /*
+ * No extra synchronization is mandated by the documentation to have
+ * BTS data stores globally visible.
+ */
+ intel_pmu_disable_bts();
+}
+
+static void bts_event_stop(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct bts_buffer *buf = NULL;
+ int state = READ_ONCE(bts->state);
+
+ if (state == BTS_STATE_ACTIVE)
+ __bts_event_stop(event, BTS_STATE_STOPPED);
+
+ if (state != BTS_STATE_STOPPED)
+ buf = perf_get_aux(&bts->handle);
+
+ event->hw.state |= PERF_HES_STOPPED;
+
+ if (flags & PERF_EF_UPDATE) {
+ bts_update(bts);
+
+ if (buf) {
+ if (buf->snapshot)
+ bts->handle.head =
+ local_xchg(&buf->data_size,
+ buf->nr_pages << PAGE_SHIFT);
+ perf_aux_output_end(&bts->handle,
+ local_xchg(&buf->data_size, 0));
+ }
+
+ cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
+ cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
+ cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
+ cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
+ }
+}
+
+void intel_bts_enable_local(void)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ int state = READ_ONCE(bts->state);
+
+ /*
+ * Here we transition from INACTIVE to ACTIVE;
+ * if we instead are STOPPED from the interrupt handler,
+ * stay that way. Can't be ACTIVE here though.
+ */
+ if (WARN_ON_ONCE(state == BTS_STATE_ACTIVE))
+ return;
+
+ if (state == BTS_STATE_STOPPED)
+ return;
+
+ if (bts->handle.event)
+ __bts_event_start(bts->handle.event);
+}
+
+void intel_bts_disable_local(void)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+ /*
+ * Here we transition from ACTIVE to INACTIVE;
+ * do nothing for STOPPED or INACTIVE.
+ */
+ if (READ_ONCE(bts->state) != BTS_STATE_ACTIVE)
+ return;
+
+ if (bts->handle.event)
+ __bts_event_stop(bts->handle.event, BTS_STATE_INACTIVE);
+}
+
+static int
+bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
+{
+ unsigned long head, space, next_space, pad, gap, skip, wakeup;
+ unsigned int next_buf;
+ struct bts_phys *phys, *next_phys;
+ int ret;
+
+ if (buf->snapshot)
+ return 0;
+
+ head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);
+
+ phys = &buf->buf[buf->cur_buf];
+ space = phys->offset + phys->displacement + phys->size - head;
+ pad = space;
+ if (space > handle->size) {
+ space = handle->size;
+ space -= space % BTS_RECORD_SIZE;
+ }
+ if (space <= BTS_SAFETY_MARGIN) {
+ /* See if next phys buffer has more space */
+ next_buf = buf->cur_buf + 1;
+ if (next_buf >= buf->nr_bufs)
+ next_buf = 0;
+ next_phys = &buf->buf[next_buf];
+ gap = buf_size(phys->page) - phys->displacement - phys->size +
+ next_phys->displacement;
+ skip = pad + gap;
+ if (handle->size >= skip) {
+ next_space = next_phys->size;
+ if (next_space + skip > handle->size) {
+ next_space = handle->size - skip;
+ next_space -= next_space % BTS_RECORD_SIZE;
+ }
+ if (next_space > space || !space) {
+ if (pad)
+ bts_buffer_pad_out(phys, head);
+ ret = perf_aux_output_skip(handle, skip);
+ if (ret)
+ return ret;
+ /* Advance to next phys buffer */
+ phys = next_phys;
+ space = next_space;
+ head = phys->offset + phys->displacement;
+ /*
+ * After this, cur_buf and head won't match ds
+ * anymore, so we must not be racing with
+ * bts_update().
+ */
+ buf->cur_buf = next_buf;
+ local_set(&buf->head, head);
+ }
+ }
+ }
+
+ /* Don't go far beyond wakeup watermark */
+ wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
+ handle->head;
+ if (space > wakeup) {
+ space = wakeup;
+ space -= space % BTS_RECORD_SIZE;
+ }
+
+ buf->end = head + space;
+
+ /*
+ * If we have no space, the lost notification would have been sent when
+ * we hit absolute_maximum - see bts_update()
+ */
+ if (!space)
+ return -ENOSPC;
+
+ return 0;
+}
+
+int intel_bts_interrupt(void)
+{
+ struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds;
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct perf_event *event = bts->handle.event;
+ struct bts_buffer *buf;
+ s64 old_head;
+ int err = -ENOSPC, handled = 0;
+
+ /*
+ * The only surefire way of knowing if this NMI is ours is by checking
+ * the write ptr against the PMI threshold.
+ */
+ if (ds && (ds->bts_index >= ds->bts_interrupt_threshold))
+ handled = 1;
+
+ /*
+ * this is wrapped in intel_bts_enable_local/intel_bts_disable_local,
+ * so we can only be INACTIVE or STOPPED
+ */
+ if (READ_ONCE(bts->state) == BTS_STATE_STOPPED)
+ return handled;
+
+ buf = perf_get_aux(&bts->handle);
+ if (!buf)
+ return handled;
+
+ /*
+ * Skip snapshot counters: they don't use the interrupt, but
+ * there's no other way of telling, because the pointer will
+ * keep moving
+ */
+ if (buf->snapshot)
+ return 0;
+
+ old_head = local_read(&buf->head);
+ bts_update(bts);
+
+ /* no new data */
+ if (old_head == local_read(&buf->head))
+ return handled;
+
+ perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0));
+
+ buf = perf_aux_output_begin(&bts->handle, event);
+ if (buf)
+ err = bts_buffer_reset(buf, &bts->handle);
+
+ if (err) {
+ WRITE_ONCE(bts->state, BTS_STATE_STOPPED);
+
+ if (buf) {
+ /*
+ * BTS_STATE_STOPPED should be visible before
+ * cleared handle::event
+ */
+ barrier();
+ perf_aux_output_end(&bts->handle, 0);
+ }
+ }
+
+ return 1;
+}
+
+static void bts_event_del(struct perf_event *event, int mode)
+{
+ bts_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int bts_event_add(struct perf_event *event, int mode)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+
+ event->hw.state = PERF_HES_STOPPED;
+
+ if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+ return -EBUSY;
+
+ if (bts->handle.event)
+ return -EBUSY;
+
+ if (mode & PERF_EF_START) {
+ bts_event_start(event, 0);
+ if (hwc->state & PERF_HES_STOPPED)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void bts_event_destroy(struct perf_event *event)
+{
+ x86_release_hardware();
+ x86_del_exclusive(x86_lbr_exclusive_bts);
+}
+
+static int bts_event_init(struct perf_event *event)
+{
+ int ret;
+
+ if (event->attr.type != bts_pmu.type)
+ return -ENOENT;
+
+ /*
+ * BTS leaks kernel addresses even when CPL0 tracing is
+ * disabled, so disallow intel_bts driver for unprivileged
+ * users on paranoid systems since it provides trace data
+ * to the user in a zero-copy fashion.
+ *
+ * Note that the default paranoia setting permits unprivileged
+ * users to profile the kernel.
+ */
+ if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
+ !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (x86_add_exclusive(x86_lbr_exclusive_bts))
+ return -EBUSY;
+
+ ret = x86_reserve_hardware();
+ if (ret) {
+ x86_del_exclusive(x86_lbr_exclusive_bts);
+ return ret;
+ }
+
+ event->destroy = bts_event_destroy;
+
+ return 0;
+}
+
+static void bts_event_read(struct perf_event *event)
+{
+}
+
+static __init int bts_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
+ return -ENODEV;
+
+ if (boot_cpu_has(X86_FEATURE_PTI)) {
+ /*
+ * BTS hardware writes through a virtual memory map we must
+ * either use the kernel physical map, or the user mapping of
+ * the AUX buffer.
+ *
+ * However, since this driver supports per-CPU and per-task inherit
+ * we cannot use the user mapping since it will not be availble
+ * if we're not running the owning process.
+ *
+ * With PTI we can't use the kernal map either, because its not
+ * there when we run userspace.
+ *
+ * For now, disable this driver when using PTI.
+ */
+ return -ENODEV;
+ }
+
+ bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE |
+ PERF_PMU_CAP_EXCLUSIVE;
+ bts_pmu.task_ctx_nr = perf_sw_context;
+ bts_pmu.event_init = bts_event_init;
+ bts_pmu.add = bts_event_add;
+ bts_pmu.del = bts_event_del;
+ bts_pmu.start = bts_event_start;
+ bts_pmu.stop = bts_event_stop;
+ bts_pmu.read = bts_event_read;
+ bts_pmu.setup_aux = bts_buffer_setup_aux;
+ bts_pmu.free_aux = bts_buffer_free_aux;
+
+ return perf_pmu_register(&bts_pmu, "intel_bts", -1);
+}
+arch_initcall(bts_init);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
new file mode 100644
index 000000000..2dd8b0d64
--- /dev/null
+++ b/arch/x86/events/intel/core.c
@@ -0,0 +1,4624 @@
+/*
+ * Per core/cpu state
+ *
+ * Used to coordinate shared registers between HT threads or
+ * among events on a single PMU.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/nmi.h>
+
+#include <asm/cpufeature.h>
+#include <asm/hardirq.h>
+#include <asm/intel-family.h>
+#include <asm/apic.h>
+
+#include "../perf_event.h"
+
+/*
+ * Intel PerfMon, used on Core and later.
+ */
+static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
+ [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
+ [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */
+};
+
+static struct event_constraint intel_core_event_constraints[] __read_mostly =
+{
+ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
+ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+ INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+ INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+ INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
+ INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_core2_event_constraints[] __read_mostly =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
+ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
+ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+ INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+ INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+ INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
+ INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
+ INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
+ INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
+ INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
+ INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
+ INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
+ INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
+ INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
+ INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
+ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
+ INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
+{
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
+ EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
+ INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
+ INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
+ INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_snb_event_constraints[] __read_mostly =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
+ INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
+ INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
+ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
+ INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+
+ /*
+ * When HT is off these events can only run on the bottom 4 counters
+ * When HT is on, they are impacted by the HT bug and require EXCL access
+ */
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */
+ INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */
+ INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
+ INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+
+ /*
+ * When HT is off these events can only run on the bottom 4 counters
+ * When HT is on, they are impacted by the HT bug and require EXCL access
+ */
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
+{
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
+ EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_v1_event_constraints[] __read_mostly =
+{
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_gen_event_constraints[] __read_mostly =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_slm_event_constraints[] __read_mostly =
+{
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_skl_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */
+
+ /*
+ * when HT is off, these can only run on the bottom 4 counters
+ */
+ INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_INST_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xcd, 0xf), /* MEM_TRANS_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xc6, 0xf), /* FRONTEND_RETIRED.* */
+
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg intel_knl_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x799ffbb6e7ull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x399ffbffe7ull, RSP_1),
+ EVENT_EXTRA_END
+};
+
+static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+ EVENT_EXTRA_END
+};
+
+static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+ EVENT_EXTRA_END
+};
+
+static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+ /*
+ * Note the low 8 bits eventsel code is not a continuous field, containing
+ * some #GPing bits. These are masked out.
+ */
+ INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
+ EVENT_EXTRA_END
+};
+
+EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
+EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
+EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
+
+static struct attribute *nhm_events_attrs[] = {
+ EVENT_PTR(mem_ld_nhm),
+ NULL,
+};
+
+/*
+ * topdown events for Intel Core CPUs.
+ *
+ * The events are all in slots, which is a free slot in a 4 wide
+ * pipeline. Some events are already reported in slots, for cycle
+ * events we multiply by the pipeline width (4).
+ *
+ * With Hyper Threading on, topdown metrics are either summed or averaged
+ * between the threads of a core: (count_t0 + count_t1).
+ *
+ * For the average case the metric is always scaled to pipeline width,
+ * so we use factor 2 ((count_t0 + count_t1) / 2 * 4)
+ */
+
+EVENT_ATTR_STR_HT(topdown-total-slots, td_total_slots,
+ "event=0x3c,umask=0x0", /* cpu_clk_unhalted.thread */
+ "event=0x3c,umask=0x0,any=1"); /* cpu_clk_unhalted.thread_any */
+EVENT_ATTR_STR_HT(topdown-total-slots.scale, td_total_slots_scale, "4", "2");
+EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued,
+ "event=0xe,umask=0x1"); /* uops_issued.any */
+EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired,
+ "event=0xc2,umask=0x2"); /* uops_retired.retire_slots */
+EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles,
+ "event=0x9c,umask=0x1"); /* idq_uops_not_delivered_core */
+EVENT_ATTR_STR_HT(topdown-recovery-bubbles, td_recovery_bubbles,
+ "event=0xd,umask=0x3,cmask=1", /* int_misc.recovery_cycles */
+ "event=0xd,umask=0x3,cmask=1,any=1"); /* int_misc.recovery_cycles_any */
+EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale,
+ "4", "2");
+
+static struct attribute *snb_events_attrs[] = {
+ EVENT_PTR(mem_ld_snb),
+ EVENT_PTR(mem_st_snb),
+ EVENT_PTR(td_slots_issued),
+ EVENT_PTR(td_slots_retired),
+ EVENT_PTR(td_fetch_bubbles),
+ EVENT_PTR(td_total_slots),
+ EVENT_PTR(td_total_slots_scale),
+ EVENT_PTR(td_recovery_bubbles),
+ EVENT_PTR(td_recovery_bubbles_scale),
+ NULL,
+};
+
+static struct event_constraint intel_hsw_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
+ /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4),
+ /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
+ INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
+ /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
+ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
+
+ /*
+ * When HT is off these events can only run on the bottom 4 counters
+ * When HT is on, they are impacted by the HT bug and require EXCL access
+ */
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_bdw_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
+ INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
+ /*
+ * when HT is off, these can only run on the bottom 4 counters
+ */
+ INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_INST_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xcd, 0xf), /* MEM_TRANS_RETIRED.* */
+ EVENT_CONSTRAINT_END
+};
+
+static u64 intel_pmu_event_map(int hw_event)
+{
+ return intel_perfmon_event_map[hw_event];
+}
+
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts.
+ * - icache miss does not include decoded icache
+ */
+
+#define SKL_DEMAND_DATA_RD BIT_ULL(0)
+#define SKL_DEMAND_RFO BIT_ULL(1)
+#define SKL_ANY_RESPONSE BIT_ULL(16)
+#define SKL_SUPPLIER_NONE BIT_ULL(17)
+#define SKL_L3_MISS_LOCAL_DRAM BIT_ULL(26)
+#define SKL_L3_MISS_REMOTE_HOP0_DRAM BIT_ULL(27)
+#define SKL_L3_MISS_REMOTE_HOP1_DRAM BIT_ULL(28)
+#define SKL_L3_MISS_REMOTE_HOP2P_DRAM BIT_ULL(29)
+#define SKL_L3_MISS (SKL_L3_MISS_LOCAL_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+#define SKL_SPL_HIT BIT_ULL(30)
+#define SKL_SNOOP_NONE BIT_ULL(31)
+#define SKL_SNOOP_NOT_NEEDED BIT_ULL(32)
+#define SKL_SNOOP_MISS BIT_ULL(33)
+#define SKL_SNOOP_HIT_NO_FWD BIT_ULL(34)
+#define SKL_SNOOP_HIT_WITH_FWD BIT_ULL(35)
+#define SKL_SNOOP_HITM BIT_ULL(36)
+#define SKL_SNOOP_NON_DRAM BIT_ULL(37)
+#define SKL_ANY_SNOOP (SKL_SPL_HIT|SKL_SNOOP_NONE| \
+ SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+ SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+ SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM)
+#define SKL_DEMAND_READ SKL_DEMAND_DATA_RD
+#define SKL_SNOOP_DRAM (SKL_SNOOP_NONE| \
+ SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+ SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+ SKL_SNOOP_HITM|SKL_SPL_HIT)
+#define SKL_DEMAND_WRITE SKL_DEMAND_RFO
+#define SKL_LLC_ACCESS SKL_ANY_RESPONSE
+#define SKL_L3_MISS_REMOTE (SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+
+static __initconst const u64 skl_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x283, /* ICACHE_64B.MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0xe08, /* DTLB_LOAD_MISSES.WALK_COMPLETED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0xe49, /* DTLB_STORE_MISSES.WALK_COMPLETED */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2085, /* ITLB_MISSES.STLB_HIT */
+ [ C(RESULT_MISS) ] = 0xe85, /* ITLB_MISSES.WALK_COMPLETED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
+static __initconst const u64 skl_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+ SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_READ|
+ SKL_L3_MISS|SKL_ANY_SNOOP|
+ SKL_SUPPLIER_NONE,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+ SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE|
+ SKL_L3_MISS|SKL_ANY_SNOOP|
+ SKL_SUPPLIER_NONE,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+ SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_READ|
+ SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+ SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE|
+ SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
+#define SNB_DMND_DATA_RD (1ULL << 0)
+#define SNB_DMND_RFO (1ULL << 1)
+#define SNB_DMND_IFETCH (1ULL << 2)
+#define SNB_DMND_WB (1ULL << 3)
+#define SNB_PF_DATA_RD (1ULL << 4)
+#define SNB_PF_RFO (1ULL << 5)
+#define SNB_PF_IFETCH (1ULL << 6)
+#define SNB_LLC_DATA_RD (1ULL << 7)
+#define SNB_LLC_RFO (1ULL << 8)
+#define SNB_LLC_IFETCH (1ULL << 9)
+#define SNB_BUS_LOCKS (1ULL << 10)
+#define SNB_STRM_ST (1ULL << 11)
+#define SNB_OTHER (1ULL << 15)
+#define SNB_RESP_ANY (1ULL << 16)
+#define SNB_NO_SUPP (1ULL << 17)
+#define SNB_LLC_HITM (1ULL << 18)
+#define SNB_LLC_HITE (1ULL << 19)
+#define SNB_LLC_HITS (1ULL << 20)
+#define SNB_LLC_HITF (1ULL << 21)
+#define SNB_LOCAL (1ULL << 22)
+#define SNB_REMOTE (0xffULL << 23)
+#define SNB_SNP_NONE (1ULL << 31)
+#define SNB_SNP_NOT_NEEDED (1ULL << 32)
+#define SNB_SNP_MISS (1ULL << 33)
+#define SNB_NO_FWD (1ULL << 34)
+#define SNB_SNP_FWD (1ULL << 35)
+#define SNB_HITM (1ULL << 36)
+#define SNB_NON_DRAM (1ULL << 37)
+
+#define SNB_DMND_READ (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD)
+#define SNB_DMND_WRITE (SNB_DMND_RFO|SNB_LLC_RFO)
+#define SNB_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
+
+#define SNB_SNP_ANY (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \
+ SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \
+ SNB_HITM)
+
+#define SNB_DRAM_ANY (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY)
+#define SNB_DRAM_REMOTE (SNB_REMOTE|SNB_SNP_ANY)
+
+#define SNB_L3_ACCESS SNB_RESP_ANY
+#define SNB_L3_MISS (SNB_DRAM_ANY|SNB_NON_DRAM)
+
+static __initconst const u64 snb_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS,
+ [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_L3_MISS,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS,
+ [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_L3_MISS,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS,
+ [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_L3_MISS,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY,
+ [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_DRAM_REMOTE,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY,
+ [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY,
+ [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE,
+ },
+ },
+};
+
+static __initconst const u64 snb_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */
+ [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */
+ [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_WRITE) ] = {
+ /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */
+ [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
+
+};
+
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts because they are not
+ * reliably counted.
+ */
+
+#define HSW_DEMAND_DATA_RD BIT_ULL(0)
+#define HSW_DEMAND_RFO BIT_ULL(1)
+#define HSW_ANY_RESPONSE BIT_ULL(16)
+#define HSW_SUPPLIER_NONE BIT_ULL(17)
+#define HSW_L3_MISS_LOCAL_DRAM BIT_ULL(22)
+#define HSW_L3_MISS_REMOTE_HOP0 BIT_ULL(27)
+#define HSW_L3_MISS_REMOTE_HOP1 BIT_ULL(28)
+#define HSW_L3_MISS_REMOTE_HOP2P BIT_ULL(29)
+#define HSW_L3_MISS (HSW_L3_MISS_LOCAL_DRAM| \
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+ HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE BIT_ULL(31)
+#define HSW_SNOOP_NOT_NEEDED BIT_ULL(32)
+#define HSW_SNOOP_MISS BIT_ULL(33)
+#define HSW_SNOOP_HIT_NO_FWD BIT_ULL(34)
+#define HSW_SNOOP_HIT_WITH_FWD BIT_ULL(35)
+#define HSW_SNOOP_HITM BIT_ULL(36)
+#define HSW_SNOOP_NON_DRAM BIT_ULL(37)
+#define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \
+ HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+ HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+ HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+#define HSW_SNOOP_DRAM (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
+#define HSW_DEMAND_READ HSW_DEMAND_DATA_RD
+#define HSW_DEMAND_WRITE HSW_DEMAND_RFO
+#define HSW_L3_MISS_REMOTE (HSW_L3_MISS_REMOTE_HOP0|\
+ HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_LLC_ACCESS HSW_ANY_RESPONSE
+
+#define BDW_L3_MISS_LOCAL BIT(26)
+#define BDW_L3_MISS (BDW_L3_MISS_LOCAL| \
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+ HSW_L3_MISS_REMOTE_HOP2P)
+
+
+static __initconst const u64 hsw_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */
+ [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+ HSW_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS|HSW_ANY_SNOOP,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+ HSW_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS|HSW_ANY_SNOOP,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS_LOCAL_DRAM|
+ HSW_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS_REMOTE|
+ HSW_SNOOP_DRAM,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS_LOCAL_DRAM|
+ HSW_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS_REMOTE|
+ HSW_SNOOP_DRAM,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
+static __initconst const u64 westmere_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
+ [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
+ [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ /*
+ * Use RFO, not WRITEBACK, because a write miss would typically occur
+ * on RFO.
+ */
+ [ C(OP_WRITE) ] = {
+ /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
+ [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
+ [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
+};
+
+/*
+ * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits;
+ * See IA32 SDM Vol 3B 30.6.1.3
+ */
+
+#define NHM_DMND_DATA_RD (1 << 0)
+#define NHM_DMND_RFO (1 << 1)
+#define NHM_DMND_IFETCH (1 << 2)
+#define NHM_DMND_WB (1 << 3)
+#define NHM_PF_DATA_RD (1 << 4)
+#define NHM_PF_DATA_RFO (1 << 5)
+#define NHM_PF_IFETCH (1 << 6)
+#define NHM_OFFCORE_OTHER (1 << 7)
+#define NHM_UNCORE_HIT (1 << 8)
+#define NHM_OTHER_CORE_HIT_SNP (1 << 9)
+#define NHM_OTHER_CORE_HITM (1 << 10)
+ /* reserved */
+#define NHM_REMOTE_CACHE_FWD (1 << 12)
+#define NHM_REMOTE_DRAM (1 << 13)
+#define NHM_LOCAL_DRAM (1 << 14)
+#define NHM_NON_DRAM (1 << 15)
+
+#define NHM_LOCAL (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_REMOTE (NHM_REMOTE_DRAM)
+
+#define NHM_DMND_READ (NHM_DMND_DATA_RD)
+#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB)
+#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
+
+#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
+#define NHM_L3_MISS (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS)
+
+static __initconst const u64 nehalem_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS,
+ [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS,
+ [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
+ [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE,
+ [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE,
+ [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE,
+ [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE,
+ },
+ },
+};
+
+static __initconst const u64 nehalem_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
+ [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
+ [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ /*
+ * Use RFO, not WRITEBACK, because a write miss would typically occur
+ * on RFO.
+ */
+ [ C(OP_WRITE) ] = {
+ /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
+};
+
+static __initconst const u64 core2_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static __initconst const u64 atom_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+EVENT_ATTR_STR(topdown-total-slots, td_total_slots_slm, "event=0x3c");
+EVENT_ATTR_STR(topdown-total-slots.scale, td_total_slots_scale_slm, "2");
+/* no_alloc_cycles.not_delivered */
+EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles_slm,
+ "event=0xca,umask=0x50");
+EVENT_ATTR_STR(topdown-fetch-bubbles.scale, td_fetch_bubbles_scale_slm, "2");
+/* uops_retired.all */
+EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued_slm,
+ "event=0xc2,umask=0x10");
+/* uops_retired.all */
+EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired_slm,
+ "event=0xc2,umask=0x10");
+
+static struct attribute *slm_events_attrs[] = {
+ EVENT_PTR(td_total_slots_slm),
+ EVENT_PTR(td_total_slots_scale_slm),
+ EVENT_PTR(td_fetch_bubbles_slm),
+ EVENT_PTR(td_fetch_bubbles_scale_slm),
+ EVENT_PTR(td_slots_issued_slm),
+ EVENT_PTR(td_slots_retired_slm),
+ NULL
+};
+
+static struct extra_reg intel_slm_extra_regs[] __read_mostly =
+{
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1),
+ EVENT_EXTRA_END
+};
+
+#define SLM_DMND_READ SNB_DMND_DATA_RD
+#define SLM_DMND_WRITE SNB_DMND_RFO
+#define SLM_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
+
+#define SLM_SNP_ANY (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM)
+#define SLM_LLC_ACCESS SNB_RESP_ANY
+#define SLM_LLC_MISS (SLM_SNP_ANY|SNB_NON_DRAM)
+
+static __initconst const u64 slm_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = SLM_DMND_WRITE|SLM_LLC_MISS,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = SLM_DMND_PREFETCH|SLM_LLC_MISS,
+ },
+ },
+};
+
+static __initconst const u64 slm_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0x0104, /* LD_DCU_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */
+ [ C(RESULT_MISS) ] = 0x0280, /* ICACGE.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ [ C(OP_PREFETCH) ] = {
+ /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0x0804, /* LD_DTLB_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x40205, /* PAGE_WALKS.I_SIDE_WALKS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+EVENT_ATTR_STR(topdown-total-slots, td_total_slots_glm, "event=0x3c");
+EVENT_ATTR_STR(topdown-total-slots.scale, td_total_slots_scale_glm, "3");
+/* UOPS_NOT_DELIVERED.ANY */
+EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles_glm, "event=0x9c");
+/* ISSUE_SLOTS_NOT_CONSUMED.RECOVERY */
+EVENT_ATTR_STR(topdown-recovery-bubbles, td_recovery_bubbles_glm, "event=0xca,umask=0x02");
+/* UOPS_RETIRED.ANY */
+EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired_glm, "event=0xc2");
+/* UOPS_ISSUED.ANY */
+EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued_glm, "event=0x0e");
+
+static struct attribute *glm_events_attrs[] = {
+ EVENT_PTR(td_total_slots_glm),
+ EVENT_PTR(td_total_slots_scale_glm),
+ EVENT_PTR(td_fetch_bubbles_glm),
+ EVENT_PTR(td_recovery_bubbles_glm),
+ EVENT_PTR(td_slots_issued_glm),
+ EVENT_PTR(td_slots_retired_glm),
+ NULL
+};
+
+static struct extra_reg intel_glm_extra_regs[] __read_mostly = {
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x760005ffbfull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x360005ffbfull, RSP_1),
+ EVENT_EXTRA_END
+};
+
+#define GLM_DEMAND_DATA_RD BIT_ULL(0)
+#define GLM_DEMAND_RFO BIT_ULL(1)
+#define GLM_ANY_RESPONSE BIT_ULL(16)
+#define GLM_SNP_NONE_OR_MISS BIT_ULL(33)
+#define GLM_DEMAND_READ GLM_DEMAND_DATA_RD
+#define GLM_DEMAND_WRITE GLM_DEMAND_RFO
+#define GLM_DEMAND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
+#define GLM_LLC_ACCESS GLM_ANY_RESPONSE
+#define GLM_SNP_ANY (GLM_SNP_NONE_OR_MISS|SNB_NO_FWD|SNB_HITM)
+#define GLM_LLC_MISS (GLM_SNP_ANY|SNB_NON_DRAM)
+
+static __initconst const u64 glm_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+ [C(L1D)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [C(RESULT_MISS)] = 0x0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [C(RESULT_MISS)] = 0x0,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ },
+ [C(L1I)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0380, /* ICACHE.ACCESSES */
+ [C(RESULT_MISS)] = 0x0280, /* ICACHE.MISSES */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ },
+ [C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ },
+ [C(DTLB)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [C(RESULT_MISS)] = 0x0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [C(RESULT_MISS)] = 0x0,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ },
+ [C(ITLB)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [C(RESULT_MISS)] = 0x0481, /* ITLB.MISS */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ },
+ [C(BPU)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [C(RESULT_MISS)] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ },
+};
+
+static __initconst const u64 glm_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+ [C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = GLM_DEMAND_READ|
+ GLM_LLC_ACCESS,
+ [C(RESULT_MISS)] = GLM_DEMAND_READ|
+ GLM_LLC_MISS,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = GLM_DEMAND_WRITE|
+ GLM_LLC_ACCESS,
+ [C(RESULT_MISS)] = GLM_DEMAND_WRITE|
+ GLM_LLC_MISS,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = GLM_DEMAND_PREFETCH|
+ GLM_LLC_ACCESS,
+ [C(RESULT_MISS)] = GLM_DEMAND_PREFETCH|
+ GLM_LLC_MISS,
+ },
+ },
+};
+
+static __initconst const u64 glp_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+ [C(L1D)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [C(RESULT_MISS)] = 0x0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [C(RESULT_MISS)] = 0x0,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ },
+ [C(L1I)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x0380, /* ICACHE.ACCESSES */
+ [C(RESULT_MISS)] = 0x0280, /* ICACHE.MISSES */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ },
+ [C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ },
+ [C(DTLB)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [C(RESULT_MISS)] = 0xe08, /* DTLB_LOAD_MISSES.WALK_COMPLETED */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [C(RESULT_MISS)] = 0xe49, /* DTLB_STORE_MISSES.WALK_COMPLETED */
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ },
+ [C(ITLB)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [C(RESULT_MISS)] = 0x0481, /* ITLB.MISS */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ },
+ [C(BPU)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [C(RESULT_MISS)] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = -1,
+ [C(RESULT_MISS)] = -1,
+ },
+ },
+};
+
+static __initconst const u64 glp_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+ [C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = GLM_DEMAND_READ|
+ GLM_LLC_ACCESS,
+ [C(RESULT_MISS)] = GLM_DEMAND_READ|
+ GLM_LLC_MISS,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = GLM_DEMAND_WRITE|
+ GLM_LLC_ACCESS,
+ [C(RESULT_MISS)] = GLM_DEMAND_WRITE|
+ GLM_LLC_MISS,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = 0x0,
+ [C(RESULT_MISS)] = 0x0,
+ },
+ },
+};
+
+#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */
+#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */
+#define KNL_MCDRAM_LOCAL BIT_ULL(21)
+#define KNL_MCDRAM_FAR BIT_ULL(22)
+#define KNL_DDR_LOCAL BIT_ULL(23)
+#define KNL_DDR_FAR BIT_ULL(24)
+#define KNL_DRAM_ANY (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \
+ KNL_DDR_LOCAL | KNL_DDR_FAR)
+#define KNL_L2_READ SLM_DMND_READ
+#define KNL_L2_WRITE SLM_DMND_WRITE
+#define KNL_L2_PREFETCH SLM_DMND_PREFETCH
+#define KNL_L2_ACCESS SLM_LLC_ACCESS
+#define KNL_L2_MISS (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \
+ KNL_DRAM_ANY | SNB_SNP_ANY | \
+ SNB_NON_DRAM)
+
+static __initconst const u64 knl_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
+ [C(LL)] = {
+ [C(OP_READ)] = {
+ [C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS,
+ [C(RESULT_MISS)] = 0,
+ },
+ [C(OP_WRITE)] = {
+ [C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS,
+ [C(RESULT_MISS)] = KNL_L2_WRITE | KNL_L2_MISS,
+ },
+ [C(OP_PREFETCH)] = {
+ [C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS,
+ [C(RESULT_MISS)] = KNL_L2_PREFETCH | KNL_L2_MISS,
+ },
+ },
+};
+
+/*
+ * Used from PMIs where the LBRs are already disabled.
+ *
+ * This function could be called consecutively. It is required to remain in
+ * disabled state if called consecutively.
+ *
+ * During consecutive calls, the same disable value will be written to related
+ * registers, so the PMU state remains unchanged.
+ *
+ * intel_bts events don't coexist with intel PMU's BTS events because of
+ * x86_add_exclusive(x86_lbr_exclusive_lbr); there's no need to keep them
+ * disabled around intel PMU's event batching etc, only inside the PMI handler.
+ */
+static void __intel_pmu_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+ if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+ intel_pmu_disable_bts();
+
+ intel_pmu_pebs_disable_all();
+}
+
+static void intel_pmu_disable_all(void)
+{
+ __intel_pmu_disable_all();
+ intel_pmu_lbr_disable_all();
+}
+
+static void __intel_pmu_enable_all(int added, bool pmi)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ intel_pmu_pebs_enable_all();
+ intel_pmu_lbr_enable_all(pmi);
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
+ x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
+
+ if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+ struct perf_event *event =
+ cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
+
+ if (WARN_ON_ONCE(!event))
+ return;
+
+ intel_pmu_enable_bts(event->hw.config);
+ }
+}
+
+static void intel_pmu_enable_all(int added)
+{
+ __intel_pmu_enable_all(added, false);
+}
+
+/*
+ * Workaround for:
+ * Intel Errata AAK100 (model 26)
+ * Intel Errata AAP53 (model 30)
+ * Intel Errata BD53 (model 44)
+ *
+ * The official story:
+ * These chips need to be 'reset' when adding counters by programming the
+ * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
+ * in sequence on the same PMC or on different PMCs.
+ *
+ * In practise it appears some of these events do in fact count, and
+ * we need to programm all 4 events.
+ */
+static void intel_pmu_nhm_workaround(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ static const unsigned long nhm_magic[4] = {
+ 0x4300B5,
+ 0x4300D2,
+ 0x4300B1,
+ 0x4300B1
+ };
+ struct perf_event *event;
+ int i;
+
+ /*
+ * The Errata requires below steps:
+ * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
+ * 2) Configure 4 PERFEVTSELx with the magic events and clear
+ * the corresponding PMCx;
+ * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL;
+ * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
+ * 5) Clear 4 pairs of ERFEVTSELx and PMCx;
+ */
+
+ /*
+ * The real steps we choose are a little different from above.
+ * A) To reduce MSR operations, we don't run step 1) as they
+ * are already cleared before this function is called;
+ * B) Call x86_perf_event_update to save PMCx before configuring
+ * PERFEVTSELx with magic number;
+ * C) With step 5), we do clear only when the PERFEVTSELx is
+ * not used currently.
+ * D) Call x86_perf_event_set_period to restore PMCx;
+ */
+
+ /* We always operate 4 pairs of PERF Counters */
+ for (i = 0; i < 4; i++) {
+ event = cpuc->events[i];
+ if (event)
+ x86_perf_event_update(event);
+ }
+
+ for (i = 0; i < 4; i++) {
+ wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
+ wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
+ }
+
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf);
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
+
+ for (i = 0; i < 4; i++) {
+ event = cpuc->events[i];
+
+ if (event) {
+ x86_perf_event_set_period(event);
+ __x86_pmu_enable_event(&event->hw,
+ ARCH_PERFMON_EVENTSEL_ENABLE);
+ } else
+ wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0);
+ }
+}
+
+static void intel_pmu_nhm_enable_all(int added)
+{
+ if (added)
+ intel_pmu_nhm_workaround();
+ intel_pmu_enable_all(added);
+}
+
+static void intel_set_tfa(struct cpu_hw_events *cpuc, bool on)
+{
+ u64 val = on ? MSR_TFA_RTM_FORCE_ABORT : 0;
+
+ if (cpuc->tfa_shadow != val) {
+ cpuc->tfa_shadow = val;
+ wrmsrl(MSR_TSX_FORCE_ABORT, val);
+ }
+}
+
+static void intel_tfa_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
+{
+ /*
+ * We're going to use PMC3, make sure TFA is set before we touch it.
+ */
+ if (cntr == 3 && !cpuc->is_fake)
+ intel_set_tfa(cpuc, true);
+}
+
+static void intel_tfa_pmu_enable_all(int added)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ /*
+ * If we find PMC3 is no longer used when we enable the PMU, we can
+ * clear TFA.
+ */
+ if (!test_bit(3, cpuc->active_mask))
+ intel_set_tfa(cpuc, false);
+
+ intel_pmu_enable_all(added);
+}
+
+static inline u64 intel_pmu_get_status(void)
+{
+ u64 status;
+
+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+ return status;
+}
+
+static inline void intel_pmu_ack_status(u64 ack)
+{
+ wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
+{
+ int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+ u64 ctrl_val, mask;
+
+ mask = 0xfULL << (idx * 4);
+
+ rdmsrl(hwc->config_base, ctrl_val);
+ ctrl_val &= ~mask;
+ wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static inline bool event_is_checkpointed(struct perf_event *event)
+{
+ return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0;
+}
+
+static void intel_pmu_disable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
+ intel_pmu_disable_bts();
+ intel_pmu_drain_bts_buffer();
+ return;
+ }
+
+ cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
+ cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
+ cpuc->intel_cp_status &= ~(1ull << hwc->idx);
+
+ if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
+ intel_pmu_disable_fixed(hwc);
+ else
+ x86_pmu_disable_event(event);
+
+ /*
+ * Needs to be called after x86_pmu_disable_event,
+ * so we don't trigger the event without PEBS bit set.
+ */
+ if (unlikely(event->attr.precise_ip))
+ intel_pmu_pebs_disable(event);
+}
+
+static void intel_pmu_del_event(struct perf_event *event)
+{
+ if (needs_branch_stack(event))
+ intel_pmu_lbr_del(event);
+ if (event->attr.precise_ip)
+ intel_pmu_pebs_del(event);
+}
+
+static void intel_pmu_read_event(struct perf_event *event)
+{
+ if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
+ intel_pmu_auto_reload_read(event);
+ else
+ x86_perf_event_update(event);
+}
+
+static void intel_pmu_enable_fixed(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+ u64 ctrl_val, mask, bits = 0;
+
+ /*
+ * Enable IRQ generation (0x8), if not PEBS,
+ * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+ * if requested:
+ */
+ if (!event->attr.precise_ip)
+ bits |= 0x8;
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+ bits |= 0x2;
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+ bits |= 0x1;
+
+ /*
+ * ANY bit is supported in v3 and up
+ */
+ if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
+ bits |= 0x4;
+
+ bits <<= (idx * 4);
+ mask = 0xfULL << (idx * 4);
+
+ rdmsrl(hwc->config_base, ctrl_val);
+ ctrl_val &= ~mask;
+ ctrl_val |= bits;
+ wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void intel_pmu_enable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
+ if (!__this_cpu_read(cpu_hw_events.enabled))
+ return;
+
+ intel_pmu_enable_bts(hwc->config);
+ return;
+ }
+
+ if (event->attr.exclude_host)
+ cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
+ if (event->attr.exclude_guest)
+ cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx);
+
+ if (unlikely(event_is_checkpointed(event)))
+ cpuc->intel_cp_status |= (1ull << hwc->idx);
+
+ if (unlikely(event->attr.precise_ip))
+ intel_pmu_pebs_enable(event);
+
+ if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+ intel_pmu_enable_fixed(event);
+ return;
+ }
+
+ __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+static void intel_pmu_add_event(struct perf_event *event)
+{
+ if (event->attr.precise_ip)
+ intel_pmu_pebs_add(event);
+ if (needs_branch_stack(event))
+ intel_pmu_lbr_add(event);
+}
+
+/*
+ * Save and restart an expired event. Called by NMI contexts,
+ * so it has to be careful about preempting normal event ops:
+ */
+int intel_pmu_save_and_restart(struct perf_event *event)
+{
+ x86_perf_event_update(event);
+ /*
+ * For a checkpointed counter always reset back to 0. This
+ * avoids a situation where the counter overflows, aborts the
+ * transaction and is then set back to shortly before the
+ * overflow, and overflows and aborts again.
+ */
+ if (unlikely(event_is_checkpointed(event))) {
+ /* No race with NMIs because the counter should not be armed */
+ wrmsrl(event->hw.event_base, 0);
+ local64_set(&event->hw.prev_count, 0);
+ }
+ return x86_perf_event_set_period(event);
+}
+
+static void intel_pmu_reset(void)
+{
+ struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
+ unsigned long flags;
+ int idx;
+
+ if (!x86_pmu.num_counters)
+ return;
+
+ local_irq_save(flags);
+
+ pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
+ wrmsrl_safe(x86_pmu_event_addr(idx), 0ull);
+ }
+ for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
+ wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+
+ if (ds)
+ ds->bts_index = ds->bts_buffer_base;
+
+ /* Ack all overflows and disable fixed counters */
+ if (x86_pmu.version >= 2) {
+ intel_pmu_ack_status(intel_pmu_get_status());
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+ }
+
+ /* Reset LBRs and LBR freezing */
+ if (x86_pmu.lbr_nr) {
+ update_debugctlmsr(get_debugctlmsr() &
+ ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
+ }
+
+ local_irq_restore(flags);
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int intel_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_events *cpuc;
+ int bit, loops;
+ u64 status;
+ int handled;
+ int pmu_enabled;
+
+ cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ /*
+ * Save the PMU state.
+ * It needs to be restored when leaving the handler.
+ */
+ pmu_enabled = cpuc->enabled;
+ /*
+ * No known reason to not always do late ACK,
+ * but just in case do it opt-in.
+ */
+ if (!x86_pmu.late_ack)
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ intel_bts_disable_local();
+ cpuc->enabled = 0;
+ __intel_pmu_disable_all();
+ handled = intel_pmu_drain_bts_buffer();
+ handled += intel_bts_interrupt();
+ status = intel_pmu_get_status();
+ if (!status)
+ goto done;
+
+ loops = 0;
+again:
+ intel_pmu_lbr_read();
+ intel_pmu_ack_status(status);
+ if (++loops > 100) {
+ static bool warned = false;
+ if (!warned) {
+ WARN(1, "perfevents: irq loop stuck!\n");
+ perf_event_print_debug();
+ warned = true;
+ }
+ intel_pmu_reset();
+ goto done;
+ }
+
+ inc_irq_stat(apic_perf_irqs);
+
+
+ /*
+ * Ignore a range of extra bits in status that do not indicate
+ * overflow by themselves.
+ */
+ status &= ~(GLOBAL_STATUS_COND_CHG |
+ GLOBAL_STATUS_ASIF |
+ GLOBAL_STATUS_LBRS_FROZEN);
+ if (!status)
+ goto done;
+ /*
+ * In case multiple PEBS events are sampled at the same time,
+ * it is possible to have GLOBAL_STATUS bit 62 set indicating
+ * PEBS buffer overflow and also seeing at most 3 PEBS counters
+ * having their bits set in the status register. This is a sign
+ * that there was at least one PEBS record pending at the time
+ * of the PMU interrupt. PEBS counters must only be processed
+ * via the drain_pebs() calls and not via the regular sample
+ * processing loop coming after that the function, otherwise
+ * phony regular samples may be generated in the sampling buffer
+ * not marked with the EXACT tag. Another possibility is to have
+ * one PEBS event and at least one non-PEBS event whic hoverflows
+ * while PEBS has armed. In this case, bit 62 of GLOBAL_STATUS will
+ * not be set, yet the overflow status bit for the PEBS counter will
+ * be on Skylake.
+ *
+ * To avoid this problem, we systematically ignore the PEBS-enabled
+ * counters from the GLOBAL_STATUS mask and we always process PEBS
+ * events via drain_pebs().
+ */
+ if (x86_pmu.flags & PMU_FL_PEBS_ALL)
+ status &= ~cpuc->pebs_enabled;
+ else
+ status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
+
+ /*
+ * PEBS overflow sets bit 62 in the global status register
+ */
+ if (__test_and_clear_bit(62, (unsigned long *)&status)) {
+ handled++;
+ x86_pmu.drain_pebs(regs);
+ status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
+ }
+
+ /*
+ * Intel PT
+ */
+ if (__test_and_clear_bit(55, (unsigned long *)&status)) {
+ handled++;
+ intel_pt_interrupt();
+ }
+
+ /*
+ * Checkpointed counters can lead to 'spurious' PMIs because the
+ * rollback caused by the PMI will have cleared the overflow status
+ * bit. Therefore always force probe these counters.
+ */
+ status |= cpuc->intel_cp_status;
+
+ for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+ struct perf_event *event = cpuc->events[bit];
+
+ handled++;
+
+ if (!test_bit(bit, cpuc->active_mask))
+ continue;
+
+ if (!intel_pmu_save_and_restart(event))
+ continue;
+
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+
+ if (has_branch_stack(event))
+ data.br_stack = &cpuc->lbr_stack;
+
+ if (perf_event_overflow(event, &data, regs))
+ x86_pmu_stop(event, 0);
+ }
+
+ /*
+ * Repeat if there is more work to be done:
+ */
+ status = intel_pmu_get_status();
+ if (status)
+ goto again;
+
+done:
+ /* Only restore PMU state when it's active. See x86_pmu_disable(). */
+ cpuc->enabled = pmu_enabled;
+ if (pmu_enabled)
+ __intel_pmu_enable_all(0, true);
+ intel_bts_enable_local();
+
+ /*
+ * Only unmask the NMI after the overflow counters
+ * have been reset. This avoids spurious NMIs on
+ * Haswell CPUs.
+ */
+ if (x86_pmu.late_ack)
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ return handled;
+}
+
+static struct event_constraint *
+intel_bts_constraints(struct perf_event *event)
+{
+ if (unlikely(intel_pmu_has_bts(event)))
+ return &bts_constraint;
+
+ return NULL;
+}
+
+static int intel_alt_er(int idx, u64 config)
+{
+ int alt_idx = idx;
+
+ if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
+ return idx;
+
+ if (idx == EXTRA_REG_RSP_0)
+ alt_idx = EXTRA_REG_RSP_1;
+
+ if (idx == EXTRA_REG_RSP_1)
+ alt_idx = EXTRA_REG_RSP_0;
+
+ if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask)
+ return idx;
+
+ return alt_idx;
+}
+
+static void intel_fixup_er(struct perf_event *event, int idx)
+{
+ event->hw.extra_reg.idx = idx;
+
+ if (idx == EXTRA_REG_RSP_0) {
+ event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+ event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event;
+ event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
+ } else if (idx == EXTRA_REG_RSP_1) {
+ event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+ event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event;
+ event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
+ }
+}
+
+/*
+ * manage allocation of shared extra msr for certain events
+ *
+ * sharing can be:
+ * per-cpu: to be shared between the various events on a single PMU
+ * per-core: per-cpu + shared by HT threads
+ */
+static struct event_constraint *
+__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event,
+ struct hw_perf_event_extra *reg)
+{
+ struct event_constraint *c = &emptyconstraint;
+ struct er_account *era;
+ unsigned long flags;
+ int idx = reg->idx;
+
+ /*
+ * reg->alloc can be set due to existing state, so for fake cpuc we
+ * need to ignore this, otherwise we might fail to allocate proper fake
+ * state for this extra reg constraint. Also see the comment below.
+ */
+ if (reg->alloc && !cpuc->is_fake)
+ return NULL; /* call x86_get_event_constraint() */
+
+again:
+ era = &cpuc->shared_regs->regs[idx];
+ /*
+ * we use spin_lock_irqsave() to avoid lockdep issues when
+ * passing a fake cpuc
+ */
+ raw_spin_lock_irqsave(&era->lock, flags);
+
+ if (!atomic_read(&era->ref) || era->config == reg->config) {
+
+ /*
+ * If its a fake cpuc -- as per validate_{group,event}() we
+ * shouldn't touch event state and we can avoid doing so
+ * since both will only call get_event_constraints() once
+ * on each event, this avoids the need for reg->alloc.
+ *
+ * Not doing the ER fixup will only result in era->reg being
+ * wrong, but since we won't actually try and program hardware
+ * this isn't a problem either.
+ */
+ if (!cpuc->is_fake) {
+ if (idx != reg->idx)
+ intel_fixup_er(event, idx);
+
+ /*
+ * x86_schedule_events() can call get_event_constraints()
+ * multiple times on events in the case of incremental
+ * scheduling(). reg->alloc ensures we only do the ER
+ * allocation once.
+ */
+ reg->alloc = 1;
+ }
+
+ /* lock in msr value */
+ era->config = reg->config;
+ era->reg = reg->reg;
+
+ /* one more user */
+ atomic_inc(&era->ref);
+
+ /*
+ * need to call x86_get_event_constraint()
+ * to check if associated event has constraints
+ */
+ c = NULL;
+ } else {
+ idx = intel_alt_er(idx, reg->config);
+ if (idx != reg->idx) {
+ raw_spin_unlock_irqrestore(&era->lock, flags);
+ goto again;
+ }
+ }
+ raw_spin_unlock_irqrestore(&era->lock, flags);
+
+ return c;
+}
+
+static void
+__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
+ struct hw_perf_event_extra *reg)
+{
+ struct er_account *era;
+
+ /*
+ * Only put constraint if extra reg was actually allocated. Also takes
+ * care of event which do not use an extra shared reg.
+ *
+ * Also, if this is a fake cpuc we shouldn't touch any event state
+ * (reg->alloc) and we don't care about leaving inconsistent cpuc state
+ * either since it'll be thrown out.
+ */
+ if (!reg->alloc || cpuc->is_fake)
+ return;
+
+ era = &cpuc->shared_regs->regs[reg->idx];
+
+ /* one fewer user */
+ atomic_dec(&era->ref);
+
+ /* allocate again next time */
+ reg->alloc = 0;
+}
+
+static struct event_constraint *
+intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct event_constraint *c = NULL, *d;
+ struct hw_perf_event_extra *xreg, *breg;
+
+ xreg = &event->hw.extra_reg;
+ if (xreg->idx != EXTRA_REG_NONE) {
+ c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
+ if (c == &emptyconstraint)
+ return c;
+ }
+ breg = &event->hw.branch_reg;
+ if (breg->idx != EXTRA_REG_NONE) {
+ d = __intel_shared_reg_get_constraints(cpuc, event, breg);
+ if (d == &emptyconstraint) {
+ __intel_shared_reg_put_constraints(cpuc, xreg);
+ c = d;
+ }
+ }
+ return c;
+}
+
+struct event_constraint *
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ if (x86_pmu.event_constraints) {
+ for_each_event_constraint(c, x86_pmu.event_constraints) {
+ if ((event->hw.config & c->cmask) == c->code) {
+ event->hw.flags |= c->flags;
+ return c;
+ }
+ }
+ }
+
+ return &unconstrained;
+}
+
+static struct event_constraint *
+__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ c = intel_bts_constraints(event);
+ if (c)
+ return c;
+
+ c = intel_shared_regs_constraints(cpuc, event);
+ if (c)
+ return c;
+
+ c = intel_pebs_constraints(event);
+ if (c)
+ return c;
+
+ return x86_get_event_constraints(cpuc, idx, event);
+}
+
+static void
+intel_start_scheduling(struct cpu_hw_events *cpuc)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xl;
+ int tid = cpuc->excl_thread_id;
+
+ /*
+ * nothing needed if in group validation mode
+ */
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return;
+
+ /*
+ * no exclusion needed
+ */
+ if (WARN_ON_ONCE(!excl_cntrs))
+ return;
+
+ xl = &excl_cntrs->states[tid];
+
+ xl->sched_started = true;
+ /*
+ * lock shared state until we are done scheduling
+ * in stop_event_scheduling()
+ * makes scheduling appear as a transaction
+ */
+ raw_spin_lock(&excl_cntrs->lock);
+}
+
+static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct event_constraint *c = cpuc->event_constraint[idx];
+ struct intel_excl_states *xl;
+ int tid = cpuc->excl_thread_id;
+
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return;
+
+ if (WARN_ON_ONCE(!excl_cntrs))
+ return;
+
+ if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
+ return;
+
+ xl = &excl_cntrs->states[tid];
+
+ lockdep_assert_held(&excl_cntrs->lock);
+
+ if (c->flags & PERF_X86_EVENT_EXCL)
+ xl->state[cntr] = INTEL_EXCL_EXCLUSIVE;
+ else
+ xl->state[cntr] = INTEL_EXCL_SHARED;
+}
+
+static void
+intel_stop_scheduling(struct cpu_hw_events *cpuc)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xl;
+ int tid = cpuc->excl_thread_id;
+
+ /*
+ * nothing needed if in group validation mode
+ */
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return;
+ /*
+ * no exclusion needed
+ */
+ if (WARN_ON_ONCE(!excl_cntrs))
+ return;
+
+ xl = &excl_cntrs->states[tid];
+
+ xl->sched_started = false;
+ /*
+ * release shared state lock (acquired in intel_start_scheduling())
+ */
+ raw_spin_unlock(&excl_cntrs->lock);
+}
+
+static struct event_constraint *
+dyn_constraint(struct cpu_hw_events *cpuc, struct event_constraint *c, int idx)
+{
+ WARN_ON_ONCE(!cpuc->constraint_list);
+
+ if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
+ struct event_constraint *cx;
+
+ /*
+ * grab pre-allocated constraint entry
+ */
+ cx = &cpuc->constraint_list[idx];
+
+ /*
+ * initialize dynamic constraint
+ * with static constraint
+ */
+ *cx = *c;
+
+ /*
+ * mark constraint as dynamic
+ */
+ cx->flags |= PERF_X86_EVENT_DYNAMIC;
+ c = cx;
+ }
+
+ return c;
+}
+
+static struct event_constraint *
+intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+ int idx, struct event_constraint *c)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xlo;
+ int tid = cpuc->excl_thread_id;
+ int is_excl, i;
+
+ /*
+ * validating a group does not require
+ * enforcing cross-thread exclusion
+ */
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return c;
+
+ /*
+ * no exclusion needed
+ */
+ if (WARN_ON_ONCE(!excl_cntrs))
+ return c;
+
+ /*
+ * because we modify the constraint, we need
+ * to make a copy. Static constraints come
+ * from static const tables.
+ *
+ * only needed when constraint has not yet
+ * been cloned (marked dynamic)
+ */
+ c = dyn_constraint(cpuc, c, idx);
+
+ /*
+ * From here on, the constraint is dynamic.
+ * Either it was just allocated above, or it
+ * was allocated during a earlier invocation
+ * of this function
+ */
+
+ /*
+ * state of sibling HT
+ */
+ xlo = &excl_cntrs->states[tid ^ 1];
+
+ /*
+ * event requires exclusive counter access
+ * across HT threads
+ */
+ is_excl = c->flags & PERF_X86_EVENT_EXCL;
+ if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
+ event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
+ if (!cpuc->n_excl++)
+ WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
+ }
+
+ /*
+ * Modify static constraint with current dynamic
+ * state of thread
+ *
+ * EXCLUSIVE: sibling counter measuring exclusive event
+ * SHARED : sibling counter measuring non-exclusive event
+ * UNUSED : sibling counter unused
+ */
+ for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) {
+ /*
+ * exclusive event in sibling counter
+ * our corresponding counter cannot be used
+ * regardless of our event
+ */
+ if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE)
+ __clear_bit(i, c->idxmsk);
+ /*
+ * if measuring an exclusive event, sibling
+ * measuring non-exclusive, then counter cannot
+ * be used
+ */
+ if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED)
+ __clear_bit(i, c->idxmsk);
+ }
+
+ /*
+ * recompute actual bit weight for scheduling algorithm
+ */
+ c->weight = hweight64(c->idxmsk64);
+
+ /*
+ * if we return an empty mask, then switch
+ * back to static empty constraint to avoid
+ * the cost of freeing later on
+ */
+ if (c->weight == 0)
+ c = &emptyconstraint;
+
+ return c;
+}
+
+static struct event_constraint *
+intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c1 = NULL;
+ struct event_constraint *c2;
+
+ if (idx >= 0) /* fake does < 0 */
+ c1 = cpuc->event_constraint[idx];
+
+ /*
+ * first time only
+ * - static constraint: no change across incremental scheduling calls
+ * - dynamic constraint: handled by intel_get_excl_constraints()
+ */
+ c2 = __intel_get_event_constraints(cpuc, idx, event);
+ if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) {
+ bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
+ c1->weight = c2->weight;
+ c2 = c1;
+ }
+
+ if (cpuc->excl_cntrs)
+ return intel_get_excl_constraints(cpuc, event, idx, c2);
+
+ return c2;
+}
+
+static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ int tid = cpuc->excl_thread_id;
+ struct intel_excl_states *xl;
+
+ /*
+ * nothing needed if in group validation mode
+ */
+ if (cpuc->is_fake)
+ return;
+
+ if (WARN_ON_ONCE(!excl_cntrs))
+ return;
+
+ if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
+ hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
+ if (!--cpuc->n_excl)
+ WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
+ }
+
+ /*
+ * If event was actually assigned, then mark the counter state as
+ * unused now.
+ */
+ if (hwc->idx >= 0) {
+ xl = &excl_cntrs->states[tid];
+
+ /*
+ * put_constraint may be called from x86_schedule_events()
+ * which already has the lock held so here make locking
+ * conditional.
+ */
+ if (!xl->sched_started)
+ raw_spin_lock(&excl_cntrs->lock);
+
+ xl->state[hwc->idx] = INTEL_EXCL_UNUSED;
+
+ if (!xl->sched_started)
+ raw_spin_unlock(&excl_cntrs->lock);
+ }
+}
+
+static void
+intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg;
+
+ reg = &event->hw.extra_reg;
+ if (reg->idx != EXTRA_REG_NONE)
+ __intel_shared_reg_put_constraints(cpuc, reg);
+
+ reg = &event->hw.branch_reg;
+ if (reg->idx != EXTRA_REG_NONE)
+ __intel_shared_reg_put_constraints(cpuc, reg);
+}
+
+static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ intel_put_shared_regs_event_constraints(cpuc, event);
+
+ /*
+ * is PMU has exclusive counter restrictions, then
+ * all events are subject to and must call the
+ * put_excl_constraints() routine
+ */
+ if (cpuc->excl_cntrs)
+ intel_put_excl_constraints(cpuc, event);
+}
+
+static void intel_pebs_aliases_core2(struct perf_event *event)
+{
+ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ /*
+ * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+ * (0x003c) so that we can use it with PEBS.
+ *
+ * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+ * PEBS capable. However we can use INST_RETIRED.ANY_P
+ * (0x00c0), which is a PEBS capable event, to get the same
+ * count.
+ *
+ * INST_RETIRED.ANY_P counts the number of cycles that retires
+ * CNTMASK instructions. By setting CNTMASK to a value (16)
+ * larger than the maximum number of instructions that can be
+ * retired per cycle (4) and then inverting the condition, we
+ * count all cycles that retire 16 or less instructions, which
+ * is every cycle.
+ *
+ * Thereby we gain a PEBS capable cycle counter.
+ */
+ u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
+
+ alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+ event->hw.config = alt_config;
+ }
+}
+
+static void intel_pebs_aliases_snb(struct perf_event *event)
+{
+ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ /*
+ * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+ * (0x003c) so that we can use it with PEBS.
+ *
+ * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+ * PEBS capable. However we can use UOPS_RETIRED.ALL
+ * (0x01c2), which is a PEBS capable event, to get the same
+ * count.
+ *
+ * UOPS_RETIRED.ALL counts the number of cycles that retires
+ * CNTMASK micro-ops. By setting CNTMASK to a value (16)
+ * larger than the maximum number of micro-ops that can be
+ * retired per cycle (4) and then inverting the condition, we
+ * count all cycles that retire 16 or less micro-ops, which
+ * is every cycle.
+ *
+ * Thereby we gain a PEBS capable cycle counter.
+ */
+ u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
+
+ alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+ event->hw.config = alt_config;
+ }
+}
+
+static void intel_pebs_aliases_precdist(struct perf_event *event)
+{
+ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ /*
+ * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+ * (0x003c) so that we can use it with PEBS.
+ *
+ * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+ * PEBS capable. However we can use INST_RETIRED.PREC_DIST
+ * (0x01c0), which is a PEBS capable event, to get the same
+ * count.
+ *
+ * The PREC_DIST event has special support to minimize sample
+ * shadowing effects. One drawback is that it can be
+ * only programmed on counter 1, but that seems like an
+ * acceptable trade off.
+ */
+ u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16);
+
+ alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+ event->hw.config = alt_config;
+ }
+}
+
+static void intel_pebs_aliases_ivb(struct perf_event *event)
+{
+ if (event->attr.precise_ip < 3)
+ return intel_pebs_aliases_snb(event);
+ return intel_pebs_aliases_precdist(event);
+}
+
+static void intel_pebs_aliases_skl(struct perf_event *event)
+{
+ if (event->attr.precise_ip < 3)
+ return intel_pebs_aliases_core2(event);
+ return intel_pebs_aliases_precdist(event);
+}
+
+static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event)
+{
+ unsigned long flags = x86_pmu.large_pebs_flags;
+
+ if (event->attr.use_clockid)
+ flags &= ~PERF_SAMPLE_TIME;
+ if (!event->attr.exclude_kernel)
+ flags &= ~PERF_SAMPLE_REGS_USER;
+ if (event->attr.sample_regs_user & ~PEBS_GP_REGS)
+ flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR);
+ return flags;
+}
+
+static int intel_pmu_bts_config(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+
+ if (unlikely(intel_pmu_has_bts(event))) {
+ /* BTS is not supported by this architecture. */
+ if (!x86_pmu.bts_active)
+ return -EOPNOTSUPP;
+
+ /* BTS is currently only allowed for user-mode. */
+ if (!attr->exclude_kernel)
+ return -EOPNOTSUPP;
+
+ /* BTS is not allowed for precise events. */
+ if (attr->precise_ip)
+ return -EOPNOTSUPP;
+
+ /* disallow bts if conflicting events are present */
+ if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+ return -EBUSY;
+
+ event->destroy = hw_perf_lbr_event_destroy;
+ }
+
+ return 0;
+}
+
+static int core_pmu_hw_config(struct perf_event *event)
+{
+ int ret = x86_pmu_hw_config(event);
+
+ if (ret)
+ return ret;
+
+ return intel_pmu_bts_config(event);
+}
+
+static int intel_pmu_hw_config(struct perf_event *event)
+{
+ int ret = x86_pmu_hw_config(event);
+
+ if (ret)
+ return ret;
+
+ ret = intel_pmu_bts_config(event);
+ if (ret)
+ return ret;
+
+ if (event->attr.precise_ip) {
+ if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) {
+ event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
+ if (!(event->attr.sample_type &
+ ~intel_pmu_large_pebs_flags(event)))
+ event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
+ }
+ if (x86_pmu.pebs_aliases)
+ x86_pmu.pebs_aliases(event);
+
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY;
+ }
+
+ if (needs_branch_stack(event)) {
+ ret = intel_pmu_setup_lbr_filter(event);
+ if (ret)
+ return ret;
+
+ /*
+ * BTS is set up earlier in this path, so don't account twice
+ */
+ if (!unlikely(intel_pmu_has_bts(event))) {
+ /* disallow lbr if conflicting events are present */
+ if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+ return -EBUSY;
+
+ event->destroy = hw_perf_lbr_event_destroy;
+ }
+ }
+
+ if (event->attr.type != PERF_TYPE_RAW)
+ return 0;
+
+ if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
+ return 0;
+
+ if (x86_pmu.version < 3)
+ return -EINVAL;
+
+ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
+
+ return 0;
+}
+
+struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
+{
+ if (x86_pmu.guest_get_msrs)
+ return x86_pmu.guest_get_msrs(nr);
+ *nr = 0;
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
+
+static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
+
+ arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
+ arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
+ arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;
+ /*
+ * If PMU counter has PEBS enabled it is not enough to disable counter
+ * on a guest entry since PEBS memory write can overshoot guest entry
+ * and corrupt guest memory. Disabling PEBS solves the problem.
+ */
+ arr[1].msr = MSR_IA32_PEBS_ENABLE;
+ arr[1].host = cpuc->pebs_enabled;
+ arr[1].guest = 0;
+
+ *nr = 2;
+ return arr;
+}
+
+static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
+ int idx;
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ struct perf_event *event = cpuc->events[idx];
+
+ arr[idx].msr = x86_pmu_config_addr(idx);
+ arr[idx].host = arr[idx].guest = 0;
+
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ arr[idx].host = arr[idx].guest =
+ event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE;
+
+ if (event->attr.exclude_host)
+ arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+ else if (event->attr.exclude_guest)
+ arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+ }
+
+ *nr = x86_pmu.num_counters;
+ return arr;
+}
+
+static void core_pmu_enable_event(struct perf_event *event)
+{
+ if (!event->attr.exclude_host)
+ x86_pmu_enable_event(event);
+}
+
+static void core_pmu_enable_all(int added)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int idx;
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
+
+ if (!test_bit(idx, cpuc->active_mask) ||
+ cpuc->events[idx]->attr.exclude_host)
+ continue;
+
+ __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+ }
+}
+
+static int hsw_hw_config(struct perf_event *event)
+{
+ int ret = intel_pmu_hw_config(event);
+
+ if (ret)
+ return ret;
+ if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE))
+ return 0;
+ event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED);
+
+ /*
+ * IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with
+ * PEBS or in ANY thread mode. Since the results are non-sensical forbid
+ * this combination.
+ */
+ if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) &&
+ ((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) ||
+ event->attr.precise_ip > 0))
+ return -EOPNOTSUPP;
+
+ if (event_is_checkpointed(event)) {
+ /*
+ * Sampling of checkpointed events can cause situations where
+ * the CPU constantly aborts because of a overflow, which is
+ * then checkpointed back and ignored. Forbid checkpointing
+ * for sampling.
+ *
+ * But still allow a long sampling period, so that perf stat
+ * from KVM works.
+ */
+ if (event->attr.sample_period > 0 &&
+ event->attr.sample_period < 0x7fffffff)
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
+static struct event_constraint counter0_constraint =
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x1);
+
+static struct event_constraint counter2_constraint =
+ EVENT_CONSTRAINT(0, 0x4, 0);
+
+static struct event_constraint *
+hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ c = intel_get_event_constraints(cpuc, idx, event);
+
+ /* Handle special quirk on in_tx_checkpointed only in counter 2 */
+ if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
+ if (c->idxmsk64 & (1U << 2))
+ return &counter2_constraint;
+ return &emptyconstraint;
+ }
+
+ return c;
+}
+
+static struct event_constraint *
+glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ /* :ppp means to do reduced skid PEBS which is PMC0 only. */
+ if (event->attr.precise_ip == 3)
+ return &counter0_constraint;
+
+ c = intel_get_event_constraints(cpuc, idx, event);
+
+ return c;
+}
+
+static bool allow_tsx_force_abort = true;
+
+static struct event_constraint *
+tfa_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c = hsw_get_event_constraints(cpuc, idx, event);
+
+ /*
+ * Without TFA we must not use PMC3.
+ */
+ if (!allow_tsx_force_abort && test_bit(3, c->idxmsk) && idx >= 0) {
+ c = dyn_constraint(cpuc, c, idx);
+ c->idxmsk64 &= ~(1ULL << 3);
+ c->weight--;
+ }
+
+ return c;
+}
+
+/*
+ * Broadwell:
+ *
+ * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
+ * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
+ * the two to enforce a minimum period of 128 (the smallest value that has bits
+ * 0-5 cleared and >= 100).
+ *
+ * Because of how the code in x86_perf_event_set_period() works, the truncation
+ * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
+ * to make up for the 'lost' events due to carrying the 'error' in period_left.
+ *
+ * Therefore the effective (average) period matches the requested period,
+ * despite coarser hardware granularity.
+ */
+static u64 bdw_limit_period(struct perf_event *event, u64 left)
+{
+ if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+ X86_CONFIG(.event=0xc0, .umask=0x01)) {
+ if (left < 128)
+ left = 128;
+ left &= ~0x3fULL;
+ }
+ return left;
+}
+
+static u64 nhm_limit_period(struct perf_event *event, u64 left)
+{
+ return max(left, 32ULL);
+}
+
+PMU_FORMAT_ATTR(event, "config:0-7" );
+PMU_FORMAT_ATTR(umask, "config:8-15" );
+PMU_FORMAT_ATTR(edge, "config:18" );
+PMU_FORMAT_ATTR(pc, "config:19" );
+PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */
+PMU_FORMAT_ATTR(inv, "config:23" );
+PMU_FORMAT_ATTR(cmask, "config:24-31" );
+PMU_FORMAT_ATTR(in_tx, "config:32");
+PMU_FORMAT_ATTR(in_tx_cp, "config:33");
+
+static struct attribute *intel_arch_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_pc.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+ NULL,
+};
+
+ssize_t intel_event_sysfs_show(char *page, u64 config)
+{
+ u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
+
+ return x86_event_sysfs_show(page, config, event);
+}
+
+static struct intel_shared_regs *allocate_shared_regs(int cpu)
+{
+ struct intel_shared_regs *regs;
+ int i;
+
+ regs = kzalloc_node(sizeof(struct intel_shared_regs),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (regs) {
+ /*
+ * initialize the locks to keep lockdep happy
+ */
+ for (i = 0; i < EXTRA_REG_MAX; i++)
+ raw_spin_lock_init(&regs->regs[i].lock);
+
+ regs->core_id = -1;
+ }
+ return regs;
+}
+
+static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
+{
+ struct intel_excl_cntrs *c;
+
+ c = kzalloc_node(sizeof(struct intel_excl_cntrs),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (c) {
+ raw_spin_lock_init(&c->lock);
+ c->core_id = -1;
+ }
+ return c;
+}
+
+
+int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
+{
+ if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
+ cpuc->shared_regs = allocate_shared_regs(cpu);
+ if (!cpuc->shared_regs)
+ goto err;
+ }
+
+ if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA)) {
+ size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
+
+ cpuc->constraint_list = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
+ if (!cpuc->constraint_list)
+ goto err_shared_regs;
+ }
+
+ if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+ cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
+ if (!cpuc->excl_cntrs)
+ goto err_constraint_list;
+
+ cpuc->excl_thread_id = 0;
+ }
+
+ return 0;
+
+err_constraint_list:
+ kfree(cpuc->constraint_list);
+ cpuc->constraint_list = NULL;
+
+err_shared_regs:
+ kfree(cpuc->shared_regs);
+ cpuc->shared_regs = NULL;
+
+err:
+ return -ENOMEM;
+}
+
+static int intel_pmu_cpu_prepare(int cpu)
+{
+ return intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu);
+}
+
+static void flip_smm_bit(void *data)
+{
+ unsigned long set = *(unsigned long *)data;
+
+ if (set > 0) {
+ msr_set_bit(MSR_IA32_DEBUGCTLMSR,
+ DEBUGCTLMSR_FREEZE_IN_SMM_BIT);
+ } else {
+ msr_clear_bit(MSR_IA32_DEBUGCTLMSR,
+ DEBUGCTLMSR_FREEZE_IN_SMM_BIT);
+ }
+}
+
+static void intel_pmu_cpu_starting(int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ int core_id = topology_core_id(cpu);
+ int i;
+
+ init_debug_store_on_cpu(cpu);
+ /*
+ * Deal with CPUs that don't clear their LBRs on power-up.
+ */
+ intel_pmu_lbr_reset();
+
+ cpuc->lbr_sel = NULL;
+
+ if (x86_pmu.flags & PMU_FL_TFA) {
+ WARN_ON_ONCE(cpuc->tfa_shadow);
+ cpuc->tfa_shadow = ~0ULL;
+ intel_set_tfa(cpuc, false);
+ }
+
+ if (x86_pmu.version > 1)
+ flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
+
+ if (!cpuc->shared_regs)
+ return;
+
+ if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
+ for_each_cpu(i, topology_sibling_cpumask(cpu)) {
+ struct intel_shared_regs *pc;
+
+ pc = per_cpu(cpu_hw_events, i).shared_regs;
+ if (pc && pc->core_id == core_id) {
+ cpuc->kfree_on_online[0] = cpuc->shared_regs;
+ cpuc->shared_regs = pc;
+ break;
+ }
+ }
+ cpuc->shared_regs->core_id = core_id;
+ cpuc->shared_regs->refcnt++;
+ }
+
+ if (x86_pmu.lbr_sel_map)
+ cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
+
+ if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+ for_each_cpu(i, topology_sibling_cpumask(cpu)) {
+ struct cpu_hw_events *sibling;
+ struct intel_excl_cntrs *c;
+
+ sibling = &per_cpu(cpu_hw_events, i);
+ c = sibling->excl_cntrs;
+ if (c && c->core_id == core_id) {
+ cpuc->kfree_on_online[1] = cpuc->excl_cntrs;
+ cpuc->excl_cntrs = c;
+ if (!sibling->excl_thread_id)
+ cpuc->excl_thread_id = 1;
+ break;
+ }
+ }
+ cpuc->excl_cntrs->core_id = core_id;
+ cpuc->excl_cntrs->refcnt++;
+ }
+}
+
+static void free_excl_cntrs(struct cpu_hw_events *cpuc)
+{
+ struct intel_excl_cntrs *c;
+
+ c = cpuc->excl_cntrs;
+ if (c) {
+ if (c->core_id == -1 || --c->refcnt == 0)
+ kfree(c);
+ cpuc->excl_cntrs = NULL;
+ }
+
+ kfree(cpuc->constraint_list);
+ cpuc->constraint_list = NULL;
+}
+
+static void intel_pmu_cpu_dying(int cpu)
+{
+ fini_debug_store_on_cpu(cpu);
+}
+
+void intel_cpuc_finish(struct cpu_hw_events *cpuc)
+{
+ struct intel_shared_regs *pc;
+
+ pc = cpuc->shared_regs;
+ if (pc) {
+ if (pc->core_id == -1 || --pc->refcnt == 0)
+ kfree(pc);
+ cpuc->shared_regs = NULL;
+ }
+
+ free_excl_cntrs(cpuc);
+}
+
+static void intel_pmu_cpu_dead(int cpu)
+{
+ intel_cpuc_finish(&per_cpu(cpu_hw_events, cpu));
+}
+
+static void intel_pmu_sched_task(struct perf_event_context *ctx,
+ bool sched_in)
+{
+ intel_pmu_pebs_sched_task(ctx, sched_in);
+ intel_pmu_lbr_sched_task(ctx, sched_in);
+}
+
+static int intel_pmu_check_period(struct perf_event *event, u64 value)
+{
+ return intel_pmu_has_bts_period(event, value) ? -EINVAL : 0;
+}
+
+PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
+
+PMU_FORMAT_ATTR(ldlat, "config1:0-15");
+
+PMU_FORMAT_ATTR(frontend, "config1:0-23");
+
+static struct attribute *intel_arch3_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_pc.attr,
+ &format_attr_any.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+ NULL,
+};
+
+static struct attribute *hsw_format_attr[] = {
+ &format_attr_in_tx.attr,
+ &format_attr_in_tx_cp.attr,
+ &format_attr_offcore_rsp.attr,
+ &format_attr_ldlat.attr,
+ NULL
+};
+
+static struct attribute *nhm_format_attr[] = {
+ &format_attr_offcore_rsp.attr,
+ &format_attr_ldlat.attr,
+ NULL
+};
+
+static struct attribute *slm_format_attr[] = {
+ &format_attr_offcore_rsp.attr,
+ NULL
+};
+
+static struct attribute *skl_format_attr[] = {
+ &format_attr_frontend.attr,
+ NULL,
+};
+
+static __initconst const struct x86_pmu core_pmu = {
+ .name = "core",
+ .handle_irq = x86_pmu_handle_irq,
+ .disable_all = x86_pmu_disable_all,
+ .enable_all = core_pmu_enable_all,
+ .enable = core_pmu_enable_event,
+ .disable = x86_pmu_disable_event,
+ .hw_config = core_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
+ .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
+ .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
+ .event_map = intel_pmu_event_map,
+ .max_events = ARRAY_SIZE(intel_perfmon_event_map),
+ .apic = 1,
+ .large_pebs_flags = LARGE_PEBS_FLAGS,
+
+ /*
+ * Intel PMCs cannot be accessed sanely above 32-bit width,
+ * so we install an artificial 1<<31 period regardless of
+ * the generic event period:
+ */
+ .max_period = (1ULL<<31) - 1,
+ .get_event_constraints = intel_get_event_constraints,
+ .put_event_constraints = intel_put_event_constraints,
+ .event_constraints = intel_core_event_constraints,
+ .guest_get_msrs = core_guest_get_msrs,
+ .format_attrs = intel_arch_formats_attr,
+ .events_sysfs_show = intel_event_sysfs_show,
+
+ /*
+ * Virtual (or funny metal) CPU can define x86_pmu.extra_regs
+ * together with PMU version 1 and thus be using core_pmu with
+ * shared_regs. We need following callbacks here to allocate
+ * it properly.
+ */
+ .cpu_prepare = intel_pmu_cpu_prepare,
+ .cpu_starting = intel_pmu_cpu_starting,
+ .cpu_dying = intel_pmu_cpu_dying,
+ .cpu_dead = intel_pmu_cpu_dead,
+
+ .check_period = intel_pmu_check_period,
+};
+
+static struct attribute *intel_pmu_attrs[];
+
+static __initconst const struct x86_pmu intel_pmu = {
+ .name = "Intel",
+ .handle_irq = intel_pmu_handle_irq,
+ .disable_all = intel_pmu_disable_all,
+ .enable_all = intel_pmu_enable_all,
+ .enable = intel_pmu_enable_event,
+ .disable = intel_pmu_disable_event,
+ .add = intel_pmu_add_event,
+ .del = intel_pmu_del_event,
+ .read = intel_pmu_read_event,
+ .hw_config = intel_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
+ .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
+ .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
+ .event_map = intel_pmu_event_map,
+ .max_events = ARRAY_SIZE(intel_perfmon_event_map),
+ .apic = 1,
+ .large_pebs_flags = LARGE_PEBS_FLAGS,
+ /*
+ * Intel PMCs cannot be accessed sanely above 32 bit width,
+ * so we install an artificial 1<<31 period regardless of
+ * the generic event period:
+ */
+ .max_period = (1ULL << 31) - 1,
+ .get_event_constraints = intel_get_event_constraints,
+ .put_event_constraints = intel_put_event_constraints,
+ .pebs_aliases = intel_pebs_aliases_core2,
+
+ .format_attrs = intel_arch3_formats_attr,
+ .events_sysfs_show = intel_event_sysfs_show,
+
+ .attrs = intel_pmu_attrs,
+
+ .cpu_prepare = intel_pmu_cpu_prepare,
+ .cpu_starting = intel_pmu_cpu_starting,
+ .cpu_dying = intel_pmu_cpu_dying,
+ .cpu_dead = intel_pmu_cpu_dead,
+
+ .guest_get_msrs = intel_guest_get_msrs,
+ .sched_task = intel_pmu_sched_task,
+
+ .check_period = intel_pmu_check_period,
+};
+
+static __init void intel_clovertown_quirk(void)
+{
+ /*
+ * PEBS is unreliable due to:
+ *
+ * AJ67 - PEBS may experience CPL leaks
+ * AJ68 - PEBS PMI may be delayed by one event
+ * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
+ * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
+ *
+ * AJ67 could be worked around by restricting the OS/USR flags.
+ * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
+ *
+ * AJ106 could possibly be worked around by not allowing LBR
+ * usage from PEBS, including the fixup.
+ * AJ68 could possibly be worked around by always programming
+ * a pebs_event_reset[0] value and coping with the lost events.
+ *
+ * But taken together it might just make sense to not enable PEBS on
+ * these chips.
+ */
+ pr_warn("PEBS disabled due to CPU errata\n");
+ x86_pmu.pebs = 0;
+ x86_pmu.pebs_constraints = NULL;
+}
+
+static int intel_snb_pebs_broken(int cpu)
+{
+ u32 rev = UINT_MAX; /* default to broken for unknown models */
+
+ switch (cpu_data(cpu).x86_model) {
+ case INTEL_FAM6_SANDYBRIDGE:
+ rev = 0x28;
+ break;
+
+ case INTEL_FAM6_SANDYBRIDGE_X:
+ switch (cpu_data(cpu).x86_stepping) {
+ case 6: rev = 0x618; break;
+ case 7: rev = 0x70c; break;
+ }
+ }
+
+ return (cpu_data(cpu).microcode < rev);
+}
+
+static void intel_snb_check_microcode(void)
+{
+ int pebs_broken = 0;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ if ((pebs_broken = intel_snb_pebs_broken(cpu)))
+ break;
+ }
+
+ if (pebs_broken == x86_pmu.pebs_broken)
+ return;
+
+ /*
+ * Serialized by the microcode lock..
+ */
+ if (x86_pmu.pebs_broken) {
+ pr_info("PEBS enabled due to microcode update\n");
+ x86_pmu.pebs_broken = 0;
+ } else {
+ pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n");
+ x86_pmu.pebs_broken = 1;
+ }
+}
+
+static bool is_lbr_from(unsigned long msr)
+{
+ unsigned long lbr_from_nr = x86_pmu.lbr_from + x86_pmu.lbr_nr;
+
+ return x86_pmu.lbr_from <= msr && msr < lbr_from_nr;
+}
+
+/*
+ * Under certain circumstances, access certain MSR may cause #GP.
+ * The function tests if the input MSR can be safely accessed.
+ */
+static bool check_msr(unsigned long msr, u64 mask)
+{
+ u64 val_old, val_new, val_tmp;
+
+ /*
+ * Read the current value, change it and read it back to see if it
+ * matches, this is needed to detect certain hardware emulators
+ * (qemu/kvm) that don't trap on the MSR access and always return 0s.
+ */
+ if (rdmsrl_safe(msr, &val_old))
+ return false;
+
+ /*
+ * Only change the bits which can be updated by wrmsrl.
+ */
+ val_tmp = val_old ^ mask;
+
+ if (is_lbr_from(msr))
+ val_tmp = lbr_from_signext_quirk_wr(val_tmp);
+
+ if (wrmsrl_safe(msr, val_tmp) ||
+ rdmsrl_safe(msr, &val_new))
+ return false;
+
+ /*
+ * Quirk only affects validation in wrmsr(), so wrmsrl()'s value
+ * should equal rdmsrl()'s even with the quirk.
+ */
+ if (val_new != val_tmp)
+ return false;
+
+ if (is_lbr_from(msr))
+ val_old = lbr_from_signext_quirk_wr(val_old);
+
+ /* Here it's sure that the MSR can be safely accessed.
+ * Restore the old value and return.
+ */
+ wrmsrl(msr, val_old);
+
+ return true;
+}
+
+static __init void intel_sandybridge_quirk(void)
+{
+ x86_pmu.check_microcode = intel_snb_check_microcode;
+ cpus_read_lock();
+ intel_snb_check_microcode();
+ cpus_read_unlock();
+}
+
+static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
+ { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
+ { PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
+ { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
+ { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
+ { PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
+ { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
+ { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
+};
+
+static __init void intel_arch_events_quirk(void)
+{
+ int bit;
+
+ /* disable event that reported as not presend by cpuid */
+ for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
+ intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
+ pr_warn("CPUID marked event: \'%s\' unavailable\n",
+ intel_arch_events_map[bit].name);
+ }
+}
+
+static __init void intel_nehalem_quirk(void)
+{
+ union cpuid10_ebx ebx;
+
+ ebx.full = x86_pmu.events_maskl;
+ if (ebx.split.no_branch_misses_retired) {
+ /*
+ * Erratum AAJ80 detected, we work it around by using
+ * the BR_MISP_EXEC.ANY event. This will over-count
+ * branch-misses, but it's still much better than the
+ * architectural event which is often completely bogus:
+ */
+ intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
+ ebx.split.no_branch_misses_retired = 0;
+ x86_pmu.events_maskl = ebx.full;
+ pr_info("CPU erratum AAJ80 worked around\n");
+ }
+}
+
+/*
+ * enable software workaround for errata:
+ * SNB: BJ122
+ * IVB: BV98
+ * HSW: HSD29
+ *
+ * Only needed when HT is enabled. However detecting
+ * if HT is enabled is difficult (model specific). So instead,
+ * we enable the workaround in the early boot, and verify if
+ * it is needed in a later initcall phase once we have valid
+ * topology information to check if HT is actually enabled
+ */
+static __init void intel_ht_bug(void)
+{
+ x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
+
+ x86_pmu.start_scheduling = intel_start_scheduling;
+ x86_pmu.commit_scheduling = intel_commit_scheduling;
+ x86_pmu.stop_scheduling = intel_stop_scheduling;
+}
+
+EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3");
+EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82")
+
+/* Haswell special events */
+EVENT_ATTR_STR(tx-start, tx_start, "event=0xc9,umask=0x1");
+EVENT_ATTR_STR(tx-commit, tx_commit, "event=0xc9,umask=0x2");
+EVENT_ATTR_STR(tx-abort, tx_abort, "event=0xc9,umask=0x4");
+EVENT_ATTR_STR(tx-capacity, tx_capacity, "event=0x54,umask=0x2");
+EVENT_ATTR_STR(tx-conflict, tx_conflict, "event=0x54,umask=0x1");
+EVENT_ATTR_STR(el-start, el_start, "event=0xc8,umask=0x1");
+EVENT_ATTR_STR(el-commit, el_commit, "event=0xc8,umask=0x2");
+EVENT_ATTR_STR(el-abort, el_abort, "event=0xc8,umask=0x4");
+EVENT_ATTR_STR(el-capacity, el_capacity, "event=0x54,umask=0x2");
+EVENT_ATTR_STR(el-conflict, el_conflict, "event=0x54,umask=0x1");
+EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1");
+EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1");
+
+static struct attribute *hsw_events_attrs[] = {
+ EVENT_PTR(mem_ld_hsw),
+ EVENT_PTR(mem_st_hsw),
+ EVENT_PTR(td_slots_issued),
+ EVENT_PTR(td_slots_retired),
+ EVENT_PTR(td_fetch_bubbles),
+ EVENT_PTR(td_total_slots),
+ EVENT_PTR(td_total_slots_scale),
+ EVENT_PTR(td_recovery_bubbles),
+ EVENT_PTR(td_recovery_bubbles_scale),
+ NULL
+};
+
+static struct attribute *hsw_tsx_events_attrs[] = {
+ EVENT_PTR(tx_start),
+ EVENT_PTR(tx_commit),
+ EVENT_PTR(tx_abort),
+ EVENT_PTR(tx_capacity),
+ EVENT_PTR(tx_conflict),
+ EVENT_PTR(el_start),
+ EVENT_PTR(el_commit),
+ EVENT_PTR(el_abort),
+ EVENT_PTR(el_capacity),
+ EVENT_PTR(el_conflict),
+ EVENT_PTR(cycles_t),
+ EVENT_PTR(cycles_ct),
+ NULL
+};
+
+static __init struct attribute **get_hsw_events_attrs(void)
+{
+ return boot_cpu_has(X86_FEATURE_RTM) ?
+ merge_attr(hsw_events_attrs, hsw_tsx_events_attrs) :
+ hsw_events_attrs;
+}
+
+static ssize_t freeze_on_smi_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%lu\n", x86_pmu.attr_freeze_on_smi);
+}
+
+static DEFINE_MUTEX(freeze_on_smi_mutex);
+
+static ssize_t freeze_on_smi_store(struct device *cdev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long val;
+ ssize_t ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val > 1)
+ return -EINVAL;
+
+ mutex_lock(&freeze_on_smi_mutex);
+
+ if (x86_pmu.attr_freeze_on_smi == val)
+ goto done;
+
+ x86_pmu.attr_freeze_on_smi = val;
+
+ get_online_cpus();
+ on_each_cpu(flip_smm_bit, &val, 1);
+ put_online_cpus();
+done:
+ mutex_unlock(&freeze_on_smi_mutex);
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(freeze_on_smi);
+
+static ssize_t branches_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr);
+}
+
+static DEVICE_ATTR_RO(branches);
+
+static struct attribute *lbr_attrs[] = {
+ &dev_attr_branches.attr,
+ NULL
+};
+
+static char pmu_name_str[30];
+
+static ssize_t pmu_name_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%s\n", pmu_name_str);
+}
+
+static DEVICE_ATTR_RO(pmu_name);
+
+static struct attribute *intel_pmu_caps_attrs[] = {
+ &dev_attr_pmu_name.attr,
+ NULL
+};
+
+static DEVICE_BOOL_ATTR(allow_tsx_force_abort, 0644, allow_tsx_force_abort);
+
+static struct attribute *intel_pmu_attrs[] = {
+ &dev_attr_freeze_on_smi.attr,
+ NULL, /* &dev_attr_allow_tsx_force_abort.attr.attr */
+ NULL,
+};
+
+__init int intel_pmu_init(void)
+{
+ struct attribute **extra_attr = NULL;
+ struct attribute **to_free = NULL;
+ union cpuid10_edx edx;
+ union cpuid10_eax eax;
+ union cpuid10_ebx ebx;
+ struct event_constraint *c;
+ unsigned int unused;
+ struct extra_reg *er;
+ int version, i;
+ char *name;
+
+ if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+ switch (boot_cpu_data.x86) {
+ case 0x6:
+ return p6_pmu_init();
+ case 0xb:
+ return knc_pmu_init();
+ case 0xf:
+ return p4_pmu_init();
+ }
+ return -ENODEV;
+ }
+
+ /*
+ * Check whether the Architectural PerfMon supports
+ * Branch Misses Retired hw_event or not.
+ */
+ cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
+ if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
+ return -ENODEV;
+
+ version = eax.split.version_id;
+ if (version < 2)
+ x86_pmu = core_pmu;
+ else
+ x86_pmu = intel_pmu;
+
+ x86_pmu.version = version;
+ x86_pmu.num_counters = eax.split.num_counters;
+ x86_pmu.cntval_bits = eax.split.bit_width;
+ x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
+
+ x86_pmu.events_maskl = ebx.full;
+ x86_pmu.events_mask_len = eax.split.mask_length;
+
+ x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
+
+ /*
+ * Quirk: v2 perfmon does not report fixed-purpose events, so
+ * assume at least 3 events, when not running in a hypervisor:
+ */
+ if (version > 1) {
+ int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR);
+
+ x86_pmu.num_counters_fixed =
+ max((int)edx.split.num_counters_fixed, assume);
+ }
+
+ if (boot_cpu_has(X86_FEATURE_PDCM)) {
+ u64 capabilities;
+
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
+ x86_pmu.intel_cap.capabilities = capabilities;
+ }
+
+ intel_ds_init();
+
+ x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
+
+ /*
+ * Install the hw-cache-events table:
+ */
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_CORE_YONAH:
+ pr_cont("Core events, ");
+ name = "core";
+ break;
+
+ case INTEL_FAM6_CORE2_MEROM:
+ x86_add_quirk(intel_clovertown_quirk);
+ case INTEL_FAM6_CORE2_MEROM_L:
+ case INTEL_FAM6_CORE2_PENRYN:
+ case INTEL_FAM6_CORE2_DUNNINGTON:
+ memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ intel_pmu_lbr_init_core();
+
+ x86_pmu.event_constraints = intel_core2_event_constraints;
+ x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
+ pr_cont("Core2 events, ");
+ name = "core2";
+ break;
+
+ case INTEL_FAM6_NEHALEM:
+ case INTEL_FAM6_NEHALEM_EP:
+ case INTEL_FAM6_NEHALEM_EX:
+ memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_nhm();
+
+ x86_pmu.event_constraints = intel_nehalem_event_constraints;
+ x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
+ x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+ x86_pmu.extra_regs = intel_nehalem_extra_regs;
+ x86_pmu.limit_period = nhm_limit_period;
+
+ x86_pmu.cpu_events = nhm_events_attrs;
+
+ /* UOPS_ISSUED.STALLED_CYCLES */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+ /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+ X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
+
+ intel_pmu_pebs_data_source_nhm();
+ x86_add_quirk(intel_nehalem_quirk);
+ x86_pmu.pebs_no_tlb = 1;
+ extra_attr = nhm_format_attr;
+
+ pr_cont("Nehalem events, ");
+ name = "nehalem";
+ break;
+
+ case INTEL_FAM6_ATOM_BONNELL:
+ case INTEL_FAM6_ATOM_BONNELL_MID:
+ case INTEL_FAM6_ATOM_SALTWELL:
+ case INTEL_FAM6_ATOM_SALTWELL_MID:
+ case INTEL_FAM6_ATOM_SALTWELL_TABLET:
+ memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ intel_pmu_lbr_init_atom();
+
+ x86_pmu.event_constraints = intel_gen_event_constraints;
+ x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_core2;
+ pr_cont("Atom events, ");
+ name = "bonnell";
+ break;
+
+ case INTEL_FAM6_ATOM_SILVERMONT:
+ case INTEL_FAM6_ATOM_SILVERMONT_X:
+ case INTEL_FAM6_ATOM_SILVERMONT_MID:
+ case INTEL_FAM6_ATOM_AIRMONT:
+ case INTEL_FAM6_ATOM_AIRMONT_MID:
+ memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_slm();
+
+ x86_pmu.event_constraints = intel_slm_event_constraints;
+ x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_slm_extra_regs;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.cpu_events = slm_events_attrs;
+ extra_attr = slm_format_attr;
+ pr_cont("Silvermont events, ");
+ name = "silvermont";
+ break;
+
+ case INTEL_FAM6_ATOM_GOLDMONT:
+ case INTEL_FAM6_ATOM_GOLDMONT_X:
+ memcpy(hw_cache_event_ids, glm_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_skl();
+
+ x86_pmu.event_constraints = intel_slm_event_constraints;
+ x86_pmu.pebs_constraints = intel_glm_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_glm_extra_regs;
+ /*
+ * It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS
+ * for precise cycles.
+ * :pp is identical to :ppp
+ */
+ x86_pmu.pebs_aliases = NULL;
+ x86_pmu.pebs_prec_dist = true;
+ x86_pmu.lbr_pt_coexist = true;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.cpu_events = glm_events_attrs;
+ extra_attr = slm_format_attr;
+ pr_cont("Goldmont events, ");
+ name = "goldmont";
+ break;
+
+ case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+ memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_skl();
+
+ x86_pmu.event_constraints = intel_slm_event_constraints;
+ x86_pmu.extra_regs = intel_glm_extra_regs;
+ /*
+ * It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS
+ * for precise cycles.
+ */
+ x86_pmu.pebs_aliases = NULL;
+ x86_pmu.pebs_prec_dist = true;
+ x86_pmu.lbr_pt_coexist = true;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_PEBS_ALL;
+ x86_pmu.get_event_constraints = glp_get_event_constraints;
+ x86_pmu.cpu_events = glm_events_attrs;
+ /* Goldmont Plus has 4-wide pipeline */
+ event_attr_td_total_slots_scale_glm.event_str = "4";
+ extra_attr = slm_format_attr;
+ pr_cont("Goldmont plus events, ");
+ name = "goldmont_plus";
+ break;
+
+ case INTEL_FAM6_WESTMERE:
+ case INTEL_FAM6_WESTMERE_EP:
+ case INTEL_FAM6_WESTMERE_EX:
+ memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_nhm();
+
+ x86_pmu.event_constraints = intel_westmere_event_constraints;
+ x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+ x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_westmere_extra_regs;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+
+ x86_pmu.cpu_events = nhm_events_attrs;
+
+ /* UOPS_ISSUED.STALLED_CYCLES */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+ /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+ X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
+
+ intel_pmu_pebs_data_source_nhm();
+ extra_attr = nhm_format_attr;
+ pr_cont("Westmere events, ");
+ name = "westmere";
+ break;
+
+ case INTEL_FAM6_SANDYBRIDGE:
+ case INTEL_FAM6_SANDYBRIDGE_X:
+ x86_add_quirk(intel_sandybridge_quirk);
+ x86_add_quirk(intel_ht_bug);
+ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_snb();
+
+ x86_pmu.event_constraints = intel_snb_event_constraints;
+ x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ if (boot_cpu_data.x86_model == INTEL_FAM6_SANDYBRIDGE_X)
+ x86_pmu.extra_regs = intel_snbep_extra_regs;
+ else
+ x86_pmu.extra_regs = intel_snb_extra_regs;
+
+
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ x86_pmu.cpu_events = snb_events_attrs;
+
+ /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+ /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
+ X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
+
+ extra_attr = nhm_format_attr;
+
+ pr_cont("SandyBridge events, ");
+ name = "sandybridge";
+ break;
+
+ case INTEL_FAM6_IVYBRIDGE:
+ case INTEL_FAM6_IVYBRIDGE_X:
+ x86_add_quirk(intel_ht_bug);
+ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ /* dTLB-load-misses on IVB is different than SNB */
+ hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
+
+ memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_snb();
+
+ x86_pmu.event_constraints = intel_ivb_event_constraints;
+ x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+ x86_pmu.pebs_prec_dist = true;
+ if (boot_cpu_data.x86_model == INTEL_FAM6_IVYBRIDGE_X)
+ x86_pmu.extra_regs = intel_snbep_extra_regs;
+ else
+ x86_pmu.extra_regs = intel_snb_extra_regs;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ x86_pmu.cpu_events = snb_events_attrs;
+
+ /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+
+ extra_attr = nhm_format_attr;
+
+ pr_cont("IvyBridge events, ");
+ name = "ivybridge";
+ break;
+
+
+ case INTEL_FAM6_HASWELL_CORE:
+ case INTEL_FAM6_HASWELL_X:
+ case INTEL_FAM6_HASWELL_ULT:
+ case INTEL_FAM6_HASWELL_GT3E:
+ x86_add_quirk(intel_ht_bug);
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_hsw();
+
+ x86_pmu.event_constraints = intel_hsw_event_constraints;
+ x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_snbep_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+ x86_pmu.pebs_prec_dist = true;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.cpu_events = get_hsw_events_attrs();
+ x86_pmu.lbr_double_abort = true;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
+ pr_cont("Haswell events, ");
+ name = "haswell";
+ break;
+
+ case INTEL_FAM6_BROADWELL_CORE:
+ case INTEL_FAM6_BROADWELL_XEON_D:
+ case INTEL_FAM6_BROADWELL_GT3E:
+ case INTEL_FAM6_BROADWELL_X:
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+ /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
+ hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
+ BDW_L3_MISS|HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
+ HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
+ BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
+ BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+
+ intel_pmu_lbr_init_hsw();
+
+ x86_pmu.event_constraints = intel_bdw_event_constraints;
+ x86_pmu.pebs_constraints = intel_bdw_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_snbep_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
+ x86_pmu.pebs_prec_dist = true;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.cpu_events = get_hsw_events_attrs();
+ x86_pmu.limit_period = bdw_limit_period;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
+ pr_cont("Broadwell events, ");
+ name = "broadwell";
+ break;
+
+ case INTEL_FAM6_XEON_PHI_KNL:
+ case INTEL_FAM6_XEON_PHI_KNM:
+ memcpy(hw_cache_event_ids,
+ slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs,
+ knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ intel_pmu_lbr_init_knl();
+
+ x86_pmu.event_constraints = intel_slm_event_constraints;
+ x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_knl_extra_regs;
+
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+ extra_attr = slm_format_attr;
+ pr_cont("Knights Landing/Mill events, ");
+ name = "knights-landing";
+ break;
+
+ case INTEL_FAM6_SKYLAKE_MOBILE:
+ case INTEL_FAM6_SKYLAKE_DESKTOP:
+ case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_FAM6_KABYLAKE_MOBILE:
+ case INTEL_FAM6_KABYLAKE_DESKTOP:
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ intel_pmu_lbr_init_skl();
+
+ /* INT_MISC.RECOVERY_CYCLES has umask 1 in Skylake */
+ event_attr_td_recovery_bubbles.event_str_noht =
+ "event=0xd,umask=0x1,cmask=1";
+ event_attr_td_recovery_bubbles.event_str_ht =
+ "event=0xd,umask=0x1,cmask=1,any=1";
+
+ x86_pmu.event_constraints = intel_skl_event_constraints;
+ x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_skl_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
+ x86_pmu.pebs_prec_dist = true;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
+ extra_attr = merge_attr(extra_attr, skl_format_attr);
+ to_free = extra_attr;
+ x86_pmu.cpu_events = get_hsw_events_attrs();
+ intel_pmu_pebs_data_source_skl(
+ boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
+
+ if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
+ x86_pmu.flags |= PMU_FL_TFA;
+ x86_pmu.get_event_constraints = tfa_get_event_constraints;
+ x86_pmu.enable_all = intel_tfa_pmu_enable_all;
+ x86_pmu.commit_scheduling = intel_tfa_commit_scheduling;
+ intel_pmu_attrs[1] = &dev_attr_allow_tsx_force_abort.attr.attr;
+ }
+
+ pr_cont("Skylake events, ");
+ name = "skylake";
+ break;
+
+ default:
+ switch (x86_pmu.version) {
+ case 1:
+ x86_pmu.event_constraints = intel_v1_event_constraints;
+ pr_cont("generic architected perfmon v1, ");
+ name = "generic_arch_v1";
+ break;
+ default:
+ /*
+ * default constraints for v2 and up
+ */
+ x86_pmu.event_constraints = intel_gen_event_constraints;
+ pr_cont("generic architected perfmon, ");
+ name = "generic_arch_v2+";
+ break;
+ }
+ }
+
+ snprintf(pmu_name_str, sizeof pmu_name_str, "%s", name);
+
+ if (version >= 2 && extra_attr) {
+ x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr,
+ extra_attr);
+ WARN_ON(!x86_pmu.format_attrs);
+ }
+
+ if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
+ WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
+ x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
+ x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
+ }
+ x86_pmu.intel_ctrl = (1ULL << x86_pmu.num_counters) - 1;
+
+ if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
+ WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
+ x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
+ x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
+ }
+
+ x86_pmu.intel_ctrl |=
+ ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
+
+ if (x86_pmu.event_constraints) {
+ /*
+ * event on fixed counter2 (REF_CYCLES) only works on this
+ * counter, so do not extend mask to generic counters
+ */
+ for_each_event_constraint(c, x86_pmu.event_constraints) {
+ if (c->cmask == FIXED_EVENT_FLAGS
+ && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) {
+ c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+ }
+ c->idxmsk64 &=
+ ~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
+ c->weight = hweight64(c->idxmsk64);
+ }
+ }
+
+ /*
+ * Access LBR MSR may cause #GP under certain circumstances.
+ * E.g. KVM doesn't support LBR MSR
+ * Check all LBT MSR here.
+ * Disable LBR access if any LBR MSRs can not be accessed.
+ */
+ if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
+ x86_pmu.lbr_nr = 0;
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
+ check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
+ x86_pmu.lbr_nr = 0;
+ }
+
+ x86_pmu.caps_attrs = intel_pmu_caps_attrs;
+
+ if (x86_pmu.lbr_nr) {
+ x86_pmu.caps_attrs = merge_attr(x86_pmu.caps_attrs, lbr_attrs);
+ pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
+ }
+
+ /*
+ * Access extra MSR may cause #GP under certain circumstances.
+ * E.g. KVM doesn't support offcore event
+ * Check all extra_regs here.
+ */
+ if (x86_pmu.extra_regs) {
+ for (er = x86_pmu.extra_regs; er->msr; er++) {
+ er->extra_msr_access = check_msr(er->msr, 0x11UL);
+ /* Disable LBR select mapping */
+ if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
+ x86_pmu.lbr_sel_map = NULL;
+ }
+ }
+
+ /* Support full width counters using alternative MSR range */
+ if (x86_pmu.intel_cap.full_width_write) {
+ x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
+ x86_pmu.perfctr = MSR_IA32_PMC0;
+ pr_cont("full-width counters, ");
+ }
+
+ kfree(to_free);
+ return 0;
+}
+
+/*
+ * HT bug: phase 2 init
+ * Called once we have valid topology information to check
+ * whether or not HT is enabled
+ * If HT is off, then we disable the workaround
+ */
+static __init int fixup_ht_bug(void)
+{
+ int c;
+ /*
+ * problem not present on this CPU model, nothing to do
+ */
+ if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
+ return 0;
+
+ if (topology_max_smt_threads() > 1) {
+ pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
+ return 0;
+ }
+
+ cpus_read_lock();
+
+ hardlockup_detector_perf_stop();
+
+ x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
+
+ x86_pmu.start_scheduling = NULL;
+ x86_pmu.commit_scheduling = NULL;
+ x86_pmu.stop_scheduling = NULL;
+
+ hardlockup_detector_perf_restart();
+
+ for_each_online_cpu(c)
+ free_excl_cntrs(&per_cpu(cpu_hw_events, c));
+
+ cpus_read_unlock();
+ pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
+ return 0;
+}
+subsys_initcall(fixup_ht_bug)
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
new file mode 100644
index 000000000..3b3cd12c0
--- /dev/null
+++ b/arch/x86/events/intel/cstate.c
@@ -0,0 +1,707 @@
+/*
+ * Support cstate residency counters
+ *
+ * Copyright (C) 2015, Intel Corp.
+ * Author: Kan Liang (kan.liang@intel.com)
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ */
+
+/*
+ * This file export cstate related free running (read-only) counters
+ * for perf. These counters may be use simultaneously by other tools,
+ * such as turbostat. However, it still make sense to implement them
+ * in perf. Because we can conveniently collect them together with
+ * other events, and allow to use them from tools without special MSR
+ * access code.
+ *
+ * The events only support system-wide mode counting. There is no
+ * sampling support because it is not supported by the hardware.
+ *
+ * According to counters' scope and category, two PMUs are registered
+ * with the perf_event core subsystem.
+ * - 'cstate_core': The counter is available for each physical core.
+ * The counters include CORE_C*_RESIDENCY.
+ * - 'cstate_pkg': The counter is available for each physical package.
+ * The counters include PKG_C*_RESIDENCY.
+ *
+ * All of these counters are specified in the Intel® 64 and IA-32
+ * Architectures Software Developer.s Manual Vol3b.
+ *
+ * Model specific counters:
+ * MSR_CORE_C1_RES: CORE C1 Residency Counter
+ * perf code: 0x00
+ * Available model: SLM,AMT,GLM,CNL
+ * Scope: Core (each processor core has a MSR)
+ * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
+ * perf code: 0x01
+ * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,GLM,
+ CNL
+ * Scope: Core
+ * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter
+ * perf code: 0x02
+ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
+ * SKL,KNL,GLM,CNL
+ * Scope: Core
+ * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
+ * perf code: 0x03
+ * Available model: SNB,IVB,HSW,BDW,SKL,CNL
+ * Scope: Core
+ * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter.
+ * perf code: 0x00
+ * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL
+ * Scope: Package (physical package)
+ * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter.
+ * perf code: 0x01
+ * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL,
+ * GLM,CNL
+ * Scope: Package (physical package)
+ * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter.
+ * perf code: 0x02
+ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW
+ * SKL,KNL,GLM,CNL
+ * Scope: Package (physical package)
+ * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter.
+ * perf code: 0x03
+ * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL
+ * Scope: Package (physical package)
+ * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter.
+ * perf code: 0x04
+ * Available model: HSW ULT,KBL,CNL
+ * Scope: Package (physical package)
+ * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter.
+ * perf code: 0x05
+ * Available model: HSW ULT,KBL,CNL
+ * Scope: Package (physical package)
+ * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
+ * perf code: 0x06
+ * Available model: HSW ULT,KBL,GLM,CNL
+ * Scope: Package (physical package)
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <linux/nospec.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include "../perf_event.h"
+
+MODULE_LICENSE("GPL");
+
+#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \
+static ssize_t __cstate_##_var##_show(struct device *dev, \
+ struct device_attribute *attr, \
+ char *page) \
+{ \
+ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
+ return sprintf(page, _format "\n"); \
+} \
+static struct device_attribute format_attr_##_var = \
+ __ATTR(_name, 0444, __cstate_##_var##_show, NULL)
+
+static ssize_t cstate_get_attr_cpumask(struct device *dev,
+ struct device_attribute *attr,
+ char *buf);
+
+/* Model -> events mapping */
+struct cstate_model {
+ unsigned long core_events;
+ unsigned long pkg_events;
+ unsigned long quirks;
+};
+
+/* Quirk flags */
+#define SLM_PKG_C6_USE_C7_MSR (1UL << 0)
+#define KNL_CORE_C6_MSR (1UL << 1)
+
+struct perf_cstate_msr {
+ u64 msr;
+ struct perf_pmu_events_attr *attr;
+};
+
+
+/* cstate_core PMU */
+static struct pmu cstate_core_pmu;
+static bool has_cstate_core;
+
+enum perf_cstate_core_events {
+ PERF_CSTATE_CORE_C1_RES = 0,
+ PERF_CSTATE_CORE_C3_RES,
+ PERF_CSTATE_CORE_C6_RES,
+ PERF_CSTATE_CORE_C7_RES,
+
+ PERF_CSTATE_CORE_EVENT_MAX,
+};
+
+PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00");
+PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01");
+PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02");
+PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03");
+
+static struct perf_cstate_msr core_msr[] = {
+ [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1 },
+ [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3 },
+ [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6 },
+ [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7 },
+};
+
+static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = {
+ NULL,
+};
+
+static struct attribute_group core_events_attr_group = {
+ .name = "events",
+ .attrs = core_events_attrs,
+};
+
+DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63");
+static struct attribute *core_format_attrs[] = {
+ &format_attr_core_event.attr,
+ NULL,
+};
+
+static struct attribute_group core_format_attr_group = {
+ .name = "format",
+ .attrs = core_format_attrs,
+};
+
+static cpumask_t cstate_core_cpu_mask;
+static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL);
+
+static struct attribute *cstate_cpumask_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group cpumask_attr_group = {
+ .attrs = cstate_cpumask_attrs,
+};
+
+static const struct attribute_group *core_attr_groups[] = {
+ &core_events_attr_group,
+ &core_format_attr_group,
+ &cpumask_attr_group,
+ NULL,
+};
+
+/* cstate_pkg PMU */
+static struct pmu cstate_pkg_pmu;
+static bool has_cstate_pkg;
+
+enum perf_cstate_pkg_events {
+ PERF_CSTATE_PKG_C2_RES = 0,
+ PERF_CSTATE_PKG_C3_RES,
+ PERF_CSTATE_PKG_C6_RES,
+ PERF_CSTATE_PKG_C7_RES,
+ PERF_CSTATE_PKG_C8_RES,
+ PERF_CSTATE_PKG_C9_RES,
+ PERF_CSTATE_PKG_C10_RES,
+
+ PERF_CSTATE_PKG_EVENT_MAX,
+};
+
+PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00");
+PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01");
+PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02");
+PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_pkg_c7, "event=0x03");
+PMU_EVENT_ATTR_STRING(c8-residency, evattr_cstate_pkg_c8, "event=0x04");
+PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05");
+PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06");
+
+static struct perf_cstate_msr pkg_msr[] = {
+ [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2 },
+ [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3 },
+ [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6 },
+ [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7 },
+ [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8 },
+ [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9 },
+ [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10 },
+};
+
+static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = {
+ NULL,
+};
+
+static struct attribute_group pkg_events_attr_group = {
+ .name = "events",
+ .attrs = pkg_events_attrs,
+};
+
+DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63");
+static struct attribute *pkg_format_attrs[] = {
+ &format_attr_pkg_event.attr,
+ NULL,
+};
+static struct attribute_group pkg_format_attr_group = {
+ .name = "format",
+ .attrs = pkg_format_attrs,
+};
+
+static cpumask_t cstate_pkg_cpu_mask;
+
+static const struct attribute_group *pkg_attr_groups[] = {
+ &pkg_events_attr_group,
+ &pkg_format_attr_group,
+ &cpumask_attr_group,
+ NULL,
+};
+
+static ssize_t cstate_get_attr_cpumask(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ if (pmu == &cstate_core_pmu)
+ return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask);
+ else if (pmu == &cstate_pkg_pmu)
+ return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask);
+ else
+ return 0;
+}
+
+static int cstate_pmu_event_init(struct perf_event *event)
+{
+ u64 cfg = event->attr.config;
+ int cpu;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ if (event->pmu == &cstate_core_pmu) {
+ if (cfg >= PERF_CSTATE_CORE_EVENT_MAX)
+ return -EINVAL;
+ if (!core_msr[cfg].attr)
+ return -EINVAL;
+ event->hw.event_base = core_msr[cfg].msr;
+ cpu = cpumask_any_and(&cstate_core_cpu_mask,
+ topology_sibling_cpumask(event->cpu));
+ } else if (event->pmu == &cstate_pkg_pmu) {
+ if (cfg >= PERF_CSTATE_PKG_EVENT_MAX)
+ return -EINVAL;
+ cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_PKG_EVENT_MAX);
+ if (!pkg_msr[cfg].attr)
+ return -EINVAL;
+ event->hw.event_base = pkg_msr[cfg].msr;
+ cpu = cpumask_any_and(&cstate_pkg_cpu_mask,
+ topology_core_cpumask(event->cpu));
+ } else {
+ return -ENOENT;
+ }
+
+ if (cpu >= nr_cpu_ids)
+ return -ENODEV;
+
+ event->cpu = cpu;
+ event->hw.config = cfg;
+ event->hw.idx = -1;
+ return 0;
+}
+
+static inline u64 cstate_pmu_read_counter(struct perf_event *event)
+{
+ u64 val;
+
+ rdmsrl(event->hw.event_base, val);
+ return val;
+}
+
+static void cstate_pmu_event_update(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev_raw_count, new_raw_count;
+
+again:
+ prev_raw_count = local64_read(&hwc->prev_count);
+ new_raw_count = cstate_pmu_read_counter(event);
+
+ if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ new_raw_count) != prev_raw_count)
+ goto again;
+
+ local64_add(new_raw_count - prev_raw_count, &event->count);
+}
+
+static void cstate_pmu_event_start(struct perf_event *event, int mode)
+{
+ local64_set(&event->hw.prev_count, cstate_pmu_read_counter(event));
+}
+
+static void cstate_pmu_event_stop(struct perf_event *event, int mode)
+{
+ cstate_pmu_event_update(event);
+}
+
+static void cstate_pmu_event_del(struct perf_event *event, int mode)
+{
+ cstate_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int cstate_pmu_event_add(struct perf_event *event, int mode)
+{
+ if (mode & PERF_EF_START)
+ cstate_pmu_event_start(event, mode);
+
+ return 0;
+}
+
+/*
+ * Check if exiting cpu is the designated reader. If so migrate the
+ * events when there is a valid target available
+ */
+static int cstate_cpu_exit(unsigned int cpu)
+{
+ unsigned int target;
+
+ if (has_cstate_core &&
+ cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask)) {
+
+ target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
+ /* Migrate events if there is a valid target */
+ if (target < nr_cpu_ids) {
+ cpumask_set_cpu(target, &cstate_core_cpu_mask);
+ perf_pmu_migrate_context(&cstate_core_pmu, cpu, target);
+ }
+ }
+
+ if (has_cstate_pkg &&
+ cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask)) {
+
+ target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
+ /* Migrate events if there is a valid target */
+ if (target < nr_cpu_ids) {
+ cpumask_set_cpu(target, &cstate_pkg_cpu_mask);
+ perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target);
+ }
+ }
+ return 0;
+}
+
+static int cstate_cpu_init(unsigned int cpu)
+{
+ unsigned int target;
+
+ /*
+ * If this is the first online thread of that core, set it in
+ * the core cpu mask as the designated reader.
+ */
+ target = cpumask_any_and(&cstate_core_cpu_mask,
+ topology_sibling_cpumask(cpu));
+
+ if (has_cstate_core && target >= nr_cpu_ids)
+ cpumask_set_cpu(cpu, &cstate_core_cpu_mask);
+
+ /*
+ * If this is the first online thread of that package, set it
+ * in the package cpu mask as the designated reader.
+ */
+ target = cpumask_any_and(&cstate_pkg_cpu_mask,
+ topology_core_cpumask(cpu));
+ if (has_cstate_pkg && target >= nr_cpu_ids)
+ cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask);
+
+ return 0;
+}
+
+static struct pmu cstate_core_pmu = {
+ .attr_groups = core_attr_groups,
+ .name = "cstate_core",
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = cstate_pmu_event_init,
+ .add = cstate_pmu_event_add,
+ .del = cstate_pmu_event_del,
+ .start = cstate_pmu_event_start,
+ .stop = cstate_pmu_event_stop,
+ .read = cstate_pmu_event_update,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+ .module = THIS_MODULE,
+};
+
+static struct pmu cstate_pkg_pmu = {
+ .attr_groups = pkg_attr_groups,
+ .name = "cstate_pkg",
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = cstate_pmu_event_init,
+ .add = cstate_pmu_event_add,
+ .del = cstate_pmu_event_del,
+ .start = cstate_pmu_event_start,
+ .stop = cstate_pmu_event_stop,
+ .read = cstate_pmu_event_update,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+ .module = THIS_MODULE,
+};
+
+static const struct cstate_model nhm_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C3_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES) |
+ BIT(PERF_CSTATE_PKG_C7_RES),
+};
+
+static const struct cstate_model snb_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C3_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES) |
+ BIT(PERF_CSTATE_CORE_C7_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES) |
+ BIT(PERF_CSTATE_PKG_C7_RES),
+};
+
+static const struct cstate_model hswult_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C3_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES) |
+ BIT(PERF_CSTATE_CORE_C7_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES) |
+ BIT(PERF_CSTATE_PKG_C7_RES) |
+ BIT(PERF_CSTATE_PKG_C8_RES) |
+ BIT(PERF_CSTATE_PKG_C9_RES) |
+ BIT(PERF_CSTATE_PKG_C10_RES),
+};
+
+static const struct cstate_model cnl_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C3_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES) |
+ BIT(PERF_CSTATE_CORE_C7_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES) |
+ BIT(PERF_CSTATE_PKG_C7_RES) |
+ BIT(PERF_CSTATE_PKG_C8_RES) |
+ BIT(PERF_CSTATE_PKG_C9_RES) |
+ BIT(PERF_CSTATE_PKG_C10_RES),
+};
+
+static const struct cstate_model slm_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C6_RES),
+ .quirks = SLM_PKG_C6_USE_C7_MSR,
+};
+
+
+static const struct cstate_model knl_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C6_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES),
+ .quirks = KNL_CORE_C6_MSR,
+};
+
+
+static const struct cstate_model glm_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C3_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES) |
+ BIT(PERF_CSTATE_PKG_C10_RES),
+};
+
+
+#define X86_CSTATES_MODEL(model, states) \
+ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long) &(states) }
+
+static const struct x86_cpu_id intel_cstates_match[] __initconst = {
+ X86_CSTATES_MODEL(INTEL_FAM6_NEHALEM, nhm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_NEHALEM_EP, nhm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_NEHALEM_EX, nhm_cstates),
+
+ X86_CSTATES_MODEL(INTEL_FAM6_WESTMERE, nhm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_WESTMERE_EP, nhm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_WESTMERE_EX, nhm_cstates),
+
+ X86_CSTATES_MODEL(INTEL_FAM6_SANDYBRIDGE, snb_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_SANDYBRIDGE_X, snb_cstates),
+
+ X86_CSTATES_MODEL(INTEL_FAM6_IVYBRIDGE, snb_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_IVYBRIDGE_X, snb_cstates),
+
+ X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_CORE, snb_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_X, snb_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_GT3E, snb_cstates),
+
+ X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_ULT, hswult_cstates),
+
+ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT, slm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT_X, slm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT, slm_cstates),
+
+ X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_CORE, snb_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_XEON_D, snb_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_GT3E, snb_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_X, snb_cstates),
+
+ X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_MOBILE, snb_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_X, snb_cstates),
+
+ X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE, hswult_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, hswult_cstates),
+
+ X86_CSTATES_MODEL(INTEL_FAM6_CANNONLAKE_MOBILE, cnl_cstates),
+
+ X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNL, knl_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates),
+
+ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_X, glm_cstates),
+
+ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates),
+
+ X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_MOBILE, snb_cstates),
+ { },
+};
+MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
+
+/*
+ * Probe the cstate events and insert the available one into sysfs attrs
+ * Return false if there are no available events.
+ */
+static bool __init cstate_probe_msr(const unsigned long evmsk, int max,
+ struct perf_cstate_msr *msr,
+ struct attribute **attrs)
+{
+ bool found = false;
+ unsigned int bit;
+ u64 val;
+
+ for (bit = 0; bit < max; bit++) {
+ if (test_bit(bit, &evmsk) && !rdmsrl_safe(msr[bit].msr, &val)) {
+ *attrs++ = &msr[bit].attr->attr.attr;
+ found = true;
+ } else {
+ msr[bit].attr = NULL;
+ }
+ }
+ *attrs = NULL;
+
+ return found;
+}
+
+static int __init cstate_probe(const struct cstate_model *cm)
+{
+ /* SLM has different MSR for PKG C6 */
+ if (cm->quirks & SLM_PKG_C6_USE_C7_MSR)
+ pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY;
+
+ /* KNL has different MSR for CORE C6 */
+ if (cm->quirks & KNL_CORE_C6_MSR)
+ pkg_msr[PERF_CSTATE_CORE_C6_RES].msr = MSR_KNL_CORE_C6_RESIDENCY;
+
+
+ has_cstate_core = cstate_probe_msr(cm->core_events,
+ PERF_CSTATE_CORE_EVENT_MAX,
+ core_msr, core_events_attrs);
+
+ has_cstate_pkg = cstate_probe_msr(cm->pkg_events,
+ PERF_CSTATE_PKG_EVENT_MAX,
+ pkg_msr, pkg_events_attrs);
+
+ return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV;
+}
+
+static inline void cstate_cleanup(void)
+{
+ cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_ONLINE);
+ cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_STARTING);
+
+ if (has_cstate_core)
+ perf_pmu_unregister(&cstate_core_pmu);
+
+ if (has_cstate_pkg)
+ perf_pmu_unregister(&cstate_pkg_pmu);
+}
+
+static int __init cstate_init(void)
+{
+ int err;
+
+ cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_STARTING,
+ "perf/x86/cstate:starting", cstate_cpu_init, NULL);
+ cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_ONLINE,
+ "perf/x86/cstate:online", NULL, cstate_cpu_exit);
+
+ if (has_cstate_core) {
+ err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1);
+ if (err) {
+ has_cstate_core = false;
+ pr_info("Failed to register cstate core pmu\n");
+ cstate_cleanup();
+ return err;
+ }
+ }
+
+ if (has_cstate_pkg) {
+ err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1);
+ if (err) {
+ has_cstate_pkg = false;
+ pr_info("Failed to register cstate pkg pmu\n");
+ cstate_cleanup();
+ return err;
+ }
+ }
+ return 0;
+}
+
+static int __init cstate_pmu_init(void)
+{
+ const struct x86_cpu_id *id;
+ int err;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return -ENODEV;
+
+ id = x86_match_cpu(intel_cstates_match);
+ if (!id)
+ return -ENODEV;
+
+ err = cstate_probe((const struct cstate_model *) id->driver_data);
+ if (err)
+ return err;
+
+ return cstate_init();
+}
+module_init(cstate_pmu_init);
+
+static void __exit cstate_pmu_exit(void)
+{
+ cstate_cleanup();
+}
+module_exit(cstate_pmu_exit);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
new file mode 100644
index 000000000..b3279feff
--- /dev/null
+++ b/arch/x86/events/intel/ds.c
@@ -0,0 +1,1687 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/perf_event.h>
+#include <asm/tlbflush.h>
+#include <asm/insn.h>
+
+#include "../perf_event.h"
+
+/* Waste a full page so it can be mapped into the cpu_entry_area */
+DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
+
+/* The size of a BTS record in bytes: */
+#define BTS_RECORD_SIZE 24
+
+#define PEBS_FIXUP_SIZE PAGE_SIZE
+
+/*
+ * pebs_record_32 for p4 and core not supported
+
+struct pebs_record_32 {
+ u32 flags, ip;
+ u32 ax, bc, cx, dx;
+ u32 si, di, bp, sp;
+};
+
+ */
+
+union intel_x86_pebs_dse {
+ u64 val;
+ struct {
+ unsigned int ld_dse:4;
+ unsigned int ld_stlb_miss:1;
+ unsigned int ld_locked:1;
+ unsigned int ld_reserved:26;
+ };
+ struct {
+ unsigned int st_l1d_hit:1;
+ unsigned int st_reserved1:3;
+ unsigned int st_stlb_miss:1;
+ unsigned int st_locked:1;
+ unsigned int st_reserved2:26;
+ };
+};
+
+
+/*
+ * Map PEBS Load Latency Data Source encodings to generic
+ * memory data source information
+ */
+#define P(a, b) PERF_MEM_S(a, b)
+#define OP_LH (P(OP, LOAD) | P(LVL, HIT))
+#define LEVEL(x) P(LVLNUM, x)
+#define REM P(REMOTE, REMOTE)
+#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
+
+/* Version for Sandy Bridge and later */
+static u64 pebs_data_source[] = {
+ P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
+ OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 local */
+ OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
+ OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x03: L2 hit */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, NONE), /* 0x04: L3 hit */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */
+ OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */
+ OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */
+ OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
+ OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */
+ OP_LH | P(LVL, REM_RAM1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */
+ OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | SNOOP_NONE_MISS, /* 0x0c: L3 miss, excl */
+ OP_LH | P(LVL, REM_RAM1) | LEVEL(RAM) | REM | SNOOP_NONE_MISS, /* 0x0d: L3 miss, excl */
+ OP_LH | P(LVL, IO) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0e: I/O */
+ OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */
+};
+
+/* Patch up minor differences in the bits */
+void __init intel_pmu_pebs_data_source_nhm(void)
+{
+ pebs_data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT);
+ pebs_data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+ pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+}
+
+void __init intel_pmu_pebs_data_source_skl(bool pmem)
+{
+ u64 pmem_or_l4 = pmem ? LEVEL(PMEM) : LEVEL(L4);
+
+ pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT);
+ pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT);
+ pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE);
+ pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD);
+ pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM);
+}
+
+static u64 precise_store_data(u64 status)
+{
+ union intel_x86_pebs_dse dse;
+ u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2);
+
+ dse.val = status;
+
+ /*
+ * bit 4: TLB access
+ * 1 = stored missed 2nd level TLB
+ *
+ * so it either hit the walker or the OS
+ * otherwise hit 2nd level TLB
+ */
+ if (dse.st_stlb_miss)
+ val |= P(TLB, MISS);
+ else
+ val |= P(TLB, HIT);
+
+ /*
+ * bit 0: hit L1 data cache
+ * if not set, then all we know is that
+ * it missed L1D
+ */
+ if (dse.st_l1d_hit)
+ val |= P(LVL, HIT);
+ else
+ val |= P(LVL, MISS);
+
+ /*
+ * bit 5: Locked prefix
+ */
+ if (dse.st_locked)
+ val |= P(LOCK, LOCKED);
+
+ return val;
+}
+
+static u64 precise_datala_hsw(struct perf_event *event, u64 status)
+{
+ union perf_mem_data_src dse;
+
+ dse.val = PERF_MEM_NA;
+
+ if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
+ dse.mem_op = PERF_MEM_OP_STORE;
+ else if (event->hw.flags & PERF_X86_EVENT_PEBS_LD_HSW)
+ dse.mem_op = PERF_MEM_OP_LOAD;
+
+ /*
+ * L1 info only valid for following events:
+ *
+ * MEM_UOPS_RETIRED.STLB_MISS_STORES
+ * MEM_UOPS_RETIRED.LOCK_STORES
+ * MEM_UOPS_RETIRED.SPLIT_STORES
+ * MEM_UOPS_RETIRED.ALL_STORES
+ */
+ if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) {
+ if (status & 1)
+ dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
+ else
+ dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
+ }
+ return dse.val;
+}
+
+static u64 load_latency_data(u64 status)
+{
+ union intel_x86_pebs_dse dse;
+ u64 val;
+
+ dse.val = status;
+
+ /*
+ * use the mapping table for bit 0-3
+ */
+ val = pebs_data_source[dse.ld_dse];
+
+ /*
+ * Nehalem models do not support TLB, Lock infos
+ */
+ if (x86_pmu.pebs_no_tlb) {
+ val |= P(TLB, NA) | P(LOCK, NA);
+ return val;
+ }
+ /*
+ * bit 4: TLB access
+ * 0 = did not miss 2nd level TLB
+ * 1 = missed 2nd level TLB
+ */
+ if (dse.ld_stlb_miss)
+ val |= P(TLB, MISS) | P(TLB, L2);
+ else
+ val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
+
+ /*
+ * bit 5: locked prefix
+ */
+ if (dse.ld_locked)
+ val |= P(LOCK, LOCKED);
+
+ return val;
+}
+
+struct pebs_record_core {
+ u64 flags, ip;
+ u64 ax, bx, cx, dx;
+ u64 si, di, bp, sp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+};
+
+struct pebs_record_nhm {
+ u64 flags, ip;
+ u64 ax, bx, cx, dx;
+ u64 si, di, bp, sp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+ u64 status, dla, dse, lat;
+};
+
+/*
+ * Same as pebs_record_nhm, with two additional fields.
+ */
+struct pebs_record_hsw {
+ u64 flags, ip;
+ u64 ax, bx, cx, dx;
+ u64 si, di, bp, sp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+ u64 status, dla, dse, lat;
+ u64 real_ip, tsx_tuning;
+};
+
+union hsw_tsx_tuning {
+ struct {
+ u32 cycles_last_block : 32,
+ hle_abort : 1,
+ rtm_abort : 1,
+ instruction_abort : 1,
+ non_instruction_abort : 1,
+ retry : 1,
+ data_conflict : 1,
+ capacity_writes : 1,
+ capacity_reads : 1;
+ };
+ u64 value;
+};
+
+#define PEBS_HSW_TSX_FLAGS 0xff00000000ULL
+
+/* Same as HSW, plus TSC */
+
+struct pebs_record_skl {
+ u64 flags, ip;
+ u64 ax, bx, cx, dx;
+ u64 si, di, bp, sp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+ u64 status, dla, dse, lat;
+ u64 real_ip, tsx_tuning;
+ u64 tsc;
+};
+
+void init_debug_store_on_cpu(int cpu)
+{
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+ if (!ds)
+ return;
+
+ wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
+ (u32)((u64)(unsigned long)ds),
+ (u32)((u64)(unsigned long)ds >> 32));
+}
+
+void fini_debug_store_on_cpu(int cpu)
+{
+ if (!per_cpu(cpu_hw_events, cpu).ds)
+ return;
+
+ wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
+}
+
+static DEFINE_PER_CPU(void *, insn_buffer);
+
+static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
+{
+ unsigned long start = (unsigned long)cea;
+ phys_addr_t pa;
+ size_t msz = 0;
+
+ pa = virt_to_phys(addr);
+
+ preempt_disable();
+ for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
+ cea_set_pte(cea, pa, prot);
+
+ /*
+ * This is a cross-CPU update of the cpu_entry_area, we must shoot down
+ * all TLB entries for it.
+ */
+ flush_tlb_kernel_range(start, start + size);
+ preempt_enable();
+}
+
+static void ds_clear_cea(void *cea, size_t size)
+{
+ unsigned long start = (unsigned long)cea;
+ size_t msz = 0;
+
+ preempt_disable();
+ for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
+ cea_set_pte(cea, 0, PAGE_NONE);
+
+ flush_tlb_kernel_range(start, start + size);
+ preempt_enable();
+}
+
+static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
+{
+ unsigned int order = get_order(size);
+ int node = cpu_to_node(cpu);
+ struct page *page;
+
+ page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
+ return page ? page_address(page) : NULL;
+}
+
+static void dsfree_pages(const void *buffer, size_t size)
+{
+ if (buffer)
+ free_pages((unsigned long)buffer, get_order(size));
+}
+
+static int alloc_pebs_buffer(int cpu)
+{
+ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+ struct debug_store *ds = hwev->ds;
+ size_t bsiz = x86_pmu.pebs_buffer_size;
+ int max, node = cpu_to_node(cpu);
+ void *buffer, *ibuffer, *cea;
+
+ if (!x86_pmu.pebs)
+ return 0;
+
+ buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
+ if (unlikely(!buffer))
+ return -ENOMEM;
+
+ /*
+ * HSW+ already provides us the eventing ip; no need to allocate this
+ * buffer then.
+ */
+ if (x86_pmu.intel_cap.pebs_format < 2) {
+ ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
+ if (!ibuffer) {
+ dsfree_pages(buffer, bsiz);
+ return -ENOMEM;
+ }
+ per_cpu(insn_buffer, cpu) = ibuffer;
+ }
+ hwev->ds_pebs_vaddr = buffer;
+ /* Update the cpu entry area mapping */
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+ ds->pebs_buffer_base = (unsigned long) cea;
+ ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
+ ds->pebs_index = ds->pebs_buffer_base;
+ max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
+ ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
+ return 0;
+}
+
+static void release_pebs_buffer(int cpu)
+{
+ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+ void *cea;
+
+ if (!x86_pmu.pebs)
+ return;
+
+ kfree(per_cpu(insn_buffer, cpu));
+ per_cpu(insn_buffer, cpu) = NULL;
+
+ /* Clear the fixmap */
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+ ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
+ dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
+ hwev->ds_pebs_vaddr = NULL;
+}
+
+static int alloc_bts_buffer(int cpu)
+{
+ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+ struct debug_store *ds = hwev->ds;
+ void *buffer, *cea;
+ int max;
+
+ if (!x86_pmu.bts)
+ return 0;
+
+ buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
+ if (unlikely(!buffer)) {
+ WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
+ return -ENOMEM;
+ }
+ hwev->ds_bts_vaddr = buffer;
+ /* Update the fixmap */
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+ ds->bts_buffer_base = (unsigned long) cea;
+ ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
+ ds->bts_index = ds->bts_buffer_base;
+ max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
+ ds->bts_absolute_maximum = ds->bts_buffer_base +
+ max * BTS_RECORD_SIZE;
+ ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
+ (max / 16) * BTS_RECORD_SIZE;
+ return 0;
+}
+
+static void release_bts_buffer(int cpu)
+{
+ struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+ void *cea;
+
+ if (!x86_pmu.bts)
+ return;
+
+ /* Clear the fixmap */
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+ ds_clear_cea(cea, BTS_BUFFER_SIZE);
+ dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
+ hwev->ds_bts_vaddr = NULL;
+}
+
+static int alloc_ds_buffer(int cpu)
+{
+ struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
+
+ memset(ds, 0, sizeof(*ds));
+ per_cpu(cpu_hw_events, cpu).ds = ds;
+ return 0;
+}
+
+static void release_ds_buffer(int cpu)
+{
+ per_cpu(cpu_hw_events, cpu).ds = NULL;
+}
+
+void release_ds_buffers(void)
+{
+ int cpu;
+
+ if (!x86_pmu.bts && !x86_pmu.pebs)
+ return;
+
+ for_each_possible_cpu(cpu)
+ release_ds_buffer(cpu);
+
+ for_each_possible_cpu(cpu) {
+ /*
+ * Again, ignore errors from offline CPUs, they will no longer
+ * observe cpu_hw_events.ds and not program the DS_AREA when
+ * they come up.
+ */
+ fini_debug_store_on_cpu(cpu);
+ }
+
+ for_each_possible_cpu(cpu) {
+ release_pebs_buffer(cpu);
+ release_bts_buffer(cpu);
+ }
+}
+
+void reserve_ds_buffers(void)
+{
+ int bts_err = 0, pebs_err = 0;
+ int cpu;
+
+ x86_pmu.bts_active = 0;
+ x86_pmu.pebs_active = 0;
+
+ if (!x86_pmu.bts && !x86_pmu.pebs)
+ return;
+
+ if (!x86_pmu.bts)
+ bts_err = 1;
+
+ if (!x86_pmu.pebs)
+ pebs_err = 1;
+
+ for_each_possible_cpu(cpu) {
+ if (alloc_ds_buffer(cpu)) {
+ bts_err = 1;
+ pebs_err = 1;
+ }
+
+ if (!bts_err && alloc_bts_buffer(cpu))
+ bts_err = 1;
+
+ if (!pebs_err && alloc_pebs_buffer(cpu))
+ pebs_err = 1;
+
+ if (bts_err && pebs_err)
+ break;
+ }
+
+ if (bts_err) {
+ for_each_possible_cpu(cpu)
+ release_bts_buffer(cpu);
+ }
+
+ if (pebs_err) {
+ for_each_possible_cpu(cpu)
+ release_pebs_buffer(cpu);
+ }
+
+ if (bts_err && pebs_err) {
+ for_each_possible_cpu(cpu)
+ release_ds_buffer(cpu);
+ } else {
+ if (x86_pmu.bts && !bts_err)
+ x86_pmu.bts_active = 1;
+
+ if (x86_pmu.pebs && !pebs_err)
+ x86_pmu.pebs_active = 1;
+
+ for_each_possible_cpu(cpu) {
+ /*
+ * Ignores wrmsr_on_cpu() errors for offline CPUs they
+ * will get this call through intel_pmu_cpu_starting().
+ */
+ init_debug_store_on_cpu(cpu);
+ }
+ }
+}
+
+/*
+ * BTS
+ */
+
+struct event_constraint bts_constraint =
+ EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0);
+
+void intel_pmu_enable_bts(u64 config)
+{
+ unsigned long debugctlmsr;
+
+ debugctlmsr = get_debugctlmsr();
+
+ debugctlmsr |= DEBUGCTLMSR_TR;
+ debugctlmsr |= DEBUGCTLMSR_BTS;
+ if (config & ARCH_PERFMON_EVENTSEL_INT)
+ debugctlmsr |= DEBUGCTLMSR_BTINT;
+
+ if (!(config & ARCH_PERFMON_EVENTSEL_OS))
+ debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
+
+ if (!(config & ARCH_PERFMON_EVENTSEL_USR))
+ debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
+
+ update_debugctlmsr(debugctlmsr);
+}
+
+void intel_pmu_disable_bts(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ unsigned long debugctlmsr;
+
+ if (!cpuc->ds)
+ return;
+
+ debugctlmsr = get_debugctlmsr();
+
+ debugctlmsr &=
+ ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
+ DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
+
+ update_debugctlmsr(debugctlmsr);
+}
+
+int intel_pmu_drain_bts_buffer(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct debug_store *ds = cpuc->ds;
+ struct bts_record {
+ u64 from;
+ u64 to;
+ u64 flags;
+ };
+ struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
+ struct bts_record *at, *base, *top;
+ struct perf_output_handle handle;
+ struct perf_event_header header;
+ struct perf_sample_data data;
+ unsigned long skip = 0;
+ struct pt_regs regs;
+
+ if (!event)
+ return 0;
+
+ if (!x86_pmu.bts_active)
+ return 0;
+
+ base = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+ top = (struct bts_record *)(unsigned long)ds->bts_index;
+
+ if (top <= base)
+ return 0;
+
+ memset(&regs, 0, sizeof(regs));
+
+ ds->bts_index = ds->bts_buffer_base;
+
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+
+ /*
+ * BTS leaks kernel addresses in branches across the cpl boundary,
+ * such as traps or system calls, so unless the user is asking for
+ * kernel tracing (and right now it's not possible), we'd need to
+ * filter them out. But first we need to count how many of those we
+ * have in the current batch. This is an extra O(n) pass, however,
+ * it's much faster than the other one especially considering that
+ * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see the
+ * alloc_bts_buffer()).
+ */
+ for (at = base; at < top; at++) {
+ /*
+ * Note that right now *this* BTS code only works if
+ * attr::exclude_kernel is set, but let's keep this extra
+ * check here in case that changes.
+ */
+ if (event->attr.exclude_kernel &&
+ (kernel_ip(at->from) || kernel_ip(at->to)))
+ skip++;
+ }
+
+ /*
+ * Prepare a generic sample, i.e. fill in the invariant fields.
+ * We will overwrite the from and to address before we output
+ * the sample.
+ */
+ rcu_read_lock();
+ perf_prepare_sample(&header, &data, event, &regs);
+
+ if (perf_output_begin(&handle, event, header.size *
+ (top - base - skip)))
+ goto unlock;
+
+ for (at = base; at < top; at++) {
+ /* Filter out any records that contain kernel addresses. */
+ if (event->attr.exclude_kernel &&
+ (kernel_ip(at->from) || kernel_ip(at->to)))
+ continue;
+
+ data.ip = at->from;
+ data.addr = at->to;
+
+ perf_output_sample(&handle, &header, &data, event);
+ }
+
+ perf_output_end(&handle);
+
+ /* There's new data available. */
+ event->hw.interrupts++;
+ event->pending_kill = POLL_IN;
+unlock:
+ rcu_read_unlock();
+ return 1;
+}
+
+static inline void intel_pmu_drain_pebs_buffer(void)
+{
+ struct pt_regs regs;
+
+ x86_pmu.drain_pebs(&regs);
+}
+
+/*
+ * PEBS
+ */
+struct event_constraint intel_core2_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x01),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_atom_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x01),
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_slm_pebs_event_constraints[] = {
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x1),
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_glm_pebs_event_constraints[] = {
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_nehalem_pebs_event_constraints[] = {
+ INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */
+ INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x0f),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_westmere_pebs_event_constraints[] = {
+ INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x0f),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_snb_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+ INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
+ INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf),
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_ivb_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+ INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
+ INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2),
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_hsw_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+ INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2),
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_bdw_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+ INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2),
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+ EVENT_CONSTRAINT_END
+};
+
+
+struct event_constraint intel_skl_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */
+ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108001c0, 0x2),
+ /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x108000c0, 0x0f),
+ INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_L3_MISS_RETIRED.* */
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint *intel_pebs_constraints(struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ if (!event->attr.precise_ip)
+ return NULL;
+
+ if (x86_pmu.pebs_constraints) {
+ for_each_event_constraint(c, x86_pmu.pebs_constraints) {
+ if ((event->hw.config & c->cmask) == c->code) {
+ event->hw.flags |= c->flags;
+ return c;
+ }
+ }
+ }
+
+ /*
+ * Extended PEBS support
+ * Makes the PEBS code search the normal constraints.
+ */
+ if (x86_pmu.flags & PMU_FL_PEBS_ALL)
+ return NULL;
+
+ return &emptyconstraint;
+}
+
+/*
+ * We need the sched_task callback even for per-cpu events when we use
+ * the large interrupt threshold, such that we can provide PID and TID
+ * to PEBS samples.
+ */
+static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc)
+{
+ return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
+}
+
+void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (!sched_in && pebs_needs_sched_cb(cpuc))
+ intel_pmu_drain_pebs_buffer();
+}
+
+static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
+{
+ struct debug_store *ds = cpuc->ds;
+ u64 threshold;
+ int reserved;
+
+ if (x86_pmu.flags & PMU_FL_PEBS_ALL)
+ reserved = x86_pmu.max_pebs_events + x86_pmu.num_counters_fixed;
+ else
+ reserved = x86_pmu.max_pebs_events;
+
+ if (cpuc->n_pebs == cpuc->n_large_pebs) {
+ threshold = ds->pebs_absolute_maximum -
+ reserved * x86_pmu.pebs_record_size;
+ } else {
+ threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+ }
+
+ ds->pebs_interrupt_threshold = threshold;
+}
+
+static void
+pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu)
+{
+ /*
+ * Make sure we get updated with the first PEBS
+ * event. It will trigger also during removal, but
+ * that does not hurt:
+ */
+ bool update = cpuc->n_pebs == 1;
+
+ if (needed_cb != pebs_needs_sched_cb(cpuc)) {
+ if (!needed_cb)
+ perf_sched_cb_inc(pmu);
+ else
+ perf_sched_cb_dec(pmu);
+
+ update = true;
+ }
+
+ if (update)
+ pebs_update_threshold(cpuc);
+}
+
+void intel_pmu_pebs_add(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ bool needed_cb = pebs_needs_sched_cb(cpuc);
+
+ cpuc->n_pebs++;
+ if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS)
+ cpuc->n_large_pebs++;
+
+ pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
+}
+
+void intel_pmu_pebs_enable(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ struct debug_store *ds = cpuc->ds;
+
+ hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
+
+ cpuc->pebs_enabled |= 1ULL << hwc->idx;
+
+ if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
+ cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
+ else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
+ cpuc->pebs_enabled |= 1ULL << 63;
+
+ /*
+ * Use auto-reload if possible to save a MSR write in the PMI.
+ * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD.
+ */
+ if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
+ unsigned int idx = hwc->idx;
+
+ if (idx >= INTEL_PMC_IDX_FIXED)
+ idx = MAX_PEBS_EVENTS + (idx - INTEL_PMC_IDX_FIXED);
+ ds->pebs_event_reset[idx] =
+ (u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
+ } else {
+ ds->pebs_event_reset[hwc->idx] = 0;
+ }
+}
+
+void intel_pmu_pebs_del(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ bool needed_cb = pebs_needs_sched_cb(cpuc);
+
+ cpuc->n_pebs--;
+ if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS)
+ cpuc->n_large_pebs--;
+
+ pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
+}
+
+void intel_pmu_pebs_disable(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (cpuc->n_pebs == cpuc->n_large_pebs)
+ intel_pmu_drain_pebs_buffer();
+
+ cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
+
+ if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
+ cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
+ else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
+ cpuc->pebs_enabled &= ~(1ULL << 63);
+
+ if (cpuc->enabled)
+ wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
+
+ hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
+}
+
+void intel_pmu_pebs_enable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (cpuc->pebs_enabled)
+ wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
+}
+
+void intel_pmu_pebs_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (cpuc->pebs_enabled)
+ wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+}
+
+static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ unsigned long from = cpuc->lbr_entries[0].from;
+ unsigned long old_to, to = cpuc->lbr_entries[0].to;
+ unsigned long ip = regs->ip;
+ int is_64bit = 0;
+ void *kaddr;
+ int size;
+
+ /*
+ * We don't need to fixup if the PEBS assist is fault like
+ */
+ if (!x86_pmu.intel_cap.pebs_trap)
+ return 1;
+
+ /*
+ * No LBR entry, no basic block, no rewinding
+ */
+ if (!cpuc->lbr_stack.nr || !from || !to)
+ return 0;
+
+ /*
+ * Basic blocks should never cross user/kernel boundaries
+ */
+ if (kernel_ip(ip) != kernel_ip(to))
+ return 0;
+
+ /*
+ * unsigned math, either ip is before the start (impossible) or
+ * the basic block is larger than 1 page (sanity)
+ */
+ if ((ip - to) > PEBS_FIXUP_SIZE)
+ return 0;
+
+ /*
+ * We sampled a branch insn, rewind using the LBR stack
+ */
+ if (ip == to) {
+ set_linear_ip(regs, from);
+ return 1;
+ }
+
+ size = ip - to;
+ if (!kernel_ip(ip)) {
+ int bytes;
+ u8 *buf = this_cpu_read(insn_buffer);
+
+ /* 'size' must fit our buffer, see above */
+ bytes = copy_from_user_nmi(buf, (void __user *)to, size);
+ if (bytes != 0)
+ return 0;
+
+ kaddr = buf;
+ } else {
+ kaddr = (void *)to;
+ }
+
+ do {
+ struct insn insn;
+
+ old_to = to;
+
+#ifdef CONFIG_X86_64
+ is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
+#endif
+ insn_init(&insn, kaddr, size, is_64bit);
+ insn_get_length(&insn);
+ /*
+ * Make sure there was not a problem decoding the
+ * instruction and getting the length. This is
+ * doubly important because we have an infinite
+ * loop if insn.length=0.
+ */
+ if (!insn.length)
+ break;
+
+ to += insn.length;
+ kaddr += insn.length;
+ size -= insn.length;
+ } while (to < ip);
+
+ if (to == ip) {
+ set_linear_ip(regs, old_to);
+ return 1;
+ }
+
+ /*
+ * Even though we decoded the basic block, the instruction stream
+ * never matched the given IP, either the TO or the IP got corrupted.
+ */
+ return 0;
+}
+
+static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs)
+{
+ if (pebs->tsx_tuning) {
+ union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
+ return tsx.cycles_last_block;
+ }
+ return 0;
+}
+
+static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs)
+{
+ u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
+
+ /* For RTM XABORTs also log the abort code from AX */
+ if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1))
+ txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT;
+ return txn;
+}
+
+static void setup_pebs_sample_data(struct perf_event *event,
+ struct pt_regs *iregs, void *__pebs,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+#define PERF_X86_EVENT_PEBS_HSW_PREC \
+ (PERF_X86_EVENT_PEBS_ST_HSW | \
+ PERF_X86_EVENT_PEBS_LD_HSW | \
+ PERF_X86_EVENT_PEBS_NA_HSW)
+ /*
+ * We cast to the biggest pebs_record but are careful not to
+ * unconditionally access the 'extra' entries.
+ */
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct pebs_record_skl *pebs = __pebs;
+ u64 sample_type;
+ int fll, fst, dsrc;
+ int fl = event->hw.flags;
+
+ if (pebs == NULL)
+ return;
+
+ sample_type = event->attr.sample_type;
+ dsrc = sample_type & PERF_SAMPLE_DATA_SRC;
+
+ fll = fl & PERF_X86_EVENT_PEBS_LDLAT;
+ fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
+
+ perf_sample_data_init(data, 0, event->hw.last_period);
+
+ data->period = event->hw.last_period;
+
+ /*
+ * Use latency for weight (only avail with PEBS-LL)
+ */
+ if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
+ data->weight = pebs->lat;
+
+ /*
+ * data.data_src encodes the data source
+ */
+ if (dsrc) {
+ u64 val = PERF_MEM_NA;
+ if (fll)
+ val = load_latency_data(pebs->dse);
+ else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
+ val = precise_datala_hsw(event, pebs->dse);
+ else if (fst)
+ val = precise_store_data(pebs->dse);
+ data->data_src.val = val;
+ }
+
+ /*
+ * We must however always use iregs for the unwinder to stay sane; the
+ * record BP,SP,IP can point into thin air when the record is from a
+ * previous PMI context or an (I)RET happend between the record and
+ * PMI.
+ */
+ if (sample_type & PERF_SAMPLE_CALLCHAIN)
+ data->callchain = perf_callchain(event, iregs);
+
+ /*
+ * We use the interrupt regs as a base because the PEBS record does not
+ * contain a full regs set, specifically it seems to lack segment
+ * descriptors, which get used by things like user_mode().
+ *
+ * In the simple case fix up only the IP for PERF_SAMPLE_IP.
+ */
+ *regs = *iregs;
+
+ /*
+ * Initialize regs_>flags from PEBS,
+ * Clear exact bit (which uses x86 EFLAGS Reserved bit 3),
+ * i.e., do not rely on it being zero:
+ */
+ regs->flags = pebs->flags & ~PERF_EFLAGS_EXACT;
+
+ if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ regs->ax = pebs->ax;
+ regs->bx = pebs->bx;
+ regs->cx = pebs->cx;
+ regs->dx = pebs->dx;
+ regs->si = pebs->si;
+ regs->di = pebs->di;
+
+ regs->bp = pebs->bp;
+ regs->sp = pebs->sp;
+
+#ifndef CONFIG_X86_32
+ regs->r8 = pebs->r8;
+ regs->r9 = pebs->r9;
+ regs->r10 = pebs->r10;
+ regs->r11 = pebs->r11;
+ regs->r12 = pebs->r12;
+ regs->r13 = pebs->r13;
+ regs->r14 = pebs->r14;
+ regs->r15 = pebs->r15;
+#endif
+ }
+
+ if (event->attr.precise_ip > 1) {
+ /*
+ * Haswell and later processors have an 'eventing IP'
+ * (real IP) which fixes the off-by-1 skid in hardware.
+ * Use it when precise_ip >= 2 :
+ */
+ if (x86_pmu.intel_cap.pebs_format >= 2) {
+ set_linear_ip(regs, pebs->real_ip);
+ regs->flags |= PERF_EFLAGS_EXACT;
+ } else {
+ /* Otherwise, use PEBS off-by-1 IP: */
+ set_linear_ip(regs, pebs->ip);
+
+ /*
+ * With precise_ip >= 2, try to fix up the off-by-1 IP
+ * using the LBR. If successful, the fixup function
+ * corrects regs->ip and calls set_linear_ip() on regs:
+ */
+ if (intel_pmu_pebs_fixup_ip(regs))
+ regs->flags |= PERF_EFLAGS_EXACT;
+ }
+ } else {
+ /*
+ * When precise_ip == 1, return the PEBS off-by-1 IP,
+ * no fixup attempted:
+ */
+ set_linear_ip(regs, pebs->ip);
+ }
+
+
+ if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) &&
+ x86_pmu.intel_cap.pebs_format >= 1)
+ data->addr = pebs->dla;
+
+ if (x86_pmu.intel_cap.pebs_format >= 2) {
+ /* Only set the TSX weight when no memory weight. */
+ if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
+ data->weight = intel_hsw_weight(pebs);
+
+ if (sample_type & PERF_SAMPLE_TRANSACTION)
+ data->txn = intel_hsw_transaction(pebs);
+ }
+
+ /*
+ * v3 supplies an accurate time stamp, so we use that
+ * for the time stamp.
+ *
+ * We can only do this for the default trace clock.
+ */
+ if (x86_pmu.intel_cap.pebs_format >= 3 &&
+ event->attr.use_clockid == 0)
+ data->time = native_sched_clock_from_tsc(pebs->tsc);
+
+ if (has_branch_stack(event))
+ data->br_stack = &cpuc->lbr_stack;
+}
+
+static inline void *
+get_next_pebs_record_by_bit(void *base, void *top, int bit)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ void *at;
+ u64 pebs_status;
+
+ /*
+ * fmt0 does not have a status bitfield (does not use
+ * perf_record_nhm format)
+ */
+ if (x86_pmu.intel_cap.pebs_format < 1)
+ return base;
+
+ if (base == NULL)
+ return NULL;
+
+ for (at = base; at < top; at += x86_pmu.pebs_record_size) {
+ struct pebs_record_nhm *p = at;
+
+ if (test_bit(bit, (unsigned long *)&p->status)) {
+ /* PEBS v3 has accurate status bits */
+ if (x86_pmu.intel_cap.pebs_format >= 3)
+ return at;
+
+ if (p->status == (1 << bit))
+ return at;
+
+ /* clear non-PEBS bit and re-check */
+ pebs_status = p->status & cpuc->pebs_enabled;
+ pebs_status &= PEBS_COUNTER_MASK;
+ if (pebs_status == (1 << bit))
+ return at;
+ }
+ }
+ return NULL;
+}
+
+void intel_pmu_auto_reload_read(struct perf_event *event)
+{
+ WARN_ON(!(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD));
+
+ perf_pmu_disable(event->pmu);
+ intel_pmu_drain_pebs_buffer();
+ perf_pmu_enable(event->pmu);
+}
+
+/*
+ * Special variant of intel_pmu_save_and_restart() for auto-reload.
+ */
+static int
+intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int shift = 64 - x86_pmu.cntval_bits;
+ u64 period = hwc->sample_period;
+ u64 prev_raw_count, new_raw_count;
+ s64 new, old;
+
+ WARN_ON(!period);
+
+ /*
+ * drain_pebs() only happens when the PMU is disabled.
+ */
+ WARN_ON(this_cpu_read(cpu_hw_events.enabled));
+
+ prev_raw_count = local64_read(&hwc->prev_count);
+ rdpmcl(hwc->event_base_rdpmc, new_raw_count);
+ local64_set(&hwc->prev_count, new_raw_count);
+
+ /*
+ * Since the counter increments a negative counter value and
+ * overflows on the sign switch, giving the interval:
+ *
+ * [-period, 0]
+ *
+ * the difference between two consequtive reads is:
+ *
+ * A) value2 - value1;
+ * when no overflows have happened in between,
+ *
+ * B) (0 - value1) + (value2 - (-period));
+ * when one overflow happened in between,
+ *
+ * C) (0 - value1) + (n - 1) * (period) + (value2 - (-period));
+ * when @n overflows happened in between.
+ *
+ * Here A) is the obvious difference, B) is the extension to the
+ * discrete interval, where the first term is to the top of the
+ * interval and the second term is from the bottom of the next
+ * interval and C) the extension to multiple intervals, where the
+ * middle term is the whole intervals covered.
+ *
+ * An equivalent of C, by reduction, is:
+ *
+ * value2 - value1 + n * period
+ */
+ new = ((s64)(new_raw_count << shift) >> shift);
+ old = ((s64)(prev_raw_count << shift) >> shift);
+ local64_add(new - old + count * period, &event->count);
+
+ local64_set(&hwc->period_left, -new);
+
+ perf_event_update_userpage(event);
+
+ return 0;
+}
+
+static void __intel_pmu_pebs_event(struct perf_event *event,
+ struct pt_regs *iregs,
+ void *base, void *top,
+ int bit, int count)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_sample_data data;
+ struct pt_regs regs;
+ void *at = get_next_pebs_record_by_bit(base, top, bit);
+
+ if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
+ /*
+ * Now, auto-reload is only enabled in fixed period mode.
+ * The reload value is always hwc->sample_period.
+ * May need to change it, if auto-reload is enabled in
+ * freq mode later.
+ */
+ intel_pmu_save_and_restart_reload(event, count);
+ } else if (!intel_pmu_save_and_restart(event))
+ return;
+
+ while (count > 1) {
+ setup_pebs_sample_data(event, iregs, at, &data, &regs);
+ perf_event_output(event, &data, &regs);
+ at += x86_pmu.pebs_record_size;
+ at = get_next_pebs_record_by_bit(at, top, bit);
+ count--;
+ }
+
+ setup_pebs_sample_data(event, iregs, at, &data, &regs);
+
+ /*
+ * All but the last records are processed.
+ * The last one is left to be able to call the overflow handler.
+ */
+ if (perf_event_overflow(event, &data, &regs)) {
+ x86_pmu_stop(event, 0);
+ return;
+ }
+
+}
+
+static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct debug_store *ds = cpuc->ds;
+ struct perf_event *event = cpuc->events[0]; /* PMC0 only */
+ struct pebs_record_core *at, *top;
+ int n;
+
+ if (!x86_pmu.pebs_active)
+ return;
+
+ at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
+ top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
+
+ /*
+ * Whatever else happens, drain the thing
+ */
+ ds->pebs_index = ds->pebs_buffer_base;
+
+ if (!test_bit(0, cpuc->active_mask))
+ return;
+
+ WARN_ON_ONCE(!event);
+
+ if (!event->attr.precise_ip)
+ return;
+
+ n = top - at;
+ if (n <= 0) {
+ if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
+ intel_pmu_save_and_restart_reload(event, 0);
+ return;
+ }
+
+ __intel_pmu_pebs_event(event, iregs, at, top, 0, n);
+}
+
+static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct debug_store *ds = cpuc->ds;
+ struct perf_event *event;
+ void *base, *at, *top;
+ short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
+ short error[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
+ int bit, i, size;
+ u64 mask;
+
+ if (!x86_pmu.pebs_active)
+ return;
+
+ base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+ top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
+
+ ds->pebs_index = ds->pebs_buffer_base;
+
+ mask = (1ULL << x86_pmu.max_pebs_events) - 1;
+ size = x86_pmu.max_pebs_events;
+ if (x86_pmu.flags & PMU_FL_PEBS_ALL) {
+ mask |= ((1ULL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED;
+ size = INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed;
+ }
+
+ if (unlikely(base >= top)) {
+ /*
+ * The drain_pebs() could be called twice in a short period
+ * for auto-reload event in pmu::read(). There are no
+ * overflows have happened in between.
+ * It needs to call intel_pmu_save_and_restart_reload() to
+ * update the event->count for this case.
+ */
+ for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled,
+ size) {
+ event = cpuc->events[bit];
+ if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
+ intel_pmu_save_and_restart_reload(event, 0);
+ }
+ return;
+ }
+
+ for (at = base; at < top; at += x86_pmu.pebs_record_size) {
+ struct pebs_record_nhm *p = at;
+ u64 pebs_status;
+
+ pebs_status = p->status & cpuc->pebs_enabled;
+ pebs_status &= mask;
+
+ /* PEBS v3 has more accurate status bits */
+ if (x86_pmu.intel_cap.pebs_format >= 3) {
+ for_each_set_bit(bit, (unsigned long *)&pebs_status,
+ size)
+ counts[bit]++;
+
+ continue;
+ }
+
+ /*
+ * On some CPUs the PEBS status can be zero when PEBS is
+ * racing with clearing of GLOBAL_STATUS.
+ *
+ * Normally we would drop that record, but in the
+ * case when there is only a single active PEBS event
+ * we can assume it's for that event.
+ */
+ if (!pebs_status && cpuc->pebs_enabled &&
+ !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1)))
+ pebs_status = p->status = cpuc->pebs_enabled;
+
+ bit = find_first_bit((unsigned long *)&pebs_status,
+ x86_pmu.max_pebs_events);
+ if (bit >= x86_pmu.max_pebs_events)
+ continue;
+
+ /*
+ * The PEBS hardware does not deal well with the situation
+ * when events happen near to each other and multiple bits
+ * are set. But it should happen rarely.
+ *
+ * If these events include one PEBS and multiple non-PEBS
+ * events, it doesn't impact PEBS record. The record will
+ * be handled normally. (slow path)
+ *
+ * If these events include two or more PEBS events, the
+ * records for the events can be collapsed into a single
+ * one, and it's not possible to reconstruct all events
+ * that caused the PEBS record. It's called collision.
+ * If collision happened, the record will be dropped.
+ */
+ if (p->status != (1ULL << bit)) {
+ for_each_set_bit(i, (unsigned long *)&pebs_status,
+ x86_pmu.max_pebs_events)
+ error[i]++;
+ continue;
+ }
+
+ counts[bit]++;
+ }
+
+ for (bit = 0; bit < size; bit++) {
+ if ((counts[bit] == 0) && (error[bit] == 0))
+ continue;
+
+ event = cpuc->events[bit];
+ if (WARN_ON_ONCE(!event))
+ continue;
+
+ if (WARN_ON_ONCE(!event->attr.precise_ip))
+ continue;
+
+ /* log dropped samples number */
+ if (error[bit]) {
+ perf_log_lost_samples(event, error[bit]);
+
+ if (perf_event_account_interrupt(event))
+ x86_pmu_stop(event, 0);
+ }
+
+ if (counts[bit]) {
+ __intel_pmu_pebs_event(event, iregs, base,
+ top, bit, counts[bit]);
+ }
+ }
+}
+
+/*
+ * BTS, PEBS probe and setup
+ */
+
+void __init intel_ds_init(void)
+{
+ /*
+ * No support for 32bit formats
+ */
+ if (!boot_cpu_has(X86_FEATURE_DTES64))
+ return;
+
+ x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
+ x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
+ x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE;
+ if (x86_pmu.pebs) {
+ char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-';
+ int format = x86_pmu.intel_cap.pebs_format;
+
+ switch (format) {
+ case 0:
+ pr_cont("PEBS fmt0%c, ", pebs_type);
+ x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
+ /*
+ * Using >PAGE_SIZE buffers makes the WRMSR to
+ * PERF_GLOBAL_CTRL in intel_pmu_enable_all()
+ * mysteriously hang on Core2.
+ *
+ * As a workaround, we don't do this.
+ */
+ x86_pmu.pebs_buffer_size = PAGE_SIZE;
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
+ break;
+
+ case 1:
+ pr_cont("PEBS fmt1%c, ", pebs_type);
+ x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+ break;
+
+ case 2:
+ pr_cont("PEBS fmt2%c, ", pebs_type);
+ x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw);
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+ break;
+
+ case 3:
+ pr_cont("PEBS fmt3%c, ", pebs_type);
+ x86_pmu.pebs_record_size =
+ sizeof(struct pebs_record_skl);
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+ x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME;
+ break;
+
+ default:
+ pr_cont("no PEBS fmt%d%c, ", format, pebs_type);
+ x86_pmu.pebs = 0;
+ }
+ }
+}
+
+void perf_restore_debug_store(void)
+{
+ struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
+
+ if (!x86_pmu.bts && !x86_pmu.pebs)
+ return;
+
+ wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds);
+}
diff --git a/arch/x86/events/intel/knc.c b/arch/x86/events/intel/knc.c
new file mode 100644
index 000000000..618001c20
--- /dev/null
+++ b/arch/x86/events/intel/knc.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Driver for Intel Xeon Phi "Knights Corner" PMU */
+
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include <asm/hardirq.h>
+
+#include "../perf_event.h"
+
+static const u64 knc_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x002a,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x0016,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0028,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0029,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0012,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x002b,
+};
+
+static const u64 __initconst knc_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ /* On Xeon Phi event "0" is a valid DATA_READ */
+ /* (L1 Data Cache Reads) Instruction. */
+ /* We code this as ARCH_PERFMON_EVENTSEL_INT as this */
+ /* bit will always be set in x86_pmu_hw_config(). */
+ [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
+ /* DATA_READ */
+ [ C(RESULT_MISS) ] = 0x0003, /* DATA_READ_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */
+ [ C(RESULT_MISS) ] = 0x0004, /* DATA_WRITE_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0011, /* L1_DATA_PF1 */
+ [ C(RESULT_MISS) ] = 0x001c, /* L1_DATA_PF1_MISS */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */
+ [ C(RESULT_MISS) ] = 0x000e, /* CODE_CACHE_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0x10cb, /* L2_READ_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x10cc, /* L2_WRITE_HIT */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x10fc, /* L2_DATA_PF2 */
+ [ C(RESULT_MISS) ] = 0x10fe, /* L2_DATA_PF2_MISS */
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
+ /* DATA_READ */
+ /* see note on L1 OP_READ */
+ [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */
+ [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */
+ [ C(RESULT_MISS) ] = 0x000d, /* CODE_PAGE_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0012, /* BRANCHES */
+ [ C(RESULT_MISS) ] = 0x002b, /* BRANCHES_MISPREDICTED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+
+static u64 knc_pmu_event_map(int hw_event)
+{
+ return knc_perfmon_event_map[hw_event];
+}
+
+static struct event_constraint knc_event_constraints[] =
+{
+ INTEL_EVENT_CONSTRAINT(0xc3, 0x1), /* HWP_L2HIT */
+ INTEL_EVENT_CONSTRAINT(0xc4, 0x1), /* HWP_L2MISS */
+ INTEL_EVENT_CONSTRAINT(0xc8, 0x1), /* L2_READ_HIT_E */
+ INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* L2_READ_HIT_M */
+ INTEL_EVENT_CONSTRAINT(0xca, 0x1), /* L2_READ_HIT_S */
+ INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* L2_READ_MISS */
+ INTEL_EVENT_CONSTRAINT(0xcc, 0x1), /* L2_WRITE_HIT */
+ INTEL_EVENT_CONSTRAINT(0xce, 0x1), /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */
+ INTEL_EVENT_CONSTRAINT(0xcf, 0x1), /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */
+ INTEL_EVENT_CONSTRAINT(0xd7, 0x1), /* L2_VICTIM_REQ_WITH_DATA */
+ INTEL_EVENT_CONSTRAINT(0xe3, 0x1), /* SNP_HITM_BUNIT */
+ INTEL_EVENT_CONSTRAINT(0xe6, 0x1), /* SNP_HIT_L2 */
+ INTEL_EVENT_CONSTRAINT(0xe7, 0x1), /* SNP_HITM_L2 */
+ INTEL_EVENT_CONSTRAINT(0xf1, 0x1), /* L2_DATA_READ_MISS_CACHE_FILL */
+ INTEL_EVENT_CONSTRAINT(0xf2, 0x1), /* L2_DATA_WRITE_MISS_CACHE_FILL */
+ INTEL_EVENT_CONSTRAINT(0xf6, 0x1), /* L2_DATA_READ_MISS_MEM_FILL */
+ INTEL_EVENT_CONSTRAINT(0xf7, 0x1), /* L2_DATA_WRITE_MISS_MEM_FILL */
+ INTEL_EVENT_CONSTRAINT(0xfc, 0x1), /* L2_DATA_PF2 */
+ INTEL_EVENT_CONSTRAINT(0xfd, 0x1), /* L2_DATA_PF2_DROP */
+ INTEL_EVENT_CONSTRAINT(0xfe, 0x1), /* L2_DATA_PF2_MISS */
+ INTEL_EVENT_CONSTRAINT(0xff, 0x1), /* L2_DATA_HIT_INFLIGHT_PF2 */
+ EVENT_CONSTRAINT_END
+};
+
+#define MSR_KNC_IA32_PERF_GLOBAL_STATUS 0x0000002d
+#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL 0x0000002e
+#define MSR_KNC_IA32_PERF_GLOBAL_CTRL 0x0000002f
+
+#define KNC_ENABLE_COUNTER0 0x00000001
+#define KNC_ENABLE_COUNTER1 0x00000002
+
+static void knc_pmu_disable_all(void)
+{
+ u64 val;
+
+ rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+ val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
+ wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+}
+
+static void knc_pmu_enable_all(int added)
+{
+ u64 val;
+
+ rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+ val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
+ wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+}
+
+static inline void
+knc_pmu_disable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 val;
+
+ val = hwc->config;
+ val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+
+ (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
+}
+
+static void knc_pmu_enable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 val;
+
+ val = hwc->config;
+ val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+
+ (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
+}
+
+static inline u64 knc_pmu_get_status(void)
+{
+ u64 status;
+
+ rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status);
+
+ return status;
+}
+
+static inline void knc_pmu_ack_status(u64 ack)
+{
+ wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack);
+}
+
+static int knc_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_events *cpuc;
+ int handled = 0;
+ int bit, loops;
+ u64 status;
+
+ cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ knc_pmu_disable_all();
+
+ status = knc_pmu_get_status();
+ if (!status) {
+ knc_pmu_enable_all(0);
+ return handled;
+ }
+
+ loops = 0;
+again:
+ knc_pmu_ack_status(status);
+ if (++loops > 100) {
+ WARN_ONCE(1, "perf: irq loop stuck!\n");
+ perf_event_print_debug();
+ goto done;
+ }
+
+ inc_irq_stat(apic_perf_irqs);
+
+ for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+ struct perf_event *event = cpuc->events[bit];
+
+ handled++;
+
+ if (!test_bit(bit, cpuc->active_mask))
+ continue;
+
+ if (!intel_pmu_save_and_restart(event))
+ continue;
+
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+
+ if (perf_event_overflow(event, &data, regs))
+ x86_pmu_stop(event, 0);
+ }
+
+ /*
+ * Repeat if there is more work to be done:
+ */
+ status = knc_pmu_get_status();
+ if (status)
+ goto again;
+
+done:
+ /* Only restore PMU state when it's active. See x86_pmu_disable(). */
+ if (cpuc->enabled)
+ knc_pmu_enable_all(0);
+
+ return handled;
+}
+
+
+PMU_FORMAT_ATTR(event, "config:0-7" );
+PMU_FORMAT_ATTR(umask, "config:8-15" );
+PMU_FORMAT_ATTR(edge, "config:18" );
+PMU_FORMAT_ATTR(inv, "config:23" );
+PMU_FORMAT_ATTR(cmask, "config:24-31" );
+
+static struct attribute *intel_knc_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+ NULL,
+};
+
+static const struct x86_pmu knc_pmu __initconst = {
+ .name = "knc",
+ .handle_irq = knc_pmu_handle_irq,
+ .disable_all = knc_pmu_disable_all,
+ .enable_all = knc_pmu_enable_all,
+ .enable = knc_pmu_enable_event,
+ .disable = knc_pmu_disable_event,
+ .hw_config = x86_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
+ .eventsel = MSR_KNC_EVNTSEL0,
+ .perfctr = MSR_KNC_PERFCTR0,
+ .event_map = knc_pmu_event_map,
+ .max_events = ARRAY_SIZE(knc_perfmon_event_map),
+ .apic = 1,
+ .max_period = (1ULL << 39) - 1,
+ .version = 0,
+ .num_counters = 2,
+ .cntval_bits = 40,
+ .cntval_mask = (1ULL << 40) - 1,
+ .get_event_constraints = x86_get_event_constraints,
+ .event_constraints = knc_event_constraints,
+ .format_attrs = intel_knc_formats_attr,
+};
+
+__init int knc_pmu_init(void)
+{
+ x86_pmu = knc_pmu;
+
+ memcpy(hw_cache_event_ids, knc_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ return 0;
+}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
new file mode 100644
index 000000000..c88ed3958
--- /dev/null
+++ b/arch/x86/events/intel/lbr.c
@@ -0,0 +1,1279 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include <asm/perf_event.h>
+#include <asm/msr.h>
+#include <asm/insn.h>
+
+#include "../perf_event.h"
+
+enum {
+ LBR_FORMAT_32 = 0x00,
+ LBR_FORMAT_LIP = 0x01,
+ LBR_FORMAT_EIP = 0x02,
+ LBR_FORMAT_EIP_FLAGS = 0x03,
+ LBR_FORMAT_EIP_FLAGS2 = 0x04,
+ LBR_FORMAT_INFO = 0x05,
+ LBR_FORMAT_TIME = 0x06,
+ LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME,
+};
+
+static const enum {
+ LBR_EIP_FLAGS = 1,
+ LBR_TSX = 2,
+} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = {
+ [LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS,
+ [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX,
+};
+
+/*
+ * Intel LBR_SELECT bits
+ * Intel Vol3a, April 2011, Section 16.7 Table 16-10
+ *
+ * Hardware branch filter (not available on all CPUs)
+ */
+#define LBR_KERNEL_BIT 0 /* do not capture at ring0 */
+#define LBR_USER_BIT 1 /* do not capture at ring > 0 */
+#define LBR_JCC_BIT 2 /* do not capture conditional branches */
+#define LBR_REL_CALL_BIT 3 /* do not capture relative calls */
+#define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */
+#define LBR_RETURN_BIT 5 /* do not capture near returns */
+#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */
+#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */
+#define LBR_FAR_BIT 8 /* do not capture far branches */
+#define LBR_CALL_STACK_BIT 9 /* enable call stack */
+
+/*
+ * Following bit only exists in Linux; we mask it out before writing it to
+ * the actual MSR. But it helps the constraint perf code to understand
+ * that this is a separate configuration.
+ */
+#define LBR_NO_INFO_BIT 63 /* don't read LBR_INFO. */
+
+#define LBR_KERNEL (1 << LBR_KERNEL_BIT)
+#define LBR_USER (1 << LBR_USER_BIT)
+#define LBR_JCC (1 << LBR_JCC_BIT)
+#define LBR_REL_CALL (1 << LBR_REL_CALL_BIT)
+#define LBR_IND_CALL (1 << LBR_IND_CALL_BIT)
+#define LBR_RETURN (1 << LBR_RETURN_BIT)
+#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT)
+#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
+#define LBR_FAR (1 << LBR_FAR_BIT)
+#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT)
+#define LBR_NO_INFO (1ULL << LBR_NO_INFO_BIT)
+
+#define LBR_PLM (LBR_KERNEL | LBR_USER)
+
+#define LBR_SEL_MASK 0x3ff /* valid bits in LBR_SELECT */
+#define LBR_NOT_SUPP -1 /* LBR filter not supported */
+#define LBR_IGN 0 /* ignored */
+
+#define LBR_ANY \
+ (LBR_JCC |\
+ LBR_REL_CALL |\
+ LBR_IND_CALL |\
+ LBR_RETURN |\
+ LBR_REL_JMP |\
+ LBR_IND_JMP |\
+ LBR_FAR)
+
+#define LBR_FROM_FLAG_MISPRED BIT_ULL(63)
+#define LBR_FROM_FLAG_IN_TX BIT_ULL(62)
+#define LBR_FROM_FLAG_ABORT BIT_ULL(61)
+
+#define LBR_FROM_SIGNEXT_2MSB (BIT_ULL(60) | BIT_ULL(59))
+
+/*
+ * x86control flow change classification
+ * x86control flow changes include branches, interrupts, traps, faults
+ */
+enum {
+ X86_BR_NONE = 0, /* unknown */
+
+ X86_BR_USER = 1 << 0, /* branch target is user */
+ X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
+
+ X86_BR_CALL = 1 << 2, /* call */
+ X86_BR_RET = 1 << 3, /* return */
+ X86_BR_SYSCALL = 1 << 4, /* syscall */
+ X86_BR_SYSRET = 1 << 5, /* syscall return */
+ X86_BR_INT = 1 << 6, /* sw interrupt */
+ X86_BR_IRET = 1 << 7, /* return from interrupt */
+ X86_BR_JCC = 1 << 8, /* conditional */
+ X86_BR_JMP = 1 << 9, /* jump */
+ X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
+ X86_BR_IND_CALL = 1 << 11,/* indirect calls */
+ X86_BR_ABORT = 1 << 12,/* transaction abort */
+ X86_BR_IN_TX = 1 << 13,/* in transaction */
+ X86_BR_NO_TX = 1 << 14,/* not in transaction */
+ X86_BR_ZERO_CALL = 1 << 15,/* zero length call */
+ X86_BR_CALL_STACK = 1 << 16,/* call stack */
+ X86_BR_IND_JMP = 1 << 17,/* indirect jump */
+
+ X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */
+
+};
+
+#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
+#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX)
+
+#define X86_BR_ANY \
+ (X86_BR_CALL |\
+ X86_BR_RET |\
+ X86_BR_SYSCALL |\
+ X86_BR_SYSRET |\
+ X86_BR_INT |\
+ X86_BR_IRET |\
+ X86_BR_JCC |\
+ X86_BR_JMP |\
+ X86_BR_IRQ |\
+ X86_BR_ABORT |\
+ X86_BR_IND_CALL |\
+ X86_BR_IND_JMP |\
+ X86_BR_ZERO_CALL)
+
+#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
+
+#define X86_BR_ANY_CALL \
+ (X86_BR_CALL |\
+ X86_BR_IND_CALL |\
+ X86_BR_ZERO_CALL |\
+ X86_BR_SYSCALL |\
+ X86_BR_IRQ |\
+ X86_BR_INT)
+
+static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
+
+/*
+ * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
+ * otherwise it becomes near impossible to get a reliable stack.
+ */
+
+static void __intel_pmu_lbr_enable(bool pmi)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 debugctl, lbr_select = 0, orig_debugctl;
+
+ /*
+ * No need to unfreeze manually, as v4 can do that as part
+ * of the GLOBAL_STATUS ack.
+ */
+ if (pmi && x86_pmu.version >= 4)
+ return;
+
+ /*
+ * No need to reprogram LBR_SELECT in a PMI, as it
+ * did not change.
+ */
+ if (cpuc->lbr_sel)
+ lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask;
+ if (!pmi && cpuc->lbr_sel)
+ wrmsrl(MSR_LBR_SELECT, lbr_select);
+
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ orig_debugctl = debugctl;
+ debugctl |= DEBUGCTLMSR_LBR;
+ /*
+ * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
+ * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
+ * may cause superfluous increase/decrease of LBR_TOS.
+ */
+ if (!(lbr_select & LBR_CALL_STACK))
+ debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+ if (orig_debugctl != debugctl)
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
+static void __intel_pmu_lbr_disable(void)
+{
+ u64 debugctl;
+
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
+static void intel_pmu_lbr_reset_32(void)
+{
+ int i;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++)
+ wrmsrl(x86_pmu.lbr_from + i, 0);
+}
+
+static void intel_pmu_lbr_reset_64(void)
+{
+ int i;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ wrmsrl(x86_pmu.lbr_from + i, 0);
+ wrmsrl(x86_pmu.lbr_to + i, 0);
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ wrmsrl(MSR_LBR_INFO_0 + i, 0);
+ }
+}
+
+void intel_pmu_lbr_reset(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
+ intel_pmu_lbr_reset_32();
+ else
+ intel_pmu_lbr_reset_64();
+
+ cpuc->last_task_ctx = NULL;
+ cpuc->last_log_id = 0;
+}
+
+/*
+ * TOS = most recently recorded branch
+ */
+static inline u64 intel_pmu_lbr_tos(void)
+{
+ u64 tos;
+
+ rdmsrl(x86_pmu.lbr_tos, tos);
+ return tos;
+}
+
+enum {
+ LBR_NONE,
+ LBR_VALID,
+};
+
+/*
+ * For formats with LBR_TSX flags (e.g. LBR_FORMAT_EIP_FLAGS2), bits 61:62 in
+ * MSR_LAST_BRANCH_FROM_x are the TSX flags when TSX is supported, but when
+ * TSX is not supported they have no consistent behavior:
+ *
+ * - For wrmsr(), bits 61:62 are considered part of the sign extension.
+ * - For HW updates (branch captures) bits 61:62 are always OFF and are not
+ * part of the sign extension.
+ *
+ * Therefore, if:
+ *
+ * 1) LBR has TSX format
+ * 2) CPU has no TSX support enabled
+ *
+ * ... then any value passed to wrmsr() must be sign extended to 63 bits and any
+ * value from rdmsr() must be converted to have a 61 bits sign extension,
+ * ignoring the TSX flags.
+ */
+static inline bool lbr_from_signext_quirk_needed(void)
+{
+ int lbr_format = x86_pmu.intel_cap.lbr_format;
+ bool tsx_support = boot_cpu_has(X86_FEATURE_HLE) ||
+ boot_cpu_has(X86_FEATURE_RTM);
+
+ return !tsx_support && (lbr_desc[lbr_format] & LBR_TSX);
+}
+
+DEFINE_STATIC_KEY_FALSE(lbr_from_quirk_key);
+
+/* If quirk is enabled, ensure sign extension is 63 bits: */
+inline u64 lbr_from_signext_quirk_wr(u64 val)
+{
+ if (static_branch_unlikely(&lbr_from_quirk_key)) {
+ /*
+ * Sign extend into bits 61:62 while preserving bit 63.
+ *
+ * Quirk is enabled when TSX is disabled. Therefore TSX bits
+ * in val are always OFF and must be changed to be sign
+ * extension bits. Since bits 59:60 are guaranteed to be
+ * part of the sign extension bits, we can just copy them
+ * to 61:62.
+ */
+ val |= (LBR_FROM_SIGNEXT_2MSB & val) << 2;
+ }
+ return val;
+}
+
+/*
+ * If quirk is needed, ensure sign extension is 61 bits:
+ */
+static u64 lbr_from_signext_quirk_rd(u64 val)
+{
+ if (static_branch_unlikely(&lbr_from_quirk_key)) {
+ /*
+ * Quirk is on when TSX is not enabled. Therefore TSX
+ * flags must be read as OFF.
+ */
+ val &= ~(LBR_FROM_FLAG_IN_TX | LBR_FROM_FLAG_ABORT);
+ }
+ return val;
+}
+
+static inline void wrlbr_from(unsigned int idx, u64 val)
+{
+ val = lbr_from_signext_quirk_wr(val);
+ wrmsrl(x86_pmu.lbr_from + idx, val);
+}
+
+static inline void wrlbr_to(unsigned int idx, u64 val)
+{
+ wrmsrl(x86_pmu.lbr_to + idx, val);
+}
+
+static inline u64 rdlbr_from(unsigned int idx)
+{
+ u64 val;
+
+ rdmsrl(x86_pmu.lbr_from + idx, val);
+
+ return lbr_from_signext_quirk_rd(val);
+}
+
+static inline u64 rdlbr_to(unsigned int idx)
+{
+ u64 val;
+
+ rdmsrl(x86_pmu.lbr_to + idx, val);
+
+ return val;
+}
+
+static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int i;
+ unsigned lbr_idx, mask;
+ u64 tos;
+
+ if (task_ctx->lbr_callstack_users == 0 ||
+ task_ctx->lbr_stack_state == LBR_NONE) {
+ intel_pmu_lbr_reset();
+ return;
+ }
+
+ tos = task_ctx->tos;
+ /*
+ * Does not restore the LBR registers, if
+ * - No one else touched them, and
+ * - Did not enter C6
+ */
+ if ((task_ctx == cpuc->last_task_ctx) &&
+ (task_ctx->log_id == cpuc->last_log_id) &&
+ rdlbr_from(tos)) {
+ task_ctx->lbr_stack_state = LBR_NONE;
+ return;
+ }
+
+ mask = x86_pmu.lbr_nr - 1;
+ for (i = 0; i < task_ctx->valid_lbrs; i++) {
+ lbr_idx = (tos - i) & mask;
+ wrlbr_from(lbr_idx, task_ctx->lbr_from[i]);
+ wrlbr_to (lbr_idx, task_ctx->lbr_to[i]);
+
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
+ }
+
+ for (; i < x86_pmu.lbr_nr; i++) {
+ lbr_idx = (tos - i) & mask;
+ wrlbr_from(lbr_idx, 0);
+ wrlbr_to(lbr_idx, 0);
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0);
+ }
+
+ wrmsrl(x86_pmu.lbr_tos, tos);
+ task_ctx->lbr_stack_state = LBR_NONE;
+}
+
+static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ unsigned lbr_idx, mask;
+ u64 tos, from;
+ int i;
+
+ if (task_ctx->lbr_callstack_users == 0) {
+ task_ctx->lbr_stack_state = LBR_NONE;
+ return;
+ }
+
+ mask = x86_pmu.lbr_nr - 1;
+ tos = intel_pmu_lbr_tos();
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ lbr_idx = (tos - i) & mask;
+ from = rdlbr_from(lbr_idx);
+ if (!from)
+ break;
+ task_ctx->lbr_from[i] = from;
+ task_ctx->lbr_to[i] = rdlbr_to(lbr_idx);
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
+ }
+ task_ctx->valid_lbrs = i;
+ task_ctx->tos = tos;
+ task_ctx->lbr_stack_state = LBR_VALID;
+
+ cpuc->last_task_ctx = task_ctx;
+ cpuc->last_log_id = ++task_ctx->log_id;
+}
+
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct x86_perf_task_context *task_ctx;
+
+ if (!cpuc->lbr_users)
+ return;
+
+ /*
+ * If LBR callstack feature is enabled and the stack was saved when
+ * the task was scheduled out, restore the stack. Otherwise flush
+ * the LBR stack.
+ */
+ task_ctx = ctx ? ctx->task_ctx_data : NULL;
+ if (task_ctx) {
+ if (sched_in)
+ __intel_pmu_lbr_restore(task_ctx);
+ else
+ __intel_pmu_lbr_save(task_ctx);
+ return;
+ }
+
+ /*
+ * Since a context switch can flip the address space and LBR entries
+ * are not tagged with an identifier, we need to wipe the LBR, even for
+ * per-cpu events. You simply cannot resolve the branches from the old
+ * address space.
+ */
+ if (sched_in)
+ intel_pmu_lbr_reset();
+}
+
+static inline bool branch_user_callstack(unsigned br_sel)
+{
+ return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
+}
+
+void intel_pmu_lbr_add(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct x86_perf_task_context *task_ctx;
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ cpuc->br_sel = event->hw.branch_reg.reg;
+
+ if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) {
+ task_ctx = event->ctx->task_ctx_data;
+ task_ctx->lbr_callstack_users++;
+ }
+
+ /*
+ * Request pmu::sched_task() callback, which will fire inside the
+ * regular perf event scheduling, so that call will:
+ *
+ * - restore or wipe; when LBR-callstack,
+ * - wipe; otherwise,
+ *
+ * when this is from __perf_event_task_sched_in().
+ *
+ * However, if this is from perf_install_in_context(), no such callback
+ * will follow and we'll need to reset the LBR here if this is the
+ * first LBR event.
+ *
+ * The problem is, we cannot tell these cases apart... but we can
+ * exclude the biggest chunk of cases by looking at
+ * event->total_time_running. An event that has accrued runtime cannot
+ * be 'new'. Conversely, a new event can get installed through the
+ * context switch path for the first time.
+ */
+ perf_sched_cb_inc(event->ctx->pmu);
+ if (!cpuc->lbr_users++ && !event->total_time_running)
+ intel_pmu_lbr_reset();
+}
+
+void intel_pmu_lbr_del(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct x86_perf_task_context *task_ctx;
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ if (branch_user_callstack(cpuc->br_sel) &&
+ event->ctx->task_ctx_data) {
+ task_ctx = event->ctx->task_ctx_data;
+ task_ctx->lbr_callstack_users--;
+ }
+
+ cpuc->lbr_users--;
+ WARN_ON_ONCE(cpuc->lbr_users < 0);
+ perf_sched_cb_dec(event->ctx->pmu);
+}
+
+void intel_pmu_lbr_enable_all(bool pmi)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (cpuc->lbr_users)
+ __intel_pmu_lbr_enable(pmi);
+}
+
+void intel_pmu_lbr_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (cpuc->lbr_users)
+ __intel_pmu_lbr_disable();
+}
+
+static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
+{
+ unsigned long mask = x86_pmu.lbr_nr - 1;
+ u64 tos = intel_pmu_lbr_tos();
+ int i;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ unsigned long lbr_idx = (tos - i) & mask;
+ union {
+ struct {
+ u32 from;
+ u32 to;
+ };
+ u64 lbr;
+ } msr_lastbranch;
+
+ rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
+
+ cpuc->lbr_entries[i].from = msr_lastbranch.from;
+ cpuc->lbr_entries[i].to = msr_lastbranch.to;
+ cpuc->lbr_entries[i].mispred = 0;
+ cpuc->lbr_entries[i].predicted = 0;
+ cpuc->lbr_entries[i].in_tx = 0;
+ cpuc->lbr_entries[i].abort = 0;
+ cpuc->lbr_entries[i].cycles = 0;
+ cpuc->lbr_entries[i].type = 0;
+ cpuc->lbr_entries[i].reserved = 0;
+ }
+ cpuc->lbr_stack.nr = i;
+}
+
+/*
+ * Due to lack of segmentation in Linux the effective address (offset)
+ * is the same as the linear address, allowing us to merge the LIP and EIP
+ * LBR formats.
+ */
+static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
+{
+ bool need_info = false, call_stack = false;
+ unsigned long mask = x86_pmu.lbr_nr - 1;
+ int lbr_format = x86_pmu.intel_cap.lbr_format;
+ u64 tos = intel_pmu_lbr_tos();
+ int i;
+ int out = 0;
+ int num = x86_pmu.lbr_nr;
+
+ if (cpuc->lbr_sel) {
+ need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO);
+ if (cpuc->lbr_sel->config & LBR_CALL_STACK)
+ call_stack = true;
+ }
+
+ for (i = 0; i < num; i++) {
+ unsigned long lbr_idx = (tos - i) & mask;
+ u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
+ int skip = 0;
+ u16 cycles = 0;
+ int lbr_flags = lbr_desc[lbr_format];
+
+ from = rdlbr_from(lbr_idx);
+ to = rdlbr_to(lbr_idx);
+
+ /*
+ * Read LBR call stack entries
+ * until invalid entry (0s) is detected.
+ */
+ if (call_stack && !from)
+ break;
+
+ if (lbr_format == LBR_FORMAT_INFO && need_info) {
+ u64 info;
+
+ rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info);
+ mis = !!(info & LBR_INFO_MISPRED);
+ pred = !mis;
+ in_tx = !!(info & LBR_INFO_IN_TX);
+ abort = !!(info & LBR_INFO_ABORT);
+ cycles = (info & LBR_INFO_CYCLES);
+ }
+
+ if (lbr_format == LBR_FORMAT_TIME) {
+ mis = !!(from & LBR_FROM_FLAG_MISPRED);
+ pred = !mis;
+ skip = 1;
+ cycles = ((to >> 48) & LBR_INFO_CYCLES);
+
+ to = (u64)((((s64)to) << 16) >> 16);
+ }
+
+ if (lbr_flags & LBR_EIP_FLAGS) {
+ mis = !!(from & LBR_FROM_FLAG_MISPRED);
+ pred = !mis;
+ skip = 1;
+ }
+ if (lbr_flags & LBR_TSX) {
+ in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
+ abort = !!(from & LBR_FROM_FLAG_ABORT);
+ skip = 3;
+ }
+ from = (u64)((((s64)from) << skip) >> skip);
+
+ /*
+ * Some CPUs report duplicated abort records,
+ * with the second entry not having an abort bit set.
+ * Skip them here. This loop runs backwards,
+ * so we need to undo the previous record.
+ * If the abort just happened outside the window
+ * the extra entry cannot be removed.
+ */
+ if (abort && x86_pmu.lbr_double_abort && out > 0)
+ out--;
+
+ cpuc->lbr_entries[out].from = from;
+ cpuc->lbr_entries[out].to = to;
+ cpuc->lbr_entries[out].mispred = mis;
+ cpuc->lbr_entries[out].predicted = pred;
+ cpuc->lbr_entries[out].in_tx = in_tx;
+ cpuc->lbr_entries[out].abort = abort;
+ cpuc->lbr_entries[out].cycles = cycles;
+ cpuc->lbr_entries[out].type = 0;
+ cpuc->lbr_entries[out].reserved = 0;
+ out++;
+ }
+ cpuc->lbr_stack.nr = out;
+}
+
+void intel_pmu_lbr_read(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (!cpuc->lbr_users)
+ return;
+
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
+ intel_pmu_lbr_read_32(cpuc);
+ else
+ intel_pmu_lbr_read_64(cpuc);
+
+ intel_pmu_lbr_filter(cpuc);
+}
+
+/*
+ * SW filter is used:
+ * - in case there is no HW filter
+ * - in case the HW filter has errata or limitations
+ */
+static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
+{
+ u64 br_type = event->attr.branch_sample_type;
+ int mask = 0;
+
+ if (br_type & PERF_SAMPLE_BRANCH_USER)
+ mask |= X86_BR_USER;
+
+ if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
+ mask |= X86_BR_KERNEL;
+
+ /* we ignore BRANCH_HV here */
+
+ if (br_type & PERF_SAMPLE_BRANCH_ANY)
+ mask |= X86_BR_ANY;
+
+ if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
+ mask |= X86_BR_ANY_CALL;
+
+ if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
+ mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
+
+ if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
+ mask |= X86_BR_IND_CALL;
+
+ if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX)
+ mask |= X86_BR_ABORT;
+
+ if (br_type & PERF_SAMPLE_BRANCH_IN_TX)
+ mask |= X86_BR_IN_TX;
+
+ if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
+ mask |= X86_BR_NO_TX;
+
+ if (br_type & PERF_SAMPLE_BRANCH_COND)
+ mask |= X86_BR_JCC;
+
+ if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
+ if (!x86_pmu_has_lbr_callstack())
+ return -EOPNOTSUPP;
+ if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
+ return -EINVAL;
+ mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
+ X86_BR_CALL_STACK;
+ }
+
+ if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP)
+ mask |= X86_BR_IND_JMP;
+
+ if (br_type & PERF_SAMPLE_BRANCH_CALL)
+ mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
+
+ if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE)
+ mask |= X86_BR_TYPE_SAVE;
+
+ /*
+ * stash actual user request into reg, it may
+ * be used by fixup code for some CPU
+ */
+ event->hw.branch_reg.reg = mask;
+ return 0;
+}
+
+/*
+ * setup the HW LBR filter
+ * Used only when available, may not be enough to disambiguate
+ * all branches, may need the help of the SW filter
+ */
+static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg;
+ u64 br_type = event->attr.branch_sample_type;
+ u64 mask = 0, v;
+ int i;
+
+ for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) {
+ if (!(br_type & (1ULL << i)))
+ continue;
+
+ v = x86_pmu.lbr_sel_map[i];
+ if (v == LBR_NOT_SUPP)
+ return -EOPNOTSUPP;
+
+ if (v != LBR_IGN)
+ mask |= v;
+ }
+
+ reg = &event->hw.branch_reg;
+ reg->idx = EXTRA_REG_LBR;
+
+ /*
+ * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
+ * in suppress mode. So LBR_SELECT should be set to
+ * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
+ * But the 10th bit LBR_CALL_STACK does not operate
+ * in suppress mode.
+ */
+ reg->config = mask ^ (x86_pmu.lbr_sel_mask & ~LBR_CALL_STACK);
+
+ if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) &&
+ (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
+ (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO))
+ reg->config |= LBR_NO_INFO;
+
+ return 0;
+}
+
+int intel_pmu_setup_lbr_filter(struct perf_event *event)
+{
+ int ret = 0;
+
+ /*
+ * no LBR on this PMU
+ */
+ if (!x86_pmu.lbr_nr)
+ return -EOPNOTSUPP;
+
+ /*
+ * setup SW LBR filter
+ */
+ ret = intel_pmu_setup_sw_lbr_filter(event);
+ if (ret)
+ return ret;
+
+ /*
+ * setup HW LBR filter, if any
+ */
+ if (x86_pmu.lbr_sel_map)
+ ret = intel_pmu_setup_hw_lbr_filter(event);
+
+ return ret;
+}
+
+/*
+ * return the type of control flow change at address "from"
+ * instruction is not necessarily a branch (in case of interrupt).
+ *
+ * The branch type returned also includes the priv level of the
+ * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
+ *
+ * If a branch type is unknown OR the instruction cannot be
+ * decoded (e.g., text page not present), then X86_BR_NONE is
+ * returned.
+ */
+static int branch_type(unsigned long from, unsigned long to, int abort)
+{
+ struct insn insn;
+ void *addr;
+ int bytes_read, bytes_left;
+ int ret = X86_BR_NONE;
+ int ext, to_plm, from_plm;
+ u8 buf[MAX_INSN_SIZE];
+ int is64 = 0;
+
+ to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
+ from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
+
+ /*
+ * maybe zero if lbr did not fill up after a reset by the time
+ * we get a PMU interrupt
+ */
+ if (from == 0 || to == 0)
+ return X86_BR_NONE;
+
+ if (abort)
+ return X86_BR_ABORT | to_plm;
+
+ if (from_plm == X86_BR_USER) {
+ /*
+ * can happen if measuring at the user level only
+ * and we interrupt in a kernel thread, e.g., idle.
+ */
+ if (!current->mm)
+ return X86_BR_NONE;
+
+ /* may fail if text not present */
+ bytes_left = copy_from_user_nmi(buf, (void __user *)from,
+ MAX_INSN_SIZE);
+ bytes_read = MAX_INSN_SIZE - bytes_left;
+ if (!bytes_read)
+ return X86_BR_NONE;
+
+ addr = buf;
+ } else {
+ /*
+ * The LBR logs any address in the IP, even if the IP just
+ * faulted. This means userspace can control the from address.
+ * Ensure we don't blindy read any address by validating it is
+ * a known text address.
+ */
+ if (kernel_text_address(from)) {
+ addr = (void *)from;
+ /*
+ * Assume we can get the maximum possible size
+ * when grabbing kernel data. This is not
+ * _strictly_ true since we could possibly be
+ * executing up next to a memory hole, but
+ * it is very unlikely to be a problem.
+ */
+ bytes_read = MAX_INSN_SIZE;
+ } else {
+ return X86_BR_NONE;
+ }
+ }
+
+ /*
+ * decoder needs to know the ABI especially
+ * on 64-bit systems running 32-bit apps
+ */
+#ifdef CONFIG_X86_64
+ is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
+#endif
+ insn_init(&insn, addr, bytes_read, is64);
+ insn_get_opcode(&insn);
+ if (!insn.opcode.got)
+ return X86_BR_ABORT;
+
+ switch (insn.opcode.bytes[0]) {
+ case 0xf:
+ switch (insn.opcode.bytes[1]) {
+ case 0x05: /* syscall */
+ case 0x34: /* sysenter */
+ ret = X86_BR_SYSCALL;
+ break;
+ case 0x07: /* sysret */
+ case 0x35: /* sysexit */
+ ret = X86_BR_SYSRET;
+ break;
+ case 0x80 ... 0x8f: /* conditional */
+ ret = X86_BR_JCC;
+ break;
+ default:
+ ret = X86_BR_NONE;
+ }
+ break;
+ case 0x70 ... 0x7f: /* conditional */
+ ret = X86_BR_JCC;
+ break;
+ case 0xc2: /* near ret */
+ case 0xc3: /* near ret */
+ case 0xca: /* far ret */
+ case 0xcb: /* far ret */
+ ret = X86_BR_RET;
+ break;
+ case 0xcf: /* iret */
+ ret = X86_BR_IRET;
+ break;
+ case 0xcc ... 0xce: /* int */
+ ret = X86_BR_INT;
+ break;
+ case 0xe8: /* call near rel */
+ insn_get_immediate(&insn);
+ if (insn.immediate1.value == 0) {
+ /* zero length call */
+ ret = X86_BR_ZERO_CALL;
+ break;
+ }
+ case 0x9a: /* call far absolute */
+ ret = X86_BR_CALL;
+ break;
+ case 0xe0 ... 0xe3: /* loop jmp */
+ ret = X86_BR_JCC;
+ break;
+ case 0xe9 ... 0xeb: /* jmp */
+ ret = X86_BR_JMP;
+ break;
+ case 0xff: /* call near absolute, call far absolute ind */
+ insn_get_modrm(&insn);
+ ext = (insn.modrm.bytes[0] >> 3) & 0x7;
+ switch (ext) {
+ case 2: /* near ind call */
+ case 3: /* far ind call */
+ ret = X86_BR_IND_CALL;
+ break;
+ case 4:
+ case 5:
+ ret = X86_BR_IND_JMP;
+ break;
+ }
+ break;
+ default:
+ ret = X86_BR_NONE;
+ }
+ /*
+ * interrupts, traps, faults (and thus ring transition) may
+ * occur on any instructions. Thus, to classify them correctly,
+ * we need to first look at the from and to priv levels. If they
+ * are different and to is in the kernel, then it indicates
+ * a ring transition. If the from instruction is not a ring
+ * transition instr (syscall, systenter, int), then it means
+ * it was a irq, trap or fault.
+ *
+ * we have no way of detecting kernel to kernel faults.
+ */
+ if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
+ && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
+ ret = X86_BR_IRQ;
+
+ /*
+ * branch priv level determined by target as
+ * is done by HW when LBR_SELECT is implemented
+ */
+ if (ret != X86_BR_NONE)
+ ret |= to_plm;
+
+ return ret;
+}
+
+#define X86_BR_TYPE_MAP_MAX 16
+
+static int branch_map[X86_BR_TYPE_MAP_MAX] = {
+ PERF_BR_CALL, /* X86_BR_CALL */
+ PERF_BR_RET, /* X86_BR_RET */
+ PERF_BR_SYSCALL, /* X86_BR_SYSCALL */
+ PERF_BR_SYSRET, /* X86_BR_SYSRET */
+ PERF_BR_UNKNOWN, /* X86_BR_INT */
+ PERF_BR_UNKNOWN, /* X86_BR_IRET */
+ PERF_BR_COND, /* X86_BR_JCC */
+ PERF_BR_UNCOND, /* X86_BR_JMP */
+ PERF_BR_UNKNOWN, /* X86_BR_IRQ */
+ PERF_BR_IND_CALL, /* X86_BR_IND_CALL */
+ PERF_BR_UNKNOWN, /* X86_BR_ABORT */
+ PERF_BR_UNKNOWN, /* X86_BR_IN_TX */
+ PERF_BR_UNKNOWN, /* X86_BR_NO_TX */
+ PERF_BR_CALL, /* X86_BR_ZERO_CALL */
+ PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */
+ PERF_BR_IND, /* X86_BR_IND_JMP */
+};
+
+static int
+common_branch_type(int type)
+{
+ int i;
+
+ type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */
+
+ if (type) {
+ i = __ffs(type);
+ if (i < X86_BR_TYPE_MAP_MAX)
+ return branch_map[i];
+ }
+
+ return PERF_BR_UNKNOWN;
+}
+
+/*
+ * implement actual branch filter based on user demand.
+ * Hardware may not exactly satisfy that request, thus
+ * we need to inspect opcodes. Mismatched branches are
+ * discarded. Therefore, the number of branches returned
+ * in PERF_SAMPLE_BRANCH_STACK sample may vary.
+ */
+static void
+intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
+{
+ u64 from, to;
+ int br_sel = cpuc->br_sel;
+ int i, j, type;
+ bool compress = false;
+
+ /* if sampling all branches, then nothing to filter */
+ if (((br_sel & X86_BR_ALL) == X86_BR_ALL) &&
+ ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE))
+ return;
+
+ for (i = 0; i < cpuc->lbr_stack.nr; i++) {
+
+ from = cpuc->lbr_entries[i].from;
+ to = cpuc->lbr_entries[i].to;
+
+ type = branch_type(from, to, cpuc->lbr_entries[i].abort);
+ if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) {
+ if (cpuc->lbr_entries[i].in_tx)
+ type |= X86_BR_IN_TX;
+ else
+ type |= X86_BR_NO_TX;
+ }
+
+ /* if type does not correspond, then discard */
+ if (type == X86_BR_NONE || (br_sel & type) != type) {
+ cpuc->lbr_entries[i].from = 0;
+ compress = true;
+ }
+
+ if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE)
+ cpuc->lbr_entries[i].type = common_branch_type(type);
+ }
+
+ if (!compress)
+ return;
+
+ /* remove all entries with from=0 */
+ for (i = 0; i < cpuc->lbr_stack.nr; ) {
+ if (!cpuc->lbr_entries[i].from) {
+ j = i;
+ while (++j < cpuc->lbr_stack.nr)
+ cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
+ cpuc->lbr_stack.nr--;
+ if (!cpuc->lbr_entries[i].from)
+ continue;
+ }
+ i++;
+ }
+}
+
+/*
+ * Map interface branch filters onto LBR filters
+ */
+static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP
+ | LBR_IND_JMP | LBR_FAR,
+ /*
+ * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
+ */
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
+ LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
+ /*
+ * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
+ */
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
+ [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
+};
+
+static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
+ [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL,
+};
+
+static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
+ [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_RETURN | LBR_CALL_STACK,
+ [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL,
+};
+
+/* core */
+void __init intel_pmu_lbr_init_core(void)
+{
+ x86_pmu.lbr_nr = 4;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
+ x86_pmu.lbr_to = MSR_LBR_CORE_TO;
+
+ /*
+ * SW branch filter usage:
+ * - compensate for lack of HW filter
+ */
+}
+
+/* nehalem/westmere */
+void __init intel_pmu_lbr_init_nhm(void)
+{
+ x86_pmu.lbr_nr = 16;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = nhm_lbr_sel_map;
+
+ /*
+ * SW branch filter usage:
+ * - workaround LBR_SEL errata (see above)
+ * - support syscall, sysret capture.
+ * That requires LBR_FAR but that means far
+ * jmp need to be filtered out
+ */
+}
+
+/* sandy bridge */
+void __init intel_pmu_lbr_init_snb(void)
+{
+ x86_pmu.lbr_nr = 16;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = snb_lbr_sel_map;
+
+ /*
+ * SW branch filter usage:
+ * - support syscall, sysret capture.
+ * That requires LBR_FAR but that means far
+ * jmp need to be filtered out
+ */
+}
+
+/* haswell */
+void intel_pmu_lbr_init_hsw(void)
+{
+ x86_pmu.lbr_nr = 16;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
+
+ if (lbr_from_signext_quirk_needed())
+ static_branch_enable(&lbr_from_quirk_key);
+}
+
+/* skylake */
+__init void intel_pmu_lbr_init_skl(void)
+{
+ x86_pmu.lbr_nr = 32;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
+
+ /*
+ * SW branch filter usage:
+ * - support syscall, sysret capture.
+ * That requires LBR_FAR but that means far
+ * jmp need to be filtered out
+ */
+}
+
+/* atom */
+void __init intel_pmu_lbr_init_atom(void)
+{
+ /*
+ * only models starting at stepping 10 seems
+ * to have an operational LBR which can freeze
+ * on PMU interrupt
+ */
+ if (boot_cpu_data.x86_model == 28
+ && boot_cpu_data.x86_stepping < 10) {
+ pr_cont("LBR disabled due to erratum");
+ return;
+ }
+
+ x86_pmu.lbr_nr = 8;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
+ x86_pmu.lbr_to = MSR_LBR_CORE_TO;
+
+ /*
+ * SW branch filter usage:
+ * - compensate for lack of HW filter
+ */
+}
+
+/* slm */
+void __init intel_pmu_lbr_init_slm(void)
+{
+ x86_pmu.lbr_nr = 8;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
+ x86_pmu.lbr_to = MSR_LBR_CORE_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = nhm_lbr_sel_map;
+
+ /*
+ * SW branch filter usage:
+ * - compensate for lack of HW filter
+ */
+ pr_cont("8-deep LBR, ");
+}
+
+/* Knights Landing */
+void intel_pmu_lbr_init_knl(void)
+{
+ x86_pmu.lbr_nr = 8;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = snb_lbr_sel_map;
+
+ /* Knights Landing does have MISPREDICT bit */
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_LIP)
+ x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS;
+}
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
new file mode 100644
index 000000000..d32c0eed3
--- /dev/null
+++ b/arch/x86/events/intel/p4.c
@@ -0,0 +1,1376 @@
+/*
+ * Netburst Performance Events (P4, old Xeon)
+ *
+ * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
+ * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+
+#include <asm/perf_event_p4.h>
+#include <asm/hardirq.h>
+#include <asm/apic.h>
+
+#include "../perf_event.h"
+
+#define P4_CNTR_LIMIT 3
+/*
+ * array indices: 0,1 - HT threads, used with HT enabled cpu
+ */
+struct p4_event_bind {
+ unsigned int opcode; /* Event code and ESCR selector */
+ unsigned int escr_msr[2]; /* ESCR MSR for this event */
+ unsigned int escr_emask; /* valid ESCR EventMask bits */
+ unsigned int shared; /* event is shared across threads */
+ char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */
+};
+
+struct p4_pebs_bind {
+ unsigned int metric_pebs;
+ unsigned int metric_vert;
+};
+
+/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
+#define P4_GEN_PEBS_BIND(name, pebs, vert) \
+ [P4_PEBS_METRIC__##name] = { \
+ .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \
+ .metric_vert = vert, \
+ }
+
+/*
+ * note we have P4_PEBS_ENABLE_UOP_TAG always set here
+ *
+ * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of
+ * event configuration to find out which values are to be
+ * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
+ * resgisters
+ */
+static struct p4_pebs_bind p4_pebs_bind_map[] = {
+ P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001),
+ P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001),
+ P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001),
+ P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002),
+ P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003),
+ P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010),
+ P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001),
+ P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001),
+ P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002),
+};
+
+/*
+ * Note that we don't use CCCR1 here, there is an
+ * exception for P4_BSQ_ALLOCATION but we just have
+ * no workaround
+ *
+ * consider this binding as resources which particular
+ * event may borrow, it doesn't contain EventMask,
+ * Tags and friends -- they are left to a caller
+ */
+static struct p4_event_bind p4_event_bind_map[] = {
+ [P4_EVENT_TC_DELIVER_MODE] = {
+ .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
+ .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID),
+ .shared = 1,
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_BPU_FETCH_REQUEST] = {
+ .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
+ .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS),
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_ITLB_REFERENCE] = {
+ .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
+ .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK),
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_MEMORY_CANCEL] = {
+ .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
+ .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF),
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_MEMORY_COMPLETE] = {
+ .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
+ .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC),
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_LOAD_PORT_REPLAY] = {
+ .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
+ .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD),
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_STORE_PORT_REPLAY] = {
+ .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
+ .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST),
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_MOB_LOAD_REPLAY] = {
+ .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
+ .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR),
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_PAGE_WALK_TYPE] = {
+ .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
+ .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS),
+ .shared = 1,
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_BSQ_CACHE_REFERENCE] = {
+ .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
+ .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_IOQ_ALLOCATION] = {
+ .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH),
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */
+ .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
+ .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH),
+ .cntr = { {2, -1, -1}, {3, -1, -1} },
+ },
+ [P4_EVENT_FSB_DATA_ACTIVITY] = {
+ .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER),
+ .shared = 1,
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */
+ .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
+ .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2),
+ .cntr = { {0, -1, -1}, {1, -1, -1} },
+ },
+ [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */
+ .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
+ .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2),
+ .cntr = { {2, -1, -1}, {3, -1, -1} },
+ },
+ [P4_EVENT_SSE_INPUT_ASSIST] = {
+ .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL),
+ .shared = 1,
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_PACKED_SP_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL),
+ .shared = 1,
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_PACKED_DP_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL),
+ .shared = 1,
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_SCALAR_SP_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL),
+ .shared = 1,
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_SCALAR_DP_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL),
+ .shared = 1,
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_64BIT_MMX_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL),
+ .shared = 1,
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_128BIT_MMX_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL),
+ .shared = 1,
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_X87_FP_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL),
+ .shared = 1,
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_TC_MISC] = {
+ .opcode = P4_OPCODE(P4_EVENT_TC_MISC),
+ .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH),
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_GLOBAL_POWER_EVENTS] = {
+ .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING),
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_TC_MS_XFER] = {
+ .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER),
+ .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC),
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_UOP_QUEUE_WRITES] = {
+ .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
+ .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM),
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
+ .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
+ .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT),
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_RETIRED_BRANCH_TYPE] = {
+ .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
+ .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT),
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_RESOURCE_STALL] = {
+ .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
+ .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_WC_BUFFER] = {
+ .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER),
+ .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS),
+ .shared = 1,
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_B2B_CYCLES] = {
+ .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask = 0,
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_BNR] = {
+ .opcode = P4_OPCODE(P4_EVENT_BNR),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask = 0,
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_SNOOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_SNOOP),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask = 0,
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_RESPONSE] = {
+ .opcode = P4_OPCODE(P4_EVENT_RESPONSE),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .escr_emask = 0,
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_FRONT_END_EVENT] = {
+ .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_EXECUTION_EVENT] = {
+ .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_REPLAY_EVENT] = {
+ .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_INSTR_RETIRED] = {
+ .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
+ .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_UOPS_RETIRED] = {
+ .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
+ .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_UOP_TYPE] = {
+ .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE),
+ .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_BRANCH_RETIRED] = {
+ .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
+ .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
+ .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_X87_ASSIST] = {
+ .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_MACHINE_CLEAR] = {
+ .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_INSTR_COMPLETED] = {
+ .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
+ .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .escr_emask =
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS),
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+};
+
+#define P4_GEN_CACHE_EVENT(event, bit, metric) \
+ p4_config_pack_escr(P4_ESCR_EVENT(event) | \
+ P4_ESCR_EMASK_BIT(event, bit)) | \
+ p4_config_pack_cccr(metric | \
+ P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
+
+static __initconst const u64 p4_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+ P4_PEBS_METRIC__1stl_cache_load_miss_retired),
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+ P4_PEBS_METRIC__2ndl_cache_load_miss_retired),
+ },
+},
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+ P4_PEBS_METRIC__dtlb_load_miss_retired),
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+ P4_PEBS_METRIC__dtlb_store_miss_retired),
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
+ P4_PEBS_METRIC__none),
+ [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
+ P4_PEBS_METRIC__none),
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+/*
+ * Because of Netburst being quite restricted in how many
+ * identical events may run simultaneously, we introduce event aliases,
+ * ie the different events which have the same functionality but
+ * utilize non-intersected resources (ESCR/CCCR/counter registers).
+ *
+ * This allow us to relax restrictions a bit and run two or more
+ * identical events together.
+ *
+ * Never set any custom internal bits such as P4_CONFIG_HT,
+ * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are
+ * either up to date automatically or not applicable at all.
+ */
+static struct p4_event_alias {
+ u64 original;
+ u64 alternative;
+} p4_event_aliases[] = {
+ {
+ /*
+ * Non-halted cycles can be substituted with non-sleeping cycles (see
+ * Intel SDM Vol3b for details). We need this alias to be able
+ * to run nmi-watchdog and 'perf top' (or any other user space tool
+ * which is interested in running PERF_COUNT_HW_CPU_CYCLES)
+ * simultaneously.
+ */
+ .original =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+ .alternative =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)|
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))|
+ p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
+ P4_CCCR_COMPARE),
+ },
+};
+
+static u64 p4_get_alias_event(u64 config)
+{
+ u64 config_match;
+ int i;
+
+ /*
+ * Only event with special mark is allowed,
+ * we're to be sure it didn't come as malformed
+ * RAW event.
+ */
+ if (!(config & P4_CONFIG_ALIASABLE))
+ return 0;
+
+ config_match = config & P4_CONFIG_EVENT_ALIAS_MASK;
+
+ for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) {
+ if (config_match == p4_event_aliases[i].original) {
+ config_match = p4_event_aliases[i].alternative;
+ break;
+ } else if (config_match == p4_event_aliases[i].alternative) {
+ config_match = p4_event_aliases[i].original;
+ break;
+ }
+ }
+
+ if (i >= ARRAY_SIZE(p4_event_aliases))
+ return 0;
+
+ return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS);
+}
+
+static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
+ /* non-halted CPU clocks */
+ [PERF_COUNT_HW_CPU_CYCLES] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) |
+ P4_CONFIG_ALIASABLE,
+
+ /*
+ * retired instructions
+ * in a sake of simplicity we don't use the FSB tagging
+ */
+ [PERF_COUNT_HW_INSTRUCTIONS] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
+
+ /* cache hits */
+ [PERF_COUNT_HW_CACHE_REFERENCES] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
+
+ /* cache misses */
+ [PERF_COUNT_HW_CACHE_MISSES] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
+
+ /* branch instructions retired */
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
+
+ /* mispredicted branches retired */
+ [PERF_COUNT_HW_BRANCH_MISSES] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
+
+ /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN): */
+ [PERF_COUNT_HW_BUS_CYCLES] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) |
+ p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
+};
+
+static struct p4_event_bind *p4_config_get_bind(u64 config)
+{
+ unsigned int evnt = p4_config_unpack_event(config);
+ struct p4_event_bind *bind = NULL;
+
+ if (evnt < ARRAY_SIZE(p4_event_bind_map))
+ bind = &p4_event_bind_map[evnt];
+
+ return bind;
+}
+
+static u64 p4_pmu_event_map(int hw_event)
+{
+ struct p4_event_bind *bind;
+ unsigned int esel;
+ u64 config;
+
+ config = p4_general_events[hw_event];
+ bind = p4_config_get_bind(config);
+ esel = P4_OPCODE_ESEL(bind->opcode);
+ config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
+
+ return config;
+}
+
+/* check cpu model specifics */
+static bool p4_event_match_cpu_model(unsigned int event_idx)
+{
+ /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */
+ if (event_idx == P4_EVENT_INSTR_COMPLETED) {
+ if (boot_cpu_data.x86_model != 3 &&
+ boot_cpu_data.x86_model != 4 &&
+ boot_cpu_data.x86_model != 6)
+ return false;
+ }
+
+ /*
+ * For info
+ * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2
+ */
+
+ return true;
+}
+
+static int p4_validate_raw_event(struct perf_event *event)
+{
+ unsigned int v, emask;
+
+ /* User data may have out-of-bound event index */
+ v = p4_config_unpack_event(event->attr.config);
+ if (v >= ARRAY_SIZE(p4_event_bind_map))
+ return -EINVAL;
+
+ /* It may be unsupported: */
+ if (!p4_event_match_cpu_model(v))
+ return -EINVAL;
+
+ /*
+ * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as
+ * in Architectural Performance Monitoring, it means not
+ * on _which_ logical cpu to count but rather _when_, ie it
+ * depends on logical cpu state -- count event if one cpu active,
+ * none, both or any, so we just allow user to pass any value
+ * desired.
+ *
+ * In turn we always set Tx_OS/Tx_USR bits bound to logical
+ * cpu without their propagation to another cpu
+ */
+
+ /*
+ * if an event is shared across the logical threads
+ * the user needs special permissions to be able to use it
+ */
+ if (p4_ht_active() && p4_event_bind_map[v].shared) {
+ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ }
+
+ /* ESCR EventMask bits may be invalid */
+ emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK;
+ if (emask & ~p4_event_bind_map[v].escr_emask)
+ return -EINVAL;
+
+ /*
+ * it may have some invalid PEBS bits
+ */
+ if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE))
+ return -EINVAL;
+
+ v = p4_config_unpack_metric(event->attr.config);
+ if (v >= ARRAY_SIZE(p4_pebs_bind_map))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int p4_hw_config(struct perf_event *event)
+{
+ int cpu = get_cpu();
+ int rc = 0;
+ u32 escr, cccr;
+
+ /*
+ * the reason we use cpu that early is that: if we get scheduled
+ * first time on the same cpu -- we will not need swap thread
+ * specific flags in config (and will save some cpu cycles)
+ */
+
+ cccr = p4_default_cccr_conf(cpu);
+ escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
+ event->attr.exclude_user);
+ event->hw.config = p4_config_pack_escr(escr) |
+ p4_config_pack_cccr(cccr);
+
+ if (p4_ht_active() && p4_ht_thread(cpu))
+ event->hw.config = p4_set_ht_bit(event->hw.config);
+
+ if (event->attr.type == PERF_TYPE_RAW) {
+ struct p4_event_bind *bind;
+ unsigned int esel;
+ /*
+ * Clear bits we reserve to be managed by kernel itself
+ * and never allowed from a user space
+ */
+ event->attr.config &= P4_CONFIG_MASK;
+
+ rc = p4_validate_raw_event(event);
+ if (rc)
+ goto out;
+
+ /*
+ * Note that for RAW events we allow user to use P4_CCCR_RESERVED
+ * bits since we keep additional info here (for cache events and etc)
+ */
+ event->hw.config |= event->attr.config;
+ bind = p4_config_get_bind(event->attr.config);
+ if (!bind) {
+ rc = -EINVAL;
+ goto out;
+ }
+ esel = P4_OPCODE_ESEL(bind->opcode);
+ event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
+ }
+
+ rc = x86_setup_perfctr(event);
+out:
+ put_cpu();
+ return rc;
+}
+
+static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
+{
+ u64 v;
+
+ /* an official way for overflow indication */
+ rdmsrl(hwc->config_base, v);
+ if (v & P4_CCCR_OVF) {
+ wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF);
+ return 1;
+ }
+
+ /*
+ * In some circumstances the overflow might issue an NMI but did
+ * not set P4_CCCR_OVF bit. Because a counter holds a negative value
+ * we simply check for high bit being set, if it's cleared it means
+ * the counter has reached zero value and continued counting before
+ * real NMI signal was received:
+ */
+ rdmsrl(hwc->event_base, v);
+ if (!(v & ARCH_P4_UNFLAGGED_BIT))
+ return 1;
+
+ return 0;
+}
+
+static void p4_pmu_disable_pebs(void)
+{
+ /*
+ * FIXME
+ *
+ * It's still allowed that two threads setup same cache
+ * events so we can't simply clear metrics until we knew
+ * no one is depending on us, so we need kind of counter
+ * for "ReplayEvent" users.
+ *
+ * What is more complex -- RAW events, if user (for some
+ * reason) will pass some cache event metric with improper
+ * event opcode -- it's fine from hardware point of view
+ * but completely nonsense from "meaning" of such action.
+ *
+ * So at moment let leave metrics turned on forever -- it's
+ * ok for now but need to be revisited!
+ *
+ * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, 0);
+ * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, 0);
+ */
+}
+
+static inline void p4_pmu_disable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ /*
+ * If event gets disabled while counter is in overflowed
+ * state we need to clear P4_CCCR_OVF, otherwise interrupt get
+ * asserted again and again
+ */
+ (void)wrmsrl_safe(hwc->config_base,
+ p4_config_unpack_cccr(hwc->config) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
+}
+
+static void p4_pmu_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int idx;
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ struct perf_event *event = cpuc->events[idx];
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+ p4_pmu_disable_event(event);
+ }
+
+ p4_pmu_disable_pebs();
+}
+
+/* configuration must be valid */
+static void p4_pmu_enable_pebs(u64 config)
+{
+ struct p4_pebs_bind *bind;
+ unsigned int idx;
+
+ BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK);
+
+ idx = p4_config_unpack_metric(config);
+ if (idx == P4_PEBS_METRIC__none)
+ return;
+
+ bind = &p4_pebs_bind_map[idx];
+
+ (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
+ (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
+}
+
+static void p4_pmu_enable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int thread = p4_ht_config_thread(hwc->config);
+ u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
+ unsigned int idx = p4_config_unpack_event(hwc->config);
+ struct p4_event_bind *bind;
+ u64 escr_addr, cccr;
+
+ bind = &p4_event_bind_map[idx];
+ escr_addr = bind->escr_msr[thread];
+
+ /*
+ * - we dont support cascaded counters yet
+ * - and counter 1 is broken (erratum)
+ */
+ WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
+ WARN_ON_ONCE(hwc->idx == 1);
+
+ /* we need a real Event value */
+ escr_conf &= ~P4_ESCR_EVENT_MASK;
+ escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
+
+ cccr = p4_config_unpack_cccr(hwc->config);
+
+ /*
+ * it could be Cache event so we need to write metrics
+ * into additional MSRs
+ */
+ p4_pmu_enable_pebs(hwc->config);
+
+ (void)wrmsrl_safe(escr_addr, escr_conf);
+ (void)wrmsrl_safe(hwc->config_base,
+ (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
+}
+
+static void p4_pmu_enable_all(int added)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int idx;
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ struct perf_event *event = cpuc->events[idx];
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+ p4_pmu_enable_event(event);
+ }
+}
+
+static int p4_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_events *cpuc;
+ struct perf_event *event;
+ struct hw_perf_event *hwc;
+ int idx, handled = 0;
+ u64 val;
+
+ cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ int overflow;
+
+ if (!test_bit(idx, cpuc->active_mask)) {
+ /* catch in-flight IRQs */
+ if (__test_and_clear_bit(idx, cpuc->running))
+ handled++;
+ continue;
+ }
+
+ event = cpuc->events[idx];
+ hwc = &event->hw;
+
+ WARN_ON_ONCE(hwc->idx != idx);
+
+ /* it might be unflagged overflow */
+ overflow = p4_pmu_clear_cccr_ovf(hwc);
+
+ val = x86_perf_event_update(event);
+ if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
+ continue;
+
+ handled += overflow;
+
+ /* event overflow for sure */
+ perf_sample_data_init(&data, 0, hwc->last_period);
+
+ if (!x86_perf_event_set_period(event))
+ continue;
+
+
+ if (perf_event_overflow(event, &data, regs))
+ x86_pmu_stop(event, 0);
+ }
+
+ if (handled)
+ inc_irq_stat(apic_perf_irqs);
+
+ /*
+ * When dealing with the unmasking of the LVTPC on P4 perf hw, it has
+ * been observed that the OVF bit flag has to be cleared first _before_
+ * the LVTPC can be unmasked.
+ *
+ * The reason is the NMI line will continue to be asserted while the OVF
+ * bit is set. This causes a second NMI to generate if the LVTPC is
+ * unmasked before the OVF bit is cleared, leading to unknown NMI
+ * messages.
+ */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+ return handled;
+}
+
+/*
+ * swap thread specific fields according to a thread
+ * we are going to run on
+ */
+static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
+{
+ u32 escr, cccr;
+
+ /*
+ * we either lucky and continue on same cpu or no HT support
+ */
+ if (!p4_should_swap_ts(hwc->config, cpu))
+ return;
+
+ /*
+ * the event is migrated from an another logical
+ * cpu, so we need to swap thread specific flags
+ */
+
+ escr = p4_config_unpack_escr(hwc->config);
+ cccr = p4_config_unpack_cccr(hwc->config);
+
+ if (p4_ht_thread(cpu)) {
+ cccr &= ~P4_CCCR_OVF_PMI_T0;
+ cccr |= P4_CCCR_OVF_PMI_T1;
+ if (escr & P4_ESCR_T0_OS) {
+ escr &= ~P4_ESCR_T0_OS;
+ escr |= P4_ESCR_T1_OS;
+ }
+ if (escr & P4_ESCR_T0_USR) {
+ escr &= ~P4_ESCR_T0_USR;
+ escr |= P4_ESCR_T1_USR;
+ }
+ hwc->config = p4_config_pack_escr(escr);
+ hwc->config |= p4_config_pack_cccr(cccr);
+ hwc->config |= P4_CONFIG_HT;
+ } else {
+ cccr &= ~P4_CCCR_OVF_PMI_T1;
+ cccr |= P4_CCCR_OVF_PMI_T0;
+ if (escr & P4_ESCR_T1_OS) {
+ escr &= ~P4_ESCR_T1_OS;
+ escr |= P4_ESCR_T0_OS;
+ }
+ if (escr & P4_ESCR_T1_USR) {
+ escr &= ~P4_ESCR_T1_USR;
+ escr |= P4_ESCR_T0_USR;
+ }
+ hwc->config = p4_config_pack_escr(escr);
+ hwc->config |= p4_config_pack_cccr(cccr);
+ hwc->config &= ~P4_CONFIG_HT;
+ }
+}
+
+/*
+ * ESCR address hashing is tricky, ESCRs are not sequential
+ * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and
+ * the metric between any ESCRs is laid in range [0xa0,0xe1]
+ *
+ * so we make ~70% filled hashtable
+ */
+
+#define P4_ESCR_MSR_BASE 0x000003a0
+#define P4_ESCR_MSR_MAX 0x000003e1
+#define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
+#define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE)
+#define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr
+
+static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
+};
+
+static int p4_get_escr_idx(unsigned int addr)
+{
+ unsigned int idx = P4_ESCR_MSR_IDX(addr);
+
+ if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
+ !p4_escr_table[idx] ||
+ p4_escr_table[idx] != addr)) {
+ WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
+ return -1;
+ }
+
+ return idx;
+}
+
+static int p4_next_cntr(int thread, unsigned long *used_mask,
+ struct p4_event_bind *bind)
+{
+ int i, j;
+
+ for (i = 0; i < P4_CNTR_LIMIT; i++) {
+ j = bind->cntr[thread][i];
+ if (j != -1 && !test_bit(j, used_mask))
+ return j;
+ }
+
+ return -1;
+}
+
+static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+{
+ unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
+ int cpu = smp_processor_id();
+ struct hw_perf_event *hwc;
+ struct p4_event_bind *bind;
+ unsigned int i, thread, num;
+ int cntr_idx, escr_idx;
+ u64 config_alias;
+ int pass;
+
+ bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+ bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
+
+ for (i = 0, num = n; i < n; i++, num--) {
+
+ hwc = &cpuc->event_list[i]->hw;
+ thread = p4_ht_thread(cpu);
+ pass = 0;
+
+again:
+ /*
+ * It's possible to hit a circular lock
+ * between original and alternative events
+ * if both are scheduled already.
+ */
+ if (pass > 2)
+ goto done;
+
+ bind = p4_config_get_bind(hwc->config);
+ escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
+ if (unlikely(escr_idx == -1))
+ goto done;
+
+ if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
+ cntr_idx = hwc->idx;
+ if (assign)
+ assign[i] = hwc->idx;
+ goto reserve;
+ }
+
+ cntr_idx = p4_next_cntr(thread, used_mask, bind);
+ if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) {
+ /*
+ * Check whether an event alias is still available.
+ */
+ config_alias = p4_get_alias_event(hwc->config);
+ if (!config_alias)
+ goto done;
+ hwc->config = config_alias;
+ pass++;
+ goto again;
+ }
+ /*
+ * Perf does test runs to see if a whole group can be assigned
+ * together succesfully. There can be multiple rounds of this.
+ * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config
+ * bits, such that the next round of group assignments will
+ * cause the above p4_should_swap_ts to pass instead of fail.
+ * This leads to counters exclusive to thread0 being used by
+ * thread1.
+ *
+ * Solve this with a cheap hack, reset the idx back to -1 to
+ * force a new lookup (p4_next_cntr) to get the right counter
+ * for the right thread.
+ *
+ * This probably doesn't comply with the general spirit of how
+ * perf wants to work, but P4 is special. :-(
+ */
+ if (p4_should_swap_ts(hwc->config, cpu))
+ hwc->idx = -1;
+ p4_pmu_swap_config_ts(hwc, cpu);
+ if (assign)
+ assign[i] = cntr_idx;
+reserve:
+ set_bit(cntr_idx, used_mask);
+ set_bit(escr_idx, escr_mask);
+ }
+
+done:
+ return num ? -EINVAL : 0;
+}
+
+PMU_FORMAT_ATTR(cccr, "config:0-31" );
+PMU_FORMAT_ATTR(escr, "config:32-62");
+PMU_FORMAT_ATTR(ht, "config:63" );
+
+static struct attribute *intel_p4_formats_attr[] = {
+ &format_attr_cccr.attr,
+ &format_attr_escr.attr,
+ &format_attr_ht.attr,
+ NULL,
+};
+
+static __initconst const struct x86_pmu p4_pmu = {
+ .name = "Netburst P4/Xeon",
+ .handle_irq = p4_pmu_handle_irq,
+ .disable_all = p4_pmu_disable_all,
+ .enable_all = p4_pmu_enable_all,
+ .enable = p4_pmu_enable_event,
+ .disable = p4_pmu_disable_event,
+ .eventsel = MSR_P4_BPU_CCCR0,
+ .perfctr = MSR_P4_BPU_PERFCTR0,
+ .event_map = p4_pmu_event_map,
+ .max_events = ARRAY_SIZE(p4_general_events),
+ .get_event_constraints = x86_get_event_constraints,
+ /*
+ * IF HT disabled we may need to use all
+ * ARCH_P4_MAX_CCCR counters simulaneously
+ * though leave it restricted at moment assuming
+ * HT is on
+ */
+ .num_counters = ARCH_P4_MAX_CCCR,
+ .apic = 1,
+ .cntval_bits = ARCH_P4_CNTRVAL_BITS,
+ .cntval_mask = ARCH_P4_CNTRVAL_MASK,
+ .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
+ .hw_config = p4_hw_config,
+ .schedule_events = p4_pmu_schedule_events,
+ /*
+ * This handles erratum N15 in intel doc 249199-029,
+ * the counter may not be updated correctly on write
+ * so we need a second write operation to do the trick
+ * (the official workaround didn't work)
+ *
+ * the former idea is taken from OProfile code
+ */
+ .perfctr_second_write = 1,
+
+ .format_attrs = intel_p4_formats_attr,
+};
+
+__init int p4_pmu_init(void)
+{
+ unsigned int low, high;
+ int i, reg;
+
+ /* If we get stripped -- indexing fails */
+ BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC);
+
+ rdmsr(MSR_IA32_MISC_ENABLE, low, high);
+ if (!(low & (1 << 7))) {
+ pr_cont("unsupported Netburst CPU model %d ",
+ boot_cpu_data.x86_model);
+ return -ENODEV;
+ }
+
+ memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ pr_cont("Netburst events, ");
+
+ x86_pmu = p4_pmu;
+
+ /*
+ * Even though the counters are configured to interrupt a particular
+ * logical processor when an overflow happens, testing has shown that
+ * on kdump kernels (which uses a single cpu), thread1's counter
+ * continues to run and will report an NMI on thread0. Due to the
+ * overflow bug, this leads to a stream of unknown NMIs.
+ *
+ * Solve this by zero'ing out the registers to mimic a reset.
+ */
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ reg = x86_pmu_config_addr(i);
+ wrmsrl_safe(reg, 0ULL);
+ }
+
+ return 0;
+}
diff --git a/arch/x86/events/intel/p6.c b/arch/x86/events/intel/p6.c
new file mode 100644
index 000000000..408879b0c
--- /dev/null
+++ b/arch/x86/events/intel/p6.c
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include "../perf_event.h"
+
+/*
+ * Not sure about some of these
+ */
+static const u64 p6_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, /* CPU_CLK_UNHALTED */
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, /* INST_RETIRED */
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, /* L2_RQSTS:M:E:S:I */
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, /* L2_RQSTS:I */
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, /* BR_INST_RETIRED */
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, /* BR_MISS_PRED_RETIRED */
+ [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, /* BUS_DRDY_CLOCKS */
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2, /* RESOURCE_STALLS */
+
+};
+
+static const u64 __initconst p6_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */
+ [ C(RESULT_MISS) ] = 0x0045, /* DCU_LINES_IN */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0x0f29, /* L2_LD:M:E:S:I */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */
+ [ C(RESULT_MISS) ] = 0x0f28, /* L2_IFETCH:M:E:S:I */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0x0025, /* L2_M_LINES_INM */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */
+ [ C(RESULT_MISS) ] = 0x0085, /* ITLB_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISS_PRED_RETIRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static u64 p6_pmu_event_map(int hw_event)
+{
+ return p6_perfmon_event_map[hw_event];
+}
+
+/*
+ * Event setting that is specified not to count anything.
+ * We use this to effectively disable a counter.
+ *
+ * L2_RQSTS with 0 MESI unit mask.
+ */
+#define P6_NOP_EVENT 0x0000002EULL
+
+static struct event_constraint p6_event_constraints[] =
+{
+ INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
+ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
+ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
+ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
+ INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
+ INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
+ EVENT_CONSTRAINT_END
+};
+
+static void p6_pmu_disable_all(void)
+{
+ u64 val;
+
+ /* p6 only has one enable register */
+ rdmsrl(MSR_P6_EVNTSEL0, val);
+ val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+ wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+static void p6_pmu_enable_all(int added)
+{
+ unsigned long val;
+
+ /* p6 only has one enable register */
+ rdmsrl(MSR_P6_EVNTSEL0, val);
+ val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+ wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+static inline void
+p6_pmu_disable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 val = P6_NOP_EVENT;
+
+ (void)wrmsrl_safe(hwc->config_base, val);
+}
+
+static void p6_pmu_enable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 val;
+
+ val = hwc->config;
+
+ /*
+ * p6 only has a global event enable, set on PerfEvtSel0
+ * We "disable" events by programming P6_NOP_EVENT
+ * and we rely on p6_pmu_enable_all() being called
+ * to actually enable the events.
+ */
+
+ (void)wrmsrl_safe(hwc->config_base, val);
+}
+
+PMU_FORMAT_ATTR(event, "config:0-7" );
+PMU_FORMAT_ATTR(umask, "config:8-15" );
+PMU_FORMAT_ATTR(edge, "config:18" );
+PMU_FORMAT_ATTR(pc, "config:19" );
+PMU_FORMAT_ATTR(inv, "config:23" );
+PMU_FORMAT_ATTR(cmask, "config:24-31" );
+
+static struct attribute *intel_p6_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_pc.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+ NULL,
+};
+
+static __initconst const struct x86_pmu p6_pmu = {
+ .name = "p6",
+ .handle_irq = x86_pmu_handle_irq,
+ .disable_all = p6_pmu_disable_all,
+ .enable_all = p6_pmu_enable_all,
+ .enable = p6_pmu_enable_event,
+ .disable = p6_pmu_disable_event,
+ .hw_config = x86_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
+ .eventsel = MSR_P6_EVNTSEL0,
+ .perfctr = MSR_P6_PERFCTR0,
+ .event_map = p6_pmu_event_map,
+ .max_events = ARRAY_SIZE(p6_perfmon_event_map),
+ .apic = 1,
+ .max_period = (1ULL << 31) - 1,
+ .version = 0,
+ .num_counters = 2,
+ /*
+ * Events have 40 bits implemented. However they are designed such
+ * that bits [32-39] are sign extensions of bit 31. As such the
+ * effective width of a event for P6-like PMU is 32 bits only.
+ *
+ * See IA-32 Intel Architecture Software developer manual Vol 3B
+ */
+ .cntval_bits = 32,
+ .cntval_mask = (1ULL << 32) - 1,
+ .get_event_constraints = x86_get_event_constraints,
+ .event_constraints = p6_event_constraints,
+
+ .format_attrs = intel_p6_formats_attr,
+ .events_sysfs_show = intel_event_sysfs_show,
+
+};
+
+static __init void p6_pmu_rdpmc_quirk(void)
+{
+ if (boot_cpu_data.x86_stepping < 9) {
+ /*
+ * PPro erratum 26; fixed in stepping 9 and above.
+ */
+ pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n");
+ x86_pmu.attr_rdpmc_broken = 1;
+ x86_pmu.attr_rdpmc = 0;
+ }
+}
+
+__init int p6_pmu_init(void)
+{
+ x86_pmu = p6_pmu;
+
+ switch (boot_cpu_data.x86_model) {
+ case 1: /* Pentium Pro */
+ x86_add_quirk(p6_pmu_rdpmc_quirk);
+ break;
+
+ case 3: /* Pentium II - Klamath */
+ case 5: /* Pentium II - Deschutes */
+ case 6: /* Pentium II - Mendocino */
+ break;
+
+ case 7: /* Pentium III - Katmai */
+ case 8: /* Pentium III - Coppermine */
+ case 10: /* Pentium III Xeon */
+ case 11: /* Pentium III - Tualatin */
+ break;
+
+ case 9: /* Pentium M - Banias */
+ case 13: /* Pentium M - Dothan */
+ break;
+
+ default:
+ pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model);
+ return -ENODEV;
+ }
+
+ memcpy(hw_cache_event_ids, p6_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ return 0;
+}
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
new file mode 100644
index 000000000..49b3ea1c1
--- /dev/null
+++ b/arch/x86/events/intel/pt.c
@@ -0,0 +1,1541 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+
+#include <asm/perf_event.h>
+#include <asm/insn.h>
+#include <asm/io.h>
+#include <asm/intel_pt.h>
+#include <asm/intel-family.h>
+
+#include "../perf_event.h"
+#include "pt.h"
+
+static DEFINE_PER_CPU(struct pt, pt_ctx);
+
+static struct pt_pmu pt_pmu;
+
+/*
+ * Capabilities of Intel PT hardware, such as number of address bits or
+ * supported output schemes, are cached and exported to userspace as "caps"
+ * attribute group of pt pmu device
+ * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
+ * relevant bits together with intel_pt traces.
+ *
+ * These are necessary for both trace decoding (payloads_lip, contains address
+ * width encoded in IP-related packets), and event configuration (bitmasks with
+ * permitted values for certain bit fields).
+ */
+#define PT_CAP(_n, _l, _r, _m) \
+ [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l, \
+ .reg = _r, .mask = _m }
+
+static struct pt_cap_desc {
+ const char *name;
+ u32 leaf;
+ u8 reg;
+ u32 mask;
+} pt_caps[] = {
+ PT_CAP(max_subleaf, 0, CPUID_EAX, 0xffffffff),
+ PT_CAP(cr3_filtering, 0, CPUID_EBX, BIT(0)),
+ PT_CAP(psb_cyc, 0, CPUID_EBX, BIT(1)),
+ PT_CAP(ip_filtering, 0, CPUID_EBX, BIT(2)),
+ PT_CAP(mtc, 0, CPUID_EBX, BIT(3)),
+ PT_CAP(ptwrite, 0, CPUID_EBX, BIT(4)),
+ PT_CAP(power_event_trace, 0, CPUID_EBX, BIT(5)),
+ PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)),
+ PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)),
+ PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)),
+ PT_CAP(payloads_lip, 0, CPUID_ECX, BIT(31)),
+ PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x7),
+ PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000),
+ PT_CAP(cycle_thresholds, 1, CPUID_EBX, 0xffff),
+ PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000),
+};
+
+static u32 pt_cap_get(enum pt_capabilities cap)
+{
+ struct pt_cap_desc *cd = &pt_caps[cap];
+ u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
+ unsigned int shift = __ffs(cd->mask);
+
+ return (c & cd->mask) >> shift;
+}
+
+static ssize_t pt_cap_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct dev_ext_attribute *ea =
+ container_of(attr, struct dev_ext_attribute, attr);
+ enum pt_capabilities cap = (long)ea->var;
+
+ return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
+}
+
+static struct attribute_group pt_cap_group = {
+ .name = "caps",
+};
+
+PMU_FORMAT_ATTR(pt, "config:0" );
+PMU_FORMAT_ATTR(cyc, "config:1" );
+PMU_FORMAT_ATTR(pwr_evt, "config:4" );
+PMU_FORMAT_ATTR(fup_on_ptw, "config:5" );
+PMU_FORMAT_ATTR(mtc, "config:9" );
+PMU_FORMAT_ATTR(tsc, "config:10" );
+PMU_FORMAT_ATTR(noretcomp, "config:11" );
+PMU_FORMAT_ATTR(ptw, "config:12" );
+PMU_FORMAT_ATTR(branch, "config:13" );
+PMU_FORMAT_ATTR(mtc_period, "config:14-17" );
+PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" );
+PMU_FORMAT_ATTR(psb_period, "config:24-27" );
+
+static struct attribute *pt_formats_attr[] = {
+ &format_attr_pt.attr,
+ &format_attr_cyc.attr,
+ &format_attr_pwr_evt.attr,
+ &format_attr_fup_on_ptw.attr,
+ &format_attr_mtc.attr,
+ &format_attr_tsc.attr,
+ &format_attr_noretcomp.attr,
+ &format_attr_ptw.attr,
+ &format_attr_branch.attr,
+ &format_attr_mtc_period.attr,
+ &format_attr_cyc_thresh.attr,
+ &format_attr_psb_period.attr,
+ NULL,
+};
+
+static struct attribute_group pt_format_group = {
+ .name = "format",
+ .attrs = pt_formats_attr,
+};
+
+static ssize_t
+pt_timing_attr_show(struct device *dev, struct device_attribute *attr,
+ char *page)
+{
+ struct perf_pmu_events_attr *pmu_attr =
+ container_of(attr, struct perf_pmu_events_attr, attr);
+
+ switch (pmu_attr->id) {
+ case 0:
+ return sprintf(page, "%lu\n", pt_pmu.max_nonturbo_ratio);
+ case 1:
+ return sprintf(page, "%u:%u\n",
+ pt_pmu.tsc_art_num,
+ pt_pmu.tsc_art_den);
+ default:
+ break;
+ }
+
+ return -EINVAL;
+}
+
+PMU_EVENT_ATTR(max_nonturbo_ratio, timing_attr_max_nonturbo_ratio, 0,
+ pt_timing_attr_show);
+PMU_EVENT_ATTR(tsc_art_ratio, timing_attr_tsc_art_ratio, 1,
+ pt_timing_attr_show);
+
+static struct attribute *pt_timing_attr[] = {
+ &timing_attr_max_nonturbo_ratio.attr.attr,
+ &timing_attr_tsc_art_ratio.attr.attr,
+ NULL,
+};
+
+static struct attribute_group pt_timing_group = {
+ .attrs = pt_timing_attr,
+};
+
+static const struct attribute_group *pt_attr_groups[] = {
+ &pt_cap_group,
+ &pt_format_group,
+ &pt_timing_group,
+ NULL,
+};
+
+static int __init pt_pmu_hw_init(void)
+{
+ struct dev_ext_attribute *de_attrs;
+ struct attribute **attrs;
+ size_t size;
+ u64 reg;
+ int ret;
+ long i;
+
+ rdmsrl(MSR_PLATFORM_INFO, reg);
+ pt_pmu.max_nonturbo_ratio = (reg & 0xff00) >> 8;
+
+ /*
+ * if available, read in TSC to core crystal clock ratio,
+ * otherwise, zero for numerator stands for "not enumerated"
+ * as per SDM
+ */
+ if (boot_cpu_data.cpuid_level >= CPUID_TSC_LEAF) {
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(CPUID_TSC_LEAF, &eax, &ebx, &ecx, &edx);
+
+ pt_pmu.tsc_art_num = ebx;
+ pt_pmu.tsc_art_den = eax;
+ }
+
+ /* model-specific quirks */
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_BROADWELL_CORE:
+ case INTEL_FAM6_BROADWELL_XEON_D:
+ case INTEL_FAM6_BROADWELL_GT3E:
+ case INTEL_FAM6_BROADWELL_X:
+ /* not setting BRANCH_EN will #GP, erratum BDM106 */
+ pt_pmu.branch_en_always_on = true;
+ break;
+ default:
+ break;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_VMX)) {
+ /*
+ * Intel SDM, 36.5 "Tracing post-VMXON" says that
+ * "IA32_VMX_MISC[bit 14]" being 1 means PT can trace
+ * post-VMXON.
+ */
+ rdmsrl(MSR_IA32_VMX_MISC, reg);
+ if (reg & BIT(14))
+ pt_pmu.vmx = true;
+ }
+
+ attrs = NULL;
+
+ for (i = 0; i < PT_CPUID_LEAVES; i++) {
+ cpuid_count(20, i,
+ &pt_pmu.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM],
+ &pt_pmu.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM],
+ &pt_pmu.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM],
+ &pt_pmu.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM]);
+ }
+
+ ret = -ENOMEM;
+ size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
+ attrs = kzalloc(size, GFP_KERNEL);
+ if (!attrs)
+ goto fail;
+
+ size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
+ de_attrs = kzalloc(size, GFP_KERNEL);
+ if (!de_attrs)
+ goto fail;
+
+ for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
+ struct dev_ext_attribute *de_attr = de_attrs + i;
+
+ de_attr->attr.attr.name = pt_caps[i].name;
+
+ sysfs_attr_init(&de_attr->attr.attr);
+
+ de_attr->attr.attr.mode = S_IRUGO;
+ de_attr->attr.show = pt_cap_show;
+ de_attr->var = (void *)i;
+
+ attrs[i] = &de_attr->attr.attr;
+ }
+
+ pt_cap_group.attrs = attrs;
+
+ return 0;
+
+fail:
+ kfree(attrs);
+
+ return ret;
+}
+
+#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC | \
+ RTIT_CTL_CYC_THRESH | \
+ RTIT_CTL_PSB_FREQ)
+
+#define RTIT_CTL_MTC (RTIT_CTL_MTC_EN | \
+ RTIT_CTL_MTC_RANGE)
+
+#define RTIT_CTL_PTW (RTIT_CTL_PTW_EN | \
+ RTIT_CTL_FUP_ON_PTW)
+
+/*
+ * Bit 0 (TraceEn) in the attr.config is meaningless as the
+ * corresponding bit in the RTIT_CTL can only be controlled
+ * by the driver; therefore, repurpose it to mean: pass
+ * through the bit that was previously assumed to be always
+ * on for PT, thereby allowing the user to *not* set it if
+ * they so wish. See also pt_event_valid() and pt_config().
+ */
+#define RTIT_CTL_PASSTHROUGH RTIT_CTL_TRACEEN
+
+#define PT_CONFIG_MASK (RTIT_CTL_TRACEEN | \
+ RTIT_CTL_TSC_EN | \
+ RTIT_CTL_DISRETC | \
+ RTIT_CTL_BRANCH_EN | \
+ RTIT_CTL_CYC_PSB | \
+ RTIT_CTL_MTC | \
+ RTIT_CTL_PWR_EVT_EN | \
+ RTIT_CTL_FUP_ON_PTW | \
+ RTIT_CTL_PTW_EN)
+
+static bool pt_event_valid(struct perf_event *event)
+{
+ u64 config = event->attr.config;
+ u64 allowed, requested;
+
+ if ((config & PT_CONFIG_MASK) != config)
+ return false;
+
+ if (config & RTIT_CTL_CYC_PSB) {
+ if (!pt_cap_get(PT_CAP_psb_cyc))
+ return false;
+
+ allowed = pt_cap_get(PT_CAP_psb_periods);
+ requested = (config & RTIT_CTL_PSB_FREQ) >>
+ RTIT_CTL_PSB_FREQ_OFFSET;
+ if (requested && (!(allowed & BIT(requested))))
+ return false;
+
+ allowed = pt_cap_get(PT_CAP_cycle_thresholds);
+ requested = (config & RTIT_CTL_CYC_THRESH) >>
+ RTIT_CTL_CYC_THRESH_OFFSET;
+ if (requested && (!(allowed & BIT(requested))))
+ return false;
+ }
+
+ if (config & RTIT_CTL_MTC) {
+ /*
+ * In the unlikely case that CPUID lists valid mtc periods,
+ * but not the mtc capability, drop out here.
+ *
+ * Spec says that setting mtc period bits while mtc bit in
+ * CPUID is 0 will #GP, so better safe than sorry.
+ */
+ if (!pt_cap_get(PT_CAP_mtc))
+ return false;
+
+ allowed = pt_cap_get(PT_CAP_mtc_periods);
+ if (!allowed)
+ return false;
+
+ requested = (config & RTIT_CTL_MTC_RANGE) >>
+ RTIT_CTL_MTC_RANGE_OFFSET;
+
+ if (!(allowed & BIT(requested)))
+ return false;
+ }
+
+ if (config & RTIT_CTL_PWR_EVT_EN &&
+ !pt_cap_get(PT_CAP_power_event_trace))
+ return false;
+
+ if (config & RTIT_CTL_PTW) {
+ if (!pt_cap_get(PT_CAP_ptwrite))
+ return false;
+
+ /* FUPonPTW without PTW doesn't make sense */
+ if ((config & RTIT_CTL_FUP_ON_PTW) &&
+ !(config & RTIT_CTL_PTW_EN))
+ return false;
+ }
+
+ /*
+ * Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config
+ * clears the assomption that BranchEn must always be enabled,
+ * as was the case with the first implementation of PT.
+ * If this bit is not set, the legacy behavior is preserved
+ * for compatibility with the older userspace.
+ *
+ * Re-using bit 0 for this purpose is fine because it is never
+ * directly set by the user; previous attempts at setting it in
+ * the attr.config resulted in -EINVAL.
+ */
+ if (config & RTIT_CTL_PASSTHROUGH) {
+ /*
+ * Disallow not setting BRANCH_EN where BRANCH_EN is
+ * always required.
+ */
+ if (pt_pmu.branch_en_always_on &&
+ !(config & RTIT_CTL_BRANCH_EN))
+ return false;
+ } else {
+ /*
+ * Disallow BRANCH_EN without the PASSTHROUGH.
+ */
+ if (config & RTIT_CTL_BRANCH_EN)
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * PT configuration helpers
+ * These all are cpu affine and operate on a local PT
+ */
+
+/* Address ranges and their corresponding msr configuration registers */
+static const struct pt_address_range {
+ unsigned long msr_a;
+ unsigned long msr_b;
+ unsigned int reg_off;
+} pt_address_ranges[] = {
+ {
+ .msr_a = MSR_IA32_RTIT_ADDR0_A,
+ .msr_b = MSR_IA32_RTIT_ADDR0_B,
+ .reg_off = RTIT_CTL_ADDR0_OFFSET,
+ },
+ {
+ .msr_a = MSR_IA32_RTIT_ADDR1_A,
+ .msr_b = MSR_IA32_RTIT_ADDR1_B,
+ .reg_off = RTIT_CTL_ADDR1_OFFSET,
+ },
+ {
+ .msr_a = MSR_IA32_RTIT_ADDR2_A,
+ .msr_b = MSR_IA32_RTIT_ADDR2_B,
+ .reg_off = RTIT_CTL_ADDR2_OFFSET,
+ },
+ {
+ .msr_a = MSR_IA32_RTIT_ADDR3_A,
+ .msr_b = MSR_IA32_RTIT_ADDR3_B,
+ .reg_off = RTIT_CTL_ADDR3_OFFSET,
+ }
+};
+
+static u64 pt_config_filters(struct perf_event *event)
+{
+ struct pt_filters *filters = event->hw.addr_filters;
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ unsigned int range = 0;
+ u64 rtit_ctl = 0;
+
+ if (!filters)
+ return 0;
+
+ perf_event_addr_filters_sync(event);
+
+ for (range = 0; range < filters->nr_filters; range++) {
+ struct pt_filter *filter = &filters->filter[range];
+
+ /*
+ * Note, if the range has zero start/end addresses due
+ * to its dynamic object not being loaded yet, we just
+ * go ahead and program zeroed range, which will simply
+ * produce no data. Note^2: if executable code at 0x0
+ * is a concern, we can set up an "invalid" configuration
+ * such as msr_b < msr_a.
+ */
+
+ /* avoid redundant msr writes */
+ if (pt->filters.filter[range].msr_a != filter->msr_a) {
+ wrmsrl(pt_address_ranges[range].msr_a, filter->msr_a);
+ pt->filters.filter[range].msr_a = filter->msr_a;
+ }
+
+ if (pt->filters.filter[range].msr_b != filter->msr_b) {
+ wrmsrl(pt_address_ranges[range].msr_b, filter->msr_b);
+ pt->filters.filter[range].msr_b = filter->msr_b;
+ }
+
+ rtit_ctl |= (u64)filter->config << pt_address_ranges[range].reg_off;
+ }
+
+ return rtit_ctl;
+}
+
+static void pt_config(struct perf_event *event)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ u64 reg;
+
+ /* First round: clear STATUS, in particular the PSB byte counter. */
+ if (!event->hw.config) {
+ perf_event_itrace_started(event);
+ wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+ }
+
+ reg = pt_config_filters(event);
+ reg |= RTIT_CTL_TOPA | RTIT_CTL_TRACEEN;
+
+ /*
+ * Previously, we had BRANCH_EN on by default, but now that PT has
+ * grown features outside of branch tracing, it is useful to allow
+ * the user to disable it. Setting bit 0 in the event's attr.config
+ * allows BRANCH_EN to pass through instead of being always on. See
+ * also the comment in pt_event_valid().
+ */
+ if (event->attr.config & BIT(0)) {
+ reg |= event->attr.config & RTIT_CTL_BRANCH_EN;
+ } else {
+ reg |= RTIT_CTL_BRANCH_EN;
+ }
+
+ if (!event->attr.exclude_kernel)
+ reg |= RTIT_CTL_OS;
+ if (!event->attr.exclude_user)
+ reg |= RTIT_CTL_USR;
+
+ reg |= (event->attr.config & PT_CONFIG_MASK);
+
+ event->hw.config = reg;
+ if (READ_ONCE(pt->vmx_on))
+ perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL);
+ else
+ wrmsrl(MSR_IA32_RTIT_CTL, reg);
+}
+
+static void pt_config_stop(struct perf_event *event)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ u64 ctl = READ_ONCE(event->hw.config);
+
+ /* may be already stopped by a PMI */
+ if (!(ctl & RTIT_CTL_TRACEEN))
+ return;
+
+ ctl &= ~RTIT_CTL_TRACEEN;
+ if (!READ_ONCE(pt->vmx_on))
+ wrmsrl(MSR_IA32_RTIT_CTL, ctl);
+
+ WRITE_ONCE(event->hw.config, ctl);
+
+ /*
+ * A wrmsr that disables trace generation serializes other PT
+ * registers and causes all data packets to be written to memory,
+ * but a fence is required for the data to become globally visible.
+ *
+ * The below WMB, separating data store and aux_head store matches
+ * the consumer's RMB that separates aux_head load and data load.
+ */
+ wmb();
+}
+
+static void pt_config_buffer(void *buf, unsigned int topa_idx,
+ unsigned int output_off)
+{
+ u64 reg;
+
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
+
+ reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
+
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+}
+
+/*
+ * Keep ToPA table-related metadata on the same page as the actual table,
+ * taking up a few words from the top
+ */
+
+#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)
+
+/**
+ * struct topa - page-sized ToPA table with metadata at the top
+ * @table: actual ToPA table entries, as understood by PT hardware
+ * @list: linkage to struct pt_buffer's list of tables
+ * @phys: physical address of this page
+ * @offset: offset of the first entry in this table in the buffer
+ * @size: total size of all entries in this table
+ * @last: index of the last initialized entry in this table
+ */
+struct topa {
+ struct topa_entry table[TENTS_PER_PAGE];
+ struct list_head list;
+ u64 phys;
+ u64 offset;
+ size_t size;
+ int last;
+};
+
+/* make -1 stand for the last table entry */
+#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])
+
+/**
+ * topa_alloc() - allocate page-sized ToPA table
+ * @cpu: CPU on which to allocate.
+ * @gfp: Allocation flags.
+ *
+ * Return: On success, return the pointer to ToPA table page.
+ */
+static struct topa *topa_alloc(int cpu, gfp_t gfp)
+{
+ int node = cpu_to_node(cpu);
+ struct topa *topa;
+ struct page *p;
+
+ p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
+ if (!p)
+ return NULL;
+
+ topa = page_address(p);
+ topa->last = 0;
+ topa->phys = page_to_phys(p);
+
+ /*
+ * In case of singe-entry ToPA, always put the self-referencing END
+ * link as the 2nd entry in the table
+ */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
+ TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
+ TOPA_ENTRY(topa, 1)->end = 1;
+ }
+
+ return topa;
+}
+
+/**
+ * topa_free() - free a page-sized ToPA table
+ * @topa: Table to deallocate.
+ */
+static void topa_free(struct topa *topa)
+{
+ free_page((unsigned long)topa);
+}
+
+/**
+ * topa_insert_table() - insert a ToPA table into a buffer
+ * @buf: PT buffer that's being extended.
+ * @topa: New topa table to be inserted.
+ *
+ * If it's the first table in this buffer, set up buffer's pointers
+ * accordingly; otherwise, add a END=1 link entry to @topa to the current
+ * "last" table and adjust the last table pointer to @topa.
+ */
+static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
+{
+ struct topa *last = buf->last;
+
+ list_add_tail(&topa->list, &buf->tables);
+
+ if (!buf->first) {
+ buf->first = buf->last = buf->cur = topa;
+ return;
+ }
+
+ topa->offset = last->offset + last->size;
+ buf->last = topa;
+
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+ return;
+
+ BUG_ON(last->last != TENTS_PER_PAGE - 1);
+
+ TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
+ TOPA_ENTRY(last, -1)->end = 1;
+}
+
+/**
+ * topa_table_full() - check if a ToPA table is filled up
+ * @topa: ToPA table.
+ */
+static bool topa_table_full(struct topa *topa)
+{
+ /* single-entry ToPA is a special case */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+ return !!topa->last;
+
+ return topa->last == TENTS_PER_PAGE - 1;
+}
+
+/**
+ * topa_insert_pages() - create a list of ToPA tables
+ * @buf: PT buffer being initialized.
+ * @gfp: Allocation flags.
+ *
+ * This initializes a list of ToPA tables with entries from
+ * the data_pages provided by rb_alloc_aux().
+ *
+ * Return: 0 on success or error code.
+ */
+static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
+{
+ struct topa *topa = buf->last;
+ int order = 0;
+ struct page *p;
+
+ p = virt_to_page(buf->data_pages[buf->nr_pages]);
+ if (PagePrivate(p))
+ order = page_private(p);
+
+ if (topa_table_full(topa)) {
+ topa = topa_alloc(buf->cpu, gfp);
+ if (!topa)
+ return -ENOMEM;
+
+ topa_insert_table(buf, topa);
+ }
+
+ TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
+ TOPA_ENTRY(topa, -1)->size = order;
+ if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
+ TOPA_ENTRY(topa, -1)->intr = 1;
+ TOPA_ENTRY(topa, -1)->stop = 1;
+ }
+
+ topa->last++;
+ topa->size += sizes(order);
+
+ buf->nr_pages += 1ul << order;
+
+ return 0;
+}
+
+/**
+ * pt_topa_dump() - print ToPA tables and their entries
+ * @buf: PT buffer.
+ */
+static void pt_topa_dump(struct pt_buffer *buf)
+{
+ struct topa *topa;
+
+ list_for_each_entry(topa, &buf->tables, list) {
+ int i;
+
+ pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
+ topa->phys, topa->offset, topa->size);
+ for (i = 0; i < TENTS_PER_PAGE; i++) {
+ pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
+ &topa->table[i],
+ (unsigned long)topa->table[i].base << TOPA_SHIFT,
+ sizes(topa->table[i].size),
+ topa->table[i].end ? 'E' : ' ',
+ topa->table[i].intr ? 'I' : ' ',
+ topa->table[i].stop ? 'S' : ' ',
+ *(u64 *)&topa->table[i]);
+ if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
+ topa->table[i].stop) ||
+ topa->table[i].end)
+ break;
+ }
+ }
+}
+
+/**
+ * pt_buffer_advance() - advance to the next output region
+ * @buf: PT buffer.
+ *
+ * Advance the current pointers in the buffer to the next ToPA entry.
+ */
+static void pt_buffer_advance(struct pt_buffer *buf)
+{
+ buf->output_off = 0;
+ buf->cur_idx++;
+
+ if (buf->cur_idx == buf->cur->last) {
+ if (buf->cur == buf->last)
+ buf->cur = buf->first;
+ else
+ buf->cur = list_entry(buf->cur->list.next, struct topa,
+ list);
+ buf->cur_idx = 0;
+ }
+}
+
+/**
+ * pt_update_head() - calculate current offsets and sizes
+ * @pt: Per-cpu pt context.
+ *
+ * Update buffer's current write pointer position and data size.
+ */
+static void pt_update_head(struct pt *pt)
+{
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+ u64 topa_idx, base, old;
+
+ /* offset of the first region in this table from the beginning of buf */
+ base = buf->cur->offset + buf->output_off;
+
+ /* offset of the current output region within this table */
+ for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
+ base += sizes(buf->cur->table[topa_idx].size);
+
+ if (buf->snapshot) {
+ local_set(&buf->data_size, base);
+ } else {
+ old = (local64_xchg(&buf->head, base) &
+ ((buf->nr_pages << PAGE_SHIFT) - 1));
+ if (base < old)
+ base += buf->nr_pages << PAGE_SHIFT;
+
+ local_add(base - old, &buf->data_size);
+ }
+}
+
+/**
+ * pt_buffer_region() - obtain current output region's address
+ * @buf: PT buffer.
+ */
+static void *pt_buffer_region(struct pt_buffer *buf)
+{
+ return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
+}
+
+/**
+ * pt_buffer_region_size() - obtain current output region's size
+ * @buf: PT buffer.
+ */
+static size_t pt_buffer_region_size(struct pt_buffer *buf)
+{
+ return sizes(buf->cur->table[buf->cur_idx].size);
+}
+
+/**
+ * pt_handle_status() - take care of possible status conditions
+ * @pt: Per-cpu pt context.
+ */
+static void pt_handle_status(struct pt *pt)
+{
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+ int advance = 0;
+ u64 status;
+
+ rdmsrl(MSR_IA32_RTIT_STATUS, status);
+
+ if (status & RTIT_STATUS_ERROR) {
+ pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
+ pt_topa_dump(buf);
+ status &= ~RTIT_STATUS_ERROR;
+ }
+
+ if (status & RTIT_STATUS_STOPPED) {
+ status &= ~RTIT_STATUS_STOPPED;
+
+ /*
+ * On systems that only do single-entry ToPA, hitting STOP
+ * means we are already losing data; need to let the decoder
+ * know.
+ */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
+ buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
+ perf_aux_output_flag(&pt->handle,
+ PERF_AUX_FLAG_TRUNCATED);
+ advance++;
+ }
+ }
+
+ /*
+ * Also on single-entry ToPA implementations, interrupt will come
+ * before the output reaches its output region's boundary.
+ */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
+ pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
+ void *head = pt_buffer_region(buf);
+
+ /* everything within this margin needs to be zeroed out */
+ memset(head + buf->output_off, 0,
+ pt_buffer_region_size(buf) -
+ buf->output_off);
+ advance++;
+ }
+
+ if (advance)
+ pt_buffer_advance(buf);
+
+ wrmsrl(MSR_IA32_RTIT_STATUS, status);
+}
+
+/**
+ * pt_read_offset() - translate registers into buffer pointers
+ * @buf: PT buffer.
+ *
+ * Set buffer's output pointers from MSR values.
+ */
+static void pt_read_offset(struct pt_buffer *buf)
+{
+ u64 offset, base_topa;
+
+ rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
+ buf->cur = phys_to_virt(base_topa);
+
+ rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
+ /* offset within current output region */
+ buf->output_off = offset >> 32;
+ /* index of current output region within this table */
+ buf->cur_idx = (offset & 0xffffff80) >> 7;
+}
+
+/**
+ * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
+ * @buf: PT buffer.
+ * @pg: Page offset in the buffer.
+ *
+ * When advancing to the next output region (ToPA entry), given a page offset
+ * into the buffer, we need to find the offset of the first page in the next
+ * region.
+ */
+static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
+{
+ struct topa_entry *te = buf->topa_index[pg];
+
+ /* one region */
+ if (buf->first == buf->last && buf->first->last == 1)
+ return pg;
+
+ do {
+ pg++;
+ pg &= buf->nr_pages - 1;
+ } while (buf->topa_index[pg] == te);
+
+ return pg;
+}
+
+/**
+ * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
+ * @buf: PT buffer.
+ * @handle: Current output handle.
+ *
+ * Place INT and STOP marks to prevent overwriting old data that the consumer
+ * hasn't yet collected and waking up the consumer after a certain fraction of
+ * the buffer has filled up. Only needed and sensible for non-snapshot counters.
+ *
+ * This obviously relies on buf::head to figure out buffer markers, so it has
+ * to be called after pt_buffer_reset_offsets() and before the hardware tracing
+ * is enabled.
+ */
+static int pt_buffer_reset_markers(struct pt_buffer *buf,
+ struct perf_output_handle *handle)
+
+{
+ unsigned long head = local64_read(&buf->head);
+ unsigned long idx, npages, wakeup;
+
+ /* can't stop in the middle of an output region */
+ if (buf->output_off + handle->size + 1 <
+ sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
+ perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
+ return -EINVAL;
+ }
+
+
+ /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+ return 0;
+
+ /* clear STOP and INT from current entry */
+ buf->topa_index[buf->stop_pos]->stop = 0;
+ buf->topa_index[buf->stop_pos]->intr = 0;
+ buf->topa_index[buf->intr_pos]->intr = 0;
+
+ /* how many pages till the STOP marker */
+ npages = handle->size >> PAGE_SHIFT;
+
+ /* if it's on a page boundary, fill up one more page */
+ if (!offset_in_page(head + handle->size + 1))
+ npages++;
+
+ idx = (head >> PAGE_SHIFT) + npages;
+ idx &= buf->nr_pages - 1;
+ buf->stop_pos = idx;
+
+ wakeup = handle->wakeup >> PAGE_SHIFT;
+
+ /* in the worst case, wake up the consumer one page before hard stop */
+ idx = (head >> PAGE_SHIFT) + npages - 1;
+ if (idx > wakeup)
+ idx = wakeup;
+
+ idx &= buf->nr_pages - 1;
+ buf->intr_pos = idx;
+
+ buf->topa_index[buf->stop_pos]->stop = 1;
+ buf->topa_index[buf->stop_pos]->intr = 1;
+ buf->topa_index[buf->intr_pos]->intr = 1;
+
+ return 0;
+}
+
+/**
+ * pt_buffer_setup_topa_index() - build topa_index[] table of regions
+ * @buf: PT buffer.
+ *
+ * topa_index[] references output regions indexed by offset into the
+ * buffer for purposes of quick reverse lookup.
+ */
+static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
+{
+ struct topa *cur = buf->first, *prev = buf->last;
+ struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
+ *te_prev = TOPA_ENTRY(prev, prev->last - 1);
+ int pg = 0, idx = 0;
+
+ while (pg < buf->nr_pages) {
+ int tidx;
+
+ /* pages within one topa entry */
+ for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
+ buf->topa_index[pg] = te_prev;
+
+ te_prev = te_cur;
+
+ if (idx == cur->last - 1) {
+ /* advance to next topa table */
+ idx = 0;
+ cur = list_entry(cur->list.next, struct topa, list);
+ } else {
+ idx++;
+ }
+ te_cur = TOPA_ENTRY(cur, idx);
+ }
+
+}
+
+/**
+ * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
+ * @buf: PT buffer.
+ * @head: Write pointer (aux_head) from AUX buffer.
+ *
+ * Find the ToPA table and entry corresponding to given @head and set buffer's
+ * "current" pointers accordingly. This is done after we have obtained the
+ * current aux_head position from a successful call to perf_aux_output_begin()
+ * to make sure the hardware is writing to the right place.
+ *
+ * This function modifies buf::{cur,cur_idx,output_off} that will be programmed
+ * into PT msrs when the tracing is enabled and buf::head and buf::data_size,
+ * which are used to determine INT and STOP markers' locations by a subsequent
+ * call to pt_buffer_reset_markers().
+ */
+static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
+{
+ int pg;
+
+ if (buf->snapshot)
+ head &= (buf->nr_pages << PAGE_SHIFT) - 1;
+
+ pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
+ pg = pt_topa_next_entry(buf, pg);
+
+ buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
+ buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
+ (unsigned long)buf->cur) / sizeof(struct topa_entry);
+ buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);
+
+ local64_set(&buf->head, head);
+ local_set(&buf->data_size, 0);
+}
+
+/**
+ * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
+ * @buf: PT buffer.
+ */
+static void pt_buffer_fini_topa(struct pt_buffer *buf)
+{
+ struct topa *topa, *iter;
+
+ list_for_each_entry_safe(topa, iter, &buf->tables, list) {
+ /*
+ * right now, this is in free_aux() path only, so
+ * no need to unlink this table from the list
+ */
+ topa_free(topa);
+ }
+}
+
+/**
+ * pt_buffer_init_topa() - initialize ToPA table for pt buffer
+ * @buf: PT buffer.
+ * @size: Total size of all regions within this ToPA.
+ * @gfp: Allocation flags.
+ */
+static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
+ gfp_t gfp)
+{
+ struct topa *topa;
+ int err;
+
+ topa = topa_alloc(buf->cpu, gfp);
+ if (!topa)
+ return -ENOMEM;
+
+ topa_insert_table(buf, topa);
+
+ while (buf->nr_pages < nr_pages) {
+ err = topa_insert_pages(buf, gfp);
+ if (err) {
+ pt_buffer_fini_topa(buf);
+ return -ENOMEM;
+ }
+ }
+
+ pt_buffer_setup_topa_index(buf);
+
+ /* link last table to the first one, unless we're double buffering */
+ if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
+ TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
+ TOPA_ENTRY(buf->last, -1)->end = 1;
+ }
+
+ pt_topa_dump(buf);
+ return 0;
+}
+
+/**
+ * pt_buffer_setup_aux() - set up topa tables for a PT buffer
+ * @cpu: Cpu on which to allocate, -1 means current.
+ * @pages: Array of pointers to buffer pages passed from perf core.
+ * @nr_pages: Number of pages in the buffer.
+ * @snapshot: If this is a snapshot/overwrite counter.
+ *
+ * This is a pmu::setup_aux callback that sets up ToPA tables and all the
+ * bookkeeping for an AUX buffer.
+ *
+ * Return: Our private PT buffer structure.
+ */
+static void *
+pt_buffer_setup_aux(struct perf_event *event, void **pages,
+ int nr_pages, bool snapshot)
+{
+ struct pt_buffer *buf;
+ int node, ret, cpu = event->cpu;
+
+ if (!nr_pages)
+ return NULL;
+
+ if (cpu == -1)
+ cpu = raw_smp_processor_id();
+ node = cpu_to_node(cpu);
+
+ buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
+ GFP_KERNEL, node);
+ if (!buf)
+ return NULL;
+
+ buf->cpu = cpu;
+ buf->snapshot = snapshot;
+ buf->data_pages = pages;
+
+ INIT_LIST_HEAD(&buf->tables);
+
+ ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
+ if (ret) {
+ kfree(buf);
+ return NULL;
+ }
+
+ return buf;
+}
+
+/**
+ * pt_buffer_free_aux() - perf AUX deallocation path callback
+ * @data: PT buffer.
+ */
+static void pt_buffer_free_aux(void *data)
+{
+ struct pt_buffer *buf = data;
+
+ pt_buffer_fini_topa(buf);
+ kfree(buf);
+}
+
+static int pt_addr_filters_init(struct perf_event *event)
+{
+ struct pt_filters *filters;
+ int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu);
+
+ if (!pt_cap_get(PT_CAP_num_address_ranges))
+ return 0;
+
+ filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node);
+ if (!filters)
+ return -ENOMEM;
+
+ if (event->parent)
+ memcpy(filters, event->parent->hw.addr_filters,
+ sizeof(*filters));
+
+ event->hw.addr_filters = filters;
+
+ return 0;
+}
+
+static void pt_addr_filters_fini(struct perf_event *event)
+{
+ kfree(event->hw.addr_filters);
+ event->hw.addr_filters = NULL;
+}
+
+static inline bool valid_kernel_ip(unsigned long ip)
+{
+ return virt_addr_valid(ip) && kernel_ip(ip);
+}
+
+static int pt_event_addr_filters_validate(struct list_head *filters)
+{
+ struct perf_addr_filter *filter;
+ int range = 0;
+
+ list_for_each_entry(filter, filters, entry) {
+ /*
+ * PT doesn't support single address triggers and
+ * 'start' filters.
+ */
+ if (!filter->size ||
+ filter->action == PERF_ADDR_FILTER_ACTION_START)
+ return -EOPNOTSUPP;
+
+ if (!filter->path.dentry) {
+ if (!valid_kernel_ip(filter->offset))
+ return -EINVAL;
+
+ if (!valid_kernel_ip(filter->offset + filter->size))
+ return -EINVAL;
+ }
+
+ if (++range > pt_cap_get(PT_CAP_num_address_ranges))
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static void pt_event_addr_filters_sync(struct perf_event *event)
+{
+ struct perf_addr_filters_head *head = perf_event_addr_filters(event);
+ unsigned long msr_a, msr_b;
+ struct perf_addr_filter_range *fr = event->addr_filter_ranges;
+ struct pt_filters *filters = event->hw.addr_filters;
+ struct perf_addr_filter *filter;
+ int range = 0;
+
+ if (!filters)
+ return;
+
+ list_for_each_entry(filter, &head->list, entry) {
+ if (filter->path.dentry && !fr[range].start) {
+ msr_a = msr_b = 0;
+ } else {
+ /* apply the offset */
+ msr_a = fr[range].start;
+ msr_b = msr_a + fr[range].size - 1;
+ }
+
+ filters->filter[range].msr_a = msr_a;
+ filters->filter[range].msr_b = msr_b;
+ if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER)
+ filters->filter[range].config = 1;
+ else
+ filters->filter[range].config = 2;
+ range++;
+ }
+
+ filters->nr_filters = range;
+}
+
+/**
+ * intel_pt_interrupt() - PT PMI handler
+ */
+void intel_pt_interrupt(void)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf;
+ struct perf_event *event = pt->handle.event;
+
+ /*
+ * There may be a dangling PT bit in the interrupt status register
+ * after PT has been disabled by pt_event_stop(). Make sure we don't
+ * do anything (particularly, re-enable) for this event here.
+ */
+ if (!READ_ONCE(pt->handle_nmi))
+ return;
+
+ if (!event)
+ return;
+
+ pt_config_stop(event);
+
+ buf = perf_get_aux(&pt->handle);
+ if (!buf)
+ return;
+
+ pt_read_offset(buf);
+
+ pt_handle_status(pt);
+
+ pt_update_head(pt);
+
+ perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
+
+ if (!event->hw.state) {
+ int ret;
+
+ buf = perf_aux_output_begin(&pt->handle, event);
+ if (!buf) {
+ event->hw.state = PERF_HES_STOPPED;
+ return;
+ }
+
+ pt_buffer_reset_offsets(buf, pt->handle.head);
+ /* snapshot counters don't use PMI, so it's safe */
+ ret = pt_buffer_reset_markers(buf, &pt->handle);
+ if (ret) {
+ perf_aux_output_end(&pt->handle, 0);
+ return;
+ }
+
+ pt_config_buffer(buf->cur->table, buf->cur_idx,
+ buf->output_off);
+ pt_config(event);
+ }
+}
+
+void intel_pt_handle_vmx(int on)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct perf_event *event;
+ unsigned long flags;
+
+ /* PT plays nice with VMX, do nothing */
+ if (pt_pmu.vmx)
+ return;
+
+ /*
+ * VMXON will clear RTIT_CTL.TraceEn; we need to make
+ * sure to not try to set it while VMX is on. Disable
+ * interrupts to avoid racing with pmu callbacks;
+ * concurrent PMI should be handled fine.
+ */
+ local_irq_save(flags);
+ WRITE_ONCE(pt->vmx_on, on);
+
+ /*
+ * If an AUX transaction is in progress, it will contain
+ * gap(s), so flag it PARTIAL to inform the user.
+ */
+ event = pt->handle.event;
+ if (event)
+ perf_aux_output_flag(&pt->handle,
+ PERF_AUX_FLAG_PARTIAL);
+
+ /* Turn PTs back on */
+ if (!on && event)
+ wrmsrl(MSR_IA32_RTIT_CTL, event->hw.config);
+
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);
+
+/*
+ * PMU callbacks
+ */
+
+static void pt_event_start(struct perf_event *event, int mode)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf;
+
+ buf = perf_aux_output_begin(&pt->handle, event);
+ if (!buf)
+ goto fail_stop;
+
+ pt_buffer_reset_offsets(buf, pt->handle.head);
+ if (!buf->snapshot) {
+ if (pt_buffer_reset_markers(buf, &pt->handle))
+ goto fail_end_stop;
+ }
+
+ WRITE_ONCE(pt->handle_nmi, 1);
+ hwc->state = 0;
+
+ pt_config_buffer(buf->cur->table, buf->cur_idx,
+ buf->output_off);
+ pt_config(event);
+
+ return;
+
+fail_end_stop:
+ perf_aux_output_end(&pt->handle, 0);
+fail_stop:
+ hwc->state = PERF_HES_STOPPED;
+}
+
+static void pt_event_stop(struct perf_event *event, int mode)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+ /*
+ * Protect against the PMI racing with disabling wrmsr,
+ * see comment in intel_pt_interrupt().
+ */
+ WRITE_ONCE(pt->handle_nmi, 0);
+
+ pt_config_stop(event);
+
+ if (event->hw.state == PERF_HES_STOPPED)
+ return;
+
+ event->hw.state = PERF_HES_STOPPED;
+
+ if (mode & PERF_EF_UPDATE) {
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+ if (!buf)
+ return;
+
+ if (WARN_ON_ONCE(pt->handle.event != event))
+ return;
+
+ pt_read_offset(buf);
+
+ pt_handle_status(pt);
+
+ pt_update_head(pt);
+
+ if (buf->snapshot)
+ pt->handle.head =
+ local_xchg(&buf->data_size,
+ buf->nr_pages << PAGE_SHIFT);
+ perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
+ }
+}
+
+static void pt_event_del(struct perf_event *event, int mode)
+{
+ pt_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int pt_event_add(struct perf_event *event, int mode)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct hw_perf_event *hwc = &event->hw;
+ int ret = -EBUSY;
+
+ if (pt->handle.event)
+ goto fail;
+
+ if (mode & PERF_EF_START) {
+ pt_event_start(event, 0);
+ ret = -EINVAL;
+ if (hwc->state == PERF_HES_STOPPED)
+ goto fail;
+ } else {
+ hwc->state = PERF_HES_STOPPED;
+ }
+
+ ret = 0;
+fail:
+
+ return ret;
+}
+
+static void pt_event_read(struct perf_event *event)
+{
+}
+
+static void pt_event_destroy(struct perf_event *event)
+{
+ pt_addr_filters_fini(event);
+ x86_del_exclusive(x86_lbr_exclusive_pt);
+}
+
+static int pt_event_init(struct perf_event *event)
+{
+ if (event->attr.type != pt_pmu.pmu.type)
+ return -ENOENT;
+
+ if (!pt_event_valid(event))
+ return -EINVAL;
+
+ if (x86_add_exclusive(x86_lbr_exclusive_pt))
+ return -EBUSY;
+
+ if (pt_addr_filters_init(event)) {
+ x86_del_exclusive(x86_lbr_exclusive_pt);
+ return -ENOMEM;
+ }
+
+ event->destroy = pt_event_destroy;
+
+ return 0;
+}
+
+void cpu_emergency_stop_pt(void)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+ if (pt->handle.event)
+ pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
+}
+
+static __init int pt_init(void)
+{
+ int ret, cpu, prior_warn = 0;
+
+ BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
+
+ if (!boot_cpu_has(X86_FEATURE_INTEL_PT))
+ return -ENODEV;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ u64 ctl;
+
+ ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
+ if (!ret && (ctl & RTIT_CTL_TRACEEN))
+ prior_warn++;
+ }
+ put_online_cpus();
+
+ if (prior_warn) {
+ x86_add_exclusive(x86_lbr_exclusive_pt);
+ pr_warn("PT is enabled at boot time, doing nothing\n");
+
+ return -EBUSY;
+ }
+
+ ret = pt_pmu_hw_init();
+ if (ret)
+ return ret;
+
+ if (!pt_cap_get(PT_CAP_topa_output)) {
+ pr_warn("ToPA output is not supported on this CPU\n");
+ return -ENODEV;
+ }
+
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+ pt_pmu.pmu.capabilities =
+ PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
+
+ pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
+ pt_pmu.pmu.attr_groups = pt_attr_groups;
+ pt_pmu.pmu.task_ctx_nr = perf_sw_context;
+ pt_pmu.pmu.event_init = pt_event_init;
+ pt_pmu.pmu.add = pt_event_add;
+ pt_pmu.pmu.del = pt_event_del;
+ pt_pmu.pmu.start = pt_event_start;
+ pt_pmu.pmu.stop = pt_event_stop;
+ pt_pmu.pmu.read = pt_event_read;
+ pt_pmu.pmu.setup_aux = pt_buffer_setup_aux;
+ pt_pmu.pmu.free_aux = pt_buffer_free_aux;
+ pt_pmu.pmu.addr_filters_sync = pt_event_addr_filters_sync;
+ pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate;
+ pt_pmu.pmu.nr_addr_filters =
+ pt_cap_get(PT_CAP_num_address_ranges);
+
+ ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
+
+ return ret;
+}
+arch_initcall(pt_init);
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
new file mode 100644
index 000000000..0eb41d07b
--- /dev/null
+++ b/arch/x86/events/intel/pt.h
@@ -0,0 +1,192 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#ifndef __INTEL_PT_H__
+#define __INTEL_PT_H__
+
+/*
+ * PT MSR bit definitions
+ */
+#define RTIT_CTL_TRACEEN BIT(0)
+#define RTIT_CTL_CYCLEACC BIT(1)
+#define RTIT_CTL_OS BIT(2)
+#define RTIT_CTL_USR BIT(3)
+#define RTIT_CTL_PWR_EVT_EN BIT(4)
+#define RTIT_CTL_FUP_ON_PTW BIT(5)
+#define RTIT_CTL_CR3EN BIT(7)
+#define RTIT_CTL_TOPA BIT(8)
+#define RTIT_CTL_MTC_EN BIT(9)
+#define RTIT_CTL_TSC_EN BIT(10)
+#define RTIT_CTL_DISRETC BIT(11)
+#define RTIT_CTL_PTW_EN BIT(12)
+#define RTIT_CTL_BRANCH_EN BIT(13)
+#define RTIT_CTL_MTC_RANGE_OFFSET 14
+#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET)
+#define RTIT_CTL_CYC_THRESH_OFFSET 19
+#define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET)
+#define RTIT_CTL_PSB_FREQ_OFFSET 24
+#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET)
+#define RTIT_CTL_ADDR0_OFFSET 32
+#define RTIT_CTL_ADDR0 (0x0full << RTIT_CTL_ADDR0_OFFSET)
+#define RTIT_CTL_ADDR1_OFFSET 36
+#define RTIT_CTL_ADDR1 (0x0full << RTIT_CTL_ADDR1_OFFSET)
+#define RTIT_CTL_ADDR2_OFFSET 40
+#define RTIT_CTL_ADDR2 (0x0full << RTIT_CTL_ADDR2_OFFSET)
+#define RTIT_CTL_ADDR3_OFFSET 44
+#define RTIT_CTL_ADDR3 (0x0full << RTIT_CTL_ADDR3_OFFSET)
+#define RTIT_STATUS_FILTEREN BIT(0)
+#define RTIT_STATUS_CONTEXTEN BIT(1)
+#define RTIT_STATUS_TRIGGEREN BIT(2)
+#define RTIT_STATUS_BUFFOVF BIT(3)
+#define RTIT_STATUS_ERROR BIT(4)
+#define RTIT_STATUS_STOPPED BIT(5)
+
+/*
+ * Single-entry ToPA: when this close to region boundary, switch
+ * buffers to avoid losing data.
+ */
+#define TOPA_PMI_MARGIN 512
+
+#define TOPA_SHIFT 12
+
+static inline unsigned int sizes(unsigned int tsz)
+{
+ return 1 << (tsz + TOPA_SHIFT);
+};
+
+struct topa_entry {
+ u64 end : 1;
+ u64 rsvd0 : 1;
+ u64 intr : 1;
+ u64 rsvd1 : 1;
+ u64 stop : 1;
+ u64 rsvd2 : 1;
+ u64 size : 4;
+ u64 rsvd3 : 2;
+ u64 base : 36;
+ u64 rsvd4 : 16;
+};
+
+#define PT_CPUID_LEAVES 2
+#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */
+
+/* TSC to Core Crystal Clock Ratio */
+#define CPUID_TSC_LEAF 0x15
+
+enum pt_capabilities {
+ PT_CAP_max_subleaf = 0,
+ PT_CAP_cr3_filtering,
+ PT_CAP_psb_cyc,
+ PT_CAP_ip_filtering,
+ PT_CAP_mtc,
+ PT_CAP_ptwrite,
+ PT_CAP_power_event_trace,
+ PT_CAP_topa_output,
+ PT_CAP_topa_multiple_entries,
+ PT_CAP_single_range_output,
+ PT_CAP_payloads_lip,
+ PT_CAP_num_address_ranges,
+ PT_CAP_mtc_periods,
+ PT_CAP_cycle_thresholds,
+ PT_CAP_psb_periods,
+};
+
+struct pt_pmu {
+ struct pmu pmu;
+ u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
+ bool vmx;
+ bool branch_en_always_on;
+ unsigned long max_nonturbo_ratio;
+ unsigned int tsc_art_num;
+ unsigned int tsc_art_den;
+};
+
+/**
+ * struct pt_buffer - buffer configuration; one buffer per task_struct or
+ * cpu, depending on perf event configuration
+ * @cpu: cpu for per-cpu allocation
+ * @tables: list of ToPA tables in this buffer
+ * @first: shorthand for first topa table
+ * @last: shorthand for last topa table
+ * @cur: current topa table
+ * @nr_pages: buffer size in pages
+ * @cur_idx: current output region's index within @cur table
+ * @output_off: offset within the current output region
+ * @data_size: running total of the amount of data in this buffer
+ * @lost: if data was lost/truncated
+ * @head: logical write offset inside the buffer
+ * @snapshot: if this is for a snapshot/overwrite counter
+ * @stop_pos: STOP topa entry in the buffer
+ * @intr_pos: INT topa entry in the buffer
+ * @data_pages: array of pages from perf
+ * @topa_index: table of topa entries indexed by page offset
+ */
+struct pt_buffer {
+ int cpu;
+ struct list_head tables;
+ struct topa *first, *last, *cur;
+ unsigned int cur_idx;
+ size_t output_off;
+ unsigned long nr_pages;
+ local_t data_size;
+ local64_t head;
+ bool snapshot;
+ unsigned long stop_pos, intr_pos;
+ void **data_pages;
+ struct topa_entry *topa_index[0];
+};
+
+#define PT_FILTERS_NUM 4
+
+/**
+ * struct pt_filter - IP range filter configuration
+ * @msr_a: range start, goes to RTIT_ADDRn_A
+ * @msr_b: range end, goes to RTIT_ADDRn_B
+ * @config: 4-bit field in RTIT_CTL
+ */
+struct pt_filter {
+ unsigned long msr_a;
+ unsigned long msr_b;
+ unsigned long config;
+};
+
+/**
+ * struct pt_filters - IP range filtering context
+ * @filter: filters defined for this context
+ * @nr_filters: number of defined filters in the @filter array
+ */
+struct pt_filters {
+ struct pt_filter filter[PT_FILTERS_NUM];
+ unsigned int nr_filters;
+};
+
+/**
+ * struct pt - per-cpu pt context
+ * @handle: perf output handle
+ * @filters: last configured filters
+ * @handle_nmi: do handle PT PMI on this cpu, there's an active event
+ * @vmx_on: 1 if VMX is ON on this cpu
+ */
+struct pt {
+ struct perf_output_handle handle;
+ struct pt_filters filters;
+ int handle_nmi;
+ int vmx_on;
+};
+
+#endif /* __INTEL_PT_H__ */
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
new file mode 100644
index 000000000..bc348663d
--- /dev/null
+++ b/arch/x86/events/intel/rapl.c
@@ -0,0 +1,833 @@
+/*
+ * Support Intel RAPL energy consumption counters
+ * Copyright (C) 2013 Google, Inc., Stephane Eranian
+ *
+ * Intel RAPL interface is specified in the IA-32 Manual Vol3b
+ * section 14.7.1 (September 2013)
+ *
+ * RAPL provides more controls than just reporting energy consumption
+ * however here we only expose the 3 energy consumption free running
+ * counters (pp0, pkg, dram).
+ *
+ * Each of those counters increments in a power unit defined by the
+ * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
+ * but it can vary.
+ *
+ * Counter to rapl events mappings:
+ *
+ * pp0 counter: consumption of all physical cores (power plane 0)
+ * event: rapl_energy_cores
+ * perf code: 0x1
+ *
+ * pkg counter: consumption of the whole processor package
+ * event: rapl_energy_pkg
+ * perf code: 0x2
+ *
+ * dram counter: consumption of the dram domain (servers only)
+ * event: rapl_energy_dram
+ * perf code: 0x3
+ *
+ * gpu counter: consumption of the builtin-gpu domain (client only)
+ * event: rapl_energy_gpu
+ * perf code: 0x4
+ *
+ * psys counter: consumption of the builtin-psys domain (client only)
+ * event: rapl_energy_psys
+ * perf code: 0x5
+ *
+ * We manage those counters as free running (read-only). They may be
+ * use simultaneously by other tools, such as turbostat.
+ *
+ * The events only support system-wide mode counting. There is no
+ * sampling support because it does not make sense and is not
+ * supported by the RAPL hardware.
+ *
+ * Because we want to avoid floating-point operations in the kernel,
+ * the events are all reported in fixed point arithmetic (32.32).
+ * Tools must adjust the counts to convert them to Watts using
+ * the duration of the measurement. Tools may use a function such as
+ * ldexp(raw_count, -32);
+ */
+
+#define pr_fmt(fmt) "RAPL PMU: " fmt
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include "../perf_event.h"
+
+MODULE_LICENSE("GPL");
+
+/*
+ * RAPL energy status counters
+ */
+#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */
+#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */
+#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */
+#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */
+#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */
+#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */
+#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */
+#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */
+#define RAPL_IDX_PSYS_NRG_STAT 4 /* psys */
+#define INTEL_RAPL_PSYS 0x5 /* pseudo-encoding */
+
+#define NR_RAPL_DOMAINS 0x5
+static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
+ "pp0-core",
+ "package",
+ "dram",
+ "pp1-gpu",
+ "psys",
+};
+
+/* Clients have PP0, PKG */
+#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\
+ 1<<RAPL_IDX_PKG_NRG_STAT|\
+ 1<<RAPL_IDX_PP1_NRG_STAT)
+
+/* Servers have PP0, PKG, RAM */
+#define RAPL_IDX_SRV (1<<RAPL_IDX_PP0_NRG_STAT|\
+ 1<<RAPL_IDX_PKG_NRG_STAT|\
+ 1<<RAPL_IDX_RAM_NRG_STAT)
+
+/* Servers have PP0, PKG, RAM, PP1 */
+#define RAPL_IDX_HSW (1<<RAPL_IDX_PP0_NRG_STAT|\
+ 1<<RAPL_IDX_PKG_NRG_STAT|\
+ 1<<RAPL_IDX_RAM_NRG_STAT|\
+ 1<<RAPL_IDX_PP1_NRG_STAT)
+
+/* SKL clients have PP0, PKG, RAM, PP1, PSYS */
+#define RAPL_IDX_SKL_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\
+ 1<<RAPL_IDX_PKG_NRG_STAT|\
+ 1<<RAPL_IDX_RAM_NRG_STAT|\
+ 1<<RAPL_IDX_PP1_NRG_STAT|\
+ 1<<RAPL_IDX_PSYS_NRG_STAT)
+
+/* Knights Landing has PKG, RAM */
+#define RAPL_IDX_KNL (1<<RAPL_IDX_PKG_NRG_STAT|\
+ 1<<RAPL_IDX_RAM_NRG_STAT)
+
+/*
+ * event code: LSB 8 bits, passed in attr->config
+ * any other bit is reserved
+ */
+#define RAPL_EVENT_MASK 0xFFULL
+#define RAPL_CNTR_WIDTH 32
+
+#define RAPL_EVENT_ATTR_STR(_name, v, str) \
+static struct perf_pmu_events_attr event_attr_##v = { \
+ .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
+ .id = 0, \
+ .event_str = str, \
+};
+
+struct rapl_pmu {
+ raw_spinlock_t lock;
+ int n_active;
+ int cpu;
+ struct list_head active_list;
+ struct pmu *pmu;
+ ktime_t timer_interval;
+ struct hrtimer hrtimer;
+};
+
+struct rapl_pmus {
+ struct pmu pmu;
+ unsigned int maxpkg;
+ struct rapl_pmu *pmus[];
+};
+
+ /* 1/2^hw_unit Joule */
+static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
+static struct rapl_pmus *rapl_pmus;
+static cpumask_t rapl_cpu_mask;
+static unsigned int rapl_cntr_mask;
+static u64 rapl_timer_ms;
+
+static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
+{
+ unsigned int pkgid = topology_logical_package_id(cpu);
+
+ /*
+ * The unsigned check also catches the '-1' return value for non
+ * existent mappings in the topology map.
+ */
+ return pkgid < rapl_pmus->maxpkg ? rapl_pmus->pmus[pkgid] : NULL;
+}
+
+static inline u64 rapl_read_counter(struct perf_event *event)
+{
+ u64 raw;
+ rdmsrl(event->hw.event_base, raw);
+ return raw;
+}
+
+static inline u64 rapl_scale(u64 v, int cfg)
+{
+ if (cfg > NR_RAPL_DOMAINS) {
+ pr_warn("Invalid domain %d, failed to scale data\n", cfg);
+ return v;
+ }
+ /*
+ * scale delta to smallest unit (1/2^32)
+ * users must then scale back: count * 1/(1e9*2^32) to get Joules
+ * or use ldexp(count, -32).
+ * Watts = Joules/Time delta
+ */
+ return v << (32 - rapl_hw_unit[cfg - 1]);
+}
+
+static u64 rapl_event_update(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev_raw_count, new_raw_count;
+ s64 delta, sdelta;
+ int shift = RAPL_CNTR_WIDTH;
+
+again:
+ prev_raw_count = local64_read(&hwc->prev_count);
+ rdmsrl(event->hw.event_base, new_raw_count);
+
+ if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ new_raw_count) != prev_raw_count) {
+ cpu_relax();
+ goto again;
+ }
+
+ /*
+ * Now we have the new raw value and have updated the prev
+ * timestamp already. We can now calculate the elapsed delta
+ * (event-)time and add that to the generic event.
+ *
+ * Careful, not all hw sign-extends above the physical width
+ * of the count.
+ */
+ delta = (new_raw_count << shift) - (prev_raw_count << shift);
+ delta >>= shift;
+
+ sdelta = rapl_scale(delta, event->hw.config);
+
+ local64_add(sdelta, &event->count);
+
+ return new_raw_count;
+}
+
+static void rapl_start_hrtimer(struct rapl_pmu *pmu)
+{
+ hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
+ HRTIMER_MODE_REL_PINNED);
+}
+
+static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
+{
+ struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
+ struct perf_event *event;
+ unsigned long flags;
+
+ if (!pmu->n_active)
+ return HRTIMER_NORESTART;
+
+ raw_spin_lock_irqsave(&pmu->lock, flags);
+
+ list_for_each_entry(event, &pmu->active_list, active_entry)
+ rapl_event_update(event);
+
+ raw_spin_unlock_irqrestore(&pmu->lock, flags);
+
+ hrtimer_forward_now(hrtimer, pmu->timer_interval);
+
+ return HRTIMER_RESTART;
+}
+
+static void rapl_hrtimer_init(struct rapl_pmu *pmu)
+{
+ struct hrtimer *hr = &pmu->hrtimer;
+
+ hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hr->function = rapl_hrtimer_handle;
+}
+
+static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
+ struct perf_event *event)
+{
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ event->hw.state = 0;
+
+ list_add_tail(&event->active_entry, &pmu->active_list);
+
+ local64_set(&event->hw.prev_count, rapl_read_counter(event));
+
+ pmu->n_active++;
+ if (pmu->n_active == 1)
+ rapl_start_hrtimer(pmu);
+}
+
+static void rapl_pmu_event_start(struct perf_event *event, int mode)
+{
+ struct rapl_pmu *pmu = event->pmu_private;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&pmu->lock, flags);
+ __rapl_pmu_event_start(pmu, event);
+ raw_spin_unlock_irqrestore(&pmu->lock, flags);
+}
+
+static void rapl_pmu_event_stop(struct perf_event *event, int mode)
+{
+ struct rapl_pmu *pmu = event->pmu_private;
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&pmu->lock, flags);
+
+ /* mark event as deactivated and stopped */
+ if (!(hwc->state & PERF_HES_STOPPED)) {
+ WARN_ON_ONCE(pmu->n_active <= 0);
+ pmu->n_active--;
+ if (pmu->n_active == 0)
+ hrtimer_cancel(&pmu->hrtimer);
+
+ list_del(&event->active_entry);
+
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+ }
+
+ /* check if update of sw counter is necessary */
+ if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of a event
+ * that we are disabling:
+ */
+ rapl_event_update(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+
+ raw_spin_unlock_irqrestore(&pmu->lock, flags);
+}
+
+static int rapl_pmu_event_add(struct perf_event *event, int mode)
+{
+ struct rapl_pmu *pmu = event->pmu_private;
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&pmu->lock, flags);
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ if (mode & PERF_EF_START)
+ __rapl_pmu_event_start(pmu, event);
+
+ raw_spin_unlock_irqrestore(&pmu->lock, flags);
+
+ return 0;
+}
+
+static void rapl_pmu_event_del(struct perf_event *event, int flags)
+{
+ rapl_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int rapl_pmu_event_init(struct perf_event *event)
+{
+ u64 cfg = event->attr.config & RAPL_EVENT_MASK;
+ int bit, msr, ret = 0;
+ struct rapl_pmu *pmu;
+
+ /* only look at RAPL events */
+ if (event->attr.type != rapl_pmus->pmu.type)
+ return -ENOENT;
+
+ /* check only supported bits are set */
+ if (event->attr.config & ~RAPL_EVENT_MASK)
+ return -EINVAL;
+
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+
+ /*
+ * check event is known (determines counter)
+ */
+ switch (cfg) {
+ case INTEL_RAPL_PP0:
+ bit = RAPL_IDX_PP0_NRG_STAT;
+ msr = MSR_PP0_ENERGY_STATUS;
+ break;
+ case INTEL_RAPL_PKG:
+ bit = RAPL_IDX_PKG_NRG_STAT;
+ msr = MSR_PKG_ENERGY_STATUS;
+ break;
+ case INTEL_RAPL_RAM:
+ bit = RAPL_IDX_RAM_NRG_STAT;
+ msr = MSR_DRAM_ENERGY_STATUS;
+ break;
+ case INTEL_RAPL_PP1:
+ bit = RAPL_IDX_PP1_NRG_STAT;
+ msr = MSR_PP1_ENERGY_STATUS;
+ break;
+ case INTEL_RAPL_PSYS:
+ bit = RAPL_IDX_PSYS_NRG_STAT;
+ msr = MSR_PLATFORM_ENERGY_STATUS;
+ break;
+ default:
+ return -EINVAL;
+ }
+ /* check event supported */
+ if (!(rapl_cntr_mask & (1 << bit)))
+ return -EINVAL;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ /* must be done before validate_group */
+ pmu = cpu_to_rapl_pmu(event->cpu);
+ if (!pmu)
+ return -EINVAL;
+ event->cpu = pmu->cpu;
+ event->pmu_private = pmu;
+ event->hw.event_base = msr;
+ event->hw.config = cfg;
+ event->hw.idx = bit;
+
+ return ret;
+}
+
+static void rapl_pmu_event_read(struct perf_event *event)
+{
+ rapl_event_update(event);
+}
+
+static ssize_t rapl_get_attr_cpumask(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
+
+static struct attribute *rapl_pmu_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group rapl_pmu_attr_group = {
+ .attrs = rapl_pmu_attrs,
+};
+
+RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
+RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
+RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
+RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
+RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");
+
+RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules");
+
+/*
+ * we compute in 0.23 nJ increments regardless of MSR
+ */
+RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10");
+
+static struct attribute *rapl_events_srv_attr[] = {
+ EVENT_PTR(rapl_cores),
+ EVENT_PTR(rapl_pkg),
+ EVENT_PTR(rapl_ram),
+
+ EVENT_PTR(rapl_cores_unit),
+ EVENT_PTR(rapl_pkg_unit),
+ EVENT_PTR(rapl_ram_unit),
+
+ EVENT_PTR(rapl_cores_scale),
+ EVENT_PTR(rapl_pkg_scale),
+ EVENT_PTR(rapl_ram_scale),
+ NULL,
+};
+
+static struct attribute *rapl_events_cln_attr[] = {
+ EVENT_PTR(rapl_cores),
+ EVENT_PTR(rapl_pkg),
+ EVENT_PTR(rapl_gpu),
+
+ EVENT_PTR(rapl_cores_unit),
+ EVENT_PTR(rapl_pkg_unit),
+ EVENT_PTR(rapl_gpu_unit),
+
+ EVENT_PTR(rapl_cores_scale),
+ EVENT_PTR(rapl_pkg_scale),
+ EVENT_PTR(rapl_gpu_scale),
+ NULL,
+};
+
+static struct attribute *rapl_events_hsw_attr[] = {
+ EVENT_PTR(rapl_cores),
+ EVENT_PTR(rapl_pkg),
+ EVENT_PTR(rapl_gpu),
+ EVENT_PTR(rapl_ram),
+
+ EVENT_PTR(rapl_cores_unit),
+ EVENT_PTR(rapl_pkg_unit),
+ EVENT_PTR(rapl_gpu_unit),
+ EVENT_PTR(rapl_ram_unit),
+
+ EVENT_PTR(rapl_cores_scale),
+ EVENT_PTR(rapl_pkg_scale),
+ EVENT_PTR(rapl_gpu_scale),
+ EVENT_PTR(rapl_ram_scale),
+ NULL,
+};
+
+static struct attribute *rapl_events_skl_attr[] = {
+ EVENT_PTR(rapl_cores),
+ EVENT_PTR(rapl_pkg),
+ EVENT_PTR(rapl_gpu),
+ EVENT_PTR(rapl_ram),
+ EVENT_PTR(rapl_psys),
+
+ EVENT_PTR(rapl_cores_unit),
+ EVENT_PTR(rapl_pkg_unit),
+ EVENT_PTR(rapl_gpu_unit),
+ EVENT_PTR(rapl_ram_unit),
+ EVENT_PTR(rapl_psys_unit),
+
+ EVENT_PTR(rapl_cores_scale),
+ EVENT_PTR(rapl_pkg_scale),
+ EVENT_PTR(rapl_gpu_scale),
+ EVENT_PTR(rapl_ram_scale),
+ EVENT_PTR(rapl_psys_scale),
+ NULL,
+};
+
+static struct attribute *rapl_events_knl_attr[] = {
+ EVENT_PTR(rapl_pkg),
+ EVENT_PTR(rapl_ram),
+
+ EVENT_PTR(rapl_pkg_unit),
+ EVENT_PTR(rapl_ram_unit),
+
+ EVENT_PTR(rapl_pkg_scale),
+ EVENT_PTR(rapl_ram_scale),
+ NULL,
+};
+
+static struct attribute_group rapl_pmu_events_group = {
+ .name = "events",
+ .attrs = NULL, /* patched at runtime */
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+static struct attribute *rapl_formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group rapl_pmu_format_group = {
+ .name = "format",
+ .attrs = rapl_formats_attr,
+};
+
+static const struct attribute_group *rapl_attr_groups[] = {
+ &rapl_pmu_attr_group,
+ &rapl_pmu_format_group,
+ &rapl_pmu_events_group,
+ NULL,
+};
+
+static int rapl_cpu_offline(unsigned int cpu)
+{
+ struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
+ int target;
+
+ /* Check if exiting cpu is used for collecting rapl events */
+ if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
+ return 0;
+
+ pmu->cpu = -1;
+ /* Find a new cpu to collect rapl events */
+ target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
+
+ /* Migrate rapl events to the new target */
+ if (target < nr_cpu_ids) {
+ cpumask_set_cpu(target, &rapl_cpu_mask);
+ pmu->cpu = target;
+ perf_pmu_migrate_context(pmu->pmu, cpu, target);
+ }
+ return 0;
+}
+
+static int rapl_cpu_online(unsigned int cpu)
+{
+ struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
+ int target;
+
+ if (!pmu) {
+ pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
+ if (!pmu)
+ return -ENOMEM;
+
+ raw_spin_lock_init(&pmu->lock);
+ INIT_LIST_HEAD(&pmu->active_list);
+ pmu->pmu = &rapl_pmus->pmu;
+ pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
+ rapl_hrtimer_init(pmu);
+
+ rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu;
+ }
+
+ /*
+ * Check if there is an online cpu in the package which collects rapl
+ * events already.
+ */
+ target = cpumask_any_and(&rapl_cpu_mask, topology_core_cpumask(cpu));
+ if (target < nr_cpu_ids)
+ return 0;
+
+ cpumask_set_cpu(cpu, &rapl_cpu_mask);
+ pmu->cpu = cpu;
+ return 0;
+}
+
+static int rapl_check_hw_unit(bool apply_quirk)
+{
+ u64 msr_rapl_power_unit_bits;
+ int i;
+
+ /* protect rdmsrl() to handle virtualization */
+ if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
+ return -1;
+ for (i = 0; i < NR_RAPL_DOMAINS; i++)
+ rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
+
+ /*
+ * DRAM domain on HSW server and KNL has fixed energy unit which can be
+ * different than the unit from power unit MSR. See
+ * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
+ * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
+ */
+ if (apply_quirk)
+ rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
+
+ /*
+ * Calculate the timer rate:
+ * Use reference of 200W for scaling the timeout to avoid counter
+ * overflows. 200W = 200 Joules/sec
+ * Divide interval by 2 to avoid lockstep (2 * 100)
+ * if hw unit is 32, then we use 2 ms 1/200/2
+ */
+ rapl_timer_ms = 2;
+ if (rapl_hw_unit[0] < 32) {
+ rapl_timer_ms = (1000 / (2 * 100));
+ rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
+ }
+ return 0;
+}
+
+static void __init rapl_advertise(void)
+{
+ int i;
+
+ pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
+ hweight32(rapl_cntr_mask), rapl_timer_ms);
+
+ for (i = 0; i < NR_RAPL_DOMAINS; i++) {
+ if (rapl_cntr_mask & (1 << i)) {
+ pr_info("hw unit of domain %s 2^-%d Joules\n",
+ rapl_domain_names[i], rapl_hw_unit[i]);
+ }
+ }
+}
+
+static void cleanup_rapl_pmus(void)
+{
+ int i;
+
+ for (i = 0; i < rapl_pmus->maxpkg; i++)
+ kfree(rapl_pmus->pmus[i]);
+ kfree(rapl_pmus);
+}
+
+static int __init init_rapl_pmus(void)
+{
+ int maxpkg = topology_max_packages();
+ size_t size;
+
+ size = sizeof(*rapl_pmus) + maxpkg * sizeof(struct rapl_pmu *);
+ rapl_pmus = kzalloc(size, GFP_KERNEL);
+ if (!rapl_pmus)
+ return -ENOMEM;
+
+ rapl_pmus->maxpkg = maxpkg;
+ rapl_pmus->pmu.attr_groups = rapl_attr_groups;
+ rapl_pmus->pmu.task_ctx_nr = perf_invalid_context;
+ rapl_pmus->pmu.event_init = rapl_pmu_event_init;
+ rapl_pmus->pmu.add = rapl_pmu_event_add;
+ rapl_pmus->pmu.del = rapl_pmu_event_del;
+ rapl_pmus->pmu.start = rapl_pmu_event_start;
+ rapl_pmus->pmu.stop = rapl_pmu_event_stop;
+ rapl_pmus->pmu.read = rapl_pmu_event_read;
+ rapl_pmus->pmu.module = THIS_MODULE;
+ return 0;
+}
+
+#define X86_RAPL_MODEL_MATCH(model, init) \
+ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init }
+
+struct intel_rapl_init_fun {
+ bool apply_quirk;
+ int cntr_mask;
+ struct attribute **attrs;
+};
+
+static const struct intel_rapl_init_fun snb_rapl_init __initconst = {
+ .apply_quirk = false,
+ .cntr_mask = RAPL_IDX_CLN,
+ .attrs = rapl_events_cln_attr,
+};
+
+static const struct intel_rapl_init_fun hsx_rapl_init __initconst = {
+ .apply_quirk = true,
+ .cntr_mask = RAPL_IDX_SRV,
+ .attrs = rapl_events_srv_attr,
+};
+
+static const struct intel_rapl_init_fun hsw_rapl_init __initconst = {
+ .apply_quirk = false,
+ .cntr_mask = RAPL_IDX_HSW,
+ .attrs = rapl_events_hsw_attr,
+};
+
+static const struct intel_rapl_init_fun snbep_rapl_init __initconst = {
+ .apply_quirk = false,
+ .cntr_mask = RAPL_IDX_SRV,
+ .attrs = rapl_events_srv_attr,
+};
+
+static const struct intel_rapl_init_fun knl_rapl_init __initconst = {
+ .apply_quirk = true,
+ .cntr_mask = RAPL_IDX_KNL,
+ .attrs = rapl_events_knl_attr,
+};
+
+static const struct intel_rapl_init_fun skl_rapl_init __initconst = {
+ .apply_quirk = false,
+ .cntr_mask = RAPL_IDX_SKL_CLN,
+ .attrs = rapl_events_skl_attr,
+};
+
+static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE, snb_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, snbep_rapl_init),
+
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE, snb_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, snbep_rapl_init),
+
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, hsw_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hsx_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT, hsw_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, hsw_rapl_init),
+
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, hsw_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, hsw_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsx_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsx_rapl_init),
+
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, knl_rapl_init),
+
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, skl_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, hsx_rapl_init),
+
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_rapl_init),
+
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_MOBILE, skl_rapl_init),
+
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_X, hsw_rapl_init),
+
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, hsw_rapl_init),
+
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE, skl_rapl_init),
+ {},
+};
+
+MODULE_DEVICE_TABLE(x86cpu, rapl_cpu_match);
+
+static int __init rapl_pmu_init(void)
+{
+ const struct x86_cpu_id *id;
+ struct intel_rapl_init_fun *rapl_init;
+ bool apply_quirk;
+ int ret;
+
+ id = x86_match_cpu(rapl_cpu_match);
+ if (!id)
+ return -ENODEV;
+
+ rapl_init = (struct intel_rapl_init_fun *)id->driver_data;
+ apply_quirk = rapl_init->apply_quirk;
+ rapl_cntr_mask = rapl_init->cntr_mask;
+ rapl_pmu_events_group.attrs = rapl_init->attrs;
+
+ ret = rapl_check_hw_unit(apply_quirk);
+ if (ret)
+ return ret;
+
+ ret = init_rapl_pmus();
+ if (ret)
+ return ret;
+
+ /*
+ * Install callbacks. Core will call them for each online cpu.
+ */
+ ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
+ "perf/x86/rapl:online",
+ rapl_cpu_online, rapl_cpu_offline);
+ if (ret)
+ goto out;
+
+ ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
+ if (ret)
+ goto out1;
+
+ rapl_advertise();
+ return 0;
+
+out1:
+ cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
+out:
+ pr_warn("Initialization failed (%d), disabled\n", ret);
+ cleanup_rapl_pmus();
+ return ret;
+}
+module_init(rapl_pmu_init);
+
+static void __exit intel_rapl_exit(void)
+{
+ cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
+ perf_pmu_unregister(&rapl_pmus->pmu);
+ cleanup_rapl_pmus();
+}
+module_exit(intel_rapl_exit);
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
new file mode 100644
index 000000000..2f4ed5aa0
--- /dev/null
+++ b/arch/x86/events/intel/uncore.c
@@ -0,0 +1,1492 @@
+#include <linux/module.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include "uncore.h"
+
+static struct intel_uncore_type *empty_uncore[] = { NULL, };
+struct intel_uncore_type **uncore_msr_uncores = empty_uncore;
+struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
+
+static bool pcidrv_registered;
+struct pci_driver *uncore_pci_driver;
+/* pci bus to socket mapping */
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
+struct pci_extra_dev *uncore_extra_pci_dev;
+static int max_packages;
+
+/* mask of cpus that collect uncore events */
+static cpumask_t uncore_cpu_mask;
+
+/* constraint for the fixed counter */
+static struct event_constraint uncore_constraint_fixed =
+ EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL);
+struct event_constraint uncore_constraint_empty =
+ EVENT_CONSTRAINT(0, 0, 0);
+
+MODULE_LICENSE("GPL");
+
+static int uncore_pcibus_to_physid(struct pci_bus *bus)
+{
+ struct pci2phy_map *map;
+ int phys_id = -1;
+
+ raw_spin_lock(&pci2phy_map_lock);
+ list_for_each_entry(map, &pci2phy_map_head, list) {
+ if (map->segment == pci_domain_nr(bus)) {
+ phys_id = map->pbus_to_physid[bus->number];
+ break;
+ }
+ }
+ raw_spin_unlock(&pci2phy_map_lock);
+
+ return phys_id;
+}
+
+static void uncore_free_pcibus_map(void)
+{
+ struct pci2phy_map *map, *tmp;
+
+ list_for_each_entry_safe(map, tmp, &pci2phy_map_head, list) {
+ list_del(&map->list);
+ kfree(map);
+ }
+}
+
+struct pci2phy_map *__find_pci2phy_map(int segment)
+{
+ struct pci2phy_map *map, *alloc = NULL;
+ int i;
+
+ lockdep_assert_held(&pci2phy_map_lock);
+
+lookup:
+ list_for_each_entry(map, &pci2phy_map_head, list) {
+ if (map->segment == segment)
+ goto end;
+ }
+
+ if (!alloc) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL);
+ raw_spin_lock(&pci2phy_map_lock);
+
+ if (!alloc)
+ return NULL;
+
+ goto lookup;
+ }
+
+ map = alloc;
+ alloc = NULL;
+ map->segment = segment;
+ for (i = 0; i < 256; i++)
+ map->pbus_to_physid[i] = -1;
+ list_add_tail(&map->list, &pci2phy_map_head);
+
+end:
+ kfree(alloc);
+ return map;
+}
+
+ssize_t uncore_event_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct uncore_event_desc *event =
+ container_of(attr, struct uncore_event_desc, attr);
+ return sprintf(buf, "%s", event->config);
+}
+
+struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
+{
+ unsigned int pkgid = topology_logical_package_id(cpu);
+
+ /*
+ * The unsigned check also catches the '-1' return value for non
+ * existent mappings in the topology map.
+ */
+ return pkgid < max_packages ? pmu->boxes[pkgid] : NULL;
+}
+
+u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ u64 count;
+
+ rdmsrl(event->hw.event_base, count);
+
+ return count;
+}
+
+/*
+ * generic get constraint function for shared match/mask registers.
+ */
+struct event_constraint *
+uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct intel_uncore_extra_reg *er;
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+ unsigned long flags;
+ bool ok = false;
+
+ /*
+ * reg->alloc can be set due to existing state, so for fake box we
+ * need to ignore this, otherwise we might fail to allocate proper
+ * fake state for this extra reg constraint.
+ */
+ if (reg1->idx == EXTRA_REG_NONE ||
+ (!uncore_box_is_fake(box) && reg1->alloc))
+ return NULL;
+
+ er = &box->shared_regs[reg1->idx];
+ raw_spin_lock_irqsave(&er->lock, flags);
+ if (!atomic_read(&er->ref) ||
+ (er->config1 == reg1->config && er->config2 == reg2->config)) {
+ atomic_inc(&er->ref);
+ er->config1 = reg1->config;
+ er->config2 = reg2->config;
+ ok = true;
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ if (ok) {
+ if (!uncore_box_is_fake(box))
+ reg1->alloc = 1;
+ return NULL;
+ }
+
+ return &uncore_constraint_empty;
+}
+
+void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct intel_uncore_extra_reg *er;
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+
+ /*
+ * Only put constraint if extra reg was actually allocated. Also
+ * takes care of event which do not use an extra shared reg.
+ *
+ * Also, if this is a fake box we shouldn't touch any event state
+ * (reg->alloc) and we don't care about leaving inconsistent box
+ * state either since it will be thrown out.
+ */
+ if (uncore_box_is_fake(box) || !reg1->alloc)
+ return;
+
+ er = &box->shared_regs[reg1->idx];
+ atomic_dec(&er->ref);
+ reg1->alloc = 0;
+}
+
+u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx)
+{
+ struct intel_uncore_extra_reg *er;
+ unsigned long flags;
+ u64 config;
+
+ er = &box->shared_regs[idx];
+
+ raw_spin_lock_irqsave(&er->lock, flags);
+ config = er->config;
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ return config;
+}
+
+static void uncore_assign_hw_event(struct intel_uncore_box *box,
+ struct perf_event *event, int idx)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ hwc->idx = idx;
+ hwc->last_tag = ++box->tags[idx];
+
+ if (uncore_pmc_fixed(hwc->idx)) {
+ hwc->event_base = uncore_fixed_ctr(box);
+ hwc->config_base = uncore_fixed_ctl(box);
+ return;
+ }
+
+ hwc->config_base = uncore_event_ctl(box, hwc->idx);
+ hwc->event_base = uncore_perf_ctr(box, hwc->idx);
+}
+
+void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
+{
+ u64 prev_count, new_count, delta;
+ int shift;
+
+ if (uncore_pmc_freerunning(event->hw.idx))
+ shift = 64 - uncore_freerunning_bits(box, event);
+ else if (uncore_pmc_fixed(event->hw.idx))
+ shift = 64 - uncore_fixed_ctr_bits(box);
+ else
+ shift = 64 - uncore_perf_ctr_bits(box);
+
+ /* the hrtimer might modify the previous event value */
+again:
+ prev_count = local64_read(&event->hw.prev_count);
+ new_count = uncore_read_counter(box, event);
+ if (local64_xchg(&event->hw.prev_count, new_count) != prev_count)
+ goto again;
+
+ delta = (new_count << shift) - (prev_count << shift);
+ delta >>= shift;
+
+ local64_add(delta, &event->count);
+}
+
+/*
+ * The overflow interrupt is unavailable for SandyBridge-EP, is broken
+ * for SandyBridge. So we use hrtimer to periodically poll the counter
+ * to avoid overflow.
+ */
+static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
+{
+ struct intel_uncore_box *box;
+ struct perf_event *event;
+ unsigned long flags;
+ int bit;
+
+ box = container_of(hrtimer, struct intel_uncore_box, hrtimer);
+ if (!box->n_active || box->cpu != smp_processor_id())
+ return HRTIMER_NORESTART;
+ /*
+ * disable local interrupt to prevent uncore_pmu_event_start/stop
+ * to interrupt the update process
+ */
+ local_irq_save(flags);
+
+ /*
+ * handle boxes with an active event list as opposed to active
+ * counters
+ */
+ list_for_each_entry(event, &box->active_list, active_entry) {
+ uncore_perf_event_update(box, event);
+ }
+
+ for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
+ uncore_perf_event_update(box, box->events[bit]);
+
+ local_irq_restore(flags);
+
+ hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration));
+ return HRTIMER_RESTART;
+}
+
+void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
+{
+ hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration),
+ HRTIMER_MODE_REL_PINNED);
+}
+
+void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
+{
+ hrtimer_cancel(&box->hrtimer);
+}
+
+static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box)
+{
+ hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ box->hrtimer.function = uncore_pmu_hrtimer;
+}
+
+static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
+ int node)
+{
+ int i, size, numshared = type->num_shared_regs ;
+ struct intel_uncore_box *box;
+
+ size = sizeof(*box) + numshared * sizeof(struct intel_uncore_extra_reg);
+
+ box = kzalloc_node(size, GFP_KERNEL, node);
+ if (!box)
+ return NULL;
+
+ for (i = 0; i < numshared; i++)
+ raw_spin_lock_init(&box->shared_regs[i].lock);
+
+ uncore_pmu_init_hrtimer(box);
+ box->cpu = -1;
+ box->pci_phys_id = -1;
+ box->pkgid = -1;
+
+ /* set default hrtimer timeout */
+ box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL;
+
+ INIT_LIST_HEAD(&box->active_list);
+
+ return box;
+}
+
+/*
+ * Using uncore_pmu_event_init pmu event_init callback
+ * as a detection point for uncore events.
+ */
+static int uncore_pmu_event_init(struct perf_event *event);
+
+static bool is_box_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return &box->pmu->pmu == event->pmu;
+}
+
+static int
+uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader,
+ bool dogrp)
+{
+ struct perf_event *event;
+ int n, max_count;
+
+ max_count = box->pmu->type->num_counters;
+ if (box->pmu->type->fixed_ctl)
+ max_count++;
+
+ if (box->n_events >= max_count)
+ return -EINVAL;
+
+ n = box->n_events;
+
+ if (is_box_event(box, leader)) {
+ box->event_list[n] = leader;
+ n++;
+ }
+
+ if (!dogrp)
+ return n;
+
+ for_each_sibling_event(event, leader) {
+ if (!is_box_event(box, event) ||
+ event->state <= PERF_EVENT_STATE_OFF)
+ continue;
+
+ if (n >= max_count)
+ return -EINVAL;
+
+ box->event_list[n] = event;
+ n++;
+ }
+ return n;
+}
+
+static struct event_constraint *
+uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct intel_uncore_type *type = box->pmu->type;
+ struct event_constraint *c;
+
+ if (type->ops->get_constraint) {
+ c = type->ops->get_constraint(box, event);
+ if (c)
+ return c;
+ }
+
+ if (event->attr.config == UNCORE_FIXED_EVENT)
+ return &uncore_constraint_fixed;
+
+ if (type->constraints) {
+ for_each_event_constraint(c, type->constraints) {
+ if ((event->hw.config & c->cmask) == c->code)
+ return c;
+ }
+ }
+
+ return &type->unconstrainted;
+}
+
+static void uncore_put_event_constraint(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ if (box->pmu->type->ops->put_constraint)
+ box->pmu->type->ops->put_constraint(box, event);
+}
+
+static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n)
+{
+ unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
+ struct event_constraint *c;
+ int i, wmin, wmax, ret = 0;
+ struct hw_perf_event *hwc;
+
+ bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
+
+ for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
+ c = uncore_get_event_constraint(box, box->event_list[i]);
+ box->event_constraint[i] = c;
+ wmin = min(wmin, c->weight);
+ wmax = max(wmax, c->weight);
+ }
+
+ /* fastpath, try to reuse previous register */
+ for (i = 0; i < n; i++) {
+ hwc = &box->event_list[i]->hw;
+ c = box->event_constraint[i];
+
+ /* never assigned */
+ if (hwc->idx == -1)
+ break;
+
+ /* constraint still honored */
+ if (!test_bit(hwc->idx, c->idxmsk))
+ break;
+
+ /* not already used */
+ if (test_bit(hwc->idx, used_mask))
+ break;
+
+ __set_bit(hwc->idx, used_mask);
+ if (assign)
+ assign[i] = hwc->idx;
+ }
+ /* slow path */
+ if (i != n)
+ ret = perf_assign_events(box->event_constraint, n,
+ wmin, wmax, n, assign);
+
+ if (!assign || ret) {
+ for (i = 0; i < n; i++)
+ uncore_put_event_constraint(box, box->event_list[i]);
+ }
+ return ret ? -EINVAL : 0;
+}
+
+void uncore_pmu_event_start(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ int idx = event->hw.idx;
+
+ if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
+ return;
+
+ /*
+ * Free running counter is read-only and always active.
+ * Use the current counter value as start point.
+ * There is no overflow interrupt for free running counter.
+ * Use hrtimer to periodically poll the counter to avoid overflow.
+ */
+ if (uncore_pmc_freerunning(event->hw.idx)) {
+ list_add_tail(&event->active_entry, &box->active_list);
+ local64_set(&event->hw.prev_count,
+ uncore_read_counter(box, event));
+ if (box->n_active++ == 0)
+ uncore_pmu_start_hrtimer(box);
+ return;
+ }
+
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ event->hw.state = 0;
+ box->events[idx] = event;
+ box->n_active++;
+ __set_bit(idx, box->active_mask);
+
+ local64_set(&event->hw.prev_count, uncore_read_counter(box, event));
+ uncore_enable_event(box, event);
+
+ if (box->n_active == 1)
+ uncore_pmu_start_hrtimer(box);
+}
+
+void uncore_pmu_event_stop(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ struct hw_perf_event *hwc = &event->hw;
+
+ /* Cannot disable free running counter which is read-only */
+ if (uncore_pmc_freerunning(hwc->idx)) {
+ list_del(&event->active_entry);
+ if (--box->n_active == 0)
+ uncore_pmu_cancel_hrtimer(box);
+ uncore_perf_event_update(box, event);
+ return;
+ }
+
+ if (__test_and_clear_bit(hwc->idx, box->active_mask)) {
+ uncore_disable_event(box, event);
+ box->n_active--;
+ box->events[hwc->idx] = NULL;
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+
+ if (box->n_active == 0)
+ uncore_pmu_cancel_hrtimer(box);
+ }
+
+ if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of a event
+ * that we are disabling:
+ */
+ uncore_perf_event_update(box, event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+}
+
+int uncore_pmu_event_add(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ struct hw_perf_event *hwc = &event->hw;
+ int assign[UNCORE_PMC_IDX_MAX];
+ int i, n, ret;
+
+ if (!box)
+ return -ENODEV;
+
+ /*
+ * The free funning counter is assigned in event_init().
+ * The free running counter event and free running counter
+ * are 1:1 mapped. It doesn't need to be tracked in event_list.
+ */
+ if (uncore_pmc_freerunning(hwc->idx)) {
+ if (flags & PERF_EF_START)
+ uncore_pmu_event_start(event, 0);
+ return 0;
+ }
+
+ ret = n = uncore_collect_events(box, event, false);
+ if (ret < 0)
+ return ret;
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+ if (!(flags & PERF_EF_START))
+ hwc->state |= PERF_HES_ARCH;
+
+ ret = uncore_assign_events(box, assign, n);
+ if (ret)
+ return ret;
+
+ /* save events moving to new counters */
+ for (i = 0; i < box->n_events; i++) {
+ event = box->event_list[i];
+ hwc = &event->hw;
+
+ if (hwc->idx == assign[i] &&
+ hwc->last_tag == box->tags[assign[i]])
+ continue;
+ /*
+ * Ensure we don't accidentally enable a stopped
+ * counter simply because we rescheduled.
+ */
+ if (hwc->state & PERF_HES_STOPPED)
+ hwc->state |= PERF_HES_ARCH;
+
+ uncore_pmu_event_stop(event, PERF_EF_UPDATE);
+ }
+
+ /* reprogram moved events into new counters */
+ for (i = 0; i < n; i++) {
+ event = box->event_list[i];
+ hwc = &event->hw;
+
+ if (hwc->idx != assign[i] ||
+ hwc->last_tag != box->tags[assign[i]])
+ uncore_assign_hw_event(box, event, assign[i]);
+ else if (i < box->n_events)
+ continue;
+
+ if (hwc->state & PERF_HES_ARCH)
+ continue;
+
+ uncore_pmu_event_start(event, 0);
+ }
+ box->n_events = n;
+
+ return 0;
+}
+
+void uncore_pmu_event_del(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ int i;
+
+ uncore_pmu_event_stop(event, PERF_EF_UPDATE);
+
+ /*
+ * The event for free running counter is not tracked by event_list.
+ * It doesn't need to force event->hw.idx = -1 to reassign the counter.
+ * Because the event and the free running counter are 1:1 mapped.
+ */
+ if (uncore_pmc_freerunning(event->hw.idx))
+ return;
+
+ for (i = 0; i < box->n_events; i++) {
+ if (event == box->event_list[i]) {
+ uncore_put_event_constraint(box, event);
+
+ for (++i; i < box->n_events; i++)
+ box->event_list[i - 1] = box->event_list[i];
+
+ --box->n_events;
+ break;
+ }
+ }
+
+ event->hw.idx = -1;
+ event->hw.last_tag = ~0ULL;
+}
+
+void uncore_pmu_event_read(struct perf_event *event)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ uncore_perf_event_update(box, event);
+}
+
+/*
+ * validation ensures the group can be loaded onto the
+ * PMU if it was the only group available.
+ */
+static int uncore_validate_group(struct intel_uncore_pmu *pmu,
+ struct perf_event *event)
+{
+ struct perf_event *leader = event->group_leader;
+ struct intel_uncore_box *fake_box;
+ int ret = -EINVAL, n;
+
+ /* The free running counter is always active. */
+ if (uncore_pmc_freerunning(event->hw.idx))
+ return 0;
+
+ fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE);
+ if (!fake_box)
+ return -ENOMEM;
+
+ fake_box->pmu = pmu;
+ /*
+ * the event is not yet connected with its
+ * siblings therefore we must first collect
+ * existing siblings, then add the new event
+ * before we can simulate the scheduling
+ */
+ n = uncore_collect_events(fake_box, leader, true);
+ if (n < 0)
+ goto out;
+
+ fake_box->n_events = n;
+ n = uncore_collect_events(fake_box, event, false);
+ if (n < 0)
+ goto out;
+
+ fake_box->n_events = n;
+
+ ret = uncore_assign_events(fake_box, NULL, n);
+out:
+ kfree(fake_box);
+ return ret;
+}
+
+static int uncore_pmu_event_init(struct perf_event *event)
+{
+ struct intel_uncore_pmu *pmu;
+ struct intel_uncore_box *box;
+ struct hw_perf_event *hwc = &event->hw;
+ int ret;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ pmu = uncore_event_to_pmu(event);
+ /* no device found for this pmu */
+ if (pmu->func_id < 0)
+ return -ENOENT;
+
+ /*
+ * Uncore PMU does measure at all privilege level all the time.
+ * So it doesn't make sense to specify any exclude bits.
+ */
+ if (event->attr.exclude_user || event->attr.exclude_kernel ||
+ event->attr.exclude_hv || event->attr.exclude_idle)
+ return -EINVAL;
+
+ /* Sampling not supported yet */
+ if (hwc->sample_period)
+ return -EINVAL;
+
+ /*
+ * Place all uncore events for a particular physical package
+ * onto a single cpu
+ */
+ if (event->cpu < 0)
+ return -EINVAL;
+ box = uncore_pmu_to_box(pmu, event->cpu);
+ if (!box || box->cpu < 0)
+ return -EINVAL;
+ event->cpu = box->cpu;
+ event->pmu_private = box;
+
+ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+
+ event->hw.idx = -1;
+ event->hw.last_tag = ~0ULL;
+ event->hw.extra_reg.idx = EXTRA_REG_NONE;
+ event->hw.branch_reg.idx = EXTRA_REG_NONE;
+
+ if (event->attr.config == UNCORE_FIXED_EVENT) {
+ /* no fixed counter */
+ if (!pmu->type->fixed_ctl)
+ return -EINVAL;
+ /*
+ * if there is only one fixed counter, only the first pmu
+ * can access the fixed counter
+ */
+ if (pmu->type->single_fixed && pmu->pmu_idx > 0)
+ return -EINVAL;
+
+ /* fixed counters have event field hardcoded to zero */
+ hwc->config = 0ULL;
+ } else if (is_freerunning_event(event)) {
+ hwc->config = event->attr.config;
+ if (!check_valid_freerunning_event(box, event))
+ return -EINVAL;
+ event->hw.idx = UNCORE_PMC_IDX_FREERUNNING;
+ /*
+ * The free running counter event and free running counter
+ * are always 1:1 mapped.
+ * The free running counter is always active.
+ * Assign the free running counter here.
+ */
+ event->hw.event_base = uncore_freerunning_counter(box, event);
+ } else {
+ hwc->config = event->attr.config &
+ (pmu->type->event_mask | ((u64)pmu->type->event_mask_ext << 32));
+ if (pmu->type->ops->hw_config) {
+ ret = pmu->type->ops->hw_config(box, event);
+ if (ret)
+ return ret;
+ }
+ }
+
+ if (event->group_leader != event)
+ ret = uncore_validate_group(pmu, event);
+ else
+ ret = 0;
+
+ return ret;
+}
+
+static void uncore_pmu_enable(struct pmu *pmu)
+{
+ struct intel_uncore_pmu *uncore_pmu;
+ struct intel_uncore_box *box;
+
+ uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu);
+ if (!uncore_pmu)
+ return;
+
+ box = uncore_pmu_to_box(uncore_pmu, smp_processor_id());
+ if (!box)
+ return;
+
+ if (uncore_pmu->type->ops->enable_box)
+ uncore_pmu->type->ops->enable_box(box);
+}
+
+static void uncore_pmu_disable(struct pmu *pmu)
+{
+ struct intel_uncore_pmu *uncore_pmu;
+ struct intel_uncore_box *box;
+
+ uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu);
+ if (!uncore_pmu)
+ return;
+
+ box = uncore_pmu_to_box(uncore_pmu, smp_processor_id());
+ if (!box)
+ return;
+
+ if (uncore_pmu->type->ops->disable_box)
+ uncore_pmu->type->ops->disable_box(box);
+}
+
+static ssize_t uncore_get_attr_cpumask(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask);
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL);
+
+static struct attribute *uncore_pmu_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static const struct attribute_group uncore_pmu_attr_group = {
+ .attrs = uncore_pmu_attrs,
+};
+
+static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
+{
+ int ret;
+
+ if (!pmu->type->pmu) {
+ pmu->pmu = (struct pmu) {
+ .attr_groups = pmu->type->attr_groups,
+ .task_ctx_nr = perf_invalid_context,
+ .pmu_enable = uncore_pmu_enable,
+ .pmu_disable = uncore_pmu_disable,
+ .event_init = uncore_pmu_event_init,
+ .add = uncore_pmu_event_add,
+ .del = uncore_pmu_event_del,
+ .start = uncore_pmu_event_start,
+ .stop = uncore_pmu_event_stop,
+ .read = uncore_pmu_event_read,
+ .module = THIS_MODULE,
+ };
+ } else {
+ pmu->pmu = *pmu->type->pmu;
+ pmu->pmu.attr_groups = pmu->type->attr_groups;
+ }
+
+ if (pmu->type->num_boxes == 1) {
+ if (strlen(pmu->type->name) > 0)
+ sprintf(pmu->name, "uncore_%s", pmu->type->name);
+ else
+ sprintf(pmu->name, "uncore");
+ } else {
+ sprintf(pmu->name, "uncore_%s_%d", pmu->type->name,
+ pmu->pmu_idx);
+ }
+
+ ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
+ if (!ret)
+ pmu->registered = true;
+ return ret;
+}
+
+static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu)
+{
+ if (!pmu->registered)
+ return;
+ perf_pmu_unregister(&pmu->pmu);
+ pmu->registered = false;
+}
+
+static void uncore_free_boxes(struct intel_uncore_pmu *pmu)
+{
+ int pkg;
+
+ for (pkg = 0; pkg < max_packages; pkg++)
+ kfree(pmu->boxes[pkg]);
+ kfree(pmu->boxes);
+}
+
+static void uncore_type_exit(struct intel_uncore_type *type)
+{
+ struct intel_uncore_pmu *pmu = type->pmus;
+ int i;
+
+ if (pmu) {
+ for (i = 0; i < type->num_boxes; i++, pmu++) {
+ uncore_pmu_unregister(pmu);
+ uncore_free_boxes(pmu);
+ }
+ kfree(type->pmus);
+ type->pmus = NULL;
+ }
+ kfree(type->events_group);
+ type->events_group = NULL;
+}
+
+static void uncore_types_exit(struct intel_uncore_type **types)
+{
+ for (; *types; types++)
+ uncore_type_exit(*types);
+}
+
+static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
+{
+ struct intel_uncore_pmu *pmus;
+ size_t size;
+ int i, j;
+
+ pmus = kcalloc(type->num_boxes, sizeof(*pmus), GFP_KERNEL);
+ if (!pmus)
+ return -ENOMEM;
+
+ size = max_packages * sizeof(struct intel_uncore_box *);
+
+ for (i = 0; i < type->num_boxes; i++) {
+ pmus[i].func_id = setid ? i : -1;
+ pmus[i].pmu_idx = i;
+ pmus[i].type = type;
+ pmus[i].boxes = kzalloc(size, GFP_KERNEL);
+ if (!pmus[i].boxes)
+ goto err;
+ }
+
+ type->pmus = pmus;
+ type->unconstrainted = (struct event_constraint)
+ __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
+ 0, type->num_counters, 0, 0);
+
+ if (type->event_descs) {
+ struct {
+ struct attribute_group group;
+ struct attribute *attrs[];
+ } *attr_group;
+ for (i = 0; type->event_descs[i].attr.attr.name; i++);
+
+ attr_group = kzalloc(struct_size(attr_group, attrs, i + 1),
+ GFP_KERNEL);
+ if (!attr_group)
+ goto err;
+
+ attr_group->group.name = "events";
+ attr_group->group.attrs = attr_group->attrs;
+
+ for (j = 0; j < i; j++)
+ attr_group->attrs[j] = &type->event_descs[j].attr.attr;
+
+ type->events_group = &attr_group->group;
+ }
+
+ type->pmu_group = &uncore_pmu_attr_group;
+
+ return 0;
+
+err:
+ for (i = 0; i < type->num_boxes; i++)
+ kfree(pmus[i].boxes);
+ kfree(pmus);
+
+ return -ENOMEM;
+}
+
+static int __init
+uncore_types_init(struct intel_uncore_type **types, bool setid)
+{
+ int ret;
+
+ for (; *types; types++) {
+ ret = uncore_type_init(*types, setid);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/*
+ * add a pci uncore device
+ */
+static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ struct intel_uncore_type *type;
+ struct intel_uncore_pmu *pmu = NULL;
+ struct intel_uncore_box *box;
+ int phys_id, pkg, ret;
+
+ phys_id = uncore_pcibus_to_physid(pdev->bus);
+ if (phys_id < 0)
+ return -ENODEV;
+
+ pkg = topology_phys_to_logical_pkg(phys_id);
+ if (pkg < 0)
+ return -EINVAL;
+
+ if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) {
+ int idx = UNCORE_PCI_DEV_IDX(id->driver_data);
+
+ uncore_extra_pci_dev[pkg].dev[idx] = pdev;
+ pci_set_drvdata(pdev, NULL);
+ return 0;
+ }
+
+ type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
+
+ /*
+ * Some platforms, e.g. Knights Landing, use a common PCI device ID
+ * for multiple instances of an uncore PMU device type. We should check
+ * PCI slot and func to indicate the uncore box.
+ */
+ if (id->driver_data & ~0xffff) {
+ struct pci_driver *pci_drv = pdev->driver;
+ const struct pci_device_id *ids = pci_drv->id_table;
+ unsigned int devfn;
+
+ while (ids && ids->vendor) {
+ if ((ids->vendor == pdev->vendor) &&
+ (ids->device == pdev->device)) {
+ devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(ids->driver_data),
+ UNCORE_PCI_DEV_FUNC(ids->driver_data));
+ if (devfn == pdev->devfn) {
+ pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)];
+ break;
+ }
+ }
+ ids++;
+ }
+ if (pmu == NULL)
+ return -ENODEV;
+ } else {
+ /*
+ * for performance monitoring unit with multiple boxes,
+ * each box has a different function id.
+ */
+ pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
+ }
+
+ if (WARN_ON_ONCE(pmu->boxes[pkg] != NULL))
+ return -EINVAL;
+
+ box = uncore_alloc_box(type, NUMA_NO_NODE);
+ if (!box)
+ return -ENOMEM;
+
+ if (pmu->func_id < 0)
+ pmu->func_id = pdev->devfn;
+ else
+ WARN_ON_ONCE(pmu->func_id != pdev->devfn);
+
+ atomic_inc(&box->refcnt);
+ box->pci_phys_id = phys_id;
+ box->pkgid = pkg;
+ box->pci_dev = pdev;
+ box->pmu = pmu;
+ uncore_box_init(box);
+ pci_set_drvdata(pdev, box);
+
+ pmu->boxes[pkg] = box;
+ if (atomic_inc_return(&pmu->activeboxes) > 1)
+ return 0;
+
+ /* First active box registers the pmu */
+ ret = uncore_pmu_register(pmu);
+ if (ret) {
+ pci_set_drvdata(pdev, NULL);
+ pmu->boxes[pkg] = NULL;
+ uncore_box_exit(box);
+ kfree(box);
+ }
+ return ret;
+}
+
+static void uncore_pci_remove(struct pci_dev *pdev)
+{
+ struct intel_uncore_box *box;
+ struct intel_uncore_pmu *pmu;
+ int i, phys_id, pkg;
+
+ phys_id = uncore_pcibus_to_physid(pdev->bus);
+
+ box = pci_get_drvdata(pdev);
+ if (!box) {
+ pkg = topology_phys_to_logical_pkg(phys_id);
+ for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
+ if (uncore_extra_pci_dev[pkg].dev[i] == pdev) {
+ uncore_extra_pci_dev[pkg].dev[i] = NULL;
+ break;
+ }
+ }
+ WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX);
+ return;
+ }
+
+ pmu = box->pmu;
+ if (WARN_ON_ONCE(phys_id != box->pci_phys_id))
+ return;
+
+ pci_set_drvdata(pdev, NULL);
+ pmu->boxes[box->pkgid] = NULL;
+ if (atomic_dec_return(&pmu->activeboxes) == 0)
+ uncore_pmu_unregister(pmu);
+ uncore_box_exit(box);
+ kfree(box);
+}
+
+static int __init uncore_pci_init(void)
+{
+ size_t size;
+ int ret;
+
+ size = max_packages * sizeof(struct pci_extra_dev);
+ uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL);
+ if (!uncore_extra_pci_dev) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = uncore_types_init(uncore_pci_uncores, false);
+ if (ret)
+ goto errtype;
+
+ uncore_pci_driver->probe = uncore_pci_probe;
+ uncore_pci_driver->remove = uncore_pci_remove;
+
+ ret = pci_register_driver(uncore_pci_driver);
+ if (ret)
+ goto errtype;
+
+ pcidrv_registered = true;
+ return 0;
+
+errtype:
+ uncore_types_exit(uncore_pci_uncores);
+ kfree(uncore_extra_pci_dev);
+ uncore_extra_pci_dev = NULL;
+ uncore_free_pcibus_map();
+err:
+ uncore_pci_uncores = empty_uncore;
+ return ret;
+}
+
+static void uncore_pci_exit(void)
+{
+ if (pcidrv_registered) {
+ pcidrv_registered = false;
+ pci_unregister_driver(uncore_pci_driver);
+ uncore_types_exit(uncore_pci_uncores);
+ kfree(uncore_extra_pci_dev);
+ uncore_free_pcibus_map();
+ }
+}
+
+static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu,
+ int new_cpu)
+{
+ struct intel_uncore_pmu *pmu = type->pmus;
+ struct intel_uncore_box *box;
+ int i, pkg;
+
+ pkg = topology_logical_package_id(old_cpu < 0 ? new_cpu : old_cpu);
+ for (i = 0; i < type->num_boxes; i++, pmu++) {
+ box = pmu->boxes[pkg];
+ if (!box)
+ continue;
+
+ if (old_cpu < 0) {
+ WARN_ON_ONCE(box->cpu != -1);
+ box->cpu = new_cpu;
+ continue;
+ }
+
+ WARN_ON_ONCE(box->cpu != old_cpu);
+ box->cpu = -1;
+ if (new_cpu < 0)
+ continue;
+
+ uncore_pmu_cancel_hrtimer(box);
+ perf_pmu_migrate_context(&pmu->pmu, old_cpu, new_cpu);
+ box->cpu = new_cpu;
+ }
+}
+
+static void uncore_change_context(struct intel_uncore_type **uncores,
+ int old_cpu, int new_cpu)
+{
+ for (; *uncores; uncores++)
+ uncore_change_type_ctx(*uncores, old_cpu, new_cpu);
+}
+
+static int uncore_event_cpu_offline(unsigned int cpu)
+{
+ struct intel_uncore_type *type, **types = uncore_msr_uncores;
+ struct intel_uncore_pmu *pmu;
+ struct intel_uncore_box *box;
+ int i, pkg, target;
+
+ /* Check if exiting cpu is used for collecting uncore events */
+ if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
+ goto unref;
+ /* Find a new cpu to collect uncore events */
+ target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
+
+ /* Migrate uncore events to the new target */
+ if (target < nr_cpu_ids)
+ cpumask_set_cpu(target, &uncore_cpu_mask);
+ else
+ target = -1;
+
+ uncore_change_context(uncore_msr_uncores, cpu, target);
+ uncore_change_context(uncore_pci_uncores, cpu, target);
+
+unref:
+ /* Clear the references */
+ pkg = topology_logical_package_id(cpu);
+ for (; *types; types++) {
+ type = *types;
+ pmu = type->pmus;
+ for (i = 0; i < type->num_boxes; i++, pmu++) {
+ box = pmu->boxes[pkg];
+ if (box && atomic_dec_return(&box->refcnt) == 0)
+ uncore_box_exit(box);
+ }
+ }
+ return 0;
+}
+
+static int allocate_boxes(struct intel_uncore_type **types,
+ unsigned int pkg, unsigned int cpu)
+{
+ struct intel_uncore_box *box, *tmp;
+ struct intel_uncore_type *type;
+ struct intel_uncore_pmu *pmu;
+ LIST_HEAD(allocated);
+ int i;
+
+ /* Try to allocate all required boxes */
+ for (; *types; types++) {
+ type = *types;
+ pmu = type->pmus;
+ for (i = 0; i < type->num_boxes; i++, pmu++) {
+ if (pmu->boxes[pkg])
+ continue;
+ box = uncore_alloc_box(type, cpu_to_node(cpu));
+ if (!box)
+ goto cleanup;
+ box->pmu = pmu;
+ box->pkgid = pkg;
+ list_add(&box->active_list, &allocated);
+ }
+ }
+ /* Install them in the pmus */
+ list_for_each_entry_safe(box, tmp, &allocated, active_list) {
+ list_del_init(&box->active_list);
+ box->pmu->boxes[pkg] = box;
+ }
+ return 0;
+
+cleanup:
+ list_for_each_entry_safe(box, tmp, &allocated, active_list) {
+ list_del_init(&box->active_list);
+ kfree(box);
+ }
+ return -ENOMEM;
+}
+
+static int uncore_event_cpu_online(unsigned int cpu)
+{
+ struct intel_uncore_type *type, **types = uncore_msr_uncores;
+ struct intel_uncore_pmu *pmu;
+ struct intel_uncore_box *box;
+ int i, ret, pkg, target;
+
+ pkg = topology_logical_package_id(cpu);
+ ret = allocate_boxes(types, pkg, cpu);
+ if (ret)
+ return ret;
+
+ for (; *types; types++) {
+ type = *types;
+ pmu = type->pmus;
+ for (i = 0; i < type->num_boxes; i++, pmu++) {
+ box = pmu->boxes[pkg];
+ if (box && atomic_inc_return(&box->refcnt) == 1)
+ uncore_box_init(box);
+ }
+ }
+
+ /*
+ * Check if there is an online cpu in the package
+ * which collects uncore events already.
+ */
+ target = cpumask_any_and(&uncore_cpu_mask, topology_core_cpumask(cpu));
+ if (target < nr_cpu_ids)
+ return 0;
+
+ cpumask_set_cpu(cpu, &uncore_cpu_mask);
+
+ uncore_change_context(uncore_msr_uncores, -1, cpu);
+ uncore_change_context(uncore_pci_uncores, -1, cpu);
+ return 0;
+}
+
+static int __init type_pmu_register(struct intel_uncore_type *type)
+{
+ int i, ret;
+
+ for (i = 0; i < type->num_boxes; i++) {
+ ret = uncore_pmu_register(&type->pmus[i]);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+static int __init uncore_msr_pmus_register(void)
+{
+ struct intel_uncore_type **types = uncore_msr_uncores;
+ int ret;
+
+ for (; *types; types++) {
+ ret = type_pmu_register(*types);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+static int __init uncore_cpu_init(void)
+{
+ int ret;
+
+ ret = uncore_types_init(uncore_msr_uncores, true);
+ if (ret)
+ goto err;
+
+ ret = uncore_msr_pmus_register();
+ if (ret)
+ goto err;
+ return 0;
+err:
+ uncore_types_exit(uncore_msr_uncores);
+ uncore_msr_uncores = empty_uncore;
+ return ret;
+}
+
+#define X86_UNCORE_MODEL_MATCH(model, init) \
+ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init }
+
+struct intel_uncore_init_fun {
+ void (*cpu_init)(void);
+ int (*pci_init)(void);
+};
+
+static const struct intel_uncore_init_fun nhm_uncore_init __initconst = {
+ .cpu_init = nhm_uncore_cpu_init,
+};
+
+static const struct intel_uncore_init_fun snb_uncore_init __initconst = {
+ .cpu_init = snb_uncore_cpu_init,
+ .pci_init = snb_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun ivb_uncore_init __initconst = {
+ .cpu_init = snb_uncore_cpu_init,
+ .pci_init = ivb_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun hsw_uncore_init __initconst = {
+ .cpu_init = snb_uncore_cpu_init,
+ .pci_init = hsw_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun bdw_uncore_init __initconst = {
+ .cpu_init = snb_uncore_cpu_init,
+ .pci_init = bdw_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun snbep_uncore_init __initconst = {
+ .cpu_init = snbep_uncore_cpu_init,
+ .pci_init = snbep_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun nhmex_uncore_init __initconst = {
+ .cpu_init = nhmex_uncore_cpu_init,
+};
+
+static const struct intel_uncore_init_fun ivbep_uncore_init __initconst = {
+ .cpu_init = ivbep_uncore_cpu_init,
+ .pci_init = ivbep_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun hswep_uncore_init __initconst = {
+ .cpu_init = hswep_uncore_cpu_init,
+ .pci_init = hswep_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun bdx_uncore_init __initconst = {
+ .cpu_init = bdx_uncore_cpu_init,
+ .pci_init = bdx_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun knl_uncore_init __initconst = {
+ .cpu_init = knl_uncore_cpu_init,
+ .pci_init = knl_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun skl_uncore_init __initconst = {
+ .cpu_init = skl_uncore_cpu_init,
+ .pci_init = skl_uncore_pci_init,
+};
+
+static const struct intel_uncore_init_fun skx_uncore_init __initconst = {
+ .cpu_init = skx_uncore_cpu_init,
+ .pci_init = skx_uncore_pci_init,
+};
+
+static const struct x86_cpu_id intel_uncore_match[] __initconst = {
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP, nhm_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM, nhm_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE, nhm_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE_EP, nhm_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE, snb_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE, ivb_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, hsw_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT, hsw_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, hsw_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, bdw_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, bdw_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, snbep_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EX, nhmex_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE_EX, nhmex_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, ivbep_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hswep_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, bdx_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, bdx_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, knl_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP,skl_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, skx_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_uncore_init),
+ {},
+};
+
+MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match);
+
+static int __init intel_uncore_init(void)
+{
+ const struct x86_cpu_id *id;
+ struct intel_uncore_init_fun *uncore_init;
+ int pret = 0, cret = 0, ret;
+
+ id = x86_match_cpu(intel_uncore_match);
+ if (!id)
+ return -ENODEV;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return -ENODEV;
+
+ max_packages = topology_max_packages();
+
+ uncore_init = (struct intel_uncore_init_fun *)id->driver_data;
+ if (uncore_init->pci_init) {
+ pret = uncore_init->pci_init();
+ if (!pret)
+ pret = uncore_pci_init();
+ }
+
+ if (uncore_init->cpu_init) {
+ uncore_init->cpu_init();
+ cret = uncore_cpu_init();
+ }
+
+ if (cret && pret)
+ return -ENODEV;
+
+ /* Install hotplug callbacks to setup the targets for each package */
+ ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE,
+ "perf/x86/intel/uncore:online",
+ uncore_event_cpu_online,
+ uncore_event_cpu_offline);
+ if (ret)
+ goto err;
+ return 0;
+
+err:
+ uncore_types_exit(uncore_msr_uncores);
+ uncore_pci_exit();
+ return ret;
+}
+module_init(intel_uncore_init);
+
+static void __exit intel_uncore_exit(void)
+{
+ cpuhp_remove_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE);
+ uncore_types_exit(uncore_msr_uncores);
+ uncore_pci_exit();
+}
+module_exit(intel_uncore_exit);
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
new file mode 100644
index 000000000..0fc86ac73
--- /dev/null
+++ b/arch/x86/events/intel/uncore.h
@@ -0,0 +1,513 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <asm/apicdef.h>
+
+#include <linux/perf_event.h>
+#include "../perf_event.h"
+
+#define UNCORE_PMU_NAME_LEN 32
+#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC)
+#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC)
+
+#define UNCORE_FIXED_EVENT 0xff
+#define UNCORE_PMC_IDX_MAX_GENERIC 8
+#define UNCORE_PMC_IDX_MAX_FIXED 1
+#define UNCORE_PMC_IDX_MAX_FREERUNNING 1
+#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC
+#define UNCORE_PMC_IDX_FREERUNNING (UNCORE_PMC_IDX_FIXED + \
+ UNCORE_PMC_IDX_MAX_FIXED)
+#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FREERUNNING + \
+ UNCORE_PMC_IDX_MAX_FREERUNNING)
+
+#define UNCORE_PCI_DEV_FULL_DATA(dev, func, type, idx) \
+ ((dev << 24) | (func << 16) | (type << 8) | idx)
+#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx)
+#define UNCORE_PCI_DEV_DEV(data) ((data >> 24) & 0xff)
+#define UNCORE_PCI_DEV_FUNC(data) ((data >> 16) & 0xff)
+#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff)
+#define UNCORE_PCI_DEV_IDX(data) (data & 0xff)
+#define UNCORE_EXTRA_PCI_DEV 0xff
+#define UNCORE_EXTRA_PCI_DEV_MAX 4
+
+#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
+
+struct pci_extra_dev {
+ struct pci_dev *dev[UNCORE_EXTRA_PCI_DEV_MAX];
+};
+
+struct intel_uncore_ops;
+struct intel_uncore_pmu;
+struct intel_uncore_box;
+struct uncore_event_desc;
+struct freerunning_counters;
+
+struct intel_uncore_type {
+ const char *name;
+ int num_counters;
+ int num_boxes;
+ int perf_ctr_bits;
+ int fixed_ctr_bits;
+ int num_freerunning_types;
+ unsigned perf_ctr;
+ unsigned event_ctl;
+ unsigned event_mask;
+ unsigned event_mask_ext;
+ unsigned fixed_ctr;
+ unsigned fixed_ctl;
+ unsigned box_ctl;
+ unsigned msr_offset;
+ unsigned num_shared_regs:8;
+ unsigned single_fixed:1;
+ unsigned pair_ctr_ctl:1;
+ unsigned *msr_offsets;
+ struct event_constraint unconstrainted;
+ struct event_constraint *constraints;
+ struct intel_uncore_pmu *pmus;
+ struct intel_uncore_ops *ops;
+ struct uncore_event_desc *event_descs;
+ struct freerunning_counters *freerunning;
+ const struct attribute_group *attr_groups[4];
+ struct pmu *pmu; /* for custom pmu ops */
+};
+
+#define pmu_group attr_groups[0]
+#define format_group attr_groups[1]
+#define events_group attr_groups[2]
+
+struct intel_uncore_ops {
+ void (*init_box)(struct intel_uncore_box *);
+ void (*exit_box)(struct intel_uncore_box *);
+ void (*disable_box)(struct intel_uncore_box *);
+ void (*enable_box)(struct intel_uncore_box *);
+ void (*disable_event)(struct intel_uncore_box *, struct perf_event *);
+ void (*enable_event)(struct intel_uncore_box *, struct perf_event *);
+ u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *);
+ int (*hw_config)(struct intel_uncore_box *, struct perf_event *);
+ struct event_constraint *(*get_constraint)(struct intel_uncore_box *,
+ struct perf_event *);
+ void (*put_constraint)(struct intel_uncore_box *, struct perf_event *);
+};
+
+struct intel_uncore_pmu {
+ struct pmu pmu;
+ char name[UNCORE_PMU_NAME_LEN];
+ int pmu_idx;
+ int func_id;
+ bool registered;
+ atomic_t activeboxes;
+ struct intel_uncore_type *type;
+ struct intel_uncore_box **boxes;
+};
+
+struct intel_uncore_extra_reg {
+ raw_spinlock_t lock;
+ u64 config, config1, config2;
+ atomic_t ref;
+};
+
+struct intel_uncore_box {
+ int pci_phys_id;
+ int pkgid; /* Logical package ID */
+ int n_active; /* number of active events */
+ int n_events;
+ int cpu; /* cpu to collect events */
+ unsigned long flags;
+ atomic_t refcnt;
+ struct perf_event *events[UNCORE_PMC_IDX_MAX];
+ struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
+ struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX];
+ unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
+ u64 tags[UNCORE_PMC_IDX_MAX];
+ struct pci_dev *pci_dev;
+ struct intel_uncore_pmu *pmu;
+ u64 hrtimer_duration; /* hrtimer timeout for this box */
+ struct hrtimer hrtimer;
+ struct list_head list;
+ struct list_head active_list;
+ void *io_addr;
+ struct intel_uncore_extra_reg shared_regs[0];
+};
+
+#define UNCORE_BOX_FLAG_INITIATED 0
+#define UNCORE_BOX_FLAG_CTL_OFFS8 1 /* event config registers are 8-byte apart */
+
+struct uncore_event_desc {
+ struct device_attribute attr;
+ const char *config;
+};
+
+struct freerunning_counters {
+ unsigned int counter_base;
+ unsigned int counter_offset;
+ unsigned int box_offset;
+ unsigned int num_counters;
+ unsigned int bits;
+};
+
+struct pci2phy_map {
+ struct list_head list;
+ int segment;
+ int pbus_to_physid[256];
+};
+
+struct pci2phy_map *__find_pci2phy_map(int segment);
+
+ssize_t uncore_event_show(struct device *dev,
+ struct device_attribute *attr, char *buf);
+
+#define INTEL_UNCORE_EVENT_DESC(_name, _config) \
+{ \
+ .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \
+ .config = _config, \
+}
+
+#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \
+static ssize_t __uncore_##_var##_show(struct device *dev, \
+ struct device_attribute *attr, \
+ char *page) \
+{ \
+ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
+ return sprintf(page, _format "\n"); \
+} \
+static struct device_attribute format_attr_##_var = \
+ __ATTR(_name, 0444, __uncore_##_var##_show, NULL)
+
+static inline bool uncore_pmc_fixed(int idx)
+{
+ return idx == UNCORE_PMC_IDX_FIXED;
+}
+
+static inline bool uncore_pmc_freerunning(int idx)
+{
+ return idx == UNCORE_PMC_IDX_FREERUNNING;
+}
+
+static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
+{
+ return box->pmu->type->box_ctl;
+}
+
+static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box)
+{
+ return box->pmu->type->fixed_ctl;
+}
+
+static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
+{
+ return box->pmu->type->fixed_ctr;
+}
+
+static inline
+unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
+{
+ if (test_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags))
+ return idx * 8 + box->pmu->type->event_ctl;
+
+ return idx * 4 + box->pmu->type->event_ctl;
+}
+
+static inline
+unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+ return idx * 8 + box->pmu->type->perf_ctr;
+}
+
+static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box)
+{
+ struct intel_uncore_pmu *pmu = box->pmu;
+ return pmu->type->msr_offsets ?
+ pmu->type->msr_offsets[pmu->pmu_idx] :
+ pmu->type->msr_offset * pmu->pmu_idx;
+}
+
+static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box)
+{
+ if (!box->pmu->type->box_ctl)
+ return 0;
+ return box->pmu->type->box_ctl + uncore_msr_box_offset(box);
+}
+
+static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box)
+{
+ if (!box->pmu->type->fixed_ctl)
+ return 0;
+ return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box);
+}
+
+static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
+{
+ return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box);
+}
+
+
+/*
+ * In the uncore document, there is no event-code assigned to free running
+ * counters. Some events need to be defined to indicate the free running
+ * counters. The events are encoded as event-code + umask-code.
+ *
+ * The event-code for all free running counters is 0xff, which is the same as
+ * the fixed counters.
+ *
+ * The umask-code is used to distinguish a fixed counter and a free running
+ * counter, and different types of free running counters.
+ * - For fixed counters, the umask-code is 0x0X.
+ * X indicates the index of the fixed counter, which starts from 0.
+ * - For free running counters, the umask-code uses the rest of the space.
+ * It would bare the format of 0xXY.
+ * X stands for the type of free running counters, which starts from 1.
+ * Y stands for the index of free running counters of same type, which
+ * starts from 0.
+ *
+ * For example, there are three types of IIO free running counters on Skylake
+ * server, IO CLOCKS counters, BANDWIDTH counters and UTILIZATION counters.
+ * The event-code for all the free running counters is 0xff.
+ * 'ioclk' is the first counter of IO CLOCKS. IO CLOCKS is the first type,
+ * which umask-code starts from 0x10.
+ * So 'ioclk' is encoded as event=0xff,umask=0x10
+ * 'bw_in_port2' is the third counter of BANDWIDTH counters. BANDWIDTH is
+ * the second type, which umask-code starts from 0x20.
+ * So 'bw_in_port2' is encoded as event=0xff,umask=0x22
+ */
+static inline unsigned int uncore_freerunning_idx(u64 config)
+{
+ return ((config >> 8) & 0xf);
+}
+
+#define UNCORE_FREERUNNING_UMASK_START 0x10
+
+static inline unsigned int uncore_freerunning_type(u64 config)
+{
+ return ((((config >> 8) - UNCORE_FREERUNNING_UMASK_START) >> 4) & 0xf);
+}
+
+static inline
+unsigned int uncore_freerunning_counter(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ unsigned int type = uncore_freerunning_type(event->hw.config);
+ unsigned int idx = uncore_freerunning_idx(event->hw.config);
+ struct intel_uncore_pmu *pmu = box->pmu;
+
+ return pmu->type->freerunning[type].counter_base +
+ pmu->type->freerunning[type].counter_offset * idx +
+ pmu->type->freerunning[type].box_offset * pmu->pmu_idx;
+}
+
+static inline
+unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
+{
+ return box->pmu->type->event_ctl +
+ (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
+ uncore_msr_box_offset(box);
+}
+
+static inline
+unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+ return box->pmu->type->perf_ctr +
+ (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
+ uncore_msr_box_offset(box);
+}
+
+static inline
+unsigned uncore_fixed_ctl(struct intel_uncore_box *box)
+{
+ if (box->pci_dev)
+ return uncore_pci_fixed_ctl(box);
+ else
+ return uncore_msr_fixed_ctl(box);
+}
+
+static inline
+unsigned uncore_fixed_ctr(struct intel_uncore_box *box)
+{
+ if (box->pci_dev)
+ return uncore_pci_fixed_ctr(box);
+ else
+ return uncore_msr_fixed_ctr(box);
+}
+
+static inline
+unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx)
+{
+ if (box->pci_dev)
+ return uncore_pci_event_ctl(box, idx);
+ else
+ return uncore_msr_event_ctl(box, idx);
+}
+
+static inline
+unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx)
+{
+ if (box->pci_dev)
+ return uncore_pci_perf_ctr(box, idx);
+ else
+ return uncore_msr_perf_ctr(box, idx);
+}
+
+static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box)
+{
+ return box->pmu->type->perf_ctr_bits;
+}
+
+static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box)
+{
+ return box->pmu->type->fixed_ctr_bits;
+}
+
+static inline
+unsigned int uncore_freerunning_bits(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ unsigned int type = uncore_freerunning_type(event->hw.config);
+
+ return box->pmu->type->freerunning[type].bits;
+}
+
+static inline int uncore_num_freerunning(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ unsigned int type = uncore_freerunning_type(event->hw.config);
+
+ return box->pmu->type->freerunning[type].num_counters;
+}
+
+static inline int uncore_num_freerunning_types(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ return box->pmu->type->num_freerunning_types;
+}
+
+static inline bool check_valid_freerunning_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ unsigned int type = uncore_freerunning_type(event->hw.config);
+ unsigned int idx = uncore_freerunning_idx(event->hw.config);
+
+ return (type < uncore_num_freerunning_types(box, event)) &&
+ (idx < uncore_num_freerunning(box, event));
+}
+
+static inline int uncore_num_counters(struct intel_uncore_box *box)
+{
+ return box->pmu->type->num_counters;
+}
+
+static inline bool is_freerunning_event(struct perf_event *event)
+{
+ u64 cfg = event->attr.config;
+
+ return ((cfg & UNCORE_FIXED_EVENT) == UNCORE_FIXED_EVENT) &&
+ (((cfg >> 8) & 0xff) >= UNCORE_FREERUNNING_UMASK_START);
+}
+
+/* Check and reject invalid config */
+static inline int uncore_freerunning_hw_config(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ if (is_freerunning_event(event))
+ return 0;
+
+ return -EINVAL;
+}
+
+static inline void uncore_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ box->pmu->type->ops->disable_event(box, event);
+}
+
+static inline void uncore_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ box->pmu->type->ops->enable_event(box, event);
+}
+
+static inline u64 uncore_read_counter(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ return box->pmu->type->ops->read_counter(box, event);
+}
+
+static inline void uncore_box_init(struct intel_uncore_box *box)
+{
+ if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
+ if (box->pmu->type->ops->init_box)
+ box->pmu->type->ops->init_box(box);
+ }
+}
+
+static inline void uncore_box_exit(struct intel_uncore_box *box)
+{
+ if (test_and_clear_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
+ if (box->pmu->type->ops->exit_box)
+ box->pmu->type->ops->exit_box(box);
+ }
+}
+
+static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
+{
+ return (box->pkgid < 0);
+}
+
+static inline struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
+{
+ return container_of(event->pmu, struct intel_uncore_pmu, pmu);
+}
+
+static inline struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
+{
+ return event->pmu_private;
+}
+
+struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu);
+u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event);
+void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
+void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
+void uncore_pmu_event_start(struct perf_event *event, int flags);
+void uncore_pmu_event_stop(struct perf_event *event, int flags);
+int uncore_pmu_event_add(struct perf_event *event, int flags);
+void uncore_pmu_event_del(struct perf_event *event, int flags);
+void uncore_pmu_event_read(struct perf_event *event);
+void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
+struct event_constraint *
+uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event);
+void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event);
+u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx);
+
+extern struct intel_uncore_type **uncore_msr_uncores;
+extern struct intel_uncore_type **uncore_pci_uncores;
+extern struct pci_driver *uncore_pci_driver;
+extern raw_spinlock_t pci2phy_map_lock;
+extern struct list_head pci2phy_map_head;
+extern struct pci_extra_dev *uncore_extra_pci_dev;
+extern struct event_constraint uncore_constraint_empty;
+
+/* uncore_snb.c */
+int snb_uncore_pci_init(void);
+int ivb_uncore_pci_init(void);
+int hsw_uncore_pci_init(void);
+int bdw_uncore_pci_init(void);
+int skl_uncore_pci_init(void);
+void snb_uncore_cpu_init(void);
+void nhm_uncore_cpu_init(void);
+void skl_uncore_cpu_init(void);
+int snb_pci2phy_map_init(int devid);
+
+/* uncore_snbep.c */
+int snbep_uncore_pci_init(void);
+void snbep_uncore_cpu_init(void);
+int ivbep_uncore_pci_init(void);
+void ivbep_uncore_cpu_init(void);
+int hswep_uncore_pci_init(void);
+void hswep_uncore_cpu_init(void);
+int bdx_uncore_pci_init(void);
+void bdx_uncore_cpu_init(void);
+int knl_uncore_pci_init(void);
+void knl_uncore_cpu_init(void);
+int skx_uncore_pci_init(void);
+void skx_uncore_cpu_init(void);
+
+/* uncore_nhmex.c */
+void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c
new file mode 100644
index 000000000..173e2674b
--- /dev/null
+++ b/arch/x86/events/intel/uncore_nhmex.c
@@ -0,0 +1,1228 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Nehalem-EX/Westmere-EX uncore support */
+#include "uncore.h"
+
+/* NHM-EX event control */
+#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff
+#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00
+#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0)
+#define NHMEX_PMON_CTL_EDGE_DET (1 << 18)
+#define NHMEX_PMON_CTL_PMI_EN (1 << 20)
+#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22)
+#define NHMEX_PMON_CTL_INVERT (1 << 23)
+#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000
+#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \
+ NHMEX_PMON_CTL_UMASK_MASK | \
+ NHMEX_PMON_CTL_EDGE_DET | \
+ NHMEX_PMON_CTL_INVERT | \
+ NHMEX_PMON_CTL_TRESH_MASK)
+
+/* NHM-EX Ubox */
+#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00
+#define NHMEX_U_MSR_PMON_CTR 0xc11
+#define NHMEX_U_MSR_PMON_EV_SEL 0xc10
+
+#define NHMEX_U_PMON_GLOBAL_EN (1 << 0)
+#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e
+#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28)
+#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29)
+#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31)
+
+#define NHMEX_U_PMON_RAW_EVENT_MASK \
+ (NHMEX_PMON_CTL_EV_SEL_MASK | \
+ NHMEX_PMON_CTL_EDGE_DET)
+
+/* NHM-EX Cbox */
+#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00
+#define NHMEX_C0_MSR_PMON_CTR0 0xd11
+#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10
+#define NHMEX_C_MSR_OFFSET 0x20
+
+/* NHM-EX Bbox */
+#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20
+#define NHMEX_B0_MSR_PMON_CTR0 0xc31
+#define NHMEX_B0_MSR_PMON_CTL0 0xc30
+#define NHMEX_B_MSR_OFFSET 0x40
+#define NHMEX_B0_MSR_MATCH 0xe45
+#define NHMEX_B0_MSR_MASK 0xe46
+#define NHMEX_B1_MSR_MATCH 0xe4d
+#define NHMEX_B1_MSR_MASK 0xe4e
+
+#define NHMEX_B_PMON_CTL_EN (1 << 0)
+#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1
+#define NHMEX_B_PMON_CTL_EV_SEL_MASK \
+ (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT)
+#define NHMEX_B_PMON_CTR_SHIFT 6
+#define NHMEX_B_PMON_CTR_MASK \
+ (0x3 << NHMEX_B_PMON_CTR_SHIFT)
+#define NHMEX_B_PMON_RAW_EVENT_MASK \
+ (NHMEX_B_PMON_CTL_EV_SEL_MASK | \
+ NHMEX_B_PMON_CTR_MASK)
+
+/* NHM-EX Sbox */
+#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40
+#define NHMEX_S0_MSR_PMON_CTR0 0xc51
+#define NHMEX_S0_MSR_PMON_CTL0 0xc50
+#define NHMEX_S_MSR_OFFSET 0x80
+#define NHMEX_S0_MSR_MM_CFG 0xe48
+#define NHMEX_S0_MSR_MATCH 0xe49
+#define NHMEX_S0_MSR_MASK 0xe4a
+#define NHMEX_S1_MSR_MM_CFG 0xe58
+#define NHMEX_S1_MSR_MATCH 0xe59
+#define NHMEX_S1_MSR_MASK 0xe5a
+
+#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63)
+#define NHMEX_S_EVENT_TO_R_PROG_EV 0
+
+/* NHM-EX Mbox */
+#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0
+#define NHMEX_M0_MSR_PMU_DSP 0xca5
+#define NHMEX_M0_MSR_PMU_ISS 0xca6
+#define NHMEX_M0_MSR_PMU_MAP 0xca7
+#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8
+#define NHMEX_M0_MSR_PMU_PGT 0xca9
+#define NHMEX_M0_MSR_PMU_PLD 0xcaa
+#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab
+#define NHMEX_M0_MSR_PMU_CTL0 0xcb0
+#define NHMEX_M0_MSR_PMU_CNT0 0xcb1
+#define NHMEX_M_MSR_OFFSET 0x40
+#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54
+#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c
+
+#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63)
+#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL
+#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL
+#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34
+
+#define NHMEX_M_PMON_CTL_EN (1 << 0)
+#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1)
+#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2
+#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \
+ (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT)
+#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4
+#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \
+ (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT)
+#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6)
+#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7)
+#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9
+#define NHMEX_M_PMON_CTL_INC_SEL_MASK \
+ (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
+#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19
+#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \
+ (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT)
+#define NHMEX_M_PMON_RAW_EVENT_MASK \
+ (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \
+ NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \
+ NHMEX_M_PMON_CTL_WRAP_MODE | \
+ NHMEX_M_PMON_CTL_FLAG_MODE | \
+ NHMEX_M_PMON_CTL_INC_SEL_MASK | \
+ NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK)
+
+#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23))
+#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n)))
+
+#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24))
+#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n)))
+
+/*
+ * use the 9~13 bits to select event If the 7th bit is not set,
+ * otherwise use the 19~21 bits to select event.
+ */
+#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
+#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \
+ NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \
+ NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \
+ NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_INC_SEL_EXTAR_REG(c, r) \
+ EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \
+ MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r)
+#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \
+ EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \
+ MBOX_SET_FLAG_SEL_MASK, \
+ (u64)-1, NHMEX_M_##r)
+
+/* NHM-EX Rbox */
+#define NHMEX_R_MSR_GLOBAL_CTL 0xe00
+#define NHMEX_R_MSR_PMON_CTL0 0xe10
+#define NHMEX_R_MSR_PMON_CNT0 0xe11
+#define NHMEX_R_MSR_OFFSET 0x20
+
+#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \
+ ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4))
+#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n))
+#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n))
+#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \
+ (((n) < 4 ? 0 : 0x10) + (n) * 4)
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \
+ (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \
+ (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1)
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \
+ (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2)
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \
+ (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \
+ (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1)
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \
+ (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2)
+
+#define NHMEX_R_PMON_CTL_EN (1 << 0)
+#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1
+#define NHMEX_R_PMON_CTL_EV_SEL_MASK \
+ (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT)
+#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6)
+#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK
+
+/* NHM-EX Wbox */
+#define NHMEX_W_MSR_GLOBAL_CTL 0xc80
+#define NHMEX_W_MSR_PMON_CNT0 0xc90
+#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91
+#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394
+#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395
+
+#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31)
+
+#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \
+ ((1ULL << (n)) - 1)))
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7");
+DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63");
+
+static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL);
+}
+
+static void nhmex_uncore_msr_exit_box(struct intel_uncore_box *box)
+{
+ wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, 0);
+}
+
+static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+ unsigned msr = uncore_msr_box_ctl(box);
+ u64 config;
+
+ if (msr) {
+ rdmsrl(msr, config);
+ config &= ~((1ULL << uncore_num_counters(box)) - 1);
+ /* WBox has a fixed counter */
+ if (uncore_msr_fixed_ctl(box))
+ config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN;
+ wrmsrl(msr, config);
+ }
+}
+
+static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ unsigned msr = uncore_msr_box_ctl(box);
+ u64 config;
+
+ if (msr) {
+ rdmsrl(msr, config);
+ config |= (1ULL << uncore_num_counters(box)) - 1;
+ /* WBox has a fixed counter */
+ if (uncore_msr_fixed_ctl(box))
+ config |= NHMEX_W_PMON_GLOBAL_FIXED_EN;
+ wrmsrl(msr, config);
+ }
+}
+
+static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ wrmsrl(event->hw.config_base, 0);
+}
+
+static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->idx == UNCORE_PMC_IDX_FIXED)
+ wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0);
+ else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
+ wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
+ else
+ wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
+}
+
+#define NHMEX_UNCORE_OPS_COMMON_INIT() \
+ .init_box = nhmex_uncore_msr_init_box, \
+ .exit_box = nhmex_uncore_msr_exit_box, \
+ .disable_box = nhmex_uncore_msr_disable_box, \
+ .enable_box = nhmex_uncore_msr_enable_box, \
+ .disable_event = nhmex_uncore_msr_disable_event, \
+ .read_counter = uncore_msr_read_counter
+
+static struct intel_uncore_ops nhmex_uncore_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_uncore_msr_enable_event,
+};
+
+static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_edge.attr,
+ NULL,
+};
+
+static const struct attribute_group nhmex_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_ubox_formats_attr,
+};
+
+static struct intel_uncore_type nhmex_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 1,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_U_MSR_PMON_EV_SEL,
+ .perf_ctr = NHMEX_U_MSR_PMON_CTR,
+ .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL,
+ .ops = &nhmex_uncore_ops,
+ .format_group = &nhmex_uncore_ubox_format_group
+};
+
+static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static const struct attribute_group nhmex_uncore_cbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_cbox_formats_attr,
+};
+
+/* msr offset for each instance of cbox */
+static unsigned nhmex_cbox_msr_offsets[] = {
+ 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
+};
+
+static struct intel_uncore_type nhmex_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 6,
+ .num_boxes = 10,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0,
+ .perf_ctr = NHMEX_C0_MSR_PMON_CTR0,
+ .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL,
+ .msr_offsets = nhmex_cbox_msr_offsets,
+ .pair_ctr_ctl = 1,
+ .ops = &nhmex_uncore_ops,
+ .format_group = &nhmex_uncore_cbox_format_group
+};
+
+static struct uncore_event_desc nhmex_uncore_wbox_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_type nhmex_uncore_wbox = {
+ .name = "wbox",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_W_MSR_PMON_CNT0,
+ .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0,
+ .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR,
+ .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL,
+ .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_W_MSR_GLOBAL_CTL,
+ .pair_ctr_ctl = 1,
+ .event_descs = nhmex_uncore_wbox_events,
+ .ops = &nhmex_uncore_ops,
+ .format_group = &nhmex_uncore_cbox_format_group
+};
+
+static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+ int ctr, ev_sel;
+
+ ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >>
+ NHMEX_B_PMON_CTR_SHIFT;
+ ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >>
+ NHMEX_B_PMON_CTL_EV_SEL_SHIFT;
+
+ /* events that do not use the match/mask registers */
+ if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) ||
+ (ctr == 2 && ev_sel != 0x4) || ctr == 3)
+ return 0;
+
+ if (box->pmu->pmu_idx == 0)
+ reg1->reg = NHMEX_B0_MSR_MATCH;
+ else
+ reg1->reg = NHMEX_B1_MSR_MATCH;
+ reg1->idx = 0;
+ reg1->config = event->attr.config1;
+ reg2->config = event->attr.config2;
+ return 0;
+}
+
+static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ wrmsrl(reg1->reg, reg1->config);
+ wrmsrl(reg1->reg + 1, reg2->config);
+ }
+ wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
+ (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK));
+}
+
+/*
+ * The Bbox has 4 counters, but each counter monitors different events.
+ * Use bits 6-7 in the event config to select counter.
+ */
+static struct event_constraint nhmex_uncore_bbox_constraints[] = {
+ EVENT_CONSTRAINT(0 , 1, 0xc0),
+ EVENT_CONSTRAINT(0x40, 2, 0xc0),
+ EVENT_CONSTRAINT(0x80, 4, 0xc0),
+ EVENT_CONSTRAINT(0xc0, 8, 0xc0),
+ EVENT_CONSTRAINT_END,
+};
+
+static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
+ &format_attr_event5.attr,
+ &format_attr_counter.attr,
+ &format_attr_match.attr,
+ &format_attr_mask.attr,
+ NULL,
+};
+
+static const struct attribute_group nhmex_uncore_bbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_bbox_formats_attr,
+};
+
+static struct intel_uncore_ops nhmex_uncore_bbox_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_bbox_msr_enable_event,
+ .hw_config = nhmex_bbox_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_bbox = {
+ .name = "bbox",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_B0_MSR_PMON_CTL0,
+ .perf_ctr = NHMEX_B0_MSR_PMON_CTR0,
+ .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL,
+ .msr_offset = NHMEX_B_MSR_OFFSET,
+ .pair_ctr_ctl = 1,
+ .num_shared_regs = 1,
+ .constraints = nhmex_uncore_bbox_constraints,
+ .ops = &nhmex_uncore_bbox_ops,
+ .format_group = &nhmex_uncore_bbox_format_group
+};
+
+static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ /* only TO_R_PROG_EV event uses the match/mask register */
+ if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) !=
+ NHMEX_S_EVENT_TO_R_PROG_EV)
+ return 0;
+
+ if (box->pmu->pmu_idx == 0)
+ reg1->reg = NHMEX_S0_MSR_MM_CFG;
+ else
+ reg1->reg = NHMEX_S1_MSR_MM_CFG;
+ reg1->idx = 0;
+ reg1->config = event->attr.config1;
+ reg2->config = event->attr.config2;
+ return 0;
+}
+
+static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ wrmsrl(reg1->reg, 0);
+ wrmsrl(reg1->reg + 1, reg1->config);
+ wrmsrl(reg1->reg + 2, reg2->config);
+ wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN);
+ }
+ wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
+}
+
+static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_match.attr,
+ &format_attr_mask.attr,
+ NULL,
+};
+
+static const struct attribute_group nhmex_uncore_sbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_sbox_formats_attr,
+};
+
+static struct intel_uncore_ops nhmex_uncore_sbox_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_sbox_msr_enable_event,
+ .hw_config = nhmex_sbox_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_sbox = {
+ .name = "sbox",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_S0_MSR_PMON_CTL0,
+ .perf_ctr = NHMEX_S0_MSR_PMON_CTR0,
+ .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL,
+ .msr_offset = NHMEX_S_MSR_OFFSET,
+ .pair_ctr_ctl = 1,
+ .num_shared_regs = 1,
+ .ops = &nhmex_uncore_sbox_ops,
+ .format_group = &nhmex_uncore_sbox_format_group
+};
+
+enum {
+ EXTRA_REG_NHMEX_M_FILTER,
+ EXTRA_REG_NHMEX_M_DSP,
+ EXTRA_REG_NHMEX_M_ISS,
+ EXTRA_REG_NHMEX_M_MAP,
+ EXTRA_REG_NHMEX_M_MSC_THR,
+ EXTRA_REG_NHMEX_M_PGT,
+ EXTRA_REG_NHMEX_M_PLD,
+ EXTRA_REG_NHMEX_M_ZDP_CTL_FVC,
+};
+
+static struct extra_reg nhmex_uncore_mbox_extra_regs[] = {
+ MBOX_INC_SEL_EXTAR_REG(0x0, DSP),
+ MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR),
+ MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR),
+ MBOX_INC_SEL_EXTAR_REG(0x9, ISS),
+ /* event 0xa uses two extra registers */
+ MBOX_INC_SEL_EXTAR_REG(0xa, ISS),
+ MBOX_INC_SEL_EXTAR_REG(0xa, PLD),
+ MBOX_INC_SEL_EXTAR_REG(0xb, PLD),
+ /* events 0xd ~ 0x10 use the same extra register */
+ MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC),
+ MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC),
+ MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC),
+ MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC),
+ MBOX_INC_SEL_EXTAR_REG(0x16, PGT),
+ MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP),
+ MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS),
+ MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT),
+ MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP),
+ EVENT_EXTRA_END
+};
+
+/* Nehalem-EX or Westmere-EX ? */
+static bool uncore_nhmex;
+
+static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config)
+{
+ struct intel_uncore_extra_reg *er;
+ unsigned long flags;
+ bool ret = false;
+ u64 mask;
+
+ if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+ er = &box->shared_regs[idx];
+ raw_spin_lock_irqsave(&er->lock, flags);
+ if (!atomic_read(&er->ref) || er->config == config) {
+ atomic_inc(&er->ref);
+ er->config = config;
+ ret = true;
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ return ret;
+ }
+ /*
+ * The ZDP_CTL_FVC MSR has 4 fields which are used to control
+ * events 0xd ~ 0x10. Besides these 4 fields, there are additional
+ * fields which are shared.
+ */
+ idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ if (WARN_ON_ONCE(idx >= 4))
+ return false;
+
+ /* mask of the shared fields */
+ if (uncore_nhmex)
+ mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
+ else
+ mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK;
+ er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+
+ raw_spin_lock_irqsave(&er->lock, flags);
+ /* add mask of the non-shared field if it's in use */
+ if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) {
+ if (uncore_nhmex)
+ mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ else
+ mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ }
+
+ if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
+ atomic_add(1 << (idx * 8), &er->ref);
+ if (uncore_nhmex)
+ mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
+ NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ else
+ mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK |
+ WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ er->config &= ~mask;
+ er->config |= (config & mask);
+ ret = true;
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ return ret;
+}
+
+static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx)
+{
+ struct intel_uncore_extra_reg *er;
+
+ if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+ er = &box->shared_regs[idx];
+ atomic_dec(&er->ref);
+ return;
+ }
+
+ idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+ atomic_sub(1 << (idx * 8), &er->ref);
+}
+
+static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8);
+ u64 config = reg1->config;
+
+ /* get the non-shared control bits and shift them */
+ idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ if (uncore_nhmex)
+ config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ else
+ config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ if (new_idx > orig_idx) {
+ idx = new_idx - orig_idx;
+ config <<= 3 * idx;
+ } else {
+ idx = orig_idx - new_idx;
+ config >>= 3 * idx;
+ }
+
+ /* add the shared control bits back */
+ if (uncore_nhmex)
+ config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+ else
+ config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+ config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+ if (modify) {
+ /* adjust the main event selector */
+ if (new_idx > orig_idx)
+ hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+ else
+ hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+ reg1->config = config;
+ reg1->idx = ~0xff | new_idx;
+ }
+ return config;
+}
+
+static struct event_constraint *
+nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+ int i, idx[2], alloc = 0;
+ u64 config1 = reg1->config;
+
+ idx[0] = __BITS_VALUE(reg1->idx, 0, 8);
+ idx[1] = __BITS_VALUE(reg1->idx, 1, 8);
+again:
+ for (i = 0; i < 2; i++) {
+ if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
+ idx[i] = 0xff;
+
+ if (idx[i] == 0xff)
+ continue;
+
+ if (!nhmex_mbox_get_shared_reg(box, idx[i],
+ __BITS_VALUE(config1, i, 32)))
+ goto fail;
+ alloc |= (0x1 << i);
+ }
+
+ /* for the match/mask registers */
+ if (reg2->idx != EXTRA_REG_NONE &&
+ (uncore_box_is_fake(box) || !reg2->alloc) &&
+ !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
+ goto fail;
+
+ /*
+ * If it's a fake box -- as per validate_{group,event}() we
+ * shouldn't touch event state and we can avoid doing so
+ * since both will only call get_event_constraints() once
+ * on each event, this avoids the need for reg->alloc.
+ */
+ if (!uncore_box_is_fake(box)) {
+ if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
+ nhmex_mbox_alter_er(event, idx[0], true);
+ reg1->alloc |= alloc;
+ if (reg2->idx != EXTRA_REG_NONE)
+ reg2->alloc = 1;
+ }
+ return NULL;
+fail:
+ if (idx[0] != 0xff && !(alloc & 0x1) &&
+ idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+ /*
+ * events 0xd ~ 0x10 are functional identical, but are
+ * controlled by different fields in the ZDP_CTL_FVC
+ * register. If we failed to take one field, try the
+ * rest 3 choices.
+ */
+ BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff);
+ idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ idx[0] = (idx[0] + 1) % 4;
+ idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) {
+ config1 = nhmex_mbox_alter_er(event, idx[0], false);
+ goto again;
+ }
+ }
+
+ if (alloc & 0x1)
+ nhmex_mbox_put_shared_reg(box, idx[0]);
+ if (alloc & 0x2)
+ nhmex_mbox_put_shared_reg(box, idx[1]);
+ return &uncore_constraint_empty;
+}
+
+static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+
+ if (uncore_box_is_fake(box))
+ return;
+
+ if (reg1->alloc & 0x1)
+ nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8));
+ if (reg1->alloc & 0x2)
+ nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8));
+ reg1->alloc = 0;
+
+ if (reg2->alloc) {
+ nhmex_mbox_put_shared_reg(box, reg2->idx);
+ reg2->alloc = 0;
+ }
+}
+
+static int nhmex_mbox_extra_reg_idx(struct extra_reg *er)
+{
+ if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
+ return er->idx;
+ return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd;
+}
+
+static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct intel_uncore_type *type = box->pmu->type;
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+ struct extra_reg *er;
+ unsigned msr;
+ int reg_idx = 0;
+ /*
+ * The mbox events may require 2 extra MSRs at the most. But only
+ * the lower 32 bits in these MSRs are significant, so we can use
+ * config1 to pass two MSRs' config.
+ */
+ for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ if (event->attr.config1 & ~er->valid_mask)
+ return -EINVAL;
+
+ msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
+ if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
+ return -EINVAL;
+
+ /* always use the 32~63 bits to pass the PLD config */
+ if (er->idx == EXTRA_REG_NHMEX_M_PLD)
+ reg_idx = 1;
+ else if (WARN_ON_ONCE(reg_idx > 0))
+ return -EINVAL;
+
+ reg1->idx &= ~(0xff << (reg_idx * 8));
+ reg1->reg &= ~(0xffff << (reg_idx * 16));
+ reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8);
+ reg1->reg |= msr << (reg_idx * 16);
+ reg1->config = event->attr.config1;
+ reg_idx++;
+ }
+ /*
+ * The mbox only provides ability to perform address matching
+ * for the PLD events.
+ */
+ if (reg_idx == 2) {
+ reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
+ if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
+ reg2->config = event->attr.config2;
+ else
+ reg2->config = ~0ULL;
+ if (box->pmu->pmu_idx == 0)
+ reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
+ else
+ reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
+ }
+ return 0;
+}
+
+static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx)
+{
+ struct intel_uncore_extra_reg *er;
+ unsigned long flags;
+ u64 config;
+
+ if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
+ return box->shared_regs[idx].config;
+
+ er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+ raw_spin_lock_irqsave(&er->lock, flags);
+ config = er->config;
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+ return config;
+}
+
+static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+ int idx;
+
+ idx = __BITS_VALUE(reg1->idx, 0, 8);
+ if (idx != 0xff)
+ wrmsrl(__BITS_VALUE(reg1->reg, 0, 16),
+ nhmex_mbox_shared_reg_config(box, idx));
+ idx = __BITS_VALUE(reg1->idx, 1, 8);
+ if (idx != 0xff)
+ wrmsrl(__BITS_VALUE(reg1->reg, 1, 16),
+ nhmex_mbox_shared_reg_config(box, idx));
+
+ if (reg2->idx != EXTRA_REG_NONE) {
+ wrmsrl(reg2->reg, 0);
+ if (reg2->config != ~0ULL) {
+ wrmsrl(reg2->reg + 1,
+ reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
+ wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
+ (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
+ wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN);
+ }
+ }
+
+ wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3");
+DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5");
+DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6");
+DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7");
+DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13");
+DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21");
+DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63");
+DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61");
+DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63");
+
+static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
+ &format_attr_count_mode.attr,
+ &format_attr_storage_mode.attr,
+ &format_attr_wrap_mode.attr,
+ &format_attr_flag_mode.attr,
+ &format_attr_inc_sel.attr,
+ &format_attr_set_flag_sel.attr,
+ &format_attr_filter_cfg_en.attr,
+ &format_attr_filter_match.attr,
+ &format_attr_filter_mask.attr,
+ &format_attr_dsp.attr,
+ &format_attr_thr.attr,
+ &format_attr_fvc.attr,
+ &format_attr_pgt.attr,
+ &format_attr_map.attr,
+ &format_attr_iss.attr,
+ &format_attr_pld.attr,
+ NULL,
+};
+
+static const struct attribute_group nhmex_uncore_mbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_mbox_formats_attr,
+};
+
+static struct uncore_event_desc nhmex_uncore_mbox_events[] = {
+ INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"),
+ INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"),
+ { /* end: all zeroes */ },
+};
+
+static struct uncore_event_desc wsmex_uncore_mbox_events[] = {
+ INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"),
+ INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhmex_uncore_mbox_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_mbox_msr_enable_event,
+ .hw_config = nhmex_mbox_hw_config,
+ .get_constraint = nhmex_mbox_get_constraint,
+ .put_constraint = nhmex_mbox_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_mbox = {
+ .name = "mbox",
+ .num_counters = 6,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_M0_MSR_PMU_CTL0,
+ .perf_ctr = NHMEX_M0_MSR_PMU_CNT0,
+ .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL,
+ .msr_offset = NHMEX_M_MSR_OFFSET,
+ .pair_ctr_ctl = 1,
+ .num_shared_regs = 8,
+ .event_descs = nhmex_uncore_mbox_events,
+ .ops = &nhmex_uncore_mbox_ops,
+ .format_group = &nhmex_uncore_mbox_format_group,
+};
+
+static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ /* adjust the main event selector and extra register index */
+ if (reg1->idx % 2) {
+ reg1->idx--;
+ hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+ } else {
+ reg1->idx++;
+ hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+ }
+
+ /* adjust extra register config */
+ switch (reg1->idx % 6) {
+ case 2:
+ /* shift the 8~15 bits to the 0~7 bits */
+ reg1->config >>= 8;
+ break;
+ case 3:
+ /* shift the 0~7 bits to the 8~15 bits */
+ reg1->config <<= 8;
+ break;
+ }
+}
+
+/*
+ * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7.
+ * An event set consists of 6 events, the 3rd and 4th events in
+ * an event set use the same extra register. So an event set uses
+ * 5 extra registers.
+ */
+static struct event_constraint *
+nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+ struct intel_uncore_extra_reg *er;
+ unsigned long flags;
+ int idx, er_idx;
+ u64 config1;
+ bool ok = false;
+
+ if (!uncore_box_is_fake(box) && reg1->alloc)
+ return NULL;
+
+ idx = reg1->idx % 6;
+ config1 = reg1->config;
+again:
+ er_idx = idx;
+ /* the 3rd and 4th events use the same extra register */
+ if (er_idx > 2)
+ er_idx--;
+ er_idx += (reg1->idx / 6) * 5;
+
+ er = &box->shared_regs[er_idx];
+ raw_spin_lock_irqsave(&er->lock, flags);
+ if (idx < 2) {
+ if (!atomic_read(&er->ref) || er->config == reg1->config) {
+ atomic_inc(&er->ref);
+ er->config = reg1->config;
+ ok = true;
+ }
+ } else if (idx == 2 || idx == 3) {
+ /*
+ * these two events use different fields in a extra register,
+ * the 0~7 bits and the 8~15 bits respectively.
+ */
+ u64 mask = 0xff << ((idx - 2) * 8);
+ if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) ||
+ !((er->config ^ config1) & mask)) {
+ atomic_add(1 << ((idx - 2) * 8), &er->ref);
+ er->config &= ~mask;
+ er->config |= config1 & mask;
+ ok = true;
+ }
+ } else {
+ if (!atomic_read(&er->ref) ||
+ (er->config == (hwc->config >> 32) &&
+ er->config1 == reg1->config &&
+ er->config2 == reg2->config)) {
+ atomic_inc(&er->ref);
+ er->config = (hwc->config >> 32);
+ er->config1 = reg1->config;
+ er->config2 = reg2->config;
+ ok = true;
+ }
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ if (!ok) {
+ /*
+ * The Rbox events are always in pairs. The paired
+ * events are functional identical, but use different
+ * extra registers. If we failed to take an extra
+ * register, try the alternative.
+ */
+ idx ^= 1;
+ if (idx != reg1->idx % 6) {
+ if (idx == 2)
+ config1 >>= 8;
+ else if (idx == 3)
+ config1 <<= 8;
+ goto again;
+ }
+ } else {
+ if (!uncore_box_is_fake(box)) {
+ if (idx != reg1->idx % 6)
+ nhmex_rbox_alter_er(box, event);
+ reg1->alloc = 1;
+ }
+ return NULL;
+ }
+ return &uncore_constraint_empty;
+}
+
+static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct intel_uncore_extra_reg *er;
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ int idx, er_idx;
+
+ if (uncore_box_is_fake(box) || !reg1->alloc)
+ return;
+
+ idx = reg1->idx % 6;
+ er_idx = idx;
+ if (er_idx > 2)
+ er_idx--;
+ er_idx += (reg1->idx / 6) * 5;
+
+ er = &box->shared_regs[er_idx];
+ if (idx == 2 || idx == 3)
+ atomic_sub(1 << ((idx - 2) * 8), &er->ref);
+ else
+ atomic_dec(&er->ref);
+
+ reg1->alloc = 0;
+}
+
+static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+ int idx;
+
+ idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >>
+ NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+ if (idx >= 0x18)
+ return -EINVAL;
+
+ reg1->idx = idx;
+ reg1->config = event->attr.config1;
+
+ switch (idx % 6) {
+ case 4:
+ case 5:
+ hwc->config |= event->attr.config & (~0ULL << 32);
+ reg2->config = event->attr.config2;
+ break;
+ }
+ return 0;
+}
+
+static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+ int idx, port;
+
+ idx = reg1->idx;
+ port = idx / 6 + box->pmu->pmu_idx * 4;
+
+ switch (idx % 6) {
+ case 0:
+ wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config);
+ break;
+ case 1:
+ wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config);
+ break;
+ case 2:
+ case 3:
+ wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port),
+ uncore_shared_reg_config(box, 2 + (idx / 6) * 5));
+ break;
+ case 4:
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port),
+ hwc->config >> 32);
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config);
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config);
+ break;
+ case 5:
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port),
+ hwc->config >> 32);
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config);
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config);
+ break;
+ }
+
+ wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
+ (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK));
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15");
+DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31");
+
+static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
+ &format_attr_event5.attr,
+ &format_attr_xbr_mm_cfg.attr,
+ &format_attr_xbr_match.attr,
+ &format_attr_xbr_mask.attr,
+ &format_attr_qlx_cfg.attr,
+ &format_attr_iperf_cfg.attr,
+ NULL,
+};
+
+static const struct attribute_group nhmex_uncore_rbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_rbox_formats_attr,
+};
+
+static struct uncore_event_desc nhmex_uncore_rbox_events[] = {
+ INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"),
+ INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"),
+ INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"),
+ INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"),
+ INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"),
+ INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhmex_uncore_rbox_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_rbox_msr_enable_event,
+ .hw_config = nhmex_rbox_hw_config,
+ .get_constraint = nhmex_rbox_get_constraint,
+ .put_constraint = nhmex_rbox_put_constraint,
+};
+
+static struct intel_uncore_type nhmex_uncore_rbox = {
+ .name = "rbox",
+ .num_counters = 8,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_R_MSR_PMON_CTL0,
+ .perf_ctr = NHMEX_R_MSR_PMON_CNT0,
+ .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_R_MSR_GLOBAL_CTL,
+ .msr_offset = NHMEX_R_MSR_OFFSET,
+ .pair_ctr_ctl = 1,
+ .num_shared_regs = 20,
+ .event_descs = nhmex_uncore_rbox_events,
+ .ops = &nhmex_uncore_rbox_ops,
+ .format_group = &nhmex_uncore_rbox_format_group
+};
+
+static struct intel_uncore_type *nhmex_msr_uncores[] = {
+ &nhmex_uncore_ubox,
+ &nhmex_uncore_cbox,
+ &nhmex_uncore_bbox,
+ &nhmex_uncore_sbox,
+ &nhmex_uncore_mbox,
+ &nhmex_uncore_rbox,
+ &nhmex_uncore_wbox,
+ NULL,
+};
+
+void nhmex_uncore_cpu_init(void)
+{
+ if (boot_cpu_data.x86_model == 46)
+ uncore_nhmex = true;
+ else
+ nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
+ if (nhmex_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+ nhmex_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+ uncore_msr_uncores = nhmex_msr_uncores;
+}
+/* end of Nehalem-EX uncore support */
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
new file mode 100644
index 000000000..2d328386f
--- /dev/null
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -0,0 +1,877 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Nehalem/SandBridge/Haswell/Broadwell/Skylake uncore support */
+#include "uncore.h"
+
+/* Uncore IMC PCI IDs */
+#define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100
+#define PCI_DEVICE_ID_INTEL_IVB_IMC 0x0154
+#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
+#define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00
+#define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04
+#define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604
+#define PCI_DEVICE_ID_INTEL_SKL_U_IMC 0x1904
+#define PCI_DEVICE_ID_INTEL_SKL_Y_IMC 0x190c
+#define PCI_DEVICE_ID_INTEL_SKL_HD_IMC 0x1900
+#define PCI_DEVICE_ID_INTEL_SKL_HQ_IMC 0x1910
+#define PCI_DEVICE_ID_INTEL_SKL_SD_IMC 0x190f
+#define PCI_DEVICE_ID_INTEL_SKL_SQ_IMC 0x191f
+#define PCI_DEVICE_ID_INTEL_KBL_Y_IMC 0x590c
+#define PCI_DEVICE_ID_INTEL_KBL_U_IMC 0x5904
+#define PCI_DEVICE_ID_INTEL_KBL_UQ_IMC 0x5914
+#define PCI_DEVICE_ID_INTEL_KBL_SD_IMC 0x590f
+#define PCI_DEVICE_ID_INTEL_KBL_SQ_IMC 0x591f
+#define PCI_DEVICE_ID_INTEL_CFL_2U_IMC 0x3ecc
+#define PCI_DEVICE_ID_INTEL_CFL_4U_IMC 0x3ed0
+#define PCI_DEVICE_ID_INTEL_CFL_4H_IMC 0x3e10
+#define PCI_DEVICE_ID_INTEL_CFL_6H_IMC 0x3ec4
+#define PCI_DEVICE_ID_INTEL_CFL_2S_D_IMC 0x3e0f
+#define PCI_DEVICE_ID_INTEL_CFL_4S_D_IMC 0x3e1f
+#define PCI_DEVICE_ID_INTEL_CFL_6S_D_IMC 0x3ec2
+#define PCI_DEVICE_ID_INTEL_CFL_8S_D_IMC 0x3e30
+#define PCI_DEVICE_ID_INTEL_CFL_4S_W_IMC 0x3e18
+#define PCI_DEVICE_ID_INTEL_CFL_6S_W_IMC 0x3ec6
+#define PCI_DEVICE_ID_INTEL_CFL_8S_W_IMC 0x3e31
+#define PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC 0x3e33
+#define PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC 0x3eca
+#define PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC 0x3e32
+
+/* SNB event control */
+#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
+#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00
+#define SNB_UNC_CTL_EDGE_DET (1 << 18)
+#define SNB_UNC_CTL_EN (1 << 22)
+#define SNB_UNC_CTL_INVERT (1 << 23)
+#define SNB_UNC_CTL_CMASK_MASK 0x1f000000
+#define NHM_UNC_CTL_CMASK_MASK 0xff000000
+#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0)
+
+#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+ SNB_UNC_CTL_UMASK_MASK | \
+ SNB_UNC_CTL_EDGE_DET | \
+ SNB_UNC_CTL_INVERT | \
+ SNB_UNC_CTL_CMASK_MASK)
+
+#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+ SNB_UNC_CTL_UMASK_MASK | \
+ SNB_UNC_CTL_EDGE_DET | \
+ SNB_UNC_CTL_INVERT | \
+ NHM_UNC_CTL_CMASK_MASK)
+
+/* SNB global control register */
+#define SNB_UNC_PERF_GLOBAL_CTL 0x391
+#define SNB_UNC_FIXED_CTR_CTRL 0x394
+#define SNB_UNC_FIXED_CTR 0x395
+
+/* SNB uncore global control */
+#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1)
+#define SNB_UNC_GLOBAL_CTL_EN (1 << 29)
+
+/* SNB Cbo register */
+#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700
+#define SNB_UNC_CBO_0_PER_CTR0 0x706
+#define SNB_UNC_CBO_MSR_OFFSET 0x10
+
+/* SNB ARB register */
+#define SNB_UNC_ARB_PER_CTR0 0x3b0
+#define SNB_UNC_ARB_PERFEVTSEL0 0x3b2
+#define SNB_UNC_ARB_MSR_OFFSET 0x10
+
+/* NHM global control register */
+#define NHM_UNC_PERF_GLOBAL_CTL 0x391
+#define NHM_UNC_FIXED_CTR 0x394
+#define NHM_UNC_FIXED_CTR_CTRL 0x395
+
+/* NHM uncore global control */
+#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1)
+#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32)
+
+/* NHM uncore register */
+#define NHM_UNC_PERFEVTSEL0 0x3c0
+#define NHM_UNC_UNCORE_PMC0 0x3b0
+
+/* SKL uncore global control */
+#define SKL_UNC_PERF_GLOBAL_CTL 0xe01
+#define SKL_UNC_GLOBAL_CTL_CORE_ALL ((1 << 5) - 1)
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
+DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
+
+/* Sandy Bridge uncore support */
+static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->idx < UNCORE_PMC_IDX_FIXED)
+ wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
+ else
+ wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
+}
+
+static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ wrmsrl(event->hw.config_base, 0);
+}
+
+static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0) {
+ wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
+ SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
+ }
+}
+
+static void snb_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
+ SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
+}
+
+static void snb_uncore_msr_exit_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0)
+ wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static struct uncore_event_desc snb_uncore_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
+ { /* end: all zeroes */ },
+};
+
+static struct attribute *snb_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask5.attr,
+ NULL,
+};
+
+static const struct attribute_group snb_uncore_format_group = {
+ .name = "format",
+ .attrs = snb_uncore_formats_attr,
+};
+
+static struct intel_uncore_ops snb_uncore_msr_ops = {
+ .init_box = snb_uncore_msr_init_box,
+ .enable_box = snb_uncore_msr_enable_box,
+ .exit_box = snb_uncore_msr_exit_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = snb_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct event_constraint snb_uncore_arb_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type snb_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 2,
+ .num_boxes = 4,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = SNB_UNC_CBO_0_PER_CTR0,
+ .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0,
+ .fixed_ctr = SNB_UNC_FIXED_CTR,
+ .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL,
+ .single_fixed = 1,
+ .event_mask = SNB_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
+ .ops = &snb_uncore_msr_ops,
+ .format_group = &snb_uncore_format_group,
+ .event_descs = snb_uncore_events,
+};
+
+static struct intel_uncore_type snb_uncore_arb = {
+ .name = "arb",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .perf_ctr = SNB_UNC_ARB_PER_CTR0,
+ .event_ctl = SNB_UNC_ARB_PERFEVTSEL0,
+ .event_mask = SNB_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_ARB_MSR_OFFSET,
+ .constraints = snb_uncore_arb_constraints,
+ .ops = &snb_uncore_msr_ops,
+ .format_group = &snb_uncore_format_group,
+};
+
+static struct intel_uncore_type *snb_msr_uncores[] = {
+ &snb_uncore_cbox,
+ &snb_uncore_arb,
+ NULL,
+};
+
+void snb_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = snb_msr_uncores;
+ if (snb_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+ snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+}
+
+static void skl_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0) {
+ wrmsrl(SKL_UNC_PERF_GLOBAL_CTL,
+ SNB_UNC_GLOBAL_CTL_EN | SKL_UNC_GLOBAL_CTL_CORE_ALL);
+ }
+}
+
+static void skl_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ wrmsrl(SKL_UNC_PERF_GLOBAL_CTL,
+ SNB_UNC_GLOBAL_CTL_EN | SKL_UNC_GLOBAL_CTL_CORE_ALL);
+}
+
+static void skl_uncore_msr_exit_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0)
+ wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static struct intel_uncore_ops skl_uncore_msr_ops = {
+ .init_box = skl_uncore_msr_init_box,
+ .enable_box = skl_uncore_msr_enable_box,
+ .exit_box = skl_uncore_msr_exit_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = snb_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type skl_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 4,
+ .num_boxes = 5,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = SNB_UNC_CBO_0_PER_CTR0,
+ .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0,
+ .fixed_ctr = SNB_UNC_FIXED_CTR,
+ .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL,
+ .single_fixed = 1,
+ .event_mask = SNB_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
+ .ops = &skl_uncore_msr_ops,
+ .format_group = &snb_uncore_format_group,
+ .event_descs = snb_uncore_events,
+};
+
+static struct intel_uncore_type *skl_msr_uncores[] = {
+ &skl_uncore_cbox,
+ &snb_uncore_arb,
+ NULL,
+};
+
+void skl_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = skl_msr_uncores;
+ if (skl_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+ skl_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+ snb_uncore_arb.ops = &skl_uncore_msr_ops;
+}
+
+enum {
+ SNB_PCI_UNCORE_IMC,
+};
+
+static struct uncore_event_desc snb_uncore_imc_events[] = {
+ INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"),
+ INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
+
+ INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
+ INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
+
+ { /* end: all zeroes */ },
+};
+
+#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff
+#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48
+
+/* page size multiple covering all config regs */
+#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000
+
+#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1
+#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054
+#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE
+
+enum perf_snb_uncore_imc_freerunning_types {
+ SNB_PCI_UNCORE_IMC_DATA = 0,
+ SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX,
+};
+
+static struct freerunning_counters snb_uncore_imc_freerunning[] = {
+ [SNB_PCI_UNCORE_IMC_DATA] = { SNB_UNCORE_PCI_IMC_DATA_READS_BASE, 0x4, 0x0, 2, 32 },
+};
+
+static struct attribute *snb_uncore_imc_formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static const struct attribute_group snb_uncore_imc_format_group = {
+ .name = "format",
+ .attrs = snb_uncore_imc_formats_attr,
+};
+
+static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
+ resource_size_t addr;
+ u32 pci_dword;
+
+ pci_read_config_dword(pdev, where, &pci_dword);
+ addr = pci_dword;
+
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ pci_read_config_dword(pdev, where + 4, &pci_dword);
+ addr |= ((resource_size_t)pci_dword << 32);
+#endif
+
+ addr &= ~(PAGE_SIZE - 1);
+
+ box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
+ box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
+}
+
+static void snb_uncore_imc_exit_box(struct intel_uncore_box *box)
+{
+ iounmap(box->io_addr);
+}
+
+static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ return (u64)*(unsigned int *)(box->io_addr + hwc->event_base);
+}
+
+/*
+ * Keep the custom event_init() function compatible with old event
+ * encoding for free running counters.
+ */
+static int snb_uncore_imc_event_init(struct perf_event *event)
+{
+ struct intel_uncore_pmu *pmu;
+ struct intel_uncore_box *box;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
+ int idx, base;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ pmu = uncore_event_to_pmu(event);
+ /* no device found for this pmu */
+ if (pmu->func_id < 0)
+ return -ENOENT;
+
+ /* Sampling not supported yet */
+ if (hwc->sample_period)
+ return -EINVAL;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ /*
+ * Place all uncore events for a particular physical package
+ * onto a single cpu
+ */
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ /* check only supported bits are set */
+ if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
+ return -EINVAL;
+
+ box = uncore_pmu_to_box(pmu, event->cpu);
+ if (!box || box->cpu < 0)
+ return -EINVAL;
+
+ event->cpu = box->cpu;
+ event->pmu_private = box;
+
+ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+
+ event->hw.idx = -1;
+ event->hw.last_tag = ~0ULL;
+ event->hw.extra_reg.idx = EXTRA_REG_NONE;
+ event->hw.branch_reg.idx = EXTRA_REG_NONE;
+ /*
+ * check event is known (whitelist, determines counter)
+ */
+ switch (cfg) {
+ case SNB_UNCORE_PCI_IMC_DATA_READS:
+ base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
+ idx = UNCORE_PMC_IDX_FREERUNNING;
+ break;
+ case SNB_UNCORE_PCI_IMC_DATA_WRITES:
+ base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
+ idx = UNCORE_PMC_IDX_FREERUNNING;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* must be done before validate_group */
+ event->hw.event_base = base;
+ event->hw.idx = idx;
+
+ /* Convert to standard encoding format for freerunning counters */
+ event->hw.config = ((cfg - 1) << 8) | 0x10ff;
+
+ /* no group validation needed, we have free running counters */
+
+ return 0;
+}
+
+static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return 0;
+}
+
+int snb_pci2phy_map_init(int devid)
+{
+ struct pci_dev *dev = NULL;
+ struct pci2phy_map *map;
+ int bus, segment;
+
+ dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
+ if (!dev)
+ return -ENOTTY;
+
+ bus = dev->bus->number;
+ segment = pci_domain_nr(dev->bus);
+
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ pci_dev_put(dev);
+ return -ENOMEM;
+ }
+ map->pbus_to_physid[bus] = 0;
+ raw_spin_unlock(&pci2phy_map_lock);
+
+ pci_dev_put(dev);
+
+ return 0;
+}
+
+static struct pmu snb_uncore_imc_pmu = {
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = snb_uncore_imc_event_init,
+ .add = uncore_pmu_event_add,
+ .del = uncore_pmu_event_del,
+ .start = uncore_pmu_event_start,
+ .stop = uncore_pmu_event_stop,
+ .read = uncore_pmu_event_read,
+};
+
+static struct intel_uncore_ops snb_uncore_imc_ops = {
+ .init_box = snb_uncore_imc_init_box,
+ .exit_box = snb_uncore_imc_exit_box,
+ .enable_box = snb_uncore_imc_enable_box,
+ .disable_box = snb_uncore_imc_disable_box,
+ .disable_event = snb_uncore_imc_disable_event,
+ .enable_event = snb_uncore_imc_enable_event,
+ .hw_config = snb_uncore_imc_hw_config,
+ .read_counter = snb_uncore_imc_read_counter,
+};
+
+static struct intel_uncore_type snb_uncore_imc = {
+ .name = "imc",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .num_freerunning_types = SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX,
+ .freerunning = snb_uncore_imc_freerunning,
+ .event_descs = snb_uncore_imc_events,
+ .format_group = &snb_uncore_imc_format_group,
+ .ops = &snb_uncore_imc_ops,
+ .pmu = &snb_uncore_imc_pmu,
+};
+
+static struct intel_uncore_type *snb_pci_uncores[] = {
+ [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc,
+ NULL,
+};
+
+static const struct pci_device_id snb_uncore_pci_ids[] = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id ivb_uncore_pci_ids[] = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id hsw_uncore_pci_ids[] = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id bdw_uncore_pci_ids[] = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id skl_uncore_pci_ids[] = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_Y_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_U_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_HD_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_HQ_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_SD_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_SQ_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_Y_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_U_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_UQ_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_SD_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_SQ_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2U_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4U_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4H_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6H_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2S_D_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_D_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_D_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_D_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_W_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_W_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_W_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
+static struct pci_driver snb_uncore_pci_driver = {
+ .name = "snb_uncore",
+ .id_table = snb_uncore_pci_ids,
+};
+
+static struct pci_driver ivb_uncore_pci_driver = {
+ .name = "ivb_uncore",
+ .id_table = ivb_uncore_pci_ids,
+};
+
+static struct pci_driver hsw_uncore_pci_driver = {
+ .name = "hsw_uncore",
+ .id_table = hsw_uncore_pci_ids,
+};
+
+static struct pci_driver bdw_uncore_pci_driver = {
+ .name = "bdw_uncore",
+ .id_table = bdw_uncore_pci_ids,
+};
+
+static struct pci_driver skl_uncore_pci_driver = {
+ .name = "skl_uncore",
+ .id_table = skl_uncore_pci_ids,
+};
+
+struct imc_uncore_pci_dev {
+ __u32 pci_id;
+ struct pci_driver *driver;
+};
+#define IMC_DEV(a, d) \
+ { .pci_id = PCI_DEVICE_ID_INTEL_##a, .driver = (d) }
+
+static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
+ IMC_DEV(SNB_IMC, &snb_uncore_pci_driver),
+ IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver), /* 3rd Gen Core processor */
+ IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */
+ IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */
+ IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */
+ IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */
+ IMC_DEV(SKL_Y_IMC, &skl_uncore_pci_driver), /* 6th Gen Core Y */
+ IMC_DEV(SKL_U_IMC, &skl_uncore_pci_driver), /* 6th Gen Core U */
+ IMC_DEV(SKL_HD_IMC, &skl_uncore_pci_driver), /* 6th Gen Core H Dual Core */
+ IMC_DEV(SKL_HQ_IMC, &skl_uncore_pci_driver), /* 6th Gen Core H Quad Core */
+ IMC_DEV(SKL_SD_IMC, &skl_uncore_pci_driver), /* 6th Gen Core S Dual Core */
+ IMC_DEV(SKL_SQ_IMC, &skl_uncore_pci_driver), /* 6th Gen Core S Quad Core */
+ IMC_DEV(KBL_Y_IMC, &skl_uncore_pci_driver), /* 7th Gen Core Y */
+ IMC_DEV(KBL_U_IMC, &skl_uncore_pci_driver), /* 7th Gen Core U */
+ IMC_DEV(KBL_UQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core U Quad Core */
+ IMC_DEV(KBL_SD_IMC, &skl_uncore_pci_driver), /* 7th Gen Core S Dual Core */
+ IMC_DEV(KBL_SQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core S Quad Core */
+ IMC_DEV(CFL_2U_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U 2 Cores */
+ IMC_DEV(CFL_4U_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U 4 Cores */
+ IMC_DEV(CFL_4H_IMC, &skl_uncore_pci_driver), /* 8th Gen Core H 4 Cores */
+ IMC_DEV(CFL_6H_IMC, &skl_uncore_pci_driver), /* 8th Gen Core H 6 Cores */
+ IMC_DEV(CFL_2S_D_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 2 Cores Desktop */
+ IMC_DEV(CFL_4S_D_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 4 Cores Desktop */
+ IMC_DEV(CFL_6S_D_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 6 Cores Desktop */
+ IMC_DEV(CFL_8S_D_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 8 Cores Desktop */
+ IMC_DEV(CFL_4S_W_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 4 Cores Work Station */
+ IMC_DEV(CFL_6S_W_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 6 Cores Work Station */
+ IMC_DEV(CFL_8S_W_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 8 Cores Work Station */
+ IMC_DEV(CFL_4S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 4 Cores Server */
+ IMC_DEV(CFL_6S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 6 Cores Server */
+ IMC_DEV(CFL_8S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 8 Cores Server */
+ { /* end marker */ }
+};
+
+
+#define for_each_imc_pci_id(x, t) \
+ for (x = (t); (x)->pci_id; x++)
+
+static struct pci_driver *imc_uncore_find_dev(void)
+{
+ const struct imc_uncore_pci_dev *p;
+ int ret;
+
+ for_each_imc_pci_id(p, desktop_imc_pci_ids) {
+ ret = snb_pci2phy_map_init(p->pci_id);
+ if (ret == 0)
+ return p->driver;
+ }
+ return NULL;
+}
+
+static int imc_uncore_pci_init(void)
+{
+ struct pci_driver *imc_drv = imc_uncore_find_dev();
+
+ if (!imc_drv)
+ return -ENODEV;
+
+ uncore_pci_uncores = snb_pci_uncores;
+ uncore_pci_driver = imc_drv;
+
+ return 0;
+}
+
+int snb_uncore_pci_init(void)
+{
+ return imc_uncore_pci_init();
+}
+
+int ivb_uncore_pci_init(void)
+{
+ return imc_uncore_pci_init();
+}
+int hsw_uncore_pci_init(void)
+{
+ return imc_uncore_pci_init();
+}
+
+int bdw_uncore_pci_init(void)
+{
+ return imc_uncore_pci_init();
+}
+
+int skl_uncore_pci_init(void)
+{
+ return imc_uncore_pci_init();
+}
+
+/* end of Sandy Bridge uncore support */
+
+/* Nehalem uncore support */
+static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+ wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
+}
+
+static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->idx < UNCORE_PMC_IDX_FIXED)
+ wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
+ else
+ wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
+}
+
+static struct attribute *nhm_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask8.attr,
+ NULL,
+};
+
+static const struct attribute_group nhm_uncore_format_group = {
+ .name = "format",
+ .attrs = nhm_uncore_formats_attr,
+};
+
+static struct uncore_event_desc nhm_uncore_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
+ INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"),
+ INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhm_uncore_msr_ops = {
+ .disable_box = nhm_uncore_msr_disable_box,
+ .enable_box = nhm_uncore_msr_enable_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = nhm_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type nhm_uncore = {
+ .name = "",
+ .num_counters = 8,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .event_ctl = NHM_UNC_PERFEVTSEL0,
+ .perf_ctr = NHM_UNC_UNCORE_PMC0,
+ .fixed_ctr = NHM_UNC_FIXED_CTR,
+ .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL,
+ .event_mask = NHM_UNC_RAW_EVENT_MASK,
+ .event_descs = nhm_uncore_events,
+ .ops = &nhm_uncore_msr_ops,
+ .format_group = &nhm_uncore_format_group,
+};
+
+static struct intel_uncore_type *nhm_msr_uncores[] = {
+ &nhm_uncore,
+ NULL,
+};
+
+void nhm_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = nhm_msr_uncores;
+}
+
+/* end of Nehalem uncore support */
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
new file mode 100644
index 000000000..2bf1170f7
--- /dev/null
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -0,0 +1,3965 @@
+// SPDX-License-Identifier: GPL-2.0
+/* SandyBridge-EP/IvyTown uncore support */
+#include "uncore.h"
+
+/* SNB-EP pci bus to socket mapping */
+#define SNBEP_CPUNODEID 0x40
+#define SNBEP_GIDNIDMAP 0x54
+
+/* SNB-EP Box level control */
+#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0)
+#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1)
+#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8)
+#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16)
+#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \
+ SNBEP_PMON_BOX_CTL_RST_CTRS | \
+ SNBEP_PMON_BOX_CTL_FRZ_EN)
+/* SNB-EP event control */
+#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff
+#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00
+#define SNBEP_PMON_CTL_RST (1 << 17)
+#define SNBEP_PMON_CTL_EDGE_DET (1 << 18)
+#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21)
+#define SNBEP_PMON_CTL_EN (1 << 22)
+#define SNBEP_PMON_CTL_INVERT (1 << 23)
+#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000
+#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_INVERT | \
+ SNBEP_PMON_CTL_TRESH_MASK)
+
+/* SNB-EP Ubox event control */
+#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000
+#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_INVERT | \
+ SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
+
+#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19)
+#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_CBO_PMON_CTL_TID_EN)
+
+/* SNB-EP PCU event control */
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000
+#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30)
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31)
+#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_INVERT | \
+ SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+
+#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_PMON_CTL_EV_SEL_EXT)
+
+/* SNB-EP pci control register */
+#define SNBEP_PCI_PMON_BOX_CTL 0xf4
+#define SNBEP_PCI_PMON_CTL0 0xd8
+/* SNB-EP pci counter register */
+#define SNBEP_PCI_PMON_CTR0 0xa0
+
+/* SNB-EP home agent register */
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44
+#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48
+/* SNB-EP memory controller register */
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0
+/* SNB-EP QPI register */
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c
+
+/* SNB-EP Ubox register */
+#define SNBEP_U_MSR_PMON_CTR0 0xc16
+#define SNBEP_U_MSR_PMON_CTL0 0xc10
+
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09
+
+/* SNB-EP Cbo register */
+#define SNBEP_C0_MSR_PMON_CTR0 0xd16
+#define SNBEP_C0_MSR_PMON_CTL0 0xd10
+#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04
+#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14
+#define SNBEP_CBO_MSR_OFFSET 0x20
+
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID 0x1f
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID 0x3fc00
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE 0x7c0000
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC 0xff800000
+
+#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) { \
+ .event = (e), \
+ .msr = SNBEP_C0_MSR_PMON_BOX_FILTER, \
+ .config_mask = (m), \
+ .idx = (i) \
+}
+
+/* SNB-EP PCU register */
+#define SNBEP_PCU_MSR_PMON_CTR0 0xc36
+#define SNBEP_PCU_MSR_PMON_CTL0 0xc30
+#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff
+#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc
+#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd
+
+/* IVBEP event control */
+#define IVBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \
+ SNBEP_PMON_BOX_CTL_RST_CTRS)
+#define IVBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_TRESH_MASK)
+/* IVBEP Ubox */
+#define IVBEP_U_MSR_PMON_GLOBAL_CTL 0xc00
+#define IVBEP_U_PMON_GLOBAL_FRZ_ALL (1 << 31)
+#define IVBEP_U_PMON_GLOBAL_UNFRZ_ALL (1 << 29)
+
+#define IVBEP_U_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
+/* IVBEP Cbo */
+#define IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK (IVBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_CBO_PMON_CTL_TID_EN)
+
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_TID (0x1fULL << 0)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 5)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE (0x3fULL << 17)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63)
+
+/* IVBEP home agent */
+#define IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST (1 << 16)
+#define IVBEP_HA_PCI_PMON_RAW_EVENT_MASK \
+ (IVBEP_PMON_RAW_EVENT_MASK | \
+ IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST)
+/* IVBEP PCU */
+#define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+/* IVBEP QPI */
+#define IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK \
+ (IVBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_PMON_CTL_EV_SEL_EXT)
+
+#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \
+ ((1ULL << (n)) - 1)))
+
+/* Haswell-EP Ubox */
+#define HSWEP_U_MSR_PMON_CTR0 0x709
+#define HSWEP_U_MSR_PMON_CTL0 0x705
+#define HSWEP_U_MSR_PMON_FILTER 0x707
+
+#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTL 0x703
+#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTR 0x704
+
+#define HSWEP_U_MSR_PMON_BOX_FILTER_TID (0x1 << 0)
+#define HSWEP_U_MSR_PMON_BOX_FILTER_CID (0x1fULL << 1)
+#define HSWEP_U_MSR_PMON_BOX_FILTER_MASK \
+ (HSWEP_U_MSR_PMON_BOX_FILTER_TID | \
+ HSWEP_U_MSR_PMON_BOX_FILTER_CID)
+
+/* Haswell-EP CBo */
+#define HSWEP_C0_MSR_PMON_CTR0 0xe08
+#define HSWEP_C0_MSR_PMON_CTL0 0xe01
+#define HSWEP_C0_MSR_PMON_BOX_CTL 0xe00
+#define HSWEP_C0_MSR_PMON_BOX_FILTER0 0xe05
+#define HSWEP_CBO_MSR_OFFSET 0x10
+
+
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_TID (0x3fULL << 0)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 6)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE (0x7fULL << 17)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63)
+
+
+/* Haswell-EP Sbox */
+#define HSWEP_S0_MSR_PMON_CTR0 0x726
+#define HSWEP_S0_MSR_PMON_CTL0 0x721
+#define HSWEP_S0_MSR_PMON_BOX_CTL 0x720
+#define HSWEP_SBOX_MSR_OFFSET 0xa
+#define HSWEP_S_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_CBO_PMON_CTL_TID_EN)
+
+/* Haswell-EP PCU */
+#define HSWEP_PCU_MSR_PMON_CTR0 0x717
+#define HSWEP_PCU_MSR_PMON_CTL0 0x711
+#define HSWEP_PCU_MSR_PMON_BOX_CTL 0x710
+#define HSWEP_PCU_MSR_PMON_BOX_FILTER 0x715
+
+/* KNL Ubox */
+#define KNL_U_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_U_MSR_PMON_RAW_EVENT_MASK | \
+ SNBEP_CBO_PMON_CTL_TID_EN)
+/* KNL CHA */
+#define KNL_CHA_MSR_OFFSET 0xc
+#define KNL_CHA_MSR_PMON_CTL_QOR (1 << 16)
+#define KNL_CHA_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK | \
+ KNL_CHA_MSR_PMON_CTL_QOR)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_TID 0x1ff
+#define KNL_CHA_MSR_PMON_BOX_FILTER_STATE (7 << 18)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_OP (0xfffffe2aULL << 32)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_REMOTE_NODE (0x1ULL << 32)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_LOCAL_NODE (0x1ULL << 33)
+#define KNL_CHA_MSR_PMON_BOX_FILTER_NNC (0x1ULL << 37)
+
+/* KNL EDC/MC UCLK */
+#define KNL_UCLK_MSR_PMON_CTR0_LOW 0x400
+#define KNL_UCLK_MSR_PMON_CTL0 0x420
+#define KNL_UCLK_MSR_PMON_BOX_CTL 0x430
+#define KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW 0x44c
+#define KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL 0x454
+#define KNL_PMON_FIXED_CTL_EN 0x1
+
+/* KNL EDC */
+#define KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW 0xa00
+#define KNL_EDC0_ECLK_MSR_PMON_CTL0 0xa20
+#define KNL_EDC0_ECLK_MSR_PMON_BOX_CTL 0xa30
+#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW 0xa3c
+#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL 0xa44
+
+/* KNL MC */
+#define KNL_MC0_CH0_MSR_PMON_CTR0_LOW 0xb00
+#define KNL_MC0_CH0_MSR_PMON_CTL0 0xb20
+#define KNL_MC0_CH0_MSR_PMON_BOX_CTL 0xb30
+#define KNL_MC0_CH0_MSR_PMON_FIXED_LOW 0xb3c
+#define KNL_MC0_CH0_MSR_PMON_FIXED_CTL 0xb44
+
+/* KNL IRP */
+#define KNL_IRP_PCI_PMON_BOX_CTL 0xf0
+#define KNL_IRP_PCI_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
+ KNL_CHA_MSR_PMON_CTL_QOR)
+/* KNL PCU */
+#define KNL_PCU_PMON_CTL_EV_SEL_MASK 0x0000007f
+#define KNL_PCU_PMON_CTL_USE_OCC_CTR (1 << 7)
+#define KNL_PCU_MSR_PMON_CTL_TRESH_MASK 0x3f000000
+#define KNL_PCU_MSR_PMON_RAW_EVENT_MASK \
+ (KNL_PCU_PMON_CTL_EV_SEL_MASK | \
+ KNL_PCU_PMON_CTL_USE_OCC_CTR | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_CBO_PMON_CTL_TID_EN | \
+ SNBEP_PMON_CTL_INVERT | \
+ KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+
+/* SKX pci bus to socket mapping */
+#define SKX_CPUNODEID 0xc0
+#define SKX_GIDNIDMAP 0xd4
+
+/* SKX CHA */
+#define SKX_CHA_MSR_PMON_BOX_FILTER_TID (0x1ffULL << 0)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_LINK (0xfULL << 9)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_STATE (0x3ffULL << 17)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_REM (0x1ULL << 32)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_LOC (0x1ULL << 33)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_ALL_OPC (0x1ULL << 35)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_NM (0x1ULL << 36)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_NOT_NM (0x1ULL << 37)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_OPC0 (0x3ffULL << 41)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_OPC1 (0x3ffULL << 51)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63)
+
+/* SKX IIO */
+#define SKX_IIO0_MSR_PMON_CTL0 0xa48
+#define SKX_IIO0_MSR_PMON_CTR0 0xa41
+#define SKX_IIO0_MSR_PMON_BOX_CTL 0xa40
+#define SKX_IIO_MSR_OFFSET 0x20
+
+#define SKX_PMON_CTL_TRESH_MASK (0xff << 24)
+#define SKX_PMON_CTL_TRESH_MASK_EXT (0xf)
+#define SKX_PMON_CTL_CH_MASK (0xff << 4)
+#define SKX_PMON_CTL_FC_MASK (0x7 << 12)
+#define SKX_IIO_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_INVERT | \
+ SKX_PMON_CTL_TRESH_MASK)
+#define SKX_IIO_PMON_RAW_EVENT_MASK_EXT (SKX_PMON_CTL_TRESH_MASK_EXT | \
+ SKX_PMON_CTL_CH_MASK | \
+ SKX_PMON_CTL_FC_MASK)
+
+/* SKX IRP */
+#define SKX_IRP0_MSR_PMON_CTL0 0xa5b
+#define SKX_IRP0_MSR_PMON_CTR0 0xa59
+#define SKX_IRP0_MSR_PMON_BOX_CTL 0xa58
+#define SKX_IRP_MSR_OFFSET 0x20
+
+/* SKX UPI */
+#define SKX_UPI_PCI_PMON_CTL0 0x350
+#define SKX_UPI_PCI_PMON_CTR0 0x318
+#define SKX_UPI_PCI_PMON_BOX_CTL 0x378
+#define SKX_UPI_CTL_UMASK_EXT 0xffefff
+
+/* SKX M2M */
+#define SKX_M2M_PCI_PMON_CTL0 0x228
+#define SKX_M2M_PCI_PMON_CTR0 0x200
+#define SKX_M2M_PCI_PMON_BOX_CTL 0x258
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
+DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
+DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-43,45-55");
+DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh9, thresh, "config:24-35");
+DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29");
+DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
+DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
+DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
+DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
+DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31");
+DEFINE_UNCORE_FORMAT_ATTR(ch_mask, ch_mask, "config:36-43");
+DEFINE_UNCORE_FORMAT_ATTR(fc_mask, fc_mask, "config:44-46");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state5, filter_state, "config1:17-26");
+DEFINE_UNCORE_FORMAT_ATTR(filter_rem, filter_rem, "config1:32");
+DEFINE_UNCORE_FORMAT_ATTR(filter_loc, filter_loc, "config1:33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nm, filter_nm, "config1:36");
+DEFINE_UNCORE_FORMAT_ATTR(filter_not_nm, filter_not_nm, "config1:37");
+DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc_0, filter_opc0, "config1:41-50");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc_1, filter_opc1, "config1:51-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62");
+DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61");
+DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51");
+DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35");
+DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31");
+DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17");
+DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12");
+DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4");
+DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31");
+DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17");
+DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12");
+DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
+DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
+
+static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+ u32 config = 0;
+
+ if (!pci_read_config_dword(pdev, box_ctl, &config)) {
+ config |= SNBEP_PMON_BOX_CTL_FRZ;
+ pci_write_config_dword(pdev, box_ctl, config);
+ }
+}
+
+static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+ u32 config = 0;
+
+ if (!pci_read_config_dword(pdev, box_ctl, &config)) {
+ config &= ~SNBEP_PMON_BOX_CTL_FRZ;
+ pci_write_config_dword(pdev, box_ctl, config);
+ }
+}
+
+static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, hwc->config_base, hwc->config);
+}
+
+static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 count = 0;
+
+ pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
+ pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
+
+ return count;
+}
+
+static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+
+ pci_write_config_dword(pdev, box_ctl, SNBEP_PMON_BOX_CTL_INT);
+}
+
+static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+ u64 config;
+ unsigned msr;
+
+ msr = uncore_msr_box_ctl(box);
+ if (msr) {
+ rdmsrl(msr, config);
+ config |= SNBEP_PMON_BOX_CTL_FRZ;
+ wrmsrl(msr, config);
+ }
+}
+
+static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ u64 config;
+ unsigned msr;
+
+ msr = uncore_msr_box_ctl(box);
+ if (msr) {
+ rdmsrl(msr, config);
+ config &= ~SNBEP_PMON_BOX_CTL_FRZ;
+ wrmsrl(msr, config);
+ }
+}
+
+static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE)
+ wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0));
+
+ wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrl(hwc->config_base, hwc->config);
+}
+
+static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ unsigned msr = uncore_msr_box_ctl(box);
+
+ if (msr)
+ wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT);
+}
+
+static struct attribute *snbep_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute *snbep_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ NULL,
+};
+
+static struct attribute *snbep_uncore_cbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid.attr,
+ &format_attr_filter_nid.attr,
+ &format_attr_filter_state.attr,
+ &format_attr_filter_opc.attr,
+ NULL,
+};
+
+static struct attribute *snbep_uncore_pcu_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_occ_sel.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ &format_attr_occ_invert.attr,
+ &format_attr_occ_edge.attr,
+ &format_attr_filter_band0.attr,
+ &format_attr_filter_band1.attr,
+ &format_attr_filter_band2.attr,
+ &format_attr_filter_band3.attr,
+ NULL,
+};
+
+static struct attribute *snbep_uncore_qpi_formats_attr[] = {
+ &format_attr_event_ext.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_match_rds.attr,
+ &format_attr_match_rnid30.attr,
+ &format_attr_match_rnid4.attr,
+ &format_attr_match_dnid.attr,
+ &format_attr_match_mc.attr,
+ &format_attr_match_opc.attr,
+ &format_attr_match_vnw.attr,
+ &format_attr_match0.attr,
+ &format_attr_match1.attr,
+ &format_attr_mask_rds.attr,
+ &format_attr_mask_rnid30.attr,
+ &format_attr_mask_rnid4.attr,
+ &format_attr_mask_dnid.attr,
+ &format_attr_mask_mc.attr,
+ &format_attr_mask_opc.attr,
+ &format_attr_mask_vnw.attr,
+ &format_attr_mask0.attr,
+ &format_attr_mask1.attr,
+ NULL,
+};
+
+static struct uncore_event_desc snbep_uncore_imc_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
+ { /* end: all zeroes */ },
+};
+
+static struct uncore_event_desc snbep_uncore_qpi_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"),
+ INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
+ INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"),
+ INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"),
+ { /* end: all zeroes */ },
+};
+
+static const struct attribute_group snbep_uncore_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_formats_attr,
+};
+
+static const struct attribute_group snbep_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_ubox_formats_attr,
+};
+
+static const struct attribute_group snbep_uncore_cbox_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_cbox_formats_attr,
+};
+
+static const struct attribute_group snbep_uncore_pcu_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_pcu_formats_attr,
+};
+
+static const struct attribute_group snbep_uncore_qpi_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_qpi_formats_attr,
+};
+
+#define __SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \
+ .disable_box = snbep_uncore_msr_disable_box, \
+ .enable_box = snbep_uncore_msr_enable_box, \
+ .disable_event = snbep_uncore_msr_disable_event, \
+ .enable_event = snbep_uncore_msr_enable_event, \
+ .read_counter = uncore_msr_read_counter
+
+#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \
+ __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), \
+ .init_box = snbep_uncore_msr_init_box \
+
+static struct intel_uncore_ops snbep_uncore_msr_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+};
+
+#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \
+ .init_box = snbep_uncore_pci_init_box, \
+ .disable_box = snbep_uncore_pci_disable_box, \
+ .enable_box = snbep_uncore_pci_enable_box, \
+ .disable_event = snbep_uncore_pci_disable_event, \
+ .read_counter = snbep_uncore_pci_read_counter
+
+static struct intel_uncore_ops snbep_uncore_pci_ops = {
+ SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+ .enable_event = snbep_uncore_pci_enable_event, \
+};
+
+static struct event_constraint snbep_uncore_cbox_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x1f, 0xe),
+ UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2a, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type snbep_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = SNBEP_U_MSR_PMON_CTR0,
+ .event_ctl = SNBEP_U_MSR_PMON_CTL0,
+ .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &snbep_uncore_msr_ops,
+ .format_group = &snbep_uncore_ubox_format_group,
+};
+
+static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+ SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2),
+ EVENT_EXTRA_END
+};
+
+static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+ int i;
+
+ if (uncore_box_is_fake(box))
+ return;
+
+ for (i = 0; i < 5; i++) {
+ if (reg1->alloc & (0x1 << i))
+ atomic_sub(1 << (i * 6), &er->ref);
+ }
+ reg1->alloc = 0;
+}
+
+static struct event_constraint *
+__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event,
+ u64 (*cbox_filter_mask)(int fields))
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+ int i, alloc = 0;
+ unsigned long flags;
+ u64 mask;
+
+ if (reg1->idx == EXTRA_REG_NONE)
+ return NULL;
+
+ raw_spin_lock_irqsave(&er->lock, flags);
+ for (i = 0; i < 5; i++) {
+ if (!(reg1->idx & (0x1 << i)))
+ continue;
+ if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
+ continue;
+
+ mask = cbox_filter_mask(0x1 << i);
+ if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) ||
+ !((reg1->config ^ er->config) & mask)) {
+ atomic_add(1 << (i * 6), &er->ref);
+ er->config &= ~mask;
+ er->config |= reg1->config & mask;
+ alloc |= (0x1 << i);
+ } else {
+ break;
+ }
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+ if (i < 5)
+ goto fail;
+
+ if (!uncore_box_is_fake(box))
+ reg1->alloc |= alloc;
+
+ return NULL;
+fail:
+ for (; i >= 0; i--) {
+ if (alloc & (0x1 << i))
+ atomic_sub(1 << (i * 6), &er->ref);
+ }
+ return &uncore_constraint_empty;
+}
+
+static u64 snbep_cbox_filter_mask(int fields)
+{
+ u64 mask = 0;
+
+ if (fields & 0x1)
+ mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID;
+ if (fields & 0x2)
+ mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID;
+ if (fields & 0x4)
+ mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
+ if (fields & 0x8)
+ mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
+
+ return mask;
+}
+
+static struct event_constraint *
+snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask);
+}
+
+static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct extra_reg *er;
+ int idx = 0;
+
+ for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ idx |= er->idx;
+ }
+
+ if (idx) {
+ reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
+ SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx);
+ reg1->idx = idx;
+ }
+ return 0;
+}
+
+static struct intel_uncore_ops snbep_uncore_cbox_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = snbep_cbox_hw_config,
+ .get_constraint = snbep_cbox_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type snbep_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 44,
+ .event_ctl = SNBEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = SNBEP_C0_MSR_PMON_CTR0,
+ .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = SNBEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = snbep_uncore_cbox_constraints,
+ .ops = &snbep_uncore_cbox_ops,
+ .format_group = &snbep_uncore_cbox_format_group,
+};
+
+static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ u64 config = reg1->config;
+
+ if (new_idx > reg1->idx)
+ config <<= 8 * (new_idx - reg1->idx);
+ else
+ config >>= 8 * (reg1->idx - new_idx);
+
+ if (modify) {
+ hwc->config += new_idx - reg1->idx;
+ reg1->config = config;
+ reg1->idx = new_idx;
+ }
+ return config;
+}
+
+static struct event_constraint *
+snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+ unsigned long flags;
+ int idx = reg1->idx;
+ u64 mask, config1 = reg1->config;
+ bool ok = false;
+
+ if (reg1->idx == EXTRA_REG_NONE ||
+ (!uncore_box_is_fake(box) && reg1->alloc))
+ return NULL;
+again:
+ mask = 0xffULL << (idx * 8);
+ raw_spin_lock_irqsave(&er->lock, flags);
+ if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) ||
+ !((config1 ^ er->config) & mask)) {
+ atomic_add(1 << (idx * 8), &er->ref);
+ er->config &= ~mask;
+ er->config |= config1 & mask;
+ ok = true;
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ if (!ok) {
+ idx = (idx + 1) % 4;
+ if (idx != reg1->idx) {
+ config1 = snbep_pcu_alter_er(event, idx, false);
+ goto again;
+ }
+ return &uncore_constraint_empty;
+ }
+
+ if (!uncore_box_is_fake(box)) {
+ if (idx != reg1->idx)
+ snbep_pcu_alter_er(event, idx, true);
+ reg1->alloc = 1;
+ }
+ return NULL;
+}
+
+static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+
+ if (uncore_box_is_fake(box) || !reg1->alloc)
+ return;
+
+ atomic_sub(1 << (reg1->idx * 8), &er->ref);
+ reg1->alloc = 0;
+}
+
+static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
+
+ if (ev_sel >= 0xb && ev_sel <= 0xe) {
+ reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
+ reg1->idx = ev_sel - 0xb;
+ reg1->config = event->attr.config1 & (0xff << (reg1->idx * 8));
+ }
+ return 0;
+}
+
+static struct intel_uncore_ops snbep_uncore_pcu_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = snbep_pcu_hw_config,
+ .get_constraint = snbep_pcu_get_constraint,
+ .put_constraint = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type snbep_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = SNBEP_PCU_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &snbep_uncore_pcu_ops,
+ .format_group = &snbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *snbep_msr_uncores[] = {
+ &snbep_uncore_ubox,
+ &snbep_uncore_cbox,
+ &snbep_uncore_pcu,
+ NULL,
+};
+
+void snbep_uncore_cpu_init(void)
+{
+ if (snbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+ snbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+ uncore_msr_uncores = snbep_msr_uncores;
+}
+
+enum {
+ SNBEP_PCI_QPI_PORT0_FILTER,
+ SNBEP_PCI_QPI_PORT1_FILTER,
+ BDX_PCI_QPI_PORT2_FILTER,
+};
+
+static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) {
+ reg1->idx = 0;
+ reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0;
+ reg1->config = event->attr.config1;
+ reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0;
+ reg2->config = event->attr.config2;
+ }
+ return 0;
+}
+
+static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER;
+ int pkg = box->pkgid;
+ struct pci_dev *filter_pdev = uncore_extra_pci_dev[pkg].dev[idx];
+
+ if (filter_pdev) {
+ pci_write_config_dword(filter_pdev, reg1->reg,
+ (u32)reg1->config);
+ pci_write_config_dword(filter_pdev, reg1->reg + 4,
+ (u32)(reg1->config >> 32));
+ pci_write_config_dword(filter_pdev, reg2->reg,
+ (u32)reg2->config);
+ pci_write_config_dword(filter_pdev, reg2->reg + 4,
+ (u32)(reg2->config >> 32));
+ }
+ }
+
+ pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops snbep_uncore_qpi_ops = {
+ SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+ .enable_event = snbep_qpi_enable_event,
+ .hw_config = snbep_qpi_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+#define SNBEP_UNCORE_PCI_COMMON_INIT() \
+ .perf_ctr = SNBEP_PCI_PMON_CTR0, \
+ .event_ctl = SNBEP_PCI_PMON_CTL0, \
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \
+ .ops = &snbep_uncore_pci_ops, \
+ .format_group = &snbep_uncore_format_group
+
+static struct intel_uncore_type snbep_uncore_ha = {
+ .name = "ha",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = snbep_uncore_imc_events,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_qpi = {
+ .name = "qpi",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &snbep_uncore_qpi_ops,
+ .event_descs = snbep_uncore_qpi_events,
+ .format_group = &snbep_uncore_qpi_format_group,
+};
+
+
+static struct intel_uncore_type snbep_uncore_r2pcie = {
+ .name = "r2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .constraints = snbep_uncore_r2pcie_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_r3qpi = {
+ .name = "r3qpi",
+ .num_counters = 3,
+ .num_boxes = 2,
+ .perf_ctr_bits = 44,
+ .constraints = snbep_uncore_r3qpi_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+ SNBEP_PCI_UNCORE_HA,
+ SNBEP_PCI_UNCORE_IMC,
+ SNBEP_PCI_UNCORE_QPI,
+ SNBEP_PCI_UNCORE_R2PCIE,
+ SNBEP_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *snbep_pci_uncores[] = {
+ [SNBEP_PCI_UNCORE_HA] = &snbep_uncore_ha,
+ [SNBEP_PCI_UNCORE_IMC] = &snbep_uncore_imc,
+ [SNBEP_PCI_UNCORE_QPI] = &snbep_uncore_qpi,
+ [SNBEP_PCI_UNCORE_R2PCIE] = &snbep_uncore_r2pcie,
+ [SNBEP_PCI_UNCORE_R3QPI] = &snbep_uncore_r3qpi,
+ NULL,
+};
+
+static const struct pci_device_id snbep_uncore_pci_ids[] = {
+ { /* Home Agent */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0),
+ },
+ { /* MC Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1),
+ },
+ { /* MC Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3),
+ },
+ { /* QPI Port 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0),
+ },
+ { /* QPI Port 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1),
+ },
+ { /* R2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0),
+ },
+ { /* R3QPI Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0),
+ },
+ { /* R3QPI Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT0_FILTER),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT1_FILTER),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver snbep_uncore_pci_driver = {
+ .name = "snbep_uncore",
+ .id_table = snbep_uncore_pci_ids,
+};
+
+#define NODE_ID_MASK 0x7
+
+/*
+ * build pci bus to socket mapping
+ */
+static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse)
+{
+ struct pci_dev *ubox_dev = NULL;
+ int i, bus, nodeid, segment;
+ struct pci2phy_map *map;
+ int err = 0;
+ u32 config = 0;
+
+ while (1) {
+ /* find the UBOX device */
+ ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev);
+ if (!ubox_dev)
+ break;
+ bus = ubox_dev->bus->number;
+ /* get the Node ID of the local register */
+ err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
+ if (err)
+ break;
+ nodeid = config & NODE_ID_MASK;
+ /* get the Node ID mapping */
+ err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
+ if (err)
+ break;
+
+ segment = pci_domain_nr(ubox_dev->bus);
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ err = -ENOMEM;
+ break;
+ }
+
+ /*
+ * every three bits in the Node ID mapping register maps
+ * to a particular node.
+ */
+ for (i = 0; i < 8; i++) {
+ if (nodeid == ((config >> (3 * i)) & 0x7)) {
+ map->pbus_to_physid[bus] = i;
+ break;
+ }
+ }
+ raw_spin_unlock(&pci2phy_map_lock);
+ }
+
+ if (!err) {
+ /*
+ * For PCI bus with no UBOX device, find the next bus
+ * that has UBOX device and use its mapping.
+ */
+ raw_spin_lock(&pci2phy_map_lock);
+ list_for_each_entry(map, &pci2phy_map_head, list) {
+ i = -1;
+ if (reverse) {
+ for (bus = 255; bus >= 0; bus--) {
+ if (map->pbus_to_physid[bus] >= 0)
+ i = map->pbus_to_physid[bus];
+ else
+ map->pbus_to_physid[bus] = i;
+ }
+ } else {
+ for (bus = 0; bus <= 255; bus++) {
+ if (map->pbus_to_physid[bus] >= 0)
+ i = map->pbus_to_physid[bus];
+ else
+ map->pbus_to_physid[bus] = i;
+ }
+ }
+ }
+ raw_spin_unlock(&pci2phy_map_lock);
+ }
+
+ pci_dev_put(ubox_dev);
+
+ return err ? pcibios_err_to_errno(err) : 0;
+}
+
+int snbep_uncore_pci_init(void)
+{
+ int ret = snbep_pci2phy_map_init(0x3ce0, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
+ if (ret)
+ return ret;
+ uncore_pci_uncores = snbep_pci_uncores;
+ uncore_pci_driver = &snbep_uncore_pci_driver;
+ return 0;
+}
+/* end of Sandy Bridge-EP uncore support */
+
+/* IvyTown uncore support */
+static void ivbep_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ unsigned msr = uncore_msr_box_ctl(box);
+ if (msr)
+ wrmsrl(msr, IVBEP_PMON_BOX_CTL_INT);
+}
+
+static void ivbep_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+
+ pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
+}
+
+#define IVBEP_UNCORE_MSR_OPS_COMMON_INIT() \
+ .init_box = ivbep_uncore_msr_init_box, \
+ .disable_box = snbep_uncore_msr_disable_box, \
+ .enable_box = snbep_uncore_msr_enable_box, \
+ .disable_event = snbep_uncore_msr_disable_event, \
+ .enable_event = snbep_uncore_msr_enable_event, \
+ .read_counter = uncore_msr_read_counter
+
+static struct intel_uncore_ops ivbep_uncore_msr_ops = {
+ IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+};
+
+static struct intel_uncore_ops ivbep_uncore_pci_ops = {
+ .init_box = ivbep_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_uncore_pci_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+};
+
+#define IVBEP_UNCORE_PCI_COMMON_INIT() \
+ .perf_ctr = SNBEP_PCI_PMON_CTR0, \
+ .event_ctl = SNBEP_PCI_PMON_CTL0, \
+ .event_mask = IVBEP_PMON_RAW_EVENT_MASK, \
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \
+ .ops = &ivbep_uncore_pci_ops, \
+ .format_group = &ivbep_uncore_format_group
+
+static struct attribute *ivbep_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute *ivbep_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ NULL,
+};
+
+static struct attribute *ivbep_uncore_cbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid.attr,
+ &format_attr_filter_link.attr,
+ &format_attr_filter_state2.attr,
+ &format_attr_filter_nid2.attr,
+ &format_attr_filter_opc2.attr,
+ &format_attr_filter_nc.attr,
+ &format_attr_filter_c6.attr,
+ &format_attr_filter_isoc.attr,
+ NULL,
+};
+
+static struct attribute *ivbep_uncore_pcu_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_occ_sel.attr,
+ &format_attr_edge.attr,
+ &format_attr_thresh5.attr,
+ &format_attr_occ_invert.attr,
+ &format_attr_occ_edge.attr,
+ &format_attr_filter_band0.attr,
+ &format_attr_filter_band1.attr,
+ &format_attr_filter_band2.attr,
+ &format_attr_filter_band3.attr,
+ NULL,
+};
+
+static struct attribute *ivbep_uncore_qpi_formats_attr[] = {
+ &format_attr_event_ext.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_match_rds.attr,
+ &format_attr_match_rnid30.attr,
+ &format_attr_match_rnid4.attr,
+ &format_attr_match_dnid.attr,
+ &format_attr_match_mc.attr,
+ &format_attr_match_opc.attr,
+ &format_attr_match_vnw.attr,
+ &format_attr_match0.attr,
+ &format_attr_match1.attr,
+ &format_attr_mask_rds.attr,
+ &format_attr_mask_rnid30.attr,
+ &format_attr_mask_rnid4.attr,
+ &format_attr_mask_dnid.attr,
+ &format_attr_mask_mc.attr,
+ &format_attr_mask_opc.attr,
+ &format_attr_mask_vnw.attr,
+ &format_attr_mask0.attr,
+ &format_attr_mask1.attr,
+ NULL,
+};
+
+static const struct attribute_group ivbep_uncore_format_group = {
+ .name = "format",
+ .attrs = ivbep_uncore_formats_attr,
+};
+
+static const struct attribute_group ivbep_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = ivbep_uncore_ubox_formats_attr,
+};
+
+static const struct attribute_group ivbep_uncore_cbox_format_group = {
+ .name = "format",
+ .attrs = ivbep_uncore_cbox_formats_attr,
+};
+
+static const struct attribute_group ivbep_uncore_pcu_format_group = {
+ .name = "format",
+ .attrs = ivbep_uncore_pcu_formats_attr,
+};
+
+static const struct attribute_group ivbep_uncore_qpi_format_group = {
+ .name = "format",
+ .attrs = ivbep_uncore_qpi_formats_attr,
+};
+
+static struct intel_uncore_type ivbep_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = SNBEP_U_MSR_PMON_CTR0,
+ .event_ctl = SNBEP_U_MSR_PMON_CTL0,
+ .event_mask = IVBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &ivbep_uncore_ubox_format_group,
+};
+
+static struct extra_reg ivbep_uncore_cbox_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+ SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
+ EVENT_EXTRA_END
+};
+
+static u64 ivbep_cbox_filter_mask(int fields)
+{
+ u64 mask = 0;
+
+ if (fields & 0x1)
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_TID;
+ if (fields & 0x2)
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK;
+ if (fields & 0x4)
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
+ if (fields & 0x8)
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NID;
+ if (fields & 0x10) {
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NC;
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_C6;
+ mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC;
+ }
+
+ return mask;
+}
+
+static struct event_constraint *
+ivbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, ivbep_cbox_filter_mask);
+}
+
+static int ivbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct extra_reg *er;
+ int idx = 0;
+
+ for (er = ivbep_uncore_cbox_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ idx |= er->idx;
+ }
+
+ if (idx) {
+ reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
+ SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & ivbep_cbox_filter_mask(idx);
+ reg1->idx = idx;
+ }
+ return 0;
+}
+
+static void ivbep_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ u64 filter = uncore_shared_reg_config(box, 0);
+ wrmsrl(reg1->reg, filter & 0xffffffff);
+ wrmsrl(reg1->reg + 6, filter >> 32);
+ }
+
+ wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops ivbep_uncore_cbox_ops = {
+ .init_box = ivbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = ivbep_cbox_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = ivbep_cbox_hw_config,
+ .get_constraint = ivbep_cbox_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 4,
+ .num_boxes = 15,
+ .perf_ctr_bits = 44,
+ .event_ctl = SNBEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = SNBEP_C0_MSR_PMON_CTR0,
+ .event_mask = IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = SNBEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = snbep_uncore_cbox_constraints,
+ .ops = &ivbep_uncore_cbox_ops,
+ .format_group = &ivbep_uncore_cbox_format_group,
+};
+
+static struct intel_uncore_ops ivbep_uncore_pcu_ops = {
+ IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = snbep_pcu_hw_config,
+ .get_constraint = snbep_pcu_get_constraint,
+ .put_constraint = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = SNBEP_PCU_MSR_PMON_CTL0,
+ .event_mask = IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &ivbep_uncore_pcu_ops,
+ .format_group = &ivbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *ivbep_msr_uncores[] = {
+ &ivbep_uncore_ubox,
+ &ivbep_uncore_cbox,
+ &ivbep_uncore_pcu,
+ NULL,
+};
+
+void ivbep_uncore_cpu_init(void)
+{
+ if (ivbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+ ivbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+ uncore_msr_uncores = ivbep_msr_uncores;
+}
+
+static struct intel_uncore_type ivbep_uncore_ha = {
+ .name = "ha",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type ivbep_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = snbep_uncore_imc_events,
+ IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+/* registers in IRP boxes are not properly aligned */
+static unsigned ivbep_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4};
+static unsigned ivbep_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0};
+
+static void ivbep_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx],
+ hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void ivbep_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], hwc->config);
+}
+
+static u64 ivbep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 count = 0;
+
+ pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
+ pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
+
+ return count;
+}
+
+static struct intel_uncore_ops ivbep_uncore_irp_ops = {
+ .init_box = ivbep_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = ivbep_uncore_irp_disable_event,
+ .enable_event = ivbep_uncore_irp_enable_event,
+ .read_counter = ivbep_uncore_irp_read_counter,
+};
+
+static struct intel_uncore_type ivbep_uncore_irp = {
+ .name = "irp",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_mask = IVBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivbep_uncore_irp_ops,
+ .format_group = &ivbep_uncore_format_group,
+};
+
+static struct intel_uncore_ops ivbep_uncore_qpi_ops = {
+ .init_box = ivbep_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_qpi_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+ .hw_config = snbep_qpi_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_qpi = {
+ .name = "qpi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &ivbep_uncore_qpi_ops,
+ .format_group = &ivbep_uncore_qpi_format_group,
+};
+
+static struct intel_uncore_type ivbep_uncore_r2pcie = {
+ .name = "r2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .constraints = snbep_uncore_r2pcie_constraints,
+ IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type ivbep_uncore_r3qpi = {
+ .name = "r3qpi",
+ .num_counters = 3,
+ .num_boxes = 2,
+ .perf_ctr_bits = 44,
+ .constraints = snbep_uncore_r3qpi_constraints,
+ IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+ IVBEP_PCI_UNCORE_HA,
+ IVBEP_PCI_UNCORE_IMC,
+ IVBEP_PCI_UNCORE_IRP,
+ IVBEP_PCI_UNCORE_QPI,
+ IVBEP_PCI_UNCORE_R2PCIE,
+ IVBEP_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *ivbep_pci_uncores[] = {
+ [IVBEP_PCI_UNCORE_HA] = &ivbep_uncore_ha,
+ [IVBEP_PCI_UNCORE_IMC] = &ivbep_uncore_imc,
+ [IVBEP_PCI_UNCORE_IRP] = &ivbep_uncore_irp,
+ [IVBEP_PCI_UNCORE_QPI] = &ivbep_uncore_qpi,
+ [IVBEP_PCI_UNCORE_R2PCIE] = &ivbep_uncore_r2pcie,
+ [IVBEP_PCI_UNCORE_R3QPI] = &ivbep_uncore_r3qpi,
+ NULL,
+};
+
+static const struct pci_device_id ivbep_uncore_pci_ids[] = {
+ { /* Home Agent 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 0),
+ },
+ { /* Home Agent 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 1),
+ },
+ { /* MC0 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC0 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 1),
+ },
+ { /* MC0 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC0 Channel 4 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 3),
+ },
+ { /* MC1 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 4),
+ },
+ { /* MC1 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 5),
+ },
+ { /* MC1 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 6),
+ },
+ { /* MC1 Channel 4 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 7),
+ },
+ { /* IRP */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IRP, 0),
+ },
+ { /* QPI0 Port 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 0),
+ },
+ { /* QPI0 Port 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 1),
+ },
+ { /* QPI1 Port 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 2),
+ },
+ { /* R2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R2PCIE, 0),
+ },
+ { /* R3QPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 0),
+ },
+ { /* R3QPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 1),
+ },
+ { /* R3QPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 2),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT0_FILTER),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT1_FILTER),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver ivbep_uncore_pci_driver = {
+ .name = "ivbep_uncore",
+ .id_table = ivbep_uncore_pci_ids,
+};
+
+int ivbep_uncore_pci_init(void)
+{
+ int ret = snbep_pci2phy_map_init(0x0e1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
+ if (ret)
+ return ret;
+ uncore_pci_uncores = ivbep_pci_uncores;
+ uncore_pci_driver = &ivbep_uncore_pci_driver;
+ return 0;
+}
+/* end of IvyTown uncore support */
+
+/* KNL uncore support */
+static struct attribute *knl_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ NULL,
+};
+
+static const struct attribute_group knl_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = knl_uncore_ubox_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = HSWEP_U_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_U_MSR_PMON_CTL0,
+ .event_mask = KNL_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &snbep_uncore_msr_ops,
+ .format_group = &knl_uncore_ubox_format_group,
+};
+
+static struct attribute *knl_uncore_cha_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_qor.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid4.attr,
+ &format_attr_filter_link3.attr,
+ &format_attr_filter_state4.attr,
+ &format_attr_filter_local.attr,
+ &format_attr_filter_all_op.attr,
+ &format_attr_filter_nnm.attr,
+ &format_attr_filter_opc3.attr,
+ &format_attr_filter_nc.attr,
+ &format_attr_filter_isoc.attr,
+ NULL,
+};
+
+static const struct attribute_group knl_uncore_cha_format_group = {
+ .name = "format",
+ .attrs = knl_uncore_cha_formats_attr,
+};
+
+static struct event_constraint knl_uncore_cha_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x1f, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg knl_uncore_cha_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+ SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x3d, 0xff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x4),
+ EVENT_EXTRA_END
+};
+
+static u64 knl_cha_filter_mask(int fields)
+{
+ u64 mask = 0;
+
+ if (fields & 0x1)
+ mask |= KNL_CHA_MSR_PMON_BOX_FILTER_TID;
+ if (fields & 0x2)
+ mask |= KNL_CHA_MSR_PMON_BOX_FILTER_STATE;
+ if (fields & 0x4)
+ mask |= KNL_CHA_MSR_PMON_BOX_FILTER_OP;
+ return mask;
+}
+
+static struct event_constraint *
+knl_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, knl_cha_filter_mask);
+}
+
+static int knl_cha_hw_config(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct extra_reg *er;
+ int idx = 0;
+
+ for (er = knl_uncore_cha_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ idx |= er->idx;
+ }
+
+ if (idx) {
+ reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+ KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & knl_cha_filter_mask(idx);
+
+ reg1->config |= KNL_CHA_MSR_PMON_BOX_FILTER_REMOTE_NODE;
+ reg1->config |= KNL_CHA_MSR_PMON_BOX_FILTER_LOCAL_NODE;
+ reg1->config |= KNL_CHA_MSR_PMON_BOX_FILTER_NNC;
+ reg1->idx = idx;
+ }
+ return 0;
+}
+
+static void hswep_cbox_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event);
+
+static struct intel_uncore_ops knl_uncore_cha_ops = {
+ .init_box = snbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = hswep_cbox_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = knl_cha_hw_config,
+ .get_constraint = knl_cha_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type knl_uncore_cha = {
+ .name = "cha",
+ .num_counters = 4,
+ .num_boxes = 38,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
+ .event_mask = KNL_CHA_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = KNL_CHA_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = knl_uncore_cha_constraints,
+ .ops = &knl_uncore_cha_ops,
+ .format_group = &knl_uncore_cha_format_group,
+};
+
+static struct attribute *knl_uncore_pcu_formats_attr[] = {
+ &format_attr_event2.attr,
+ &format_attr_use_occ_ctr.attr,
+ &format_attr_occ_sel.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh6.attr,
+ &format_attr_occ_invert.attr,
+ &format_attr_occ_edge_det.attr,
+ NULL,
+};
+
+static const struct attribute_group knl_uncore_pcu_format_group = {
+ .name = "format",
+ .attrs = knl_uncore_pcu_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_PCU_MSR_PMON_CTL0,
+ .event_mask = KNL_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL,
+ .ops = &snbep_uncore_msr_ops,
+ .format_group = &knl_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *knl_msr_uncores[] = {
+ &knl_uncore_ubox,
+ &knl_uncore_cha,
+ &knl_uncore_pcu,
+ NULL,
+};
+
+void knl_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = knl_msr_uncores;
+}
+
+static void knl_uncore_imc_enable_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+
+ pci_write_config_dword(pdev, box_ctl, 0);
+}
+
+static void knl_uncore_imc_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ if ((event->attr.config & SNBEP_PMON_CTL_EV_SEL_MASK)
+ == UNCORE_FIXED_EVENT)
+ pci_write_config_dword(pdev, hwc->config_base,
+ hwc->config | KNL_PMON_FIXED_CTL_EN);
+ else
+ pci_write_config_dword(pdev, hwc->config_base,
+ hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops knl_uncore_imc_ops = {
+ .init_box = snbep_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = knl_uncore_imc_enable_box,
+ .read_counter = snbep_uncore_pci_read_counter,
+ .enable_event = knl_uncore_imc_enable_event,
+ .disable_event = snbep_uncore_pci_disable_event,
+};
+
+static struct intel_uncore_type knl_uncore_imc_uclk = {
+ .name = "imc_uclk",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW,
+ .event_ctl = KNL_UCLK_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
+ .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
+ .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL,
+ .ops = &knl_uncore_imc_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_imc_dclk = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = KNL_MC0_CH0_MSR_PMON_CTR0_LOW,
+ .event_ctl = KNL_MC0_CH0_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = KNL_MC0_CH0_MSR_PMON_FIXED_LOW,
+ .fixed_ctl = KNL_MC0_CH0_MSR_PMON_FIXED_CTL,
+ .box_ctl = KNL_MC0_CH0_MSR_PMON_BOX_CTL,
+ .ops = &knl_uncore_imc_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_edc_uclk = {
+ .name = "edc_uclk",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW,
+ .event_ctl = KNL_UCLK_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
+ .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
+ .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL,
+ .ops = &knl_uncore_imc_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type knl_uncore_edc_eclk = {
+ .name = "edc_eclk",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW,
+ .event_ctl = KNL_EDC0_ECLK_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW,
+ .fixed_ctl = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL,
+ .box_ctl = KNL_EDC0_ECLK_MSR_PMON_BOX_CTL,
+ .ops = &knl_uncore_imc_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct event_constraint knl_uncore_m2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type knl_uncore_m2pcie = {
+ .name = "m2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .constraints = knl_uncore_m2pcie_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct attribute *knl_uncore_irp_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_qor.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static const struct attribute_group knl_uncore_irp_format_group = {
+ .name = "format",
+ .attrs = knl_uncore_irp_formats_attr,
+};
+
+static struct intel_uncore_type knl_uncore_irp = {
+ .name = "irp",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = KNL_IRP_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = KNL_IRP_PCI_PMON_BOX_CTL,
+ .ops = &snbep_uncore_pci_ops,
+ .format_group = &knl_uncore_irp_format_group,
+};
+
+enum {
+ KNL_PCI_UNCORE_MC_UCLK,
+ KNL_PCI_UNCORE_MC_DCLK,
+ KNL_PCI_UNCORE_EDC_UCLK,
+ KNL_PCI_UNCORE_EDC_ECLK,
+ KNL_PCI_UNCORE_M2PCIE,
+ KNL_PCI_UNCORE_IRP,
+};
+
+static struct intel_uncore_type *knl_pci_uncores[] = {
+ [KNL_PCI_UNCORE_MC_UCLK] = &knl_uncore_imc_uclk,
+ [KNL_PCI_UNCORE_MC_DCLK] = &knl_uncore_imc_dclk,
+ [KNL_PCI_UNCORE_EDC_UCLK] = &knl_uncore_edc_uclk,
+ [KNL_PCI_UNCORE_EDC_ECLK] = &knl_uncore_edc_eclk,
+ [KNL_PCI_UNCORE_M2PCIE] = &knl_uncore_m2pcie,
+ [KNL_PCI_UNCORE_IRP] = &knl_uncore_irp,
+ NULL,
+};
+
+/*
+ * KNL uses a common PCI device ID for multiple instances of an Uncore PMU
+ * device type. prior to KNL, each instance of a PMU device type had a unique
+ * device ID.
+ *
+ * PCI Device ID Uncore PMU Devices
+ * ----------------------------------
+ * 0x7841 MC0 UClk, MC1 UClk
+ * 0x7843 MC0 DClk CH 0, MC0 DClk CH 1, MC0 DClk CH 2,
+ * MC1 DClk CH 0, MC1 DClk CH 1, MC1 DClk CH 2
+ * 0x7833 EDC0 UClk, EDC1 UClk, EDC2 UClk, EDC3 UClk,
+ * EDC4 UClk, EDC5 UClk, EDC6 UClk, EDC7 UClk
+ * 0x7835 EDC0 EClk, EDC1 EClk, EDC2 EClk, EDC3 EClk,
+ * EDC4 EClk, EDC5 EClk, EDC6 EClk, EDC7 EClk
+ * 0x7817 M2PCIe
+ * 0x7814 IRP
+*/
+
+static const struct pci_device_id knl_uncore_pci_ids[] = {
+ { /* MC0 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 0, KNL_PCI_UNCORE_MC_UCLK, 0),
+ },
+ { /* MC1 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(11, 0, KNL_PCI_UNCORE_MC_UCLK, 1),
+ },
+ { /* MC0 DClk CH 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 2, KNL_PCI_UNCORE_MC_DCLK, 0),
+ },
+ { /* MC0 DClk CH 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 3, KNL_PCI_UNCORE_MC_DCLK, 1),
+ },
+ { /* MC0 DClk CH 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 4, KNL_PCI_UNCORE_MC_DCLK, 2),
+ },
+ { /* MC1 DClk CH 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 2, KNL_PCI_UNCORE_MC_DCLK, 3),
+ },
+ { /* MC1 DClk CH 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 3, KNL_PCI_UNCORE_MC_DCLK, 4),
+ },
+ { /* MC1 DClk CH 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 4, KNL_PCI_UNCORE_MC_DCLK, 5),
+ },
+ { /* EDC0 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(15, 0, KNL_PCI_UNCORE_EDC_UCLK, 0),
+ },
+ { /* EDC1 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(16, 0, KNL_PCI_UNCORE_EDC_UCLK, 1),
+ },
+ { /* EDC2 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(17, 0, KNL_PCI_UNCORE_EDC_UCLK, 2),
+ },
+ { /* EDC3 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 0, KNL_PCI_UNCORE_EDC_UCLK, 3),
+ },
+ { /* EDC4 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(19, 0, KNL_PCI_UNCORE_EDC_UCLK, 4),
+ },
+ { /* EDC5 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(20, 0, KNL_PCI_UNCORE_EDC_UCLK, 5),
+ },
+ { /* EDC6 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 0, KNL_PCI_UNCORE_EDC_UCLK, 6),
+ },
+ { /* EDC7 UClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(22, 0, KNL_PCI_UNCORE_EDC_UCLK, 7),
+ },
+ { /* EDC0 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(24, 2, KNL_PCI_UNCORE_EDC_ECLK, 0),
+ },
+ { /* EDC1 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(25, 2, KNL_PCI_UNCORE_EDC_ECLK, 1),
+ },
+ { /* EDC2 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(26, 2, KNL_PCI_UNCORE_EDC_ECLK, 2),
+ },
+ { /* EDC3 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(27, 2, KNL_PCI_UNCORE_EDC_ECLK, 3),
+ },
+ { /* EDC4 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(28, 2, KNL_PCI_UNCORE_EDC_ECLK, 4),
+ },
+ { /* EDC5 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(29, 2, KNL_PCI_UNCORE_EDC_ECLK, 5),
+ },
+ { /* EDC6 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(30, 2, KNL_PCI_UNCORE_EDC_ECLK, 6),
+ },
+ { /* EDC7 EClk */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(31, 2, KNL_PCI_UNCORE_EDC_ECLK, 7),
+ },
+ { /* M2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_M2PCIE, 0),
+ },
+ { /* IRP */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7814),
+ .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_IRP, 0),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver knl_uncore_pci_driver = {
+ .name = "knl_uncore",
+ .id_table = knl_uncore_pci_ids,
+};
+
+int knl_uncore_pci_init(void)
+{
+ int ret;
+
+ /* All KNL PCI based PMON units are on the same PCI bus except IRP */
+ ret = snb_pci2phy_map_init(0x7814); /* IRP */
+ if (ret)
+ return ret;
+ ret = snb_pci2phy_map_init(0x7817); /* M2PCIe */
+ if (ret)
+ return ret;
+ uncore_pci_uncores = knl_pci_uncores;
+ uncore_pci_driver = &knl_uncore_pci_driver;
+ return 0;
+}
+
+/* end of KNL uncore support */
+
+/* Haswell-EP uncore support */
+static struct attribute *hswep_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ &format_attr_filter_tid2.attr,
+ &format_attr_filter_cid.attr,
+ NULL,
+};
+
+static const struct attribute_group hswep_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = hswep_uncore_ubox_formats_attr,
+};
+
+static int hswep_ubox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ reg1->reg = HSWEP_U_MSR_PMON_FILTER;
+ reg1->config = event->attr.config1 & HSWEP_U_MSR_PMON_BOX_FILTER_MASK;
+ reg1->idx = 0;
+ return 0;
+}
+
+static struct intel_uncore_ops hswep_uncore_ubox_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = hswep_ubox_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+static struct intel_uncore_type hswep_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = HSWEP_U_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_U_MSR_PMON_CTL0,
+ .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .num_shared_regs = 1,
+ .ops = &hswep_uncore_ubox_ops,
+ .format_group = &hswep_uncore_ubox_format_group,
+};
+
+static struct attribute *hswep_uncore_cbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid3.attr,
+ &format_attr_filter_link2.attr,
+ &format_attr_filter_state3.attr,
+ &format_attr_filter_nid2.attr,
+ &format_attr_filter_opc2.attr,
+ &format_attr_filter_nc.attr,
+ &format_attr_filter_c6.attr,
+ &format_attr_filter_isoc.attr,
+ NULL,
+};
+
+static const struct attribute_group hswep_uncore_cbox_format_group = {
+ .name = "format",
+ .attrs = hswep_uncore_cbox_formats_attr,
+};
+
+static struct event_constraint hswep_uncore_cbox_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x09, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg hswep_uncore_cbox_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+ SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4028, 0x40ff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4032, 0x40ff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4029, 0x40ff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4033, 0x40ff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x402A, 0x40ff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x12),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
+ EVENT_EXTRA_END
+};
+
+static u64 hswep_cbox_filter_mask(int fields)
+{
+ u64 mask = 0;
+ if (fields & 0x1)
+ mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_TID;
+ if (fields & 0x2)
+ mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK;
+ if (fields & 0x4)
+ mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE;
+ if (fields & 0x8)
+ mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NID;
+ if (fields & 0x10) {
+ mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC;
+ mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NC;
+ mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_C6;
+ mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC;
+ }
+ return mask;
+}
+
+static struct event_constraint *
+hswep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, hswep_cbox_filter_mask);
+}
+
+static int hswep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct extra_reg *er;
+ int idx = 0;
+
+ for (er = hswep_uncore_cbox_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ idx |= er->idx;
+ }
+
+ if (idx) {
+ reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+ HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & hswep_cbox_filter_mask(idx);
+ reg1->idx = idx;
+ }
+ return 0;
+}
+
+static void hswep_cbox_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ u64 filter = uncore_shared_reg_config(box, 0);
+ wrmsrl(reg1->reg, filter & 0xffffffff);
+ wrmsrl(reg1->reg + 1, filter >> 32);
+ }
+
+ wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops hswep_uncore_cbox_ops = {
+ .init_box = snbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = hswep_cbox_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = hswep_cbox_hw_config,
+ .get_constraint = hswep_cbox_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type hswep_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 4,
+ .num_boxes = 18,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
+ .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = hswep_uncore_cbox_constraints,
+ .ops = &hswep_uncore_cbox_ops,
+ .format_group = &hswep_uncore_cbox_format_group,
+};
+
+/*
+ * Write SBOX Initialization register bit by bit to avoid spurious #GPs
+ */
+static void hswep_uncore_sbox_msr_init_box(struct intel_uncore_box *box)
+{
+ unsigned msr = uncore_msr_box_ctl(box);
+
+ if (msr) {
+ u64 init = SNBEP_PMON_BOX_CTL_INT;
+ u64 flags = 0;
+ int i;
+
+ for_each_set_bit(i, (unsigned long *)&init, 64) {
+ flags |= (1ULL << i);
+ wrmsrl(msr, flags);
+ }
+ }
+}
+
+static struct intel_uncore_ops hswep_uncore_sbox_msr_ops = {
+ __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .init_box = hswep_uncore_sbox_msr_init_box
+};
+
+static struct attribute *hswep_uncore_sbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static const struct attribute_group hswep_uncore_sbox_format_group = {
+ .name = "format",
+ .attrs = hswep_uncore_sbox_formats_attr,
+};
+
+static struct intel_uncore_type hswep_uncore_sbox = {
+ .name = "sbox",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 44,
+ .event_ctl = HSWEP_S0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_S0_MSR_PMON_CTR0,
+ .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_SBOX_MSR_OFFSET,
+ .ops = &hswep_uncore_sbox_msr_ops,
+ .format_group = &hswep_uncore_sbox_format_group,
+};
+
+static int hswep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
+
+ if (ev_sel >= 0xb && ev_sel <= 0xe) {
+ reg1->reg = HSWEP_PCU_MSR_PMON_BOX_FILTER;
+ reg1->idx = ev_sel - 0xb;
+ reg1->config = event->attr.config1 & (0xff << reg1->idx);
+ }
+ return 0;
+}
+
+static struct intel_uncore_ops hswep_uncore_pcu_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = hswep_pcu_hw_config,
+ .get_constraint = snbep_pcu_get_constraint,
+ .put_constraint = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type hswep_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_PCU_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &hswep_uncore_pcu_ops,
+ .format_group = &snbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *hswep_msr_uncores[] = {
+ &hswep_uncore_ubox,
+ &hswep_uncore_cbox,
+ &hswep_uncore_sbox,
+ &hswep_uncore_pcu,
+ NULL,
+};
+
+#define HSWEP_PCU_DID 0x2fc0
+#define HSWEP_PCU_CAPID4_OFFET 0x94
+#define hswep_get_chop(_cap) (((_cap) >> 6) & 0x3)
+
+static bool hswep_has_limit_sbox(unsigned int device)
+{
+ struct pci_dev *dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, NULL);
+ u32 capid4;
+
+ if (!dev)
+ return false;
+
+ pci_read_config_dword(dev, HSWEP_PCU_CAPID4_OFFET, &capid4);
+ if (!hswep_get_chop(capid4))
+ return true;
+
+ return false;
+}
+
+void hswep_uncore_cpu_init(void)
+{
+ if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+ hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+
+ /* Detect 6-8 core systems with only two SBOXes */
+ if (hswep_has_limit_sbox(HSWEP_PCU_DID))
+ hswep_uncore_sbox.num_boxes = 2;
+
+ uncore_msr_uncores = hswep_msr_uncores;
+}
+
+static struct intel_uncore_type hswep_uncore_ha = {
+ .name = "ha",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct uncore_event_desc hswep_uncore_imc_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x00,umask=0x00"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_type hswep_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = hswep_uncore_imc_events,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static unsigned hswep_uncore_irp_ctrs[] = {0xa0, 0xa8, 0xb0, 0xb8};
+
+static u64 hswep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 count = 0;
+
+ pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
+ pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
+
+ return count;
+}
+
+static struct intel_uncore_ops hswep_uncore_irp_ops = {
+ .init_box = snbep_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = ivbep_uncore_irp_disable_event,
+ .enable_event = ivbep_uncore_irp_enable_event,
+ .read_counter = hswep_uncore_irp_read_counter,
+};
+
+static struct intel_uncore_type hswep_uncore_irp = {
+ .name = "irp",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &hswep_uncore_irp_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type hswep_uncore_qpi = {
+ .name = "qpi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &snbep_uncore_qpi_ops,
+ .format_group = &snbep_uncore_qpi_format_group,
+};
+
+static struct event_constraint hswep_uncore_r2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x24, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x27, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2a, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type hswep_uncore_r2pcie = {
+ .name = "r2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .constraints = hswep_uncore_r2pcie_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct event_constraint hswep_uncore_r3qpi_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x01, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type hswep_uncore_r3qpi = {
+ .name = "r3qpi",
+ .num_counters = 3,
+ .num_boxes = 3,
+ .perf_ctr_bits = 44,
+ .constraints = hswep_uncore_r3qpi_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+ HSWEP_PCI_UNCORE_HA,
+ HSWEP_PCI_UNCORE_IMC,
+ HSWEP_PCI_UNCORE_IRP,
+ HSWEP_PCI_UNCORE_QPI,
+ HSWEP_PCI_UNCORE_R2PCIE,
+ HSWEP_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *hswep_pci_uncores[] = {
+ [HSWEP_PCI_UNCORE_HA] = &hswep_uncore_ha,
+ [HSWEP_PCI_UNCORE_IMC] = &hswep_uncore_imc,
+ [HSWEP_PCI_UNCORE_IRP] = &hswep_uncore_irp,
+ [HSWEP_PCI_UNCORE_QPI] = &hswep_uncore_qpi,
+ [HSWEP_PCI_UNCORE_R2PCIE] = &hswep_uncore_r2pcie,
+ [HSWEP_PCI_UNCORE_R3QPI] = &hswep_uncore_r3qpi,
+ NULL,
+};
+
+static const struct pci_device_id hswep_uncore_pci_ids[] = {
+ { /* Home Agent 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0),
+ },
+ { /* Home Agent 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f38),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 1),
+ },
+ { /* MC0 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb0),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC0 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb1),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 1),
+ },
+ { /* MC0 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb4),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC0 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb5),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 3),
+ },
+ { /* MC1 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd0),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 4),
+ },
+ { /* MC1 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd1),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 5),
+ },
+ { /* MC1 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd4),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 6),
+ },
+ { /* MC1 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd5),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 7),
+ },
+ { /* IRP */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f39),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IRP, 0),
+ },
+ { /* QPI0 Port 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f32),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 0),
+ },
+ { /* QPI0 Port 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f33),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 1),
+ },
+ { /* QPI1 Port 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3a),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 2),
+ },
+ { /* R2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f34),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R2PCIE, 0),
+ },
+ { /* R3QPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f36),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 0),
+ },
+ { /* R3QPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f37),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 1),
+ },
+ { /* R3QPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3e),
+ .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 2),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f86),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT0_FILTER),
+ },
+ { /* QPI Port 1 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f96),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT1_FILTER),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver hswep_uncore_pci_driver = {
+ .name = "hswep_uncore",
+ .id_table = hswep_uncore_pci_ids,
+};
+
+int hswep_uncore_pci_init(void)
+{
+ int ret = snbep_pci2phy_map_init(0x2f1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
+ if (ret)
+ return ret;
+ uncore_pci_uncores = hswep_pci_uncores;
+ uncore_pci_driver = &hswep_uncore_pci_driver;
+ return 0;
+}
+/* end of Haswell-EP uncore support */
+
+/* BDX uncore support */
+
+static struct intel_uncore_type bdx_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = HSWEP_U_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_U_MSR_PMON_CTL0,
+ .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .num_shared_regs = 1,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &ivbep_uncore_ubox_format_group,
+};
+
+static struct event_constraint bdx_uncore_cbox_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 4,
+ .num_boxes = 24,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
+ .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = bdx_uncore_cbox_constraints,
+ .ops = &hswep_uncore_cbox_ops,
+ .format_group = &hswep_uncore_cbox_format_group,
+};
+
+static struct intel_uncore_type bdx_uncore_sbox = {
+ .name = "sbox",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_S0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_S0_MSR_PMON_CTR0,
+ .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_SBOX_MSR_OFFSET,
+ .ops = &hswep_uncore_sbox_msr_ops,
+ .format_group = &hswep_uncore_sbox_format_group,
+};
+
+#define BDX_MSR_UNCORE_SBOX 3
+
+static struct intel_uncore_type *bdx_msr_uncores[] = {
+ &bdx_uncore_ubox,
+ &bdx_uncore_cbox,
+ &hswep_uncore_pcu,
+ &bdx_uncore_sbox,
+ NULL,
+};
+
+/* Bit 7 'Use Occupancy' is not available for counter 0 on BDX */
+static struct event_constraint bdx_uncore_pcu_constraints[] = {
+ EVENT_CONSTRAINT(0x80, 0xe, 0x80),
+ EVENT_CONSTRAINT_END
+};
+
+#define BDX_PCU_DID 0x6fc0
+
+void bdx_uncore_cpu_init(void)
+{
+ if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+ bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+ uncore_msr_uncores = bdx_msr_uncores;
+
+ /* Detect systems with no SBOXes */
+ if ((boot_cpu_data.x86_model == 86) || hswep_has_limit_sbox(BDX_PCU_DID))
+ uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
+
+ hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints;
+}
+
+static struct intel_uncore_type bdx_uncore_ha = {
+ .name = "ha",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type bdx_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = hswep_uncore_imc_events,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type bdx_uncore_irp = {
+ .name = "irp",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &hswep_uncore_irp_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type bdx_uncore_qpi = {
+ .name = "qpi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &snbep_uncore_qpi_ops,
+ .format_group = &snbep_uncore_qpi_format_group,
+};
+
+static struct event_constraint bdx_uncore_r2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_r2pcie = {
+ .name = "r2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .constraints = bdx_uncore_r2pcie_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct event_constraint bdx_uncore_r3qpi_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x01, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_r3qpi = {
+ .name = "r3qpi",
+ .num_counters = 3,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .constraints = bdx_uncore_r3qpi_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+ BDX_PCI_UNCORE_HA,
+ BDX_PCI_UNCORE_IMC,
+ BDX_PCI_UNCORE_IRP,
+ BDX_PCI_UNCORE_QPI,
+ BDX_PCI_UNCORE_R2PCIE,
+ BDX_PCI_UNCORE_R3QPI,
+};
+
+static struct intel_uncore_type *bdx_pci_uncores[] = {
+ [BDX_PCI_UNCORE_HA] = &bdx_uncore_ha,
+ [BDX_PCI_UNCORE_IMC] = &bdx_uncore_imc,
+ [BDX_PCI_UNCORE_IRP] = &bdx_uncore_irp,
+ [BDX_PCI_UNCORE_QPI] = &bdx_uncore_qpi,
+ [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie,
+ [BDX_PCI_UNCORE_R3QPI] = &bdx_uncore_r3qpi,
+ NULL,
+};
+
+static const struct pci_device_id bdx_uncore_pci_ids[] = {
+ { /* Home Agent 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0),
+ },
+ { /* Home Agent 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f38),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 1),
+ },
+ { /* MC0 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC0 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1),
+ },
+ { /* MC0 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb4),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC0 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb5),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 3),
+ },
+ { /* MC1 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd0),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 4),
+ },
+ { /* MC1 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd1),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 5),
+ },
+ { /* MC1 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd4),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 6),
+ },
+ { /* MC1 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd5),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 7),
+ },
+ { /* IRP */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0),
+ },
+ { /* QPI0 Port 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f32),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 0),
+ },
+ { /* QPI0 Port 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f33),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 1),
+ },
+ { /* QPI1 Port 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3a),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 2),
+ },
+ { /* R2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0),
+ },
+ { /* R3QPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f36),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 0),
+ },
+ { /* R3QPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f37),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 1),
+ },
+ { /* R3QPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3e),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 2),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT0_FILTER),
+ },
+ { /* QPI Port 1 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT1_FILTER),
+ },
+ { /* QPI Port 2 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ BDX_PCI_QPI_PORT2_FILTER),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver bdx_uncore_pci_driver = {
+ .name = "bdx_uncore",
+ .id_table = bdx_uncore_pci_ids,
+};
+
+int bdx_uncore_pci_init(void)
+{
+ int ret = snbep_pci2phy_map_init(0x6f1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
+
+ if (ret)
+ return ret;
+ uncore_pci_uncores = bdx_pci_uncores;
+ uncore_pci_driver = &bdx_uncore_pci_driver;
+ return 0;
+}
+
+/* end of BDX uncore support */
+
+/* SKX uncore support */
+
+static struct intel_uncore_type skx_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = HSWEP_U_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_U_MSR_PMON_CTL0,
+ .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &ivbep_uncore_ubox_format_group,
+};
+
+static struct attribute *skx_uncore_cha_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid4.attr,
+ &format_attr_filter_state5.attr,
+ &format_attr_filter_rem.attr,
+ &format_attr_filter_loc.attr,
+ &format_attr_filter_nm.attr,
+ &format_attr_filter_all_op.attr,
+ &format_attr_filter_not_nm.attr,
+ &format_attr_filter_opc_0.attr,
+ &format_attr_filter_opc_1.attr,
+ &format_attr_filter_nc.attr,
+ &format_attr_filter_isoc.attr,
+ NULL,
+};
+
+static const struct attribute_group skx_uncore_chabox_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_cha_formats_attr,
+};
+
+static struct event_constraint skx_uncore_chabox_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg skx_uncore_cha_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x3134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x9134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x38, 0xff, 0x3),
+ EVENT_EXTRA_END
+};
+
+static u64 skx_cha_filter_mask(int fields)
+{
+ u64 mask = 0;
+
+ if (fields & 0x1)
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_TID;
+ if (fields & 0x2)
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_LINK;
+ if (fields & 0x4)
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_STATE;
+ if (fields & 0x8) {
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_REM;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_LOC;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_ALL_OPC;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NM;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NOT_NM;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_OPC0;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_OPC1;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NC;
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_ISOC;
+ }
+ return mask;
+}
+
+static struct event_constraint *
+skx_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, skx_cha_filter_mask);
+}
+
+static int skx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct extra_reg *er;
+ int idx = 0;
+ /* Any of the CHA events may be filtered by Thread/Core-ID.*/
+ if (event->hw.config & SNBEP_CBO_PMON_CTL_TID_EN)
+ idx = SKX_CHA_MSR_PMON_BOX_FILTER_TID;
+
+ for (er = skx_uncore_cha_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ idx |= er->idx;
+ }
+
+ if (idx) {
+ reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+ HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & skx_cha_filter_mask(idx);
+ reg1->idx = idx;
+ }
+ return 0;
+}
+
+static struct intel_uncore_ops skx_uncore_chabox_ops = {
+ /* There is no frz_en for chabox ctl */
+ .init_box = ivbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = hswep_cbox_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = skx_cha_hw_config,
+ .get_constraint = skx_cha_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type skx_uncore_chabox = {
+ .name = "cha",
+ .num_counters = 4,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
+ .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = skx_uncore_chabox_constraints,
+ .ops = &skx_uncore_chabox_ops,
+ .format_group = &skx_uncore_chabox_format_group,
+};
+
+static struct attribute *skx_uncore_iio_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh9.attr,
+ &format_attr_ch_mask.attr,
+ &format_attr_fc_mask.attr,
+ NULL,
+};
+
+static const struct attribute_group skx_uncore_iio_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_iio_formats_attr,
+};
+
+static struct event_constraint skx_uncore_iio_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x83, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x88, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x95, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xc5, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xd4, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xd5, 0xc),
+ EVENT_CONSTRAINT_END
+};
+
+static void skx_iio_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops skx_uncore_iio_ops = {
+ .init_box = ivbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = skx_iio_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type skx_uncore_iio = {
+ .name = "iio",
+ .num_counters = 4,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .event_ctl = SKX_IIO0_MSR_PMON_CTL0,
+ .perf_ctr = SKX_IIO0_MSR_PMON_CTR0,
+ .event_mask = SKX_IIO_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SKX_IIO_PMON_RAW_EVENT_MASK_EXT,
+ .box_ctl = SKX_IIO0_MSR_PMON_BOX_CTL,
+ .msr_offset = SKX_IIO_MSR_OFFSET,
+ .constraints = skx_uncore_iio_constraints,
+ .ops = &skx_uncore_iio_ops,
+ .format_group = &skx_uncore_iio_format_group,
+};
+
+enum perf_uncore_iio_freerunning_type_id {
+ SKX_IIO_MSR_IOCLK = 0,
+ SKX_IIO_MSR_BW = 1,
+ SKX_IIO_MSR_UTIL = 2,
+
+ SKX_IIO_FREERUNNING_TYPE_MAX,
+};
+
+
+static struct freerunning_counters skx_iio_freerunning[] = {
+ [SKX_IIO_MSR_IOCLK] = { 0xa45, 0x1, 0x20, 1, 36 },
+ [SKX_IIO_MSR_BW] = { 0xb00, 0x1, 0x10, 8, 36 },
+ [SKX_IIO_MSR_UTIL] = { 0xb08, 0x1, 0x10, 8, 36 },
+};
+
+static struct uncore_event_desc skx_uncore_iio_freerunning_events[] = {
+ /* Free-Running IO CLOCKS Counter */
+ INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"),
+ /* Free-Running IIO BANDWIDTH Counters */
+ INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port0, "event=0xff,umask=0x24"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port0.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port0.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port1, "event=0xff,umask=0x25"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port1.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port1.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port2, "event=0xff,umask=0x26"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port2.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port2.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port3, "event=0xff,umask=0x27"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port3.scale, "3.814697266e-6"),
+ INTEL_UNCORE_EVENT_DESC(bw_out_port3.unit, "MiB"),
+ /* Free-running IIO UTILIZATION Counters */
+ INTEL_UNCORE_EVENT_DESC(util_in_port0, "event=0xff,umask=0x30"),
+ INTEL_UNCORE_EVENT_DESC(util_out_port0, "event=0xff,umask=0x31"),
+ INTEL_UNCORE_EVENT_DESC(util_in_port1, "event=0xff,umask=0x32"),
+ INTEL_UNCORE_EVENT_DESC(util_out_port1, "event=0xff,umask=0x33"),
+ INTEL_UNCORE_EVENT_DESC(util_in_port2, "event=0xff,umask=0x34"),
+ INTEL_UNCORE_EVENT_DESC(util_out_port2, "event=0xff,umask=0x35"),
+ INTEL_UNCORE_EVENT_DESC(util_in_port3, "event=0xff,umask=0x36"),
+ INTEL_UNCORE_EVENT_DESC(util_out_port3, "event=0xff,umask=0x37"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops skx_uncore_iio_freerunning_ops = {
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = uncore_freerunning_hw_config,
+};
+
+static struct attribute *skx_uncore_iio_freerunning_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ NULL,
+};
+
+static const struct attribute_group skx_uncore_iio_freerunning_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_iio_freerunning_formats_attr,
+};
+
+static struct intel_uncore_type skx_uncore_iio_free_running = {
+ .name = "iio_free_running",
+ .num_counters = 17,
+ .num_boxes = 6,
+ .num_freerunning_types = SKX_IIO_FREERUNNING_TYPE_MAX,
+ .freerunning = skx_iio_freerunning,
+ .ops = &skx_uncore_iio_freerunning_ops,
+ .event_descs = skx_uncore_iio_freerunning_events,
+ .format_group = &skx_uncore_iio_freerunning_format_group,
+};
+
+static struct attribute *skx_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static const struct attribute_group skx_uncore_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_formats_attr,
+};
+
+static struct intel_uncore_type skx_uncore_irp = {
+ .name = "irp",
+ .num_counters = 2,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .event_ctl = SKX_IRP0_MSR_PMON_CTL0,
+ .perf_ctr = SKX_IRP0_MSR_PMON_CTR0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SKX_IRP0_MSR_PMON_BOX_CTL,
+ .msr_offset = SKX_IRP_MSR_OFFSET,
+ .ops = &skx_uncore_iio_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct attribute *skx_uncore_pcu_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_occ_invert.attr,
+ &format_attr_occ_edge_det.attr,
+ &format_attr_filter_band0.attr,
+ &format_attr_filter_band1.attr,
+ &format_attr_filter_band2.attr,
+ &format_attr_filter_band3.attr,
+ NULL,
+};
+
+static struct attribute_group skx_uncore_pcu_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_pcu_formats_attr,
+};
+
+static struct intel_uncore_ops skx_uncore_pcu_ops = {
+ IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = hswep_pcu_hw_config,
+ .get_constraint = snbep_pcu_get_constraint,
+ .put_constraint = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type skx_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_PCU_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &skx_uncore_pcu_ops,
+ .format_group = &skx_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *skx_msr_uncores[] = {
+ &skx_uncore_ubox,
+ &skx_uncore_chabox,
+ &skx_uncore_iio,
+ &skx_uncore_iio_free_running,
+ &skx_uncore_irp,
+ &skx_uncore_pcu,
+ NULL,
+};
+
+/*
+ * To determine the number of CHAs, it should read bits 27:0 in the CAPID6
+ * register which located at Device 30, Function 3, Offset 0x9C. PCI ID 0x2083.
+ */
+#define SKX_CAPID6 0x9c
+#define SKX_CHA_BIT_MASK GENMASK(27, 0)
+
+static int skx_count_chabox(void)
+{
+ struct pci_dev *dev = NULL;
+ u32 val = 0;
+
+ dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x2083, dev);
+ if (!dev)
+ goto out;
+
+ pci_read_config_dword(dev, SKX_CAPID6, &val);
+ val &= SKX_CHA_BIT_MASK;
+out:
+ pci_dev_put(dev);
+ return hweight32(val);
+}
+
+void skx_uncore_cpu_init(void)
+{
+ skx_uncore_chabox.num_boxes = skx_count_chabox();
+ uncore_msr_uncores = skx_msr_uncores;
+}
+
+static struct intel_uncore_type skx_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = hswep_uncore_imc_events,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivbep_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct attribute *skx_upi_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask_ext.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static const struct attribute_group skx_upi_uncore_format_group = {
+ .name = "format",
+ .attrs = skx_upi_uncore_formats_attr,
+};
+
+static void skx_upi_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+
+ __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags);
+ pci_write_config_dword(pdev, SKX_UPI_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
+}
+
+static struct intel_uncore_ops skx_upi_uncore_pci_ops = {
+ .init_box = skx_upi_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_uncore_pci_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+};
+
+static struct intel_uncore_type skx_uncore_upi = {
+ .name = "upi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SKX_UPI_PCI_PMON_CTR0,
+ .event_ctl = SKX_UPI_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SKX_UPI_CTL_UMASK_EXT,
+ .box_ctl = SKX_UPI_PCI_PMON_BOX_CTL,
+ .ops = &skx_upi_uncore_pci_ops,
+ .format_group = &skx_upi_uncore_format_group,
+};
+
+static void skx_m2m_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+
+ __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags);
+ pci_write_config_dword(pdev, SKX_M2M_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
+}
+
+static struct intel_uncore_ops skx_m2m_uncore_pci_ops = {
+ .init_box = skx_m2m_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_uncore_pci_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+};
+
+static struct intel_uncore_type skx_uncore_m2m = {
+ .name = "m2m",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SKX_M2M_PCI_PMON_CTR0,
+ .event_ctl = SKX_M2M_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SKX_M2M_PCI_PMON_BOX_CTL,
+ .ops = &skx_m2m_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct event_constraint skx_uncore_m2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type skx_uncore_m2pcie = {
+ .name = "m2pcie",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 48,
+ .constraints = skx_uncore_m2pcie_constraints,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivbep_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct event_constraint skx_uncore_m3upi_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x1d, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x1e, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x40, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x4e, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x4f, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x50, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x51, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x52, 0x7),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type skx_uncore_m3upi = {
+ .name = "m3upi",
+ .num_counters = 3,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .constraints = skx_uncore_m3upi_constraints,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivbep_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+enum {
+ SKX_PCI_UNCORE_IMC,
+ SKX_PCI_UNCORE_M2M,
+ SKX_PCI_UNCORE_UPI,
+ SKX_PCI_UNCORE_M2PCIE,
+ SKX_PCI_UNCORE_M3UPI,
+};
+
+static struct intel_uncore_type *skx_pci_uncores[] = {
+ [SKX_PCI_UNCORE_IMC] = &skx_uncore_imc,
+ [SKX_PCI_UNCORE_M2M] = &skx_uncore_m2m,
+ [SKX_PCI_UNCORE_UPI] = &skx_uncore_upi,
+ [SKX_PCI_UNCORE_M2PCIE] = &skx_uncore_m2pcie,
+ [SKX_PCI_UNCORE_M3UPI] = &skx_uncore_m3upi,
+ NULL,
+};
+
+static const struct pci_device_id skx_uncore_pci_ids[] = {
+ { /* MC0 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2042),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 2, SKX_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC0 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2046),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 6, SKX_PCI_UNCORE_IMC, 1),
+ },
+ { /* MC0 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204a),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(11, 2, SKX_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC1 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2042),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 2, SKX_PCI_UNCORE_IMC, 3),
+ },
+ { /* MC1 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2046),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 6, SKX_PCI_UNCORE_IMC, 4),
+ },
+ { /* MC1 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204a),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(13, 2, SKX_PCI_UNCORE_IMC, 5),
+ },
+ { /* M2M0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2066),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 0, SKX_PCI_UNCORE_M2M, 0),
+ },
+ { /* M2M1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2066),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 0, SKX_PCI_UNCORE_M2M, 1),
+ },
+ { /* UPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(14, 0, SKX_PCI_UNCORE_UPI, 0),
+ },
+ { /* UPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(15, 0, SKX_PCI_UNCORE_UPI, 1),
+ },
+ { /* UPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(16, 0, SKX_PCI_UNCORE_UPI, 2),
+ },
+ { /* M2PCIe 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 1, SKX_PCI_UNCORE_M2PCIE, 0),
+ },
+ { /* M2PCIe 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(22, 1, SKX_PCI_UNCORE_M2PCIE, 1),
+ },
+ { /* M2PCIe 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(23, 1, SKX_PCI_UNCORE_M2PCIE, 2),
+ },
+ { /* M2PCIe 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 5, SKX_PCI_UNCORE_M2PCIE, 3),
+ },
+ { /* M3UPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 1, SKX_PCI_UNCORE_M3UPI, 0),
+ },
+ { /* M3UPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204E),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 2, SKX_PCI_UNCORE_M3UPI, 1),
+ },
+ { /* M3UPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 5, SKX_PCI_UNCORE_M3UPI, 2),
+ },
+ { /* end: all zeroes */ }
+};
+
+
+static struct pci_driver skx_uncore_pci_driver = {
+ .name = "skx_uncore",
+ .id_table = skx_uncore_pci_ids,
+};
+
+int skx_uncore_pci_init(void)
+{
+ /* need to double check pci address */
+ int ret = snbep_pci2phy_map_init(0x2014, SKX_CPUNODEID, SKX_GIDNIDMAP, false);
+
+ if (ret)
+ return ret;
+
+ uncore_pci_uncores = skx_pci_uncores;
+ uncore_pci_driver = &skx_uncore_pci_driver;
+ return 0;
+}
+
+/* end of SKX uncore support */
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
new file mode 100644
index 000000000..ace6c1e75
--- /dev/null
+++ b/arch/x86/events/msr.c
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/perf_event.h>
+#include <linux/nospec.h>
+#include <asm/intel-family.h>
+
+enum perf_msr_id {
+ PERF_MSR_TSC = 0,
+ PERF_MSR_APERF = 1,
+ PERF_MSR_MPERF = 2,
+ PERF_MSR_PPERF = 3,
+ PERF_MSR_SMI = 4,
+ PERF_MSR_PTSC = 5,
+ PERF_MSR_IRPERF = 6,
+ PERF_MSR_THERM = 7,
+ PERF_MSR_THERM_SNAP = 8,
+ PERF_MSR_THERM_UNIT = 9,
+ PERF_MSR_EVENT_MAX,
+};
+
+static bool test_aperfmperf(int idx)
+{
+ return boot_cpu_has(X86_FEATURE_APERFMPERF);
+}
+
+static bool test_ptsc(int idx)
+{
+ return boot_cpu_has(X86_FEATURE_PTSC);
+}
+
+static bool test_irperf(int idx)
+{
+ return boot_cpu_has(X86_FEATURE_IRPERF);
+}
+
+static bool test_therm_status(int idx)
+{
+ return boot_cpu_has(X86_FEATURE_DTHERM);
+}
+
+static bool test_intel(int idx)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ boot_cpu_data.x86 != 6)
+ return false;
+
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_NEHALEM:
+ case INTEL_FAM6_NEHALEM_G:
+ case INTEL_FAM6_NEHALEM_EP:
+ case INTEL_FAM6_NEHALEM_EX:
+
+ case INTEL_FAM6_WESTMERE:
+ case INTEL_FAM6_WESTMERE_EP:
+ case INTEL_FAM6_WESTMERE_EX:
+
+ case INTEL_FAM6_SANDYBRIDGE:
+ case INTEL_FAM6_SANDYBRIDGE_X:
+
+ case INTEL_FAM6_IVYBRIDGE:
+ case INTEL_FAM6_IVYBRIDGE_X:
+
+ case INTEL_FAM6_HASWELL_CORE:
+ case INTEL_FAM6_HASWELL_X:
+ case INTEL_FAM6_HASWELL_ULT:
+ case INTEL_FAM6_HASWELL_GT3E:
+
+ case INTEL_FAM6_BROADWELL_CORE:
+ case INTEL_FAM6_BROADWELL_XEON_D:
+ case INTEL_FAM6_BROADWELL_GT3E:
+ case INTEL_FAM6_BROADWELL_X:
+
+ case INTEL_FAM6_ATOM_SILVERMONT:
+ case INTEL_FAM6_ATOM_SILVERMONT_X:
+ case INTEL_FAM6_ATOM_AIRMONT:
+
+ case INTEL_FAM6_ATOM_GOLDMONT:
+ case INTEL_FAM6_ATOM_GOLDMONT_X:
+
+ case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+
+ case INTEL_FAM6_XEON_PHI_KNL:
+ case INTEL_FAM6_XEON_PHI_KNM:
+ if (idx == PERF_MSR_SMI)
+ return true;
+ break;
+
+ case INTEL_FAM6_SKYLAKE_MOBILE:
+ case INTEL_FAM6_SKYLAKE_DESKTOP:
+ case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_FAM6_KABYLAKE_MOBILE:
+ case INTEL_FAM6_KABYLAKE_DESKTOP:
+ case INTEL_FAM6_ICELAKE_MOBILE:
+ if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
+ return true;
+ break;
+ }
+
+ return false;
+}
+
+struct perf_msr {
+ u64 msr;
+ struct perf_pmu_events_attr *attr;
+ bool (*test)(int idx);
+};
+
+PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00" );
+PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01" );
+PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02" );
+PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03" );
+PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04" );
+PMU_EVENT_ATTR_STRING(ptsc, evattr_ptsc, "event=0x05" );
+PMU_EVENT_ATTR_STRING(irperf, evattr_irperf, "event=0x06" );
+PMU_EVENT_ATTR_STRING(cpu_thermal_margin, evattr_therm, "event=0x07" );
+PMU_EVENT_ATTR_STRING(cpu_thermal_margin.snapshot, evattr_therm_snap, "1" );
+PMU_EVENT_ATTR_STRING(cpu_thermal_margin.unit, evattr_therm_unit, "C" );
+
+static struct perf_msr msr[] = {
+ [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, },
+ [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, },
+ [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, },
+ [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, },
+ [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, },
+ [PERF_MSR_PTSC] = { MSR_F15H_PTSC, &evattr_ptsc, test_ptsc, },
+ [PERF_MSR_IRPERF] = { MSR_F17H_IRPERF, &evattr_irperf, test_irperf, },
+ [PERF_MSR_THERM] = { MSR_IA32_THERM_STATUS, &evattr_therm, test_therm_status, },
+ [PERF_MSR_THERM_SNAP] = { MSR_IA32_THERM_STATUS, &evattr_therm_snap, test_therm_status, },
+ [PERF_MSR_THERM_UNIT] = { MSR_IA32_THERM_STATUS, &evattr_therm_unit, test_therm_status, },
+};
+
+static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
+ NULL,
+};
+
+static struct attribute_group events_attr_group = {
+ .name = "events",
+ .attrs = events_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-63");
+static struct attribute *format_attrs[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+static struct attribute_group format_attr_group = {
+ .name = "format",
+ .attrs = format_attrs,
+};
+
+static const struct attribute_group *attr_groups[] = {
+ &events_attr_group,
+ &format_attr_group,
+ NULL,
+};
+
+static int msr_event_init(struct perf_event *event)
+{
+ u64 cfg = event->attr.config;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ if (cfg >= PERF_MSR_EVENT_MAX)
+ return -EINVAL;
+
+ cfg = array_index_nospec((unsigned long)cfg, PERF_MSR_EVENT_MAX);
+
+ if (!msr[cfg].attr)
+ return -EINVAL;
+
+ event->hw.idx = -1;
+ event->hw.event_base = msr[cfg].msr;
+ event->hw.config = cfg;
+
+ return 0;
+}
+
+static inline u64 msr_read_counter(struct perf_event *event)
+{
+ u64 now;
+
+ if (event->hw.event_base)
+ rdmsrl(event->hw.event_base, now);
+ else
+ now = rdtsc_ordered();
+
+ return now;
+}
+
+static void msr_event_update(struct perf_event *event)
+{
+ u64 prev, now;
+ s64 delta;
+
+ /* Careful, an NMI might modify the previous event value: */
+again:
+ prev = local64_read(&event->hw.prev_count);
+ now = msr_read_counter(event);
+
+ if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev)
+ goto again;
+
+ delta = now - prev;
+ if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) {
+ delta = sign_extend64(delta, 31);
+ local64_add(delta, &event->count);
+ } else if (unlikely(event->hw.event_base == MSR_IA32_THERM_STATUS)) {
+ /* If valid, extract digital readout, otherwise set to -1: */
+ now = now & (1ULL << 31) ? (now >> 16) & 0x3f : -1;
+ local64_set(&event->count, now);
+ } else {
+ local64_add(delta, &event->count);
+ }
+}
+
+static void msr_event_start(struct perf_event *event, int flags)
+{
+ u64 now = msr_read_counter(event);
+
+ local64_set(&event->hw.prev_count, now);
+}
+
+static void msr_event_stop(struct perf_event *event, int flags)
+{
+ msr_event_update(event);
+}
+
+static void msr_event_del(struct perf_event *event, int flags)
+{
+ msr_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int msr_event_add(struct perf_event *event, int flags)
+{
+ if (flags & PERF_EF_START)
+ msr_event_start(event, flags);
+
+ return 0;
+}
+
+static struct pmu pmu_msr = {
+ .task_ctx_nr = perf_sw_context,
+ .attr_groups = attr_groups,
+ .event_init = msr_event_init,
+ .add = msr_event_add,
+ .del = msr_event_del,
+ .start = msr_event_start,
+ .stop = msr_event_stop,
+ .read = msr_event_update,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static int __init msr_init(void)
+{
+ int i, j = 0;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC)) {
+ pr_cont("no MSR PMU driver.\n");
+ return 0;
+ }
+
+ /* Probe the MSRs. */
+ for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) {
+ u64 val;
+
+ /* Virt sucks; you cannot tell if a R/O MSR is present :/ */
+ if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
+ msr[i].attr = NULL;
+ }
+
+ /* List remaining MSRs in the sysfs attrs. */
+ for (i = 0; i < PERF_MSR_EVENT_MAX; i++) {
+ if (msr[i].attr)
+ events_attrs[j++] = &msr[i].attr->attr.attr;
+ }
+ events_attrs[j] = NULL;
+
+ perf_pmu_register(&pmu_msr, "msr", -1);
+
+ return 0;
+}
+device_initcall(msr_init);
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
new file mode 100644
index 000000000..d2e87dbc5
--- /dev/null
+++ b/arch/x86/events/perf_event.h
@@ -0,0 +1,1047 @@
+/*
+ * Performance events x86 architecture header
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2009 Jaswinder Singh Rajput
+ * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
+ * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
+ * Copyright (C) 2009 Google, Inc., Stephane Eranian
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+
+#include <asm/intel_ds.h>
+
+/* To enable MSR tracing please use the generic trace points. */
+
+/*
+ * | NHM/WSM | SNB |
+ * register -------------------------------
+ * | HT | no HT | HT | no HT |
+ *-----------------------------------------
+ * offcore | core | core | cpu | core |
+ * lbr_sel | core | core | cpu | core |
+ * ld_lat | cpu | core | cpu | core |
+ *-----------------------------------------
+ *
+ * Given that there is a small number of shared regs,
+ * we can pre-allocate their slot in the per-cpu
+ * per-core reg tables.
+ */
+enum extra_reg_type {
+ EXTRA_REG_NONE = -1, /* not used */
+
+ EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
+ EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
+ EXTRA_REG_LBR = 2, /* lbr_select */
+ EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */
+ EXTRA_REG_FE = 4, /* fe_* */
+
+ EXTRA_REG_MAX /* number of entries needed */
+};
+
+struct event_constraint {
+ union {
+ unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ u64 idxmsk64;
+ };
+ u64 code;
+ u64 cmask;
+ int weight;
+ int overlap;
+ int flags;
+};
+/*
+ * struct hw_perf_event.flags flags
+ */
+#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */
+#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */
+#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */
+#define PERF_X86_EVENT_COMMITTED 0x0008 /* event passed commit_txn */
+#define PERF_X86_EVENT_PEBS_LD_HSW 0x0010 /* haswell style datala, load */
+#define PERF_X86_EVENT_PEBS_NA_HSW 0x0020 /* haswell style datala, unknown */
+#define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */
+#define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */
+#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */
+#define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */
+#define PERF_X86_EVENT_AUTO_RELOAD 0x0400 /* use PEBS auto-reload */
+#define PERF_X86_EVENT_LARGE_PEBS 0x0800 /* use large PEBS */
+
+
+struct amd_nb {
+ int nb_id; /* NorthBridge id */
+ int refcnt; /* reference count */
+ struct perf_event *owners[X86_PMC_IDX_MAX];
+ struct event_constraint event_constraints[X86_PMC_IDX_MAX];
+};
+
+#define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1)
+
+/*
+ * Flags PEBS can handle without an PMI.
+ *
+ * TID can only be handled by flushing at context switch.
+ * REGS_USER can be handled for events limited to ring 3.
+ *
+ */
+#define LARGE_PEBS_FLAGS \
+ (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
+ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
+ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
+ PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
+ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \
+ PERF_SAMPLE_PERIOD)
+
+#define PEBS_GP_REGS \
+ ((1ULL << PERF_REG_X86_AX) | \
+ (1ULL << PERF_REG_X86_BX) | \
+ (1ULL << PERF_REG_X86_CX) | \
+ (1ULL << PERF_REG_X86_DX) | \
+ (1ULL << PERF_REG_X86_DI) | \
+ (1ULL << PERF_REG_X86_SI) | \
+ (1ULL << PERF_REG_X86_SP) | \
+ (1ULL << PERF_REG_X86_BP) | \
+ (1ULL << PERF_REG_X86_IP) | \
+ (1ULL << PERF_REG_X86_FLAGS) | \
+ (1ULL << PERF_REG_X86_R8) | \
+ (1ULL << PERF_REG_X86_R9) | \
+ (1ULL << PERF_REG_X86_R10) | \
+ (1ULL << PERF_REG_X86_R11) | \
+ (1ULL << PERF_REG_X86_R12) | \
+ (1ULL << PERF_REG_X86_R13) | \
+ (1ULL << PERF_REG_X86_R14) | \
+ (1ULL << PERF_REG_X86_R15))
+
+/*
+ * Per register state.
+ */
+struct er_account {
+ raw_spinlock_t lock; /* per-core: protect structure */
+ u64 config; /* extra MSR config */
+ u64 reg; /* extra MSR number */
+ atomic_t ref; /* reference count */
+};
+
+/*
+ * Per core/cpu state
+ *
+ * Used to coordinate shared registers between HT threads or
+ * among events on a single PMU.
+ */
+struct intel_shared_regs {
+ struct er_account regs[EXTRA_REG_MAX];
+ int refcnt; /* per-core: #HT threads */
+ unsigned core_id; /* per-core: core id */
+};
+
+enum intel_excl_state_type {
+ INTEL_EXCL_UNUSED = 0, /* counter is unused */
+ INTEL_EXCL_SHARED = 1, /* counter can be used by both threads */
+ INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */
+};
+
+struct intel_excl_states {
+ enum intel_excl_state_type state[X86_PMC_IDX_MAX];
+ bool sched_started; /* true if scheduling has started */
+};
+
+struct intel_excl_cntrs {
+ raw_spinlock_t lock;
+
+ struct intel_excl_states states[2];
+
+ union {
+ u16 has_exclusive[2];
+ u32 exclusive_present;
+ };
+
+ int refcnt; /* per-core: #HT threads */
+ unsigned core_id; /* per-core: core id */
+};
+
+struct x86_perf_task_context;
+#define MAX_LBR_ENTRIES 32
+
+enum {
+ X86_PERF_KFREE_SHARED = 0,
+ X86_PERF_KFREE_EXCL = 1,
+ X86_PERF_KFREE_MAX
+};
+
+struct cpu_hw_events {
+ /*
+ * Generic x86 PMC bits
+ */
+ struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
+ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ int enabled;
+
+ int n_events; /* the # of events in the below arrays */
+ int n_added; /* the # last events in the below arrays;
+ they've never been enabled yet */
+ int n_txn; /* the # last events in the below arrays;
+ added in the current transaction */
+ int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
+ u64 tags[X86_PMC_IDX_MAX];
+
+ struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
+ struct event_constraint *event_constraint[X86_PMC_IDX_MAX];
+
+ int n_excl; /* the number of exclusive events */
+
+ unsigned int txn_flags;
+ int is_fake;
+
+ /*
+ * Intel DebugStore bits
+ */
+ struct debug_store *ds;
+ void *ds_pebs_vaddr;
+ void *ds_bts_vaddr;
+ u64 pebs_enabled;
+ int n_pebs;
+ int n_large_pebs;
+
+ /*
+ * Intel LBR bits
+ */
+ int lbr_users;
+ struct perf_branch_stack lbr_stack;
+ struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
+ struct er_account *lbr_sel;
+ u64 br_sel;
+ struct x86_perf_task_context *last_task_ctx;
+ int last_log_id;
+
+ /*
+ * Intel host/guest exclude bits
+ */
+ u64 intel_ctrl_guest_mask;
+ u64 intel_ctrl_host_mask;
+ struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX];
+
+ /*
+ * Intel checkpoint mask
+ */
+ u64 intel_cp_status;
+
+ /*
+ * manage shared (per-core, per-cpu) registers
+ * used on Intel NHM/WSM/SNB
+ */
+ struct intel_shared_regs *shared_regs;
+ /*
+ * manage exclusive counter access between hyperthread
+ */
+ struct event_constraint *constraint_list; /* in enable order */
+ struct intel_excl_cntrs *excl_cntrs;
+ int excl_thread_id; /* 0 or 1 */
+
+ /*
+ * SKL TSX_FORCE_ABORT shadow
+ */
+ u64 tfa_shadow;
+
+ /*
+ * AMD specific bits
+ */
+ struct amd_nb *amd_nb;
+ /* Inverted mask of bits to clear in the perf_ctr ctrl registers */
+ u64 perf_ctr_virt_mask;
+
+ void *kfree_on_online[X86_PERF_KFREE_MAX];
+};
+
+#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
+ { .idxmsk64 = (n) }, \
+ .code = (c), \
+ .cmask = (m), \
+ .weight = (w), \
+ .overlap = (o), \
+ .flags = f, \
+}
+
+#define EVENT_CONSTRAINT(c, n, m) \
+ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
+
+#define INTEL_EXCLEVT_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\
+ 0, PERF_X86_EVENT_EXCL)
+
+/*
+ * The overlap flag marks event constraints with overlapping counter
+ * masks. This is the case if the counter mask of such an event is not
+ * a subset of any other counter mask of a constraint with an equal or
+ * higher weight, e.g.:
+ *
+ * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
+ * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0);
+ * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0);
+ *
+ * The event scheduler may not select the correct counter in the first
+ * cycle because it needs to know which subsequent events will be
+ * scheduled. It may fail to schedule the events then. So we set the
+ * overlap flag for such constraints to give the scheduler a hint which
+ * events to select for counter rescheduling.
+ *
+ * Care must be taken as the rescheduling algorithm is O(n!) which
+ * will increase scheduling cycles for an over-committed system
+ * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros
+ * and its counter masks must be kept at a minimum.
+ */
+#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \
+ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0)
+
+/*
+ * Constraint on the Event code.
+ */
+#define INTEL_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
+
+/*
+ * Constraint on the Event code + UMask + fixed-mask
+ *
+ * filter mask to validate fixed counter events.
+ * the following filters disqualify for fixed counters:
+ * - inv
+ * - edge
+ * - cnt-mask
+ * - in_tx
+ * - in_tx_checkpointed
+ * The other filters are supported by fixed counters.
+ * The any-thread option is supported starting with v3.
+ */
+#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)
+#define FIXED_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS)
+
+/*
+ * Constraint on the Event code + UMask
+ */
+#define INTEL_UEVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
+
+/* Constraint on specific umask bit only + event */
+#define INTEL_UBIT_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c))
+
+/* Like UEVENT_CONSTRAINT, but match flags too */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
+
+#define INTEL_EXCLUEVT_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_EXCL)
+
+#define INTEL_PLD_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
+
+#define INTEL_PST_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
+
+/* Event constraint, but match on all event flags too. */
+#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
+
+/* Check only flags, but allow all event/umask */
+#define INTEL_ALL_EVENT_CONSTRAINT(code, n) \
+ EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS)
+
+/* Check flags and event code, and set the HSW store flag */
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
+
+/* Check flags and event code, and set the HSW load flag */
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, \
+ PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
+/* Check flags and event code/umask, and set the HSW store flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
+
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, \
+ PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL)
+
+/* Check flags and event code/umask, and set the HSW load flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, \
+ PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
+/* Check flags and event code/umask, and set the HSW N/A flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW)
+
+
+/*
+ * We define the end marker as having a weight of -1
+ * to enable blacklisting of events using a counter bitmask
+ * of zero and thus a weight of zero.
+ * The end marker has a weight that cannot possibly be
+ * obtained from counting the bits in the bitmask.
+ */
+#define EVENT_CONSTRAINT_END { .weight = -1 }
+
+/*
+ * Check for end marker with weight == -1
+ */
+#define for_each_event_constraint(e, c) \
+ for ((e) = (c); (e)->weight != -1; (e)++)
+
+/*
+ * Extra registers for specific events.
+ *
+ * Some events need large masks and require external MSRs.
+ * Those extra MSRs end up being shared for all events on
+ * a PMU and sometimes between PMU of sibling HT threads.
+ * In either case, the kernel needs to handle conflicting
+ * accesses to those extra, shared, regs. The data structure
+ * to manage those registers is stored in cpu_hw_event.
+ */
+struct extra_reg {
+ unsigned int event;
+ unsigned int msr;
+ u64 config_mask;
+ u64 valid_mask;
+ int idx; /* per_xxx->regs[] reg index */
+ bool extra_msr_access;
+};
+
+#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
+ .event = (e), \
+ .msr = (ms), \
+ .config_mask = (m), \
+ .valid_mask = (vm), \
+ .idx = EXTRA_REG_##i, \
+ .extra_msr_access = true, \
+ }
+
+#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
+ EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
+
+#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \
+ EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \
+ ARCH_PERFMON_EVENTSEL_UMASK, vm, idx)
+
+#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \
+ INTEL_UEVENT_EXTRA_REG(c, \
+ MSR_PEBS_LD_LAT_THRESHOLD, \
+ 0xffff, \
+ LDLAT)
+
+#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
+
+union perf_capabilities {
+ struct {
+ u64 lbr_format:6;
+ u64 pebs_trap:1;
+ u64 pebs_arch_reg:1;
+ u64 pebs_format:4;
+ u64 smm_freeze:1;
+ /*
+ * PMU supports separate counter range for writing
+ * values > 32bit.
+ */
+ u64 full_width_write:1;
+ };
+ u64 capabilities;
+};
+
+struct x86_pmu_quirk {
+ struct x86_pmu_quirk *next;
+ void (*func)(void);
+};
+
+union x86_pmu_config {
+ struct {
+ u64 event:8,
+ umask:8,
+ usr:1,
+ os:1,
+ edge:1,
+ pc:1,
+ interrupt:1,
+ __reserved1:1,
+ en:1,
+ inv:1,
+ cmask:8,
+ event2:4,
+ __reserved2:4,
+ go:1,
+ ho:1;
+ } bits;
+ u64 value;
+};
+
+#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
+
+enum {
+ x86_lbr_exclusive_lbr,
+ x86_lbr_exclusive_bts,
+ x86_lbr_exclusive_pt,
+ x86_lbr_exclusive_max,
+};
+
+/*
+ * struct x86_pmu - generic x86 pmu
+ */
+struct x86_pmu {
+ /*
+ * Generic x86 PMC bits
+ */
+ const char *name;
+ int version;
+ int (*handle_irq)(struct pt_regs *);
+ void (*disable_all)(void);
+ void (*enable_all)(int added);
+ void (*enable)(struct perf_event *);
+ void (*disable)(struct perf_event *);
+ void (*add)(struct perf_event *);
+ void (*del)(struct perf_event *);
+ void (*read)(struct perf_event *event);
+ int (*hw_config)(struct perf_event *event);
+ int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
+ unsigned eventsel;
+ unsigned perfctr;
+ int (*addr_offset)(int index, bool eventsel);
+ int (*rdpmc_index)(int index);
+ u64 (*event_map)(int);
+ int max_events;
+ int num_counters;
+ int num_counters_fixed;
+ int cntval_bits;
+ u64 cntval_mask;
+ union {
+ unsigned long events_maskl;
+ unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)];
+ };
+ int events_mask_len;
+ int apic;
+ u64 max_period;
+ struct event_constraint *
+ (*get_event_constraints)(struct cpu_hw_events *cpuc,
+ int idx,
+ struct perf_event *event);
+
+ void (*put_event_constraints)(struct cpu_hw_events *cpuc,
+ struct perf_event *event);
+
+ void (*start_scheduling)(struct cpu_hw_events *cpuc);
+
+ void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr);
+
+ void (*stop_scheduling)(struct cpu_hw_events *cpuc);
+
+ struct event_constraint *event_constraints;
+ struct x86_pmu_quirk *quirks;
+ int perfctr_second_write;
+ bool late_ack;
+ u64 (*limit_period)(struct perf_event *event, u64 l);
+
+ /*
+ * sysfs attrs
+ */
+ int attr_rdpmc_broken;
+ int attr_rdpmc;
+ struct attribute **format_attrs;
+ struct attribute **event_attrs;
+ struct attribute **caps_attrs;
+
+ ssize_t (*events_sysfs_show)(char *page, u64 config);
+ struct attribute **cpu_events;
+
+ unsigned long attr_freeze_on_smi;
+ struct attribute **attrs;
+
+ /*
+ * CPU Hotplug hooks
+ */
+ int (*cpu_prepare)(int cpu);
+ void (*cpu_starting)(int cpu);
+ void (*cpu_dying)(int cpu);
+ void (*cpu_dead)(int cpu);
+
+ void (*check_microcode)(void);
+ void (*sched_task)(struct perf_event_context *ctx,
+ bool sched_in);
+
+ /*
+ * Intel Arch Perfmon v2+
+ */
+ u64 intel_ctrl;
+ union perf_capabilities intel_cap;
+
+ /*
+ * Intel DebugStore bits
+ */
+ unsigned int bts :1,
+ bts_active :1,
+ pebs :1,
+ pebs_active :1,
+ pebs_broken :1,
+ pebs_prec_dist :1,
+ pebs_no_tlb :1;
+ int pebs_record_size;
+ int pebs_buffer_size;
+ void (*drain_pebs)(struct pt_regs *regs);
+ struct event_constraint *pebs_constraints;
+ void (*pebs_aliases)(struct perf_event *event);
+ int max_pebs_events;
+ unsigned long large_pebs_flags;
+
+ /*
+ * Intel LBR
+ */
+ unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
+ int lbr_nr; /* hardware stack size */
+ u64 lbr_sel_mask; /* LBR_SELECT valid bits */
+ const int *lbr_sel_map; /* lbr_select mappings */
+ bool lbr_double_abort; /* duplicated lbr aborts */
+ bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */
+
+ /*
+ * Intel PT/LBR/BTS are exclusive
+ */
+ atomic_t lbr_exclusive[x86_lbr_exclusive_max];
+
+ /*
+ * AMD bits
+ */
+ unsigned int amd_nb_constraints : 1;
+
+ /*
+ * Extra registers for events
+ */
+ struct extra_reg *extra_regs;
+ unsigned int flags;
+
+ /*
+ * Intel host/guest support (KVM)
+ */
+ struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
+
+ /*
+ * Check period value for PERF_EVENT_IOC_PERIOD ioctl.
+ */
+ int (*check_period) (struct perf_event *event, u64 period);
+};
+
+struct x86_perf_task_context {
+ u64 lbr_from[MAX_LBR_ENTRIES];
+ u64 lbr_to[MAX_LBR_ENTRIES];
+ u64 lbr_info[MAX_LBR_ENTRIES];
+ int tos;
+ int valid_lbrs;
+ int lbr_callstack_users;
+ int lbr_stack_state;
+ int log_id;
+};
+
+#define x86_add_quirk(func_) \
+do { \
+ static struct x86_pmu_quirk __quirk __initdata = { \
+ .func = func_, \
+ }; \
+ __quirk.next = x86_pmu.quirks; \
+ x86_pmu.quirks = &__quirk; \
+} while (0)
+
+/*
+ * x86_pmu flags
+ */
+#define PMU_FL_NO_HT_SHARING 0x1 /* no hyper-threading resource sharing */
+#define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */
+#define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */
+#define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */
+#define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */
+#define PMU_FL_TFA 0x20 /* deal with TSX force abort */
+
+#define EVENT_VAR(_id) event_attr_##_id
+#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
+
+#define EVENT_ATTR(_name, _id) \
+static struct perf_pmu_events_attr EVENT_VAR(_id) = { \
+ .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \
+ .id = PERF_COUNT_HW_##_id, \
+ .event_str = NULL, \
+};
+
+#define EVENT_ATTR_STR(_name, v, str) \
+static struct perf_pmu_events_attr event_attr_##v = { \
+ .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \
+ .id = 0, \
+ .event_str = str, \
+};
+
+#define EVENT_ATTR_STR_HT(_name, v, noht, ht) \
+static struct perf_pmu_events_ht_attr event_attr_##v = { \
+ .attr = __ATTR(_name, 0444, events_ht_sysfs_show, NULL),\
+ .id = 0, \
+ .event_str_noht = noht, \
+ .event_str_ht = ht, \
+}
+
+extern struct x86_pmu x86_pmu __read_mostly;
+
+static inline bool x86_pmu_has_lbr_callstack(void)
+{
+ return x86_pmu.lbr_sel_map &&
+ x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
+}
+
+DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
+
+int x86_perf_event_set_period(struct perf_event *event);
+
+/*
+ * Generalized hw caching related hw_event table, filled
+ * in on a per model basis. A value of 0 means
+ * 'not supported', -1 means 'hw_event makes no sense on
+ * this CPU', any other value means the raw hw_event
+ * ID.
+ */
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+extern u64 __read_mostly hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX];
+extern u64 __read_mostly hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+u64 x86_perf_event_update(struct perf_event *event);
+
+static inline unsigned int x86_pmu_config_addr(int index)
+{
+ return x86_pmu.eventsel + (x86_pmu.addr_offset ?
+ x86_pmu.addr_offset(index, true) : index);
+}
+
+static inline unsigned int x86_pmu_event_addr(int index)
+{
+ return x86_pmu.perfctr + (x86_pmu.addr_offset ?
+ x86_pmu.addr_offset(index, false) : index);
+}
+
+static inline int x86_pmu_rdpmc_index(int index)
+{
+ return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
+}
+
+int x86_add_exclusive(unsigned int what);
+
+void x86_del_exclusive(unsigned int what);
+
+int x86_reserve_hardware(void);
+
+void x86_release_hardware(void);
+
+int x86_pmu_max_precise(void);
+
+void hw_perf_lbr_event_destroy(struct perf_event *event);
+
+int x86_setup_perfctr(struct perf_event *event);
+
+int x86_pmu_hw_config(struct perf_event *event);
+
+void x86_pmu_disable_all(void);
+
+static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
+ u64 enable_mask)
+{
+ u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
+
+ if (hwc->extra_reg.reg)
+ wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
+ wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
+}
+
+void x86_pmu_enable_all(int added);
+
+int perf_assign_events(struct event_constraint **constraints, int n,
+ int wmin, int wmax, int gpmax, int *assign);
+int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
+
+void x86_pmu_stop(struct perf_event *event, int flags);
+
+static inline void x86_pmu_disable_event(struct perf_event *event)
+{
+ u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrl(hwc->config_base, hwc->config & ~disable_mask);
+}
+
+void x86_pmu_enable_event(struct perf_event *event);
+
+int x86_pmu_handle_irq(struct pt_regs *regs);
+
+extern struct event_constraint emptyconstraint;
+
+extern struct event_constraint unconstrained;
+
+static inline bool kernel_ip(unsigned long ip)
+{
+#ifdef CONFIG_X86_32
+ return ip > PAGE_OFFSET;
+#else
+ return (long)ip < 0;
+#endif
+}
+
+/*
+ * Not all PMUs provide the right context information to place the reported IP
+ * into full context. Specifically segment registers are typically not
+ * supplied.
+ *
+ * Assuming the address is a linear address (it is for IBS), we fake the CS and
+ * vm86 mode using the known zero-based code segment and 'fix up' the registers
+ * to reflect this.
+ *
+ * Intel PEBS/LBR appear to typically provide the effective address, nothing
+ * much we can do about that but pray and treat it like a linear address.
+ */
+static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
+{
+ regs->cs = kernel_ip(ip) ? __KERNEL_CS : __USER_CS;
+ if (regs->flags & X86_VM_MASK)
+ regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK);
+ regs->ip = ip;
+}
+
+ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);
+ssize_t intel_event_sysfs_show(char *page, u64 config);
+
+struct attribute **merge_attr(struct attribute **a, struct attribute **b);
+
+ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
+ char *page);
+ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
+ char *page);
+
+#ifdef CONFIG_CPU_SUP_AMD
+
+int amd_pmu_init(void);
+
+#else /* CONFIG_CPU_SUP_AMD */
+
+static inline int amd_pmu_init(void)
+{
+ return 0;
+}
+
+#endif /* CONFIG_CPU_SUP_AMD */
+
+#ifdef CONFIG_CPU_SUP_INTEL
+
+static inline bool intel_pmu_has_bts_period(struct perf_event *event, u64 period)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned int hw_event, bts_event;
+
+ if (event->attr.freq)
+ return false;
+
+ hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
+ bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+
+ return hw_event == bts_event && period == 1;
+}
+
+static inline bool intel_pmu_has_bts(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ return intel_pmu_has_bts_period(event, hwc->sample_period);
+}
+
+int intel_pmu_save_and_restart(struct perf_event *event);
+
+struct event_constraint *
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event);
+
+extern int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu);
+extern void intel_cpuc_finish(struct cpu_hw_events *cpuc);
+
+int intel_pmu_init(void);
+
+void init_debug_store_on_cpu(int cpu);
+
+void fini_debug_store_on_cpu(int cpu);
+
+void release_ds_buffers(void);
+
+void reserve_ds_buffers(void);
+
+extern struct event_constraint bts_constraint;
+
+void intel_pmu_enable_bts(u64 config);
+
+void intel_pmu_disable_bts(void);
+
+int intel_pmu_drain_bts_buffer(void);
+
+extern struct event_constraint intel_core2_pebs_event_constraints[];
+
+extern struct event_constraint intel_atom_pebs_event_constraints[];
+
+extern struct event_constraint intel_slm_pebs_event_constraints[];
+
+extern struct event_constraint intel_glm_pebs_event_constraints[];
+
+extern struct event_constraint intel_glp_pebs_event_constraints[];
+
+extern struct event_constraint intel_nehalem_pebs_event_constraints[];
+
+extern struct event_constraint intel_westmere_pebs_event_constraints[];
+
+extern struct event_constraint intel_snb_pebs_event_constraints[];
+
+extern struct event_constraint intel_ivb_pebs_event_constraints[];
+
+extern struct event_constraint intel_hsw_pebs_event_constraints[];
+
+extern struct event_constraint intel_bdw_pebs_event_constraints[];
+
+extern struct event_constraint intel_skl_pebs_event_constraints[];
+
+struct event_constraint *intel_pebs_constraints(struct perf_event *event);
+
+void intel_pmu_pebs_add(struct perf_event *event);
+
+void intel_pmu_pebs_del(struct perf_event *event);
+
+void intel_pmu_pebs_enable(struct perf_event *event);
+
+void intel_pmu_pebs_disable(struct perf_event *event);
+
+void intel_pmu_pebs_enable_all(void);
+
+void intel_pmu_pebs_disable_all(void);
+
+void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
+
+void intel_pmu_auto_reload_read(struct perf_event *event);
+
+void intel_ds_init(void);
+
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+
+u64 lbr_from_signext_quirk_wr(u64 val);
+
+void intel_pmu_lbr_reset(void);
+
+void intel_pmu_lbr_add(struct perf_event *event);
+
+void intel_pmu_lbr_del(struct perf_event *event);
+
+void intel_pmu_lbr_enable_all(bool pmi);
+
+void intel_pmu_lbr_disable_all(void);
+
+void intel_pmu_lbr_read(void);
+
+void intel_pmu_lbr_init_core(void);
+
+void intel_pmu_lbr_init_nhm(void);
+
+void intel_pmu_lbr_init_atom(void);
+
+void intel_pmu_lbr_init_slm(void);
+
+void intel_pmu_lbr_init_snb(void);
+
+void intel_pmu_lbr_init_hsw(void);
+
+void intel_pmu_lbr_init_skl(void);
+
+void intel_pmu_lbr_init_knl(void);
+
+void intel_pmu_pebs_data_source_nhm(void);
+
+void intel_pmu_pebs_data_source_skl(bool pmem);
+
+int intel_pmu_setup_lbr_filter(struct perf_event *event);
+
+void intel_pt_interrupt(void);
+
+int intel_bts_interrupt(void);
+
+void intel_bts_enable_local(void);
+
+void intel_bts_disable_local(void);
+
+int p4_pmu_init(void);
+
+int p6_pmu_init(void);
+
+int knc_pmu_init(void);
+
+static inline int is_ht_workaround_enabled(void)
+{
+ return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
+}
+
+#else /* CONFIG_CPU_SUP_INTEL */
+
+static inline void reserve_ds_buffers(void)
+{
+}
+
+static inline void release_ds_buffers(void)
+{
+}
+
+static inline int intel_pmu_init(void)
+{
+ return 0;
+}
+
+static inline int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
+{
+ return 0;
+}
+
+static inline void intel_cpuc_finish(struct cpu_hw_events *cpuc)
+{
+}
+
+static inline int is_ht_workaround_enabled(void)
+{
+ return 0;
+}
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
new file mode 100644
index 000000000..b21ee65c4
--- /dev/null
+++ b/arch/x86/hyperv/Makefile
@@ -0,0 +1,2 @@
+obj-y := hv_init.o mmu.o nested.o
+obj-$(CONFIG_X86_64) += hv_apic.o
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
new file mode 100644
index 000000000..2c43e3055
--- /dev/null
+++ b/arch/x86/hyperv/hv_apic.c
@@ -0,0 +1,266 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Hyper-V specific APIC code.
+ *
+ * Copyright (C) 2018, Microsoft, Inc.
+ *
+ * Author : K. Y. Srinivasan <kys@microsoft.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/clockchips.h>
+#include <linux/hyperv.h>
+#include <linux/slab.h>
+#include <linux/cpuhotplug.h>
+#include <asm/hypervisor.h>
+#include <asm/mshyperv.h>
+#include <asm/apic.h>
+
+#include <asm/trace/hyperv.h>
+
+static struct apic orig_apic;
+
+static u64 hv_apic_icr_read(void)
+{
+ u64 reg_val;
+
+ rdmsrl(HV_X64_MSR_ICR, reg_val);
+ return reg_val;
+}
+
+static void hv_apic_icr_write(u32 low, u32 id)
+{
+ u64 reg_val;
+
+ reg_val = SET_APIC_DEST_FIELD(id);
+ reg_val = reg_val << 32;
+ reg_val |= low;
+
+ wrmsrl(HV_X64_MSR_ICR, reg_val);
+}
+
+static u32 hv_apic_read(u32 reg)
+{
+ u32 reg_val, hi;
+
+ switch (reg) {
+ case APIC_EOI:
+ rdmsr(HV_X64_MSR_EOI, reg_val, hi);
+ return reg_val;
+ case APIC_TASKPRI:
+ rdmsr(HV_X64_MSR_TPR, reg_val, hi);
+ return reg_val;
+
+ default:
+ return native_apic_mem_read(reg);
+ }
+}
+
+static void hv_apic_write(u32 reg, u32 val)
+{
+ switch (reg) {
+ case APIC_EOI:
+ wrmsr(HV_X64_MSR_EOI, val, 0);
+ break;
+ case APIC_TASKPRI:
+ wrmsr(HV_X64_MSR_TPR, val, 0);
+ break;
+ default:
+ native_apic_mem_write(reg, val);
+ }
+}
+
+static void hv_apic_eoi_write(u32 reg, u32 val)
+{
+ wrmsr(HV_X64_MSR_EOI, val, 0);
+}
+
+/*
+ * IPI implementation on Hyper-V.
+ */
+static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector)
+{
+ struct hv_send_ipi_ex **arg;
+ struct hv_send_ipi_ex *ipi_arg;
+ unsigned long flags;
+ int nr_bank = 0;
+ int ret = 1;
+
+ if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
+ return false;
+
+ local_irq_save(flags);
+ arg = (struct hv_send_ipi_ex **)this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ ipi_arg = *arg;
+ if (unlikely(!ipi_arg))
+ goto ipi_mask_ex_done;
+
+ ipi_arg->vector = vector;
+ ipi_arg->reserved = 0;
+ ipi_arg->vp_set.valid_bank_mask = 0;
+
+ if (!cpumask_equal(mask, cpu_present_mask)) {
+ ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask);
+ }
+ if (nr_bank < 0)
+ goto ipi_mask_ex_done;
+ if (!nr_bank)
+ ipi_arg->vp_set.format = HV_GENERIC_SET_ALL;
+
+ ret = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank,
+ ipi_arg, NULL);
+
+ipi_mask_ex_done:
+ local_irq_restore(flags);
+ return ((ret == 0) ? true : false);
+}
+
+static bool __send_ipi_mask(const struct cpumask *mask, int vector)
+{
+ int cur_cpu, vcpu;
+ struct hv_send_ipi ipi_arg;
+ int ret = 1;
+
+ trace_hyperv_send_ipi_mask(mask, vector);
+
+ if (cpumask_empty(mask))
+ return true;
+
+ if (!hv_hypercall_pg)
+ return false;
+
+ if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
+ return false;
+
+ /*
+ * From the supplied CPU set we need to figure out if we can get away
+ * with cheaper HVCALL_SEND_IPI hypercall. This is possible when the
+ * highest VP number in the set is < 64. As VP numbers are usually in
+ * ascending order and match Linux CPU ids, here is an optimization:
+ * we check the VP number for the highest bit in the supplied set first
+ * so we can quickly find out if using HVCALL_SEND_IPI_EX hypercall is
+ * a must. We will also check all VP numbers when walking the supplied
+ * CPU set to remain correct in all cases.
+ */
+ if (hv_cpu_number_to_vp_number(cpumask_last(mask)) >= 64)
+ goto do_ex_hypercall;
+
+ ipi_arg.vector = vector;
+ ipi_arg.cpu_mask = 0;
+
+ for_each_cpu(cur_cpu, mask) {
+ vcpu = hv_cpu_number_to_vp_number(cur_cpu);
+ if (vcpu == VP_INVAL)
+ return false;
+
+ /*
+ * This particular version of the IPI hypercall can
+ * only target upto 64 CPUs.
+ */
+ if (vcpu >= 64)
+ goto do_ex_hypercall;
+
+ __set_bit(vcpu, (unsigned long *)&ipi_arg.cpu_mask);
+ }
+
+ ret = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector,
+ ipi_arg.cpu_mask);
+ return ((ret == 0) ? true : false);
+
+do_ex_hypercall:
+ return __send_ipi_mask_ex(mask, vector);
+}
+
+static bool __send_ipi_one(int cpu, int vector)
+{
+ struct cpumask mask = CPU_MASK_NONE;
+
+ cpumask_set_cpu(cpu, &mask);
+ return __send_ipi_mask(&mask, vector);
+}
+
+static void hv_send_ipi(int cpu, int vector)
+{
+ if (!__send_ipi_one(cpu, vector))
+ orig_apic.send_IPI(cpu, vector);
+}
+
+static void hv_send_ipi_mask(const struct cpumask *mask, int vector)
+{
+ if (!__send_ipi_mask(mask, vector))
+ orig_apic.send_IPI_mask(mask, vector);
+}
+
+static void hv_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ struct cpumask new_mask;
+ const struct cpumask *local_mask;
+
+ cpumask_copy(&new_mask, mask);
+ cpumask_clear_cpu(this_cpu, &new_mask);
+ local_mask = &new_mask;
+ if (!__send_ipi_mask(local_mask, vector))
+ orig_apic.send_IPI_mask_allbutself(mask, vector);
+}
+
+static void hv_send_ipi_allbutself(int vector)
+{
+ hv_send_ipi_mask_allbutself(cpu_online_mask, vector);
+}
+
+static void hv_send_ipi_all(int vector)
+{
+ if (!__send_ipi_mask(cpu_online_mask, vector))
+ orig_apic.send_IPI_all(vector);
+}
+
+static void hv_send_ipi_self(int vector)
+{
+ if (!__send_ipi_one(smp_processor_id(), vector))
+ orig_apic.send_IPI_self(vector);
+}
+
+void __init hv_apic_init(void)
+{
+ if (ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) {
+ pr_info("Hyper-V: Using IPI hypercalls\n");
+ /*
+ * Set the IPI entry points.
+ */
+ orig_apic = *apic;
+
+ apic->send_IPI = hv_send_ipi;
+ apic->send_IPI_mask = hv_send_ipi_mask;
+ apic->send_IPI_mask_allbutself = hv_send_ipi_mask_allbutself;
+ apic->send_IPI_allbutself = hv_send_ipi_allbutself;
+ apic->send_IPI_all = hv_send_ipi_all;
+ apic->send_IPI_self = hv_send_ipi_self;
+ }
+
+ if (ms_hyperv.hints & HV_X64_APIC_ACCESS_RECOMMENDED) {
+ pr_info("Hyper-V: Using MSR based APIC access\n");
+ apic_set_eoi_write(hv_apic_eoi_write);
+ apic->read = hv_apic_read;
+ apic->write = hv_apic_write;
+ apic->icr_write = hv_apic_icr_write;
+ apic->icr_read = hv_apic_icr_read;
+ }
+}
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
new file mode 100644
index 000000000..38c31e164
--- /dev/null
+++ b/arch/x86/hyperv/hv_init.c
@@ -0,0 +1,516 @@
+/*
+ * X86 specific Hyper-V initialization code.
+ *
+ * Copyright (C) 2016, Microsoft, Inc.
+ *
+ * Author : K. Y. Srinivasan <kys@microsoft.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ */
+
+#include <linux/efi.h>
+#include <linux/types.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+#include <asm/hypervisor.h>
+#include <asm/hyperv-tlfs.h>
+#include <asm/mshyperv.h>
+#include <linux/version.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/clockchips.h>
+#include <linux/hyperv.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/cpuhotplug.h>
+
+#ifdef CONFIG_HYPERV_TSCPAGE
+
+static struct ms_hyperv_tsc_page *tsc_pg;
+
+struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
+{
+ return tsc_pg;
+}
+EXPORT_SYMBOL_GPL(hv_get_tsc_page);
+
+static u64 read_hv_clock_tsc(struct clocksource *arg)
+{
+ u64 current_tick = hv_read_tsc_page(tsc_pg);
+
+ if (current_tick == U64_MAX)
+ rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick);
+
+ return current_tick;
+}
+
+static struct clocksource hyperv_cs_tsc = {
+ .name = "hyperv_clocksource_tsc_page",
+ .rating = 400,
+ .read = read_hv_clock_tsc,
+ .mask = CLOCKSOURCE_MASK(64),
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+#endif
+
+static u64 read_hv_clock_msr(struct clocksource *arg)
+{
+ u64 current_tick;
+ /*
+ * Read the partition counter to get the current tick count. This count
+ * is set to 0 when the partition is created and is incremented in
+ * 100 nanosecond units.
+ */
+ rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick);
+ return current_tick;
+}
+
+static struct clocksource hyperv_cs_msr = {
+ .name = "hyperv_clocksource_msr",
+ .rating = 400,
+ .read = read_hv_clock_msr,
+ .mask = CLOCKSOURCE_MASK(64),
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+void *hv_hypercall_pg;
+EXPORT_SYMBOL_GPL(hv_hypercall_pg);
+struct clocksource *hyperv_cs;
+EXPORT_SYMBOL_GPL(hyperv_cs);
+
+u32 *hv_vp_index;
+EXPORT_SYMBOL_GPL(hv_vp_index);
+
+struct hv_vp_assist_page **hv_vp_assist_page;
+EXPORT_SYMBOL_GPL(hv_vp_assist_page);
+
+void __percpu **hyperv_pcpu_input_arg;
+EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg);
+
+u32 hv_max_vp_index;
+
+static int hv_cpu_init(unsigned int cpu)
+{
+ u64 msr_vp_index;
+ struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()];
+ void **input_arg;
+ struct page *pg;
+
+ input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
+ pg = alloc_page(GFP_KERNEL);
+ if (unlikely(!pg))
+ return -ENOMEM;
+ *input_arg = page_address(pg);
+
+ hv_get_vp_index(msr_vp_index);
+
+ hv_vp_index[smp_processor_id()] = msr_vp_index;
+
+ if (msr_vp_index > hv_max_vp_index)
+ hv_max_vp_index = msr_vp_index;
+
+ if (!hv_vp_assist_page)
+ return 0;
+
+ if (!*hvp)
+ *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL);
+
+ if (*hvp) {
+ u64 val;
+
+ val = vmalloc_to_pfn(*hvp);
+ val = (val << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) |
+ HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
+
+ wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, val);
+ }
+
+ return 0;
+}
+
+static void (*hv_reenlightenment_cb)(void);
+
+static void hv_reenlightenment_notify(struct work_struct *dummy)
+{
+ struct hv_tsc_emulation_status emu_status;
+
+ rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status);
+
+ /* Don't issue the callback if TSC accesses are not emulated */
+ if (hv_reenlightenment_cb && emu_status.inprogress)
+ hv_reenlightenment_cb();
+}
+static DECLARE_DELAYED_WORK(hv_reenlightenment_work, hv_reenlightenment_notify);
+
+void hyperv_stop_tsc_emulation(void)
+{
+ u64 freq;
+ struct hv_tsc_emulation_status emu_status;
+
+ rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status);
+ emu_status.inprogress = 0;
+ wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status);
+
+ rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq);
+ tsc_khz = div64_u64(freq, 1000);
+}
+EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation);
+
+static inline bool hv_reenlightenment_available(void)
+{
+ /*
+ * Check for required features and priviliges to make TSC frequency
+ * change notifications work.
+ */
+ return ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE &&
+ ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT;
+}
+
+__visible void __irq_entry hyperv_reenlightenment_intr(struct pt_regs *regs)
+{
+ entering_ack_irq();
+
+ inc_irq_stat(irq_hv_reenlightenment_count);
+
+ schedule_delayed_work(&hv_reenlightenment_work, HZ/10);
+
+ exiting_irq();
+}
+
+void set_hv_tscchange_cb(void (*cb)(void))
+{
+ struct hv_reenlightenment_control re_ctrl = {
+ .vector = HYPERV_REENLIGHTENMENT_VECTOR,
+ .enabled = 1,
+ };
+ struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1};
+
+ if (!hv_reenlightenment_available()) {
+ pr_warn("Hyper-V: reenlightenment support is unavailable\n");
+ return;
+ }
+
+ if (!hv_vp_index)
+ return;
+
+ hv_reenlightenment_cb = cb;
+
+ /* Make sure callback is registered before we write to MSRs */
+ wmb();
+
+ re_ctrl.target_vp = hv_vp_index[get_cpu()];
+
+ wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
+ wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl));
+
+ put_cpu();
+}
+EXPORT_SYMBOL_GPL(set_hv_tscchange_cb);
+
+void clear_hv_tscchange_cb(void)
+{
+ struct hv_reenlightenment_control re_ctrl;
+
+ if (!hv_reenlightenment_available())
+ return;
+
+ rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl);
+ re_ctrl.enabled = 0;
+ wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl);
+
+ hv_reenlightenment_cb = NULL;
+}
+EXPORT_SYMBOL_GPL(clear_hv_tscchange_cb);
+
+static int hv_cpu_die(unsigned int cpu)
+{
+ struct hv_reenlightenment_control re_ctrl;
+ unsigned int new_cpu;
+ unsigned long flags;
+ void **input_arg;
+ void *input_pg = NULL;
+
+ local_irq_save(flags);
+ input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
+ input_pg = *input_arg;
+ *input_arg = NULL;
+ local_irq_restore(flags);
+ free_page((unsigned long)input_pg);
+
+ if (hv_vp_assist_page && hv_vp_assist_page[cpu])
+ wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0);
+
+ if (hv_reenlightenment_cb == NULL)
+ return 0;
+
+ rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
+ if (re_ctrl.target_vp == hv_vp_index[cpu]) {
+ /* Reassign to some other online CPU */
+ new_cpu = cpumask_any_but(cpu_online_mask, cpu);
+
+ re_ctrl.target_vp = hv_vp_index[new_cpu];
+ wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
+ }
+
+ return 0;
+}
+
+static int __init hv_pci_init(void)
+{
+ int gen2vm = efi_enabled(EFI_BOOT);
+
+ /*
+ * For Generation-2 VM, we exit from pci_arch_init() by returning 0.
+ * The purpose is to suppress the harmless warning:
+ * "PCI: Fatal: No config space access function found"
+ */
+ if (gen2vm)
+ return 0;
+
+ /* For Generation-1 VM, we'll proceed in pci_arch_init(). */
+ return 1;
+}
+
+/*
+ * This function is to be invoked early in the boot sequence after the
+ * hypervisor has been detected.
+ *
+ * 1. Setup the hypercall page.
+ * 2. Register Hyper-V specific clocksource.
+ * 3. Setup Hyper-V specific APIC entry points.
+ */
+void __init hyperv_init(void)
+{
+ u64 guest_id, required_msrs;
+ union hv_x64_msr_hypercall_contents hypercall_msr;
+ int cpuhp, i;
+
+ if (x86_hyper_type != X86_HYPER_MS_HYPERV)
+ return;
+
+ /* Absolutely required MSRs */
+ required_msrs = HV_X64_MSR_HYPERCALL_AVAILABLE |
+ HV_X64_MSR_VP_INDEX_AVAILABLE;
+
+ if ((ms_hyperv.features & required_msrs) != required_msrs)
+ return;
+
+ /*
+ * Allocate the per-CPU state for the hypercall input arg.
+ * If this allocation fails, we will not be able to setup
+ * (per-CPU) hypercall input page and thus this failure is
+ * fatal on Hyper-V.
+ */
+ hyperv_pcpu_input_arg = alloc_percpu(void *);
+
+ BUG_ON(hyperv_pcpu_input_arg == NULL);
+
+ /* Allocate percpu VP index */
+ hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index),
+ GFP_KERNEL);
+ if (!hv_vp_index)
+ return;
+
+ for (i = 0; i < num_possible_cpus(); i++)
+ hv_vp_index[i] = VP_INVAL;
+
+ hv_vp_assist_page = kcalloc(num_possible_cpus(),
+ sizeof(*hv_vp_assist_page), GFP_KERNEL);
+ if (!hv_vp_assist_page) {
+ ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
+ goto free_vp_index;
+ }
+
+ cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online",
+ hv_cpu_init, hv_cpu_die);
+ if (cpuhp < 0)
+ goto free_vp_assist_page;
+
+ /*
+ * Setup the hypercall page and enable hypercalls.
+ * 1. Register the guest ID
+ * 2. Enable the hypercall and register the hypercall page
+ */
+ guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0);
+ wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id);
+
+ hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX);
+ if (hv_hypercall_pg == NULL) {
+ wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
+ goto remove_cpuhp_state;
+ }
+
+ rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+ hypercall_msr.enable = 1;
+ hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
+ wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+
+ hv_apic_init();
+
+ x86_init.pci.arch_init = hv_pci_init;
+
+ /*
+ * Register Hyper-V specific clocksource.
+ */
+#ifdef CONFIG_HYPERV_TSCPAGE
+ if (ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE) {
+ union hv_x64_msr_hypercall_contents tsc_msr;
+
+ tsc_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL);
+ if (!tsc_pg)
+ goto register_msr_cs;
+
+ hyperv_cs = &hyperv_cs_tsc;
+
+ rdmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
+
+ tsc_msr.enable = 1;
+ tsc_msr.guest_physical_address = vmalloc_to_pfn(tsc_pg);
+
+ wrmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
+
+ hyperv_cs_tsc.archdata.vclock_mode = VCLOCK_HVCLOCK;
+
+ clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100);
+ return;
+ }
+register_msr_cs:
+#endif
+ /*
+ * For 32 bit guests just use the MSR based mechanism for reading
+ * the partition counter.
+ */
+
+ hyperv_cs = &hyperv_cs_msr;
+ if (ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE)
+ clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100);
+
+ return;
+
+remove_cpuhp_state:
+ cpuhp_remove_state(cpuhp);
+free_vp_assist_page:
+ kfree(hv_vp_assist_page);
+ hv_vp_assist_page = NULL;
+free_vp_index:
+ kfree(hv_vp_index);
+ hv_vp_index = NULL;
+}
+
+/*
+ * This routine is called before kexec/kdump, it does the required cleanup.
+ */
+void hyperv_cleanup(void)
+{
+ union hv_x64_msr_hypercall_contents hypercall_msr;
+
+ /* Reset our OS id */
+ wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
+
+ /*
+ * Reset hypercall page reference before reset the page,
+ * let hypercall operations fail safely rather than
+ * panic the kernel for using invalid hypercall page
+ */
+ hv_hypercall_pg = NULL;
+
+ /* Reset the hypercall page */
+ hypercall_msr.as_uint64 = 0;
+ wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+
+ /* Reset the TSC page */
+ hypercall_msr.as_uint64 = 0;
+ wrmsrl(HV_X64_MSR_REFERENCE_TSC, hypercall_msr.as_uint64);
+}
+EXPORT_SYMBOL_GPL(hyperv_cleanup);
+
+void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die)
+{
+ static bool panic_reported;
+ u64 guest_id;
+
+ if (in_die && !panic_on_oops)
+ return;
+
+ /*
+ * We prefer to report panic on 'die' chain as we have proper
+ * registers to report, but if we miss it (e.g. on BUG()) we need
+ * to report it on 'panic'.
+ */
+ if (panic_reported)
+ return;
+ panic_reported = true;
+
+ rdmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id);
+
+ wrmsrl(HV_X64_MSR_CRASH_P0, err);
+ wrmsrl(HV_X64_MSR_CRASH_P1, guest_id);
+ wrmsrl(HV_X64_MSR_CRASH_P2, regs->ip);
+ wrmsrl(HV_X64_MSR_CRASH_P3, regs->ax);
+ wrmsrl(HV_X64_MSR_CRASH_P4, regs->sp);
+
+ /*
+ * Let Hyper-V know there is crash data available
+ */
+ wrmsrl(HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_CRASH_NOTIFY);
+}
+EXPORT_SYMBOL_GPL(hyperv_report_panic);
+
+/**
+ * hyperv_report_panic_msg - report panic message to Hyper-V
+ * @pa: physical address of the panic page containing the message
+ * @size: size of the message in the page
+ */
+void hyperv_report_panic_msg(phys_addr_t pa, size_t size)
+{
+ /*
+ * P3 to contain the physical address of the panic page & P4 to
+ * contain the size of the panic data in that page. Rest of the
+ * registers are no-op when the NOTIFY_MSG flag is set.
+ */
+ wrmsrl(HV_X64_MSR_CRASH_P0, 0);
+ wrmsrl(HV_X64_MSR_CRASH_P1, 0);
+ wrmsrl(HV_X64_MSR_CRASH_P2, 0);
+ wrmsrl(HV_X64_MSR_CRASH_P3, pa);
+ wrmsrl(HV_X64_MSR_CRASH_P4, size);
+
+ /*
+ * Let Hyper-V know there is crash data available along with
+ * the panic message.
+ */
+ wrmsrl(HV_X64_MSR_CRASH_CTL,
+ (HV_CRASH_CTL_CRASH_NOTIFY | HV_CRASH_CTL_CRASH_NOTIFY_MSG));
+}
+EXPORT_SYMBOL_GPL(hyperv_report_panic_msg);
+
+bool hv_is_hyperv_initialized(void)
+{
+ union hv_x64_msr_hypercall_contents hypercall_msr;
+
+ /*
+ * Ensure that we're really on Hyper-V, and not a KVM or Xen
+ * emulation of Hyper-V
+ */
+ if (x86_hyper_type != X86_HYPER_MS_HYPERV)
+ return false;
+
+ /*
+ * Verify that earlier initialization succeeded by checking
+ * that the hypercall page is setup
+ */
+ hypercall_msr.as_uint64 = 0;
+ rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+
+ return hypercall_msr.enable;
+}
+EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized);
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
new file mode 100644
index 000000000..e666f7eaf
--- /dev/null
+++ b/arch/x86/hyperv/mmu.c
@@ -0,0 +1,244 @@
+#define pr_fmt(fmt) "Hyper-V: " fmt
+
+#include <linux/hyperv.h>
+#include <linux/log2.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include <asm/fpu/api.h>
+#include <asm/mshyperv.h>
+#include <asm/msr.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/hyperv.h>
+
+/* Each gva in gva_list encodes up to 4096 pages to flush */
+#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE)
+
+static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
+ const struct flush_tlb_info *info);
+
+/*
+ * Fills in gva_list starting from offset. Returns the number of items added.
+ */
+static inline int fill_gva_list(u64 gva_list[], int offset,
+ unsigned long start, unsigned long end)
+{
+ int gva_n = offset;
+ unsigned long cur = start, diff;
+
+ do {
+ diff = end > cur ? end - cur : 0;
+
+ gva_list[gva_n] = cur & PAGE_MASK;
+ /*
+ * Lower 12 bits encode the number of additional
+ * pages to flush (in addition to the 'cur' page).
+ */
+ if (diff >= HV_TLB_FLUSH_UNIT) {
+ gva_list[gva_n] |= ~PAGE_MASK;
+ cur += HV_TLB_FLUSH_UNIT;
+ } else if (diff) {
+ gva_list[gva_n] |= (diff - 1) >> PAGE_SHIFT;
+ cur = end;
+ }
+
+ gva_n++;
+
+ } while (cur < end);
+
+ return gva_n - offset;
+}
+
+static void hyperv_flush_tlb_others(const struct cpumask *cpus,
+ const struct flush_tlb_info *info)
+{
+ int cpu, vcpu, gva_n, max_gvas;
+ struct hv_tlb_flush **flush_pcpu;
+ struct hv_tlb_flush *flush;
+ u64 status = U64_MAX;
+ unsigned long flags;
+
+ trace_hyperv_mmu_flush_tlb_others(cpus, info);
+
+ if (!hv_hypercall_pg)
+ goto do_native;
+
+ local_irq_save(flags);
+
+ /*
+ * Only check the mask _after_ interrupt has been disabled to avoid the
+ * mask changing under our feet.
+ */
+ if (cpumask_empty(cpus)) {
+ local_irq_restore(flags);
+ return;
+ }
+
+ flush_pcpu = (struct hv_tlb_flush **)
+ this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ flush = *flush_pcpu;
+
+ if (unlikely(!flush)) {
+ local_irq_restore(flags);
+ goto do_native;
+ }
+
+ if (info->mm) {
+ /*
+ * AddressSpace argument must match the CR3 with PCID bits
+ * stripped out.
+ */
+ flush->address_space = virt_to_phys(info->mm->pgd);
+ flush->address_space &= CR3_ADDR_MASK;
+ flush->flags = 0;
+ } else {
+ flush->address_space = 0;
+ flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ }
+
+ flush->processor_mask = 0;
+ if (cpumask_equal(cpus, cpu_present_mask)) {
+ flush->flags |= HV_FLUSH_ALL_PROCESSORS;
+ } else {
+ /*
+ * From the supplied CPU set we need to figure out if we can get
+ * away with cheaper HVCALL_FLUSH_VIRTUAL_ADDRESS_{LIST,SPACE}
+ * hypercalls. This is possible when the highest VP number in
+ * the set is < 64. As VP numbers are usually in ascending order
+ * and match Linux CPU ids, here is an optimization: we check
+ * the VP number for the highest bit in the supplied set first
+ * so we can quickly find out if using *_EX hypercalls is a
+ * must. We will also check all VP numbers when walking the
+ * supplied CPU set to remain correct in all cases.
+ */
+ if (hv_cpu_number_to_vp_number(cpumask_last(cpus)) >= 64)
+ goto do_ex_hypercall;
+
+ for_each_cpu(cpu, cpus) {
+ vcpu = hv_cpu_number_to_vp_number(cpu);
+ if (vcpu == VP_INVAL) {
+ local_irq_restore(flags);
+ goto do_native;
+ }
+
+ if (vcpu >= 64)
+ goto do_ex_hypercall;
+
+ __set_bit(vcpu, (unsigned long *)
+ &flush->processor_mask);
+ }
+ }
+
+ /*
+ * We can flush not more than max_gvas with one hypercall. Flush the
+ * whole address space if we were asked to do more.
+ */
+ max_gvas = (PAGE_SIZE - sizeof(*flush)) / sizeof(flush->gva_list[0]);
+
+ if (info->end == TLB_FLUSH_ALL) {
+ flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
+ status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
+ flush, NULL);
+ } else if (info->end &&
+ ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
+ status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE,
+ flush, NULL);
+ } else {
+ gva_n = fill_gva_list(flush->gva_list, 0,
+ info->start, info->end);
+ status = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST,
+ gva_n, 0, flush, NULL);
+ }
+ goto check_status;
+
+do_ex_hypercall:
+ status = hyperv_flush_tlb_others_ex(cpus, info);
+
+check_status:
+ local_irq_restore(flags);
+
+ if (!(status & HV_HYPERCALL_RESULT_MASK))
+ return;
+do_native:
+ native_flush_tlb_others(cpus, info);
+}
+
+static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
+ const struct flush_tlb_info *info)
+{
+ int nr_bank = 0, max_gvas, gva_n;
+ struct hv_tlb_flush_ex **flush_pcpu;
+ struct hv_tlb_flush_ex *flush;
+ u64 status;
+
+ if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
+ return U64_MAX;
+
+ flush_pcpu = (struct hv_tlb_flush_ex **)
+ this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ flush = *flush_pcpu;
+
+ if (info->mm) {
+ /*
+ * AddressSpace argument must match the CR3 with PCID bits
+ * stripped out.
+ */
+ flush->address_space = virt_to_phys(info->mm->pgd);
+ flush->address_space &= CR3_ADDR_MASK;
+ flush->flags = 0;
+ } else {
+ flush->address_space = 0;
+ flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ }
+
+ flush->hv_vp_set.valid_bank_mask = 0;
+
+ flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ nr_bank = cpumask_to_vpset(&(flush->hv_vp_set), cpus);
+ if (nr_bank < 0)
+ return U64_MAX;
+
+ /*
+ * We can flush not more than max_gvas with one hypercall. Flush the
+ * whole address space if we were asked to do more.
+ */
+ max_gvas =
+ (PAGE_SIZE - sizeof(*flush) - nr_bank *
+ sizeof(flush->hv_vp_set.bank_contents[0])) /
+ sizeof(flush->gva_list[0]);
+
+ if (info->end == TLB_FLUSH_ALL) {
+ flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY;
+ status = hv_do_rep_hypercall(
+ HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
+ 0, nr_bank, flush, NULL);
+ } else if (info->end &&
+ ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) {
+ status = hv_do_rep_hypercall(
+ HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
+ 0, nr_bank, flush, NULL);
+ } else {
+ gva_n = fill_gva_list(flush->gva_list, nr_bank,
+ info->start, info->end);
+ status = hv_do_rep_hypercall(
+ HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX,
+ gva_n, nr_bank, flush, NULL);
+ }
+
+ return status;
+}
+
+void hyperv_setup_mmu_ops(void)
+{
+ if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED))
+ return;
+
+ pr_info("Using hypercall for remote TLB flush\n");
+ pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others;
+ pv_mmu_ops.tlb_remove_table = tlb_remove_table;
+}
diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c
new file mode 100644
index 000000000..b8e60cc50
--- /dev/null
+++ b/arch/x86/hyperv/nested.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Hyper-V nested virtualization code.
+ *
+ * Copyright (C) 2018, Microsoft, Inc.
+ *
+ * Author : Lan Tianyu <Tianyu.Lan@microsoft.com>
+ */
+
+
+#include <linux/types.h>
+#include <asm/hyperv-tlfs.h>
+#include <asm/mshyperv.h>
+#include <asm/tlbflush.h>
+
+#include <asm/trace/hyperv.h>
+
+int hyperv_flush_guest_mapping(u64 as)
+{
+ struct hv_guest_mapping_flush **flush_pcpu;
+ struct hv_guest_mapping_flush *flush;
+ u64 status;
+ unsigned long flags;
+ int ret = -ENOTSUPP;
+
+ if (!hv_hypercall_pg)
+ goto fault;
+
+ local_irq_save(flags);
+
+ flush_pcpu = (struct hv_guest_mapping_flush **)
+ this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ flush = *flush_pcpu;
+
+ if (unlikely(!flush)) {
+ local_irq_restore(flags);
+ goto fault;
+ }
+
+ flush->address_space = as;
+ flush->flags = 0;
+
+ status = hv_do_hypercall(HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE,
+ flush, NULL);
+ local_irq_restore(flags);
+
+ if (!(status & HV_HYPERCALL_RESULT_MASK))
+ ret = 0;
+
+fault:
+ trace_hyperv_nested_flush_guest_mapping(as, ret);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping);
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
new file mode 100644
index 000000000..cd4339bae
--- /dev/null
+++ b/arch/x86/ia32/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile for the ia32 kernel emulation subsystem.
+#
+
+obj-$(CONFIG_IA32_EMULATION) := sys_ia32.o ia32_signal.o
+
+obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
+
+audit-class-$(CONFIG_AUDIT) := audit.o
+obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y)
diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c
new file mode 100644
index 000000000..3d21eab7a
--- /dev/null
+++ b/arch/x86/ia32/audit.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/unistd_32.h>
+
+unsigned ia32_dir_class[] = {
+#include <asm-generic/audit_dir_write.h>
+~0U
+};
+
+unsigned ia32_chattr_class[] = {
+#include <asm-generic/audit_change_attr.h>
+~0U
+};
+
+unsigned ia32_write_class[] = {
+#include <asm-generic/audit_write.h>
+~0U
+};
+
+unsigned ia32_read_class[] = {
+#include <asm-generic/audit_read.h>
+~0U
+};
+
+unsigned ia32_signal_class[] = {
+#include <asm-generic/audit_signal.h>
+~0U
+};
+
+int ia32_classify_syscall(unsigned syscall)
+{
+ switch (syscall) {
+ case __NR_open:
+ return 2;
+ case __NR_openat:
+ return 3;
+ case __NR_socketcall:
+ return 4;
+ case __NR_execve:
+ case __NR_execveat:
+ return 5;
+ default:
+ return 1;
+ }
+}
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
new file mode 100644
index 000000000..3ebd77770
--- /dev/null
+++ b/arch/x86/ia32/ia32_aout.c
@@ -0,0 +1,489 @@
+/*
+ * a.out loader for x86-64
+ *
+ * Copyright (C) 1991, 1992, 1996 Linus Torvalds
+ * Hacked together by Andi Kleen
+ */
+
+#include <linux/module.h>
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/a.out.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <linux/ptrace.h>
+#include <linux/user.h>
+#include <linux/binfmts.h>
+#include <linux/personality.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/perf_event.h>
+#include <linux/sched/task_stack.h>
+
+#include <linux/uaccess.h>
+#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/user32.h>
+#include <asm/ia32.h>
+
+#undef WARN_OLD
+
+static int load_aout_binary(struct linux_binprm *);
+static int load_aout_library(struct file *);
+
+#ifdef CONFIG_COREDUMP
+static int aout_core_dump(struct coredump_params *);
+
+static unsigned long get_dr(int n)
+{
+ struct perf_event *bp = current->thread.ptrace_bps[n];
+ return bp ? bp->hw.info.address : 0;
+}
+
+/*
+ * fill in the user structure for a core dump..
+ */
+static void fill_dump(struct pt_regs *regs, struct user32 *dump)
+{
+ u32 fs, gs;
+ memset(dump, 0, sizeof(*dump));
+
+/* changed the size calculations - should hopefully work better. lbt */
+ dump->magic = CMAGIC;
+ dump->start_code = 0;
+ dump->start_stack = regs->sp & ~(PAGE_SIZE - 1);
+ dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
+ dump->u_dsize = ((unsigned long)
+ (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
+ dump->u_dsize -= dump->u_tsize;
+ dump->u_debugreg[0] = get_dr(0);
+ dump->u_debugreg[1] = get_dr(1);
+ dump->u_debugreg[2] = get_dr(2);
+ dump->u_debugreg[3] = get_dr(3);
+ dump->u_debugreg[6] = current->thread.debugreg6;
+ dump->u_debugreg[7] = current->thread.ptrace_dr7;
+
+ if (dump->start_stack < 0xc0000000) {
+ unsigned long tmp;
+
+ tmp = (unsigned long) (0xc0000000 - dump->start_stack);
+ dump->u_ssize = tmp >> PAGE_SHIFT;
+ }
+
+ dump->regs.ebx = regs->bx;
+ dump->regs.ecx = regs->cx;
+ dump->regs.edx = regs->dx;
+ dump->regs.esi = regs->si;
+ dump->regs.edi = regs->di;
+ dump->regs.ebp = regs->bp;
+ dump->regs.eax = regs->ax;
+ dump->regs.ds = current->thread.ds;
+ dump->regs.es = current->thread.es;
+ savesegment(fs, fs);
+ dump->regs.fs = fs;
+ savesegment(gs, gs);
+ dump->regs.gs = gs;
+ dump->regs.orig_eax = regs->orig_ax;
+ dump->regs.eip = regs->ip;
+ dump->regs.cs = regs->cs;
+ dump->regs.eflags = regs->flags;
+ dump->regs.esp = regs->sp;
+ dump->regs.ss = regs->ss;
+
+#if 1 /* FIXME */
+ dump->u_fpvalid = 0;
+#else
+ dump->u_fpvalid = dump_fpu(regs, &dump->i387);
+#endif
+}
+
+#endif
+
+static struct linux_binfmt aout_format = {
+ .module = THIS_MODULE,
+ .load_binary = load_aout_binary,
+ .load_shlib = load_aout_library,
+#ifdef CONFIG_COREDUMP
+ .core_dump = aout_core_dump,
+#endif
+ .min_coredump = PAGE_SIZE
+};
+
+static int set_brk(unsigned long start, unsigned long end)
+{
+ start = PAGE_ALIGN(start);
+ end = PAGE_ALIGN(end);
+ if (end <= start)
+ return 0;
+ return vm_brk(start, end - start);
+}
+
+#ifdef CONFIG_COREDUMP
+/*
+ * These are the only things you should do on a core-file: use only these
+ * macros to write out all the necessary info.
+ */
+
+#include <linux/coredump.h>
+
+#define START_DATA(u) (u.u_tsize << PAGE_SHIFT)
+#define START_STACK(u) (u.start_stack)
+
+/*
+ * Routine writes a core dump image in the current directory.
+ * Currently only a stub-function.
+ *
+ * Note that setuid/setgid files won't make a core-dump if the uid/gid
+ * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable"
+ * field, which also makes sure the core-dumps won't be recursive if the
+ * dumping of the process results in another error..
+ */
+
+static int aout_core_dump(struct coredump_params *cprm)
+{
+ mm_segment_t fs;
+ int has_dumped = 0;
+ unsigned long dump_start, dump_size;
+ struct user32 dump;
+
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ has_dumped = 1;
+
+ fill_dump(cprm->regs, &dump);
+
+ strncpy(dump.u_comm, current->comm, sizeof(current->comm));
+ dump.u_ar0 = offsetof(struct user32, regs);
+ dump.signal = cprm->siginfo->si_signo;
+
+ /*
+ * If the size of the dump file exceeds the rlimit, then see
+ * what would happen if we wrote the stack, but not the data
+ * area.
+ */
+ if ((dump.u_dsize + dump.u_ssize + 1) * PAGE_SIZE > cprm->limit)
+ dump.u_dsize = 0;
+
+ /* Make sure we have enough room to write the stack and data areas. */
+ if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit)
+ dump.u_ssize = 0;
+
+ /* make sure we actually have a data and stack area to dump */
+ set_fs(USER_DS);
+ if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump),
+ dump.u_dsize << PAGE_SHIFT))
+ dump.u_dsize = 0;
+ if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump),
+ dump.u_ssize << PAGE_SHIFT))
+ dump.u_ssize = 0;
+
+ set_fs(KERNEL_DS);
+ /* struct user */
+ if (!dump_emit(cprm, &dump, sizeof(dump)))
+ goto end_coredump;
+ /* Now dump all of the user data. Include malloced stuff as well */
+ if (!dump_skip(cprm, PAGE_SIZE - sizeof(dump)))
+ goto end_coredump;
+ /* now we start writing out the user space info */
+ set_fs(USER_DS);
+ /* Dump the data area */
+ if (dump.u_dsize != 0) {
+ dump_start = START_DATA(dump);
+ dump_size = dump.u_dsize << PAGE_SHIFT;
+ if (!dump_emit(cprm, (void *)dump_start, dump_size))
+ goto end_coredump;
+ }
+ /* Now prepare to dump the stack area */
+ if (dump.u_ssize != 0) {
+ dump_start = START_STACK(dump);
+ dump_size = dump.u_ssize << PAGE_SHIFT;
+ if (!dump_emit(cprm, (void *)dump_start, dump_size))
+ goto end_coredump;
+ }
+end_coredump:
+ set_fs(fs);
+ return has_dumped;
+}
+#endif
+
+/*
+ * create_aout_tables() parses the env- and arg-strings in new user
+ * memory and creates the pointer tables from them, and puts their
+ * addresses on the "stack", returning the new stack pointer value.
+ */
+static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm)
+{
+ u32 __user *argv, *envp, *sp;
+ int argc = bprm->argc, envc = bprm->envc;
+
+ sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p);
+ sp -= envc+1;
+ envp = sp;
+ sp -= argc+1;
+ argv = sp;
+ put_user((unsigned long) envp, --sp);
+ put_user((unsigned long) argv, --sp);
+ put_user(argc, --sp);
+ current->mm->arg_start = (unsigned long) p;
+ while (argc-- > 0) {
+ char c;
+
+ put_user((u32)(unsigned long)p, argv++);
+ do {
+ get_user(c, p++);
+ } while (c);
+ }
+ put_user(0, argv);
+ current->mm->arg_end = current->mm->env_start = (unsigned long) p;
+ while (envc-- > 0) {
+ char c;
+
+ put_user((u32)(unsigned long)p, envp++);
+ do {
+ get_user(c, p++);
+ } while (c);
+ }
+ put_user(0, envp);
+ current->mm->env_end = (unsigned long) p;
+ return sp;
+}
+
+/*
+ * These are the functions used to load a.out style executables and shared
+ * libraries. There is no binary dependent code anywhere else.
+ */
+static int load_aout_binary(struct linux_binprm *bprm)
+{
+ unsigned long error, fd_offset, rlim;
+ struct pt_regs *regs = current_pt_regs();
+ struct exec ex;
+ int retval;
+
+ ex = *((struct exec *) bprm->buf); /* exec-header */
+ if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
+ N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
+ N_TRSIZE(ex) || N_DRSIZE(ex) ||
+ i_size_read(file_inode(bprm->file)) <
+ ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
+ return -ENOEXEC;
+ }
+
+ fd_offset = N_TXTOFF(ex);
+
+ /* Check initial limits. This avoids letting people circumvent
+ * size limits imposed on them by creating programs with large
+ * arrays in the data or bss.
+ */
+ rlim = rlimit(RLIMIT_DATA);
+ if (rlim >= RLIM_INFINITY)
+ rlim = ~0;
+ if (ex.a_data + ex.a_bss > rlim)
+ return -ENOMEM;
+
+ /* Flush all traces of the currently running executable */
+ retval = flush_old_exec(bprm);
+ if (retval)
+ return retval;
+
+ /* OK, This is the point of no return */
+ set_personality(PER_LINUX);
+ set_personality_ia32(false);
+
+ setup_new_exec(bprm);
+
+ regs->cs = __USER32_CS;
+ regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
+ regs->r13 = regs->r14 = regs->r15 = 0;
+
+ current->mm->end_code = ex.a_text +
+ (current->mm->start_code = N_TXTADDR(ex));
+ current->mm->end_data = ex.a_data +
+ (current->mm->start_data = N_DATADDR(ex));
+ current->mm->brk = ex.a_bss +
+ (current->mm->start_brk = N_BSSADDR(ex));
+
+ retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
+ if (retval < 0)
+ return retval;
+
+ install_exec_creds(bprm);
+
+ if (N_MAGIC(ex) == OMAGIC) {
+ unsigned long text_addr, map_size;
+
+ text_addr = N_TXTADDR(ex);
+ map_size = ex.a_text+ex.a_data;
+
+ error = vm_brk(text_addr & PAGE_MASK, map_size);
+
+ if (error)
+ return error;
+
+ error = read_code(bprm->file, text_addr, 32,
+ ex.a_text + ex.a_data);
+ if ((signed long)error < 0)
+ return error;
+ } else {
+#ifdef WARN_OLD
+ static unsigned long error_time, error_time2;
+ if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
+ (N_MAGIC(ex) != NMAGIC) &&
+ time_after(jiffies, error_time2 + 5*HZ)) {
+ printk(KERN_NOTICE "executable not page aligned\n");
+ error_time2 = jiffies;
+ }
+
+ if ((fd_offset & ~PAGE_MASK) != 0 &&
+ time_after(jiffies, error_time + 5*HZ)) {
+ printk(KERN_WARNING
+ "fd_offset is not page aligned. Please convert "
+ "program: %pD\n",
+ bprm->file);
+ error_time = jiffies;
+ }
+#endif
+
+ if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) {
+ error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
+ if (error)
+ return error;
+
+ read_code(bprm->file, N_TXTADDR(ex), fd_offset,
+ ex.a_text+ex.a_data);
+ goto beyond_if;
+ }
+
+ error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
+ PROT_READ | PROT_EXEC,
+ MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE |
+ MAP_EXECUTABLE | MAP_32BIT,
+ fd_offset);
+
+ if (error != N_TXTADDR(ex))
+ return error;
+
+ error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE |
+ MAP_EXECUTABLE | MAP_32BIT,
+ fd_offset + ex.a_text);
+ if (error != N_DATADDR(ex))
+ return error;
+ }
+
+beyond_if:
+ error = set_brk(current->mm->start_brk, current->mm->brk);
+ if (error)
+ return error;
+
+ set_binfmt(&aout_format);
+
+ current->mm->start_stack =
+ (unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
+ /* start thread */
+ loadsegment(fs, 0);
+ loadsegment(ds, __USER32_DS);
+ loadsegment(es, __USER32_DS);
+ load_gs_index(0);
+ (regs)->ip = ex.a_entry;
+ (regs)->sp = current->mm->start_stack;
+ (regs)->flags = 0x200;
+ (regs)->cs = __USER32_CS;
+ (regs)->ss = __USER32_DS;
+ regs->r8 = regs->r9 = regs->r10 = regs->r11 =
+ regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
+ set_fs(USER_DS);
+ return 0;
+}
+
+static int load_aout_library(struct file *file)
+{
+ unsigned long bss, start_addr, len, error;
+ int retval;
+ struct exec ex;
+ loff_t pos = 0;
+
+ retval = -ENOEXEC;
+ error = kernel_read(file, &ex, sizeof(ex), &pos);
+ if (error != sizeof(ex))
+ goto out;
+
+ /* We come in here for the regular a.out style of shared libraries */
+ if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
+ N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
+ i_size_read(file_inode(file)) <
+ ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
+ goto out;
+ }
+
+ if (N_FLAGS(ex))
+ goto out;
+
+ /* For QMAGIC, the starting address is 0x20 into the page. We mask
+ this off to get the starting address for the page */
+
+ start_addr = ex.a_entry & 0xfffff000;
+
+ if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
+#ifdef WARN_OLD
+ static unsigned long error_time;
+ if (time_after(jiffies, error_time + 5*HZ)) {
+ printk(KERN_WARNING
+ "N_TXTOFF is not page aligned. Please convert "
+ "library: %pD\n",
+ file);
+ error_time = jiffies;
+ }
+#endif
+ retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
+ if (retval)
+ goto out;
+
+ read_code(file, start_addr, N_TXTOFF(ex),
+ ex.a_text + ex.a_data);
+ retval = 0;
+ goto out;
+ }
+ /* Now use mmap to map the library into memory. */
+ error = vm_mmap(file, start_addr, ex.a_text + ex.a_data,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_32BIT,
+ N_TXTOFF(ex));
+ retval = error;
+ if (error != start_addr)
+ goto out;
+
+ len = PAGE_ALIGN(ex.a_text + ex.a_data);
+ bss = ex.a_text + ex.a_data + ex.a_bss;
+ if (bss > len) {
+ retval = vm_brk(start_addr + len, bss - len);
+ if (retval)
+ goto out;
+ }
+ retval = 0;
+out:
+ return retval;
+}
+
+static int __init init_aout_binfmt(void)
+{
+ register_binfmt(&aout_format);
+ return 0;
+}
+
+static void __exit exit_aout_binfmt(void)
+{
+ unregister_binfmt(&aout_format);
+}
+
+module_init(init_aout_binfmt);
+module_exit(exit_aout_binfmt);
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
new file mode 100644
index 000000000..513ba49c2
--- /dev/null
+++ b/arch/x86/ia32/ia32_signal.c
@@ -0,0 +1,411 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/arch/x86_64/ia32/ia32_signal.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
+ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
+ * 2000-12-* x86-64 compatibility mode signal handling by Andi Kleen
+ */
+
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/wait.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/personality.h>
+#include <linux/compat.h>
+#include <linux/binfmts.h>
+#include <asm/ucontext.h>
+#include <linux/uaccess.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/ptrace.h>
+#include <asm/ia32_unistd.h>
+#include <asm/user32.h>
+#include <uapi/asm/sigcontext.h>
+#include <asm/proto.h>
+#include <asm/vdso.h>
+#include <asm/sigframe.h>
+#include <asm/sighandling.h>
+#include <asm/smap.h>
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+#define loadsegment_gs(v) load_gs_index(v)
+#define loadsegment_fs(v) loadsegment(fs, v)
+#define loadsegment_ds(v) loadsegment(ds, v)
+#define loadsegment_es(v) loadsegment(es, v)
+
+#define get_user_seg(seg) ({ unsigned int v; savesegment(seg, v); v; })
+#define set_user_seg(seg, v) loadsegment_##seg(v)
+
+#define COPY(x) { \
+ get_user_ex(regs->x, &sc->x); \
+}
+
+#define GET_SEG(seg) ({ \
+ unsigned short tmp; \
+ get_user_ex(tmp, &sc->seg); \
+ tmp; \
+})
+
+#define COPY_SEG_CPL3(seg) do { \
+ regs->seg = GET_SEG(seg) | 3; \
+} while (0)
+
+#define RELOAD_SEG(seg) { \
+ unsigned int pre = (seg) | 3; \
+ unsigned int cur = get_user_seg(seg); \
+ if (pre != cur) \
+ set_user_seg(seg, pre); \
+}
+
+static int ia32_restore_sigcontext(struct pt_regs *regs,
+ struct sigcontext_32 __user *sc)
+{
+ unsigned int tmpflags, err = 0;
+ u16 gs, fs, es, ds;
+ void __user *buf;
+ u32 tmp;
+
+ /* Always make any pending restarted system calls return -EINTR */
+ current->restart_block.fn = do_no_restart_syscall;
+
+ get_user_try {
+ gs = GET_SEG(gs);
+ fs = GET_SEG(fs);
+ ds = GET_SEG(ds);
+ es = GET_SEG(es);
+
+ COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
+ COPY(dx); COPY(cx); COPY(ip); COPY(ax);
+ /* Don't touch extended registers */
+
+ COPY_SEG_CPL3(cs);
+ COPY_SEG_CPL3(ss);
+
+ get_user_ex(tmpflags, &sc->flags);
+ regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+ /* disable syscall checks */
+ regs->orig_ax = -1;
+
+ get_user_ex(tmp, &sc->fpstate);
+ buf = compat_ptr(tmp);
+ } get_user_catch(err);
+
+ /*
+ * Reload fs and gs if they have changed in the signal
+ * handler. This does not handle long fs/gs base changes in
+ * the handler, but does not clobber them at least in the
+ * normal case.
+ */
+ RELOAD_SEG(gs);
+ RELOAD_SEG(fs);
+ RELOAD_SEG(ds);
+ RELOAD_SEG(es);
+
+ err |= fpu__restore_sig(buf, 1);
+
+ force_iret();
+
+ return err;
+}
+
+asmlinkage long sys32_sigreturn(void)
+{
+ struct pt_regs *regs = current_pt_regs();
+ struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
+ sigset_t set;
+
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+ goto badframe;
+ if (__get_user(set.sig[0], &frame->sc.oldmask)
+ || (_COMPAT_NSIG_WORDS > 1
+ && __copy_from_user((((char *) &set.sig) + 4),
+ &frame->extramask,
+ sizeof(frame->extramask))))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ if (ia32_restore_sigcontext(regs, &frame->sc))
+ goto badframe;
+ return regs->ax;
+
+badframe:
+ signal_fault(regs, frame, "32bit sigreturn");
+ return 0;
+}
+
+asmlinkage long sys32_rt_sigreturn(void)
+{
+ struct pt_regs *regs = current_pt_regs();
+ struct rt_sigframe_ia32 __user *frame;
+ sigset_t set;
+
+ frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4);
+
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+ goto badframe;
+ if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext))
+ goto badframe;
+
+ if (compat_restore_altstack(&frame->uc.uc_stack))
+ goto badframe;
+
+ return regs->ax;
+
+badframe:
+ signal_fault(regs, frame, "32bit rt sigreturn");
+ return 0;
+}
+
+/*
+ * Set up a signal frame.
+ */
+
+static int ia32_setup_sigcontext(struct sigcontext_32 __user *sc,
+ void __user *fpstate,
+ struct pt_regs *regs, unsigned int mask)
+{
+ int err = 0;
+
+ put_user_try {
+ put_user_ex(get_user_seg(gs), (unsigned int __user *)&sc->gs);
+ put_user_ex(get_user_seg(fs), (unsigned int __user *)&sc->fs);
+ put_user_ex(get_user_seg(ds), (unsigned int __user *)&sc->ds);
+ put_user_ex(get_user_seg(es), (unsigned int __user *)&sc->es);
+
+ put_user_ex(regs->di, &sc->di);
+ put_user_ex(regs->si, &sc->si);
+ put_user_ex(regs->bp, &sc->bp);
+ put_user_ex(regs->sp, &sc->sp);
+ put_user_ex(regs->bx, &sc->bx);
+ put_user_ex(regs->dx, &sc->dx);
+ put_user_ex(regs->cx, &sc->cx);
+ put_user_ex(regs->ax, &sc->ax);
+ put_user_ex(current->thread.trap_nr, &sc->trapno);
+ put_user_ex(current->thread.error_code, &sc->err);
+ put_user_ex(regs->ip, &sc->ip);
+ put_user_ex(regs->cs, (unsigned int __user *)&sc->cs);
+ put_user_ex(regs->flags, &sc->flags);
+ put_user_ex(regs->sp, &sc->sp_at_signal);
+ put_user_ex(regs->ss, (unsigned int __user *)&sc->ss);
+
+ put_user_ex(ptr_to_compat(fpstate), &sc->fpstate);
+
+ /* non-iBCS2 extensions.. */
+ put_user_ex(mask, &sc->oldmask);
+ put_user_ex(current->thread.cr2, &sc->cr2);
+ } put_user_catch(err);
+
+ return err;
+}
+
+/*
+ * Determine which stack to use..
+ */
+static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
+ size_t frame_size,
+ void __user **fpstate)
+{
+ struct fpu *fpu = &current->thread.fpu;
+ unsigned long sp;
+
+ /* Default to using normal stack */
+ sp = regs->sp;
+
+ /* This is the X/Open sanctioned signal stack switching. */
+ if (ksig->ka.sa.sa_flags & SA_ONSTACK)
+ sp = sigsp(sp, ksig);
+ /* This is the legacy signal stack switching. */
+ else if (regs->ss != __USER32_DS &&
+ !(ksig->ka.sa.sa_flags & SA_RESTORER) &&
+ ksig->ka.sa.sa_restorer)
+ sp = (unsigned long) ksig->ka.sa.sa_restorer;
+
+ if (fpu->initialized) {
+ unsigned long fx_aligned, math_size;
+
+ sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size);
+ *fpstate = (struct _fpstate_32 __user *) sp;
+ if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned,
+ math_size) < 0)
+ return (void __user *) -1L;
+ }
+
+ sp -= frame_size;
+ /* Align the stack pointer according to the i386 ABI,
+ * i.e. so that on function entry ((sp + 4) & 15) == 0. */
+ sp = ((sp + 4) & -16ul) - 4;
+ return (void __user *) sp;
+}
+
+int ia32_setup_frame(int sig, struct ksignal *ksig,
+ compat_sigset_t *set, struct pt_regs *regs)
+{
+ struct sigframe_ia32 __user *frame;
+ void __user *restorer;
+ int err = 0;
+ void __user *fpstate = NULL;
+
+ /* copy_to_user optimizes that into a single 8 byte store */
+ static const struct {
+ u16 poplmovl;
+ u32 val;
+ u16 int80;
+ } __attribute__((packed)) code = {
+ 0xb858, /* popl %eax ; movl $...,%eax */
+ __NR_ia32_sigreturn,
+ 0x80cd, /* int $0x80 */
+ };
+
+ frame = get_sigframe(ksig, regs, sizeof(*frame), &fpstate);
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ return -EFAULT;
+
+ if (__put_user(sig, &frame->sig))
+ return -EFAULT;
+
+ if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
+ return -EFAULT;
+
+ if (_COMPAT_NSIG_WORDS > 1) {
+ if (__copy_to_user(frame->extramask, &set->sig[1],
+ sizeof(frame->extramask)))
+ return -EFAULT;
+ }
+
+ if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+ restorer = ksig->ka.sa.sa_restorer;
+ } else {
+ /* Return stub is in 32bit vsyscall page */
+ if (current->mm->context.vdso)
+ restorer = current->mm->context.vdso +
+ vdso_image_32.sym___kernel_sigreturn;
+ else
+ restorer = &frame->retcode;
+ }
+
+ put_user_try {
+ put_user_ex(ptr_to_compat(restorer), &frame->pretcode);
+
+ /*
+ * These are actually not used anymore, but left because some
+ * gdb versions depend on them as a marker.
+ */
+ put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode);
+ } put_user_catch(err);
+
+ if (err)
+ return -EFAULT;
+
+ /* Set up registers for signal handler */
+ regs->sp = (unsigned long) frame;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
+
+ /* Make -mregparm=3 work */
+ regs->ax = sig;
+ regs->dx = 0;
+ regs->cx = 0;
+
+ loadsegment(ds, __USER32_DS);
+ loadsegment(es, __USER32_DS);
+
+ regs->cs = __USER32_CS;
+ regs->ss = __USER32_DS;
+
+ return 0;
+}
+
+int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
+ compat_sigset_t *set, struct pt_regs *regs)
+{
+ struct rt_sigframe_ia32 __user *frame;
+ void __user *restorer;
+ int err = 0;
+ void __user *fpstate = NULL;
+
+ /* __copy_to_user optimizes that into a single 8 byte store */
+ static const struct {
+ u8 movl;
+ u32 val;
+ u16 int80;
+ u8 pad;
+ } __attribute__((packed)) code = {
+ 0xb8,
+ __NR_ia32_rt_sigreturn,
+ 0x80cd,
+ 0,
+ };
+
+ frame = get_sigframe(ksig, regs, sizeof(*frame), &fpstate);
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ return -EFAULT;
+
+ put_user_try {
+ put_user_ex(sig, &frame->sig);
+ put_user_ex(ptr_to_compat(&frame->info), &frame->pinfo);
+ put_user_ex(ptr_to_compat(&frame->uc), &frame->puc);
+
+ /* Create the ucontext. */
+ if (boot_cpu_has(X86_FEATURE_XSAVE))
+ put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
+ else
+ put_user_ex(0, &frame->uc.uc_flags);
+ put_user_ex(0, &frame->uc.uc_link);
+ compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp);
+
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ restorer = ksig->ka.sa.sa_restorer;
+ else
+ restorer = current->mm->context.vdso +
+ vdso_image_32.sym___kernel_rt_sigreturn;
+ put_user_ex(ptr_to_compat(restorer), &frame->pretcode);
+
+ /*
+ * Not actually used anymore, but left because some gdb
+ * versions need it.
+ */
+ put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode);
+ } put_user_catch(err);
+
+ err |= __copy_siginfo_to_user32(&frame->info, &ksig->info, false);
+ err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
+ regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
+ if (err)
+ return -EFAULT;
+
+ /* Set up registers for signal handler */
+ regs->sp = (unsigned long) frame;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
+
+ /* Make -mregparm=3 work */
+ regs->ax = sig;
+ regs->dx = (unsigned long) &frame->info;
+ regs->cx = (unsigned long) &frame->uc;
+
+ loadsegment(ds, __USER32_DS);
+ loadsegment(es, __USER32_DS);
+
+ regs->cs = __USER32_CS;
+ regs->ss = __USER32_DS;
+
+ return 0;
+}
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
new file mode 100644
index 000000000..11ef7b7c9
--- /dev/null
+++ b/arch/x86/ia32/sys_ia32.c
@@ -0,0 +1,242 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on
+ * sys_sparc32
+ *
+ * Copyright (C) 2000 VA Linux Co
+ * Copyright (C) 2000 Don Dugger <n0ano@valinux.com>
+ * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com>
+ * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 2000 Hewlett-Packard Co.
+ * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port)
+ *
+ * These routines maintain argument size conversion between 32bit and 64bit
+ * environment. In 2.5 most of this should be moved to a generic directory.
+ *
+ * This file assumes that there is a hole at the end of user address space.
+ *
+ * Some of the functions are LE specific currently. These are
+ * hopefully all marked. This should be fixed.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/signal.h>
+#include <linux/syscalls.h>
+#include <linux/times.h>
+#include <linux/utsname.h>
+#include <linux/mm.h>
+#include <linux/uio.h>
+#include <linux/poll.h>
+#include <linux/personality.h>
+#include <linux/stat.h>
+#include <linux/rwsem.h>
+#include <linux/compat.h>
+#include <linux/vfs.h>
+#include <linux/ptrace.h>
+#include <linux/highuid.h>
+#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <linux/sched/task.h>
+#include <asm/mman.h>
+#include <asm/types.h>
+#include <linux/uaccess.h>
+#include <linux/atomic.h>
+#include <asm/vgtod.h>
+#include <asm/ia32.h>
+
+#define AA(__x) ((unsigned long)(__x))
+
+
+COMPAT_SYSCALL_DEFINE3(x86_truncate64, const char __user *, filename,
+ unsigned long, offset_low, unsigned long, offset_high)
+{
+ return ksys_truncate(filename,
+ ((loff_t) offset_high << 32) | offset_low);
+}
+
+COMPAT_SYSCALL_DEFINE3(x86_ftruncate64, unsigned int, fd,
+ unsigned long, offset_low, unsigned long, offset_high)
+{
+ return ksys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low);
+}
+
+/*
+ * Another set for IA32/LFS -- x86_64 struct stat is different due to
+ * support for 64bit inode numbers.
+ */
+static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
+{
+ typeof(ubuf->st_uid) uid = 0;
+ typeof(ubuf->st_gid) gid = 0;
+ SET_UID(uid, from_kuid_munged(current_user_ns(), stat->uid));
+ SET_GID(gid, from_kgid_munged(current_user_ns(), stat->gid));
+ if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) ||
+ __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) ||
+ __put_user(stat->ino, &ubuf->__st_ino) ||
+ __put_user(stat->ino, &ubuf->st_ino) ||
+ __put_user(stat->mode, &ubuf->st_mode) ||
+ __put_user(stat->nlink, &ubuf->st_nlink) ||
+ __put_user(uid, &ubuf->st_uid) ||
+ __put_user(gid, &ubuf->st_gid) ||
+ __put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev) ||
+ __put_user(stat->size, &ubuf->st_size) ||
+ __put_user(stat->atime.tv_sec, &ubuf->st_atime) ||
+ __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec) ||
+ __put_user(stat->mtime.tv_sec, &ubuf->st_mtime) ||
+ __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
+ __put_user(stat->ctime.tv_sec, &ubuf->st_ctime) ||
+ __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
+ __put_user(stat->blksize, &ubuf->st_blksize) ||
+ __put_user(stat->blocks, &ubuf->st_blocks))
+ return -EFAULT;
+ return 0;
+}
+
+COMPAT_SYSCALL_DEFINE2(x86_stat64, const char __user *, filename,
+ struct stat64 __user *, statbuf)
+{
+ struct kstat stat;
+ int ret = vfs_stat(filename, &stat);
+
+ if (!ret)
+ ret = cp_stat64(statbuf, &stat);
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE2(x86_lstat64, const char __user *, filename,
+ struct stat64 __user *, statbuf)
+{
+ struct kstat stat;
+ int ret = vfs_lstat(filename, &stat);
+ if (!ret)
+ ret = cp_stat64(statbuf, &stat);
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE2(x86_fstat64, unsigned int, fd,
+ struct stat64 __user *, statbuf)
+{
+ struct kstat stat;
+ int ret = vfs_fstat(fd, &stat);
+ if (!ret)
+ ret = cp_stat64(statbuf, &stat);
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE4(x86_fstatat, unsigned int, dfd,
+ const char __user *, filename,
+ struct stat64 __user *, statbuf, int, flag)
+{
+ struct kstat stat;
+ int error;
+
+ error = vfs_fstatat(dfd, filename, &stat, flag);
+ if (error)
+ return error;
+ return cp_stat64(statbuf, &stat);
+}
+
+/*
+ * Linux/i386 didn't use to be able to handle more than
+ * 4 system call parameters, so these system calls used a memory
+ * block for parameter passing..
+ */
+
+struct mmap_arg_struct32 {
+ unsigned int addr;
+ unsigned int len;
+ unsigned int prot;
+ unsigned int flags;
+ unsigned int fd;
+ unsigned int offset;
+};
+
+COMPAT_SYSCALL_DEFINE1(x86_mmap, struct mmap_arg_struct32 __user *, arg)
+{
+ struct mmap_arg_struct32 a;
+
+ if (copy_from_user(&a, arg, sizeof(a)))
+ return -EFAULT;
+
+ if (a.offset & ~PAGE_MASK)
+ return -EINVAL;
+
+ return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
+ a.offset>>PAGE_SHIFT);
+}
+
+/* warning: next two assume little endian */
+COMPAT_SYSCALL_DEFINE5(x86_pread, unsigned int, fd, char __user *, ubuf,
+ u32, count, u32, poslo, u32, poshi)
+{
+ return ksys_pread64(fd, ubuf, count,
+ ((loff_t)AA(poshi) << 32) | AA(poslo));
+}
+
+COMPAT_SYSCALL_DEFINE5(x86_pwrite, unsigned int, fd, const char __user *, ubuf,
+ u32, count, u32, poslo, u32, poshi)
+{
+ return ksys_pwrite64(fd, ubuf, count,
+ ((loff_t)AA(poshi) << 32) | AA(poslo));
+}
+
+
+/*
+ * Some system calls that need sign extended arguments. This could be
+ * done by a generic wrapper.
+ */
+COMPAT_SYSCALL_DEFINE6(x86_fadvise64_64, int, fd, __u32, offset_low,
+ __u32, offset_high, __u32, len_low, __u32, len_high,
+ int, advice)
+{
+ return ksys_fadvise64_64(fd,
+ (((u64)offset_high)<<32) | offset_low,
+ (((u64)len_high)<<32) | len_low,
+ advice);
+}
+
+COMPAT_SYSCALL_DEFINE4(x86_readahead, int, fd, unsigned int, off_lo,
+ unsigned int, off_hi, size_t, count)
+{
+ return ksys_readahead(fd, ((u64)off_hi << 32) | off_lo, count);
+}
+
+COMPAT_SYSCALL_DEFINE6(x86_sync_file_range, int, fd, unsigned int, off_low,
+ unsigned int, off_hi, unsigned int, n_low,
+ unsigned int, n_hi, int, flags)
+{
+ return ksys_sync_file_range(fd,
+ ((u64)off_hi << 32) | off_low,
+ ((u64)n_hi << 32) | n_low, flags);
+}
+
+COMPAT_SYSCALL_DEFINE5(x86_fadvise64, int, fd, unsigned int, offset_lo,
+ unsigned int, offset_hi, size_t, len, int, advice)
+{
+ return ksys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo,
+ len, advice);
+}
+
+COMPAT_SYSCALL_DEFINE6(x86_fallocate, int, fd, int, mode,
+ unsigned int, offset_lo, unsigned int, offset_hi,
+ unsigned int, len_lo, unsigned int, len_hi)
+{
+ return ksys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo,
+ ((u64)len_hi << 32) | len_lo);
+}
+
+/*
+ * The 32-bit clone ABI is CONFIG_CLONE_BACKWARDS
+ */
+COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags,
+ unsigned long, newsp, int __user *, parent_tidptr,
+ unsigned long, tls_val, int __user *, child_tidptr)
+{
+ return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr,
+ tls_val);
+}
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
new file mode 100644
index 000000000..a0ab9ab61
--- /dev/null
+++ b/arch/x86/include/asm/Kbuild
@@ -0,0 +1,13 @@
+
+
+generated-y += syscalls_32.h
+generated-y += syscalls_64.h
+generated-y += unistd_32_ia32.h
+generated-y += unistd_64_x32.h
+generated-y += xen-hypercalls.h
+
+generic-y += dma-contiguous.h
+generic-y += early_ioremap.h
+generic-y += export.h
+generic-y += mcs_spinlock.h
+generic-y += mm-arch-hooks.h
diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
new file mode 100644
index 000000000..7d3ece8bf
--- /dev/null
+++ b/arch/x86/include/asm/a.out-core.h
@@ -0,0 +1,67 @@
+/* a.out coredump register dumper
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _ASM_X86_A_OUT_CORE_H
+#define _ASM_X86_A_OUT_CORE_H
+
+#ifdef __KERNEL__
+#ifdef CONFIG_X86_32
+
+#include <linux/user.h>
+#include <linux/elfcore.h>
+#include <linux/mm_types.h>
+
+#include <asm/debugreg.h>
+
+/*
+ * fill in the user structure for an a.out core dump
+ */
+static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
+{
+/* changed the size calculations - should hopefully work better. lbt */
+ dump->magic = CMAGIC;
+ dump->start_code = 0;
+ dump->start_stack = regs->sp & ~(PAGE_SIZE - 1);
+ dump->u_tsize = ((unsigned long)current->mm->end_code) >> PAGE_SHIFT;
+ dump->u_dsize = ((unsigned long)(current->mm->brk + (PAGE_SIZE - 1)))
+ >> PAGE_SHIFT;
+ dump->u_dsize -= dump->u_tsize;
+ dump->u_ssize = 0;
+ aout_dump_debugregs(dump);
+
+ if (dump->start_stack < TASK_SIZE)
+ dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack))
+ >> PAGE_SHIFT;
+
+ dump->regs.bx = regs->bx;
+ dump->regs.cx = regs->cx;
+ dump->regs.dx = regs->dx;
+ dump->regs.si = regs->si;
+ dump->regs.di = regs->di;
+ dump->regs.bp = regs->bp;
+ dump->regs.ax = regs->ax;
+ dump->regs.ds = (u16)regs->ds;
+ dump->regs.es = (u16)regs->es;
+ dump->regs.fs = (u16)regs->fs;
+ dump->regs.gs = get_user_gs(regs);
+ dump->regs.orig_ax = regs->orig_ax;
+ dump->regs.ip = regs->ip;
+ dump->regs.cs = (u16)regs->cs;
+ dump->regs.flags = regs->flags;
+ dump->regs.sp = regs->sp;
+ dump->regs.ss = (u16)regs->ss;
+
+ dump->u_fpvalid = dump_fpu(regs, &dump->i387);
+}
+
+#endif /* CONFIG_X86_32 */
+#endif /* __KERNEL__ */
+#endif /* _ASM_X86_A_OUT_CORE_H */
diff --git a/arch/x86/include/asm/acenv.h b/arch/x86/include/asm/acenv.h
new file mode 100644
index 000000000..6de59a4f7
--- /dev/null
+++ b/arch/x86/include/asm/acenv.h
@@ -0,0 +1,57 @@
+/*
+ * X86 specific ACPICA environments and implementation
+ *
+ * Copyright (C) 2014, Intel Corporation
+ * Author: Lv Zheng <lv.zheng@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_X86_ACENV_H
+#define _ASM_X86_ACENV_H
+
+#include <asm/special_insns.h>
+
+/* Asm macros */
+
+/*
+ * ACPI_FLUSH_CPU_CACHE() flushes caches on entering sleep states.
+ * It is required to prevent data loss.
+ *
+ * While running inside virtual machine, the kernel can bypass cache flushing.
+ * Changing sleep state in a virtual machine doesn't affect the host system
+ * sleep state and cannot lead to data loss.
+ */
+#define ACPI_FLUSH_CPU_CACHE() \
+do { \
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) \
+ wbinvd(); \
+} while (0)
+
+int __acpi_acquire_global_lock(unsigned int *lock);
+int __acpi_release_global_lock(unsigned int *lock);
+
+#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \
+ ((Acq) = __acpi_acquire_global_lock(&facs->global_lock))
+
+#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \
+ ((Acq) = __acpi_release_global_lock(&facs->global_lock))
+
+/*
+ * Math helper asm macros
+ */
+#define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \
+ asm("divl %2;" \
+ : "=a"(q32), "=d"(r32) \
+ : "r"(d32), \
+ "0"(n_lo), "1"(n_hi))
+
+#define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \
+ asm("shrl $1,%2 ;" \
+ "rcrl $1,%3;" \
+ : "=r"(n_hi), "=r"(n_lo) \
+ : "0"(n_hi), "1"(n_lo))
+
+#endif /* _ASM_X86_ACENV_H */
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
new file mode 100644
index 000000000..a303d7b7d
--- /dev/null
+++ b/arch/x86/include/asm/acpi.h
@@ -0,0 +1,188 @@
+#ifndef _ASM_X86_ACPI_H
+#define _ASM_X86_ACPI_H
+
+/*
+ * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#include <acpi/pdc_intel.h>
+
+#include <asm/numa.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
+#include <asm/mmu.h>
+#include <asm/mpspec.h>
+#include <asm/realmode.h>
+#include <asm/x86_init.h>
+
+#ifdef CONFIG_ACPI_APEI
+# include <asm/pgtable_types.h>
+#endif
+
+#ifdef CONFIG_ACPI
+extern int acpi_lapic;
+extern int acpi_ioapic;
+extern int acpi_noirq;
+extern int acpi_strict;
+extern int acpi_disabled;
+extern int acpi_pci_disabled;
+extern int acpi_skip_timer_override;
+extern int acpi_use_timer_override;
+extern int acpi_fix_pin2_polarity;
+extern int acpi_disable_cmcff;
+
+extern u8 acpi_sci_flags;
+extern u32 acpi_sci_override_gsi;
+void acpi_pic_sci_set_trigger(unsigned int, u16);
+
+struct device;
+
+extern int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
+ int trigger, int polarity);
+extern void (*__acpi_unregister_gsi)(u32 gsi);
+
+static inline void disable_acpi(void)
+{
+ acpi_disabled = 1;
+ acpi_pci_disabled = 1;
+ acpi_noirq = 1;
+}
+
+extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq);
+
+static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
+static inline void acpi_disable_pci(void)
+{
+ acpi_pci_disabled = 1;
+ acpi_noirq_set();
+}
+
+/* Low-level suspend routine. */
+extern int (*acpi_suspend_lowlevel)(void);
+
+/* Physical address to resume after wakeup */
+#define acpi_wakeup_address ((unsigned long)(real_mode_header->wakeup_start))
+
+/*
+ * Check if the CPU can handle C2 and deeper
+ */
+static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
+{
+ /*
+ * Early models (<=5) of AMD Opterons are not supposed to go into
+ * C2 state.
+ *
+ * Steppings 0x0A and later are good
+ */
+ if (boot_cpu_data.x86 == 0x0F &&
+ boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+ boot_cpu_data.x86_model <= 0x05 &&
+ boot_cpu_data.x86_stepping < 0x0A)
+ return 1;
+ else if (boot_cpu_has(X86_BUG_AMD_APIC_C1E))
+ return 1;
+ else
+ return max_cstate;
+}
+
+static inline bool arch_has_acpi_pdc(void)
+{
+ struct cpuinfo_x86 *c = &cpu_data(0);
+ return (c->x86_vendor == X86_VENDOR_INTEL ||
+ c->x86_vendor == X86_VENDOR_CENTAUR);
+}
+
+static inline void arch_acpi_set_pdc_bits(u32 *buf)
+{
+ struct cpuinfo_x86 *c = &cpu_data(0);
+
+ buf[2] |= ACPI_PDC_C_CAPABILITY_SMP;
+
+ if (cpu_has(c, X86_FEATURE_EST))
+ buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP;
+
+ if (cpu_has(c, X86_FEATURE_ACPI))
+ buf[2] |= ACPI_PDC_T_FFH;
+
+ /*
+ * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
+ */
+ if (!cpu_has(c, X86_FEATURE_MWAIT))
+ buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
+}
+
+static inline bool acpi_has_cpu_in_madt(void)
+{
+ return !!acpi_lapic;
+}
+
+#define ACPI_HAVE_ARCH_GET_ROOT_POINTER
+static inline u64 acpi_arch_get_root_pointer(void)
+{
+ return x86_init.acpi.get_root_pointer();
+}
+
+void acpi_generic_reduced_hw_init(void);
+
+#else /* !CONFIG_ACPI */
+
+#define acpi_lapic 0
+#define acpi_ioapic 0
+#define acpi_disable_cmcff 0
+static inline void acpi_noirq_set(void) { }
+static inline void acpi_disable_pci(void) { }
+static inline void disable_acpi(void) { }
+
+static inline void acpi_generic_reduced_hw_init(void) { }
+
+#endif /* !CONFIG_ACPI */
+
+#define ARCH_HAS_POWER_INIT 1
+
+#ifdef CONFIG_ACPI_NUMA
+extern int x86_acpi_numa_init(void);
+#endif /* CONFIG_ACPI_NUMA */
+
+#define acpi_unlazy_tlb(x) leave_mm(x)
+
+#ifdef CONFIG_ACPI_APEI
+static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
+{
+ /*
+ * We currently have no way to look up the EFI memory map
+ * attributes for a region in a consistent way, because the
+ * memmap is discarded after efi_free_boot_services(). So if
+ * you call efi_mem_attributes() during boot and at runtime,
+ * you could theoretically see different attributes.
+ *
+ * We are yet to see any x86 platforms that require anything
+ * other than PAGE_KERNEL (some ARM64 platforms require the
+ * equivalent of PAGE_KERNEL_NOCACHE). Additionally, if SME
+ * is active, the ACPI information will not be encrypted,
+ * so return PAGE_KERNEL_NOENC until we know differently.
+ */
+ return PAGE_KERNEL_NOENC;
+}
+#endif
+
+#define ACPI_TABLE_UPGRADE_MAX_PHYS (max_low_pfn_mapped << PAGE_SHIFT)
+
+#endif /* _ASM_X86_ACPI_H */
diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h
new file mode 100644
index 000000000..8e25bf4f3
--- /dev/null
+++ b/arch/x86/include/asm/agp.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_AGP_H
+#define _ASM_X86_AGP_H
+
+#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
+
+/*
+ * Functions to keep the agpgart mappings coherent with the MMU. The
+ * GART gives the CPU a physical alias of pages in memory. The alias
+ * region is mapped uncacheable. Make sure there are no conflicting
+ * mappings with different cachability attributes for the same
+ * page. This avoids data corruption on some CPUs.
+ */
+
+#define map_page_into_agp(page) set_pages_uc(page, 1)
+#define unmap_page_from_agp(page) set_pages_wb(page, 1)
+
+/*
+ * Could use CLFLUSH here if the cpu supports it. But then it would
+ * need to be called for each cacheline of the whole page so it may
+ * not be worth it. Would need a page for it.
+ */
+#define flush_agp_cache() wbinvd()
+
+/* GATT allocation. Returns/accepts GATT kernel virtual address. */
+#define alloc_gatt_pages(order) \
+ ((char *)__get_free_pages(GFP_KERNEL, (order)))
+#define free_gatt_pages(table, order) \
+ free_pages((unsigned long)(table), (order))
+
+#endif /* _ASM_X86_AGP_H */
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
new file mode 100644
index 000000000..31b627b43
--- /dev/null
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_ALTERNATIVE_ASM_H
+#define _ASM_X86_ALTERNATIVE_ASM_H
+
+#ifdef __ASSEMBLY__
+
+#include <asm/asm.h>
+
+#ifdef CONFIG_SMP
+ .macro LOCK_PREFIX
+672: lock
+ .pushsection .smp_locks,"a"
+ .balign 4
+ .long 672b - .
+ .popsection
+ .endm
+#else
+ .macro LOCK_PREFIX
+ .endm
+#endif
+
+/*
+ * Issue one struct alt_instr descriptor entry (need to put it into
+ * the section .altinstructions, see below). This entry contains
+ * enough information for the alternatives patching code to patch an
+ * instruction. See apply_alternatives().
+ */
+.macro altinstruction_entry orig alt feature orig_len alt_len pad_len
+ .long \orig - .
+ .long \alt - .
+ .word \feature
+ .byte \orig_len
+ .byte \alt_len
+ .byte \pad_len
+.endm
+
+/*
+ * Define an alternative between two instructions. If @feature is
+ * present, early code in apply_alternatives() replaces @oldinstr with
+ * @newinstr. ".skip" directive takes care of proper instruction padding
+ * in case @newinstr is longer than @oldinstr.
+ */
+.macro ALTERNATIVE oldinstr, newinstr, feature
+140:
+ \oldinstr
+141:
+ .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
+142:
+
+ .pushsection .altinstructions,"a"
+ altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
+ .popsection
+
+ .pushsection .altinstr_replacement,"ax"
+143:
+ \newinstr
+144:
+ .popsection
+.endm
+
+#define old_len 141b-140b
+#define new_len1 144f-143f
+#define new_len2 145f-144f
+
+/*
+ * gas compatible max based on the idea from:
+ * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
+ *
+ * The additional "-" is needed because gas uses a "true" value of -1.
+ */
+#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
+
+
+/*
+ * Same as ALTERNATIVE macro above but for two alternatives. If CPU
+ * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has
+ * @feature2, it replaces @oldinstr with @feature2.
+ */
+.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
+140:
+ \oldinstr
+141:
+ .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \
+ (alt_max_short(new_len1, new_len2) - (old_len)),0x90
+142:
+
+ .pushsection .altinstructions,"a"
+ altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b
+ altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b
+ .popsection
+
+ .pushsection .altinstr_replacement,"ax"
+143:
+ \newinstr1
+144:
+ \newinstr2
+145:
+ .popsection
+.endm
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_ALTERNATIVE_ASM_H */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
new file mode 100644
index 000000000..4cd6a3b71
--- /dev/null
+++ b/arch/x86/include/asm/alternative.h
@@ -0,0 +1,241 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_ALTERNATIVE_H
+#define _ASM_X86_ALTERNATIVE_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/stringify.h>
+#include <asm/asm.h>
+
+/*
+ * Alternative inline assembly for SMP.
+ *
+ * The LOCK_PREFIX macro defined here replaces the LOCK and
+ * LOCK_PREFIX macros used everywhere in the source tree.
+ *
+ * SMP alternatives use the same data structures as the other
+ * alternatives and the X86_FEATURE_UP flag to indicate the case of a
+ * UP system running a SMP kernel. The existing apply_alternatives()
+ * works fine for patching a SMP kernel for UP.
+ *
+ * The SMP alternative tables can be kept after boot and contain both
+ * UP and SMP versions of the instructions to allow switching back to
+ * SMP at runtime, when hotplugging in a new CPU, which is especially
+ * useful in virtualized environments.
+ *
+ * The very common lock prefix is handled as special case in a
+ * separate table which is a pure address list without replacement ptr
+ * and size information. That keeps the table sizes small.
+ */
+
+#ifdef CONFIG_SMP
+#define LOCK_PREFIX_HERE \
+ ".pushsection .smp_locks,\"a\"\n" \
+ ".balign 4\n" \
+ ".long 671f - .\n" /* offset */ \
+ ".popsection\n" \
+ "671:"
+
+#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; "
+
+#else /* ! CONFIG_SMP */
+#define LOCK_PREFIX_HERE ""
+#define LOCK_PREFIX ""
+#endif
+
+struct alt_instr {
+ s32 instr_offset; /* original instruction */
+ s32 repl_offset; /* offset to replacement instruction */
+ u16 cpuid; /* cpuid bit set for replacement */
+ u8 instrlen; /* length of original instruction */
+ u8 replacementlen; /* length of new instruction */
+ u8 padlen; /* length of build-time padding */
+} __packed;
+
+/*
+ * Debug flag that can be tested to see whether alternative
+ * instructions were patched in already:
+ */
+extern int alternatives_patched;
+
+extern void alternative_instructions(void);
+extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
+
+struct module;
+
+#ifdef CONFIG_SMP
+extern void alternatives_smp_module_add(struct module *mod, char *name,
+ void *locks, void *locks_end,
+ void *text, void *text_end);
+extern void alternatives_smp_module_del(struct module *mod);
+extern void alternatives_enable_smp(void);
+extern int alternatives_text_reserved(void *start, void *end);
+extern bool skip_smp_alternatives;
+#else
+static inline void alternatives_smp_module_add(struct module *mod, char *name,
+ void *locks, void *locks_end,
+ void *text, void *text_end) {}
+static inline void alternatives_smp_module_del(struct module *mod) {}
+static inline void alternatives_enable_smp(void) {}
+static inline int alternatives_text_reserved(void *start, void *end)
+{
+ return 0;
+}
+#endif /* CONFIG_SMP */
+
+#define b_replacement(num) "664"#num
+#define e_replacement(num) "665"#num
+
+#define alt_end_marker "663"
+#define alt_slen "662b-661b"
+#define alt_pad_len alt_end_marker"b-662b"
+#define alt_total_slen alt_end_marker"b-661b"
+#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f"
+
+#define __OLDINSTR(oldinstr, num) \
+ "661:\n\t" oldinstr "\n662:\n" \
+ ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \
+ "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n"
+
+#define OLDINSTR(oldinstr, num) \
+ __OLDINSTR(oldinstr, num) \
+ alt_end_marker ":\n"
+
+/*
+ * gas compatible max based on the idea from:
+ * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
+ *
+ * The additional "-" is needed because gas uses a "true" value of -1.
+ */
+#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))"
+
+/*
+ * Pad the second replacement alternative with additional NOPs if it is
+ * additionally longer than the first replacement alternative.
+ */
+#define OLDINSTR_2(oldinstr, num1, num2) \
+ "661:\n\t" oldinstr "\n662:\n" \
+ ".skip -((" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")) > 0) * " \
+ "(" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")), 0x90\n" \
+ alt_end_marker ":\n"
+
+#define ALTINSTR_ENTRY(feature, num) \
+ " .long 661b - .\n" /* label */ \
+ " .long " b_replacement(num)"f - .\n" /* new instruction */ \
+ " .word " __stringify(feature) "\n" /* feature bit */ \
+ " .byte " alt_total_slen "\n" /* source len */ \
+ " .byte " alt_rlen(num) "\n" /* replacement len */ \
+ " .byte " alt_pad_len "\n" /* pad len */
+
+#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \
+ b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t"
+
+/* alternative assembly primitive: */
+#define ALTERNATIVE(oldinstr, newinstr, feature) \
+ OLDINSTR(oldinstr, 1) \
+ ".pushsection .altinstructions,\"a\"\n" \
+ ALTINSTR_ENTRY(feature, 1) \
+ ".popsection\n" \
+ ".pushsection .altinstr_replacement, \"ax\"\n" \
+ ALTINSTR_REPLACEMENT(newinstr, feature, 1) \
+ ".popsection\n"
+
+#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
+ OLDINSTR_2(oldinstr, 1, 2) \
+ ".pushsection .altinstructions,\"a\"\n" \
+ ALTINSTR_ENTRY(feature1, 1) \
+ ALTINSTR_ENTRY(feature2, 2) \
+ ".popsection\n" \
+ ".pushsection .altinstr_replacement, \"ax\"\n" \
+ ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \
+ ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \
+ ".popsection\n"
+
+/*
+ * Alternative instructions for different CPU types or capabilities.
+ *
+ * This allows to use optimized instructions even on generic binary
+ * kernels.
+ *
+ * length of oldinstr must be longer or equal the length of newinstr
+ * It can be padded with nops as needed.
+ *
+ * For non barrier like inlines please define new variants
+ * without volatile and memory clobber.
+ */
+#define alternative(oldinstr, newinstr, feature) \
+ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
+
+#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
+ asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
+
+/*
+ * Alternative inline assembly with input.
+ *
+ * Pecularities:
+ * No memory clobber here.
+ * Argument numbers start with 1.
+ * Best is to use constraints that are fixed size (like (%1) ... "r")
+ * If you use variable sized constraints like "m" or "g" in the
+ * replacement make sure to pad to the worst case length.
+ * Leaving an unused argument 0 to keep API compatibility.
+ */
+#define alternative_input(oldinstr, newinstr, feature, input...) \
+ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
+ : : "i" (0), ## input)
+
+/*
+ * This is similar to alternative_input. But it has two features and
+ * respective instructions.
+ *
+ * If CPU has feature2, newinstr2 is used.
+ * Otherwise, if CPU has feature1, newinstr1 is used.
+ * Otherwise, oldinstr is used.
+ */
+#define alternative_input_2(oldinstr, newinstr1, feature1, newinstr2, \
+ feature2, input...) \
+ asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, \
+ newinstr2, feature2) \
+ : : "i" (0), ## input)
+
+/* Like alternative_input, but with a single output argument */
+#define alternative_io(oldinstr, newinstr, feature, output, input...) \
+ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
+ : output : "i" (0), ## input)
+
+/* Like alternative_io, but for replacing a direct call with another one. */
+#define alternative_call(oldfunc, newfunc, feature, output, input...) \
+ asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \
+ : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input)
+
+/*
+ * Like alternative_call, but there are two features and respective functions.
+ * If CPU has feature2, function2 is used.
+ * Otherwise, if CPU has feature1, function1 is used.
+ * Otherwise, old function is used.
+ */
+#define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \
+ output, input...) \
+ asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\
+ "call %P[new2]", feature2) \
+ : output, ASM_CALL_CONSTRAINT \
+ : [old] "i" (oldfunc), [new1] "i" (newfunc1), \
+ [new2] "i" (newfunc2), ## input)
+
+/*
+ * use this macro(s) if you need more than one output parameter
+ * in alternative_io
+ */
+#define ASM_OUTPUT2(a...) a
+
+/*
+ * use this macro if you need clobbers but no inputs in
+ * alternative_{input,io,call}()
+ */
+#define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
new file mode 100644
index 000000000..fddb6d262
--- /dev/null
+++ b/arch/x86/include/asm/amd_nb.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_AMD_NB_H
+#define _ASM_X86_AMD_NB_H
+
+#include <linux/ioport.h>
+#include <linux/pci.h>
+#include <linux/refcount.h>
+
+struct amd_nb_bus_dev_range {
+ u8 bus;
+ u8 dev_base;
+ u8 dev_limit;
+};
+
+extern const struct pci_device_id amd_nb_misc_ids[];
+extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
+
+extern bool early_is_amd_nb(u32 value);
+extern struct resource *amd_get_mmconfig_range(struct resource *res);
+extern int amd_cache_northbridges(void);
+extern void amd_flush_garts(void);
+extern int amd_numa_init(void);
+extern int amd_get_subcaches(int);
+extern int amd_set_subcaches(int, unsigned long);
+
+extern int amd_smn_read(u16 node, u32 address, u32 *value);
+extern int amd_smn_write(u16 node, u32 address, u32 value);
+extern int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo);
+
+struct amd_l3_cache {
+ unsigned indices;
+ u8 subcaches[4];
+};
+
+struct threshold_block {
+ unsigned int block; /* Number within bank */
+ unsigned int bank; /* MCA bank the block belongs to */
+ unsigned int cpu; /* CPU which controls MCA bank */
+ u32 address; /* MSR address for the block */
+ u16 interrupt_enable; /* Enable/Disable APIC interrupt */
+ bool interrupt_capable; /* Bank can generate an interrupt. */
+
+ u16 threshold_limit; /*
+ * Value upon which threshold
+ * interrupt is generated.
+ */
+
+ struct kobject kobj; /* sysfs object */
+ struct list_head miscj; /*
+ * List of threshold blocks
+ * within a bank.
+ */
+};
+
+struct threshold_bank {
+ struct kobject *kobj;
+ struct threshold_block *blocks;
+
+ /* initialized to the number of CPUs on the node sharing this bank */
+ refcount_t cpus;
+};
+
+struct amd_northbridge {
+ struct pci_dev *root;
+ struct pci_dev *misc;
+ struct pci_dev *link;
+ struct amd_l3_cache l3_cache;
+ struct threshold_bank *bank4;
+};
+
+struct amd_northbridge_info {
+ u16 num;
+ u64 flags;
+ struct amd_northbridge *nb;
+};
+
+#define AMD_NB_GART BIT(0)
+#define AMD_NB_L3_INDEX_DISABLE BIT(1)
+#define AMD_NB_L3_PARTITIONING BIT(2)
+
+#ifdef CONFIG_AMD_NB
+
+u16 amd_nb_num(void);
+bool amd_nb_has_feature(unsigned int feature);
+struct amd_northbridge *node_to_amd_nb(int node);
+
+static inline u16 amd_pci_dev_to_node_id(struct pci_dev *pdev)
+{
+ struct pci_dev *misc;
+ int i;
+
+ for (i = 0; i != amd_nb_num(); i++) {
+ misc = node_to_amd_nb(i)->misc;
+
+ if (pci_domain_nr(misc->bus) == pci_domain_nr(pdev->bus) &&
+ PCI_SLOT(misc->devfn) == PCI_SLOT(pdev->devfn))
+ return i;
+ }
+
+ WARN(1, "Unable to find AMD Northbridge id for %s\n", pci_name(pdev));
+ return 0;
+}
+
+static inline bool amd_gart_present(void)
+{
+ /* GART present only on Fam15h, upto model 0fh */
+ if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
+ (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10))
+ return true;
+
+ return false;
+}
+
+#else
+
+#define amd_nb_num(x) 0
+#define amd_nb_has_feature(x) false
+#define node_to_amd_nb(x) NULL
+#define amd_gart_present(x) false
+
+#endif
+
+
+#endif /* _ASM_X86_AMD_NB_H */
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
new file mode 100644
index 000000000..0acbac299
--- /dev/null
+++ b/arch/x86/include/asm/apb_timer.h
@@ -0,0 +1,49 @@
+/*
+ * apb_timer.h: Driver for Langwell APB timer based on Synopsis DesignWare
+ *
+ * (C) Copyright 2009 Intel Corporation
+ * Author: Jacob Pan (jacob.jun.pan@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Note:
+ */
+
+#ifndef ASM_X86_APBT_H
+#define ASM_X86_APBT_H
+#include <linux/sfi.h>
+
+#ifdef CONFIG_APB_TIMER
+
+/* default memory mapped register base */
+#define LNW_SCU_ADDR 0xFF100000
+#define LNW_EXT_TIMER_OFFSET 0x1B800
+#define APBT_DEFAULT_BASE (LNW_SCU_ADDR+LNW_EXT_TIMER_OFFSET)
+#define LNW_EXT_TIMER_PGOFFSET 0x800
+
+/* APBT clock speed range from PCLK to fabric base, 25-100MHz */
+#define APBT_MAX_FREQ 50000000
+#define APBT_MIN_FREQ 1000000
+#define APBT_MMAP_SIZE 1024
+
+#define APBT_DEV_USED 1
+
+extern void apbt_time_init(void);
+extern unsigned long apbt_quick_calibrate(void);
+extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
+extern void apbt_setup_secondary_clock(void);
+
+extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint);
+extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr);
+extern int sfi_mtimer_num;
+
+#else /* CONFIG_APB_TIMER */
+
+static inline unsigned long apbt_quick_calibrate(void) {return 0; }
+static inline void apbt_time_init(void) { }
+
+#endif
+#endif /* ASM_X86_APBT_H */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
new file mode 100644
index 000000000..163c2af44
--- /dev/null
+++ b/arch/x86/include/asm/apic.h
@@ -0,0 +1,547 @@
+#ifndef _ASM_X86_APIC_H
+#define _ASM_X86_APIC_H
+
+#include <linux/cpumask.h>
+
+#include <asm/alternative.h>
+#include <asm/cpufeature.h>
+#include <asm/apicdef.h>
+#include <linux/atomic.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/msr.h>
+#include <asm/hardirq.h>
+
+#define ARCH_APICTIMER_STOPS_ON_C3 1
+
+/*
+ * Debugging macros
+ */
+#define APIC_QUIET 0
+#define APIC_VERBOSE 1
+#define APIC_DEBUG 2
+
+/* Macros for apic_extnmi which controls external NMI masking */
+#define APIC_EXTNMI_BSP 0 /* Default */
+#define APIC_EXTNMI_ALL 1
+#define APIC_EXTNMI_NONE 2
+
+/*
+ * Define the default level of output to be very little
+ * This can be turned up by using apic=verbose for more
+ * information and apic=debug for _lots_ of information.
+ * apic_verbosity is defined in apic.c
+ */
+#define apic_printk(v, s, a...) do { \
+ if ((v) <= apic_verbosity) \
+ printk(s, ##a); \
+ } while (0)
+
+
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
+extern void generic_apic_probe(void);
+#else
+static inline void generic_apic_probe(void)
+{
+}
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+extern int apic_verbosity;
+extern int local_apic_timer_c2_ok;
+
+extern int disable_apic;
+extern unsigned int lapic_timer_frequency;
+
+extern enum apic_intr_mode_id apic_intr_mode;
+enum apic_intr_mode_id {
+ APIC_PIC,
+ APIC_VIRTUAL_WIRE,
+ APIC_VIRTUAL_WIRE_NO_CONFIG,
+ APIC_SYMMETRIC_IO,
+ APIC_SYMMETRIC_IO_NO_ROUTING
+};
+
+#ifdef CONFIG_SMP
+extern void __inquire_remote_apic(int apicid);
+#else /* CONFIG_SMP */
+static inline void __inquire_remote_apic(int apicid)
+{
+}
+#endif /* CONFIG_SMP */
+
+static inline void default_inquire_remote_apic(int apicid)
+{
+ if (apic_verbosity >= APIC_DEBUG)
+ __inquire_remote_apic(apicid);
+}
+
+/*
+ * With 82489DX we can't rely on apic feature bit
+ * retrieved via cpuid but still have to deal with
+ * such an apic chip so we assume that SMP configuration
+ * is found from MP table (64bit case uses ACPI mostly
+ * which set smp presence flag as well so we are safe
+ * to use this helper too).
+ */
+static inline bool apic_from_smp_config(void)
+{
+ return smp_found_config && !disable_apic;
+}
+
+/*
+ * Basic functions accessing APICs.
+ */
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
+
+extern int setup_profiling_timer(unsigned int);
+
+static inline void native_apic_mem_write(u32 reg, u32 v)
+{
+ volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
+
+ alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP,
+ ASM_OUTPUT2("=r" (v), "=m" (*addr)),
+ ASM_OUTPUT2("0" (v), "m" (*addr)));
+}
+
+static inline u32 native_apic_mem_read(u32 reg)
+{
+ return *((volatile u32 *)(APIC_BASE + reg));
+}
+
+extern void native_apic_wait_icr_idle(void);
+extern u32 native_safe_apic_wait_icr_idle(void);
+extern void native_apic_icr_write(u32 low, u32 id);
+extern u64 native_apic_icr_read(void);
+
+static inline bool apic_is_x2apic_enabled(void)
+{
+ u64 msr;
+
+ if (rdmsrl_safe(MSR_IA32_APICBASE, &msr))
+ return false;
+ return msr & X2APIC_ENABLE;
+}
+
+extern void enable_IR_x2apic(void);
+
+extern int get_physical_broadcast(void);
+
+extern int lapic_get_maxlvt(void);
+extern void clear_local_APIC(void);
+extern void disconnect_bsp_APIC(int virt_wire_setup);
+extern void disable_local_APIC(void);
+extern void lapic_shutdown(void);
+extern void sync_Arb_IDs(void);
+extern void init_bsp_APIC(void);
+extern void apic_intr_mode_init(void);
+extern void init_apic_mappings(void);
+void register_lapic_address(unsigned long address);
+extern void setup_boot_APIC_clock(void);
+extern void setup_secondary_APIC_clock(void);
+extern void lapic_update_tsc_freq(void);
+
+#ifdef CONFIG_X86_64
+static inline int apic_force_enable(unsigned long addr)
+{
+ return -1;
+}
+#else
+extern int apic_force_enable(unsigned long addr);
+#endif
+
+extern void apic_bsp_setup(bool upmode);
+extern void apic_ap_setup(void);
+
+/*
+ * On 32bit this is mach-xxx local
+ */
+#ifdef CONFIG_X86_64
+extern int apic_is_clustered_box(void);
+#else
+static inline int apic_is_clustered_box(void)
+{
+ return 0;
+}
+#endif
+
+extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask);
+extern void lapic_assign_system_vectors(void);
+extern void lapic_assign_legacy_vector(unsigned int isairq, bool replace);
+extern void lapic_update_legacy_vectors(void);
+extern void lapic_online(void);
+extern void lapic_offline(void);
+
+#else /* !CONFIG_X86_LOCAL_APIC */
+static inline void lapic_shutdown(void) { }
+#define local_apic_timer_c2_ok 1
+static inline void init_apic_mappings(void) { }
+static inline void disable_local_APIC(void) { }
+# define setup_boot_APIC_clock x86_init_noop
+# define setup_secondary_APIC_clock x86_init_noop
+static inline void lapic_update_tsc_freq(void) { }
+static inline void init_bsp_APIC(void) { }
+static inline void apic_intr_mode_init(void) { }
+static inline void lapic_assign_system_vectors(void) { }
+static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { }
+#endif /* !CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_X86_X2APIC
+static inline void native_apic_msr_write(u32 reg, u32 v)
+{
+ if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
+ reg == APIC_LVR)
+ return;
+
+ wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0);
+}
+
+static inline void native_apic_msr_eoi_write(u32 reg, u32 v)
+{
+ __wrmsr(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0);
+}
+
+static inline u32 native_apic_msr_read(u32 reg)
+{
+ u64 msr;
+
+ if (reg == APIC_DFR)
+ return -1;
+
+ rdmsrl(APIC_BASE_MSR + (reg >> 4), msr);
+ return (u32)msr;
+}
+
+static inline void native_x2apic_wait_icr_idle(void)
+{
+ /* no need to wait for icr idle in x2apic */
+ return;
+}
+
+static inline u32 native_safe_x2apic_wait_icr_idle(void)
+{
+ /* no need to wait for icr idle in x2apic */
+ return 0;
+}
+
+static inline void native_x2apic_icr_write(u32 low, u32 id)
+{
+ wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
+}
+
+static inline u64 native_x2apic_icr_read(void)
+{
+ unsigned long val;
+
+ rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
+ return val;
+}
+
+extern int x2apic_mode;
+extern int x2apic_phys;
+extern void __init x2apic_set_max_apicid(u32 apicid);
+extern void __init check_x2apic(void);
+extern void x2apic_setup(void);
+static inline int x2apic_enabled(void)
+{
+ return boot_cpu_has(X86_FEATURE_X2APIC) && apic_is_x2apic_enabled();
+}
+
+#define x2apic_supported() (boot_cpu_has(X86_FEATURE_X2APIC))
+#else /* !CONFIG_X86_X2APIC */
+static inline void check_x2apic(void) { }
+static inline void x2apic_setup(void) { }
+static inline int x2apic_enabled(void) { return 0; }
+
+#define x2apic_mode (0)
+#define x2apic_supported() (0)
+#endif /* !CONFIG_X86_X2APIC */
+
+struct irq_data;
+
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic APIC sub-arch data struct.
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ */
+struct apic {
+ /* Hotpath functions first */
+ void (*eoi_write)(u32 reg, u32 v);
+ void (*native_eoi_write)(u32 reg, u32 v);
+ void (*write)(u32 reg, u32 v);
+ u32 (*read)(u32 reg);
+
+ /* IPI related functions */
+ void (*wait_icr_idle)(void);
+ u32 (*safe_wait_icr_idle)(void);
+
+ void (*send_IPI)(int cpu, int vector);
+ void (*send_IPI_mask)(const struct cpumask *mask, int vector);
+ void (*send_IPI_mask_allbutself)(const struct cpumask *msk, int vec);
+ void (*send_IPI_allbutself)(int vector);
+ void (*send_IPI_all)(int vector);
+ void (*send_IPI_self)(int vector);
+
+ /* dest_logical is used by the IPI functions */
+ u32 dest_logical;
+ u32 disable_esr;
+ u32 irq_delivery_mode;
+ u32 irq_dest_mode;
+
+ u32 (*calc_dest_apicid)(unsigned int cpu);
+
+ /* ICR related functions */
+ u64 (*icr_read)(void);
+ void (*icr_write)(u32 low, u32 high);
+
+ /* Probe, setup and smpboot functions */
+ int (*probe)(void);
+ int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
+ int (*apic_id_valid)(u32 apicid);
+ int (*apic_id_registered)(void);
+
+ bool (*check_apicid_used)(physid_mask_t *map, int apicid);
+ void (*init_apic_ldr)(void);
+ void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
+ void (*setup_apic_routing)(void);
+ int (*cpu_present_to_apicid)(int mps_cpu);
+ void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
+ int (*check_phys_apicid_present)(int phys_apicid);
+ int (*phys_pkg_id)(int cpuid_apic, int index_msb);
+
+ u32 (*get_apic_id)(unsigned long x);
+ u32 (*set_apic_id)(unsigned int id);
+
+ /* wakeup_secondary_cpu */
+ int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
+
+ void (*inquire_remote_apic)(int apicid);
+
+#ifdef CONFIG_X86_32
+ /*
+ * Called very early during boot from get_smp_config(). It should
+ * return the logical apicid. x86_[bios]_cpu_to_apicid is
+ * initialized before this function is called.
+ *
+ * If logical apicid can't be determined that early, the function
+ * may return BAD_APICID. Logical apicid will be configured after
+ * init_apic_ldr() while bringing up CPUs. Note that NUMA affinity
+ * won't be applied properly during early boot in this case.
+ */
+ int (*x86_32_early_logical_apicid)(int cpu);
+#endif
+ char *name;
+};
+
+/*
+ * Pointer to the local APIC driver in use on this system (there's
+ * always just one such driver in use - the kernel decides via an
+ * early probing process which one it picks - and then sticks to it):
+ */
+extern struct apic *apic;
+
+/*
+ * APIC drivers are probed based on how they are listed in the .apicdrivers
+ * section. So the order is important and enforced by the ordering
+ * of different apic driver files in the Makefile.
+ *
+ * For the files having two apic drivers, we use apic_drivers()
+ * to enforce the order with in them.
+ */
+#define apic_driver(sym) \
+ static const struct apic *__apicdrivers_##sym __used \
+ __aligned(sizeof(struct apic *)) \
+ __section(.apicdrivers) = { &sym }
+
+#define apic_drivers(sym1, sym2) \
+ static struct apic *__apicdrivers_##sym1##sym2[2] __used \
+ __aligned(sizeof(struct apic *)) \
+ __section(.apicdrivers) = { &sym1, &sym2 }
+
+extern struct apic *__apicdrivers[], *__apicdrivers_end[];
+
+/*
+ * APIC functionality to boot other CPUs - only used on SMP:
+ */
+#ifdef CONFIG_SMP
+extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
+extern int lapic_can_unplug_cpu(void);
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+static inline u32 apic_read(u32 reg)
+{
+ return apic->read(reg);
+}
+
+static inline void apic_write(u32 reg, u32 val)
+{
+ apic->write(reg, val);
+}
+
+static inline void apic_eoi(void)
+{
+ apic->eoi_write(APIC_EOI, APIC_EOI_ACK);
+}
+
+static inline u64 apic_icr_read(void)
+{
+ return apic->icr_read();
+}
+
+static inline void apic_icr_write(u32 low, u32 high)
+{
+ apic->icr_write(low, high);
+}
+
+static inline void apic_wait_icr_idle(void)
+{
+ apic->wait_icr_idle();
+}
+
+static inline u32 safe_apic_wait_icr_idle(void)
+{
+ return apic->safe_wait_icr_idle();
+}
+
+extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v));
+
+#else /* CONFIG_X86_LOCAL_APIC */
+
+static inline u32 apic_read(u32 reg) { return 0; }
+static inline void apic_write(u32 reg, u32 val) { }
+static inline void apic_eoi(void) { }
+static inline u64 apic_icr_read(void) { return 0; }
+static inline void apic_icr_write(u32 low, u32 high) { }
+static inline void apic_wait_icr_idle(void) { }
+static inline u32 safe_apic_wait_icr_idle(void) { return 0; }
+static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {}
+
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+extern void apic_ack_irq(struct irq_data *data);
+
+static inline void ack_APIC_irq(void)
+{
+ /*
+ * ack_APIC_irq() actually gets compiled as a single instruction
+ * ... yummie.
+ */
+ apic_eoi();
+}
+
+
+static inline bool lapic_vector_set_in_irr(unsigned int vector)
+{
+ u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+
+ return !!(irr & (1U << (vector % 32)));
+}
+
+static inline unsigned default_get_apic_id(unsigned long x)
+{
+ unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
+
+ if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
+ return (x >> 24) & 0xFF;
+ else
+ return (x >> 24) & 0x0F;
+}
+
+/*
+ * Warm reset vector position:
+ */
+#define TRAMPOLINE_PHYS_LOW 0x467
+#define TRAMPOLINE_PHYS_HIGH 0x469
+
+#ifdef CONFIG_X86_64
+extern void apic_send_IPI_self(int vector);
+
+DECLARE_PER_CPU(int, x2apic_extra_bits);
+#endif
+
+extern void generic_bigsmp_probe(void);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+#include <asm/smp.h>
+
+#define APIC_DFR_VALUE (APIC_DFR_FLAT)
+
+DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
+
+extern struct apic apic_noop;
+
+static inline unsigned int read_apic_id(void)
+{
+ unsigned int reg = apic_read(APIC_ID);
+
+ return apic->get_apic_id(reg);
+}
+
+extern int default_apic_id_valid(u32 apicid);
+extern int default_acpi_madt_oem_check(char *, char *);
+extern void default_setup_apic_routing(void);
+
+extern u32 apic_default_calc_apicid(unsigned int cpu);
+extern u32 apic_flat_calc_apicid(unsigned int cpu);
+
+extern bool default_check_apicid_used(physid_mask_t *map, int apicid);
+extern void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap);
+extern int default_cpu_present_to_apicid(int mps_cpu);
+extern int default_check_phys_apicid_present(int phys_apicid);
+
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_SMP
+bool apic_id_is_primary_thread(unsigned int id);
+#else
+static inline bool apic_id_is_primary_thread(unsigned int id) { return false; }
+#endif
+
+extern void irq_enter(void);
+extern void irq_exit(void);
+
+static inline void entering_irq(void)
+{
+ irq_enter();
+ kvm_set_cpu_l1tf_flush_l1d();
+}
+
+static inline void entering_ack_irq(void)
+{
+ entering_irq();
+ ack_APIC_irq();
+}
+
+static inline void ipi_entering_ack_irq(void)
+{
+ irq_enter();
+ ack_APIC_irq();
+ kvm_set_cpu_l1tf_flush_l1d();
+}
+
+static inline void exiting_irq(void)
+{
+ irq_exit();
+}
+
+static inline void exiting_ack_irq(void)
+{
+ ack_APIC_irq();
+ irq_exit();
+}
+
+extern void ioapic_zap_locks(void);
+
+#endif /* _ASM_X86_APIC_H */
diff --git a/arch/x86/include/asm/apic_flat_64.h b/arch/x86/include/asm/apic_flat_64.h
new file mode 100644
index 000000000..d3a2b3876
--- /dev/null
+++ b/arch/x86/include/asm/apic_flat_64.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_APIC_FLAT_64_H
+#define _ASM_X86_APIC_FLAT_64_H
+
+extern void flat_init_apic_ldr(void);
+
+#endif
+
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
new file mode 100644
index 000000000..05e694ed8
--- /dev/null
+++ b/arch/x86/include/asm/apicdef.h
@@ -0,0 +1,446 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_APICDEF_H
+#define _ASM_X86_APICDEF_H
+
+/*
+ * Constants for various Intel APICs. (local APIC, IOAPIC, etc.)
+ *
+ * Alan Cox <Alan.Cox@linux.org>, 1995.
+ * Ingo Molnar <mingo@redhat.com>, 1999, 2000
+ */
+
+#define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000
+#define APIC_DEFAULT_PHYS_BASE 0xfee00000
+
+/*
+ * This is the IO-APIC register space as specified
+ * by Intel docs:
+ */
+#define IO_APIC_SLOT_SIZE 1024
+
+#define APIC_ID 0x20
+
+#define APIC_LVR 0x30
+#define APIC_LVR_MASK 0xFF00FF
+#define APIC_LVR_DIRECTED_EOI (1 << 24)
+#define GET_APIC_VERSION(x) ((x) & 0xFFu)
+#define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu)
+#ifdef CONFIG_X86_32
+# define APIC_INTEGRATED(x) ((x) & 0xF0u)
+#else
+# define APIC_INTEGRATED(x) (1)
+#endif
+#define APIC_XAPIC(x) ((x) >= 0x14)
+#define APIC_EXT_SPACE(x) ((x) & 0x80000000)
+#define APIC_TASKPRI 0x80
+#define APIC_TPRI_MASK 0xFFu
+#define APIC_ARBPRI 0x90
+#define APIC_ARBPRI_MASK 0xFFu
+#define APIC_PROCPRI 0xA0
+#define APIC_EOI 0xB0
+#define APIC_EOI_ACK 0x0 /* Docs say 0 for future compat. */
+#define APIC_RRR 0xC0
+#define APIC_LDR 0xD0
+#define APIC_LDR_MASK (0xFFu << 24)
+#define GET_APIC_LOGICAL_ID(x) (((x) >> 24) & 0xFFu)
+#define SET_APIC_LOGICAL_ID(x) (((x) << 24))
+#define APIC_ALL_CPUS 0xFFu
+#define APIC_DFR 0xE0
+#define APIC_DFR_CLUSTER 0x0FFFFFFFul
+#define APIC_DFR_FLAT 0xFFFFFFFFul
+#define APIC_SPIV 0xF0
+#define APIC_SPIV_DIRECTED_EOI (1 << 12)
+#define APIC_SPIV_FOCUS_DISABLED (1 << 9)
+#define APIC_SPIV_APIC_ENABLED (1 << 8)
+#define APIC_ISR 0x100
+#define APIC_ISR_NR 0x8 /* Number of 32 bit ISR registers. */
+#define APIC_TMR 0x180
+#define APIC_IRR 0x200
+#define APIC_ESR 0x280
+#define APIC_ESR_SEND_CS 0x00001
+#define APIC_ESR_RECV_CS 0x00002
+#define APIC_ESR_SEND_ACC 0x00004
+#define APIC_ESR_RECV_ACC 0x00008
+#define APIC_ESR_SENDILL 0x00020
+#define APIC_ESR_RECVILL 0x00040
+#define APIC_ESR_ILLREGA 0x00080
+#define APIC_LVTCMCI 0x2f0
+#define APIC_ICR 0x300
+#define APIC_DEST_SELF 0x40000
+#define APIC_DEST_ALLINC 0x80000
+#define APIC_DEST_ALLBUT 0xC0000
+#define APIC_ICR_RR_MASK 0x30000
+#define APIC_ICR_RR_INVALID 0x00000
+#define APIC_ICR_RR_INPROG 0x10000
+#define APIC_ICR_RR_VALID 0x20000
+#define APIC_INT_LEVELTRIG 0x08000
+#define APIC_INT_ASSERT 0x04000
+#define APIC_ICR_BUSY 0x01000
+#define APIC_DEST_LOGICAL 0x00800
+#define APIC_DEST_PHYSICAL 0x00000
+#define APIC_DM_FIXED 0x00000
+#define APIC_DM_FIXED_MASK 0x00700
+#define APIC_DM_LOWEST 0x00100
+#define APIC_DM_SMI 0x00200
+#define APIC_DM_REMRD 0x00300
+#define APIC_DM_NMI 0x00400
+#define APIC_DM_INIT 0x00500
+#define APIC_DM_STARTUP 0x00600
+#define APIC_DM_EXTINT 0x00700
+#define APIC_VECTOR_MASK 0x000FF
+#define APIC_ICR2 0x310
+#define GET_APIC_DEST_FIELD(x) (((x) >> 24) & 0xFF)
+#define SET_APIC_DEST_FIELD(x) ((x) << 24)
+#define APIC_LVTT 0x320
+#define APIC_LVTTHMR 0x330
+#define APIC_LVTPC 0x340
+#define APIC_LVT0 0x350
+#define APIC_LVT_TIMER_BASE_MASK (0x3 << 18)
+#define GET_APIC_TIMER_BASE(x) (((x) >> 18) & 0x3)
+#define SET_APIC_TIMER_BASE(x) (((x) << 18))
+#define APIC_TIMER_BASE_CLKIN 0x0
+#define APIC_TIMER_BASE_TMBASE 0x1
+#define APIC_TIMER_BASE_DIV 0x2
+#define APIC_LVT_TIMER_ONESHOT (0 << 17)
+#define APIC_LVT_TIMER_PERIODIC (1 << 17)
+#define APIC_LVT_TIMER_TSCDEADLINE (2 << 17)
+#define APIC_LVT_MASKED (1 << 16)
+#define APIC_LVT_LEVEL_TRIGGER (1 << 15)
+#define APIC_LVT_REMOTE_IRR (1 << 14)
+#define APIC_INPUT_POLARITY (1 << 13)
+#define APIC_SEND_PENDING (1 << 12)
+#define APIC_MODE_MASK 0x700
+#define GET_APIC_DELIVERY_MODE(x) (((x) >> 8) & 0x7)
+#define SET_APIC_DELIVERY_MODE(x, y) (((x) & ~0x700) | ((y) << 8))
+#define APIC_MODE_FIXED 0x0
+#define APIC_MODE_NMI 0x4
+#define APIC_MODE_EXTINT 0x7
+#define APIC_LVT1 0x360
+#define APIC_LVTERR 0x370
+#define APIC_TMICT 0x380
+#define APIC_TMCCT 0x390
+#define APIC_TDCR 0x3E0
+#define APIC_SELF_IPI 0x3F0
+#define APIC_TDR_DIV_TMBASE (1 << 2)
+#define APIC_TDR_DIV_1 0xB
+#define APIC_TDR_DIV_2 0x0
+#define APIC_TDR_DIV_4 0x1
+#define APIC_TDR_DIV_8 0x2
+#define APIC_TDR_DIV_16 0x3
+#define APIC_TDR_DIV_32 0x8
+#define APIC_TDR_DIV_64 0x9
+#define APIC_TDR_DIV_128 0xA
+#define APIC_EFEAT 0x400
+#define APIC_ECTRL 0x410
+#define APIC_EILVTn(n) (0x500 + 0x10 * n)
+#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */
+#define APIC_EILVT_NR_AMD_10H 4
+#define APIC_EILVT_NR_MAX APIC_EILVT_NR_AMD_10H
+#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF)
+#define APIC_EILVT_MSG_FIX 0x0
+#define APIC_EILVT_MSG_SMI 0x2
+#define APIC_EILVT_MSG_NMI 0x4
+#define APIC_EILVT_MSG_EXT 0x7
+#define APIC_EILVT_MASKED (1 << 16)
+
+#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
+#define APIC_BASE_MSR 0x800
+#define XAPIC_ENABLE (1UL << 11)
+#define X2APIC_ENABLE (1UL << 10)
+
+#ifdef CONFIG_X86_32
+# define MAX_IO_APICS 64
+# define MAX_LOCAL_APIC 256
+#else
+# define MAX_IO_APICS 128
+# define MAX_LOCAL_APIC 32768
+#endif
+
+/*
+ * All x86-64 systems are xAPIC compatible.
+ * In the following, "apicid" is a physical APIC ID.
+ */
+#define XAPIC_DEST_CPUS_SHIFT 4
+#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
+#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
+#define APIC_CLUSTER(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK)
+#define APIC_CLUSTERID(apicid) (APIC_CLUSTER(apicid) >> XAPIC_DEST_CPUS_SHIFT)
+#define APIC_CPUID(apicid) ((apicid) & XAPIC_DEST_CPUS_MASK)
+#define NUM_APIC_CLUSTERS ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT)
+
+/*
+ * the local APIC register structure, memory mapped. Not terribly well
+ * tested, but we might eventually use this one in the future - the
+ * problem why we cannot use it right now is the P5 APIC, it has an
+ * errata which cannot take 8-bit reads and writes, only 32-bit ones ...
+ */
+#define u32 unsigned int
+
+struct local_apic {
+
+/*000*/ struct { u32 __reserved[4]; } __reserved_01;
+
+/*010*/ struct { u32 __reserved[4]; } __reserved_02;
+
+/*020*/ struct { /* APIC ID Register */
+ u32 __reserved_1 : 24,
+ phys_apic_id : 4,
+ __reserved_2 : 4;
+ u32 __reserved[3];
+ } id;
+
+/*030*/ const
+ struct { /* APIC Version Register */
+ u32 version : 8,
+ __reserved_1 : 8,
+ max_lvt : 8,
+ __reserved_2 : 8;
+ u32 __reserved[3];
+ } version;
+
+/*040*/ struct { u32 __reserved[4]; } __reserved_03;
+
+/*050*/ struct { u32 __reserved[4]; } __reserved_04;
+
+/*060*/ struct { u32 __reserved[4]; } __reserved_05;
+
+/*070*/ struct { u32 __reserved[4]; } __reserved_06;
+
+/*080*/ struct { /* Task Priority Register */
+ u32 priority : 8,
+ __reserved_1 : 24;
+ u32 __reserved_2[3];
+ } tpr;
+
+/*090*/ const
+ struct { /* Arbitration Priority Register */
+ u32 priority : 8,
+ __reserved_1 : 24;
+ u32 __reserved_2[3];
+ } apr;
+
+/*0A0*/ const
+ struct { /* Processor Priority Register */
+ u32 priority : 8,
+ __reserved_1 : 24;
+ u32 __reserved_2[3];
+ } ppr;
+
+/*0B0*/ struct { /* End Of Interrupt Register */
+ u32 eoi;
+ u32 __reserved[3];
+ } eoi;
+
+/*0C0*/ struct { u32 __reserved[4]; } __reserved_07;
+
+/*0D0*/ struct { /* Logical Destination Register */
+ u32 __reserved_1 : 24,
+ logical_dest : 8;
+ u32 __reserved_2[3];
+ } ldr;
+
+/*0E0*/ struct { /* Destination Format Register */
+ u32 __reserved_1 : 28,
+ model : 4;
+ u32 __reserved_2[3];
+ } dfr;
+
+/*0F0*/ struct { /* Spurious Interrupt Vector Register */
+ u32 spurious_vector : 8,
+ apic_enabled : 1,
+ focus_cpu : 1,
+ __reserved_2 : 22;
+ u32 __reserved_3[3];
+ } svr;
+
+/*100*/ struct { /* In Service Register */
+/*170*/ u32 bitfield;
+ u32 __reserved[3];
+ } isr [8];
+
+/*180*/ struct { /* Trigger Mode Register */
+/*1F0*/ u32 bitfield;
+ u32 __reserved[3];
+ } tmr [8];
+
+/*200*/ struct { /* Interrupt Request Register */
+/*270*/ u32 bitfield;
+ u32 __reserved[3];
+ } irr [8];
+
+/*280*/ union { /* Error Status Register */
+ struct {
+ u32 send_cs_error : 1,
+ receive_cs_error : 1,
+ send_accept_error : 1,
+ receive_accept_error : 1,
+ __reserved_1 : 1,
+ send_illegal_vector : 1,
+ receive_illegal_vector : 1,
+ illegal_register_address : 1,
+ __reserved_2 : 24;
+ u32 __reserved_3[3];
+ } error_bits;
+ struct {
+ u32 errors;
+ u32 __reserved_3[3];
+ } all_errors;
+ } esr;
+
+/*290*/ struct { u32 __reserved[4]; } __reserved_08;
+
+/*2A0*/ struct { u32 __reserved[4]; } __reserved_09;
+
+/*2B0*/ struct { u32 __reserved[4]; } __reserved_10;
+
+/*2C0*/ struct { u32 __reserved[4]; } __reserved_11;
+
+/*2D0*/ struct { u32 __reserved[4]; } __reserved_12;
+
+/*2E0*/ struct { u32 __reserved[4]; } __reserved_13;
+
+/*2F0*/ struct { u32 __reserved[4]; } __reserved_14;
+
+/*300*/ struct { /* Interrupt Command Register 1 */
+ u32 vector : 8,
+ delivery_mode : 3,
+ destination_mode : 1,
+ delivery_status : 1,
+ __reserved_1 : 1,
+ level : 1,
+ trigger : 1,
+ __reserved_2 : 2,
+ shorthand : 2,
+ __reserved_3 : 12;
+ u32 __reserved_4[3];
+ } icr1;
+
+/*310*/ struct { /* Interrupt Command Register 2 */
+ union {
+ u32 __reserved_1 : 24,
+ phys_dest : 4,
+ __reserved_2 : 4;
+ u32 __reserved_3 : 24,
+ logical_dest : 8;
+ } dest;
+ u32 __reserved_4[3];
+ } icr2;
+
+/*320*/ struct { /* LVT - Timer */
+ u32 vector : 8,
+ __reserved_1 : 4,
+ delivery_status : 1,
+ __reserved_2 : 3,
+ mask : 1,
+ timer_mode : 1,
+ __reserved_3 : 14;
+ u32 __reserved_4[3];
+ } lvt_timer;
+
+/*330*/ struct { /* LVT - Thermal Sensor */
+ u32 vector : 8,
+ delivery_mode : 3,
+ __reserved_1 : 1,
+ delivery_status : 1,
+ __reserved_2 : 3,
+ mask : 1,
+ __reserved_3 : 15;
+ u32 __reserved_4[3];
+ } lvt_thermal;
+
+/*340*/ struct { /* LVT - Performance Counter */
+ u32 vector : 8,
+ delivery_mode : 3,
+ __reserved_1 : 1,
+ delivery_status : 1,
+ __reserved_2 : 3,
+ mask : 1,
+ __reserved_3 : 15;
+ u32 __reserved_4[3];
+ } lvt_pc;
+
+/*350*/ struct { /* LVT - LINT0 */
+ u32 vector : 8,
+ delivery_mode : 3,
+ __reserved_1 : 1,
+ delivery_status : 1,
+ polarity : 1,
+ remote_irr : 1,
+ trigger : 1,
+ mask : 1,
+ __reserved_2 : 15;
+ u32 __reserved_3[3];
+ } lvt_lint0;
+
+/*360*/ struct { /* LVT - LINT1 */
+ u32 vector : 8,
+ delivery_mode : 3,
+ __reserved_1 : 1,
+ delivery_status : 1,
+ polarity : 1,
+ remote_irr : 1,
+ trigger : 1,
+ mask : 1,
+ __reserved_2 : 15;
+ u32 __reserved_3[3];
+ } lvt_lint1;
+
+/*370*/ struct { /* LVT - Error */
+ u32 vector : 8,
+ __reserved_1 : 4,
+ delivery_status : 1,
+ __reserved_2 : 3,
+ mask : 1,
+ __reserved_3 : 15;
+ u32 __reserved_4[3];
+ } lvt_error;
+
+/*380*/ struct { /* Timer Initial Count Register */
+ u32 initial_count;
+ u32 __reserved_2[3];
+ } timer_icr;
+
+/*390*/ const
+ struct { /* Timer Current Count Register */
+ u32 curr_count;
+ u32 __reserved_2[3];
+ } timer_ccr;
+
+/*3A0*/ struct { u32 __reserved[4]; } __reserved_16;
+
+/*3B0*/ struct { u32 __reserved[4]; } __reserved_17;
+
+/*3C0*/ struct { u32 __reserved[4]; } __reserved_18;
+
+/*3D0*/ struct { u32 __reserved[4]; } __reserved_19;
+
+/*3E0*/ struct { /* Timer Divide Configuration Register */
+ u32 divisor : 4,
+ __reserved_1 : 28;
+ u32 __reserved_2[3];
+ } timer_dcr;
+
+/*3F0*/ struct { u32 __reserved[4]; } __reserved_20;
+
+} __attribute__ ((packed));
+
+#undef u32
+
+#ifdef CONFIG_X86_32
+ #define BAD_APICID 0xFFu
+#else
+ #define BAD_APICID 0xFFFFu
+#endif
+
+enum ioapic_irq_destination_types {
+ dest_Fixed = 0,
+ dest_LowestPrio = 1,
+ dest_SMI = 2,
+ dest__reserved_1 = 3,
+ dest_NMI = 4,
+ dest_INIT = 5,
+ dest__reserved_2 = 6,
+ dest_ExtINT = 7
+};
+
+#endif /* _ASM_X86_APICDEF_H */
diff --git a/arch/x86/include/asm/apm.h b/arch/x86/include/asm/apm.h
new file mode 100644
index 000000000..4d4015ddc
--- /dev/null
+++ b/arch/x86/include/asm/apm.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Machine specific APM BIOS functions for generic.
+ * Split out from apm.c by Osamu Tomita <tomita@cinet.co.jp>
+ */
+
+#ifndef _ASM_X86_MACH_DEFAULT_APM_H
+#define _ASM_X86_MACH_DEFAULT_APM_H
+
+#ifdef APM_ZERO_SEGS
+# define APM_DO_ZERO_SEGS \
+ "pushl %%ds\n\t" \
+ "pushl %%es\n\t" \
+ "xorl %%edx, %%edx\n\t" \
+ "mov %%dx, %%ds\n\t" \
+ "mov %%dx, %%es\n\t" \
+ "mov %%dx, %%fs\n\t" \
+ "mov %%dx, %%gs\n\t"
+# define APM_DO_POP_SEGS \
+ "popl %%es\n\t" \
+ "popl %%ds\n\t"
+#else
+# define APM_DO_ZERO_SEGS
+# define APM_DO_POP_SEGS
+#endif
+
+static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in,
+ u32 *eax, u32 *ebx, u32 *ecx,
+ u32 *edx, u32 *esi)
+{
+ /*
+ * N.B. We do NOT need a cld after the BIOS call
+ * because we always save and restore the flags.
+ */
+ __asm__ __volatile__(APM_DO_ZERO_SEGS
+ "pushl %%edi\n\t"
+ "pushl %%ebp\n\t"
+ "lcall *%%cs:apm_bios_entry\n\t"
+ "setc %%al\n\t"
+ "popl %%ebp\n\t"
+ "popl %%edi\n\t"
+ APM_DO_POP_SEGS
+ : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx),
+ "=S" (*esi)
+ : "a" (func), "b" (ebx_in), "c" (ecx_in)
+ : "memory", "cc");
+}
+
+static inline bool apm_bios_call_simple_asm(u32 func, u32 ebx_in,
+ u32 ecx_in, u32 *eax)
+{
+ int cx, dx, si;
+ bool error;
+
+ /*
+ * N.B. We do NOT need a cld after the BIOS call
+ * because we always save and restore the flags.
+ */
+ __asm__ __volatile__(APM_DO_ZERO_SEGS
+ "pushl %%edi\n\t"
+ "pushl %%ebp\n\t"
+ "lcall *%%cs:apm_bios_entry\n\t"
+ "setc %%bl\n\t"
+ "popl %%ebp\n\t"
+ "popl %%edi\n\t"
+ APM_DO_POP_SEGS
+ : "=a" (*eax), "=b" (error), "=c" (cx), "=d" (dx),
+ "=S" (si)
+ : "a" (func), "b" (ebx_in), "c" (ecx_in)
+ : "memory", "cc");
+ return error;
+}
+
+#endif /* _ASM_X86_MACH_DEFAULT_APM_H */
diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
new file mode 100644
index 000000000..34a10b2d5
--- /dev/null
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_HWEIGHT_H
+#define _ASM_X86_HWEIGHT_H
+
+#include <asm/cpufeatures.h>
+
+#ifdef CONFIG_64BIT
+/* popcnt %edi, %eax */
+#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc7"
+/* popcnt %rdi, %rax */
+#define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
+#define REG_IN "D"
+#define REG_OUT "a"
+#else
+/* popcnt %eax, %eax */
+#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc0"
+#define REG_IN "a"
+#define REG_OUT "a"
+#endif
+
+#define __HAVE_ARCH_SW_HWEIGHT
+
+static __always_inline unsigned int __arch_hweight32(unsigned int w)
+{
+ unsigned int res;
+
+ asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT)
+ : "="REG_OUT (res)
+ : REG_IN (w));
+
+ return res;
+}
+
+static inline unsigned int __arch_hweight16(unsigned int w)
+{
+ return __arch_hweight32(w & 0xffff);
+}
+
+static inline unsigned int __arch_hweight8(unsigned int w)
+{
+ return __arch_hweight32(w & 0xff);
+}
+
+#ifdef CONFIG_X86_32
+static inline unsigned long __arch_hweight64(__u64 w)
+{
+ return __arch_hweight32((u32)w) +
+ __arch_hweight32((u32)(w >> 32));
+}
+#else
+static __always_inline unsigned long __arch_hweight64(__u64 w)
+{
+ unsigned long res;
+
+ asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)
+ : "="REG_OUT (res)
+ : REG_IN (w));
+
+ return res;
+}
+#endif /* CONFIG_X86_32 */
+
+#endif
diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h
new file mode 100644
index 000000000..4d3cac3c9
--- /dev/null
+++ b/arch/x86/include/asm/archrandom.h
@@ -0,0 +1,124 @@
+/*
+ * This file is part of the Linux kernel.
+ *
+ * Copyright (c) 2011-2014, Intel Corporation
+ * Authors: Fenghua Yu <fenghua.yu@intel.com>,
+ * H. Peter Anvin <hpa@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#ifndef ASM_X86_ARCHRANDOM_H
+#define ASM_X86_ARCHRANDOM_H
+
+#include <asm/processor.h>
+#include <asm/cpufeature.h>
+
+#define RDRAND_RETRY_LOOPS 10
+
+#define RDRAND_INT ".byte 0x0f,0xc7,0xf0"
+#define RDSEED_INT ".byte 0x0f,0xc7,0xf8"
+#ifdef CONFIG_X86_64
+# define RDRAND_LONG ".byte 0x48,0x0f,0xc7,0xf0"
+# define RDSEED_LONG ".byte 0x48,0x0f,0xc7,0xf8"
+#else
+# define RDRAND_LONG RDRAND_INT
+# define RDSEED_LONG RDSEED_INT
+#endif
+
+/* Unconditional execution of RDRAND and RDSEED */
+
+static inline bool rdrand_long(unsigned long *v)
+{
+ bool ok;
+ unsigned int retry = RDRAND_RETRY_LOOPS;
+ do {
+ asm volatile(RDRAND_LONG
+ CC_SET(c)
+ : CC_OUT(c) (ok), "=a" (*v));
+ if (ok)
+ return true;
+ } while (--retry);
+ return false;
+}
+
+static inline bool rdrand_int(unsigned int *v)
+{
+ bool ok;
+ unsigned int retry = RDRAND_RETRY_LOOPS;
+ do {
+ asm volatile(RDRAND_INT
+ CC_SET(c)
+ : CC_OUT(c) (ok), "=a" (*v));
+ if (ok)
+ return true;
+ } while (--retry);
+ return false;
+}
+
+static inline bool rdseed_long(unsigned long *v)
+{
+ bool ok;
+ asm volatile(RDSEED_LONG
+ CC_SET(c)
+ : CC_OUT(c) (ok), "=a" (*v));
+ return ok;
+}
+
+static inline bool rdseed_int(unsigned int *v)
+{
+ bool ok;
+ asm volatile(RDSEED_INT
+ CC_SET(c)
+ : CC_OUT(c) (ok), "=a" (*v));
+ return ok;
+}
+
+/*
+ * These are the generic interfaces; they must not be declared if the
+ * stubs in <linux/random.h> are to be invoked,
+ * i.e. CONFIG_ARCH_RANDOM is not defined.
+ */
+#ifdef CONFIG_ARCH_RANDOM
+
+static inline bool arch_get_random_long(unsigned long *v)
+{
+ return static_cpu_has(X86_FEATURE_RDRAND) ? rdrand_long(v) : false;
+}
+
+static inline bool arch_get_random_int(unsigned int *v)
+{
+ return static_cpu_has(X86_FEATURE_RDRAND) ? rdrand_int(v) : false;
+}
+
+static inline bool arch_get_random_seed_long(unsigned long *v)
+{
+ return static_cpu_has(X86_FEATURE_RDSEED) ? rdseed_long(v) : false;
+}
+
+static inline bool arch_get_random_seed_int(unsigned int *v)
+{
+ return static_cpu_has(X86_FEATURE_RDSEED) ? rdseed_int(v) : false;
+}
+
+extern void x86_init_rdrand(struct cpuinfo_x86 *c);
+
+#else /* !CONFIG_ARCH_RANDOM */
+
+static inline void x86_init_rdrand(struct cpuinfo_x86 *c) { }
+
+#endif /* !CONFIG_ARCH_RANDOM */
+
+#endif /* ASM_X86_ARCHRANDOM_H */
diff --git a/arch/x86/include/asm/asm-offsets.h b/arch/x86/include/asm/asm-offsets.h
new file mode 100644
index 000000000..d370ee36a
--- /dev/null
+++ b/arch/x86/include/asm/asm-offsets.h
@@ -0,0 +1 @@
+#include <generated/asm-offsets.h>
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
new file mode 100644
index 000000000..1908214b9
--- /dev/null
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <asm/ftrace.h>
+#include <linux/uaccess.h>
+#include <asm/string.h>
+#include <asm/page.h>
+#include <asm/checksum.h>
+
+#include <asm-generic/asm-prototypes.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/special_insns.h>
+#include <asm/preempt.h>
+#include <asm/asm.h>
+
+#ifndef CONFIG_X86_CMPXCHG64
+extern void cmpxchg8b_emu(void);
+#endif
+
+#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_X86_32
+#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_e ## reg(void);
+#else
+#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_r ## reg(void);
+INDIRECT_THUNK(8)
+INDIRECT_THUNK(9)
+INDIRECT_THUNK(10)
+INDIRECT_THUNK(11)
+INDIRECT_THUNK(12)
+INDIRECT_THUNK(13)
+INDIRECT_THUNK(14)
+INDIRECT_THUNK(15)
+#endif
+INDIRECT_THUNK(ax)
+INDIRECT_THUNK(bx)
+INDIRECT_THUNK(cx)
+INDIRECT_THUNK(dx)
+INDIRECT_THUNK(si)
+INDIRECT_THUNK(di)
+INDIRECT_THUNK(bp)
+#endif /* CONFIG_RETPOLINE */
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
new file mode 100644
index 000000000..990770f9e
--- /dev/null
+++ b/arch/x86/include/asm/asm.h
@@ -0,0 +1,208 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_ASM_H
+#define _ASM_X86_ASM_H
+
+#ifdef __ASSEMBLY__
+# define __ASM_FORM(x) x
+# define __ASM_FORM_RAW(x) x
+# define __ASM_FORM_COMMA(x) x,
+#else
+# define __ASM_FORM(x) " " #x " "
+# define __ASM_FORM_RAW(x) #x
+# define __ASM_FORM_COMMA(x) " " #x ","
+#endif
+
+#ifndef __x86_64__
+/* 32 bit */
+# define __ASM_SEL(a,b) __ASM_FORM(a)
+# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a)
+#else
+/* 64 bit */
+# define __ASM_SEL(a,b) __ASM_FORM(b)
+# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b)
+#endif
+
+#define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, \
+ inst##q##__VA_ARGS__)
+#define __ASM_REG(reg) __ASM_SEL_RAW(e##reg, r##reg)
+
+#define _ASM_PTR __ASM_SEL(.long, .quad)
+#define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8)
+
+#define _ASM_MOV __ASM_SIZE(mov)
+#define _ASM_INC __ASM_SIZE(inc)
+#define _ASM_DEC __ASM_SIZE(dec)
+#define _ASM_ADD __ASM_SIZE(add)
+#define _ASM_SUB __ASM_SIZE(sub)
+#define _ASM_XADD __ASM_SIZE(xadd)
+#define _ASM_MUL __ASM_SIZE(mul)
+
+#define _ASM_AX __ASM_REG(ax)
+#define _ASM_BX __ASM_REG(bx)
+#define _ASM_CX __ASM_REG(cx)
+#define _ASM_DX __ASM_REG(dx)
+#define _ASM_SP __ASM_REG(sp)
+#define _ASM_BP __ASM_REG(bp)
+#define _ASM_SI __ASM_REG(si)
+#define _ASM_DI __ASM_REG(di)
+
+#ifndef __x86_64__
+/* 32 bit */
+
+#define _ASM_ARG1 _ASM_AX
+#define _ASM_ARG2 _ASM_DX
+#define _ASM_ARG3 _ASM_CX
+
+#define _ASM_ARG1L eax
+#define _ASM_ARG2L edx
+#define _ASM_ARG3L ecx
+
+#define _ASM_ARG1W ax
+#define _ASM_ARG2W dx
+#define _ASM_ARG3W cx
+
+#define _ASM_ARG1B al
+#define _ASM_ARG2B dl
+#define _ASM_ARG3B cl
+
+#else
+/* 64 bit */
+
+#define _ASM_ARG1 _ASM_DI
+#define _ASM_ARG2 _ASM_SI
+#define _ASM_ARG3 _ASM_DX
+#define _ASM_ARG4 _ASM_CX
+#define _ASM_ARG5 r8
+#define _ASM_ARG6 r9
+
+#define _ASM_ARG1Q rdi
+#define _ASM_ARG2Q rsi
+#define _ASM_ARG3Q rdx
+#define _ASM_ARG4Q rcx
+#define _ASM_ARG5Q r8
+#define _ASM_ARG6Q r9
+
+#define _ASM_ARG1L edi
+#define _ASM_ARG2L esi
+#define _ASM_ARG3L edx
+#define _ASM_ARG4L ecx
+#define _ASM_ARG5L r8d
+#define _ASM_ARG6L r9d
+
+#define _ASM_ARG1W di
+#define _ASM_ARG2W si
+#define _ASM_ARG3W dx
+#define _ASM_ARG4W cx
+#define _ASM_ARG5W r8w
+#define _ASM_ARG6W r9w
+
+#define _ASM_ARG1B dil
+#define _ASM_ARG2B sil
+#define _ASM_ARG3B dl
+#define _ASM_ARG4B cl
+#define _ASM_ARG5B r8b
+#define _ASM_ARG6B r9b
+
+#endif
+
+/*
+ * Macros to generate condition code outputs from inline assembly,
+ * The output operand must be type "bool".
+ */
+#ifdef __GCC_ASM_FLAG_OUTPUTS__
+# define CC_SET(c) "\n\t/* output condition code " #c "*/\n"
+# define CC_OUT(c) "=@cc" #c
+#else
+# define CC_SET(c) "\n\tset" #c " %[_cc_" #c "]\n"
+# define CC_OUT(c) [_cc_ ## c] "=qm"
+#endif
+
+/* Exception table entry */
+#ifdef __ASSEMBLY__
+# define _ASM_EXTABLE_HANDLE(from, to, handler) \
+ .pushsection "__ex_table","a" ; \
+ .balign 4 ; \
+ .long (from) - . ; \
+ .long (to) - . ; \
+ .long (handler) - . ; \
+ .popsection
+
+# define _ASM_EXTABLE(from, to) \
+ _ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
+
+# define _ASM_EXTABLE_FAULT(from, to) \
+ _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault)
+
+# define _ASM_EXTABLE_EX(from, to) \
+ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
+
+# define _ASM_EXTABLE_REFCOUNT(from, to) \
+ _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount)
+
+# define _ASM_NOKPROBE(entry) \
+ .pushsection "_kprobe_blacklist","aw" ; \
+ _ASM_ALIGN ; \
+ _ASM_PTR (entry); \
+ .popsection
+
+.macro ALIGN_DESTINATION
+ /* check for bad alignment of destination */
+ movl %edi,%ecx
+ andl $7,%ecx
+ jz 102f /* already aligned */
+ subl $8,%ecx
+ negl %ecx
+ subl %ecx,%edx
+100: movb (%rsi),%al
+101: movb %al,(%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz 100b
+102:
+ .section .fixup,"ax"
+103: addl %ecx,%edx /* ecx is zerorest also */
+ jmp copy_user_handle_tail
+ .previous
+
+ _ASM_EXTABLE(100b,103b)
+ _ASM_EXTABLE(101b,103b)
+ .endm
+
+#else
+# define _EXPAND_EXTABLE_HANDLE(x) #x
+# define _ASM_EXTABLE_HANDLE(from, to, handler) \
+ " .pushsection \"__ex_table\",\"a\"\n" \
+ " .balign 4\n" \
+ " .long (" #from ") - .\n" \
+ " .long (" #to ") - .\n" \
+ " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \
+ " .popsection\n"
+
+# define _ASM_EXTABLE(from, to) \
+ _ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
+
+# define _ASM_EXTABLE_FAULT(from, to) \
+ _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault)
+
+# define _ASM_EXTABLE_EX(from, to) \
+ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
+
+# define _ASM_EXTABLE_REFCOUNT(from, to) \
+ _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount)
+
+/* For C file, we already have NOKPROBE_SYMBOL macro */
+#endif
+
+#ifndef __ASSEMBLY__
+/*
+ * This output constraint should be used for any inline asm which has a "call"
+ * instruction. Otherwise the asm may be inserted before the frame pointer
+ * gets set up by the containing function. If you forget to do this, objtool
+ * may print a "call without frame pointer save/setup" warning.
+ */
+register unsigned long current_stack_pointer asm(_ASM_SP);
+#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)
+#endif
+
+#endif /* _ASM_X86_ASM_H */
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
new file mode 100644
index 000000000..d266a4066
--- /dev/null
+++ b/arch/x86/include/asm/atomic.h
@@ -0,0 +1,267 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_ATOMIC_H
+#define _ASM_X86_ATOMIC_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <asm/alternative.h>
+#include <asm/cmpxchg.h>
+#include <asm/rmwcc.h>
+#include <asm/barrier.h>
+
+/*
+ * Atomic operations that C can't guarantee us. Useful for
+ * resource counting etc..
+ */
+
+#define ATOMIC_INIT(i) { (i) }
+
+/**
+ * arch_atomic_read - read atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically reads the value of @v.
+ */
+static __always_inline int arch_atomic_read(const atomic_t *v)
+{
+ /*
+ * Note for KASAN: we deliberately don't use READ_ONCE_NOCHECK() here,
+ * it's non-inlined function that increases binary size and stack usage.
+ */
+ return READ_ONCE((v)->counter);
+}
+
+/**
+ * arch_atomic_set - set atomic variable
+ * @v: pointer of type atomic_t
+ * @i: required value
+ *
+ * Atomically sets the value of @v to @i.
+ */
+static __always_inline void arch_atomic_set(atomic_t *v, int i)
+{
+ WRITE_ONCE(v->counter, i);
+}
+
+/**
+ * arch_atomic_add - add integer to atomic variable
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v.
+ */
+static __always_inline void arch_atomic_add(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "addl %1,%0"
+ : "+m" (v->counter)
+ : "ir" (i) : "memory");
+}
+
+/**
+ * arch_atomic_sub - subtract integer from atomic variable
+ * @i: integer value to subtract
+ * @v: pointer of type atomic_t
+ *
+ * Atomically subtracts @i from @v.
+ */
+static __always_inline void arch_atomic_sub(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "subl %1,%0"
+ : "+m" (v->counter)
+ : "ir" (i) : "memory");
+}
+
+/**
+ * arch_atomic_sub_and_test - subtract value from variable and test result
+ * @i: integer value to subtract
+ * @v: pointer of type atomic_t
+ *
+ * Atomically subtracts @i from @v and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v)
+{
+ GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", e);
+}
+#define arch_atomic_sub_and_test arch_atomic_sub_and_test
+
+/**
+ * arch_atomic_inc - increment atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1.
+ */
+static __always_inline void arch_atomic_inc(atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "incl %0"
+ : "+m" (v->counter) :: "memory");
+}
+#define arch_atomic_inc arch_atomic_inc
+
+/**
+ * arch_atomic_dec - decrement atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically decrements @v by 1.
+ */
+static __always_inline void arch_atomic_dec(atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "decl %0"
+ : "+m" (v->counter) :: "memory");
+}
+#define arch_atomic_dec arch_atomic_dec
+
+/**
+ * arch_atomic_dec_and_test - decrement and test
+ * @v: pointer of type atomic_t
+ *
+ * Atomically decrements @v by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static __always_inline bool arch_atomic_dec_and_test(atomic_t *v)
+{
+ GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", e);
+}
+#define arch_atomic_dec_and_test arch_atomic_dec_and_test
+
+/**
+ * arch_atomic_inc_and_test - increment and test
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static __always_inline bool arch_atomic_inc_and_test(atomic_t *v)
+{
+ GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", e);
+}
+#define arch_atomic_inc_and_test arch_atomic_inc_and_test
+
+/**
+ * arch_atomic_add_negative - add and test if negative
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v)
+{
+ GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", s);
+}
+#define arch_atomic_add_negative arch_atomic_add_negative
+
+/**
+ * arch_atomic_add_return - add integer and return
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v and returns @i + @v
+ */
+static __always_inline int arch_atomic_add_return(int i, atomic_t *v)
+{
+ return i + xadd(&v->counter, i);
+}
+
+/**
+ * arch_atomic_sub_return - subtract integer and return
+ * @v: pointer of type atomic_t
+ * @i: integer value to subtract
+ *
+ * Atomically subtracts @i from @v and returns @v - @i
+ */
+static __always_inline int arch_atomic_sub_return(int i, atomic_t *v)
+{
+ return arch_atomic_add_return(-i, v);
+}
+
+static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
+{
+ return xadd(&v->counter, i);
+}
+
+static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v)
+{
+ return xadd(&v->counter, -i);
+}
+
+static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new)
+{
+ return arch_cmpxchg(&v->counter, old, new);
+}
+
+#define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg
+static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
+{
+ return try_cmpxchg(&v->counter, old, new);
+}
+
+static inline int arch_atomic_xchg(atomic_t *v, int new)
+{
+ return arch_xchg(&v->counter, new);
+}
+
+static inline void arch_atomic_and(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "andl %1,%0"
+ : "+m" (v->counter)
+ : "ir" (i)
+ : "memory");
+}
+
+static inline int arch_atomic_fetch_and(int i, atomic_t *v)
+{
+ int val = arch_atomic_read(v);
+
+ do { } while (!arch_atomic_try_cmpxchg(v, &val, val & i));
+
+ return val;
+}
+
+static inline void arch_atomic_or(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "orl %1,%0"
+ : "+m" (v->counter)
+ : "ir" (i)
+ : "memory");
+}
+
+static inline int arch_atomic_fetch_or(int i, atomic_t *v)
+{
+ int val = arch_atomic_read(v);
+
+ do { } while (!arch_atomic_try_cmpxchg(v, &val, val | i));
+
+ return val;
+}
+
+static inline void arch_atomic_xor(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "xorl %1,%0"
+ : "+m" (v->counter)
+ : "ir" (i)
+ : "memory");
+}
+
+static inline int arch_atomic_fetch_xor(int i, atomic_t *v)
+{
+ int val = arch_atomic_read(v);
+
+ do { } while (!arch_atomic_try_cmpxchg(v, &val, val ^ i));
+
+ return val;
+}
+
+#ifdef CONFIG_X86_32
+# include <asm/atomic64_32.h>
+#else
+# include <asm/atomic64_64.h>
+#endif
+
+#include <asm-generic/atomic-instrumented.h>
+
+#endif /* _ASM_X86_ATOMIC_H */
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
new file mode 100644
index 000000000..6a5b0ec46
--- /dev/null
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -0,0 +1,335 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_ATOMIC64_32_H
+#define _ASM_X86_ATOMIC64_32_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+//#include <asm/cmpxchg.h>
+
+/* An 64bit atomic type */
+
+typedef struct {
+ u64 __aligned(8) counter;
+} atomic64_t;
+
+#define ATOMIC64_INIT(val) { (val) }
+
+#define __ATOMIC64_DECL(sym) void atomic64_##sym(atomic64_t *, ...)
+#ifndef ATOMIC64_EXPORT
+#define ATOMIC64_DECL_ONE __ATOMIC64_DECL
+#else
+#define ATOMIC64_DECL_ONE(sym) __ATOMIC64_DECL(sym); \
+ ATOMIC64_EXPORT(atomic64_##sym)
+#endif
+
+#ifdef CONFIG_X86_CMPXCHG64
+#define __alternative_atomic64(f, g, out, in...) \
+ asm volatile("call %P[func]" \
+ : out : [func] "i" (atomic64_##g##_cx8), ## in)
+
+#define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8)
+#else
+#define __alternative_atomic64(f, g, out, in...) \
+ alternative_call(atomic64_##f##_386, atomic64_##g##_cx8, \
+ X86_FEATURE_CX8, ASM_OUTPUT2(out), ## in)
+
+#define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8); \
+ ATOMIC64_DECL_ONE(sym##_386)
+
+ATOMIC64_DECL_ONE(add_386);
+ATOMIC64_DECL_ONE(sub_386);
+ATOMIC64_DECL_ONE(inc_386);
+ATOMIC64_DECL_ONE(dec_386);
+#endif
+
+#define alternative_atomic64(f, out, in...) \
+ __alternative_atomic64(f, f, ASM_OUTPUT2(out), ## in)
+
+ATOMIC64_DECL(read);
+ATOMIC64_DECL(set);
+ATOMIC64_DECL(xchg);
+ATOMIC64_DECL(add_return);
+ATOMIC64_DECL(sub_return);
+ATOMIC64_DECL(inc_return);
+ATOMIC64_DECL(dec_return);
+ATOMIC64_DECL(dec_if_positive);
+ATOMIC64_DECL(inc_not_zero);
+ATOMIC64_DECL(add_unless);
+
+#undef ATOMIC64_DECL
+#undef ATOMIC64_DECL_ONE
+#undef __ATOMIC64_DECL
+#undef ATOMIC64_EXPORT
+
+/**
+ * arch_atomic64_cmpxchg - cmpxchg atomic64 variable
+ * @v: pointer to type atomic64_t
+ * @o: expected value
+ * @n: new value
+ *
+ * Atomically sets @v to @n if it was equal to @o and returns
+ * the old value.
+ */
+
+static inline long long arch_atomic64_cmpxchg(atomic64_t *v, long long o,
+ long long n)
+{
+ return arch_cmpxchg64(&v->counter, o, n);
+}
+
+/**
+ * arch_atomic64_xchg - xchg atomic64 variable
+ * @v: pointer to type atomic64_t
+ * @n: value to assign
+ *
+ * Atomically xchgs the value of @v to @n and returns
+ * the old value.
+ */
+static inline long long arch_atomic64_xchg(atomic64_t *v, long long n)
+{
+ long long o;
+ unsigned high = (unsigned)(n >> 32);
+ unsigned low = (unsigned)n;
+ alternative_atomic64(xchg, "=&A" (o),
+ "S" (v), "b" (low), "c" (high)
+ : "memory");
+ return o;
+}
+
+/**
+ * arch_atomic64_set - set atomic64 variable
+ * @v: pointer to type atomic64_t
+ * @i: value to assign
+ *
+ * Atomically sets the value of @v to @n.
+ */
+static inline void arch_atomic64_set(atomic64_t *v, long long i)
+{
+ unsigned high = (unsigned)(i >> 32);
+ unsigned low = (unsigned)i;
+ alternative_atomic64(set, /* no output */,
+ "S" (v), "b" (low), "c" (high)
+ : "eax", "edx", "memory");
+}
+
+/**
+ * arch_atomic64_read - read atomic64 variable
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically reads the value of @v and returns it.
+ */
+static inline long long arch_atomic64_read(const atomic64_t *v)
+{
+ long long r;
+ alternative_atomic64(read, "=&A" (r), "c" (v) : "memory");
+ return r;
+}
+
+/**
+ * arch_atomic64_add_return - add and return
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically adds @i to @v and returns @i + *@v
+ */
+static inline long long arch_atomic64_add_return(long long i, atomic64_t *v)
+{
+ alternative_atomic64(add_return,
+ ASM_OUTPUT2("+A" (i), "+c" (v)),
+ ASM_NO_INPUT_CLOBBER("memory"));
+ return i;
+}
+
+/*
+ * Other variants with different arithmetic operators:
+ */
+static inline long long arch_atomic64_sub_return(long long i, atomic64_t *v)
+{
+ alternative_atomic64(sub_return,
+ ASM_OUTPUT2("+A" (i), "+c" (v)),
+ ASM_NO_INPUT_CLOBBER("memory"));
+ return i;
+}
+
+static inline long long arch_atomic64_inc_return(atomic64_t *v)
+{
+ long long a;
+ alternative_atomic64(inc_return, "=&A" (a),
+ "S" (v) : "memory", "ecx");
+ return a;
+}
+#define arch_atomic64_inc_return arch_atomic64_inc_return
+
+static inline long long arch_atomic64_dec_return(atomic64_t *v)
+{
+ long long a;
+ alternative_atomic64(dec_return, "=&A" (a),
+ "S" (v) : "memory", "ecx");
+ return a;
+}
+#define arch_atomic64_dec_return arch_atomic64_dec_return
+
+/**
+ * arch_atomic64_add - add integer to atomic64 variable
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically adds @i to @v.
+ */
+static inline long long arch_atomic64_add(long long i, atomic64_t *v)
+{
+ __alternative_atomic64(add, add_return,
+ ASM_OUTPUT2("+A" (i), "+c" (v)),
+ ASM_NO_INPUT_CLOBBER("memory"));
+ return i;
+}
+
+/**
+ * arch_atomic64_sub - subtract the atomic64 variable
+ * @i: integer value to subtract
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically subtracts @i from @v.
+ */
+static inline long long arch_atomic64_sub(long long i, atomic64_t *v)
+{
+ __alternative_atomic64(sub, sub_return,
+ ASM_OUTPUT2("+A" (i), "+c" (v)),
+ ASM_NO_INPUT_CLOBBER("memory"));
+ return i;
+}
+
+/**
+ * arch_atomic64_inc - increment atomic64 variable
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically increments @v by 1.
+ */
+static inline void arch_atomic64_inc(atomic64_t *v)
+{
+ __alternative_atomic64(inc, inc_return, /* no output */,
+ "S" (v) : "memory", "eax", "ecx", "edx");
+}
+#define arch_atomic64_inc arch_atomic64_inc
+
+/**
+ * arch_atomic64_dec - decrement atomic64 variable
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically decrements @v by 1.
+ */
+static inline void arch_atomic64_dec(atomic64_t *v)
+{
+ __alternative_atomic64(dec, dec_return, /* no output */,
+ "S" (v) : "memory", "eax", "ecx", "edx");
+}
+#define arch_atomic64_dec arch_atomic64_dec
+
+/**
+ * arch_atomic64_add_unless - add unless the number is a given value
+ * @v: pointer of type atomic64_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, so long as it was not @u.
+ * Returns non-zero if the add was done, zero otherwise.
+ */
+static inline int arch_atomic64_add_unless(atomic64_t *v, long long a,
+ long long u)
+{
+ unsigned low = (unsigned)u;
+ unsigned high = (unsigned)(u >> 32);
+ alternative_atomic64(add_unless,
+ ASM_OUTPUT2("+A" (a), "+c" (low), "+D" (high)),
+ "S" (v) : "memory");
+ return (int)a;
+}
+
+static inline int arch_atomic64_inc_not_zero(atomic64_t *v)
+{
+ int r;
+ alternative_atomic64(inc_not_zero, "=&a" (r),
+ "S" (v) : "ecx", "edx", "memory");
+ return r;
+}
+#define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero
+
+static inline long long arch_atomic64_dec_if_positive(atomic64_t *v)
+{
+ long long r;
+ alternative_atomic64(dec_if_positive, "=&A" (r),
+ "S" (v) : "ecx", "memory");
+ return r;
+}
+#define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive
+
+#undef alternative_atomic64
+#undef __alternative_atomic64
+
+static inline void arch_atomic64_and(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
+
+ while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
+ c = old;
+}
+
+static inline long long arch_atomic64_fetch_and(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
+
+ while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
+ c = old;
+
+ return old;
+}
+
+static inline void arch_atomic64_or(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
+
+ while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
+ c = old;
+}
+
+static inline long long arch_atomic64_fetch_or(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
+
+ while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
+ c = old;
+
+ return old;
+}
+
+static inline void arch_atomic64_xor(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
+
+ while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
+ c = old;
+}
+
+static inline long long arch_atomic64_fetch_xor(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
+
+ while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
+ c = old;
+
+ return old;
+}
+
+static inline long long arch_atomic64_fetch_add(long long i, atomic64_t *v)
+{
+ long long old, c = 0;
+
+ while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c)
+ c = old;
+
+ return old;
+}
+
+#define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), (v))
+
+#endif /* _ASM_X86_ATOMIC64_32_H */
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
new file mode 100644
index 000000000..55ca027f8
--- /dev/null
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -0,0 +1,245 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_ATOMIC64_64_H
+#define _ASM_X86_ATOMIC64_64_H
+
+#include <linux/types.h>
+#include <asm/alternative.h>
+#include <asm/cmpxchg.h>
+
+/* The 64-bit atomic type */
+
+#define ATOMIC64_INIT(i) { (i) }
+
+/**
+ * arch_atomic64_read - read atomic64 variable
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically reads the value of @v.
+ * Doesn't imply a read memory barrier.
+ */
+static inline long arch_atomic64_read(const atomic64_t *v)
+{
+ return READ_ONCE((v)->counter);
+}
+
+/**
+ * arch_atomic64_set - set atomic64 variable
+ * @v: pointer to type atomic64_t
+ * @i: required value
+ *
+ * Atomically sets the value of @v to @i.
+ */
+static inline void arch_atomic64_set(atomic64_t *v, long i)
+{
+ WRITE_ONCE(v->counter, i);
+}
+
+/**
+ * arch_atomic64_add - add integer to atomic64 variable
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically adds @i to @v.
+ */
+static __always_inline void arch_atomic64_add(long i, atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "addq %1,%0"
+ : "=m" (v->counter)
+ : "er" (i), "m" (v->counter) : "memory");
+}
+
+/**
+ * arch_atomic64_sub - subtract the atomic64 variable
+ * @i: integer value to subtract
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically subtracts @i from @v.
+ */
+static inline void arch_atomic64_sub(long i, atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "subq %1,%0"
+ : "=m" (v->counter)
+ : "er" (i), "m" (v->counter) : "memory");
+}
+
+/**
+ * arch_atomic64_sub_and_test - subtract value from variable and test result
+ * @i: integer value to subtract
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically subtracts @i from @v and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v)
+{
+ GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", e);
+}
+#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test
+
+/**
+ * arch_atomic64_inc - increment atomic64 variable
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically increments @v by 1.
+ */
+static __always_inline void arch_atomic64_inc(atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "incq %0"
+ : "=m" (v->counter)
+ : "m" (v->counter) : "memory");
+}
+#define arch_atomic64_inc arch_atomic64_inc
+
+/**
+ * arch_atomic64_dec - decrement atomic64 variable
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically decrements @v by 1.
+ */
+static __always_inline void arch_atomic64_dec(atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "decq %0"
+ : "=m" (v->counter)
+ : "m" (v->counter) : "memory");
+}
+#define arch_atomic64_dec arch_atomic64_dec
+
+/**
+ * arch_atomic64_dec_and_test - decrement and test
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically decrements @v by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline bool arch_atomic64_dec_and_test(atomic64_t *v)
+{
+ GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", e);
+}
+#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test
+
+/**
+ * arch_atomic64_inc_and_test - increment and test
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically increments @v by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline bool arch_atomic64_inc_and_test(atomic64_t *v)
+{
+ GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", e);
+}
+#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test
+
+/**
+ * arch_atomic64_add_negative - add and test if negative
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically adds @i to @v and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline bool arch_atomic64_add_negative(long i, atomic64_t *v)
+{
+ GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", s);
+}
+#define arch_atomic64_add_negative arch_atomic64_add_negative
+
+/**
+ * arch_atomic64_add_return - add and return
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
+ *
+ * Atomically adds @i to @v and returns @i + @v
+ */
+static __always_inline long arch_atomic64_add_return(long i, atomic64_t *v)
+{
+ return i + xadd(&v->counter, i);
+}
+
+static inline long arch_atomic64_sub_return(long i, atomic64_t *v)
+{
+ return arch_atomic64_add_return(-i, v);
+}
+
+static inline long arch_atomic64_fetch_add(long i, atomic64_t *v)
+{
+ return xadd(&v->counter, i);
+}
+
+static inline long arch_atomic64_fetch_sub(long i, atomic64_t *v)
+{
+ return xadd(&v->counter, -i);
+}
+
+static inline long arch_atomic64_cmpxchg(atomic64_t *v, long old, long new)
+{
+ return arch_cmpxchg(&v->counter, old, new);
+}
+
+#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg
+static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new)
+{
+ return try_cmpxchg(&v->counter, old, new);
+}
+
+static inline long arch_atomic64_xchg(atomic64_t *v, long new)
+{
+ return arch_xchg(&v->counter, new);
+}
+
+static inline void arch_atomic64_and(long i, atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "andq %1,%0"
+ : "+m" (v->counter)
+ : "er" (i)
+ : "memory");
+}
+
+static inline long arch_atomic64_fetch_and(long i, atomic64_t *v)
+{
+ s64 val = arch_atomic64_read(v);
+
+ do {
+ } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
+ return val;
+}
+
+static inline void arch_atomic64_or(long i, atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "orq %1,%0"
+ : "+m" (v->counter)
+ : "er" (i)
+ : "memory");
+}
+
+static inline long arch_atomic64_fetch_or(long i, atomic64_t *v)
+{
+ s64 val = arch_atomic64_read(v);
+
+ do {
+ } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
+ return val;
+}
+
+static inline void arch_atomic64_xor(long i, atomic64_t *v)
+{
+ asm volatile(LOCK_PREFIX "xorq %1,%0"
+ : "+m" (v->counter)
+ : "er" (i)
+ : "memory");
+}
+
+static inline long arch_atomic64_fetch_xor(long i, atomic64_t *v)
+{
+ s64 val = arch_atomic64_read(v);
+
+ do {
+ } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
+ return val;
+}
+
+#endif /* _ASM_X86_ATOMIC64_64_H */
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
new file mode 100644
index 000000000..dd22cffc6
--- /dev/null
+++ b/arch/x86/include/asm/barrier.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_BARRIER_H
+#define _ASM_X86_BARRIER_H
+
+#include <asm/alternative.h>
+#include <asm/nops.h>
+
+/*
+ * Force strict CPU ordering.
+ * And yes, this might be required on UP too when we're talking
+ * to devices.
+ */
+
+#ifdef CONFIG_X86_32
+#define mb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "mfence", \
+ X86_FEATURE_XMM2) ::: "memory", "cc")
+#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "lfence", \
+ X86_FEATURE_XMM2) ::: "memory", "cc")
+#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "sfence", \
+ X86_FEATURE_XMM2) ::: "memory", "cc")
+#else
+#define mb() asm volatile("mfence":::"memory")
+#define rmb() asm volatile("lfence":::"memory")
+#define wmb() asm volatile("sfence" ::: "memory")
+#endif
+
+/**
+ * array_index_mask_nospec() - generate a mask that is ~0UL when the
+ * bounds check succeeds and 0 otherwise
+ * @index: array element index
+ * @size: number of elements in array
+ *
+ * Returns:
+ * 0 - (index < size)
+ */
+static inline unsigned long array_index_mask_nospec(unsigned long index,
+ unsigned long size)
+{
+ unsigned long mask;
+
+ asm volatile ("cmp %1,%2; sbb %0,%0;"
+ :"=r" (mask)
+ :"g"(size),"r" (index)
+ :"cc");
+ return mask;
+}
+
+/* Override the default implementation from linux/nospec.h. */
+#define array_index_mask_nospec array_index_mask_nospec
+
+/* Prevent speculative execution past this barrier. */
+#define barrier_nospec() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \
+ "lfence", X86_FEATURE_LFENCE_RDTSC)
+
+#define dma_rmb() barrier()
+#define dma_wmb() barrier()
+
+#ifdef CONFIG_X86_32
+#define __smp_mb() asm volatile("lock; addl $0,-4(%%esp)" ::: "memory", "cc")
+#else
+#define __smp_mb() asm volatile("lock; addl $0,-4(%%rsp)" ::: "memory", "cc")
+#endif
+#define __smp_rmb() dma_rmb()
+#define __smp_wmb() barrier()
+#define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
+
+#define __smp_store_release(p, v) \
+do { \
+ compiletime_assert_atomic_type(*p); \
+ barrier(); \
+ WRITE_ONCE(*p, v); \
+} while (0)
+
+#define __smp_load_acquire(p) \
+({ \
+ typeof(*p) ___p1 = READ_ONCE(*p); \
+ compiletime_assert_atomic_type(*p); \
+ barrier(); \
+ ___p1; \
+})
+
+/* Atomic operations are already serializing on x86 */
+#define __smp_mb__before_atomic() do { } while (0)
+#define __smp_mb__after_atomic() do { } while (0)
+
+#include <asm-generic/barrier.h>
+
+/*
+ * Make previous memory operations globally visible before
+ * a WRMSR.
+ *
+ * MFENCE makes writes visible, but only affects load/store
+ * instructions. WRMSR is unfortunately not a load/store
+ * instruction and is unaffected by MFENCE. The LFENCE ensures
+ * that the WRMSR is not reordered.
+ *
+ * Most WRMSRs are full serializing instructions themselves and
+ * do not require this barrier. This is only required for the
+ * IA32_TSC_DEADLINE and X2APIC MSRs.
+ */
+static inline void weak_wrmsr_fence(void)
+{
+ asm volatile("mfence; lfence" : : : "memory");
+}
+
+#endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h
new file mode 100644
index 000000000..4d5a17e2f
--- /dev/null
+++ b/arch/x86/include/asm/bios_ebda.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_BIOS_EBDA_H
+#define _ASM_X86_BIOS_EBDA_H
+
+#include <asm/io.h>
+
+/*
+ * Returns physical address of EBDA. Returns 0 if there is no EBDA.
+ */
+static inline unsigned int get_bios_ebda(void)
+{
+ /*
+ * There is a real-mode segmented pointer pointing to the
+ * 4K EBDA area at 0x40E.
+ */
+ unsigned int address = *(unsigned short *)phys_to_virt(0x40E);
+ address <<= 4;
+ return address; /* 0 means none */
+}
+
+void reserve_bios_regions(void);
+
+#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
+/*
+ * This is obviously not a great place for this, but we want to be
+ * able to scatter it around anywhere in the kernel.
+ */
+void check_for_bios_corruption(void);
+void start_periodic_check_for_corruption(void);
+#else
+static inline void check_for_bios_corruption(void)
+{
+}
+
+static inline void start_periodic_check_for_corruption(void)
+{
+}
+#endif
+
+#endif /* _ASM_X86_BIOS_EBDA_H */
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
new file mode 100644
index 000000000..33611a74b
--- /dev/null
+++ b/arch/x86/include/asm/bitops.h
@@ -0,0 +1,515 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_BITOPS_H
+#define _ASM_X86_BITOPS_H
+
+/*
+ * Copyright 1992, Linus Torvalds.
+ *
+ * Note: inlines with more than a single statement should be marked
+ * __always_inline to avoid problems with older gcc's inlining heuristics.
+ */
+
+#ifndef _LINUX_BITOPS_H
+#error only <linux/bitops.h> can be included directly
+#endif
+
+#include <linux/compiler.h>
+#include <asm/alternative.h>
+#include <asm/rmwcc.h>
+#include <asm/barrier.h>
+
+#if BITS_PER_LONG == 32
+# define _BITOPS_LONG_SHIFT 5
+#elif BITS_PER_LONG == 64
+# define _BITOPS_LONG_SHIFT 6
+#else
+# error "Unexpected BITS_PER_LONG"
+#endif
+
+#define BIT_64(n) (U64_C(1) << (n))
+
+/*
+ * These have to be done with inline assembly: that way the bit-setting
+ * is guaranteed to be atomic. All bit operations return 0 if the bit
+ * was cleared before the operation and != 0 if it was not.
+ *
+ * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
+ */
+
+#define RLONG_ADDR(x) "m" (*(volatile long *) (x))
+#define WBYTE_ADDR(x) "+m" (*(volatile char *) (x))
+
+#define ADDR RLONG_ADDR(addr)
+
+/*
+ * We do the locked ops that don't return the old value as
+ * a mask operation on a byte.
+ */
+#define IS_IMMEDIATE(nr) (__builtin_constant_p(nr))
+#define CONST_MASK_ADDR(nr, addr) WBYTE_ADDR((void *)(addr) + ((nr)>>3))
+#define CONST_MASK(nr) (1 << ((nr) & 7))
+
+/**
+ * set_bit - Atomically set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * This function is atomic and may not be reordered. See __set_bit()
+ * if you do not require the atomic guarantees.
+ *
+ * Note: there are no guarantees that this function will not be reordered
+ * on non x86 architectures, so if you are writing portable code,
+ * make sure not to rely on its reordering guarantees.
+ *
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static __always_inline void
+set_bit(long nr, volatile unsigned long *addr)
+{
+ if (IS_IMMEDIATE(nr)) {
+ asm volatile(LOCK_PREFIX "orb %1,%0"
+ : CONST_MASK_ADDR(nr, addr)
+ : "iq" ((u8)CONST_MASK(nr))
+ : "memory");
+ } else {
+ asm volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0"
+ : : RLONG_ADDR(addr), "Ir" (nr) : "memory");
+ }
+}
+
+/**
+ * __set_bit - Set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * Unlike set_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static __always_inline void __set_bit(long nr, volatile unsigned long *addr)
+{
+ asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
+}
+
+/**
+ * clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * clear_bit() is atomic and may not be reordered. However, it does
+ * not contain a memory barrier, so if it is used for locking purposes,
+ * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic()
+ * in order to ensure changes are visible on other processors.
+ */
+static __always_inline void
+clear_bit(long nr, volatile unsigned long *addr)
+{
+ if (IS_IMMEDIATE(nr)) {
+ asm volatile(LOCK_PREFIX "andb %1,%0"
+ : CONST_MASK_ADDR(nr, addr)
+ : "iq" ((u8)~CONST_MASK(nr)));
+ } else {
+ asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0"
+ : : RLONG_ADDR(addr), "Ir" (nr) : "memory");
+ }
+}
+
+/*
+ * clear_bit_unlock - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * clear_bit() is atomic and implies release semantics before the memory
+ * operation. It can be used for an unlock.
+ */
+static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
+{
+ barrier();
+ clear_bit(nr, addr);
+}
+
+static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
+{
+ asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
+}
+
+static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr)
+{
+ bool negative;
+ asm volatile(LOCK_PREFIX "andb %2,%1"
+ CC_SET(s)
+ : CC_OUT(s) (negative), WBYTE_ADDR(addr)
+ : "ir" ((char) ~(1 << nr)) : "memory");
+ return negative;
+}
+
+// Let everybody know we have it
+#define clear_bit_unlock_is_negative_byte clear_bit_unlock_is_negative_byte
+
+/*
+ * __clear_bit_unlock - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * __clear_bit() is non-atomic and implies release semantics before the memory
+ * operation. It can be used for an unlock if no other CPUs can concurrently
+ * modify other bits in the word.
+ */
+static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
+{
+ __clear_bit(nr, addr);
+}
+
+/**
+ * __change_bit - Toggle a bit in memory
+ * @nr: the bit to change
+ * @addr: the address to start counting from
+ *
+ * Unlike change_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static __always_inline void __change_bit(long nr, volatile unsigned long *addr)
+{
+ asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory");
+}
+
+/**
+ * change_bit - Toggle a bit in memory
+ * @nr: Bit to change
+ * @addr: Address to start counting from
+ *
+ * change_bit() is atomic and may not be reordered.
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static __always_inline void change_bit(long nr, volatile unsigned long *addr)
+{
+ if (IS_IMMEDIATE(nr)) {
+ asm volatile(LOCK_PREFIX "xorb %1,%0"
+ : CONST_MASK_ADDR(nr, addr)
+ : "iq" ((u8)CONST_MASK(nr)));
+ } else {
+ asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0"
+ : : RLONG_ADDR(addr), "Ir" (nr) : "memory");
+ }
+}
+
+/**
+ * test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr)
+{
+ GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts),
+ *addr, "Ir", nr, "%0", c);
+}
+
+/**
+ * test_and_set_bit_lock - Set a bit and return its old value for lock
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This is the same as test_and_set_bit on x86.
+ */
+static __always_inline bool
+test_and_set_bit_lock(long nr, volatile unsigned long *addr)
+{
+ return test_and_set_bit(nr, addr);
+}
+
+/**
+ * __test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two examples of this operation race, one can appear to succeed
+ * but actually fail. You must protect multiple accesses with a lock.
+ */
+static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *addr)
+{
+ bool oldbit;
+
+ asm(__ASM_SIZE(bts) " %2,%1"
+ CC_SET(c)
+ : CC_OUT(c) (oldbit)
+ : ADDR, "Ir" (nr) : "memory");
+ return oldbit;
+}
+
+/**
+ * test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr)
+{
+ GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr),
+ *addr, "Ir", nr, "%0", c);
+}
+
+/**
+ * __test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is non-atomic and can be reordered.
+ * If two examples of this operation race, one can appear to succeed
+ * but actually fail. You must protect multiple accesses with a lock.
+ *
+ * Note: the operation is performed atomically with respect to
+ * the local CPU, but not other CPUs. Portable code should not
+ * rely on this behaviour.
+ * KVM relies on this behaviour on x86 for modifying memory that is also
+ * accessed from a hypervisor on the same CPU if running in a VM: don't change
+ * this without also updating arch/x86/kernel/kvm.c
+ */
+static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long *addr)
+{
+ bool oldbit;
+
+ asm volatile(__ASM_SIZE(btr) " %2,%1"
+ CC_SET(c)
+ : CC_OUT(c) (oldbit)
+ : ADDR, "Ir" (nr) : "memory");
+ return oldbit;
+}
+
+/* WARNING: non atomic and it can be reordered! */
+static __always_inline bool __test_and_change_bit(long nr, volatile unsigned long *addr)
+{
+ bool oldbit;
+
+ asm volatile(__ASM_SIZE(btc) " %2,%1"
+ CC_SET(c)
+ : CC_OUT(c) (oldbit)
+ : ADDR, "Ir" (nr) : "memory");
+
+ return oldbit;
+}
+
+/**
+ * test_and_change_bit - Change a bit and return its old value
+ * @nr: Bit to change
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr)
+{
+ GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc),
+ *addr, "Ir", nr, "%0", c);
+}
+
+static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
+{
+ return ((1UL << (nr & (BITS_PER_LONG-1))) &
+ (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
+}
+
+static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr)
+{
+ bool oldbit;
+
+ asm volatile(__ASM_SIZE(bt) " %2,%1"
+ CC_SET(c)
+ : CC_OUT(c) (oldbit)
+ : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
+
+ return oldbit;
+}
+
+#if 0 /* Fool kernel-doc since it doesn't do macros yet */
+/**
+ * test_bit - Determine whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static bool test_bit(int nr, const volatile unsigned long *addr);
+#endif
+
+#define test_bit(nr, addr) \
+ (__builtin_constant_p((nr)) \
+ ? constant_test_bit((nr), (addr)) \
+ : variable_test_bit((nr), (addr)))
+
+/**
+ * __ffs - find first set bit in word
+ * @word: The word to search
+ *
+ * Undefined if no bit exists, so code should check against 0 first.
+ */
+static __always_inline unsigned long __ffs(unsigned long word)
+{
+ asm("rep; bsf %1,%0"
+ : "=r" (word)
+ : "rm" (word));
+ return word;
+}
+
+/**
+ * ffz - find first zero bit in word
+ * @word: The word to search
+ *
+ * Undefined if no zero exists, so code should check against ~0UL first.
+ */
+static __always_inline unsigned long ffz(unsigned long word)
+{
+ asm("rep; bsf %1,%0"
+ : "=r" (word)
+ : "r" (~word));
+ return word;
+}
+
+/*
+ * __fls: find last set bit in word
+ * @word: The word to search
+ *
+ * Undefined if no set bit exists, so code should check against 0 first.
+ */
+static __always_inline unsigned long __fls(unsigned long word)
+{
+ asm("bsr %1,%0"
+ : "=r" (word)
+ : "rm" (word));
+ return word;
+}
+
+#undef ADDR
+
+#ifdef __KERNEL__
+/**
+ * ffs - find first set bit in word
+ * @x: the word to search
+ *
+ * This is defined the same way as the libc and compiler builtin ffs
+ * routines, therefore differs in spirit from the other bitops.
+ *
+ * ffs(value) returns 0 if value is 0 or the position of the first
+ * set bit if value is nonzero. The first (least significant) bit
+ * is at position 1.
+ */
+static __always_inline int ffs(int x)
+{
+ int r;
+
+#ifdef CONFIG_X86_64
+ /*
+ * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
+ * dest reg is undefined if x==0, but their CPU architect says its
+ * value is written to set it to the same as before, except that the
+ * top 32 bits will be cleared.
+ *
+ * We cannot do this on 32 bits because at the very least some
+ * 486 CPUs did not behave this way.
+ */
+ asm("bsfl %1,%0"
+ : "=r" (r)
+ : "rm" (x), "0" (-1));
+#elif defined(CONFIG_X86_CMOV)
+ asm("bsfl %1,%0\n\t"
+ "cmovzl %2,%0"
+ : "=&r" (r) : "rm" (x), "r" (-1));
+#else
+ asm("bsfl %1,%0\n\t"
+ "jnz 1f\n\t"
+ "movl $-1,%0\n"
+ "1:" : "=r" (r) : "rm" (x));
+#endif
+ return r + 1;
+}
+
+/**
+ * fls - find last set bit in word
+ * @x: the word to search
+ *
+ * This is defined in a similar way as the libc and compiler builtin
+ * ffs, but returns the position of the most significant set bit.
+ *
+ * fls(value) returns 0 if value is 0 or the position of the last
+ * set bit if value is nonzero. The last (most significant) bit is
+ * at position 32.
+ */
+static __always_inline int fls(int x)
+{
+ int r;
+
+#ifdef CONFIG_X86_64
+ /*
+ * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
+ * dest reg is undefined if x==0, but their CPU architect says its
+ * value is written to set it to the same as before, except that the
+ * top 32 bits will be cleared.
+ *
+ * We cannot do this on 32 bits because at the very least some
+ * 486 CPUs did not behave this way.
+ */
+ asm("bsrl %1,%0"
+ : "=r" (r)
+ : "rm" (x), "0" (-1));
+#elif defined(CONFIG_X86_CMOV)
+ asm("bsrl %1,%0\n\t"
+ "cmovzl %2,%0"
+ : "=&r" (r) : "rm" (x), "rm" (-1));
+#else
+ asm("bsrl %1,%0\n\t"
+ "jnz 1f\n\t"
+ "movl $-1,%0\n"
+ "1:" : "=r" (r) : "rm" (x));
+#endif
+ return r + 1;
+}
+
+/**
+ * fls64 - find last set bit in a 64-bit word
+ * @x: the word to search
+ *
+ * This is defined in a similar way as the libc and compiler builtin
+ * ffsll, but returns the position of the most significant set bit.
+ *
+ * fls64(value) returns 0 if value is 0 or the position of the last
+ * set bit if value is nonzero. The last (most significant) bit is
+ * at position 64.
+ */
+#ifdef CONFIG_X86_64
+static __always_inline int fls64(__u64 x)
+{
+ int bitpos = -1;
+ /*
+ * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
+ * dest reg is undefined if x==0, but their CPU architect says its
+ * value is written to set it to the same as before.
+ */
+ asm("bsrq %1,%q0"
+ : "+r" (bitpos)
+ : "rm" (x));
+ return bitpos + 1;
+}
+#else
+#include <asm-generic/bitops/fls64.h>
+#endif
+
+#include <asm-generic/bitops/find.h>
+
+#include <asm-generic/bitops/sched.h>
+
+#include <asm/arch_hweight.h>
+
+#include <asm-generic/bitops/const_hweight.h>
+
+#include <asm-generic/bitops/le.h>
+
+#include <asm-generic/bitops/ext2-atomic-setbit.h>
+
+#endif /* __KERNEL__ */
+#endif /* _ASM_X86_BITOPS_H */
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
new file mode 100644
index 000000000..680c32036
--- /dev/null
+++ b/arch/x86/include/asm/boot.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_BOOT_H
+#define _ASM_X86_BOOT_H
+
+
+#include <asm/pgtable_types.h>
+#include <uapi/asm/boot.h>
+
+/* Physical address where kernel should be loaded. */
+#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
+ + (CONFIG_PHYSICAL_ALIGN - 1)) \
+ & ~(CONFIG_PHYSICAL_ALIGN - 1))
+
+/* Minimum kernel alignment, as a power of two */
+#ifdef CONFIG_X86_64
+# define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT
+#else
+# define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_SIZE_ORDER)
+#endif
+#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
+
+#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \
+ (CONFIG_PHYSICAL_ALIGN < MIN_KERNEL_ALIGN)
+# error "Invalid value for CONFIG_PHYSICAL_ALIGN"
+#endif
+
+#ifdef CONFIG_KERNEL_BZIP2
+# define BOOT_HEAP_SIZE 0x400000
+#else /* !CONFIG_KERNEL_BZIP2 */
+# define BOOT_HEAP_SIZE 0x10000
+#endif
+
+#ifdef CONFIG_X86_64
+# define BOOT_STACK_SIZE 0x4000
+
+# define BOOT_INIT_PGT_SIZE (6*4096)
+# ifdef CONFIG_RANDOMIZE_BASE
+/*
+ * Assuming all cross the 512GB boundary:
+ * 1 page for level4
+ * (2+2)*4 pages for kernel, param, cmd_line, and randomized kernel
+ * 2 pages for first 2M (video RAM: CONFIG_X86_VERBOSE_BOOTUP).
+ * Total is 19 pages.
+ */
+# ifdef CONFIG_X86_VERBOSE_BOOTUP
+# define BOOT_PGT_SIZE (19*4096)
+# else /* !CONFIG_X86_VERBOSE_BOOTUP */
+# define BOOT_PGT_SIZE (17*4096)
+# endif
+# else /* !CONFIG_RANDOMIZE_BASE */
+# define BOOT_PGT_SIZE BOOT_INIT_PGT_SIZE
+# endif
+
+#else /* !CONFIG_X86_64 */
+# define BOOT_STACK_SIZE 0x1000
+#endif
+
+#endif /* _ASM_X86_BOOT_H */
diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h
new file mode 100644
index 000000000..8fa49cf12
--- /dev/null
+++ b/arch/x86/include/asm/bootparam_utils.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_BOOTPARAM_UTILS_H
+#define _ASM_X86_BOOTPARAM_UTILS_H
+
+#include <asm/bootparam.h>
+
+/*
+ * This file is included from multiple environments. Do not
+ * add completing #includes to make it standalone.
+ */
+
+/*
+ * Deal with bootloaders which fail to initialize unknown fields in
+ * boot_params to zero. The list fields in this list are taken from
+ * analysis of kexec-tools; if other broken bootloaders initialize a
+ * different set of fields we will need to figure out how to disambiguate.
+ *
+ * Note: efi_info is commonly left uninitialized, but that field has a
+ * private magic, so it is better to leave it unchanged.
+ */
+
+#define sizeof_mbr(type, member) ({ sizeof(((type *)0)->member); })
+
+#define BOOT_PARAM_PRESERVE(struct_member) \
+ { \
+ .start = offsetof(struct boot_params, struct_member), \
+ .len = sizeof_mbr(struct boot_params, struct_member), \
+ }
+
+struct boot_params_to_save {
+ unsigned int start;
+ unsigned int len;
+};
+
+static void sanitize_boot_params(struct boot_params *boot_params)
+{
+ /*
+ * IMPORTANT NOTE TO BOOTLOADER AUTHORS: do not simply clear
+ * this field. The purpose of this field is to guarantee
+ * compliance with the x86 boot spec located in
+ * Documentation/x86/boot.txt . That spec says that the
+ * *whole* structure should be cleared, after which only the
+ * portion defined by struct setup_header (boot_params->hdr)
+ * should be copied in.
+ *
+ * If you're having an issue because the sentinel is set, you
+ * need to change the whole structure to be cleared, not this
+ * (or any other) individual field, or you will soon have
+ * problems again.
+ */
+ if (boot_params->sentinel) {
+ /* fields in boot_params are left uninitialized, clear them */
+ static struct boot_params scratch;
+ char *bp_base = (char *)boot_params;
+ char *save_base = (char *)&scratch;
+ int i;
+
+ const struct boot_params_to_save to_save[] = {
+ BOOT_PARAM_PRESERVE(screen_info),
+ BOOT_PARAM_PRESERVE(apm_bios_info),
+ BOOT_PARAM_PRESERVE(tboot_addr),
+ BOOT_PARAM_PRESERVE(ist_info),
+ BOOT_PARAM_PRESERVE(hd0_info),
+ BOOT_PARAM_PRESERVE(hd1_info),
+ BOOT_PARAM_PRESERVE(sys_desc_table),
+ BOOT_PARAM_PRESERVE(olpc_ofw_header),
+ BOOT_PARAM_PRESERVE(efi_info),
+ BOOT_PARAM_PRESERVE(alt_mem_k),
+ BOOT_PARAM_PRESERVE(scratch),
+ BOOT_PARAM_PRESERVE(e820_entries),
+ BOOT_PARAM_PRESERVE(eddbuf_entries),
+ BOOT_PARAM_PRESERVE(edd_mbr_sig_buf_entries),
+ BOOT_PARAM_PRESERVE(edd_mbr_sig_buffer),
+ BOOT_PARAM_PRESERVE(secure_boot),
+ BOOT_PARAM_PRESERVE(hdr),
+ BOOT_PARAM_PRESERVE(e820_table),
+ BOOT_PARAM_PRESERVE(eddbuf),
+ };
+
+ memset(&scratch, 0, sizeof(scratch));
+
+ for (i = 0; i < ARRAY_SIZE(to_save); i++) {
+ memcpy(save_base + to_save[i].start,
+ bp_base + to_save[i].start, to_save[i].len);
+ }
+
+ memcpy(boot_params, save_base, sizeof(*boot_params));
+ }
+}
+
+#endif /* _ASM_X86_BOOTPARAM_UTILS_H */
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
new file mode 100644
index 000000000..6804d6642
--- /dev/null
+++ b/arch/x86/include/asm/bug.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_BUG_H
+#define _ASM_X86_BUG_H
+
+#include <linux/stringify.h>
+
+/*
+ * Despite that some emulators terminate on UD2, we use it for WARN().
+ *
+ * Since various instruction decoders/specs disagree on the encoding of
+ * UD0/UD1.
+ */
+
+#define ASM_UD0 ".byte 0x0f, 0xff" /* + ModRM (for Intel) */
+#define ASM_UD1 ".byte 0x0f, 0xb9" /* + ModRM */
+#define ASM_UD2 ".byte 0x0f, 0x0b"
+
+#define INSN_UD0 0xff0f
+#define INSN_UD2 0x0b0f
+
+#define LEN_UD2 2
+
+#ifdef CONFIG_GENERIC_BUG
+
+#ifdef CONFIG_X86_32
+# define __BUG_REL(val) ".long " __stringify(val)
+#else
+# define __BUG_REL(val) ".long " __stringify(val) " - 2b"
+#endif
+
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+
+#define _BUG_FLAGS(ins, flags) \
+do { \
+ asm volatile("1:\t" ins "\n" \
+ ".pushsection __bug_table,\"aw\"\n" \
+ "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \
+ "\t" __BUG_REL(%c0) "\t# bug_entry::file\n" \
+ "\t.word %c1" "\t# bug_entry::line\n" \
+ "\t.word %c2" "\t# bug_entry::flags\n" \
+ "\t.org 2b+%c3\n" \
+ ".popsection" \
+ : : "i" (__FILE__), "i" (__LINE__), \
+ "i" (flags), \
+ "i" (sizeof(struct bug_entry))); \
+} while (0)
+
+#else /* !CONFIG_DEBUG_BUGVERBOSE */
+
+#define _BUG_FLAGS(ins, flags) \
+do { \
+ asm volatile("1:\t" ins "\n" \
+ ".pushsection __bug_table,\"aw\"\n" \
+ "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \
+ "\t.word %c0" "\t# bug_entry::flags\n" \
+ "\t.org 2b+%c1\n" \
+ ".popsection" \
+ : : "i" (flags), \
+ "i" (sizeof(struct bug_entry))); \
+} while (0)
+
+#endif /* CONFIG_DEBUG_BUGVERBOSE */
+
+#else
+
+#define _BUG_FLAGS(ins, flags) asm volatile(ins)
+
+#endif /* CONFIG_GENERIC_BUG */
+
+#define HAVE_ARCH_BUG
+#define BUG() \
+do { \
+ _BUG_FLAGS(ASM_UD2, 0); \
+ unreachable(); \
+} while (0)
+
+#define __WARN_FLAGS(flags) \
+do { \
+ _BUG_FLAGS(ASM_UD2, BUGFLAG_WARNING|(flags)); \
+ annotate_reachable(); \
+} while (0)
+
+#include <asm-generic/bug.h>
+
+#endif /* _ASM_X86_BUG_H */
diff --git a/arch/x86/include/asm/bugs.h b/arch/x86/include/asm/bugs.h
new file mode 100644
index 000000000..542509b53
--- /dev/null
+++ b/arch/x86/include/asm/bugs.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_BUGS_H
+#define _ASM_X86_BUGS_H
+
+#include <asm/processor.h>
+
+extern void check_bugs(void);
+
+#if defined(CONFIG_CPU_SUP_INTEL)
+void check_mpx_erratum(struct cpuinfo_x86 *c);
+#else
+static inline void check_mpx_erratum(struct cpuinfo_x86 *c) {}
+#endif
+
+#if defined(CONFIG_CPU_SUP_INTEL) && defined(CONFIG_X86_32)
+int ppro_with_ram_bug(void);
+#else
+static inline int ppro_with_ram_bug(void) { return 0; }
+#endif
+
+#endif /* _ASM_X86_BUGS_H */
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
new file mode 100644
index 000000000..abe08690a
--- /dev/null
+++ b/arch/x86/include/asm/cache.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CACHE_H
+#define _ASM_X86_CACHE_H
+
+#include <linux/linkage.h>
+
+/* L1 cache line size */
+#define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT)
+#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
+
+#define __read_mostly __attribute__((__section__(".data..read_mostly")))
+
+#define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT
+#define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT)
+
+#ifdef CONFIG_X86_VSMP
+#ifdef CONFIG_SMP
+#define __cacheline_aligned_in_smp \
+ __attribute__((__aligned__(INTERNODE_CACHE_BYTES))) \
+ __page_aligned_data
+#endif
+#endif
+
+#endif /* _ASM_X86_CACHE_H */
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
new file mode 100644
index 000000000..63feaf2a5
--- /dev/null
+++ b/arch/x86/include/asm/cacheflush.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CACHEFLUSH_H
+#define _ASM_X86_CACHEFLUSH_H
+
+/* Caches aren't brain-dead on the intel. */
+#include <asm-generic/cacheflush.h>
+#include <asm/special_insns.h>
+
+void clflush_cache_range(void *addr, unsigned int size);
+
+#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h
new file mode 100644
index 000000000..e958e28f7
--- /dev/null
+++ b/arch/x86/include/asm/cacheinfo.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CACHEINFO_H
+#define _ASM_X86_CACHEINFO_H
+
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id);
+
+#endif /* _ASM_X86_CACHEINFO_H */
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
new file mode 100644
index 000000000..a8303ebe0
--- /dev/null
+++ b/arch/x86/include/asm/calgary.h
@@ -0,0 +1,70 @@
+/*
+ * Derived from include/asm-powerpc/iommu.h
+ *
+ * Copyright IBM Corporation, 2006-2007
+ *
+ * Author: Jon Mason <jdmason@us.ibm.com>
+ * Author: Muli Ben-Yehuda <muli@il.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _ASM_X86_CALGARY_H
+#define _ASM_X86_CALGARY_H
+
+#include <linux/spinlock.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/timer.h>
+#include <asm/types.h>
+
+struct iommu_table {
+ const struct cal_chipset_ops *chip_ops; /* chipset specific funcs */
+ unsigned long it_base; /* mapped address of tce table */
+ unsigned long it_hint; /* Hint for next alloc */
+ unsigned long *it_map; /* A simple allocation bitmap for now */
+ void __iomem *bbar; /* Bridge BAR */
+ u64 tar_val; /* Table Address Register */
+ struct timer_list watchdog_timer;
+ spinlock_t it_lock; /* Protects it_map */
+ unsigned int it_size; /* Size of iommu table in entries */
+ unsigned char it_busno; /* Bus number this table belongs to */
+};
+
+struct cal_chipset_ops {
+ void (*handle_quirks)(struct iommu_table *tbl, struct pci_dev *dev);
+ void (*tce_cache_blast)(struct iommu_table *tbl);
+ void (*dump_error_regs)(struct iommu_table *tbl);
+};
+
+#define TCE_TABLE_SIZE_UNSPECIFIED ~0
+#define TCE_TABLE_SIZE_64K 0
+#define TCE_TABLE_SIZE_128K 1
+#define TCE_TABLE_SIZE_256K 2
+#define TCE_TABLE_SIZE_512K 3
+#define TCE_TABLE_SIZE_1M 4
+#define TCE_TABLE_SIZE_2M 5
+#define TCE_TABLE_SIZE_4M 6
+#define TCE_TABLE_SIZE_8M 7
+
+extern int use_calgary;
+
+#ifdef CONFIG_CALGARY_IOMMU
+extern int detect_calgary(void);
+#else
+static inline int detect_calgary(void) { return -ENODEV; }
+#endif
+
+#endif /* _ASM_X86_CALGARY_H */
diff --git a/arch/x86/include/asm/ce4100.h b/arch/x86/include/asm/ce4100.h
new file mode 100644
index 000000000..2930f560d
--- /dev/null
+++ b/arch/x86/include/asm/ce4100.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_CE4100_H_
+#define _ASM_CE4100_H_
+
+int ce4100_pci_init(void);
+
+#endif
diff --git a/arch/x86/include/asm/checksum.h b/arch/x86/include/asm/checksum.h
new file mode 100644
index 000000000..d79d1e622
--- /dev/null
+++ b/arch/x86/include/asm/checksum.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifdef CONFIG_X86_32
+# include <asm/checksum_32.h>
+#else
+# include <asm/checksum_64.h>
+#endif
diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h
new file mode 100644
index 000000000..7a659c74c
--- /dev/null
+++ b/arch/x86/include/asm/checksum_32.h
@@ -0,0 +1,199 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CHECKSUM_32_H
+#define _ASM_X86_CHECKSUM_32_H
+
+#include <linux/in6.h>
+#include <linux/uaccess.h>
+
+/*
+ * computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * this function must be called with even lengths, except
+ * for the last fragment, which may be odd
+ *
+ * it's best to have buff aligned on a 32-bit boundary
+ */
+asmlinkage __wsum csum_partial(const void *buff, int len, __wsum sum);
+
+/*
+ * the same as csum_partial, but copies from src while it
+ * checksums, and handles user-space pointer exceptions correctly, when needed.
+ *
+ * here even more important to align src and dst on a 32-bit (or even
+ * better 64-bit) boundary
+ */
+
+asmlinkage __wsum csum_partial_copy_generic(const void *src, void *dst,
+ int len, __wsum sum,
+ int *src_err_ptr, int *dst_err_ptr);
+
+/*
+ * Note: when you get a NULL pointer exception here this means someone
+ * passed in an incorrect kernel address to one of these functions.
+ *
+ * If you use these functions directly please don't forget the
+ * access_ok().
+ */
+static inline __wsum csum_partial_copy_nocheck(const void *src, void *dst,
+ int len, __wsum sum)
+{
+ return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL);
+}
+
+static inline __wsum csum_partial_copy_from_user(const void __user *src,
+ void *dst,
+ int len, __wsum sum,
+ int *err_ptr)
+{
+ __wsum ret;
+
+ might_sleep();
+ stac();
+ ret = csum_partial_copy_generic((__force void *)src, dst,
+ len, sum, err_ptr, NULL);
+ clac();
+
+ return ret;
+}
+
+/*
+ * This is a version of ip_compute_csum() optimized for IP headers,
+ * which always checksum on 4 octet boundaries.
+ *
+ * By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by
+ * Arnt Gulbrandsen.
+ */
+static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
+{
+ unsigned int sum;
+
+ asm volatile("movl (%1), %0 ;\n"
+ "subl $4, %2 ;\n"
+ "jbe 2f ;\n"
+ "addl 4(%1), %0 ;\n"
+ "adcl 8(%1), %0 ;\n"
+ "adcl 12(%1), %0;\n"
+ "1: adcl 16(%1), %0 ;\n"
+ "lea 4(%1), %1 ;\n"
+ "decl %2 ;\n"
+ "jne 1b ;\n"
+ "adcl $0, %0 ;\n"
+ "movl %0, %2 ;\n"
+ "shrl $16, %0 ;\n"
+ "addw %w2, %w0 ;\n"
+ "adcl $0, %0 ;\n"
+ "notl %0 ;\n"
+ "2: ;\n"
+ /* Since the input registers which are loaded with iph and ihl
+ are modified, we must also specify them as outputs, or gcc
+ will assume they contain their original values. */
+ : "=r" (sum), "=r" (iph), "=r" (ihl)
+ : "1" (iph), "2" (ihl)
+ : "memory");
+ return (__force __sum16)sum;
+}
+
+/*
+ * Fold a partial checksum
+ */
+
+static inline __sum16 csum_fold(__wsum sum)
+{
+ asm("addl %1, %0 ;\n"
+ "adcl $0xffff, %0 ;\n"
+ : "=r" (sum)
+ : "r" ((__force u32)sum << 16),
+ "0" ((__force u32)sum & 0xffff0000));
+ return (__force __sum16)(~(__force u32)sum >> 16);
+}
+
+static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
+ __u32 len, __u8 proto,
+ __wsum sum)
+{
+ asm("addl %1, %0 ;\n"
+ "adcl %2, %0 ;\n"
+ "adcl %3, %0 ;\n"
+ "adcl $0, %0 ;\n"
+ : "=r" (sum)
+ : "g" (daddr), "g"(saddr),
+ "g" ((len + proto) << 8), "0" (sum));
+ return sum;
+}
+
+/*
+ * computes the checksum of the TCP/UDP pseudo-header
+ * returns a 16-bit checksum, already complemented
+ */
+static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
+ __u32 len, __u8 proto,
+ __wsum sum)
+{
+ return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
+}
+
+/*
+ * this routine is used for miscellaneous IP-like checksums, mainly
+ * in icmp.c
+ */
+
+static inline __sum16 ip_compute_csum(const void *buff, int len)
+{
+ return csum_fold(csum_partial(buff, len, 0));
+}
+
+#define _HAVE_ARCH_IPV6_CSUM
+static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __u32 len, __u8 proto, __wsum sum)
+{
+ asm("addl 0(%1), %0 ;\n"
+ "adcl 4(%1), %0 ;\n"
+ "adcl 8(%1), %0 ;\n"
+ "adcl 12(%1), %0 ;\n"
+ "adcl 0(%2), %0 ;\n"
+ "adcl 4(%2), %0 ;\n"
+ "adcl 8(%2), %0 ;\n"
+ "adcl 12(%2), %0 ;\n"
+ "adcl %3, %0 ;\n"
+ "adcl %4, %0 ;\n"
+ "adcl $0, %0 ;\n"
+ : "=&r" (sum)
+ : "r" (saddr), "r" (daddr),
+ "r" (htonl(len)), "r" (htonl(proto)), "0" (sum)
+ : "memory");
+
+ return csum_fold(sum);
+}
+
+/*
+ * Copy and checksum to user
+ */
+#define HAVE_CSUM_COPY_USER
+static inline __wsum csum_and_copy_to_user(const void *src,
+ void __user *dst,
+ int len, __wsum sum,
+ int *err_ptr)
+{
+ __wsum ret;
+
+ might_sleep();
+ if (access_ok(VERIFY_WRITE, dst, len)) {
+ stac();
+ ret = csum_partial_copy_generic(src, (__force void *)dst,
+ len, sum, NULL, err_ptr);
+ clac();
+ return ret;
+ }
+
+ if (len)
+ *err_ptr = -EFAULT;
+
+ return (__force __wsum)-1; /* invalid checksum */
+}
+
+#endif /* _ASM_X86_CHECKSUM_32_H */
diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
new file mode 100644
index 000000000..3ec6d3267
--- /dev/null
+++ b/arch/x86/include/asm/checksum_64.h
@@ -0,0 +1,199 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CHECKSUM_64_H
+#define _ASM_X86_CHECKSUM_64_H
+
+/*
+ * Checksums for x86-64
+ * Copyright 2002 by Andi Kleen, SuSE Labs
+ * with some code from asm-x86/checksum.h
+ */
+
+#include <linux/compiler.h>
+#include <linux/uaccess.h>
+#include <asm/byteorder.h>
+
+/**
+ * csum_fold - Fold and invert a 32bit checksum.
+ * sum: 32bit unfolded sum
+ *
+ * Fold a 32bit running checksum to 16bit and invert it. This is usually
+ * the last step before putting a checksum into a packet.
+ * Make sure not to mix with 64bit checksums.
+ */
+static inline __sum16 csum_fold(__wsum sum)
+{
+ asm(" addl %1,%0\n"
+ " adcl $0xffff,%0"
+ : "=r" (sum)
+ : "r" ((__force u32)sum << 16),
+ "0" ((__force u32)sum & 0xffff0000));
+ return (__force __sum16)(~(__force u32)sum >> 16);
+}
+
+/*
+ * This is a version of ip_compute_csum() optimized for IP headers,
+ * which always checksum on 4 octet boundaries.
+ *
+ * By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by
+ * Arnt Gulbrandsen.
+ */
+
+/**
+ * ip_fast_csum - Compute the IPv4 header checksum efficiently.
+ * iph: ipv4 header
+ * ihl: length of header / 4
+ */
+static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
+{
+ unsigned int sum;
+
+ asm(" movl (%1), %0\n"
+ " subl $4, %2\n"
+ " jbe 2f\n"
+ " addl 4(%1), %0\n"
+ " adcl 8(%1), %0\n"
+ " adcl 12(%1), %0\n"
+ "1: adcl 16(%1), %0\n"
+ " lea 4(%1), %1\n"
+ " decl %2\n"
+ " jne 1b\n"
+ " adcl $0, %0\n"
+ " movl %0, %2\n"
+ " shrl $16, %0\n"
+ " addw %w2, %w0\n"
+ " adcl $0, %0\n"
+ " notl %0\n"
+ "2:"
+ /* Since the input registers which are loaded with iph and ihl
+ are modified, we must also specify them as outputs, or gcc
+ will assume they contain their original values. */
+ : "=r" (sum), "=r" (iph), "=r" (ihl)
+ : "1" (iph), "2" (ihl)
+ : "memory");
+ return (__force __sum16)sum;
+}
+
+/**
+ * csum_tcpup_nofold - Compute an IPv4 pseudo header checksum.
+ * @saddr: source address
+ * @daddr: destination address
+ * @len: length of packet
+ * @proto: ip protocol of packet
+ * @sum: initial sum to be added in (32bit unfolded)
+ *
+ * Returns the pseudo header checksum the input data. Result is
+ * 32bit unfolded.
+ */
+static inline __wsum
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+ __u8 proto, __wsum sum)
+{
+ asm(" addl %1, %0\n"
+ " adcl %2, %0\n"
+ " adcl %3, %0\n"
+ " adcl $0, %0\n"
+ : "=r" (sum)
+ : "g" (daddr), "g" (saddr),
+ "g" ((len + proto)<<8), "0" (sum));
+ return sum;
+}
+
+
+/**
+ * csum_tcpup_magic - Compute an IPv4 pseudo header checksum.
+ * @saddr: source address
+ * @daddr: destination address
+ * @len: length of packet
+ * @proto: ip protocol of packet
+ * @sum: initial sum to be added in (32bit unfolded)
+ *
+ * Returns the 16bit pseudo header checksum the input data already
+ * complemented and ready to be filled in.
+ */
+static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
+ __u32 len, __u8 proto,
+ __wsum sum)
+{
+ return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
+}
+
+/**
+ * csum_partial - Compute an internet checksum.
+ * @buff: buffer to be checksummed
+ * @len: length of buffer.
+ * @sum: initial sum to be added in (32bit unfolded)
+ *
+ * Returns the 32bit unfolded internet checksum of the buffer.
+ * Before filling it in it needs to be csum_fold()'ed.
+ * buff should be aligned to a 64bit boundary if possible.
+ */
+extern __wsum csum_partial(const void *buff, int len, __wsum sum);
+
+#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1
+#define HAVE_CSUM_COPY_USER 1
+
+
+/* Do not call this directly. Use the wrappers below */
+extern __visible __wsum csum_partial_copy_generic(const void *src, const void *dst,
+ int len, __wsum sum,
+ int *src_err_ptr, int *dst_err_ptr);
+
+
+extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
+ int len, __wsum isum, int *errp);
+extern __wsum csum_partial_copy_to_user(const void *src, void __user *dst,
+ int len, __wsum isum, int *errp);
+extern __wsum csum_partial_copy_nocheck(const void *src, void *dst,
+ int len, __wsum sum);
+
+/* Old names. To be removed. */
+#define csum_and_copy_to_user csum_partial_copy_to_user
+#define csum_and_copy_from_user csum_partial_copy_from_user
+
+/**
+ * ip_compute_csum - Compute an 16bit IP checksum.
+ * @buff: buffer address.
+ * @len: length of buffer.
+ *
+ * Returns the 16bit folded/inverted checksum of the passed buffer.
+ * Ready to fill in.
+ */
+extern __sum16 ip_compute_csum(const void *buff, int len);
+
+/**
+ * csum_ipv6_magic - Compute checksum of an IPv6 pseudo header.
+ * @saddr: source address
+ * @daddr: destination address
+ * @len: length of packet
+ * @proto: protocol of packet
+ * @sum: initial sum (32bit unfolded) to be added in
+ *
+ * Computes an IPv6 pseudo header checksum. This sum is added the checksum
+ * into UDP/TCP packets and contains some link layer information.
+ * Returns the unfolded 32bit checksum.
+ */
+
+struct in6_addr;
+
+#define _HAVE_ARCH_IPV6_CSUM 1
+extern __sum16
+csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
+ __u32 len, __u8 proto, __wsum sum);
+
+static inline unsigned add32_with_carry(unsigned a, unsigned b)
+{
+ asm("addl %2,%0\n\t"
+ "adcl $0,%0"
+ : "=r" (a)
+ : "0" (a), "rm" (b));
+ return a;
+}
+
+#define HAVE_ARCH_CSUM_ADD
+static inline __wsum csum_add(__wsum csum, __wsum addend)
+{
+ return (__force __wsum)add32_with_carry((__force unsigned)csum,
+ (__force unsigned)addend);
+}
+
+#endif /* _ASM_X86_CHECKSUM_64_H */
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
new file mode 100644
index 000000000..dc4cfc888
--- /dev/null
+++ b/arch/x86/include/asm/clocksource.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* x86-specific clocksource additions */
+
+#ifndef _ASM_X86_CLOCKSOURCE_H
+#define _ASM_X86_CLOCKSOURCE_H
+
+#define VCLOCK_NONE 0 /* No vDSO clock available. */
+#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */
+#define VCLOCK_PVCLOCK 2 /* vDSO should use vread_pvclock. */
+#define VCLOCK_HVCLOCK 3 /* vDSO should use vread_hvclock. */
+#define VCLOCK_MAX 3
+
+struct arch_clocksource_data {
+ int vclock_mode;
+};
+
+#endif /* _ASM_X86_CLOCKSOURCE_H */
diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
new file mode 100644
index 000000000..6faaf27e8
--- /dev/null
+++ b/arch/x86/include/asm/cmdline.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CMDLINE_H
+#define _ASM_X86_CMDLINE_H
+
+int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
+int cmdline_find_option(const char *cmdline_ptr, const char *option,
+ char *buffer, int bufsize);
+
+#endif /* _ASM_X86_CMDLINE_H */
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
new file mode 100644
index 000000000..a55d79b23
--- /dev/null
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -0,0 +1,259 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_X86_CMPXCHG_H
+#define ASM_X86_CMPXCHG_H
+
+#include <linux/compiler.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative.h> /* Provides LOCK_PREFIX */
+
+/*
+ * Non-existant functions to indicate usage errors at link time
+ * (or compile-time if the compiler implements __compiletime_error().
+ */
+extern void __xchg_wrong_size(void)
+ __compiletime_error("Bad argument size for xchg");
+extern void __cmpxchg_wrong_size(void)
+ __compiletime_error("Bad argument size for cmpxchg");
+extern void __xadd_wrong_size(void)
+ __compiletime_error("Bad argument size for xadd");
+extern void __add_wrong_size(void)
+ __compiletime_error("Bad argument size for add");
+
+/*
+ * Constants for operation sizes. On 32-bit, the 64-bit size it set to
+ * -1 because sizeof will never return -1, thereby making those switch
+ * case statements guaranteeed dead code which the compiler will
+ * eliminate, and allowing the "missing symbol in the default case" to
+ * indicate a usage error.
+ */
+#define __X86_CASE_B 1
+#define __X86_CASE_W 2
+#define __X86_CASE_L 4
+#ifdef CONFIG_64BIT
+#define __X86_CASE_Q 8
+#else
+#define __X86_CASE_Q -1 /* sizeof will never return -1 */
+#endif
+
+/*
+ * An exchange-type operation, which takes a value and a pointer, and
+ * returns the old value.
+ */
+#define __xchg_op(ptr, arg, op, lock) \
+ ({ \
+ __typeof__ (*(ptr)) __ret = (arg); \
+ switch (sizeof(*(ptr))) { \
+ case __X86_CASE_B: \
+ asm volatile (lock #op "b %b0, %1\n" \
+ : "+q" (__ret), "+m" (*(ptr)) \
+ : : "memory", "cc"); \
+ break; \
+ case __X86_CASE_W: \
+ asm volatile (lock #op "w %w0, %1\n" \
+ : "+r" (__ret), "+m" (*(ptr)) \
+ : : "memory", "cc"); \
+ break; \
+ case __X86_CASE_L: \
+ asm volatile (lock #op "l %0, %1\n" \
+ : "+r" (__ret), "+m" (*(ptr)) \
+ : : "memory", "cc"); \
+ break; \
+ case __X86_CASE_Q: \
+ asm volatile (lock #op "q %q0, %1\n" \
+ : "+r" (__ret), "+m" (*(ptr)) \
+ : : "memory", "cc"); \
+ break; \
+ default: \
+ __ ## op ## _wrong_size(); \
+ } \
+ __ret; \
+ })
+
+/*
+ * Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
+ * Since this is generally used to protect other memory information, we
+ * use "asm volatile" and "memory" clobbers to prevent gcc from moving
+ * information around.
+ */
+#define arch_xchg(ptr, v) __xchg_op((ptr), (v), xchg, "")
+
+/*
+ * Atomic compare and exchange. Compare OLD with MEM, if identical,
+ * store NEW in MEM. Return the initial value in MEM. Success is
+ * indicated by comparing RETURN with OLD.
+ */
+#define __raw_cmpxchg(ptr, old, new, size, lock) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ __typeof__(*(ptr)) __old = (old); \
+ __typeof__(*(ptr)) __new = (new); \
+ switch (size) { \
+ case __X86_CASE_B: \
+ { \
+ volatile u8 *__ptr = (volatile u8 *)(ptr); \
+ asm volatile(lock "cmpxchgb %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "q" (__new), "0" (__old) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_W: \
+ { \
+ volatile u16 *__ptr = (volatile u16 *)(ptr); \
+ asm volatile(lock "cmpxchgw %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_L: \
+ { \
+ volatile u32 *__ptr = (volatile u32 *)(ptr); \
+ asm volatile(lock "cmpxchgl %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_Q: \
+ { \
+ volatile u64 *__ptr = (volatile u64 *)(ptr); \
+ asm volatile(lock "cmpxchgq %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
+ : "memory"); \
+ break; \
+ } \
+ default: \
+ __cmpxchg_wrong_size(); \
+ } \
+ __ret; \
+})
+
+#define __cmpxchg(ptr, old, new, size) \
+ __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
+
+#define __sync_cmpxchg(ptr, old, new, size) \
+ __raw_cmpxchg((ptr), (old), (new), (size), "lock; ")
+
+#define __cmpxchg_local(ptr, old, new, size) \
+ __raw_cmpxchg((ptr), (old), (new), (size), "")
+
+#ifdef CONFIG_X86_32
+# include <asm/cmpxchg_32.h>
+#else
+# include <asm/cmpxchg_64.h>
+#endif
+
+#define arch_cmpxchg(ptr, old, new) \
+ __cmpxchg(ptr, old, new, sizeof(*(ptr)))
+
+#define arch_sync_cmpxchg(ptr, old, new) \
+ __sync_cmpxchg(ptr, old, new, sizeof(*(ptr)))
+
+#define arch_cmpxchg_local(ptr, old, new) \
+ __cmpxchg_local(ptr, old, new, sizeof(*(ptr)))
+
+
+#define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \
+({ \
+ bool success; \
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
+ __typeof__(*(_ptr)) __old = *_old; \
+ __typeof__(*(_ptr)) __new = (_new); \
+ switch (size) { \
+ case __X86_CASE_B: \
+ { \
+ volatile u8 *__ptr = (volatile u8 *)(_ptr); \
+ asm volatile(lock "cmpxchgb %[new], %[ptr]" \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [ptr] "+m" (*__ptr), \
+ [old] "+a" (__old) \
+ : [new] "q" (__new) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_W: \
+ { \
+ volatile u16 *__ptr = (volatile u16 *)(_ptr); \
+ asm volatile(lock "cmpxchgw %[new], %[ptr]" \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [ptr] "+m" (*__ptr), \
+ [old] "+a" (__old) \
+ : [new] "r" (__new) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_L: \
+ { \
+ volatile u32 *__ptr = (volatile u32 *)(_ptr); \
+ asm volatile(lock "cmpxchgl %[new], %[ptr]" \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [ptr] "+m" (*__ptr), \
+ [old] "+a" (__old) \
+ : [new] "r" (__new) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_Q: \
+ { \
+ volatile u64 *__ptr = (volatile u64 *)(_ptr); \
+ asm volatile(lock "cmpxchgq %[new], %[ptr]" \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [ptr] "+m" (*__ptr), \
+ [old] "+a" (__old) \
+ : [new] "r" (__new) \
+ : "memory"); \
+ break; \
+ } \
+ default: \
+ __cmpxchg_wrong_size(); \
+ } \
+ if (unlikely(!success)) \
+ *_old = __old; \
+ likely(success); \
+})
+
+#define __try_cmpxchg(ptr, pold, new, size) \
+ __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX)
+
+#define try_cmpxchg(ptr, pold, new) \
+ __try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr)))
+
+/*
+ * xadd() adds "inc" to "*ptr" and atomically returns the previous
+ * value of "*ptr".
+ *
+ * xadd() is locked when multiple CPUs are online
+ */
+#define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock)
+#define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX)
+
+#define __cmpxchg_double(pfx, p1, p2, o1, o2, n1, n2) \
+({ \
+ bool __ret; \
+ __typeof__(*(p1)) __old1 = (o1), __new1 = (n1); \
+ __typeof__(*(p2)) __old2 = (o2), __new2 = (n2); \
+ BUILD_BUG_ON(sizeof(*(p1)) != sizeof(long)); \
+ BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long)); \
+ VM_BUG_ON((unsigned long)(p1) % (2 * sizeof(long))); \
+ VM_BUG_ON((unsigned long)((p1) + 1) != (unsigned long)(p2)); \
+ asm volatile(pfx "cmpxchg%c4b %2; sete %0" \
+ : "=a" (__ret), "+d" (__old2), \
+ "+m" (*(p1)), "+m" (*(p2)) \
+ : "i" (2 * sizeof(long)), "a" (__old1), \
+ "b" (__new1), "c" (__new2)); \
+ __ret; \
+})
+
+#define arch_cmpxchg_double(p1, p2, o1, o2, n1, n2) \
+ __cmpxchg_double(LOCK_PREFIX, p1, p2, o1, o2, n1, n2)
+
+#define arch_cmpxchg_double_local(p1, p2, o1, o2, n1, n2) \
+ __cmpxchg_double(, p1, p2, o1, o2, n1, n2)
+
+#endif /* ASM_X86_CMPXCHG_H */
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
new file mode 100644
index 000000000..1a2eafca7
--- /dev/null
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CMPXCHG_32_H
+#define _ASM_X86_CMPXCHG_32_H
+
+/*
+ * Note: if you use set64_bit(), __cmpxchg64(), or their variants, you
+ * you need to test for the feature in boot_cpu_data.
+ */
+
+/*
+ * CMPXCHG8B only writes to the target if we had the previous
+ * value in registers, otherwise it acts as a read and gives us the
+ * "new previous" value. That is why there is a loop. Preloading
+ * EDX:EAX is a performance optimization: in the common case it means
+ * we need only one locked operation.
+ *
+ * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very
+ * least an FPU save and/or %cr0.ts manipulation.
+ *
+ * cmpxchg8b must be used with the lock prefix here to allow the
+ * instruction to be executed atomically. We need to have the reader
+ * side to see the coherent 64bit value.
+ */
+static inline void set_64bit(volatile u64 *ptr, u64 value)
+{
+ u32 low = value;
+ u32 high = value >> 32;
+ u64 prev = *ptr;
+
+ asm volatile("\n1:\t"
+ LOCK_PREFIX "cmpxchg8b %0\n\t"
+ "jnz 1b"
+ : "=m" (*ptr), "+A" (prev)
+ : "b" (low), "c" (high)
+ : "memory");
+}
+
+#ifdef CONFIG_X86_CMPXCHG64
+#define arch_cmpxchg64(ptr, o, n) \
+ ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \
+ (unsigned long long)(n)))
+#define arch_cmpxchg64_local(ptr, o, n) \
+ ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \
+ (unsigned long long)(n)))
+#endif
+
+static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
+{
+ u64 prev;
+ asm volatile(LOCK_PREFIX "cmpxchg8b %1"
+ : "=A" (prev),
+ "+m" (*ptr)
+ : "b" ((u32)new),
+ "c" ((u32)(new >> 32)),
+ "0" (old)
+ : "memory");
+ return prev;
+}
+
+static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
+{
+ u64 prev;
+ asm volatile("cmpxchg8b %1"
+ : "=A" (prev),
+ "+m" (*ptr)
+ : "b" ((u32)new),
+ "c" ((u32)(new >> 32)),
+ "0" (old)
+ : "memory");
+ return prev;
+}
+
+#ifndef CONFIG_X86_CMPXCHG64
+/*
+ * Building a kernel capable running on 80386 and 80486. It may be necessary
+ * to simulate the cmpxchg8b on the 80386 and 80486 CPU.
+ */
+
+#define arch_cmpxchg64(ptr, o, n) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ __typeof__(*(ptr)) __old = (o); \
+ __typeof__(*(ptr)) __new = (n); \
+ alternative_io(LOCK_PREFIX_HERE \
+ "call cmpxchg8b_emu", \
+ "lock; cmpxchg8b (%%esi)" , \
+ X86_FEATURE_CX8, \
+ "=A" (__ret), \
+ "S" ((ptr)), "0" (__old), \
+ "b" ((unsigned int)__new), \
+ "c" ((unsigned int)(__new>>32)) \
+ : "memory"); \
+ __ret; })
+
+
+#define arch_cmpxchg64_local(ptr, o, n) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ __typeof__(*(ptr)) __old = (o); \
+ __typeof__(*(ptr)) __new = (n); \
+ alternative_io("call cmpxchg8b_emu", \
+ "cmpxchg8b (%%esi)" , \
+ X86_FEATURE_CX8, \
+ "=A" (__ret), \
+ "S" ((ptr)), "0" (__old), \
+ "b" ((unsigned int)__new), \
+ "c" ((unsigned int)(__new>>32)) \
+ : "memory"); \
+ __ret; })
+
+#endif
+
+#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX8)
+
+#endif /* _ASM_X86_CMPXCHG_32_H */
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
new file mode 100644
index 000000000..072e5459f
--- /dev/null
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CMPXCHG_64_H
+#define _ASM_X86_CMPXCHG_64_H
+
+static inline void set_64bit(volatile u64 *ptr, u64 val)
+{
+ *ptr = val;
+}
+
+#define arch_cmpxchg64(ptr, o, n) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ arch_cmpxchg((ptr), (o), (n)); \
+})
+
+#define arch_cmpxchg64_local(ptr, o, n) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ arch_cmpxchg_local((ptr), (o), (n)); \
+})
+
+#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16)
+
+#endif /* _ASM_X86_CMPXCHG_64_H */
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
new file mode 100644
index 000000000..1def972b6
--- /dev/null
+++ b/arch/x86/include/asm/compat.h
@@ -0,0 +1,243 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_COMPAT_H
+#define _ASM_X86_COMPAT_H
+
+/*
+ * Architecture specific compatibility types
+ */
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <asm/processor.h>
+#include <asm/user32.h>
+#include <asm/unistd.h>
+
+#define COMPAT_USER_HZ 100
+#define COMPAT_UTS_MACHINE "i686\0\0"
+
+typedef u32 compat_size_t;
+typedef s32 compat_ssize_t;
+typedef s32 compat_clock_t;
+typedef s32 compat_pid_t;
+typedef u16 __compat_uid_t;
+typedef u16 __compat_gid_t;
+typedef u32 __compat_uid32_t;
+typedef u32 __compat_gid32_t;
+typedef u16 compat_mode_t;
+typedef u32 compat_ino_t;
+typedef u16 compat_dev_t;
+typedef s32 compat_off_t;
+typedef s64 compat_loff_t;
+typedef u16 compat_nlink_t;
+typedef u16 compat_ipc_pid_t;
+typedef s32 compat_daddr_t;
+typedef u32 compat_caddr_t;
+typedef __kernel_fsid_t compat_fsid_t;
+typedef s32 compat_timer_t;
+typedef s32 compat_key_t;
+
+typedef s32 compat_int_t;
+typedef s32 compat_long_t;
+typedef s64 __attribute__((aligned(4))) compat_s64;
+typedef u32 compat_uint_t;
+typedef u32 compat_ulong_t;
+typedef u32 compat_u32;
+typedef u64 __attribute__((aligned(4))) compat_u64;
+typedef u32 compat_uptr_t;
+
+struct compat_stat {
+ u32 st_dev;
+ compat_ino_t st_ino;
+ compat_mode_t st_mode;
+ compat_nlink_t st_nlink;
+ __compat_uid_t st_uid;
+ __compat_gid_t st_gid;
+ u32 st_rdev;
+ u32 st_size;
+ u32 st_blksize;
+ u32 st_blocks;
+ u32 st_atime;
+ u32 st_atime_nsec;
+ u32 st_mtime;
+ u32 st_mtime_nsec;
+ u32 st_ctime;
+ u32 st_ctime_nsec;
+ u32 __unused4;
+ u32 __unused5;
+};
+
+struct compat_flock {
+ short l_type;
+ short l_whence;
+ compat_off_t l_start;
+ compat_off_t l_len;
+ compat_pid_t l_pid;
+};
+
+#define F_GETLK64 12 /* using 'struct flock64' */
+#define F_SETLK64 13
+#define F_SETLKW64 14
+
+/*
+ * IA32 uses 4 byte alignment for 64 bit quantities,
+ * so we need to pack this structure.
+ */
+struct compat_flock64 {
+ short l_type;
+ short l_whence;
+ compat_loff_t l_start;
+ compat_loff_t l_len;
+ compat_pid_t l_pid;
+} __attribute__((packed));
+
+struct compat_statfs {
+ int f_type;
+ int f_bsize;
+ int f_blocks;
+ int f_bfree;
+ int f_bavail;
+ int f_files;
+ int f_ffree;
+ compat_fsid_t f_fsid;
+ int f_namelen; /* SunOS ignores this field. */
+ int f_frsize;
+ int f_flags;
+ int f_spare[4];
+};
+
+#define COMPAT_RLIM_INFINITY 0xffffffff
+
+typedef u32 compat_old_sigset_t; /* at least 32 bits */
+
+#define _COMPAT_NSIG 64
+#define _COMPAT_NSIG_BPW 32
+
+typedef u32 compat_sigset_word;
+
+#define COMPAT_OFF_T_MAX 0x7fffffff
+
+struct compat_ipc64_perm {
+ compat_key_t key;
+ __compat_uid32_t uid;
+ __compat_gid32_t gid;
+ __compat_uid32_t cuid;
+ __compat_gid32_t cgid;
+ unsigned short mode;
+ unsigned short __pad1;
+ unsigned short seq;
+ unsigned short __pad2;
+ compat_ulong_t unused1;
+ compat_ulong_t unused2;
+};
+
+struct compat_semid64_ds {
+ struct compat_ipc64_perm sem_perm;
+ compat_ulong_t sem_otime;
+ compat_ulong_t sem_otime_high;
+ compat_ulong_t sem_ctime;
+ compat_ulong_t sem_ctime_high;
+ compat_ulong_t sem_nsems;
+ compat_ulong_t __unused3;
+ compat_ulong_t __unused4;
+};
+
+struct compat_msqid64_ds {
+ struct compat_ipc64_perm msg_perm;
+ compat_ulong_t msg_stime;
+ compat_ulong_t msg_stime_high;
+ compat_ulong_t msg_rtime;
+ compat_ulong_t msg_rtime_high;
+ compat_ulong_t msg_ctime;
+ compat_ulong_t msg_ctime_high;
+ compat_ulong_t msg_cbytes;
+ compat_ulong_t msg_qnum;
+ compat_ulong_t msg_qbytes;
+ compat_pid_t msg_lspid;
+ compat_pid_t msg_lrpid;
+ compat_ulong_t __unused4;
+ compat_ulong_t __unused5;
+};
+
+struct compat_shmid64_ds {
+ struct compat_ipc64_perm shm_perm;
+ compat_size_t shm_segsz;
+ compat_ulong_t shm_atime;
+ compat_ulong_t shm_atime_high;
+ compat_ulong_t shm_dtime;
+ compat_ulong_t shm_dtime_high;
+ compat_ulong_t shm_ctime;
+ compat_ulong_t shm_ctime_high;
+ compat_pid_t shm_cpid;
+ compat_pid_t shm_lpid;
+ compat_ulong_t shm_nattch;
+ compat_ulong_t __unused4;
+ compat_ulong_t __unused5;
+};
+
+/*
+ * The type of struct elf_prstatus.pr_reg in compatible core dumps.
+ */
+typedef struct user_regs_struct compat_elf_gregset_t;
+
+/* Full regset -- prstatus on x32, otherwise on ia32 */
+#define PRSTATUS_SIZE(S, R) (R != sizeof(S.pr_reg) ? 144 : 296)
+#define SET_PR_FPVALID(S, V, R) \
+ do { *(int *) (((void *) &((S)->pr_reg)) + R) = (V); } \
+ while (0)
+
+#ifdef CONFIG_X86_X32_ABI
+#define COMPAT_USE_64BIT_TIME \
+ (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT))
+#endif
+
+/*
+ * A pointer passed in from user mode. This should not
+ * be used for syscall parameters, just declare them
+ * as pointers because the syscall entry code will have
+ * appropriately converted them already.
+ */
+
+static inline void __user *compat_ptr(compat_uptr_t uptr)
+{
+ return (void __user *)(unsigned long)uptr;
+}
+
+static inline compat_uptr_t ptr_to_compat(void __user *uptr)
+{
+ return (u32)(unsigned long)uptr;
+}
+
+static inline void __user *arch_compat_alloc_user_space(long len)
+{
+ compat_uptr_t sp;
+
+ if (test_thread_flag(TIF_IA32)) {
+ sp = task_pt_regs(current)->sp;
+ } else {
+ /* -128 for the x32 ABI redzone */
+ sp = task_pt_regs(current)->sp - 128;
+ }
+
+ return (void __user *)round_down(sp - len, 16);
+}
+
+static inline bool in_x32_syscall(void)
+{
+#ifdef CONFIG_X86_X32_ABI
+ if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)
+ return true;
+#endif
+ return false;
+}
+
+static inline bool in_compat_syscall(void)
+{
+ return in_ia32_syscall() || in_x32_syscall();
+}
+#define in_compat_syscall in_compat_syscall /* override the generic impl */
+
+struct compat_siginfo;
+int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
+ const siginfo_t *from, bool x32_ABI);
+
+#endif /* _ASM_X86_COMPAT_H */
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
new file mode 100644
index 000000000..adc6cc86b
--- /dev/null
+++ b/arch/x86/include/asm/cpu.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CPU_H
+#define _ASM_X86_CPU_H
+
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/topology.h>
+#include <linux/nodemask.h>
+#include <linux/percpu.h>
+
+#ifdef CONFIG_SMP
+
+extern void prefill_possible_map(void);
+
+#else /* CONFIG_SMP */
+
+static inline void prefill_possible_map(void) {}
+
+#define cpu_physical_id(cpu) boot_cpu_physical_apicid
+#define cpu_acpi_id(cpu) 0
+#define safe_smp_processor_id() 0
+
+#endif /* CONFIG_SMP */
+
+struct x86_cpu {
+ struct cpu cpu;
+};
+
+#ifdef CONFIG_HOTPLUG_CPU
+extern int arch_register_cpu(int num);
+extern void arch_unregister_cpu(int);
+extern void start_cpu0(void);
+#ifdef CONFIG_DEBUG_HOTPLUG_CPU0
+extern int _debug_hotplug_cpu(int cpu, int action);
+#endif
+#endif
+
+int mwait_usable(const struct cpuinfo_x86 *);
+
+unsigned int x86_family(unsigned int sig);
+unsigned int x86_model(unsigned int sig);
+unsigned int x86_stepping(unsigned int sig);
+#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h
new file mode 100644
index 000000000..884466592
--- /dev/null
+++ b/arch/x86/include/asm/cpu_device_id.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _CPU_DEVICE_ID
+#define _CPU_DEVICE_ID 1
+
+/*
+ * Declare drivers belonging to specific x86 CPUs
+ * Similar in spirit to pci_device_id and related PCI functions
+ */
+
+#include <linux/mod_devicetable.h>
+
+#define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins)
+
+/**
+ * X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching
+ * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
+ * The name is expanded to X86_VENDOR_@_vendor
+ * @_family: The family number or X86_FAMILY_ANY
+ * @_model: The model number, model constant or X86_MODEL_ANY
+ * @_steppings: Bitmask for steppings, stepping constant or X86_STEPPING_ANY
+ * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY
+ * @_data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is casted to unsigned long internally.
+ *
+ * Backport version to keep the SRBDS pile consistant. No shorter variants
+ * required for this.
+ */
+#define X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \
+ _steppings, _feature, _data) { \
+ .vendor = X86_VENDOR_##_vendor, \
+ .family = _family, \
+ .model = _model, \
+ .steppings = _steppings, \
+ .feature = _feature, \
+ .driver_data = (unsigned long) _data \
+}
+
+extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match);
+
+#endif
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
new file mode 100644
index 000000000..4a7884b8d
--- /dev/null
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef _ASM_X86_CPU_ENTRY_AREA_H
+#define _ASM_X86_CPU_ENTRY_AREA_H
+
+#include <linux/percpu-defs.h>
+#include <asm/processor.h>
+#include <asm/intel_ds.h>
+
+/*
+ * cpu_entry_area is a percpu region that contains things needed by the CPU
+ * and early entry/exit code. Real types aren't used for all fields here
+ * to avoid circular header dependencies.
+ *
+ * Every field is a virtual alias of some other allocated backing store.
+ * There is no direct allocation of a struct cpu_entry_area.
+ */
+struct cpu_entry_area {
+ char gdt[PAGE_SIZE];
+
+ /*
+ * The GDT is just below entry_stack and thus serves (on x86_64) as
+ * a a read-only guard page.
+ */
+ struct entry_stack_page entry_stack_page;
+
+ /*
+ * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
+ * we need task switches to work, and task switches write to the TSS.
+ */
+ struct tss_struct tss;
+
+ char entry_trampoline[PAGE_SIZE];
+
+#ifdef CONFIG_X86_64
+ /*
+ * Exception stacks used for IST entries.
+ *
+ * In the future, this should have a separate slot for each stack
+ * with guard pages between them.
+ */
+ char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
+#endif
+#ifdef CONFIG_CPU_SUP_INTEL
+ /*
+ * Per CPU debug store for Intel performance monitoring. Wastes a
+ * full page at the moment.
+ */
+ struct debug_store cpu_debug_store;
+ /*
+ * The actual PEBS/BTS buffers must be mapped to user space
+ * Reserve enough fixmap PTEs.
+ */
+ struct debug_store_buffers cpu_debug_buffers;
+#endif
+};
+
+#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area))
+#define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS)
+
+DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
+
+extern void setup_cpu_entry_areas(void);
+extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
+
+#define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE
+#define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE)
+
+#define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT)
+
+#define CPU_ENTRY_AREA_MAP_SIZE \
+ (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE)
+
+extern struct cpu_entry_area *get_cpu_entry_area(int cpu);
+
+static inline struct entry_stack *cpu_entry_stack(int cpu)
+{
+ return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
+}
+
+#endif
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
new file mode 100644
index 000000000..68889ace9
--- /dev/null
+++ b/arch/x86/include/asm/cpufeature.h
@@ -0,0 +1,230 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CPUFEATURE_H
+#define _ASM_X86_CPUFEATURE_H
+
+#include <asm/processor.h>
+
+#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
+
+#include <asm/asm.h>
+#include <linux/bitops.h>
+
+enum cpuid_leafs
+{
+ CPUID_1_EDX = 0,
+ CPUID_8000_0001_EDX,
+ CPUID_8086_0001_EDX,
+ CPUID_LNX_1,
+ CPUID_1_ECX,
+ CPUID_C000_0001_EDX,
+ CPUID_8000_0001_ECX,
+ CPUID_LNX_2,
+ CPUID_LNX_3,
+ CPUID_7_0_EBX,
+ CPUID_D_1_EAX,
+ CPUID_LNX_4,
+ CPUID_DUMMY,
+ CPUID_8000_0008_EBX,
+ CPUID_6_EAX,
+ CPUID_8000_000A_EDX,
+ CPUID_7_ECX,
+ CPUID_8000_0007_EBX,
+ CPUID_7_EDX,
+};
+
+#ifdef CONFIG_X86_FEATURE_NAMES
+extern const char * const x86_cap_flags[NCAPINTS*32];
+extern const char * const x86_power_flags[32];
+#define X86_CAP_FMT "%s"
+#define x86_cap_flag(flag) x86_cap_flags[flag]
+#else
+#define X86_CAP_FMT "%d:%d"
+#define x86_cap_flag(flag) ((flag) >> 5), ((flag) & 31)
+#endif
+
+/*
+ * In order to save room, we index into this array by doing
+ * X86_BUG_<name> - NCAPINTS*32.
+ */
+extern const char * const x86_bug_flags[NBUGINTS*32];
+
+#define test_cpu_cap(c, bit) \
+ test_bit(bit, (unsigned long *)((c)->x86_capability))
+
+/*
+ * There are 32 bits/features in each mask word. The high bits
+ * (selected with (bit>>5) give us the word number and the low 5
+ * bits give us the bit/feature number inside the word.
+ * (1UL<<((bit)&31) gives us a mask for the feature_bit so we can
+ * see if it is set in the mask word.
+ */
+#define CHECK_BIT_IN_MASK_WORD(maskname, word, bit) \
+ (((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word ))
+
+#define REQUIRED_MASK_BIT_SET(feature_bit) \
+ ( CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 0, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 1, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 2, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 3, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 4, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 5, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 6, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 7, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 8, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 9, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 10, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 11, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 12, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 13, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 14, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \
+ REQUIRED_MASK_CHECK || \
+ BUILD_BUG_ON_ZERO(NCAPINTS != 19))
+
+#define DISABLED_MASK_BIT_SET(feature_bit) \
+ ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 1, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 2, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 3, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 4, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 5, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 6, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 7, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 8, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 9, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 10, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 11, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 12, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 13, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 14, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \
+ DISABLED_MASK_CHECK || \
+ BUILD_BUG_ON_ZERO(NCAPINTS != 19))
+
+#define cpu_has(c, bit) \
+ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
+ test_cpu_cap(c, bit))
+
+#define this_cpu_has(bit) \
+ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
+ x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
+
+/*
+ * This macro is for detection of features which need kernel
+ * infrastructure to be used. It may *not* directly test the CPU
+ * itself. Use the cpu_has() family if you want true runtime
+ * testing of CPU features, like in hypervisor code where you are
+ * supporting a possible guest feature where host support for it
+ * is not relevant.
+ */
+#define cpu_feature_enabled(bit) \
+ (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit))
+
+#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
+
+#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
+
+extern void setup_clear_cpu_cap(unsigned int bit);
+extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
+
+#define setup_force_cpu_cap(bit) do { \
+ set_cpu_cap(&boot_cpu_data, bit); \
+ set_bit(bit, (unsigned long *)cpu_caps_set); \
+} while (0)
+
+#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
+
+#if defined(__clang__) && !defined(CONFIG_CC_HAS_ASM_GOTO)
+
+/*
+ * Workaround for the sake of BPF compilation which utilizes kernel
+ * headers, but clang does not support ASM GOTO and fails the build.
+ */
+#ifndef __BPF_TRACING__
+#warning "Compiler lacks ASM_GOTO support. Add -D __BPF_TRACING__ to your compiler arguments"
+#endif
+
+#define static_cpu_has(bit) boot_cpu_has(bit)
+
+#else
+
+/*
+ * Static testing of CPU features. Used the same as boot_cpu_has().
+ * These will statically patch the target code for additional
+ * performance.
+ */
+static __always_inline __pure bool _static_cpu_has(u16 bit)
+{
+ asm_volatile_goto("1: jmp 6f\n"
+ "2:\n"
+ ".skip -(((5f-4f) - (2b-1b)) > 0) * "
+ "((5f-4f) - (2b-1b)),0x90\n"
+ "3:\n"
+ ".section .altinstructions,\"a\"\n"
+ " .long 1b - .\n" /* src offset */
+ " .long 4f - .\n" /* repl offset */
+ " .word %P[always]\n" /* always replace */
+ " .byte 3b - 1b\n" /* src len */
+ " .byte 5f - 4f\n" /* repl len */
+ " .byte 3b - 2b\n" /* pad len */
+ ".previous\n"
+ ".section .altinstr_replacement,\"ax\"\n"
+ "4: jmp %l[t_no]\n"
+ "5:\n"
+ ".previous\n"
+ ".section .altinstructions,\"a\"\n"
+ " .long 1b - .\n" /* src offset */
+ " .long 0\n" /* no replacement */
+ " .word %P[feature]\n" /* feature bit */
+ " .byte 3b - 1b\n" /* src len */
+ " .byte 0\n" /* repl len */
+ " .byte 0\n" /* pad len */
+ ".previous\n"
+ ".section .altinstr_aux,\"ax\"\n"
+ "6:\n"
+ " testb %[bitnum],%[cap_byte]\n"
+ " jnz %l[t_yes]\n"
+ " jmp %l[t_no]\n"
+ ".previous\n"
+ : : [feature] "i" (bit),
+ [always] "i" (X86_FEATURE_ALWAYS),
+ [bitnum] "i" (1 << (bit & 7)),
+ [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
+ : : t_yes, t_no);
+t_yes:
+ return true;
+t_no:
+ return false;
+}
+
+#define static_cpu_has(bit) \
+( \
+ __builtin_constant_p(boot_cpu_has(bit)) ? \
+ boot_cpu_has(bit) : \
+ _static_cpu_has(bit) \
+)
+#endif
+
+#define cpu_has_bug(c, bit) cpu_has(c, (bit))
+#define set_cpu_bug(c, bit) set_cpu_cap(c, (bit))
+#define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit))
+
+#define static_cpu_has_bug(bit) static_cpu_has((bit))
+#define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit))
+#define boot_cpu_set_bug(bit) set_cpu_cap(&boot_cpu_data, (bit))
+
+#define MAX_CPU_FEATURES (NCAPINTS * 32)
+#define cpu_have_feature boot_cpu_has
+
+#define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X"
+#define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \
+ boot_cpu_data.x86_model
+
+#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
+#endif /* _ASM_X86_CPUFEATURE_H */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
new file mode 100644
index 000000000..8025b7da0
--- /dev/null
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -0,0 +1,399 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CPUFEATURES_H
+#define _ASM_X86_CPUFEATURES_H
+
+#ifndef _ASM_X86_REQUIRED_FEATURES_H
+#include <asm/required-features.h>
+#endif
+
+#ifndef _ASM_X86_DISABLED_FEATURES_H
+#include <asm/disabled-features.h>
+#endif
+
+/*
+ * Defines x86 CPU feature bits
+ */
+#define NCAPINTS 19 /* N 32-bit words worth of info */
+#define NBUGINTS 1 /* N 32-bit bug flags */
+
+/*
+ * Note: If the comment begins with a quoted string, that string is used
+ * in /proc/cpuinfo instead of the macro name. If the string is "",
+ * this feature bit is not displayed in /proc/cpuinfo at all.
+ *
+ * When adding new features here that depend on other features,
+ * please update the table in kernel/cpu/cpuid-deps.c as well.
+ */
+
+/* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */
+#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */
+#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */
+#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */
+#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */
+#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */
+#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */
+#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */
+#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */
+#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */
+#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */
+#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */
+#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */
+#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */
+#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */
+#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */
+#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */
+#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */
+#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */
+#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */
+#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */
+#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */
+#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */
+#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
+#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */
+#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */
+#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */
+#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */
+#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */
+#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */
+#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */
+
+/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
+/* Don't duplicate feature flags which are redundant with Intel! */
+#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */
+#define X86_FEATURE_MP ( 1*32+19) /* MP Capable */
+#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */
+#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */
+#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
+#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */
+#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */
+#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64, 64-bit support) */
+#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow extensions */
+#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow */
+
+/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
+#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */
+#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */
+#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */
+
+/* Other features, Linux-defined mapping, word 3 */
+/* This range is used for feature bits which conflict or are synthesized */
+#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */
+#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
+#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
+#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
+
+/* CPU types for specific tunings: */
+#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */
+#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */
+#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */
+#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
+#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
+#define X86_FEATURE_UP ( 3*32+ 9) /* SMP kernel running on UP */
+#define X86_FEATURE_ART ( 3*32+10) /* Always running timer (ART) */
+#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
+#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */
+#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */
+#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */
+#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */
+#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */
+#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" MFENCE synchronizes RDTSC */
+#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */
+#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
+#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
+#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
+#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* CPU topology enum extensions */
+#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
+#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */
+#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */
+#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */
+#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */
+#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */
+#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
+#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */
+
+/* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */
+#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
+#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */
+#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */
+#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" MONITOR/MWAIT support */
+#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */
+#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */
+#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer Mode eXtensions */
+#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */
+#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */
+#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */
+#define X86_FEATURE_CID ( 4*32+10) /* Context ID */
+#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */
+#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */
+#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B instruction */
+#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */
+#define X86_FEATURE_PDCM ( 4*32+15) /* Perf/Debug Capabilities MSR */
+#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */
+#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */
+#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */
+#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */
+#define X86_FEATURE_X2APIC ( 4*32+21) /* X2APIC */
+#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */
+#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */
+#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* TSC deadline timer */
+#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */
+#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */
+#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE instruction enabled in the OS */
+#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */
+#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit FP conversions */
+#define X86_FEATURE_RDRAND ( 4*32+30) /* RDRAND instruction */
+#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */
+
+/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
+#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */
+#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */
+#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
+#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
+#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */
+#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */
+#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */
+#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */
+#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */
+#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */
+
+/* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */
+#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */
+#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */
+#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure Virtual Machine */
+#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */
+#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */
+#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */
+#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */
+#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */
+#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */
+#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */
+#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */
+#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */
+#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */
+#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */
+#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */
+#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */
+#define X86_FEATURE_TCE ( 6*32+17) /* Translation Cache Extension */
+#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */
+#define X86_FEATURE_TBM ( 6*32+21) /* Trailing Bit Manipulations */
+#define X86_FEATURE_TOPOEXT ( 6*32+22) /* Topology extensions CPUID leafs */
+#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* Core performance counter extensions */
+#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
+#define X86_FEATURE_BPEXT ( 6*32+26) /* Data breakpoint extension */
+#define X86_FEATURE_PTSC ( 6*32+27) /* Performance time-stamp counter */
+#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */
+#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX instructions) */
+
+/*
+ * Auxiliary flags: Linux defined - For features scattered in various
+ * CPUID levels like 0x6, 0xA etc, word 7.
+ *
+ * Reuse free bits when adding new feature flags!
+ */
+#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT instructions */
+#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */
+#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
+#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
+#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
+#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
+#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
+#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
+#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
+#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
+#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
+#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
+#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */
+#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
+#define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */
+#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */
+#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */
+#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
+#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
+#define X86_FEATURE_SEV ( 7*32+20) /* AMD Secure Encrypted Virtualization */
+#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
+#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */
+#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */
+#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* "" AMD SSBD implementation via LS_CFG MSR */
+#define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */
+#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */
+#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
+#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */
+#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */
+
+/* Virtualization flags: Linux defined, word 8 */
+#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
+#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
+#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
+#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */
+#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
+
+#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */
+#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
+#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
+#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
+#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */
+#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */
+#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */
+#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */
+#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* "" FPU data pointer updated only on x87 exceptions */
+#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */
+#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */
+#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */
+#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */
+#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
+#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
+#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* "" Zero out FPU CS and FPU DS */
+#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
+#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */
+#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
+#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
+#define X86_FEATURE_RDSEED ( 9*32+18) /* RDSEED instruction */
+#define X86_FEATURE_ADX ( 9*32+19) /* ADCX and ADOX instructions */
+#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
+#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */
+#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
+#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */
+#define X86_FEATURE_INTEL_PT ( 9*32+25) /* Intel Processor Trace */
+#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
+#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
+#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
+#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
+#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */
+#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */
+
+/* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */
+#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT instruction */
+#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */
+#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */
+#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */
+
+/*
+ * Extended auxiliary flags: Linux defined - for features scattered in various
+ * CPUID levels like 0xf, etc.
+ *
+ * Reuse free bits when adding new feature flags!
+ */
+#define X86_FEATURE_CQM_LLC (11*32+ 0) /* LLC QoS if 1 */
+#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* LLC occupancy monitoring */
+#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* LLC Total MBM monitoring */
+#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */
+#define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */
+#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
+
+/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
+#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
+#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */
+#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */
+#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */
+#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */
+#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* "" Single Thread Indirect Branch Predictors always-on preferred */
+#define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */
+#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
+#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
+
+/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
+#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
+#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */
+#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */
+#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */
+#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */
+#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */
+#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */
+#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
+#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */
+#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
+
+/* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */
+#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */
+#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */
+#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */
+#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */
+#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */
+#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
+#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
+#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
+#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
+#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
+#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
+#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
+#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
+#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
+#define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */
+#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
+#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
+#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
+#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */
+#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */
+#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */
+#define X86_FEATURE_AVX512_VNNI (16*32+11) /* Vector Neural Network Instructions */
+#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
+#define X86_FEATURE_TME (16*32+13) /* Intel Total Memory Encryption */
+#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
+#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
+#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
+#define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */
+
+/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */
+#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */
+#define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */
+#define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
+#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */
+#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
+#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */
+#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */
+#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */
+#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
+#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
+#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
+#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */
+#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
+#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */
+
+/*
+ * BUG word(s)
+ */
+#define X86_BUG(x) (NCAPINTS*32 + (x))
+
+#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */
+#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */
+#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */
+#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
+#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
+#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
+#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
+#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
+#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
+#ifdef CONFIG_X86_32
+/*
+ * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional
+ * to avoid confusion.
+ */
+#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
+#endif
+#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */
+#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
+#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
+#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
+#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */
+#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
+#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
+#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */
+#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */
+#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */
+#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */
+#define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */
+#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */
+#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */
+#define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */
+#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */
+
+#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h
new file mode 100644
index 000000000..6722ffcef
--- /dev/null
+++ b/arch/x86/include/asm/cpumask.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CPUMASK_H
+#define _ASM_X86_CPUMASK_H
+#ifndef __ASSEMBLY__
+#include <linux/cpumask.h>
+
+extern cpumask_var_t cpu_callin_mask;
+extern cpumask_var_t cpu_callout_mask;
+extern cpumask_var_t cpu_initialized_mask;
+extern cpumask_var_t cpu_sibling_setup_mask;
+
+extern void setup_cpu_local_masks(void);
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_CPUMASK_H */
diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h
new file mode 100644
index 000000000..6b8ad6fa3
--- /dev/null
+++ b/arch/x86/include/asm/crash.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CRASH_H
+#define _ASM_X86_CRASH_H
+
+struct kimage;
+
+int crash_load_segments(struct kimage *image);
+int crash_copy_backup_region(struct kimage *image);
+int crash_setup_memmap_entries(struct kimage *image,
+ struct boot_params *params);
+
+#endif /* _ASM_X86_CRASH_H */
diff --git a/arch/x86/include/asm/crypto/aes.h b/arch/x86/include/asm/crypto/aes.h
new file mode 100644
index 000000000..c508521dd
--- /dev/null
+++ b/arch/x86/include/asm/crypto/aes.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_X86_AES_H
+#define ASM_X86_AES_H
+
+#include <linux/crypto.h>
+#include <crypto/aes.h>
+
+void crypto_aes_encrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst,
+ const u8 *src);
+void crypto_aes_decrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst,
+ const u8 *src);
+#endif
diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/include/asm/crypto/camellia.h
new file mode 100644
index 000000000..a5d86fc05
--- /dev/null
+++ b/arch/x86/include/asm/crypto/camellia.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_X86_CAMELLIA_H
+#define ASM_X86_CAMELLIA_H
+
+#include <crypto/b128ops.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+
+#define CAMELLIA_MIN_KEY_SIZE 16
+#define CAMELLIA_MAX_KEY_SIZE 32
+#define CAMELLIA_BLOCK_SIZE 16
+#define CAMELLIA_TABLE_BYTE_LEN 272
+#define CAMELLIA_PARALLEL_BLOCKS 2
+
+struct crypto_skcipher;
+
+struct camellia_ctx {
+ u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
+ u32 key_length;
+};
+
+struct camellia_xts_ctx {
+ struct camellia_ctx tweak_ctx;
+ struct camellia_ctx crypt_ctx;
+};
+
+extern int __camellia_setkey(struct camellia_ctx *cctx,
+ const unsigned char *key,
+ unsigned int key_len, u32 *flags);
+
+extern int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen);
+
+/* regular block cipher functions */
+asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src, bool xor);
+asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+/* 2-way parallel cipher functions */
+asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src, bool xor);
+asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+/* 16-way parallel cipher functions (avx/aes-ni) */
+asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+
+asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+
+static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __camellia_enc_blk(ctx, dst, src, false);
+}
+
+static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __camellia_enc_blk(ctx, dst, src, true);
+}
+
+static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __camellia_enc_blk_2way(ctx, dst, src, false);
+}
+
+static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __camellia_enc_blk_2way(ctx, dst, src, true);
+}
+
+/* glue helpers */
+extern void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src);
+extern void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src,
+ le128 *iv);
+extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
+ le128 *iv);
+
+extern void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+extern void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+
+#endif /* ASM_X86_CAMELLIA_H */
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h
new file mode 100644
index 000000000..d1818634a
--- /dev/null
+++ b/arch/x86/include/asm/crypto/glue_helper.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Shared glue code for 128bit block ciphers
+ */
+
+#ifndef _CRYPTO_GLUE_HELPER_H
+#define _CRYPTO_GLUE_HELPER_H
+
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <asm/fpu/api.h>
+#include <crypto/b128ops.h>
+
+typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);
+typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src);
+typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src,
+ le128 *iv);
+typedef void (*common_glue_xts_func_t)(void *ctx, u128 *dst, const u128 *src,
+ le128 *iv);
+
+#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn))
+#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn))
+#define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn))
+#define GLUE_XTS_FUNC_CAST(fn) ((common_glue_xts_func_t)(fn))
+
+struct common_glue_func_entry {
+ unsigned int num_blocks; /* number of blocks that @fn will process */
+ union {
+ common_glue_func_t ecb;
+ common_glue_cbc_func_t cbc;
+ common_glue_ctr_func_t ctr;
+ common_glue_xts_func_t xts;
+ } fn_u;
+};
+
+struct common_glue_ctx {
+ unsigned int num_funcs;
+ int fpu_blocks_limit; /* -1 means fpu not needed at all */
+
+ /*
+ * First funcs entry must have largest num_blocks and last funcs entry
+ * must have num_blocks == 1!
+ */
+ struct common_glue_func_entry funcs[];
+};
+
+static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit,
+ struct skcipher_walk *walk,
+ bool fpu_enabled, unsigned int nbytes)
+{
+ if (likely(fpu_blocks_limit < 0))
+ return false;
+
+ if (fpu_enabled)
+ return true;
+
+ /*
+ * Vector-registers are only used when chunk to be processed is large
+ * enough, so do not enable FPU until it is necessary.
+ */
+ if (nbytes < bsize * (unsigned int)fpu_blocks_limit)
+ return false;
+
+ /* prevent sleeping if FPU is in use */
+ skcipher_walk_atomise(walk);
+
+ kernel_fpu_begin();
+ return true;
+}
+
+static inline void glue_fpu_end(bool fpu_enabled)
+{
+ if (fpu_enabled)
+ kernel_fpu_end();
+}
+
+static inline void le128_to_be128(be128 *dst, const le128 *src)
+{
+ dst->a = cpu_to_be64(le64_to_cpu(src->a));
+ dst->b = cpu_to_be64(le64_to_cpu(src->b));
+}
+
+static inline void be128_to_le128(le128 *dst, const be128 *src)
+{
+ dst->a = cpu_to_le64(be64_to_cpu(src->a));
+ dst->b = cpu_to_le64(be64_to_cpu(src->b));
+}
+
+static inline void le128_inc(le128 *i)
+{
+ u64 a = le64_to_cpu(i->a);
+ u64 b = le64_to_cpu(i->b);
+
+ b++;
+ if (!b)
+ a++;
+
+ i->a = cpu_to_le64(a);
+ i->b = cpu_to_le64(b);
+}
+
+extern int glue_ecb_req_128bit(const struct common_glue_ctx *gctx,
+ struct skcipher_request *req);
+
+extern int glue_cbc_encrypt_req_128bit(const common_glue_func_t fn,
+ struct skcipher_request *req);
+
+extern int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx,
+ struct skcipher_request *req);
+
+extern int glue_ctr_req_128bit(const struct common_glue_ctx *gctx,
+ struct skcipher_request *req);
+
+extern int glue_xts_req_128bit(const struct common_glue_ctx *gctx,
+ struct skcipher_request *req,
+ common_glue_func_t tweak_fn, void *tweak_ctx,
+ void *crypt_ctx);
+
+extern void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src,
+ le128 *iv, common_glue_func_t fn);
+
+#endif /* _CRYPTO_GLUE_HELPER_H */
diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h
new file mode 100644
index 000000000..db7c9cc32
--- /dev/null
+++ b/arch/x86/include/asm/crypto/serpent-avx.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_X86_SERPENT_AVX_H
+#define ASM_X86_SERPENT_AVX_H
+
+#include <crypto/b128ops.h>
+#include <crypto/serpent.h>
+#include <linux/types.h>
+
+struct crypto_skcipher;
+
+#define SERPENT_PARALLEL_BLOCKS 8
+
+struct serpent_xts_ctx {
+ struct serpent_ctx tweak_ctx;
+ struct serpent_ctx crypt_ctx;
+};
+
+asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+
+asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, le128 *iv);
+
+extern void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src,
+ le128 *iv);
+
+extern void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+extern void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+
+extern int xts_serpent_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen);
+
+#endif
diff --git a/arch/x86/include/asm/crypto/serpent-sse2.h b/arch/x86/include/asm/crypto/serpent-sse2.h
new file mode 100644
index 000000000..1a345e8a7
--- /dev/null
+++ b/arch/x86/include/asm/crypto/serpent-sse2.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_X86_SERPENT_SSE2_H
+#define ASM_X86_SERPENT_SSE2_H
+
+#include <linux/crypto.h>
+#include <crypto/serpent.h>
+
+#ifdef CONFIG_X86_32
+
+#define SERPENT_PARALLEL_BLOCKS 4
+
+asmlinkage void __serpent_enc_blk_4way(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, bool xor);
+asmlinkage void serpent_dec_blk_4way(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __serpent_enc_blk_4way(ctx, dst, src, false);
+}
+
+static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __serpent_enc_blk_4way(ctx, dst, src, true);
+}
+
+static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ serpent_dec_blk_4way(ctx, dst, src);
+}
+
+#else
+
+#define SERPENT_PARALLEL_BLOCKS 8
+
+asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, bool xor);
+asmlinkage void serpent_dec_blk_8way(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __serpent_enc_blk_8way(ctx, dst, src, false);
+}
+
+static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __serpent_enc_blk_8way(ctx, dst, src, true);
+}
+
+static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ serpent_dec_blk_8way(ctx, dst, src);
+}
+
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h
new file mode 100644
index 000000000..f618bf272
--- /dev/null
+++ b/arch/x86/include/asm/crypto/twofish.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_X86_TWOFISH_H
+#define ASM_X86_TWOFISH_H
+
+#include <linux/crypto.h>
+#include <crypto/twofish.h>
+#include <crypto/b128ops.h>
+
+/* regular block cipher functions from twofish_x86_64 module */
+asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+/* 3-way parallel cipher functions */
+asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src, bool xor);
+asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+/* helpers from twofish_x86_64-3way module */
+extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src);
+extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src,
+ le128 *iv);
+extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
+ le128 *iv);
+
+#endif /* ASM_X86_TWOFISH_H */
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
new file mode 100644
index 000000000..3e204e614
--- /dev/null
+++ b/arch/x86/include/asm/current.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CURRENT_H
+#define _ASM_X86_CURRENT_H
+
+#include <linux/compiler.h>
+#include <asm/percpu.h>
+
+#ifndef __ASSEMBLY__
+struct task_struct;
+
+DECLARE_PER_CPU(struct task_struct *, current_task);
+
+static __always_inline struct task_struct *get_current(void)
+{
+ return this_cpu_read_stable(current_task);
+}
+
+#define current get_current()
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_CURRENT_H */
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
new file mode 100644
index 000000000..4505ac273
--- /dev/null
+++ b/arch/x86/include/asm/debugreg.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_DEBUGREG_H
+#define _ASM_X86_DEBUGREG_H
+
+
+#include <linux/bug.h>
+#include <uapi/asm/debugreg.h>
+
+DECLARE_PER_CPU(unsigned long, cpu_dr7);
+
+#ifndef CONFIG_PARAVIRT
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, register) \
+ (var) = native_get_debugreg(register)
+#define set_debugreg(value, register) \
+ native_set_debugreg(register, value)
+#endif
+
+static inline unsigned long native_get_debugreg(int regno)
+{
+ unsigned long val = 0; /* Damn you, gcc! */
+
+ switch (regno) {
+ case 0:
+ asm("mov %%db0, %0" :"=r" (val));
+ break;
+ case 1:
+ asm("mov %%db1, %0" :"=r" (val));
+ break;
+ case 2:
+ asm("mov %%db2, %0" :"=r" (val));
+ break;
+ case 3:
+ asm("mov %%db3, %0" :"=r" (val));
+ break;
+ case 6:
+ asm("mov %%db6, %0" :"=r" (val));
+ break;
+ case 7:
+ asm("mov %%db7, %0" :"=r" (val));
+ break;
+ default:
+ BUG();
+ }
+ return val;
+}
+
+static inline void native_set_debugreg(int regno, unsigned long value)
+{
+ switch (regno) {
+ case 0:
+ asm("mov %0, %%db0" ::"r" (value));
+ break;
+ case 1:
+ asm("mov %0, %%db1" ::"r" (value));
+ break;
+ case 2:
+ asm("mov %0, %%db2" ::"r" (value));
+ break;
+ case 3:
+ asm("mov %0, %%db3" ::"r" (value));
+ break;
+ case 6:
+ asm("mov %0, %%db6" ::"r" (value));
+ break;
+ case 7:
+ asm("mov %0, %%db7" ::"r" (value));
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline void hw_breakpoint_disable(void)
+{
+ /* Zero the control register for HW Breakpoint */
+ set_debugreg(0UL, 7);
+
+ /* Zero-out the individual HW breakpoint address registers */
+ set_debugreg(0UL, 0);
+ set_debugreg(0UL, 1);
+ set_debugreg(0UL, 2);
+ set_debugreg(0UL, 3);
+}
+
+static inline int hw_breakpoint_active(void)
+{
+ return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
+}
+
+extern void aout_dump_debugregs(struct user *dump);
+
+extern void hw_breakpoint_restore(void);
+
+#ifdef CONFIG_X86_64
+DECLARE_PER_CPU(int, debug_stack_usage);
+static inline void debug_stack_usage_inc(void)
+{
+ __this_cpu_inc(debug_stack_usage);
+}
+static inline void debug_stack_usage_dec(void)
+{
+ __this_cpu_dec(debug_stack_usage);
+}
+int is_debug_stack(unsigned long addr);
+void debug_stack_set_zero(void);
+void debug_stack_reset(void);
+#else /* !X86_64 */
+static inline int is_debug_stack(unsigned long addr) { return 0; }
+static inline void debug_stack_set_zero(void) { }
+static inline void debug_stack_reset(void) { }
+static inline void debug_stack_usage_inc(void) { }
+static inline void debug_stack_usage_dec(void) { }
+#endif /* X86_64 */
+
+#ifdef CONFIG_CPU_SUP_AMD
+extern void set_dr_addr_mask(unsigned long mask, int dr);
+#else
+static inline void set_dr_addr_mask(unsigned long mask, int dr) { }
+#endif
+
+#endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h
new file mode 100644
index 000000000..de9e7841f
--- /dev/null
+++ b/arch/x86/include/asm/delay.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_DELAY_H
+#define _ASM_X86_DELAY_H
+
+#include <asm-generic/delay.h>
+
+void use_tsc_delay(void);
+void use_mwaitx_delay(void);
+
+#endif /* _ASM_X86_DELAY_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
new file mode 100644
index 000000000..13c5ee878
--- /dev/null
+++ b/arch/x86/include/asm/desc.h
@@ -0,0 +1,451 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_DESC_H
+#define _ASM_X86_DESC_H
+
+#include <asm/desc_defs.h>
+#include <asm/ldt.h>
+#include <asm/mmu.h>
+#include <asm/fixmap.h>
+#include <asm/irq_vectors.h>
+#include <asm/cpu_entry_area.h>
+
+#include <linux/smp.h>
+#include <linux/percpu.h>
+
+static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info)
+{
+ desc->limit0 = info->limit & 0x0ffff;
+
+ desc->base0 = (info->base_addr & 0x0000ffff);
+ desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
+
+ desc->type = (info->read_exec_only ^ 1) << 1;
+ desc->type |= info->contents << 2;
+ /* Set the ACCESS bit so it can be mapped RO */
+ desc->type |= 1;
+
+ desc->s = 1;
+ desc->dpl = 0x3;
+ desc->p = info->seg_not_present ^ 1;
+ desc->limit1 = (info->limit & 0xf0000) >> 16;
+ desc->avl = info->useable;
+ desc->d = info->seg_32bit;
+ desc->g = info->limit_in_pages;
+
+ desc->base2 = (info->base_addr & 0xff000000) >> 24;
+ /*
+ * Don't allow setting of the lm bit. It would confuse
+ * user_64bit_mode and would get overridden by sysret anyway.
+ */
+ desc->l = 0;
+}
+
+extern struct desc_ptr idt_descr;
+extern gate_desc idt_table[];
+extern const struct desc_ptr debug_idt_descr;
+extern gate_desc debug_idt_table[];
+
+struct gdt_page {
+ struct desc_struct gdt[GDT_ENTRIES];
+} __attribute__((aligned(PAGE_SIZE)));
+
+DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
+
+/* Provide the original GDT */
+static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu)
+{
+ return per_cpu(gdt_page, cpu).gdt;
+}
+
+/* Provide the current original GDT */
+static inline struct desc_struct *get_current_gdt_rw(void)
+{
+ return this_cpu_ptr(&gdt_page)->gdt;
+}
+
+/* Provide the fixmap address of the remapped GDT */
+static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
+{
+ return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt;
+}
+
+/* Provide the current read-only GDT */
+static inline struct desc_struct *get_current_gdt_ro(void)
+{
+ return get_cpu_gdt_ro(smp_processor_id());
+}
+
+/* Provide the physical address of the GDT page. */
+static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu)
+{
+ return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu));
+}
+
+static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
+ unsigned dpl, unsigned ist, unsigned seg)
+{
+ gate->offset_low = (u16) func;
+ gate->bits.p = 1;
+ gate->bits.dpl = dpl;
+ gate->bits.zero = 0;
+ gate->bits.type = type;
+ gate->offset_middle = (u16) (func >> 16);
+#ifdef CONFIG_X86_64
+ gate->segment = __KERNEL_CS;
+ gate->bits.ist = ist;
+ gate->reserved = 0;
+ gate->offset_high = (u32) (func >> 32);
+#else
+ gate->segment = seg;
+ gate->bits.ist = 0;
+#endif
+}
+
+static inline int desc_empty(const void *ptr)
+{
+ const u32 *desc = ptr;
+
+ return !(desc[0] | desc[1]);
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define load_TR_desc() native_load_tr_desc()
+#define load_gdt(dtr) native_load_gdt(dtr)
+#define load_idt(dtr) native_load_idt(dtr)
+#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
+#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
+
+#define store_gdt(dtr) native_store_gdt(dtr)
+#define store_tr(tr) (tr = native_store_tr())
+
+#define load_TLS(t, cpu) native_load_tls(t, cpu)
+#define set_ldt native_set_ldt
+
+#define write_ldt_entry(dt, entry, desc) native_write_ldt_entry(dt, entry, desc)
+#define write_gdt_entry(dt, entry, desc, type) native_write_gdt_entry(dt, entry, desc, type)
+#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
+
+static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
+{
+}
+
+static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
+{
+}
+#endif /* CONFIG_PARAVIRT */
+
+#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
+
+static inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate)
+{
+ memcpy(&idt[entry], gate, sizeof(*gate));
+}
+
+static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
+{
+ memcpy(&ldt[entry], desc, 8);
+}
+
+static inline void
+native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int type)
+{
+ unsigned int size;
+
+ switch (type) {
+ case DESC_TSS: size = sizeof(tss_desc); break;
+ case DESC_LDT: size = sizeof(ldt_desc); break;
+ default: size = sizeof(*gdt); break;
+ }
+
+ memcpy(&gdt[entry], desc, size);
+}
+
+static inline void set_tssldt_descriptor(void *d, unsigned long addr,
+ unsigned type, unsigned size)
+{
+ struct ldttss_desc *desc = d;
+
+ memset(desc, 0, sizeof(*desc));
+
+ desc->limit0 = (u16) size;
+ desc->base0 = (u16) addr;
+ desc->base1 = (addr >> 16) & 0xFF;
+ desc->type = type;
+ desc->p = 1;
+ desc->limit1 = (size >> 16) & 0xF;
+ desc->base2 = (addr >> 24) & 0xFF;
+#ifdef CONFIG_X86_64
+ desc->base3 = (u32) (addr >> 32);
+#endif
+}
+
+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr)
+{
+ struct desc_struct *d = get_cpu_gdt_rw(cpu);
+ tss_desc tss;
+
+ set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
+ __KERNEL_TSS_LIMIT);
+ write_gdt_entry(d, entry, &tss, DESC_TSS);
+}
+
+#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
+
+static inline void native_set_ldt(const void *addr, unsigned int entries)
+{
+ if (likely(entries == 0))
+ asm volatile("lldt %w0"::"q" (0));
+ else {
+ unsigned cpu = smp_processor_id();
+ ldt_desc ldt;
+
+ set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
+ entries * LDT_ENTRY_SIZE - 1);
+ write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_LDT,
+ &ldt, DESC_LDT);
+ asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
+ }
+}
+
+static inline void native_load_gdt(const struct desc_ptr *dtr)
+{
+ asm volatile("lgdt %0"::"m" (*dtr));
+}
+
+static inline void native_load_idt(const struct desc_ptr *dtr)
+{
+ asm volatile("lidt %0"::"m" (*dtr));
+}
+
+static inline void native_store_gdt(struct desc_ptr *dtr)
+{
+ asm volatile("sgdt %0":"=m" (*dtr));
+}
+
+static inline void store_idt(struct desc_ptr *dtr)
+{
+ asm volatile("sidt %0":"=m" (*dtr));
+}
+
+/*
+ * The LTR instruction marks the TSS GDT entry as busy. On 64-bit, the GDT is
+ * a read-only remapping. To prevent a page fault, the GDT is switched to the
+ * original writeable version when needed.
+ */
+#ifdef CONFIG_X86_64
+static inline void native_load_tr_desc(void)
+{
+ struct desc_ptr gdt;
+ int cpu = raw_smp_processor_id();
+ bool restore = 0;
+ struct desc_struct *fixmap_gdt;
+
+ native_store_gdt(&gdt);
+ fixmap_gdt = get_cpu_gdt_ro(cpu);
+
+ /*
+ * If the current GDT is the read-only fixmap, swap to the original
+ * writeable version. Swap back at the end.
+ */
+ if (gdt.address == (unsigned long)fixmap_gdt) {
+ load_direct_gdt(cpu);
+ restore = 1;
+ }
+ asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+ if (restore)
+ load_fixmap_gdt(cpu);
+}
+#else
+static inline void native_load_tr_desc(void)
+{
+ asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+}
+#endif
+
+static inline unsigned long native_store_tr(void)
+{
+ unsigned long tr;
+
+ asm volatile("str %0":"=r" (tr));
+
+ return tr;
+}
+
+static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ struct desc_struct *gdt = get_cpu_gdt_rw(cpu);
+ unsigned int i;
+
+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+ gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
+}
+
+DECLARE_PER_CPU(bool, __tss_limit_invalid);
+
+static inline void force_reload_TR(void)
+{
+ struct desc_struct *d = get_current_gdt_rw();
+ tss_desc tss;
+
+ memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc));
+
+ /*
+ * LTR requires an available TSS, and the TSS is currently
+ * busy. Make it be available so that LTR will work.
+ */
+ tss.type = DESC_TSS;
+ write_gdt_entry(d, GDT_ENTRY_TSS, &tss, DESC_TSS);
+
+ load_TR_desc();
+ this_cpu_write(__tss_limit_invalid, false);
+}
+
+/*
+ * Call this if you need the TSS limit to be correct, which should be the case
+ * if and only if you have TIF_IO_BITMAP set or you're switching to a task
+ * with TIF_IO_BITMAP set.
+ */
+static inline void refresh_tss_limit(void)
+{
+ DEBUG_LOCKS_WARN_ON(preemptible());
+
+ if (unlikely(this_cpu_read(__tss_limit_invalid)))
+ force_reload_TR();
+}
+
+/*
+ * If you do something evil that corrupts the cached TSS limit (I'm looking
+ * at you, VMX exits), call this function.
+ *
+ * The optimization here is that the TSS limit only matters for Linux if the
+ * IO bitmap is in use. If the TSS limit gets forced to its minimum value,
+ * everything works except that IO bitmap will be ignored and all CPL 3 IO
+ * instructions will #GP, which is exactly what we want for normal tasks.
+ */
+static inline void invalidate_tss_limit(void)
+{
+ DEBUG_LOCKS_WARN_ON(preemptible());
+
+ if (unlikely(test_thread_flag(TIF_IO_BITMAP)))
+ force_reload_TR();
+ else
+ this_cpu_write(__tss_limit_invalid, true);
+}
+
+/* This intentionally ignores lm, since 32-bit apps don't have that field. */
+#define LDT_empty(info) \
+ ((info)->base_addr == 0 && \
+ (info)->limit == 0 && \
+ (info)->contents == 0 && \
+ (info)->read_exec_only == 1 && \
+ (info)->seg_32bit == 0 && \
+ (info)->limit_in_pages == 0 && \
+ (info)->seg_not_present == 1 && \
+ (info)->useable == 0)
+
+/* Lots of programs expect an all-zero user_desc to mean "no segment at all". */
+static inline bool LDT_zero(const struct user_desc *info)
+{
+ return (info->base_addr == 0 &&
+ info->limit == 0 &&
+ info->contents == 0 &&
+ info->read_exec_only == 0 &&
+ info->seg_32bit == 0 &&
+ info->limit_in_pages == 0 &&
+ info->seg_not_present == 0 &&
+ info->useable == 0);
+}
+
+static inline void clear_LDT(void)
+{
+ set_ldt(NULL, 0);
+}
+
+static inline unsigned long get_desc_base(const struct desc_struct *desc)
+{
+ return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
+}
+
+static inline void set_desc_base(struct desc_struct *desc, unsigned long base)
+{
+ desc->base0 = base & 0xffff;
+ desc->base1 = (base >> 16) & 0xff;
+ desc->base2 = (base >> 24) & 0xff;
+}
+
+static inline unsigned long get_desc_limit(const struct desc_struct *desc)
+{
+ return desc->limit0 | (desc->limit1 << 16);
+}
+
+static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
+{
+ desc->limit0 = limit & 0xffff;
+ desc->limit1 = (limit >> 16) & 0xf;
+}
+
+void update_intr_gate(unsigned int n, const void *addr);
+void alloc_intr_gate(unsigned int n, const void *addr);
+
+extern unsigned long system_vectors[];
+
+#ifdef CONFIG_X86_64
+DECLARE_PER_CPU(u32, debug_idt_ctr);
+static inline bool is_debug_idt_enabled(void)
+{
+ if (this_cpu_read(debug_idt_ctr))
+ return true;
+
+ return false;
+}
+
+static inline void load_debug_idt(void)
+{
+ load_idt((const struct desc_ptr *)&debug_idt_descr);
+}
+#else
+static inline bool is_debug_idt_enabled(void)
+{
+ return false;
+}
+
+static inline void load_debug_idt(void)
+{
+}
+#endif
+
+/*
+ * The load_current_idt() must be called with interrupts disabled
+ * to avoid races. That way the IDT will always be set back to the expected
+ * descriptor. It's also called when a CPU is being initialized, and
+ * that doesn't need to disable interrupts, as nothing should be
+ * bothering the CPU then.
+ */
+static inline void load_current_idt(void)
+{
+ if (is_debug_idt_enabled())
+ load_debug_idt();
+ else
+ load_idt((const struct desc_ptr *)&idt_descr);
+}
+
+extern void idt_setup_early_handler(void);
+extern void idt_setup_early_traps(void);
+extern void idt_setup_traps(void);
+extern void idt_setup_apic_and_irq_gates(void);
+
+#ifdef CONFIG_X86_64
+extern void idt_setup_early_pf(void);
+extern void idt_setup_ist_traps(void);
+extern void idt_setup_debugidt_traps(void);
+#else
+static inline void idt_setup_early_pf(void) { }
+static inline void idt_setup_ist_traps(void) { }
+static inline void idt_setup_debugidt_traps(void) { }
+#endif
+
+extern void idt_invalidate(void *addr);
+
+#endif /* _ASM_X86_DESC_H */
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
new file mode 100644
index 000000000..a91f3b6e4
--- /dev/null
+++ b/arch/x86/include/asm/desc_defs.h
@@ -0,0 +1,135 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Written 2000 by Andi Kleen */
+#ifndef _ASM_X86_DESC_DEFS_H
+#define _ASM_X86_DESC_DEFS_H
+
+/*
+ * Segment descriptor structure definitions, usable from both x86_64 and i386
+ * archs.
+ */
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+/* 8 byte segment descriptor */
+struct desc_struct {
+ u16 limit0;
+ u16 base0;
+ u16 base1: 8, type: 4, s: 1, dpl: 2, p: 1;
+ u16 limit1: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
+} __attribute__((packed));
+
+#define GDT_ENTRY_INIT(flags, base, limit) \
+ { \
+ .limit0 = (u16) (limit), \
+ .limit1 = ((limit) >> 16) & 0x0F, \
+ .base0 = (u16) (base), \
+ .base1 = ((base) >> 16) & 0xFF, \
+ .base2 = ((base) >> 24) & 0xFF, \
+ .type = (flags & 0x0f), \
+ .s = (flags >> 4) & 0x01, \
+ .dpl = (flags >> 5) & 0x03, \
+ .p = (flags >> 7) & 0x01, \
+ .avl = (flags >> 12) & 0x01, \
+ .l = (flags >> 13) & 0x01, \
+ .d = (flags >> 14) & 0x01, \
+ .g = (flags >> 15) & 0x01, \
+ }
+
+enum {
+ GATE_INTERRUPT = 0xE,
+ GATE_TRAP = 0xF,
+ GATE_CALL = 0xC,
+ GATE_TASK = 0x5,
+};
+
+enum {
+ DESC_TSS = 0x9,
+ DESC_LDT = 0x2,
+ DESCTYPE_S = 0x10, /* !system */
+};
+
+/* LDT or TSS descriptor in the GDT. */
+struct ldttss_desc {
+ u16 limit0;
+ u16 base0;
+
+ u16 base1 : 8, type : 5, dpl : 2, p : 1;
+ u16 limit1 : 4, zero0 : 3, g : 1, base2 : 8;
+#ifdef CONFIG_X86_64
+ u32 base3;
+ u32 zero1;
+#endif
+} __attribute__((packed));
+
+typedef struct ldttss_desc ldt_desc;
+typedef struct ldttss_desc tss_desc;
+
+struct idt_bits {
+ u16 ist : 3,
+ zero : 5,
+ type : 5,
+ dpl : 2,
+ p : 1;
+} __attribute__((packed));
+
+struct gate_struct {
+ u16 offset_low;
+ u16 segment;
+ struct idt_bits bits;
+ u16 offset_middle;
+#ifdef CONFIG_X86_64
+ u32 offset_high;
+ u32 reserved;
+#endif
+} __attribute__((packed));
+
+typedef struct gate_struct gate_desc;
+
+static inline unsigned long gate_offset(const gate_desc *g)
+{
+#ifdef CONFIG_X86_64
+ return g->offset_low | ((unsigned long)g->offset_middle << 16) |
+ ((unsigned long) g->offset_high << 32);
+#else
+ return g->offset_low | ((unsigned long)g->offset_middle << 16);
+#endif
+}
+
+static inline unsigned long gate_segment(const gate_desc *g)
+{
+ return g->segment;
+}
+
+struct desc_ptr {
+ unsigned short size;
+ unsigned long address;
+} __attribute__((packed)) ;
+
+#endif /* !__ASSEMBLY__ */
+
+/* Access rights as returned by LAR */
+#define AR_TYPE_RODATA (0 * (1 << 9))
+#define AR_TYPE_RWDATA (1 * (1 << 9))
+#define AR_TYPE_RODATA_EXPDOWN (2 * (1 << 9))
+#define AR_TYPE_RWDATA_EXPDOWN (3 * (1 << 9))
+#define AR_TYPE_XOCODE (4 * (1 << 9))
+#define AR_TYPE_XRCODE (5 * (1 << 9))
+#define AR_TYPE_XOCODE_CONF (6 * (1 << 9))
+#define AR_TYPE_XRCODE_CONF (7 * (1 << 9))
+#define AR_TYPE_MASK (7 * (1 << 9))
+
+#define AR_DPL0 (0 * (1 << 13))
+#define AR_DPL3 (3 * (1 << 13))
+#define AR_DPL_MASK (3 * (1 << 13))
+
+#define AR_A (1 << 8) /* "Accessed" */
+#define AR_S (1 << 12) /* If clear, "System" segment */
+#define AR_P (1 << 15) /* "Present" */
+#define AR_AVL (1 << 20) /* "AVaiLable" (no HW effect) */
+#define AR_L (1 << 21) /* "Long mode" for code segments */
+#define AR_DB (1 << 22) /* D/B, effect depends on type */
+#define AR_G (1 << 23) /* "Granularity" (limit in pages) */
+
+#endif /* _ASM_X86_DESC_DEFS_H */
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
new file mode 100644
index 000000000..a8f6c809d
--- /dev/null
+++ b/arch/x86/include/asm/device.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_DEVICE_H
+#define _ASM_X86_DEVICE_H
+
+struct dev_archdata {
+#if defined(CONFIG_INTEL_IOMMU) || defined(CONFIG_AMD_IOMMU)
+ void *iommu; /* hook for IOMMU specific extension */
+#endif
+#ifdef CONFIG_STA2X11
+ bool is_sta2x11;
+#endif
+};
+
+#if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS)
+struct dma_domain {
+ struct list_head node;
+ const struct dma_map_ops *dma_ops;
+ int domain_nr;
+};
+void add_dma_domain(struct dma_domain *domain);
+void del_dma_domain(struct dma_domain *domain);
+#endif
+
+struct pdev_archdata {
+};
+
+#endif /* _ASM_X86_DEVICE_H */
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
new file mode 100644
index 000000000..33833d190
--- /dev/null
+++ b/arch/x86/include/asm/disabled-features.h
@@ -0,0 +1,83 @@
+#ifndef _ASM_X86_DISABLED_FEATURES_H
+#define _ASM_X86_DISABLED_FEATURES_H
+
+/* These features, although they might be available in a CPU
+ * will not be used because the compile options to support
+ * them are not present.
+ *
+ * This code allows them to be checked and disabled at
+ * compile time without an explicit #ifdef. Use
+ * cpu_feature_enabled().
+ */
+
+#ifdef CONFIG_X86_INTEL_MPX
+# define DISABLE_MPX 0
+#else
+# define DISABLE_MPX (1<<(X86_FEATURE_MPX & 31))
+#endif
+
+#ifdef CONFIG_X86_INTEL_UMIP
+# define DISABLE_UMIP 0
+#else
+# define DISABLE_UMIP (1<<(X86_FEATURE_UMIP & 31))
+#endif
+
+#ifdef CONFIG_X86_64
+# define DISABLE_VME (1<<(X86_FEATURE_VME & 31))
+# define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31))
+# define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31))
+# define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31))
+# define DISABLE_PCID 0
+#else
+# define DISABLE_VME 0
+# define DISABLE_K6_MTRR 0
+# define DISABLE_CYRIX_ARR 0
+# define DISABLE_CENTAUR_MCR 0
+# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31))
+#endif /* CONFIG_X86_64 */
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+# define DISABLE_PKU 0
+# define DISABLE_OSPKE 0
+#else
+# define DISABLE_PKU (1<<(X86_FEATURE_PKU & 31))
+# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31))
+#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
+
+#ifdef CONFIG_X86_5LEVEL
+# define DISABLE_LA57 0
+#else
+# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31))
+#endif
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define DISABLE_PTI 0
+#else
+# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
+#endif
+
+/*
+ * Make sure to add features to the correct mask
+ */
+#define DISABLED_MASK0 (DISABLE_VME)
+#define DISABLED_MASK1 0
+#define DISABLED_MASK2 0
+#define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR)
+#define DISABLED_MASK4 (DISABLE_PCID)
+#define DISABLED_MASK5 0
+#define DISABLED_MASK6 0
+#define DISABLED_MASK7 (DISABLE_PTI)
+#define DISABLED_MASK8 0
+#define DISABLED_MASK9 (DISABLE_MPX)
+#define DISABLED_MASK10 0
+#define DISABLED_MASK11 0
+#define DISABLED_MASK12 0
+#define DISABLED_MASK13 0
+#define DISABLED_MASK14 0
+#define DISABLED_MASK15 0
+#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP)
+#define DISABLED_MASK17 0
+#define DISABLED_MASK18 0
+#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
+
+#endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
new file mode 100644
index 000000000..20a46150e
--- /dev/null
+++ b/arch/x86/include/asm/div64.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_DIV64_H
+#define _ASM_X86_DIV64_H
+
+#ifdef CONFIG_X86_32
+
+#include <linux/types.h>
+#include <linux/log2.h>
+
+/*
+ * do_div() is NOT a C function. It wants to return
+ * two values (the quotient and the remainder), but
+ * since that doesn't work very well in C, what it
+ * does is:
+ *
+ * - modifies the 64-bit dividend _in_place_
+ * - returns the 32-bit remainder
+ *
+ * This ends up being the most efficient "calling
+ * convention" on x86.
+ */
+#define do_div(n, base) \
+({ \
+ unsigned long __upper, __low, __high, __mod, __base; \
+ __base = (base); \
+ if (__builtin_constant_p(__base) && is_power_of_2(__base)) { \
+ __mod = n & (__base - 1); \
+ n >>= ilog2(__base); \
+ } else { \
+ asm("" : "=a" (__low), "=d" (__high) : "A" (n));\
+ __upper = __high; \
+ if (__high) { \
+ __upper = __high % (__base); \
+ __high = __high / (__base); \
+ } \
+ asm("divl %2" : "=a" (__low), "=d" (__mod) \
+ : "rm" (__base), "0" (__low), "1" (__upper)); \
+ asm("" : "=A" (n) : "a" (__low), "d" (__high)); \
+ } \
+ __mod; \
+})
+
+static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
+{
+ union {
+ u64 v64;
+ u32 v32[2];
+ } d = { dividend };
+ u32 upper;
+
+ upper = d.v32[1];
+ d.v32[1] = 0;
+ if (upper >= divisor) {
+ d.v32[1] = upper / divisor;
+ upper %= divisor;
+ }
+ asm ("divl %2" : "=a" (d.v32[0]), "=d" (*remainder) :
+ "rm" (divisor), "0" (d.v32[0]), "1" (upper));
+ return d.v64;
+}
+#define div_u64_rem div_u64_rem
+
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+ u32 high, low;
+
+ asm ("mull %[b]" : "=a" (low), "=d" (high)
+ : [a] "a" (a), [b] "rm" (b) );
+
+ return low | ((u64)high) << 32;
+}
+#define mul_u32_u32 mul_u32_u32
+
+#else
+# include <asm-generic/div64.h>
+#endif /* CONFIG_X86_32 */
+
+#endif /* _ASM_X86_DIV64_H */
diff --git a/arch/x86/include/asm/dma-direct.h b/arch/x86/include/asm/dma-direct.h
new file mode 100644
index 000000000..1a19251ea
--- /dev/null
+++ b/arch/x86/include/asm/dma-direct.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_X86_DMA_DIRECT_H
+#define ASM_X86_DMA_DIRECT_H 1
+
+bool dma_capable(struct device *dev, dma_addr_t addr, size_t size);
+dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr);
+phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t daddr);
+
+#endif /* ASM_X86_DMA_DIRECT_H */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
new file mode 100644
index 000000000..ce4d176b3
--- /dev/null
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_DMA_MAPPING_H
+#define _ASM_X86_DMA_MAPPING_H
+
+/*
+ * IOMMU interface. See Documentation/DMA-API-HOWTO.txt and
+ * Documentation/DMA-API.txt for documentation.
+ */
+
+#include <linux/scatterlist.h>
+#include <linux/dma-debug.h>
+#include <asm/io.h>
+#include <asm/swiotlb.h>
+#include <linux/dma-contiguous.h>
+
+#ifdef CONFIG_ISA
+# define ISA_DMA_BIT_MASK DMA_BIT_MASK(24)
+#else
+# define ISA_DMA_BIT_MASK DMA_BIT_MASK(32)
+#endif
+
+extern int iommu_merge;
+extern struct device x86_dma_fallback_dev;
+extern int panic_on_overflow;
+
+extern const struct dma_map_ops *dma_ops;
+
+static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
+{
+ return dma_ops;
+}
+
+bool arch_dma_alloc_attrs(struct device **dev);
+#define arch_dma_alloc_attrs arch_dma_alloc_attrs
+
+#endif
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
new file mode 100644
index 000000000..8e95aa4b0
--- /dev/null
+++ b/arch/x86/include/asm/dma.h
@@ -0,0 +1,318 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * linux/include/asm/dma.h: Defines for using and allocating dma channels.
+ * Written by Hennus Bergman, 1992.
+ * High DMA channel support & info by Hannu Savolainen
+ * and John Boyd, Nov. 1992.
+ */
+
+#ifndef _ASM_X86_DMA_H
+#define _ASM_X86_DMA_H
+
+#include <linux/spinlock.h> /* And spinlocks */
+#include <asm/io.h> /* need byte IO */
+
+#ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER
+#define dma_outb outb_p
+#else
+#define dma_outb outb
+#endif
+
+#define dma_inb inb
+
+/*
+ * NOTES about DMA transfers:
+ *
+ * controller 1: channels 0-3, byte operations, ports 00-1F
+ * controller 2: channels 4-7, word operations, ports C0-DF
+ *
+ * - ALL registers are 8 bits only, regardless of transfer size
+ * - channel 4 is not used - cascades 1 into 2.
+ * - channels 0-3 are byte - addresses/counts are for physical bytes
+ * - channels 5-7 are word - addresses/counts are for physical words
+ * - transfers must not cross physical 64K (0-3) or 128K (5-7) boundaries
+ * - transfer count loaded to registers is 1 less than actual count
+ * - controller 2 offsets are all even (2x offsets for controller 1)
+ * - page registers for 5-7 don't use data bit 0, represent 128K pages
+ * - page registers for 0-3 use bit 0, represent 64K pages
+ *
+ * DMA transfers are limited to the lower 16MB of _physical_ memory.
+ * Note that addresses loaded into registers must be _physical_ addresses,
+ * not logical addresses (which may differ if paging is active).
+ *
+ * Address mapping for channels 0-3:
+ *
+ * A23 ... A16 A15 ... A8 A7 ... A0 (Physical addresses)
+ * | ... | | ... | | ... |
+ * | ... | | ... | | ... |
+ * | ... | | ... | | ... |
+ * P7 ... P0 A7 ... A0 A7 ... A0
+ * | Page | Addr MSB | Addr LSB | (DMA registers)
+ *
+ * Address mapping for channels 5-7:
+ *
+ * A23 ... A17 A16 A15 ... A9 A8 A7 ... A1 A0 (Physical addresses)
+ * | ... | \ \ ... \ \ \ ... \ \
+ * | ... | \ \ ... \ \ \ ... \ (not used)
+ * | ... | \ \ ... \ \ \ ... \
+ * P7 ... P1 (0) A7 A6 ... A0 A7 A6 ... A0
+ * | Page | Addr MSB | Addr LSB | (DMA registers)
+ *
+ * Again, channels 5-7 transfer _physical_ words (16 bits), so addresses
+ * and counts _must_ be word-aligned (the lowest address bit is _ignored_ at
+ * the hardware level, so odd-byte transfers aren't possible).
+ *
+ * Transfer count (_not # bytes_) is limited to 64K, represented as actual
+ * count - 1 : 64K => 0xFFFF, 1 => 0x0000. Thus, count is always 1 or more,
+ * and up to 128K bytes may be transferred on channels 5-7 in one operation.
+ *
+ */
+
+#define MAX_DMA_CHANNELS 8
+
+/* 16MB ISA DMA zone */
+#define MAX_DMA_PFN ((16UL * 1024 * 1024) >> PAGE_SHIFT)
+
+/* 4GB broken PCI/AGP hardware bus master zone */
+#define MAX_DMA32_PFN (1UL << (32 - PAGE_SHIFT))
+
+#ifdef CONFIG_X86_32
+/* The maximum address that we can perform a DMA transfer to on this platform */
+#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
+#else
+/* Compat define for old dma zone */
+#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
+#endif
+
+/* 8237 DMA controllers */
+#define IO_DMA1_BASE 0x00 /* 8 bit slave DMA, channels 0..3 */
+#define IO_DMA2_BASE 0xC0 /* 16 bit master DMA, ch 4(=slave input)..7 */
+
+/* DMA controller registers */
+#define DMA1_CMD_REG 0x08 /* command register (w) */
+#define DMA1_STAT_REG 0x08 /* status register (r) */
+#define DMA1_REQ_REG 0x09 /* request register (w) */
+#define DMA1_MASK_REG 0x0A /* single-channel mask (w) */
+#define DMA1_MODE_REG 0x0B /* mode register (w) */
+#define DMA1_CLEAR_FF_REG 0x0C /* clear pointer flip-flop (w) */
+#define DMA1_TEMP_REG 0x0D /* Temporary Register (r) */
+#define DMA1_RESET_REG 0x0D /* Master Clear (w) */
+#define DMA1_CLR_MASK_REG 0x0E /* Clear Mask */
+#define DMA1_MASK_ALL_REG 0x0F /* all-channels mask (w) */
+
+#define DMA2_CMD_REG 0xD0 /* command register (w) */
+#define DMA2_STAT_REG 0xD0 /* status register (r) */
+#define DMA2_REQ_REG 0xD2 /* request register (w) */
+#define DMA2_MASK_REG 0xD4 /* single-channel mask (w) */
+#define DMA2_MODE_REG 0xD6 /* mode register (w) */
+#define DMA2_CLEAR_FF_REG 0xD8 /* clear pointer flip-flop (w) */
+#define DMA2_TEMP_REG 0xDA /* Temporary Register (r) */
+#define DMA2_RESET_REG 0xDA /* Master Clear (w) */
+#define DMA2_CLR_MASK_REG 0xDC /* Clear Mask */
+#define DMA2_MASK_ALL_REG 0xDE /* all-channels mask (w) */
+
+#define DMA_ADDR_0 0x00 /* DMA address registers */
+#define DMA_ADDR_1 0x02
+#define DMA_ADDR_2 0x04
+#define DMA_ADDR_3 0x06
+#define DMA_ADDR_4 0xC0
+#define DMA_ADDR_5 0xC4
+#define DMA_ADDR_6 0xC8
+#define DMA_ADDR_7 0xCC
+
+#define DMA_CNT_0 0x01 /* DMA count registers */
+#define DMA_CNT_1 0x03
+#define DMA_CNT_2 0x05
+#define DMA_CNT_3 0x07
+#define DMA_CNT_4 0xC2
+#define DMA_CNT_5 0xC6
+#define DMA_CNT_6 0xCA
+#define DMA_CNT_7 0xCE
+
+#define DMA_PAGE_0 0x87 /* DMA page registers */
+#define DMA_PAGE_1 0x83
+#define DMA_PAGE_2 0x81
+#define DMA_PAGE_3 0x82
+#define DMA_PAGE_5 0x8B
+#define DMA_PAGE_6 0x89
+#define DMA_PAGE_7 0x8A
+
+/* I/O to memory, no autoinit, increment, single mode */
+#define DMA_MODE_READ 0x44
+/* memory to I/O, no autoinit, increment, single mode */
+#define DMA_MODE_WRITE 0x48
+/* pass thru DREQ->HRQ, DACK<-HLDA only */
+#define DMA_MODE_CASCADE 0xC0
+
+#define DMA_AUTOINIT 0x10
+
+
+#ifdef CONFIG_ISA_DMA_API
+extern spinlock_t dma_spin_lock;
+
+static inline unsigned long claim_dma_lock(void)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&dma_spin_lock, flags);
+ return flags;
+}
+
+static inline void release_dma_lock(unsigned long flags)
+{
+ spin_unlock_irqrestore(&dma_spin_lock, flags);
+}
+#endif /* CONFIG_ISA_DMA_API */
+
+/* enable/disable a specific DMA channel */
+static inline void enable_dma(unsigned int dmanr)
+{
+ if (dmanr <= 3)
+ dma_outb(dmanr, DMA1_MASK_REG);
+ else
+ dma_outb(dmanr & 3, DMA2_MASK_REG);
+}
+
+static inline void disable_dma(unsigned int dmanr)
+{
+ if (dmanr <= 3)
+ dma_outb(dmanr | 4, DMA1_MASK_REG);
+ else
+ dma_outb((dmanr & 3) | 4, DMA2_MASK_REG);
+}
+
+/* Clear the 'DMA Pointer Flip Flop'.
+ * Write 0 for LSB/MSB, 1 for MSB/LSB access.
+ * Use this once to initialize the FF to a known state.
+ * After that, keep track of it. :-)
+ * --- In order to do that, the DMA routines below should ---
+ * --- only be used while holding the DMA lock ! ---
+ */
+static inline void clear_dma_ff(unsigned int dmanr)
+{
+ if (dmanr <= 3)
+ dma_outb(0, DMA1_CLEAR_FF_REG);
+ else
+ dma_outb(0, DMA2_CLEAR_FF_REG);
+}
+
+/* set mode (above) for a specific DMA channel */
+static inline void set_dma_mode(unsigned int dmanr, char mode)
+{
+ if (dmanr <= 3)
+ dma_outb(mode | dmanr, DMA1_MODE_REG);
+ else
+ dma_outb(mode | (dmanr & 3), DMA2_MODE_REG);
+}
+
+/* Set only the page register bits of the transfer address.
+ * This is used for successive transfers when we know the contents of
+ * the lower 16 bits of the DMA current address register, but a 64k boundary
+ * may have been crossed.
+ */
+static inline void set_dma_page(unsigned int dmanr, char pagenr)
+{
+ switch (dmanr) {
+ case 0:
+ dma_outb(pagenr, DMA_PAGE_0);
+ break;
+ case 1:
+ dma_outb(pagenr, DMA_PAGE_1);
+ break;
+ case 2:
+ dma_outb(pagenr, DMA_PAGE_2);
+ break;
+ case 3:
+ dma_outb(pagenr, DMA_PAGE_3);
+ break;
+ case 5:
+ dma_outb(pagenr & 0xfe, DMA_PAGE_5);
+ break;
+ case 6:
+ dma_outb(pagenr & 0xfe, DMA_PAGE_6);
+ break;
+ case 7:
+ dma_outb(pagenr & 0xfe, DMA_PAGE_7);
+ break;
+ }
+}
+
+
+/* Set transfer address & page bits for specific DMA channel.
+ * Assumes dma flipflop is clear.
+ */
+static inline void set_dma_addr(unsigned int dmanr, unsigned int a)
+{
+ set_dma_page(dmanr, a>>16);
+ if (dmanr <= 3) {
+ dma_outb(a & 0xff, ((dmanr & 3) << 1) + IO_DMA1_BASE);
+ dma_outb((a >> 8) & 0xff, ((dmanr & 3) << 1) + IO_DMA1_BASE);
+ } else {
+ dma_outb((a >> 1) & 0xff, ((dmanr & 3) << 2) + IO_DMA2_BASE);
+ dma_outb((a >> 9) & 0xff, ((dmanr & 3) << 2) + IO_DMA2_BASE);
+ }
+}
+
+
+/* Set transfer size (max 64k for DMA0..3, 128k for DMA5..7) for
+ * a specific DMA channel.
+ * You must ensure the parameters are valid.
+ * NOTE: from a manual: "the number of transfers is one more
+ * than the initial word count"! This is taken into account.
+ * Assumes dma flip-flop is clear.
+ * NOTE 2: "count" represents _bytes_ and must be even for channels 5-7.
+ */
+static inline void set_dma_count(unsigned int dmanr, unsigned int count)
+{
+ count--;
+ if (dmanr <= 3) {
+ dma_outb(count & 0xff, ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE);
+ dma_outb((count >> 8) & 0xff,
+ ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE);
+ } else {
+ dma_outb((count >> 1) & 0xff,
+ ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE);
+ dma_outb((count >> 9) & 0xff,
+ ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE);
+ }
+}
+
+
+/* Get DMA residue count. After a DMA transfer, this
+ * should return zero. Reading this while a DMA transfer is
+ * still in progress will return unpredictable results.
+ * If called before the channel has been used, it may return 1.
+ * Otherwise, it returns the number of _bytes_ left to transfer.
+ *
+ * Assumes DMA flip-flop is clear.
+ */
+static inline int get_dma_residue(unsigned int dmanr)
+{
+ unsigned int io_port;
+ /* using short to get 16-bit wrap around */
+ unsigned short count;
+
+ io_port = (dmanr <= 3) ? ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE
+ : ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE;
+
+ count = 1 + dma_inb(io_port);
+ count += dma_inb(io_port) << 8;
+
+ return (dmanr <= 3) ? count : (count << 1);
+}
+
+
+/* These are in kernel/dma.c because x86 uses CONFIG_GENERIC_ISA_DMA */
+#ifdef CONFIG_ISA_DMA_API
+extern int request_dma(unsigned int dmanr, const char *device_id);
+extern void free_dma(unsigned int dmanr);
+#endif
+
+/* From PCI */
+
+#ifdef CONFIG_PCI
+extern int isa_dma_bridge_buggy;
+#else
+#define isa_dma_bridge_buggy (0)
+#endif
+
+#endif /* _ASM_X86_DMA_H */
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
new file mode 100644
index 000000000..b825cb201
--- /dev/null
+++ b/arch/x86/include/asm/dmi.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_DMI_H
+#define _ASM_X86_DMI_H
+
+#include <linux/compiler.h>
+#include <linux/init.h>
+#include <linux/io.h>
+
+#include <asm/setup.h>
+
+static __always_inline __init void *dmi_alloc(unsigned len)
+{
+ return extend_brk(len, sizeof(int));
+}
+
+/* Use early IO mappings for DMI because it's initialized early */
+#define dmi_early_remap early_memremap
+#define dmi_early_unmap early_memunmap
+#define dmi_remap(_x, _l) memremap(_x, _l, MEMREMAP_WB)
+#define dmi_unmap(_x) memunmap(_x)
+
+#endif /* _ASM_X86_DMI_H */
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
new file mode 100644
index 000000000..ae391f609
--- /dev/null
+++ b/arch/x86/include/asm/dwarf2.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_DWARF2_H
+#define _ASM_X86_DWARF2_H
+
+#ifndef __ASSEMBLY__
+#warning "asm/dwarf2.h should be only included in pure assembly files"
+#endif
+
+/*
+ * Macros for dwarf2 CFI unwind table entries.
+ * See "as.info" for details on these pseudo ops. Unfortunately
+ * they are only supported in very new binutils, so define them
+ * away for older version.
+ */
+
+#ifdef CONFIG_AS_CFI
+
+#define CFI_STARTPROC .cfi_startproc
+#define CFI_ENDPROC .cfi_endproc
+#define CFI_DEF_CFA .cfi_def_cfa
+#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register
+#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset
+#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset
+#define CFI_OFFSET .cfi_offset
+#define CFI_REL_OFFSET .cfi_rel_offset
+#define CFI_REGISTER .cfi_register
+#define CFI_RESTORE .cfi_restore
+#define CFI_REMEMBER_STATE .cfi_remember_state
+#define CFI_RESTORE_STATE .cfi_restore_state
+#define CFI_UNDEFINED .cfi_undefined
+#define CFI_ESCAPE .cfi_escape
+
+#ifdef CONFIG_AS_CFI_SIGNAL_FRAME
+#define CFI_SIGNAL_FRAME .cfi_signal_frame
+#else
+#define CFI_SIGNAL_FRAME
+#endif
+
+#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__)
+#ifndef BUILD_VDSO
+ /*
+ * Emit CFI data in .debug_frame sections, not .eh_frame sections.
+ * The latter we currently just discard since we don't do DWARF
+ * unwinding at runtime. So only the offline DWARF information is
+ * useful to anyone. Note we should not use this directive if
+ * vmlinux.lds.S gets changed so it doesn't discard .eh_frame.
+ */
+ .cfi_sections .debug_frame
+#else
+ /*
+ * For the vDSO, emit both runtime unwind information and debug
+ * symbols for the .dbg file.
+ */
+ .cfi_sections .eh_frame, .debug_frame
+#endif
+#endif
+
+#else
+
+/*
+ * Due to the structure of pre-exisiting code, don't use assembler line
+ * comment character # to ignore the arguments. Instead, use a dummy macro.
+ */
+.macro cfi_ignore a=0, b=0, c=0, d=0
+.endm
+
+#define CFI_STARTPROC cfi_ignore
+#define CFI_ENDPROC cfi_ignore
+#define CFI_DEF_CFA cfi_ignore
+#define CFI_DEF_CFA_REGISTER cfi_ignore
+#define CFI_DEF_CFA_OFFSET cfi_ignore
+#define CFI_ADJUST_CFA_OFFSET cfi_ignore
+#define CFI_OFFSET cfi_ignore
+#define CFI_REL_OFFSET cfi_ignore
+#define CFI_REGISTER cfi_ignore
+#define CFI_RESTORE cfi_ignore
+#define CFI_REMEMBER_STATE cfi_ignore
+#define CFI_RESTORE_STATE cfi_ignore
+#define CFI_UNDEFINED cfi_ignore
+#define CFI_ESCAPE cfi_ignore
+#define CFI_SIGNAL_FRAME cfi_ignore
+
+#endif
+
+#endif /* _ASM_X86_DWARF2_H */
diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
new file mode 100644
index 000000000..62be73b23
--- /dev/null
+++ b/arch/x86/include/asm/e820/api.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_E820_API_H
+#define _ASM_E820_API_H
+
+#include <asm/e820/types.h>
+
+extern struct e820_table *e820_table;
+extern struct e820_table *e820_table_kexec;
+extern struct e820_table *e820_table_firmware;
+
+extern unsigned long pci_mem_start;
+
+extern bool e820__mapped_any(u64 start, u64 end, enum e820_type type);
+extern bool e820__mapped_all(u64 start, u64 end, enum e820_type type);
+
+extern void e820__range_add (u64 start, u64 size, enum e820_type type);
+extern u64 e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type);
+extern u64 e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type);
+
+extern void e820__print_table(char *who);
+extern int e820__update_table(struct e820_table *table);
+extern void e820__update_table_print(void);
+
+extern unsigned long e820__end_of_ram_pfn(void);
+extern unsigned long e820__end_of_low_ram_pfn(void);
+
+extern u64 e820__memblock_alloc_reserved(u64 size, u64 align);
+extern void e820__memblock_setup(void);
+
+extern void e820__reserve_setup_data(void);
+extern void e820__finish_early_params(void);
+extern void e820__reserve_resources(void);
+extern void e820__reserve_resources_late(void);
+
+extern void e820__memory_setup(void);
+extern void e820__memory_setup_extended(u64 phys_addr, u32 data_len);
+extern char *e820__memory_setup_default(void);
+extern void e820__setup_pci_gap(void);
+
+extern void e820__reallocate_tables(void);
+extern void e820__register_nosave_regions(unsigned long limit_pfn);
+
+extern int e820__get_entry_type(u64 start, u64 end);
+
+/*
+ * Returns true iff the specified range [start,end) is completely contained inside
+ * the ISA region.
+ */
+static inline bool is_ISA_range(u64 start, u64 end)
+{
+ return start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS;
+}
+
+#endif /* _ASM_E820_API_H */
diff --git a/arch/x86/include/asm/e820/types.h b/arch/x86/include/asm/e820/types.h
new file mode 100644
index 000000000..c3aa4b5e4
--- /dev/null
+++ b/arch/x86/include/asm/e820/types.h
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_E820_TYPES_H
+#define _ASM_E820_TYPES_H
+
+#include <uapi/asm/bootparam.h>
+
+/*
+ * These are the E820 types known to the kernel:
+ */
+enum e820_type {
+ E820_TYPE_RAM = 1,
+ E820_TYPE_RESERVED = 2,
+ E820_TYPE_ACPI = 3,
+ E820_TYPE_NVS = 4,
+ E820_TYPE_UNUSABLE = 5,
+ E820_TYPE_PMEM = 7,
+
+ /*
+ * This is a non-standardized way to represent ADR or
+ * NVDIMM regions that persist over a reboot.
+ *
+ * The kernel will ignore their special capabilities
+ * unless the CONFIG_X86_PMEM_LEGACY=y option is set.
+ *
+ * ( Note that older platforms also used 6 for the same
+ * type of memory, but newer versions switched to 12 as
+ * 6 was assigned differently. Some time they will learn... )
+ */
+ E820_TYPE_PRAM = 12,
+
+ /*
+ * Reserved RAM used by the kernel itself if
+ * CONFIG_INTEL_TXT=y is enabled, memory of this type
+ * will be included in the S3 integrity calculation
+ * and so should not include any memory that the BIOS
+ * might alter over the S3 transition:
+ */
+ E820_TYPE_RESERVED_KERN = 128,
+};
+
+/*
+ * A single E820 map entry, describing a memory range of [addr...addr+size-1],
+ * of 'type' memory type:
+ *
+ * (We pack it because there can be thousands of them on large systems.)
+ */
+struct e820_entry {
+ u64 addr;
+ u64 size;
+ enum e820_type type;
+} __attribute__((packed));
+
+/*
+ * The legacy E820 BIOS limits us to 128 (E820_MAX_ENTRIES_ZEROPAGE) nodes
+ * due to the constrained space in the zeropage.
+ *
+ * On large systems we can easily have thousands of nodes with RAM,
+ * which cannot be fit into so few entries - so we have a mechanism
+ * to extend the e820 table size at build-time, via the E820_MAX_ENTRIES
+ * define below.
+ *
+ * ( Those extra entries are enumerated via the EFI memory map, not
+ * via the legacy zeropage mechanism. )
+ *
+ * Size our internal memory map tables to have room for these additional
+ * entries, based on a heuristic calculation: up to three entries per
+ * NUMA node, plus E820_MAX_ENTRIES_ZEROPAGE for some extra space.
+ *
+ * This allows for bootstrap/firmware quirks such as possible duplicate
+ * E820 entries that might need room in the same arrays, prior to the
+ * call to e820__update_table() to remove duplicates. The allowance
+ * of three memory map entries per node is "enough" entries for
+ * the initial hardware platform motivating this mechanism to make
+ * use of additional EFI map entries. Future platforms may want
+ * to allow more than three entries per node or otherwise refine
+ * this size.
+ */
+
+#include <linux/numa.h>
+
+#define E820_MAX_ENTRIES (E820_MAX_ENTRIES_ZEROPAGE + 3*MAX_NUMNODES)
+
+/*
+ * The whole array of E820 entries:
+ */
+struct e820_table {
+ __u32 nr_entries;
+ struct e820_entry entries[E820_MAX_ENTRIES];
+};
+
+/*
+ * Various well-known legacy memory ranges in physical memory:
+ */
+#define ISA_START_ADDRESS 0x000a0000
+#define ISA_END_ADDRESS 0x00100000
+
+#define BIOS_BEGIN 0x000a0000
+#define BIOS_END 0x00100000
+
+#define HIGH_MEMORY 0x00100000
+
+#define BIOS_ROM_BASE 0xffe00000
+#define BIOS_ROM_END 0xffffffff
+
+#endif /* _ASM_E820_TYPES_H */
diff --git a/arch/x86/include/asm/edac.h b/arch/x86/include/asm/edac.h
new file mode 100644
index 000000000..426fc53ff
--- /dev/null
+++ b/arch/x86/include/asm/edac.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_EDAC_H
+#define _ASM_X86_EDAC_H
+
+/* ECC atomic, DMA, SMP and interrupt safe scrub function */
+
+static inline void edac_atomic_scrub(void *va, u32 size)
+{
+ u32 i, *virt_addr = va;
+
+ /*
+ * Very carefully read and write to memory atomically so we
+ * are interrupt, DMA and SMP safe.
+ */
+ for (i = 0; i < size / 4; i++, virt_addr++)
+ asm volatile("lock; addl $0, %0"::"m" (*virt_addr));
+}
+
+#endif /* _ASM_X86_EDAC_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
new file mode 100644
index 000000000..baa549f8e
--- /dev/null
+++ b/arch/x86/include/asm/efi.h
@@ -0,0 +1,252 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_EFI_H
+#define _ASM_X86_EFI_H
+
+#include <asm/fpu/api.h>
+#include <asm/pgtable.h>
+#include <asm/processor-flags.h>
+#include <asm/tlb.h>
+#include <asm/nospec-branch.h>
+#include <asm/mmu_context.h>
+
+/*
+ * We map the EFI regions needed for runtime services non-contiguously,
+ * with preserved alignment on virtual addresses starting from -4G down
+ * for a total max space of 64G. This way, we provide for stable runtime
+ * services addresses across kernels so that a kexec'd kernel can still
+ * use them.
+ *
+ * This is the main reason why we're doing stable VA mappings for RT
+ * services.
+ *
+ * This flag is used in conjuction with a chicken bit called
+ * "efi=old_map" which can be used as a fallback to the old runtime
+ * services mapping method in case there's some b0rkage with a
+ * particular EFI implementation (haha, it is hard to hold up the
+ * sarcasm here...).
+ */
+#define EFI_OLD_MEMMAP EFI_ARCH_1
+
+#define EFI32_LOADER_SIGNATURE "EL32"
+#define EFI64_LOADER_SIGNATURE "EL64"
+
+#define MAX_CMDLINE_ADDRESS UINT_MAX
+
+#define ARCH_EFI_IRQ_FLAGS_MASK X86_EFLAGS_IF
+
+#ifdef CONFIG_X86_32
+
+extern asmlinkage unsigned long efi_call_phys(void *, ...);
+
+#define arch_efi_call_virt_setup() \
+({ \
+ kernel_fpu_begin(); \
+ firmware_restrict_branch_speculation_start(); \
+})
+
+#define arch_efi_call_virt_teardown() \
+({ \
+ firmware_restrict_branch_speculation_end(); \
+ kernel_fpu_end(); \
+})
+
+
+/*
+ * Wrap all the virtual calls in a way that forces the parameters on the stack.
+ */
+#define arch_efi_call_virt(p, f, args...) \
+({ \
+ ((efi_##f##_t __attribute__((regparm(0)))*) p->f)(args); \
+})
+
+#define efi_ioremap(addr, size, type, attr) ioremap_cache(addr, size)
+
+#else /* !CONFIG_X86_32 */
+
+#define EFI_LOADER_SIGNATURE "EL64"
+
+extern asmlinkage u64 efi_call(void *fp, ...);
+
+#define efi_call_phys(f, args...) efi_call((f), args)
+
+/*
+ * struct efi_scratch - Scratch space used while switching to/from efi_mm
+ * @phys_stack: stack used during EFI Mixed Mode
+ * @prev_mm: store/restore stolen mm_struct while switching to/from efi_mm
+ */
+struct efi_scratch {
+ u64 phys_stack;
+ struct mm_struct *prev_mm;
+} __packed;
+
+#define arch_efi_call_virt_setup() \
+({ \
+ efi_sync_low_kernel_mappings(); \
+ kernel_fpu_begin(); \
+ firmware_restrict_branch_speculation_start(); \
+ \
+ if (!efi_enabled(EFI_OLD_MEMMAP)) \
+ efi_switch_mm(&efi_mm); \
+})
+
+#define arch_efi_call_virt(p, f, args...) \
+ efi_call((void *)p->f, args) \
+
+#define arch_efi_call_virt_teardown() \
+({ \
+ if (!efi_enabled(EFI_OLD_MEMMAP)) \
+ efi_switch_mm(efi_scratch.prev_mm); \
+ \
+ firmware_restrict_branch_speculation_end(); \
+ kernel_fpu_end(); \
+})
+
+extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size,
+ u32 type, u64 attribute);
+
+#ifdef CONFIG_KASAN
+/*
+ * CONFIG_KASAN may redefine memset to __memset. __memset function is present
+ * only in kernel binary. Since the EFI stub linked into a separate binary it
+ * doesn't have __memset(). So we should use standard memset from
+ * arch/x86/boot/compressed/string.c. The same applies to memcpy and memmove.
+ */
+#undef memcpy
+#undef memset
+#undef memmove
+#endif
+
+#endif /* CONFIG_X86_32 */
+
+extern struct efi_scratch efi_scratch;
+extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable);
+extern int __init efi_memblock_x86_reserve_range(void);
+extern pgd_t * __init efi_call_phys_prolog(void);
+extern void __init efi_call_phys_epilog(pgd_t *save_pgd);
+extern void __init efi_print_memmap(void);
+extern void __init efi_memory_uc(u64 addr, unsigned long size);
+extern void __init efi_map_region(efi_memory_desc_t *md);
+extern void __init efi_map_region_fixed(efi_memory_desc_t *md);
+extern void efi_sync_low_kernel_mappings(void);
+extern int __init efi_alloc_page_tables(void);
+extern int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages);
+extern void __init old_map_region(efi_memory_desc_t *md);
+extern void __init runtime_code_page_mkexec(void);
+extern void __init efi_runtime_update_mappings(void);
+extern void __init efi_dump_pagetable(void);
+extern void __init efi_apply_memmap_quirks(void);
+extern int __init efi_reuse_config(u64 tables, int nr_tables);
+extern void efi_delete_dummy_variable(void);
+extern void efi_switch_mm(struct mm_struct *mm);
+
+struct efi_setup_data {
+ u64 fw_vendor;
+ u64 runtime;
+ u64 tables;
+ u64 smbios;
+ u64 reserved[8];
+};
+
+extern u64 efi_setup;
+
+#ifdef CONFIG_EFI
+
+static inline bool efi_is_native(void)
+{
+ return IS_ENABLED(CONFIG_X86_64) == efi_enabled(EFI_64BIT);
+}
+
+static inline bool efi_runtime_supported(void)
+{
+ if (efi_is_native())
+ return true;
+
+ if (IS_ENABLED(CONFIG_EFI_MIXED) && !efi_enabled(EFI_OLD_MEMMAP))
+ return true;
+
+ return false;
+}
+
+extern struct console early_efi_console;
+extern void parse_efi_setup(u64 phys_addr, u32 data_len);
+
+extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt);
+
+#ifdef CONFIG_EFI_MIXED
+extern void efi_thunk_runtime_setup(void);
+extern efi_status_t efi_thunk_set_virtual_address_map(
+ void *phys_set_virtual_address_map,
+ unsigned long memory_map_size,
+ unsigned long descriptor_size,
+ u32 descriptor_version,
+ efi_memory_desc_t *virtual_map);
+#else
+static inline void efi_thunk_runtime_setup(void) {}
+static inline efi_status_t efi_thunk_set_virtual_address_map(
+ void *phys_set_virtual_address_map,
+ unsigned long memory_map_size,
+ unsigned long descriptor_size,
+ u32 descriptor_version,
+ efi_memory_desc_t *virtual_map)
+{
+ return EFI_SUCCESS;
+}
+#endif /* CONFIG_EFI_MIXED */
+
+
+/* arch specific definitions used by the stub code */
+
+struct efi_config {
+ u64 image_handle;
+ u64 table;
+ u64 runtime_services;
+ u64 boot_services;
+ u64 text_output;
+ efi_status_t (*call)(unsigned long, ...);
+ bool is64;
+} __packed;
+
+__pure const struct efi_config *__efi_early(void);
+
+static inline bool efi_is_64bit(void)
+{
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return false;
+
+ if (!IS_ENABLED(CONFIG_EFI_MIXED))
+ return true;
+
+ return __efi_early()->is64;
+}
+
+#define efi_table_attr(table, attr, instance) \
+ (efi_is_64bit() ? \
+ ((table##_64_t *)(unsigned long)instance)->attr : \
+ ((table##_32_t *)(unsigned long)instance)->attr)
+
+#define efi_call_proto(protocol, f, instance, ...) \
+ __efi_early()->call(efi_table_attr(protocol, f, instance), \
+ instance, ##__VA_ARGS__)
+
+#define efi_call_early(f, ...) \
+ __efi_early()->call(efi_table_attr(efi_boot_services, f, \
+ __efi_early()->boot_services), __VA_ARGS__)
+
+#define __efi_call_early(f, ...) \
+ __efi_early()->call((unsigned long)f, __VA_ARGS__);
+
+#define efi_call_runtime(f, ...) \
+ __efi_early()->call(efi_table_attr(efi_runtime_services, f, \
+ __efi_early()->runtime_services), __VA_ARGS__)
+
+extern bool efi_reboot_required(void);
+
+#else
+static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {}
+static inline bool efi_reboot_required(void)
+{
+ return false;
+}
+#endif /* CONFIG_EFI */
+
+#endif /* _ASM_X86_EFI_H */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
new file mode 100644
index 000000000..0d157d2a1
--- /dev/null
+++ b/arch/x86/include/asm/elf.h
@@ -0,0 +1,385 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_ELF_H
+#define _ASM_X86_ELF_H
+
+/*
+ * ELF register definitions..
+ */
+#include <linux/thread_info.h>
+
+#include <asm/ptrace.h>
+#include <asm/user.h>
+#include <asm/auxvec.h>
+
+typedef unsigned long elf_greg_t;
+
+#define ELF_NGREG (sizeof(struct user_regs_struct) / sizeof(elf_greg_t))
+typedef elf_greg_t elf_gregset_t[ELF_NGREG];
+
+typedef struct user_i387_struct elf_fpregset_t;
+
+#ifdef __i386__
+
+typedef struct user_fxsr_struct elf_fpxregset_t;
+
+#define R_386_NONE 0
+#define R_386_32 1
+#define R_386_PC32 2
+#define R_386_GOT32 3
+#define R_386_PLT32 4
+#define R_386_COPY 5
+#define R_386_GLOB_DAT 6
+#define R_386_JMP_SLOT 7
+#define R_386_RELATIVE 8
+#define R_386_GOTOFF 9
+#define R_386_GOTPC 10
+#define R_386_NUM 11
+
+/*
+ * These are used to set parameters in the core dumps.
+ */
+#define ELF_CLASS ELFCLASS32
+#define ELF_DATA ELFDATA2LSB
+#define ELF_ARCH EM_386
+
+#else
+
+/* x86-64 relocation types */
+#define R_X86_64_NONE 0 /* No reloc */
+#define R_X86_64_64 1 /* Direct 64 bit */
+#define R_X86_64_PC32 2 /* PC relative 32 bit signed */
+#define R_X86_64_GOT32 3 /* 32 bit GOT entry */
+#define R_X86_64_PLT32 4 /* 32 bit PLT address */
+#define R_X86_64_COPY 5 /* Copy symbol at runtime */
+#define R_X86_64_GLOB_DAT 6 /* Create GOT entry */
+#define R_X86_64_JUMP_SLOT 7 /* Create PLT entry */
+#define R_X86_64_RELATIVE 8 /* Adjust by program base */
+#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative
+ offset to GOT */
+#define R_X86_64_32 10 /* Direct 32 bit zero extended */
+#define R_X86_64_32S 11 /* Direct 32 bit sign extended */
+#define R_X86_64_16 12 /* Direct 16 bit zero extended */
+#define R_X86_64_PC16 13 /* 16 bit sign extended pc relative */
+#define R_X86_64_8 14 /* Direct 8 bit sign extended */
+#define R_X86_64_PC8 15 /* 8 bit sign extended pc relative */
+
+#define R_X86_64_NUM 16
+
+/*
+ * These are used to set parameters in the core dumps.
+ */
+#define ELF_CLASS ELFCLASS64
+#define ELF_DATA ELFDATA2LSB
+#define ELF_ARCH EM_X86_64
+
+#endif
+
+#include <asm/vdso.h>
+
+#ifdef CONFIG_X86_64
+extern unsigned int vdso64_enabled;
+#endif
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+extern unsigned int vdso32_enabled;
+#endif
+
+/*
+ * This is used to ensure we don't load something for the wrong architecture.
+ */
+#define elf_check_arch_ia32(x) \
+ (((x)->e_machine == EM_386) || ((x)->e_machine == EM_486))
+
+#include <asm/processor.h>
+
+#ifdef CONFIG_X86_32
+#include <asm/desc.h>
+
+#define elf_check_arch(x) elf_check_arch_ia32(x)
+
+/* SVR4/i386 ABI (pages 3-31, 3-32) says that when the program starts %edx
+ contains a pointer to a function which might be registered using `atexit'.
+ This provides a mean for the dynamic linker to call DT_FINI functions for
+ shared libraries that have been loaded before the code runs.
+
+ A value of 0 tells we have no such handler.
+
+ We might as well make sure everything else is cleared too (except for %esp),
+ just to make things more deterministic.
+ */
+#define ELF_PLAT_INIT(_r, load_addr) \
+ do { \
+ _r->bx = 0; _r->cx = 0; _r->dx = 0; \
+ _r->si = 0; _r->di = 0; _r->bp = 0; \
+ _r->ax = 0; \
+} while (0)
+
+/*
+ * regs is struct pt_regs, pr_reg is elf_gregset_t (which is
+ * now struct_user_regs, they are different)
+ */
+
+#define ELF_CORE_COPY_REGS_COMMON(pr_reg, regs) \
+do { \
+ pr_reg[0] = regs->bx; \
+ pr_reg[1] = regs->cx; \
+ pr_reg[2] = regs->dx; \
+ pr_reg[3] = regs->si; \
+ pr_reg[4] = regs->di; \
+ pr_reg[5] = regs->bp; \
+ pr_reg[6] = regs->ax; \
+ pr_reg[7] = regs->ds; \
+ pr_reg[8] = regs->es; \
+ pr_reg[9] = regs->fs; \
+ pr_reg[11] = regs->orig_ax; \
+ pr_reg[12] = regs->ip; \
+ pr_reg[13] = regs->cs; \
+ pr_reg[14] = regs->flags; \
+ pr_reg[15] = regs->sp; \
+ pr_reg[16] = regs->ss; \
+} while (0);
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs) \
+do { \
+ ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\
+ pr_reg[10] = get_user_gs(regs); \
+} while (0);
+
+#define ELF_CORE_COPY_KERNEL_REGS(pr_reg, regs) \
+do { \
+ ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\
+ savesegment(gs, pr_reg[10]); \
+} while (0);
+
+#define ELF_PLATFORM (utsname()->machine)
+#define set_personality_64bit() do { } while (0)
+
+#else /* CONFIG_X86_32 */
+
+/*
+ * This is used to ensure we don't load something for the wrong architecture.
+ */
+#define elf_check_arch(x) \
+ ((x)->e_machine == EM_X86_64)
+
+#define compat_elf_check_arch(x) \
+ (elf_check_arch_ia32(x) || \
+ (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64))
+
+#if __USER32_DS != __USER_DS
+# error "The following code assumes __USER32_DS == __USER_DS"
+#endif
+
+static inline void elf_common_init(struct thread_struct *t,
+ struct pt_regs *regs, const u16 ds)
+{
+ /* ax gets execve's return value. */
+ /*regs->ax = */ regs->bx = regs->cx = regs->dx = 0;
+ regs->si = regs->di = regs->bp = 0;
+ regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0;
+ regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
+ t->fsbase = t->gsbase = 0;
+ t->fsindex = t->gsindex = 0;
+ t->ds = t->es = ds;
+}
+
+#define ELF_PLAT_INIT(_r, load_addr) \
+ elf_common_init(&current->thread, _r, 0)
+
+#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \
+ elf_common_init(&current->thread, regs, __USER_DS)
+
+void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp);
+#define compat_start_thread compat_start_thread
+
+void set_personality_ia32(bool);
+#define COMPAT_SET_PERSONALITY(ex) \
+ set_personality_ia32((ex).e_machine == EM_X86_64)
+
+#define COMPAT_ELF_PLATFORM ("i686")
+
+/*
+ * regs is struct pt_regs, pr_reg is elf_gregset_t (which is
+ * now struct_user_regs, they are different). Assumes current is the process
+ * getting dumped.
+ */
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs) \
+do { \
+ unsigned long base; \
+ unsigned v; \
+ (pr_reg)[0] = (regs)->r15; \
+ (pr_reg)[1] = (regs)->r14; \
+ (pr_reg)[2] = (regs)->r13; \
+ (pr_reg)[3] = (regs)->r12; \
+ (pr_reg)[4] = (regs)->bp; \
+ (pr_reg)[5] = (regs)->bx; \
+ (pr_reg)[6] = (regs)->r11; \
+ (pr_reg)[7] = (regs)->r10; \
+ (pr_reg)[8] = (regs)->r9; \
+ (pr_reg)[9] = (regs)->r8; \
+ (pr_reg)[10] = (regs)->ax; \
+ (pr_reg)[11] = (regs)->cx; \
+ (pr_reg)[12] = (regs)->dx; \
+ (pr_reg)[13] = (regs)->si; \
+ (pr_reg)[14] = (regs)->di; \
+ (pr_reg)[15] = (regs)->orig_ax; \
+ (pr_reg)[16] = (regs)->ip; \
+ (pr_reg)[17] = (regs)->cs; \
+ (pr_reg)[18] = (regs)->flags; \
+ (pr_reg)[19] = (regs)->sp; \
+ (pr_reg)[20] = (regs)->ss; \
+ rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \
+ rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \
+ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \
+ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
+ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \
+ asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[26] = v; \
+} while (0);
+
+/* I'm not sure if we can use '-' here */
+#define ELF_PLATFORM ("x86_64")
+extern void set_personality_64bit(void);
+extern unsigned int sysctl_vsyscall32;
+extern int force_personality32;
+
+#endif /* !CONFIG_X86_32 */
+
+#define CORE_DUMP_USE_REGSET
+#define ELF_EXEC_PAGESIZE 4096
+
+/*
+ * This is the base location for PIE (ET_DYN with INTERP) loads. On
+ * 64-bit, this is above 4GB to leave the entire 32-bit address
+ * space open for things that want to use the area for 32-bit pointers.
+ */
+#define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \
+ (DEFAULT_MAP_WINDOW / 3 * 2))
+
+/* This yields a mask that user programs can use to figure out what
+ instruction set this CPU supports. This could be done in user space,
+ but it's not easy, and we've already done it here. */
+
+#define ELF_HWCAP (boot_cpu_data.x86_capability[CPUID_1_EDX])
+
+extern u32 elf_hwcap2;
+
+/*
+ * HWCAP2 supplies mask with kernel enabled CPU features, so that
+ * the application can discover that it can safely use them.
+ * The bits are defined in uapi/asm/hwcap2.h.
+ */
+#define ELF_HWCAP2 (elf_hwcap2)
+
+/* This yields a string that ld.so will use to load implementation
+ specific libraries for optimization. This is more specific in
+ intent than poking at uname or /proc/cpuinfo.
+
+ For the moment, we have only optimizations for the Intel generations,
+ but that could change... */
+
+#define SET_PERSONALITY(ex) set_personality_64bit()
+
+/*
+ * An executable for which elf_read_implies_exec() returns TRUE will
+ * have the READ_IMPLIES_EXEC personality flag set automatically.
+ */
+#define elf_read_implies_exec(ex, executable_stack) \
+ (executable_stack != EXSTACK_DISABLE_X)
+
+struct task_struct;
+
+#define ARCH_DLINFO_IA32 \
+do { \
+ if (VDSO_CURRENT_BASE) { \
+ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \
+ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \
+ } \
+} while (0)
+
+/*
+ * True on X86_32 or when emulating IA32 on X86_64
+ */
+static inline int mmap_is_ia32(void)
+{
+ return IS_ENABLED(CONFIG_X86_32) ||
+ (IS_ENABLED(CONFIG_COMPAT) &&
+ test_thread_flag(TIF_ADDR32));
+}
+
+extern unsigned long task_size_32bit(void);
+extern unsigned long task_size_64bit(int full_addr_space);
+extern unsigned long get_mmap_base(int is_legacy);
+extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len);
+
+#ifdef CONFIG_X86_32
+
+#define __STACK_RND_MASK(is32bit) (0x7ff)
+#define STACK_RND_MASK (0x7ff)
+
+#define ARCH_DLINFO ARCH_DLINFO_IA32
+
+/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
+
+#else /* CONFIG_X86_32 */
+
+/* 1GB for 64bit, 8MB for 32bit */
+#define __STACK_RND_MASK(is32bit) ((is32bit) ? 0x7ff : 0x3fffff)
+#define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32())
+
+#define ARCH_DLINFO \
+do { \
+ if (vdso64_enabled) \
+ NEW_AUX_ENT(AT_SYSINFO_EHDR, \
+ (unsigned long __force)current->mm->context.vdso); \
+} while (0)
+
+/* As a historical oddity, the x32 and x86_64 vDSOs are controlled together. */
+#define ARCH_DLINFO_X32 \
+do { \
+ if (vdso64_enabled) \
+ NEW_AUX_ENT(AT_SYSINFO_EHDR, \
+ (unsigned long __force)current->mm->context.vdso); \
+} while (0)
+
+#define AT_SYSINFO 32
+
+#define COMPAT_ARCH_DLINFO \
+if (test_thread_flag(TIF_X32)) \
+ ARCH_DLINFO_X32; \
+else \
+ ARCH_DLINFO_IA32
+
+#define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
+
+#endif /* !CONFIG_X86_32 */
+
+#define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso)
+
+#define VDSO_ENTRY \
+ ((unsigned long)current->mm->context.vdso + \
+ vdso_image_32.sym___kernel_vsyscall)
+
+struct linux_binprm;
+
+#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
+extern int arch_setup_additional_pages(struct linux_binprm *bprm,
+ int uses_interp);
+extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
+ int uses_interp);
+#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages
+
+/* Do not change the values. See get_align_mask() */
+enum align_flags {
+ ALIGN_VA_32 = BIT(0),
+ ALIGN_VA_64 = BIT(1),
+};
+
+struct va_alignment {
+ int flags;
+ unsigned long mask;
+ unsigned long bits;
+} ____cacheline_aligned;
+
+extern struct va_alignment va_align;
+extern unsigned long align_vdso_addr(unsigned long);
+#endif /* _ASM_X86_ELF_H */
diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h
new file mode 100644
index 000000000..2abde717d
--- /dev/null
+++ b/arch/x86/include/asm/emergency-restart.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_EMERGENCY_RESTART_H
+#define _ASM_X86_EMERGENCY_RESTART_H
+
+extern void machine_emergency_restart(void);
+
+#endif /* _ASM_X86_EMERGENCY_RESTART_H */
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
new file mode 100644
index 000000000..416422762
--- /dev/null
+++ b/arch/x86/include/asm/entry_arch.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This file is designed to contain the BUILD_INTERRUPT specifications for
+ * all of the extra named interrupt vectors used by the architecture.
+ * Usually this is the Inter Process Interrupts (IPIs)
+ */
+
+/*
+ * The following vectors are part of the Linux architecture, there
+ * is no hardware IRQ pin equivalent for them, they are triggered
+ * through the ICC by us (IPIs)
+ */
+#ifdef CONFIG_SMP
+BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
+BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
+BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
+BUILD_INTERRUPT(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR)
+BUILD_INTERRUPT(reboot_interrupt, REBOOT_VECTOR)
+#endif
+
+#ifdef CONFIG_HAVE_KVM
+BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR)
+BUILD_INTERRUPT(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR)
+BUILD_INTERRUPT(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR)
+#endif
+
+/*
+ * every pentium local APIC has two 'local interrupts', with a
+ * soft-definable vector attached to both interrupts, one of
+ * which is a timer interrupt, the other one is error counter
+ * overflow. Linux uses the local APIC timer interrupt to get
+ * a much simpler SMP time architecture:
+ */
+#ifdef CONFIG_X86_LOCAL_APIC
+
+BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
+BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
+BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
+BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
+
+#ifdef CONFIG_IRQ_WORK
+BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR)
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR)
+#endif
+#endif
diff --git a/arch/x86/include/asm/error-injection.h b/arch/x86/include/asm/error-injection.h
new file mode 100644
index 000000000..47b7a1296
--- /dev/null
+++ b/arch/x86/include/asm/error-injection.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_ERROR_INJECTION_H
+#define _ASM_ERROR_INJECTION_H
+
+#include <linux/compiler.h>
+#include <linux/linkage.h>
+#include <asm/ptrace.h>
+#include <asm-generic/error-injection.h>
+
+asmlinkage void just_return_func(void);
+void override_function_with_return(struct pt_regs *regs);
+
+#endif /* _ASM_ERROR_INJECTION_H */
diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
new file mode 100644
index 000000000..6777480d8
--- /dev/null
+++ b/arch/x86/include/asm/espfix.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_ESPFIX_H
+#define _ASM_X86_ESPFIX_H
+
+#ifdef CONFIG_X86_ESPFIX64
+
+#include <asm/percpu.h>
+
+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+extern void init_espfix_bsp(void);
+extern void init_espfix_ap(int cpu);
+#else
+static inline void init_espfix_ap(int cpu) { }
+#endif
+
+#endif /* _ASM_X86_ESPFIX_H */
diff --git a/arch/x86/include/asm/exec.h b/arch/x86/include/asm/exec.h
new file mode 100644
index 000000000..54c2e1db2
--- /dev/null
+++ b/arch/x86/include/asm/exec.h
@@ -0,0 +1 @@
+/* define arch_align_stack() here */
diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h
new file mode 100644
index 000000000..f9c3a5d50
--- /dev/null
+++ b/arch/x86/include/asm/extable.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_EXTABLE_H
+#define _ASM_X86_EXTABLE_H
+/*
+ * The exception table consists of triples of addresses relative to the
+ * exception table entry itself. The first address is of an instruction
+ * that is allowed to fault, the second is the target at which the program
+ * should continue. The third is a handler function to deal with the fault
+ * caused by the instruction in the first field.
+ *
+ * All the routines below use bits of fixup code that are out of line
+ * with the main instruction path. This means when everything is well,
+ * we don't even have to jump over them. Further, they do not intrude
+ * on our cache or tlb entries.
+ */
+
+struct exception_table_entry {
+ int insn, fixup, handler;
+};
+struct pt_regs;
+
+#define ARCH_HAS_RELATIVE_EXTABLE
+
+#define swap_ex_entry_fixup(a, b, tmp, delta) \
+ do { \
+ (a)->fixup = (b)->fixup + (delta); \
+ (b)->fixup = (tmp).fixup - (delta); \
+ (a)->handler = (b)->handler + (delta); \
+ (b)->handler = (tmp).handler - (delta); \
+ } while (0)
+
+extern int fixup_exception(struct pt_regs *regs, int trapnr);
+extern int fixup_bug(struct pt_regs *regs, int trapnr);
+extern bool ex_has_fault_handler(unsigned long ip);
+extern void early_fixup_exception(struct pt_regs *regs, int trapnr);
+
+#endif
diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h
new file mode 100644
index 000000000..ab4c96014
--- /dev/null
+++ b/arch/x86/include/asm/fb.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_FB_H
+#define _ASM_X86_FB_H
+
+#include <linux/fb.h>
+#include <linux/fs.h>
+#include <asm/page.h>
+
+static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma,
+ unsigned long off)
+{
+ unsigned long prot;
+
+ prot = pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK;
+ if (boot_cpu_data.x86 > 3)
+ pgprot_val(vma->vm_page_prot) =
+ prot | cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS);
+}
+
+extern int fb_is_primary_device(struct fb_info *info);
+
+#endif /* _ASM_X86_FB_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
new file mode 100644
index 000000000..5e12b2319
--- /dev/null
+++ b/arch/x86/include/asm/fixmap.h
@@ -0,0 +1,202 @@
+/*
+ * fixmap.h: compile-time virtual memory allocation
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998 Ingo Molnar
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ * x86_32 and x86_64 integration by Gustavo F. Padovan, February 2009
+ */
+
+#ifndef _ASM_X86_FIXMAP_H
+#define _ASM_X86_FIXMAP_H
+
+/*
+ * Exposed to assembly code for setting up initial page tables. Cannot be
+ * calculated in assembly code (fixmap entries are an enum), but is sanity
+ * checked in the actual fixmap C code to make sure that the fixmap is
+ * covered fully.
+ */
+#define FIXMAP_PMD_NUM 2
+/* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */
+#define FIXMAP_PMD_TOP 507
+
+#ifndef __ASSEMBLY__
+#include <linux/kernel.h>
+#include <asm/acpi.h>
+#include <asm/apicdef.h>
+#include <asm/page.h>
+#ifdef CONFIG_X86_32
+#include <linux/threads.h>
+#include <asm/kmap_types.h>
+#else
+#include <uapi/asm/vsyscall.h>
+#endif
+
+/*
+ * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall
+ * uses fixmaps that relies on FIXADDR_TOP for proper address calculation.
+ * Because of this, FIXADDR_TOP x86 integration was left as later work.
+ */
+#ifdef CONFIG_X86_32
+/* used by vmalloc.c, vsyscall.lds.S.
+ *
+ * Leave one empty page between vmalloc'ed areas and
+ * the start of the fixmap.
+ */
+extern unsigned long __FIXADDR_TOP;
+#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
+#else
+#define FIXADDR_TOP (round_up(VSYSCALL_ADDR + PAGE_SIZE, 1<<PMD_SHIFT) - \
+ PAGE_SIZE)
+#endif
+
+/*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process.
+ * for x86_32: We allocate these special addresses
+ * from the end of virtual memory (0xfffff000) backwards.
+ * Also this lets us do fail-safe vmalloc(), we
+ * can guarantee that these special addresses and
+ * vmalloc()-ed addresses never overlap.
+ *
+ * These 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages (or larger if used with an increment
+ * higher than 1). Use set_fixmap(idx,phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
+ */
+enum fixed_addresses {
+#ifdef CONFIG_X86_32
+ FIX_HOLE,
+#else
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+ VSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT,
+#endif
+#endif
+ FIX_DBGP_BASE,
+ FIX_EARLYCON_MEM_BASE,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ FIX_OHCI1394_BASE,
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+ FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
+#endif
+#ifdef CONFIG_X86_IO_APIC
+ FIX_IO_APIC_BASE_0,
+ FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
+#endif
+#ifdef CONFIG_X86_32
+ FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
+ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+#ifdef CONFIG_PCI_MMCONFIG
+ FIX_PCIE_MCFG,
+#endif
+#endif
+#ifdef CONFIG_PARAVIRT
+ FIX_PARAVIRT_BOOTMAP,
+#endif
+ FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
+ FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
+#ifdef CONFIG_X86_INTEL_MID
+ FIX_LNW_VRTC,
+#endif
+
+#ifdef CONFIG_ACPI_APEI_GHES
+ /* Used for GHES mapping from assorted contexts */
+ FIX_APEI_GHES_IRQ,
+ FIX_APEI_GHES_NMI,
+#endif
+
+ __end_of_permanent_fixed_addresses,
+
+ /*
+ * 512 temporary boot-time mappings, used by early_ioremap(),
+ * before ioremap() is functional.
+ *
+ * If necessary we round it up to the next 512 pages boundary so
+ * that we can have a single pgd entry and a single pte table:
+ */
+#define NR_FIX_BTMAPS 64
+#define FIX_BTMAPS_SLOTS 8
+#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)
+ FIX_BTMAP_END =
+ (__end_of_permanent_fixed_addresses ^
+ (__end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS - 1)) &
+ -PTRS_PER_PTE
+ ? __end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS -
+ (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1))
+ : __end_of_permanent_fixed_addresses,
+ FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,
+#ifdef CONFIG_X86_32
+ FIX_WP_TEST,
+#endif
+#ifdef CONFIG_INTEL_TXT
+ FIX_TBOOT_BASE,
+#endif
+ __end_of_fixed_addresses
+};
+
+
+extern void reserve_top_address(unsigned long reserve);
+
+#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
+#define FIXADDR_TOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_TOT_START (FIXADDR_TOP - FIXADDR_TOT_SIZE)
+
+extern int fixmaps_set;
+
+extern pte_t *kmap_pte;
+#define kmap_prot PAGE_KERNEL
+extern pte_t *pkmap_page_table;
+
+void __native_set_fixmap(enum fixed_addresses idx, pte_t pte);
+void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
+ phys_addr_t phys, pgprot_t flags);
+
+#ifndef CONFIG_PARAVIRT
+static inline void __set_fixmap(enum fixed_addresses idx,
+ phys_addr_t phys, pgprot_t flags)
+{
+ native_set_fixmap(idx, phys, flags);
+}
+#endif
+
+/*
+ * FIXMAP_PAGE_NOCACHE is used for MMIO. Memory encryption is not
+ * supported for MMIO addresses, so make sure that the memory encryption
+ * mask is not part of the page attributes.
+ */
+#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE
+
+/*
+ * Early memremap routines used for in-place encryption. The mappings created
+ * by these routines are intended to be used as temporary mappings.
+ */
+void __init *early_memremap_encrypted(resource_size_t phys_addr,
+ unsigned long size);
+void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
+ unsigned long size);
+void __init *early_memremap_decrypted(resource_size_t phys_addr,
+ unsigned long size);
+void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
+ unsigned long size);
+
+#include <asm-generic/fixmap.h>
+
+#define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags)
+#define __late_clear_fixmap(idx) __set_fixmap(idx, 0, __pgprot(0))
+
+void __early_set_fixmap(enum fixed_addresses idx,
+ phys_addr_t phys, pgprot_t flags);
+
+#endif /* !__ASSEMBLY__ */
+#endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h
new file mode 100644
index 000000000..7ec59edde
--- /dev/null
+++ b/arch/x86/include/asm/floppy.h
@@ -0,0 +1,281 @@
+/*
+ * Architecture specific parts of the Floppy driver
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1995
+ */
+#ifndef _ASM_X86_FLOPPY_H
+#define _ASM_X86_FLOPPY_H
+
+#include <linux/vmalloc.h>
+
+/*
+ * The DMA channel used by the floppy controller cannot access data at
+ * addresses >= 16MB
+ *
+ * Went back to the 1MB limit, as some people had problems with the floppy
+ * driver otherwise. It doesn't matter much for performance anyway, as most
+ * floppy accesses go through the track buffer.
+ */
+#define _CROSS_64KB(a, s, vdma) \
+ (!(vdma) && \
+ ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64))
+
+#define CROSS_64KB(a, s) _CROSS_64KB(a, s, use_virtual_dma & 1)
+
+
+#define SW fd_routine[use_virtual_dma & 1]
+#define CSW fd_routine[can_use_virtual_dma & 1]
+
+
+#define fd_inb(port) inb_p(port)
+#define fd_outb(value, port) outb_p(value, port)
+
+#define fd_request_dma() CSW._request_dma(FLOPPY_DMA, "floppy")
+#define fd_free_dma() CSW._free_dma(FLOPPY_DMA)
+#define fd_enable_irq() enable_irq(FLOPPY_IRQ)
+#define fd_disable_irq() disable_irq(FLOPPY_IRQ)
+#define fd_free_irq() free_irq(FLOPPY_IRQ, NULL)
+#define fd_get_dma_residue() SW._get_dma_residue(FLOPPY_DMA)
+#define fd_dma_mem_alloc(size) SW._dma_mem_alloc(size)
+#define fd_dma_setup(addr, size, mode, io) SW._dma_setup(addr, size, mode, io)
+
+#define FLOPPY_CAN_FALLBACK_ON_NODMA
+
+static int virtual_dma_count;
+static int virtual_dma_residue;
+static char *virtual_dma_addr;
+static int virtual_dma_mode;
+static int doing_pdma;
+
+static irqreturn_t floppy_hardint(int irq, void *dev_id)
+{
+ unsigned char st;
+
+#undef TRACE_FLPY_INT
+
+#ifdef TRACE_FLPY_INT
+ static int calls;
+ static int bytes;
+ static int dma_wait;
+#endif
+ if (!doing_pdma)
+ return floppy_interrupt(irq, dev_id);
+
+#ifdef TRACE_FLPY_INT
+ if (!calls)
+ bytes = virtual_dma_count;
+#endif
+
+ {
+ int lcount;
+ char *lptr;
+
+ st = 1;
+ for (lcount = virtual_dma_count, lptr = virtual_dma_addr;
+ lcount; lcount--, lptr++) {
+ st = inb(virtual_dma_port + 4) & 0xa0;
+ if (st != 0xa0)
+ break;
+ if (virtual_dma_mode)
+ outb_p(*lptr, virtual_dma_port + 5);
+ else
+ *lptr = inb_p(virtual_dma_port + 5);
+ }
+ virtual_dma_count = lcount;
+ virtual_dma_addr = lptr;
+ st = inb(virtual_dma_port + 4);
+ }
+
+#ifdef TRACE_FLPY_INT
+ calls++;
+#endif
+ if (st == 0x20)
+ return IRQ_HANDLED;
+ if (!(st & 0x20)) {
+ virtual_dma_residue += virtual_dma_count;
+ virtual_dma_count = 0;
+#ifdef TRACE_FLPY_INT
+ printk(KERN_DEBUG "count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n",
+ virtual_dma_count, virtual_dma_residue, calls, bytes,
+ dma_wait);
+ calls = 0;
+ dma_wait = 0;
+#endif
+ doing_pdma = 0;
+ floppy_interrupt(irq, dev_id);
+ return IRQ_HANDLED;
+ }
+#ifdef TRACE_FLPY_INT
+ if (!virtual_dma_count)
+ dma_wait++;
+#endif
+ return IRQ_HANDLED;
+}
+
+static void fd_disable_dma(void)
+{
+ if (!(can_use_virtual_dma & 1))
+ disable_dma(FLOPPY_DMA);
+ doing_pdma = 0;
+ virtual_dma_residue += virtual_dma_count;
+ virtual_dma_count = 0;
+}
+
+static int vdma_request_dma(unsigned int dmanr, const char *device_id)
+{
+ return 0;
+}
+
+static void vdma_nop(unsigned int dummy)
+{
+}
+
+
+static int vdma_get_dma_residue(unsigned int dummy)
+{
+ return virtual_dma_count + virtual_dma_residue;
+}
+
+
+static int fd_request_irq(void)
+{
+ if (can_use_virtual_dma)
+ return request_irq(FLOPPY_IRQ, floppy_hardint,
+ 0, "floppy", NULL);
+ else
+ return request_irq(FLOPPY_IRQ, floppy_interrupt,
+ 0, "floppy", NULL);
+}
+
+static unsigned long dma_mem_alloc(unsigned long size)
+{
+ return __get_dma_pages(GFP_KERNEL|__GFP_NORETRY, get_order(size));
+}
+
+
+static unsigned long vdma_mem_alloc(unsigned long size)
+{
+ return (unsigned long)vmalloc(size);
+
+}
+
+#define nodma_mem_alloc(size) vdma_mem_alloc(size)
+
+static void _fd_dma_mem_free(unsigned long addr, unsigned long size)
+{
+ if ((unsigned long)addr >= (unsigned long)high_memory)
+ vfree((void *)addr);
+ else
+ free_pages(addr, get_order(size));
+}
+
+#define fd_dma_mem_free(addr, size) _fd_dma_mem_free(addr, size)
+
+static void _fd_chose_dma_mode(char *addr, unsigned long size)
+{
+ if (can_use_virtual_dma == 2) {
+ if ((unsigned long)addr >= (unsigned long)high_memory ||
+ isa_virt_to_bus(addr) >= 0x1000000 ||
+ _CROSS_64KB(addr, size, 0))
+ use_virtual_dma = 1;
+ else
+ use_virtual_dma = 0;
+ } else {
+ use_virtual_dma = can_use_virtual_dma & 1;
+ }
+}
+
+#define fd_chose_dma_mode(addr, size) _fd_chose_dma_mode(addr, size)
+
+
+static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io)
+{
+ doing_pdma = 1;
+ virtual_dma_port = io;
+ virtual_dma_mode = (mode == DMA_MODE_WRITE);
+ virtual_dma_addr = addr;
+ virtual_dma_count = size;
+ virtual_dma_residue = 0;
+ return 0;
+}
+
+static int hard_dma_setup(char *addr, unsigned long size, int mode, int io)
+{
+#ifdef FLOPPY_SANITY_CHECK
+ if (CROSS_64KB(addr, size)) {
+ printk("DMA crossing 64-K boundary %p-%p\n", addr, addr+size);
+ return -1;
+ }
+#endif
+ /* actual, physical DMA */
+ doing_pdma = 0;
+ clear_dma_ff(FLOPPY_DMA);
+ set_dma_mode(FLOPPY_DMA, mode);
+ set_dma_addr(FLOPPY_DMA, isa_virt_to_bus(addr));
+ set_dma_count(FLOPPY_DMA, size);
+ enable_dma(FLOPPY_DMA);
+ return 0;
+}
+
+static struct fd_routine_l {
+ int (*_request_dma)(unsigned int dmanr, const char *device_id);
+ void (*_free_dma)(unsigned int dmanr);
+ int (*_get_dma_residue)(unsigned int dummy);
+ unsigned long (*_dma_mem_alloc)(unsigned long size);
+ int (*_dma_setup)(char *addr, unsigned long size, int mode, int io);
+} fd_routine[] = {
+ {
+ ._request_dma = request_dma,
+ ._free_dma = free_dma,
+ ._get_dma_residue = get_dma_residue,
+ ._dma_mem_alloc = dma_mem_alloc,
+ ._dma_setup = hard_dma_setup
+ },
+ {
+ ._request_dma = vdma_request_dma,
+ ._free_dma = vdma_nop,
+ ._get_dma_residue = vdma_get_dma_residue,
+ ._dma_mem_alloc = vdma_mem_alloc,
+ ._dma_setup = vdma_dma_setup
+ }
+};
+
+
+static int FDC1 = 0x3f0;
+static int FDC2 = -1;
+
+/*
+ * Floppy types are stored in the rtc's CMOS RAM and so rtc_lock
+ * is needed to prevent corrupted CMOS RAM in case "insmod floppy"
+ * coincides with another rtc CMOS user. Paul G.
+ */
+#define FLOPPY0_TYPE \
+({ \
+ unsigned long flags; \
+ unsigned char val; \
+ spin_lock_irqsave(&rtc_lock, flags); \
+ val = (CMOS_READ(0x10) >> 4) & 15; \
+ spin_unlock_irqrestore(&rtc_lock, flags); \
+ val; \
+})
+
+#define FLOPPY1_TYPE \
+({ \
+ unsigned long flags; \
+ unsigned char val; \
+ spin_lock_irqsave(&rtc_lock, flags); \
+ val = CMOS_READ(0x10) & 15; \
+ spin_unlock_irqrestore(&rtc_lock, flags); \
+ val; \
+})
+
+#define N_FDC 2
+#define N_DRIVE 8
+
+#define EXTRA_FLOPPY_PARAMS
+
+#endif /* _ASM_X86_FLOPPY_H */
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
new file mode 100644
index 000000000..b56d504af
--- /dev/null
+++ b/arch/x86/include/asm/fpu/api.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * General FPU state handling cleanups
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ * x86-64 work by Andi Kleen 2002
+ */
+
+#ifndef _ASM_X86_FPU_API_H
+#define _ASM_X86_FPU_API_H
+
+/*
+ * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
+ * disables preemption so be careful if you intend to use it for long periods
+ * of time.
+ * If you intend to use the FPU in softirq you need to check first with
+ * irq_fpu_usable() if it is possible.
+ */
+extern void kernel_fpu_begin(void);
+extern void kernel_fpu_end(void);
+extern bool irq_fpu_usable(void);
+
+/*
+ * Query the presence of one or more xfeatures. Works on any legacy CPU as well.
+ *
+ * If 'feature_name' is set then put a human-readable description of
+ * the feature there as well - this can be used to print error (or success)
+ * messages.
+ */
+extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name);
+
+#endif /* _ASM_X86_FPU_API_H */
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
new file mode 100644
index 000000000..4f274d851
--- /dev/null
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -0,0 +1,601 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * General FPU state handling cleanups
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ * x86-64 work by Andi Kleen 2002
+ */
+
+#ifndef _ASM_X86_FPU_INTERNAL_H
+#define _ASM_X86_FPU_INTERNAL_H
+
+#include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include <asm/user.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/xstate.h>
+#include <asm/cpufeature.h>
+#include <asm/trace/fpu.h>
+
+/*
+ * High level FPU state handling functions:
+ */
+extern void fpu__initialize(struct fpu *fpu);
+extern void fpu__prepare_read(struct fpu *fpu);
+extern void fpu__prepare_write(struct fpu *fpu);
+extern void fpu__save(struct fpu *fpu);
+extern void fpu__restore(struct fpu *fpu);
+extern int fpu__restore_sig(void __user *buf, int ia32_frame);
+extern void fpu__drop(struct fpu *fpu);
+extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu);
+extern void fpu__clear(struct fpu *fpu);
+extern int fpu__exception_code(struct fpu *fpu, int trap_nr);
+extern int dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate);
+
+/*
+ * Boot time FPU initialization functions:
+ */
+extern void fpu__init_cpu(void);
+extern void fpu__init_system_xstate(void);
+extern void fpu__init_cpu_xstate(void);
+extern void fpu__init_system(struct cpuinfo_x86 *c);
+extern void fpu__init_check_bugs(void);
+extern void fpu__resume_cpu(void);
+extern u64 fpu__get_supported_xfeatures_mask(void);
+
+/*
+ * Debugging facility:
+ */
+#ifdef CONFIG_X86_DEBUG_FPU
+# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
+#else
+# define WARN_ON_FPU(x) ({ (void)(x); 0; })
+#endif
+
+/*
+ * FPU related CPU feature flag helper routines:
+ */
+static __always_inline __pure bool use_xsaveopt(void)
+{
+ return static_cpu_has(X86_FEATURE_XSAVEOPT);
+}
+
+static __always_inline __pure bool use_xsave(void)
+{
+ return static_cpu_has(X86_FEATURE_XSAVE);
+}
+
+static __always_inline __pure bool use_fxsr(void)
+{
+ return static_cpu_has(X86_FEATURE_FXSR);
+}
+
+/*
+ * fpstate handling functions:
+ */
+
+extern union fpregs_state init_fpstate;
+
+extern void fpstate_init(union fpregs_state *state);
+#ifdef CONFIG_MATH_EMULATION
+extern void fpstate_init_soft(struct swregs_state *soft);
+#else
+static inline void fpstate_init_soft(struct swregs_state *soft) {}
+#endif
+
+static inline void fpstate_init_xstate(struct xregs_state *xsave)
+{
+ /*
+ * XRSTORS requires these bits set in xcomp_bv, or it will
+ * trigger #GP:
+ */
+ xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask;
+}
+
+static inline void fpstate_init_fxstate(struct fxregs_state *fx)
+{
+ fx->cwd = 0x37f;
+ fx->mxcsr = MXCSR_DEFAULT;
+}
+extern void fpstate_sanitize_xstate(struct fpu *fpu);
+
+/* Returns 0 or the negated trap number, which results in -EFAULT for #PF */
+#define user_insn(insn, output, input...) \
+({ \
+ int err; \
+ \
+ might_fault(); \
+ \
+ asm volatile(ASM_STAC "\n" \
+ "1: " #insn "\n" \
+ "2: " ASM_CLAC "\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: negl %%eax\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE_FAULT(1b, 3b) \
+ : [err] "=a" (err), output \
+ : "0"(0), input); \
+ err; \
+})
+
+#define kernel_insn(insn, output, input...) \
+ asm volatile("1:" #insn "\n\t" \
+ "2:\n" \
+ _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore) \
+ : output : input)
+
+static inline int copy_fregs_to_user(struct fregs_state __user *fx)
+{
+ return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx));
+}
+
+static inline int copy_fxregs_to_user(struct fxregs_state __user *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
+ else if (IS_ENABLED(CONFIG_AS_FXSAVEQ))
+ return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
+
+ /* See comment in copy_fxregs_to_kernel() below. */
+ return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx));
+}
+
+static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+ } else {
+ if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) {
+ kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+ } else {
+ /* See comment in copy_fxregs_to_kernel() below. */
+ kernel_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx));
+ }
+ }
+}
+
+static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+ else if (IS_ENABLED(CONFIG_AS_FXSAVEQ))
+ return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+
+ /* See comment in copy_fxregs_to_kernel() below. */
+ return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx),
+ "m" (*fx));
+}
+
+static inline void copy_kernel_to_fregs(struct fregs_state *fx)
+{
+ kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline int copy_user_to_fregs(struct fregs_state __user *fx)
+{
+ return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline void copy_fxregs_to_kernel(struct fpu *fpu)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave));
+ else if (IS_ENABLED(CONFIG_AS_FXSAVEQ))
+ asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
+ else {
+ /* Using "rex64; fxsave %0" is broken because, if the memory
+ * operand uses any extended registers for addressing, a second
+ * REX prefix will be generated (to the assembler, rex64
+ * followed by semicolon is a separate instruction), and hence
+ * the 64-bitness is lost.
+ *
+ * Using "fxsaveq %0" would be the ideal choice, but is only
+ * supported starting with gas 2.16.
+ *
+ * Using, as a workaround, the properly prefixed form below
+ * isn't accepted by any binutils version so far released,
+ * complaining that the same type of prefix is used twice if
+ * an extended register is needed for addressing (fix submitted
+ * to mainline 2005-11-21).
+ *
+ * asm volatile("rex64/fxsave %0" : "=m" (fpu->state.fxsave));
+ *
+ * This, however, we can work around by forcing the compiler to
+ * select an addressing mode that doesn't require extended
+ * registers.
+ */
+ asm volatile( "rex64/fxsave (%[fx])"
+ : "=m" (fpu->state.fxsave)
+ : [fx] "R" (&fpu->state.fxsave));
+ }
+}
+
+static inline void fxsave(struct fxregs_state *fx)
+{
+ if (IS_ENABLED(CONFIG_X86_32))
+ asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx));
+ else
+ asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx));
+}
+
+/* These macros all use (%edi)/(%rdi) as the single memory argument. */
+#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27"
+#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37"
+#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
+#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f"
+#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
+
+/*
+ * After this @err contains 0 on success or the negated trap number when
+ * the operation raises an exception. For faults this results in -EFAULT.
+ */
+#define XSTATE_OP(op, st, lmask, hmask, err) \
+ asm volatile("1:" op "\n\t" \
+ "xor %[err], %[err]\n" \
+ "2:\n\t" \
+ ".pushsection .fixup,\"ax\"\n\t" \
+ "3: negl %%eax\n\t" \
+ "jmp 2b\n\t" \
+ ".popsection\n\t" \
+ _ASM_EXTABLE_FAULT(1b, 3b) \
+ : [err] "=a" (err) \
+ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+/*
+ * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact
+ * format and supervisor states in addition to modified optimization in
+ * XSAVEOPT.
+ *
+ * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
+ * supports modified optimization which is not supported by XSAVE.
+ *
+ * We use XSAVE as a fallback.
+ *
+ * The 661 label is defined in the ALTERNATIVE* macros as the address of the
+ * original instruction which gets replaced. We need to use it here as the
+ * address of the instruction where we might get an exception at.
+ */
+#define XSTATE_XSAVE(st, lmask, hmask, err) \
+ asm volatile(ALTERNATIVE_2(XSAVE, \
+ XSAVEOPT, X86_FEATURE_XSAVEOPT, \
+ XSAVES, X86_FEATURE_XSAVES) \
+ "\n" \
+ "xor %[err], %[err]\n" \
+ "3:\n" \
+ ".pushsection .fixup,\"ax\"\n" \
+ "4: movl $-2, %[err]\n" \
+ "jmp 3b\n" \
+ ".popsection\n" \
+ _ASM_EXTABLE(661b, 4b) \
+ : [err] "=r" (err) \
+ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+/*
+ * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact
+ * XSAVE area format.
+ */
+#define XSTATE_XRESTORE(st, lmask, hmask) \
+ asm volatile(ALTERNATIVE(XRSTOR, \
+ XRSTORS, X86_FEATURE_XSAVES) \
+ "\n" \
+ "3:\n" \
+ _ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\
+ : \
+ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
+ : "memory")
+
+/*
+ * This function is called only during boot time when x86 caps are not set
+ * up and alternative can not be used yet.
+ */
+static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
+{
+ u64 mask = -1;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ WARN_ON(system_state != SYSTEM_BOOTING);
+
+ if (static_cpu_has(X86_FEATURE_XSAVES))
+ XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
+ else
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
+
+ /*
+ * We should never fault when copying from a kernel buffer, and the FPU
+ * state we set at boot time should be valid.
+ */
+ WARN_ON_FPU(err);
+}
+
+/*
+ * Save processor xstate to xsave area.
+ */
+static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
+{
+ u64 mask = -1;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ WARN_ON_FPU(!alternatives_patched);
+
+ XSTATE_XSAVE(xstate, lmask, hmask, err);
+
+ /* We should never fault when copying to a kernel buffer: */
+ WARN_ON_FPU(err);
+}
+
+/*
+ * Restore processor xstate from xsave area.
+ */
+static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask)
+{
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+
+ XSTATE_XRESTORE(xstate, lmask, hmask);
+}
+
+/*
+ * Save xstate to user space xsave area.
+ *
+ * We don't use modified optimization because xrstor/xrstors might track
+ * a different application.
+ *
+ * We don't use compacted format xsave area for
+ * backward compatibility for old applications which don't understand
+ * compacted format of xsave area.
+ */
+static inline int copy_xregs_to_user(struct xregs_state __user *buf)
+{
+ int err;
+
+ /*
+ * Clear the xsave header first, so that reserved fields are
+ * initialized to zero.
+ */
+ err = __clear_user(&buf->header, sizeof(buf->header));
+ if (unlikely(err))
+ return -EFAULT;
+
+ stac();
+ XSTATE_OP(XSAVE, buf, -1, -1, err);
+ clac();
+
+ return err;
+}
+
+/*
+ * Restore xstate from user space xsave area.
+ */
+static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask)
+{
+ struct xregs_state *xstate = ((__force struct xregs_state *)buf);
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err;
+
+ stac();
+ XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
+ clac();
+
+ return err;
+}
+
+/*
+ * These must be called with preempt disabled. Returns
+ * 'true' if the FPU state is still intact and we can
+ * keep registers active.
+ *
+ * The legacy FNSAVE instruction cleared all FPU state
+ * unconditionally, so registers are essentially destroyed.
+ * Modern FPU state can be kept in registers, if there are
+ * no pending FP exceptions.
+ */
+static inline int copy_fpregs_to_fpstate(struct fpu *fpu)
+{
+ if (likely(use_xsave())) {
+ copy_xregs_to_kernel(&fpu->state.xsave);
+ return 1;
+ }
+
+ if (likely(use_fxsr())) {
+ copy_fxregs_to_kernel(fpu);
+ return 1;
+ }
+
+ /*
+ * Legacy FPU register saving, FNSAVE always clears FPU registers,
+ * so we have to mark them inactive:
+ */
+ asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
+
+ return 0;
+}
+
+static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask)
+{
+ if (use_xsave()) {
+ copy_kernel_to_xregs(&fpstate->xsave, mask);
+ } else {
+ if (use_fxsr())
+ copy_kernel_to_fxregs(&fpstate->fxsave);
+ else
+ copy_kernel_to_fregs(&fpstate->fsave);
+ }
+}
+
+static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
+{
+ /*
+ * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
+ * pending. Clear the x87 state here by setting it to fixed values.
+ * "m" is a random variable that should be in L1.
+ */
+ if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) {
+ asm volatile(
+ "fnclex\n\t"
+ "emms\n\t"
+ "fildl %P[addr]" /* set F?P to defined value */
+ : : [addr] "m" (fpstate));
+ }
+
+ __copy_kernel_to_fpregs(fpstate, -1);
+}
+
+extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
+
+/*
+ * FPU context switch related helper methods:
+ */
+
+DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
+
+/*
+ * The in-register FPU state for an FPU context on a CPU is assumed to be
+ * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
+ * matches the FPU.
+ *
+ * If the FPU register state is valid, the kernel can skip restoring the
+ * FPU state from memory.
+ *
+ * Any code that clobbers the FPU registers or updates the in-memory
+ * FPU state for a task MUST let the rest of the kernel know that the
+ * FPU registers are no longer valid for this task.
+ *
+ * Either one of these invalidation functions is enough. Invalidate
+ * a resource you control: CPU if using the CPU for something else
+ * (with preemption disabled), FPU for the current task, or a task that
+ * is prevented from running by the current task.
+ */
+static inline void __cpu_invalidate_fpregs_state(void)
+{
+ __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+}
+
+static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
+{
+ fpu->last_cpu = -1;
+}
+
+static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
+{
+ return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
+}
+
+/*
+ * These generally need preemption protection to work,
+ * do try to avoid using these on their own:
+ */
+static inline void fpregs_deactivate(struct fpu *fpu)
+{
+ this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+ trace_x86_fpu_regs_deactivated(fpu);
+}
+
+static inline void fpregs_activate(struct fpu *fpu)
+{
+ this_cpu_write(fpu_fpregs_owner_ctx, fpu);
+ trace_x86_fpu_regs_activated(fpu);
+}
+
+/*
+ * FPU state switching for scheduling.
+ *
+ * This is a two-stage process:
+ *
+ * - switch_fpu_prepare() saves the old state.
+ * This is done within the context of the old process.
+ *
+ * - switch_fpu_finish() restores the new state as
+ * necessary.
+ */
+static inline void
+switch_fpu_prepare(struct fpu *old_fpu, int cpu)
+{
+ if (static_cpu_has(X86_FEATURE_FPU) && old_fpu->initialized) {
+ if (!copy_fpregs_to_fpstate(old_fpu))
+ old_fpu->last_cpu = -1;
+ else
+ old_fpu->last_cpu = cpu;
+
+ /* But leave fpu_fpregs_owner_ctx! */
+ trace_x86_fpu_regs_deactivated(old_fpu);
+ } else
+ old_fpu->last_cpu = -1;
+}
+
+/*
+ * Misc helper functions:
+ */
+
+/*
+ * Set up the userspace FPU context for the new task, if the task
+ * has used the FPU.
+ */
+static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu)
+{
+ bool preload = static_cpu_has(X86_FEATURE_FPU) &&
+ new_fpu->initialized;
+
+ if (preload) {
+ if (!fpregs_state_valid(new_fpu, cpu))
+ copy_kernel_to_fpregs(&new_fpu->state);
+ fpregs_activate(new_fpu);
+ }
+}
+
+/*
+ * Needs to be preemption-safe.
+ *
+ * NOTE! user_fpu_begin() must be used only immediately before restoring
+ * the save state. It does not do any saving/restoring on its own. In
+ * lazy FPU mode, it is just an optimization to avoid a #NM exception,
+ * the task can lose the FPU right after preempt_enable().
+ */
+static inline void user_fpu_begin(void)
+{
+ struct fpu *fpu = &current->thread.fpu;
+
+ preempt_disable();
+ fpregs_activate(fpu);
+ preempt_enable();
+}
+
+/*
+ * MXCSR and XCR definitions:
+ */
+
+extern unsigned int mxcsr_feature_mask;
+
+#define XCR_XFEATURE_ENABLED_MASK 0x00000000
+
+static inline u64 xgetbv(u32 index)
+{
+ u32 eax, edx;
+
+ asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
+ : "=a" (eax), "=d" (edx)
+ : "c" (index));
+ return eax + ((u64)edx << 32);
+}
+
+static inline void xsetbv(u32 index, u64 value)
+{
+ u32 eax = value;
+ u32 edx = value >> 32;
+
+ asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
+ : : "a" (eax), "d" (edx), "c" (index));
+}
+
+#endif /* _ASM_X86_FPU_INTERNAL_H */
diff --git a/arch/x86/include/asm/fpu/regset.h b/arch/x86/include/asm/fpu/regset.h
new file mode 100644
index 000000000..d5bdffb9d
--- /dev/null
+++ b/arch/x86/include/asm/fpu/regset.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * FPU regset handling methods:
+ */
+#ifndef _ASM_X86_FPU_REGSET_H
+#define _ASM_X86_FPU_REGSET_H
+
+#include <linux/regset.h>
+
+extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active;
+extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get,
+ xstateregs_get;
+extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set,
+ xstateregs_set;
+
+/*
+ * xstateregs_active == regset_fpregs_active. Please refer to the comment
+ * at the definition of regset_fpregs_active.
+ */
+#define xstateregs_active regset_fpregs_active
+
+#endif /* _ASM_X86_FPU_REGSET_H */
diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h
new file mode 100644
index 000000000..44bbc39a5
--- /dev/null
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * x86 FPU signal frame handling methods:
+ */
+#ifndef _ASM_X86_FPU_SIGNAL_H
+#define _ASM_X86_FPU_SIGNAL_H
+
+#ifdef CONFIG_X86_64
+# include <uapi/asm/sigcontext.h>
+# include <asm/user32.h>
+struct ksignal;
+int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
+ compat_sigset_t *set, struct pt_regs *regs);
+int ia32_setup_frame(int sig, struct ksignal *ksig,
+ compat_sigset_t *set, struct pt_regs *regs);
+#else
+# define user_i387_ia32_struct user_i387_struct
+# define user32_fxsr_struct user_fxsr_struct
+# define ia32_setup_frame __setup_frame
+# define ia32_setup_rt_frame __setup_rt_frame
+#endif
+
+extern void convert_from_fxsr(struct user_i387_ia32_struct *env,
+ struct task_struct *tsk);
+extern void convert_to_fxsr(struct task_struct *tsk,
+ const struct user_i387_ia32_struct *env);
+
+unsigned long
+fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
+ unsigned long *buf_fx, unsigned long *size);
+
+extern void fpu__init_prepare_fx_sw_frame(void);
+
+#endif /* _ASM_X86_FPU_SIGNAL_H */
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
new file mode 100644
index 000000000..202c53918
--- /dev/null
+++ b/arch/x86/include/asm/fpu/types.h
@@ -0,0 +1,321 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * FPU data structures:
+ */
+#ifndef _ASM_X86_FPU_H
+#define _ASM_X86_FPU_H
+
+/*
+ * The legacy x87 FPU state format, as saved by FSAVE and
+ * restored by the FRSTOR instructions:
+ */
+struct fregs_state {
+ u32 cwd; /* FPU Control Word */
+ u32 swd; /* FPU Status Word */
+ u32 twd; /* FPU Tag Word */
+ u32 fip; /* FPU IP Offset */
+ u32 fcs; /* FPU IP Selector */
+ u32 foo; /* FPU Operand Pointer Offset */
+ u32 fos; /* FPU Operand Pointer Selector */
+
+ /* 8*10 bytes for each FP-reg = 80 bytes: */
+ u32 st_space[20];
+
+ /* Software status information [not touched by FSAVE]: */
+ u32 status;
+};
+
+/*
+ * The legacy fx SSE/MMX FPU state format, as saved by FXSAVE and
+ * restored by the FXRSTOR instructions. It's similar to the FSAVE
+ * format, but differs in some areas, plus has extensions at
+ * the end for the XMM registers.
+ */
+struct fxregs_state {
+ u16 cwd; /* Control Word */
+ u16 swd; /* Status Word */
+ u16 twd; /* Tag Word */
+ u16 fop; /* Last Instruction Opcode */
+ union {
+ struct {
+ u64 rip; /* Instruction Pointer */
+ u64 rdp; /* Data Pointer */
+ };
+ struct {
+ u32 fip; /* FPU IP Offset */
+ u32 fcs; /* FPU IP Selector */
+ u32 foo; /* FPU Operand Offset */
+ u32 fos; /* FPU Operand Selector */
+ };
+ };
+ u32 mxcsr; /* MXCSR Register State */
+ u32 mxcsr_mask; /* MXCSR Mask */
+
+ /* 8*16 bytes for each FP-reg = 128 bytes: */
+ u32 st_space[32];
+
+ /* 16*16 bytes for each XMM-reg = 256 bytes: */
+ u32 xmm_space[64];
+
+ u32 padding[12];
+
+ union {
+ u32 padding1[12];
+ u32 sw_reserved[12];
+ };
+
+} __attribute__((aligned(16)));
+
+/* Default value for fxregs_state.mxcsr: */
+#define MXCSR_DEFAULT 0x1f80
+
+/* Copy both mxcsr & mxcsr_flags with a single u64 memcpy: */
+#define MXCSR_AND_FLAGS_SIZE sizeof(u64)
+
+/*
+ * Software based FPU emulation state. This is arbitrary really,
+ * it matches the x87 format to make it easier to understand:
+ */
+struct swregs_state {
+ u32 cwd;
+ u32 swd;
+ u32 twd;
+ u32 fip;
+ u32 fcs;
+ u32 foo;
+ u32 fos;
+ /* 8*10 bytes for each FP-reg = 80 bytes: */
+ u32 st_space[20];
+ u8 ftop;
+ u8 changed;
+ u8 lookahead;
+ u8 no_update;
+ u8 rm;
+ u8 alimit;
+ struct math_emu_info *info;
+ u32 entry_eip;
+};
+
+/*
+ * List of XSAVE features Linux knows about:
+ */
+enum xfeature {
+ XFEATURE_FP,
+ XFEATURE_SSE,
+ /*
+ * Values above here are "legacy states".
+ * Those below are "extended states".
+ */
+ XFEATURE_YMM,
+ XFEATURE_BNDREGS,
+ XFEATURE_BNDCSR,
+ XFEATURE_OPMASK,
+ XFEATURE_ZMM_Hi256,
+ XFEATURE_Hi16_ZMM,
+ XFEATURE_PT_UNIMPLEMENTED_SO_FAR,
+ XFEATURE_PKRU,
+
+ XFEATURE_MAX,
+};
+
+#define XFEATURE_MASK_FP (1 << XFEATURE_FP)
+#define XFEATURE_MASK_SSE (1 << XFEATURE_SSE)
+#define XFEATURE_MASK_YMM (1 << XFEATURE_YMM)
+#define XFEATURE_MASK_BNDREGS (1 << XFEATURE_BNDREGS)
+#define XFEATURE_MASK_BNDCSR (1 << XFEATURE_BNDCSR)
+#define XFEATURE_MASK_OPMASK (1 << XFEATURE_OPMASK)
+#define XFEATURE_MASK_ZMM_Hi256 (1 << XFEATURE_ZMM_Hi256)
+#define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM)
+#define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR)
+#define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
+
+#define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
+#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \
+ | XFEATURE_MASK_ZMM_Hi256 \
+ | XFEATURE_MASK_Hi16_ZMM)
+
+#define FIRST_EXTENDED_XFEATURE XFEATURE_YMM
+
+struct reg_128_bit {
+ u8 regbytes[128/8];
+};
+struct reg_256_bit {
+ u8 regbytes[256/8];
+};
+struct reg_512_bit {
+ u8 regbytes[512/8];
+};
+
+/*
+ * State component 2:
+ *
+ * There are 16x 256-bit AVX registers named YMM0-YMM15.
+ * The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15)
+ * and are stored in 'struct fxregs_state::xmm_space[]' in the
+ * "legacy" area.
+ *
+ * The high 128 bits are stored here.
+ */
+struct ymmh_struct {
+ struct reg_128_bit hi_ymm[16];
+} __packed;
+
+/* Intel MPX support: */
+
+struct mpx_bndreg {
+ u64 lower_bound;
+ u64 upper_bound;
+} __packed;
+/*
+ * State component 3 is used for the 4 128-bit bounds registers
+ */
+struct mpx_bndreg_state {
+ struct mpx_bndreg bndreg[4];
+} __packed;
+
+/*
+ * State component 4 is used for the 64-bit user-mode MPX
+ * configuration register BNDCFGU and the 64-bit MPX status
+ * register BNDSTATUS. We call the pair "BNDCSR".
+ */
+struct mpx_bndcsr {
+ u64 bndcfgu;
+ u64 bndstatus;
+} __packed;
+
+/*
+ * The BNDCSR state is padded out to be 64-bytes in size.
+ */
+struct mpx_bndcsr_state {
+ union {
+ struct mpx_bndcsr bndcsr;
+ u8 pad_to_64_bytes[64];
+ };
+} __packed;
+
+/* AVX-512 Components: */
+
+/*
+ * State component 5 is used for the 8 64-bit opmask registers
+ * k0-k7 (opmask state).
+ */
+struct avx_512_opmask_state {
+ u64 opmask_reg[8];
+} __packed;
+
+/*
+ * State component 6 is used for the upper 256 bits of the
+ * registers ZMM0-ZMM15. These 16 256-bit values are denoted
+ * ZMM0_H-ZMM15_H (ZMM_Hi256 state).
+ */
+struct avx_512_zmm_uppers_state {
+ struct reg_256_bit zmm_upper[16];
+} __packed;
+
+/*
+ * State component 7 is used for the 16 512-bit registers
+ * ZMM16-ZMM31 (Hi16_ZMM state).
+ */
+struct avx_512_hi16_state {
+ struct reg_512_bit hi16_zmm[16];
+} __packed;
+
+/*
+ * State component 9: 32-bit PKRU register. The state is
+ * 8 bytes long but only 4 bytes is used currently.
+ */
+struct pkru_state {
+ u32 pkru;
+ u32 pad;
+} __packed;
+
+struct xstate_header {
+ u64 xfeatures;
+ u64 xcomp_bv;
+ u64 reserved[6];
+} __attribute__((packed));
+
+/*
+ * xstate_header.xcomp_bv[63] indicates that the extended_state_area
+ * is in compacted format.
+ */
+#define XCOMP_BV_COMPACTED_FORMAT ((u64)1 << 63)
+
+/*
+ * This is our most modern FPU state format, as saved by the XSAVE
+ * and restored by the XRSTOR instructions.
+ *
+ * It consists of a legacy fxregs portion, an xstate header and
+ * subsequent areas as defined by the xstate header. Not all CPUs
+ * support all the extensions, so the size of the extended area
+ * can vary quite a bit between CPUs.
+ */
+struct xregs_state {
+ struct fxregs_state i387;
+ struct xstate_header header;
+ u8 extended_state_area[0];
+} __attribute__ ((packed, aligned (64)));
+
+/*
+ * This is a union of all the possible FPU state formats
+ * put together, so that we can pick the right one runtime.
+ *
+ * The size of the structure is determined by the largest
+ * member - which is the xsave area. The padding is there
+ * to ensure that statically-allocated task_structs (just
+ * the init_task today) have enough space.
+ */
+union fpregs_state {
+ struct fregs_state fsave;
+ struct fxregs_state fxsave;
+ struct swregs_state soft;
+ struct xregs_state xsave;
+ u8 __padding[PAGE_SIZE];
+};
+
+/*
+ * Highest level per task FPU state data structure that
+ * contains the FPU register state plus various FPU
+ * state fields:
+ */
+struct fpu {
+ /*
+ * @last_cpu:
+ *
+ * Records the last CPU on which this context was loaded into
+ * FPU registers. (In the lazy-restore case we might be
+ * able to reuse FPU registers across multiple context switches
+ * this way, if no intermediate task used the FPU.)
+ *
+ * A value of -1 is used to indicate that the FPU state in context
+ * memory is newer than the FPU state in registers, and that the
+ * FPU state should be reloaded next time the task is run.
+ */
+ unsigned int last_cpu;
+
+ /*
+ * @initialized:
+ *
+ * This flag indicates whether this context is initialized: if the task
+ * is not running then we can restore from this context, if the task
+ * is running then we should save into this context.
+ */
+ unsigned char initialized;
+
+ /*
+ * @state:
+ *
+ * In-memory copy of all FPU registers that we save/restore
+ * over context switches. If the task is using the FPU then
+ * the registers in the FPU are more recent than this state
+ * copy. If the task context-switches away then they get
+ * saved here and represent the FPU state.
+ */
+ union fpregs_state state;
+ /*
+ * WARNING: 'state' is dynamically-sized. Do not put
+ * anything after it here.
+ */
+};
+
+#endif /* _ASM_X86_FPU_H */
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
new file mode 100644
index 000000000..48581988d
--- /dev/null
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_X86_XSAVE_H
+#define __ASM_X86_XSAVE_H
+
+#include <linux/types.h>
+#include <asm/processor.h>
+#include <linux/uaccess.h>
+
+/* Bit 63 of XCR0 is reserved for future expansion */
+#define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63)))
+
+#define XSTATE_CPUID 0x0000000d
+
+#define FXSAVE_SIZE 512
+
+#define XSAVE_HDR_SIZE 64
+#define XSAVE_HDR_OFFSET FXSAVE_SIZE
+
+#define XSAVE_YMM_SIZE 256
+#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
+
+/* Supervisor features */
+#define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT)
+
+/* All currently supported features */
+#define XCNTXT_MASK (XFEATURE_MASK_FP | \
+ XFEATURE_MASK_SSE | \
+ XFEATURE_MASK_YMM | \
+ XFEATURE_MASK_OPMASK | \
+ XFEATURE_MASK_ZMM_Hi256 | \
+ XFEATURE_MASK_Hi16_ZMM | \
+ XFEATURE_MASK_PKRU | \
+ XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR)
+
+#ifdef CONFIG_X86_64
+#define REX_PREFIX "0x48, "
+#else
+#define REX_PREFIX
+#endif
+
+extern u64 xfeatures_mask;
+extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
+
+extern void __init update_regset_xstate_info(unsigned int size,
+ u64 xstate_mask);
+
+void fpu__xstate_clear_all_cpu_caps(void);
+void *get_xsave_addr(struct xregs_state *xsave, int xstate);
+const void *get_xsave_field_ptr(int xstate_field);
+int using_compacted_format(void);
+int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
+int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
+int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf);
+int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf);
+
+/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
+extern int validate_xstate_header(const struct xstate_header *hdr);
+
+#endif
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
new file mode 100644
index 000000000..296b34618
--- /dev/null
+++ b/arch/x86/include/asm/frame.h
@@ -0,0 +1,94 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_FRAME_H
+#define _ASM_X86_FRAME_H
+
+#include <asm/asm.h>
+
+/*
+ * These are stack frame creation macros. They should be used by every
+ * callable non-leaf asm function to make kernel stack traces more reliable.
+ */
+
+#ifdef CONFIG_FRAME_POINTER
+
+#ifdef __ASSEMBLY__
+
+.macro FRAME_BEGIN
+ push %_ASM_BP
+ _ASM_MOV %_ASM_SP, %_ASM_BP
+.endm
+
+.macro FRAME_END
+ pop %_ASM_BP
+.endm
+
+#ifdef CONFIG_X86_64
+/*
+ * This is a sneaky trick to help the unwinder find pt_regs on the stack. The
+ * frame pointer is replaced with an encoded pointer to pt_regs. The encoding
+ * is just setting the LSB, which makes it an invalid stack address and is also
+ * a signal to the unwinder that it's a pt_regs pointer in disguise.
+ *
+ * NOTE: This macro must be used *after* PUSH_AND_CLEAR_REGS because it corrupts
+ * the original rbp.
+ */
+.macro ENCODE_FRAME_POINTER ptregs_offset=0
+ leaq 1+\ptregs_offset(%rsp), %rbp
+.endm
+#else /* !CONFIG_X86_64 */
+/*
+ * This is a sneaky trick to help the unwinder find pt_regs on the stack. The
+ * frame pointer is replaced with an encoded pointer to pt_regs. The encoding
+ * is just clearing the MSB, which makes it an invalid stack address and is also
+ * a signal to the unwinder that it's a pt_regs pointer in disguise.
+ *
+ * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the
+ * original ebp.
+ */
+.macro ENCODE_FRAME_POINTER
+ mov %esp, %ebp
+ andl $0x7fffffff, %ebp
+.endm
+#endif /* CONFIG_X86_64 */
+
+#else /* !__ASSEMBLY__ */
+
+#define FRAME_BEGIN \
+ "push %" _ASM_BP "\n" \
+ _ASM_MOV "%" _ASM_SP ", %" _ASM_BP "\n"
+
+#define FRAME_END "pop %" _ASM_BP "\n"
+
+#ifdef CONFIG_X86_64
+#define ENCODE_FRAME_POINTER \
+ "lea 1(%rsp), %rbp\n\t"
+#else /* !CONFIG_X86_64 */
+#define ENCODE_FRAME_POINTER \
+ "movl %esp, %ebp\n\t" \
+ "andl $0x7fffffff, %ebp\n\t"
+#endif /* CONFIG_X86_64 */
+
+#endif /* __ASSEMBLY__ */
+
+#define FRAME_OFFSET __ASM_SEL(4, 8)
+
+#else /* !CONFIG_FRAME_POINTER */
+
+#ifdef __ASSEMBLY__
+
+.macro ENCODE_FRAME_POINTER ptregs_offset=0
+.endm
+
+#else /* !__ASSEMBLY */
+
+#define ENCODE_FRAME_POINTER
+
+#endif
+
+#define FRAME_BEGIN
+#define FRAME_END
+#define FRAME_OFFSET 0
+
+#endif /* CONFIG_FRAME_POINTER */
+
+#endif /* _ASM_X86_FRAME_H */
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
new file mode 100644
index 000000000..c18ed6528
--- /dev/null
+++ b/arch/x86/include/asm/ftrace.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_FTRACE_H
+#define _ASM_X86_FTRACE_H
+
+#ifdef CONFIG_FUNCTION_TRACER
+#ifdef CC_USING_FENTRY
+# define MCOUNT_ADDR ((unsigned long)(__fentry__))
+#else
+# define MCOUNT_ADDR ((unsigned long)(mcount))
+# define HAVE_FUNCTION_GRAPH_FP_TEST
+#endif
+#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+#define ARCH_SUPPORTS_FTRACE_OPS 1
+#endif
+
+#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+
+#ifndef __ASSEMBLY__
+extern void mcount(void);
+extern atomic_t modifying_ftrace_code;
+extern void __fentry__(void);
+
+static inline unsigned long ftrace_call_adjust(unsigned long addr)
+{
+ /*
+ * addr is the address of the mcount call instruction.
+ * recordmcount does the necessary offset calculation.
+ */
+ return addr;
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+struct dyn_arch_ftrace {
+ /* No extra data needed for x86 */
+};
+
+int ftrace_int3_handler(struct pt_regs *regs);
+
+#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR
+
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_FUNCTION_TRACER */
+
+
+#ifndef __ASSEMBLY__
+
+#define ARCH_HAS_SYSCALL_MATCH_SYM_NAME
+static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
+{
+ /*
+ * Compare the symbol name with the system call name. Skip the
+ * "__x64_sys", "__ia32_sys" or simple "sys" prefix.
+ */
+ return !strcmp(sym + 3, name + 3) ||
+ (!strncmp(sym, "__x64_", 6) && !strcmp(sym + 9, name + 3)) ||
+ (!strncmp(sym, "__ia32_", 7) && !strcmp(sym + 10, name + 3));
+}
+
+#ifndef COMPILE_OFFSETS
+
+#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_IA32_EMULATION)
+#include <linux/compat.h>
+
+/*
+ * Because ia32 syscalls do not map to x86_64 syscall numbers
+ * this screws up the trace output when tracing a ia32 task.
+ * Instead of reporting bogus syscalls, just do not trace them.
+ *
+ * If the user really wants these, then they should use the
+ * raw syscall tracepoints with filtering.
+ */
+#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 1
+static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs)
+{
+ if (in_compat_syscall())
+ return true;
+ return false;
+}
+#endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_IA32_EMULATION */
+#endif /* !COMPILE_OFFSETS */
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_FTRACE_H */
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
new file mode 100644
index 000000000..de4d68852
--- /dev/null
+++ b/arch/x86/include/asm/futex.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_FUTEX_H
+#define _ASM_X86_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <linux/uaccess.h>
+
+#include <asm/asm.h>
+#include <asm/errno.h>
+#include <asm/processor.h>
+#include <asm/smap.h>
+
+#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
+ asm volatile("\t" ASM_STAC "\n" \
+ "1:\t" insn "\n" \
+ "2:\t" ASM_CLAC "\n" \
+ "\t.section .fixup,\"ax\"\n" \
+ "3:\tmov\t%3, %1\n" \
+ "\tjmp\t2b\n" \
+ "\t.previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
+ : "i" (-EFAULT), "0" (oparg), "1" (0))
+
+#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
+ asm volatile("\t" ASM_STAC "\n" \
+ "1:\tmovl %2, %0\n" \
+ "\tmovl\t%0, %3\n" \
+ "\t" insn "\n" \
+ "2:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" \
+ "\tjnz\t1b\n" \
+ "3:\t" ASM_CLAC "\n" \
+ "\t.section .fixup,\"ax\"\n" \
+ "4:\tmov\t%5, %1\n" \
+ "\tjmp\t3b\n" \
+ "\t.previous\n" \
+ _ASM_EXTABLE(1b, 4b) \
+ _ASM_EXTABLE(2b, 4b) \
+ : "=&a" (oldval), "=&r" (ret), \
+ "+m" (*uaddr), "=&r" (tem) \
+ : "r" (oparg), "i" (-EFAULT), "1" (0))
+
+static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval,
+ u32 __user *uaddr)
+{
+ int oldval = 0, ret, tem;
+
+ pagefault_disable();
+
+ switch (op) {
+ case FUTEX_OP_SET:
+ __futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
+ break;
+ case FUTEX_OP_ADD:
+ __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret, oldval,
+ uaddr, oparg);
+ break;
+ case FUTEX_OP_OR:
+ __futex_atomic_op2("orl %4, %3", ret, oldval, uaddr, oparg);
+ break;
+ case FUTEX_OP_ANDN:
+ __futex_atomic_op2("andl %4, %3", ret, oldval, uaddr, ~oparg);
+ break;
+ case FUTEX_OP_XOR:
+ __futex_atomic_op2("xorl %4, %3", ret, oldval, uaddr, oparg);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+
+ pagefault_enable();
+
+ if (!ret)
+ *oval = oldval;
+
+ return ret;
+}
+
+static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+ u32 oldval, u32 newval)
+{
+ return user_atomic_cmpxchg_inatomic(uval, uaddr, oldval, newval);
+}
+
+#endif
+#endif /* _ASM_X86_FUTEX_H */
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
new file mode 100644
index 000000000..318556574
--- /dev/null
+++ b/arch/x86/include/asm/gart.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_GART_H
+#define _ASM_X86_GART_H
+
+#include <asm/e820/api.h>
+
+extern void set_up_gart_resume(u32, u32);
+
+extern int fallback_aper_order;
+extern int fallback_aper_force;
+extern int fix_aperture;
+
+/* PTE bits. */
+#define GPTE_VALID 1
+#define GPTE_COHERENT 2
+
+/* Aperture control register bits. */
+#define GARTEN (1<<0)
+#define DISGARTCPU (1<<4)
+#define DISGARTIO (1<<5)
+#define DISTLBWALKPRB (1<<6)
+
+/* GART cache control register bits. */
+#define INVGART (1<<0)
+#define GARTPTEERR (1<<1)
+
+/* K8 On-cpu GART registers */
+#define AMD64_GARTAPERTURECTL 0x90
+#define AMD64_GARTAPERTUREBASE 0x94
+#define AMD64_GARTTABLEBASE 0x98
+#define AMD64_GARTCACHECTL 0x9c
+
+#ifdef CONFIG_GART_IOMMU
+extern int gart_iommu_aperture;
+extern int gart_iommu_aperture_allowed;
+extern int gart_iommu_aperture_disabled;
+
+extern void early_gart_iommu_check(void);
+extern int gart_iommu_init(void);
+extern void __init gart_parse_options(char *);
+extern int gart_iommu_hole_init(void);
+
+#else
+#define gart_iommu_aperture 0
+#define gart_iommu_aperture_allowed 0
+#define gart_iommu_aperture_disabled 1
+
+static inline void early_gart_iommu_check(void)
+{
+}
+static inline void gart_parse_options(char *options)
+{
+}
+static inline int gart_iommu_hole_init(void)
+{
+ return -ENODEV;
+}
+#endif
+
+extern int agp_amd64_init(void);
+
+static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order)
+{
+ u32 ctl;
+
+ /*
+ * Don't enable translation but enable GART IO and CPU accesses.
+ * Also, set DISTLBWALKPRB since GART tables memory is UC.
+ */
+ ctl = order << 1;
+
+ pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+}
+
+static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
+{
+ u32 tmp, ctl;
+
+ /* address of the mappings table */
+ addr >>= 12;
+ tmp = (u32) addr<<4;
+ tmp &= ~0xf;
+ pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp);
+
+ /* Enable GART translation for this hammer. */
+ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
+ ctl |= GARTEN | DISTLBWALKPRB;
+ ctl &= ~(DISGARTCPU | DISGARTIO);
+ pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+}
+
+static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
+{
+ if (!aper_base)
+ return 0;
+
+ if (aper_base + aper_size > 0x100000000ULL) {
+ printk(KERN_INFO "Aperture beyond 4GB. Ignoring.\n");
+ return 0;
+ }
+ if (e820__mapped_any(aper_base, aper_base + aper_size, E820_TYPE_RAM)) {
+ printk(KERN_INFO "Aperture pointing to e820 RAM. Ignoring.\n");
+ return 0;
+ }
+ if (aper_size < min_size) {
+ printk(KERN_INFO "Aperture too small (%d MB) than (%d MB)\n",
+ aper_size>>20, min_size>>20);
+ return 0;
+ }
+
+ return 1;
+}
+
+#endif /* _ASM_X86_GART_H */
diff --git a/arch/x86/include/asm/genapic.h b/arch/x86/include/asm/genapic.h
new file mode 100644
index 000000000..4b8b98fa7
--- /dev/null
+++ b/arch/x86/include/asm/genapic.h
@@ -0,0 +1 @@
+#include <asm/apic.h>
diff --git a/arch/x86/include/asm/geode.h b/arch/x86/include/asm/geode.h
new file mode 100644
index 000000000..7cd73552a
--- /dev/null
+++ b/arch/x86/include/asm/geode.h
@@ -0,0 +1,36 @@
+/*
+ * AMD Geode definitions
+ * Copyright (C) 2006, Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+
+#ifndef _ASM_X86_GEODE_H
+#define _ASM_X86_GEODE_H
+
+#include <asm/processor.h>
+#include <linux/io.h>
+#include <linux/cs5535.h>
+
+static inline int is_geode_gx(void)
+{
+ return ((boot_cpu_data.x86_vendor == X86_VENDOR_NSC) &&
+ (boot_cpu_data.x86 == 5) &&
+ (boot_cpu_data.x86_model == 5));
+}
+
+static inline int is_geode_lx(void)
+{
+ return ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
+ (boot_cpu_data.x86 == 5) &&
+ (boot_cpu_data.x86_model == 10));
+}
+
+static inline int is_geode(void)
+{
+ return (is_geode_gx() || is_geode_lx());
+}
+
+#endif /* _ASM_X86_GEODE_H */
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
new file mode 100644
index 000000000..d9069bb26
--- /dev/null
+++ b/arch/x86/include/asm/hardirq.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_HARDIRQ_H
+#define _ASM_X86_HARDIRQ_H
+
+#include <linux/threads.h>
+
+typedef struct {
+ u16 __softirq_pending;
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+ u8 kvm_cpu_l1tf_flush_l1d;
+#endif
+ unsigned int __nmi_count; /* arch dependent */
+#ifdef CONFIG_X86_LOCAL_APIC
+ unsigned int apic_timer_irqs; /* arch dependent */
+ unsigned int irq_spurious_count;
+ unsigned int icr_read_retry_count;
+#endif
+#ifdef CONFIG_HAVE_KVM
+ unsigned int kvm_posted_intr_ipis;
+ unsigned int kvm_posted_intr_wakeup_ipis;
+ unsigned int kvm_posted_intr_nested_ipis;
+#endif
+ unsigned int x86_platform_ipis; /* arch dependent */
+ unsigned int apic_perf_irqs;
+ unsigned int apic_irq_work_irqs;
+#ifdef CONFIG_SMP
+ unsigned int irq_resched_count;
+ unsigned int irq_call_count;
+#endif
+ unsigned int irq_tlb_count;
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ unsigned int irq_thermal_count;
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ unsigned int irq_threshold_count;
+#endif
+#ifdef CONFIG_X86_MCE_AMD
+ unsigned int irq_deferred_error_count;
+#endif
+#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
+ unsigned int irq_hv_callback_count;
+#endif
+#if IS_ENABLED(CONFIG_HYPERV)
+ unsigned int irq_hv_reenlightenment_count;
+ unsigned int hyperv_stimer0_count;
+#endif
+} ____cacheline_aligned irq_cpustat_t;
+
+DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+
+#define __ARCH_IRQ_STAT
+
+#define inc_irq_stat(member) this_cpu_inc(irq_stat.member)
+
+extern void ack_bad_irq(unsigned int irq);
+
+extern u64 arch_irq_stat_cpu(unsigned int cpu);
+#define arch_irq_stat_cpu arch_irq_stat_cpu
+
+extern u64 arch_irq_stat(void);
+#define arch_irq_stat arch_irq_stat
+
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+static inline void kvm_set_cpu_l1tf_flush_l1d(void)
+{
+ __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1);
+}
+
+static inline void kvm_clear_cpu_l1tf_flush_l1d(void)
+{
+ __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0);
+}
+
+static inline bool kvm_get_cpu_l1tf_flush_l1d(void)
+{
+ return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d);
+}
+#else /* !IS_ENABLED(CONFIG_KVM_INTEL) */
+static inline void kvm_set_cpu_l1tf_flush_l1d(void) { }
+#endif /* IS_ENABLED(CONFIG_KVM_INTEL) */
+
+#endif /* _ASM_X86_HARDIRQ_H */
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
new file mode 100644
index 000000000..a80599300
--- /dev/null
+++ b/arch/x86/include/asm/highmem.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * highmem.h: virtual kernel memory mappings for high memory
+ *
+ * Used in CONFIG_HIGHMEM systems for memory pages which
+ * are not addressable by direct kernel virtual addresses.
+ *
+ * Copyright (C) 1999 Gerhard Wichert, Siemens AG
+ * Gerhard.Wichert@pdb.siemens.de
+ *
+ *
+ * Redesigned the x86 32-bit VM architecture to deal with
+ * up to 16 Terabyte physical memory. With current x86 CPUs
+ * we now support up to 64 Gigabytes physical RAM.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#ifndef _ASM_X86_HIGHMEM_H
+#define _ASM_X86_HIGHMEM_H
+
+#ifdef __KERNEL__
+
+#include <linux/interrupt.h>
+#include <linux/threads.h>
+#include <asm/kmap_types.h>
+#include <asm/tlbflush.h>
+#include <asm/paravirt.h>
+#include <asm/fixmap.h>
+
+/* declarations for highmem.c */
+extern unsigned long highstart_pfn, highend_pfn;
+
+/*
+ * Right now we initialize only a single pte table. It can be extended
+ * easily, subsequent pte tables have to be allocated in one physical
+ * chunk of RAM.
+ */
+/*
+ * Ordering is:
+ *
+ * high memory on: high_memory off:
+ * FIXADDR_TOP FIXADDR_TOP
+ * fixed addresses fixed addresses
+ * FIXADDR_START FIXADDR_START
+ * temp fixed addresses/persistent kmap area VMALLOC_END
+ * PKMAP_BASE temp fixed addresses/vmalloc area
+ * VMALLOC_END VMALLOC_START
+ * vmalloc area high_memory
+ * VMALLOC_START
+ * high_memory
+ *
+ * The temp fixed area is only used during boot for early_ioremap(), and
+ * it is unused when the ioremap() is functional. vmalloc/pkmap area become
+ * available after early boot so the temp fixed area is available for re-use.
+ */
+#define LAST_PKMAP_MASK (LAST_PKMAP-1)
+#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
+#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
+
+extern void *kmap_high(struct page *page);
+extern void kunmap_high(struct page *page);
+
+void *kmap(struct page *page);
+void kunmap(struct page *page);
+
+void *kmap_atomic_prot(struct page *page, pgprot_t prot);
+void *kmap_atomic(struct page *page);
+void __kunmap_atomic(void *kvaddr);
+void *kmap_atomic_pfn(unsigned long pfn);
+void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
+
+#define flush_cache_kmaps() do { } while (0)
+
+extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
+ unsigned long end_pfn);
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_HIGHMEM_H */
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
new file mode 100644
index 000000000..67385d56d
--- /dev/null
+++ b/arch/x86/include/asm/hpet.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_HPET_H
+#define _ASM_X86_HPET_H
+
+#include <linux/msi.h>
+
+#ifdef CONFIG_HPET_TIMER
+
+#define HPET_MMAP_SIZE 1024
+
+#define HPET_ID 0x000
+#define HPET_PERIOD 0x004
+#define HPET_CFG 0x010
+#define HPET_STATUS 0x020
+#define HPET_COUNTER 0x0f0
+
+#define HPET_Tn_CFG(n) (0x100 + 0x20 * n)
+#define HPET_Tn_CMP(n) (0x108 + 0x20 * n)
+#define HPET_Tn_ROUTE(n) (0x110 + 0x20 * n)
+
+#define HPET_T0_CFG 0x100
+#define HPET_T0_CMP 0x108
+#define HPET_T0_ROUTE 0x110
+#define HPET_T1_CFG 0x120
+#define HPET_T1_CMP 0x128
+#define HPET_T1_ROUTE 0x130
+#define HPET_T2_CFG 0x140
+#define HPET_T2_CMP 0x148
+#define HPET_T2_ROUTE 0x150
+
+#define HPET_ID_REV 0x000000ff
+#define HPET_ID_NUMBER 0x00001f00
+#define HPET_ID_64BIT 0x00002000
+#define HPET_ID_LEGSUP 0x00008000
+#define HPET_ID_VENDOR 0xffff0000
+#define HPET_ID_NUMBER_SHIFT 8
+#define HPET_ID_VENDOR_SHIFT 16
+
+#define HPET_CFG_ENABLE 0x001
+#define HPET_CFG_LEGACY 0x002
+#define HPET_LEGACY_8254 2
+#define HPET_LEGACY_RTC 8
+
+#define HPET_TN_LEVEL 0x0002
+#define HPET_TN_ENABLE 0x0004
+#define HPET_TN_PERIODIC 0x0008
+#define HPET_TN_PERIODIC_CAP 0x0010
+#define HPET_TN_64BIT_CAP 0x0020
+#define HPET_TN_SETVAL 0x0040
+#define HPET_TN_32BIT 0x0100
+#define HPET_TN_ROUTE 0x3e00
+#define HPET_TN_FSB 0x4000
+#define HPET_TN_FSB_CAP 0x8000
+#define HPET_TN_ROUTE_SHIFT 9
+
+/* Max HPET Period is 10^8 femto sec as in HPET spec */
+#define HPET_MAX_PERIOD 100000000UL
+/*
+ * Min HPET period is 10^5 femto sec just for safety. If it is less than this,
+ * then 32 bit HPET counter wrapsaround in less than 0.5 sec.
+ */
+#define HPET_MIN_PERIOD 100000UL
+
+/* hpet memory map physical address */
+extern unsigned long hpet_address;
+extern unsigned long force_hpet_address;
+extern bool boot_hpet_disable;
+extern u8 hpet_blockid;
+extern bool hpet_force_user;
+extern bool hpet_msi_disable;
+extern int is_hpet_enabled(void);
+extern int hpet_enable(void);
+extern void hpet_disable(void);
+extern unsigned int hpet_readl(unsigned int a);
+extern void force_hpet_resume(void);
+
+struct irq_data;
+struct hpet_dev;
+struct irq_domain;
+
+extern void hpet_msi_unmask(struct irq_data *data);
+extern void hpet_msi_mask(struct irq_data *data);
+extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg);
+extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg);
+extern struct irq_domain *hpet_create_irq_domain(int hpet_id);
+extern int hpet_assign_irq(struct irq_domain *domain,
+ struct hpet_dev *dev, int dev_num);
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+
+#include <linux/interrupt.h>
+
+typedef irqreturn_t (*rtc_irq_handler)(int interrupt, void *cookie);
+extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask);
+extern int hpet_set_rtc_irq_bit(unsigned long bit_mask);
+extern int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
+ unsigned char sec);
+extern int hpet_set_periodic_freq(unsigned long freq);
+extern int hpet_rtc_dropped_irq(void);
+extern int hpet_rtc_timer_init(void);
+extern irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id);
+extern int hpet_register_irq_handler(rtc_irq_handler handler);
+extern void hpet_unregister_irq_handler(rtc_irq_handler handler);
+
+#endif /* CONFIG_HPET_EMULATE_RTC */
+
+#else /* CONFIG_HPET_TIMER */
+
+static inline int hpet_enable(void) { return 0; }
+static inline int is_hpet_enabled(void) { return 0; }
+#define hpet_readl(a) 0
+#define default_setup_hpet_msi NULL
+
+#endif
+#endif /* _ASM_X86_HPET_H */
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
new file mode 100644
index 000000000..5ed826da5
--- /dev/null
+++ b/arch/x86/include/asm/hugetlb.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_HUGETLB_H
+#define _ASM_X86_HUGETLB_H
+
+#include <asm/page.h>
+#include <asm-generic/hugetlb.h>
+
+#define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE)
+
+static inline int is_hugepage_only_range(struct mm_struct *mm,
+ unsigned long addr,
+ unsigned long len) {
+ return 0;
+}
+
+/*
+ * If the arch doesn't supply something else, assume that hugepage
+ * size aligned regions are ok without further preparation.
+ */
+static inline int prepare_hugepage_range(struct file *file,
+ unsigned long addr, unsigned long len)
+{
+ struct hstate *h = hstate_file(file);
+ if (len & ~huge_page_mask(h))
+ return -EINVAL;
+ if (addr & ~huge_page_mask(h))
+ return -EINVAL;
+ return 0;
+}
+
+static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
+ unsigned long addr, unsigned long end,
+ unsigned long floor,
+ unsigned long ceiling)
+{
+ free_pgd_range(tlb, addr, end, floor, ceiling);
+}
+
+static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ set_pte_at(mm, addr, ptep, pte);
+}
+
+static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ return ptep_get_and_clear(mm, addr, ptep);
+}
+
+static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ ptep_clear_flush(vma, addr, ptep);
+}
+
+static inline int huge_pte_none(pte_t pte)
+{
+ return pte_none(pte);
+}
+
+static inline pte_t huge_pte_wrprotect(pte_t pte)
+{
+ return pte_wrprotect(pte);
+}
+
+static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ ptep_set_wrprotect(mm, addr, ptep);
+}
+
+static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t pte, int dirty)
+{
+ return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
+}
+
+static inline pte_t huge_ptep_get(pte_t *ptep)
+{
+ return *ptep;
+}
+
+static inline void arch_clear_hugepage_flags(struct page *page)
+{
+}
+
+#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
+static inline bool gigantic_page_supported(void) { return true; }
+#endif
+
+#endif /* _ASM_X86_HUGETLB_H */
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
new file mode 100644
index 000000000..a1f0e90d0
--- /dev/null
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _I386_HW_BREAKPOINT_H
+#define _I386_HW_BREAKPOINT_H
+
+#include <uapi/asm/hw_breakpoint.h>
+
+#define __ARCH_HW_BREAKPOINT_H
+
+/*
+ * The name should probably be something dealt in
+ * a higher level. While dealing with the user
+ * (display/resolving)
+ */
+struct arch_hw_breakpoint {
+ unsigned long address;
+ unsigned long mask;
+ u8 len;
+ u8 type;
+};
+
+#include <linux/kdebug.h>
+#include <linux/percpu.h>
+#include <linux/list.h>
+
+/* Available HW breakpoint length encodings */
+#define X86_BREAKPOINT_LEN_X 0x40
+#define X86_BREAKPOINT_LEN_1 0x40
+#define X86_BREAKPOINT_LEN_2 0x44
+#define X86_BREAKPOINT_LEN_4 0x4c
+
+#ifdef CONFIG_X86_64
+#define X86_BREAKPOINT_LEN_8 0x48
+#endif
+
+/* Available HW breakpoint type encodings */
+
+/* trigger on instruction execute */
+#define X86_BREAKPOINT_EXECUTE 0x80
+/* trigger on memory write */
+#define X86_BREAKPOINT_WRITE 0x81
+/* trigger on memory read or write */
+#define X86_BREAKPOINT_RW 0x83
+
+/* Total number of available HW breakpoint registers */
+#define HBP_NUM 4
+
+static inline int hw_breakpoint_slots(int type)
+{
+ return HBP_NUM;
+}
+
+struct perf_event_attr;
+struct perf_event;
+struct pmu;
+
+extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw);
+extern int hw_breakpoint_arch_parse(struct perf_event *bp,
+ const struct perf_event_attr *attr,
+ struct arch_hw_breakpoint *hw);
+extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
+ unsigned long val, void *data);
+
+
+int arch_install_hw_breakpoint(struct perf_event *bp);
+void arch_uninstall_hw_breakpoint(struct perf_event *bp);
+void hw_breakpoint_pmu_read(struct perf_event *bp);
+void hw_breakpoint_pmu_unthrottle(struct perf_event *bp);
+
+extern void
+arch_fill_perf_breakpoint(struct perf_event *bp);
+
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type);
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type);
+
+extern int arch_bp_generic_fields(int x86_len, int x86_type,
+ int *gen_len, int *gen_type);
+
+extern struct pmu perf_ops_bp;
+
+#endif /* _I386_HW_BREAKPOINT_H */
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
new file mode 100644
index 000000000..cbd97e22d
--- /dev/null
+++ b/arch/x86/include/asm/hw_irq.h
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_HW_IRQ_H
+#define _ASM_X86_HW_IRQ_H
+
+/*
+ * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
+ *
+ * moved some of the old arch/i386/kernel/irq.h to here. VY
+ *
+ * IRQ/IPI changes taken from work by Thomas Radke
+ * <tomsoft@informatik.tu-chemnitz.de>
+ *
+ * hacked by Andi Kleen for x86-64.
+ * unified by tglx
+ */
+
+#include <asm/irq_vectors.h>
+
+#define IRQ_MATRIX_BITS NR_VECTORS
+
+#ifndef __ASSEMBLY__
+
+#include <linux/percpu.h>
+#include <linux/profile.h>
+#include <linux/smp.h>
+
+#include <linux/atomic.h>
+#include <asm/irq.h>
+#include <asm/sections.h>
+
+/* Interrupt handlers registered during init_IRQ */
+extern asmlinkage void apic_timer_interrupt(void);
+extern asmlinkage void x86_platform_ipi(void);
+extern asmlinkage void kvm_posted_intr_ipi(void);
+extern asmlinkage void kvm_posted_intr_wakeup_ipi(void);
+extern asmlinkage void kvm_posted_intr_nested_ipi(void);
+extern asmlinkage void error_interrupt(void);
+extern asmlinkage void irq_work_interrupt(void);
+extern asmlinkage void uv_bau_message_intr1(void);
+
+extern asmlinkage void spurious_interrupt(void);
+extern asmlinkage void thermal_interrupt(void);
+extern asmlinkage void reschedule_interrupt(void);
+
+extern asmlinkage void irq_move_cleanup_interrupt(void);
+extern asmlinkage void reboot_interrupt(void);
+extern asmlinkage void threshold_interrupt(void);
+extern asmlinkage void deferred_error_interrupt(void);
+
+extern asmlinkage void call_function_interrupt(void);
+extern asmlinkage void call_function_single_interrupt(void);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+struct irq_data;
+struct pci_dev;
+struct msi_desc;
+
+enum irq_alloc_type {
+ X86_IRQ_ALLOC_TYPE_IOAPIC = 1,
+ X86_IRQ_ALLOC_TYPE_HPET,
+ X86_IRQ_ALLOC_TYPE_MSI,
+ X86_IRQ_ALLOC_TYPE_MSIX,
+ X86_IRQ_ALLOC_TYPE_DMAR,
+ X86_IRQ_ALLOC_TYPE_UV,
+};
+
+struct irq_alloc_info {
+ enum irq_alloc_type type;
+ u32 flags;
+ const struct cpumask *mask; /* CPU mask for vector allocation */
+ union {
+ int unused;
+#ifdef CONFIG_HPET_TIMER
+ struct {
+ int hpet_id;
+ int hpet_index;
+ void *hpet_data;
+ };
+#endif
+#ifdef CONFIG_PCI_MSI
+ struct {
+ struct pci_dev *msi_dev;
+ irq_hw_number_t msi_hwirq;
+ };
+#endif
+#ifdef CONFIG_X86_IO_APIC
+ struct {
+ int ioapic_id;
+ int ioapic_pin;
+ int ioapic_node;
+ u32 ioapic_trigger : 1;
+ u32 ioapic_polarity : 1;
+ u32 ioapic_valid : 1;
+ struct IO_APIC_route_entry *ioapic_entry;
+ };
+#endif
+#ifdef CONFIG_DMAR_TABLE
+ struct {
+ int dmar_id;
+ void *dmar_data;
+ };
+#endif
+#ifdef CONFIG_X86_UV
+ struct {
+ int uv_limit;
+ int uv_blade;
+ unsigned long uv_offset;
+ char *uv_name;
+ };
+#endif
+#if IS_ENABLED(CONFIG_VMD)
+ struct {
+ struct msi_desc *desc;
+ };
+#endif
+ };
+};
+
+struct irq_cfg {
+ unsigned int dest_apicid;
+ unsigned int vector;
+};
+
+extern struct irq_cfg *irq_cfg(unsigned int irq);
+extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data);
+extern void lock_vector_lock(void);
+extern void unlock_vector_lock(void);
+#ifdef CONFIG_SMP
+extern void send_cleanup_vector(struct irq_cfg *);
+extern void irq_complete_move(struct irq_cfg *cfg);
+#else
+static inline void send_cleanup_vector(struct irq_cfg *c) { }
+static inline void irq_complete_move(struct irq_cfg *c) { }
+#endif
+
+extern void apic_ack_edge(struct irq_data *data);
+#else /* CONFIG_X86_LOCAL_APIC */
+static inline void lock_vector_lock(void) {}
+static inline void unlock_vector_lock(void) {}
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+/* Statistics */
+extern atomic_t irq_err_count;
+extern atomic_t irq_mis_count;
+
+extern void elcr_set_level_irq(unsigned int irq);
+
+extern char irq_entries_start[];
+#ifdef CONFIG_TRACING
+#define trace_irq_entries_start irq_entries_start
+#endif
+
+extern char spurious_entries_start[];
+
+#define VECTOR_UNUSED NULL
+#define VECTOR_SHUTDOWN ((void *)~0UL)
+#define VECTOR_RETRIGGERED ((void *)~1UL)
+
+typedef struct irq_desc* vector_irq_t[NR_VECTORS];
+DECLARE_PER_CPU(vector_irq_t, vector_irq);
+
+#endif /* !ASSEMBLY_ */
+
+#endif /* _ASM_X86_HW_IRQ_H */
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
new file mode 100644
index 000000000..00e01d215
--- /dev/null
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -0,0 +1,771 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+/*
+ * This file contains definitions from Hyper-V Hypervisor Top-Level Functional
+ * Specification (TLFS):
+ * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs
+ */
+
+#ifndef _ASM_X86_HYPERV_TLFS_H
+#define _ASM_X86_HYPERV_TLFS_H
+
+#include <linux/types.h>
+
+/*
+ * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
+ * is set by CPUID(HvCpuIdFunctionVersionAndFeatures).
+ */
+#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000
+#define HYPERV_CPUID_INTERFACE 0x40000001
+#define HYPERV_CPUID_VERSION 0x40000002
+#define HYPERV_CPUID_FEATURES 0x40000003
+#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
+#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
+#define HYPERV_CPUID_NESTED_FEATURES 0x4000000A
+
+#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000
+#define HYPERV_CPUID_MIN 0x40000005
+#define HYPERV_CPUID_MAX 0x4000ffff
+
+/*
+ * Feature identification. EAX indicates which features are available
+ * to the partition based upon the current partition privileges.
+ */
+
+/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */
+#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0)
+/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/
+#define HV_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1)
+/* Partition reference TSC MSR is available */
+#define HV_MSR_REFERENCE_TSC_AVAILABLE (1 << 9)
+
+/* A partition's reference time stamp counter (TSC) page */
+#define HV_X64_MSR_REFERENCE_TSC 0x40000021
+
+/*
+ * There is a single feature flag that signifies if the partition has access
+ * to MSRs with local APIC and TSC frequencies.
+ */
+#define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11)
+
+/* AccessReenlightenmentControls privilege */
+#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13)
+
+/*
+ * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
+ * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
+ */
+#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2)
+/*
+ * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through
+ * HV_X64_MSR_STIMER3_COUNT) available
+ */
+#define HV_MSR_SYNTIMER_AVAILABLE (1 << 3)
+/*
+ * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR)
+ * are available
+ */
+#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4)
+/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/
+#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5)
+/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/
+#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6)
+/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/
+#define HV_X64_MSR_RESET_AVAILABLE (1 << 7)
+ /*
+ * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE,
+ * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE,
+ * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available
+ */
+#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8)
+
+/* Frequency MSRs available */
+#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE (1 << 8)
+
+/* Crash MSR available */
+#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE (1 << 10)
+
+/* stimer Direct Mode is available */
+#define HV_STIMER_DIRECT_MODE_AVAILABLE (1 << 19)
+
+/*
+ * Feature identification: EBX indicates which flags were specified at
+ * partition creation. The format is the same as the partition creation
+ * flag structure defined in section Partition Creation Flags.
+ */
+#define HV_X64_CREATE_PARTITIONS (1 << 0)
+#define HV_X64_ACCESS_PARTITION_ID (1 << 1)
+#define HV_X64_ACCESS_MEMORY_POOL (1 << 2)
+#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3)
+#define HV_X64_POST_MESSAGES (1 << 4)
+#define HV_X64_SIGNAL_EVENTS (1 << 5)
+#define HV_X64_CREATE_PORT (1 << 6)
+#define HV_X64_CONNECT_PORT (1 << 7)
+#define HV_X64_ACCESS_STATS (1 << 8)
+#define HV_X64_DEBUGGING (1 << 11)
+#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12)
+#define HV_X64_CONFIGURE_PROFILER (1 << 13)
+
+/*
+ * Feature identification. EDX indicates which miscellaneous features
+ * are available to the partition.
+ */
+/* The MWAIT instruction is available (per section MONITOR / MWAIT) */
+#define HV_X64_MWAIT_AVAILABLE (1 << 0)
+/* Guest debugging support is available */
+#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1)
+/* Performance Monitor support is available*/
+#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2)
+/* Support for physical CPU dynamic partitioning events is available*/
+#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3)
+/*
+ * Support for passing hypercall input parameter block via XMM
+ * registers is available
+ */
+#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4)
+/* Support for a virtual guest idle state is available */
+#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5)
+/* Guest crash data handler available */
+#define HV_X64_GUEST_CRASH_MSR_AVAILABLE (1 << 10)
+
+/*
+ * Implementation recommendations. Indicates which behaviors the hypervisor
+ * recommends the OS implement for optimal performance.
+ */
+ /*
+ * Recommend using hypercall for address space switches rather
+ * than MOV to CR3 instruction
+ */
+#define HV_X64_AS_SWITCH_RECOMMENDED (1 << 0)
+/* Recommend using hypercall for local TLB flushes rather
+ * than INVLPG or MOV to CR3 instructions */
+#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1)
+/*
+ * Recommend using hypercall for remote TLB flushes rather
+ * than inter-processor interrupts
+ */
+#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2)
+/*
+ * Recommend using MSRs for accessing APIC registers
+ * EOI, ICR and TPR rather than their memory-mapped counterparts
+ */
+#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3)
+/* Recommend using the hypervisor-provided MSR to initiate a system RESET */
+#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4)
+/*
+ * Recommend using relaxed timing for this partition. If used,
+ * the VM should disable any watchdog timeouts that rely on the
+ * timely delivery of external interrupts
+ */
+#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5)
+
+/*
+ * Recommend not using Auto End-Of-Interrupt feature
+ */
+#define HV_DEPRECATING_AEOI_RECOMMENDED (1 << 9)
+
+/*
+ * Recommend using cluster IPI hypercalls.
+ */
+#define HV_X64_CLUSTER_IPI_RECOMMENDED (1 << 10)
+
+/* Recommend using the newer ExProcessorMasks interface */
+#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11)
+
+/* Recommend using enlightened VMCS */
+#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED (1 << 14)
+
+/*
+ * Crash notification flags.
+ */
+#define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62)
+#define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63)
+
+/* MSR used to identify the guest OS. */
+#define HV_X64_MSR_GUEST_OS_ID 0x40000000
+
+/* MSR used to setup pages used to communicate with the hypervisor. */
+#define HV_X64_MSR_HYPERCALL 0x40000001
+
+/* MSR used to provide vcpu index */
+#define HV_X64_MSR_VP_INDEX 0x40000002
+
+/* MSR used to reset the guest OS. */
+#define HV_X64_MSR_RESET 0x40000003
+
+/* MSR used to provide vcpu runtime in 100ns units */
+#define HV_X64_MSR_VP_RUNTIME 0x40000010
+
+/* MSR used to read the per-partition time reference counter */
+#define HV_X64_MSR_TIME_REF_COUNT 0x40000020
+
+/* MSR used to retrieve the TSC frequency */
+#define HV_X64_MSR_TSC_FREQUENCY 0x40000022
+
+/* MSR used to retrieve the local APIC timer frequency */
+#define HV_X64_MSR_APIC_FREQUENCY 0x40000023
+
+/* Define the virtual APIC registers */
+#define HV_X64_MSR_EOI 0x40000070
+#define HV_X64_MSR_ICR 0x40000071
+#define HV_X64_MSR_TPR 0x40000072
+#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073
+
+/* Define synthetic interrupt controller model specific registers. */
+#define HV_X64_MSR_SCONTROL 0x40000080
+#define HV_X64_MSR_SVERSION 0x40000081
+#define HV_X64_MSR_SIEFP 0x40000082
+#define HV_X64_MSR_SIMP 0x40000083
+#define HV_X64_MSR_EOM 0x40000084
+#define HV_X64_MSR_SINT0 0x40000090
+#define HV_X64_MSR_SINT1 0x40000091
+#define HV_X64_MSR_SINT2 0x40000092
+#define HV_X64_MSR_SINT3 0x40000093
+#define HV_X64_MSR_SINT4 0x40000094
+#define HV_X64_MSR_SINT5 0x40000095
+#define HV_X64_MSR_SINT6 0x40000096
+#define HV_X64_MSR_SINT7 0x40000097
+#define HV_X64_MSR_SINT8 0x40000098
+#define HV_X64_MSR_SINT9 0x40000099
+#define HV_X64_MSR_SINT10 0x4000009A
+#define HV_X64_MSR_SINT11 0x4000009B
+#define HV_X64_MSR_SINT12 0x4000009C
+#define HV_X64_MSR_SINT13 0x4000009D
+#define HV_X64_MSR_SINT14 0x4000009E
+#define HV_X64_MSR_SINT15 0x4000009F
+
+/*
+ * Synthetic Timer MSRs. Four timers per vcpu.
+ */
+#define HV_X64_MSR_STIMER0_CONFIG 0x400000B0
+#define HV_X64_MSR_STIMER0_COUNT 0x400000B1
+#define HV_X64_MSR_STIMER1_CONFIG 0x400000B2
+#define HV_X64_MSR_STIMER1_COUNT 0x400000B3
+#define HV_X64_MSR_STIMER2_CONFIG 0x400000B4
+#define HV_X64_MSR_STIMER2_COUNT 0x400000B5
+#define HV_X64_MSR_STIMER3_CONFIG 0x400000B6
+#define HV_X64_MSR_STIMER3_COUNT 0x400000B7
+
+/* Hyper-V guest crash notification MSR's */
+#define HV_X64_MSR_CRASH_P0 0x40000100
+#define HV_X64_MSR_CRASH_P1 0x40000101
+#define HV_X64_MSR_CRASH_P2 0x40000102
+#define HV_X64_MSR_CRASH_P3 0x40000103
+#define HV_X64_MSR_CRASH_P4 0x40000104
+#define HV_X64_MSR_CRASH_CTL 0x40000105
+#define HV_X64_MSR_CRASH_CTL_NOTIFY (1ULL << 63)
+#define HV_X64_MSR_CRASH_PARAMS \
+ (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0))
+
+/*
+ * Declare the MSR used to setup pages used to communicate with the hypervisor.
+ */
+union hv_x64_msr_hypercall_contents {
+ u64 as_uint64;
+ struct {
+ u64 enable:1;
+ u64 reserved:11;
+ u64 guest_physical_address:52;
+ };
+};
+
+/*
+ * TSC page layout.
+ */
+struct ms_hyperv_tsc_page {
+ volatile u32 tsc_sequence;
+ u32 reserved1;
+ volatile u64 tsc_scale;
+ volatile s64 tsc_offset;
+ u64 reserved2[509];
+};
+
+/*
+ * The guest OS needs to register the guest ID with the hypervisor.
+ * The guest ID is a 64 bit entity and the structure of this ID is
+ * specified in the Hyper-V specification:
+ *
+ * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx
+ *
+ * While the current guideline does not specify how Linux guest ID(s)
+ * need to be generated, our plan is to publish the guidelines for
+ * Linux and other guest operating systems that currently are hosted
+ * on Hyper-V. The implementation here conforms to this yet
+ * unpublished guidelines.
+ *
+ *
+ * Bit(s)
+ * 63 - Indicates if the OS is Open Source or not; 1 is Open Source
+ * 62:56 - Os Type; Linux is 0x100
+ * 55:48 - Distro specific identification
+ * 47:16 - Linux kernel version number
+ * 15:0 - Distro specific identification
+ *
+ *
+ */
+
+#define HV_LINUX_VENDOR_ID 0x8100
+
+/* TSC emulation after migration */
+#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106
+
+/* Nested features (CPUID 0x4000000A) EAX */
+#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18)
+#define HV_X64_NESTED_MSR_BITMAP BIT(19)
+
+struct hv_reenlightenment_control {
+ __u64 vector:8;
+ __u64 reserved1:8;
+ __u64 enabled:1;
+ __u64 reserved2:15;
+ __u64 target_vp:32;
+};
+
+#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107
+#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108
+
+struct hv_tsc_emulation_control {
+ __u64 enabled:1;
+ __u64 reserved:63;
+};
+
+struct hv_tsc_emulation_status {
+ __u64 inprogress:1;
+ __u64 reserved:63;
+};
+
+#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001
+#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
+
+#define HV_IPI_LOW_VECTOR 0x10
+#define HV_IPI_HIGH_VECTOR 0xff
+
+/* Declare the various hypercall operations. */
+#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002
+#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003
+#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008
+#define HVCALL_SEND_IPI 0x000b
+#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013
+#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014
+#define HVCALL_SEND_IPI_EX 0x0015
+#define HVCALL_POST_MESSAGE 0x005c
+#define HVCALL_SIGNAL_EVENT 0x005d
+#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
+
+#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001
+#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
+
+/* Hyper-V Enlightened VMCS version mask in nested features CPUID */
+#define HV_X64_ENLIGHTENED_VMCS_VERSION 0xff
+
+#define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001
+#define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12
+
+#define HV_PROCESSOR_POWER_STATE_C0 0
+#define HV_PROCESSOR_POWER_STATE_C1 1
+#define HV_PROCESSOR_POWER_STATE_C2 2
+#define HV_PROCESSOR_POWER_STATE_C3 3
+
+#define HV_FLUSH_ALL_PROCESSORS BIT(0)
+#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1)
+#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2)
+#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3)
+
+enum HV_GENERIC_SET_FORMAT {
+ HV_GENERIC_SET_SPARSE_4K,
+ HV_GENERIC_SET_ALL,
+};
+
+#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0)
+#define HV_HYPERCALL_FAST_BIT BIT(16)
+#define HV_HYPERCALL_VARHEAD_OFFSET 17
+#define HV_HYPERCALL_REP_COMP_OFFSET 32
+#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32)
+#define HV_HYPERCALL_REP_START_OFFSET 48
+#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48)
+
+/* hypercall status code */
+#define HV_STATUS_SUCCESS 0
+#define HV_STATUS_INVALID_HYPERCALL_CODE 2
+#define HV_STATUS_INVALID_HYPERCALL_INPUT 3
+#define HV_STATUS_INVALID_ALIGNMENT 4
+#define HV_STATUS_INVALID_PARAMETER 5
+#define HV_STATUS_INSUFFICIENT_MEMORY 11
+#define HV_STATUS_INVALID_PORT_ID 17
+#define HV_STATUS_INVALID_CONNECTION_ID 18
+#define HV_STATUS_INSUFFICIENT_BUFFERS 19
+
+typedef struct _HV_REFERENCE_TSC_PAGE {
+ __u32 tsc_sequence;
+ __u32 res1;
+ __u64 tsc_scale;
+ __s64 tsc_offset;
+} HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE;
+
+/* Define the number of synthetic interrupt sources. */
+#define HV_SYNIC_SINT_COUNT (16)
+/* Define the expected SynIC version. */
+#define HV_SYNIC_VERSION_1 (0x1)
+/* Valid SynIC vectors are 16-255. */
+#define HV_SYNIC_FIRST_VALID_VECTOR (16)
+
+#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0)
+#define HV_SYNIC_SIMP_ENABLE (1ULL << 0)
+#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0)
+#define HV_SYNIC_SINT_MASKED (1ULL << 16)
+#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17)
+#define HV_SYNIC_SINT_VECTOR_MASK (0xFF)
+
+#define HV_SYNIC_STIMER_COUNT (4)
+
+/* Define synthetic interrupt controller message constants. */
+#define HV_MESSAGE_SIZE (256)
+#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240)
+#define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30)
+
+/* Define hypervisor message types. */
+enum hv_message_type {
+ HVMSG_NONE = 0x00000000,
+
+ /* Memory access messages. */
+ HVMSG_UNMAPPED_GPA = 0x80000000,
+ HVMSG_GPA_INTERCEPT = 0x80000001,
+
+ /* Timer notification messages. */
+ HVMSG_TIMER_EXPIRED = 0x80000010,
+
+ /* Error messages. */
+ HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020,
+ HVMSG_UNRECOVERABLE_EXCEPTION = 0x80000021,
+ HVMSG_UNSUPPORTED_FEATURE = 0x80000022,
+
+ /* Trace buffer complete messages. */
+ HVMSG_EVENTLOG_BUFFERCOMPLETE = 0x80000040,
+
+ /* Platform-specific processor intercept messages. */
+ HVMSG_X64_IOPORT_INTERCEPT = 0x80010000,
+ HVMSG_X64_MSR_INTERCEPT = 0x80010001,
+ HVMSG_X64_CPUID_INTERCEPT = 0x80010002,
+ HVMSG_X64_EXCEPTION_INTERCEPT = 0x80010003,
+ HVMSG_X64_APIC_EOI = 0x80010004,
+ HVMSG_X64_LEGACY_FP_ERROR = 0x80010005
+};
+
+/* Define synthetic interrupt controller message flags. */
+union hv_message_flags {
+ __u8 asu8;
+ struct {
+ __u8 msg_pending:1;
+ __u8 reserved:7;
+ };
+};
+
+/* Define port identifier type. */
+union hv_port_id {
+ __u32 asu32;
+ struct {
+ __u32 id:24;
+ __u32 reserved:8;
+ } u;
+};
+
+/* Define synthetic interrupt controller message header. */
+struct hv_message_header {
+ __u32 message_type;
+ __u8 payload_size;
+ union hv_message_flags message_flags;
+ __u8 reserved[2];
+ union {
+ __u64 sender;
+ union hv_port_id port;
+ };
+};
+
+/* Define synthetic interrupt controller message format. */
+struct hv_message {
+ struct hv_message_header header;
+ union {
+ __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT];
+ } u;
+};
+
+/* Define the synthetic interrupt message page layout. */
+struct hv_message_page {
+ struct hv_message sint_message[HV_SYNIC_SINT_COUNT];
+};
+
+/* Define timer message payload structure. */
+struct hv_timer_message_payload {
+ __u32 timer_index;
+ __u32 reserved;
+ __u64 expiration_time; /* When the timer expired */
+ __u64 delivery_time; /* When the message was delivered */
+};
+
+/* Define virtual processor assist page structure. */
+struct hv_vp_assist_page {
+ __u32 apic_assist;
+ __u32 reserved;
+ __u64 vtl_control[2];
+ __u64 nested_enlightenments_control[2];
+ __u32 enlighten_vmentry;
+ __u64 current_nested_vmcs;
+};
+
+struct hv_enlightened_vmcs {
+ u32 revision_id;
+ u32 abort;
+
+ u16 host_es_selector;
+ u16 host_cs_selector;
+ u16 host_ss_selector;
+ u16 host_ds_selector;
+ u16 host_fs_selector;
+ u16 host_gs_selector;
+ u16 host_tr_selector;
+
+ u64 host_ia32_pat;
+ u64 host_ia32_efer;
+
+ u64 host_cr0;
+ u64 host_cr3;
+ u64 host_cr4;
+
+ u64 host_ia32_sysenter_esp;
+ u64 host_ia32_sysenter_eip;
+ u64 host_rip;
+ u32 host_ia32_sysenter_cs;
+
+ u32 pin_based_vm_exec_control;
+ u32 vm_exit_controls;
+ u32 secondary_vm_exec_control;
+
+ u64 io_bitmap_a;
+ u64 io_bitmap_b;
+ u64 msr_bitmap;
+
+ u16 guest_es_selector;
+ u16 guest_cs_selector;
+ u16 guest_ss_selector;
+ u16 guest_ds_selector;
+ u16 guest_fs_selector;
+ u16 guest_gs_selector;
+ u16 guest_ldtr_selector;
+ u16 guest_tr_selector;
+
+ u32 guest_es_limit;
+ u32 guest_cs_limit;
+ u32 guest_ss_limit;
+ u32 guest_ds_limit;
+ u32 guest_fs_limit;
+ u32 guest_gs_limit;
+ u32 guest_ldtr_limit;
+ u32 guest_tr_limit;
+ u32 guest_gdtr_limit;
+ u32 guest_idtr_limit;
+
+ u32 guest_es_ar_bytes;
+ u32 guest_cs_ar_bytes;
+ u32 guest_ss_ar_bytes;
+ u32 guest_ds_ar_bytes;
+ u32 guest_fs_ar_bytes;
+ u32 guest_gs_ar_bytes;
+ u32 guest_ldtr_ar_bytes;
+ u32 guest_tr_ar_bytes;
+
+ u64 guest_es_base;
+ u64 guest_cs_base;
+ u64 guest_ss_base;
+ u64 guest_ds_base;
+ u64 guest_fs_base;
+ u64 guest_gs_base;
+ u64 guest_ldtr_base;
+ u64 guest_tr_base;
+ u64 guest_gdtr_base;
+ u64 guest_idtr_base;
+
+ u64 padding64_1[3];
+
+ u64 vm_exit_msr_store_addr;
+ u64 vm_exit_msr_load_addr;
+ u64 vm_entry_msr_load_addr;
+
+ u64 cr3_target_value0;
+ u64 cr3_target_value1;
+ u64 cr3_target_value2;
+ u64 cr3_target_value3;
+
+ u32 page_fault_error_code_mask;
+ u32 page_fault_error_code_match;
+
+ u32 cr3_target_count;
+ u32 vm_exit_msr_store_count;
+ u32 vm_exit_msr_load_count;
+ u32 vm_entry_msr_load_count;
+
+ u64 tsc_offset;
+ u64 virtual_apic_page_addr;
+ u64 vmcs_link_pointer;
+
+ u64 guest_ia32_debugctl;
+ u64 guest_ia32_pat;
+ u64 guest_ia32_efer;
+
+ u64 guest_pdptr0;
+ u64 guest_pdptr1;
+ u64 guest_pdptr2;
+ u64 guest_pdptr3;
+
+ u64 guest_pending_dbg_exceptions;
+ u64 guest_sysenter_esp;
+ u64 guest_sysenter_eip;
+
+ u32 guest_activity_state;
+ u32 guest_sysenter_cs;
+
+ u64 cr0_guest_host_mask;
+ u64 cr4_guest_host_mask;
+ u64 cr0_read_shadow;
+ u64 cr4_read_shadow;
+ u64 guest_cr0;
+ u64 guest_cr3;
+ u64 guest_cr4;
+ u64 guest_dr7;
+
+ u64 host_fs_base;
+ u64 host_gs_base;
+ u64 host_tr_base;
+ u64 host_gdtr_base;
+ u64 host_idtr_base;
+ u64 host_rsp;
+
+ u64 ept_pointer;
+
+ u16 virtual_processor_id;
+ u16 padding16[3];
+
+ u64 padding64_2[5];
+ u64 guest_physical_address;
+
+ u32 vm_instruction_error;
+ u32 vm_exit_reason;
+ u32 vm_exit_intr_info;
+ u32 vm_exit_intr_error_code;
+ u32 idt_vectoring_info_field;
+ u32 idt_vectoring_error_code;
+ u32 vm_exit_instruction_len;
+ u32 vmx_instruction_info;
+
+ u64 exit_qualification;
+ u64 exit_io_instruction_ecx;
+ u64 exit_io_instruction_esi;
+ u64 exit_io_instruction_edi;
+ u64 exit_io_instruction_eip;
+
+ u64 guest_linear_address;
+ u64 guest_rsp;
+ u64 guest_rflags;
+
+ u32 guest_interruptibility_info;
+ u32 cpu_based_vm_exec_control;
+ u32 exception_bitmap;
+ u32 vm_entry_controls;
+ u32 vm_entry_intr_info_field;
+ u32 vm_entry_exception_error_code;
+ u32 vm_entry_instruction_len;
+ u32 tpr_threshold;
+
+ u64 guest_rip;
+
+ u32 hv_clean_fields;
+ u32 hv_padding_32;
+ u32 hv_synthetic_controls;
+ struct {
+ u32 nested_flush_hypercall:1;
+ u32 msr_bitmap:1;
+ u32 reserved:30;
+ } hv_enlightenments_control;
+ u32 hv_vp_id;
+
+ u64 hv_vm_id;
+ u64 partition_assist_page;
+ u64 padding64_4[4];
+ u64 guest_bndcfgs;
+ u64 padding64_5[7];
+ u64 xss_exit_bitmap;
+ u64 padding64_6[7];
+};
+
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP BIT(1)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2 BIT(2)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1 BIT(3)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC BIT(4)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT BIT(5)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY BIT(6)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN BIT(7)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR BIT(8)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT BIT(9)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC BIT(10)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1 BIT(11)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2 BIT(12)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER BIT(13)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1 BIT(14)
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL BIT(15)
+
+#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF
+
+#define HV_STIMER_ENABLE (1ULL << 0)
+#define HV_STIMER_PERIODIC (1ULL << 1)
+#define HV_STIMER_LAZY (1ULL << 2)
+#define HV_STIMER_AUTOENABLE (1ULL << 3)
+#define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F)
+
+struct hv_vpset {
+ u64 format;
+ u64 valid_bank_mask;
+ u64 bank_contents[];
+};
+
+/* HvCallSendSyntheticClusterIpi hypercall */
+struct hv_send_ipi {
+ u32 vector;
+ u32 reserved;
+ u64 cpu_mask;
+};
+
+/* HvCallSendSyntheticClusterIpiEx hypercall */
+struct hv_send_ipi_ex {
+ u32 vector;
+ u32 reserved;
+ struct hv_vpset vp_set;
+};
+
+/* HvFlushGuestPhysicalAddressSpace hypercalls */
+struct hv_guest_mapping_flush {
+ u64 address_space;
+ u64 flags;
+};
+
+/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */
+struct hv_tlb_flush {
+ u64 address_space;
+ u64 flags;
+ u64 processor_mask;
+ u64 gva_list[];
+};
+
+/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */
+struct hv_tlb_flush_ex {
+ u64 address_space;
+ u64 flags;
+ struct hv_vpset hv_vp_set;
+ u64 gva_list[];
+};
+
+#endif
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
new file mode 100644
index 000000000..8c5aaba66
--- /dev/null
+++ b/arch/x86/include/asm/hypervisor.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2008, VMware, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#ifndef _ASM_X86_HYPERVISOR_H
+#define _ASM_X86_HYPERVISOR_H
+
+/* x86 hypervisor types */
+enum x86_hypervisor_type {
+ X86_HYPER_NATIVE = 0,
+ X86_HYPER_VMWARE,
+ X86_HYPER_MS_HYPERV,
+ X86_HYPER_XEN_PV,
+ X86_HYPER_XEN_HVM,
+ X86_HYPER_KVM,
+ X86_HYPER_JAILHOUSE,
+};
+
+#ifdef CONFIG_HYPERVISOR_GUEST
+
+#include <asm/kvm_para.h>
+#include <asm/x86_init.h>
+#include <asm/xen/hypervisor.h>
+
+struct hypervisor_x86 {
+ /* Hypervisor name */
+ const char *name;
+
+ /* Detection routine */
+ uint32_t (*detect)(void);
+
+ /* Hypervisor type */
+ enum x86_hypervisor_type type;
+
+ /* init time callbacks */
+ struct x86_hyper_init init;
+
+ /* runtime callbacks */
+ struct x86_hyper_runtime runtime;
+};
+
+extern enum x86_hypervisor_type x86_hyper_type;
+extern void init_hypervisor_platform(void);
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
+{
+ return x86_hyper_type == type;
+}
+#else
+static inline void init_hypervisor_platform(void) { }
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
+{
+ return type == X86_HYPER_NATIVE;
+}
+#endif /* CONFIG_HYPERVISOR_GUEST */
+#endif /* _ASM_X86_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
new file mode 100644
index 000000000..89789e8c8
--- /dev/null
+++ b/arch/x86/include/asm/i8259.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_I8259_H
+#define _ASM_X86_I8259_H
+
+#include <linux/delay.h>
+#include <asm/io.h>
+
+extern unsigned int cached_irq_mask;
+
+#define __byte(x, y) (((unsigned char *)&(y))[x])
+#define cached_master_mask (__byte(0, cached_irq_mask))
+#define cached_slave_mask (__byte(1, cached_irq_mask))
+
+/* i8259A PIC registers */
+#define PIC_MASTER_CMD 0x20
+#define PIC_MASTER_IMR 0x21
+#define PIC_MASTER_ISR PIC_MASTER_CMD
+#define PIC_MASTER_POLL PIC_MASTER_ISR
+#define PIC_MASTER_OCW3 PIC_MASTER_ISR
+#define PIC_SLAVE_CMD 0xa0
+#define PIC_SLAVE_IMR 0xa1
+
+/* i8259A PIC related value */
+#define PIC_CASCADE_IR 2
+#define MASTER_ICW4_DEFAULT 0x01
+#define SLAVE_ICW4_DEFAULT 0x01
+#define PIC_ICW4_AEOI 2
+
+extern raw_spinlock_t i8259A_lock;
+
+/* the PIC may need a careful delay on some platforms, hence specific calls */
+static inline unsigned char inb_pic(unsigned int port)
+{
+ unsigned char value = inb(port);
+
+ /*
+ * delay for some accesses to PIC on motherboard or in chipset
+ * must be at least one microsecond, so be safe here:
+ */
+ udelay(2);
+
+ return value;
+}
+
+static inline void outb_pic(unsigned char value, unsigned int port)
+{
+ outb(value, port);
+ /*
+ * delay for some accesses to PIC on motherboard or in chipset
+ * must be at least one microsecond, so be safe here:
+ */
+ udelay(2);
+}
+
+extern struct irq_chip i8259A_chip;
+
+struct legacy_pic {
+ int nr_legacy_irqs;
+ struct irq_chip *chip;
+ void (*mask)(unsigned int irq);
+ void (*unmask)(unsigned int irq);
+ void (*mask_all)(void);
+ void (*restore_mask)(void);
+ void (*init)(int auto_eoi);
+ int (*probe)(void);
+ int (*irq_pending)(unsigned int irq);
+ void (*make_irq)(unsigned int irq);
+};
+
+extern struct legacy_pic *legacy_pic;
+extern struct legacy_pic null_legacy_pic;
+
+static inline bool has_legacy_pic(void)
+{
+ return legacy_pic != &null_legacy_pic;
+}
+
+static inline int nr_legacy_irqs(void)
+{
+ return legacy_pic->nr_legacy_irqs;
+}
+
+#endif /* _ASM_X86_I8259_H */
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
new file mode 100644
index 000000000..2c5f7861d
--- /dev/null
+++ b/arch/x86/include/asm/ia32.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_IA32_H
+#define _ASM_X86_IA32_H
+
+
+#ifdef CONFIG_IA32_EMULATION
+
+#include <linux/compat.h>
+
+/*
+ * 32 bit structures for IA32 support.
+ */
+
+#include <uapi/asm/sigcontext.h>
+
+/* signal.h */
+
+struct ucontext_ia32 {
+ unsigned int uc_flags;
+ unsigned int uc_link;
+ compat_stack_t uc_stack;
+ struct sigcontext_32 uc_mcontext;
+ compat_sigset_t uc_sigmask; /* mask last for extensibility */
+};
+
+/* This matches struct stat64 in glibc2.2, hence the absolutely
+ * insane amounts of padding around dev_t's.
+ */
+struct stat64 {
+ unsigned long long st_dev;
+ unsigned char __pad0[4];
+
+#define STAT64_HAS_BROKEN_ST_INO 1
+ unsigned int __st_ino;
+
+ unsigned int st_mode;
+ unsigned int st_nlink;
+
+ unsigned int st_uid;
+ unsigned int st_gid;
+
+ unsigned long long st_rdev;
+ unsigned char __pad3[4];
+
+ long long st_size;
+ unsigned int st_blksize;
+
+ long long st_blocks;/* Number 512-byte blocks allocated */
+
+ unsigned st_atime;
+ unsigned st_atime_nsec;
+ unsigned st_mtime;
+ unsigned st_mtime_nsec;
+ unsigned st_ctime;
+ unsigned st_ctime_nsec;
+
+ unsigned long long st_ino;
+} __attribute__((packed));
+
+#define IA32_STACK_TOP IA32_PAGE_OFFSET
+
+#ifdef __KERNEL__
+struct linux_binprm;
+extern int ia32_setup_arg_pages(struct linux_binprm *bprm,
+ unsigned long stack_top, int exec_stack);
+struct mm_struct;
+extern void ia32_pick_mmap_layout(struct mm_struct *mm);
+
+#endif
+
+#endif /* !CONFIG_IA32_SUPPORT */
+
+#endif /* _ASM_X86_IA32_H */
diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h
new file mode 100644
index 000000000..aa065c94c
--- /dev/null
+++ b/arch/x86/include/asm/ia32_unistd.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_IA32_UNISTD_H
+#define _ASM_X86_IA32_UNISTD_H
+
+/*
+ * This file contains the system call numbers of the ia32 compat ABI,
+ * this is for the kernel only.
+ */
+#define __SYSCALL_ia32_NR(x) (x)
+#include <asm/unistd_32_ia32.h>
+
+#endif /* _ASM_X86_IA32_UNISTD_H */
diff --git a/arch/x86/include/asm/imr.h b/arch/x86/include/asm/imr.h
new file mode 100644
index 000000000..ebea2c9d2
--- /dev/null
+++ b/arch/x86/include/asm/imr.h
@@ -0,0 +1,60 @@
+/*
+ * imr.h: Isolated Memory Region API
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#ifndef _IMR_H
+#define _IMR_H
+
+#include <linux/types.h>
+
+/*
+ * IMR agent access mask bits
+ * See section 12.7.4.7 from quark-x1000-datasheet.pdf for register
+ * definitions.
+ */
+#define IMR_ESRAM_FLUSH BIT(31)
+#define IMR_CPU_SNOOP BIT(30) /* Applicable only to write */
+#define IMR_RMU BIT(29)
+#define IMR_VC1_SAI_ID3 BIT(15)
+#define IMR_VC1_SAI_ID2 BIT(14)
+#define IMR_VC1_SAI_ID1 BIT(13)
+#define IMR_VC1_SAI_ID0 BIT(12)
+#define IMR_VC0_SAI_ID3 BIT(11)
+#define IMR_VC0_SAI_ID2 BIT(10)
+#define IMR_VC0_SAI_ID1 BIT(9)
+#define IMR_VC0_SAI_ID0 BIT(8)
+#define IMR_CPU_0 BIT(1) /* SMM mode */
+#define IMR_CPU BIT(0) /* Non SMM mode */
+#define IMR_ACCESS_NONE 0
+
+/*
+ * Read/Write access-all bits here include some reserved bits
+ * These are the values firmware uses and are accepted by hardware.
+ * The kernel defines read/write access-all in the same way as firmware
+ * in order to have a consistent and crisp definition across firmware,
+ * bootloader and kernel.
+ */
+#define IMR_READ_ACCESS_ALL 0xBFFFFFFF
+#define IMR_WRITE_ACCESS_ALL 0xFFFFFFFF
+
+/* Number of IMRs provided by Quark X1000 SoC */
+#define QUARK_X1000_IMR_MAX 0x08
+#define QUARK_X1000_IMR_REGBASE 0x40
+
+/* IMR alignment bits - only bits 31:10 are checked for IMR validity */
+#define IMR_ALIGN 0x400
+#define IMR_MASK (IMR_ALIGN - 1)
+
+int imr_add_range(phys_addr_t base, size_t size,
+ unsigned int rmask, unsigned int wmask);
+
+int imr_remove_range(phys_addr_t base, size_t size);
+
+#endif /* _IMR_H */
diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
new file mode 100644
index 000000000..1c78580e5
--- /dev/null
+++ b/arch/x86/include/asm/inat.h
@@ -0,0 +1,244 @@
+#ifndef _ASM_X86_INAT_H
+#define _ASM_X86_INAT_H
+/*
+ * x86 instruction attributes
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+#include <asm/inat_types.h>
+
+/*
+ * Internal bits. Don't use bitmasks directly, because these bits are
+ * unstable. You should use checking functions.
+ */
+
+#define INAT_OPCODE_TABLE_SIZE 256
+#define INAT_GROUP_TABLE_SIZE 8
+
+/* Legacy last prefixes */
+#define INAT_PFX_OPNDSZ 1 /* 0x66 */ /* LPFX1 */
+#define INAT_PFX_REPE 2 /* 0xF3 */ /* LPFX2 */
+#define INAT_PFX_REPNE 3 /* 0xF2 */ /* LPFX3 */
+/* Other Legacy prefixes */
+#define INAT_PFX_LOCK 4 /* 0xF0 */
+#define INAT_PFX_CS 5 /* 0x2E */
+#define INAT_PFX_DS 6 /* 0x3E */
+#define INAT_PFX_ES 7 /* 0x26 */
+#define INAT_PFX_FS 8 /* 0x64 */
+#define INAT_PFX_GS 9 /* 0x65 */
+#define INAT_PFX_SS 10 /* 0x36 */
+#define INAT_PFX_ADDRSZ 11 /* 0x67 */
+/* x86-64 REX prefix */
+#define INAT_PFX_REX 12 /* 0x4X */
+/* AVX VEX prefixes */
+#define INAT_PFX_VEX2 13 /* 2-bytes VEX prefix */
+#define INAT_PFX_VEX3 14 /* 3-bytes VEX prefix */
+#define INAT_PFX_EVEX 15 /* EVEX prefix */
+
+#define INAT_LSTPFX_MAX 3
+#define INAT_LGCPFX_MAX 11
+
+/* Immediate size */
+#define INAT_IMM_BYTE 1
+#define INAT_IMM_WORD 2
+#define INAT_IMM_DWORD 3
+#define INAT_IMM_QWORD 4
+#define INAT_IMM_PTR 5
+#define INAT_IMM_VWORD32 6
+#define INAT_IMM_VWORD 7
+
+/* Legacy prefix */
+#define INAT_PFX_OFFS 0
+#define INAT_PFX_BITS 4
+#define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1)
+#define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS)
+/* Escape opcodes */
+#define INAT_ESC_OFFS (INAT_PFX_OFFS + INAT_PFX_BITS)
+#define INAT_ESC_BITS 2
+#define INAT_ESC_MAX ((1 << INAT_ESC_BITS) - 1)
+#define INAT_ESC_MASK (INAT_ESC_MAX << INAT_ESC_OFFS)
+/* Group opcodes (1-16) */
+#define INAT_GRP_OFFS (INAT_ESC_OFFS + INAT_ESC_BITS)
+#define INAT_GRP_BITS 5
+#define INAT_GRP_MAX ((1 << INAT_GRP_BITS) - 1)
+#define INAT_GRP_MASK (INAT_GRP_MAX << INAT_GRP_OFFS)
+/* Immediates */
+#define INAT_IMM_OFFS (INAT_GRP_OFFS + INAT_GRP_BITS)
+#define INAT_IMM_BITS 3
+#define INAT_IMM_MASK (((1 << INAT_IMM_BITS) - 1) << INAT_IMM_OFFS)
+/* Flags */
+#define INAT_FLAG_OFFS (INAT_IMM_OFFS + INAT_IMM_BITS)
+#define INAT_MODRM (1 << (INAT_FLAG_OFFS))
+#define INAT_FORCE64 (1 << (INAT_FLAG_OFFS + 1))
+#define INAT_SCNDIMM (1 << (INAT_FLAG_OFFS + 2))
+#define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3))
+#define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4))
+#define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5))
+#define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6))
+#define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7))
+/* Attribute making macros for attribute tables */
+#define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS)
+#define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS)
+#define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM)
+#define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS)
+
+/* Identifiers for segment registers */
+#define INAT_SEG_REG_IGNORE 0
+#define INAT_SEG_REG_DEFAULT 1
+#define INAT_SEG_REG_CS 2
+#define INAT_SEG_REG_SS 3
+#define INAT_SEG_REG_DS 4
+#define INAT_SEG_REG_ES 5
+#define INAT_SEG_REG_FS 6
+#define INAT_SEG_REG_GS 7
+
+/* Attribute search APIs */
+extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode);
+extern int inat_get_last_prefix_id(insn_byte_t last_pfx);
+extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode,
+ int lpfx_id,
+ insn_attr_t esc_attr);
+extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm,
+ int lpfx_id,
+ insn_attr_t esc_attr);
+extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode,
+ insn_byte_t vex_m,
+ insn_byte_t vex_pp);
+
+/* Attribute checking functions */
+static inline int inat_is_legacy_prefix(insn_attr_t attr)
+{
+ attr &= INAT_PFX_MASK;
+ return attr && attr <= INAT_LGCPFX_MAX;
+}
+
+static inline int inat_is_address_size_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_ADDRSZ;
+}
+
+static inline int inat_is_operand_size_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ;
+}
+
+static inline int inat_is_rex_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_REX;
+}
+
+static inline int inat_last_prefix_id(insn_attr_t attr)
+{
+ if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX)
+ return 0;
+ else
+ return attr & INAT_PFX_MASK;
+}
+
+static inline int inat_is_vex_prefix(insn_attr_t attr)
+{
+ attr &= INAT_PFX_MASK;
+ return attr == INAT_PFX_VEX2 || attr == INAT_PFX_VEX3 ||
+ attr == INAT_PFX_EVEX;
+}
+
+static inline int inat_is_evex_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_EVEX;
+}
+
+static inline int inat_is_vex3_prefix(insn_attr_t attr)
+{
+ return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3;
+}
+
+static inline int inat_is_escape(insn_attr_t attr)
+{
+ return attr & INAT_ESC_MASK;
+}
+
+static inline int inat_escape_id(insn_attr_t attr)
+{
+ return (attr & INAT_ESC_MASK) >> INAT_ESC_OFFS;
+}
+
+static inline int inat_is_group(insn_attr_t attr)
+{
+ return attr & INAT_GRP_MASK;
+}
+
+static inline int inat_group_id(insn_attr_t attr)
+{
+ return (attr & INAT_GRP_MASK) >> INAT_GRP_OFFS;
+}
+
+static inline int inat_group_common_attribute(insn_attr_t attr)
+{
+ return attr & ~INAT_GRP_MASK;
+}
+
+static inline int inat_has_immediate(insn_attr_t attr)
+{
+ return attr & INAT_IMM_MASK;
+}
+
+static inline int inat_immediate_size(insn_attr_t attr)
+{
+ return (attr & INAT_IMM_MASK) >> INAT_IMM_OFFS;
+}
+
+static inline int inat_has_modrm(insn_attr_t attr)
+{
+ return attr & INAT_MODRM;
+}
+
+static inline int inat_is_force64(insn_attr_t attr)
+{
+ return attr & INAT_FORCE64;
+}
+
+static inline int inat_has_second_immediate(insn_attr_t attr)
+{
+ return attr & INAT_SCNDIMM;
+}
+
+static inline int inat_has_moffset(insn_attr_t attr)
+{
+ return attr & INAT_MOFFSET;
+}
+
+static inline int inat_has_variant(insn_attr_t attr)
+{
+ return attr & INAT_VARIANT;
+}
+
+static inline int inat_accept_vex(insn_attr_t attr)
+{
+ return attr & INAT_VEXOK;
+}
+
+static inline int inat_must_vex(insn_attr_t attr)
+{
+ return attr & (INAT_VEXONLY | INAT_EVEXONLY);
+}
+
+static inline int inat_must_evex(insn_attr_t attr)
+{
+ return attr & INAT_EVEXONLY;
+}
+#endif
diff --git a/arch/x86/include/asm/inat_types.h b/arch/x86/include/asm/inat_types.h
new file mode 100644
index 000000000..cb3c20ce3
--- /dev/null
+++ b/arch/x86/include/asm/inat_types.h
@@ -0,0 +1,29 @@
+#ifndef _ASM_X86_INAT_TYPES_H
+#define _ASM_X86_INAT_TYPES_H
+/*
+ * x86 instruction attributes
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+/* Instruction attributes */
+typedef unsigned int insn_attr_t;
+typedef unsigned char insn_byte_t;
+typedef signed int insn_value_t;
+
+#endif
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
new file mode 100644
index 000000000..5f1d3c421
--- /dev/null
+++ b/arch/x86/include/asm/init.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INIT_H
+#define _ASM_X86_INIT_H
+
+struct x86_mapping_info {
+ void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
+ void *context; /* context for alloc_pgt_page */
+ unsigned long page_flag; /* page flag for PMD or PUD entry */
+ unsigned long offset; /* ident mapping offset */
+ bool direct_gbpages; /* PUD level 1GB page support */
+ unsigned long kernpg_flag; /* kernel pagetable flag override */
+};
+
+int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
+ unsigned long pstart, unsigned long pend);
+
+#endif /* _ASM_X86_INIT_H */
diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h
new file mode 100644
index 000000000..2b6ccf2c4
--- /dev/null
+++ b/arch/x86/include/asm/insn-eval.h
@@ -0,0 +1,23 @@
+#ifndef _ASM_X86_INSN_EVAL_H
+#define _ASM_X86_INSN_EVAL_H
+/*
+ * A collection of utility functions for x86 instruction analysis to be
+ * used in a kernel context. Useful when, for instance, making sense
+ * of the registers indicated by operands.
+ */
+
+#include <linux/compiler.h>
+#include <linux/bug.h>
+#include <linux/err.h>
+#include <asm/ptrace.h>
+
+#define INSN_CODE_SEG_ADDR_SZ(params) ((params >> 4) & 0xf)
+#define INSN_CODE_SEG_OPND_SZ(params) (params & 0xf)
+#define INSN_CODE_SEG_PARAMS(oper_sz, addr_sz) (oper_sz | (addr_sz << 4))
+
+void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs);
+int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs);
+unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx);
+int insn_get_code_seg_params(struct pt_regs *regs);
+
+#endif /* _ASM_X86_INSN_EVAL_H */
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
new file mode 100644
index 000000000..3e0e18d37
--- /dev/null
+++ b/arch/x86/include/asm/insn.h
@@ -0,0 +1,244 @@
+#ifndef _ASM_X86_INSN_H
+#define _ASM_X86_INSN_H
+/*
+ * x86 instruction analysis
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+/* insn_attr_t is defined in inat.h */
+#include <asm/inat.h>
+
+struct insn_field {
+ union {
+ insn_value_t value;
+ insn_byte_t bytes[4];
+ };
+ /* !0 if we've run insn_get_xxx() for this field */
+ unsigned char got;
+ unsigned char nbytes;
+};
+
+struct insn {
+ struct insn_field prefixes; /*
+ * Prefixes
+ * prefixes.bytes[3]: last prefix
+ */
+ struct insn_field rex_prefix; /* REX prefix */
+ struct insn_field vex_prefix; /* VEX prefix */
+ struct insn_field opcode; /*
+ * opcode.bytes[0]: opcode1
+ * opcode.bytes[1]: opcode2
+ * opcode.bytes[2]: opcode3
+ */
+ struct insn_field modrm;
+ struct insn_field sib;
+ struct insn_field displacement;
+ union {
+ struct insn_field immediate;
+ struct insn_field moffset1; /* for 64bit MOV */
+ struct insn_field immediate1; /* for 64bit imm or off16/32 */
+ };
+ union {
+ struct insn_field moffset2; /* for 64bit MOV */
+ struct insn_field immediate2; /* for 64bit imm or seg16 */
+ };
+
+ insn_attr_t attr;
+ unsigned char opnd_bytes;
+ unsigned char addr_bytes;
+ unsigned char length;
+ unsigned char x86_64;
+
+ const insn_byte_t *kaddr; /* kernel address of insn to analyze */
+ const insn_byte_t *end_kaddr; /* kernel address of last insn in buffer */
+ const insn_byte_t *next_byte;
+};
+
+#define MAX_INSN_SIZE 15
+
+#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
+#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
+#define X86_MODRM_RM(modrm) ((modrm) & 0x07)
+
+#define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6)
+#define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3)
+#define X86_SIB_BASE(sib) ((sib) & 0x07)
+
+#define X86_REX_W(rex) ((rex) & 8)
+#define X86_REX_R(rex) ((rex) & 4)
+#define X86_REX_X(rex) ((rex) & 2)
+#define X86_REX_B(rex) ((rex) & 1)
+
+/* VEX bit flags */
+#define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */
+#define X86_VEX_R(vex) ((vex) & 0x80) /* VEX2/3 Byte1 */
+#define X86_VEX_X(vex) ((vex) & 0x40) /* VEX3 Byte1 */
+#define X86_VEX_B(vex) ((vex) & 0x20) /* VEX3 Byte1 */
+#define X86_VEX_L(vex) ((vex) & 0x04) /* VEX3 Byte2, VEX2 Byte1 */
+/* VEX bit fields */
+#define X86_EVEX_M(vex) ((vex) & 0x03) /* EVEX Byte1 */
+#define X86_VEX3_M(vex) ((vex) & 0x1f) /* VEX3 Byte1 */
+#define X86_VEX2_M 1 /* VEX2.M always 1 */
+#define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */
+#define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */
+#define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */
+
+extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64);
+extern void insn_get_prefixes(struct insn *insn);
+extern void insn_get_opcode(struct insn *insn);
+extern void insn_get_modrm(struct insn *insn);
+extern void insn_get_sib(struct insn *insn);
+extern void insn_get_displacement(struct insn *insn);
+extern void insn_get_immediate(struct insn *insn);
+extern void insn_get_length(struct insn *insn);
+
+/* Attribute will be determined after getting ModRM (for opcode groups) */
+static inline void insn_get_attribute(struct insn *insn)
+{
+ insn_get_modrm(insn);
+}
+
+/* Instruction uses RIP-relative addressing */
+extern int insn_rip_relative(struct insn *insn);
+
+/* Init insn for kernel text */
+static inline void kernel_insn_init(struct insn *insn,
+ const void *kaddr, int buf_len)
+{
+#ifdef CONFIG_X86_64
+ insn_init(insn, kaddr, buf_len, 1);
+#else /* CONFIG_X86_32 */
+ insn_init(insn, kaddr, buf_len, 0);
+#endif
+}
+
+static inline int insn_is_avx(struct insn *insn)
+{
+ if (!insn->prefixes.got)
+ insn_get_prefixes(insn);
+ return (insn->vex_prefix.value != 0);
+}
+
+static inline int insn_is_evex(struct insn *insn)
+{
+ if (!insn->prefixes.got)
+ insn_get_prefixes(insn);
+ return (insn->vex_prefix.nbytes == 4);
+}
+
+/* Ensure this instruction is decoded completely */
+static inline int insn_complete(struct insn *insn)
+{
+ return insn->opcode.got && insn->modrm.got && insn->sib.got &&
+ insn->displacement.got && insn->immediate.got;
+}
+
+static inline insn_byte_t insn_vex_m_bits(struct insn *insn)
+{
+ if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
+ return X86_VEX2_M;
+ else if (insn->vex_prefix.nbytes == 3) /* 3 bytes VEX */
+ return X86_VEX3_M(insn->vex_prefix.bytes[1]);
+ else /* EVEX */
+ return X86_EVEX_M(insn->vex_prefix.bytes[1]);
+}
+
+static inline insn_byte_t insn_vex_p_bits(struct insn *insn)
+{
+ if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
+ return X86_VEX_P(insn->vex_prefix.bytes[1]);
+ else
+ return X86_VEX_P(insn->vex_prefix.bytes[2]);
+}
+
+/* Get the last prefix id from last prefix or VEX prefix */
+static inline int insn_last_prefix_id(struct insn *insn)
+{
+ if (insn_is_avx(insn))
+ return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */
+
+ if (insn->prefixes.bytes[3])
+ return inat_get_last_prefix_id(insn->prefixes.bytes[3]);
+
+ return 0;
+}
+
+/* Offset of each field from kaddr */
+static inline int insn_offset_rex_prefix(struct insn *insn)
+{
+ return insn->prefixes.nbytes;
+}
+static inline int insn_offset_vex_prefix(struct insn *insn)
+{
+ return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes;
+}
+static inline int insn_offset_opcode(struct insn *insn)
+{
+ return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes;
+}
+static inline int insn_offset_modrm(struct insn *insn)
+{
+ return insn_offset_opcode(insn) + insn->opcode.nbytes;
+}
+static inline int insn_offset_sib(struct insn *insn)
+{
+ return insn_offset_modrm(insn) + insn->modrm.nbytes;
+}
+static inline int insn_offset_displacement(struct insn *insn)
+{
+ return insn_offset_sib(insn) + insn->sib.nbytes;
+}
+static inline int insn_offset_immediate(struct insn *insn)
+{
+ return insn_offset_displacement(insn) + insn->displacement.nbytes;
+}
+
+/**
+ * for_each_insn_prefix() -- Iterate prefixes in the instruction
+ * @insn: Pointer to struct insn.
+ * @idx: Index storage.
+ * @prefix: Prefix byte.
+ *
+ * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix
+ * and the index is stored in @idx (note that this @idx is just for a cursor,
+ * do not change it.)
+ * Since prefixes.nbytes can be bigger than 4 if some prefixes
+ * are repeated, it cannot be used for looping over the prefixes.
+ */
+#define for_each_insn_prefix(insn, idx, prefix) \
+ for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++)
+
+#define POP_SS_OPCODE 0x1f
+#define MOV_SREG_OPCODE 0x8e
+
+/*
+ * Intel SDM Vol.3A 6.8.3 states;
+ * "Any single-step trap that would be delivered following the MOV to SS
+ * instruction or POP to SS instruction (because EFLAGS.TF is 1) is
+ * suppressed."
+ * This function returns true if @insn is MOV SS or POP SS. On these
+ * instructions, single stepping is suppressed.
+ */
+static inline int insn_masking_exception(struct insn *insn)
+{
+ return insn->opcode.bytes[0] == POP_SS_OPCODE ||
+ (insn->opcode.bytes[0] == MOV_SREG_OPCODE &&
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 2);
+}
+
+#endif /* _ASM_X86_INSN_H */
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
new file mode 100644
index 000000000..f5a796da0
--- /dev/null
+++ b/arch/x86/include/asm/inst.h
@@ -0,0 +1,311 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Generate .byte code for some instructions not supported by old
+ * binutils.
+ */
+#ifndef X86_ASM_INST_H
+#define X86_ASM_INST_H
+
+#ifdef __ASSEMBLY__
+
+#define REG_NUM_INVALID 100
+
+#define REG_TYPE_R32 0
+#define REG_TYPE_R64 1
+#define REG_TYPE_XMM 2
+#define REG_TYPE_INVALID 100
+
+ .macro R32_NUM opd r32
+ \opd = REG_NUM_INVALID
+ .ifc \r32,%eax
+ \opd = 0
+ .endif
+ .ifc \r32,%ecx
+ \opd = 1
+ .endif
+ .ifc \r32,%edx
+ \opd = 2
+ .endif
+ .ifc \r32,%ebx
+ \opd = 3
+ .endif
+ .ifc \r32,%esp
+ \opd = 4
+ .endif
+ .ifc \r32,%ebp
+ \opd = 5
+ .endif
+ .ifc \r32,%esi
+ \opd = 6
+ .endif
+ .ifc \r32,%edi
+ \opd = 7
+ .endif
+#ifdef CONFIG_X86_64
+ .ifc \r32,%r8d
+ \opd = 8
+ .endif
+ .ifc \r32,%r9d
+ \opd = 9
+ .endif
+ .ifc \r32,%r10d
+ \opd = 10
+ .endif
+ .ifc \r32,%r11d
+ \opd = 11
+ .endif
+ .ifc \r32,%r12d
+ \opd = 12
+ .endif
+ .ifc \r32,%r13d
+ \opd = 13
+ .endif
+ .ifc \r32,%r14d
+ \opd = 14
+ .endif
+ .ifc \r32,%r15d
+ \opd = 15
+ .endif
+#endif
+ .endm
+
+ .macro R64_NUM opd r64
+ \opd = REG_NUM_INVALID
+#ifdef CONFIG_X86_64
+ .ifc \r64,%rax
+ \opd = 0
+ .endif
+ .ifc \r64,%rcx
+ \opd = 1
+ .endif
+ .ifc \r64,%rdx
+ \opd = 2
+ .endif
+ .ifc \r64,%rbx
+ \opd = 3
+ .endif
+ .ifc \r64,%rsp
+ \opd = 4
+ .endif
+ .ifc \r64,%rbp
+ \opd = 5
+ .endif
+ .ifc \r64,%rsi
+ \opd = 6
+ .endif
+ .ifc \r64,%rdi
+ \opd = 7
+ .endif
+ .ifc \r64,%r8
+ \opd = 8
+ .endif
+ .ifc \r64,%r9
+ \opd = 9
+ .endif
+ .ifc \r64,%r10
+ \opd = 10
+ .endif
+ .ifc \r64,%r11
+ \opd = 11
+ .endif
+ .ifc \r64,%r12
+ \opd = 12
+ .endif
+ .ifc \r64,%r13
+ \opd = 13
+ .endif
+ .ifc \r64,%r14
+ \opd = 14
+ .endif
+ .ifc \r64,%r15
+ \opd = 15
+ .endif
+#endif
+ .endm
+
+ .macro XMM_NUM opd xmm
+ \opd = REG_NUM_INVALID
+ .ifc \xmm,%xmm0
+ \opd = 0
+ .endif
+ .ifc \xmm,%xmm1
+ \opd = 1
+ .endif
+ .ifc \xmm,%xmm2
+ \opd = 2
+ .endif
+ .ifc \xmm,%xmm3
+ \opd = 3
+ .endif
+ .ifc \xmm,%xmm4
+ \opd = 4
+ .endif
+ .ifc \xmm,%xmm5
+ \opd = 5
+ .endif
+ .ifc \xmm,%xmm6
+ \opd = 6
+ .endif
+ .ifc \xmm,%xmm7
+ \opd = 7
+ .endif
+ .ifc \xmm,%xmm8
+ \opd = 8
+ .endif
+ .ifc \xmm,%xmm9
+ \opd = 9
+ .endif
+ .ifc \xmm,%xmm10
+ \opd = 10
+ .endif
+ .ifc \xmm,%xmm11
+ \opd = 11
+ .endif
+ .ifc \xmm,%xmm12
+ \opd = 12
+ .endif
+ .ifc \xmm,%xmm13
+ \opd = 13
+ .endif
+ .ifc \xmm,%xmm14
+ \opd = 14
+ .endif
+ .ifc \xmm,%xmm15
+ \opd = 15
+ .endif
+ .endm
+
+ .macro REG_TYPE type reg
+ R32_NUM reg_type_r32 \reg
+ R64_NUM reg_type_r64 \reg
+ XMM_NUM reg_type_xmm \reg
+ .if reg_type_r64 <> REG_NUM_INVALID
+ \type = REG_TYPE_R64
+ .elseif reg_type_r32 <> REG_NUM_INVALID
+ \type = REG_TYPE_R32
+ .elseif reg_type_xmm <> REG_NUM_INVALID
+ \type = REG_TYPE_XMM
+ .else
+ \type = REG_TYPE_INVALID
+ .endif
+ .endm
+
+ .macro PFX_OPD_SIZE
+ .byte 0x66
+ .endm
+
+ .macro PFX_REX opd1 opd2 W=0
+ .if ((\opd1 | \opd2) & 8) || \W
+ .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3)
+ .endif
+ .endm
+
+ .macro MODRM mod opd1 opd2
+ .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
+ .endm
+
+ .macro PSHUFB_XMM xmm1 xmm2
+ XMM_NUM pshufb_opd1 \xmm1
+ XMM_NUM pshufb_opd2 \xmm2
+ PFX_OPD_SIZE
+ PFX_REX pshufb_opd1 pshufb_opd2
+ .byte 0x0f, 0x38, 0x00
+ MODRM 0xc0 pshufb_opd1 pshufb_opd2
+ .endm
+
+ .macro PCLMULQDQ imm8 xmm1 xmm2
+ XMM_NUM clmul_opd1 \xmm1
+ XMM_NUM clmul_opd2 \xmm2
+ PFX_OPD_SIZE
+ PFX_REX clmul_opd1 clmul_opd2
+ .byte 0x0f, 0x3a, 0x44
+ MODRM 0xc0 clmul_opd1 clmul_opd2
+ .byte \imm8
+ .endm
+
+ .macro PEXTRD imm8 xmm gpr
+ R32_NUM extrd_opd1 \gpr
+ XMM_NUM extrd_opd2 \xmm
+ PFX_OPD_SIZE
+ PFX_REX extrd_opd1 extrd_opd2
+ .byte 0x0f, 0x3a, 0x16
+ MODRM 0xc0 extrd_opd1 extrd_opd2
+ .byte \imm8
+ .endm
+
+ .macro AESKEYGENASSIST rcon xmm1 xmm2
+ XMM_NUM aeskeygen_opd1 \xmm1
+ XMM_NUM aeskeygen_opd2 \xmm2
+ PFX_OPD_SIZE
+ PFX_REX aeskeygen_opd1 aeskeygen_opd2
+ .byte 0x0f, 0x3a, 0xdf
+ MODRM 0xc0 aeskeygen_opd1 aeskeygen_opd2
+ .byte \rcon
+ .endm
+
+ .macro AESIMC xmm1 xmm2
+ XMM_NUM aesimc_opd1 \xmm1
+ XMM_NUM aesimc_opd2 \xmm2
+ PFX_OPD_SIZE
+ PFX_REX aesimc_opd1 aesimc_opd2
+ .byte 0x0f, 0x38, 0xdb
+ MODRM 0xc0 aesimc_opd1 aesimc_opd2
+ .endm
+
+ .macro AESENC xmm1 xmm2
+ XMM_NUM aesenc_opd1 \xmm1
+ XMM_NUM aesenc_opd2 \xmm2
+ PFX_OPD_SIZE
+ PFX_REX aesenc_opd1 aesenc_opd2
+ .byte 0x0f, 0x38, 0xdc
+ MODRM 0xc0 aesenc_opd1 aesenc_opd2
+ .endm
+
+ .macro AESENCLAST xmm1 xmm2
+ XMM_NUM aesenclast_opd1 \xmm1
+ XMM_NUM aesenclast_opd2 \xmm2
+ PFX_OPD_SIZE
+ PFX_REX aesenclast_opd1 aesenclast_opd2
+ .byte 0x0f, 0x38, 0xdd
+ MODRM 0xc0 aesenclast_opd1 aesenclast_opd2
+ .endm
+
+ .macro AESDEC xmm1 xmm2
+ XMM_NUM aesdec_opd1 \xmm1
+ XMM_NUM aesdec_opd2 \xmm2
+ PFX_OPD_SIZE
+ PFX_REX aesdec_opd1 aesdec_opd2
+ .byte 0x0f, 0x38, 0xde
+ MODRM 0xc0 aesdec_opd1 aesdec_opd2
+ .endm
+
+ .macro AESDECLAST xmm1 xmm2
+ XMM_NUM aesdeclast_opd1 \xmm1
+ XMM_NUM aesdeclast_opd2 \xmm2
+ PFX_OPD_SIZE
+ PFX_REX aesdeclast_opd1 aesdeclast_opd2
+ .byte 0x0f, 0x38, 0xdf
+ MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2
+ .endm
+
+ .macro MOVQ_R64_XMM opd1 opd2
+ REG_TYPE movq_r64_xmm_opd1_type \opd1
+ .if movq_r64_xmm_opd1_type == REG_TYPE_XMM
+ XMM_NUM movq_r64_xmm_opd1 \opd1
+ R64_NUM movq_r64_xmm_opd2 \opd2
+ .else
+ R64_NUM movq_r64_xmm_opd1 \opd1
+ XMM_NUM movq_r64_xmm_opd2 \opd2
+ .endif
+ PFX_OPD_SIZE
+ PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1
+ .if movq_r64_xmm_opd1_type == REG_TYPE_XMM
+ .byte 0x0f, 0x7e
+ .else
+ .byte 0x0f, 0x6e
+ .endif
+ MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2
+ .endm
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
new file mode 100644
index 000000000..db06f1662
--- /dev/null
+++ b/arch/x86/include/asm/intel-family.h
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INTEL_FAMILY_H
+#define _ASM_X86_INTEL_FAMILY_H
+
+/*
+ * "Big Core" Processors (Branded as Core, Xeon, etc...)
+ *
+ * The "_X" parts are generally the EP and EX Xeons, or the
+ * "Extreme" ones, like Broadwell-E, or Atom microserver.
+ *
+ * While adding a new CPUID for a new microarchitecture, add a new
+ * group to keep logically sorted out in chronological order. Within
+ * that group keep the CPUID for the variants sorted by model number.
+ */
+
+#define INTEL_FAM6_CORE_YONAH 0x0E
+
+#define INTEL_FAM6_CORE2_MEROM 0x0F
+#define INTEL_FAM6_CORE2_MEROM_L 0x16
+#define INTEL_FAM6_CORE2_PENRYN 0x17
+#define INTEL_FAM6_CORE2_DUNNINGTON 0x1D
+
+#define INTEL_FAM6_NEHALEM 0x1E
+#define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */
+#define INTEL_FAM6_NEHALEM_EP 0x1A
+#define INTEL_FAM6_NEHALEM_EX 0x2E
+
+#define INTEL_FAM6_WESTMERE 0x25
+#define INTEL_FAM6_WESTMERE_EP 0x2C
+#define INTEL_FAM6_WESTMERE_EX 0x2F
+
+#define INTEL_FAM6_SANDYBRIDGE 0x2A
+#define INTEL_FAM6_SANDYBRIDGE_X 0x2D
+#define INTEL_FAM6_IVYBRIDGE 0x3A
+#define INTEL_FAM6_IVYBRIDGE_X 0x3E
+
+#define INTEL_FAM6_HASWELL_CORE 0x3C
+#define INTEL_FAM6_HASWELL_X 0x3F
+#define INTEL_FAM6_HASWELL_ULT 0x45
+#define INTEL_FAM6_HASWELL_GT3E 0x46
+
+#define INTEL_FAM6_BROADWELL_CORE 0x3D
+#define INTEL_FAM6_BROADWELL_GT3E 0x47
+#define INTEL_FAM6_BROADWELL_X 0x4F
+#define INTEL_FAM6_BROADWELL_XEON_D 0x56
+
+#define INTEL_FAM6_SKYLAKE_MOBILE 0x4E
+#define INTEL_FAM6_SKYLAKE_DESKTOP 0x5E
+#define INTEL_FAM6_SKYLAKE_X 0x55
+#define INTEL_FAM6_KABYLAKE_MOBILE 0x8E
+#define INTEL_FAM6_KABYLAKE_DESKTOP 0x9E
+
+#define INTEL_FAM6_CANNONLAKE_MOBILE 0x66
+
+#define INTEL_FAM6_ICELAKE_X 0x6A
+#define INTEL_FAM6_ICELAKE_XEON_D 0x6C
+#define INTEL_FAM6_ICELAKE_DESKTOP 0x7D
+#define INTEL_FAM6_ICELAKE_MOBILE 0x7E
+#define INTEL_FAM6_ICELAKE_NNPI 0x9D
+
+#define INTEL_FAM6_TIGERLAKE_L 0x8C
+#define INTEL_FAM6_TIGERLAKE 0x8D
+
+#define INTEL_FAM6_COMETLAKE 0xA5
+#define INTEL_FAM6_COMETLAKE_L 0xA6
+
+#define INTEL_FAM6_ROCKETLAKE 0xA7
+
+/* Hybrid Core/Atom Processors */
+
+#define INTEL_FAM6_LAKEFIELD 0x8A
+#define INTEL_FAM6_ALDERLAKE 0x97
+#define INTEL_FAM6_ALDERLAKE_L 0x9A
+
+/* "Small Core" Processors (Atom) */
+
+#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */
+#define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */
+
+#define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */
+#define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */
+#define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */
+
+#define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */
+#define INTEL_FAM6_ATOM_SILVERMONT_X 0x4D /* Avaton, Rangely */
+#define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merriefield */
+
+#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */
+#define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */
+
+#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */
+#define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */
+#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */
+
+#define INTEL_FAM6_ATOM_TREMONT_X 0x86 /* Jacobsville */
+#define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */
+#define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */
+
+/* Xeon Phi */
+
+#define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */
+#define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */
+
+/* Useful macros */
+#define INTEL_CPU_FAM_ANY(_family, _model, _driver_data) \
+{ \
+ .vendor = X86_VENDOR_INTEL, \
+ .family = _family, \
+ .model = _model, \
+ .feature = X86_FEATURE_ANY, \
+ .driver_data = (kernel_ulong_t)&_driver_data \
+}
+
+#define INTEL_CPU_FAM6(_model, _driver_data) \
+ INTEL_CPU_FAM_ANY(6, INTEL_FAM6_##_model, _driver_data)
+
+#endif /* _ASM_X86_INTEL_FAMILY_H */
diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h
new file mode 100644
index 000000000..52f815a80
--- /dev/null
+++ b/arch/x86/include/asm/intel-mid.h
@@ -0,0 +1,134 @@
+/*
+ * intel-mid.h: Intel MID specific setup code
+ *
+ * (C) Copyright 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#ifndef _ASM_X86_INTEL_MID_H
+#define _ASM_X86_INTEL_MID_H
+
+#include <linux/sfi.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+
+extern int intel_mid_pci_init(void);
+extern int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state);
+extern pci_power_t intel_mid_pci_get_power_state(struct pci_dev *pdev);
+
+extern void intel_mid_pwr_power_off(void);
+
+#define INTEL_MID_PWR_LSS_OFFSET 4
+#define INTEL_MID_PWR_LSS_TYPE (1 << 7)
+
+extern int intel_mid_pwr_get_lss_id(struct pci_dev *pdev);
+
+extern int get_gpio_by_name(const char *name);
+extern int __init sfi_parse_mrtc(struct sfi_table_header *table);
+extern int __init sfi_parse_mtmr(struct sfi_table_header *table);
+extern int sfi_mrtc_num;
+extern struct sfi_rtc_table_entry sfi_mrtc_array[];
+
+/*
+ * Here defines the array of devices platform data that IAFW would export
+ * through SFI "DEVS" table, we use name and type to match the device and
+ * its platform data.
+ */
+struct devs_id {
+ char name[SFI_NAME_LEN + 1];
+ u8 type;
+ u8 delay;
+ u8 msic;
+ void *(*get_platform_data)(void *info);
+};
+
+#define sfi_device(i) \
+ static const struct devs_id *const __intel_mid_sfi_##i##_dev __used \
+ __attribute__((__section__(".x86_intel_mid_dev.init"))) = &i
+
+/**
+* struct mid_sd_board_info - template for SD device creation
+* @name: identifies the driver
+* @bus_num: board-specific identifier for a given SD controller
+* @max_clk: the maximum frequency device supports
+* @platform_data: the particular data stored there is driver-specific
+*/
+struct mid_sd_board_info {
+ char name[SFI_NAME_LEN];
+ int bus_num;
+ unsigned short addr;
+ u32 max_clk;
+ void *platform_data;
+};
+
+/*
+ * Medfield is the follow-up of Moorestown, it combines two chip solution into
+ * one. Other than that it also added always-on and constant tsc and lapic
+ * timers. Medfield is the platform name, and the chip name is called Penwell
+ * we treat Medfield/Penwell as a variant of Moorestown. Penwell can be
+ * identified via MSRs.
+ */
+enum intel_mid_cpu_type {
+ /* 1 was Moorestown */
+ INTEL_MID_CPU_CHIP_PENWELL = 2,
+ INTEL_MID_CPU_CHIP_CLOVERVIEW,
+ INTEL_MID_CPU_CHIP_TANGIER,
+};
+
+extern enum intel_mid_cpu_type __intel_mid_cpu_chip;
+
+#ifdef CONFIG_X86_INTEL_MID
+
+static inline enum intel_mid_cpu_type intel_mid_identify_cpu(void)
+{
+ return __intel_mid_cpu_chip;
+}
+
+static inline bool intel_mid_has_msic(void)
+{
+ return (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_PENWELL);
+}
+
+#else /* !CONFIG_X86_INTEL_MID */
+
+#define intel_mid_identify_cpu() 0
+#define intel_mid_has_msic() 0
+
+#endif /* !CONFIG_X86_INTEL_MID */
+
+enum intel_mid_timer_options {
+ INTEL_MID_TIMER_DEFAULT,
+ INTEL_MID_TIMER_APBT_ONLY,
+ INTEL_MID_TIMER_LAPIC_APBT,
+};
+
+extern enum intel_mid_timer_options intel_mid_timer_options;
+
+/* Bus Select SoC Fuse value */
+#define BSEL_SOC_FUSE_MASK 0x7
+/* FSB 133MHz */
+#define BSEL_SOC_FUSE_001 0x1
+/* FSB 100MHz */
+#define BSEL_SOC_FUSE_101 0x5
+/* FSB 83MHz */
+#define BSEL_SOC_FUSE_111 0x7
+
+#define SFI_MTMR_MAX_NUM 8
+#define SFI_MRTC_MAX 8
+
+extern void intel_scu_devices_create(void);
+extern void intel_scu_devices_destroy(void);
+
+/* VRTC timer */
+#define MRST_VRTC_MAP_SZ 1024
+/* #define MRST_VRTC_PGOFFSET 0xc00 */
+
+extern void intel_mid_rtc_init(void);
+
+/* The offset for the mapping of global gpio pin to irq */
+#define INTEL_MID_IRQ_OFFSET 0x100
+
+#endif /* _ASM_X86_INTEL_MID_H */
diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h
new file mode 100644
index 000000000..ae26df1c2
--- /dev/null
+++ b/arch/x86/include/asm/intel_ds.h
@@ -0,0 +1,37 @@
+#ifndef _ASM_INTEL_DS_H
+#define _ASM_INTEL_DS_H
+
+#include <linux/percpu-defs.h>
+
+#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
+#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS 8
+#define MAX_FIXED_PEBS_EVENTS 3
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+ u64 bts_buffer_base;
+ u64 bts_index;
+ u64 bts_absolute_maximum;
+ u64 bts_interrupt_threshold;
+ u64 pebs_buffer_base;
+ u64 pebs_index;
+ u64 pebs_absolute_maximum;
+ u64 pebs_interrupt_threshold;
+ u64 pebs_event_reset[MAX_PEBS_EVENTS + MAX_FIXED_PEBS_EVENTS];
+} __aligned(PAGE_SIZE);
+
+DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
+
+struct debug_store_buffers {
+ char bts_buffer[BTS_BUFFER_SIZE];
+ char pebs_buffer[PEBS_BUFFER_SIZE];
+};
+
+#endif
diff --git a/arch/x86/include/asm/intel_mid_vrtc.h b/arch/x86/include/asm/intel_mid_vrtc.h
new file mode 100644
index 000000000..0b44b1abe
--- /dev/null
+++ b/arch/x86/include/asm/intel_mid_vrtc.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _INTEL_MID_VRTC_H
+#define _INTEL_MID_VRTC_H
+
+extern unsigned char vrtc_cmos_read(unsigned char reg);
+extern void vrtc_cmos_write(unsigned char val, unsigned char reg);
+extern void vrtc_get_time(struct timespec64 *now);
+extern int vrtc_set_mmss(const struct timespec64 *now);
+
+#endif
diff --git a/arch/x86/include/asm/intel_pconfig.h b/arch/x86/include/asm/intel_pconfig.h
new file mode 100644
index 000000000..3cb002b1d
--- /dev/null
+++ b/arch/x86/include/asm/intel_pconfig.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_X86_INTEL_PCONFIG_H
+#define _ASM_X86_INTEL_PCONFIG_H
+
+#include <asm/asm.h>
+#include <asm/processor.h>
+
+enum pconfig_target {
+ INVALID_TARGET = 0,
+ MKTME_TARGET = 1,
+ PCONFIG_TARGET_NR
+};
+
+int pconfig_target_supported(enum pconfig_target target);
+
+enum pconfig_leaf {
+ MKTME_KEY_PROGRAM = 0,
+ PCONFIG_LEAF_INVALID,
+};
+
+#define PCONFIG ".byte 0x0f, 0x01, 0xc5"
+
+/* Defines and structure for MKTME_KEY_PROGRAM of PCONFIG instruction */
+
+/* mktme_key_program::keyid_ctrl COMMAND, bits [7:0] */
+#define MKTME_KEYID_SET_KEY_DIRECT 0
+#define MKTME_KEYID_SET_KEY_RANDOM 1
+#define MKTME_KEYID_CLEAR_KEY 2
+#define MKTME_KEYID_NO_ENCRYPT 3
+
+/* mktme_key_program::keyid_ctrl ENC_ALG, bits [23:8] */
+#define MKTME_AES_XTS_128 (1 << 8)
+
+/* Return codes from the PCONFIG MKTME_KEY_PROGRAM */
+#define MKTME_PROG_SUCCESS 0
+#define MKTME_INVALID_PROG_CMD 1
+#define MKTME_ENTROPY_ERROR 2
+#define MKTME_INVALID_KEYID 3
+#define MKTME_INVALID_ENC_ALG 4
+#define MKTME_DEVICE_BUSY 5
+
+/* Hardware requires the structure to be 256 byte alinged. Otherwise #GP(0). */
+struct mktme_key_program {
+ u16 keyid;
+ u32 keyid_ctrl;
+ u8 __rsvd[58];
+ u8 key_field_1[64];
+ u8 key_field_2[64];
+} __packed __aligned(256);
+
+static inline int mktme_key_program(struct mktme_key_program *key_program)
+{
+ unsigned long rax = MKTME_KEY_PROGRAM;
+
+ if (!pconfig_target_supported(MKTME_TARGET))
+ return -ENXIO;
+
+ asm volatile(PCONFIG
+ : "=a" (rax), "=b" (key_program)
+ : "0" (rax), "1" (key_program)
+ : "memory", "cc");
+
+ return rax;
+}
+
+#endif /* _ASM_X86_INTEL_PCONFIG_H */
diff --git a/arch/x86/include/asm/intel_pmc_ipc.h b/arch/x86/include/asm/intel_pmc_ipc.h
new file mode 100644
index 000000000..9e7adcdbe
--- /dev/null
+++ b/arch/x86/include/asm/intel_pmc_ipc.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INTEL_PMC_IPC_H_
+#define _ASM_X86_INTEL_PMC_IPC_H_
+
+/* Commands */
+#define PMC_IPC_PMIC_ACCESS 0xFF
+#define PMC_IPC_PMIC_ACCESS_READ 0x0
+#define PMC_IPC_PMIC_ACCESS_WRITE 0x1
+#define PMC_IPC_USB_PWR_CTRL 0xF0
+#define PMC_IPC_PMIC_BLACKLIST_SEL 0xEF
+#define PMC_IPC_PHY_CONFIG 0xEE
+#define PMC_IPC_NORTHPEAK_CTRL 0xED
+#define PMC_IPC_PM_DEBUG 0xEC
+#define PMC_IPC_PMC_TELEMTRY 0xEB
+#define PMC_IPC_PMC_FW_MSG_CTRL 0xEA
+
+/* IPC return code */
+#define IPC_ERR_NONE 0
+#define IPC_ERR_CMD_NOT_SUPPORTED 1
+#define IPC_ERR_CMD_NOT_SERVICED 2
+#define IPC_ERR_UNABLE_TO_SERVICE 3
+#define IPC_ERR_CMD_INVALID 4
+#define IPC_ERR_CMD_FAILED 5
+#define IPC_ERR_EMSECURITY 6
+#define IPC_ERR_UNSIGNEDKERNEL 7
+
+/* GCR reg offsets from gcr base*/
+#define PMC_GCR_PMC_CFG_REG 0x08
+#define PMC_GCR_TELEM_DEEP_S0IX_REG 0x78
+#define PMC_GCR_TELEM_SHLW_S0IX_REG 0x80
+
+#if IS_ENABLED(CONFIG_INTEL_PMC_IPC)
+
+int intel_pmc_ipc_simple_command(int cmd, int sub);
+int intel_pmc_ipc_raw_cmd(u32 cmd, u32 sub, u8 *in, u32 inlen,
+ u32 *out, u32 outlen, u32 dptr, u32 sptr);
+int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen,
+ u32 *out, u32 outlen);
+int intel_pmc_s0ix_counter_read(u64 *data);
+int intel_pmc_gcr_read(u32 offset, u32 *data);
+int intel_pmc_gcr_read64(u32 offset, u64 *data);
+int intel_pmc_gcr_write(u32 offset, u32 data);
+int intel_pmc_gcr_update(u32 offset, u32 mask, u32 val);
+
+#else
+
+static inline int intel_pmc_ipc_simple_command(int cmd, int sub)
+{
+ return -EINVAL;
+}
+
+static inline int intel_pmc_ipc_raw_cmd(u32 cmd, u32 sub, u8 *in, u32 inlen,
+ u32 *out, u32 outlen, u32 dptr, u32 sptr)
+{
+ return -EINVAL;
+}
+
+static inline int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen,
+ u32 *out, u32 outlen)
+{
+ return -EINVAL;
+}
+
+static inline int intel_pmc_s0ix_counter_read(u64 *data)
+{
+ return -EINVAL;
+}
+
+static inline int intel_pmc_gcr_read(u32 offset, u32 *data)
+{
+ return -EINVAL;
+}
+
+static inline int intel_pmc_gcr_read64(u32 offset, u64 *data)
+{
+ return -EINVAL;
+}
+
+static inline int intel_pmc_gcr_write(u32 offset, u32 data)
+{
+ return -EINVAL;
+}
+
+static inline int intel_pmc_gcr_update(u32 offset, u32 mask, u32 val)
+{
+ return -EINVAL;
+}
+
+#endif /*CONFIG_INTEL_PMC_IPC*/
+
+#endif
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
new file mode 100644
index 000000000..b523f51c5
--- /dev/null
+++ b/arch/x86/include/asm/intel_pt.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INTEL_PT_H
+#define _ASM_X86_INTEL_PT_H
+
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
+void cpu_emergency_stop_pt(void);
+#else
+static inline void cpu_emergency_stop_pt(void) {}
+#endif
+
+#endif /* _ASM_X86_INTEL_PT_H */
diff --git a/arch/x86/include/asm/intel_punit_ipc.h b/arch/x86/include/asm/intel_punit_ipc.h
new file mode 100644
index 000000000..ce16da719
--- /dev/null
+++ b/arch/x86/include/asm/intel_punit_ipc.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INTEL_PUNIT_IPC_H_
+#define _ASM_X86_INTEL_PUNIT_IPC_H_
+
+/*
+ * Three types of 8bit P-Unit IPC commands are supported,
+ * bit[7:6]: [00]: BIOS; [01]: GTD; [10]: ISPD.
+ */
+typedef enum {
+ BIOS_IPC = 0,
+ GTDRIVER_IPC,
+ ISPDRIVER_IPC,
+ RESERVED_IPC,
+} IPC_TYPE;
+
+#define IPC_TYPE_OFFSET 6
+#define IPC_PUNIT_BIOS_CMD_BASE (BIOS_IPC << IPC_TYPE_OFFSET)
+#define IPC_PUNIT_GTD_CMD_BASE (GTDDRIVER_IPC << IPC_TYPE_OFFSET)
+#define IPC_PUNIT_ISPD_CMD_BASE (ISPDRIVER_IPC << IPC_TYPE_OFFSET)
+#define IPC_PUNIT_CMD_TYPE_MASK (RESERVED_IPC << IPC_TYPE_OFFSET)
+
+/* BIOS => Pcode commands */
+#define IPC_PUNIT_BIOS_ZERO (IPC_PUNIT_BIOS_CMD_BASE | 0x00)
+#define IPC_PUNIT_BIOS_VR_INTERFACE (IPC_PUNIT_BIOS_CMD_BASE | 0x01)
+#define IPC_PUNIT_BIOS_READ_PCS (IPC_PUNIT_BIOS_CMD_BASE | 0x02)
+#define IPC_PUNIT_BIOS_WRITE_PCS (IPC_PUNIT_BIOS_CMD_BASE | 0x03)
+#define IPC_PUNIT_BIOS_READ_PCU_CONFIG (IPC_PUNIT_BIOS_CMD_BASE | 0x04)
+#define IPC_PUNIT_BIOS_WRITE_PCU_CONFIG (IPC_PUNIT_BIOS_CMD_BASE | 0x05)
+#define IPC_PUNIT_BIOS_READ_PL1_SETTING (IPC_PUNIT_BIOS_CMD_BASE | 0x06)
+#define IPC_PUNIT_BIOS_WRITE_PL1_SETTING (IPC_PUNIT_BIOS_CMD_BASE | 0x07)
+#define IPC_PUNIT_BIOS_TRIGGER_VDD_RAM (IPC_PUNIT_BIOS_CMD_BASE | 0x08)
+#define IPC_PUNIT_BIOS_READ_TELE_INFO (IPC_PUNIT_BIOS_CMD_BASE | 0x09)
+#define IPC_PUNIT_BIOS_READ_TELE_TRACE_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0a)
+#define IPC_PUNIT_BIOS_WRITE_TELE_TRACE_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0b)
+#define IPC_PUNIT_BIOS_READ_TELE_EVENT_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0c)
+#define IPC_PUNIT_BIOS_WRITE_TELE_EVENT_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0d)
+#define IPC_PUNIT_BIOS_READ_TELE_TRACE (IPC_PUNIT_BIOS_CMD_BASE | 0x0e)
+#define IPC_PUNIT_BIOS_WRITE_TELE_TRACE (IPC_PUNIT_BIOS_CMD_BASE | 0x0f)
+#define IPC_PUNIT_BIOS_READ_TELE_EVENT (IPC_PUNIT_BIOS_CMD_BASE | 0x10)
+#define IPC_PUNIT_BIOS_WRITE_TELE_EVENT (IPC_PUNIT_BIOS_CMD_BASE | 0x11)
+#define IPC_PUNIT_BIOS_READ_MODULE_TEMP (IPC_PUNIT_BIOS_CMD_BASE | 0x12)
+#define IPC_PUNIT_BIOS_RESERVED (IPC_PUNIT_BIOS_CMD_BASE | 0x13)
+#define IPC_PUNIT_BIOS_READ_VOLTAGE_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x14)
+#define IPC_PUNIT_BIOS_WRITE_VOLTAGE_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x15)
+#define IPC_PUNIT_BIOS_READ_RATIO_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x16)
+#define IPC_PUNIT_BIOS_WRITE_RATIO_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x17)
+#define IPC_PUNIT_BIOS_READ_VF_GL_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x18)
+#define IPC_PUNIT_BIOS_WRITE_VF_GL_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x19)
+#define IPC_PUNIT_BIOS_READ_FM_SOC_TEMP_THRESH (IPC_PUNIT_BIOS_CMD_BASE | 0x1a)
+#define IPC_PUNIT_BIOS_WRITE_FM_SOC_TEMP_THRESH (IPC_PUNIT_BIOS_CMD_BASE | 0x1b)
+
+/* GT Driver => Pcode commands */
+#define IPC_PUNIT_GTD_ZERO (IPC_PUNIT_GTD_CMD_BASE | 0x00)
+#define IPC_PUNIT_GTD_CONFIG (IPC_PUNIT_GTD_CMD_BASE | 0x01)
+#define IPC_PUNIT_GTD_READ_ICCP_LIC_CDYN_SCAL (IPC_PUNIT_GTD_CMD_BASE | 0x02)
+#define IPC_PUNIT_GTD_WRITE_ICCP_LIC_CDYN_SCAL (IPC_PUNIT_GTD_CMD_BASE | 0x03)
+#define IPC_PUNIT_GTD_GET_WM_VAL (IPC_PUNIT_GTD_CMD_BASE | 0x06)
+#define IPC_PUNIT_GTD_WRITE_CONFIG_WISHREQ (IPC_PUNIT_GTD_CMD_BASE | 0x07)
+#define IPC_PUNIT_GTD_READ_REQ_DUTY_CYCLE (IPC_PUNIT_GTD_CMD_BASE | 0x16)
+#define IPC_PUNIT_GTD_DIS_VOL_FREQ_CHG_REQUEST (IPC_PUNIT_GTD_CMD_BASE | 0x17)
+#define IPC_PUNIT_GTD_DYNA_DUTY_CYCLE_CTRL (IPC_PUNIT_GTD_CMD_BASE | 0x1a)
+#define IPC_PUNIT_GTD_DYNA_DUTY_CYCLE_TUNING (IPC_PUNIT_GTD_CMD_BASE | 0x1c)
+
+/* ISP Driver => Pcode commands */
+#define IPC_PUNIT_ISPD_ZERO (IPC_PUNIT_ISPD_CMD_BASE | 0x00)
+#define IPC_PUNIT_ISPD_CONFIG (IPC_PUNIT_ISPD_CMD_BASE | 0x01)
+#define IPC_PUNIT_ISPD_GET_ISP_LTR_VAL (IPC_PUNIT_ISPD_CMD_BASE | 0x02)
+#define IPC_PUNIT_ISPD_ACCESS_IU_FREQ_BOUNDS (IPC_PUNIT_ISPD_CMD_BASE | 0x03)
+#define IPC_PUNIT_ISPD_READ_CDYN_LEVEL (IPC_PUNIT_ISPD_CMD_BASE | 0x04)
+#define IPC_PUNIT_ISPD_WRITE_CDYN_LEVEL (IPC_PUNIT_ISPD_CMD_BASE | 0x05)
+
+/* Error codes */
+#define IPC_PUNIT_ERR_SUCCESS 0
+#define IPC_PUNIT_ERR_INVALID_CMD 1
+#define IPC_PUNIT_ERR_INVALID_PARAMETER 2
+#define IPC_PUNIT_ERR_CMD_TIMEOUT 3
+#define IPC_PUNIT_ERR_CMD_LOCKED 4
+#define IPC_PUNIT_ERR_INVALID_VR_ID 5
+#define IPC_PUNIT_ERR_VR_ERR 6
+
+#if IS_ENABLED(CONFIG_INTEL_PUNIT_IPC)
+
+int intel_punit_ipc_simple_command(int cmd, int para1, int para2);
+int intel_punit_ipc_command(u32 cmd, u32 para1, u32 para2, u32 *in, u32 *out);
+
+#else
+
+static inline int intel_punit_ipc_simple_command(int cmd,
+ int para1, int para2)
+{
+ return -ENODEV;
+}
+
+static inline int intel_punit_ipc_command(u32 cmd, u32 para1, u32 para2,
+ u32 *in, u32 *out)
+{
+ return -ENODEV;
+}
+
+#endif /* CONFIG_INTEL_PUNIT_IPC */
+
+#endif
diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h
new file mode 100644
index 000000000..9acb06b6f
--- /dev/null
+++ b/arch/x86/include/asm/intel_rdt_sched.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INTEL_RDT_SCHED_H
+#define _ASM_X86_INTEL_RDT_SCHED_H
+
+#ifdef CONFIG_INTEL_RDT
+
+#include <linux/sched.h>
+#include <linux/jump_label.h>
+
+#define IA32_PQR_ASSOC 0x0c8f
+
+/**
+ * struct intel_pqr_state - State cache for the PQR MSR
+ * @cur_rmid: The cached Resource Monitoring ID
+ * @cur_closid: The cached Class Of Service ID
+ * @default_rmid: The user assigned Resource Monitoring ID
+ * @default_closid: The user assigned cached Class Of Service ID
+ *
+ * The upper 32 bits of IA32_PQR_ASSOC contain closid and the
+ * lower 10 bits rmid. The update to IA32_PQR_ASSOC always
+ * contains both parts, so we need to cache them. This also
+ * stores the user configured per cpu CLOSID and RMID.
+ *
+ * The cache also helps to avoid pointless updates if the value does
+ * not change.
+ */
+struct intel_pqr_state {
+ u32 cur_rmid;
+ u32 cur_closid;
+ u32 default_rmid;
+ u32 default_closid;
+};
+
+DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);
+
+DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
+DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
+
+/*
+ * __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR
+ *
+ * Following considerations are made so that this has minimal impact
+ * on scheduler hot path:
+ * - This will stay as no-op unless we are running on an Intel SKU
+ * which supports resource control or monitoring and we enable by
+ * mounting the resctrl file system.
+ * - Caches the per cpu CLOSid/RMID values and does the MSR write only
+ * when a task with a different CLOSid/RMID is scheduled in.
+ * - We allocate RMIDs/CLOSids globally in order to keep this as
+ * simple as possible.
+ * Must be called with preemption disabled.
+ */
+static void __intel_rdt_sched_in(void)
+{
+ struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+ u32 closid = state->default_closid;
+ u32 rmid = state->default_rmid;
+
+ /*
+ * If this task has a closid/rmid assigned, use it.
+ * Else use the closid/rmid assigned to this cpu.
+ */
+ if (static_branch_likely(&rdt_alloc_enable_key)) {
+ if (current->closid)
+ closid = current->closid;
+ }
+
+ if (static_branch_likely(&rdt_mon_enable_key)) {
+ if (current->rmid)
+ rmid = current->rmid;
+ }
+
+ if (closid != state->cur_closid || rmid != state->cur_rmid) {
+ state->cur_closid = closid;
+ state->cur_rmid = rmid;
+ wrmsr(IA32_PQR_ASSOC, rmid, closid);
+ }
+}
+
+static inline void intel_rdt_sched_in(void)
+{
+ if (static_branch_likely(&rdt_enable_key))
+ __intel_rdt_sched_in();
+}
+
+#else
+
+static inline void intel_rdt_sched_in(void) {}
+
+#endif /* CONFIG_INTEL_RDT */
+
+#endif /* _ASM_X86_INTEL_RDT_SCHED_H */
diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h
new file mode 100644
index 000000000..4a8c6e817
--- /dev/null
+++ b/arch/x86/include/asm/intel_scu_ipc.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INTEL_SCU_IPC_H_
+#define _ASM_X86_INTEL_SCU_IPC_H_
+
+#include <linux/notifier.h>
+
+#define IPCMSG_INDIRECT_READ 0x02
+#define IPCMSG_INDIRECT_WRITE 0x05
+
+#define IPCMSG_COLD_OFF 0x80 /* Only for Tangier */
+
+#define IPCMSG_WARM_RESET 0xF0
+#define IPCMSG_COLD_RESET 0xF1
+#define IPCMSG_SOFT_RESET 0xF2
+#define IPCMSG_COLD_BOOT 0xF3
+
+#define IPCMSG_VRTC 0xFA /* Set vRTC device */
+ /* Command id associated with message IPCMSG_VRTC */
+ #define IPC_CMD_VRTC_SETTIME 1 /* Set time */
+ #define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */
+
+/* Read single register */
+int intel_scu_ipc_ioread8(u16 addr, u8 *data);
+
+/* Read two sequential registers */
+int intel_scu_ipc_ioread16(u16 addr, u16 *data);
+
+/* Read four sequential registers */
+int intel_scu_ipc_ioread32(u16 addr, u32 *data);
+
+/* Read a vector */
+int intel_scu_ipc_readv(u16 *addr, u8 *data, int len);
+
+/* Write single register */
+int intel_scu_ipc_iowrite8(u16 addr, u8 data);
+
+/* Write two sequential registers */
+int intel_scu_ipc_iowrite16(u16 addr, u16 data);
+
+/* Write four sequential registers */
+int intel_scu_ipc_iowrite32(u16 addr, u32 data);
+
+/* Write a vector */
+int intel_scu_ipc_writev(u16 *addr, u8 *data, int len);
+
+/* Update single register based on the mask */
+int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask);
+
+/* Issue commands to the SCU with or without data */
+int intel_scu_ipc_simple_command(int cmd, int sub);
+int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen,
+ u32 *out, int outlen);
+int intel_scu_ipc_raw_command(int cmd, int sub, u8 *in, int inlen,
+ u32 *out, int outlen, u32 dptr, u32 sptr);
+
+/* I2C control api */
+int intel_scu_ipc_i2c_cntrl(u32 addr, u32 *data);
+
+/* Update FW version */
+int intel_scu_ipc_fw_update(u8 *buffer, u32 length);
+
+extern struct blocking_notifier_head intel_scu_notifier;
+
+static inline void intel_scu_notifier_add(struct notifier_block *nb)
+{
+ blocking_notifier_chain_register(&intel_scu_notifier, nb);
+}
+
+static inline void intel_scu_notifier_remove(struct notifier_block *nb)
+{
+ blocking_notifier_chain_unregister(&intel_scu_notifier, nb);
+}
+
+static inline int intel_scu_notifier_post(unsigned long v, void *p)
+{
+ return blocking_notifier_call_chain(&intel_scu_notifier, v, p);
+}
+
+#define SCU_AVAILABLE 1
+#define SCU_DOWN 2
+
+#endif
diff --git a/arch/x86/include/asm/intel_telemetry.h b/arch/x86/include/asm/intel_telemetry.h
new file mode 100644
index 000000000..85029b58d
--- /dev/null
+++ b/arch/x86/include/asm/intel_telemetry.h
@@ -0,0 +1,147 @@
+/*
+ * Intel SOC Telemetry Driver Header File
+ * Copyright (C) 2015, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+#ifndef INTEL_TELEMETRY_H
+#define INTEL_TELEMETRY_H
+
+#define TELEM_MAX_EVENTS_SRAM 28
+#define TELEM_MAX_OS_ALLOCATED_EVENTS 20
+
+enum telemetry_unit {
+ TELEM_PSS = 0,
+ TELEM_IOSS,
+ TELEM_UNIT_NONE
+};
+
+struct telemetry_evtlog {
+ u32 telem_evtid;
+ u64 telem_evtlog;
+};
+
+struct telemetry_evtconfig {
+ /* Array of Event-IDs to Enable */
+ u32 *evtmap;
+
+ /* Number of Events (<29) in evtmap */
+ u8 num_evts;
+
+ /* Sampling period */
+ u8 period;
+};
+
+struct telemetry_evtmap {
+ const char *name;
+ u32 evt_id;
+};
+
+struct telemetry_unit_config {
+ struct telemetry_evtmap *telem_evts;
+ void __iomem *regmap;
+ u32 ssram_base_addr;
+ u8 ssram_evts_used;
+ u8 curr_period;
+ u8 max_period;
+ u8 min_period;
+ u32 ssram_size;
+
+};
+
+struct telemetry_plt_config {
+ struct telemetry_unit_config pss_config;
+ struct telemetry_unit_config ioss_config;
+ struct mutex telem_trace_lock;
+ struct mutex telem_lock;
+ bool telem_in_use;
+};
+
+struct telemetry_core_ops {
+ int (*get_sampling_period)(u8 *pss_min_period, u8 *pss_max_period,
+ u8 *ioss_min_period, u8 *ioss_max_period);
+
+ int (*get_eventconfig)(struct telemetry_evtconfig *pss_evtconfig,
+ struct telemetry_evtconfig *ioss_evtconfig,
+ int pss_len, int ioss_len);
+
+ int (*update_events)(struct telemetry_evtconfig pss_evtconfig,
+ struct telemetry_evtconfig ioss_evtconfig);
+
+ int (*set_sampling_period)(u8 pss_period, u8 ioss_period);
+
+ int (*get_trace_verbosity)(enum telemetry_unit telem_unit,
+ u32 *verbosity);
+
+ int (*set_trace_verbosity)(enum telemetry_unit telem_unit,
+ u32 verbosity);
+
+ int (*raw_read_eventlog)(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog,
+ int len, int log_all_evts);
+
+ int (*read_eventlog)(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog,
+ int len, int log_all_evts);
+
+ int (*add_events)(u8 num_pss_evts, u8 num_ioss_evts,
+ u32 *pss_evtmap, u32 *ioss_evtmap);
+
+ int (*reset_events)(void);
+};
+
+int telemetry_set_pltdata(const struct telemetry_core_ops *ops,
+ struct telemetry_plt_config *pltconfig);
+
+int telemetry_clear_pltdata(void);
+
+int telemetry_pltconfig_valid(void);
+
+int telemetry_get_evtname(enum telemetry_unit telem_unit,
+ const char **name, int len);
+
+int telemetry_update_events(struct telemetry_evtconfig pss_evtconfig,
+ struct telemetry_evtconfig ioss_evtconfig);
+
+int telemetry_add_events(u8 num_pss_evts, u8 num_ioss_evts,
+ u32 *pss_evtmap, u32 *ioss_evtmap);
+
+int telemetry_reset_events(void);
+
+int telemetry_get_eventconfig(struct telemetry_evtconfig *pss_config,
+ struct telemetry_evtconfig *ioss_config,
+ int pss_len, int ioss_len);
+
+int telemetry_read_events(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog, int len);
+
+int telemetry_raw_read_events(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog, int len);
+
+int telemetry_read_eventlog(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog, int len);
+
+int telemetry_raw_read_eventlog(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog, int len);
+
+int telemetry_get_sampling_period(u8 *pss_min_period, u8 *pss_max_period,
+ u8 *ioss_min_period, u8 *ioss_max_period);
+
+int telemetry_set_sampling_period(u8 pss_period, u8 ioss_period);
+
+int telemetry_set_trace_verbosity(enum telemetry_unit telem_unit,
+ u32 verbosity);
+
+int telemetry_get_trace_verbosity(enum telemetry_unit telem_unit,
+ u32 *verbosity);
+
+#endif /* INTEL_TELEMETRY_H */
diff --git a/arch/x86/include/asm/invpcid.h b/arch/x86/include/asm/invpcid.h
new file mode 100644
index 000000000..989cfa86d
--- /dev/null
+++ b/arch/x86/include/asm/invpcid.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INVPCID
+#define _ASM_X86_INVPCID
+
+static inline void __invpcid(unsigned long pcid, unsigned long addr,
+ unsigned long type)
+{
+ struct { u64 d[2]; } desc = { { pcid, addr } };
+
+ /*
+ * The memory clobber is because the whole point is to invalidate
+ * stale TLB entries and, especially if we're flushing global
+ * mappings, we don't want the compiler to reorder any subsequent
+ * memory accesses before the TLB flush.
+ *
+ * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
+ * invpcid (%rcx), %rax in long mode.
+ */
+ asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
+ : : "m" (desc), "a" (type), "c" (&desc) : "memory");
+}
+
+#define INVPCID_TYPE_INDIV_ADDR 0
+#define INVPCID_TYPE_SINGLE_CTXT 1
+#define INVPCID_TYPE_ALL_INCL_GLOBAL 2
+#define INVPCID_TYPE_ALL_NON_GLOBAL 3
+
+/* Flush all mappings for a given pcid and addr, not including globals. */
+static inline void invpcid_flush_one(unsigned long pcid,
+ unsigned long addr)
+{
+ __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invpcid_flush_single_context(unsigned long pcid)
+{
+ __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invpcid_flush_all(void)
+{
+ __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invpcid_flush_all_nonglobals(void)
+{
+ __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
+}
+
+#endif /* _ASM_X86_INVPCID */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
new file mode 100644
index 000000000..6de64840d
--- /dev/null
+++ b/arch/x86/include/asm/io.h
@@ -0,0 +1,413 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_IO_H
+#define _ASM_X86_IO_H
+
+/*
+ * This file contains the definitions for the x86 IO instructions
+ * inb/inw/inl/outb/outw/outl and the "string versions" of the same
+ * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
+ * versions of the single-IO instructions (inb_p/inw_p/..).
+ *
+ * This file is not meant to be obfuscating: it's just complicated
+ * to (a) handle it all in a way that makes gcc able to optimize it
+ * as well as possible and (b) trying to avoid writing the same thing
+ * over and over again with slight variations and possibly making a
+ * mistake somewhere.
+ */
+
+/*
+ * Thanks to James van Artsdalen for a better timing-fix than
+ * the two short jumps: using outb's to a nonexistent port seems
+ * to guarantee better timings even on fast machines.
+ *
+ * On the other hand, I'd like to be sure of a non-existent port:
+ * I feel a bit unsafe about using 0x80 (should be safe, though)
+ *
+ * Linus
+ */
+
+ /*
+ * Bit simplified and optimized by Jan Hubicka
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
+ *
+ * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
+ * isa_read[wl] and isa_write[wl] fixed
+ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ */
+
+#define ARCH_HAS_IOREMAP_WC
+#define ARCH_HAS_IOREMAP_WT
+
+#include <linux/string.h>
+#include <linux/compiler.h>
+#include <asm/page.h>
+#include <asm/early_ioremap.h>
+#include <asm/pgtable_types.h>
+
+#define build_mmio_read(name, size, type, reg, barrier) \
+static inline type name(const volatile void __iomem *addr) \
+{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
+:"m" (*(volatile type __force *)addr) barrier); return ret; }
+
+#define build_mmio_write(name, size, type, reg, barrier) \
+static inline void name(type val, volatile void __iomem *addr) \
+{ asm volatile("mov" size " %0,%1": :reg (val), \
+"m" (*(volatile type __force *)addr) barrier); }
+
+build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
+build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
+build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
+
+build_mmio_read(__readb, "b", unsigned char, "=q", )
+build_mmio_read(__readw, "w", unsigned short, "=r", )
+build_mmio_read(__readl, "l", unsigned int, "=r", )
+
+build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
+build_mmio_write(writew, "w", unsigned short, "r", :"memory")
+build_mmio_write(writel, "l", unsigned int, "r", :"memory")
+
+build_mmio_write(__writeb, "b", unsigned char, "q", )
+build_mmio_write(__writew, "w", unsigned short, "r", )
+build_mmio_write(__writel, "l", unsigned int, "r", )
+
+#define readb readb
+#define readw readw
+#define readl readl
+#define readb_relaxed(a) __readb(a)
+#define readw_relaxed(a) __readw(a)
+#define readl_relaxed(a) __readl(a)
+#define __raw_readb __readb
+#define __raw_readw __readw
+#define __raw_readl __readl
+
+#define writeb writeb
+#define writew writew
+#define writel writel
+#define writeb_relaxed(v, a) __writeb(v, a)
+#define writew_relaxed(v, a) __writew(v, a)
+#define writel_relaxed(v, a) __writel(v, a)
+#define __raw_writeb __writeb
+#define __raw_writew __writew
+#define __raw_writel __writel
+
+#define mmiowb() barrier()
+
+#ifdef CONFIG_X86_64
+
+build_mmio_read(readq, "q", u64, "=r", :"memory")
+build_mmio_read(__readq, "q", u64, "=r", )
+build_mmio_write(writeq, "q", u64, "r", :"memory")
+build_mmio_write(__writeq, "q", u64, "r", )
+
+#define readq_relaxed(a) __readq(a)
+#define writeq_relaxed(v, a) __writeq(v, a)
+
+#define __raw_readq __readq
+#define __raw_writeq __writeq
+
+/* Let people know that we have them */
+#define readq readq
+#define writeq writeq
+
+#endif
+
+#define ARCH_HAS_VALID_PHYS_ADDR_RANGE
+extern int valid_phys_addr_range(phys_addr_t addr, size_t size);
+extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t size);
+
+/**
+ * virt_to_phys - map virtual addresses to physical
+ * @address: address to remap
+ *
+ * The returned physical address is the physical (CPU) mapping for
+ * the memory address given. It is only valid to use this function on
+ * addresses directly mapped or allocated via kmalloc.
+ *
+ * This function does not give bus mappings for DMA transfers. In
+ * almost all conceivable cases a device driver should not be using
+ * this function
+ */
+
+static inline phys_addr_t virt_to_phys(volatile void *address)
+{
+ return __pa(address);
+}
+#define virt_to_phys virt_to_phys
+
+/**
+ * phys_to_virt - map physical address to virtual
+ * @address: address to remap
+ *
+ * The returned virtual address is a current CPU mapping for
+ * the memory address given. It is only valid to use this function on
+ * addresses that have a kernel mapping
+ *
+ * This function does not handle bus mappings for DMA transfers. In
+ * almost all conceivable cases a device driver should not be using
+ * this function
+ */
+
+static inline void *phys_to_virt(phys_addr_t address)
+{
+ return __va(address);
+}
+#define phys_to_virt phys_to_virt
+
+/*
+ * Change "struct page" to physical address.
+ */
+#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
+
+/*
+ * ISA I/O bus memory addresses are 1:1 with the physical address.
+ * However, we truncate the address to unsigned int to avoid undesirable
+ * promitions in legacy drivers.
+ */
+static inline unsigned int isa_virt_to_bus(volatile void *address)
+{
+ return (unsigned int)virt_to_phys(address);
+}
+#define isa_page_to_bus(page) ((unsigned int)page_to_phys(page))
+#define isa_bus_to_virt phys_to_virt
+
+/*
+ * However PCI ones are not necessarily 1:1 and therefore these interfaces
+ * are forbidden in portable PCI drivers.
+ *
+ * Allow them on x86 for legacy drivers, though.
+ */
+#define virt_to_bus virt_to_phys
+#define bus_to_virt phys_to_virt
+
+/*
+ * The default ioremap() behavior is non-cached; if you need something
+ * else, you probably want one of the following.
+ */
+extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+#define ioremap_nocache ioremap_nocache
+extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
+#define ioremap_uc ioremap_uc
+
+extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
+#define ioremap_cache ioremap_cache
+extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val);
+#define ioremap_prot ioremap_prot
+
+/**
+ * ioremap - map bus memory into CPU space
+ * @offset: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * ioremap performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * If the area you are trying to map is a PCI BAR you should have a
+ * look at pci_iomap().
+ */
+static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
+{
+ return ioremap_nocache(offset, size);
+}
+#define ioremap ioremap
+
+extern void iounmap(volatile void __iomem *addr);
+#define iounmap iounmap
+
+extern void set_iounmap_nonlazy(void);
+
+#ifdef __KERNEL__
+
+#include <asm-generic/iomap.h>
+
+/*
+ * ISA space is 'always mapped' on a typical x86 system, no need to
+ * explicitly ioremap() it. The fact that the ISA IO space is mapped
+ * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
+ * are physical addresses. The following constant pointer can be
+ * used as the IO-area pointer (it can be iounmapped as well, so the
+ * analogy with PCI is quite large):
+ */
+#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
+
+#endif /* __KERNEL__ */
+
+extern void native_io_delay(void);
+
+extern int io_delay_type;
+extern void io_delay_init(void);
+
+#if defined(CONFIG_PARAVIRT)
+#include <asm/paravirt.h>
+#else
+
+static inline void slow_down_io(void)
+{
+ native_io_delay();
+#ifdef REALLY_SLOW_IO
+ native_io_delay();
+ native_io_delay();
+ native_io_delay();
+#endif
+}
+
+#endif
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+#include <linux/jump_label.h>
+
+extern struct static_key_false sev_enable_key;
+static inline bool sev_key_active(void)
+{
+ return static_branch_unlikely(&sev_enable_key);
+}
+
+#else /* !CONFIG_AMD_MEM_ENCRYPT */
+
+static inline bool sev_key_active(void) { return false; }
+
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
+#define BUILDIO(bwl, bw, type) \
+static inline void out##bwl(unsigned type value, int port) \
+{ \
+ asm volatile("out" #bwl " %" #bw "0, %w1" \
+ : : "a"(value), "Nd"(port)); \
+} \
+ \
+static inline unsigned type in##bwl(int port) \
+{ \
+ unsigned type value; \
+ asm volatile("in" #bwl " %w1, %" #bw "0" \
+ : "=a"(value) : "Nd"(port)); \
+ return value; \
+} \
+ \
+static inline void out##bwl##_p(unsigned type value, int port) \
+{ \
+ out##bwl(value, port); \
+ slow_down_io(); \
+} \
+ \
+static inline unsigned type in##bwl##_p(int port) \
+{ \
+ unsigned type value = in##bwl(port); \
+ slow_down_io(); \
+ return value; \
+} \
+ \
+static inline void outs##bwl(int port, const void *addr, unsigned long count) \
+{ \
+ if (sev_key_active()) { \
+ unsigned type *value = (unsigned type *)addr; \
+ while (count) { \
+ out##bwl(*value, port); \
+ value++; \
+ count--; \
+ } \
+ } else { \
+ asm volatile("rep; outs" #bwl \
+ : "+S"(addr), "+c"(count) \
+ : "d"(port) : "memory"); \
+ } \
+} \
+ \
+static inline void ins##bwl(int port, void *addr, unsigned long count) \
+{ \
+ if (sev_key_active()) { \
+ unsigned type *value = (unsigned type *)addr; \
+ while (count) { \
+ *value = in##bwl(port); \
+ value++; \
+ count--; \
+ } \
+ } else { \
+ asm volatile("rep; ins" #bwl \
+ : "+D"(addr), "+c"(count) \
+ : "d"(port) : "memory"); \
+ } \
+}
+
+BUILDIO(b, b, char)
+BUILDIO(w, w, short)
+BUILDIO(l, , int)
+
+#define inb inb
+#define inw inw
+#define inl inl
+#define inb_p inb_p
+#define inw_p inw_p
+#define inl_p inl_p
+#define insb insb
+#define insw insw
+#define insl insl
+
+#define outb outb
+#define outw outw
+#define outl outl
+#define outb_p outb_p
+#define outw_p outw_p
+#define outl_p outl_p
+#define outsb outsb
+#define outsw outsw
+#define outsl outsl
+
+extern void *xlate_dev_mem_ptr(phys_addr_t phys);
+extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
+
+#define xlate_dev_mem_ptr xlate_dev_mem_ptr
+#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr
+
+extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
+ enum page_cache_mode pcm);
+extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
+#define ioremap_wc ioremap_wc
+extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size);
+#define ioremap_wt ioremap_wt
+
+extern bool is_early_ioremap_ptep(pte_t *ptep);
+
+#ifdef CONFIG_XEN
+#include <xen/xen.h>
+struct bio_vec;
+
+extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
+ const struct bio_vec *vec2);
+
+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
+ (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \
+ (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
+#endif /* CONFIG_XEN */
+
+#define IO_SPACE_LIMIT 0xffff
+
+#include <asm-generic/io.h>
+#undef PCI_IOBASE
+
+#ifdef CONFIG_MTRR
+extern int __must_check arch_phys_wc_index(int handle);
+#define arch_phys_wc_index arch_phys_wc_index
+
+extern int __must_check arch_phys_wc_add(unsigned long base,
+ unsigned long size);
+extern void arch_phys_wc_del(int handle);
+#define arch_phys_wc_add arch_phys_wc_add
+#endif
+
+#ifdef CONFIG_X86_PAT
+extern int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size);
+extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size);
+#define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc
+#endif
+
+extern bool arch_memremap_can_ram_remap(resource_size_t offset,
+ unsigned long size,
+ unsigned long flags);
+#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap
+
+extern bool phys_mem_access_encrypted(unsigned long phys_addr,
+ unsigned long size);
+
+#endif /* _ASM_X86_IO_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
new file mode 100644
index 000000000..fd20a2334
--- /dev/null
+++ b/arch/x86/include/asm/io_apic.h
@@ -0,0 +1,240 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_IO_APIC_H
+#define _ASM_X86_IO_APIC_H
+
+#include <linux/types.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/irq_vectors.h>
+#include <asm/x86_init.h>
+/*
+ * Intel IO-APIC support for SMP and UP systems.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar
+ */
+
+/* I/O Unit Redirection Table */
+#define IO_APIC_REDIR_VECTOR_MASK 0x000FF
+#define IO_APIC_REDIR_DEST_LOGICAL 0x00800
+#define IO_APIC_REDIR_DEST_PHYSICAL 0x00000
+#define IO_APIC_REDIR_SEND_PENDING (1 << 12)
+#define IO_APIC_REDIR_REMOTE_IRR (1 << 14)
+#define IO_APIC_REDIR_LEVEL_TRIGGER (1 << 15)
+#define IO_APIC_REDIR_MASKED (1 << 16)
+
+/*
+ * The structure of the IO-APIC:
+ */
+union IO_APIC_reg_00 {
+ u32 raw;
+ struct {
+ u32 __reserved_2 : 14,
+ LTS : 1,
+ delivery_type : 1,
+ __reserved_1 : 8,
+ ID : 8;
+ } __attribute__ ((packed)) bits;
+};
+
+union IO_APIC_reg_01 {
+ u32 raw;
+ struct {
+ u32 version : 8,
+ __reserved_2 : 7,
+ PRQ : 1,
+ entries : 8,
+ __reserved_1 : 8;
+ } __attribute__ ((packed)) bits;
+};
+
+union IO_APIC_reg_02 {
+ u32 raw;
+ struct {
+ u32 __reserved_2 : 24,
+ arbitration : 4,
+ __reserved_1 : 4;
+ } __attribute__ ((packed)) bits;
+};
+
+union IO_APIC_reg_03 {
+ u32 raw;
+ struct {
+ u32 boot_DT : 1,
+ __reserved_1 : 31;
+ } __attribute__ ((packed)) bits;
+};
+
+struct IO_APIC_route_entry {
+ __u32 vector : 8,
+ delivery_mode : 3, /* 000: FIXED
+ * 001: lowest prio
+ * 111: ExtINT
+ */
+ dest_mode : 1, /* 0: physical, 1: logical */
+ delivery_status : 1,
+ polarity : 1,
+ irr : 1,
+ trigger : 1, /* 0: edge, 1: level */
+ mask : 1, /* 0: enabled, 1: disabled */
+ __reserved_2 : 15;
+
+ __u32 __reserved_3 : 24,
+ dest : 8;
+} __attribute__ ((packed));
+
+struct IR_IO_APIC_route_entry {
+ __u64 vector : 8,
+ zero : 3,
+ index2 : 1,
+ delivery_status : 1,
+ polarity : 1,
+ irr : 1,
+ trigger : 1,
+ mask : 1,
+ reserved : 31,
+ format : 1,
+ index : 15;
+} __attribute__ ((packed));
+
+struct irq_alloc_info;
+struct ioapic_domain_cfg;
+
+#define IOAPIC_AUTO -1
+#define IOAPIC_EDGE 0
+#define IOAPIC_LEVEL 1
+
+#define IOAPIC_MASKED 1
+#define IOAPIC_UNMASKED 0
+
+#define IOAPIC_POL_HIGH 0
+#define IOAPIC_POL_LOW 1
+
+#define IOAPIC_DEST_MODE_PHYSICAL 0
+#define IOAPIC_DEST_MODE_LOGICAL 1
+
+#define IOAPIC_MAP_ALLOC 0x1
+#define IOAPIC_MAP_CHECK 0x2
+
+#ifdef CONFIG_X86_IO_APIC
+
+/*
+ * # of IO-APICs and # of IRQ routing registers
+ */
+extern int nr_ioapics;
+
+extern int mpc_ioapic_id(int ioapic);
+extern unsigned int mpc_ioapic_addr(int ioapic);
+
+/* # of MP IRQ source entries */
+extern int mp_irq_entries;
+
+/* MP IRQ source entries */
+extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* 1 if "noapic" boot option passed */
+extern int skip_ioapic_setup;
+
+/* 1 if "noapic" boot option passed */
+extern int noioapicquirk;
+
+/* -1 if "noapic" boot option passed */
+extern int noioapicreroute;
+
+extern u32 gsi_top;
+
+extern unsigned long io_apic_irqs;
+
+#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1 << (x)) & io_apic_irqs))
+
+/*
+ * If we use the IO-APIC for IRQ routing, disable automatic
+ * assignment of PCI IRQ's.
+ */
+#define io_apic_assign_pci_irqs \
+ (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
+
+struct irq_cfg;
+extern void ioapic_insert_resources(void);
+extern int arch_early_ioapic_init(void);
+
+extern int save_ioapic_entries(void);
+extern void mask_ioapic_entries(void);
+extern int restore_ioapic_entries(void);
+
+extern void setup_ioapic_ids_from_mpc(void);
+extern void setup_ioapic_ids_from_mpc_nocheck(void);
+
+extern int mp_find_ioapic(u32 gsi);
+extern int mp_find_ioapic_pin(int ioapic, u32 gsi);
+extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags,
+ struct irq_alloc_info *info);
+extern void mp_unmap_irq(int irq);
+extern int mp_register_ioapic(int id, u32 address, u32 gsi_base,
+ struct ioapic_domain_cfg *cfg);
+extern int mp_unregister_ioapic(u32 gsi_base);
+extern int mp_ioapic_registered(u32 gsi_base);
+
+extern void ioapic_set_alloc_attr(struct irq_alloc_info *info,
+ int node, int trigger, int polarity);
+
+extern void mp_save_irq(struct mpc_intsrc *m);
+
+extern void disable_ioapic_support(void);
+
+extern void __init io_apic_init_mappings(void);
+extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg);
+extern void native_restore_boot_irq_mode(void);
+
+static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+{
+ return x86_apic_ops.io_apic_read(apic, reg);
+}
+
+extern void setup_IO_APIC(void);
+extern void enable_IO_APIC(void);
+extern void clear_IO_APIC(void);
+extern void restore_boot_irq_mode(void);
+extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin);
+extern void print_IO_APICs(void);
+#else /* !CONFIG_X86_IO_APIC */
+
+#define IO_APIC_IRQ(x) 0
+#define io_apic_assign_pci_irqs 0
+#define setup_ioapic_ids_from_mpc x86_init_noop
+static inline void ioapic_insert_resources(void) { }
+static inline int arch_early_ioapic_init(void) { return 0; }
+static inline void print_IO_APICs(void) {}
+#define gsi_top (NR_IRQS_LEGACY)
+static inline int mp_find_ioapic(u32 gsi) { return 0; }
+static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags,
+ struct irq_alloc_info *info)
+{
+ return gsi;
+}
+
+static inline void mp_unmap_irq(int irq) { }
+
+static inline int save_ioapic_entries(void)
+{
+ return -ENOMEM;
+}
+
+static inline void mask_ioapic_entries(void) { }
+static inline int restore_ioapic_entries(void)
+{
+ return -ENOMEM;
+}
+
+static inline void mp_save_irq(struct mpc_intsrc *m) { }
+static inline void disable_ioapic_support(void) { }
+static inline void io_apic_init_mappings(void) { }
+#define native_io_apic_read NULL
+#define native_restore_boot_irq_mode NULL
+
+static inline void setup_IO_APIC(void) { }
+static inline void enable_IO_APIC(void) { }
+static inline void restore_boot_irq_mode(void) { }
+
+#endif
+
+#endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
new file mode 100644
index 000000000..363e33eb6
--- /dev/null
+++ b/arch/x86/include/asm/iomap.h
@@ -0,0 +1,41 @@
+#ifndef _ASM_X86_IOMAP_H
+#define _ASM_X86_IOMAP_H
+
+/*
+ * Copyright © 2008 Ingo Molnar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+
+void __iomem *
+iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
+
+void
+iounmap_atomic(void __iomem *kvaddr);
+
+int
+iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot);
+
+void
+iomap_free(resource_size_t base, unsigned long size);
+
+#endif /* _ASM_X86_IOMAP_H */
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
new file mode 100644
index 000000000..baedab8ac
--- /dev/null
+++ b/arch/x86/include/asm/iommu.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_IOMMU_H
+#define _ASM_X86_IOMMU_H
+
+extern int force_iommu, no_iommu;
+extern int iommu_detected;
+extern int iommu_pass_through;
+
+/* 10 seconds */
+#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
+
+#endif /* _ASM_X86_IOMMU_H */
diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h
new file mode 100644
index 000000000..1fb3fd1a8
--- /dev/null
+++ b/arch/x86/include/asm/iommu_table.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_IOMMU_TABLE_H
+#define _ASM_X86_IOMMU_TABLE_H
+
+#include <asm/swiotlb.h>
+
+/*
+ * History lesson:
+ * The execution chain of IOMMUs in 2.6.36 looks as so:
+ *
+ * [xen-swiotlb]
+ * |
+ * +----[swiotlb *]--+
+ * / | \
+ * / | \
+ * [GART] [Calgary] [Intel VT-d]
+ * /
+ * /
+ * [AMD-Vi]
+ *
+ * *: if SWIOTLB detected 'iommu=soft'/'swiotlb=force' it would skip
+ * over the rest of IOMMUs and unconditionally initialize the SWIOTLB.
+ * Also it would surreptitiously initialize set the swiotlb=1 if there were
+ * more than 4GB and if the user did not pass in 'iommu=off'. The swiotlb
+ * flag would be turned off by all IOMMUs except the Calgary one.
+ *
+ * The IOMMU_INIT* macros allow a similar tree (or more complex if desired)
+ * to be built by defining who we depend on.
+ *
+ * And all that needs to be done is to use one of the macros in the IOMMU
+ * and the pci-dma.c will take care of the rest.
+ */
+
+struct iommu_table_entry {
+ initcall_t detect;
+ initcall_t depend;
+ void (*early_init)(void); /* No memory allocate available. */
+ void (*late_init)(void); /* Yes, can allocate memory. */
+#define IOMMU_FINISH_IF_DETECTED (1<<0)
+#define IOMMU_DETECTED (1<<1)
+ int flags;
+};
+/*
+ * Macro fills out an entry in the .iommu_table that is equivalent
+ * to the fields that 'struct iommu_table_entry' has. The entries
+ * that are put in the .iommu_table section are not put in any order
+ * hence during boot-time we will have to resort them based on
+ * dependency. */
+
+
+#define __IOMMU_INIT(_detect, _depend, _early_init, _late_init, _finish)\
+ static const struct iommu_table_entry \
+ __iommu_entry_##_detect __used \
+ __attribute__ ((unused, __section__(".iommu_table"), \
+ aligned((sizeof(void *))))) \
+ = {_detect, _depend, _early_init, _late_init, \
+ _finish ? IOMMU_FINISH_IF_DETECTED : 0}
+/*
+ * The simplest IOMMU definition. Provide the detection routine
+ * and it will be run after the SWIOTLB and the other IOMMUs
+ * that utilize this macro. If the IOMMU is detected (ie, the
+ * detect routine returns a positive value), the other IOMMUs
+ * are also checked. You can use IOMMU_INIT_POST_FINISH if you prefer
+ * to stop detecting the other IOMMUs after yours has been detected.
+ */
+#define IOMMU_INIT_POST(_detect) \
+ __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, NULL, NULL, 0)
+
+#define IOMMU_INIT_POST_FINISH(detect) \
+ __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, NULL, NULL, 1)
+
+/*
+ * A more sophisticated version of IOMMU_INIT. This variant requires:
+ * a). A detection routine function.
+ * b). The name of the detection routine we depend on to get called
+ * before us.
+ * c). The init routine which gets called if the detection routine
+ * returns a positive value from the pci_iommu_alloc. This means
+ * no presence of a memory allocator.
+ * d). Similar to the 'init', except that this gets called from pci_iommu_init
+ * where we do have a memory allocator.
+ *
+ * The standard IOMMU_INIT differs from the IOMMU_INIT_FINISH variant
+ * in that the former will continue detecting other IOMMUs in the call
+ * list after the detection routine returns a positive number, while the
+ * latter will stop the execution chain upon first successful detection.
+ * Both variants will still call the 'init' and 'late_init' functions if
+ * they are set.
+ */
+#define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \
+ __IOMMU_INIT(_detect, _depend, _init, _late_init, 1)
+
+#define IOMMU_INIT(_detect, _depend, _init, _late_init) \
+ __IOMMU_INIT(_detect, _depend, _init, _late_init, 0)
+
+void sort_iommu_table(struct iommu_table_entry *start,
+ struct iommu_table_entry *finish);
+
+void check_iommu_entries(struct iommu_table_entry *start,
+ struct iommu_table_entry *finish);
+
+#endif /* _ASM_X86_IOMMU_TABLE_H */
diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h
new file mode 100644
index 000000000..3de0489de
--- /dev/null
+++ b/arch/x86/include/asm/iosf_mbi.h
@@ -0,0 +1,233 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Intel OnChip System Fabric MailBox access support
+ */
+
+#ifndef IOSF_MBI_SYMS_H
+#define IOSF_MBI_SYMS_H
+
+#include <linux/notifier.h>
+
+#define MBI_MCR_OFFSET 0xD0
+#define MBI_MDR_OFFSET 0xD4
+#define MBI_MCRX_OFFSET 0xD8
+
+#define MBI_RD_MASK 0xFEFFFFFF
+#define MBI_WR_MASK 0X01000000
+
+#define MBI_MASK_HI 0xFFFFFF00
+#define MBI_MASK_LO 0x000000FF
+#define MBI_ENABLE 0xF0
+
+/* IOSF SB read/write opcodes */
+#define MBI_MMIO_READ 0x00
+#define MBI_MMIO_WRITE 0x01
+#define MBI_CFG_READ 0x04
+#define MBI_CFG_WRITE 0x05
+#define MBI_CR_READ 0x06
+#define MBI_CR_WRITE 0x07
+#define MBI_REG_READ 0x10
+#define MBI_REG_WRITE 0x11
+#define MBI_ESRAM_READ 0x12
+#define MBI_ESRAM_WRITE 0x13
+
+/* Baytrail available units */
+#define BT_MBI_UNIT_AUNIT 0x00
+#define BT_MBI_UNIT_SMC 0x01
+#define BT_MBI_UNIT_CPU 0x02
+#define BT_MBI_UNIT_BUNIT 0x03
+#define BT_MBI_UNIT_PMC 0x04
+#define BT_MBI_UNIT_GFX 0x06
+#define BT_MBI_UNIT_SMI 0x0C
+#define BT_MBI_UNIT_USB 0x43
+#define BT_MBI_UNIT_SATA 0xA3
+#define BT_MBI_UNIT_PCIE 0xA6
+
+/* Quark available units */
+#define QRK_MBI_UNIT_HBA 0x00
+#define QRK_MBI_UNIT_HB 0x03
+#define QRK_MBI_UNIT_RMU 0x04
+#define QRK_MBI_UNIT_MM 0x05
+#define QRK_MBI_UNIT_SOC 0x31
+
+/* Action values for the pmic_bus_access_notifier functions */
+#define MBI_PMIC_BUS_ACCESS_BEGIN 1
+#define MBI_PMIC_BUS_ACCESS_END 2
+
+#if IS_ENABLED(CONFIG_IOSF_MBI)
+
+bool iosf_mbi_available(void);
+
+/**
+ * iosf_mbi_read() - MailBox Interface read command
+ * @port: port indicating subunit being accessed
+ * @opcode: port specific read or write opcode
+ * @offset: register address offset
+ * @mdr: register data to be read
+ *
+ * Locking is handled by spinlock - cannot sleep.
+ * Return: Nonzero on error
+ */
+int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr);
+
+/**
+ * iosf_mbi_write() - MailBox unmasked write command
+ * @port: port indicating subunit being accessed
+ * @opcode: port specific read or write opcode
+ * @offset: register address offset
+ * @mdr: register data to be written
+ *
+ * Locking is handled by spinlock - cannot sleep.
+ * Return: Nonzero on error
+ */
+int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr);
+
+/**
+ * iosf_mbi_modify() - MailBox masked write command
+ * @port: port indicating subunit being accessed
+ * @opcode: port specific read or write opcode
+ * @offset: register address offset
+ * @mdr: register data being modified
+ * @mask: mask indicating bits in mdr to be modified
+ *
+ * Locking is handled by spinlock - cannot sleep.
+ * Return: Nonzero on error
+ */
+int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask);
+
+/**
+ * iosf_mbi_punit_acquire() - Acquire access to the P-Unit
+ *
+ * One some systems the P-Unit accesses the PMIC to change various voltages
+ * through the same bus as other kernel drivers use for e.g. battery monitoring.
+ *
+ * If a driver sends requests to the P-Unit which require the P-Unit to access
+ * the PMIC bus while another driver is also accessing the PMIC bus various bad
+ * things happen.
+ *
+ * To avoid these problems this function must be called before accessing the
+ * P-Unit or the PMIC, be it through iosf_mbi* functions or through other means.
+ *
+ * Note on these systems the i2c-bus driver will request a sempahore from the
+ * P-Unit for exclusive access to the PMIC bus when i2c drivers are accessing
+ * it, but this does not appear to be sufficient, we still need to avoid making
+ * certain P-Unit requests during the access window to avoid problems.
+ *
+ * This function locks a mutex, as such it may sleep.
+ */
+void iosf_mbi_punit_acquire(void);
+
+/**
+ * iosf_mbi_punit_release() - Release access to the P-Unit
+ */
+void iosf_mbi_punit_release(void);
+
+/**
+ * iosf_mbi_register_pmic_bus_access_notifier - Register PMIC bus notifier
+ *
+ * This function can be used by drivers which may need to acquire P-Unit
+ * managed resources from interrupt context, where iosf_mbi_punit_acquire()
+ * can not be used.
+ *
+ * This function allows a driver to register a notifier to get notified (in a
+ * process context) before other drivers start accessing the PMIC bus.
+ *
+ * This allows the driver to acquire any resources, which it may need during
+ * the window the other driver is accessing the PMIC, before hand.
+ *
+ * @nb: notifier_block to register
+ */
+int iosf_mbi_register_pmic_bus_access_notifier(struct notifier_block *nb);
+
+/**
+ * iosf_mbi_register_pmic_bus_access_notifier - Unregister PMIC bus notifier
+ *
+ * @nb: notifier_block to unregister
+ */
+int iosf_mbi_unregister_pmic_bus_access_notifier(struct notifier_block *nb);
+
+/**
+ * iosf_mbi_unregister_pmic_bus_access_notifier_unlocked - Unregister PMIC bus
+ * notifier, unlocked
+ *
+ * Like iosf_mbi_unregister_pmic_bus_access_notifier(), but for use when the
+ * caller has already called iosf_mbi_punit_acquire() itself.
+ *
+ * @nb: notifier_block to unregister
+ */
+int iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(
+ struct notifier_block *nb);
+
+/**
+ * iosf_mbi_call_pmic_bus_access_notifier_chain - Call PMIC bus notifier chain
+ *
+ * @val: action to pass into listener's notifier_call function
+ * @v: data pointer to pass into listener's notifier_call function
+ */
+int iosf_mbi_call_pmic_bus_access_notifier_chain(unsigned long val, void *v);
+
+/**
+ * iosf_mbi_assert_punit_acquired - Assert that the P-Unit has been acquired.
+ */
+void iosf_mbi_assert_punit_acquired(void);
+
+#else /* CONFIG_IOSF_MBI is not enabled */
+static inline
+bool iosf_mbi_available(void)
+{
+ return false;
+}
+
+static inline
+int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr)
+{
+ WARN(1, "IOSF_MBI driver not available");
+ return -EPERM;
+}
+
+static inline
+int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr)
+{
+ WARN(1, "IOSF_MBI driver not available");
+ return -EPERM;
+}
+
+static inline
+int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask)
+{
+ WARN(1, "IOSF_MBI driver not available");
+ return -EPERM;
+}
+
+static inline void iosf_mbi_punit_acquire(void) {}
+static inline void iosf_mbi_punit_release(void) {}
+
+static inline
+int iosf_mbi_register_pmic_bus_access_notifier(struct notifier_block *nb)
+{
+ return 0;
+}
+
+static inline
+int iosf_mbi_unregister_pmic_bus_access_notifier(struct notifier_block *nb)
+{
+ return 0;
+}
+
+static inline int
+iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(struct notifier_block *nb)
+{
+ return 0;
+}
+
+static inline
+int iosf_mbi_call_pmic_bus_access_notifier_chain(unsigned long val, void *v)
+{
+ return 0;
+}
+
+static inline void iosf_mbi_assert_punit_acquired(void) {}
+
+#endif /* CONFIG_IOSF_MBI */
+
+#endif /* IOSF_MBI_SYMS_H */
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
new file mode 100644
index 000000000..a4fe16e42
--- /dev/null
+++ b/arch/x86/include/asm/ipi.h
@@ -0,0 +1,110 @@
+#ifndef _ASM_X86_IPI_H
+#define _ASM_X86_IPI_H
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic APIC InterProcessor Interrupt code.
+ *
+ * Moved to include file by James Cleverdon from
+ * arch/x86-64/kernel/smp.c
+ *
+ * Copyrights from kernel/smp.c:
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 2002,2003 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License, v.2
+ */
+
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/smp.h>
+
+/*
+ * the following functions deal with sending IPIs between CPUs.
+ *
+ * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
+ */
+
+static inline unsigned int __prepare_ICR(unsigned int shortcut, int vector,
+ unsigned int dest)
+{
+ unsigned int icr = shortcut | dest;
+
+ switch (vector) {
+ default:
+ icr |= APIC_DM_FIXED | vector;
+ break;
+ case NMI_VECTOR:
+ icr |= APIC_DM_NMI;
+ break;
+ }
+ return icr;
+}
+
+static inline int __prepare_ICR2(unsigned int mask)
+{
+ return SET_APIC_DEST_FIELD(mask);
+}
+
+static inline void __xapic_wait_icr_idle(void)
+{
+ while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY)
+ cpu_relax();
+}
+
+void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest);
+
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ */
+void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest);
+
+extern void default_send_IPI_single(int cpu, int vector);
+extern void default_send_IPI_single_phys(int cpu, int vector);
+extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask,
+ int vector);
+extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
+ int vector);
+
+/* Avoid include hell */
+#define NMI_VECTOR 0x02
+
+extern int no_broadcast;
+
+static inline void __default_local_send_IPI_allbutself(int vector)
+{
+ if (no_broadcast || vector == NMI_VECTOR)
+ apic->send_IPI_mask_allbutself(cpu_online_mask, vector);
+ else
+ __default_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, apic->dest_logical);
+}
+
+static inline void __default_local_send_IPI_all(int vector)
+{
+ if (no_broadcast || vector == NMI_VECTOR)
+ apic->send_IPI_mask(cpu_online_mask, vector);
+ else
+ __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector, apic->dest_logical);
+}
+
+#ifdef CONFIG_X86_32
+extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
+ int vector);
+extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
+ int vector);
+extern void default_send_IPI_mask_logical(const struct cpumask *mask,
+ int vector);
+extern void default_send_IPI_allbutself(int vector);
+extern void default_send_IPI_all(int vector);
+extern void default_send_IPI_self(int vector);
+#endif
+
+#endif
+
+#endif /* _ASM_X86_IPI_H */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
new file mode 100644
index 000000000..2395bb794
--- /dev/null
+++ b/arch/x86/include/asm/irq.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_IRQ_H
+#define _ASM_X86_IRQ_H
+/*
+ * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
+ *
+ * IRQ/IPI changes taken from work by Thomas Radke
+ * <tomsoft@informatik.tu-chemnitz.de>
+ */
+
+#include <asm/apicdef.h>
+#include <asm/irq_vectors.h>
+
+static inline int irq_canonicalize(int irq)
+{
+ return ((irq == 2) ? 9 : irq);
+}
+
+#ifdef CONFIG_X86_32
+extern void irq_ctx_init(int cpu);
+#else
+# define irq_ctx_init(cpu) do { } while (0)
+#endif
+
+#define __ARCH_HAS_DO_SOFTIRQ
+
+struct irq_desc;
+
+extern void fixup_irqs(void);
+
+#ifdef CONFIG_HAVE_KVM
+extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void));
+#endif
+
+extern void (*x86_platform_ipi_callback)(void);
+extern void native_init_IRQ(void);
+
+extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs);
+
+extern __visible unsigned int do_IRQ(struct pt_regs *regs);
+
+extern void init_ISA_irqs(void);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+void arch_trigger_cpumask_backtrace(const struct cpumask *mask,
+ bool exclude_self);
+#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
+#endif
+
+#endif /* _ASM_X86_IRQ_H */
diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h
new file mode 100644
index 000000000..8f3bee821
--- /dev/null
+++ b/arch/x86/include/asm/irq_regs.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Per-cpu current frame pointer - the location of the last exception frame on
+ * the stack, stored in the per-cpu area.
+ *
+ * Jeremy Fitzhardinge <jeremy@goop.org>
+ */
+#ifndef _ASM_X86_IRQ_REGS_H
+#define _ASM_X86_IRQ_REGS_H
+
+#include <asm/percpu.h>
+
+#define ARCH_HAS_OWN_IRQ_REGS
+
+DECLARE_PER_CPU(struct pt_regs *, irq_regs);
+
+static inline struct pt_regs *get_irq_regs(void)
+{
+ return this_cpu_read(irq_regs);
+}
+
+static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
+{
+ struct pt_regs *old_regs;
+
+ old_regs = get_irq_regs();
+ this_cpu_write(irq_regs, new_regs);
+
+ return old_regs;
+}
+
+#endif /* _ASM_X86_IRQ_REGS_32_H */
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
new file mode 100644
index 000000000..5f26962ef
--- /dev/null
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) 2012 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * This header file contains the interface of the interrupt remapping code to
+ * the x86 interrupt management code.
+ */
+
+#ifndef __X86_IRQ_REMAPPING_H
+#define __X86_IRQ_REMAPPING_H
+
+#include <asm/irqdomain.h>
+#include <asm/hw_irq.h>
+#include <asm/io_apic.h>
+
+struct msi_msg;
+struct irq_alloc_info;
+
+enum irq_remap_cap {
+ IRQ_POSTING_CAP = 0,
+};
+
+enum {
+ IRQ_REMAP_XAPIC_MODE,
+ IRQ_REMAP_X2APIC_MODE,
+};
+
+struct vcpu_data {
+ u64 pi_desc_addr; /* Physical address of PI Descriptor */
+ u32 vector; /* Guest vector of the interrupt */
+};
+
+#ifdef CONFIG_IRQ_REMAP
+
+extern bool irq_remapping_cap(enum irq_remap_cap cap);
+extern void set_irq_remapping_broken(void);
+extern int irq_remapping_prepare(void);
+extern int irq_remapping_enable(void);
+extern void irq_remapping_disable(void);
+extern int irq_remapping_reenable(int);
+extern int irq_remap_enable_fault_handling(void);
+extern void panic_if_irq_remap(const char *msg);
+
+extern struct irq_domain *
+irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info);
+extern struct irq_domain *
+irq_remapping_get_irq_domain(struct irq_alloc_info *info);
+
+/* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */
+extern struct irq_domain *
+arch_create_remap_msi_irq_domain(struct irq_domain *par, const char *n, int id);
+
+/* Get parent irqdomain for interrupt remapping irqdomain */
+static inline struct irq_domain *arch_get_ir_parent_domain(void)
+{
+ return x86_vector_domain;
+}
+
+#else /* CONFIG_IRQ_REMAP */
+
+static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; }
+static inline void set_irq_remapping_broken(void) { }
+static inline int irq_remapping_prepare(void) { return -ENODEV; }
+static inline int irq_remapping_enable(void) { return -ENODEV; }
+static inline void irq_remapping_disable(void) { }
+static inline int irq_remapping_reenable(int eim) { return -ENODEV; }
+static inline int irq_remap_enable_fault_handling(void) { return -ENODEV; }
+
+static inline void panic_if_irq_remap(const char *msg)
+{
+}
+
+static inline struct irq_domain *
+irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info)
+{
+ return NULL;
+}
+
+static inline struct irq_domain *
+irq_remapping_get_irq_domain(struct irq_alloc_info *info)
+{
+ return NULL;
+}
+
+#endif /* CONFIG_IRQ_REMAP */
+#endif /* __X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
new file mode 100644
index 000000000..548d90bbf
--- /dev/null
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -0,0 +1,146 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_IRQ_VECTORS_H
+#define _ASM_X86_IRQ_VECTORS_H
+
+#include <linux/threads.h>
+/*
+ * Linux IRQ vector layout.
+ *
+ * There are 256 IDT entries (per CPU - each entry is 8 bytes) which can
+ * be defined by Linux. They are used as a jump table by the CPU when a
+ * given vector is triggered - by a CPU-external, CPU-internal or
+ * software-triggered event.
+ *
+ * Linux sets the kernel code address each entry jumps to early during
+ * bootup, and never changes them. This is the general layout of the
+ * IDT entries:
+ *
+ * Vectors 0 ... 31 : system traps and exceptions - hardcoded events
+ * Vectors 32 ... 127 : device interrupts
+ * Vector 128 : legacy int80 syscall interface
+ * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts
+ * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts
+ *
+ * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
+ *
+ * This file enumerates the exact layout of them:
+ */
+
+#define NMI_VECTOR 0x02
+#define MCE_VECTOR 0x12
+
+/*
+ * IDT vectors usable for external interrupt sources start at 0x20.
+ * (0x80 is the syscall vector, 0x30-0x3f are for ISA)
+ */
+#define FIRST_EXTERNAL_VECTOR 0x20
+
+/*
+ * Reserve the lowest usable vector (and hence lowest priority) 0x20 for
+ * triggering cleanup after irq migration. 0x21-0x2f will still be used
+ * for device interrupts.
+ */
+#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
+
+#define IA32_SYSCALL_VECTOR 0x80
+
+/*
+ * Vectors 0x30-0x3f are used for ISA interrupts.
+ * round up to the next 16-vector boundary
+ */
+#define ISA_IRQ_VECTOR(irq) (((FIRST_EXTERNAL_VECTOR + 16) & ~15) + irq)
+
+/*
+ * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
+ *
+ * some of the following vectors are 'rare', they are merged
+ * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
+ * TLB, reschedule and local APIC vectors are performance-critical.
+ */
+
+#define SPURIOUS_APIC_VECTOR 0xff
+/*
+ * Sanity check
+ */
+#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F)
+# error SPURIOUS_APIC_VECTOR definition error
+#endif
+
+#define ERROR_APIC_VECTOR 0xfe
+#define RESCHEDULE_VECTOR 0xfd
+#define CALL_FUNCTION_VECTOR 0xfc
+#define CALL_FUNCTION_SINGLE_VECTOR 0xfb
+#define THERMAL_APIC_VECTOR 0xfa
+#define THRESHOLD_APIC_VECTOR 0xf9
+#define REBOOT_VECTOR 0xf8
+
+/*
+ * Generic system vector for platform specific use
+ */
+#define X86_PLATFORM_IPI_VECTOR 0xf7
+
+/*
+ * IRQ work vector:
+ */
+#define IRQ_WORK_VECTOR 0xf6
+
+#define UV_BAU_MESSAGE 0xf5
+#define DEFERRED_ERROR_VECTOR 0xf4
+
+/* Vector on which hypervisor callbacks will be delivered */
+#define HYPERVISOR_CALLBACK_VECTOR 0xf3
+
+/* Vector for KVM to deliver posted interrupt IPI */
+#ifdef CONFIG_HAVE_KVM
+#define POSTED_INTR_VECTOR 0xf2
+#define POSTED_INTR_WAKEUP_VECTOR 0xf1
+#define POSTED_INTR_NESTED_VECTOR 0xf0
+#endif
+
+#define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef
+
+#if IS_ENABLED(CONFIG_HYPERV)
+#define HYPERV_REENLIGHTENMENT_VECTOR 0xee
+#define HYPERV_STIMER0_VECTOR 0xed
+#endif
+
+#define LOCAL_TIMER_VECTOR 0xec
+
+#define NR_VECTORS 256
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#define FIRST_SYSTEM_VECTOR LOCAL_TIMER_VECTOR
+#else
+#define FIRST_SYSTEM_VECTOR NR_VECTORS
+#endif
+
+/*
+ * Size the maximum number of interrupts.
+ *
+ * If the irq_desc[] array has a sparse layout, we can size things
+ * generously - it scales up linearly with the maximum number of CPUs,
+ * and the maximum number of IO-APICs, whichever is higher.
+ *
+ * In other cases we size more conservatively, to not create too large
+ * static arrays.
+ */
+
+#define NR_IRQS_LEGACY 16
+
+#define CPU_VECTOR_LIMIT (64 * NR_CPUS)
+#define IO_APIC_VECTOR_LIMIT (32 * MAX_IO_APICS)
+
+#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_PCI_MSI)
+#define NR_IRQS \
+ (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \
+ (NR_VECTORS + CPU_VECTOR_LIMIT) : \
+ (NR_VECTORS + IO_APIC_VECTOR_LIMIT))
+#elif defined(CONFIG_X86_IO_APIC)
+#define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT)
+#elif defined(CONFIG_PCI_MSI)
+#define NR_IRQS (NR_VECTORS + CPU_VECTOR_LIMIT)
+#else
+#define NR_IRQS NR_IRQS_LEGACY
+#endif
+
+#endif /* _ASM_X86_IRQ_VECTORS_H */
diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h
new file mode 100644
index 000000000..800ffce0d
--- /dev/null
+++ b/arch/x86/include/asm/irq_work.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_IRQ_WORK_H
+#define _ASM_IRQ_WORK_H
+
+#include <asm/cpufeature.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static inline bool arch_irq_work_has_interrupt(void)
+{
+ return boot_cpu_has(X86_FEATURE_APIC);
+}
+extern void arch_irq_work_raise(void);
+#else
+static inline bool arch_irq_work_has_interrupt(void)
+{
+ return false;
+}
+#endif
+
+#endif /* _ASM_IRQ_WORK_H */
diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h
new file mode 100644
index 000000000..c066ffae2
--- /dev/null
+++ b/arch/x86/include/asm/irqdomain.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_IRQDOMAIN_H
+#define _ASM_IRQDOMAIN_H
+
+#include <linux/irqdomain.h>
+#include <asm/hw_irq.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+enum {
+ /* Allocate contiguous CPU vectors */
+ X86_IRQ_ALLOC_CONTIGUOUS_VECTORS = 0x1,
+ X86_IRQ_ALLOC_LEGACY = 0x2,
+};
+
+extern struct irq_domain *x86_vector_domain;
+
+extern void init_irq_alloc_info(struct irq_alloc_info *info,
+ const struct cpumask *mask);
+extern void copy_irq_alloc_info(struct irq_alloc_info *dst,
+ struct irq_alloc_info *src);
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_X86_IO_APIC
+struct device_node;
+struct irq_data;
+
+enum ioapic_domain_type {
+ IOAPIC_DOMAIN_INVALID,
+ IOAPIC_DOMAIN_LEGACY,
+ IOAPIC_DOMAIN_STRICT,
+ IOAPIC_DOMAIN_DYNAMIC,
+};
+
+struct ioapic_domain_cfg {
+ enum ioapic_domain_type type;
+ const struct irq_domain_ops *ops;
+ struct device_node *dev;
+};
+
+extern const struct irq_domain_ops mp_ioapic_irqdomain_ops;
+
+extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg);
+extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs);
+extern int mp_irqdomain_activate(struct irq_domain *domain,
+ struct irq_data *irq_data, bool reserve);
+extern void mp_irqdomain_deactivate(struct irq_domain *domain,
+ struct irq_data *irq_data);
+extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain);
+#endif /* CONFIG_X86_IO_APIC */
+
+#ifdef CONFIG_PCI_MSI
+extern void arch_init_msi_domain(struct irq_domain *domain);
+#else
+static inline void arch_init_msi_domain(struct irq_domain *domain) { }
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
new file mode 100644
index 000000000..c99c66b41
--- /dev/null
+++ b/arch/x86/include/asm/irqflags.h
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_IRQFLAGS_H_
+#define _X86_IRQFLAGS_H_
+
+#include <asm/processor-flags.h>
+
+#ifndef __ASSEMBLY__
+
+#include <asm/nospec-branch.h>
+
+/* Provide __cpuidle; we can't safely include <linux/cpu.h> */
+#define __cpuidle __attribute__((__section__(".cpuidle.text")))
+
+/*
+ * Interrupt control:
+ */
+
+/* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */
+extern inline unsigned long native_save_fl(void);
+extern inline unsigned long native_save_fl(void)
+{
+ unsigned long flags;
+
+ /*
+ * "=rm" is safe here, because "pop" adjusts the stack before
+ * it evaluates its effective address -- this is part of the
+ * documented behavior of the "pop" instruction.
+ */
+ asm volatile("# __raw_save_flags\n\t"
+ "pushf ; pop %0"
+ : "=rm" (flags)
+ : /* no input */
+ : "memory");
+
+ return flags;
+}
+
+extern inline void native_restore_fl(unsigned long flags);
+extern inline void native_restore_fl(unsigned long flags)
+{
+ asm volatile("push %0 ; popf"
+ : /* no output */
+ :"g" (flags)
+ :"memory", "cc");
+}
+
+static inline void native_irq_disable(void)
+{
+ asm volatile("cli": : :"memory");
+}
+
+static inline void native_irq_enable(void)
+{
+ asm volatile("sti": : :"memory");
+}
+
+static inline __cpuidle void native_safe_halt(void)
+{
+ mds_idle_clear_cpu_buffers();
+ asm volatile("sti; hlt": : :"memory");
+}
+
+static inline __cpuidle void native_halt(void)
+{
+ mds_idle_clear_cpu_buffers();
+ asm volatile("hlt": : :"memory");
+}
+
+#endif
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+static inline notrace unsigned long arch_local_save_flags(void)
+{
+ return native_save_fl();
+}
+
+static inline notrace void arch_local_irq_restore(unsigned long flags)
+{
+ native_restore_fl(flags);
+}
+
+static inline notrace void arch_local_irq_disable(void)
+{
+ native_irq_disable();
+}
+
+static inline notrace void arch_local_irq_enable(void)
+{
+ native_irq_enable();
+}
+
+/*
+ * Used in the idle loop; sti takes one instruction cycle
+ * to complete:
+ */
+static inline __cpuidle void arch_safe_halt(void)
+{
+ native_safe_halt();
+}
+
+/*
+ * Used when interrupts are already enabled or to
+ * shutdown the processor:
+ */
+static inline __cpuidle void halt(void)
+{
+ native_halt();
+}
+
+/*
+ * For spinlocks, etc:
+ */
+static inline notrace unsigned long arch_local_irq_save(void)
+{
+ unsigned long flags = arch_local_save_flags();
+ arch_local_irq_disable();
+ return flags;
+}
+#else
+
+#define ENABLE_INTERRUPTS(x) sti
+#define DISABLE_INTERRUPTS(x) cli
+
+#ifdef CONFIG_X86_64
+#define SWAPGS swapgs
+/*
+ * Currently paravirt can't handle swapgs nicely when we
+ * don't have a stack we can rely on (such as a user space
+ * stack). So we either find a way around these or just fault
+ * and emulate if a guest tries to call swapgs directly.
+ *
+ * Either way, this is a good way to document that we don't
+ * have a reliable stack. x86_64 only.
+ */
+#define SWAPGS_UNSAFE_STACK swapgs
+
+#define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */
+
+#define INTERRUPT_RETURN jmp native_iret
+#define USERGS_SYSRET64 \
+ swapgs; \
+ sysretq;
+#define USERGS_SYSRET32 \
+ swapgs; \
+ sysretl
+
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(x) pushfq; popq %rax
+#endif
+#else
+#define INTERRUPT_RETURN iret
+#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
+#define GET_CR0_INTO_EAX movl %cr0, %eax
+#endif
+
+
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+
+#ifndef __ASSEMBLY__
+static inline int arch_irqs_disabled_flags(unsigned long flags)
+{
+ return !(flags & X86_EFLAGS_IF);
+}
+
+static inline int arch_irqs_disabled(void)
+{
+ unsigned long flags = arch_local_save_flags();
+
+ return arch_irqs_disabled_flags(flags);
+}
+#endif /* !__ASSEMBLY__ */
+
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_TRACE_IRQFLAGS
+# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
+# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
+#else
+# define TRACE_IRQS_ON
+# define TRACE_IRQS_OFF
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# ifdef CONFIG_X86_64
+# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
+# define LOCKDEP_SYS_EXIT_IRQ \
+ TRACE_IRQS_ON; \
+ sti; \
+ call lockdep_sys_exit_thunk; \
+ cli; \
+ TRACE_IRQS_OFF;
+# else
+# define LOCKDEP_SYS_EXIT \
+ pushl %eax; \
+ pushl %ecx; \
+ pushl %edx; \
+ call lockdep_sys_exit; \
+ popl %edx; \
+ popl %ecx; \
+ popl %eax;
+# define LOCKDEP_SYS_EXIT_IRQ
+# endif
+#else
+# define LOCKDEP_SYS_EXIT
+# define LOCKDEP_SYS_EXIT_IRQ
+#endif
+#endif /* __ASSEMBLY__ */
+
+#endif
diff --git a/arch/x86/include/asm/ist.h b/arch/x86/include/asm/ist.h
new file mode 100644
index 000000000..c9803f1a2
--- /dev/null
+++ b/arch/x86/include/asm/ist.h
@@ -0,0 +1,23 @@
+/*
+ * Include file for the interface to IST BIOS
+ * Copyright 2002 Andy Grover <andrew.grover@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef _ASM_X86_IST_H
+#define _ASM_X86_IST_H
+
+#include <uapi/asm/ist.h>
+
+
+extern struct ist_info ist_info;
+
+#endif /* _ASM_X86_IST_H */
diff --git a/arch/x86/include/asm/jailhouse_para.h b/arch/x86/include/asm/jailhouse_para.h
new file mode 100644
index 000000000..a34897aef
--- /dev/null
+++ b/arch/x86/include/asm/jailhouse_para.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Jailhouse paravirt detection
+ *
+ * Copyright (c) Siemens AG, 2015-2017
+ *
+ * Authors:
+ * Jan Kiszka <jan.kiszka@siemens.com>
+ */
+
+#ifndef _ASM_X86_JAILHOUSE_PARA_H
+#define _ASM_X86_JAILHOUSE_PARA_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_JAILHOUSE_GUEST
+bool jailhouse_paravirt(void);
+#else
+static inline bool jailhouse_paravirt(void)
+{
+ return false;
+}
+#endif
+
+#endif /* _ASM_X86_JAILHOUSE_PARA_H */
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
new file mode 100644
index 000000000..7010e1c59
--- /dev/null
+++ b/arch/x86/include/asm/jump_label.h
@@ -0,0 +1,100 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_JUMP_LABEL_H
+#define _ASM_X86_JUMP_LABEL_H
+
+#define JUMP_LABEL_NOP_SIZE 5
+
+#ifdef CONFIG_X86_64
+# define STATIC_KEY_INIT_NOP P6_NOP5_ATOMIC
+#else
+# define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC
+#endif
+
+#include <asm/asm.h>
+#include <asm/nops.h>
+
+#ifndef __ASSEMBLY__
+
+#include <linux/stringify.h>
+#include <linux/types.h>
+
+static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
+{
+ asm_volatile_goto("1:"
+ ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t"
+ ".pushsection __jump_table, \"aw\" \n\t"
+ _ASM_ALIGN "\n\t"
+ _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t"
+ ".popsection \n\t"
+ : : "i" (key), "i" (branch) : : l_yes);
+
+ return false;
+l_yes:
+ return true;
+}
+
+static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
+{
+ asm_volatile_goto("1:"
+ ".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t"
+ "2:\n\t"
+ ".pushsection __jump_table, \"aw\" \n\t"
+ _ASM_ALIGN "\n\t"
+ _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t"
+ ".popsection \n\t"
+ : : "i" (key), "i" (branch) : : l_yes);
+
+ return false;
+l_yes:
+ return true;
+}
+
+#ifdef CONFIG_X86_64
+typedef u64 jump_label_t;
+#else
+typedef u32 jump_label_t;
+#endif
+
+struct jump_entry {
+ jump_label_t code;
+ jump_label_t target;
+ jump_label_t key;
+};
+
+#else /* __ASSEMBLY__ */
+
+.macro STATIC_JUMP_IF_TRUE target, key, def
+.Lstatic_jump_\@:
+ .if \def
+ /* Equivalent to "jmp.d32 \target" */
+ .byte 0xe9
+ .long \target - .Lstatic_jump_after_\@
+.Lstatic_jump_after_\@:
+ .else
+ .byte STATIC_KEY_INIT_NOP
+ .endif
+ .pushsection __jump_table, "aw"
+ _ASM_ALIGN
+ _ASM_PTR .Lstatic_jump_\@, \target, \key
+ .popsection
+.endm
+
+.macro STATIC_JUMP_IF_FALSE target, key, def
+.Lstatic_jump_\@:
+ .if \def
+ .byte STATIC_KEY_INIT_NOP
+ .else
+ /* Equivalent to "jmp.d32 \target" */
+ .byte 0xe9
+ .long \target - .Lstatic_jump_after_\@
+.Lstatic_jump_after_\@:
+ .endif
+ .pushsection __jump_table, "aw"
+ _ASM_ALIGN
+ _ASM_PTR .Lstatic_jump_\@, \target, \key + 1
+ .popsection
+.endm
+
+#endif /* __ASSEMBLY__ */
+
+#endif
diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
new file mode 100644
index 000000000..13e70da38
--- /dev/null
+++ b/arch/x86/include/asm/kasan.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KASAN_H
+#define _ASM_X86_KASAN_H
+
+#include <linux/const.h>
+#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
+#define KASAN_SHADOW_SCALE_SHIFT 3
+
+/*
+ * Compiler uses shadow offset assuming that addresses start
+ * from 0. Kernel addresses don't start from 0, so shadow
+ * for kernel really starts from compiler's shadow offset +
+ * 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT
+ */
+#define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET + \
+ ((-1UL << __VIRTUAL_MASK_SHIFT) >> \
+ KASAN_SHADOW_SCALE_SHIFT))
+/*
+ * 47 bits for kernel address -> (47 - KASAN_SHADOW_SCALE_SHIFT) bits for shadow
+ * 56 bits for kernel address -> (56 - KASAN_SHADOW_SCALE_SHIFT) bits for shadow
+ */
+#define KASAN_SHADOW_END (KASAN_SHADOW_START + \
+ (1ULL << (__VIRTUAL_MASK_SHIFT - \
+ KASAN_SHADOW_SCALE_SHIFT)))
+
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_KASAN
+void __init kasan_early_init(void);
+void __init kasan_init(void);
+#else
+static inline void kasan_early_init(void) { }
+static inline void kasan_init(void) { }
+#endif
+
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h
new file mode 100644
index 000000000..db7ba2feb
--- /dev/null
+++ b/arch/x86/include/asm/kaslr.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_KASLR_H_
+#define _ASM_KASLR_H_
+
+unsigned long kaslr_get_random_long(const char *purpose);
+
+#ifdef CONFIG_RANDOMIZE_MEMORY
+void kernel_randomize_memory(void);
+#else
+static inline void kernel_randomize_memory(void) { }
+#endif /* CONFIG_RANDOMIZE_MEMORY */
+
+#endif
diff --git a/arch/x86/include/asm/kbdleds.h b/arch/x86/include/asm/kbdleds.h
new file mode 100644
index 000000000..197ea4fed
--- /dev/null
+++ b/arch/x86/include/asm/kbdleds.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KBDLEDS_H
+#define _ASM_X86_KBDLEDS_H
+
+/*
+ * Some laptops take the 789uiojklm,. keys as number pad when NumLock is on.
+ * This seems a good reason to start with NumLock off. That's why on X86 we
+ * ask the bios for the correct state.
+ */
+
+#include <asm/setup.h>
+
+static inline int kbd_defleds(void)
+{
+ return boot_params.kbd_status & 0x20 ? (1 << VC_NUMLOCK) : 0;
+}
+
+#endif /* _ASM_X86_KBDLEDS_H */
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
new file mode 100644
index 000000000..75f1e35e7
--- /dev/null
+++ b/arch/x86/include/asm/kdebug.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KDEBUG_H
+#define _ASM_X86_KDEBUG_H
+
+#include <linux/notifier.h>
+
+struct pt_regs;
+
+/* Grossly misnamed. */
+enum die_val {
+ DIE_OOPS = 1,
+ DIE_INT3,
+ DIE_DEBUG,
+ DIE_PANIC,
+ DIE_NMI,
+ DIE_DIE,
+ DIE_KERNELDEBUG,
+ DIE_TRAP,
+ DIE_GPF,
+ DIE_CALL,
+ DIE_PAGE_FAULT,
+ DIE_NMIUNKNOWN,
+};
+
+enum show_regs_mode {
+ SHOW_REGS_SHORT,
+ /*
+ * For when userspace crashed, but we don't think it's our fault, and
+ * therefore don't print kernel registers.
+ */
+ SHOW_REGS_USER,
+ SHOW_REGS_ALL
+};
+
+extern void die(const char *, struct pt_regs *,long);
+extern int __must_check __die(const char *, struct pt_regs *, long);
+extern void show_stack_regs(struct pt_regs *regs);
+extern void __show_regs(struct pt_regs *regs, enum show_regs_mode);
+extern void show_iret_regs(struct pt_regs *regs);
+extern unsigned long oops_begin(void);
+extern void oops_end(unsigned long, struct pt_regs *, int signr);
+
+#endif /* _ASM_X86_KDEBUG_H */
diff --git a/arch/x86/include/asm/kexec-bzimage64.h b/arch/x86/include/asm/kexec-bzimage64.h
new file mode 100644
index 000000000..df89ee7d3
--- /dev/null
+++ b/arch/x86/include/asm/kexec-bzimage64.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_KEXEC_BZIMAGE64_H
+#define _ASM_KEXEC_BZIMAGE64_H
+
+extern const struct kexec_file_ops kexec_bzImage64_ops;
+
+#endif /* _ASM_KEXE_BZIMAGE64_H */
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
new file mode 100644
index 000000000..5125fca47
--- /dev/null
+++ b/arch/x86/include/asm/kexec.h
@@ -0,0 +1,228 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KEXEC_H
+#define _ASM_X86_KEXEC_H
+
+#ifdef CONFIG_X86_32
+# define PA_CONTROL_PAGE 0
+# define VA_CONTROL_PAGE 1
+# define PA_PGD 2
+# define PA_SWAP_PAGE 3
+# define PAGES_NR 4
+#else
+# define PA_CONTROL_PAGE 0
+# define VA_CONTROL_PAGE 1
+# define PA_TABLE_PAGE 2
+# define PA_SWAP_PAGE 3
+# define PAGES_NR 4
+#endif
+
+# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
+
+#ifndef __ASSEMBLY__
+
+#include <linux/string.h>
+
+#include <asm/page.h>
+#include <asm/ptrace.h>
+#include <asm/bootparam.h>
+
+struct kimage;
+
+/*
+ * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
+ * I.e. Maximum page that is mapped directly into kernel memory,
+ * and kmap is not required.
+ *
+ * So far x86_64 is limited to 40 physical address bits.
+ */
+#ifdef CONFIG_X86_32
+/* Maximum physical address we can use pages from */
+# define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
+/* Maximum address we can reach in physical address mode */
+# define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
+/* Maximum address we can use for the control code buffer */
+# define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
+
+# define KEXEC_CONTROL_PAGE_SIZE 4096
+
+/* The native architecture */
+# define KEXEC_ARCH KEXEC_ARCH_386
+
+/* We can also handle crash dumps from 64 bit kernel. */
+# define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64)
+#else
+/* Maximum physical address we can use pages from */
+# define KEXEC_SOURCE_MEMORY_LIMIT (MAXMEM-1)
+/* Maximum address we can reach in physical address mode */
+# define KEXEC_DESTINATION_MEMORY_LIMIT (MAXMEM-1)
+/* Maximum address we can use for the control pages */
+# define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1)
+
+/* Allocate one page for the pdp and the second for the code */
+# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL)
+
+/* The native architecture */
+# define KEXEC_ARCH KEXEC_ARCH_X86_64
+#endif
+
+/* Memory to backup during crash kdump */
+#define KEXEC_BACKUP_SRC_START (0UL)
+#define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */
+
+/*
+ * CPU does not save ss and sp on stack if execution is already
+ * running in kernel mode at the time of NMI occurrence. This code
+ * fixes it.
+ */
+static inline void crash_fixup_ss_esp(struct pt_regs *newregs,
+ struct pt_regs *oldregs)
+{
+#ifdef CONFIG_X86_32
+ newregs->sp = (unsigned long)&(oldregs->sp);
+ asm volatile("xorl %%eax, %%eax\n\t"
+ "movw %%ss, %%ax\n\t"
+ :"=a"(newregs->ss));
+#endif
+}
+
+/*
+ * This function is responsible for capturing register states if coming
+ * via panic otherwise just fix up the ss and sp if coming via kernel
+ * mode exception.
+ */
+static inline void crash_setup_regs(struct pt_regs *newregs,
+ struct pt_regs *oldregs)
+{
+ if (oldregs) {
+ memcpy(newregs, oldregs, sizeof(*newregs));
+ crash_fixup_ss_esp(newregs, oldregs);
+ } else {
+#ifdef CONFIG_X86_32
+ asm volatile("movl %%ebx,%0" : "=m"(newregs->bx));
+ asm volatile("movl %%ecx,%0" : "=m"(newregs->cx));
+ asm volatile("movl %%edx,%0" : "=m"(newregs->dx));
+ asm volatile("movl %%esi,%0" : "=m"(newregs->si));
+ asm volatile("movl %%edi,%0" : "=m"(newregs->di));
+ asm volatile("movl %%ebp,%0" : "=m"(newregs->bp));
+ asm volatile("movl %%eax,%0" : "=m"(newregs->ax));
+ asm volatile("movl %%esp,%0" : "=m"(newregs->sp));
+ asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss));
+ asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs));
+ asm volatile("movl %%ds, %%eax;" :"=a"(newregs->ds));
+ asm volatile("movl %%es, %%eax;" :"=a"(newregs->es));
+ asm volatile("pushfl; popl %0" :"=m"(newregs->flags));
+#else
+ asm volatile("movq %%rbx,%0" : "=m"(newregs->bx));
+ asm volatile("movq %%rcx,%0" : "=m"(newregs->cx));
+ asm volatile("movq %%rdx,%0" : "=m"(newregs->dx));
+ asm volatile("movq %%rsi,%0" : "=m"(newregs->si));
+ asm volatile("movq %%rdi,%0" : "=m"(newregs->di));
+ asm volatile("movq %%rbp,%0" : "=m"(newregs->bp));
+ asm volatile("movq %%rax,%0" : "=m"(newregs->ax));
+ asm volatile("movq %%rsp,%0" : "=m"(newregs->sp));
+ asm volatile("movq %%r8,%0" : "=m"(newregs->r8));
+ asm volatile("movq %%r9,%0" : "=m"(newregs->r9));
+ asm volatile("movq %%r10,%0" : "=m"(newregs->r10));
+ asm volatile("movq %%r11,%0" : "=m"(newregs->r11));
+ asm volatile("movq %%r12,%0" : "=m"(newregs->r12));
+ asm volatile("movq %%r13,%0" : "=m"(newregs->r13));
+ asm volatile("movq %%r14,%0" : "=m"(newregs->r14));
+ asm volatile("movq %%r15,%0" : "=m"(newregs->r15));
+ asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss));
+ asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs));
+ asm volatile("pushfq; popq %0" :"=m"(newregs->flags));
+#endif
+ newregs->ip = (unsigned long)current_text_addr();
+ }
+}
+
+#ifdef CONFIG_X86_32
+asmlinkage unsigned long
+relocate_kernel(unsigned long indirection_page,
+ unsigned long control_page,
+ unsigned long start_address,
+ unsigned int has_pae,
+ unsigned int preserve_context);
+#else
+unsigned long
+relocate_kernel(unsigned long indirection_page,
+ unsigned long page_list,
+ unsigned long start_address,
+ unsigned int preserve_context,
+ unsigned int sme_active);
+#endif
+
+#define ARCH_HAS_KIMAGE_ARCH
+
+#ifdef CONFIG_X86_32
+struct kimage_arch {
+ pgd_t *pgd;
+#ifdef CONFIG_X86_PAE
+ pmd_t *pmd0;
+ pmd_t *pmd1;
+#endif
+ pte_t *pte0;
+ pte_t *pte1;
+};
+#else
+struct kimage_arch {
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ /* Details of backup region */
+ unsigned long backup_src_start;
+ unsigned long backup_src_sz;
+
+ /* Physical address of backup segment */
+ unsigned long backup_load_addr;
+
+ /* Core ELF header buffer */
+ void *elf_headers;
+ unsigned long elf_headers_sz;
+ unsigned long elf_load_addr;
+};
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_X86_64
+/*
+ * Number of elements and order of elements in this structure should match
+ * with the ones in arch/x86/purgatory/entry64.S. If you make a change here
+ * make an appropriate change in purgatory too.
+ */
+struct kexec_entry64_regs {
+ uint64_t rax;
+ uint64_t rcx;
+ uint64_t rdx;
+ uint64_t rbx;
+ uint64_t rsp;
+ uint64_t rbp;
+ uint64_t rsi;
+ uint64_t rdi;
+ uint64_t r8;
+ uint64_t r9;
+ uint64_t r10;
+ uint64_t r11;
+ uint64_t r12;
+ uint64_t r13;
+ uint64_t r14;
+ uint64_t r15;
+ uint64_t rip;
+};
+
+extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
+ gfp_t gfp);
+#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages
+
+extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages);
+#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages
+
+#endif
+
+typedef void crash_vmclear_fn(void);
+extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
+extern void kdump_nmi_shootdown_cpus(void);
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
new file mode 100644
index 000000000..aacaf2502
--- /dev/null
+++ b/arch/x86/include/asm/kgdb.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KGDB_H
+#define _ASM_X86_KGDB_H
+
+/*
+ * Copyright (C) 2001-2004 Amit S. Kale
+ * Copyright (C) 2008 Wind River Systems, Inc.
+ */
+
+#include <asm/ptrace.h>
+
+/*
+ * BUFMAX defines the maximum number of characters in inbound/outbound
+ * buffers at least NUMREGBYTES*2 are needed for register packets
+ * Longer buffer is needed to list all threads
+ */
+#define BUFMAX 1024
+
+/*
+ * Note that this register image is in a different order than
+ * the register image that Linux produces at interrupt time.
+ *
+ * Linux's register image is defined by struct pt_regs in ptrace.h.
+ * Just why GDB uses a different order is a historical mystery.
+ */
+#ifdef CONFIG_X86_32
+enum regnames {
+ GDB_AX, /* 0 */
+ GDB_CX, /* 1 */
+ GDB_DX, /* 2 */
+ GDB_BX, /* 3 */
+ GDB_SP, /* 4 */
+ GDB_BP, /* 5 */
+ GDB_SI, /* 6 */
+ GDB_DI, /* 7 */
+ GDB_PC, /* 8 also known as eip */
+ GDB_PS, /* 9 also known as eflags */
+ GDB_CS, /* 10 */
+ GDB_SS, /* 11 */
+ GDB_DS, /* 12 */
+ GDB_ES, /* 13 */
+ GDB_FS, /* 14 */
+ GDB_GS, /* 15 */
+};
+#define GDB_ORIG_AX 41
+#define DBG_MAX_REG_NUM 16
+#define NUMREGBYTES ((GDB_GS+1)*4)
+#else /* ! CONFIG_X86_32 */
+enum regnames {
+ GDB_AX, /* 0 */
+ GDB_BX, /* 1 */
+ GDB_CX, /* 2 */
+ GDB_DX, /* 3 */
+ GDB_SI, /* 4 */
+ GDB_DI, /* 5 */
+ GDB_BP, /* 6 */
+ GDB_SP, /* 7 */
+ GDB_R8, /* 8 */
+ GDB_R9, /* 9 */
+ GDB_R10, /* 10 */
+ GDB_R11, /* 11 */
+ GDB_R12, /* 12 */
+ GDB_R13, /* 13 */
+ GDB_R14, /* 14 */
+ GDB_R15, /* 15 */
+ GDB_PC, /* 16 */
+ GDB_PS, /* 17 */
+ GDB_CS, /* 18 */
+ GDB_SS, /* 19 */
+ GDB_DS, /* 20 */
+ GDB_ES, /* 21 */
+ GDB_FS, /* 22 */
+ GDB_GS, /* 23 */
+};
+#define GDB_ORIG_AX 57
+#define DBG_MAX_REG_NUM 24
+/* 17 64 bit regs and 5 32 bit regs */
+#define NUMREGBYTES ((17 * 8) + (5 * 4))
+#endif /* ! CONFIG_X86_32 */
+
+static inline void arch_kgdb_breakpoint(void)
+{
+ asm(" int $3");
+}
+#define BREAK_INSTR_SIZE 1
+#define CACHE_FLUSH_IS_SAFE 1
+#define GDB_ADJUSTS_BREAK_OFFSET
+
+extern int kgdb_ll_trap(int cmd, const char *str,
+ struct pt_regs *regs, long err, int trap, int sig);
+
+#endif /* _ASM_X86_KGDB_H */
diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h
new file mode 100644
index 000000000..04ab8266e
--- /dev/null
+++ b/arch/x86/include/asm/kmap_types.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KMAP_TYPES_H
+#define _ASM_X86_KMAP_TYPES_H
+
+#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM)
+#define __WITH_KM_FENCE
+#endif
+
+#include <asm-generic/kmap_types.h>
+
+#undef __WITH_KM_FENCE
+
+#endif /* _ASM_X86_KMAP_TYPES_H */
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
new file mode 100644
index 000000000..c8cec1b39
--- /dev/null
+++ b/arch/x86/include/asm/kprobes.h
@@ -0,0 +1,124 @@
+#ifndef _ASM_X86_KPROBES_H
+#define _ASM_X86_KPROBES_H
+/*
+ * Kernel Probes (KProbes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * See arch/x86/kernel/kprobes.c for x86 kprobes history.
+ */
+
+#include <asm-generic/kprobes.h>
+
+#define BREAKPOINT_INSTRUCTION 0xcc
+
+#ifdef CONFIG_KPROBES
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/percpu.h>
+#include <asm/insn.h>
+
+#define __ARCH_WANT_KPROBES_INSN_SLOT
+
+struct pt_regs;
+struct kprobe;
+
+typedef u8 kprobe_opcode_t;
+#define RELATIVEJUMP_OPCODE 0xe9
+#define RELATIVEJUMP_SIZE 5
+#define RELATIVECALL_OPCODE 0xe8
+#define RELATIVE_ADDR_SIZE 4
+#define MAX_STACK_SIZE 64
+#define CUR_STACK_SIZE(ADDR) \
+ (current_top_of_stack() - (unsigned long)(ADDR))
+#define MIN_STACK_SIZE(ADDR) \
+ (MAX_STACK_SIZE < CUR_STACK_SIZE(ADDR) ? \
+ MAX_STACK_SIZE : CUR_STACK_SIZE(ADDR))
+
+#define flush_insn_slot(p) do { } while (0)
+
+/* optinsn template addresses */
+extern __visible kprobe_opcode_t optprobe_template_entry[];
+extern __visible kprobe_opcode_t optprobe_template_val[];
+extern __visible kprobe_opcode_t optprobe_template_call[];
+extern __visible kprobe_opcode_t optprobe_template_end[];
+#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE)
+#define MAX_OPTINSN_SIZE \
+ (((unsigned long)optprobe_template_end - \
+ (unsigned long)optprobe_template_entry) + \
+ MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE)
+
+extern const int kretprobe_blacklist_size;
+
+void arch_remove_kprobe(struct kprobe *p);
+asmlinkage void kretprobe_trampoline(void);
+
+extern void arch_kprobe_override_function(struct pt_regs *regs);
+
+/* Architecture specific copy of original instruction*/
+struct arch_specific_insn {
+ /* copy of the original instruction */
+ kprobe_opcode_t *insn;
+ /*
+ * boostable = false: This instruction type is not boostable.
+ * boostable = true: This instruction has been boosted: we have
+ * added a relative jump after the instruction copy in insn,
+ * so no single-step and fixup are needed (unless there's
+ * a post_handler).
+ */
+ bool boostable;
+ bool if_modifier;
+};
+
+struct arch_optimized_insn {
+ /* copy of the original instructions */
+ kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE];
+ /* detour code buffer */
+ kprobe_opcode_t *insn;
+ /* the size of instructions copied to detour code buffer */
+ size_t size;
+};
+
+/* Return true (!0) if optinsn is prepared for optimization. */
+static inline int arch_prepared_optinsn(struct arch_optimized_insn *optinsn)
+{
+ return optinsn->size;
+}
+
+struct prev_kprobe {
+ struct kprobe *kp;
+ unsigned long status;
+ unsigned long old_flags;
+ unsigned long saved_flags;
+};
+
+/* per-cpu kprobe control block */
+struct kprobe_ctlblk {
+ unsigned long kprobe_status;
+ unsigned long kprobe_old_flags;
+ unsigned long kprobe_saved_flags;
+ struct prev_kprobe prev_kprobe;
+};
+
+extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
+extern int kprobe_exceptions_notify(struct notifier_block *self,
+ unsigned long val, void *data);
+extern int kprobe_int3_handler(struct pt_regs *regs);
+extern int kprobe_debug_handler(struct pt_regs *regs);
+
+#endif /* CONFIG_KPROBES */
+#endif /* _ASM_X86_KPROBES_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
new file mode 100644
index 000000000..0f82cd91c
--- /dev/null
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -0,0 +1,453 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/******************************************************************************
+ * x86_emulate.h
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+
+#ifndef _ASM_X86_KVM_X86_EMULATE_H
+#define _ASM_X86_KVM_X86_EMULATE_H
+
+#include <asm/desc_defs.h>
+
+struct x86_emulate_ctxt;
+enum x86_intercept;
+enum x86_intercept_stage;
+
+struct x86_exception {
+ u8 vector;
+ bool error_code_valid;
+ u16 error_code;
+ bool nested_page_fault;
+ u64 address; /* cr2 or nested page fault gpa */
+ u8 async_page_fault;
+};
+
+/*
+ * This struct is used to carry enough information from the instruction
+ * decoder to main KVM so that a decision can be made whether the
+ * instruction needs to be intercepted or not.
+ */
+struct x86_instruction_info {
+ u8 intercept; /* which intercept */
+ u8 rep_prefix; /* rep prefix? */
+ u8 modrm_mod; /* mod part of modrm */
+ u8 modrm_reg; /* index of register used */
+ u8 modrm_rm; /* rm part of modrm */
+ u64 src_val; /* value of source operand */
+ u64 dst_val; /* value of destination operand */
+ u8 src_bytes; /* size of source operand */
+ u8 dst_bytes; /* size of destination operand */
+ u8 ad_bytes; /* size of src/dst address */
+ u64 next_rip; /* rip following the instruction */
+};
+
+/*
+ * x86_emulate_ops:
+ *
+ * These operations represent the instruction emulator's interface to memory.
+ * There are two categories of operation: those that act on ordinary memory
+ * regions (*_std), and those that act on memory regions known to require
+ * special treatment or emulation (*_emulated).
+ *
+ * The emulator assumes that an instruction accesses only one 'emulated memory'
+ * location, that this location is the given linear faulting address (cr2), and
+ * that this is one of the instruction's data operands. Instruction fetches and
+ * stack operations are assumed never to access emulated memory. The emulator
+ * automatically deduces which operand of a string-move operation is accessing
+ * emulated memory, and assumes that the other operand accesses normal memory.
+ *
+ * NOTES:
+ * 1. The emulator isn't very smart about emulated vs. standard memory.
+ * 'Emulated memory' access addresses should be checked for sanity.
+ * 'Normal memory' accesses may fault, and the caller must arrange to
+ * detect and handle reentrancy into the emulator via recursive faults.
+ * Accesses may be unaligned and may cross page boundaries.
+ * 2. If the access fails (cannot emulate, or a standard access faults) then
+ * it is up to the memop to propagate the fault to the guest VM via
+ * some out-of-band mechanism, unknown to the emulator. The memop signals
+ * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
+ * then immediately bail.
+ * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
+ * cmpxchg8b_emulated need support 8-byte accesses.
+ * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
+ */
+/* Access completed successfully: continue emulation as normal. */
+#define X86EMUL_CONTINUE 0
+/* Access is unhandleable: bail from emulation and return error to caller. */
+#define X86EMUL_UNHANDLEABLE 1
+/* Terminate emulation but return success to the caller. */
+#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
+#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */
+#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */
+#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
+#define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */
+
+struct x86_emulate_ops {
+ /*
+ * read_gpr: read a general purpose register (rax - r15)
+ *
+ * @reg: gpr number.
+ */
+ ulong (*read_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg);
+ /*
+ * write_gpr: write a general purpose register (rax - r15)
+ *
+ * @reg: gpr number.
+ * @val: value to write.
+ */
+ void (*write_gpr)(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val);
+ /*
+ * read_std: Read bytes of standard (non-emulated/special) memory.
+ * Used for descriptor reading.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ * @system:[IN ] Whether the access is forced to be at CPL0.
+ */
+ int (*read_std)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val,
+ unsigned int bytes,
+ struct x86_exception *fault, bool system);
+
+ /*
+ * read_phys: Read bytes of standard (non-emulated/special) memory.
+ * Used for descriptor reading.
+ * @addr: [IN ] Physical address from which to read.
+ * @val: [OUT] Value read from memory.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr,
+ void *val, unsigned int bytes);
+
+ /*
+ * write_std: Write bytes of standard (non-emulated/special) memory.
+ * Used for descriptor writing.
+ * @addr: [IN ] Linear address to which to write.
+ * @val: [OUT] Value write to memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to write to memory.
+ * @system:[IN ] Whether the access is forced to be at CPL0.
+ */
+ int (*write_std)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes,
+ struct x86_exception *fault, bool system);
+ /*
+ * fetch: Read bytes of standard (non-emulated/special) memory.
+ * Used for instruction fetch.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*fetch)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes,
+ struct x86_exception *fault);
+
+ /*
+ * read_emulated: Read bytes from emulated/special memory area.
+ * @addr: [IN ] Linear address from which to read.
+ * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
+ * @bytes: [IN ] Number of bytes to read from memory.
+ */
+ int (*read_emulated)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes,
+ struct x86_exception *fault);
+
+ /*
+ * write_emulated: Write bytes to emulated/special memory area.
+ * @addr: [IN ] Linear address to which to write.
+ * @val: [IN ] Value to write to memory (low-order bytes used as
+ * required).
+ * @bytes: [IN ] Number of bytes to write to memory.
+ */
+ int (*write_emulated)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, const void *val,
+ unsigned int bytes,
+ struct x86_exception *fault);
+
+ /*
+ * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
+ * emulated/special memory area.
+ * @addr: [IN ] Linear address to access.
+ * @old: [IN ] Value expected to be current at @addr.
+ * @new: [IN ] Value to write to @addr.
+ * @bytes: [IN ] Number of bytes to access using CMPXCHG.
+ */
+ int (*cmpxchg_emulated)(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ const void *old,
+ const void *new,
+ unsigned int bytes,
+ struct x86_exception *fault);
+ void (*invlpg)(struct x86_emulate_ctxt *ctxt, ulong addr);
+
+ int (*pio_in_emulated)(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, void *val,
+ unsigned int count);
+
+ int (*pio_out_emulated)(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, const void *val,
+ unsigned int count);
+
+ bool (*get_segment)(struct x86_emulate_ctxt *ctxt, u16 *selector,
+ struct desc_struct *desc, u32 *base3, int seg);
+ void (*set_segment)(struct x86_emulate_ctxt *ctxt, u16 selector,
+ struct desc_struct *desc, u32 base3, int seg);
+ unsigned long (*get_cached_segment_base)(struct x86_emulate_ctxt *ctxt,
+ int seg);
+ void (*get_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ void (*get_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
+ ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
+ int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
+ int (*cpl)(struct x86_emulate_ctxt *ctxt);
+ int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
+ int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
+ u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt);
+ void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase);
+ int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
+ int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
+ int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
+ int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
+ void (*halt)(struct x86_emulate_ctxt *ctxt);
+ void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
+ int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt);
+ int (*intercept)(struct x86_emulate_ctxt *ctxt,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage);
+
+ bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, bool check_limit);
+ void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
+
+ unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
+ void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags);
+ int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, u64 smbase);
+
+};
+
+typedef u32 __attribute__((vector_size(16))) sse128_t;
+
+/* Type, address-of, and value of an instruction's operand. */
+struct operand {
+ enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type;
+ unsigned int bytes;
+ unsigned int count;
+ union {
+ unsigned long orig_val;
+ u64 orig_val64;
+ };
+ union {
+ unsigned long *reg;
+ struct segmented_address {
+ ulong ea;
+ unsigned seg;
+ } mem;
+ unsigned xmm;
+ unsigned mm;
+ } addr;
+ union {
+ unsigned long val;
+ u64 val64;
+ char valptr[sizeof(sse128_t)];
+ sse128_t vec_val;
+ u64 mm_val;
+ void *data;
+ };
+};
+
+struct fetch_cache {
+ u8 data[15];
+ u8 *ptr;
+ u8 *end;
+};
+
+struct read_cache {
+ u8 data[1024];
+ unsigned long pos;
+ unsigned long end;
+};
+
+/* Execution mode, passed to the emulator. */
+enum x86emul_mode {
+ X86EMUL_MODE_REAL, /* Real mode. */
+ X86EMUL_MODE_VM86, /* Virtual 8086 mode. */
+ X86EMUL_MODE_PROT16, /* 16-bit protected mode. */
+ X86EMUL_MODE_PROT32, /* 32-bit protected mode. */
+ X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */
+};
+
+/* These match some of the HF_* flags defined in kvm_host.h */
+#define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
+#define X86EMUL_SMM_MASK (1 << 6)
+#define X86EMUL_SMM_INSIDE_NMI_MASK (1 << 7)
+
+struct x86_emulate_ctxt {
+ const struct x86_emulate_ops *ops;
+
+ /* Register state before/after emulation. */
+ unsigned long eflags;
+ unsigned long eip; /* eip before instruction emulation */
+ /* Emulated execution mode, represented by an X86EMUL_MODE value. */
+ enum x86emul_mode mode;
+
+ /* interruptibility state, as a result of execution of STI or MOV SS */
+ int interruptibility;
+
+ bool perm_ok; /* do not check permissions if true */
+ bool ud; /* inject an #UD if host doesn't support insn */
+ bool tf; /* TF value before instruction (after for syscall/sysret) */
+
+ bool have_exception;
+ struct x86_exception exception;
+
+ /*
+ * decode cache
+ */
+
+ /* current opcode length in bytes */
+ u8 opcode_len;
+ u8 b;
+ u8 intercept;
+ u8 op_bytes;
+ u8 ad_bytes;
+ struct operand src;
+ struct operand src2;
+ struct operand dst;
+ int (*execute)(struct x86_emulate_ctxt *ctxt);
+ int (*check_perm)(struct x86_emulate_ctxt *ctxt);
+ /*
+ * The following six fields are cleared together,
+ * the rest are initialized unconditionally in x86_decode_insn
+ * or elsewhere
+ */
+ bool rip_relative;
+ u8 rex_prefix;
+ u8 lock_prefix;
+ u8 rep_prefix;
+ /* bitmaps of registers in _regs[] that can be read */
+ u32 regs_valid;
+ /* bitmaps of registers in _regs[] that have been written */
+ u32 regs_dirty;
+ /* modrm */
+ u8 modrm;
+ u8 modrm_mod;
+ u8 modrm_reg;
+ u8 modrm_rm;
+ u8 modrm_seg;
+ u8 seg_override;
+ u64 d;
+ unsigned long _eip;
+ struct operand memop;
+ /* Fields above regs are cleared together. */
+ unsigned long _regs[NR_VCPU_REGS];
+ struct operand *memopp;
+ struct fetch_cache fetch;
+ struct read_cache io_read;
+ struct read_cache mem_read;
+};
+
+/* Repeat String Operation Prefix */
+#define REPE_PREFIX 0xf3
+#define REPNE_PREFIX 0xf2
+
+/* CPUID vendors */
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163
+#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65
+
+#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx 0x69444d41
+#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx 0x21726574
+#define X86EMUL_CPUID_VENDOR_AMDisbetterI_edx 0x74656273
+
+#define X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 0x756e6547
+#define X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 0x6c65746e
+#define X86EMUL_CPUID_VENDOR_GenuineIntel_edx 0x49656e69
+
+enum x86_intercept_stage {
+ X86_ICTP_NONE = 0, /* Allow zero-init to not match anything */
+ X86_ICPT_PRE_EXCEPT,
+ X86_ICPT_POST_EXCEPT,
+ X86_ICPT_POST_MEMACCESS,
+};
+
+enum x86_intercept {
+ x86_intercept_none,
+ x86_intercept_cr_read,
+ x86_intercept_cr_write,
+ x86_intercept_clts,
+ x86_intercept_lmsw,
+ x86_intercept_smsw,
+ x86_intercept_dr_read,
+ x86_intercept_dr_write,
+ x86_intercept_lidt,
+ x86_intercept_sidt,
+ x86_intercept_lgdt,
+ x86_intercept_sgdt,
+ x86_intercept_lldt,
+ x86_intercept_sldt,
+ x86_intercept_ltr,
+ x86_intercept_str,
+ x86_intercept_rdtsc,
+ x86_intercept_rdpmc,
+ x86_intercept_pushf,
+ x86_intercept_popf,
+ x86_intercept_cpuid,
+ x86_intercept_rsm,
+ x86_intercept_iret,
+ x86_intercept_intn,
+ x86_intercept_invd,
+ x86_intercept_pause,
+ x86_intercept_hlt,
+ x86_intercept_invlpg,
+ x86_intercept_invlpga,
+ x86_intercept_vmrun,
+ x86_intercept_vmload,
+ x86_intercept_vmsave,
+ x86_intercept_vmmcall,
+ x86_intercept_stgi,
+ x86_intercept_clgi,
+ x86_intercept_skinit,
+ x86_intercept_rdtscp,
+ x86_intercept_icebp,
+ x86_intercept_wbinvd,
+ x86_intercept_monitor,
+ x86_intercept_mwait,
+ x86_intercept_rdmsr,
+ x86_intercept_wrmsr,
+ x86_intercept_in,
+ x86_intercept_ins,
+ x86_intercept_out,
+ x86_intercept_outs,
+
+ nr_x86_intercepts
+};
+
+/* Host execution mode. */
+#if defined(CONFIG_X86_32)
+#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
+#elif defined(CONFIG_X86_64)
+#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
+#endif
+
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
+bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
+#define EMULATION_FAILED -1
+#define EMULATION_OK 0
+#define EMULATION_RESTART 1
+#define EMULATION_INTERCEPTED 2
+void init_decode_cache(struct x86_emulate_ctxt *ctxt);
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+ u16 tss_selector, int idt_index, int reason,
+ bool has_error_code, u32 error_code);
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
+void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt);
+void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt);
+bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt);
+
+#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
new file mode 100644
index 000000000..98b74711e
--- /dev/null
+++ b/arch/x86/include/asm/kvm_host.h
@@ -0,0 +1,1551 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This header defines architecture specific interfaces, x86 version
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _ASM_X86_KVM_HOST_H
+#define _ASM_X86_KVM_HOST_H
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/mmu_notifier.h>
+#include <linux/tracepoint.h>
+#include <linux/cpumask.h>
+#include <linux/irq_work.h>
+#include <linux/irq.h>
+
+#include <linux/kvm.h>
+#include <linux/kvm_para.h>
+#include <linux/kvm_types.h>
+#include <linux/perf_event.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/clocksource.h>
+#include <linux/irqbypass.h>
+#include <linux/hyperv.h>
+
+#include <asm/apic.h>
+#include <asm/pvclock-abi.h>
+#include <asm/desc.h>
+#include <asm/mtrr.h>
+#include <asm/msr-index.h>
+#include <asm/asm.h>
+#include <asm/kvm_page_track.h>
+#include <asm/hyperv-tlfs.h>
+
+#define KVM_MAX_VCPUS 288
+#define KVM_SOFT_MAX_VCPUS 240
+#define KVM_MAX_VCPU_ID 1023
+#define KVM_USER_MEM_SLOTS 509
+/* memory slots that are not exposed to userspace */
+#define KVM_PRIVATE_MEM_SLOTS 3
+#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+
+#define KVM_HALT_POLL_NS_DEFAULT 200000
+
+#define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS
+
+/* x86-specific vcpu->requests bit members */
+#define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0)
+#define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1)
+#define KVM_REQ_TRIPLE_FAULT KVM_ARCH_REQ(2)
+#define KVM_REQ_MMU_SYNC KVM_ARCH_REQ(3)
+#define KVM_REQ_CLOCK_UPDATE KVM_ARCH_REQ(4)
+#define KVM_REQ_LOAD_CR3 KVM_ARCH_REQ(5)
+#define KVM_REQ_EVENT KVM_ARCH_REQ(6)
+#define KVM_REQ_APF_HALT KVM_ARCH_REQ(7)
+#define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(8)
+#define KVM_REQ_NMI KVM_ARCH_REQ(9)
+#define KVM_REQ_PMU KVM_ARCH_REQ(10)
+#define KVM_REQ_PMI KVM_ARCH_REQ(11)
+#define KVM_REQ_SMI KVM_ARCH_REQ(12)
+#define KVM_REQ_MASTERCLOCK_UPDATE KVM_ARCH_REQ(13)
+#define KVM_REQ_MCLOCK_INPROGRESS \
+ KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_SCAN_IOAPIC \
+ KVM_ARCH_REQ_FLAGS(15, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_GLOBAL_CLOCK_UPDATE KVM_ARCH_REQ(16)
+#define KVM_REQ_APIC_PAGE_RELOAD \
+ KVM_ARCH_REQ_FLAGS(17, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_HV_CRASH KVM_ARCH_REQ(18)
+#define KVM_REQ_IOAPIC_EOI_EXIT KVM_ARCH_REQ(19)
+#define KVM_REQ_HV_RESET KVM_ARCH_REQ(20)
+#define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21)
+#define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22)
+#define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23)
+#define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24)
+
+#define CR0_RESERVED_BITS \
+ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
+ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
+ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
+
+#define CR4_RESERVED_BITS \
+ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
+ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
+ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
+ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
+ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
+ | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP))
+
+#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
+
+
+
+#define INVALID_PAGE (~(hpa_t)0)
+#define VALID_PAGE(x) ((x) != INVALID_PAGE)
+
+#define UNMAPPED_GVA (~(gpa_t)0)
+
+/* KVM Hugepage definitions for x86 */
+#define KVM_NR_PAGE_SIZES 3
+#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9)
+#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
+#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
+#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
+#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
+
+static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
+{
+ /* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */
+ return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+ (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+}
+
+#define KVM_PERMILLE_MMU_PAGES 20
+#define KVM_MIN_ALLOC_MMU_PAGES 64UL
+#define KVM_MMU_HASH_SHIFT 12
+#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
+#define KVM_MIN_FREE_MMU_PAGES 5
+#define KVM_REFILL_PAGES 25
+#define KVM_MAX_CPUID_ENTRIES 80
+#define KVM_NR_FIXED_MTRR_REGION 88
+#define KVM_NR_VAR_MTRR 8
+
+#define ASYNC_PF_PER_VCPU 64
+
+enum kvm_reg {
+ VCPU_REGS_RAX = 0,
+ VCPU_REGS_RCX = 1,
+ VCPU_REGS_RDX = 2,
+ VCPU_REGS_RBX = 3,
+ VCPU_REGS_RSP = 4,
+ VCPU_REGS_RBP = 5,
+ VCPU_REGS_RSI = 6,
+ VCPU_REGS_RDI = 7,
+#ifdef CONFIG_X86_64
+ VCPU_REGS_R8 = 8,
+ VCPU_REGS_R9 = 9,
+ VCPU_REGS_R10 = 10,
+ VCPU_REGS_R11 = 11,
+ VCPU_REGS_R12 = 12,
+ VCPU_REGS_R13 = 13,
+ VCPU_REGS_R14 = 14,
+ VCPU_REGS_R15 = 15,
+#endif
+ VCPU_REGS_RIP,
+ NR_VCPU_REGS
+};
+
+enum kvm_reg_ex {
+ VCPU_EXREG_PDPTR = NR_VCPU_REGS,
+ VCPU_EXREG_CR3,
+ VCPU_EXREG_RFLAGS,
+ VCPU_EXREG_SEGMENTS,
+};
+
+enum {
+ VCPU_SREG_ES,
+ VCPU_SREG_CS,
+ VCPU_SREG_SS,
+ VCPU_SREG_DS,
+ VCPU_SREG_FS,
+ VCPU_SREG_GS,
+ VCPU_SREG_TR,
+ VCPU_SREG_LDTR,
+};
+
+#include <asm/kvm_emulate.h>
+
+#define KVM_NR_MEM_OBJS 40
+
+#define KVM_NR_DB_REGS 4
+
+#define DR6_BD (1 << 13)
+#define DR6_BS (1 << 14)
+#define DR6_BT (1 << 15)
+#define DR6_RTM (1 << 16)
+#define DR6_FIXED_1 0xfffe0ff0
+#define DR6_INIT 0xffff0ff0
+#define DR6_VOLATILE 0x0001e00f
+
+#define DR7_BP_EN_MASK 0x000000ff
+#define DR7_GE (1 << 9)
+#define DR7_GD (1 << 13)
+#define DR7_FIXED_1 0x00000400
+#define DR7_VOLATILE 0xffff2bff
+
+#define PFERR_PRESENT_BIT 0
+#define PFERR_WRITE_BIT 1
+#define PFERR_USER_BIT 2
+#define PFERR_RSVD_BIT 3
+#define PFERR_FETCH_BIT 4
+#define PFERR_PK_BIT 5
+#define PFERR_GUEST_FINAL_BIT 32
+#define PFERR_GUEST_PAGE_BIT 33
+
+#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
+#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
+#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
+#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
+#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
+#define PFERR_PK_MASK (1U << PFERR_PK_BIT)
+#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
+#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
+
+#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \
+ PFERR_WRITE_MASK | \
+ PFERR_PRESENT_MASK)
+
+/*
+ * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
+ * Access Tracking SPTEs. We use bit 62 instead of bit 63 to avoid conflicting
+ * with the SVE bit in EPT PTEs.
+ */
+#define SPTE_SPECIAL_MASK (1ULL << 62)
+
+/* apic attention bits */
+#define KVM_APIC_CHECK_VAPIC 0
+/*
+ * The following bit is set with PV-EOI, unset on EOI.
+ * We detect PV-EOI changes by guest by comparing
+ * this bit with PV-EOI in guest memory.
+ * See the implementation in apic_update_pv_eoi.
+ */
+#define KVM_APIC_PV_EOI_PENDING 1
+
+struct kvm_kernel_irq_routing_entry;
+
+/*
+ * We don't want allocation failures within the mmu code, so we preallocate
+ * enough memory for a single page fault in a cache.
+ */
+struct kvm_mmu_memory_cache {
+ int nobjs;
+ void *objects[KVM_NR_MEM_OBJS];
+};
+
+/*
+ * the pages used as guest page table on soft mmu are tracked by
+ * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
+ * by indirect shadow page can not be more than 15 bits.
+ *
+ * Currently, we used 14 bits that are @level, @cr4_pae, @quadrant, @access,
+ * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp.
+ */
+union kvm_mmu_page_role {
+ unsigned word;
+ struct {
+ unsigned level:4;
+ unsigned cr4_pae:1;
+ unsigned quadrant:2;
+ unsigned direct:1;
+ unsigned access:3;
+ unsigned invalid:1;
+ unsigned nxe:1;
+ unsigned cr0_wp:1;
+ unsigned smep_andnot_wp:1;
+ unsigned smap_andnot_wp:1;
+ unsigned ad_disabled:1;
+ unsigned guest_mode:1;
+ unsigned :6;
+
+ /*
+ * This is left at the top of the word so that
+ * kvm_memslots_for_spte_role can extract it with a
+ * simple shift. While there is room, give it a whole
+ * byte so it is also faster to load it from memory.
+ */
+ unsigned smm:8;
+ };
+};
+
+struct kvm_rmap_head {
+ unsigned long val;
+};
+
+struct kvm_mmu_page {
+ struct list_head link;
+ struct hlist_node hash_link;
+ struct list_head lpage_disallowed_link;
+
+ /*
+ * The following two entries are used to key the shadow page in the
+ * hash table.
+ */
+ gfn_t gfn;
+ union kvm_mmu_page_role role;
+
+ u64 *spt;
+ /* hold the gfn of each spte inside spt */
+ gfn_t *gfns;
+ bool unsync;
+ bool lpage_disallowed; /* Can't be replaced by an equiv large page */
+ int root_count; /* Currently serving as active root */
+ unsigned int unsync_children;
+ struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
+
+ /* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. */
+ unsigned long mmu_valid_gen;
+
+ DECLARE_BITMAP(unsync_child_bitmap, 512);
+
+#ifdef CONFIG_X86_32
+ /*
+ * Used out of the mmu-lock to avoid reading spte values while an
+ * update is in progress; see the comments in __get_spte_lockless().
+ */
+ int clear_spte_count;
+#endif
+
+ /* Number of writes since the last time traversal visited this page. */
+ atomic_t write_flooding_count;
+};
+
+struct kvm_pio_request {
+ unsigned long linear_rip;
+ unsigned long count;
+ int in;
+ int port;
+ int size;
+};
+
+#define PT64_ROOT_MAX_LEVEL 5
+
+struct rsvd_bits_validate {
+ u64 rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL];
+ u64 bad_mt_xwr;
+};
+
+struct kvm_mmu_root_info {
+ gpa_t cr3;
+ hpa_t hpa;
+};
+
+#define KVM_MMU_ROOT_INFO_INVALID \
+ ((struct kvm_mmu_root_info) { .cr3 = INVALID_PAGE, .hpa = INVALID_PAGE })
+
+#define KVM_MMU_NUM_PREV_ROOTS 3
+
+/*
+ * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit,
+ * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the
+ * current mmu mode.
+ */
+struct kvm_mmu {
+ void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
+ unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
+ u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
+ int (*page_fault)(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 err,
+ bool prefault);
+ void (*inject_page_fault)(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
+ gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t gva_or_gpa,
+ u32 access, struct x86_exception *exception);
+ gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+ struct x86_exception *exception);
+ int (*sync_page)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp);
+ void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
+ void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ u64 *spte, const void *pte);
+ hpa_t root_hpa;
+ union kvm_mmu_page_role base_role;
+ u8 root_level;
+ u8 shadow_root_level;
+ u8 ept_ad;
+ bool direct_map;
+ struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
+
+ /*
+ * Bitmap; bit set = permission fault
+ * Byte index: page fault error code [4:1]
+ * Bit index: pte permissions in ACC_* format
+ */
+ u8 permissions[16];
+
+ /*
+ * The pkru_mask indicates if protection key checks are needed. It
+ * consists of 16 domains indexed by page fault error code bits [4:1],
+ * with PFEC.RSVD replaced by ACC_USER_MASK from the page tables.
+ * Each domain has 2 bits which are ANDed with AD and WD from PKRU.
+ */
+ u32 pkru_mask;
+
+ u64 *pae_root;
+ u64 *lm_root;
+
+ /*
+ * check zero bits on shadow page table entries, these
+ * bits include not only hardware reserved bits but also
+ * the bits spte never used.
+ */
+ struct rsvd_bits_validate shadow_zero_check;
+
+ struct rsvd_bits_validate guest_rsvd_check;
+
+ /* Can have large pages at levels 2..last_nonleaf_level-1. */
+ u8 last_nonleaf_level;
+
+ bool nx;
+
+ u64 pdptrs[4]; /* pae */
+};
+
+enum pmc_type {
+ KVM_PMC_GP = 0,
+ KVM_PMC_FIXED,
+};
+
+struct kvm_pmc {
+ enum pmc_type type;
+ u8 idx;
+ u64 counter;
+ u64 eventsel;
+ struct perf_event *perf_event;
+ struct kvm_vcpu *vcpu;
+};
+
+struct kvm_pmu {
+ unsigned nr_arch_gp_counters;
+ unsigned nr_arch_fixed_counters;
+ unsigned available_event_types;
+ u64 fixed_ctr_ctrl;
+ u64 global_ctrl;
+ u64 global_status;
+ u64 global_ovf_ctrl;
+ u64 counter_bitmask[2];
+ u64 global_ctrl_mask;
+ u64 reserved_bits;
+ u8 version;
+ struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC];
+ struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED];
+ struct irq_work irq_work;
+ u64 reprogram_pmi;
+};
+
+struct kvm_pmu_ops;
+
+enum {
+ KVM_DEBUGREG_BP_ENABLED = 1,
+ KVM_DEBUGREG_WONT_EXIT = 2,
+ KVM_DEBUGREG_RELOAD = 4,
+};
+
+struct kvm_mtrr_range {
+ u64 base;
+ u64 mask;
+ struct list_head node;
+};
+
+struct kvm_mtrr {
+ struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR];
+ mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION];
+ u64 deftype;
+
+ struct list_head head;
+};
+
+/* Hyper-V SynIC timer */
+struct kvm_vcpu_hv_stimer {
+ struct hrtimer timer;
+ int index;
+ u64 config;
+ u64 count;
+ u64 exp_time;
+ struct hv_message msg;
+ bool msg_pending;
+};
+
+/* Hyper-V synthetic interrupt controller (SynIC)*/
+struct kvm_vcpu_hv_synic {
+ u64 version;
+ u64 control;
+ u64 msg_page;
+ u64 evt_page;
+ atomic64_t sint[HV_SYNIC_SINT_COUNT];
+ atomic_t sint_to_gsi[HV_SYNIC_SINT_COUNT];
+ DECLARE_BITMAP(auto_eoi_bitmap, 256);
+ DECLARE_BITMAP(vec_bitmap, 256);
+ bool active;
+ bool dont_zero_synic_pages;
+};
+
+/* Hyper-V per vcpu emulation context */
+struct kvm_vcpu_hv {
+ u32 vp_index;
+ u64 hv_vapic;
+ s64 runtime_offset;
+ struct kvm_vcpu_hv_synic synic;
+ struct kvm_hyperv_exit exit;
+ struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT];
+ DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
+ cpumask_t tlb_lush;
+};
+
+struct kvm_vcpu_arch {
+ /*
+ * rip and regs accesses must go through
+ * kvm_{register,rip}_{read,write} functions.
+ */
+ unsigned long regs[NR_VCPU_REGS];
+ u32 regs_avail;
+ u32 regs_dirty;
+
+ unsigned long cr0;
+ unsigned long cr0_guest_owned_bits;
+ unsigned long cr2;
+ unsigned long cr3;
+ unsigned long cr4;
+ unsigned long cr4_guest_owned_bits;
+ unsigned long cr8;
+ u32 pkru;
+ u32 hflags;
+ u64 efer;
+ u64 apic_base;
+ struct kvm_lapic *apic; /* kernel irqchip context */
+ bool apicv_active;
+ bool load_eoi_exitmap_pending;
+ DECLARE_BITMAP(ioapic_handled_vectors, 256);
+ unsigned long apic_attention;
+ int32_t apic_arb_prio;
+ int mp_state;
+ u64 ia32_misc_enable_msr;
+ u64 smbase;
+ u64 smi_count;
+ bool tpr_access_reporting;
+ u64 ia32_xss;
+ u64 microcode_version;
+ u64 arch_capabilities;
+
+ /*
+ * Paging state of the vcpu
+ *
+ * If the vcpu runs in guest mode with two level paging this still saves
+ * the paging mode of the l1 guest. This context is always used to
+ * handle faults.
+ */
+ struct kvm_mmu mmu;
+
+ /*
+ * Paging state of an L2 guest (used for nested npt)
+ *
+ * This context will save all necessary information to walk page tables
+ * of the an L2 guest. This context is only initialized for page table
+ * walking and not for faulting since we never handle l2 page faults on
+ * the host.
+ */
+ struct kvm_mmu nested_mmu;
+
+ /*
+ * Pointer to the mmu context currently used for
+ * gva_to_gpa translations.
+ */
+ struct kvm_mmu *walk_mmu;
+
+ struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
+ struct kvm_mmu_memory_cache mmu_page_cache;
+ struct kvm_mmu_memory_cache mmu_page_header_cache;
+
+ /*
+ * QEMU userspace and the guest each have their own FPU state.
+ * In vcpu_run, we switch between the user and guest FPU contexts.
+ * While running a VCPU, the VCPU thread will have the guest FPU
+ * context.
+ *
+ * Note that while the PKRU state lives inside the fpu registers,
+ * it is switched out separately at VMENTER and VMEXIT time. The
+ * "guest_fpu" state here contains the guest FPU context, with the
+ * host PRKU bits.
+ */
+ struct fpu user_fpu;
+ struct fpu guest_fpu;
+
+ u64 xcr0;
+ u64 guest_supported_xcr0;
+ u32 guest_xstate_size;
+
+ struct kvm_pio_request pio;
+ void *pio_data;
+
+ u8 event_exit_inst_len;
+
+ struct kvm_queued_exception {
+ bool pending;
+ bool injected;
+ bool has_error_code;
+ u8 nr;
+ u32 error_code;
+ u8 nested_apf;
+ } exception;
+
+ struct kvm_queued_interrupt {
+ bool injected;
+ bool soft;
+ u8 nr;
+ } interrupt;
+
+ int halt_request; /* real mode on Intel only */
+
+ int cpuid_nent;
+ struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
+
+ int maxphyaddr;
+
+ /* emulate context */
+
+ struct x86_emulate_ctxt emulate_ctxt;
+ bool emulate_regs_need_sync_to_vcpu;
+ bool emulate_regs_need_sync_from_vcpu;
+ int (*complete_userspace_io)(struct kvm_vcpu *vcpu);
+
+ gpa_t time;
+ struct pvclock_vcpu_time_info hv_clock;
+ unsigned int hw_tsc_khz;
+ struct gfn_to_hva_cache pv_time;
+ bool pv_time_enabled;
+ /* set guest stopped flag in pvclock flags field */
+ bool pvclock_set_guest_stopped_request;
+
+ struct {
+ u8 preempted;
+ u64 msr_val;
+ u64 last_steal;
+ struct gfn_to_pfn_cache cache;
+ } st;
+
+ u64 tsc_offset;
+ u64 last_guest_tsc;
+ u64 last_host_tsc;
+ u64 tsc_offset_adjustment;
+ u64 this_tsc_nsec;
+ u64 this_tsc_write;
+ u64 this_tsc_generation;
+ bool tsc_catchup;
+ bool tsc_always_catchup;
+ s8 virtual_tsc_shift;
+ u32 virtual_tsc_mult;
+ u32 virtual_tsc_khz;
+ s64 ia32_tsc_adjust_msr;
+ u64 tsc_scaling_ratio;
+
+ atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
+ unsigned nmi_pending; /* NMI queued after currently running handler */
+ bool nmi_injected; /* Trying to inject an NMI this entry */
+ bool smi_pending; /* SMI queued after currently running handler */
+
+ struct kvm_mtrr mtrr_state;
+ u64 pat;
+
+ unsigned switch_db_regs;
+ unsigned long db[KVM_NR_DB_REGS];
+ unsigned long dr6;
+ unsigned long dr7;
+ unsigned long eff_db[KVM_NR_DB_REGS];
+ unsigned long guest_debug_dr7;
+ u64 msr_platform_info;
+ u64 msr_misc_features_enables;
+
+ u64 mcg_cap;
+ u64 mcg_status;
+ u64 mcg_ctl;
+ u64 mcg_ext_ctl;
+ u64 *mce_banks;
+
+ /* Cache MMIO info */
+ u64 mmio_gva;
+ unsigned access;
+ gfn_t mmio_gfn;
+ u64 mmio_gen;
+
+ struct kvm_pmu pmu;
+
+ /* used for guest single stepping over the given code position */
+ unsigned long singlestep_rip;
+
+ struct kvm_vcpu_hv hyperv;
+
+ cpumask_var_t wbinvd_dirty_mask;
+
+ unsigned long last_retry_eip;
+ unsigned long last_retry_addr;
+
+ struct {
+ bool halted;
+ gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
+ struct gfn_to_hva_cache data;
+ u64 msr_val;
+ u32 id;
+ bool send_user_only;
+ u32 host_apf_reason;
+ unsigned long nested_apf_token;
+ bool delivery_as_pf_vmexit;
+ } apf;
+
+ /* OSVW MSRs (AMD only) */
+ struct {
+ u64 length;
+ u64 status;
+ } osvw;
+
+ struct {
+ u64 msr_val;
+ struct gfn_to_hva_cache data;
+ } pv_eoi;
+
+ /*
+ * Indicate whether the access faults on its page table in guest
+ * which is set when fix page fault and used to detect unhandeable
+ * instruction.
+ */
+ bool write_fault_to_shadow_pgtable;
+
+ /* set at EPT violation at this point */
+ unsigned long exit_qualification;
+
+ /* pv related host specific info */
+ struct {
+ bool pv_unhalted;
+ } pv;
+
+ int pending_ioapic_eoi;
+ int pending_external_vector;
+
+ /* GPA available */
+ bool gpa_available;
+ gpa_t gpa_val;
+
+ /* be preempted when it's in kernel-mode(cpl=0) */
+ bool preempted_in_kernel;
+
+ /* Flush the L1 Data cache for L1TF mitigation on VMENTER */
+ bool l1tf_flush_l1d;
+};
+
+struct kvm_lpage_info {
+ int disallow_lpage;
+};
+
+struct kvm_arch_memory_slot {
+ struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES];
+ struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
+ unsigned short *gfn_track[KVM_PAGE_TRACK_MAX];
+};
+
+/*
+ * We use as the mode the number of bits allocated in the LDR for the
+ * logical processor ID. It happens that these are all powers of two.
+ * This makes it is very easy to detect cases where the APICs are
+ * configured for multiple modes; in that case, we cannot use the map and
+ * hence cannot use kvm_irq_delivery_to_apic_fast either.
+ */
+#define KVM_APIC_MODE_XAPIC_CLUSTER 4
+#define KVM_APIC_MODE_XAPIC_FLAT 8
+#define KVM_APIC_MODE_X2APIC 16
+
+struct kvm_apic_map {
+ struct rcu_head rcu;
+ u8 mode;
+ u32 max_apic_id;
+ union {
+ struct kvm_lapic *xapic_flat_map[8];
+ struct kvm_lapic *xapic_cluster_map[16][4];
+ };
+ struct kvm_lapic *phys_map[];
+};
+
+/* Hyper-V emulation context */
+struct kvm_hv {
+ struct mutex hv_lock;
+ u64 hv_guest_os_id;
+ u64 hv_hypercall;
+ u64 hv_tsc_page;
+
+ /* Hyper-v based guest crash (NT kernel bugcheck) parameters */
+ u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS];
+ u64 hv_crash_ctl;
+
+ HV_REFERENCE_TSC_PAGE tsc_ref;
+
+ struct idr conn_to_evt;
+
+ u64 hv_reenlightenment_control;
+ u64 hv_tsc_emulation_control;
+ u64 hv_tsc_emulation_status;
+
+ /* How many vCPUs have VP index != vCPU index */
+ atomic_t num_mismatched_vp_indexes;
+};
+
+enum kvm_irqchip_mode {
+ KVM_IRQCHIP_NONE,
+ KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */
+ KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */
+};
+
+struct kvm_arch {
+ unsigned long n_used_mmu_pages;
+ unsigned long n_requested_mmu_pages;
+ unsigned long n_max_mmu_pages;
+ unsigned int indirect_shadow_pages;
+ unsigned long mmu_valid_gen;
+ struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
+ /*
+ * Hash table of struct kvm_mmu_page.
+ */
+ struct list_head active_mmu_pages;
+ struct list_head zapped_obsolete_pages;
+ struct list_head lpage_disallowed_mmu_pages;
+ struct kvm_page_track_notifier_node mmu_sp_tracker;
+ struct kvm_page_track_notifier_head track_notifier_head;
+
+ struct list_head assigned_dev_head;
+ struct iommu_domain *iommu_domain;
+ bool iommu_noncoherent;
+#define __KVM_HAVE_ARCH_NONCOHERENT_DMA
+ atomic_t noncoherent_dma_count;
+#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE
+ atomic_t assigned_device_count;
+ struct kvm_pic *vpic;
+ struct kvm_ioapic *vioapic;
+ struct kvm_pit *vpit;
+ atomic_t vapics_in_nmi_mode;
+ struct mutex apic_map_lock;
+ struct kvm_apic_map *apic_map;
+
+ bool apic_access_page_done;
+
+ gpa_t wall_clock;
+
+ bool mwait_in_guest;
+ bool hlt_in_guest;
+ bool pause_in_guest;
+
+ unsigned long irq_sources_bitmap;
+ s64 kvmclock_offset;
+ raw_spinlock_t tsc_write_lock;
+ u64 last_tsc_nsec;
+ u64 last_tsc_write;
+ u32 last_tsc_khz;
+ u64 cur_tsc_nsec;
+ u64 cur_tsc_write;
+ u64 cur_tsc_offset;
+ u64 cur_tsc_generation;
+ int nr_vcpus_matched_tsc;
+
+ spinlock_t pvclock_gtod_sync_lock;
+ bool use_master_clock;
+ u64 master_kernel_ns;
+ u64 master_cycle_now;
+ struct delayed_work kvmclock_update_work;
+ struct delayed_work kvmclock_sync_work;
+
+ struct kvm_xen_hvm_config xen_hvm_config;
+
+ /* reads protected by irq_srcu, writes by irq_lock */
+ struct hlist_head mask_notifier_list;
+
+ struct kvm_hv hyperv;
+
+ #ifdef CONFIG_KVM_MMU_AUDIT
+ int audit_point;
+ #endif
+
+ bool backwards_tsc_observed;
+ bool boot_vcpu_runs_old_kvmclock;
+ u32 bsp_vcpu_id;
+
+ u64 disabled_quirks;
+
+ enum kvm_irqchip_mode irqchip_mode;
+ u8 nr_reserved_ioapic_pins;
+
+ bool disabled_lapic_found;
+
+ bool x2apic_format;
+ bool x2apic_broadcast_quirk_disabled;
+
+ bool guest_can_read_msr_platform_info;
+
+ struct task_struct *nx_lpage_recovery_thread;
+};
+
+struct kvm_vm_stat {
+ ulong mmu_shadow_zapped;
+ ulong mmu_pte_write;
+ ulong mmu_pte_updated;
+ ulong mmu_pde_zapped;
+ ulong mmu_flooded;
+ ulong mmu_recycled;
+ ulong mmu_cache_miss;
+ ulong mmu_unsync;
+ ulong remote_tlb_flush;
+ ulong lpages;
+ ulong nx_lpage_splits;
+ ulong max_mmu_page_hash_collisions;
+};
+
+struct kvm_vcpu_stat {
+ u64 pf_fixed;
+ u64 pf_guest;
+ u64 tlb_flush;
+ u64 invlpg;
+
+ u64 exits;
+ u64 io_exits;
+ u64 mmio_exits;
+ u64 signal_exits;
+ u64 irq_window_exits;
+ u64 nmi_window_exits;
+ u64 l1d_flush;
+ u64 halt_exits;
+ u64 halt_successful_poll;
+ u64 halt_attempted_poll;
+ u64 halt_poll_invalid;
+ u64 halt_wakeup;
+ u64 request_irq_exits;
+ u64 irq_exits;
+ u64 host_state_reload;
+ u64 fpu_reload;
+ u64 insn_emulation;
+ u64 insn_emulation_fail;
+ u64 hypercalls;
+ u64 irq_injections;
+ u64 nmi_injections;
+ u64 req_event;
+};
+
+struct x86_instruction_info;
+
+struct msr_data {
+ bool host_initiated;
+ u32 index;
+ u64 data;
+};
+
+struct kvm_lapic_irq {
+ u32 vector;
+ u16 delivery_mode;
+ u16 dest_mode;
+ bool level;
+ u16 trig_mode;
+ u32 shorthand;
+ u32 dest_id;
+ bool msi_redir_hint;
+};
+
+struct kvm_x86_ops {
+ int (*cpu_has_kvm_support)(void); /* __init */
+ int (*disabled_by_bios)(void); /* __init */
+ int (*hardware_enable)(void);
+ void (*hardware_disable)(void);
+ void (*check_processor_compatibility)(void *rtn);
+ int (*hardware_setup)(void); /* __init */
+ void (*hardware_unsetup)(void); /* __exit */
+ bool (*cpu_has_accelerated_tpr)(void);
+ bool (*has_emulated_msr)(int index);
+ void (*cpuid_update)(struct kvm_vcpu *vcpu);
+
+ struct kvm *(*vm_alloc)(void);
+ void (*vm_free)(struct kvm *);
+ int (*vm_init)(struct kvm *kvm);
+ void (*vm_destroy)(struct kvm *kvm);
+
+ /* Create, but do not attach this VCPU */
+ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
+ void (*vcpu_free)(struct kvm_vcpu *vcpu);
+ void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
+
+ void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
+ void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
+ void (*vcpu_put)(struct kvm_vcpu *vcpu);
+
+ void (*update_bp_intercept)(struct kvm_vcpu *vcpu);
+ int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
+ int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
+ u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
+ void (*get_segment)(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
+ int (*get_cpl)(struct kvm_vcpu *vcpu);
+ void (*set_segment)(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
+ void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
+ void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
+ void (*decache_cr3)(struct kvm_vcpu *vcpu);
+ void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
+ void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
+ void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
+ int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
+ void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
+ void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+ u64 (*get_dr6)(struct kvm_vcpu *vcpu);
+ void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
+ void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
+ void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
+ void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
+ unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
+ void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+
+ void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa);
+ int (*tlb_remote_flush)(struct kvm *kvm);
+
+ /*
+ * Flush any TLB entries associated with the given GVA.
+ * Does not need to flush GPA->HPA mappings.
+ * Can potentially get non-canonical addresses through INVLPGs, which
+ * the implementation may choose to ignore if appropriate.
+ */
+ void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr);
+
+ void (*run)(struct kvm_vcpu *vcpu);
+ int (*handle_exit)(struct kvm_vcpu *vcpu);
+ void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+ void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
+ u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
+ void (*patch_hypercall)(struct kvm_vcpu *vcpu,
+ unsigned char *hypercall_addr);
+ void (*set_irq)(struct kvm_vcpu *vcpu);
+ void (*set_nmi)(struct kvm_vcpu *vcpu);
+ void (*queue_exception)(struct kvm_vcpu *vcpu);
+ void (*cancel_injection)(struct kvm_vcpu *vcpu);
+ int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
+ int (*nmi_allowed)(struct kvm_vcpu *vcpu);
+ bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
+ void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
+ void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
+ void (*enable_irq_window)(struct kvm_vcpu *vcpu);
+ void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+ bool (*get_enable_apicv)(struct kvm_vcpu *vcpu);
+ void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
+ void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
+ void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
+ bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu);
+ void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+ void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
+ void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
+ int (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
+ int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
+ int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
+ int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
+ int (*get_tdp_level)(struct kvm_vcpu *vcpu);
+ u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+ int (*get_lpage_level)(void);
+ bool (*rdtscp_supported)(void);
+ bool (*invpcid_supported)(void);
+
+ void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
+
+ void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
+
+ bool (*has_wbinvd_exit)(void);
+
+ u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu);
+ /* Returns actual tsc_offset set in active VMCS */
+ u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+
+ void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
+
+ int (*check_intercept)(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage);
+ void (*handle_external_intr)(struct kvm_vcpu *vcpu);
+ bool (*mpx_supported)(void);
+ bool (*xsaves_supported)(void);
+ bool (*umip_emulated)(void);
+
+ int (*check_nested_events)(struct kvm_vcpu *vcpu);
+ void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
+
+ void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
+
+ /*
+ * Arch-specific dirty logging hooks. These hooks are only supposed to
+ * be valid if the specific arch has hardware-accelerated dirty logging
+ * mechanism. Currently only for PML on VMX.
+ *
+ * - slot_enable_log_dirty:
+ * called when enabling log dirty mode for the slot.
+ * - slot_disable_log_dirty:
+ * called when disabling log dirty mode for the slot.
+ * also called when slot is created with log dirty disabled.
+ * - flush_log_dirty:
+ * called before reporting dirty_bitmap to userspace.
+ * - enable_log_dirty_pt_masked:
+ * called when reenabling log dirty for the GFNs in the mask after
+ * corresponding bits are cleared in slot->dirty_bitmap.
+ */
+ void (*slot_enable_log_dirty)(struct kvm *kvm,
+ struct kvm_memory_slot *slot);
+ void (*slot_disable_log_dirty)(struct kvm *kvm,
+ struct kvm_memory_slot *slot);
+ void (*flush_log_dirty)(struct kvm *kvm);
+ void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t offset, unsigned long mask);
+ int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
+
+ /* pmu operations of sub-arch */
+ const struct kvm_pmu_ops *pmu_ops;
+
+ /*
+ * Architecture specific hooks for vCPU blocking due to
+ * HLT instruction.
+ * Returns for .pre_block():
+ * - 0 means continue to block the vCPU.
+ * - 1 means we cannot block the vCPU since some event
+ * happens during this period, such as, 'ON' bit in
+ * posted-interrupts descriptor is set.
+ */
+ int (*pre_block)(struct kvm_vcpu *vcpu);
+ void (*post_block)(struct kvm_vcpu *vcpu);
+
+ void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
+ void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
+
+ int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set);
+ void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
+ bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu);
+
+ int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc);
+ void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
+
+ void (*setup_mce)(struct kvm_vcpu *vcpu);
+
+ int (*get_nested_state)(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ unsigned user_data_size);
+ int (*set_nested_state)(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ struct kvm_nested_state *kvm_state);
+ void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
+
+ int (*smi_allowed)(struct kvm_vcpu *vcpu);
+ int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
+ int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase);
+ int (*enable_smi_window)(struct kvm_vcpu *vcpu);
+
+ int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
+ int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+ int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+
+ int (*get_msr_feature)(struct kvm_msr_entry *entry);
+};
+
+struct kvm_arch_async_pf {
+ u32 token;
+ gfn_t gfn;
+ unsigned long cr3;
+ bool direct_map;
+};
+
+extern struct kvm_x86_ops *kvm_x86_ops;
+
+#define __KVM_HAVE_ARCH_VM_ALLOC
+static inline struct kvm *kvm_arch_alloc_vm(void)
+{
+ return kvm_x86_ops->vm_alloc();
+}
+
+static inline void kvm_arch_free_vm(struct kvm *kvm)
+{
+ return kvm_x86_ops->vm_free(kvm);
+}
+
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
+static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
+{
+ if (kvm_x86_ops->tlb_remote_flush &&
+ !kvm_x86_ops->tlb_remote_flush(kvm))
+ return 0;
+ else
+ return -ENOTSUPP;
+}
+
+int kvm_mmu_module_init(void);
+void kvm_mmu_module_exit(void);
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
+int kvm_mmu_create(struct kvm_vcpu *vcpu);
+void kvm_mmu_setup(struct kvm_vcpu *vcpu);
+void kvm_mmu_init_vm(struct kvm *kvm);
+void kvm_mmu_uninit_vm(struct kvm *kvm);
+void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
+ u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
+ u64 acc_track_mask, u64 me_mask);
+
+void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+ struct kvm_memory_slot *memslot);
+void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot);
+void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *memslot);
+void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
+ struct kvm_memory_slot *memslot);
+void kvm_mmu_slot_set_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *memslot);
+void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask);
+void kvm_mmu_zap_all(struct kvm *kvm);
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
+unsigned long kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
+
+int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
+bool pdptrs_changed(struct kvm_vcpu *vcpu);
+
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const void *val, int bytes);
+
+struct kvm_irq_mask_notifier {
+ void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
+ int irq;
+ struct hlist_node link;
+};
+
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask);
+
+extern bool tdp_enabled;
+
+u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
+
+/* control of guest tsc rate supported? */
+extern bool kvm_has_tsc_control;
+/* maximum supported tsc_khz for guests */
+extern u32 kvm_max_guest_tsc_khz;
+/* number of bits of the fractional part of the TSC scaling ratio */
+extern u8 kvm_tsc_scaling_ratio_frac_bits;
+/* maximum allowed value of TSC scaling ratio */
+extern u64 kvm_max_tsc_scaling_ratio;
+/* 1ull << kvm_tsc_scaling_ratio_frac_bits */
+extern u64 kvm_default_tsc_scaling_ratio;
+
+extern u64 kvm_mce_cap_supported;
+
+enum emulation_result {
+ EMULATE_DONE, /* no further processing */
+ EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */
+ EMULATE_FAIL, /* can't emulate this instruction */
+};
+
+#define EMULTYPE_NO_DECODE (1 << 0)
+#define EMULTYPE_TRAP_UD (1 << 1)
+#define EMULTYPE_SKIP (1 << 2)
+#define EMULTYPE_ALLOW_RETRY (1 << 3)
+#define EMULTYPE_NO_UD_ON_FAIL (1 << 4)
+#define EMULTYPE_VMWARE (1 << 5)
+int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
+int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
+ void *insn, int insn_len);
+
+void kvm_enable_efer_bits(u64);
+bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
+int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
+int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
+
+struct x86_emulate_ctxt;
+
+int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
+int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
+int kvm_emulate_halt(struct kvm_vcpu *vcpu);
+int kvm_vcpu_halt(struct kvm_vcpu *vcpu);
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
+
+void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
+void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
+
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
+ int reason, bool has_error_code, u32 error_code);
+
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
+unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
+void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
+void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
+
+unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+bool kvm_rdpmc(struct kvm_vcpu *vcpu);
+
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
+void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
+int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gfn_t gfn, void *data, int offset, int len,
+ u32 access);
+bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
+bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr);
+
+static inline int __kvm_irq_line_state(unsigned long *irq_state,
+ int irq_source_id, int level)
+{
+ /* Logical OR for level trig interrupt */
+ if (level)
+ __set_bit(irq_source_id, irq_state);
+ else
+ __clear_bit(irq_source_id, irq_state);
+
+ return !!(*irq_state);
+}
+
+#define KVM_MMU_ROOT_CURRENT BIT(0)
+#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i)
+#define KVM_MMU_ROOTS_ALL (~0UL)
+
+int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
+void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
+
+void kvm_inject_nmi(struct kvm_vcpu *vcpu);
+
+int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
+void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
+int kvm_mmu_load(struct kvm_vcpu *vcpu);
+void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free);
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+ struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+
+void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu);
+
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
+
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
+ void *insn, int insn_len);
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
+void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
+void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush);
+
+void kvm_enable_tdp(void);
+void kvm_disable_tdp(void);
+
+static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+ struct x86_exception *exception)
+{
+ return gpa;
+}
+
+static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
+{
+ struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
+
+ return (struct kvm_mmu_page *)page_private(page);
+}
+
+static inline u16 kvm_read_ldt(void)
+{
+ u16 ldt;
+ asm("sldt %0" : "=g"(ldt));
+ return ldt;
+}
+
+static inline void kvm_load_ldt(u16 sel)
+{
+ asm("lldt %0" : : "rm"(sel));
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long read_msr(unsigned long msr)
+{
+ u64 value;
+
+ rdmsrl(msr, value);
+ return value;
+}
+#endif
+
+static inline u32 get_rdx_init_val(void)
+{
+ return 0x600; /* P6 family */
+}
+
+static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
+{
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+}
+
+#define TSS_IOPB_BASE_OFFSET 0x66
+#define TSS_BASE_SIZE 0x68
+#define TSS_IOPB_SIZE (65536 / 8)
+#define TSS_REDIRECTION_SIZE (256 / 8)
+#define RMODE_TSS_SIZE \
+ (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
+
+enum {
+ TASK_SWITCH_CALL = 0,
+ TASK_SWITCH_IRET = 1,
+ TASK_SWITCH_JMP = 2,
+ TASK_SWITCH_GATE = 3,
+};
+
+#define HF_GIF_MASK (1 << 0)
+#define HF_HIF_MASK (1 << 1)
+#define HF_VINTR_MASK (1 << 2)
+#define HF_NMI_MASK (1 << 3)
+#define HF_IRET_MASK (1 << 4)
+#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
+#define HF_SMM_MASK (1 << 6)
+#define HF_SMM_INSIDE_NMI_MASK (1 << 7)
+
+#define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
+#define KVM_ADDRESS_SPACE_NUM 2
+
+#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
+#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
+
+asmlinkage void __noreturn kvm_spurious_fault(void);
+
+/*
+ * Hardware virtualization extension instructions may fault if a
+ * reboot turns off virtualization while processes are running.
+ * Usually after catching the fault we just panic; during reboot
+ * instead the instruction is ignored.
+ */
+#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \
+ "666: \n\t" \
+ insn "\n\t" \
+ "jmp 668f \n\t" \
+ "667: \n\t" \
+ "call kvm_spurious_fault \n\t" \
+ "668: \n\t" \
+ ".pushsection .fixup, \"ax\" \n\t" \
+ "700: \n\t" \
+ cleanup_insn "\n\t" \
+ "cmpb $0, kvm_rebooting\n\t" \
+ "je 667b \n\t" \
+ "jmp 668b \n\t" \
+ ".popsection \n\t" \
+ _ASM_EXTABLE(666b, 700b)
+
+#define __kvm_handle_fault_on_reboot(insn) \
+ ____kvm_handle_fault_on_reboot(insn, "")
+
+#define KVM_ARCH_WANT_MMU_NOTIFIER
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
+ bool blockable);
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
+int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_cpu_has_extint(struct kvm_vcpu *v);
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
+void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
+
+int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
+ unsigned long ipi_bitmap_high, u32 min,
+ unsigned long icr, int op_64_bit);
+
+u64 kvm_get_arch_capabilities(void);
+void kvm_define_shared_msr(unsigned index, u32 msr);
+int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
+
+u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc);
+u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
+
+unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu);
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
+
+void kvm_make_mclock_inprogress_request(struct kvm *kvm);
+void kvm_make_scan_ioapic_request(struct kvm *kvm);
+
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
+extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+
+int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
+int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
+void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
+
+int kvm_is_in_guest(void);
+
+int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size);
+int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size);
+bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
+bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
+
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu);
+
+void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq);
+
+static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
+{
+ if (kvm_x86_ops->vcpu_blocking)
+ kvm_x86_ops->vcpu_blocking(vcpu);
+}
+
+static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
+{
+ if (kvm_x86_ops->vcpu_unblocking)
+ kvm_x86_ops->vcpu_unblocking(vcpu);
+}
+
+static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
+
+static inline int kvm_cpu_get_apicid(int mps_cpu)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ return default_cpu_present_to_apicid(mps_cpu);
+#else
+ WARN_ON_ONCE(1);
+ return BAD_APICID;
+#endif
+}
+
+#define put_smstate(type, buf, offset, val) \
+ *(type *)((buf) + (offset) - 0x7e00) = val
+
+#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
new file mode 100644
index 000000000..172f9749d
--- /dev/null
+++ b/arch/x86/include/asm/kvm_page_track.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KVM_PAGE_TRACK_H
+#define _ASM_X86_KVM_PAGE_TRACK_H
+
+enum kvm_page_track_mode {
+ KVM_PAGE_TRACK_WRITE,
+ KVM_PAGE_TRACK_MAX,
+};
+
+/*
+ * The notifier represented by @kvm_page_track_notifier_node is linked into
+ * the head which will be notified when guest is triggering the track event.
+ *
+ * Write access on the head is protected by kvm->mmu_lock, read access
+ * is protected by track_srcu.
+ */
+struct kvm_page_track_notifier_head {
+ struct srcu_struct track_srcu;
+ struct hlist_head track_notifier_list;
+};
+
+struct kvm_page_track_notifier_node {
+ struct hlist_node node;
+
+ /*
+ * It is called when guest is writing the write-tracked page
+ * and write emulation is finished at that time.
+ *
+ * @vcpu: the vcpu where the write access happened.
+ * @gpa: the physical address written by guest.
+ * @new: the data was written to the address.
+ * @bytes: the written length.
+ * @node: this node
+ */
+ void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+ int bytes, struct kvm_page_track_notifier_node *node);
+ /*
+ * It is called when memory slot is being moved or removed
+ * users can drop write-protection for the pages in that memory slot
+ *
+ * @kvm: the kvm where memory slot being moved or removed
+ * @slot: the memory slot being moved or removed
+ * @node: this node
+ */
+ void (*track_flush_slot)(struct kvm *kvm, struct kvm_memory_slot *slot,
+ struct kvm_page_track_notifier_node *node);
+};
+
+void kvm_page_track_init(struct kvm *kvm);
+void kvm_page_track_cleanup(struct kvm *kvm);
+
+void kvm_page_track_free_memslot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont);
+int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
+ unsigned long npages);
+
+void kvm_slot_page_track_add_page(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ enum kvm_page_track_mode mode);
+void kvm_slot_page_track_remove_page(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ enum kvm_page_track_mode mode);
+bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
+ enum kvm_page_track_mode mode);
+
+void
+kvm_page_track_register_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n);
+void
+kvm_page_track_unregister_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n);
+void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+ int bytes);
+void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot);
+#endif
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
new file mode 100644
index 000000000..4c723632c
--- /dev/null
+++ b/arch/x86/include/asm/kvm_para.h
@@ -0,0 +1,134 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KVM_PARA_H
+#define _ASM_X86_KVM_PARA_H
+
+#include <asm/processor.h>
+#include <asm/alternative.h>
+#include <uapi/asm/kvm_para.h>
+
+extern void kvmclock_init(void);
+
+#ifdef CONFIG_KVM_GUEST
+bool kvm_check_and_clear_guest_paused(void);
+#else
+static inline bool kvm_check_and_clear_guest_paused(void)
+{
+ return false;
+}
+#endif /* CONFIG_KVM_GUEST */
+
+#define KVM_HYPERCALL \
+ ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL)
+
+/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
+ * instruction. The hypervisor may replace it with something else but only the
+ * instructions are guaranteed to be supported.
+ *
+ * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
+ * The hypercall number should be placed in rax and the return value will be
+ * placed in rax. No other registers will be clobbered unless explicitly
+ * noted by the particular hypercall.
+ */
+
+static inline long kvm_hypercall0(unsigned int nr)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr)
+ : "memory");
+ return ret;
+}
+
+static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1)
+ : "memory");
+ return ret;
+}
+
+static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
+ unsigned long p2)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2)
+ : "memory");
+ return ret;
+}
+
+static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
+ unsigned long p2, unsigned long p3)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2), "d"(p3)
+ : "memory");
+ return ret;
+}
+
+static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
+ unsigned long p2, unsigned long p3,
+ unsigned long p4)
+{
+ long ret;
+ asm volatile(KVM_HYPERCALL
+ : "=a"(ret)
+ : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)
+ : "memory");
+ return ret;
+}
+
+#ifdef CONFIG_KVM_GUEST
+bool kvm_para_available(void);
+unsigned int kvm_arch_para_features(void);
+unsigned int kvm_arch_para_hints(void);
+void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
+void kvm_async_pf_task_wake(u32 token);
+u32 kvm_read_and_reset_pf_reason(void);
+extern void kvm_disable_steal_time(void);
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+void __init kvm_spinlock_init(void);
+#else /* !CONFIG_PARAVIRT_SPINLOCKS */
+static inline void kvm_spinlock_init(void)
+{
+}
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
+#else /* CONFIG_KVM_GUEST */
+#define kvm_async_pf_task_wait(T, I) do {} while(0)
+#define kvm_async_pf_task_wake(T) do {} while(0)
+
+static inline bool kvm_para_available(void)
+{
+ return false;
+}
+
+static inline unsigned int kvm_arch_para_features(void)
+{
+ return 0;
+}
+
+static inline unsigned int kvm_arch_para_hints(void)
+{
+ return 0;
+}
+
+static inline u32 kvm_read_and_reset_pf_reason(void)
+{
+ return 0;
+}
+
+static inline void kvm_disable_steal_time(void)
+{
+ return;
+}
+#endif
+
+#endif /* _ASM_X86_KVM_PARA_H */
diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h
new file mode 100644
index 000000000..6c5765192
--- /dev/null
+++ b/arch/x86/include/asm/kvmclock.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KVM_CLOCK_H
+#define _ASM_X86_KVM_CLOCK_H
+
+#include <linux/percpu.h>
+
+extern struct clocksource kvm_clock;
+
+DECLARE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
+
+static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
+{
+ return &this_cpu_read(hv_clock_per_cpu)->pvti;
+}
+
+static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void)
+{
+ return this_cpu_read(hv_clock_per_cpu);
+}
+
+#endif /* _ASM_X86_KVM_CLOCK_H */
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
new file mode 100644
index 000000000..14caa9d9f
--- /dev/null
+++ b/arch/x86/include/asm/linkage.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_LINKAGE_H
+#define _ASM_X86_LINKAGE_H
+
+#include <linux/stringify.h>
+
+#undef notrace
+#define notrace __attribute__((no_instrument_function))
+
+#ifdef CONFIG_X86_32
+#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
+#endif /* CONFIG_X86_32 */
+
+#ifdef __ASSEMBLY__
+
+#define GLOBAL(name) \
+ .globl name; \
+ name:
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_ALIGNMENT_16)
+#define __ALIGN .p2align 4, 0x90
+#define __ALIGN_STR __stringify(__ALIGN)
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_LINKAGE_H */
+
diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
new file mode 100644
index 000000000..ed80003ce
--- /dev/null
+++ b/arch/x86/include/asm/livepatch.h
@@ -0,0 +1,40 @@
+/*
+ * livepatch.h - x86-specific Kernel Live Patching Core
+ *
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2014 SUSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _ASM_X86_LIVEPATCH_H
+#define _ASM_X86_LIVEPATCH_H
+
+#include <asm/setup.h>
+#include <linux/ftrace.h>
+
+static inline int klp_check_compiler_support(void)
+{
+#ifndef CC_USING_FENTRY
+ return 1;
+#endif
+ return 0;
+}
+
+static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
+{
+ regs->ip = ip;
+}
+
+#endif /* _ASM_X86_LIVEPATCH_H */
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
new file mode 100644
index 000000000..c91083c59
--- /dev/null
+++ b/arch/x86/include/asm/local.h
@@ -0,0 +1,162 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_LOCAL_H
+#define _ASM_X86_LOCAL_H
+
+#include <linux/percpu.h>
+
+#include <linux/atomic.h>
+#include <asm/asm.h>
+
+typedef struct {
+ atomic_long_t a;
+} local_t;
+
+#define LOCAL_INIT(i) { ATOMIC_LONG_INIT(i) }
+
+#define local_read(l) atomic_long_read(&(l)->a)
+#define local_set(l, i) atomic_long_set(&(l)->a, (i))
+
+static inline void local_inc(local_t *l)
+{
+ asm volatile(_ASM_INC "%0"
+ : "+m" (l->a.counter));
+}
+
+static inline void local_dec(local_t *l)
+{
+ asm volatile(_ASM_DEC "%0"
+ : "+m" (l->a.counter));
+}
+
+static inline void local_add(long i, local_t *l)
+{
+ asm volatile(_ASM_ADD "%1,%0"
+ : "+m" (l->a.counter)
+ : "ir" (i));
+}
+
+static inline void local_sub(long i, local_t *l)
+{
+ asm volatile(_ASM_SUB "%1,%0"
+ : "+m" (l->a.counter)
+ : "ir" (i));
+}
+
+/**
+ * local_sub_and_test - subtract value from variable and test result
+ * @i: integer value to subtract
+ * @l: pointer to type local_t
+ *
+ * Atomically subtracts @i from @l and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline bool local_sub_and_test(long i, local_t *l)
+{
+ GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, "er", i, "%0", e);
+}
+
+/**
+ * local_dec_and_test - decrement and test
+ * @l: pointer to type local_t
+ *
+ * Atomically decrements @l by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline bool local_dec_and_test(local_t *l)
+{
+ GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", e);
+}
+
+/**
+ * local_inc_and_test - increment and test
+ * @l: pointer to type local_t
+ *
+ * Atomically increments @l by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline bool local_inc_and_test(local_t *l)
+{
+ GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", e);
+}
+
+/**
+ * local_add_negative - add and test if negative
+ * @i: integer value to add
+ * @l: pointer to type local_t
+ *
+ * Atomically adds @i to @l and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline bool local_add_negative(long i, local_t *l)
+{
+ GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, "er", i, "%0", s);
+}
+
+/**
+ * local_add_return - add and return
+ * @i: integer value to add
+ * @l: pointer to type local_t
+ *
+ * Atomically adds @i to @l and returns @i + @l
+ */
+static inline long local_add_return(long i, local_t *l)
+{
+ long __i = i;
+ asm volatile(_ASM_XADD "%0, %1;"
+ : "+r" (i), "+m" (l->a.counter)
+ : : "memory");
+ return i + __i;
+}
+
+static inline long local_sub_return(long i, local_t *l)
+{
+ return local_add_return(-i, l);
+}
+
+#define local_inc_return(l) (local_add_return(1, l))
+#define local_dec_return(l) (local_sub_return(1, l))
+
+#define local_cmpxchg(l, o, n) \
+ (cmpxchg_local(&((l)->a.counter), (o), (n)))
+/* Always has a lock prefix */
+#define local_xchg(l, n) (xchg(&((l)->a.counter), (n)))
+
+/**
+ * local_add_unless - add unless the number is a given value
+ * @l: pointer of type local_t
+ * @a: the amount to add to l...
+ * @u: ...unless l is equal to u.
+ *
+ * Atomically adds @a to @l, so long as it was not @u.
+ * Returns non-zero if @l was not @u, and zero otherwise.
+ */
+#define local_add_unless(l, a, u) \
+({ \
+ long c, old; \
+ c = local_read((l)); \
+ for (;;) { \
+ if (unlikely(c == (u))) \
+ break; \
+ old = local_cmpxchg((l), c, c + (a)); \
+ if (likely(old == c)) \
+ break; \
+ c = old; \
+ } \
+ c != (u); \
+})
+#define local_inc_not_zero(l) local_add_unless((l), 1, 0)
+
+/* On x86_32, these are no better than the atomic variants.
+ * On x86-64 these are better than the atomic variants on SMP kernels
+ * because they dont use a lock prefix.
+ */
+#define __local_inc(l) local_inc(l)
+#define __local_dec(l) local_dec(l)
+#define __local_add(i, l) local_add((i), (l))
+#define __local_sub(i, l) local_sub((i), (l))
+
+#endif /* _ASM_X86_LOCAL_H */
diff --git a/arch/x86/include/asm/local64.h b/arch/x86/include/asm/local64.h
new file mode 100644
index 000000000..36c93b5cc
--- /dev/null
+++ b/arch/x86/include/asm/local64.h
@@ -0,0 +1 @@
+#include <asm-generic/local64.h>
diff --git a/arch/x86/include/asm/mach_timer.h b/arch/x86/include/asm/mach_timer.h
new file mode 100644
index 000000000..044daf6fb
--- /dev/null
+++ b/arch/x86/include/asm/mach_timer.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Machine specific calibrate_tsc() for generic.
+ * Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp>
+ */
+/* ------ Calibrate the TSC -------
+ * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset().
+ * Too much 64-bit arithmetic here to do this cleanly in C, and for
+ * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
+ * output busy loop as low as possible. We avoid reading the CTC registers
+ * directly because of the awkward 8-bit access mechanism of the 82C54
+ * device.
+ */
+#ifndef _ASM_X86_MACH_DEFAULT_MACH_TIMER_H
+#define _ASM_X86_MACH_DEFAULT_MACH_TIMER_H
+
+#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */
+#define CALIBRATE_LATCH \
+ ((PIT_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000)
+
+static inline void mach_prepare_counter(void)
+{
+ /* Set the Gate high, disable speaker */
+ outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+
+ /*
+ * Now let's take care of CTC channel 2
+ *
+ * Set the Gate high, program CTC channel 2 for mode 0,
+ * (interrupt on terminal count mode), binary count,
+ * load 5 * LATCH count, (LSB and MSB) to begin countdown.
+ *
+ * Some devices need a delay here.
+ */
+ outb(0xb0, 0x43); /* binary, mode 0, LSB/MSB, Ch 2 */
+ outb_p(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */
+ outb_p(CALIBRATE_LATCH >> 8, 0x42); /* MSB of count */
+}
+
+static inline void mach_countup(unsigned long *count_p)
+{
+ unsigned long count = 0;
+ do {
+ count++;
+ } while ((inb_p(0x61) & 0x20) == 0);
+ *count_p = count;
+}
+
+#endif /* _ASM_X86_MACH_DEFAULT_MACH_TIMER_H */
diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h
new file mode 100644
index 000000000..e39a51746
--- /dev/null
+++ b/arch/x86/include/asm/mach_traps.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Machine specific NMI handling for generic.
+ * Split out from traps.c by Osamu Tomita <tomita@cinet.co.jp>
+ */
+#ifndef _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H
+#define _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H
+
+#include <asm/mc146818rtc.h>
+
+#define NMI_REASON_PORT 0x61
+
+#define NMI_REASON_SERR 0x80
+#define NMI_REASON_IOCHK 0x40
+#define NMI_REASON_MASK (NMI_REASON_SERR | NMI_REASON_IOCHK)
+
+#define NMI_REASON_CLEAR_SERR 0x04
+#define NMI_REASON_CLEAR_IOCHK 0x08
+#define NMI_REASON_CLEAR_MASK 0x0f
+
+static inline unsigned char default_get_nmi_reason(void)
+{
+ return inb(NMI_REASON_PORT);
+}
+
+static inline void reassert_nmi(void)
+{
+ int old_reg = -1;
+
+ if (do_i_have_lock_cmos())
+ old_reg = current_lock_cmos_reg();
+ else
+ lock_cmos(0); /* register doesn't matter here */
+ outb(0x8f, 0x70);
+ inb(0x71); /* dummy */
+ outb(0x0f, 0x70);
+ inb(0x71); /* dummy */
+ if (old_reg >= 0)
+ outb(old_reg, 0x70);
+ else
+ unlock_cmos();
+}
+
+#endif /* _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H */
diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h
new file mode 100644
index 000000000..3c4274308
--- /dev/null
+++ b/arch/x86/include/asm/math_emu.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MATH_EMU_H
+#define _ASM_X86_MATH_EMU_H
+
+#include <asm/ptrace.h>
+
+/* This structure matches the layout of the data saved to the stack
+ following a device-not-present interrupt, part of it saved
+ automatically by the 80386/80486.
+ */
+struct math_emu_info {
+ long ___orig_eip;
+ struct pt_regs *regs;
+};
+#endif /* _ASM_X86_MATH_EMU_H */
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h
new file mode 100644
index 000000000..97198001e
--- /dev/null
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Machine dependent access functions for RTC registers.
+ */
+#ifndef _ASM_X86_MC146818RTC_H
+#define _ASM_X86_MC146818RTC_H
+
+#include <asm/io.h>
+#include <asm/processor.h>
+
+#ifndef RTC_PORT
+#define RTC_PORT(x) (0x70 + (x))
+#define RTC_ALWAYS_BCD 1 /* RTC operates in binary mode */
+#endif
+
+#if defined(CONFIG_X86_32)
+/*
+ * This lock provides nmi access to the CMOS/RTC registers. It has some
+ * special properties. It is owned by a CPU and stores the index register
+ * currently being accessed (if owned). The idea here is that it works
+ * like a normal lock (normally). However, in an NMI, the NMI code will
+ * first check to see if its CPU owns the lock, meaning that the NMI
+ * interrupted during the read/write of the device. If it does, it goes ahead
+ * and performs the access and then restores the index register. If it does
+ * not, it locks normally.
+ *
+ * Note that since we are working with NMIs, we need this lock even in
+ * a non-SMP machine just to mark that the lock is owned.
+ *
+ * This only works with compare-and-swap. There is no other way to
+ * atomically claim the lock and set the owner.
+ */
+#include <linux/smp.h>
+extern volatile unsigned long cmos_lock;
+
+/*
+ * All of these below must be called with interrupts off, preempt
+ * disabled, etc.
+ */
+
+static inline void lock_cmos(unsigned char reg)
+{
+ unsigned long new;
+ new = ((smp_processor_id() + 1) << 8) | reg;
+ for (;;) {
+ if (cmos_lock) {
+ cpu_relax();
+ continue;
+ }
+ if (__cmpxchg(&cmos_lock, 0, new, sizeof(cmos_lock)) == 0)
+ return;
+ }
+}
+
+static inline void unlock_cmos(void)
+{
+ cmos_lock = 0;
+}
+
+static inline int do_i_have_lock_cmos(void)
+{
+ return (cmos_lock >> 8) == (smp_processor_id() + 1);
+}
+
+static inline unsigned char current_lock_cmos_reg(void)
+{
+ return cmos_lock & 0xff;
+}
+
+#define lock_cmos_prefix(reg) \
+ do { \
+ unsigned long cmos_flags; \
+ local_irq_save(cmos_flags); \
+ lock_cmos(reg)
+
+#define lock_cmos_suffix(reg) \
+ unlock_cmos(); \
+ local_irq_restore(cmos_flags); \
+ } while (0)
+#else
+#define lock_cmos_prefix(reg) do {} while (0)
+#define lock_cmos_suffix(reg) do {} while (0)
+#define lock_cmos(reg) do { } while (0)
+#define unlock_cmos() do { } while (0)
+#define do_i_have_lock_cmos() 0
+#define current_lock_cmos_reg() 0
+#endif
+
+/*
+ * The yet supported machines all access the RTC index register via
+ * an ISA port access but the way to access the date register differs ...
+ */
+#define CMOS_READ(addr) rtc_cmos_read(addr)
+#define CMOS_WRITE(val, addr) rtc_cmos_write(val, addr)
+unsigned char rtc_cmos_read(unsigned char addr);
+void rtc_cmos_write(unsigned char val, unsigned char addr);
+
+extern int mach_set_rtc_mmss(const struct timespec64 *now);
+extern void mach_get_cmos_time(struct timespec64 *now);
+
+#define RTC_IRQ 8
+
+#endif /* _ASM_X86_MC146818RTC_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
new file mode 100644
index 000000000..eb786f90f
--- /dev/null
+++ b/arch/x86/include/asm/mce.h
@@ -0,0 +1,344 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MCE_H
+#define _ASM_X86_MCE_H
+
+#include <uapi/asm/mce.h>
+
+/*
+ * Machine Check support for x86
+ */
+
+/* MCG_CAP register defines */
+#define MCG_BANKCNT_MASK 0xff /* Number of Banks */
+#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */
+#define MCG_EXT_P (1ULL<<9) /* Extended registers available */
+#define MCG_CMCI_P (1ULL<<10) /* CMCI supported */
+#define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */
+#define MCG_EXT_CNT_SHIFT 16
+#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
+#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */
+#define MCG_ELOG_P (1ULL<<26) /* Extended error log supported */
+#define MCG_LMCE_P (1ULL<<27) /* Local machine check supported */
+
+/* MCG_STATUS register defines */
+#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
+#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */
+#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
+#define MCG_STATUS_LMCES (1ULL<<3) /* LMCE signaled */
+
+/* MCG_EXT_CTL register defines */
+#define MCG_EXT_CTL_LMCE_EN (1ULL<<0) /* Enable LMCE */
+
+/* MCi_STATUS register defines */
+#define MCI_STATUS_VAL (1ULL<<63) /* valid error */
+#define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */
+#define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */
+#define MCI_STATUS_EN (1ULL<<60) /* error enabled */
+#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
+#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
+#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */
+#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
+#define MCI_STATUS_AR (1ULL<<55) /* Action required */
+
+/* AMD-specific bits */
+#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */
+#define MCI_STATUS_SYNDV (1ULL<<53) /* synd reg. valid */
+#define MCI_STATUS_DEFERRED (1ULL<<44) /* uncorrected error, deferred exception */
+#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
+
+/*
+ * McaX field if set indicates a given bank supports MCA extensions:
+ * - Deferred error interrupt type is specifiable by bank.
+ * - MCx_MISC0[BlkPtr] field indicates presence of extended MISC registers,
+ * But should not be used to determine MSR numbers.
+ * - TCC bit is present in MCx_STATUS.
+ */
+#define MCI_CONFIG_MCAX 0x1
+#define MCI_IPID_MCATYPE 0xFFFF0000
+#define MCI_IPID_HWID 0xFFF
+
+/*
+ * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
+ * bits 15:0. But bit 12 is the 'F' bit, defined for corrected
+ * errors to indicate that errors are being filtered by hardware.
+ * We should mask out bit 12 when looking for specific signatures
+ * of uncorrected errors - so the F bit is deliberately skipped
+ * in this #define.
+ */
+#define MCACOD 0xefff /* MCA Error Code */
+
+/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
+#define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */
+#define MCACOD_SCRUBMSK 0xeff0 /* Skip bit 12 ('F' bit) */
+#define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */
+#define MCACOD_DATA 0x0134 /* Data Load */
+#define MCACOD_INSTR 0x0150 /* Instruction Fetch */
+
+/* MCi_MISC register defines */
+#define MCI_MISC_ADDR_LSB(m) ((m) & 0x3f)
+#define MCI_MISC_ADDR_MODE(m) (((m) >> 6) & 7)
+#define MCI_MISC_ADDR_SEGOFF 0 /* segment offset */
+#define MCI_MISC_ADDR_LINEAR 1 /* linear address */
+#define MCI_MISC_ADDR_PHYS 2 /* physical address */
+#define MCI_MISC_ADDR_MEM 3 /* memory address */
+#define MCI_MISC_ADDR_GENERIC 7 /* generic */
+
+/* CTL2 register defines */
+#define MCI_CTL2_CMCI_EN (1ULL << 30)
+#define MCI_CTL2_CMCI_THRESHOLD_MASK 0x7fffULL
+
+#define MCJ_CTX_MASK 3
+#define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK)
+#define MCJ_CTX_RANDOM 0 /* inject context: random */
+#define MCJ_CTX_PROCESS 0x1 /* inject context: process */
+#define MCJ_CTX_IRQ 0x2 /* inject context: IRQ */
+#define MCJ_NMI_BROADCAST 0x4 /* do NMI broadcasting */
+#define MCJ_EXCEPTION 0x8 /* raise as exception */
+#define MCJ_IRQ_BROADCAST 0x10 /* do IRQ broadcasting */
+
+#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */
+
+#define MCE_LOG_LEN 32
+#define MCE_LOG_SIGNATURE "MACHINECHECK"
+
+/* AMD Scalable MCA */
+#define MSR_AMD64_SMCA_MC0_CTL 0xc0002000
+#define MSR_AMD64_SMCA_MC0_STATUS 0xc0002001
+#define MSR_AMD64_SMCA_MC0_ADDR 0xc0002002
+#define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003
+#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004
+#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005
+#define MSR_AMD64_SMCA_MC0_SYND 0xc0002006
+#define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008
+#define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009
+#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a
+#define MSR_AMD64_SMCA_MCx_CTL(x) (MSR_AMD64_SMCA_MC0_CTL + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_STATUS(x) (MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_ADDR(x) (MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_SYND(x) (MSR_AMD64_SMCA_MC0_SYND + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))
+
+/*
+ * This structure contains all data related to the MCE log. Also
+ * carries a signature to make it easier to find from external
+ * debugging tools. Each entry is only valid when its finished flag
+ * is set.
+ */
+struct mce_log_buffer {
+ char signature[12]; /* "MACHINECHECK" */
+ unsigned len; /* = MCE_LOG_LEN */
+ unsigned next;
+ unsigned flags;
+ unsigned recordlen; /* length of struct mce */
+ struct mce entry[MCE_LOG_LEN];
+};
+
+enum mce_notifier_prios {
+ MCE_PRIO_FIRST = INT_MAX,
+ MCE_PRIO_SRAO = INT_MAX - 1,
+ MCE_PRIO_EXTLOG = INT_MAX - 2,
+ MCE_PRIO_NFIT = INT_MAX - 3,
+ MCE_PRIO_EDAC = INT_MAX - 4,
+ MCE_PRIO_MCELOG = 1,
+ MCE_PRIO_LOWEST = 0,
+};
+
+struct notifier_block;
+extern void mce_register_decode_chain(struct notifier_block *nb);
+extern void mce_unregister_decode_chain(struct notifier_block *nb);
+
+#include <linux/percpu.h>
+#include <linux/atomic.h>
+
+extern int mce_p5_enabled;
+
+#ifdef CONFIG_X86_MCE
+int mcheck_init(void);
+void mcheck_cpu_init(struct cpuinfo_x86 *c);
+void mcheck_cpu_clear(struct cpuinfo_x86 *c);
+void mcheck_vendor_init_severity(void);
+#else
+static inline int mcheck_init(void) { return 0; }
+static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
+static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {}
+static inline void mcheck_vendor_init_severity(void) {}
+#endif
+
+#ifdef CONFIG_X86_ANCIENT_MCE
+void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
+void winchip_mcheck_init(struct cpuinfo_x86 *c);
+static inline void enable_p5_mce(void) { mce_p5_enabled = 1; }
+#else
+static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
+static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
+static inline void enable_p5_mce(void) {}
+#endif
+
+void mce_setup(struct mce *m);
+void mce_log(struct mce *m);
+DECLARE_PER_CPU(struct device *, mce_device);
+
+/*
+ * Maximum banks number.
+ * This is the limit of the current register layout on
+ * Intel CPUs.
+ */
+#define MAX_NR_BANKS 32
+
+#ifdef CONFIG_X86_MCE_INTEL
+void mce_intel_feature_init(struct cpuinfo_x86 *c);
+void mce_intel_feature_clear(struct cpuinfo_x86 *c);
+void cmci_clear(void);
+void cmci_reenable(void);
+void cmci_rediscover(void);
+void cmci_recheck(void);
+#else
+static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
+static inline void mce_intel_feature_clear(struct cpuinfo_x86 *c) { }
+static inline void cmci_clear(void) {}
+static inline void cmci_reenable(void) {}
+static inline void cmci_rediscover(void) {}
+static inline void cmci_recheck(void) {}
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+void mce_amd_feature_init(struct cpuinfo_x86 *c);
+int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr);
+#else
+static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
+static inline int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return -EINVAL; };
+#endif
+
+int mce_available(struct cpuinfo_x86 *c);
+bool mce_is_memory_error(struct mce *m);
+bool mce_is_correctable(struct mce *m);
+int mce_usable_address(struct mce *m);
+
+DECLARE_PER_CPU(unsigned, mce_exception_count);
+DECLARE_PER_CPU(unsigned, mce_poll_count);
+
+typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
+DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
+
+enum mcp_flags {
+ MCP_TIMESTAMP = BIT(0), /* log time stamp */
+ MCP_UC = BIT(1), /* log uncorrected errors */
+ MCP_DONTLOG = BIT(2), /* only clear, don't log */
+};
+bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
+
+int mce_notify_irq(void);
+
+DECLARE_PER_CPU(struct mce, injectm);
+
+/* Disable CMCI/polling for MCA bank claimed by firmware */
+extern void mce_disable_bank(int bank);
+
+/*
+ * Exception handler
+ */
+
+/* Call the installed machine check handler for this CPU setup. */
+extern void (*machine_check_vector)(struct pt_regs *, long error_code);
+void do_machine_check(struct pt_regs *, long);
+
+/*
+ * Threshold handler
+ */
+extern void (*mce_threshold_vector)(void);
+
+/* Deferred error interrupt handler */
+extern void (*deferred_error_int_vector)(void);
+
+/*
+ * Thermal handler
+ */
+
+void intel_init_thermal(struct cpuinfo_x86 *c);
+
+/* Interrupt Handler for core thermal thresholds */
+extern int (*platform_thermal_notify)(__u64 msr_val);
+
+/* Interrupt Handler for package thermal thresholds */
+extern int (*platform_thermal_package_notify)(__u64 msr_val);
+
+/* Callback support of rate control, return true, if
+ * callback has rate control */
+extern bool (*platform_thermal_package_rate_control)(void);
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+extern void mcheck_intel_therm_init(void);
+#else
+static inline void mcheck_intel_therm_init(void) { }
+#endif
+
+/*
+ * Used by APEI to report memory error via /dev/mcelog
+ */
+
+struct cper_sec_mem_err;
+extern void apei_mce_report_mem_error(int corrected,
+ struct cper_sec_mem_err *mem_err);
+
+/*
+ * Enumerate new IP types and HWID values in AMD processors which support
+ * Scalable MCA.
+ */
+#ifdef CONFIG_X86_MCE_AMD
+
+/* These may be used by multiple smca_hwid_mcatypes */
+enum smca_bank_types {
+ SMCA_LS = 0, /* Load Store */
+ SMCA_IF, /* Instruction Fetch */
+ SMCA_L2_CACHE, /* L2 Cache */
+ SMCA_DE, /* Decoder Unit */
+ SMCA_RESERVED, /* Reserved */
+ SMCA_EX, /* Execution Unit */
+ SMCA_FP, /* Floating Point */
+ SMCA_L3_CACHE, /* L3 Cache */
+ SMCA_CS, /* Coherent Slave */
+ SMCA_PIE, /* Power, Interrupts, etc. */
+ SMCA_UMC, /* Unified Memory Controller */
+ SMCA_PB, /* Parameter Block */
+ SMCA_PSP, /* Platform Security Processor */
+ SMCA_SMU, /* System Management Unit */
+ N_SMCA_BANK_TYPES
+};
+
+#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
+
+struct smca_hwid {
+ unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
+ u32 hwid_mcatype; /* (hwid,mcatype) tuple */
+ u32 xec_bitmap; /* Bitmap of valid ExtErrorCodes; current max is 21. */
+ u8 count; /* Number of instances. */
+};
+
+struct smca_bank {
+ struct smca_hwid *hwid;
+ u32 id; /* Value of MCA_IPID[InstanceId]. */
+ u8 sysfs_id; /* Value used for sysfs name. */
+};
+
+extern struct smca_bank smca_banks[MAX_NR_BANKS];
+
+extern const char *smca_get_long_name(enum smca_bank_types t);
+extern bool amd_mce_is_memory_error(struct mce *m);
+
+extern int mce_threshold_create_device(unsigned int cpu);
+extern int mce_threshold_remove_device(unsigned int cpu);
+
+#else
+
+static inline int mce_threshold_create_device(unsigned int cpu) { return 0; };
+static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; };
+static inline bool amd_mce_is_memory_error(struct mce *m) { return false; };
+
+#endif
+
+#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/mcsafe_test.h b/arch/x86/include/asm/mcsafe_test.h
new file mode 100644
index 000000000..eb59804b6
--- /dev/null
+++ b/arch/x86/include/asm/mcsafe_test.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MCSAFE_TEST_H_
+#define _MCSAFE_TEST_H_
+
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_MCSAFE_TEST
+extern unsigned long mcsafe_test_src;
+extern unsigned long mcsafe_test_dst;
+
+static inline void mcsafe_inject_src(void *addr)
+{
+ if (addr)
+ mcsafe_test_src = (unsigned long) addr;
+ else
+ mcsafe_test_src = ~0UL;
+}
+
+static inline void mcsafe_inject_dst(void *addr)
+{
+ if (addr)
+ mcsafe_test_dst = (unsigned long) addr;
+ else
+ mcsafe_test_dst = ~0UL;
+}
+#else /* CONFIG_MCSAFE_TEST */
+static inline void mcsafe_inject_src(void *addr)
+{
+}
+
+static inline void mcsafe_inject_dst(void *addr)
+{
+}
+#endif /* CONFIG_MCSAFE_TEST */
+
+#else /* __ASSEMBLY__ */
+#include <asm/export.h>
+
+#ifdef CONFIG_MCSAFE_TEST
+.macro MCSAFE_TEST_CTL
+ .pushsection .data
+ .align 8
+ .globl mcsafe_test_src
+ mcsafe_test_src:
+ .quad 0
+ EXPORT_SYMBOL_GPL(mcsafe_test_src)
+ .globl mcsafe_test_dst
+ mcsafe_test_dst:
+ .quad 0
+ EXPORT_SYMBOL_GPL(mcsafe_test_dst)
+ .popsection
+.endm
+
+.macro MCSAFE_TEST_SRC reg count target
+ leaq \count(\reg), %r9
+ cmp mcsafe_test_src, %r9
+ ja \target
+.endm
+
+.macro MCSAFE_TEST_DST reg count target
+ leaq \count(\reg), %r9
+ cmp mcsafe_test_dst, %r9
+ ja \target
+.endm
+#else
+.macro MCSAFE_TEST_CTL
+.endm
+
+.macro MCSAFE_TEST_SRC reg count target
+.endm
+
+.macro MCSAFE_TEST_DST reg count target
+.endm
+#endif /* CONFIG_MCSAFE_TEST */
+#endif /* __ASSEMBLY__ */
+#endif /* _MCSAFE_TEST_H_ */
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
new file mode 100644
index 000000000..616f8e637
--- /dev/null
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -0,0 +1,100 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __X86_MEM_ENCRYPT_H__
+#define __X86_MEM_ENCRYPT_H__
+
+#ifndef __ASSEMBLY__
+
+#include <linux/init.h>
+
+#include <asm/bootparam.h>
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+
+extern u64 sme_me_mask;
+extern bool sev_enabled;
+
+void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
+ unsigned long decrypted_kernel_vaddr,
+ unsigned long kernel_len,
+ unsigned long encryption_wa,
+ unsigned long encryption_pgd);
+
+void __init sme_early_encrypt(resource_size_t paddr,
+ unsigned long size);
+void __init sme_early_decrypt(resource_size_t paddr,
+ unsigned long size);
+
+void __init sme_map_bootdata(char *real_mode_data);
+void __init sme_unmap_bootdata(char *real_mode_data);
+
+void __init sme_early_init(void);
+
+void __init sme_encrypt_kernel(struct boot_params *bp);
+void __init sme_enable(struct boot_params *bp);
+
+int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size);
+int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
+
+/* Architecture __weak replacement functions */
+void __init mem_encrypt_init(void);
+void __init mem_encrypt_free_decrypted_mem(void);
+
+bool sme_active(void);
+bool sev_active(void);
+
+#define __bss_decrypted __attribute__((__section__(".bss..decrypted")))
+
+#else /* !CONFIG_AMD_MEM_ENCRYPT */
+
+#define sme_me_mask 0ULL
+
+static inline void __init sme_early_encrypt(resource_size_t paddr,
+ unsigned long size) { }
+static inline void __init sme_early_decrypt(resource_size_t paddr,
+ unsigned long size) { }
+
+static inline void __init sme_map_bootdata(char *real_mode_data) { }
+static inline void __init sme_unmap_bootdata(char *real_mode_data) { }
+
+static inline void __init sme_early_init(void) { }
+
+static inline void __init sme_encrypt_kernel(struct boot_params *bp) { }
+static inline void __init sme_enable(struct boot_params *bp) { }
+
+static inline bool sme_active(void) { return false; }
+static inline bool sev_active(void) { return false; }
+
+static inline int __init
+early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; }
+static inline int __init
+early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }
+
+#define __bss_decrypted
+
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
+/*
+ * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when
+ * writing to or comparing values from the cr3 register. Having the
+ * encryption mask set in cr3 enables the PGD entry to be encrypted and
+ * avoid special case handling of PGD allocations.
+ */
+#define __sme_pa(x) (__pa(x) | sme_me_mask)
+#define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask)
+
+extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[];
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __X86_MEM_ENCRYPT_H__ */
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
new file mode 100644
index 000000000..91a06cef5
--- /dev/null
+++ b/arch/x86/include/asm/microcode.h
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MICROCODE_H
+#define _ASM_X86_MICROCODE_H
+
+#include <asm/cpu.h>
+#include <linux/earlycpio.h>
+#include <linux/initrd.h>
+
+struct ucode_patch {
+ struct list_head plist;
+ void *data; /* Intel uses only this one */
+ u32 patch_id;
+ u16 equiv_cpu;
+};
+
+extern struct list_head microcode_cache;
+
+struct cpu_signature {
+ unsigned int sig;
+ unsigned int pf;
+ unsigned int rev;
+};
+
+struct device;
+
+enum ucode_state {
+ UCODE_OK = 0,
+ UCODE_NEW,
+ UCODE_UPDATED,
+ UCODE_NFOUND,
+ UCODE_ERROR,
+};
+
+struct microcode_ops {
+ enum ucode_state (*request_microcode_user) (int cpu,
+ const void __user *buf, size_t size);
+
+ enum ucode_state (*request_microcode_fw) (int cpu, struct device *,
+ bool refresh_fw);
+
+ void (*microcode_fini_cpu) (int cpu);
+
+ /*
+ * The generic 'microcode_core' part guarantees that
+ * the callbacks below run on a target cpu when they
+ * are being called.
+ * See also the "Synchronization" section in microcode_core.c.
+ */
+ enum ucode_state (*apply_microcode) (int cpu);
+ int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
+};
+
+struct ucode_cpu_info {
+ struct cpu_signature cpu_sig;
+ int valid;
+ void *mc;
+};
+extern struct ucode_cpu_info ucode_cpu_info[];
+struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa);
+
+#ifdef CONFIG_MICROCODE_INTEL
+extern struct microcode_ops * __init init_intel_microcode(void);
+#else
+static inline struct microcode_ops * __init init_intel_microcode(void)
+{
+ return NULL;
+}
+#endif /* CONFIG_MICROCODE_INTEL */
+
+#ifdef CONFIG_MICROCODE_AMD
+extern struct microcode_ops * __init init_amd_microcode(void);
+extern void __exit exit_amd_microcode(void);
+#else
+static inline struct microcode_ops * __init init_amd_microcode(void)
+{
+ return NULL;
+}
+static inline void __exit exit_amd_microcode(void) {}
+#endif
+
+#define MAX_UCODE_COUNT 128
+
+#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
+#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
+#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
+#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
+#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
+#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
+#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
+
+#define CPUID_IS(a, b, c, ebx, ecx, edx) \
+ (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
+
+/*
+ * In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
+ * x86_cpuid_vendor() gets vendor id for BSP.
+ *
+ * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
+ * coding, we still use x86_cpuid_vendor() to get vendor id for AP.
+ *
+ * x86_cpuid_vendor() gets vendor information directly from CPUID.
+ */
+static inline int x86_cpuid_vendor(void)
+{
+ u32 eax = 0x00000000;
+ u32 ebx, ecx = 0, edx;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
+ return X86_VENDOR_INTEL;
+
+ if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
+ return X86_VENDOR_AMD;
+
+ return X86_VENDOR_UNKNOWN;
+}
+
+static inline unsigned int x86_cpuid_family(void)
+{
+ u32 eax = 0x00000001;
+ u32 ebx, ecx = 0, edx;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ return x86_family(eax);
+}
+
+#ifdef CONFIG_MICROCODE
+int __init microcode_init(void);
+extern void __init load_ucode_bsp(void);
+extern void load_ucode_ap(void);
+void reload_early_microcode(void);
+extern bool get_builtin_firmware(struct cpio_data *cd, const char *name);
+extern bool initrd_gone;
+void microcode_bsp_resume(void);
+#else
+static inline int __init microcode_init(void) { return 0; };
+static inline void __init load_ucode_bsp(void) { }
+static inline void load_ucode_ap(void) { }
+static inline void reload_early_microcode(void) { }
+static inline void microcode_bsp_resume(void) { }
+static inline bool
+get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; }
+#endif
+
+#endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h
new file mode 100644
index 000000000..5c524d4f7
--- /dev/null
+++ b/arch/x86/include/asm/microcode_amd.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MICROCODE_AMD_H
+#define _ASM_X86_MICROCODE_AMD_H
+
+#include <asm/microcode.h>
+
+#define UCODE_MAGIC 0x00414d44
+#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
+#define UCODE_UCODE_TYPE 0x00000001
+
+#define SECTION_HDR_SIZE 8
+#define CONTAINER_HDR_SZ 12
+
+struct equiv_cpu_entry {
+ u32 installed_cpu;
+ u32 fixed_errata_mask;
+ u32 fixed_errata_compare;
+ u16 equiv_cpu;
+ u16 res;
+} __attribute__((packed));
+
+struct microcode_header_amd {
+ u32 data_code;
+ u32 patch_id;
+ u16 mc_patch_data_id;
+ u8 mc_patch_data_len;
+ u8 init_flag;
+ u32 mc_patch_data_checksum;
+ u32 nb_dev_id;
+ u32 sb_dev_id;
+ u16 processor_rev_id;
+ u8 nb_rev_id;
+ u8 sb_rev_id;
+ u8 bios_api_rev;
+ u8 reserved1[3];
+ u32 match_reg[8];
+} __attribute__((packed));
+
+struct microcode_amd {
+ struct microcode_header_amd hdr;
+ unsigned int mpb[0];
+};
+
+#define PATCH_MAX_SIZE (3 * PAGE_SIZE)
+
+#ifdef CONFIG_MICROCODE_AMD
+extern void __init load_ucode_amd_bsp(unsigned int family);
+extern void load_ucode_amd_ap(unsigned int family);
+extern int __init save_microcode_in_initrd_amd(unsigned int family);
+void reload_ucode_amd(void);
+#else
+static inline void __init load_ucode_amd_bsp(unsigned int family) {}
+static inline void load_ucode_amd_ap(unsigned int family) {}
+static inline int __init
+save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; }
+void reload_ucode_amd(void) {}
+#endif
+#endif /* _ASM_X86_MICROCODE_AMD_H */
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
new file mode 100644
index 000000000..d85a07d71
--- /dev/null
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MICROCODE_INTEL_H
+#define _ASM_X86_MICROCODE_INTEL_H
+
+#include <asm/microcode.h>
+
+struct microcode_header_intel {
+ unsigned int hdrver;
+ unsigned int rev;
+ unsigned int date;
+ unsigned int sig;
+ unsigned int cksum;
+ unsigned int ldrver;
+ unsigned int pf;
+ unsigned int datasize;
+ unsigned int totalsize;
+ unsigned int reserved[3];
+};
+
+struct microcode_intel {
+ struct microcode_header_intel hdr;
+ unsigned int bits[0];
+};
+
+/* microcode format is extended from prescott processors */
+struct extended_signature {
+ unsigned int sig;
+ unsigned int pf;
+ unsigned int cksum;
+};
+
+struct extended_sigtable {
+ unsigned int count;
+ unsigned int cksum;
+ unsigned int reserved[3];
+ struct extended_signature sigs[0];
+};
+
+#define DEFAULT_UCODE_DATASIZE (2000)
+#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
+#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
+#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
+
+#define get_totalsize(mc) \
+ (((struct microcode_intel *)mc)->hdr.datasize ? \
+ ((struct microcode_intel *)mc)->hdr.totalsize : \
+ DEFAULT_UCODE_TOTALSIZE)
+
+#define get_datasize(mc) \
+ (((struct microcode_intel *)mc)->hdr.datasize ? \
+ ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
+
+#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
+
+static inline u32 intel_get_microcode_revision(void)
+{
+ u32 rev, dummy;
+
+ native_wrmsrl(MSR_IA32_UCODE_REV, 0);
+
+ /* As documented in the SDM: Do a CPUID 1 here */
+ native_cpuid_eax(1);
+
+ /* get the current revision from MSR 0x8B */
+ native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev);
+
+ return rev;
+}
+
+#ifdef CONFIG_MICROCODE_INTEL
+extern void __init load_ucode_intel_bsp(void);
+extern void load_ucode_intel_ap(void);
+extern void show_ucode_info_early(void);
+extern int __init save_microcode_in_initrd_intel(void);
+void reload_ucode_intel(void);
+#else
+static inline __init void load_ucode_intel_bsp(void) {}
+static inline void load_ucode_intel_ap(void) {}
+static inline void show_ucode_info_early(void) {}
+static inline int __init save_microcode_in_initrd_intel(void) { return -EINVAL; }
+static inline void reload_ucode_intel(void) {}
+#endif
+
+#endif /* _ASM_X86_MICROCODE_INTEL_H */
diff --git a/arch/x86/include/asm/misc.h b/arch/x86/include/asm/misc.h
new file mode 100644
index 000000000..bb049cca3
--- /dev/null
+++ b/arch/x86/include/asm/misc.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MISC_H
+#define _ASM_X86_MISC_H
+
+int num_digits(int val);
+
+#endif /* _ASM_X86_MISC_H */
diff --git a/arch/x86/include/asm/mmconfig.h b/arch/x86/include/asm/mmconfig.h
new file mode 100644
index 000000000..976486447
--- /dev/null
+++ b/arch/x86/include/asm/mmconfig.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MMCONFIG_H
+#define _ASM_X86_MMCONFIG_H
+
+#ifdef CONFIG_PCI_MMCONFIG
+extern void fam10h_check_enable_mmcfg(void);
+extern void check_enable_amd_mmconf_dmi(void);
+#else
+static inline void fam10h_check_enable_mmcfg(void) { }
+static inline void check_enable_amd_mmconf_dmi(void) { }
+#endif
+
+#endif /* _ASM_X86_MMCONFIG_H */
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
new file mode 100644
index 000000000..5ff3e8af2
--- /dev/null
+++ b/arch/x86/include/asm/mmu.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MMU_H
+#define _ASM_X86_MMU_H
+
+#include <linux/spinlock.h>
+#include <linux/rwsem.h>
+#include <linux/mutex.h>
+#include <linux/atomic.h>
+
+/*
+ * x86 has arch-specific MMU state beyond what lives in mm_struct.
+ */
+typedef struct {
+ /*
+ * ctx_id uniquely identifies this mm_struct. A ctx_id will never
+ * be reused, and zero is not a valid ctx_id.
+ */
+ u64 ctx_id;
+
+ /*
+ * Any code that needs to do any sort of TLB flushing for this
+ * mm will first make its changes to the page tables, then
+ * increment tlb_gen, then flush. This lets the low-level
+ * flushing code keep track of what needs flushing.
+ *
+ * This is not used on Xen PV.
+ */
+ atomic64_t tlb_gen;
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ struct rw_semaphore ldt_usr_sem;
+ struct ldt_struct *ldt;
+#endif
+
+#ifdef CONFIG_X86_64
+ /* True if mm supports a task running in 32 bit compatibility mode. */
+ unsigned short ia32_compat;
+#endif
+
+ struct mutex lock;
+ void __user *vdso; /* vdso base address */
+ const struct vdso_image *vdso_image; /* vdso image in use */
+
+ atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ /*
+ * One bit per protection key says whether userspace can
+ * use it or not. protected by mmap_sem.
+ */
+ u16 pkey_allocation_map;
+ s16 execute_only_pkey;
+#endif
+#ifdef CONFIG_X86_INTEL_MPX
+ /* address of the bounds directory */
+ void __user *bd_addr;
+#endif
+} mm_context_t;
+
+#define INIT_MM_CONTEXT(mm) \
+ .context = { \
+ .ctx_id = 1, \
+ }
+
+void leave_mm(int cpu);
+
+#endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
new file mode 100644
index 000000000..2252b63d3
--- /dev/null
+++ b/arch/x86/include/asm/mmu_context.h
@@ -0,0 +1,359 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MMU_CONTEXT_H
+#define _ASM_X86_MMU_CONTEXT_H
+
+#include <asm/desc.h>
+#include <linux/atomic.h>
+#include <linux/mm_types.h>
+#include <linux/pkeys.h>
+
+#include <trace/events/tlb.h>
+
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/paravirt.h>
+#include <asm/mpx.h>
+
+extern atomic64_t last_mm_ctx_id;
+
+#ifndef CONFIG_PARAVIRT
+static inline void paravirt_activate_mm(struct mm_struct *prev,
+ struct mm_struct *next)
+{
+}
+#endif /* !CONFIG_PARAVIRT */
+
+#ifdef CONFIG_PERF_EVENTS
+
+DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);
+
+static inline void load_mm_cr4(struct mm_struct *mm)
+{
+ if (static_branch_unlikely(&rdpmc_always_available_key) ||
+ atomic_read(&mm->context.perf_rdpmc_allowed))
+ cr4_set_bits(X86_CR4_PCE);
+ else
+ cr4_clear_bits(X86_CR4_PCE);
+}
+#else
+static inline void load_mm_cr4(struct mm_struct *mm) {}
+#endif
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+/*
+ * ldt_structs can be allocated, used, and freed, but they are never
+ * modified while live.
+ */
+struct ldt_struct {
+ /*
+ * Xen requires page-aligned LDTs with special permissions. This is
+ * needed to prevent us from installing evil descriptors such as
+ * call gates. On native, we could merge the ldt_struct and LDT
+ * allocations, but it's not worth trying to optimize.
+ */
+ struct desc_struct *entries;
+ unsigned int nr_entries;
+
+ /*
+ * If PTI is in use, then the entries array is not mapped while we're
+ * in user mode. The whole array will be aliased at the addressed
+ * given by ldt_slot_va(slot). We use two slots so that we can allocate
+ * and map, and enable a new LDT without invalidating the mapping
+ * of an older, still-in-use LDT.
+ *
+ * slot will be -1 if this LDT doesn't have an alias mapping.
+ */
+ int slot;
+};
+
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static inline void *ldt_slot_va(int slot)
+{
+ return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+}
+
+/*
+ * Used for LDT copy/destruction.
+ */
+static inline void init_new_context_ldt(struct mm_struct *mm)
+{
+ mm->context.ldt = NULL;
+ init_rwsem(&mm->context.ldt_usr_sem);
+}
+int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
+void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
+#else /* CONFIG_MODIFY_LDT_SYSCALL */
+static inline void init_new_context_ldt(struct mm_struct *mm) { }
+static inline int ldt_dup_context(struct mm_struct *oldmm,
+ struct mm_struct *mm)
+{
+ return 0;
+}
+static inline void destroy_context_ldt(struct mm_struct *mm) { }
+static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
+#endif
+
+static inline void load_mm_ldt(struct mm_struct *mm)
+{
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ struct ldt_struct *ldt;
+
+ /* READ_ONCE synchronizes with smp_store_release */
+ ldt = READ_ONCE(mm->context.ldt);
+
+ /*
+ * Any change to mm->context.ldt is followed by an IPI to all
+ * CPUs with the mm active. The LDT will not be freed until
+ * after the IPI is handled by all such CPUs. This means that,
+ * if the ldt_struct changes before we return, the values we see
+ * will be safe, and the new values will be loaded before we run
+ * any user code.
+ *
+ * NB: don't try to convert this to use RCU without extreme care.
+ * We would still need IRQs off, because we don't want to change
+ * the local LDT after an IPI loaded a newer value than the one
+ * that we can see.
+ */
+
+ if (unlikely(ldt)) {
+ if (static_cpu_has(X86_FEATURE_PTI)) {
+ if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+ /*
+ * Whoops -- either the new LDT isn't mapped
+ * (if slot == -1) or is mapped into a bogus
+ * slot (if slot > 1).
+ */
+ clear_LDT();
+ return;
+ }
+
+ /*
+ * If page table isolation is enabled, ldt->entries
+ * will not be mapped in the userspace pagetables.
+ * Tell the CPU to access the LDT through the alias
+ * at ldt_slot_va(ldt->slot).
+ */
+ set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+ } else {
+ set_ldt(ldt->entries, ldt->nr_entries);
+ }
+ } else {
+ clear_LDT();
+ }
+#else
+ clear_LDT();
+#endif
+}
+
+static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
+{
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ /*
+ * Load the LDT if either the old or new mm had an LDT.
+ *
+ * An mm will never go from having an LDT to not having an LDT. Two
+ * mms never share an LDT, so we don't gain anything by checking to
+ * see whether the LDT changed. There's also no guarantee that
+ * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
+ * then prev->context.ldt will also be non-NULL.
+ *
+ * If we really cared, we could optimize the case where prev == next
+ * and we're exiting lazy mode. Most of the time, if this happens,
+ * we don't actually need to reload LDTR, but modify_ldt() is mostly
+ * used by legacy code and emulators where we don't need this level of
+ * performance.
+ *
+ * This uses | instead of || because it generates better code.
+ */
+ if (unlikely((unsigned long)prev->context.ldt |
+ (unsigned long)next->context.ldt))
+ load_mm_ldt(next);
+#endif
+
+ DEBUG_LOCKS_WARN_ON(preemptible());
+}
+
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
+
+/*
+ * Init a new mm. Used on mm copies, like at fork()
+ * and on mm's that are brand-new, like at execve().
+ */
+static inline int init_new_context(struct task_struct *tsk,
+ struct mm_struct *mm)
+{
+ mutex_init(&mm->context.lock);
+
+ mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
+ atomic64_set(&mm->context.tlb_gen, 0);
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
+ /* pkey 0 is the default and allocated implicitly */
+ mm->context.pkey_allocation_map = 0x1;
+ /* -1 means unallocated or invalid */
+ mm->context.execute_only_pkey = -1;
+ }
+#endif
+ init_new_context_ldt(mm);
+ return 0;
+}
+static inline void destroy_context(struct mm_struct *mm)
+{
+ destroy_context_ldt(mm);
+}
+
+extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk);
+
+extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk);
+#define switch_mm_irqs_off switch_mm_irqs_off
+
+#define activate_mm(prev, next) \
+do { \
+ paravirt_activate_mm((prev), (next)); \
+ switch_mm((prev), (next), NULL); \
+} while (0);
+
+#ifdef CONFIG_X86_32
+#define deactivate_mm(tsk, mm) \
+do { \
+ lazy_load_gs(0); \
+} while (0)
+#else
+#define deactivate_mm(tsk, mm) \
+do { \
+ load_gs_index(0); \
+ loadsegment(fs, 0); \
+} while (0)
+#endif
+
+static inline void arch_dup_pkeys(struct mm_struct *oldmm,
+ struct mm_struct *mm)
+{
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return;
+
+ /* Duplicate the oldmm pkey state in mm: */
+ mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map;
+ mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
+#endif
+}
+
+static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+ arch_dup_pkeys(oldmm, mm);
+ paravirt_arch_dup_mmap(oldmm, mm);
+ return ldt_dup_context(oldmm, mm);
+}
+
+static inline void arch_exit_mmap(struct mm_struct *mm)
+{
+ paravirt_arch_exit_mmap(mm);
+ ldt_arch_exit_mmap(mm);
+}
+
+#ifdef CONFIG_X86_64
+static inline bool is_64bit_mm(struct mm_struct *mm)
+{
+ return !IS_ENABLED(CONFIG_IA32_EMULATION) ||
+ !(mm->context.ia32_compat == TIF_IA32);
+}
+#else
+static inline bool is_64bit_mm(struct mm_struct *mm)
+{
+ return false;
+}
+#endif
+
+static inline void arch_bprm_mm_init(struct mm_struct *mm,
+ struct vm_area_struct *vma)
+{
+ mpx_mm_init(mm);
+}
+
+static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ /*
+ * mpx_notify_unmap() goes and reads a rarely-hot
+ * cacheline in the mm_struct. That can be expensive
+ * enough to be seen in profiles.
+ *
+ * The mpx_notify_unmap() call and its contents have been
+ * observed to affect munmap() performance on hardware
+ * where MPX is not present.
+ *
+ * The unlikely() optimizes for the fast case: no MPX
+ * in the CPU, or no MPX use in the process. Even if
+ * we get this wrong (in the unlikely event that MPX
+ * is widely enabled on some system) the overhead of
+ * MPX itself (reading bounds tables) is expected to
+ * overwhelm the overhead of getting this unlikely()
+ * consistently wrong.
+ */
+ if (unlikely(cpu_feature_enabled(X86_FEATURE_MPX)))
+ mpx_notify_unmap(mm, vma, start, end);
+}
+
+/*
+ * We only want to enforce protection keys on the current process
+ * because we effectively have no access to PKRU for other
+ * processes or any way to tell *which * PKRU in a threaded
+ * process we could use.
+ *
+ * So do not enforce things if the VMA is not from the current
+ * mm, or if we are in a kernel thread.
+ */
+static inline bool vma_is_foreign(struct vm_area_struct *vma)
+{
+ if (!current->mm)
+ return true;
+ /*
+ * Should PKRU be enforced on the access to this VMA? If
+ * the VMA is from another process, then PKRU has no
+ * relevance and should not be enforced.
+ */
+ if (current->mm != vma->vm_mm)
+ return true;
+
+ return false;
+}
+
+static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+ bool write, bool execute, bool foreign)
+{
+ /* pkeys never affect instruction fetches */
+ if (execute)
+ return true;
+ /* allow access if the VMA is not one from this process */
+ if (foreign || vma_is_foreign(vma))
+ return true;
+ return __pkru_allows_pkey(vma_pkey(vma), write);
+}
+
+/*
+ * This can be used from process context to figure out what the value of
+ * CR3 is without needing to do a (slow) __read_cr3().
+ *
+ * It's intended to be used for code like KVM that sneakily changes CR3
+ * and needs to restore it. It needs to be used very carefully.
+ */
+static inline unsigned long __get_current_cr3_fast(void)
+{
+ unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
+ this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+
+ /* For now, be very restrictive about when this can be called. */
+ VM_WARN_ON(in_nmi() || preemptible());
+
+ VM_BUG_ON(cr3 != __read_cr3());
+ return cr3;
+}
+
+#endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/mmx.h b/arch/x86/include/asm/mmx.h
new file mode 100644
index 000000000..f572d0f94
--- /dev/null
+++ b/arch/x86/include/asm/mmx.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MMX_H
+#define _ASM_X86_MMX_H
+
+/*
+ * MMX 3Dnow! helper operations
+ */
+
+#include <linux/types.h>
+
+extern void *_mmx_memcpy(void *to, const void *from, size_t size);
+extern void mmx_clear_page(void *page);
+extern void mmx_copy_page(void *to, void *from);
+
+#endif /* _ASM_X86_MMX_H */
diff --git a/arch/x86/include/asm/mmzone.h b/arch/x86/include/asm/mmzone.h
new file mode 100644
index 000000000..c41b41edd
--- /dev/null
+++ b/arch/x86/include/asm/mmzone.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifdef CONFIG_X86_32
+# include <asm/mmzone_32.h>
+#else
+# include <asm/mmzone_64.h>
+#endif
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
new file mode 100644
index 000000000..73d8dd14d
--- /dev/null
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Written by Pat Gaughen (gone@us.ibm.com) Mar 2002
+ *
+ */
+
+#ifndef _ASM_X86_MMZONE_32_H
+#define _ASM_X86_MMZONE_32_H
+
+#include <asm/smp.h>
+
+#ifdef CONFIG_NUMA
+extern struct pglist_data *node_data[];
+#define NODE_DATA(nid) (node_data[nid])
+#endif /* CONFIG_NUMA */
+
+#ifdef CONFIG_DISCONTIGMEM
+
+/*
+ * generic node memory support, the following assumptions apply:
+ *
+ * 1) memory comes in 64Mb contiguous chunks which are either present or not
+ * 2) we will not have more than 64Gb in total
+ *
+ * for now assume that 64Gb is max amount of RAM for whole system
+ * 64Gb / 4096bytes/page = 16777216 pages
+ */
+#define MAX_NR_PAGES 16777216
+#define MAX_SECTIONS 1024
+#define PAGES_PER_SECTION (MAX_NR_PAGES/MAX_SECTIONS)
+
+extern s8 physnode_map[];
+
+static inline int pfn_to_nid(unsigned long pfn)
+{
+#ifdef CONFIG_NUMA
+ return((int) physnode_map[(pfn) / PAGES_PER_SECTION]);
+#else
+ return 0;
+#endif
+}
+
+static inline int pfn_valid(int pfn)
+{
+ int nid = pfn_to_nid(pfn);
+
+ if (nid >= 0)
+ return (pfn < node_end_pfn(nid));
+ return 0;
+}
+
+#define early_pfn_valid(pfn) pfn_valid((pfn))
+
+#endif /* CONFIG_DISCONTIGMEM */
+
+#endif /* _ASM_X86_MMZONE_32_H */
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
new file mode 100644
index 000000000..0c585046f
--- /dev/null
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* K8 NUMA support */
+/* Copyright 2002,2003 by Andi Kleen, SuSE Labs */
+/* 2.5 Version loosely based on the NUMAQ Code by Pat Gaughen. */
+#ifndef _ASM_X86_MMZONE_64_H
+#define _ASM_X86_MMZONE_64_H
+
+#ifdef CONFIG_NUMA
+
+#include <linux/mmdebug.h>
+#include <asm/smp.h>
+
+extern struct pglist_data *node_data[];
+
+#define NODE_DATA(nid) (node_data[nid])
+
+#endif
+#endif /* _ASM_X86_MMZONE_64_H */
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
new file mode 100644
index 000000000..7948a17fe
--- /dev/null
+++ b/arch/x86/include/asm/module.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MODULE_H
+#define _ASM_X86_MODULE_H
+
+#include <asm-generic/module.h>
+#include <asm/orc_types.h>
+
+struct mod_arch_specific {
+#ifdef CONFIG_UNWINDER_ORC
+ unsigned int num_orcs;
+ int *orc_unwind_ip;
+ struct orc_entry *orc_unwind;
+#endif
+};
+
+#ifdef CONFIG_X86_64
+/* X86_64 does not define MODULE_PROC_FAMILY */
+#elif defined CONFIG_M486
+#define MODULE_PROC_FAMILY "486 "
+#elif defined CONFIG_M586
+#define MODULE_PROC_FAMILY "586 "
+#elif defined CONFIG_M586TSC
+#define MODULE_PROC_FAMILY "586TSC "
+#elif defined CONFIG_M586MMX
+#define MODULE_PROC_FAMILY "586MMX "
+#elif defined CONFIG_MCORE2
+#define MODULE_PROC_FAMILY "CORE2 "
+#elif defined CONFIG_MATOM
+#define MODULE_PROC_FAMILY "ATOM "
+#elif defined CONFIG_M686
+#define MODULE_PROC_FAMILY "686 "
+#elif defined CONFIG_MPENTIUMII
+#define MODULE_PROC_FAMILY "PENTIUMII "
+#elif defined CONFIG_MPENTIUMIII
+#define MODULE_PROC_FAMILY "PENTIUMIII "
+#elif defined CONFIG_MPENTIUMM
+#define MODULE_PROC_FAMILY "PENTIUMM "
+#elif defined CONFIG_MPENTIUM4
+#define MODULE_PROC_FAMILY "PENTIUM4 "
+#elif defined CONFIG_MK6
+#define MODULE_PROC_FAMILY "K6 "
+#elif defined CONFIG_MK7
+#define MODULE_PROC_FAMILY "K7 "
+#elif defined CONFIG_MK8
+#define MODULE_PROC_FAMILY "K8 "
+#elif defined CONFIG_MELAN
+#define MODULE_PROC_FAMILY "ELAN "
+#elif defined CONFIG_MCRUSOE
+#define MODULE_PROC_FAMILY "CRUSOE "
+#elif defined CONFIG_MEFFICEON
+#define MODULE_PROC_FAMILY "EFFICEON "
+#elif defined CONFIG_MWINCHIPC6
+#define MODULE_PROC_FAMILY "WINCHIPC6 "
+#elif defined CONFIG_MWINCHIP3D
+#define MODULE_PROC_FAMILY "WINCHIP3D "
+#elif defined CONFIG_MCYRIXIII
+#define MODULE_PROC_FAMILY "CYRIXIII "
+#elif defined CONFIG_MVIAC3_2
+#define MODULE_PROC_FAMILY "VIAC3-2 "
+#elif defined CONFIG_MVIAC7
+#define MODULE_PROC_FAMILY "VIAC7 "
+#elif defined CONFIG_MGEODEGX1
+#define MODULE_PROC_FAMILY "GEODEGX1 "
+#elif defined CONFIG_MGEODE_LX
+#define MODULE_PROC_FAMILY "GEODE "
+#else
+#error unknown processor family
+#endif
+
+#ifdef CONFIG_X86_32
+# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY
+#endif
+
+#endif /* _ASM_X86_MODULE_H */
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
new file mode 100644
index 000000000..606cbaebd
--- /dev/null
+++ b/arch/x86/include/asm/mpspec.h
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MPSPEC_H
+#define _ASM_X86_MPSPEC_H
+
+
+#include <asm/mpspec_def.h>
+#include <asm/x86_init.h>
+#include <asm/apicdef.h>
+
+extern int pic_mode;
+
+#ifdef CONFIG_X86_32
+
+/*
+ * Summit or generic (i.e. installer) kernels need lots of bus entries.
+ * Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets.
+ */
+#if CONFIG_BASE_SMALL == 0
+# define MAX_MP_BUSSES 260
+#else
+# define MAX_MP_BUSSES 32
+#endif
+
+#define MAX_IRQ_SOURCES 256
+
+extern unsigned int def_to_bigsmp;
+
+#else /* CONFIG_X86_64: */
+
+#define MAX_MP_BUSSES 256
+/* Each PCI slot may be a combo card with its own bus. 4 IRQ pins per slot. */
+#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
+
+#endif /* CONFIG_X86_64 */
+
+#ifdef CONFIG_EISA
+extern int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
+extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
+extern unsigned int boot_cpu_physical_apicid;
+extern u8 boot_cpu_apic_version;
+extern unsigned long mp_lapic_addr;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+extern int smp_found_config;
+#else
+# define smp_found_config 0
+#endif
+
+static inline void get_smp_config(void)
+{
+ x86_init.mpparse.get_smp_config(0);
+}
+
+static inline void early_get_smp_config(void)
+{
+ x86_init.mpparse.get_smp_config(1);
+}
+
+static inline void find_smp_config(void)
+{
+ x86_init.mpparse.find_smp_config();
+}
+
+#ifdef CONFIG_X86_MPPARSE
+extern void e820__memblock_alloc_reserved_mpc_new(void);
+extern int enable_update_mptable;
+extern int default_mpc_apic_id(struct mpc_cpu *m);
+extern void default_smp_read_mpc_oem(struct mpc_table *mpc);
+# ifdef CONFIG_X86_IO_APIC
+extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str);
+# else
+# define default_mpc_oem_bus_info NULL
+# endif
+extern void default_find_smp_config(void);
+extern void default_get_smp_config(unsigned int early);
+#else
+static inline void e820__memblock_alloc_reserved_mpc_new(void) { }
+#define enable_update_mptable 0
+#define default_mpc_apic_id NULL
+#define default_smp_read_mpc_oem NULL
+#define default_mpc_oem_bus_info NULL
+#define default_find_smp_config x86_init_noop
+#define default_get_smp_config x86_init_uint_noop
+#endif
+
+int generic_processor_info(int apicid, int version);
+
+#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC)
+
+struct physid_mask {
+ unsigned long mask[PHYSID_ARRAY_SIZE];
+};
+
+typedef struct physid_mask physid_mask_t;
+
+#define physid_set(physid, map) set_bit(physid, (map).mask)
+#define physid_clear(physid, map) clear_bit(physid, (map).mask)
+#define physid_isset(physid, map) test_bit(physid, (map).mask)
+#define physid_test_and_set(physid, map) \
+ test_and_set_bit(physid, (map).mask)
+
+#define physids_and(dst, src1, src2) \
+ bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
+
+#define physids_or(dst, src1, src2) \
+ bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
+
+#define physids_clear(map) \
+ bitmap_zero((map).mask, MAX_LOCAL_APIC)
+
+#define physids_complement(dst, src) \
+ bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APIC)
+
+#define physids_empty(map) \
+ bitmap_empty((map).mask, MAX_LOCAL_APIC)
+
+#define physids_equal(map1, map2) \
+ bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APIC)
+
+#define physids_weight(map) \
+ bitmap_weight((map).mask, MAX_LOCAL_APIC)
+
+#define physids_shift_right(d, s, n) \
+ bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APIC)
+
+#define physids_shift_left(d, s, n) \
+ bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APIC)
+
+static inline unsigned long physids_coerce(physid_mask_t *map)
+{
+ return map->mask[0];
+}
+
+static inline void physids_promote(unsigned long physids, physid_mask_t *map)
+{
+ physids_clear(*map);
+ map->mask[0] = physids;
+}
+
+static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
+{
+ physids_clear(*map);
+ physid_set(physid, *map);
+}
+
+#define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} }
+#define PHYSID_MASK_NONE { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} }
+
+extern physid_mask_t phys_cpu_present_map;
+
+#endif /* _ASM_X86_MPSPEC_H */
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h
new file mode 100644
index 000000000..6fb923a34
--- /dev/null
+++ b/arch/x86/include/asm/mpspec_def.h
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MPSPEC_DEF_H
+#define _ASM_X86_MPSPEC_DEF_H
+
+/*
+ * Structure definitions for SMP machines following the
+ * Intel Multiprocessing Specification 1.1 and 1.4.
+ */
+
+/*
+ * This tag identifies where the SMP configuration
+ * information is.
+ */
+
+#define SMP_MAGIC_IDENT (('_'<<24) | ('P'<<16) | ('M'<<8) | '_')
+
+#ifdef CONFIG_X86_32
+# define MAX_MPC_ENTRY 1024
+#endif
+
+/* Intel MP Floating Pointer Structure */
+struct mpf_intel {
+ char signature[4]; /* "_MP_" */
+ unsigned int physptr; /* Configuration table address */
+ unsigned char length; /* Our length (paragraphs) */
+ unsigned char specification; /* Specification version */
+ unsigned char checksum; /* Checksum (makes sum 0) */
+ unsigned char feature1; /* Standard or configuration ? */
+ unsigned char feature2; /* Bit7 set for IMCR|PIC */
+ unsigned char feature3; /* Unused (0) */
+ unsigned char feature4; /* Unused (0) */
+ unsigned char feature5; /* Unused (0) */
+};
+
+#define MPC_SIGNATURE "PCMP"
+
+struct mpc_table {
+ char signature[4];
+ unsigned short length; /* Size of table */
+ char spec; /* 0x01 */
+ char checksum;
+ char oem[8];
+ char productid[12];
+ unsigned int oemptr; /* 0 if not present */
+ unsigned short oemsize; /* 0 if not present */
+ unsigned short oemcount;
+ unsigned int lapic; /* APIC address */
+ unsigned int reserved;
+};
+
+/* Followed by entries */
+
+#define MP_PROCESSOR 0
+#define MP_BUS 1
+#define MP_IOAPIC 2
+#define MP_INTSRC 3
+#define MP_LINTSRC 4
+/* Used by IBM NUMA-Q to describe node locality */
+#define MP_TRANSLATION 192
+
+#define CPU_ENABLED 1 /* Processor is available */
+#define CPU_BOOTPROCESSOR 2 /* Processor is the boot CPU */
+
+#define CPU_STEPPING_MASK 0x000F
+#define CPU_MODEL_MASK 0x00F0
+#define CPU_FAMILY_MASK 0x0F00
+
+struct mpc_cpu {
+ unsigned char type;
+ unsigned char apicid; /* Local APIC number */
+ unsigned char apicver; /* Its versions */
+ unsigned char cpuflag;
+ unsigned int cpufeature;
+ unsigned int featureflag; /* CPUID feature value */
+ unsigned int reserved[2];
+};
+
+struct mpc_bus {
+ unsigned char type;
+ unsigned char busid;
+ unsigned char bustype[6];
+};
+
+/* List of Bus Type string values, Intel MP Spec. */
+#define BUSTYPE_EISA "EISA"
+#define BUSTYPE_ISA "ISA"
+#define BUSTYPE_INTERN "INTERN" /* Internal BUS */
+#define BUSTYPE_MCA "MCA" /* Obsolete */
+#define BUSTYPE_VL "VL" /* Local bus */
+#define BUSTYPE_PCI "PCI"
+#define BUSTYPE_PCMCIA "PCMCIA"
+#define BUSTYPE_CBUS "CBUS"
+#define BUSTYPE_CBUSII "CBUSII"
+#define BUSTYPE_FUTURE "FUTURE"
+#define BUSTYPE_MBI "MBI"
+#define BUSTYPE_MBII "MBII"
+#define BUSTYPE_MPI "MPI"
+#define BUSTYPE_MPSA "MPSA"
+#define BUSTYPE_NUBUS "NUBUS"
+#define BUSTYPE_TC "TC"
+#define BUSTYPE_VME "VME"
+#define BUSTYPE_XPRESS "XPRESS"
+
+#define MPC_APIC_USABLE 0x01
+
+struct mpc_ioapic {
+ unsigned char type;
+ unsigned char apicid;
+ unsigned char apicver;
+ unsigned char flags;
+ unsigned int apicaddr;
+};
+
+struct mpc_intsrc {
+ unsigned char type;
+ unsigned char irqtype;
+ unsigned short irqflag;
+ unsigned char srcbus;
+ unsigned char srcbusirq;
+ unsigned char dstapic;
+ unsigned char dstirq;
+};
+
+enum mp_irq_source_types {
+ mp_INT = 0,
+ mp_NMI = 1,
+ mp_SMI = 2,
+ mp_ExtINT = 3
+};
+
+#define MP_IRQPOL_DEFAULT 0x0
+#define MP_IRQPOL_ACTIVE_HIGH 0x1
+#define MP_IRQPOL_RESERVED 0x2
+#define MP_IRQPOL_ACTIVE_LOW 0x3
+#define MP_IRQPOL_MASK 0x3
+
+#define MP_IRQTRIG_DEFAULT 0x0
+#define MP_IRQTRIG_EDGE 0x4
+#define MP_IRQTRIG_RESERVED 0x8
+#define MP_IRQTRIG_LEVEL 0xc
+#define MP_IRQTRIG_MASK 0xc
+
+#define MP_APIC_ALL 0xFF
+
+struct mpc_lintsrc {
+ unsigned char type;
+ unsigned char irqtype;
+ unsigned short irqflag;
+ unsigned char srcbusid;
+ unsigned char srcbusirq;
+ unsigned char destapic;
+ unsigned char destapiclint;
+};
+
+#define MPC_OEM_SIGNATURE "_OEM"
+
+struct mpc_oemtable {
+ char signature[4];
+ unsigned short length; /* Size of table */
+ char rev; /* 0x01 */
+ char checksum;
+ char mpc[8];
+};
+
+/*
+ * Default configurations
+ *
+ * 1 2 CPU ISA 82489DX
+ * 2 2 CPU EISA 82489DX neither IRQ 0 timer nor IRQ 13 DMA chaining
+ * 3 2 CPU EISA 82489DX
+ * 4 2 CPU MCA 82489DX
+ * 5 2 CPU ISA+PCI
+ * 6 2 CPU EISA+PCI
+ * 7 2 CPU MCA+PCI
+ */
+
+enum mp_bustype {
+ MP_BUS_ISA = 1,
+ MP_BUS_EISA,
+ MP_BUS_PCI,
+};
+#endif /* _ASM_X86_MPSPEC_DEF_H */
diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h
new file mode 100644
index 000000000..61eb4b63c
--- /dev/null
+++ b/arch/x86/include/asm/mpx.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MPX_H
+#define _ASM_X86_MPX_H
+
+#include <linux/types.h>
+#include <linux/mm_types.h>
+
+#include <asm/ptrace.h>
+#include <asm/insn.h>
+
+/*
+ * NULL is theoretically a valid place to put the bounds
+ * directory, so point this at an invalid address.
+ */
+#define MPX_INVALID_BOUNDS_DIR ((void __user *)-1)
+#define MPX_BNDCFG_ENABLE_FLAG 0x1
+#define MPX_BD_ENTRY_VALID_FLAG 0x1
+
+/*
+ * The upper 28 bits [47:20] of the virtual address in 64-bit
+ * are used to index into bounds directory (BD).
+ *
+ * The directory is 2G (2^31) in size, and with 8-byte entries
+ * it has 2^28 entries.
+ */
+#define MPX_BD_SIZE_BYTES_64 (1UL<<31)
+#define MPX_BD_ENTRY_BYTES_64 8
+#define MPX_BD_NR_ENTRIES_64 (MPX_BD_SIZE_BYTES_64/MPX_BD_ENTRY_BYTES_64)
+
+/*
+ * The 32-bit directory is 4MB (2^22) in size, and with 4-byte
+ * entries it has 2^20 entries.
+ */
+#define MPX_BD_SIZE_BYTES_32 (1UL<<22)
+#define MPX_BD_ENTRY_BYTES_32 4
+#define MPX_BD_NR_ENTRIES_32 (MPX_BD_SIZE_BYTES_32/MPX_BD_ENTRY_BYTES_32)
+
+/*
+ * A 64-bit table is 4MB total in size, and an entry is
+ * 4 64-bit pointers in size.
+ */
+#define MPX_BT_SIZE_BYTES_64 (1UL<<22)
+#define MPX_BT_ENTRY_BYTES_64 32
+#define MPX_BT_NR_ENTRIES_64 (MPX_BT_SIZE_BYTES_64/MPX_BT_ENTRY_BYTES_64)
+
+/*
+ * A 32-bit table is 16kB total in size, and an entry is
+ * 4 32-bit pointers in size.
+ */
+#define MPX_BT_SIZE_BYTES_32 (1UL<<14)
+#define MPX_BT_ENTRY_BYTES_32 16
+#define MPX_BT_NR_ENTRIES_32 (MPX_BT_SIZE_BYTES_32/MPX_BT_ENTRY_BYTES_32)
+
+#define MPX_BNDSTA_TAIL 2
+#define MPX_BNDCFG_TAIL 12
+#define MPX_BNDSTA_ADDR_MASK (~((1UL<<MPX_BNDSTA_TAIL)-1))
+#define MPX_BNDCFG_ADDR_MASK (~((1UL<<MPX_BNDCFG_TAIL)-1))
+#define MPX_BNDSTA_ERROR_CODE 0x3
+
+#ifdef CONFIG_X86_INTEL_MPX
+siginfo_t *mpx_generate_siginfo(struct pt_regs *regs);
+int mpx_handle_bd_fault(void);
+static inline int kernel_managing_mpx_tables(struct mm_struct *mm)
+{
+ return (mm->context.bd_addr != MPX_INVALID_BOUNDS_DIR);
+}
+static inline void mpx_mm_init(struct mm_struct *mm)
+{
+ /*
+ * NULL is theoretically a valid place to put the bounds
+ * directory, so point this at an invalid address.
+ */
+ mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR;
+}
+void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end);
+
+unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
+ unsigned long flags);
+#else
+static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
+{
+ return NULL;
+}
+static inline int mpx_handle_bd_fault(void)
+{
+ return -EINVAL;
+}
+static inline int kernel_managing_mpx_tables(struct mm_struct *mm)
+{
+ return 0;
+}
+static inline void mpx_mm_init(struct mm_struct *mm)
+{
+}
+static inline void mpx_notify_unmap(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+}
+
+static inline unsigned long mpx_unmapped_area_check(unsigned long addr,
+ unsigned long len, unsigned long flags)
+{
+ return addr;
+}
+#endif /* CONFIG_X86_INTEL_MPX */
+
+#endif /* _ASM_X86_MPX_H */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
new file mode 100644
index 000000000..5b58a6cf4
--- /dev/null
+++ b/arch/x86/include/asm/mshyperv.h
@@ -0,0 +1,442 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MSHYPER_H
+#define _ASM_X86_MSHYPER_H
+
+#include <linux/types.h>
+#include <linux/atomic.h>
+#include <linux/nmi.h>
+#include <asm/io.h>
+#include <asm/hyperv-tlfs.h>
+#include <asm/nospec-branch.h>
+
+#define VP_INVAL U32_MAX
+
+struct ms_hyperv_info {
+ u32 features;
+ u32 misc_features;
+ u32 hints;
+ u32 nested_features;
+ u32 max_vp_index;
+ u32 max_lp_index;
+};
+
+extern struct ms_hyperv_info ms_hyperv;
+
+/*
+ * Generate the guest ID.
+ */
+
+static inline __u64 generate_guest_id(__u64 d_info1, __u64 kernel_version,
+ __u64 d_info2)
+{
+ __u64 guest_id = 0;
+
+ guest_id = (((__u64)HV_LINUX_VENDOR_ID) << 48);
+ guest_id |= (d_info1 << 48);
+ guest_id |= (kernel_version << 16);
+ guest_id |= d_info2;
+
+ return guest_id;
+}
+
+
+/* Free the message slot and signal end-of-message if required */
+static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type)
+{
+ /*
+ * On crash we're reading some other CPU's message page and we need
+ * to be careful: this other CPU may already had cleared the header
+ * and the host may already had delivered some other message there.
+ * In case we blindly write msg->header.message_type we're going
+ * to lose it. We can still lose a message of the same type but
+ * we count on the fact that there can only be one
+ * CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages
+ * on crash.
+ */
+ if (cmpxchg(&msg->header.message_type, old_msg_type,
+ HVMSG_NONE) != old_msg_type)
+ return;
+
+ /*
+ * Make sure the write to MessageType (ie set to
+ * HVMSG_NONE) happens before we read the
+ * MessagePending and EOMing. Otherwise, the EOMing
+ * will not deliver any more messages since there is
+ * no empty slot
+ */
+ mb();
+
+ if (msg->header.message_flags.msg_pending) {
+ /*
+ * This will cause message queue rescan to
+ * possibly deliver another msg from the
+ * hypervisor
+ */
+ wrmsrl(HV_X64_MSR_EOM, 0);
+ }
+}
+
+#define hv_init_timer(timer, tick) \
+ wrmsrl(HV_X64_MSR_STIMER0_COUNT + (2*timer), tick)
+#define hv_init_timer_config(timer, val) \
+ wrmsrl(HV_X64_MSR_STIMER0_CONFIG + (2*timer), val)
+
+#define hv_get_simp(val) rdmsrl(HV_X64_MSR_SIMP, val)
+#define hv_set_simp(val) wrmsrl(HV_X64_MSR_SIMP, val)
+
+#define hv_get_siefp(val) rdmsrl(HV_X64_MSR_SIEFP, val)
+#define hv_set_siefp(val) wrmsrl(HV_X64_MSR_SIEFP, val)
+
+#define hv_get_synic_state(val) rdmsrl(HV_X64_MSR_SCONTROL, val)
+#define hv_set_synic_state(val) wrmsrl(HV_X64_MSR_SCONTROL, val)
+
+#define hv_get_vp_index(index) rdmsrl(HV_X64_MSR_VP_INDEX, index)
+
+#define hv_get_synint_state(int_num, val) \
+ rdmsrl(HV_X64_MSR_SINT0 + int_num, val)
+#define hv_set_synint_state(int_num, val) \
+ wrmsrl(HV_X64_MSR_SINT0 + int_num, val)
+
+#define hv_get_crash_ctl(val) \
+ rdmsrl(HV_X64_MSR_CRASH_CTL, val)
+
+void hyperv_callback_vector(void);
+void hyperv_reenlightenment_vector(void);
+#ifdef CONFIG_TRACING
+#define trace_hyperv_callback_vector hyperv_callback_vector
+#endif
+void hyperv_vector_handler(struct pt_regs *regs);
+void hv_setup_vmbus_irq(void (*handler)(void));
+void hv_remove_vmbus_irq(void);
+
+void hv_setup_kexec_handler(void (*handler)(void));
+void hv_remove_kexec_handler(void);
+void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs));
+void hv_remove_crash_handler(void);
+
+/*
+ * Routines for stimer0 Direct Mode handling.
+ * On x86/x64, there are no percpu actions to take.
+ */
+void hv_stimer0_vector_handler(struct pt_regs *regs);
+void hv_stimer0_callback_vector(void);
+int hv_setup_stimer0_irq(int *irq, int *vector, void (*handler)(void));
+void hv_remove_stimer0_irq(int irq);
+
+static inline void hv_enable_stimer0_percpu_irq(int irq) {}
+static inline void hv_disable_stimer0_percpu_irq(int irq) {}
+
+
+#if IS_ENABLED(CONFIG_HYPERV)
+extern struct clocksource *hyperv_cs;
+extern void *hv_hypercall_pg;
+extern void __percpu **hyperv_pcpu_input_arg;
+
+static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
+{
+ u64 input_address = input ? virt_to_phys(input) : 0;
+ u64 output_address = output ? virt_to_phys(output) : 0;
+ u64 hv_status;
+
+#ifdef CONFIG_X86_64
+ if (!hv_hypercall_pg)
+ return U64_MAX;
+
+ __asm__ __volatile__("mov %4, %%r8\n"
+ CALL_NOSPEC
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (input_address)
+ : "r" (output_address),
+ THUNK_TARGET(hv_hypercall_pg)
+ : "cc", "memory", "r8", "r9", "r10", "r11");
+#else
+ u32 input_address_hi = upper_32_bits(input_address);
+ u32 input_address_lo = lower_32_bits(input_address);
+ u32 output_address_hi = upper_32_bits(output_address);
+ u32 output_address_lo = lower_32_bits(output_address);
+
+ if (!hv_hypercall_pg)
+ return U64_MAX;
+
+ __asm__ __volatile__(CALL_NOSPEC
+ : "=A" (hv_status),
+ "+c" (input_address_lo), ASM_CALL_CONSTRAINT
+ : "A" (control),
+ "b" (input_address_hi),
+ "D"(output_address_hi), "S"(output_address_lo),
+ THUNK_TARGET(hv_hypercall_pg)
+ : "cc", "memory");
+#endif /* !x86_64 */
+ return hv_status;
+}
+
+/* Fast hypercall with 8 bytes of input and no output */
+static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
+{
+ u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT;
+
+#ifdef CONFIG_X86_64
+ {
+ __asm__ __volatile__(CALL_NOSPEC
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (input1)
+ : THUNK_TARGET(hv_hypercall_pg)
+ : "cc", "r8", "r9", "r10", "r11");
+ }
+#else
+ {
+ u32 input1_hi = upper_32_bits(input1);
+ u32 input1_lo = lower_32_bits(input1);
+
+ __asm__ __volatile__ (CALL_NOSPEC
+ : "=A"(hv_status),
+ "+c"(input1_lo),
+ ASM_CALL_CONSTRAINT
+ : "A" (control),
+ "b" (input1_hi),
+ THUNK_TARGET(hv_hypercall_pg)
+ : "cc", "edi", "esi");
+ }
+#endif
+ return hv_status;
+}
+
+/* Fast hypercall with 16 bytes of input */
+static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
+{
+ u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT;
+
+#ifdef CONFIG_X86_64
+ {
+ __asm__ __volatile__("mov %4, %%r8\n"
+ CALL_NOSPEC
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (input1)
+ : "r" (input2),
+ THUNK_TARGET(hv_hypercall_pg)
+ : "cc", "r8", "r9", "r10", "r11");
+ }
+#else
+ {
+ u32 input1_hi = upper_32_bits(input1);
+ u32 input1_lo = lower_32_bits(input1);
+ u32 input2_hi = upper_32_bits(input2);
+ u32 input2_lo = lower_32_bits(input2);
+
+ __asm__ __volatile__ (CALL_NOSPEC
+ : "=A"(hv_status),
+ "+c"(input1_lo), ASM_CALL_CONSTRAINT
+ : "A" (control), "b" (input1_hi),
+ "D"(input2_hi), "S"(input2_lo),
+ THUNK_TARGET(hv_hypercall_pg)
+ : "cc");
+ }
+#endif
+ return hv_status;
+}
+
+/*
+ * Rep hypercalls. Callers of this functions are supposed to ensure that
+ * rep_count and varhead_size comply with Hyper-V hypercall definition.
+ */
+static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
+ void *input, void *output)
+{
+ u64 control = code;
+ u64 status;
+ u16 rep_comp;
+
+ control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET;
+ control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET;
+
+ do {
+ status = hv_do_hypercall(control, input, output);
+ if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
+ return status;
+
+ /* Bits 32-43 of status have 'Reps completed' data. */
+ rep_comp = (status & HV_HYPERCALL_REP_COMP_MASK) >>
+ HV_HYPERCALL_REP_COMP_OFFSET;
+
+ control &= ~HV_HYPERCALL_REP_START_MASK;
+ control |= (u64)rep_comp << HV_HYPERCALL_REP_START_OFFSET;
+
+ touch_nmi_watchdog();
+ } while (rep_comp < rep_count);
+
+ return status;
+}
+
+/*
+ * Hypervisor's notion of virtual processor ID is different from
+ * Linux' notion of CPU ID. This information can only be retrieved
+ * in the context of the calling CPU. Setup a map for easy access
+ * to this information.
+ */
+extern u32 *hv_vp_index;
+extern u32 hv_max_vp_index;
+extern struct hv_vp_assist_page **hv_vp_assist_page;
+
+static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
+{
+ if (!hv_vp_assist_page)
+ return NULL;
+
+ return hv_vp_assist_page[cpu];
+}
+
+/**
+ * hv_cpu_number_to_vp_number() - Map CPU to VP.
+ * @cpu_number: CPU number in Linux terms
+ *
+ * This function returns the mapping between the Linux processor
+ * number and the hypervisor's virtual processor number, useful
+ * in making hypercalls and such that talk about specific
+ * processors.
+ *
+ * Return: Virtual processor number in Hyper-V terms
+ */
+static inline int hv_cpu_number_to_vp_number(int cpu_number)
+{
+ return hv_vp_index[cpu_number];
+}
+
+static inline int cpumask_to_vpset(struct hv_vpset *vpset,
+ const struct cpumask *cpus)
+{
+ int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1;
+
+ /* valid_bank_mask can represent up to 64 banks */
+ if (hv_max_vp_index / 64 >= 64)
+ return 0;
+
+ /*
+ * Clear all banks up to the maximum possible bank as hv_tlb_flush_ex
+ * structs are not cleared between calls, we risk flushing unneeded
+ * vCPUs otherwise.
+ */
+ for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++)
+ vpset->bank_contents[vcpu_bank] = 0;
+
+ /*
+ * Some banks may end up being empty but this is acceptable.
+ */
+ for_each_cpu(cpu, cpus) {
+ vcpu = hv_cpu_number_to_vp_number(cpu);
+ if (vcpu == VP_INVAL)
+ return -1;
+ vcpu_bank = vcpu / 64;
+ vcpu_offset = vcpu % 64;
+ __set_bit(vcpu_offset, (unsigned long *)
+ &vpset->bank_contents[vcpu_bank]);
+ if (vcpu_bank >= nr_bank)
+ nr_bank = vcpu_bank + 1;
+ }
+ vpset->valid_bank_mask = GENMASK_ULL(nr_bank - 1, 0);
+ return nr_bank;
+}
+
+void __init hyperv_init(void);
+void hyperv_setup_mmu_ops(void);
+void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die);
+void hyperv_report_panic_msg(phys_addr_t pa, size_t size);
+bool hv_is_hyperv_initialized(void);
+void hyperv_cleanup(void);
+
+void hyperv_reenlightenment_intr(struct pt_regs *regs);
+void set_hv_tscchange_cb(void (*cb)(void));
+void clear_hv_tscchange_cb(void);
+void hyperv_stop_tsc_emulation(void);
+int hyperv_flush_guest_mapping(u64 as);
+
+#ifdef CONFIG_X86_64
+void hv_apic_init(void);
+#else
+static inline void hv_apic_init(void) {}
+#endif
+
+#else /* CONFIG_HYPERV */
+static inline void hyperv_init(void) {}
+static inline bool hv_is_hyperv_initialized(void) { return false; }
+static inline void hyperv_cleanup(void) {}
+static inline void hyperv_setup_mmu_ops(void) {}
+static inline void set_hv_tscchange_cb(void (*cb)(void)) {}
+static inline void clear_hv_tscchange_cb(void) {}
+static inline void hyperv_stop_tsc_emulation(void) {};
+static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
+{
+ return NULL;
+}
+static inline int hyperv_flush_guest_mapping(u64 as) { return -1; }
+#endif /* CONFIG_HYPERV */
+
+#ifdef CONFIG_HYPERV_TSCPAGE
+struct ms_hyperv_tsc_page *hv_get_tsc_page(void);
+static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
+ u64 *cur_tsc)
+{
+ u64 scale, offset;
+ u32 sequence;
+
+ /*
+ * The protocol for reading Hyper-V TSC page is specified in Hypervisor
+ * Top-Level Functional Specification ver. 3.0 and above. To get the
+ * reference time we must do the following:
+ * - READ ReferenceTscSequence
+ * A special '0' value indicates the time source is unreliable and we
+ * need to use something else. The currently published specification
+ * versions (up to 4.0b) contain a mistake and wrongly claim '-1'
+ * instead of '0' as the special value, see commit c35b82ef0294.
+ * - ReferenceTime =
+ * ((RDTSC() * ReferenceTscScale) >> 64) + ReferenceTscOffset
+ * - READ ReferenceTscSequence again. In case its value has changed
+ * since our first reading we need to discard ReferenceTime and repeat
+ * the whole sequence as the hypervisor was updating the page in
+ * between.
+ */
+ do {
+ sequence = READ_ONCE(tsc_pg->tsc_sequence);
+ if (!sequence)
+ return U64_MAX;
+ /*
+ * Make sure we read sequence before we read other values from
+ * TSC page.
+ */
+ smp_rmb();
+
+ scale = READ_ONCE(tsc_pg->tsc_scale);
+ offset = READ_ONCE(tsc_pg->tsc_offset);
+ *cur_tsc = rdtsc_ordered();
+
+ /*
+ * Make sure we read sequence after we read all other values
+ * from TSC page.
+ */
+ smp_rmb();
+
+ } while (READ_ONCE(tsc_pg->tsc_sequence) != sequence);
+
+ return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset;
+}
+
+static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
+{
+ u64 cur_tsc;
+
+ return hv_read_tsc_page_tsc(tsc_pg, &cur_tsc);
+}
+
+#else
+static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
+{
+ return NULL;
+}
+
+static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
+ u64 *cur_tsc)
+{
+ BUG();
+ return U64_MAX;
+}
+#endif
+#endif
diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h
new file mode 100644
index 000000000..25ddd0916
--- /dev/null
+++ b/arch/x86/include/asm/msi.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MSI_H
+#define _ASM_X86_MSI_H
+#include <asm/hw_irq.h>
+#include <asm/irqdomain.h>
+
+typedef struct irq_alloc_info msi_alloc_info_t;
+
+int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
+ msi_alloc_info_t *arg);
+
+void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc);
+
+#endif /* _ASM_X86_MSI_H */
diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h
new file mode 100644
index 000000000..ee2f8ccc3
--- /dev/null
+++ b/arch/x86/include/asm/msidef.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MSIDEF_H
+#define _ASM_X86_MSIDEF_H
+
+/*
+ * Constants for Intel APIC based MSI messages.
+ */
+
+/*
+ * Shifts for MSI data
+ */
+
+#define MSI_DATA_VECTOR_SHIFT 0
+#define MSI_DATA_VECTOR_MASK 0x000000ff
+#define MSI_DATA_VECTOR(v) (((v) << MSI_DATA_VECTOR_SHIFT) & \
+ MSI_DATA_VECTOR_MASK)
+
+#define MSI_DATA_DELIVERY_MODE_SHIFT 8
+#define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_MODE_SHIFT)
+#define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_MODE_SHIFT)
+
+#define MSI_DATA_LEVEL_SHIFT 14
+#define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT)
+#define MSI_DATA_LEVEL_ASSERT (1 << MSI_DATA_LEVEL_SHIFT)
+
+#define MSI_DATA_TRIGGER_SHIFT 15
+#define MSI_DATA_TRIGGER_EDGE (0 << MSI_DATA_TRIGGER_SHIFT)
+#define MSI_DATA_TRIGGER_LEVEL (1 << MSI_DATA_TRIGGER_SHIFT)
+
+/*
+ * Shift/mask fields for msi address
+ */
+
+#define MSI_ADDR_BASE_HI 0
+#define MSI_ADDR_BASE_LO 0xfee00000
+
+#define MSI_ADDR_DEST_MODE_SHIFT 2
+#define MSI_ADDR_DEST_MODE_PHYSICAL (0 << MSI_ADDR_DEST_MODE_SHIFT)
+#define MSI_ADDR_DEST_MODE_LOGICAL (1 << MSI_ADDR_DEST_MODE_SHIFT)
+
+#define MSI_ADDR_REDIRECTION_SHIFT 3
+#define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT)
+ /* dedicated cpu */
+#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT)
+ /* lowest priority */
+
+#define MSI_ADDR_DEST_ID_SHIFT 12
+#define MSI_ADDR_DEST_ID_MASK 0x00ffff0
+#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \
+ MSI_ADDR_DEST_ID_MASK)
+#define MSI_ADDR_EXT_DEST_ID(dest) ((dest) & 0xffffff00)
+
+#define MSI_ADDR_IR_EXT_INT (1 << 4)
+#define MSI_ADDR_IR_SHV (1 << 3)
+#define MSI_ADDR_IR_INDEX1(index) ((index & 0x8000) >> 13)
+#define MSI_ADDR_IR_INDEX2(index) ((index & 0x7fff) << 5)
+#endif /* _ASM_X86_MSIDEF_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
new file mode 100644
index 000000000..586be095e
--- /dev/null
+++ b/arch/x86/include/asm/msr-index.h
@@ -0,0 +1,844 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MSR_INDEX_H
+#define _ASM_X86_MSR_INDEX_H
+
+#include <linux/bits.h>
+
+/*
+ * CPU model specific register (MSR) numbers.
+ *
+ * Do not add new entries to this file unless the definitions are shared
+ * between multiple compilation units.
+ */
+
+/* x86-64 specific MSRs */
+#define MSR_EFER 0xc0000080 /* extended feature register */
+#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target */
+#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target */
+#define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target */
+#define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */
+#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */
+#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */
+#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */
+#define MSR_TSC_AUX 0xc0000103 /* Auxiliary TSC */
+
+/* EFER bits: */
+#define _EFER_SCE 0 /* SYSCALL/SYSRET */
+#define _EFER_LME 8 /* Long mode enable */
+#define _EFER_LMA 10 /* Long mode active (read-only) */
+#define _EFER_NX 11 /* No execute enable */
+#define _EFER_SVME 12 /* Enable virtualization */
+#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
+#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
+
+#define EFER_SCE (1<<_EFER_SCE)
+#define EFER_LME (1<<_EFER_LME)
+#define EFER_LMA (1<<_EFER_LMA)
+#define EFER_NX (1<<_EFER_NX)
+#define EFER_SVME (1<<_EFER_SVME)
+#define EFER_LMSLE (1<<_EFER_LMSLE)
+#define EFER_FFXSR (1<<_EFER_FFXSR)
+
+/* Intel MSRs. Some also available on other CPUs */
+
+#define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */
+#define SPEC_CTRL_IBRS BIT(0) /* Indirect Branch Restricted Speculation */
+#define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */
+#define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */
+#define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */
+#define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */
+
+#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
+#define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */
+
+#define MSR_PPIN_CTL 0x0000004e
+#define MSR_PPIN 0x0000004f
+
+#define MSR_IA32_PERFCTR0 0x000000c1
+#define MSR_IA32_PERFCTR1 0x000000c2
+#define MSR_FSB_FREQ 0x000000cd
+#define MSR_PLATFORM_INFO 0x000000ce
+#define MSR_PLATFORM_INFO_CPUID_FAULT_BIT 31
+#define MSR_PLATFORM_INFO_CPUID_FAULT BIT_ULL(MSR_PLATFORM_INFO_CPUID_FAULT_BIT)
+
+#define MSR_PKG_CST_CONFIG_CONTROL 0x000000e2
+#define NHM_C3_AUTO_DEMOTE (1UL << 25)
+#define NHM_C1_AUTO_DEMOTE (1UL << 26)
+#define ATM_LNC_C6_AUTO_DEMOTE (1UL << 25)
+#define SNB_C3_AUTO_UNDEMOTE (1UL << 27)
+#define SNB_C1_AUTO_UNDEMOTE (1UL << 28)
+
+#define MSR_MTRRcap 0x000000fe
+
+#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a
+#define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */
+#define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */
+#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */
+#define ARCH_CAP_SSB_NO BIT(4) /*
+ * Not susceptible to Speculative Store Bypass
+ * attack, so no Speculative Store Bypass
+ * control required.
+ */
+#define ARCH_CAP_MDS_NO BIT(5) /*
+ * Not susceptible to
+ * Microarchitectural Data
+ * Sampling (MDS) vulnerabilities.
+ */
+#define ARCH_CAP_PSCHANGE_MC_NO BIT(6) /*
+ * The processor is not susceptible to a
+ * machine check error due to modifying the
+ * code page size along with either the
+ * physical address or cache type
+ * without TLB invalidation.
+ */
+#define ARCH_CAP_TSX_CTRL_MSR BIT(7) /* MSR for TSX control is available. */
+#define ARCH_CAP_TAA_NO BIT(8) /*
+ * Not susceptible to
+ * TSX Async Abort (TAA) vulnerabilities.
+ */
+#define ARCH_CAP_SBDR_SSDP_NO BIT(13) /*
+ * Not susceptible to SBDR and SSDP
+ * variants of Processor MMIO stale data
+ * vulnerabilities.
+ */
+#define ARCH_CAP_FBSDP_NO BIT(14) /*
+ * Not susceptible to FBSDP variant of
+ * Processor MMIO stale data
+ * vulnerabilities.
+ */
+#define ARCH_CAP_PSDP_NO BIT(15) /*
+ * Not susceptible to PSDP variant of
+ * Processor MMIO stale data
+ * vulnerabilities.
+ */
+#define ARCH_CAP_FB_CLEAR BIT(17) /*
+ * VERW clears CPU fill buffer
+ * even on MDS_NO CPUs.
+ */
+#define ARCH_CAP_FB_CLEAR_CTRL BIT(18) /*
+ * MSR_IA32_MCU_OPT_CTRL[FB_CLEAR_DIS]
+ * bit available to control VERW
+ * behavior.
+ */
+
+#define MSR_IA32_FLUSH_CMD 0x0000010b
+#define L1D_FLUSH BIT(0) /*
+ * Writeback and invalidate the
+ * L1 data cache.
+ */
+
+#define MSR_IA32_BBL_CR_CTL 0x00000119
+#define MSR_IA32_BBL_CR_CTL3 0x0000011e
+
+#define MSR_IA32_TSX_CTRL 0x00000122
+#define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */
+#define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */
+
+/* SRBDS support */
+#define MSR_IA32_MCU_OPT_CTRL 0x00000123
+#define RNGDS_MITG_DIS BIT(0)
+#define FB_CLEAR_DIS BIT(3) /* CPU Fill buffer clear disable */
+
+#define MSR_IA32_SYSENTER_CS 0x00000174
+#define MSR_IA32_SYSENTER_ESP 0x00000175
+#define MSR_IA32_SYSENTER_EIP 0x00000176
+
+#define MSR_IA32_MCG_CAP 0x00000179
+#define MSR_IA32_MCG_STATUS 0x0000017a
+#define MSR_IA32_MCG_CTL 0x0000017b
+#define MSR_IA32_MCG_EXT_CTL 0x000004d0
+
+#define MSR_OFFCORE_RSP_0 0x000001a6
+#define MSR_OFFCORE_RSP_1 0x000001a7
+#define MSR_TURBO_RATIO_LIMIT 0x000001ad
+#define MSR_TURBO_RATIO_LIMIT1 0x000001ae
+#define MSR_TURBO_RATIO_LIMIT2 0x000001af
+
+#define MSR_LBR_SELECT 0x000001c8
+#define MSR_LBR_TOS 0x000001c9
+#define MSR_LBR_NHM_FROM 0x00000680
+#define MSR_LBR_NHM_TO 0x000006c0
+#define MSR_LBR_CORE_FROM 0x00000040
+#define MSR_LBR_CORE_TO 0x00000060
+
+#define MSR_LBR_INFO_0 0x00000dc0 /* ... 0xddf for _31 */
+#define LBR_INFO_MISPRED BIT_ULL(63)
+#define LBR_INFO_IN_TX BIT_ULL(62)
+#define LBR_INFO_ABORT BIT_ULL(61)
+#define LBR_INFO_CYCLES 0xffff
+
+#define MSR_IA32_PEBS_ENABLE 0x000003f1
+#define MSR_IA32_DS_AREA 0x00000600
+#define MSR_IA32_PERF_CAPABILITIES 0x00000345
+#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
+
+#define MSR_IA32_RTIT_CTL 0x00000570
+#define MSR_IA32_RTIT_STATUS 0x00000571
+#define MSR_IA32_RTIT_ADDR0_A 0x00000580
+#define MSR_IA32_RTIT_ADDR0_B 0x00000581
+#define MSR_IA32_RTIT_ADDR1_A 0x00000582
+#define MSR_IA32_RTIT_ADDR1_B 0x00000583
+#define MSR_IA32_RTIT_ADDR2_A 0x00000584
+#define MSR_IA32_RTIT_ADDR2_B 0x00000585
+#define MSR_IA32_RTIT_ADDR3_A 0x00000586
+#define MSR_IA32_RTIT_ADDR3_B 0x00000587
+#define MSR_IA32_RTIT_CR3_MATCH 0x00000572
+#define MSR_IA32_RTIT_OUTPUT_BASE 0x00000560
+#define MSR_IA32_RTIT_OUTPUT_MASK 0x00000561
+
+#define MSR_MTRRfix64K_00000 0x00000250
+#define MSR_MTRRfix16K_80000 0x00000258
+#define MSR_MTRRfix16K_A0000 0x00000259
+#define MSR_MTRRfix4K_C0000 0x00000268
+#define MSR_MTRRfix4K_C8000 0x00000269
+#define MSR_MTRRfix4K_D0000 0x0000026a
+#define MSR_MTRRfix4K_D8000 0x0000026b
+#define MSR_MTRRfix4K_E0000 0x0000026c
+#define MSR_MTRRfix4K_E8000 0x0000026d
+#define MSR_MTRRfix4K_F0000 0x0000026e
+#define MSR_MTRRfix4K_F8000 0x0000026f
+#define MSR_MTRRdefType 0x000002ff
+
+#define MSR_IA32_CR_PAT 0x00000277
+
+#define MSR_IA32_DEBUGCTLMSR 0x000001d9
+#define MSR_IA32_LASTBRANCHFROMIP 0x000001db
+#define MSR_IA32_LASTBRANCHTOIP 0x000001dc
+#define MSR_IA32_LASTINTFROMIP 0x000001dd
+#define MSR_IA32_LASTINTTOIP 0x000001de
+
+/* DEBUGCTLMSR bits (others vary by model): */
+#define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */
+#define DEBUGCTLMSR_BTF_SHIFT 1
+#define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */
+#define DEBUGCTLMSR_TR (1UL << 6)
+#define DEBUGCTLMSR_BTS (1UL << 7)
+#define DEBUGCTLMSR_BTINT (1UL << 8)
+#define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9)
+#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10)
+#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11)
+#define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14
+#define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT)
+
+#define MSR_PEBS_FRONTEND 0x000003f7
+
+#define MSR_IA32_POWER_CTL 0x000001fc
+
+#define MSR_IA32_MC0_CTL 0x00000400
+#define MSR_IA32_MC0_STATUS 0x00000401
+#define MSR_IA32_MC0_ADDR 0x00000402
+#define MSR_IA32_MC0_MISC 0x00000403
+
+/* C-state Residency Counters */
+#define MSR_PKG_C3_RESIDENCY 0x000003f8
+#define MSR_PKG_C6_RESIDENCY 0x000003f9
+#define MSR_ATOM_PKG_C6_RESIDENCY 0x000003fa
+#define MSR_PKG_C7_RESIDENCY 0x000003fa
+#define MSR_CORE_C3_RESIDENCY 0x000003fc
+#define MSR_CORE_C6_RESIDENCY 0x000003fd
+#define MSR_CORE_C7_RESIDENCY 0x000003fe
+#define MSR_KNL_CORE_C6_RESIDENCY 0x000003ff
+#define MSR_PKG_C2_RESIDENCY 0x0000060d
+#define MSR_PKG_C8_RESIDENCY 0x00000630
+#define MSR_PKG_C9_RESIDENCY 0x00000631
+#define MSR_PKG_C10_RESIDENCY 0x00000632
+
+/* Interrupt Response Limit */
+#define MSR_PKGC3_IRTL 0x0000060a
+#define MSR_PKGC6_IRTL 0x0000060b
+#define MSR_PKGC7_IRTL 0x0000060c
+#define MSR_PKGC8_IRTL 0x00000633
+#define MSR_PKGC9_IRTL 0x00000634
+#define MSR_PKGC10_IRTL 0x00000635
+
+/* Run Time Average Power Limiting (RAPL) Interface */
+
+#define MSR_RAPL_POWER_UNIT 0x00000606
+
+#define MSR_PKG_POWER_LIMIT 0x00000610
+#define MSR_PKG_ENERGY_STATUS 0x00000611
+#define MSR_PKG_PERF_STATUS 0x00000613
+#define MSR_PKG_POWER_INFO 0x00000614
+
+#define MSR_DRAM_POWER_LIMIT 0x00000618
+#define MSR_DRAM_ENERGY_STATUS 0x00000619
+#define MSR_DRAM_PERF_STATUS 0x0000061b
+#define MSR_DRAM_POWER_INFO 0x0000061c
+
+#define MSR_PP0_POWER_LIMIT 0x00000638
+#define MSR_PP0_ENERGY_STATUS 0x00000639
+#define MSR_PP0_POLICY 0x0000063a
+#define MSR_PP0_PERF_STATUS 0x0000063b
+
+#define MSR_PP1_POWER_LIMIT 0x00000640
+#define MSR_PP1_ENERGY_STATUS 0x00000641
+#define MSR_PP1_POLICY 0x00000642
+
+/* Config TDP MSRs */
+#define MSR_CONFIG_TDP_NOMINAL 0x00000648
+#define MSR_CONFIG_TDP_LEVEL_1 0x00000649
+#define MSR_CONFIG_TDP_LEVEL_2 0x0000064A
+#define MSR_CONFIG_TDP_CONTROL 0x0000064B
+#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C
+
+#define MSR_PLATFORM_ENERGY_STATUS 0x0000064D
+
+#define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658
+#define MSR_PKG_ANY_CORE_C0_RES 0x00000659
+#define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A
+#define MSR_PKG_BOTH_CORE_GFXE_C0_RES 0x0000065B
+
+#define MSR_CORE_C1_RES 0x00000660
+#define MSR_MODULE_C6_RES_MS 0x00000664
+
+#define MSR_CC6_DEMOTION_POLICY_CONFIG 0x00000668
+#define MSR_MC6_DEMOTION_POLICY_CONFIG 0x00000669
+
+#define MSR_ATOM_CORE_RATIOS 0x0000066a
+#define MSR_ATOM_CORE_VIDS 0x0000066b
+#define MSR_ATOM_CORE_TURBO_RATIOS 0x0000066c
+#define MSR_ATOM_CORE_TURBO_VIDS 0x0000066d
+
+
+#define MSR_CORE_PERF_LIMIT_REASONS 0x00000690
+#define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0
+#define MSR_RING_PERF_LIMIT_REASONS 0x000006B1
+
+/* Hardware P state interface */
+#define MSR_PPERF 0x0000064e
+#define MSR_PERF_LIMIT_REASONS 0x0000064f
+#define MSR_PM_ENABLE 0x00000770
+#define MSR_HWP_CAPABILITIES 0x00000771
+#define MSR_HWP_REQUEST_PKG 0x00000772
+#define MSR_HWP_INTERRUPT 0x00000773
+#define MSR_HWP_REQUEST 0x00000774
+#define MSR_HWP_STATUS 0x00000777
+
+/* CPUID.6.EAX */
+#define HWP_BASE_BIT (1<<7)
+#define HWP_NOTIFICATIONS_BIT (1<<8)
+#define HWP_ACTIVITY_WINDOW_BIT (1<<9)
+#define HWP_ENERGY_PERF_PREFERENCE_BIT (1<<10)
+#define HWP_PACKAGE_LEVEL_REQUEST_BIT (1<<11)
+
+/* IA32_HWP_CAPABILITIES */
+#define HWP_HIGHEST_PERF(x) (((x) >> 0) & 0xff)
+#define HWP_GUARANTEED_PERF(x) (((x) >> 8) & 0xff)
+#define HWP_MOSTEFFICIENT_PERF(x) (((x) >> 16) & 0xff)
+#define HWP_LOWEST_PERF(x) (((x) >> 24) & 0xff)
+
+/* IA32_HWP_REQUEST */
+#define HWP_MIN_PERF(x) (x & 0xff)
+#define HWP_MAX_PERF(x) ((x & 0xff) << 8)
+#define HWP_DESIRED_PERF(x) ((x & 0xff) << 16)
+#define HWP_ENERGY_PERF_PREFERENCE(x) (((unsigned long long) x & 0xff) << 24)
+#define HWP_EPP_PERFORMANCE 0x00
+#define HWP_EPP_BALANCE_PERFORMANCE 0x80
+#define HWP_EPP_BALANCE_POWERSAVE 0xC0
+#define HWP_EPP_POWERSAVE 0xFF
+#define HWP_ACTIVITY_WINDOW(x) ((unsigned long long)(x & 0xff3) << 32)
+#define HWP_PACKAGE_CONTROL(x) ((unsigned long long)(x & 0x1) << 42)
+
+/* IA32_HWP_STATUS */
+#define HWP_GUARANTEED_CHANGE(x) (x & 0x1)
+#define HWP_EXCURSION_TO_MINIMUM(x) (x & 0x4)
+
+/* IA32_HWP_INTERRUPT */
+#define HWP_CHANGE_TO_GUARANTEED_INT(x) (x & 0x1)
+#define HWP_EXCURSION_TO_MINIMUM_INT(x) (x & 0x2)
+
+#define MSR_AMD64_MC0_MASK 0xc0010044
+
+#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x))
+#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x))
+#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x))
+#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x))
+
+#define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x))
+
+/* These are consecutive and not in the normal 4er MCE bank block */
+#define MSR_IA32_MC0_CTL2 0x00000280
+#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x))
+
+#define MSR_P6_PERFCTR0 0x000000c1
+#define MSR_P6_PERFCTR1 0x000000c2
+#define MSR_P6_EVNTSEL0 0x00000186
+#define MSR_P6_EVNTSEL1 0x00000187
+
+#define MSR_KNC_PERFCTR0 0x00000020
+#define MSR_KNC_PERFCTR1 0x00000021
+#define MSR_KNC_EVNTSEL0 0x00000028
+#define MSR_KNC_EVNTSEL1 0x00000029
+
+/* Alternative perfctr range with full access. */
+#define MSR_IA32_PMC0 0x000004c1
+
+/* AMD64 MSRs. Not complete. See the architecture manual for a more
+ complete list. */
+
+#define MSR_AMD64_PATCH_LEVEL 0x0000008b
+#define MSR_AMD64_TSC_RATIO 0xc0000104
+#define MSR_AMD64_NB_CFG 0xc001001f
+#define MSR_AMD64_CPUID_FN_1 0xc0011004
+#define MSR_AMD64_PATCH_LOADER 0xc0010020
+#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
+#define MSR_AMD64_OSVW_STATUS 0xc0010141
+#define MSR_AMD64_LS_CFG 0xc0011020
+#define MSR_AMD64_DC_CFG 0xc0011022
+#define MSR_AMD64_BU_CFG2 0xc001102a
+#define MSR_AMD64_IBSFETCHCTL 0xc0011030
+#define MSR_AMD64_IBSFETCHLINAD 0xc0011031
+#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032
+#define MSR_AMD64_IBSFETCH_REG_COUNT 3
+#define MSR_AMD64_IBSFETCH_REG_MASK ((1UL<<MSR_AMD64_IBSFETCH_REG_COUNT)-1)
+#define MSR_AMD64_IBSOPCTL 0xc0011033
+#define MSR_AMD64_IBSOPRIP 0xc0011034
+#define MSR_AMD64_IBSOPDATA 0xc0011035
+#define MSR_AMD64_IBSOPDATA2 0xc0011036
+#define MSR_AMD64_IBSOPDATA3 0xc0011037
+#define MSR_AMD64_IBSDCLINAD 0xc0011038
+#define MSR_AMD64_IBSDCPHYSAD 0xc0011039
+#define MSR_AMD64_IBSOP_REG_COUNT 7
+#define MSR_AMD64_IBSOP_REG_MASK ((1UL<<MSR_AMD64_IBSOP_REG_COUNT)-1)
+#define MSR_AMD64_IBSCTL 0xc001103a
+#define MSR_AMD64_IBSBRTARGET 0xc001103b
+#define MSR_AMD64_ICIBSEXTDCTL 0xc001103c
+#define MSR_AMD64_IBSOPDATA4 0xc001103d
+#define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */
+#define MSR_AMD64_SEV 0xc0010131
+#define MSR_AMD64_SEV_ENABLED_BIT 0
+#define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT)
+
+#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
+
+/* Fam 17h MSRs */
+#define MSR_F17H_IRPERF 0xc00000e9
+
+/* Fam 16h MSRs */
+#define MSR_F16H_L2I_PERF_CTL 0xc0010230
+#define MSR_F16H_L2I_PERF_CTR 0xc0010231
+#define MSR_F16H_DR1_ADDR_MASK 0xc0011019
+#define MSR_F16H_DR2_ADDR_MASK 0xc001101a
+#define MSR_F16H_DR3_ADDR_MASK 0xc001101b
+#define MSR_F16H_DR0_ADDR_MASK 0xc0011027
+
+/* Fam 15h MSRs */
+#define MSR_F15H_PERF_CTL 0xc0010200
+#define MSR_F15H_PERF_CTL0 MSR_F15H_PERF_CTL
+#define MSR_F15H_PERF_CTL1 (MSR_F15H_PERF_CTL + 2)
+#define MSR_F15H_PERF_CTL2 (MSR_F15H_PERF_CTL + 4)
+#define MSR_F15H_PERF_CTL3 (MSR_F15H_PERF_CTL + 6)
+#define MSR_F15H_PERF_CTL4 (MSR_F15H_PERF_CTL + 8)
+#define MSR_F15H_PERF_CTL5 (MSR_F15H_PERF_CTL + 10)
+
+#define MSR_F15H_PERF_CTR 0xc0010201
+#define MSR_F15H_PERF_CTR0 MSR_F15H_PERF_CTR
+#define MSR_F15H_PERF_CTR1 (MSR_F15H_PERF_CTR + 2)
+#define MSR_F15H_PERF_CTR2 (MSR_F15H_PERF_CTR + 4)
+#define MSR_F15H_PERF_CTR3 (MSR_F15H_PERF_CTR + 6)
+#define MSR_F15H_PERF_CTR4 (MSR_F15H_PERF_CTR + 8)
+#define MSR_F15H_PERF_CTR5 (MSR_F15H_PERF_CTR + 10)
+
+#define MSR_F15H_NB_PERF_CTL 0xc0010240
+#define MSR_F15H_NB_PERF_CTR 0xc0010241
+#define MSR_F15H_PTSC 0xc0010280
+#define MSR_F15H_IC_CFG 0xc0011021
+#define MSR_F15H_EX_CFG 0xc001102c
+
+/* Fam 10h MSRs */
+#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
+#define FAM10H_MMIO_CONF_ENABLE (1<<0)
+#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf
+#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
+#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL
+#define FAM10H_MMIO_CONF_BASE_SHIFT 20
+#define MSR_FAM10H_NODE_ID 0xc001100c
+#define MSR_F10H_DECFG 0xc0011029
+#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1
+#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT)
+
+/* K8 MSRs */
+#define MSR_K8_TOP_MEM1 0xc001001a
+#define MSR_K8_TOP_MEM2 0xc001001d
+#define MSR_K8_SYSCFG 0xc0010010
+#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23
+#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT)
+#define MSR_K8_INT_PENDING_MSG 0xc0010055
+/* C1E active bits in int pending message */
+#define K8_INTP_C1E_ACTIVE_MASK 0x18000000
+#define MSR_K8_TSEG_ADDR 0xc0010112
+#define MSR_K8_TSEG_MASK 0xc0010113
+#define K8_MTRRFIXRANGE_DRAM_ENABLE 0x00040000 /* MtrrFixDramEn bit */
+#define K8_MTRRFIXRANGE_DRAM_MODIFY 0x00080000 /* MtrrFixDramModEn bit */
+#define K8_MTRR_RDMEM_WRMEM_MASK 0x18181818 /* Mask: RdMem|WrMem */
+
+/* K7 MSRs */
+#define MSR_K7_EVNTSEL0 0xc0010000
+#define MSR_K7_PERFCTR0 0xc0010004
+#define MSR_K7_EVNTSEL1 0xc0010001
+#define MSR_K7_PERFCTR1 0xc0010005
+#define MSR_K7_EVNTSEL2 0xc0010002
+#define MSR_K7_PERFCTR2 0xc0010006
+#define MSR_K7_EVNTSEL3 0xc0010003
+#define MSR_K7_PERFCTR3 0xc0010007
+#define MSR_K7_CLK_CTL 0xc001001b
+#define MSR_K7_HWCR 0xc0010015
+#define MSR_K7_HWCR_SMMLOCK_BIT 0
+#define MSR_K7_HWCR_SMMLOCK BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT)
+#define MSR_K7_HWCR_IRPERF_EN_BIT 30
+#define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT)
+#define MSR_K7_FID_VID_CTL 0xc0010041
+#define MSR_K7_FID_VID_STATUS 0xc0010042
+
+/* K6 MSRs */
+#define MSR_K6_WHCR 0xc0000082
+#define MSR_K6_UWCCR 0xc0000085
+#define MSR_K6_EPMR 0xc0000086
+#define MSR_K6_PSOR 0xc0000087
+#define MSR_K6_PFIR 0xc0000088
+
+/* Centaur-Hauls/IDT defined MSRs. */
+#define MSR_IDT_FCR1 0x00000107
+#define MSR_IDT_FCR2 0x00000108
+#define MSR_IDT_FCR3 0x00000109
+#define MSR_IDT_FCR4 0x0000010a
+
+#define MSR_IDT_MCR0 0x00000110
+#define MSR_IDT_MCR1 0x00000111
+#define MSR_IDT_MCR2 0x00000112
+#define MSR_IDT_MCR3 0x00000113
+#define MSR_IDT_MCR4 0x00000114
+#define MSR_IDT_MCR5 0x00000115
+#define MSR_IDT_MCR6 0x00000116
+#define MSR_IDT_MCR7 0x00000117
+#define MSR_IDT_MCR_CTRL 0x00000120
+
+/* VIA Cyrix defined MSRs*/
+#define MSR_VIA_FCR 0x00001107
+#define MSR_VIA_LONGHAUL 0x0000110a
+#define MSR_VIA_RNG 0x0000110b
+#define MSR_VIA_BCR2 0x00001147
+
+/* Transmeta defined MSRs */
+#define MSR_TMTA_LONGRUN_CTRL 0x80868010
+#define MSR_TMTA_LONGRUN_FLAGS 0x80868011
+#define MSR_TMTA_LRTI_READOUT 0x80868018
+#define MSR_TMTA_LRTI_VOLT_MHZ 0x8086801a
+
+/* Intel defined MSRs. */
+#define MSR_IA32_P5_MC_ADDR 0x00000000
+#define MSR_IA32_P5_MC_TYPE 0x00000001
+#define MSR_IA32_TSC 0x00000010
+#define MSR_IA32_PLATFORM_ID 0x00000017
+#define MSR_IA32_EBL_CR_POWERON 0x0000002a
+#define MSR_EBC_FREQUENCY_ID 0x0000002c
+#define MSR_SMI_COUNT 0x00000034
+#define MSR_IA32_FEATURE_CONTROL 0x0000003a
+#define MSR_IA32_TSC_ADJUST 0x0000003b
+#define MSR_IA32_BNDCFGS 0x00000d90
+
+#define MSR_IA32_BNDCFGS_RSVD 0x00000ffc
+
+#define MSR_IA32_XSS 0x00000da0
+
+#define FEATURE_CONTROL_LOCKED (1<<0)
+#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
+#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2)
+#define FEATURE_CONTROL_LMCE (1<<20)
+
+#define MSR_IA32_APICBASE 0x0000001b
+#define MSR_IA32_APICBASE_BSP (1<<8)
+#define MSR_IA32_APICBASE_ENABLE (1<<11)
+#define MSR_IA32_APICBASE_BASE (0xfffff<<12)
+
+#define MSR_IA32_TSCDEADLINE 0x000006e0
+
+#define MSR_IA32_UCODE_WRITE 0x00000079
+#define MSR_IA32_UCODE_REV 0x0000008b
+
+#define MSR_IA32_SMM_MONITOR_CTL 0x0000009b
+#define MSR_IA32_SMBASE 0x0000009e
+
+#define MSR_IA32_PERF_STATUS 0x00000198
+#define MSR_IA32_PERF_CTL 0x00000199
+#define INTEL_PERF_CTL_MASK 0xffff
+#define MSR_AMD_PSTATE_DEF_BASE 0xc0010064
+#define MSR_AMD_PERF_STATUS 0xc0010063
+#define MSR_AMD_PERF_CTL 0xc0010062
+
+#define MSR_IA32_MPERF 0x000000e7
+#define MSR_IA32_APERF 0x000000e8
+
+#define MSR_IA32_THERM_CONTROL 0x0000019a
+#define MSR_IA32_THERM_INTERRUPT 0x0000019b
+
+#define THERM_INT_HIGH_ENABLE (1 << 0)
+#define THERM_INT_LOW_ENABLE (1 << 1)
+#define THERM_INT_PLN_ENABLE (1 << 24)
+
+#define MSR_IA32_THERM_STATUS 0x0000019c
+
+#define THERM_STATUS_PROCHOT (1 << 0)
+#define THERM_STATUS_POWER_LIMIT (1 << 10)
+
+#define MSR_THERM2_CTL 0x0000019d
+
+#define MSR_THERM2_CTL_TM_SELECT (1ULL << 16)
+
+#define MSR_IA32_MISC_ENABLE 0x000001a0
+
+#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2
+
+#define MSR_MISC_FEATURE_CONTROL 0x000001a4
+#define MSR_MISC_PWR_MGMT 0x000001aa
+
+#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0
+#define ENERGY_PERF_BIAS_PERFORMANCE 0
+#define ENERGY_PERF_BIAS_BALANCE_PERFORMANCE 4
+#define ENERGY_PERF_BIAS_NORMAL 6
+#define ENERGY_PERF_BIAS_BALANCE_POWERSAVE 8
+#define ENERGY_PERF_BIAS_POWERSAVE 15
+
+#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1
+
+#define PACKAGE_THERM_STATUS_PROCHOT (1 << 0)
+#define PACKAGE_THERM_STATUS_POWER_LIMIT (1 << 10)
+
+#define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2
+
+#define PACKAGE_THERM_INT_HIGH_ENABLE (1 << 0)
+#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1)
+#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24)
+
+/* Thermal Thresholds Support */
+#define THERM_INT_THRESHOLD0_ENABLE (1 << 15)
+#define THERM_SHIFT_THRESHOLD0 8
+#define THERM_MASK_THRESHOLD0 (0x7f << THERM_SHIFT_THRESHOLD0)
+#define THERM_INT_THRESHOLD1_ENABLE (1 << 23)
+#define THERM_SHIFT_THRESHOLD1 16
+#define THERM_MASK_THRESHOLD1 (0x7f << THERM_SHIFT_THRESHOLD1)
+#define THERM_STATUS_THRESHOLD0 (1 << 6)
+#define THERM_LOG_THRESHOLD0 (1 << 7)
+#define THERM_STATUS_THRESHOLD1 (1 << 8)
+#define THERM_LOG_THRESHOLD1 (1 << 9)
+
+/* MISC_ENABLE bits: architectural */
+#define MSR_IA32_MISC_ENABLE_FAST_STRING_BIT 0
+#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << MSR_IA32_MISC_ENABLE_FAST_STRING_BIT)
+#define MSR_IA32_MISC_ENABLE_TCC_BIT 1
+#define MSR_IA32_MISC_ENABLE_TCC (1ULL << MSR_IA32_MISC_ENABLE_TCC_BIT)
+#define MSR_IA32_MISC_ENABLE_EMON_BIT 7
+#define MSR_IA32_MISC_ENABLE_EMON (1ULL << MSR_IA32_MISC_ENABLE_EMON_BIT)
+#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL_BIT 11
+#define MSR_IA32_MISC_ENABLE_BTS_UNAVAIL (1ULL << MSR_IA32_MISC_ENABLE_BTS_UNAVAIL_BIT)
+#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL_BIT 12
+#define MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL (1ULL << MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL_BIT)
+#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP_BIT 16
+#define MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP (1ULL << MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP_BIT)
+#define MSR_IA32_MISC_ENABLE_MWAIT_BIT 18
+#define MSR_IA32_MISC_ENABLE_MWAIT (1ULL << MSR_IA32_MISC_ENABLE_MWAIT_BIT)
+#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT 22
+#define MSR_IA32_MISC_ENABLE_LIMIT_CPUID (1ULL << MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT)
+#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT 23
+#define MSR_IA32_MISC_ENABLE_XTPR_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_XTPR_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT 34
+#define MSR_IA32_MISC_ENABLE_XD_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_XD_DISABLE_BIT)
+
+/* MISC_ENABLE bits: model-specific, meaning may vary from core to core */
+#define MSR_IA32_MISC_ENABLE_X87_COMPAT_BIT 2
+#define MSR_IA32_MISC_ENABLE_X87_COMPAT (1ULL << MSR_IA32_MISC_ENABLE_X87_COMPAT_BIT)
+#define MSR_IA32_MISC_ENABLE_TM1_BIT 3
+#define MSR_IA32_MISC_ENABLE_TM1 (1ULL << MSR_IA32_MISC_ENABLE_TM1_BIT)
+#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE_BIT 4
+#define MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_SPLIT_LOCK_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE_BIT 6
+#define MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_L3CACHE_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK_BIT 8
+#define MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK (1ULL << MSR_IA32_MISC_ENABLE_SUPPRESS_LOCK_BIT)
+#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT 9
+#define MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_FERR_BIT 10
+#define MSR_IA32_MISC_ENABLE_FERR (1ULL << MSR_IA32_MISC_ENABLE_FERR_BIT)
+#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT 10
+#define MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX (1ULL << MSR_IA32_MISC_ENABLE_FERR_MULTIPLEX_BIT)
+#define MSR_IA32_MISC_ENABLE_TM2_BIT 13
+#define MSR_IA32_MISC_ENABLE_TM2 (1ULL << MSR_IA32_MISC_ENABLE_TM2_BIT)
+#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT 19
+#define MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_ADJ_PREF_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK_BIT 20
+#define MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK (1ULL << MSR_IA32_MISC_ENABLE_SPEEDSTEP_LOCK_BIT)
+#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT_BIT 24
+#define MSR_IA32_MISC_ENABLE_L1D_CONTEXT (1ULL << MSR_IA32_MISC_ENABLE_L1D_CONTEXT_BIT)
+#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE_BIT 37
+#define MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_DCU_PREF_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE_BIT 38
+#define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_TURBO_DISABLE_BIT)
+#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT 39
+#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT)
+
+/* MISC_FEATURES_ENABLES non-architectural features */
+#define MSR_MISC_FEATURES_ENABLES 0x00000140
+
+#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT 0
+#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT BIT_ULL(MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT)
+#define MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT 1
+
+#define MSR_IA32_TSC_DEADLINE 0x000006E0
+
+
+#define MSR_TSX_FORCE_ABORT 0x0000010F
+
+#define MSR_TFA_RTM_FORCE_ABORT_BIT 0
+#define MSR_TFA_RTM_FORCE_ABORT BIT_ULL(MSR_TFA_RTM_FORCE_ABORT_BIT)
+
+/* P4/Xeon+ specific */
+#define MSR_IA32_MCG_EAX 0x00000180
+#define MSR_IA32_MCG_EBX 0x00000181
+#define MSR_IA32_MCG_ECX 0x00000182
+#define MSR_IA32_MCG_EDX 0x00000183
+#define MSR_IA32_MCG_ESI 0x00000184
+#define MSR_IA32_MCG_EDI 0x00000185
+#define MSR_IA32_MCG_EBP 0x00000186
+#define MSR_IA32_MCG_ESP 0x00000187
+#define MSR_IA32_MCG_EFLAGS 0x00000188
+#define MSR_IA32_MCG_EIP 0x00000189
+#define MSR_IA32_MCG_RESERVED 0x0000018a
+
+/* Pentium IV performance counter MSRs */
+#define MSR_P4_BPU_PERFCTR0 0x00000300
+#define MSR_P4_BPU_PERFCTR1 0x00000301
+#define MSR_P4_BPU_PERFCTR2 0x00000302
+#define MSR_P4_BPU_PERFCTR3 0x00000303
+#define MSR_P4_MS_PERFCTR0 0x00000304
+#define MSR_P4_MS_PERFCTR1 0x00000305
+#define MSR_P4_MS_PERFCTR2 0x00000306
+#define MSR_P4_MS_PERFCTR3 0x00000307
+#define MSR_P4_FLAME_PERFCTR0 0x00000308
+#define MSR_P4_FLAME_PERFCTR1 0x00000309
+#define MSR_P4_FLAME_PERFCTR2 0x0000030a
+#define MSR_P4_FLAME_PERFCTR3 0x0000030b
+#define MSR_P4_IQ_PERFCTR0 0x0000030c
+#define MSR_P4_IQ_PERFCTR1 0x0000030d
+#define MSR_P4_IQ_PERFCTR2 0x0000030e
+#define MSR_P4_IQ_PERFCTR3 0x0000030f
+#define MSR_P4_IQ_PERFCTR4 0x00000310
+#define MSR_P4_IQ_PERFCTR5 0x00000311
+#define MSR_P4_BPU_CCCR0 0x00000360
+#define MSR_P4_BPU_CCCR1 0x00000361
+#define MSR_P4_BPU_CCCR2 0x00000362
+#define MSR_P4_BPU_CCCR3 0x00000363
+#define MSR_P4_MS_CCCR0 0x00000364
+#define MSR_P4_MS_CCCR1 0x00000365
+#define MSR_P4_MS_CCCR2 0x00000366
+#define MSR_P4_MS_CCCR3 0x00000367
+#define MSR_P4_FLAME_CCCR0 0x00000368
+#define MSR_P4_FLAME_CCCR1 0x00000369
+#define MSR_P4_FLAME_CCCR2 0x0000036a
+#define MSR_P4_FLAME_CCCR3 0x0000036b
+#define MSR_P4_IQ_CCCR0 0x0000036c
+#define MSR_P4_IQ_CCCR1 0x0000036d
+#define MSR_P4_IQ_CCCR2 0x0000036e
+#define MSR_P4_IQ_CCCR3 0x0000036f
+#define MSR_P4_IQ_CCCR4 0x00000370
+#define MSR_P4_IQ_CCCR5 0x00000371
+#define MSR_P4_ALF_ESCR0 0x000003ca
+#define MSR_P4_ALF_ESCR1 0x000003cb
+#define MSR_P4_BPU_ESCR0 0x000003b2
+#define MSR_P4_BPU_ESCR1 0x000003b3
+#define MSR_P4_BSU_ESCR0 0x000003a0
+#define MSR_P4_BSU_ESCR1 0x000003a1
+#define MSR_P4_CRU_ESCR0 0x000003b8
+#define MSR_P4_CRU_ESCR1 0x000003b9
+#define MSR_P4_CRU_ESCR2 0x000003cc
+#define MSR_P4_CRU_ESCR3 0x000003cd
+#define MSR_P4_CRU_ESCR4 0x000003e0
+#define MSR_P4_CRU_ESCR5 0x000003e1
+#define MSR_P4_DAC_ESCR0 0x000003a8
+#define MSR_P4_DAC_ESCR1 0x000003a9
+#define MSR_P4_FIRM_ESCR0 0x000003a4
+#define MSR_P4_FIRM_ESCR1 0x000003a5
+#define MSR_P4_FLAME_ESCR0 0x000003a6
+#define MSR_P4_FLAME_ESCR1 0x000003a7
+#define MSR_P4_FSB_ESCR0 0x000003a2
+#define MSR_P4_FSB_ESCR1 0x000003a3
+#define MSR_P4_IQ_ESCR0 0x000003ba
+#define MSR_P4_IQ_ESCR1 0x000003bb
+#define MSR_P4_IS_ESCR0 0x000003b4
+#define MSR_P4_IS_ESCR1 0x000003b5
+#define MSR_P4_ITLB_ESCR0 0x000003b6
+#define MSR_P4_ITLB_ESCR1 0x000003b7
+#define MSR_P4_IX_ESCR0 0x000003c8
+#define MSR_P4_IX_ESCR1 0x000003c9
+#define MSR_P4_MOB_ESCR0 0x000003aa
+#define MSR_P4_MOB_ESCR1 0x000003ab
+#define MSR_P4_MS_ESCR0 0x000003c0
+#define MSR_P4_MS_ESCR1 0x000003c1
+#define MSR_P4_PMH_ESCR0 0x000003ac
+#define MSR_P4_PMH_ESCR1 0x000003ad
+#define MSR_P4_RAT_ESCR0 0x000003bc
+#define MSR_P4_RAT_ESCR1 0x000003bd
+#define MSR_P4_SAAT_ESCR0 0x000003ae
+#define MSR_P4_SAAT_ESCR1 0x000003af
+#define MSR_P4_SSU_ESCR0 0x000003be
+#define MSR_P4_SSU_ESCR1 0x000003bf /* guess: not in manual */
+
+#define MSR_P4_TBPU_ESCR0 0x000003c2
+#define MSR_P4_TBPU_ESCR1 0x000003c3
+#define MSR_P4_TC_ESCR0 0x000003c4
+#define MSR_P4_TC_ESCR1 0x000003c5
+#define MSR_P4_U2L_ESCR0 0x000003b0
+#define MSR_P4_U2L_ESCR1 0x000003b1
+
+#define MSR_P4_PEBS_MATRIX_VERT 0x000003f2
+
+/* Intel Core-based CPU performance counters */
+#define MSR_CORE_PERF_FIXED_CTR0 0x00000309
+#define MSR_CORE_PERF_FIXED_CTR1 0x0000030a
+#define MSR_CORE_PERF_FIXED_CTR2 0x0000030b
+#define MSR_CORE_PERF_FIXED_CTR_CTRL 0x0000038d
+#define MSR_CORE_PERF_GLOBAL_STATUS 0x0000038e
+#define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f
+#define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390
+
+/* Geode defined MSRs */
+#define MSR_GEODE_BUSCONT_CONF0 0x00001900
+
+/* Intel VT MSRs */
+#define MSR_IA32_VMX_BASIC 0x00000480
+#define MSR_IA32_VMX_PINBASED_CTLS 0x00000481
+#define MSR_IA32_VMX_PROCBASED_CTLS 0x00000482
+#define MSR_IA32_VMX_EXIT_CTLS 0x00000483
+#define MSR_IA32_VMX_ENTRY_CTLS 0x00000484
+#define MSR_IA32_VMX_MISC 0x00000485
+#define MSR_IA32_VMX_CR0_FIXED0 0x00000486
+#define MSR_IA32_VMX_CR0_FIXED1 0x00000487
+#define MSR_IA32_VMX_CR4_FIXED0 0x00000488
+#define MSR_IA32_VMX_CR4_FIXED1 0x00000489
+#define MSR_IA32_VMX_VMCS_ENUM 0x0000048a
+#define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b
+#define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c
+#define MSR_IA32_VMX_TRUE_PINBASED_CTLS 0x0000048d
+#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e
+#define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f
+#define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490
+#define MSR_IA32_VMX_VMFUNC 0x00000491
+
+/* VMX_BASIC bits and bitmasks */
+#define VMX_BASIC_VMCS_SIZE_SHIFT 32
+#define VMX_BASIC_TRUE_CTLS (1ULL << 55)
+#define VMX_BASIC_64 0x0001000000000000LLU
+#define VMX_BASIC_MEM_TYPE_SHIFT 50
+#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU
+#define VMX_BASIC_MEM_TYPE_WB 6LLU
+#define VMX_BASIC_INOUT 0x0040000000000000LLU
+
+/* MSR_IA32_VMX_MISC bits */
+#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
+#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F
+/* AMD-V MSRs */
+
+#define MSR_VM_CR 0xc0010114
+#define MSR_VM_IGNNE 0xc0010115
+#define MSR_VM_HSAVE_PA 0xc0010117
+
+#endif /* _ASM_X86_MSR_INDEX_H */
diff --git a/arch/x86/include/asm/msr-trace.h b/arch/x86/include/asm/msr-trace.h
new file mode 100644
index 000000000..f6adbe968
--- /dev/null
+++ b/arch/x86/include/asm/msr-trace.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM msr
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE msr-trace
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH asm/
+
+#if !defined(_TRACE_MSR_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MSR_H
+
+#include <linux/tracepoint.h>
+
+/*
+ * Tracing for x86 model specific registers. Directly maps to the
+ * RDMSR/WRMSR instructions.
+ */
+
+DECLARE_EVENT_CLASS(msr_trace_class,
+ TP_PROTO(unsigned msr, u64 val, int failed),
+ TP_ARGS(msr, val, failed),
+ TP_STRUCT__entry(
+ __field( unsigned, msr )
+ __field( u64, val )
+ __field( int, failed )
+ ),
+ TP_fast_assign(
+ __entry->msr = msr;
+ __entry->val = val;
+ __entry->failed = failed;
+ ),
+ TP_printk("%x, value %llx%s",
+ __entry->msr,
+ __entry->val,
+ __entry->failed ? " #GP" : "")
+);
+
+DEFINE_EVENT(msr_trace_class, read_msr,
+ TP_PROTO(unsigned msr, u64 val, int failed),
+ TP_ARGS(msr, val, failed)
+);
+
+DEFINE_EVENT(msr_trace_class, write_msr,
+ TP_PROTO(unsigned msr, u64 val, int failed),
+ TP_ARGS(msr, val, failed)
+);
+
+DEFINE_EVENT(msr_trace_class, rdpmc,
+ TP_PROTO(unsigned msr, u64 val, int failed),
+ TP_ARGS(msr, val, failed)
+);
+
+#endif /* _TRACE_MSR_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
new file mode 100644
index 000000000..2571e2017
--- /dev/null
+++ b/arch/x86/include/asm/msr.h
@@ -0,0 +1,398 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MSR_H
+#define _ASM_X86_MSR_H
+
+#include "msr-index.h"
+
+#ifndef __ASSEMBLY__
+
+#include <asm/asm.h>
+#include <asm/errno.h>
+#include <asm/cpumask.h>
+#include <uapi/asm/msr.h>
+
+struct msr {
+ union {
+ struct {
+ u32 l;
+ u32 h;
+ };
+ u64 q;
+ };
+};
+
+struct msr_info {
+ u32 msr_no;
+ struct msr reg;
+ struct msr *msrs;
+ int err;
+};
+
+struct msr_regs_info {
+ u32 *regs;
+ int err;
+};
+
+struct saved_msr {
+ bool valid;
+ struct msr_info info;
+};
+
+struct saved_msrs {
+ unsigned int num;
+ struct saved_msr *array;
+};
+
+/*
+ * both i386 and x86_64 returns 64-bit value in edx:eax, but gcc's "A"
+ * constraint has different meanings. For i386, "A" means exactly
+ * edx:eax, while for x86_64 it doesn't mean rdx:rax or edx:eax. Instead,
+ * it means rax *or* rdx.
+ */
+#ifdef CONFIG_X86_64
+/* Using 64-bit values saves one instruction clearing the high half of low */
+#define DECLARE_ARGS(val, low, high) unsigned long low, high
+#define EAX_EDX_VAL(val, low, high) ((low) | (high) << 32)
+#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high)
+#else
+#define DECLARE_ARGS(val, low, high) unsigned long long val
+#define EAX_EDX_VAL(val, low, high) (val)
+#define EAX_EDX_RET(val, low, high) "=A" (val)
+#endif
+
+#ifdef CONFIG_TRACEPOINTS
+/*
+ * Be very careful with includes. This header is prone to include loops.
+ */
+#include <asm/atomic.h>
+#include <linux/tracepoint-defs.h>
+
+extern struct tracepoint __tracepoint_read_msr;
+extern struct tracepoint __tracepoint_write_msr;
+extern struct tracepoint __tracepoint_rdpmc;
+#define msr_tracepoint_active(t) static_key_false(&(t).key)
+extern void do_trace_write_msr(unsigned int msr, u64 val, int failed);
+extern void do_trace_read_msr(unsigned int msr, u64 val, int failed);
+extern void do_trace_rdpmc(unsigned int msr, u64 val, int failed);
+#else
+#define msr_tracepoint_active(t) false
+static inline void do_trace_write_msr(unsigned int msr, u64 val, int failed) {}
+static inline void do_trace_read_msr(unsigned int msr, u64 val, int failed) {}
+static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {}
+#endif
+
+/*
+ * __rdmsr() and __wrmsr() are the two primitives which are the bare minimum MSR
+ * accessors and should not have any tracing or other functionality piggybacking
+ * on them - those are *purely* for accessing MSRs and nothing more. So don't even
+ * think of extending them - you will be slapped with a stinking trout or a frozen
+ * shark will reach you, wherever you are! You've been warned.
+ */
+static __always_inline unsigned long long __rdmsr(unsigned int msr)
+{
+ DECLARE_ARGS(val, low, high);
+
+ asm volatile("1: rdmsr\n"
+ "2:\n"
+ _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe)
+ : EAX_EDX_RET(val, low, high) : "c" (msr));
+
+ return EAX_EDX_VAL(val, low, high);
+}
+
+static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high)
+{
+ asm volatile("1: wrmsr\n"
+ "2:\n"
+ _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe)
+ : : "c" (msr), "a"(low), "d" (high) : "memory");
+}
+
+#define native_rdmsr(msr, val1, val2) \
+do { \
+ u64 __val = __rdmsr((msr)); \
+ (void)((val1) = (u32)__val); \
+ (void)((val2) = (u32)(__val >> 32)); \
+} while (0)
+
+#define native_wrmsr(msr, low, high) \
+ __wrmsr(msr, low, high)
+
+#define native_wrmsrl(msr, val) \
+ __wrmsr((msr), (u32)((u64)(val)), \
+ (u32)((u64)(val) >> 32))
+
+static inline unsigned long long native_read_msr(unsigned int msr)
+{
+ unsigned long long val;
+
+ val = __rdmsr(msr);
+
+ if (msr_tracepoint_active(__tracepoint_read_msr))
+ do_trace_read_msr(msr, val, 0);
+
+ return val;
+}
+
+static inline unsigned long long native_read_msr_safe(unsigned int msr,
+ int *err)
+{
+ DECLARE_ARGS(val, low, high);
+
+ asm volatile("2: rdmsr ; xor %[err],%[err]\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: mov %[fault],%[err]\n\t"
+ "xorl %%eax, %%eax\n\t"
+ "xorl %%edx, %%edx\n\t"
+ "jmp 1b\n\t"
+ ".previous\n\t"
+ _ASM_EXTABLE(2b, 3b)
+ : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
+ : "c" (msr), [fault] "i" (-EIO));
+ if (msr_tracepoint_active(__tracepoint_read_msr))
+ do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err);
+ return EAX_EDX_VAL(val, low, high);
+}
+
+/* Can be uninlined because referenced by paravirt */
+static inline void notrace
+native_write_msr(unsigned int msr, u32 low, u32 high)
+{
+ __wrmsr(msr, low, high);
+
+ if (msr_tracepoint_active(__tracepoint_write_msr))
+ do_trace_write_msr(msr, ((u64)high << 32 | low), 0);
+}
+
+/* Can be uninlined because referenced by paravirt */
+static inline int notrace
+native_write_msr_safe(unsigned int msr, u32 low, u32 high)
+{
+ int err;
+
+ asm volatile("2: wrmsr ; xor %[err],%[err]\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: mov %[fault],%[err] ; jmp 1b\n\t"
+ ".previous\n\t"
+ _ASM_EXTABLE(2b, 3b)
+ : [err] "=a" (err)
+ : "c" (msr), "0" (low), "d" (high),
+ [fault] "i" (-EIO)
+ : "memory");
+ if (msr_tracepoint_active(__tracepoint_write_msr))
+ do_trace_write_msr(msr, ((u64)high << 32 | low), err);
+ return err;
+}
+
+extern int rdmsr_safe_regs(u32 regs[8]);
+extern int wrmsr_safe_regs(u32 regs[8]);
+
+/**
+ * rdtsc() - returns the current TSC without ordering constraints
+ *
+ * rdtsc() returns the result of RDTSC as a 64-bit integer. The
+ * only ordering constraint it supplies is the ordering implied by
+ * "asm volatile": it will put the RDTSC in the place you expect. The
+ * CPU can and will speculatively execute that RDTSC, though, so the
+ * results can be non-monotonic if compared on different CPUs.
+ */
+static __always_inline unsigned long long rdtsc(void)
+{
+ DECLARE_ARGS(val, low, high);
+
+ asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));
+
+ return EAX_EDX_VAL(val, low, high);
+}
+
+/**
+ * rdtsc_ordered() - read the current TSC in program order
+ *
+ * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer.
+ * It is ordered like a load to a global in-memory counter. It should
+ * be impossible to observe non-monotonic rdtsc_unordered() behavior
+ * across multiple CPUs as long as the TSC is synced.
+ */
+static __always_inline unsigned long long rdtsc_ordered(void)
+{
+ /*
+ * The RDTSC instruction is not ordered relative to memory
+ * access. The Intel SDM and the AMD APM are both vague on this
+ * point, but empirically an RDTSC instruction can be
+ * speculatively executed before prior loads. An RDTSC
+ * immediately after an appropriate barrier appears to be
+ * ordered as a normal load, that is, it provides the same
+ * ordering guarantees as reading from a global memory location
+ * that some other imaginary CPU is updating continuously with a
+ * time stamp.
+ */
+ barrier_nospec();
+ return rdtsc();
+}
+
+static inline unsigned long long native_read_pmc(int counter)
+{
+ DECLARE_ARGS(val, low, high);
+
+ asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
+ if (msr_tracepoint_active(__tracepoint_rdpmc))
+ do_trace_rdpmc(counter, EAX_EDX_VAL(val, low, high), 0);
+ return EAX_EDX_VAL(val, low, high);
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#include <linux/errno.h>
+/*
+ * Access to machine-specific registers (available on 586 and better only)
+ * Note: the rd* operations modify the parameters directly (without using
+ * pointer indirection), this allows gcc to optimize better
+ */
+
+#define rdmsr(msr, low, high) \
+do { \
+ u64 __val = native_read_msr((msr)); \
+ (void)((low) = (u32)__val); \
+ (void)((high) = (u32)(__val >> 32)); \
+} while (0)
+
+static inline void wrmsr(unsigned int msr, u32 low, u32 high)
+{
+ native_write_msr(msr, low, high);
+}
+
+#define rdmsrl(msr, val) \
+ ((val) = native_read_msr((msr)))
+
+static inline void wrmsrl(unsigned int msr, u64 val)
+{
+ native_write_msr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32));
+}
+
+/* wrmsr with exception handling */
+static inline int wrmsr_safe(unsigned int msr, u32 low, u32 high)
+{
+ return native_write_msr_safe(msr, low, high);
+}
+
+/* rdmsr with exception handling */
+#define rdmsr_safe(msr, low, high) \
+({ \
+ int __err; \
+ u64 __val = native_read_msr_safe((msr), &__err); \
+ (*low) = (u32)__val; \
+ (*high) = (u32)(__val >> 32); \
+ __err; \
+})
+
+static inline int rdmsrl_safe(unsigned int msr, unsigned long long *p)
+{
+ int err;
+
+ *p = native_read_msr_safe(msr, &err);
+ return err;
+}
+
+#define rdpmc(counter, low, high) \
+do { \
+ u64 _l = native_read_pmc((counter)); \
+ (low) = (u32)_l; \
+ (high) = (u32)(_l >> 32); \
+} while (0)
+
+#define rdpmcl(counter, val) ((val) = native_read_pmc(counter))
+
+#endif /* !CONFIG_PARAVIRT */
+
+/*
+ * 64-bit version of wrmsr_safe():
+ */
+static inline int wrmsrl_safe(u32 msr, u64 val)
+{
+ return wrmsr_safe(msr, (u32)val, (u32)(val >> 32));
+}
+
+#define write_tsc(low, high) wrmsr(MSR_IA32_TSC, (low), (high))
+
+#define write_rdtscp_aux(val) wrmsr(MSR_TSC_AUX, (val), 0)
+
+struct msr *msrs_alloc(void);
+void msrs_free(struct msr *msrs);
+int msr_set_bit(u32 msr, u8 bit);
+int msr_clear_bit(u32 msr, u8 bit);
+
+#ifdef CONFIG_SMP
+int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
+int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
+int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q);
+int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q);
+void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
+void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
+int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
+int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
+int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q);
+int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q);
+int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
+int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
+#else /* CONFIG_SMP */
+static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+ rdmsr(msr_no, *l, *h);
+ return 0;
+}
+static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+ wrmsr(msr_no, l, h);
+ return 0;
+}
+static inline int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
+{
+ rdmsrl(msr_no, *q);
+ return 0;
+}
+static inline int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
+{
+ wrmsrl(msr_no, q);
+ return 0;
+}
+static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no,
+ struct msr *msrs)
+{
+ rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h));
+}
+static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no,
+ struct msr *msrs)
+{
+ wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h);
+}
+static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
+ u32 *l, u32 *h)
+{
+ return rdmsr_safe(msr_no, l, h);
+}
+static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+ return wrmsr_safe(msr_no, l, h);
+}
+static inline int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
+{
+ return rdmsrl_safe(msr_no, q);
+}
+static inline int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
+{
+ return wrmsrl_safe(msr_no, q);
+}
+static inline int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
+{
+ return rdmsr_safe_regs(regs);
+}
+static inline int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
+{
+ return wrmsr_safe_regs(regs);
+}
+#endif /* CONFIG_SMP */
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_MSR_H */
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
new file mode 100644
index 000000000..dbff1456d
--- /dev/null
+++ b/arch/x86/include/asm/mtrr.h
@@ -0,0 +1,133 @@
+/* Generic MTRR (Memory Type Range Register) ioctls.
+
+ Copyright (C) 1997-1999 Richard Gooch
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public
+ License along with this library; if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ Richard Gooch may be reached by email at rgooch@atnf.csiro.au
+ The postal address is:
+ Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
+*/
+#ifndef _ASM_X86_MTRR_H
+#define _ASM_X86_MTRR_H
+
+#include <uapi/asm/mtrr.h>
+#include <asm/pat.h>
+
+
+/*
+ * The following functions are for use by other drivers that cannot use
+ * arch_phys_wc_add and arch_phys_wc_del.
+ */
+# ifdef CONFIG_MTRR
+extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform);
+extern void mtrr_save_fixed_ranges(void *);
+extern void mtrr_save_state(void);
+extern int mtrr_add(unsigned long base, unsigned long size,
+ unsigned int type, bool increment);
+extern int mtrr_add_page(unsigned long base, unsigned long size,
+ unsigned int type, bool increment);
+extern int mtrr_del(int reg, unsigned long base, unsigned long size);
+extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
+extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
+extern void mtrr_ap_init(void);
+extern void mtrr_bp_init(void);
+extern void set_mtrr_aps_delayed_init(void);
+extern void mtrr_aps_init(void);
+extern void mtrr_bp_restore(void);
+extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
+extern int amd_special_default_mtrr(void);
+# else
+static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform)
+{
+ /*
+ * Return no-MTRRs:
+ */
+ return MTRR_TYPE_INVALID;
+}
+#define mtrr_save_fixed_ranges(arg) do {} while (0)
+#define mtrr_save_state() do {} while (0)
+static inline int mtrr_add(unsigned long base, unsigned long size,
+ unsigned int type, bool increment)
+{
+ return -ENODEV;
+}
+static inline int mtrr_add_page(unsigned long base, unsigned long size,
+ unsigned int type, bool increment)
+{
+ return -ENODEV;
+}
+static inline int mtrr_del(int reg, unsigned long base, unsigned long size)
+{
+ return -ENODEV;
+}
+static inline int mtrr_del_page(int reg, unsigned long base, unsigned long size)
+{
+ return -ENODEV;
+}
+static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
+{
+ return 0;
+}
+static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
+{
+}
+static inline void mtrr_bp_init(void)
+{
+ pat_disable("MTRRs disabled, skipping PAT initialization too.");
+}
+
+#define mtrr_ap_init() do {} while (0)
+#define set_mtrr_aps_delayed_init() do {} while (0)
+#define mtrr_aps_init() do {} while (0)
+#define mtrr_bp_restore() do {} while (0)
+# endif
+
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+
+struct mtrr_sentry32 {
+ compat_ulong_t base; /* Base address */
+ compat_uint_t size; /* Size of region */
+ compat_uint_t type; /* Type of region */
+};
+
+struct mtrr_gentry32 {
+ compat_ulong_t regnum; /* Register number */
+ compat_uint_t base; /* Base address */
+ compat_uint_t size; /* Size of region */
+ compat_uint_t type; /* Type of region */
+};
+
+#define MTRR_IOCTL_BASE 'M'
+
+#define MTRRIOC32_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry32)
+#define MTRRIOC32_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry32)
+#define MTRRIOC32_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry32)
+#define MTRRIOC32_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry32)
+#define MTRRIOC32_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry32)
+#define MTRRIOC32_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry32)
+#define MTRRIOC32_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry32)
+#define MTRRIOC32_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry32)
+#define MTRRIOC32_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry32)
+#define MTRRIOC32_KILL_PAGE_ENTRY \
+ _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry32)
+#endif /* CONFIG_COMPAT */
+
+/* Bit fields for enabled in struct mtrr_state_type */
+#define MTRR_STATE_MTRR_FIXED_ENABLED 0x01
+#define MTRR_STATE_MTRR_ENABLED 0x02
+
+#endif /* _ASM_X86_MTRR_H */
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
new file mode 100644
index 000000000..3aa82deea
--- /dev/null
+++ b/arch/x86/include/asm/mwait.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MWAIT_H
+#define _ASM_X86_MWAIT_H
+
+#include <linux/sched.h>
+#include <linux/sched/idle.h>
+
+#include <asm/cpufeature.h>
+#include <asm/nospec-branch.h>
+
+#define MWAIT_SUBSTATE_MASK 0xf
+#define MWAIT_CSTATE_MASK 0xf
+#define MWAIT_SUBSTATE_SIZE 4
+#define MWAIT_HINT2CSTATE(hint) (((hint) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK)
+#define MWAIT_HINT2SUBSTATE(hint) ((hint) & MWAIT_CSTATE_MASK)
+
+#define CPUID_MWAIT_LEAF 5
+#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1
+#define CPUID5_ECX_INTERRUPT_BREAK 0x2
+
+#define MWAIT_ECX_INTERRUPT_BREAK 0x1
+#define MWAITX_ECX_TIMER_ENABLE BIT(1)
+#define MWAITX_MAX_LOOPS ((u32)-1)
+#define MWAITX_DISABLE_CSTATES 0xf0
+
+static inline void __monitor(const void *eax, unsigned long ecx,
+ unsigned long edx)
+{
+ /* "monitor %eax, %ecx, %edx;" */
+ asm volatile(".byte 0x0f, 0x01, 0xc8;"
+ :: "a" (eax), "c" (ecx), "d"(edx));
+}
+
+static inline void __monitorx(const void *eax, unsigned long ecx,
+ unsigned long edx)
+{
+ /* "monitorx %eax, %ecx, %edx;" */
+ asm volatile(".byte 0x0f, 0x01, 0xfa;"
+ :: "a" (eax), "c" (ecx), "d"(edx));
+}
+
+static inline void __mwait(unsigned long eax, unsigned long ecx)
+{
+ mds_idle_clear_cpu_buffers();
+
+ /* "mwait %eax, %ecx;" */
+ asm volatile(".byte 0x0f, 0x01, 0xc9;"
+ :: "a" (eax), "c" (ecx));
+}
+
+/*
+ * MWAITX allows for a timer expiration to get the core out a wait state in
+ * addition to the default MWAIT exit condition of a store appearing at a
+ * monitored virtual address.
+ *
+ * Registers:
+ *
+ * MWAITX ECX[1]: enable timer if set
+ * MWAITX EBX[31:0]: max wait time expressed in SW P0 clocks. The software P0
+ * frequency is the same as the TSC frequency.
+ *
+ * Below is a comparison between MWAIT and MWAITX on AMD processors:
+ *
+ * MWAIT MWAITX
+ * opcode 0f 01 c9 | 0f 01 fb
+ * ECX[0] value of RFLAGS.IF seen by instruction
+ * ECX[1] unused/#GP if set | enable timer if set
+ * ECX[31:2] unused/#GP if set
+ * EAX unused (reserve for hint)
+ * EBX[31:0] unused | max wait time (P0 clocks)
+ *
+ * MONITOR MONITORX
+ * opcode 0f 01 c8 | 0f 01 fa
+ * EAX (logical) address to monitor
+ * ECX #GP if not zero
+ */
+static inline void __mwaitx(unsigned long eax, unsigned long ebx,
+ unsigned long ecx)
+{
+ /* No MDS buffer clear as this is AMD/HYGON only */
+
+ /* "mwaitx %eax, %ebx, %ecx;" */
+ asm volatile(".byte 0x0f, 0x01, 0xfb;"
+ :: "a" (eax), "b" (ebx), "c" (ecx));
+}
+
+static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+{
+ mds_idle_clear_cpu_buffers();
+
+ trace_hardirqs_on();
+ /* "mwait %eax, %ecx;" */
+ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
+ :: "a" (eax), "c" (ecx));
+}
+
+/*
+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
+ * which can obviate IPI to trigger checking of need_resched.
+ * We execute MONITOR against need_resched and enter optimized wait state
+ * through MWAIT. Whenever someone changes need_resched, we would be woken
+ * up from MWAIT (without an IPI).
+ *
+ * New with Core Duo processors, MWAIT can take some hints based on CPU
+ * capability.
+ */
+static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
+{
+ if (static_cpu_has_bug(X86_BUG_MONITOR) || !current_set_polling_and_test()) {
+ if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) {
+ mb();
+ clflush((void *)&current_thread_info()->flags);
+ mb();
+ }
+
+ __monitor((void *)&current_thread_info()->flags, 0, 0);
+ if (!need_resched())
+ __mwait(eax, ecx);
+ }
+ current_clr_polling();
+}
+
+#endif /* _ASM_X86_MWAIT_H */
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
new file mode 100644
index 000000000..9d5d949e6
--- /dev/null
+++ b/arch/x86/include/asm/nmi.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_NMI_H
+#define _ASM_X86_NMI_H
+
+#include <linux/irq_work.h>
+#include <linux/pm.h>
+#include <asm/irq.h>
+#include <asm/io.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
+extern int reserve_perfctr_nmi(unsigned int);
+extern void release_perfctr_nmi(unsigned int);
+extern int reserve_evntsel_nmi(unsigned int);
+extern void release_evntsel_nmi(unsigned int);
+
+struct ctl_table;
+extern int proc_nmi_enabled(struct ctl_table *, int ,
+ void __user *, size_t *, loff_t *);
+extern int unknown_nmi_panic;
+
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#define NMI_FLAG_FIRST 1
+
+enum {
+ NMI_LOCAL=0,
+ NMI_UNKNOWN,
+ NMI_SERR,
+ NMI_IO_CHECK,
+ NMI_MAX
+};
+
+#define NMI_DONE 0
+#define NMI_HANDLED 1
+
+typedef int (*nmi_handler_t)(unsigned int, struct pt_regs *);
+
+struct nmiaction {
+ struct list_head list;
+ nmi_handler_t handler;
+ u64 max_duration;
+ unsigned long flags;
+ const char *name;
+};
+
+#define register_nmi_handler(t, fn, fg, n, init...) \
+({ \
+ static struct nmiaction init fn##_na = { \
+ .handler = (fn), \
+ .name = (n), \
+ .flags = (fg), \
+ }; \
+ __register_nmi_handler((t), &fn##_na); \
+})
+
+int __register_nmi_handler(unsigned int, struct nmiaction *);
+
+void unregister_nmi_handler(unsigned int, const char *);
+
+void stop_nmi(void);
+void restart_nmi(void);
+void local_touch_nmi(void);
+
+#endif /* _ASM_X86_NMI_H */
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
new file mode 100644
index 000000000..12f12b5cf
--- /dev/null
+++ b/arch/x86/include/asm/nops.h
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_NOPS_H
+#define _ASM_X86_NOPS_H
+
+/*
+ * Define nops for use with alternative() and for tracing.
+ *
+ * *_NOP5_ATOMIC must be a single instruction.
+ */
+
+#define NOP_DS_PREFIX 0x3e
+
+/* generic versions from gas
+ 1: nop
+ the following instructions are NOT nops in 64-bit mode,
+ for 64-bit mode use K8 or P6 nops instead
+ 2: movl %esi,%esi
+ 3: leal 0x00(%esi),%esi
+ 4: leal 0x00(,%esi,1),%esi
+ 6: leal 0x00000000(%esi),%esi
+ 7: leal 0x00000000(,%esi,1),%esi
+*/
+#define GENERIC_NOP1 0x90
+#define GENERIC_NOP2 0x89,0xf6
+#define GENERIC_NOP3 0x8d,0x76,0x00
+#define GENERIC_NOP4 0x8d,0x74,0x26,0x00
+#define GENERIC_NOP5 GENERIC_NOP1,GENERIC_NOP4
+#define GENERIC_NOP6 0x8d,0xb6,0x00,0x00,0x00,0x00
+#define GENERIC_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00
+#define GENERIC_NOP8 GENERIC_NOP1,GENERIC_NOP7
+#define GENERIC_NOP5_ATOMIC NOP_DS_PREFIX,GENERIC_NOP4
+
+/* Opteron 64bit nops
+ 1: nop
+ 2: osp nop
+ 3: osp osp nop
+ 4: osp osp osp nop
+*/
+#define K8_NOP1 GENERIC_NOP1
+#define K8_NOP2 0x66,K8_NOP1
+#define K8_NOP3 0x66,K8_NOP2
+#define K8_NOP4 0x66,K8_NOP3
+#define K8_NOP5 K8_NOP3,K8_NOP2
+#define K8_NOP6 K8_NOP3,K8_NOP3
+#define K8_NOP7 K8_NOP4,K8_NOP3
+#define K8_NOP8 K8_NOP4,K8_NOP4
+#define K8_NOP5_ATOMIC 0x66,K8_NOP4
+
+/* K7 nops
+ uses eax dependencies (arbitrary choice)
+ 1: nop
+ 2: movl %eax,%eax
+ 3: leal (,%eax,1),%eax
+ 4: leal 0x00(,%eax,1),%eax
+ 6: leal 0x00000000(%eax),%eax
+ 7: leal 0x00000000(,%eax,1),%eax
+*/
+#define K7_NOP1 GENERIC_NOP1
+#define K7_NOP2 0x8b,0xc0
+#define K7_NOP3 0x8d,0x04,0x20
+#define K7_NOP4 0x8d,0x44,0x20,0x00
+#define K7_NOP5 K7_NOP4,K7_NOP1
+#define K7_NOP6 0x8d,0x80,0,0,0,0
+#define K7_NOP7 0x8D,0x04,0x05,0,0,0,0
+#define K7_NOP8 K7_NOP7,K7_NOP1
+#define K7_NOP5_ATOMIC NOP_DS_PREFIX,K7_NOP4
+
+/* P6 nops
+ uses eax dependencies (Intel-recommended choice)
+ 1: nop
+ 2: osp nop
+ 3: nopl (%eax)
+ 4: nopl 0x00(%eax)
+ 5: nopl 0x00(%eax,%eax,1)
+ 6: osp nopl 0x00(%eax,%eax,1)
+ 7: nopl 0x00000000(%eax)
+ 8: nopl 0x00000000(%eax,%eax,1)
+ Note: All the above are assumed to be a single instruction.
+ There is kernel code that depends on this.
+*/
+#define P6_NOP1 GENERIC_NOP1
+#define P6_NOP2 0x66,0x90
+#define P6_NOP3 0x0f,0x1f,0x00
+#define P6_NOP4 0x0f,0x1f,0x40,0
+#define P6_NOP5 0x0f,0x1f,0x44,0x00,0
+#define P6_NOP6 0x66,0x0f,0x1f,0x44,0x00,0
+#define P6_NOP7 0x0f,0x1f,0x80,0,0,0,0
+#define P6_NOP8 0x0f,0x1f,0x84,0x00,0,0,0,0
+#define P6_NOP5_ATOMIC P6_NOP5
+
+#ifdef __ASSEMBLY__
+#define _ASM_MK_NOP(x) .byte x
+#else
+#define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n"
+#endif
+
+#if defined(CONFIG_MK7)
+#define ASM_NOP1 _ASM_MK_NOP(K7_NOP1)
+#define ASM_NOP2 _ASM_MK_NOP(K7_NOP2)
+#define ASM_NOP3 _ASM_MK_NOP(K7_NOP3)
+#define ASM_NOP4 _ASM_MK_NOP(K7_NOP4)
+#define ASM_NOP5 _ASM_MK_NOP(K7_NOP5)
+#define ASM_NOP6 _ASM_MK_NOP(K7_NOP6)
+#define ASM_NOP7 _ASM_MK_NOP(K7_NOP7)
+#define ASM_NOP8 _ASM_MK_NOP(K7_NOP8)
+#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K7_NOP5_ATOMIC)
+#elif defined(CONFIG_X86_P6_NOP)
+#define ASM_NOP1 _ASM_MK_NOP(P6_NOP1)
+#define ASM_NOP2 _ASM_MK_NOP(P6_NOP2)
+#define ASM_NOP3 _ASM_MK_NOP(P6_NOP3)
+#define ASM_NOP4 _ASM_MK_NOP(P6_NOP4)
+#define ASM_NOP5 _ASM_MK_NOP(P6_NOP5)
+#define ASM_NOP6 _ASM_MK_NOP(P6_NOP6)
+#define ASM_NOP7 _ASM_MK_NOP(P6_NOP7)
+#define ASM_NOP8 _ASM_MK_NOP(P6_NOP8)
+#define ASM_NOP5_ATOMIC _ASM_MK_NOP(P6_NOP5_ATOMIC)
+#elif defined(CONFIG_X86_64)
+#define ASM_NOP1 _ASM_MK_NOP(K8_NOP1)
+#define ASM_NOP2 _ASM_MK_NOP(K8_NOP2)
+#define ASM_NOP3 _ASM_MK_NOP(K8_NOP3)
+#define ASM_NOP4 _ASM_MK_NOP(K8_NOP4)
+#define ASM_NOP5 _ASM_MK_NOP(K8_NOP5)
+#define ASM_NOP6 _ASM_MK_NOP(K8_NOP6)
+#define ASM_NOP7 _ASM_MK_NOP(K8_NOP7)
+#define ASM_NOP8 _ASM_MK_NOP(K8_NOP8)
+#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K8_NOP5_ATOMIC)
+#else
+#define ASM_NOP1 _ASM_MK_NOP(GENERIC_NOP1)
+#define ASM_NOP2 _ASM_MK_NOP(GENERIC_NOP2)
+#define ASM_NOP3 _ASM_MK_NOP(GENERIC_NOP3)
+#define ASM_NOP4 _ASM_MK_NOP(GENERIC_NOP4)
+#define ASM_NOP5 _ASM_MK_NOP(GENERIC_NOP5)
+#define ASM_NOP6 _ASM_MK_NOP(GENERIC_NOP6)
+#define ASM_NOP7 _ASM_MK_NOP(GENERIC_NOP7)
+#define ASM_NOP8 _ASM_MK_NOP(GENERIC_NOP8)
+#define ASM_NOP5_ATOMIC _ASM_MK_NOP(GENERIC_NOP5_ATOMIC)
+#endif
+
+#define ASM_NOP_MAX 8
+#define NOP_ATOMIC5 (ASM_NOP_MAX+1) /* Entry for the 5-byte atomic NOP */
+
+#ifndef __ASSEMBLY__
+extern const unsigned char * const *ideal_nops;
+extern void arch_init_ideal_nops(void);
+#endif
+
+#endif /* _ASM_X86_NOPS_H */
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
new file mode 100644
index 000000000..d3d68b676
--- /dev/null
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -0,0 +1,436 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ASM_X86_NOSPEC_BRANCH_H_
+#define _ASM_X86_NOSPEC_BRANCH_H_
+
+#include <linux/static_key.h>
+
+#include <asm/alternative.h>
+#include <asm/alternative-asm.h>
+#include <asm/cpufeatures.h>
+#include <asm/msr-index.h>
+
+/*
+ * Fill the CPU return stack buffer.
+ *
+ * Each entry in the RSB, if used for a speculative 'ret', contains an
+ * infinite 'pause; lfence; jmp' loop to capture speculative execution.
+ *
+ * This is required in various cases for retpoline and IBRS-based
+ * mitigations for the Spectre variant 2 vulnerability. Sometimes to
+ * eliminate potentially bogus entries from the RSB, and sometimes
+ * purely to ensure that it doesn't get empty, which on some CPUs would
+ * allow predictions from other (unwanted!) sources to be used.
+ *
+ * We define a CPP macro such that it can be used from both .S files and
+ * inline assembly. It's possible to do a .macro and then include that
+ * from C via asm(".include <asm/nospec-branch.h>") but let's not go there.
+ */
+
+#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
+#define RSB_FILL_LOOPS 16 /* To avoid underflow */
+
+/*
+ * Google experimented with loop-unrolling and this turned out to be
+ * the optimal version — two calls, each with their own speculation
+ * trap should their return address end up getting used, in a loop.
+ */
+#define __FILL_RETURN_BUFFER(reg, nr, sp) \
+ mov $(nr/2), reg; \
+771: \
+ call 772f; \
+773: /* speculation trap */ \
+ pause; \
+ lfence; \
+ jmp 773b; \
+772: \
+ call 774f; \
+775: /* speculation trap */ \
+ pause; \
+ lfence; \
+ jmp 775b; \
+774: \
+ dec reg; \
+ jnz 771b; \
+ add $(BITS_PER_LONG/8) * nr, sp;
+
+#ifdef __ASSEMBLY__
+
+/*
+ * This should be used immediately before a retpoline alternative. It tells
+ * objtool where the retpolines are so that it can make sense of the control
+ * flow by just reading the original instruction(s) and ignoring the
+ * alternatives.
+ */
+.macro ANNOTATE_NOSPEC_ALTERNATIVE
+ .Lannotate_\@:
+ .pushsection .discard.nospec
+ .long .Lannotate_\@ - .
+ .popsection
+.endm
+
+/*
+ * This should be used immediately before an indirect jump/call. It tells
+ * objtool the subsequent indirect jump/call is vouched safe for retpoline
+ * builds.
+ */
+.macro ANNOTATE_RETPOLINE_SAFE
+ .Lannotate_\@:
+ .pushsection .discard.retpoline_safe
+ _ASM_PTR .Lannotate_\@
+ .popsection
+.endm
+
+/*
+ * These are the bare retpoline primitives for indirect jmp and call.
+ * Do not use these directly; they only exist to make the ALTERNATIVE
+ * invocation below less ugly.
+ */
+.macro RETPOLINE_JMP reg:req
+ call .Ldo_rop_\@
+.Lspec_trap_\@:
+ pause
+ lfence
+ jmp .Lspec_trap_\@
+.Ldo_rop_\@:
+ mov \reg, (%_ASM_SP)
+ ret
+.endm
+
+/*
+ * This is a wrapper around RETPOLINE_JMP so the called function in reg
+ * returns to the instruction after the macro.
+ */
+.macro RETPOLINE_CALL reg:req
+ jmp .Ldo_call_\@
+.Ldo_retpoline_jmp_\@:
+ RETPOLINE_JMP \reg
+.Ldo_call_\@:
+ call .Ldo_retpoline_jmp_\@
+.endm
+
+/*
+ * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple
+ * indirect jmp/call which may be susceptible to the Spectre variant 2
+ * attack.
+ */
+.macro JMP_NOSPEC reg:req
+#ifdef CONFIG_RETPOLINE
+ ANNOTATE_NOSPEC_ALTERNATIVE
+ ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *\reg), \
+ __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \
+ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *\reg), X86_FEATURE_RETPOLINE_LFENCE
+#else
+ jmp *\reg
+#endif
+.endm
+
+.macro CALL_NOSPEC reg:req
+#ifdef CONFIG_RETPOLINE
+ ANNOTATE_NOSPEC_ALTERNATIVE
+ ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *\reg), \
+ __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\
+ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *\reg), X86_FEATURE_RETPOLINE_LFENCE
+#else
+ call *\reg
+#endif
+.endm
+
+ /*
+ * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
+ * monstrosity above, manually.
+ */
+.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req
+#ifdef CONFIG_RETPOLINE
+ ANNOTATE_NOSPEC_ALTERNATIVE
+ ALTERNATIVE "jmp .Lskip_rsb_\@", \
+ __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \
+ \ftr
+.Lskip_rsb_\@:
+#endif
+.endm
+
+#else /* __ASSEMBLY__ */
+
+#define ANNOTATE_NOSPEC_ALTERNATIVE \
+ "999:\n\t" \
+ ".pushsection .discard.nospec\n\t" \
+ ".long 999b - .\n\t" \
+ ".popsection\n\t"
+
+#define ANNOTATE_RETPOLINE_SAFE \
+ "999:\n\t" \
+ ".pushsection .discard.retpoline_safe\n\t" \
+ _ASM_PTR " 999b\n\t" \
+ ".popsection\n\t"
+
+#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_X86_64
+
+/*
+ * Inline asm uses the %V modifier which is only in newer GCC
+ * which is ensured when CONFIG_RETPOLINE is defined.
+ */
+# define CALL_NOSPEC \
+ ANNOTATE_NOSPEC_ALTERNATIVE \
+ ALTERNATIVE_2( \
+ ANNOTATE_RETPOLINE_SAFE \
+ "call *%[thunk_target]\n", \
+ "call __x86_indirect_thunk_%V[thunk_target]\n", \
+ X86_FEATURE_RETPOLINE, \
+ "lfence;\n" \
+ ANNOTATE_RETPOLINE_SAFE \
+ "call *%[thunk_target]\n", \
+ X86_FEATURE_RETPOLINE_LFENCE)
+# define THUNK_TARGET(addr) [thunk_target] "r" (addr)
+
+#else /* CONFIG_X86_32 */
+/*
+ * For i386 we use the original ret-equivalent retpoline, because
+ * otherwise we'll run out of registers. We don't care about CET
+ * here, anyway.
+ */
+# define CALL_NOSPEC \
+ ANNOTATE_NOSPEC_ALTERNATIVE \
+ ALTERNATIVE_2( \
+ ANNOTATE_RETPOLINE_SAFE \
+ "call *%[thunk_target]\n", \
+ " jmp 904f;\n" \
+ " .align 16\n" \
+ "901: call 903f;\n" \
+ "902: pause;\n" \
+ " lfence;\n" \
+ " jmp 902b;\n" \
+ " .align 16\n" \
+ "903: lea 4(%%esp), %%esp;\n" \
+ " pushl %[thunk_target];\n" \
+ " ret;\n" \
+ " .align 16\n" \
+ "904: call 901b;\n", \
+ X86_FEATURE_RETPOLINE, \
+ "lfence;\n" \
+ ANNOTATE_RETPOLINE_SAFE \
+ "call *%[thunk_target]\n", \
+ X86_FEATURE_RETPOLINE_LFENCE)
+
+# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
+#endif
+#else /* No retpoline for C / inline asm */
+# define CALL_NOSPEC "call *%[thunk_target]\n"
+# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
+#endif
+
+/* The Spectre V2 mitigation variants */
+enum spectre_v2_mitigation {
+ SPECTRE_V2_NONE,
+ SPECTRE_V2_RETPOLINE,
+ SPECTRE_V2_LFENCE,
+ SPECTRE_V2_EIBRS,
+ SPECTRE_V2_EIBRS_RETPOLINE,
+ SPECTRE_V2_EIBRS_LFENCE,
+};
+
+/* The indirect branch speculation control variants */
+enum spectre_v2_user_mitigation {
+ SPECTRE_V2_USER_NONE,
+ SPECTRE_V2_USER_STRICT,
+ SPECTRE_V2_USER_STRICT_PREFERRED,
+ SPECTRE_V2_USER_PRCTL,
+ SPECTRE_V2_USER_SECCOMP,
+};
+
+/* The Speculative Store Bypass disable variants */
+enum ssb_mitigation {
+ SPEC_STORE_BYPASS_NONE,
+ SPEC_STORE_BYPASS_DISABLE,
+ SPEC_STORE_BYPASS_PRCTL,
+ SPEC_STORE_BYPASS_SECCOMP,
+};
+
+extern char __indirect_thunk_start[];
+extern char __indirect_thunk_end[];
+
+/*
+ * On VMEXIT we must ensure that no RSB predictions learned in the guest
+ * can be followed in the host, by overwriting the RSB completely. Both
+ * retpoline and IBRS mitigations for Spectre v2 need this; only on future
+ * CPUs with IBRS_ALL *might* it be avoided.
+ */
+static inline void vmexit_fill_RSB(void)
+{
+#ifdef CONFIG_RETPOLINE
+ unsigned long loops;
+
+ asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE
+ ALTERNATIVE("jmp 910f",
+ __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)),
+ X86_FEATURE_RETPOLINE)
+ "910:"
+ : "=r" (loops), ASM_CALL_CONSTRAINT
+ : : "memory" );
+#endif
+}
+
+static __always_inline
+void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature)
+{
+ asm volatile(ALTERNATIVE("", "wrmsr", %c[feature])
+ : : "c" (msr),
+ "a" ((u32)val),
+ "d" ((u32)(val >> 32)),
+ [feature] "i" (feature)
+ : "memory");
+}
+
+static inline void indirect_branch_prediction_barrier(void)
+{
+ u64 val = PRED_CMD_IBPB;
+
+ alternative_msr_write(MSR_IA32_PRED_CMD, val, X86_FEATURE_USE_IBPB);
+}
+
+/* The Intel SPEC CTRL MSR base value cache */
+extern u64 x86_spec_ctrl_base;
+
+/*
+ * With retpoline, we must use IBRS to restrict branch prediction
+ * before calling into firmware.
+ *
+ * (Implemented as CPP macros due to header hell.)
+ */
+#define firmware_restrict_branch_speculation_start() \
+do { \
+ u64 val = x86_spec_ctrl_base | SPEC_CTRL_IBRS; \
+ \
+ preempt_disable(); \
+ alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \
+ X86_FEATURE_USE_IBRS_FW); \
+} while (0)
+
+#define firmware_restrict_branch_speculation_end() \
+do { \
+ u64 val = x86_spec_ctrl_base; \
+ \
+ alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \
+ X86_FEATURE_USE_IBRS_FW); \
+ preempt_enable(); \
+} while (0)
+
+DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
+DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
+
+DECLARE_STATIC_KEY_FALSE(mds_user_clear);
+DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
+
+DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear);
+
+#include <asm/segment.h>
+
+/**
+ * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
+ *
+ * This uses the otherwise unused and obsolete VERW instruction in
+ * combination with microcode which triggers a CPU buffer flush when the
+ * instruction is executed.
+ */
+static __always_inline void mds_clear_cpu_buffers(void)
+{
+ static const u16 ds = __KERNEL_DS;
+
+ /*
+ * Has to be the memory-operand variant because only that
+ * guarantees the CPU buffer flush functionality according to
+ * documentation. The register-operand variant does not.
+ * Works with any segment selector, but a valid writable
+ * data segment is the fastest variant.
+ *
+ * "cc" clobber is required because VERW modifies ZF.
+ */
+ asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc");
+}
+
+/**
+ * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
+ *
+ * Clear CPU buffers if the corresponding static key is enabled
+ */
+static __always_inline void mds_user_clear_cpu_buffers(void)
+{
+ if (static_branch_likely(&mds_user_clear))
+ mds_clear_cpu_buffers();
+}
+
+/**
+ * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability
+ *
+ * Clear CPU buffers if the corresponding static key is enabled
+ */
+static inline void mds_idle_clear_cpu_buffers(void)
+{
+ if (static_branch_likely(&mds_idle_clear))
+ mds_clear_cpu_buffers();
+}
+
+#endif /* __ASSEMBLY__ */
+
+/*
+ * Below is used in the eBPF JIT compiler and emits the byte sequence
+ * for the following assembly:
+ *
+ * With retpolines configured:
+ *
+ * callq do_rop
+ * spec_trap:
+ * pause
+ * lfence
+ * jmp spec_trap
+ * do_rop:
+ * mov %rax,(%rsp) for x86_64
+ * mov %edx,(%esp) for x86_32
+ * retq
+ *
+ * Without retpolines configured:
+ *
+ * jmp *%rax for x86_64
+ * jmp *%edx for x86_32
+ */
+#ifdef CONFIG_RETPOLINE
+# ifdef CONFIG_X86_64
+# define RETPOLINE_RAX_BPF_JIT_SIZE 17
+# define RETPOLINE_RAX_BPF_JIT() \
+do { \
+ EMIT1_off32(0xE8, 7); /* callq do_rop */ \
+ /* spec_trap: */ \
+ EMIT2(0xF3, 0x90); /* pause */ \
+ EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \
+ EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \
+ /* do_rop: */ \
+ EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */ \
+ EMIT1(0xC3); /* retq */ \
+} while (0)
+# else /* !CONFIG_X86_64 */
+# define RETPOLINE_EDX_BPF_JIT() \
+do { \
+ EMIT1_off32(0xE8, 7); /* call do_rop */ \
+ /* spec_trap: */ \
+ EMIT2(0xF3, 0x90); /* pause */ \
+ EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \
+ EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \
+ /* do_rop: */ \
+ EMIT3(0x89, 0x14, 0x24); /* mov %edx,(%esp) */ \
+ EMIT1(0xC3); /* ret */ \
+} while (0)
+# endif
+#else /* !CONFIG_RETPOLINE */
+# ifdef CONFIG_X86_64
+# define RETPOLINE_RAX_BPF_JIT_SIZE 2
+# define RETPOLINE_RAX_BPF_JIT() \
+ EMIT2(0xFF, 0xE0); /* jmp *%rax */
+# else /* !CONFIG_X86_64 */
+# define RETPOLINE_EDX_BPF_JIT() \
+ EMIT2(0xFF, 0xE2) /* jmp *%edx */
+# endif
+#endif
+
+#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
new file mode 100644
index 000000000..bbfde3d26
--- /dev/null
+++ b/arch/x86/include/asm/numa.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_NUMA_H
+#define _ASM_X86_NUMA_H
+
+#include <linux/nodemask.h>
+
+#include <asm/topology.h>
+#include <asm/apicdef.h>
+
+#ifdef CONFIG_NUMA
+
+#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
+
+/*
+ * Too small node sizes may confuse the VM badly. Usually they
+ * result from BIOS bugs. So dont recognize nodes as standalone
+ * NUMA entities that have less than this amount of RAM listed:
+ */
+#define NODE_MIN_SIZE (4*1024*1024)
+
+extern int numa_off;
+
+/*
+ * __apicid_to_node[] stores the raw mapping between physical apicid and
+ * node and is used to initialize cpu_to_node mapping.
+ *
+ * The mapping may be overridden by apic->numa_cpu_node() on 32bit and thus
+ * should be accessed by the accessors - set_apicid_to_node() and
+ * numa_cpu_node().
+ */
+extern s16 __apicid_to_node[MAX_LOCAL_APIC];
+extern nodemask_t numa_nodes_parsed __initdata;
+
+extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
+extern void __init numa_set_distance(int from, int to, int distance);
+
+static inline void set_apicid_to_node(int apicid, s16 node)
+{
+ __apicid_to_node[apicid] = node;
+}
+
+extern int numa_cpu_node(int cpu);
+
+#else /* CONFIG_NUMA */
+static inline void set_apicid_to_node(int apicid, s16 node)
+{
+}
+
+static inline int numa_cpu_node(int cpu)
+{
+ return NUMA_NO_NODE;
+}
+#endif /* CONFIG_NUMA */
+
+#ifdef CONFIG_X86_32
+# include <asm/numa_32.h>
+#endif
+
+#ifdef CONFIG_NUMA
+extern void numa_set_node(int cpu, int node);
+extern void numa_clear_node(int cpu);
+extern void __init init_cpu_to_node(void);
+extern void numa_add_cpu(int cpu);
+extern void numa_remove_cpu(int cpu);
+#else /* CONFIG_NUMA */
+static inline void numa_set_node(int cpu, int node) { }
+static inline void numa_clear_node(int cpu) { }
+static inline void init_cpu_to_node(void) { }
+static inline void numa_add_cpu(int cpu) { }
+static inline void numa_remove_cpu(int cpu) { }
+#endif /* CONFIG_NUMA */
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+void debug_cpumask_set_cpu(int cpu, int node, bool enable);
+#endif
+
+#ifdef CONFIG_NUMA_EMU
+#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
+#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
+void numa_emu_cmdline(char *);
+#endif /* CONFIG_NUMA_EMU */
+
+#endif /* _ASM_X86_NUMA_H */
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
new file mode 100644
index 000000000..9c8e9e85b
--- /dev/null
+++ b/arch/x86/include/asm/numa_32.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_NUMA_32_H
+#define _ASM_X86_NUMA_32_H
+
+#ifdef CONFIG_HIGHMEM
+extern void set_highmem_pages_init(void);
+#else
+static inline void set_highmem_pages_init(void)
+{
+}
+#endif
+
+#endif /* _ASM_X86_NUMA_32_H */
diff --git a/arch/x86/include/asm/numachip/numachip.h b/arch/x86/include/asm/numachip/numachip.h
new file mode 100644
index 000000000..c64373a2d
--- /dev/null
+++ b/arch/x86/include/asm/numachip/numachip.h
@@ -0,0 +1,20 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Numascale NumaConnect-specific header file
+ *
+ * Copyright (C) 2012 Numascale AS. All rights reserved.
+ *
+ * Send feedback to <support@numascale.com>
+ *
+ */
+
+#ifndef _ASM_X86_NUMACHIP_NUMACHIP_H
+#define _ASM_X86_NUMACHIP_NUMACHIP_H
+
+extern u8 numachip_system;
+extern int __init pci_numachip_init(void);
+
+#endif /* _ASM_X86_NUMACHIP_NUMACHIP_H */
diff --git a/arch/x86/include/asm/numachip/numachip_csr.h b/arch/x86/include/asm/numachip/numachip_csr.h
new file mode 100644
index 000000000..29719eecd
--- /dev/null
+++ b/arch/x86/include/asm/numachip/numachip_csr.h
@@ -0,0 +1,98 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Numascale NumaConnect-Specific Header file
+ *
+ * Copyright (C) 2011 Numascale AS. All rights reserved.
+ *
+ * Send feedback to <support@numascale.com>
+ *
+ */
+
+#ifndef _ASM_X86_NUMACHIP_NUMACHIP_CSR_H
+#define _ASM_X86_NUMACHIP_NUMACHIP_CSR_H
+
+#include <linux/smp.h>
+#include <linux/io.h>
+
+#define CSR_NODE_SHIFT 16
+#define CSR_NODE_BITS(p) (((unsigned long)(p)) << CSR_NODE_SHIFT)
+#define CSR_NODE_MASK 0x0fff /* 4K nodes */
+
+/* 32K CSR space, b15 indicates geo/non-geo */
+#define CSR_OFFSET_MASK 0x7fffUL
+#define CSR_G0_NODE_IDS (0x008 + (0 << 12))
+#define CSR_G3_EXT_IRQ_GEN (0x030 + (3 << 12))
+
+/*
+ * Local CSR space starts in global CSR space with "nodeid" = 0xfff0, however
+ * when using the direct mapping on x86_64, both start and size needs to be
+ * aligned with PMD_SIZE which is 2M
+ */
+#define NUMACHIP_LCSR_BASE 0x3ffffe000000ULL
+#define NUMACHIP_LCSR_LIM 0x3fffffffffffULL
+#define NUMACHIP_LCSR_SIZE (NUMACHIP_LCSR_LIM - NUMACHIP_LCSR_BASE + 1)
+#define NUMACHIP_LAPIC_BITS 8
+
+static inline void *lcsr_address(unsigned long offset)
+{
+ return __va(NUMACHIP_LCSR_BASE | (1UL << 15) |
+ CSR_NODE_BITS(0xfff0) | (offset & CSR_OFFSET_MASK));
+}
+
+static inline unsigned int read_lcsr(unsigned long offset)
+{
+ return swab32(readl(lcsr_address(offset)));
+}
+
+static inline void write_lcsr(unsigned long offset, unsigned int val)
+{
+ writel(swab32(val), lcsr_address(offset));
+}
+
+/*
+ * On NumaChip2, local CSR space is 16MB and starts at fixed offset below 4G
+ */
+
+#define NUMACHIP2_LCSR_BASE 0xf0000000UL
+#define NUMACHIP2_LCSR_SIZE 0x1000000UL
+#define NUMACHIP2_APIC_ICR 0x100000
+#define NUMACHIP2_TIMER_DEADLINE 0x200000
+#define NUMACHIP2_TIMER_INT 0x200008
+#define NUMACHIP2_TIMER_NOW 0x200018
+#define NUMACHIP2_TIMER_RESET 0x200020
+
+static inline void __iomem *numachip2_lcsr_address(unsigned long offset)
+{
+ return (void __iomem *)__va(NUMACHIP2_LCSR_BASE |
+ (offset & (NUMACHIP2_LCSR_SIZE - 1)));
+}
+
+static inline u32 numachip2_read32_lcsr(unsigned long offset)
+{
+ return readl(numachip2_lcsr_address(offset));
+}
+
+static inline u64 numachip2_read64_lcsr(unsigned long offset)
+{
+ return readq(numachip2_lcsr_address(offset));
+}
+
+static inline void numachip2_write32_lcsr(unsigned long offset, u32 val)
+{
+ writel(val, numachip2_lcsr_address(offset));
+}
+
+static inline void numachip2_write64_lcsr(unsigned long offset, u64 val)
+{
+ writeq(val, numachip2_lcsr_address(offset));
+}
+
+static inline unsigned int numachip2_timer(void)
+{
+ return (smp_processor_id() % 48) << 6;
+}
+
+#endif /* _ASM_X86_NUMACHIP_NUMACHIP_CSR_H */
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
new file mode 100644
index 000000000..c2bf1de5d
--- /dev/null
+++ b/arch/x86/include/asm/olpc.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* OLPC machine specific definitions */
+
+#ifndef _ASM_X86_OLPC_H
+#define _ASM_X86_OLPC_H
+
+#include <asm/geode.h>
+
+struct olpc_platform_t {
+ int flags;
+ uint32_t boardrev;
+ int ecver;
+};
+
+#define OLPC_F_PRESENT 0x01
+#define OLPC_F_DCON 0x02
+#define OLPC_F_EC_WIDE_SCI 0x04
+
+#ifdef CONFIG_OLPC
+
+extern struct olpc_platform_t olpc_platform_info;
+
+/*
+ * OLPC board IDs contain the major build number within the mask 0x0ff0,
+ * and the minor build number within 0x000f. Pre-builds have a minor
+ * number less than 8, and normal builds start at 8. For example, 0x0B10
+ * is a PreB1, and 0x0C18 is a C1.
+ */
+
+static inline uint32_t olpc_board(uint8_t id)
+{
+ return (id << 4) | 0x8;
+}
+
+static inline uint32_t olpc_board_pre(uint8_t id)
+{
+ return id << 4;
+}
+
+static inline int machine_is_olpc(void)
+{
+ return (olpc_platform_info.flags & OLPC_F_PRESENT) ? 1 : 0;
+}
+
+/*
+ * The DCON is OLPC's Display Controller. It has a number of unique
+ * features that we might want to take advantage of..
+ */
+static inline int olpc_has_dcon(void)
+{
+ return (olpc_platform_info.flags & OLPC_F_DCON) ? 1 : 0;
+}
+
+/*
+ * The "Mass Production" version of OLPC's XO is identified as being model
+ * C2. During the prototype phase, the following models (in chronological
+ * order) were created: A1, B1, B2, B3, B4, C1. The A1 through B2 models
+ * were based on Geode GX CPUs, and models after that were based upon
+ * Geode LX CPUs. There were also some hand-assembled models floating
+ * around, referred to as PreB1, PreB2, etc.
+ */
+static inline int olpc_board_at_least(uint32_t rev)
+{
+ return olpc_platform_info.boardrev >= rev;
+}
+
+extern void olpc_ec_wakeup_set(u16 value);
+extern void olpc_ec_wakeup_clear(u16 value);
+extern bool olpc_ec_wakeup_available(void);
+
+extern int olpc_ec_mask_write(u16 bits);
+extern int olpc_ec_sci_query(u16 *sci_value);
+
+#else
+
+static inline int machine_is_olpc(void)
+{
+ return 0;
+}
+
+static inline int olpc_has_dcon(void)
+{
+ return 0;
+}
+
+static inline void olpc_ec_wakeup_set(u16 value) { }
+static inline void olpc_ec_wakeup_clear(u16 value) { }
+
+static inline bool olpc_ec_wakeup_available(void)
+{
+ return false;
+}
+
+#endif
+
+#ifdef CONFIG_OLPC_XO1_PM
+extern void do_olpc_suspend_lowlevel(void);
+extern void olpc_xo1_pm_wakeup_set(u16 value);
+extern void olpc_xo1_pm_wakeup_clear(u16 value);
+#endif
+
+extern int pci_olpc_init(void);
+
+/* SCI source values */
+
+#define EC_SCI_SRC_EMPTY 0x00
+#define EC_SCI_SRC_GAME 0x01
+#define EC_SCI_SRC_BATTERY 0x02
+#define EC_SCI_SRC_BATSOC 0x04
+#define EC_SCI_SRC_BATERR 0x08
+#define EC_SCI_SRC_EBOOK 0x10 /* XO-1 only */
+#define EC_SCI_SRC_WLAN 0x20 /* XO-1 only */
+#define EC_SCI_SRC_ACPWR 0x40
+#define EC_SCI_SRC_BATCRIT 0x80
+#define EC_SCI_SRC_GPWAKE 0x100 /* XO-1.5 only */
+#define EC_SCI_SRC_ALL 0x1FF
+
+/* GPIO assignments */
+
+#define OLPC_GPIO_MIC_AC 1
+#define OLPC_GPIO_DCON_STAT0 5
+#define OLPC_GPIO_DCON_STAT1 6
+#define OLPC_GPIO_DCON_IRQ 7
+#define OLPC_GPIO_THRM_ALRM geode_gpio(10)
+#define OLPC_GPIO_DCON_LOAD 11
+#define OLPC_GPIO_DCON_BLANK 12
+#define OLPC_GPIO_SMB_CLK 14
+#define OLPC_GPIO_SMB_DATA 15
+#define OLPC_GPIO_WORKAUX geode_gpio(24)
+#define OLPC_GPIO_LID 26
+#define OLPC_GPIO_ECSCI 27
+
+#endif /* _ASM_X86_OLPC_H */
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
new file mode 100644
index 000000000..8c2a1daf7
--- /dev/null
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_OLPC_OFW_H
+#define _ASM_X86_OLPC_OFW_H
+
+/* index into the page table containing the entry OFW occupies */
+#define OLPC_OFW_PDE_NR 1022
+
+#define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */
+
+#ifdef CONFIG_OLPC
+
+extern bool olpc_ofw_is_installed(void);
+
+/* run an OFW command by calling into the firmware */
+#define olpc_ofw(name, args, res) \
+ __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res)
+
+extern int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
+ void **res);
+
+/* determine whether OFW is available and lives in the proper memory */
+extern void olpc_ofw_detect(void);
+
+/* install OFW's pde permanently into the kernel's pgtable */
+extern void setup_olpc_ofw_pgd(void);
+
+/* check if OFW was detected during boot */
+extern bool olpc_ofw_present(void);
+
+extern void olpc_dt_build_devicetree(void);
+
+#else /* !CONFIG_OLPC */
+static inline void olpc_ofw_detect(void) { }
+static inline void setup_olpc_ofw_pgd(void) { }
+static inline void olpc_dt_build_devicetree(void) { }
+#endif /* !CONFIG_OLPC */
+
+#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/include/asm/orc_lookup.h b/arch/x86/include/asm/orc_lookup.h
new file mode 100644
index 000000000..91c8d8684
--- /dev/null
+++ b/arch/x86/include/asm/orc_lookup.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef _ORC_LOOKUP_H
+#define _ORC_LOOKUP_H
+
+/*
+ * This is a lookup table for speeding up access to the .orc_unwind table.
+ * Given an input address offset, the corresponding lookup table entry
+ * specifies a subset of the .orc_unwind table to search.
+ *
+ * Each block represents the end of the previous range and the start of the
+ * next range. An extra block is added to give the last range an end.
+ *
+ * The block size should be a power of 2 to avoid a costly 'div' instruction.
+ *
+ * A block size of 256 was chosen because it roughly doubles unwinder
+ * performance while only adding ~5% to the ORC data footprint.
+ */
+#define LOOKUP_BLOCK_ORDER 8
+#define LOOKUP_BLOCK_SIZE (1 << LOOKUP_BLOCK_ORDER)
+
+#ifndef LINKER_SCRIPT
+
+extern unsigned int orc_lookup[];
+extern unsigned int orc_lookup_end[];
+
+#define LOOKUP_START_IP (unsigned long)_stext
+#define LOOKUP_STOP_IP (unsigned long)_etext
+
+#endif /* LINKER_SCRIPT */
+
+#endif /* _ORC_LOOKUP_H */
diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h
new file mode 100644
index 000000000..46f516dd8
--- /dev/null
+++ b/arch/x86/include/asm/orc_types.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _ORC_TYPES_H
+#define _ORC_TYPES_H
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+
+/*
+ * The ORC_REG_* registers are base registers which are used to find other
+ * registers on the stack.
+ *
+ * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the
+ * address of the previous frame: the caller's SP before it called the current
+ * function.
+ *
+ * ORC_REG_UNDEFINED means the corresponding register's value didn't change in
+ * the current frame.
+ *
+ * The most commonly used base registers are SP and BP -- which the previous SP
+ * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is
+ * usually based on.
+ *
+ * The rest of the base registers are needed for special cases like entry code
+ * and GCC realigned stacks.
+ */
+#define ORC_REG_UNDEFINED 0
+#define ORC_REG_PREV_SP 1
+#define ORC_REG_DX 2
+#define ORC_REG_DI 3
+#define ORC_REG_BP 4
+#define ORC_REG_SP 5
+#define ORC_REG_R10 6
+#define ORC_REG_R13 7
+#define ORC_REG_BP_INDIRECT 8
+#define ORC_REG_SP_INDIRECT 9
+#define ORC_REG_MAX 15
+
+/*
+ * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the
+ * caller's SP right before it made the call). Used for all callable
+ * functions, i.e. all C code and all callable asm functions.
+ *
+ * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points
+ * to a fully populated pt_regs from a syscall, interrupt, or exception.
+ *
+ * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset
+ * points to the iret return frame.
+ *
+ * The UNWIND_HINT macros are used only for the unwind_hint struct. They
+ * aren't used in struct orc_entry due to size and complexity constraints.
+ * Objtool converts them to real types when it converts the hints to orc
+ * entries.
+ */
+#define ORC_TYPE_CALL 0
+#define ORC_TYPE_REGS 1
+#define ORC_TYPE_REGS_IRET 2
+#define UNWIND_HINT_TYPE_SAVE 3
+#define UNWIND_HINT_TYPE_RESTORE 4
+
+#ifndef __ASSEMBLY__
+/*
+ * This struct is more or less a vastly simplified version of the DWARF Call
+ * Frame Information standard. It contains only the necessary parts of DWARF
+ * CFI, simplified for ease of access by the in-kernel unwinder. It tells the
+ * unwinder how to find the previous SP and BP (and sometimes entry regs) on
+ * the stack for a given code address. Each instance of the struct corresponds
+ * to one or more code locations.
+ */
+struct orc_entry {
+ s16 sp_offset;
+ s16 bp_offset;
+ unsigned sp_reg:4;
+ unsigned bp_reg:4;
+ unsigned type:2;
+ unsigned end:1;
+} __packed;
+
+/*
+ * This struct is used by asm and inline asm code to manually annotate the
+ * location of registers on the stack for the ORC unwinder.
+ *
+ * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*.
+ */
+struct unwind_hint {
+ u32 ip;
+ s16 sp_offset;
+ u8 sp_reg;
+ u8 type;
+ u8 end;
+};
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ORC_TYPES_H */
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
new file mode 100644
index 000000000..7555b4880
--- /dev/null
+++ b/arch/x86/include/asm/page.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PAGE_H
+#define _ASM_X86_PAGE_H
+
+#include <linux/types.h>
+
+#ifdef __KERNEL__
+
+#include <asm/page_types.h>
+
+#ifdef CONFIG_X86_64
+#include <asm/page_64.h>
+#else
+#include <asm/page_32.h>
+#endif /* CONFIG_X86_64 */
+
+#ifndef __ASSEMBLY__
+
+struct page;
+
+#include <linux/range.h>
+extern struct range pfn_mapped[];
+extern int nr_pfn_mapped;
+
+static inline void clear_user_page(void *page, unsigned long vaddr,
+ struct page *pg)
+{
+ clear_page(page);
+}
+
+static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
+ struct page *topage)
+{
+ copy_page(to, from);
+}
+
+#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
+ alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
+#ifndef __pa
+#define __pa(x) __phys_addr((unsigned long)(x))
+#endif
+
+#define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x))
+/* __pa_symbol should be used for C visible symbols.
+ This seems to be the official gcc blessed way to do such arithmetic. */
+/*
+ * We need __phys_reloc_hide() here because gcc may assume that there is no
+ * overflow during __pa() calculation and can optimize it unexpectedly.
+ * Newer versions of gcc provide -fno-strict-overflow switch to handle this
+ * case properly. Once all supported versions of gcc understand it, we can
+ * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated)
+ */
+#define __pa_symbol(x) \
+ __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x)))
+
+#ifndef __va
+#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
+#endif
+
+#define __boot_va(x) __va(x)
+#define __boot_pa(x) __pa(x)
+
+/*
+ * virt_to_page(kaddr) returns a valid pointer if and only if
+ * virt_addr_valid(kaddr) returns true.
+ */
+#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
+#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
+extern bool __virt_addr_valid(unsigned long kaddr);
+#define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr))
+
+#endif /* __ASSEMBLY__ */
+
+#include <asm-generic/memory_model.h>
+#include <asm-generic/getorder.h>
+
+#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+
+#endif /* __KERNEL__ */
+#endif /* _ASM_X86_PAGE_H */
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
new file mode 100644
index 000000000..94dbd51df
--- /dev/null
+++ b/arch/x86/include/asm/page_32.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PAGE_32_H
+#define _ASM_X86_PAGE_32_H
+
+#include <asm/page_32_types.h>
+
+#ifndef __ASSEMBLY__
+
+#define __phys_addr_nodebug(x) ((x) - PAGE_OFFSET)
+#ifdef CONFIG_DEBUG_VIRTUAL
+extern unsigned long __phys_addr(unsigned long);
+#else
+#define __phys_addr(x) __phys_addr_nodebug(x)
+#endif
+#define __phys_addr_symbol(x) __phys_addr(x)
+#define __phys_reloc_hide(x) RELOC_HIDE((x), 0)
+
+#ifdef CONFIG_FLATMEM
+#define pfn_valid(pfn) ((pfn) < max_mapnr)
+#endif /* CONFIG_FLATMEM */
+
+#ifdef CONFIG_X86_USE_3DNOW
+#include <asm/mmx.h>
+
+static inline void clear_page(void *page)
+{
+ mmx_clear_page(page);
+}
+
+static inline void copy_page(void *to, void *from)
+{
+ mmx_copy_page(to, from);
+}
+#else /* !CONFIG_X86_USE_3DNOW */
+#include <linux/string.h>
+
+static inline void clear_page(void *page)
+{
+ memset(page, 0, PAGE_SIZE);
+}
+
+static inline void copy_page(void *to, void *from)
+{
+ memcpy(to, from, PAGE_SIZE);
+}
+#endif /* CONFIG_X86_3DNOW */
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_PAGE_32_H */
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
new file mode 100644
index 000000000..0d5c739ee
--- /dev/null
+++ b/arch/x86/include/asm/page_32_types.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PAGE_32_DEFS_H
+#define _ASM_X86_PAGE_32_DEFS_H
+
+#include <linux/const.h>
+
+/*
+ * This handles the memory map.
+ *
+ * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
+ * a virtual address space of one gigabyte, which limits the
+ * amount of physical memory you can use to about 950MB.
+ *
+ * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
+ * and CONFIG_HIGHMEM64G options in the kernel configuration.
+ */
+#define __PAGE_OFFSET_BASE _AC(CONFIG_PAGE_OFFSET, UL)
+#define __PAGE_OFFSET __PAGE_OFFSET_BASE
+
+#define __START_KERNEL_map __PAGE_OFFSET
+
+#define THREAD_SIZE_ORDER 1
+#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
+
+#define DOUBLEFAULT_STACK 1
+#define NMI_STACK 0
+#define DEBUG_STACK 0
+#define MCE_STACK 0
+#define N_EXCEPTION_STACKS 1
+
+#ifdef CONFIG_X86_PAE
+/*
+ * This is beyond the 44 bit limit imposed by the 32bit long pfns,
+ * but we need the full mask to make sure inverted PROT_NONE
+ * entries have all the host bits set in a guest.
+ * The real limit is still 44 bits.
+ */
+#define __PHYSICAL_MASK_SHIFT 52
+#define __VIRTUAL_MASK_SHIFT 32
+
+#else /* !CONFIG_X86_PAE */
+#define __PHYSICAL_MASK_SHIFT 32
+#define __VIRTUAL_MASK_SHIFT 32
+#endif /* CONFIG_X86_PAE */
+
+/*
+ * Kernel image size is limited to 512 MB (see in arch/x86/kernel/head_32.S)
+ */
+#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
+
+#ifndef __ASSEMBLY__
+
+/*
+ * This much address space is reserved for vmalloc() and iomap()
+ * as well as fixmap mappings.
+ */
+extern unsigned int __VMALLOC_RESERVE;
+extern int sysctl_legacy_va_layout;
+
+extern void find_low_pfn_range(void);
+extern void setup_bootmem_allocator(void);
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_PAGE_32_DEFS_H */
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
new file mode 100644
index 000000000..939b1cff4
--- /dev/null
+++ b/arch/x86/include/asm/page_64.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PAGE_64_H
+#define _ASM_X86_PAGE_64_H
+
+#include <asm/page_64_types.h>
+
+#ifndef __ASSEMBLY__
+#include <asm/alternative.h>
+
+/* duplicated to the one in bootmem.h */
+extern unsigned long max_pfn;
+extern unsigned long phys_base;
+
+extern unsigned long page_offset_base;
+extern unsigned long vmalloc_base;
+extern unsigned long vmemmap_base;
+
+static inline unsigned long __phys_addr_nodebug(unsigned long x)
+{
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* use the carry flag to determine if x was < __START_KERNEL_map */
+ x = y + ((x > y) ? phys_base : (__START_KERNEL_map - PAGE_OFFSET));
+
+ return x;
+}
+
+#ifdef CONFIG_DEBUG_VIRTUAL
+extern unsigned long __phys_addr(unsigned long);
+extern unsigned long __phys_addr_symbol(unsigned long);
+#else
+#define __phys_addr(x) __phys_addr_nodebug(x)
+#define __phys_addr_symbol(x) \
+ ((unsigned long)(x) - __START_KERNEL_map + phys_base)
+#endif
+
+#define __phys_reloc_hide(x) (x)
+
+#ifdef CONFIG_FLATMEM
+#define pfn_valid(pfn) ((pfn) < max_pfn)
+#endif
+
+void clear_page_orig(void *page);
+void clear_page_rep(void *page);
+void clear_page_erms(void *page);
+
+static inline void clear_page(void *page)
+{
+ alternative_call_2(clear_page_orig,
+ clear_page_rep, X86_FEATURE_REP_GOOD,
+ clear_page_erms, X86_FEATURE_ERMS,
+ "=D" (page),
+ "0" (page)
+ : "cc", "memory", "rax", "rcx");
+}
+
+void copy_page(void *to, void *from);
+
+#endif /* !__ASSEMBLY__ */
+
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+# define __HAVE_ARCH_GATE_AREA 1
+#endif
+
+#endif /* _ASM_X86_PAGE_64_H */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
new file mode 100644
index 000000000..b16fb3e18
--- /dev/null
+++ b/arch/x86/include/asm/page_64_types.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PAGE_64_DEFS_H
+#define _ASM_X86_PAGE_64_DEFS_H
+
+#ifndef __ASSEMBLY__
+#include <asm/kaslr.h>
+#endif
+
+#ifdef CONFIG_KASAN
+#ifdef CONFIG_KASAN_EXTRA
+#define KASAN_STACK_ORDER 2
+#else
+#define KASAN_STACK_ORDER 1
+#endif
+#else
+#define KASAN_STACK_ORDER 0
+#endif
+
+#define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER)
+#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
+#define CURRENT_MASK (~(THREAD_SIZE - 1))
+
+#define EXCEPTION_STACK_ORDER (1 + KASAN_STACK_ORDER)
+#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
+
+#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
+#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
+
+#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER)
+#define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
+
+#define DOUBLEFAULT_STACK 1
+#define NMI_STACK 2
+#define DEBUG_STACK 3
+#define MCE_STACK 4
+#define N_EXCEPTION_STACKS 4 /* hw limit: 7 */
+
+/*
+ * Set __PAGE_OFFSET to the most negative possible address +
+ * PGDIR_SIZE*17 (pgd slot 273).
+ *
+ * The gap is to allow a space for LDT remap for PTI (1 pgd slot) and space for
+ * a hypervisor (16 slots). Choosing 16 slots for a hypervisor is arbitrary,
+ * but it's what Xen requires.
+ */
+#define __PAGE_OFFSET_BASE_L5 _AC(0xff11000000000000, UL)
+#define __PAGE_OFFSET_BASE_L4 _AC(0xffff888000000000, UL)
+
+#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
+#define __PAGE_OFFSET page_offset_base
+#else
+#define __PAGE_OFFSET __PAGE_OFFSET_BASE_L4
+#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
+
+#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
+
+/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
+
+#define __PHYSICAL_MASK_SHIFT 52
+
+#ifdef CONFIG_X86_5LEVEL
+#define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled() ? 56 : 47)
+#else
+#define __VIRTUAL_MASK_SHIFT 47
+#endif
+
+/*
+ * Kernel image size is limited to 1GiB due to the fixmap living in the
+ * next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use
+ * 512MiB by default, leaving 1.5GiB for modules once the page tables
+ * are fully set up. If kernel ASLR is configured, it can extend the
+ * kernel page table mapping, reducing the size of the modules area.
+ */
+#if defined(CONFIG_RANDOMIZE_BASE)
+#define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024)
+#else
+#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
+#endif
+
+#endif /* _ASM_X86_PAGE_64_DEFS_H */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
new file mode 100644
index 000000000..c85e15010
--- /dev/null
+++ b/arch/x86/include/asm/page_types.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PAGE_DEFS_H
+#define _ASM_X86_PAGE_DEFS_H
+
+#include <linux/const.h>
+#include <linux/types.h>
+#include <linux/mem_encrypt.h>
+
+/* PAGE_SHIFT determines the page size */
+#define PAGE_SHIFT 12
+#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
+#define PAGE_MASK (~(PAGE_SIZE-1))
+
+#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
+#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
+
+#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
+#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
+
+#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
+
+/* Cast *PAGE_MASK to a signed type so that it is sign-extended if
+ virtual addresses are 32-bits but physical addresses are larger
+ (ie, 32-bit PAE). */
+#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
+#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_PAGE_MASK) & __PHYSICAL_MASK)
+#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_PAGE_MASK) & __PHYSICAL_MASK)
+
+#define HPAGE_SHIFT PMD_SHIFT
+#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
+#define HPAGE_MASK (~(HPAGE_SIZE - 1))
+#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
+
+#define HUGE_MAX_HSTATE 2
+
+#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
+
+#define VM_DATA_DEFAULT_FLAGS \
+ (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
+ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
+#define __PHYSICAL_START ALIGN(CONFIG_PHYSICAL_START, \
+ CONFIG_PHYSICAL_ALIGN)
+
+#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
+
+#ifdef CONFIG_X86_64
+#include <asm/page_64_types.h>
+#define IOREMAP_MAX_ORDER (PUD_SHIFT)
+#else
+#include <asm/page_32_types.h>
+#define IOREMAP_MAX_ORDER (PMD_SHIFT)
+#endif /* CONFIG_X86_64 */
+
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
+extern phys_addr_t physical_mask;
+#define __PHYSICAL_MASK physical_mask
+#else
+#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
+#endif
+
+extern int devmem_is_allowed(unsigned long pagenr);
+
+extern unsigned long max_low_pfn_mapped;
+extern unsigned long max_pfn_mapped;
+
+static inline phys_addr_t get_max_mapped(void)
+{
+ return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
+}
+
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
+
+extern unsigned long init_memory_mapping(unsigned long start,
+ unsigned long end);
+
+extern void initmem_init(void);
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_PAGE_DEFS_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
new file mode 100644
index 000000000..a04677038
--- /dev/null
+++ b/arch/x86/include/asm/paravirt.h
@@ -0,0 +1,975 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PARAVIRT_H
+#define _ASM_X86_PARAVIRT_H
+/* Various instructions on x86 need to be replaced for
+ * para-virtualization: those hooks are defined here. */
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/pgtable_types.h>
+#include <asm/asm.h>
+#include <asm/nospec-branch.h>
+
+#include <asm/paravirt_types.h>
+
+#ifndef __ASSEMBLY__
+#include <linux/bug.h>
+#include <linux/types.h>
+#include <linux/cpumask.h>
+#include <asm/frame.h>
+
+static inline void load_sp0(unsigned long sp0)
+{
+ PVOP_VCALL1(pv_cpu_ops.load_sp0, sp0);
+}
+
+/* The paravirtualized CPUID instruction. */
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ PVOP_VCALL4(pv_cpu_ops.cpuid, eax, ebx, ecx, edx);
+}
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+static inline unsigned long paravirt_get_debugreg(int reg)
+{
+ return PVOP_CALL1(unsigned long, pv_cpu_ops.get_debugreg, reg);
+}
+#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg)
+static inline void set_debugreg(unsigned long val, int reg)
+{
+ PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val);
+}
+
+static inline unsigned long read_cr0(void)
+{
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0);
+}
+
+static inline void write_cr0(unsigned long x)
+{
+ PVOP_VCALL1(pv_cpu_ops.write_cr0, x);
+}
+
+static inline unsigned long read_cr2(void)
+{
+ return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr2);
+}
+
+static inline void write_cr2(unsigned long x)
+{
+ PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
+}
+
+static inline unsigned long __read_cr3(void)
+{
+ return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
+}
+
+static inline void write_cr3(unsigned long x)
+{
+ PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
+}
+
+static inline void __write_cr4(unsigned long x)
+{
+ PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long read_cr8(void)
+{
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr8);
+}
+
+static inline void write_cr8(unsigned long x)
+{
+ PVOP_VCALL1(pv_cpu_ops.write_cr8, x);
+}
+#endif
+
+static inline void arch_safe_halt(void)
+{
+ PVOP_VCALL0(pv_irq_ops.safe_halt);
+}
+
+static inline void halt(void)
+{
+ PVOP_VCALL0(pv_irq_ops.halt);
+}
+
+static inline void wbinvd(void)
+{
+ PVOP_VCALL0(pv_cpu_ops.wbinvd);
+}
+
+#define get_kernel_rpl() (pv_info.kernel_rpl)
+
+static inline u64 paravirt_read_msr(unsigned msr)
+{
+ return PVOP_CALL1(u64, pv_cpu_ops.read_msr, msr);
+}
+
+static inline void paravirt_write_msr(unsigned msr,
+ unsigned low, unsigned high)
+{
+ PVOP_VCALL3(pv_cpu_ops.write_msr, msr, low, high);
+}
+
+static inline u64 paravirt_read_msr_safe(unsigned msr, int *err)
+{
+ return PVOP_CALL2(u64, pv_cpu_ops.read_msr_safe, msr, err);
+}
+
+static inline int paravirt_write_msr_safe(unsigned msr,
+ unsigned low, unsigned high)
+{
+ return PVOP_CALL3(int, pv_cpu_ops.write_msr_safe, msr, low, high);
+}
+
+#define rdmsr(msr, val1, val2) \
+do { \
+ u64 _l = paravirt_read_msr(msr); \
+ val1 = (u32)_l; \
+ val2 = _l >> 32; \
+} while (0)
+
+#define wrmsr(msr, val1, val2) \
+do { \
+ paravirt_write_msr(msr, val1, val2); \
+} while (0)
+
+#define rdmsrl(msr, val) \
+do { \
+ val = paravirt_read_msr(msr); \
+} while (0)
+
+static inline void wrmsrl(unsigned msr, u64 val)
+{
+ wrmsr(msr, (u32)val, (u32)(val>>32));
+}
+
+#define wrmsr_safe(msr, a, b) paravirt_write_msr_safe(msr, a, b)
+
+/* rdmsr with exception handling */
+#define rdmsr_safe(msr, a, b) \
+({ \
+ int _err; \
+ u64 _l = paravirt_read_msr_safe(msr, &_err); \
+ (*a) = (u32)_l; \
+ (*b) = _l >> 32; \
+ _err; \
+})
+
+static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
+{
+ int err;
+
+ *p = paravirt_read_msr_safe(msr, &err);
+ return err;
+}
+
+static inline unsigned long long paravirt_sched_clock(void)
+{
+ return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
+}
+
+struct static_key;
+extern struct static_key paravirt_steal_enabled;
+extern struct static_key paravirt_steal_rq_enabled;
+
+static inline u64 paravirt_steal_clock(int cpu)
+{
+ return PVOP_CALL1(u64, pv_time_ops.steal_clock, cpu);
+}
+
+static inline unsigned long long paravirt_read_pmc(int counter)
+{
+ return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
+}
+
+#define rdpmc(counter, low, high) \
+do { \
+ u64 _l = paravirt_read_pmc(counter); \
+ low = (u32)_l; \
+ high = _l >> 32; \
+} while (0)
+
+#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter))
+
+static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
+{
+ PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries);
+}
+
+static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
+{
+ PVOP_VCALL2(pv_cpu_ops.free_ldt, ldt, entries);
+}
+
+static inline void load_TR_desc(void)
+{
+ PVOP_VCALL0(pv_cpu_ops.load_tr_desc);
+}
+static inline void load_gdt(const struct desc_ptr *dtr)
+{
+ PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr);
+}
+static inline void load_idt(const struct desc_ptr *dtr)
+{
+ PVOP_VCALL1(pv_cpu_ops.load_idt, dtr);
+}
+static inline void set_ldt(const void *addr, unsigned entries)
+{
+ PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
+}
+static inline unsigned long paravirt_store_tr(void)
+{
+ return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr);
+}
+#define store_tr(tr) ((tr) = paravirt_store_tr())
+static inline void load_TLS(struct thread_struct *t, unsigned cpu)
+{
+ PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu);
+}
+
+#ifdef CONFIG_X86_64
+static inline void load_gs_index(unsigned int gs)
+{
+ PVOP_VCALL1(pv_cpu_ops.load_gs_index, gs);
+}
+#endif
+
+static inline void write_ldt_entry(struct desc_struct *dt, int entry,
+ const void *desc)
+{
+ PVOP_VCALL3(pv_cpu_ops.write_ldt_entry, dt, entry, desc);
+}
+
+static inline void write_gdt_entry(struct desc_struct *dt, int entry,
+ void *desc, int type)
+{
+ PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, desc, type);
+}
+
+static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
+{
+ PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g);
+}
+static inline void set_iopl_mask(unsigned mask)
+{
+ PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
+}
+
+/* The paravirtualized I/O functions */
+static inline void slow_down_io(void)
+{
+ pv_cpu_ops.io_delay();
+#ifdef REALLY_SLOW_IO
+ pv_cpu_ops.io_delay();
+ pv_cpu_ops.io_delay();
+ pv_cpu_ops.io_delay();
+#endif
+}
+
+static inline void paravirt_activate_mm(struct mm_struct *prev,
+ struct mm_struct *next)
+{
+ PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next);
+}
+
+static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm,
+ struct mm_struct *mm)
+{
+ PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm);
+}
+
+static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
+{
+ PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm);
+}
+
+static inline void __flush_tlb(void)
+{
+ PVOP_VCALL0(pv_mmu_ops.flush_tlb_user);
+}
+static inline void __flush_tlb_global(void)
+{
+ PVOP_VCALL0(pv_mmu_ops.flush_tlb_kernel);
+}
+static inline void __flush_tlb_one_user(unsigned long addr)
+{
+ PVOP_VCALL1(pv_mmu_ops.flush_tlb_one_user, addr);
+}
+
+static inline void flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
+{
+ PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info);
+}
+
+static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+ PVOP_VCALL2(pv_mmu_ops.tlb_remove_table, tlb, table);
+}
+
+static inline int paravirt_pgd_alloc(struct mm_struct *mm)
+{
+ return PVOP_CALL1(int, pv_mmu_ops.pgd_alloc, mm);
+}
+
+static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+ PVOP_VCALL2(pv_mmu_ops.pgd_free, mm, pgd);
+}
+
+static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn)
+{
+ PVOP_VCALL2(pv_mmu_ops.alloc_pte, mm, pfn);
+}
+static inline void paravirt_release_pte(unsigned long pfn)
+{
+ PVOP_VCALL1(pv_mmu_ops.release_pte, pfn);
+}
+
+static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
+{
+ PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn);
+}
+
+static inline void paravirt_release_pmd(unsigned long pfn)
+{
+ PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn);
+}
+
+static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn)
+{
+ PVOP_VCALL2(pv_mmu_ops.alloc_pud, mm, pfn);
+}
+static inline void paravirt_release_pud(unsigned long pfn)
+{
+ PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
+}
+
+static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn)
+{
+ PVOP_VCALL2(pv_mmu_ops.alloc_p4d, mm, pfn);
+}
+
+static inline void paravirt_release_p4d(unsigned long pfn)
+{
+ PVOP_VCALL1(pv_mmu_ops.release_p4d, pfn);
+}
+
+static inline pte_t __pte(pteval_t val)
+{
+ pteval_t ret;
+
+ if (sizeof(pteval_t) > sizeof(long))
+ ret = PVOP_CALLEE2(pteval_t,
+ pv_mmu_ops.make_pte,
+ val, (u64)val >> 32);
+ else
+ ret = PVOP_CALLEE1(pteval_t,
+ pv_mmu_ops.make_pte,
+ val);
+
+ return (pte_t) { .pte = ret };
+}
+
+static inline pteval_t pte_val(pte_t pte)
+{
+ pteval_t ret;
+
+ if (sizeof(pteval_t) > sizeof(long))
+ ret = PVOP_CALLEE2(pteval_t, pv_mmu_ops.pte_val,
+ pte.pte, (u64)pte.pte >> 32);
+ else
+ ret = PVOP_CALLEE1(pteval_t, pv_mmu_ops.pte_val,
+ pte.pte);
+
+ return ret;
+}
+
+static inline pgd_t __pgd(pgdval_t val)
+{
+ pgdval_t ret;
+
+ if (sizeof(pgdval_t) > sizeof(long))
+ ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.make_pgd,
+ val, (u64)val >> 32);
+ else
+ ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.make_pgd,
+ val);
+
+ return (pgd_t) { ret };
+}
+
+static inline pgdval_t pgd_val(pgd_t pgd)
+{
+ pgdval_t ret;
+
+ if (sizeof(pgdval_t) > sizeof(long))
+ ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.pgd_val,
+ pgd.pgd, (u64)pgd.pgd >> 32);
+ else
+ ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.pgd_val,
+ pgd.pgd);
+
+ return ret;
+}
+
+#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
+static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ pteval_t ret;
+
+ ret = PVOP_CALL3(pteval_t, pv_mmu_ops.ptep_modify_prot_start,
+ mm, addr, ptep);
+
+ return (pte_t) { .pte = ret };
+}
+
+static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ if (sizeof(pteval_t) > sizeof(long))
+ /* 5 arg words */
+ pv_mmu_ops.ptep_modify_prot_commit(mm, addr, ptep, pte);
+ else
+ PVOP_VCALL4(pv_mmu_ops.ptep_modify_prot_commit,
+ mm, addr, ptep, pte.pte);
+}
+
+static inline void set_pte(pte_t *ptep, pte_t pte)
+{
+ if (sizeof(pteval_t) > sizeof(long))
+ PVOP_VCALL3(pv_mmu_ops.set_pte, ptep,
+ pte.pte, (u64)pte.pte >> 32);
+ else
+ PVOP_VCALL2(pv_mmu_ops.set_pte, ptep,
+ pte.pte);
+}
+
+static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ if (sizeof(pteval_t) > sizeof(long))
+ /* 5 arg words */
+ pv_mmu_ops.set_pte_at(mm, addr, ptep, pte);
+ else
+ PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
+}
+
+static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ pmdval_t val = native_pmd_val(pmd);
+
+ if (sizeof(pmdval_t) > sizeof(long))
+ PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp, val, (u64)val >> 32);
+ else
+ PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val);
+}
+
+#if CONFIG_PGTABLE_LEVELS >= 3
+static inline pmd_t __pmd(pmdval_t val)
+{
+ pmdval_t ret;
+
+ if (sizeof(pmdval_t) > sizeof(long))
+ ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.make_pmd,
+ val, (u64)val >> 32);
+ else
+ ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.make_pmd,
+ val);
+
+ return (pmd_t) { ret };
+}
+
+static inline pmdval_t pmd_val(pmd_t pmd)
+{
+ pmdval_t ret;
+
+ if (sizeof(pmdval_t) > sizeof(long))
+ ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.pmd_val,
+ pmd.pmd, (u64)pmd.pmd >> 32);
+ else
+ ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.pmd_val,
+ pmd.pmd);
+
+ return ret;
+}
+
+static inline void set_pud(pud_t *pudp, pud_t pud)
+{
+ pudval_t val = native_pud_val(pud);
+
+ if (sizeof(pudval_t) > sizeof(long))
+ PVOP_VCALL3(pv_mmu_ops.set_pud, pudp,
+ val, (u64)val >> 32);
+ else
+ PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
+ val);
+}
+#if CONFIG_PGTABLE_LEVELS >= 4
+static inline pud_t __pud(pudval_t val)
+{
+ pudval_t ret;
+
+ if (sizeof(pudval_t) > sizeof(long))
+ ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.make_pud,
+ val, (u64)val >> 32);
+ else
+ ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.make_pud,
+ val);
+
+ return (pud_t) { ret };
+}
+
+static inline pudval_t pud_val(pud_t pud)
+{
+ pudval_t ret;
+
+ if (sizeof(pudval_t) > sizeof(long))
+ ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.pud_val,
+ pud.pud, (u64)pud.pud >> 32);
+ else
+ ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.pud_val,
+ pud.pud);
+
+ return ret;
+}
+
+static inline void pud_clear(pud_t *pudp)
+{
+ set_pud(pudp, __pud(0));
+}
+
+static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
+{
+ p4dval_t val = native_p4d_val(p4d);
+
+ if (sizeof(p4dval_t) > sizeof(long))
+ PVOP_VCALL3(pv_mmu_ops.set_p4d, p4dp,
+ val, (u64)val >> 32);
+ else
+ PVOP_VCALL2(pv_mmu_ops.set_p4d, p4dp,
+ val);
+}
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+
+static inline p4d_t __p4d(p4dval_t val)
+{
+ p4dval_t ret = PVOP_CALLEE1(p4dval_t, pv_mmu_ops.make_p4d, val);
+
+ return (p4d_t) { ret };
+}
+
+static inline p4dval_t p4d_val(p4d_t p4d)
+{
+ return PVOP_CALLEE1(p4dval_t, pv_mmu_ops.p4d_val, p4d.p4d);
+}
+
+static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, native_pgd_val(pgd));
+}
+
+#define set_pgd(pgdp, pgdval) do { \
+ if (pgtable_l5_enabled()) \
+ __set_pgd(pgdp, pgdval); \
+ else \
+ set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd }); \
+} while (0)
+
+#define pgd_clear(pgdp) do { \
+ if (pgtable_l5_enabled()) \
+ set_pgd(pgdp, __pgd(0)); \
+} while (0)
+
+#endif /* CONFIG_PGTABLE_LEVELS == 5 */
+
+static inline void p4d_clear(p4d_t *p4dp)
+{
+ set_p4d(p4dp, __p4d(0));
+}
+
+#endif /* CONFIG_PGTABLE_LEVELS == 4 */
+
+#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
+
+#ifdef CONFIG_X86_PAE
+/* Special-case pte-setting operations for PAE, which can't update a
+ 64-bit pte atomically */
+static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep,
+ pte.pte, pte.pte >> 32);
+}
+
+static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep);
+}
+
+static inline void pmd_clear(pmd_t *pmdp)
+{
+ PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp);
+}
+#else /* !CONFIG_X86_PAE */
+static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ set_pte(ptep, pte);
+}
+
+static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ set_pte_at(mm, addr, ptep, __pte(0));
+}
+
+static inline void pmd_clear(pmd_t *pmdp)
+{
+ set_pmd(pmdp, __pmd(0));
+}
+#endif /* CONFIG_X86_PAE */
+
+#define __HAVE_ARCH_START_CONTEXT_SWITCH
+static inline void arch_start_context_switch(struct task_struct *prev)
+{
+ PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev);
+}
+
+static inline void arch_end_context_switch(struct task_struct *next)
+{
+ PVOP_VCALL1(pv_cpu_ops.end_context_switch, next);
+}
+
+#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+static inline void arch_enter_lazy_mmu_mode(void)
+{
+ PVOP_VCALL0(pv_mmu_ops.lazy_mode.enter);
+}
+
+static inline void arch_leave_lazy_mmu_mode(void)
+{
+ PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave);
+}
+
+static inline void arch_flush_lazy_mmu_mode(void)
+{
+ PVOP_VCALL0(pv_mmu_ops.lazy_mode.flush);
+}
+
+static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
+ phys_addr_t phys, pgprot_t flags)
+{
+ pv_mmu_ops.set_fixmap(idx, phys, flags);
+}
+
+#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+
+static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
+ u32 val)
+{
+ PVOP_VCALL2(pv_lock_ops.queued_spin_lock_slowpath, lock, val);
+}
+
+static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
+{
+ PVOP_VCALLEE1(pv_lock_ops.queued_spin_unlock, lock);
+}
+
+static __always_inline void pv_wait(u8 *ptr, u8 val)
+{
+ PVOP_VCALL2(pv_lock_ops.wait, ptr, val);
+}
+
+static __always_inline void pv_kick(int cpu)
+{
+ PVOP_VCALL1(pv_lock_ops.kick, cpu);
+}
+
+static __always_inline bool pv_vcpu_is_preempted(long cpu)
+{
+ return PVOP_CALLEE1(bool, pv_lock_ops.vcpu_is_preempted, cpu);
+}
+
+#endif /* SMP && PARAVIRT_SPINLOCKS */
+
+#ifdef CONFIG_X86_32
+#define PV_SAVE_REGS "pushl %ecx; pushl %edx;"
+#define PV_RESTORE_REGS "popl %edx; popl %ecx;"
+
+/* save and restore all caller-save registers, except return value */
+#define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;"
+#define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;"
+
+#define PV_FLAGS_ARG "0"
+#define PV_EXTRA_CLOBBERS
+#define PV_VEXTRA_CLOBBERS
+#else
+/* save and restore all caller-save registers, except return value */
+#define PV_SAVE_ALL_CALLER_REGS \
+ "push %rcx;" \
+ "push %rdx;" \
+ "push %rsi;" \
+ "push %rdi;" \
+ "push %r8;" \
+ "push %r9;" \
+ "push %r10;" \
+ "push %r11;"
+#define PV_RESTORE_ALL_CALLER_REGS \
+ "pop %r11;" \
+ "pop %r10;" \
+ "pop %r9;" \
+ "pop %r8;" \
+ "pop %rdi;" \
+ "pop %rsi;" \
+ "pop %rdx;" \
+ "pop %rcx;"
+
+/* We save some registers, but all of them, that's too much. We clobber all
+ * caller saved registers but the argument parameter */
+#define PV_SAVE_REGS "pushq %%rdi;"
+#define PV_RESTORE_REGS "popq %%rdi;"
+#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx", "rsi"
+#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx", "rsi"
+#define PV_FLAGS_ARG "D"
+#endif
+
+/*
+ * Generate a thunk around a function which saves all caller-save
+ * registers except for the return value. This allows C functions to
+ * be called from assembler code where fewer than normal registers are
+ * available. It may also help code generation around calls from C
+ * code if the common case doesn't use many registers.
+ *
+ * When a callee is wrapped in a thunk, the caller can assume that all
+ * arg regs and all scratch registers are preserved across the
+ * call. The return value in rax/eax will not be saved, even for void
+ * functions.
+ */
+#define PV_THUNK_NAME(func) "__raw_callee_save_" #func
+#define PV_CALLEE_SAVE_REGS_THUNK(func) \
+ extern typeof(func) __raw_callee_save_##func; \
+ \
+ asm(".pushsection .text;" \
+ ".globl " PV_THUNK_NAME(func) ";" \
+ ".type " PV_THUNK_NAME(func) ", @function;" \
+ PV_THUNK_NAME(func) ":" \
+ FRAME_BEGIN \
+ PV_SAVE_ALL_CALLER_REGS \
+ "call " #func ";" \
+ PV_RESTORE_ALL_CALLER_REGS \
+ FRAME_END \
+ "ret;" \
+ ".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";" \
+ ".popsection")
+
+/* Get a reference to a callee-save function */
+#define PV_CALLEE_SAVE(func) \
+ ((struct paravirt_callee_save) { __raw_callee_save_##func })
+
+/* Promise that "func" already uses the right calling convention */
+#define __PV_IS_CALLEE_SAVE(func) \
+ ((struct paravirt_callee_save) { func })
+
+static inline notrace unsigned long arch_local_save_flags(void)
+{
+ return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl);
+}
+
+static inline notrace void arch_local_irq_restore(unsigned long f)
+{
+ PVOP_VCALLEE1(pv_irq_ops.restore_fl, f);
+}
+
+static inline notrace void arch_local_irq_disable(void)
+{
+ PVOP_VCALLEE0(pv_irq_ops.irq_disable);
+}
+
+static inline notrace void arch_local_irq_enable(void)
+{
+ PVOP_VCALLEE0(pv_irq_ops.irq_enable);
+}
+
+static inline notrace unsigned long arch_local_irq_save(void)
+{
+ unsigned long f;
+
+ f = arch_local_save_flags();
+ arch_local_irq_disable();
+ return f;
+}
+
+
+/* Make sure as little as possible of this mess escapes. */
+#undef PARAVIRT_CALL
+#undef __PVOP_CALL
+#undef __PVOP_VCALL
+#undef PVOP_VCALL0
+#undef PVOP_CALL0
+#undef PVOP_VCALL1
+#undef PVOP_CALL1
+#undef PVOP_VCALL2
+#undef PVOP_CALL2
+#undef PVOP_VCALL3
+#undef PVOP_CALL3
+#undef PVOP_VCALL4
+#undef PVOP_CALL4
+
+extern void default_banner(void);
+
+#else /* __ASSEMBLY__ */
+
+#define _PVSITE(ptype, clobbers, ops, word, algn) \
+771:; \
+ ops; \
+772:; \
+ .pushsection .parainstructions,"a"; \
+ .align algn; \
+ word 771b; \
+ .byte ptype; \
+ .byte 772b-771b; \
+ .short clobbers; \
+ .popsection
+
+
+#define COND_PUSH(set, mask, reg) \
+ .if ((~(set)) & mask); push %reg; .endif
+#define COND_POP(set, mask, reg) \
+ .if ((~(set)) & mask); pop %reg; .endif
+
+#ifdef CONFIG_X86_64
+
+#define PV_SAVE_REGS(set) \
+ COND_PUSH(set, CLBR_RAX, rax); \
+ COND_PUSH(set, CLBR_RCX, rcx); \
+ COND_PUSH(set, CLBR_RDX, rdx); \
+ COND_PUSH(set, CLBR_RSI, rsi); \
+ COND_PUSH(set, CLBR_RDI, rdi); \
+ COND_PUSH(set, CLBR_R8, r8); \
+ COND_PUSH(set, CLBR_R9, r9); \
+ COND_PUSH(set, CLBR_R10, r10); \
+ COND_PUSH(set, CLBR_R11, r11)
+#define PV_RESTORE_REGS(set) \
+ COND_POP(set, CLBR_R11, r11); \
+ COND_POP(set, CLBR_R10, r10); \
+ COND_POP(set, CLBR_R9, r9); \
+ COND_POP(set, CLBR_R8, r8); \
+ COND_POP(set, CLBR_RDI, rdi); \
+ COND_POP(set, CLBR_RSI, rsi); \
+ COND_POP(set, CLBR_RDX, rdx); \
+ COND_POP(set, CLBR_RCX, rcx); \
+ COND_POP(set, CLBR_RAX, rax)
+
+#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8)
+#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
+#define PARA_INDIRECT(addr) *addr(%rip)
+#else
+#define PV_SAVE_REGS(set) \
+ COND_PUSH(set, CLBR_EAX, eax); \
+ COND_PUSH(set, CLBR_EDI, edi); \
+ COND_PUSH(set, CLBR_ECX, ecx); \
+ COND_PUSH(set, CLBR_EDX, edx)
+#define PV_RESTORE_REGS(set) \
+ COND_POP(set, CLBR_EDX, edx); \
+ COND_POP(set, CLBR_ECX, ecx); \
+ COND_POP(set, CLBR_EDI, edi); \
+ COND_POP(set, CLBR_EAX, eax)
+
+#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4)
+#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4)
+#define PARA_INDIRECT(addr) *%cs:addr
+#endif
+
+#define INTERRUPT_RETURN \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \
+ ANNOTATE_RETPOLINE_SAFE; \
+ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret);)
+
+#define DISABLE_INTERRUPTS(clobbers) \
+ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
+ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
+ ANNOTATE_RETPOLINE_SAFE; \
+ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \
+ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
+
+#define ENABLE_INTERRUPTS(clobbers) \
+ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \
+ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
+ ANNOTATE_RETPOLINE_SAFE; \
+ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \
+ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
+
+#ifdef CONFIG_X86_32
+#define GET_CR0_INTO_EAX \
+ push %ecx; push %edx; \
+ ANNOTATE_RETPOLINE_SAFE; \
+ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \
+ pop %edx; pop %ecx
+#else /* !CONFIG_X86_32 */
+
+/*
+ * If swapgs is used while the userspace stack is still current,
+ * there's no way to call a pvop. The PV replacement *must* be
+ * inlined, or the swapgs instruction must be trapped and emulated.
+ */
+#define SWAPGS_UNSAFE_STACK \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
+ swapgs)
+
+/*
+ * Note: swapgs is very special, and in practise is either going to be
+ * implemented with a single "swapgs" instruction or something very
+ * special. Either way, we don't need to save any registers for
+ * it.
+ */
+#define SWAPGS \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
+ ANNOTATE_RETPOLINE_SAFE; \
+ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs); \
+ )
+
+#define GET_CR2_INTO_RAX \
+ ANNOTATE_RETPOLINE_SAFE; \
+ call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2);
+
+#define USERGS_SYSRET64 \
+ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
+ CLBR_NONE, \
+ ANNOTATE_RETPOLINE_SAFE; \
+ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64);)
+
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(clobbers) \
+ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \
+ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
+ ANNOTATE_RETPOLINE_SAFE; \
+ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl); \
+ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
+#endif
+
+#endif /* CONFIG_X86_32 */
+
+#endif /* __ASSEMBLY__ */
+#else /* CONFIG_PARAVIRT */
+# define default_banner x86_init_noop
+#ifndef __ASSEMBLY__
+static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm,
+ struct mm_struct *mm)
+{
+}
+
+static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
+{
+}
+#endif /* __ASSEMBLY__ */
+#endif /* !CONFIG_PARAVIRT */
+#endif /* _ASM_X86_PARAVIRT_H */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
new file mode 100644
index 000000000..4b75acc23
--- /dev/null
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -0,0 +1,699 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PARAVIRT_TYPES_H
+#define _ASM_X86_PARAVIRT_TYPES_H
+
+/* Bitmask of what can be clobbered: usually at least eax. */
+#define CLBR_NONE 0
+#define CLBR_EAX (1 << 0)
+#define CLBR_ECX (1 << 1)
+#define CLBR_EDX (1 << 2)
+#define CLBR_EDI (1 << 3)
+
+#ifdef CONFIG_X86_32
+/* CLBR_ANY should match all regs platform has. For i386, that's just it */
+#define CLBR_ANY ((1 << 4) - 1)
+
+#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX)
+#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX)
+#define CLBR_SCRATCH (0)
+#else
+#define CLBR_RAX CLBR_EAX
+#define CLBR_RCX CLBR_ECX
+#define CLBR_RDX CLBR_EDX
+#define CLBR_RDI CLBR_EDI
+#define CLBR_RSI (1 << 4)
+#define CLBR_R8 (1 << 5)
+#define CLBR_R9 (1 << 6)
+#define CLBR_R10 (1 << 7)
+#define CLBR_R11 (1 << 8)
+
+#define CLBR_ANY ((1 << 9) - 1)
+
+#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \
+ CLBR_RCX | CLBR_R8 | CLBR_R9)
+#define CLBR_RET_REG (CLBR_RAX)
+#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11)
+
+#endif /* X86_64 */
+
+#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG)
+
+#ifndef __ASSEMBLY__
+
+#include <asm/desc_defs.h>
+#include <asm/kmap_types.h>
+#include <asm/pgtable_types.h>
+#include <asm/nospec-branch.h>
+
+struct page;
+struct thread_struct;
+struct desc_ptr;
+struct tss_struct;
+struct mm_struct;
+struct desc_struct;
+struct task_struct;
+struct cpumask;
+struct flush_tlb_info;
+struct mmu_gather;
+
+/*
+ * Wrapper type for pointers to code which uses the non-standard
+ * calling convention. See PV_CALL_SAVE_REGS_THUNK below.
+ */
+struct paravirt_callee_save {
+ void *func;
+};
+
+/* general info */
+struct pv_info {
+ unsigned int kernel_rpl;
+ int shared_kernel_pmd;
+
+#ifdef CONFIG_X86_64
+ u16 extra_user_64bit_cs; /* __USER_CS if none */
+#endif
+
+ const char *name;
+};
+
+struct pv_init_ops {
+ /*
+ * Patch may replace one of the defined code sequences with
+ * arbitrary code, subject to the same register constraints.
+ * This generally means the code is not free to clobber any
+ * registers other than EAX. The patch function should return
+ * the number of bytes of code generated, as we nop pad the
+ * rest in generic code.
+ */
+ unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
+ unsigned long addr, unsigned len);
+} __no_randomize_layout;
+
+
+struct pv_lazy_ops {
+ /* Set deferred update mode, used for batching operations. */
+ void (*enter)(void);
+ void (*leave)(void);
+ void (*flush)(void);
+} __no_randomize_layout;
+
+struct pv_time_ops {
+ unsigned long long (*sched_clock)(void);
+ unsigned long long (*steal_clock)(int cpu);
+} __no_randomize_layout;
+
+struct pv_cpu_ops {
+ /* hooks for various privileged instructions */
+ unsigned long (*get_debugreg)(int regno);
+ void (*set_debugreg)(int regno, unsigned long value);
+
+ unsigned long (*read_cr0)(void);
+ void (*write_cr0)(unsigned long);
+
+ void (*write_cr4)(unsigned long);
+
+#ifdef CONFIG_X86_64
+ unsigned long (*read_cr8)(void);
+ void (*write_cr8)(unsigned long);
+#endif
+
+ /* Segment descriptor handling */
+ void (*load_tr_desc)(void);
+ void (*load_gdt)(const struct desc_ptr *);
+ void (*load_idt)(const struct desc_ptr *);
+ void (*set_ldt)(const void *desc, unsigned entries);
+ unsigned long (*store_tr)(void);
+ void (*load_tls)(struct thread_struct *t, unsigned int cpu);
+#ifdef CONFIG_X86_64
+ void (*load_gs_index)(unsigned int idx);
+#endif
+ void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum,
+ const void *desc);
+ void (*write_gdt_entry)(struct desc_struct *,
+ int entrynum, const void *desc, int size);
+ void (*write_idt_entry)(gate_desc *,
+ int entrynum, const gate_desc *gate);
+ void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries);
+ void (*free_ldt)(struct desc_struct *ldt, unsigned entries);
+
+ void (*load_sp0)(unsigned long sp0);
+
+ void (*set_iopl_mask)(unsigned mask);
+
+ void (*wbinvd)(void);
+ void (*io_delay)(void);
+
+ /* cpuid emulation, mostly so that caps bits can be disabled */
+ void (*cpuid)(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx);
+
+ /* Unsafe MSR operations. These will warn or panic on failure. */
+ u64 (*read_msr)(unsigned int msr);
+ void (*write_msr)(unsigned int msr, unsigned low, unsigned high);
+
+ /*
+ * Safe MSR operations.
+ * read sets err to 0 or -EIO. write returns 0 or -EIO.
+ */
+ u64 (*read_msr_safe)(unsigned int msr, int *err);
+ int (*write_msr_safe)(unsigned int msr, unsigned low, unsigned high);
+
+ u64 (*read_pmc)(int counter);
+
+ /*
+ * Switch to usermode gs and return to 64-bit usermode using
+ * sysret. Only used in 64-bit kernels to return to 64-bit
+ * processes. Usermode register state, including %rsp, must
+ * already be restored.
+ */
+ void (*usergs_sysret64)(void);
+
+ /* Normal iret. Jump to this with the standard iret stack
+ frame set up. */
+ void (*iret)(void);
+
+ void (*swapgs)(void);
+
+ void (*start_context_switch)(struct task_struct *prev);
+ void (*end_context_switch)(struct task_struct *next);
+} __no_randomize_layout;
+
+struct pv_irq_ops {
+ /*
+ * Get/set interrupt state. save_fl and restore_fl are only
+ * expected to use X86_EFLAGS_IF; all other bits
+ * returned from save_fl are undefined, and may be ignored by
+ * restore_fl.
+ *
+ * NOTE: These functions callers expect the callee to preserve
+ * more registers than the standard C calling convention.
+ */
+ struct paravirt_callee_save save_fl;
+ struct paravirt_callee_save restore_fl;
+ struct paravirt_callee_save irq_disable;
+ struct paravirt_callee_save irq_enable;
+
+ void (*safe_halt)(void);
+ void (*halt)(void);
+
+} __no_randomize_layout;
+
+struct pv_mmu_ops {
+ unsigned long (*read_cr2)(void);
+ void (*write_cr2)(unsigned long);
+
+ unsigned long (*read_cr3)(void);
+ void (*write_cr3)(unsigned long);
+
+ /*
+ * Hooks for intercepting the creation/use/destruction of an
+ * mm_struct.
+ */
+ void (*activate_mm)(struct mm_struct *prev,
+ struct mm_struct *next);
+ void (*dup_mmap)(struct mm_struct *oldmm,
+ struct mm_struct *mm);
+ void (*exit_mmap)(struct mm_struct *mm);
+
+
+ /* TLB operations */
+ void (*flush_tlb_user)(void);
+ void (*flush_tlb_kernel)(void);
+ void (*flush_tlb_one_user)(unsigned long addr);
+ void (*flush_tlb_others)(const struct cpumask *cpus,
+ const struct flush_tlb_info *info);
+
+ void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
+
+ /* Hooks for allocating and freeing a pagetable top-level */
+ int (*pgd_alloc)(struct mm_struct *mm);
+ void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd);
+
+ /*
+ * Hooks for allocating/releasing pagetable pages when they're
+ * attached to a pagetable
+ */
+ void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
+ void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
+ void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
+ void (*alloc_p4d)(struct mm_struct *mm, unsigned long pfn);
+ void (*release_pte)(unsigned long pfn);
+ void (*release_pmd)(unsigned long pfn);
+ void (*release_pud)(unsigned long pfn);
+ void (*release_p4d)(unsigned long pfn);
+
+ /* Pagetable manipulation functions */
+ void (*set_pte)(pte_t *ptep, pte_t pteval);
+ void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval);
+ void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
+
+ pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep);
+ void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte);
+
+ struct paravirt_callee_save pte_val;
+ struct paravirt_callee_save make_pte;
+
+ struct paravirt_callee_save pgd_val;
+ struct paravirt_callee_save make_pgd;
+
+#if CONFIG_PGTABLE_LEVELS >= 3
+#ifdef CONFIG_X86_PAE
+ void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
+ void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep);
+ void (*pmd_clear)(pmd_t *pmdp);
+
+#endif /* CONFIG_X86_PAE */
+
+ void (*set_pud)(pud_t *pudp, pud_t pudval);
+
+ struct paravirt_callee_save pmd_val;
+ struct paravirt_callee_save make_pmd;
+
+#if CONFIG_PGTABLE_LEVELS >= 4
+ struct paravirt_callee_save pud_val;
+ struct paravirt_callee_save make_pud;
+
+ void (*set_p4d)(p4d_t *p4dp, p4d_t p4dval);
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+ struct paravirt_callee_save p4d_val;
+ struct paravirt_callee_save make_p4d;
+
+ void (*set_pgd)(pgd_t *pgdp, pgd_t pgdval);
+#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
+
+#endif /* CONFIG_PGTABLE_LEVELS >= 4 */
+
+#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
+
+ struct pv_lazy_ops lazy_mode;
+
+ /* dom0 ops */
+
+ /* Sometimes the physical address is a pfn, and sometimes its
+ an mfn. We can tell which is which from the index. */
+ void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx,
+ phys_addr_t phys, pgprot_t flags);
+} __no_randomize_layout;
+
+struct arch_spinlock;
+#ifdef CONFIG_SMP
+#include <asm/spinlock_types.h>
+#endif
+
+struct qspinlock;
+
+struct pv_lock_ops {
+ void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val);
+ struct paravirt_callee_save queued_spin_unlock;
+
+ void (*wait)(u8 *ptr, u8 val);
+ void (*kick)(int cpu);
+
+ struct paravirt_callee_save vcpu_is_preempted;
+} __no_randomize_layout;
+
+/* This contains all the paravirt structures: we get a convenient
+ * number for each function using the offset which we use to indicate
+ * what to patch. */
+struct paravirt_patch_template {
+ struct pv_init_ops pv_init_ops;
+ struct pv_time_ops pv_time_ops;
+ struct pv_cpu_ops pv_cpu_ops;
+ struct pv_irq_ops pv_irq_ops;
+ struct pv_mmu_ops pv_mmu_ops;
+ struct pv_lock_ops pv_lock_ops;
+} __no_randomize_layout;
+
+extern struct pv_info pv_info;
+extern struct pv_init_ops pv_init_ops;
+extern struct pv_time_ops pv_time_ops;
+extern struct pv_cpu_ops pv_cpu_ops;
+extern struct pv_irq_ops pv_irq_ops;
+extern struct pv_mmu_ops pv_mmu_ops;
+extern struct pv_lock_ops pv_lock_ops;
+
+#define PARAVIRT_PATCH(x) \
+ (offsetof(struct paravirt_patch_template, x) / sizeof(void *))
+
+#define paravirt_type(op) \
+ [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
+ [paravirt_opptr] "i" (&(op))
+#define paravirt_clobber(clobber) \
+ [paravirt_clobber] "i" (clobber)
+
+/*
+ * Generate some code, and mark it as patchable by the
+ * apply_paravirt() alternate instruction patcher.
+ */
+#define _paravirt_alt(insn_string, type, clobber) \
+ "771:\n\t" insn_string "\n" "772:\n" \
+ ".pushsection .parainstructions,\"a\"\n" \
+ _ASM_ALIGN "\n" \
+ _ASM_PTR " 771b\n" \
+ " .byte " type "\n" \
+ " .byte 772b-771b\n" \
+ " .short " clobber "\n" \
+ ".popsection\n"
+
+/* Generate patchable code, with the default asm parameters. */
+#define paravirt_alt(insn_string) \
+ _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")
+
+/* Simple instruction patching code. */
+#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
+
+#define DEF_NATIVE(ops, name, code) \
+ __visible extern const char start_##ops##_##name[], end_##ops##_##name[]; \
+ asm(NATIVE_LABEL("start_", ops, name) code NATIVE_LABEL("end_", ops, name))
+
+unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len);
+unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len);
+unsigned paravirt_patch_call(void *insnbuf,
+ const void *target, u16 tgt_clobbers,
+ unsigned long addr, u16 site_clobbers,
+ unsigned len);
+unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
+ unsigned long addr, unsigned len);
+unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
+ unsigned long addr, unsigned len);
+
+unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
+ const char *start, const char *end);
+
+unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
+ unsigned long addr, unsigned len);
+
+int paravirt_disable_iospace(void);
+
+/*
+ * This generates an indirect call based on the operation type number.
+ * The type number, computed in PARAVIRT_PATCH, is derived from the
+ * offset into the paravirt_patch_template structure, and can therefore be
+ * freely converted back into a structure offset.
+ */
+#define PARAVIRT_CALL \
+ ANNOTATE_RETPOLINE_SAFE \
+ "call *%c[paravirt_opptr];"
+
+/*
+ * These macros are intended to wrap calls through one of the paravirt
+ * ops structs, so that they can be later identified and patched at
+ * runtime.
+ *
+ * Normally, a call to a pv_op function is a simple indirect call:
+ * (pv_op_struct.operations)(args...).
+ *
+ * Unfortunately, this is a relatively slow operation for modern CPUs,
+ * because it cannot necessarily determine what the destination
+ * address is. In this case, the address is a runtime constant, so at
+ * the very least we can patch the call to e a simple direct call, or
+ * ideally, patch an inline implementation into the callsite. (Direct
+ * calls are essentially free, because the call and return addresses
+ * are completely predictable.)
+ *
+ * For i386, these macros rely on the standard gcc "regparm(3)" calling
+ * convention, in which the first three arguments are placed in %eax,
+ * %edx, %ecx (in that order), and the remaining arguments are placed
+ * on the stack. All caller-save registers (eax,edx,ecx) are expected
+ * to be modified (either clobbered or used for return values).
+ * X86_64, on the other hand, already specifies a register-based calling
+ * conventions, returning at %rax, with parameteres going on %rdi, %rsi,
+ * %rdx, and %rcx. Note that for this reason, x86_64 does not need any
+ * special handling for dealing with 4 arguments, unlike i386.
+ * However, x86_64 also have to clobber all caller saved registers, which
+ * unfortunately, are quite a bit (r8 - r11)
+ *
+ * The call instruction itself is marked by placing its start address
+ * and size into the .parainstructions section, so that
+ * apply_paravirt() in arch/i386/kernel/alternative.c can do the
+ * appropriate patching under the control of the backend pv_init_ops
+ * implementation.
+ *
+ * Unfortunately there's no way to get gcc to generate the args setup
+ * for the call, and then allow the call itself to be generated by an
+ * inline asm. Because of this, we must do the complete arg setup and
+ * return value handling from within these macros. This is fairly
+ * cumbersome.
+ *
+ * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments.
+ * It could be extended to more arguments, but there would be little
+ * to be gained from that. For each number of arguments, there are
+ * the two VCALL and CALL variants for void and non-void functions.
+ *
+ * When there is a return value, the invoker of the macro must specify
+ * the return type. The macro then uses sizeof() on that type to
+ * determine whether its a 32 or 64 bit value, and places the return
+ * in the right register(s) (just %eax for 32-bit, and %edx:%eax for
+ * 64-bit). For x86_64 machines, it just returns at %rax regardless of
+ * the return value size.
+ *
+ * 64-bit arguments are passed as a pair of adjacent 32-bit arguments
+ * i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments
+ * in low,high order
+ *
+ * Small structures are passed and returned in registers. The macro
+ * calling convention can't directly deal with this, so the wrapper
+ * functions must do this.
+ *
+ * These PVOP_* macros are only defined within this header. This
+ * means that all uses must be wrapped in inline functions. This also
+ * makes sure the incoming and outgoing types are always correct.
+ */
+#ifdef CONFIG_X86_32
+#define PVOP_VCALL_ARGS \
+ unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx;
+
+#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
+
+#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x))
+#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x))
+#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x))
+
+#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \
+ "=c" (__ecx)
+#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS
+
+#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx)
+#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
+
+#define EXTRA_CLOBBERS
+#define VEXTRA_CLOBBERS
+#else /* CONFIG_X86_64 */
+/* [re]ax isn't an arg, but the return val */
+#define PVOP_VCALL_ARGS \
+ unsigned long __edi = __edi, __esi = __esi, \
+ __edx = __edx, __ecx = __ecx, __eax = __eax;
+
+#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
+
+#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x))
+#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x))
+#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x))
+#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x))
+
+#define PVOP_VCALL_CLOBBERS "=D" (__edi), \
+ "=S" (__esi), "=d" (__edx), \
+ "=c" (__ecx)
+#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax)
+
+/* void functions are still allowed [re]ax for scratch */
+#define PVOP_VCALLEE_CLOBBERS "=a" (__eax)
+#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
+
+#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11"
+#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11"
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_PARAVIRT_DEBUG
+#define PVOP_TEST_NULL(op) BUG_ON(op == NULL)
+#else
+#define PVOP_TEST_NULL(op) ((void)op)
+#endif
+
+#define PVOP_RETMASK(rettype) \
+ ({ unsigned long __mask = ~0UL; \
+ switch (sizeof(rettype)) { \
+ case 1: __mask = 0xffUL; break; \
+ case 2: __mask = 0xffffUL; break; \
+ case 4: __mask = 0xffffffffUL; break; \
+ default: break; \
+ } \
+ __mask; \
+ })
+
+
+#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \
+ pre, post, ...) \
+ ({ \
+ rettype __ret; \
+ PVOP_CALL_ARGS; \
+ PVOP_TEST_NULL(op); \
+ /* This is 32-bit specific, but is okay in 64-bit */ \
+ /* since this condition will never hold */ \
+ if (sizeof(rettype) > sizeof(unsigned long)) { \
+ asm volatile(pre \
+ paravirt_alt(PARAVIRT_CALL) \
+ post \
+ : call_clbr, ASM_CALL_CONSTRAINT \
+ : paravirt_type(op), \
+ paravirt_clobber(clbr), \
+ ##__VA_ARGS__ \
+ : "memory", "cc" extra_clbr); \
+ __ret = (rettype)((((u64)__edx) << 32) | __eax); \
+ } else { \
+ asm volatile(pre \
+ paravirt_alt(PARAVIRT_CALL) \
+ post \
+ : call_clbr, ASM_CALL_CONSTRAINT \
+ : paravirt_type(op), \
+ paravirt_clobber(clbr), \
+ ##__VA_ARGS__ \
+ : "memory", "cc" extra_clbr); \
+ __ret = (rettype)(__eax & PVOP_RETMASK(rettype)); \
+ } \
+ __ret; \
+ })
+
+#define __PVOP_CALL(rettype, op, pre, post, ...) \
+ ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \
+ EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__)
+
+#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \
+ ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \
+ PVOP_CALLEE_CLOBBERS, , \
+ pre, post, ##__VA_ARGS__)
+
+
+#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \
+ ({ \
+ PVOP_VCALL_ARGS; \
+ PVOP_TEST_NULL(op); \
+ asm volatile(pre \
+ paravirt_alt(PARAVIRT_CALL) \
+ post \
+ : call_clbr, ASM_CALL_CONSTRAINT \
+ : paravirt_type(op), \
+ paravirt_clobber(clbr), \
+ ##__VA_ARGS__ \
+ : "memory", "cc" extra_clbr); \
+ })
+
+#define __PVOP_VCALL(op, pre, post, ...) \
+ ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \
+ VEXTRA_CLOBBERS, \
+ pre, post, ##__VA_ARGS__)
+
+#define __PVOP_VCALLEESAVE(op, pre, post, ...) \
+ ____PVOP_VCALL(op.func, CLBR_RET_REG, \
+ PVOP_VCALLEE_CLOBBERS, , \
+ pre, post, ##__VA_ARGS__)
+
+
+
+#define PVOP_CALL0(rettype, op) \
+ __PVOP_CALL(rettype, op, "", "")
+#define PVOP_VCALL0(op) \
+ __PVOP_VCALL(op, "", "")
+
+#define PVOP_CALLEE0(rettype, op) \
+ __PVOP_CALLEESAVE(rettype, op, "", "")
+#define PVOP_VCALLEE0(op) \
+ __PVOP_VCALLEESAVE(op, "", "")
+
+
+#define PVOP_CALL1(rettype, op, arg1) \
+ __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
+#define PVOP_VCALL1(op, arg1) \
+ __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1))
+
+#define PVOP_CALLEE1(rettype, op, arg1) \
+ __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
+#define PVOP_VCALLEE1(op, arg1) \
+ __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1))
+
+
+#define PVOP_CALL2(rettype, op, arg1, arg2) \
+ __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
+ PVOP_CALL_ARG2(arg2))
+#define PVOP_VCALL2(op, arg1, arg2) \
+ __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
+ PVOP_CALL_ARG2(arg2))
+
+#define PVOP_CALLEE2(rettype, op, arg1, arg2) \
+ __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
+ PVOP_CALL_ARG2(arg2))
+#define PVOP_VCALLEE2(op, arg1, arg2) \
+ __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \
+ PVOP_CALL_ARG2(arg2))
+
+
+#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \
+ __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
+ PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
+#define PVOP_VCALL3(op, arg1, arg2, arg3) \
+ __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
+ PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
+
+/* This is the only difference in x86_64. We can make it much simpler */
+#ifdef CONFIG_X86_32
+#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
+ __PVOP_CALL(rettype, op, \
+ "push %[_arg4];", "lea 4(%%esp),%%esp;", \
+ PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
+ PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4)))
+#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
+ __PVOP_VCALL(op, \
+ "push %[_arg4];", "lea 4(%%esp),%%esp;", \
+ "0" ((u32)(arg1)), "1" ((u32)(arg2)), \
+ "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
+#else
+#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
+ __PVOP_CALL(rettype, op, "", "", \
+ PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
+ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
+#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
+ __PVOP_VCALL(op, "", "", \
+ PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
+ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
+#endif
+
+/* Lazy mode for batching updates / context switch */
+enum paravirt_lazy_mode {
+ PARAVIRT_LAZY_NONE,
+ PARAVIRT_LAZY_MMU,
+ PARAVIRT_LAZY_CPU,
+};
+
+enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
+void paravirt_start_context_switch(struct task_struct *prev);
+void paravirt_end_context_switch(struct task_struct *next);
+
+void paravirt_enter_lazy_mmu(void);
+void paravirt_leave_lazy_mmu(void);
+void paravirt_flush_lazy_mmu(void);
+
+void _paravirt_nop(void);
+u32 _paravirt_ident_32(u32);
+u64 _paravirt_ident_64(u64);
+
+#define paravirt_nop ((void *)_paravirt_nop)
+
+/* These all sit in the .parainstructions section to tell us what to patch. */
+struct paravirt_patch_site {
+ u8 *instr; /* original instructions */
+ u8 instrtype; /* type of this instruction */
+ u8 len; /* length of original instruction */
+ u16 clobbers; /* what registers you may clobber */
+};
+
+extern struct paravirt_patch_site __parainstructions[],
+ __parainstructions_end[];
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_PARAVIRT_TYPES_H */
diff --git a/arch/x86/include/asm/parport.h b/arch/x86/include/asm/parport.h
new file mode 100644
index 000000000..163f78259
--- /dev/null
+++ b/arch/x86/include/asm/parport.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PARPORT_H
+#define _ASM_X86_PARPORT_H
+
+static int parport_pc_find_isa_ports(int autoirq, int autodma);
+static int parport_pc_find_nonpci_ports(int autoirq, int autodma)
+{
+ return parport_pc_find_isa_ports(autoirq, autodma);
+}
+
+#endif /* _ASM_X86_PARPORT_H */
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
new file mode 100644
index 000000000..92015c65f
--- /dev/null
+++ b/arch/x86/include/asm/pat.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PAT_H
+#define _ASM_X86_PAT_H
+
+#include <linux/types.h>
+#include <asm/pgtable_types.h>
+
+bool pat_enabled(void);
+void pat_disable(const char *reason);
+extern void pat_init(void);
+extern void init_cache_modes(void);
+
+extern int reserve_memtype(u64 start, u64 end,
+ enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm);
+extern int free_memtype(u64 start, u64 end);
+
+extern int kernel_map_sync_memtype(u64 base, unsigned long size,
+ enum page_cache_mode pcm);
+
+int io_reserve_memtype(resource_size_t start, resource_size_t end,
+ enum page_cache_mode *pcm);
+
+void io_free_memtype(resource_size_t start, resource_size_t end);
+
+bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn);
+
+#endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/pci-direct.h b/arch/x86/include/asm/pci-direct.h
new file mode 100644
index 000000000..94597a3cf
--- /dev/null
+++ b/arch/x86/include/asm/pci-direct.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PCI_DIRECT_H
+#define _ASM_X86_PCI_DIRECT_H
+
+#include <linux/types.h>
+
+/* Direct PCI access. This is used for PCI accesses in early boot before
+ the PCI subsystem works. */
+
+extern u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset);
+extern u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset);
+extern u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset);
+extern void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, u32 val);
+extern void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val);
+extern void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val);
+
+extern int early_pci_allowed(void);
+#endif /* _ASM_X86_PCI_DIRECT_H */
diff --git a/arch/x86/include/asm/pci-functions.h b/arch/x86/include/asm/pci-functions.h
new file mode 100644
index 000000000..1bbc10812
--- /dev/null
+++ b/arch/x86/include/asm/pci-functions.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * PCI BIOS function numbering for conventional PCI BIOS
+ * systems
+ */
+
+#define PCIBIOS_PCI_FUNCTION_ID 0xb1XX
+#define PCIBIOS_PCI_BIOS_PRESENT 0xb101
+#define PCIBIOS_FIND_PCI_DEVICE 0xb102
+#define PCIBIOS_FIND_PCI_CLASS_CODE 0xb103
+#define PCIBIOS_GENERATE_SPECIAL_CYCLE 0xb106
+#define PCIBIOS_READ_CONFIG_BYTE 0xb108
+#define PCIBIOS_READ_CONFIG_WORD 0xb109
+#define PCIBIOS_READ_CONFIG_DWORD 0xb10a
+#define PCIBIOS_WRITE_CONFIG_BYTE 0xb10b
+#define PCIBIOS_WRITE_CONFIG_WORD 0xb10c
+#define PCIBIOS_WRITE_CONFIG_DWORD 0xb10d
+#define PCIBIOS_GET_ROUTING_OPTIONS 0xb10e
+#define PCIBIOS_SET_PCI_HW_INT 0xb10f
+
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
new file mode 100644
index 000000000..662963681
--- /dev/null
+++ b/arch/x86/include/asm/pci.h
@@ -0,0 +1,161 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PCI_H
+#define _ASM_X86_PCI_H
+
+#include <linux/mm.h> /* for struct page */
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+#include <asm/io.h>
+#include <asm/pat.h>
+#include <asm/x86_init.h>
+
+#ifdef __KERNEL__
+
+struct pci_sysdata {
+ int domain; /* PCI domain */
+ int node; /* NUMA node */
+#ifdef CONFIG_ACPI
+ struct acpi_device *companion; /* ACPI companion device */
+#endif
+#ifdef CONFIG_X86_64
+ void *iommu; /* IOMMU private data */
+#endif
+#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
+ void *fwnode; /* IRQ domain for MSI assignment */
+#endif
+#if IS_ENABLED(CONFIG_VMD)
+ bool vmd_domain; /* True if in Intel VMD domain */
+#endif
+};
+
+extern int pci_routeirq;
+extern int noioapicquirk;
+extern int noioapicreroute;
+
+#ifdef CONFIG_PCI
+
+#ifdef CONFIG_PCI_DOMAINS
+static inline int pci_domain_nr(struct pci_bus *bus)
+{
+ struct pci_sysdata *sd = bus->sysdata;
+
+ return sd->domain;
+}
+
+static inline int pci_proc_domain(struct pci_bus *bus)
+{
+ return pci_domain_nr(bus);
+}
+#endif
+
+#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
+static inline void *_pci_root_bus_fwnode(struct pci_bus *bus)
+{
+ struct pci_sysdata *sd = bus->sysdata;
+
+ return sd->fwnode;
+}
+
+#define pci_root_bus_fwnode _pci_root_bus_fwnode
+#endif
+
+static inline bool is_vmd(struct pci_bus *bus)
+{
+#if IS_ENABLED(CONFIG_VMD)
+ struct pci_sysdata *sd = bus->sysdata;
+
+ return sd->vmd_domain;
+#else
+ return false;
+#endif
+}
+
+/* Can be used to override the logic in pci_scan_bus for skipping
+ already-configured bus numbers - to be used for buggy BIOSes
+ or architectures with incomplete PCI setup by the loader */
+
+extern unsigned int pcibios_assign_all_busses(void);
+extern int pci_legacy_init(void);
+#else
+static inline int pcibios_assign_all_busses(void) { return 0; }
+#endif
+
+extern unsigned long pci_mem_start;
+#define PCIBIOS_MIN_IO 0x1000
+#define PCIBIOS_MIN_MEM (pci_mem_start)
+
+#define PCIBIOS_MIN_CARDBUS_IO 0x4000
+
+extern int pcibios_enabled;
+void pcibios_scan_root(int bus);
+
+struct irq_routing_table *pcibios_get_irq_routing_table(void);
+int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
+
+
+#define HAVE_PCI_MMAP
+#define arch_can_pci_mmap_wc() pat_enabled()
+#define ARCH_GENERIC_PCI_MMAP_RESOURCE
+
+#ifdef CONFIG_PCI
+extern void early_quirks(void);
+#else
+static inline void early_quirks(void) { }
+#endif
+
+extern void pci_iommu_alloc(void);
+
+#ifdef CONFIG_PCI_MSI
+/* implemented in arch/x86/kernel/apic/io_apic. */
+struct msi_desc;
+int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
+void native_teardown_msi_irq(unsigned int irq);
+void native_restore_msi_irqs(struct pci_dev *dev);
+#else
+#define native_setup_msi_irqs NULL
+#define native_teardown_msi_irq NULL
+#endif
+#endif /* __KERNEL__ */
+
+#ifdef CONFIG_X86_64
+#include <asm/pci_64.h>
+#endif
+
+/* generic pci stuff */
+#include <asm-generic/pci.h>
+
+#ifdef CONFIG_NUMA
+/* Returns the node based on pci bus */
+static inline int __pcibus_to_node(const struct pci_bus *bus)
+{
+ const struct pci_sysdata *sd = bus->sysdata;
+
+ return sd->node;
+}
+
+static inline const struct cpumask *
+cpumask_of_pcibus(const struct pci_bus *bus)
+{
+ int node;
+
+ node = __pcibus_to_node(bus);
+ return (node == -1) ? cpu_online_mask :
+ cpumask_of_node(node);
+}
+#endif
+
+struct pci_setup_rom {
+ struct setup_data data;
+ uint16_t vendor;
+ uint16_t devid;
+ uint64_t pcilen;
+ unsigned long segment;
+ unsigned long bus;
+ unsigned long device;
+ unsigned long function;
+ uint8_t romdata[0];
+};
+
+#endif /* _ASM_X86_PCI_H */
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
new file mode 100644
index 000000000..f5411de0a
--- /dev/null
+++ b/arch/x86/include/asm/pci_64.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PCI_64_H
+#define _ASM_X86_PCI_64_H
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_CALGARY_IOMMU
+static inline void *pci_iommu(struct pci_bus *bus)
+{
+ struct pci_sysdata *sd = bus->sysdata;
+ return sd->iommu;
+}
+
+static inline void set_pci_iommu(struct pci_bus *bus, void *val)
+{
+ struct pci_sysdata *sd = bus->sysdata;
+ sd->iommu = val;
+}
+#endif /* CONFIG_CALGARY_IOMMU */
+
+extern int (*pci_config_read)(int seg, int bus, int dev, int fn,
+ int reg, int len, u32 *value);
+extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
+ int reg, int len, u32 value);
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_PCI_64_H */
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
new file mode 100644
index 000000000..959d618db
--- /dev/null
+++ b/arch/x86/include/asm/pci_x86.h
@@ -0,0 +1,216 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Low-Level PCI Access for i386 machines.
+ *
+ * (c) 1999 Martin Mares <mj@ucw.cz>
+ */
+
+#include <linux/ioport.h>
+
+#undef DEBUG
+
+#ifdef DEBUG
+#define DBG(fmt, ...) printk(fmt, ##__VA_ARGS__)
+#else
+#define DBG(fmt, ...) \
+do { \
+ if (0) \
+ printk(fmt, ##__VA_ARGS__); \
+} while (0)
+#endif
+
+#define PCI_PROBE_BIOS 0x0001
+#define PCI_PROBE_CONF1 0x0002
+#define PCI_PROBE_CONF2 0x0004
+#define PCI_PROBE_MMCONF 0x0008
+#define PCI_PROBE_MASK 0x000f
+#define PCI_PROBE_NOEARLY 0x0010
+
+#define PCI_NO_CHECKS 0x0400
+#define PCI_USE_PIRQ_MASK 0x0800
+#define PCI_ASSIGN_ROMS 0x1000
+#define PCI_BIOS_IRQ_SCAN 0x2000
+#define PCI_ASSIGN_ALL_BUSSES 0x4000
+#define PCI_CAN_SKIP_ISA_ALIGN 0x8000
+#define PCI_USE__CRS 0x10000
+#define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000
+#define PCI_HAS_IO_ECS 0x40000
+#define PCI_NOASSIGN_ROMS 0x80000
+#define PCI_ROOT_NO_CRS 0x100000
+#define PCI_NOASSIGN_BARS 0x200000
+#define PCI_BIG_ROOT_WINDOW 0x400000
+
+extern unsigned int pci_probe;
+extern unsigned long pirq_table_addr;
+
+enum pci_bf_sort_state {
+ pci_bf_sort_default,
+ pci_force_nobf,
+ pci_force_bf,
+ pci_dmi_bf,
+};
+
+/* pci-i386.c */
+
+void pcibios_resource_survey(void);
+void pcibios_set_cache_line_size(void);
+
+/* pci-pc.c */
+
+extern int pcibios_last_bus;
+extern struct pci_ops pci_root_ops;
+
+void pcibios_scan_specific_bus(int busn);
+
+/* pci-irq.c */
+
+struct irq_info {
+ u8 bus, devfn; /* Bus, device and function */
+ struct {
+ u8 link; /* IRQ line ID, chipset dependent,
+ 0 = not routed */
+ u16 bitmap; /* Available IRQs */
+ } __attribute__((packed)) irq[4];
+ u8 slot; /* Slot number, 0=onboard */
+ u8 rfu;
+} __attribute__((packed));
+
+struct irq_routing_table {
+ u32 signature; /* PIRQ_SIGNATURE should be here */
+ u16 version; /* PIRQ_VERSION */
+ u16 size; /* Table size in bytes */
+ u8 rtr_bus, rtr_devfn; /* Where the interrupt router lies */
+ u16 exclusive_irqs; /* IRQs devoted exclusively to
+ PCI usage */
+ u16 rtr_vendor, rtr_device; /* Vendor and device ID of
+ interrupt router */
+ u32 miniport_data; /* Crap */
+ u8 rfu[11];
+ u8 checksum; /* Modulo 256 checksum must give 0 */
+ struct irq_info slots[0];
+} __attribute__((packed));
+
+extern unsigned int pcibios_irq_mask;
+
+extern raw_spinlock_t pci_config_lock;
+
+extern int (*pcibios_enable_irq)(struct pci_dev *dev);
+extern void (*pcibios_disable_irq)(struct pci_dev *dev);
+
+extern bool mp_should_keep_irq(struct device *dev);
+
+struct pci_raw_ops {
+ int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn,
+ int reg, int len, u32 *val);
+ int (*write)(unsigned int domain, unsigned int bus, unsigned int devfn,
+ int reg, int len, u32 val);
+};
+
+extern const struct pci_raw_ops *raw_pci_ops;
+extern const struct pci_raw_ops *raw_pci_ext_ops;
+
+extern const struct pci_raw_ops pci_mmcfg;
+extern const struct pci_raw_ops pci_direct_conf1;
+extern bool port_cf9_safe;
+
+/* arch_initcall level */
+extern int pci_direct_probe(void);
+extern void pci_direct_init(int type);
+extern void pci_pcbios_init(void);
+extern void __init dmi_check_pciprobe(void);
+extern void __init dmi_check_skip_isa_align(void);
+
+/* some common used subsys_initcalls */
+extern int __init pci_acpi_init(void);
+extern void __init pcibios_irq_init(void);
+extern int __init pcibios_init(void);
+extern int pci_legacy_init(void);
+extern void pcibios_fixup_irqs(void);
+
+/* pci-mmconfig.c */
+
+/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
+#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
+
+struct pci_mmcfg_region {
+ struct list_head list;
+ struct resource res;
+ u64 address;
+ char __iomem *virt;
+ u16 segment;
+ u8 start_bus;
+ u8 end_bus;
+ char name[PCI_MMCFG_RESOURCE_NAME_LEN];
+};
+
+extern int __init pci_mmcfg_arch_init(void);
+extern void __init pci_mmcfg_arch_free(void);
+extern int pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg);
+extern void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg);
+extern int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
+ phys_addr_t addr);
+extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
+extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
+extern struct pci_mmcfg_region *__init pci_mmconfig_add(int segment, int start,
+ int end, u64 addr);
+
+extern struct list_head pci_mmcfg_list;
+
+#define PCI_MMCFG_BUS_OFFSET(bus) ((bus) << 20)
+
+/*
+ * On AMD Fam10h CPUs, all PCI MMIO configuration space accesses must use
+ * %eax. No other source or target registers may be used. The following
+ * mmio_config_* accessors enforce this. See "BIOS and Kernel Developer's
+ * Guide (BKDG) For AMD Family 10h Processors", rev. 3.48, sec 2.11.1,
+ * "MMIO Configuration Coding Requirements".
+ */
+static inline unsigned char mmio_config_readb(void __iomem *pos)
+{
+ u8 val;
+ asm volatile("movb (%1),%%al" : "=a" (val) : "r" (pos));
+ return val;
+}
+
+static inline unsigned short mmio_config_readw(void __iomem *pos)
+{
+ u16 val;
+ asm volatile("movw (%1),%%ax" : "=a" (val) : "r" (pos));
+ return val;
+}
+
+static inline unsigned int mmio_config_readl(void __iomem *pos)
+{
+ u32 val;
+ asm volatile("movl (%1),%%eax" : "=a" (val) : "r" (pos));
+ return val;
+}
+
+static inline void mmio_config_writeb(void __iomem *pos, u8 val)
+{
+ asm volatile("movb %%al,(%1)" : : "a" (val), "r" (pos) : "memory");
+}
+
+static inline void mmio_config_writew(void __iomem *pos, u16 val)
+{
+ asm volatile("movw %%ax,(%1)" : : "a" (val), "r" (pos) : "memory");
+}
+
+static inline void mmio_config_writel(void __iomem *pos, u32 val)
+{
+ asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory");
+}
+
+#ifdef CONFIG_PCI
+# ifdef CONFIG_ACPI
+# define x86_default_pci_init pci_acpi_init
+# else
+# define x86_default_pci_init pci_legacy_init
+# endif
+# define x86_default_pci_init_irq pcibios_irq_init
+# define x86_default_pci_fixup_irqs pcibios_fixup_irqs
+#else
+# define x86_default_pci_init NULL
+# define x86_default_pci_init_irq NULL
+# define x86_default_pci_fixup_irqs NULL
+#endif
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
new file mode 100644
index 000000000..1a19d11cf
--- /dev/null
+++ b/arch/x86/include/asm/percpu.h
@@ -0,0 +1,613 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PERCPU_H
+#define _ASM_X86_PERCPU_H
+
+#ifdef CONFIG_X86_64
+#define __percpu_seg gs
+#define __percpu_mov_op movq
+#else
+#define __percpu_seg fs
+#define __percpu_mov_op movl
+#endif
+
+#ifdef __ASSEMBLY__
+
+/*
+ * PER_CPU finds an address of a per-cpu variable.
+ *
+ * Args:
+ * var - variable name
+ * reg - 32bit register
+ *
+ * The resulting address is stored in the "reg" argument.
+ *
+ * Example:
+ * PER_CPU(cpu_gdt_descr, %ebx)
+ */
+#ifdef CONFIG_SMP
+#define PER_CPU(var, reg) \
+ __percpu_mov_op %__percpu_seg:this_cpu_off, reg; \
+ lea var(reg), reg
+#define PER_CPU_VAR(var) %__percpu_seg:var
+#else /* ! SMP */
+#define PER_CPU(var, reg) __percpu_mov_op $var, reg
+#define PER_CPU_VAR(var) var
+#endif /* SMP */
+
+#ifdef CONFIG_X86_64_SMP
+#define INIT_PER_CPU_VAR(var) init_per_cpu__##var
+#else
+#define INIT_PER_CPU_VAR(var) var
+#endif
+
+#else /* ...!ASSEMBLY */
+
+#include <linux/kernel.h>
+#include <linux/stringify.h>
+
+#ifdef CONFIG_SMP
+#define __percpu_prefix "%%"__stringify(__percpu_seg)":"
+#define __my_cpu_offset this_cpu_read(this_cpu_off)
+
+/*
+ * Compared to the generic __my_cpu_offset version, the following
+ * saves one instruction and avoids clobbering a temp register.
+ */
+#define arch_raw_cpu_ptr(ptr) \
+({ \
+ unsigned long tcp_ptr__; \
+ asm volatile("add " __percpu_arg(1) ", %0" \
+ : "=r" (tcp_ptr__) \
+ : "m" (this_cpu_off), "0" (ptr)); \
+ (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \
+})
+#else
+#define __percpu_prefix ""
+#endif
+
+#define __percpu_arg(x) __percpu_prefix "%" #x
+
+/*
+ * Initialized pointers to per-cpu variables needed for the boot
+ * processor need to use these macros to get the proper address
+ * offset from __per_cpu_load on SMP.
+ *
+ * There also must be an entry in vmlinux_64.lds.S
+ */
+#define DECLARE_INIT_PER_CPU(var) \
+ extern typeof(var) init_per_cpu_var(var)
+
+#ifdef CONFIG_X86_64_SMP
+#define init_per_cpu_var(var) init_per_cpu__##var
+#else
+#define init_per_cpu_var(var) var
+#endif
+
+/* For arch-specific code, we can use direct single-insn ops (they
+ * don't give an lvalue though). */
+extern void __bad_percpu_size(void);
+
+#define percpu_to_op(op, var, val) \
+do { \
+ typedef typeof(var) pto_T__; \
+ if (0) { \
+ pto_T__ pto_tmp__; \
+ pto_tmp__ = (val); \
+ (void)pto_tmp__; \
+ } \
+ switch (sizeof(var)) { \
+ case 1: \
+ asm(op "b %1,"__percpu_arg(0) \
+ : "+m" (var) \
+ : "qi" ((pto_T__)(val))); \
+ break; \
+ case 2: \
+ asm(op "w %1,"__percpu_arg(0) \
+ : "+m" (var) \
+ : "ri" ((pto_T__)(val))); \
+ break; \
+ case 4: \
+ asm(op "l %1,"__percpu_arg(0) \
+ : "+m" (var) \
+ : "ri" ((pto_T__)(val))); \
+ break; \
+ case 8: \
+ asm(op "q %1,"__percpu_arg(0) \
+ : "+m" (var) \
+ : "re" ((pto_T__)(val))); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+} while (0)
+
+/*
+ * Generate a percpu add to memory instruction and optimize code
+ * if one is added or subtracted.
+ */
+#define percpu_add_op(var, val) \
+do { \
+ typedef typeof(var) pao_T__; \
+ const int pao_ID__ = (__builtin_constant_p(val) && \
+ ((val) == 1 || (val) == -1)) ? \
+ (int)(val) : 0; \
+ if (0) { \
+ pao_T__ pao_tmp__; \
+ pao_tmp__ = (val); \
+ (void)pao_tmp__; \
+ } \
+ switch (sizeof(var)) { \
+ case 1: \
+ if (pao_ID__ == 1) \
+ asm("incb "__percpu_arg(0) : "+m" (var)); \
+ else if (pao_ID__ == -1) \
+ asm("decb "__percpu_arg(0) : "+m" (var)); \
+ else \
+ asm("addb %1, "__percpu_arg(0) \
+ : "+m" (var) \
+ : "qi" ((pao_T__)(val))); \
+ break; \
+ case 2: \
+ if (pao_ID__ == 1) \
+ asm("incw "__percpu_arg(0) : "+m" (var)); \
+ else if (pao_ID__ == -1) \
+ asm("decw "__percpu_arg(0) : "+m" (var)); \
+ else \
+ asm("addw %1, "__percpu_arg(0) \
+ : "+m" (var) \
+ : "ri" ((pao_T__)(val))); \
+ break; \
+ case 4: \
+ if (pao_ID__ == 1) \
+ asm("incl "__percpu_arg(0) : "+m" (var)); \
+ else if (pao_ID__ == -1) \
+ asm("decl "__percpu_arg(0) : "+m" (var)); \
+ else \
+ asm("addl %1, "__percpu_arg(0) \
+ : "+m" (var) \
+ : "ri" ((pao_T__)(val))); \
+ break; \
+ case 8: \
+ if (pao_ID__ == 1) \
+ asm("incq "__percpu_arg(0) : "+m" (var)); \
+ else if (pao_ID__ == -1) \
+ asm("decq "__percpu_arg(0) : "+m" (var)); \
+ else \
+ asm("addq %1, "__percpu_arg(0) \
+ : "+m" (var) \
+ : "re" ((pao_T__)(val))); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+} while (0)
+
+#define percpu_from_op(op, var) \
+({ \
+ typeof(var) pfo_ret__; \
+ switch (sizeof(var)) { \
+ case 1: \
+ asm volatile(op "b "__percpu_arg(1)",%0"\
+ : "=q" (pfo_ret__) \
+ : "m" (var)); \
+ break; \
+ case 2: \
+ asm volatile(op "w "__percpu_arg(1)",%0"\
+ : "=r" (pfo_ret__) \
+ : "m" (var)); \
+ break; \
+ case 4: \
+ asm volatile(op "l "__percpu_arg(1)",%0"\
+ : "=r" (pfo_ret__) \
+ : "m" (var)); \
+ break; \
+ case 8: \
+ asm volatile(op "q "__percpu_arg(1)",%0"\
+ : "=r" (pfo_ret__) \
+ : "m" (var)); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+ pfo_ret__; \
+})
+
+#define percpu_stable_op(op, var) \
+({ \
+ typeof(var) pfo_ret__; \
+ switch (sizeof(var)) { \
+ case 1: \
+ asm(op "b "__percpu_arg(P1)",%0" \
+ : "=q" (pfo_ret__) \
+ : "p" (&(var))); \
+ break; \
+ case 2: \
+ asm(op "w "__percpu_arg(P1)",%0" \
+ : "=r" (pfo_ret__) \
+ : "p" (&(var))); \
+ break; \
+ case 4: \
+ asm(op "l "__percpu_arg(P1)",%0" \
+ : "=r" (pfo_ret__) \
+ : "p" (&(var))); \
+ break; \
+ case 8: \
+ asm(op "q "__percpu_arg(P1)",%0" \
+ : "=r" (pfo_ret__) \
+ : "p" (&(var))); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+ pfo_ret__; \
+})
+
+#define percpu_unary_op(op, var) \
+({ \
+ switch (sizeof(var)) { \
+ case 1: \
+ asm(op "b "__percpu_arg(0) \
+ : "+m" (var)); \
+ break; \
+ case 2: \
+ asm(op "w "__percpu_arg(0) \
+ : "+m" (var)); \
+ break; \
+ case 4: \
+ asm(op "l "__percpu_arg(0) \
+ : "+m" (var)); \
+ break; \
+ case 8: \
+ asm(op "q "__percpu_arg(0) \
+ : "+m" (var)); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+})
+
+/*
+ * Add return operation
+ */
+#define percpu_add_return_op(var, val) \
+({ \
+ typeof(var) paro_ret__ = val; \
+ switch (sizeof(var)) { \
+ case 1: \
+ asm("xaddb %0, "__percpu_arg(1) \
+ : "+q" (paro_ret__), "+m" (var) \
+ : : "memory"); \
+ break; \
+ case 2: \
+ asm("xaddw %0, "__percpu_arg(1) \
+ : "+r" (paro_ret__), "+m" (var) \
+ : : "memory"); \
+ break; \
+ case 4: \
+ asm("xaddl %0, "__percpu_arg(1) \
+ : "+r" (paro_ret__), "+m" (var) \
+ : : "memory"); \
+ break; \
+ case 8: \
+ asm("xaddq %0, "__percpu_arg(1) \
+ : "+re" (paro_ret__), "+m" (var) \
+ : : "memory"); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+ paro_ret__ += val; \
+ paro_ret__; \
+})
+
+/*
+ * xchg is implemented using cmpxchg without a lock prefix. xchg is
+ * expensive due to the implied lock prefix. The processor cannot prefetch
+ * cachelines if xchg is used.
+ */
+#define percpu_xchg_op(var, nval) \
+({ \
+ typeof(var) pxo_ret__; \
+ typeof(var) pxo_new__ = (nval); \
+ switch (sizeof(var)) { \
+ case 1: \
+ asm("\n\tmov "__percpu_arg(1)",%%al" \
+ "\n1:\tcmpxchgb %2, "__percpu_arg(1) \
+ "\n\tjnz 1b" \
+ : "=&a" (pxo_ret__), "+m" (var) \
+ : "q" (pxo_new__) \
+ : "memory"); \
+ break; \
+ case 2: \
+ asm("\n\tmov "__percpu_arg(1)",%%ax" \
+ "\n1:\tcmpxchgw %2, "__percpu_arg(1) \
+ "\n\tjnz 1b" \
+ : "=&a" (pxo_ret__), "+m" (var) \
+ : "r" (pxo_new__) \
+ : "memory"); \
+ break; \
+ case 4: \
+ asm("\n\tmov "__percpu_arg(1)",%%eax" \
+ "\n1:\tcmpxchgl %2, "__percpu_arg(1) \
+ "\n\tjnz 1b" \
+ : "=&a" (pxo_ret__), "+m" (var) \
+ : "r" (pxo_new__) \
+ : "memory"); \
+ break; \
+ case 8: \
+ asm("\n\tmov "__percpu_arg(1)",%%rax" \
+ "\n1:\tcmpxchgq %2, "__percpu_arg(1) \
+ "\n\tjnz 1b" \
+ : "=&a" (pxo_ret__), "+m" (var) \
+ : "r" (pxo_new__) \
+ : "memory"); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+ pxo_ret__; \
+})
+
+/*
+ * cmpxchg has no such implied lock semantics as a result it is much
+ * more efficient for cpu local operations.
+ */
+#define percpu_cmpxchg_op(var, oval, nval) \
+({ \
+ typeof(var) pco_ret__; \
+ typeof(var) pco_old__ = (oval); \
+ typeof(var) pco_new__ = (nval); \
+ switch (sizeof(var)) { \
+ case 1: \
+ asm("cmpxchgb %2, "__percpu_arg(1) \
+ : "=a" (pco_ret__), "+m" (var) \
+ : "q" (pco_new__), "0" (pco_old__) \
+ : "memory"); \
+ break; \
+ case 2: \
+ asm("cmpxchgw %2, "__percpu_arg(1) \
+ : "=a" (pco_ret__), "+m" (var) \
+ : "r" (pco_new__), "0" (pco_old__) \
+ : "memory"); \
+ break; \
+ case 4: \
+ asm("cmpxchgl %2, "__percpu_arg(1) \
+ : "=a" (pco_ret__), "+m" (var) \
+ : "r" (pco_new__), "0" (pco_old__) \
+ : "memory"); \
+ break; \
+ case 8: \
+ asm("cmpxchgq %2, "__percpu_arg(1) \
+ : "=a" (pco_ret__), "+m" (var) \
+ : "r" (pco_new__), "0" (pco_old__) \
+ : "memory"); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+ pco_ret__; \
+})
+
+/*
+ * this_cpu_read() makes gcc load the percpu variable every time it is
+ * accessed while this_cpu_read_stable() allows the value to be cached.
+ * this_cpu_read_stable() is more efficient and can be used if its value
+ * is guaranteed to be valid across cpus. The current users include
+ * get_current() and get_thread_info() both of which are actually
+ * per-thread variables implemented as per-cpu variables and thus
+ * stable for the duration of the respective task.
+ */
+#define this_cpu_read_stable(var) percpu_stable_op("mov", var)
+
+#define raw_cpu_read_1(pcp) percpu_from_op("mov", pcp)
+#define raw_cpu_read_2(pcp) percpu_from_op("mov", pcp)
+#define raw_cpu_read_4(pcp) percpu_from_op("mov", pcp)
+
+#define raw_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val)
+#define raw_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val)
+#define raw_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val)
+#define raw_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
+#define raw_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
+#define raw_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
+#define raw_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
+#define raw_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
+#define raw_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
+#define raw_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val)
+#define raw_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val)
+#define raw_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val)
+#define raw_cpu_xchg_1(pcp, val) percpu_xchg_op(pcp, val)
+#define raw_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val)
+#define raw_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val)
+
+#define this_cpu_read_1(pcp) percpu_from_op("mov", pcp)
+#define this_cpu_read_2(pcp) percpu_from_op("mov", pcp)
+#define this_cpu_read_4(pcp) percpu_from_op("mov", pcp)
+#define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val)
+#define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val)
+#define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val)
+#define this_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
+#define this_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
+#define this_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
+#define this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
+#define this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
+#define this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
+#define this_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val)
+#define this_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val)
+#define this_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val)
+#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
+
+#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
+#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
+#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
+#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+
+#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
+#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
+#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
+#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+
+#ifdef CONFIG_X86_CMPXCHG64
+#define percpu_cmpxchg8b_double(pcp1, pcp2, o1, o2, n1, n2) \
+({ \
+ bool __ret; \
+ typeof(pcp1) __o1 = (o1), __n1 = (n1); \
+ typeof(pcp2) __o2 = (o2), __n2 = (n2); \
+ asm volatile("cmpxchg8b "__percpu_arg(1) \
+ CC_SET(z) \
+ : CC_OUT(z) (__ret), "+m" (pcp1), "+m" (pcp2), "+a" (__o1), "+d" (__o2) \
+ : "b" (__n1), "c" (__n2)); \
+ __ret; \
+})
+
+#define raw_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double
+#define this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double
+#endif /* CONFIG_X86_CMPXCHG64 */
+
+/*
+ * Per cpu atomic 64 bit operations are only available under 64 bit.
+ * 32 bit must fall back to generic operations.
+ */
+#ifdef CONFIG_X86_64
+#define raw_cpu_read_8(pcp) percpu_from_op("mov", pcp)
+#define raw_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
+#define raw_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
+#define raw_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
+#define raw_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
+#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
+#define raw_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
+#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+
+#define this_cpu_read_8(pcp) percpu_from_op("mov", pcp)
+#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
+#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
+#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
+#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
+#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
+#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
+#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+
+/*
+ * Pretty complex macro to generate cmpxchg16 instruction. The instruction
+ * is not supported on early AMD64 processors so we must be able to emulate
+ * it in software. The address used in the cmpxchg16 instruction must be
+ * aligned to a 16 byte boundary.
+ */
+#define percpu_cmpxchg16b_double(pcp1, pcp2, o1, o2, n1, n2) \
+({ \
+ bool __ret; \
+ typeof(pcp1) __o1 = (o1), __n1 = (n1); \
+ typeof(pcp2) __o2 = (o2), __n2 = (n2); \
+ alternative_io("leaq %P1,%%rsi\n\tcall this_cpu_cmpxchg16b_emu\n\t", \
+ "cmpxchg16b " __percpu_arg(1) "\n\tsetz %0\n\t", \
+ X86_FEATURE_CX16, \
+ ASM_OUTPUT2("=a" (__ret), "+m" (pcp1), \
+ "+m" (pcp2), "+d" (__o2)), \
+ "b" (__n1), "c" (__n2), "a" (__o1) : "rsi"); \
+ __ret; \
+})
+
+#define raw_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double
+#define this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double
+
+#endif
+
+static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
+ const unsigned long __percpu *addr)
+{
+ unsigned long __percpu *a =
+ (unsigned long __percpu *)addr + nr / BITS_PER_LONG;
+
+#ifdef CONFIG_X86_64
+ return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0;
+#else
+ return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_4(*a)) != 0;
+#endif
+}
+
+static inline bool x86_this_cpu_variable_test_bit(int nr,
+ const unsigned long __percpu *addr)
+{
+ bool oldbit;
+
+ asm volatile("btl "__percpu_arg(2)",%1"
+ CC_SET(c)
+ : CC_OUT(c) (oldbit)
+ : "m" (*(unsigned long __percpu *)addr), "Ir" (nr));
+
+ return oldbit;
+}
+
+#define x86_this_cpu_test_bit(nr, addr) \
+ (__builtin_constant_p((nr)) \
+ ? x86_this_cpu_constant_test_bit((nr), (addr)) \
+ : x86_this_cpu_variable_test_bit((nr), (addr)))
+
+
+#include <asm-generic/percpu.h>
+
+/* We can use this directly for local CPU (faster). */
+DECLARE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off);
+
+#endif /* !__ASSEMBLY__ */
+
+#ifdef CONFIG_SMP
+
+/*
+ * Define the "EARLY_PER_CPU" macros. These are used for some per_cpu
+ * variables that are initialized and accessed before there are per_cpu
+ * areas allocated.
+ */
+
+#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
+ DEFINE_PER_CPU(_type, _name) = _initvalue; \
+ __typeof__(_type) _name##_early_map[NR_CPUS] __initdata = \
+ { [0 ... NR_CPUS-1] = _initvalue }; \
+ __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map
+
+#define DEFINE_EARLY_PER_CPU_READ_MOSTLY(_type, _name, _initvalue) \
+ DEFINE_PER_CPU_READ_MOSTLY(_type, _name) = _initvalue; \
+ __typeof__(_type) _name##_early_map[NR_CPUS] __initdata = \
+ { [0 ... NR_CPUS-1] = _initvalue }; \
+ __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map
+
+#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
+ EXPORT_PER_CPU_SYMBOL(_name)
+
+#define DECLARE_EARLY_PER_CPU(_type, _name) \
+ DECLARE_PER_CPU(_type, _name); \
+ extern __typeof__(_type) *_name##_early_ptr; \
+ extern __typeof__(_type) _name##_early_map[]
+
+#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \
+ DECLARE_PER_CPU_READ_MOSTLY(_type, _name); \
+ extern __typeof__(_type) *_name##_early_ptr; \
+ extern __typeof__(_type) _name##_early_map[]
+
+#define early_per_cpu_ptr(_name) (_name##_early_ptr)
+#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
+#define early_per_cpu(_name, _cpu) \
+ *(early_per_cpu_ptr(_name) ? \
+ &early_per_cpu_ptr(_name)[_cpu] : \
+ &per_cpu(_name, _cpu))
+
+#else /* !CONFIG_SMP */
+#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
+ DEFINE_PER_CPU(_type, _name) = _initvalue
+
+#define DEFINE_EARLY_PER_CPU_READ_MOSTLY(_type, _name, _initvalue) \
+ DEFINE_PER_CPU_READ_MOSTLY(_type, _name) = _initvalue
+
+#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
+ EXPORT_PER_CPU_SYMBOL(_name)
+
+#define DECLARE_EARLY_PER_CPU(_type, _name) \
+ DECLARE_PER_CPU(_type, _name)
+
+#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \
+ DECLARE_PER_CPU_READ_MOSTLY(_type, _name)
+
+#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu)
+#define early_per_cpu_ptr(_name) NULL
+/* no early_per_cpu_map() */
+
+#endif /* !CONFIG_SMP */
+
+#endif /* _ASM_X86_PERCPU_H */
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
new file mode 100644
index 000000000..f6c4915a8
--- /dev/null
+++ b/arch/x86/include/asm/perf_event.h
@@ -0,0 +1,315 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PERF_EVENT_H
+#define _ASM_X86_PERF_EVENT_H
+
+/*
+ * Performance event hw details:
+ */
+
+#define INTEL_PMC_MAX_GENERIC 32
+#define INTEL_PMC_MAX_FIXED 3
+#define INTEL_PMC_IDX_FIXED 32
+
+#define X86_PMC_IDX_MAX 64
+
+#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
+#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
+
+#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
+#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
+
+#define ARCH_PERFMON_EVENTSEL_EVENT 0x000000FFULL
+#define ARCH_PERFMON_EVENTSEL_UMASK 0x0000FF00ULL
+#define ARCH_PERFMON_EVENTSEL_USR (1ULL << 16)
+#define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17)
+#define ARCH_PERFMON_EVENTSEL_EDGE (1ULL << 18)
+#define ARCH_PERFMON_EVENTSEL_PIN_CONTROL (1ULL << 19)
+#define ARCH_PERFMON_EVENTSEL_INT (1ULL << 20)
+#define ARCH_PERFMON_EVENTSEL_ANY (1ULL << 21)
+#define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22)
+#define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23)
+#define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL
+
+#define HSW_IN_TX (1ULL << 32)
+#define HSW_IN_TX_CHECKPOINTED (1ULL << 33)
+
+#define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36)
+#define AMD64_EVENTSEL_GUESTONLY (1ULL << 40)
+#define AMD64_EVENTSEL_HOSTONLY (1ULL << 41)
+
+#define AMD64_EVENTSEL_INT_CORE_SEL_SHIFT 37
+#define AMD64_EVENTSEL_INT_CORE_SEL_MASK \
+ (0xFULL << AMD64_EVENTSEL_INT_CORE_SEL_SHIFT)
+
+#define AMD64_EVENTSEL_EVENT \
+ (ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32))
+#define INTEL_ARCH_EVENT_MASK \
+ (ARCH_PERFMON_EVENTSEL_UMASK | ARCH_PERFMON_EVENTSEL_EVENT)
+
+#define AMD64_L3_SLICE_SHIFT 48
+#define AMD64_L3_SLICE_MASK \
+ ((0xFULL) << AMD64_L3_SLICE_SHIFT)
+
+#define AMD64_L3_THREAD_SHIFT 56
+#define AMD64_L3_THREAD_MASK \
+ ((0xFFULL) << AMD64_L3_THREAD_SHIFT)
+
+#define X86_RAW_EVENT_MASK \
+ (ARCH_PERFMON_EVENTSEL_EVENT | \
+ ARCH_PERFMON_EVENTSEL_UMASK | \
+ ARCH_PERFMON_EVENTSEL_EDGE | \
+ ARCH_PERFMON_EVENTSEL_INV | \
+ ARCH_PERFMON_EVENTSEL_CMASK)
+#define X86_ALL_EVENT_FLAGS \
+ (ARCH_PERFMON_EVENTSEL_EDGE | \
+ ARCH_PERFMON_EVENTSEL_INV | \
+ ARCH_PERFMON_EVENTSEL_CMASK | \
+ ARCH_PERFMON_EVENTSEL_ANY | \
+ ARCH_PERFMON_EVENTSEL_PIN_CONTROL | \
+ HSW_IN_TX | \
+ HSW_IN_TX_CHECKPOINTED)
+#define AMD64_RAW_EVENT_MASK \
+ (X86_RAW_EVENT_MASK | \
+ AMD64_EVENTSEL_EVENT)
+#define AMD64_RAW_EVENT_MASK_NB \
+ (AMD64_EVENTSEL_EVENT | \
+ ARCH_PERFMON_EVENTSEL_UMASK)
+#define AMD64_NUM_COUNTERS 4
+#define AMD64_NUM_COUNTERS_CORE 6
+#define AMD64_NUM_COUNTERS_NB 4
+
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
+ (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+
+#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
+#define ARCH_PERFMON_EVENTS_COUNT 7
+
+/*
+ * Intel "Architectural Performance Monitoring" CPUID
+ * detection/enumeration details:
+ */
+union cpuid10_eax {
+ struct {
+ unsigned int version_id:8;
+ unsigned int num_counters:8;
+ unsigned int bit_width:8;
+ unsigned int mask_length:8;
+ } split;
+ unsigned int full;
+};
+
+union cpuid10_ebx {
+ struct {
+ unsigned int no_unhalted_core_cycles:1;
+ unsigned int no_instructions_retired:1;
+ unsigned int no_unhalted_reference_cycles:1;
+ unsigned int no_llc_reference:1;
+ unsigned int no_llc_misses:1;
+ unsigned int no_branch_instruction_retired:1;
+ unsigned int no_branch_misses_retired:1;
+ } split;
+ unsigned int full;
+};
+
+union cpuid10_edx {
+ struct {
+ unsigned int num_counters_fixed:5;
+ unsigned int bit_width_fixed:8;
+ unsigned int reserved:19;
+ } split;
+ unsigned int full;
+};
+
+struct x86_pmu_capability {
+ int version;
+ int num_counters_gp;
+ int num_counters_fixed;
+ int bit_width_gp;
+ int bit_width_fixed;
+ unsigned int events_mask;
+ int events_mask_len;
+};
+
+/*
+ * Fixed-purpose performance events:
+ */
+
+/*
+ * All 3 fixed-mode PMCs are configured via this single MSR:
+ */
+#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
+
+/*
+ * The counts are available in three separate MSRs:
+ */
+
+/* Instr_Retired.Any: */
+#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
+#define INTEL_PMC_IDX_FIXED_INSTRUCTIONS (INTEL_PMC_IDX_FIXED + 0)
+
+/* CPU_CLK_Unhalted.Core: */
+#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
+#define INTEL_PMC_IDX_FIXED_CPU_CYCLES (INTEL_PMC_IDX_FIXED + 1)
+
+/* CPU_CLK_Unhalted.Ref: */
+#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
+#define INTEL_PMC_IDX_FIXED_REF_CYCLES (INTEL_PMC_IDX_FIXED + 2)
+#define INTEL_PMC_MSK_FIXED_REF_CYCLES (1ULL << INTEL_PMC_IDX_FIXED_REF_CYCLES)
+
+/*
+ * We model BTS tracing as another fixed-mode PMC.
+ *
+ * We choose a value in the middle of the fixed event range, since lower
+ * values are used by actual fixed events and higher values are used
+ * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
+ */
+#define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 16)
+
+#define GLOBAL_STATUS_COND_CHG BIT_ULL(63)
+#define GLOBAL_STATUS_BUFFER_OVF BIT_ULL(62)
+#define GLOBAL_STATUS_UNC_OVF BIT_ULL(61)
+#define GLOBAL_STATUS_ASIF BIT_ULL(60)
+#define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59)
+#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(58)
+#define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(55)
+
+/*
+ * IBS cpuid feature detection
+ */
+
+#define IBS_CPUID_FEATURES 0x8000001b
+
+/*
+ * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but
+ * bit 0 is used to indicate the existence of IBS.
+ */
+#define IBS_CAPS_AVAIL (1U<<0)
+#define IBS_CAPS_FETCHSAM (1U<<1)
+#define IBS_CAPS_OPSAM (1U<<2)
+#define IBS_CAPS_RDWROPCNT (1U<<3)
+#define IBS_CAPS_OPCNT (1U<<4)
+#define IBS_CAPS_BRNTRGT (1U<<5)
+#define IBS_CAPS_OPCNTEXT (1U<<6)
+#define IBS_CAPS_RIPINVALIDCHK (1U<<7)
+#define IBS_CAPS_OPBRNFUSE (1U<<8)
+#define IBS_CAPS_FETCHCTLEXTD (1U<<9)
+#define IBS_CAPS_OPDATA4 (1U<<10)
+
+#define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \
+ | IBS_CAPS_FETCHSAM \
+ | IBS_CAPS_OPSAM)
+
+/*
+ * IBS APIC setup
+ */
+#define IBSCTL 0x1cc
+#define IBSCTL_LVT_OFFSET_VALID (1ULL<<8)
+#define IBSCTL_LVT_OFFSET_MASK 0x0F
+
+/* IBS fetch bits/masks */
+#define IBS_FETCH_RAND_EN (1ULL<<57)
+#define IBS_FETCH_VAL (1ULL<<49)
+#define IBS_FETCH_ENABLE (1ULL<<48)
+#define IBS_FETCH_CNT 0xFFFF0000ULL
+#define IBS_FETCH_MAX_CNT 0x0000FFFFULL
+
+/*
+ * IBS op bits/masks
+ * The lower 7 bits of the current count are random bits
+ * preloaded by hardware and ignored in software
+ */
+#define IBS_OP_CUR_CNT (0xFFF80ULL<<32)
+#define IBS_OP_CUR_CNT_RAND (0x0007FULL<<32)
+#define IBS_OP_CNT_CTL (1ULL<<19)
+#define IBS_OP_VAL (1ULL<<18)
+#define IBS_OP_ENABLE (1ULL<<17)
+#define IBS_OP_MAX_CNT 0x0000FFFFULL
+#define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */
+#define IBS_RIP_INVALID (1ULL<<38)
+
+#ifdef CONFIG_X86_LOCAL_APIC
+extern u32 get_ibs_caps(void);
+#else
+static inline u32 get_ibs_caps(void) { return 0; }
+#endif
+
+#ifdef CONFIG_PERF_EVENTS
+extern void perf_events_lapic_init(void);
+
+/*
+ * Abuse bits {3,5} of the cpu eflags register. These flags are otherwise
+ * unused and ABI specified to be 0, so nobody should care what we do with
+ * them.
+ *
+ * EXACT - the IP points to the exact instruction that triggered the
+ * event (HW bugs exempt).
+ * VM - original X86_VM_MASK; see set_linear_ip().
+ */
+#define PERF_EFLAGS_EXACT (1UL << 3)
+#define PERF_EFLAGS_VM (1UL << 5)
+
+struct pt_regs;
+extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+extern unsigned long perf_misc_flags(struct pt_regs *regs);
+#define perf_misc_flags(regs) perf_misc_flags(regs)
+
+#include <asm/stacktrace.h>
+
+/*
+ * We abuse bit 3 from flags to pass exact information, see perf_misc_flags
+ * and the comment with PERF_EFLAGS_EXACT.
+ */
+#define perf_arch_fetch_caller_regs(regs, __ip) { \
+ (regs)->ip = (__ip); \
+ (regs)->bp = caller_frame_pointer(); \
+ (regs)->cs = __KERNEL_CS; \
+ regs->flags = 0; \
+ asm volatile( \
+ _ASM_MOV "%%"_ASM_SP ", %0\n" \
+ : "=m" ((regs)->sp) \
+ :: "memory" \
+ ); \
+}
+
+struct perf_guest_switch_msr {
+ unsigned msr;
+ u64 host, guest;
+};
+
+extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
+extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
+extern void perf_check_microcode(void);
+#else
+static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
+{
+ *nr = 0;
+ return NULL;
+}
+
+static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
+{
+ memset(cap, 0, sizeof(*cap));
+}
+
+static inline void perf_events_lapic_init(void) { }
+static inline void perf_check_microcode(void) { }
+#endif
+
+#ifdef CONFIG_CPU_SUP_INTEL
+ extern void intel_pt_handle_vmx(int on);
+#endif
+
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
+ extern void amd_pmu_enable_virt(void);
+ extern void amd_pmu_disable_virt(void);
+#else
+ static inline void amd_pmu_enable_virt(void) { }
+ static inline void amd_pmu_disable_virt(void) { }
+#endif
+
+#define arch_perf_out_copy_user copy_from_user_nmi
+
+#endif /* _ASM_X86_PERF_EVENT_H */
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
new file mode 100644
index 000000000..94de1a05a
--- /dev/null
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -0,0 +1,877 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Netburst Performance Events (P4, old Xeon)
+ */
+
+#ifndef PERF_EVENT_P4_H
+#define PERF_EVENT_P4_H
+
+#include <linux/cpu.h>
+#include <linux/bitops.h>
+
+/*
+ * NetBurst has performance MSRs shared between
+ * threads if HT is turned on, ie for both logical
+ * processors (mem: in turn in Atom with HT support
+ * perf-MSRs are not shared and every thread has its
+ * own perf-MSRs set)
+ */
+#define ARCH_P4_TOTAL_ESCR (46)
+#define ARCH_P4_RESERVED_ESCR (2) /* IQ_ESCR(0,1) not always present */
+#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
+#define ARCH_P4_MAX_CCCR (18)
+
+#define ARCH_P4_CNTRVAL_BITS (40)
+#define ARCH_P4_CNTRVAL_MASK ((1ULL << ARCH_P4_CNTRVAL_BITS) - 1)
+#define ARCH_P4_UNFLAGGED_BIT ((1ULL) << (ARCH_P4_CNTRVAL_BITS - 1))
+
+#define P4_ESCR_EVENT_MASK 0x7e000000ULL
+#define P4_ESCR_EVENT_SHIFT 25
+#define P4_ESCR_EVENTMASK_MASK 0x01fffe00ULL
+#define P4_ESCR_EVENTMASK_SHIFT 9
+#define P4_ESCR_TAG_MASK 0x000001e0ULL
+#define P4_ESCR_TAG_SHIFT 5
+#define P4_ESCR_TAG_ENABLE 0x00000010ULL
+#define P4_ESCR_T0_OS 0x00000008ULL
+#define P4_ESCR_T0_USR 0x00000004ULL
+#define P4_ESCR_T1_OS 0x00000002ULL
+#define P4_ESCR_T1_USR 0x00000001ULL
+
+#define P4_ESCR_EVENT(v) ((v) << P4_ESCR_EVENT_SHIFT)
+#define P4_ESCR_EMASK(v) ((v) << P4_ESCR_EVENTMASK_SHIFT)
+#define P4_ESCR_TAG(v) ((v) << P4_ESCR_TAG_SHIFT)
+
+#define P4_CCCR_OVF 0x80000000ULL
+#define P4_CCCR_CASCADE 0x40000000ULL
+#define P4_CCCR_OVF_PMI_T0 0x04000000ULL
+#define P4_CCCR_OVF_PMI_T1 0x08000000ULL
+#define P4_CCCR_FORCE_OVF 0x02000000ULL
+#define P4_CCCR_EDGE 0x01000000ULL
+#define P4_CCCR_THRESHOLD_MASK 0x00f00000ULL
+#define P4_CCCR_THRESHOLD_SHIFT 20
+#define P4_CCCR_COMPLEMENT 0x00080000ULL
+#define P4_CCCR_COMPARE 0x00040000ULL
+#define P4_CCCR_ESCR_SELECT_MASK 0x0000e000ULL
+#define P4_CCCR_ESCR_SELECT_SHIFT 13
+#define P4_CCCR_ENABLE 0x00001000ULL
+#define P4_CCCR_THREAD_SINGLE 0x00010000ULL
+#define P4_CCCR_THREAD_BOTH 0x00020000ULL
+#define P4_CCCR_THREAD_ANY 0x00030000ULL
+#define P4_CCCR_RESERVED 0x00000fffULL
+
+#define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT)
+#define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT)
+
+#define P4_GEN_ESCR_EMASK(class, name, bit) \
+ class##__##name = ((1ULL << bit) << P4_ESCR_EVENTMASK_SHIFT)
+#define P4_ESCR_EMASK_BIT(class, name) class##__##name
+
+/*
+ * config field is 64bit width and consists of
+ * HT << 63 | ESCR << 32 | CCCR
+ * where HT is HyperThreading bit (since ESCR
+ * has it reserved we may use it for own purpose)
+ *
+ * note that this is NOT the addresses of respective
+ * ESCR and CCCR but rather an only packed value should
+ * be unpacked and written to a proper addresses
+ *
+ * the base idea is to pack as much info as possible
+ */
+#define p4_config_pack_escr(v) (((u64)(v)) << 32)
+#define p4_config_pack_cccr(v) (((u64)(v)) & 0xffffffffULL)
+#define p4_config_unpack_escr(v) (((u64)(v)) >> 32)
+#define p4_config_unpack_cccr(v) (((u64)(v)) & 0xffffffffULL)
+
+#define p4_config_unpack_emask(v) \
+ ({ \
+ u32 t = p4_config_unpack_escr((v)); \
+ t = t & P4_ESCR_EVENTMASK_MASK; \
+ t = t >> P4_ESCR_EVENTMASK_SHIFT; \
+ t; \
+ })
+
+#define p4_config_unpack_event(v) \
+ ({ \
+ u32 t = p4_config_unpack_escr((v)); \
+ t = t & P4_ESCR_EVENT_MASK; \
+ t = t >> P4_ESCR_EVENT_SHIFT; \
+ t; \
+ })
+
+#define P4_CONFIG_HT_SHIFT 63
+#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT)
+
+/*
+ * If an event has alias it should be marked
+ * with a special bit. (Don't forget to check
+ * P4_PEBS_CONFIG_MASK and related bits on
+ * modification.)
+ */
+#define P4_CONFIG_ALIASABLE (1ULL << 9)
+
+/*
+ * The bits we allow to pass for RAW events
+ */
+#define P4_CONFIG_MASK_ESCR \
+ P4_ESCR_EVENT_MASK | \
+ P4_ESCR_EVENTMASK_MASK | \
+ P4_ESCR_TAG_MASK | \
+ P4_ESCR_TAG_ENABLE
+
+#define P4_CONFIG_MASK_CCCR \
+ P4_CCCR_EDGE | \
+ P4_CCCR_THRESHOLD_MASK | \
+ P4_CCCR_COMPLEMENT | \
+ P4_CCCR_COMPARE | \
+ P4_CCCR_THREAD_ANY | \
+ P4_CCCR_RESERVED
+
+/* some dangerous bits are reserved for kernel internals */
+#define P4_CONFIG_MASK \
+ (p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \
+ (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR))
+
+/*
+ * In case of event aliasing we need to preserve some
+ * caller bits, otherwise the mapping won't be complete.
+ */
+#define P4_CONFIG_EVENT_ALIAS_MASK \
+ (p4_config_pack_escr(P4_CONFIG_MASK_ESCR) | \
+ p4_config_pack_cccr(P4_CCCR_EDGE | \
+ P4_CCCR_THRESHOLD_MASK | \
+ P4_CCCR_COMPLEMENT | \
+ P4_CCCR_COMPARE))
+
+#define P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS \
+ ((P4_CONFIG_HT) | \
+ p4_config_pack_escr(P4_ESCR_T0_OS | \
+ P4_ESCR_T0_USR | \
+ P4_ESCR_T1_OS | \
+ P4_ESCR_T1_USR) | \
+ p4_config_pack_cccr(P4_CCCR_OVF | \
+ P4_CCCR_CASCADE | \
+ P4_CCCR_FORCE_OVF | \
+ P4_CCCR_THREAD_ANY | \
+ P4_CCCR_OVF_PMI_T0 | \
+ P4_CCCR_OVF_PMI_T1 | \
+ P4_CONFIG_ALIASABLE))
+
+static inline bool p4_is_event_cascaded(u64 config)
+{
+ u32 cccr = p4_config_unpack_cccr(config);
+ return !!(cccr & P4_CCCR_CASCADE);
+}
+
+static inline int p4_ht_config_thread(u64 config)
+{
+ return !!(config & P4_CONFIG_HT);
+}
+
+static inline u64 p4_set_ht_bit(u64 config)
+{
+ return config | P4_CONFIG_HT;
+}
+
+static inline u64 p4_clear_ht_bit(u64 config)
+{
+ return config & ~P4_CONFIG_HT;
+}
+
+static inline int p4_ht_active(void)
+{
+#ifdef CONFIG_SMP
+ return smp_num_siblings > 1;
+#endif
+ return 0;
+}
+
+static inline int p4_ht_thread(int cpu)
+{
+#ifdef CONFIG_SMP
+ if (smp_num_siblings == 2)
+ return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map));
+#endif
+ return 0;
+}
+
+static inline int p4_should_swap_ts(u64 config, int cpu)
+{
+ return p4_ht_config_thread(config) ^ p4_ht_thread(cpu);
+}
+
+static inline u32 p4_default_cccr_conf(int cpu)
+{
+ /*
+ * Note that P4_CCCR_THREAD_ANY is "required" on
+ * non-HT machines (on HT machines we count TS events
+ * regardless the state of second logical processor
+ */
+ u32 cccr = P4_CCCR_THREAD_ANY;
+
+ if (!p4_ht_thread(cpu))
+ cccr |= P4_CCCR_OVF_PMI_T0;
+ else
+ cccr |= P4_CCCR_OVF_PMI_T1;
+
+ return cccr;
+}
+
+static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
+{
+ u32 escr = 0;
+
+ if (!p4_ht_thread(cpu)) {
+ if (!exclude_os)
+ escr |= P4_ESCR_T0_OS;
+ if (!exclude_usr)
+ escr |= P4_ESCR_T0_USR;
+ } else {
+ if (!exclude_os)
+ escr |= P4_ESCR_T1_OS;
+ if (!exclude_usr)
+ escr |= P4_ESCR_T1_USR;
+ }
+
+ return escr;
+}
+
+/*
+ * This are the events which should be used in "Event Select"
+ * field of ESCR register, they are like unique keys which allow
+ * the kernel to determinate which CCCR and COUNTER should be
+ * used to track an event
+ */
+enum P4_EVENTS {
+ P4_EVENT_TC_DELIVER_MODE,
+ P4_EVENT_BPU_FETCH_REQUEST,
+ P4_EVENT_ITLB_REFERENCE,
+ P4_EVENT_MEMORY_CANCEL,
+ P4_EVENT_MEMORY_COMPLETE,
+ P4_EVENT_LOAD_PORT_REPLAY,
+ P4_EVENT_STORE_PORT_REPLAY,
+ P4_EVENT_MOB_LOAD_REPLAY,
+ P4_EVENT_PAGE_WALK_TYPE,
+ P4_EVENT_BSQ_CACHE_REFERENCE,
+ P4_EVENT_IOQ_ALLOCATION,
+ P4_EVENT_IOQ_ACTIVE_ENTRIES,
+ P4_EVENT_FSB_DATA_ACTIVITY,
+ P4_EVENT_BSQ_ALLOCATION,
+ P4_EVENT_BSQ_ACTIVE_ENTRIES,
+ P4_EVENT_SSE_INPUT_ASSIST,
+ P4_EVENT_PACKED_SP_UOP,
+ P4_EVENT_PACKED_DP_UOP,
+ P4_EVENT_SCALAR_SP_UOP,
+ P4_EVENT_SCALAR_DP_UOP,
+ P4_EVENT_64BIT_MMX_UOP,
+ P4_EVENT_128BIT_MMX_UOP,
+ P4_EVENT_X87_FP_UOP,
+ P4_EVENT_TC_MISC,
+ P4_EVENT_GLOBAL_POWER_EVENTS,
+ P4_EVENT_TC_MS_XFER,
+ P4_EVENT_UOP_QUEUE_WRITES,
+ P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE,
+ P4_EVENT_RETIRED_BRANCH_TYPE,
+ P4_EVENT_RESOURCE_STALL,
+ P4_EVENT_WC_BUFFER,
+ P4_EVENT_B2B_CYCLES,
+ P4_EVENT_BNR,
+ P4_EVENT_SNOOP,
+ P4_EVENT_RESPONSE,
+ P4_EVENT_FRONT_END_EVENT,
+ P4_EVENT_EXECUTION_EVENT,
+ P4_EVENT_REPLAY_EVENT,
+ P4_EVENT_INSTR_RETIRED,
+ P4_EVENT_UOPS_RETIRED,
+ P4_EVENT_UOP_TYPE,
+ P4_EVENT_BRANCH_RETIRED,
+ P4_EVENT_MISPRED_BRANCH_RETIRED,
+ P4_EVENT_X87_ASSIST,
+ P4_EVENT_MACHINE_CLEAR,
+ P4_EVENT_INSTR_COMPLETED,
+};
+
+#define P4_OPCODE(event) event##_OPCODE
+#define P4_OPCODE_ESEL(opcode) ((opcode & 0x00ff) >> 0)
+#define P4_OPCODE_EVNT(opcode) ((opcode & 0xff00) >> 8)
+#define P4_OPCODE_PACK(event, sel) (((event) << 8) | sel)
+
+/*
+ * Comments below the event represent ESCR restriction
+ * for this event and counter index per ESCR
+ *
+ * MSR_P4_IQ_ESCR0 and MSR_P4_IQ_ESCR1 are available only on early
+ * processor builds (family 0FH, models 01H-02H). These MSRs
+ * are not available on later versions, so that we don't use
+ * them completely
+ *
+ * Also note that CCCR1 do not have P4_CCCR_ENABLE bit properly
+ * working so that we should not use this CCCR and respective
+ * counter as result
+ */
+enum P4_EVENT_OPCODES {
+ P4_OPCODE(P4_EVENT_TC_DELIVER_MODE) = P4_OPCODE_PACK(0x01, 0x01),
+ /*
+ * MSR_P4_TC_ESCR0: 4, 5
+ * MSR_P4_TC_ESCR1: 6, 7
+ */
+
+ P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST) = P4_OPCODE_PACK(0x03, 0x00),
+ /*
+ * MSR_P4_BPU_ESCR0: 0, 1
+ * MSR_P4_BPU_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_ITLB_REFERENCE) = P4_OPCODE_PACK(0x18, 0x03),
+ /*
+ * MSR_P4_ITLB_ESCR0: 0, 1
+ * MSR_P4_ITLB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_MEMORY_CANCEL) = P4_OPCODE_PACK(0x02, 0x05),
+ /*
+ * MSR_P4_DAC_ESCR0: 8, 9
+ * MSR_P4_DAC_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_MEMORY_COMPLETE) = P4_OPCODE_PACK(0x08, 0x02),
+ /*
+ * MSR_P4_SAAT_ESCR0: 8, 9
+ * MSR_P4_SAAT_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY) = P4_OPCODE_PACK(0x04, 0x02),
+ /*
+ * MSR_P4_SAAT_ESCR0: 8, 9
+ * MSR_P4_SAAT_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY) = P4_OPCODE_PACK(0x05, 0x02),
+ /*
+ * MSR_P4_SAAT_ESCR0: 8, 9
+ * MSR_P4_SAAT_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY) = P4_OPCODE_PACK(0x03, 0x02),
+ /*
+ * MSR_P4_MOB_ESCR0: 0, 1
+ * MSR_P4_MOB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE) = P4_OPCODE_PACK(0x01, 0x04),
+ /*
+ * MSR_P4_PMH_ESCR0: 0, 1
+ * MSR_P4_PMH_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE) = P4_OPCODE_PACK(0x0c, 0x07),
+ /*
+ * MSR_P4_BSU_ESCR0: 0, 1
+ * MSR_P4_BSU_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_IOQ_ALLOCATION) = P4_OPCODE_PACK(0x03, 0x06),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES) = P4_OPCODE_PACK(0x1a, 0x06),
+ /*
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY) = P4_OPCODE_PACK(0x17, 0x06),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_BSQ_ALLOCATION) = P4_OPCODE_PACK(0x05, 0x07),
+ /*
+ * MSR_P4_BSU_ESCR0: 0, 1
+ */
+
+ P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES) = P4_OPCODE_PACK(0x06, 0x07),
+ /*
+ * NOTE: no ESCR name in docs, it's guessed
+ * MSR_P4_BSU_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST) = P4_OPCODE_PACK(0x34, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_PACKED_SP_UOP) = P4_OPCODE_PACK(0x08, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_PACKED_DP_UOP) = P4_OPCODE_PACK(0x0c, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_SCALAR_SP_UOP) = P4_OPCODE_PACK(0x0a, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_SCALAR_DP_UOP) = P4_OPCODE_PACK(0x0e, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_64BIT_MMX_UOP) = P4_OPCODE_PACK(0x02, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_128BIT_MMX_UOP) = P4_OPCODE_PACK(0x1a, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_X87_FP_UOP) = P4_OPCODE_PACK(0x04, 0x01),
+ /*
+ * MSR_P4_FIRM_ESCR0: 8, 9
+ * MSR_P4_FIRM_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_TC_MISC) = P4_OPCODE_PACK(0x06, 0x01),
+ /*
+ * MSR_P4_TC_ESCR0: 4, 5
+ * MSR_P4_TC_ESCR1: 6, 7
+ */
+
+ P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS) = P4_OPCODE_PACK(0x13, 0x06),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_TC_MS_XFER) = P4_OPCODE_PACK(0x05, 0x00),
+ /*
+ * MSR_P4_MS_ESCR0: 4, 5
+ * MSR_P4_MS_ESCR1: 6, 7
+ */
+
+ P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES) = P4_OPCODE_PACK(0x09, 0x00),
+ /*
+ * MSR_P4_MS_ESCR0: 4, 5
+ * MSR_P4_MS_ESCR1: 6, 7
+ */
+
+ P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE) = P4_OPCODE_PACK(0x05, 0x02),
+ /*
+ * MSR_P4_TBPU_ESCR0: 4, 5
+ * MSR_P4_TBPU_ESCR1: 6, 7
+ */
+
+ P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE) = P4_OPCODE_PACK(0x04, 0x02),
+ /*
+ * MSR_P4_TBPU_ESCR0: 4, 5
+ * MSR_P4_TBPU_ESCR1: 6, 7
+ */
+
+ P4_OPCODE(P4_EVENT_RESOURCE_STALL) = P4_OPCODE_PACK(0x01, 0x01),
+ /*
+ * MSR_P4_ALF_ESCR0: 12, 13, 16
+ * MSR_P4_ALF_ESCR1: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_WC_BUFFER) = P4_OPCODE_PACK(0x05, 0x05),
+ /*
+ * MSR_P4_DAC_ESCR0: 8, 9
+ * MSR_P4_DAC_ESCR1: 10, 11
+ */
+
+ P4_OPCODE(P4_EVENT_B2B_CYCLES) = P4_OPCODE_PACK(0x16, 0x03),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_BNR) = P4_OPCODE_PACK(0x08, 0x03),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_SNOOP) = P4_OPCODE_PACK(0x06, 0x03),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_RESPONSE) = P4_OPCODE_PACK(0x04, 0x03),
+ /*
+ * MSR_P4_FSB_ESCR0: 0, 1
+ * MSR_P4_FSB_ESCR1: 2, 3
+ */
+
+ P4_OPCODE(P4_EVENT_FRONT_END_EVENT) = P4_OPCODE_PACK(0x08, 0x05),
+ /*
+ * MSR_P4_CRU_ESCR2: 12, 13, 16
+ * MSR_P4_CRU_ESCR3: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_EXECUTION_EVENT) = P4_OPCODE_PACK(0x0c, 0x05),
+ /*
+ * MSR_P4_CRU_ESCR2: 12, 13, 16
+ * MSR_P4_CRU_ESCR3: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_REPLAY_EVENT) = P4_OPCODE_PACK(0x09, 0x05),
+ /*
+ * MSR_P4_CRU_ESCR2: 12, 13, 16
+ * MSR_P4_CRU_ESCR3: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_INSTR_RETIRED) = P4_OPCODE_PACK(0x02, 0x04),
+ /*
+ * MSR_P4_CRU_ESCR0: 12, 13, 16
+ * MSR_P4_CRU_ESCR1: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_UOPS_RETIRED) = P4_OPCODE_PACK(0x01, 0x04),
+ /*
+ * MSR_P4_CRU_ESCR0: 12, 13, 16
+ * MSR_P4_CRU_ESCR1: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_UOP_TYPE) = P4_OPCODE_PACK(0x02, 0x02),
+ /*
+ * MSR_P4_RAT_ESCR0: 12, 13, 16
+ * MSR_P4_RAT_ESCR1: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_BRANCH_RETIRED) = P4_OPCODE_PACK(0x06, 0x05),
+ /*
+ * MSR_P4_CRU_ESCR2: 12, 13, 16
+ * MSR_P4_CRU_ESCR3: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED) = P4_OPCODE_PACK(0x03, 0x04),
+ /*
+ * MSR_P4_CRU_ESCR0: 12, 13, 16
+ * MSR_P4_CRU_ESCR1: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_X87_ASSIST) = P4_OPCODE_PACK(0x03, 0x05),
+ /*
+ * MSR_P4_CRU_ESCR2: 12, 13, 16
+ * MSR_P4_CRU_ESCR3: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_MACHINE_CLEAR) = P4_OPCODE_PACK(0x02, 0x05),
+ /*
+ * MSR_P4_CRU_ESCR2: 12, 13, 16
+ * MSR_P4_CRU_ESCR3: 14, 15, 17
+ */
+
+ P4_OPCODE(P4_EVENT_INSTR_COMPLETED) = P4_OPCODE_PACK(0x07, 0x04),
+ /*
+ * MSR_P4_CRU_ESCR0: 12, 13, 16
+ * MSR_P4_CRU_ESCR1: 14, 15, 17
+ */
+};
+
+/*
+ * a caller should use P4_ESCR_EMASK_NAME helper to
+ * pick the EventMask needed, for example
+ *
+ * P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD)
+ */
+enum P4_ESCR_EMASKS {
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DB, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DI, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BD, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BB, 4),
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BI, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, ID, 6),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_BPU_FETCH_REQUEST, TCMISS, 0),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, HIT, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, MISS, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, HIT_UK, 2),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_CANCEL, 64K_CONF, 3),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_COMPLETE, LSC, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_COMPLETE, SSC, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, NO_STA, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, NO_STD, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA, 4),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR, 5),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_PAGE_WALK_TYPE, DTMISS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_PAGE_WALK_TYPE, ITMISS, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE, 4),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS, 8),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS, 9),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS, 10),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, DEFAULT, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, ALL_READ, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE, 6),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_UC, 7),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WC, 8),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WT, 9),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WP, 10),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WB, 11),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, OWN, 13),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, OTHER, 14),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, PREFETCH, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE, 6),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC, 7),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC, 8),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT, 9),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP, 10),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB, 11),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN, 13),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER, 14),
+ P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN, 4),
+ P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER, 5),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE, 6),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE, 7),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE, 8),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE, 9),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE, 10),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0, 11),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1, 12),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2, 13),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE, 6),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE, 7),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE, 8),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE, 9),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE, 10),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0, 11),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1, 12),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2, 13),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_SSE_INPUT_ASSIST, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_PACKED_SP_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_PACKED_DP_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_SCALAR_SP_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_SCALAR_DP_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_64BIT_MMX_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_128BIT_MMX_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_X87_FP_UOP, ALL, 15),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_MISC, FLUSH, 4),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING, 0),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_TC_MS_XFER, CISC, 0),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM, 2),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT, 4),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, CALL, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT, 4),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_RESOURCE_STALL, SBFULL, 5),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_WC_BUFFER, WCB_EVICTS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_FRONT_END_EVENT, NBOGUS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_FRONT_END_EVENT, BOGUS, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS0, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS1, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS2, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS3, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS0, 4),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS1, 5),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS2, 6),
+ P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS3, 7),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_REPLAY_EVENT, NBOGUS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_REPLAY_EVENT, BOGUS, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, NBOGUSTAG, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, BOGUSNTAG, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, BOGUSTAG, 3),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOPS_RETIRED, NBOGUS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOPS_RETIRED, BOGUS, 1),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOP_TYPE, TAGLOADS, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_UOP_TYPE, TAGSTORES, 2),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMNP, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMNM, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMTP, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMTM, 3),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS, 0),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, FPSU, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, FPSO, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, POAO, 2),
+ P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, POAU, 3),
+ P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, PREA, 4),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, CLEAR, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, MOCLEAR, 1),
+ P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, SMCLEAR, 2),
+
+ P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, NBOGUS, 0),
+ P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1),
+};
+
+/*
+ * Note we have UOP and PEBS bits reserved for now
+ * just in case if we will need them once
+ */
+#define P4_PEBS_CONFIG_ENABLE (1ULL << 7)
+#define P4_PEBS_CONFIG_UOP_TAG (1ULL << 8)
+#define P4_PEBS_CONFIG_METRIC_MASK 0x3FLL
+#define P4_PEBS_CONFIG_MASK 0xFFLL
+
+/*
+ * mem: Only counters MSR_IQ_COUNTER4 (16) and
+ * MSR_IQ_COUNTER5 (17) are allowed for PEBS sampling
+ */
+#define P4_PEBS_ENABLE 0x02000000ULL
+#define P4_PEBS_ENABLE_UOP_TAG 0x01000000ULL
+
+#define p4_config_unpack_metric(v) (((u64)(v)) & P4_PEBS_CONFIG_METRIC_MASK)
+#define p4_config_unpack_pebs(v) (((u64)(v)) & P4_PEBS_CONFIG_MASK)
+
+#define p4_config_pebs_has(v, mask) (p4_config_unpack_pebs(v) & (mask))
+
+enum P4_PEBS_METRIC {
+ P4_PEBS_METRIC__none,
+
+ P4_PEBS_METRIC__1stl_cache_load_miss_retired,
+ P4_PEBS_METRIC__2ndl_cache_load_miss_retired,
+ P4_PEBS_METRIC__dtlb_load_miss_retired,
+ P4_PEBS_METRIC__dtlb_store_miss_retired,
+ P4_PEBS_METRIC__dtlb_all_miss_retired,
+ P4_PEBS_METRIC__tagged_mispred_branch,
+ P4_PEBS_METRIC__mob_load_replay_retired,
+ P4_PEBS_METRIC__split_load_retired,
+ P4_PEBS_METRIC__split_store_retired,
+
+ P4_PEBS_METRIC__max
+};
+
+/*
+ * Notes on internal configuration of ESCR+CCCR tuples
+ *
+ * Since P4 has quite the different architecture of
+ * performance registers in compare with "architectural"
+ * once and we have on 64 bits to keep configuration
+ * of performance event, the following trick is used.
+ *
+ * 1) Since both ESCR and CCCR registers have only low
+ * 32 bits valuable, we pack them into a single 64 bit
+ * configuration. Low 32 bits of such config correspond
+ * to low 32 bits of CCCR register and high 32 bits
+ * correspond to low 32 bits of ESCR register.
+ *
+ * 2) The meaning of every bit of such config field can
+ * be found in Intel SDM but it should be noted that
+ * we "borrow" some reserved bits for own usage and
+ * clean them or set to a proper value when we do
+ * a real write to hardware registers.
+ *
+ * 3) The format of bits of config is the following
+ * and should be either 0 or set to some predefined
+ * values:
+ *
+ * Low 32 bits
+ * -----------
+ * 0-6: P4_PEBS_METRIC enum
+ * 7-11: reserved
+ * 12: reserved (Enable)
+ * 13-15: reserved (ESCR select)
+ * 16-17: Active Thread
+ * 18: Compare
+ * 19: Complement
+ * 20-23: Threshold
+ * 24: Edge
+ * 25: reserved (FORCE_OVF)
+ * 26: reserved (OVF_PMI_T0)
+ * 27: reserved (OVF_PMI_T1)
+ * 28-29: reserved
+ * 30: reserved (Cascade)
+ * 31: reserved (OVF)
+ *
+ * High 32 bits
+ * ------------
+ * 0: reserved (T1_USR)
+ * 1: reserved (T1_OS)
+ * 2: reserved (T0_USR)
+ * 3: reserved (T0_OS)
+ * 4: Tag Enable
+ * 5-8: Tag Value
+ * 9-24: Event Mask (may use P4_ESCR_EMASK_BIT helper)
+ * 25-30: enum P4_EVENTS
+ * 31: reserved (HT thread)
+ */
+
+#endif /* PERF_EVENT_P4_H */
+
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
new file mode 100644
index 000000000..fbd578daa
--- /dev/null
+++ b/arch/x86/include/asm/pgalloc.h
@@ -0,0 +1,207 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PGALLOC_H
+#define _ASM_X86_PGALLOC_H
+
+#include <linux/threads.h>
+#include <linux/mm.h> /* for struct page */
+#include <linux/pagemap.h>
+
+static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define paravirt_pgd_alloc(mm) __paravirt_pgd_alloc(mm)
+static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
+static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
+ unsigned long start, unsigned long count) {}
+static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_release_pte(unsigned long pfn) {}
+static inline void paravirt_release_pmd(unsigned long pfn) {}
+static inline void paravirt_release_pud(unsigned long pfn) {}
+static inline void paravirt_release_p4d(unsigned long pfn) {}
+#endif
+
+/*
+ * Flags to use when allocating a user page table page.
+ */
+extern gfp_t __userpte_alloc_gfp;
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Instead of one PGD, we acquire two PGDs. Being order-1, it is
+ * both 8k in size and 8k-aligned. That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
+#else
+#define PGD_ALLOCATION_ORDER 0
+#endif
+
+/*
+ * Allocate and free page tables.
+ */
+extern pgd_t *pgd_alloc(struct mm_struct *);
+extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
+
+extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
+extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
+
+/* Should really implement gc for free page table pages. This could be
+ done with a reference count in struct page. */
+
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+ BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
+ free_page((unsigned long)pte);
+}
+
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
+{
+ pgtable_page_dtor(pte);
+ __free_page(pte);
+}
+
+extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
+
+static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
+ unsigned long address)
+{
+ ___pte_free_tlb(tlb, pte);
+}
+
+static inline void pmd_populate_kernel(struct mm_struct *mm,
+ pmd_t *pmd, pte_t *pte)
+{
+ paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
+ set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
+ struct page *pte)
+{
+ unsigned long pfn = page_to_pfn(pte);
+
+ paravirt_alloc_pte(mm, pfn);
+ set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
+}
+
+#define pmd_pgtable(pmd) pmd_page(pmd)
+
+#if CONFIG_PGTABLE_LEVELS > 2
+static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ struct page *page;
+ gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
+
+ if (mm == &init_mm)
+ gfp &= ~__GFP_ACCOUNT;
+ page = alloc_pages(gfp, 0);
+ if (!page)
+ return NULL;
+ if (!pgtable_pmd_page_ctor(page)) {
+ __free_pages(page, 0);
+ return NULL;
+ }
+ return (pmd_t *)page_address(page);
+}
+
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+{
+ BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
+ pgtable_pmd_page_dtor(virt_to_page(pmd));
+ free_page((unsigned long)pmd);
+}
+
+extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
+
+static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
+ unsigned long address)
+{
+ ___pmd_free_tlb(tlb, pmd);
+}
+
+#ifdef CONFIG_X86_PAE
+extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
+#else /* !CONFIG_X86_PAE */
+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+{
+ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+ set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
+}
+#endif /* CONFIG_X86_PAE */
+
+#if CONFIG_PGTABLE_LEVELS > 3
+static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
+{
+ paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
+ set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
+}
+
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ gfp_t gfp = GFP_KERNEL_ACCOUNT;
+
+ if (mm == &init_mm)
+ gfp &= ~__GFP_ACCOUNT;
+ return (pud_t *)get_zeroed_page(gfp);
+}
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+ BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
+ free_page((unsigned long)pud);
+}
+
+extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
+
+static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
+ unsigned long address)
+{
+ ___pud_free_tlb(tlb, pud);
+}
+
+#if CONFIG_PGTABLE_LEVELS > 4
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
+{
+ if (!pgtable_l5_enabled())
+ return;
+ paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
+ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
+}
+
+static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ gfp_t gfp = GFP_KERNEL_ACCOUNT;
+
+ if (mm == &init_mm)
+ gfp &= ~__GFP_ACCOUNT;
+ return (p4d_t *)get_zeroed_page(gfp);
+}
+
+static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
+{
+ if (!pgtable_l5_enabled())
+ return;
+
+ BUG_ON((unsigned long)p4d & (PAGE_SIZE-1));
+ free_page((unsigned long)p4d);
+}
+
+extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d);
+
+static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
+ unsigned long address)
+{
+ if (pgtable_l5_enabled())
+ ___p4d_free_tlb(tlb, p4d);
+}
+
+#endif /* CONFIG_PGTABLE_LEVELS > 4 */
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
+
+#endif /* _ASM_X86_PGALLOC_H */
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
new file mode 100644
index 000000000..60d0f9015
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PGTABLE_2LEVEL_H
+#define _ASM_X86_PGTABLE_2LEVEL_H
+
+#define pte_ERROR(e) \
+ pr_err("%s:%d: bad pte %08lx\n", __FILE__, __LINE__, (e).pte_low)
+#define pgd_ERROR(e) \
+ pr_err("%s:%d: bad pgd %08lx\n", __FILE__, __LINE__, pgd_val(e))
+
+/*
+ * Certain architectures need to do special things when PTEs
+ * within a page table are directly modified. Thus, the following
+ * hook is made available.
+ */
+static inline void native_set_pte(pte_t *ptep , pte_t pte)
+{
+ *ptep = pte;
+}
+
+static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ *pmdp = pmd;
+}
+
+static inline void native_set_pud(pud_t *pudp, pud_t pud)
+{
+}
+
+static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ native_set_pte(ptep, pte);
+}
+
+static inline void native_pmd_clear(pmd_t *pmdp)
+{
+ native_set_pmd(pmdp, __pmd(0));
+}
+
+static inline void native_pud_clear(pud_t *pudp)
+{
+}
+
+static inline void native_pte_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *xp)
+{
+ *xp = native_make_pte(0);
+}
+
+#ifdef CONFIG_SMP
+static inline pte_t native_ptep_get_and_clear(pte_t *xp)
+{
+ return __pte(xchg(&xp->pte_low, 0));
+}
+#else
+#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
+#endif
+
+#ifdef CONFIG_SMP
+static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
+{
+ return __pmd(xchg((pmdval_t *)xp, 0));
+}
+#else
+#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
+#endif
+
+#ifdef CONFIG_SMP
+static inline pud_t native_pudp_get_and_clear(pud_t *xp)
+{
+ return __pud(xchg((pudval_t *)xp, 0));
+}
+#else
+#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp)
+#endif
+
+/* Bit manipulation helper on pte/pgoff entry */
+static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshift,
+ unsigned long mask, unsigned int leftshift)
+{
+ return ((value >> rightshift) & mask) << leftshift;
+}
+
+/* Encode and de-code a swap entry */
+#define SWP_TYPE_BITS 5
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
+
+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
+
+#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
+ & ((1U << SWP_TYPE_BITS) - 1))
+#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
+#define __swp_entry(type, offset) ((swp_entry_t) { \
+ ((type) << (_PAGE_BIT_PRESENT + 1)) \
+ | ((offset) << SWP_OFFSET_SHIFT) })
+#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
+#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
+
+/* No inverted PFNs on 2 level page tables */
+
+static inline u64 protnone_mask(u64 val)
+{
+ return 0;
+}
+
+static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
+{
+ return val;
+}
+
+static inline bool __pte_needs_invert(u64 val)
+{
+ return false;
+}
+
+#endif /* _ASM_X86_PGTABLE_2LEVEL_H */
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h
new file mode 100644
index 000000000..6deb6cd23
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PGTABLE_2LEVEL_DEFS_H
+#define _ASM_X86_PGTABLE_2LEVEL_DEFS_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+typedef unsigned long pteval_t;
+typedef unsigned long pmdval_t;
+typedef unsigned long pudval_t;
+typedef unsigned long p4dval_t;
+typedef unsigned long pgdval_t;
+typedef unsigned long pgprotval_t;
+
+typedef union {
+ pteval_t pte;
+ pteval_t pte_low;
+} pte_t;
+#endif /* !__ASSEMBLY__ */
+
+#define SHARED_KERNEL_PMD 0
+
+/*
+ * traditional i386 two-level paging structure:
+ */
+
+#define PGDIR_SHIFT 22
+#define PTRS_PER_PGD 1024
+
+
+/*
+ * the i386 is two-level, so we don't really have any
+ * PMD directory physically.
+ */
+
+#define PTRS_PER_PTE 1024
+
+/* This covers all VMSPLIT_* and VMSPLIT_*_OPT variants */
+#define PGD_KERNEL_START (CONFIG_PAGE_OFFSET >> PGDIR_SHIFT)
+
+#endif /* _ASM_X86_PGTABLE_2LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
new file mode 100644
index 000000000..f8b1ad2c3
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -0,0 +1,337 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PGTABLE_3LEVEL_H
+#define _ASM_X86_PGTABLE_3LEVEL_H
+
+#include <asm/atomic64_32.h>
+
+/*
+ * Intel Physical Address Extension (PAE) Mode - three-level page
+ * tables on PPro+ CPUs.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#define pte_ERROR(e) \
+ pr_err("%s:%d: bad pte %p(%08lx%08lx)\n", \
+ __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
+#define pmd_ERROR(e) \
+ pr_err("%s:%d: bad pmd %p(%016Lx)\n", \
+ __FILE__, __LINE__, &(e), pmd_val(e))
+#define pgd_ERROR(e) \
+ pr_err("%s:%d: bad pgd %p(%016Lx)\n", \
+ __FILE__, __LINE__, &(e), pgd_val(e))
+
+/* Rules for using set_pte: the pte being assigned *must* be
+ * either not present or in a state where the hardware will
+ * not attempt to update the pte. In places where this is
+ * not possible, use pte_get_and_clear to obtain the old pte
+ * value and then use set_pte to update it. -ben
+ */
+static inline void native_set_pte(pte_t *ptep, pte_t pte)
+{
+ ptep->pte_high = pte.pte_high;
+ smp_wmb();
+ ptep->pte_low = pte.pte_low;
+}
+
+#define pmd_read_atomic pmd_read_atomic
+/*
+ * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with
+ * a "*pmdp" dereference done by gcc. Problem is, in certain places
+ * where pte_offset_map_lock is called, concurrent page faults are
+ * allowed, if the mmap_sem is hold for reading. An example is mincore
+ * vs page faults vs MADV_DONTNEED. On the page fault side
+ * pmd_populate rightfully does a set_64bit, but if we're reading the
+ * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen
+ * because gcc will not read the 64bit of the pmd atomically. To fix
+ * this all places running pmd_offset_map_lock() while holding the
+ * mmap_sem in read mode, shall read the pmdp pointer using this
+ * function to know if the pmd is null nor not, and in turn to know if
+ * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd
+ * operations.
+ *
+ * Without THP if the mmap_sem is hold for reading, the pmd can only
+ * transition from null to not null while pmd_read_atomic runs. So
+ * we can always return atomic pmd values with this function.
+ *
+ * With THP if the mmap_sem is hold for reading, the pmd can become
+ * trans_huge or none or point to a pte (and in turn become "stable")
+ * at any time under pmd_read_atomic. We could read it really
+ * atomically here with a atomic64_read for the THP enabled case (and
+ * it would be a whole lot simpler), but to avoid using cmpxchg8b we
+ * only return an atomic pmdval if the low part of the pmdval is later
+ * found stable (i.e. pointing to a pte). And we're returning a none
+ * pmdval if the low part of the pmd is none. In some cases the high
+ * and low part of the pmdval returned may not be consistent if THP is
+ * enabled (the low part may point to previously mapped hugepage,
+ * while the high part may point to a more recently mapped hugepage),
+ * but pmd_none_or_trans_huge_or_clear_bad() only needs the low part
+ * of the pmd to be read atomically to decide if the pmd is unstable
+ * or not, with the only exception of when the low part of the pmd is
+ * zero in which case we return a none pmd.
+ */
+static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
+{
+ pmdval_t ret;
+ u32 *tmp = (u32 *)pmdp;
+
+ ret = (pmdval_t) (*tmp);
+ if (ret) {
+ /*
+ * If the low part is null, we must not read the high part
+ * or we can end up with a partial pmd.
+ */
+ smp_rmb();
+ ret |= ((pmdval_t)*(tmp + 1)) << 32;
+ }
+
+ return (pmd_t) { ret };
+}
+
+static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
+}
+
+static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ set_64bit((unsigned long long *)(pmdp), native_pmd_val(pmd));
+}
+
+static inline void native_set_pud(pud_t *pudp, pud_t pud)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd);
+#endif
+ set_64bit((unsigned long long *)(pudp), native_pud_val(pud));
+}
+
+/*
+ * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
+ * entry, so clear the bottom half first and enforce ordering with a compiler
+ * barrier.
+ */
+static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ ptep->pte_low = 0;
+ smp_wmb();
+ ptep->pte_high = 0;
+}
+
+static inline void native_pmd_clear(pmd_t *pmd)
+{
+ u32 *tmp = (u32 *)pmd;
+ *tmp = 0;
+ smp_wmb();
+ *(tmp + 1) = 0;
+}
+
+static inline void native_pud_clear(pud_t *pudp)
+{
+}
+
+static inline void pud_clear(pud_t *pudp)
+{
+ set_pud(pudp, __pud(0));
+
+ /*
+ * According to Intel App note "TLBs, Paging-Structure Caches,
+ * and Their Invalidation", April 2007, document 317080-001,
+ * section 8.1: in PAE mode we explicitly have to flush the
+ * TLB via cr3 if the top-level pgd is changed...
+ *
+ * Currently all places where pud_clear() is called either have
+ * flush_tlb_mm() followed or don't need TLB flush (x86_64 code or
+ * pud_clear_bad()), so we don't need TLB flush here.
+ */
+}
+
+#ifdef CONFIG_SMP
+static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
+{
+ pte_t res;
+
+ res.pte = (pteval_t)arch_atomic64_xchg((atomic64_t *)ptep, 0);
+
+ return res;
+}
+#else
+#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
+#endif
+
+union split_pmd {
+ struct {
+ u32 pmd_low;
+ u32 pmd_high;
+ };
+ pmd_t pmd;
+};
+
+#ifdef CONFIG_SMP
+static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
+{
+ union split_pmd res, *orig = (union split_pmd *)pmdp;
+
+ /* xchg acts as a barrier before setting of the high bits */
+ res.pmd_low = xchg(&orig->pmd_low, 0);
+ res.pmd_high = orig->pmd_high;
+ orig->pmd_high = 0;
+
+ return res.pmd;
+}
+#else
+#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
+#endif
+
+#ifndef pmdp_establish
+#define pmdp_establish pmdp_establish
+static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp, pmd_t pmd)
+{
+ pmd_t old;
+
+ /*
+ * If pmd has present bit cleared we can get away without expensive
+ * cmpxchg64: we can update pmdp half-by-half without racing with
+ * anybody.
+ */
+ if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
+ union split_pmd old, new, *ptr;
+
+ ptr = (union split_pmd *)pmdp;
+
+ new.pmd = pmd;
+
+ /* xchg acts as a barrier before setting of the high bits */
+ old.pmd_low = xchg(&ptr->pmd_low, new.pmd_low);
+ old.pmd_high = ptr->pmd_high;
+ ptr->pmd_high = new.pmd_high;
+ return old.pmd;
+ }
+
+ do {
+ old = *pmdp;
+ } while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd);
+
+ return old;
+}
+#endif
+
+#ifdef CONFIG_SMP
+union split_pud {
+ struct {
+ u32 pud_low;
+ u32 pud_high;
+ };
+ pud_t pud;
+};
+
+static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
+{
+ union split_pud res, *orig = (union split_pud *)pudp;
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ pti_set_user_pgtbl(&pudp->p4d.pgd, __pgd(0));
+#endif
+
+ /* xchg acts as a barrier before setting of the high bits */
+ res.pud_low = xchg(&orig->pud_low, 0);
+ res.pud_high = orig->pud_high;
+ orig->pud_high = 0;
+
+ return res.pud;
+}
+#else
+#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp)
+#endif
+
+/* Encode and de-code a swap entry */
+#define SWP_TYPE_BITS 5
+
+#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
+
+/* We always extract/encode the offset by shifting it all the way up, and then down again */
+#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS)
+
+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
+#define __swp_type(x) (((x).val) & 0x1f)
+#define __swp_offset(x) ((x).val >> 5)
+#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
+
+/*
+ * Normally, __swp_entry() converts from arch-independent swp_entry_t to
+ * arch-dependent swp_entry_t, and __swp_entry_to_pte() just stores the result
+ * to pte. But here we have 32bit swp_entry_t and 64bit pte, and need to use the
+ * whole 64 bits. Thus, we shift the "real" arch-dependent conversion to
+ * __swp_entry_to_pte() through the following helper macro based on 64bit
+ * __swp_entry().
+ */
+#define __swp_pteval_entry(type, offset) ((pteval_t) { \
+ (~(pteval_t)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
+ | ((pteval_t)(type) << (64 - SWP_TYPE_BITS)) })
+
+#define __swp_entry_to_pte(x) ((pte_t){ .pte = \
+ __swp_pteval_entry(__swp_type(x), __swp_offset(x)) })
+/*
+ * Analogically, __pte_to_swp_entry() doesn't just extract the arch-dependent
+ * swp_entry_t, but also has to convert it from 64bit to the 32bit
+ * intermediate representation, using the following macros based on 64bit
+ * __swp_type() and __swp_offset().
+ */
+#define __pteval_swp_type(x) ((unsigned long)((x).pte >> (64 - SWP_TYPE_BITS)))
+#define __pteval_swp_offset(x) ((unsigned long)(~((x).pte) << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT))
+
+#define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \
+ __pteval_swp_offset(pte)))
+
+#define gup_get_pte gup_get_pte
+/*
+ * WARNING: only to be used in the get_user_pages_fast() implementation.
+ *
+ * With get_user_pages_fast(), we walk down the pagetables without taking
+ * any locks. For this we would like to load the pointers atomically,
+ * but that is not possible (without expensive cmpxchg8b) on PAE. What
+ * we do have is the guarantee that a PTE will only either go from not
+ * present to present, or present to not present or both -- it will not
+ * switch to a completely different present page without a TLB flush in
+ * between; something that we are blocking by holding interrupts off.
+ *
+ * Setting ptes from not present to present goes:
+ *
+ * ptep->pte_high = h;
+ * smp_wmb();
+ * ptep->pte_low = l;
+ *
+ * And present to not present goes:
+ *
+ * ptep->pte_low = 0;
+ * smp_wmb();
+ * ptep->pte_high = 0;
+ *
+ * We must ensure here that the load of pte_low sees 'l' iff pte_high
+ * sees 'h'. We load pte_high *after* loading pte_low, which ensures we
+ * don't see an older value of pte_high. *Then* we recheck pte_low,
+ * which ensures that we haven't picked up a changed pte high. We might
+ * have gotten rubbish values from pte_low and pte_high, but we are
+ * guaranteed that pte_low will not have the present bit set *unless*
+ * it is 'l'. Because get_user_pages_fast() only operates on present ptes
+ * we're safe.
+ */
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+ pte_t pte;
+
+ do {
+ pte.pte_low = ptep->pte_low;
+ smp_rmb();
+ pte.pte_high = ptep->pte_high;
+ smp_rmb();
+ } while (unlikely(pte.pte_low != ptep->pte_low));
+
+ return pte;
+}
+
+#include <asm/pgtable-invert.h>
+
+#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
new file mode 100644
index 000000000..858358a82
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H
+#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+typedef u64 pteval_t;
+typedef u64 pmdval_t;
+typedef u64 pudval_t;
+typedef u64 p4dval_t;
+typedef u64 pgdval_t;
+typedef u64 pgprotval_t;
+
+typedef union {
+ struct {
+ unsigned long pte_low, pte_high;
+ };
+ pteval_t pte;
+} pte_t;
+#endif /* !__ASSEMBLY__ */
+
+#ifdef CONFIG_PARAVIRT
+#define SHARED_KERNEL_PMD ((!static_cpu_has(X86_FEATURE_PTI) && \
+ (pv_info.shared_kernel_pmd)))
+#else
+#define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI))
+#endif
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT 30
+#define PTRS_PER_PGD 4
+
+/*
+ * PMD_SHIFT determines the size of the area a middle-level
+ * page table can map
+ */
+#define PMD_SHIFT 21
+#define PTRS_PER_PMD 512
+
+/*
+ * entries per page directory level
+ */
+#define PTRS_PER_PTE 512
+
+#define MAX_POSSIBLE_PHYSMEM_BITS 36
+#define PGD_KERNEL_START (CONFIG_PAGE_OFFSET >> PGDIR_SHIFT)
+
+#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable-invert.h b/arch/x86/include/asm/pgtable-invert.h
new file mode 100644
index 000000000..a0c1525f1
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-invert.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_PGTABLE_INVERT_H
+#define _ASM_PGTABLE_INVERT_H 1
+
+#ifndef __ASSEMBLY__
+
+/*
+ * A clear pte value is special, and doesn't get inverted.
+ *
+ * Note that even users that only pass a pgprot_t (rather
+ * than a full pte) won't trigger the special zero case,
+ * because even PAGE_NONE has _PAGE_PROTNONE | _PAGE_ACCESSED
+ * set. So the all zero case really is limited to just the
+ * cleared page table entry case.
+ */
+static inline bool __pte_needs_invert(u64 val)
+{
+ return val && !(val & _PAGE_PRESENT);
+}
+
+/* Get a mask to xor with the page table entry to get the correct pfn. */
+static inline u64 protnone_mask(u64 val)
+{
+ return __pte_needs_invert(val) ? ~0ull : 0;
+}
+
+static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
+{
+ /*
+ * When a PTE transitions from NONE to !NONE or vice-versa
+ * invert the PFN part to stop speculation.
+ * pte_pfn undoes this when needed.
+ */
+ if (__pte_needs_invert(oldval) != __pte_needs_invert(val))
+ val = (val & ~mask) | (~val & mask);
+ return val;
+}
+
+#endif /* __ASSEMBLY__ */
+
+#endif
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
new file mode 100644
index 000000000..7de459cf3
--- /dev/null
+++ b/arch/x86/include/asm/pgtable.h
@@ -0,0 +1,1448 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PGTABLE_H
+#define _ASM_X86_PGTABLE_H
+
+#include <linux/mem_encrypt.h>
+#include <asm/page.h>
+#include <asm/pgtable_types.h>
+
+/*
+ * Macro to mark a page protection value as UC-
+ */
+#define pgprot_noncached(prot) \
+ ((boot_cpu_data.x86 > 3) \
+ ? (__pgprot(pgprot_val(prot) | \
+ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \
+ : (prot))
+
+/*
+ * Macros to add or remove encryption attribute
+ */
+#define pgprot_encrypted(prot) __pgprot(__sme_set(pgprot_val(prot)))
+#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot)))
+
+#ifndef __ASSEMBLY__
+#include <asm/x86_init.h>
+
+extern pgd_t early_top_pgt[PTRS_PER_PGD];
+int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
+
+void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
+void ptdump_walk_pgd_level_checkwx(void);
+void ptdump_walk_user_pgd_level_checkwx(void);
+
+#ifdef CONFIG_DEBUG_WX
+#define debug_checkwx() ptdump_walk_pgd_level_checkwx()
+#define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx()
+#else
+#define debug_checkwx() do { } while (0)
+#define debug_checkwx_user() do { } while (0)
+#endif
+
+/*
+ * ZERO_PAGE is a global shared page that is always zero: used
+ * for zero-mapped memory areas etc..
+ */
+extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
+ __visible;
+#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
+
+extern spinlock_t pgd_lock;
+extern struct list_head pgd_list;
+
+extern struct mm_struct *pgd_page_get_mm(struct page *page);
+
+extern pmdval_t early_pmd_flags;
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else /* !CONFIG_PARAVIRT */
+#define set_pte(ptep, pte) native_set_pte(ptep, pte)
+#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
+
+#define set_pte_atomic(ptep, pte) \
+ native_set_pte_atomic(ptep, pte)
+
+#define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd)
+
+#ifndef __PAGETABLE_P4D_FOLDED
+#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd)
+#define pgd_clear(pgd) (pgtable_l5_enabled() ? native_pgd_clear(pgd) : 0)
+#endif
+
+#ifndef set_p4d
+# define set_p4d(p4dp, p4d) native_set_p4d(p4dp, p4d)
+#endif
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define p4d_clear(p4d) native_p4d_clear(p4d)
+#endif
+
+#ifndef set_pud
+# define set_pud(pudp, pud) native_set_pud(pudp, pud)
+#endif
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define pud_clear(pud) native_pud_clear(pud)
+#endif
+
+#define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep)
+#define pmd_clear(pmd) native_pmd_clear(pmd)
+
+#define pgd_val(x) native_pgd_val(x)
+#define __pgd(x) native_make_pgd(x)
+
+#ifndef __PAGETABLE_P4D_FOLDED
+#define p4d_val(x) native_p4d_val(x)
+#define __p4d(x) native_make_p4d(x)
+#endif
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define pud_val(x) native_pud_val(x)
+#define __pud(x) native_make_pud(x)
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+#define pmd_val(x) native_pmd_val(x)
+#define __pmd(x) native_make_pmd(x)
+#endif
+
+#define pte_val(x) native_pte_val(x)
+#define __pte(x) native_make_pte(x)
+
+#define arch_end_context_switch(prev) do {} while(0)
+
+#endif /* CONFIG_PARAVIRT */
+
+/*
+ * The following only work if pte_present() is true.
+ * Undefined behaviour if not..
+ */
+static inline int pte_dirty(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_DIRTY;
+}
+
+
+static inline u32 read_pkru(void)
+{
+ if (boot_cpu_has(X86_FEATURE_OSPKE))
+ return __read_pkru();
+ return 0;
+}
+
+static inline void write_pkru(u32 pkru)
+{
+ if (boot_cpu_has(X86_FEATURE_OSPKE))
+ __write_pkru(pkru);
+}
+
+static inline int pte_young(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_ACCESSED;
+}
+
+static inline int pmd_dirty(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_DIRTY;
+}
+
+static inline int pmd_young(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_ACCESSED;
+}
+
+static inline int pud_dirty(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_DIRTY;
+}
+
+static inline int pud_young(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_ACCESSED;
+}
+
+static inline int pte_write(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_RW;
+}
+
+static inline int pte_huge(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_PSE;
+}
+
+static inline int pte_global(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_GLOBAL;
+}
+
+static inline int pte_exec(pte_t pte)
+{
+ return !(pte_flags(pte) & _PAGE_NX);
+}
+
+static inline int pte_special(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_SPECIAL;
+}
+
+/* Entries that were set to PROT_NONE are inverted */
+
+static inline u64 protnone_mask(u64 val);
+
+static inline unsigned long pte_pfn(pte_t pte)
+{
+ phys_addr_t pfn = pte_val(pte);
+ pfn ^= protnone_mask(pfn);
+ return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
+static inline unsigned long pmd_pfn(pmd_t pmd)
+{
+ phys_addr_t pfn = pmd_val(pmd);
+ pfn ^= protnone_mask(pfn);
+ return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
+}
+
+static inline unsigned long pud_pfn(pud_t pud)
+{
+ phys_addr_t pfn = pud_val(pud);
+ pfn ^= protnone_mask(pfn);
+ return (pfn & pud_pfn_mask(pud)) >> PAGE_SHIFT;
+}
+
+static inline unsigned long p4d_pfn(p4d_t p4d)
+{
+ return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT;
+}
+
+static inline unsigned long pgd_pfn(pgd_t pgd)
+{
+ return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
+static inline int p4d_large(p4d_t p4d)
+{
+ /* No 512 GiB pages yet */
+ return 0;
+}
+
+#define pte_page(pte) pfn_to_page(pte_pfn(pte))
+
+static inline int pmd_large(pmd_t pte)
+{
+ return pmd_flags(pte) & _PAGE_PSE;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_large */
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+ return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static inline int pud_trans_huge(pud_t pud)
+{
+ return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
+}
+#endif
+
+#define has_transparent_hugepage has_transparent_hugepage
+static inline int has_transparent_hugepage(void)
+{
+ return boot_cpu_has(X86_FEATURE_PSE);
+}
+
+#ifdef __HAVE_ARCH_PTE_DEVMAP
+static inline int pmd_devmap(pmd_t pmd)
+{
+ return !!(pmd_val(pmd) & _PAGE_DEVMAP);
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static inline int pud_devmap(pud_t pud)
+{
+ return !!(pud_val(pud) & _PAGE_DEVMAP);
+}
+#else
+static inline int pud_devmap(pud_t pud)
+{
+ return 0;
+}
+#endif
+
+static inline int pgd_devmap(pgd_t pgd)
+{
+ return 0;
+}
+#endif
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
+{
+ pteval_t v = native_pte_val(pte);
+
+ return native_make_pte(v | set);
+}
+
+static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
+{
+ pteval_t v = native_pte_val(pte);
+
+ return native_make_pte(v & ~clear);
+}
+
+static inline pte_t pte_mkclean(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkold(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_ACCESSED);
+}
+
+static inline pte_t pte_wrprotect(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_RW);
+}
+
+static inline pte_t pte_mkexec(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_NX);
+}
+
+static inline pte_t pte_mkdirty(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+}
+
+static inline pte_t pte_mkyoung(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_ACCESSED);
+}
+
+static inline pte_t pte_mkwrite(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_RW);
+}
+
+static inline pte_t pte_mkhuge(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_PSE);
+}
+
+static inline pte_t pte_clrhuge(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_PSE);
+}
+
+static inline pte_t pte_mkglobal(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_GLOBAL);
+}
+
+static inline pte_t pte_clrglobal(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_GLOBAL);
+}
+
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_SPECIAL);
+}
+
+static inline pte_t pte_mkdevmap(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP);
+}
+
+static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ return native_make_pmd(v | set);
+}
+
+static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ return native_make_pmd(v & ~clear);
+}
+
+static inline pmd_t pmd_mkold(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_ACCESSED);
+}
+
+static inline pmd_t pmd_mkclean(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_DIRTY);
+}
+
+static inline pmd_t pmd_wrprotect(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_RW);
+}
+
+static inline pmd_t pmd_mkdirty(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+}
+
+static inline pmd_t pmd_mkdevmap(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_DEVMAP);
+}
+
+static inline pmd_t pmd_mkhuge(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_PSE);
+}
+
+static inline pmd_t pmd_mkyoung(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_ACCESSED);
+}
+
+static inline pmd_t pmd_mkwrite(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_RW);
+}
+
+static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
+{
+ pudval_t v = native_pud_val(pud);
+
+ return native_make_pud(v | set);
+}
+
+static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
+{
+ pudval_t v = native_pud_val(pud);
+
+ return native_make_pud(v & ~clear);
+}
+
+static inline pud_t pud_mkold(pud_t pud)
+{
+ return pud_clear_flags(pud, _PAGE_ACCESSED);
+}
+
+static inline pud_t pud_mkclean(pud_t pud)
+{
+ return pud_clear_flags(pud, _PAGE_DIRTY);
+}
+
+static inline pud_t pud_wrprotect(pud_t pud)
+{
+ return pud_clear_flags(pud, _PAGE_RW);
+}
+
+static inline pud_t pud_mkdirty(pud_t pud)
+{
+ return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+}
+
+static inline pud_t pud_mkdevmap(pud_t pud)
+{
+ return pud_set_flags(pud, _PAGE_DEVMAP);
+}
+
+static inline pud_t pud_mkhuge(pud_t pud)
+{
+ return pud_set_flags(pud, _PAGE_PSE);
+}
+
+static inline pud_t pud_mkyoung(pud_t pud)
+{
+ return pud_set_flags(pud, _PAGE_ACCESSED);
+}
+
+static inline pud_t pud_mkwrite(pud_t pud)
+{
+ return pud_set_flags(pud, _PAGE_RW);
+}
+
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+static inline int pte_soft_dirty(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_SOFT_DIRTY;
+}
+
+static inline int pmd_soft_dirty(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_SOFT_DIRTY;
+}
+
+static inline int pud_soft_dirty(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_SOFT_DIRTY;
+}
+
+static inline pte_t pte_mksoft_dirty(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_SOFT_DIRTY);
+}
+
+static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
+}
+
+static inline pud_t pud_mksoft_dirty(pud_t pud)
+{
+ return pud_set_flags(pud, _PAGE_SOFT_DIRTY);
+}
+
+static inline pte_t pte_clear_soft_dirty(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
+}
+
+static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
+}
+
+static inline pud_t pud_clear_soft_dirty(pud_t pud)
+{
+ return pud_clear_flags(pud, _PAGE_SOFT_DIRTY);
+}
+
+#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
+
+/*
+ * Mask out unsupported bits in a present pgprot. Non-present pgprots
+ * can use those bits for other purposes, so leave them be.
+ */
+static inline pgprotval_t massage_pgprot(pgprot_t pgprot)
+{
+ pgprotval_t protval = pgprot_val(pgprot);
+
+ if (protval & _PAGE_PRESENT)
+ protval &= __supported_pte_mask;
+
+ return protval;
+}
+
+static inline pgprotval_t check_pgprot(pgprot_t pgprot)
+{
+ pgprotval_t massaged_val = massage_pgprot(pgprot);
+
+ /* mmdebug.h can not be included here because of dependencies */
+#ifdef CONFIG_DEBUG_VM
+ WARN_ONCE(pgprot_val(pgprot) != massaged_val,
+ "attempted to set unsupported pgprot: %016llx "
+ "bits: %016llx supported: %016llx\n",
+ (u64)pgprot_val(pgprot),
+ (u64)pgprot_val(pgprot) ^ massaged_val,
+ (u64)__supported_pte_mask);
+#endif
+
+ return massaged_val;
+}
+
+static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
+{
+ phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
+ pfn ^= protnone_mask(pgprot_val(pgprot));
+ pfn &= PTE_PFN_MASK;
+ return __pte(pfn | check_pgprot(pgprot));
+}
+
+static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
+{
+ phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
+ pfn ^= protnone_mask(pgprot_val(pgprot));
+ pfn &= PHYSICAL_PMD_PAGE_MASK;
+ return __pmd(pfn | check_pgprot(pgprot));
+}
+
+static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot)
+{
+ phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
+ pfn ^= protnone_mask(pgprot_val(pgprot));
+ pfn &= PHYSICAL_PUD_PAGE_MASK;
+ return __pud(pfn | check_pgprot(pgprot));
+}
+
+static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+{
+ return pfn_pmd(pmd_pfn(pmd),
+ __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE)));
+}
+
+static inline pud_t pud_mknotpresent(pud_t pud)
+{
+ return pfn_pud(pud_pfn(pud),
+ __pgprot(pud_flags(pud) & ~(_PAGE_PRESENT|_PAGE_PROTNONE)));
+}
+
+static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);
+
+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+{
+ pteval_t val = pte_val(pte), oldval = val;
+
+ /*
+ * Chop off the NX bit (if present), and add the NX portion of
+ * the newprot (if present):
+ */
+ val &= _PAGE_CHG_MASK;
+ val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK;
+ val = flip_protnone_guard(oldval, val, PTE_PFN_MASK);
+ return __pte(val);
+}
+
+static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+ pmdval_t val = pmd_val(pmd), oldval = val;
+
+ val &= _HPAGE_CHG_MASK;
+ val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK;
+ val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK);
+ return __pmd(val);
+}
+
+/*
+ * mprotect needs to preserve PAT and encryption bits when updating
+ * vm_page_prot
+ */
+#define pgprot_modify pgprot_modify
+static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
+{
+ pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK;
+ pgprotval_t addbits = pgprot_val(newprot) & ~_PAGE_CHG_MASK;
+ return __pgprot(preservebits | addbits);
+}
+
+#define pte_pgprot(x) __pgprot(pte_flags(x))
+#define pmd_pgprot(x) __pgprot(pmd_flags(x))
+#define pud_pgprot(x) __pgprot(pud_flags(x))
+#define p4d_pgprot(x) __pgprot(p4d_flags(x))
+
+#define canon_pgprot(p) __pgprot(massage_pgprot(p))
+
+static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
+{
+ return canon_pgprot(prot);
+}
+
+static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
+ enum page_cache_mode pcm,
+ enum page_cache_mode new_pcm)
+{
+ /*
+ * PAT type is always WB for untracked ranges, so no need to check.
+ */
+ if (x86_platform.is_untracked_pat_range(paddr, paddr + size))
+ return 1;
+
+ /*
+ * Certain new memtypes are not allowed with certain
+ * requested memtype:
+ * - request is uncached, return cannot be write-back
+ * - request is write-combine, return cannot be write-back
+ * - request is write-through, return cannot be write-back
+ * - request is write-through, return cannot be write-combine
+ */
+ if ((pcm == _PAGE_CACHE_MODE_UC_MINUS &&
+ new_pcm == _PAGE_CACHE_MODE_WB) ||
+ (pcm == _PAGE_CACHE_MODE_WC &&
+ new_pcm == _PAGE_CACHE_MODE_WB) ||
+ (pcm == _PAGE_CACHE_MODE_WT &&
+ new_pcm == _PAGE_CACHE_MODE_WB) ||
+ (pcm == _PAGE_CACHE_MODE_WT &&
+ new_pcm == _PAGE_CACHE_MODE_WC)) {
+ return 0;
+ }
+
+ return 1;
+}
+
+pmd_t *populate_extra_pmd(unsigned long vaddr);
+pte_t *populate_extra_pte(unsigned long vaddr);
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd);
+
+/*
+ * Take a PGD location (pgdp) and a pgd value that needs to be set there.
+ * Populates the user and returns the resulting PGD that must be set in
+ * the kernel copy of the page tables.
+ */
+static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
+{
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return pgd;
+ return __pti_set_user_pgtbl(pgdp, pgd);
+}
+#else /* CONFIG_PAGE_TABLE_ISOLATION */
+static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
+{
+ return pgd;
+}
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+#endif /* __ASSEMBLY__ */
+
+
+#ifdef CONFIG_X86_32
+# include <asm/pgtable_32.h>
+#else
+# include <asm/pgtable_64.h>
+#endif
+
+#ifndef __ASSEMBLY__
+#include <linux/mm_types.h>
+#include <linux/mmdebug.h>
+#include <linux/log2.h>
+#include <asm/fixmap.h>
+
+static inline int pte_none(pte_t pte)
+{
+ return !(pte.pte & ~(_PAGE_KNL_ERRATUM_MASK));
+}
+
+#define __HAVE_ARCH_PTE_SAME
+static inline int pte_same(pte_t a, pte_t b)
+{
+ return a.pte == b.pte;
+}
+
+static inline int pte_present(pte_t a)
+{
+ return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
+}
+
+#ifdef __HAVE_ARCH_PTE_DEVMAP
+static inline int pte_devmap(pte_t a)
+{
+ return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP;
+}
+#endif
+
+#define pte_accessible pte_accessible
+static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
+{
+ if (pte_flags(a) & _PAGE_PRESENT)
+ return true;
+
+ if ((pte_flags(a) & _PAGE_PROTNONE) &&
+ mm_tlb_flush_pending(mm))
+ return true;
+
+ return false;
+}
+
+static inline int pmd_present(pmd_t pmd)
+{
+ /*
+ * Checking for _PAGE_PSE is needed too because
+ * split_huge_page will temporarily clear the present bit (but
+ * the _PAGE_PSE flag will remain set at all times while the
+ * _PAGE_PRESENT bit is clear).
+ */
+ return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * These work without NUMA balancing but the kernel does not care. See the
+ * comment in include/asm-generic/pgtable.h
+ */
+static inline int pte_protnone(pte_t pte)
+{
+ return (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT))
+ == _PAGE_PROTNONE;
+}
+
+static inline int pmd_protnone(pmd_t pmd)
+{
+ return (pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT))
+ == _PAGE_PROTNONE;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+static inline int pmd_none(pmd_t pmd)
+{
+ /* Only check low word on 32-bit platforms, since it might be
+ out of sync with upper half. */
+ unsigned long val = native_pmd_val(pmd);
+ return (val & ~_PAGE_KNL_ERRATUM_MASK) == 0;
+}
+
+static inline unsigned long pmd_page_vaddr(pmd_t pmd)
+{
+ return (unsigned long)__va(pmd_val(pmd) & pmd_pfn_mask(pmd));
+}
+
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd))
+
+/*
+ * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
+ *
+ * this macro returns the index of the entry in the pmd page which would
+ * control the given virtual address
+ */
+static inline unsigned long pmd_index(unsigned long address)
+{
+ return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
+}
+
+/*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ *
+ * (Currently stuck as a macro because of indirect forward reference
+ * to linux/mm.h:page_to_nid())
+ */
+#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
+
+/*
+ * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE]
+ *
+ * this function returns the index of the entry in the pte page which would
+ * control the given virtual address
+ */
+static inline unsigned long pte_index(unsigned long address)
+{
+ return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+}
+
+static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
+{
+ return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address);
+}
+
+static inline int pmd_bad(pmd_t pmd)
+{
+ return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
+}
+
+static inline unsigned long pages_to_mb(unsigned long npg)
+{
+ return npg >> (20 - PAGE_SHIFT);
+}
+
+#if CONFIG_PGTABLE_LEVELS > 2
+static inline int pud_none(pud_t pud)
+{
+ return (native_pud_val(pud) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0;
+}
+
+static inline int pud_present(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_PRESENT;
+}
+
+static inline unsigned long pud_page_vaddr(pud_t pud)
+{
+ return (unsigned long)__va(pud_val(pud) & pud_pfn_mask(pud));
+}
+
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define pud_page(pud) pfn_to_page(pud_pfn(pud))
+
+/* Find an entry in the second-level page table.. */
+static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
+{
+ return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address);
+}
+
+static inline int pud_large(pud_t pud)
+{
+ return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) ==
+ (_PAGE_PSE | _PAGE_PRESENT);
+}
+
+static inline int pud_bad(pud_t pud)
+{
+ return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+}
+#else
+static inline int pud_large(pud_t pud)
+{
+ return 0;
+}
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
+
+static inline unsigned long pud_index(unsigned long address)
+{
+ return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
+}
+
+#if CONFIG_PGTABLE_LEVELS > 3
+static inline int p4d_none(p4d_t p4d)
+{
+ return (native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0;
+}
+
+static inline int p4d_present(p4d_t p4d)
+{
+ return p4d_flags(p4d) & _PAGE_PRESENT;
+}
+
+static inline unsigned long p4d_page_vaddr(p4d_t p4d)
+{
+ return (unsigned long)__va(p4d_val(p4d) & p4d_pfn_mask(p4d));
+}
+
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d))
+
+/* Find an entry in the third-level page table.. */
+static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
+{
+ return (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address);
+}
+
+static inline int p4d_bad(p4d_t p4d)
+{
+ unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
+
+ if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+ ignore_flags |= _PAGE_NX;
+
+ return (p4d_flags(p4d) & ~ignore_flags) != 0;
+}
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
+
+static inline unsigned long p4d_index(unsigned long address)
+{
+ return (address >> P4D_SHIFT) & (PTRS_PER_P4D - 1);
+}
+
+#if CONFIG_PGTABLE_LEVELS > 4
+static inline int pgd_present(pgd_t pgd)
+{
+ if (!pgtable_l5_enabled())
+ return 1;
+ return pgd_flags(pgd) & _PAGE_PRESENT;
+}
+
+static inline unsigned long pgd_page_vaddr(pgd_t pgd)
+{
+ return (unsigned long)__va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK);
+}
+
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd))
+
+/* to find an entry in a page-table-directory. */
+static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
+{
+ if (!pgtable_l5_enabled())
+ return (p4d_t *)pgd;
+ return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address);
+}
+
+static inline int pgd_bad(pgd_t pgd)
+{
+ unsigned long ignore_flags = _PAGE_USER;
+
+ if (!pgtable_l5_enabled())
+ return 0;
+
+ if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+ ignore_flags |= _PAGE_NX;
+
+ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
+}
+
+static inline int pgd_none(pgd_t pgd)
+{
+ if (!pgtable_l5_enabled())
+ return 0;
+ /*
+ * There is no need to do a workaround for the KNL stray
+ * A/D bit erratum here. PGDs only point to page tables
+ * except on 32-bit non-PAE which is not supported on
+ * KNL.
+ */
+ return !native_pgd_val(pgd);
+}
+#endif /* CONFIG_PGTABLE_LEVELS > 4 */
+
+#endif /* __ASSEMBLY__ */
+
+/*
+ * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD]
+ *
+ * this macro returns the index of the entry in the pgd page which would
+ * control the given virtual address
+ */
+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
+
+/*
+ * pgd_offset() returns a (pgd_t *)
+ * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
+ */
+#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
+/*
+ * a shortcut to get a pgd_t in a given mm
+ */
+#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
+/*
+ * a shortcut which implies the use of the kernel's pgd, instead
+ * of a process's
+ */
+#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
+
+
+#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
+#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
+
+#ifndef __ASSEMBLY__
+
+extern int direct_gbpages;
+void init_mem_mapping(void);
+void early_alloc_pgt_buf(void);
+extern void memblock_find_dma_reserve(void);
+
+#ifdef CONFIG_X86_64
+/* Realmode trampoline initialization. */
+extern pgd_t trampoline_pgd_entry;
+static inline void __meminit init_trampoline_default(void)
+{
+ /* Default trampoline pgd value */
+ trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
+}
+# ifdef CONFIG_RANDOMIZE_MEMORY
+void __meminit init_trampoline(void);
+# else
+# define init_trampoline init_trampoline_default
+# endif
+#else
+static inline void init_trampoline(void) { }
+#endif
+
+/* local pte updates need not use xchg for locking */
+static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
+{
+ pte_t res = *ptep;
+
+ /* Pure native function needs no input for mm, addr */
+ native_pte_clear(NULL, 0, ptep);
+ return res;
+}
+
+static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
+{
+ pmd_t res = *pmdp;
+
+ native_pmd_clear(pmdp);
+ return res;
+}
+
+static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp)
+{
+ pud_t res = *pudp;
+
+ native_pud_clear(pudp);
+ return res;
+}
+
+static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep , pte_t pte)
+{
+ native_set_pte(ptep, pte);
+}
+
+static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd)
+{
+ native_set_pmd(pmdp, pmd);
+}
+
+static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, pud_t pud)
+{
+ native_set_pud(pudp, pud);
+}
+
+/*
+ * We only update the dirty/accessed state if we set
+ * the dirty bit by hand in the kernel, since the hardware
+ * will do the accessed bit for us, and we don't want to
+ * race with other CPU's that might be updating the dirty
+ * bit at the same time.
+ */
+struct vm_area_struct;
+
+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+extern int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty);
+
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
+
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+extern int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep);
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ pte_t pte = native_ptep_get_and_clear(ptep);
+ return pte;
+}
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
+static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ int full)
+{
+ pte_t pte;
+ if (full) {
+ /*
+ * Full address destruction in progress; paravirt does not
+ * care about updates and native needs no locking
+ */
+ pte = native_local_ptep_get_and_clear(ptep);
+ } else {
+ pte = ptep_get_and_clear(mm, addr, ptep);
+ }
+ return pte;
+}
+
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+static inline void ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
+}
+
+#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
+
+#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
+
+#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+extern int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty);
+extern int pudp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pud_t *pudp,
+ pud_t entry, int dirty);
+
+#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp);
+extern int pudp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pudp);
+
+#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
+
+
+#define pmd_write pmd_write
+static inline int pmd_write(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_RW;
+}
+
+#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp)
+{
+ return native_pmdp_get_and_clear(pmdp);
+}
+
+#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
+static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pud_t *pudp)
+{
+ return native_pudp_get_and_clear(pudp);
+}
+
+#define __HAVE_ARCH_PMDP_SET_WRPROTECT
+static inline void pmdp_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
+}
+
+#define pud_write pud_write
+static inline int pud_write(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_RW;
+}
+
+#ifndef pmdp_establish
+#define pmdp_establish pmdp_establish
+static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp, pmd_t pmd)
+{
+ if (IS_ENABLED(CONFIG_SMP)) {
+ return xchg(pmdp, pmd);
+ } else {
+ pmd_t old = *pmdp;
+ WRITE_ONCE(*pmdp, pmd);
+ return old;
+ }
+}
+#endif
+/*
+ * Page table pages are page-aligned. The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ *
+ * Returns true for parts of the PGD that map userspace and
+ * false for the parts that map the kernel.
+ */
+static inline bool pgdp_maps_userspace(void *__ptr)
+{
+ unsigned long ptr = (unsigned long)__ptr;
+
+ return (((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START);
+}
+
+static inline int pgd_large(pgd_t pgd) { return 0; }
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
+ * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
+ * the user one is in the last 4k. To switch between them, you
+ * just need to flip the 12th bit in their addresses.
+ */
+#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
+
+/*
+ * This generates better code than the inline assembly in
+ * __set_bit().
+ */
+static inline void *ptr_set_bit(void *ptr, int bit)
+{
+ unsigned long __ptr = (unsigned long)ptr;
+
+ __ptr |= BIT(bit);
+ return (void *)__ptr;
+}
+static inline void *ptr_clear_bit(void *ptr, int bit)
+{
+ unsigned long __ptr = (unsigned long)ptr;
+
+ __ptr &= ~BIT(bit);
+ return (void *)__ptr;
+}
+
+static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
+{
+ return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
+{
+ return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
+{
+ return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
+{
+ return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+/*
+ * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
+ *
+ * dst - pointer to pgd range anwhere on a pgd page
+ * src - ""
+ * count - the number of pgds to copy.
+ *
+ * dst and src can be on the same page, but the range must not overlap,
+ * and must not cross a page boundary.
+ */
+static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+{
+ memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+ /* Clone the user space pgd as well */
+ memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
+ count * sizeof(pgd_t));
+#endif
+}
+
+#define PTE_SHIFT ilog2(PTRS_PER_PTE)
+static inline int page_level_shift(enum pg_level level)
+{
+ return (PAGE_SHIFT - PTE_SHIFT) + level * PTE_SHIFT;
+}
+static inline unsigned long page_level_size(enum pg_level level)
+{
+ return 1UL << page_level_shift(level);
+}
+static inline unsigned long page_level_mask(enum pg_level level)
+{
+ return ~(page_level_size(level) - 1);
+}
+
+/*
+ * The x86 doesn't have any external MMU info: the kernel page
+ * tables contain all the necessary information.
+ */
+static inline void update_mmu_cache(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+}
+static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmd)
+{
+}
+static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pud)
+{
+}
+
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
+}
+
+static inline int pte_swp_soft_dirty(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
+}
+
+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
+}
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_SWP_SOFT_DIRTY);
+}
+
+static inline int pmd_swp_soft_dirty(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_SWP_SOFT_DIRTY;
+}
+
+static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_SWP_SOFT_DIRTY);
+}
+#endif
+#endif
+
+#define PKRU_AD_BIT 0x1u
+#define PKRU_WD_BIT 0x2u
+#define PKRU_BITS_PER_PKEY 2
+
+static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
+{
+ int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
+ return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits));
+}
+
+static inline bool __pkru_allows_write(u32 pkru, u16 pkey)
+{
+ int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
+ /*
+ * Access-disable disables writes too so we need to check
+ * both bits here.
+ */
+ return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits));
+}
+
+static inline u16 pte_flags_pkey(unsigned long pte_flags)
+{
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ /* ifdef to avoid doing 59-bit shift on 32-bit values */
+ return (pte_flags & _PAGE_PKEY_MASK) >> _PAGE_BIT_PKEY_BIT0;
+#else
+ return 0;
+#endif
+}
+
+static inline bool __pkru_allows_pkey(u16 pkey, bool write)
+{
+ u32 pkru = read_pkru();
+
+ if (!__pkru_allows_read(pkru, pkey))
+ return false;
+ if (write && !__pkru_allows_write(pkru, pkey))
+ return false;
+
+ return true;
+}
+
+/*
+ * 'pteval' can come from a PTE, PMD or PUD. We only check
+ * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
+ * same value on all 3 types.
+ */
+static inline bool __pte_access_permitted(unsigned long pteval, bool write)
+{
+ unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
+
+ if (write)
+ need_pte_bits |= _PAGE_RW;
+
+ if ((pteval & need_pte_bits) != need_pte_bits)
+ return 0;
+
+ return __pkru_allows_pkey(pte_flags_pkey(pteval), write);
+}
+
+#define pte_access_permitted pte_access_permitted
+static inline bool pte_access_permitted(pte_t pte, bool write)
+{
+ return __pte_access_permitted(pte_val(pte), write);
+}
+
+#define pmd_access_permitted pmd_access_permitted
+static inline bool pmd_access_permitted(pmd_t pmd, bool write)
+{
+ return __pte_access_permitted(pmd_val(pmd), write);
+}
+
+#define pud_access_permitted pud_access_permitted
+static inline bool pud_access_permitted(pud_t pud, bool write)
+{
+ return __pte_access_permitted(pud_val(pud), write);
+}
+
+#define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1
+extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot);
+
+static inline bool arch_has_pfn_modify_check(void)
+{
+ return boot_cpu_has_bug(X86_BUG_L1TF);
+}
+
+#include <asm-generic/pgtable.h>
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_PGTABLE_H */
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
new file mode 100644
index 000000000..71e1df860
--- /dev/null
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PGTABLE_32_H
+#define _ASM_X86_PGTABLE_32_H
+
+#include <asm/pgtable_32_types.h>
+
+/*
+ * The Linux memory management assumes a three-level page table setup. On
+ * the i386, we use that, but "fold" the mid level into the top-level page
+ * table, so that we physically have the same two-level page table as the
+ * i386 mmu expects.
+ *
+ * This file contains the functions and defines necessary to modify and use
+ * the i386 page table tree.
+ */
+#ifndef __ASSEMBLY__
+#include <asm/processor.h>
+#include <linux/threads.h>
+#include <asm/paravirt.h>
+
+#include <linux/bitops.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+struct mm_struct;
+struct vm_area_struct;
+
+extern pgd_t swapper_pg_dir[1024];
+extern pgd_t initial_page_table[1024];
+extern pmd_t initial_pg_pmd[];
+
+static inline void pgtable_cache_init(void) { }
+static inline void check_pgt_cache(void) { }
+void paging_init(void);
+void sync_initial_page_table(void);
+
+/*
+ * Define this if things work differently on an i386 and an i486:
+ * it will (on an i486) warn about kernel memory accesses that are
+ * done without a 'access_ok(VERIFY_WRITE,..)'
+ */
+#undef TEST_ACCESS_OK
+
+#ifdef CONFIG_X86_PAE
+# include <asm/pgtable-3level.h>
+#else
+# include <asm/pgtable-2level.h>
+#endif
+
+#if defined(CONFIG_HIGHPTE)
+#define pte_offset_map(dir, address) \
+ ((pte_t *)kmap_atomic(pmd_page(*(dir))) + \
+ pte_index((address)))
+#define pte_unmap(pte) kunmap_atomic((pte))
+#else
+#define pte_offset_map(dir, address) \
+ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
+#define pte_unmap(pte) do { } while (0)
+#endif
+
+/* Clear a kernel PTE and flush it from the TLB */
+#define kpte_clear_flush(ptep, vaddr) \
+do { \
+ pte_clear(&init_mm, (vaddr), (ptep)); \
+ __flush_tlb_one_kernel((vaddr)); \
+} while (0)
+
+#endif /* !__ASSEMBLY__ */
+
+/*
+ * kern_addr_valid() is (1) for FLATMEM and (0) for
+ * SPARSEMEM and DISCONTIGMEM
+ */
+#ifdef CONFIG_FLATMEM
+#define kern_addr_valid(addr) (1)
+#else
+#define kern_addr_valid(kaddr) (0)
+#endif
+
+/*
+ * This is how much memory in addition to the memory covered up to
+ * and including _end we need mapped initially.
+ * We need:
+ * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
+ * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
+ *
+ * Modulo rounding, each megabyte assigned here requires a kilobyte of
+ * memory, which is currently unreclaimed.
+ *
+ * This should be a multiple of a page.
+ *
+ * KERNEL_IMAGE_SIZE should be greater than pa(_end)
+ * and small than max_low_pfn, otherwise will waste some page table entries
+ */
+#if PTRS_PER_PMD > 1
+#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
+#else
+#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
+#endif
+
+/*
+ * Number of possible pages in the lowmem region.
+ *
+ * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a
+ * gas warning about overflowing shift count when gas has been compiled
+ * with only a host target support using a 32-bit type for internal
+ * representation.
+ */
+#define LOWMEM_PAGES ((((_ULL(2)<<31) - __PAGE_OFFSET) >> PAGE_SHIFT))
+
+#endif /* _ASM_X86_PGTABLE_32_H */
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
new file mode 100644
index 000000000..b0bc0fff5
--- /dev/null
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PGTABLE_32_DEFS_H
+#define _ASM_X86_PGTABLE_32_DEFS_H
+
+/*
+ * The Linux x86 paging architecture is 'compile-time dual-mode', it
+ * implements both the traditional 2-level x86 page tables and the
+ * newer 3-level PAE-mode page tables.
+ */
+#ifdef CONFIG_X86_PAE
+# include <asm/pgtable-3level_types.h>
+# define PMD_SIZE (1UL << PMD_SHIFT)
+# define PMD_MASK (~(PMD_SIZE - 1))
+#else
+# include <asm/pgtable-2level_types.h>
+#endif
+
+#define pgtable_l5_enabled() 0
+
+#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
+#define PGDIR_MASK (~(PGDIR_SIZE - 1))
+
+/* Just any arbitrary offset to the start of the vmalloc VM area: the
+ * current 8MB value just means that there will be a 8MB "hole" after the
+ * physical memory until the kernel virtual memory starts. That means that
+ * any out-of-bounds memory accesses will hopefully be caught.
+ * The vmalloc() routines leaves a hole of 4kB between each vmalloced
+ * area for the same reason. ;)
+ */
+#define VMALLOC_OFFSET (8 * 1024 * 1024)
+
+#ifndef __ASSEMBLY__
+extern bool __vmalloc_start_set; /* set once high_memory is set */
+#endif
+
+#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET)
+#ifdef CONFIG_X86_PAE
+#define LAST_PKMAP 512
+#else
+#define LAST_PKMAP 1024
+#endif
+
+/*
+ * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c
+ * to avoid include recursion hell
+ */
+#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40)
+
+#define CPU_ENTRY_AREA_BASE \
+ ((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) \
+ & PMD_MASK)
+
+#define LDT_BASE_ADDR \
+ ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
+
+#define LDT_END_ADDR (LDT_BASE_ADDR + PMD_SIZE)
+
+#define PKMAP_BASE \
+ ((LDT_BASE_ADDR - PAGE_SIZE) & PMD_MASK)
+
+#ifdef CONFIG_HIGHMEM
+# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
+#else
+# define VMALLOC_END (LDT_BASE_ADDR - 2 * PAGE_SIZE)
+#endif
+
+#define MODULES_VADDR VMALLOC_START
+#define MODULES_END VMALLOC_END
+#define MODULES_LEN (MODULES_VADDR - MODULES_END)
+
+#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
+
+#endif /* _ASM_X86_PGTABLE_32_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
new file mode 100644
index 000000000..9c85b54bf
--- /dev/null
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -0,0 +1,279 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PGTABLE_64_H
+#define _ASM_X86_PGTABLE_64_H
+
+#include <linux/const.h>
+#include <asm/pgtable_64_types.h>
+
+#ifndef __ASSEMBLY__
+
+/*
+ * This file contains the functions and defines necessary to modify and use
+ * the x86-64 page table tree.
+ */
+#include <asm/processor.h>
+#include <linux/bitops.h>
+#include <linux/threads.h>
+#include <asm/fixmap.h>
+
+extern p4d_t level4_kernel_pgt[512];
+extern p4d_t level4_ident_pgt[512];
+extern pud_t level3_kernel_pgt[512];
+extern pud_t level3_ident_pgt[512];
+extern pmd_t level2_kernel_pgt[512];
+extern pmd_t level2_fixmap_pgt[512];
+extern pmd_t level2_ident_pgt[512];
+extern pte_t level1_fixmap_pgt[512 * FIXMAP_PMD_NUM];
+extern pgd_t init_top_pgt[];
+
+#define swapper_pg_dir init_top_pgt
+
+extern void paging_init(void);
+static inline void sync_initial_page_table(void) { }
+
+#define pte_ERROR(e) \
+ pr_err("%s:%d: bad pte %p(%016lx)\n", \
+ __FILE__, __LINE__, &(e), pte_val(e))
+#define pmd_ERROR(e) \
+ pr_err("%s:%d: bad pmd %p(%016lx)\n", \
+ __FILE__, __LINE__, &(e), pmd_val(e))
+#define pud_ERROR(e) \
+ pr_err("%s:%d: bad pud %p(%016lx)\n", \
+ __FILE__, __LINE__, &(e), pud_val(e))
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+#define p4d_ERROR(e) \
+ pr_err("%s:%d: bad p4d %p(%016lx)\n", \
+ __FILE__, __LINE__, &(e), p4d_val(e))
+#endif
+
+#define pgd_ERROR(e) \
+ pr_err("%s:%d: bad pgd %p(%016lx)\n", \
+ __FILE__, __LINE__, &(e), pgd_val(e))
+
+struct mm_struct;
+
+void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte);
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
+
+static inline void native_set_pte(pte_t *ptep, pte_t pte)
+{
+ WRITE_ONCE(*ptep, pte);
+}
+
+static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ native_set_pte(ptep, native_make_pte(0));
+}
+
+static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ native_set_pte(ptep, pte);
+}
+
+static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ WRITE_ONCE(*pmdp, pmd);
+}
+
+static inline void native_pmd_clear(pmd_t *pmd)
+{
+ native_set_pmd(pmd, native_make_pmd(0));
+}
+
+static inline pte_t native_ptep_get_and_clear(pte_t *xp)
+{
+#ifdef CONFIG_SMP
+ return native_make_pte(xchg(&xp->pte, 0));
+#else
+ /* native_local_ptep_get_and_clear,
+ but duplicated because of cyclic dependency */
+ pte_t ret = *xp;
+ native_pte_clear(NULL, 0, xp);
+ return ret;
+#endif
+}
+
+static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
+{
+#ifdef CONFIG_SMP
+ return native_make_pmd(xchg(&xp->pmd, 0));
+#else
+ /* native_local_pmdp_get_and_clear,
+ but duplicated because of cyclic dependency */
+ pmd_t ret = *xp;
+ native_pmd_clear(xp);
+ return ret;
+#endif
+}
+
+static inline void native_set_pud(pud_t *pudp, pud_t pud)
+{
+ WRITE_ONCE(*pudp, pud);
+}
+
+static inline void native_pud_clear(pud_t *pud)
+{
+ native_set_pud(pud, native_make_pud(0));
+}
+
+static inline pud_t native_pudp_get_and_clear(pud_t *xp)
+{
+#ifdef CONFIG_SMP
+ return native_make_pud(xchg(&xp->pud, 0));
+#else
+ /* native_local_pudp_get_and_clear,
+ * but duplicated because of cyclic dependency
+ */
+ pud_t ret = *xp;
+
+ native_pud_clear(xp);
+ return ret;
+#endif
+}
+
+static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
+{
+ pgd_t pgd;
+
+ if (pgtable_l5_enabled() || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) {
+ WRITE_ONCE(*p4dp, p4d);
+ return;
+ }
+
+ pgd = native_make_pgd(native_p4d_val(p4d));
+ pgd = pti_set_user_pgtbl((pgd_t *)p4dp, pgd);
+ WRITE_ONCE(*p4dp, native_make_p4d(native_pgd_val(pgd)));
+}
+
+static inline void native_p4d_clear(p4d_t *p4d)
+{
+ native_set_p4d(p4d, native_make_p4d(0));
+}
+
+static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ WRITE_ONCE(*pgdp, pti_set_user_pgtbl(pgdp, pgd));
+}
+
+static inline void native_pgd_clear(pgd_t *pgd)
+{
+ native_set_pgd(pgd, native_make_pgd(0));
+}
+
+extern void sync_global_pgds(unsigned long start, unsigned long end);
+
+/*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ */
+
+/*
+ * Level 4 access.
+ */
+#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
+
+/* PUD - Level3 access */
+
+/* PMD - Level 2 access */
+
+/* PTE - Level 1 access. */
+
+/* x86-64 always has all page tables mapped. */
+#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
+#define pte_unmap(pte) ((void)(pte))/* NOP */
+
+/*
+ * Encode and de-code a swap entry
+ *
+ * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number
+ * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
+ * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry
+ *
+ * G (8) is aliased and used as a PROT_NONE indicator for
+ * !present ptes. We need to start storing swap entries above
+ * there. We also need to avoid using A and D because of an
+ * erratum where they can be incorrectly set by hardware on
+ * non-present PTEs.
+ *
+ * SD (1) in swp entry is used to store soft dirty bit, which helps us
+ * remember soft dirty over page migration
+ *
+ * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
+ * but also L and G.
+ *
+ * The offset is inverted by a binary not operation to make the high
+ * physical bits set.
+ */
+#define SWP_TYPE_BITS 5
+
+#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
+
+/* We always extract/encode the offset by shifting it all the way up, and then down again */
+#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS)
+
+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
+
+/* Extract the high bits for type */
+#define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS))
+
+/* Shift up (to get rid of type), then down to get value */
+#define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)
+
+/*
+ * Shift the offset up "too far" by TYPE bits, then down again
+ * The offset is inverted by a binary not operation to make the high
+ * physical bits set.
+ */
+#define __swp_entry(type, offset) ((swp_entry_t) { \
+ (~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
+ | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) })
+
+#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
+#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) })
+#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
+#define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val })
+
+extern int kern_addr_valid(unsigned long addr);
+extern void cleanup_highmap(void);
+
+#define HAVE_ARCH_UNMAPPED_AREA
+#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+
+#define pgtable_cache_init() do { } while (0)
+#define check_pgt_cache() do { } while (0)
+
+#define PAGE_AGP PAGE_KERNEL_NOCACHE
+#define HAVE_PAGE_AGP 1
+
+/* fs/proc/kcore.c */
+#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
+#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
+
+#define __HAVE_ARCH_PTE_SAME
+
+#define vmemmap ((struct page *)VMEMMAP_START)
+
+extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
+extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
+
+#define gup_fast_permitted gup_fast_permitted
+static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
+ int write)
+{
+ unsigned long len, end;
+
+ len = (unsigned long)nr_pages << PAGE_SHIFT;
+ end = start + len;
+ if (end < start)
+ return false;
+ if (end >> __VIRTUAL_MASK_SHIFT)
+ return false;
+ return true;
+}
+
+#include <asm/pgtable-invert.h>
+
+#endif /* !__ASSEMBLY__ */
+#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
new file mode 100644
index 000000000..88bca456d
--- /dev/null
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -0,0 +1,162 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PGTABLE_64_DEFS_H
+#define _ASM_X86_PGTABLE_64_DEFS_H
+
+#include <asm/sparsemem.h>
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#include <asm/kaslr.h>
+
+/*
+ * These are used to make use of C type-checking..
+ */
+typedef unsigned long pteval_t;
+typedef unsigned long pmdval_t;
+typedef unsigned long pudval_t;
+typedef unsigned long p4dval_t;
+typedef unsigned long pgdval_t;
+typedef unsigned long pgprotval_t;
+
+typedef struct { pteval_t pte; } pte_t;
+
+#ifdef CONFIG_X86_5LEVEL
+extern unsigned int __pgtable_l5_enabled;
+
+#ifdef USE_EARLY_PGTABLE_L5
+/*
+ * cpu_feature_enabled() is not available in early boot code.
+ * Use variable instead.
+ */
+static inline bool pgtable_l5_enabled(void)
+{
+ return __pgtable_l5_enabled;
+}
+#else
+#define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57)
+#endif /* USE_EARLY_PGTABLE_L5 */
+
+#else
+#define pgtable_l5_enabled() 0
+#endif /* CONFIG_X86_5LEVEL */
+
+extern unsigned int pgdir_shift;
+extern unsigned int ptrs_per_p4d;
+
+#endif /* !__ASSEMBLY__ */
+
+#define SHARED_KERNEL_PMD 0
+
+#ifdef CONFIG_X86_5LEVEL
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT pgdir_shift
+#define PTRS_PER_PGD 512
+
+/*
+ * 4th level page in 5-level paging case
+ */
+#define P4D_SHIFT 39
+#define MAX_PTRS_PER_P4D 512
+#define PTRS_PER_P4D ptrs_per_p4d
+#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT)
+#define P4D_MASK (~(P4D_SIZE - 1))
+
+#define MAX_POSSIBLE_PHYSMEM_BITS 52
+
+#else /* CONFIG_X86_5LEVEL */
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT 39
+#define PTRS_PER_PGD 512
+#define MAX_PTRS_PER_P4D 1
+
+#endif /* CONFIG_X86_5LEVEL */
+
+/*
+ * 3rd level page
+ */
+#define PUD_SHIFT 30
+#define PTRS_PER_PUD 512
+
+/*
+ * PMD_SHIFT determines the size of the area a middle-level
+ * page table can map
+ */
+#define PMD_SHIFT 21
+#define PTRS_PER_PMD 512
+
+/*
+ * entries per page directory level
+ */
+#define PTRS_PER_PTE 512
+
+#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
+#define PMD_MASK (~(PMD_SIZE - 1))
+#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
+#define PUD_MASK (~(PUD_SIZE - 1))
+#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
+#define PGDIR_MASK (~(PGDIR_SIZE - 1))
+
+/*
+ * See Documentation/x86/x86_64/mm.txt for a description of the memory map.
+ *
+ * Be very careful vs. KASLR when changing anything here. The KASLR address
+ * range must not overlap with anything except the KASAN shadow area, which
+ * is correct as KASAN disables KASLR.
+ */
+#define MAXMEM (1UL << MAX_PHYSMEM_BITS)
+
+#define GUARD_HOLE_PGD_ENTRY -256UL
+#define GUARD_HOLE_SIZE (16UL << PGDIR_SHIFT)
+#define GUARD_HOLE_BASE_ADDR (GUARD_HOLE_PGD_ENTRY << PGDIR_SHIFT)
+#define GUARD_HOLE_END_ADDR (GUARD_HOLE_BASE_ADDR + GUARD_HOLE_SIZE)
+
+#define LDT_PGD_ENTRY -240UL
+#define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
+#define LDT_END_ADDR (LDT_BASE_ADDR + PGDIR_SIZE)
+
+#define __VMALLOC_BASE_L4 0xffffc90000000000UL
+#define __VMALLOC_BASE_L5 0xffa0000000000000UL
+
+#define VMALLOC_SIZE_TB_L4 32UL
+#define VMALLOC_SIZE_TB_L5 12800UL
+
+#define __VMEMMAP_BASE_L4 0xffffea0000000000UL
+#define __VMEMMAP_BASE_L5 0xffd4000000000000UL
+
+#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
+# define VMALLOC_START vmalloc_base
+# define VMALLOC_SIZE_TB (pgtable_l5_enabled() ? VMALLOC_SIZE_TB_L5 : VMALLOC_SIZE_TB_L4)
+# define VMEMMAP_START vmemmap_base
+#else
+# define VMALLOC_START __VMALLOC_BASE_L4
+# define VMALLOC_SIZE_TB VMALLOC_SIZE_TB_L4
+# define VMEMMAP_START __VMEMMAP_BASE_L4
+#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
+
+#define VMALLOC_END (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1)
+
+#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
+/* The module sections ends with the start of the fixmap */
+#define MODULES_END _AC(0xffffffffff000000, UL)
+#define MODULES_LEN (MODULES_END - MODULES_VADDR)
+
+#define ESPFIX_PGD_ENTRY _AC(-2, UL)
+#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
+
+#define CPU_ENTRY_AREA_PGD _AC(-4, UL)
+#define CPU_ENTRY_AREA_BASE (CPU_ENTRY_AREA_PGD << P4D_SHIFT)
+
+#define EFI_VA_START ( -4 * (_AC(1, UL) << 30))
+#define EFI_VA_END (-68 * (_AC(1, UL) << 30))
+
+#define EARLY_DYNAMIC_PAGE_TABLES 64
+
+#define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t))
+
+#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
new file mode 100644
index 000000000..02806d95a
--- /dev/null
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -0,0 +1,572 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PGTABLE_DEFS_H
+#define _ASM_X86_PGTABLE_DEFS_H
+
+#include <linux/const.h>
+#include <linux/mem_encrypt.h>
+
+#include <asm/page_types.h>
+
+#define FIRST_USER_ADDRESS 0UL
+
+#define _PAGE_BIT_PRESENT 0 /* is present */
+#define _PAGE_BIT_RW 1 /* writeable */
+#define _PAGE_BIT_USER 2 /* userspace addressable */
+#define _PAGE_BIT_PWT 3 /* page write through */
+#define _PAGE_BIT_PCD 4 /* page cache disabled */
+#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
+#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
+#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
+#define _PAGE_BIT_PAT 7 /* on 4KB pages */
+#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
+#define _PAGE_BIT_SOFTW1 9 /* available for programmer */
+#define _PAGE_BIT_SOFTW2 10 /* " */
+#define _PAGE_BIT_SOFTW3 11 /* " */
+#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
+#define _PAGE_BIT_SOFTW4 58 /* available for programmer */
+#define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */
+#define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */
+#define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */
+#define _PAGE_BIT_PKEY_BIT3 62 /* Protection Keys, bit 4/4 */
+#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+
+#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
+#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
+#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
+#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
+
+/* If _PAGE_BIT_PRESENT is clear, we use these: */
+/* - if the user mapped it with PROT_NONE; pte_present gives true */
+#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
+
+#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
+#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
+#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
+#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
+#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
+#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
+#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
+#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
+#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
+#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
+#define _PAGE_SOFTW3 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW3)
+#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
+#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
+#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
+#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+#define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
+#define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
+#define _PAGE_PKEY_BIT2 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT2)
+#define _PAGE_PKEY_BIT3 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT3)
+#else
+#define _PAGE_PKEY_BIT0 (_AT(pteval_t, 0))
+#define _PAGE_PKEY_BIT1 (_AT(pteval_t, 0))
+#define _PAGE_PKEY_BIT2 (_AT(pteval_t, 0))
+#define _PAGE_PKEY_BIT3 (_AT(pteval_t, 0))
+#endif
+
+#define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \
+ _PAGE_PKEY_BIT1 | \
+ _PAGE_PKEY_BIT2 | \
+ _PAGE_PKEY_BIT3)
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED)
+#else
+#define _PAGE_KNL_ERRATUM_MASK 0
+#endif
+
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
+#else
+#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0))
+#endif
+
+/*
+ * Tracking soft dirty bit when a page goes to a swap is tricky.
+ * We need a bit which can be stored in pte _and_ not conflict
+ * with swap entry format. On x86 bits 1-4 are *not* involved
+ * into swap entry computation, but bit 7 is used for thp migration,
+ * so we borrow bit 1 for soft dirty tracking.
+ *
+ * Please note that this bit must be treated as swap dirty page
+ * mark if and only if the PTE/PMD has present bit clear!
+ */
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define _PAGE_SWP_SOFT_DIRTY _PAGE_RW
+#else
+#define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0))
+#endif
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
+#define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP)
+#define __HAVE_ARCH_PTE_DEVMAP
+#else
+#define _PAGE_NX (_AT(pteval_t, 0))
+#define _PAGE_DEVMAP (_AT(pteval_t, 0))
+#endif
+
+#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+
+#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\
+ _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _KERNPG_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | \
+ _PAGE_ACCESSED | _PAGE_DIRTY)
+
+/*
+ * Set of bits not changed in pte_modify. The pte's
+ * protection key is treated like _PAGE_RW, for
+ * instance, and is *not* included in this mask since
+ * pte_modify() does modify it.
+ */
+#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
+ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
+ _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC)
+#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
+
+/*
+ * The cache modes defined here are used to translate between pure SW usage
+ * and the HW defined cache mode bits and/or PAT entries.
+ *
+ * The resulting bits for PWT, PCD and PAT should be chosen in a way
+ * to have the WB mode at index 0 (all bits clear). This is the default
+ * right now and likely would break too much if changed.
+ */
+#ifndef __ASSEMBLY__
+enum page_cache_mode {
+ _PAGE_CACHE_MODE_WB = 0,
+ _PAGE_CACHE_MODE_WC = 1,
+ _PAGE_CACHE_MODE_UC_MINUS = 2,
+ _PAGE_CACHE_MODE_UC = 3,
+ _PAGE_CACHE_MODE_WT = 4,
+ _PAGE_CACHE_MODE_WP = 5,
+ _PAGE_CACHE_MODE_NUM = 8
+};
+#endif
+
+#define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
+#define _PAGE_LARGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT_LARGE)
+#define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC))
+#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP))
+
+#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
+#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+
+#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
+ _PAGE_USER | _PAGE_ACCESSED)
+#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED)
+#define PAGE_COPY PAGE_COPY_NOEXEC
+#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
+ _PAGE_ACCESSED)
+
+#define __PAGE_KERNEL_EXEC \
+ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL)
+#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
+
+#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
+#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
+#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_NOCACHE)
+#define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER)
+#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
+#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
+#define __PAGE_KERNEL_WP (__PAGE_KERNEL | _PAGE_CACHE_WP)
+
+#define __PAGE_KERNEL_IO (__PAGE_KERNEL)
+#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE)
+
+#ifndef __ASSEMBLY__
+
+#define _PAGE_ENC (_AT(pteval_t, sme_me_mask))
+
+#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
+ _PAGE_DIRTY | _PAGE_ENC)
+#define _PAGE_TABLE (_KERNPG_TABLE | _PAGE_USER)
+
+#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC)
+#define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC)
+
+#define __PAGE_KERNEL_NOENC (__PAGE_KERNEL)
+#define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP)
+
+#define default_pgprot(x) __pgprot((x) & __default_kernel_pte_mask)
+
+#define PAGE_KERNEL default_pgprot(__PAGE_KERNEL | _PAGE_ENC)
+#define PAGE_KERNEL_NOENC default_pgprot(__PAGE_KERNEL)
+#define PAGE_KERNEL_RO default_pgprot(__PAGE_KERNEL_RO | _PAGE_ENC)
+#define PAGE_KERNEL_EXEC default_pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC)
+#define PAGE_KERNEL_EXEC_NOENC default_pgprot(__PAGE_KERNEL_EXEC)
+#define PAGE_KERNEL_RX default_pgprot(__PAGE_KERNEL_RX | _PAGE_ENC)
+#define PAGE_KERNEL_NOCACHE default_pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC)
+#define PAGE_KERNEL_LARGE default_pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC)
+#define PAGE_KERNEL_LARGE_EXEC default_pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC)
+#define PAGE_KERNEL_VVAR default_pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC)
+
+#define PAGE_KERNEL_IO default_pgprot(__PAGE_KERNEL_IO)
+#define PAGE_KERNEL_IO_NOCACHE default_pgprot(__PAGE_KERNEL_IO_NOCACHE)
+
+#endif /* __ASSEMBLY__ */
+
+/* xwr */
+#define __P000 PAGE_NONE
+#define __P001 PAGE_READONLY
+#define __P010 PAGE_COPY
+#define __P011 PAGE_COPY
+#define __P100 PAGE_READONLY_EXEC
+#define __P101 PAGE_READONLY_EXEC
+#define __P110 PAGE_COPY_EXEC
+#define __P111 PAGE_COPY_EXEC
+
+#define __S000 PAGE_NONE
+#define __S001 PAGE_READONLY
+#define __S010 PAGE_SHARED
+#define __S011 PAGE_SHARED
+#define __S100 PAGE_READONLY_EXEC
+#define __S101 PAGE_READONLY_EXEC
+#define __S110 PAGE_SHARED_EXEC
+#define __S111 PAGE_SHARED_EXEC
+
+/*
+ * early identity mapping pte attrib macros.
+ */
+#ifdef CONFIG_X86_64
+#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
+#else
+#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */
+#define PDE_IDENT_ATTR 0x063 /* PRESENT+RW+DIRTY+ACCESSED */
+#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
+#endif
+
+#ifdef CONFIG_X86_32
+# include <asm/pgtable_32_types.h>
+#else
+# include <asm/pgtable_64_types.h>
+#endif
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */
+#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
+
+/*
+ * Extracts the flags from a (pte|pmd|pud|pgd)val_t
+ * This includes the protection key value.
+ */
+#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
+
+typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
+
+typedef struct { pgdval_t pgd; } pgd_t;
+
+#ifdef CONFIG_X86_PAE
+
+/*
+ * PHYSICAL_PAGE_MASK might be non-constant when SME is compiled in, so we can't
+ * use it here.
+ */
+
+#define PGD_PAE_PAGE_MASK ((signed long)PAGE_MASK)
+#define PGD_PAE_PHYS_MASK (((1ULL << __PHYSICAL_MASK_SHIFT)-1) & PGD_PAE_PAGE_MASK)
+
+/*
+ * PAE allows Base Address, P, PWT, PCD and AVL bits to be set in PGD entries.
+ * All other bits are Reserved MBZ
+ */
+#define PGD_ALLOWED_BITS (PGD_PAE_PHYS_MASK | _PAGE_PRESENT | \
+ _PAGE_PWT | _PAGE_PCD | \
+ _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3)
+
+#else
+/* No need to mask any bits for !PAE */
+#define PGD_ALLOWED_BITS (~0ULL)
+#endif
+
+static inline pgd_t native_make_pgd(pgdval_t val)
+{
+ return (pgd_t) { val & PGD_ALLOWED_BITS };
+}
+
+static inline pgdval_t native_pgd_val(pgd_t pgd)
+{
+ return pgd.pgd & PGD_ALLOWED_BITS;
+}
+
+static inline pgdval_t pgd_flags(pgd_t pgd)
+{
+ return native_pgd_val(pgd) & PTE_FLAGS_MASK;
+}
+
+#if CONFIG_PGTABLE_LEVELS > 4
+typedef struct { p4dval_t p4d; } p4d_t;
+
+static inline p4d_t native_make_p4d(pudval_t val)
+{
+ return (p4d_t) { val };
+}
+
+static inline p4dval_t native_p4d_val(p4d_t p4d)
+{
+ return p4d.p4d;
+}
+#else
+#include <asm-generic/pgtable-nop4d.h>
+
+static inline p4d_t native_make_p4d(pudval_t val)
+{
+ return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) };
+}
+
+static inline p4dval_t native_p4d_val(p4d_t p4d)
+{
+ return native_pgd_val(p4d.pgd);
+}
+#endif
+
+#if CONFIG_PGTABLE_LEVELS > 3
+typedef struct { pudval_t pud; } pud_t;
+
+static inline pud_t native_make_pud(pmdval_t val)
+{
+ return (pud_t) { val };
+}
+
+static inline pudval_t native_pud_val(pud_t pud)
+{
+ return pud.pud;
+}
+#else
+#include <asm-generic/pgtable-nopud.h>
+
+static inline pud_t native_make_pud(pudval_t val)
+{
+ return (pud_t) { .p4d.pgd = native_make_pgd(val) };
+}
+
+static inline pudval_t native_pud_val(pud_t pud)
+{
+ return native_pgd_val(pud.p4d.pgd);
+}
+#endif
+
+#if CONFIG_PGTABLE_LEVELS > 2
+typedef struct { pmdval_t pmd; } pmd_t;
+
+static inline pmd_t native_make_pmd(pmdval_t val)
+{
+ return (pmd_t) { val };
+}
+
+static inline pmdval_t native_pmd_val(pmd_t pmd)
+{
+ return pmd.pmd;
+}
+#else
+#include <asm-generic/pgtable-nopmd.h>
+
+static inline pmd_t native_make_pmd(pmdval_t val)
+{
+ return (pmd_t) { .pud.p4d.pgd = native_make_pgd(val) };
+}
+
+static inline pmdval_t native_pmd_val(pmd_t pmd)
+{
+ return native_pgd_val(pmd.pud.p4d.pgd);
+}
+#endif
+
+static inline p4dval_t p4d_pfn_mask(p4d_t p4d)
+{
+ /* No 512 GiB huge pages yet */
+ return PTE_PFN_MASK;
+}
+
+static inline p4dval_t p4d_flags_mask(p4d_t p4d)
+{
+ return ~p4d_pfn_mask(p4d);
+}
+
+static inline p4dval_t p4d_flags(p4d_t p4d)
+{
+ return native_p4d_val(p4d) & p4d_flags_mask(p4d);
+}
+
+static inline pudval_t pud_pfn_mask(pud_t pud)
+{
+ if (native_pud_val(pud) & _PAGE_PSE)
+ return PHYSICAL_PUD_PAGE_MASK;
+ else
+ return PTE_PFN_MASK;
+}
+
+static inline pudval_t pud_flags_mask(pud_t pud)
+{
+ return ~pud_pfn_mask(pud);
+}
+
+static inline pudval_t pud_flags(pud_t pud)
+{
+ return native_pud_val(pud) & pud_flags_mask(pud);
+}
+
+static inline pmdval_t pmd_pfn_mask(pmd_t pmd)
+{
+ if (native_pmd_val(pmd) & _PAGE_PSE)
+ return PHYSICAL_PMD_PAGE_MASK;
+ else
+ return PTE_PFN_MASK;
+}
+
+static inline pmdval_t pmd_flags_mask(pmd_t pmd)
+{
+ return ~pmd_pfn_mask(pmd);
+}
+
+static inline pmdval_t pmd_flags(pmd_t pmd)
+{
+ return native_pmd_val(pmd) & pmd_flags_mask(pmd);
+}
+
+static inline pte_t native_make_pte(pteval_t val)
+{
+ return (pte_t) { .pte = val };
+}
+
+static inline pteval_t native_pte_val(pte_t pte)
+{
+ return pte.pte;
+}
+
+static inline pteval_t pte_flags(pte_t pte)
+{
+ return native_pte_val(pte) & PTE_FLAGS_MASK;
+}
+
+#define pgprot_val(x) ((x).pgprot)
+#define __pgprot(x) ((pgprot_t) { (x) } )
+
+extern uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM];
+extern uint8_t __pte2cachemode_tbl[8];
+
+#define __pte2cm_idx(cb) \
+ ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) | \
+ (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) | \
+ (((cb) >> _PAGE_BIT_PWT) & 1))
+#define __cm_idx2pte(i) \
+ ((((i) & 4) << (_PAGE_BIT_PAT - 2)) | \
+ (((i) & 2) << (_PAGE_BIT_PCD - 1)) | \
+ (((i) & 1) << _PAGE_BIT_PWT))
+
+static inline unsigned long cachemode2protval(enum page_cache_mode pcm)
+{
+ if (likely(pcm == 0))
+ return 0;
+ return __cachemode2pte_tbl[pcm];
+}
+static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
+{
+ return __pgprot(cachemode2protval(pcm));
+}
+static inline enum page_cache_mode pgprot2cachemode(pgprot_t pgprot)
+{
+ unsigned long masked;
+
+ masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK;
+ if (likely(masked == 0))
+ return 0;
+ return __pte2cachemode_tbl[__pte2cm_idx(masked)];
+}
+static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot)
+{
+ pgprotval_t val = pgprot_val(pgprot);
+ pgprot_t new;
+
+ pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
+ ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
+ return new;
+}
+static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot)
+{
+ pgprotval_t val = pgprot_val(pgprot);
+ pgprot_t new;
+
+ pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
+ ((val & _PAGE_PAT_LARGE) >>
+ (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
+ return new;
+}
+
+
+typedef struct page *pgtable_t;
+
+extern pteval_t __supported_pte_mask;
+extern pteval_t __default_kernel_pte_mask;
+extern void set_nx(void);
+extern int nx_enabled;
+
+#define pgprot_writecombine pgprot_writecombine
+extern pgprot_t pgprot_writecombine(pgprot_t prot);
+
+#define pgprot_writethrough pgprot_writethrough
+extern pgprot_t pgprot_writethrough(pgprot_t prot);
+
+/* Indicate that x86 has its own track and untrack pfn vma functions */
+#define __HAVE_PFNMAP_TRACKING
+
+#define __HAVE_PHYS_MEM_ACCESS_PROT
+struct file;
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t vma_prot);
+
+/* Install a pte for a particular vaddr in kernel space. */
+void set_pte_vaddr(unsigned long vaddr, pte_t pte);
+
+#ifdef CONFIG_X86_32
+extern void native_pagetable_init(void);
+#else
+#define native_pagetable_init paging_init
+#endif
+
+struct seq_file;
+extern void arch_report_meminfo(struct seq_file *m);
+
+enum pg_level {
+ PG_LEVEL_NONE,
+ PG_LEVEL_4K,
+ PG_LEVEL_2M,
+ PG_LEVEL_1G,
+ PG_LEVEL_512G,
+ PG_LEVEL_NUM
+};
+
+#ifdef CONFIG_PROC_FS
+extern void update_page_count(int level, unsigned long pages);
+#else
+static inline void update_page_count(int level, unsigned long pages) { }
+#endif
+
+/*
+ * Helper function that returns the kernel pagetable entry controlling
+ * the virtual address 'address'. NULL means no pagetable entry present.
+ * NOTE: the return type is pte_t but if the pmd is PSE then we return it
+ * as a pte too.
+ */
+extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
+ unsigned int *level);
+extern pmd_t *lookup_pmd_address(unsigned long address);
+extern phys_addr_t slow_virt_to_phys(void *__address);
+extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
+ unsigned numpages, unsigned long page_flags);
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_PGTABLE_DEFS_H */
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
new file mode 100644
index 000000000..2ff9b9881
--- /dev/null
+++ b/arch/x86/include/asm/pkeys.h
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PKEYS_H
+#define _ASM_X86_PKEYS_H
+
+#define ARCH_DEFAULT_PKEY 0
+
+/*
+ * If more than 16 keys are ever supported, a thorough audit
+ * will be necessary to ensure that the types that store key
+ * numbers and masks have sufficient capacity.
+ */
+#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1)
+
+extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+ unsigned long init_val);
+
+static inline bool arch_pkeys_enabled(void)
+{
+ return boot_cpu_has(X86_FEATURE_OSPKE);
+}
+
+/*
+ * Try to dedicate one of the protection keys to be used as an
+ * execute-only protection key.
+ */
+extern int __execute_only_pkey(struct mm_struct *mm);
+static inline int execute_only_pkey(struct mm_struct *mm)
+{
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return ARCH_DEFAULT_PKEY;
+
+ return __execute_only_pkey(mm);
+}
+
+extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma,
+ int prot, int pkey);
+static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
+ int prot, int pkey)
+{
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return 0;
+
+ return __arch_override_mprotect_pkey(vma, prot, pkey);
+}
+
+extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+ unsigned long init_val);
+
+#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3)
+
+#define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map)
+#define mm_set_pkey_allocated(mm, pkey) do { \
+ mm_pkey_allocation_map(mm) |= (1U << pkey); \
+} while (0)
+#define mm_set_pkey_free(mm, pkey) do { \
+ mm_pkey_allocation_map(mm) &= ~(1U << pkey); \
+} while (0)
+
+static inline
+bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
+{
+ /*
+ * "Allocated" pkeys are those that have been returned
+ * from pkey_alloc() or pkey 0 which is allocated
+ * implicitly when the mm is created.
+ */
+ if (pkey < 0)
+ return false;
+ if (pkey >= arch_max_pkey())
+ return false;
+ /*
+ * The exec-only pkey is set in the allocation map, but
+ * is not available to any of the user interfaces like
+ * mprotect_pkey().
+ */
+ if (pkey == mm->context.execute_only_pkey)
+ return false;
+
+ return mm_pkey_allocation_map(mm) & (1U << pkey);
+}
+
+/*
+ * Returns a positive, 4-bit key on success, or -1 on failure.
+ */
+static inline
+int mm_pkey_alloc(struct mm_struct *mm)
+{
+ /*
+ * Note: this is the one and only place we make sure
+ * that the pkey is valid as far as the hardware is
+ * concerned. The rest of the kernel trusts that
+ * only good, valid pkeys come out of here.
+ */
+ u16 all_pkeys_mask = ((1U << arch_max_pkey()) - 1);
+ int ret;
+
+ /*
+ * Are we out of pkeys? We must handle this specially
+ * because ffz() behavior is undefined if there are no
+ * zeros.
+ */
+ if (mm_pkey_allocation_map(mm) == all_pkeys_mask)
+ return -1;
+
+ ret = ffz(mm_pkey_allocation_map(mm));
+
+ mm_set_pkey_allocated(mm, ret);
+
+ return ret;
+}
+
+static inline
+int mm_pkey_free(struct mm_struct *mm, int pkey)
+{
+ if (!mm_pkey_is_allocated(mm, pkey))
+ return -EINVAL;
+
+ mm_set_pkey_free(mm, pkey);
+
+ return 0;
+}
+
+extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+ unsigned long init_val);
+extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+ unsigned long init_val);
+extern void copy_init_pkru_to_fpregs(void);
+
+static inline int vma_pkey(struct vm_area_struct *vma)
+{
+ unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
+ VM_PKEY_BIT2 | VM_PKEY_BIT3;
+
+ return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
+}
+
+#endif /*_ASM_X86_PKEYS_H */
diff --git a/arch/x86/include/asm/platform_sst_audio.h b/arch/x86/include/asm/platform_sst_audio.h
new file mode 100644
index 000000000..059823bb8
--- /dev/null
+++ b/arch/x86/include/asm/platform_sst_audio.h
@@ -0,0 +1,142 @@
+/*
+ * platform_sst_audio.h: sst audio platform data header file
+ *
+ * Copyright (C) 2012-14 Intel Corporation
+ * Author: Jeeja KP <jeeja.kp@intel.com>
+ * Omair Mohammed Abdullah <omair.m.abdullah@intel.com>
+ * Vinod Koul ,vinod.koul@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#ifndef _PLATFORM_SST_AUDIO_H_
+#define _PLATFORM_SST_AUDIO_H_
+
+#include <linux/sfi.h>
+
+#define MAX_NUM_STREAMS_MRFLD 25
+#define MAX_NUM_STREAMS MAX_NUM_STREAMS_MRFLD
+
+enum sst_audio_task_id_mrfld {
+ SST_TASK_ID_NONE = 0,
+ SST_TASK_ID_SBA = 1,
+ SST_TASK_ID_MEDIA = 3,
+ SST_TASK_ID_MAX = SST_TASK_ID_MEDIA,
+};
+
+/* Device IDs for Merrifield are Pipe IDs,
+ * ref: DSP spec v0.75 */
+enum sst_audio_device_id_mrfld {
+ /* Output pipeline IDs */
+ PIPE_ID_OUT_START = 0x0,
+ PIPE_CODEC_OUT0 = 0x2,
+ PIPE_CODEC_OUT1 = 0x3,
+ PIPE_SPROT_LOOP_OUT = 0x4,
+ PIPE_MEDIA_LOOP1_OUT = 0x5,
+ PIPE_MEDIA_LOOP2_OUT = 0x6,
+ PIPE_VOIP_OUT = 0xC,
+ PIPE_PCM0_OUT = 0xD,
+ PIPE_PCM1_OUT = 0xE,
+ PIPE_PCM2_OUT = 0xF,
+ PIPE_MEDIA0_OUT = 0x12,
+ PIPE_MEDIA1_OUT = 0x13,
+/* Input Pipeline IDs */
+ PIPE_ID_IN_START = 0x80,
+ PIPE_CODEC_IN0 = 0x82,
+ PIPE_CODEC_IN1 = 0x83,
+ PIPE_SPROT_LOOP_IN = 0x84,
+ PIPE_MEDIA_LOOP1_IN = 0x85,
+ PIPE_MEDIA_LOOP2_IN = 0x86,
+ PIPE_VOIP_IN = 0x8C,
+ PIPE_PCM0_IN = 0x8D,
+ PIPE_PCM1_IN = 0x8E,
+ PIPE_MEDIA0_IN = 0x8F,
+ PIPE_MEDIA1_IN = 0x90,
+ PIPE_MEDIA2_IN = 0x91,
+ PIPE_MEDIA3_IN = 0x9C,
+ PIPE_RSVD = 0xFF,
+};
+
+/* The stream map for each platform consists of an array of the below
+ * stream map structure.
+ */
+struct sst_dev_stream_map {
+ u8 dev_num; /* device id */
+ u8 subdev_num; /* substream */
+ u8 direction;
+ u8 device_id; /* fw id */
+ u8 task_id; /* fw task */
+ u8 status;
+};
+
+struct sst_platform_data {
+ /* Intel software platform id*/
+ struct sst_dev_stream_map *pdev_strm_map;
+ unsigned int strm_map_size;
+};
+
+struct sst_info {
+ u32 iram_start;
+ u32 iram_end;
+ bool iram_use;
+ u32 dram_start;
+ u32 dram_end;
+ bool dram_use;
+ u32 imr_start;
+ u32 imr_end;
+ bool imr_use;
+ u32 mailbox_start;
+ bool use_elf;
+ bool lpe_viewpt_rqd;
+ unsigned int max_streams;
+ u32 dma_max_len;
+ u8 num_probes;
+};
+
+struct sst_lib_dnld_info {
+ unsigned int mod_base;
+ unsigned int mod_end;
+ unsigned int mod_table_offset;
+ unsigned int mod_table_size;
+ bool mod_ddr_dnld;
+};
+
+struct sst_res_info {
+ unsigned int shim_offset;
+ unsigned int shim_size;
+ unsigned int shim_phy_addr;
+ unsigned int ssp0_offset;
+ unsigned int ssp0_size;
+ unsigned int dma0_offset;
+ unsigned int dma0_size;
+ unsigned int dma1_offset;
+ unsigned int dma1_size;
+ unsigned int iram_offset;
+ unsigned int iram_size;
+ unsigned int dram_offset;
+ unsigned int dram_size;
+ unsigned int mbox_offset;
+ unsigned int mbox_size;
+ unsigned int acpi_lpe_res_index;
+ unsigned int acpi_ddr_index;
+ unsigned int acpi_ipc_irq_index;
+};
+
+struct sst_ipc_info {
+ int ipc_offset;
+ unsigned int mbox_recv_off;
+};
+
+struct sst_platform_info {
+ const struct sst_info *probe_data;
+ const struct sst_ipc_info *ipc_info;
+ const struct sst_res_info *res_info;
+ const struct sst_lib_dnld_info *lib_info;
+ const char *platform;
+ bool streams_lost_on_suspend;
+};
+int add_sst_platform_device(void);
+#endif
+
diff --git a/arch/x86/include/asm/pm-trace.h b/arch/x86/include/asm/pm-trace.h
new file mode 100644
index 000000000..bfa32aa42
--- /dev/null
+++ b/arch/x86/include/asm/pm-trace.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PM_TRACE_H
+#define _ASM_X86_PM_TRACE_H
+
+#include <asm/asm.h>
+
+#define TRACE_RESUME(user) \
+do { \
+ if (pm_trace_enabled) { \
+ const void *tracedata; \
+ asm volatile(_ASM_MOV " $1f,%0\n" \
+ ".section .tracedata,\"a\"\n" \
+ "1:\t.word %c1\n\t" \
+ _ASM_PTR " %c2\n" \
+ ".previous" \
+ :"=r" (tracedata) \
+ : "i" (__LINE__), "i" (__FILE__)); \
+ generate_pm_trace(tracedata, user); \
+ } \
+} while (0)
+
+#define TRACE_SUSPEND(user) TRACE_RESUME(user)
+
+#endif /* _ASM_X86_PM_TRACE_H */
diff --git a/arch/x86/include/asm/posix_types.h b/arch/x86/include/asm/posix_types.h
new file mode 100644
index 000000000..374336e21
--- /dev/null
+++ b/arch/x86/include/asm/posix_types.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+# ifdef CONFIG_X86_32
+# include <asm/posix_types_32.h>
+# else
+# include <asm/posix_types_64.h>
+# endif
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
new file mode 100644
index 000000000..7f2dbd91f
--- /dev/null
+++ b/arch/x86/include/asm/preempt.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_PREEMPT_H
+#define __ASM_PREEMPT_H
+
+#include <asm/rmwcc.h>
+#include <asm/percpu.h>
+#include <linux/thread_info.h>
+
+DECLARE_PER_CPU(int, __preempt_count);
+
+/*
+ * We use the PREEMPT_NEED_RESCHED bit as an inverted NEED_RESCHED such
+ * that a decrement hitting 0 means we can and should reschedule.
+ */
+#define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED)
+
+/*
+ * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
+ * that think a non-zero value indicates we cannot preempt.
+ */
+static __always_inline int preempt_count(void)
+{
+ return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
+}
+
+static __always_inline void preempt_count_set(int pc)
+{
+ int old, new;
+
+ do {
+ old = raw_cpu_read_4(__preempt_count);
+ new = (old & PREEMPT_NEED_RESCHED) |
+ (pc & ~PREEMPT_NEED_RESCHED);
+ } while (raw_cpu_cmpxchg_4(__preempt_count, old, new) != old);
+}
+
+/*
+ * must be macros to avoid header recursion hell
+ */
+#define init_task_preempt_count(p) do { } while (0)
+
+#define init_idle_preempt_count(p, cpu) do { \
+ per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
+} while (0)
+
+/*
+ * We fold the NEED_RESCHED bit into the preempt count such that
+ * preempt_enable() can decrement and test for needing to reschedule with a
+ * single instruction.
+ *
+ * We invert the actual bit, so that when the decrement hits 0 we know we both
+ * need to resched (the bit is cleared) and can resched (no preempt count).
+ */
+
+static __always_inline void set_preempt_need_resched(void)
+{
+ raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
+}
+
+static __always_inline void clear_preempt_need_resched(void)
+{
+ raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
+}
+
+static __always_inline bool test_preempt_need_resched(void)
+{
+ return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED);
+}
+
+/*
+ * The various preempt_count add/sub methods
+ */
+
+static __always_inline void __preempt_count_add(int val)
+{
+ raw_cpu_add_4(__preempt_count, val);
+}
+
+static __always_inline void __preempt_count_sub(int val)
+{
+ raw_cpu_add_4(__preempt_count, -val);
+}
+
+/*
+ * Because we keep PREEMPT_NEED_RESCHED set when we do _not_ need to reschedule
+ * a decrement which hits zero means we have no preempt_count and should
+ * reschedule.
+ */
+static __always_inline bool __preempt_count_dec_and_test(void)
+{
+ GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
+}
+
+/*
+ * Returns true when we need to resched and can (barring IRQ state).
+ */
+static __always_inline bool should_resched(int preempt_offset)
+{
+ return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
+}
+
+#ifdef CONFIG_PREEMPT
+ extern asmlinkage void ___preempt_schedule(void);
+# define __preempt_schedule() \
+ asm volatile ("call ___preempt_schedule" : ASM_CALL_CONSTRAINT)
+
+ extern asmlinkage void preempt_schedule(void);
+ extern asmlinkage void ___preempt_schedule_notrace(void);
+# define __preempt_schedule_notrace() \
+ asm volatile ("call ___preempt_schedule_notrace" : ASM_CALL_CONSTRAINT)
+
+ extern asmlinkage void preempt_schedule_notrace(void);
+#endif
+
+#endif /* __ASM_PREEMPT_H */
diff --git a/arch/x86/include/asm/probe_roms.h b/arch/x86/include/asm/probe_roms.h
new file mode 100644
index 000000000..1c7f3815b
--- /dev/null
+++ b/arch/x86/include/asm/probe_roms.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PROBE_ROMS_H_
+#define _PROBE_ROMS_H_
+struct pci_dev;
+
+extern void __iomem *pci_map_biosrom(struct pci_dev *pdev);
+extern void pci_unmap_biosrom(void __iomem *rom);
+extern size_t pci_biosrom_size(struct pci_dev *pdev);
+#endif
diff --git a/arch/x86/include/asm/processor-cyrix.h b/arch/x86/include/asm/processor-cyrix.h
new file mode 100644
index 000000000..aaedd73ea
--- /dev/null
+++ b/arch/x86/include/asm/processor-cyrix.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NSC/Cyrix CPU indexed register access. Must be inlined instead of
+ * macros to ensure correct access ordering
+ * Access order is always 0x22 (=offset), 0x23 (=value)
+ *
+ * When using the old macros a line like
+ * setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
+ * gets expanded to:
+ * do {
+ * outb((CX86_CCR2), 0x22);
+ * outb((({
+ * outb((CX86_CCR2), 0x22);
+ * inb(0x23);
+ * }) | 0x88), 0x23);
+ * } while (0);
+ *
+ * which in fact violates the access order (= 0x22, 0x22, 0x23, 0x23).
+ */
+
+static inline u8 getCx86(u8 reg)
+{
+ outb(reg, 0x22);
+ return inb(0x23);
+}
+
+static inline void setCx86(u8 reg, u8 data)
+{
+ outb(reg, 0x22);
+ outb(data, 0x23);
+}
+
+#define getCx86_old(reg) ({ outb((reg), 0x22); inb(0x23); })
+
+#define setCx86_old(reg, data) do { \
+ outb((reg), 0x22); \
+ outb((data), 0x23); \
+} while (0)
+
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
new file mode 100644
index 000000000..02c2cbda4
--- /dev/null
+++ b/arch/x86/include/asm/processor-flags.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PROCESSOR_FLAGS_H
+#define _ASM_X86_PROCESSOR_FLAGS_H
+
+#include <uapi/asm/processor-flags.h>
+#include <linux/mem_encrypt.h>
+
+#ifdef CONFIG_VM86
+#define X86_VM_MASK X86_EFLAGS_VM
+#else
+#define X86_VM_MASK 0 /* No VM86 support */
+#endif
+
+/*
+ * CR3's layout varies depending on several things.
+ *
+ * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
+ * If PAE is enabled, then CR3[11:5] is part of the PDPT address
+ * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
+ * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
+ * CR3[2:0] and CR3[11:5] are ignored.
+ *
+ * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
+ *
+ * CR3[63] is always read as zero. If CR4.PCIDE is set, then CR3[63] may be
+ * written as 1 to prevent the write to CR3 from flushing the TLB.
+ *
+ * On systems with SME, one bit (in a variable position!) is stolen to indicate
+ * that the top-level paging structure is encrypted.
+ *
+ * All of the remaining bits indicate the physical address of the top-level
+ * paging structure.
+ *
+ * CR3_ADDR_MASK is the mask used by read_cr3_pa().
+ */
+#ifdef CONFIG_X86_64
+/* Mask off the address space ID and SME encryption bits. */
+#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull)
+#define CR3_PCID_MASK 0xFFFull
+#define CR3_NOFLUSH BIT_ULL(63)
+
+#else
+/*
+ * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
+ * a tiny bit of code size by setting all the bits.
+ */
+#define CR3_ADDR_MASK 0xFFFFFFFFull
+#define CR3_PCID_MASK 0ull
+#define CR3_NOFLUSH 0
+#endif
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define X86_CR3_PTI_PCID_USER_BIT 11
+#endif
+
+#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
new file mode 100644
index 000000000..cc4bb218f
--- /dev/null
+++ b/arch/x86/include/asm/processor.h
@@ -0,0 +1,1004 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PROCESSOR_H
+#define _ASM_X86_PROCESSOR_H
+
+#include <asm/processor-flags.h>
+
+/* Forward declaration, a strange C thing */
+struct task_struct;
+struct mm_struct;
+struct vm86;
+
+#include <asm/math_emu.h>
+#include <asm/segment.h>
+#include <asm/types.h>
+#include <uapi/asm/sigcontext.h>
+#include <asm/current.h>
+#include <asm/cpufeatures.h>
+#include <asm/page.h>
+#include <asm/pgtable_types.h>
+#include <asm/percpu.h>
+#include <asm/msr.h>
+#include <asm/desc_defs.h>
+#include <asm/nops.h>
+#include <asm/special_insns.h>
+#include <asm/fpu/types.h>
+#include <asm/unwind_hints.h>
+
+#include <linux/personality.h>
+#include <linux/cache.h>
+#include <linux/threads.h>
+#include <linux/math64.h>
+#include <linux/err.h>
+#include <linux/irqflags.h>
+#include <linux/mem_encrypt.h>
+
+/*
+ * We handle most unaligned accesses in hardware. On the other hand
+ * unaligned DMA can be quite expensive on some Nehalem processors.
+ *
+ * Based on this we disable the IP header alignment in network drivers.
+ */
+#define NET_IP_ALIGN 0
+
+#define HBP_NUM 4
+/*
+ * Default implementation of macro that returns current
+ * instruction pointer ("program counter").
+ */
+static inline void *current_text_addr(void)
+{
+ void *pc;
+
+ asm volatile("mov $1f, %0; 1:":"=r" (pc));
+
+ return pc;
+}
+
+/*
+ * These alignment constraints are for performance in the vSMP case,
+ * but in the task_struct case we must also meet hardware imposed
+ * alignment requirements of the FPU state:
+ */
+#ifdef CONFIG_X86_VSMP
+# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
+# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
+#else
+# define ARCH_MIN_TASKALIGN __alignof__(union fpregs_state)
+# define ARCH_MIN_MMSTRUCT_ALIGN 0
+#endif
+
+enum tlb_infos {
+ ENTRIES,
+ NR_INFO
+};
+
+extern u16 __read_mostly tlb_lli_4k[NR_INFO];
+extern u16 __read_mostly tlb_lli_2m[NR_INFO];
+extern u16 __read_mostly tlb_lli_4m[NR_INFO];
+extern u16 __read_mostly tlb_lld_4k[NR_INFO];
+extern u16 __read_mostly tlb_lld_2m[NR_INFO];
+extern u16 __read_mostly tlb_lld_4m[NR_INFO];
+extern u16 __read_mostly tlb_lld_1g[NR_INFO];
+
+/*
+ * CPU type and hardware bug flags. Kept separately for each CPU.
+ * Members of this structure are referenced in head_32.S, so think twice
+ * before touching them. [mj]
+ */
+
+struct cpuinfo_x86 {
+ __u8 x86; /* CPU family */
+ __u8 x86_vendor; /* CPU vendor */
+ __u8 x86_model;
+ __u8 x86_stepping;
+#ifdef CONFIG_X86_64
+ /* Number of 4K pages in DTLB/ITLB combined(in pages): */
+ int x86_tlbsize;
+#endif
+ __u8 x86_virt_bits;
+ __u8 x86_phys_bits;
+ /* CPUID returned core id bits: */
+ __u8 x86_coreid_bits;
+ __u8 cu_id;
+ /* Max extended CPUID function supported: */
+ __u32 extended_cpuid_level;
+ /* Maximum supported CPUID level, -1=no CPUID: */
+ int cpuid_level;
+ __u32 x86_capability[NCAPINTS + NBUGINTS];
+ char x86_vendor_id[16];
+ char x86_model_id[64];
+ /* in KB - valid for CPUS which support this call: */
+ unsigned int x86_cache_size;
+ int x86_cache_alignment; /* In bytes */
+ /* Cache QoS architectural values: */
+ int x86_cache_max_rmid; /* max index */
+ int x86_cache_occ_scale; /* scale to bytes */
+ int x86_power;
+ unsigned long loops_per_jiffy;
+ /* cpuid returned max cores value: */
+ u16 x86_max_cores;
+ u16 apicid;
+ u16 initial_apicid;
+ u16 x86_clflush_size;
+ /* number of cores as seen by the OS: */
+ u16 booted_cores;
+ /* Physical processor id: */
+ u16 phys_proc_id;
+ /* Logical processor id: */
+ u16 logical_proc_id;
+ /* Core id: */
+ u16 cpu_core_id;
+ /* Index into per_cpu list: */
+ u16 cpu_index;
+ u32 microcode;
+ /* Address space bits used by the cache internally */
+ u8 x86_cache_bits;
+ unsigned initialized : 1;
+} __randomize_layout;
+
+struct cpuid_regs {
+ u32 eax, ebx, ecx, edx;
+};
+
+enum cpuid_regs_idx {
+ CPUID_EAX = 0,
+ CPUID_EBX,
+ CPUID_ECX,
+ CPUID_EDX,
+};
+
+#define X86_VENDOR_INTEL 0
+#define X86_VENDOR_CYRIX 1
+#define X86_VENDOR_AMD 2
+#define X86_VENDOR_UMC 3
+#define X86_VENDOR_CENTAUR 5
+#define X86_VENDOR_TRANSMETA 7
+#define X86_VENDOR_NSC 8
+#define X86_VENDOR_NUM 9
+
+#define X86_VENDOR_UNKNOWN 0xff
+
+/*
+ * capabilities of CPUs
+ */
+extern struct cpuinfo_x86 boot_cpu_data;
+extern struct cpuinfo_x86 new_cpu_data;
+
+extern struct x86_hw_tss doublefault_tss;
+extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
+extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS];
+
+#ifdef CONFIG_SMP
+DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
+#define cpu_data(cpu) per_cpu(cpu_info, cpu)
+#else
+#define cpu_info boot_cpu_data
+#define cpu_data(cpu) boot_cpu_data
+#endif
+
+extern const struct seq_operations cpuinfo_op;
+
+#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
+
+extern void cpu_detect(struct cpuinfo_x86 *c);
+
+static inline unsigned long long l1tf_pfn_limit(void)
+{
+ return BIT_ULL(boot_cpu_data.x86_cache_bits - 1 - PAGE_SHIFT);
+}
+
+extern void early_cpu_init(void);
+extern void identify_boot_cpu(void);
+extern void identify_secondary_cpu(struct cpuinfo_x86 *);
+extern void print_cpu_info(struct cpuinfo_x86 *);
+void print_cpu_msr(struct cpuinfo_x86 *);
+
+#ifdef CONFIG_X86_32
+extern int have_cpuid_p(void);
+#else
+static inline int have_cpuid_p(void)
+{
+ return 1;
+}
+#endif
+static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ /* ecx is often an input as well as an output. */
+ asm volatile("cpuid"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx)
+ : "memory");
+}
+
+#define native_cpuid_reg(reg) \
+static inline unsigned int native_cpuid_##reg(unsigned int op) \
+{ \
+ unsigned int eax = op, ebx, ecx = 0, edx; \
+ \
+ native_cpuid(&eax, &ebx, &ecx, &edx); \
+ \
+ return reg; \
+}
+
+/*
+ * Native CPUID functions returning a single datum.
+ */
+native_cpuid_reg(eax)
+native_cpuid_reg(ebx)
+native_cpuid_reg(ecx)
+native_cpuid_reg(edx)
+
+/*
+ * Friendlier CR3 helpers.
+ */
+static inline unsigned long read_cr3_pa(void)
+{
+ return __read_cr3() & CR3_ADDR_MASK;
+}
+
+static inline unsigned long native_read_cr3_pa(void)
+{
+ return __native_read_cr3() & CR3_ADDR_MASK;
+}
+
+static inline void load_cr3(pgd_t *pgdir)
+{
+ write_cr3(__sme_pa(pgdir));
+}
+
+/*
+ * Note that while the legacy 'TSS' name comes from 'Task State Segment',
+ * on modern x86 CPUs the TSS also holds information important to 64-bit mode,
+ * unrelated to the task-switch mechanism:
+ */
+#ifdef CONFIG_X86_32
+/* This is the TSS defined by the hardware. */
+struct x86_hw_tss {
+ unsigned short back_link, __blh;
+ unsigned long sp0;
+ unsigned short ss0, __ss0h;
+ unsigned long sp1;
+
+ /*
+ * We don't use ring 1, so ss1 is a convenient scratch space in
+ * the same cacheline as sp0. We use ss1 to cache the value in
+ * MSR_IA32_SYSENTER_CS. When we context switch
+ * MSR_IA32_SYSENTER_CS, we first check if the new value being
+ * written matches ss1, and, if it's not, then we wrmsr the new
+ * value and update ss1.
+ *
+ * The only reason we context switch MSR_IA32_SYSENTER_CS is
+ * that we set it to zero in vm86 tasks to avoid corrupting the
+ * stack if we were to go through the sysenter path from vm86
+ * mode.
+ */
+ unsigned short ss1; /* MSR_IA32_SYSENTER_CS */
+
+ unsigned short __ss1h;
+ unsigned long sp2;
+ unsigned short ss2, __ss2h;
+ unsigned long __cr3;
+ unsigned long ip;
+ unsigned long flags;
+ unsigned long ax;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long bx;
+ unsigned long sp;
+ unsigned long bp;
+ unsigned long si;
+ unsigned long di;
+ unsigned short es, __esh;
+ unsigned short cs, __csh;
+ unsigned short ss, __ssh;
+ unsigned short ds, __dsh;
+ unsigned short fs, __fsh;
+ unsigned short gs, __gsh;
+ unsigned short ldt, __ldth;
+ unsigned short trace;
+ unsigned short io_bitmap_base;
+
+} __attribute__((packed));
+#else
+struct x86_hw_tss {
+ u32 reserved1;
+ u64 sp0;
+
+ /*
+ * We store cpu_current_top_of_stack in sp1 so it's always accessible.
+ * Linux does not use ring 1, so sp1 is not otherwise needed.
+ */
+ u64 sp1;
+
+ u64 sp2;
+ u64 reserved2;
+ u64 ist[7];
+ u32 reserved3;
+ u32 reserved4;
+ u16 reserved5;
+ u16 io_bitmap_base;
+
+} __attribute__((packed));
+#endif
+
+/*
+ * IO-bitmap sizes:
+ */
+#define IO_BITMAP_BITS 65536
+#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
+#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
+#define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
+#define INVALID_IO_BITMAP_OFFSET 0x8000
+
+struct entry_stack {
+ char stack[PAGE_SIZE];
+};
+
+struct entry_stack_page {
+ struct entry_stack stack;
+} __aligned(PAGE_SIZE);
+
+struct tss_struct {
+ /*
+ * The fixed hardware portion. This must not cross a page boundary
+ * at risk of violating the SDM's advice and potentially triggering
+ * errata.
+ */
+ struct x86_hw_tss x86_tss;
+
+ /*
+ * The extra 1 is there because the CPU will access an
+ * additional byte beyond the end of the IO permission
+ * bitmap. The extra byte must be all 1 bits, and must
+ * be within the limit.
+ */
+ unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
+} __aligned(PAGE_SIZE);
+
+DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
+
+/*
+ * sizeof(unsigned long) coming from an extra "long" at the end
+ * of the iobitmap.
+ *
+ * -1? seg base+limit should be pointing to the address of the
+ * last valid byte
+ */
+#define __KERNEL_TSS_LIMIT \
+ (IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1)
+
+#ifdef CONFIG_X86_32
+DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
+#else
+/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
+#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
+#endif
+
+/*
+ * Save the original ist values for checking stack pointers during debugging
+ */
+struct orig_ist {
+ unsigned long ist[7];
+};
+
+#ifdef CONFIG_X86_64
+DECLARE_PER_CPU(struct orig_ist, orig_ist);
+
+union irq_stack_union {
+ char irq_stack[IRQ_STACK_SIZE];
+ /*
+ * GCC hardcodes the stack canary as %gs:40. Since the
+ * irq_stack is the object at %gs:0, we reserve the bottom
+ * 48 bytes of the irq stack for the canary.
+ */
+ struct {
+ char gs_base[40];
+ unsigned long stack_canary;
+ };
+};
+
+DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible;
+DECLARE_INIT_PER_CPU(irq_stack_union);
+
+static inline unsigned long cpu_kernelmode_gs_base(int cpu)
+{
+ return (unsigned long)per_cpu(irq_stack_union.gs_base, cpu);
+}
+
+DECLARE_PER_CPU(char *, irq_stack_ptr);
+DECLARE_PER_CPU(unsigned int, irq_count);
+extern asmlinkage void ignore_sysret(void);
+
+#if IS_ENABLED(CONFIG_KVM)
+/* Save actual FS/GS selectors and bases to current->thread */
+void save_fsgs_for_kvm(void);
+#endif
+#else /* X86_64 */
+#ifdef CONFIG_STACKPROTECTOR
+/*
+ * Make sure stack canary segment base is cached-aligned:
+ * "For Intel Atom processors, avoid non zero segment base address
+ * that is not aligned to cache line boundary at all cost."
+ * (Optim Ref Manual Assembly/Compiler Coding Rule 15.)
+ */
+struct stack_canary {
+ char __pad[20]; /* canary at %gs:20 */
+ unsigned long canary;
+};
+DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
+#endif
+/*
+ * per-CPU IRQ handling stacks
+ */
+struct irq_stack {
+ u32 stack[THREAD_SIZE/sizeof(u32)];
+} __aligned(THREAD_SIZE);
+
+DECLARE_PER_CPU(struct irq_stack *, hardirq_stack);
+DECLARE_PER_CPU(struct irq_stack *, softirq_stack);
+#endif /* X86_64 */
+
+extern unsigned int fpu_kernel_xstate_size;
+extern unsigned int fpu_user_xstate_size;
+
+struct perf_event;
+
+typedef struct {
+ unsigned long seg;
+} mm_segment_t;
+
+struct thread_struct {
+ /* Cached TLS descriptors: */
+ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
+#ifdef CONFIG_X86_32
+ unsigned long sp0;
+#endif
+ unsigned long sp;
+#ifdef CONFIG_X86_32
+ unsigned long sysenter_cs;
+#else
+ unsigned short es;
+ unsigned short ds;
+ unsigned short fsindex;
+ unsigned short gsindex;
+#endif
+
+#ifdef CONFIG_X86_64
+ unsigned long fsbase;
+ unsigned long gsbase;
+#else
+ /*
+ * XXX: this could presumably be unsigned short. Alternatively,
+ * 32-bit kernels could be taught to use fsindex instead.
+ */
+ unsigned long fs;
+ unsigned long gs;
+#endif
+
+ /* Save middle states of ptrace breakpoints */
+ struct perf_event *ptrace_bps[HBP_NUM];
+ /* Debug status used for traps, single steps, etc... */
+ unsigned long debugreg6;
+ /* Keep track of the exact dr7 value set by the user */
+ unsigned long ptrace_dr7;
+ /* Fault info: */
+ unsigned long cr2;
+ unsigned long trap_nr;
+ unsigned long error_code;
+#ifdef CONFIG_VM86
+ /* Virtual 86 mode info */
+ struct vm86 *vm86;
+#endif
+ /* IO permissions: */
+ unsigned long *io_bitmap_ptr;
+ unsigned long iopl;
+ /* Max allowed port in the bitmap, in bytes: */
+ unsigned io_bitmap_max;
+
+ mm_segment_t addr_limit;
+
+ unsigned int sig_on_uaccess_err:1;
+ unsigned int uaccess_err:1; /* uaccess failed */
+
+ /* Floating point and extended processor state */
+ struct fpu fpu;
+ /*
+ * WARNING: 'fpu' is dynamically-sized. It *MUST* be at
+ * the end.
+ */
+};
+
+/* Whitelist the FPU state from the task_struct for hardened usercopy. */
+static inline void arch_thread_struct_whitelist(unsigned long *offset,
+ unsigned long *size)
+{
+ *offset = offsetof(struct thread_struct, fpu.state);
+ *size = fpu_kernel_xstate_size;
+}
+
+/*
+ * Set IOPL bits in EFLAGS from given mask
+ */
+static inline void native_set_iopl_mask(unsigned mask)
+{
+#ifdef CONFIG_X86_32
+ unsigned int reg;
+
+ asm volatile ("pushfl;"
+ "popl %0;"
+ "andl %1, %0;"
+ "orl %2, %0;"
+ "pushl %0;"
+ "popfl"
+ : "=&r" (reg)
+ : "i" (~X86_EFLAGS_IOPL), "r" (mask));
+#endif
+}
+
+static inline void
+native_load_sp0(unsigned long sp0)
+{
+ this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
+}
+
+static inline void native_swapgs(void)
+{
+#ifdef CONFIG_X86_64
+ asm volatile("swapgs" ::: "memory");
+#endif
+}
+
+static inline unsigned long current_top_of_stack(void)
+{
+ /*
+ * We can't read directly from tss.sp0: sp0 on x86_32 is special in
+ * and around vm86 mode and sp0 on x86_64 is special because of the
+ * entry trampoline.
+ */
+ return this_cpu_read_stable(cpu_current_top_of_stack);
+}
+
+static inline bool on_thread_stack(void)
+{
+ return (unsigned long)(current_top_of_stack() -
+ current_stack_pointer) < THREAD_SIZE;
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define __cpuid native_cpuid
+
+static inline void load_sp0(unsigned long sp0)
+{
+ native_load_sp0(sp0);
+}
+
+#define set_iopl_mask native_set_iopl_mask
+#endif /* CONFIG_PARAVIRT */
+
+/* Free all resources held by a thread. */
+extern void release_thread(struct task_struct *);
+
+unsigned long get_wchan(struct task_struct *p);
+
+/*
+ * Generic CPUID function
+ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
+ * resulting in stale register contents being returned.
+ */
+static inline void cpuid(unsigned int op,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ *eax = op;
+ *ecx = 0;
+ __cpuid(eax, ebx, ecx, edx);
+}
+
+/* Some CPUID calls want 'count' to be placed in ecx */
+static inline void cpuid_count(unsigned int op, int count,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ *eax = op;
+ *ecx = count;
+ __cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * CPUID functions returning a single datum
+ */
+static inline unsigned int cpuid_eax(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return eax;
+}
+
+static inline unsigned int cpuid_ebx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return ebx;
+}
+
+static inline unsigned int cpuid_ecx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return ecx;
+}
+
+static inline unsigned int cpuid_edx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return edx;
+}
+
+/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
+static __always_inline void rep_nop(void)
+{
+ asm volatile("rep; nop" ::: "memory");
+}
+
+static __always_inline void cpu_relax(void)
+{
+ rep_nop();
+}
+
+/*
+ * This function forces the icache and prefetched instruction stream to
+ * catch up with reality in two very specific cases:
+ *
+ * a) Text was modified using one virtual address and is about to be executed
+ * from the same physical page at a different virtual address.
+ *
+ * b) Text was modified on a different CPU, may subsequently be
+ * executed on this CPU, and you want to make sure the new version
+ * gets executed. This generally means you're calling this in a IPI.
+ *
+ * If you're calling this for a different reason, you're probably doing
+ * it wrong.
+ */
+static inline void sync_core(void)
+{
+ /*
+ * There are quite a few ways to do this. IRET-to-self is nice
+ * because it works on every CPU, at any CPL (so it's compatible
+ * with paravirtualization), and it never exits to a hypervisor.
+ * The only down sides are that it's a bit slow (it seems to be
+ * a bit more than 2x slower than the fastest options) and that
+ * it unmasks NMIs. The "push %cs" is needed because, in
+ * paravirtual environments, __KERNEL_CS may not be a valid CS
+ * value when we do IRET directly.
+ *
+ * In case NMI unmasking or performance ever becomes a problem,
+ * the next best option appears to be MOV-to-CR2 and an
+ * unconditional jump. That sequence also works on all CPUs,
+ * but it will fault at CPL3 (i.e. Xen PV).
+ *
+ * CPUID is the conventional way, but it's nasty: it doesn't
+ * exist on some 486-like CPUs, and it usually exits to a
+ * hypervisor.
+ *
+ * Like all of Linux's memory ordering operations, this is a
+ * compiler barrier as well.
+ */
+#ifdef CONFIG_X86_32
+ asm volatile (
+ "pushfl\n\t"
+ "pushl %%cs\n\t"
+ "pushl $1f\n\t"
+ "iret\n\t"
+ "1:"
+ : ASM_CALL_CONSTRAINT : : "memory");
+#else
+ unsigned int tmp;
+
+ asm volatile (
+ UNWIND_HINT_SAVE
+ "mov %%ss, %0\n\t"
+ "pushq %q0\n\t"
+ "pushq %%rsp\n\t"
+ "addq $8, (%%rsp)\n\t"
+ "pushfq\n\t"
+ "mov %%cs, %0\n\t"
+ "pushq %q0\n\t"
+ "pushq $1f\n\t"
+ "iretq\n\t"
+ UNWIND_HINT_RESTORE
+ "1:"
+ : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory");
+#endif
+}
+
+extern void select_idle_routine(const struct cpuinfo_x86 *c);
+extern void amd_e400_c1e_apic_setup(void);
+
+extern unsigned long boot_option_idle_override;
+
+enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
+ IDLE_POLL};
+
+extern void enable_sep_cpu(void);
+extern int sysenter_setup(void);
+
+void early_trap_pf_init(void);
+
+/* Defined in head.S */
+extern struct desc_ptr early_gdt_descr;
+
+extern void switch_to_new_gdt(int);
+extern void load_direct_gdt(int);
+extern void load_fixmap_gdt(int);
+extern void load_percpu_segment(int);
+extern void cpu_init(void);
+
+static inline unsigned long get_debugctlmsr(void)
+{
+ unsigned long debugctlmsr = 0;
+
+#ifndef CONFIG_X86_DEBUGCTLMSR
+ if (boot_cpu_data.x86 < 6)
+ return 0;
+#endif
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
+
+ return debugctlmsr;
+}
+
+static inline void update_debugctlmsr(unsigned long debugctlmsr)
+{
+#ifndef CONFIG_X86_DEBUGCTLMSR
+ if (boot_cpu_data.x86 < 6)
+ return;
+#endif
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
+}
+
+extern void set_task_blockstep(struct task_struct *task, bool on);
+
+/* Boot loader type from the setup header: */
+extern int bootloader_type;
+extern int bootloader_version;
+
+extern char ignore_fpu_irq;
+
+#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
+#define ARCH_HAS_PREFETCHW
+#define ARCH_HAS_SPINLOCK_PREFETCH
+
+#ifdef CONFIG_X86_32
+# define BASE_PREFETCH ""
+# define ARCH_HAS_PREFETCH
+#else
+# define BASE_PREFETCH "prefetcht0 %P1"
+#endif
+
+/*
+ * Prefetch instructions for Pentium III (+) and AMD Athlon (+)
+ *
+ * It's not worth to care about 3dnow prefetches for the K6
+ * because they are microcoded there and very slow.
+ */
+static inline void prefetch(const void *x)
+{
+ alternative_input(BASE_PREFETCH, "prefetchnta %P1",
+ X86_FEATURE_XMM,
+ "m" (*(const char *)x));
+}
+
+/*
+ * 3dnow prefetch to get an exclusive cache line.
+ * Useful for spinlocks to avoid one state transition in the
+ * cache coherency protocol:
+ */
+static inline void prefetchw(const void *x)
+{
+ alternative_input(BASE_PREFETCH, "prefetchw %P1",
+ X86_FEATURE_3DNOWPREFETCH,
+ "m" (*(const char *)x));
+}
+
+static inline void spin_lock_prefetch(const void *x)
+{
+ prefetchw(x);
+}
+
+#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
+ TOP_OF_KERNEL_STACK_PADDING)
+
+#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1))
+
+#define task_pt_regs(task) \
+({ \
+ unsigned long __ptr = (unsigned long)task_stack_page(task); \
+ __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \
+ ((struct pt_regs *)__ptr) - 1; \
+})
+
+#ifdef CONFIG_X86_32
+/*
+ * User space process size: 3GB (default).
+ */
+#define IA32_PAGE_OFFSET PAGE_OFFSET
+#define TASK_SIZE PAGE_OFFSET
+#define TASK_SIZE_LOW TASK_SIZE
+#define TASK_SIZE_MAX TASK_SIZE
+#define DEFAULT_MAP_WINDOW TASK_SIZE
+#define STACK_TOP TASK_SIZE
+#define STACK_TOP_MAX STACK_TOP
+
+#define INIT_THREAD { \
+ .sp0 = TOP_OF_INIT_STACK, \
+ .sysenter_cs = __KERNEL_CS, \
+ .io_bitmap_ptr = NULL, \
+ .addr_limit = KERNEL_DS, \
+}
+
+#define KSTK_ESP(task) (task_pt_regs(task)->sp)
+
+#else
+/*
+ * User space process size. This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything executable
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen. This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
+ */
+#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
+
+#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE)
+
+/* This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
+ */
+#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
+ 0xc0000000 : 0xFFFFe000)
+
+#define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \
+ IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW)
+#define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \
+ IA32_PAGE_OFFSET : TASK_SIZE_MAX)
+#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \
+ IA32_PAGE_OFFSET : TASK_SIZE_MAX)
+
+#define STACK_TOP TASK_SIZE_LOW
+#define STACK_TOP_MAX TASK_SIZE_MAX
+
+#define INIT_THREAD { \
+ .addr_limit = KERNEL_DS, \
+}
+
+extern unsigned long KSTK_ESP(struct task_struct *task);
+
+#endif /* CONFIG_X86_64 */
+
+extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
+ unsigned long new_sp);
+
+/*
+ * This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
+ */
+#define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3))
+#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW)
+
+#define KSTK_EIP(task) (task_pt_regs(task)->ip)
+
+/* Get/set a process' ability to use the timestamp counter instruction */
+#define GET_TSC_CTL(adr) get_tsc_mode((adr))
+#define SET_TSC_CTL(val) set_tsc_mode((val))
+
+extern int get_tsc_mode(unsigned long adr);
+extern int set_tsc_mode(unsigned int val);
+
+DECLARE_PER_CPU(u64, msr_misc_features_shadow);
+
+/* Register/unregister a process' MPX related resource */
+#define MPX_ENABLE_MANAGEMENT() mpx_enable_management()
+#define MPX_DISABLE_MANAGEMENT() mpx_disable_management()
+
+#ifdef CONFIG_X86_INTEL_MPX
+extern int mpx_enable_management(void);
+extern int mpx_disable_management(void);
+#else
+static inline int mpx_enable_management(void)
+{
+ return -EINVAL;
+}
+static inline int mpx_disable_management(void)
+{
+ return -EINVAL;
+}
+#endif /* CONFIG_X86_INTEL_MPX */
+
+#ifdef CONFIG_CPU_SUP_AMD
+extern u16 amd_get_nb_id(int cpu);
+extern u32 amd_get_nodes_per_socket(void);
+#else
+static inline u16 amd_get_nb_id(int cpu) { return 0; }
+static inline u32 amd_get_nodes_per_socket(void) { return 0; }
+#endif
+
+static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
+{
+ uint32_t base, eax, signature[3];
+
+ for (base = 0x40000000; base < 0x40010000; base += 0x100) {
+ cpuid(base, &eax, &signature[0], &signature[1], &signature[2]);
+
+ if (!memcmp(sig, signature, 12) &&
+ (leaves == 0 || ((eax - base) >= leaves)))
+ return base;
+ }
+
+ return 0;
+}
+
+extern unsigned long arch_align_stack(unsigned long sp);
+extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
+extern void free_kernel_image_pages(void *begin, void *end);
+
+void default_idle(void);
+#ifdef CONFIG_XEN
+bool xen_set_default_idle(void);
+#else
+#define xen_set_default_idle 0
+#endif
+
+void stop_this_cpu(void *dummy);
+void df_debug(struct pt_regs *regs, long error_code);
+void microcode_check(void);
+
+enum l1tf_mitigations {
+ L1TF_MITIGATION_OFF,
+ L1TF_MITIGATION_FLUSH_NOWARN,
+ L1TF_MITIGATION_FLUSH,
+ L1TF_MITIGATION_FLUSH_NOSMT,
+ L1TF_MITIGATION_FULL,
+ L1TF_MITIGATION_FULL_FORCE
+};
+
+extern enum l1tf_mitigations l1tf_mitigation;
+
+enum mds_mitigations {
+ MDS_MITIGATION_OFF,
+ MDS_MITIGATION_FULL,
+ MDS_MITIGATION_VMWERV,
+};
+
+enum taa_mitigations {
+ TAA_MITIGATION_OFF,
+ TAA_MITIGATION_UCODE_NEEDED,
+ TAA_MITIGATION_VERW,
+ TAA_MITIGATION_TSX_DISABLED,
+};
+
+#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
new file mode 100644
index 000000000..1d081ac1c
--- /dev/null
+++ b/arch/x86/include/asm/prom.h
@@ -0,0 +1,41 @@
+/*
+ * Definitions for Device tree / OpenFirmware handling on X86
+ *
+ * based on arch/powerpc/include/asm/prom.h which is
+ * Copyright (C) 1996-2005 Paul Mackerras.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _ASM_X86_PROM_H
+#define _ASM_X86_PROM_H
+#ifndef __ASSEMBLY__
+
+#include <linux/of.h>
+#include <linux/types.h>
+#include <linux/pci.h>
+
+#include <asm/irq.h>
+#include <linux/atomic.h>
+#include <asm/setup.h>
+
+#ifdef CONFIG_OF
+extern int of_ioapic;
+extern u64 initial_dtb;
+extern void add_dtb(u64 data);
+void x86_of_pci_init(void);
+void x86_dtb_init(void);
+#else
+static inline void add_dtb(u64 data) { }
+static inline void x86_of_pci_init(void) { }
+static inline void x86_dtb_init(void) { }
+#define of_ioapic 0
+#endif
+
+extern char cmd_line[COMMAND_LINE_SIZE];
+
+#endif /* __ASSEMBLY__ */
+#endif
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
new file mode 100644
index 000000000..0eaca7a13
--- /dev/null
+++ b/arch/x86/include/asm/proto.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PROTO_H
+#define _ASM_X86_PROTO_H
+
+#include <asm/ldt.h>
+
+struct task_struct;
+
+/* misc architecture specific prototypes */
+
+void syscall_init(void);
+
+#ifdef CONFIG_X86_64
+void entry_SYSCALL_64(void);
+long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2);
+#endif
+
+#ifdef CONFIG_X86_32
+void entry_INT80_32(void);
+void entry_SYSENTER_32(void);
+void __begin_SYSENTER_singlestep_region(void);
+void __end_SYSENTER_singlestep_region(void);
+#endif
+
+#ifdef CONFIG_IA32_EMULATION
+void entry_SYSENTER_compat(void);
+void __end_entry_SYSENTER_compat(void);
+void entry_SYSCALL_compat(void);
+void entry_INT80_compat(void);
+#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
+void xen_entry_INT80_compat(void);
+#endif
+#endif
+
+void x86_configure_nx(void);
+void x86_report_nx(void);
+
+extern int reboot_force;
+
+long do_arch_prctl_common(struct task_struct *task, int option,
+ unsigned long cpuid_enabled);
+
+#endif /* _ASM_X86_PROTO_H */
diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h
new file mode 100644
index 000000000..5df09a0b8
--- /dev/null
+++ b/arch/x86/include/asm/pti.h
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _ASM_X86_PTI_H
+#define _ASM_X86_PTI_H
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+extern void pti_init(void);
+extern void pti_check_boottime_disable(void);
+extern void pti_finalize(void);
+#else
+static inline void pti_check_boottime_disable(void) { }
+#endif
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_PTI_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
new file mode 100644
index 000000000..ee696efec
--- /dev/null
+++ b/arch/x86/include/asm/ptrace.h
@@ -0,0 +1,319 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PTRACE_H
+#define _ASM_X86_PTRACE_H
+
+#include <asm/segment.h>
+#include <asm/page_types.h>
+#include <uapi/asm/ptrace.h>
+
+#ifndef __ASSEMBLY__
+#ifdef __i386__
+
+struct pt_regs {
+ /*
+ * NB: 32-bit x86 CPUs are inconsistent as what happens in the
+ * following cases (where %seg represents a segment register):
+ *
+ * - pushl %seg: some do a 16-bit write and leave the high
+ * bits alone
+ * - movl %seg, [mem]: some do a 16-bit write despite the movl
+ * - IDT entry: some (e.g. 486) will leave the high bits of CS
+ * and (if applicable) SS undefined.
+ *
+ * Fortunately, x86-32 doesn't read the high bits on POP or IRET,
+ * so we can just treat all of the segment registers as 16-bit
+ * values.
+ */
+ unsigned long bx;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long si;
+ unsigned long di;
+ unsigned long bp;
+ unsigned long ax;
+ unsigned short ds;
+ unsigned short __dsh;
+ unsigned short es;
+ unsigned short __esh;
+ unsigned short fs;
+ unsigned short __fsh;
+ unsigned short gs;
+ unsigned short __gsh;
+ unsigned long orig_ax;
+ unsigned long ip;
+ unsigned short cs;
+ unsigned short __csh;
+ unsigned long flags;
+ unsigned long sp;
+ unsigned short ss;
+ unsigned short __ssh;
+};
+
+#else /* __i386__ */
+
+struct pt_regs {
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
+ unsigned long r15;
+ unsigned long r14;
+ unsigned long r13;
+ unsigned long r12;
+ unsigned long bp;
+ unsigned long bx;
+/* These regs are callee-clobbered. Always saved on kernel entry. */
+ unsigned long r11;
+ unsigned long r10;
+ unsigned long r9;
+ unsigned long r8;
+ unsigned long ax;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long si;
+ unsigned long di;
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+ unsigned long orig_ax;
+/* Return frame for iretq */
+ unsigned long ip;
+ unsigned long cs;
+ unsigned long flags;
+ unsigned long sp;
+ unsigned long ss;
+/* top of stack page */
+};
+
+#endif /* !__i386__ */
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt_types.h>
+#endif
+
+struct cpuinfo_x86;
+struct task_struct;
+
+extern unsigned long profile_pc(struct pt_regs *regs);
+#define profile_pc profile_pc
+
+extern unsigned long
+convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
+extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
+ int error_code, int si_code);
+
+
+static inline unsigned long regs_return_value(struct pt_regs *regs)
+{
+ return regs->ax;
+}
+
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
+{
+ regs->ax = rc;
+}
+
+/*
+ * user_mode(regs) determines whether a register set came from user
+ * mode. On x86_32, this is true if V8086 mode was enabled OR if the
+ * register set was from protected mode with RPL-3 CS value. This
+ * tricky test checks that with one comparison.
+ *
+ * On x86_64, vm86 mode is mercifully nonexistent, and we don't need
+ * the extra check.
+ */
+static inline int user_mode(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_32
+ return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= USER_RPL;
+#else
+ return !!(regs->cs & 3);
+#endif
+}
+
+static inline int v8086_mode(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_32
+ return (regs->flags & X86_VM_MASK);
+#else
+ return 0; /* No V86 mode support in long mode */
+#endif
+}
+
+static inline bool user_64bit_mode(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_64
+#ifndef CONFIG_PARAVIRT
+ /*
+ * On non-paravirt systems, this is the only long mode CPL 3
+ * selector. We do not allow long mode selectors in the LDT.
+ */
+ return regs->cs == __USER_CS;
+#else
+ /* Headers are too twisted for this to go in paravirt.h. */
+ return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs;
+#endif
+#else /* !CONFIG_X86_64 */
+ return false;
+#endif
+}
+
+#ifdef CONFIG_X86_64
+#define current_user_stack_pointer() current_pt_regs()->sp
+#define compat_user_stack_pointer() current_pt_regs()->sp
+#endif
+
+#ifdef CONFIG_X86_32
+extern unsigned long kernel_stack_pointer(struct pt_regs *regs);
+#else
+static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
+{
+ return regs->sp;
+}
+#endif
+
+#define GET_IP(regs) ((regs)->ip)
+#define GET_FP(regs) ((regs)->bp)
+#define GET_USP(regs) ((regs)->sp)
+
+#include <asm-generic/ptrace.h>
+
+/* Query offset/name of register from its name/offset */
+extern int regs_query_register_offset(const char *name);
+extern const char *regs_query_register_name(unsigned int offset);
+#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss))
+
+/**
+ * regs_get_register() - get register value from its offset
+ * @regs: pt_regs from which register value is gotten.
+ * @offset: offset number of the register.
+ *
+ * regs_get_register returns the value of a register. The @offset is the
+ * offset of the register in struct pt_regs address which specified by @regs.
+ * If @offset is bigger than MAX_REG_OFFSET, this returns 0.
+ */
+static inline unsigned long regs_get_register(struct pt_regs *regs,
+ unsigned int offset)
+{
+ if (unlikely(offset > MAX_REG_OFFSET))
+ return 0;
+#ifdef CONFIG_X86_32
+ /*
+ * Traps from the kernel do not save sp and ss.
+ * Use the helper function to retrieve sp.
+ */
+ if (offset == offsetof(struct pt_regs, sp) &&
+ regs->cs == __KERNEL_CS)
+ return kernel_stack_pointer(regs);
+
+ /* The selector fields are 16-bit. */
+ if (offset == offsetof(struct pt_regs, cs) ||
+ offset == offsetof(struct pt_regs, ss) ||
+ offset == offsetof(struct pt_regs, ds) ||
+ offset == offsetof(struct pt_regs, es) ||
+ offset == offsetof(struct pt_regs, fs) ||
+ offset == offsetof(struct pt_regs, gs)) {
+ return *(u16 *)((unsigned long)regs + offset);
+
+ }
+#endif
+ return *(unsigned long *)((unsigned long)regs + offset);
+}
+
+/**
+ * regs_within_kernel_stack() - check the address in the stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @addr: address which is checked.
+ *
+ * regs_within_kernel_stack() checks @addr is within the kernel stack page(s).
+ * If @addr is within the kernel stack, it returns true. If not, returns false.
+ */
+static inline int regs_within_kernel_stack(struct pt_regs *regs,
+ unsigned long addr)
+{
+ return ((addr & ~(THREAD_SIZE - 1)) ==
+ (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1)));
+}
+
+/**
+ * regs_get_kernel_stack_nth_addr() - get the address of the Nth entry on stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @n: stack entry number.
+ *
+ * regs_get_kernel_stack_nth() returns the address of the @n th entry of the
+ * kernel stack which is specified by @regs. If the @n th entry is NOT in
+ * the kernel stack, this returns NULL.
+ */
+static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs, unsigned int n)
+{
+ unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
+
+ addr += n;
+ if (regs_within_kernel_stack(regs, (unsigned long)addr))
+ return addr;
+ else
+ return NULL;
+}
+
+/* To avoid include hell, we can't include uaccess.h */
+extern long probe_kernel_read(void *dst, const void *src, size_t size);
+
+/**
+ * regs_get_kernel_stack_nth() - get Nth entry of the stack
+ * @regs: pt_regs which contains kernel stack pointer.
+ * @n: stack entry number.
+ *
+ * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which
+ * is specified by @regs. If the @n th entry is NOT in the kernel stack
+ * this returns 0.
+ */
+static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
+ unsigned int n)
+{
+ unsigned long *addr;
+ unsigned long val;
+ long ret;
+
+ addr = regs_get_kernel_stack_nth_addr(regs, n);
+ if (addr) {
+ ret = probe_kernel_read(&val, addr, sizeof(val));
+ if (!ret)
+ return val;
+ }
+ return 0;
+}
+
+#define arch_has_single_step() (1)
+#ifdef CONFIG_X86_DEBUGCTLMSR
+#define arch_has_block_step() (1)
+#else
+#define arch_has_block_step() (boot_cpu_data.x86 >= 6)
+#endif
+
+#define ARCH_HAS_USER_SINGLE_STEP_INFO
+
+/*
+ * When hitting ptrace_stop(), we cannot return using SYSRET because
+ * that does not restore the full CPU state, only a minimal set. The
+ * ptracer can change arbitrary register values, which is usually okay
+ * because the usual ptrace stops run off the signal delivery path which
+ * forces IRET; however, ptrace_event() stops happen in arbitrary places
+ * in the kernel and don't force IRET path.
+ *
+ * So force IRET path after a ptrace stop.
+ */
+#define arch_ptrace_stop_needed(code, info) \
+({ \
+ force_iret(); \
+ false; \
+})
+
+struct user_desc;
+extern int do_get_thread_area(struct task_struct *p, int idx,
+ struct user_desc __user *info);
+extern int do_set_thread_area(struct task_struct *p, int idx,
+ struct user_desc __user *info, int can_allocate);
+
+#endif /* !__ASSEMBLY__ */
+#endif /* _ASM_X86_PTRACE_H */
diff --git a/arch/x86/include/asm/purgatory.h b/arch/x86/include/asm/purgatory.h
new file mode 100644
index 000000000..92c34e517
--- /dev/null
+++ b/arch/x86/include/asm/purgatory.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PURGATORY_H
+#define _ASM_X86_PURGATORY_H
+
+#ifndef __ASSEMBLY__
+#include <linux/purgatory.h>
+
+extern void purgatory(void);
+/*
+ * These forward declarations serve two purposes:
+ *
+ * 1) Make sparse happy when checking arch/purgatory
+ * 2) Document that these are required to be global so the symbol
+ * lookup in kexec works
+ */
+extern unsigned long purgatory_backup_dest;
+extern unsigned long purgatory_backup_src;
+extern unsigned long purgatory_backup_sz;
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_PURGATORY_H */
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
new file mode 100644
index 000000000..1436226ef
--- /dev/null
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PVCLOCK_ABI_H
+#define _ASM_X86_PVCLOCK_ABI_H
+#ifndef __ASSEMBLY__
+
+/*
+ * These structs MUST NOT be changed.
+ * They are the ABI between hypervisor and guest OS.
+ * Both Xen and KVM are using this.
+ *
+ * pvclock_vcpu_time_info holds the system time and the tsc timestamp
+ * of the last update. So the guest can use the tsc delta to get a
+ * more precise system time. There is one per virtual cpu.
+ *
+ * pvclock_wall_clock references the point in time when the system
+ * time was zero (usually boot time), thus the guest calculates the
+ * current wall clock by adding the system time.
+ *
+ * Protocol for the "version" fields is: hypervisor raises it (making
+ * it uneven) before it starts updating the fields and raises it again
+ * (making it even) when it is done. Thus the guest can make sure the
+ * time values it got are consistent by checking the version before
+ * and after reading them.
+ */
+
+struct pvclock_vcpu_time_info {
+ u32 version;
+ u32 pad0;
+ u64 tsc_timestamp;
+ u64 system_time;
+ u32 tsc_to_system_mul;
+ s8 tsc_shift;
+ u8 flags;
+ u8 pad[2];
+} __attribute__((__packed__)); /* 32 bytes */
+
+struct pvclock_wall_clock {
+ u32 version;
+ u32 sec;
+ u32 nsec;
+} __attribute__((__packed__));
+
+#define PVCLOCK_TSC_STABLE_BIT (1 << 0)
+#define PVCLOCK_GUEST_STOPPED (1 << 1)
+/* PVCLOCK_COUNTS_FROM_ZERO broke ABI and can't be used anymore. */
+#define PVCLOCK_COUNTS_FROM_ZERO (1 << 2)
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
new file mode 100644
index 000000000..b6033680d
--- /dev/null
+++ b/arch/x86/include/asm/pvclock.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PVCLOCK_H
+#define _ASM_X86_PVCLOCK_H
+
+#include <linux/clocksource.h>
+#include <asm/pvclock-abi.h>
+
+/* some helper functions for xen and kvm pv clock sources */
+u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
+void pvclock_set_flags(u8 flags);
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
+void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
+ struct pvclock_vcpu_time_info *vcpu,
+ struct timespec64 *ts);
+void pvclock_resume(void);
+
+void pvclock_touch_watchdogs(void);
+
+static __always_inline
+unsigned pvclock_read_begin(const struct pvclock_vcpu_time_info *src)
+{
+ unsigned version = src->version & ~1;
+ /* Make sure that the version is read before the data. */
+ virt_rmb();
+ return version;
+}
+
+static __always_inline
+bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src,
+ unsigned version)
+{
+ /* Make sure that the version is re-read after the data. */
+ virt_rmb();
+ return unlikely(version != src->version);
+}
+
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+ u64 product;
+#ifdef __i386__
+ u32 tmp1, tmp2;
+#else
+ ulong tmp;
+#endif
+
+ if (shift < 0)
+ delta >>= -shift;
+ else
+ delta <<= shift;
+
+#ifdef __i386__
+ __asm__ (
+ "mul %5 ; "
+ "mov %4,%%eax ; "
+ "mov %%edx,%4 ; "
+ "mul %5 ; "
+ "xor %5,%5 ; "
+ "add %4,%%eax ; "
+ "adc %5,%%edx ; "
+ : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+ : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif defined(__x86_64__)
+ __asm__ (
+ "mulq %[mul_frac] ; shrd $32, %[hi], %[lo]"
+ : [lo]"=a"(product),
+ [hi]"=d"(tmp)
+ : "0"(delta),
+ [mul_frac]"rm"((u64)mul_frac));
+#else
+#error implement me!
+#endif
+
+ return product;
+}
+
+static __always_inline
+u64 __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, u64 tsc)
+{
+ u64 delta = tsc - src->tsc_timestamp;
+ u64 offset = pvclock_scale_delta(delta, src->tsc_to_system_mul,
+ src->tsc_shift);
+ return src->system_time + offset;
+}
+
+struct pvclock_vsyscall_time_info {
+ struct pvclock_vcpu_time_info pvti;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
+
+#ifdef CONFIG_PARAVIRT_CLOCK
+void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti);
+struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void);
+#else
+static inline struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void)
+{
+ return NULL;
+}
+#endif
+
+#endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h
new file mode 100644
index 000000000..8656b5a6e
--- /dev/null
+++ b/arch/x86/include/asm/qrwlock.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_QRWLOCK_H
+#define _ASM_X86_QRWLOCK_H
+
+#include <asm-generic/qrwlock_types.h>
+#include <asm-generic/qrwlock.h>
+
+#endif /* _ASM_X86_QRWLOCK_H */
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
new file mode 100644
index 000000000..055c60a05
--- /dev/null
+++ b/arch/x86/include/asm/qspinlock.h
@@ -0,0 +1,100 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_QSPINLOCK_H
+#define _ASM_X86_QSPINLOCK_H
+
+#include <linux/jump_label.h>
+#include <asm/cpufeature.h>
+#include <asm-generic/qspinlock_types.h>
+#include <asm/paravirt.h>
+#include <asm/rmwcc.h>
+
+#define _Q_PENDING_LOOPS (1 << 9)
+
+#define queued_fetch_set_pending_acquire queued_fetch_set_pending_acquire
+
+static __always_inline bool __queued_RMW_btsl(struct qspinlock *lock)
+{
+ GEN_BINARY_RMWcc(LOCK_PREFIX "btsl", lock->val.counter,
+ "I", _Q_PENDING_OFFSET, "%0", c);
+}
+
+static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
+{
+ u32 val = 0;
+
+ if (__queued_RMW_btsl(lock))
+ val |= _Q_PENDING_VAL;
+
+ val |= atomic_read(&lock->val) & ~_Q_PENDING_MASK;
+
+ return val;
+}
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_init_lock_hash(void);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
+
+#define queued_spin_unlock queued_spin_unlock
+/**
+ * queued_spin_unlock - release a queued spinlock
+ * @lock : Pointer to queued spinlock structure
+ *
+ * A smp_store_release() on the least-significant byte.
+ */
+static inline void native_queued_spin_unlock(struct qspinlock *lock)
+{
+ smp_store_release(&lock->locked, 0);
+}
+
+static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+ pv_queued_spin_lock_slowpath(lock, val);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+ pv_queued_spin_unlock(lock);
+}
+
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(long cpu)
+{
+ return pv_vcpu_is_preempted(cpu);
+}
+#endif
+
+#ifdef CONFIG_PARAVIRT
+DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key);
+
+void native_pv_lock_init(void) __init;
+
+#define virt_spin_lock virt_spin_lock
+static inline bool virt_spin_lock(struct qspinlock *lock)
+{
+ if (!static_branch_likely(&virt_spin_lock_key))
+ return false;
+
+ /*
+ * On hypervisors without PARAVIRT_SPINLOCKS support we fall
+ * back to a Test-and-Set spinlock, because fair locks have
+ * horrible lock 'holder' preemption issues.
+ */
+
+ do {
+ while (atomic_read(&lock->val) != 0)
+ cpu_relax();
+ } while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0);
+
+ return true;
+}
+#else
+static inline void native_pv_lock_init(void)
+{
+}
+#endif /* CONFIG_PARAVIRT */
+
+#include <asm-generic/qspinlock.h>
+
+#endif /* _ASM_X86_QSPINLOCK_H */
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
new file mode 100644
index 000000000..159622ee0
--- /dev/null
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_QSPINLOCK_PARAVIRT_H
+#define __ASM_QSPINLOCK_PARAVIRT_H
+
+/*
+ * For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit
+ * registers. For i386, however, only 1 32-bit register needs to be saved
+ * and restored. So an optimized version of __pv_queued_spin_unlock() is
+ * hand-coded for 64-bit, but it isn't worthwhile to do it for 32-bit.
+ */
+#ifdef CONFIG_64BIT
+
+PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath);
+#define __pv_queued_spin_unlock __pv_queued_spin_unlock
+#define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock"
+#define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath"
+
+/*
+ * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock
+ * which combines the registers saving trunk and the body of the following
+ * C code:
+ *
+ * void __pv_queued_spin_unlock(struct qspinlock *lock)
+ * {
+ * u8 lockval = cmpxchg(&lock->locked, _Q_LOCKED_VAL, 0);
+ *
+ * if (likely(lockval == _Q_LOCKED_VAL))
+ * return;
+ * pv_queued_spin_unlock_slowpath(lock, lockval);
+ * }
+ *
+ * For x86-64,
+ * rdi = lock (first argument)
+ * rsi = lockval (second argument)
+ * rdx = internal variable (set to 0)
+ */
+asm (".pushsection .text;"
+ ".globl " PV_UNLOCK ";"
+ ".type " PV_UNLOCK ", @function;"
+ ".align 4,0x90;"
+ PV_UNLOCK ": "
+ FRAME_BEGIN
+ "push %rdx;"
+ "mov $0x1,%eax;"
+ "xor %edx,%edx;"
+ LOCK_PREFIX "cmpxchg %dl,(%rdi);"
+ "cmp $0x1,%al;"
+ "jne .slowpath;"
+ "pop %rdx;"
+ FRAME_END
+ "ret;"
+ ".slowpath: "
+ "push %rsi;"
+ "movzbl %al,%esi;"
+ "call " PV_UNLOCK_SLOWPATH ";"
+ "pop %rsi;"
+ "pop %rdx;"
+ FRAME_END
+ "ret;"
+ ".size " PV_UNLOCK ", .-" PV_UNLOCK ";"
+ ".popsection");
+
+#else /* CONFIG_64BIT */
+
+extern void __pv_queued_spin_unlock(struct qspinlock *lock);
+PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock);
+
+#endif /* CONFIG_64BIT */
+#endif
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
new file mode 100644
index 000000000..63b3393bd
--- /dev/null
+++ b/arch/x86/include/asm/realmode.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ARCH_X86_REALMODE_H
+#define _ARCH_X86_REALMODE_H
+
+/*
+ * Flag bit definitions for use with the flags field of the trampoline header
+ * in the CONFIG_X86_64 variant.
+ */
+#define TH_FLAGS_SME_ACTIVE_BIT 0
+#define TH_FLAGS_SME_ACTIVE BIT(TH_FLAGS_SME_ACTIVE_BIT)
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+#include <asm/io.h>
+
+/* This must match data at realmode.S */
+struct real_mode_header {
+ u32 text_start;
+ u32 ro_end;
+ /* SMP trampoline */
+ u32 trampoline_start;
+ u32 trampoline_status;
+ u32 trampoline_header;
+#ifdef CONFIG_X86_64
+ u32 trampoline_pgd;
+#endif
+ /* ACPI S3 wakeup */
+#ifdef CONFIG_ACPI_SLEEP
+ u32 wakeup_start;
+ u32 wakeup_header;
+#endif
+ /* APM/BIOS reboot */
+ u32 machine_real_restart_asm;
+#ifdef CONFIG_X86_64
+ u32 machine_real_restart_seg;
+#endif
+};
+
+/* This must match data at trampoline_32/64.S */
+struct trampoline_header {
+#ifdef CONFIG_X86_32
+ u32 start;
+ u16 gdt_pad;
+ u16 gdt_limit;
+ u32 gdt_base;
+#else
+ u64 start;
+ u64 efer;
+ u32 cr4;
+ u32 flags;
+#endif
+};
+
+extern struct real_mode_header *real_mode_header;
+extern unsigned char real_mode_blob_end[];
+
+extern unsigned long initial_code;
+extern unsigned long initial_gs;
+extern unsigned long initial_stack;
+
+extern unsigned char real_mode_blob[];
+extern unsigned char real_mode_relocs[];
+
+#ifdef CONFIG_X86_32
+extern unsigned char startup_32_smp[];
+extern unsigned char boot_gdt[];
+#else
+extern unsigned char secondary_startup_64[];
+#endif
+
+static inline size_t real_mode_size_needed(void)
+{
+ if (real_mode_header)
+ return 0; /* already allocated. */
+
+ return ALIGN(real_mode_blob_end - real_mode_blob, PAGE_SIZE);
+}
+
+void set_real_mode_mem(phys_addr_t mem, size_t size);
+void reserve_real_mode(void);
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
new file mode 100644
index 000000000..a671a1145
--- /dev/null
+++ b/arch/x86/include/asm/reboot.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_REBOOT_H
+#define _ASM_X86_REBOOT_H
+
+#include <linux/kdebug.h>
+
+struct pt_regs;
+
+struct machine_ops {
+ void (*restart)(char *cmd);
+ void (*halt)(void);
+ void (*power_off)(void);
+ void (*shutdown)(void);
+ void (*crash_shutdown)(struct pt_regs *);
+ void (*emergency_restart)(void);
+};
+
+extern struct machine_ops machine_ops;
+extern int crashing_cpu;
+
+void native_machine_crash_shutdown(struct pt_regs *regs);
+void native_machine_shutdown(void);
+void __noreturn machine_real_restart(unsigned int type);
+/* These must match dispatch in arch/x86/realmore/rm/reboot.S */
+#define MRR_BIOS 0
+#define MRR_APM 1
+
+typedef void (*nmi_shootdown_cb)(int, struct pt_regs*);
+void nmi_shootdown_cpus(nmi_shootdown_cb callback);
+void run_crash_ipi_callback(struct pt_regs *regs);
+
+#endif /* _ASM_X86_REBOOT_H */
diff --git a/arch/x86/include/asm/reboot_fixups.h b/arch/x86/include/asm/reboot_fixups.h
new file mode 100644
index 000000000..96515658c
--- /dev/null
+++ b/arch/x86/include/asm/reboot_fixups.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_REBOOT_FIXUPS_H
+#define _ASM_X86_REBOOT_FIXUPS_H
+
+extern void mach_reboot_fixups(void);
+
+#endif /* _ASM_X86_REBOOT_FIXUPS_H */
diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h
new file mode 100644
index 000000000..19b905219
--- /dev/null
+++ b/arch/x86/include/asm/refcount.h
@@ -0,0 +1,110 @@
+#ifndef __ASM_X86_REFCOUNT_H
+#define __ASM_X86_REFCOUNT_H
+/*
+ * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from
+ * PaX/grsecurity.
+ */
+#include <linux/refcount.h>
+#include <asm/bug.h>
+
+/*
+ * This is the first portion of the refcount error handling, which lives in
+ * .text.unlikely, and is jumped to from the CPU flag check (in the
+ * following macros). This saves the refcount value location into CX for
+ * the exception handler to use (in mm/extable.c), and then triggers the
+ * central refcount exception. The fixup address for the exception points
+ * back to the regular execution flow in .text.
+ */
+#define _REFCOUNT_EXCEPTION \
+ ".pushsection .text..refcount\n" \
+ "111:\tlea %[counter], %%" _ASM_CX "\n" \
+ "112:\t" ASM_UD2 "\n" \
+ ASM_UNREACHABLE \
+ ".popsection\n" \
+ "113:\n" \
+ _ASM_EXTABLE_REFCOUNT(112b, 113b)
+
+/* Trigger refcount exception if refcount result is negative. */
+#define REFCOUNT_CHECK_LT_ZERO \
+ "js 111f\n\t" \
+ _REFCOUNT_EXCEPTION
+
+/* Trigger refcount exception if refcount result is zero or negative. */
+#define REFCOUNT_CHECK_LE_ZERO \
+ "jz 111f\n\t" \
+ REFCOUNT_CHECK_LT_ZERO
+
+/* Trigger refcount exception unconditionally. */
+#define REFCOUNT_ERROR \
+ "jmp 111f\n\t" \
+ _REFCOUNT_EXCEPTION
+
+static __always_inline void refcount_add(unsigned int i, refcount_t *r)
+{
+ asm volatile(LOCK_PREFIX "addl %1,%0\n\t"
+ REFCOUNT_CHECK_LT_ZERO
+ : [counter] "+m" (r->refs.counter)
+ : "ir" (i)
+ : "cc", "cx");
+}
+
+static __always_inline void refcount_inc(refcount_t *r)
+{
+ asm volatile(LOCK_PREFIX "incl %0\n\t"
+ REFCOUNT_CHECK_LT_ZERO
+ : [counter] "+m" (r->refs.counter)
+ : : "cc", "cx");
+}
+
+static __always_inline void refcount_dec(refcount_t *r)
+{
+ asm volatile(LOCK_PREFIX "decl %0\n\t"
+ REFCOUNT_CHECK_LE_ZERO
+ : [counter] "+m" (r->refs.counter)
+ : : "cc", "cx");
+}
+
+static __always_inline __must_check
+bool refcount_sub_and_test(unsigned int i, refcount_t *r)
+{
+ GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO,
+ r->refs.counter, "er", i, "%0", e, "cx");
+}
+
+static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r)
+{
+ GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO,
+ r->refs.counter, "%0", e, "cx");
+}
+
+static __always_inline __must_check
+bool refcount_add_not_zero(unsigned int i, refcount_t *r)
+{
+ int c, result;
+
+ c = atomic_read(&(r->refs));
+ do {
+ if (unlikely(c == 0))
+ return false;
+
+ result = c + i;
+
+ /* Did we try to increment from/to an undesirable state? */
+ if (unlikely(c < 0 || c == INT_MAX || result < c)) {
+ asm volatile(REFCOUNT_ERROR
+ : : [counter] "m" (r->refs.counter)
+ : "cc", "cx");
+ break;
+ }
+
+ } while (!atomic_try_cmpxchg(&(r->refs), &c, result));
+
+ return c != 0;
+}
+
+static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r)
+{
+ return refcount_add_not_zero(1, r);
+}
+
+#endif
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
new file mode 100644
index 000000000..6847d8540
--- /dev/null
+++ b/arch/x86/include/asm/required-features.h
@@ -0,0 +1,106 @@
+#ifndef _ASM_X86_REQUIRED_FEATURES_H
+#define _ASM_X86_REQUIRED_FEATURES_H
+
+/* Define minimum CPUID feature set for kernel These bits are checked
+ really early to actually display a visible error message before the
+ kernel dies. Make sure to assign features to the proper mask!
+
+ Some requirements that are not in CPUID yet are also in the
+ CONFIG_X86_MINIMUM_CPU_FAMILY which is checked too.
+
+ The real information is in arch/x86/Kconfig.cpu, this just converts
+ the CONFIGs into a bitmask */
+
+#ifndef CONFIG_MATH_EMULATION
+# define NEED_FPU (1<<(X86_FEATURE_FPU & 31))
+#else
+# define NEED_FPU 0
+#endif
+
+#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
+# define NEED_PAE (1<<(X86_FEATURE_PAE & 31))
+#else
+# define NEED_PAE 0
+#endif
+
+#ifdef CONFIG_X86_CMPXCHG64
+# define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31))
+#else
+# define NEED_CX8 0
+#endif
+
+#if defined(CONFIG_X86_CMOV) || defined(CONFIG_X86_64)
+# define NEED_CMOV (1<<(X86_FEATURE_CMOV & 31))
+#else
+# define NEED_CMOV 0
+#endif
+
+#ifdef CONFIG_X86_USE_3DNOW
+# define NEED_3DNOW (1<<(X86_FEATURE_3DNOW & 31))
+#else
+# define NEED_3DNOW 0
+#endif
+
+#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64)
+# define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31))
+#else
+# define NEED_NOPL 0
+#endif
+
+#ifdef CONFIG_MATOM
+# define NEED_MOVBE (1<<(X86_FEATURE_MOVBE & 31))
+#else
+# define NEED_MOVBE 0
+#endif
+
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_PARAVIRT
+/* Paravirtualized systems may not have PSE or PGE available */
+#define NEED_PSE 0
+#define NEED_PGE 0
+#else
+#define NEED_PSE (1<<(X86_FEATURE_PSE) & 31)
+#define NEED_PGE (1<<(X86_FEATURE_PGE) & 31)
+#endif
+#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
+#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31))
+#define NEED_XMM (1<<(X86_FEATURE_XMM & 31))
+#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31))
+#define NEED_LM (1<<(X86_FEATURE_LM & 31))
+#else
+#define NEED_PSE 0
+#define NEED_MSR 0
+#define NEED_PGE 0
+#define NEED_FXSR 0
+#define NEED_XMM 0
+#define NEED_XMM2 0
+#define NEED_LM 0
+#endif
+
+#define REQUIRED_MASK0 (NEED_FPU|NEED_PSE|NEED_MSR|NEED_PAE|\
+ NEED_CX8|NEED_PGE|NEED_FXSR|NEED_CMOV|\
+ NEED_XMM|NEED_XMM2)
+#define SSE_MASK (NEED_XMM|NEED_XMM2)
+
+#define REQUIRED_MASK1 (NEED_LM|NEED_3DNOW)
+
+#define REQUIRED_MASK2 0
+#define REQUIRED_MASK3 (NEED_NOPL)
+#define REQUIRED_MASK4 (NEED_MOVBE)
+#define REQUIRED_MASK5 0
+#define REQUIRED_MASK6 0
+#define REQUIRED_MASK7 0
+#define REQUIRED_MASK8 0
+#define REQUIRED_MASK9 0
+#define REQUIRED_MASK10 0
+#define REQUIRED_MASK11 0
+#define REQUIRED_MASK12 0
+#define REQUIRED_MASK13 0
+#define REQUIRED_MASK14 0
+#define REQUIRED_MASK15 0
+#define REQUIRED_MASK16 0
+#define REQUIRED_MASK17 0
+#define REQUIRED_MASK18 0
+#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
+
+#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/rio.h b/arch/x86/include/asm/rio.h
new file mode 100644
index 000000000..0a21986d2
--- /dev/null
+++ b/arch/x86/include/asm/rio.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Derived from include/asm-x86/mach-summit/mach_mpparse.h
+ * and include/asm-x86/mach-default/bios_ebda.h
+ *
+ * Author: Laurent Vivier <Laurent.Vivier@bull.net>
+ */
+
+#ifndef _ASM_X86_RIO_H
+#define _ASM_X86_RIO_H
+
+#define RIO_TABLE_VERSION 3
+
+struct rio_table_hdr {
+ u8 version; /* Version number of this data structure */
+ u8 num_scal_dev; /* # of Scalability devices */
+ u8 num_rio_dev; /* # of RIO I/O devices */
+} __attribute__((packed));
+
+struct scal_detail {
+ u8 node_id; /* Scalability Node ID */
+ u32 CBAR; /* Address of 1MB register space */
+ u8 port0node; /* Node ID port connected to: 0xFF=None */
+ u8 port0port; /* Port num port connected to: 0,1,2, or */
+ /* 0xFF=None */
+ u8 port1node; /* Node ID port connected to: 0xFF = None */
+ u8 port1port; /* Port num port connected to: 0,1,2, or */
+ /* 0xFF=None */
+ u8 port2node; /* Node ID port connected to: 0xFF = None */
+ u8 port2port; /* Port num port connected to: 0,1,2, or */
+ /* 0xFF=None */
+ u8 chassis_num; /* 1 based Chassis number (1 = boot node) */
+} __attribute__((packed));
+
+struct rio_detail {
+ u8 node_id; /* RIO Node ID */
+ u32 BBAR; /* Address of 1MB register space */
+ u8 type; /* Type of device */
+ u8 owner_id; /* Node ID of Hurricane that owns this */
+ /* node */
+ u8 port0node; /* Node ID port connected to: 0xFF=None */
+ u8 port0port; /* Port num port connected to: 0,1,2, or */
+ /* 0xFF=None */
+ u8 port1node; /* Node ID port connected to: 0xFF=None */
+ u8 port1port; /* Port num port connected to: 0,1,2, or */
+ /* 0xFF=None */
+ u8 first_slot; /* Lowest slot number below this Calgary */
+ u8 status; /* Bit 0 = 1 : the XAPIC is used */
+ /* = 0 : the XAPIC is not used, ie: */
+ /* ints fwded to another XAPIC */
+ /* Bits1:7 Reserved */
+ u8 WP_index; /* instance index - lower ones have */
+ /* lower slot numbers/PCI bus numbers */
+ u8 chassis_num; /* 1 based Chassis number */
+} __attribute__((packed));
+
+enum {
+ HURR_SCALABILTY = 0, /* Hurricane Scalability info */
+ HURR_RIOIB = 2, /* Hurricane RIOIB info */
+ COMPAT_CALGARY = 4, /* Compatibility Calgary */
+ ALT_CALGARY = 5, /* Second Planar Calgary */
+};
+
+#endif /* _ASM_X86_RIO_H */
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
new file mode 100644
index 000000000..033dc7ca4
--- /dev/null
+++ b/arch/x86/include/asm/rmwcc.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_RMWcc
+#define _ASM_X86_RMWcc
+
+#define __CLOBBERS_MEM(clb...) "memory", ## clb
+
+#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CONFIG_CC_HAS_ASM_GOTO)
+
+/* Use asm goto */
+
+#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \
+do { \
+ asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \
+ : : [counter] "m" (var), ## __VA_ARGS__ \
+ : clobbers : cc_label); \
+ return 0; \
+cc_label: \
+ return 1; \
+} while (0)
+
+#define __BINARY_RMWcc_ARG " %1, "
+
+
+#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CONFIG_CC_HAS_ASM_GOTO) */
+
+/* Use flags output or a set instruction */
+
+#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \
+do { \
+ bool c; \
+ asm volatile (fullop CC_SET(cc) \
+ : [counter] "+m" (var), CC_OUT(cc) (c) \
+ : __VA_ARGS__ : clobbers); \
+ return c; \
+} while (0)
+
+#define __BINARY_RMWcc_ARG " %2, "
+
+#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CONFIG_CC_HAS_ASM_GOTO) */
+
+#define GEN_UNARY_RMWcc(op, var, arg0, cc) \
+ __GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM())
+
+#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc, clobbers...)\
+ __GEN_RMWcc(op " " arg0 "\n\t" suffix, var, cc, \
+ __CLOBBERS_MEM(clobbers))
+
+#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \
+ __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0, var, cc, \
+ __CLOBBERS_MEM(), vcon (val))
+
+#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc, \
+ clobbers...) \
+ __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0 "\n\t" suffix, var, cc, \
+ __CLOBBERS_MEM(clobbers), vcon (val))
+
+#endif /* _ASM_X86_RMWcc */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
new file mode 100644
index 000000000..4c25cf6ca
--- /dev/null
+++ b/arch/x86/include/asm/rwsem.h
@@ -0,0 +1,237 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* rwsem.h: R/W semaphores implemented using XADD/CMPXCHG for i486+
+ *
+ * Written by David Howells (dhowells@redhat.com).
+ *
+ * Derived from asm-x86/semaphore.h
+ *
+ *
+ * The MSW of the count is the negated number of active writers and waiting
+ * lockers, and the LSW is the total number of active locks
+ *
+ * The lock count is initialized to 0 (no active and no waiting lockers).
+ *
+ * When a writer subtracts WRITE_BIAS, it'll get 0xffff0001 for the case of an
+ * uncontended lock. This can be determined because XADD returns the old value.
+ * Readers increment by 1 and see a positive value when uncontended, negative
+ * if there are writers (and maybe) readers waiting (in which case it goes to
+ * sleep).
+ *
+ * The value of WAITING_BIAS supports up to 32766 waiting processes. This can
+ * be extended to 65534 by manually checking the whole MSW rather than relying
+ * on the S flag.
+ *
+ * The value of ACTIVE_BIAS supports up to 65535 active processes.
+ *
+ * This should be totally fair - if anything is waiting, a process that wants a
+ * lock will go to the back of the queue. When the currently active lock is
+ * released, if there's a writer at the front of the queue, then that and only
+ * that will be woken up; if there's a bunch of consecutive readers at the
+ * front, then they'll all be woken up, but no other readers will be.
+ */
+
+#ifndef _ASM_X86_RWSEM_H
+#define _ASM_X86_RWSEM_H
+
+#ifndef _LINUX_RWSEM_H
+#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead"
+#endif
+
+#ifdef __KERNEL__
+#include <asm/asm.h>
+
+/*
+ * The bias values and the counter type limits the number of
+ * potential readers/writers to 32767 for 32 bits and 2147483647
+ * for 64 bits.
+ */
+
+#ifdef CONFIG_X86_64
+# define RWSEM_ACTIVE_MASK 0xffffffffL
+#else
+# define RWSEM_ACTIVE_MASK 0x0000ffffL
+#endif
+
+#define RWSEM_UNLOCKED_VALUE 0x00000000L
+#define RWSEM_ACTIVE_BIAS 0x00000001L
+#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1)
+#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
+#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
+
+/*
+ * lock for reading
+ */
+#define ____down_read(sem, slow_path) \
+({ \
+ struct rw_semaphore* ret; \
+ asm volatile("# beginning down_read\n\t" \
+ LOCK_PREFIX _ASM_INC "(%[sem])\n\t" \
+ /* adds 0x00000001 */ \
+ " jns 1f\n" \
+ " call " slow_path "\n" \
+ "1:\n\t" \
+ "# ending down_read\n\t" \
+ : "+m" (sem->count), "=a" (ret), \
+ ASM_CALL_CONSTRAINT \
+ : [sem] "a" (sem) \
+ : "memory", "cc"); \
+ ret; \
+})
+
+static inline void __down_read(struct rw_semaphore *sem)
+{
+ ____down_read(sem, "call_rwsem_down_read_failed");
+}
+
+static inline int __down_read_killable(struct rw_semaphore *sem)
+{
+ if (IS_ERR(____down_read(sem, "call_rwsem_down_read_failed_killable")))
+ return -EINTR;
+ return 0;
+}
+
+/*
+ * trylock for reading -- returns 1 if successful, 0 if contention
+ */
+static inline bool __down_read_trylock(struct rw_semaphore *sem)
+{
+ long result, tmp;
+ asm volatile("# beginning __down_read_trylock\n\t"
+ " mov %[count],%[result]\n\t"
+ "1:\n\t"
+ " mov %[result],%[tmp]\n\t"
+ " add %[inc],%[tmp]\n\t"
+ " jle 2f\n\t"
+ LOCK_PREFIX " cmpxchg %[tmp],%[count]\n\t"
+ " jnz 1b\n\t"
+ "2:\n\t"
+ "# ending __down_read_trylock\n\t"
+ : [count] "+m" (sem->count), [result] "=&a" (result),
+ [tmp] "=&r" (tmp)
+ : [inc] "i" (RWSEM_ACTIVE_READ_BIAS)
+ : "memory", "cc");
+ return result >= 0;
+}
+
+/*
+ * lock for writing
+ */
+#define ____down_write(sem, slow_path) \
+({ \
+ long tmp; \
+ struct rw_semaphore* ret; \
+ \
+ asm volatile("# beginning down_write\n\t" \
+ LOCK_PREFIX " xadd %[tmp],(%[sem])\n\t" \
+ /* adds 0xffff0001, returns the old value */ \
+ " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" \
+ /* was the active mask 0 before? */\
+ " jz 1f\n" \
+ " call " slow_path "\n" \
+ "1:\n" \
+ "# ending down_write" \
+ : "+m" (sem->count), [tmp] "=d" (tmp), \
+ "=a" (ret), ASM_CALL_CONSTRAINT \
+ : [sem] "a" (sem), "[tmp]" (RWSEM_ACTIVE_WRITE_BIAS) \
+ : "memory", "cc"); \
+ ret; \
+})
+
+static inline void __down_write(struct rw_semaphore *sem)
+{
+ ____down_write(sem, "call_rwsem_down_write_failed");
+}
+
+static inline int __down_write_killable(struct rw_semaphore *sem)
+{
+ if (IS_ERR(____down_write(sem, "call_rwsem_down_write_failed_killable")))
+ return -EINTR;
+
+ return 0;
+}
+
+/*
+ * trylock for writing -- returns 1 if successful, 0 if contention
+ */
+static inline bool __down_write_trylock(struct rw_semaphore *sem)
+{
+ bool result;
+ long tmp0, tmp1;
+ asm volatile("# beginning __down_write_trylock\n\t"
+ " mov %[count],%[tmp0]\n\t"
+ "1:\n\t"
+ " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t"
+ /* was the active mask 0 before? */
+ " jnz 2f\n\t"
+ " mov %[tmp0],%[tmp1]\n\t"
+ " add %[inc],%[tmp1]\n\t"
+ LOCK_PREFIX " cmpxchg %[tmp1],%[count]\n\t"
+ " jnz 1b\n\t"
+ "2:\n\t"
+ CC_SET(e)
+ "# ending __down_write_trylock\n\t"
+ : [count] "+m" (sem->count), [tmp0] "=&a" (tmp0),
+ [tmp1] "=&r" (tmp1), CC_OUT(e) (result)
+ : [inc] "er" (RWSEM_ACTIVE_WRITE_BIAS)
+ : "memory");
+ return result;
+}
+
+/*
+ * unlock after reading
+ */
+static inline void __up_read(struct rw_semaphore *sem)
+{
+ long tmp;
+ asm volatile("# beginning __up_read\n\t"
+ LOCK_PREFIX " xadd %[tmp],(%[sem])\n\t"
+ /* subtracts 1, returns the old value */
+ " jns 1f\n\t"
+ " call call_rwsem_wake\n" /* expects old value in %edx */
+ "1:\n"
+ "# ending __up_read\n"
+ : "+m" (sem->count), [tmp] "=d" (tmp)
+ : [sem] "a" (sem), "[tmp]" (-RWSEM_ACTIVE_READ_BIAS)
+ : "memory", "cc");
+}
+
+/*
+ * unlock after writing
+ */
+static inline void __up_write(struct rw_semaphore *sem)
+{
+ long tmp;
+ asm volatile("# beginning __up_write\n\t"
+ LOCK_PREFIX " xadd %[tmp],(%[sem])\n\t"
+ /* subtracts 0xffff0001, returns the old value */
+ " jns 1f\n\t"
+ " call call_rwsem_wake\n" /* expects old value in %edx */
+ "1:\n\t"
+ "# ending __up_write\n"
+ : "+m" (sem->count), [tmp] "=d" (tmp)
+ : [sem] "a" (sem), "[tmp]" (-RWSEM_ACTIVE_WRITE_BIAS)
+ : "memory", "cc");
+}
+
+/*
+ * downgrade write lock to read lock
+ */
+static inline void __downgrade_write(struct rw_semaphore *sem)
+{
+ asm volatile("# beginning __downgrade_write\n\t"
+ LOCK_PREFIX _ASM_ADD "%[inc],(%[sem])\n\t"
+ /*
+ * transitions 0xZZZZ0001 -> 0xYYYY0001 (i386)
+ * 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 (x86_64)
+ */
+ " jns 1f\n\t"
+ " call call_rwsem_downgrade_wake\n"
+ "1:\n\t"
+ "# ending __downgrade_write\n"
+ : "+m" (sem->count)
+ : [sem] "a" (sem), [inc] "er" (-RWSEM_WAITING_BIAS)
+ : "memory", "cc");
+}
+
+#endif /* __KERNEL__ */
+#endif /* _ASM_X86_RWSEM_H */
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
new file mode 100644
index 000000000..2bd1338de
--- /dev/null
+++ b/arch/x86/include/asm/seccomp.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SECCOMP_H
+#define _ASM_X86_SECCOMP_H
+
+#include <asm/unistd.h>
+
+#ifdef CONFIG_X86_32
+#define __NR_seccomp_sigreturn __NR_sigreturn
+#endif
+
+#ifdef CONFIG_COMPAT
+#include <asm/ia32_unistd.h>
+#define __NR_seccomp_read_32 __NR_ia32_read
+#define __NR_seccomp_write_32 __NR_ia32_write
+#define __NR_seccomp_exit_32 __NR_ia32_exit
+#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
+#endif
+
+#include <asm-generic/seccomp.h>
+
+#endif /* _ASM_X86_SECCOMP_H */
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
new file mode 100644
index 000000000..4a911a382
--- /dev/null
+++ b/arch/x86/include/asm/sections.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SECTIONS_H
+#define _ASM_X86_SECTIONS_H
+
+#include <asm-generic/sections.h>
+#include <asm/extable.h>
+
+extern char __brk_base[], __brk_limit[];
+extern struct exception_table_entry __stop___ex_table[];
+extern char __end_rodata_aligned[];
+
+#if defined(CONFIG_X86_64)
+extern char __end_rodata_hpage_align[];
+extern char __entry_trampoline_start[], __entry_trampoline_end[];
+#endif
+
+#endif /* _ASM_X86_SECTIONS_H */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
new file mode 100644
index 000000000..e293c122d
--- /dev/null
+++ b/arch/x86/include/asm/segment.h
@@ -0,0 +1,340 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SEGMENT_H
+#define _ASM_X86_SEGMENT_H
+
+#include <linux/const.h>
+#include <asm/alternative.h>
+
+/*
+ * Constructor for a conventional segment GDT (or LDT) entry.
+ * This is a macro so it can be used in initializers.
+ */
+#define GDT_ENTRY(flags, base, limit) \
+ ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \
+ (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \
+ (((limit) & _AC(0x000f0000,ULL)) << (48-16)) | \
+ (((base) & _AC(0x00ffffff,ULL)) << 16) | \
+ (((limit) & _AC(0x0000ffff,ULL))))
+
+/* Simple and small GDT entries for booting only: */
+
+#define GDT_ENTRY_BOOT_CS 2
+#define GDT_ENTRY_BOOT_DS 3
+#define GDT_ENTRY_BOOT_TSS 4
+#define __BOOT_CS (GDT_ENTRY_BOOT_CS*8)
+#define __BOOT_DS (GDT_ENTRY_BOOT_DS*8)
+#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS*8)
+
+/*
+ * Bottom two bits of selector give the ring
+ * privilege level
+ */
+#define SEGMENT_RPL_MASK 0x3
+
+/* User mode is privilege level 3: */
+#define USER_RPL 0x3
+
+/* Bit 2 is Table Indicator (TI): selects between LDT or GDT */
+#define SEGMENT_TI_MASK 0x4
+/* LDT segment has TI set ... */
+#define SEGMENT_LDT 0x4
+/* ... GDT has it cleared */
+#define SEGMENT_GDT 0x0
+
+#define GDT_ENTRY_INVALID_SEG 0
+
+#ifdef CONFIG_X86_32
+/*
+ * The layout of the per-CPU GDT under Linux:
+ *
+ * 0 - null <=== cacheline #1
+ * 1 - reserved
+ * 2 - reserved
+ * 3 - reserved
+ *
+ * 4 - unused <=== cacheline #2
+ * 5 - unused
+ *
+ * ------- start of TLS (Thread-Local Storage) segments:
+ *
+ * 6 - TLS segment #1 [ glibc's TLS segment ]
+ * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
+ * 8 - TLS segment #3 <=== cacheline #3
+ * 9 - reserved
+ * 10 - reserved
+ * 11 - reserved
+ *
+ * ------- start of kernel segments:
+ *
+ * 12 - kernel code segment <=== cacheline #4
+ * 13 - kernel data segment
+ * 14 - default user CS
+ * 15 - default user DS
+ * 16 - TSS <=== cacheline #5
+ * 17 - LDT
+ * 18 - PNPBIOS support (16->32 gate)
+ * 19 - PNPBIOS support
+ * 20 - PNPBIOS support <=== cacheline #6
+ * 21 - PNPBIOS support
+ * 22 - PNPBIOS support
+ * 23 - APM BIOS support
+ * 24 - APM BIOS support <=== cacheline #7
+ * 25 - APM BIOS support
+ *
+ * 26 - ESPFIX small SS
+ * 27 - per-cpu [ offset to per-cpu data area ]
+ * 28 - stack_canary-20 [ for stack protector ] <=== cacheline #8
+ * 29 - unused
+ * 30 - unused
+ * 31 - TSS for double fault handler
+ */
+#define GDT_ENTRY_TLS_MIN 6
+#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
+
+#define GDT_ENTRY_KERNEL_CS 12
+#define GDT_ENTRY_KERNEL_DS 13
+#define GDT_ENTRY_DEFAULT_USER_CS 14
+#define GDT_ENTRY_DEFAULT_USER_DS 15
+#define GDT_ENTRY_TSS 16
+#define GDT_ENTRY_LDT 17
+#define GDT_ENTRY_PNPBIOS_CS32 18
+#define GDT_ENTRY_PNPBIOS_CS16 19
+#define GDT_ENTRY_PNPBIOS_DS 20
+#define GDT_ENTRY_PNPBIOS_TS1 21
+#define GDT_ENTRY_PNPBIOS_TS2 22
+#define GDT_ENTRY_APMBIOS_BASE 23
+
+#define GDT_ENTRY_ESPFIX_SS 26
+#define GDT_ENTRY_PERCPU 27
+#define GDT_ENTRY_STACK_CANARY 28
+
+#define GDT_ENTRY_DOUBLEFAULT_TSS 31
+
+/*
+ * Number of entries in the GDT table:
+ */
+#define GDT_ENTRIES 32
+
+/*
+ * Segment selector values corresponding to the above entries:
+ */
+
+#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
+#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
+#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
+#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
+#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8)
+
+/* segment for calling fn: */
+#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32*8)
+/* code segment for BIOS: */
+#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16*8)
+
+/* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" */
+#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == PNP_CS32)
+
+/* data segment for BIOS: */
+#define PNP_DS (GDT_ENTRY_PNPBIOS_DS*8)
+/* transfer data segment: */
+#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1*8)
+/* another data segment: */
+#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2*8)
+
+#ifdef CONFIG_SMP
+# define __KERNEL_PERCPU (GDT_ENTRY_PERCPU*8)
+#else
+# define __KERNEL_PERCPU 0
+#endif
+
+#ifdef CONFIG_STACKPROTECTOR
+# define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8)
+#else
+# define __KERNEL_STACK_CANARY 0
+#endif
+
+#else /* 64-bit: */
+
+#include <asm/cache.h>
+
+#define GDT_ENTRY_KERNEL32_CS 1
+#define GDT_ENTRY_KERNEL_CS 2
+#define GDT_ENTRY_KERNEL_DS 3
+
+/*
+ * We cannot use the same code segment descriptor for user and kernel mode,
+ * not even in long flat mode, because of different DPL.
+ *
+ * GDT layout to get 64-bit SYSCALL/SYSRET support right. SYSRET hardcodes
+ * selectors:
+ *
+ * if returning to 32-bit userspace: cs = STAR.SYSRET_CS,
+ * if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16,
+ *
+ * ss = STAR.SYSRET_CS+8 (in either case)
+ *
+ * thus USER_DS should be between 32-bit and 64-bit code selectors:
+ */
+#define GDT_ENTRY_DEFAULT_USER32_CS 4
+#define GDT_ENTRY_DEFAULT_USER_DS 5
+#define GDT_ENTRY_DEFAULT_USER_CS 6
+
+/* Needs two entries */
+#define GDT_ENTRY_TSS 8
+/* Needs two entries */
+#define GDT_ENTRY_LDT 10
+
+#define GDT_ENTRY_TLS_MIN 12
+#define GDT_ENTRY_TLS_MAX 14
+
+/* Abused to load per CPU data from limit */
+#define GDT_ENTRY_PER_CPU 15
+
+/*
+ * Number of entries in the GDT table:
+ */
+#define GDT_ENTRIES 16
+
+/*
+ * Segment selector values corresponding to the above entries:
+ *
+ * Note, selectors also need to have a correct RPL,
+ * expressed with the +3 value for user-space selectors:
+ */
+#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS*8)
+#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
+#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
+#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3)
+#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
+#define __USER32_DS __USER_DS
+#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
+#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8 + 3)
+
+#endif
+
+#ifndef CONFIG_PARAVIRT
+# define get_kernel_rpl() 0
+#endif
+
+#define IDT_ENTRIES 256
+#define NUM_EXCEPTION_VECTORS 32
+
+/* Bitmask of exception vectors which push an error code on the stack: */
+#define EXCEPTION_ERRCODE_MASK 0x00027d00
+
+#define GDT_SIZE (GDT_ENTRIES*8)
+#define GDT_ENTRY_TLS_ENTRIES 3
+#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8)
+
+#ifdef __KERNEL__
+
+/*
+ * early_idt_handler_array is an array of entry points referenced in the
+ * early IDT. For simplicity, it's a real array with one entry point
+ * every nine bytes. That leaves room for an optional 'push $0' if the
+ * vector has no error code (two bytes), a 'push $vector_number' (two
+ * bytes), and a jump to the common entry code (up to five bytes).
+ */
+#define EARLY_IDT_HANDLER_SIZE 9
+
+/*
+ * xen_early_idt_handler_array is for Xen pv guests: for each entry in
+ * early_idt_handler_array it contains a prequel in the form of
+ * pop %rcx; pop %r11; jmp early_idt_handler_array[i]; summing up to
+ * max 8 bytes.
+ */
+#define XEN_EARLY_IDT_HANDLER_SIZE 8
+
+#ifndef __ASSEMBLY__
+
+extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE];
+extern void early_ignore_irq(void);
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
+extern const char xen_early_idt_handler_array[NUM_EXCEPTION_VECTORS][XEN_EARLY_IDT_HANDLER_SIZE];
+#endif
+
+/*
+ * Load a segment. Fall back on loading the zero segment if something goes
+ * wrong. This variant assumes that loading zero fully clears the segment.
+ * This is always the case on Intel CPUs and, even on 64-bit AMD CPUs, any
+ * failure to fully clear the cached descriptor is only observable for
+ * FS and GS.
+ */
+#define __loadsegment_simple(seg, value) \
+do { \
+ unsigned short __val = (value); \
+ \
+ asm volatile(" \n" \
+ "1: movl %k0,%%" #seg " \n" \
+ \
+ ".section .fixup,\"ax\" \n" \
+ "2: xorl %k0,%k0 \n" \
+ " jmp 1b \n" \
+ ".previous \n" \
+ \
+ _ASM_EXTABLE(1b, 2b) \
+ \
+ : "+r" (__val) : : "memory"); \
+} while (0)
+
+#define __loadsegment_ss(value) __loadsegment_simple(ss, (value))
+#define __loadsegment_ds(value) __loadsegment_simple(ds, (value))
+#define __loadsegment_es(value) __loadsegment_simple(es, (value))
+
+#ifdef CONFIG_X86_32
+
+/*
+ * On 32-bit systems, the hidden parts of FS and GS are unobservable if
+ * the selector is NULL, so there's no funny business here.
+ */
+#define __loadsegment_fs(value) __loadsegment_simple(fs, (value))
+#define __loadsegment_gs(value) __loadsegment_simple(gs, (value))
+
+#else
+
+static inline void __loadsegment_fs(unsigned short value)
+{
+ asm volatile(" \n"
+ "1: movw %0, %%fs \n"
+ "2: \n"
+
+ _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_clear_fs)
+
+ : : "rm" (value) : "memory");
+}
+
+/* __loadsegment_gs is intentionally undefined. Use load_gs_index instead. */
+
+#endif
+
+#define loadsegment(seg, value) __loadsegment_ ## seg (value)
+
+/*
+ * Save a segment register away:
+ */
+#define savesegment(seg, value) \
+ asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
+
+/*
+ * x86-32 user GS accessors:
+ */
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_X86_32_LAZY_GS
+# define get_user_gs(regs) (u16)({ unsigned long v; savesegment(gs, v); v; })
+# define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v))
+# define task_user_gs(tsk) ((tsk)->thread.gs)
+# define lazy_save_gs(v) savesegment(gs, (v))
+# define lazy_load_gs(v) loadsegment(gs, (v))
+# else /* X86_32_LAZY_GS */
+# define get_user_gs(regs) (u16)((regs)->gs)
+# define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0)
+# define task_user_gs(tsk) (task_pt_regs(tsk)->gs)
+# define lazy_save_gs(v) do { } while (0)
+# define lazy_load_gs(v) do { } while (0)
+# endif /* X86_32_LAZY_GS */
+#endif /* X86_32 */
+
+#endif /* !__ASSEMBLY__ */
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_SEGMENT_H */
diff --git a/arch/x86/include/asm/serial.h b/arch/x86/include/asm/serial.h
new file mode 100644
index 000000000..ece8299d2
--- /dev/null
+++ b/arch/x86/include/asm/serial.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SERIAL_H
+#define _ASM_X86_SERIAL_H
+
+/*
+ * This assumes you have a 1.8432 MHz clock for your UART.
+ *
+ * It'd be nice if someone built a serial card with a 24.576 MHz
+ * clock, since the 16550A is capable of handling a top speed of 1.5
+ * megabits/second; but this requires a faster clock.
+ */
+#define BASE_BAUD (1843200/16)
+
+/* Standard COM flags (except for COM4, because of the 8514 problem) */
+#ifdef CONFIG_SERIAL_8250_DETECT_IRQ
+# define STD_COMX_FLAGS (UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | UPF_AUTO_IRQ)
+# define STD_COM4_FLAGS (UPF_BOOT_AUTOCONF | 0 | UPF_AUTO_IRQ)
+#else
+# define STD_COMX_FLAGS (UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | 0 )
+# define STD_COM4_FLAGS (UPF_BOOT_AUTOCONF | 0 | 0 )
+#endif
+
+#define SERIAL_PORT_DFNS \
+ /* UART CLK PORT IRQ FLAGS */ \
+ { .uart = 0, BASE_BAUD, 0x3F8, 4, STD_COMX_FLAGS }, /* ttyS0 */ \
+ { .uart = 0, BASE_BAUD, 0x2F8, 3, STD_COMX_FLAGS }, /* ttyS1 */ \
+ { .uart = 0, BASE_BAUD, 0x3E8, 4, STD_COMX_FLAGS }, /* ttyS2 */ \
+ { .uart = 0, BASE_BAUD, 0x2E8, 3, STD_COM4_FLAGS }, /* ttyS3 */
+
+#endif /* _ASM_X86_SERIAL_H */
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
new file mode 100644
index 000000000..bc97b8c0f
--- /dev/null
+++ b/arch/x86/include/asm/set_memory.h
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SET_MEMORY_H
+#define _ASM_X86_SET_MEMORY_H
+
+#include <asm/page.h>
+#include <asm-generic/set_memory.h>
+
+/*
+ * The set_memory_* API can be used to change various attributes of a virtual
+ * address range. The attributes include:
+ * Cachability : UnCached, WriteCombining, WriteThrough, WriteBack
+ * Executability : eXeutable, NoteXecutable
+ * Read/Write : ReadOnly, ReadWrite
+ * Presence : NotPresent
+ * Encryption : Encrypted, Decrypted
+ *
+ * Within a category, the attributes are mutually exclusive.
+ *
+ * The implementation of this API will take care of various aspects that
+ * are associated with changing such attributes, such as:
+ * - Flushing TLBs
+ * - Flushing CPU caches
+ * - Making sure aliases of the memory behind the mapping don't violate
+ * coherency rules as defined by the CPU in the system.
+ *
+ * What this API does not do:
+ * - Provide exclusion between various callers - including callers that
+ * operation on other mappings of the same physical page
+ * - Restore default attributes when a page is freed
+ * - Guarantee that mappings other than the requested one are
+ * in any state, other than that these do not violate rules for
+ * the CPU you have. Do not depend on any effects on other mappings,
+ * CPUs other than the one you have may have more relaxed rules.
+ * The caller is required to take care of these.
+ */
+
+int _set_memory_uc(unsigned long addr, int numpages);
+int _set_memory_wc(unsigned long addr, int numpages);
+int _set_memory_wt(unsigned long addr, int numpages);
+int _set_memory_wb(unsigned long addr, int numpages);
+int set_memory_uc(unsigned long addr, int numpages);
+int set_memory_wc(unsigned long addr, int numpages);
+int set_memory_wt(unsigned long addr, int numpages);
+int set_memory_wb(unsigned long addr, int numpages);
+int set_memory_np(unsigned long addr, int numpages);
+int set_memory_4k(unsigned long addr, int numpages);
+int set_memory_encrypted(unsigned long addr, int numpages);
+int set_memory_decrypted(unsigned long addr, int numpages);
+int set_memory_np_noalias(unsigned long addr, int numpages);
+
+int set_memory_array_uc(unsigned long *addr, int addrinarray);
+int set_memory_array_wc(unsigned long *addr, int addrinarray);
+int set_memory_array_wt(unsigned long *addr, int addrinarray);
+int set_memory_array_wb(unsigned long *addr, int addrinarray);
+
+int set_pages_array_uc(struct page **pages, int addrinarray);
+int set_pages_array_wc(struct page **pages, int addrinarray);
+int set_pages_array_wt(struct page **pages, int addrinarray);
+int set_pages_array_wb(struct page **pages, int addrinarray);
+
+/*
+ * For legacy compatibility with the old APIs, a few functions
+ * are provided that work on a "struct page".
+ * These functions operate ONLY on the 1:1 kernel mapping of the
+ * memory that the struct page represents, and internally just
+ * call the set_memory_* function. See the description of the
+ * set_memory_* function for more details on conventions.
+ *
+ * These APIs should be considered *deprecated* and are likely going to
+ * be removed in the future.
+ * The reason for this is the implicit operation on the 1:1 mapping only,
+ * making this not a generally useful API.
+ *
+ * Specifically, many users of the old APIs had a virtual address,
+ * called virt_to_page() or vmalloc_to_page() on that address to
+ * get a struct page* that the old API required.
+ * To convert these cases, use set_memory_*() on the original
+ * virtual address, do not use these functions.
+ */
+
+int set_pages_uc(struct page *page, int numpages);
+int set_pages_wb(struct page *page, int numpages);
+int set_pages_x(struct page *page, int numpages);
+int set_pages_nx(struct page *page, int numpages);
+int set_pages_ro(struct page *page, int numpages);
+int set_pages_rw(struct page *page, int numpages);
+
+extern int kernel_set_to_readonly;
+void set_kernel_text_rw(void);
+void set_kernel_text_ro(void);
+
+#ifdef CONFIG_X86_64
+/*
+ * Prevent speculative access to the page by either unmapping
+ * it (if we do not require access to any part of the page) or
+ * marking it uncacheable (if we want to try to retrieve data
+ * from non-poisoned lines in the page).
+ */
+static inline int set_mce_nospec(unsigned long pfn, bool unmap)
+{
+ unsigned long decoy_addr;
+ int rc;
+
+ /*
+ * We would like to just call:
+ * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
+ * but doing that would radically increase the odds of a
+ * speculative access to the poison page because we'd have
+ * the virtual address of the kernel 1:1 mapping sitting
+ * around in registers.
+ * Instead we get tricky. We create a non-canonical address
+ * that looks just like the one we want, but has bit 63 flipped.
+ * This relies on set_memory_XX() properly sanitizing any __pa()
+ * results with __PHYSICAL_MASK or PTE_PFN_MASK.
+ */
+ decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
+
+ if (unmap)
+ rc = set_memory_np(decoy_addr, 1);
+ else
+ rc = set_memory_uc(decoy_addr, 1);
+ if (rc)
+ pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
+ return rc;
+}
+#define set_mce_nospec set_mce_nospec
+
+/* Restore full speculative operation to the pfn. */
+static inline int clear_mce_nospec(unsigned long pfn)
+{
+ return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1);
+}
+#define clear_mce_nospec clear_mce_nospec
+#else
+/*
+ * Few people would run a 32-bit kernel on a machine that supports
+ * recoverable errors because they have too much memory to boot 32-bit.
+ */
+#endif
+
+#endif /* _ASM_X86_SET_MEMORY_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
new file mode 100644
index 000000000..ae13bc974
--- /dev/null
+++ b/arch/x86/include/asm/setup.h
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SETUP_H
+#define _ASM_X86_SETUP_H
+
+#include <uapi/asm/setup.h>
+
+#define COMMAND_LINE_SIZE 2048
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+
+#ifdef __i386__
+
+#include <linux/pfn.h>
+/*
+ * Reserved space for vmalloc and iomap - defined in asm/page.h
+ */
+#define MAXMEM_PFN PFN_DOWN(MAXMEM)
+#define MAX_NONPAE_PFN (1 << 20)
+
+#endif /* __i386__ */
+
+#define PARAM_SIZE 4096 /* sizeof(struct boot_params) */
+
+#define OLD_CL_MAGIC 0xA33F
+#define OLD_CL_ADDRESS 0x020 /* Relative to real mode data */
+#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
+
+#ifndef __ASSEMBLY__
+#include <asm/bootparam.h>
+#include <asm/x86_init.h>
+
+extern u64 relocated_ramdisk;
+
+/* Interrupt control for vSMPowered x86_64 systems */
+#ifdef CONFIG_X86_64
+void vsmp_init(void);
+#else
+static inline void vsmp_init(void) { }
+#endif
+
+void setup_bios_corruption_check(void);
+void early_platform_quirks(void);
+
+extern unsigned long saved_video_mode;
+
+extern void reserve_standard_io_resources(void);
+extern void i386_reserve_resources(void);
+
+#ifdef CONFIG_X86_INTEL_MID
+extern void x86_intel_mid_early_setup(void);
+#else
+static inline void x86_intel_mid_early_setup(void) { }
+#endif
+
+#ifdef CONFIG_X86_INTEL_CE
+extern void x86_ce4100_early_setup(void);
+#else
+static inline void x86_ce4100_early_setup(void) { }
+#endif
+
+#ifndef _SETUP
+
+#include <asm/espfix.h>
+#include <linux/kernel.h>
+
+/*
+ * This is set up by the setup-routine at boot-time
+ */
+extern struct boot_params boot_params;
+extern char _text[];
+
+static inline bool kaslr_enabled(void)
+{
+ return !!(boot_params.hdr.loadflags & KASLR_FLAG);
+}
+
+static inline unsigned long kaslr_offset(void)
+{
+ return (unsigned long)&_text - __START_KERNEL;
+}
+
+/*
+ * Do NOT EVER look at the BIOS memory size location.
+ * It does not work on many machines.
+ */
+#define LOWMEMSIZE() (0x9f000)
+
+/* exceedingly early brk-like allocator */
+extern unsigned long _brk_end;
+void *extend_brk(size_t size, size_t align);
+
+/*
+ * Reserve space in the brk section. The name must be unique within
+ * the file, and somewhat descriptive. The size is in bytes. Must be
+ * used at file scope.
+ *
+ * (This uses a temp function to wrap the asm so we can pass it the
+ * size parameter; otherwise we wouldn't be able to. We can't use a
+ * "section" attribute on a normal variable because it always ends up
+ * being @progbits, which ends up allocating space in the vmlinux
+ * executable.)
+ */
+#define RESERVE_BRK(name,sz) \
+ static void __section(.discard.text) __used notrace \
+ __brk_reservation_fn_##name##__(void) { \
+ asm volatile ( \
+ ".pushsection .brk_reservation,\"aw\",@nobits;" \
+ ".brk." #name ":" \
+ " 1:.skip %c0;" \
+ " .size .brk." #name ", . - 1b;" \
+ " .popsection" \
+ : : "i" (sz)); \
+ }
+
+/* Helper for reserving space for arrays of things */
+#define RESERVE_BRK_ARRAY(type, name, entries) \
+ type *name; \
+ RESERVE_BRK(name, sizeof(type) * entries)
+
+extern void probe_roms(void);
+#ifdef __i386__
+
+asmlinkage void __init i386_start_kernel(void);
+
+#else
+asmlinkage void __init x86_64_start_kernel(char *real_mode);
+asmlinkage void __init x86_64_start_reservations(char *real_mode_data);
+
+#endif /* __i386__ */
+#endif /* _SETUP */
+#else
+#define RESERVE_BRK(name,sz) \
+ .pushsection .brk_reservation,"aw",@nobits; \
+.brk.name: \
+1: .skip sz; \
+ .size .brk.name,.-1b; \
+ .popsection
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_SETUP_H */
diff --git a/arch/x86/include/asm/setup_arch.h b/arch/x86/include/asm/setup_arch.h
new file mode 100644
index 000000000..38846208b
--- /dev/null
+++ b/arch/x86/include/asm/setup_arch.h
@@ -0,0 +1,3 @@
+/* Hook to call BIOS initialisation function */
+
+/* no action for generic */
diff --git a/arch/x86/include/asm/shmparam.h b/arch/x86/include/asm/shmparam.h
new file mode 100644
index 000000000..c4041819c
--- /dev/null
+++ b/arch/x86/include/asm/shmparam.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SHMPARAM_H
+#define _ASM_X86_SHMPARAM_H
+
+#define SHMLBA PAGE_SIZE /* attach addr a multiple of this */
+
+#endif /* _ASM_X86_SHMPARAM_H */
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
new file mode 100644
index 000000000..140d890c2
--- /dev/null
+++ b/arch/x86/include/asm/sigcontext.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SIGCONTEXT_H
+#define _ASM_X86_SIGCONTEXT_H
+
+/* This is a legacy header - all kernel code includes <uapi/asm/sigcontext.h> directly. */
+
+#include <uapi/asm/sigcontext.h>
+
+#endif /* _ASM_X86_SIGCONTEXT_H */
diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h
new file mode 100644
index 000000000..f176114c0
--- /dev/null
+++ b/arch/x86/include/asm/sigframe.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SIGFRAME_H
+#define _ASM_X86_SIGFRAME_H
+
+#include <uapi/asm/sigcontext.h>
+#include <asm/siginfo.h>
+#include <asm/ucontext.h>
+#include <linux/compat.h>
+
+#ifdef CONFIG_X86_32
+#define sigframe_ia32 sigframe
+#define rt_sigframe_ia32 rt_sigframe
+#define ucontext_ia32 ucontext
+#else /* !CONFIG_X86_32 */
+
+#ifdef CONFIG_IA32_EMULATION
+#include <asm/ia32.h>
+#endif /* CONFIG_IA32_EMULATION */
+
+#endif /* CONFIG_X86_32 */
+
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+struct sigframe_ia32 {
+ u32 pretcode;
+ int sig;
+ struct sigcontext_32 sc;
+ /*
+ * fpstate is unused. fpstate is moved/allocated after
+ * retcode[] below. This movement allows to have the FP state and the
+ * future state extensions (xsave) stay together.
+ * And at the same time retaining the unused fpstate, prevents changing
+ * the offset of extramask[] in the sigframe and thus prevent any
+ * legacy application accessing/modifying it.
+ */
+ struct _fpstate_32 fpstate_unused;
+#ifdef CONFIG_IA32_EMULATION
+ unsigned int extramask[_COMPAT_NSIG_WORDS-1];
+#else /* !CONFIG_IA32_EMULATION */
+ unsigned long extramask[_NSIG_WORDS-1];
+#endif /* CONFIG_IA32_EMULATION */
+ char retcode[8];
+ /* fp state follows here */
+};
+
+struct rt_sigframe_ia32 {
+ u32 pretcode;
+ int sig;
+ u32 pinfo;
+ u32 puc;
+#ifdef CONFIG_IA32_EMULATION
+ compat_siginfo_t info;
+#else /* !CONFIG_IA32_EMULATION */
+ struct siginfo info;
+#endif /* CONFIG_IA32_EMULATION */
+ struct ucontext_ia32 uc;
+ char retcode[8];
+ /* fp state follows here */
+};
+#endif /* defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) */
+
+#ifdef CONFIG_X86_64
+
+struct rt_sigframe {
+ char __user *pretcode;
+ struct ucontext uc;
+ struct siginfo info;
+ /* fp state follows here */
+};
+
+#ifdef CONFIG_X86_X32_ABI
+
+struct ucontext_x32 {
+ unsigned int uc_flags;
+ unsigned int uc_link;
+ compat_stack_t uc_stack;
+ unsigned int uc__pad0; /* needed for alignment */
+ struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */
+ compat_sigset_t uc_sigmask; /* mask last for extensibility */
+};
+
+struct rt_sigframe_x32 {
+ u64 pretcode;
+ struct ucontext_x32 uc;
+ compat_siginfo_t info;
+ /* fp state follows here */
+};
+
+#endif /* CONFIG_X86_X32_ABI */
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* _ASM_X86_SIGFRAME_H */
diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h
new file mode 100644
index 000000000..bd2683472
--- /dev/null
+++ b/arch/x86/include/asm/sighandling.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SIGHANDLING_H
+#define _ASM_X86_SIGHANDLING_H
+
+#include <linux/compiler.h>
+#include <linux/ptrace.h>
+#include <linux/signal.h>
+
+#include <asm/processor-flags.h>
+
+#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
+ X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
+ X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
+ X86_EFLAGS_CF | X86_EFLAGS_RF)
+
+void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
+int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
+ struct pt_regs *regs, unsigned long mask);
+
+#endif /* _ASM_X86_SIGHANDLING_H */
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
new file mode 100644
index 000000000..33d3c88a7
--- /dev/null
+++ b/arch/x86/include/asm/signal.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SIGNAL_H
+#define _ASM_X86_SIGNAL_H
+
+#ifndef __ASSEMBLY__
+#include <linux/linkage.h>
+
+/* Most things should be clean enough to redefine this at will, if care
+ is taken to make libc match. */
+
+#define _NSIG 64
+
+#ifdef __i386__
+# define _NSIG_BPW 32
+#else
+# define _NSIG_BPW 64
+#endif
+
+#define _NSIG_WORDS (_NSIG / _NSIG_BPW)
+
+typedef unsigned long old_sigset_t; /* at least 32 bits */
+
+typedef struct {
+ unsigned long sig[_NSIG_WORDS];
+} sigset_t;
+
+/* non-uapi in-kernel SA_FLAGS for those indicates ABI for a signal frame */
+#define SA_IA32_ABI 0x02000000u
+#define SA_X32_ABI 0x01000000u
+
+#ifndef CONFIG_COMPAT
+typedef sigset_t compat_sigset_t;
+#endif
+
+#endif /* __ASSEMBLY__ */
+#include <uapi/asm/signal.h>
+#ifndef __ASSEMBLY__
+extern void do_signal(struct pt_regs *regs);
+
+#define __ARCH_HAS_SA_RESTORER
+
+#include <asm/asm.h>
+#include <uapi/asm/sigcontext.h>
+
+#ifdef __i386__
+
+#define __HAVE_ARCH_SIG_BITOPS
+
+#define sigaddset(set,sig) \
+ (__builtin_constant_p(sig) \
+ ? __const_sigaddset((set), (sig)) \
+ : __gen_sigaddset((set), (sig)))
+
+static inline void __gen_sigaddset(sigset_t *set, int _sig)
+{
+ asm("btsl %1,%0" : "+m"(*set) : "Ir"(_sig - 1) : "cc");
+}
+
+static inline void __const_sigaddset(sigset_t *set, int _sig)
+{
+ unsigned long sig = _sig - 1;
+ set->sig[sig / _NSIG_BPW] |= 1 << (sig % _NSIG_BPW);
+}
+
+#define sigdelset(set, sig) \
+ (__builtin_constant_p(sig) \
+ ? __const_sigdelset((set), (sig)) \
+ : __gen_sigdelset((set), (sig)))
+
+
+static inline void __gen_sigdelset(sigset_t *set, int _sig)
+{
+ asm("btrl %1,%0" : "+m"(*set) : "Ir"(_sig - 1) : "cc");
+}
+
+static inline void __const_sigdelset(sigset_t *set, int _sig)
+{
+ unsigned long sig = _sig - 1;
+ set->sig[sig / _NSIG_BPW] &= ~(1 << (sig % _NSIG_BPW));
+}
+
+static inline int __const_sigismember(sigset_t *set, int _sig)
+{
+ unsigned long sig = _sig - 1;
+ return 1 & (set->sig[sig / _NSIG_BPW] >> (sig % _NSIG_BPW));
+}
+
+static inline int __gen_sigismember(sigset_t *set, int _sig)
+{
+ bool ret;
+ asm("btl %2,%1" CC_SET(c)
+ : CC_OUT(c) (ret) : "m"(*set), "Ir"(_sig-1));
+ return ret;
+}
+
+#define sigismember(set, sig) \
+ (__builtin_constant_p(sig) \
+ ? __const_sigismember((set), (sig)) \
+ : __gen_sigismember((set), (sig)))
+
+struct pt_regs;
+
+#else /* __i386__ */
+
+#undef __HAVE_ARCH_SIG_BITOPS
+
+#endif /* !__i386__ */
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_SIGNAL_H */
diff --git a/arch/x86/include/asm/simd.h b/arch/x86/include/asm/simd.h
new file mode 100644
index 000000000..a341c878e
--- /dev/null
+++ b/arch/x86/include/asm/simd.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <asm/fpu/api.h>
+
+/*
+ * may_use_simd - whether it is allowable at this time to issue SIMD
+ * instructions or access the SIMD register file
+ */
+static __must_check inline bool may_use_simd(void)
+{
+ return irq_fpu_usable();
+}
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
new file mode 100644
index 000000000..db333300b
--- /dev/null
+++ b/arch/x86/include/asm/smap.h
@@ -0,0 +1,79 @@
+/*
+ * Supervisor Mode Access Prevention support
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: H. Peter Anvin <hpa@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#ifndef _ASM_X86_SMAP_H
+#define _ASM_X86_SMAP_H
+
+#include <linux/stringify.h>
+#include <asm/nops.h>
+#include <asm/cpufeatures.h>
+
+/* "Raw" instruction opcodes */
+#define __ASM_CLAC .byte 0x0f,0x01,0xca
+#define __ASM_STAC .byte 0x0f,0x01,0xcb
+
+#ifdef __ASSEMBLY__
+
+#include <asm/alternative-asm.h>
+
+#ifdef CONFIG_X86_SMAP
+
+#define ASM_CLAC \
+ ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP
+
+#define ASM_STAC \
+ ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP
+
+#else /* CONFIG_X86_SMAP */
+
+#define ASM_CLAC
+#define ASM_STAC
+
+#endif /* CONFIG_X86_SMAP */
+
+#else /* __ASSEMBLY__ */
+
+#include <asm/alternative.h>
+
+#ifdef CONFIG_X86_SMAP
+
+static __always_inline void clac(void)
+{
+ /* Note: a barrier is implicit in alternative() */
+ alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
+}
+
+static __always_inline void stac(void)
+{
+ /* Note: a barrier is implicit in alternative() */
+ alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP);
+}
+
+/* These macros can be used in asm() statements */
+#define ASM_CLAC \
+ ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
+#define ASM_STAC \
+ ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP)
+
+#else /* CONFIG_X86_SMAP */
+
+static inline void clac(void) { }
+static inline void stac(void) { }
+
+#define ASM_CLAC
+#define ASM_STAC
+
+#endif /* CONFIG_X86_SMAP */
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_SMAP_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
new file mode 100644
index 000000000..547c4fe50
--- /dev/null
+++ b/arch/x86/include/asm/smp.h
@@ -0,0 +1,192 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SMP_H
+#define _ASM_X86_SMP_H
+#ifndef __ASSEMBLY__
+#include <linux/cpumask.h>
+#include <asm/percpu.h>
+
+/*
+ * We need the APIC definitions automatically as part of 'smp.h'
+ */
+#ifdef CONFIG_X86_LOCAL_APIC
+# include <asm/mpspec.h>
+# include <asm/apic.h>
+# ifdef CONFIG_X86_IO_APIC
+# include <asm/io_apic.h>
+# endif
+#endif
+#include <asm/thread_info.h>
+#include <asm/cpumask.h>
+
+extern int smp_num_siblings;
+extern unsigned int num_processors;
+
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
+/* cpus sharing the last level cache: */
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
+DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
+
+static inline struct cpumask *cpu_llc_shared_mask(int cpu)
+{
+ return per_cpu(cpu_llc_shared_map, cpu);
+}
+
+DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
+DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid);
+DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
+DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid);
+#endif
+
+struct task_struct;
+
+struct smp_ops {
+ void (*smp_prepare_boot_cpu)(void);
+ void (*smp_prepare_cpus)(unsigned max_cpus);
+ void (*smp_cpus_done)(unsigned max_cpus);
+
+ void (*stop_other_cpus)(int wait);
+ void (*crash_stop_other_cpus)(void);
+ void (*smp_send_reschedule)(int cpu);
+
+ int (*cpu_up)(unsigned cpu, struct task_struct *tidle);
+ int (*cpu_disable)(void);
+ void (*cpu_die)(unsigned int cpu);
+ void (*play_dead)(void);
+
+ void (*send_call_func_ipi)(const struct cpumask *mask);
+ void (*send_call_func_single_ipi)(int cpu);
+};
+
+/* Globals due to paravirt */
+extern void set_cpu_sibling_map(int cpu);
+
+#ifdef CONFIG_SMP
+extern struct smp_ops smp_ops;
+
+static inline void smp_send_stop(void)
+{
+ smp_ops.stop_other_cpus(0);
+}
+
+static inline void stop_other_cpus(void)
+{
+ smp_ops.stop_other_cpus(1);
+}
+
+static inline void smp_prepare_boot_cpu(void)
+{
+ smp_ops.smp_prepare_boot_cpu();
+}
+
+static inline void smp_prepare_cpus(unsigned int max_cpus)
+{
+ smp_ops.smp_prepare_cpus(max_cpus);
+}
+
+static inline void smp_cpus_done(unsigned int max_cpus)
+{
+ smp_ops.smp_cpus_done(max_cpus);
+}
+
+static inline int __cpu_up(unsigned int cpu, struct task_struct *tidle)
+{
+ return smp_ops.cpu_up(cpu, tidle);
+}
+
+static inline int __cpu_disable(void)
+{
+ return smp_ops.cpu_disable();
+}
+
+static inline void __cpu_die(unsigned int cpu)
+{
+ smp_ops.cpu_die(cpu);
+}
+
+static inline void play_dead(void)
+{
+ smp_ops.play_dead();
+}
+
+static inline void smp_send_reschedule(int cpu)
+{
+ smp_ops.smp_send_reschedule(cpu);
+}
+
+static inline void arch_send_call_function_single_ipi(int cpu)
+{
+ smp_ops.send_call_func_single_ipi(cpu);
+}
+
+static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
+{
+ smp_ops.send_call_func_ipi(mask);
+}
+
+void cpu_disable_common(void);
+void native_smp_prepare_boot_cpu(void);
+void native_smp_prepare_cpus(unsigned int max_cpus);
+void calculate_max_logical_packages(void);
+void native_smp_cpus_done(unsigned int max_cpus);
+void common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
+int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
+int native_cpu_disable(void);
+int common_cpu_die(unsigned int cpu);
+void native_cpu_die(unsigned int cpu);
+void hlt_play_dead(void);
+void native_play_dead(void);
+void play_dead_common(void);
+void wbinvd_on_cpu(int cpu);
+int wbinvd_on_all_cpus(void);
+
+void native_send_call_func_ipi(const struct cpumask *mask);
+void native_send_call_func_single_ipi(int cpu);
+void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle);
+
+void smp_store_boot_cpu_info(void);
+void smp_store_cpu_info(int id);
+#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
+#define cpu_acpi_id(cpu) per_cpu(x86_cpu_to_acpiid, cpu)
+
+/*
+ * This function is needed by all SMP systems. It must _always_ be valid
+ * from the initial startup. We map APIC_BASE very early in page_setup(),
+ * so this is correct in the x86 case.
+ */
+#define raw_smp_processor_id() (this_cpu_read(cpu_number))
+
+#ifdef CONFIG_X86_32
+extern int safe_smp_processor_id(void);
+#else
+# define safe_smp_processor_id() smp_processor_id()
+#endif
+
+#else /* !CONFIG_SMP */
+#define wbinvd_on_cpu(cpu) wbinvd()
+static inline int wbinvd_on_all_cpus(void)
+{
+ wbinvd();
+ return 0;
+}
+#endif /* CONFIG_SMP */
+
+extern unsigned disabled_cpus;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+extern int hard_smp_processor_id(void);
+
+#else /* CONFIG_X86_LOCAL_APIC */
+#define hard_smp_processor_id() 0
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_DEBUG_NMI_SELFTEST
+extern void nmi_selftest(void);
+#else
+#define nmi_selftest() do { } while (0)
+#endif
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
new file mode 100644
index 000000000..199218719
--- /dev/null
+++ b/arch/x86/include/asm/sparsemem.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SPARSEMEM_H
+#define _ASM_X86_SPARSEMEM_H
+
+#ifdef CONFIG_SPARSEMEM
+/*
+ * generic non-linear memory support:
+ *
+ * 1) we will not split memory into more chunks than will fit into the flags
+ * field of the struct page
+ *
+ * SECTION_SIZE_BITS 2^n: size of each section
+ * MAX_PHYSADDR_BITS 2^n: max size of physical address space
+ * MAX_PHYSMEM_BITS 2^n: how much memory we can have in that space
+ *
+ */
+
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_X86_PAE
+# define SECTION_SIZE_BITS 29
+# define MAX_PHYSADDR_BITS 36
+# define MAX_PHYSMEM_BITS 36
+# else
+# define SECTION_SIZE_BITS 26
+# define MAX_PHYSADDR_BITS 32
+# define MAX_PHYSMEM_BITS 32
+# endif
+#else /* CONFIG_X86_32 */
+# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
+# define MAX_PHYSADDR_BITS (pgtable_l5_enabled() ? 52 : 44)
+# define MAX_PHYSMEM_BITS (pgtable_l5_enabled() ? 52 : 46)
+#endif
+
+#endif /* CONFIG_SPARSEMEM */
+#endif /* _ASM_X86_SPARSEMEM_H */
diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
new file mode 100644
index 000000000..5393babc0
--- /dev/null
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SPECCTRL_H_
+#define _ASM_X86_SPECCTRL_H_
+
+#include <linux/thread_info.h>
+#include <asm/nospec-branch.h>
+
+/*
+ * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR
+ * the guest has, while on VMEXIT we restore the host view. This
+ * would be easier if SPEC_CTRL were architecturally maskable or
+ * shadowable for guests but this is not (currently) the case.
+ * Takes the guest view of SPEC_CTRL MSR as a parameter and also
+ * the guest's version of VIRT_SPEC_CTRL, if emulated.
+ */
+extern void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool guest);
+
+/**
+ * x86_spec_ctrl_set_guest - Set speculation control registers for the guest
+ * @guest_spec_ctrl: The guest content of MSR_SPEC_CTRL
+ * @guest_virt_spec_ctrl: The guest controlled bits of MSR_VIRT_SPEC_CTRL
+ * (may get translated to MSR_AMD64_LS_CFG bits)
+ *
+ * Avoids writing to the MSR if the content/bits are the same
+ */
+static inline
+void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl)
+{
+ x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, true);
+}
+
+/**
+ * x86_spec_ctrl_restore_host - Restore host speculation control registers
+ * @guest_spec_ctrl: The guest content of MSR_SPEC_CTRL
+ * @guest_virt_spec_ctrl: The guest controlled bits of MSR_VIRT_SPEC_CTRL
+ * (may get translated to MSR_AMD64_LS_CFG bits)
+ *
+ * Avoids writing to the MSR if the content/bits are the same
+ */
+static inline
+void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl)
+{
+ x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, false);
+}
+
+/* AMD specific Speculative Store Bypass MSR data */
+extern u64 x86_amd_ls_cfg_base;
+extern u64 x86_amd_ls_cfg_ssbd_mask;
+
+static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn)
+{
+ BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT);
+ return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
+}
+
+static inline u64 stibp_tif_to_spec_ctrl(u64 tifn)
+{
+ BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT);
+ return (tifn & _TIF_SPEC_IB) >> (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT);
+}
+
+static inline unsigned long ssbd_spec_ctrl_to_tif(u64 spec_ctrl)
+{
+ BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT);
+ return (spec_ctrl & SPEC_CTRL_SSBD) << (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
+}
+
+static inline unsigned long stibp_spec_ctrl_to_tif(u64 spec_ctrl)
+{
+ BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT);
+ return (spec_ctrl & SPEC_CTRL_STIBP) << (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT);
+}
+
+static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn)
+{
+ return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL;
+}
+
+#ifdef CONFIG_SMP
+extern void speculative_store_bypass_ht_init(void);
+#else
+static inline void speculative_store_bypass_ht_init(void) { }
+#endif
+
+extern void speculation_ctrl_update(unsigned long tif);
+extern void speculation_ctrl_update_current(void);
+
+#endif
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
new file mode 100644
index 000000000..317fc59b5
--- /dev/null
+++ b/arch/x86/include/asm/special_insns.h
@@ -0,0 +1,245 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SPECIAL_INSNS_H
+#define _ASM_X86_SPECIAL_INSNS_H
+
+
+#ifdef __KERNEL__
+
+#include <asm/nops.h>
+
+/*
+ * Volatile isn't enough to prevent the compiler from reordering the
+ * read/write functions for the control registers and messing everything up.
+ * A memory clobber would solve the problem, but would prevent reordering of
+ * all loads stores around it, which can hurt performance. Solution is to
+ * use a variable and mimic reads and writes to it to enforce serialization
+ */
+extern unsigned long __force_order;
+
+static inline unsigned long native_read_cr0(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order));
+ return val;
+}
+
+static inline void native_write_cr0(unsigned long val)
+{
+ asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr2(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order));
+ return val;
+}
+
+static inline void native_write_cr2(unsigned long val)
+{
+ asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
+}
+
+static inline unsigned long __native_read_cr3(void)
+{
+ unsigned long val;
+ asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
+ return val;
+}
+
+static inline void native_write_cr3(unsigned long val)
+{
+ asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order));
+}
+
+static inline unsigned long native_read_cr4(void)
+{
+ unsigned long val;
+#ifdef CONFIG_X86_32
+ /*
+ * This could fault if CR4 does not exist. Non-existent CR4
+ * is functionally equivalent to CR4 == 0. Keep it simple and pretend
+ * that CR4 == 0 on CPUs that don't have CR4.
+ */
+ asm volatile("1: mov %%cr4, %0\n"
+ "2:\n"
+ _ASM_EXTABLE(1b, 2b)
+ : "=r" (val), "=m" (__force_order) : "0" (0));
+#else
+ /* CR4 always exists on x86_64. */
+ asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
+#endif
+ return val;
+}
+
+static inline void native_write_cr4(unsigned long val)
+{
+ asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order));
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long native_read_cr8(void)
+{
+ unsigned long cr8;
+ asm volatile("movq %%cr8,%0" : "=r" (cr8));
+ return cr8;
+}
+
+static inline void native_write_cr8(unsigned long val)
+{
+ asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
+}
+#endif
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+static inline u32 __read_pkru(void)
+{
+ u32 ecx = 0;
+ u32 edx, pkru;
+
+ /*
+ * "rdpkru" instruction. Places PKRU contents in to EAX,
+ * clears EDX and requires that ecx=0.
+ */
+ asm volatile(".byte 0x0f,0x01,0xee\n\t"
+ : "=a" (pkru), "=d" (edx)
+ : "c" (ecx));
+ return pkru;
+}
+
+static inline void __write_pkru(u32 pkru)
+{
+ u32 ecx = 0, edx = 0;
+
+ /*
+ * "wrpkru" instruction. Loads contents in EAX to PKRU,
+ * requires that ecx = edx = 0.
+ */
+ asm volatile(".byte 0x0f,0x01,0xef\n\t"
+ : : "a" (pkru), "c"(ecx), "d"(edx));
+}
+#else
+static inline u32 __read_pkru(void)
+{
+ return 0;
+}
+
+static inline void __write_pkru(u32 pkru)
+{
+}
+#endif
+
+static inline void native_wbinvd(void)
+{
+ asm volatile("wbinvd": : :"memory");
+}
+
+extern asmlinkage void native_load_gs_index(unsigned);
+
+static inline unsigned long __read_cr4(void)
+{
+ return native_read_cr4();
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+
+static inline unsigned long read_cr0(void)
+{
+ return native_read_cr0();
+}
+
+static inline void write_cr0(unsigned long x)
+{
+ native_write_cr0(x);
+}
+
+static inline unsigned long read_cr2(void)
+{
+ return native_read_cr2();
+}
+
+static inline void write_cr2(unsigned long x)
+{
+ native_write_cr2(x);
+}
+
+/*
+ * Careful! CR3 contains more than just an address. You probably want
+ * read_cr3_pa() instead.
+ */
+static inline unsigned long __read_cr3(void)
+{
+ return __native_read_cr3();
+}
+
+static inline void write_cr3(unsigned long x)
+{
+ native_write_cr3(x);
+}
+
+static inline void __write_cr4(unsigned long x)
+{
+ native_write_cr4(x);
+}
+
+static inline void wbinvd(void)
+{
+ native_wbinvd();
+}
+
+#ifdef CONFIG_X86_64
+
+static inline unsigned long read_cr8(void)
+{
+ return native_read_cr8();
+}
+
+static inline void write_cr8(unsigned long x)
+{
+ native_write_cr8(x);
+}
+
+static inline void load_gs_index(unsigned selector)
+{
+ native_load_gs_index(selector);
+}
+
+#endif
+
+#endif/* CONFIG_PARAVIRT */
+
+static inline void clflush(volatile void *__p)
+{
+ asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
+}
+
+static inline void clflushopt(volatile void *__p)
+{
+ alternative_io(".byte " __stringify(NOP_DS_PREFIX) "; clflush %P0",
+ ".byte 0x66; clflush %P0",
+ X86_FEATURE_CLFLUSHOPT,
+ "+m" (*(volatile char __force *)__p));
+}
+
+static inline void clwb(volatile void *__p)
+{
+ volatile struct { char x[64]; } *p = __p;
+
+ asm volatile(ALTERNATIVE_2(
+ ".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])",
+ ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */
+ X86_FEATURE_CLFLUSHOPT,
+ ".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */
+ X86_FEATURE_CLWB)
+ : [p] "+m" (*p)
+ : [pax] "a" (p));
+}
+
+#define nop() asm volatile ("nop")
+
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_SPECIAL_INSNS_H */
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
new file mode 100644
index 000000000..5b6bc7016
--- /dev/null
+++ b/arch/x86/include/asm/spinlock.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SPINLOCK_H
+#define _ASM_X86_SPINLOCK_H
+
+#include <linux/jump_label.h>
+#include <linux/atomic.h>
+#include <asm/page.h>
+#include <asm/processor.h>
+#include <linux/compiler.h>
+#include <asm/paravirt.h>
+#include <asm/bitops.h>
+
+/*
+ * Your basic SMP spinlocks, allowing only a single CPU anywhere
+ *
+ * Simple spin lock operations. There are two variants, one clears IRQ's
+ * on the local processor, one does not.
+ *
+ * These are fair FIFO ticket locks, which support up to 2^16 CPUs.
+ *
+ * (the type definitions are in asm/spinlock_types.h)
+ */
+
+/* How long a lock should spin before we consider blocking */
+#define SPIN_THRESHOLD (1 << 15)
+
+#include <asm/qspinlock.h>
+
+/*
+ * Read-write spinlocks, allowing multiple readers
+ * but only one writer.
+ *
+ * NOTE! it is quite common to have readers in interrupts
+ * but no interrupt writers. For those circumstances we
+ * can "mix" irq-safe locks - any writer needs to get a
+ * irq-safe write-lock, but readers can get non-irqsafe
+ * read-locks.
+ *
+ * On x86, we implement read-write locks using the generic qrwlock with
+ * x86 specific optimization.
+ */
+
+#include <asm/qrwlock.h>
+
+#endif /* _ASM_X86_SPINLOCK_H */
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
new file mode 100644
index 000000000..bf3e34b25
--- /dev/null
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SPINLOCK_TYPES_H
+#define _ASM_X86_SPINLOCK_TYPES_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define __TICKET_LOCK_INC 2
+#define TICKET_SLOWPATH_FLAG ((__ticket_t)1)
+#else
+#define __TICKET_LOCK_INC 1
+#define TICKET_SLOWPATH_FLAG ((__ticket_t)0)
+#endif
+
+#if (CONFIG_NR_CPUS < (256 / __TICKET_LOCK_INC))
+typedef u8 __ticket_t;
+typedef u16 __ticketpair_t;
+#else
+typedef u16 __ticket_t;
+typedef u32 __ticketpair_t;
+#endif
+
+#define TICKET_LOCK_INC ((__ticket_t)__TICKET_LOCK_INC)
+
+#define TICKET_SHIFT (sizeof(__ticket_t) * 8)
+
+#include <asm-generic/qspinlock_types.h>
+
+#include <asm-generic/qrwlock_types.h>
+
+#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/include/asm/sta2x11.h b/arch/x86/include/asm/sta2x11.h
new file mode 100644
index 000000000..e0975e9c4
--- /dev/null
+++ b/arch/x86/include/asm/sta2x11.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Header file for STMicroelectronics ConneXt (STA2X11) IOHub
+ */
+#ifndef __ASM_STA2X11_H
+#define __ASM_STA2X11_H
+
+#include <linux/pci.h>
+
+/* This needs to be called from the MFD to configure its sub-devices */
+struct sta2x11_instance *sta2x11_get_instance(struct pci_dev *pdev);
+
+#endif /* __ASM_STA2X11_H */
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
new file mode 100644
index 000000000..9c556ea2e
--- /dev/null
+++ b/arch/x86/include/asm/stackprotector.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * GCC stack protector support.
+ *
+ * Stack protector works by putting predefined pattern at the start of
+ * the stack frame and verifying that it hasn't been overwritten when
+ * returning from the function. The pattern is called stack canary
+ * and unfortunately gcc requires it to be at a fixed offset from %gs.
+ * On x86_64, the offset is 40 bytes and on x86_32 20 bytes. x86_64
+ * and x86_32 use segment registers differently and thus handles this
+ * requirement differently.
+ *
+ * On x86_64, %gs is shared by percpu area and stack canary. All
+ * percpu symbols are zero based and %gs points to the base of percpu
+ * area. The first occupant of the percpu area is always
+ * irq_stack_union which contains stack_canary at offset 40. Userland
+ * %gs is always saved and restored on kernel entry and exit using
+ * swapgs, so stack protector doesn't add any complexity there.
+ *
+ * On x86_32, it's slightly more complicated. As in x86_64, %gs is
+ * used for userland TLS. Unfortunately, some processors are much
+ * slower at loading segment registers with different value when
+ * entering and leaving the kernel, so the kernel uses %fs for percpu
+ * area and manages %gs lazily so that %gs is switched only when
+ * necessary, usually during task switch.
+ *
+ * As gcc requires the stack canary at %gs:20, %gs can't be managed
+ * lazily if stack protector is enabled, so the kernel saves and
+ * restores userland %gs on kernel entry and exit. This behavior is
+ * controlled by CONFIG_X86_32_LAZY_GS and accessors are defined in
+ * system.h to hide the details.
+ */
+
+#ifndef _ASM_STACKPROTECTOR_H
+#define _ASM_STACKPROTECTOR_H 1
+
+#ifdef CONFIG_STACKPROTECTOR
+
+#include <asm/tsc.h>
+#include <asm/processor.h>
+#include <asm/percpu.h>
+#include <asm/desc.h>
+
+#include <linux/random.h>
+#include <linux/sched.h>
+
+/*
+ * 24 byte read-only segment initializer for stack canary. Linker
+ * can't handle the address bit shifting. Address will be set in
+ * head_32 for boot CPU and setup_per_cpu_areas() for others.
+ */
+#define GDT_STACK_CANARY_INIT \
+ [GDT_ENTRY_STACK_CANARY] = GDT_ENTRY_INIT(0x4090, 0, 0x18),
+
+/*
+ * Initialize the stackprotector canary value.
+ *
+ * NOTE: this must only be called from functions that never return
+ * and it must always be inlined.
+ *
+ * In addition, it should be called from a compilation unit for which
+ * stack protector is disabled. Alternatively, the caller should not end
+ * with a function call which gets tail-call optimized as that would
+ * lead to checking a modified canary value.
+ */
+static __always_inline void boot_init_stack_canary(void)
+{
+ u64 canary;
+ u64 tsc;
+
+#ifdef CONFIG_X86_64
+ BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40);
+#endif
+ /*
+ * We both use the random pool and the current TSC as a source
+ * of randomness. The TSC only matters for very early init,
+ * there it already has some randomness on most systems. Later
+ * on during the bootup the random pool has true entropy too.
+ */
+ get_random_bytes(&canary, sizeof(canary));
+ tsc = rdtsc();
+ canary += tsc + (tsc << 32UL);
+ canary &= CANARY_MASK;
+
+ current->stack_canary = canary;
+#ifdef CONFIG_X86_64
+ this_cpu_write(irq_stack_union.stack_canary, canary);
+#else
+ this_cpu_write(stack_canary.canary, canary);
+#endif
+}
+
+static inline void setup_stack_canary_segment(int cpu)
+{
+#ifdef CONFIG_X86_32
+ unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu);
+ struct desc_struct *gdt_table = get_cpu_gdt_rw(cpu);
+ struct desc_struct desc;
+
+ desc = gdt_table[GDT_ENTRY_STACK_CANARY];
+ set_desc_base(&desc, canary);
+ write_gdt_entry(gdt_table, GDT_ENTRY_STACK_CANARY, &desc, DESCTYPE_S);
+#endif
+}
+
+static inline void load_stack_canary_segment(void)
+{
+#ifdef CONFIG_X86_32
+ asm("mov %0, %%gs" : : "r" (__KERNEL_STACK_CANARY) : "memory");
+#endif
+}
+
+#else /* STACKPROTECTOR */
+
+#define GDT_STACK_CANARY_INIT
+
+/* dummy boot_init_stack_canary() is defined in linux/stackprotector.h */
+
+static inline void setup_stack_canary_segment(int cpu)
+{ }
+
+static inline void load_stack_canary_segment(void)
+{
+#ifdef CONFIG_X86_32
+ asm volatile ("mov %0, %%gs" : : "r" (0));
+#endif
+}
+
+#endif /* STACKPROTECTOR */
+#endif /* _ASM_STACKPROTECTOR_H */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
new file mode 100644
index 000000000..f335aad40
--- /dev/null
+++ b/arch/x86/include/asm/stacktrace.h
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+
+#ifndef _ASM_X86_STACKTRACE_H
+#define _ASM_X86_STACKTRACE_H
+
+#include <linux/uaccess.h>
+#include <linux/ptrace.h>
+#include <asm/switch_to.h>
+
+enum stack_type {
+ STACK_TYPE_UNKNOWN,
+ STACK_TYPE_TASK,
+ STACK_TYPE_IRQ,
+ STACK_TYPE_SOFTIRQ,
+ STACK_TYPE_ENTRY,
+ STACK_TYPE_EXCEPTION,
+ STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
+};
+
+struct stack_info {
+ enum stack_type type;
+ unsigned long *begin, *end, *next_sp;
+};
+
+bool in_task_stack(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info);
+
+bool in_entry_stack(unsigned long *stack, struct stack_info *info);
+
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info, unsigned long *visit_mask);
+
+const char *stack_type_name(enum stack_type type);
+
+static inline bool on_stack(struct stack_info *info, void *addr, size_t len)
+{
+ void *begin = info->begin;
+ void *end = info->end;
+
+ return (info->type != STACK_TYPE_UNKNOWN &&
+ addr >= begin && addr < end &&
+ addr + len > begin && addr + len <= end);
+}
+
+#ifdef CONFIG_X86_32
+#define STACKSLOTS_PER_LINE 8
+#else
+#define STACKSLOTS_PER_LINE 4
+#endif
+
+#ifdef CONFIG_FRAME_POINTER
+static inline unsigned long *
+get_frame_pointer(struct task_struct *task, struct pt_regs *regs)
+{
+ if (regs)
+ return (unsigned long *)regs->bp;
+
+ if (task == current)
+ return __builtin_frame_address(0);
+
+ return &((struct inactive_task_frame *)task->thread.sp)->bp;
+}
+#else
+static inline unsigned long *
+get_frame_pointer(struct task_struct *task, struct pt_regs *regs)
+{
+ return NULL;
+}
+#endif /* CONFIG_FRAME_POINTER */
+
+static inline unsigned long *
+get_stack_pointer(struct task_struct *task, struct pt_regs *regs)
+{
+ if (regs)
+ return (unsigned long *)kernel_stack_pointer(regs);
+
+ if (task == current)
+ return __builtin_frame_address(0);
+
+ return (unsigned long *)task->thread.sp;
+}
+
+void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, char *log_lvl);
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+ struct stack_frame *next_frame;
+ unsigned long return_address;
+};
+
+struct stack_frame_ia32 {
+ u32 next_frame;
+ u32 return_address;
+};
+
+static inline unsigned long caller_frame_pointer(void)
+{
+ struct stack_frame *frame;
+
+ frame = __builtin_frame_address(0);
+
+#ifdef CONFIG_FRAME_POINTER
+ frame = frame->next_frame;
+#endif
+
+ return (unsigned long)frame;
+}
+
+void show_opcodes(struct pt_regs *regs, const char *loglvl);
+void show_ip(struct pt_regs *regs, const char *loglvl);
+#endif /* _ASM_X86_STACKTRACE_H */
diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h
new file mode 100644
index 000000000..c3c2c1914
--- /dev/null
+++ b/arch/x86/include/asm/string.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifdef CONFIG_X86_32
+# include <asm/string_32.h>
+#else
+# include <asm/string_64.h>
+#endif
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
new file mode 100644
index 000000000..2fd165f1c
--- /dev/null
+++ b/arch/x86/include/asm/string_32.h
@@ -0,0 +1,347 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_STRING_32_H
+#define _ASM_X86_STRING_32_H
+
+#ifdef __KERNEL__
+
+/* Let gcc decide whether to inline or use the out of line functions */
+
+#define __HAVE_ARCH_STRCPY
+extern char *strcpy(char *dest, const char *src);
+
+#define __HAVE_ARCH_STRNCPY
+extern char *strncpy(char *dest, const char *src, size_t count);
+
+#define __HAVE_ARCH_STRCAT
+extern char *strcat(char *dest, const char *src);
+
+#define __HAVE_ARCH_STRNCAT
+extern char *strncat(char *dest, const char *src, size_t count);
+
+#define __HAVE_ARCH_STRCMP
+extern int strcmp(const char *cs, const char *ct);
+
+#define __HAVE_ARCH_STRNCMP
+extern int strncmp(const char *cs, const char *ct, size_t count);
+
+#define __HAVE_ARCH_STRCHR
+extern char *strchr(const char *s, int c);
+
+#define __HAVE_ARCH_STRLEN
+extern size_t strlen(const char *s);
+
+static __always_inline void *__memcpy(void *to, const void *from, size_t n)
+{
+ int d0, d1, d2;
+ asm volatile("rep ; movsl\n\t"
+ "movl %4,%%ecx\n\t"
+ "andl $3,%%ecx\n\t"
+ "jz 1f\n\t"
+ "rep ; movsb\n\t"
+ "1:"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from)
+ : "memory");
+ return to;
+}
+
+/*
+ * This looks ugly, but the compiler can optimize it totally,
+ * as the count is constant.
+ */
+static __always_inline void *__constant_memcpy(void *to, const void *from,
+ size_t n)
+{
+ long esi, edi;
+ if (!n)
+ return to;
+
+ switch (n) {
+ case 1:
+ *(char *)to = *(char *)from;
+ return to;
+ case 2:
+ *(short *)to = *(short *)from;
+ return to;
+ case 4:
+ *(int *)to = *(int *)from;
+ return to;
+ case 3:
+ *(short *)to = *(short *)from;
+ *((char *)to + 2) = *((char *)from + 2);
+ return to;
+ case 5:
+ *(int *)to = *(int *)from;
+ *((char *)to + 4) = *((char *)from + 4);
+ return to;
+ case 6:
+ *(int *)to = *(int *)from;
+ *((short *)to + 2) = *((short *)from + 2);
+ return to;
+ case 8:
+ *(int *)to = *(int *)from;
+ *((int *)to + 1) = *((int *)from + 1);
+ return to;
+ }
+
+ esi = (long)from;
+ edi = (long)to;
+ if (n >= 5 * 4) {
+ /* large block: use rep prefix */
+ int ecx;
+ asm volatile("rep ; movsl"
+ : "=&c" (ecx), "=&D" (edi), "=&S" (esi)
+ : "0" (n / 4), "1" (edi), "2" (esi)
+ : "memory"
+ );
+ } else {
+ /* small block: don't clobber ecx + smaller code */
+ if (n >= 4 * 4)
+ asm volatile("movsl"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ if (n >= 3 * 4)
+ asm volatile("movsl"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ if (n >= 2 * 4)
+ asm volatile("movsl"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ if (n >= 1 * 4)
+ asm volatile("movsl"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ }
+ switch (n % 4) {
+ /* tail */
+ case 0:
+ return to;
+ case 1:
+ asm volatile("movsb"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ return to;
+ case 2:
+ asm volatile("movsw"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ return to;
+ default:
+ asm volatile("movsw\n\tmovsb"
+ : "=&D"(edi), "=&S"(esi)
+ : "0"(edi), "1"(esi)
+ : "memory");
+ return to;
+ }
+}
+
+#define __HAVE_ARCH_MEMCPY
+extern void *memcpy(void *, const void *, size_t);
+
+#ifndef CONFIG_FORTIFY_SOURCE
+#ifdef CONFIG_X86_USE_3DNOW
+
+#include <asm/mmx.h>
+
+/*
+ * This CPU favours 3DNow strongly (eg AMD Athlon)
+ */
+
+static inline void *__constant_memcpy3d(void *to, const void *from, size_t len)
+{
+ if (len < 512)
+ return __constant_memcpy(to, from, len);
+ return _mmx_memcpy(to, from, len);
+}
+
+static inline void *__memcpy3d(void *to, const void *from, size_t len)
+{
+ if (len < 512)
+ return __memcpy(to, from, len);
+ return _mmx_memcpy(to, from, len);
+}
+
+#define memcpy(t, f, n) \
+ (__builtin_constant_p((n)) \
+ ? __constant_memcpy3d((t), (f), (n)) \
+ : __memcpy3d((t), (f), (n)))
+
+#else
+
+/*
+ * No 3D Now!
+ */
+
+#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
+
+#endif
+#endif /* !CONFIG_FORTIFY_SOURCE */
+
+#define __HAVE_ARCH_MEMMOVE
+void *memmove(void *dest, const void *src, size_t n);
+
+extern int memcmp(const void *, const void *, size_t);
+#ifndef CONFIG_FORTIFY_SOURCE
+#define memcmp __builtin_memcmp
+#endif
+
+#define __HAVE_ARCH_MEMCHR
+extern void *memchr(const void *cs, int c, size_t count);
+
+static inline void *__memset_generic(void *s, char c, size_t count)
+{
+ int d0, d1;
+ asm volatile("rep\n\t"
+ "stosb"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (c), "1" (s), "0" (count)
+ : "memory");
+ return s;
+}
+
+/* we might want to write optimized versions of these later */
+#define __constant_count_memset(s, c, count) __memset_generic((s), (c), (count))
+
+/*
+ * memset(x, 0, y) is a reasonably common thing to do, so we want to fill
+ * things 32 bits at a time even when we don't know the size of the
+ * area at compile-time..
+ */
+static __always_inline
+void *__constant_c_memset(void *s, unsigned long c, size_t count)
+{
+ int d0, d1;
+ asm volatile("rep ; stosl\n\t"
+ "testb $2,%b3\n\t"
+ "je 1f\n\t"
+ "stosw\n"
+ "1:\ttestb $1,%b3\n\t"
+ "je 2f\n\t"
+ "stosb\n"
+ "2:"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (c), "q" (count), "0" (count/4), "1" ((long)s)
+ : "memory");
+ return s;
+}
+
+/* Added by Gertjan van Wingerde to make minix and sysv module work */
+#define __HAVE_ARCH_STRNLEN
+extern size_t strnlen(const char *s, size_t count);
+/* end of additional stuff */
+
+#define __HAVE_ARCH_STRSTR
+extern char *strstr(const char *cs, const char *ct);
+
+/*
+ * This looks horribly ugly, but the compiler can optimize it totally,
+ * as we by now know that both pattern and count is constant..
+ */
+static __always_inline
+void *__constant_c_and_count_memset(void *s, unsigned long pattern,
+ size_t count)
+{
+ switch (count) {
+ case 0:
+ return s;
+ case 1:
+ *(unsigned char *)s = pattern & 0xff;
+ return s;
+ case 2:
+ *(unsigned short *)s = pattern & 0xffff;
+ return s;
+ case 3:
+ *(unsigned short *)s = pattern & 0xffff;
+ *((unsigned char *)s + 2) = pattern & 0xff;
+ return s;
+ case 4:
+ *(unsigned long *)s = pattern;
+ return s;
+ }
+
+#define COMMON(x) \
+ asm volatile("rep ; stosl" \
+ x \
+ : "=&c" (d0), "=&D" (d1) \
+ : "a" (eax), "0" (count/4), "1" ((long)s) \
+ : "memory")
+
+ {
+ int d0, d1;
+ unsigned long eax = pattern;
+
+ switch (count % 4) {
+ case 0:
+ COMMON("");
+ return s;
+ case 1:
+ COMMON("\n\tstosb");
+ return s;
+ case 2:
+ COMMON("\n\tstosw");
+ return s;
+ default:
+ COMMON("\n\tstosw\n\tstosb");
+ return s;
+ }
+ }
+
+#undef COMMON
+}
+
+#define __constant_c_x_memset(s, c, count) \
+ (__builtin_constant_p(count) \
+ ? __constant_c_and_count_memset((s), (c), (count)) \
+ : __constant_c_memset((s), (c), (count)))
+
+#define __memset(s, c, count) \
+ (__builtin_constant_p(count) \
+ ? __constant_count_memset((s), (c), (count)) \
+ : __memset_generic((s), (c), (count)))
+
+#define __HAVE_ARCH_MEMSET
+extern void *memset(void *, int, size_t);
+#ifndef CONFIG_FORTIFY_SOURCE
+#define memset(s, c, count) __builtin_memset(s, c, count)
+#endif /* !CONFIG_FORTIFY_SOURCE */
+
+#define __HAVE_ARCH_MEMSET16
+static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
+{
+ int d0, d1;
+ asm volatile("rep\n\t"
+ "stosw"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
+#define __HAVE_ARCH_MEMSET32
+static inline void *memset32(uint32_t *s, uint32_t v, size_t n)
+{
+ int d0, d1;
+ asm volatile("rep\n\t"
+ "stosl"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
+/*
+ * find the first occurrence of byte 'c', or 1 past the area if none
+ */
+#define __HAVE_ARCH_MEMSCAN
+extern void *memscan(void *addr, int c, size_t size);
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_STRING_32_H */
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
new file mode 100644
index 000000000..052a7a4ac
--- /dev/null
+++ b/arch/x86/include/asm/string_64.h
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_STRING_64_H
+#define _ASM_X86_STRING_64_H
+
+#ifdef __KERNEL__
+#include <linux/jump_label.h>
+
+/* Written 2002 by Andi Kleen */
+
+/* Only used for special circumstances. Stolen from i386/string.h */
+static __always_inline void *__inline_memcpy(void *to, const void *from, size_t n)
+{
+ unsigned long d0, d1, d2;
+ asm volatile("rep ; movsl\n\t"
+ "testb $2,%b4\n\t"
+ "je 1f\n\t"
+ "movsw\n"
+ "1:\ttestb $1,%b4\n\t"
+ "je 2f\n\t"
+ "movsb\n"
+ "2:"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (n / 4), "q" (n), "1" ((long)to), "2" ((long)from)
+ : "memory");
+ return to;
+}
+
+/* Even with __builtin_ the compiler may decide to use the out of line
+ function. */
+
+#define __HAVE_ARCH_MEMCPY 1
+extern void *memcpy(void *to, const void *from, size_t len);
+extern void *__memcpy(void *to, const void *from, size_t len);
+
+#define __HAVE_ARCH_MEMSET
+void *memset(void *s, int c, size_t n);
+void *__memset(void *s, int c, size_t n);
+
+#define __HAVE_ARCH_MEMSET16
+static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
+{
+ long d0, d1;
+ asm volatile("rep\n\t"
+ "stosw"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
+#define __HAVE_ARCH_MEMSET32
+static inline void *memset32(uint32_t *s, uint32_t v, size_t n)
+{
+ long d0, d1;
+ asm volatile("rep\n\t"
+ "stosl"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
+#define __HAVE_ARCH_MEMSET64
+static inline void *memset64(uint64_t *s, uint64_t v, size_t n)
+{
+ long d0, d1;
+ asm volatile("rep\n\t"
+ "stosq"
+ : "=&c" (d0), "=&D" (d1)
+ : "a" (v), "1" (s), "0" (n)
+ : "memory");
+ return s;
+}
+
+#define __HAVE_ARCH_MEMMOVE
+void *memmove(void *dest, const void *src, size_t count);
+void *__memmove(void *dest, const void *src, size_t count);
+
+int memcmp(const void *cs, const void *ct, size_t count);
+size_t strlen(const char *s);
+char *strcpy(char *dest, const char *src);
+char *strcat(char *dest, const char *src);
+int strcmp(const char *cs, const char *ct);
+
+#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
+
+/*
+ * For files that not instrumented (e.g. mm/slub.c) we
+ * should use not instrumented version of mem* functions.
+ */
+
+#undef memcpy
+#define memcpy(dst, src, len) __memcpy(dst, src, len)
+#define memmove(dst, src, len) __memmove(dst, src, len)
+#define memset(s, c, n) __memset(s, c, n)
+
+#ifndef __NO_FORTIFY
+#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */
+#endif
+
+#endif
+
+#define __HAVE_ARCH_MEMCPY_MCSAFE 1
+__must_check unsigned long __memcpy_mcsafe(void *dst, const void *src,
+ size_t cnt);
+DECLARE_STATIC_KEY_FALSE(mcsafe_key);
+
+/**
+ * memcpy_mcsafe - copy memory with indication if a machine check happened
+ *
+ * @dst: destination address
+ * @src: source address
+ * @cnt: number of bytes to copy
+ *
+ * Low level memory copy function that catches machine checks
+ * We only call into the "safe" function on systems that can
+ * actually do machine check recovery. Everyone else can just
+ * use memcpy().
+ *
+ * Return 0 for success, or number of bytes not copied if there was an
+ * exception.
+ */
+static __always_inline __must_check unsigned long
+memcpy_mcsafe(void *dst, const void *src, size_t cnt)
+{
+#ifdef CONFIG_X86_MCE
+ if (static_branch_unlikely(&mcsafe_key))
+ return __memcpy_mcsafe(dst, src, cnt);
+ else
+#endif
+ memcpy(dst, src, cnt);
+ return 0;
+}
+
+#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
+#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
+void memcpy_flushcache(void *dst, const void *src, size_t cnt);
+#endif
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_STRING_64_H */
diff --git a/arch/x86/include/asm/suspend.h b/arch/x86/include/asm/suspend.h
new file mode 100644
index 000000000..ecffe81ff
--- /dev/null
+++ b/arch/x86/include/asm/suspend.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifdef CONFIG_X86_32
+# include <asm/suspend_32.h>
+#else
+# include <asm/suspend_64.h>
+#endif
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
new file mode 100644
index 000000000..32662cbaa
--- /dev/null
+++ b/arch/x86/include/asm/suspend_32.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2001-2002 Pavel Machek <pavel@suse.cz>
+ * Based on code
+ * Copyright 2001 Patrick Mochel <mochel@osdl.org>
+ */
+#ifndef _ASM_X86_SUSPEND_32_H
+#define _ASM_X86_SUSPEND_32_H
+
+#include <asm/desc.h>
+#include <asm/fpu/api.h>
+
+/* image of the saved processor state */
+struct saved_context {
+ /*
+ * On x86_32, all segment registers, with the possible exception of
+ * gs, are saved at kernel entry in pt_regs.
+ */
+#ifdef CONFIG_X86_32_LAZY_GS
+ u16 gs;
+#endif
+ unsigned long cr0, cr2, cr3, cr4;
+ u64 misc_enable;
+ struct saved_msrs saved_msrs;
+ struct desc_ptr gdt_desc;
+ struct desc_ptr idt;
+ u16 ldt;
+ u16 tss;
+ unsigned long tr;
+ unsigned long safety;
+ unsigned long return_address;
+ bool misc_enable_saved;
+} __attribute__((packed));
+
+#endif /* _ASM_X86_SUSPEND_32_H */
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
new file mode 100644
index 000000000..b2861400c
--- /dev/null
+++ b/arch/x86/include/asm/suspend_64.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2001-2003 Pavel Machek <pavel@suse.cz>
+ * Based on code
+ * Copyright 2001 Patrick Mochel <mochel@osdl.org>
+ */
+#ifndef _ASM_X86_SUSPEND_64_H
+#define _ASM_X86_SUSPEND_64_H
+
+#include <asm/desc.h>
+#include <asm/fpu/api.h>
+
+/*
+ * Image of the saved processor state, used by the low level ACPI suspend to
+ * RAM code and by the low level hibernation code.
+ *
+ * If you modify it, check how it is used in arch/x86/kernel/acpi/wakeup_64.S
+ * and make sure that __save/__restore_processor_state(), defined in
+ * arch/x86/power/cpu.c, still work as required.
+ *
+ * Because the structure is packed, make sure to avoid unaligned members. For
+ * optimisation purposes but also because tools like kmemleak only search for
+ * pointers that are aligned.
+ */
+struct saved_context {
+ struct pt_regs regs;
+
+ /*
+ * User CS and SS are saved in current_pt_regs(). The rest of the
+ * segment selectors need to be saved and restored here.
+ */
+ u16 ds, es, fs, gs;
+
+ /*
+ * Usermode FSBASE and GSBASE may not match the fs and gs selectors,
+ * so we save them separately. We save the kernelmode GSBASE to
+ * restore percpu access after resume.
+ */
+ unsigned long kernelmode_gs_base, usermode_gs_base, fs_base;
+
+ unsigned long cr0, cr2, cr3, cr4, cr8;
+ u64 misc_enable;
+ struct saved_msrs saved_msrs;
+ unsigned long efer;
+ u16 gdt_pad; /* Unused */
+ struct desc_ptr gdt_desc;
+ u16 idt_pad;
+ struct desc_ptr idt;
+ u16 ldt;
+ u16 tss;
+ unsigned long tr;
+ unsigned long safety;
+ unsigned long return_address;
+ bool misc_enable_saved;
+} __attribute__((packed));
+
+#define loaddebug(thread,register) \
+ set_debugreg((thread)->debugreg##register, register)
+
+/* routines for saving/restoring kernel state */
+extern char core_restore_code[];
+extern char restore_registers[];
+
+#endif /* _ASM_X86_SUSPEND_64_H */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
new file mode 100644
index 000000000..b6dedf6c8
--- /dev/null
+++ b/arch/x86/include/asm/svm.h
@@ -0,0 +1,302 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __SVM_H
+#define __SVM_H
+
+#include <uapi/asm/svm.h>
+
+
+enum {
+ INTERCEPT_INTR,
+ INTERCEPT_NMI,
+ INTERCEPT_SMI,
+ INTERCEPT_INIT,
+ INTERCEPT_VINTR,
+ INTERCEPT_SELECTIVE_CR0,
+ INTERCEPT_STORE_IDTR,
+ INTERCEPT_STORE_GDTR,
+ INTERCEPT_STORE_LDTR,
+ INTERCEPT_STORE_TR,
+ INTERCEPT_LOAD_IDTR,
+ INTERCEPT_LOAD_GDTR,
+ INTERCEPT_LOAD_LDTR,
+ INTERCEPT_LOAD_TR,
+ INTERCEPT_RDTSC,
+ INTERCEPT_RDPMC,
+ INTERCEPT_PUSHF,
+ INTERCEPT_POPF,
+ INTERCEPT_CPUID,
+ INTERCEPT_RSM,
+ INTERCEPT_IRET,
+ INTERCEPT_INTn,
+ INTERCEPT_INVD,
+ INTERCEPT_PAUSE,
+ INTERCEPT_HLT,
+ INTERCEPT_INVLPG,
+ INTERCEPT_INVLPGA,
+ INTERCEPT_IOIO_PROT,
+ INTERCEPT_MSR_PROT,
+ INTERCEPT_TASK_SWITCH,
+ INTERCEPT_FERR_FREEZE,
+ INTERCEPT_SHUTDOWN,
+ INTERCEPT_VMRUN,
+ INTERCEPT_VMMCALL,
+ INTERCEPT_VMLOAD,
+ INTERCEPT_VMSAVE,
+ INTERCEPT_STGI,
+ INTERCEPT_CLGI,
+ INTERCEPT_SKINIT,
+ INTERCEPT_RDTSCP,
+ INTERCEPT_ICEBP,
+ INTERCEPT_WBINVD,
+ INTERCEPT_MONITOR,
+ INTERCEPT_MWAIT,
+ INTERCEPT_MWAIT_COND,
+ INTERCEPT_XSETBV,
+};
+
+
+struct __attribute__ ((__packed__)) vmcb_control_area {
+ u32 intercept_cr;
+ u32 intercept_dr;
+ u32 intercept_exceptions;
+ u64 intercept;
+ u8 reserved_1[40];
+ u16 pause_filter_thresh;
+ u16 pause_filter_count;
+ u64 iopm_base_pa;
+ u64 msrpm_base_pa;
+ u64 tsc_offset;
+ u32 asid;
+ u8 tlb_ctl;
+ u8 reserved_2[3];
+ u32 int_ctl;
+ u32 int_vector;
+ u32 int_state;
+ u8 reserved_3[4];
+ u32 exit_code;
+ u32 exit_code_hi;
+ u64 exit_info_1;
+ u64 exit_info_2;
+ u32 exit_int_info;
+ u32 exit_int_info_err;
+ u64 nested_ctl;
+ u64 avic_vapic_bar;
+ u8 reserved_4[8];
+ u32 event_inj;
+ u32 event_inj_err;
+ u64 nested_cr3;
+ u64 virt_ext;
+ u32 clean;
+ u32 reserved_5;
+ u64 next_rip;
+ u8 insn_len;
+ u8 insn_bytes[15];
+ u64 avic_backing_page; /* Offset 0xe0 */
+ u8 reserved_6[8]; /* Offset 0xe8 */
+ u64 avic_logical_id; /* Offset 0xf0 */
+ u64 avic_physical_id; /* Offset 0xf8 */
+ u8 reserved_7[768];
+};
+
+
+#define TLB_CONTROL_DO_NOTHING 0
+#define TLB_CONTROL_FLUSH_ALL_ASID 1
+#define TLB_CONTROL_FLUSH_ASID 3
+#define TLB_CONTROL_FLUSH_ASID_LOCAL 7
+
+#define V_TPR_MASK 0x0f
+
+#define V_IRQ_SHIFT 8
+#define V_IRQ_MASK (1 << V_IRQ_SHIFT)
+
+#define V_GIF_SHIFT 9
+#define V_GIF_MASK (1 << V_GIF_SHIFT)
+
+#define V_INTR_PRIO_SHIFT 16
+#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
+
+#define V_IGN_TPR_SHIFT 20
+#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)
+
+#define V_IRQ_INJECTION_BITS_MASK (V_IRQ_MASK | V_INTR_PRIO_MASK | V_IGN_TPR_MASK)
+
+#define V_INTR_MASKING_SHIFT 24
+#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
+
+#define V_GIF_ENABLE_SHIFT 25
+#define V_GIF_ENABLE_MASK (1 << V_GIF_ENABLE_SHIFT)
+
+#define AVIC_ENABLE_SHIFT 31
+#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT)
+
+#define LBR_CTL_ENABLE_MASK BIT_ULL(0)
+#define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1)
+
+#define SVM_INTERRUPT_SHADOW_MASK 1
+
+#define SVM_IOIO_STR_SHIFT 2
+#define SVM_IOIO_REP_SHIFT 3
+#define SVM_IOIO_SIZE_SHIFT 4
+#define SVM_IOIO_ASIZE_SHIFT 7
+
+#define SVM_IOIO_TYPE_MASK 1
+#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT)
+#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT)
+#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
+#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
+
+#define SVM_VM_CR_VALID_MASK 0x001fULL
+#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
+#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL
+
+#define SVM_NESTED_CTL_NP_ENABLE BIT(0)
+#define SVM_NESTED_CTL_SEV_ENABLE BIT(1)
+
+struct __attribute__ ((__packed__)) vmcb_seg {
+ u16 selector;
+ u16 attrib;
+ u32 limit;
+ u64 base;
+};
+
+struct __attribute__ ((__packed__)) vmcb_save_area {
+ struct vmcb_seg es;
+ struct vmcb_seg cs;
+ struct vmcb_seg ss;
+ struct vmcb_seg ds;
+ struct vmcb_seg fs;
+ struct vmcb_seg gs;
+ struct vmcb_seg gdtr;
+ struct vmcb_seg ldtr;
+ struct vmcb_seg idtr;
+ struct vmcb_seg tr;
+ u8 reserved_1[43];
+ u8 cpl;
+ u8 reserved_2[4];
+ u64 efer;
+ u8 reserved_3[112];
+ u64 cr4;
+ u64 cr3;
+ u64 cr0;
+ u64 dr7;
+ u64 dr6;
+ u64 rflags;
+ u64 rip;
+ u8 reserved_4[88];
+ u64 rsp;
+ u8 reserved_5[24];
+ u64 rax;
+ u64 star;
+ u64 lstar;
+ u64 cstar;
+ u64 sfmask;
+ u64 kernel_gs_base;
+ u64 sysenter_cs;
+ u64 sysenter_esp;
+ u64 sysenter_eip;
+ u64 cr2;
+ u8 reserved_6[32];
+ u64 g_pat;
+ u64 dbgctl;
+ u64 br_from;
+ u64 br_to;
+ u64 last_excp_from;
+ u64 last_excp_to;
+};
+
+struct __attribute__ ((__packed__)) vmcb {
+ struct vmcb_control_area control;
+ struct vmcb_save_area save;
+};
+
+#define SVM_CPUID_FUNC 0x8000000a
+
+#define SVM_VM_CR_SVM_DISABLE 4
+
+#define SVM_SELECTOR_S_SHIFT 4
+#define SVM_SELECTOR_DPL_SHIFT 5
+#define SVM_SELECTOR_P_SHIFT 7
+#define SVM_SELECTOR_AVL_SHIFT 8
+#define SVM_SELECTOR_L_SHIFT 9
+#define SVM_SELECTOR_DB_SHIFT 10
+#define SVM_SELECTOR_G_SHIFT 11
+
+#define SVM_SELECTOR_TYPE_MASK (0xf)
+#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT)
+#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT)
+#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT)
+#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT)
+#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT)
+#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT)
+#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT)
+
+#define SVM_SELECTOR_WRITE_MASK (1 << 1)
+#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
+#define SVM_SELECTOR_CODE_MASK (1 << 3)
+
+#define INTERCEPT_CR0_READ 0
+#define INTERCEPT_CR3_READ 3
+#define INTERCEPT_CR4_READ 4
+#define INTERCEPT_CR8_READ 8
+#define INTERCEPT_CR0_WRITE (16 + 0)
+#define INTERCEPT_CR3_WRITE (16 + 3)
+#define INTERCEPT_CR4_WRITE (16 + 4)
+#define INTERCEPT_CR8_WRITE (16 + 8)
+
+#define INTERCEPT_DR0_READ 0
+#define INTERCEPT_DR1_READ 1
+#define INTERCEPT_DR2_READ 2
+#define INTERCEPT_DR3_READ 3
+#define INTERCEPT_DR4_READ 4
+#define INTERCEPT_DR5_READ 5
+#define INTERCEPT_DR6_READ 6
+#define INTERCEPT_DR7_READ 7
+#define INTERCEPT_DR0_WRITE (16 + 0)
+#define INTERCEPT_DR1_WRITE (16 + 1)
+#define INTERCEPT_DR2_WRITE (16 + 2)
+#define INTERCEPT_DR3_WRITE (16 + 3)
+#define INTERCEPT_DR4_WRITE (16 + 4)
+#define INTERCEPT_DR5_WRITE (16 + 5)
+#define INTERCEPT_DR6_WRITE (16 + 6)
+#define INTERCEPT_DR7_WRITE (16 + 7)
+
+#define SVM_EVTINJ_VEC_MASK 0xff
+
+#define SVM_EVTINJ_TYPE_SHIFT 8
+#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT)
+
+#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT)
+#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT)
+#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT)
+#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT)
+
+#define SVM_EVTINJ_VALID (1 << 31)
+#define SVM_EVTINJ_VALID_ERR (1 << 11)
+
+#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
+#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK
+
+#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
+#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
+#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT
+#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT
+
+#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
+#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
+
+#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36
+#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
+#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
+
+#define SVM_EXITINFO_REG_MASK 0x0F
+
+#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)
+
+#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
+#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
+#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb"
+#define SVM_CLGI ".byte 0x0f, 0x01, 0xdd"
+#define SVM_STGI ".byte 0x0f, 0x01, 0xdc"
+#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf"
+
+#endif
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
new file mode 100644
index 000000000..ff6c92eff
--- /dev/null
+++ b/arch/x86/include/asm/swiotlb.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SWIOTLB_H
+#define _ASM_X86_SWIOTLB_H
+
+#include <linux/swiotlb.h>
+
+#ifdef CONFIG_SWIOTLB
+extern int swiotlb;
+extern int __init pci_swiotlb_detect_override(void);
+extern int __init pci_swiotlb_detect_4gb(void);
+extern void __init pci_swiotlb_init(void);
+extern void __init pci_swiotlb_late_init(void);
+#else
+#define swiotlb 0
+static inline int pci_swiotlb_detect_override(void)
+{
+ return 0;
+}
+static inline int pci_swiotlb_detect_4gb(void)
+{
+ return 0;
+}
+static inline void pci_swiotlb_init(void)
+{
+}
+static inline void pci_swiotlb_late_init(void)
+{
+}
+#endif
+#endif /* _ASM_X86_SWIOTLB_H */
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
new file mode 100644
index 000000000..157149d41
--- /dev/null
+++ b/arch/x86/include/asm/switch_to.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SWITCH_TO_H
+#define _ASM_X86_SWITCH_TO_H
+
+#include <linux/sched/task_stack.h>
+
+struct task_struct; /* one of the stranger aspects of C forward declarations */
+
+struct task_struct *__switch_to_asm(struct task_struct *prev,
+ struct task_struct *next);
+
+__visible struct task_struct *__switch_to(struct task_struct *prev,
+ struct task_struct *next);
+
+/* This runs runs on the previous thread's stack. */
+static inline void prepare_switch_to(struct task_struct *next)
+{
+#ifdef CONFIG_VMAP_STACK
+ /*
+ * If we switch to a stack that has a top-level paging entry
+ * that is not present in the current mm, the resulting #PF will
+ * will be promoted to a double-fault and we'll panic. Probe
+ * the new stack now so that vmalloc_fault can fix up the page
+ * tables if needed. This can only happen if we use a stack
+ * in vmap space.
+ *
+ * We assume that the stack is aligned so that it never spans
+ * more than one top-level paging entry.
+ *
+ * To minimize cache pollution, just follow the stack pointer.
+ */
+ READ_ONCE(*(unsigned char *)next->thread.sp);
+#endif
+}
+
+asmlinkage void ret_from_fork(void);
+
+/*
+ * This is the structure pointed to by thread.sp for an inactive task. The
+ * order of the fields must match the code in __switch_to_asm().
+ */
+struct inactive_task_frame {
+ unsigned long flags;
+#ifdef CONFIG_X86_64
+ unsigned long r15;
+ unsigned long r14;
+ unsigned long r13;
+ unsigned long r12;
+#else
+ unsigned long si;
+ unsigned long di;
+#endif
+ unsigned long bx;
+
+ /*
+ * These two fields must be together. They form a stack frame header,
+ * needed by get_frame_pointer().
+ */
+ unsigned long bp;
+ unsigned long ret_addr;
+};
+
+struct fork_frame {
+ struct inactive_task_frame frame;
+ struct pt_regs regs;
+};
+
+#define switch_to(prev, next, last) \
+do { \
+ prepare_switch_to(next); \
+ \
+ ((last) = __switch_to_asm((prev), (next))); \
+} while (0)
+
+#ifdef CONFIG_X86_32
+static inline void refresh_sysenter_cs(struct thread_struct *thread)
+{
+ /* Only happens when SEP is enabled, no need to test "SEP"arately: */
+ if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs))
+ return;
+
+ this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
+ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+}
+#endif
+
+/* This is used when switching tasks or entering/exiting vm86 mode. */
+static inline void update_task_stack(struct task_struct *task)
+{
+ /* sp0 always points to the entry trampoline stack, which is constant: */
+#ifdef CONFIG_X86_32
+ if (static_cpu_has(X86_FEATURE_XENPV))
+ load_sp0(task->thread.sp0);
+ else
+ this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
+#else
+ /*
+ * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
+ * doesn't work on x86-32 because sp1 and
+ * cpu_current_top_of_stack have different values (because of
+ * the non-zero stack-padding on 32bit).
+ */
+ if (static_cpu_has(X86_FEATURE_XENPV))
+ load_sp0(task_top_of_stack(task));
+#endif
+
+}
+
+#endif /* _ASM_X86_SWITCH_TO_H */
diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h
new file mode 100644
index 000000000..2fe745356
--- /dev/null
+++ b/arch/x86/include/asm/sync_bitops.h
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SYNC_BITOPS_H
+#define _ASM_X86_SYNC_BITOPS_H
+
+/*
+ * Copyright 1992, Linus Torvalds.
+ */
+
+/*
+ * These have to be done with inline assembly: that way the bit-setting
+ * is guaranteed to be atomic. All bit operations return 0 if the bit
+ * was cleared before the operation and != 0 if it was not.
+ *
+ * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
+ */
+
+#define ADDR (*(volatile long *)addr)
+
+/**
+ * sync_set_bit - Atomically set a bit in memory
+ * @nr: the bit to set
+ * @addr: the address to start counting from
+ *
+ * This function is atomic and may not be reordered. See __set_bit()
+ * if you do not require the atomic guarantees.
+ *
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void sync_set_bit(long nr, volatile unsigned long *addr)
+{
+ asm volatile("lock; bts %1,%0"
+ : "+m" (ADDR)
+ : "Ir" (nr)
+ : "memory");
+}
+
+/**
+ * sync_clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * sync_clear_bit() is atomic and may not be reordered. However, it does
+ * not contain a memory barrier, so if it is used for locking purposes,
+ * you should call smp_mb__before_atomic() and/or smp_mb__after_atomic()
+ * in order to ensure changes are visible on other processors.
+ */
+static inline void sync_clear_bit(long nr, volatile unsigned long *addr)
+{
+ asm volatile("lock; btr %1,%0"
+ : "+m" (ADDR)
+ : "Ir" (nr)
+ : "memory");
+}
+
+/**
+ * sync_change_bit - Toggle a bit in memory
+ * @nr: Bit to change
+ * @addr: Address to start counting from
+ *
+ * sync_change_bit() is atomic and may not be reordered.
+ * Note that @nr may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static inline void sync_change_bit(long nr, volatile unsigned long *addr)
+{
+ asm volatile("lock; btc %1,%0"
+ : "+m" (ADDR)
+ : "Ir" (nr)
+ : "memory");
+}
+
+/**
+ * sync_test_and_set_bit - Set a bit and return its old value
+ * @nr: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr)
+{
+ unsigned char oldbit;
+
+ asm volatile("lock; bts %2,%1\n\tsetc %0"
+ : "=qm" (oldbit), "+m" (ADDR)
+ : "Ir" (nr) : "memory");
+ return oldbit;
+}
+
+/**
+ * sync_test_and_clear_bit - Clear a bit and return its old value
+ * @nr: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr)
+{
+ unsigned char oldbit;
+
+ asm volatile("lock; btr %2,%1\n\tsetc %0"
+ : "=qm" (oldbit), "+m" (ADDR)
+ : "Ir" (nr) : "memory");
+ return oldbit;
+}
+
+/**
+ * sync_test_and_change_bit - Change a bit and return its old value
+ * @nr: Bit to change
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered.
+ * It also implies a memory barrier.
+ */
+static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr)
+{
+ unsigned char oldbit;
+
+ asm volatile("lock; btc %2,%1\n\tsetc %0"
+ : "=qm" (oldbit), "+m" (ADDR)
+ : "Ir" (nr) : "memory");
+ return oldbit;
+}
+
+#define sync_test_bit(nr, addr) test_bit(nr, addr)
+
+#undef ADDR
+
+#endif /* _ASM_X86_SYNC_BITOPS_H */
diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h
new file mode 100644
index 000000000..43b5e02a7
--- /dev/null
+++ b/arch/x86/include/asm/sync_core.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SYNC_CORE_H
+#define _ASM_X86_SYNC_CORE_H
+
+#include <linux/preempt.h>
+#include <asm/processor.h>
+#include <asm/cpufeature.h>
+
+/*
+ * Ensure that a core serializing instruction is issued before returning
+ * to user-mode. x86 implements return to user-space through sysexit,
+ * sysrel, and sysretq, which are not core serializing.
+ */
+static inline void sync_core_before_usermode(void)
+{
+ /* With PTI, we unconditionally serialize before running user code. */
+ if (static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ /*
+ * Even if we're in an interrupt, we might reschedule before returning,
+ * in which case we could switch to a different thread in the same mm
+ * and return using SYSRET or SYSEXIT. Instead of trying to keep
+ * track of our need to sync the core, just sync right away.
+ */
+ sync_core();
+}
+
+#endif /* _ASM_X86_SYNC_CORE_H */
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
new file mode 100644
index 000000000..d65313985
--- /dev/null
+++ b/arch/x86/include/asm/syscall.h
@@ -0,0 +1,246 @@
+/*
+ * Access to user system call parameters and results
+ *
+ * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * See asm-generic/syscall.h for descriptions of what we must do here.
+ */
+
+#ifndef _ASM_X86_SYSCALL_H
+#define _ASM_X86_SYSCALL_H
+
+#include <uapi/linux/audit.h>
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <asm/asm-offsets.h> /* For NR_syscalls */
+#include <asm/thread_info.h> /* for TS_COMPAT */
+#include <asm/unistd.h>
+
+#ifdef CONFIG_X86_64
+typedef asmlinkage long (*sys_call_ptr_t)(const struct pt_regs *);
+#else
+typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long,
+ unsigned long, unsigned long,
+ unsigned long, unsigned long);
+#endif /* CONFIG_X86_64 */
+extern const sys_call_ptr_t sys_call_table[];
+
+#if defined(CONFIG_X86_32)
+#define ia32_sys_call_table sys_call_table
+#define __NR_syscall_compat_max __NR_syscall_max
+#define IA32_NR_syscalls NR_syscalls
+#endif
+
+#if defined(CONFIG_IA32_EMULATION)
+extern const sys_call_ptr_t ia32_sys_call_table[];
+#endif
+
+/*
+ * Only the low 32 bits of orig_ax are meaningful, so we return int.
+ * This importantly ignores the high bits on 64-bit, so comparisons
+ * sign-extend the low 32 bits.
+ */
+static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
+{
+ return regs->orig_ax;
+}
+
+static inline void syscall_rollback(struct task_struct *task,
+ struct pt_regs *regs)
+{
+ regs->ax = regs->orig_ax;
+}
+
+static inline long syscall_get_error(struct task_struct *task,
+ struct pt_regs *regs)
+{
+ unsigned long error = regs->ax;
+#ifdef CONFIG_IA32_EMULATION
+ /*
+ * TS_COMPAT is set for 32-bit syscall entries and then
+ * remains set until we return to user mode.
+ */
+ if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED))
+ /*
+ * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
+ * and will match correctly in comparisons.
+ */
+ error = (long) (int) error;
+#endif
+ return IS_ERR_VALUE(error) ? error : 0;
+}
+
+static inline long syscall_get_return_value(struct task_struct *task,
+ struct pt_regs *regs)
+{
+ return regs->ax;
+}
+
+static inline void syscall_set_return_value(struct task_struct *task,
+ struct pt_regs *regs,
+ int error, long val)
+{
+ regs->ax = (long) error ?: val;
+}
+
+#ifdef CONFIG_X86_32
+
+static inline void syscall_get_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned int i, unsigned int n,
+ unsigned long *args)
+{
+ BUG_ON(i + n > 6);
+ memcpy(args, &regs->bx + i, n * sizeof(args[0]));
+}
+
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned int i, unsigned int n,
+ const unsigned long *args)
+{
+ BUG_ON(i + n > 6);
+ memcpy(&regs->bx + i, args, n * sizeof(args[0]));
+}
+
+static inline int syscall_get_arch(void)
+{
+ return AUDIT_ARCH_I386;
+}
+
+#else /* CONFIG_X86_64 */
+
+static inline void syscall_get_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned int i, unsigned int n,
+ unsigned long *args)
+{
+# ifdef CONFIG_IA32_EMULATION
+ if (task->thread_info.status & TS_COMPAT)
+ switch (i) {
+ case 0:
+ if (!n--) break;
+ *args++ = regs->bx;
+ case 1:
+ if (!n--) break;
+ *args++ = regs->cx;
+ case 2:
+ if (!n--) break;
+ *args++ = regs->dx;
+ case 3:
+ if (!n--) break;
+ *args++ = regs->si;
+ case 4:
+ if (!n--) break;
+ *args++ = regs->di;
+ case 5:
+ if (!n--) break;
+ *args++ = regs->bp;
+ case 6:
+ if (!n--) break;
+ default:
+ BUG();
+ break;
+ }
+ else
+# endif
+ switch (i) {
+ case 0:
+ if (!n--) break;
+ *args++ = regs->di;
+ case 1:
+ if (!n--) break;
+ *args++ = regs->si;
+ case 2:
+ if (!n--) break;
+ *args++ = regs->dx;
+ case 3:
+ if (!n--) break;
+ *args++ = regs->r10;
+ case 4:
+ if (!n--) break;
+ *args++ = regs->r8;
+ case 5:
+ if (!n--) break;
+ *args++ = regs->r9;
+ case 6:
+ if (!n--) break;
+ default:
+ BUG();
+ break;
+ }
+}
+
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned int i, unsigned int n,
+ const unsigned long *args)
+{
+# ifdef CONFIG_IA32_EMULATION
+ if (task->thread_info.status & TS_COMPAT)
+ switch (i) {
+ case 0:
+ if (!n--) break;
+ regs->bx = *args++;
+ case 1:
+ if (!n--) break;
+ regs->cx = *args++;
+ case 2:
+ if (!n--) break;
+ regs->dx = *args++;
+ case 3:
+ if (!n--) break;
+ regs->si = *args++;
+ case 4:
+ if (!n--) break;
+ regs->di = *args++;
+ case 5:
+ if (!n--) break;
+ regs->bp = *args++;
+ case 6:
+ if (!n--) break;
+ default:
+ BUG();
+ break;
+ }
+ else
+# endif
+ switch (i) {
+ case 0:
+ if (!n--) break;
+ regs->di = *args++;
+ case 1:
+ if (!n--) break;
+ regs->si = *args++;
+ case 2:
+ if (!n--) break;
+ regs->dx = *args++;
+ case 3:
+ if (!n--) break;
+ regs->r10 = *args++;
+ case 4:
+ if (!n--) break;
+ regs->r8 = *args++;
+ case 5:
+ if (!n--) break;
+ regs->r9 = *args++;
+ case 6:
+ if (!n--) break;
+ default:
+ BUG();
+ break;
+ }
+}
+
+static inline int syscall_get_arch(void)
+{
+ /* x32 tasks should be considered AUDIT_ARCH_X86_64. */
+ return in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
+}
+#endif /* CONFIG_X86_32 */
+
+#endif /* _ASM_X86_SYSCALL_H */
diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h
new file mode 100644
index 000000000..90eb70df0
--- /dev/null
+++ b/arch/x86/include/asm/syscall_wrapper.h
@@ -0,0 +1,210 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * syscall_wrapper.h - x86 specific wrappers to syscall definitions
+ */
+
+#ifndef _ASM_X86_SYSCALL_WRAPPER_H
+#define _ASM_X86_SYSCALL_WRAPPER_H
+
+/* Mapping of registers to parameters for syscalls on x86-64 and x32 */
+#define SC_X86_64_REGS_TO_ARGS(x, ...) \
+ __MAP(x,__SC_ARGS \
+ ,,regs->di,,regs->si,,regs->dx \
+ ,,regs->r10,,regs->r8,,regs->r9) \
+
+/* Mapping of registers to parameters for syscalls on i386 */
+#define SC_IA32_REGS_TO_ARGS(x, ...) \
+ __MAP(x,__SC_ARGS \
+ ,,(unsigned int)regs->bx,,(unsigned int)regs->cx \
+ ,,(unsigned int)regs->dx,,(unsigned int)regs->si \
+ ,,(unsigned int)regs->di,,(unsigned int)regs->bp)
+
+#ifdef CONFIG_IA32_EMULATION
+/*
+ * For IA32 emulation, we need to handle "compat" syscalls *and* create
+ * additional wrappers (aptly named __ia32_sys_xyzzy) which decode the
+ * ia32 regs in the proper order for shared or "common" syscalls. As some
+ * syscalls may not be implemented, we need to expand COND_SYSCALL in
+ * kernel/sys_ni.c and SYS_NI in kernel/time/posix-stubs.c to cover this
+ * case as well.
+ */
+#define __IA32_COMPAT_SYS_STUBx(x, name, ...) \
+ asmlinkage long __ia32_compat_sys##name(const struct pt_regs *regs);\
+ ALLOW_ERROR_INJECTION(__ia32_compat_sys##name, ERRNO); \
+ asmlinkage long __ia32_compat_sys##name(const struct pt_regs *regs)\
+ { \
+ return __se_compat_sys##name(SC_IA32_REGS_TO_ARGS(x,__VA_ARGS__));\
+ } \
+
+#define __IA32_SYS_STUBx(x, name, ...) \
+ asmlinkage long __ia32_sys##name(const struct pt_regs *regs); \
+ ALLOW_ERROR_INJECTION(__ia32_sys##name, ERRNO); \
+ asmlinkage long __ia32_sys##name(const struct pt_regs *regs) \
+ { \
+ return __se_sys##name(SC_IA32_REGS_TO_ARGS(x,__VA_ARGS__));\
+ }
+
+/*
+ * To keep the naming coherent, re-define SYSCALL_DEFINE0 to create an alias
+ * named __ia32_sys_*()
+ */
+
+#define SYSCALL_DEFINE0(sname) \
+ SYSCALL_METADATA(_##sname, 0); \
+ asmlinkage long __x64_sys_##sname(const struct pt_regs *__unused);\
+ ALLOW_ERROR_INJECTION(__x64_sys_##sname, ERRNO); \
+ SYSCALL_ALIAS(__ia32_sys_##sname, __x64_sys_##sname); \
+ asmlinkage long __x64_sys_##sname(const struct pt_regs *__unused)
+
+#define COND_SYSCALL(name) \
+ cond_syscall(__x64_sys_##name); \
+ cond_syscall(__ia32_sys_##name)
+
+#define SYS_NI(name) \
+ SYSCALL_ALIAS(__x64_sys_##name, sys_ni_posix_timers); \
+ SYSCALL_ALIAS(__ia32_sys_##name, sys_ni_posix_timers)
+
+#else /* CONFIG_IA32_EMULATION */
+#define __IA32_COMPAT_SYS_STUBx(x, name, ...)
+#define __IA32_SYS_STUBx(x, fullname, name, ...)
+#endif /* CONFIG_IA32_EMULATION */
+
+
+#ifdef CONFIG_X86_X32
+/*
+ * For the x32 ABI, we need to create a stub for compat_sys_*() which is aware
+ * of the x86-64-style parameter ordering of x32 syscalls. The syscalls common
+ * with x86_64 obviously do not need such care.
+ */
+#define __X32_COMPAT_SYS_STUBx(x, name, ...) \
+ asmlinkage long __x32_compat_sys##name(const struct pt_regs *regs);\
+ ALLOW_ERROR_INJECTION(__x32_compat_sys##name, ERRNO); \
+ asmlinkage long __x32_compat_sys##name(const struct pt_regs *regs)\
+ { \
+ return __se_compat_sys##name(SC_X86_64_REGS_TO_ARGS(x,__VA_ARGS__));\
+ } \
+
+#else /* CONFIG_X86_X32 */
+#define __X32_COMPAT_SYS_STUBx(x, name, ...)
+#endif /* CONFIG_X86_X32 */
+
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat means IA32_EMULATION and/or X86_X32. As they use a different
+ * mapping of registers to parameters, we need to generate stubs for each
+ * of them.
+ */
+#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \
+ static long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
+ static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
+ __IA32_COMPAT_SYS_STUBx(x, name, __VA_ARGS__) \
+ __X32_COMPAT_SYS_STUBx(x, name, __VA_ARGS__) \
+ static long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
+ { \
+ return __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__));\
+ } \
+ static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
+
+/*
+ * As some compat syscalls may not be implemented, we need to expand
+ * COND_SYSCALL_COMPAT in kernel/sys_ni.c and COMPAT_SYS_NI in
+ * kernel/time/posix-stubs.c to cover this case as well.
+ */
+#define COND_SYSCALL_COMPAT(name) \
+ cond_syscall(__ia32_compat_sys_##name); \
+ cond_syscall(__x32_compat_sys_##name)
+
+#define COMPAT_SYS_NI(name) \
+ SYSCALL_ALIAS(__ia32_compat_sys_##name, sys_ni_posix_timers); \
+ SYSCALL_ALIAS(__x32_compat_sys_##name, sys_ni_posix_timers)
+
+#endif /* CONFIG_COMPAT */
+
+
+/*
+ * Instead of the generic __SYSCALL_DEFINEx() definition, this macro takes
+ * struct pt_regs *regs as the only argument of the syscall stub named
+ * __x64_sys_*(). It decodes just the registers it needs and passes them on to
+ * the __se_sys_*() wrapper performing sign extension and then to the
+ * __do_sys_*() function doing the actual job. These wrappers and functions
+ * are inlined (at least in very most cases), meaning that the assembly looks
+ * as follows (slightly re-ordered for better readability):
+ *
+ * <__x64_sys_recv>: <-- syscall with 4 parameters
+ * callq <__fentry__>
+ *
+ * mov 0x70(%rdi),%rdi <-- decode regs->di
+ * mov 0x68(%rdi),%rsi <-- decode regs->si
+ * mov 0x60(%rdi),%rdx <-- decode regs->dx
+ * mov 0x38(%rdi),%rcx <-- decode regs->r10
+ *
+ * xor %r9d,%r9d <-- clear %r9
+ * xor %r8d,%r8d <-- clear %r8
+ *
+ * callq __sys_recvfrom <-- do the actual work in __sys_recvfrom()
+ * which takes 6 arguments
+ *
+ * cltq <-- extend return value to 64-bit
+ * retq <-- return
+ *
+ * This approach avoids leaking random user-provided register content down
+ * the call chain.
+ *
+ * If IA32_EMULATION is enabled, this macro generates an additional wrapper
+ * named __ia32_sys_*() which decodes the struct pt_regs *regs according
+ * to the i386 calling convention (bx, cx, dx, si, di, bp).
+ */
+#define __SYSCALL_DEFINEx(x, name, ...) \
+ asmlinkage long __x64_sys##name(const struct pt_regs *regs); \
+ ALLOW_ERROR_INJECTION(__x64_sys##name, ERRNO); \
+ static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
+ static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
+ asmlinkage long __x64_sys##name(const struct pt_regs *regs) \
+ { \
+ return __se_sys##name(SC_X86_64_REGS_TO_ARGS(x,__VA_ARGS__));\
+ } \
+ __IA32_SYS_STUBx(x, name, __VA_ARGS__) \
+ static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
+ { \
+ long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\
+ __MAP(x,__SC_TEST,__VA_ARGS__); \
+ __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
+ return ret; \
+ } \
+ static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
+
+/*
+ * As the generic SYSCALL_DEFINE0() macro does not decode any parameters for
+ * obvious reasons, and passing struct pt_regs *regs to it in %rdi does not
+ * hurt, we only need to re-define it here to keep the naming congruent to
+ * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() and SYS_NI()
+ * macros to work correctly.
+ */
+#ifndef SYSCALL_DEFINE0
+#define SYSCALL_DEFINE0(sname) \
+ SYSCALL_METADATA(_##sname, 0); \
+ asmlinkage long __x64_sys_##sname(const struct pt_regs *__unused);\
+ ALLOW_ERROR_INJECTION(__x64_sys_##sname, ERRNO); \
+ asmlinkage long __x64_sys_##sname(const struct pt_regs *__unused)
+#endif
+
+#ifndef COND_SYSCALL
+#define COND_SYSCALL(name) cond_syscall(__x64_sys_##name)
+#endif
+
+#ifndef SYS_NI
+#define SYS_NI(name) SYSCALL_ALIAS(__x64_sys_##name, sys_ni_posix_timers);
+#endif
+
+
+/*
+ * For VSYSCALLS, we need to declare these three syscalls with the new
+ * pt_regs-based calling convention for in-kernel use.
+ */
+struct pt_regs;
+asmlinkage long __x64_sys_getcpu(const struct pt_regs *regs);
+asmlinkage long __x64_sys_gettimeofday(const struct pt_regs *regs);
+asmlinkage long __x64_sys_time(const struct pt_regs *regs);
+
+#endif /* _ASM_X86_SYSCALL_WRAPPER_H */
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
new file mode 100644
index 000000000..9fa979dd0
--- /dev/null
+++ b/arch/x86/include/asm/syscalls.h
@@ -0,0 +1,51 @@
+/*
+ * syscalls.h - Linux syscall interfaces (arch-specific)
+ *
+ * Copyright (c) 2008 Jaswinder Singh Rajput
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#ifndef _ASM_X86_SYSCALLS_H
+#define _ASM_X86_SYSCALLS_H
+
+#include <linux/compiler.h>
+#include <linux/linkage.h>
+#include <linux/signal.h>
+#include <linux/types.h>
+
+/* Common in X86_32 and X86_64 */
+/* kernel/ioport.c */
+long ksys_ioperm(unsigned long from, unsigned long num, int turn_on);
+
+#ifdef CONFIG_X86_32
+/*
+ * These definitions are only valid on pure 32-bit systems; x86-64 uses a
+ * different syscall calling convention
+ */
+asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
+asmlinkage long sys_iopl(unsigned int);
+
+/* kernel/ldt.c */
+asmlinkage long sys_modify_ldt(int, void __user *, unsigned long);
+
+/* kernel/signal.c */
+asmlinkage long sys_rt_sigreturn(void);
+
+/* kernel/tls.c */
+asmlinkage long sys_set_thread_area(struct user_desc __user *);
+asmlinkage long sys_get_thread_area(struct user_desc __user *);
+
+/* X86_32 only */
+
+/* kernel/signal.c */
+asmlinkage long sys_sigreturn(void);
+
+/* kernel/vm86_32.c */
+struct vm86_struct;
+asmlinkage long sys_vm86old(struct vm86_struct __user *);
+asmlinkage long sys_vm86(unsigned long, unsigned long);
+
+#endif /* CONFIG_X86_32 */
+#endif /* _ASM_X86_SYSCALLS_H */
diff --git a/arch/x86/include/asm/sysfb.h b/arch/x86/include/asm/sysfb.h
new file mode 100644
index 000000000..2aeb3e255
--- /dev/null
+++ b/arch/x86/include/asm/sysfb.h
@@ -0,0 +1,98 @@
+#ifndef _ARCH_X86_KERNEL_SYSFB_H
+#define _ARCH_X86_KERNEL_SYSFB_H
+
+/*
+ * Generic System Framebuffers on x86
+ * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/platform_data/simplefb.h>
+#include <linux/screen_info.h>
+
+enum {
+ M_I17, /* 17-Inch iMac */
+ M_I20, /* 20-Inch iMac */
+ M_I20_SR, /* 20-Inch iMac (Santa Rosa) */
+ M_I24, /* 24-Inch iMac */
+ M_I24_8_1, /* 24-Inch iMac, 8,1th gen */
+ M_I24_10_1, /* 24-Inch iMac, 10,1th gen */
+ M_I27_11_1, /* 27-Inch iMac, 11,1th gen */
+ M_MINI, /* Mac Mini */
+ M_MINI_3_1, /* Mac Mini, 3,1th gen */
+ M_MINI_4_1, /* Mac Mini, 4,1th gen */
+ M_MB, /* MacBook */
+ M_MB_2, /* MacBook, 2nd rev. */
+ M_MB_3, /* MacBook, 3rd rev. */
+ M_MB_5_1, /* MacBook, 5th rev. */
+ M_MB_6_1, /* MacBook, 6th rev. */
+ M_MB_7_1, /* MacBook, 7th rev. */
+ M_MB_SR, /* MacBook, 2nd gen, (Santa Rosa) */
+ M_MBA, /* MacBook Air */
+ M_MBA_3, /* Macbook Air, 3rd rev */
+ M_MBP, /* MacBook Pro */
+ M_MBP_2, /* MacBook Pro 2nd gen */
+ M_MBP_2_2, /* MacBook Pro 2,2nd gen */
+ M_MBP_SR, /* MacBook Pro (Santa Rosa) */
+ M_MBP_4, /* MacBook Pro, 4th gen */
+ M_MBP_5_1, /* MacBook Pro, 5,1th gen */
+ M_MBP_5_2, /* MacBook Pro, 5,2th gen */
+ M_MBP_5_3, /* MacBook Pro, 5,3rd gen */
+ M_MBP_6_1, /* MacBook Pro, 6,1th gen */
+ M_MBP_6_2, /* MacBook Pro, 6,2th gen */
+ M_MBP_7_1, /* MacBook Pro, 7,1th gen */
+ M_MBP_8_2, /* MacBook Pro, 8,2nd gen */
+ M_UNKNOWN /* placeholder */
+};
+
+struct efifb_dmi_info {
+ char *optname;
+ unsigned long base;
+ int stride;
+ int width;
+ int height;
+ int flags;
+};
+
+#ifdef CONFIG_EFI
+
+extern struct efifb_dmi_info efifb_dmi_list[];
+void sysfb_apply_efi_quirks(void);
+
+#else /* CONFIG_EFI */
+
+static inline void sysfb_apply_efi_quirks(void)
+{
+}
+
+#endif /* CONFIG_EFI */
+
+#ifdef CONFIG_X86_SYSFB
+
+bool parse_mode(const struct screen_info *si,
+ struct simplefb_platform_data *mode);
+int create_simplefb(const struct screen_info *si,
+ const struct simplefb_platform_data *mode);
+
+#else /* CONFIG_X86_SYSFB */
+
+static inline bool parse_mode(const struct screen_info *si,
+ struct simplefb_platform_data *mode)
+{
+ return false;
+}
+
+static inline int create_simplefb(const struct screen_info *si,
+ const struct simplefb_platform_data *mode)
+{
+ return -EINVAL;
+}
+
+#endif /* CONFIG_X86_SYSFB */
+
+#endif /* _ARCH_X86_KERNEL_SYSFB_H */
diff --git a/arch/x86/include/asm/tce.h b/arch/x86/include/asm/tce.h
new file mode 100644
index 000000000..7a6677c1a
--- /dev/null
+++ b/arch/x86/include/asm/tce.h
@@ -0,0 +1,48 @@
+/*
+ * This file is derived from asm-powerpc/tce.h.
+ *
+ * Copyright (C) IBM Corporation, 2006
+ *
+ * Author: Muli Ben-Yehuda <muli@il.ibm.com>
+ * Author: Jon Mason <jdmason@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _ASM_X86_TCE_H
+#define _ASM_X86_TCE_H
+
+extern unsigned int specified_table_size;
+struct iommu_table;
+
+#define TCE_ENTRY_SIZE 8 /* in bytes */
+
+#define TCE_READ_SHIFT 0
+#define TCE_WRITE_SHIFT 1
+#define TCE_HUBID_SHIFT 2 /* unused */
+#define TCE_RSVD_SHIFT 8 /* unused */
+#define TCE_RPN_SHIFT 12
+#define TCE_UNUSED_SHIFT 48 /* unused */
+
+#define TCE_RPN_MASK 0x0000fffffffff000ULL
+
+extern void tce_build(struct iommu_table *tbl, unsigned long index,
+ unsigned int npages, unsigned long uaddr, int direction);
+extern void tce_free(struct iommu_table *tbl, long index, unsigned int npages);
+extern void * __init alloc_tce_table(void);
+extern void __init free_tce_table(void *tbl);
+extern int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar);
+
+#endif /* _ASM_X86_TCE_H */
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
new file mode 100644
index 000000000..0bbb07eae
--- /dev/null
+++ b/arch/x86/include/asm/text-patching.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_TEXT_PATCHING_H
+#define _ASM_X86_TEXT_PATCHING_H
+
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <asm/ptrace.h>
+
+struct paravirt_patch_site;
+#ifdef CONFIG_PARAVIRT
+void apply_paravirt(struct paravirt_patch_site *start,
+ struct paravirt_patch_site *end);
+#else
+static inline void apply_paravirt(struct paravirt_patch_site *start,
+ struct paravirt_patch_site *end)
+{}
+#define __parainstructions NULL
+#define __parainstructions_end NULL
+#endif
+
+extern void *text_poke_early(void *addr, const void *opcode, size_t len);
+
+/*
+ * Clear and restore the kernel write-protection flag on the local CPU.
+ * Allows the kernel to edit read-only pages.
+ * Side-effect: any interrupt handler running between save and restore will have
+ * the ability to write to read-only pages.
+ *
+ * Warning:
+ * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and
+ * no thread can be preempted in the instructions being modified (no iret to an
+ * invalid instruction possible) or if the instructions are changed from a
+ * consistent state to another consistent state atomically.
+ * On the local CPU you need to be protected again NMI or MCE handlers seeing an
+ * inconsistent instruction while you patch.
+ */
+extern void *text_poke(void *addr, const void *opcode, size_t len);
+extern int poke_int3_handler(struct pt_regs *regs);
+extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
+extern int after_bootmem;
+
+#ifndef CONFIG_UML_X86
+static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
+{
+ regs->ip = ip;
+}
+
+#define INT3_INSN_SIZE 1
+#define CALL_INSN_SIZE 5
+
+#ifdef CONFIG_X86_64
+static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val)
+{
+ /*
+ * The int3 handler in entry_64.S adds a gap between the
+ * stack where the break point happened, and the saving of
+ * pt_regs. We can extend the original stack because of
+ * this gap. See the idtentry macro's create_gap option.
+ */
+ regs->sp -= sizeof(unsigned long);
+ *(unsigned long *)regs->sp = val;
+}
+
+static inline void int3_emulate_call(struct pt_regs *regs, unsigned long func)
+{
+ int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE);
+ int3_emulate_jmp(regs, func);
+}
+#endif /* CONFIG_X86_64 */
+#endif /* !CONFIG_UML_X86 */
+
+#endif /* _ASM_X86_TEXT_PATCHING_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
new file mode 100644
index 000000000..b5e4c3575
--- /dev/null
+++ b/arch/x86/include/asm/thread_info.h
@@ -0,0 +1,279 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* thread_info.h: low-level thread information
+ *
+ * Copyright (C) 2002 David Howells (dhowells@redhat.com)
+ * - Incorporating suggestions made by Linus Torvalds and Dave Miller
+ */
+
+#ifndef _ASM_X86_THREAD_INFO_H
+#define _ASM_X86_THREAD_INFO_H
+
+#include <linux/compiler.h>
+#include <asm/page.h>
+#include <asm/percpu.h>
+#include <asm/types.h>
+
+/*
+ * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we
+ * reserve at the top of the kernel stack. We do it because of a nasty
+ * 32-bit corner case. On x86_32, the hardware stack frame is
+ * variable-length. Except for vm86 mode, struct pt_regs assumes a
+ * maximum-length frame. If we enter from CPL 0, the top 8 bytes of
+ * pt_regs don't actually exist. Ordinarily this doesn't matter, but it
+ * does in at least one case:
+ *
+ * If we take an NMI early enough in SYSENTER, then we can end up with
+ * pt_regs that extends above sp0. On the way out, in the espfix code,
+ * we can read the saved SS value, but that value will be above sp0.
+ * Without this offset, that can result in a page fault. (We are
+ * careful that, in this case, the value we read doesn't matter.)
+ *
+ * In vm86 mode, the hardware frame is much longer still, so add 16
+ * bytes to make room for the real-mode segments.
+ *
+ * x86_64 has a fixed-length stack frame.
+ */
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_VM86
+# define TOP_OF_KERNEL_STACK_PADDING 16
+# else
+# define TOP_OF_KERNEL_STACK_PADDING 8
+# endif
+#else
+# define TOP_OF_KERNEL_STACK_PADDING 0
+#endif
+
+/*
+ * low level task data that entry.S needs immediate access to
+ * - this struct should fit entirely inside of one cache line
+ * - this struct shares the supervisor stack pages
+ */
+#ifndef __ASSEMBLY__
+struct task_struct;
+#include <asm/cpufeature.h>
+#include <linux/atomic.h>
+
+struct thread_info {
+ unsigned long flags; /* low level flags */
+ u32 status; /* thread synchronous flags */
+};
+
+#define INIT_THREAD_INFO(tsk) \
+{ \
+ .flags = 0, \
+}
+
+#else /* !__ASSEMBLY__ */
+
+#include <asm/asm-offsets.h>
+
+#endif
+
+/*
+ * thread information flags
+ * - these are process state flags that various assembly files
+ * may need to access
+ */
+#define TIF_SYSCALL_TRACE 0 /* syscall trace active */
+#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */
+#define TIF_SIGPENDING 2 /* signal pending */
+#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
+#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
+#define TIF_SSBD 5 /* Speculative store bypass disable */
+#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
+#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
+#define TIF_SECCOMP 8 /* secure computing */
+#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */
+#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */
+#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
+#define TIF_UPROBE 12 /* breakpointed or singlestepping */
+#define TIF_PATCH_PENDING 13 /* pending live patching update */
+#define TIF_NOCPUID 15 /* CPUID is not accessible in userland */
+#define TIF_NOTSC 16 /* TSC is not accessible in userland */
+#define TIF_IA32 17 /* IA32 compatibility process */
+#define TIF_NOHZ 19 /* in adaptive nohz mode */
+#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
+#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
+#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
+#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
+#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
+#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
+#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
+#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */
+#define TIF_X32 30 /* 32-bit native x86-64 binary */
+#define TIF_FSCHECK 31 /* Check FS is USER_DS on return */
+
+#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
+#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
+#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
+#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
+#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
+#define _TIF_SSBD (1 << TIF_SSBD)
+#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
+#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
+#define _TIF_SECCOMP (1 << TIF_SECCOMP)
+#define _TIF_SPEC_IB (1 << TIF_SPEC_IB)
+#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE)
+#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
+#define _TIF_UPROBE (1 << TIF_UPROBE)
+#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING)
+#define _TIF_NOCPUID (1 << TIF_NOCPUID)
+#define _TIF_NOTSC (1 << TIF_NOTSC)
+#define _TIF_IA32 (1 << TIF_IA32)
+#define _TIF_NOHZ (1 << TIF_NOHZ)
+#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
+#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
+#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
+#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
+#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
+#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
+#define _TIF_ADDR32 (1 << TIF_ADDR32)
+#define _TIF_X32 (1 << TIF_X32)
+#define _TIF_FSCHECK (1 << TIF_FSCHECK)
+
+/*
+ * work to do in syscall_trace_enter(). Also includes TIF_NOHZ for
+ * enter_from_user_mode()
+ */
+#define _TIF_WORK_SYSCALL_ENTRY \
+ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \
+ _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
+ _TIF_NOHZ)
+
+/* work to do on any return to user space */
+#define _TIF_ALLWORK_MASK \
+ (_TIF_SYSCALL_TRACE | _TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \
+ _TIF_NEED_RESCHED | _TIF_SINGLESTEP | _TIF_SYSCALL_EMU | \
+ _TIF_SYSCALL_AUDIT | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE | \
+ _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT | \
+ _TIF_FSCHECK)
+
+/* flags to check in __switch_to() */
+#define _TIF_WORK_CTXSW_BASE \
+ (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP| \
+ _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE)
+
+/*
+ * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated.
+ */
+#ifdef CONFIG_SMP
+# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE | _TIF_SPEC_IB)
+#else
+# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE)
+#endif
+
+#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
+#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
+
+#define STACK_WARN (THREAD_SIZE/8)
+
+/*
+ * macros/functions for gaining access to the thread information structure
+ *
+ * preempt_count needs to be 1 initially, until the scheduler is functional.
+ */
+#ifndef __ASSEMBLY__
+
+/*
+ * Walks up the stack frames to make sure that the specified object is
+ * entirely contained by a single stack frame.
+ *
+ * Returns:
+ * GOOD_FRAME if within a frame
+ * BAD_STACK if placed across a frame boundary (or outside stack)
+ * NOT_STACK unable to determine (no frame pointers, etc)
+ */
+static inline int arch_within_stack_frames(const void * const stack,
+ const void * const stackend,
+ const void *obj, unsigned long len)
+{
+#if defined(CONFIG_FRAME_POINTER)
+ const void *frame = NULL;
+ const void *oldframe;
+
+ oldframe = __builtin_frame_address(1);
+ if (oldframe)
+ frame = __builtin_frame_address(2);
+ /*
+ * low ----------------------------------------------> high
+ * [saved bp][saved ip][args][local vars][saved bp][saved ip]
+ * ^----------------^
+ * allow copies only within here
+ */
+ while (stack <= frame && frame < stackend) {
+ /*
+ * If obj + len extends past the last frame, this
+ * check won't pass and the next frame will be 0,
+ * causing us to bail out and correctly report
+ * the copy as invalid.
+ */
+ if (obj + len <= frame)
+ return obj >= oldframe + 2 * sizeof(void *) ?
+ GOOD_FRAME : BAD_STACK;
+ oldframe = frame;
+ frame = *(const void * const *)frame;
+ }
+ return BAD_STACK;
+#else
+ return NOT_STACK;
+#endif
+}
+
+#else /* !__ASSEMBLY__ */
+
+#ifdef CONFIG_X86_64
+# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
+#endif
+
+#endif
+
+/*
+ * Thread-synchronous status.
+ *
+ * This is different from the flags in that nobody else
+ * ever touches our thread-synchronous status, so we don't
+ * have to worry about atomic accesses.
+ */
+#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
+
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_COMPAT
+#define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */
+#define TS_COMPAT_RESTART 0x0008
+
+#define arch_set_restart_data arch_set_restart_data
+
+static inline void arch_set_restart_data(struct restart_block *restart)
+{
+ struct thread_info *ti = current_thread_info();
+ if (ti->status & TS_COMPAT)
+ ti->status |= TS_COMPAT_RESTART;
+ else
+ ti->status &= ~TS_COMPAT_RESTART;
+}
+#endif
+
+#ifdef CONFIG_X86_32
+#define in_ia32_syscall() true
+#else
+#define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \
+ current_thread_info()->status & TS_COMPAT)
+#endif
+
+/*
+ * Force syscall return via IRET by making it look as if there was
+ * some work pending. IRET is our most capable (but slowest) syscall
+ * return path, which is able to restore modified SS, CS and certain
+ * EFLAGS values that other (fast) syscall return instructions
+ * are not able to restore properly.
+ */
+#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME)
+
+extern void arch_task_cache_init(void);
+extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
+extern void arch_release_task_struct(struct task_struct *tsk);
+extern void arch_setup_new_exec(void);
+#define arch_setup_new_exec arch_setup_new_exec
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_THREAD_INFO_H */
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h
new file mode 100644
index 000000000..cef818b16
--- /dev/null
+++ b/arch/x86/include/asm/time.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_TIME_H
+#define _ASM_X86_TIME_H
+
+#include <linux/clocksource.h>
+#include <asm/mc146818rtc.h>
+
+extern void hpet_time_init(void);
+extern void time_init(void);
+
+extern struct clock_event_device *global_clock_event;
+
+#endif /* _ASM_X86_TIME_H */
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
new file mode 100644
index 000000000..7365dd4ac
--- /dev/null
+++ b/arch/x86/include/asm/timer.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_TIMER_H
+#define _ASM_X86_TIMER_H
+#include <linux/pm.h>
+#include <linux/percpu.h>
+#include <linux/interrupt.h>
+#include <linux/math64.h>
+
+#define TICK_SIZE (tick_nsec / 1000)
+
+unsigned long long native_sched_clock(void);
+extern void recalibrate_cpu_khz(void);
+
+extern int no_timer_check;
+
+extern bool using_native_sched_clock(void);
+
+/*
+ * We use the full linear equation: f(x) = a + b*x, in order to allow
+ * a continuous function in the face of dynamic freq changes.
+ *
+ * Continuity means that when our frequency changes our slope (b); we want to
+ * ensure that: f(t) == f'(t), which gives: a + b*t == a' + b'*t.
+ *
+ * Without an offset (a) the above would not be possible.
+ *
+ * See the comment near cycles_2_ns() for details on how we compute (b).
+ */
+struct cyc2ns_data {
+ u32 cyc2ns_mul;
+ u32 cyc2ns_shift;
+ u64 cyc2ns_offset;
+}; /* 16 bytes */
+
+extern void cyc2ns_read_begin(struct cyc2ns_data *);
+extern void cyc2ns_read_end(void);
+
+#endif /* _ASM_X86_TIMER_H */
diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h
new file mode 100644
index 000000000..956e41453
--- /dev/null
+++ b/arch/x86/include/asm/timex.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_TIMEX_H
+#define _ASM_X86_TIMEX_H
+
+#include <asm/processor.h>
+#include <asm/tsc.h>
+
+static inline unsigned long random_get_entropy(void)
+{
+ if (!IS_ENABLED(CONFIG_X86_TSC) &&
+ !cpu_feature_enabled(X86_FEATURE_TSC))
+ return random_get_entropy_fallback();
+ return rdtsc();
+}
+#define random_get_entropy random_get_entropy
+
+/* Assume we use the PIT time source for the clock tick */
+#define CLOCK_TICK_RATE PIT_TICK_RATE
+
+#define ARCH_HAS_READ_CURRENT_TIMER
+
+#endif /* _ASM_X86_TIMEX_H */
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
new file mode 100644
index 000000000..cb0a1f470
--- /dev/null
+++ b/arch/x86/include/asm/tlb.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_TLB_H
+#define _ASM_X86_TLB_H
+
+#define tlb_start_vma(tlb, vma) do { } while (0)
+#define tlb_end_vma(tlb, vma) do { } while (0)
+#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
+
+#define tlb_flush(tlb) \
+{ \
+ if (!tlb->fullmm && !tlb->need_flush_all) \
+ flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL); \
+ else \
+ flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL); \
+}
+
+#include <asm-generic/tlb.h>
+
+/*
+ * While x86 architecture in general requires an IPI to perform TLB
+ * shootdown, enablement code for several hypervisors overrides
+ * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing
+ * a hypercall. To keep software pagetable walkers safe in this case we
+ * switch to RCU based table free (HAVE_RCU_TABLE_FREE). See the comment
+ * below 'ifdef CONFIG_HAVE_RCU_TABLE_FREE' in include/asm-generic/tlb.h
+ * for more details.
+ */
+static inline void __tlb_remove_table(void *table)
+{
+ free_page_and_swap_cache(table);
+}
+
+#endif /* _ASM_X86_TLB_H */
diff --git a/arch/x86/include/asm/tlbbatch.h b/arch/x86/include/asm/tlbbatch.h
new file mode 100644
index 000000000..1ad56eb3e
--- /dev/null
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ARCH_X86_TLBBATCH_H
+#define _ARCH_X86_TLBBATCH_H
+
+#include <linux/cpumask.h>
+
+struct arch_tlbflush_unmap_batch {
+ /*
+ * Each bit set is a CPU that potentially has a TLB entry for one of
+ * the PFNs being flushed..
+ */
+ struct cpumask cpumask;
+};
+
+#endif /* _ARCH_X86_TLBBATCH_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
new file mode 100644
index 000000000..79ec7add5
--- /dev/null
+++ b/arch/x86/include/asm/tlbflush.h
@@ -0,0 +1,610 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_TLBFLUSH_H
+#define _ASM_X86_TLBFLUSH_H
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+
+#include <asm/processor.h>
+#include <asm/cpufeature.h>
+#include <asm/special_insns.h>
+#include <asm/smp.h>
+#include <asm/invpcid.h>
+#include <asm/pti.h>
+#include <asm/processor-flags.h>
+
+/*
+ * The x86 feature is called PCID (Process Context IDentifier). It is similar
+ * to what is traditionally called ASID on the RISC processors.
+ *
+ * We don't use the traditional ASID implementation, where each process/mm gets
+ * its own ASID and flush/restart when we run out of ASID space.
+ *
+ * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
+ * that came by on this CPU, allowing cheaper switch_mm between processes on
+ * this CPU.
+ *
+ * We end up with different spaces for different things. To avoid confusion we
+ * use different names for each of them:
+ *
+ * ASID - [0, TLB_NR_DYN_ASIDS-1]
+ * the canonical identifier for an mm
+ *
+ * kPCID - [1, TLB_NR_DYN_ASIDS]
+ * the value we write into the PCID part of CR3; corresponds to the
+ * ASID+1, because PCID 0 is special.
+ *
+ * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ * for KPTI each mm has two address spaces and thus needs two
+ * PCID values, but we can still do with a single ASID denomination
+ * for each mm. Corresponds to kPCID + 2048.
+ *
+ */
+
+/* There are 12 bits of space for ASIDS in CR3 */
+#define CR3_HW_ASID_BITS 12
+
+/*
+ * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
+ * user/kernel switches
+ */
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define PTI_CONSUMED_PCID_BITS 1
+#else
+# define PTI_CONSUMED_PCID_BITS 0
+#endif
+
+#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
+
+/*
+ * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
+ * for them being zero-based. Another -1 is because PCID 0 is reserved for
+ * use by non-PCID-aware users.
+ */
+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
+
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in two cache
+ * lines.
+ */
+#define TLB_NR_DYN_ASIDS 6
+
+/*
+ * Given @asid, compute kPCID
+ */
+static inline u16 kern_pcid(u16 asid)
+{
+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /*
+ * Make sure that the dynamic ASID space does not confict with the
+ * bit we are using to switch between user and kernel ASIDs.
+ */
+ BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
+
+ /*
+ * The ASID being passed in here should have respected the
+ * MAX_ASID_AVAILABLE and thus never have the switch bit set.
+ */
+ VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
+#endif
+ /*
+ * The dynamically-assigned ASIDs that get passed in are small
+ * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
+ * so do not bother to clear it.
+ *
+ * If PCID is on, ASID-aware code paths put the ASID+1 into the
+ * PCID bits. This serves two purposes. It prevents a nasty
+ * situation in which PCID-unaware code saves CR3, loads some other
+ * value (with PCID == 0), and then restores CR3, thus corrupting
+ * the TLB for ASID 0 if the saved ASID was nonzero. It also means
+ * that any bugs involving loading a PCID-enabled CR3 with
+ * CR4.PCIDE off will trigger deterministically.
+ */
+ return asid + 1;
+}
+
+/*
+ * Given @asid, compute uPCID
+ */
+static inline u16 user_pcid(u16 asid)
+{
+ u16 ret = kern_pcid(asid);
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
+#endif
+ return ret;
+}
+
+struct pgd_t;
+static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+{
+ if (static_cpu_has(X86_FEATURE_PCID)) {
+ return __sme_pa(pgd) | kern_pcid(asid);
+ } else {
+ VM_WARN_ON_ONCE(asid != 0);
+ return __sme_pa(pgd);
+ }
+}
+
+static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
+{
+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+ /*
+ * Use boot_cpu_has() instead of this_cpu_has() as this function
+ * might be called during early boot. This should work even after
+ * boot because all CPU's the have same capabilities:
+ */
+ VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
+ return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define __flush_tlb() __native_flush_tlb()
+#define __flush_tlb_global() __native_flush_tlb_global()
+#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
+#endif
+
+static inline bool tlb_defer_switch_to_init_mm(void)
+{
+ /*
+ * If we have PCID, then switching to init_mm is reasonably
+ * fast. If we don't have PCID, then switching to init_mm is
+ * quite slow, so we try to defer it in the hopes that we can
+ * avoid it entirely. The latter approach runs the risk of
+ * receiving otherwise unnecessary IPIs.
+ *
+ * This choice is just a heuristic. The tlb code can handle this
+ * function returning true or false regardless of whether we have
+ * PCID.
+ */
+ return !static_cpu_has(X86_FEATURE_PCID);
+}
+
+struct tlb_context {
+ u64 ctx_id;
+ u64 tlb_gen;
+};
+
+struct tlb_state {
+ /*
+ * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
+ * are on. This means that it may not match current->active_mm,
+ * which will contain the previous user mm when we're in lazy TLB
+ * mode even if we've already switched back to swapper_pg_dir.
+ *
+ * During switch_mm_irqs_off(), loaded_mm will be set to
+ * LOADED_MM_SWITCHING during the brief interrupts-off window
+ * when CR3 and loaded_mm would otherwise be inconsistent. This
+ * is for nmi_uaccess_okay()'s benefit.
+ */
+ struct mm_struct *loaded_mm;
+
+#define LOADED_MM_SWITCHING ((struct mm_struct *)1)
+
+ /* Last user mm for optimizing IBPB */
+ union {
+ struct mm_struct *last_user_mm;
+ unsigned long last_user_mm_ibpb;
+ };
+
+ u16 loaded_mm_asid;
+ u16 next_asid;
+
+ /*
+ * We can be in one of several states:
+ *
+ * - Actively using an mm. Our CPU's bit will be set in
+ * mm_cpumask(loaded_mm) and is_lazy == false;
+ *
+ * - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit
+ * will not be set in mm_cpumask(&init_mm) and is_lazy == false.
+ *
+ * - Lazily using a real mm. loaded_mm != &init_mm, our bit
+ * is set in mm_cpumask(loaded_mm), but is_lazy == true.
+ * We're heuristically guessing that the CR3 load we
+ * skipped more than makes up for the overhead added by
+ * lazy mode.
+ */
+ bool is_lazy;
+
+ /*
+ * If set we changed the page tables in such a way that we
+ * needed an invalidation of all contexts (aka. PCIDs / ASIDs).
+ * This tells us to go invalidate all the non-loaded ctxs[]
+ * on the next context switch.
+ *
+ * The current ctx was kept up-to-date as it ran and does not
+ * need to be invalidated.
+ */
+ bool invalidate_other;
+
+ /*
+ * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
+ * the corresponding user PCID needs a flush next time we
+ * switch to it; see SWITCH_TO_USER_CR3.
+ */
+ unsigned short user_pcid_flush_mask;
+
+ /*
+ * Access to this CR4 shadow and to H/W CR4 is protected by
+ * disabling interrupts when modifying either one.
+ */
+ unsigned long cr4;
+
+ /*
+ * This is a list of all contexts that might exist in the TLB.
+ * There is one per ASID that we use, and the ASID (what the
+ * CPU calls PCID) is the index into ctxts.
+ *
+ * For each context, ctx_id indicates which mm the TLB's user
+ * entries came from. As an invariant, the TLB will never
+ * contain entries that are out-of-date as when that mm reached
+ * the tlb_gen in the list.
+ *
+ * To be clear, this means that it's legal for the TLB code to
+ * flush the TLB without updating tlb_gen. This can happen
+ * (for now, at least) due to paravirt remote flushes.
+ *
+ * NB: context 0 is a bit special, since it's also used by
+ * various bits of init code. This is fine -- code that
+ * isn't aware of PCID will end up harmlessly flushing
+ * context 0.
+ */
+ struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
+};
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
+
+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or
+ * switching the loaded mm. It can also be dangerous if we
+ * interrupted some kernel code that was temporarily using a
+ * different mm.
+ */
+static inline bool nmi_uaccess_okay(void)
+{
+ struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+ struct mm_struct *current_mm = current->mm;
+
+ VM_WARN_ON_ONCE(!loaded_mm);
+
+ /*
+ * The condition we want to check is
+ * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
+ * if we're running in a VM with shadow paging, and nmi_uaccess_okay()
+ * is supposed to be reasonably fast.
+ *
+ * Instead, we check the almost equivalent but somewhat conservative
+ * condition below, and we rely on the fact that switch_mm_irqs_off()
+ * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
+ */
+ if (loaded_mm != current_mm)
+ return false;
+
+ VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
+
+ return true;
+}
+
+/* Initialize cr4 shadow for this CPU. */
+static inline void cr4_init_shadow(void)
+{
+ this_cpu_write(cpu_tlbstate.cr4, __read_cr4());
+}
+
+static inline void __cr4_set(unsigned long cr4)
+{
+ lockdep_assert_irqs_disabled();
+ this_cpu_write(cpu_tlbstate.cr4, cr4);
+ __write_cr4(cr4);
+}
+
+/* Set in this cpu's CR4. */
+static inline void cr4_set_bits(unsigned long mask)
+{
+ unsigned long cr4, flags;
+
+ local_irq_save(flags);
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+ if ((cr4 | mask) != cr4)
+ __cr4_set(cr4 | mask);
+ local_irq_restore(flags);
+}
+
+/* Clear in this cpu's CR4. */
+static inline void cr4_clear_bits(unsigned long mask)
+{
+ unsigned long cr4, flags;
+
+ local_irq_save(flags);
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+ if ((cr4 & ~mask) != cr4)
+ __cr4_set(cr4 & ~mask);
+ local_irq_restore(flags);
+}
+
+static inline void cr4_toggle_bits_irqsoff(unsigned long mask)
+{
+ unsigned long cr4;
+
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+ __cr4_set(cr4 ^ mask);
+}
+
+/* Read the CR4 shadow. */
+static inline unsigned long cr4_read_shadow(void)
+{
+ return this_cpu_read(cpu_tlbstate.cr4);
+}
+
+/*
+ * Mark all other ASIDs as invalid, preserves the current.
+ */
+static inline void invalidate_other_asid(void)
+{
+ this_cpu_write(cpu_tlbstate.invalidate_other, true);
+}
+
+/*
+ * Save some of cr4 feature set we're using (e.g. Pentium 4MB
+ * enable and PPro Global page enable), so that any CPU's that boot
+ * up after us can get the correct flags. This should only be used
+ * during boot on the boot cpu.
+ */
+extern unsigned long mmu_cr4_features;
+extern u32 *trampoline_cr4_features;
+
+static inline void cr4_set_bits_and_update_boot(unsigned long mask)
+{
+ mmu_cr4_features |= mask;
+ if (trampoline_cr4_features)
+ *trampoline_cr4_features = mmu_cr4_features;
+ cr4_set_bits(mask);
+}
+
+extern void initialize_tlbstate_and_flush(void);
+
+/*
+ * Given an ASID, flush the corresponding user ASID. We can delay this
+ * until the next time we switch to it.
+ *
+ * See SWITCH_TO_USER_CR3.
+ */
+static inline void invalidate_user_asid(u16 asid)
+{
+ /* There is no user ASID if address space separation is off */
+ if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+ return;
+
+ /*
+ * We only have a single ASID if PCID is off and the CR3
+ * write will have flushed it.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_PCID))
+ return;
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ __set_bit(kern_pcid(asid),
+ (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
+}
+
+/*
+ * flush the entire current user mapping
+ */
+static inline void __native_flush_tlb(void)
+{
+ /*
+ * Preemption or interrupts must be disabled to protect the access
+ * to the per CPU variable and to prevent being preempted between
+ * read_cr3() and write_cr3().
+ */
+ WARN_ON_ONCE(preemptible());
+
+ invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+
+ /* If current->mm == NULL then the read_cr3() "borrows" an mm */
+ native_write_cr3(__native_read_cr3());
+}
+
+/*
+ * flush everything
+ */
+static inline void __native_flush_tlb_global(void)
+{
+ unsigned long cr4, flags;
+
+ if (static_cpu_has(X86_FEATURE_INVPCID)) {
+ /*
+ * Using INVPCID is considerably faster than a pair of writes
+ * to CR4 sandwiched inside an IRQ flag save/restore.
+ *
+ * Note, this works with CR4.PCIDE=0 or 1.
+ */
+ invpcid_flush_all();
+ return;
+ }
+
+ /*
+ * Read-modify-write to CR4 - protect it from preemption and
+ * from interrupts. (Use the raw variant because this code can
+ * be called from deep inside debugging code.)
+ */
+ raw_local_irq_save(flags);
+
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+ /* toggle PGE */
+ native_write_cr4(cr4 ^ X86_CR4_PGE);
+ /* write old PGE again and flush TLBs */
+ native_write_cr4(cr4);
+
+ raw_local_irq_restore(flags);
+}
+
+/*
+ * flush one page in the user mapping
+ */
+static inline void __native_flush_tlb_one_user(unsigned long addr)
+{
+ u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
+ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ /*
+ * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
+ * Just use invalidate_user_asid() in case we are called early.
+ */
+ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
+ invalidate_user_asid(loaded_mm_asid);
+ else
+ invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
+}
+
+/*
+ * flush everything
+ */
+static inline void __flush_tlb_all(void)
+{
+ /*
+ * This is to catch users with enabled preemption and the PGE feature
+ * and don't trigger the warning in __native_flush_tlb().
+ */
+ VM_WARN_ON_ONCE(preemptible());
+
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ __flush_tlb_global();
+ } else {
+ /*
+ * !PGE -> !PCID (setup_pcid()), thus every flush is total.
+ */
+ __flush_tlb();
+ }
+}
+
+/*
+ * flush one page in the kernel mapping
+ */
+static inline void __flush_tlb_one_kernel(unsigned long addr)
+{
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
+
+ /*
+ * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
+ * paravirt equivalent. Even with PCID, this is sufficient: we only
+ * use PCID if we also use global PTEs for the kernel mapping, and
+ * INVLPG flushes global translations across all address spaces.
+ *
+ * If PTI is on, then the kernel is mapped with non-global PTEs, and
+ * __flush_tlb_one_user() will flush the given address for the current
+ * kernel address space and for its usermode counterpart, but it does
+ * not flush it for other address spaces.
+ */
+ __flush_tlb_one_user(addr);
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ /*
+ * See above. We need to propagate the flush to all other address
+ * spaces. In principle, we only need to propagate it to kernelmode
+ * address spaces, but the extra bookkeeping we would need is not
+ * worth it.
+ */
+ invalidate_other_asid();
+}
+
+#define TLB_FLUSH_ALL -1UL
+
+/*
+ * TLB flushing:
+ *
+ * - flush_tlb_all() flushes all processes TLBs
+ * - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ * - flush_tlb_page(vma, vmaddr) flushes one page
+ * - flush_tlb_range(vma, start, end) flushes a range of pages
+ * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
+ * - flush_tlb_others(cpumask, info) flushes TLBs on other cpus
+ *
+ * ..but the i386 has somewhat limited tlb flushing capabilities,
+ * and page-granular flushes are available only on i486 and up.
+ */
+struct flush_tlb_info {
+ /*
+ * We support several kinds of flushes.
+ *
+ * - Fully flush a single mm. .mm will be set, .end will be
+ * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to
+ * which the IPI sender is trying to catch us up.
+ *
+ * - Partially flush a single mm. .mm will be set, .start and
+ * .end will indicate the range, and .new_tlb_gen will be set
+ * such that the changes between generation .new_tlb_gen-1 and
+ * .new_tlb_gen are entirely contained in the indicated range.
+ *
+ * - Fully flush all mms whose tlb_gens have been updated. .mm
+ * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen
+ * will be zero.
+ */
+ struct mm_struct *mm;
+ unsigned long start;
+ unsigned long end;
+ u64 new_tlb_gen;
+};
+
+#define local_flush_tlb() __flush_tlb()
+
+#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
+
+#define flush_tlb_range(vma, start, end) \
+ flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
+
+extern void flush_tlb_all(void);
+extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, unsigned long vmflag);
+extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
+
+static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
+{
+ flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
+}
+
+void native_flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info);
+
+static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+{
+ /*
+ * Bump the generation count. This also serves as a full barrier
+ * that synchronizes with switch_mm(): callers are required to order
+ * their read of mm_cpumask after their writes to the paging
+ * structures.
+ */
+ return atomic64_inc_return(&mm->context.tlb_gen);
+}
+
+static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm)
+{
+ inc_mm_tlb_gen(mm);
+ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+}
+
+extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
+
+#ifndef CONFIG_PARAVIRT
+#define flush_tlb_others(mask, info) \
+ native_flush_tlb_others(mask, info)
+
+#define paravirt_tlb_remove_table(tlb, page) \
+ tlb_remove_page(tlb, (void *)(page))
+#endif
+
+#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
new file mode 100644
index 000000000..453cf38a1
--- /dev/null
+++ b/arch/x86/include/asm/topology.h
@@ -0,0 +1,179 @@
+/*
+ * Written by: Matthew Dobson, IBM Corporation
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ */
+#ifndef _ASM_X86_TOPOLOGY_H
+#define _ASM_X86_TOPOLOGY_H
+
+/*
+ * to preserve the visibility of NUMA_NO_NODE definition,
+ * moved to there from here. May be used independent of
+ * CONFIG_NUMA.
+ */
+#include <linux/numa.h>
+
+#ifdef CONFIG_NUMA
+#include <linux/cpumask.h>
+
+#include <asm/mpspec.h>
+#include <asm/percpu.h>
+
+/* Mappings between logical cpu number and node number */
+DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+/*
+ * override generic percpu implementation of cpu_to_node
+ */
+extern int __cpu_to_node(int cpu);
+#define cpu_to_node __cpu_to_node
+
+extern int early_cpu_to_node(int cpu);
+
+#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+/* Same function but used if called before per_cpu areas are setup */
+static inline int early_cpu_to_node(int cpu)
+{
+ return early_per_cpu(x86_cpu_to_node_map, cpu);
+}
+
+#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+/* Mappings between node number and cpus on that node. */
+extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+extern const struct cpumask *cpumask_of_node(int node);
+#else
+/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
+static inline const struct cpumask *cpumask_of_node(int node)
+{
+ return node_to_cpumask_map[node];
+}
+#endif
+
+extern void setup_node_to_cpumask_map(void);
+
+#define pcibus_to_node(bus) __pcibus_to_node(bus)
+
+extern int __node_distance(int, int);
+#define node_distance(a, b) __node_distance(a, b)
+
+#else /* !CONFIG_NUMA */
+
+static inline int numa_node_id(void)
+{
+ return 0;
+}
+/*
+ * indicate override:
+ */
+#define numa_node_id numa_node_id
+
+static inline int early_cpu_to_node(int cpu)
+{
+ return 0;
+}
+
+static inline void setup_node_to_cpumask_map(void) { }
+
+#endif
+
+#include <asm-generic/topology.h>
+
+extern const struct cpumask *cpu_coregroup_mask(int cpu);
+
+#define topology_logical_package_id(cpu) (cpu_data(cpu).logical_proc_id)
+#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
+#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
+
+#ifdef CONFIG_SMP
+#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
+#define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
+
+extern unsigned int __max_logical_packages;
+#define topology_max_packages() (__max_logical_packages)
+
+extern int __max_smt_threads;
+
+static inline int topology_max_smt_threads(void)
+{
+ return __max_smt_threads;
+}
+
+int topology_update_package_map(unsigned int apicid, unsigned int cpu);
+int topology_phys_to_logical_pkg(unsigned int pkg);
+bool topology_is_primary_thread(unsigned int cpu);
+bool topology_smt_supported(void);
+#else
+#define topology_max_packages() (1)
+static inline int
+topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; }
+static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
+static inline int topology_max_smt_threads(void) { return 1; }
+static inline bool topology_is_primary_thread(unsigned int cpu) { return true; }
+static inline bool topology_smt_supported(void) { return false; }
+#endif
+
+static inline void arch_fix_phys_package_id(int num, u32 slot)
+{
+}
+
+struct pci_bus;
+int x86_pci_root_bus_node(int bus);
+void x86_pci_root_bus_resources(int bus, struct list_head *resources);
+
+extern bool x86_topology_update;
+
+#ifdef CONFIG_SCHED_MC_PRIO
+#include <asm/percpu.h>
+
+DECLARE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
+extern unsigned int __read_mostly sysctl_sched_itmt_enabled;
+
+/* Interface to set priority of a cpu */
+void sched_set_itmt_core_prio(int prio, int core_cpu);
+
+/* Interface to notify scheduler that system supports ITMT */
+int sched_set_itmt_support(void);
+
+/* Interface to notify scheduler that system revokes ITMT support */
+void sched_clear_itmt_support(void);
+
+#else /* CONFIG_SCHED_MC_PRIO */
+
+#define sysctl_sched_itmt_enabled 0
+static inline void sched_set_itmt_core_prio(int prio, int core_cpu)
+{
+}
+static inline int sched_set_itmt_support(void)
+{
+ return 0;
+}
+static inline void sched_clear_itmt_support(void)
+{
+}
+#endif /* CONFIG_SCHED_MC_PRIO */
+
+#endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h
new file mode 100644
index 000000000..57c8da027
--- /dev/null
+++ b/arch/x86/include/asm/trace/common.h
@@ -0,0 +1,16 @@
+#ifndef _ASM_TRACE_COMMON_H
+#define _ASM_TRACE_COMMON_H
+
+#ifdef CONFIG_TRACING
+DECLARE_STATIC_KEY_FALSE(trace_pagefault_key);
+#define trace_pagefault_enabled() \
+ static_branch_unlikely(&trace_pagefault_key)
+DECLARE_STATIC_KEY_FALSE(trace_resched_ipi_key);
+#define trace_resched_ipi_enabled() \
+ static_branch_unlikely(&trace_resched_ipi_key)
+#else
+static inline bool trace_pagefault_enabled(void) { return false; }
+static inline bool trace_resched_ipi_enabled(void) { return false; }
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h
new file mode 100644
index 000000000..69615e387
--- /dev/null
+++ b/arch/x86/include/asm/trace/exceptions.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM exceptions
+
+#if !defined(_TRACE_PAGE_FAULT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PAGE_FAULT_H
+
+#include <linux/tracepoint.h>
+#include <asm/trace/common.h>
+
+extern int trace_pagefault_reg(void);
+extern void trace_pagefault_unreg(void);
+
+DECLARE_EVENT_CLASS(x86_exceptions,
+
+ TP_PROTO(unsigned long address, struct pt_regs *regs,
+ unsigned long error_code),
+
+ TP_ARGS(address, regs, error_code),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, address )
+ __field( unsigned long, ip )
+ __field( unsigned long, error_code )
+ ),
+
+ TP_fast_assign(
+ __entry->address = address;
+ __entry->ip = regs->ip;
+ __entry->error_code = error_code;
+ ),
+
+ TP_printk("address=%pf ip=%pf error_code=0x%lx",
+ (void *)__entry->address, (void *)__entry->ip,
+ __entry->error_code) );
+
+#define DEFINE_PAGE_FAULT_EVENT(name) \
+DEFINE_EVENT_FN(x86_exceptions, name, \
+ TP_PROTO(unsigned long address, struct pt_regs *regs, \
+ unsigned long error_code), \
+ TP_ARGS(address, regs, error_code), \
+ trace_pagefault_reg, trace_pagefault_unreg);
+
+DEFINE_PAGE_FAULT_EVENT(page_fault_user);
+DEFINE_PAGE_FAULT_EVENT(page_fault_kernel);
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE exceptions
+#endif /* _TRACE_PAGE_FAULT_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
new file mode 100644
index 000000000..069c04be1
--- /dev/null
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM x86_fpu
+
+#if !defined(_TRACE_FPU_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_FPU_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(x86_fpu,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu),
+
+ TP_STRUCT__entry(
+ __field(struct fpu *, fpu)
+ __field(bool, initialized)
+ __field(u64, xfeatures)
+ __field(u64, xcomp_bv)
+ ),
+
+ TP_fast_assign(
+ __entry->fpu = fpu;
+ __entry->initialized = fpu->initialized;
+ if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ __entry->xfeatures = fpu->state.xsave.header.xfeatures;
+ __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv;
+ }
+ ),
+ TP_printk("x86/fpu: %p initialized: %d xfeatures: %llx xcomp_bv: %llx",
+ __entry->fpu,
+ __entry->initialized,
+ __entry->xfeatures,
+ __entry->xcomp_bv
+ )
+);
+
+DEFINE_EVENT(x86_fpu, x86_fpu_before_save,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu)
+);
+
+DEFINE_EVENT(x86_fpu, x86_fpu_after_save,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu)
+);
+
+DEFINE_EVENT(x86_fpu, x86_fpu_before_restore,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu)
+);
+
+DEFINE_EVENT(x86_fpu, x86_fpu_after_restore,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu)
+);
+
+DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu)
+);
+
+DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu)
+);
+
+DEFINE_EVENT(x86_fpu, x86_fpu_activate_state,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu)
+);
+
+DEFINE_EVENT(x86_fpu, x86_fpu_init_state,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu)
+);
+
+DEFINE_EVENT(x86_fpu, x86_fpu_dropped,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu)
+);
+
+DEFINE_EVENT(x86_fpu, x86_fpu_copy_src,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu)
+);
+
+DEFINE_EVENT(x86_fpu, x86_fpu_copy_dst,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu)
+);
+
+DEFINE_EVENT(x86_fpu, x86_fpu_xstate_check_failed,
+ TP_PROTO(struct fpu *fpu),
+ TP_ARGS(fpu)
+);
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH asm/trace/
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE fpu
+#endif /* _TRACE_FPU_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/include/asm/trace/hyperv.h b/arch/x86/include/asm/trace/hyperv.h
new file mode 100644
index 000000000..2e6245a02
--- /dev/null
+++ b/arch/x86/include/asm/trace/hyperv.h
@@ -0,0 +1,69 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hyperv
+
+#if !defined(_TRACE_HYPERV_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_HYPERV_H
+
+#include <linux/tracepoint.h>
+
+#if IS_ENABLED(CONFIG_HYPERV)
+
+TRACE_EVENT(hyperv_mmu_flush_tlb_others,
+ TP_PROTO(const struct cpumask *cpus,
+ const struct flush_tlb_info *info),
+ TP_ARGS(cpus, info),
+ TP_STRUCT__entry(
+ __field(unsigned int, ncpus)
+ __field(struct mm_struct *, mm)
+ __field(unsigned long, addr)
+ __field(unsigned long, end)
+ ),
+ TP_fast_assign(__entry->ncpus = cpumask_weight(cpus);
+ __entry->mm = info->mm;
+ __entry->addr = info->start;
+ __entry->end = info->end;
+ ),
+ TP_printk("ncpus %d mm %p addr %lx, end %lx",
+ __entry->ncpus, __entry->mm,
+ __entry->addr, __entry->end)
+ );
+
+TRACE_EVENT(hyperv_nested_flush_guest_mapping,
+ TP_PROTO(u64 as, int ret),
+ TP_ARGS(as, ret),
+
+ TP_STRUCT__entry(
+ __field(u64, as)
+ __field(int, ret)
+ ),
+ TP_fast_assign(__entry->as = as;
+ __entry->ret = ret;
+ ),
+ TP_printk("address space %llx ret %d", __entry->as, __entry->ret)
+ );
+
+TRACE_EVENT(hyperv_send_ipi_mask,
+ TP_PROTO(const struct cpumask *cpus,
+ int vector),
+ TP_ARGS(cpus, vector),
+ TP_STRUCT__entry(
+ __field(unsigned int, ncpus)
+ __field(int, vector)
+ ),
+ TP_fast_assign(__entry->ncpus = cpumask_weight(cpus);
+ __entry->vector = vector;
+ ),
+ TP_printk("ncpus %d vector %x",
+ __entry->ncpus, __entry->vector)
+ );
+
+#endif /* CONFIG_HYPERV */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH asm/trace/
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE hyperv
+#endif /* _TRACE_HYPERV_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h
new file mode 100644
index 000000000..0af81b590
--- /dev/null
+++ b/arch/x86/include/asm/trace/irq_vectors.h
@@ -0,0 +1,397 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM irq_vectors
+
+#if !defined(_TRACE_IRQ_VECTORS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IRQ_VECTORS_H
+
+#include <linux/tracepoint.h>
+#include <asm/trace/common.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+extern int trace_resched_ipi_reg(void);
+extern void trace_resched_ipi_unreg(void);
+
+DECLARE_EVENT_CLASS(x86_irq_vector,
+
+ TP_PROTO(int vector),
+
+ TP_ARGS(vector),
+
+ TP_STRUCT__entry(
+ __field( int, vector )
+ ),
+
+ TP_fast_assign(
+ __entry->vector = vector;
+ ),
+
+ TP_printk("vector=%d", __entry->vector) );
+
+#define DEFINE_IRQ_VECTOR_EVENT(name) \
+DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \
+ TP_PROTO(int vector), \
+ TP_ARGS(vector), NULL, NULL); \
+DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \
+ TP_PROTO(int vector), \
+ TP_ARGS(vector), NULL, NULL);
+
+#define DEFINE_RESCHED_IPI_EVENT(name) \
+DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \
+ TP_PROTO(int vector), \
+ TP_ARGS(vector), \
+ trace_resched_ipi_reg, \
+ trace_resched_ipi_unreg); \
+DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \
+ TP_PROTO(int vector), \
+ TP_ARGS(vector), \
+ trace_resched_ipi_reg, \
+ trace_resched_ipi_unreg);
+
+/*
+ * local_timer - called when entering/exiting a local timer interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(local_timer);
+
+/*
+ * spurious_apic - called when entering/exiting a spurious apic vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(spurious_apic);
+
+/*
+ * error_apic - called when entering/exiting an error apic vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(error_apic);
+
+/*
+ * x86_platform_ipi - called when entering/exiting a x86 platform ipi interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi);
+
+#ifdef CONFIG_IRQ_WORK
+/*
+ * irq_work - called when entering/exiting a irq work interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(irq_work);
+
+/*
+ * We must dis-allow sampling irq_work_exit() because perf event sampling
+ * itself can cause irq_work, which would lead to an infinite loop;
+ *
+ * 1) irq_work_exit happens
+ * 2) generates perf sample
+ * 3) generates irq_work
+ * 4) goto 1
+ */
+TRACE_EVENT_PERF_PERM(irq_work_exit, is_sampling_event(p_event) ? -EPERM : 0);
+#endif
+
+/*
+ * The ifdef is required because that tracepoint macro hell emits tracepoint
+ * code in files which include this header even if the tracepoint is not
+ * enabled. Brilliant stuff that.
+ */
+#ifdef CONFIG_SMP
+/*
+ * reschedule - called when entering/exiting a reschedule vector handler
+ */
+DEFINE_RESCHED_IPI_EVENT(reschedule);
+
+/*
+ * call_function - called when entering/exiting a call function interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(call_function);
+
+/*
+ * call_function_single - called when entering/exiting a call function
+ * single interrupt vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(call_function_single);
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+/*
+ * threshold_apic - called when entering/exiting a threshold apic interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(threshold_apic);
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+/*
+ * deferred_error_apic - called when entering/exiting a deferred apic interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic);
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+/*
+ * thermal_apic - called when entering/exiting a thermal apic interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(thermal_apic);
+#endif
+
+TRACE_EVENT(vector_config,
+
+ TP_PROTO(unsigned int irq, unsigned int vector,
+ unsigned int cpu, unsigned int apicdest),
+
+ TP_ARGS(irq, vector, cpu, apicdest),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ __field( unsigned int, vector )
+ __field( unsigned int, cpu )
+ __field( unsigned int, apicdest )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ __entry->vector = vector;
+ __entry->cpu = cpu;
+ __entry->apicdest = apicdest;
+ ),
+
+ TP_printk("irq=%u vector=%u cpu=%u apicdest=0x%08x",
+ __entry->irq, __entry->vector, __entry->cpu,
+ __entry->apicdest)
+);
+
+DECLARE_EVENT_CLASS(vector_mod,
+
+ TP_PROTO(unsigned int irq, unsigned int vector,
+ unsigned int cpu, unsigned int prev_vector,
+ unsigned int prev_cpu),
+
+ TP_ARGS(irq, vector, cpu, prev_vector, prev_cpu),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ __field( unsigned int, vector )
+ __field( unsigned int, cpu )
+ __field( unsigned int, prev_vector )
+ __field( unsigned int, prev_cpu )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ __entry->vector = vector;
+ __entry->cpu = cpu;
+ __entry->prev_vector = prev_vector;
+ __entry->prev_cpu = prev_cpu;
+
+ ),
+
+ TP_printk("irq=%u vector=%u cpu=%u prev_vector=%u prev_cpu=%u",
+ __entry->irq, __entry->vector, __entry->cpu,
+ __entry->prev_vector, __entry->prev_cpu)
+);
+
+#define DEFINE_IRQ_VECTOR_MOD_EVENT(name) \
+DEFINE_EVENT_FN(vector_mod, name, \
+ TP_PROTO(unsigned int irq, unsigned int vector, \
+ unsigned int cpu, unsigned int prev_vector, \
+ unsigned int prev_cpu), \
+ TP_ARGS(irq, vector, cpu, prev_vector, prev_cpu), NULL, NULL); \
+
+DEFINE_IRQ_VECTOR_MOD_EVENT(vector_update);
+DEFINE_IRQ_VECTOR_MOD_EVENT(vector_clear);
+
+DECLARE_EVENT_CLASS(vector_reserve,
+
+ TP_PROTO(unsigned int irq, int ret),
+
+ TP_ARGS(irq, ret),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ __field( int, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("irq=%u ret=%d", __entry->irq, __entry->ret)
+);
+
+#define DEFINE_IRQ_VECTOR_RESERVE_EVENT(name) \
+DEFINE_EVENT_FN(vector_reserve, name, \
+ TP_PROTO(unsigned int irq, int ret), \
+ TP_ARGS(irq, ret), NULL, NULL); \
+
+DEFINE_IRQ_VECTOR_RESERVE_EVENT(vector_reserve_managed);
+DEFINE_IRQ_VECTOR_RESERVE_EVENT(vector_reserve);
+
+TRACE_EVENT(vector_alloc,
+
+ TP_PROTO(unsigned int irq, unsigned int vector, bool reserved,
+ int ret),
+
+ TP_ARGS(irq, vector, reserved, ret),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ __field( unsigned int, vector )
+ __field( bool, reserved )
+ __field( int, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ __entry->vector = ret < 0 ? 0 : vector;
+ __entry->reserved = reserved;
+ __entry->ret = ret > 0 ? 0 : ret;
+ ),
+
+ TP_printk("irq=%u vector=%u reserved=%d ret=%d",
+ __entry->irq, __entry->vector,
+ __entry->reserved, __entry->ret)
+);
+
+TRACE_EVENT(vector_alloc_managed,
+
+ TP_PROTO(unsigned int irq, unsigned int vector,
+ int ret),
+
+ TP_ARGS(irq, vector, ret),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ __field( unsigned int, vector )
+ __field( int, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ __entry->vector = ret < 0 ? 0 : vector;
+ __entry->ret = ret > 0 ? 0 : ret;
+ ),
+
+ TP_printk("irq=%u vector=%u ret=%d",
+ __entry->irq, __entry->vector, __entry->ret)
+);
+
+DECLARE_EVENT_CLASS(vector_activate,
+
+ TP_PROTO(unsigned int irq, bool is_managed, bool can_reserve,
+ bool reserve),
+
+ TP_ARGS(irq, is_managed, can_reserve, reserve),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ __field( bool, is_managed )
+ __field( bool, can_reserve )
+ __field( bool, reserve )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ __entry->is_managed = is_managed;
+ __entry->can_reserve = can_reserve;
+ __entry->reserve = reserve;
+ ),
+
+ TP_printk("irq=%u is_managed=%d can_reserve=%d reserve=%d",
+ __entry->irq, __entry->is_managed, __entry->can_reserve,
+ __entry->reserve)
+);
+
+#define DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(name) \
+DEFINE_EVENT_FN(vector_activate, name, \
+ TP_PROTO(unsigned int irq, bool is_managed, \
+ bool can_reserve, bool reserve), \
+ TP_ARGS(irq, is_managed, can_reserve, reserve), NULL, NULL); \
+
+DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_activate);
+DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_deactivate);
+
+TRACE_EVENT(vector_teardown,
+
+ TP_PROTO(unsigned int irq, bool is_managed, bool has_reserved),
+
+ TP_ARGS(irq, is_managed, has_reserved),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ __field( bool, is_managed )
+ __field( bool, has_reserved )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ __entry->is_managed = is_managed;
+ __entry->has_reserved = has_reserved;
+ ),
+
+ TP_printk("irq=%u is_managed=%d has_reserved=%d",
+ __entry->irq, __entry->is_managed, __entry->has_reserved)
+);
+
+TRACE_EVENT(vector_setup,
+
+ TP_PROTO(unsigned int irq, bool is_legacy, int ret),
+
+ TP_ARGS(irq, is_legacy, ret),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ __field( bool, is_legacy )
+ __field( int, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ __entry->is_legacy = is_legacy;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("irq=%u is_legacy=%d ret=%d",
+ __entry->irq, __entry->is_legacy, __entry->ret)
+);
+
+TRACE_EVENT(vector_free_moved,
+
+ TP_PROTO(unsigned int irq, unsigned int cpu, unsigned int vector,
+ bool is_managed),
+
+ TP_ARGS(irq, cpu, vector, is_managed),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ __field( unsigned int, cpu )
+ __field( unsigned int, vector )
+ __field( bool, is_managed )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ __entry->cpu = cpu;
+ __entry->vector = vector;
+ __entry->is_managed = is_managed;
+ ),
+
+ TP_printk("irq=%u cpu=%u vector=%u is_managed=%d",
+ __entry->irq, __entry->cpu, __entry->vector,
+ __entry->is_managed)
+);
+
+
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE irq_vectors
+#endif /* _TRACE_IRQ_VECTORS_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/include/asm/trace/mpx.h b/arch/x86/include/asm/trace/mpx.h
new file mode 100644
index 000000000..7bd92db09
--- /dev/null
+++ b/arch/x86/include/asm/trace/mpx.h
@@ -0,0 +1,134 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mpx
+
+#if !defined(_TRACE_MPX_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MPX_H
+
+#include <linux/tracepoint.h>
+
+#ifdef CONFIG_X86_INTEL_MPX
+
+TRACE_EVENT(mpx_bounds_register_exception,
+
+ TP_PROTO(void *addr_referenced,
+ const struct mpx_bndreg *bndreg),
+ TP_ARGS(addr_referenced, bndreg),
+
+ TP_STRUCT__entry(
+ __field(void *, addr_referenced)
+ __field(u64, lower_bound)
+ __field(u64, upper_bound)
+ ),
+
+ TP_fast_assign(
+ __entry->addr_referenced = addr_referenced;
+ __entry->lower_bound = bndreg->lower_bound;
+ __entry->upper_bound = bndreg->upper_bound;
+ ),
+ /*
+ * Note that we are printing out the '~' of the upper
+ * bounds register here. It is actually stored in its
+ * one's complement form so that its 'init' state
+ * corresponds to all 0's. But, that looks like
+ * gibberish when printed out, so print out the 1's
+ * complement instead of the actual value here. Note
+ * though that you still need to specify filters for the
+ * actual value, not the displayed one.
+ */
+ TP_printk("address referenced: 0x%p bounds: lower: 0x%llx ~upper: 0x%llx",
+ __entry->addr_referenced,
+ __entry->lower_bound,
+ ~__entry->upper_bound
+ )
+);
+
+TRACE_EVENT(bounds_exception_mpx,
+
+ TP_PROTO(const struct mpx_bndcsr *bndcsr),
+ TP_ARGS(bndcsr),
+
+ TP_STRUCT__entry(
+ __field(u64, bndcfgu)
+ __field(u64, bndstatus)
+ ),
+
+ TP_fast_assign(
+ /* need to get rid of the 'const' on bndcsr */
+ __entry->bndcfgu = (u64)bndcsr->bndcfgu;
+ __entry->bndstatus = (u64)bndcsr->bndstatus;
+ ),
+
+ TP_printk("bndcfgu:0x%llx bndstatus:0x%llx",
+ __entry->bndcfgu,
+ __entry->bndstatus)
+);
+
+DECLARE_EVENT_CLASS(mpx_range_trace,
+
+ TP_PROTO(unsigned long start,
+ unsigned long end),
+ TP_ARGS(start, end),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, start)
+ __field(unsigned long, end)
+ ),
+
+ TP_fast_assign(
+ __entry->start = start;
+ __entry->end = end;
+ ),
+
+ TP_printk("[0x%p:0x%p]",
+ (void *)__entry->start,
+ (void *)__entry->end
+ )
+);
+
+DEFINE_EVENT(mpx_range_trace, mpx_unmap_zap,
+ TP_PROTO(unsigned long start, unsigned long end),
+ TP_ARGS(start, end)
+);
+
+DEFINE_EVENT(mpx_range_trace, mpx_unmap_search,
+ TP_PROTO(unsigned long start, unsigned long end),
+ TP_ARGS(start, end)
+);
+
+TRACE_EVENT(mpx_new_bounds_table,
+
+ TP_PROTO(unsigned long table_vaddr),
+ TP_ARGS(table_vaddr),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, table_vaddr)
+ ),
+
+ TP_fast_assign(
+ __entry->table_vaddr = table_vaddr;
+ ),
+
+ TP_printk("table vaddr:%p", (void *)__entry->table_vaddr)
+);
+
+#else
+
+/*
+ * This gets used outside of MPX-specific code, so we need a stub.
+ */
+static inline
+void trace_bounds_exception_mpx(const struct mpx_bndcsr *bndcsr)
+{
+}
+
+#endif /* CONFIG_X86_INTEL_MPX */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH asm/trace/
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE mpx
+#endif /* _TRACE_MPX_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/include/asm/trace_clock.h b/arch/x86/include/asm/trace_clock.h
new file mode 100644
index 000000000..7061a5650
--- /dev/null
+++ b/arch/x86/include/asm/trace_clock.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_TRACE_CLOCK_H
+#define _ASM_X86_TRACE_CLOCK_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_X86_TSC
+
+extern u64 notrace trace_clock_x86_tsc(void);
+
+# define ARCH_TRACE_CLOCKS \
+ { trace_clock_x86_tsc, "x86-tsc", .in_ns = 0 },
+
+#else /* !CONFIG_X86_TSC */
+
+#define ARCH_TRACE_CLOCKS
+
+#endif
+
+#endif /* _ASM_X86_TRACE_CLOCK_H */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
new file mode 100644
index 000000000..b771bb3d1
--- /dev/null
+++ b/arch/x86/include/asm/traps.h
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_TRAPS_H
+#define _ASM_X86_TRAPS_H
+
+#include <linux/context_tracking_state.h>
+#include <linux/kprobes.h>
+
+#include <asm/debugreg.h>
+#include <asm/siginfo.h> /* TRAP_TRACE, ... */
+
+#define dotraplinkage __visible
+
+asmlinkage void divide_error(void);
+asmlinkage void debug(void);
+asmlinkage void nmi(void);
+asmlinkage void int3(void);
+asmlinkage void overflow(void);
+asmlinkage void bounds(void);
+asmlinkage void invalid_op(void);
+asmlinkage void device_not_available(void);
+#ifdef CONFIG_X86_64
+asmlinkage void double_fault(void);
+#endif
+asmlinkage void coprocessor_segment_overrun(void);
+asmlinkage void invalid_TSS(void);
+asmlinkage void segment_not_present(void);
+asmlinkage void stack_segment(void);
+asmlinkage void general_protection(void);
+asmlinkage void page_fault(void);
+asmlinkage void async_page_fault(void);
+asmlinkage void spurious_interrupt_bug(void);
+asmlinkage void coprocessor_error(void);
+asmlinkage void alignment_check(void);
+#ifdef CONFIG_X86_MCE
+asmlinkage void machine_check(void);
+#endif /* CONFIG_X86_MCE */
+asmlinkage void simd_coprocessor_error(void);
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
+asmlinkage void xen_divide_error(void);
+asmlinkage void xen_xennmi(void);
+asmlinkage void xen_xendebug(void);
+asmlinkage void xen_int3(void);
+asmlinkage void xen_overflow(void);
+asmlinkage void xen_bounds(void);
+asmlinkage void xen_invalid_op(void);
+asmlinkage void xen_device_not_available(void);
+asmlinkage void xen_double_fault(void);
+asmlinkage void xen_coprocessor_segment_overrun(void);
+asmlinkage void xen_invalid_TSS(void);
+asmlinkage void xen_segment_not_present(void);
+asmlinkage void xen_stack_segment(void);
+asmlinkage void xen_general_protection(void);
+asmlinkage void xen_page_fault(void);
+asmlinkage void xen_spurious_interrupt_bug(void);
+asmlinkage void xen_coprocessor_error(void);
+asmlinkage void xen_alignment_check(void);
+#ifdef CONFIG_X86_MCE
+asmlinkage void xen_machine_check(void);
+#endif /* CONFIG_X86_MCE */
+asmlinkage void xen_simd_coprocessor_error(void);
+#endif
+
+dotraplinkage void do_divide_error(struct pt_regs *, long);
+dotraplinkage void do_debug(struct pt_regs *, long);
+dotraplinkage void do_nmi(struct pt_regs *, long);
+dotraplinkage void do_int3(struct pt_regs *, long);
+dotraplinkage void do_overflow(struct pt_regs *, long);
+dotraplinkage void do_bounds(struct pt_regs *, long);
+dotraplinkage void do_invalid_op(struct pt_regs *, long);
+dotraplinkage void do_device_not_available(struct pt_regs *, long);
+dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long);
+dotraplinkage void do_invalid_TSS(struct pt_regs *, long);
+dotraplinkage void do_segment_not_present(struct pt_regs *, long);
+dotraplinkage void do_stack_segment(struct pt_regs *, long);
+#ifdef CONFIG_X86_64
+dotraplinkage void do_double_fault(struct pt_regs *, long);
+#endif
+dotraplinkage void do_general_protection(struct pt_regs *, long);
+dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
+dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
+dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
+dotraplinkage void do_alignment_check(struct pt_regs *, long);
+#ifdef CONFIG_X86_MCE
+dotraplinkage void do_machine_check(struct pt_regs *, long);
+#endif
+dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long);
+#ifdef CONFIG_X86_32
+dotraplinkage void do_iret_error(struct pt_regs *, long);
+#endif
+dotraplinkage void do_mce(struct pt_regs *, long);
+
+static inline int get_si_code(unsigned long condition)
+{
+ if (condition & DR_STEP)
+ return TRAP_TRACE;
+ else if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3))
+ return TRAP_HWBKPT;
+ else
+ return TRAP_BRKPT;
+}
+
+extern int panic_on_unrecovered_nmi;
+
+void math_emulate(struct math_emu_info *);
+#ifndef CONFIG_X86_32
+asmlinkage void smp_thermal_interrupt(struct pt_regs *regs);
+asmlinkage void smp_threshold_interrupt(struct pt_regs *regs);
+asmlinkage void smp_deferred_error_interrupt(struct pt_regs *regs);
+#endif
+
+extern void ist_enter(struct pt_regs *regs);
+extern void ist_exit(struct pt_regs *regs);
+extern void ist_begin_non_atomic(struct pt_regs *regs);
+extern void ist_end_non_atomic(void);
+
+#ifdef CONFIG_VMAP_STACK
+void __noreturn handle_stack_overflow(const char *message,
+ struct pt_regs *regs,
+ unsigned long fault_address);
+#endif
+
+/* Interrupts/Exceptions */
+enum {
+ X86_TRAP_DE = 0, /* 0, Divide-by-zero */
+ X86_TRAP_DB, /* 1, Debug */
+ X86_TRAP_NMI, /* 2, Non-maskable Interrupt */
+ X86_TRAP_BP, /* 3, Breakpoint */
+ X86_TRAP_OF, /* 4, Overflow */
+ X86_TRAP_BR, /* 5, Bound Range Exceeded */
+ X86_TRAP_UD, /* 6, Invalid Opcode */
+ X86_TRAP_NM, /* 7, Device Not Available */
+ X86_TRAP_DF, /* 8, Double Fault */
+ X86_TRAP_OLD_MF, /* 9, Coprocessor Segment Overrun */
+ X86_TRAP_TS, /* 10, Invalid TSS */
+ X86_TRAP_NP, /* 11, Segment Not Present */
+ X86_TRAP_SS, /* 12, Stack Segment Fault */
+ X86_TRAP_GP, /* 13, General Protection Fault */
+ X86_TRAP_PF, /* 14, Page Fault */
+ X86_TRAP_SPURIOUS, /* 15, Spurious Interrupt */
+ X86_TRAP_MF, /* 16, x87 Floating-Point Exception */
+ X86_TRAP_AC, /* 17, Alignment Check */
+ X86_TRAP_MC, /* 18, Machine Check */
+ X86_TRAP_XF, /* 19, SIMD Floating-Point Exception */
+ X86_TRAP_IRET = 32, /* 32, IRET Exception */
+};
+
+/*
+ * Page fault error code bits:
+ *
+ * bit 0 == 0: no page found 1: protection fault
+ * bit 1 == 0: read access 1: write access
+ * bit 2 == 0: kernel-mode access 1: user-mode access
+ * bit 3 == 1: use of reserved bit detected
+ * bit 4 == 1: fault was an instruction fetch
+ * bit 5 == 1: protection keys block access
+ */
+enum x86_pf_error_code {
+ X86_PF_PROT = 1 << 0,
+ X86_PF_WRITE = 1 << 1,
+ X86_PF_USER = 1 << 2,
+ X86_PF_RSVD = 1 << 3,
+ X86_PF_INSTR = 1 << 4,
+ X86_PF_PK = 1 << 5,
+};
+#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
new file mode 100644
index 000000000..196cf01f5
--- /dev/null
+++ b/arch/x86/include/asm/tsc.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * x86 TSC related functions
+ */
+#ifndef _ASM_X86_TSC_H
+#define _ASM_X86_TSC_H
+
+#include <asm/processor.h>
+
+#define NS_SCALE 10 /* 2^10, carefully chosen */
+#define US_SCALE 32 /* 2^32, arbitralrily chosen */
+
+/*
+ * Standard way to access the cycle counter.
+ */
+typedef unsigned long long cycles_t;
+
+extern unsigned int cpu_khz;
+extern unsigned int tsc_khz;
+
+extern void disable_TSC(void);
+
+static inline cycles_t get_cycles(void)
+{
+ if (!IS_ENABLED(CONFIG_X86_TSC) &&
+ !cpu_feature_enabled(X86_FEATURE_TSC))
+ return 0;
+ return rdtsc();
+}
+#define get_cycles get_cycles
+
+extern struct system_counterval_t convert_art_to_tsc(u64 art);
+extern struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns);
+
+extern void tsc_early_init(void);
+extern void tsc_init(void);
+extern void mark_tsc_unstable(char *reason);
+extern int unsynchronized_tsc(void);
+extern int check_tsc_unstable(void);
+extern void mark_tsc_async_resets(char *reason);
+extern unsigned long native_calibrate_cpu_early(void);
+extern unsigned long native_calibrate_tsc(void);
+extern unsigned long long native_sched_clock_from_tsc(u64 tsc);
+
+extern int tsc_clocksource_reliable;
+#ifdef CONFIG_X86_TSC
+extern bool tsc_async_resets;
+#else
+# define tsc_async_resets false
+#endif
+
+/*
+ * Boot-time check whether the TSCs are synchronized across
+ * all CPUs/cores:
+ */
+#ifdef CONFIG_X86_TSC
+extern bool tsc_store_and_check_tsc_adjust(bool bootcpu);
+extern void tsc_verify_tsc_adjust(bool resume);
+extern void check_tsc_sync_source(int cpu);
+extern void check_tsc_sync_target(void);
+#else
+static inline bool tsc_store_and_check_tsc_adjust(bool bootcpu) { return false; }
+static inline void tsc_verify_tsc_adjust(bool resume) { }
+static inline void check_tsc_sync_source(int cpu) { }
+static inline void check_tsc_sync_target(void) { }
+#endif
+
+extern int notsc_setup(char *);
+extern void tsc_save_sched_clock_state(void);
+extern void tsc_restore_sched_clock_state(void);
+
+unsigned long cpu_khz_from_msr(void);
+
+#endif /* _ASM_X86_TSC_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
new file mode 100644
index 000000000..82b0ff6ca
--- /dev/null
+++ b/arch/x86/include/asm/uaccess.h
@@ -0,0 +1,745 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_UACCESS_H
+#define _ASM_X86_UACCESS_H
+/*
+ * User space memory access functions
+ */
+#include <linux/compiler.h>
+#include <linux/kasan-checks.h>
+#include <linux/string.h>
+#include <asm/asm.h>
+#include <asm/page.h>
+#include <asm/smap.h>
+#include <asm/extable.h>
+
+/*
+ * The fs value determines whether argument validity checking should be
+ * performed or not. If get_fs() == USER_DS, checking is performed, with
+ * get_fs() == KERNEL_DS, checking is bypassed.
+ *
+ * For historical reasons, these macros are grossly misnamed.
+ */
+
+#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
+
+#define KERNEL_DS MAKE_MM_SEG(-1UL)
+#define USER_DS MAKE_MM_SEG(TASK_SIZE_MAX)
+
+#define get_ds() (KERNEL_DS)
+#define get_fs() (current->thread.addr_limit)
+static inline void set_fs(mm_segment_t fs)
+{
+ current->thread.addr_limit = fs;
+ /* On user-mode return, check fs is correct */
+ set_thread_flag(TIF_FSCHECK);
+}
+
+#define segment_eq(a, b) ((a).seg == (b).seg)
+
+#define user_addr_max() (current->thread.addr_limit.seg)
+#define __addr_ok(addr) \
+ ((unsigned long __force)(addr) < user_addr_max())
+
+/*
+ * Test whether a block of memory is a valid user space address.
+ * Returns 0 if the range is valid, nonzero otherwise.
+ */
+static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit)
+{
+ /*
+ * If we have used "sizeof()" for the size,
+ * we know it won't overflow the limit (but
+ * it might overflow the 'addr', so it's
+ * important to subtract the size from the
+ * limit, not add it to the address).
+ */
+ if (__builtin_constant_p(size))
+ return unlikely(addr > limit - size);
+
+ /* Arbitrary sizes? Be careful about overflow */
+ addr += size;
+ if (unlikely(addr < size))
+ return true;
+ return unlikely(addr > limit);
+}
+
+#define __range_not_ok(addr, size, limit) \
+({ \
+ __chk_user_ptr(addr); \
+ __chk_range_not_ok((unsigned long __force)(addr), size, limit); \
+})
+
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+# define WARN_ON_IN_IRQ() WARN_ON_ONCE(!in_task())
+#else
+# define WARN_ON_IN_IRQ()
+#endif
+
+/**
+ * access_ok: - Checks if a user space pointer is valid
+ * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that
+ * %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe
+ * to write to a block, it is always safe to read from it.
+ * @addr: User space pointer to start of block to check
+ * @size: Size of block to check
+ *
+ * Context: User context only. This function may sleep if pagefaults are
+ * enabled.
+ *
+ * Checks if a pointer to a block of memory in user space is valid.
+ *
+ * Returns true (nonzero) if the memory block may be valid, false (zero)
+ * if it is definitely invalid.
+ *
+ * Note that, depending on architecture, this function probably just
+ * checks that the pointer is in the user space range - after calling
+ * this function, memory access functions may still return -EFAULT.
+ */
+#define access_ok(type, addr, size) \
+({ \
+ WARN_ON_IN_IRQ(); \
+ likely(!__range_not_ok(addr, size, user_addr_max())); \
+})
+
+/*
+ * These are the main single-value transfer routines. They automatically
+ * use the right size if we just have the right pointer type.
+ *
+ * This gets kind of ugly. We want to return _two_ values in "get_user()"
+ * and yet we don't want to do any pointers, because that is too much
+ * of a performance impact. Thus we have a few rather ugly macros here,
+ * and hide all the ugliness from the user.
+ *
+ * The "__xxx" versions of the user access functions are versions that
+ * do not verify the address space, that must have been done previously
+ * with a separate "access_ok()" call (this is used when we do multiple
+ * accesses to the same area of user memory).
+ */
+
+extern int __get_user_1(void);
+extern int __get_user_2(void);
+extern int __get_user_4(void);
+extern int __get_user_8(void);
+extern int __get_user_bad(void);
+
+#define __uaccess_begin() stac()
+#define __uaccess_end() clac()
+#define __uaccess_begin_nospec() \
+({ \
+ stac(); \
+ barrier_nospec(); \
+})
+
+/*
+ * This is a type: either unsigned long, if the argument fits into
+ * that type, or otherwise unsigned long long.
+ */
+#define __inttype(x) \
+__typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
+
+/**
+ * get_user: - Get a simple variable from user space.
+ * @x: Variable to store result.
+ * @ptr: Source address, in user space.
+ *
+ * Context: User context only. This function may sleep if pagefaults are
+ * enabled.
+ *
+ * This macro copies a single simple variable from user space to kernel
+ * space. It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and the result of
+ * dereferencing @ptr must be assignable to @x without a cast.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ * On error, the variable @x is set to zero.
+ */
+/*
+ * Careful: we have to cast the result to the type of the pointer
+ * for sign reasons.
+ *
+ * The use of _ASM_DX as the register specifier is a bit of a
+ * simplification, as gcc only cares about it as the starting point
+ * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits
+ * (%ecx being the next register in gcc's x86 register sequence), and
+ * %rdx on 64 bits.
+ *
+ * Clang/LLVM cares about the size of the register, but still wants
+ * the base register for something that ends up being a pair.
+ */
+#define get_user(x, ptr) \
+({ \
+ int __ret_gu; \
+ register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \
+ __chk_user_ptr(ptr); \
+ might_fault(); \
+ asm volatile("call __get_user_%P4" \
+ : "=a" (__ret_gu), "=r" (__val_gu), \
+ ASM_CALL_CONSTRAINT \
+ : "0" (ptr), "i" (sizeof(*(ptr)))); \
+ (x) = (__force __typeof__(*(ptr))) __val_gu; \
+ __builtin_expect(__ret_gu, 0); \
+})
+
+#define __put_user_x(size, x, ptr, __ret_pu) \
+ asm volatile("call __put_user_" #size : "=a" (__ret_pu) \
+ : "0" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
+
+
+
+#ifdef CONFIG_X86_32
+#define __put_user_asm_u64(x, addr, err, errret) \
+ asm volatile("\n" \
+ "1: movl %%eax,0(%2)\n" \
+ "2: movl %%edx,4(%2)\n" \
+ "3:" \
+ ".section .fixup,\"ax\"\n" \
+ "4: movl %3,%0\n" \
+ " jmp 3b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 4b) \
+ _ASM_EXTABLE(2b, 4b) \
+ : "=r" (err) \
+ : "A" (x), "r" (addr), "i" (errret), "0" (err))
+
+#define __put_user_asm_ex_u64(x, addr) \
+ asm volatile("\n" \
+ "1: movl %%eax,0(%1)\n" \
+ "2: movl %%edx,4(%1)\n" \
+ "3:" \
+ _ASM_EXTABLE_EX(1b, 2b) \
+ _ASM_EXTABLE_EX(2b, 3b) \
+ : : "A" (x), "r" (addr))
+
+#define __put_user_x8(x, ptr, __ret_pu) \
+ asm volatile("call __put_user_8" : "=a" (__ret_pu) \
+ : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
+#else
+#define __put_user_asm_u64(x, ptr, retval, errret) \
+ __put_user_asm(x, ptr, retval, "q", "", "er", errret)
+#define __put_user_asm_ex_u64(x, addr) \
+ __put_user_asm_ex(x, addr, "q", "", "er")
+#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu)
+#endif
+
+extern void __put_user_bad(void);
+
+/*
+ * Strange magic calling convention: pointer in %ecx,
+ * value in %eax(:%edx), return value in %eax. clobbers %rbx
+ */
+extern void __put_user_1(void);
+extern void __put_user_2(void);
+extern void __put_user_4(void);
+extern void __put_user_8(void);
+
+/**
+ * put_user: - Write a simple value into user space.
+ * @x: Value to copy to user space.
+ * @ptr: Destination address, in user space.
+ *
+ * Context: User context only. This function may sleep if pagefaults are
+ * enabled.
+ *
+ * This macro copies a single simple value from kernel space to user
+ * space. It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and @x must be assignable
+ * to the result of dereferencing @ptr.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ */
+#define put_user(x, ptr) \
+({ \
+ int __ret_pu; \
+ __typeof__(*(ptr)) __pu_val; \
+ __chk_user_ptr(ptr); \
+ might_fault(); \
+ __pu_val = x; \
+ switch (sizeof(*(ptr))) { \
+ case 1: \
+ __put_user_x(1, __pu_val, ptr, __ret_pu); \
+ break; \
+ case 2: \
+ __put_user_x(2, __pu_val, ptr, __ret_pu); \
+ break; \
+ case 4: \
+ __put_user_x(4, __pu_val, ptr, __ret_pu); \
+ break; \
+ case 8: \
+ __put_user_x8(__pu_val, ptr, __ret_pu); \
+ break; \
+ default: \
+ __put_user_x(X, __pu_val, ptr, __ret_pu); \
+ break; \
+ } \
+ __builtin_expect(__ret_pu, 0); \
+})
+
+#define __put_user_size(x, ptr, size, retval, errret) \
+do { \
+ retval = 0; \
+ __chk_user_ptr(ptr); \
+ switch (size) { \
+ case 1: \
+ __put_user_asm(x, ptr, retval, "b", "b", "iq", errret); \
+ break; \
+ case 2: \
+ __put_user_asm(x, ptr, retval, "w", "w", "ir", errret); \
+ break; \
+ case 4: \
+ __put_user_asm(x, ptr, retval, "l", "k", "ir", errret); \
+ break; \
+ case 8: \
+ __put_user_asm_u64(x, ptr, retval, errret); \
+ break; \
+ default: \
+ __put_user_bad(); \
+ } \
+} while (0)
+
+/*
+ * This doesn't do __uaccess_begin/end - the exception handling
+ * around it must do that.
+ */
+#define __put_user_size_ex(x, ptr, size) \
+do { \
+ __chk_user_ptr(ptr); \
+ switch (size) { \
+ case 1: \
+ __put_user_asm_ex(x, ptr, "b", "b", "iq"); \
+ break; \
+ case 2: \
+ __put_user_asm_ex(x, ptr, "w", "w", "ir"); \
+ break; \
+ case 4: \
+ __put_user_asm_ex(x, ptr, "l", "k", "ir"); \
+ break; \
+ case 8: \
+ __put_user_asm_ex_u64((__typeof__(*ptr))(x), ptr); \
+ break; \
+ default: \
+ __put_user_bad(); \
+ } \
+} while (0)
+
+#ifdef CONFIG_X86_32
+#define __get_user_asm_u64(x, ptr, retval, errret) \
+({ \
+ __typeof__(ptr) __ptr = (ptr); \
+ asm volatile("\n" \
+ "1: movl %2,%%eax\n" \
+ "2: movl %3,%%edx\n" \
+ "3:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "4: mov %4,%0\n" \
+ " xorl %%eax,%%eax\n" \
+ " xorl %%edx,%%edx\n" \
+ " jmp 3b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 4b) \
+ _ASM_EXTABLE(2b, 4b) \
+ : "=r" (retval), "=&A"(x) \
+ : "m" (__m(__ptr)), "m" __m(((u32 __user *)(__ptr)) + 1), \
+ "i" (errret), "0" (retval)); \
+})
+
+#define __get_user_asm_ex_u64(x, ptr) (x) = __get_user_bad()
+#else
+#define __get_user_asm_u64(x, ptr, retval, errret) \
+ __get_user_asm(x, ptr, retval, "q", "", "=r", errret)
+#define __get_user_asm_ex_u64(x, ptr) \
+ __get_user_asm_ex(x, ptr, "q", "", "=r")
+#endif
+
+#define __get_user_size(x, ptr, size, retval, errret) \
+do { \
+ retval = 0; \
+ __chk_user_ptr(ptr); \
+ switch (size) { \
+ case 1: \
+ __get_user_asm(x, ptr, retval, "b", "b", "=q", errret); \
+ break; \
+ case 2: \
+ __get_user_asm(x, ptr, retval, "w", "w", "=r", errret); \
+ break; \
+ case 4: \
+ __get_user_asm(x, ptr, retval, "l", "k", "=r", errret); \
+ break; \
+ case 8: \
+ __get_user_asm_u64(x, ptr, retval, errret); \
+ break; \
+ default: \
+ (x) = __get_user_bad(); \
+ } \
+} while (0)
+
+#define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \
+ asm volatile("\n" \
+ "1: mov"itype" %2,%"rtype"1\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: mov %3,%0\n" \
+ " xor"itype" %"rtype"1,%"rtype"1\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "=r" (err), ltype(x) \
+ : "m" (__m(addr)), "i" (errret), "0" (err))
+
+#define __get_user_asm_nozero(x, addr, err, itype, rtype, ltype, errret) \
+ asm volatile("\n" \
+ "1: mov"itype" %2,%"rtype"1\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: mov %3,%0\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "=r" (err), ltype(x) \
+ : "m" (__m(addr)), "i" (errret), "0" (err))
+
+/*
+ * This doesn't do __uaccess_begin/end - the exception handling
+ * around it must do that.
+ */
+#define __get_user_size_ex(x, ptr, size) \
+do { \
+ __chk_user_ptr(ptr); \
+ switch (size) { \
+ case 1: \
+ __get_user_asm_ex(x, ptr, "b", "b", "=q"); \
+ break; \
+ case 2: \
+ __get_user_asm_ex(x, ptr, "w", "w", "=r"); \
+ break; \
+ case 4: \
+ __get_user_asm_ex(x, ptr, "l", "k", "=r"); \
+ break; \
+ case 8: \
+ __get_user_asm_ex_u64(x, ptr); \
+ break; \
+ default: \
+ (x) = __get_user_bad(); \
+ } \
+} while (0)
+
+#define __get_user_asm_ex(x, addr, itype, rtype, ltype) \
+ asm volatile("1: mov"itype" %1,%"rtype"0\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3:xor"itype" %"rtype"0,%"rtype"0\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE_EX(1b, 3b) \
+ : ltype(x) : "m" (__m(addr)))
+
+#define __put_user_nocheck(x, ptr, size) \
+({ \
+ int __pu_err; \
+ __typeof__(*(ptr)) __pu_val; \
+ __pu_val = x; \
+ __uaccess_begin(); \
+ __put_user_size(__pu_val, (ptr), (size), __pu_err, -EFAULT);\
+ __uaccess_end(); \
+ __builtin_expect(__pu_err, 0); \
+})
+
+#define __get_user_nocheck(x, ptr, size) \
+({ \
+ int __gu_err; \
+ __inttype(*(ptr)) __gu_val; \
+ __typeof__(ptr) __gu_ptr = (ptr); \
+ __typeof__(size) __gu_size = (size); \
+ __uaccess_begin_nospec(); \
+ __get_user_size(__gu_val, __gu_ptr, __gu_size, __gu_err, -EFAULT); \
+ __uaccess_end(); \
+ (x) = (__force __typeof__(*(ptr)))__gu_val; \
+ __builtin_expect(__gu_err, 0); \
+})
+
+/* FIXME: this hack is definitely wrong -AK */
+struct __large_struct { unsigned long buf[100]; };
+#define __m(x) (*(struct __large_struct __user *)(x))
+
+/*
+ * Tell gcc we read from memory instead of writing: this is because
+ * we do not write to any memory gcc knows about, so there are no
+ * aliasing issues.
+ */
+#define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \
+ asm volatile("\n" \
+ "1: mov"itype" %"rtype"1,%2\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: mov %3,%0\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "=r"(err) \
+ : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err))
+
+#define __put_user_asm_ex(x, addr, itype, rtype, ltype) \
+ asm volatile("1: mov"itype" %"rtype"0,%1\n" \
+ "2:\n" \
+ _ASM_EXTABLE_EX(1b, 2b) \
+ : : ltype(x), "m" (__m(addr)))
+
+/*
+ * uaccess_try and catch
+ */
+#define uaccess_try do { \
+ current->thread.uaccess_err = 0; \
+ __uaccess_begin(); \
+ barrier();
+
+#define uaccess_try_nospec do { \
+ current->thread.uaccess_err = 0; \
+ __uaccess_begin_nospec(); \
+
+#define uaccess_catch(err) \
+ __uaccess_end(); \
+ (err) |= (current->thread.uaccess_err ? -EFAULT : 0); \
+} while (0)
+
+/**
+ * __get_user: - Get a simple variable from user space, with less checking.
+ * @x: Variable to store result.
+ * @ptr: Source address, in user space.
+ *
+ * Context: User context only. This function may sleep if pagefaults are
+ * enabled.
+ *
+ * This macro copies a single simple variable from user space to kernel
+ * space. It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and the result of
+ * dereferencing @ptr must be assignable to @x without a cast.
+ *
+ * Caller must check the pointer with access_ok() before calling this
+ * function.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ * On error, the variable @x is set to zero.
+ */
+
+#define __get_user(x, ptr) \
+ __get_user_nocheck((x), (ptr), sizeof(*(ptr)))
+
+/**
+ * __put_user: - Write a simple value into user space, with less checking.
+ * @x: Value to copy to user space.
+ * @ptr: Destination address, in user space.
+ *
+ * Context: User context only. This function may sleep if pagefaults are
+ * enabled.
+ *
+ * This macro copies a single simple value from kernel space to user
+ * space. It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and @x must be assignable
+ * to the result of dereferencing @ptr.
+ *
+ * Caller must check the pointer with access_ok() before calling this
+ * function.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ */
+
+#define __put_user(x, ptr) \
+ __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
+
+/*
+ * {get|put}_user_try and catch
+ *
+ * get_user_try {
+ * get_user_ex(...);
+ * } get_user_catch(err)
+ */
+#define get_user_try uaccess_try_nospec
+#define get_user_catch(err) uaccess_catch(err)
+
+#define get_user_ex(x, ptr) do { \
+ unsigned long __gue_val; \
+ __get_user_size_ex((__gue_val), (ptr), (sizeof(*(ptr)))); \
+ (x) = (__force __typeof__(*(ptr)))__gue_val; \
+} while (0)
+
+#define put_user_try uaccess_try
+#define put_user_catch(err) uaccess_catch(err)
+
+#define put_user_ex(x, ptr) \
+ __put_user_size_ex((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
+
+extern unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
+extern __must_check long
+strncpy_from_user(char *dst, const char __user *src, long count);
+
+extern __must_check long strnlen_user(const char __user *str, long n);
+
+unsigned long __must_check clear_user(void __user *mem, unsigned long len);
+unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
+
+extern void __cmpxchg_wrong_size(void)
+ __compiletime_error("Bad argument size for cmpxchg");
+
+#define __user_atomic_cmpxchg_inatomic(uval, ptr, old, new, size) \
+({ \
+ int __ret = 0; \
+ __typeof__(ptr) __uval = (uval); \
+ __typeof__(*(ptr)) __old = (old); \
+ __typeof__(*(ptr)) __new = (new); \
+ __uaccess_begin_nospec(); \
+ switch (size) { \
+ case 1: \
+ { \
+ asm volatile("\n" \
+ "1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n" \
+ "2:\n" \
+ "\t.section .fixup, \"ax\"\n" \
+ "3:\tmov %3, %0\n" \
+ "\tjmp 2b\n" \
+ "\t.previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \
+ : "i" (-EFAULT), "q" (__new), "1" (__old) \
+ : "memory" \
+ ); \
+ break; \
+ } \
+ case 2: \
+ { \
+ asm volatile("\n" \
+ "1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n" \
+ "2:\n" \
+ "\t.section .fixup, \"ax\"\n" \
+ "3:\tmov %3, %0\n" \
+ "\tjmp 2b\n" \
+ "\t.previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \
+ : "i" (-EFAULT), "r" (__new), "1" (__old) \
+ : "memory" \
+ ); \
+ break; \
+ } \
+ case 4: \
+ { \
+ asm volatile("\n" \
+ "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" \
+ "2:\n" \
+ "\t.section .fixup, \"ax\"\n" \
+ "3:\tmov %3, %0\n" \
+ "\tjmp 2b\n" \
+ "\t.previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \
+ : "i" (-EFAULT), "r" (__new), "1" (__old) \
+ : "memory" \
+ ); \
+ break; \
+ } \
+ case 8: \
+ { \
+ if (!IS_ENABLED(CONFIG_X86_64)) \
+ __cmpxchg_wrong_size(); \
+ \
+ asm volatile("\n" \
+ "1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n" \
+ "2:\n" \
+ "\t.section .fixup, \"ax\"\n" \
+ "3:\tmov %3, %0\n" \
+ "\tjmp 2b\n" \
+ "\t.previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : "+r" (__ret), "=a" (__old), "+m" (*(ptr)) \
+ : "i" (-EFAULT), "r" (__new), "1" (__old) \
+ : "memory" \
+ ); \
+ break; \
+ } \
+ default: \
+ __cmpxchg_wrong_size(); \
+ } \
+ __uaccess_end(); \
+ *__uval = __old; \
+ __ret; \
+})
+
+#define user_atomic_cmpxchg_inatomic(uval, ptr, old, new) \
+({ \
+ access_ok(VERIFY_WRITE, (ptr), sizeof(*(ptr))) ? \
+ __user_atomic_cmpxchg_inatomic((uval), (ptr), \
+ (old), (new), sizeof(*(ptr))) : \
+ -EFAULT; \
+})
+
+/*
+ * movsl can be slow when source and dest are not both 8-byte aligned
+ */
+#ifdef CONFIG_X86_INTEL_USERCOPY
+extern struct movsl_mask {
+ int mask;
+} ____cacheline_aligned_in_smp movsl_mask;
+#endif
+
+#define ARCH_HAS_NOCACHE_UACCESS 1
+
+#ifdef CONFIG_X86_32
+# include <asm/uaccess_32.h>
+#else
+# include <asm/uaccess_64.h>
+#endif
+
+/*
+ * We rely on the nested NMI work to allow atomic faults from the NMI path; the
+ * nested NMI paths are careful to preserve CR2.
+ *
+ * Caller must use pagefault_enable/disable, or run in interrupt context,
+ * and also do a uaccess_ok() check
+ */
+#define __copy_from_user_nmi __copy_from_user_inatomic
+
+/*
+ * The "unsafe" user accesses aren't really "unsafe", but the naming
+ * is a big fat warning: you have to not only do the access_ok()
+ * checking before using them, but you have to surround them with the
+ * user_access_begin/end() pair.
+ */
+static __must_check inline bool user_access_begin(int type,
+ const void __user *ptr,
+ size_t len)
+{
+ if (unlikely(!access_ok(type, ptr, len)))
+ return 0;
+ __uaccess_begin_nospec();
+ return 1;
+}
+
+#define user_access_begin(a, b, c) user_access_begin(a, b, c)
+#define user_access_end() __uaccess_end()
+
+#define unsafe_put_user(x, ptr, err_label) \
+do { \
+ int __pu_err; \
+ __typeof__(*(ptr)) __pu_val = (x); \
+ __put_user_size(__pu_val, (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \
+ if (unlikely(__pu_err)) goto err_label; \
+} while (0)
+
+#define unsafe_get_user(x, ptr, err_label) \
+do { \
+ int __gu_err; \
+ __inttype(*(ptr)) __gu_val; \
+ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \
+ (x) = (__force __typeof__(*(ptr)))__gu_val; \
+ if (unlikely(__gu_err)) goto err_label; \
+} while (0)
+
+#endif /* _ASM_X86_UACCESS_H */
+
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
new file mode 100644
index 000000000..ba2dc1930
--- /dev/null
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_UACCESS_32_H
+#define _ASM_X86_UACCESS_32_H
+
+/*
+ * User space memory access functions
+ */
+#include <linux/string.h>
+#include <asm/asm.h>
+#include <asm/page.h>
+
+unsigned long __must_check __copy_user_ll
+ (void *to, const void *from, unsigned long n);
+unsigned long __must_check __copy_from_user_ll_nocache_nozero
+ (void *to, const void __user *from, unsigned long n);
+
+static __always_inline unsigned long __must_check
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+ return __copy_user_ll((__force void *)to, from, n);
+}
+
+static __always_inline unsigned long
+raw_copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+ if (__builtin_constant_p(n)) {
+ unsigned long ret;
+
+ switch (n) {
+ case 1:
+ ret = 0;
+ __uaccess_begin_nospec();
+ __get_user_asm_nozero(*(u8 *)to, from, ret,
+ "b", "b", "=q", 1);
+ __uaccess_end();
+ return ret;
+ case 2:
+ ret = 0;
+ __uaccess_begin_nospec();
+ __get_user_asm_nozero(*(u16 *)to, from, ret,
+ "w", "w", "=r", 2);
+ __uaccess_end();
+ return ret;
+ case 4:
+ ret = 0;
+ __uaccess_begin_nospec();
+ __get_user_asm_nozero(*(u32 *)to, from, ret,
+ "l", "k", "=r", 4);
+ __uaccess_end();
+ return ret;
+ }
+ }
+ return __copy_user_ll(to, (__force const void *)from, n);
+}
+
+static __always_inline unsigned long
+__copy_from_user_inatomic_nocache(void *to, const void __user *from,
+ unsigned long n)
+{
+ return __copy_from_user_ll_nocache_nozero(to, from, n);
+}
+
+#endif /* _ASM_X86_UACCESS_32_H */
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
new file mode 100644
index 000000000..a9d637bc3
--- /dev/null
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -0,0 +1,216 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_UACCESS_64_H
+#define _ASM_X86_UACCESS_64_H
+
+/*
+ * User space memory access functions
+ */
+#include <linux/compiler.h>
+#include <linux/lockdep.h>
+#include <linux/kasan-checks.h>
+#include <asm/alternative.h>
+#include <asm/cpufeatures.h>
+#include <asm/page.h>
+
+/*
+ * Copy To/From Userspace
+ */
+
+/* Handles exceptions in both to and from, but doesn't do access_ok */
+__must_check unsigned long
+copy_user_enhanced_fast_string(void *to, const void *from, unsigned len);
+__must_check unsigned long
+copy_user_generic_string(void *to, const void *from, unsigned len);
+__must_check unsigned long
+copy_user_generic_unrolled(void *to, const void *from, unsigned len);
+
+static __always_inline __must_check unsigned long
+copy_user_generic(void *to, const void *from, unsigned len)
+{
+ unsigned ret;
+
+ /*
+ * If CPU has ERMS feature, use copy_user_enhanced_fast_string.
+ * Otherwise, if CPU has rep_good feature, use copy_user_generic_string.
+ * Otherwise, use copy_user_generic_unrolled.
+ */
+ alternative_call_2(copy_user_generic_unrolled,
+ copy_user_generic_string,
+ X86_FEATURE_REP_GOOD,
+ copy_user_enhanced_fast_string,
+ X86_FEATURE_ERMS,
+ ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
+ "=d" (len)),
+ "1" (to), "2" (from), "3" (len)
+ : "memory", "rcx", "r8", "r9", "r10", "r11");
+ return ret;
+}
+
+static __always_inline __must_check unsigned long
+copy_to_user_mcsafe(void *to, const void *from, unsigned len)
+{
+ unsigned long ret;
+
+ __uaccess_begin();
+ /*
+ * Note, __memcpy_mcsafe() is explicitly used since it can
+ * handle exceptions / faults. memcpy_mcsafe() may fall back to
+ * memcpy() which lacks this handling.
+ */
+ ret = __memcpy_mcsafe(to, from, len);
+ __uaccess_end();
+ return ret;
+}
+
+static __always_inline __must_check unsigned long
+raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
+{
+ int ret = 0;
+
+ if (!__builtin_constant_p(size))
+ return copy_user_generic(dst, (__force void *)src, size);
+ switch (size) {
+ case 1:
+ __uaccess_begin_nospec();
+ __get_user_asm_nozero(*(u8 *)dst, (u8 __user *)src,
+ ret, "b", "b", "=q", 1);
+ __uaccess_end();
+ return ret;
+ case 2:
+ __uaccess_begin_nospec();
+ __get_user_asm_nozero(*(u16 *)dst, (u16 __user *)src,
+ ret, "w", "w", "=r", 2);
+ __uaccess_end();
+ return ret;
+ case 4:
+ __uaccess_begin_nospec();
+ __get_user_asm_nozero(*(u32 *)dst, (u32 __user *)src,
+ ret, "l", "k", "=r", 4);
+ __uaccess_end();
+ return ret;
+ case 8:
+ __uaccess_begin_nospec();
+ __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
+ ret, "q", "", "=r", 8);
+ __uaccess_end();
+ return ret;
+ case 10:
+ __uaccess_begin_nospec();
+ __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
+ ret, "q", "", "=r", 10);
+ if (likely(!ret))
+ __get_user_asm_nozero(*(u16 *)(8 + (char *)dst),
+ (u16 __user *)(8 + (char __user *)src),
+ ret, "w", "w", "=r", 2);
+ __uaccess_end();
+ return ret;
+ case 16:
+ __uaccess_begin_nospec();
+ __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
+ ret, "q", "", "=r", 16);
+ if (likely(!ret))
+ __get_user_asm_nozero(*(u64 *)(8 + (char *)dst),
+ (u64 __user *)(8 + (char __user *)src),
+ ret, "q", "", "=r", 8);
+ __uaccess_end();
+ return ret;
+ default:
+ return copy_user_generic(dst, (__force void *)src, size);
+ }
+}
+
+static __always_inline __must_check unsigned long
+raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
+{
+ int ret = 0;
+
+ if (!__builtin_constant_p(size))
+ return copy_user_generic((__force void *)dst, src, size);
+ switch (size) {
+ case 1:
+ __uaccess_begin();
+ __put_user_asm(*(u8 *)src, (u8 __user *)dst,
+ ret, "b", "b", "iq", 1);
+ __uaccess_end();
+ return ret;
+ case 2:
+ __uaccess_begin();
+ __put_user_asm(*(u16 *)src, (u16 __user *)dst,
+ ret, "w", "w", "ir", 2);
+ __uaccess_end();
+ return ret;
+ case 4:
+ __uaccess_begin();
+ __put_user_asm(*(u32 *)src, (u32 __user *)dst,
+ ret, "l", "k", "ir", 4);
+ __uaccess_end();
+ return ret;
+ case 8:
+ __uaccess_begin();
+ __put_user_asm(*(u64 *)src, (u64 __user *)dst,
+ ret, "q", "", "er", 8);
+ __uaccess_end();
+ return ret;
+ case 10:
+ __uaccess_begin();
+ __put_user_asm(*(u64 *)src, (u64 __user *)dst,
+ ret, "q", "", "er", 10);
+ if (likely(!ret)) {
+ asm("":::"memory");
+ __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst,
+ ret, "w", "w", "ir", 2);
+ }
+ __uaccess_end();
+ return ret;
+ case 16:
+ __uaccess_begin();
+ __put_user_asm(*(u64 *)src, (u64 __user *)dst,
+ ret, "q", "", "er", 16);
+ if (likely(!ret)) {
+ asm("":::"memory");
+ __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
+ ret, "q", "", "er", 8);
+ }
+ __uaccess_end();
+ return ret;
+ default:
+ return copy_user_generic((__force void *)dst, src, size);
+ }
+}
+
+static __always_inline __must_check
+unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long size)
+{
+ return copy_user_generic((__force void *)dst,
+ (__force void *)src, size);
+}
+
+extern long __copy_user_nocache(void *dst, const void __user *src,
+ unsigned size, int zerorest);
+
+extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);
+extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
+ size_t len);
+
+static inline int
+__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
+ unsigned size)
+{
+ kasan_check_write(dst, size);
+ return __copy_user_nocache(dst, src, size, 0);
+}
+
+static inline int
+__copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
+{
+ kasan_check_write(dst, size);
+ return __copy_user_flushcache(dst, src, size);
+}
+
+unsigned long
+copy_user_handle_tail(char *to, char *from, unsigned len);
+
+unsigned long
+mcsafe_handle_tail(char *to, char *from, unsigned len);
+
+#endif /* _ASM_X86_UACCESS_64_H */
diff --git a/arch/x86/include/asm/umip.h b/arch/x86/include/asm/umip.h
new file mode 100644
index 000000000..db43f2a0d
--- /dev/null
+++ b/arch/x86/include/asm/umip.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_UMIP_H
+#define _ASM_X86_UMIP_H
+
+#include <linux/types.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_X86_INTEL_UMIP
+bool fixup_umip_exception(struct pt_regs *regs);
+#else
+static inline bool fixup_umip_exception(struct pt_regs *regs) { return false; }
+#endif /* CONFIG_X86_INTEL_UMIP */
+#endif /* _ASM_X86_UMIP_H */
diff --git a/arch/x86/include/asm/unaligned.h b/arch/x86/include/asm/unaligned.h
new file mode 100644
index 000000000..9c754a744
--- /dev/null
+++ b/arch/x86/include/asm/unaligned.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_UNALIGNED_H
+#define _ASM_X86_UNALIGNED_H
+
+/*
+ * The x86 can do unaligned accesses itself.
+ */
+
+#include <linux/unaligned/access_ok.h>
+#include <linux/unaligned/generic.h>
+
+#define get_unaligned __get_unaligned_le
+#define put_unaligned __put_unaligned_le
+
+#endif /* _ASM_X86_UNALIGNED_H */
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
new file mode 100644
index 000000000..51c4eee00
--- /dev/null
+++ b/arch/x86/include/asm/unistd.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_UNISTD_H
+#define _ASM_X86_UNISTD_H 1
+
+#include <uapi/asm/unistd.h>
+
+
+# ifdef CONFIG_X86_X32_ABI
+# define __SYSCALL_MASK (~(__X32_SYSCALL_BIT))
+# else
+# define __SYSCALL_MASK (~0)
+# endif
+
+# ifdef CONFIG_X86_32
+
+# include <asm/unistd_32.h>
+# define __ARCH_WANT_STAT64
+# define __ARCH_WANT_SYS_IPC
+# define __ARCH_WANT_SYS_OLD_MMAP
+# define __ARCH_WANT_SYS_OLD_SELECT
+
+# else
+
+# include <asm/unistd_64.h>
+# include <asm/unistd_64_x32.h>
+# define __ARCH_WANT_COMPAT_SYS_TIME
+# define __ARCH_WANT_COMPAT_SYS_PREADV64
+# define __ARCH_WANT_COMPAT_SYS_PWRITEV64
+# define __ARCH_WANT_COMPAT_SYS_PREADV64V2
+# define __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
+
+# endif
+
+# define __ARCH_WANT_OLD_READDIR
+# define __ARCH_WANT_OLD_STAT
+# define __ARCH_WANT_SYS_ALARM
+# define __ARCH_WANT_SYS_FADVISE64
+# define __ARCH_WANT_SYS_GETHOSTNAME
+# define __ARCH_WANT_SYS_GETPGRP
+# define __ARCH_WANT_SYS_LLSEEK
+# define __ARCH_WANT_SYS_NICE
+# define __ARCH_WANT_SYS_OLDUMOUNT
+# define __ARCH_WANT_SYS_OLD_GETRLIMIT
+# define __ARCH_WANT_SYS_OLD_UNAME
+# define __ARCH_WANT_SYS_PAUSE
+# define __ARCH_WANT_SYS_SIGNAL
+# define __ARCH_WANT_SYS_SIGPENDING
+# define __ARCH_WANT_SYS_SIGPROCMASK
+# define __ARCH_WANT_SYS_SOCKETCALL
+# define __ARCH_WANT_SYS_TIME
+# define __ARCH_WANT_SYS_UTIME
+# define __ARCH_WANT_SYS_WAITPID
+# define __ARCH_WANT_SYS_FORK
+# define __ARCH_WANT_SYS_VFORK
+# define __ARCH_WANT_SYS_CLONE
+
+#endif /* _ASM_X86_UNISTD_H */
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
new file mode 100644
index 000000000..70fc159eb
--- /dev/null
+++ b/arch/x86/include/asm/unwind.h
@@ -0,0 +1,126 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_UNWIND_H
+#define _ASM_X86_UNWIND_H
+
+#include <linux/sched.h>
+#include <linux/ftrace.h>
+#include <asm/ptrace.h>
+#include <asm/stacktrace.h>
+
+#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip))
+#define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)
+
+struct unwind_state {
+ struct stack_info stack_info;
+ unsigned long stack_mask;
+ struct task_struct *task;
+ int graph_idx;
+ bool error;
+#if defined(CONFIG_UNWINDER_ORC)
+ bool signal, full_regs;
+ unsigned long sp, bp, ip;
+ struct pt_regs *regs, *prev_regs;
+#elif defined(CONFIG_UNWINDER_FRAME_POINTER)
+ bool got_irq;
+ unsigned long *bp, *orig_sp, ip;
+ /*
+ * If non-NULL: The current frame is incomplete and doesn't contain a
+ * valid BP. When looking for the next frame, use this instead of the
+ * non-existent saved BP.
+ */
+ unsigned long *next_bp;
+ struct pt_regs *regs;
+#else
+ unsigned long *sp;
+#endif
+};
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame);
+bool unwind_next_frame(struct unwind_state *state);
+unsigned long unwind_get_return_address(struct unwind_state *state);
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state);
+
+static inline bool unwind_done(struct unwind_state *state)
+{
+ return state->stack_info.type == STACK_TYPE_UNKNOWN;
+}
+
+static inline bool unwind_error(struct unwind_state *state)
+{
+ return state->error;
+}
+
+static inline
+void unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ first_frame = first_frame ? : get_stack_pointer(task, regs);
+
+ __unwind_start(state, task, regs, first_frame);
+}
+
+#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
+/*
+ * If 'partial' returns true, only the iret frame registers are valid.
+ */
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
+ bool *partial)
+{
+ if (unwind_done(state))
+ return NULL;
+
+ if (partial) {
+#ifdef CONFIG_UNWINDER_ORC
+ *partial = !state->full_regs;
+#else
+ *partial = false;
+#endif
+ }
+
+ return state->regs;
+}
+#else
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state,
+ bool *partial)
+{
+ return NULL;
+}
+#endif
+
+#ifdef CONFIG_UNWINDER_ORC
+void unwind_init(void);
+void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
+ void *orc, size_t orc_size);
+#else
+static inline void unwind_init(void) {}
+static inline
+void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
+ void *orc, size_t orc_size) {}
+#endif
+
+/*
+ * This disables KASAN checking when reading a value from another task's stack,
+ * since the other task could be running on another CPU and could have poisoned
+ * the stack in the meantime.
+ */
+#define READ_ONCE_TASK_STACK(task, x) \
+({ \
+ unsigned long val; \
+ if (task == current) \
+ val = READ_ONCE(x); \
+ else \
+ val = READ_ONCE_NOCHECK(x); \
+ val; \
+})
+
+static inline bool task_on_another_cpu(struct task_struct *task)
+{
+#ifdef CONFIG_SMP
+ return task != current && task->on_cpu;
+#else
+ return false;
+#endif
+}
+
+#endif /* _ASM_X86_UNWIND_H */
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
new file mode 100644
index 000000000..0bcdb1279
--- /dev/null
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -0,0 +1,109 @@
+#ifndef _ASM_X86_UNWIND_HINTS_H
+#define _ASM_X86_UNWIND_HINTS_H
+
+#include "orc_types.h"
+
+#ifdef __ASSEMBLY__
+
+/*
+ * In asm, there are two kinds of code: normal C-type callable functions and
+ * the rest. The normal callable functions can be called by other code, and
+ * don't do anything unusual with the stack. Such normal callable functions
+ * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this
+ * category. In this case, no special debugging annotations are needed because
+ * objtool can automatically generate the ORC data for the ORC unwinder to read
+ * at runtime.
+ *
+ * Anything which doesn't fall into the above category, such as syscall and
+ * interrupt handlers, tends to not be called directly by other functions, and
+ * often does unusual non-C-function-type things with the stack pointer. Such
+ * code needs to be annotated such that objtool can understand it. The
+ * following CFI hint macros are for this type of code.
+ *
+ * These macros provide hints to objtool about the state of the stack at each
+ * instruction. Objtool starts from the hints and follows the code flow,
+ * making automatic CFI adjustments when it sees pushes and pops, filling out
+ * the debuginfo as necessary. It will also warn if it sees any
+ * inconsistencies.
+ */
+.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL end=0
+#ifdef CONFIG_STACK_VALIDATION
+.Lunwind_hint_ip_\@:
+ .pushsection .discard.unwind_hints
+ /* struct unwind_hint */
+ .long .Lunwind_hint_ip_\@ - .
+ .short \sp_offset
+ .byte \sp_reg
+ .byte \type
+ .byte \end
+ .balign 4
+ .popsection
+#endif
+.endm
+
+.macro UNWIND_HINT_EMPTY
+ UNWIND_HINT sp_reg=ORC_REG_UNDEFINED end=1
+.endm
+
+.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0
+ .if \base == %rsp
+ .if \indirect
+ .set sp_reg, ORC_REG_SP_INDIRECT
+ .else
+ .set sp_reg, ORC_REG_SP
+ .endif
+ .elseif \base == %rbp
+ .set sp_reg, ORC_REG_BP
+ .elseif \base == %rdi
+ .set sp_reg, ORC_REG_DI
+ .elseif \base == %rdx
+ .set sp_reg, ORC_REG_DX
+ .elseif \base == %r10
+ .set sp_reg, ORC_REG_R10
+ .else
+ .error "UNWIND_HINT_REGS: bad base register"
+ .endif
+
+ .set sp_offset, \offset
+
+ .if \iret
+ .set type, ORC_TYPE_REGS_IRET
+ .elseif \extra == 0
+ .set type, ORC_TYPE_REGS_IRET
+ .set sp_offset, \offset + (16*8)
+ .else
+ .set type, ORC_TYPE_REGS
+ .endif
+
+ UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type
+.endm
+
+.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0
+ UNWIND_HINT_REGS base=\base offset=\offset iret=1
+.endm
+
+.macro UNWIND_HINT_FUNC sp_offset=8
+ UNWIND_HINT sp_offset=\sp_offset
+.endm
+
+#else /* !__ASSEMBLY__ */
+
+#define UNWIND_HINT(sp_reg, sp_offset, type, end) \
+ "987: \n\t" \
+ ".pushsection .discard.unwind_hints\n\t" \
+ /* struct unwind_hint */ \
+ ".long 987b - .\n\t" \
+ ".short " __stringify(sp_offset) "\n\t" \
+ ".byte " __stringify(sp_reg) "\n\t" \
+ ".byte " __stringify(type) "\n\t" \
+ ".byte " __stringify(end) "\n\t" \
+ ".balign 4 \n\t" \
+ ".popsection\n\t"
+
+#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE, 0)
+
+#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE, 0)
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_UNWIND_HINTS_H */
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
new file mode 100644
index 000000000..d8bfa98fc
--- /dev/null
+++ b/arch/x86/include/asm/uprobes.h
@@ -0,0 +1,71 @@
+#ifndef _ASM_UPROBES_H
+#define _ASM_UPROBES_H
+/*
+ * User-space Probes (UProbes) for x86
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2008-2011
+ * Authors:
+ * Srikar Dronamraju
+ * Jim Keniston
+ */
+
+#include <linux/notifier.h>
+
+typedef u8 uprobe_opcode_t;
+
+#define MAX_UINSN_BYTES 16
+#define UPROBE_XOL_SLOT_BYTES 128 /* to keep it cache aligned */
+
+#define UPROBE_SWBP_INSN 0xcc
+#define UPROBE_SWBP_INSN_SIZE 1
+
+struct uprobe_xol_ops;
+
+struct arch_uprobe {
+ union {
+ u8 insn[MAX_UINSN_BYTES];
+ u8 ixol[MAX_UINSN_BYTES];
+ };
+
+ const struct uprobe_xol_ops *ops;
+
+ union {
+ struct {
+ s32 offs;
+ u8 ilen;
+ u8 opc1;
+ } branch;
+ struct {
+ u8 fixups;
+ u8 ilen;
+ } defparam;
+ struct {
+ u8 reg_offset; /* to the start of pt_regs */
+ u8 ilen;
+ } push;
+ };
+};
+
+struct arch_uprobe_task {
+#ifdef CONFIG_X86_64
+ unsigned long saved_scratch_register;
+#endif
+ unsigned int saved_trap_nr;
+ unsigned int saved_tf;
+};
+
+#endif /* _ASM_UPROBES_H */
diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h
new file mode 100644
index 000000000..413c91746
--- /dev/null
+++ b/arch/x86/include/asm/user.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_USER_H
+#define _ASM_X86_USER_H
+
+#ifdef CONFIG_X86_32
+# include <asm/user_32.h>
+#else
+# include <asm/user_64.h>
+#endif
+
+#include <asm/types.h>
+
+struct user_ymmh_regs {
+ /* 16 * 16 bytes for each YMMH-reg */
+ __u32 ymmh_space[64];
+};
+
+struct user_xstate_header {
+ __u64 xfeatures;
+ __u64 reserved1[2];
+ __u64 reserved2[5];
+};
+
+/*
+ * The structure layout of user_xstateregs, used for exporting the
+ * extended register state through ptrace and core-dump (NT_X86_XSTATE note)
+ * interfaces will be same as the memory layout of xsave used by the processor
+ * (except for the bytes 464..511, which can be used by the software) and hence
+ * the size of this structure varies depending on the features supported by the
+ * processor and OS. The size of the structure that users need to use can be
+ * obtained by doing:
+ * cpuid_count(0xd, 0, &eax, &ptrace_xstateregs_struct_size, &ecx, &edx);
+ * i.e., cpuid.(eax=0xd,ecx=0).ebx will be the size that user (debuggers, etc.)
+ * need to use.
+ *
+ * For now, only the first 8 bytes of the software usable bytes[464..471] will
+ * be used and will be set to OS enabled xstate mask (which is same as the
+ * 64bit mask returned by the xgetbv's xCR0). Users (analyzing core dump
+ * remotely, etc.) can use this mask as well as the mask saved in the
+ * xstate_hdr bytes and interpret what states the processor/OS supports
+ * and what states are in modified/initialized conditions for the
+ * particular process/thread.
+ *
+ * Also when the user modifies certain state FP/SSE/etc through the
+ * ptrace interface, they must ensure that the header.xfeatures
+ * bytes[512..519] of the memory layout are updated correspondingly.
+ * i.e., for example when FP state is modified to a non-init state,
+ * header.xfeatures's bit 0 must be set to '1', when SSE is modified to
+ * non-init state, header.xfeatures's bit 1 must to be set to '1', etc.
+ */
+#define USER_XSTATE_FX_SW_WORDS 6
+#define USER_XSTATE_XCR0_WORD 0
+
+struct user_xstateregs {
+ struct {
+ __u64 fpx_space[58];
+ __u64 xstate_fx_sw[USER_XSTATE_FX_SW_WORDS];
+ } i387;
+ struct user_xstate_header header;
+ struct user_ymmh_regs ymmh;
+ /* further processor state extensions go here */
+};
+
+#endif /* _ASM_X86_USER_H */
diff --git a/arch/x86/include/asm/user32.h b/arch/x86/include/asm/user32.h
new file mode 100644
index 000000000..fa577312f
--- /dev/null
+++ b/arch/x86/include/asm/user32.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_USER32_H
+#define _ASM_X86_USER32_H
+
+/* IA32 compatible user structures for ptrace.
+ * These should be used for 32bit coredumps too. */
+
+struct user_i387_ia32_struct {
+ u32 cwd;
+ u32 swd;
+ u32 twd;
+ u32 fip;
+ u32 fcs;
+ u32 foo;
+ u32 fos;
+ u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
+};
+
+/* FSAVE frame with extensions */
+struct user32_fxsr_struct {
+ unsigned short cwd;
+ unsigned short swd;
+ unsigned short twd; /* not compatible to 64bit twd */
+ unsigned short fop;
+ int fip;
+ int fcs;
+ int foo;
+ int fos;
+ int mxcsr;
+ int reserved;
+ int st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+ int xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
+ int padding[56];
+};
+
+struct user_regs_struct32 {
+ __u32 ebx, ecx, edx, esi, edi, ebp, eax;
+ unsigned short ds, __ds, es, __es;
+ unsigned short fs, __fs, gs, __gs;
+ __u32 orig_eax, eip;
+ unsigned short cs, __cs;
+ __u32 eflags, esp;
+ unsigned short ss, __ss;
+};
+
+struct user32 {
+ struct user_regs_struct32 regs; /* Where the registers are actually stored */
+ int u_fpvalid; /* True if math co-processor being used. */
+ /* for this mess. Not yet used. */
+ struct user_i387_ia32_struct i387; /* Math Co-processor registers. */
+/* The rest of this junk is to help gdb figure out what goes where */
+ __u32 u_tsize; /* Text segment size (pages). */
+ __u32 u_dsize; /* Data segment size (pages). */
+ __u32 u_ssize; /* Stack segment size (pages). */
+ __u32 start_code; /* Starting virtual address of text. */
+ __u32 start_stack; /* Starting virtual address of stack area.
+ This is actually the bottom of the stack,
+ the top of the stack is always found in the
+ esp register. */
+ __u32 signal; /* Signal that caused the core dump. */
+ int reserved; /* No __u32er used */
+ __u32 u_ar0; /* Used by gdb to help find the values for */
+ /* the registers. */
+ __u32 u_fpstate; /* Math Co-processor pointer. */
+ __u32 magic; /* To uniquely identify a core file */
+ char u_comm[32]; /* User command that was responsible */
+ int u_debugreg[8];
+};
+
+
+#endif /* _ASM_X86_USER32_H */
diff --git a/arch/x86/include/asm/user_32.h b/arch/x86/include/asm/user_32.h
new file mode 100644
index 000000000..d72c3d66e
--- /dev/null
+++ b/arch/x86/include/asm/user_32.h
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_USER_32_H
+#define _ASM_X86_USER_32_H
+
+#include <asm/page.h>
+/* Core file format: The core file is written in such a way that gdb
+ can understand it and provide useful information to the user (under
+ linux we use the 'trad-core' bfd). There are quite a number of
+ obstacles to being able to view the contents of the floating point
+ registers, and until these are solved you will not be able to view the
+ contents of them. Actually, you can read in the core file and look at
+ the contents of the user struct to find out what the floating point
+ registers contain.
+ The actual file contents are as follows:
+ UPAGE: 1 page consisting of a user struct that tells gdb what is present
+ in the file. Directly after this is a copy of the task_struct, which
+ is currently not used by gdb, but it may come in useful at some point.
+ All of the registers are stored as part of the upage. The upage should
+ always be only one page.
+ DATA: The data area is stored. We use current->end_text to
+ current->brk to pick up all of the user variables, plus any memory
+ that may have been malloced. No attempt is made to determine if a page
+ is demand-zero or if a page is totally unused, we just cover the entire
+ range. All of the addresses are rounded in such a way that an integral
+ number of pages is written.
+ STACK: We need the stack information in order to get a meaningful
+ backtrace. We need to write the data from (esp) to
+ current->start_stack, so we round each of these off in order to be able
+ to write an integer number of pages.
+ The minimum core file size is 3 pages, or 12288 bytes.
+*/
+
+/*
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ *
+ * Provide support for the GDB 5.0+ PTRACE_{GET|SET}FPXREGS requests for
+ * interacting with the FXSR-format floating point environment. Floating
+ * point data can be accessed in the regular format in the usual manner,
+ * and both the standard and SIMD floating point data can be accessed via
+ * the new ptrace requests. In either case, changes to the FPU environment
+ * will be reflected in the task's state as expected.
+ */
+
+struct user_i387_struct {
+ long cwd;
+ long swd;
+ long twd;
+ long fip;
+ long fcs;
+ long foo;
+ long fos;
+ long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
+};
+
+struct user_fxsr_struct {
+ unsigned short cwd;
+ unsigned short swd;
+ unsigned short twd;
+ unsigned short fop;
+ long fip;
+ long fcs;
+ long foo;
+ long fos;
+ long mxcsr;
+ long reserved;
+ long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+ long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
+ long padding[56];
+};
+
+/*
+ * This is the old layout of "struct pt_regs", and
+ * is still the layout used by user mode (the new
+ * pt_regs doesn't have all registers as the kernel
+ * doesn't use the extra segment registers)
+ */
+struct user_regs_struct {
+ unsigned long bx;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long si;
+ unsigned long di;
+ unsigned long bp;
+ unsigned long ax;
+ unsigned long ds;
+ unsigned long es;
+ unsigned long fs;
+ unsigned long gs;
+ unsigned long orig_ax;
+ unsigned long ip;
+ unsigned long cs;
+ unsigned long flags;
+ unsigned long sp;
+ unsigned long ss;
+};
+
+/* When the kernel dumps core, it starts by dumping the user struct -
+ this will be used by gdb to figure out where the data and stack segments
+ are within the file, and what virtual addresses to use. */
+struct user{
+/* We start with the registers, to mimic the way that "memory" is returned
+ from the ptrace(3,...) function. */
+ struct user_regs_struct regs; /* Where the registers are actually stored */
+/* ptrace does not yet supply these. Someday.... */
+ int u_fpvalid; /* True if math co-processor being used. */
+ /* for this mess. Not yet used. */
+ struct user_i387_struct i387; /* Math Co-processor registers. */
+/* The rest of this junk is to help gdb figure out what goes where */
+ unsigned long int u_tsize; /* Text segment size (pages). */
+ unsigned long int u_dsize; /* Data segment size (pages). */
+ unsigned long int u_ssize; /* Stack segment size (pages). */
+ unsigned long start_code; /* Starting virtual address of text. */
+ unsigned long start_stack; /* Starting virtual address of stack area.
+ This is actually the bottom of the stack,
+ the top of the stack is always found in the
+ esp register. */
+ long int signal; /* Signal that caused the core dump. */
+ int reserved; /* No longer used */
+ unsigned long u_ar0; /* Used by gdb to help find the values for */
+ /* the registers. */
+ struct user_i387_struct *u_fpstate; /* Math Co-processor pointer. */
+ unsigned long magic; /* To uniquely identify a core file */
+ char u_comm[32]; /* User command that was responsible */
+ int u_debugreg[8];
+};
+#define NBPG PAGE_SIZE
+#define UPAGES 1
+#define HOST_TEXT_START_ADDR (u.start_code)
+#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG)
+
+#endif /* _ASM_X86_USER_32_H */
diff --git a/arch/x86/include/asm/user_64.h b/arch/x86/include/asm/user_64.h
new file mode 100644
index 000000000..db9099236
--- /dev/null
+++ b/arch/x86/include/asm/user_64.h
@@ -0,0 +1,138 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_USER_64_H
+#define _ASM_X86_USER_64_H
+
+#include <asm/types.h>
+#include <asm/page.h>
+/* Core file format: The core file is written in such a way that gdb
+ can understand it and provide useful information to the user.
+ There are quite a number of obstacles to being able to view the
+ contents of the floating point registers, and until these are
+ solved you will not be able to view the contents of them.
+ Actually, you can read in the core file and look at the contents of
+ the user struct to find out what the floating point registers
+ contain.
+
+ The actual file contents are as follows:
+ UPAGE: 1 page consisting of a user struct that tells gdb what is present
+ in the file. Directly after this is a copy of the task_struct, which
+ is currently not used by gdb, but it may come in useful at some point.
+ All of the registers are stored as part of the upage. The upage should
+ always be only one page.
+ DATA: The data area is stored. We use current->end_text to
+ current->brk to pick up all of the user variables, plus any memory
+ that may have been malloced. No attempt is made to determine if a page
+ is demand-zero or if a page is totally unused, we just cover the entire
+ range. All of the addresses are rounded in such a way that an integral
+ number of pages is written.
+ STACK: We need the stack information in order to get a meaningful
+ backtrace. We need to write the data from (esp) to
+ current->start_stack, so we round each of these off in order to be able
+ to write an integer number of pages.
+ The minimum core file size is 3 pages, or 12288 bytes. */
+
+/*
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ *
+ * Provide support for the GDB 5.0+ PTRACE_{GET|SET}FPXREGS requests for
+ * interacting with the FXSR-format floating point environment. Floating
+ * point data can be accessed in the regular format in the usual manner,
+ * and both the standard and SIMD floating point data can be accessed via
+ * the new ptrace requests. In either case, changes to the FPU environment
+ * will be reflected in the task's state as expected.
+ *
+ * x86-64 support by Andi Kleen.
+ */
+
+/* This matches the 64bit FXSAVE format as defined by AMD. It is the same
+ as the 32bit format defined by Intel, except that the selector:offset pairs
+ for data and eip are replaced with flat 64bit pointers. */
+struct user_i387_struct {
+ unsigned short cwd;
+ unsigned short swd;
+ unsigned short twd; /* Note this is not the same as
+ the 32bit/x87/FSAVE twd */
+ unsigned short fop;
+ __u64 rip;
+ __u64 rdp;
+ __u32 mxcsr;
+ __u32 mxcsr_mask;
+ __u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+ __u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
+ __u32 padding[24];
+};
+
+/*
+ * Segment register layout in coredumps.
+ */
+struct user_regs_struct {
+ unsigned long r15;
+ unsigned long r14;
+ unsigned long r13;
+ unsigned long r12;
+ unsigned long bp;
+ unsigned long bx;
+ unsigned long r11;
+ unsigned long r10;
+ unsigned long r9;
+ unsigned long r8;
+ unsigned long ax;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long si;
+ unsigned long di;
+ unsigned long orig_ax;
+ unsigned long ip;
+ unsigned long cs;
+ unsigned long flags;
+ unsigned long sp;
+ unsigned long ss;
+ unsigned long fs_base;
+ unsigned long gs_base;
+ unsigned long ds;
+ unsigned long es;
+ unsigned long fs;
+ unsigned long gs;
+};
+
+/* When the kernel dumps core, it starts by dumping the user struct -
+ this will be used by gdb to figure out where the data and stack segments
+ are within the file, and what virtual addresses to use. */
+
+struct user {
+/* We start with the registers, to mimic the way that "memory" is returned
+ from the ptrace(3,...) function. */
+ struct user_regs_struct regs; /* Where the registers are actually stored */
+/* ptrace does not yet supply these. Someday.... */
+ int u_fpvalid; /* True if math co-processor being used. */
+ /* for this mess. Not yet used. */
+ int pad0;
+ struct user_i387_struct i387; /* Math Co-processor registers. */
+/* The rest of this junk is to help gdb figure out what goes where */
+ unsigned long int u_tsize; /* Text segment size (pages). */
+ unsigned long int u_dsize; /* Data segment size (pages). */
+ unsigned long int u_ssize; /* Stack segment size (pages). */
+ unsigned long start_code; /* Starting virtual address of text. */
+ unsigned long start_stack; /* Starting virtual address of stack area.
+ This is actually the bottom of the stack,
+ the top of the stack is always found in the
+ esp register. */
+ long int signal; /* Signal that caused the core dump. */
+ int reserved; /* No longer used */
+ int pad1;
+ unsigned long u_ar0; /* Used by gdb to help find the values for */
+ /* the registers. */
+ struct user_i387_struct *u_fpstate; /* Math Co-processor pointer. */
+ unsigned long magic; /* To uniquely identify a core file */
+ char u_comm[32]; /* User command that was responsible */
+ unsigned long u_debugreg[8];
+ unsigned long error_code; /* CPU error code or 0 */
+ unsigned long fault_address; /* CR3 or 0 */
+};
+#define NBPG PAGE_SIZE
+#define UPAGES 1
+#define HOST_TEXT_START_ADDR (u.start_code)
+#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG)
+
+#endif /* _ASM_X86_USER_64_H */
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
new file mode 100644
index 000000000..3f697a9e3
--- /dev/null
+++ b/arch/x86/include/asm/uv/bios.h
@@ -0,0 +1,176 @@
+#ifndef _ASM_X86_UV_BIOS_H
+#define _ASM_X86_UV_BIOS_H
+
+/*
+ * UV BIOS layer definitions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) Russ Anderson <rja@sgi.com>
+ */
+
+#include <linux/rtc.h>
+
+/*
+ * Values for the BIOS calls. It is passed as the first * argument in the
+ * BIOS call. Passing any other value in the first argument will result
+ * in a BIOS_STATUS_UNIMPLEMENTED return status.
+ */
+enum uv_bios_cmd {
+ UV_BIOS_COMMON,
+ UV_BIOS_GET_SN_INFO,
+ UV_BIOS_FREQ_BASE,
+ UV_BIOS_WATCHLIST_ALLOC,
+ UV_BIOS_WATCHLIST_FREE,
+ UV_BIOS_MEMPROTECT,
+ UV_BIOS_GET_PARTITION_ADDR,
+ UV_BIOS_SET_LEGACY_VGA_TARGET
+};
+
+/*
+ * Status values returned from a BIOS call.
+ */
+enum {
+ BIOS_STATUS_MORE_PASSES = 1,
+ BIOS_STATUS_SUCCESS = 0,
+ BIOS_STATUS_UNIMPLEMENTED = -ENOSYS,
+ BIOS_STATUS_EINVAL = -EINVAL,
+ BIOS_STATUS_UNAVAIL = -EBUSY,
+ BIOS_STATUS_ABORT = -EINTR,
+};
+
+/* Address map parameters */
+struct uv_gam_parameters {
+ u64 mmr_base;
+ u64 gru_base;
+ u8 mmr_shift; /* Convert PNode to MMR space offset */
+ u8 gru_shift; /* Convert PNode to GRU space offset */
+ u8 gpa_shift; /* Size of offset field in GRU phys addr */
+ u8 unused1;
+};
+
+/* UV_TABLE_GAM_RANGE_ENTRY values */
+#define UV_GAM_RANGE_TYPE_UNUSED 0 /* End of table */
+#define UV_GAM_RANGE_TYPE_RAM 1 /* Normal RAM */
+#define UV_GAM_RANGE_TYPE_NVRAM 2 /* Non-volatile memory */
+#define UV_GAM_RANGE_TYPE_NV_WINDOW 3 /* NVMDIMM block window */
+#define UV_GAM_RANGE_TYPE_NV_MAILBOX 4 /* NVMDIMM mailbox */
+#define UV_GAM_RANGE_TYPE_HOLE 5 /* Unused address range */
+#define UV_GAM_RANGE_TYPE_MAX 6
+
+/* The structure stores PA bits 56:26, for 64MB granularity */
+#define UV_GAM_RANGE_SHFT 26 /* 64MB */
+
+struct uv_gam_range_entry {
+ char type; /* Entry type: GAM_RANGE_TYPE_UNUSED, etc. */
+ char unused1;
+ u16 nasid; /* HNasid */
+ u16 sockid; /* Socket ID, high bits of APIC ID */
+ u16 pnode; /* Index to MMR and GRU spaces */
+ u32 unused2;
+ u32 limit; /* PA bits 56:26 (UV_GAM_RANGE_SHFT) */
+};
+
+#define UV_SYSTAB_SIG "UVST"
+#define UV_SYSTAB_VERSION_1 1 /* UV1/2/3 BIOS version */
+#define UV_SYSTAB_VERSION_UV4 0x400 /* UV4 BIOS base version */
+#define UV_SYSTAB_VERSION_UV4_1 0x401 /* + gpa_shift */
+#define UV_SYSTAB_VERSION_UV4_2 0x402 /* + TYPE_NVRAM/WINDOW/MBOX */
+#define UV_SYSTAB_VERSION_UV4_3 0x403 /* - GAM Range PXM Value */
+#define UV_SYSTAB_VERSION_UV4_LATEST UV_SYSTAB_VERSION_UV4_3
+
+#define UV_SYSTAB_TYPE_UNUSED 0 /* End of table (offset == 0) */
+#define UV_SYSTAB_TYPE_GAM_PARAMS 1 /* GAM PARAM conversions */
+#define UV_SYSTAB_TYPE_GAM_RNG_TBL 2 /* GAM entry table */
+#define UV_SYSTAB_TYPE_MAX 3
+
+/*
+ * The UV system table describes specific firmware
+ * capabilities available to the Linux kernel at runtime.
+ */
+struct uv_systab {
+ char signature[4]; /* must be UV_SYSTAB_SIG */
+ u32 revision; /* distinguish different firmware revs */
+ u64 function; /* BIOS runtime callback function ptr */
+ u32 size; /* systab size (starting with _VERSION_UV4) */
+ struct {
+ u32 type:8; /* type of entry */
+ u32 offset:24; /* byte offset from struct start to entry */
+ } entry[1]; /* additional entries follow */
+};
+extern struct uv_systab *uv_systab;
+/* (... end of definitions from UV BIOS ...) */
+
+enum {
+ BIOS_FREQ_BASE_PLATFORM = 0,
+ BIOS_FREQ_BASE_INTERVAL_TIMER = 1,
+ BIOS_FREQ_BASE_REALTIME_CLOCK = 2
+};
+
+union partition_info_u {
+ u64 val;
+ struct {
+ u64 hub_version : 8,
+ partition_id : 16,
+ coherence_id : 16,
+ region_size : 24;
+ };
+};
+
+enum uv_memprotect {
+ UV_MEMPROT_RESTRICT_ACCESS,
+ UV_MEMPROT_ALLOW_AMO,
+ UV_MEMPROT_ALLOW_RW
+};
+
+/*
+ * bios calls have 6 parameters
+ */
+extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64);
+extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64);
+extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64);
+
+extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *, long *);
+extern s64 uv_bios_freq_base(u64, u64 *);
+extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int,
+ unsigned long *);
+extern int uv_bios_mq_watchlist_free(int, int);
+extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect);
+extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *);
+extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus);
+
+#ifdef CONFIG_EFI
+extern void uv_bios_init(void);
+#else
+void uv_bios_init(void) { }
+#endif
+
+extern unsigned long sn_rtc_cycles_per_second;
+extern int uv_type;
+extern long sn_partition_id;
+extern long sn_coherency_id;
+extern long sn_region_size;
+extern long system_serial_number;
+#define uv_partition_coherence_id() (sn_coherency_id)
+
+extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */
+
+/*
+ * EFI runtime lock; cf. firmware/efi/runtime-wrappers.c for details
+ */
+extern struct semaphore __efi_uv_runtime_lock;
+
+#endif /* _ASM_X86_UV_BIOS_H */
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
new file mode 100644
index 000000000..e60c45fd3
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_UV_UV_H
+#define _ASM_X86_UV_UV_H
+
+#include <asm/tlbflush.h>
+
+enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
+
+struct cpumask;
+struct mm_struct;
+
+#ifdef CONFIG_X86_UV
+#include <linux/efi.h>
+
+extern enum uv_system_type get_uv_system_type(void);
+static inline bool is_early_uv_system(void)
+{
+ return !((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || !efi.uv_systab);
+}
+extern int is_uv_system(void);
+extern int is_uv_hubless(void);
+extern void uv_cpu_init(void);
+extern void uv_nmi_init(void);
+extern void uv_system_init(void);
+extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info);
+
+#else /* X86_UV */
+
+static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; }
+static inline bool is_early_uv_system(void) { return 0; }
+static inline int is_uv_system(void) { return 0; }
+static inline int is_uv_hubless(void) { return 0; }
+static inline void uv_cpu_init(void) { }
+static inline void uv_system_init(void) { }
+static inline const struct cpumask *
+uv_flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
+{ return cpumask; }
+
+#endif /* X86_UV */
+
+#endif /* _ASM_X86_UV_UV_H */
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
new file mode 100644
index 000000000..7803114aa
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -0,0 +1,861 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV Broadcast Assist Unit definitions
+ *
+ * Copyright (C) 2008-2011 Silicon Graphics, Inc. All rights reserved.
+ */
+
+#ifndef _ASM_X86_UV_UV_BAU_H
+#define _ASM_X86_UV_UV_BAU_H
+
+#include <linux/bitmap.h>
+#define BITSPERBYTE 8
+
+/*
+ * Broadcast Assist Unit messaging structures
+ *
+ * Selective Broadcast activations are induced by software action
+ * specifying a particular 8-descriptor "set" via a 6-bit index written
+ * to an MMR.
+ * Thus there are 64 unique 512-byte sets of SB descriptors - one set for
+ * each 6-bit index value. These descriptor sets are mapped in sequence
+ * starting with set 0 located at the address specified in the
+ * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512,
+ * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on.
+ *
+ * We will use one set for sending BAU messages from each of the
+ * cpu's on the uvhub.
+ *
+ * TLB shootdown will use the first of the 8 descriptors of each set.
+ * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
+ */
+
+#define MAX_CPUS_PER_UVHUB 128
+#define MAX_CPUS_PER_SOCKET 64
+#define ADP_SZ 64 /* hardware-provided max. */
+#define UV_CPUS_PER_AS 32 /* hardware-provided max. */
+#define ITEMS_PER_DESC 8
+/* the 'throttle' to prevent the hardware stay-busy bug */
+#define MAX_BAU_CONCURRENT 3
+#define UV_ACT_STATUS_MASK 0x3
+#define UV_ACT_STATUS_SIZE 2
+#define UV_DISTRIBUTION_SIZE 256
+#define UV_SW_ACK_NPENDING 8
+#define UV1_NET_ENDPOINT_INTD 0x38
+#define UV2_NET_ENDPOINT_INTD 0x28
+#define UV_NET_ENDPOINT_INTD (is_uv1_hub() ? \
+ UV1_NET_ENDPOINT_INTD : UV2_NET_ENDPOINT_INTD)
+#define UV_PAYLOADQ_GNODE_SHIFT 49
+#define UV_PTC_BASENAME "sgi_uv/ptc_statistics"
+#define UV_BAU_BASENAME "sgi_uv/bau_tunables"
+#define UV_BAU_TUNABLES_DIR "sgi_uv"
+#define UV_BAU_TUNABLES_FILE "bau_tunables"
+#define WHITESPACE " \t\n"
+#define cpubit_isset(cpu, bau_local_cpumask) \
+ test_bit((cpu), (bau_local_cpumask).bits)
+
+/* [19:16] SOFT_ACK timeout period 19: 1 is urgency 7 17:16 1 is multiplier */
+/*
+ * UV2: Bit 19 selects between
+ * (0): 10 microsecond timebase and
+ * (1): 80 microseconds
+ * we're using 560us, similar to UV1: 65 units of 10us
+ */
+#define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL)
+#define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (15UL)
+
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD (is_uv1_hub() ? \
+ UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD : \
+ UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD)
+/* assuming UV3 is the same */
+
+#define BAU_MISC_CONTROL_MULT_MASK 3
+
+#define UVH_AGING_PRESCALE_SEL 0x000000b000UL
+/* [30:28] URGENCY_7 an index into a table of times */
+#define BAU_URGENCY_7_SHIFT 28
+#define BAU_URGENCY_7_MASK 7
+
+#define UVH_TRANSACTION_TIMEOUT 0x000000b200UL
+/* [45:40] BAU - BAU transaction timeout select - a multiplier */
+#define BAU_TRANS_SHIFT 40
+#define BAU_TRANS_MASK 0x3f
+
+/*
+ * shorten some awkward names
+ */
+#define AS_PUSH_SHIFT UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT
+#define SOFTACK_MSHIFT UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT
+#define SOFTACK_PSHIFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT
+#define SOFTACK_TIMEOUT_PERIOD UV_INTD_SOFT_ACK_TIMEOUT_PERIOD
+#define PREFETCH_HINT_SHFT UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_PREFETCH_HINT_SHFT
+#define SB_STATUS_SHFT UV3H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT
+#define write_gmmr uv_write_global_mmr64
+#define write_lmmr uv_write_local_mmr
+#define read_lmmr uv_read_local_mmr
+#define read_gmmr uv_read_global_mmr64
+
+/*
+ * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
+ */
+#define DS_IDLE 0
+#define DS_ACTIVE 1
+#define DS_DESTINATION_TIMEOUT 2
+#define DS_SOURCE_TIMEOUT 3
+/*
+ * bits put together from HRP_LB_BAU_SB_ACTIVATION_STATUS_0/1/2
+ * values 1 and 3 will not occur
+ * Decoded meaning ERROR BUSY AUX ERR
+ * ------------------------------- ---- ----- -------
+ * IDLE 0 0 0
+ * BUSY (active) 0 1 0
+ * SW Ack Timeout (destination) 1 0 0
+ * SW Ack INTD rejected (strong NACK) 1 0 1
+ * Source Side Time Out Detected 1 1 0
+ * Destination Side PUT Failed 1 1 1
+ */
+#define UV2H_DESC_IDLE 0
+#define UV2H_DESC_BUSY 2
+#define UV2H_DESC_DEST_TIMEOUT 4
+#define UV2H_DESC_DEST_STRONG_NACK 5
+#define UV2H_DESC_SOURCE_TIMEOUT 6
+#define UV2H_DESC_DEST_PUT_ERR 7
+
+/*
+ * delay for 'plugged' timeout retries, in microseconds
+ */
+#define PLUGGED_DELAY 10
+
+/*
+ * threshholds at which to use IPI to free resources
+ */
+/* after this # consecutive 'plugged' timeouts, use IPI to release resources */
+#define PLUGSB4RESET 100
+/* after this many consecutive timeouts, use IPI to release resources */
+#define TIMEOUTSB4RESET 1
+/* at this number uses of IPI to release resources, giveup the request */
+#define IPI_RESET_LIMIT 1
+/* after this # consecutive successes, bump up the throttle if it was lowered */
+#define COMPLETE_THRESHOLD 5
+/* after this # of giveups (fall back to kernel IPI's) disable the use of
+ the BAU for a period of time */
+#define GIVEUP_LIMIT 100
+
+#define UV_LB_SUBNODEID 0x10
+
+/* these two are the same for UV1 and UV2: */
+#define UV_SA_SHFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT
+#define UV_SA_MASK UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK
+/* 4 bits of software ack period */
+#define UV2_ACK_MASK 0x7UL
+#define UV2_ACK_UNITS_SHFT 3
+#define UV2_EXT_SHFT UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT
+
+/*
+ * number of entries in the destination side payload queue
+ */
+#define DEST_Q_SIZE 20
+/*
+ * number of destination side software ack resources
+ */
+#define DEST_NUM_RESOURCES 8
+/*
+ * completion statuses for sending a TLB flush message
+ */
+#define FLUSH_RETRY_PLUGGED 1
+#define FLUSH_RETRY_TIMEOUT 2
+#define FLUSH_GIVEUP 3
+#define FLUSH_COMPLETE 4
+
+/*
+ * tuning the action when the numalink network is extremely delayed
+ */
+#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in
+ microseconds */
+#define CONGESTED_REPS 10 /* long delays averaged over
+ this many broadcasts */
+#define DISABLED_PERIOD 10 /* time for the bau to be
+ disabled, in seconds */
+/* see msg_type: */
+#define MSG_NOOP 0
+#define MSG_REGULAR 1
+#define MSG_RETRY 2
+
+#define BAU_DESC_QUALIFIER 0x534749
+
+enum uv_bau_version {
+ UV_BAU_V1 = 1,
+ UV_BAU_V2,
+ UV_BAU_V3,
+ UV_BAU_V4,
+};
+
+/*
+ * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
+ * If the 'multilevel' flag in the header portion of the descriptor
+ * has been set to 0, then endpoint multi-unicast mode is selected.
+ * The distribution specification (32 bytes) is interpreted as a 256-bit
+ * distribution vector. Adjacent bits correspond to consecutive even numbered
+ * nodeIDs. The result of adding the index of a given bit to the 15-bit
+ * 'base_dest_nasid' field of the header corresponds to the
+ * destination nodeID associated with that specified bit.
+ */
+struct pnmask {
+ unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)];
+};
+
+/*
+ * mask of cpu's on a uvhub
+ * (during initialization we need to check that unsigned long has
+ * enough bits for max. cpu's per uvhub)
+ */
+struct bau_local_cpumask {
+ unsigned long bits;
+};
+
+/*
+ * Payload: 16 bytes (128 bits) (bytes 0x20-0x2f of descriptor)
+ * only 12 bytes (96 bits) of the payload area are usable.
+ * An additional 3 bytes (bits 27:4) of the header address are carried
+ * to the next bytes of the destination payload queue.
+ * And an additional 2 bytes of the header Suppl_A field are also
+ * carried to the destination payload queue.
+ * But the first byte of the Suppl_A becomes bits 127:120 (the 16th byte)
+ * of the destination payload queue, which is written by the hardware
+ * with the s/w ack resource bit vector.
+ * [ effective message contents (16 bytes (128 bits) maximum), not counting
+ * the s/w ack bit vector ]
+ */
+
+/**
+ * struct uv1_2_3_bau_msg_payload - defines payload for INTD transactions
+ * @address: Signifies a page or all TLB's of the cpu
+ * @sending_cpu: CPU from which the message originates
+ * @acknowledge_count: CPUs on the destination Hub that received the interrupt
+ */
+struct uv1_2_3_bau_msg_payload {
+ u64 address;
+ u16 sending_cpu;
+ u16 acknowledge_count;
+};
+
+/**
+ * struct uv4_bau_msg_payload - defines payload for INTD transactions
+ * @address: Signifies a page or all TLB's of the cpu
+ * @sending_cpu: CPU from which the message originates
+ * @acknowledge_count: CPUs on the destination Hub that received the interrupt
+ * @qualifier: Set by source to verify origin of INTD broadcast
+ */
+struct uv4_bau_msg_payload {
+ u64 address;
+ u16 sending_cpu;
+ u16 acknowledge_count;
+ u32 reserved:8;
+ u32 qualifier:24;
+};
+
+/*
+ * UV1 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
+ * see table 4.2.3.0.1 in broacast_assist spec.
+ */
+struct uv1_bau_msg_header {
+ unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */
+ /* bits 5:0 */
+ unsigned int base_dest_nasid:15; /* nasid of the first bit */
+ /* bits 20:6 */ /* in uvhub map */
+ unsigned int command:8; /* message type */
+ /* bits 28:21 */
+ /* 0x38: SN3net EndPoint Message */
+ unsigned int rsvd_1:3; /* must be zero */
+ /* bits 31:29 */
+ /* int will align on 32 bits */
+ unsigned int rsvd_2:9; /* must be zero */
+ /* bits 40:32 */
+ /* Suppl_A is 56-41 */
+ unsigned int sequence:16; /* message sequence number */
+ /* bits 56:41 */ /* becomes bytes 16-17 of msg */
+ /* Address field (96:57) is
+ never used as an address
+ (these are address bits
+ 42:3) */
+
+ unsigned int rsvd_3:1; /* must be zero */
+ /* bit 57 */
+ /* address bits 27:4 are payload */
+ /* these next 24 (58-81) bits become bytes 12-14 of msg */
+ /* bits 65:58 land in byte 12 */
+ unsigned int replied_to:1; /* sent as 0 by the source to
+ byte 12 */
+ /* bit 58 */
+ unsigned int msg_type:3; /* software type of the
+ message */
+ /* bits 61:59 */
+ unsigned int canceled:1; /* message canceled, resource
+ is to be freed*/
+ /* bit 62 */
+ unsigned int payload_1a:1; /* not currently used */
+ /* bit 63 */
+ unsigned int payload_1b:2; /* not currently used */
+ /* bits 65:64 */
+
+ /* bits 73:66 land in byte 13 */
+ unsigned int payload_1ca:6; /* not currently used */
+ /* bits 71:66 */
+ unsigned int payload_1c:2; /* not currently used */
+ /* bits 73:72 */
+
+ /* bits 81:74 land in byte 14 */
+ unsigned int payload_1d:6; /* not currently used */
+ /* bits 79:74 */
+ unsigned int payload_1e:2; /* not currently used */
+ /* bits 81:80 */
+
+ unsigned int rsvd_4:7; /* must be zero */
+ /* bits 88:82 */
+ unsigned int swack_flag:1; /* software acknowledge flag */
+ /* bit 89 */
+ /* INTD trasactions at
+ destination are to wait for
+ software acknowledge */
+ unsigned int rsvd_5:6; /* must be zero */
+ /* bits 95:90 */
+ unsigned int rsvd_6:5; /* must be zero */
+ /* bits 100:96 */
+ unsigned int int_both:1; /* if 1, interrupt both sockets
+ on the uvhub */
+ /* bit 101*/
+ unsigned int fairness:3; /* usually zero */
+ /* bits 104:102 */
+ unsigned int multilevel:1; /* multi-level multicast
+ format */
+ /* bit 105 */
+ /* 0 for TLB: endpoint multi-unicast messages */
+ unsigned int chaining:1; /* next descriptor is part of
+ this activation*/
+ /* bit 106 */
+ unsigned int rsvd_7:21; /* must be zero */
+ /* bits 127:107 */
+};
+
+/*
+ * UV2 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
+ * see figure 9-2 of harp_sys.pdf
+ * assuming UV3 is the same
+ */
+struct uv2_3_bau_msg_header {
+ unsigned int base_dest_nasid:15; /* nasid of the first bit */
+ /* bits 14:0 */ /* in uvhub map */
+ unsigned int dest_subnodeid:5; /* must be 0x10, for the LB */
+ /* bits 19:15 */
+ unsigned int rsvd_1:1; /* must be zero */
+ /* bit 20 */
+ /* Address bits 59:21 */
+ /* bits 25:2 of address (44:21) are payload */
+ /* these next 24 bits become bytes 12-14 of msg */
+ /* bits 28:21 land in byte 12 */
+ unsigned int replied_to:1; /* sent as 0 by the source to
+ byte 12 */
+ /* bit 21 */
+ unsigned int msg_type:3; /* software type of the
+ message */
+ /* bits 24:22 */
+ unsigned int canceled:1; /* message canceled, resource
+ is to be freed*/
+ /* bit 25 */
+ unsigned int payload_1:3; /* not currently used */
+ /* bits 28:26 */
+
+ /* bits 36:29 land in byte 13 */
+ unsigned int payload_2a:3; /* not currently used */
+ unsigned int payload_2b:5; /* not currently used */
+ /* bits 36:29 */
+
+ /* bits 44:37 land in byte 14 */
+ unsigned int payload_3:8; /* not currently used */
+ /* bits 44:37 */
+
+ unsigned int rsvd_2:7; /* reserved */
+ /* bits 51:45 */
+ unsigned int swack_flag:1; /* software acknowledge flag */
+ /* bit 52 */
+ unsigned int rsvd_3a:3; /* must be zero */
+ unsigned int rsvd_3b:8; /* must be zero */
+ unsigned int rsvd_3c:8; /* must be zero */
+ unsigned int rsvd_3d:3; /* must be zero */
+ /* bits 74:53 */
+ unsigned int fairness:3; /* usually zero */
+ /* bits 77:75 */
+
+ unsigned int sequence:16; /* message sequence number */
+ /* bits 93:78 Suppl_A */
+ unsigned int chaining:1; /* next descriptor is part of
+ this activation*/
+ /* bit 94 */
+ unsigned int multilevel:1; /* multi-level multicast
+ format */
+ /* bit 95 */
+ unsigned int rsvd_4:24; /* ordered / source node /
+ source subnode / aging
+ must be zero */
+ /* bits 119:96 */
+ unsigned int command:8; /* message type */
+ /* bits 127:120 */
+};
+
+/*
+ * The activation descriptor:
+ * The format of the message to send, plus all accompanying control
+ * Should be 64 bytes
+ */
+struct bau_desc {
+ struct pnmask distribution;
+ /*
+ * message template, consisting of header and payload:
+ */
+ union bau_msg_header {
+ struct uv1_bau_msg_header uv1_hdr;
+ struct uv2_3_bau_msg_header uv2_3_hdr;
+ } header;
+
+ union bau_payload_header {
+ struct uv1_2_3_bau_msg_payload uv1_2_3;
+ struct uv4_bau_msg_payload uv4;
+ } payload;
+};
+/* UV1:
+ * -payload-- ---------header------
+ * bytes 0-11 bits 41-56 bits 58-81
+ * A B (2) C (3)
+ *
+ * A/B/C are moved to:
+ * A C B
+ * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector)
+ * ------------payload queue-----------
+ */
+/* UV2:
+ * -payload-- ---------header------
+ * bytes 0-11 bits 70-78 bits 21-44
+ * A B (2) C (3)
+ *
+ * A/B/C are moved to:
+ * A C B
+ * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector)
+ * ------------payload queue-----------
+ */
+
+/*
+ * The payload queue on the destination side is an array of these.
+ * With BAU_MISC_CONTROL set for software acknowledge mode, the messages
+ * are 32 bytes (2 micropackets) (256 bits) in length, but contain only 17
+ * bytes of usable data, including the sw ack vector in byte 15 (bits 127:120)
+ * (12 bytes come from bau_msg_payload, 3 from payload_1, 2 from
+ * swack_vec and payload_2)
+ * "Enabling Software Acknowledgment mode (see Section 4.3.3 Software
+ * Acknowledge Processing) also selects 32 byte (17 bytes usable) payload
+ * operation."
+ */
+struct bau_pq_entry {
+ unsigned long address; /* signifies a page or all TLB's
+ of the cpu */
+ /* 64 bits, bytes 0-7 */
+ unsigned short sending_cpu; /* cpu that sent the message */
+ /* 16 bits, bytes 8-9 */
+ unsigned short acknowledge_count; /* filled in by destination */
+ /* 16 bits, bytes 10-11 */
+ /* these next 3 bytes come from bits 58-81 of the message header */
+ unsigned short replied_to:1; /* sent as 0 by the source */
+ unsigned short msg_type:3; /* software message type */
+ unsigned short canceled:1; /* sent as 0 by the source */
+ unsigned short unused1:3; /* not currently using */
+ /* byte 12 */
+ unsigned char unused2a; /* not currently using */
+ /* byte 13 */
+ unsigned char unused2; /* not currently using */
+ /* byte 14 */
+ unsigned char swack_vec; /* filled in by the hardware */
+ /* byte 15 (bits 127:120) */
+ unsigned short sequence; /* message sequence number */
+ /* bytes 16-17 */
+ unsigned char unused4[2]; /* not currently using bytes 18-19 */
+ /* bytes 18-19 */
+ int number_of_cpus; /* filled in at destination */
+ /* 32 bits, bytes 20-23 (aligned) */
+ unsigned char unused5[8]; /* not using */
+ /* bytes 24-31 */
+};
+
+struct msg_desc {
+ struct bau_pq_entry *msg;
+ int msg_slot;
+ struct bau_pq_entry *queue_first;
+ struct bau_pq_entry *queue_last;
+};
+
+struct reset_args {
+ int sender;
+};
+
+/*
+ * This structure is allocated per_cpu for UV TLB shootdown statistics.
+ */
+struct ptc_stats {
+ /* sender statistics */
+ unsigned long s_giveup; /* number of fall backs to
+ IPI-style flushes */
+ unsigned long s_requestor; /* number of shootdown
+ requests */
+ unsigned long s_stimeout; /* source side timeouts */
+ unsigned long s_dtimeout; /* destination side timeouts */
+ unsigned long s_strongnacks; /* number of strong nack's */
+ unsigned long s_time; /* time spent in sending side */
+ unsigned long s_retriesok; /* successful retries */
+ unsigned long s_ntargcpu; /* total number of cpu's
+ targeted */
+ unsigned long s_ntargself; /* times the sending cpu was
+ targeted */
+ unsigned long s_ntarglocals; /* targets of cpus on the local
+ blade */
+ unsigned long s_ntargremotes; /* targets of cpus on remote
+ blades */
+ unsigned long s_ntarglocaluvhub; /* targets of the local hub */
+ unsigned long s_ntargremoteuvhub; /* remotes hubs targeted */
+ unsigned long s_ntarguvhub; /* total number of uvhubs
+ targeted */
+ unsigned long s_ntarguvhub16; /* number of times target
+ hubs >= 16*/
+ unsigned long s_ntarguvhub8; /* number of times target
+ hubs >= 8 */
+ unsigned long s_ntarguvhub4; /* number of times target
+ hubs >= 4 */
+ unsigned long s_ntarguvhub2; /* number of times target
+ hubs >= 2 */
+ unsigned long s_ntarguvhub1; /* number of times target
+ hubs == 1 */
+ unsigned long s_resets_plug; /* ipi-style resets from plug
+ state */
+ unsigned long s_resets_timeout; /* ipi-style resets from
+ timeouts */
+ unsigned long s_busy; /* status stayed busy past
+ s/w timer */
+ unsigned long s_throttles; /* waits in throttle */
+ unsigned long s_retry_messages; /* retry broadcasts */
+ unsigned long s_bau_reenabled; /* for bau enable/disable */
+ unsigned long s_bau_disabled; /* for bau enable/disable */
+ unsigned long s_uv2_wars; /* uv2 workaround, perm. busy */
+ unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */
+ unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */
+ unsigned long s_overipilimit; /* over the ipi reset limit */
+ unsigned long s_giveuplimit; /* disables, over giveup limit*/
+ unsigned long s_enters; /* entries to the driver */
+ unsigned long s_ipifordisabled; /* fall back to IPI; disabled */
+ unsigned long s_plugged; /* plugged by h/w bug*/
+ unsigned long s_congested; /* giveup on long wait */
+ /* destination statistics */
+ unsigned long d_alltlb; /* times all tlb's on this
+ cpu were flushed */
+ unsigned long d_onetlb; /* times just one tlb on this
+ cpu was flushed */
+ unsigned long d_multmsg; /* interrupts with multiple
+ messages */
+ unsigned long d_nomsg; /* interrupts with no message */
+ unsigned long d_time; /* time spent on destination
+ side */
+ unsigned long d_requestee; /* number of messages
+ processed */
+ unsigned long d_retries; /* number of retry messages
+ processed */
+ unsigned long d_canceled; /* number of messages canceled
+ by retries */
+ unsigned long d_nocanceled; /* retries that found nothing
+ to cancel */
+ unsigned long d_resets; /* number of ipi-style requests
+ processed */
+ unsigned long d_rcanceled; /* number of messages canceled
+ by resets */
+};
+
+struct tunables {
+ int *tunp;
+ int deflt;
+};
+
+struct hub_and_pnode {
+ short uvhub;
+ short pnode;
+};
+
+struct socket_desc {
+ short num_cpus;
+ short cpu_number[MAX_CPUS_PER_SOCKET];
+};
+
+struct uvhub_desc {
+ unsigned short socket_mask;
+ short num_cpus;
+ short uvhub;
+ short pnode;
+ struct socket_desc socket[2];
+};
+
+/**
+ * struct bau_control
+ * @status_mmr: location of status mmr, determined by uvhub_cpu
+ * @status_index: index of ERR|BUSY bits in status mmr, determined by uvhub_cpu
+ *
+ * Per-cpu control struct containing CPU topology information and BAU tuneables.
+ */
+struct bau_control {
+ struct bau_desc *descriptor_base;
+ struct bau_pq_entry *queue_first;
+ struct bau_pq_entry *queue_last;
+ struct bau_pq_entry *bau_msg_head;
+ struct bau_control *uvhub_master;
+ struct bau_control *socket_master;
+ struct ptc_stats *statp;
+ cpumask_t *cpumask;
+ unsigned long timeout_interval;
+ unsigned long set_bau_on_time;
+ atomic_t active_descriptor_count;
+ int plugged_tries;
+ int timeout_tries;
+ int ipi_attempts;
+ int conseccompletes;
+ u64 status_mmr;
+ int status_index;
+ bool nobau;
+ short baudisabled;
+ short cpu;
+ short osnode;
+ short uvhub_cpu;
+ short uvhub;
+ short uvhub_version;
+ short cpus_in_socket;
+ short cpus_in_uvhub;
+ short partition_base_pnode;
+ short busy; /* all were busy (war) */
+ unsigned short message_number;
+ unsigned short uvhub_quiesce;
+ short socket_acknowledge_count[DEST_Q_SIZE];
+ cycles_t send_message;
+ cycles_t period_end;
+ cycles_t period_time;
+ spinlock_t uvhub_lock;
+ spinlock_t queue_lock;
+ spinlock_t disable_lock;
+ /* tunables */
+ int max_concurr;
+ int max_concurr_const;
+ int plugged_delay;
+ int plugsb4reset;
+ int timeoutsb4reset;
+ int ipi_reset_limit;
+ int complete_threshold;
+ int cong_response_us;
+ int cong_reps;
+ cycles_t disabled_period;
+ int period_giveups;
+ int giveup_limit;
+ long period_requests;
+ struct hub_and_pnode *thp;
+};
+
+/* Abstracted BAU functions */
+struct bau_operations {
+ unsigned long (*read_l_sw_ack)(void);
+ unsigned long (*read_g_sw_ack)(int pnode);
+ unsigned long (*bau_gpa_to_offset)(unsigned long vaddr);
+ void (*write_l_sw_ack)(unsigned long mmr);
+ void (*write_g_sw_ack)(int pnode, unsigned long mmr);
+ void (*write_payload_first)(int pnode, unsigned long mmr);
+ void (*write_payload_last)(int pnode, unsigned long mmr);
+ int (*wait_completion)(struct bau_desc*,
+ struct bau_control*, long try);
+};
+
+static inline void write_mmr_data_broadcast(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image);
+}
+
+static inline void write_mmr_descriptor_base(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, mmr_image);
+}
+
+static inline void write_mmr_activation(unsigned long index)
+{
+ write_lmmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
+}
+
+static inline void write_gmmr_activation(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, mmr_image);
+}
+
+static inline void write_mmr_proc_payload_first(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UV4H_LB_PROC_INTD_QUEUE_FIRST, mmr_image);
+}
+
+static inline void write_mmr_proc_payload_last(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UV4H_LB_PROC_INTD_QUEUE_LAST, mmr_image);
+}
+
+static inline void write_mmr_payload_first(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, mmr_image);
+}
+
+static inline void write_mmr_payload_tail(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, mmr_image);
+}
+
+static inline void write_mmr_payload_last(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, mmr_image);
+}
+
+static inline void write_mmr_misc_control(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
+}
+
+static inline unsigned long read_mmr_misc_control(int pnode)
+{
+ return read_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL);
+}
+
+static inline void write_mmr_sw_ack(unsigned long mr)
+{
+ uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
+}
+
+static inline void write_gmmr_sw_ack(int pnode, unsigned long mr)
+{
+ write_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
+}
+
+static inline unsigned long read_mmr_sw_ack(void)
+{
+ return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+}
+
+static inline unsigned long read_gmmr_sw_ack(int pnode)
+{
+ return read_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+}
+
+static inline void write_mmr_proc_sw_ack(unsigned long mr)
+{
+ uv_write_local_mmr(UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR, mr);
+}
+
+static inline void write_gmmr_proc_sw_ack(int pnode, unsigned long mr)
+{
+ write_gmmr(pnode, UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR, mr);
+}
+
+static inline unsigned long read_mmr_proc_sw_ack(void)
+{
+ return read_lmmr(UV4H_LB_PROC_INTD_SOFT_ACK_PENDING);
+}
+
+static inline unsigned long read_gmmr_proc_sw_ack(int pnode)
+{
+ return read_gmmr(pnode, UV4H_LB_PROC_INTD_SOFT_ACK_PENDING);
+}
+
+static inline void write_mmr_data_config(int pnode, unsigned long mr)
+{
+ uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, mr);
+}
+
+static inline int bau_uvhub_isset(int uvhub, struct pnmask *dstp)
+{
+ return constant_test_bit(uvhub, &dstp->bits[0]);
+}
+static inline void bau_uvhub_set(int pnode, struct pnmask *dstp)
+{
+ __set_bit(pnode, &dstp->bits[0]);
+}
+static inline void bau_uvhubs_clear(struct pnmask *dstp,
+ int nbits)
+{
+ bitmap_zero(&dstp->bits[0], nbits);
+}
+static inline int bau_uvhub_weight(struct pnmask *dstp)
+{
+ return bitmap_weight((unsigned long *)&dstp->bits[0],
+ UV_DISTRIBUTION_SIZE);
+}
+
+static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
+{
+ bitmap_zero(&dstp->bits, nbits);
+}
+
+extern void uv_bau_message_intr1(void);
+#ifdef CONFIG_TRACING
+#define trace_uv_bau_message_intr1 uv_bau_message_intr1
+#endif
+extern void uv_bau_timeout_intr1(void);
+
+struct atomic_short {
+ short counter;
+};
+
+/*
+ * atomic_read_short - read a short atomic variable
+ * @v: pointer of type atomic_short
+ *
+ * Atomically reads the value of @v.
+ */
+static inline int atomic_read_short(const struct atomic_short *v)
+{
+ return v->counter;
+}
+
+/*
+ * atom_asr - add and return a short int
+ * @i: short value to add
+ * @v: pointer of type atomic_short
+ *
+ * Atomically adds @i to @v and returns @i + @v
+ */
+static inline int atom_asr(short i, struct atomic_short *v)
+{
+ short __i = i;
+ asm volatile(LOCK_PREFIX "xaddw %0, %1"
+ : "+r" (i), "+m" (v->counter)
+ : : "memory");
+ return i + __i;
+}
+
+/*
+ * conditionally add 1 to *v, unless *v is >= u
+ * return 0 if we cannot add 1 to *v because it is >= u
+ * return 1 if we can add 1 to *v because it is < u
+ * the add is atomic
+ *
+ * This is close to atomic_add_unless(), but this allows the 'u' value
+ * to be lowered below the current 'v'. atomic_add_unless can only stop
+ * on equal.
+ */
+static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
+{
+ spin_lock(lock);
+ if (atomic_read(v) >= u) {
+ spin_unlock(lock);
+ return 0;
+ }
+ atomic_inc(v);
+ spin_unlock(lock);
+ return 1;
+}
+
+#endif /* _ASM_X86_UV_UV_BAU_H */
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
new file mode 100644
index 000000000..44cf6d6de
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -0,0 +1,911 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV architectural definitions
+ *
+ * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved.
+ */
+
+#ifndef _ASM_X86_UV_UV_HUB_H
+#define _ASM_X86_UV_UV_HUB_H
+
+#ifdef CONFIG_X86_64
+#include <linux/numa.h>
+#include <linux/percpu.h>
+#include <linux/timer.h>
+#include <linux/io.h>
+#include <linux/topology.h>
+#include <asm/types.h>
+#include <asm/percpu.h>
+#include <asm/uv/uv_mmrs.h>
+#include <asm/uv/bios.h>
+#include <asm/irq_vectors.h>
+#include <asm/io_apic.h>
+
+
+/*
+ * Addressing Terminology
+ *
+ * M - The low M bits of a physical address represent the offset
+ * into the blade local memory. RAM memory on a blade is physically
+ * contiguous (although various IO spaces may punch holes in
+ * it)..
+ *
+ * N - Number of bits in the node portion of a socket physical
+ * address.
+ *
+ * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of
+ * routers always have low bit of 1, C/MBricks have low bit
+ * equal to 0. Most addressing macros that target UV hub chips
+ * right shift the NASID by 1 to exclude the always-zero bit.
+ * NASIDs contain up to 15 bits.
+ *
+ * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead
+ * of nasids.
+ *
+ * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant
+ * of the nasid for socket usage.
+ *
+ * GPA - (global physical address) a socket physical address converted
+ * so that it can be used by the GRU as a global address. Socket
+ * physical addresses 1) need additional NASID (node) bits added
+ * to the high end of the address, and 2) unaliased if the
+ * partition does not have a physical address 0. In addition, on
+ * UV2 rev 1, GPAs need the gnode left shifted to bits 39 or 40.
+ *
+ *
+ * NumaLink Global Physical Address Format:
+ * +--------------------------------+---------------------+
+ * |00..000| GNODE | NodeOffset |
+ * +--------------------------------+---------------------+
+ * |<-------53 - M bits --->|<--------M bits ----->
+ *
+ * M - number of node offset bits (35 .. 40)
+ *
+ *
+ * Memory/UV-HUB Processor Socket Address Format:
+ * +----------------+---------------+---------------------+
+ * |00..000000000000| PNODE | NodeOffset |
+ * +----------------+---------------+---------------------+
+ * <--- N bits --->|<--------M bits ----->
+ *
+ * M - number of node offset bits (35 .. 40)
+ * N - number of PNODE bits (0 .. 10)
+ *
+ * Note: M + N cannot currently exceed 44 (x86_64) or 46 (IA64).
+ * The actual values are configuration dependent and are set at
+ * boot time. M & N values are set by the hardware/BIOS at boot.
+ *
+ *
+ * APICID format
+ * NOTE!!!!!! This is the current format of the APICID. However, code
+ * should assume that this will change in the future. Use functions
+ * in this file for all APICID bit manipulations and conversion.
+ *
+ * 1111110000000000
+ * 5432109876543210
+ * pppppppppplc0cch Nehalem-EX (12 bits in hdw reg)
+ * ppppppppplcc0cch Westmere-EX (12 bits in hdw reg)
+ * pppppppppppcccch SandyBridge (15 bits in hdw reg)
+ * sssssssssss
+ *
+ * p = pnode bits
+ * l = socket number on board
+ * c = core
+ * h = hyperthread
+ * s = bits that are in the SOCKET_ID CSR
+ *
+ * Note: Processor may support fewer bits in the APICID register. The ACPI
+ * tables hold all 16 bits. Software needs to be aware of this.
+ *
+ * Unless otherwise specified, all references to APICID refer to
+ * the FULL value contained in ACPI tables, not the subset in the
+ * processor APICID register.
+ */
+
+/*
+ * Maximum number of bricks in all partitions and in all coherency domains.
+ * This is the total number of bricks accessible in the numalink fabric. It
+ * includes all C & M bricks. Routers are NOT included.
+ *
+ * This value is also the value of the maximum number of non-router NASIDs
+ * in the numalink fabric.
+ *
+ * NOTE: a brick may contain 1 or 2 OS nodes. Don't get these confused.
+ */
+#define UV_MAX_NUMALINK_BLADES 16384
+
+/*
+ * Maximum number of C/Mbricks within a software SSI (hardware may support
+ * more).
+ */
+#define UV_MAX_SSI_BLADES 256
+
+/*
+ * The largest possible NASID of a C or M brick (+ 2)
+ */
+#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_BLADES * 2)
+
+/* System Controller Interface Reg info */
+struct uv_scir_s {
+ struct timer_list timer;
+ unsigned long offset;
+ unsigned long last;
+ unsigned long idle_on;
+ unsigned long idle_off;
+ unsigned char state;
+ unsigned char enabled;
+};
+
+/* GAM (globally addressed memory) range table */
+struct uv_gam_range_s {
+ u32 limit; /* PA bits 56:26 (GAM_RANGE_SHFT) */
+ u16 nasid; /* node's global physical address */
+ s8 base; /* entry index of node's base addr */
+ u8 reserved;
+};
+
+/*
+ * The following defines attributes of the HUB chip. These attributes are
+ * frequently referenced and are kept in a common per hub struct.
+ * After setup, the struct is read only, so it should be readily
+ * available in the L3 cache on the cpu socket for the node.
+ */
+struct uv_hub_info_s {
+ unsigned long global_mmr_base;
+ unsigned long global_mmr_shift;
+ unsigned long gpa_mask;
+ unsigned short *socket_to_node;
+ unsigned short *socket_to_pnode;
+ unsigned short *pnode_to_socket;
+ struct uv_gam_range_s *gr_table;
+ unsigned short min_socket;
+ unsigned short min_pnode;
+ unsigned char m_val;
+ unsigned char n_val;
+ unsigned char gr_table_len;
+ unsigned char hub_revision;
+ unsigned char apic_pnode_shift;
+ unsigned char gpa_shift;
+ unsigned char m_shift;
+ unsigned char n_lshift;
+ unsigned int gnode_extra;
+ unsigned long gnode_upper;
+ unsigned long lowmem_remap_top;
+ unsigned long lowmem_remap_base;
+ unsigned long global_gru_base;
+ unsigned long global_gru_shift;
+ unsigned short pnode;
+ unsigned short pnode_mask;
+ unsigned short coherency_domain_number;
+ unsigned short numa_blade_id;
+ unsigned short nr_possible_cpus;
+ unsigned short nr_online_cpus;
+ short memory_nid;
+};
+
+/* CPU specific info with a pointer to the hub common info struct */
+struct uv_cpu_info_s {
+ void *p_uv_hub_info;
+ unsigned char blade_cpu_id;
+ struct uv_scir_s scir;
+};
+DECLARE_PER_CPU(struct uv_cpu_info_s, __uv_cpu_info);
+
+#define uv_cpu_info this_cpu_ptr(&__uv_cpu_info)
+#define uv_cpu_info_per(cpu) (&per_cpu(__uv_cpu_info, cpu))
+
+#define uv_scir_info (&uv_cpu_info->scir)
+#define uv_cpu_scir_info(cpu) (&uv_cpu_info_per(cpu)->scir)
+
+/* Node specific hub common info struct */
+extern void **__uv_hub_info_list;
+static inline struct uv_hub_info_s *uv_hub_info_list(int node)
+{
+ return (struct uv_hub_info_s *)__uv_hub_info_list[node];
+}
+
+static inline struct uv_hub_info_s *_uv_hub_info(void)
+{
+ return (struct uv_hub_info_s *)uv_cpu_info->p_uv_hub_info;
+}
+#define uv_hub_info _uv_hub_info()
+
+static inline struct uv_hub_info_s *uv_cpu_hub_info(int cpu)
+{
+ return (struct uv_hub_info_s *)uv_cpu_info_per(cpu)->p_uv_hub_info;
+}
+
+#define UV_HUB_INFO_VERSION 0x7150
+extern int uv_hub_info_version(void);
+static inline int uv_hub_info_check(int version)
+{
+ if (uv_hub_info_version() == version)
+ return 0;
+
+ pr_crit("UV: uv_hub_info version(%x) mismatch, expecting(%x)\n",
+ uv_hub_info_version(), version);
+
+ BUG(); /* Catastrophic - cannot continue on unknown UV system */
+}
+#define _uv_hub_info_check() uv_hub_info_check(UV_HUB_INFO_VERSION)
+
+/*
+ * HUB revision ranges for each UV HUB architecture.
+ * This is a software convention - NOT the hardware revision numbers in
+ * the hub chip.
+ */
+#define UV1_HUB_REVISION_BASE 1
+#define UV2_HUB_REVISION_BASE 3
+#define UV3_HUB_REVISION_BASE 5
+#define UV4_HUB_REVISION_BASE 7
+#define UV4A_HUB_REVISION_BASE 8 /* UV4 (fixed) rev 2 */
+
+#ifdef UV1_HUB_IS_SUPPORTED
+static inline int is_uv1_hub(void)
+{
+ return uv_hub_info->hub_revision < UV2_HUB_REVISION_BASE;
+}
+#else
+static inline int is_uv1_hub(void)
+{
+ return 0;
+}
+#endif
+
+#ifdef UV2_HUB_IS_SUPPORTED
+static inline int is_uv2_hub(void)
+{
+ return ((uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE) &&
+ (uv_hub_info->hub_revision < UV3_HUB_REVISION_BASE));
+}
+#else
+static inline int is_uv2_hub(void)
+{
+ return 0;
+}
+#endif
+
+#ifdef UV3_HUB_IS_SUPPORTED
+static inline int is_uv3_hub(void)
+{
+ return ((uv_hub_info->hub_revision >= UV3_HUB_REVISION_BASE) &&
+ (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE));
+}
+#else
+static inline int is_uv3_hub(void)
+{
+ return 0;
+}
+#endif
+
+/* First test "is UV4A", then "is UV4" */
+#ifdef UV4A_HUB_IS_SUPPORTED
+static inline int is_uv4a_hub(void)
+{
+ return (uv_hub_info->hub_revision >= UV4A_HUB_REVISION_BASE);
+}
+#else
+static inline int is_uv4a_hub(void)
+{
+ return 0;
+}
+#endif
+
+#ifdef UV4_HUB_IS_SUPPORTED
+static inline int is_uv4_hub(void)
+{
+ return uv_hub_info->hub_revision >= UV4_HUB_REVISION_BASE;
+}
+#else
+static inline int is_uv4_hub(void)
+{
+ return 0;
+}
+#endif
+
+static inline int is_uvx_hub(void)
+{
+ if (uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE)
+ return uv_hub_info->hub_revision;
+
+ return 0;
+}
+
+static inline int is_uv_hub(void)
+{
+#ifdef UV1_HUB_IS_SUPPORTED
+ return uv_hub_info->hub_revision;
+#endif
+ return is_uvx_hub();
+}
+
+union uvh_apicid {
+ unsigned long v;
+ struct uvh_apicid_s {
+ unsigned long local_apic_mask : 24;
+ unsigned long local_apic_shift : 5;
+ unsigned long unused1 : 3;
+ unsigned long pnode_mask : 24;
+ unsigned long pnode_shift : 5;
+ unsigned long unused2 : 3;
+ } s;
+};
+
+/*
+ * Local & Global MMR space macros.
+ * Note: macros are intended to be used ONLY by inline functions
+ * in this file - not by other kernel code.
+ * n - NASID (full 15-bit global nasid)
+ * g - GNODE (full 15-bit global nasid, right shifted 1)
+ * p - PNODE (local part of nsids, right shifted 1)
+ */
+#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask)
+#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra)
+#define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1)
+
+#define UV1_LOCAL_MMR_BASE 0xf4000000UL
+#define UV1_GLOBAL_MMR32_BASE 0xf8000000UL
+#define UV1_LOCAL_MMR_SIZE (64UL * 1024 * 1024)
+#define UV1_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024)
+
+#define UV2_LOCAL_MMR_BASE 0xfa000000UL
+#define UV2_GLOBAL_MMR32_BASE 0xfc000000UL
+#define UV2_LOCAL_MMR_SIZE (32UL * 1024 * 1024)
+#define UV2_GLOBAL_MMR32_SIZE (32UL * 1024 * 1024)
+
+#define UV3_LOCAL_MMR_BASE 0xfa000000UL
+#define UV3_GLOBAL_MMR32_BASE 0xfc000000UL
+#define UV3_LOCAL_MMR_SIZE (32UL * 1024 * 1024)
+#define UV3_GLOBAL_MMR32_SIZE (32UL * 1024 * 1024)
+
+#define UV4_LOCAL_MMR_BASE 0xfa000000UL
+#define UV4_GLOBAL_MMR32_BASE 0xfc000000UL
+#define UV4_LOCAL_MMR_SIZE (32UL * 1024 * 1024)
+#define UV4_GLOBAL_MMR32_SIZE (16UL * 1024 * 1024)
+
+#define UV_LOCAL_MMR_BASE ( \
+ is_uv1_hub() ? UV1_LOCAL_MMR_BASE : \
+ is_uv2_hub() ? UV2_LOCAL_MMR_BASE : \
+ is_uv3_hub() ? UV3_LOCAL_MMR_BASE : \
+ /*is_uv4_hub*/ UV4_LOCAL_MMR_BASE)
+
+#define UV_GLOBAL_MMR32_BASE ( \
+ is_uv1_hub() ? UV1_GLOBAL_MMR32_BASE : \
+ is_uv2_hub() ? UV2_GLOBAL_MMR32_BASE : \
+ is_uv3_hub() ? UV3_GLOBAL_MMR32_BASE : \
+ /*is_uv4_hub*/ UV4_GLOBAL_MMR32_BASE)
+
+#define UV_LOCAL_MMR_SIZE ( \
+ is_uv1_hub() ? UV1_LOCAL_MMR_SIZE : \
+ is_uv2_hub() ? UV2_LOCAL_MMR_SIZE : \
+ is_uv3_hub() ? UV3_LOCAL_MMR_SIZE : \
+ /*is_uv4_hub*/ UV4_LOCAL_MMR_SIZE)
+
+#define UV_GLOBAL_MMR32_SIZE ( \
+ is_uv1_hub() ? UV1_GLOBAL_MMR32_SIZE : \
+ is_uv2_hub() ? UV2_GLOBAL_MMR32_SIZE : \
+ is_uv3_hub() ? UV3_GLOBAL_MMR32_SIZE : \
+ /*is_uv4_hub*/ UV4_GLOBAL_MMR32_SIZE)
+
+#define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base)
+
+#define UV_GLOBAL_GRU_MMR_BASE 0x4000000
+
+#define UV_GLOBAL_MMR32_PNODE_SHIFT 15
+#define _UV_GLOBAL_MMR64_PNODE_SHIFT 26
+#define UV_GLOBAL_MMR64_PNODE_SHIFT (uv_hub_info->global_mmr_shift)
+
+#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
+
+#define UV_GLOBAL_MMR64_PNODE_BITS(p) \
+ (((unsigned long)(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
+
+#define UVH_APICID 0x002D0E00L
+#define UV_APIC_PNODE_SHIFT 6
+
+#define UV_APICID_HIBIT_MASK 0xffff0000
+
+/* Local Bus from cpu's perspective */
+#define LOCAL_BUS_BASE 0x1c00000
+#define LOCAL_BUS_SIZE (4 * 1024 * 1024)
+
+/*
+ * System Controller Interface Reg
+ *
+ * Note there are NO leds on a UV system. This register is only
+ * used by the system controller to monitor system-wide operation.
+ * There are 64 regs per node. With Nahelem cpus (2 cores per node,
+ * 8 cpus per core, 2 threads per cpu) there are 32 cpu threads on
+ * a node.
+ *
+ * The window is located at top of ACPI MMR space
+ */
+#define SCIR_WINDOW_COUNT 64
+#define SCIR_LOCAL_MMR_BASE (LOCAL_BUS_BASE + \
+ LOCAL_BUS_SIZE - \
+ SCIR_WINDOW_COUNT)
+
+#define SCIR_CPU_HEARTBEAT 0x01 /* timer interrupt */
+#define SCIR_CPU_ACTIVITY 0x02 /* not idle */
+#define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */
+
+/* Loop through all installed blades */
+#define for_each_possible_blade(bid) \
+ for ((bid) = 0; (bid) < uv_num_possible_blades(); (bid)++)
+
+/*
+ * Macros for converting between kernel virtual addresses, socket local physical
+ * addresses, and UV global physical addresses.
+ * Note: use the standard __pa() & __va() macros for converting
+ * between socket virtual and socket physical addresses.
+ */
+
+/* global bits offset - number of local address bits in gpa for this UV arch */
+static inline unsigned int uv_gpa_shift(void)
+{
+ return uv_hub_info->gpa_shift;
+}
+#define _uv_gpa_shift
+
+/* Find node that has the address range that contains global address */
+static inline struct uv_gam_range_s *uv_gam_range(unsigned long pa)
+{
+ struct uv_gam_range_s *gr = uv_hub_info->gr_table;
+ unsigned long pal = (pa & uv_hub_info->gpa_mask) >> UV_GAM_RANGE_SHFT;
+ int i, num = uv_hub_info->gr_table_len;
+
+ if (gr) {
+ for (i = 0; i < num; i++, gr++) {
+ if (pal < gr->limit)
+ return gr;
+ }
+ }
+ pr_crit("UV: GAM Range for 0x%lx not found at %p!\n", pa, gr);
+ BUG();
+}
+
+/* Return base address of node that contains global address */
+static inline unsigned long uv_gam_range_base(unsigned long pa)
+{
+ struct uv_gam_range_s *gr = uv_gam_range(pa);
+ int base = gr->base;
+
+ if (base < 0)
+ return 0UL;
+
+ return uv_hub_info->gr_table[base].limit;
+}
+
+/* socket phys RAM --> UV global NASID (UV4+) */
+static inline unsigned long uv_soc_phys_ram_to_nasid(unsigned long paddr)
+{
+ return uv_gam_range(paddr)->nasid;
+}
+#define _uv_soc_phys_ram_to_nasid
+
+/* socket virtual --> UV global NASID (UV4+) */
+static inline unsigned long uv_gpa_nasid(void *v)
+{
+ return uv_soc_phys_ram_to_nasid(__pa(v));
+}
+
+/* socket phys RAM --> UV global physical address */
+static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr)
+{
+ unsigned int m_val = uv_hub_info->m_val;
+
+ if (paddr < uv_hub_info->lowmem_remap_top)
+ paddr |= uv_hub_info->lowmem_remap_base;
+
+ if (m_val) {
+ paddr |= uv_hub_info->gnode_upper;
+ paddr = ((paddr << uv_hub_info->m_shift)
+ >> uv_hub_info->m_shift) |
+ ((paddr >> uv_hub_info->m_val)
+ << uv_hub_info->n_lshift);
+ } else {
+ paddr |= uv_soc_phys_ram_to_nasid(paddr)
+ << uv_hub_info->gpa_shift;
+ }
+ return paddr;
+}
+
+/* socket virtual --> UV global physical address */
+static inline unsigned long uv_gpa(void *v)
+{
+ return uv_soc_phys_ram_to_gpa(__pa(v));
+}
+
+/* Top two bits indicate the requested address is in MMR space. */
+static inline int
+uv_gpa_in_mmr_space(unsigned long gpa)
+{
+ return (gpa >> 62) == 0x3UL;
+}
+
+/* UV global physical address --> socket phys RAM */
+static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa)
+{
+ unsigned long paddr;
+ unsigned long remap_base = uv_hub_info->lowmem_remap_base;
+ unsigned long remap_top = uv_hub_info->lowmem_remap_top;
+ unsigned int m_val = uv_hub_info->m_val;
+
+ if (m_val)
+ gpa = ((gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift) |
+ ((gpa >> uv_hub_info->n_lshift) << uv_hub_info->m_val);
+
+ paddr = gpa & uv_hub_info->gpa_mask;
+ if (paddr >= remap_base && paddr < remap_base + remap_top)
+ paddr -= remap_base;
+ return paddr;
+}
+
+/* gpa -> gnode */
+static inline unsigned long uv_gpa_to_gnode(unsigned long gpa)
+{
+ unsigned int n_lshift = uv_hub_info->n_lshift;
+
+ if (n_lshift)
+ return gpa >> n_lshift;
+
+ return uv_gam_range(gpa)->nasid >> 1;
+}
+
+/* gpa -> pnode */
+static inline int uv_gpa_to_pnode(unsigned long gpa)
+{
+ return uv_gpa_to_gnode(gpa) & uv_hub_info->pnode_mask;
+}
+
+/* gpa -> node offset */
+static inline unsigned long uv_gpa_to_offset(unsigned long gpa)
+{
+ unsigned int m_shift = uv_hub_info->m_shift;
+
+ if (m_shift)
+ return (gpa << m_shift) >> m_shift;
+
+ return (gpa & uv_hub_info->gpa_mask) - uv_gam_range_base(gpa);
+}
+
+/* Convert socket to node */
+static inline int _uv_socket_to_node(int socket, unsigned short *s2nid)
+{
+ return s2nid ? s2nid[socket - uv_hub_info->min_socket] : socket;
+}
+
+static inline int uv_socket_to_node(int socket)
+{
+ return _uv_socket_to_node(socket, uv_hub_info->socket_to_node);
+}
+
+/* pnode, offset --> socket virtual */
+static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset)
+{
+ unsigned int m_val = uv_hub_info->m_val;
+ unsigned long base;
+ unsigned short sockid, node, *p2s;
+
+ if (m_val)
+ return __va(((unsigned long)pnode << m_val) | offset);
+
+ p2s = uv_hub_info->pnode_to_socket;
+ sockid = p2s ? p2s[pnode - uv_hub_info->min_pnode] : pnode;
+ node = uv_socket_to_node(sockid);
+
+ /* limit address of previous socket is our base, except node 0 is 0 */
+ if (!node)
+ return __va((unsigned long)offset);
+
+ base = (unsigned long)(uv_hub_info->gr_table[node - 1].limit);
+ return __va(base << UV_GAM_RANGE_SHFT | offset);
+}
+
+/* Extract/Convert a PNODE from an APICID (full apicid, not processor subset) */
+static inline int uv_apicid_to_pnode(int apicid)
+{
+ int pnode = apicid >> uv_hub_info->apic_pnode_shift;
+ unsigned short *s2pn = uv_hub_info->socket_to_pnode;
+
+ return s2pn ? s2pn[pnode - uv_hub_info->min_socket] : pnode;
+}
+
+/* Convert an apicid to the socket number on the blade */
+static inline int uv_apicid_to_socket(int apicid)
+{
+ if (is_uv1_hub())
+ return (apicid >> (uv_hub_info->apic_pnode_shift - 1)) & 1;
+ else
+ return 0;
+}
+
+/*
+ * Access global MMRs using the low memory MMR32 space. This region supports
+ * faster MMR access but not all MMRs are accessible in this space.
+ */
+static inline unsigned long *uv_global_mmr32_address(int pnode, unsigned long offset)
+{
+ return __va(UV_GLOBAL_MMR32_BASE |
+ UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset);
+}
+
+static inline void uv_write_global_mmr32(int pnode, unsigned long offset, unsigned long val)
+{
+ writeq(val, uv_global_mmr32_address(pnode, offset));
+}
+
+static inline unsigned long uv_read_global_mmr32(int pnode, unsigned long offset)
+{
+ return readq(uv_global_mmr32_address(pnode, offset));
+}
+
+/*
+ * Access Global MMR space using the MMR space located at the top of physical
+ * memory.
+ */
+static inline volatile void __iomem *uv_global_mmr64_address(int pnode, unsigned long offset)
+{
+ return __va(UV_GLOBAL_MMR64_BASE |
+ UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset);
+}
+
+static inline void uv_write_global_mmr64(int pnode, unsigned long offset, unsigned long val)
+{
+ writeq(val, uv_global_mmr64_address(pnode, offset));
+}
+
+static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset)
+{
+ return readq(uv_global_mmr64_address(pnode, offset));
+}
+
+static inline void uv_write_global_mmr8(int pnode, unsigned long offset, unsigned char val)
+{
+ writeb(val, uv_global_mmr64_address(pnode, offset));
+}
+
+static inline unsigned char uv_read_global_mmr8(int pnode, unsigned long offset)
+{
+ return readb(uv_global_mmr64_address(pnode, offset));
+}
+
+/*
+ * Access hub local MMRs. Faster than using global space but only local MMRs
+ * are accessible.
+ */
+static inline unsigned long *uv_local_mmr_address(unsigned long offset)
+{
+ return __va(UV_LOCAL_MMR_BASE | offset);
+}
+
+static inline unsigned long uv_read_local_mmr(unsigned long offset)
+{
+ return readq(uv_local_mmr_address(offset));
+}
+
+static inline void uv_write_local_mmr(unsigned long offset, unsigned long val)
+{
+ writeq(val, uv_local_mmr_address(offset));
+}
+
+static inline unsigned char uv_read_local_mmr8(unsigned long offset)
+{
+ return readb(uv_local_mmr_address(offset));
+}
+
+static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val)
+{
+ writeb(val, uv_local_mmr_address(offset));
+}
+
+/* Blade-local cpu number of current cpu. Numbered 0 .. <# cpus on the blade> */
+static inline int uv_blade_processor_id(void)
+{
+ return uv_cpu_info->blade_cpu_id;
+}
+
+/* Blade-local cpu number of cpu N. Numbered 0 .. <# cpus on the blade> */
+static inline int uv_cpu_blade_processor_id(int cpu)
+{
+ return uv_cpu_info_per(cpu)->blade_cpu_id;
+}
+#define _uv_cpu_blade_processor_id 1 /* indicate function available */
+
+/* Blade number to Node number (UV1..UV4 is 1:1) */
+static inline int uv_blade_to_node(int blade)
+{
+ return blade;
+}
+
+/* Blade number of current cpu. Numnbered 0 .. <#blades -1> */
+static inline int uv_numa_blade_id(void)
+{
+ return uv_hub_info->numa_blade_id;
+}
+
+/*
+ * Convert linux node number to the UV blade number.
+ * .. Currently for UV1 thru UV4 the node and the blade are identical.
+ * .. If this changes then you MUST check references to this function!
+ */
+static inline int uv_node_to_blade_id(int nid)
+{
+ return nid;
+}
+
+/* Convert a cpu number to the the UV blade number */
+static inline int uv_cpu_to_blade_id(int cpu)
+{
+ return uv_node_to_blade_id(cpu_to_node(cpu));
+}
+
+/* Convert a blade id to the PNODE of the blade */
+static inline int uv_blade_to_pnode(int bid)
+{
+ return uv_hub_info_list(uv_blade_to_node(bid))->pnode;
+}
+
+/* Nid of memory node on blade. -1 if no blade-local memory */
+static inline int uv_blade_to_memory_nid(int bid)
+{
+ return uv_hub_info_list(uv_blade_to_node(bid))->memory_nid;
+}
+
+/* Determine the number of possible cpus on a blade */
+static inline int uv_blade_nr_possible_cpus(int bid)
+{
+ return uv_hub_info_list(uv_blade_to_node(bid))->nr_possible_cpus;
+}
+
+/* Determine the number of online cpus on a blade */
+static inline int uv_blade_nr_online_cpus(int bid)
+{
+ return uv_hub_info_list(uv_blade_to_node(bid))->nr_online_cpus;
+}
+
+/* Convert a cpu id to the PNODE of the blade containing the cpu */
+static inline int uv_cpu_to_pnode(int cpu)
+{
+ return uv_cpu_hub_info(cpu)->pnode;
+}
+
+/* Convert a linux node number to the PNODE of the blade */
+static inline int uv_node_to_pnode(int nid)
+{
+ return uv_hub_info_list(nid)->pnode;
+}
+
+/* Maximum possible number of blades */
+extern short uv_possible_blades;
+static inline int uv_num_possible_blades(void)
+{
+ return uv_possible_blades;
+}
+
+/* Per Hub NMI support */
+extern void uv_nmi_setup(void);
+extern void uv_nmi_setup_hubless(void);
+
+/* BIOS/Kernel flags exchange MMR */
+#define UVH_BIOS_KERNEL_MMR UVH_SCRATCH5
+#define UVH_BIOS_KERNEL_MMR_ALIAS UVH_SCRATCH5_ALIAS
+#define UVH_BIOS_KERNEL_MMR_ALIAS_2 UVH_SCRATCH5_ALIAS_2
+
+/* TSC sync valid, set by BIOS */
+#define UVH_TSC_SYNC_MMR UVH_BIOS_KERNEL_MMR
+#define UVH_TSC_SYNC_SHIFT 10
+#define UVH_TSC_SYNC_SHIFT_UV2K 16 /* UV2/3k have different bits */
+#define UVH_TSC_SYNC_MASK 3 /* 0011 */
+#define UVH_TSC_SYNC_VALID 3 /* 0011 */
+#define UVH_TSC_SYNC_INVALID 2 /* 0010 */
+
+/* BMC sets a bit this MMR non-zero before sending an NMI */
+#define UVH_NMI_MMR UVH_BIOS_KERNEL_MMR
+#define UVH_NMI_MMR_CLEAR UVH_BIOS_KERNEL_MMR_ALIAS
+#define UVH_NMI_MMR_SHIFT 63
+#define UVH_NMI_MMR_TYPE "SCRATCH5"
+
+/* Newer SMM NMI handler, not present in all systems */
+#define UVH_NMI_MMRX UVH_EVENT_OCCURRED0
+#define UVH_NMI_MMRX_CLEAR UVH_EVENT_OCCURRED0_ALIAS
+#define UVH_NMI_MMRX_SHIFT UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT
+#define UVH_NMI_MMRX_TYPE "EXTIO_INT0"
+
+/* Non-zero indicates newer SMM NMI handler present */
+#define UVH_NMI_MMRX_SUPPORTED UVH_EXTIO_INT0_BROADCAST
+
+/* Indicates to BIOS that we want to use the newer SMM NMI handler */
+#define UVH_NMI_MMRX_REQ UVH_BIOS_KERNEL_MMR_ALIAS_2
+#define UVH_NMI_MMRX_REQ_SHIFT 62
+
+struct uv_hub_nmi_s {
+ raw_spinlock_t nmi_lock;
+ atomic_t in_nmi; /* flag this node in UV NMI IRQ */
+ atomic_t cpu_owner; /* last locker of this struct */
+ atomic_t read_mmr_count; /* count of MMR reads */
+ atomic_t nmi_count; /* count of true UV NMIs */
+ unsigned long nmi_value; /* last value read from NMI MMR */
+ bool hub_present; /* false means UV hubless system */
+ bool pch_owner; /* indicates this hub owns PCH */
+};
+
+struct uv_cpu_nmi_s {
+ struct uv_hub_nmi_s *hub;
+ int state;
+ int pinging;
+ int queries;
+ int pings;
+};
+
+DECLARE_PER_CPU(struct uv_cpu_nmi_s, uv_cpu_nmi);
+
+#define uv_hub_nmi this_cpu_read(uv_cpu_nmi.hub)
+#define uv_cpu_nmi_per(cpu) (per_cpu(uv_cpu_nmi, cpu))
+#define uv_hub_nmi_per(cpu) (uv_cpu_nmi_per(cpu).hub)
+
+/* uv_cpu_nmi_states */
+#define UV_NMI_STATE_OUT 0
+#define UV_NMI_STATE_IN 1
+#define UV_NMI_STATE_DUMP 2
+#define UV_NMI_STATE_DUMP_DONE 3
+
+/* Update SCIR state */
+static inline void uv_set_scir_bits(unsigned char value)
+{
+ if (uv_scir_info->state != value) {
+ uv_scir_info->state = value;
+ uv_write_local_mmr8(uv_scir_info->offset, value);
+ }
+}
+
+static inline unsigned long uv_scir_offset(int apicid)
+{
+ return SCIR_LOCAL_MMR_BASE | (apicid & 0x3f);
+}
+
+static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
+{
+ if (uv_cpu_scir_info(cpu)->state != value) {
+ uv_write_global_mmr8(uv_cpu_to_pnode(cpu),
+ uv_cpu_scir_info(cpu)->offset, value);
+ uv_cpu_scir_info(cpu)->state = value;
+ }
+}
+
+extern unsigned int uv_apicid_hibits;
+static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode)
+{
+ apicid |= uv_apicid_hibits;
+ return (1UL << UVH_IPI_INT_SEND_SHFT) |
+ ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) |
+ (mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) |
+ (vector << UVH_IPI_INT_VECTOR_SHFT);
+}
+
+static inline void uv_hub_send_ipi(int pnode, int apicid, int vector)
+{
+ unsigned long val;
+ unsigned long dmode = dest_Fixed;
+
+ if (vector == NMI_VECTOR)
+ dmode = dest_NMI;
+
+ val = uv_hub_ipi_value(apicid, vector, dmode);
+ uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
+}
+
+/*
+ * Get the minimum revision number of the hub chips within the partition.
+ * (See UVx_HUB_REVISION_BASE above for specific values.)
+ */
+static inline int uv_get_min_hub_revision_id(void)
+{
+ return uv_hub_info->hub_revision;
+}
+
+#endif /* CONFIG_X86_64 */
+#endif /* _ASM_X86_UV_UV_HUB_H */
diff --git a/arch/x86/include/asm/uv/uv_irq.h b/arch/x86/include/asm/uv/uv_irq.h
new file mode 100644
index 000000000..d6b17c760
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_irq.h
@@ -0,0 +1,38 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV IRQ definitions
+ *
+ * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
+ */
+
+#ifndef _ASM_X86_UV_UV_IRQ_H
+#define _ASM_X86_UV_UV_IRQ_H
+
+/* If a generic version of this structure gets defined, eliminate this one. */
+struct uv_IO_APIC_route_entry {
+ __u64 vector : 8,
+ delivery_mode : 3,
+ dest_mode : 1,
+ delivery_status : 1,
+ polarity : 1,
+ __reserved_1 : 1,
+ trigger : 1,
+ mask : 1,
+ __reserved_2 : 15,
+ dest : 32;
+};
+
+enum {
+ UV_AFFINITY_ALL,
+ UV_AFFINITY_NODE,
+ UV_AFFINITY_CPU
+};
+
+extern int uv_irq_2_mmr_info(int, unsigned long *, int *);
+extern int uv_setup_irq(char *, int, int, unsigned long, int);
+extern void uv_teardown_irq(unsigned int);
+
+#endif /* _ASM_X86_UV_UV_IRQ_H */
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
new file mode 100644
index 000000000..62c79e26a
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -0,0 +1,4828 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV MMR definitions
+ *
+ * Copyright (C) 2007-2016 Silicon Graphics, Inc. All rights reserved.
+ */
+
+#ifndef _ASM_X86_UV_UV_MMRS_H
+#define _ASM_X86_UV_UV_MMRS_H
+
+/*
+ * This file contains MMR definitions for all UV hubs types.
+ *
+ * To minimize coding differences between hub types, the symbols are
+ * grouped by architecture types.
+ *
+ * UVH - definitions common to all UV hub types.
+ * UVXH - definitions common to all UV eXtended hub types (currently 2, 3, 4).
+ * UV1H - definitions specific to UV type 1 hub.
+ * UV2H - definitions specific to UV type 2 hub.
+ * UV3H - definitions specific to UV type 3 hub.
+ * UV4H - definitions specific to UV type 4 hub.
+ *
+ * So in general, MMR addresses and structures are identical on all hubs types.
+ * These MMRs are identified as:
+ * #define UVH_xxx <address>
+ * union uvh_xxx {
+ * unsigned long v;
+ * struct uvh_int_cmpd_s {
+ * } s;
+ * };
+ *
+ * If the MMR exists on all hub types but have different addresses,
+ * use a conditional operator to define the value at runtime.
+ * #define UV1Hxxx a
+ * #define UV2Hxxx b
+ * #define UV3Hxxx c
+ * #define UV4Hxxx d
+ * #define UV4AHxxx e
+ * #define UVHxxx (is_uv1_hub() ? UV1Hxxx :
+ * (is_uv2_hub() ? UV2Hxxx :
+ * (is_uv3_hub() ? UV3Hxxx :
+ * (is_uv4a_hub() ? UV4AHxxx :
+ * UV4Hxxx))
+ *
+ * If the MMR exists on all hub types > 1 but have different addresses, the
+ * variation using "UVX" as the prefix exists.
+ * #define UV2Hxxx b
+ * #define UV3Hxxx c
+ * #define UV4Hxxx d
+ * #define UV4AHxxx e
+ * #define UVHxxx (is_uv2_hub() ? UV2Hxxx :
+ * (is_uv3_hub() ? UV3Hxxx :
+ * (is_uv4a_hub() ? UV4AHxxx :
+ * UV4Hxxx))
+ *
+ * union uvh_xxx {
+ * unsigned long v;
+ * struct uvh_xxx_s { # Common fields only
+ * } s;
+ * struct uv1h_xxx_s { # Full UV1 definition (*)
+ * } s1;
+ * struct uv2h_xxx_s { # Full UV2 definition (*)
+ * } s2;
+ * struct uv3h_xxx_s { # Full UV3 definition (*)
+ * } s3;
+ * (NOTE: No struct uv4ah_xxx_s members exist)
+ * struct uv4h_xxx_s { # Full UV4 definition (*)
+ * } s4;
+ * };
+ * (* - if present and different than the common struct)
+ *
+ * Only essential differences are enumerated. For example, if the address is
+ * the same for all UV's, only a single #define is generated. Likewise,
+ * if the contents is the same for all hubs, only the "s" structure is
+ * generated.
+ *
+ * If the MMR exists on ONLY 1 type of hub, no generic definition is
+ * generated:
+ * #define UVnH_xxx <uvn address>
+ * union uvnh_xxx {
+ * unsigned long v;
+ * struct uvh_int_cmpd_s {
+ * } sn;
+ * };
+ *
+ * (GEN Flags: mflags_opt= undefs=function UV234=UVXH)
+ */
+
+#define UV_MMR_ENABLE (1UL << 63)
+
+#define UV1_HUB_PART_NUMBER 0x88a5
+#define UV2_HUB_PART_NUMBER 0x8eb8
+#define UV2_HUB_PART_NUMBER_X 0x1111
+#define UV3_HUB_PART_NUMBER 0x9578
+#define UV3_HUB_PART_NUMBER_X 0x4321
+#define UV4_HUB_PART_NUMBER 0x99a1
+
+/* Compat: Indicate which UV Hubs are supported. */
+#define UV1_HUB_IS_SUPPORTED 1
+#define UV2_HUB_IS_SUPPORTED 1
+#define UV3_HUB_IS_SUPPORTED 1
+#define UV4_HUB_IS_SUPPORTED 1
+#define UV4A_HUB_IS_SUPPORTED 1
+
+/* Error function to catch undefined references */
+extern unsigned long uv_undefined(char *str);
+
+/* ========================================================================= */
+/* UVH_BAU_DATA_BROADCAST */
+/* ========================================================================= */
+#define UVH_BAU_DATA_BROADCAST 0x61688UL
+
+#define UV1H_BAU_DATA_BROADCAST_32 0x440
+#define UV2H_BAU_DATA_BROADCAST_32 0x440
+#define UV3H_BAU_DATA_BROADCAST_32 0x440
+#define UV4H_BAU_DATA_BROADCAST_32 0x360
+#define UVH_BAU_DATA_BROADCAST_32 ( \
+ is_uv1_hub() ? UV1H_BAU_DATA_BROADCAST_32 : \
+ is_uv2_hub() ? UV2H_BAU_DATA_BROADCAST_32 : \
+ is_uv3_hub() ? UV3H_BAU_DATA_BROADCAST_32 : \
+ /*is_uv4_hub*/ UV4H_BAU_DATA_BROADCAST_32)
+
+#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0
+#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL
+
+
+union uvh_bau_data_broadcast_u {
+ unsigned long v;
+ struct uvh_bau_data_broadcast_s {
+ unsigned long enable:1; /* RW */
+ unsigned long rsvd_1_63:63;
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_BAU_DATA_CONFIG */
+/* ========================================================================= */
+#define UVH_BAU_DATA_CONFIG 0x61680UL
+
+#define UV1H_BAU_DATA_CONFIG_32 0x438
+#define UV2H_BAU_DATA_CONFIG_32 0x438
+#define UV3H_BAU_DATA_CONFIG_32 0x438
+#define UV4H_BAU_DATA_CONFIG_32 0x358
+#define UVH_BAU_DATA_CONFIG_32 ( \
+ is_uv1_hub() ? UV1H_BAU_DATA_CONFIG_32 : \
+ is_uv2_hub() ? UV2H_BAU_DATA_CONFIG_32 : \
+ is_uv3_hub() ? UV3H_BAU_DATA_CONFIG_32 : \
+ /*is_uv4_hub*/ UV4H_BAU_DATA_CONFIG_32)
+
+#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0
+#define UVH_BAU_DATA_CONFIG_DM_SHFT 8
+#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11
+#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12
+#define UVH_BAU_DATA_CONFIG_P_SHFT 13
+#define UVH_BAU_DATA_CONFIG_T_SHFT 15
+#define UVH_BAU_DATA_CONFIG_M_SHFT 16
+#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32
+#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+
+union uvh_bau_data_config_u {
+ unsigned long v;
+ struct uvh_bau_data_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_EVENT_OCCURRED0 */
+/* ========================================================================= */
+#define UVH_EVENT_OCCURRED0 0x70000UL
+#define UVH_EVENT_OCCURRED0_32 0x5e8
+
+#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0
+#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
+#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
+#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
+
+#define UV1H_EVENT_OCCURRED0_GR0_HCERR_SHFT 1
+#define UV1H_EVENT_OCCURRED0_GR1_HCERR_SHFT 2
+#define UV1H_EVENT_OCCURRED0_LH_HCERR_SHFT 3
+#define UV1H_EVENT_OCCURRED0_RH_HCERR_SHFT 4
+#define UV1H_EVENT_OCCURRED0_XN_HCERR_SHFT 5
+#define UV1H_EVENT_OCCURRED0_SI_HCERR_SHFT 6
+#define UV1H_EVENT_OCCURRED0_LB_AOERR0_SHFT 7
+#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8
+#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9
+#define UV1H_EVENT_OCCURRED0_LH_AOERR0_SHFT 10
+#define UV1H_EVENT_OCCURRED0_XN_AOERR0_SHFT 12
+#define UV1H_EVENT_OCCURRED0_SI_AOERR0_SHFT 13
+#define UV1H_EVENT_OCCURRED0_LB_AOERR1_SHFT 14
+#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15
+#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16
+#define UV1H_EVENT_OCCURRED0_LH_AOERR1_SHFT 17
+#define UV1H_EVENT_OCCURRED0_RH_AOERR1_SHFT 18
+#define UV1H_EVENT_OCCURRED0_XN_AOERR1_SHFT 19
+#define UV1H_EVENT_OCCURRED0_SI_AOERR1_SHFT 20
+#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21
+#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38
+#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39
+#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40
+#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41
+#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42
+#define UV1H_EVENT_OCCURRED0_LTC_INT_SHFT 43
+#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44
+#define UV1H_EVENT_OCCURRED0_IPI_INT_SHFT 45
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49
+#define UV1H_EVENT_OCCURRED0_PROFILE_INT_SHFT 50
+#define UV1H_EVENT_OCCURRED0_RTC0_SHFT 51
+#define UV1H_EVENT_OCCURRED0_RTC1_SHFT 52
+#define UV1H_EVENT_OCCURRED0_RTC2_SHFT 53
+#define UV1H_EVENT_OCCURRED0_RTC3_SHFT 54
+#define UV1H_EVENT_OCCURRED0_BAU_DATA_SHFT 55
+#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56
+#define UV1H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL
+#define UV1H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL
+#define UV1H_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL
+#define UV1H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL
+#define UV1H_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL
+#define UV1H_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL
+#define UV1H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL
+#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL
+#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL
+#define UV1H_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL
+#define UV1H_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL
+#define UV1H_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL
+#define UV1H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL
+#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL
+#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL
+#define UV1H_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL
+#define UV1H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL
+#define UV1H_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL
+#define UV1H_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL
+#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL
+#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL
+#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL
+#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL
+#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL
+#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL
+#define UV1H_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL
+#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
+#define UV1H_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL
+#define UV1H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL
+#define UV1H_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL
+#define UV1H_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL
+#define UV1H_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL
+#define UV1H_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL
+#define UV1H_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL
+#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL
+
+#define UVXH_EVENT_OCCURRED0_RH_HCERR_SHFT 2
+#define UVXH_EVENT_OCCURRED0_LH0_HCERR_SHFT 3
+#define UVXH_EVENT_OCCURRED0_LH1_HCERR_SHFT 4
+#define UVXH_EVENT_OCCURRED0_GR0_HCERR_SHFT 5
+#define UVXH_EVENT_OCCURRED0_GR1_HCERR_SHFT 6
+#define UVXH_EVENT_OCCURRED0_NI0_HCERR_SHFT 7
+#define UVXH_EVENT_OCCURRED0_NI1_HCERR_SHFT 8
+#define UVXH_EVENT_OCCURRED0_LB_AOERR0_SHFT 9
+#define UVXH_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12
+#define UVXH_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13
+#define UVXH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14
+#define UVXH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15
+#define UVXH_EVENT_OCCURRED0_XB_AOERR0_SHFT 16
+#define UVXH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL
+#define UVXH_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL
+#define UVXH_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL
+#define UVXH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000020UL
+#define UVXH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000040UL
+#define UVXH_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL
+#define UVXH_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL
+#define UVXH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL
+#define UVXH_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL
+#define UVXH_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL
+#define UVXH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL
+#define UVXH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL
+#define UVXH_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL
+
+#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT 1
+#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10
+#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19
+#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20
+#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21
+#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22
+#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23
+#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24
+#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25
+#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26
+#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27
+#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30
+#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47
+#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48
+#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49
+#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50
+#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51
+#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52
+#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT 53
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57
+#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58
+#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL
+#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL
+#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL
+#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL
+#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL
+#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL
+#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL
+#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL
+#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL
+#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL
+#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL
+#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL
+#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL
+#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL
+#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL
+#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL
+#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL
+#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL
+#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL
+#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL
+
+#define UV3H_EVENT_OCCURRED0_QP_HCERR_SHFT 1
+#define UV3H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10
+#define UV3H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17
+#define UV3H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18
+#define UV3H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19
+#define UV3H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20
+#define UV3H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21
+#define UV3H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22
+#define UV3H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23
+#define UV3H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24
+#define UV3H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25
+#define UV3H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26
+#define UV3H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27
+#define UV3H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28
+#define UV3H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29
+#define UV3H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30
+#define UV3H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47
+#define UV3H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48
+#define UV3H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49
+#define UV3H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50
+#define UV3H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51
+#define UV3H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52
+#define UV3H_EVENT_OCCURRED0_IPI_INT_SHFT 53
+#define UV3H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54
+#define UV3H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55
+#define UV3H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56
+#define UV3H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57
+#define UV3H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58
+#define UV3H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL
+#define UV3H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL
+#define UV3H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL
+#define UV3H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL
+#define UV3H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL
+#define UV3H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL
+#define UV3H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL
+#define UV3H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL
+#define UV3H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL
+#define UV3H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL
+#define UV3H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL
+#define UV3H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL
+#define UV3H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL
+#define UV3H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL
+#define UV3H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL
+#define UV3H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL
+#define UV3H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL
+#define UV3H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL
+#define UV3H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL
+#define UV3H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL
+#define UV3H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL
+#define UV3H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL
+#define UV3H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL
+#define UV3H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL
+#define UV3H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL
+#define UV3H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL
+#define UV3H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL
+#define UV3H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL
+#define UV3H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL
+
+#define UV4H_EVENT_OCCURRED0_KT_HCERR_SHFT 1
+#define UV4H_EVENT_OCCURRED0_KT_AOERR0_SHFT 10
+#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR0_SHFT 17
+#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR0_SHFT 18
+#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR0_SHFT 19
+#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR0_SHFT 20
+#define UV4H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 21
+#define UV4H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 22
+#define UV4H_EVENT_OCCURRED0_LB_AOERR1_SHFT 23
+#define UV4H_EVENT_OCCURRED0_KT_AOERR1_SHFT 24
+#define UV4H_EVENT_OCCURRED0_RH_AOERR1_SHFT 25
+#define UV4H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 26
+#define UV4H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 27
+#define UV4H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 28
+#define UV4H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 29
+#define UV4H_EVENT_OCCURRED0_XB_AOERR1_SHFT 30
+#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR1_SHFT 31
+#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR1_SHFT 32
+#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR1_SHFT 33
+#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR1_SHFT 34
+#define UV4H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 35
+#define UV4H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 36
+#define UV4H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 37
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 38
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 39
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 40
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 41
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 42
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 43
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 44
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 45
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 46
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 47
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 48
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 49
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 50
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 51
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 52
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 53
+#define UV4H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 54
+#define UV4H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 55
+#define UV4H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 56
+#define UV4H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 57
+#define UV4H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 58
+#define UV4H_EVENT_OCCURRED0_IPI_INT_SHFT 59
+#define UV4H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 60
+#define UV4H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 61
+#define UV4H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 62
+#define UV4H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 63
+#define UV4H_EVENT_OCCURRED0_KT_HCERR_MASK 0x0000000000000002UL
+#define UV4H_EVENT_OCCURRED0_KT_AOERR0_MASK 0x0000000000000400UL
+#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR0_MASK 0x0000000000020000UL
+#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR0_MASK 0x0000000000040000UL
+#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR0_MASK 0x0000000000080000UL
+#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR0_MASK 0x0000000000100000UL
+#define UV4H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000200000UL
+#define UV4H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000400000UL
+#define UV4H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000800000UL
+#define UV4H_EVENT_OCCURRED0_KT_AOERR1_MASK 0x0000000001000000UL
+#define UV4H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000002000000UL
+#define UV4H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000004000000UL
+#define UV4H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000008000000UL
+#define UV4H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000010000000UL
+#define UV4H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000020000000UL
+#define UV4H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000040000000UL
+#define UV4H_EVENT_OCCURRED0_RTQ0_AOERR1_MASK 0x0000000080000000UL
+#define UV4H_EVENT_OCCURRED0_RTQ1_AOERR1_MASK 0x0000000100000000UL
+#define UV4H_EVENT_OCCURRED0_RTQ2_AOERR1_MASK 0x0000000200000000UL
+#define UV4H_EVENT_OCCURRED0_RTQ3_AOERR1_MASK 0x0000000400000000UL
+#define UV4H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000800000000UL
+#define UV4H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000001000000000UL
+#define UV4H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000002000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000004000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000008000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000010000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000020000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000040000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000080000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000100000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000200000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000400000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000800000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0001000000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0002000000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0004000000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0008000000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0010000000000000UL
+#define UV4H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0020000000000000UL
+#define UV4H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0040000000000000UL
+#define UV4H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0080000000000000UL
+#define UV4H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0100000000000000UL
+#define UV4H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0200000000000000UL
+#define UV4H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0400000000000000UL
+#define UV4H_EVENT_OCCURRED0_IPI_INT_MASK 0x0800000000000000UL
+#define UV4H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x1000000000000000UL
+#define UV4H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x2000000000000000UL
+#define UV4H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x4000000000000000UL
+#define UV4H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x8000000000000000UL
+
+#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT ( \
+ is_uv1_hub() ? UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT : \
+ is_uv2_hub() ? UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT : \
+ is_uv3_hub() ? UV3H_EVENT_OCCURRED0_EXTIO_INT0_SHFT : \
+ /*is_uv4_hub*/ UV4H_EVENT_OCCURRED0_EXTIO_INT0_SHFT)
+
+union uvh_event_occurred0_u {
+ unsigned long v;
+ struct uvh_event_occurred0_s {
+ unsigned long lb_hcerr:1; /* RW, W1C */
+ unsigned long rsvd_1_10:10;
+ unsigned long rh_aoerr0:1; /* RW, W1C */
+ unsigned long rsvd_12_63:52;
+ } s;
+ struct uvxh_event_occurred0_s {
+ unsigned long lb_hcerr:1; /* RW */
+ unsigned long rsvd_1:1;
+ unsigned long rh_hcerr:1; /* RW */
+ unsigned long lh0_hcerr:1; /* RW */
+ unsigned long lh1_hcerr:1; /* RW */
+ unsigned long gr0_hcerr:1; /* RW */
+ unsigned long gr1_hcerr:1; /* RW */
+ unsigned long ni0_hcerr:1; /* RW */
+ unsigned long ni1_hcerr:1; /* RW */
+ unsigned long lb_aoerr0:1; /* RW */
+ unsigned long rsvd_10:1;
+ unsigned long rh_aoerr0:1; /* RW */
+ unsigned long lh0_aoerr0:1; /* RW */
+ unsigned long lh1_aoerr0:1; /* RW */
+ unsigned long gr0_aoerr0:1; /* RW */
+ unsigned long gr1_aoerr0:1; /* RW */
+ unsigned long xb_aoerr0:1; /* RW */
+ unsigned long rsvd_17_63:47;
+ } sx;
+ struct uv4h_event_occurred0_s {
+ unsigned long lb_hcerr:1; /* RW */
+ unsigned long kt_hcerr:1; /* RW */
+ unsigned long rh_hcerr:1; /* RW */
+ unsigned long lh0_hcerr:1; /* RW */
+ unsigned long lh1_hcerr:1; /* RW */
+ unsigned long gr0_hcerr:1; /* RW */
+ unsigned long gr1_hcerr:1; /* RW */
+ unsigned long ni0_hcerr:1; /* RW */
+ unsigned long ni1_hcerr:1; /* RW */
+ unsigned long lb_aoerr0:1; /* RW */
+ unsigned long kt_aoerr0:1; /* RW */
+ unsigned long rh_aoerr0:1; /* RW */
+ unsigned long lh0_aoerr0:1; /* RW */
+ unsigned long lh1_aoerr0:1; /* RW */
+ unsigned long gr0_aoerr0:1; /* RW */
+ unsigned long gr1_aoerr0:1; /* RW */
+ unsigned long xb_aoerr0:1; /* RW */
+ unsigned long rtq0_aoerr0:1; /* RW */
+ unsigned long rtq1_aoerr0:1; /* RW */
+ unsigned long rtq2_aoerr0:1; /* RW */
+ unsigned long rtq3_aoerr0:1; /* RW */
+ unsigned long ni0_aoerr0:1; /* RW */
+ unsigned long ni1_aoerr0:1; /* RW */
+ unsigned long lb_aoerr1:1; /* RW */
+ unsigned long kt_aoerr1:1; /* RW */
+ unsigned long rh_aoerr1:1; /* RW */
+ unsigned long lh0_aoerr1:1; /* RW */
+ unsigned long lh1_aoerr1:1; /* RW */
+ unsigned long gr0_aoerr1:1; /* RW */
+ unsigned long gr1_aoerr1:1; /* RW */
+ unsigned long xb_aoerr1:1; /* RW */
+ unsigned long rtq0_aoerr1:1; /* RW */
+ unsigned long rtq1_aoerr1:1; /* RW */
+ unsigned long rtq2_aoerr1:1; /* RW */
+ unsigned long rtq3_aoerr1:1; /* RW */
+ unsigned long ni0_aoerr1:1; /* RW */
+ unsigned long ni1_aoerr1:1; /* RW */
+ unsigned long system_shutdown_int:1; /* RW */
+ unsigned long lb_irq_int_0:1; /* RW */
+ unsigned long lb_irq_int_1:1; /* RW */
+ unsigned long lb_irq_int_2:1; /* RW */
+ unsigned long lb_irq_int_3:1; /* RW */
+ unsigned long lb_irq_int_4:1; /* RW */
+ unsigned long lb_irq_int_5:1; /* RW */
+ unsigned long lb_irq_int_6:1; /* RW */
+ unsigned long lb_irq_int_7:1; /* RW */
+ unsigned long lb_irq_int_8:1; /* RW */
+ unsigned long lb_irq_int_9:1; /* RW */
+ unsigned long lb_irq_int_10:1; /* RW */
+ unsigned long lb_irq_int_11:1; /* RW */
+ unsigned long lb_irq_int_12:1; /* RW */
+ unsigned long lb_irq_int_13:1; /* RW */
+ unsigned long lb_irq_int_14:1; /* RW */
+ unsigned long lb_irq_int_15:1; /* RW */
+ unsigned long l1_nmi_int:1; /* RW */
+ unsigned long stop_clock:1; /* RW */
+ unsigned long asic_to_l1:1; /* RW */
+ unsigned long l1_to_asic:1; /* RW */
+ unsigned long la_seq_trigger:1; /* RW */
+ unsigned long ipi_int:1; /* RW */
+ unsigned long extio_int0:1; /* RW */
+ unsigned long extio_int1:1; /* RW */
+ unsigned long extio_int2:1; /* RW */
+ unsigned long extio_int3:1; /* RW */
+ } s4;
+};
+
+/* ========================================================================= */
+/* UVH_EVENT_OCCURRED0_ALIAS */
+/* ========================================================================= */
+#define UVH_EVENT_OCCURRED0_ALIAS 0x70008UL
+#define UVH_EVENT_OCCURRED0_ALIAS_32 0x5f0
+
+
+/* ========================================================================= */
+/* UVH_EXTIO_INT0_BROADCAST */
+/* ========================================================================= */
+#define UVH_EXTIO_INT0_BROADCAST 0x61448UL
+
+#define UV1H_EXTIO_INT0_BROADCAST_32 0x3f0
+#define UV2H_EXTIO_INT0_BROADCAST_32 0x3f0
+#define UV3H_EXTIO_INT0_BROADCAST_32 0x3f0
+#define UV4H_EXTIO_INT0_BROADCAST_32 0x310
+#define UVH_EXTIO_INT0_BROADCAST_32 ( \
+ is_uv1_hub() ? UV1H_EXTIO_INT0_BROADCAST_32 : \
+ is_uv2_hub() ? UV2H_EXTIO_INT0_BROADCAST_32 : \
+ is_uv3_hub() ? UV3H_EXTIO_INT0_BROADCAST_32 : \
+ /*is_uv4_hub*/ UV4H_EXTIO_INT0_BROADCAST_32)
+
+#define UVH_EXTIO_INT0_BROADCAST_ENABLE_SHFT 0
+#define UVH_EXTIO_INT0_BROADCAST_ENABLE_MASK 0x0000000000000001UL
+
+
+union uvh_extio_int0_broadcast_u {
+ unsigned long v;
+ struct uvh_extio_int0_broadcast_s {
+ unsigned long enable:1; /* RW */
+ unsigned long rsvd_1_63:63;
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_GR0_TLB_INT0_CONFIG */
+/* ========================================================================= */
+#define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL
+
+#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0
+#define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT 8
+#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT 11
+#define UVH_GR0_TLB_INT0_CONFIG_STATUS_SHFT 12
+#define UVH_GR0_TLB_INT0_CONFIG_P_SHFT 13
+#define UVH_GR0_TLB_INT0_CONFIG_T_SHFT 15
+#define UVH_GR0_TLB_INT0_CONFIG_M_SHFT 16
+#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT 32
+#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_GR0_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_GR0_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_GR0_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_GR0_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+
+union uvh_gr0_tlb_int0_config_u {
+ unsigned long v;
+ struct uvh_gr0_tlb_int0_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_GR0_TLB_INT1_CONFIG */
+/* ========================================================================= */
+#define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL
+
+#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0
+#define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT 8
+#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT 11
+#define UVH_GR0_TLB_INT1_CONFIG_STATUS_SHFT 12
+#define UVH_GR0_TLB_INT1_CONFIG_P_SHFT 13
+#define UVH_GR0_TLB_INT1_CONFIG_T_SHFT 15
+#define UVH_GR0_TLB_INT1_CONFIG_M_SHFT 16
+#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT 32
+#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_GR0_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_GR0_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_GR0_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_GR0_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+
+union uvh_gr0_tlb_int1_config_u {
+ unsigned long v;
+ struct uvh_gr0_tlb_int1_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_GR0_TLB_MMR_CONTROL */
+/* ========================================================================= */
+#define UV1H_GR0_TLB_MMR_CONTROL 0x401080UL
+#define UV2H_GR0_TLB_MMR_CONTROL 0xc01080UL
+#define UV3H_GR0_TLB_MMR_CONTROL 0xc01080UL
+#define UV4H_GR0_TLB_MMR_CONTROL 0x601080UL
+#define UVH_GR0_TLB_MMR_CONTROL ( \
+ is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL : \
+ is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL : \
+ is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL : \
+ /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL)
+
+#define UVH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+
+#define UV1H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UV1H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_SHFT 54
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_SHFT 56
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_SHFT 60
+#define UV1H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_MASK 0x0040000000000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_MASK 0x0100000000000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL
+
+#define UVXH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UVXH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UVXH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UVXH_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UVXH_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UVXH_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32
+#define UVXH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UVXH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UVXH_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UVXH_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UVXH_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL
+
+#define UV2H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UV2H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52
+#define UV2H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL
+
+#define UV3H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UV3H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UV3H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UV3H_GR0_TLB_MMR_CONTROL_ECC_SEL_SHFT 21
+#define UV3H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UV3H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UV3H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32
+#define UV3H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UV3H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UV3H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UV3H_GR0_TLB_MMR_CONTROL_ECC_SEL_MASK 0x0000000000200000UL
+#define UV3H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UV3H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UV3H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL
+
+#define UV4H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 13
+#define UV4H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UV4H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UV4H_GR0_TLB_MMR_CONTROL_ECC_SEL_SHFT 21
+#define UV4H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UV4H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UV4H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32
+#define UV4H_GR0_TLB_MMR_CONTROL_PAGE_SIZE_SHFT 59
+#define UV4H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000001fffUL
+#define UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000006000UL
+#define UV4H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UV4H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UV4H_GR0_TLB_MMR_CONTROL_ECC_SEL_MASK 0x0000000000200000UL
+#define UV4H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UV4H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UV4H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL
+#define UV4H_GR0_TLB_MMR_CONTROL_PAGE_SIZE_MASK 0xf800000000000000UL
+
+#define UVH_GR0_TLB_MMR_CONTROL_INDEX_MASK ( \
+ is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL_INDEX_MASK : \
+ is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL_INDEX_MASK : \
+ is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL_INDEX_MASK : \
+ /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL_INDEX_MASK)
+#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK ( \
+ is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK : \
+ is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK : \
+ is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK : \
+ /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK)
+#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT ( \
+ is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT : \
+ is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT : \
+ is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT : \
+ /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT)
+
+union uvh_gr0_tlb_mmr_control_u {
+ unsigned long v;
+ struct uvh_gr0_tlb_mmr_control_s {
+ unsigned long rsvd_0_15:16;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long rsvd_32_48:17;
+ unsigned long rsvd_49_51:3;
+ unsigned long rsvd_52_63:12;
+ } s;
+ struct uv1h_gr0_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long rsvd_32_47:16;
+ unsigned long mmr_inj_con:1; /* RW */
+ unsigned long rsvd_49_51:3;
+ unsigned long mmr_inj_tlbram:1; /* RW */
+ unsigned long rsvd_53:1;
+ unsigned long mmr_inj_tlbpgsize:1; /* RW */
+ unsigned long rsvd_55:1;
+ unsigned long mmr_inj_tlbrreg:1; /* RW */
+ unsigned long rsvd_57_59:3;
+ unsigned long mmr_inj_tlblruv:1; /* RW */
+ unsigned long rsvd_61_63:3;
+ } s1;
+ struct uvxh_gr0_tlb_mmr_control_s {
+ unsigned long rsvd_0_15:16;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long mmr_op_done:1; /* RW */
+ unsigned long rsvd_33_47:15;
+ unsigned long rsvd_48:1;
+ unsigned long rsvd_49_51:3;
+ unsigned long rsvd_52_63:12;
+ } sx;
+ struct uv2h_gr0_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long mmr_op_done:1; /* RW */
+ unsigned long rsvd_33_47:15;
+ unsigned long mmr_inj_con:1; /* RW */
+ unsigned long rsvd_49_51:3;
+ unsigned long mmr_inj_tlbram:1; /* RW */
+ unsigned long rsvd_53_63:11;
+ } s2;
+ struct uv3h_gr0_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long ecc_sel:1; /* RW */
+ unsigned long rsvd_22_29:8;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long mmr_op_done:1; /* RW */
+ unsigned long rsvd_33_47:15;
+ unsigned long undef_48:1; /* Undefined */
+ unsigned long rsvd_49_51:3;
+ unsigned long undef_52:1; /* Undefined */
+ unsigned long rsvd_53_63:11;
+ } s3;
+ struct uv4h_gr0_tlb_mmr_control_s {
+ unsigned long index:13; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_15:1;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long ecc_sel:1; /* RW */
+ unsigned long rsvd_22_29:8;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long mmr_op_done:1; /* RW */
+ unsigned long rsvd_33_47:15;
+ unsigned long undef_48:1; /* Undefined */
+ unsigned long rsvd_49_51:3;
+ unsigned long rsvd_52_58:7;
+ unsigned long page_size:5; /* RW */
+ } s4;
+};
+
+/* ========================================================================= */
+/* UVH_GR0_TLB_MMR_READ_DATA_HI */
+/* ========================================================================= */
+#define UV1H_GR0_TLB_MMR_READ_DATA_HI 0x4010a0UL
+#define UV2H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL
+#define UV4H_GR0_TLB_MMR_READ_DATA_HI 0x6010a0UL
+#define UVH_GR0_TLB_MMR_READ_DATA_HI ( \
+ is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_HI : \
+ is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_HI : \
+ is_uv3_hub() ? UV3H_GR0_TLB_MMR_READ_DATA_HI : \
+ /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_READ_DATA_HI)
+
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+
+#define UV1H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+#define UV1H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
+#define UV1H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43
+#define UV1H_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44
+#define UV1H_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL
+#define UV1H_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL
+#define UV1H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
+#define UV1H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
+
+#define UVXH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+
+#define UV2H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+#define UV2H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
+#define UV2H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43
+#define UV2H_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44
+#define UV2H_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL
+#define UV2H_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL
+#define UV2H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
+#define UV2H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
+
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_SHFT 45
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_SHFT 55
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0000200000000000UL
+#define UV3H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL
+
+#define UV4H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+#define UV4H_GR0_TLB_MMR_READ_DATA_HI_PNID_SHFT 34
+#define UV4H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 49
+#define UV4H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 51
+#define UV4H_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 52
+#define UV4H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_SHFT 53
+#define UV4H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_SHFT 55
+#define UV4H_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x00000003ffffffffUL
+#define UV4H_GR0_TLB_MMR_READ_DATA_HI_PNID_MASK 0x0001fffc00000000UL
+#define UV4H_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0006000000000000UL
+#define UV4H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0008000000000000UL
+#define UV4H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0010000000000000UL
+#define UV4H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0020000000000000UL
+#define UV4H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL
+
+
+union uvh_gr0_tlb_mmr_read_data_hi_u {
+ unsigned long v;
+ struct uv1h_gr0_tlb_mmr_read_data_hi_s {
+ unsigned long pfn:41; /* RO */
+ unsigned long gaa:2; /* RO */
+ unsigned long dirty:1; /* RO */
+ unsigned long larger:1; /* RO */
+ unsigned long rsvd_45_63:19;
+ } s1;
+ struct uv2h_gr0_tlb_mmr_read_data_hi_s {
+ unsigned long pfn:41; /* RO */
+ unsigned long gaa:2; /* RO */
+ unsigned long dirty:1; /* RO */
+ unsigned long larger:1; /* RO */
+ unsigned long rsvd_45_63:19;
+ } s2;
+ struct uv3h_gr0_tlb_mmr_read_data_hi_s {
+ unsigned long pfn:41; /* RO */
+ unsigned long gaa:2; /* RO */
+ unsigned long dirty:1; /* RO */
+ unsigned long larger:1; /* RO */
+ unsigned long aa_ext:1; /* RO */
+ unsigned long undef_46_54:9; /* Undefined */
+ unsigned long way_ecc:9; /* RO */
+ } s3;
+ struct uv4h_gr0_tlb_mmr_read_data_hi_s {
+ unsigned long pfn:34; /* RO */
+ unsigned long pnid:15; /* RO */
+ unsigned long gaa:2; /* RO */
+ unsigned long dirty:1; /* RO */
+ unsigned long larger:1; /* RO */
+ unsigned long aa_ext:1; /* RO */
+ unsigned long undef_54:1; /* Undefined */
+ unsigned long way_ecc:9; /* RO */
+ } s4;
+};
+
+/* ========================================================================= */
+/* UVH_GR0_TLB_MMR_READ_DATA_LO */
+/* ========================================================================= */
+#define UV1H_GR0_TLB_MMR_READ_DATA_LO 0x4010a8UL
+#define UV2H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL
+#define UV3H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL
+#define UV4H_GR0_TLB_MMR_READ_DATA_LO 0x6010a8UL
+#define UVH_GR0_TLB_MMR_READ_DATA_LO ( \
+ is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_LO : \
+ is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_LO : \
+ is_uv3_hub() ? UV3H_GR0_TLB_MMR_READ_DATA_LO : \
+ /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_READ_DATA_LO)
+
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+#define UV1H_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UV1H_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UV1H_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UV1H_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UV1H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UV1H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+#define UVXH_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UVXH_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UVXH_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UVXH_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UVXH_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UVXH_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+#define UV2H_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UV2H_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UV2H_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UV2H_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UV2H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UV2H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+#define UV3H_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UV3H_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UV3H_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UV3H_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UV3H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UV3H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+#define UV4H_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UV4H_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UV4H_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UV4H_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UV4H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UV4H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+
+union uvh_gr0_tlb_mmr_read_data_lo_u {
+ unsigned long v;
+ struct uvh_gr0_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } s;
+ struct uv1h_gr0_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } s1;
+ struct uvxh_gr0_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } sx;
+ struct uv2h_gr0_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } s2;
+ struct uv3h_gr0_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } s3;
+ struct uv4h_gr0_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } s4;
+};
+
+/* ========================================================================= */
+/* UVH_GR1_TLB_INT0_CONFIG */
+/* ========================================================================= */
+#define UV1H_GR1_TLB_INT0_CONFIG 0x61f00UL
+#define UV2H_GR1_TLB_INT0_CONFIG 0x61f00UL
+#define UV3H_GR1_TLB_INT0_CONFIG 0x61f00UL
+#define UV4H_GR1_TLB_INT0_CONFIG 0x62100UL
+#define UVH_GR1_TLB_INT0_CONFIG ( \
+ is_uv1_hub() ? UV1H_GR1_TLB_INT0_CONFIG : \
+ is_uv2_hub() ? UV2H_GR1_TLB_INT0_CONFIG : \
+ is_uv3_hub() ? UV3H_GR1_TLB_INT0_CONFIG : \
+ /*is_uv4_hub*/ UV4H_GR1_TLB_INT0_CONFIG)
+
+#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0
+#define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8
+#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT 11
+#define UVH_GR1_TLB_INT0_CONFIG_STATUS_SHFT 12
+#define UVH_GR1_TLB_INT0_CONFIG_P_SHFT 13
+#define UVH_GR1_TLB_INT0_CONFIG_T_SHFT 15
+#define UVH_GR1_TLB_INT0_CONFIG_M_SHFT 16
+#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT 32
+#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_GR1_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_GR1_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_GR1_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_GR1_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+
+union uvh_gr1_tlb_int0_config_u {
+ unsigned long v;
+ struct uvh_gr1_tlb_int0_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_GR1_TLB_INT1_CONFIG */
+/* ========================================================================= */
+#define UV1H_GR1_TLB_INT1_CONFIG 0x61f40UL
+#define UV2H_GR1_TLB_INT1_CONFIG 0x61f40UL
+#define UV3H_GR1_TLB_INT1_CONFIG 0x61f40UL
+#define UV4H_GR1_TLB_INT1_CONFIG 0x62140UL
+#define UVH_GR1_TLB_INT1_CONFIG ( \
+ is_uv1_hub() ? UV1H_GR1_TLB_INT1_CONFIG : \
+ is_uv2_hub() ? UV2H_GR1_TLB_INT1_CONFIG : \
+ is_uv3_hub() ? UV3H_GR1_TLB_INT1_CONFIG : \
+ /*is_uv4_hub*/ UV4H_GR1_TLB_INT1_CONFIG)
+
+#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0
+#define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8
+#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT 11
+#define UVH_GR1_TLB_INT1_CONFIG_STATUS_SHFT 12
+#define UVH_GR1_TLB_INT1_CONFIG_P_SHFT 13
+#define UVH_GR1_TLB_INT1_CONFIG_T_SHFT 15
+#define UVH_GR1_TLB_INT1_CONFIG_M_SHFT 16
+#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT 32
+#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_GR1_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_GR1_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_GR1_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_GR1_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+
+union uvh_gr1_tlb_int1_config_u {
+ unsigned long v;
+ struct uvh_gr1_tlb_int1_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_GR1_TLB_MMR_CONTROL */
+/* ========================================================================= */
+#define UV1H_GR1_TLB_MMR_CONTROL 0x801080UL
+#define UV2H_GR1_TLB_MMR_CONTROL 0x1001080UL
+#define UV3H_GR1_TLB_MMR_CONTROL 0x1001080UL
+#define UV4H_GR1_TLB_MMR_CONTROL 0x701080UL
+#define UVH_GR1_TLB_MMR_CONTROL ( \
+ is_uv1_hub() ? UV1H_GR1_TLB_MMR_CONTROL : \
+ is_uv2_hub() ? UV2H_GR1_TLB_MMR_CONTROL : \
+ is_uv3_hub() ? UV3H_GR1_TLB_MMR_CONTROL : \
+ /*is_uv4_hub*/ UV4H_GR1_TLB_MMR_CONTROL)
+
+#define UVH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+
+#define UV1H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UV1H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UV1H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_SHFT 54
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_SHFT 56
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_SHFT 60
+#define UV1H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UV1H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_MASK 0x0040000000000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_MASK 0x0100000000000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL
+
+#define UVXH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UVXH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UVXH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UVXH_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UVXH_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UVXH_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32
+#define UVXH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UVXH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UVXH_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UVXH_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UVXH_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL
+
+#define UV2H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UV2H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UV2H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52
+#define UV2H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UV2H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL
+
+#define UV3H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UV3H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12
+#define UV3H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UV3H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UV3H_GR1_TLB_MMR_CONTROL_ECC_SEL_SHFT 21
+#define UV3H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UV3H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UV3H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32
+#define UV3H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL
+#define UV3H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL
+#define UV3H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UV3H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UV3H_GR1_TLB_MMR_CONTROL_ECC_SEL_MASK 0x0000000000200000UL
+#define UV3H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UV3H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UV3H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL
+
+#define UV4H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0
+#define UV4H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 13
+#define UV4H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16
+#define UV4H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20
+#define UV4H_GR1_TLB_MMR_CONTROL_ECC_SEL_SHFT 21
+#define UV4H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30
+#define UV4H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31
+#define UV4H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32
+#define UV4H_GR1_TLB_MMR_CONTROL_PAGE_SIZE_SHFT 59
+#define UV4H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000001fffUL
+#define UV4H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000006000UL
+#define UV4H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL
+#define UV4H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL
+#define UV4H_GR1_TLB_MMR_CONTROL_ECC_SEL_MASK 0x0000000000200000UL
+#define UV4H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL
+#define UV4H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL
+#define UV4H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL
+#define UV4H_GR1_TLB_MMR_CONTROL_PAGE_SIZE_MASK 0xf800000000000000UL
+
+
+union uvh_gr1_tlb_mmr_control_u {
+ unsigned long v;
+ struct uvh_gr1_tlb_mmr_control_s {
+ unsigned long rsvd_0_15:16;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long rsvd_32_48:17;
+ unsigned long rsvd_49_51:3;
+ unsigned long rsvd_52_63:12;
+ } s;
+ struct uv1h_gr1_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long rsvd_32_47:16;
+ unsigned long mmr_inj_con:1; /* RW */
+ unsigned long rsvd_49_51:3;
+ unsigned long mmr_inj_tlbram:1; /* RW */
+ unsigned long rsvd_53:1;
+ unsigned long mmr_inj_tlbpgsize:1; /* RW */
+ unsigned long rsvd_55:1;
+ unsigned long mmr_inj_tlbrreg:1; /* RW */
+ unsigned long rsvd_57_59:3;
+ unsigned long mmr_inj_tlblruv:1; /* RW */
+ unsigned long rsvd_61_63:3;
+ } s1;
+ struct uvxh_gr1_tlb_mmr_control_s {
+ unsigned long rsvd_0_15:16;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long mmr_op_done:1; /* RW */
+ unsigned long rsvd_33_47:15;
+ unsigned long rsvd_48:1;
+ unsigned long rsvd_49_51:3;
+ unsigned long rsvd_52_63:12;
+ } sx;
+ struct uv2h_gr1_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long rsvd_21_29:9;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long mmr_op_done:1; /* RW */
+ unsigned long rsvd_33_47:15;
+ unsigned long mmr_inj_con:1; /* RW */
+ unsigned long rsvd_49_51:3;
+ unsigned long mmr_inj_tlbram:1; /* RW */
+ unsigned long rsvd_53_63:11;
+ } s2;
+ struct uv3h_gr1_tlb_mmr_control_s {
+ unsigned long index:12; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_14_15:2;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long ecc_sel:1; /* RW */
+ unsigned long rsvd_22_29:8;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long mmr_op_done:1; /* RW */
+ unsigned long rsvd_33_47:15;
+ unsigned long undef_48:1; /* Undefined */
+ unsigned long rsvd_49_51:3;
+ unsigned long undef_52:1; /* Undefined */
+ unsigned long rsvd_53_63:11;
+ } s3;
+ struct uv4h_gr1_tlb_mmr_control_s {
+ unsigned long index:13; /* RW */
+ unsigned long mem_sel:2; /* RW */
+ unsigned long rsvd_15:1;
+ unsigned long auto_valid_en:1; /* RW */
+ unsigned long rsvd_17_19:3;
+ unsigned long mmr_hash_index_en:1; /* RW */
+ unsigned long ecc_sel:1; /* RW */
+ unsigned long rsvd_22_29:8;
+ unsigned long mmr_write:1; /* WP */
+ unsigned long mmr_read:1; /* WP */
+ unsigned long mmr_op_done:1; /* RW */
+ unsigned long rsvd_33_47:15;
+ unsigned long undef_48:1; /* Undefined */
+ unsigned long rsvd_49_51:3;
+ unsigned long rsvd_52_58:7;
+ unsigned long page_size:5; /* RW */
+ } s4;
+};
+
+/* ========================================================================= */
+/* UVH_GR1_TLB_MMR_READ_DATA_HI */
+/* ========================================================================= */
+#define UV1H_GR1_TLB_MMR_READ_DATA_HI 0x8010a0UL
+#define UV2H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL
+#define UV4H_GR1_TLB_MMR_READ_DATA_HI 0x7010a0UL
+#define UVH_GR1_TLB_MMR_READ_DATA_HI ( \
+ is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_HI : \
+ is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_HI : \
+ is_uv3_hub() ? UV3H_GR1_TLB_MMR_READ_DATA_HI : \
+ /*is_uv4_hub*/ UV4H_GR1_TLB_MMR_READ_DATA_HI)
+
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+
+#define UV1H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+#define UV1H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
+#define UV1H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43
+#define UV1H_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44
+#define UV1H_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL
+#define UV1H_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL
+#define UV1H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
+#define UV1H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
+
+#define UVXH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+
+#define UV2H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+#define UV2H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
+#define UV2H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43
+#define UV2H_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44
+#define UV2H_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL
+#define UV2H_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL
+#define UV2H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
+#define UV2H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
+
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_SHFT 45
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_SHFT 55
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0000200000000000UL
+#define UV3H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL
+
+#define UV4H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0
+#define UV4H_GR1_TLB_MMR_READ_DATA_HI_PNID_SHFT 34
+#define UV4H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 49
+#define UV4H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 51
+#define UV4H_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 52
+#define UV4H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_SHFT 53
+#define UV4H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_SHFT 55
+#define UV4H_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x00000003ffffffffUL
+#define UV4H_GR1_TLB_MMR_READ_DATA_HI_PNID_MASK 0x0001fffc00000000UL
+#define UV4H_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0006000000000000UL
+#define UV4H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0008000000000000UL
+#define UV4H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0010000000000000UL
+#define UV4H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0020000000000000UL
+#define UV4H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL
+
+
+union uvh_gr1_tlb_mmr_read_data_hi_u {
+ unsigned long v;
+ struct uv1h_gr1_tlb_mmr_read_data_hi_s {
+ unsigned long pfn:41; /* RO */
+ unsigned long gaa:2; /* RO */
+ unsigned long dirty:1; /* RO */
+ unsigned long larger:1; /* RO */
+ unsigned long rsvd_45_63:19;
+ } s1;
+ struct uv2h_gr1_tlb_mmr_read_data_hi_s {
+ unsigned long pfn:41; /* RO */
+ unsigned long gaa:2; /* RO */
+ unsigned long dirty:1; /* RO */
+ unsigned long larger:1; /* RO */
+ unsigned long rsvd_45_63:19;
+ } s2;
+ struct uv3h_gr1_tlb_mmr_read_data_hi_s {
+ unsigned long pfn:41; /* RO */
+ unsigned long gaa:2; /* RO */
+ unsigned long dirty:1; /* RO */
+ unsigned long larger:1; /* RO */
+ unsigned long aa_ext:1; /* RO */
+ unsigned long undef_46_54:9; /* Undefined */
+ unsigned long way_ecc:9; /* RO */
+ } s3;
+ struct uv4h_gr1_tlb_mmr_read_data_hi_s {
+ unsigned long pfn:34; /* RO */
+ unsigned long pnid:15; /* RO */
+ unsigned long gaa:2; /* RO */
+ unsigned long dirty:1; /* RO */
+ unsigned long larger:1; /* RO */
+ unsigned long aa_ext:1; /* RO */
+ unsigned long undef_54:1; /* Undefined */
+ unsigned long way_ecc:9; /* RO */
+ } s4;
+};
+
+/* ========================================================================= */
+/* UVH_GR1_TLB_MMR_READ_DATA_LO */
+/* ========================================================================= */
+#define UV1H_GR1_TLB_MMR_READ_DATA_LO 0x8010a8UL
+#define UV2H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL
+#define UV3H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL
+#define UV4H_GR1_TLB_MMR_READ_DATA_LO 0x7010a8UL
+#define UVH_GR1_TLB_MMR_READ_DATA_LO ( \
+ is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_LO : \
+ is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_LO : \
+ is_uv3_hub() ? UV3H_GR1_TLB_MMR_READ_DATA_LO : \
+ /*is_uv4_hub*/ UV4H_GR1_TLB_MMR_READ_DATA_LO)
+
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+#define UV1H_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UV1H_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UV1H_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UV1H_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UV1H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UV1H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+#define UVXH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UVXH_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UVXH_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UVXH_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UVXH_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UVXH_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+#define UV2H_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UV2H_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UV2H_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UV2H_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UV2H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UV2H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+#define UV3H_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UV3H_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UV3H_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UV3H_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UV3H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UV3H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+#define UV4H_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0
+#define UV4H_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39
+#define UV4H_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63
+#define UV4H_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL
+#define UV4H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL
+#define UV4H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL
+
+
+union uvh_gr1_tlb_mmr_read_data_lo_u {
+ unsigned long v;
+ struct uvh_gr1_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } s;
+ struct uv1h_gr1_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } s1;
+ struct uvxh_gr1_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } sx;
+ struct uv2h_gr1_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } s2;
+ struct uv3h_gr1_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } s3;
+ struct uv4h_gr1_tlb_mmr_read_data_lo_s {
+ unsigned long vpn:39; /* RO */
+ unsigned long asid:24; /* RO */
+ unsigned long valid:1; /* RO */
+ } s4;
+};
+
+/* ========================================================================= */
+/* UVH_INT_CMPB */
+/* ========================================================================= */
+#define UVH_INT_CMPB 0x22080UL
+
+#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0
+#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL
+
+
+union uvh_int_cmpb_u {
+ unsigned long v;
+ struct uvh_int_cmpb_s {
+ unsigned long real_time_cmpb:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_INT_CMPC */
+/* ========================================================================= */
+#define UVH_INT_CMPC 0x22100UL
+
+
+#define UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT 0
+#define UV1H_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL
+
+#define UVXH_INT_CMPC_REAL_TIME_CMP_2_SHFT 0
+#define UVXH_INT_CMPC_REAL_TIME_CMP_2_MASK 0x00ffffffffffffffUL
+
+
+union uvh_int_cmpc_u {
+ unsigned long v;
+ struct uvh_int_cmpc_s {
+ unsigned long real_time_cmpc:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_INT_CMPD */
+/* ========================================================================= */
+#define UVH_INT_CMPD 0x22180UL
+
+
+#define UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT 0
+#define UV1H_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL
+
+#define UVXH_INT_CMPD_REAL_TIME_CMP_3_SHFT 0
+#define UVXH_INT_CMPD_REAL_TIME_CMP_3_MASK 0x00ffffffffffffffUL
+
+
+union uvh_int_cmpd_u {
+ unsigned long v;
+ struct uvh_int_cmpd_s {
+ unsigned long real_time_cmpd:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_IPI_INT */
+/* ========================================================================= */
+#define UVH_IPI_INT 0x60500UL
+
+#define UV1H_IPI_INT_32 0x348
+#define UV2H_IPI_INT_32 0x348
+#define UV3H_IPI_INT_32 0x348
+#define UV4H_IPI_INT_32 0x268
+#define UVH_IPI_INT_32 ( \
+ is_uv1_hub() ? UV1H_IPI_INT_32 : \
+ is_uv2_hub() ? UV2H_IPI_INT_32 : \
+ is_uv3_hub() ? UV3H_IPI_INT_32 : \
+ /*is_uv4_hub*/ UV4H_IPI_INT_32)
+
+#define UVH_IPI_INT_VECTOR_SHFT 0
+#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8
+#define UVH_IPI_INT_DESTMODE_SHFT 11
+#define UVH_IPI_INT_APIC_ID_SHFT 16
+#define UVH_IPI_INT_SEND_SHFT 63
+#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_IPI_INT_DELIVERY_MODE_MASK 0x0000000000000700UL
+#define UVH_IPI_INT_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL
+#define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL
+
+
+union uvh_ipi_int_u {
+ unsigned long v;
+ struct uvh_ipi_int_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long delivery_mode:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long rsvd_12_15:4;
+ unsigned long apic_id:32; /* RW */
+ unsigned long rsvd_48_62:15;
+ unsigned long send:1; /* WP */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */
+/* ========================================================================= */
+#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL
+#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL
+#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL
+#define UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST uv_undefined("UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST")
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST ( \
+ is_uv1_hub() ? UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST : \
+ is_uv2_hub() ? UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST : \
+ is_uv3_hub() ? UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST)
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0
+
+
+#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4
+#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49
+#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL
+#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL
+
+
+#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4
+#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49
+#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL
+#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL
+
+#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4
+#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49
+#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL
+#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL
+
+
+union uvh_lb_bau_intd_payload_queue_first_u {
+ unsigned long v;
+ struct uv1h_lb_bau_intd_payload_queue_first_s {
+ unsigned long rsvd_0_3:4;
+ unsigned long address:39; /* RW */
+ unsigned long rsvd_43_48:6;
+ unsigned long node_id:14; /* RW */
+ unsigned long rsvd_63:1;
+ } s1;
+ struct uv2h_lb_bau_intd_payload_queue_first_s {
+ unsigned long rsvd_0_3:4;
+ unsigned long address:39; /* RW */
+ unsigned long rsvd_43_48:6;
+ unsigned long node_id:14; /* RW */
+ unsigned long rsvd_63:1;
+ } s2;
+ struct uv3h_lb_bau_intd_payload_queue_first_s {
+ unsigned long rsvd_0_3:4;
+ unsigned long address:39; /* RW */
+ unsigned long rsvd_43_48:6;
+ unsigned long node_id:14; /* RW */
+ unsigned long rsvd_63:1;
+ } s3;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */
+/* ========================================================================= */
+#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL
+#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL
+#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL
+#define UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST uv_undefined("UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST")
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST ( \
+ is_uv1_hub() ? UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST : \
+ is_uv2_hub() ? UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST : \
+ is_uv3_hub() ? UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST)
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8
+
+
+#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4
+#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL
+
+
+#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4
+#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL
+
+#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4
+#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL
+
+
+union uvh_lb_bau_intd_payload_queue_last_u {
+ unsigned long v;
+ struct uv1h_lb_bau_intd_payload_queue_last_s {
+ unsigned long rsvd_0_3:4;
+ unsigned long address:39; /* RW */
+ unsigned long rsvd_43_63:21;
+ } s1;
+ struct uv2h_lb_bau_intd_payload_queue_last_s {
+ unsigned long rsvd_0_3:4;
+ unsigned long address:39; /* RW */
+ unsigned long rsvd_43_63:21;
+ } s2;
+ struct uv3h_lb_bau_intd_payload_queue_last_s {
+ unsigned long rsvd_0_3:4;
+ unsigned long address:39; /* RW */
+ unsigned long rsvd_43_63:21;
+ } s3;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */
+/* ========================================================================= */
+#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL
+#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL
+#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL
+#define UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL uv_undefined("UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL")
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL ( \
+ is_uv1_hub() ? UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL : \
+ is_uv2_hub() ? UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL : \
+ is_uv3_hub() ? UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL)
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0
+
+
+#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4
+#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL
+
+
+#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4
+#define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL
+
+#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4
+#define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL
+
+
+union uvh_lb_bau_intd_payload_queue_tail_u {
+ unsigned long v;
+ struct uv1h_lb_bau_intd_payload_queue_tail_s {
+ unsigned long rsvd_0_3:4;
+ unsigned long address:39; /* RW */
+ unsigned long rsvd_43_63:21;
+ } s1;
+ struct uv2h_lb_bau_intd_payload_queue_tail_s {
+ unsigned long rsvd_0_3:4;
+ unsigned long address:39; /* RW */
+ unsigned long rsvd_43_63:21;
+ } s2;
+ struct uv3h_lb_bau_intd_payload_queue_tail_s {
+ unsigned long rsvd_0_3:4;
+ unsigned long address:39; /* RW */
+ unsigned long rsvd_43_63:21;
+ } s3;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */
+/* ========================================================================= */
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL
+#define UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE uv_undefined("UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE")
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE ( \
+ is_uv1_hub() ? UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE : \
+ is_uv2_hub() ? UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE : \
+ is_uv3_hub() ? UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE)
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68
+
+
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL
+
+
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL
+
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL
+
+
+union uvh_lb_bau_intd_software_acknowledge_u {
+ unsigned long v;
+ struct uv1h_lb_bau_intd_software_acknowledge_s {
+ unsigned long pending_0:1; /* RW, W1C */
+ unsigned long pending_1:1; /* RW, W1C */
+ unsigned long pending_2:1; /* RW, W1C */
+ unsigned long pending_3:1; /* RW, W1C */
+ unsigned long pending_4:1; /* RW, W1C */
+ unsigned long pending_5:1; /* RW, W1C */
+ unsigned long pending_6:1; /* RW, W1C */
+ unsigned long pending_7:1; /* RW, W1C */
+ unsigned long timeout_0:1; /* RW, W1C */
+ unsigned long timeout_1:1; /* RW, W1C */
+ unsigned long timeout_2:1; /* RW, W1C */
+ unsigned long timeout_3:1; /* RW, W1C */
+ unsigned long timeout_4:1; /* RW, W1C */
+ unsigned long timeout_5:1; /* RW, W1C */
+ unsigned long timeout_6:1; /* RW, W1C */
+ unsigned long timeout_7:1; /* RW, W1C */
+ unsigned long rsvd_16_63:48;
+ } s1;
+ struct uv2h_lb_bau_intd_software_acknowledge_s {
+ unsigned long pending_0:1; /* RW */
+ unsigned long pending_1:1; /* RW */
+ unsigned long pending_2:1; /* RW */
+ unsigned long pending_3:1; /* RW */
+ unsigned long pending_4:1; /* RW */
+ unsigned long pending_5:1; /* RW */
+ unsigned long pending_6:1; /* RW */
+ unsigned long pending_7:1; /* RW */
+ unsigned long timeout_0:1; /* RW */
+ unsigned long timeout_1:1; /* RW */
+ unsigned long timeout_2:1; /* RW */
+ unsigned long timeout_3:1; /* RW */
+ unsigned long timeout_4:1; /* RW */
+ unsigned long timeout_5:1; /* RW */
+ unsigned long timeout_6:1; /* RW */
+ unsigned long timeout_7:1; /* RW */
+ unsigned long rsvd_16_63:48;
+ } s2;
+ struct uv3h_lb_bau_intd_software_acknowledge_s {
+ unsigned long pending_0:1; /* RW */
+ unsigned long pending_1:1; /* RW */
+ unsigned long pending_2:1; /* RW */
+ unsigned long pending_3:1; /* RW */
+ unsigned long pending_4:1; /* RW */
+ unsigned long pending_5:1; /* RW */
+ unsigned long pending_6:1; /* RW */
+ unsigned long pending_7:1; /* RW */
+ unsigned long timeout_0:1; /* RW */
+ unsigned long timeout_1:1; /* RW */
+ unsigned long timeout_2:1; /* RW */
+ unsigned long timeout_3:1; /* RW */
+ unsigned long timeout_4:1; /* RW */
+ unsigned long timeout_5:1; /* RW */
+ unsigned long timeout_6:1; /* RW */
+ unsigned long timeout_7:1; /* RW */
+ unsigned long rsvd_16_63:48;
+ } s3;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */
+/* ========================================================================= */
+#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL
+#define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL
+#define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL
+#define UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS uv_undefined("UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS")
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS ( \
+ is_uv1_hub() ? UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS : \
+ is_uv2_hub() ? UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS : \
+ is_uv3_hub() ? UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS)
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70
+
+
+/* ========================================================================= */
+/* UVH_LB_BAU_MISC_CONTROL */
+/* ========================================================================= */
+#define UV1H_LB_BAU_MISC_CONTROL 0x320170UL
+#define UV2H_LB_BAU_MISC_CONTROL 0x320170UL
+#define UV3H_LB_BAU_MISC_CONTROL 0x320170UL
+#define UV4H_LB_BAU_MISC_CONTROL 0xc8170UL
+#define UVH_LB_BAU_MISC_CONTROL ( \
+ is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL : \
+ is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL : \
+ is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL)
+
+#define UV1H_LB_BAU_MISC_CONTROL_32 0xa10
+#define UV2H_LB_BAU_MISC_CONTROL_32 0xa10
+#define UV3H_LB_BAU_MISC_CONTROL_32 0xa10
+#define UV4H_LB_BAU_MISC_CONTROL_32 0xa18
+#define UVH_LB_BAU_MISC_CONTROL_32 ( \
+ is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_32 : \
+ is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_32 : \
+ is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_32 : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_32)
+
+#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
+#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
+#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
+#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
+#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
+#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
+#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UVH_LB_BAU_MISC_CONTROL_FUN_SHFT 48
+#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
+#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
+#define UVH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
+
+#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
+#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
+#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
+#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
+#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
+#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
+#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
+#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
+#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
+#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
+#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
+#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UV1H_LB_BAU_MISC_CONTROL_FUN_SHFT 48
+#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
+#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
+#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
+#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
+#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
+#define UV1H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
+
+#define UVXH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
+#define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
+#define UVXH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
+#define UVXH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
+#define UVXH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
+#define UVXH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
+#define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
+#define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
+#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
+#define UVXH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
+#define UVXH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29
+#define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30
+#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33
+#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34
+#define UVXH_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35
+#define UVXH_LB_BAU_MISC_CONTROL_FUN_SHFT 48
+#define UVXH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
+#define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
+#define UVXH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
+#define UVXH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UVXH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UVXH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UVXH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL
+#define UVXH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
+
+#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
+#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
+#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
+#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
+#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
+#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
+#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
+#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
+#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
+#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
+#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29
+#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34
+#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35
+#define UV2H_LB_BAU_MISC_CONTROL_FUN_SHFT 48
+#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
+#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
+#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
+#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
+#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
+
+#define UV3H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
+#define UV3H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
+#define UV3H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
+#define UV3H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
+#define UV3H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
+#define UV3H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
+#define UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
+#define UV3H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
+#define UV3H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
+#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
+#define UV3H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
+#define UV3H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29
+#define UV3H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30
+#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33
+#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34
+#define UV3H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35
+#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_QUIESCE_MSGS_TO_QPI_SHFT 36
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_PREFETCH_HINT_SHFT 37
+#define UV3H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_SHFT 38
+#define UV3H_LB_BAU_MISC_CONTROL_FUN_SHFT 48
+#define UV3H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
+#define UV3H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
+#define UV3H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
+#define UV3H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UV3H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UV3H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
+#define UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UV3H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UV3H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UV3H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_QUIESCE_MSGS_TO_QPI_MASK 0x0000001000000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_PREFETCH_HINT_MASK 0x0000002000000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_MASK 0x00003fc000000000UL
+#define UV3H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
+
+#define UV4H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
+#define UV4H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
+#define UV4H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
+#define UV4H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
+#define UV4H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
+#define UV4H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
+#define UV4H_LB_BAU_MISC_CONTROL_RESERVED_15_19_SHFT 15
+#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
+#define UV4H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
+#define UV4H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
+#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
+#define UV4H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
+#define UV4H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
+#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29
+#define UV4H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30
+#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31
+#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32
+#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33
+#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34
+#define UV4H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35
+#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_QUIESCE_MSGS_TO_QPI_SHFT 36
+#define UV4H_LB_BAU_MISC_CONTROL_RESERVED_37_SHFT 37
+#define UV4H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_SHFT 38
+#define UV4H_LB_BAU_MISC_CONTROL_ADDRESS_INTERLEAVE_SELECT_SHFT 46
+#define UV4H_LB_BAU_MISC_CONTROL_FUN_SHFT 48
+#define UV4H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
+#define UV4H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
+#define UV4H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
+#define UV4H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UV4H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UV4H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UV4H_LB_BAU_MISC_CONTROL_RESERVED_15_19_MASK 0x00000000000f8000UL
+#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UV4H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UV4H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UV4H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UV4H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
+#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
+#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL
+#define UV4H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL
+#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL
+#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL
+#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL
+#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL
+#define UV4H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL
+#define UV4H_LB_BAU_MISC_CONTROL_SUPPRESS_QUIESCE_MSGS_TO_QPI_MASK 0x0000001000000000UL
+#define UV4H_LB_BAU_MISC_CONTROL_RESERVED_37_MASK 0x0000002000000000UL
+#define UV4H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_MASK 0x00003fc000000000UL
+#define UV4H_LB_BAU_MISC_CONTROL_ADDRESS_INTERLEAVE_SELECT_MASK 0x0000400000000000UL
+#define UV4H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
+
+#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK \
+ uv_undefined("UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK")
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK ( \
+ is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK : \
+ is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK : \
+ is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK)
+#define UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT \
+ uv_undefined("UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT")
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT ( \
+ is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT : \
+ is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT : \
+ is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT)
+#define UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK \
+ uv_undefined("UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK")
+#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK ( \
+ is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK : \
+ is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK : \
+ is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK)
+#define UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT \
+ uv_undefined("UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT")
+#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT ( \
+ is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT : \
+ is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT : \
+ is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT)
+
+union uvh_lb_bau_misc_control_u {
+ unsigned long v;
+ struct uvh_lb_bau_misc_control_s {
+ unsigned long rejection_delay:8; /* RW */
+ unsigned long apic_mode:1; /* RW */
+ unsigned long force_broadcast:1; /* RW */
+ unsigned long force_lock_nop:1; /* RW */
+ unsigned long qpi_agent_presence_vector:3; /* RW */
+ unsigned long descriptor_fetch_mode:1; /* RW */
+ unsigned long rsvd_15_19:5;
+ unsigned long enable_dual_mapping_mode:1; /* RW */
+ unsigned long vga_io_port_decode_enable:1; /* RW */
+ unsigned long vga_io_port_16_bit_decode:1; /* RW */
+ unsigned long suppress_dest_registration:1; /* RW */
+ unsigned long programmed_initial_priority:3; /* RW */
+ unsigned long use_incoming_priority:1; /* RW */
+ unsigned long enable_programmed_initial_priority:1;/* RW */
+ unsigned long rsvd_29_47:19;
+ unsigned long fun:16; /* RW */
+ } s;
+ struct uv1h_lb_bau_misc_control_s {
+ unsigned long rejection_delay:8; /* RW */
+ unsigned long apic_mode:1; /* RW */
+ unsigned long force_broadcast:1; /* RW */
+ unsigned long force_lock_nop:1; /* RW */
+ unsigned long qpi_agent_presence_vector:3; /* RW */
+ unsigned long descriptor_fetch_mode:1; /* RW */
+ unsigned long enable_intd_soft_ack_mode:1; /* RW */
+ unsigned long intd_soft_ack_timeout_period:4; /* RW */
+ unsigned long enable_dual_mapping_mode:1; /* RW */
+ unsigned long vga_io_port_decode_enable:1; /* RW */
+ unsigned long vga_io_port_16_bit_decode:1; /* RW */
+ unsigned long suppress_dest_registration:1; /* RW */
+ unsigned long programmed_initial_priority:3; /* RW */
+ unsigned long use_incoming_priority:1; /* RW */
+ unsigned long enable_programmed_initial_priority:1;/* RW */
+ unsigned long rsvd_29_47:19;
+ unsigned long fun:16; /* RW */
+ } s1;
+ struct uvxh_lb_bau_misc_control_s {
+ unsigned long rejection_delay:8; /* RW */
+ unsigned long apic_mode:1; /* RW */
+ unsigned long force_broadcast:1; /* RW */
+ unsigned long force_lock_nop:1; /* RW */
+ unsigned long qpi_agent_presence_vector:3; /* RW */
+ unsigned long descriptor_fetch_mode:1; /* RW */
+ unsigned long rsvd_15_19:5;
+ unsigned long enable_dual_mapping_mode:1; /* RW */
+ unsigned long vga_io_port_decode_enable:1; /* RW */
+ unsigned long vga_io_port_16_bit_decode:1; /* RW */
+ unsigned long suppress_dest_registration:1; /* RW */
+ unsigned long programmed_initial_priority:3; /* RW */
+ unsigned long use_incoming_priority:1; /* RW */
+ unsigned long enable_programmed_initial_priority:1;/* RW */
+ unsigned long enable_automatic_apic_mode_selection:1;/* RW */
+ unsigned long apic_mode_status:1; /* RO */
+ unsigned long suppress_interrupts_to_self:1; /* RW */
+ unsigned long enable_lock_based_system_flush:1;/* RW */
+ unsigned long enable_extended_sb_status:1; /* RW */
+ unsigned long suppress_int_prio_udt_to_self:1;/* RW */
+ unsigned long use_legacy_descriptor_formats:1;/* RW */
+ unsigned long rsvd_36_47:12;
+ unsigned long fun:16; /* RW */
+ } sx;
+ struct uv2h_lb_bau_misc_control_s {
+ unsigned long rejection_delay:8; /* RW */
+ unsigned long apic_mode:1; /* RW */
+ unsigned long force_broadcast:1; /* RW */
+ unsigned long force_lock_nop:1; /* RW */
+ unsigned long qpi_agent_presence_vector:3; /* RW */
+ unsigned long descriptor_fetch_mode:1; /* RW */
+ unsigned long enable_intd_soft_ack_mode:1; /* RW */
+ unsigned long intd_soft_ack_timeout_period:4; /* RW */
+ unsigned long enable_dual_mapping_mode:1; /* RW */
+ unsigned long vga_io_port_decode_enable:1; /* RW */
+ unsigned long vga_io_port_16_bit_decode:1; /* RW */
+ unsigned long suppress_dest_registration:1; /* RW */
+ unsigned long programmed_initial_priority:3; /* RW */
+ unsigned long use_incoming_priority:1; /* RW */
+ unsigned long enable_programmed_initial_priority:1;/* RW */
+ unsigned long enable_automatic_apic_mode_selection:1;/* RW */
+ unsigned long apic_mode_status:1; /* RO */
+ unsigned long suppress_interrupts_to_self:1; /* RW */
+ unsigned long enable_lock_based_system_flush:1;/* RW */
+ unsigned long enable_extended_sb_status:1; /* RW */
+ unsigned long suppress_int_prio_udt_to_self:1;/* RW */
+ unsigned long use_legacy_descriptor_formats:1;/* RW */
+ unsigned long rsvd_36_47:12;
+ unsigned long fun:16; /* RW */
+ } s2;
+ struct uv3h_lb_bau_misc_control_s {
+ unsigned long rejection_delay:8; /* RW */
+ unsigned long apic_mode:1; /* RW */
+ unsigned long force_broadcast:1; /* RW */
+ unsigned long force_lock_nop:1; /* RW */
+ unsigned long qpi_agent_presence_vector:3; /* RW */
+ unsigned long descriptor_fetch_mode:1; /* RW */
+ unsigned long enable_intd_soft_ack_mode:1; /* RW */
+ unsigned long intd_soft_ack_timeout_period:4; /* RW */
+ unsigned long enable_dual_mapping_mode:1; /* RW */
+ unsigned long vga_io_port_decode_enable:1; /* RW */
+ unsigned long vga_io_port_16_bit_decode:1; /* RW */
+ unsigned long suppress_dest_registration:1; /* RW */
+ unsigned long programmed_initial_priority:3; /* RW */
+ unsigned long use_incoming_priority:1; /* RW */
+ unsigned long enable_programmed_initial_priority:1;/* RW */
+ unsigned long enable_automatic_apic_mode_selection:1;/* RW */
+ unsigned long apic_mode_status:1; /* RO */
+ unsigned long suppress_interrupts_to_self:1; /* RW */
+ unsigned long enable_lock_based_system_flush:1;/* RW */
+ unsigned long enable_extended_sb_status:1; /* RW */
+ unsigned long suppress_int_prio_udt_to_self:1;/* RW */
+ unsigned long use_legacy_descriptor_formats:1;/* RW */
+ unsigned long suppress_quiesce_msgs_to_qpi:1; /* RW */
+ unsigned long enable_intd_prefetch_hint:1; /* RW */
+ unsigned long thread_kill_timebase:8; /* RW */
+ unsigned long rsvd_46_47:2;
+ unsigned long fun:16; /* RW */
+ } s3;
+ struct uv4h_lb_bau_misc_control_s {
+ unsigned long rejection_delay:8; /* RW */
+ unsigned long apic_mode:1; /* RW */
+ unsigned long force_broadcast:1; /* RW */
+ unsigned long force_lock_nop:1; /* RW */
+ unsigned long qpi_agent_presence_vector:3; /* RW */
+ unsigned long descriptor_fetch_mode:1; /* RW */
+ unsigned long rsvd_15_19:5;
+ unsigned long enable_dual_mapping_mode:1; /* RW */
+ unsigned long vga_io_port_decode_enable:1; /* RW */
+ unsigned long vga_io_port_16_bit_decode:1; /* RW */
+ unsigned long suppress_dest_registration:1; /* RW */
+ unsigned long programmed_initial_priority:3; /* RW */
+ unsigned long use_incoming_priority:1; /* RW */
+ unsigned long enable_programmed_initial_priority:1;/* RW */
+ unsigned long enable_automatic_apic_mode_selection:1;/* RW */
+ unsigned long apic_mode_status:1; /* RO */
+ unsigned long suppress_interrupts_to_self:1; /* RW */
+ unsigned long enable_lock_based_system_flush:1;/* RW */
+ unsigned long enable_extended_sb_status:1; /* RW */
+ unsigned long suppress_int_prio_udt_to_self:1;/* RW */
+ unsigned long use_legacy_descriptor_formats:1;/* RW */
+ unsigned long suppress_quiesce_msgs_to_qpi:1; /* RW */
+ unsigned long rsvd_37:1;
+ unsigned long thread_kill_timebase:8; /* RW */
+ unsigned long address_interleave_select:1; /* RW */
+ unsigned long rsvd_47:1;
+ unsigned long fun:16; /* RW */
+ } s4;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_SB_ACTIVATION_CONTROL */
+/* ========================================================================= */
+#define UV1H_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL
+#define UV2H_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL
+#define UV3H_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL
+#define UV4H_LB_BAU_SB_ACTIVATION_CONTROL 0xc8020UL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL ( \
+ is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_CONTROL : \
+ is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_CONTROL : \
+ is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_CONTROL : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_CONTROL)
+
+#define UV1H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8
+#define UV2H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8
+#define UV3H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8
+#define UV4H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9c8
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 ( \
+ is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_CONTROL_32 : \
+ is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_CONTROL_32 : \
+ is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_CONTROL_32 : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_CONTROL_32)
+
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_SHFT 63
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL
+
+
+union uvh_lb_bau_sb_activation_control_u {
+ unsigned long v;
+ struct uvh_lb_bau_sb_activation_control_s {
+ unsigned long index:6; /* RW */
+ unsigned long rsvd_6_61:56;
+ unsigned long push:1; /* WP */
+ unsigned long init:1; /* WP */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */
+/* ========================================================================= */
+#define UV1H_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL
+#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL
+#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_0 0xc8030UL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 ( \
+ is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_STATUS_0 : \
+ is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_0 : \
+ is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_0 : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_0)
+
+#define UV1H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0
+#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0
+#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9d0
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 ( \
+ is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_STATUS_0_32 : \
+ is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_0_32 : \
+ is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_0_32 : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_0_32)
+
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL
+
+
+union uvh_lb_bau_sb_activation_status_0_u {
+ unsigned long v;
+ struct uvh_lb_bau_sb_activation_status_0_s {
+ unsigned long status:64; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */
+/* ========================================================================= */
+#define UV1H_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL
+#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL
+#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_1 0xc8040UL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 ( \
+ is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_STATUS_1 : \
+ is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_1 : \
+ is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_1 : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_1)
+
+#define UV1H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8
+#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8
+#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9d8
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 ( \
+ is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_STATUS_1_32 : \
+ is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_1_32 : \
+ is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_1_32 : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_1_32)
+
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL
+
+
+union uvh_lb_bau_sb_activation_status_1_u {
+ unsigned long v;
+ struct uvh_lb_bau_sb_activation_status_1_s {
+ unsigned long status:64; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_LB_BAU_SB_DESCRIPTOR_BASE */
+/* ========================================================================= */
+#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL
+#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL
+#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL
+#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE 0xc8010UL
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE ( \
+ is_uv1_hub() ? UV1H_LB_BAU_SB_DESCRIPTOR_BASE : \
+ is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_BASE : \
+ is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_BASE : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_BASE)
+
+#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0
+#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0
+#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0
+#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9c0
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 ( \
+ is_uv1_hub() ? UV1H_LB_BAU_SB_DESCRIPTOR_BASE_32 : \
+ is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_BASE_32 : \
+ is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_BASE_32 : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_BASE_32)
+
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12
+
+#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49
+#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL
+#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL
+
+#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49
+#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL
+#define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL
+
+#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49
+#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL
+#define UV3H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL
+
+#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49
+#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x00003ffffffff000UL
+#define UV4H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL
+
+#define UV4AH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 53
+#define UV4AH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000ffffffffff000UL
+#define UV4AH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0xffe0000000000000UL
+
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT ( \
+ is_uv1_hub() ? UV1H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT : \
+ is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT : \
+ is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT : \
+ is_uv4a_hub() ? UV4AH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT)
+
+#define UVH_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK ( \
+ is_uv1_hub() ? UV1H_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK : \
+ is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK : \
+ is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK : \
+ is_uv4a_hub() ? UV4AH_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK)
+
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK ( \
+ is_uv1_hub() ? UV1H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK : \
+ is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK : \
+ is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK : \
+ is_uv4a_hub() ? UV4AH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK)
+
+/* ========================================================================= */
+/* UVH_NODE_ID */
+/* ========================================================================= */
+#define UVH_NODE_ID 0x0UL
+#define UV1H_NODE_ID 0x0UL
+#define UV2H_NODE_ID 0x0UL
+#define UV3H_NODE_ID 0x0UL
+#define UV4H_NODE_ID 0x0UL
+
+#define UVH_NODE_ID_FORCE1_SHFT 0
+#define UVH_NODE_ID_MANUFACTURER_SHFT 1
+#define UVH_NODE_ID_PART_NUMBER_SHFT 12
+#define UVH_NODE_ID_REVISION_SHFT 28
+#define UVH_NODE_ID_NODE_ID_SHFT 32
+#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL
+#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
+#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
+#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL
+#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
+
+#define UV1H_NODE_ID_FORCE1_SHFT 0
+#define UV1H_NODE_ID_MANUFACTURER_SHFT 1
+#define UV1H_NODE_ID_PART_NUMBER_SHFT 12
+#define UV1H_NODE_ID_REVISION_SHFT 28
+#define UV1H_NODE_ID_NODE_ID_SHFT 32
+#define UV1H_NODE_ID_NODES_PER_BIT_SHFT 48
+#define UV1H_NODE_ID_NI_PORT_SHFT 56
+#define UV1H_NODE_ID_FORCE1_MASK 0x0000000000000001UL
+#define UV1H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
+#define UV1H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
+#define UV1H_NODE_ID_REVISION_MASK 0x00000000f0000000UL
+#define UV1H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
+#define UV1H_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL
+#define UV1H_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL
+
+#define UVXH_NODE_ID_FORCE1_SHFT 0
+#define UVXH_NODE_ID_MANUFACTURER_SHFT 1
+#define UVXH_NODE_ID_PART_NUMBER_SHFT 12
+#define UVXH_NODE_ID_REVISION_SHFT 28
+#define UVXH_NODE_ID_NODE_ID_SHFT 32
+#define UVXH_NODE_ID_NODES_PER_BIT_SHFT 50
+#define UVXH_NODE_ID_NI_PORT_SHFT 57
+#define UVXH_NODE_ID_FORCE1_MASK 0x0000000000000001UL
+#define UVXH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
+#define UVXH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
+#define UVXH_NODE_ID_REVISION_MASK 0x00000000f0000000UL
+#define UVXH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
+#define UVXH_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL
+#define UVXH_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL
+
+#define UV2H_NODE_ID_FORCE1_SHFT 0
+#define UV2H_NODE_ID_MANUFACTURER_SHFT 1
+#define UV2H_NODE_ID_PART_NUMBER_SHFT 12
+#define UV2H_NODE_ID_REVISION_SHFT 28
+#define UV2H_NODE_ID_NODE_ID_SHFT 32
+#define UV2H_NODE_ID_NODES_PER_BIT_SHFT 50
+#define UV2H_NODE_ID_NI_PORT_SHFT 57
+#define UV2H_NODE_ID_FORCE1_MASK 0x0000000000000001UL
+#define UV2H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
+#define UV2H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
+#define UV2H_NODE_ID_REVISION_MASK 0x00000000f0000000UL
+#define UV2H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
+#define UV2H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL
+#define UV2H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL
+
+#define UV3H_NODE_ID_FORCE1_SHFT 0
+#define UV3H_NODE_ID_MANUFACTURER_SHFT 1
+#define UV3H_NODE_ID_PART_NUMBER_SHFT 12
+#define UV3H_NODE_ID_REVISION_SHFT 28
+#define UV3H_NODE_ID_NODE_ID_SHFT 32
+#define UV3H_NODE_ID_ROUTER_SELECT_SHFT 48
+#define UV3H_NODE_ID_RESERVED_2_SHFT 49
+#define UV3H_NODE_ID_NODES_PER_BIT_SHFT 50
+#define UV3H_NODE_ID_NI_PORT_SHFT 57
+#define UV3H_NODE_ID_FORCE1_MASK 0x0000000000000001UL
+#define UV3H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
+#define UV3H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
+#define UV3H_NODE_ID_REVISION_MASK 0x00000000f0000000UL
+#define UV3H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
+#define UV3H_NODE_ID_ROUTER_SELECT_MASK 0x0001000000000000UL
+#define UV3H_NODE_ID_RESERVED_2_MASK 0x0002000000000000UL
+#define UV3H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL
+#define UV3H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL
+
+#define UV4H_NODE_ID_FORCE1_SHFT 0
+#define UV4H_NODE_ID_MANUFACTURER_SHFT 1
+#define UV4H_NODE_ID_PART_NUMBER_SHFT 12
+#define UV4H_NODE_ID_REVISION_SHFT 28
+#define UV4H_NODE_ID_NODE_ID_SHFT 32
+#define UV4H_NODE_ID_ROUTER_SELECT_SHFT 48
+#define UV4H_NODE_ID_RESERVED_2_SHFT 49
+#define UV4H_NODE_ID_NODES_PER_BIT_SHFT 50
+#define UV4H_NODE_ID_NI_PORT_SHFT 57
+#define UV4H_NODE_ID_FORCE1_MASK 0x0000000000000001UL
+#define UV4H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
+#define UV4H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
+#define UV4H_NODE_ID_REVISION_MASK 0x00000000f0000000UL
+#define UV4H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
+#define UV4H_NODE_ID_ROUTER_SELECT_MASK 0x0001000000000000UL
+#define UV4H_NODE_ID_RESERVED_2_MASK 0x0002000000000000UL
+#define UV4H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL
+#define UV4H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL
+
+
+union uvh_node_id_u {
+ unsigned long v;
+ struct uvh_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long node_id:15; /* RW */
+ unsigned long rsvd_47_63:17;
+ } s;
+ struct uv1h_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long node_id:15; /* RW */
+ unsigned long rsvd_47:1;
+ unsigned long nodes_per_bit:7; /* RW */
+ unsigned long rsvd_55:1;
+ unsigned long ni_port:4; /* RO */
+ unsigned long rsvd_60_63:4;
+ } s1;
+ struct uvxh_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long node_id:15; /* RW */
+ unsigned long rsvd_47_49:3;
+ unsigned long nodes_per_bit:7; /* RO */
+ unsigned long ni_port:5; /* RO */
+ unsigned long rsvd_62_63:2;
+ } sx;
+ struct uv2h_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long node_id:15; /* RW */
+ unsigned long rsvd_47_49:3;
+ unsigned long nodes_per_bit:7; /* RO */
+ unsigned long ni_port:5; /* RO */
+ unsigned long rsvd_62_63:2;
+ } s2;
+ struct uv3h_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long node_id:15; /* RW */
+ unsigned long rsvd_47:1;
+ unsigned long router_select:1; /* RO */
+ unsigned long rsvd_49:1;
+ unsigned long nodes_per_bit:7; /* RO */
+ unsigned long ni_port:5; /* RO */
+ unsigned long rsvd_62_63:2;
+ } s3;
+ struct uv4h_node_id_s {
+ unsigned long force1:1; /* RO */
+ unsigned long manufacturer:11; /* RO */
+ unsigned long part_number:16; /* RO */
+ unsigned long revision:4; /* RO */
+ unsigned long node_id:15; /* RW */
+ unsigned long rsvd_47:1;
+ unsigned long router_select:1; /* RO */
+ unsigned long rsvd_49:1;
+ unsigned long nodes_per_bit:7; /* RO */
+ unsigned long ni_port:5; /* RO */
+ unsigned long rsvd_62_63:2;
+ } s4;
+};
+
+/* ========================================================================= */
+/* UVH_NODE_PRESENT_TABLE */
+/* ========================================================================= */
+#define UVH_NODE_PRESENT_TABLE 0x1400UL
+
+#define UV1H_NODE_PRESENT_TABLE_DEPTH 16
+#define UV2H_NODE_PRESENT_TABLE_DEPTH 16
+#define UV3H_NODE_PRESENT_TABLE_DEPTH 16
+#define UV4H_NODE_PRESENT_TABLE_DEPTH 4
+#define UVH_NODE_PRESENT_TABLE_DEPTH ( \
+ is_uv1_hub() ? UV1H_NODE_PRESENT_TABLE_DEPTH : \
+ is_uv2_hub() ? UV2H_NODE_PRESENT_TABLE_DEPTH : \
+ is_uv3_hub() ? UV3H_NODE_PRESENT_TABLE_DEPTH : \
+ /*is_uv4_hub*/ UV4H_NODE_PRESENT_TABLE_DEPTH)
+
+#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0
+#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL
+
+
+union uvh_node_present_table_u {
+ unsigned long v;
+ struct uvh_node_present_table_s {
+ unsigned long nodes:64; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR */
+/* ========================================================================= */
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x4800c8UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR ( \
+ is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR : \
+ is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR : \
+ is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR)
+
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24
+#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48
+#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63
+#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL
+
+
+union uvh_rh_gam_alias210_overlay_config_0_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_alias210_overlay_config_0_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s;
+ struct uv1h_rh_gam_alias210_overlay_config_0_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s1;
+ struct uvxh_rh_gam_alias210_overlay_config_0_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } sx;
+ struct uv2h_rh_gam_alias210_overlay_config_0_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s2;
+ struct uv3h_rh_gam_alias210_overlay_config_0_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s3;
+ struct uv4h_rh_gam_alias210_overlay_config_0_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s4;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR */
+/* ========================================================================= */
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x4800d8UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR ( \
+ is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR : \
+ is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR : \
+ is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR)
+
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24
+#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48
+#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63
+#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL
+
+
+union uvh_rh_gam_alias210_overlay_config_1_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_alias210_overlay_config_1_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s;
+ struct uv1h_rh_gam_alias210_overlay_config_1_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s1;
+ struct uvxh_rh_gam_alias210_overlay_config_1_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } sx;
+ struct uv2h_rh_gam_alias210_overlay_config_1_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s2;
+ struct uv3h_rh_gam_alias210_overlay_config_1_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s3;
+ struct uv4h_rh_gam_alias210_overlay_config_1_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s4;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR */
+/* ========================================================================= */
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x4800e8UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR ( \
+ is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR : \
+ is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR : \
+ is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR)
+
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24
+#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48
+#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63
+#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL
+
+
+union uvh_rh_gam_alias210_overlay_config_2_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_alias210_overlay_config_2_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s;
+ struct uv1h_rh_gam_alias210_overlay_config_2_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s1;
+ struct uvxh_rh_gam_alias210_overlay_config_2_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } sx;
+ struct uv2h_rh_gam_alias210_overlay_config_2_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s2;
+ struct uv3h_rh_gam_alias210_overlay_config_2_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s3;
+ struct uv4h_rh_gam_alias210_overlay_config_2_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long base:8; /* RW */
+ unsigned long rsvd_32_47:16;
+ unsigned long m_alias:5; /* RW */
+ unsigned long rsvd_53_62:10;
+ unsigned long enable:1; /* RW */
+ } s4;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */
+/* ========================================================================= */
+#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
+#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
+#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
+#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x4800d0UL
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR ( \
+ is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR : \
+ is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR : \
+ is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR)
+
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24
+#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+#define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24
+#define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24
+#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24
+#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24
+#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+
+union uvh_rh_gam_alias210_redirect_config_0_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_alias210_redirect_config_0_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s;
+ struct uv1h_rh_gam_alias210_redirect_config_0_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s1;
+ struct uvxh_rh_gam_alias210_redirect_config_0_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } sx;
+ struct uv2h_rh_gam_alias210_redirect_config_0_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s2;
+ struct uv3h_rh_gam_alias210_redirect_config_0_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s3;
+ struct uv4h_rh_gam_alias210_redirect_config_0_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s4;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */
+/* ========================================================================= */
+#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL
+#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL
+#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL
+#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x4800e0UL
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR ( \
+ is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR : \
+ is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR : \
+ is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR)
+
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24
+#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+#define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24
+#define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24
+#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24
+#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24
+#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+
+union uvh_rh_gam_alias210_redirect_config_1_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_alias210_redirect_config_1_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s;
+ struct uv1h_rh_gam_alias210_redirect_config_1_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s1;
+ struct uvxh_rh_gam_alias210_redirect_config_1_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } sx;
+ struct uv2h_rh_gam_alias210_redirect_config_1_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s2;
+ struct uv3h_rh_gam_alias210_redirect_config_1_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s3;
+ struct uv4h_rh_gam_alias210_redirect_config_1_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s4;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */
+/* ========================================================================= */
+#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL
+#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL
+#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL
+#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x4800f0UL
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR ( \
+ is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR : \
+ is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR : \
+ is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR)
+
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24
+#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+#define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24
+#define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24
+#define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24
+#define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24
+#define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL
+
+
+union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_alias210_redirect_config_2_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s;
+ struct uv1h_rh_gam_alias210_redirect_config_2_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s1;
+ struct uvxh_rh_gam_alias210_redirect_config_2_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } sx;
+ struct uv2h_rh_gam_alias210_redirect_config_2_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s2;
+ struct uv3h_rh_gam_alias210_redirect_config_2_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s3;
+ struct uv4h_rh_gam_alias210_redirect_config_2_mmr_s {
+ unsigned long rsvd_0_23:24;
+ unsigned long dest_base:22; /* RW */
+ unsigned long rsvd_46_63:18;
+ } s4;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_CONFIG_MMR */
+/* ========================================================================= */
+#define UV1H_RH_GAM_CONFIG_MMR 0x1600000UL
+#define UV2H_RH_GAM_CONFIG_MMR 0x1600000UL
+#define UV3H_RH_GAM_CONFIG_MMR 0x1600000UL
+#define UV4H_RH_GAM_CONFIG_MMR 0x480000UL
+#define UVH_RH_GAM_CONFIG_MMR ( \
+ is_uv1_hub() ? UV1H_RH_GAM_CONFIG_MMR : \
+ is_uv2_hub() ? UV2H_RH_GAM_CONFIG_MMR : \
+ is_uv3_hub() ? UV3H_RH_GAM_CONFIG_MMR : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_CONFIG_MMR)
+
+#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
+#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
+
+#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
+#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
+#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12
+#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
+#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
+#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL
+
+#define UVXH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
+#define UVXH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
+
+#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
+#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
+#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
+#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
+
+#define UV3H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
+#define UV3H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
+#define UV3H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
+#define UV3H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
+
+#define UV4H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
+#define UV4H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
+
+
+union uvh_rh_gam_config_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_config_mmr_s {
+ unsigned long rsvd_0_5:6;
+ unsigned long n_skt:4; /* RW */
+ unsigned long rsvd_10_63:54;
+ } s;
+ struct uv1h_rh_gam_config_mmr_s {
+ unsigned long m_skt:6; /* RW */
+ unsigned long n_skt:4; /* RW */
+ unsigned long rsvd_10_11:2;
+ unsigned long mmiol_cfg:1; /* RW */
+ unsigned long rsvd_13_63:51;
+ } s1;
+ struct uvxh_rh_gam_config_mmr_s {
+ unsigned long rsvd_0_5:6;
+ unsigned long n_skt:4; /* RW */
+ unsigned long rsvd_10_63:54;
+ } sx;
+ struct uv2h_rh_gam_config_mmr_s {
+ unsigned long m_skt:6; /* RW */
+ unsigned long n_skt:4; /* RW */
+ unsigned long rsvd_10_63:54;
+ } s2;
+ struct uv3h_rh_gam_config_mmr_s {
+ unsigned long m_skt:6; /* RW */
+ unsigned long n_skt:4; /* RW */
+ unsigned long rsvd_10_63:54;
+ } s3;
+ struct uv4h_rh_gam_config_mmr_s {
+ unsigned long rsvd_0_5:6;
+ unsigned long n_skt:4; /* RW */
+ unsigned long rsvd_10_63:54;
+ } s4;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */
+/* ========================================================================= */
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
+#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x480010UL
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR ( \
+ is_uv1_hub() ? UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR : \
+ is_uv2_hub() ? UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR : \
+ is_uv3_hub() ? UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR)
+
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
+#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
+#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_MODE_SHFT 62
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_MODE_MASK 0x4000000000000000UL
+#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
+#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
+#define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK ( \
+ is_uv1_hub() ? UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK : \
+ is_uv2_hub() ? UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK : \
+ is_uv3_hub() ? UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK)
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT ( \
+ is_uv1_hub() ? UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT : \
+ is_uv2_hub() ? UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT : \
+ is_uv3_hub() ? UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT)
+
+union uvh_rh_gam_gru_overlay_config_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_gru_overlay_config_mmr_s {
+ unsigned long rsvd_0_51:52;
+ unsigned long n_gru:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s;
+ struct uv1h_rh_gam_gru_overlay_config_mmr_s {
+ unsigned long rsvd_0_27:28;
+ unsigned long base:18; /* RW */
+ unsigned long rsvd_46_47:2;
+ unsigned long gr4:1; /* RW */
+ unsigned long rsvd_49_51:3;
+ unsigned long n_gru:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s1;
+ struct uvxh_rh_gam_gru_overlay_config_mmr_s {
+ unsigned long rsvd_0_45:46;
+ unsigned long rsvd_46_51:6;
+ unsigned long n_gru:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } sx;
+ struct uv2h_rh_gam_gru_overlay_config_mmr_s {
+ unsigned long rsvd_0_27:28;
+ unsigned long base:18; /* RW */
+ unsigned long rsvd_46_51:6;
+ unsigned long n_gru:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s2;
+ struct uv3h_rh_gam_gru_overlay_config_mmr_s {
+ unsigned long rsvd_0_27:28;
+ unsigned long base:18; /* RW */
+ unsigned long rsvd_46_51:6;
+ unsigned long n_gru:4; /* RW */
+ unsigned long rsvd_56_61:6;
+ unsigned long mode:1; /* RW */
+ unsigned long enable:1; /* RW */
+ } s3;
+ struct uv4h_rh_gam_gru_overlay_config_mmr_s {
+ unsigned long rsvd_0_24:25;
+ unsigned long undef_25:1; /* Undefined */
+ unsigned long base:20; /* RW */
+ unsigned long rsvd_46_51:6;
+ unsigned long n_gru:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s4;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR */
+/* ========================================================================= */
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR uv_undefined("UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR")
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR uv_undefined("UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR")
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR 0x1603000UL
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR 0x483000UL
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR ( \
+ is_uv1_hub() ? UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR : \
+ is_uv2_hub() ? UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR : \
+ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR)
+
+
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT 26
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT 46
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_SHFT 63
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK 0x000fc00000000000UL
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT 26
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT 46
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_SHFT 63
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK 0x000fc00000000000UL
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT 52
+#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK 0x000ffffffc000000UL
+#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK 0x03f0000000000000UL
+#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT ( \
+ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT : \
+ is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT)
+
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK ( \
+ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK : \
+ is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK)
+
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK ( \
+ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK : \
+ is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK)
+
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK ( \
+ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK : \
+ is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK)
+
+union uvh_rh_gam_mmioh_overlay_config0_mmr_u {
+ unsigned long v;
+ struct uv3h_rh_gam_mmioh_overlay_config0_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s3;
+ struct uv4h_rh_gam_mmioh_overlay_config0_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s4;
+ struct uv4ah_rh_gam_mmioh_overlay_config0_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:26; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long undef_62:1; /* Undefined */
+ unsigned long enable:1; /* RW */
+ } s4a;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR */
+/* ========================================================================= */
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR uv_undefined("UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR")
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR uv_undefined("UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR")
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR 0x1603000UL
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR 0x484000UL
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR ( \
+ is_uv1_hub() ? UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR : \
+ is_uv2_hub() ? UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR : \
+ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR)
+
+
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_SHFT 26
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT 46
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_SHFT 63
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK 0x000fc00000000000UL
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_SHFT 26
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT 46
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_SHFT 63
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK 0x000fc00000000000UL
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT 52
+#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK 0x000ffffffc000000UL
+#define UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK 0x03f0000000000000UL
+
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT ( \
+ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT : \
+ is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT)
+
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK ( \
+ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK : \
+ is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK)
+
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK ( \
+ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK : \
+ is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK)
+
+union uvh_rh_gam_mmioh_overlay_config1_mmr_u {
+ unsigned long v;
+ struct uv3h_rh_gam_mmioh_overlay_config1_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s3;
+ struct uv4h_rh_gam_mmioh_overlay_config1_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s4;
+ struct uv4ah_rh_gam_mmioh_overlay_config1_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:26; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4;
+ unsigned long undef_62:1; /* Undefined */
+ unsigned long enable:1; /* RW */
+ } s4a;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */
+/* ========================================================================= */
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL
+#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR uv_undefined("UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR")
+#define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR uv_undefined("UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR")
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR ( \
+ is_uv1_hub() ? UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR : \
+ is_uv2_hub() ? UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR : \
+ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR)
+
+
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 27
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff8000000UL
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+
+union uvh_rh_gam_mmioh_overlay_config_mmr_u {
+ unsigned long v;
+ struct uv1h_rh_gam_mmioh_overlay_config_mmr_s {
+ unsigned long rsvd_0_29:30;
+ unsigned long base:16; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s1;
+ struct uv2h_rh_gam_mmioh_overlay_config_mmr_s {
+ unsigned long rsvd_0_26:27;
+ unsigned long base:19; /* RW */
+ unsigned long m_io:6; /* RW */
+ unsigned long n_io:4; /* RW */
+ unsigned long rsvd_56_62:7;
+ unsigned long enable:1; /* RW */
+ } s2;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR */
+/* ========================================================================= */
+#define UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR uv_undefined("UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR")
+#define UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR uv_undefined("UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR")
+#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR 0x1603800UL
+#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR 0x483800UL
+#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR ( \
+ is_uv1_hub() ? UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR : \
+ is_uv2_hub() ? UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR : \
+ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR)
+
+#define UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH uv_undefined("UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH")
+#define UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH uv_undefined("UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH")
+#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH 128
+#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH 128
+#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH ( \
+ is_uv1_hub() ? UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH : \
+ is_uv2_hub() ? UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH : \
+ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH)
+
+
+#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_SHFT 0
+#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK 0x0000000000007fffUL
+
+#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_SHFT 0
+#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK 0x0000000000007fffUL
+
+#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK 0x0000000000000fffUL
+
+#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK ( \
+ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK : \
+ is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK)
+
+union uvh_rh_gam_mmioh_redirect_config0_mmr_u {
+ unsigned long v;
+ struct uv3h_rh_gam_mmioh_redirect_config0_mmr_s {
+ unsigned long nasid:15; /* RW */
+ unsigned long rsvd_15_63:49;
+ } s3;
+ struct uv4h_rh_gam_mmioh_redirect_config0_mmr_s {
+ unsigned long nasid:15; /* RW */
+ unsigned long rsvd_15_63:49;
+ } s4;
+ struct uv4ah_rh_gam_mmioh_redirect_config0_mmr_s {
+ unsigned long nasid:12; /* RW */
+ unsigned long rsvd_12_63:52;
+ } s4a;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR */
+/* ========================================================================= */
+#define UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR uv_undefined("UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR")
+#define UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR uv_undefined("UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR")
+#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR 0x1604800UL
+#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR 0x484800UL
+#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR ( \
+ is_uv1_hub() ? UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR : \
+ is_uv2_hub() ? UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR : \
+ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR)
+
+#define UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH uv_undefined("UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH")
+#define UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH uv_undefined("UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH")
+#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH 128
+#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH 128
+#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH ( \
+ is_uv1_hub() ? UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH : \
+ is_uv2_hub() ? UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH : \
+ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH)
+
+
+#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_SHFT 0
+#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK 0x0000000000007fffUL
+
+#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_SHFT 0
+#define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK 0x0000000000007fffUL
+
+#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK 0x0000000000000fffUL
+
+#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK ( \
+ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK : \
+ is_uv4a_hub() ? UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK)
+
+union uvh_rh_gam_mmioh_redirect_config1_mmr_u {
+ unsigned long v;
+ struct uv3h_rh_gam_mmioh_redirect_config1_mmr_s {
+ unsigned long nasid:15; /* RW */
+ unsigned long rsvd_15_63:49;
+ } s3;
+ struct uv4h_rh_gam_mmioh_redirect_config1_mmr_s {
+ unsigned long nasid:15; /* RW */
+ unsigned long rsvd_15_63:49;
+ } s4;
+ struct uv4ah_rh_gam_mmioh_redirect_config1_mmr_s {
+ unsigned long nasid:12; /* RW */
+ unsigned long rsvd_12_63:52;
+ } s4a;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */
+/* ========================================================================= */
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
+#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
+#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x480028UL
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR ( \
+ is_uv1_hub() ? UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR : \
+ is_uv2_hub() ? UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR : \
+ is_uv3_hub() ? UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR : \
+ /*is_uv4_hub*/ UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR)
+
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
+#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+
+
+union uvh_rh_gam_mmr_overlay_config_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_mmr_overlay_config_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long rsvd_46_62:17;
+ unsigned long enable:1; /* RW */
+ } s;
+ struct uv1h_rh_gam_mmr_overlay_config_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long dual_hub:1; /* RW */
+ unsigned long rsvd_47_62:16;
+ unsigned long enable:1; /* RW */
+ } s1;
+ struct uvxh_rh_gam_mmr_overlay_config_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long rsvd_46_62:17;
+ unsigned long enable:1; /* RW */
+ } sx;
+ struct uv2h_rh_gam_mmr_overlay_config_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long rsvd_46_62:17;
+ unsigned long enable:1; /* RW */
+ } s2;
+ struct uv3h_rh_gam_mmr_overlay_config_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long rsvd_46_62:17;
+ unsigned long enable:1; /* RW */
+ } s3;
+ struct uv4h_rh_gam_mmr_overlay_config_mmr_s {
+ unsigned long rsvd_0_25:26;
+ unsigned long base:20; /* RW */
+ unsigned long rsvd_46_62:17;
+ unsigned long enable:1; /* RW */
+ } s4;
+};
+
+/* ========================================================================= */
+/* UVH_RTC */
+/* ========================================================================= */
+#define UV1H_RTC 0x340000UL
+#define UV2H_RTC 0x340000UL
+#define UV3H_RTC 0x340000UL
+#define UV4H_RTC 0xe0000UL
+#define UVH_RTC ( \
+ is_uv1_hub() ? UV1H_RTC : \
+ is_uv2_hub() ? UV2H_RTC : \
+ is_uv3_hub() ? UV3H_RTC : \
+ /*is_uv4_hub*/ UV4H_RTC)
+
+#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0
+#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL
+
+
+union uvh_rtc_u {
+ unsigned long v;
+ struct uvh_rtc_s {
+ unsigned long real_time_clock:56; /* RW */
+ unsigned long rsvd_56_63:8;
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RTC1_INT_CONFIG */
+/* ========================================================================= */
+#define UVH_RTC1_INT_CONFIG 0x615c0UL
+
+#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0
+#define UVH_RTC1_INT_CONFIG_DM_SHFT 8
+#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11
+#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12
+#define UVH_RTC1_INT_CONFIG_P_SHFT 13
+#define UVH_RTC1_INT_CONFIG_T_SHFT 15
+#define UVH_RTC1_INT_CONFIG_M_SHFT 16
+#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32
+#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
+#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL
+#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
+#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
+#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL
+#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL
+#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL
+#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+
+
+union uvh_rtc1_int_config_u {
+ unsigned long v;
+ struct uvh_rtc1_int_config_s {
+ unsigned long vector_:8; /* RW */
+ unsigned long dm:3; /* RW */
+ unsigned long destmode:1; /* RW */
+ unsigned long status:1; /* RO */
+ unsigned long p:1; /* RO */
+ unsigned long rsvd_14:1;
+ unsigned long t:1; /* RO */
+ unsigned long m:1; /* RW */
+ unsigned long rsvd_17_31:15;
+ unsigned long apic_id:32; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_SCRATCH5 */
+/* ========================================================================= */
+#define UV1H_SCRATCH5 0x2d0200UL
+#define UV2H_SCRATCH5 0x2d0200UL
+#define UV3H_SCRATCH5 0x2d0200UL
+#define UV4H_SCRATCH5 0xb0200UL
+#define UVH_SCRATCH5 ( \
+ is_uv1_hub() ? UV1H_SCRATCH5 : \
+ is_uv2_hub() ? UV2H_SCRATCH5 : \
+ is_uv3_hub() ? UV3H_SCRATCH5 : \
+ /*is_uv4_hub*/ UV4H_SCRATCH5)
+
+#define UV1H_SCRATCH5_32 0x778
+#define UV2H_SCRATCH5_32 0x778
+#define UV3H_SCRATCH5_32 0x778
+#define UV4H_SCRATCH5_32 0x798
+#define UVH_SCRATCH5_32 ( \
+ is_uv1_hub() ? UV1H_SCRATCH5_32 : \
+ is_uv2_hub() ? UV2H_SCRATCH5_32 : \
+ is_uv3_hub() ? UV3H_SCRATCH5_32 : \
+ /*is_uv4_hub*/ UV4H_SCRATCH5_32)
+
+#define UVH_SCRATCH5_SCRATCH5_SHFT 0
+#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
+
+
+union uvh_scratch5_u {
+ unsigned long v;
+ struct uvh_scratch5_s {
+ unsigned long scratch5:64; /* RW, W1CS */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_SCRATCH5_ALIAS */
+/* ========================================================================= */
+#define UV1H_SCRATCH5_ALIAS 0x2d0208UL
+#define UV2H_SCRATCH5_ALIAS 0x2d0208UL
+#define UV3H_SCRATCH5_ALIAS 0x2d0208UL
+#define UV4H_SCRATCH5_ALIAS 0xb0208UL
+#define UVH_SCRATCH5_ALIAS ( \
+ is_uv1_hub() ? UV1H_SCRATCH5_ALIAS : \
+ is_uv2_hub() ? UV2H_SCRATCH5_ALIAS : \
+ is_uv3_hub() ? UV3H_SCRATCH5_ALIAS : \
+ /*is_uv4_hub*/ UV4H_SCRATCH5_ALIAS)
+
+#define UV1H_SCRATCH5_ALIAS_32 0x780
+#define UV2H_SCRATCH5_ALIAS_32 0x780
+#define UV3H_SCRATCH5_ALIAS_32 0x780
+#define UV4H_SCRATCH5_ALIAS_32 0x7a0
+#define UVH_SCRATCH5_ALIAS_32 ( \
+ is_uv1_hub() ? UV1H_SCRATCH5_ALIAS_32 : \
+ is_uv2_hub() ? UV2H_SCRATCH5_ALIAS_32 : \
+ is_uv3_hub() ? UV3H_SCRATCH5_ALIAS_32 : \
+ /*is_uv4_hub*/ UV4H_SCRATCH5_ALIAS_32)
+
+
+/* ========================================================================= */
+/* UVH_SCRATCH5_ALIAS_2 */
+/* ========================================================================= */
+#define UV1H_SCRATCH5_ALIAS_2 0x2d0210UL
+#define UV2H_SCRATCH5_ALIAS_2 0x2d0210UL
+#define UV3H_SCRATCH5_ALIAS_2 0x2d0210UL
+#define UV4H_SCRATCH5_ALIAS_2 0xb0210UL
+#define UVH_SCRATCH5_ALIAS_2 ( \
+ is_uv1_hub() ? UV1H_SCRATCH5_ALIAS_2 : \
+ is_uv2_hub() ? UV2H_SCRATCH5_ALIAS_2 : \
+ is_uv3_hub() ? UV3H_SCRATCH5_ALIAS_2 : \
+ /*is_uv4_hub*/ UV4H_SCRATCH5_ALIAS_2)
+#define UVH_SCRATCH5_ALIAS_2_32 0x788
+
+
+/* ========================================================================= */
+/* UVXH_EVENT_OCCURRED2 */
+/* ========================================================================= */
+#define UVXH_EVENT_OCCURRED2 0x70100UL
+
+#define UV2H_EVENT_OCCURRED2_32 0xb68
+#define UV3H_EVENT_OCCURRED2_32 0xb68
+#define UV4H_EVENT_OCCURRED2_32 0x608
+#define UVH_EVENT_OCCURRED2_32 ( \
+ is_uv2_hub() ? UV2H_EVENT_OCCURRED2_32 : \
+ is_uv3_hub() ? UV3H_EVENT_OCCURRED2_32 : \
+ /*is_uv4_hub*/ UV4H_EVENT_OCCURRED2_32)
+
+
+#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT 0
+#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT 1
+#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT 2
+#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT 3
+#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT 4
+#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT 5
+#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT 6
+#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT 7
+#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT 8
+#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT 9
+#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT 10
+#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT 11
+#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT 12
+#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT 13
+#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT 14
+#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT 15
+#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT 16
+#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT 17
+#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT 18
+#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT 19
+#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT 20
+#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT 21
+#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT 22
+#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT 23
+#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT 24
+#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT 25
+#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT 26
+#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT 27
+#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT 28
+#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT 29
+#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT 30
+#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT 31
+#define UV2H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL
+#define UV2H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL
+#define UV2H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL
+#define UV2H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL
+#define UV2H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL
+#define UV2H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL
+#define UV2H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL
+#define UV2H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL
+#define UV2H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL
+#define UV2H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL
+#define UV2H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL
+#define UV2H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL
+#define UV2H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL
+#define UV2H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL
+#define UV2H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL
+#define UV2H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL
+#define UV2H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL
+#define UV2H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL
+#define UV2H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL
+#define UV2H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL
+#define UV2H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL
+#define UV2H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL
+#define UV2H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL
+#define UV2H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL
+#define UV2H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL
+
+#define UV3H_EVENT_OCCURRED2_RTC_0_SHFT 0
+#define UV3H_EVENT_OCCURRED2_RTC_1_SHFT 1
+#define UV3H_EVENT_OCCURRED2_RTC_2_SHFT 2
+#define UV3H_EVENT_OCCURRED2_RTC_3_SHFT 3
+#define UV3H_EVENT_OCCURRED2_RTC_4_SHFT 4
+#define UV3H_EVENT_OCCURRED2_RTC_5_SHFT 5
+#define UV3H_EVENT_OCCURRED2_RTC_6_SHFT 6
+#define UV3H_EVENT_OCCURRED2_RTC_7_SHFT 7
+#define UV3H_EVENT_OCCURRED2_RTC_8_SHFT 8
+#define UV3H_EVENT_OCCURRED2_RTC_9_SHFT 9
+#define UV3H_EVENT_OCCURRED2_RTC_10_SHFT 10
+#define UV3H_EVENT_OCCURRED2_RTC_11_SHFT 11
+#define UV3H_EVENT_OCCURRED2_RTC_12_SHFT 12
+#define UV3H_EVENT_OCCURRED2_RTC_13_SHFT 13
+#define UV3H_EVENT_OCCURRED2_RTC_14_SHFT 14
+#define UV3H_EVENT_OCCURRED2_RTC_15_SHFT 15
+#define UV3H_EVENT_OCCURRED2_RTC_16_SHFT 16
+#define UV3H_EVENT_OCCURRED2_RTC_17_SHFT 17
+#define UV3H_EVENT_OCCURRED2_RTC_18_SHFT 18
+#define UV3H_EVENT_OCCURRED2_RTC_19_SHFT 19
+#define UV3H_EVENT_OCCURRED2_RTC_20_SHFT 20
+#define UV3H_EVENT_OCCURRED2_RTC_21_SHFT 21
+#define UV3H_EVENT_OCCURRED2_RTC_22_SHFT 22
+#define UV3H_EVENT_OCCURRED2_RTC_23_SHFT 23
+#define UV3H_EVENT_OCCURRED2_RTC_24_SHFT 24
+#define UV3H_EVENT_OCCURRED2_RTC_25_SHFT 25
+#define UV3H_EVENT_OCCURRED2_RTC_26_SHFT 26
+#define UV3H_EVENT_OCCURRED2_RTC_27_SHFT 27
+#define UV3H_EVENT_OCCURRED2_RTC_28_SHFT 28
+#define UV3H_EVENT_OCCURRED2_RTC_29_SHFT 29
+#define UV3H_EVENT_OCCURRED2_RTC_30_SHFT 30
+#define UV3H_EVENT_OCCURRED2_RTC_31_SHFT 31
+#define UV3H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL
+#define UV3H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL
+#define UV3H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL
+#define UV3H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL
+#define UV3H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL
+#define UV3H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL
+#define UV3H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL
+#define UV3H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL
+#define UV3H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL
+#define UV3H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL
+#define UV3H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL
+#define UV3H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL
+#define UV3H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL
+#define UV3H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL
+#define UV3H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL
+#define UV3H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL
+#define UV3H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL
+#define UV3H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL
+#define UV3H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL
+#define UV3H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL
+#define UV3H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL
+#define UV3H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL
+#define UV3H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL
+#define UV3H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL
+#define UV3H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL
+#define UV3H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL
+#define UV3H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL
+#define UV3H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL
+#define UV3H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL
+#define UV3H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL
+#define UV3H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL
+#define UV3H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL
+
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT0_SHFT 0
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT1_SHFT 1
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT2_SHFT 2
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT3_SHFT 3
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT4_SHFT 4
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT5_SHFT 5
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT6_SHFT 6
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT7_SHFT 7
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT8_SHFT 8
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT9_SHFT 9
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT10_SHFT 10
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT11_SHFT 11
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT12_SHFT 12
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT13_SHFT 13
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT14_SHFT 14
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT15_SHFT 15
+#define UV4H_EVENT_OCCURRED2_RTC_INTERVAL_INT_SHFT 16
+#define UV4H_EVENT_OCCURRED2_BAU_DASHBOARD_INT_SHFT 17
+#define UV4H_EVENT_OCCURRED2_RTC_0_SHFT 18
+#define UV4H_EVENT_OCCURRED2_RTC_1_SHFT 19
+#define UV4H_EVENT_OCCURRED2_RTC_2_SHFT 20
+#define UV4H_EVENT_OCCURRED2_RTC_3_SHFT 21
+#define UV4H_EVENT_OCCURRED2_RTC_4_SHFT 22
+#define UV4H_EVENT_OCCURRED2_RTC_5_SHFT 23
+#define UV4H_EVENT_OCCURRED2_RTC_6_SHFT 24
+#define UV4H_EVENT_OCCURRED2_RTC_7_SHFT 25
+#define UV4H_EVENT_OCCURRED2_RTC_8_SHFT 26
+#define UV4H_EVENT_OCCURRED2_RTC_9_SHFT 27
+#define UV4H_EVENT_OCCURRED2_RTC_10_SHFT 28
+#define UV4H_EVENT_OCCURRED2_RTC_11_SHFT 29
+#define UV4H_EVENT_OCCURRED2_RTC_12_SHFT 30
+#define UV4H_EVENT_OCCURRED2_RTC_13_SHFT 31
+#define UV4H_EVENT_OCCURRED2_RTC_14_SHFT 32
+#define UV4H_EVENT_OCCURRED2_RTC_15_SHFT 33
+#define UV4H_EVENT_OCCURRED2_RTC_16_SHFT 34
+#define UV4H_EVENT_OCCURRED2_RTC_17_SHFT 35
+#define UV4H_EVENT_OCCURRED2_RTC_18_SHFT 36
+#define UV4H_EVENT_OCCURRED2_RTC_19_SHFT 37
+#define UV4H_EVENT_OCCURRED2_RTC_20_SHFT 38
+#define UV4H_EVENT_OCCURRED2_RTC_21_SHFT 39
+#define UV4H_EVENT_OCCURRED2_RTC_22_SHFT 40
+#define UV4H_EVENT_OCCURRED2_RTC_23_SHFT 41
+#define UV4H_EVENT_OCCURRED2_RTC_24_SHFT 42
+#define UV4H_EVENT_OCCURRED2_RTC_25_SHFT 43
+#define UV4H_EVENT_OCCURRED2_RTC_26_SHFT 44
+#define UV4H_EVENT_OCCURRED2_RTC_27_SHFT 45
+#define UV4H_EVENT_OCCURRED2_RTC_28_SHFT 46
+#define UV4H_EVENT_OCCURRED2_RTC_29_SHFT 47
+#define UV4H_EVENT_OCCURRED2_RTC_30_SHFT 48
+#define UV4H_EVENT_OCCURRED2_RTC_31_SHFT 49
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT0_MASK 0x0000000000000001UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT1_MASK 0x0000000000000002UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT2_MASK 0x0000000000000004UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT3_MASK 0x0000000000000008UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT4_MASK 0x0000000000000010UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT5_MASK 0x0000000000000020UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT6_MASK 0x0000000000000040UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT7_MASK 0x0000000000000080UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT8_MASK 0x0000000000000100UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT9_MASK 0x0000000000000200UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT10_MASK 0x0000000000000400UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT11_MASK 0x0000000000000800UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT12_MASK 0x0000000000001000UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT13_MASK 0x0000000000002000UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT14_MASK 0x0000000000004000UL
+#define UV4H_EVENT_OCCURRED2_MESSAGE_ACCELERATOR_INT15_MASK 0x0000000000008000UL
+#define UV4H_EVENT_OCCURRED2_RTC_INTERVAL_INT_MASK 0x0000000000010000UL
+#define UV4H_EVENT_OCCURRED2_BAU_DASHBOARD_INT_MASK 0x0000000000020000UL
+#define UV4H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000040000UL
+#define UV4H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000080000UL
+#define UV4H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000100000UL
+#define UV4H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000200000UL
+#define UV4H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000400000UL
+#define UV4H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000800000UL
+#define UV4H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000001000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000002000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000004000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000008000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000010000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000020000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000040000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000080000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000100000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000200000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000400000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000800000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_18_MASK 0x0000001000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_19_MASK 0x0000002000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_20_MASK 0x0000004000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_21_MASK 0x0000008000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_22_MASK 0x0000010000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_23_MASK 0x0000020000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_24_MASK 0x0000040000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_25_MASK 0x0000080000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_26_MASK 0x0000100000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_27_MASK 0x0000200000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_28_MASK 0x0000400000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_29_MASK 0x0000800000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_30_MASK 0x0001000000000000UL
+#define UV4H_EVENT_OCCURRED2_RTC_31_MASK 0x0002000000000000UL
+
+#define UVXH_EVENT_OCCURRED2_RTC_1_MASK ( \
+ is_uv2_hub() ? UV2H_EVENT_OCCURRED2_RTC_1_MASK : \
+ is_uv3_hub() ? UV3H_EVENT_OCCURRED2_RTC_1_MASK : \
+ /*is_uv4_hub*/ UV4H_EVENT_OCCURRED2_RTC_1_MASK)
+
+union uvh_event_occurred2_u {
+ unsigned long v;
+ struct uv2h_event_occurred2_s {
+ unsigned long rtc_0:1; /* RW */
+ unsigned long rtc_1:1; /* RW */
+ unsigned long rtc_2:1; /* RW */
+ unsigned long rtc_3:1; /* RW */
+ unsigned long rtc_4:1; /* RW */
+ unsigned long rtc_5:1; /* RW */
+ unsigned long rtc_6:1; /* RW */
+ unsigned long rtc_7:1; /* RW */
+ unsigned long rtc_8:1; /* RW */
+ unsigned long rtc_9:1; /* RW */
+ unsigned long rtc_10:1; /* RW */
+ unsigned long rtc_11:1; /* RW */
+ unsigned long rtc_12:1; /* RW */
+ unsigned long rtc_13:1; /* RW */
+ unsigned long rtc_14:1; /* RW */
+ unsigned long rtc_15:1; /* RW */
+ unsigned long rtc_16:1; /* RW */
+ unsigned long rtc_17:1; /* RW */
+ unsigned long rtc_18:1; /* RW */
+ unsigned long rtc_19:1; /* RW */
+ unsigned long rtc_20:1; /* RW */
+ unsigned long rtc_21:1; /* RW */
+ unsigned long rtc_22:1; /* RW */
+ unsigned long rtc_23:1; /* RW */
+ unsigned long rtc_24:1; /* RW */
+ unsigned long rtc_25:1; /* RW */
+ unsigned long rtc_26:1; /* RW */
+ unsigned long rtc_27:1; /* RW */
+ unsigned long rtc_28:1; /* RW */
+ unsigned long rtc_29:1; /* RW */
+ unsigned long rtc_30:1; /* RW */
+ unsigned long rtc_31:1; /* RW */
+ unsigned long rsvd_32_63:32;
+ } s2;
+ struct uv3h_event_occurred2_s {
+ unsigned long rtc_0:1; /* RW */
+ unsigned long rtc_1:1; /* RW */
+ unsigned long rtc_2:1; /* RW */
+ unsigned long rtc_3:1; /* RW */
+ unsigned long rtc_4:1; /* RW */
+ unsigned long rtc_5:1; /* RW */
+ unsigned long rtc_6:1; /* RW */
+ unsigned long rtc_7:1; /* RW */
+ unsigned long rtc_8:1; /* RW */
+ unsigned long rtc_9:1; /* RW */
+ unsigned long rtc_10:1; /* RW */
+ unsigned long rtc_11:1; /* RW */
+ unsigned long rtc_12:1; /* RW */
+ unsigned long rtc_13:1; /* RW */
+ unsigned long rtc_14:1; /* RW */
+ unsigned long rtc_15:1; /* RW */
+ unsigned long rtc_16:1; /* RW */
+ unsigned long rtc_17:1; /* RW */
+ unsigned long rtc_18:1; /* RW */
+ unsigned long rtc_19:1; /* RW */
+ unsigned long rtc_20:1; /* RW */
+ unsigned long rtc_21:1; /* RW */
+ unsigned long rtc_22:1; /* RW */
+ unsigned long rtc_23:1; /* RW */
+ unsigned long rtc_24:1; /* RW */
+ unsigned long rtc_25:1; /* RW */
+ unsigned long rtc_26:1; /* RW */
+ unsigned long rtc_27:1; /* RW */
+ unsigned long rtc_28:1; /* RW */
+ unsigned long rtc_29:1; /* RW */
+ unsigned long rtc_30:1; /* RW */
+ unsigned long rtc_31:1; /* RW */
+ unsigned long rsvd_32_63:32;
+ } s3;
+ struct uv4h_event_occurred2_s {
+ unsigned long message_accelerator_int0:1; /* RW */
+ unsigned long message_accelerator_int1:1; /* RW */
+ unsigned long message_accelerator_int2:1; /* RW */
+ unsigned long message_accelerator_int3:1; /* RW */
+ unsigned long message_accelerator_int4:1; /* RW */
+ unsigned long message_accelerator_int5:1; /* RW */
+ unsigned long message_accelerator_int6:1; /* RW */
+ unsigned long message_accelerator_int7:1; /* RW */
+ unsigned long message_accelerator_int8:1; /* RW */
+ unsigned long message_accelerator_int9:1; /* RW */
+ unsigned long message_accelerator_int10:1; /* RW */
+ unsigned long message_accelerator_int11:1; /* RW */
+ unsigned long message_accelerator_int12:1; /* RW */
+ unsigned long message_accelerator_int13:1; /* RW */
+ unsigned long message_accelerator_int14:1; /* RW */
+ unsigned long message_accelerator_int15:1; /* RW */
+ unsigned long rtc_interval_int:1; /* RW */
+ unsigned long bau_dashboard_int:1; /* RW */
+ unsigned long rtc_0:1; /* RW */
+ unsigned long rtc_1:1; /* RW */
+ unsigned long rtc_2:1; /* RW */
+ unsigned long rtc_3:1; /* RW */
+ unsigned long rtc_4:1; /* RW */
+ unsigned long rtc_5:1; /* RW */
+ unsigned long rtc_6:1; /* RW */
+ unsigned long rtc_7:1; /* RW */
+ unsigned long rtc_8:1; /* RW */
+ unsigned long rtc_9:1; /* RW */
+ unsigned long rtc_10:1; /* RW */
+ unsigned long rtc_11:1; /* RW */
+ unsigned long rtc_12:1; /* RW */
+ unsigned long rtc_13:1; /* RW */
+ unsigned long rtc_14:1; /* RW */
+ unsigned long rtc_15:1; /* RW */
+ unsigned long rtc_16:1; /* RW */
+ unsigned long rtc_17:1; /* RW */
+ unsigned long rtc_18:1; /* RW */
+ unsigned long rtc_19:1; /* RW */
+ unsigned long rtc_20:1; /* RW */
+ unsigned long rtc_21:1; /* RW */
+ unsigned long rtc_22:1; /* RW */
+ unsigned long rtc_23:1; /* RW */
+ unsigned long rtc_24:1; /* RW */
+ unsigned long rtc_25:1; /* RW */
+ unsigned long rtc_26:1; /* RW */
+ unsigned long rtc_27:1; /* RW */
+ unsigned long rtc_28:1; /* RW */
+ unsigned long rtc_29:1; /* RW */
+ unsigned long rtc_30:1; /* RW */
+ unsigned long rtc_31:1; /* RW */
+ unsigned long rsvd_50_63:14;
+ } s4;
+};
+
+/* ========================================================================= */
+/* UVXH_EVENT_OCCURRED2_ALIAS */
+/* ========================================================================= */
+#define UVXH_EVENT_OCCURRED2_ALIAS 0x70108UL
+
+#define UV2H_EVENT_OCCURRED2_ALIAS_32 0xb70
+#define UV3H_EVENT_OCCURRED2_ALIAS_32 0xb70
+#define UV4H_EVENT_OCCURRED2_ALIAS_32 0x610
+#define UVH_EVENT_OCCURRED2_ALIAS_32 ( \
+ is_uv2_hub() ? UV2H_EVENT_OCCURRED2_ALIAS_32 : \
+ is_uv3_hub() ? UV3H_EVENT_OCCURRED2_ALIAS_32 : \
+ /*is_uv4_hub*/ UV4H_EVENT_OCCURRED2_ALIAS_32)
+
+
+/* ========================================================================= */
+/* UVXH_LB_BAU_SB_ACTIVATION_STATUS_2 */
+/* ========================================================================= */
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL
+#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL
+#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_2 0xc8130UL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_2 ( \
+ is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 : \
+ is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_2 : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_2)
+
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0
+#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0
+#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0xa10
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_2_32 ( \
+ is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 : \
+ is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_32 : \
+ /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_2_32)
+
+#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0
+#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL
+
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL
+
+#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0
+#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL
+
+#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0
+#define UV4H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL
+
+
+union uvxh_lb_bau_sb_activation_status_2_u {
+ unsigned long v;
+ struct uvxh_lb_bau_sb_activation_status_2_s {
+ unsigned long aux_error:64; /* RW */
+ } sx;
+ struct uv2h_lb_bau_sb_activation_status_2_s {
+ unsigned long aux_error:64; /* RW */
+ } s2;
+ struct uv3h_lb_bau_sb_activation_status_2_s {
+ unsigned long aux_error:64; /* RW */
+ } s3;
+ struct uv4h_lb_bau_sb_activation_status_2_s {
+ unsigned long aux_error:64; /* RW */
+ } s4;
+};
+
+/* ========================================================================= */
+/* UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK */
+/* ========================================================================= */
+#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL
+#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x9f0
+
+#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0
+#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL
+
+union uv1h_lb_target_physical_apic_id_mask_u {
+ unsigned long v;
+ struct uv1h_lb_target_physical_apic_id_mask_s {
+ unsigned long bit_enables:32; /* RW */
+ unsigned long rsvd_32_63:32;
+ } s1;
+};
+
+/* ========================================================================= */
+/* UV3H_GR0_GAM_GR_CONFIG */
+/* ========================================================================= */
+#define UV3H_GR0_GAM_GR_CONFIG 0xc00028UL
+
+#define UV3H_GR0_GAM_GR_CONFIG_M_SKT_SHFT 0
+#define UV3H_GR0_GAM_GR_CONFIG_SUBSPACE_SHFT 10
+#define UV3H_GR0_GAM_GR_CONFIG_M_SKT_MASK 0x000000000000003fUL
+#define UV3H_GR0_GAM_GR_CONFIG_SUBSPACE_MASK 0x0000000000000400UL
+
+union uv3h_gr0_gam_gr_config_u {
+ unsigned long v;
+ struct uv3h_gr0_gam_gr_config_s {
+ unsigned long m_skt:6; /* RW */
+ unsigned long undef_6_9:4; /* Undefined */
+ unsigned long subspace:1; /* RW */
+ unsigned long reserved:53;
+ } s3;
+};
+
+/* ========================================================================= */
+/* UV4H_LB_PROC_INTD_QUEUE_FIRST */
+/* ========================================================================= */
+#define UV4H_LB_PROC_INTD_QUEUE_FIRST 0xa4100UL
+
+#define UV4H_LB_PROC_INTD_QUEUE_FIRST_FIRST_PAYLOAD_ADDRESS_SHFT 6
+#define UV4H_LB_PROC_INTD_QUEUE_FIRST_FIRST_PAYLOAD_ADDRESS_MASK 0x00003fffffffffc0UL
+
+union uv4h_lb_proc_intd_queue_first_u {
+ unsigned long v;
+ struct uv4h_lb_proc_intd_queue_first_s {
+ unsigned long undef_0_5:6; /* Undefined */
+ unsigned long first_payload_address:40; /* RW */
+ } s4;
+};
+
+/* ========================================================================= */
+/* UV4H_LB_PROC_INTD_QUEUE_LAST */
+/* ========================================================================= */
+#define UV4H_LB_PROC_INTD_QUEUE_LAST 0xa4108UL
+
+#define UV4H_LB_PROC_INTD_QUEUE_LAST_LAST_PAYLOAD_ADDRESS_SHFT 5
+#define UV4H_LB_PROC_INTD_QUEUE_LAST_LAST_PAYLOAD_ADDRESS_MASK 0x00003fffffffffe0UL
+
+union uv4h_lb_proc_intd_queue_last_u {
+ unsigned long v;
+ struct uv4h_lb_proc_intd_queue_last_s {
+ unsigned long undef_0_4:5; /* Undefined */
+ unsigned long last_payload_address:41; /* RW */
+ } s4;
+};
+
+/* ========================================================================= */
+/* UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR */
+/* ========================================================================= */
+#define UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR 0xa4118UL
+
+#define UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR_SOFT_ACK_PENDING_FLAGS_SHFT 0
+#define UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR_SOFT_ACK_PENDING_FLAGS_MASK 0x00000000000000ffUL
+
+union uv4h_lb_proc_intd_soft_ack_clear_u {
+ unsigned long v;
+ struct uv4h_lb_proc_intd_soft_ack_clear_s {
+ unsigned long soft_ack_pending_flags:8; /* WP */
+ } s4;
+};
+
+/* ========================================================================= */
+/* UV4H_LB_PROC_INTD_SOFT_ACK_PENDING */
+/* ========================================================================= */
+#define UV4H_LB_PROC_INTD_SOFT_ACK_PENDING 0xa4110UL
+
+#define UV4H_LB_PROC_INTD_SOFT_ACK_PENDING_SOFT_ACK_FLAGS_SHFT 0
+#define UV4H_LB_PROC_INTD_SOFT_ACK_PENDING_SOFT_ACK_FLAGS_MASK 0x00000000000000ffUL
+
+union uv4h_lb_proc_intd_soft_ack_pending_u {
+ unsigned long v;
+ struct uv4h_lb_proc_intd_soft_ack_pending_s {
+ unsigned long soft_ack_flags:8; /* RW */
+ } s4;
+};
+
+
+#endif /* _ASM_X86_UV_UV_MMRS_H */
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
new file mode 100644
index 000000000..27566e57e
--- /dev/null
+++ b/arch/x86/include/asm/vdso.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_VDSO_H
+#define _ASM_X86_VDSO_H
+
+#include <asm/page_types.h>
+#include <linux/linkage.h>
+#include <linux/init.h>
+
+#ifndef __ASSEMBLER__
+
+#include <linux/mm_types.h>
+
+struct vdso_image {
+ void *data;
+ unsigned long size; /* Always a multiple of PAGE_SIZE */
+
+ unsigned long alt, alt_len;
+
+ long sym_vvar_start; /* Negative offset to the vvar area */
+
+ long sym_vvar_page;
+ long sym_hpet_page;
+ long sym_pvclock_page;
+ long sym_hvclock_page;
+ long sym_VDSO32_NOTE_MASK;
+ long sym___kernel_sigreturn;
+ long sym___kernel_rt_sigreturn;
+ long sym___kernel_vsyscall;
+ long sym_int80_landing_pad;
+};
+
+#ifdef CONFIG_X86_64
+extern const struct vdso_image vdso_image_64;
+#endif
+
+#ifdef CONFIG_X86_X32
+extern const struct vdso_image vdso_image_x32;
+#endif
+
+#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
+extern const struct vdso_image vdso_image_32;
+#endif
+
+extern void __init init_vdso_image(const struct vdso_image *image);
+
+extern int map_vdso_once(const struct vdso_image *image, unsigned long addr);
+
+#endif /* __ASSEMBLER__ */
+
+#endif /* _ASM_X86_VDSO_H */
diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h
new file mode 100644
index 000000000..46f9b2dea
--- /dev/null
+++ b/arch/x86/include/asm/vga.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Access to VGA videoram
+ *
+ * (c) 1998 Martin Mares <mj@ucw.cz>
+ */
+
+#ifndef _ASM_X86_VGA_H
+#define _ASM_X86_VGA_H
+
+#include <asm/set_memory.h>
+
+/*
+ * On the PC, we can just recalculate addresses and then
+ * access the videoram directly without any black magic.
+ * To support memory encryption however, we need to access
+ * the videoram as decrypted memory.
+ */
+
+#define VGA_MAP_MEM(x, s) \
+({ \
+ unsigned long start = (unsigned long)phys_to_virt(x); \
+ \
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) \
+ set_memory_decrypted(start, (s) >> PAGE_SHIFT); \
+ \
+ start; \
+})
+
+#define vga_readb(x) (*(x))
+#define vga_writeb(x, y) (*(y) = (x))
+
+#endif /* _ASM_X86_VGA_H */
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
new file mode 100644
index 000000000..53748541c
--- /dev/null
+++ b/arch/x86/include/asm/vgtod.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_VGTOD_H
+#define _ASM_X86_VGTOD_H
+
+#include <linux/compiler.h>
+#include <linux/clocksource.h>
+
+#ifdef BUILD_VDSO32_64
+typedef u64 gtod_long_t;
+#else
+typedef unsigned long gtod_long_t;
+#endif
+/*
+ * vsyscall_gtod_data will be accessed by 32 and 64 bit code at the same time
+ * so be carefull by modifying this structure.
+ */
+struct vsyscall_gtod_data {
+ unsigned seq;
+
+ int vclock_mode;
+ u64 cycle_last;
+ u64 mask;
+ u32 mult;
+ u32 shift;
+
+ /* open coded 'struct timespec' */
+ u64 wall_time_snsec;
+ gtod_long_t wall_time_sec;
+ gtod_long_t monotonic_time_sec;
+ u64 monotonic_time_snsec;
+ gtod_long_t wall_time_coarse_sec;
+ gtod_long_t wall_time_coarse_nsec;
+ gtod_long_t monotonic_time_coarse_sec;
+ gtod_long_t monotonic_time_coarse_nsec;
+
+ int tz_minuteswest;
+ int tz_dsttime;
+};
+extern struct vsyscall_gtod_data vsyscall_gtod_data;
+
+extern int vclocks_used;
+static inline bool vclock_was_used(int vclock)
+{
+ return READ_ONCE(vclocks_used) & (1 << vclock);
+}
+
+static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s)
+{
+ unsigned ret;
+
+repeat:
+ ret = READ_ONCE(s->seq);
+ if (unlikely(ret & 1)) {
+ cpu_relax();
+ goto repeat;
+ }
+ smp_rmb();
+ return ret;
+}
+
+static inline int gtod_read_retry(const struct vsyscall_gtod_data *s,
+ unsigned start)
+{
+ smp_rmb();
+ return unlikely(s->seq != start);
+}
+
+static inline void gtod_write_begin(struct vsyscall_gtod_data *s)
+{
+ ++s->seq;
+ smp_wmb();
+}
+
+static inline void gtod_write_end(struct vsyscall_gtod_data *s)
+{
+ smp_wmb();
+ ++s->seq;
+}
+
+#ifdef CONFIG_X86_64
+
+#define VGETCPU_CPU_MASK 0xfff
+
+static inline unsigned int __getcpu(void)
+{
+ unsigned int p;
+
+ /*
+ * Load per CPU data from GDT. LSL is faster than RDTSCP and
+ * works on all CPUs. This is volatile so that it orders
+ * correctly wrt barrier() and to keep gcc from cleverly
+ * hoisting it out of the calling function.
+ *
+ * If RDPID is available, use it.
+ */
+ alternative_io ("lsl %[seg],%[p]",
+ ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */
+ X86_FEATURE_RDPID,
+ [p] "=a" (p), [seg] "r" (__PER_CPU_SEG));
+
+ return p;
+}
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* _ASM_X86_VGTOD_H */
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
new file mode 100644
index 000000000..0116b2ee9
--- /dev/null
+++ b/arch/x86/include/asm/virtext.h
@@ -0,0 +1,128 @@
+/* CPU virtualization extensions handling
+ *
+ * This should carry the code for handling CPU virtualization extensions
+ * that needs to live in the kernel core.
+ *
+ * Author: Eduardo Habkost <ehabkost@redhat.com>
+ *
+ * Copyright (C) 2008, Red Hat Inc.
+ *
+ * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+#ifndef _ASM_X86_VIRTEX_H
+#define _ASM_X86_VIRTEX_H
+
+#include <asm/processor.h>
+
+#include <asm/vmx.h>
+#include <asm/svm.h>
+#include <asm/tlbflush.h>
+
+/*
+ * VMX functions:
+ */
+
+static inline int cpu_has_vmx(void)
+{
+ unsigned long ecx = cpuid_ecx(1);
+ return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
+}
+
+
+/** Disable VMX on the current CPU
+ *
+ * vmxoff causes a undefined-opcode exception if vmxon was not run
+ * on the CPU previously. Only call this function if you know VMX
+ * is enabled.
+ */
+static inline void cpu_vmxoff(void)
+{
+ asm volatile (ASM_VMX_VMXOFF : : : "cc");
+ cr4_clear_bits(X86_CR4_VMXE);
+}
+
+static inline int cpu_vmx_enabled(void)
+{
+ return __read_cr4() & X86_CR4_VMXE;
+}
+
+/** Disable VMX if it is enabled on the current CPU
+ *
+ * You shouldn't call this if cpu_has_vmx() returns 0.
+ */
+static inline void __cpu_emergency_vmxoff(void)
+{
+ if (cpu_vmx_enabled())
+ cpu_vmxoff();
+}
+
+/** Disable VMX if it is supported and enabled on the current CPU
+ */
+static inline void cpu_emergency_vmxoff(void)
+{
+ if (cpu_has_vmx())
+ __cpu_emergency_vmxoff();
+}
+
+
+
+
+/*
+ * SVM functions:
+ */
+
+/** Check if the CPU has SVM support
+ *
+ * You can use the 'msg' arg to get a message describing the problem,
+ * if the function returns zero. Simply pass NULL if you are not interested
+ * on the messages; gcc should take care of not generating code for
+ * the messages on this case.
+ */
+static inline int cpu_has_svm(const char **msg)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
+ if (msg)
+ *msg = "not amd";
+ return 0;
+ }
+
+ if (boot_cpu_data.extended_cpuid_level < SVM_CPUID_FUNC) {
+ if (msg)
+ *msg = "can't execute cpuid_8000000a";
+ return 0;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_SVM)) {
+ if (msg)
+ *msg = "svm not available";
+ return 0;
+ }
+ return 1;
+}
+
+
+/** Disable SVM on the current CPU
+ *
+ * You should call this only if cpu_has_svm() returned true.
+ */
+static inline void cpu_svm_disable(void)
+{
+ uint64_t efer;
+
+ wrmsrl(MSR_VM_HSAVE_PA, 0);
+ rdmsrl(MSR_EFER, efer);
+ wrmsrl(MSR_EFER, efer & ~EFER_SVME);
+}
+
+/** Makes sure SVM is disabled, if it is supported on the CPU
+ */
+static inline void cpu_emergency_svm_disable(void)
+{
+ if (cpu_has_svm(NULL))
+ cpu_svm_disable();
+}
+
+#endif /* _ASM_X86_VIRTEX_H */
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
new file mode 100644
index 000000000..26efbec94
--- /dev/null
+++ b/arch/x86/include/asm/vm86.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_VM86_H
+#define _ASM_X86_VM86_H
+
+#include <asm/ptrace.h>
+#include <uapi/asm/vm86.h>
+
+/*
+ * This is the (kernel) stack-layout when we have done a "SAVE_ALL" from vm86
+ * mode - the main change is that the old segment descriptors aren't
+ * useful any more and are forced to be zero by the kernel (and the
+ * hardware when a trap occurs), and the real segment descriptors are
+ * at the end of the structure. Look at ptrace.h to see the "normal"
+ * setup. For user space layout see 'struct vm86_regs' above.
+ */
+
+struct kernel_vm86_regs {
+/*
+ * normal regs, with special meaning for the segment descriptors..
+ */
+ struct pt_regs pt;
+/*
+ * these are specific to v86 mode:
+ */
+ unsigned short es, __esh;
+ unsigned short ds, __dsh;
+ unsigned short fs, __fsh;
+ unsigned short gs, __gsh;
+};
+
+struct vm86 {
+ struct vm86plus_struct __user *user_vm86;
+ struct pt_regs regs32;
+ unsigned long veflags;
+ unsigned long veflags_mask;
+ unsigned long saved_sp0;
+
+ unsigned long flags;
+ unsigned long screen_bitmap;
+ unsigned long cpu_type;
+ struct revectored_struct int_revectored;
+ struct revectored_struct int21_revectored;
+ struct vm86plus_info_struct vm86plus;
+};
+
+#ifdef CONFIG_VM86
+
+void handle_vm86_fault(struct kernel_vm86_regs *, long);
+int handle_vm86_trap(struct kernel_vm86_regs *, long, int);
+void save_v86_state(struct kernel_vm86_regs *, int);
+
+struct task_struct;
+
+#define free_vm86(t) do { \
+ struct thread_struct *__t = (t); \
+ if (__t->vm86 != NULL) { \
+ kfree(__t->vm86); \
+ __t->vm86 = NULL; \
+ } \
+} while (0)
+
+/*
+ * Support for VM86 programs to request interrupts for
+ * real mode hardware drivers:
+ */
+#define FIRST_VM86_IRQ 3
+#define LAST_VM86_IRQ 15
+
+static inline int invalid_vm86_irq(int irq)
+{
+ return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ;
+}
+
+void release_vm86_irqs(struct task_struct *);
+
+#else
+
+#define handle_vm86_fault(a, b)
+#define release_vm86_irqs(a)
+
+static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c)
+{
+ return 0;
+}
+
+static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { }
+
+#define free_vm86(t) do { } while(0)
+
+#endif /* CONFIG_VM86 */
+
+#endif /* _ASM_X86_VM86_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
new file mode 100644
index 000000000..9527ba5d6
--- /dev/null
+++ b/arch/x86/include/asm/vmx.h
@@ -0,0 +1,593 @@
+/*
+ * vmx.h: VMX Architecture related definitions
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * A few random additions are:
+ * Copyright (C) 2006 Qumranet
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ *
+ */
+#ifndef VMX_H
+#define VMX_H
+
+
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <uapi/asm/vmx.h>
+
+/*
+ * Definitions of Primary Processor-Based VM-Execution Controls.
+ */
+#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004
+#define CPU_BASED_USE_TSC_OFFSETING 0x00000008
+#define CPU_BASED_HLT_EXITING 0x00000080
+#define CPU_BASED_INVLPG_EXITING 0x00000200
+#define CPU_BASED_MWAIT_EXITING 0x00000400
+#define CPU_BASED_RDPMC_EXITING 0x00000800
+#define CPU_BASED_RDTSC_EXITING 0x00001000
+#define CPU_BASED_CR3_LOAD_EXITING 0x00008000
+#define CPU_BASED_CR3_STORE_EXITING 0x00010000
+#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
+#define CPU_BASED_CR8_STORE_EXITING 0x00100000
+#define CPU_BASED_TPR_SHADOW 0x00200000
+#define CPU_BASED_VIRTUAL_NMI_PENDING 0x00400000
+#define CPU_BASED_MOV_DR_EXITING 0x00800000
+#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
+#define CPU_BASED_USE_IO_BITMAPS 0x02000000
+#define CPU_BASED_MONITOR_TRAP_FLAG 0x08000000
+#define CPU_BASED_USE_MSR_BITMAPS 0x10000000
+#define CPU_BASED_MONITOR_EXITING 0x20000000
+#define CPU_BASED_PAUSE_EXITING 0x40000000
+#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
+
+#define CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x0401e172
+
+/*
+ * Definitions of Secondary Processor-Based VM-Execution Controls.
+ */
+#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
+#define SECONDARY_EXEC_DESC 0x00000004
+#define SECONDARY_EXEC_RDTSCP 0x00000008
+#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010
+#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
+#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
+#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
+#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100
+#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200
+#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
+#define SECONDARY_EXEC_RDRAND_EXITING 0x00000800
+#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
+#define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000
+#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
+#define SECONDARY_EXEC_ENCLS_EXITING 0x00008000
+#define SECONDARY_EXEC_RDSEED_EXITING 0x00010000
+#define SECONDARY_EXEC_ENABLE_PML 0x00020000
+#define SECONDARY_EXEC_XSAVES 0x00100000
+#define SECONDARY_EXEC_TSC_SCALING 0x02000000
+
+#define PIN_BASED_EXT_INTR_MASK 0x00000001
+#define PIN_BASED_NMI_EXITING 0x00000008
+#define PIN_BASED_VIRTUAL_NMIS 0x00000020
+#define PIN_BASED_VMX_PREEMPTION_TIMER 0x00000040
+#define PIN_BASED_POSTED_INTR 0x00000080
+
+#define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016
+
+#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000004
+#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
+#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000
+#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
+#define VM_EXIT_SAVE_IA32_PAT 0x00040000
+#define VM_EXIT_LOAD_IA32_PAT 0x00080000
+#define VM_EXIT_SAVE_IA32_EFER 0x00100000
+#define VM_EXIT_LOAD_IA32_EFER 0x00200000
+#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000
+#define VM_EXIT_CLEAR_BNDCFGS 0x00800000
+
+#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff
+
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000004
+#define VM_ENTRY_IA32E_MODE 0x00000200
+#define VM_ENTRY_SMM 0x00000400
+#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
+#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000
+#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
+#define VM_ENTRY_LOAD_IA32_EFER 0x00008000
+#define VM_ENTRY_LOAD_BNDCFGS 0x00010000
+
+#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff
+
+#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f
+#define VMX_MISC_SAVE_EFER_LMA 0x00000020
+#define VMX_MISC_ACTIVITY_HLT 0x00000040
+#define VMX_MISC_ZERO_LEN_INS 0x40000000
+
+/* VMFUNC functions */
+#define VMX_VMFUNC_EPTP_SWITCHING 0x00000001
+#define VMFUNC_EPTP_ENTRIES 512
+
+static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
+{
+ return vmx_basic & GENMASK_ULL(30, 0);
+}
+
+static inline u32 vmx_basic_vmcs_size(u64 vmx_basic)
+{
+ return (vmx_basic & GENMASK_ULL(44, 32)) >> 32;
+}
+
+static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc)
+{
+ return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+}
+
+static inline int vmx_misc_cr3_count(u64 vmx_misc)
+{
+ return (vmx_misc & GENMASK_ULL(24, 16)) >> 16;
+}
+
+static inline int vmx_misc_max_msr(u64 vmx_misc)
+{
+ return (vmx_misc & GENMASK_ULL(27, 25)) >> 25;
+}
+
+static inline int vmx_misc_mseg_revid(u64 vmx_misc)
+{
+ return (vmx_misc & GENMASK_ULL(63, 32)) >> 32;
+}
+
+/* VMCS Encodings */
+enum vmcs_field {
+ VIRTUAL_PROCESSOR_ID = 0x00000000,
+ POSTED_INTR_NV = 0x00000002,
+ GUEST_ES_SELECTOR = 0x00000800,
+ GUEST_CS_SELECTOR = 0x00000802,
+ GUEST_SS_SELECTOR = 0x00000804,
+ GUEST_DS_SELECTOR = 0x00000806,
+ GUEST_FS_SELECTOR = 0x00000808,
+ GUEST_GS_SELECTOR = 0x0000080a,
+ GUEST_LDTR_SELECTOR = 0x0000080c,
+ GUEST_TR_SELECTOR = 0x0000080e,
+ GUEST_INTR_STATUS = 0x00000810,
+ GUEST_PML_INDEX = 0x00000812,
+ HOST_ES_SELECTOR = 0x00000c00,
+ HOST_CS_SELECTOR = 0x00000c02,
+ HOST_SS_SELECTOR = 0x00000c04,
+ HOST_DS_SELECTOR = 0x00000c06,
+ HOST_FS_SELECTOR = 0x00000c08,
+ HOST_GS_SELECTOR = 0x00000c0a,
+ HOST_TR_SELECTOR = 0x00000c0c,
+ IO_BITMAP_A = 0x00002000,
+ IO_BITMAP_A_HIGH = 0x00002001,
+ IO_BITMAP_B = 0x00002002,
+ IO_BITMAP_B_HIGH = 0x00002003,
+ MSR_BITMAP = 0x00002004,
+ MSR_BITMAP_HIGH = 0x00002005,
+ VM_EXIT_MSR_STORE_ADDR = 0x00002006,
+ VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007,
+ VM_EXIT_MSR_LOAD_ADDR = 0x00002008,
+ VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009,
+ VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a,
+ VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b,
+ PML_ADDRESS = 0x0000200e,
+ PML_ADDRESS_HIGH = 0x0000200f,
+ TSC_OFFSET = 0x00002010,
+ TSC_OFFSET_HIGH = 0x00002011,
+ VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
+ VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
+ APIC_ACCESS_ADDR = 0x00002014,
+ APIC_ACCESS_ADDR_HIGH = 0x00002015,
+ POSTED_INTR_DESC_ADDR = 0x00002016,
+ POSTED_INTR_DESC_ADDR_HIGH = 0x00002017,
+ VM_FUNCTION_CONTROL = 0x00002018,
+ VM_FUNCTION_CONTROL_HIGH = 0x00002019,
+ EPT_POINTER = 0x0000201a,
+ EPT_POINTER_HIGH = 0x0000201b,
+ EOI_EXIT_BITMAP0 = 0x0000201c,
+ EOI_EXIT_BITMAP0_HIGH = 0x0000201d,
+ EOI_EXIT_BITMAP1 = 0x0000201e,
+ EOI_EXIT_BITMAP1_HIGH = 0x0000201f,
+ EOI_EXIT_BITMAP2 = 0x00002020,
+ EOI_EXIT_BITMAP2_HIGH = 0x00002021,
+ EOI_EXIT_BITMAP3 = 0x00002022,
+ EOI_EXIT_BITMAP3_HIGH = 0x00002023,
+ EPTP_LIST_ADDRESS = 0x00002024,
+ EPTP_LIST_ADDRESS_HIGH = 0x00002025,
+ VMREAD_BITMAP = 0x00002026,
+ VMREAD_BITMAP_HIGH = 0x00002027,
+ VMWRITE_BITMAP = 0x00002028,
+ VMWRITE_BITMAP_HIGH = 0x00002029,
+ XSS_EXIT_BITMAP = 0x0000202C,
+ XSS_EXIT_BITMAP_HIGH = 0x0000202D,
+ ENCLS_EXITING_BITMAP = 0x0000202E,
+ ENCLS_EXITING_BITMAP_HIGH = 0x0000202F,
+ TSC_MULTIPLIER = 0x00002032,
+ TSC_MULTIPLIER_HIGH = 0x00002033,
+ GUEST_PHYSICAL_ADDRESS = 0x00002400,
+ GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
+ VMCS_LINK_POINTER = 0x00002800,
+ VMCS_LINK_POINTER_HIGH = 0x00002801,
+ GUEST_IA32_DEBUGCTL = 0x00002802,
+ GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
+ GUEST_IA32_PAT = 0x00002804,
+ GUEST_IA32_PAT_HIGH = 0x00002805,
+ GUEST_IA32_EFER = 0x00002806,
+ GUEST_IA32_EFER_HIGH = 0x00002807,
+ GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
+ GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809,
+ GUEST_PDPTR0 = 0x0000280a,
+ GUEST_PDPTR0_HIGH = 0x0000280b,
+ GUEST_PDPTR1 = 0x0000280c,
+ GUEST_PDPTR1_HIGH = 0x0000280d,
+ GUEST_PDPTR2 = 0x0000280e,
+ GUEST_PDPTR2_HIGH = 0x0000280f,
+ GUEST_PDPTR3 = 0x00002810,
+ GUEST_PDPTR3_HIGH = 0x00002811,
+ GUEST_BNDCFGS = 0x00002812,
+ GUEST_BNDCFGS_HIGH = 0x00002813,
+ HOST_IA32_PAT = 0x00002c00,
+ HOST_IA32_PAT_HIGH = 0x00002c01,
+ HOST_IA32_EFER = 0x00002c02,
+ HOST_IA32_EFER_HIGH = 0x00002c03,
+ HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
+ HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05,
+ PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
+ CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
+ EXCEPTION_BITMAP = 0x00004004,
+ PAGE_FAULT_ERROR_CODE_MASK = 0x00004006,
+ PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008,
+ CR3_TARGET_COUNT = 0x0000400a,
+ VM_EXIT_CONTROLS = 0x0000400c,
+ VM_EXIT_MSR_STORE_COUNT = 0x0000400e,
+ VM_EXIT_MSR_LOAD_COUNT = 0x00004010,
+ VM_ENTRY_CONTROLS = 0x00004012,
+ VM_ENTRY_MSR_LOAD_COUNT = 0x00004014,
+ VM_ENTRY_INTR_INFO_FIELD = 0x00004016,
+ VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018,
+ VM_ENTRY_INSTRUCTION_LEN = 0x0000401a,
+ TPR_THRESHOLD = 0x0000401c,
+ SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
+ PLE_GAP = 0x00004020,
+ PLE_WINDOW = 0x00004022,
+ VM_INSTRUCTION_ERROR = 0x00004400,
+ VM_EXIT_REASON = 0x00004402,
+ VM_EXIT_INTR_INFO = 0x00004404,
+ VM_EXIT_INTR_ERROR_CODE = 0x00004406,
+ IDT_VECTORING_INFO_FIELD = 0x00004408,
+ IDT_VECTORING_ERROR_CODE = 0x0000440a,
+ VM_EXIT_INSTRUCTION_LEN = 0x0000440c,
+ VMX_INSTRUCTION_INFO = 0x0000440e,
+ GUEST_ES_LIMIT = 0x00004800,
+ GUEST_CS_LIMIT = 0x00004802,
+ GUEST_SS_LIMIT = 0x00004804,
+ GUEST_DS_LIMIT = 0x00004806,
+ GUEST_FS_LIMIT = 0x00004808,
+ GUEST_GS_LIMIT = 0x0000480a,
+ GUEST_LDTR_LIMIT = 0x0000480c,
+ GUEST_TR_LIMIT = 0x0000480e,
+ GUEST_GDTR_LIMIT = 0x00004810,
+ GUEST_IDTR_LIMIT = 0x00004812,
+ GUEST_ES_AR_BYTES = 0x00004814,
+ GUEST_CS_AR_BYTES = 0x00004816,
+ GUEST_SS_AR_BYTES = 0x00004818,
+ GUEST_DS_AR_BYTES = 0x0000481a,
+ GUEST_FS_AR_BYTES = 0x0000481c,
+ GUEST_GS_AR_BYTES = 0x0000481e,
+ GUEST_LDTR_AR_BYTES = 0x00004820,
+ GUEST_TR_AR_BYTES = 0x00004822,
+ GUEST_INTERRUPTIBILITY_INFO = 0x00004824,
+ GUEST_ACTIVITY_STATE = 0X00004826,
+ GUEST_SYSENTER_CS = 0x0000482A,
+ VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
+ HOST_IA32_SYSENTER_CS = 0x00004c00,
+ CR0_GUEST_HOST_MASK = 0x00006000,
+ CR4_GUEST_HOST_MASK = 0x00006002,
+ CR0_READ_SHADOW = 0x00006004,
+ CR4_READ_SHADOW = 0x00006006,
+ CR3_TARGET_VALUE0 = 0x00006008,
+ CR3_TARGET_VALUE1 = 0x0000600a,
+ CR3_TARGET_VALUE2 = 0x0000600c,
+ CR3_TARGET_VALUE3 = 0x0000600e,
+ EXIT_QUALIFICATION = 0x00006400,
+ GUEST_LINEAR_ADDRESS = 0x0000640a,
+ GUEST_CR0 = 0x00006800,
+ GUEST_CR3 = 0x00006802,
+ GUEST_CR4 = 0x00006804,
+ GUEST_ES_BASE = 0x00006806,
+ GUEST_CS_BASE = 0x00006808,
+ GUEST_SS_BASE = 0x0000680a,
+ GUEST_DS_BASE = 0x0000680c,
+ GUEST_FS_BASE = 0x0000680e,
+ GUEST_GS_BASE = 0x00006810,
+ GUEST_LDTR_BASE = 0x00006812,
+ GUEST_TR_BASE = 0x00006814,
+ GUEST_GDTR_BASE = 0x00006816,
+ GUEST_IDTR_BASE = 0x00006818,
+ GUEST_DR7 = 0x0000681a,
+ GUEST_RSP = 0x0000681c,
+ GUEST_RIP = 0x0000681e,
+ GUEST_RFLAGS = 0x00006820,
+ GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822,
+ GUEST_SYSENTER_ESP = 0x00006824,
+ GUEST_SYSENTER_EIP = 0x00006826,
+ HOST_CR0 = 0x00006c00,
+ HOST_CR3 = 0x00006c02,
+ HOST_CR4 = 0x00006c04,
+ HOST_FS_BASE = 0x00006c06,
+ HOST_GS_BASE = 0x00006c08,
+ HOST_TR_BASE = 0x00006c0a,
+ HOST_GDTR_BASE = 0x00006c0c,
+ HOST_IDTR_BASE = 0x00006c0e,
+ HOST_IA32_SYSENTER_ESP = 0x00006c10,
+ HOST_IA32_SYSENTER_EIP = 0x00006c12,
+ HOST_RSP = 0x00006c14,
+ HOST_RIP = 0x00006c16,
+};
+
+/*
+ * Interruption-information format
+ */
+#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */
+#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */
+#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */
+#define INTR_INFO_UNBLOCK_NMI 0x1000 /* 12 */
+#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */
+#define INTR_INFO_RESVD_BITS_MASK 0x7ffff000
+
+#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK
+#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK
+#define VECTORING_INFO_DELIVER_CODE_MASK INTR_INFO_DELIVER_CODE_MASK
+#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK
+
+#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
+#define INTR_TYPE_RESERVED (1 << 8) /* reserved */
+#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */
+#define INTR_TYPE_HARD_EXCEPTION (3 << 8) /* processor exception */
+#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */
+#define INTR_TYPE_PRIV_SW_EXCEPTION (5 << 8) /* ICE breakpoint - undocumented */
+#define INTR_TYPE_SOFT_EXCEPTION (6 << 8) /* software exception */
+#define INTR_TYPE_OTHER_EVENT (7 << 8) /* other event */
+
+/* GUEST_INTERRUPTIBILITY_INFO flags. */
+#define GUEST_INTR_STATE_STI 0x00000001
+#define GUEST_INTR_STATE_MOV_SS 0x00000002
+#define GUEST_INTR_STATE_SMI 0x00000004
+#define GUEST_INTR_STATE_NMI 0x00000008
+
+/* GUEST_ACTIVITY_STATE flags */
+#define GUEST_ACTIVITY_ACTIVE 0
+#define GUEST_ACTIVITY_HLT 1
+#define GUEST_ACTIVITY_SHUTDOWN 2
+#define GUEST_ACTIVITY_WAIT_SIPI 3
+
+/*
+ * Exit Qualifications for MOV for Control Register Access
+ */
+#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg.*/
+#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */
+#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */
+#define LMSW_SOURCE_DATA_SHIFT 16
+#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
+#define REG_EAX (0 << 8)
+#define REG_ECX (1 << 8)
+#define REG_EDX (2 << 8)
+#define REG_EBX (3 << 8)
+#define REG_ESP (4 << 8)
+#define REG_EBP (5 << 8)
+#define REG_ESI (6 << 8)
+#define REG_EDI (7 << 8)
+#define REG_R8 (8 << 8)
+#define REG_R9 (9 << 8)
+#define REG_R10 (10 << 8)
+#define REG_R11 (11 << 8)
+#define REG_R12 (12 << 8)
+#define REG_R13 (13 << 8)
+#define REG_R14 (14 << 8)
+#define REG_R15 (15 << 8)
+
+/*
+ * Exit Qualifications for MOV for Debug Register Access
+ */
+#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */
+#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */
+#define TYPE_MOV_TO_DR (0 << 4)
+#define TYPE_MOV_FROM_DR (1 << 4)
+#define DEBUG_REG_ACCESS_REG(eq) (((eq) >> 8) & 0xf) /* 11:8, general purpose reg. */
+
+
+/*
+ * Exit Qualifications for APIC-Access
+ */
+#define APIC_ACCESS_OFFSET 0xfff /* 11:0, offset within the APIC page */
+#define APIC_ACCESS_TYPE 0xf000 /* 15:12, access type */
+#define TYPE_LINEAR_APIC_INST_READ (0 << 12)
+#define TYPE_LINEAR_APIC_INST_WRITE (1 << 12)
+#define TYPE_LINEAR_APIC_INST_FETCH (2 << 12)
+#define TYPE_LINEAR_APIC_EVENT (3 << 12)
+#define TYPE_PHYSICAL_APIC_EVENT (10 << 12)
+#define TYPE_PHYSICAL_APIC_INST (15 << 12)
+
+/* segment AR in VMCS -- these are different from what LAR reports */
+#define VMX_SEGMENT_AR_L_MASK (1 << 13)
+
+#define VMX_AR_TYPE_ACCESSES_MASK 1
+#define VMX_AR_TYPE_READABLE_MASK (1 << 1)
+#define VMX_AR_TYPE_WRITEABLE_MASK (1 << 2)
+#define VMX_AR_TYPE_CODE_MASK (1 << 3)
+#define VMX_AR_TYPE_MASK 0x0f
+#define VMX_AR_TYPE_BUSY_64_TSS 11
+#define VMX_AR_TYPE_BUSY_32_TSS 11
+#define VMX_AR_TYPE_BUSY_16_TSS 3
+#define VMX_AR_TYPE_LDT 2
+
+#define VMX_AR_UNUSABLE_MASK (1 << 16)
+#define VMX_AR_S_MASK (1 << 4)
+#define VMX_AR_P_MASK (1 << 7)
+#define VMX_AR_L_MASK (1 << 13)
+#define VMX_AR_DB_MASK (1 << 14)
+#define VMX_AR_G_MASK (1 << 15)
+#define VMX_AR_DPL_SHIFT 5
+#define VMX_AR_DPL(ar) (((ar) >> VMX_AR_DPL_SHIFT) & 3)
+
+#define VMX_AR_RESERVD_MASK 0xfffe0f00
+
+#define TSS_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 0)
+#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 1)
+#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 2)
+
+#define VMX_NR_VPIDS (1 << 16)
+#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR 0
+#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1
+#define VMX_VPID_EXTENT_ALL_CONTEXT 2
+#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL 3
+
+#define VMX_EPT_EXTENT_CONTEXT 1
+#define VMX_EPT_EXTENT_GLOBAL 2
+#define VMX_EPT_EXTENT_SHIFT 24
+
+#define VMX_EPT_EXECUTE_ONLY_BIT (1ull)
+#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6)
+#define VMX_EPT_PAGE_WALK_5_BIT (1ull << 7)
+#define VMX_EPTP_UC_BIT (1ull << 8)
+#define VMX_EPTP_WB_BIT (1ull << 14)
+#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
+#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
+#define VMX_EPT_INVEPT_BIT (1ull << 20)
+#define VMX_EPT_AD_BIT (1ull << 21)
+#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
+#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
+
+#define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */
+#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT (1ull << 8) /* (40 - 32) */
+#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */
+#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */
+#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT (1ull << 11) /* (43 - 32) */
+
+#define VMX_EPT_MT_EPTE_SHIFT 3
+#define VMX_EPTP_PWL_MASK 0x38ull
+#define VMX_EPTP_PWL_4 0x18ull
+#define VMX_EPTP_PWL_5 0x20ull
+#define VMX_EPTP_AD_ENABLE_BIT (1ull << 6)
+#define VMX_EPTP_MT_MASK 0x7ull
+#define VMX_EPTP_MT_WB 0x6ull
+#define VMX_EPTP_MT_UC 0x0ull
+#define VMX_EPT_READABLE_MASK 0x1ull
+#define VMX_EPT_WRITABLE_MASK 0x2ull
+#define VMX_EPT_EXECUTABLE_MASK 0x4ull
+#define VMX_EPT_IPAT_BIT (1ull << 6)
+#define VMX_EPT_ACCESS_BIT (1ull << 8)
+#define VMX_EPT_DIRTY_BIT (1ull << 9)
+#define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \
+ VMX_EPT_WRITABLE_MASK | \
+ VMX_EPT_EXECUTABLE_MASK)
+#define VMX_EPT_MT_MASK (7ull << VMX_EPT_MT_EPTE_SHIFT)
+
+/* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */
+#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \
+ VMX_EPT_EXECUTABLE_MASK)
+
+#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
+
+
+#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
+#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
+#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
+#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
+#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
+#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
+#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
+#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
+
+struct vmx_msr_entry {
+ u32 index;
+ u32 reserved;
+ u64 value;
+} __aligned(16);
+
+/*
+ * Exit Qualifications for entry failure during or after loading guest state
+ */
+#define ENTRY_FAIL_DEFAULT 0
+#define ENTRY_FAIL_PDPTE 2
+#define ENTRY_FAIL_NMI 3
+#define ENTRY_FAIL_VMCS_LINK_PTR 4
+
+/*
+ * Exit Qualifications for EPT Violations
+ */
+#define EPT_VIOLATION_ACC_READ_BIT 0
+#define EPT_VIOLATION_ACC_WRITE_BIT 1
+#define EPT_VIOLATION_ACC_INSTR_BIT 2
+#define EPT_VIOLATION_READABLE_BIT 3
+#define EPT_VIOLATION_WRITABLE_BIT 4
+#define EPT_VIOLATION_EXECUTABLE_BIT 5
+#define EPT_VIOLATION_GVA_TRANSLATED_BIT 8
+#define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT)
+#define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT)
+#define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT)
+#define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT)
+#define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT)
+#define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT)
+#define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT)
+
+/*
+ * VM-instruction error numbers
+ */
+enum vm_instruction_error_number {
+ VMXERR_VMCALL_IN_VMX_ROOT_OPERATION = 1,
+ VMXERR_VMCLEAR_INVALID_ADDRESS = 2,
+ VMXERR_VMCLEAR_VMXON_POINTER = 3,
+ VMXERR_VMLAUNCH_NONCLEAR_VMCS = 4,
+ VMXERR_VMRESUME_NONLAUNCHED_VMCS = 5,
+ VMXERR_VMRESUME_AFTER_VMXOFF = 6,
+ VMXERR_ENTRY_INVALID_CONTROL_FIELD = 7,
+ VMXERR_ENTRY_INVALID_HOST_STATE_FIELD = 8,
+ VMXERR_VMPTRLD_INVALID_ADDRESS = 9,
+ VMXERR_VMPTRLD_VMXON_POINTER = 10,
+ VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID = 11,
+ VMXERR_UNSUPPORTED_VMCS_COMPONENT = 12,
+ VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT = 13,
+ VMXERR_VMXON_IN_VMX_ROOT_OPERATION = 15,
+ VMXERR_ENTRY_INVALID_EXECUTIVE_VMCS_POINTER = 16,
+ VMXERR_ENTRY_NONLAUNCHED_EXECUTIVE_VMCS = 17,
+ VMXERR_ENTRY_EXECUTIVE_VMCS_POINTER_NOT_VMXON_POINTER = 18,
+ VMXERR_VMCALL_NONCLEAR_VMCS = 19,
+ VMXERR_VMCALL_INVALID_VM_EXIT_CONTROL_FIELDS = 20,
+ VMXERR_VMCALL_INCORRECT_MSEG_REVISION_ID = 22,
+ VMXERR_VMXOFF_UNDER_DUAL_MONITOR_TREATMENT_OF_SMIS_AND_SMM = 23,
+ VMXERR_VMCALL_INVALID_SMM_MONITOR_FEATURES = 24,
+ VMXERR_ENTRY_INVALID_VM_EXECUTION_CONTROL_FIELDS_IN_EXECUTIVE_VMCS = 25,
+ VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS = 26,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28,
+};
+
+enum vmx_l1d_flush_state {
+ VMENTER_L1D_FLUSH_AUTO,
+ VMENTER_L1D_FLUSH_NEVER,
+ VMENTER_L1D_FLUSH_COND,
+ VMENTER_L1D_FLUSH_ALWAYS,
+ VMENTER_L1D_FLUSH_EPT_DISABLED,
+ VMENTER_L1D_FLUSH_NOT_REQUIRED,
+};
+
+extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
+
+#endif
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
new file mode 100644
index 000000000..b986b2ca6
--- /dev/null
+++ b/arch/x86/include/asm/vsyscall.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_VSYSCALL_H
+#define _ASM_X86_VSYSCALL_H
+
+#include <linux/seqlock.h>
+#include <uapi/asm/vsyscall.h>
+
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+extern void map_vsyscall(void);
+extern void set_vsyscall_pgtable_user_bits(pgd_t *root);
+
+/*
+ * Called on instruction fetch fault in vsyscall page.
+ * Returns true if handled.
+ */
+extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address);
+#else
+static inline void map_vsyscall(void) {}
+static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
+{
+ return false;
+}
+#endif
+
+#endif /* _ASM_X86_VSYSCALL_H */
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
new file mode 100644
index 000000000..3f32dfc2a
--- /dev/null
+++ b/arch/x86/include/asm/vvar.h
@@ -0,0 +1,51 @@
+/*
+ * vvar.h: Shared vDSO/kernel variable declarations
+ * Copyright (c) 2011 Andy Lutomirski
+ * Subject to the GNU General Public License, version 2
+ *
+ * A handful of variables are accessible (read-only) from userspace
+ * code in the vsyscall page and the vdso. They are declared here.
+ * Some other file must define them with DEFINE_VVAR.
+ *
+ * In normal kernel code, they are used like any other variable.
+ * In user code, they are accessed through the VVAR macro.
+ *
+ * These variables live in a page of kernel data that has an extra RO
+ * mapping for userspace. Each variable needs a unique offset within
+ * that page; specify that offset with the DECLARE_VVAR macro. (If
+ * you mess up, the linker will catch it.)
+ */
+
+#ifndef _ASM_X86_VVAR_H
+#define _ASM_X86_VVAR_H
+
+#if defined(__VVAR_KERNEL_LDS)
+
+/* The kernel linker script defines its own magic to put vvars in the
+ * right place.
+ */
+#define DECLARE_VVAR(offset, type, name) \
+ EMIT_VVAR(name, offset)
+
+#else
+
+extern char __vvar_page;
+
+#define DECLARE_VVAR(offset, type, name) \
+ extern type vvar_ ## name __attribute__((visibility("hidden")));
+
+#define VVAR(name) (vvar_ ## name)
+
+#define DEFINE_VVAR(type, name) \
+ type name \
+ __attribute__((section(".vvar_" #name), aligned(16))) __visible
+
+#endif
+
+/* DECLARE_VVAR(offset, type, name) */
+
+DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
+
+#undef DECLARE_VVAR
+
+#endif
diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h
new file mode 100644
index 000000000..06006b035
--- /dev/null
+++ b/arch/x86/include/asm/word-at-a-time.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_WORD_AT_A_TIME_H
+#define _ASM_WORD_AT_A_TIME_H
+
+#include <linux/kernel.h>
+
+/*
+ * This is largely generic for little-endian machines, but the
+ * optimal byte mask counting is probably going to be something
+ * that is architecture-specific. If you have a reliably fast
+ * bit count instruction, that might be better than the multiply
+ * and shift, for example.
+ */
+struct word_at_a_time {
+ const unsigned long one_bits, high_bits;
+};
+
+#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
+
+#ifdef CONFIG_64BIT
+
+/*
+ * Jan Achrenius on G+: microoptimized version of
+ * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
+ * that works for the bytemasks without having to
+ * mask them first.
+ */
+static inline long count_masked_bytes(unsigned long mask)
+{
+ return mask*0x0001020304050608ul >> 56;
+}
+
+#else /* 32-bit case */
+
+/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
+static inline long count_masked_bytes(long mask)
+{
+ /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
+ long a = (0x0ff0001+mask) >> 23;
+ /* Fix the 1 for 00 case */
+ return a & mask;
+}
+
+#endif
+
+/* Return nonzero if it has a zero */
+static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
+{
+ unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits;
+ *bits = mask;
+ return mask;
+}
+
+static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c)
+{
+ return bits;
+}
+
+static inline unsigned long create_zero_mask(unsigned long bits)
+{
+ bits = (bits - 1) & ~bits;
+ return bits >> 7;
+}
+
+/* The mask we created is directly usable as a bytemask */
+#define zero_bytemask(mask) (mask)
+
+static inline unsigned long find_zero(unsigned long mask)
+{
+ return count_masked_bytes(mask);
+}
+
+/*
+ * Load an unaligned word from kernel space.
+ *
+ * In the (very unlikely) case of the word being a page-crosser
+ * and the next page not being mapped, take the exception and
+ * return zeroes in the non-existing part.
+ */
+static inline unsigned long load_unaligned_zeropad(const void *addr)
+{
+ unsigned long ret, dummy;
+
+ asm(
+ "1:\tmov %2,%0\n"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3:\t"
+ "lea %2,%1\n\t"
+ "and %3,%1\n\t"
+ "mov (%1),%0\n\t"
+ "leal %2,%%ecx\n\t"
+ "andl %4,%%ecx\n\t"
+ "shll $3,%%ecx\n\t"
+ "shr %%cl,%0\n\t"
+ "jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ :"=&r" (ret),"=&c" (dummy)
+ :"m" (*(unsigned long *)addr),
+ "i" (-sizeof(unsigned long)),
+ "i" (sizeof(unsigned long)-1));
+ return ret;
+}
+
+#endif /* _ASM_WORD_AT_A_TIME_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
new file mode 100644
index 000000000..b85a7c54c
--- /dev/null
+++ b/arch/x86/include/asm/x86_init.h
@@ -0,0 +1,306 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PLATFORM_H
+#define _ASM_X86_PLATFORM_H
+
+#include <asm/bootparam.h>
+
+struct mpc_bus;
+struct mpc_cpu;
+struct mpc_table;
+struct cpuinfo_x86;
+
+/**
+ * struct x86_init_mpparse - platform specific mpparse ops
+ * @mpc_record: platform specific mpc record accounting
+ * @setup_ioapic_ids: platform specific ioapic id override
+ * @mpc_apic_id: platform specific mpc apic id assignment
+ * @smp_read_mpc_oem: platform specific oem mpc table setup
+ * @mpc_oem_pci_bus: platform specific pci bus setup (default NULL)
+ * @mpc_oem_bus_info: platform specific mpc bus info
+ * @find_smp_config: find the smp configuration
+ * @get_smp_config: get the smp configuration
+ */
+struct x86_init_mpparse {
+ void (*mpc_record)(unsigned int mode);
+ void (*setup_ioapic_ids)(void);
+ int (*mpc_apic_id)(struct mpc_cpu *m);
+ void (*smp_read_mpc_oem)(struct mpc_table *mpc);
+ void (*mpc_oem_pci_bus)(struct mpc_bus *m);
+ void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name);
+ void (*find_smp_config)(void);
+ void (*get_smp_config)(unsigned int early);
+};
+
+/**
+ * struct x86_init_resources - platform specific resource related ops
+ * @probe_roms: probe BIOS roms
+ * @reserve_resources: reserve the standard resources for the
+ * platform
+ * @memory_setup: platform specific memory setup
+ *
+ */
+struct x86_init_resources {
+ void (*probe_roms)(void);
+ void (*reserve_resources)(void);
+ char *(*memory_setup)(void);
+};
+
+/**
+ * struct x86_init_irqs - platform specific interrupt setup
+ * @pre_vector_init: init code to run before interrupt vectors
+ * are set up.
+ * @intr_init: interrupt init code
+ * @trap_init: platform specific trap setup
+ * @intr_mode_init: interrupt delivery mode setup
+ */
+struct x86_init_irqs {
+ void (*pre_vector_init)(void);
+ void (*intr_init)(void);
+ void (*trap_init)(void);
+ void (*intr_mode_init)(void);
+};
+
+/**
+ * struct x86_init_oem - oem platform specific customizing functions
+ * @arch_setup: platform specific architecture setup
+ * @banner: print a platform specific banner
+ */
+struct x86_init_oem {
+ void (*arch_setup)(void);
+ void (*banner)(void);
+};
+
+/**
+ * struct x86_init_paging - platform specific paging functions
+ * @pagetable_init: platform specific paging initialization call to setup
+ * the kernel pagetables and prepare accessors functions.
+ * Callback must call paging_init(). Called once after the
+ * direct mapping for phys memory is available.
+ */
+struct x86_init_paging {
+ void (*pagetable_init)(void);
+};
+
+/**
+ * struct x86_init_timers - platform specific timer setup
+ * @setup_perpcu_clockev: set up the per cpu clock event device for the
+ * boot cpu
+ * @timer_init: initialize the platform timer (default PIT/HPET)
+ * @wallclock_init: init the wallclock device
+ */
+struct x86_init_timers {
+ void (*setup_percpu_clockev)(void);
+ void (*timer_init)(void);
+ void (*wallclock_init)(void);
+};
+
+/**
+ * struct x86_init_iommu - platform specific iommu setup
+ * @iommu_init: platform specific iommu setup
+ */
+struct x86_init_iommu {
+ int (*iommu_init)(void);
+};
+
+/**
+ * struct x86_init_pci - platform specific pci init functions
+ * @arch_init: platform specific pci arch init call
+ * @init: platform specific pci subsystem init
+ * @init_irq: platform specific pci irq init
+ * @fixup_irqs: platform specific pci irq fixup
+ */
+struct x86_init_pci {
+ int (*arch_init)(void);
+ int (*init)(void);
+ void (*init_irq)(void);
+ void (*fixup_irqs)(void);
+};
+
+/**
+ * struct x86_hyper_init - x86 hypervisor init functions
+ * @init_platform: platform setup
+ * @guest_late_init: guest late init
+ * @x2apic_available: X2APIC detection
+ * @init_mem_mapping: setup early mappings during init_mem_mapping()
+ * @init_after_bootmem: guest init after boot allocator is finished
+ */
+struct x86_hyper_init {
+ void (*init_platform)(void);
+ void (*guest_late_init)(void);
+ bool (*x2apic_available)(void);
+ void (*init_mem_mapping)(void);
+ void (*init_after_bootmem)(void);
+};
+
+/**
+ * struct x86_init_acpi - x86 ACPI init functions
+ * @get_root_pointer: get RSDP address
+ * @reduced_hw_early_init: hardware reduced platform early init
+ */
+struct x86_init_acpi {
+ u64 (*get_root_pointer)(void);
+ void (*reduced_hw_early_init)(void);
+};
+
+/**
+ * struct x86_init_ops - functions for platform specific setup
+ *
+ */
+struct x86_init_ops {
+ struct x86_init_resources resources;
+ struct x86_init_mpparse mpparse;
+ struct x86_init_irqs irqs;
+ struct x86_init_oem oem;
+ struct x86_init_paging paging;
+ struct x86_init_timers timers;
+ struct x86_init_iommu iommu;
+ struct x86_init_pci pci;
+ struct x86_hyper_init hyper;
+ struct x86_init_acpi acpi;
+};
+
+/**
+ * struct x86_cpuinit_ops - platform specific cpu hotplug setups
+ * @setup_percpu_clockev: set up the per cpu clock event device
+ * @early_percpu_clock_init: early init of the per cpu clock event device
+ */
+struct x86_cpuinit_ops {
+ void (*setup_percpu_clockev)(void);
+ void (*early_percpu_clock_init)(void);
+ void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node);
+};
+
+struct timespec64;
+
+/**
+ * struct x86_legacy_devices - legacy x86 devices
+ *
+ * @pnpbios: this platform can have a PNPBIOS. If this is disabled the platform
+ * is known to never have a PNPBIOS.
+ *
+ * These are devices known to require LPC or ISA bus. The definition of legacy
+ * devices adheres to the ACPI 5.2.9.3 IA-PC Boot Architecture flag
+ * ACPI_FADT_LEGACY_DEVICES. These devices consist of user visible devices on
+ * the LPC or ISA bus. User visible devices are devices that have end-user
+ * accessible connectors (for example, LPT parallel port). Legacy devices on
+ * the LPC bus consist for example of serial and parallel ports, PS/2 keyboard
+ * / mouse, and the floppy disk controller. A system that lacks all known
+ * legacy devices can assume all devices can be detected exclusively via
+ * standard device enumeration mechanisms including the ACPI namespace.
+ *
+ * A system which has does not have ACPI_FADT_LEGACY_DEVICES enabled must not
+ * have any of the legacy devices enumerated below present.
+ */
+struct x86_legacy_devices {
+ int pnpbios;
+};
+
+/**
+ * enum x86_legacy_i8042_state - i8042 keyboard controller state
+ * @X86_LEGACY_I8042_PLATFORM_ABSENT: the controller is always absent on
+ * given platform/subarch.
+ * @X86_LEGACY_I8042_FIRMWARE_ABSENT: firmware reports that the controller
+ * is absent.
+ * @X86_LEGACY_i8042_EXPECTED_PRESENT: the controller is likely to be
+ * present, the i8042 driver should probe for controller existence.
+ */
+enum x86_legacy_i8042_state {
+ X86_LEGACY_I8042_PLATFORM_ABSENT,
+ X86_LEGACY_I8042_FIRMWARE_ABSENT,
+ X86_LEGACY_I8042_EXPECTED_PRESENT,
+};
+
+/**
+ * struct x86_legacy_features - legacy x86 features
+ *
+ * @i8042: indicated if we expect the device to have i8042 controller
+ * present.
+ * @rtc: this device has a CMOS real-time clock present
+ * @reserve_bios_regions: boot code will search for the EBDA address and the
+ * start of the 640k - 1M BIOS region. If false, the platform must
+ * ensure that its memory map correctly reserves sub-1MB regions as needed.
+ * @devices: legacy x86 devices, refer to struct x86_legacy_devices
+ * documentation for further details.
+ */
+struct x86_legacy_features {
+ enum x86_legacy_i8042_state i8042;
+ int rtc;
+ int warm_reset;
+ int no_vga;
+ int reserve_bios_regions;
+ struct x86_legacy_devices devices;
+};
+
+/**
+ * struct x86_hyper_runtime - x86 hypervisor specific runtime callbacks
+ *
+ * @pin_vcpu: pin current vcpu to specified physical cpu (run rarely)
+ */
+struct x86_hyper_runtime {
+ void (*pin_vcpu)(int cpu);
+};
+
+/**
+ * struct x86_platform_ops - platform specific runtime functions
+ * @calibrate_cpu: calibrate CPU
+ * @calibrate_tsc: calibrate TSC, if different from CPU
+ * @get_wallclock: get time from HW clock like RTC etc.
+ * @set_wallclock: set time back to HW clock
+ * @is_untracked_pat_range exclude from PAT logic
+ * @nmi_init enable NMI on cpus
+ * @save_sched_clock_state: save state for sched_clock() on suspend
+ * @restore_sched_clock_state: restore state for sched_clock() on resume
+ * @apic_post_init: adjust apic if needed
+ * @legacy: legacy features
+ * @set_legacy_features: override legacy features. Use of this callback
+ * is highly discouraged. You should only need
+ * this if your hardware platform requires further
+ * custom fine tuning far beyond what may be
+ * possible in x86_early_init_platform_quirks() by
+ * only using the current x86_hardware_subarch
+ * semantics.
+ * @hyper: x86 hypervisor specific runtime callbacks
+ */
+struct x86_platform_ops {
+ unsigned long (*calibrate_cpu)(void);
+ unsigned long (*calibrate_tsc)(void);
+ void (*get_wallclock)(struct timespec64 *ts);
+ int (*set_wallclock)(const struct timespec64 *ts);
+ void (*iommu_shutdown)(void);
+ bool (*is_untracked_pat_range)(u64 start, u64 end);
+ void (*nmi_init)(void);
+ unsigned char (*get_nmi_reason)(void);
+ void (*save_sched_clock_state)(void);
+ void (*restore_sched_clock_state)(void);
+ void (*apic_post_init)(void);
+ struct x86_legacy_features legacy;
+ void (*set_legacy_features)(void);
+ struct x86_hyper_runtime hyper;
+};
+
+struct pci_dev;
+
+struct x86_msi_ops {
+ int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type);
+ void (*teardown_msi_irq)(unsigned int irq);
+ void (*teardown_msi_irqs)(struct pci_dev *dev);
+ void (*restore_msi_irqs)(struct pci_dev *dev);
+};
+
+struct x86_apic_ops {
+ unsigned int (*io_apic_read) (unsigned int apic, unsigned int reg);
+ void (*restore)(void);
+};
+
+extern struct x86_init_ops x86_init;
+extern struct x86_cpuinit_ops x86_cpuinit;
+extern struct x86_platform_ops x86_platform;
+extern struct x86_msi_ops x86_msi;
+extern struct x86_apic_ops x86_apic_ops;
+
+extern void x86_early_init_platform_quirks(void);
+extern void x86_init_noop(void);
+extern void x86_init_uint_noop(unsigned int unused);
+extern bool x86_pnpbios_disabled(void);
+
+#endif
diff --git a/arch/x86/include/asm/xen/cpuid.h b/arch/x86/include/asm/xen/cpuid.h
new file mode 100644
index 000000000..a9630104f
--- /dev/null
+++ b/arch/x86/include/asm/xen/cpuid.h
@@ -0,0 +1,116 @@
+/******************************************************************************
+ * arch-x86/cpuid.h
+ *
+ * CPUID interface to Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2007 Citrix Systems, Inc.
+ *
+ * Authors:
+ * Keir Fraser <keir@xen.org>
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_CPUID_H__
+#define __XEN_PUBLIC_ARCH_X86_CPUID_H__
+
+/*
+ * For compatibility with other hypervisor interfaces, the Xen cpuid leaves
+ * can be found at the first otherwise unused 0x100 aligned boundary starting
+ * from 0x40000000.
+ *
+ * e.g If viridian extensions are enabled for an HVM domain, the Xen cpuid
+ * leaves will start at 0x40000100
+ */
+
+#define XEN_CPUID_FIRST_LEAF 0x40000000
+#define XEN_CPUID_LEAF(i) (XEN_CPUID_FIRST_LEAF + (i))
+
+/*
+ * Leaf 1 (0x40000x00)
+ * EAX: Largest Xen-information leaf. All leaves up to an including @EAX
+ * are supported by the Xen host.
+ * EBX-EDX: "XenVMMXenVMM" signature, allowing positive identification
+ * of a Xen host.
+ */
+#define XEN_CPUID_SIGNATURE_EBX 0x566e6558 /* "XenV" */
+#define XEN_CPUID_SIGNATURE_ECX 0x65584d4d /* "MMXe" */
+#define XEN_CPUID_SIGNATURE_EDX 0x4d4d566e /* "nVMM" */
+
+/*
+ * Leaf 2 (0x40000x01)
+ * EAX[31:16]: Xen major version.
+ * EAX[15: 0]: Xen minor version.
+ * EBX-EDX: Reserved (currently all zeroes).
+ */
+
+/*
+ * Leaf 3 (0x40000x02)
+ * EAX: Number of hypercall transfer pages. This register is always guaranteed
+ * to specify one hypercall page.
+ * EBX: Base address of Xen-specific MSRs.
+ * ECX: Features 1. Unused bits are set to zero.
+ * EDX: Features 2. Unused bits are set to zero.
+ */
+
+/* Does the host support MMU_PT_UPDATE_PRESERVE_AD for this guest? */
+#define _XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD 0
+#define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD (1u<<0)
+
+/*
+ * Leaf 4 (0x40000x03)
+ * Sub-leaf 0: EAX: bit 0: emulated tsc
+ * bit 1: host tsc is known to be reliable
+ * bit 2: RDTSCP instruction available
+ * EBX: tsc_mode: 0=default (emulate if necessary), 1=emulate,
+ * 2=no emulation, 3=no emulation + TSC_AUX support
+ * ECX: guest tsc frequency in kHz
+ * EDX: guest tsc incarnation (migration count)
+ * Sub-leaf 1: EAX: tsc offset low part
+ * EBX: tsc offset high part
+ * ECX: multiplicator for tsc->ns conversion
+ * EDX: shift amount for tsc->ns conversion
+ * Sub-leaf 2: EAX: host tsc frequency in kHz
+ */
+
+/*
+ * Leaf 5 (0x40000x04)
+ * HVM-specific features
+ * Sub-leaf 0: EAX: Features
+ * Sub-leaf 0: EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag)
+ */
+#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) /* Virtualized APIC registers */
+#define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1) /* Virtualized x2APIC accesses */
+/* Memory mapped from other domains has valid IOMMU entries */
+#define XEN_HVM_CPUID_IOMMU_MAPPINGS (1u << 2)
+#define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3) /* vcpu id is present in EBX */
+
+/*
+ * Leaf 6 (0x40000x05)
+ * PV-specific parameters
+ * Sub-leaf 0: EAX: max available sub-leaf
+ * Sub-leaf 0: EBX: bits 0-7: max machine address width
+ */
+
+/* Max. address width in bits taking memory hotplug into account. */
+#define XEN_CPUID_MACHINE_ADDRESS_WIDTH_MASK (0xffu << 0)
+
+#define XEN_CPUID_MAX_NUM_LEAVES 5
+
+#endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
new file mode 100644
index 000000000..d383140e1
--- /dev/null
+++ b/arch/x86/include/asm/xen/events.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_XEN_EVENTS_H
+#define _ASM_X86_XEN_EVENTS_H
+
+enum ipi_vector {
+ XEN_RESCHEDULE_VECTOR,
+ XEN_CALL_FUNCTION_VECTOR,
+ XEN_CALL_FUNCTION_SINGLE_VECTOR,
+ XEN_SPIN_UNLOCK_VECTOR,
+ XEN_IRQ_WORK_VECTOR,
+ XEN_NMI_VECTOR,
+
+ XEN_NR_IPIS,
+};
+
+static inline int xen_irqs_disabled(struct pt_regs *regs)
+{
+ return raw_irqs_disabled_flags(regs->flags);
+}
+
+/* No need for a barrier -- XCHG is a barrier on x86. */
+#define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
+
+extern int xen_have_vector_callback;
+
+/*
+ * Events delivered via platform PCI interrupts are always
+ * routed to vcpu 0 and hence cannot be rebound.
+ */
+static inline bool xen_support_evtchn_rebind(void)
+{
+ return (!xen_hvm_domain() || xen_have_vector_callback);
+}
+
+#endif /* _ASM_X86_XEN_EVENTS_H */
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
new file mode 100644
index 000000000..6b5c71084
--- /dev/null
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -0,0 +1,526 @@
+/******************************************************************************
+ * hypercall.h
+ *
+ * Linux-specific hypervisor handling.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _ASM_X86_XEN_HYPERCALL_H
+#define _ASM_X86_XEN_HYPERCALL_H
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include <trace/events/xen.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/smap.h>
+#include <asm/nospec-branch.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/sched.h>
+#include <xen/interface/physdev.h>
+#include <xen/interface/platform.h>
+#include <xen/interface/xen-mca.h>
+
+struct xen_dm_op_buf;
+
+/*
+ * The hypercall asms have to meet several constraints:
+ * - Work on 32- and 64-bit.
+ * The two architectures put their arguments in different sets of
+ * registers.
+ *
+ * - Work around asm syntax quirks
+ * It isn't possible to specify one of the rNN registers in a
+ * constraint, so we use explicit register variables to get the
+ * args into the right place.
+ *
+ * - Mark all registers as potentially clobbered
+ * Even unused parameters can be clobbered by the hypervisor, so we
+ * need to make sure gcc knows it.
+ *
+ * - Avoid compiler bugs.
+ * This is the tricky part. Because x86_32 has such a constrained
+ * register set, gcc versions below 4.3 have trouble generating
+ * code when all the arg registers and memory are trashed by the
+ * asm. There are syntactically simpler ways of achieving the
+ * semantics below, but they cause the compiler to crash.
+ *
+ * The only combination I found which works is:
+ * - assign the __argX variables first
+ * - list all actually used parameters as "+r" (__argX)
+ * - clobber the rest
+ *
+ * The result certainly isn't pretty, and it really shows up cpp's
+ * weakness as as macro language. Sorry. (But let's just give thanks
+ * there aren't more than 5 arguments...)
+ */
+
+extern struct { char _entry[32]; } hypercall_page[];
+
+#define __HYPERCALL "call hypercall_page+%c[offset]"
+#define __HYPERCALL_ENTRY(x) \
+ [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0]))
+
+#ifdef CONFIG_X86_32
+#define __HYPERCALL_RETREG "eax"
+#define __HYPERCALL_ARG1REG "ebx"
+#define __HYPERCALL_ARG2REG "ecx"
+#define __HYPERCALL_ARG3REG "edx"
+#define __HYPERCALL_ARG4REG "esi"
+#define __HYPERCALL_ARG5REG "edi"
+#else
+#define __HYPERCALL_RETREG "rax"
+#define __HYPERCALL_ARG1REG "rdi"
+#define __HYPERCALL_ARG2REG "rsi"
+#define __HYPERCALL_ARG3REG "rdx"
+#define __HYPERCALL_ARG4REG "r10"
+#define __HYPERCALL_ARG5REG "r8"
+#endif
+
+#define __HYPERCALL_DECLS \
+ register unsigned long __res asm(__HYPERCALL_RETREG); \
+ register unsigned long __arg1 asm(__HYPERCALL_ARG1REG) = __arg1; \
+ register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \
+ register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \
+ register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \
+ register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5;
+
+#define __HYPERCALL_0PARAM "=r" (__res), ASM_CALL_CONSTRAINT
+#define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1)
+#define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2)
+#define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3)
+#define __HYPERCALL_4PARAM __HYPERCALL_3PARAM, "+r" (__arg4)
+#define __HYPERCALL_5PARAM __HYPERCALL_4PARAM, "+r" (__arg5)
+
+#define __HYPERCALL_0ARG()
+#define __HYPERCALL_1ARG(a1) \
+ __HYPERCALL_0ARG() __arg1 = (unsigned long)(a1);
+#define __HYPERCALL_2ARG(a1,a2) \
+ __HYPERCALL_1ARG(a1) __arg2 = (unsigned long)(a2);
+#define __HYPERCALL_3ARG(a1,a2,a3) \
+ __HYPERCALL_2ARG(a1,a2) __arg3 = (unsigned long)(a3);
+#define __HYPERCALL_4ARG(a1,a2,a3,a4) \
+ __HYPERCALL_3ARG(a1,a2,a3) __arg4 = (unsigned long)(a4);
+#define __HYPERCALL_5ARG(a1,a2,a3,a4,a5) \
+ __HYPERCALL_4ARG(a1,a2,a3,a4) __arg5 = (unsigned long)(a5);
+
+#define __HYPERCALL_CLOBBER5 "memory"
+#define __HYPERCALL_CLOBBER4 __HYPERCALL_CLOBBER5, __HYPERCALL_ARG5REG
+#define __HYPERCALL_CLOBBER3 __HYPERCALL_CLOBBER4, __HYPERCALL_ARG4REG
+#define __HYPERCALL_CLOBBER2 __HYPERCALL_CLOBBER3, __HYPERCALL_ARG3REG
+#define __HYPERCALL_CLOBBER1 __HYPERCALL_CLOBBER2, __HYPERCALL_ARG2REG
+#define __HYPERCALL_CLOBBER0 __HYPERCALL_CLOBBER1, __HYPERCALL_ARG1REG
+
+#define _hypercall0(type, name) \
+({ \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_0ARG(); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_0PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER0); \
+ (type)__res; \
+})
+
+#define _hypercall1(type, name, a1) \
+({ \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_1ARG(a1); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_1PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER1); \
+ (type)__res; \
+})
+
+#define _hypercall2(type, name, a1, a2) \
+({ \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_2ARG(a1, a2); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_2PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER2); \
+ (type)__res; \
+})
+
+#define _hypercall3(type, name, a1, a2, a3) \
+({ \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_3ARG(a1, a2, a3); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_3PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER3); \
+ (type)__res; \
+})
+
+#define _hypercall4(type, name, a1, a2, a3, a4) \
+({ \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_4ARG(a1, a2, a3, a4); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_4PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER4); \
+ (type)__res; \
+})
+
+static inline long
+xen_single_call(unsigned int call,
+ unsigned long a1, unsigned long a2,
+ unsigned long a3, unsigned long a4,
+ unsigned long a5)
+{
+ __HYPERCALL_DECLS;
+ __HYPERCALL_5ARG(a1, a2, a3, a4, a5);
+
+ if (call >= PAGE_SIZE / sizeof(hypercall_page[0]))
+ return -EINVAL;
+
+ asm volatile(CALL_NOSPEC
+ : __HYPERCALL_5PARAM
+ : [thunk_target] "a" (&hypercall_page[call])
+ : __HYPERCALL_CLOBBER5);
+
+ return (long)__res;
+}
+
+static inline long
+privcmd_call(unsigned int call,
+ unsigned long a1, unsigned long a2,
+ unsigned long a3, unsigned long a4,
+ unsigned long a5)
+{
+ long res;
+
+ stac();
+ res = xen_single_call(call, a1, a2, a3, a4, a5);
+ clac();
+
+ return res;
+}
+
+static inline int
+HYPERVISOR_set_trap_table(struct trap_info *table)
+{
+ return _hypercall1(int, set_trap_table, table);
+}
+
+static inline int
+HYPERVISOR_mmu_update(struct mmu_update *req, int count,
+ int *success_count, domid_t domid)
+{
+ return _hypercall4(int, mmu_update, req, count, success_count, domid);
+}
+
+static inline int
+HYPERVISOR_mmuext_op(struct mmuext_op *op, int count,
+ int *success_count, domid_t domid)
+{
+ return _hypercall4(int, mmuext_op, op, count, success_count, domid);
+}
+
+static inline int
+HYPERVISOR_set_gdt(unsigned long *frame_list, int entries)
+{
+ return _hypercall2(int, set_gdt, frame_list, entries);
+}
+
+static inline int
+HYPERVISOR_callback_op(int cmd, void *arg)
+{
+ return _hypercall2(int, callback_op, cmd, arg);
+}
+
+static inline int
+HYPERVISOR_sched_op(int cmd, void *arg)
+{
+ return _hypercall2(int, sched_op, cmd, arg);
+}
+
+static inline long
+HYPERVISOR_set_timer_op(u64 timeout)
+{
+ unsigned long timeout_hi = (unsigned long)(timeout>>32);
+ unsigned long timeout_lo = (unsigned long)timeout;
+ return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
+}
+
+static inline int
+HYPERVISOR_mca(struct xen_mc *mc_op)
+{
+ mc_op->interface_version = XEN_MCA_INTERFACE_VERSION;
+ return _hypercall1(int, mca, mc_op);
+}
+
+static inline int
+HYPERVISOR_platform_op(struct xen_platform_op *op)
+{
+ op->interface_version = XENPF_INTERFACE_VERSION;
+ return _hypercall1(int, platform_op, op);
+}
+
+static inline int
+HYPERVISOR_set_debugreg(int reg, unsigned long value)
+{
+ return _hypercall2(int, set_debugreg, reg, value);
+}
+
+static inline unsigned long
+HYPERVISOR_get_debugreg(int reg)
+{
+ return _hypercall1(unsigned long, get_debugreg, reg);
+}
+
+static inline int
+HYPERVISOR_update_descriptor(u64 ma, u64 desc)
+{
+ if (sizeof(u64) == sizeof(long))
+ return _hypercall2(int, update_descriptor, ma, desc);
+ return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
+}
+
+static inline long
+HYPERVISOR_memory_op(unsigned int cmd, void *arg)
+{
+ return _hypercall2(long, memory_op, cmd, arg);
+}
+
+static inline int
+HYPERVISOR_multicall(void *call_list, uint32_t nr_calls)
+{
+ return _hypercall2(int, multicall, call_list, nr_calls);
+}
+
+static inline int
+HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
+ unsigned long flags)
+{
+ if (sizeof(new_val) == sizeof(long))
+ return _hypercall3(int, update_va_mapping, va,
+ new_val.pte, flags);
+ else
+ return _hypercall4(int, update_va_mapping, va,
+ new_val.pte, new_val.pte >> 32, flags);
+}
+extern int __must_check xen_event_channel_op_compat(int, void *);
+
+static inline int
+HYPERVISOR_event_channel_op(int cmd, void *arg)
+{
+ int rc = _hypercall2(int, event_channel_op, cmd, arg);
+ if (unlikely(rc == -ENOSYS))
+ rc = xen_event_channel_op_compat(cmd, arg);
+ return rc;
+}
+
+static inline int
+HYPERVISOR_xen_version(int cmd, void *arg)
+{
+ return _hypercall2(int, xen_version, cmd, arg);
+}
+
+static inline int
+HYPERVISOR_console_io(int cmd, int count, char *str)
+{
+ return _hypercall3(int, console_io, cmd, count, str);
+}
+
+extern int __must_check xen_physdev_op_compat(int, void *);
+
+static inline int
+HYPERVISOR_physdev_op(int cmd, void *arg)
+{
+ int rc = _hypercall2(int, physdev_op, cmd, arg);
+ if (unlikely(rc == -ENOSYS))
+ rc = xen_physdev_op_compat(cmd, arg);
+ return rc;
+}
+
+static inline int
+HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count)
+{
+ return _hypercall3(int, grant_table_op, cmd, uop, count);
+}
+
+static inline int
+HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type)
+{
+ return _hypercall2(int, vm_assist, cmd, type);
+}
+
+static inline int
+HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args)
+{
+ return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
+}
+
+#ifdef CONFIG_X86_64
+static inline int
+HYPERVISOR_set_segment_base(int reg, unsigned long value)
+{
+ return _hypercall2(int, set_segment_base, reg, value);
+}
+#endif
+
+static inline int
+HYPERVISOR_suspend(unsigned long start_info_mfn)
+{
+ struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
+
+ /*
+ * For a PV guest the tools require that the start_info mfn be
+ * present in rdx/edx when the hypercall is made. Per the
+ * hypercall calling convention this is the third hypercall
+ * argument, which is start_info_mfn here.
+ */
+ return _hypercall3(int, sched_op, SCHEDOP_shutdown, &r, start_info_mfn);
+}
+
+static inline unsigned long __must_check
+HYPERVISOR_hvm_op(int op, void *arg)
+{
+ return _hypercall2(unsigned long, hvm_op, op, arg);
+}
+
+static inline int
+HYPERVISOR_tmem_op(
+ struct tmem_op *op)
+{
+ return _hypercall1(int, tmem_op, op);
+}
+
+static inline int
+HYPERVISOR_xenpmu_op(unsigned int op, void *arg)
+{
+ return _hypercall2(int, xenpmu_op, op, arg);
+}
+
+static inline int
+HYPERVISOR_dm_op(
+ domid_t dom, unsigned int nr_bufs, struct xen_dm_op_buf *bufs)
+{
+ int ret;
+ stac();
+ ret = _hypercall3(int, dm_op, dom, nr_bufs, bufs);
+ clac();
+ return ret;
+}
+
+static inline void
+MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
+{
+ mcl->op = __HYPERVISOR_fpu_taskswitch;
+ mcl->args[0] = set;
+
+ trace_xen_mc_entry(mcl, 1);
+}
+
+static inline void
+MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
+ pte_t new_val, unsigned long flags)
+{
+ mcl->op = __HYPERVISOR_update_va_mapping;
+ mcl->args[0] = va;
+ if (sizeof(new_val) == sizeof(long)) {
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = flags;
+ } else {
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = new_val.pte >> 32;
+ mcl->args[3] = flags;
+ }
+
+ trace_xen_mc_entry(mcl, sizeof(new_val) == sizeof(long) ? 3 : 4);
+}
+
+static inline void
+MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
+ struct desc_struct desc)
+{
+ mcl->op = __HYPERVISOR_update_descriptor;
+ if (sizeof(maddr) == sizeof(long)) {
+ mcl->args[0] = maddr;
+ mcl->args[1] = *(unsigned long *)&desc;
+ } else {
+ u32 *p = (u32 *)&desc;
+
+ mcl->args[0] = maddr;
+ mcl->args[1] = maddr >> 32;
+ mcl->args[2] = *p++;
+ mcl->args[3] = *p;
+ }
+
+ trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 2 : 4);
+}
+
+static inline void
+MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
+ int count, int *success_count, domid_t domid)
+{
+ mcl->op = __HYPERVISOR_mmu_update;
+ mcl->args[0] = (unsigned long)req;
+ mcl->args[1] = count;
+ mcl->args[2] = (unsigned long)success_count;
+ mcl->args[3] = domid;
+
+ trace_xen_mc_entry(mcl, 4);
+}
+
+static inline void
+MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
+ int *success_count, domid_t domid)
+{
+ mcl->op = __HYPERVISOR_mmuext_op;
+ mcl->args[0] = (unsigned long)op;
+ mcl->args[1] = count;
+ mcl->args[2] = (unsigned long)success_count;
+ mcl->args[3] = domid;
+
+ trace_xen_mc_entry(mcl, 4);
+}
+
+static inline void
+MULTI_stack_switch(struct multicall_entry *mcl,
+ unsigned long ss, unsigned long esp)
+{
+ mcl->op = __HYPERVISOR_stack_switch;
+ mcl->args[0] = ss;
+ mcl->args[1] = esp;
+
+ trace_xen_mc_entry(mcl, 2);
+}
+
+#endif /* _ASM_X86_XEN_HYPERCALL_H */
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
new file mode 100644
index 000000000..39171b364
--- /dev/null
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -0,0 +1,67 @@
+/******************************************************************************
+ * hypervisor.h
+ *
+ * Linux-specific hypervisor handling.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _ASM_X86_XEN_HYPERVISOR_H
+#define _ASM_X86_XEN_HYPERVISOR_H
+
+extern struct shared_info *HYPERVISOR_shared_info;
+extern struct start_info *xen_start_info;
+
+#include <asm/processor.h>
+
+static inline uint32_t xen_cpuid_base(void)
+{
+ return hypervisor_cpuid_base("XenVMMXenVMM", 2);
+}
+
+#ifdef CONFIG_XEN
+extern bool xen_hvm_need_lapic(void);
+
+static inline bool xen_x2apic_para_available(void)
+{
+ return xen_hvm_need_lapic();
+}
+#else
+static inline bool xen_x2apic_para_available(void)
+{
+ return (xen_cpuid_base() != 0);
+}
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+void xen_arch_register_cpu(int num);
+void xen_arch_unregister_cpu(int num);
+#endif
+
+extern void xen_set_iopl_mask(unsigned mask);
+
+#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
new file mode 100644
index 000000000..62ca03ef5
--- /dev/null
+++ b/arch/x86/include/asm/xen/interface.h
@@ -0,0 +1,390 @@
+/******************************************************************************
+ * arch-x86_32.h
+ *
+ * Guest OS interface to x86 Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2004-2006, K A Fraser
+ */
+
+#ifndef _ASM_X86_XEN_INTERFACE_H
+#define _ASM_X86_XEN_INTERFACE_H
+
+/*
+ * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field
+ * in a struct in memory.
+ * XEN_GUEST_HANDLE_PARAM represent a guest pointer, when passed as an
+ * hypercall argument.
+ * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but
+ * they might not be on other architectures.
+ */
+#ifdef __XEN__
+#define __DEFINE_GUEST_HANDLE(name, type) \
+ typedef struct { type *p; } __guest_handle_ ## name
+#else
+#define __DEFINE_GUEST_HANDLE(name, type) \
+ typedef type * __guest_handle_ ## name
+#endif
+
+#define DEFINE_GUEST_HANDLE_STRUCT(name) \
+ __DEFINE_GUEST_HANDLE(name, struct name)
+#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
+#define GUEST_HANDLE(name) __guest_handle_ ## name
+
+#ifdef __XEN__
+#if defined(__i386__)
+#define set_xen_guest_handle(hnd, val) \
+ do { \
+ if (sizeof(hnd) == 8) \
+ *(uint64_t *)&(hnd) = 0; \
+ (hnd).p = val; \
+ } while (0)
+#elif defined(__x86_64__)
+#define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0)
+#endif
+#else
+#if defined(__i386__)
+#define set_xen_guest_handle(hnd, val) \
+ do { \
+ if (sizeof(hnd) == 8) \
+ *(uint64_t *)&(hnd) = 0; \
+ (hnd) = val; \
+ } while (0)
+#elif defined(__x86_64__)
+#define set_xen_guest_handle(hnd, val) do { (hnd) = val; } while (0)
+#endif
+#endif
+
+#ifndef __ASSEMBLY__
+/* Explicitly size integers that represent pfns in the public interface
+ * with Xen so that on ARM we can have one ABI that works for 32 and 64
+ * bit guests. */
+typedef unsigned long xen_pfn_t;
+#define PRI_xen_pfn "lx"
+typedef unsigned long xen_ulong_t;
+#define PRI_xen_ulong "lx"
+typedef long xen_long_t;
+#define PRI_xen_long "lx"
+
+/* Guest handles for primitive C types. */
+__DEFINE_GUEST_HANDLE(uchar, unsigned char);
+__DEFINE_GUEST_HANDLE(uint, unsigned int);
+DEFINE_GUEST_HANDLE(char);
+DEFINE_GUEST_HANDLE(int);
+DEFINE_GUEST_HANDLE(void);
+DEFINE_GUEST_HANDLE(uint64_t);
+DEFINE_GUEST_HANDLE(uint32_t);
+DEFINE_GUEST_HANDLE(xen_pfn_t);
+DEFINE_GUEST_HANDLE(xen_ulong_t);
+#endif
+
+#ifndef HYPERVISOR_VIRT_START
+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+#endif
+
+#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
+#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
+#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT)
+
+/* Maximum number of virtual CPUs in multi-processor guests. */
+#define MAX_VIRT_CPUS 32
+
+/*
+ * SEGMENT DESCRIPTOR TABLES
+ */
+/*
+ * A number of GDT entries are reserved by Xen. These are not situated at the
+ * start of the GDT because some stupid OSes export hard-coded selector values
+ * in their ABI. These hard-coded values are always near the start of the GDT,
+ * so Xen places itself out of the way, at the far end of the GDT.
+ *
+ * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op
+ */
+#define FIRST_RESERVED_GDT_PAGE 14
+#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096)
+#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
+
+/*
+ * Send an array of these to HYPERVISOR_set_trap_table().
+ * Terminate the array with a sentinel entry, with traps[].address==0.
+ * The privilege level specifies which modes may enter a trap via a software
+ * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
+ * privilege levels as follows:
+ * Level == 0: No one may enter
+ * Level == 1: Kernel may enter
+ * Level == 2: Kernel may enter
+ * Level == 3: Everyone may enter
+ */
+#define TI_GET_DPL(_ti) ((_ti)->flags & 3)
+#define TI_GET_IF(_ti) ((_ti)->flags & 4)
+#define TI_SET_DPL(_ti, _dpl) ((_ti)->flags |= (_dpl))
+#define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2))
+
+#ifndef __ASSEMBLY__
+struct trap_info {
+ uint8_t vector; /* exception vector */
+ uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */
+ uint16_t cs; /* code selector */
+ unsigned long address; /* code offset */
+};
+DEFINE_GUEST_HANDLE_STRUCT(trap_info);
+
+struct arch_shared_info {
+ /*
+ * Number of valid entries in the p2m table(s) anchored at
+ * pfn_to_mfn_frame_list_list and/or p2m_vaddr.
+ */
+ unsigned long max_pfn;
+ /*
+ * Frame containing list of mfns containing list of mfns containing p2m.
+ * A value of 0 indicates it has not yet been set up, ~0 indicates it
+ * has been set to invalid e.g. due to the p2m being too large for the
+ * 3-level p2m tree. In this case the linear mapper p2m list anchored
+ * at p2m_vaddr is to be used.
+ */
+ xen_pfn_t pfn_to_mfn_frame_list_list;
+ unsigned long nmi_reason;
+ /*
+ * Following three fields are valid if p2m_cr3 contains a value
+ * different from 0.
+ * p2m_cr3 is the root of the address space where p2m_vaddr is valid.
+ * p2m_cr3 is in the same format as a cr3 value in the vcpu register
+ * state and holds the folded machine frame number (via xen_pfn_to_cr3)
+ * of a L3 or L4 page table.
+ * p2m_vaddr holds the virtual address of the linear p2m list. All
+ * entries in the range [0...max_pfn[ are accessible via this pointer.
+ * p2m_generation will be incremented by the guest before and after each
+ * change of the mappings of the p2m list. p2m_generation starts at 0
+ * and a value with the least significant bit set indicates that a
+ * mapping update is in progress. This allows guest external software
+ * (e.g. in Dom0) to verify that read mappings are consistent and
+ * whether they have changed since the last check.
+ * Modifying a p2m element in the linear p2m list is allowed via an
+ * atomic write only.
+ */
+ unsigned long p2m_cr3; /* cr3 value of the p2m address space */
+ unsigned long p2m_vaddr; /* virtual address of the p2m list */
+ unsigned long p2m_generation; /* generation count of p2m mapping */
+};
+#endif /* !__ASSEMBLY__ */
+
+#ifdef CONFIG_X86_32
+#include <asm/xen/interface_32.h>
+#else
+#include <asm/xen/interface_64.h>
+#endif
+
+#include <asm/pvclock-abi.h>
+
+#ifndef __ASSEMBLY__
+/*
+ * The following is all CPU context. Note that the fpu_ctxt block is filled
+ * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
+ *
+ * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise
+ * for HVM and PVH guests, not all information in this structure is updated:
+ *
+ * - For HVM guests, the structures read include: fpu_ctxt (if
+ * VGCT_I387_VALID is set), flags, user_regs, debugreg[*]
+ *
+ * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to
+ * set cr3. All other fields not used should be set to 0.
+ */
+struct vcpu_guest_context {
+ /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
+ struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
+#define VGCF_I387_VALID (1<<0)
+#define VGCF_IN_KERNEL (1<<2)
+#define _VGCF_i387_valid 0
+#define VGCF_i387_valid (1<<_VGCF_i387_valid)
+#define _VGCF_in_kernel 2
+#define VGCF_in_kernel (1<<_VGCF_in_kernel)
+#define _VGCF_failsafe_disables_events 3
+#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events)
+#define _VGCF_syscall_disables_events 4
+#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events)
+#define _VGCF_online 5
+#define VGCF_online (1<<_VGCF_online)
+ unsigned long flags; /* VGCF_* flags */
+ struct cpu_user_regs user_regs; /* User-level CPU registers */
+ struct trap_info trap_ctxt[256]; /* Virtual IDT */
+ unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */
+ unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
+ unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */
+ /* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */
+ unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */
+ unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */
+#ifdef __i386__
+ unsigned long event_callback_cs; /* CS:EIP of event callback */
+ unsigned long event_callback_eip;
+ unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */
+ unsigned long failsafe_callback_eip;
+#else
+ unsigned long event_callback_eip;
+ unsigned long failsafe_callback_eip;
+ unsigned long syscall_callback_eip;
+#endif
+ unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
+#ifdef __x86_64__
+ /* Segment base addresses. */
+ uint64_t fs_base;
+ uint64_t gs_base_kernel;
+ uint64_t gs_base_user;
+#endif
+};
+DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
+
+/* AMD PMU registers and structures */
+struct xen_pmu_amd_ctxt {
+ /*
+ * Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd).
+ * For PV(H) guests these fields are RO.
+ */
+ uint32_t counters;
+ uint32_t ctrls;
+
+ /* Counter MSRs */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+ uint64_t regs[];
+#elif defined(__GNUC__)
+ uint64_t regs[0];
+#endif
+};
+
+/* Intel PMU registers and structures */
+struct xen_pmu_cntr_pair {
+ uint64_t counter;
+ uint64_t control;
+};
+
+struct xen_pmu_intel_ctxt {
+ /*
+ * Offsets to fixed and architectural counter MSRs (relative to
+ * xen_pmu_arch.c.intel).
+ * For PV(H) guests these fields are RO.
+ */
+ uint32_t fixed_counters;
+ uint32_t arch_counters;
+
+ /* PMU registers */
+ uint64_t global_ctrl;
+ uint64_t global_ovf_ctrl;
+ uint64_t global_status;
+ uint64_t fixed_ctrl;
+ uint64_t ds_area;
+ uint64_t pebs_enable;
+ uint64_t debugctl;
+
+ /* Fixed and architectural counter MSRs */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+ uint64_t regs[];
+#elif defined(__GNUC__)
+ uint64_t regs[0];
+#endif
+};
+
+/* Sampled domain's registers */
+struct xen_pmu_regs {
+ uint64_t ip;
+ uint64_t sp;
+ uint64_t flags;
+ uint16_t cs;
+ uint16_t ss;
+ uint8_t cpl;
+ uint8_t pad[3];
+};
+
+/* PMU flags */
+#define PMU_CACHED (1<<0) /* PMU MSRs are cached in the context */
+#define PMU_SAMPLE_USER (1<<1) /* Sample is from user or kernel mode */
+#define PMU_SAMPLE_REAL (1<<2) /* Sample is from realmode */
+#define PMU_SAMPLE_PV (1<<3) /* Sample from a PV guest */
+
+/*
+ * Architecture-specific information describing state of the processor at
+ * the time of PMU interrupt.
+ * Fields of this structure marked as RW for guest should only be written by
+ * the guest when PMU_CACHED bit in pmu_flags is set (which is done by the
+ * hypervisor during PMU interrupt). Hypervisor will read updated data in
+ * XENPMU_flush hypercall and clear PMU_CACHED bit.
+ */
+struct xen_pmu_arch {
+ union {
+ /*
+ * Processor's registers at the time of interrupt.
+ * WO for hypervisor, RO for guests.
+ */
+ struct xen_pmu_regs regs;
+ /*
+ * Padding for adding new registers to xen_pmu_regs in
+ * the future
+ */
+#define XENPMU_REGS_PAD_SZ 64
+ uint8_t pad[XENPMU_REGS_PAD_SZ];
+ } r;
+
+ /* WO for hypervisor, RO for guest */
+ uint64_t pmu_flags;
+
+ /*
+ * APIC LVTPC register.
+ * RW for both hypervisor and guest.
+ * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware
+ * during XENPMU_flush or XENPMU_lvtpc_set.
+ */
+ union {
+ uint32_t lapic_lvtpc;
+ uint64_t pad;
+ } l;
+
+ /*
+ * Vendor-specific PMU registers.
+ * RW for both hypervisor and guest (see exceptions above).
+ * Guest's updates to this field are verified and then loaded by the
+ * hypervisor into hardware during XENPMU_flush
+ */
+ union {
+ struct xen_pmu_amd_ctxt amd;
+ struct xen_pmu_intel_ctxt intel;
+
+ /*
+ * Padding for contexts (fixed parts only, does not include
+ * MSR banks that are specified by offsets)
+ */
+#define XENPMU_CTXT_PAD_SZ 128
+ uint8_t pad[XENPMU_CTXT_PAD_SZ];
+ } c;
+};
+
+#endif /* !__ASSEMBLY__ */
+
+/*
+ * Prefix forces emulation of some non-trapping instructions.
+ * Currently only CPUID.
+ */
+#ifdef __ASSEMBLY__
+#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
+#define XEN_CPUID XEN_EMULATE_PREFIX cpuid
+#else
+#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
+#define XEN_CPUID XEN_EMULATE_PREFIX "cpuid"
+#endif
+
+#endif /* _ASM_X86_XEN_INTERFACE_H */
diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h
new file mode 100644
index 000000000..dc40578ab
--- /dev/null
+++ b/arch/x86/include/asm/xen/interface_32.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/******************************************************************************
+ * arch-x86_32.h
+ *
+ * Guest OS interface to x86 32-bit Xen.
+ *
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#ifndef _ASM_X86_XEN_INTERFACE_32_H
+#define _ASM_X86_XEN_INTERFACE_32_H
+
+
+/*
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+#define FLAT_RING1_CS 0xe019 /* GDT index 259 */
+#define FLAT_RING1_DS 0xe021 /* GDT index 260 */
+#define FLAT_RING1_SS 0xe021 /* GDT index 260 */
+#define FLAT_RING3_CS 0xe02b /* GDT index 261 */
+#define FLAT_RING3_DS 0xe033 /* GDT index 262 */
+#define FLAT_RING3_SS 0xe033 /* GDT index 262 */
+
+#define FLAT_KERNEL_CS FLAT_RING1_CS
+#define FLAT_KERNEL_DS FLAT_RING1_DS
+#define FLAT_KERNEL_SS FLAT_RING1_SS
+#define FLAT_USER_CS FLAT_RING3_CS
+#define FLAT_USER_DS FLAT_RING3_DS
+#define FLAT_USER_SS FLAT_RING3_SS
+
+/* And the trap vector is... */
+#define TRAP_INSTR "int $0x82"
+
+#define __MACH2PHYS_VIRT_START 0xF5800000
+#define __MACH2PHYS_VIRT_END 0xF6800000
+
+#define __MACH2PHYS_SHIFT 2
+
+/*
+ * Virtual addresses beyond this are not modifiable by guest OSes. The
+ * machine->physical mapping table starts at this address, read-only.
+ */
+#define __HYPERVISOR_VIRT_START 0xF5800000
+
+#ifndef __ASSEMBLY__
+
+struct cpu_user_regs {
+ uint32_t ebx;
+ uint32_t ecx;
+ uint32_t edx;
+ uint32_t esi;
+ uint32_t edi;
+ uint32_t ebp;
+ uint32_t eax;
+ uint16_t error_code; /* private */
+ uint16_t entry_vector; /* private */
+ uint32_t eip;
+ uint16_t cs;
+ uint8_t saved_upcall_mask;
+ uint8_t _pad0;
+ uint32_t eflags; /* eflags.IF == !saved_upcall_mask */
+ uint32_t esp;
+ uint16_t ss, _pad1;
+ uint16_t es, _pad2;
+ uint16_t ds, _pad3;
+ uint16_t fs, _pad4;
+ uint16_t gs, _pad5;
+};
+DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+
+typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
+
+struct arch_vcpu_info {
+ unsigned long cr2;
+ unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
+};
+
+struct xen_callback {
+ unsigned long cs;
+ unsigned long eip;
+};
+typedef struct xen_callback xen_callback_t;
+
+#define XEN_CALLBACK(__cs, __eip) \
+ ((struct xen_callback){ .cs = (__cs), .eip = (unsigned long)(__eip) })
+#endif /* !__ASSEMBLY__ */
+
+
+/*
+ * Page-directory addresses above 4GB do not fit into architectural %cr3.
+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
+ * must use the following accessor macros to pack/unpack valid MFNs.
+ *
+ * Note that Xen is using the fact that the pagetable base is always
+ * page-aligned, and putting the 12 MSB of the address into the 12 LSB
+ * of cr3.
+ */
+#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
+#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
+
+#endif /* _ASM_X86_XEN_INTERFACE_32_H */
diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h
new file mode 100644
index 000000000..c599ec269
--- /dev/null
+++ b/arch/x86/include/asm/xen/interface_64.h
@@ -0,0 +1,149 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_XEN_INTERFACE_64_H
+#define _ASM_X86_XEN_INTERFACE_64_H
+
+/*
+ * 64-bit segment selectors
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+
+#define FLAT_RING3_CS32 0xe023 /* GDT index 260 */
+#define FLAT_RING3_CS64 0xe033 /* GDT index 261 */
+#define FLAT_RING3_DS32 0xe02b /* GDT index 262 */
+#define FLAT_RING3_DS64 0x0000 /* NULL selector */
+#define FLAT_RING3_SS32 0xe02b /* GDT index 262 */
+#define FLAT_RING3_SS64 0xe02b /* GDT index 262 */
+
+#define FLAT_KERNEL_DS64 FLAT_RING3_DS64
+#define FLAT_KERNEL_DS32 FLAT_RING3_DS32
+#define FLAT_KERNEL_DS FLAT_KERNEL_DS64
+#define FLAT_KERNEL_CS64 FLAT_RING3_CS64
+#define FLAT_KERNEL_CS32 FLAT_RING3_CS32
+#define FLAT_KERNEL_CS FLAT_KERNEL_CS64
+#define FLAT_KERNEL_SS64 FLAT_RING3_SS64
+#define FLAT_KERNEL_SS32 FLAT_RING3_SS32
+#define FLAT_KERNEL_SS FLAT_KERNEL_SS64
+
+#define FLAT_USER_DS64 FLAT_RING3_DS64
+#define FLAT_USER_DS32 FLAT_RING3_DS32
+#define FLAT_USER_DS FLAT_USER_DS64
+#define FLAT_USER_CS64 FLAT_RING3_CS64
+#define FLAT_USER_CS32 FLAT_RING3_CS32
+#define FLAT_USER_CS FLAT_USER_CS64
+#define FLAT_USER_SS64 FLAT_RING3_SS64
+#define FLAT_USER_SS32 FLAT_RING3_SS32
+#define FLAT_USER_SS FLAT_USER_SS64
+
+#define __HYPERVISOR_VIRT_START 0xFFFF800000000000
+#define __HYPERVISOR_VIRT_END 0xFFFF880000000000
+#define __MACH2PHYS_VIRT_START 0xFFFF800000000000
+#define __MACH2PHYS_VIRT_END 0xFFFF804000000000
+#define __MACH2PHYS_SHIFT 3
+
+/*
+ * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
+ * @which == SEGBASE_* ; @base == 64-bit base address
+ * Returns 0 on success.
+ */
+#define SEGBASE_FS 0
+#define SEGBASE_GS_USER 1
+#define SEGBASE_GS_KERNEL 2
+#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
+
+/*
+ * int HYPERVISOR_iret(void)
+ * All arguments are on the kernel stack, in the following format.
+ * Never returns if successful. Current kernel context is lost.
+ * The saved CS is mapped as follows:
+ * RING0 -> RING3 kernel mode.
+ * RING1 -> RING3 kernel mode.
+ * RING2 -> RING3 kernel mode.
+ * RING3 -> RING3 user mode.
+ * However RING0 indicates that the guest kernel should return to iteself
+ * directly with
+ * orb $3,1*8(%rsp)
+ * iretq
+ * If flags contains VGCF_in_syscall:
+ * Restore RAX, RIP, RFLAGS, RSP.
+ * Discard R11, RCX, CS, SS.
+ * Otherwise:
+ * Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
+ * All other registers are saved on hypercall entry and restored to user.
+ */
+/* Guest exited in SYSCALL context? Return to guest with SYSRET? */
+#define _VGCF_in_syscall 8
+#define VGCF_in_syscall (1<<_VGCF_in_syscall)
+#define VGCF_IN_SYSCALL VGCF_in_syscall
+
+#ifndef __ASSEMBLY__
+
+struct iret_context {
+ /* Top of stack (%rsp at point of hypercall). */
+ uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
+ /* Bottom of iret stack frame. */
+};
+
+#if defined(__GNUC__) && !defined(__STRICT_ANSI__)
+/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */
+#define __DECL_REG(name) union { \
+ uint64_t r ## name, e ## name; \
+ uint32_t _e ## name; \
+}
+#else
+/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */
+#define __DECL_REG(name) uint64_t r ## name
+#endif
+
+struct cpu_user_regs {
+ uint64_t r15;
+ uint64_t r14;
+ uint64_t r13;
+ uint64_t r12;
+ __DECL_REG(bp);
+ __DECL_REG(bx);
+ uint64_t r11;
+ uint64_t r10;
+ uint64_t r9;
+ uint64_t r8;
+ __DECL_REG(ax);
+ __DECL_REG(cx);
+ __DECL_REG(dx);
+ __DECL_REG(si);
+ __DECL_REG(di);
+ uint32_t error_code; /* private */
+ uint32_t entry_vector; /* private */
+ __DECL_REG(ip);
+ uint16_t cs, _pad0[1];
+ uint8_t saved_upcall_mask;
+ uint8_t _pad1[3];
+ __DECL_REG(flags); /* rflags.IF == !saved_upcall_mask */
+ __DECL_REG(sp);
+ uint16_t ss, _pad2[3];
+ uint16_t es, _pad3[3];
+ uint16_t ds, _pad4[3];
+ uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */
+ uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
+};
+DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+
+#undef __DECL_REG
+
+#define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12)
+#define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12)
+
+struct arch_vcpu_info {
+ unsigned long cr2;
+ unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
+};
+
+typedef unsigned long xen_callback_t;
+
+#define XEN_CALLBACK(__cs, __rip) \
+ ((unsigned long)(__rip))
+
+#endif /* !__ASSEMBLY__ */
+
+
+#endif /* _ASM_X86_XEN_INTERFACE_64_H */
diff --git a/arch/x86/include/asm/xen/page-coherent.h b/arch/x86/include/asm/xen/page-coherent.h
new file mode 100644
index 000000000..116777e7f
--- /dev/null
+++ b/arch/x86/include/asm/xen/page-coherent.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_XEN_PAGE_COHERENT_H
+#define _ASM_X86_XEN_PAGE_COHERENT_H
+
+#include <asm/page.h>
+#include <linux/dma-mapping.h>
+
+static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags,
+ unsigned long attrs)
+{
+ void *vstart = (void*)__get_free_pages(flags, get_order(size));
+ *dma_handle = virt_to_phys(vstart);
+ return vstart;
+}
+
+static inline void xen_free_coherent_pages(struct device *hwdev, size_t size,
+ void *cpu_addr, dma_addr_t dma_handle,
+ unsigned long attrs)
+{
+ free_pages((unsigned long) cpu_addr, get_order(size));
+}
+
+static inline void xen_dma_map_page(struct device *hwdev, struct page *page,
+ dma_addr_t dev_addr, unsigned long offset, size_t size,
+ enum dma_data_direction dir, unsigned long attrs) { }
+
+static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs) { }
+
+static inline void xen_dma_sync_single_for_cpu(struct device *hwdev,
+ dma_addr_t handle, size_t size, enum dma_data_direction dir) { }
+
+static inline void xen_dma_sync_single_for_device(struct device *hwdev,
+ dma_addr_t handle, size_t size, enum dma_data_direction dir) { }
+
+#endif /* _ASM_X86_XEN_PAGE_COHERENT_H */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
new file mode 100644
index 000000000..123e669bf
--- /dev/null
+++ b/arch/x86/include/asm/xen/page.h
@@ -0,0 +1,347 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_XEN_PAGE_H
+#define _ASM_X86_XEN_PAGE_H
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/pfn.h>
+#include <linux/mm.h>
+#include <linux/device.h>
+
+#include <linux/uaccess.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/grant_table.h>
+#include <xen/features.h>
+
+/* Xen machine address */
+typedef struct xmaddr {
+ phys_addr_t maddr;
+} xmaddr_t;
+
+/* Xen pseudo-physical address */
+typedef struct xpaddr {
+ phys_addr_t paddr;
+} xpaddr_t;
+
+#ifdef CONFIG_X86_64
+#define XEN_PHYSICAL_MASK __sme_clr((1UL << 52) - 1)
+#else
+#define XEN_PHYSICAL_MASK __PHYSICAL_MASK
+#endif
+
+#define XEN_PTE_MFN_MASK ((pteval_t)(((signed long)PAGE_MASK) & \
+ XEN_PHYSICAL_MASK))
+
+#define XMADDR(x) ((xmaddr_t) { .maddr = (x) })
+#define XPADDR(x) ((xpaddr_t) { .paddr = (x) })
+
+/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
+#define INVALID_P2M_ENTRY (~0UL)
+#define FOREIGN_FRAME_BIT (1UL<<(BITS_PER_LONG-1))
+#define IDENTITY_FRAME_BIT (1UL<<(BITS_PER_LONG-2))
+#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
+#define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT)
+
+#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
+
+extern unsigned long *machine_to_phys_mapping;
+extern unsigned long machine_to_phys_nr;
+extern unsigned long *xen_p2m_addr;
+extern unsigned long xen_p2m_size;
+extern unsigned long xen_max_p2m_pfn;
+
+extern int xen_alloc_p2m_entry(unsigned long pfn);
+
+extern unsigned long get_phys_to_machine(unsigned long pfn);
+extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
+ unsigned long pfn_e);
+
+#ifdef CONFIG_XEN_PV
+extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
+ struct gnttab_map_grant_ref *kmap_ops,
+ struct page **pages, unsigned int count);
+extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
+ struct gnttab_unmap_grant_ref *kunmap_ops,
+ struct page **pages, unsigned int count);
+#else
+static inline int
+set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
+ struct gnttab_map_grant_ref *kmap_ops,
+ struct page **pages, unsigned int count)
+{
+ return 0;
+}
+
+static inline int
+clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
+ struct gnttab_unmap_grant_ref *kunmap_ops,
+ struct page **pages, unsigned int count)
+{
+ return 0;
+}
+#endif
+
+/*
+ * Helper functions to write or read unsigned long values to/from
+ * memory, when the access may fault.
+ */
+static inline int xen_safe_write_ulong(unsigned long *addr, unsigned long val)
+{
+ return __put_user(val, (unsigned long __user *)addr);
+}
+
+static inline int xen_safe_read_ulong(unsigned long *addr, unsigned long *val)
+{
+ return __get_user(*val, (unsigned long __user *)addr);
+}
+
+#ifdef CONFIG_XEN_PV
+/*
+ * When to use pfn_to_mfn(), __pfn_to_mfn() or get_phys_to_machine():
+ * - pfn_to_mfn() returns either INVALID_P2M_ENTRY or the mfn. No indicator
+ * bits (identity or foreign) are set.
+ * - __pfn_to_mfn() returns the found entry of the p2m table. A possibly set
+ * identity or foreign indicator will be still set. __pfn_to_mfn() is
+ * encapsulating get_phys_to_machine() which is called in special cases only.
+ * - get_phys_to_machine() is to be called by __pfn_to_mfn() only in special
+ * cases needing an extended handling.
+ */
+static inline unsigned long __pfn_to_mfn(unsigned long pfn)
+{
+ unsigned long mfn;
+
+ if (pfn < xen_p2m_size)
+ mfn = xen_p2m_addr[pfn];
+ else if (unlikely(pfn < xen_max_p2m_pfn))
+ return get_phys_to_machine(pfn);
+ else
+ return IDENTITY_FRAME(pfn);
+
+ if (unlikely(mfn == INVALID_P2M_ENTRY))
+ return get_phys_to_machine(pfn);
+
+ return mfn;
+}
+#else
+static inline unsigned long __pfn_to_mfn(unsigned long pfn)
+{
+ return pfn;
+}
+#endif
+
+static inline unsigned long pfn_to_mfn(unsigned long pfn)
+{
+ unsigned long mfn;
+
+ /*
+ * Some x86 code are still using pfn_to_mfn instead of
+ * pfn_to_mfn. This will have to be removed when we figured
+ * out which call.
+ */
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return pfn;
+
+ mfn = __pfn_to_mfn(pfn);
+
+ if (mfn != INVALID_P2M_ENTRY)
+ mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
+
+ return mfn;
+}
+
+static inline int phys_to_machine_mapping_valid(unsigned long pfn)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 1;
+
+ return __pfn_to_mfn(pfn) != INVALID_P2M_ENTRY;
+}
+
+static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn)
+{
+ unsigned long pfn;
+ int ret;
+
+ if (unlikely(mfn >= machine_to_phys_nr))
+ return ~0;
+
+ /*
+ * The array access can fail (e.g., device space beyond end of RAM).
+ * In such cases it doesn't matter what we return (we return garbage),
+ * but we must handle the fault without crashing!
+ */
+ ret = xen_safe_read_ulong(&machine_to_phys_mapping[mfn], &pfn);
+ if (ret < 0)
+ return ~0;
+
+ return pfn;
+}
+
+static inline unsigned long mfn_to_pfn(unsigned long mfn)
+{
+ unsigned long pfn;
+
+ /*
+ * Some x86 code are still using mfn_to_pfn instead of
+ * gfn_to_pfn. This will have to be removed when we figure
+ * out which call.
+ */
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return mfn;
+
+ pfn = mfn_to_pfn_no_overrides(mfn);
+ if (__pfn_to_mfn(pfn) != mfn)
+ pfn = ~0;
+
+ /*
+ * pfn is ~0 if there are no entries in the m2p for mfn or the
+ * entry doesn't map back to the mfn.
+ */
+ if (pfn == ~0 && __pfn_to_mfn(mfn) == IDENTITY_FRAME(mfn))
+ pfn = mfn;
+
+ return pfn;
+}
+
+static inline xmaddr_t phys_to_machine(xpaddr_t phys)
+{
+ unsigned offset = phys.paddr & ~PAGE_MASK;
+ return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset);
+}
+
+static inline xpaddr_t machine_to_phys(xmaddr_t machine)
+{
+ unsigned offset = machine.maddr & ~PAGE_MASK;
+ return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
+}
+
+/* Pseudo-physical <-> Guest conversion */
+static inline unsigned long pfn_to_gfn(unsigned long pfn)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return pfn;
+ else
+ return pfn_to_mfn(pfn);
+}
+
+static inline unsigned long gfn_to_pfn(unsigned long gfn)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return gfn;
+ else
+ return mfn_to_pfn(gfn);
+}
+
+/* Pseudo-physical <-> Bus conversion */
+#define pfn_to_bfn(pfn) pfn_to_gfn(pfn)
+#define bfn_to_pfn(bfn) gfn_to_pfn(bfn)
+
+/*
+ * We detect special mappings in one of two ways:
+ * 1. If the MFN is an I/O page then Xen will set the m2p entry
+ * to be outside our maximum possible pseudophys range.
+ * 2. If the MFN belongs to a different domain then we will certainly
+ * not have MFN in our p2m table. Conversely, if the page is ours,
+ * then we'll have p2m(m2p(MFN))==MFN.
+ * If we detect a special mapping then it doesn't have a 'struct page'.
+ * We force !pfn_valid() by returning an out-of-range pointer.
+ *
+ * NB. These checks require that, for any MFN that is not in our reservation,
+ * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
+ * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN.
+ * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
+ *
+ * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
+ * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
+ * require. In all the cases we care about, the FOREIGN_FRAME bit is
+ * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
+ */
+static inline unsigned long bfn_to_local_pfn(unsigned long mfn)
+{
+ unsigned long pfn;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return mfn;
+
+ pfn = mfn_to_pfn(mfn);
+ if (__pfn_to_mfn(pfn) != mfn)
+ return -1; /* force !pfn_valid() */
+ return pfn;
+}
+
+/* VIRT <-> MACHINE conversion */
+#define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v))))
+#define virt_to_pfn(v) (PFN_DOWN(__pa(v)))
+#define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v)))
+#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
+
+/* VIRT <-> GUEST conversion */
+#define virt_to_gfn(v) (pfn_to_gfn(virt_to_pfn(v)))
+#define gfn_to_virt(g) (__va(gfn_to_pfn(g) << PAGE_SHIFT))
+
+static inline unsigned long pte_mfn(pte_t pte)
+{
+ return (pte.pte & XEN_PTE_MFN_MASK) >> PAGE_SHIFT;
+}
+
+static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
+{
+ pte_t pte;
+
+ pte.pte = ((phys_addr_t)page_nr << PAGE_SHIFT) |
+ massage_pgprot(pgprot);
+
+ return pte;
+}
+
+static inline pteval_t pte_val_ma(pte_t pte)
+{
+ return pte.pte;
+}
+
+static inline pte_t __pte_ma(pteval_t x)
+{
+ return (pte_t) { .pte = x };
+}
+
+#define pmd_val_ma(v) ((v).pmd)
+#ifdef __PAGETABLE_PUD_FOLDED
+#define pud_val_ma(v) ((v).p4d.pgd.pgd)
+#else
+#define pud_val_ma(v) ((v).pud)
+#endif
+#define __pmd_ma(x) ((pmd_t) { (x) } )
+
+#ifdef __PAGETABLE_P4D_FOLDED
+#define p4d_val_ma(x) ((x).pgd.pgd)
+#else
+#define p4d_val_ma(x) ((x).p4d)
+#endif
+
+xmaddr_t arbitrary_virt_to_machine(void *address);
+unsigned long arbitrary_virt_to_mfn(void *vaddr);
+void make_lowmem_page_readonly(void *vaddr);
+void make_lowmem_page_readwrite(void *vaddr);
+
+#define xen_remap(cookie, size) ioremap((cookie), (size));
+#define xen_unmap(cookie) iounmap((cookie))
+
+static inline bool xen_arch_need_swiotlb(struct device *dev,
+ phys_addr_t phys,
+ dma_addr_t dev_addr)
+{
+ return false;
+}
+
+static inline unsigned long xen_get_swiotlb_free_pages(unsigned int order)
+{
+ return __get_free_pages(__GFP_NOWARN, order);
+}
+
+#endif /* _ASM_X86_XEN_PAGE_H */
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
new file mode 100644
index 000000000..3506d8c59
--- /dev/null
+++ b/arch/x86/include/asm/xen/pci.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_XEN_PCI_H
+#define _ASM_X86_XEN_PCI_H
+
+#if defined(CONFIG_PCI_XEN)
+extern int __init pci_xen_init(void);
+extern int __init pci_xen_hvm_init(void);
+#define pci_xen 1
+#else
+#define pci_xen 0
+#define pci_xen_init (0)
+static inline int pci_xen_hvm_init(void)
+{
+ return -1;
+}
+#endif
+#if defined(CONFIG_XEN_DOM0)
+int __init pci_xen_initial_domain(void);
+int xen_find_device_domain_owner(struct pci_dev *dev);
+int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain);
+int xen_unregister_device_domain_owner(struct pci_dev *dev);
+#else
+static inline int __init pci_xen_initial_domain(void)
+{
+ return -1;
+}
+static inline int xen_find_device_domain_owner(struct pci_dev *dev)
+{
+ return -1;
+}
+static inline int xen_register_device_domain_owner(struct pci_dev *dev,
+ uint16_t domain)
+{
+ return -1;
+}
+static inline int xen_unregister_device_domain_owner(struct pci_dev *dev)
+{
+ return -1;
+}
+#endif
+
+#if defined(CONFIG_PCI_MSI)
+#if defined(CONFIG_PCI_XEN)
+/* The drivers/pci/xen-pcifront.c sets this structure to
+ * its own functions.
+ */
+struct xen_pci_frontend_ops {
+ int (*enable_msi)(struct pci_dev *dev, int vectors[]);
+ void (*disable_msi)(struct pci_dev *dev);
+ int (*enable_msix)(struct pci_dev *dev, int vectors[], int nvec);
+ void (*disable_msix)(struct pci_dev *dev);
+};
+
+extern struct xen_pci_frontend_ops *xen_pci_frontend;
+
+static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev,
+ int vectors[])
+{
+ if (xen_pci_frontend && xen_pci_frontend->enable_msi)
+ return xen_pci_frontend->enable_msi(dev, vectors);
+ return -ENOSYS;
+}
+static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev)
+{
+ if (xen_pci_frontend && xen_pci_frontend->disable_msi)
+ xen_pci_frontend->disable_msi(dev);
+}
+static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev,
+ int vectors[], int nvec)
+{
+ if (xen_pci_frontend && xen_pci_frontend->enable_msix)
+ return xen_pci_frontend->enable_msix(dev, vectors, nvec);
+ return -ENOSYS;
+}
+static inline void xen_pci_frontend_disable_msix(struct pci_dev *dev)
+{
+ if (xen_pci_frontend && xen_pci_frontend->disable_msix)
+ xen_pci_frontend->disable_msix(dev);
+}
+#endif /* CONFIG_PCI_XEN */
+#endif /* CONFIG_PCI_MSI */
+
+#endif /* _ASM_X86_XEN_PCI_H */
diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h
new file mode 100644
index 000000000..6b56d0d45
--- /dev/null
+++ b/arch/x86/include/asm/xen/swiotlb-xen.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SWIOTLB_XEN_H
+#define _ASM_X86_SWIOTLB_XEN_H
+
+#ifdef CONFIG_SWIOTLB_XEN
+extern int xen_swiotlb;
+extern int __init pci_xen_swiotlb_detect(void);
+extern void __init pci_xen_swiotlb_init(void);
+extern int pci_xen_swiotlb_init_late(void);
+#else
+#define xen_swiotlb (0)
+static inline int __init pci_xen_swiotlb_detect(void) { return 0; }
+static inline void __init pci_xen_swiotlb_init(void) { }
+static inline int pci_xen_swiotlb_init_late(void) { return -ENXIO; }
+#endif
+
+#endif /* _ASM_X86_SWIOTLB_XEN_H */
diff --git a/arch/x86/include/asm/xen/trace_types.h b/arch/x86/include/asm/xen/trace_types.h
new file mode 100644
index 000000000..2aad0abd6
--- /dev/null
+++ b/arch/x86/include/asm/xen/trace_types.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_XEN_TRACE_TYPES_H
+#define _ASM_XEN_TRACE_TYPES_H
+
+enum xen_mc_flush_reason {
+ XEN_MC_FL_NONE, /* explicit flush */
+ XEN_MC_FL_BATCH, /* out of hypercall space */
+ XEN_MC_FL_ARGS, /* out of argument space */
+ XEN_MC_FL_CALLBACK, /* out of callback space */
+};
+
+enum xen_mc_extend_args {
+ XEN_MC_XE_OK,
+ XEN_MC_XE_BAD_OP,
+ XEN_MC_XE_NO_SPACE
+};
+typedef void (*xen_mc_callback_fn_t)(void *);
+
+#endif /* _ASM_XEN_TRACE_TYPES_H */
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
new file mode 100644
index 000000000..45c860546
--- /dev/null
+++ b/arch/x86/include/asm/xor.h
@@ -0,0 +1,496 @@
+#ifndef _ASM_X86_XOR_H
+#define _ASM_X86_XOR_H
+
+/*
+ * Optimized RAID-5 checksumming functions for SSE.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * Cache avoiding checksumming functions utilizing KNI instructions
+ * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
+ */
+
+/*
+ * Based on
+ * High-speed RAID5 checksumming functions utilizing SSE instructions.
+ * Copyright (C) 1998 Ingo Molnar.
+ */
+
+/*
+ * x86-64 changes / gcc fixes from Andi Kleen.
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ *
+ * This hasn't been optimized for the hammer yet, but there are likely
+ * no advantages to be gotten from x86-64 here anyways.
+ */
+
+#include <asm/fpu/api.h>
+
+#ifdef CONFIG_X86_32
+/* reduce register pressure */
+# define XOR_CONSTANT_CONSTRAINT "i"
+#else
+# define XOR_CONSTANT_CONSTRAINT "re"
+#endif
+
+#define OFFS(x) "16*("#x")"
+#define PF_OFFS(x) "256+16*("#x")"
+#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
+#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
+#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
+#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
+#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
+#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
+#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
+#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
+#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
+#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
+#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
+#define NOP(x)
+
+#define BLK64(pf, op, i) \
+ pf(i) \
+ op(i, 0) \
+ op(i + 1, 1) \
+ op(i + 2, 2) \
+ op(i + 3, 3)
+
+static void
+xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ BLK64(PF0, LD, i) \
+ BLK64(PF1, XO1, i) \
+ BLK64(NOP, ST, i) \
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ BLK64(PF0, LD, i) \
+ BLK64(PF1, XO1, i) \
+ BLK64(PF2, XO2, i) \
+ BLK64(NOP, ST, i) \
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines),
+ [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ PF3(i) \
+ PF3(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ XO3(i, 0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " add %[inc], %[p4] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines), [p1] "+r" (p1),
+ [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ BLK64(PF0, LD, i) \
+ BLK64(PF1, XO1, i) \
+ BLK64(PF2, XO2, i) \
+ BLK64(PF3, XO3, i) \
+ BLK64(NOP, ST, i) \
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " add %[inc], %[p4] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines), [p1] "+r" (p1),
+ [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ PF1(i) \
+ PF1(i + 2) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ PF2(i) \
+ PF2(i + 2) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ PF3(i) \
+ PF3(i + 2) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ PF4(i) \
+ PF4(i + 2) \
+ PF0(i + 4) \
+ PF0(i + 6) \
+ XO3(i, 0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ XO4(i, 0) \
+ XO4(i + 1, 1) \
+ XO4(i + 2, 2) \
+ XO4(i + 3, 3) \
+ ST(i, 0) \
+ ST(i + 1, 1) \
+ ST(i + 2, 2) \
+ ST(i + 3, 3) \
+
+
+ PF0(0)
+ PF0(2)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " add %[inc], %[p4] ;\n"
+ " add %[inc], %[p5] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
+ [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ unsigned long lines = bytes >> 8;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ BLK64(PF0, LD, i) \
+ BLK64(PF1, XO1, i) \
+ BLK64(PF2, XO2, i) \
+ BLK64(PF3, XO3, i) \
+ BLK64(PF4, XO4, i) \
+ BLK64(NOP, ST, i) \
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " add %[inc], %[p1] ;\n"
+ " add %[inc], %[p2] ;\n"
+ " add %[inc], %[p3] ;\n"
+ " add %[inc], %[p4] ;\n"
+ " add %[inc], %[p5] ;\n"
+ " dec %[cnt] ;\n"
+ " jnz 1b ;\n"
+ : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
+ [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
+ : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static struct xor_block_template xor_block_sse_pf64 = {
+ .name = "prefetch64-sse",
+ .do_2 = xor_sse_2_pf64,
+ .do_3 = xor_sse_3_pf64,
+ .do_4 = xor_sse_4_pf64,
+ .do_5 = xor_sse_5_pf64,
+};
+
+#undef LD
+#undef XO1
+#undef XO2
+#undef XO3
+#undef XO4
+#undef ST
+#undef NOP
+#undef BLK64
+#undef BLOCK
+
+#undef XOR_CONSTANT_CONSTRAINT
+
+#ifdef CONFIG_X86_32
+# include <asm/xor_32.h>
+#else
+# include <asm/xor_64.h>
+#endif
+
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+ AVX_SELECT(FASTEST)
+
+#endif /* _ASM_X86_XOR_H */
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
new file mode 100644
index 000000000..635eac543
--- /dev/null
+++ b/arch/x86/include/asm/xor_32.h
@@ -0,0 +1,567 @@
+#ifndef _ASM_X86_XOR_32_H
+#define _ASM_X86_XOR_32_H
+
+/*
+ * Optimized RAID-5 checksumming functions for MMX.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * High-speed RAID5 checksumming functions utilizing MMX instructions.
+ * Copyright (C) 1998 Ingo Molnar.
+ */
+
+#define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n"
+#define ST(x, y) " movq %%mm"#y", 8*("#x")(%1) ;\n"
+#define XO1(x, y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
+#define XO2(x, y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
+#define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
+#define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
+
+#include <asm/fpu/api.h>
+
+static void
+xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ unsigned long lines = bytes >> 7;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ XO1(i, 0) \
+ ST(i, 0) \
+ XO1(i+1, 1) \
+ ST(i+1, 1) \
+ XO1(i + 2, 2) \
+ ST(i + 2, 2) \
+ XO1(i + 3, 3) \
+ ST(i + 3, 3)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $128, %1 ;\n"
+ " addl $128, %2 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2)
+ :
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ unsigned long lines = bytes >> 7;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ XO2(i, 0) \
+ ST(i, 0) \
+ XO2(i + 1, 1) \
+ ST(i + 1, 1) \
+ XO2(i + 2, 2) \
+ ST(i + 2, 2) \
+ XO2(i + 3, 3) \
+ ST(i + 3, 3)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $128, %1 ;\n"
+ " addl $128, %2 ;\n"
+ " addl $128, %3 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3)
+ :
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ unsigned long lines = bytes >> 7;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ XO3(i, 0) \
+ ST(i, 0) \
+ XO3(i + 1, 1) \
+ ST(i + 1, 1) \
+ XO3(i + 2, 2) \
+ ST(i + 2, 2) \
+ XO3(i + 3, 3) \
+ ST(i + 3, 3)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $128, %1 ;\n"
+ " addl $128, %2 ;\n"
+ " addl $128, %3 ;\n"
+ " addl $128, %4 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
+ :
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+
+static void
+xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ unsigned long lines = bytes >> 7;
+
+ kernel_fpu_begin();
+
+ /* Make sure GCC forgets anything it knows about p4 or p5,
+ such that it won't pass to the asm volatile below a
+ register that is shared with any other variable. That's
+ because we modify p4 and p5 there, but we can't mark them
+ as read/write, otherwise we'd overflow the 10-asm-operands
+ limit of GCC < 3.1. */
+ asm("" : "+r" (p4), "+r" (p5));
+
+ asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+ LD(i, 0) \
+ LD(i + 1, 1) \
+ LD(i + 2, 2) \
+ LD(i + 3, 3) \
+ XO1(i, 0) \
+ XO1(i + 1, 1) \
+ XO1(i + 2, 2) \
+ XO1(i + 3, 3) \
+ XO2(i, 0) \
+ XO2(i + 1, 1) \
+ XO2(i + 2, 2) \
+ XO2(i + 3, 3) \
+ XO3(i, 0) \
+ XO3(i + 1, 1) \
+ XO3(i + 2, 2) \
+ XO3(i + 3, 3) \
+ XO4(i, 0) \
+ ST(i, 0) \
+ XO4(i + 1, 1) \
+ ST(i + 1, 1) \
+ XO4(i + 2, 2) \
+ ST(i + 2, 2) \
+ XO4(i + 3, 3) \
+ ST(i + 3, 3)
+
+ " .align 32 ;\n"
+ " 1: ;\n"
+
+ BLOCK(0)
+ BLOCK(4)
+ BLOCK(8)
+ BLOCK(12)
+
+ " addl $128, %1 ;\n"
+ " addl $128, %2 ;\n"
+ " addl $128, %3 ;\n"
+ " addl $128, %4 ;\n"
+ " addl $128, %5 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3)
+ : "r" (p4), "r" (p5)
+ : "memory");
+
+ /* p4 and p5 were modified, and now the variables are dead.
+ Clobber them just to be sure nobody does something stupid
+ like assuming they have some legal value. */
+ asm("" : "=r" (p4), "=r" (p5));
+
+ kernel_fpu_end();
+}
+
+#undef LD
+#undef XO1
+#undef XO2
+#undef XO3
+#undef XO4
+#undef ST
+#undef BLOCK
+
+static void
+xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+ unsigned long lines = bytes >> 6;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+ " .align 32 ;\n"
+ " 1: ;\n"
+ " movq (%1), %%mm0 ;\n"
+ " movq 8(%1), %%mm1 ;\n"
+ " pxor (%2), %%mm0 ;\n"
+ " movq 16(%1), %%mm2 ;\n"
+ " movq %%mm0, (%1) ;\n"
+ " pxor 8(%2), %%mm1 ;\n"
+ " movq 24(%1), %%mm3 ;\n"
+ " movq %%mm1, 8(%1) ;\n"
+ " pxor 16(%2), %%mm2 ;\n"
+ " movq 32(%1), %%mm4 ;\n"
+ " movq %%mm2, 16(%1) ;\n"
+ " pxor 24(%2), %%mm3 ;\n"
+ " movq 40(%1), %%mm5 ;\n"
+ " movq %%mm3, 24(%1) ;\n"
+ " pxor 32(%2), %%mm4 ;\n"
+ " movq 48(%1), %%mm6 ;\n"
+ " movq %%mm4, 32(%1) ;\n"
+ " pxor 40(%2), %%mm5 ;\n"
+ " movq 56(%1), %%mm7 ;\n"
+ " movq %%mm5, 40(%1) ;\n"
+ " pxor 48(%2), %%mm6 ;\n"
+ " pxor 56(%2), %%mm7 ;\n"
+ " movq %%mm6, 48(%1) ;\n"
+ " movq %%mm7, 56(%1) ;\n"
+
+ " addl $64, %1 ;\n"
+ " addl $64, %2 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2)
+ :
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3)
+{
+ unsigned long lines = bytes >> 6;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+ " .align 32,0x90 ;\n"
+ " 1: ;\n"
+ " movq (%1), %%mm0 ;\n"
+ " movq 8(%1), %%mm1 ;\n"
+ " pxor (%2), %%mm0 ;\n"
+ " movq 16(%1), %%mm2 ;\n"
+ " pxor 8(%2), %%mm1 ;\n"
+ " pxor (%3), %%mm0 ;\n"
+ " pxor 16(%2), %%mm2 ;\n"
+ " movq %%mm0, (%1) ;\n"
+ " pxor 8(%3), %%mm1 ;\n"
+ " pxor 16(%3), %%mm2 ;\n"
+ " movq 24(%1), %%mm3 ;\n"
+ " movq %%mm1, 8(%1) ;\n"
+ " movq 32(%1), %%mm4 ;\n"
+ " movq 40(%1), %%mm5 ;\n"
+ " pxor 24(%2), %%mm3 ;\n"
+ " movq %%mm2, 16(%1) ;\n"
+ " pxor 32(%2), %%mm4 ;\n"
+ " pxor 24(%3), %%mm3 ;\n"
+ " pxor 40(%2), %%mm5 ;\n"
+ " movq %%mm3, 24(%1) ;\n"
+ " pxor 32(%3), %%mm4 ;\n"
+ " pxor 40(%3), %%mm5 ;\n"
+ " movq 48(%1), %%mm6 ;\n"
+ " movq %%mm4, 32(%1) ;\n"
+ " movq 56(%1), %%mm7 ;\n"
+ " pxor 48(%2), %%mm6 ;\n"
+ " movq %%mm5, 40(%1) ;\n"
+ " pxor 56(%2), %%mm7 ;\n"
+ " pxor 48(%3), %%mm6 ;\n"
+ " pxor 56(%3), %%mm7 ;\n"
+ " movq %%mm6, 48(%1) ;\n"
+ " movq %%mm7, 56(%1) ;\n"
+
+ " addl $64, %1 ;\n"
+ " addl $64, %2 ;\n"
+ " addl $64, %3 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3)
+ :
+ : "memory" );
+
+ kernel_fpu_end();
+}
+
+static void
+xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4)
+{
+ unsigned long lines = bytes >> 6;
+
+ kernel_fpu_begin();
+
+ asm volatile(
+ " .align 32,0x90 ;\n"
+ " 1: ;\n"
+ " movq (%1), %%mm0 ;\n"
+ " movq 8(%1), %%mm1 ;\n"
+ " pxor (%2), %%mm0 ;\n"
+ " movq 16(%1), %%mm2 ;\n"
+ " pxor 8(%2), %%mm1 ;\n"
+ " pxor (%3), %%mm0 ;\n"
+ " pxor 16(%2), %%mm2 ;\n"
+ " pxor 8(%3), %%mm1 ;\n"
+ " pxor (%4), %%mm0 ;\n"
+ " movq 24(%1), %%mm3 ;\n"
+ " pxor 16(%3), %%mm2 ;\n"
+ " pxor 8(%4), %%mm1 ;\n"
+ " movq %%mm0, (%1) ;\n"
+ " movq 32(%1), %%mm4 ;\n"
+ " pxor 24(%2), %%mm3 ;\n"
+ " pxor 16(%4), %%mm2 ;\n"
+ " movq %%mm1, 8(%1) ;\n"
+ " movq 40(%1), %%mm5 ;\n"
+ " pxor 32(%2), %%mm4 ;\n"
+ " pxor 24(%3), %%mm3 ;\n"
+ " movq %%mm2, 16(%1) ;\n"
+ " pxor 40(%2), %%mm5 ;\n"
+ " pxor 32(%3), %%mm4 ;\n"
+ " pxor 24(%4), %%mm3 ;\n"
+ " movq %%mm3, 24(%1) ;\n"
+ " movq 56(%1), %%mm7 ;\n"
+ " movq 48(%1), %%mm6 ;\n"
+ " pxor 40(%3), %%mm5 ;\n"
+ " pxor 32(%4), %%mm4 ;\n"
+ " pxor 48(%2), %%mm6 ;\n"
+ " movq %%mm4, 32(%1) ;\n"
+ " pxor 56(%2), %%mm7 ;\n"
+ " pxor 40(%4), %%mm5 ;\n"
+ " pxor 48(%3), %%mm6 ;\n"
+ " pxor 56(%3), %%mm7 ;\n"
+ " movq %%mm5, 40(%1) ;\n"
+ " pxor 48(%4), %%mm6 ;\n"
+ " pxor 56(%4), %%mm7 ;\n"
+ " movq %%mm6, 48(%1) ;\n"
+ " movq %%mm7, 56(%1) ;\n"
+
+ " addl $64, %1 ;\n"
+ " addl $64, %2 ;\n"
+ " addl $64, %3 ;\n"
+ " addl $64, %4 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
+ :
+ : "memory");
+
+ kernel_fpu_end();
+}
+
+static void
+xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+ unsigned long lines = bytes >> 6;
+
+ kernel_fpu_begin();
+
+ /* Make sure GCC forgets anything it knows about p4 or p5,
+ such that it won't pass to the asm volatile below a
+ register that is shared with any other variable. That's
+ because we modify p4 and p5 there, but we can't mark them
+ as read/write, otherwise we'd overflow the 10-asm-operands
+ limit of GCC < 3.1. */
+ asm("" : "+r" (p4), "+r" (p5));
+
+ asm volatile(
+ " .align 32,0x90 ;\n"
+ " 1: ;\n"
+ " movq (%1), %%mm0 ;\n"
+ " movq 8(%1), %%mm1 ;\n"
+ " pxor (%2), %%mm0 ;\n"
+ " pxor 8(%2), %%mm1 ;\n"
+ " movq 16(%1), %%mm2 ;\n"
+ " pxor (%3), %%mm0 ;\n"
+ " pxor 8(%3), %%mm1 ;\n"
+ " pxor 16(%2), %%mm2 ;\n"
+ " pxor (%4), %%mm0 ;\n"
+ " pxor 8(%4), %%mm1 ;\n"
+ " pxor 16(%3), %%mm2 ;\n"
+ " movq 24(%1), %%mm3 ;\n"
+ " pxor (%5), %%mm0 ;\n"
+ " pxor 8(%5), %%mm1 ;\n"
+ " movq %%mm0, (%1) ;\n"
+ " pxor 16(%4), %%mm2 ;\n"
+ " pxor 24(%2), %%mm3 ;\n"
+ " movq %%mm1, 8(%1) ;\n"
+ " pxor 16(%5), %%mm2 ;\n"
+ " pxor 24(%3), %%mm3 ;\n"
+ " movq 32(%1), %%mm4 ;\n"
+ " movq %%mm2, 16(%1) ;\n"
+ " pxor 24(%4), %%mm3 ;\n"
+ " pxor 32(%2), %%mm4 ;\n"
+ " movq 40(%1), %%mm5 ;\n"
+ " pxor 24(%5), %%mm3 ;\n"
+ " pxor 32(%3), %%mm4 ;\n"
+ " pxor 40(%2), %%mm5 ;\n"
+ " movq %%mm3, 24(%1) ;\n"
+ " pxor 32(%4), %%mm4 ;\n"
+ " pxor 40(%3), %%mm5 ;\n"
+ " movq 48(%1), %%mm6 ;\n"
+ " movq 56(%1), %%mm7 ;\n"
+ " pxor 32(%5), %%mm4 ;\n"
+ " pxor 40(%4), %%mm5 ;\n"
+ " pxor 48(%2), %%mm6 ;\n"
+ " pxor 56(%2), %%mm7 ;\n"
+ " movq %%mm4, 32(%1) ;\n"
+ " pxor 48(%3), %%mm6 ;\n"
+ " pxor 56(%3), %%mm7 ;\n"
+ " pxor 40(%5), %%mm5 ;\n"
+ " pxor 48(%4), %%mm6 ;\n"
+ " pxor 56(%4), %%mm7 ;\n"
+ " movq %%mm5, 40(%1) ;\n"
+ " pxor 48(%5), %%mm6 ;\n"
+ " pxor 56(%5), %%mm7 ;\n"
+ " movq %%mm6, 48(%1) ;\n"
+ " movq %%mm7, 56(%1) ;\n"
+
+ " addl $64, %1 ;\n"
+ " addl $64, %2 ;\n"
+ " addl $64, %3 ;\n"
+ " addl $64, %4 ;\n"
+ " addl $64, %5 ;\n"
+ " decl %0 ;\n"
+ " jnz 1b ;\n"
+ : "+r" (lines),
+ "+r" (p1), "+r" (p2), "+r" (p3)
+ : "r" (p4), "r" (p5)
+ : "memory");
+
+ /* p4 and p5 were modified, and now the variables are dead.
+ Clobber them just to be sure nobody does something stupid
+ like assuming they have some legal value. */
+ asm("" : "=r" (p4), "=r" (p5));
+
+ kernel_fpu_end();
+}
+
+static struct xor_block_template xor_block_pII_mmx = {
+ .name = "pII_mmx",
+ .do_2 = xor_pII_mmx_2,
+ .do_3 = xor_pII_mmx_3,
+ .do_4 = xor_pII_mmx_4,
+ .do_5 = xor_pII_mmx_5,
+};
+
+static struct xor_block_template xor_block_p5_mmx = {
+ .name = "p5_mmx",
+ .do_2 = xor_p5_mmx_2,
+ .do_3 = xor_p5_mmx_3,
+ .do_4 = xor_p5_mmx_4,
+ .do_5 = xor_p5_mmx_5,
+};
+
+static struct xor_block_template xor_block_pIII_sse = {
+ .name = "pIII_sse",
+ .do_2 = xor_sse_2,
+ .do_3 = xor_sse_3,
+ .do_4 = xor_sse_4,
+ .do_5 = xor_sse_5,
+};
+
+/* Also try the AVX routines */
+#include <asm/xor_avx.h>
+
+/* Also try the generic routines. */
+#include <asm-generic/xor.h>
+
+/* We force the use of the SSE xor block because it can write around L2.
+ We may also be able to load into the L1 only depending on how the cpu
+ deals with a load to a line that is being prefetched. */
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES \
+do { \
+ AVX_XOR_SPEED; \
+ if (boot_cpu_has(X86_FEATURE_XMM)) { \
+ xor_speed(&xor_block_pIII_sse); \
+ xor_speed(&xor_block_sse_pf64); \
+ } else if (boot_cpu_has(X86_FEATURE_MMX)) { \
+ xor_speed(&xor_block_pII_mmx); \
+ xor_speed(&xor_block_p5_mmx); \
+ } else { \
+ xor_speed(&xor_block_8regs); \
+ xor_speed(&xor_block_8regs_p); \
+ xor_speed(&xor_block_32regs); \
+ xor_speed(&xor_block_32regs_p); \
+ } \
+} while (0)
+
+#endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
new file mode 100644
index 000000000..0307e4ec5
--- /dev/null
+++ b/arch/x86/include/asm/xor_64.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_XOR_64_H
+#define _ASM_X86_XOR_64_H
+
+static struct xor_block_template xor_block_sse = {
+ .name = "generic_sse",
+ .do_2 = xor_sse_2,
+ .do_3 = xor_sse_3,
+ .do_4 = xor_sse_4,
+ .do_5 = xor_sse_5,
+};
+
+
+/* Also try the AVX routines */
+#include <asm/xor_avx.h>
+
+/* We force the use of the SSE xor block because it can write around L2.
+ We may also be able to load into the L1 only depending on how the cpu
+ deals with a load to a line that is being prefetched. */
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES \
+do { \
+ AVX_XOR_SPEED; \
+ xor_speed(&xor_block_sse_pf64); \
+ xor_speed(&xor_block_sse); \
+} while (0)
+
+#endif /* _ASM_X86_XOR_64_H */
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
new file mode 100644
index 000000000..22a7b1870
--- /dev/null
+++ b/arch/x86/include/asm/xor_avx.h
@@ -0,0 +1,184 @@
+#ifndef _ASM_X86_XOR_AVX_H
+#define _ASM_X86_XOR_AVX_H
+
+/*
+ * Optimized RAID-5 checksumming functions for AVX
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#ifdef CONFIG_AS_AVX
+
+#include <linux/compiler.h>
+#include <asm/fpu/api.h>
+
+#define BLOCK4(i) \
+ BLOCK(32 * i, 0) \
+ BLOCK(32 * (i + 1), 1) \
+ BLOCK(32 * (i + 2), 2) \
+ BLOCK(32 * (i + 3), 3)
+
+#define BLOCK16() \
+ BLOCK4(0) \
+ BLOCK4(4) \
+ BLOCK4(8) \
+ BLOCK4(12)
+
+static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
+{
+ unsigned long lines = bytes >> 9;
+
+ kernel_fpu_begin();
+
+ while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+ asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p0[i / sizeof(*p0)])); \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+ "=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+ BLOCK16()
+
+ p0 = (unsigned long *)((uintptr_t)p0 + 512);
+ p1 = (unsigned long *)((uintptr_t)p1 + 512);
+ }
+
+ kernel_fpu_end();
+}
+
+static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+ unsigned long *p2)
+{
+ unsigned long lines = bytes >> 9;
+
+ kernel_fpu_begin();
+
+ while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+ asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p1[i / sizeof(*p1)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p0[i / sizeof(*p0)])); \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+ "=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+ BLOCK16()
+
+ p0 = (unsigned long *)((uintptr_t)p0 + 512);
+ p1 = (unsigned long *)((uintptr_t)p1 + 512);
+ p2 = (unsigned long *)((uintptr_t)p2 + 512);
+ }
+
+ kernel_fpu_end();
+}
+
+static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+ unsigned long *p2, unsigned long *p3)
+{
+ unsigned long lines = bytes >> 9;
+
+ kernel_fpu_begin();
+
+ while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+ asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p2[i / sizeof(*p2)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p1[i / sizeof(*p1)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p0[i / sizeof(*p0)])); \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+ "=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+ BLOCK16();
+
+ p0 = (unsigned long *)((uintptr_t)p0 + 512);
+ p1 = (unsigned long *)((uintptr_t)p1 + 512);
+ p2 = (unsigned long *)((uintptr_t)p2 + 512);
+ p3 = (unsigned long *)((uintptr_t)p3 + 512);
+ }
+
+ kernel_fpu_end();
+}
+
+static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+ unsigned long *p2, unsigned long *p3, unsigned long *p4)
+{
+ unsigned long lines = bytes >> 9;
+
+ kernel_fpu_begin();
+
+ while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+ asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p3[i / sizeof(*p3)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p2[i / sizeof(*p2)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p1[i / sizeof(*p1)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p0[i / sizeof(*p0)])); \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+ "=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+ BLOCK16()
+
+ p0 = (unsigned long *)((uintptr_t)p0 + 512);
+ p1 = (unsigned long *)((uintptr_t)p1 + 512);
+ p2 = (unsigned long *)((uintptr_t)p2 + 512);
+ p3 = (unsigned long *)((uintptr_t)p3 + 512);
+ p4 = (unsigned long *)((uintptr_t)p4 + 512);
+ }
+
+ kernel_fpu_end();
+}
+
+static struct xor_block_template xor_block_avx = {
+ .name = "avx",
+ .do_2 = xor_avx_2,
+ .do_3 = xor_avx_3,
+ .do_4 = xor_avx_4,
+ .do_5 = xor_avx_5,
+};
+
+#define AVX_XOR_SPEED \
+do { \
+ if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
+ xor_speed(&xor_block_avx); \
+} while (0)
+
+#define AVX_SELECT(FASTEST) \
+ (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)
+
+#else
+
+#define AVX_XOR_SPEED {}
+
+#define AVX_SELECT(FASTEST) (FASTEST)
+
+#endif
+#endif
diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild
new file mode 100644
index 000000000..322681622
--- /dev/null
+++ b/arch/x86/include/uapi/asm/Kbuild
@@ -0,0 +1,8 @@
+# UAPI Header export list
+include include/uapi/asm-generic/Kbuild.asm
+
+generic-y += bpf_perf_event.h
+generated-y += unistd_32.h
+generated-y += unistd_64.h
+generated-y += unistd_x32.h
+generic-y += poll.h
diff --git a/arch/x86/include/uapi/asm/a.out.h b/arch/x86/include/uapi/asm/a.out.h
new file mode 100644
index 000000000..094c49d8e
--- /dev/null
+++ b/arch/x86/include/uapi/asm/a.out.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_A_OUT_H
+#define _ASM_X86_A_OUT_H
+
+struct exec
+{
+ unsigned int a_info; /* Use macros N_MAGIC, etc for access */
+ unsigned a_text; /* length of text, in bytes */
+ unsigned a_data; /* length of data, in bytes */
+ unsigned a_bss; /* length of uninitialized data area for file, in bytes */
+ unsigned a_syms; /* length of symbol table data in file, in bytes */
+ unsigned a_entry; /* start address */
+ unsigned a_trsize; /* length of relocation info for text, in bytes */
+ unsigned a_drsize; /* length of relocation info for data, in bytes */
+};
+
+#define N_TRSIZE(a) ((a).a_trsize)
+#define N_DRSIZE(a) ((a).a_drsize)
+#define N_SYMSIZE(a) ((a).a_syms)
+
+#endif /* _ASM_X86_A_OUT_H */
diff --git a/arch/x86/include/uapi/asm/auxvec.h b/arch/x86/include/uapi/asm/auxvec.h
new file mode 100644
index 000000000..580e3c567
--- /dev/null
+++ b/arch/x86/include/uapi/asm/auxvec.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_AUXVEC_H
+#define _ASM_X86_AUXVEC_H
+/*
+ * Architecture-neutral AT_ values in 0-17, leave some room
+ * for more of them, start the x86-specific ones at 32.
+ */
+#ifdef __i386__
+#define AT_SYSINFO 32
+#endif
+#define AT_SYSINFO_EHDR 33
+
+/* entries in ARCH_DLINFO: */
+#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64)
+# define AT_VECTOR_SIZE_ARCH 2
+#else /* else it's non-compat x86-64 */
+# define AT_VECTOR_SIZE_ARCH 1
+#endif
+
+#endif /* _ASM_X86_AUXVEC_H */
diff --git a/arch/x86/include/uapi/asm/bitsperlong.h b/arch/x86/include/uapi/asm/bitsperlong.h
new file mode 100644
index 000000000..5d72c8458
--- /dev/null
+++ b/arch/x86/include/uapi/asm/bitsperlong.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __ASM_X86_BITSPERLONG_H
+#define __ASM_X86_BITSPERLONG_H
+
+#if defined(__x86_64__) && !defined(__ILP32__)
+# define __BITS_PER_LONG 64
+#else
+# define __BITS_PER_LONG 32
+#endif
+
+#include <asm-generic/bitsperlong.h>
+
+#endif /* __ASM_X86_BITSPERLONG_H */
+
diff --git a/arch/x86/include/uapi/asm/boot.h b/arch/x86/include/uapi/asm/boot.h
new file mode 100644
index 000000000..88ffc5aee
--- /dev/null
+++ b/arch/x86/include/uapi/asm/boot.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_BOOT_H
+#define _UAPI_ASM_X86_BOOT_H
+
+/* Internal svga startup constants */
+#define NORMAL_VGA 0xffff /* 80x25 mode */
+#define EXTENDED_VGA 0xfffe /* 80x50 mode */
+#define ASK_VGA 0xfffd /* ask for it at bootup */
+
+
+#endif /* _UAPI_ASM_X86_BOOT_H */
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
new file mode 100644
index 000000000..a06cbf019
--- /dev/null
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -0,0 +1,249 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_BOOTPARAM_H
+#define _ASM_X86_BOOTPARAM_H
+
+/* setup_data types */
+#define SETUP_NONE 0
+#define SETUP_E820_EXT 1
+#define SETUP_DTB 2
+#define SETUP_PCI 3
+#define SETUP_EFI 4
+#define SETUP_APPLE_PROPERTIES 5
+#define SETUP_JAILHOUSE 6
+
+/* ram_size flags */
+#define RAMDISK_IMAGE_START_MASK 0x07FF
+#define RAMDISK_PROMPT_FLAG 0x8000
+#define RAMDISK_LOAD_FLAG 0x4000
+
+/* loadflags */
+#define LOADED_HIGH (1<<0)
+#define KASLR_FLAG (1<<1)
+#define QUIET_FLAG (1<<5)
+#define KEEP_SEGMENTS (1<<6)
+#define CAN_USE_HEAP (1<<7)
+
+/* xloadflags */
+#define XLF_KERNEL_64 (1<<0)
+#define XLF_CAN_BE_LOADED_ABOVE_4G (1<<1)
+#define XLF_EFI_HANDOVER_32 (1<<2)
+#define XLF_EFI_HANDOVER_64 (1<<3)
+#define XLF_EFI_KEXEC (1<<4)
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+#include <linux/screen_info.h>
+#include <linux/apm_bios.h>
+#include <linux/edd.h>
+#include <asm/ist.h>
+#include <video/edid.h>
+
+/* extensible setup data list node */
+struct setup_data {
+ __u64 next;
+ __u32 type;
+ __u32 len;
+ __u8 data[0];
+};
+
+struct setup_header {
+ __u8 setup_sects;
+ __u16 root_flags;
+ __u32 syssize;
+ __u16 ram_size;
+ __u16 vid_mode;
+ __u16 root_dev;
+ __u16 boot_flag;
+ __u16 jump;
+ __u32 header;
+ __u16 version;
+ __u32 realmode_swtch;
+ __u16 start_sys_seg;
+ __u16 kernel_version;
+ __u8 type_of_loader;
+ __u8 loadflags;
+ __u16 setup_move_size;
+ __u32 code32_start;
+ __u32 ramdisk_image;
+ __u32 ramdisk_size;
+ __u32 bootsect_kludge;
+ __u16 heap_end_ptr;
+ __u8 ext_loader_ver;
+ __u8 ext_loader_type;
+ __u32 cmd_line_ptr;
+ __u32 initrd_addr_max;
+ __u32 kernel_alignment;
+ __u8 relocatable_kernel;
+ __u8 min_alignment;
+ __u16 xloadflags;
+ __u32 cmdline_size;
+ __u32 hardware_subarch;
+ __u64 hardware_subarch_data;
+ __u32 payload_offset;
+ __u32 payload_length;
+ __u64 setup_data;
+ __u64 pref_address;
+ __u32 init_size;
+ __u32 handover_offset;
+} __attribute__((packed));
+
+struct sys_desc_table {
+ __u16 length;
+ __u8 table[14];
+};
+
+/* Gleaned from OFW's set-parameters in cpu/x86/pc/linux.fth */
+struct olpc_ofw_header {
+ __u32 ofw_magic; /* OFW signature */
+ __u32 ofw_version;
+ __u32 cif_handler; /* callback into OFW */
+ __u32 irq_desc_table;
+} __attribute__((packed));
+
+struct efi_info {
+ __u32 efi_loader_signature;
+ __u32 efi_systab;
+ __u32 efi_memdesc_size;
+ __u32 efi_memdesc_version;
+ __u32 efi_memmap;
+ __u32 efi_memmap_size;
+ __u32 efi_systab_hi;
+ __u32 efi_memmap_hi;
+};
+
+/*
+ * This is the maximum number of entries in struct boot_params::e820_table
+ * (the zeropage), which is part of the x86 boot protocol ABI:
+ */
+#define E820_MAX_ENTRIES_ZEROPAGE 128
+
+/*
+ * The E820 memory region entry of the boot protocol ABI:
+ */
+struct boot_e820_entry {
+ __u64 addr;
+ __u64 size;
+ __u32 type;
+} __attribute__((packed));
+
+/*
+ * Smallest compatible version of jailhouse_setup_data required by this kernel.
+ */
+#define JAILHOUSE_SETUP_REQUIRED_VERSION 1
+
+/*
+ * The boot loader is passing platform information via this Jailhouse-specific
+ * setup data structure.
+ */
+struct jailhouse_setup_data {
+ __u16 version;
+ __u16 compatible_version;
+ __u16 pm_timer_address;
+ __u16 num_cpus;
+ __u64 pci_mmconfig_base;
+ __u32 tsc_khz;
+ __u32 apic_khz;
+ __u8 standard_ioapic;
+ __u8 cpu_ids[255];
+} __attribute__((packed));
+
+/* The so-called "zeropage" */
+struct boot_params {
+ struct screen_info screen_info; /* 0x000 */
+ struct apm_bios_info apm_bios_info; /* 0x040 */
+ __u8 _pad2[4]; /* 0x054 */
+ __u64 tboot_addr; /* 0x058 */
+ struct ist_info ist_info; /* 0x060 */
+ __u8 _pad3[16]; /* 0x070 */
+ __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */
+ __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */
+ struct sys_desc_table sys_desc_table; /* obsolete! */ /* 0x0a0 */
+ struct olpc_ofw_header olpc_ofw_header; /* 0x0b0 */
+ __u32 ext_ramdisk_image; /* 0x0c0 */
+ __u32 ext_ramdisk_size; /* 0x0c4 */
+ __u32 ext_cmd_line_ptr; /* 0x0c8 */
+ __u8 _pad4[116]; /* 0x0cc */
+ struct edid_info edid_info; /* 0x140 */
+ struct efi_info efi_info; /* 0x1c0 */
+ __u32 alt_mem_k; /* 0x1e0 */
+ __u32 scratch; /* Scratch field! */ /* 0x1e4 */
+ __u8 e820_entries; /* 0x1e8 */
+ __u8 eddbuf_entries; /* 0x1e9 */
+ __u8 edd_mbr_sig_buf_entries; /* 0x1ea */
+ __u8 kbd_status; /* 0x1eb */
+ __u8 secure_boot; /* 0x1ec */
+ __u8 _pad5[2]; /* 0x1ed */
+ /*
+ * The sentinel is set to a nonzero value (0xff) in header.S.
+ *
+ * A bootloader is supposed to only take setup_header and put
+ * it into a clean boot_params buffer. If it turns out that
+ * it is clumsy or too generous with the buffer, it most
+ * probably will pick up the sentinel variable too. The fact
+ * that this variable then is still 0xff will let kernel
+ * know that some variables in boot_params are invalid and
+ * kernel should zero out certain portions of boot_params.
+ */
+ __u8 sentinel; /* 0x1ef */
+ __u8 _pad6[1]; /* 0x1f0 */
+ struct setup_header hdr; /* setup header */ /* 0x1f1 */
+ __u8 _pad7[0x290-0x1f1-sizeof(struct setup_header)];
+ __u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX]; /* 0x290 */
+ struct boot_e820_entry e820_table[E820_MAX_ENTRIES_ZEROPAGE]; /* 0x2d0 */
+ __u8 _pad8[48]; /* 0xcd0 */
+ struct edd_info eddbuf[EDDMAXNR]; /* 0xd00 */
+ __u8 _pad9[276]; /* 0xeec */
+} __attribute__((packed));
+
+/**
+ * enum x86_hardware_subarch - x86 hardware subarchitecture
+ *
+ * The x86 hardware_subarch and hardware_subarch_data were added as of the x86
+ * boot protocol 2.07 to help distinguish and support custom x86 boot
+ * sequences. This enum represents accepted values for the x86
+ * hardware_subarch. Custom x86 boot sequences (not X86_SUBARCH_PC) do not
+ * have or simply *cannot* make use of natural stubs like BIOS or EFI, the
+ * hardware_subarch can be used on the Linux entry path to revector to a
+ * subarchitecture stub when needed. This subarchitecture stub can be used to
+ * set up Linux boot parameters or for special care to account for nonstandard
+ * handling of page tables.
+ *
+ * These enums should only ever be used by x86 code, and the code that uses
+ * it should be well contained and compartamentalized.
+ *
+ * KVM and Xen HVM do not have a subarch as these are expected to follow
+ * standard x86 boot entries. If there is a genuine need for "hypervisor" type
+ * that should be considered separately in the future. Future guest types
+ * should seriously consider working with standard x86 boot stubs such as
+ * the BIOS or EFI boot stubs.
+ *
+ * WARNING: this enum is only used for legacy hacks, for platform features that
+ * are not easily enumerated or discoverable. You should not ever use
+ * this for new features.
+ *
+ * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard
+ * PC mechanisms (PCI, ACPI) and doesn't need a special boot flow.
+ * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated
+ * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path,
+ * which start at asm startup_xen() entry point and later jump to the C
+ * xen_start_kernel() entry point. Both domU and dom0 type of guests are
+ * currently supportd through this PV boot path.
+ * @X86_SUBARCH_INTEL_MID: Used for Intel MID (Mobile Internet Device) platform
+ * systems which do not have the PCI legacy interfaces.
+ * @X86_SUBARCH_CE4100: Used for Intel CE media processor (CE4100) SoC for
+ * for settop boxes and media devices, the use of a subarch for CE4100
+ * is more of a hack...
+ */
+enum x86_hardware_subarch {
+ X86_SUBARCH_PC = 0,
+ X86_SUBARCH_LGUEST,
+ X86_SUBARCH_XEN,
+ X86_SUBARCH_INTEL_MID,
+ X86_SUBARCH_CE4100,
+ X86_NR_SUBARCHS,
+};
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_BOOTPARAM_H */
diff --git a/arch/x86/include/uapi/asm/byteorder.h b/arch/x86/include/uapi/asm/byteorder.h
new file mode 100644
index 000000000..484e3cfd7
--- /dev/null
+++ b/arch/x86/include/uapi/asm/byteorder.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_BYTEORDER_H
+#define _ASM_X86_BYTEORDER_H
+
+#include <linux/byteorder/little_endian.h>
+
+#endif /* _ASM_X86_BYTEORDER_H */
diff --git a/arch/x86/include/uapi/asm/debugreg.h b/arch/x86/include/uapi/asm/debugreg.h
new file mode 100644
index 000000000..d95d080b3
--- /dev/null
+++ b/arch/x86/include/uapi/asm/debugreg.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_DEBUGREG_H
+#define _UAPI_ASM_X86_DEBUGREG_H
+
+
+/* Indicate the register numbers for a number of the specific
+ debug registers. Registers 0-3 contain the addresses we wish to trap on */
+#define DR_FIRSTADDR 0 /* u_debugreg[DR_FIRSTADDR] */
+#define DR_LASTADDR 3 /* u_debugreg[DR_LASTADDR] */
+
+#define DR_STATUS 6 /* u_debugreg[DR_STATUS] */
+#define DR_CONTROL 7 /* u_debugreg[DR_CONTROL] */
+
+/* Define a few things for the status register. We can use this to determine
+ which debugging register was responsible for the trap. The other bits
+ are either reserved or not of interest to us. */
+
+/* Define reserved bits in DR6 which are always set to 1 */
+#define DR6_RESERVED (0xFFFF0FF0)
+
+#define DR_TRAP0 (0x1) /* db0 */
+#define DR_TRAP1 (0x2) /* db1 */
+#define DR_TRAP2 (0x4) /* db2 */
+#define DR_TRAP3 (0x8) /* db3 */
+#define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)
+
+#define DR_STEP (0x4000) /* single-step */
+#define DR_SWITCH (0x8000) /* task switch */
+
+/* Now define a bunch of things for manipulating the control register.
+ The top two bytes of the control register consist of 4 fields of 4
+ bits - each field corresponds to one of the four debug registers,
+ and indicates what types of access we trap on, and how large the data
+ field is that we are looking at */
+
+#define DR_CONTROL_SHIFT 16 /* Skip this many bits in ctl register */
+#define DR_CONTROL_SIZE 4 /* 4 control bits per register */
+
+#define DR_RW_EXECUTE (0x0) /* Settings for the access types to trap on */
+#define DR_RW_WRITE (0x1)
+#define DR_RW_READ (0x3)
+
+#define DR_LEN_1 (0x0) /* Settings for data length to trap on */
+#define DR_LEN_2 (0x4)
+#define DR_LEN_4 (0xC)
+#define DR_LEN_8 (0x8)
+
+/* The low byte to the control register determine which registers are
+ enabled. There are 4 fields of two bits. One bit is "local", meaning
+ that the processor will reset the bit after a task switch and the other
+ is global meaning that we have to explicitly reset the bit. With linux,
+ you can use either one, since we explicitly zero the register when we enter
+ kernel mode. */
+
+#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */
+#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */
+#define DR_LOCAL_ENABLE (0x1) /* Local enable for reg 0 */
+#define DR_GLOBAL_ENABLE (0x2) /* Global enable for reg 0 */
+#define DR_ENABLE_SIZE 2 /* 2 enable bits per register */
+
+#define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */
+#define DR_GLOBAL_ENABLE_MASK (0xAA) /* Set global bits for all 4 regs */
+
+/* The second byte to the control register has a few special things.
+ We can slow the instruction pipeline for instructions coming via the
+ gdt or the ldt if we want to. I am not sure why this is an advantage */
+
+#ifdef __i386__
+#define DR_CONTROL_RESERVED (0xFC00) /* Reserved by Intel */
+#else
+#define DR_CONTROL_RESERVED (0xFFFFFFFF0000FC00UL) /* Reserved */
+#endif
+
+#define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */
+#define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */
+
+/*
+ * HW breakpoint additions
+ */
+
+#endif /* _UAPI_ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
new file mode 100644
index 000000000..2f491efe3
--- /dev/null
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_E820_H
+#define _UAPI_ASM_X86_E820_H
+#define E820MAP 0x2d0 /* our map */
+#define E820MAX 128 /* number of entries in E820MAP */
+
+/*
+ * Legacy E820 BIOS limits us to 128 (E820MAX) nodes due to the
+ * constrained space in the zeropage. If we have more nodes than
+ * that, and if we've booted off EFI firmware, then the EFI tables
+ * passed us from the EFI firmware can list more nodes. Size our
+ * internal memory map tables to have room for these additional
+ * nodes, based on up to three entries per node for which the
+ * kernel was built: MAX_NUMNODES == (1 << CONFIG_NODES_SHIFT),
+ * plus E820MAX, allowing space for the possible duplicate E820
+ * entries that might need room in the same arrays, prior to the
+ * call to sanitize_e820_map() to remove duplicates. The allowance
+ * of three memory map entries per node is "enough" entries for
+ * the initial hardware platform motivating this mechanism to make
+ * use of additional EFI map entries. Future platforms may want
+ * to allow more than three entries per node or otherwise refine
+ * this size.
+ */
+
+#ifndef __KERNEL__
+#define E820_X_MAX E820MAX
+#endif
+
+#define E820NR 0x1e8 /* # entries in E820MAP */
+
+#define E820_RAM 1
+#define E820_RESERVED 2
+#define E820_ACPI 3
+#define E820_NVS 4
+#define E820_UNUSABLE 5
+#define E820_PMEM 7
+
+/*
+ * This is a non-standardized way to represent ADR or NVDIMM regions that
+ * persist over a reboot. The kernel will ignore their special capabilities
+ * unless the CONFIG_X86_PMEM_LEGACY option is set.
+ *
+ * ( Note that older platforms also used 6 for the same type of memory,
+ * but newer versions switched to 12 as 6 was assigned differently. Some
+ * time they will learn... )
+ */
+#define E820_PRAM 12
+
+/*
+ * reserved RAM used by kernel itself
+ * if CONFIG_INTEL_TXT is enabled, memory of this type will be
+ * included in the S3 integrity calculation and so should not include
+ * any memory that BIOS might alter over the S3 transition
+ */
+#define E820_RESERVED_KERN 128
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+struct e820entry {
+ __u64 addr; /* start of memory segment */
+ __u64 size; /* size of memory segment */
+ __u32 type; /* type of memory segment */
+} __attribute__((packed));
+
+struct e820map {
+ __u32 nr_map;
+ struct e820entry map[E820_X_MAX];
+};
+
+#define ISA_START_ADDRESS 0xa0000
+#define ISA_END_ADDRESS 0x100000
+
+#define BIOS_BEGIN 0x000a0000
+#define BIOS_END 0x00100000
+
+#define BIOS_ROM_BASE 0xffe00000
+#define BIOS_ROM_END 0xffffffff
+
+#endif /* __ASSEMBLY__ */
+
+
+#endif /* _UAPI_ASM_X86_E820_H */
diff --git a/arch/x86/include/uapi/asm/errno.h b/arch/x86/include/uapi/asm/errno.h
new file mode 100644
index 000000000..4c82b503d
--- /dev/null
+++ b/arch/x86/include/uapi/asm/errno.h
@@ -0,0 +1 @@
+#include <asm-generic/errno.h>
diff --git a/arch/x86/include/uapi/asm/fcntl.h b/arch/x86/include/uapi/asm/fcntl.h
new file mode 100644
index 000000000..46ab12db5
--- /dev/null
+++ b/arch/x86/include/uapi/asm/fcntl.h
@@ -0,0 +1 @@
+#include <asm-generic/fcntl.h>
diff --git a/arch/x86/include/uapi/asm/hw_breakpoint.h b/arch/x86/include/uapi/asm/hw_breakpoint.h
new file mode 100644
index 000000000..6789884c7
--- /dev/null
+++ b/arch/x86/include/uapi/asm/hw_breakpoint.h
@@ -0,0 +1,2 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* */
diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h
new file mode 100644
index 000000000..6ebaae90e
--- /dev/null
+++ b/arch/x86/include/uapi/asm/hwcap2.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_HWCAP2_H
+#define _ASM_X86_HWCAP2_H
+
+/* MONITOR/MWAIT enabled in Ring 3 */
+#define HWCAP2_RING3MWAIT (1 << 0)
+
+#endif
diff --git a/arch/x86/include/uapi/asm/ioctl.h b/arch/x86/include/uapi/asm/ioctl.h
new file mode 100644
index 000000000..b279fe06d
--- /dev/null
+++ b/arch/x86/include/uapi/asm/ioctl.h
@@ -0,0 +1 @@
+#include <asm-generic/ioctl.h>
diff --git a/arch/x86/include/uapi/asm/ioctls.h b/arch/x86/include/uapi/asm/ioctls.h
new file mode 100644
index 000000000..ec34c7606
--- /dev/null
+++ b/arch/x86/include/uapi/asm/ioctls.h
@@ -0,0 +1 @@
+#include <asm-generic/ioctls.h>
diff --git a/arch/x86/include/uapi/asm/ipcbuf.h b/arch/x86/include/uapi/asm/ipcbuf.h
new file mode 100644
index 000000000..84c7e51cb
--- /dev/null
+++ b/arch/x86/include/uapi/asm/ipcbuf.h
@@ -0,0 +1 @@
+#include <asm-generic/ipcbuf.h>
diff --git a/arch/x86/include/uapi/asm/ist.h b/arch/x86/include/uapi/asm/ist.h
new file mode 100644
index 000000000..eac5b2079
--- /dev/null
+++ b/arch/x86/include/uapi/asm/ist.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ * Include file for the interface to IST BIOS
+ * Copyright 2002 Andy Grover <andrew.grover@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef _UAPI_ASM_X86_IST_H
+#define _UAPI_ASM_X86_IST_H
+
+
+
+#include <linux/types.h>
+
+struct ist_info {
+ __u32 signature;
+ __u32 command;
+ __u32 event;
+ __u32 perf_level;
+};
+
+#endif /* _UAPI_ASM_X86_IST_H */
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
new file mode 100644
index 000000000..f1645578d
--- /dev/null
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -0,0 +1,420 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_KVM_H
+#define _ASM_X86_KVM_H
+
+/*
+ * KVM x86 specific structures and definitions
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define KVM_PIO_PAGE_OFFSET 1
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+
+#define DE_VECTOR 0
+#define DB_VECTOR 1
+#define BP_VECTOR 3
+#define OF_VECTOR 4
+#define BR_VECTOR 5
+#define UD_VECTOR 6
+#define NM_VECTOR 7
+#define DF_VECTOR 8
+#define TS_VECTOR 10
+#define NP_VECTOR 11
+#define SS_VECTOR 12
+#define GP_VECTOR 13
+#define PF_VECTOR 14
+#define MF_VECTOR 16
+#define AC_VECTOR 17
+#define MC_VECTOR 18
+#define XM_VECTOR 19
+#define VE_VECTOR 20
+
+/* Select x86 specific features in <linux/kvm.h> */
+#define __KVM_HAVE_PIT
+#define __KVM_HAVE_IOAPIC
+#define __KVM_HAVE_IRQ_LINE
+#define __KVM_HAVE_MSI
+#define __KVM_HAVE_USER_NMI
+#define __KVM_HAVE_GUEST_DEBUG
+#define __KVM_HAVE_MSIX
+#define __KVM_HAVE_MCE
+#define __KVM_HAVE_PIT_STATE2
+#define __KVM_HAVE_XEN_HVM
+#define __KVM_HAVE_VCPU_EVENTS
+#define __KVM_HAVE_DEBUGREGS
+#define __KVM_HAVE_XSAVE
+#define __KVM_HAVE_XCRS
+#define __KVM_HAVE_READONLY_MEM
+
+/* Architectural interrupt line count. */
+#define KVM_NR_INTERRUPTS 256
+
+struct kvm_memory_alias {
+ __u32 slot; /* this has a different namespace than memory slots */
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size;
+ __u64 target_phys_addr;
+};
+
+/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
+struct kvm_pic_state {
+ __u8 last_irr; /* edge detection */
+ __u8 irr; /* interrupt request register */
+ __u8 imr; /* interrupt mask register */
+ __u8 isr; /* interrupt service register */
+ __u8 priority_add; /* highest irq priority */
+ __u8 irq_base;
+ __u8 read_reg_select;
+ __u8 poll;
+ __u8 special_mask;
+ __u8 init_state;
+ __u8 auto_eoi;
+ __u8 rotate_on_auto_eoi;
+ __u8 special_fully_nested_mode;
+ __u8 init4; /* true if 4 byte init */
+ __u8 elcr; /* PIIX edge/trigger selection */
+ __u8 elcr_mask;
+};
+
+#define KVM_IOAPIC_NUM_PINS 24
+struct kvm_ioapic_state {
+ __u64 base_address;
+ __u32 ioregsel;
+ __u32 id;
+ __u32 irr;
+ __u32 pad;
+ union {
+ __u64 bits;
+ struct {
+ __u8 vector;
+ __u8 delivery_mode:3;
+ __u8 dest_mode:1;
+ __u8 delivery_status:1;
+ __u8 polarity:1;
+ __u8 remote_irr:1;
+ __u8 trig_mode:1;
+ __u8 mask:1;
+ __u8 reserve:7;
+ __u8 reserved[4];
+ __u8 dest_id;
+ } fields;
+ } redirtbl[KVM_IOAPIC_NUM_PINS];
+};
+
+#define KVM_IRQCHIP_PIC_MASTER 0
+#define KVM_IRQCHIP_PIC_SLAVE 1
+#define KVM_IRQCHIP_IOAPIC 2
+#define KVM_NR_IRQCHIPS 3
+
+#define KVM_RUN_X86_SMM (1 << 0)
+
+/* for KVM_GET_REGS and KVM_SET_REGS */
+struct kvm_regs {
+ /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
+ __u64 rax, rbx, rcx, rdx;
+ __u64 rsi, rdi, rsp, rbp;
+ __u64 r8, r9, r10, r11;
+ __u64 r12, r13, r14, r15;
+ __u64 rip, rflags;
+};
+
+/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
+#define KVM_APIC_REG_SIZE 0x400
+struct kvm_lapic_state {
+ char regs[KVM_APIC_REG_SIZE];
+};
+
+struct kvm_segment {
+ __u64 base;
+ __u32 limit;
+ __u16 selector;
+ __u8 type;
+ __u8 present, dpl, db, s, l, g, avl;
+ __u8 unusable;
+ __u8 padding;
+};
+
+struct kvm_dtable {
+ __u64 base;
+ __u16 limit;
+ __u16 padding[3];
+};
+
+
+/* for KVM_GET_SREGS and KVM_SET_SREGS */
+struct kvm_sregs {
+ /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
+ struct kvm_segment cs, ds, es, fs, gs, ss;
+ struct kvm_segment tr, ldt;
+ struct kvm_dtable gdt, idt;
+ __u64 cr0, cr2, cr3, cr4, cr8;
+ __u64 efer;
+ __u64 apic_base;
+ __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
+};
+
+/* for KVM_GET_FPU and KVM_SET_FPU */
+struct kvm_fpu {
+ __u8 fpr[8][16];
+ __u16 fcw;
+ __u16 fsw;
+ __u8 ftwx; /* in fxsave format */
+ __u8 pad1;
+ __u16 last_opcode;
+ __u64 last_ip;
+ __u64 last_dp;
+ __u8 xmm[16][16];
+ __u32 mxcsr;
+ __u32 pad2;
+};
+
+struct kvm_msr_entry {
+ __u32 index;
+ __u32 reserved;
+ __u64 data;
+};
+
+/* for KVM_GET_MSRS and KVM_SET_MSRS */
+struct kvm_msrs {
+ __u32 nmsrs; /* number of msrs in entries */
+ __u32 pad;
+
+ struct kvm_msr_entry entries[0];
+};
+
+/* for KVM_GET_MSR_INDEX_LIST */
+struct kvm_msr_list {
+ __u32 nmsrs; /* number of msrs in entries */
+ __u32 indices[0];
+};
+
+
+struct kvm_cpuid_entry {
+ __u32 function;
+ __u32 eax;
+ __u32 ebx;
+ __u32 ecx;
+ __u32 edx;
+ __u32 padding;
+};
+
+/* for KVM_SET_CPUID */
+struct kvm_cpuid {
+ __u32 nent;
+ __u32 padding;
+ struct kvm_cpuid_entry entries[0];
+};
+
+struct kvm_cpuid_entry2 {
+ __u32 function;
+ __u32 index;
+ __u32 flags;
+ __u32 eax;
+ __u32 ebx;
+ __u32 ecx;
+ __u32 edx;
+ __u32 padding[3];
+};
+
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX (1 << 0)
+#define KVM_CPUID_FLAG_STATEFUL_FUNC (1 << 1)
+#define KVM_CPUID_FLAG_STATE_READ_NEXT (1 << 2)
+
+/* for KVM_SET_CPUID2 */
+struct kvm_cpuid2 {
+ __u32 nent;
+ __u32 padding;
+ struct kvm_cpuid_entry2 entries[0];
+};
+
+/* for KVM_GET_PIT and KVM_SET_PIT */
+struct kvm_pit_channel_state {
+ __u32 count; /* can be 65536 */
+ __u16 latched_count;
+ __u8 count_latched;
+ __u8 status_latched;
+ __u8 status;
+ __u8 read_state;
+ __u8 write_state;
+ __u8 write_latch;
+ __u8 rw_mode;
+ __u8 mode;
+ __u8 bcd;
+ __u8 gate;
+ __s64 count_load_time;
+};
+
+struct kvm_debug_exit_arch {
+ __u32 exception;
+ __u32 pad;
+ __u64 pc;
+ __u64 dr6;
+ __u64 dr7;
+};
+
+#define KVM_GUESTDBG_USE_SW_BP 0x00010000
+#define KVM_GUESTDBG_USE_HW_BP 0x00020000
+#define KVM_GUESTDBG_INJECT_DB 0x00040000
+#define KVM_GUESTDBG_INJECT_BP 0x00080000
+
+/* for KVM_SET_GUEST_DEBUG */
+struct kvm_guest_debug_arch {
+ __u64 debugreg[8];
+};
+
+struct kvm_pit_state {
+ struct kvm_pit_channel_state channels[3];
+};
+
+#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001
+
+struct kvm_pit_state2 {
+ struct kvm_pit_channel_state channels[3];
+ __u32 flags;
+ __u32 reserved[9];
+};
+
+struct kvm_reinject_control {
+ __u8 pit_reinject;
+ __u8 reserved[31];
+};
+
+/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
+#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
+#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
+#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004
+#define KVM_VCPUEVENT_VALID_SMM 0x00000008
+
+/* Interrupt shadow states */
+#define KVM_X86_SHADOW_INT_MOV_SS 0x01
+#define KVM_X86_SHADOW_INT_STI 0x02
+
+/* for KVM_GET/SET_VCPU_EVENTS */
+struct kvm_vcpu_events {
+ struct {
+ __u8 injected;
+ __u8 nr;
+ __u8 has_error_code;
+ __u8 pad;
+ __u32 error_code;
+ } exception;
+ struct {
+ __u8 injected;
+ __u8 nr;
+ __u8 soft;
+ __u8 shadow;
+ } interrupt;
+ struct {
+ __u8 injected;
+ __u8 pending;
+ __u8 masked;
+ __u8 pad;
+ } nmi;
+ __u32 sipi_vector;
+ __u32 flags;
+ struct {
+ __u8 smm;
+ __u8 pending;
+ __u8 smm_inside_nmi;
+ __u8 latched_init;
+ } smi;
+ __u32 reserved[9];
+};
+
+/* for KVM_GET/SET_DEBUGREGS */
+struct kvm_debugregs {
+ __u64 db[4];
+ __u64 dr6;
+ __u64 dr7;
+ __u64 flags;
+ __u64 reserved[9];
+};
+
+/* for KVM_CAP_XSAVE */
+struct kvm_xsave {
+ __u32 region[1024];
+};
+
+#define KVM_MAX_XCRS 16
+
+struct kvm_xcr {
+ __u32 xcr;
+ __u32 reserved;
+ __u64 value;
+};
+
+struct kvm_xcrs {
+ __u32 nr_xcrs;
+ __u32 flags;
+ struct kvm_xcr xcrs[KVM_MAX_XCRS];
+ __u64 padding[16];
+};
+
+#define KVM_SYNC_X86_REGS (1UL << 0)
+#define KVM_SYNC_X86_SREGS (1UL << 1)
+#define KVM_SYNC_X86_EVENTS (1UL << 2)
+
+#define KVM_SYNC_X86_VALID_FIELDS \
+ (KVM_SYNC_X86_REGS| \
+ KVM_SYNC_X86_SREGS| \
+ KVM_SYNC_X86_EVENTS)
+
+/* kvm_sync_regs struct included by kvm_run struct */
+struct kvm_sync_regs {
+ /* Members of this structure are potentially malicious.
+ * Care must be taken by code reading, esp. interpreting,
+ * data fields from them inside KVM to prevent TOCTOU and
+ * double-fetch types of vulnerabilities.
+ */
+ struct kvm_regs regs;
+ struct kvm_sregs sregs;
+ struct kvm_vcpu_events events;
+};
+
+#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
+#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1)
+#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2)
+#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3)
+
+#define KVM_STATE_NESTED_GUEST_MODE 0x00000001
+#define KVM_STATE_NESTED_RUN_PENDING 0x00000002
+
+#define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001
+#define KVM_STATE_NESTED_SMM_VMXON 0x00000002
+
+struct kvm_vmx_nested_state {
+ __u64 vmxon_pa;
+ __u64 vmcs_pa;
+
+ struct {
+ __u16 flags;
+ } smm;
+};
+
+/* for KVM_CAP_NESTED_STATE */
+struct kvm_nested_state {
+ /* KVM_STATE_* flags */
+ __u16 flags;
+
+ /* 0 for VMX, 1 for SVM. */
+ __u16 format;
+
+ /* 128 for SVM, 128 + VMCS size for VMX. */
+ __u32 size;
+
+ union {
+ /* VMXON, VMCS */
+ struct kvm_vmx_nested_state vmx;
+
+ /* Pad the header to 128 bytes. */
+ __u8 pad[120];
+ };
+
+ __u8 data[0];
+};
+
+#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
new file mode 100644
index 000000000..19980ec1a
--- /dev/null
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_KVM_PARA_H
+#define _UAPI_ASM_X86_KVM_PARA_H
+
+#include <linux/types.h>
+
+/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
+ * should be used to determine that a VM is running under KVM.
+ */
+#define KVM_CPUID_SIGNATURE 0x40000000
+
+/* This CPUID returns two feature bitmaps in eax, edx. Before enabling
+ * a particular paravirtualization, the appropriate feature bit should
+ * be checked in eax. The performance hint feature bit should be checked
+ * in edx.
+ */
+#define KVM_CPUID_FEATURES 0x40000001
+#define KVM_FEATURE_CLOCKSOURCE 0
+#define KVM_FEATURE_NOP_IO_DELAY 1
+#define KVM_FEATURE_MMU_OP 2
+/* This indicates that the new set of kvmclock msrs
+ * are available. The use of 0x11 and 0x12 is deprecated
+ */
+#define KVM_FEATURE_CLOCKSOURCE2 3
+#define KVM_FEATURE_ASYNC_PF 4
+#define KVM_FEATURE_STEAL_TIME 5
+#define KVM_FEATURE_PV_EOI 6
+#define KVM_FEATURE_PV_UNHALT 7
+#define KVM_FEATURE_PV_TLB_FLUSH 9
+#define KVM_FEATURE_ASYNC_PF_VMEXIT 10
+#define KVM_FEATURE_PV_SEND_IPI 11
+
+#define KVM_HINTS_REALTIME 0
+
+/* The last 8 bits are used to indicate how to interpret the flags field
+ * in pvclock structure. If no bits are set, all flags are ignored.
+ */
+#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24
+
+#define MSR_KVM_WALL_CLOCK 0x11
+#define MSR_KVM_SYSTEM_TIME 0x12
+
+#define KVM_MSR_ENABLED 1
+/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
+#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
+#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
+#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
+#define MSR_KVM_STEAL_TIME 0x4b564d03
+#define MSR_KVM_PV_EOI_EN 0x4b564d04
+
+struct kvm_steal_time {
+ __u64 steal;
+ __u32 version;
+ __u32 flags;
+ __u8 preempted;
+ __u8 u8_pad[3];
+ __u32 pad[11];
+};
+
+#define KVM_VCPU_PREEMPTED (1 << 0)
+#define KVM_VCPU_FLUSH_TLB (1 << 1)
+
+#define KVM_CLOCK_PAIRING_WALLCLOCK 0
+struct kvm_clock_pairing {
+ __s64 sec;
+ __s64 nsec;
+ __u64 tsc;
+ __u32 flags;
+ __u32 pad[9];
+};
+
+#define KVM_STEAL_ALIGNMENT_BITS 5
+#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
+#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
+
+#define KVM_MAX_MMU_OP_BATCH 32
+
+#define KVM_ASYNC_PF_ENABLED (1 << 0)
+#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1)
+#define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT (1 << 2)
+
+/* Operations for KVM_HC_MMU_OP */
+#define KVM_MMU_OP_WRITE_PTE 1
+#define KVM_MMU_OP_FLUSH_TLB 2
+#define KVM_MMU_OP_RELEASE_PT 3
+
+/* Payload for KVM_HC_MMU_OP */
+struct kvm_mmu_op_header {
+ __u32 op;
+ __u32 pad;
+};
+
+struct kvm_mmu_op_write_pte {
+ struct kvm_mmu_op_header header;
+ __u64 pte_phys;
+ __u64 pte_val;
+};
+
+struct kvm_mmu_op_flush_tlb {
+ struct kvm_mmu_op_header header;
+};
+
+struct kvm_mmu_op_release_pt {
+ struct kvm_mmu_op_header header;
+ __u64 pt_phys;
+};
+
+#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
+#define KVM_PV_REASON_PAGE_READY 2
+
+struct kvm_vcpu_pv_apf_data {
+ __u32 reason;
+ __u8 pad[60];
+ __u32 enabled;
+};
+
+#define KVM_PV_EOI_BIT 0
+#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT)
+#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK
+#define KVM_PV_EOI_DISABLED 0x0
+
+#endif /* _UAPI_ASM_X86_KVM_PARA_H */
diff --git a/arch/x86/include/uapi/asm/kvm_perf.h b/arch/x86/include/uapi/asm/kvm_perf.h
new file mode 100644
index 000000000..125cf5cdf
--- /dev/null
+++ b/arch/x86/include/uapi/asm/kvm_perf.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_KVM_PERF_H
+#define _ASM_X86_KVM_PERF_H
+
+#include <asm/svm.h>
+#include <asm/vmx.h>
+#include <asm/kvm.h>
+
+#define DECODE_STR_LEN 20
+
+#define VCPU_ID "vcpu_id"
+
+#define KVM_ENTRY_TRACE "kvm:kvm_entry"
+#define KVM_EXIT_TRACE "kvm:kvm_exit"
+#define KVM_EXIT_REASON "exit_reason"
+
+#endif /* _ASM_X86_KVM_PERF_H */
diff --git a/arch/x86/include/uapi/asm/ldt.h b/arch/x86/include/uapi/asm/ldt.h
new file mode 100644
index 000000000..d62ac5db0
--- /dev/null
+++ b/arch/x86/include/uapi/asm/ldt.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * ldt.h
+ *
+ * Definitions of structures used with the modify_ldt system call.
+ */
+#ifndef _ASM_X86_LDT_H
+#define _ASM_X86_LDT_H
+
+/* Maximum number of LDT entries supported. */
+#define LDT_ENTRIES 8192
+/* The size of each LDT entry. */
+#define LDT_ENTRY_SIZE 8
+
+#ifndef __ASSEMBLY__
+/*
+ * Note on 64bit base and limit is ignored and you cannot set DS/ES/CS
+ * not to the default values if you still want to do syscalls. This
+ * call is more for 32bit mode therefore.
+ */
+struct user_desc {
+ unsigned int entry_number;
+ unsigned int base_addr;
+ unsigned int limit;
+ unsigned int seg_32bit:1;
+ unsigned int contents:2;
+ unsigned int read_exec_only:1;
+ unsigned int limit_in_pages:1;
+ unsigned int seg_not_present:1;
+ unsigned int useable:1;
+#ifdef __x86_64__
+ /*
+ * Because this bit is not present in 32-bit user code, user
+ * programs can pass uninitialized values here. Therefore, in
+ * any context in which a user_desc comes from a 32-bit program,
+ * the kernel must act as though lm == 0, regardless of the
+ * actual value.
+ */
+ unsigned int lm:1;
+#endif
+};
+
+#define MODIFY_LDT_CONTENTS_DATA 0
+#define MODIFY_LDT_CONTENTS_STACK 1
+#define MODIFY_LDT_CONTENTS_CODE 2
+
+#endif /* !__ASSEMBLY__ */
+#endif /* _ASM_X86_LDT_H */
diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h
new file mode 100644
index 000000000..955c2a2e1
--- /dev/null
+++ b/arch/x86/include/uapi/asm/mce.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_MCE_H
+#define _UAPI_ASM_X86_MCE_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+/*
+ * Fields are zero when not available. Also, this struct is shared with
+ * userspace mcelog and thus must keep existing fields at current offsets.
+ * Only add new fields to the end of the structure
+ */
+struct mce {
+ __u64 status; /* Bank's MCi_STATUS MSR */
+ __u64 misc; /* Bank's MCi_MISC MSR */
+ __u64 addr; /* Bank's MCi_ADDR MSR */
+ __u64 mcgstatus; /* Machine Check Global Status MSR */
+ __u64 ip; /* Instruction Pointer when the error happened */
+ __u64 tsc; /* CPU time stamp counter */
+ __u64 time; /* Wall time_t when error was detected */
+ __u8 cpuvendor; /* Kernel's X86_VENDOR enum */
+ __u8 inject_flags; /* Software inject flags */
+ __u8 severity; /* Error severity */
+ __u8 pad;
+ __u32 cpuid; /* CPUID 1 EAX */
+ __u8 cs; /* Code segment */
+ __u8 bank; /* Machine check bank reporting the error */
+ __u8 cpu; /* CPU number; obsoleted by extcpu */
+ __u8 finished; /* Entry is valid */
+ __u32 extcpu; /* Linux CPU number that detected the error */
+ __u32 socketid; /* CPU socket ID */
+ __u32 apicid; /* CPU initial APIC ID */
+ __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
+ __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */
+ __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */
+ __u64 ppin; /* Protected Processor Inventory Number */
+ __u32 microcode; /* Microcode revision */
+};
+
+#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
+#define MCE_GET_LOG_LEN _IOR('M', 2, int)
+#define MCE_GETCLEAR_FLAGS _IOR('M', 3, int)
+
+#endif /* _UAPI_ASM_X86_MCE_H */
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
new file mode 100644
index 000000000..d4a8d0424
--- /dev/null
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_MMAN_H
+#define _ASM_X86_MMAN_H
+
+#define MAP_32BIT 0x40 /* only give out 32bit addresses */
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+/*
+ * Take the 4 protection key bits out of the vma->vm_flags
+ * value and turn them in to the bits that we can put in
+ * to a pte.
+ *
+ * Only override these if Protection Keys are available
+ * (which is only on 64-bit).
+ */
+#define arch_vm_get_page_prot(vm_flags) __pgprot( \
+ ((vm_flags) & VM_PKEY_BIT0 ? _PAGE_PKEY_BIT0 : 0) | \
+ ((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) | \
+ ((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \
+ ((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
+
+#define arch_calc_vm_prot_bits(prot, key) ( \
+ ((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \
+ ((key) & 0x2 ? VM_PKEY_BIT1 : 0) | \
+ ((key) & 0x4 ? VM_PKEY_BIT2 : 0) | \
+ ((key) & 0x8 ? VM_PKEY_BIT3 : 0))
+#endif
+
+#include <asm-generic/mman.h>
+
+#endif /* _ASM_X86_MMAN_H */
diff --git a/arch/x86/include/uapi/asm/msgbuf.h b/arch/x86/include/uapi/asm/msgbuf.h
new file mode 100644
index 000000000..90ab9a795
--- /dev/null
+++ b/arch/x86/include/uapi/asm/msgbuf.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __ASM_X64_MSGBUF_H
+#define __ASM_X64_MSGBUF_H
+
+#if !defined(__x86_64__) || !defined(__ILP32__)
+#include <asm-generic/msgbuf.h>
+#else
+/*
+ * The msqid64_ds structure for x86 architecture with x32 ABI.
+ *
+ * On x86-32 and x86-64 we can just use the generic definition, but
+ * x32 uses the same binary layout as x86_64, which is differnet
+ * from other 32-bit architectures.
+ */
+
+struct msqid64_ds {
+ struct ipc64_perm msg_perm;
+ __kernel_time_t msg_stime; /* last msgsnd time */
+ __kernel_time_t msg_rtime; /* last msgrcv time */
+ __kernel_time_t msg_ctime; /* last change time */
+ __kernel_ulong_t msg_cbytes; /* current number of bytes on queue */
+ __kernel_ulong_t msg_qnum; /* number of messages in queue */
+ __kernel_ulong_t msg_qbytes; /* max number of bytes on queue */
+ __kernel_pid_t msg_lspid; /* pid of last msgsnd */
+ __kernel_pid_t msg_lrpid; /* last receive pid */
+ __kernel_ulong_t __unused4;
+ __kernel_ulong_t __unused5;
+};
+
+#endif
+
+#endif /* __ASM_GENERIC_MSGBUF_H */
diff --git a/arch/x86/include/uapi/asm/msr.h b/arch/x86/include/uapi/asm/msr.h
new file mode 100644
index 000000000..e7516b402
--- /dev/null
+++ b/arch/x86/include/uapi/asm/msr.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_MSR_H
+#define _UAPI_ASM_X86_MSR_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define X86_IOC_RDMSR_REGS _IOWR('c', 0xA0, __u32[8])
+#define X86_IOC_WRMSR_REGS _IOWR('c', 0xA1, __u32[8])
+
+#endif /* __ASSEMBLY__ */
+#endif /* _UAPI_ASM_X86_MSR_H */
diff --git a/arch/x86/include/uapi/asm/mtrr.h b/arch/x86/include/uapi/asm/mtrr.h
new file mode 100644
index 000000000..376563f2b
--- /dev/null
+++ b/arch/x86/include/uapi/asm/mtrr.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: LGPL-2.0+ WITH Linux-syscall-note */
+/* Generic MTRR (Memory Type Range Register) ioctls.
+
+ Copyright (C) 1997-1999 Richard Gooch
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public
+ License along with this library; if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ Richard Gooch may be reached by email at rgooch@atnf.csiro.au
+ The postal address is:
+ Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
+*/
+#ifndef _UAPI_ASM_X86_MTRR_H
+#define _UAPI_ASM_X86_MTRR_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+#include <linux/errno.h>
+
+#define MTRR_IOCTL_BASE 'M'
+
+/* Warning: this structure has a different order from i386
+ on x86-64. The 32bit emulation code takes care of that.
+ But you need to use this for 64bit, otherwise your X server
+ will break. */
+
+#ifdef __i386__
+struct mtrr_sentry {
+ unsigned long base; /* Base address */
+ unsigned int size; /* Size of region */
+ unsigned int type; /* Type of region */
+};
+
+struct mtrr_gentry {
+ unsigned int regnum; /* Register number */
+ unsigned long base; /* Base address */
+ unsigned int size; /* Size of region */
+ unsigned int type; /* Type of region */
+};
+
+#else /* __i386__ */
+
+struct mtrr_sentry {
+ __u64 base; /* Base address */
+ __u32 size; /* Size of region */
+ __u32 type; /* Type of region */
+};
+
+struct mtrr_gentry {
+ __u64 base; /* Base address */
+ __u32 size; /* Size of region */
+ __u32 regnum; /* Register number */
+ __u32 type; /* Type of region */
+ __u32 _pad; /* Unused */
+};
+
+#endif /* !__i386__ */
+
+struct mtrr_var_range {
+ __u32 base_lo;
+ __u32 base_hi;
+ __u32 mask_lo;
+ __u32 mask_hi;
+};
+
+/* In the Intel processor's MTRR interface, the MTRR type is always held in
+ an 8 bit field: */
+typedef __u8 mtrr_type;
+
+#define MTRR_NUM_FIXED_RANGES 88
+#define MTRR_MAX_VAR_RANGES 256
+
+struct mtrr_state_type {
+ struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES];
+ mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES];
+ unsigned char enabled;
+ unsigned char have_fixed;
+ mtrr_type def_type;
+};
+
+#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
+#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
+
+/* These are the various ioctls */
+#define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry)
+#define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry)
+#define MTRRIOC_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry)
+#define MTRRIOC_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry)
+#define MTRRIOC_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry)
+#define MTRRIOC_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry)
+#define MTRRIOC_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry)
+#define MTRRIOC_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry)
+#define MTRRIOC_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry)
+#define MTRRIOC_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry)
+
+/* MTRR memory types, which are defined in SDM */
+#define MTRR_TYPE_UNCACHABLE 0
+#define MTRR_TYPE_WRCOMB 1
+/*#define MTRR_TYPE_ 2*/
+/*#define MTRR_TYPE_ 3*/
+#define MTRR_TYPE_WRTHROUGH 4
+#define MTRR_TYPE_WRPROT 5
+#define MTRR_TYPE_WRBACK 6
+#define MTRR_NUM_TYPES 7
+
+/*
+ * Invalid MTRR memory type. mtrr_type_lookup() returns this value when
+ * MTRRs are disabled. Note, this value is allocated from the reserved
+ * values (0x7-0xff) of the MTRR memory types.
+ */
+#define MTRR_TYPE_INVALID 0xff
+
+#endif /* _UAPI_ASM_X86_MTRR_H */
diff --git a/arch/x86/include/uapi/asm/param.h b/arch/x86/include/uapi/asm/param.h
new file mode 100644
index 000000000..965d45427
--- /dev/null
+++ b/arch/x86/include/uapi/asm/param.h
@@ -0,0 +1 @@
+#include <asm-generic/param.h>
diff --git a/arch/x86/include/uapi/asm/perf_regs.h b/arch/x86/include/uapi/asm/perf_regs.h
new file mode 100644
index 000000000..f3329cabc
--- /dev/null
+++ b/arch/x86/include/uapi/asm/perf_regs.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_PERF_REGS_H
+#define _ASM_X86_PERF_REGS_H
+
+enum perf_event_x86_regs {
+ PERF_REG_X86_AX,
+ PERF_REG_X86_BX,
+ PERF_REG_X86_CX,
+ PERF_REG_X86_DX,
+ PERF_REG_X86_SI,
+ PERF_REG_X86_DI,
+ PERF_REG_X86_BP,
+ PERF_REG_X86_SP,
+ PERF_REG_X86_IP,
+ PERF_REG_X86_FLAGS,
+ PERF_REG_X86_CS,
+ PERF_REG_X86_SS,
+ PERF_REG_X86_DS,
+ PERF_REG_X86_ES,
+ PERF_REG_X86_FS,
+ PERF_REG_X86_GS,
+ PERF_REG_X86_R8,
+ PERF_REG_X86_R9,
+ PERF_REG_X86_R10,
+ PERF_REG_X86_R11,
+ PERF_REG_X86_R12,
+ PERF_REG_X86_R13,
+ PERF_REG_X86_R14,
+ PERF_REG_X86_R15,
+
+ PERF_REG_X86_32_MAX = PERF_REG_X86_GS + 1,
+ PERF_REG_X86_64_MAX = PERF_REG_X86_R15 + 1,
+};
+#endif /* _ASM_X86_PERF_REGS_H */
diff --git a/arch/x86/include/uapi/asm/posix_types.h b/arch/x86/include/uapi/asm/posix_types.h
new file mode 100644
index 000000000..c661e95f0
--- /dev/null
+++ b/arch/x86/include/uapi/asm/posix_types.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __KERNEL__
+# ifdef __i386__
+# include <asm/posix_types_32.h>
+# elif defined(__ILP32__)
+# include <asm/posix_types_x32.h>
+# else
+# include <asm/posix_types_64.h>
+# endif
+#endif
diff --git a/arch/x86/include/uapi/asm/posix_types_32.h b/arch/x86/include/uapi/asm/posix_types_32.h
new file mode 100644
index 000000000..840659f4b
--- /dev/null
+++ b/arch/x86/include/uapi/asm/posix_types_32.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_POSIX_TYPES_32_H
+#define _ASM_X86_POSIX_TYPES_32_H
+
+/*
+ * This file is generally used by user-level software, so you need to
+ * be a little careful about namespace pollution etc. Also, we cannot
+ * assume GCC is being used.
+ */
+
+typedef unsigned short __kernel_mode_t;
+#define __kernel_mode_t __kernel_mode_t
+
+typedef unsigned short __kernel_ipc_pid_t;
+#define __kernel_ipc_pid_t __kernel_ipc_pid_t
+
+typedef unsigned short __kernel_uid_t;
+typedef unsigned short __kernel_gid_t;
+#define __kernel_uid_t __kernel_uid_t
+
+typedef unsigned short __kernel_old_dev_t;
+#define __kernel_old_dev_t __kernel_old_dev_t
+
+#include <asm-generic/posix_types.h>
+
+#endif /* _ASM_X86_POSIX_TYPES_32_H */
diff --git a/arch/x86/include/uapi/asm/posix_types_64.h b/arch/x86/include/uapi/asm/posix_types_64.h
new file mode 100644
index 000000000..515afb805
--- /dev/null
+++ b/arch/x86/include/uapi/asm/posix_types_64.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_POSIX_TYPES_64_H
+#define _ASM_X86_POSIX_TYPES_64_H
+
+/*
+ * This file is generally used by user-level software, so you need to
+ * be a little careful about namespace pollution etc. Also, we cannot
+ * assume GCC is being used.
+ */
+
+typedef unsigned short __kernel_old_uid_t;
+typedef unsigned short __kernel_old_gid_t;
+#define __kernel_old_uid_t __kernel_old_uid_t
+
+typedef unsigned long __kernel_old_dev_t;
+#define __kernel_old_dev_t __kernel_old_dev_t
+
+#include <asm-generic/posix_types.h>
+
+#endif /* _ASM_X86_POSIX_TYPES_64_H */
diff --git a/arch/x86/include/uapi/asm/posix_types_x32.h b/arch/x86/include/uapi/asm/posix_types_x32.h
new file mode 100644
index 000000000..f60479b07
--- /dev/null
+++ b/arch/x86/include/uapi/asm/posix_types_x32.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_POSIX_TYPES_X32_H
+#define _ASM_X86_POSIX_TYPES_X32_H
+
+/*
+ * This file is only used by user-level software, so you need to
+ * be a little careful about namespace pollution etc. Also, we cannot
+ * assume GCC is being used.
+ *
+ * These types should generally match the ones used by the 64-bit kernel,
+ *
+ */
+
+typedef long long __kernel_long_t;
+typedef unsigned long long __kernel_ulong_t;
+#define __kernel_long_t __kernel_long_t
+
+#include <asm/posix_types_64.h>
+
+#endif /* _ASM_X86_POSIX_TYPES_X32_H */
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
new file mode 100644
index 000000000..5a6aac9fa
--- /dev/null
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_PRCTL_H
+#define _ASM_X86_PRCTL_H
+
+#define ARCH_SET_GS 0x1001
+#define ARCH_SET_FS 0x1002
+#define ARCH_GET_FS 0x1003
+#define ARCH_GET_GS 0x1004
+
+#define ARCH_GET_CPUID 0x1011
+#define ARCH_SET_CPUID 0x1012
+
+#define ARCH_MAP_VDSO_X32 0x2001
+#define ARCH_MAP_VDSO_32 0x2002
+#define ARCH_MAP_VDSO_64 0x2003
+
+#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
new file mode 100644
index 000000000..bcba3c643
--- /dev/null
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_PROCESSOR_FLAGS_H
+#define _UAPI_ASM_X86_PROCESSOR_FLAGS_H
+/* Various flags defined: can be included from assembler. */
+
+#include <linux/const.h>
+
+/*
+ * EFLAGS bits
+ */
+#define X86_EFLAGS_CF_BIT 0 /* Carry Flag */
+#define X86_EFLAGS_CF _BITUL(X86_EFLAGS_CF_BIT)
+#define X86_EFLAGS_FIXED_BIT 1 /* Bit 1 - always on */
+#define X86_EFLAGS_FIXED _BITUL(X86_EFLAGS_FIXED_BIT)
+#define X86_EFLAGS_PF_BIT 2 /* Parity Flag */
+#define X86_EFLAGS_PF _BITUL(X86_EFLAGS_PF_BIT)
+#define X86_EFLAGS_AF_BIT 4 /* Auxiliary carry Flag */
+#define X86_EFLAGS_AF _BITUL(X86_EFLAGS_AF_BIT)
+#define X86_EFLAGS_ZF_BIT 6 /* Zero Flag */
+#define X86_EFLAGS_ZF _BITUL(X86_EFLAGS_ZF_BIT)
+#define X86_EFLAGS_SF_BIT 7 /* Sign Flag */
+#define X86_EFLAGS_SF _BITUL(X86_EFLAGS_SF_BIT)
+#define X86_EFLAGS_TF_BIT 8 /* Trap Flag */
+#define X86_EFLAGS_TF _BITUL(X86_EFLAGS_TF_BIT)
+#define X86_EFLAGS_IF_BIT 9 /* Interrupt Flag */
+#define X86_EFLAGS_IF _BITUL(X86_EFLAGS_IF_BIT)
+#define X86_EFLAGS_DF_BIT 10 /* Direction Flag */
+#define X86_EFLAGS_DF _BITUL(X86_EFLAGS_DF_BIT)
+#define X86_EFLAGS_OF_BIT 11 /* Overflow Flag */
+#define X86_EFLAGS_OF _BITUL(X86_EFLAGS_OF_BIT)
+#define X86_EFLAGS_IOPL_BIT 12 /* I/O Privilege Level (2 bits) */
+#define X86_EFLAGS_IOPL (_AC(3,UL) << X86_EFLAGS_IOPL_BIT)
+#define X86_EFLAGS_NT_BIT 14 /* Nested Task */
+#define X86_EFLAGS_NT _BITUL(X86_EFLAGS_NT_BIT)
+#define X86_EFLAGS_RF_BIT 16 /* Resume Flag */
+#define X86_EFLAGS_RF _BITUL(X86_EFLAGS_RF_BIT)
+#define X86_EFLAGS_VM_BIT 17 /* Virtual Mode */
+#define X86_EFLAGS_VM _BITUL(X86_EFLAGS_VM_BIT)
+#define X86_EFLAGS_AC_BIT 18 /* Alignment Check/Access Control */
+#define X86_EFLAGS_AC _BITUL(X86_EFLAGS_AC_BIT)
+#define X86_EFLAGS_VIF_BIT 19 /* Virtual Interrupt Flag */
+#define X86_EFLAGS_VIF _BITUL(X86_EFLAGS_VIF_BIT)
+#define X86_EFLAGS_VIP_BIT 20 /* Virtual Interrupt Pending */
+#define X86_EFLAGS_VIP _BITUL(X86_EFLAGS_VIP_BIT)
+#define X86_EFLAGS_ID_BIT 21 /* CPUID detection */
+#define X86_EFLAGS_ID _BITUL(X86_EFLAGS_ID_BIT)
+
+/*
+ * Basic CPU control in CR0
+ */
+#define X86_CR0_PE_BIT 0 /* Protection Enable */
+#define X86_CR0_PE _BITUL(X86_CR0_PE_BIT)
+#define X86_CR0_MP_BIT 1 /* Monitor Coprocessor */
+#define X86_CR0_MP _BITUL(X86_CR0_MP_BIT)
+#define X86_CR0_EM_BIT 2 /* Emulation */
+#define X86_CR0_EM _BITUL(X86_CR0_EM_BIT)
+#define X86_CR0_TS_BIT 3 /* Task Switched */
+#define X86_CR0_TS _BITUL(X86_CR0_TS_BIT)
+#define X86_CR0_ET_BIT 4 /* Extension Type */
+#define X86_CR0_ET _BITUL(X86_CR0_ET_BIT)
+#define X86_CR0_NE_BIT 5 /* Numeric Error */
+#define X86_CR0_NE _BITUL(X86_CR0_NE_BIT)
+#define X86_CR0_WP_BIT 16 /* Write Protect */
+#define X86_CR0_WP _BITUL(X86_CR0_WP_BIT)
+#define X86_CR0_AM_BIT 18 /* Alignment Mask */
+#define X86_CR0_AM _BITUL(X86_CR0_AM_BIT)
+#define X86_CR0_NW_BIT 29 /* Not Write-through */
+#define X86_CR0_NW _BITUL(X86_CR0_NW_BIT)
+#define X86_CR0_CD_BIT 30 /* Cache Disable */
+#define X86_CR0_CD _BITUL(X86_CR0_CD_BIT)
+#define X86_CR0_PG_BIT 31 /* Paging */
+#define X86_CR0_PG _BITUL(X86_CR0_PG_BIT)
+
+/*
+ * Paging options in CR3
+ */
+#define X86_CR3_PWT_BIT 3 /* Page Write Through */
+#define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
+#define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
+#define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
+
+#define X86_CR3_PCID_BITS 12
+#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))
+
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
+
+/*
+ * Intel CPU features in CR4
+ */
+#define X86_CR4_VME_BIT 0 /* enable vm86 extensions */
+#define X86_CR4_VME _BITUL(X86_CR4_VME_BIT)
+#define X86_CR4_PVI_BIT 1 /* virtual interrupts flag enable */
+#define X86_CR4_PVI _BITUL(X86_CR4_PVI_BIT)
+#define X86_CR4_TSD_BIT 2 /* disable time stamp at ipl 3 */
+#define X86_CR4_TSD _BITUL(X86_CR4_TSD_BIT)
+#define X86_CR4_DE_BIT 3 /* enable debugging extensions */
+#define X86_CR4_DE _BITUL(X86_CR4_DE_BIT)
+#define X86_CR4_PSE_BIT 4 /* enable page size extensions */
+#define X86_CR4_PSE _BITUL(X86_CR4_PSE_BIT)
+#define X86_CR4_PAE_BIT 5 /* enable physical address extensions */
+#define X86_CR4_PAE _BITUL(X86_CR4_PAE_BIT)
+#define X86_CR4_MCE_BIT 6 /* Machine check enable */
+#define X86_CR4_MCE _BITUL(X86_CR4_MCE_BIT)
+#define X86_CR4_PGE_BIT 7 /* enable global pages */
+#define X86_CR4_PGE _BITUL(X86_CR4_PGE_BIT)
+#define X86_CR4_PCE_BIT 8 /* enable performance counters at ipl 3 */
+#define X86_CR4_PCE _BITUL(X86_CR4_PCE_BIT)
+#define X86_CR4_OSFXSR_BIT 9 /* enable fast FPU save and restore */
+#define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT)
+#define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */
+#define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT)
+#define X86_CR4_UMIP_BIT 11 /* enable UMIP support */
+#define X86_CR4_UMIP _BITUL(X86_CR4_UMIP_BIT)
+#define X86_CR4_LA57_BIT 12 /* enable 5-level page tables */
+#define X86_CR4_LA57 _BITUL(X86_CR4_LA57_BIT)
+#define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */
+#define X86_CR4_VMXE _BITUL(X86_CR4_VMXE_BIT)
+#define X86_CR4_SMXE_BIT 14 /* enable safer mode (TXT) */
+#define X86_CR4_SMXE _BITUL(X86_CR4_SMXE_BIT)
+#define X86_CR4_FSGSBASE_BIT 16 /* enable RDWRFSGS support */
+#define X86_CR4_FSGSBASE _BITUL(X86_CR4_FSGSBASE_BIT)
+#define X86_CR4_PCIDE_BIT 17 /* enable PCID support */
+#define X86_CR4_PCIDE _BITUL(X86_CR4_PCIDE_BIT)
+#define X86_CR4_OSXSAVE_BIT 18 /* enable xsave and xrestore */
+#define X86_CR4_OSXSAVE _BITUL(X86_CR4_OSXSAVE_BIT)
+#define X86_CR4_SMEP_BIT 20 /* enable SMEP support */
+#define X86_CR4_SMEP _BITUL(X86_CR4_SMEP_BIT)
+#define X86_CR4_SMAP_BIT 21 /* enable SMAP support */
+#define X86_CR4_SMAP _BITUL(X86_CR4_SMAP_BIT)
+#define X86_CR4_PKE_BIT 22 /* enable Protection Keys support */
+#define X86_CR4_PKE _BITUL(X86_CR4_PKE_BIT)
+
+/*
+ * x86-64 Task Priority Register, CR8
+ */
+#define X86_CR8_TPR _AC(0x0000000f,UL) /* task priority register */
+
+/*
+ * AMD and Transmeta use MSRs for configuration; see <asm/msr-index.h>
+ */
+
+/*
+ * NSC/Cyrix CPU configuration register indexes
+ */
+#define CX86_PCR0 0x20
+#define CX86_GCR 0xb8
+#define CX86_CCR0 0xc0
+#define CX86_CCR1 0xc1
+#define CX86_CCR2 0xc2
+#define CX86_CCR3 0xc3
+#define CX86_CCR4 0xe8
+#define CX86_CCR5 0xe9
+#define CX86_CCR6 0xea
+#define CX86_CCR7 0xeb
+#define CX86_PCR1 0xf0
+#define CX86_DIR0 0xfe
+#define CX86_DIR1 0xff
+#define CX86_ARR_BASE 0xc4
+#define CX86_RCR_BASE 0xdc
+
+#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
+ X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
+ X86_CR0_PG)
+
+#endif /* _UAPI_ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h
new file mode 100644
index 000000000..16074b9c9
--- /dev/null
+++ b/arch/x86/include/uapi/asm/ptrace-abi.h
@@ -0,0 +1,94 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_PTRACE_ABI_H
+#define _ASM_X86_PTRACE_ABI_H
+
+#ifdef __i386__
+
+#define EBX 0
+#define ECX 1
+#define EDX 2
+#define ESI 3
+#define EDI 4
+#define EBP 5
+#define EAX 6
+#define DS 7
+#define ES 8
+#define FS 9
+#define GS 10
+#define ORIG_EAX 11
+#define EIP 12
+#define CS 13
+#define EFL 14
+#define UESP 15
+#define SS 16
+#define FRAME_SIZE 17
+
+#else /* __i386__ */
+
+#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
+#define R15 0
+#define R14 8
+#define R13 16
+#define R12 24
+#define RBP 32
+#define RBX 40
+/* These regs are callee-clobbered. Always saved on kernel entry. */
+#define R11 48
+#define R10 56
+#define R9 64
+#define R8 72
+#define RAX 80
+#define RCX 88
+#define RDX 96
+#define RSI 104
+#define RDI 112
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+#define ORIG_RAX 120
+/* Return frame for iretq */
+#define RIP 128
+#define CS 136
+#define EFLAGS 144
+#define RSP 152
+#define SS 160
+#endif /* __ASSEMBLY__ */
+
+/* top of stack page */
+#define FRAME_SIZE 168
+
+#endif /* !__i386__ */
+
+/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
+#define PTRACE_GETREGS 12
+#define PTRACE_SETREGS 13
+#define PTRACE_GETFPREGS 14
+#define PTRACE_SETFPREGS 15
+#define PTRACE_GETFPXREGS 18
+#define PTRACE_SETFPXREGS 19
+
+#define PTRACE_OLDSETOPTIONS 21
+
+/* only useful for access 32bit programs / kernels */
+#define PTRACE_GET_THREAD_AREA 25
+#define PTRACE_SET_THREAD_AREA 26
+
+#ifdef __x86_64__
+# define PTRACE_ARCH_PRCTL 30
+#endif
+
+#define PTRACE_SYSEMU 31
+#define PTRACE_SYSEMU_SINGLESTEP 32
+
+#define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#endif
+
+#endif /* _ASM_X86_PTRACE_ABI_H */
diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h
new file mode 100644
index 000000000..85165c0ed
--- /dev/null
+++ b/arch/x86/include/uapi/asm/ptrace.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_PTRACE_H
+#define _UAPI_ASM_X86_PTRACE_H
+
+#include <linux/compiler.h> /* For __user */
+#include <asm/ptrace-abi.h>
+#include <asm/processor-flags.h>
+
+
+#ifndef __ASSEMBLY__
+
+#ifdef __i386__
+/* this struct defines the way the registers are stored on the
+ stack during a system call. */
+
+#ifndef __KERNEL__
+
+struct pt_regs {
+ long ebx;
+ long ecx;
+ long edx;
+ long esi;
+ long edi;
+ long ebp;
+ long eax;
+ int xds;
+ int xes;
+ int xfs;
+ int xgs;
+ long orig_eax;
+ long eip;
+ int xcs;
+ long eflags;
+ long esp;
+ int xss;
+};
+
+#endif /* __KERNEL__ */
+
+#else /* __i386__ */
+
+#ifndef __KERNEL__
+
+struct pt_regs {
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
+ unsigned long r15;
+ unsigned long r14;
+ unsigned long r13;
+ unsigned long r12;
+ unsigned long rbp;
+ unsigned long rbx;
+/* These regs are callee-clobbered. Always saved on kernel entry. */
+ unsigned long r11;
+ unsigned long r10;
+ unsigned long r9;
+ unsigned long r8;
+ unsigned long rax;
+ unsigned long rcx;
+ unsigned long rdx;
+ unsigned long rsi;
+ unsigned long rdi;
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+ unsigned long orig_rax;
+/* Return frame for iretq */
+ unsigned long rip;
+ unsigned long cs;
+ unsigned long eflags;
+ unsigned long rsp;
+ unsigned long ss;
+/* top of stack page */
+};
+
+#endif /* __KERNEL__ */
+#endif /* !__i386__ */
+
+
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _UAPI_ASM_X86_PTRACE_H */
diff --git a/arch/x86/include/uapi/asm/resource.h b/arch/x86/include/uapi/asm/resource.h
new file mode 100644
index 000000000..04bc4db89
--- /dev/null
+++ b/arch/x86/include/uapi/asm/resource.h
@@ -0,0 +1 @@
+#include <asm-generic/resource.h>
diff --git a/arch/x86/include/uapi/asm/sembuf.h b/arch/x86/include/uapi/asm/sembuf.h
new file mode 100644
index 000000000..89de6cd9f
--- /dev/null
+++ b/arch/x86/include/uapi/asm/sembuf.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_SEMBUF_H
+#define _ASM_X86_SEMBUF_H
+
+/*
+ * The semid64_ds structure for x86 architecture.
+ * Note extra padding because this structure is passed back and forth
+ * between kernel and user space.
+ *
+ * Pad space is left for:
+ * - 2 miscellaneous 32-bit values
+ *
+ * x86_64 and x32 incorrectly added padding here, so the structures
+ * are still incompatible with the padding on x86.
+ */
+struct semid64_ds {
+ struct ipc64_perm sem_perm; /* permissions .. see ipc.h */
+#ifdef __i386__
+ unsigned long sem_otime; /* last semop time */
+ unsigned long sem_otime_high;
+ unsigned long sem_ctime; /* last change time */
+ unsigned long sem_ctime_high;
+#else
+ __kernel_time_t sem_otime; /* last semop time */
+ __kernel_ulong_t __unused1;
+ __kernel_time_t sem_ctime; /* last change time */
+ __kernel_ulong_t __unused2;
+#endif
+ __kernel_ulong_t sem_nsems; /* no. of semaphores in array */
+ __kernel_ulong_t __unused3;
+ __kernel_ulong_t __unused4;
+};
+
+#endif /* _ASM_X86_SEMBUF_H */
diff --git a/arch/x86/include/uapi/asm/setup.h b/arch/x86/include/uapi/asm/setup.h
new file mode 100644
index 000000000..79a9626b5
--- /dev/null
+++ b/arch/x86/include/uapi/asm/setup.h
@@ -0,0 +1 @@
+/* */
diff --git a/arch/x86/include/uapi/asm/shmbuf.h b/arch/x86/include/uapi/asm/shmbuf.h
new file mode 100644
index 000000000..644421f38
--- /dev/null
+++ b/arch/x86/include/uapi/asm/shmbuf.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __ASM_X86_SHMBUF_H
+#define __ASM_X86_SHMBUF_H
+
+#if !defined(__x86_64__) || !defined(__ILP32__)
+#include <asm-generic/shmbuf.h>
+#else
+/*
+ * The shmid64_ds structure for x86 architecture with x32 ABI.
+ *
+ * On x86-32 and x86-64 we can just use the generic definition, but
+ * x32 uses the same binary layout as x86_64, which is differnet
+ * from other 32-bit architectures.
+ */
+
+struct shmid64_ds {
+ struct ipc64_perm shm_perm; /* operation perms */
+ size_t shm_segsz; /* size of segment (bytes) */
+ __kernel_time_t shm_atime; /* last attach time */
+ __kernel_time_t shm_dtime; /* last detach time */
+ __kernel_time_t shm_ctime; /* last change time */
+ __kernel_pid_t shm_cpid; /* pid of creator */
+ __kernel_pid_t shm_lpid; /* pid of last operator */
+ __kernel_ulong_t shm_nattch; /* no. of current attaches */
+ __kernel_ulong_t __unused4;
+ __kernel_ulong_t __unused5;
+};
+
+struct shminfo64 {
+ __kernel_ulong_t shmmax;
+ __kernel_ulong_t shmmin;
+ __kernel_ulong_t shmmni;
+ __kernel_ulong_t shmseg;
+ __kernel_ulong_t shmall;
+ __kernel_ulong_t __unused1;
+ __kernel_ulong_t __unused2;
+ __kernel_ulong_t __unused3;
+ __kernel_ulong_t __unused4;
+};
+
+#endif
+
+#endif /* __ASM_X86_SHMBUF_H */
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
new file mode 100644
index 000000000..844d60eb1
--- /dev/null
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -0,0 +1,389 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_SIGCONTEXT_H
+#define _UAPI_ASM_X86_SIGCONTEXT_H
+
+/*
+ * Linux signal context definitions. The sigcontext includes a complex
+ * hierarchy of CPU and FPU state, available to user-space (on the stack) when
+ * a signal handler is executed.
+ *
+ * As over the years this ABI grew from its very simple roots towards
+ * supporting more and more CPU state organically, some of the details (which
+ * were rather clever hacks back in the days) became a bit quirky by today.
+ *
+ * The current ABI includes flexible provisions for future extensions, so we
+ * won't have to grow new quirks for quite some time. Promise!
+ */
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+#define FP_XSTATE_MAGIC1 0x46505853U
+#define FP_XSTATE_MAGIC2 0x46505845U
+#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2)
+
+/*
+ * Bytes 464..511 in the current 512-byte layout of the FXSAVE/FXRSTOR frame
+ * are reserved for SW usage. On CPUs supporting XSAVE/XRSTOR, these bytes are
+ * used to extend the fpstate pointer in the sigcontext, which now includes the
+ * extended state information along with fpstate information.
+ *
+ * If sw_reserved.magic1 == FP_XSTATE_MAGIC1 then there's a
+ * sw_reserved.extended_size bytes large extended context area present. (The
+ * last 32-bit word of this extended area (at the
+ * fpstate+extended_size-FP_XSTATE_MAGIC2_SIZE address) is set to
+ * FP_XSTATE_MAGIC2 so that you can sanity check your size calculations.)
+ *
+ * This extended area typically grows with newer CPUs that have larger and
+ * larger XSAVE areas.
+ */
+struct _fpx_sw_bytes {
+ /*
+ * If set to FP_XSTATE_MAGIC1 then this is an xstate context.
+ * 0 if a legacy frame.
+ */
+ __u32 magic1;
+
+ /*
+ * Total size of the fpstate area:
+ *
+ * - if magic1 == 0 then it's sizeof(struct _fpstate)
+ * - if magic1 == FP_XSTATE_MAGIC1 then it's sizeof(struct _xstate)
+ * plus extensions (if any)
+ */
+ __u32 extended_size;
+
+ /*
+ * Feature bit mask (including FP/SSE/extended state) that is present
+ * in the memory layout:
+ */
+ __u64 xfeatures;
+
+ /*
+ * Actual XSAVE state size, based on the xfeatures saved in the layout.
+ * 'extended_size' is greater than 'xstate_size':
+ */
+ __u32 xstate_size;
+
+ /* For future use: */
+ __u32 padding[7];
+};
+
+/*
+ * As documented in the iBCS2 standard:
+ *
+ * The first part of "struct _fpstate" is just the normal i387 hardware setup,
+ * the extra "status" word is used to save the coprocessor status word before
+ * entering the handler.
+ *
+ * The FPU state data structure has had to grow to accommodate the extended FPU
+ * state required by the Streaming SIMD Extensions. There is no documented
+ * standard to accomplish this at the moment.
+ */
+
+/* 10-byte legacy floating point register: */
+struct _fpreg {
+ __u16 significand[4];
+ __u16 exponent;
+};
+
+/* 16-byte floating point register: */
+struct _fpxreg {
+ __u16 significand[4];
+ __u16 exponent;
+ __u16 padding[3];
+};
+
+/* 16-byte XMM register: */
+struct _xmmreg {
+ __u32 element[4];
+};
+
+#define X86_FXSR_MAGIC 0x0000
+
+/*
+ * The 32-bit FPU frame:
+ */
+struct _fpstate_32 {
+ /* Legacy FPU environment: */
+ __u32 cw;
+ __u32 sw;
+ __u32 tag;
+ __u32 ipoff;
+ __u32 cssel;
+ __u32 dataoff;
+ __u32 datasel;
+ struct _fpreg _st[8];
+ __u16 status;
+ __u16 magic; /* 0xffff: regular FPU data only */
+ /* 0x0000: FXSR FPU data */
+
+ /* FXSR FPU environment */
+ __u32 _fxsr_env[6]; /* FXSR FPU env is ignored */
+ __u32 mxcsr;
+ __u32 reserved;
+ struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */
+ struct _xmmreg _xmm[8]; /* First 8 XMM registers */
+ union {
+ __u32 padding1[44]; /* Second 8 XMM registers plus padding */
+ __u32 padding[44]; /* Alias name for old user-space */
+ };
+
+ union {
+ __u32 padding2[12];
+ struct _fpx_sw_bytes sw_reserved; /* Potential extended state is encoded here */
+ };
+};
+
+/*
+ * The 64-bit FPU frame. (FXSAVE format and later)
+ *
+ * Note1: If sw_reserved.magic1 == FP_XSTATE_MAGIC1 then the structure is
+ * larger: 'struct _xstate'. Note that 'struct _xstate' embedds
+ * 'struct _fpstate' so that you can always assume the _fpstate portion
+ * exists so that you can check the magic value.
+ *
+ * Note2: Reserved fields may someday contain valuable data. Always
+ * save/restore them when you change signal frames.
+ */
+struct _fpstate_64 {
+ __u16 cwd;
+ __u16 swd;
+ /* Note this is not the same as the 32-bit/x87/FSAVE twd: */
+ __u16 twd;
+ __u16 fop;
+ __u64 rip;
+ __u64 rdp;
+ __u32 mxcsr;
+ __u32 mxcsr_mask;
+ __u32 st_space[32]; /* 8x FP registers, 16 bytes each */
+ __u32 xmm_space[64]; /* 16x XMM registers, 16 bytes each */
+ __u32 reserved2[12];
+ union {
+ __u32 reserved3[12];
+ struct _fpx_sw_bytes sw_reserved; /* Potential extended state is encoded here */
+ };
+};
+
+#ifdef __i386__
+# define _fpstate _fpstate_32
+#else
+# define _fpstate _fpstate_64
+#endif
+
+struct _header {
+ __u64 xfeatures;
+ __u64 reserved1[2];
+ __u64 reserved2[5];
+};
+
+struct _ymmh_state {
+ /* 16x YMM registers, 16 bytes each: */
+ __u32 ymmh_space[64];
+};
+
+/*
+ * Extended state pointed to by sigcontext::fpstate.
+ *
+ * In addition to the fpstate, information encoded in _xstate::xstate_hdr
+ * indicates the presence of other extended state information supported
+ * by the CPU and kernel:
+ */
+struct _xstate {
+ struct _fpstate fpstate;
+ struct _header xstate_hdr;
+ struct _ymmh_state ymmh;
+ /* New processor state extensions go here: */
+};
+
+/*
+ * The 32-bit signal frame:
+ */
+struct sigcontext_32 {
+ __u16 gs, __gsh;
+ __u16 fs, __fsh;
+ __u16 es, __esh;
+ __u16 ds, __dsh;
+ __u32 di;
+ __u32 si;
+ __u32 bp;
+ __u32 sp;
+ __u32 bx;
+ __u32 dx;
+ __u32 cx;
+ __u32 ax;
+ __u32 trapno;
+ __u32 err;
+ __u32 ip;
+ __u16 cs, __csh;
+ __u32 flags;
+ __u32 sp_at_signal;
+ __u16 ss, __ssh;
+
+ /*
+ * fpstate is really (struct _fpstate *) or (struct _xstate *)
+ * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
+ * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
+ * of extended memory layout. See comments at the definition of
+ * (struct _fpx_sw_bytes)
+ */
+ __u32 fpstate; /* Zero when no FPU/extended context */
+ __u32 oldmask;
+ __u32 cr2;
+};
+
+/*
+ * The 64-bit signal frame:
+ */
+struct sigcontext_64 {
+ __u64 r8;
+ __u64 r9;
+ __u64 r10;
+ __u64 r11;
+ __u64 r12;
+ __u64 r13;
+ __u64 r14;
+ __u64 r15;
+ __u64 di;
+ __u64 si;
+ __u64 bp;
+ __u64 bx;
+ __u64 dx;
+ __u64 ax;
+ __u64 cx;
+ __u64 sp;
+ __u64 ip;
+ __u64 flags;
+ __u16 cs;
+ __u16 gs;
+ __u16 fs;
+ __u16 ss;
+ __u64 err;
+ __u64 trapno;
+ __u64 oldmask;
+ __u64 cr2;
+
+ /*
+ * fpstate is really (struct _fpstate *) or (struct _xstate *)
+ * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
+ * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
+ * of extended memory layout. See comments at the definition of
+ * (struct _fpx_sw_bytes)
+ */
+ __u64 fpstate; /* Zero when no FPU/extended context */
+ __u64 reserved1[8];
+};
+
+/*
+ * Create the real 'struct sigcontext' type:
+ */
+#ifdef __KERNEL__
+# ifdef __i386__
+# define sigcontext sigcontext_32
+# else
+# define sigcontext sigcontext_64
+# endif
+#endif
+
+/*
+ * The old user-space sigcontext definition, just in case user-space still
+ * relies on it. The kernel definition (in asm/sigcontext.h) has unified
+ * field names but otherwise the same layout.
+ */
+#ifndef __KERNEL__
+
+#define _fpstate_ia32 _fpstate_32
+#define sigcontext_ia32 sigcontext_32
+
+
+# ifdef __i386__
+struct sigcontext {
+ __u16 gs, __gsh;
+ __u16 fs, __fsh;
+ __u16 es, __esh;
+ __u16 ds, __dsh;
+ __u32 edi;
+ __u32 esi;
+ __u32 ebp;
+ __u32 esp;
+ __u32 ebx;
+ __u32 edx;
+ __u32 ecx;
+ __u32 eax;
+ __u32 trapno;
+ __u32 err;
+ __u32 eip;
+ __u16 cs, __csh;
+ __u32 eflags;
+ __u32 esp_at_signal;
+ __u16 ss, __ssh;
+ struct _fpstate __user *fpstate;
+ __u32 oldmask;
+ __u32 cr2;
+};
+# else /* __x86_64__: */
+struct sigcontext {
+ __u64 r8;
+ __u64 r9;
+ __u64 r10;
+ __u64 r11;
+ __u64 r12;
+ __u64 r13;
+ __u64 r14;
+ __u64 r15;
+ __u64 rdi;
+ __u64 rsi;
+ __u64 rbp;
+ __u64 rbx;
+ __u64 rdx;
+ __u64 rax;
+ __u64 rcx;
+ __u64 rsp;
+ __u64 rip;
+ __u64 eflags; /* RFLAGS */
+ __u16 cs;
+
+ /*
+ * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"),
+ * Linux saved and restored fs and gs in these slots. This
+ * was counterproductive, as fsbase and gsbase were never
+ * saved, so arch_prctl was presumably unreliable.
+ *
+ * These slots should never be reused without extreme caution:
+ *
+ * - Some DOSEMU versions stash fs and gs in these slots manually,
+ * thus overwriting anything the kernel expects to be preserved
+ * in these slots.
+ *
+ * - If these slots are ever needed for any other purpose,
+ * there is some risk that very old 64-bit binaries could get
+ * confused. I doubt that many such binaries still work,
+ * though, since the same patch in 2.5.64 also removed the
+ * 64-bit set_thread_area syscall, so it appears that there
+ * is no TLS API beyond modify_ldt that works in both pre-
+ * and post-2.5.64 kernels.
+ *
+ * If the kernel ever adds explicit fs, gs, fsbase, and gsbase
+ * save/restore, it will most likely need to be opt-in and use
+ * different context slots.
+ */
+ __u16 gs;
+ __u16 fs;
+ union {
+ __u16 ss; /* If UC_SIGCONTEXT_SS */
+ __u16 __pad0; /* Alias name for old (!UC_SIGCONTEXT_SS) user-space */
+ };
+ __u64 err;
+ __u64 trapno;
+ __u64 oldmask;
+ __u64 cr2;
+ struct _fpstate __user *fpstate; /* Zero when no FPU context */
+# ifdef __ILP32__
+ __u32 __fpstate_pad;
+# endif
+ __u64 reserved1[8];
+};
+# endif /* __x86_64__ */
+#endif /* !__KERNEL__ */
+
+#endif /* _UAPI_ASM_X86_SIGCONTEXT_H */
diff --git a/arch/x86/include/uapi/asm/sigcontext32.h b/arch/x86/include/uapi/asm/sigcontext32.h
new file mode 100644
index 000000000..6b18e88de
--- /dev/null
+++ b/arch/x86/include/uapi/asm/sigcontext32.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SIGCONTEXT32_H
+#define _ASM_X86_SIGCONTEXT32_H
+
+/* This is a legacy file - all the type definitions are in sigcontext.h: */
+
+#include <asm/sigcontext.h>
+
+#endif /* _ASM_X86_SIGCONTEXT32_H */
diff --git a/arch/x86/include/uapi/asm/siginfo.h b/arch/x86/include/uapi/asm/siginfo.h
new file mode 100644
index 000000000..b3d157957
--- /dev/null
+++ b/arch/x86/include/uapi/asm/siginfo.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_SIGINFO_H
+#define _ASM_X86_SIGINFO_H
+
+#ifdef __x86_64__
+# ifdef __ILP32__ /* x32 */
+typedef long long __kernel_si_clock_t __attribute__((aligned(4)));
+# define __ARCH_SI_CLOCK_T __kernel_si_clock_t
+# define __ARCH_SI_ATTRIBUTES __attribute__((aligned(8)))
+# else /* x86-64 */
+# define __ARCH_SI_PREAMBLE_SIZE (4 * sizeof(int))
+# endif
+#endif
+
+#include <asm-generic/siginfo.h>
+
+#endif /* _ASM_X86_SIGINFO_H */
diff --git a/arch/x86/include/uapi/asm/signal.h b/arch/x86/include/uapi/asm/signal.h
new file mode 100644
index 000000000..e5745d593
--- /dev/null
+++ b/arch/x86/include/uapi/asm/signal.h
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_SIGNAL_H
+#define _UAPI_ASM_X86_SIGNAL_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/compiler.h>
+
+/* Avoid too many header ordering problems. */
+struct siginfo;
+
+#ifndef __KERNEL__
+/* Here we must cater to libcs that poke about in kernel headers. */
+
+#define NSIG 32
+typedef unsigned long sigset_t;
+
+#endif /* __KERNEL__ */
+#endif /* __ASSEMBLY__ */
+
+
+#define SIGHUP 1
+#define SIGINT 2
+#define SIGQUIT 3
+#define SIGILL 4
+#define SIGTRAP 5
+#define SIGABRT 6
+#define SIGIOT 6
+#define SIGBUS 7
+#define SIGFPE 8
+#define SIGKILL 9
+#define SIGUSR1 10
+#define SIGSEGV 11
+#define SIGUSR2 12
+#define SIGPIPE 13
+#define SIGALRM 14
+#define SIGTERM 15
+#define SIGSTKFLT 16
+#define SIGCHLD 17
+#define SIGCONT 18
+#define SIGSTOP 19
+#define SIGTSTP 20
+#define SIGTTIN 21
+#define SIGTTOU 22
+#define SIGURG 23
+#define SIGXCPU 24
+#define SIGXFSZ 25
+#define SIGVTALRM 26
+#define SIGPROF 27
+#define SIGWINCH 28
+#define SIGIO 29
+#define SIGPOLL SIGIO
+/*
+#define SIGLOST 29
+*/
+#define SIGPWR 30
+#define SIGSYS 31
+#define SIGUNUSED 31
+
+/* These should not be considered constants from userland. */
+#define SIGRTMIN 32
+#define SIGRTMAX _NSIG
+
+/*
+ * SA_FLAGS values:
+ *
+ * SA_ONSTACK indicates that a registered stack_t will be used.
+ * SA_RESTART flag to get restarting signals (which were the default long ago)
+ * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop.
+ * SA_RESETHAND clears the handler when the signal is delivered.
+ * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies.
+ * SA_NODEFER prevents the current signal from being masked in the handler.
+ *
+ * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single
+ * Unix names RESETHAND and NODEFER respectively.
+ */
+#define SA_NOCLDSTOP 0x00000001u
+#define SA_NOCLDWAIT 0x00000002u
+#define SA_SIGINFO 0x00000004u
+#define SA_ONSTACK 0x08000000u
+#define SA_RESTART 0x10000000u
+#define SA_NODEFER 0x40000000u
+#define SA_RESETHAND 0x80000000u
+
+#define SA_NOMASK SA_NODEFER
+#define SA_ONESHOT SA_RESETHAND
+
+#define SA_RESTORER 0x04000000
+
+#define MINSIGSTKSZ 2048
+#define SIGSTKSZ 8192
+
+#include <asm-generic/signal-defs.h>
+
+#ifndef __ASSEMBLY__
+
+
+# ifndef __KERNEL__
+/* Here we must cater to libcs that poke about in kernel headers. */
+#ifdef __i386__
+
+struct sigaction {
+ union {
+ __sighandler_t _sa_handler;
+ void (*_sa_sigaction)(int, struct siginfo *, void *);
+ } _u;
+ sigset_t sa_mask;
+ unsigned long sa_flags;
+ void (*sa_restorer)(void);
+};
+
+#define sa_handler _u._sa_handler
+#define sa_sigaction _u._sa_sigaction
+
+#else /* __i386__ */
+
+struct sigaction {
+ __sighandler_t sa_handler;
+ unsigned long sa_flags;
+ __sigrestore_t sa_restorer;
+ sigset_t sa_mask; /* mask last for extensibility */
+};
+
+#endif /* !__i386__ */
+# endif /* ! __KERNEL__ */
+
+typedef struct sigaltstack {
+ void __user *ss_sp;
+ int ss_flags;
+ size_t ss_size;
+} stack_t;
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _UAPI_ASM_X86_SIGNAL_H */
diff --git a/arch/x86/include/uapi/asm/socket.h b/arch/x86/include/uapi/asm/socket.h
new file mode 100644
index 000000000..6b71384b9
--- /dev/null
+++ b/arch/x86/include/uapi/asm/socket.h
@@ -0,0 +1 @@
+#include <asm-generic/socket.h>
diff --git a/arch/x86/include/uapi/asm/sockios.h b/arch/x86/include/uapi/asm/sockios.h
new file mode 100644
index 000000000..def6d4746
--- /dev/null
+++ b/arch/x86/include/uapi/asm/sockios.h
@@ -0,0 +1 @@
+#include <asm-generic/sockios.h>
diff --git a/arch/x86/include/uapi/asm/stat.h b/arch/x86/include/uapi/asm/stat.h
new file mode 100644
index 000000000..9e3982d95
--- /dev/null
+++ b/arch/x86/include/uapi/asm/stat.h
@@ -0,0 +1,138 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_STAT_H
+#define _ASM_X86_STAT_H
+
+#include <asm/posix_types.h>
+
+#define STAT_HAVE_NSEC 1
+
+#ifdef __i386__
+struct stat {
+ unsigned long st_dev;
+ unsigned long st_ino;
+ unsigned short st_mode;
+ unsigned short st_nlink;
+ unsigned short st_uid;
+ unsigned short st_gid;
+ unsigned long st_rdev;
+ unsigned long st_size;
+ unsigned long st_blksize;
+ unsigned long st_blocks;
+ unsigned long st_atime;
+ unsigned long st_atime_nsec;
+ unsigned long st_mtime;
+ unsigned long st_mtime_nsec;
+ unsigned long st_ctime;
+ unsigned long st_ctime_nsec;
+ unsigned long __unused4;
+ unsigned long __unused5;
+};
+
+/* We don't need to memset the whole thing just to initialize the padding */
+#define INIT_STRUCT_STAT_PADDING(st) do { \
+ st.__unused4 = 0; \
+ st.__unused5 = 0; \
+} while (0)
+
+#define STAT64_HAS_BROKEN_ST_INO 1
+
+/* This matches struct stat64 in glibc2.1, hence the absolutely
+ * insane amounts of padding around dev_t's.
+ */
+struct stat64 {
+ unsigned long long st_dev;
+ unsigned char __pad0[4];
+
+ unsigned long __st_ino;
+
+ unsigned int st_mode;
+ unsigned int st_nlink;
+
+ unsigned long st_uid;
+ unsigned long st_gid;
+
+ unsigned long long st_rdev;
+ unsigned char __pad3[4];
+
+ long long st_size;
+ unsigned long st_blksize;
+
+ /* Number 512-byte blocks allocated. */
+ unsigned long long st_blocks;
+
+ unsigned long st_atime;
+ unsigned long st_atime_nsec;
+
+ unsigned long st_mtime;
+ unsigned int st_mtime_nsec;
+
+ unsigned long st_ctime;
+ unsigned long st_ctime_nsec;
+
+ unsigned long long st_ino;
+};
+
+/* We don't need to memset the whole thing just to initialize the padding */
+#define INIT_STRUCT_STAT64_PADDING(st) do { \
+ memset(&st.__pad0, 0, sizeof(st.__pad0)); \
+ memset(&st.__pad3, 0, sizeof(st.__pad3)); \
+} while (0)
+
+#else /* __i386__ */
+
+struct stat {
+ __kernel_ulong_t st_dev;
+ __kernel_ulong_t st_ino;
+ __kernel_ulong_t st_nlink;
+
+ unsigned int st_mode;
+ unsigned int st_uid;
+ unsigned int st_gid;
+ unsigned int __pad0;
+ __kernel_ulong_t st_rdev;
+ __kernel_long_t st_size;
+ __kernel_long_t st_blksize;
+ __kernel_long_t st_blocks; /* Number 512-byte blocks allocated. */
+
+ __kernel_ulong_t st_atime;
+ __kernel_ulong_t st_atime_nsec;
+ __kernel_ulong_t st_mtime;
+ __kernel_ulong_t st_mtime_nsec;
+ __kernel_ulong_t st_ctime;
+ __kernel_ulong_t st_ctime_nsec;
+ __kernel_long_t __unused[3];
+};
+
+/* We don't need to memset the whole thing just to initialize the padding */
+#define INIT_STRUCT_STAT_PADDING(st) do { \
+ st.__pad0 = 0; \
+ st.__unused[0] = 0; \
+ st.__unused[1] = 0; \
+ st.__unused[2] = 0; \
+} while (0)
+
+#endif
+
+/* for 32bit emulation and 32 bit kernels */
+struct __old_kernel_stat {
+ unsigned short st_dev;
+ unsigned short st_ino;
+ unsigned short st_mode;
+ unsigned short st_nlink;
+ unsigned short st_uid;
+ unsigned short st_gid;
+ unsigned short st_rdev;
+#ifdef __i386__
+ unsigned long st_size;
+ unsigned long st_atime;
+ unsigned long st_mtime;
+ unsigned long st_ctime;
+#else
+ unsigned int st_size;
+ unsigned int st_atime;
+ unsigned int st_mtime;
+ unsigned int st_ctime;
+#endif
+};
+
+#endif /* _ASM_X86_STAT_H */
diff --git a/arch/x86/include/uapi/asm/statfs.h b/arch/x86/include/uapi/asm/statfs.h
new file mode 100644
index 000000000..13c2464cd
--- /dev/null
+++ b/arch/x86/include/uapi/asm/statfs.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_STATFS_H
+#define _ASM_X86_STATFS_H
+
+/*
+ * We need compat_statfs64 to be packed, because the i386 ABI won't
+ * add padding at the end to bring it to a multiple of 8 bytes, but
+ * the x86_64 ABI will.
+ */
+#define ARCH_PACK_COMPAT_STATFS64 __attribute__((packed,aligned(4)))
+
+#include <asm-generic/statfs.h>
+#endif /* _ASM_X86_STATFS_H */
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
new file mode 100644
index 000000000..a9731f8a4
--- /dev/null
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -0,0 +1,179 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI__SVM_H
+#define _UAPI__SVM_H
+
+#define SVM_EXIT_READ_CR0 0x000
+#define SVM_EXIT_READ_CR2 0x002
+#define SVM_EXIT_READ_CR3 0x003
+#define SVM_EXIT_READ_CR4 0x004
+#define SVM_EXIT_READ_CR8 0x008
+#define SVM_EXIT_WRITE_CR0 0x010
+#define SVM_EXIT_WRITE_CR2 0x012
+#define SVM_EXIT_WRITE_CR3 0x013
+#define SVM_EXIT_WRITE_CR4 0x014
+#define SVM_EXIT_WRITE_CR8 0x018
+#define SVM_EXIT_READ_DR0 0x020
+#define SVM_EXIT_READ_DR1 0x021
+#define SVM_EXIT_READ_DR2 0x022
+#define SVM_EXIT_READ_DR3 0x023
+#define SVM_EXIT_READ_DR4 0x024
+#define SVM_EXIT_READ_DR5 0x025
+#define SVM_EXIT_READ_DR6 0x026
+#define SVM_EXIT_READ_DR7 0x027
+#define SVM_EXIT_WRITE_DR0 0x030
+#define SVM_EXIT_WRITE_DR1 0x031
+#define SVM_EXIT_WRITE_DR2 0x032
+#define SVM_EXIT_WRITE_DR3 0x033
+#define SVM_EXIT_WRITE_DR4 0x034
+#define SVM_EXIT_WRITE_DR5 0x035
+#define SVM_EXIT_WRITE_DR6 0x036
+#define SVM_EXIT_WRITE_DR7 0x037
+#define SVM_EXIT_EXCP_BASE 0x040
+#define SVM_EXIT_INTR 0x060
+#define SVM_EXIT_NMI 0x061
+#define SVM_EXIT_SMI 0x062
+#define SVM_EXIT_INIT 0x063
+#define SVM_EXIT_VINTR 0x064
+#define SVM_EXIT_CR0_SEL_WRITE 0x065
+#define SVM_EXIT_IDTR_READ 0x066
+#define SVM_EXIT_GDTR_READ 0x067
+#define SVM_EXIT_LDTR_READ 0x068
+#define SVM_EXIT_TR_READ 0x069
+#define SVM_EXIT_IDTR_WRITE 0x06a
+#define SVM_EXIT_GDTR_WRITE 0x06b
+#define SVM_EXIT_LDTR_WRITE 0x06c
+#define SVM_EXIT_TR_WRITE 0x06d
+#define SVM_EXIT_RDTSC 0x06e
+#define SVM_EXIT_RDPMC 0x06f
+#define SVM_EXIT_PUSHF 0x070
+#define SVM_EXIT_POPF 0x071
+#define SVM_EXIT_CPUID 0x072
+#define SVM_EXIT_RSM 0x073
+#define SVM_EXIT_IRET 0x074
+#define SVM_EXIT_SWINT 0x075
+#define SVM_EXIT_INVD 0x076
+#define SVM_EXIT_PAUSE 0x077
+#define SVM_EXIT_HLT 0x078
+#define SVM_EXIT_INVLPG 0x079
+#define SVM_EXIT_INVLPGA 0x07a
+#define SVM_EXIT_IOIO 0x07b
+#define SVM_EXIT_MSR 0x07c
+#define SVM_EXIT_TASK_SWITCH 0x07d
+#define SVM_EXIT_FERR_FREEZE 0x07e
+#define SVM_EXIT_SHUTDOWN 0x07f
+#define SVM_EXIT_VMRUN 0x080
+#define SVM_EXIT_VMMCALL 0x081
+#define SVM_EXIT_VMLOAD 0x082
+#define SVM_EXIT_VMSAVE 0x083
+#define SVM_EXIT_STGI 0x084
+#define SVM_EXIT_CLGI 0x085
+#define SVM_EXIT_SKINIT 0x086
+#define SVM_EXIT_RDTSCP 0x087
+#define SVM_EXIT_ICEBP 0x088
+#define SVM_EXIT_WBINVD 0x089
+#define SVM_EXIT_MONITOR 0x08a
+#define SVM_EXIT_MWAIT 0x08b
+#define SVM_EXIT_MWAIT_COND 0x08c
+#define SVM_EXIT_XSETBV 0x08d
+#define SVM_EXIT_NPF 0x400
+#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401
+#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402
+
+#define SVM_EXIT_ERR -1
+
+#define SVM_EXIT_REASONS \
+ { SVM_EXIT_READ_CR0, "read_cr0" }, \
+ { SVM_EXIT_READ_CR2, "read_cr2" }, \
+ { SVM_EXIT_READ_CR3, "read_cr3" }, \
+ { SVM_EXIT_READ_CR4, "read_cr4" }, \
+ { SVM_EXIT_READ_CR8, "read_cr8" }, \
+ { SVM_EXIT_WRITE_CR0, "write_cr0" }, \
+ { SVM_EXIT_WRITE_CR2, "write_cr2" }, \
+ { SVM_EXIT_WRITE_CR3, "write_cr3" }, \
+ { SVM_EXIT_WRITE_CR4, "write_cr4" }, \
+ { SVM_EXIT_WRITE_CR8, "write_cr8" }, \
+ { SVM_EXIT_READ_DR0, "read_dr0" }, \
+ { SVM_EXIT_READ_DR1, "read_dr1" }, \
+ { SVM_EXIT_READ_DR2, "read_dr2" }, \
+ { SVM_EXIT_READ_DR3, "read_dr3" }, \
+ { SVM_EXIT_READ_DR4, "read_dr4" }, \
+ { SVM_EXIT_READ_DR5, "read_dr5" }, \
+ { SVM_EXIT_READ_DR6, "read_dr6" }, \
+ { SVM_EXIT_READ_DR7, "read_dr7" }, \
+ { SVM_EXIT_WRITE_DR0, "write_dr0" }, \
+ { SVM_EXIT_WRITE_DR1, "write_dr1" }, \
+ { SVM_EXIT_WRITE_DR2, "write_dr2" }, \
+ { SVM_EXIT_WRITE_DR3, "write_dr3" }, \
+ { SVM_EXIT_WRITE_DR4, "write_dr4" }, \
+ { SVM_EXIT_WRITE_DR5, "write_dr5" }, \
+ { SVM_EXIT_WRITE_DR6, "write_dr6" }, \
+ { SVM_EXIT_WRITE_DR7, "write_dr7" }, \
+ { SVM_EXIT_EXCP_BASE + DE_VECTOR, "DE excp" }, \
+ { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \
+ { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \
+ { SVM_EXIT_EXCP_BASE + OF_VECTOR, "OF excp" }, \
+ { SVM_EXIT_EXCP_BASE + BR_VECTOR, "BR excp" }, \
+ { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \
+ { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \
+ { SVM_EXIT_EXCP_BASE + DF_VECTOR, "DF excp" }, \
+ { SVM_EXIT_EXCP_BASE + TS_VECTOR, "TS excp" }, \
+ { SVM_EXIT_EXCP_BASE + NP_VECTOR, "NP excp" }, \
+ { SVM_EXIT_EXCP_BASE + SS_VECTOR, "SS excp" }, \
+ { SVM_EXIT_EXCP_BASE + GP_VECTOR, "GP excp" }, \
+ { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \
+ { SVM_EXIT_EXCP_BASE + MF_VECTOR, "MF excp" }, \
+ { SVM_EXIT_EXCP_BASE + AC_VECTOR, "AC excp" }, \
+ { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \
+ { SVM_EXIT_EXCP_BASE + XM_VECTOR, "XF excp" }, \
+ { SVM_EXIT_INTR, "interrupt" }, \
+ { SVM_EXIT_NMI, "nmi" }, \
+ { SVM_EXIT_SMI, "smi" }, \
+ { SVM_EXIT_INIT, "init" }, \
+ { SVM_EXIT_VINTR, "vintr" }, \
+ { SVM_EXIT_CR0_SEL_WRITE, "cr0_sel_write" }, \
+ { SVM_EXIT_IDTR_READ, "read_idtr" }, \
+ { SVM_EXIT_GDTR_READ, "read_gdtr" }, \
+ { SVM_EXIT_LDTR_READ, "read_ldtr" }, \
+ { SVM_EXIT_TR_READ, "read_rt" }, \
+ { SVM_EXIT_IDTR_WRITE, "write_idtr" }, \
+ { SVM_EXIT_GDTR_WRITE, "write_gdtr" }, \
+ { SVM_EXIT_LDTR_WRITE, "write_ldtr" }, \
+ { SVM_EXIT_TR_WRITE, "write_rt" }, \
+ { SVM_EXIT_RDTSC, "rdtsc" }, \
+ { SVM_EXIT_RDPMC, "rdpmc" }, \
+ { SVM_EXIT_PUSHF, "pushf" }, \
+ { SVM_EXIT_POPF, "popf" }, \
+ { SVM_EXIT_CPUID, "cpuid" }, \
+ { SVM_EXIT_RSM, "rsm" }, \
+ { SVM_EXIT_IRET, "iret" }, \
+ { SVM_EXIT_SWINT, "swint" }, \
+ { SVM_EXIT_INVD, "invd" }, \
+ { SVM_EXIT_PAUSE, "pause" }, \
+ { SVM_EXIT_HLT, "hlt" }, \
+ { SVM_EXIT_INVLPG, "invlpg" }, \
+ { SVM_EXIT_INVLPGA, "invlpga" }, \
+ { SVM_EXIT_IOIO, "io" }, \
+ { SVM_EXIT_MSR, "msr" }, \
+ { SVM_EXIT_TASK_SWITCH, "task_switch" }, \
+ { SVM_EXIT_FERR_FREEZE, "ferr_freeze" }, \
+ { SVM_EXIT_SHUTDOWN, "shutdown" }, \
+ { SVM_EXIT_VMRUN, "vmrun" }, \
+ { SVM_EXIT_VMMCALL, "hypercall" }, \
+ { SVM_EXIT_VMLOAD, "vmload" }, \
+ { SVM_EXIT_VMSAVE, "vmsave" }, \
+ { SVM_EXIT_STGI, "stgi" }, \
+ { SVM_EXIT_CLGI, "clgi" }, \
+ { SVM_EXIT_SKINIT, "skinit" }, \
+ { SVM_EXIT_RDTSCP, "rdtscp" }, \
+ { SVM_EXIT_ICEBP, "icebp" }, \
+ { SVM_EXIT_WBINVD, "wbinvd" }, \
+ { SVM_EXIT_MONITOR, "monitor" }, \
+ { SVM_EXIT_MWAIT, "mwait" }, \
+ { SVM_EXIT_XSETBV, "xsetbv" }, \
+ { SVM_EXIT_NPF, "npf" }, \
+ { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \
+ { SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \
+ { SVM_EXIT_ERR, "invalid_guest_state" }
+
+
+#endif /* _UAPI__SVM_H */
diff --git a/arch/x86/include/uapi/asm/swab.h b/arch/x86/include/uapi/asm/swab.h
new file mode 100644
index 000000000..cd3fd8ddb
--- /dev/null
+++ b/arch/x86/include/uapi/asm/swab.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_SWAB_H
+#define _ASM_X86_SWAB_H
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+
+static inline __attribute_const__ __u32 __arch_swab32(__u32 val)
+{
+ asm("bswapl %0" : "=r" (val) : "0" (val));
+ return val;
+}
+#define __arch_swab32 __arch_swab32
+
+static inline __attribute_const__ __u64 __arch_swab64(__u64 val)
+{
+#ifdef __i386__
+ union {
+ struct {
+ __u32 a;
+ __u32 b;
+ } s;
+ __u64 u;
+ } v;
+ v.u = val;
+ asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1"
+ : "=r" (v.s.a), "=r" (v.s.b)
+ : "0" (v.s.a), "1" (v.s.b));
+ return v.u;
+#else /* __i386__ */
+ asm("bswapq %0" : "=r" (val) : "0" (val));
+ return val;
+#endif
+}
+#define __arch_swab64 __arch_swab64
+
+#endif /* _ASM_X86_SWAB_H */
diff --git a/arch/x86/include/uapi/asm/termbits.h b/arch/x86/include/uapi/asm/termbits.h
new file mode 100644
index 000000000..3935b106d
--- /dev/null
+++ b/arch/x86/include/uapi/asm/termbits.h
@@ -0,0 +1 @@
+#include <asm-generic/termbits.h>
diff --git a/arch/x86/include/uapi/asm/termios.h b/arch/x86/include/uapi/asm/termios.h
new file mode 100644
index 000000000..280d78a9d
--- /dev/null
+++ b/arch/x86/include/uapi/asm/termios.h
@@ -0,0 +1 @@
+#include <asm-generic/termios.h>
diff --git a/arch/x86/include/uapi/asm/types.h b/arch/x86/include/uapi/asm/types.h
new file mode 100644
index 000000000..df55e1ddb
--- /dev/null
+++ b/arch/x86/include/uapi/asm/types.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_TYPES_H
+#define _ASM_X86_TYPES_H
+
+#include <asm-generic/types.h>
+
+#endif /* _ASM_X86_TYPES_H */
diff --git a/arch/x86/include/uapi/asm/ucontext.h b/arch/x86/include/uapi/asm/ucontext.h
new file mode 100644
index 000000000..5657b7a49
--- /dev/null
+++ b/arch/x86/include/uapi/asm/ucontext.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_UCONTEXT_H
+#define _ASM_X86_UCONTEXT_H
+
+/*
+ * Indicates the presence of extended state information in the memory
+ * layout pointed by the fpstate pointer in the ucontext's sigcontext
+ * struct (uc_mcontext).
+ */
+#define UC_FP_XSTATE 0x1
+
+#ifdef __x86_64__
+/*
+ * UC_SIGCONTEXT_SS will be set when delivering 64-bit or x32 signals on
+ * kernels that save SS in the sigcontext. All kernels that set
+ * UC_SIGCONTEXT_SS will correctly restore at least the low 32 bits of esp
+ * regardless of SS (i.e. they implement espfix).
+ *
+ * Kernels that set UC_SIGCONTEXT_SS will also set UC_STRICT_RESTORE_SS
+ * when delivering a signal that came from 64-bit code.
+ *
+ * Sigreturn restores SS as follows:
+ *
+ * if (saved SS is valid || UC_STRICT_RESTORE_SS is set ||
+ * saved CS is not 64-bit)
+ * new SS = saved SS (will fail IRET and signal if invalid)
+ * else
+ * new SS = a flat 32-bit data segment
+ *
+ * This behavior serves three purposes:
+ *
+ * - Legacy programs that construct a 64-bit sigcontext from scratch
+ * with zero or garbage in the SS slot (e.g. old CRIU) and call
+ * sigreturn will still work.
+ *
+ * - Old DOSEMU versions sometimes catch a signal from a segmented
+ * context, delete the old SS segment (with modify_ldt), and change
+ * the saved CS to a 64-bit segment. These DOSEMU versions expect
+ * sigreturn to send them back to 64-bit mode without killing them,
+ * despite the fact that the SS selector when the signal was raised is
+ * no longer valid. UC_STRICT_RESTORE_SS will be clear, so the kernel
+ * will fix up SS for these DOSEMU versions.
+ *
+ * - Old and new programs that catch a signal and return without
+ * modifying the saved context will end up in exactly the state they
+ * started in, even if they were running in a segmented context when
+ * the signal was raised.. Old kernels would lose track of the
+ * previous SS value.
+ */
+#define UC_SIGCONTEXT_SS 0x2
+#define UC_STRICT_RESTORE_SS 0x4
+#endif
+
+#include <asm-generic/ucontext.h>
+
+#endif /* _ASM_X86_UCONTEXT_H */
diff --git a/arch/x86/include/uapi/asm/unistd.h b/arch/x86/include/uapi/asm/unistd.h
new file mode 100644
index 000000000..30d7d04d7
--- /dev/null
+++ b/arch/x86/include/uapi/asm/unistd.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_UNISTD_H
+#define _UAPI_ASM_X86_UNISTD_H
+
+/* x32 syscall flag bit */
+#define __X32_SYSCALL_BIT 0x40000000
+
+#ifndef __KERNEL__
+# ifdef __i386__
+# include <asm/unistd_32.h>
+# elif defined(__ILP32__)
+# include <asm/unistd_x32.h>
+# else
+# include <asm/unistd_64.h>
+# endif
+#endif
+
+#endif /* _UAPI_ASM_X86_UNISTD_H */
diff --git a/arch/x86/include/uapi/asm/vm86.h b/arch/x86/include/uapi/asm/vm86.h
new file mode 100644
index 000000000..d2ee4e307
--- /dev/null
+++ b/arch/x86/include/uapi/asm/vm86.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_VM86_H
+#define _UAPI_ASM_X86_VM86_H
+
+/*
+ * I'm guessing at the VIF/VIP flag usage, but hope that this is how
+ * the Pentium uses them. Linux will return from vm86 mode when both
+ * VIF and VIP is set.
+ *
+ * On a Pentium, we could probably optimize the virtual flags directly
+ * in the eflags register instead of doing it "by hand" in vflags...
+ *
+ * Linus
+ */
+
+#include <asm/processor-flags.h>
+
+#define BIOSSEG 0x0f000
+
+#define CPU_086 0
+#define CPU_186 1
+#define CPU_286 2
+#define CPU_386 3
+#define CPU_486 4
+#define CPU_586 5
+
+/*
+ * Return values for the 'vm86()' system call
+ */
+#define VM86_TYPE(retval) ((retval) & 0xff)
+#define VM86_ARG(retval) ((retval) >> 8)
+
+#define VM86_SIGNAL 0 /* return due to signal */
+#define VM86_UNKNOWN 1 /* unhandled GP fault
+ - IO-instruction or similar */
+#define VM86_INTx 2 /* int3/int x instruction (ARG = x) */
+#define VM86_STI 3 /* sti/popf/iret instruction enabled
+ virtual interrupts */
+
+/*
+ * Additional return values when invoking new vm86()
+ */
+#define VM86_PICRETURN 4 /* return due to pending PIC request */
+#define VM86_TRAP 6 /* return due to DOS-debugger request */
+
+/*
+ * function codes when invoking new vm86()
+ */
+#define VM86_PLUS_INSTALL_CHECK 0
+#define VM86_ENTER 1
+#define VM86_ENTER_NO_BYPASS 2
+#define VM86_REQUEST_IRQ 3
+#define VM86_FREE_IRQ 4
+#define VM86_GET_IRQ_BITS 5
+#define VM86_GET_AND_RESET_IRQ 6
+
+/*
+ * This is the stack-layout seen by the user space program when we have
+ * done a translation of "SAVE_ALL" from vm86 mode. The real kernel layout
+ * is 'kernel_vm86_regs' (see below).
+ */
+
+struct vm86_regs {
+/*
+ * normal regs, with special meaning for the segment descriptors..
+ */
+ long ebx;
+ long ecx;
+ long edx;
+ long esi;
+ long edi;
+ long ebp;
+ long eax;
+ long __null_ds;
+ long __null_es;
+ long __null_fs;
+ long __null_gs;
+ long orig_eax;
+ long eip;
+ unsigned short cs, __csh;
+ long eflags;
+ long esp;
+ unsigned short ss, __ssh;
+/*
+ * these are specific to v86 mode:
+ */
+ unsigned short es, __esh;
+ unsigned short ds, __dsh;
+ unsigned short fs, __fsh;
+ unsigned short gs, __gsh;
+};
+
+struct revectored_struct {
+ unsigned long __map[8]; /* 256 bits */
+};
+
+struct vm86_struct {
+ struct vm86_regs regs;
+ unsigned long flags;
+ unsigned long screen_bitmap;
+ unsigned long cpu_type;
+ struct revectored_struct int_revectored;
+ struct revectored_struct int21_revectored;
+};
+
+/*
+ * flags masks
+ */
+#define VM86_SCREEN_BITMAP 0x0001
+
+struct vm86plus_info_struct {
+ unsigned long force_return_for_pic:1;
+ unsigned long vm86dbg_active:1; /* for debugger */
+ unsigned long vm86dbg_TFpendig:1; /* for debugger */
+ unsigned long unused:28;
+ unsigned long is_vm86pus:1; /* for vm86 internal use */
+ unsigned char vm86dbg_intxxtab[32]; /* for debugger */
+};
+struct vm86plus_struct {
+ struct vm86_regs regs;
+ unsigned long flags;
+ unsigned long screen_bitmap;
+ unsigned long cpu_type;
+ struct revectored_struct int_revectored;
+ struct revectored_struct int21_revectored;
+ struct vm86plus_info_struct vm86plus;
+};
+
+
+#endif /* _UAPI_ASM_X86_VM86_H */
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
new file mode 100644
index 000000000..f0b0c90dd
--- /dev/null
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -0,0 +1,151 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * vmx.h: VMX Architecture related definitions
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * A few random additions are:
+ * Copyright (C) 2006 Qumranet
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ *
+ */
+#ifndef _UAPIVMX_H
+#define _UAPIVMX_H
+
+
+#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000
+
+#define EXIT_REASON_EXCEPTION_NMI 0
+#define EXIT_REASON_EXTERNAL_INTERRUPT 1
+#define EXIT_REASON_TRIPLE_FAULT 2
+
+#define EXIT_REASON_PENDING_INTERRUPT 7
+#define EXIT_REASON_NMI_WINDOW 8
+#define EXIT_REASON_TASK_SWITCH 9
+#define EXIT_REASON_CPUID 10
+#define EXIT_REASON_HLT 12
+#define EXIT_REASON_INVD 13
+#define EXIT_REASON_INVLPG 14
+#define EXIT_REASON_RDPMC 15
+#define EXIT_REASON_RDTSC 16
+#define EXIT_REASON_VMCALL 18
+#define EXIT_REASON_VMCLEAR 19
+#define EXIT_REASON_VMLAUNCH 20
+#define EXIT_REASON_VMPTRLD 21
+#define EXIT_REASON_VMPTRST 22
+#define EXIT_REASON_VMREAD 23
+#define EXIT_REASON_VMRESUME 24
+#define EXIT_REASON_VMWRITE 25
+#define EXIT_REASON_VMOFF 26
+#define EXIT_REASON_VMON 27
+#define EXIT_REASON_CR_ACCESS 28
+#define EXIT_REASON_DR_ACCESS 29
+#define EXIT_REASON_IO_INSTRUCTION 30
+#define EXIT_REASON_MSR_READ 31
+#define EXIT_REASON_MSR_WRITE 32
+#define EXIT_REASON_INVALID_STATE 33
+#define EXIT_REASON_MSR_LOAD_FAIL 34
+#define EXIT_REASON_MWAIT_INSTRUCTION 36
+#define EXIT_REASON_MONITOR_TRAP_FLAG 37
+#define EXIT_REASON_MONITOR_INSTRUCTION 39
+#define EXIT_REASON_PAUSE_INSTRUCTION 40
+#define EXIT_REASON_MCE_DURING_VMENTRY 41
+#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
+#define EXIT_REASON_APIC_ACCESS 44
+#define EXIT_REASON_EOI_INDUCED 45
+#define EXIT_REASON_GDTR_IDTR 46
+#define EXIT_REASON_LDTR_TR 47
+#define EXIT_REASON_EPT_VIOLATION 48
+#define EXIT_REASON_EPT_MISCONFIG 49
+#define EXIT_REASON_INVEPT 50
+#define EXIT_REASON_RDTSCP 51
+#define EXIT_REASON_PREEMPTION_TIMER 52
+#define EXIT_REASON_INVVPID 53
+#define EXIT_REASON_WBINVD 54
+#define EXIT_REASON_XSETBV 55
+#define EXIT_REASON_APIC_WRITE 56
+#define EXIT_REASON_RDRAND 57
+#define EXIT_REASON_INVPCID 58
+#define EXIT_REASON_VMFUNC 59
+#define EXIT_REASON_ENCLS 60
+#define EXIT_REASON_RDSEED 61
+#define EXIT_REASON_PML_FULL 62
+#define EXIT_REASON_XSAVES 63
+#define EXIT_REASON_XRSTORS 64
+
+#define VMX_EXIT_REASONS \
+ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
+ { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \
+ { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \
+ { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \
+ { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \
+ { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \
+ { EXIT_REASON_CPUID, "CPUID" }, \
+ { EXIT_REASON_HLT, "HLT" }, \
+ { EXIT_REASON_INVD, "INVD" }, \
+ { EXIT_REASON_INVLPG, "INVLPG" }, \
+ { EXIT_REASON_RDPMC, "RDPMC" }, \
+ { EXIT_REASON_RDTSC, "RDTSC" }, \
+ { EXIT_REASON_VMCALL, "VMCALL" }, \
+ { EXIT_REASON_VMCLEAR, "VMCLEAR" }, \
+ { EXIT_REASON_VMLAUNCH, "VMLAUNCH" }, \
+ { EXIT_REASON_VMPTRLD, "VMPTRLD" }, \
+ { EXIT_REASON_VMPTRST, "VMPTRST" }, \
+ { EXIT_REASON_VMREAD, "VMREAD" }, \
+ { EXIT_REASON_VMRESUME, "VMRESUME" }, \
+ { EXIT_REASON_VMWRITE, "VMWRITE" }, \
+ { EXIT_REASON_VMOFF, "VMOFF" }, \
+ { EXIT_REASON_VMON, "VMON" }, \
+ { EXIT_REASON_CR_ACCESS, "CR_ACCESS" }, \
+ { EXIT_REASON_DR_ACCESS, "DR_ACCESS" }, \
+ { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \
+ { EXIT_REASON_MSR_READ, "MSR_READ" }, \
+ { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \
+ { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
+ { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \
+ { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \
+ { EXIT_REASON_MONITOR_TRAP_FLAG, "MONITOR_TRAP_FLAG" }, \
+ { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \
+ { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \
+ { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \
+ { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \
+ { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \
+ { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
+ { EXIT_REASON_GDTR_IDTR, "GDTR_IDTR" }, \
+ { EXIT_REASON_LDTR_TR, "LDTR_TR" }, \
+ { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \
+ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \
+ { EXIT_REASON_INVEPT, "INVEPT" }, \
+ { EXIT_REASON_RDTSCP, "RDTSCP" }, \
+ { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \
+ { EXIT_REASON_INVVPID, "INVVPID" }, \
+ { EXIT_REASON_WBINVD, "WBINVD" }, \
+ { EXIT_REASON_XSETBV, "XSETBV" }, \
+ { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \
+ { EXIT_REASON_RDRAND, "RDRAND" }, \
+ { EXIT_REASON_INVPCID, "INVPCID" }, \
+ { EXIT_REASON_VMFUNC, "VMFUNC" }, \
+ { EXIT_REASON_ENCLS, "ENCLS" }, \
+ { EXIT_REASON_RDSEED, "RDSEED" }, \
+ { EXIT_REASON_PML_FULL, "PML_FULL" }, \
+ { EXIT_REASON_XSAVES, "XSAVES" }, \
+ { EXIT_REASON_XRSTORS, "XRSTORS" }
+
+#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1
+#define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2
+#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4
+
+#endif /* _UAPIVMX_H */
diff --git a/arch/x86/include/uapi/asm/vsyscall.h b/arch/x86/include/uapi/asm/vsyscall.h
new file mode 100644
index 000000000..75275f547
--- /dev/null
+++ b/arch/x86/include/uapi/asm/vsyscall.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_VSYSCALL_H
+#define _UAPI_ASM_X86_VSYSCALL_H
+
+enum vsyscall_num {
+ __NR_vgettimeofday,
+ __NR_vtime,
+ __NR_vgetcpu,
+};
+
+#define VSYSCALL_ADDR (-10UL << 20)
+
+#endif /* _UAPI_ASM_X86_VSYSCALL_H */
diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore
new file mode 100644
index 000000000..08f4fd731
--- /dev/null
+++ b/arch/x86/kernel/.gitignore
@@ -0,0 +1,3 @@
+vsyscall.lds
+vsyscall_32.lds
+vmlinux.lds
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
new file mode 100644
index 000000000..da0b6bc09
--- /dev/null
+++ b/arch/x86/kernel/Makefile
@@ -0,0 +1,153 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the linux kernel.
+#
+
+extra-y := head_$(BITS).o
+extra-y += head$(BITS).o
+extra-y += ebda.o
+extra-y += platform-quirks.o
+extra-y += vmlinux.lds
+
+CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
+
+ifdef CONFIG_FUNCTION_TRACER
+# Do not profile debug and lowlevel utilities
+CFLAGS_REMOVE_tsc.o = -pg
+CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
+CFLAGS_REMOVE_pvclock.o = -pg
+CFLAGS_REMOVE_kvmclock.o = -pg
+CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_early_printk.o = -pg
+CFLAGS_REMOVE_head64.o = -pg
+endif
+
+KASAN_SANITIZE_head$(BITS).o := n
+KASAN_SANITIZE_dumpstack.o := n
+KASAN_SANITIZE_dumpstack_$(BITS).o := n
+KASAN_SANITIZE_stacktrace.o := n
+KASAN_SANITIZE_paravirt.o := n
+
+OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
+OBJECT_FILES_NON_STANDARD_test_nx.o := y
+OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o := y
+
+ifdef CONFIG_FRAME_POINTER
+OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
+endif
+
+# If instrumentation of this dir is enabled, boot hangs during first second.
+# Probably could be more selective here, but note that files related to irqs,
+# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to
+# non-deterministic coverage.
+KCOV_INSTRUMENT := n
+
+CFLAGS_irq.o := -I$(src)/../include/asm/trace
+
+obj-y := process_$(BITS).o signal.o
+obj-$(CONFIG_COMPAT) += signal_compat.o
+obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
+obj-y += time.o ioport.o dumpstack.o nmi.o
+obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
+obj-y += setup.o x86_init.o i8259.o irqinit.o
+obj-$(CONFIG_JUMP_LABEL) += jump_label.o
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
+obj-y += probe_roms.o
+obj-$(CONFIG_X86_64) += sys_x86_64.o
+obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
+obj-$(CONFIG_SYSFS) += ksysfs.o
+obj-y += bootflag.o e820.o
+obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
+obj-y += alternative.o i8253.o hw_breakpoint.o
+obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
+obj-y += pci-iommu_table.o
+obj-y += resource.o
+obj-y += irqflags.o
+
+obj-y += process.o
+obj-y += fpu/
+obj-y += ptrace.o
+obj-$(CONFIG_X86_32) += tls.o
+obj-$(CONFIG_IA32_EMULATION) += tls.o
+obj-y += step.o
+obj-$(CONFIG_INTEL_TXT) += tboot.o
+obj-$(CONFIG_ISA_DMA_API) += i8237.o
+obj-$(CONFIG_STACKTRACE) += stacktrace.o
+obj-y += cpu/
+obj-y += acpi/
+obj-y += reboot.o
+obj-$(CONFIG_X86_MSR) += msr.o
+obj-$(CONFIG_X86_CPUID) += cpuid.o
+obj-$(CONFIG_PCI) += early-quirks.o
+apm-y := apm_32.o
+obj-$(CONFIG_APM) += apm.o
+obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SMP) += smpboot.o
+obj-$(CONFIG_X86_TSC) += tsc_sync.o
+obj-$(CONFIG_SMP) += setup_percpu.o
+obj-$(CONFIG_X86_MPPARSE) += mpparse.o
+obj-y += apic/
+obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
+obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+obj-$(CONFIG_LIVEPATCH) += livepatch.o
+obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
+obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
+obj-$(CONFIG_X86_TSC) += trace_clock.o
+obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
+obj-y += kprobes/
+obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_DOUBLEFAULT) += doublefault.o
+obj-$(CONFIG_KGDB) += kgdb.o
+obj-$(CONFIG_VM86) += vm86_32.o
+obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+
+obj-$(CONFIG_HPET_TIMER) += hpet.o
+obj-$(CONFIG_APB_TIMER) += apb_timer.o
+
+obj-$(CONFIG_AMD_NB) += amd_nb.o
+obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
+
+obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
+obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
+obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
+obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
+
+obj-$(CONFIG_JAILHOUSE_GUEST) += jailhouse.o
+
+obj-$(CONFIG_EISA) += eisa.o
+obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
+
+obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
+
+obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
+obj-$(CONFIG_OF) += devicetree.o
+obj-$(CONFIG_UPROBES) += uprobes.o
+obj-y += sysfb.o
+obj-$(CONFIG_X86_SYSFB) += sysfb_simplefb.o
+obj-$(CONFIG_EFI) += sysfb_efi.o
+
+obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
+obj-$(CONFIG_TRACING) += tracepoint.o
+obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o
+obj-$(CONFIG_X86_INTEL_UMIP) += umip.o
+
+obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
+obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
+obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
+
+###
+# 64 bit specific files
+ifeq ($(CONFIG_X86_64),y)
+ obj-$(CONFIG_AUDIT) += audit_64.o
+
+ obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o
+ obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
+
+ obj-$(CONFIG_MMCONF_FAM10H) += mmconf-fam10h_64.o
+ obj-y += vsmp_64.o
+endif
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
new file mode 100644
index 000000000..f1bb57b0e
--- /dev/null
+++ b/arch/x86/kernel/acpi/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y
+
+obj-$(CONFIG_ACPI) += boot.o
+obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
+obj-$(CONFIG_ACPI_APEI) += apei.o
+obj-$(CONFIG_ACPI_CPPC_LIB) += cppc_msr.o
+
+ifneq ($(CONFIG_ACPI_PROCESSOR),)
+obj-y += cstate.o
+endif
+
diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c
new file mode 100644
index 000000000..bb8d300fe
--- /dev/null
+++ b/arch/x86/kernel/acpi/apei.c
@@ -0,0 +1,54 @@
+/*
+ * Arch-specific APEI-related functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <acpi/apei.h>
+
+#include <asm/mce.h>
+#include <asm/tlbflush.h>
+
+int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data)
+{
+#ifdef CONFIG_X86_MCE
+ int i;
+ struct acpi_hest_ia_corrected *cmc;
+ struct acpi_hest_ia_error_bank *mc_bank;
+
+ cmc = (struct acpi_hest_ia_corrected *)hest_hdr;
+ if (!cmc->enabled)
+ return 0;
+
+ /*
+ * We expect HEST to provide a list of MC banks that report errors
+ * in firmware first mode. Otherwise, return non-zero value to
+ * indicate that we are done parsing HEST.
+ */
+ if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) ||
+ !cmc->num_hardware_banks)
+ return 1;
+
+ pr_info("HEST: Enabling Firmware First mode for corrected errors.\n");
+
+ mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1);
+ for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
+ mce_disable_bank(mc_bank->bank_number);
+#endif
+ return 1;
+}
+
+void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
+{
+#ifdef CONFIG_X86_MCE
+ apei_mce_report_mem_error(sev, mem_err);
+#endif
+}
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
new file mode 100644
index 000000000..8b1aa1206
--- /dev/null
+++ b/arch/x86/kernel/acpi/boot.c
@@ -0,0 +1,1796 @@
+/*
+ * boot.c - Architecture-Specific Low-Level ACPI Boot Support
+ *
+ * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/init.h>
+#include <linux/acpi.h>
+#include <linux/acpi_pmtmr.h>
+#include <linux/efi.h>
+#include <linux/cpumask.h>
+#include <linux/export.h>
+#include <linux/dmi.h>
+#include <linux/irq.h>
+#include <linux/slab.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/pci.h>
+#include <linux/efi-bgrt.h>
+#include <linux/serial_core.h>
+
+#include <asm/e820/api.h>
+#include <asm/irqdomain.h>
+#include <asm/pci_x86.h>
+#include <asm/pgtable.h>
+#include <asm/io_apic.h>
+#include <asm/apic.h>
+#include <asm/io.h>
+#include <asm/mpspec.h>
+#include <asm/smp.h>
+#include <asm/i8259.h>
+
+#include "sleep.h" /* To include x86_acpi_suspend_lowlevel */
+static int __initdata acpi_force = 0;
+int acpi_disabled;
+EXPORT_SYMBOL(acpi_disabled);
+
+#ifdef CONFIG_X86_64
+# include <asm/proto.h>
+#endif /* X86 */
+
+#define PREFIX "ACPI: "
+
+int acpi_noirq; /* skip ACPI IRQ initialization */
+int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */
+EXPORT_SYMBOL(acpi_pci_disabled);
+
+int acpi_lapic;
+int acpi_ioapic;
+int acpi_strict;
+int acpi_disable_cmcff;
+
+/* ACPI SCI override configuration */
+u8 acpi_sci_flags __initdata;
+u32 acpi_sci_override_gsi __initdata = INVALID_ACPI_IRQ;
+int acpi_skip_timer_override __initdata;
+int acpi_use_timer_override __initdata;
+int acpi_fix_pin2_polarity __initdata;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
+#endif
+
+#ifdef CONFIG_X86_IO_APIC
+/*
+ * Locks related to IOAPIC hotplug
+ * Hotplug side:
+ * ->device_hotplug_lock
+ * ->acpi_ioapic_lock
+ * ->ioapic_lock
+ * Interrupt mapping side:
+ * ->acpi_ioapic_lock
+ * ->ioapic_mutex
+ * ->ioapic_lock
+ */
+static DEFINE_MUTEX(acpi_ioapic_lock);
+#endif
+
+/* --------------------------------------------------------------------------
+ Boot-time Configuration
+ -------------------------------------------------------------------------- */
+
+/*
+ * The default interrupt routing model is PIC (8259). This gets
+ * overridden if IOAPICs are enumerated (below).
+ */
+enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
+
+
+/*
+ * ISA irqs by default are the first 16 gsis but can be
+ * any gsi as specified by an interrupt source override.
+ */
+static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+/*
+ * This is just a simple wrapper around early_memremap(),
+ * with sanity checks for phys == 0 and size == 0.
+ */
+void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size)
+{
+
+ if (!phys || !size)
+ return NULL;
+
+ return early_memremap(phys, size);
+}
+
+void __init __acpi_unmap_table(void __iomem *map, unsigned long size)
+{
+ if (!map || !size)
+ return;
+
+ early_memunmap(map, size);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static int __init acpi_parse_madt(struct acpi_table_header *table)
+{
+ struct acpi_table_madt *madt = NULL;
+
+ if (!boot_cpu_has(X86_FEATURE_APIC))
+ return -EINVAL;
+
+ madt = (struct acpi_table_madt *)table;
+ if (!madt) {
+ printk(KERN_WARNING PREFIX "Unable to map MADT\n");
+ return -ENODEV;
+ }
+
+ if (madt->address) {
+ acpi_lapic_addr = (u64) madt->address;
+
+ printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
+ madt->address);
+ }
+
+ default_acpi_madt_oem_check(madt->header.oem_id,
+ madt->header.oem_table_id);
+
+ return 0;
+}
+
+/**
+ * acpi_register_lapic - register a local apic and generates a logic cpu number
+ * @id: local apic id to register
+ * @acpiid: ACPI id to register
+ * @enabled: this cpu is enabled or not
+ *
+ * Returns the logic cpu number which maps to the local apic
+ */
+static int acpi_register_lapic(int id, u32 acpiid, u8 enabled)
+{
+ unsigned int ver = 0;
+ int cpu;
+
+ if (id >= MAX_LOCAL_APIC) {
+ printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
+ return -EINVAL;
+ }
+
+ if (!enabled) {
+ ++disabled_cpus;
+ return -EINVAL;
+ }
+
+ if (boot_cpu_physical_apicid != -1U)
+ ver = boot_cpu_apic_version;
+
+ cpu = generic_processor_info(id, ver);
+ if (cpu >= 0)
+ early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid;
+
+ return cpu;
+}
+
+static int __init
+acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
+{
+ struct acpi_madt_local_x2apic *processor = NULL;
+#ifdef CONFIG_X86_X2APIC
+ u32 apic_id;
+ u8 enabled;
+#endif
+
+ processor = (struct acpi_madt_local_x2apic *)header;
+
+ if (BAD_MADT_ENTRY(processor, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(header);
+
+#ifdef CONFIG_X86_X2APIC
+ apic_id = processor->local_apic_id;
+ enabled = processor->lapic_flags & ACPI_MADT_ENABLED;
+
+ /* Ignore invalid ID */
+ if (apic_id == 0xffffffff)
+ return 0;
+
+ /*
+ * We need to register disabled CPU as well to permit
+ * counting disabled CPUs. This allows us to size
+ * cpus_possible_map more accurately, to permit
+ * to not preallocating memory for all NR_CPUS
+ * when we use CPU hotplug.
+ */
+ if (!apic->apic_id_valid(apic_id)) {
+ if (enabled)
+ pr_warn(PREFIX "x2apic entry ignored\n");
+ return 0;
+ }
+
+ acpi_register_lapic(apic_id, processor->uid, enabled);
+#else
+ printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
+#endif
+
+ return 0;
+}
+
+static int __init
+acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
+{
+ struct acpi_madt_local_apic *processor = NULL;
+
+ processor = (struct acpi_madt_local_apic *)header;
+
+ if (BAD_MADT_ENTRY(processor, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(header);
+
+ /* Ignore invalid ID */
+ if (processor->id == 0xff)
+ return 0;
+
+ /*
+ * We need to register disabled CPU as well to permit
+ * counting disabled CPUs. This allows us to size
+ * cpus_possible_map more accurately, to permit
+ * to not preallocating memory for all NR_CPUS
+ * when we use CPU hotplug.
+ */
+ acpi_register_lapic(processor->id, /* APIC ID */
+ processor->processor_id, /* ACPI ID */
+ processor->lapic_flags & ACPI_MADT_ENABLED);
+
+ return 0;
+}
+
+static int __init
+acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end)
+{
+ struct acpi_madt_local_sapic *processor = NULL;
+
+ processor = (struct acpi_madt_local_sapic *)header;
+
+ if (BAD_MADT_ENTRY(processor, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(header);
+
+ acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */
+ processor->processor_id, /* ACPI ID */
+ processor->lapic_flags & ACPI_MADT_ENABLED);
+
+ return 0;
+}
+
+static int __init
+acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
+ const unsigned long end)
+{
+ struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL;
+
+ lapic_addr_ovr = (struct acpi_madt_local_apic_override *)header;
+
+ if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(header);
+
+ acpi_lapic_addr = lapic_addr_ovr->address;
+
+ return 0;
+}
+
+static int __init
+acpi_parse_x2apic_nmi(struct acpi_subtable_header *header,
+ const unsigned long end)
+{
+ struct acpi_madt_local_x2apic_nmi *x2apic_nmi = NULL;
+
+ x2apic_nmi = (struct acpi_madt_local_x2apic_nmi *)header;
+
+ if (BAD_MADT_ENTRY(x2apic_nmi, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(header);
+
+ if (x2apic_nmi->lint != 1)
+ printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
+
+ return 0;
+}
+
+static int __init
+acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end)
+{
+ struct acpi_madt_local_apic_nmi *lapic_nmi = NULL;
+
+ lapic_nmi = (struct acpi_madt_local_apic_nmi *)header;
+
+ if (BAD_MADT_ENTRY(lapic_nmi, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(header);
+
+ if (lapic_nmi->lint != 1)
+ printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
+
+ return 0;
+}
+
+#endif /*CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_X86_IO_APIC
+#define MP_ISA_BUS 0
+
+static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity,
+ u8 trigger, u32 gsi);
+
+static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
+ u32 gsi)
+{
+ /*
+ * Check bus_irq boundary.
+ */
+ if (bus_irq >= NR_IRQS_LEGACY) {
+ pr_warn("Invalid bus_irq %u for legacy override\n", bus_irq);
+ return;
+ }
+
+ /*
+ * TBD: This check is for faulty timer entries, where the override
+ * erroneously sets the trigger to level, resulting in a HUGE
+ * increase of timer interrupts!
+ */
+ if ((bus_irq == 0) && (trigger == 3))
+ trigger = 1;
+
+ if (mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi) < 0)
+ return;
+ /*
+ * Reset default identity mapping if gsi is also an legacy IRQ,
+ * otherwise there will be more than one entry with the same GSI
+ * and acpi_isa_irq_to_gsi() may give wrong result.
+ */
+ if (gsi < nr_legacy_irqs() && isa_irq_to_gsi[gsi] == gsi)
+ isa_irq_to_gsi[gsi] = INVALID_ACPI_IRQ;
+ isa_irq_to_gsi[bus_irq] = gsi;
+}
+
+static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
+ int polarity)
+{
+#ifdef CONFIG_X86_MPPARSE
+ struct mpc_intsrc mp_irq;
+ struct pci_dev *pdev;
+ unsigned char number;
+ unsigned int devfn;
+ int ioapic;
+ u8 pin;
+
+ if (!acpi_ioapic)
+ return 0;
+ if (!dev || !dev_is_pci(dev))
+ return 0;
+
+ pdev = to_pci_dev(dev);
+ number = pdev->bus->number;
+ devfn = pdev->devfn;
+ pin = pdev->pin;
+ /* print the entry should happen on mptable identically */
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+ (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+ mp_irq.srcbus = number;
+ mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+ ioapic = mp_find_ioapic(gsi);
+ mp_irq.dstapic = mpc_ioapic_id(ioapic);
+ mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
+
+ mp_save_irq(&mp_irq);
+#endif
+ return 0;
+}
+
+static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity,
+ u8 trigger, u32 gsi)
+{
+ struct mpc_intsrc mp_irq;
+ int ioapic, pin;
+
+ /* Convert 'gsi' to 'ioapic.pin'(INTIN#) */
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0) {
+ pr_warn("Failed to find ioapic for gsi : %u\n", gsi);
+ return ioapic;
+ }
+
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = (trigger << 2) | polarity;
+ mp_irq.srcbus = MP_ISA_BUS;
+ mp_irq.srcbusirq = bus_irq;
+ mp_irq.dstapic = mpc_ioapic_id(ioapic);
+ mp_irq.dstirq = pin;
+
+ mp_save_irq(&mp_irq);
+
+ return 0;
+}
+
+static int __init
+acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
+{
+ struct acpi_madt_io_apic *ioapic = NULL;
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_DYNAMIC,
+ .ops = &mp_ioapic_irqdomain_ops,
+ };
+
+ ioapic = (struct acpi_madt_io_apic *)header;
+
+ if (BAD_MADT_ENTRY(ioapic, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(header);
+
+ /* Statically assign IRQ numbers for IOAPICs hosting legacy IRQs */
+ if (ioapic->global_irq_base < nr_legacy_irqs())
+ cfg.type = IOAPIC_DOMAIN_LEGACY;
+
+ mp_register_ioapic(ioapic->id, ioapic->address, ioapic->global_irq_base,
+ &cfg);
+
+ return 0;
+}
+
+/*
+ * Parse Interrupt Source Override for the ACPI SCI
+ */
+static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, u32 gsi)
+{
+ if (trigger == 0) /* compatible SCI trigger is level */
+ trigger = 3;
+
+ if (polarity == 0) /* compatible SCI polarity is low */
+ polarity = 3;
+
+ /* Command-line over-ride via acpi_sci= */
+ if (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK)
+ trigger = (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK) >> 2;
+
+ if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK)
+ polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
+
+ if (bus_irq < NR_IRQS_LEGACY)
+ mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
+ else
+ mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi);
+
+ acpi_penalize_sci_irq(bus_irq, trigger, polarity);
+
+ /*
+ * stash over-ride to indicate we've been here
+ * and for later update of acpi_gbl_FADT
+ */
+ acpi_sci_override_gsi = gsi;
+ return;
+}
+
+static int __init
+acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
+ const unsigned long end)
+{
+ struct acpi_madt_interrupt_override *intsrc = NULL;
+
+ intsrc = (struct acpi_madt_interrupt_override *)header;
+
+ if (BAD_MADT_ENTRY(intsrc, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(header);
+
+ if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
+ acpi_sci_ioapic_setup(intsrc->source_irq,
+ intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
+ (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
+ intsrc->global_irq);
+ return 0;
+ }
+
+ if (intsrc->source_irq == 0) {
+ if (acpi_skip_timer_override) {
+ printk(PREFIX "BIOS IRQ0 override ignored.\n");
+ return 0;
+ }
+
+ if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity
+ && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
+ intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
+ printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
+ }
+ }
+
+ mp_override_legacy_irq(intsrc->source_irq,
+ intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
+ (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
+ intsrc->global_irq);
+
+ return 0;
+}
+
+static int __init
+acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end)
+{
+ struct acpi_madt_nmi_source *nmi_src = NULL;
+
+ nmi_src = (struct acpi_madt_nmi_source *)header;
+
+ if (BAD_MADT_ENTRY(nmi_src, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(header);
+
+ /* TBD: Support nimsrc entries? */
+
+ return 0;
+}
+
+#endif /* CONFIG_X86_IO_APIC */
+
+/*
+ * acpi_pic_sci_set_trigger()
+ *
+ * use ELCR to set PIC-mode trigger type for SCI
+ *
+ * If a PIC-mode SCI is not recognized or gives spurious IRQ7's
+ * it may require Edge Trigger -- use "acpi_sci=edge"
+ *
+ * Port 0x4d0-4d1 are ECLR1 and ECLR2, the Edge/Level Control Registers
+ * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge.
+ * ECLR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0)
+ * ECLR2 is IRQs 8-15 (IRQ 8, 13 must be 0)
+ */
+
+void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
+{
+ unsigned int mask = 1 << irq;
+ unsigned int old, new;
+
+ /* Real old ELCR mask */
+ old = inb(0x4d0) | (inb(0x4d1) << 8);
+
+ /*
+ * If we use ACPI to set PCI IRQs, then we should clear ELCR
+ * since we will set it correctly as we enable the PCI irq
+ * routing.
+ */
+ new = acpi_noirq ? old : 0;
+
+ /*
+ * Update SCI information in the ELCR, it isn't in the PCI
+ * routing tables..
+ */
+ switch (trigger) {
+ case 1: /* Edge - clear */
+ new &= ~mask;
+ break;
+ case 3: /* Level - set */
+ new |= mask;
+ break;
+ }
+
+ if (old == new)
+ return;
+
+ printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old);
+ outb(new, 0x4d0);
+ outb(new >> 8, 0x4d1);
+}
+
+int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp)
+{
+ int rc, irq, trigger, polarity;
+
+ if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
+ *irqp = gsi;
+ return 0;
+ }
+
+ rc = acpi_get_override_irq(gsi, &trigger, &polarity);
+ if (rc)
+ return rc;
+
+ trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
+ polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;
+ irq = acpi_register_gsi(NULL, gsi, trigger, polarity);
+ if (irq < 0)
+ return irq;
+
+ *irqp = irq;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
+
+int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
+{
+ if (isa_irq < nr_legacy_irqs() &&
+ isa_irq_to_gsi[isa_irq] != INVALID_ACPI_IRQ) {
+ *gsi = isa_irq_to_gsi[isa_irq];
+ return 0;
+ }
+
+ return -1;
+}
+
+static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
+ int trigger, int polarity)
+{
+#ifdef CONFIG_PCI
+ /*
+ * Make sure all (legacy) PCI IRQs are set as level-triggered.
+ */
+ if (trigger == ACPI_LEVEL_SENSITIVE)
+ elcr_set_level_irq(gsi);
+#endif
+
+ return gsi;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
+ int trigger, int polarity)
+{
+ int irq = gsi;
+#ifdef CONFIG_X86_IO_APIC
+ int node;
+ struct irq_alloc_info info;
+
+ node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
+ trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1;
+ polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1;
+ ioapic_set_alloc_attr(&info, node, trigger, polarity);
+
+ mutex_lock(&acpi_ioapic_lock);
+ irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info);
+ /* Don't set up the ACPI SCI because it's already set up */
+ if (irq >= 0 && enable_update_mptable && gsi != acpi_gbl_FADT.sci_interrupt)
+ mp_config_acpi_gsi(dev, gsi, trigger, polarity);
+ mutex_unlock(&acpi_ioapic_lock);
+#endif
+
+ return irq;
+}
+
+static void acpi_unregister_gsi_ioapic(u32 gsi)
+{
+#ifdef CONFIG_X86_IO_APIC
+ int irq;
+
+ mutex_lock(&acpi_ioapic_lock);
+ irq = mp_map_gsi_to_irq(gsi, 0, NULL);
+ if (irq > 0)
+ mp_unmap_irq(irq);
+ mutex_unlock(&acpi_ioapic_lock);
+#endif
+}
+#endif
+
+int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
+ int trigger, int polarity) = acpi_register_gsi_pic;
+void (*__acpi_unregister_gsi)(u32 gsi) = NULL;
+
+#ifdef CONFIG_ACPI_SLEEP
+int (*acpi_suspend_lowlevel)(void) = x86_acpi_suspend_lowlevel;
+#else
+int (*acpi_suspend_lowlevel)(void);
+#endif
+
+/*
+ * success: return IRQ number (>=0)
+ * failure: return < 0
+ */
+int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+{
+ return __acpi_register_gsi(dev, gsi, trigger, polarity);
+}
+EXPORT_SYMBOL_GPL(acpi_register_gsi);
+
+void acpi_unregister_gsi(u32 gsi)
+{
+ if (__acpi_unregister_gsi)
+ __acpi_unregister_gsi(gsi);
+}
+EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static void __init acpi_set_irq_model_ioapic(void)
+{
+ acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
+ __acpi_register_gsi = acpi_register_gsi_ioapic;
+ __acpi_unregister_gsi = acpi_unregister_gsi_ioapic;
+ acpi_ioapic = 1;
+}
+#endif
+
+/*
+ * ACPI based hotplug support for CPU
+ */
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+#include <acpi/processor.h>
+
+static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+{
+#ifdef CONFIG_ACPI_NUMA
+ int nid;
+
+ nid = acpi_get_node(handle);
+ if (nid != NUMA_NO_NODE) {
+ set_apicid_to_node(physid, nid);
+ numa_set_node(cpu, nid);
+ }
+#endif
+ return 0;
+}
+
+int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id,
+ int *pcpu)
+{
+ int cpu;
+
+ cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED);
+ if (cpu < 0) {
+ pr_info(PREFIX "Unable to map lapic to logical cpu number\n");
+ return cpu;
+ }
+
+ acpi_processor_set_pdc(handle);
+ acpi_map_cpu2node(handle, cpu, physid);
+
+ *pcpu = cpu;
+ return 0;
+}
+EXPORT_SYMBOL(acpi_map_cpu);
+
+int acpi_unmap_cpu(int cpu)
+{
+#ifdef CONFIG_ACPI_NUMA
+ set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
+#endif
+
+ per_cpu(x86_cpu_to_apicid, cpu) = -1;
+ set_cpu_present(cpu, false);
+ num_processors--;
+
+ return (0);
+}
+EXPORT_SYMBOL(acpi_unmap_cpu);
+#endif /* CONFIG_ACPI_HOTPLUG_CPU */
+
+int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
+{
+ int ret = -ENOSYS;
+#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
+ int ioapic_id;
+ u64 addr;
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_DYNAMIC,
+ .ops = &mp_ioapic_irqdomain_ops,
+ };
+
+ ioapic_id = acpi_get_ioapic_id(handle, gsi_base, &addr);
+ if (ioapic_id < 0) {
+ unsigned long long uid;
+ acpi_status status;
+
+ status = acpi_evaluate_integer(handle, METHOD_NAME__UID,
+ NULL, &uid);
+ if (ACPI_FAILURE(status)) {
+ acpi_handle_warn(handle, "failed to get IOAPIC ID.\n");
+ return -EINVAL;
+ }
+ ioapic_id = (int)uid;
+ }
+
+ mutex_lock(&acpi_ioapic_lock);
+ ret = mp_register_ioapic(ioapic_id, phys_addr, gsi_base, &cfg);
+ mutex_unlock(&acpi_ioapic_lock);
+#endif
+
+ return ret;
+}
+EXPORT_SYMBOL(acpi_register_ioapic);
+
+int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
+{
+ int ret = -ENOSYS;
+
+#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
+ mutex_lock(&acpi_ioapic_lock);
+ ret = mp_unregister_ioapic(gsi_base);
+ mutex_unlock(&acpi_ioapic_lock);
+#endif
+
+ return ret;
+}
+EXPORT_SYMBOL(acpi_unregister_ioapic);
+
+/**
+ * acpi_ioapic_registered - Check whether IOAPIC assoicatied with @gsi_base
+ * has been registered
+ * @handle: ACPI handle of the IOAPIC deivce
+ * @gsi_base: GSI base associated with the IOAPIC
+ *
+ * Assume caller holds some type of lock to serialize acpi_ioapic_registered()
+ * with acpi_register_ioapic()/acpi_unregister_ioapic().
+ */
+int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base)
+{
+ int ret = 0;
+
+#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
+ mutex_lock(&acpi_ioapic_lock);
+ ret = mp_ioapic_registered(gsi_base);
+ mutex_unlock(&acpi_ioapic_lock);
+#endif
+
+ return ret;
+}
+
+static int __init acpi_parse_sbf(struct acpi_table_header *table)
+{
+ struct acpi_table_boot *sb = (struct acpi_table_boot *)table;
+
+ sbf_port = sb->cmos_index; /* Save CMOS port */
+
+ return 0;
+}
+
+#ifdef CONFIG_HPET_TIMER
+#include <asm/hpet.h>
+
+static struct resource *hpet_res __initdata;
+
+static int __init acpi_parse_hpet(struct acpi_table_header *table)
+{
+ struct acpi_table_hpet *hpet_tbl = (struct acpi_table_hpet *)table;
+
+ if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) {
+ printk(KERN_WARNING PREFIX "HPET timers must be located in "
+ "memory.\n");
+ return -1;
+ }
+
+ hpet_address = hpet_tbl->address.address;
+ hpet_blockid = hpet_tbl->sequence;
+
+ /*
+ * Some broken BIOSes advertise HPET at 0x0. We really do not
+ * want to allocate a resource there.
+ */
+ if (!hpet_address) {
+ printk(KERN_WARNING PREFIX
+ "HPET id: %#x base: %#lx is invalid\n",
+ hpet_tbl->id, hpet_address);
+ return 0;
+ }
+#ifdef CONFIG_X86_64
+ /*
+ * Some even more broken BIOSes advertise HPET at
+ * 0xfed0000000000000 instead of 0xfed00000. Fix it up and add
+ * some noise:
+ */
+ if (hpet_address == 0xfed0000000000000UL) {
+ if (!hpet_force_user) {
+ printk(KERN_WARNING PREFIX "HPET id: %#x "
+ "base: 0xfed0000000000000 is bogus\n "
+ "try hpet=force on the kernel command line to "
+ "fix it up to 0xfed00000.\n", hpet_tbl->id);
+ hpet_address = 0;
+ return 0;
+ }
+ printk(KERN_WARNING PREFIX
+ "HPET id: %#x base: 0xfed0000000000000 fixed up "
+ "to 0xfed00000.\n", hpet_tbl->id);
+ hpet_address >>= 32;
+ }
+#endif
+ printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
+ hpet_tbl->id, hpet_address);
+
+ /*
+ * Allocate and initialize the HPET firmware resource for adding into
+ * the resource tree during the lateinit timeframe.
+ */
+#define HPET_RESOURCE_NAME_SIZE 9
+ hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE);
+
+ hpet_res->name = (void *)&hpet_res[1];
+ hpet_res->flags = IORESOURCE_MEM;
+ snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE, "HPET %u",
+ hpet_tbl->sequence);
+
+ hpet_res->start = hpet_address;
+ hpet_res->end = hpet_address + (1 * 1024) - 1;
+
+ return 0;
+}
+
+/*
+ * hpet_insert_resource inserts the HPET resources used into the resource
+ * tree.
+ */
+static __init int hpet_insert_resource(void)
+{
+ if (!hpet_res)
+ return 1;
+
+ return insert_resource(&iomem_resource, hpet_res);
+}
+
+late_initcall(hpet_insert_resource);
+
+#else
+#define acpi_parse_hpet NULL
+#endif
+
+static int __init acpi_parse_fadt(struct acpi_table_header *table)
+{
+ if (!(acpi_gbl_FADT.boot_flags & ACPI_FADT_LEGACY_DEVICES)) {
+ pr_debug("ACPI: no legacy devices present\n");
+ x86_platform.legacy.devices.pnpbios = 0;
+ }
+
+ if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
+ !(acpi_gbl_FADT.boot_flags & ACPI_FADT_8042) &&
+ x86_platform.legacy.i8042 != X86_LEGACY_I8042_PLATFORM_ABSENT) {
+ pr_debug("ACPI: i8042 controller is absent\n");
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_FIRMWARE_ABSENT;
+ }
+
+ if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) {
+ pr_debug("ACPI: not registering RTC platform device\n");
+ x86_platform.legacy.rtc = 0;
+ }
+
+ if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_VGA) {
+ pr_debug("ACPI: probing for VGA not safe\n");
+ x86_platform.legacy.no_vga = 1;
+ }
+
+#ifdef CONFIG_X86_PM_TIMER
+ /* detect the location of the ACPI PM Timer */
+ if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) {
+ /* FADT rev. 2 */
+ if (acpi_gbl_FADT.xpm_timer_block.space_id !=
+ ACPI_ADR_SPACE_SYSTEM_IO)
+ return 0;
+
+ pmtmr_ioport = acpi_gbl_FADT.xpm_timer_block.address;
+ /*
+ * "X" fields are optional extensions to the original V1.0
+ * fields, so we must selectively expand V1.0 fields if the
+ * corresponding X field is zero.
+ */
+ if (!pmtmr_ioport)
+ pmtmr_ioport = acpi_gbl_FADT.pm_timer_block;
+ } else {
+ /* FADT rev. 1 */
+ pmtmr_ioport = acpi_gbl_FADT.pm_timer_block;
+ }
+ if (pmtmr_ioport)
+ printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n",
+ pmtmr_ioport);
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Parse LAPIC entries in MADT
+ * returns 0 on success, < 0 on error
+ */
+
+static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
+{
+ int count;
+
+ if (!boot_cpu_has(X86_FEATURE_APIC))
+ return -ENODEV;
+
+ /*
+ * Note that the LAPIC address is obtained from the MADT (32-bit value)
+ * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
+ */
+
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
+ acpi_parse_lapic_addr_ovr, 0);
+ if (count < 0) {
+ printk(KERN_ERR PREFIX
+ "Error parsing LAPIC address override entry\n");
+ return count;
+ }
+
+ register_lapic_address(acpi_lapic_addr);
+
+ return count;
+}
+
+static int __init acpi_parse_madt_lapic_entries(void)
+{
+ int count;
+ int x2count = 0;
+ int ret;
+ struct acpi_subtable_proc madt_proc[2];
+
+ if (!boot_cpu_has(X86_FEATURE_APIC))
+ return -ENODEV;
+
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
+ acpi_parse_sapic, MAX_LOCAL_APIC);
+
+ if (!count) {
+ memset(madt_proc, 0, sizeof(madt_proc));
+ madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC;
+ madt_proc[0].handler = acpi_parse_lapic;
+ madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC;
+ madt_proc[1].handler = acpi_parse_x2apic;
+ ret = acpi_table_parse_entries_array(ACPI_SIG_MADT,
+ sizeof(struct acpi_table_madt),
+ madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC);
+ if (ret < 0) {
+ printk(KERN_ERR PREFIX
+ "Error parsing LAPIC/X2APIC entries\n");
+ return ret;
+ }
+
+ count = madt_proc[0].count;
+ x2count = madt_proc[1].count;
+ }
+ if (!count && !x2count) {
+ printk(KERN_ERR PREFIX "No LAPIC entries present\n");
+ /* TBD: Cleanup to allow fallback to MPS */
+ return -ENODEV;
+ } else if (count < 0 || x2count < 0) {
+ printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
+ /* TBD: Cleanup to allow fallback to MPS */
+ return count;
+ }
+
+ x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI,
+ acpi_parse_x2apic_nmi, 0);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI,
+ acpi_parse_lapic_nmi, 0);
+ if (count < 0 || x2count < 0) {
+ printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
+ /* TBD: Cleanup to allow fallback to MPS */
+ return count;
+ }
+ return 0;
+}
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_X86_IO_APIC
+static void __init mp_config_acpi_legacy_irqs(void)
+{
+ int i;
+ struct mpc_intsrc mp_irq;
+
+#ifdef CONFIG_EISA
+ /*
+ * Fabricate the legacy ISA bus (bus #31).
+ */
+ mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
+#endif
+ set_bit(MP_ISA_BUS, mp_bus_not_pci);
+ pr_debug("Bus #%d is ISA (nIRQs: %d)\n", MP_ISA_BUS, nr_legacy_irqs());
+
+ /*
+ * Use the default configuration for the IRQs 0-15. Unless
+ * overridden by (MADT) interrupt source override entries.
+ */
+ for (i = 0; i < nr_legacy_irqs(); i++) {
+ int ioapic, pin;
+ unsigned int dstapic;
+ int idx;
+ u32 gsi;
+
+ /* Locate the gsi that irq i maps to. */
+ if (acpi_isa_irq_to_gsi(i, &gsi))
+ continue;
+
+ /*
+ * Locate the IOAPIC that manages the ISA IRQ.
+ */
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ continue;
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ dstapic = mpc_ioapic_id(ioapic);
+
+ for (idx = 0; idx < mp_irq_entries; idx++) {
+ struct mpc_intsrc *irq = mp_irqs + idx;
+
+ /* Do we already have a mapping for this ISA IRQ? */
+ if (irq->srcbus == MP_ISA_BUS && irq->srcbusirq == i)
+ break;
+
+ /* Do we already have a mapping for this IOAPIC pin */
+ if (irq->dstapic == dstapic && irq->dstirq == pin)
+ break;
+ }
+
+ if (idx != mp_irq_entries) {
+ printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+ continue; /* IRQ already used */
+ }
+
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqflag = 0; /* Conforming */
+ mp_irq.srcbus = MP_ISA_BUS;
+ mp_irq.dstapic = dstapic;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.srcbusirq = i; /* Identity mapped */
+ mp_irq.dstirq = pin;
+
+ mp_save_irq(&mp_irq);
+ }
+}
+
+/*
+ * Parse IOAPIC related entries in MADT
+ * returns 0 on success, < 0 on error
+ */
+static int __init acpi_parse_madt_ioapic_entries(void)
+{
+ int count;
+
+ /*
+ * ACPI interpreter is required to complete interrupt setup,
+ * so if it is off, don't enumerate the io-apics with ACPI.
+ * If MPS is present, it will handle them,
+ * otherwise the system will stay in PIC mode
+ */
+ if (acpi_disabled || acpi_noirq)
+ return -ENODEV;
+
+ if (!boot_cpu_has(X86_FEATURE_APIC))
+ return -ENODEV;
+
+ /*
+ * if "noapic" boot option, don't look for IO-APICs
+ */
+ if (skip_ioapic_setup) {
+ printk(KERN_INFO PREFIX "Skipping IOAPIC probe "
+ "due to 'noapic' option.\n");
+ return -ENODEV;
+ }
+
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic,
+ MAX_IO_APICS);
+ if (!count) {
+ printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
+ return -ENODEV;
+ } else if (count < 0) {
+ printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n");
+ return count;
+ }
+
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
+ acpi_parse_int_src_ovr, nr_irqs);
+ if (count < 0) {
+ printk(KERN_ERR PREFIX
+ "Error parsing interrupt source overrides entry\n");
+ /* TBD: Cleanup to allow fallback to MPS */
+ return count;
+ }
+
+ /*
+ * If BIOS did not supply an INT_SRC_OVR for the SCI
+ * pretend we got one so we can set the SCI flags.
+ * But ignore setting up SCI on hardware reduced platforms.
+ */
+ if (acpi_sci_override_gsi == INVALID_ACPI_IRQ && !acpi_gbl_reduced_hardware)
+ acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0,
+ acpi_gbl_FADT.sci_interrupt);
+
+ /* Fill in identity legacy mappings where no override */
+ mp_config_acpi_legacy_irqs();
+
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE,
+ acpi_parse_nmi_src, nr_irqs);
+ if (count < 0) {
+ printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
+ /* TBD: Cleanup to allow fallback to MPS */
+ return count;
+ }
+
+ return 0;
+}
+#else
+static inline int acpi_parse_madt_ioapic_entries(void)
+{
+ return -1;
+}
+#endif /* !CONFIG_X86_IO_APIC */
+
+static void __init early_acpi_process_madt(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ int error;
+
+ if (!acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) {
+
+ /*
+ * Parse MADT LAPIC entries
+ */
+ error = early_acpi_parse_madt_lapic_addr_ovr();
+ if (!error) {
+ acpi_lapic = 1;
+ smp_found_config = 1;
+ }
+ if (error == -EINVAL) {
+ /*
+ * Dell Precision Workstation 410, 610 come here.
+ */
+ printk(KERN_ERR PREFIX
+ "Invalid BIOS MADT, disabling ACPI\n");
+ disable_acpi();
+ }
+ }
+#endif
+}
+
+static void __init acpi_process_madt(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ int error;
+
+ if (!acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) {
+
+ /*
+ * Parse MADT LAPIC entries
+ */
+ error = acpi_parse_madt_lapic_entries();
+ if (!error) {
+ acpi_lapic = 1;
+
+ /*
+ * Parse MADT IO-APIC entries
+ */
+ mutex_lock(&acpi_ioapic_lock);
+ error = acpi_parse_madt_ioapic_entries();
+ mutex_unlock(&acpi_ioapic_lock);
+ if (!error) {
+ acpi_set_irq_model_ioapic();
+
+ smp_found_config = 1;
+ }
+ }
+ if (error == -EINVAL) {
+ /*
+ * Dell Precision Workstation 410, 610 come here.
+ */
+ printk(KERN_ERR PREFIX
+ "Invalid BIOS MADT, disabling ACPI\n");
+ disable_acpi();
+ }
+ } else {
+ /*
+ * ACPI found no MADT, and so ACPI wants UP PIC mode.
+ * In the event an MPS table was found, forget it.
+ * Boot with "acpi=off" to use MPS on such a system.
+ */
+ if (smp_found_config) {
+ printk(KERN_WARNING PREFIX
+ "No APIC-table, disabling MPS\n");
+ smp_found_config = 0;
+ }
+ }
+
+ /*
+ * ACPI supports both logical (e.g. Hyper-Threading) and physical
+ * processors, where MPS only supports physical.
+ */
+ if (acpi_lapic && acpi_ioapic)
+ printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
+ "information\n");
+ else if (acpi_lapic)
+ printk(KERN_INFO "Using ACPI for processor (LAPIC) "
+ "configuration information\n");
+#endif
+ return;
+}
+
+static int __init disable_acpi_irq(const struct dmi_system_id *d)
+{
+ if (!acpi_force) {
+ printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n",
+ d->ident);
+ acpi_noirq_set();
+ }
+ return 0;
+}
+
+static int __init disable_acpi_pci(const struct dmi_system_id *d)
+{
+ if (!acpi_force) {
+ printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n",
+ d->ident);
+ acpi_disable_pci();
+ }
+ return 0;
+}
+
+static int __init disable_acpi_xsdt(const struct dmi_system_id *d)
+{
+ if (!acpi_force) {
+ pr_notice("%s detected: force use of acpi=rsdt\n", d->ident);
+ acpi_gbl_do_not_use_xsdt = TRUE;
+ } else {
+ pr_notice("Warning: DMI blacklist says broken, but acpi XSDT forced\n");
+ }
+ return 0;
+}
+
+static int __init dmi_disable_acpi(const struct dmi_system_id *d)
+{
+ if (!acpi_force) {
+ printk(KERN_NOTICE "%s detected: acpi off\n", d->ident);
+ disable_acpi();
+ } else {
+ printk(KERN_NOTICE
+ "Warning: DMI blacklist says broken, but acpi forced\n");
+ }
+ return 0;
+}
+
+/*
+ * Force ignoring BIOS IRQ0 override
+ */
+static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
+{
+ if (!acpi_skip_timer_override) {
+ pr_notice("%s detected: Ignoring BIOS IRQ0 override\n",
+ d->ident);
+ acpi_skip_timer_override = 1;
+ }
+ return 0;
+}
+
+/*
+ * ACPI offers an alternative platform interface model that removes
+ * ACPI hardware requirements for platforms that do not implement
+ * the PC Architecture.
+ *
+ * We initialize the Hardware-reduced ACPI model here:
+ */
+void __init acpi_generic_reduced_hw_init(void)
+{
+ /*
+ * Override x86_init functions and bypass legacy PIC in
+ * hardware reduced ACPI mode.
+ */
+ x86_init.timers.timer_init = x86_init_noop;
+ x86_init.irqs.pre_vector_init = x86_init_noop;
+ legacy_pic = &null_legacy_pic;
+}
+
+static void __init acpi_reduced_hw_init(void)
+{
+ if (acpi_gbl_reduced_hardware)
+ x86_init.acpi.reduced_hw_early_init();
+}
+
+/*
+ * If your system is blacklisted here, but you find that acpi=force
+ * works for you, please contact linux-acpi@vger.kernel.org
+ */
+static const struct dmi_system_id acpi_dmi_table[] __initconst = {
+ /*
+ * Boxes that need ACPI disabled
+ */
+ {
+ .callback = dmi_disable_acpi,
+ .ident = "IBM Thinkpad",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
+ DMI_MATCH(DMI_BOARD_NAME, "2629H1G"),
+ },
+ },
+
+ /*
+ * Boxes that need ACPI PCI IRQ routing disabled
+ */
+ {
+ .callback = disable_acpi_irq,
+ .ident = "ASUS A7V",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"),
+ DMI_MATCH(DMI_BOARD_NAME, "<A7V>"),
+ /* newer BIOS, Revision 1011, does work */
+ DMI_MATCH(DMI_BIOS_VERSION,
+ "ASUS A7V ACPI BIOS Revision 1007"),
+ },
+ },
+ {
+ /*
+ * Latest BIOS for IBM 600E (1.16) has bad pcinum
+ * for LPC bridge, which is needed for the PCI
+ * interrupt links to work. DSDT fix is in bug 5966.
+ * 2645, 2646 model numbers are shared with 600/600E/600X
+ */
+ .callback = disable_acpi_irq,
+ .ident = "IBM Thinkpad 600 Series 2645",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
+ DMI_MATCH(DMI_BOARD_NAME, "2645"),
+ },
+ },
+ {
+ .callback = disable_acpi_irq,
+ .ident = "IBM Thinkpad 600 Series 2646",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
+ DMI_MATCH(DMI_BOARD_NAME, "2646"),
+ },
+ },
+ /*
+ * Boxes that need ACPI PCI IRQ routing and PCI scan disabled
+ */
+ { /* _BBN 0 bug */
+ .callback = disable_acpi_pci,
+ .ident = "ASUS PR-DLS",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
+ DMI_MATCH(DMI_BOARD_NAME, "PR-DLS"),
+ DMI_MATCH(DMI_BIOS_VERSION,
+ "ASUS PR-DLS ACPI BIOS Revision 1010"),
+ DMI_MATCH(DMI_BIOS_DATE, "03/21/2003")
+ },
+ },
+ {
+ .callback = disable_acpi_pci,
+ .ident = "Acer TravelMate 36x Laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
+ },
+ },
+ /*
+ * Boxes that need ACPI XSDT use disabled due to corrupted tables
+ */
+ {
+ .callback = disable_acpi_xsdt,
+ .ident = "Advantech DAC-BJ01",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "NEC"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Bearlake CRB Board"),
+ DMI_MATCH(DMI_BIOS_VERSION, "V1.12"),
+ DMI_MATCH(DMI_BIOS_DATE, "02/01/2011"),
+ },
+ },
+ {}
+};
+
+/* second table for DMI checks that should run after early-quirks */
+static const struct dmi_system_id acpi_dmi_table_late[] __initconst = {
+ /*
+ * HP laptops which use a DSDT reporting as HP/SB400/10000,
+ * which includes some code which overrides all temperature
+ * trip points to 16C if the INTIN2 input of the I/O APIC
+ * is enabled. This input is incorrectly designated the
+ * ISA IRQ 0 via an interrupt source override even though
+ * it is wired to the output of the master 8259A and INTIN0
+ * is not connected at all. Force ignoring BIOS IRQ0
+ * override in that cases.
+ */
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "HP nx6115 laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"),
+ },
+ },
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "HP NX6125 laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6125"),
+ },
+ },
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "HP NX6325 laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
+ },
+ },
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "HP 6715b laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
+ },
+ },
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "FUJITSU SIEMENS",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"),
+ },
+ },
+ {}
+};
+
+/*
+ * acpi_boot_table_init() and acpi_boot_init()
+ * called from setup_arch(), always.
+ * 1. checksums all tables
+ * 2. enumerates lapics
+ * 3. enumerates io-apics
+ *
+ * acpi_table_init() is separate to allow reading SRAT without
+ * other side effects.
+ *
+ * side effects of acpi_boot_init:
+ * acpi_lapic = 1 if LAPIC found
+ * acpi_ioapic = 1 if IOAPIC found
+ * if (acpi_lapic && acpi_ioapic) smp_found_config = 1;
+ * if acpi_blacklisted() acpi_disabled = 1;
+ * acpi_irq_model=...
+ * ...
+ */
+
+void __init acpi_boot_table_init(void)
+{
+ dmi_check_system(acpi_dmi_table);
+
+ /*
+ * If acpi_disabled, bail out
+ */
+ if (acpi_disabled)
+ return;
+
+ /*
+ * Initialize the ACPI boot-time table parser.
+ */
+ if (acpi_locate_initial_tables())
+ disable_acpi();
+ else
+ acpi_reserve_initial_tables();
+}
+
+int __init early_acpi_boot_init(void)
+{
+ if (acpi_disabled)
+ return 1;
+
+ acpi_table_init_complete();
+
+ acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
+
+ /*
+ * blacklist may disable ACPI entirely
+ */
+ if (acpi_blacklisted()) {
+ if (acpi_force) {
+ printk(KERN_WARNING PREFIX "acpi=force override\n");
+ } else {
+ printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
+ disable_acpi();
+ return 1;
+ }
+ }
+
+ /*
+ * Process the Multiple APIC Description Table (MADT), if present
+ */
+ early_acpi_process_madt();
+
+ /*
+ * Hardware-reduced ACPI mode initialization:
+ */
+ acpi_reduced_hw_init();
+
+ return 0;
+}
+
+int __init acpi_boot_init(void)
+{
+ /* those are executed after early-quirks are executed */
+ dmi_check_system(acpi_dmi_table_late);
+
+ /*
+ * If acpi_disabled, bail out
+ */
+ if (acpi_disabled)
+ return 1;
+
+ acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
+
+ /*
+ * set sci_int and PM timer address
+ */
+ acpi_table_parse(ACPI_SIG_FADT, acpi_parse_fadt);
+
+ /*
+ * Process the Multiple APIC Description Table (MADT), if present
+ */
+ acpi_process_madt();
+
+ acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet);
+ if (IS_ENABLED(CONFIG_ACPI_BGRT))
+ acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt);
+
+ if (!acpi_noirq)
+ x86_init.pci.init = pci_acpi_init;
+
+ /* Do not enable ACPI SPCR console by default */
+ acpi_parse_spcr(earlycon_acpi_spcr_enable, false);
+ return 0;
+}
+
+static int __init parse_acpi(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ /* "acpi=off" disables both ACPI table parsing and interpreter */
+ if (strcmp(arg, "off") == 0) {
+ disable_acpi();
+ }
+ /* acpi=force to over-ride black-list */
+ else if (strcmp(arg, "force") == 0) {
+ acpi_force = 1;
+ acpi_disabled = 0;
+ }
+ /* acpi=strict disables out-of-spec workarounds */
+ else if (strcmp(arg, "strict") == 0) {
+ acpi_strict = 1;
+ }
+ /* acpi=rsdt use RSDT instead of XSDT */
+ else if (strcmp(arg, "rsdt") == 0) {
+ acpi_gbl_do_not_use_xsdt = TRUE;
+ }
+ /* "acpi=noirq" disables ACPI interrupt routing */
+ else if (strcmp(arg, "noirq") == 0) {
+ acpi_noirq_set();
+ }
+ /* "acpi=copy_dsdt" copys DSDT */
+ else if (strcmp(arg, "copy_dsdt") == 0) {
+ acpi_gbl_copy_dsdt_locally = 1;
+ }
+ /* "acpi=nocmcff" disables FF mode for corrected errors */
+ else if (strcmp(arg, "nocmcff") == 0) {
+ acpi_disable_cmcff = 1;
+ } else {
+ /* Core will printk when we return error. */
+ return -EINVAL;
+ }
+ return 0;
+}
+early_param("acpi", parse_acpi);
+
+/* FIXME: Using pci= for an ACPI parameter is a travesty. */
+static int __init parse_pci(char *arg)
+{
+ if (arg && strcmp(arg, "noacpi") == 0)
+ acpi_disable_pci();
+ return 0;
+}
+early_param("pci", parse_pci);
+
+int __init acpi_mps_check(void)
+{
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE)
+/* mptable code is not built-in*/
+ if (acpi_disabled || acpi_noirq) {
+ printk(KERN_WARNING "MPS support code is not built-in.\n"
+ "Using acpi=off or acpi=noirq or pci=noacpi "
+ "may have problem\n");
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_X86_IO_APIC
+static int __init parse_acpi_skip_timer_override(char *arg)
+{
+ acpi_skip_timer_override = 1;
+ return 0;
+}
+early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override);
+
+static int __init parse_acpi_use_timer_override(char *arg)
+{
+ acpi_use_timer_override = 1;
+ return 0;
+}
+early_param("acpi_use_timer_override", parse_acpi_use_timer_override);
+#endif /* CONFIG_X86_IO_APIC */
+
+static int __init setup_acpi_sci(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ if (!strcmp(s, "edge"))
+ acpi_sci_flags = ACPI_MADT_TRIGGER_EDGE |
+ (acpi_sci_flags & ~ACPI_MADT_TRIGGER_MASK);
+ else if (!strcmp(s, "level"))
+ acpi_sci_flags = ACPI_MADT_TRIGGER_LEVEL |
+ (acpi_sci_flags & ~ACPI_MADT_TRIGGER_MASK);
+ else if (!strcmp(s, "high"))
+ acpi_sci_flags = ACPI_MADT_POLARITY_ACTIVE_HIGH |
+ (acpi_sci_flags & ~ACPI_MADT_POLARITY_MASK);
+ else if (!strcmp(s, "low"))
+ acpi_sci_flags = ACPI_MADT_POLARITY_ACTIVE_LOW |
+ (acpi_sci_flags & ~ACPI_MADT_POLARITY_MASK);
+ else
+ return -EINVAL;
+ return 0;
+}
+early_param("acpi_sci", setup_acpi_sci);
+
+int __acpi_acquire_global_lock(unsigned int *lock)
+{
+ unsigned int old, new, val;
+ do {
+ old = *lock;
+ new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1));
+ val = cmpxchg(lock, old, new);
+ } while (unlikely (val != old));
+ return ((new & 0x3) < 3) ? -1 : 0;
+}
+
+int __acpi_release_global_lock(unsigned int *lock)
+{
+ unsigned int old, new, val;
+ do {
+ old = *lock;
+ new = old & ~0x3;
+ val = cmpxchg(lock, old, new);
+ } while (unlikely (val != old));
+ return old & 0x1;
+}
+
+void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size)
+{
+ e820__range_add(addr, size, E820_TYPE_ACPI);
+ e820__update_table_print();
+}
diff --git a/arch/x86/kernel/acpi/cppc_msr.c b/arch/x86/kernel/acpi/cppc_msr.c
new file mode 100644
index 000000000..6fb478bf8
--- /dev/null
+++ b/arch/x86/kernel/acpi/cppc_msr.c
@@ -0,0 +1,58 @@
+/*
+ * cppc_msr.c: MSR Interface for CPPC
+ * Copyright (c) 2016, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <acpi/cppc_acpi.h>
+#include <asm/msr.h>
+
+/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */
+
+bool cpc_ffh_supported(void)
+{
+ return true;
+}
+
+int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val)
+{
+ int err;
+
+ err = rdmsrl_safe_on_cpu(cpunum, reg->address, val);
+ if (!err) {
+ u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
+ reg->bit_offset);
+
+ *val &= mask;
+ *val >>= reg->bit_offset;
+ }
+ return err;
+}
+
+int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
+{
+ u64 rd_val;
+ int err;
+
+ err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val);
+ if (!err) {
+ u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
+ reg->bit_offset);
+
+ val <<= reg->bit_offset;
+ val &= mask;
+ rd_val &= ~mask;
+ rd_val |= val;
+ err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val);
+ }
+ return err;
+}
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
new file mode 100644
index 000000000..92539a1c3
--- /dev/null
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright (C) 2005 Intel Corporation
+ * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * - Added _PDC for SMP C-states on Intel CPUs
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/acpi.h>
+#include <linux/cpu.h>
+#include <linux/sched.h>
+
+#include <acpi/processor.h>
+#include <asm/mwait.h>
+#include <asm/special_insns.h>
+
+/*
+ * Initialize bm_flags based on the CPU cache properties
+ * On SMP it depends on cache configuration
+ * - When cache is not shared among all CPUs, we flush cache
+ * before entering C3.
+ * - When cache is shared among all CPUs, we use bm_check
+ * mechanism as in UP case
+ *
+ * This routine is called only after all the CPUs are online
+ */
+void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
+ unsigned int cpu)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ flags->bm_check = 0;
+ if (num_online_cpus() == 1)
+ flags->bm_check = 1;
+ else if (c->x86_vendor == X86_VENDOR_INTEL) {
+ /*
+ * Today all MP CPUs that support C3 share cache.
+ * And caches should not be flushed by software while
+ * entering C3 type state.
+ */
+ flags->bm_check = 1;
+ }
+
+ /*
+ * On all recent Intel platforms, ARB_DISABLE is a nop.
+ * So, set bm_control to zero to indicate that ARB_DISABLE
+ * is not required while entering C3 type state on
+ * P4, Core and beyond CPUs
+ */
+ if (c->x86_vendor == X86_VENDOR_INTEL &&
+ (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f)))
+ flags->bm_control = 0;
+}
+EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
+
+/* The code below handles cstate entry with monitor-mwait pair on Intel*/
+
+struct cstate_entry {
+ struct {
+ unsigned int eax;
+ unsigned int ecx;
+ } states[ACPI_PROCESSOR_MAX_POWER];
+};
+static struct cstate_entry __percpu *cpu_cstate_entry; /* per CPU ptr */
+
+static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
+
+#define NATIVE_CSTATE_BEYOND_HALT (2)
+
+static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
+{
+ struct acpi_processor_cx *cx = _cx;
+ long retval;
+ unsigned int eax, ebx, ecx, edx;
+ unsigned int edx_part;
+ unsigned int cstate_type; /* C-state type and not ACPI C-state type */
+ unsigned int num_cstate_subtype;
+
+ cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
+
+ /* Check whether this particular cx_type (in CST) is supported or not */
+ cstate_type = ((cx->address >> MWAIT_SUBSTATE_SIZE) &
+ MWAIT_CSTATE_MASK) + 1;
+ edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
+ num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
+
+ retval = 0;
+ /* If the HW does not support any sub-states in this C-state */
+ if (num_cstate_subtype == 0) {
+ pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n",
+ cx->address, edx_part);
+ retval = -1;
+ goto out;
+ }
+
+ /* mwait ecx extensions INTERRUPT_BREAK should be supported for C2/C3 */
+ if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
+ !(ecx & CPUID5_ECX_INTERRUPT_BREAK)) {
+ retval = -1;
+ goto out;
+ }
+
+ if (!mwait_supported[cstate_type]) {
+ mwait_supported[cstate_type] = 1;
+ printk(KERN_DEBUG
+ "Monitor-Mwait will be used to enter C-%d state\n",
+ cx->type);
+ }
+ snprintf(cx->desc,
+ ACPI_CX_DESC_LEN, "ACPI FFH MWAIT 0x%x",
+ cx->address);
+out:
+ return retval;
+}
+
+int acpi_processor_ffh_cstate_probe(unsigned int cpu,
+ struct acpi_processor_cx *cx, struct acpi_power_register *reg)
+{
+ struct cstate_entry *percpu_entry;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ long retval;
+
+ if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF)
+ return -1;
+
+ if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT)
+ return -1;
+
+ percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu);
+ percpu_entry->states[cx->index].eax = 0;
+ percpu_entry->states[cx->index].ecx = 0;
+
+ /* Make sure we are running on right CPU */
+
+ retval = call_on_cpu(cpu, acpi_processor_ffh_cstate_probe_cpu, cx,
+ false);
+ if (retval == 0) {
+ /* Use the hint in CST */
+ percpu_entry->states[cx->index].eax = cx->address;
+ percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK;
+ }
+
+ /*
+ * For _CST FFH on Intel, if GAS.access_size bit 1 is cleared,
+ * then we should skip checking BM_STS for this C-state.
+ * ref: "Intel Processor Vendor-Specific ACPI Interface Specification"
+ */
+ if ((c->x86_vendor == X86_VENDOR_INTEL) && !(reg->access_size & 0x2))
+ cx->bm_sts_skip = 1;
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
+
+void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
+{
+ unsigned int cpu = smp_processor_id();
+ struct cstate_entry *percpu_entry;
+
+ percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu);
+ mwait_idle_with_hints(percpu_entry->states[cx->index].eax,
+ percpu_entry->states[cx->index].ecx);
+}
+EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_enter);
+
+static int __init ffh_cstate_init(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (c->x86_vendor != X86_VENDOR_INTEL &&
+ c->x86_vendor != X86_VENDOR_AMD)
+ return -1;
+
+ cpu_cstate_entry = alloc_percpu(struct cstate_entry);
+ return 0;
+}
+
+static void __exit ffh_cstate_exit(void)
+{
+ free_percpu(cpu_cstate_entry);
+ cpu_cstate_entry = NULL;
+}
+
+arch_initcall(ffh_cstate_init);
+__exitcall(ffh_cstate_exit);
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
new file mode 100644
index 000000000..f1915b744
--- /dev/null
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * sleep.c - x86-specific ACPI sleep support.
+ *
+ * Copyright (C) 2001-2003 Patrick Mochel
+ * Copyright (C) 2001-2003 Pavel Machek <pavel@ucw.cz>
+ */
+
+#include <linux/acpi.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/dmi.h>
+#include <linux/cpumask.h>
+#include <asm/segment.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
+#include <asm/realmode.h>
+
+#include <linux/ftrace.h>
+#include "../../realmode/rm/wakeup.h"
+#include "sleep.h"
+
+unsigned long acpi_realmode_flags;
+
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
+static char temp_stack[4096];
+#endif
+
+/**
+ * x86_acpi_enter_sleep_state - enter sleep state
+ * @state: Sleep state to enter.
+ *
+ * Wrapper around acpi_enter_sleep_state() to be called by assmebly.
+ */
+acpi_status asmlinkage __visible x86_acpi_enter_sleep_state(u8 state)
+{
+ return acpi_enter_sleep_state(state);
+}
+
+/**
+ * x86_acpi_suspend_lowlevel - save kernel state
+ *
+ * Create an identity mapped page table and copy the wakeup routine to
+ * low memory.
+ */
+int x86_acpi_suspend_lowlevel(void)
+{
+ struct wakeup_header *header =
+ (struct wakeup_header *) __va(real_mode_header->wakeup_header);
+
+ if (header->signature != WAKEUP_HEADER_SIGNATURE) {
+ printk(KERN_ERR "wakeup header does not match\n");
+ return -EINVAL;
+ }
+
+ header->video_mode = saved_video_mode;
+
+ header->pmode_behavior = 0;
+
+#ifndef CONFIG_64BIT
+ native_store_gdt((struct desc_ptr *)&header->pmode_gdt);
+
+ /*
+ * We have to check that we can write back the value, and not
+ * just read it. At least on 90 nm Pentium M (Family 6, Model
+ * 13), reading an invalid MSR is not guaranteed to trap, see
+ * Erratum X4 in "Intel Pentium M Processor on 90 nm Process
+ * with 2-MB L2 Cache and Intel® Processor A100 and A110 on 90
+ * nm process with 512-KB L2 Cache Specification Update".
+ */
+ if (!rdmsr_safe(MSR_EFER,
+ &header->pmode_efer_low,
+ &header->pmode_efer_high) &&
+ !wrmsr_safe(MSR_EFER,
+ header->pmode_efer_low,
+ header->pmode_efer_high))
+ header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_EFER);
+#endif /* !CONFIG_64BIT */
+
+ header->pmode_cr0 = read_cr0();
+ if (__this_cpu_read(cpu_info.cpuid_level) >= 0) {
+ header->pmode_cr4 = __read_cr4();
+ header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4);
+ }
+ if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
+ &header->pmode_misc_en_low,
+ &header->pmode_misc_en_high) &&
+ !wrmsr_safe(MSR_IA32_MISC_ENABLE,
+ header->pmode_misc_en_low,
+ header->pmode_misc_en_high))
+ header->pmode_behavior |=
+ (1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE);
+ header->realmode_flags = acpi_realmode_flags;
+ header->real_magic = 0x12345678;
+
+#ifndef CONFIG_64BIT
+ header->pmode_entry = (u32)&wakeup_pmode_return;
+ header->pmode_cr3 = (u32)__pa_symbol(initial_page_table);
+ saved_magic = 0x12345678;
+#else /* CONFIG_64BIT */
+#ifdef CONFIG_SMP
+ initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
+ early_gdt_descr.address =
+ (unsigned long)get_cpu_gdt_rw(smp_processor_id());
+ initial_gs = per_cpu_offset(smp_processor_id());
+#endif
+ initial_code = (unsigned long)wakeup_long64;
+ saved_magic = 0x123456789abcdef0L;
+#endif /* CONFIG_64BIT */
+
+ /*
+ * Pause/unpause graph tracing around do_suspend_lowlevel as it has
+ * inconsistent call/return info after it jumps to the wakeup vector.
+ */
+ pause_graph_tracing();
+ do_suspend_lowlevel();
+ unpause_graph_tracing();
+ return 0;
+}
+
+static int __init acpi_sleep_setup(char *str)
+{
+ while ((str != NULL) && (*str != '\0')) {
+ if (strncmp(str, "s3_bios", 7) == 0)
+ acpi_realmode_flags |= 1;
+ if (strncmp(str, "s3_mode", 7) == 0)
+ acpi_realmode_flags |= 2;
+ if (strncmp(str, "s3_beep", 7) == 0)
+ acpi_realmode_flags |= 4;
+#ifdef CONFIG_HIBERNATION
+ if (strncmp(str, "s4_nohwsig", 10) == 0)
+ acpi_no_s4_hw_signature();
+#endif
+ if (strncmp(str, "nonvs", 5) == 0)
+ acpi_nvs_nosave();
+ if (strncmp(str, "nonvs_s3", 8) == 0)
+ acpi_nvs_nosave_s3();
+ if (strncmp(str, "old_ordering", 12) == 0)
+ acpi_old_suspend_ordering();
+ if (strncmp(str, "nobl", 4) == 0)
+ acpi_sleep_no_blacklist();
+ str = strchr(str, ',');
+ if (str != NULL)
+ str += strspn(str, ", \t");
+ }
+ return 1;
+}
+
+__setup("acpi_sleep=", acpi_sleep_setup);
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
new file mode 100644
index 000000000..fbb60ca42
--- /dev/null
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Variables and functions used by the code in sleep.c
+ */
+
+#include <asm/realmode.h>
+
+extern unsigned long saved_video_mode;
+extern long saved_magic;
+
+extern int wakeup_pmode_return;
+
+extern u8 wake_sleep_flags;
+
+extern unsigned long acpi_copy_wakeup_routine(unsigned long);
+extern void wakeup_long64(void);
+
+extern void do_suspend_lowlevel(void);
+
+extern int x86_acpi_suspend_lowlevel(void);
+
+acpi_status asmlinkage x86_acpi_enter_sleep_state(u8 state);
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
new file mode 100644
index 000000000..0c26b1b44
--- /dev/null
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -0,0 +1,99 @@
+ .text
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page_types.h>
+
+# Copyright 2003, 2008 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
+
+ .code32
+ ALIGN
+
+ENTRY(wakeup_pmode_return)
+wakeup_pmode_return:
+ movw $__KERNEL_DS, %ax
+ movw %ax, %ss
+ movw %ax, %fs
+ movw %ax, %gs
+
+ movw $__USER_DS, %ax
+ movw %ax, %ds
+ movw %ax, %es
+
+ # reload the gdt, as we need the full 32 bit address
+ lidt saved_idt
+ lldt saved_ldt
+ ljmp $(__KERNEL_CS), $1f
+1:
+ movl %cr3, %eax
+ movl %eax, %cr3
+ wbinvd
+
+ # and restore the stack ... but you need gdt for this to work
+ movl saved_context_esp, %esp
+
+ movl %cs:saved_magic, %eax
+ cmpl $0x12345678, %eax
+ jne bogus_magic
+
+ # jump to place where we left off
+ movl saved_eip, %eax
+ jmp *%eax
+
+bogus_magic:
+ jmp bogus_magic
+
+
+
+save_registers:
+ sidt saved_idt
+ sldt saved_ldt
+ str saved_tss
+
+ leal 4(%esp), %eax
+ movl %eax, saved_context_esp
+ movl %ebx, saved_context_ebx
+ movl %ebp, saved_context_ebp
+ movl %esi, saved_context_esi
+ movl %edi, saved_context_edi
+ pushfl
+ popl saved_context_eflags
+
+ movl $ret_point, saved_eip
+ ret
+
+
+restore_registers:
+ movl saved_context_ebp, %ebp
+ movl saved_context_ebx, %ebx
+ movl saved_context_esi, %esi
+ movl saved_context_edi, %edi
+ pushl saved_context_eflags
+ popfl
+ ret
+
+ENTRY(do_suspend_lowlevel)
+ call save_processor_state
+ call save_registers
+ pushl $3
+ call x86_acpi_enter_sleep_state
+ addl $4, %esp
+
+# In case of S3 failure, we'll emerge here. Jump
+# to ret_point to recover
+ jmp ret_point
+ .p2align 4,,7
+ret_point:
+ call restore_registers
+ call restore_processor_state
+ ret
+
+.data
+ALIGN
+ENTRY(saved_magic) .long 0
+ENTRY(saved_eip) .long 0
+
+# saved registers
+saved_idt: .long 0,0
+saved_ldt: .long 0
+saved_tss: .long 0
+
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
new file mode 100644
index 000000000..50b8ed031
--- /dev/null
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -0,0 +1,136 @@
+.text
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/pgtable_types.h>
+#include <asm/page_types.h>
+#include <asm/msr.h>
+#include <asm/asm-offsets.h>
+#include <asm/frame.h>
+
+# Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
+
+.code64
+ /*
+ * Hooray, we are in Long 64-bit mode (but still running in low memory)
+ */
+ENTRY(wakeup_long64)
+ movq saved_magic, %rax
+ movq $0x123456789abcdef0, %rdx
+ cmpq %rdx, %rax
+ jne bogus_64_magic
+
+ movw $__KERNEL_DS, %ax
+ movw %ax, %ss
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %fs
+ movw %ax, %gs
+ movq saved_rsp, %rsp
+
+ movq saved_rbx, %rbx
+ movq saved_rdi, %rdi
+ movq saved_rsi, %rsi
+ movq saved_rbp, %rbp
+
+ movq saved_rip, %rax
+ jmp *%rax
+ENDPROC(wakeup_long64)
+
+bogus_64_magic:
+ jmp bogus_64_magic
+
+ENTRY(do_suspend_lowlevel)
+ FRAME_BEGIN
+ subq $8, %rsp
+ xorl %eax, %eax
+ call save_processor_state
+
+ movq $saved_context, %rax
+ movq %rsp, pt_regs_sp(%rax)
+ movq %rbp, pt_regs_bp(%rax)
+ movq %rsi, pt_regs_si(%rax)
+ movq %rdi, pt_regs_di(%rax)
+ movq %rbx, pt_regs_bx(%rax)
+ movq %rcx, pt_regs_cx(%rax)
+ movq %rdx, pt_regs_dx(%rax)
+ movq %r8, pt_regs_r8(%rax)
+ movq %r9, pt_regs_r9(%rax)
+ movq %r10, pt_regs_r10(%rax)
+ movq %r11, pt_regs_r11(%rax)
+ movq %r12, pt_regs_r12(%rax)
+ movq %r13, pt_regs_r13(%rax)
+ movq %r14, pt_regs_r14(%rax)
+ movq %r15, pt_regs_r15(%rax)
+ pushfq
+ popq pt_regs_flags(%rax)
+
+ movq $.Lresume_point, saved_rip(%rip)
+
+ movq %rsp, saved_rsp
+ movq %rbp, saved_rbp
+ movq %rbx, saved_rbx
+ movq %rdi, saved_rdi
+ movq %rsi, saved_rsi
+
+ addq $8, %rsp
+ movl $3, %edi
+ xorl %eax, %eax
+ call x86_acpi_enter_sleep_state
+ /* in case something went wrong, restore the machine status and go on */
+ jmp .Lresume_point
+
+ .align 4
+.Lresume_point:
+ /* We don't restore %rax, it must be 0 anyway */
+ movq $saved_context, %rax
+ movq saved_context_cr4(%rax), %rbx
+ movq %rbx, %cr4
+ movq saved_context_cr3(%rax), %rbx
+ movq %rbx, %cr3
+ movq saved_context_cr2(%rax), %rbx
+ movq %rbx, %cr2
+ movq saved_context_cr0(%rax), %rbx
+ movq %rbx, %cr0
+ pushq pt_regs_flags(%rax)
+ popfq
+ movq pt_regs_sp(%rax), %rsp
+ movq pt_regs_bp(%rax), %rbp
+ movq pt_regs_si(%rax), %rsi
+ movq pt_regs_di(%rax), %rdi
+ movq pt_regs_bx(%rax), %rbx
+ movq pt_regs_cx(%rax), %rcx
+ movq pt_regs_dx(%rax), %rdx
+ movq pt_regs_r8(%rax), %r8
+ movq pt_regs_r9(%rax), %r9
+ movq pt_regs_r10(%rax), %r10
+ movq pt_regs_r11(%rax), %r11
+ movq pt_regs_r12(%rax), %r12
+ movq pt_regs_r13(%rax), %r13
+ movq pt_regs_r14(%rax), %r14
+ movq pt_regs_r15(%rax), %r15
+
+#ifdef CONFIG_KASAN
+ /*
+ * The suspend path may have poisoned some areas deeper in the stack,
+ * which we now need to unpoison.
+ */
+ movq %rsp, %rdi
+ call kasan_unpoison_task_stack_below
+#endif
+
+ xorl %eax, %eax
+ addq $8, %rsp
+ FRAME_END
+ jmp restore_processor_state
+ENDPROC(do_suspend_lowlevel)
+
+.data
+ENTRY(saved_rbp) .quad 0
+ENTRY(saved_rsi) .quad 0
+ENTRY(saved_rdi) .quad 0
+ENTRY(saved_rbx) .quad 0
+
+ENTRY(saved_rip) .quad 0
+ENTRY(saved_rsp) .quad 0
+
+ENTRY(saved_magic) .quad 0
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
new file mode 100644
index 000000000..918a23704
--- /dev/null
+++ b/arch/x86/kernel/alternative.c
@@ -0,0 +1,845 @@
+#define pr_fmt(fmt) "SMP alternatives: " fmt
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/stringify.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/memory.h>
+#include <linux/stop_machine.h>
+#include <linux/slab.h>
+#include <linux/kdebug.h>
+#include <asm/text-patching.h>
+#include <asm/alternative.h>
+#include <asm/sections.h>
+#include <asm/pgtable.h>
+#include <asm/mce.h>
+#include <asm/nmi.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+#include <asm/fixmap.h>
+
+int __read_mostly alternatives_patched;
+
+EXPORT_SYMBOL_GPL(alternatives_patched);
+
+#define MAX_PATCH_LEN (255-1)
+
+static int __initdata_or_module debug_alternative;
+
+static int __init debug_alt(char *str)
+{
+ debug_alternative = 1;
+ return 1;
+}
+__setup("debug-alternative", debug_alt);
+
+static int noreplace_smp;
+
+static int __init setup_noreplace_smp(char *str)
+{
+ noreplace_smp = 1;
+ return 1;
+}
+__setup("noreplace-smp", setup_noreplace_smp);
+
+#define DPRINTK(fmt, args...) \
+do { \
+ if (debug_alternative) \
+ printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \
+} while (0)
+
+#define DUMP_BYTES(buf, len, fmt, args...) \
+do { \
+ if (unlikely(debug_alternative)) { \
+ int j; \
+ \
+ if (!(len)) \
+ break; \
+ \
+ printk(KERN_DEBUG fmt, ##args); \
+ for (j = 0; j < (len) - 1; j++) \
+ printk(KERN_CONT "%02hhx ", buf[j]); \
+ printk(KERN_CONT "%02hhx\n", buf[j]); \
+ } \
+} while (0)
+
+/*
+ * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
+ * that correspond to that nop. Getting from one nop to the next, we
+ * add to the array the offset that is equal to the sum of all sizes of
+ * nops preceding the one we are after.
+ *
+ * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
+ * nice symmetry of sizes of the previous nops.
+ */
+#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
+static const unsigned char intelnops[] =
+{
+ GENERIC_NOP1,
+ GENERIC_NOP2,
+ GENERIC_NOP3,
+ GENERIC_NOP4,
+ GENERIC_NOP5,
+ GENERIC_NOP6,
+ GENERIC_NOP7,
+ GENERIC_NOP8,
+ GENERIC_NOP5_ATOMIC
+};
+static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
+{
+ NULL,
+ intelnops,
+ intelnops + 1,
+ intelnops + 1 + 2,
+ intelnops + 1 + 2 + 3,
+ intelnops + 1 + 2 + 3 + 4,
+ intelnops + 1 + 2 + 3 + 4 + 5,
+ intelnops + 1 + 2 + 3 + 4 + 5 + 6,
+ intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+};
+#endif
+
+#ifdef K8_NOP1
+static const unsigned char k8nops[] =
+{
+ K8_NOP1,
+ K8_NOP2,
+ K8_NOP3,
+ K8_NOP4,
+ K8_NOP5,
+ K8_NOP6,
+ K8_NOP7,
+ K8_NOP8,
+ K8_NOP5_ATOMIC
+};
+static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
+{
+ NULL,
+ k8nops,
+ k8nops + 1,
+ k8nops + 1 + 2,
+ k8nops + 1 + 2 + 3,
+ k8nops + 1 + 2 + 3 + 4,
+ k8nops + 1 + 2 + 3 + 4 + 5,
+ k8nops + 1 + 2 + 3 + 4 + 5 + 6,
+ k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+};
+#endif
+
+#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
+static const unsigned char k7nops[] =
+{
+ K7_NOP1,
+ K7_NOP2,
+ K7_NOP3,
+ K7_NOP4,
+ K7_NOP5,
+ K7_NOP6,
+ K7_NOP7,
+ K7_NOP8,
+ K7_NOP5_ATOMIC
+};
+static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
+{
+ NULL,
+ k7nops,
+ k7nops + 1,
+ k7nops + 1 + 2,
+ k7nops + 1 + 2 + 3,
+ k7nops + 1 + 2 + 3 + 4,
+ k7nops + 1 + 2 + 3 + 4 + 5,
+ k7nops + 1 + 2 + 3 + 4 + 5 + 6,
+ k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+};
+#endif
+
+#ifdef P6_NOP1
+static const unsigned char p6nops[] =
+{
+ P6_NOP1,
+ P6_NOP2,
+ P6_NOP3,
+ P6_NOP4,
+ P6_NOP5,
+ P6_NOP6,
+ P6_NOP7,
+ P6_NOP8,
+ P6_NOP5_ATOMIC
+};
+static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
+{
+ NULL,
+ p6nops,
+ p6nops + 1,
+ p6nops + 1 + 2,
+ p6nops + 1 + 2 + 3,
+ p6nops + 1 + 2 + 3 + 4,
+ p6nops + 1 + 2 + 3 + 4 + 5,
+ p6nops + 1 + 2 + 3 + 4 + 5 + 6,
+ p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+};
+#endif
+
+/* Initialize these to a safe default */
+#ifdef CONFIG_X86_64
+const unsigned char * const *ideal_nops = p6_nops;
+#else
+const unsigned char * const *ideal_nops = intel_nops;
+#endif
+
+void __init arch_init_ideal_nops(void)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ /*
+ * Due to a decoder implementation quirk, some
+ * specific Intel CPUs actually perform better with
+ * the "k8_nops" than with the SDM-recommended NOPs.
+ */
+ if (boot_cpu_data.x86 == 6 &&
+ boot_cpu_data.x86_model >= 0x0f &&
+ boot_cpu_data.x86_model != 0x1c &&
+ boot_cpu_data.x86_model != 0x26 &&
+ boot_cpu_data.x86_model != 0x27 &&
+ boot_cpu_data.x86_model < 0x30) {
+ ideal_nops = k8_nops;
+ } else if (boot_cpu_has(X86_FEATURE_NOPL)) {
+ ideal_nops = p6_nops;
+ } else {
+#ifdef CONFIG_X86_64
+ ideal_nops = k8_nops;
+#else
+ ideal_nops = intel_nops;
+#endif
+ }
+ break;
+
+ case X86_VENDOR_AMD:
+ if (boot_cpu_data.x86 > 0xf) {
+ ideal_nops = p6_nops;
+ return;
+ }
+
+ /* fall through */
+
+ default:
+#ifdef CONFIG_X86_64
+ ideal_nops = k8_nops;
+#else
+ if (boot_cpu_has(X86_FEATURE_K8))
+ ideal_nops = k8_nops;
+ else if (boot_cpu_has(X86_FEATURE_K7))
+ ideal_nops = k7_nops;
+ else
+ ideal_nops = intel_nops;
+#endif
+ }
+}
+
+/* Use this to add nops to a buffer, then text_poke the whole buffer. */
+static void __init_or_module add_nops(void *insns, unsigned int len)
+{
+ while (len > 0) {
+ unsigned int noplen = len;
+ if (noplen > ASM_NOP_MAX)
+ noplen = ASM_NOP_MAX;
+ memcpy(insns, ideal_nops[noplen], noplen);
+ insns += noplen;
+ len -= noplen;
+ }
+}
+
+extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
+extern s32 __smp_locks[], __smp_locks_end[];
+void *text_poke_early(void *addr, const void *opcode, size_t len);
+
+/*
+ * Are we looking at a near JMP with a 1 or 4-byte displacement.
+ */
+static inline bool is_jmp(const u8 opcode)
+{
+ return opcode == 0xeb || opcode == 0xe9;
+}
+
+static void __init_or_module
+recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
+{
+ u8 *next_rip, *tgt_rip;
+ s32 n_dspl, o_dspl;
+ int repl_len;
+
+ if (a->replacementlen != 5)
+ return;
+
+ o_dspl = *(s32 *)(insnbuf + 1);
+
+ /* next_rip of the replacement JMP */
+ next_rip = repl_insn + a->replacementlen;
+ /* target rip of the replacement JMP */
+ tgt_rip = next_rip + o_dspl;
+ n_dspl = tgt_rip - orig_insn;
+
+ DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
+
+ if (tgt_rip - orig_insn >= 0) {
+ if (n_dspl - 2 <= 127)
+ goto two_byte_jmp;
+ else
+ goto five_byte_jmp;
+ /* negative offset */
+ } else {
+ if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
+ goto two_byte_jmp;
+ else
+ goto five_byte_jmp;
+ }
+
+two_byte_jmp:
+ n_dspl -= 2;
+
+ insnbuf[0] = 0xeb;
+ insnbuf[1] = (s8)n_dspl;
+ add_nops(insnbuf + 2, 3);
+
+ repl_len = 2;
+ goto done;
+
+five_byte_jmp:
+ n_dspl -= 5;
+
+ insnbuf[0] = 0xe9;
+ *(s32 *)&insnbuf[1] = n_dspl;
+
+ repl_len = 5;
+
+done:
+
+ DPRINTK("final displ: 0x%08x, JMP 0x%lx",
+ n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
+}
+
+/*
+ * "noinline" to cause control flow change and thus invalidate I$ and
+ * cause refetch after modification.
+ */
+static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
+{
+ unsigned long flags;
+ int i;
+
+ for (i = 0; i < a->padlen; i++) {
+ if (instr[i] != 0x90)
+ return;
+ }
+
+ local_irq_save(flags);
+ add_nops(instr + (a->instrlen - a->padlen), a->padlen);
+ local_irq_restore(flags);
+
+ DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
+ instr, a->instrlen - a->padlen, a->padlen);
+}
+
+/*
+ * Replace instructions with better alternatives for this CPU type. This runs
+ * before SMP is initialized to avoid SMP problems with self modifying code.
+ * This implies that asymmetric systems where APs have less capabilities than
+ * the boot processor are not handled. Tough. Make sure you disable such
+ * features by hand.
+ *
+ * Marked "noinline" to cause control flow change and thus insn cache
+ * to refetch changed I$ lines.
+ */
+void __init_or_module noinline apply_alternatives(struct alt_instr *start,
+ struct alt_instr *end)
+{
+ struct alt_instr *a;
+ u8 *instr, *replacement;
+ u8 insnbuf[MAX_PATCH_LEN];
+
+ DPRINTK("alt table %px, -> %px", start, end);
+ /*
+ * The scan order should be from start to end. A later scanned
+ * alternative code can overwrite previously scanned alternative code.
+ * Some kernel functions (e.g. memcpy, memset, etc) use this order to
+ * patch code.
+ *
+ * So be careful if you want to change the scan order to any other
+ * order.
+ */
+ for (a = start; a < end; a++) {
+ int insnbuf_sz = 0;
+
+ instr = (u8 *)&a->instr_offset + a->instr_offset;
+ replacement = (u8 *)&a->repl_offset + a->repl_offset;
+ BUG_ON(a->instrlen > sizeof(insnbuf));
+ BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
+ if (!boot_cpu_has(a->cpuid)) {
+ if (a->padlen > 1)
+ optimize_nops(a, instr);
+
+ continue;
+ }
+
+ DPRINTK("feat: %d*32+%d, old: (%px len: %d), repl: (%px, len: %d), pad: %d",
+ a->cpuid >> 5,
+ a->cpuid & 0x1f,
+ instr, a->instrlen,
+ replacement, a->replacementlen, a->padlen);
+
+ DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
+ DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
+
+ memcpy(insnbuf, replacement, a->replacementlen);
+ insnbuf_sz = a->replacementlen;
+
+ /*
+ * 0xe8 is a relative jump; fix the offset.
+ *
+ * Instruction length is checked before the opcode to avoid
+ * accessing uninitialized bytes for zero-length replacements.
+ */
+ if (a->replacementlen == 5 && *insnbuf == 0xe8) {
+ *(s32 *)(insnbuf + 1) += replacement - instr;
+ DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
+ *(s32 *)(insnbuf + 1),
+ (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
+ }
+
+ if (a->replacementlen && is_jmp(replacement[0]))
+ recompute_jump(a, instr, replacement, insnbuf);
+
+ if (a->instrlen > a->replacementlen) {
+ add_nops(insnbuf + a->replacementlen,
+ a->instrlen - a->replacementlen);
+ insnbuf_sz += a->instrlen - a->replacementlen;
+ }
+ DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr);
+
+ text_poke_early(instr, insnbuf, insnbuf_sz);
+ }
+}
+
+#ifdef CONFIG_SMP
+static void alternatives_smp_lock(const s32 *start, const s32 *end,
+ u8 *text, u8 *text_end)
+{
+ const s32 *poff;
+
+ for (poff = start; poff < end; poff++) {
+ u8 *ptr = (u8 *)poff + *poff;
+
+ if (!*poff || ptr < text || ptr >= text_end)
+ continue;
+ /* turn DS segment override prefix into lock prefix */
+ if (*ptr == 0x3e)
+ text_poke(ptr, ((unsigned char []){0xf0}), 1);
+ }
+}
+
+static void alternatives_smp_unlock(const s32 *start, const s32 *end,
+ u8 *text, u8 *text_end)
+{
+ const s32 *poff;
+
+ for (poff = start; poff < end; poff++) {
+ u8 *ptr = (u8 *)poff + *poff;
+
+ if (!*poff || ptr < text || ptr >= text_end)
+ continue;
+ /* turn lock prefix into DS segment override prefix */
+ if (*ptr == 0xf0)
+ text_poke(ptr, ((unsigned char []){0x3E}), 1);
+ }
+}
+
+struct smp_alt_module {
+ /* what is this ??? */
+ struct module *mod;
+ char *name;
+
+ /* ptrs to lock prefixes */
+ const s32 *locks;
+ const s32 *locks_end;
+
+ /* .text segment, needed to avoid patching init code ;) */
+ u8 *text;
+ u8 *text_end;
+
+ struct list_head next;
+};
+static LIST_HEAD(smp_alt_modules);
+static bool uniproc_patched = false; /* protected by text_mutex */
+
+void __init_or_module alternatives_smp_module_add(struct module *mod,
+ char *name,
+ void *locks, void *locks_end,
+ void *text, void *text_end)
+{
+ struct smp_alt_module *smp;
+
+ mutex_lock(&text_mutex);
+ if (!uniproc_patched)
+ goto unlock;
+
+ if (num_possible_cpus() == 1)
+ /* Don't bother remembering, we'll never have to undo it. */
+ goto smp_unlock;
+
+ smp = kzalloc(sizeof(*smp), GFP_KERNEL);
+ if (NULL == smp)
+ /* we'll run the (safe but slow) SMP code then ... */
+ goto unlock;
+
+ smp->mod = mod;
+ smp->name = name;
+ smp->locks = locks;
+ smp->locks_end = locks_end;
+ smp->text = text;
+ smp->text_end = text_end;
+ DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
+ smp->locks, smp->locks_end,
+ smp->text, smp->text_end, smp->name);
+
+ list_add_tail(&smp->next, &smp_alt_modules);
+smp_unlock:
+ alternatives_smp_unlock(locks, locks_end, text, text_end);
+unlock:
+ mutex_unlock(&text_mutex);
+}
+
+void __init_or_module alternatives_smp_module_del(struct module *mod)
+{
+ struct smp_alt_module *item;
+
+ mutex_lock(&text_mutex);
+ list_for_each_entry(item, &smp_alt_modules, next) {
+ if (mod != item->mod)
+ continue;
+ list_del(&item->next);
+ kfree(item);
+ break;
+ }
+ mutex_unlock(&text_mutex);
+}
+
+void alternatives_enable_smp(void)
+{
+ struct smp_alt_module *mod;
+
+ /* Why bother if there are no other CPUs? */
+ BUG_ON(num_possible_cpus() == 1);
+
+ mutex_lock(&text_mutex);
+
+ if (uniproc_patched) {
+ pr_info("switching to SMP code\n");
+ BUG_ON(num_online_cpus() != 1);
+ clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
+ clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
+ list_for_each_entry(mod, &smp_alt_modules, next)
+ alternatives_smp_lock(mod->locks, mod->locks_end,
+ mod->text, mod->text_end);
+ uniproc_patched = false;
+ }
+ mutex_unlock(&text_mutex);
+}
+
+/*
+ * Return 1 if the address range is reserved for SMP-alternatives.
+ * Must hold text_mutex.
+ */
+int alternatives_text_reserved(void *start, void *end)
+{
+ struct smp_alt_module *mod;
+ const s32 *poff;
+ u8 *text_start = start;
+ u8 *text_end = end;
+
+ lockdep_assert_held(&text_mutex);
+
+ list_for_each_entry(mod, &smp_alt_modules, next) {
+ if (mod->text > text_end || mod->text_end < text_start)
+ continue;
+ for (poff = mod->locks; poff < mod->locks_end; poff++) {
+ const u8 *ptr = (const u8 *)poff + *poff;
+
+ if (text_start <= ptr && text_end > ptr)
+ return 1;
+ }
+ }
+
+ return 0;
+}
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_PARAVIRT
+void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
+ struct paravirt_patch_site *end)
+{
+ struct paravirt_patch_site *p;
+ char insnbuf[MAX_PATCH_LEN];
+
+ for (p = start; p < end; p++) {
+ unsigned int used;
+
+ BUG_ON(p->len > MAX_PATCH_LEN);
+ /* prep the buffer with the original instructions */
+ memcpy(insnbuf, p->instr, p->len);
+ used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
+ (unsigned long)p->instr, p->len);
+
+ BUG_ON(used > p->len);
+
+ /* Pad the rest with nops */
+ add_nops(insnbuf + used, p->len - used);
+ text_poke_early(p->instr, insnbuf, p->len);
+ }
+}
+extern struct paravirt_patch_site __start_parainstructions[],
+ __stop_parainstructions[];
+#endif /* CONFIG_PARAVIRT */
+
+void __init alternative_instructions(void)
+{
+ /* The patching is not fully atomic, so try to avoid local interruptions
+ that might execute the to be patched code.
+ Other CPUs are not running. */
+ stop_nmi();
+
+ /*
+ * Don't stop machine check exceptions while patching.
+ * MCEs only happen when something got corrupted and in this
+ * case we must do something about the corruption.
+ * Ignoring it is worse than a unlikely patching race.
+ * Also machine checks tend to be broadcast and if one CPU
+ * goes into machine check the others follow quickly, so we don't
+ * expect a machine check to cause undue problems during to code
+ * patching.
+ */
+
+ apply_alternatives(__alt_instructions, __alt_instructions_end);
+
+#ifdef CONFIG_SMP
+ /* Patch to UP if other cpus not imminent. */
+ if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
+ uniproc_patched = true;
+ alternatives_smp_module_add(NULL, "core kernel",
+ __smp_locks, __smp_locks_end,
+ _text, _etext);
+ }
+
+ if (!uniproc_patched || num_possible_cpus() == 1)
+ free_init_pages("SMP alternatives",
+ (unsigned long)__smp_locks,
+ (unsigned long)__smp_locks_end);
+#endif
+
+ apply_paravirt(__parainstructions, __parainstructions_end);
+
+ restart_nmi();
+ alternatives_patched = 1;
+}
+
+/**
+ * text_poke_early - Update instructions on a live kernel at boot time
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * When you use this code to patch more than one byte of an instruction
+ * you need to make sure that other CPUs cannot execute this code in parallel.
+ * Also no thread must be currently preempted in the middle of these
+ * instructions. And on the local CPU you need to be protected again NMI or MCE
+ * handlers seeing an inconsistent instruction while you patch.
+ */
+void *__init_or_module text_poke_early(void *addr, const void *opcode,
+ size_t len)
+{
+ unsigned long flags;
+
+ if (boot_cpu_has(X86_FEATURE_NX) &&
+ is_module_text_address((unsigned long)addr)) {
+ /*
+ * Modules text is marked initially as non-executable, so the
+ * code cannot be running and speculative code-fetches are
+ * prevented. Just change the code.
+ */
+ memcpy(addr, opcode, len);
+ } else {
+ local_irq_save(flags);
+ memcpy(addr, opcode, len);
+ local_irq_restore(flags);
+ sync_core();
+
+ /*
+ * Could also do a CLFLUSH here to speed up CPU recovery; but
+ * that causes hangs on some VIA CPUs.
+ */
+ }
+ return addr;
+}
+
+/**
+ * text_poke - Update instructions on a live kernel
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Only atomic text poke/set should be allowed when not doing early patching.
+ * It means the size must be writable atomically and the address must be aligned
+ * in a way that permits an atomic write. It also makes sure we fit on a single
+ * page.
+ */
+void *text_poke(void *addr, const void *opcode, size_t len)
+{
+ unsigned long flags;
+ char *vaddr;
+ struct page *pages[2];
+ int i;
+
+ /*
+ * While boot memory allocator is runnig we cannot use struct
+ * pages as they are not yet initialized.
+ */
+ BUG_ON(!after_bootmem);
+
+ lockdep_assert_held(&text_mutex);
+
+ if (!core_kernel_text((unsigned long)addr)) {
+ pages[0] = vmalloc_to_page(addr);
+ pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
+ } else {
+ pages[0] = virt_to_page(addr);
+ WARN_ON(!PageReserved(pages[0]));
+ pages[1] = virt_to_page(addr + PAGE_SIZE);
+ }
+ BUG_ON(!pages[0]);
+ local_irq_save(flags);
+ set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
+ if (pages[1])
+ set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
+ vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
+ memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
+ clear_fixmap(FIX_TEXT_POKE0);
+ if (pages[1])
+ clear_fixmap(FIX_TEXT_POKE1);
+ local_flush_tlb();
+ sync_core();
+ /* Could also do a CLFLUSH here to speed up CPU recovery; but
+ that causes hangs on some VIA CPUs. */
+ for (i = 0; i < len; i++)
+ BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
+ local_irq_restore(flags);
+ return addr;
+}
+
+static void do_sync_core(void *info)
+{
+ sync_core();
+}
+
+static bool bp_patching_in_progress;
+static void *bp_int3_handler, *bp_int3_addr;
+
+int poke_int3_handler(struct pt_regs *regs)
+{
+ /*
+ * Having observed our INT3 instruction, we now must observe
+ * bp_patching_in_progress.
+ *
+ * in_progress = TRUE INT3
+ * WMB RMB
+ * write INT3 if (in_progress)
+ *
+ * Idem for bp_int3_handler.
+ */
+ smp_rmb();
+
+ if (likely(!bp_patching_in_progress))
+ return 0;
+
+ if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
+ return 0;
+
+ /* set up the specified breakpoint handler */
+ regs->ip = (unsigned long) bp_int3_handler;
+
+ return 1;
+
+}
+
+/**
+ * text_poke_bp() -- update instructions on live kernel on SMP
+ * @addr: address to patch
+ * @opcode: opcode of new instruction
+ * @len: length to copy
+ * @handler: address to jump to when the temporary breakpoint is hit
+ *
+ * Modify multi-byte instruction by using int3 breakpoint on SMP.
+ * We completely avoid stop_machine() here, and achieve the
+ * synchronization using int3 breakpoint.
+ *
+ * The way it is done:
+ * - add a int3 trap to the address that will be patched
+ * - sync cores
+ * - update all but the first byte of the patched range
+ * - sync cores
+ * - replace the first byte (int3) by the first byte of
+ * replacing opcode
+ * - sync cores
+ */
+void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
+{
+ unsigned char int3 = 0xcc;
+
+ bp_int3_handler = handler;
+ bp_int3_addr = (u8 *)addr + sizeof(int3);
+ bp_patching_in_progress = true;
+
+ lockdep_assert_held(&text_mutex);
+
+ /*
+ * Corresponding read barrier in int3 notifier for making sure the
+ * in_progress and handler are correctly ordered wrt. patching.
+ */
+ smp_wmb();
+
+ text_poke(addr, &int3, sizeof(int3));
+
+ on_each_cpu(do_sync_core, NULL, 1);
+
+ if (len - sizeof(int3) > 0) {
+ /* patch all but the first byte */
+ text_poke((char *)addr + sizeof(int3),
+ (const char *) opcode + sizeof(int3),
+ len - sizeof(int3));
+ /*
+ * According to Intel, this core syncing is very likely
+ * not necessary and we'd be safe even without it. But
+ * better safe than sorry (plus there's not only Intel).
+ */
+ on_each_cpu(do_sync_core, NULL, 1);
+ }
+
+ /* patch the first byte */
+ text_poke(addr, opcode, sizeof(int3));
+
+ on_each_cpu(do_sync_core, NULL, 1);
+ /*
+ * sync_core() implies an smp_mb() and orders this store against
+ * the writing of the new instruction.
+ */
+ bp_patching_in_progress = false;
+
+ return addr;
+}
+
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
new file mode 100644
index 000000000..f299d8a47
--- /dev/null
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -0,0 +1,891 @@
+/*
+ * Dynamic DMA mapping support for AMD Hammer.
+ *
+ * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
+ * This allows to use PCI devices that only support 32bit addresses on systems
+ * with more than 4GB.
+ *
+ * See Documentation/DMA-API-HOWTO.txt for the interface specification.
+ *
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU General Public License v2 only.
+ */
+
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/agp_backend.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/topology.h>
+#include <linux/interrupt.h>
+#include <linux/bitmap.h>
+#include <linux/kdebug.h>
+#include <linux/scatterlist.h>
+#include <linux/iommu-helper.h>
+#include <linux/syscore_ops.h>
+#include <linux/io.h>
+#include <linux/gfp.h>
+#include <linux/atomic.h>
+#include <linux/dma-direct.h>
+#include <asm/mtrr.h>
+#include <asm/pgtable.h>
+#include <asm/proto.h>
+#include <asm/iommu.h>
+#include <asm/gart.h>
+#include <asm/set_memory.h>
+#include <asm/swiotlb.h>
+#include <asm/dma.h>
+#include <asm/amd_nb.h>
+#include <asm/x86_init.h>
+#include <asm/iommu_table.h>
+
+static unsigned long iommu_bus_base; /* GART remapping area (physical) */
+static unsigned long iommu_size; /* size of remapping area bytes */
+static unsigned long iommu_pages; /* .. and in pages */
+
+static u32 *iommu_gatt_base; /* Remapping table */
+
+static dma_addr_t bad_dma_addr;
+
+/*
+ * If this is disabled the IOMMU will use an optimized flushing strategy
+ * of only flushing when an mapping is reused. With it true the GART is
+ * flushed for every mapping. Problem is that doing the lazy flush seems
+ * to trigger bugs with some popular PCI cards, in particular 3ware (but
+ * has been also also seen with Qlogic at least).
+ */
+static int iommu_fullflush = 1;
+
+/* Allocation bitmap for the remapping area: */
+static DEFINE_SPINLOCK(iommu_bitmap_lock);
+/* Guarded by iommu_bitmap_lock: */
+static unsigned long *iommu_gart_bitmap;
+
+static u32 gart_unmapped_entry;
+
+#define GPTE_VALID 1
+#define GPTE_COHERENT 2
+#define GPTE_ENCODE(x) \
+ (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
+#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
+
+#define EMERGENCY_PAGES 32 /* = 128KB */
+
+#ifdef CONFIG_AGP
+#define AGPEXTERN extern
+#else
+#define AGPEXTERN
+#endif
+
+/* GART can only remap to physical addresses < 1TB */
+#define GART_MAX_PHYS_ADDR (1ULL << 40)
+
+/* backdoor interface to AGP driver */
+AGPEXTERN int agp_memory_reserved;
+AGPEXTERN __u32 *agp_gatt_table;
+
+static unsigned long next_bit; /* protected by iommu_bitmap_lock */
+static bool need_flush; /* global flush state. set for each gart wrap */
+
+static unsigned long alloc_iommu(struct device *dev, int size,
+ unsigned long align_mask)
+{
+ unsigned long offset, flags;
+ unsigned long boundary_size;
+ unsigned long base_index;
+
+ base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
+ PAGE_SIZE) >> PAGE_SHIFT;
+ boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1,
+ PAGE_SIZE) >> PAGE_SHIFT;
+
+ spin_lock_irqsave(&iommu_bitmap_lock, flags);
+ offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
+ size, base_index, boundary_size, align_mask);
+ if (offset == -1) {
+ need_flush = true;
+ offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
+ size, base_index, boundary_size,
+ align_mask);
+ }
+ if (offset != -1) {
+ next_bit = offset+size;
+ if (next_bit >= iommu_pages) {
+ next_bit = 0;
+ need_flush = true;
+ }
+ }
+ if (iommu_fullflush)
+ need_flush = true;
+ spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
+
+ return offset;
+}
+
+static void free_iommu(unsigned long offset, int size)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu_bitmap_lock, flags);
+ bitmap_clear(iommu_gart_bitmap, offset, size);
+ if (offset >= next_bit)
+ next_bit = offset + size;
+ spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
+}
+
+/*
+ * Use global flush state to avoid races with multiple flushers.
+ */
+static void flush_gart(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu_bitmap_lock, flags);
+ if (need_flush) {
+ amd_flush_garts();
+ need_flush = false;
+ }
+ spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
+}
+
+#ifdef CONFIG_IOMMU_LEAK
+/* Debugging aid for drivers that don't free their IOMMU tables */
+static int leak_trace;
+static int iommu_leak_pages = 20;
+
+static void dump_leak(void)
+{
+ static int dump;
+
+ if (dump)
+ return;
+ dump = 1;
+
+ show_stack(NULL, NULL);
+ debug_dma_dump_mappings(NULL);
+}
+#endif
+
+static void iommu_full(struct device *dev, size_t size, int dir)
+{
+ /*
+ * Ran out of IOMMU space for this operation. This is very bad.
+ * Unfortunately the drivers cannot handle this operation properly.
+ * Return some non mapped prereserved space in the aperture and
+ * let the Northbridge deal with it. This will result in garbage
+ * in the IO operation. When the size exceeds the prereserved space
+ * memory corruption will occur or random memory will be DMAed
+ * out. Hopefully no network devices use single mappings that big.
+ */
+
+ dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size);
+
+ if (size > PAGE_SIZE*EMERGENCY_PAGES) {
+ if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
+ panic("PCI-DMA: Memory would be corrupted\n");
+ if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
+ panic(KERN_ERR
+ "PCI-DMA: Random memory would be DMAed\n");
+ }
+#ifdef CONFIG_IOMMU_LEAK
+ dump_leak();
+#endif
+}
+
+static inline int
+need_iommu(struct device *dev, unsigned long addr, size_t size)
+{
+ return force_iommu || !dma_capable(dev, addr, size);
+}
+
+static inline int
+nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
+{
+ return !dma_capable(dev, addr, size);
+}
+
+/* Map a single continuous physical area into the IOMMU.
+ * Caller needs to check if the iommu is needed and flush.
+ */
+static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
+ size_t size, int dir, unsigned long align_mask)
+{
+ unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
+ unsigned long iommu_page;
+ int i;
+
+ if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR))
+ return bad_dma_addr;
+
+ iommu_page = alloc_iommu(dev, npages, align_mask);
+ if (iommu_page == -1) {
+ if (!nonforced_iommu(dev, phys_mem, size))
+ return phys_mem;
+ if (panic_on_overflow)
+ panic("dma_map_area overflow %lu bytes\n", size);
+ iommu_full(dev, size, dir);
+ return bad_dma_addr;
+ }
+
+ for (i = 0; i < npages; i++) {
+ iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
+ phys_mem += PAGE_SIZE;
+ }
+ return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
+}
+
+/* Map a single area into the IOMMU */
+static dma_addr_t gart_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ unsigned long bus;
+ phys_addr_t paddr = page_to_phys(page) + offset;
+
+ if (!dev)
+ dev = &x86_dma_fallback_dev;
+
+ if (!need_iommu(dev, paddr, size))
+ return paddr;
+
+ bus = dma_map_area(dev, paddr, size, dir, 0);
+ flush_gart();
+
+ return bus;
+}
+
+/*
+ * Free a DMA mapping.
+ */
+static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ unsigned long iommu_page;
+ int npages;
+ int i;
+
+ if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
+ dma_addr >= iommu_bus_base + iommu_size)
+ return;
+
+ iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
+ npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
+ for (i = 0; i < npages; i++) {
+ iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
+ }
+ free_iommu(iommu_page, npages);
+}
+
+/*
+ * Wrapper for pci_unmap_single working with scatterlists.
+ */
+static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ struct scatterlist *s;
+ int i;
+
+ for_each_sg(sg, s, nents, i) {
+ if (!s->dma_length || !s->length)
+ break;
+ gart_unmap_page(dev, s->dma_address, s->dma_length, dir, 0);
+ }
+}
+
+/* Fallback for dma_map_sg in case of overflow */
+static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
+ int nents, int dir)
+{
+ struct scatterlist *s;
+ int i;
+
+#ifdef CONFIG_IOMMU_DEBUG
+ pr_debug("dma_map_sg overflow\n");
+#endif
+
+ for_each_sg(sg, s, nents, i) {
+ unsigned long addr = sg_phys(s);
+
+ if (nonforced_iommu(dev, addr, s->length)) {
+ addr = dma_map_area(dev, addr, s->length, dir, 0);
+ if (addr == bad_dma_addr) {
+ if (i > 0)
+ gart_unmap_sg(dev, sg, i, dir, 0);
+ nents = 0;
+ sg[0].dma_length = 0;
+ break;
+ }
+ }
+ s->dma_address = addr;
+ s->dma_length = s->length;
+ }
+ flush_gart();
+
+ return nents;
+}
+
+/* Map multiple scatterlist entries continuous into the first. */
+static int __dma_map_cont(struct device *dev, struct scatterlist *start,
+ int nelems, struct scatterlist *sout,
+ unsigned long pages)
+{
+ unsigned long iommu_start = alloc_iommu(dev, pages, 0);
+ unsigned long iommu_page = iommu_start;
+ struct scatterlist *s;
+ int i;
+
+ if (iommu_start == -1)
+ return -1;
+
+ for_each_sg(start, s, nelems, i) {
+ unsigned long pages, addr;
+ unsigned long phys_addr = s->dma_address;
+
+ BUG_ON(s != start && s->offset);
+ if (s == start) {
+ sout->dma_address = iommu_bus_base;
+ sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
+ sout->dma_length = s->length;
+ } else {
+ sout->dma_length += s->length;
+ }
+
+ addr = phys_addr;
+ pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
+ while (pages--) {
+ iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
+ addr += PAGE_SIZE;
+ iommu_page++;
+ }
+ }
+ BUG_ON(iommu_page - iommu_start != pages);
+
+ return 0;
+}
+
+static inline int
+dma_map_cont(struct device *dev, struct scatterlist *start, int nelems,
+ struct scatterlist *sout, unsigned long pages, int need)
+{
+ if (!need) {
+ BUG_ON(nelems != 1);
+ sout->dma_address = start->dma_address;
+ sout->dma_length = start->length;
+ return 0;
+ }
+ return __dma_map_cont(dev, start, nelems, sout, pages);
+}
+
+/*
+ * DMA map all entries in a scatterlist.
+ * Merge chunks that have page aligned sizes into a continuous mapping.
+ */
+static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+ enum dma_data_direction dir, unsigned long attrs)
+{
+ struct scatterlist *s, *ps, *start_sg, *sgmap;
+ int need = 0, nextneed, i, out, start;
+ unsigned long pages = 0;
+ unsigned int seg_size;
+ unsigned int max_seg_size;
+
+ if (nents == 0)
+ return 0;
+
+ if (!dev)
+ dev = &x86_dma_fallback_dev;
+
+ out = 0;
+ start = 0;
+ start_sg = sg;
+ sgmap = sg;
+ seg_size = 0;
+ max_seg_size = dma_get_max_seg_size(dev);
+ ps = NULL; /* shut up gcc */
+
+ for_each_sg(sg, s, nents, i) {
+ dma_addr_t addr = sg_phys(s);
+
+ s->dma_address = addr;
+ BUG_ON(s->length == 0);
+
+ nextneed = need_iommu(dev, addr, s->length);
+
+ /* Handle the previous not yet processed entries */
+ if (i > start) {
+ /*
+ * Can only merge when the last chunk ends on a
+ * page boundary and the new one doesn't have an
+ * offset.
+ */
+ if (!iommu_merge || !nextneed || !need || s->offset ||
+ (s->length + seg_size > max_seg_size) ||
+ (ps->offset + ps->length) % PAGE_SIZE) {
+ if (dma_map_cont(dev, start_sg, i - start,
+ sgmap, pages, need) < 0)
+ goto error;
+ out++;
+
+ seg_size = 0;
+ sgmap = sg_next(sgmap);
+ pages = 0;
+ start = i;
+ start_sg = s;
+ }
+ }
+
+ seg_size += s->length;
+ need = nextneed;
+ pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE);
+ ps = s;
+ }
+ if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
+ goto error;
+ out++;
+ flush_gart();
+ if (out < nents) {
+ sgmap = sg_next(sgmap);
+ sgmap->dma_length = 0;
+ }
+ return out;
+
+error:
+ flush_gart();
+ gart_unmap_sg(dev, sg, out, dir, 0);
+
+ /* When it was forced or merged try again in a dumb way */
+ if (force_iommu || iommu_merge) {
+ out = dma_map_sg_nonforce(dev, sg, nents, dir);
+ if (out > 0)
+ return out;
+ }
+ if (panic_on_overflow)
+ panic("dma_map_sg: overflow on %lu pages\n", pages);
+
+ iommu_full(dev, pages << PAGE_SHIFT, dir);
+ for_each_sg(sg, s, nents, i)
+ s->dma_address = bad_dma_addr;
+ return 0;
+}
+
+/* allocate and map a coherent mapping */
+static void *
+gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
+ gfp_t flag, unsigned long attrs)
+{
+ void *vaddr;
+
+ vaddr = dma_direct_alloc(dev, size, dma_addr, flag, attrs);
+ if (!vaddr ||
+ !force_iommu || dev->coherent_dma_mask <= DMA_BIT_MASK(24))
+ return vaddr;
+
+ *dma_addr = dma_map_area(dev, virt_to_phys(vaddr), size,
+ DMA_BIDIRECTIONAL, (1UL << get_order(size)) - 1);
+ flush_gart();
+ if (unlikely(*dma_addr == bad_dma_addr))
+ goto out_free;
+ return vaddr;
+out_free:
+ dma_direct_free(dev, size, vaddr, *dma_addr, attrs);
+ return NULL;
+}
+
+/* free a coherent mapping */
+static void
+gart_free_coherent(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_addr, unsigned long attrs)
+{
+ gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0);
+ dma_direct_free(dev, size, vaddr, dma_addr, attrs);
+}
+
+static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+ return (dma_addr == bad_dma_addr);
+}
+
+static int no_agp;
+
+static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
+{
+ unsigned long a;
+
+ if (!iommu_size) {
+ iommu_size = aper_size;
+ if (!no_agp)
+ iommu_size /= 2;
+ }
+
+ a = aper + iommu_size;
+ iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
+
+ if (iommu_size < 64*1024*1024) {
+ pr_warning(
+ "PCI-DMA: Warning: Small IOMMU %luMB."
+ " Consider increasing the AGP aperture in BIOS\n",
+ iommu_size >> 20);
+ }
+
+ return iommu_size;
+}
+
+static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
+{
+ unsigned aper_size = 0, aper_base_32, aper_order;
+ u64 aper_base;
+
+ pci_read_config_dword(dev, AMD64_GARTAPERTUREBASE, &aper_base_32);
+ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &aper_order);
+ aper_order = (aper_order >> 1) & 7;
+
+ aper_base = aper_base_32 & 0x7fff;
+ aper_base <<= 25;
+
+ aper_size = (32 * 1024 * 1024) << aper_order;
+ if (aper_base + aper_size > 0x100000000UL || !aper_size)
+ aper_base = 0;
+
+ *size = aper_size;
+ return aper_base;
+}
+
+static void enable_gart_translations(void)
+{
+ int i;
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return;
+
+ for (i = 0; i < amd_nb_num(); i++) {
+ struct pci_dev *dev = node_to_amd_nb(i)->misc;
+
+ enable_gart_translation(dev, __pa(agp_gatt_table));
+ }
+
+ /* Flush the GART-TLB to remove stale entries */
+ amd_flush_garts();
+}
+
+/*
+ * If fix_up_north_bridges is set, the north bridges have to be fixed up on
+ * resume in the same way as they are handled in gart_iommu_hole_init().
+ */
+static bool fix_up_north_bridges;
+static u32 aperture_order;
+static u32 aperture_alloc;
+
+void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
+{
+ fix_up_north_bridges = true;
+ aperture_order = aper_order;
+ aperture_alloc = aper_alloc;
+}
+
+static void gart_fixup_northbridges(void)
+{
+ int i;
+
+ if (!fix_up_north_bridges)
+ return;
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return;
+
+ pr_info("PCI-DMA: Restoring GART aperture settings\n");
+
+ for (i = 0; i < amd_nb_num(); i++) {
+ struct pci_dev *dev = node_to_amd_nb(i)->misc;
+
+ /*
+ * Don't enable translations just yet. That is the next
+ * step. Restore the pre-suspend aperture settings.
+ */
+ gart_set_size_and_enable(dev, aperture_order);
+ pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
+ }
+}
+
+static void gart_resume(void)
+{
+ pr_info("PCI-DMA: Resuming GART IOMMU\n");
+
+ gart_fixup_northbridges();
+
+ enable_gart_translations();
+}
+
+static struct syscore_ops gart_syscore_ops = {
+ .resume = gart_resume,
+
+};
+
+/*
+ * Private Northbridge GATT initialization in case we cannot use the
+ * AGP driver for some reason.
+ */
+static __init int init_amd_gatt(struct agp_kern_info *info)
+{
+ unsigned aper_size, gatt_size, new_aper_size;
+ unsigned aper_base, new_aper_base;
+ struct pci_dev *dev;
+ void *gatt;
+ int i;
+
+ pr_info("PCI-DMA: Disabling AGP.\n");
+
+ aper_size = aper_base = info->aper_size = 0;
+ dev = NULL;
+ for (i = 0; i < amd_nb_num(); i++) {
+ dev = node_to_amd_nb(i)->misc;
+ new_aper_base = read_aperture(dev, &new_aper_size);
+ if (!new_aper_base)
+ goto nommu;
+
+ if (!aper_base) {
+ aper_size = new_aper_size;
+ aper_base = new_aper_base;
+ }
+ if (aper_size != new_aper_size || aper_base != new_aper_base)
+ goto nommu;
+ }
+ if (!aper_base)
+ goto nommu;
+
+ info->aper_base = aper_base;
+ info->aper_size = aper_size >> 20;
+
+ gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
+ gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(gatt_size));
+ if (!gatt)
+ panic("Cannot allocate GATT table");
+ if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT))
+ panic("Could not set GART PTEs to uncacheable pages");
+
+ agp_gatt_table = gatt;
+
+ register_syscore_ops(&gart_syscore_ops);
+
+ flush_gart();
+
+ pr_info("PCI-DMA: aperture base @ %x size %u KB\n",
+ aper_base, aper_size>>10);
+
+ return 0;
+
+ nommu:
+ /* Should not happen anymore */
+ pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n"
+ "falling back to iommu=soft.\n");
+ return -1;
+}
+
+static const struct dma_map_ops gart_dma_ops = {
+ .map_sg = gart_map_sg,
+ .unmap_sg = gart_unmap_sg,
+ .map_page = gart_map_page,
+ .unmap_page = gart_unmap_page,
+ .alloc = gart_alloc_coherent,
+ .free = gart_free_coherent,
+ .mapping_error = gart_mapping_error,
+ .dma_supported = dma_direct_supported,
+};
+
+static void gart_iommu_shutdown(void)
+{
+ struct pci_dev *dev;
+ int i;
+
+ /* don't shutdown it if there is AGP installed */
+ if (!no_agp)
+ return;
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return;
+
+ for (i = 0; i < amd_nb_num(); i++) {
+ u32 ctl;
+
+ dev = node_to_amd_nb(i)->misc;
+ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
+
+ ctl &= ~GARTEN;
+
+ pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+ }
+}
+
+int __init gart_iommu_init(void)
+{
+ struct agp_kern_info info;
+ unsigned long iommu_start;
+ unsigned long aper_base, aper_size;
+ unsigned long start_pfn, end_pfn;
+ unsigned long scratch;
+ long i;
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return 0;
+
+#ifndef CONFIG_AGP_AMD64
+ no_agp = 1;
+#else
+ /* Makefile puts PCI initialization via subsys_initcall first. */
+ /* Add other AMD AGP bridge drivers here */
+ no_agp = no_agp ||
+ (agp_amd64_init() < 0) ||
+ (agp_copy_info(agp_bridge, &info) < 0);
+#endif
+
+ if (no_iommu ||
+ (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
+ !gart_iommu_aperture ||
+ (no_agp && init_amd_gatt(&info) < 0)) {
+ if (max_pfn > MAX_DMA32_PFN) {
+ pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
+ pr_warning("falling back to iommu=soft.\n");
+ }
+ return 0;
+ }
+
+ /* need to map that range */
+ aper_size = info.aper_size << 20;
+ aper_base = info.aper_base;
+ end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
+
+ start_pfn = PFN_DOWN(aper_base);
+ if (!pfn_range_is_mapped(start_pfn, end_pfn))
+ init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+
+ pr_info("PCI-DMA: using GART IOMMU.\n");
+ iommu_size = check_iommu_size(info.aper_base, aper_size);
+ iommu_pages = iommu_size >> PAGE_SHIFT;
+
+ iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(iommu_pages/8));
+ if (!iommu_gart_bitmap)
+ panic("Cannot allocate iommu bitmap\n");
+
+#ifdef CONFIG_IOMMU_LEAK
+ if (leak_trace) {
+ int ret;
+
+ ret = dma_debug_resize_entries(iommu_pages);
+ if (ret)
+ pr_debug("PCI-DMA: Cannot trace all the entries\n");
+ }
+#endif
+
+ /*
+ * Out of IOMMU space handling.
+ * Reserve some invalid pages at the beginning of the GART.
+ */
+ bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
+
+ pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
+ iommu_size >> 20);
+
+ agp_memory_reserved = iommu_size;
+ iommu_start = aper_size - iommu_size;
+ iommu_bus_base = info.aper_base + iommu_start;
+ bad_dma_addr = iommu_bus_base;
+ iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
+
+ /*
+ * Unmap the IOMMU part of the GART. The alias of the page is
+ * always mapped with cache enabled and there is no full cache
+ * coherency across the GART remapping. The unmapping avoids
+ * automatic prefetches from the CPU allocating cache lines in
+ * there. All CPU accesses are done via the direct mapping to
+ * the backing memory. The GART address is only used by PCI
+ * devices.
+ */
+ set_memory_np((unsigned long)__va(iommu_bus_base),
+ iommu_size >> PAGE_SHIFT);
+ /*
+ * Tricky. The GART table remaps the physical memory range,
+ * so the CPU wont notice potential aliases and if the memory
+ * is remapped to UC later on, we might surprise the PCI devices
+ * with a stray writeout of a cacheline. So play it sure and
+ * do an explicit, full-scale wbinvd() _after_ having marked all
+ * the pages as Not-Present:
+ */
+ wbinvd();
+
+ /*
+ * Now all caches are flushed and we can safely enable
+ * GART hardware. Doing it early leaves the possibility
+ * of stale cache entries that can lead to GART PTE
+ * errors.
+ */
+ enable_gart_translations();
+
+ /*
+ * Try to workaround a bug (thanks to BenH):
+ * Set unmapped entries to a scratch page instead of 0.
+ * Any prefetches that hit unmapped entries won't get an bus abort
+ * then. (P2P bridge may be prefetching on DMA reads).
+ */
+ scratch = get_zeroed_page(GFP_KERNEL);
+ if (!scratch)
+ panic("Cannot allocate iommu scratch page");
+ gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
+ for (i = EMERGENCY_PAGES; i < iommu_pages; i++)
+ iommu_gatt_base[i] = gart_unmapped_entry;
+
+ flush_gart();
+ dma_ops = &gart_dma_ops;
+ x86_platform.iommu_shutdown = gart_iommu_shutdown;
+ swiotlb = 0;
+
+ return 0;
+}
+
+void __init gart_parse_options(char *p)
+{
+ int arg;
+
+#ifdef CONFIG_IOMMU_LEAK
+ if (!strncmp(p, "leak", 4)) {
+ leak_trace = 1;
+ p += 4;
+ if (*p == '=')
+ ++p;
+ if (isdigit(*p) && get_option(&p, &arg))
+ iommu_leak_pages = arg;
+ }
+#endif
+ if (isdigit(*p) && get_option(&p, &arg))
+ iommu_size = arg;
+ if (!strncmp(p, "fullflush", 9))
+ iommu_fullflush = 1;
+ if (!strncmp(p, "nofullflush", 11))
+ iommu_fullflush = 0;
+ if (!strncmp(p, "noagp", 5))
+ no_agp = 1;
+ if (!strncmp(p, "noaperture", 10))
+ fix_aperture = 0;
+ /* duplicated from pci-dma.c */
+ if (!strncmp(p, "force", 5))
+ gart_iommu_aperture_allowed = 1;
+ if (!strncmp(p, "allowed", 7))
+ gart_iommu_aperture_allowed = 1;
+ if (!strncmp(p, "memaper", 7)) {
+ fallback_aper_force = 1;
+ p += 7;
+ if (*p == '=') {
+ ++p;
+ if (get_option(&p, &arg))
+ fallback_aper_order = arg;
+ }
+ }
+}
+IOMMU_INIT_POST(gart_iommu_hole_init);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
new file mode 100644
index 000000000..923b4bac9
--- /dev/null
+++ b/arch/x86/kernel/amd_nb.c
@@ -0,0 +1,472 @@
+/*
+ * Shared support code for AMD K8 northbridges and derivates.
+ * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/spinlock.h>
+#include <linux/pci_ids.h>
+#include <asm/amd_nb.h>
+
+#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
+#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0
+#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480
+#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
+#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
+#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494
+#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444
+#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654
+
+/* Protect the PCI config register pairs used for SMN and DF indirect access. */
+static DEFINE_MUTEX(smn_mutex);
+
+static u32 *flush_words;
+
+static const struct pci_device_id amd_root_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) },
+ {}
+};
+
+
+#define PCI_DEVICE_ID_AMD_CNB17H_F4 0x1704
+
+const struct pci_device_id amd_nb_misc_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) },
+ {}
+};
+EXPORT_SYMBOL_GPL(amd_nb_misc_ids);
+
+static const struct pci_device_id amd_nb_link_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
+ {}
+};
+
+const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
+ { 0x00, 0x18, 0x20 },
+ { 0xff, 0x00, 0x20 },
+ { 0xfe, 0x00, 0x20 },
+ { }
+};
+
+static struct amd_northbridge_info amd_northbridges;
+
+u16 amd_nb_num(void)
+{
+ return amd_northbridges.num;
+}
+EXPORT_SYMBOL_GPL(amd_nb_num);
+
+bool amd_nb_has_feature(unsigned int feature)
+{
+ return ((amd_northbridges.flags & feature) == feature);
+}
+EXPORT_SYMBOL_GPL(amd_nb_has_feature);
+
+struct amd_northbridge *node_to_amd_nb(int node)
+{
+ return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
+}
+EXPORT_SYMBOL_GPL(node_to_amd_nb);
+
+static struct pci_dev *next_northbridge(struct pci_dev *dev,
+ const struct pci_device_id *ids)
+{
+ do {
+ dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
+ if (!dev)
+ break;
+ } while (!pci_match_id(ids, dev));
+ return dev;
+}
+
+static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write)
+{
+ struct pci_dev *root;
+ int err = -ENODEV;
+
+ if (node >= amd_northbridges.num)
+ goto out;
+
+ root = node_to_amd_nb(node)->root;
+ if (!root)
+ goto out;
+
+ mutex_lock(&smn_mutex);
+
+ err = pci_write_config_dword(root, 0x60, address);
+ if (err) {
+ pr_warn("Error programming SMN address 0x%x.\n", address);
+ goto out_unlock;
+ }
+
+ err = (write ? pci_write_config_dword(root, 0x64, *value)
+ : pci_read_config_dword(root, 0x64, value));
+ if (err)
+ pr_warn("Error %s SMN address 0x%x.\n",
+ (write ? "writing to" : "reading from"), address);
+
+out_unlock:
+ mutex_unlock(&smn_mutex);
+
+out:
+ return err;
+}
+
+int amd_smn_read(u16 node, u32 address, u32 *value)
+{
+ return __amd_smn_rw(node, address, value, false);
+}
+EXPORT_SYMBOL_GPL(amd_smn_read);
+
+int amd_smn_write(u16 node, u32 address, u32 value)
+{
+ return __amd_smn_rw(node, address, &value, true);
+}
+EXPORT_SYMBOL_GPL(amd_smn_write);
+
+/*
+ * Data Fabric Indirect Access uses FICAA/FICAD.
+ *
+ * Fabric Indirect Configuration Access Address (FICAA): Constructed based
+ * on the device's Instance Id and the PCI function and register offset of
+ * the desired register.
+ *
+ * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO
+ * and FICAD HI registers but so far we only need the LO register.
+ */
+int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo)
+{
+ struct pci_dev *F4;
+ u32 ficaa;
+ int err = -ENODEV;
+
+ if (node >= amd_northbridges.num)
+ goto out;
+
+ F4 = node_to_amd_nb(node)->link;
+ if (!F4)
+ goto out;
+
+ ficaa = 1;
+ ficaa |= reg & 0x3FC;
+ ficaa |= (func & 0x7) << 11;
+ ficaa |= instance_id << 16;
+
+ mutex_lock(&smn_mutex);
+
+ err = pci_write_config_dword(F4, 0x5C, ficaa);
+ if (err) {
+ pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa);
+ goto out_unlock;
+ }
+
+ err = pci_read_config_dword(F4, 0x98, lo);
+ if (err)
+ pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa);
+
+out_unlock:
+ mutex_unlock(&smn_mutex);
+
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(amd_df_indirect_read);
+
+int amd_cache_northbridges(void)
+{
+ u16 i = 0;
+ struct amd_northbridge *nb;
+ struct pci_dev *root, *misc, *link;
+
+ if (amd_northbridges.num)
+ return 0;
+
+ misc = NULL;
+ while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL)
+ i++;
+
+ if (!i)
+ return -ENODEV;
+
+ nb = kcalloc(i, sizeof(struct amd_northbridge), GFP_KERNEL);
+ if (!nb)
+ return -ENOMEM;
+
+ amd_northbridges.nb = nb;
+ amd_northbridges.num = i;
+
+ link = misc = root = NULL;
+ for (i = 0; i != amd_northbridges.num; i++) {
+ node_to_amd_nb(i)->root = root =
+ next_northbridge(root, amd_root_ids);
+ node_to_amd_nb(i)->misc = misc =
+ next_northbridge(misc, amd_nb_misc_ids);
+ node_to_amd_nb(i)->link = link =
+ next_northbridge(link, amd_nb_link_ids);
+ }
+
+ if (amd_gart_present())
+ amd_northbridges.flags |= AMD_NB_GART;
+
+ /*
+ * Check for L3 cache presence.
+ */
+ if (!cpuid_edx(0x80000006))
+ return 0;
+
+ /*
+ * Some CPU families support L3 Cache Index Disable. There are some
+ * limitations because of E382 and E388 on family 0x10.
+ */
+ if (boot_cpu_data.x86 == 0x10 &&
+ boot_cpu_data.x86_model >= 0x8 &&
+ (boot_cpu_data.x86_model > 0x9 ||
+ boot_cpu_data.x86_stepping >= 0x1))
+ amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
+
+ if (boot_cpu_data.x86 == 0x15)
+ amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
+
+ /* L3 cache partitioning is supported on family 0x15 */
+ if (boot_cpu_data.x86 == 0x15)
+ amd_northbridges.flags |= AMD_NB_L3_PARTITIONING;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(amd_cache_northbridges);
+
+/*
+ * Ignores subdevice/subvendor but as far as I can figure out
+ * they're useless anyways
+ */
+bool __init early_is_amd_nb(u32 device)
+{
+ const struct pci_device_id *id;
+ u32 vendor = device & 0xffff;
+
+ device >>= 16;
+ for (id = amd_nb_misc_ids; id->vendor; id++)
+ if (vendor == id->vendor && device == id->device)
+ return true;
+ return false;
+}
+
+struct resource *amd_get_mmconfig_range(struct resource *res)
+{
+ u32 address;
+ u64 base, msr;
+ unsigned int segn_busn_bits;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return NULL;
+
+ /* assume all cpus from fam10h have mmconfig */
+ if (boot_cpu_data.x86 < 0x10)
+ return NULL;
+
+ address = MSR_FAM10H_MMIO_CONF_BASE;
+ rdmsrl(address, msr);
+
+ /* mmconfig is not enabled */
+ if (!(msr & FAM10H_MMIO_CONF_ENABLE))
+ return NULL;
+
+ base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
+
+ segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
+ FAM10H_MMIO_CONF_BUSRANGE_MASK;
+
+ res->flags = IORESOURCE_MEM;
+ res->start = base;
+ res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
+ return res;
+}
+
+int amd_get_subcaches(int cpu)
+{
+ struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
+ unsigned int mask;
+
+ if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ return 0;
+
+ pci_read_config_dword(link, 0x1d4, &mask);
+
+ return (mask >> (4 * cpu_data(cpu).cpu_core_id)) & 0xf;
+}
+
+int amd_set_subcaches(int cpu, unsigned long mask)
+{
+ static unsigned int reset, ban;
+ struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
+ unsigned int reg;
+ int cuid;
+
+ if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
+ return -EINVAL;
+
+ /* if necessary, collect reset state of L3 partitioning and BAN mode */
+ if (reset == 0) {
+ pci_read_config_dword(nb->link, 0x1d4, &reset);
+ pci_read_config_dword(nb->misc, 0x1b8, &ban);
+ ban &= 0x180000;
+ }
+
+ /* deactivate BAN mode if any subcaches are to be disabled */
+ if (mask != 0xf) {
+ pci_read_config_dword(nb->misc, 0x1b8, &reg);
+ pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
+ }
+
+ cuid = cpu_data(cpu).cpu_core_id;
+ mask <<= 4 * cuid;
+ mask |= (0xf ^ (1 << cuid)) << 26;
+
+ pci_write_config_dword(nb->link, 0x1d4, mask);
+
+ /* reset BAN mode if L3 partitioning returned to reset state */
+ pci_read_config_dword(nb->link, 0x1d4, &reg);
+ if (reg == reset) {
+ pci_read_config_dword(nb->misc, 0x1b8, &reg);
+ reg &= ~0x180000;
+ pci_write_config_dword(nb->misc, 0x1b8, reg | ban);
+ }
+
+ return 0;
+}
+
+static void amd_cache_gart(void)
+{
+ u16 i;
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return;
+
+ flush_words = kmalloc_array(amd_northbridges.num, sizeof(u32), GFP_KERNEL);
+ if (!flush_words) {
+ amd_northbridges.flags &= ~AMD_NB_GART;
+ pr_notice("Cannot initialize GART flush words, GART support disabled\n");
+ return;
+ }
+
+ for (i = 0; i != amd_northbridges.num; i++)
+ pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &flush_words[i]);
+}
+
+void amd_flush_garts(void)
+{
+ int flushed, i;
+ unsigned long flags;
+ static DEFINE_SPINLOCK(gart_lock);
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return;
+
+ /*
+ * Avoid races between AGP and IOMMU. In theory it's not needed
+ * but I'm not sure if the hardware won't lose flush requests
+ * when another is pending. This whole thing is so expensive anyways
+ * that it doesn't matter to serialize more. -AK
+ */
+ spin_lock_irqsave(&gart_lock, flags);
+ flushed = 0;
+ for (i = 0; i < amd_northbridges.num; i++) {
+ pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c,
+ flush_words[i] | 1);
+ flushed++;
+ }
+ for (i = 0; i < amd_northbridges.num; i++) {
+ u32 w;
+ /* Make sure the hardware actually executed the flush*/
+ for (;;) {
+ pci_read_config_dword(node_to_amd_nb(i)->misc,
+ 0x9c, &w);
+ if (!(w & 1))
+ break;
+ cpu_relax();
+ }
+ }
+ spin_unlock_irqrestore(&gart_lock, flags);
+ if (!flushed)
+ pr_notice("nothing to flush?\n");
+}
+EXPORT_SYMBOL_GPL(amd_flush_garts);
+
+static void __fix_erratum_688(void *info)
+{
+#define MSR_AMD64_IC_CFG 0xC0011021
+
+ msr_set_bit(MSR_AMD64_IC_CFG, 3);
+ msr_set_bit(MSR_AMD64_IC_CFG, 14);
+}
+
+/* Apply erratum 688 fix so machines without a BIOS fix work. */
+static __init void fix_erratum_688(void)
+{
+ struct pci_dev *F4;
+ u32 val;
+
+ if (boot_cpu_data.x86 != 0x14)
+ return;
+
+ if (!amd_northbridges.num)
+ return;
+
+ F4 = node_to_amd_nb(0)->link;
+ if (!F4)
+ return;
+
+ if (pci_read_config_dword(F4, 0x164, &val))
+ return;
+
+ if (val & BIT(2))
+ return;
+
+ on_each_cpu(__fix_erratum_688, NULL, 0);
+
+ pr_info("x86/cpu/AMD: CPU erratum 688 worked around\n");
+}
+
+static __init int init_amd_nbs(void)
+{
+ amd_cache_northbridges();
+ amd_cache_gart();
+
+ fix_erratum_688();
+
+ return 0;
+}
+
+/* This has to go after the PCI subsystem */
+fs_initcall(init_amd_nbs);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
new file mode 100644
index 000000000..65721dc73
--- /dev/null
+++ b/arch/x86/kernel/apb_timer.c
@@ -0,0 +1,404 @@
+/*
+ * apb_timer.c: Driver for Langwell APB timers
+ *
+ * (C) Copyright 2009 Intel Corporation
+ * Author: Jacob Pan (jacob.jun.pan@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Note:
+ * Langwell is the south complex of Intel Moorestown MID platform. There are
+ * eight external timers in total that can be used by the operating system.
+ * The timer information, such as frequency and addresses, is provided to the
+ * OS via SFI tables.
+ * Timer interrupts are routed via FW/HW emulated IOAPIC independently via
+ * individual redirection table entries (RTE).
+ * Unlike HPET, there is no master counter, therefore one of the timers are
+ * used as clocksource. The overall allocation looks like:
+ * - timer 0 - NR_CPUs for per cpu timer
+ * - one timer for clocksource
+ * - one timer for watchdog driver.
+ * It is also worth notice that APB timer does not support true one-shot mode,
+ * free-running mode will be used here to emulate one-shot mode.
+ * APB timer can also be used as broadcast timer along with per cpu local APIC
+ * timer, but by default APB timer has higher rating than local APIC timers.
+ */
+
+#include <linux/delay.h>
+#include <linux/dw_apb_timer.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/pm.h>
+#include <linux/sfi.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/irq.h>
+
+#include <asm/fixmap.h>
+#include <asm/apb_timer.h>
+#include <asm/intel-mid.h>
+#include <asm/time.h>
+
+#define APBT_CLOCKEVENT_RATING 110
+#define APBT_CLOCKSOURCE_RATING 250
+
+#define APBT_CLOCKEVENT0_NUM (0)
+#define APBT_CLOCKSOURCE_NUM (2)
+
+static phys_addr_t apbt_address;
+static int apb_timer_block_enabled;
+static void __iomem *apbt_virt_address;
+
+/*
+ * Common DW APB timer info
+ */
+static unsigned long apbt_freq;
+
+struct apbt_dev {
+ struct dw_apb_clock_event_device *timer;
+ unsigned int num;
+ int cpu;
+ unsigned int irq;
+ char name[10];
+};
+
+static struct dw_apb_clocksource *clocksource_apbt;
+
+static inline void __iomem *adev_virt_addr(struct apbt_dev *adev)
+{
+ return apbt_virt_address + adev->num * APBTMRS_REG_SIZE;
+}
+
+static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
+
+#ifdef CONFIG_SMP
+static unsigned int apbt_num_timers_used;
+#endif
+
+static inline void apbt_set_mapping(void)
+{
+ struct sfi_timer_table_entry *mtmr;
+ int phy_cs_timer_id = 0;
+
+ if (apbt_virt_address) {
+ pr_debug("APBT base already mapped\n");
+ return;
+ }
+ mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
+ if (mtmr == NULL) {
+ printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
+ APBT_CLOCKEVENT0_NUM);
+ return;
+ }
+ apbt_address = (phys_addr_t)mtmr->phys_addr;
+ if (!apbt_address) {
+ printk(KERN_WARNING "No timer base from SFI, use default\n");
+ apbt_address = APBT_DEFAULT_BASE;
+ }
+ apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
+ if (!apbt_virt_address) {
+ pr_debug("Failed mapping APBT phy address at %lu\n",\
+ (unsigned long)apbt_address);
+ goto panic_noapbt;
+ }
+ apbt_freq = mtmr->freq_hz;
+ sfi_free_mtmr(mtmr);
+
+ /* Now figure out the physical timer id for clocksource device */
+ mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM);
+ if (mtmr == NULL)
+ goto panic_noapbt;
+
+ /* Now figure out the physical timer id */
+ pr_debug("Use timer %d for clocksource\n",
+ (int)(mtmr->phys_addr & 0xff) / APBTMRS_REG_SIZE);
+ phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) /
+ APBTMRS_REG_SIZE;
+
+ clocksource_apbt = dw_apb_clocksource_init(APBT_CLOCKSOURCE_RATING,
+ "apbt0", apbt_virt_address + phy_cs_timer_id *
+ APBTMRS_REG_SIZE, apbt_freq);
+ return;
+
+panic_noapbt:
+ panic("Failed to setup APB system timer\n");
+
+}
+
+static inline void apbt_clear_mapping(void)
+{
+ iounmap(apbt_virt_address);
+ apbt_virt_address = NULL;
+}
+
+static int __init apbt_clockevent_register(void)
+{
+ struct sfi_timer_table_entry *mtmr;
+ struct apbt_dev *adev = this_cpu_ptr(&cpu_apbt_dev);
+
+ mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
+ if (mtmr == NULL) {
+ printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
+ APBT_CLOCKEVENT0_NUM);
+ return -ENODEV;
+ }
+
+ adev->num = smp_processor_id();
+ adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0",
+ intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ?
+ APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING,
+ adev_virt_addr(adev), 0, apbt_freq);
+ /* Firmware does EOI handling for us. */
+ adev->timer->eoi = NULL;
+
+ if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {
+ global_clock_event = &adev->timer->ced;
+ printk(KERN_DEBUG "%s clockevent registered as global\n",
+ global_clock_event->name);
+ }
+
+ dw_apb_clockevent_register(adev->timer);
+
+ sfi_free_mtmr(mtmr);
+ return 0;
+}
+
+#ifdef CONFIG_SMP
+
+static void apbt_setup_irq(struct apbt_dev *adev)
+{
+ irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
+ irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
+}
+
+/* Should be called with per cpu */
+void apbt_setup_secondary_clock(void)
+{
+ struct apbt_dev *adev;
+ int cpu;
+
+ /* Don't register boot CPU clockevent */
+ cpu = smp_processor_id();
+ if (!cpu)
+ return;
+
+ adev = this_cpu_ptr(&cpu_apbt_dev);
+ if (!adev->timer) {
+ adev->timer = dw_apb_clockevent_init(cpu, adev->name,
+ APBT_CLOCKEVENT_RATING, adev_virt_addr(adev),
+ adev->irq, apbt_freq);
+ adev->timer->eoi = NULL;
+ } else {
+ dw_apb_clockevent_resume(adev->timer);
+ }
+
+ printk(KERN_INFO "Registering CPU %d clockevent device %s, cpu %08x\n",
+ cpu, adev->name, adev->cpu);
+
+ apbt_setup_irq(adev);
+ dw_apb_clockevent_register(adev->timer);
+
+ return;
+}
+
+/*
+ * this notify handler process CPU hotplug events. in case of S0i3, nonboot
+ * cpus are disabled/enabled frequently, for performance reasons, we keep the
+ * per cpu timer irq registered so that we do need to do free_irq/request_irq.
+ *
+ * TODO: it might be more reliable to directly disable percpu clockevent device
+ * without the notifier chain. currently, cpu 0 may get interrupts from other
+ * cpu timers during the offline process due to the ordering of notification.
+ * the extra interrupt is harmless.
+ */
+static int apbt_cpu_dead(unsigned int cpu)
+{
+ struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu);
+
+ dw_apb_clockevent_pause(adev->timer);
+ if (system_state == SYSTEM_RUNNING) {
+ pr_debug("skipping APBT CPU %u offline\n", cpu);
+ } else {
+ pr_debug("APBT clockevent for cpu %u offline\n", cpu);
+ dw_apb_clockevent_stop(adev->timer);
+ }
+ return 0;
+}
+
+static __init int apbt_late_init(void)
+{
+ if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ||
+ !apb_timer_block_enabled)
+ return 0;
+ return cpuhp_setup_state(CPUHP_X86_APB_DEAD, "x86/apb:dead", NULL,
+ apbt_cpu_dead);
+}
+fs_initcall(apbt_late_init);
+#else
+
+void apbt_setup_secondary_clock(void) {}
+
+#endif /* CONFIG_SMP */
+
+static int apbt_clocksource_register(void)
+{
+ u64 start, now;
+ u64 t1;
+
+ /* Start the counter, use timer 2 as source, timer 0/1 for event */
+ dw_apb_clocksource_start(clocksource_apbt);
+
+ /* Verify whether apbt counter works */
+ t1 = dw_apb_clocksource_read(clocksource_apbt);
+ start = rdtsc();
+
+ /*
+ * We don't know the TSC frequency yet, but waiting for
+ * 200000 TSC cycles is safe:
+ * 4 GHz == 50us
+ * 1 GHz == 200us
+ */
+ do {
+ rep_nop();
+ now = rdtsc();
+ } while ((now - start) < 200000UL);
+
+ /* APBT is the only always on clocksource, it has to work! */
+ if (t1 == dw_apb_clocksource_read(clocksource_apbt))
+ panic("APBT counter not counting. APBT disabled\n");
+
+ dw_apb_clocksource_register(clocksource_apbt);
+
+ return 0;
+}
+
+/*
+ * Early setup the APBT timer, only use timer 0 for booting then switch to
+ * per CPU timer if possible.
+ * returns 1 if per cpu apbt is setup
+ * returns 0 if no per cpu apbt is chosen
+ * panic if set up failed, this is the only platform timer on Moorestown.
+ */
+void __init apbt_time_init(void)
+{
+#ifdef CONFIG_SMP
+ int i;
+ struct sfi_timer_table_entry *p_mtmr;
+ struct apbt_dev *adev;
+#endif
+
+ if (apb_timer_block_enabled)
+ return;
+ apbt_set_mapping();
+ if (!apbt_virt_address)
+ goto out_noapbt;
+ /*
+ * Read the frequency and check for a sane value, for ESL model
+ * we extend the possible clock range to allow time scaling.
+ */
+
+ if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
+ pr_debug("APBT has invalid freq 0x%lx\n", apbt_freq);
+ goto out_noapbt;
+ }
+ if (apbt_clocksource_register()) {
+ pr_debug("APBT has failed to register clocksource\n");
+ goto out_noapbt;
+ }
+ if (!apbt_clockevent_register())
+ apb_timer_block_enabled = 1;
+ else {
+ pr_debug("APBT has failed to register clockevent\n");
+ goto out_noapbt;
+ }
+#ifdef CONFIG_SMP
+ /* kernel cmdline disable apb timer, so we will use lapic timers */
+ if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {
+ printk(KERN_INFO "apbt: disabled per cpu timer\n");
+ return;
+ }
+ pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus());
+ if (num_possible_cpus() <= sfi_mtimer_num)
+ apbt_num_timers_used = num_possible_cpus();
+ else
+ apbt_num_timers_used = 1;
+ pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
+
+ /* here we set up per CPU timer data structure */
+ for (i = 0; i < apbt_num_timers_used; i++) {
+ adev = &per_cpu(cpu_apbt_dev, i);
+ adev->num = i;
+ adev->cpu = i;
+ p_mtmr = sfi_get_mtmr(i);
+ if (p_mtmr)
+ adev->irq = p_mtmr->irq;
+ else
+ printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
+ snprintf(adev->name, sizeof(adev->name) - 1, "apbt%d", i);
+ }
+#endif
+
+ return;
+
+out_noapbt:
+ apbt_clear_mapping();
+ apb_timer_block_enabled = 0;
+ panic("failed to enable APB timer\n");
+}
+
+/* called before apb_timer_enable, use early map */
+unsigned long apbt_quick_calibrate(void)
+{
+ int i, scale;
+ u64 old, new;
+ u64 t1, t2;
+ unsigned long khz = 0;
+ u32 loop, shift;
+
+ apbt_set_mapping();
+ dw_apb_clocksource_start(clocksource_apbt);
+
+ /* check if the timer can count down, otherwise return */
+ old = dw_apb_clocksource_read(clocksource_apbt);
+ i = 10000;
+ while (--i) {
+ if (old != dw_apb_clocksource_read(clocksource_apbt))
+ break;
+ }
+ if (!i)
+ goto failed;
+
+ /* count 16 ms */
+ loop = (apbt_freq / 1000) << 4;
+
+ /* restart the timer to ensure it won't get to 0 in the calibration */
+ dw_apb_clocksource_start(clocksource_apbt);
+
+ old = dw_apb_clocksource_read(clocksource_apbt);
+ old += loop;
+
+ t1 = rdtsc();
+
+ do {
+ new = dw_apb_clocksource_read(clocksource_apbt);
+ } while (new < old);
+
+ t2 = rdtsc();
+
+ shift = 5;
+ if (unlikely(loop >> shift == 0)) {
+ printk(KERN_INFO
+ "APBT TSC calibration failed, not enough resolution\n");
+ return 0;
+ }
+ scale = (int)div_u64((t2 - t1), loop >> shift);
+ khz = (scale * (apbt_freq / 1000)) >> shift;
+ printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
+ return khz;
+failed:
+ return 0;
+}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
new file mode 100644
index 000000000..93426c5fc
--- /dev/null
+++ b/arch/x86/kernel/aperture_64.c
@@ -0,0 +1,551 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Firmware replacement code.
+ *
+ * Work around broken BIOSes that don't set an aperture, only set the
+ * aperture in the AGP bridge, or set too small aperture.
+ *
+ * If all fails map the aperture over some low memory. This is cheaper than
+ * doing bounce buffering. The memory is lost. This is done at early boot
+ * because only the bootmem allocator can allocate 32+MB.
+ *
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ */
+#define pr_fmt(fmt) "AGP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/kcore.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/mmzone.h>
+#include <linux/pci_ids.h>
+#include <linux/pci.h>
+#include <linux/bitops.h>
+#include <linux/suspend.h>
+#include <asm/e820/api.h>
+#include <asm/io.h>
+#include <asm/iommu.h>
+#include <asm/gart.h>
+#include <asm/pci-direct.h>
+#include <asm/dma.h>
+#include <asm/amd_nb.h>
+#include <asm/x86_init.h>
+#include <linux/crash_dump.h>
+
+/*
+ * Using 512M as goal, in case kexec will load kernel_big
+ * that will do the on-position decompress, and could overlap with
+ * with the gart aperture that is used.
+ * Sequence:
+ * kernel_small
+ * ==> kexec (with kdump trigger path or gart still enabled)
+ * ==> kernel_small (gart area become e820_reserved)
+ * ==> kexec (with kdump trigger path or gart still enabled)
+ * ==> kerne_big (uncompressed size will be big than 64M or 128M)
+ * So don't use 512M below as gart iommu, leave the space for kernel
+ * code for safe.
+ */
+#define GART_MIN_ADDR (512ULL << 20)
+#define GART_MAX_ADDR (1ULL << 32)
+
+int gart_iommu_aperture;
+int gart_iommu_aperture_disabled __initdata;
+int gart_iommu_aperture_allowed __initdata;
+
+int fallback_aper_order __initdata = 1; /* 64MB */
+int fallback_aper_force __initdata;
+
+int fix_aperture __initdata = 1;
+
+#if defined(CONFIG_PROC_VMCORE) || defined(CONFIG_PROC_KCORE)
+/*
+ * If the first kernel maps the aperture over e820 RAM, the kdump kernel will
+ * use the same range because it will remain configured in the northbridge.
+ * Trying to dump this area via /proc/vmcore may crash the machine, so exclude
+ * it from vmcore.
+ */
+static unsigned long aperture_pfn_start, aperture_page_count;
+
+static int gart_mem_pfn_is_ram(unsigned long pfn)
+{
+ return likely((pfn < aperture_pfn_start) ||
+ (pfn >= aperture_pfn_start + aperture_page_count));
+}
+
+static void __init exclude_from_core(u64 aper_base, u32 aper_order)
+{
+ aperture_pfn_start = aper_base >> PAGE_SHIFT;
+ aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT;
+#ifdef CONFIG_PROC_VMCORE
+ WARN_ON(register_oldmem_pfn_is_ram(&gart_mem_pfn_is_ram));
+#endif
+#ifdef CONFIG_PROC_KCORE
+ WARN_ON(register_mem_pfn_is_ram(&gart_mem_pfn_is_ram));
+#endif
+}
+#else
+static void exclude_from_core(u64 aper_base, u32 aper_order)
+{
+}
+#endif
+
+/* This code runs before the PCI subsystem is initialized, so just
+ access the northbridge directly. */
+
+static u32 __init allocate_aperture(void)
+{
+ u32 aper_size;
+ unsigned long addr;
+
+ /* aper_size should <= 1G */
+ if (fallback_aper_order > 5)
+ fallback_aper_order = 5;
+ aper_size = (32 * 1024 * 1024) << fallback_aper_order;
+
+ /*
+ * Aperture has to be naturally aligned. This means a 2GB aperture
+ * won't have much chance of finding a place in the lower 4GB of
+ * memory. Unfortunately we cannot move it up because that would
+ * make the IOMMU useless.
+ */
+ addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
+ aper_size, aper_size);
+ if (!addr) {
+ pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n",
+ addr, addr + aper_size - 1, aper_size >> 10);
+ return 0;
+ }
+ memblock_reserve(addr, aper_size);
+ pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n",
+ addr, addr + aper_size - 1, aper_size >> 10);
+ register_nosave_region(addr >> PAGE_SHIFT,
+ (addr+aper_size) >> PAGE_SHIFT);
+
+ return (u32)addr;
+}
+
+
+/* Find a PCI capability */
+static u32 __init find_cap(int bus, int slot, int func, int cap)
+{
+ int bytes;
+ u8 pos;
+
+ if (!(read_pci_config_16(bus, slot, func, PCI_STATUS) &
+ PCI_STATUS_CAP_LIST))
+ return 0;
+
+ pos = read_pci_config_byte(bus, slot, func, PCI_CAPABILITY_LIST);
+ for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
+ u8 id;
+
+ pos &= ~3;
+ id = read_pci_config_byte(bus, slot, func, pos+PCI_CAP_LIST_ID);
+ if (id == 0xff)
+ break;
+ if (id == cap)
+ return pos;
+ pos = read_pci_config_byte(bus, slot, func,
+ pos+PCI_CAP_LIST_NEXT);
+ }
+ return 0;
+}
+
+/* Read a standard AGPv3 bridge header */
+static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
+{
+ u32 apsize;
+ u32 apsizereg;
+ int nbits;
+ u32 aper_low, aper_hi;
+ u64 aper;
+ u32 old_order;
+
+ pr_info("pci 0000:%02x:%02x:%02x: AGP bridge\n", bus, slot, func);
+ apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14);
+ if (apsizereg == 0xffffffff) {
+ pr_err("pci 0000:%02x:%02x.%d: APSIZE unreadable\n",
+ bus, slot, func);
+ return 0;
+ }
+
+ /* old_order could be the value from NB gart setting */
+ old_order = *order;
+
+ apsize = apsizereg & 0xfff;
+ /* Some BIOS use weird encodings not in the AGPv3 table. */
+ if (apsize & 0xff)
+ apsize |= 0xf00;
+ nbits = hweight16(apsize);
+ *order = 7 - nbits;
+ if ((int)*order < 0) /* < 32MB */
+ *order = 0;
+
+ aper_low = read_pci_config(bus, slot, func, 0x10);
+ aper_hi = read_pci_config(bus, slot, func, 0x14);
+ aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
+
+ /*
+ * On some sick chips, APSIZE is 0. It means it wants 4G
+ * so let double check that order, and lets trust AMD NB settings:
+ */
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (old size %uMB)\n",
+ bus, slot, func, aper, aper + (32ULL << (old_order + 20)) - 1,
+ 32 << old_order);
+ if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) {
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture size %uMB (APSIZE %#x) is not right, using settings from NB\n",
+ bus, slot, func, 32 << *order, apsizereg);
+ *order = old_order;
+ }
+
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (%uMB, APSIZE %#x)\n",
+ bus, slot, func, aper, aper + (32ULL << (*order + 20)) - 1,
+ 32 << *order, apsizereg);
+
+ if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20))
+ return 0;
+ return (u32)aper;
+}
+
+/*
+ * Look for an AGP bridge. Windows only expects the aperture in the
+ * AGP bridge and some BIOS forget to initialize the Northbridge too.
+ * Work around this here.
+ *
+ * Do an PCI bus scan by hand because we're running before the PCI
+ * subsystem.
+ *
+ * All AMD AGP bridges are AGPv3 compliant, so we can do this scan
+ * generically. It's probably overkill to always scan all slots because
+ * the AGP bridges should be always an own bus on the HT hierarchy,
+ * but do it here for future safety.
+ */
+static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
+{
+ int bus, slot, func;
+
+ /* Poor man's PCI discovery */
+ for (bus = 0; bus < 256; bus++) {
+ for (slot = 0; slot < 32; slot++) {
+ for (func = 0; func < 8; func++) {
+ u32 class, cap;
+ u8 type;
+ class = read_pci_config(bus, slot, func,
+ PCI_CLASS_REVISION);
+ if (class == 0xffffffff)
+ break;
+
+ switch (class >> 16) {
+ case PCI_CLASS_BRIDGE_HOST:
+ case PCI_CLASS_BRIDGE_OTHER: /* needed? */
+ /* AGP bridge? */
+ cap = find_cap(bus, slot, func,
+ PCI_CAP_ID_AGP);
+ if (!cap)
+ break;
+ *valid_agp = 1;
+ return read_agp(bus, slot, func, cap,
+ order);
+ }
+
+ /* No multi-function device? */
+ type = read_pci_config_byte(bus, slot, func,
+ PCI_HEADER_TYPE);
+ if (!(type & 0x80))
+ break;
+ }
+ }
+ }
+ pr_info("No AGP bridge found\n");
+
+ return 0;
+}
+
+static bool gart_fix_e820 __initdata = true;
+
+static int __init parse_gart_mem(char *p)
+{
+ return kstrtobool(p, &gart_fix_e820);
+}
+early_param("gart_fix_e820", parse_gart_mem);
+
+void __init early_gart_iommu_check(void)
+{
+ /*
+ * in case it is enabled before, esp for kexec/kdump,
+ * previous kernel already enable that. memset called
+ * by allocate_aperture/__alloc_bootmem_nopanic cause restart.
+ * or second kernel have different position for GART hole. and new
+ * kernel could use hole as RAM that is still used by GART set by
+ * first kernel
+ * or BIOS forget to put that in reserved.
+ * try to update e820 to make that region as reserved.
+ */
+ u32 agp_aper_order = 0;
+ int i, fix, slot, valid_agp = 0;
+ u32 ctl;
+ u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
+ u64 aper_base = 0, last_aper_base = 0;
+ int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0;
+
+ if (!amd_gart_present())
+ return;
+
+ if (!early_pci_allowed())
+ return;
+
+ /* This is mostly duplicate of iommu_hole_init */
+ search_agp_bridge(&agp_aper_order, &valid_agp);
+
+ fix = 0;
+ for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) {
+ int bus;
+ int dev_base, dev_limit;
+
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
+
+ for (slot = dev_base; slot < dev_limit; slot++) {
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
+ continue;
+
+ ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
+ aper_enabled = ctl & GARTEN;
+ aper_order = (ctl >> 1) & 7;
+ aper_size = (32 * 1024 * 1024) << aper_order;
+ aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
+ aper_base <<= 25;
+
+ if (last_valid) {
+ if ((aper_order != last_aper_order) ||
+ (aper_base != last_aper_base) ||
+ (aper_enabled != last_aper_enabled)) {
+ fix = 1;
+ break;
+ }
+ }
+
+ last_aper_order = aper_order;
+ last_aper_base = aper_base;
+ last_aper_enabled = aper_enabled;
+ last_valid = 1;
+ }
+ }
+
+ if (!fix && !aper_enabled)
+ return;
+
+ if (!aper_base || !aper_size || aper_base + aper_size > 0x100000000UL)
+ fix = 1;
+
+ if (gart_fix_e820 && !fix && aper_enabled) {
+ if (e820__mapped_any(aper_base, aper_base + aper_size,
+ E820_TYPE_RAM)) {
+ /* reserve it, so we can reuse it in second kernel */
+ pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n",
+ aper_base, aper_base + aper_size - 1);
+ e820__range_add(aper_base, aper_size, E820_TYPE_RESERVED);
+ e820__update_table_print();
+ }
+ }
+
+ if (valid_agp)
+ return;
+
+ /* disable them all at first */
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
+ int bus;
+ int dev_base, dev_limit;
+
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
+
+ for (slot = dev_base; slot < dev_limit; slot++) {
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
+ continue;
+
+ ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
+ ctl &= ~GARTEN;
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
+ }
+ }
+
+}
+
+static int __initdata printed_gart_size_msg;
+
+int __init gart_iommu_hole_init(void)
+{
+ u32 agp_aper_base = 0, agp_aper_order = 0;
+ u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
+ u64 aper_base, last_aper_base = 0;
+ int fix, slot, valid_agp = 0;
+ int i, node;
+
+ if (!amd_gart_present())
+ return -ENODEV;
+
+ if (gart_iommu_aperture_disabled || !fix_aperture ||
+ !early_pci_allowed())
+ return -ENODEV;
+
+ pr_info("Checking aperture...\n");
+
+ if (!fallback_aper_force)
+ agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
+
+ fix = 0;
+ node = 0;
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
+ int bus;
+ int dev_base, dev_limit;
+ u32 ctl;
+
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
+
+ for (slot = dev_base; slot < dev_limit; slot++) {
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
+ continue;
+
+ iommu_detected = 1;
+ gart_iommu_aperture = 1;
+ x86_init.iommu.iommu_init = gart_iommu_init;
+
+ ctl = read_pci_config(bus, slot, 3,
+ AMD64_GARTAPERTURECTL);
+
+ /*
+ * Before we do anything else disable the GART. It may
+ * still be enabled if we boot into a crash-kernel here.
+ * Reconfiguring the GART while it is enabled could have
+ * unknown side-effects.
+ */
+ ctl &= ~GARTEN;
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
+
+ aper_order = (ctl >> 1) & 7;
+ aper_size = (32 * 1024 * 1024) << aper_order;
+ aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
+ aper_base <<= 25;
+
+ pr_info("Node %d: aperture [bus addr %#010Lx-%#010Lx] (%uMB)\n",
+ node, aper_base, aper_base + aper_size - 1,
+ aper_size >> 20);
+ node++;
+
+ if (!aperture_valid(aper_base, aper_size, 64<<20)) {
+ if (valid_agp && agp_aper_base &&
+ agp_aper_base == aper_base &&
+ agp_aper_order == aper_order) {
+ /* the same between two setting from NB and agp */
+ if (!no_iommu &&
+ max_pfn > MAX_DMA32_PFN &&
+ !printed_gart_size_msg) {
+ pr_err("you are using iommu with agp, but GART size is less than 64MB\n");
+ pr_err("please increase GART size in your BIOS setup\n");
+ pr_err("if BIOS doesn't have that option, contact your HW vendor!\n");
+ printed_gart_size_msg = 1;
+ }
+ } else {
+ fix = 1;
+ goto out;
+ }
+ }
+
+ if ((last_aper_order && aper_order != last_aper_order) ||
+ (last_aper_base && aper_base != last_aper_base)) {
+ fix = 1;
+ goto out;
+ }
+ last_aper_order = aper_order;
+ last_aper_base = aper_base;
+ }
+ }
+
+out:
+ if (!fix && !fallback_aper_force) {
+ if (last_aper_base) {
+ /*
+ * If this is the kdump kernel, the first kernel
+ * may have allocated the range over its e820 RAM
+ * and fixed up the northbridge
+ */
+ exclude_from_core(last_aper_base, last_aper_order);
+
+ return 1;
+ }
+ return 0;
+ }
+
+ if (!fallback_aper_force) {
+ aper_alloc = agp_aper_base;
+ aper_order = agp_aper_order;
+ }
+
+ if (aper_alloc) {
+ /* Got the aperture from the AGP bridge */
+ } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
+ force_iommu ||
+ valid_agp ||
+ fallback_aper_force) {
+ pr_info("Your BIOS doesn't leave an aperture memory hole\n");
+ pr_info("Please enable the IOMMU option in the BIOS setup\n");
+ pr_info("This costs you %dMB of RAM\n",
+ 32 << fallback_aper_order);
+
+ aper_order = fallback_aper_order;
+ aper_alloc = allocate_aperture();
+ if (!aper_alloc) {
+ /*
+ * Could disable AGP and IOMMU here, but it's
+ * probably not worth it. But the later users
+ * cannot deal with bad apertures and turning
+ * on the aperture over memory causes very
+ * strange problems, so it's better to panic
+ * early.
+ */
+ panic("Not enough memory for aperture");
+ }
+ } else {
+ return 0;
+ }
+
+ /*
+ * If this is the kdump kernel _and_ the first kernel did not
+ * configure the aperture in the northbridge, this range may
+ * overlap with the first kernel's memory. We can't access the
+ * range through vmcore even though it should be part of the dump.
+ */
+ exclude_from_core(aper_alloc, aper_order);
+
+ /* Fix up the north bridges */
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
+ int bus, dev_base, dev_limit;
+
+ /*
+ * Don't enable translation yet but enable GART IO and CPU
+ * accesses and set DISTLBWALKPRB since GART table memory is UC.
+ */
+ u32 ctl = aper_order << 1;
+
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
+ for (slot = dev_base; slot < dev_limit; slot++) {
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
+ continue;
+
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
+ }
+ }
+
+ set_up_gart_resume(aper_order, aper_alloc);
+
+ return 1;
+}
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
new file mode 100644
index 000000000..a6fcaf16c
--- /dev/null
+++ b/arch/x86/kernel/apic/Makefile
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for local APIC drivers and for the IO-APIC code
+#
+
+# Leads to non-deterministic coverage that is not a function of syscall inputs.
+# In particualr, smp_apic_timer_interrupt() is called in random places.
+KCOV_INSTRUMENT := n
+
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o
+obj-y += hw_nmi.o
+
+obj-$(CONFIG_X86_IO_APIC) += io_apic.o
+obj-$(CONFIG_PCI_MSI) += msi.o
+obj-$(CONFIG_SMP) += ipi.o
+
+ifeq ($(CONFIG_X86_64),y)
+# APIC probe will depend on the listing order here
+obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o
+obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
+obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
+obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
+obj-y += apic_flat_64.o
+endif
+
+# APIC probe will depend on the listing order here
+obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
+
+# For 32bit, probe_32 need to be listed last
+obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
new file mode 100644
index 000000000..926939978
--- /dev/null
+++ b/arch/x86/kernel/apic/apic.c
@@ -0,0 +1,2858 @@
+/*
+ * Local APIC handling, local APIC timers
+ *
+ * (c) 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
+ *
+ * Fixes
+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
+ * thanks to Eric Gilmore
+ * and Rolf G. Tews
+ * for testing these extensively.
+ * Maciej W. Rozycki : Various updates and fixes.
+ * Mikael Pettersson : Power Management for UP-APIC.
+ * Pavel Machek and
+ * Mikael Pettersson : PM converted to driver model.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/acpi_pmtmr.h>
+#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/bootmem.h>
+#include <linux/ftrace.h>
+#include <linux/ioport.h>
+#include <linux/export.h>
+#include <linux/syscore_ops.h>
+#include <linux/delay.h>
+#include <linux/timex.h>
+#include <linux/i8253.h>
+#include <linux/dmar.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/dmi.h>
+#include <linux/smp.h>
+#include <linux/mm.h>
+
+#include <asm/trace/irq_vectors.h>
+#include <asm/irq_remapping.h>
+#include <asm/perf_event.h>
+#include <asm/x86_init.h>
+#include <asm/pgalloc.h>
+#include <linux/atomic.h>
+#include <asm/barrier.h>
+#include <asm/mpspec.h>
+#include <asm/i8259.h>
+#include <asm/proto.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/desc.h>
+#include <asm/hpet.h>
+#include <asm/mtrr.h>
+#include <asm/time.h>
+#include <asm/smp.h>
+#include <asm/mce.h>
+#include <asm/tsc.h>
+#include <asm/hypervisor.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include <asm/irq_regs.h>
+
+unsigned int num_processors;
+
+unsigned disabled_cpus;
+
+/* Processor that is doing the boot up */
+unsigned int boot_cpu_physical_apicid = -1U;
+EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
+
+u8 boot_cpu_apic_version;
+
+/*
+ * The highest APIC ID seen during enumeration.
+ */
+static unsigned int max_physical_apicid;
+
+/*
+ * Bitmask of physically existing CPUs:
+ */
+physid_mask_t phys_cpu_present_map;
+
+/*
+ * Processor to be disabled specified by kernel parameter
+ * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to
+ * avoid undefined behaviour caused by sending INIT from AP to BSP.
+ */
+static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID;
+
+/*
+ * This variable controls which CPUs receive external NMIs. By default,
+ * external NMIs are delivered only to the BSP.
+ */
+static int apic_extnmi = APIC_EXTNMI_BSP;
+
+/*
+ * Map cpu index to physical APIC ID
+ */
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);
+
+#ifdef CONFIG_X86_32
+
+/*
+ * On x86_32, the mapping between cpu and logical apicid may vary
+ * depending on apic in use. The following early percpu variable is
+ * used for the mapping. This is where the behaviors of x86_64 and 32
+ * actually diverge. Let's keep it ugly for now.
+ */
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID);
+
+/* Local APIC was disabled by the BIOS and enabled by the kernel */
+static int enabled_via_apicbase;
+
+/*
+ * Handle interrupt mode configuration register (IMCR).
+ * This register controls whether the interrupt signals
+ * that reach the BSP come from the master PIC or from the
+ * local APIC. Before entering Symmetric I/O Mode, either
+ * the BIOS or the operating system must switch out of
+ * PIC Mode by changing the IMCR.
+ */
+static inline void imcr_pic_to_apic(void)
+{
+ /* select IMCR register */
+ outb(0x70, 0x22);
+ /* NMI and 8259 INTR go through APIC */
+ outb(0x01, 0x23);
+}
+
+static inline void imcr_apic_to_pic(void)
+{
+ /* select IMCR register */
+ outb(0x70, 0x22);
+ /* NMI and 8259 INTR go directly to BSP */
+ outb(0x00, 0x23);
+}
+#endif
+
+/*
+ * Knob to control our willingness to enable the local APIC.
+ *
+ * +1=force-enable
+ */
+static int force_enable_local_apic __initdata;
+
+/*
+ * APIC command line parameters
+ */
+static int __init parse_lapic(char *arg)
+{
+ if (IS_ENABLED(CONFIG_X86_32) && !arg)
+ force_enable_local_apic = 1;
+ else if (arg && !strncmp(arg, "notscdeadline", 13))
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+ return 0;
+}
+early_param("lapic", parse_lapic);
+
+#ifdef CONFIG_X86_64
+static int apic_calibrate_pmtmr __initdata;
+static __init int setup_apicpmtimer(char *s)
+{
+ apic_calibrate_pmtmr = 1;
+ notsc_setup(NULL);
+ return 1;
+}
+__setup("apicpmtimer", setup_apicpmtimer);
+#endif
+
+unsigned long mp_lapic_addr;
+int disable_apic;
+/* Disable local APIC timer from the kernel commandline or via dmi quirk */
+static int disable_apic_timer __initdata;
+/* Local APIC timer works in C2 */
+int local_apic_timer_c2_ok;
+EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
+
+/*
+ * Debug level, exported for io_apic.c
+ */
+int apic_verbosity;
+
+int pic_mode;
+
+/* Have we found an MP table */
+int smp_found_config;
+
+static struct resource lapic_resource = {
+ .name = "Local APIC",
+ .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
+};
+
+unsigned int lapic_timer_frequency = 0;
+
+static void apic_pm_activate(void);
+
+static unsigned long apic_phys;
+
+/*
+ * Get the LAPIC version
+ */
+static inline int lapic_get_version(void)
+{
+ return GET_APIC_VERSION(apic_read(APIC_LVR));
+}
+
+/*
+ * Check, if the APIC is integrated or a separate chip
+ */
+static inline int lapic_is_integrated(void)
+{
+ return APIC_INTEGRATED(lapic_get_version());
+}
+
+/*
+ * Check, whether this is a modern or a first generation APIC
+ */
+static int modern_apic(void)
+{
+ /* AMD systems use old APIC versions, so check the CPU */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+ boot_cpu_data.x86 >= 0xf)
+ return 1;
+ return lapic_get_version() >= 0x14;
+}
+
+/*
+ * right after this call apic become NOOP driven
+ * so apic->write/read doesn't do anything
+ */
+static void __init apic_disable(void)
+{
+ pr_info("APIC: switched to apic NOOP\n");
+ apic = &apic_noop;
+}
+
+void native_apic_wait_icr_idle(void)
+{
+ while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
+ cpu_relax();
+}
+
+u32 native_safe_apic_wait_icr_idle(void)
+{
+ u32 send_status;
+ int timeout;
+
+ timeout = 0;
+ do {
+ send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+ if (!send_status)
+ break;
+ inc_irq_stat(icr_read_retry_count);
+ udelay(100);
+ } while (timeout++ < 1000);
+
+ return send_status;
+}
+
+void native_apic_icr_write(u32 low, u32 id)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
+ apic_write(APIC_ICR, low);
+ local_irq_restore(flags);
+}
+
+u64 native_apic_icr_read(void)
+{
+ u32 icr1, icr2;
+
+ icr2 = apic_read(APIC_ICR2);
+ icr1 = apic_read(APIC_ICR);
+
+ return icr1 | ((u64)icr2 << 32);
+}
+
+#ifdef CONFIG_X86_32
+/**
+ * get_physical_broadcast - Get number of physical broadcast IDs
+ */
+int get_physical_broadcast(void)
+{
+ return modern_apic() ? 0xff : 0xf;
+}
+#endif
+
+/**
+ * lapic_get_maxlvt - get the maximum number of local vector table entries
+ */
+int lapic_get_maxlvt(void)
+{
+ /*
+ * - we always have APIC integrated on 64bit mode
+ * - 82489DXs do not report # of LVT entries
+ */
+ return lapic_is_integrated() ? GET_APIC_MAXLVT(apic_read(APIC_LVR)) : 2;
+}
+
+/*
+ * Local APIC timer
+ */
+
+/* Clock divisor */
+#define APIC_DIVISOR 16
+#define TSC_DIVISOR 8
+
+/*
+ * This function sets up the local APIC timer, with a timeout of
+ * 'clocks' APIC bus clock. During calibration we actually call
+ * this function twice on the boot CPU, once with a bogus timeout
+ * value, second time for real. The other (noncalibrating) CPUs
+ * call this function only once, with the real, calibrated value.
+ *
+ * We do reads before writes even if unnecessary, to get around the
+ * P5 APIC double write bug.
+ */
+static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
+{
+ unsigned int lvtt_value, tmp_value;
+
+ lvtt_value = LOCAL_TIMER_VECTOR;
+ if (!oneshot)
+ lvtt_value |= APIC_LVT_TIMER_PERIODIC;
+ else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE;
+
+ if (!lapic_is_integrated())
+ lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
+
+ if (!irqen)
+ lvtt_value |= APIC_LVT_MASKED;
+
+ apic_write(APIC_LVTT, lvtt_value);
+
+ if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) {
+ /*
+ * See Intel SDM: TSC-Deadline Mode chapter. In xAPIC mode,
+ * writing to the APIC LVTT and TSC_DEADLINE MSR isn't serialized.
+ * According to Intel, MFENCE can do the serialization here.
+ */
+ asm volatile("mfence" : : : "memory");
+ return;
+ }
+
+ /*
+ * Divide PICLK by 16
+ */
+ tmp_value = apic_read(APIC_TDCR);
+ apic_write(APIC_TDCR,
+ (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
+ APIC_TDR_DIV_16);
+
+ if (!oneshot)
+ apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
+}
+
+/*
+ * Setup extended LVT, AMD specific
+ *
+ * Software should use the LVT offsets the BIOS provides. The offsets
+ * are determined by the subsystems using it like those for MCE
+ * threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts
+ * are supported. Beginning with family 10h at least 4 offsets are
+ * available.
+ *
+ * Since the offsets must be consistent for all cores, we keep track
+ * of the LVT offsets in software and reserve the offset for the same
+ * vector also to be used on other cores. An offset is freed by
+ * setting the entry to APIC_EILVT_MASKED.
+ *
+ * If the BIOS is right, there should be no conflicts. Otherwise a
+ * "[Firmware Bug]: ..." error message is generated. However, if
+ * software does not properly determines the offsets, it is not
+ * necessarily a BIOS bug.
+ */
+
+static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX];
+
+static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new)
+{
+ return (old & APIC_EILVT_MASKED)
+ || (new == APIC_EILVT_MASKED)
+ || ((new & ~APIC_EILVT_MASKED) == old);
+}
+
+static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
+{
+ unsigned int rsvd, vector;
+
+ if (offset >= APIC_EILVT_NR_MAX)
+ return ~0;
+
+ rsvd = atomic_read(&eilvt_offsets[offset]);
+ do {
+ vector = rsvd & ~APIC_EILVT_MASKED; /* 0: unassigned */
+ if (vector && !eilvt_entry_is_changeable(vector, new))
+ /* may not change if vectors are different */
+ return rsvd;
+ rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new);
+ } while (rsvd != new);
+
+ rsvd &= ~APIC_EILVT_MASKED;
+ if (rsvd && rsvd != vector)
+ pr_info("LVT offset %d assigned for vector 0x%02x\n",
+ offset, rsvd);
+
+ return new;
+}
+
+/*
+ * If mask=1, the LVT entry does not generate interrupts while mask=0
+ * enables the vector. See also the BKDGs. Must be called with
+ * preemption disabled.
+ */
+
+int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
+{
+ unsigned long reg = APIC_EILVTn(offset);
+ unsigned int new, old, reserved;
+
+ new = (mask << 16) | (msg_type << 8) | vector;
+ old = apic_read(reg);
+ reserved = reserve_eilvt_offset(offset, new);
+
+ if (reserved != new) {
+ pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+ "vector 0x%x, but the register is already in use for "
+ "vector 0x%x on another cpu\n",
+ smp_processor_id(), reg, offset, new, reserved);
+ return -EINVAL;
+ }
+
+ if (!eilvt_entry_is_changeable(old, new)) {
+ pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+ "vector 0x%x, but the register is already in use for "
+ "vector 0x%x on this cpu\n",
+ smp_processor_id(), reg, offset, new, old);
+ return -EBUSY;
+ }
+
+ apic_write(reg, new);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(setup_APIC_eilvt);
+
+/*
+ * Program the next event, relative to now
+ */
+static int lapic_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ apic_write(APIC_TMICT, delta);
+ return 0;
+}
+
+static int lapic_next_deadline(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ u64 tsc;
+
+ /* This MSR is special and need a special fence: */
+ weak_wrmsr_fence();
+
+ tsc = rdtsc();
+ wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
+ return 0;
+}
+
+static int lapic_timer_shutdown(struct clock_event_device *evt)
+{
+ unsigned int v;
+
+ /* Lapic used as dummy for broadcast ? */
+ if (evt->features & CLOCK_EVT_FEAT_DUMMY)
+ return 0;
+
+ v = apic_read(APIC_LVTT);
+ v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+ apic_write(APIC_LVTT, v);
+ apic_write(APIC_TMICT, 0);
+ return 0;
+}
+
+static inline int
+lapic_timer_set_periodic_oneshot(struct clock_event_device *evt, bool oneshot)
+{
+ /* Lapic used as dummy for broadcast ? */
+ if (evt->features & CLOCK_EVT_FEAT_DUMMY)
+ return 0;
+
+ __setup_APIC_LVTT(lapic_timer_frequency, oneshot, 1);
+ return 0;
+}
+
+static int lapic_timer_set_periodic(struct clock_event_device *evt)
+{
+ return lapic_timer_set_periodic_oneshot(evt, false);
+}
+
+static int lapic_timer_set_oneshot(struct clock_event_device *evt)
+{
+ return lapic_timer_set_periodic_oneshot(evt, true);
+}
+
+/*
+ * Local APIC timer broadcast function
+ */
+static void lapic_timer_broadcast(const struct cpumask *mask)
+{
+#ifdef CONFIG_SMP
+ apic->send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
+#endif
+}
+
+
+/*
+ * The local apic timer can be used for any function which is CPU local.
+ */
+static struct clock_event_device lapic_clockevent = {
+ .name = "lapic",
+ .features = CLOCK_EVT_FEAT_PERIODIC |
+ CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP
+ | CLOCK_EVT_FEAT_DUMMY,
+ .shift = 32,
+ .set_state_shutdown = lapic_timer_shutdown,
+ .set_state_periodic = lapic_timer_set_periodic,
+ .set_state_oneshot = lapic_timer_set_oneshot,
+ .set_state_oneshot_stopped = lapic_timer_shutdown,
+ .set_next_event = lapic_next_event,
+ .broadcast = lapic_timer_broadcast,
+ .rating = 100,
+ .irq = -1,
+};
+static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
+
+#define DEADLINE_MODEL_MATCH_FUNC(model, func) \
+ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&func }
+
+#define DEADLINE_MODEL_MATCH_REV(model, rev) \
+ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)rev }
+
+static __init u32 hsx_deadline_rev(void)
+{
+ switch (boot_cpu_data.x86_stepping) {
+ case 0x02: return 0x3a; /* EP */
+ case 0x04: return 0x0f; /* EX */
+ }
+
+ return ~0U;
+}
+
+static __init u32 bdx_deadline_rev(void)
+{
+ switch (boot_cpu_data.x86_stepping) {
+ case 0x02: return 0x00000011;
+ case 0x03: return 0x0700000e;
+ case 0x04: return 0x0f00000c;
+ case 0x05: return 0x0e000003;
+ }
+
+ return ~0U;
+}
+
+static __init u32 skx_deadline_rev(void)
+{
+ switch (boot_cpu_data.x86_stepping) {
+ case 0x03: return 0x01000136;
+ case 0x04: return 0x02000014;
+ }
+
+ if (boot_cpu_data.x86_stepping > 4)
+ return 0;
+
+ return ~0U;
+}
+
+static const struct x86_cpu_id deadline_match[] __initconst = {
+ DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020),
+ DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev),
+ DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev),
+
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_GT3E, 0x17),
+
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_CORE, 0x25),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_GT3E, 0x17),
+
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_MOBILE, 0xb2),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_DESKTOP, 0xb2),
+
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_MOBILE, 0x52),
+ DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_DESKTOP, 0x52),
+
+ {},
+};
+
+static __init bool apic_validate_deadline_timer(void)
+{
+ const struct x86_cpu_id *m;
+ u32 rev;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return false;
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return true;
+
+ m = x86_match_cpu(deadline_match);
+ if (!m)
+ return true;
+
+ /*
+ * Function pointers will have the MSB set due to address layout,
+ * immediate revisions will not.
+ */
+ if ((long)m->driver_data < 0)
+ rev = ((u32 (*)(void))(m->driver_data))();
+ else
+ rev = (u32)m->driver_data;
+
+ if (boot_cpu_data.microcode >= rev)
+ return true;
+
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+ pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; "
+ "please update microcode to version: 0x%x (or later)\n", rev);
+ return false;
+}
+
+/*
+ * Setup the local APIC timer for this CPU. Copy the initialized values
+ * of the boot CPU and register the clock event in the framework.
+ */
+static void setup_APIC_timer(void)
+{
+ struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
+
+ if (this_cpu_has(X86_FEATURE_ARAT)) {
+ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
+ /* Make LAPIC timer preferrable over percpu HPET */
+ lapic_clockevent.rating = 150;
+ }
+
+ memcpy(levt, &lapic_clockevent, sizeof(*levt));
+ levt->cpumask = cpumask_of(smp_processor_id());
+
+ if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
+ levt->name = "lapic-deadline";
+ levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC |
+ CLOCK_EVT_FEAT_DUMMY);
+ levt->set_next_event = lapic_next_deadline;
+ clockevents_config_and_register(levt,
+ tsc_khz * (1000 / TSC_DIVISOR),
+ 0xF, ~0UL);
+ } else
+ clockevents_register_device(levt);
+}
+
+/*
+ * Install the updated TSC frequency from recalibration at the TSC
+ * deadline clockevent devices.
+ */
+static void __lapic_update_tsc_freq(void *info)
+{
+ struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
+
+ if (!this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return;
+
+ clockevents_update_freq(levt, tsc_khz * (1000 / TSC_DIVISOR));
+}
+
+void lapic_update_tsc_freq(void)
+{
+ /*
+ * The clockevent device's ->mult and ->shift can both be
+ * changed. In order to avoid races, schedule the frequency
+ * update code on each CPU.
+ */
+ on_each_cpu(__lapic_update_tsc_freq, NULL, 0);
+}
+
+/*
+ * In this functions we calibrate APIC bus clocks to the external timer.
+ *
+ * We want to do the calibration only once since we want to have local timer
+ * irqs syncron. CPUs connected by the same APIC bus have the very same bus
+ * frequency.
+ *
+ * This was previously done by reading the PIT/HPET and waiting for a wrap
+ * around to find out, that a tick has elapsed. I have a box, where the PIT
+ * readout is broken, so it never gets out of the wait loop again. This was
+ * also reported by others.
+ *
+ * Monitoring the jiffies value is inaccurate and the clockevents
+ * infrastructure allows us to do a simple substitution of the interrupt
+ * handler.
+ *
+ * The calibration routine also uses the pm_timer when possible, as the PIT
+ * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes
+ * back to normal later in the boot process).
+ */
+
+#define LAPIC_CAL_LOOPS (HZ/10)
+
+static __initdata int lapic_cal_loops = -1;
+static __initdata long lapic_cal_t1, lapic_cal_t2;
+static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
+static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
+static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
+
+/*
+ * Temporary interrupt handler and polled calibration function.
+ */
+static void __init lapic_cal_handler(struct clock_event_device *dev)
+{
+ unsigned long long tsc = 0;
+ long tapic = apic_read(APIC_TMCCT);
+ unsigned long pm = acpi_pm_read_early();
+
+ if (boot_cpu_has(X86_FEATURE_TSC))
+ tsc = rdtsc();
+
+ switch (lapic_cal_loops++) {
+ case 0:
+ lapic_cal_t1 = tapic;
+ lapic_cal_tsc1 = tsc;
+ lapic_cal_pm1 = pm;
+ lapic_cal_j1 = jiffies;
+ break;
+
+ case LAPIC_CAL_LOOPS:
+ lapic_cal_t2 = tapic;
+ lapic_cal_tsc2 = tsc;
+ if (pm < lapic_cal_pm1)
+ pm += ACPI_PM_OVRRUN;
+ lapic_cal_pm2 = pm;
+ lapic_cal_j2 = jiffies;
+ break;
+ }
+}
+
+static int __init
+calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
+{
+ const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
+ const long pm_thresh = pm_100ms / 100;
+ unsigned long mult;
+ u64 res;
+
+#ifndef CONFIG_X86_PM_TIMER
+ return -1;
+#endif
+
+ apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm);
+
+ /* Check, if the PM timer is available */
+ if (!deltapm)
+ return -1;
+
+ mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
+
+ if (deltapm > (pm_100ms - pm_thresh) &&
+ deltapm < (pm_100ms + pm_thresh)) {
+ apic_printk(APIC_VERBOSE, "... PM-Timer result ok\n");
+ return 0;
+ }
+
+ res = (((u64)deltapm) * mult) >> 22;
+ do_div(res, 1000000);
+ pr_warning("APIC calibration not consistent "
+ "with PM-Timer: %ldms instead of 100ms\n",(long)res);
+
+ /* Correct the lapic counter value */
+ res = (((u64)(*delta)) * pm_100ms);
+ do_div(res, deltapm);
+ pr_info("APIC delta adjusted to PM-Timer: "
+ "%lu (%ld)\n", (unsigned long)res, *delta);
+ *delta = (long)res;
+
+ /* Correct the tsc counter value */
+ if (boot_cpu_has(X86_FEATURE_TSC)) {
+ res = (((u64)(*deltatsc)) * pm_100ms);
+ do_div(res, deltapm);
+ apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
+ "PM-Timer: %lu (%ld)\n",
+ (unsigned long)res, *deltatsc);
+ *deltatsc = (long)res;
+ }
+
+ return 0;
+}
+
+static int __init calibrate_APIC_clock(void)
+{
+ struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
+ u64 tsc_perj = 0, tsc_start = 0;
+ unsigned long jif_start;
+ unsigned long deltaj;
+ long delta, deltatsc;
+ int pm_referenced = 0;
+
+ /**
+ * check if lapic timer has already been calibrated by platform
+ * specific routine, such as tsc calibration code. if so, we just fill
+ * in the clockevent structure and return.
+ */
+
+ if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
+ return 0;
+ } else if (lapic_timer_frequency) {
+ apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n",
+ lapic_timer_frequency);
+ lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR,
+ TICK_NSEC, lapic_clockevent.shift);
+ lapic_clockevent.max_delta_ns =
+ clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
+ lapic_clockevent.max_delta_ticks = 0x7FFFFF;
+ lapic_clockevent.min_delta_ns =
+ clockevent_delta2ns(0xF, &lapic_clockevent);
+ lapic_clockevent.min_delta_ticks = 0xF;
+ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
+ return 0;
+ }
+
+ apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
+ "calibrating APIC timer ...\n");
+
+ /*
+ * There are platforms w/o global clockevent devices. Instead of
+ * making the calibration conditional on that, use a polling based
+ * approach everywhere.
+ */
+ local_irq_disable();
+
+ /*
+ * Setup the APIC counter to maximum. There is no way the lapic
+ * can underflow in the 100ms detection time frame
+ */
+ __setup_APIC_LVTT(0xffffffff, 0, 0);
+
+ /*
+ * Methods to terminate the calibration loop:
+ * 1) Global clockevent if available (jiffies)
+ * 2) TSC if available and frequency is known
+ */
+ jif_start = READ_ONCE(jiffies);
+
+ if (tsc_khz) {
+ tsc_start = rdtsc();
+ tsc_perj = div_u64((u64)tsc_khz * 1000, HZ);
+ }
+
+ /*
+ * Enable interrupts so the tick can fire, if a global
+ * clockevent device is available
+ */
+ local_irq_enable();
+
+ while (lapic_cal_loops <= LAPIC_CAL_LOOPS) {
+ /* Wait for a tick to elapse */
+ while (1) {
+ if (tsc_khz) {
+ u64 tsc_now = rdtsc();
+ if ((tsc_now - tsc_start) >= tsc_perj) {
+ tsc_start += tsc_perj;
+ break;
+ }
+ } else {
+ unsigned long jif_now = READ_ONCE(jiffies);
+
+ if (time_after(jif_now, jif_start)) {
+ jif_start = jif_now;
+ break;
+ }
+ }
+ cpu_relax();
+ }
+
+ /* Invoke the calibration routine */
+ local_irq_disable();
+ lapic_cal_handler(NULL);
+ local_irq_enable();
+ }
+
+ local_irq_disable();
+
+ /* Build delta t1-t2 as apic timer counts down */
+ delta = lapic_cal_t1 - lapic_cal_t2;
+ apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
+
+ deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
+
+ /* we trust the PM based calibration if possible */
+ pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
+ &delta, &deltatsc);
+
+ /* Calculate the scaled math multiplication factor */
+ lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
+ lapic_clockevent.shift);
+ lapic_clockevent.max_delta_ns =
+ clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent);
+ lapic_clockevent.max_delta_ticks = 0x7FFFFFFF;
+ lapic_clockevent.min_delta_ns =
+ clockevent_delta2ns(0xF, &lapic_clockevent);
+ lapic_clockevent.min_delta_ticks = 0xF;
+
+ lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
+
+ apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
+ apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
+ apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
+ lapic_timer_frequency);
+
+ if (boot_cpu_has(X86_FEATURE_TSC)) {
+ apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
+ "%ld.%04ld MHz.\n",
+ (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ),
+ (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ));
+ }
+
+ apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
+ "%u.%04u MHz.\n",
+ lapic_timer_frequency / (1000000 / HZ),
+ lapic_timer_frequency % (1000000 / HZ));
+
+ /*
+ * Do a sanity check on the APIC calibration result
+ */
+ if (lapic_timer_frequency < (1000000 / HZ)) {
+ local_irq_enable();
+ pr_warning("APIC frequency too slow, disabling apic timer\n");
+ return -1;
+ }
+
+ levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
+
+ /*
+ * PM timer calibration failed or not turned on so lets try APIC
+ * timer based calibration, if a global clockevent device is
+ * available.
+ */
+ if (!pm_referenced && global_clock_event) {
+ apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
+
+ /*
+ * Setup the apic timer manually
+ */
+ levt->event_handler = lapic_cal_handler;
+ lapic_timer_set_periodic(levt);
+ lapic_cal_loops = -1;
+
+ /* Let the interrupts run */
+ local_irq_enable();
+
+ while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
+ cpu_relax();
+
+ /* Stop the lapic timer */
+ local_irq_disable();
+ lapic_timer_shutdown(levt);
+
+ /* Jiffies delta */
+ deltaj = lapic_cal_j2 - lapic_cal_j1;
+ apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
+
+ /* Check, if the jiffies result is consistent */
+ if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
+ apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
+ else
+ levt->features |= CLOCK_EVT_FEAT_DUMMY;
+ }
+ local_irq_enable();
+
+ if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
+ pr_warning("APIC timer disabled due to verification failure\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Setup the boot APIC
+ *
+ * Calibrate and verify the result.
+ */
+void __init setup_boot_APIC_clock(void)
+{
+ /*
+ * The local apic timer can be disabled via the kernel
+ * commandline or from the CPU detection code. Register the lapic
+ * timer as a dummy clock event source on SMP systems, so the
+ * broadcast mechanism is used. On UP systems simply ignore it.
+ */
+ if (disable_apic_timer) {
+ pr_info("Disabling APIC timer\n");
+ /* No broadcast on UP ! */
+ if (num_possible_cpus() > 1) {
+ lapic_clockevent.mult = 1;
+ setup_APIC_timer();
+ }
+ return;
+ }
+
+ if (calibrate_APIC_clock()) {
+ /* No broadcast on UP ! */
+ if (num_possible_cpus() > 1)
+ setup_APIC_timer();
+ return;
+ }
+
+ /*
+ * If nmi_watchdog is set to IO_APIC, we need the
+ * PIT/HPET going. Otherwise register lapic as a dummy
+ * device.
+ */
+ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
+
+ /* Setup the lapic or request the broadcast */
+ setup_APIC_timer();
+ amd_e400_c1e_apic_setup();
+}
+
+void setup_secondary_APIC_clock(void)
+{
+ setup_APIC_timer();
+ amd_e400_c1e_apic_setup();
+}
+
+/*
+ * The guts of the apic timer interrupt
+ */
+static void local_apic_timer_interrupt(void)
+{
+ struct clock_event_device *evt = this_cpu_ptr(&lapic_events);
+
+ /*
+ * Normally we should not be here till LAPIC has been initialized but
+ * in some cases like kdump, its possible that there is a pending LAPIC
+ * timer interrupt from previous kernel's context and is delivered in
+ * new kernel the moment interrupts are enabled.
+ *
+ * Interrupts are enabled early and LAPIC is setup much later, hence
+ * its possible that when we get here evt->event_handler is NULL.
+ * Check for event_handler being NULL and discard the interrupt as
+ * spurious.
+ */
+ if (!evt->event_handler) {
+ pr_warning("Spurious LAPIC timer interrupt on cpu %d\n",
+ smp_processor_id());
+ /* Switch it off */
+ lapic_timer_shutdown(evt);
+ return;
+ }
+
+ /*
+ * the NMI deadlock-detector uses this.
+ */
+ inc_irq_stat(apic_timer_irqs);
+
+ evt->event_handler(evt);
+}
+
+/*
+ * Local APIC timer interrupt. This is the most natural way for doing
+ * local interrupts, but local timer interrupts can be emulated by
+ * broadcast interrupts too. [in case the hw doesn't support APIC timers]
+ *
+ * [ if a single-CPU system runs an SMP kernel then we call the local
+ * interrupt as well. Thus we cannot inline the local irq ... ]
+ */
+__visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ /*
+ * NOTE! We'd better ACK the irq immediately,
+ * because timer handling can be slow.
+ *
+ * update_process_times() expects us to have done irq_enter().
+ * Besides, if we don't timer interrupts ignore the global
+ * interrupt lock, which is the WrongThing (tm) to do.
+ */
+ entering_ack_irq();
+ trace_local_timer_entry(LOCAL_TIMER_VECTOR);
+ local_apic_timer_interrupt();
+ trace_local_timer_exit(LOCAL_TIMER_VECTOR);
+ exiting_irq();
+
+ set_irq_regs(old_regs);
+}
+
+int setup_profiling_timer(unsigned int multiplier)
+{
+ return -EINVAL;
+}
+
+/*
+ * Local APIC start and shutdown
+ */
+
+/**
+ * clear_local_APIC - shutdown the local APIC
+ *
+ * This is called, when a CPU is disabled and before rebooting, so the state of
+ * the local APIC has no dangling leftovers. Also used to cleanout any BIOS
+ * leftovers during boot.
+ */
+void clear_local_APIC(void)
+{
+ int maxlvt;
+ u32 v;
+
+ /* APIC hasn't been mapped yet */
+ if (!x2apic_mode && !apic_phys)
+ return;
+
+ maxlvt = lapic_get_maxlvt();
+ /*
+ * Masking an LVT entry can trigger a local APIC error
+ * if the vector is zero. Mask LVTERR first to prevent this.
+ */
+ if (maxlvt >= 3) {
+ v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
+ apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
+ }
+ /*
+ * Careful: we have to set masks only first to deassert
+ * any level-triggered sources.
+ */
+ v = apic_read(APIC_LVTT);
+ apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
+ v = apic_read(APIC_LVT0);
+ apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
+ v = apic_read(APIC_LVT1);
+ apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
+ if (maxlvt >= 4) {
+ v = apic_read(APIC_LVTPC);
+ apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
+ }
+
+ /* lets not touch this if we didn't frob it */
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ if (maxlvt >= 5) {
+ v = apic_read(APIC_LVTTHMR);
+ apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
+ }
+#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6) {
+ v = apic_read(APIC_LVTCMCI);
+ if (!(v & APIC_LVT_MASKED))
+ apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
+ }
+#endif
+
+ /*
+ * Clean APIC state for other OSs:
+ */
+ apic_write(APIC_LVTT, APIC_LVT_MASKED);
+ apic_write(APIC_LVT0, APIC_LVT_MASKED);
+ apic_write(APIC_LVT1, APIC_LVT_MASKED);
+ if (maxlvt >= 3)
+ apic_write(APIC_LVTERR, APIC_LVT_MASKED);
+ if (maxlvt >= 4)
+ apic_write(APIC_LVTPC, APIC_LVT_MASKED);
+
+ /* Integrated APIC (!82489DX) ? */
+ if (lapic_is_integrated()) {
+ if (maxlvt > 3)
+ /* Clear ESR due to Pentium errata 3AP and 11AP */
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ }
+}
+
+/**
+ * disable_local_APIC - clear and disable the local APIC
+ */
+void disable_local_APIC(void)
+{
+ unsigned int value;
+
+ /* APIC hasn't been mapped yet */
+ if (!x2apic_mode && !apic_phys)
+ return;
+
+ clear_local_APIC();
+
+ /*
+ * Disable APIC (implies clearing of registers
+ * for 82489DX!).
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_SPIV_APIC_ENABLED;
+ apic_write(APIC_SPIV, value);
+
+#ifdef CONFIG_X86_32
+ /*
+ * When LAPIC was disabled by the BIOS and enabled by the kernel,
+ * restore the disabled state.
+ */
+ if (enabled_via_apicbase) {
+ unsigned int l, h;
+
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ l &= ~MSR_IA32_APICBASE_ENABLE;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ }
+#endif
+}
+
+/*
+ * If Linux enabled the LAPIC against the BIOS default disable it down before
+ * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and
+ * not power-off. Additionally clear all LVT entries before disable_local_APIC
+ * for the case where Linux didn't enable the LAPIC.
+ */
+void lapic_shutdown(void)
+{
+ unsigned long flags;
+
+ if (!boot_cpu_has(X86_FEATURE_APIC) && !apic_from_smp_config())
+ return;
+
+ local_irq_save(flags);
+
+#ifdef CONFIG_X86_32
+ if (!enabled_via_apicbase)
+ clear_local_APIC();
+ else
+#endif
+ disable_local_APIC();
+
+
+ local_irq_restore(flags);
+}
+
+/**
+ * sync_Arb_IDs - synchronize APIC bus arbitration IDs
+ */
+void __init sync_Arb_IDs(void)
+{
+ /*
+ * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not
+ * needed on AMD.
+ */
+ if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return;
+
+ /*
+ * Wait for idle.
+ */
+ apic_wait_icr_idle();
+
+ apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
+ apic_write(APIC_ICR, APIC_DEST_ALLINC |
+ APIC_INT_LEVELTRIG | APIC_DM_INIT);
+}
+
+enum apic_intr_mode_id apic_intr_mode;
+
+static int __init apic_intr_mode_select(void)
+{
+ /* Check kernel option */
+ if (disable_apic) {
+ pr_info("APIC disabled via kernel command line\n");
+ return APIC_PIC;
+ }
+
+ /* Check BIOS */
+#ifdef CONFIG_X86_64
+ /* On 64-bit, the APIC must be integrated, Check local APIC only */
+ if (!boot_cpu_has(X86_FEATURE_APIC)) {
+ disable_apic = 1;
+ pr_info("APIC disabled by BIOS\n");
+ return APIC_PIC;
+ }
+#else
+ /* On 32-bit, the APIC may be integrated APIC or 82489DX */
+
+ /* Neither 82489DX nor integrated APIC ? */
+ if (!boot_cpu_has(X86_FEATURE_APIC) && !smp_found_config) {
+ disable_apic = 1;
+ return APIC_PIC;
+ }
+
+ /* If the BIOS pretends there is an integrated APIC ? */
+ if (!boot_cpu_has(X86_FEATURE_APIC) &&
+ APIC_INTEGRATED(boot_cpu_apic_version)) {
+ disable_apic = 1;
+ pr_err(FW_BUG "Local APIC %d not detected, force emulation\n",
+ boot_cpu_physical_apicid);
+ return APIC_PIC;
+ }
+#endif
+
+ /* Check MP table or ACPI MADT configuration */
+ if (!smp_found_config) {
+ disable_ioapic_support();
+ if (!acpi_lapic) {
+ pr_info("APIC: ACPI MADT or MP tables are not detected\n");
+ return APIC_VIRTUAL_WIRE_NO_CONFIG;
+ }
+ return APIC_VIRTUAL_WIRE;
+ }
+
+#ifdef CONFIG_SMP
+ /* If SMP should be disabled, then really disable it! */
+ if (!setup_max_cpus) {
+ pr_info("APIC: SMP mode deactivated\n");
+ return APIC_SYMMETRIC_IO_NO_ROUTING;
+ }
+
+ if (read_apic_id() != boot_cpu_physical_apicid) {
+ panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
+ read_apic_id(), boot_cpu_physical_apicid);
+ /* Or can we switch back to PIC here? */
+ }
+#endif
+
+ return APIC_SYMMETRIC_IO;
+}
+
+/*
+ * An initial setup of the virtual wire mode.
+ */
+void __init init_bsp_APIC(void)
+{
+ unsigned int value;
+
+ /*
+ * Don't do the setup now if we have a SMP BIOS as the
+ * through-I/O-APIC virtual wire mode might be active.
+ */
+ if (smp_found_config || !boot_cpu_has(X86_FEATURE_APIC))
+ return;
+
+ /*
+ * Do not trust the local APIC being empty at bootup.
+ */
+ clear_local_APIC();
+
+ /*
+ * Enable APIC.
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ value |= APIC_SPIV_APIC_ENABLED;
+
+#ifdef CONFIG_X86_32
+ /* This bit is reserved on P4/Xeon and should be cleared */
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+ (boot_cpu_data.x86 == 15))
+ value &= ~APIC_SPIV_FOCUS_DISABLED;
+ else
+#endif
+ value |= APIC_SPIV_FOCUS_DISABLED;
+ value |= SPURIOUS_APIC_VECTOR;
+ apic_write(APIC_SPIV, value);
+
+ /*
+ * Set up the virtual wire mode.
+ */
+ apic_write(APIC_LVT0, APIC_DM_EXTINT);
+ value = APIC_DM_NMI;
+ if (!lapic_is_integrated()) /* 82489DX */
+ value |= APIC_LVT_LEVEL_TRIGGER;
+ if (apic_extnmi == APIC_EXTNMI_NONE)
+ value |= APIC_LVT_MASKED;
+ apic_write(APIC_LVT1, value);
+}
+
+/* Init the interrupt delivery mode for the BSP */
+void __init apic_intr_mode_init(void)
+{
+ bool upmode = IS_ENABLED(CONFIG_UP_LATE_INIT);
+
+ apic_intr_mode = apic_intr_mode_select();
+
+ switch (apic_intr_mode) {
+ case APIC_PIC:
+ pr_info("APIC: Keep in PIC mode(8259)\n");
+ return;
+ case APIC_VIRTUAL_WIRE:
+ pr_info("APIC: Switch to virtual wire mode setup\n");
+ default_setup_apic_routing();
+ break;
+ case APIC_VIRTUAL_WIRE_NO_CONFIG:
+ pr_info("APIC: Switch to virtual wire mode setup with no configuration\n");
+ upmode = true;
+ default_setup_apic_routing();
+ break;
+ case APIC_SYMMETRIC_IO:
+ pr_info("APIC: Switch to symmetric I/O mode setup\n");
+ default_setup_apic_routing();
+ break;
+ case APIC_SYMMETRIC_IO_NO_ROUTING:
+ pr_info("APIC: Switch to symmetric I/O mode setup in no SMP routine\n");
+ break;
+ }
+
+ apic_bsp_setup(upmode);
+}
+
+static void lapic_setup_esr(void)
+{
+ unsigned int oldvalue, value, maxlvt;
+
+ if (!lapic_is_integrated()) {
+ pr_info("No ESR for 82489DX.\n");
+ return;
+ }
+
+ if (apic->disable_esr) {
+ /*
+ * Something untraceable is creating bad interrupts on
+ * secondary quads ... for the moment, just leave the
+ * ESR disabled - we can't do anything useful with the
+ * errors anyway - mbligh
+ */
+ pr_info("Leaving ESR disabled.\n");
+ return;
+ }
+
+ maxlvt = lapic_get_maxlvt();
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ oldvalue = apic_read(APIC_ESR);
+
+ /* enables sending errors */
+ value = ERROR_APIC_VECTOR;
+ apic_write(APIC_LVTERR, value);
+
+ /*
+ * spec says clear errors after enabling vector.
+ */
+ if (maxlvt > 3)
+ apic_write(APIC_ESR, 0);
+ value = apic_read(APIC_ESR);
+ if (value != oldvalue)
+ apic_printk(APIC_VERBOSE, "ESR value before enabling "
+ "vector: 0x%08x after: 0x%08x\n",
+ oldvalue, value);
+}
+
+#define APIC_IR_REGS APIC_ISR_NR
+#define APIC_IR_BITS (APIC_IR_REGS * 32)
+#define APIC_IR_MAPSIZE (APIC_IR_BITS / BITS_PER_LONG)
+
+union apic_ir {
+ unsigned long map[APIC_IR_MAPSIZE];
+ u32 regs[APIC_IR_REGS];
+};
+
+static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr)
+{
+ int i, bit;
+
+ /* Read the IRRs */
+ for (i = 0; i < APIC_IR_REGS; i++)
+ irr->regs[i] = apic_read(APIC_IRR + i * 0x10);
+
+ /* Read the ISRs */
+ for (i = 0; i < APIC_IR_REGS; i++)
+ isr->regs[i] = apic_read(APIC_ISR + i * 0x10);
+
+ /*
+ * If the ISR map is not empty. ACK the APIC and run another round
+ * to verify whether a pending IRR has been unblocked and turned
+ * into a ISR.
+ */
+ if (!bitmap_empty(isr->map, APIC_IR_BITS)) {
+ /*
+ * There can be multiple ISR bits set when a high priority
+ * interrupt preempted a lower priority one. Issue an ACK
+ * per set bit.
+ */
+ for_each_set_bit(bit, isr->map, APIC_IR_BITS)
+ ack_APIC_irq();
+ return true;
+ }
+
+ return !bitmap_empty(irr->map, APIC_IR_BITS);
+}
+
+/*
+ * After a crash, we no longer service the interrupts and a pending
+ * interrupt from previous kernel might still have ISR bit set.
+ *
+ * Most probably by now the CPU has serviced that pending interrupt and it
+ * might not have done the ack_APIC_irq() because it thought, interrupt
+ * came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear
+ * the ISR bit and cpu thinks it has already serivced the interrupt. Hence
+ * a vector might get locked. It was noticed for timer irq (vector
+ * 0x31). Issue an extra EOI to clear ISR.
+ *
+ * If there are pending IRR bits they turn into ISR bits after a higher
+ * priority ISR bit has been acked.
+ */
+static void apic_pending_intr_clear(void)
+{
+ union apic_ir irr, isr;
+ unsigned int i;
+
+ /* 512 loops are way oversized and give the APIC a chance to obey. */
+ for (i = 0; i < 512; i++) {
+ if (!apic_check_and_ack(&irr, &isr))
+ return;
+ }
+ /* Dump the IRR/ISR content if that failed */
+ pr_warn("APIC: Stale IRR: %256pb ISR: %256pb\n", irr.map, isr.map);
+}
+
+/**
+ * setup_local_APIC - setup the local APIC
+ *
+ * Used to setup local APIC while initializing BSP or bringing up APs.
+ * Always called with preemption disabled.
+ */
+static void setup_local_APIC(void)
+{
+ int cpu = smp_processor_id();
+ unsigned int value;
+
+
+ if (disable_apic) {
+ disable_ioapic_support();
+ return;
+ }
+
+ /*
+ * If this comes from kexec/kcrash the APIC might be enabled in
+ * SPIV. Soft disable it before doing further initialization.
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_SPIV_APIC_ENABLED;
+ apic_write(APIC_SPIV, value);
+
+#ifdef CONFIG_X86_32
+ /* Pound the ESR really hard over the head with a big hammer - mbligh */
+ if (lapic_is_integrated() && apic->disable_esr) {
+ apic_write(APIC_ESR, 0);
+ apic_write(APIC_ESR, 0);
+ apic_write(APIC_ESR, 0);
+ apic_write(APIC_ESR, 0);
+ }
+#endif
+ perf_events_lapic_init();
+
+ /*
+ * Double-check whether this APIC is really registered.
+ * This is meaningless in clustered apic mode, so we skip it.
+ */
+ BUG_ON(!apic->apic_id_registered());
+
+ /*
+ * Intel recommends to set DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+ apic->init_apic_ldr();
+
+#ifdef CONFIG_X86_32
+ if (apic->dest_logical) {
+ int logical_apicid, ldr_apicid;
+
+ /*
+ * APIC LDR is initialized. If logical_apicid mapping was
+ * initialized during get_smp_config(), make sure it matches
+ * the actual value.
+ */
+ logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
+ ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
+ if (logical_apicid != BAD_APICID)
+ WARN_ON(logical_apicid != ldr_apicid);
+ /* Always use the value from LDR. */
+ early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid;
+ }
+#endif
+
+ /*
+ * Set Task Priority to 'accept all'. We never change this
+ * later on.
+ */
+ value = apic_read(APIC_TASKPRI);
+ value &= ~APIC_TPRI_MASK;
+ apic_write(APIC_TASKPRI, value);
+
+ /* Clear eventually stale ISR/IRR bits */
+ apic_pending_intr_clear();
+
+ /*
+ * Now that we are all set up, enable the APIC
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ /*
+ * Enable APIC
+ */
+ value |= APIC_SPIV_APIC_ENABLED;
+
+#ifdef CONFIG_X86_32
+ /*
+ * Some unknown Intel IO/APIC (or APIC) errata is biting us with
+ * certain networking cards. If high frequency interrupts are
+ * happening on a particular IOAPIC pin, plus the IOAPIC routing
+ * entry is masked/unmasked at a high rate as well then sooner or
+ * later IOAPIC line gets 'stuck', no more interrupts are received
+ * from the device. If focus CPU is disabled then the hang goes
+ * away, oh well :-(
+ *
+ * [ This bug can be reproduced easily with a level-triggered
+ * PCI Ne2000 networking cards and PII/PIII processors, dual
+ * BX chipset. ]
+ */
+ /*
+ * Actually disabling the focus CPU check just makes the hang less
+ * frequent as it makes the interrupt distributon model be more
+ * like LRU than MRU (the short-term load is more even across CPUs).
+ */
+
+ /*
+ * - enable focus processor (bit==0)
+ * - 64bit mode always use processor focus
+ * so no need to set it
+ */
+ value &= ~APIC_SPIV_FOCUS_DISABLED;
+#endif
+
+ /*
+ * Set spurious IRQ vector
+ */
+ value |= SPURIOUS_APIC_VECTOR;
+ apic_write(APIC_SPIV, value);
+
+ /*
+ * Set up LVT0, LVT1:
+ *
+ * set up through-local-APIC on the boot CPU's LINT0. This is not
+ * strictly necessary in pure symmetric-IO mode, but sometimes
+ * we delegate interrupts to the 8259A.
+ */
+ /*
+ * TODO: set up through-local-APIC from through-I/O-APIC? --macro
+ */
+ value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
+ if (!cpu && (pic_mode || !value || skip_ioapic_setup)) {
+ value = APIC_DM_EXTINT;
+ apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu);
+ } else {
+ value = APIC_DM_EXTINT | APIC_LVT_MASKED;
+ apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", cpu);
+ }
+ apic_write(APIC_LVT0, value);
+
+ /*
+ * Only the BSP sees the LINT1 NMI signal by default. This can be
+ * modified by apic_extnmi= boot option.
+ */
+ if ((!cpu && apic_extnmi != APIC_EXTNMI_NONE) ||
+ apic_extnmi == APIC_EXTNMI_ALL)
+ value = APIC_DM_NMI;
+ else
+ value = APIC_DM_NMI | APIC_LVT_MASKED;
+
+ /* Is 82489DX ? */
+ if (!lapic_is_integrated())
+ value |= APIC_LVT_LEVEL_TRIGGER;
+ apic_write(APIC_LVT1, value);
+
+#ifdef CONFIG_X86_MCE_INTEL
+ /* Recheck CMCI information after local APIC is up on CPU #0 */
+ if (!cpu)
+ cmci_recheck();
+#endif
+}
+
+static void end_local_APIC_setup(void)
+{
+ lapic_setup_esr();
+
+#ifdef CONFIG_X86_32
+ {
+ unsigned int value;
+ /* Disable the local apic timer */
+ value = apic_read(APIC_LVTT);
+ value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+ apic_write(APIC_LVTT, value);
+ }
+#endif
+
+ apic_pm_activate();
+}
+
+/*
+ * APIC setup function for application processors. Called from smpboot.c
+ */
+void apic_ap_setup(void)
+{
+ setup_local_APIC();
+ end_local_APIC_setup();
+}
+
+#ifdef CONFIG_X86_X2APIC
+int x2apic_mode;
+
+enum {
+ X2APIC_OFF,
+ X2APIC_ON,
+ X2APIC_DISABLED,
+};
+static int x2apic_state;
+
+static void __x2apic_disable(void)
+{
+ u64 msr;
+
+ if (!boot_cpu_has(X86_FEATURE_APIC))
+ return;
+
+ rdmsrl(MSR_IA32_APICBASE, msr);
+ if (!(msr & X2APIC_ENABLE))
+ return;
+ /* Disable xapic and x2apic first and then reenable xapic mode */
+ wrmsrl(MSR_IA32_APICBASE, msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
+ wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
+ printk_once(KERN_INFO "x2apic disabled\n");
+}
+
+static void __x2apic_enable(void)
+{
+ u64 msr;
+
+ rdmsrl(MSR_IA32_APICBASE, msr);
+ if (msr & X2APIC_ENABLE)
+ return;
+ wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
+ printk_once(KERN_INFO "x2apic enabled\n");
+}
+
+static int __init setup_nox2apic(char *str)
+{
+ if (x2apic_enabled()) {
+ int apicid = native_apic_msr_read(APIC_ID);
+
+ if (apicid >= 255) {
+ pr_warning("Apicid: %08x, cannot enforce nox2apic\n",
+ apicid);
+ return 0;
+ }
+ pr_warning("x2apic already enabled.\n");
+ __x2apic_disable();
+ }
+ setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+ x2apic_state = X2APIC_DISABLED;
+ x2apic_mode = 0;
+ return 0;
+}
+early_param("nox2apic", setup_nox2apic);
+
+/* Called from cpu_init() to enable x2apic on (secondary) cpus */
+void x2apic_setup(void)
+{
+ /*
+ * If x2apic is not in ON state, disable it if already enabled
+ * from BIOS.
+ */
+ if (x2apic_state != X2APIC_ON) {
+ __x2apic_disable();
+ return;
+ }
+ __x2apic_enable();
+}
+
+static __init void x2apic_disable(void)
+{
+ u32 x2apic_id, state = x2apic_state;
+
+ x2apic_mode = 0;
+ x2apic_state = X2APIC_DISABLED;
+
+ if (state != X2APIC_ON)
+ return;
+
+ x2apic_id = read_apic_id();
+ if (x2apic_id >= 255)
+ panic("Cannot disable x2apic, id: %08x\n", x2apic_id);
+
+ __x2apic_disable();
+ register_lapic_address(mp_lapic_addr);
+}
+
+static __init void x2apic_enable(void)
+{
+ if (x2apic_state != X2APIC_OFF)
+ return;
+
+ x2apic_mode = 1;
+ x2apic_state = X2APIC_ON;
+ __x2apic_enable();
+}
+
+static __init void try_to_enable_x2apic(int remap_mode)
+{
+ if (x2apic_state == X2APIC_DISABLED)
+ return;
+
+ if (remap_mode != IRQ_REMAP_X2APIC_MODE) {
+ /*
+ * Using X2APIC without IR is not architecturally supported
+ * on bare metal but may be supported in guests.
+ */
+ if (!x86_init.hyper.x2apic_available()) {
+ pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n");
+ x2apic_disable();
+ return;
+ }
+
+ /*
+ * Without IR, all CPUs can be addressed by IOAPIC/MSI only
+ * in physical mode, and CPUs with an APIC ID that cannnot
+ * be addressed must not be brought online.
+ */
+ x2apic_set_max_apicid(255);
+ x2apic_phys = 1;
+ }
+ x2apic_enable();
+}
+
+void __init check_x2apic(void)
+{
+ if (x2apic_enabled()) {
+ pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n");
+ x2apic_mode = 1;
+ x2apic_state = X2APIC_ON;
+ } else if (!boot_cpu_has(X86_FEATURE_X2APIC)) {
+ x2apic_state = X2APIC_DISABLED;
+ }
+}
+#else /* CONFIG_X86_X2APIC */
+static int __init validate_x2apic(void)
+{
+ if (!apic_is_x2apic_enabled())
+ return 0;
+ /*
+ * Checkme: Can we simply turn off x2apic here instead of panic?
+ */
+ panic("BIOS has enabled x2apic but kernel doesn't support x2apic, please disable x2apic in BIOS.\n");
+}
+early_initcall(validate_x2apic);
+
+static inline void try_to_enable_x2apic(int remap_mode) { }
+static inline void __x2apic_enable(void) { }
+#endif /* !CONFIG_X86_X2APIC */
+
+void __init enable_IR_x2apic(void)
+{
+ unsigned long flags;
+ int ret, ir_stat;
+
+ if (skip_ioapic_setup) {
+ pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n");
+ return;
+ }
+
+ ir_stat = irq_remapping_prepare();
+ if (ir_stat < 0 && !x2apic_supported())
+ return;
+
+ ret = save_ioapic_entries();
+ if (ret) {
+ pr_info("Saving IO-APIC state failed: %d\n", ret);
+ return;
+ }
+
+ local_irq_save(flags);
+ legacy_pic->mask_all();
+ mask_ioapic_entries();
+
+ /* If irq_remapping_prepare() succeeded, try to enable it */
+ if (ir_stat >= 0)
+ ir_stat = irq_remapping_enable();
+ /* ir_stat contains the remap mode or an error code */
+ try_to_enable_x2apic(ir_stat);
+
+ if (ir_stat < 0)
+ restore_ioapic_entries();
+ legacy_pic->restore_mask();
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * Detect and enable local APICs on non-SMP boards.
+ * Original code written by Keir Fraser.
+ * On AMD64 we trust the BIOS - if it says no APIC it is likely
+ * not correctly set up (usually the APIC timer won't work etc.)
+ */
+static int __init detect_init_APIC(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_APIC)) {
+ pr_info("No local APIC present\n");
+ return -1;
+ }
+
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+ return 0;
+}
+#else
+
+static int __init apic_verify(void)
+{
+ u32 features, h, l;
+
+ /*
+ * The APIC feature bit should now be enabled
+ * in `cpuid'
+ */
+ features = cpuid_edx(1);
+ if (!(features & (1 << X86_FEATURE_APIC))) {
+ pr_warning("Could not enable APIC!\n");
+ return -1;
+ }
+ set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+ /* The BIOS may have set up the APIC at some other address */
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (l & MSR_IA32_APICBASE_ENABLE)
+ mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
+ }
+
+ pr_info("Found and enabled local APIC!\n");
+ return 0;
+}
+
+int __init apic_force_enable(unsigned long addr)
+{
+ u32 h, l;
+
+ if (disable_apic)
+ return -1;
+
+ /*
+ * Some BIOSes disable the local APIC in the APIC_BASE
+ * MSR. This can only be done in software for Intel P6 or later
+ * and AMD K7 (Model > 1) or later.
+ */
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (!(l & MSR_IA32_APICBASE_ENABLE)) {
+ pr_info("Local APIC disabled by BIOS -- reenabling.\n");
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | addr;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ enabled_via_apicbase = 1;
+ }
+ }
+ return apic_verify();
+}
+
+/*
+ * Detect and initialize APIC
+ */
+static int __init detect_init_APIC(void)
+{
+ /* Disabled by kernel option? */
+ if (disable_apic)
+ return -1;
+
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) ||
+ (boot_cpu_data.x86 >= 15))
+ break;
+ goto no_apic;
+ case X86_VENDOR_INTEL:
+ if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 ||
+ (boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC)))
+ break;
+ goto no_apic;
+ default:
+ goto no_apic;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_APIC)) {
+ /*
+ * Over-ride BIOS and try to enable the local APIC only if
+ * "lapic" specified.
+ */
+ if (!force_enable_local_apic) {
+ pr_info("Local APIC disabled by BIOS -- "
+ "you can enable it with \"lapic\"\n");
+ return -1;
+ }
+ if (apic_force_enable(APIC_DEFAULT_PHYS_BASE))
+ return -1;
+ } else {
+ if (apic_verify())
+ return -1;
+ }
+
+ apic_pm_activate();
+
+ return 0;
+
+no_apic:
+ pr_info("No local APIC present or hardware disabled\n");
+ return -1;
+}
+#endif
+
+/**
+ * init_apic_mappings - initialize APIC mappings
+ */
+void __init init_apic_mappings(void)
+{
+ unsigned int new_apicid;
+
+ if (apic_validate_deadline_timer())
+ pr_info("TSC deadline timer available\n");
+
+ if (x2apic_mode) {
+ boot_cpu_physical_apicid = read_apic_id();
+ return;
+ }
+
+ /* If no local APIC can be found return early */
+ if (!smp_found_config && detect_init_APIC()) {
+ /* lets NOP'ify apic operations */
+ pr_info("APIC: disable apic facility\n");
+ apic_disable();
+ } else {
+ apic_phys = mp_lapic_addr;
+
+ /*
+ * If the system has ACPI MADT tables or MP info, the LAPIC
+ * address is already registered.
+ */
+ if (!acpi_lapic && !smp_found_config)
+ register_lapic_address(apic_phys);
+ }
+
+ /*
+ * Fetch the APIC ID of the BSP in case we have a
+ * default configuration (or the MP table is broken).
+ */
+ new_apicid = read_apic_id();
+ if (boot_cpu_physical_apicid != new_apicid) {
+ boot_cpu_physical_apicid = new_apicid;
+ /*
+ * yeah -- we lie about apic_version
+ * in case if apic was disabled via boot option
+ * but it's not a problem for SMP compiled kernel
+ * since apic_intr_mode_select is prepared for such
+ * a case and disable smp mode
+ */
+ boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
+ }
+}
+
+void __init register_lapic_address(unsigned long address)
+{
+ mp_lapic_addr = address;
+
+ if (!x2apic_mode) {
+ set_fixmap_nocache(FIX_APIC_BASE, address);
+ apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
+ APIC_BASE, address);
+ }
+ if (boot_cpu_physical_apicid == -1U) {
+ boot_cpu_physical_apicid = read_apic_id();
+ boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
+ }
+}
+
+/*
+ * Local APIC interrupts
+ */
+
+/*
+ * This interrupt should _never_ happen with our APIC/SMP architecture
+ */
+__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
+{
+ u8 vector = ~regs->orig_ax;
+ u32 v;
+
+ entering_irq();
+ trace_spurious_apic_entry(vector);
+
+ inc_irq_stat(irq_spurious_count);
+
+ /*
+ * If this is a spurious interrupt then do not acknowledge
+ */
+ if (vector == SPURIOUS_APIC_VECTOR) {
+ /* See SDM vol 3 */
+ pr_info("Spurious APIC interrupt (vector 0xFF) on CPU#%d, should never happen.\n",
+ smp_processor_id());
+ goto out;
+ }
+
+ /*
+ * If it is a vectored one, verify it's set in the ISR. If set,
+ * acknowledge it.
+ */
+ v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1));
+ if (v & (1 << (vector & 0x1f))) {
+ pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Acked\n",
+ vector, smp_processor_id());
+ ack_APIC_irq();
+ } else {
+ pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Not pending!\n",
+ vector, smp_processor_id());
+ }
+out:
+ trace_spurious_apic_exit(vector);
+ exiting_irq();
+}
+
+/*
+ * This interrupt should never happen with our APIC/SMP architecture
+ */
+__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
+{
+ static const char * const error_interrupt_reason[] = {
+ "Send CS error", /* APIC Error Bit 0 */
+ "Receive CS error", /* APIC Error Bit 1 */
+ "Send accept error", /* APIC Error Bit 2 */
+ "Receive accept error", /* APIC Error Bit 3 */
+ "Redirectable IPI", /* APIC Error Bit 4 */
+ "Send illegal vector", /* APIC Error Bit 5 */
+ "Received illegal vector", /* APIC Error Bit 6 */
+ "Illegal register address", /* APIC Error Bit 7 */
+ };
+ u32 v, i = 0;
+
+ entering_irq();
+ trace_error_apic_entry(ERROR_APIC_VECTOR);
+
+ /* First tickle the hardware, only then report what went on. -- REW */
+ if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ v = apic_read(APIC_ESR);
+ ack_APIC_irq();
+ atomic_inc(&irq_err_count);
+
+ apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x",
+ smp_processor_id(), v);
+
+ v &= 0xff;
+ while (v) {
+ if (v & 0x1)
+ apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
+ i++;
+ v >>= 1;
+ }
+
+ apic_printk(APIC_DEBUG, KERN_CONT "\n");
+
+ trace_error_apic_exit(ERROR_APIC_VECTOR);
+ exiting_irq();
+}
+
+/**
+ * connect_bsp_APIC - attach the APIC to the interrupt system
+ */
+static void __init connect_bsp_APIC(void)
+{
+#ifdef CONFIG_X86_32
+ if (pic_mode) {
+ /*
+ * Do not trust the local APIC being empty at bootup.
+ */
+ clear_local_APIC();
+ /*
+ * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
+ * local APIC to INT and NMI lines.
+ */
+ apic_printk(APIC_VERBOSE, "leaving PIC mode, "
+ "enabling APIC mode.\n");
+ imcr_pic_to_apic();
+ }
+#endif
+}
+
+/**
+ * disconnect_bsp_APIC - detach the APIC from the interrupt system
+ * @virt_wire_setup: indicates, whether virtual wire mode is selected
+ *
+ * Virtual wire mode is necessary to deliver legacy interrupts even when the
+ * APIC is disabled.
+ */
+void disconnect_bsp_APIC(int virt_wire_setup)
+{
+ unsigned int value;
+
+#ifdef CONFIG_X86_32
+ if (pic_mode) {
+ /*
+ * Put the board back into PIC mode (has an effect only on
+ * certain older boards). Note that APIC interrupts, including
+ * IPIs, won't work beyond this point! The only exception are
+ * INIT IPIs.
+ */
+ apic_printk(APIC_VERBOSE, "disabling APIC mode, "
+ "entering PIC mode.\n");
+ imcr_apic_to_pic();
+ return;
+ }
+#endif
+
+ /* Go back to Virtual Wire compatibility mode */
+
+ /* For the spurious interrupt use vector F, and enable it */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ value |= APIC_SPIV_APIC_ENABLED;
+ value |= 0xf;
+ apic_write(APIC_SPIV, value);
+
+ if (!virt_wire_setup) {
+ /*
+ * For LVT0 make it edge triggered, active high,
+ * external and enabled
+ */
+ value = apic_read(APIC_LVT0);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+ apic_write(APIC_LVT0, value);
+ } else {
+ /* Disable LVT0 */
+ apic_write(APIC_LVT0, APIC_LVT_MASKED);
+ }
+
+ /*
+ * For LVT1 make it edge triggered, active high,
+ * nmi and enabled
+ */
+ value = apic_read(APIC_LVT1);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+ apic_write(APIC_LVT1, value);
+}
+
+/*
+ * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated
+ * contiguously, it equals to current allocated max logical CPU ID plus 1.
+ * All allocated CPU IDs should be in the [0, nr_logical_cpuids) range,
+ * so the maximum of nr_logical_cpuids is nr_cpu_ids.
+ *
+ * NOTE: Reserve 0 for BSP.
+ */
+static int nr_logical_cpuids = 1;
+
+/*
+ * Used to store mapping between logical CPU IDs and APIC IDs.
+ */
+static int cpuid_to_apicid[] = {
+ [0 ... NR_CPUS - 1] = -1,
+};
+
+bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
+{
+ return phys_id == cpuid_to_apicid[cpu];
+}
+
+#ifdef CONFIG_SMP
+/**
+ * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread
+ * @id: APIC ID to check
+ */
+bool apic_id_is_primary_thread(unsigned int apicid)
+{
+ u32 mask;
+
+ if (smp_num_siblings == 1)
+ return true;
+ /* Isolate the SMT bit(s) in the APICID and check for 0 */
+ mask = (1U << (fls(smp_num_siblings) - 1)) - 1;
+ return !(apicid & mask);
+}
+#endif
+
+/*
+ * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids
+ * and cpuid_to_apicid[] synchronized.
+ */
+static int allocate_logical_cpuid(int apicid)
+{
+ int i;
+
+ /*
+ * cpuid <-> apicid mapping is persistent, so when a cpu is up,
+ * check if the kernel has allocated a cpuid for it.
+ */
+ for (i = 0; i < nr_logical_cpuids; i++) {
+ if (cpuid_to_apicid[i] == apicid)
+ return i;
+ }
+
+ /* Allocate a new cpuid. */
+ if (nr_logical_cpuids >= nr_cpu_ids) {
+ WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %u reached. "
+ "Processor %d/0x%x and the rest are ignored.\n",
+ nr_cpu_ids, nr_logical_cpuids, apicid);
+ return -EINVAL;
+ }
+
+ cpuid_to_apicid[nr_logical_cpuids] = apicid;
+ return nr_logical_cpuids++;
+}
+
+int generic_processor_info(int apicid, int version)
+{
+ int cpu, max = nr_cpu_ids;
+ bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
+ phys_cpu_present_map);
+
+ /*
+ * boot_cpu_physical_apicid is designed to have the apicid
+ * returned by read_apic_id(), i.e, the apicid of the
+ * currently booting-up processor. However, on some platforms,
+ * it is temporarily modified by the apicid reported as BSP
+ * through MP table. Concretely:
+ *
+ * - arch/x86/kernel/mpparse.c: MP_processor_info()
+ * - arch/x86/mm/amdtopology.c: amd_numa_init()
+ *
+ * This function is executed with the modified
+ * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel
+ * parameter doesn't work to disable APs on kdump 2nd kernel.
+ *
+ * Since fixing handling of boot_cpu_physical_apicid requires
+ * another discussion and tests on each platform, we leave it
+ * for now and here we use read_apic_id() directly in this
+ * function, generic_processor_info().
+ */
+ if (disabled_cpu_apicid != BAD_APICID &&
+ disabled_cpu_apicid != read_apic_id() &&
+ disabled_cpu_apicid == apicid) {
+ int thiscpu = num_processors + disabled_cpus;
+
+ pr_warning("APIC: Disabling requested cpu."
+ " Processor %d/0x%x ignored.\n",
+ thiscpu, apicid);
+
+ disabled_cpus++;
+ return -ENODEV;
+ }
+
+ /*
+ * If boot cpu has not been detected yet, then only allow upto
+ * nr_cpu_ids - 1 processors and keep one slot free for boot cpu
+ */
+ if (!boot_cpu_detected && num_processors >= nr_cpu_ids - 1 &&
+ apicid != boot_cpu_physical_apicid) {
+ int thiscpu = max + disabled_cpus - 1;
+
+ pr_warning(
+ "APIC: NR_CPUS/possible_cpus limit of %i almost"
+ " reached. Keeping one slot for boot cpu."
+ " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
+
+ disabled_cpus++;
+ return -ENODEV;
+ }
+
+ if (num_processors >= nr_cpu_ids) {
+ int thiscpu = max + disabled_cpus;
+
+ pr_warning("APIC: NR_CPUS/possible_cpus limit of %i "
+ "reached. Processor %d/0x%x ignored.\n",
+ max, thiscpu, apicid);
+
+ disabled_cpus++;
+ return -EINVAL;
+ }
+
+ if (apicid == boot_cpu_physical_apicid) {
+ /*
+ * x86_bios_cpu_apicid is required to have processors listed
+ * in same order as logical cpu numbers. Hence the first
+ * entry is BSP, and so on.
+ * boot_cpu_init() already hold bit 0 in cpu_present_mask
+ * for BSP.
+ */
+ cpu = 0;
+
+ /* Logical cpuid 0 is reserved for BSP. */
+ cpuid_to_apicid[0] = apicid;
+ } else {
+ cpu = allocate_logical_cpuid(apicid);
+ if (cpu < 0) {
+ disabled_cpus++;
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Validate version
+ */
+ if (version == 0x0) {
+ pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n",
+ cpu, apicid);
+ version = 0x10;
+ }
+
+ if (version != boot_cpu_apic_version) {
+ pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
+ boot_cpu_apic_version, cpu, version);
+ }
+
+ if (apicid > max_physical_apicid)
+ max_physical_apicid = apicid;
+
+#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
+ early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
+ early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
+#endif
+#ifdef CONFIG_X86_32
+ early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
+ apic->x86_32_early_logical_apicid(cpu);
+#endif
+ set_cpu_possible(cpu, true);
+ physid_set(apicid, phys_cpu_present_map);
+ set_cpu_present(cpu, true);
+ num_processors++;
+
+ return cpu;
+}
+
+int hard_smp_processor_id(void)
+{
+ return read_apic_id();
+}
+
+/*
+ * Override the generic EOI implementation with an optimized version.
+ * Only called during early boot when only one CPU is active and with
+ * interrupts disabled, so we know this does not race with actual APIC driver
+ * use.
+ */
+void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
+{
+ struct apic **drv;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ /* Should happen once for each apic */
+ WARN_ON((*drv)->eoi_write == eoi_write);
+ (*drv)->native_eoi_write = (*drv)->eoi_write;
+ (*drv)->eoi_write = eoi_write;
+ }
+}
+
+static void __init apic_bsp_up_setup(void)
+{
+#ifdef CONFIG_X86_64
+ apic_write(APIC_ID, apic->set_apic_id(boot_cpu_physical_apicid));
+#else
+ /*
+ * Hack: In case of kdump, after a crash, kernel might be booting
+ * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
+ * might be zero if read from MP tables. Get it from LAPIC.
+ */
+# ifdef CONFIG_CRASH_DUMP
+ boot_cpu_physical_apicid = read_apic_id();
+# endif
+#endif
+ physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
+}
+
+/**
+ * apic_bsp_setup - Setup function for local apic and io-apic
+ * @upmode: Force UP mode (for APIC_init_uniprocessor)
+ *
+ * Returns:
+ * apic_id of BSP APIC
+ */
+void __init apic_bsp_setup(bool upmode)
+{
+ connect_bsp_APIC();
+ if (upmode)
+ apic_bsp_up_setup();
+ setup_local_APIC();
+
+ enable_IO_APIC();
+ end_local_APIC_setup();
+ irq_remap_enable_fault_handling();
+ setup_IO_APIC();
+ lapic_update_legacy_vectors();
+}
+
+#ifdef CONFIG_UP_LATE_INIT
+void __init up_late_init(void)
+{
+ if (apic_intr_mode == APIC_PIC)
+ return;
+
+ /* Setup local timer */
+ x86_init.timers.setup_percpu_clockev();
+}
+#endif
+
+/*
+ * Power management
+ */
+#ifdef CONFIG_PM
+
+static struct {
+ /*
+ * 'active' is true if the local APIC was enabled by us and
+ * not the BIOS; this signifies that we are also responsible
+ * for disabling it before entering apm/acpi suspend
+ */
+ int active;
+ /* r/w apic fields */
+ unsigned int apic_id;
+ unsigned int apic_taskpri;
+ unsigned int apic_ldr;
+ unsigned int apic_dfr;
+ unsigned int apic_spiv;
+ unsigned int apic_lvtt;
+ unsigned int apic_lvtpc;
+ unsigned int apic_lvt0;
+ unsigned int apic_lvt1;
+ unsigned int apic_lvterr;
+ unsigned int apic_tmict;
+ unsigned int apic_tdcr;
+ unsigned int apic_thmr;
+ unsigned int apic_cmci;
+} apic_pm_state;
+
+static int lapic_suspend(void)
+{
+ unsigned long flags;
+ int maxlvt;
+
+ if (!apic_pm_state.active)
+ return 0;
+
+ maxlvt = lapic_get_maxlvt();
+
+ apic_pm_state.apic_id = apic_read(APIC_ID);
+ apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
+ apic_pm_state.apic_ldr = apic_read(APIC_LDR);
+ apic_pm_state.apic_dfr = apic_read(APIC_DFR);
+ apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
+ apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
+ if (maxlvt >= 4)
+ apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
+ apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
+ apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
+ apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
+ apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
+ apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ if (maxlvt >= 5)
+ apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6)
+ apic_pm_state.apic_cmci = apic_read(APIC_LVTCMCI);
+#endif
+
+ local_irq_save(flags);
+ disable_local_APIC();
+
+ irq_remapping_disable();
+
+ local_irq_restore(flags);
+ return 0;
+}
+
+static void lapic_resume(void)
+{
+ unsigned int l, h;
+ unsigned long flags;
+ int maxlvt;
+
+ if (!apic_pm_state.active)
+ return;
+
+ local_irq_save(flags);
+
+ /*
+ * IO-APIC and PIC have their own resume routines.
+ * We just mask them here to make sure the interrupt
+ * subsystem is completely quiet while we enable x2apic
+ * and interrupt-remapping.
+ */
+ mask_ioapic_entries();
+ legacy_pic->mask_all();
+
+ if (x2apic_mode) {
+ __x2apic_enable();
+ } else {
+ /*
+ * Make sure the APICBASE points to the right address
+ *
+ * FIXME! This will be wrong if we ever support suspend on
+ * SMP! We'll need to do this as part of the CPU restore!
+ */
+ if (boot_cpu_data.x86 >= 6) {
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ }
+ }
+
+ maxlvt = lapic_get_maxlvt();
+ apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
+ apic_write(APIC_ID, apic_pm_state.apic_id);
+ apic_write(APIC_DFR, apic_pm_state.apic_dfr);
+ apic_write(APIC_LDR, apic_pm_state.apic_ldr);
+ apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
+ apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
+ apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
+ apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ if (maxlvt >= 5)
+ apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
+#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6)
+ apic_write(APIC_LVTCMCI, apic_pm_state.apic_cmci);
+#endif
+ if (maxlvt >= 4)
+ apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
+ apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
+ apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
+ apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+
+ irq_remapping_reenable(x2apic_mode);
+
+ local_irq_restore(flags);
+}
+
+/*
+ * This device has no shutdown method - fully functioning local APICs
+ * are needed on every CPU up until machine_halt/restart/poweroff.
+ */
+
+static struct syscore_ops lapic_syscore_ops = {
+ .resume = lapic_resume,
+ .suspend = lapic_suspend,
+};
+
+static void apic_pm_activate(void)
+{
+ apic_pm_state.active = 1;
+}
+
+static int __init init_lapic_sysfs(void)
+{
+ /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
+ if (boot_cpu_has(X86_FEATURE_APIC))
+ register_syscore_ops(&lapic_syscore_ops);
+
+ return 0;
+}
+
+/* local apic needs to resume before other devices access its registers. */
+core_initcall(init_lapic_sysfs);
+
+#else /* CONFIG_PM */
+
+static void apic_pm_activate(void) { }
+
+#endif /* CONFIG_PM */
+
+#ifdef CONFIG_X86_64
+
+static int multi_checked;
+static int multi;
+
+static int set_multi(const struct dmi_system_id *d)
+{
+ if (multi)
+ return 0;
+ pr_info("APIC: %s detected, Multi Chassis\n", d->ident);
+ multi = 1;
+ return 0;
+}
+
+static const struct dmi_system_id multi_dmi_table[] = {
+ {
+ .callback = set_multi,
+ .ident = "IBM System Summit2",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"),
+ },
+ },
+ {}
+};
+
+static void dmi_check_multi(void)
+{
+ if (multi_checked)
+ return;
+
+ dmi_check_system(multi_dmi_table);
+ multi_checked = 1;
+}
+
+/*
+ * apic_is_clustered_box() -- Check if we can expect good TSC
+ *
+ * Thus far, the major user of this is IBM's Summit2 series:
+ * Clustered boxes may have unsynced TSC problems if they are
+ * multi-chassis.
+ * Use DMI to check them
+ */
+int apic_is_clustered_box(void)
+{
+ dmi_check_multi();
+ return multi;
+}
+#endif
+
+/*
+ * APIC command line parameters
+ */
+static int __init setup_disableapic(char *arg)
+{
+ disable_apic = 1;
+ setup_clear_cpu_cap(X86_FEATURE_APIC);
+ return 0;
+}
+early_param("disableapic", setup_disableapic);
+
+/* same as disableapic, for compatibility */
+static int __init setup_nolapic(char *arg)
+{
+ return setup_disableapic(arg);
+}
+early_param("nolapic", setup_nolapic);
+
+static int __init parse_lapic_timer_c2_ok(char *arg)
+{
+ local_apic_timer_c2_ok = 1;
+ return 0;
+}
+early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
+
+static int __init parse_disable_apic_timer(char *arg)
+{
+ disable_apic_timer = 1;
+ return 0;
+}
+early_param("noapictimer", parse_disable_apic_timer);
+
+static int __init parse_nolapic_timer(char *arg)
+{
+ disable_apic_timer = 1;
+ return 0;
+}
+early_param("nolapic_timer", parse_nolapic_timer);
+
+static int __init apic_set_verbosity(char *arg)
+{
+ if (!arg) {
+#ifdef CONFIG_X86_64
+ skip_ioapic_setup = 0;
+ return 0;
+#endif
+ return -EINVAL;
+ }
+
+ if (strcmp("debug", arg) == 0)
+ apic_verbosity = APIC_DEBUG;
+ else if (strcmp("verbose", arg) == 0)
+ apic_verbosity = APIC_VERBOSE;
+#ifdef CONFIG_X86_64
+ else {
+ pr_warning("APIC Verbosity level %s not recognised"
+ " use apic=verbose or apic=debug\n", arg);
+ return -EINVAL;
+ }
+#endif
+
+ return 0;
+}
+early_param("apic", apic_set_verbosity);
+
+static int __init lapic_insert_resource(void)
+{
+ if (!apic_phys)
+ return -1;
+
+ /* Put local APIC into the resource map. */
+ lapic_resource.start = apic_phys;
+ lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
+ insert_resource(&iomem_resource, &lapic_resource);
+
+ return 0;
+}
+
+/*
+ * need call insert after e820__reserve_resources()
+ * that is using request_resource
+ */
+late_initcall(lapic_insert_resource);
+
+static int __init apic_set_disabled_cpu_apicid(char *arg)
+{
+ if (!arg || !get_option(&arg, &disabled_cpu_apicid))
+ return -EINVAL;
+
+ return 0;
+}
+early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid);
+
+static int __init apic_set_extnmi(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strncmp("all", arg, 3))
+ apic_extnmi = APIC_EXTNMI_ALL;
+ else if (!strncmp("none", arg, 4))
+ apic_extnmi = APIC_EXTNMI_NONE;
+ else if (!strncmp("bsp", arg, 3))
+ apic_extnmi = APIC_EXTNMI_BSP;
+ else {
+ pr_warn("Unknown external NMI delivery mode `%s' ignored\n", arg);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+early_param("apic_extnmi", apic_set_extnmi);
diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c
new file mode 100644
index 000000000..02b483947
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -0,0 +1,46 @@
+/*
+ * Common functions shared between the various APIC flavours
+ *
+ * SPDX-License-Identifier: GPL-2.0
+ */
+#include <linux/irq.h>
+#include <asm/apic.h>
+
+u32 apic_default_calc_apicid(unsigned int cpu)
+{
+ return per_cpu(x86_cpu_to_apicid, cpu);
+}
+
+u32 apic_flat_calc_apicid(unsigned int cpu)
+{
+ return 1U << cpu;
+}
+
+bool default_check_apicid_used(physid_mask_t *map, int apicid)
+{
+ return physid_isset(apicid, *map);
+}
+
+void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
+{
+ *retmap = *phys_map;
+}
+
+int default_cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
+ return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
+ else
+ return BAD_APICID;
+}
+EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid);
+
+int default_check_phys_apicid_present(int phys_apicid)
+{
+ return physid_isset(phys_apicid, phys_cpu_present_map);
+}
+
+int default_apic_id_valid(u32 apicid)
+{
+ return (apicid < 255);
+}
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
new file mode 100644
index 000000000..e84c9eb4e
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -0,0 +1,296 @@
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Flat APIC subarch code.
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ */
+#include <linux/errno.h>
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/hardirq.h>
+#include <linux/export.h>
+#include <asm/smp.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+#include <asm/jailhouse_para.h>
+
+#include <linux/acpi.h>
+
+static struct apic apic_physflat;
+static struct apic apic_flat;
+
+struct apic *apic __ro_after_init = &apic_flat;
+EXPORT_SYMBOL_GPL(apic);
+
+static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return 1;
+}
+
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends to set DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+void flat_init_apic_ldr(void)
+{
+ unsigned long val;
+ unsigned long num, id;
+
+ num = smp_processor_id();
+ id = 1UL << num;
+ apic_write(APIC_DFR, APIC_DFR_FLAT);
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ val |= SET_APIC_LOGICAL_ID(id);
+ apic_write(APIC_LDR, val);
+}
+
+static void _flat_send_IPI_mask(unsigned long mask, int vector)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
+ local_irq_restore(flags);
+}
+
+static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
+{
+ unsigned long mask = cpumask_bits(cpumask)[0];
+
+ _flat_send_IPI_mask(mask, vector);
+}
+
+static void
+flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
+{
+ unsigned long mask = cpumask_bits(cpumask)[0];
+ int cpu = smp_processor_id();
+
+ if (cpu < BITS_PER_LONG)
+ clear_bit(cpu, &mask);
+
+ _flat_send_IPI_mask(mask, vector);
+}
+
+static void flat_send_IPI_allbutself(int vector)
+{
+ int cpu = smp_processor_id();
+
+ if (IS_ENABLED(CONFIG_HOTPLUG_CPU) || vector == NMI_VECTOR) {
+ if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) {
+ unsigned long mask = cpumask_bits(cpu_online_mask)[0];
+
+ if (cpu < BITS_PER_LONG)
+ clear_bit(cpu, &mask);
+
+ _flat_send_IPI_mask(mask, vector);
+ }
+ } else if (num_online_cpus() > 1) {
+ __default_send_IPI_shortcut(APIC_DEST_ALLBUT,
+ vector, apic->dest_logical);
+ }
+}
+
+static void flat_send_IPI_all(int vector)
+{
+ if (vector == NMI_VECTOR) {
+ flat_send_IPI_mask(cpu_online_mask, vector);
+ } else {
+ __default_send_IPI_shortcut(APIC_DEST_ALLINC,
+ vector, apic->dest_logical);
+ }
+}
+
+static unsigned int flat_get_apic_id(unsigned long x)
+{
+ return (x >> 24) & 0xFF;
+}
+
+static u32 set_apic_id(unsigned int id)
+{
+ return (id & 0xFF) << 24;
+}
+
+static unsigned int read_xapic_id(void)
+{
+ return flat_get_apic_id(apic_read(APIC_ID));
+}
+
+static int flat_apic_id_registered(void)
+{
+ return physid_isset(read_xapic_id(), phys_cpu_present_map);
+}
+
+static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+ return initial_apic_id >> index_msb;
+}
+
+static int flat_probe(void)
+{
+ return 1;
+}
+
+static struct apic apic_flat __ro_after_init = {
+ .name = "flat",
+ .probe = flat_probe,
+ .acpi_madt_oem_check = flat_acpi_madt_oem_check,
+ .apic_id_valid = default_apic_id_valid,
+ .apic_id_registered = flat_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 1, /* logical */
+
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = NULL,
+
+ .init_apic_ldr = flat_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .phys_pkg_id = flat_phys_pkg_id,
+
+ .get_apic_id = flat_get_apic_id,
+ .set_apic_id = set_apic_id,
+
+ .calc_dest_apicid = apic_flat_calc_apicid,
+
+ .send_IPI = default_send_IPI_single,
+ .send_IPI_mask = flat_send_IPI_mask,
+ .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = flat_send_IPI_allbutself,
+ .send_IPI_all = flat_send_IPI_all,
+ .send_IPI_self = apic_send_IPI_self,
+
+ .inquire_remote_apic = default_inquire_remote_apic,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+};
+
+/*
+ * Physflat mode is used when there are more than 8 CPUs on a system.
+ * We cannot use logical delivery in this case because the mask
+ * overflows, so use physical mode.
+ */
+static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+#ifdef CONFIG_ACPI
+ /*
+ * Quirk: some x86_64 machines can only use physical APIC mode
+ * regardless of how many processors are present (x86_64 ES7000
+ * is an example).
+ */
+ if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
+ (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
+ printk(KERN_DEBUG "system APIC only can use physical flat");
+ return 1;
+ }
+
+ if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) {
+ printk(KERN_DEBUG "IBM Summit detected, will use apic physical");
+ return 1;
+ }
+#endif
+
+ return 0;
+}
+
+static void physflat_init_apic_ldr(void)
+{
+ /*
+ * LDR and DFR are not involved in physflat mode, rather:
+ * "In physical destination mode, the destination processor is
+ * specified by its local APIC ID [...]." (Intel SDM, 10.6.2.1)
+ */
+}
+
+static void physflat_send_IPI_allbutself(int vector)
+{
+ default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
+}
+
+static void physflat_send_IPI_all(int vector)
+{
+ default_send_IPI_mask_sequence_phys(cpu_online_mask, vector);
+}
+
+static int physflat_probe(void)
+{
+ if (apic == &apic_physflat || num_possible_cpus() > 8 ||
+ jailhouse_paravirt())
+ return 1;
+
+ return 0;
+}
+
+static struct apic apic_physflat __ro_after_init = {
+
+ .name = "physical flat",
+ .probe = physflat_probe,
+ .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
+ .apic_id_valid = default_apic_id_valid,
+ .apic_id_registered = flat_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* physical */
+
+ .disable_esr = 0,
+ .dest_logical = 0,
+ .check_apicid_used = NULL,
+
+ .init_apic_ldr = physflat_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .phys_pkg_id = flat_phys_pkg_id,
+
+ .get_apic_id = flat_get_apic_id,
+ .set_apic_id = set_apic_id,
+
+ .calc_dest_apicid = apic_default_calc_apicid,
+
+ .send_IPI = default_send_IPI_single_phys,
+ .send_IPI_mask = default_send_IPI_mask_sequence_phys,
+ .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_phys,
+ .send_IPI_allbutself = physflat_send_IPI_allbutself,
+ .send_IPI_all = physflat_send_IPI_all,
+ .send_IPI_self = apic_send_IPI_self,
+
+ .inquire_remote_apic = default_inquire_remote_apic,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+};
+
+/*
+ * We need to check for physflat first, so this order is important.
+ */
+apic_drivers(apic_physflat, apic_flat);
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
new file mode 100644
index 000000000..5078b5ce6
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NOOP APIC driver.
+ *
+ * Does almost nothing and should be substituted by a real apic driver via
+ * probe routine.
+ *
+ * Though in case if apic is disabled (for some reason) we try
+ * to not uglify the caller's code and allow to call (some) apic routines
+ * like self-ipi, etc...
+ */
+
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/setup.h>
+
+#include <linux/smp.h>
+#include <asm/ipi.h>
+
+#include <linux/interrupt.h>
+#include <asm/acpi.h>
+#include <asm/e820/api.h>
+
+static void noop_init_apic_ldr(void) { }
+static void noop_send_IPI(int cpu, int vector) { }
+static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
+static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
+static void noop_send_IPI_allbutself(int vector) { }
+static void noop_send_IPI_all(int vector) { }
+static void noop_send_IPI_self(int vector) { }
+static void noop_apic_wait_icr_idle(void) { }
+static void noop_apic_icr_write(u32 low, u32 id) { }
+
+static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
+{
+ return -1;
+}
+
+static u32 noop_safe_apic_wait_icr_idle(void)
+{
+ return 0;
+}
+
+static u64 noop_apic_icr_read(void)
+{
+ return 0;
+}
+
+static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return 0;
+}
+
+static unsigned int noop_get_apic_id(unsigned long x)
+{
+ return 0;
+}
+
+static int noop_probe(void)
+{
+ /*
+ * NOOP apic should not ever be
+ * enabled via probe routine
+ */
+ return 0;
+}
+
+static int noop_apic_id_registered(void)
+{
+ /*
+ * if we would be really "pedantic"
+ * we should pass read_apic_id() here
+ * but since NOOP suppose APIC ID = 0
+ * lets save a few cycles
+ */
+ return physid_isset(0, phys_cpu_present_map);
+}
+
+static u32 noop_apic_read(u32 reg)
+{
+ WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic);
+ return 0;
+}
+
+static void noop_apic_write(u32 reg, u32 v)
+{
+ WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic);
+}
+
+#ifdef CONFIG_X86_32
+static int noop_x86_32_early_logical_apicid(int cpu)
+{
+ return BAD_APICID;
+}
+#endif
+
+struct apic apic_noop __ro_after_init = {
+ .name = "noop",
+ .probe = noop_probe,
+ .acpi_madt_oem_check = NULL,
+
+ .apic_id_valid = default_apic_id_valid,
+ .apic_id_registered = noop_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ /* logical delivery broadcast to all CPUs: */
+ .irq_dest_mode = 1,
+
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = default_check_apicid_used,
+
+ .init_apic_ldr = noop_init_apic_ldr,
+
+ .ioapic_phys_id_map = default_ioapic_phys_id_map,
+ .setup_apic_routing = NULL,
+
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = physid_set_mask_of_physid,
+
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+
+ .phys_pkg_id = noop_phys_pkg_id,
+
+ .get_apic_id = noop_get_apic_id,
+ .set_apic_id = NULL,
+
+ .calc_dest_apicid = apic_flat_calc_apicid,
+
+ .send_IPI = noop_send_IPI,
+ .send_IPI_mask = noop_send_IPI_mask,
+ .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = noop_send_IPI_allbutself,
+ .send_IPI_all = noop_send_IPI_all,
+ .send_IPI_self = noop_send_IPI_self,
+
+ .wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
+
+ .inquire_remote_apic = NULL,
+
+ .read = noop_apic_read,
+ .write = noop_apic_write,
+ .eoi_write = noop_apic_write,
+ .icr_read = noop_apic_icr_read,
+ .icr_write = noop_apic_icr_write,
+ .wait_icr_idle = noop_apic_wait_icr_idle,
+ .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
+
+#ifdef CONFIG_X86_32
+ .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
+#endif
+};
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
new file mode 100644
index 000000000..78778b54f
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -0,0 +1,338 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Numascale NumaConnect-Specific APIC Code
+ *
+ * Copyright (C) 2011 Numascale AS. All rights reserved.
+ *
+ * Send feedback to <support@numascale.com>
+ *
+ */
+
+#include <linux/init.h>
+
+#include <asm/numachip/numachip.h>
+#include <asm/numachip/numachip_csr.h>
+#include <asm/ipi.h>
+#include <asm/apic_flat_64.h>
+#include <asm/pgtable.h>
+#include <asm/pci_x86.h>
+
+u8 numachip_system __read_mostly;
+static const struct apic apic_numachip1;
+static const struct apic apic_numachip2;
+static void (*numachip_apic_icr_write)(int apicid, unsigned int val) __read_mostly;
+
+static unsigned int numachip1_get_apic_id(unsigned long x)
+{
+ unsigned long value;
+ unsigned int id = (x >> 24) & 0xff;
+
+ if (static_cpu_has(X86_FEATURE_NODEID_MSR)) {
+ rdmsrl(MSR_FAM10H_NODE_ID, value);
+ id |= (value << 2) & 0xff00;
+ }
+
+ return id;
+}
+
+static u32 numachip1_set_apic_id(unsigned int id)
+{
+ return (id & 0xff) << 24;
+}
+
+static unsigned int numachip2_get_apic_id(unsigned long x)
+{
+ u64 mcfg;
+
+ rdmsrl(MSR_FAM10H_MMIO_CONF_BASE, mcfg);
+ return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24);
+}
+
+static u32 numachip2_set_apic_id(unsigned int id)
+{
+ return id << 24;
+}
+
+static int numachip_apic_id_valid(u32 apicid)
+{
+ /* Trust what bootloader passes in MADT */
+ return 1;
+}
+
+static int numachip_apic_id_registered(void)
+{
+ return 1;
+}
+
+static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+ return initial_apic_id >> index_msb;
+}
+
+static void numachip1_apic_icr_write(int apicid, unsigned int val)
+{
+ write_lcsr(CSR_G3_EXT_IRQ_GEN, (apicid << 16) | val);
+}
+
+static void numachip2_apic_icr_write(int apicid, unsigned int val)
+{
+ numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val);
+}
+
+static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+{
+ numachip_apic_icr_write(phys_apicid, APIC_DM_INIT);
+ numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP |
+ (start_rip >> 12));
+
+ return 0;
+}
+
+static void numachip_send_IPI_one(int cpu, int vector)
+{
+ int local_apicid, apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ unsigned int dmode;
+
+ preempt_disable();
+ local_apicid = __this_cpu_read(x86_cpu_to_apicid);
+
+ /* Send via local APIC where non-local part matches */
+ if (!((apicid ^ local_apicid) >> NUMACHIP_LAPIC_BITS)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __default_send_IPI_dest_field(apicid, vector,
+ APIC_DEST_PHYSICAL);
+ local_irq_restore(flags);
+ preempt_enable();
+ return;
+ }
+ preempt_enable();
+
+ dmode = (vector == NMI_VECTOR) ? APIC_DM_NMI : APIC_DM_FIXED;
+ numachip_apic_icr_write(apicid, dmode | vector);
+}
+
+static void numachip_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask)
+ numachip_send_IPI_one(cpu, vector);
+}
+
+static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask,
+ int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask) {
+ if (cpu != this_cpu)
+ numachip_send_IPI_one(cpu, vector);
+ }
+}
+
+static void numachip_send_IPI_allbutself(int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ for_each_online_cpu(cpu) {
+ if (cpu != this_cpu)
+ numachip_send_IPI_one(cpu, vector);
+ }
+}
+
+static void numachip_send_IPI_all(int vector)
+{
+ numachip_send_IPI_mask(cpu_online_mask, vector);
+}
+
+static void numachip_send_IPI_self(int vector)
+{
+ apic_write(APIC_SELF_IPI, vector);
+}
+
+static int __init numachip1_probe(void)
+{
+ return apic == &apic_numachip1;
+}
+
+static int __init numachip2_probe(void)
+{
+ return apic == &apic_numachip2;
+}
+
+static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
+{
+ u64 val;
+ u32 nodes = 1;
+
+ this_cpu_write(cpu_llc_id, node);
+
+ /* Account for nodes per socket in multi-core-module processors */
+ if (static_cpu_has(X86_FEATURE_NODEID_MSR)) {
+ rdmsrl(MSR_FAM10H_NODE_ID, val);
+ nodes = ((val >> 3) & 7) + 1;
+ }
+
+ c->phys_proc_id = node / nodes;
+}
+
+static int __init numachip_system_init(void)
+{
+ /* Map the LCSR area and set up the apic_icr_write function */
+ switch (numachip_system) {
+ case 1:
+ init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
+ numachip_apic_icr_write = numachip1_apic_icr_write;
+ break;
+ case 2:
+ init_extra_mapping_uc(NUMACHIP2_LCSR_BASE, NUMACHIP2_LCSR_SIZE);
+ numachip_apic_icr_write = numachip2_apic_icr_write;
+ break;
+ default:
+ return 0;
+ }
+
+ x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
+ x86_init.pci.arch_init = pci_numachip_init;
+
+ return 0;
+}
+early_initcall(numachip_system_init);
+
+static int numachip1_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if ((strncmp(oem_id, "NUMASC", 6) != 0) ||
+ (strncmp(oem_table_id, "NCONNECT", 8) != 0))
+ return 0;
+
+ numachip_system = 1;
+
+ return 1;
+}
+
+static int numachip2_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if ((strncmp(oem_id, "NUMASC", 6) != 0) ||
+ (strncmp(oem_table_id, "NCONECT2", 8) != 0))
+ return 0;
+
+ numachip_system = 2;
+
+ return 1;
+}
+
+/* APIC IPIs are queued */
+static void numachip_apic_wait_icr_idle(void)
+{
+}
+
+/* APIC NMI IPIs are queued */
+static u32 numachip_safe_apic_wait_icr_idle(void)
+{
+ return 0;
+}
+
+static const struct apic apic_numachip1 __refconst = {
+ .name = "NumaConnect system",
+ .probe = numachip1_probe,
+ .acpi_madt_oem_check = numachip1_acpi_madt_oem_check,
+ .apic_id_valid = numachip_apic_id_valid,
+ .apic_id_registered = numachip_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* physical */
+
+ .disable_esr = 0,
+ .dest_logical = 0,
+ .check_apicid_used = NULL,
+
+ .init_apic_ldr = flat_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .phys_pkg_id = numachip_phys_pkg_id,
+
+ .get_apic_id = numachip1_get_apic_id,
+ .set_apic_id = numachip1_set_apic_id,
+
+ .calc_dest_apicid = apic_default_calc_apicid,
+
+ .send_IPI = numachip_send_IPI_one,
+ .send_IPI_mask = numachip_send_IPI_mask,
+ .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = numachip_send_IPI_allbutself,
+ .send_IPI_all = numachip_send_IPI_all,
+ .send_IPI_self = numachip_send_IPI_self,
+
+ .wakeup_secondary_cpu = numachip_wakeup_secondary,
+ .inquire_remote_apic = NULL, /* REMRD not supported */
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = numachip_apic_wait_icr_idle,
+ .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle,
+};
+
+apic_driver(apic_numachip1);
+
+static const struct apic apic_numachip2 __refconst = {
+ .name = "NumaConnect2 system",
+ .probe = numachip2_probe,
+ .acpi_madt_oem_check = numachip2_acpi_madt_oem_check,
+ .apic_id_valid = numachip_apic_id_valid,
+ .apic_id_registered = numachip_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* physical */
+
+ .disable_esr = 0,
+ .dest_logical = 0,
+ .check_apicid_used = NULL,
+
+ .init_apic_ldr = flat_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .phys_pkg_id = numachip_phys_pkg_id,
+
+ .get_apic_id = numachip2_get_apic_id,
+ .set_apic_id = numachip2_set_apic_id,
+
+ .calc_dest_apicid = apic_default_calc_apicid,
+
+ .send_IPI = numachip_send_IPI_one,
+ .send_IPI_mask = numachip_send_IPI_mask,
+ .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = numachip_send_IPI_allbutself,
+ .send_IPI_all = numachip_send_IPI_all,
+ .send_IPI_self = numachip_send_IPI_self,
+
+ .wakeup_secondary_cpu = numachip_wakeup_secondary,
+ .inquire_remote_apic = NULL, /* REMRD not supported */
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = numachip_apic_wait_icr_idle,
+ .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle,
+};
+
+apic_driver(apic_numachip2);
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
new file mode 100644
index 000000000..caedd8d60
--- /dev/null
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs.
+ *
+ * Drives the local APIC in "clustered mode".
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/dmi.h>
+#include <linux/smp.h>
+
+#include <asm/apicdef.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+
+static unsigned bigsmp_get_apic_id(unsigned long x)
+{
+ return (x >> 24) & 0xFF;
+}
+
+static int bigsmp_apic_id_registered(void)
+{
+ return 1;
+}
+
+static bool bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
+{
+ return false;
+}
+
+static int bigsmp_early_logical_apicid(int cpu)
+{
+ /* on bigsmp, logical apicid is the same as physical */
+ return early_per_cpu(x86_cpu_to_apicid, cpu);
+}
+
+/*
+ * bigsmp enables physical destination mode
+ * and doesn't use LDR and DFR
+ */
+static void bigsmp_init_apic_ldr(void)
+{
+}
+
+static void bigsmp_setup_apic_routing(void)
+{
+ printk(KERN_INFO
+ "Enabling APIC mode: Physflat. Using %d I/O APICs\n",
+ nr_ioapics);
+}
+
+static int bigsmp_cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < nr_cpu_ids)
+ return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
+
+ return BAD_APICID;
+}
+
+static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
+{
+ /* For clustered we don't have a good way to do this yet - hack */
+ physids_promote(0xFFL, retmap);
+}
+
+static int bigsmp_check_phys_apicid_present(int phys_apicid)
+{
+ return 1;
+}
+
+static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+static void bigsmp_send_IPI_allbutself(int vector)
+{
+ default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
+}
+
+static void bigsmp_send_IPI_all(int vector)
+{
+ default_send_IPI_mask_sequence_phys(cpu_online_mask, vector);
+}
+
+static int dmi_bigsmp; /* can be set by dmi scanners */
+
+static int hp_ht_bigsmp(const struct dmi_system_id *d)
+{
+ printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
+ dmi_bigsmp = 1;
+
+ return 0;
+}
+
+
+static const struct dmi_system_id bigsmp_dmi_table[] = {
+ { hp_ht_bigsmp, "HP ProLiant DL760 G2",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
+ DMI_MATCH(DMI_BIOS_VERSION, "P44-"),
+ }
+ },
+
+ { hp_ht_bigsmp, "HP ProLiant DL740",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
+ DMI_MATCH(DMI_BIOS_VERSION, "P47-"),
+ }
+ },
+ { } /* NULL entry stops DMI scanning */
+};
+
+static int probe_bigsmp(void)
+{
+ if (def_to_bigsmp)
+ dmi_bigsmp = 1;
+ else
+ dmi_check_system(bigsmp_dmi_table);
+
+ return dmi_bigsmp;
+}
+
+static struct apic apic_bigsmp __ro_after_init = {
+
+ .name = "bigsmp",
+ .probe = probe_bigsmp,
+ .acpi_madt_oem_check = NULL,
+ .apic_id_valid = default_apic_id_valid,
+ .apic_id_registered = bigsmp_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ /* phys delivery to target CPU: */
+ .irq_dest_mode = 0,
+
+ .disable_esr = 1,
+ .dest_logical = 0,
+ .check_apicid_used = bigsmp_check_apicid_used,
+
+ .init_apic_ldr = bigsmp_init_apic_ldr,
+
+ .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
+ .setup_apic_routing = bigsmp_setup_apic_routing,
+ .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
+ .apicid_to_cpu_present = physid_set_mask_of_physid,
+ .check_phys_apicid_present = bigsmp_check_phys_apicid_present,
+ .phys_pkg_id = bigsmp_phys_pkg_id,
+
+ .get_apic_id = bigsmp_get_apic_id,
+ .set_apic_id = NULL,
+
+ .calc_dest_apicid = apic_default_calc_apicid,
+
+ .send_IPI = default_send_IPI_single_phys,
+ .send_IPI_mask = default_send_IPI_mask_sequence_phys,
+ .send_IPI_mask_allbutself = NULL,
+ .send_IPI_allbutself = bigsmp_send_IPI_allbutself,
+ .send_IPI_all = bigsmp_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .inquire_remote_apic = default_inquire_remote_apic,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+
+ .x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
+};
+
+void __init generic_bigsmp_probe(void)
+{
+ unsigned int cpu;
+
+ if (!probe_bigsmp())
+ return;
+
+ apic = &apic_bigsmp;
+
+ for_each_possible_cpu(cpu) {
+ if (early_per_cpu(x86_cpu_to_logical_apicid,
+ cpu) == BAD_APICID)
+ continue;
+ early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
+ bigsmp_early_logical_apicid(cpu);
+ }
+
+ pr_info("Overriding APIC driver with %s\n", apic_bigsmp.name);
+}
+
+apic_driver(apic_bigsmp);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
new file mode 100644
index 000000000..d1fc62a67
--- /dev/null
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * HW NMI watchdog support
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * Arch specific calls to support NMI watchdog
+ *
+ * Bits copied from original nmi.c file
+ *
+ */
+#include <asm/apic.h>
+#include <asm/nmi.h>
+
+#include <linux/cpumask.h>
+#include <linux/kdebug.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
+u64 hw_nmi_get_sample_period(int watchdog_thresh)
+{
+ return (u64)(cpu_khz) * 1000 * watchdog_thresh;
+}
+#endif
+
+#ifdef arch_trigger_cpumask_backtrace
+static void nmi_raise_cpu_backtrace(cpumask_t *mask)
+{
+ apic->send_IPI_mask(mask, NMI_VECTOR);
+}
+
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
+{
+ nmi_trigger_cpumask_backtrace(mask, exclude_self,
+ nmi_raise_cpu_backtrace);
+}
+
+static int nmi_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
+{
+ if (nmi_cpu_backtrace(regs))
+ return NMI_HANDLED;
+
+ return NMI_DONE;
+}
+NOKPROBE_SYMBOL(nmi_cpu_backtrace_handler);
+
+static int __init register_nmi_cpu_backtrace_handler(void)
+{
+ register_nmi_handler(NMI_LOCAL, nmi_cpu_backtrace_handler,
+ 0, "arch_bt");
+ return 0;
+}
+early_initcall(register_nmi_cpu_backtrace_handler);
+#endif
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
new file mode 100644
index 000000000..677508baf
--- /dev/null
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -0,0 +1,3085 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel IO-APIC support for multi-Pentium hosts.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
+ *
+ * Many thanks to Stig Venaas for trying out countless experimental
+ * patches and reporting/debugging problems patiently!
+ *
+ * (c) 1999, Multiple IO-APIC support, developed by
+ * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
+ * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
+ * further tested and cleaned up by Zach Brown <zab@redhat.com>
+ * and Ingo Molnar <mingo@redhat.com>
+ *
+ * Fixes
+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
+ * thanks to Eric Gilmore
+ * and Rolf G. Tews
+ * for testing these extensively
+ * Paul Diefenbaugh : Added full ACPI support
+ *
+ * Historical information which is worth to be preserved:
+ *
+ * - SiS APIC rmw bug:
+ *
+ * We used to have a workaround for a bug in SiS chips which
+ * required to rewrite the index register for a read-modify-write
+ * operation as the chip lost the index information which was
+ * setup for the read already. We cache the data now, so that
+ * workaround has been removed.
+ */
+
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/mc146818rtc.h>
+#include <linux/compiler.h>
+#include <linux/acpi.h>
+#include <linux/export.h>
+#include <linux/syscore_ops.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/jiffies.h> /* time_after() */
+#include <linux/slab.h>
+#include <linux/bootmem.h>
+
+#include <asm/irqdomain.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/cpu.h>
+#include <asm/desc.h>
+#include <asm/proto.h>
+#include <asm/acpi.h>
+#include <asm/dma.h>
+#include <asm/timer.h>
+#include <asm/i8259.h>
+#include <asm/setup.h>
+#include <asm/irq_remapping.h>
+#include <asm/hw_irq.h>
+
+#include <asm/apic.h>
+
+#define for_each_ioapic(idx) \
+ for ((idx) = 0; (idx) < nr_ioapics; (idx)++)
+#define for_each_ioapic_reverse(idx) \
+ for ((idx) = nr_ioapics - 1; (idx) >= 0; (idx)--)
+#define for_each_pin(idx, pin) \
+ for ((pin) = 0; (pin) < ioapics[(idx)].nr_registers; (pin)++)
+#define for_each_ioapic_pin(idx, pin) \
+ for_each_ioapic((idx)) \
+ for_each_pin((idx), (pin))
+#define for_each_irq_pin(entry, head) \
+ list_for_each_entry(entry, &head, list)
+
+static DEFINE_RAW_SPINLOCK(ioapic_lock);
+static DEFINE_MUTEX(ioapic_mutex);
+static unsigned int ioapic_dynirq_base;
+static int ioapic_initialized;
+
+struct irq_pin_list {
+ struct list_head list;
+ int apic, pin;
+};
+
+struct mp_chip_data {
+ struct list_head irq_2_pin;
+ struct IO_APIC_route_entry entry;
+ int trigger;
+ int polarity;
+ u32 count;
+ bool isa_irq;
+};
+
+struct mp_ioapic_gsi {
+ u32 gsi_base;
+ u32 gsi_end;
+};
+
+static struct ioapic {
+ /*
+ * # of IRQ routing registers
+ */
+ int nr_registers;
+ /*
+ * Saved state during suspend/resume, or while enabling intr-remap.
+ */
+ struct IO_APIC_route_entry *saved_registers;
+ /* I/O APIC config */
+ struct mpc_ioapic mp_config;
+ /* IO APIC gsi routing info */
+ struct mp_ioapic_gsi gsi_config;
+ struct ioapic_domain_cfg irqdomain_cfg;
+ struct irq_domain *irqdomain;
+ struct resource *iomem_res;
+} ioapics[MAX_IO_APICS];
+
+#define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver
+
+int mpc_ioapic_id(int ioapic_idx)
+{
+ return ioapics[ioapic_idx].mp_config.apicid;
+}
+
+unsigned int mpc_ioapic_addr(int ioapic_idx)
+{
+ return ioapics[ioapic_idx].mp_config.apicaddr;
+}
+
+static inline struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
+{
+ return &ioapics[ioapic_idx].gsi_config;
+}
+
+static inline int mp_ioapic_pin_count(int ioapic)
+{
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+
+ return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
+}
+
+static inline u32 mp_pin_to_gsi(int ioapic, int pin)
+{
+ return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin;
+}
+
+static inline bool mp_is_legacy_irq(int irq)
+{
+ return irq >= 0 && irq < nr_legacy_irqs();
+}
+
+/*
+ * Initialize all legacy IRQs and all pins on the first IOAPIC
+ * if we have legacy interrupt controller. Kernel boot option "pirq="
+ * may rely on non-legacy pins on the first IOAPIC.
+ */
+static inline int mp_init_irq_at_boot(int ioapic, int irq)
+{
+ if (!nr_legacy_irqs())
+ return 0;
+
+ return ioapic == 0 || mp_is_legacy_irq(irq);
+}
+
+static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic)
+{
+ return ioapics[ioapic].irqdomain;
+}
+
+int nr_ioapics;
+
+/* The one past the highest gsi number used */
+u32 gsi_top;
+
+/* MP IRQ source entries */
+struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* # of MP IRQ source entries */
+int mp_irq_entries;
+
+#ifdef CONFIG_EISA
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
+int skip_ioapic_setup;
+
+/**
+ * disable_ioapic_support() - disables ioapic support at runtime
+ */
+void disable_ioapic_support(void)
+{
+#ifdef CONFIG_PCI
+ noioapicquirk = 1;
+ noioapicreroute = -1;
+#endif
+ skip_ioapic_setup = 1;
+}
+
+static int __init parse_noapic(char *str)
+{
+ /* disable IO-APIC */
+ disable_ioapic_support();
+ return 0;
+}
+early_param("noapic", parse_noapic);
+
+/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
+void mp_save_irq(struct mpc_intsrc *m)
+{
+ int i;
+
+ apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
+ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
+ m->srcbusirq, m->dstapic, m->dstirq);
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (!memcmp(&mp_irqs[i], m, sizeof(*m)))
+ return;
+ }
+
+ memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m));
+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ panic("Max # of irq sources exceeded!!\n");
+}
+
+static void alloc_ioapic_saved_registers(int idx)
+{
+ size_t size;
+
+ if (ioapics[idx].saved_registers)
+ return;
+
+ size = sizeof(struct IO_APIC_route_entry) * ioapics[idx].nr_registers;
+ ioapics[idx].saved_registers = kzalloc(size, GFP_KERNEL);
+ if (!ioapics[idx].saved_registers)
+ pr_err("IOAPIC %d: suspend/resume impossible!\n", idx);
+}
+
+static void free_ioapic_saved_registers(int idx)
+{
+ kfree(ioapics[idx].saved_registers);
+ ioapics[idx].saved_registers = NULL;
+}
+
+int __init arch_early_ioapic_init(void)
+{
+ int i;
+
+ if (!nr_legacy_irqs())
+ io_apic_irqs = ~0UL;
+
+ for_each_ioapic(i)
+ alloc_ioapic_saved_registers(i);
+
+ return 0;
+}
+
+struct io_apic {
+ unsigned int index;
+ unsigned int unused[3];
+ unsigned int data;
+ unsigned int unused2[11];
+ unsigned int eoi;
+};
+
+static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
+{
+ return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
+ + (mpc_ioapic_addr(idx) & ~PAGE_MASK);
+}
+
+static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ writel(vector, &io_apic->eoi);
+}
+
+unsigned int native_io_apic_read(unsigned int apic, unsigned int reg)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ writel(reg, &io_apic->index);
+ return readl(&io_apic->data);
+}
+
+static void io_apic_write(unsigned int apic, unsigned int reg,
+ unsigned int value)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+
+ writel(reg, &io_apic->index);
+ writel(value, &io_apic->data);
+}
+
+union entry_union {
+ struct { u32 w1, w2; };
+ struct IO_APIC_route_entry entry;
+};
+
+static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin)
+{
+ union entry_union eu;
+
+ eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+ eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+
+ return eu.entry;
+}
+
+static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
+{
+ union entry_union eu;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ eu.entry = __ioapic_read_entry(apic, pin);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return eu.entry;
+}
+
+/*
+ * When we write a new IO APIC routing entry, we need to write the high
+ * word first! If the mask bit in the low word is clear, we will enable
+ * the interrupt, and we need to make sure the entry is fully populated
+ * before that happens.
+ */
+static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+ union entry_union eu = {{0, 0}};
+
+ eu.entry = e;
+ io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+ io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+}
+
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ __ioapic_write_entry(apic, pin, e);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * When we mask an IO APIC routing entry, we need to write the low
+ * word first, in order to set the mask bit before we change the
+ * high bits!
+ */
+static void ioapic_mask_entry(int apic, int pin)
+{
+ unsigned long flags;
+ union entry_union eu = { .entry.mask = IOAPIC_MASKED };
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+ io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
+ * shared ISA-space IRQs, so we have to support them. We are super
+ * fast in the common case, and fast for shared ISA-space IRQs.
+ */
+static int __add_pin_to_irq_node(struct mp_chip_data *data,
+ int node, int apic, int pin)
+{
+ struct irq_pin_list *entry;
+
+ /* don't allow duplicates */
+ for_each_irq_pin(entry, data->irq_2_pin)
+ if (entry->apic == apic && entry->pin == pin)
+ return 0;
+
+ entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node);
+ if (!entry) {
+ pr_err("can not alloc irq_pin_list (%d,%d,%d)\n",
+ node, apic, pin);
+ return -ENOMEM;
+ }
+ entry->apic = apic;
+ entry->pin = pin;
+ list_add_tail(&entry->list, &data->irq_2_pin);
+
+ return 0;
+}
+
+static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin)
+{
+ struct irq_pin_list *tmp, *entry;
+
+ list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list)
+ if (entry->apic == apic && entry->pin == pin) {
+ list_del(&entry->list);
+ kfree(entry);
+ return;
+ }
+}
+
+static void add_pin_to_irq_node(struct mp_chip_data *data,
+ int node, int apic, int pin)
+{
+ if (__add_pin_to_irq_node(data, node, apic, pin))
+ panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
+}
+
+/*
+ * Reroute an IRQ to a different pin.
+ */
+static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node,
+ int oldapic, int oldpin,
+ int newapic, int newpin)
+{
+ struct irq_pin_list *entry;
+
+ for_each_irq_pin(entry, data->irq_2_pin) {
+ if (entry->apic == oldapic && entry->pin == oldpin) {
+ entry->apic = newapic;
+ entry->pin = newpin;
+ /* every one is different, right? */
+ return;
+ }
+ }
+
+ /* old apic/pin didn't exist, so just add new ones */
+ add_pin_to_irq_node(data, node, newapic, newpin);
+}
+
+static void io_apic_modify_irq(struct mp_chip_data *data,
+ int mask_and, int mask_or,
+ void (*final)(struct irq_pin_list *entry))
+{
+ union entry_union eu;
+ struct irq_pin_list *entry;
+
+ eu.entry = data->entry;
+ eu.w1 &= mask_and;
+ eu.w1 |= mask_or;
+ data->entry = eu.entry;
+
+ for_each_irq_pin(entry, data->irq_2_pin) {
+ io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1);
+ if (final)
+ final(entry);
+ }
+}
+
+static void io_apic_sync(struct irq_pin_list *entry)
+{
+ /*
+ * Synchronize the IO-APIC and the CPU by doing
+ * a dummy read from the IO-APIC
+ */
+ struct io_apic __iomem *io_apic;
+
+ io_apic = io_apic_base(entry->apic);
+ readl(&io_apic->data);
+}
+
+static void mask_ioapic_irq(struct irq_data *irq_data)
+{
+ struct mp_chip_data *data = irq_data->chip_data;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void __unmask_ioapic(struct mp_chip_data *data)
+{
+ io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL);
+}
+
+static void unmask_ioapic_irq(struct irq_data *irq_data)
+{
+ struct mp_chip_data *data = irq_data->chip_data;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ __unmask_ioapic(data);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * IO-APIC versions below 0x20 don't support EOI register.
+ * For the record, here is the information about various versions:
+ * 0Xh 82489DX
+ * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
+ * 2Xh I/O(x)APIC which is PCI 2.2 Compliant
+ * 30h-FFh Reserved
+ *
+ * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic
+ * version as 0x2. This is an error with documentation and these ICH chips
+ * use io-apic's of version 0x20.
+ *
+ * For IO-APIC's with EOI register, we use that to do an explicit EOI.
+ * Otherwise, we simulate the EOI message manually by changing the trigger
+ * mode to edge and then back to level, with RTE being masked during this.
+ */
+static void __eoi_ioapic_pin(int apic, int pin, int vector)
+{
+ if (mpc_ioapic_ver(apic) >= 0x20) {
+ io_apic_eoi(apic, vector);
+ } else {
+ struct IO_APIC_route_entry entry, entry1;
+
+ entry = entry1 = __ioapic_read_entry(apic, pin);
+
+ /*
+ * Mask the entry and change the trigger mode to edge.
+ */
+ entry1.mask = IOAPIC_MASKED;
+ entry1.trigger = IOAPIC_EDGE;
+
+ __ioapic_write_entry(apic, pin, entry1);
+
+ /*
+ * Restore the previous level triggered entry.
+ */
+ __ioapic_write_entry(apic, pin, entry);
+ }
+}
+
+static void eoi_ioapic_pin(int vector, struct mp_chip_data *data)
+{
+ unsigned long flags;
+ struct irq_pin_list *entry;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ for_each_irq_pin(entry, data->irq_2_pin)
+ __eoi_ioapic_pin(entry->apic, entry->pin, vector);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
+{
+ struct IO_APIC_route_entry entry;
+
+ /* Check delivery_mode to be sure we're not clearing an SMI pin */
+ entry = ioapic_read_entry(apic, pin);
+ if (entry.delivery_mode == dest_SMI)
+ return;
+
+ /*
+ * Make sure the entry is masked and re-read the contents to check
+ * if it is a level triggered pin and if the remote-IRR is set.
+ */
+ if (entry.mask == IOAPIC_UNMASKED) {
+ entry.mask = IOAPIC_MASKED;
+ ioapic_write_entry(apic, pin, entry);
+ entry = ioapic_read_entry(apic, pin);
+ }
+
+ if (entry.irr) {
+ unsigned long flags;
+
+ /*
+ * Make sure the trigger mode is set to level. Explicit EOI
+ * doesn't clear the remote-IRR if the trigger mode is not
+ * set to level.
+ */
+ if (entry.trigger == IOAPIC_EDGE) {
+ entry.trigger = IOAPIC_LEVEL;
+ ioapic_write_entry(apic, pin, entry);
+ }
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ __eoi_ioapic_pin(apic, pin, entry.vector);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ }
+
+ /*
+ * Clear the rest of the bits in the IO-APIC RTE except for the mask
+ * bit.
+ */
+ ioapic_mask_entry(apic, pin);
+ entry = ioapic_read_entry(apic, pin);
+ if (entry.irr)
+ pr_err("Unable to reset IRR for apic: %d, pin :%d\n",
+ mpc_ioapic_id(apic), pin);
+}
+
+void clear_IO_APIC (void)
+{
+ int apic, pin;
+
+ for_each_ioapic_pin(apic, pin)
+ clear_IO_APIC_pin(apic, pin);
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
+ * specific CPU-side IRQs.
+ */
+
+#define MAX_PIRQS 8
+static int pirq_entries[MAX_PIRQS] = {
+ [0 ... MAX_PIRQS - 1] = -1
+};
+
+static int __init ioapic_pirq_setup(char *str)
+{
+ int i, max;
+ int ints[MAX_PIRQS+1];
+
+ get_options(str, ARRAY_SIZE(ints), ints);
+
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "PIRQ redirection, working around broken MP-BIOS.\n");
+ max = MAX_PIRQS;
+ if (ints[0] < MAX_PIRQS)
+ max = ints[0];
+
+ for (i = 0; i < max; i++) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
+ /*
+ * PIRQs are mapped upside down, usually.
+ */
+ pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
+ }
+ return 1;
+}
+
+__setup("pirq=", ioapic_pirq_setup);
+#endif /* CONFIG_X86_32 */
+
+/*
+ * Saves all the IO-APIC RTE's
+ */
+int save_ioapic_entries(void)
+{
+ int apic, pin;
+ int err = 0;
+
+ for_each_ioapic(apic) {
+ if (!ioapics[apic].saved_registers) {
+ err = -ENOMEM;
+ continue;
+ }
+
+ for_each_pin(apic, pin)
+ ioapics[apic].saved_registers[pin] =
+ ioapic_read_entry(apic, pin);
+ }
+
+ return err;
+}
+
+/*
+ * Mask all IO APIC entries.
+ */
+void mask_ioapic_entries(void)
+{
+ int apic, pin;
+
+ for_each_ioapic(apic) {
+ if (!ioapics[apic].saved_registers)
+ continue;
+
+ for_each_pin(apic, pin) {
+ struct IO_APIC_route_entry entry;
+
+ entry = ioapics[apic].saved_registers[pin];
+ if (entry.mask == IOAPIC_UNMASKED) {
+ entry.mask = IOAPIC_MASKED;
+ ioapic_write_entry(apic, pin, entry);
+ }
+ }
+ }
+}
+
+/*
+ * Restore IO APIC entries which was saved in the ioapic structure.
+ */
+int restore_ioapic_entries(void)
+{
+ int apic, pin;
+
+ for_each_ioapic(apic) {
+ if (!ioapics[apic].saved_registers)
+ continue;
+
+ for_each_pin(apic, pin)
+ ioapic_write_entry(apic, pin,
+ ioapics[apic].saved_registers[pin]);
+ }
+ return 0;
+}
+
+/*
+ * Find the IRQ entry number of a certain pin.
+ */
+static int find_irq_entry(int ioapic_idx, int pin, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++)
+ if (mp_irqs[i].irqtype == type &&
+ (mp_irqs[i].dstapic == mpc_ioapic_id(ioapic_idx) ||
+ mp_irqs[i].dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].dstirq == pin)
+ return i;
+
+ return -1;
+}
+
+/*
+ * Find the pin to which IRQ[irq] (ISA) is connected
+ */
+static int __init find_isa_irq_pin(int irq, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].srcbus;
+
+ if (test_bit(lbus, mp_bus_not_pci) &&
+ (mp_irqs[i].irqtype == type) &&
+ (mp_irqs[i].srcbusirq == irq))
+
+ return mp_irqs[i].dstirq;
+ }
+ return -1;
+}
+
+static int __init find_isa_irq_apic(int irq, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].srcbus;
+
+ if (test_bit(lbus, mp_bus_not_pci) &&
+ (mp_irqs[i].irqtype == type) &&
+ (mp_irqs[i].srcbusirq == irq))
+ break;
+ }
+
+ if (i < mp_irq_entries) {
+ int ioapic_idx;
+
+ for_each_ioapic(ioapic_idx)
+ if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic)
+ return ioapic_idx;
+ }
+
+ return -1;
+}
+
+#ifdef CONFIG_EISA
+/*
+ * EISA Edge/Level control register, ELCR
+ */
+static int EISA_ELCR(unsigned int irq)
+{
+ if (irq < nr_legacy_irqs()) {
+ unsigned int port = 0x4d0 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
+ }
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "Broken MPtable reports ISA irq %d\n", irq);
+ return 0;
+}
+
+#endif
+
+/* ISA interrupts are always active high edge triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_ISA_trigger(idx) (IOAPIC_EDGE)
+#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH)
+
+/* EISA interrupts are always polarity zero and can be edge or level
+ * trigger depending on the ELCR value. If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must
+ * be read in from the ELCR */
+
+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
+#define default_EISA_polarity(idx) default_ISA_polarity(idx)
+
+/* PCI interrupts are always active low level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_PCI_trigger(idx) (IOAPIC_LEVEL)
+#define default_PCI_polarity(idx) (IOAPIC_POL_LOW)
+
+static int irq_polarity(int idx)
+{
+ int bus = mp_irqs[idx].srcbus;
+
+ /*
+ * Determine IRQ line polarity (high active or low active):
+ */
+ switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) {
+ case MP_IRQPOL_DEFAULT:
+ /* conforms to spec, ie. bus-type dependent polarity */
+ if (test_bit(bus, mp_bus_not_pci))
+ return default_ISA_polarity(idx);
+ else
+ return default_PCI_polarity(idx);
+ case MP_IRQPOL_ACTIVE_HIGH:
+ return IOAPIC_POL_HIGH;
+ case MP_IRQPOL_RESERVED:
+ pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n");
+ case MP_IRQPOL_ACTIVE_LOW:
+ default: /* Pointless default required due to do gcc stupidity */
+ return IOAPIC_POL_LOW;
+ }
+}
+
+#ifdef CONFIG_EISA
+static int eisa_irq_trigger(int idx, int bus, int trigger)
+{
+ switch (mp_bus_id_to_type[bus]) {
+ case MP_BUS_PCI:
+ case MP_BUS_ISA:
+ return trigger;
+ case MP_BUS_EISA:
+ return default_EISA_trigger(idx);
+ }
+ pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus);
+ return IOAPIC_LEVEL;
+}
+#else
+static inline int eisa_irq_trigger(int idx, int bus, int trigger)
+{
+ return trigger;
+}
+#endif
+
+static int irq_trigger(int idx)
+{
+ int bus = mp_irqs[idx].srcbus;
+ int trigger;
+
+ /*
+ * Determine IRQ trigger mode (edge or level sensitive):
+ */
+ switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) {
+ case MP_IRQTRIG_DEFAULT:
+ /* conforms to spec, ie. bus-type dependent trigger mode */
+ if (test_bit(bus, mp_bus_not_pci))
+ trigger = default_ISA_trigger(idx);
+ else
+ trigger = default_PCI_trigger(idx);
+ /* Take EISA into account */
+ return eisa_irq_trigger(idx, bus, trigger);
+ case MP_IRQTRIG_EDGE:
+ return IOAPIC_EDGE;
+ case MP_IRQTRIG_RESERVED:
+ pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n");
+ case MP_IRQTRIG_LEVEL:
+ default: /* Pointless default required due to do gcc stupidity */
+ return IOAPIC_LEVEL;
+ }
+}
+
+void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node,
+ int trigger, int polarity)
+{
+ init_irq_alloc_info(info, NULL);
+ info->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+ info->ioapic_node = node;
+ info->ioapic_trigger = trigger;
+ info->ioapic_polarity = polarity;
+ info->ioapic_valid = 1;
+}
+
+#ifndef CONFIG_ACPI
+int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
+#endif
+
+static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst,
+ struct irq_alloc_info *src,
+ u32 gsi, int ioapic_idx, int pin)
+{
+ int trigger, polarity;
+
+ copy_irq_alloc_info(dst, src);
+ dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+ dst->ioapic_id = mpc_ioapic_id(ioapic_idx);
+ dst->ioapic_pin = pin;
+ dst->ioapic_valid = 1;
+ if (src && src->ioapic_valid) {
+ dst->ioapic_node = src->ioapic_node;
+ dst->ioapic_trigger = src->ioapic_trigger;
+ dst->ioapic_polarity = src->ioapic_polarity;
+ } else {
+ dst->ioapic_node = NUMA_NO_NODE;
+ if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) {
+ dst->ioapic_trigger = trigger;
+ dst->ioapic_polarity = polarity;
+ } else {
+ /*
+ * PCI interrupts are always active low level
+ * triggered.
+ */
+ dst->ioapic_trigger = IOAPIC_LEVEL;
+ dst->ioapic_polarity = IOAPIC_POL_LOW;
+ }
+ }
+}
+
+static int ioapic_alloc_attr_node(struct irq_alloc_info *info)
+{
+ return (info && info->ioapic_valid) ? info->ioapic_node : NUMA_NO_NODE;
+}
+
+static void mp_register_handler(unsigned int irq, unsigned long trigger)
+{
+ irq_flow_handler_t hdl;
+ bool fasteoi;
+
+ if (trigger) {
+ irq_set_status_flags(irq, IRQ_LEVEL);
+ fasteoi = true;
+ } else {
+ irq_clear_status_flags(irq, IRQ_LEVEL);
+ fasteoi = false;
+ }
+
+ hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
+ __irq_set_handler(irq, hdl, 0, fasteoi ? "fasteoi" : "edge");
+}
+
+static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
+{
+ struct mp_chip_data *data = irq_get_chip_data(irq);
+
+ /*
+ * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger
+ * and polarity attirbutes. So allow the first user to reprogram the
+ * pin with real trigger and polarity attributes.
+ */
+ if (irq < nr_legacy_irqs() && data->count == 1) {
+ if (info->ioapic_trigger != data->trigger)
+ mp_register_handler(irq, info->ioapic_trigger);
+ data->entry.trigger = data->trigger = info->ioapic_trigger;
+ data->entry.polarity = data->polarity = info->ioapic_polarity;
+ }
+
+ return data->trigger == info->ioapic_trigger &&
+ data->polarity == info->ioapic_polarity;
+}
+
+static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
+ struct irq_alloc_info *info)
+{
+ bool legacy = false;
+ int irq = -1;
+ int type = ioapics[ioapic].irqdomain_cfg.type;
+
+ switch (type) {
+ case IOAPIC_DOMAIN_LEGACY:
+ /*
+ * Dynamically allocate IRQ number for non-ISA IRQs in the first
+ * 16 GSIs on some weird platforms.
+ */
+ if (!ioapic_initialized || gsi >= nr_legacy_irqs())
+ irq = gsi;
+ legacy = mp_is_legacy_irq(irq);
+ break;
+ case IOAPIC_DOMAIN_STRICT:
+ irq = gsi;
+ break;
+ case IOAPIC_DOMAIN_DYNAMIC:
+ break;
+ default:
+ WARN(1, "ioapic: unknown irqdomain type %d\n", type);
+ return -1;
+ }
+
+ return __irq_domain_alloc_irqs(domain, irq, 1,
+ ioapic_alloc_attr_node(info),
+ info, legacy, NULL);
+}
+
+/*
+ * Need special handling for ISA IRQs because there may be multiple IOAPIC pins
+ * sharing the same ISA IRQ number and irqdomain only supports 1:1 mapping
+ * between IOAPIC pin and IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are
+ * used for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H).
+ * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are available, and
+ * some BIOSes may use MP Interrupt Source records to override IRQ numbers for
+ * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be
+ * multiple pins sharing the same legacy IRQ number when ACPI is disabled.
+ */
+static int alloc_isa_irq_from_domain(struct irq_domain *domain,
+ int irq, int ioapic, int pin,
+ struct irq_alloc_info *info)
+{
+ struct mp_chip_data *data;
+ struct irq_data *irq_data = irq_get_irq_data(irq);
+ int node = ioapic_alloc_attr_node(info);
+
+ /*
+ * Legacy ISA IRQ has already been allocated, just add pin to
+ * the pin list assoicated with this IRQ and program the IOAPIC
+ * entry. The IOAPIC entry
+ */
+ if (irq_data && irq_data->parent_data) {
+ if (!mp_check_pin_attr(irq, info))
+ return -EBUSY;
+ if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic,
+ info->ioapic_pin))
+ return -ENOMEM;
+ } else {
+ info->flags |= X86_IRQ_ALLOC_LEGACY;
+ irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true,
+ NULL);
+ if (irq >= 0) {
+ irq_data = irq_domain_get_irq_data(domain, irq);
+ data = irq_data->chip_data;
+ data->isa_irq = true;
+ }
+ }
+
+ return irq;
+}
+
+static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
+ unsigned int flags, struct irq_alloc_info *info)
+{
+ int irq;
+ bool legacy = false;
+ struct irq_alloc_info tmp;
+ struct mp_chip_data *data;
+ struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+
+ if (!domain)
+ return -ENOSYS;
+
+ if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) {
+ irq = mp_irqs[idx].srcbusirq;
+ legacy = mp_is_legacy_irq(irq);
+ /*
+ * IRQ2 is unusable for historical reasons on systems which
+ * have a legacy PIC. See the comment vs. IRQ2 further down.
+ *
+ * If this gets removed at some point then the related code
+ * in lapic_assign_system_vectors() needs to be adjusted as
+ * well.
+ */
+ if (legacy && irq == PIC_CASCADE_IR)
+ return -EINVAL;
+ }
+
+ mutex_lock(&ioapic_mutex);
+ if (!(flags & IOAPIC_MAP_ALLOC)) {
+ if (!legacy) {
+ irq = irq_find_mapping(domain, pin);
+ if (irq == 0)
+ irq = -ENOENT;
+ }
+ } else {
+ ioapic_copy_alloc_attr(&tmp, info, gsi, ioapic, pin);
+ if (legacy)
+ irq = alloc_isa_irq_from_domain(domain, irq,
+ ioapic, pin, &tmp);
+ else if ((irq = irq_find_mapping(domain, pin)) == 0)
+ irq = alloc_irq_from_domain(domain, ioapic, gsi, &tmp);
+ else if (!mp_check_pin_attr(irq, &tmp))
+ irq = -EBUSY;
+ if (irq >= 0) {
+ data = irq_get_chip_data(irq);
+ data->count++;
+ }
+ }
+ mutex_unlock(&ioapic_mutex);
+
+ return irq;
+}
+
+static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
+{
+ u32 gsi = mp_pin_to_gsi(ioapic, pin);
+
+ /*
+ * Debugging check, we are in big trouble if this message pops up!
+ */
+ if (mp_irqs[idx].dstirq != pin)
+ pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
+
+#ifdef CONFIG_X86_32
+ /*
+ * PCI IRQ command line redirection. Yes, limits are hardcoded.
+ */
+ if ((pin >= 16) && (pin <= 23)) {
+ if (pirq_entries[pin-16] != -1) {
+ if (!pirq_entries[pin-16]) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "disabling PIRQ%d\n", pin-16);
+ } else {
+ int irq = pirq_entries[pin-16];
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "using PIRQ%d -> IRQ %d\n",
+ pin-16, irq);
+ return irq;
+ }
+ }
+ }
+#endif
+
+ return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL);
+}
+
+int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info)
+{
+ int ioapic, pin, idx;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return -ENODEV;
+
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if ((flags & IOAPIC_MAP_CHECK) && idx < 0)
+ return -ENODEV;
+
+ return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info);
+}
+
+void mp_unmap_irq(int irq)
+{
+ struct irq_data *irq_data = irq_get_irq_data(irq);
+ struct mp_chip_data *data;
+
+ if (!irq_data || !irq_data->domain)
+ return;
+
+ data = irq_data->chip_data;
+ if (!data || data->isa_irq)
+ return;
+
+ mutex_lock(&ioapic_mutex);
+ if (--data->count == 0)
+ irq_domain_free_irqs(irq, 1);
+ mutex_unlock(&ioapic_mutex);
+}
+
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ */
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
+{
+ int irq, i, best_ioapic = -1, best_idx = -1;
+
+ apic_printk(APIC_DEBUG,
+ "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+ bus, slot, pin);
+ if (test_bit(bus, mp_bus_not_pci)) {
+ apic_printk(APIC_VERBOSE,
+ "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+ return -1;
+ }
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].srcbus;
+ int ioapic_idx, found = 0;
+
+ if (bus != lbus || mp_irqs[i].irqtype != mp_INT ||
+ slot != ((mp_irqs[i].srcbusirq >> 2) & 0x1f))
+ continue;
+
+ for_each_ioapic(ioapic_idx)
+ if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic ||
+ mp_irqs[i].dstapic == MP_APIC_ALL) {
+ found = 1;
+ break;
+ }
+ if (!found)
+ continue;
+
+ /* Skip ISA IRQs */
+ irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq, 0);
+ if (irq > 0 && !IO_APIC_IRQ(irq))
+ continue;
+
+ if (pin == (mp_irqs[i].srcbusirq & 3)) {
+ best_idx = i;
+ best_ioapic = ioapic_idx;
+ goto out;
+ }
+
+ /*
+ * Use the first all-but-pin matching entry as a
+ * best-guess fuzzy result for broken mptables.
+ */
+ if (best_idx < 0) {
+ best_idx = i;
+ best_ioapic = ioapic_idx;
+ }
+ }
+ if (best_idx < 0)
+ return -1;
+
+out:
+ return pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq,
+ IOAPIC_MAP_ALLOC);
+}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+
+static struct irq_chip ioapic_chip, ioapic_ir_chip;
+
+static void __init setup_IO_APIC_irqs(void)
+{
+ unsigned int ioapic, pin;
+ int idx;
+
+ apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+
+ for_each_ioapic_pin(ioapic, pin) {
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if (idx < 0)
+ apic_printk(APIC_VERBOSE,
+ KERN_DEBUG " apic %d pin %d not connected\n",
+ mpc_ioapic_id(ioapic), pin);
+ else
+ pin_2_irq(idx, ioapic, pin,
+ ioapic ? 0 : IOAPIC_MAP_ALLOC);
+ }
+}
+
+void ioapic_zap_locks(void)
+{
+ raw_spin_lock_init(&ioapic_lock);
+}
+
+static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
+{
+ int i;
+ char buf[256];
+ struct IO_APIC_route_entry entry;
+ struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry;
+
+ printk(KERN_DEBUG "IOAPIC %d:\n", apic);
+ for (i = 0; i <= nr_entries; i++) {
+ entry = ioapic_read_entry(apic, i);
+ snprintf(buf, sizeof(buf),
+ " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)",
+ i,
+ entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ",
+ entry.trigger == IOAPIC_LEVEL ? "level" : "edge ",
+ entry.polarity == IOAPIC_POL_LOW ? "low " : "high",
+ entry.vector, entry.irr, entry.delivery_status);
+ if (ir_entry->format)
+ printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n",
+ buf, (ir_entry->index2 << 15) | ir_entry->index,
+ ir_entry->zero);
+ else
+ printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n",
+ buf,
+ entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ?
+ "logical " : "physical",
+ entry.dest, entry.delivery_mode);
+ }
+}
+
+static void __init print_IO_APIC(int ioapic_idx)
+{
+ union IO_APIC_reg_00 reg_00;
+ union IO_APIC_reg_01 reg_01;
+ union IO_APIC_reg_02 reg_02;
+ union IO_APIC_reg_03 reg_03;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ reg_01.raw = io_apic_read(ioapic_idx, 1);
+ if (reg_01.bits.version >= 0x10)
+ reg_02.raw = io_apic_read(ioapic_idx, 2);
+ if (reg_01.bits.version >= 0x20)
+ reg_03.raw = io_apic_read(ioapic_idx, 3);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));
+ printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
+ printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
+ printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
+ printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
+
+ printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
+ printk(KERN_DEBUG "....... : max redirection entries: %02X\n",
+ reg_01.bits.entries);
+
+ printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
+ printk(KERN_DEBUG "....... : IO APIC version: %02X\n",
+ reg_01.bits.version);
+
+ /*
+ * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
+ * but the value of reg_02 is read as the previous read register
+ * value, so ignore it if reg_02 == reg_01.
+ */
+ if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
+ printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
+ printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
+ }
+
+ /*
+ * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
+ * or reg_03, but the value of reg_0[23] is read as the previous read
+ * register value, so ignore it if reg_03 == reg_0[12].
+ */
+ if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
+ reg_03.raw != reg_01.raw) {
+ printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
+ printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
+ }
+
+ printk(KERN_DEBUG ".... IRQ redirection table:\n");
+ io_apic_print_entries(ioapic_idx, reg_01.bits.entries);
+}
+
+void __init print_IO_APICs(void)
+{
+ int ioapic_idx;
+ unsigned int irq;
+
+ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+ for_each_ioapic(ioapic_idx)
+ printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
+ mpc_ioapic_id(ioapic_idx),
+ ioapics[ioapic_idx].nr_registers);
+
+ /*
+ * We are a bit conservative about what we expect. We have to
+ * know about every hardware change ASAP.
+ */
+ printk(KERN_INFO "testing the IO APIC.......................\n");
+
+ for_each_ioapic(ioapic_idx)
+ print_IO_APIC(ioapic_idx);
+
+ printk(KERN_DEBUG "IRQ to pin mappings:\n");
+ for_each_active_irq(irq) {
+ struct irq_pin_list *entry;
+ struct irq_chip *chip;
+ struct mp_chip_data *data;
+
+ chip = irq_get_chip(irq);
+ if (chip != &ioapic_chip && chip != &ioapic_ir_chip)
+ continue;
+ data = irq_get_chip_data(irq);
+ if (!data)
+ continue;
+ if (list_empty(&data->irq_2_pin))
+ continue;
+
+ printk(KERN_DEBUG "IRQ%d ", irq);
+ for_each_irq_pin(entry, data->irq_2_pin)
+ pr_cont("-> %d:%d", entry->apic, entry->pin);
+ pr_cont("\n");
+ }
+
+ printk(KERN_INFO ".................................... done.\n");
+}
+
+/* Where if anywhere is the i8259 connect in external int mode */
+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
+
+void __init enable_IO_APIC(void)
+{
+ int i8259_apic, i8259_pin;
+ int apic, pin;
+
+ if (skip_ioapic_setup)
+ nr_ioapics = 0;
+
+ if (!nr_legacy_irqs() || !nr_ioapics)
+ return;
+
+ for_each_ioapic_pin(apic, pin) {
+ /* See if any of the pins is in ExtINT mode */
+ struct IO_APIC_route_entry entry = ioapic_read_entry(apic, pin);
+
+ /* If the interrupt line is enabled and in ExtInt mode
+ * I have found the pin where the i8259 is connected.
+ */
+ if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
+ ioapic_i8259.apic = apic;
+ ioapic_i8259.pin = pin;
+ goto found_i8259;
+ }
+ }
+ found_i8259:
+ /* Look to see what if the MP table has reported the ExtINT */
+ /* If we could not find the appropriate pin by looking at the ioapic
+ * the i8259 probably is not connected the ioapic but give the
+ * mptable a chance anyway.
+ */
+ i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
+ i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
+ /* Trust the MP table if nothing is setup in the hardware */
+ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
+ printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
+ ioapic_i8259.pin = i8259_pin;
+ ioapic_i8259.apic = i8259_apic;
+ }
+ /* Complain if the MP table and the hardware disagree */
+ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
+ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
+ {
+ printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
+ }
+
+ /*
+ * Do not trust the IO-APIC being empty at bootup
+ */
+ clear_IO_APIC();
+}
+
+void native_restore_boot_irq_mode(void)
+{
+ /*
+ * If the i8259 is routed through an IOAPIC
+ * Put that IOAPIC in virtual wire mode
+ * so legacy interrupts can be delivered.
+ */
+ if (ioapic_i8259.pin != -1) {
+ struct IO_APIC_route_entry entry;
+
+ memset(&entry, 0, sizeof(entry));
+ entry.mask = IOAPIC_UNMASKED;
+ entry.trigger = IOAPIC_EDGE;
+ entry.polarity = IOAPIC_POL_HIGH;
+ entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL;
+ entry.delivery_mode = dest_ExtINT;
+ entry.dest = read_apic_id();
+
+ /*
+ * Add it to the IO-APIC irq-routing table:
+ */
+ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
+ }
+
+ if (boot_cpu_has(X86_FEATURE_APIC) || apic_from_smp_config())
+ disconnect_bsp_APIC(ioapic_i8259.pin != -1);
+}
+
+void restore_boot_irq_mode(void)
+{
+ if (!nr_legacy_irqs())
+ return;
+
+ x86_apic_ops.restore();
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * function to set the IO-APIC physical IDs based on the
+ * values stored in the MPC table.
+ *
+ * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
+ */
+void __init setup_ioapic_ids_from_mpc_nocheck(void)
+{
+ union IO_APIC_reg_00 reg_00;
+ physid_mask_t phys_id_present_map;
+ int ioapic_idx;
+ int i;
+ unsigned char old_id;
+ unsigned long flags;
+
+ /*
+ * This is broken; anything with a real cpu count has to
+ * circumvent this idiocy regardless.
+ */
+ apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map);
+
+ /*
+ * Set the IOAPIC ID to the value stored in the MPC table.
+ */
+ for_each_ioapic(ioapic_idx) {
+ /* Read the register 0 value */
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ old_id = mpc_ioapic_id(ioapic_idx);
+
+ if (mpc_ioapic_id(ioapic_idx) >= get_physical_broadcast()) {
+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
+ ioapic_idx, mpc_ioapic_id(ioapic_idx));
+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+ reg_00.bits.ID);
+ ioapics[ioapic_idx].mp_config.apicid = reg_00.bits.ID;
+ }
+
+ /*
+ * Sanity check, is the ID really free? Every APIC in a
+ * system must have a unique ID or we get lots of nice
+ * 'stuck on smp_invalidate_needed IPI wait' messages.
+ */
+ if (apic->check_apicid_used(&phys_id_present_map,
+ mpc_ioapic_id(ioapic_idx))) {
+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
+ ioapic_idx, mpc_ioapic_id(ioapic_idx));
+ for (i = 0; i < get_physical_broadcast(); i++)
+ if (!physid_isset(i, phys_id_present_map))
+ break;
+ if (i >= get_physical_broadcast())
+ panic("Max APIC ID exceeded!\n");
+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+ i);
+ physid_set(i, phys_id_present_map);
+ ioapics[ioapic_idx].mp_config.apicid = i;
+ } else {
+ physid_mask_t tmp;
+ apic->apicid_to_cpu_present(mpc_ioapic_id(ioapic_idx),
+ &tmp);
+ apic_printk(APIC_VERBOSE, "Setting %d in the "
+ "phys_id_present_map\n",
+ mpc_ioapic_id(ioapic_idx));
+ physids_or(phys_id_present_map, phys_id_present_map, tmp);
+ }
+
+ /*
+ * We need to adjust the IRQ routing table
+ * if the ID changed.
+ */
+ if (old_id != mpc_ioapic_id(ioapic_idx))
+ for (i = 0; i < mp_irq_entries; i++)
+ if (mp_irqs[i].dstapic == old_id)
+ mp_irqs[i].dstapic
+ = mpc_ioapic_id(ioapic_idx);
+
+ /*
+ * Update the ID register according to the right value
+ * from the MPC table if they are different.
+ */
+ if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID)
+ continue;
+
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "...changing IO-APIC physical APIC ID to %d ...",
+ mpc_ioapic_id(ioapic_idx));
+
+ reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(ioapic_idx, 0, reg_00.raw);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /*
+ * Sanity check
+ */
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx))
+ pr_cont("could not set ID!\n");
+ else
+ apic_printk(APIC_VERBOSE, " ok.\n");
+ }
+}
+
+void __init setup_ioapic_ids_from_mpc(void)
+{
+
+ if (acpi_ioapic)
+ return;
+ /*
+ * Don't check I/O APIC IDs for xAPIC systems. They have
+ * no meaning without the serial APIC bus.
+ */
+ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ || APIC_XAPIC(boot_cpu_apic_version))
+ return;
+ setup_ioapic_ids_from_mpc_nocheck();
+}
+#endif
+
+int no_timer_check __initdata;
+
+static int __init notimercheck(char *s)
+{
+ no_timer_check = 1;
+ return 1;
+}
+__setup("no_timer_check", notimercheck);
+
+static void __init delay_with_tsc(void)
+{
+ unsigned long long start, now;
+ unsigned long end = jiffies + 4;
+
+ start = rdtsc();
+
+ /*
+ * We don't know the TSC frequency yet, but waiting for
+ * 40000000000/HZ TSC cycles is safe:
+ * 4 GHz == 10 jiffies
+ * 1 GHz == 40 jiffies
+ */
+ do {
+ rep_nop();
+ now = rdtsc();
+ } while ((now - start) < 40000000000ULL / HZ &&
+ time_before_eq(jiffies, end));
+}
+
+static void __init delay_without_tsc(void)
+{
+ unsigned long end = jiffies + 4;
+ int band = 1;
+
+ /*
+ * We don't know any frequency yet, but waiting for
+ * 40940000000/HZ cycles is safe:
+ * 4 GHz == 10 jiffies
+ * 1 GHz == 40 jiffies
+ * 1 << 1 + 1 << 2 +...+ 1 << 11 = 4094
+ */
+ do {
+ __delay(((1U << band++) * 10000000UL) / HZ);
+ } while (band < 12 && time_before_eq(jiffies, end));
+}
+
+/*
+ * There is a nasty bug in some older SMP boards, their mptable lies
+ * about the timer IRQ. We do the following to work around the situation:
+ *
+ * - timer IRQ defaults to IO-APIC IRQ
+ * - if this function detects that timer IRQs are defunct, then we fall
+ * back to ISA timer IRQs
+ */
+static int __init timer_irq_works(void)
+{
+ unsigned long t1 = jiffies;
+ unsigned long flags;
+
+ if (no_timer_check)
+ return 1;
+
+ local_save_flags(flags);
+ local_irq_enable();
+
+ if (boot_cpu_has(X86_FEATURE_TSC))
+ delay_with_tsc();
+ else
+ delay_without_tsc();
+
+ local_irq_restore(flags);
+
+ /*
+ * Expect a few ticks at least, to be sure some possible
+ * glue logic does not lock up after one or two first
+ * ticks in a non-ExtINT mode. Also the local APIC
+ * might have cached one ExtINT interrupt. Finally, at
+ * least one tick may be lost due to delays.
+ */
+
+ /* jiffies wrap? */
+ if (time_after(jiffies, t1 + 4))
+ return 1;
+ return 0;
+}
+
+/*
+ * In the SMP+IOAPIC case it might happen that there are an unspecified
+ * number of pending IRQ events unhandled. These cases are very rare,
+ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
+ * better to do it this way as thus we do not have to be aware of
+ * 'pending' interrupts in the IRQ path, except at this point.
+ */
+/*
+ * Edge triggered needs to resend any interrupt
+ * that was delayed but this is now handled in the device
+ * independent code.
+ */
+
+/*
+ * Starting up a edge-triggered IO-APIC interrupt is
+ * nasty - we need to make sure that we get the edge.
+ * If it is already asserted for some reason, we need
+ * return 1 to indicate that is was pending.
+ *
+ * This is not complete - we should be able to fake
+ * an edge even if it isn't on the 8259A...
+ */
+static unsigned int startup_ioapic_irq(struct irq_data *data)
+{
+ int was_pending = 0, irq = data->irq;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ if (irq < nr_legacy_irqs()) {
+ legacy_pic->mask(irq);
+ if (legacy_pic->irq_pending(irq))
+ was_pending = 1;
+ }
+ __unmask_ioapic(data->chip_data);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return was_pending;
+}
+
+atomic_t irq_mis_count;
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+static bool io_apic_level_ack_pending(struct mp_chip_data *data)
+{
+ struct irq_pin_list *entry;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ for_each_irq_pin(entry, data->irq_2_pin) {
+ unsigned int reg;
+ int pin;
+
+ pin = entry->pin;
+ reg = io_apic_read(entry->apic, 0x10 + pin*2);
+ /* Is the remote IRR bit set? */
+ if (reg & IO_APIC_REDIR_REMOTE_IRR) {
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ return true;
+ }
+ }
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return false;
+}
+
+static inline bool ioapic_irqd_mask(struct irq_data *data)
+{
+ /* If we are moving the IRQ we need to mask it */
+ if (unlikely(irqd_is_setaffinity_pending(data))) {
+ if (!irqd_irq_masked(data))
+ mask_ioapic_irq(data);
+ return true;
+ }
+ return false;
+}
+
+static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
+{
+ if (unlikely(masked)) {
+ /* Only migrate the irq if the ack has been received.
+ *
+ * On rare occasions the broadcast level triggered ack gets
+ * delayed going to ioapics, and if we reprogram the
+ * vector while Remote IRR is still set the irq will never
+ * fire again.
+ *
+ * To prevent this scenario we read the Remote IRR bit
+ * of the ioapic. This has two effects.
+ * - On any sane system the read of the ioapic will
+ * flush writes (and acks) going to the ioapic from
+ * this cpu.
+ * - We get to see if the ACK has actually been delivered.
+ *
+ * Based on failed experiments of reprogramming the
+ * ioapic entry from outside of irq context starting
+ * with masking the ioapic entry and then polling until
+ * Remote IRR was clear before reprogramming the
+ * ioapic I don't trust the Remote IRR bit to be
+ * completey accurate.
+ *
+ * However there appears to be no other way to plug
+ * this race, so if the Remote IRR bit is not
+ * accurate and is causing problems then it is a hardware bug
+ * and you can go talk to the chipset vendor about it.
+ */
+ if (!io_apic_level_ack_pending(data->chip_data))
+ irq_move_masked_irq(data);
+ /* If the IRQ is masked in the core, leave it: */
+ if (!irqd_irq_masked(data))
+ unmask_ioapic_irq(data);
+ }
+}
+#else
+static inline bool ioapic_irqd_mask(struct irq_data *data)
+{
+ return false;
+}
+static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
+{
+}
+#endif
+
+static void ioapic_ack_level(struct irq_data *irq_data)
+{
+ struct irq_cfg *cfg = irqd_cfg(irq_data);
+ unsigned long v;
+ bool masked;
+ int i;
+
+ irq_complete_move(cfg);
+ masked = ioapic_irqd_mask(irq_data);
+
+ /*
+ * It appears there is an erratum which affects at least version 0x11
+ * of I/O APIC (that's the 82093AA and cores integrated into various
+ * chipsets). Under certain conditions a level-triggered interrupt is
+ * erroneously delivered as edge-triggered one but the respective IRR
+ * bit gets set nevertheless. As a result the I/O unit expects an EOI
+ * message but it will never arrive and further interrupts are blocked
+ * from the source. The exact reason is so far unknown, but the
+ * phenomenon was observed when two consecutive interrupt requests
+ * from a given source get delivered to the same CPU and the source is
+ * temporarily disabled in between.
+ *
+ * A workaround is to simulate an EOI message manually. We achieve it
+ * by setting the trigger mode to edge and then to level when the edge
+ * trigger mode gets detected in the TMR of a local APIC for a
+ * level-triggered interrupt. We mask the source for the time of the
+ * operation to prevent an edge-triggered interrupt escaping meanwhile.
+ * The idea is from Manfred Spraul. --macro
+ *
+ * Also in the case when cpu goes offline, fixup_irqs() will forward
+ * any unhandled interrupt on the offlined cpu to the new cpu
+ * destination that is handling the corresponding interrupt. This
+ * interrupt forwarding is done via IPI's. Hence, in this case also
+ * level-triggered io-apic interrupt will be seen as an edge
+ * interrupt in the IRR. And we can't rely on the cpu's EOI
+ * to be broadcasted to the IO-APIC's which will clear the remoteIRR
+ * corresponding to the level-triggered interrupt. Hence on IO-APIC's
+ * supporting EOI register, we do an explicit EOI to clear the
+ * remote IRR and on IO-APIC's which don't have an EOI register,
+ * we use the above logic (mask+edge followed by unmask+level) from
+ * Manfred Spraul to clear the remote IRR.
+ */
+ i = cfg->vector;
+ v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
+
+ /*
+ * We must acknowledge the irq before we move it or the acknowledge will
+ * not propagate properly.
+ */
+ ack_APIC_irq();
+
+ /*
+ * Tail end of clearing remote IRR bit (either by delivering the EOI
+ * message via io-apic EOI register write or simulating it using
+ * mask+edge followed by unnask+level logic) manually when the
+ * level triggered interrupt is seen as the edge triggered interrupt
+ * at the cpu.
+ */
+ if (!(v & (1 << (i & 0x1f)))) {
+ atomic_inc(&irq_mis_count);
+ eoi_ioapic_pin(cfg->vector, irq_data->chip_data);
+ }
+
+ ioapic_irqd_unmask(irq_data, masked);
+}
+
+static void ioapic_ir_ack_level(struct irq_data *irq_data)
+{
+ struct mp_chip_data *data = irq_data->chip_data;
+
+ /*
+ * Intr-remapping uses pin number as the virtual vector
+ * in the RTE. Actual vector is programmed in
+ * intr-remapping table entry. Hence for the io-apic
+ * EOI we use the pin number.
+ */
+ apic_ack_irq(irq_data);
+ eoi_ioapic_pin(data->entry.vector, data);
+}
+
+static void ioapic_configure_entry(struct irq_data *irqd)
+{
+ struct mp_chip_data *mpd = irqd->chip_data;
+ struct irq_cfg *cfg = irqd_cfg(irqd);
+ struct irq_pin_list *entry;
+
+ /*
+ * Only update when the parent is the vector domain, don't touch it
+ * if the parent is the remapping domain. Check the installed
+ * ioapic chip to verify that.
+ */
+ if (irqd->chip == &ioapic_chip) {
+ mpd->entry.dest = cfg->dest_apicid;
+ mpd->entry.vector = cfg->vector;
+ }
+ for_each_irq_pin(entry, mpd->irq_2_pin)
+ __ioapic_write_entry(entry->apic, entry->pin, mpd->entry);
+}
+
+static int ioapic_set_affinity(struct irq_data *irq_data,
+ const struct cpumask *mask, bool force)
+{
+ struct irq_data *parent = irq_data->parent_data;
+ unsigned long flags;
+ int ret;
+
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE)
+ ioapic_configure_entry(irq_data);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return ret;
+}
+
+/*
+ * Interrupt shutdown masks the ioapic pin, but the interrupt might already
+ * be in flight, but not yet serviced by the target CPU. That means
+ * __synchronize_hardirq() would return and claim that everything is calmed
+ * down. So free_irq() would proceed and deactivate the interrupt and free
+ * resources.
+ *
+ * Once the target CPU comes around to service it it will find a cleared
+ * vector and complain. While the spurious interrupt is harmless, the full
+ * release of resources might prevent the interrupt from being acknowledged
+ * which keeps the hardware in a weird state.
+ *
+ * Verify that the corresponding Remote-IRR bits are clear.
+ */
+static int ioapic_irq_get_chip_state(struct irq_data *irqd,
+ enum irqchip_irq_state which,
+ bool *state)
+{
+ struct mp_chip_data *mcd = irqd->chip_data;
+ struct IO_APIC_route_entry rentry;
+ struct irq_pin_list *p;
+
+ if (which != IRQCHIP_STATE_ACTIVE)
+ return -EINVAL;
+
+ *state = false;
+ raw_spin_lock(&ioapic_lock);
+ for_each_irq_pin(p, mcd->irq_2_pin) {
+ rentry = __ioapic_read_entry(p->apic, p->pin);
+ /*
+ * The remote IRR is only valid in level trigger mode. It's
+ * meaning is undefined for edge triggered interrupts and
+ * irrelevant because the IO-APIC treats them as fire and
+ * forget.
+ */
+ if (rentry.irr && rentry.trigger) {
+ *state = true;
+ break;
+ }
+ }
+ raw_spin_unlock(&ioapic_lock);
+ return 0;
+}
+
+static struct irq_chip ioapic_chip __read_mostly = {
+ .name = "IO-APIC",
+ .irq_startup = startup_ioapic_irq,
+ .irq_mask = mask_ioapic_irq,
+ .irq_unmask = unmask_ioapic_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_eoi = ioapic_ack_level,
+ .irq_set_affinity = ioapic_set_affinity,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_get_irqchip_state = ioapic_irq_get_chip_state,
+ .flags = IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_AFFINITY_PRE_STARTUP,
+};
+
+static struct irq_chip ioapic_ir_chip __read_mostly = {
+ .name = "IR-IO-APIC",
+ .irq_startup = startup_ioapic_irq,
+ .irq_mask = mask_ioapic_irq,
+ .irq_unmask = unmask_ioapic_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_eoi = ioapic_ir_ack_level,
+ .irq_set_affinity = ioapic_set_affinity,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_get_irqchip_state = ioapic_irq_get_chip_state,
+ .flags = IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_AFFINITY_PRE_STARTUP,
+};
+
+static inline void init_IO_APIC_traps(void)
+{
+ struct irq_cfg *cfg;
+ unsigned int irq;
+
+ for_each_active_irq(irq) {
+ cfg = irq_cfg(irq);
+ if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
+ /*
+ * Hmm.. We don't have an entry for this,
+ * so default to an old-fashioned 8259
+ * interrupt if we can..
+ */
+ if (irq < nr_legacy_irqs())
+ legacy_pic->make_irq(irq);
+ else
+ /* Strange. Oh, well.. */
+ irq_set_chip(irq, &no_irq_chip);
+ }
+ }
+}
+
+/*
+ * The local APIC irq-chip implementation:
+ */
+
+static void mask_lapic_irq(struct irq_data *data)
+{
+ unsigned long v;
+
+ v = apic_read(APIC_LVT0);
+ apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
+}
+
+static void unmask_lapic_irq(struct irq_data *data)
+{
+ unsigned long v;
+
+ v = apic_read(APIC_LVT0);
+ apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
+}
+
+static void ack_lapic_irq(struct irq_data *data)
+{
+ ack_APIC_irq();
+}
+
+static struct irq_chip lapic_chip __read_mostly = {
+ .name = "local-APIC",
+ .irq_mask = mask_lapic_irq,
+ .irq_unmask = unmask_lapic_irq,
+ .irq_ack = ack_lapic_irq,
+};
+
+static void lapic_register_intr(int irq)
+{
+ irq_clear_status_flags(irq, IRQ_LEVEL);
+ irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
+ "edge");
+}
+
+/*
+ * This looks a bit hackish but it's about the only one way of sending
+ * a few INTA cycles to 8259As and any associated glue logic. ICR does
+ * not support the ExtINT mode, unfortunately. We need to send these
+ * cycles as some i82489DX-based boards have glue logic that keeps the
+ * 8259A interrupt line asserted until INTA. --macro
+ */
+static inline void __init unlock_ExtINT_logic(void)
+{
+ int apic, pin, i;
+ struct IO_APIC_route_entry entry0, entry1;
+ unsigned char save_control, save_freq_select;
+
+ pin = find_isa_irq_pin(8, mp_INT);
+ if (pin == -1) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ apic = find_isa_irq_apic(8, mp_INT);
+ if (apic == -1) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ entry0 = ioapic_read_entry(apic, pin);
+ clear_IO_APIC_pin(apic, pin);
+
+ memset(&entry1, 0, sizeof(entry1));
+
+ entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL;
+ entry1.mask = IOAPIC_UNMASKED;
+ entry1.dest = hard_smp_processor_id();
+ entry1.delivery_mode = dest_ExtINT;
+ entry1.polarity = entry0.polarity;
+ entry1.trigger = IOAPIC_EDGE;
+ entry1.vector = 0;
+
+ ioapic_write_entry(apic, pin, entry1);
+
+ save_control = CMOS_READ(RTC_CONTROL);
+ save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
+ CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
+ RTC_FREQ_SELECT);
+ CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
+
+ i = 100;
+ while (i-- > 0) {
+ mdelay(10);
+ if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
+ i -= 10;
+ }
+
+ CMOS_WRITE(save_control, RTC_CONTROL);
+ CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
+ clear_IO_APIC_pin(apic, pin);
+
+ ioapic_write_entry(apic, pin, entry0);
+}
+
+static int disable_timer_pin_1 __initdata;
+/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
+static int __init disable_timer_pin_setup(char *arg)
+{
+ disable_timer_pin_1 = 1;
+ return 0;
+}
+early_param("disable_timer_pin_1", disable_timer_pin_setup);
+
+static int mp_alloc_timer_irq(int ioapic, int pin)
+{
+ int irq = -1;
+ struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+
+ if (domain) {
+ struct irq_alloc_info info;
+
+ ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0);
+ info.ioapic_id = mpc_ioapic_id(ioapic);
+ info.ioapic_pin = pin;
+ mutex_lock(&ioapic_mutex);
+ irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info);
+ mutex_unlock(&ioapic_mutex);
+ }
+
+ return irq;
+}
+
+/*
+ * This code may look a bit paranoid, but it's supposed to cooperate with
+ * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
+ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
+ * fanatically on his truly buggy board.
+ *
+ * FIXME: really need to revamp this for all platforms.
+ */
+static inline void __init check_timer(void)
+{
+ struct irq_data *irq_data = irq_get_irq_data(0);
+ struct mp_chip_data *data = irq_data->chip_data;
+ struct irq_cfg *cfg = irqd_cfg(irq_data);
+ int node = cpu_to_node(0);
+ int apic1, pin1, apic2, pin2;
+ unsigned long flags;
+ int no_pin1 = 0;
+
+ local_irq_save(flags);
+
+ /*
+ * get/set the timer IRQ vector:
+ */
+ legacy_pic->mask(0);
+
+ /*
+ * As IRQ0 is to be enabled in the 8259A, the virtual
+ * wire has to be disabled in the local APIC. Also
+ * timer interrupts need to be acknowledged manually in
+ * the 8259A for the i82489DX when using the NMI
+ * watchdog as that APIC treats NMIs as level-triggered.
+ * The AEOI mode will finish them in the 8259A
+ * automatically.
+ */
+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+ legacy_pic->init(1);
+
+ pin1 = find_isa_irq_pin(0, mp_INT);
+ apic1 = find_isa_irq_apic(0, mp_INT);
+ pin2 = ioapic_i8259.pin;
+ apic2 = ioapic_i8259.apic;
+
+ apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
+ "apic1=%d pin1=%d apic2=%d pin2=%d\n",
+ cfg->vector, apic1, pin1, apic2, pin2);
+
+ /*
+ * Some BIOS writers are clueless and report the ExtINTA
+ * I/O APIC input from the cascaded 8259A as the timer
+ * interrupt input. So just in case, if only one pin
+ * was found above, try it both directly and through the
+ * 8259A.
+ */
+ if (pin1 == -1) {
+ panic_if_irq_remap("BIOS bug: timer not connected to IO-APIC");
+ pin1 = pin2;
+ apic1 = apic2;
+ no_pin1 = 1;
+ } else if (pin2 == -1) {
+ pin2 = pin1;
+ apic2 = apic1;
+ }
+
+ if (pin1 != -1) {
+ /* Ok, does IRQ0 through the IOAPIC work? */
+ if (no_pin1) {
+ mp_alloc_timer_irq(apic1, pin1);
+ } else {
+ /*
+ * for edge trigger, it's already unmasked,
+ * so only need to unmask if it is level-trigger
+ * do we really have level trigger timer?
+ */
+ int idx;
+ idx = find_irq_entry(apic1, pin1, mp_INT);
+ if (idx != -1 && irq_trigger(idx))
+ unmask_ioapic_irq(irq_get_irq_data(0));
+ }
+ irq_domain_deactivate_irq(irq_data);
+ irq_domain_activate_irq(irq_data, false);
+ if (timer_irq_works()) {
+ if (disable_timer_pin_1 > 0)
+ clear_IO_APIC_pin(0, pin1);
+ goto out;
+ }
+ panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC");
+ local_irq_disable();
+ clear_IO_APIC_pin(apic1, pin1);
+ if (!no_pin1)
+ apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
+ "8254 timer not connected to IO-APIC\n");
+
+ apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
+ "(IRQ0) through the 8259A ...\n");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "..... (found apic %d pin %d) ...\n", apic2, pin2);
+ /*
+ * legacy devices should be connected to IO APIC #0
+ */
+ replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2);
+ irq_domain_deactivate_irq(irq_data);
+ irq_domain_activate_irq(irq_data, false);
+ legacy_pic->unmask(0);
+ if (timer_irq_works()) {
+ apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
+ goto out;
+ }
+ /*
+ * Cleanup, just in case ...
+ */
+ local_irq_disable();
+ legacy_pic->mask(0);
+ clear_IO_APIC_pin(apic2, pin2);
+ apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
+ }
+
+ apic_printk(APIC_QUIET, KERN_INFO
+ "...trying to set up timer as Virtual Wire IRQ...\n");
+
+ lapic_register_intr(0);
+ apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
+ legacy_pic->unmask(0);
+
+ if (timer_irq_works()) {
+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
+ goto out;
+ }
+ local_irq_disable();
+ legacy_pic->mask(0);
+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
+ apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
+
+ apic_printk(APIC_QUIET, KERN_INFO
+ "...trying to set up timer as ExtINT IRQ...\n");
+
+ legacy_pic->init(0);
+ legacy_pic->make_irq(0);
+ apic_write(APIC_LVT0, APIC_DM_EXTINT);
+ legacy_pic->unmask(0);
+
+ unlock_ExtINT_logic();
+
+ if (timer_irq_works()) {
+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
+ goto out;
+ }
+ local_irq_disable();
+ apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
+ if (apic_is_x2apic_enabled())
+ apic_printk(APIC_QUIET, KERN_INFO
+ "Perhaps problem with the pre-enabled x2apic mode\n"
+ "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
+ panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
+ "report. Then try booting with the 'noapic' option.\n");
+out:
+ local_irq_restore(flags);
+}
+
+/*
+ * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
+ * to devices. However there may be an I/O APIC pin available for
+ * this interrupt regardless. The pin may be left unconnected, but
+ * typically it will be reused as an ExtINT cascade interrupt for
+ * the master 8259A. In the MPS case such a pin will normally be
+ * reported as an ExtINT interrupt in the MP table. With ACPI
+ * there is no provision for ExtINT interrupts, and in the absence
+ * of an override it would be treated as an ordinary ISA I/O APIC
+ * interrupt, that is edge-triggered and unmasked by default. We
+ * used to do this, but it caused problems on some systems because
+ * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
+ * the same ExtINT cascade interrupt to drive the local APIC of the
+ * bootstrap processor. Therefore we refrain from routing IRQ2 to
+ * the I/O APIC in all cases now. No actual device should request
+ * it anyway. --macro
+ */
+#define PIC_IRQS (1UL << PIC_CASCADE_IR)
+
+static int mp_irqdomain_create(int ioapic)
+{
+ struct irq_alloc_info info;
+ struct irq_domain *parent;
+ int hwirqs = mp_ioapic_pin_count(ioapic);
+ struct ioapic *ip = &ioapics[ioapic];
+ struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+ struct fwnode_handle *fn;
+ char *name = "IO-APIC";
+
+ if (cfg->type == IOAPIC_DOMAIN_INVALID)
+ return 0;
+
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+ info.ioapic_id = mpc_ioapic_id(ioapic);
+ parent = irq_remapping_get_ir_irq_domain(&info);
+ if (!parent)
+ parent = x86_vector_domain;
+ else
+ name = "IO-APIC-IR";
+
+ /* Handle device tree enumerated APICs proper */
+ if (cfg->dev) {
+ fn = of_node_to_fwnode(cfg->dev);
+ } else {
+ fn = irq_domain_alloc_named_id_fwnode(name, ioapic);
+ if (!fn)
+ return -ENOMEM;
+ }
+
+ ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops,
+ (void *)(long)ioapic);
+
+ if (!ip->irqdomain) {
+ /* Release fw handle if it was allocated above */
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
+ return -ENOMEM;
+ }
+
+ ip->irqdomain->parent = parent;
+
+ if (cfg->type == IOAPIC_DOMAIN_LEGACY ||
+ cfg->type == IOAPIC_DOMAIN_STRICT)
+ ioapic_dynirq_base = max(ioapic_dynirq_base,
+ gsi_cfg->gsi_end + 1);
+
+ return 0;
+}
+
+static void ioapic_destroy_irqdomain(int idx)
+{
+ struct ioapic_domain_cfg *cfg = &ioapics[idx].irqdomain_cfg;
+ struct fwnode_handle *fn = ioapics[idx].irqdomain->fwnode;
+
+ if (ioapics[idx].irqdomain) {
+ irq_domain_remove(ioapics[idx].irqdomain);
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
+ ioapics[idx].irqdomain = NULL;
+ }
+}
+
+void __init setup_IO_APIC(void)
+{
+ int ioapic;
+
+ if (skip_ioapic_setup || !nr_ioapics)
+ return;
+
+ io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL;
+
+ apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
+ for_each_ioapic(ioapic)
+ BUG_ON(mp_irqdomain_create(ioapic));
+
+ /*
+ * Set up IO-APIC IRQ routing.
+ */
+ x86_init.mpparse.setup_ioapic_ids();
+
+ sync_Arb_IDs();
+ setup_IO_APIC_irqs();
+ init_IO_APIC_traps();
+ if (nr_legacy_irqs())
+ check_timer();
+
+ ioapic_initialized = 1;
+}
+
+static void resume_ioapic_id(int ioapic_idx)
+{
+ unsigned long flags;
+ union IO_APIC_reg_00 reg_00;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) {
+ reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
+ io_apic_write(ioapic_idx, 0, reg_00.raw);
+ }
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void ioapic_resume(void)
+{
+ int ioapic_idx;
+
+ for_each_ioapic_reverse(ioapic_idx)
+ resume_ioapic_id(ioapic_idx);
+
+ restore_ioapic_entries();
+}
+
+static struct syscore_ops ioapic_syscore_ops = {
+ .suspend = save_ioapic_entries,
+ .resume = ioapic_resume,
+};
+
+static int __init ioapic_init_ops(void)
+{
+ register_syscore_ops(&ioapic_syscore_ops);
+
+ return 0;
+}
+
+device_initcall(ioapic_init_ops);
+
+static int io_apic_get_redir_entries(int ioapic)
+{
+ union IO_APIC_reg_01 reg_01;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_01.raw = io_apic_read(ioapic, 1);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /* The register returns the maximum index redir index
+ * supported, which is one less than the total number of redir
+ * entries.
+ */
+ return reg_01.bits.entries + 1;
+}
+
+unsigned int arch_dynirq_lower_bound(unsigned int from)
+{
+ /*
+ * dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use
+ * gsi_top if ioapic_dynirq_base hasn't been initialized yet.
+ */
+ if (!ioapic_initialized)
+ return gsi_top;
+ /*
+ * For DT enabled machines ioapic_dynirq_base is irrelevant and not
+ * updated. So simply return @from if ioapic_dynirq_base == 0.
+ */
+ return ioapic_dynirq_base ? : from;
+}
+
+#ifdef CONFIG_X86_32
+static int io_apic_get_unique_id(int ioapic, int apic_id)
+{
+ union IO_APIC_reg_00 reg_00;
+ static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
+ physid_mask_t tmp;
+ unsigned long flags;
+ int i = 0;
+
+ /*
+ * The P4 platform supports up to 256 APIC IDs on two separate APIC
+ * buses (one for LAPICs, one for IOAPICs), where predecessors only
+ * supports up to 16 on one shared APIC bus.
+ *
+ * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
+ * advantage of new APIC bus architecture.
+ */
+
+ if (physids_empty(apic_id_map))
+ apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ if (apic_id >= get_physical_broadcast()) {
+ printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
+ "%d\n", ioapic, apic_id, reg_00.bits.ID);
+ apic_id = reg_00.bits.ID;
+ }
+
+ /*
+ * Every APIC in a system must have a unique ID or we get lots of nice
+ * 'stuck on smp_invalidate_needed IPI wait' messages.
+ */
+ if (apic->check_apicid_used(&apic_id_map, apic_id)) {
+
+ for (i = 0; i < get_physical_broadcast(); i++) {
+ if (!apic->check_apicid_used(&apic_id_map, i))
+ break;
+ }
+
+ if (i == get_physical_broadcast())
+ panic("Max apic_id exceeded!\n");
+
+ printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
+ "trying %d\n", ioapic, apic_id, i);
+
+ apic_id = i;
+ }
+
+ apic->apicid_to_cpu_present(apic_id, &tmp);
+ physids_or(apic_id_map, apic_id_map, tmp);
+
+ if (reg_00.bits.ID != apic_id) {
+ reg_00.bits.ID = apic_id;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(ioapic, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /* Sanity check */
+ if (reg_00.bits.ID != apic_id) {
+ pr_err("IOAPIC[%d]: Unable to change apic_id!\n",
+ ioapic);
+ return -1;
+ }
+ }
+
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+
+ return apic_id;
+}
+
+static u8 io_apic_unique_id(int idx, u8 id)
+{
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+ !APIC_XAPIC(boot_cpu_apic_version))
+ return io_apic_get_unique_id(idx, id);
+ else
+ return id;
+}
+#else
+static u8 io_apic_unique_id(int idx, u8 id)
+{
+ union IO_APIC_reg_00 reg_00;
+ DECLARE_BITMAP(used, 256);
+ unsigned long flags;
+ u8 new_id;
+ int i;
+
+ bitmap_zero(used, 256);
+ for_each_ioapic(i)
+ __set_bit(mpc_ioapic_id(i), used);
+
+ /* Hand out the requested id if available */
+ if (!test_bit(id, used))
+ return id;
+
+ /*
+ * Read the current id from the ioapic and keep it if
+ * available.
+ */
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(idx, 0);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ new_id = reg_00.bits.ID;
+ if (!test_bit(new_id, used)) {
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "IOAPIC[%d]: Using reg apic_id %d instead of %d\n",
+ idx, new_id, id);
+ return new_id;
+ }
+
+ /*
+ * Get the next free id and write it to the ioapic.
+ */
+ new_id = find_first_zero_bit(used, 256);
+ reg_00.bits.ID = new_id;
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(idx, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(idx, 0);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ /* Sanity check */
+ BUG_ON(reg_00.bits.ID != new_id);
+
+ return new_id;
+}
+#endif
+
+static int io_apic_get_version(int ioapic)
+{
+ union IO_APIC_reg_01 reg_01;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_01.raw = io_apic_read(ioapic, 1);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return reg_01.bits.version;
+}
+
+int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
+{
+ int ioapic, pin, idx;
+
+ if (skip_ioapic_setup)
+ return -1;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return -1;
+
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ if (pin < 0)
+ return -1;
+
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if (idx < 0)
+ return -1;
+
+ *trigger = irq_trigger(idx);
+ *polarity = irq_polarity(idx);
+ return 0;
+}
+
+/*
+ * This function updates target affinity of IOAPIC interrupts to include
+ * the CPUs which came online during SMP bringup.
+ */
+#define IOAPIC_RESOURCE_NAME_SIZE 11
+
+static struct resource *ioapic_resources;
+
+static struct resource * __init ioapic_setup_resources(void)
+{
+ unsigned long n;
+ struct resource *res;
+ char *mem;
+ int i;
+
+ if (nr_ioapics == 0)
+ return NULL;
+
+ n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
+ n *= nr_ioapics;
+
+ mem = alloc_bootmem(n);
+ res = (void *)mem;
+
+ mem += sizeof(struct resource) * nr_ioapics;
+
+ for_each_ioapic(i) {
+ res[i].name = mem;
+ res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
+ mem += IOAPIC_RESOURCE_NAME_SIZE;
+ ioapics[i].iomem_res = &res[i];
+ }
+
+ ioapic_resources = res;
+
+ return res;
+}
+
+void __init io_apic_init_mappings(void)
+{
+ unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
+ struct resource *ioapic_res;
+ int i;
+
+ ioapic_res = ioapic_setup_resources();
+ for_each_ioapic(i) {
+ if (smp_found_config) {
+ ioapic_phys = mpc_ioapic_addr(i);
+#ifdef CONFIG_X86_32
+ if (!ioapic_phys) {
+ printk(KERN_ERR
+ "WARNING: bogus zero IO-APIC "
+ "address found in MPTABLE, "
+ "disabling IO/APIC support!\n");
+ smp_found_config = 0;
+ skip_ioapic_setup = 1;
+ goto fake_ioapic_page;
+ }
+#endif
+ } else {
+#ifdef CONFIG_X86_32
+fake_ioapic_page:
+#endif
+ ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
+ ioapic_phys = __pa(ioapic_phys);
+ }
+ set_fixmap_nocache(idx, ioapic_phys);
+ apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
+ __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
+ ioapic_phys);
+ idx++;
+
+ ioapic_res->start = ioapic_phys;
+ ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
+ ioapic_res++;
+ }
+}
+
+void __init ioapic_insert_resources(void)
+{
+ int i;
+ struct resource *r = ioapic_resources;
+
+ if (!r) {
+ if (nr_ioapics > 0)
+ printk(KERN_ERR
+ "IO APIC resources couldn't be allocated.\n");
+ return;
+ }
+
+ for_each_ioapic(i) {
+ insert_resource(&iomem_resource, r);
+ r++;
+ }
+}
+
+int mp_find_ioapic(u32 gsi)
+{
+ int i;
+
+ if (nr_ioapics == 0)
+ return -1;
+
+ /* Find the IOAPIC that manages this GSI. */
+ for_each_ioapic(i) {
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
+ if (gsi >= gsi_cfg->gsi_base && gsi <= gsi_cfg->gsi_end)
+ return i;
+ }
+
+ printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+ return -1;
+}
+
+int mp_find_ioapic_pin(int ioapic, u32 gsi)
+{
+ struct mp_ioapic_gsi *gsi_cfg;
+
+ if (WARN_ON(ioapic < 0))
+ return -1;
+
+ gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+ if (WARN_ON(gsi > gsi_cfg->gsi_end))
+ return -1;
+
+ return gsi - gsi_cfg->gsi_base;
+}
+
+static int bad_ioapic_register(int idx)
+{
+ union IO_APIC_reg_00 reg_00;
+ union IO_APIC_reg_01 reg_01;
+ union IO_APIC_reg_02 reg_02;
+
+ reg_00.raw = io_apic_read(idx, 0);
+ reg_01.raw = io_apic_read(idx, 1);
+ reg_02.raw = io_apic_read(idx, 2);
+
+ if (reg_00.raw == -1 && reg_01.raw == -1 && reg_02.raw == -1) {
+ pr_warn("I/O APIC 0x%x registers return all ones, skipping!\n",
+ mpc_ioapic_addr(idx));
+ return 1;
+ }
+
+ return 0;
+}
+
+static int find_free_ioapic_entry(void)
+{
+ int idx;
+
+ for (idx = 0; idx < MAX_IO_APICS; idx++)
+ if (ioapics[idx].nr_registers == 0)
+ return idx;
+
+ return MAX_IO_APICS;
+}
+
+/**
+ * mp_register_ioapic - Register an IOAPIC device
+ * @id: hardware IOAPIC ID
+ * @address: physical address of IOAPIC register area
+ * @gsi_base: base of GSI associated with the IOAPIC
+ * @cfg: configuration information for the IOAPIC
+ */
+int mp_register_ioapic(int id, u32 address, u32 gsi_base,
+ struct ioapic_domain_cfg *cfg)
+{
+ bool hotplug = !!ioapic_initialized;
+ struct mp_ioapic_gsi *gsi_cfg;
+ int idx, ioapic, entries;
+ u32 gsi_end;
+
+ if (!address) {
+ pr_warn("Bogus (zero) I/O APIC address found, skipping!\n");
+ return -EINVAL;
+ }
+ for_each_ioapic(ioapic)
+ if (ioapics[ioapic].mp_config.apicaddr == address) {
+ pr_warn("address 0x%x conflicts with IOAPIC%d\n",
+ address, ioapic);
+ return -EEXIST;
+ }
+
+ idx = find_free_ioapic_entry();
+ if (idx >= MAX_IO_APICS) {
+ pr_warn("Max # of I/O APICs (%d) exceeded (found %d), skipping\n",
+ MAX_IO_APICS, idx);
+ return -ENOSPC;
+ }
+
+ ioapics[idx].mp_config.type = MP_IOAPIC;
+ ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
+ ioapics[idx].mp_config.apicaddr = address;
+
+ set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+ if (bad_ioapic_register(idx)) {
+ clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
+ return -ENODEV;
+ }
+
+ ioapics[idx].mp_config.apicid = io_apic_unique_id(idx, id);
+ ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
+
+ /*
+ * Build basic GSI lookup table to facilitate gsi->io_apic lookups
+ * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
+ */
+ entries = io_apic_get_redir_entries(idx);
+ gsi_end = gsi_base + entries - 1;
+ for_each_ioapic(ioapic) {
+ gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+ if ((gsi_base >= gsi_cfg->gsi_base &&
+ gsi_base <= gsi_cfg->gsi_end) ||
+ (gsi_end >= gsi_cfg->gsi_base &&
+ gsi_end <= gsi_cfg->gsi_end)) {
+ pr_warn("GSI range [%u-%u] for new IOAPIC conflicts with GSI[%u-%u]\n",
+ gsi_base, gsi_end,
+ gsi_cfg->gsi_base, gsi_cfg->gsi_end);
+ clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
+ return -ENOSPC;
+ }
+ }
+ gsi_cfg = mp_ioapic_gsi_routing(idx);
+ gsi_cfg->gsi_base = gsi_base;
+ gsi_cfg->gsi_end = gsi_end;
+
+ ioapics[idx].irqdomain = NULL;
+ ioapics[idx].irqdomain_cfg = *cfg;
+
+ /*
+ * If mp_register_ioapic() is called during early boot stage when
+ * walking ACPI/SFI/DT tables, it's too early to create irqdomain,
+ * we are still using bootmem allocator. So delay it to setup_IO_APIC().
+ */
+ if (hotplug) {
+ if (mp_irqdomain_create(idx)) {
+ clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
+ return -ENOMEM;
+ }
+ alloc_ioapic_saved_registers(idx);
+ }
+
+ if (gsi_cfg->gsi_end >= gsi_top)
+ gsi_top = gsi_cfg->gsi_end + 1;
+ if (nr_ioapics <= idx)
+ nr_ioapics = idx + 1;
+
+ /* Set nr_registers to mark entry present */
+ ioapics[idx].nr_registers = entries;
+
+ pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n",
+ idx, mpc_ioapic_id(idx),
+ mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
+ gsi_cfg->gsi_base, gsi_cfg->gsi_end);
+
+ return 0;
+}
+
+int mp_unregister_ioapic(u32 gsi_base)
+{
+ int ioapic, pin;
+ int found = 0;
+
+ for_each_ioapic(ioapic)
+ if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) {
+ found = 1;
+ break;
+ }
+ if (!found) {
+ pr_warn("can't find IOAPIC for GSI %d\n", gsi_base);
+ return -ENODEV;
+ }
+
+ for_each_pin(ioapic, pin) {
+ u32 gsi = mp_pin_to_gsi(ioapic, pin);
+ int irq = mp_map_gsi_to_irq(gsi, 0, NULL);
+ struct mp_chip_data *data;
+
+ if (irq >= 0) {
+ data = irq_get_chip_data(irq);
+ if (data && data->count) {
+ pr_warn("pin%d on IOAPIC%d is still in use.\n",
+ pin, ioapic);
+ return -EBUSY;
+ }
+ }
+ }
+
+ /* Mark entry not present */
+ ioapics[ioapic].nr_registers = 0;
+ ioapic_destroy_irqdomain(ioapic);
+ free_ioapic_saved_registers(ioapic);
+ if (ioapics[ioapic].iomem_res)
+ release_resource(ioapics[ioapic].iomem_res);
+ clear_fixmap(FIX_IO_APIC_BASE_0 + ioapic);
+ memset(&ioapics[ioapic], 0, sizeof(ioapics[ioapic]));
+
+ return 0;
+}
+
+int mp_ioapic_registered(u32 gsi_base)
+{
+ int ioapic;
+
+ for_each_ioapic(ioapic)
+ if (ioapics[ioapic].gsi_config.gsi_base == gsi_base)
+ return 1;
+
+ return 0;
+}
+
+static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data,
+ struct irq_alloc_info *info)
+{
+ if (info && info->ioapic_valid) {
+ data->trigger = info->ioapic_trigger;
+ data->polarity = info->ioapic_polarity;
+ } else if (acpi_get_override_irq(gsi, &data->trigger,
+ &data->polarity) < 0) {
+ /* PCI interrupts are always active low level triggered. */
+ data->trigger = IOAPIC_LEVEL;
+ data->polarity = IOAPIC_POL_LOW;
+ }
+}
+
+static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data,
+ struct IO_APIC_route_entry *entry)
+{
+ memset(entry, 0, sizeof(*entry));
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
+ entry->dest = cfg->dest_apicid;
+ entry->vector = cfg->vector;
+ entry->trigger = data->trigger;
+ entry->polarity = data->polarity;
+ /*
+ * Mask level triggered irqs. Edge triggered irqs are masked
+ * by the irq core code in case they fire.
+ */
+ if (data->trigger == IOAPIC_LEVEL)
+ entry->mask = IOAPIC_MASKED;
+ else
+ entry->mask = IOAPIC_UNMASKED;
+}
+
+int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ int ret, ioapic, pin;
+ struct irq_cfg *cfg;
+ struct irq_data *irq_data;
+ struct mp_chip_data *data;
+ struct irq_alloc_info *info = arg;
+ unsigned long flags;
+
+ if (!info || nr_irqs > 1)
+ return -EINVAL;
+ irq_data = irq_domain_get_irq_data(domain, virq);
+ if (!irq_data)
+ return -EINVAL;
+
+ ioapic = mp_irqdomain_ioapic_idx(domain);
+ pin = info->ioapic_pin;
+ if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0)
+ return -EEXIST;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ info->ioapic_entry = &data->entry;
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
+ if (ret < 0) {
+ kfree(data);
+ return ret;
+ }
+
+ INIT_LIST_HEAD(&data->irq_2_pin);
+ irq_data->hwirq = info->ioapic_pin;
+ irq_data->chip = (domain->parent == x86_vector_domain) ?
+ &ioapic_chip : &ioapic_ir_chip;
+ irq_data->chip_data = data;
+ mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info);
+
+ cfg = irqd_cfg(irq_data);
+ add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin);
+
+ local_irq_save(flags);
+ if (info->ioapic_entry)
+ mp_setup_entry(cfg, data, info->ioapic_entry);
+ mp_register_handler(virq, data->trigger);
+ if (virq < nr_legacy_irqs())
+ legacy_pic->mask(virq);
+ local_irq_restore(flags);
+
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n",
+ ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector,
+ virq, data->trigger, data->polarity, cfg->dest_apicid);
+
+ return 0;
+}
+
+void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ struct irq_data *irq_data;
+ struct mp_chip_data *data;
+
+ BUG_ON(nr_irqs != 1);
+ irq_data = irq_domain_get_irq_data(domain, virq);
+ if (irq_data && irq_data->chip_data) {
+ data = irq_data->chip_data;
+ __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain),
+ (int)irq_data->hwirq);
+ WARN_ON(!list_empty(&data->irq_2_pin));
+ kfree(irq_data->chip_data);
+ }
+ irq_domain_free_irqs_top(domain, virq, nr_irqs);
+}
+
+int mp_irqdomain_activate(struct irq_domain *domain,
+ struct irq_data *irq_data, bool reserve)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ ioapic_configure_entry(irq_data);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ return 0;
+}
+
+void mp_irqdomain_deactivate(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ /* It won't be called for IRQ with multiple IOAPIC pins associated */
+ ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain),
+ (int)irq_data->hwirq);
+}
+
+int mp_irqdomain_ioapic_idx(struct irq_domain *domain)
+{
+ return (int)(long)domain->host_data;
+}
+
+const struct irq_domain_ops mp_ioapic_irqdomain_ops = {
+ .alloc = mp_irqdomain_alloc,
+ .free = mp_irqdomain_free,
+ .activate = mp_irqdomain_activate,
+ .deactivate = mp_irqdomain_deactivate,
+};
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
new file mode 100644
index 000000000..82f9244fe
--- /dev/null
+++ b/arch/x86/kernel/apic/ipi.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpumask.h>
+#include <linux/interrupt.h>
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/cache.h>
+#include <linux/cpu.h>
+
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/apic.h>
+#include <asm/proto.h>
+#include <asm/ipi.h>
+
+void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest)
+{
+ /*
+ * Subtle. In the case of the 'never do double writes' workaround
+ * we have to lock out interrupts to be safe. As we don't care
+ * of the value read we use an atomic rmw access to avoid costly
+ * cli/sti. Otherwise we use an even cheaper single atomic write
+ * to the APIC.
+ */
+ unsigned int cfg;
+
+ /*
+ * Wait for idle.
+ */
+ __xapic_wait_icr_idle();
+
+ /*
+ * No need to touch the target chip field
+ */
+ cfg = __prepare_ICR(shortcut, vector, dest);
+
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ native_apic_mem_write(APIC_ICR, cfg);
+}
+
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ */
+void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest)
+{
+ unsigned long cfg;
+
+ /*
+ * Wait for idle.
+ */
+ if (unlikely(vector == NMI_VECTOR))
+ safe_apic_wait_icr_idle();
+ else
+ __xapic_wait_icr_idle();
+
+ /*
+ * prepare target chip field
+ */
+ cfg = __prepare_ICR2(mask);
+ native_apic_mem_write(APIC_ICR2, cfg);
+
+ /*
+ * program the ICR
+ */
+ cfg = __prepare_ICR(0, vector, dest);
+
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ native_apic_mem_write(APIC_ICR, cfg);
+}
+
+void default_send_IPI_single_phys(int cpu, int vector)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, cpu),
+ vector, APIC_DEST_PHYSICAL);
+ local_irq_restore(flags);
+}
+
+void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
+{
+ unsigned long query_cpu;
+ unsigned long flags;
+
+ /*
+ * Hack. The clustered APIC addressing mode doesn't allow us to send
+ * to an arbitrary mask, so I do a unicast to each CPU instead.
+ * - mbligh
+ */
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask) {
+ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
+ query_cpu), vector, APIC_DEST_PHYSICAL);
+ }
+ local_irq_restore(flags);
+}
+
+void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
+ int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int query_cpu;
+ unsigned long flags;
+
+ /* See Hack comment above */
+
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask) {
+ if (query_cpu == this_cpu)
+ continue;
+ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
+ query_cpu), vector, APIC_DEST_PHYSICAL);
+ }
+ local_irq_restore(flags);
+}
+
+/*
+ * Helper function for APICs which insist on cpumasks
+ */
+void default_send_IPI_single(int cpu, int vector)
+{
+ apic->send_IPI_mask(cpumask_of(cpu), vector);
+}
+
+#ifdef CONFIG_X86_32
+
+void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
+ int vector)
+{
+ unsigned long flags;
+ unsigned int query_cpu;
+
+ /*
+ * Hack. The clustered APIC addressing mode doesn't allow us to send
+ * to an arbitrary mask, so I do a unicasts to each CPU instead. This
+ * should be modified to do 1 message per cluster ID - mbligh
+ */
+
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask)
+ __default_send_IPI_dest_field(
+ early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+ vector, apic->dest_logical);
+ local_irq_restore(flags);
+}
+
+void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
+ int vector)
+{
+ unsigned long flags;
+ unsigned int query_cpu;
+ unsigned int this_cpu = smp_processor_id();
+
+ /* See Hack comment above */
+
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask) {
+ if (query_cpu == this_cpu)
+ continue;
+ __default_send_IPI_dest_field(
+ early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+ vector, apic->dest_logical);
+ }
+ local_irq_restore(flags);
+}
+
+/*
+ * This is only used on smaller machines.
+ */
+void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
+{
+ unsigned long mask = cpumask_bits(cpumask)[0];
+ unsigned long flags;
+
+ if (!mask)
+ return;
+
+ local_irq_save(flags);
+ WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
+ __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
+ local_irq_restore(flags);
+}
+
+void default_send_IPI_allbutself(int vector)
+{
+ /*
+ * if there are no other CPUs in the system then we get an APIC send
+ * error if we try to broadcast, thus avoid sending IPIs in this case.
+ */
+ if (!(num_online_cpus() > 1))
+ return;
+
+ __default_local_send_IPI_allbutself(vector);
+}
+
+void default_send_IPI_all(int vector)
+{
+ __default_local_send_IPI_all(vector);
+}
+
+void default_send_IPI_self(int vector)
+{
+ __default_send_IPI_shortcut(APIC_DEST_SELF, vector, apic->dest_logical);
+}
+
+/* must come after the send_IPI functions above for inlining */
+static int convert_apicid_to_cpu(int apic_id)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
+ return i;
+ }
+ return -1;
+}
+
+int safe_smp_processor_id(void)
+{
+ int apicid, cpuid;
+
+ if (!boot_cpu_has(X86_FEATURE_APIC))
+ return 0;
+
+ apicid = hard_smp_processor_id();
+ if (apicid == BAD_APICID)
+ return 0;
+
+ cpuid = convert_apicid_to_cpu(apicid);
+
+ return cpuid >= 0 ? cpuid : 0;
+}
+#endif
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
new file mode 100644
index 000000000..ca17a3848
--- /dev/null
+++ b/arch/x86/kernel/apic/msi.c
@@ -0,0 +1,522 @@
+/*
+ * Support of MSI, HPET and DMAR interrupts.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
+ * Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ * Convert to hierarchical irqdomain
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/hpet.h>
+#include <linux/msi.h>
+#include <asm/irqdomain.h>
+#include <asm/msidef.h>
+#include <asm/hpet.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/irq_remapping.h>
+
+static struct irq_domain *msi_default_domain;
+
+static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg)
+{
+ msg->address_hi = MSI_ADDR_BASE_HI;
+
+ if (x2apic_enabled())
+ msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid);
+
+ msg->address_lo =
+ MSI_ADDR_BASE_LO |
+ ((apic->irq_dest_mode == 0) ?
+ MSI_ADDR_DEST_MODE_PHYSICAL :
+ MSI_ADDR_DEST_MODE_LOGICAL) |
+ MSI_ADDR_REDIRECTION_CPU |
+ MSI_ADDR_DEST_ID(cfg->dest_apicid);
+
+ msg->data =
+ MSI_DATA_TRIGGER_EDGE |
+ MSI_DATA_LEVEL_ASSERT |
+ MSI_DATA_DELIVERY_FIXED |
+ MSI_DATA_VECTOR(cfg->vector);
+}
+
+static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ __irq_msi_compose_msg(irqd_cfg(data), msg);
+}
+
+static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg)
+{
+ struct msi_msg msg[2] = { [1] = { }, };
+
+ __irq_msi_compose_msg(cfg, msg);
+ irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg);
+}
+
+static int
+msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
+{
+ struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd);
+ struct irq_data *parent = irqd->parent_data;
+ unsigned int cpu;
+ int ret;
+
+ /* Save the current configuration */
+ cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd));
+ old_cfg = *cfg;
+
+ /* Allocate a new target vector */
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+ return ret;
+
+ /*
+ * For non-maskable and non-remapped MSI interrupts the migration
+ * to a different destination CPU and a different vector has to be
+ * done careful to handle the possible stray interrupt which can be
+ * caused by the non-atomic update of the address/data pair.
+ *
+ * Direct update is possible when:
+ * - The MSI is maskable (remapped MSI does not use this code path)).
+ * The quirk bit is not set in this case.
+ * - The new vector is the same as the old vector
+ * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up)
+ * - The interrupt is not yet started up
+ * - The new destination CPU is the same as the old destination CPU
+ */
+ if (!irqd_msi_nomask_quirk(irqd) ||
+ cfg->vector == old_cfg.vector ||
+ old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR ||
+ !irqd_is_started(irqd) ||
+ cfg->dest_apicid == old_cfg.dest_apicid) {
+ irq_msi_update_msg(irqd, cfg);
+ return ret;
+ }
+
+ /*
+ * Paranoia: Validate that the interrupt target is the local
+ * CPU.
+ */
+ if (WARN_ON_ONCE(cpu != smp_processor_id())) {
+ irq_msi_update_msg(irqd, cfg);
+ return ret;
+ }
+
+ /*
+ * Redirect the interrupt to the new vector on the current CPU
+ * first. This might cause a spurious interrupt on this vector if
+ * the device raises an interrupt right between this update and the
+ * update to the final destination CPU.
+ *
+ * If the vector is in use then the installed device handler will
+ * denote it as spurious which is no harm as this is a rare event
+ * and interrupt handlers have to cope with spurious interrupts
+ * anyway. If the vector is unused, then it is marked so it won't
+ * trigger the 'No irq handler for vector' warning in do_IRQ().
+ *
+ * This requires to hold vector lock to prevent concurrent updates to
+ * the affected vector.
+ */
+ lock_vector_lock();
+
+ /*
+ * Mark the new target vector on the local CPU if it is currently
+ * unused. Reuse the VECTOR_RETRIGGERED state which is also used in
+ * the CPU hotplug path for a similar purpose. This cannot be
+ * undone here as the current CPU has interrupts disabled and
+ * cannot handle the interrupt before the whole set_affinity()
+ * section is done. In the CPU unplug case, the current CPU is
+ * about to vanish and will not handle any interrupts anymore. The
+ * vector is cleaned up when the CPU comes online again.
+ */
+ if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector])))
+ this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED);
+
+ /* Redirect it to the new vector on the local CPU temporarily */
+ old_cfg.vector = cfg->vector;
+ irq_msi_update_msg(irqd, &old_cfg);
+
+ /* Now transition it to the target CPU */
+ irq_msi_update_msg(irqd, cfg);
+
+ /*
+ * All interrupts after this point are now targeted at the new
+ * vector/CPU.
+ *
+ * Drop vector lock before testing whether the temporary assignment
+ * to the local CPU was hit by an interrupt raised in the device,
+ * because the retrigger function acquires vector lock again.
+ */
+ unlock_vector_lock();
+
+ /*
+ * Check whether the transition raced with a device interrupt and
+ * is pending in the local APICs IRR. It is safe to do this outside
+ * of vector lock as the irq_desc::lock of this interrupt is still
+ * held and interrupts are disabled: The check is not accessing the
+ * underlying vector store. It's just checking the local APIC's
+ * IRR.
+ */
+ if (lapic_vector_set_in_irr(cfg->vector))
+ irq_data_get_irq_chip(irqd)->irq_retrigger(irqd);
+
+ return ret;
+}
+
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+static struct irq_chip pci_msi_controller = {
+ .name = "PCI-MSI",
+ .irq_unmask = pci_msi_unmask_irq,
+ .irq_mask = pci_msi_mask_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_compose_msi_msg = irq_msi_compose_msg,
+ .irq_set_affinity = msi_set_affinity,
+ .flags = IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_AFFINITY_PRE_STARTUP,
+};
+
+int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ struct irq_domain *domain;
+ struct irq_alloc_info info;
+
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_MSI;
+ info.msi_dev = dev;
+
+ domain = irq_remapping_get_irq_domain(&info);
+ if (domain == NULL)
+ domain = msi_default_domain;
+ if (domain == NULL)
+ return -ENOSYS;
+
+ return msi_domain_alloc_irqs(domain, &dev->dev, nvec);
+}
+
+void native_teardown_msi_irq(unsigned int irq)
+{
+ irq_domain_free_irqs(irq, 1);
+}
+
+static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info,
+ msi_alloc_info_t *arg)
+{
+ return arg->msi_hwirq;
+}
+
+int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
+ msi_alloc_info_t *arg)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct msi_desc *desc = first_pci_msi_entry(pdev);
+
+ init_irq_alloc_info(arg, NULL);
+ arg->msi_dev = pdev;
+ if (desc->msi_attrib.is_msix) {
+ arg->type = X86_IRQ_ALLOC_TYPE_MSIX;
+ } else {
+ arg->type = X86_IRQ_ALLOC_TYPE_MSI;
+ arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pci_msi_prepare);
+
+void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
+{
+ arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc);
+}
+EXPORT_SYMBOL_GPL(pci_msi_set_desc);
+
+static struct msi_domain_ops pci_msi_domain_ops = {
+ .get_hwirq = pci_msi_get_hwirq,
+ .msi_prepare = pci_msi_prepare,
+ .set_desc = pci_msi_set_desc,
+};
+
+static struct msi_domain_info pci_msi_domain_info = {
+ .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_PCI_MSIX,
+ .ops = &pci_msi_domain_ops,
+ .chip = &pci_msi_controller,
+ .handler = handle_edge_irq,
+ .handler_name = "edge",
+};
+
+void __init arch_init_msi_domain(struct irq_domain *parent)
+{
+ struct fwnode_handle *fn;
+
+ if (disable_apic)
+ return;
+
+ fn = irq_domain_alloc_named_fwnode("PCI-MSI");
+ if (fn) {
+ msi_default_domain =
+ pci_msi_create_irq_domain(fn, &pci_msi_domain_info,
+ parent);
+ }
+ if (!msi_default_domain) {
+ irq_domain_free_fwnode(fn);
+ pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
+ } else {
+ msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK;
+ }
+}
+
+#ifdef CONFIG_IRQ_REMAP
+static struct irq_chip pci_msi_ir_controller = {
+ .name = "IR-PCI-MSI",
+ .irq_unmask = pci_msi_unmask_irq,
+ .irq_mask = pci_msi_mask_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_set_vcpu_affinity = irq_chip_set_vcpu_affinity_parent,
+ .flags = IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_AFFINITY_PRE_STARTUP,
+};
+
+static struct msi_domain_info pci_msi_ir_domain_info = {
+ .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX,
+ .ops = &pci_msi_domain_ops,
+ .chip = &pci_msi_ir_controller,
+ .handler = handle_edge_irq,
+ .handler_name = "edge",
+};
+
+struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent,
+ const char *name, int id)
+{
+ struct fwnode_handle *fn;
+ struct irq_domain *d;
+
+ fn = irq_domain_alloc_named_id_fwnode(name, id);
+ if (!fn)
+ return NULL;
+ d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent);
+ if (!d)
+ irq_domain_free_fwnode(fn);
+ return d;
+}
+#endif
+
+#ifdef CONFIG_DMAR_TABLE
+static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ dmar_msi_write(data->irq, msg);
+}
+
+static struct irq_chip dmar_msi_controller = {
+ .name = "DMAR-MSI",
+ .irq_unmask = dmar_msi_unmask,
+ .irq_mask = dmar_msi_mask,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_set_affinity = msi_domain_set_affinity,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_compose_msi_msg = irq_msi_compose_msg,
+ .irq_write_msi_msg = dmar_msi_write_msg,
+ .flags = IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_AFFINITY_PRE_STARTUP,
+};
+
+static irq_hw_number_t dmar_msi_get_hwirq(struct msi_domain_info *info,
+ msi_alloc_info_t *arg)
+{
+ return arg->dmar_id;
+}
+
+static int dmar_msi_init(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq,
+ irq_hw_number_t hwirq, msi_alloc_info_t *arg)
+{
+ irq_domain_set_info(domain, virq, arg->dmar_id, info->chip, NULL,
+ handle_edge_irq, arg->dmar_data, "edge");
+
+ return 0;
+}
+
+static struct msi_domain_ops dmar_msi_domain_ops = {
+ .get_hwirq = dmar_msi_get_hwirq,
+ .msi_init = dmar_msi_init,
+};
+
+static struct msi_domain_info dmar_msi_domain_info = {
+ .ops = &dmar_msi_domain_ops,
+ .chip = &dmar_msi_controller,
+};
+
+static struct irq_domain *dmar_get_irq_domain(void)
+{
+ static struct irq_domain *dmar_domain;
+ static DEFINE_MUTEX(dmar_lock);
+ struct fwnode_handle *fn;
+
+ mutex_lock(&dmar_lock);
+ if (dmar_domain)
+ goto out;
+
+ fn = irq_domain_alloc_named_fwnode("DMAR-MSI");
+ if (fn) {
+ dmar_domain = msi_create_irq_domain(fn, &dmar_msi_domain_info,
+ x86_vector_domain);
+ if (!dmar_domain)
+ irq_domain_free_fwnode(fn);
+ }
+out:
+ mutex_unlock(&dmar_lock);
+ return dmar_domain;
+}
+
+int dmar_alloc_hwirq(int id, int node, void *arg)
+{
+ struct irq_domain *domain = dmar_get_irq_domain();
+ struct irq_alloc_info info;
+
+ if (!domain)
+ return -1;
+
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_DMAR;
+ info.dmar_id = id;
+ info.dmar_data = arg;
+
+ return irq_domain_alloc_irqs(domain, 1, node, &info);
+}
+
+void dmar_free_hwirq(int irq)
+{
+ irq_domain_free_irqs(irq, 1);
+}
+#endif
+
+/*
+ * MSI message composition
+ */
+#ifdef CONFIG_HPET_TIMER
+static inline int hpet_dev_id(struct irq_domain *domain)
+{
+ struct msi_domain_info *info = msi_get_domain_info(domain);
+
+ return (int)(long)info->data;
+}
+
+static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ hpet_msi_write(irq_data_get_irq_handler_data(data), msg);
+}
+
+static struct irq_chip hpet_msi_controller __ro_after_init = {
+ .name = "HPET-MSI",
+ .irq_unmask = hpet_msi_unmask,
+ .irq_mask = hpet_msi_mask,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_set_affinity = msi_domain_set_affinity,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_compose_msi_msg = irq_msi_compose_msg,
+ .irq_write_msi_msg = hpet_msi_write_msg,
+ .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_AFFINITY_PRE_STARTUP,
+};
+
+static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info,
+ msi_alloc_info_t *arg)
+{
+ return arg->hpet_index;
+}
+
+static int hpet_msi_init(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq,
+ irq_hw_number_t hwirq, msi_alloc_info_t *arg)
+{
+ irq_set_status_flags(virq, IRQ_MOVE_PCNTXT);
+ irq_domain_set_info(domain, virq, arg->hpet_index, info->chip, NULL,
+ handle_edge_irq, arg->hpet_data, "edge");
+
+ return 0;
+}
+
+static void hpet_msi_free(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq)
+{
+ irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT);
+}
+
+static struct msi_domain_ops hpet_msi_domain_ops = {
+ .get_hwirq = hpet_msi_get_hwirq,
+ .msi_init = hpet_msi_init,
+ .msi_free = hpet_msi_free,
+};
+
+static struct msi_domain_info hpet_msi_domain_info = {
+ .ops = &hpet_msi_domain_ops,
+ .chip = &hpet_msi_controller,
+};
+
+struct irq_domain *hpet_create_irq_domain(int hpet_id)
+{
+ struct msi_domain_info *domain_info;
+ struct irq_domain *parent, *d;
+ struct irq_alloc_info info;
+ struct fwnode_handle *fn;
+
+ if (x86_vector_domain == NULL)
+ return NULL;
+
+ domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL);
+ if (!domain_info)
+ return NULL;
+
+ *domain_info = hpet_msi_domain_info;
+ domain_info->data = (void *)(long)hpet_id;
+
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_HPET;
+ info.hpet_id = hpet_id;
+ parent = irq_remapping_get_ir_irq_domain(&info);
+ if (parent == NULL)
+ parent = x86_vector_domain;
+ else
+ hpet_msi_controller.name = "IR-HPET-MSI";
+
+ fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name,
+ hpet_id);
+ if (!fn) {
+ kfree(domain_info);
+ return NULL;
+ }
+
+ d = msi_create_irq_domain(fn, domain_info, parent);
+ if (!d) {
+ irq_domain_free_fwnode(fn);
+ kfree(domain_info);
+ }
+ return d;
+}
+
+int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev,
+ int dev_num)
+{
+ struct irq_alloc_info info;
+
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_HPET;
+ info.hpet_data = dev;
+ info.hpet_id = hpet_dev_id(domain);
+ info.hpet_index = dev_num;
+
+ return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info);
+}
+#endif
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
new file mode 100644
index 000000000..02e8acb13
--- /dev/null
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -0,0 +1,249 @@
+/*
+ * Default generic APIC driver. This handles up to 8 CPUs.
+ *
+ * Copyright 2003 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic x86 APIC driver probe layer.
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/setup.h>
+
+#include <linux/smp.h>
+#include <asm/ipi.h>
+
+#include <linux/interrupt.h>
+#include <asm/acpi.h>
+#include <asm/e820/api.h>
+
+#ifdef CONFIG_HOTPLUG_CPU
+#define DEFAULT_SEND_IPI (1)
+#else
+#define DEFAULT_SEND_IPI (0)
+#endif
+
+int no_broadcast = DEFAULT_SEND_IPI;
+
+static __init int no_ipi_broadcast(char *str)
+{
+ get_option(&str, &no_broadcast);
+ pr_info("Using %s mode\n",
+ no_broadcast ? "No IPI Broadcast" : "IPI Broadcast");
+ return 1;
+}
+__setup("no_ipi_broadcast=", no_ipi_broadcast);
+
+static int __init print_ipi_mode(void)
+{
+ pr_info("Using IPI %s mode\n",
+ no_broadcast ? "No-Shortcut" : "Shortcut");
+ return 0;
+}
+late_initcall(print_ipi_mode);
+
+static int default_x86_32_early_logical_apicid(int cpu)
+{
+ return 1 << cpu;
+}
+
+static void setup_apic_flat_routing(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+ printk(KERN_INFO
+ "Enabling APIC mode: Flat. Using %d I/O APICs\n",
+ nr_ioapics);
+#endif
+}
+
+static int default_apic_id_registered(void)
+{
+ return physid_isset(read_apic_id(), phys_cpu_present_map);
+}
+
+/*
+ * Set up the logical destination ID. Intel recommends to set DFR, LDR and
+ * TPR before enabling an APIC. See e.g. "AP-388 82489DX User's Manual"
+ * (Intel document number 292116).
+ */
+static void default_init_apic_ldr(void)
+{
+ unsigned long val;
+
+ apic_write(APIC_DFR, APIC_DFR_VALUE);
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
+ apic_write(APIC_LDR, val);
+}
+
+static int default_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+/* should be called last. */
+static int probe_default(void)
+{
+ return 1;
+}
+
+static struct apic apic_default __ro_after_init = {
+
+ .name = "default",
+ .probe = probe_default,
+ .acpi_madt_oem_check = NULL,
+ .apic_id_valid = default_apic_id_valid,
+ .apic_id_registered = default_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ /* logical delivery broadcast to all CPUs: */
+ .irq_dest_mode = 1,
+
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = default_check_apicid_used,
+
+ .init_apic_ldr = default_init_apic_ldr,
+
+ .ioapic_phys_id_map = default_ioapic_phys_id_map,
+ .setup_apic_routing = setup_apic_flat_routing,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = physid_set_mask_of_physid,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .phys_pkg_id = default_phys_pkg_id,
+
+ .get_apic_id = default_get_apic_id,
+ .set_apic_id = NULL,
+
+ .calc_dest_apicid = apic_flat_calc_apicid,
+
+ .send_IPI = default_send_IPI_single,
+ .send_IPI_mask = default_send_IPI_mask_logical,
+ .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical,
+ .send_IPI_allbutself = default_send_IPI_allbutself,
+ .send_IPI_all = default_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .inquire_remote_apic = default_inquire_remote_apic,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+
+ .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid,
+};
+
+apic_driver(apic_default);
+
+struct apic *apic __ro_after_init = &apic_default;
+EXPORT_SYMBOL_GPL(apic);
+
+static int cmdline_apic __initdata;
+static int __init parse_apic(char *arg)
+{
+ struct apic **drv;
+
+ if (!arg)
+ return -EINVAL;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if (!strcmp((*drv)->name, arg)) {
+ apic = *drv;
+ cmdline_apic = 1;
+ return 0;
+ }
+ }
+
+ /* Parsed again by __setup for debug/verbose */
+ return 0;
+}
+early_param("apic", parse_apic);
+
+void __init default_setup_apic_routing(void)
+{
+ int version = boot_cpu_apic_version;
+
+ if (num_possible_cpus() > 8) {
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ if (!APIC_XAPIC(version)) {
+ def_to_bigsmp = 0;
+ break;
+ }
+ /* If P4 and above fall through */
+ case X86_VENDOR_AMD:
+ def_to_bigsmp = 1;
+ }
+ }
+
+#ifdef CONFIG_X86_BIGSMP
+ /*
+ * This is used to switch to bigsmp mode when
+ * - There is no apic= option specified by the user
+ * - generic_apic_probe() has chosen apic_default as the sub_arch
+ * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
+ */
+
+ if (!cmdline_apic && apic == &apic_default)
+ generic_bigsmp_probe();
+#endif
+
+ if (apic->setup_apic_routing)
+ apic->setup_apic_routing();
+
+ if (x86_platform.apic_post_init)
+ x86_platform.apic_post_init();
+}
+
+void __init generic_apic_probe(void)
+{
+ if (!cmdline_apic) {
+ struct apic **drv;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if ((*drv)->probe()) {
+ apic = *drv;
+ break;
+ }
+ }
+ /* Not visible without early console */
+ if (drv == __apicdrivers_end)
+ panic("Didn't find an APIC driver");
+ }
+ printk(KERN_INFO "Using APIC driver %s\n", apic->name);
+}
+
+/* This function can switch the APIC even after the initial ->probe() */
+int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ struct apic **drv;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if (!(*drv)->acpi_madt_oem_check)
+ continue;
+ if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id))
+ continue;
+
+ if (!cmdline_apic) {
+ apic = *drv;
+ printk(KERN_INFO "Switched to APIC driver `%s'.\n",
+ apic->name);
+ }
+ return 1;
+ }
+ return 0;
+}
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
new file mode 100644
index 000000000..c303054b9
--- /dev/null
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic APIC sub-arch probe layer.
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/hardirq.h>
+#include <linux/dmar.h>
+
+#include <asm/smp.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+#include <asm/setup.h>
+
+/*
+ * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
+ */
+void __init default_setup_apic_routing(void)
+{
+ struct apic **drv;
+
+ enable_IR_x2apic();
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if ((*drv)->probe && (*drv)->probe()) {
+ if (apic != *drv) {
+ apic = *drv;
+ pr_info("Switched APIC routing to %s.\n",
+ apic->name);
+ }
+ break;
+ }
+ }
+
+ if (x86_platform.apic_post_init)
+ x86_platform.apic_post_init();
+}
+
+/* Same for both flat and physical. */
+
+void apic_send_IPI_self(int vector)
+{
+ __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
+}
+
+int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ struct apic **drv;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) {
+ if (apic != *drv) {
+ apic = *drv;
+ pr_info("Setting APIC routing to %s.\n",
+ apic->name);
+ }
+ return 1;
+ }
+ }
+ return 0;
+}
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
new file mode 100644
index 000000000..dc7c75944
--- /dev/null
+++ b/arch/x86/kernel/apic/vector.c
@@ -0,0 +1,1272 @@
+/*
+ * Local APIC related interfaces to support IOAPIC, MSI, etc.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
+ * Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ * Enable support of hierarchical irqdomains
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/slab.h>
+#include <asm/irqdomain.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/i8259.h>
+#include <asm/desc.h>
+#include <asm/irq_remapping.h>
+
+#include <asm/trace/irq_vectors.h>
+
+struct apic_chip_data {
+ struct irq_cfg hw_irq_cfg;
+ unsigned int vector;
+ unsigned int prev_vector;
+ unsigned int cpu;
+ unsigned int prev_cpu;
+ unsigned int irq;
+ struct hlist_node clist;
+ unsigned int move_in_progress : 1,
+ is_managed : 1,
+ can_reserve : 1,
+ has_reserved : 1;
+};
+
+struct irq_domain *x86_vector_domain;
+EXPORT_SYMBOL_GPL(x86_vector_domain);
+static DEFINE_RAW_SPINLOCK(vector_lock);
+static cpumask_var_t vector_searchmask;
+static struct irq_chip lapic_controller;
+static struct irq_matrix *vector_matrix;
+#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct hlist_head, cleanup_list);
+#endif
+
+void lock_vector_lock(void)
+{
+ /* Used to the online set of cpus does not change
+ * during assign_irq_vector.
+ */
+ raw_spin_lock(&vector_lock);
+}
+
+void unlock_vector_lock(void)
+{
+ raw_spin_unlock(&vector_lock);
+}
+
+void init_irq_alloc_info(struct irq_alloc_info *info,
+ const struct cpumask *mask)
+{
+ memset(info, 0, sizeof(*info));
+ info->mask = mask;
+}
+
+void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src)
+{
+ if (src)
+ *dst = *src;
+ else
+ memset(dst, 0, sizeof(*dst));
+}
+
+static struct apic_chip_data *apic_chip_data(struct irq_data *irqd)
+{
+ if (!irqd)
+ return NULL;
+
+ while (irqd->parent_data)
+ irqd = irqd->parent_data;
+
+ return irqd->chip_data;
+}
+
+struct irq_cfg *irqd_cfg(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+
+ return apicd ? &apicd->hw_irq_cfg : NULL;
+}
+EXPORT_SYMBOL_GPL(irqd_cfg);
+
+struct irq_cfg *irq_cfg(unsigned int irq)
+{
+ return irqd_cfg(irq_get_irq_data(irq));
+}
+
+static struct apic_chip_data *alloc_apic_chip_data(int node)
+{
+ struct apic_chip_data *apicd;
+
+ apicd = kzalloc_node(sizeof(*apicd), GFP_KERNEL, node);
+ if (apicd)
+ INIT_HLIST_NODE(&apicd->clist);
+ return apicd;
+}
+
+static void free_apic_chip_data(struct apic_chip_data *apicd)
+{
+ kfree(apicd);
+}
+
+static void apic_update_irq_cfg(struct irq_data *irqd, unsigned int vector,
+ unsigned int cpu)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+
+ lockdep_assert_held(&vector_lock);
+
+ apicd->hw_irq_cfg.vector = vector;
+ apicd->hw_irq_cfg.dest_apicid = apic->calc_dest_apicid(cpu);
+ irq_data_update_effective_affinity(irqd, cpumask_of(cpu));
+ trace_vector_config(irqd->irq, vector, cpu,
+ apicd->hw_irq_cfg.dest_apicid);
+}
+
+static void apic_update_vector(struct irq_data *irqd, unsigned int newvec,
+ unsigned int newcpu)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ struct irq_desc *desc = irq_data_to_desc(irqd);
+ bool managed = irqd_affinity_is_managed(irqd);
+
+ lockdep_assert_held(&vector_lock);
+
+ trace_vector_update(irqd->irq, newvec, newcpu, apicd->vector,
+ apicd->cpu);
+
+ /*
+ * If there is no vector associated or if the associated vector is
+ * the shutdown vector, which is associated to make PCI/MSI
+ * shutdown mode work, then there is nothing to release. Clear out
+ * prev_vector for this and the offlined target case.
+ */
+ apicd->prev_vector = 0;
+ if (!apicd->vector || apicd->vector == MANAGED_IRQ_SHUTDOWN_VECTOR)
+ goto setnew;
+ /*
+ * If the target CPU of the previous vector is online, then mark
+ * the vector as move in progress and store it for cleanup when the
+ * first interrupt on the new vector arrives. If the target CPU is
+ * offline then the regular release mechanism via the cleanup
+ * vector is not possible and the vector can be immediately freed
+ * in the underlying matrix allocator.
+ */
+ if (cpu_online(apicd->cpu)) {
+ apicd->move_in_progress = true;
+ apicd->prev_vector = apicd->vector;
+ apicd->prev_cpu = apicd->cpu;
+ } else {
+ irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector,
+ managed);
+ }
+
+setnew:
+ apicd->vector = newvec;
+ apicd->cpu = newcpu;
+ BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec]));
+ per_cpu(vector_irq, newcpu)[newvec] = desc;
+}
+
+static void vector_assign_managed_shutdown(struct irq_data *irqd)
+{
+ unsigned int cpu = cpumask_first(cpu_online_mask);
+
+ apic_update_irq_cfg(irqd, MANAGED_IRQ_SHUTDOWN_VECTOR, cpu);
+}
+
+static int reserve_managed_vector(struct irq_data *irqd)
+{
+ const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ apicd->is_managed = true;
+ ret = irq_matrix_reserve_managed(vector_matrix, affmsk);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ trace_vector_reserve_managed(irqd->irq, ret);
+ return ret;
+}
+
+static void reserve_irq_vector_locked(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+
+ irq_matrix_reserve(vector_matrix);
+ apicd->can_reserve = true;
+ apicd->has_reserved = true;
+ irqd_set_can_reserve(irqd);
+ trace_vector_reserve(irqd->irq, 0);
+ vector_assign_managed_shutdown(irqd);
+}
+
+static int reserve_irq_vector(struct irq_data *irqd)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ reserve_irq_vector_locked(irqd);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ return 0;
+}
+
+static int
+assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ bool resvd = apicd->has_reserved;
+ unsigned int cpu = apicd->cpu;
+ int vector = apicd->vector;
+
+ lockdep_assert_held(&vector_lock);
+
+ /*
+ * If the current target CPU is online and in the new requested
+ * affinity mask, there is no point in moving the interrupt from
+ * one CPU to another.
+ */
+ if (vector && cpu_online(cpu) && cpumask_test_cpu(cpu, dest))
+ return 0;
+
+ /*
+ * Careful here. @apicd might either have move_in_progress set or
+ * be enqueued for cleanup. Assigning a new vector would either
+ * leave a stale vector on some CPU around or in case of a pending
+ * cleanup corrupt the hlist.
+ */
+ if (apicd->move_in_progress || !hlist_unhashed(&apicd->clist))
+ return -EBUSY;
+
+ vector = irq_matrix_alloc(vector_matrix, dest, resvd, &cpu);
+ trace_vector_alloc(irqd->irq, vector, resvd, vector);
+ if (vector < 0)
+ return vector;
+ apic_update_vector(irqd, vector, cpu);
+ apic_update_irq_cfg(irqd, vector, cpu);
+
+ return 0;
+}
+
+static int assign_irq_vector(struct irq_data *irqd, const struct cpumask *dest)
+{
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ cpumask_and(vector_searchmask, dest, cpu_online_mask);
+ ret = assign_vector_locked(irqd, vector_searchmask);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ return ret;
+}
+
+static int assign_irq_vector_any_locked(struct irq_data *irqd)
+{
+ /* Get the affinity mask - either irq_default_affinity or (user) set */
+ const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
+ int node = irq_data_get_node(irqd);
+
+ if (node != NUMA_NO_NODE) {
+ /* Try the intersection of @affmsk and node mask */
+ cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk);
+ if (!assign_vector_locked(irqd, vector_searchmask))
+ return 0;
+ }
+
+ /* Try the full affinity mask */
+ cpumask_and(vector_searchmask, affmsk, cpu_online_mask);
+ if (!assign_vector_locked(irqd, vector_searchmask))
+ return 0;
+
+ if (node != NUMA_NO_NODE) {
+ /* Try the node mask */
+ if (!assign_vector_locked(irqd, cpumask_of_node(node)))
+ return 0;
+ }
+
+ /* Try the full online mask */
+ return assign_vector_locked(irqd, cpu_online_mask);
+}
+
+static int
+assign_irq_vector_policy(struct irq_data *irqd, struct irq_alloc_info *info)
+{
+ if (irqd_affinity_is_managed(irqd))
+ return reserve_managed_vector(irqd);
+ if (info->mask)
+ return assign_irq_vector(irqd, info->mask);
+ /*
+ * Make only a global reservation with no guarantee. A real vector
+ * is associated at activation time.
+ */
+ return reserve_irq_vector(irqd);
+}
+
+static int
+assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest)
+{
+ const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ int vector, cpu;
+
+ cpumask_and(vector_searchmask, dest, affmsk);
+
+ /* set_affinity might call here for nothing */
+ if (apicd->vector && cpumask_test_cpu(apicd->cpu, vector_searchmask))
+ return 0;
+ vector = irq_matrix_alloc_managed(vector_matrix, vector_searchmask,
+ &cpu);
+ trace_vector_alloc_managed(irqd->irq, vector, vector);
+ if (vector < 0)
+ return vector;
+ apic_update_vector(irqd, vector, cpu);
+ apic_update_irq_cfg(irqd, vector, cpu);
+ return 0;
+}
+
+static void clear_irq_vector(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ bool managed = irqd_affinity_is_managed(irqd);
+ unsigned int vector = apicd->vector;
+
+ lockdep_assert_held(&vector_lock);
+
+ if (!vector)
+ return;
+
+ trace_vector_clear(irqd->irq, vector, apicd->cpu, apicd->prev_vector,
+ apicd->prev_cpu);
+
+ per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_SHUTDOWN;
+ irq_matrix_free(vector_matrix, apicd->cpu, vector, managed);
+ apicd->vector = 0;
+
+ /* Clean up move in progress */
+ vector = apicd->prev_vector;
+ if (!vector)
+ return;
+
+ per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_SHUTDOWN;
+ irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, managed);
+ apicd->prev_vector = 0;
+ apicd->move_in_progress = 0;
+ hlist_del_init(&apicd->clist);
+}
+
+static void x86_vector_deactivate(struct irq_domain *dom, struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ unsigned long flags;
+
+ trace_vector_deactivate(irqd->irq, apicd->is_managed,
+ apicd->can_reserve, false);
+
+ /* Regular fixed assigned interrupt */
+ if (!apicd->is_managed && !apicd->can_reserve)
+ return;
+ /* If the interrupt has a global reservation, nothing to do */
+ if (apicd->has_reserved)
+ return;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ clear_irq_vector(irqd);
+ if (apicd->can_reserve)
+ reserve_irq_vector_locked(irqd);
+ else
+ vector_assign_managed_shutdown(irqd);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+}
+
+static int activate_reserved(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ int ret;
+
+ ret = assign_irq_vector_any_locked(irqd);
+ if (!ret) {
+ apicd->has_reserved = false;
+ /*
+ * Core might have disabled reservation mode after
+ * allocating the irq descriptor. Ideally this should
+ * happen before allocation time, but that would require
+ * completely convoluted ways of transporting that
+ * information.
+ */
+ if (!irqd_can_reserve(irqd))
+ apicd->can_reserve = false;
+ }
+
+ /*
+ * Check to ensure that the effective affinity mask is a subset
+ * the user supplied affinity mask, and warn the user if it is not
+ */
+ if (!cpumask_subset(irq_data_get_effective_affinity_mask(irqd),
+ irq_data_get_affinity_mask(irqd))) {
+ pr_warn("irq %u: Affinity broken due to vector space exhaustion.\n",
+ irqd->irq);
+ }
+
+ return ret;
+}
+
+static int activate_managed(struct irq_data *irqd)
+{
+ const struct cpumask *dest = irq_data_get_affinity_mask(irqd);
+ int ret;
+
+ cpumask_and(vector_searchmask, dest, cpu_online_mask);
+ if (WARN_ON_ONCE(cpumask_empty(vector_searchmask))) {
+ /* Something in the core code broke! Survive gracefully */
+ pr_err("Managed startup for irq %u, but no CPU\n", irqd->irq);
+ return -EINVAL;
+ }
+
+ ret = assign_managed_vector(irqd, vector_searchmask);
+ /*
+ * This should not happen. The vector reservation got buggered. Handle
+ * it gracefully.
+ */
+ if (WARN_ON_ONCE(ret < 0)) {
+ pr_err("Managed startup irq %u, no vector available\n",
+ irqd->irq);
+ }
+ return ret;
+}
+
+static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd,
+ bool reserve)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ unsigned long flags;
+ int ret = 0;
+
+ trace_vector_activate(irqd->irq, apicd->is_managed,
+ apicd->can_reserve, reserve);
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ if (!apicd->can_reserve && !apicd->is_managed)
+ assign_irq_vector_any_locked(irqd);
+ else if (reserve || irqd_is_managed_and_shutdown(irqd))
+ vector_assign_managed_shutdown(irqd);
+ else if (apicd->is_managed)
+ ret = activate_managed(irqd);
+ else if (apicd->has_reserved)
+ ret = activate_reserved(irqd);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ return ret;
+}
+
+static void vector_free_reserved_and_managed(struct irq_data *irqd)
+{
+ const struct cpumask *dest = irq_data_get_affinity_mask(irqd);
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+
+ trace_vector_teardown(irqd->irq, apicd->is_managed,
+ apicd->has_reserved);
+
+ if (apicd->has_reserved)
+ irq_matrix_remove_reserved(vector_matrix);
+ if (apicd->is_managed)
+ irq_matrix_remove_managed(vector_matrix, dest);
+}
+
+static void x86_vector_free_irqs(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+ struct apic_chip_data *apicd;
+ struct irq_data *irqd;
+ unsigned long flags;
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irqd = irq_domain_get_irq_data(x86_vector_domain, virq + i);
+ if (irqd && irqd->chip_data) {
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ clear_irq_vector(irqd);
+ vector_free_reserved_and_managed(irqd);
+ apicd = irqd->chip_data;
+ irq_domain_reset_irq_data(irqd);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ free_apic_chip_data(apicd);
+ }
+ }
+}
+
+static bool vector_configure_legacy(unsigned int virq, struct irq_data *irqd,
+ struct apic_chip_data *apicd)
+{
+ unsigned long flags;
+ bool realloc = false;
+
+ apicd->vector = ISA_IRQ_VECTOR(virq);
+ apicd->cpu = 0;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ /*
+ * If the interrupt is activated, then it must stay at this vector
+ * position. That's usually the timer interrupt (0).
+ */
+ if (irqd_is_activated(irqd)) {
+ trace_vector_setup(virq, true, 0);
+ apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu);
+ } else {
+ /* Release the vector */
+ apicd->can_reserve = true;
+ irqd_set_can_reserve(irqd);
+ clear_irq_vector(irqd);
+ realloc = true;
+ }
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ return realloc;
+}
+
+static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct irq_alloc_info *info = arg;
+ struct apic_chip_data *apicd;
+ struct irq_data *irqd;
+ int i, err, node;
+
+ if (disable_apic)
+ return -ENXIO;
+
+ /* Currently vector allocator can't guarantee contiguous allocations */
+ if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1)
+ return -ENOSYS;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irqd = irq_domain_get_irq_data(domain, virq + i);
+ BUG_ON(!irqd);
+ node = irq_data_get_node(irqd);
+ WARN_ON_ONCE(irqd->chip_data);
+ apicd = alloc_apic_chip_data(node);
+ if (!apicd) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ apicd->irq = virq + i;
+ irqd->chip = &lapic_controller;
+ irqd->chip_data = apicd;
+ irqd->hwirq = virq + i;
+ irqd_set_single_target(irqd);
+
+ /* Don't invoke affinity setter on deactivated interrupts */
+ irqd_set_affinity_on_activate(irqd);
+
+ /*
+ * Legacy vectors are already assigned when the IOAPIC
+ * takes them over. They stay on the same vector. This is
+ * required for check_timer() to work correctly as it might
+ * switch back to legacy mode. Only update the hardware
+ * config.
+ */
+ if (info->flags & X86_IRQ_ALLOC_LEGACY) {
+ if (!vector_configure_legacy(virq + i, irqd, apicd))
+ continue;
+ }
+
+ err = assign_irq_vector_policy(irqd, info);
+ trace_vector_setup(virq + i, false, err);
+ if (err) {
+ irqd->chip_data = NULL;
+ free_apic_chip_data(apicd);
+ goto error;
+ }
+ }
+
+ return 0;
+
+error:
+ x86_vector_free_irqs(domain, virq, i);
+ return err;
+}
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d,
+ struct irq_data *irqd, int ind)
+{
+ struct apic_chip_data apicd;
+ unsigned long flags;
+ int irq;
+
+ if (!irqd) {
+ irq_matrix_debug_show(m, vector_matrix, ind);
+ return;
+ }
+
+ irq = irqd->irq;
+ if (irq < nr_legacy_irqs() && !test_bit(irq, &io_apic_irqs)) {
+ seq_printf(m, "%*sVector: %5d\n", ind, "", ISA_IRQ_VECTOR(irq));
+ seq_printf(m, "%*sTarget: Legacy PIC all CPUs\n", ind, "");
+ return;
+ }
+
+ if (!irqd->chip_data) {
+ seq_printf(m, "%*sVector: Not assigned\n", ind, "");
+ return;
+ }
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ memcpy(&apicd, irqd->chip_data, sizeof(apicd));
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+
+ seq_printf(m, "%*sVector: %5u\n", ind, "", apicd.vector);
+ seq_printf(m, "%*sTarget: %5u\n", ind, "", apicd.cpu);
+ if (apicd.prev_vector) {
+ seq_printf(m, "%*sPrevious vector: %5u\n", ind, "", apicd.prev_vector);
+ seq_printf(m, "%*sPrevious target: %5u\n", ind, "", apicd.prev_cpu);
+ }
+ seq_printf(m, "%*smove_in_progress: %u\n", ind, "", apicd.move_in_progress ? 1 : 0);
+ seq_printf(m, "%*sis_managed: %u\n", ind, "", apicd.is_managed ? 1 : 0);
+ seq_printf(m, "%*scan_reserve: %u\n", ind, "", apicd.can_reserve ? 1 : 0);
+ seq_printf(m, "%*shas_reserved: %u\n", ind, "", apicd.has_reserved ? 1 : 0);
+ seq_printf(m, "%*scleanup_pending: %u\n", ind, "", !hlist_unhashed(&apicd.clist));
+}
+#endif
+
+static const struct irq_domain_ops x86_vector_domain_ops = {
+ .alloc = x86_vector_alloc_irqs,
+ .free = x86_vector_free_irqs,
+ .activate = x86_vector_activate,
+ .deactivate = x86_vector_deactivate,
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+ .debug_show = x86_vector_debug_show,
+#endif
+};
+
+int __init arch_probe_nr_irqs(void)
+{
+ int nr;
+
+ if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
+ nr_irqs = NR_VECTORS * nr_cpu_ids;
+
+ nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids;
+#if defined(CONFIG_PCI_MSI)
+ /*
+ * for MSI and HT dyn irq
+ */
+ if (gsi_top <= NR_IRQS_LEGACY)
+ nr += 8 * nr_cpu_ids;
+ else
+ nr += gsi_top * 16;
+#endif
+ if (nr < nr_irqs)
+ nr_irqs = nr;
+
+ /*
+ * We don't know if PIC is present at this point so we need to do
+ * probe() to get the right number of legacy IRQs.
+ */
+ return legacy_pic->probe();
+}
+
+void lapic_assign_legacy_vector(unsigned int irq, bool replace)
+{
+ /*
+ * Use assign system here so it wont get accounted as allocated
+ * and moveable in the cpu hotplug check and it prevents managed
+ * irq reservation from touching it.
+ */
+ irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace);
+}
+
+void __init lapic_update_legacy_vectors(void)
+{
+ unsigned int i;
+
+ if (IS_ENABLED(CONFIG_X86_IO_APIC) && nr_ioapics > 0)
+ return;
+
+ /*
+ * If the IO/APIC is disabled via config, kernel command line or
+ * lack of enumeration then all legacy interrupts are routed
+ * through the PIC. Make sure that they are marked as legacy
+ * vectors. PIC_CASCADE_IRQ has already been marked in
+ * lapic_assign_system_vectors().
+ */
+ for (i = 0; i < nr_legacy_irqs(); i++) {
+ if (i != PIC_CASCADE_IR)
+ lapic_assign_legacy_vector(i, true);
+ }
+}
+
+void __init lapic_assign_system_vectors(void)
+{
+ unsigned int i, vector = 0;
+
+ for_each_set_bit_from(vector, system_vectors, NR_VECTORS)
+ irq_matrix_assign_system(vector_matrix, vector, false);
+
+ if (nr_legacy_irqs() > 1)
+ lapic_assign_legacy_vector(PIC_CASCADE_IR, false);
+
+ /* System vectors are reserved, online it */
+ irq_matrix_online(vector_matrix);
+
+ /* Mark the preallocated legacy interrupts */
+ for (i = 0; i < nr_legacy_irqs(); i++) {
+ if (i != PIC_CASCADE_IR)
+ irq_matrix_assign(vector_matrix, ISA_IRQ_VECTOR(i));
+ }
+}
+
+int __init arch_early_irq_init(void)
+{
+ struct fwnode_handle *fn;
+
+ fn = irq_domain_alloc_named_fwnode("VECTOR");
+ BUG_ON(!fn);
+ x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops,
+ NULL);
+ BUG_ON(x86_vector_domain == NULL);
+ irq_set_default_host(x86_vector_domain);
+
+ arch_init_msi_domain(x86_vector_domain);
+
+ BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL));
+
+ /*
+ * Allocate the vector matrix allocator data structure and limit the
+ * search area.
+ */
+ vector_matrix = irq_alloc_matrix(NR_VECTORS, FIRST_EXTERNAL_VECTOR,
+ FIRST_SYSTEM_VECTOR);
+ BUG_ON(!vector_matrix);
+
+ return arch_early_ioapic_init();
+}
+
+#ifdef CONFIG_SMP
+
+static struct irq_desc *__setup_vector_irq(int vector)
+{
+ int isairq = vector - ISA_IRQ_VECTOR(0);
+
+ /* Check whether the irq is in the legacy space */
+ if (isairq < 0 || isairq >= nr_legacy_irqs())
+ return VECTOR_UNUSED;
+ /* Check whether the irq is handled by the IOAPIC */
+ if (test_bit(isairq, &io_apic_irqs))
+ return VECTOR_UNUSED;
+ return irq_to_desc(isairq);
+}
+
+/* Online the local APIC infrastructure and initialize the vectors */
+void lapic_online(void)
+{
+ unsigned int vector;
+
+ lockdep_assert_held(&vector_lock);
+
+ /* Online the vector matrix array for this CPU */
+ irq_matrix_online(vector_matrix);
+
+ /*
+ * The interrupt affinity logic never targets interrupts to offline
+ * CPUs. The exception are the legacy PIC interrupts. In general
+ * they are only targeted to CPU0, but depending on the platform
+ * they can be distributed to any online CPU in hardware. The
+ * kernel has no influence on that. So all active legacy vectors
+ * must be installed on all CPUs. All non legacy interrupts can be
+ * cleared.
+ */
+ for (vector = 0; vector < NR_VECTORS; vector++)
+ this_cpu_write(vector_irq[vector], __setup_vector_irq(vector));
+}
+
+void lapic_offline(void)
+{
+ lock_vector_lock();
+ irq_matrix_offline(vector_matrix);
+ unlock_vector_lock();
+}
+
+static int apic_set_affinity(struct irq_data *irqd,
+ const struct cpumask *dest, bool force)
+{
+ int err;
+
+ if (WARN_ON_ONCE(!irqd_is_activated(irqd)))
+ return -EIO;
+
+ raw_spin_lock(&vector_lock);
+ cpumask_and(vector_searchmask, dest, cpu_online_mask);
+ if (irqd_affinity_is_managed(irqd))
+ err = assign_managed_vector(irqd, vector_searchmask);
+ else
+ err = assign_vector_locked(irqd, vector_searchmask);
+ raw_spin_unlock(&vector_lock);
+ return err ? err : IRQ_SET_MASK_OK;
+}
+
+#else
+# define apic_set_affinity NULL
+#endif
+
+static int apic_retrigger_irq(struct irq_data *irqd)
+{
+ struct apic_chip_data *apicd = apic_chip_data(irqd);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ apic->send_IPI(apicd->cpu, apicd->vector);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+
+ return 1;
+}
+
+void apic_ack_irq(struct irq_data *irqd)
+{
+ irq_move_irq(irqd);
+ ack_APIC_irq();
+}
+
+void apic_ack_edge(struct irq_data *irqd)
+{
+ irq_complete_move(irqd_cfg(irqd));
+ apic_ack_irq(irqd);
+}
+
+static struct irq_chip lapic_controller = {
+ .name = "APIC",
+ .irq_ack = apic_ack_edge,
+ .irq_set_affinity = apic_set_affinity,
+ .irq_retrigger = apic_retrigger_irq,
+};
+
+#ifdef CONFIG_SMP
+
+static void free_moved_vector(struct apic_chip_data *apicd)
+{
+ unsigned int vector = apicd->prev_vector;
+ unsigned int cpu = apicd->prev_cpu;
+ bool managed = apicd->is_managed;
+
+ /*
+ * This should never happen. Managed interrupts are not
+ * migrated except on CPU down, which does not involve the
+ * cleanup vector. But try to keep the accounting correct
+ * nevertheless.
+ */
+ WARN_ON_ONCE(managed);
+
+ trace_vector_free_moved(apicd->irq, cpu, vector, managed);
+ irq_matrix_free(vector_matrix, cpu, vector, managed);
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
+ hlist_del_init(&apicd->clist);
+ apicd->prev_vector = 0;
+ apicd->move_in_progress = 0;
+}
+
+asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void)
+{
+ struct hlist_head *clhead = this_cpu_ptr(&cleanup_list);
+ struct apic_chip_data *apicd;
+ struct hlist_node *tmp;
+
+ entering_ack_irq();
+ /* Prevent vectors vanishing under us */
+ raw_spin_lock(&vector_lock);
+
+ hlist_for_each_entry_safe(apicd, tmp, clhead, clist) {
+ unsigned int irr, vector = apicd->prev_vector;
+
+ /*
+ * Paranoia: Check if the vector that needs to be cleaned
+ * up is registered at the APICs IRR. If so, then this is
+ * not the best time to clean it up. Clean it up in the
+ * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
+ * to this CPU. IRQ_MOVE_CLEANUP_VECTOR is the lowest
+ * priority external vector, so on return from this
+ * interrupt the device interrupt will happen first.
+ */
+ irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+ if (irr & (1U << (vector % 32))) {
+ apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
+ continue;
+ }
+ free_moved_vector(apicd);
+ }
+
+ raw_spin_unlock(&vector_lock);
+ exiting_irq();
+}
+
+static void __send_cleanup_vector(struct apic_chip_data *apicd)
+{
+ unsigned int cpu;
+
+ raw_spin_lock(&vector_lock);
+ apicd->move_in_progress = 0;
+ cpu = apicd->prev_cpu;
+ if (cpu_online(cpu)) {
+ hlist_add_head(&apicd->clist, per_cpu_ptr(&cleanup_list, cpu));
+ apic->send_IPI(cpu, IRQ_MOVE_CLEANUP_VECTOR);
+ } else {
+ apicd->prev_vector = 0;
+ }
+ raw_spin_unlock(&vector_lock);
+}
+
+void send_cleanup_vector(struct irq_cfg *cfg)
+{
+ struct apic_chip_data *apicd;
+
+ apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg);
+ if (apicd->move_in_progress)
+ __send_cleanup_vector(apicd);
+}
+
+static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
+{
+ struct apic_chip_data *apicd;
+
+ apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg);
+ if (likely(!apicd->move_in_progress))
+ return;
+
+ if (vector == apicd->vector && apicd->cpu == smp_processor_id())
+ __send_cleanup_vector(apicd);
+}
+
+void irq_complete_move(struct irq_cfg *cfg)
+{
+ __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
+}
+
+/*
+ * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
+ */
+void irq_force_complete_move(struct irq_desc *desc)
+{
+ struct apic_chip_data *apicd;
+ struct irq_data *irqd;
+ unsigned int vector;
+
+ /*
+ * The function is called for all descriptors regardless of which
+ * irqdomain they belong to. For example if an IRQ is provided by
+ * an irq_chip as part of a GPIO driver, the chip data for that
+ * descriptor is specific to the irq_chip in question.
+ *
+ * Check first that the chip_data is what we expect
+ * (apic_chip_data) before touching it any further.
+ */
+ irqd = irq_domain_get_irq_data(x86_vector_domain,
+ irq_desc_get_irq(desc));
+ if (!irqd)
+ return;
+
+ raw_spin_lock(&vector_lock);
+ apicd = apic_chip_data(irqd);
+ if (!apicd)
+ goto unlock;
+
+ /*
+ * If prev_vector is empty, no action required.
+ */
+ vector = apicd->prev_vector;
+ if (!vector)
+ goto unlock;
+
+ /*
+ * This is tricky. If the cleanup of the old vector has not been
+ * done yet, then the following setaffinity call will fail with
+ * -EBUSY. This can leave the interrupt in a stale state.
+ *
+ * All CPUs are stuck in stop machine with interrupts disabled so
+ * calling __irq_complete_move() would be completely pointless.
+ *
+ * 1) The interrupt is in move_in_progress state. That means that we
+ * have not seen an interrupt since the io_apic was reprogrammed to
+ * the new vector.
+ *
+ * 2) The interrupt has fired on the new vector, but the cleanup IPIs
+ * have not been processed yet.
+ */
+ if (apicd->move_in_progress) {
+ /*
+ * In theory there is a race:
+ *
+ * set_ioapic(new_vector) <-- Interrupt is raised before update
+ * is effective, i.e. it's raised on
+ * the old vector.
+ *
+ * So if the target cpu cannot handle that interrupt before
+ * the old vector is cleaned up, we get a spurious interrupt
+ * and in the worst case the ioapic irq line becomes stale.
+ *
+ * But in case of cpu hotplug this should be a non issue
+ * because if the affinity update happens right before all
+ * cpus rendevouz in stop machine, there is no way that the
+ * interrupt can be blocked on the target cpu because all cpus
+ * loops first with interrupts enabled in stop machine, so the
+ * old vector is not yet cleaned up when the interrupt fires.
+ *
+ * So the only way to run into this issue is if the delivery
+ * of the interrupt on the apic/system bus would be delayed
+ * beyond the point where the target cpu disables interrupts
+ * in stop machine. I doubt that it can happen, but at least
+ * there is a theroretical chance. Virtualization might be
+ * able to expose this, but AFAICT the IOAPIC emulation is not
+ * as stupid as the real hardware.
+ *
+ * Anyway, there is nothing we can do about that at this point
+ * w/o refactoring the whole fixup_irq() business completely.
+ * We print at least the irq number and the old vector number,
+ * so we have the necessary information when a problem in that
+ * area arises.
+ */
+ pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
+ irqd->irq, vector);
+ }
+ free_moved_vector(apicd);
+unlock:
+ raw_spin_unlock(&vector_lock);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Note, this is not accurate accounting, but at least good enough to
+ * prevent that the actual interrupt move will run out of vectors.
+ */
+int lapic_can_unplug_cpu(void)
+{
+ unsigned int rsvd, avl, tomove, cpu = smp_processor_id();
+ int ret = 0;
+
+ raw_spin_lock(&vector_lock);
+ tomove = irq_matrix_allocated(vector_matrix);
+ avl = irq_matrix_available(vector_matrix, true);
+ if (avl < tomove) {
+ pr_warn("CPU %u has %u vectors, %u available. Cannot disable CPU\n",
+ cpu, tomove, avl);
+ ret = -ENOSPC;
+ goto out;
+ }
+ rsvd = irq_matrix_reserved(vector_matrix);
+ if (avl < rsvd) {
+ pr_warn("Reserved vectors %u > available %u. IRQ request may fail\n",
+ rsvd, avl);
+ }
+out:
+ raw_spin_unlock(&vector_lock);
+ return ret;
+}
+#endif /* HOTPLUG_CPU */
+#endif /* SMP */
+
+static void __init print_APIC_field(int base)
+{
+ int i;
+
+ printk(KERN_DEBUG);
+
+ for (i = 0; i < 8; i++)
+ pr_cont("%08x", apic_read(base + i*0x10));
+
+ pr_cont("\n");
+}
+
+static void __init print_local_APIC(void *dummy)
+{
+ unsigned int i, v, ver, maxlvt;
+ u64 icr;
+
+ pr_debug("printing local APIC contents on CPU#%d/%d:\n",
+ smp_processor_id(), hard_smp_processor_id());
+ v = apic_read(APIC_ID);
+ pr_info("... APIC ID: %08x (%01x)\n", v, read_apic_id());
+ v = apic_read(APIC_LVR);
+ pr_info("... APIC VERSION: %08x\n", v);
+ ver = GET_APIC_VERSION(v);
+ maxlvt = lapic_get_maxlvt();
+
+ v = apic_read(APIC_TASKPRI);
+ pr_debug("... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
+
+ /* !82489DX */
+ if (APIC_INTEGRATED(ver)) {
+ if (!APIC_XAPIC(ver)) {
+ v = apic_read(APIC_ARBPRI);
+ pr_debug("... APIC ARBPRI: %08x (%02x)\n",
+ v, v & APIC_ARBPRI_MASK);
+ }
+ v = apic_read(APIC_PROCPRI);
+ pr_debug("... APIC PROCPRI: %08x\n", v);
+ }
+
+ /*
+ * Remote read supported only in the 82489DX and local APIC for
+ * Pentium processors.
+ */
+ if (!APIC_INTEGRATED(ver) || maxlvt == 3) {
+ v = apic_read(APIC_RRR);
+ pr_debug("... APIC RRR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_LDR);
+ pr_debug("... APIC LDR: %08x\n", v);
+ if (!x2apic_enabled()) {
+ v = apic_read(APIC_DFR);
+ pr_debug("... APIC DFR: %08x\n", v);
+ }
+ v = apic_read(APIC_SPIV);
+ pr_debug("... APIC SPIV: %08x\n", v);
+
+ pr_debug("... APIC ISR field:\n");
+ print_APIC_field(APIC_ISR);
+ pr_debug("... APIC TMR field:\n");
+ print_APIC_field(APIC_TMR);
+ pr_debug("... APIC IRR field:\n");
+ print_APIC_field(APIC_IRR);
+
+ /* !82489DX */
+ if (APIC_INTEGRATED(ver)) {
+ /* Due to the Pentium erratum 3AP. */
+ if (maxlvt > 3)
+ apic_write(APIC_ESR, 0);
+
+ v = apic_read(APIC_ESR);
+ pr_debug("... APIC ESR: %08x\n", v);
+ }
+
+ icr = apic_icr_read();
+ pr_debug("... APIC ICR: %08x\n", (u32)icr);
+ pr_debug("... APIC ICR2: %08x\n", (u32)(icr >> 32));
+
+ v = apic_read(APIC_LVTT);
+ pr_debug("... APIC LVTT: %08x\n", v);
+
+ if (maxlvt > 3) {
+ /* PC is LVT#4. */
+ v = apic_read(APIC_LVTPC);
+ pr_debug("... APIC LVTPC: %08x\n", v);
+ }
+ v = apic_read(APIC_LVT0);
+ pr_debug("... APIC LVT0: %08x\n", v);
+ v = apic_read(APIC_LVT1);
+ pr_debug("... APIC LVT1: %08x\n", v);
+
+ if (maxlvt > 2) {
+ /* ERR is LVT#3. */
+ v = apic_read(APIC_LVTERR);
+ pr_debug("... APIC LVTERR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_TMICT);
+ pr_debug("... APIC TMICT: %08x\n", v);
+ v = apic_read(APIC_TMCCT);
+ pr_debug("... APIC TMCCT: %08x\n", v);
+ v = apic_read(APIC_TDCR);
+ pr_debug("... APIC TDCR: %08x\n", v);
+
+ if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+ v = apic_read(APIC_EFEAT);
+ maxlvt = (v >> 16) & 0xff;
+ pr_debug("... APIC EFEAT: %08x\n", v);
+ v = apic_read(APIC_ECTRL);
+ pr_debug("... APIC ECTRL: %08x\n", v);
+ for (i = 0; i < maxlvt; i++) {
+ v = apic_read(APIC_EILVTn(i));
+ pr_debug("... APIC EILVT%d: %08x\n", i, v);
+ }
+ }
+ pr_cont("\n");
+}
+
+static void __init print_local_APICs(int maxcpu)
+{
+ int cpu;
+
+ if (!maxcpu)
+ return;
+
+ preempt_disable();
+ for_each_online_cpu(cpu) {
+ if (cpu >= maxcpu)
+ break;
+ smp_call_function_single(cpu, print_local_APIC, NULL, 1);
+ }
+ preempt_enable();
+}
+
+static void __init print_PIC(void)
+{
+ unsigned int v;
+ unsigned long flags;
+
+ if (!nr_legacy_irqs())
+ return;
+
+ pr_debug("\nprinting PIC contents\n");
+
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+
+ v = inb(0xa1) << 8 | inb(0x21);
+ pr_debug("... PIC IMR: %04x\n", v);
+
+ v = inb(0xa0) << 8 | inb(0x20);
+ pr_debug("... PIC IRR: %04x\n", v);
+
+ outb(0x0b, 0xa0);
+ outb(0x0b, 0x20);
+ v = inb(0xa0) << 8 | inb(0x20);
+ outb(0x0a, 0xa0);
+ outb(0x0a, 0x20);
+
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+
+ pr_debug("... PIC ISR: %04x\n", v);
+
+ v = inb(0x4d1) << 8 | inb(0x4d0);
+ pr_debug("... PIC ELCR: %04x\n", v);
+}
+
+static int show_lapic __initdata = 1;
+static __init int setup_show_lapic(char *arg)
+{
+ int num = -1;
+
+ if (strcmp(arg, "all") == 0) {
+ show_lapic = CONFIG_NR_CPUS;
+ } else {
+ get_option(&arg, &num);
+ if (num >= 0)
+ show_lapic = num;
+ }
+
+ return 1;
+}
+__setup("show_lapic=", setup_show_lapic);
+
+static int __init print_ICs(void)
+{
+ if (apic_verbosity == APIC_QUIET)
+ return 0;
+
+ print_PIC();
+
+ /* don't print out if apic is not there */
+ if (!boot_cpu_has(X86_FEATURE_APIC) && !apic_from_smp_config())
+ return 0;
+
+ print_local_APICs(show_lapic);
+ print_IO_APICs();
+
+ return 0;
+}
+
+late_initcall(print_ICs);
diff --git a/arch/x86/kernel/apic/x2apic.h b/arch/x86/kernel/apic/x2apic.h
new file mode 100644
index 000000000..a49b36040
--- /dev/null
+++ b/arch/x86/kernel/apic/x2apic.h
@@ -0,0 +1,9 @@
+/* Common bits for X2APIC cluster/physical modes. */
+
+int x2apic_apic_id_valid(u32 apicid);
+int x2apic_apic_id_registered(void);
+void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest);
+unsigned int x2apic_get_apic_id(unsigned long id);
+u32 x2apic_set_apic_id(unsigned int id);
+int x2apic_phys_pkg_id(int initial_apicid, int index_msb);
+void x2apic_send_IPI_self(int vector);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
new file mode 100644
index 000000000..b2c7e3bc5
--- /dev/null
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/dmar.h>
+#include <linux/irq.h>
+#include <linux/cpu.h>
+
+#include <asm/smp.h>
+#include "x2apic.h"
+
+struct cluster_mask {
+ unsigned int clusterid;
+ int node;
+ struct cpumask mask;
+};
+
+static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
+static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks);
+static struct cluster_mask *cluster_hotplug_mask;
+
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return x2apic_enabled();
+}
+
+static void x2apic_send_IPI(int cpu, int vector)
+{
+ u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
+
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
+ __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
+}
+
+static void
+__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
+{
+ unsigned int cpu, clustercpu;
+ struct cpumask *tmpmsk;
+ unsigned long flags;
+ u32 dest;
+
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
+ local_irq_save(flags);
+
+ tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask);
+ cpumask_copy(tmpmsk, mask);
+ /* If IPI should not be sent to self, clear current CPU */
+ if (apic_dest != APIC_DEST_ALLINC)
+ cpumask_clear_cpu(smp_processor_id(), tmpmsk);
+
+ /* Collapse cpus in a cluster so a single IPI per cluster is sent */
+ for_each_cpu(cpu, tmpmsk) {
+ struct cluster_mask *cmsk = per_cpu(cluster_masks, cpu);
+
+ dest = 0;
+ for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask)
+ dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu);
+
+ if (!dest)
+ continue;
+
+ __x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
+ /* Remove cluster CPUs from tmpmask */
+ cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask);
+ }
+
+ local_irq_restore(flags);
+}
+
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
+}
+
+static void
+x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
+}
+
+static void x2apic_send_IPI_allbutself(int vector)
+{
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
+}
+
+static void x2apic_send_IPI_all(int vector)
+{
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
+}
+
+static u32 x2apic_calc_apicid(unsigned int cpu)
+{
+ return per_cpu(x86_cpu_to_logical_apicid, cpu);
+}
+
+static void init_x2apic_ldr(void)
+{
+ struct cluster_mask *cmsk = this_cpu_read(cluster_masks);
+ u32 cluster, apicid = apic_read(APIC_LDR);
+ unsigned int cpu;
+
+ this_cpu_write(x86_cpu_to_logical_apicid, apicid);
+
+ if (cmsk)
+ goto update;
+
+ cluster = apicid >> 16;
+ for_each_online_cpu(cpu) {
+ cmsk = per_cpu(cluster_masks, cpu);
+ /* Matching cluster found. Link and update it. */
+ if (cmsk && cmsk->clusterid == cluster)
+ goto update;
+ }
+ cmsk = cluster_hotplug_mask;
+ cmsk->clusterid = cluster;
+ cluster_hotplug_mask = NULL;
+update:
+ this_cpu_write(cluster_masks, cmsk);
+ cpumask_set_cpu(smp_processor_id(), &cmsk->mask);
+}
+
+static int alloc_clustermask(unsigned int cpu, int node)
+{
+ if (per_cpu(cluster_masks, cpu))
+ return 0;
+ /*
+ * If a hotplug spare mask exists, check whether it's on the right
+ * node. If not, free it and allocate a new one.
+ */
+ if (cluster_hotplug_mask) {
+ if (cluster_hotplug_mask->node == node)
+ return 0;
+ kfree(cluster_hotplug_mask);
+ }
+
+ cluster_hotplug_mask = kzalloc_node(sizeof(*cluster_hotplug_mask),
+ GFP_KERNEL, node);
+ if (!cluster_hotplug_mask)
+ return -ENOMEM;
+ cluster_hotplug_mask->node = node;
+ return 0;
+}
+
+static int x2apic_prepare_cpu(unsigned int cpu)
+{
+ if (alloc_clustermask(cpu, cpu_to_node(cpu)) < 0)
+ return -ENOMEM;
+ if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL))
+ return -ENOMEM;
+ return 0;
+}
+
+static int x2apic_dead_cpu(unsigned int dead_cpu)
+{
+ struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu);
+
+ if (cmsk)
+ cpumask_clear_cpu(dead_cpu, &cmsk->mask);
+ free_cpumask_var(per_cpu(ipi_mask, dead_cpu));
+ return 0;
+}
+
+static int x2apic_cluster_probe(void)
+{
+ if (!x2apic_mode)
+ return 0;
+
+ if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare",
+ x2apic_prepare_cpu, x2apic_dead_cpu) < 0) {
+ pr_err("Failed to register X2APIC_PREPARE\n");
+ return 0;
+ }
+ init_x2apic_ldr();
+ return 1;
+}
+
+static struct apic apic_x2apic_cluster __ro_after_init = {
+
+ .name = "cluster x2apic",
+ .probe = x2apic_cluster_probe,
+ .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+ .apic_id_valid = x2apic_apic_id_valid,
+ .apic_id_registered = x2apic_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 1, /* logical */
+
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = NULL,
+
+ .init_apic_ldr = init_x2apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .phys_pkg_id = x2apic_phys_pkg_id,
+
+ .get_apic_id = x2apic_get_apic_id,
+ .set_apic_id = x2apic_set_apic_id,
+
+ .calc_dest_apicid = x2apic_calc_apicid,
+
+ .send_IPI = x2apic_send_IPI,
+ .send_IPI_mask = x2apic_send_IPI_mask,
+ .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = x2apic_send_IPI_allbutself,
+ .send_IPI_all = x2apic_send_IPI_all,
+ .send_IPI_self = x2apic_send_IPI_self,
+
+ .inquire_remote_apic = NULL,
+
+ .read = native_apic_msr_read,
+ .write = native_apic_msr_write,
+ .eoi_write = native_apic_msr_eoi_write,
+ .icr_read = native_x2apic_icr_read,
+ .icr_write = native_x2apic_icr_write,
+ .wait_icr_idle = native_x2apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
+};
+
+apic_driver(apic_x2apic_cluster);
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
new file mode 100644
index 000000000..8e70c2ba2
--- /dev/null
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -0,0 +1,198 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/dmar.h>
+
+#include <asm/smp.h>
+#include <asm/ipi.h>
+#include "x2apic.h"
+
+int x2apic_phys;
+
+static struct apic apic_x2apic_phys;
+static u32 x2apic_max_apicid __ro_after_init;
+
+void __init x2apic_set_max_apicid(u32 apicid)
+{
+ x2apic_max_apicid = apicid;
+}
+
+static int __init set_x2apic_phys_mode(char *arg)
+{
+ x2apic_phys = 1;
+ return 0;
+}
+early_param("x2apic_phys", set_x2apic_phys_mode);
+
+static bool x2apic_fadt_phys(void)
+{
+#ifdef CONFIG_ACPI
+ if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) &&
+ (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
+ printk(KERN_DEBUG "System requires x2apic physical mode\n");
+ return true;
+ }
+#endif
+ return false;
+}
+
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys());
+}
+
+static void x2apic_send_IPI(int cpu, int vector)
+{
+ u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
+
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
+ __x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL);
+}
+
+static void
+__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
+{
+ unsigned long query_cpu;
+ unsigned long this_cpu;
+ unsigned long flags;
+
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
+
+ local_irq_save(flags);
+
+ this_cpu = smp_processor_id();
+ for_each_cpu(query_cpu, mask) {
+ if (apic_dest == APIC_DEST_ALLBUT && this_cpu == query_cpu)
+ continue;
+ __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
+ vector, APIC_DEST_PHYSICAL);
+ }
+ local_irq_restore(flags);
+}
+
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
+}
+
+static void
+ x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
+}
+
+static void x2apic_send_IPI_allbutself(int vector)
+{
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
+}
+
+static void x2apic_send_IPI_all(int vector)
+{
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
+}
+
+static void init_x2apic_ldr(void)
+{
+}
+
+static int x2apic_phys_probe(void)
+{
+ if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys()))
+ return 1;
+
+ return apic == &apic_x2apic_phys;
+}
+
+/* Common x2apic functions, also used by x2apic_cluster */
+int x2apic_apic_id_valid(u32 apicid)
+{
+ if (x2apic_max_apicid && apicid > x2apic_max_apicid)
+ return 0;
+
+ return 1;
+}
+
+int x2apic_apic_id_registered(void)
+{
+ return 1;
+}
+
+void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
+{
+ unsigned long cfg = __prepare_ICR(0, vector, dest);
+ native_x2apic_icr_write(cfg, apicid);
+}
+
+unsigned int x2apic_get_apic_id(unsigned long id)
+{
+ return id;
+}
+
+u32 x2apic_set_apic_id(unsigned int id)
+{
+ return id;
+}
+
+int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
+{
+ return initial_apicid >> index_msb;
+}
+
+void x2apic_send_IPI_self(int vector)
+{
+ apic_write(APIC_SELF_IPI, vector);
+}
+
+static struct apic apic_x2apic_phys __ro_after_init = {
+
+ .name = "physical x2apic",
+ .probe = x2apic_phys_probe,
+ .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+ .apic_id_valid = x2apic_apic_id_valid,
+ .apic_id_registered = x2apic_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* physical */
+
+ .disable_esr = 0,
+ .dest_logical = 0,
+ .check_apicid_used = NULL,
+
+ .init_apic_ldr = init_x2apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .phys_pkg_id = x2apic_phys_pkg_id,
+
+ .get_apic_id = x2apic_get_apic_id,
+ .set_apic_id = x2apic_set_apic_id,
+
+ .calc_dest_apicid = apic_default_calc_apicid,
+
+ .send_IPI = x2apic_send_IPI,
+ .send_IPI_mask = x2apic_send_IPI_mask,
+ .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = x2apic_send_IPI_allbutself,
+ .send_IPI_all = x2apic_send_IPI_all,
+ .send_IPI_self = x2apic_send_IPI_self,
+
+ .inquire_remote_apic = NULL,
+
+ .read = native_apic_msr_read,
+ .write = native_apic_msr_write,
+ .eoi_write = native_apic_msr_eoi_write,
+ .icr_read = native_x2apic_icr_read,
+ .icr_write = native_x2apic_icr_write,
+ .wait_icr_idle = native_x2apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
+};
+
+apic_driver(apic_x2apic_phys);
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
new file mode 100644
index 000000000..391f358eb
--- /dev/null
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -0,0 +1,1609 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV APIC functions (note: not an Intel compatible APIC)
+ *
+ * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved.
+ */
+#include <linux/cpumask.h>
+#include <linux/hardirq.h>
+#include <linux/proc_fs.h>
+#include <linux/threads.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/kdebug.h>
+#include <linux/delay.h>
+#include <linux/crash_dump.h>
+#include <linux/reboot.h>
+#include <linux/memory.h>
+
+#include <asm/uv/uv_mmrs.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/current.h>
+#include <asm/pgtable.h>
+#include <asm/uv/bios.h>
+#include <asm/uv/uv.h>
+#include <asm/apic.h>
+#include <asm/e820/api.h>
+#include <asm/ipi.h>
+#include <asm/smp.h>
+#include <asm/x86_init.h>
+#include <asm/nmi.h>
+
+DEFINE_PER_CPU(int, x2apic_extra_bits);
+
+static enum uv_system_type uv_system_type;
+static bool uv_hubless_system;
+static u64 gru_start_paddr, gru_end_paddr;
+static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr;
+static u64 gru_dist_lmask, gru_dist_umask;
+static union uvh_apicid uvh_apicid;
+
+/* Information derived from CPUID: */
+static struct {
+ unsigned int apicid_shift;
+ unsigned int apicid_mask;
+ unsigned int socketid_shift; /* aka pnode_shift for UV1/2/3 */
+ unsigned int pnode_mask;
+ unsigned int gpa_shift;
+ unsigned int gnode_shift;
+} uv_cpuid;
+
+int uv_min_hub_revision_id;
+EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
+
+unsigned int uv_apicid_hibits;
+EXPORT_SYMBOL_GPL(uv_apicid_hibits);
+
+static struct apic apic_x2apic_uv_x;
+static struct uv_hub_info_s uv_hub_info_node0;
+
+/* Set this to use hardware error handler instead of kernel panic: */
+static int disable_uv_undefined_panic = 1;
+
+unsigned long uv_undefined(char *str)
+{
+ if (likely(!disable_uv_undefined_panic))
+ panic("UV: error: undefined MMR: %s\n", str);
+ else
+ pr_crit("UV: error: undefined MMR: %s\n", str);
+
+ /* Cause a machine fault: */
+ return ~0ul;
+}
+EXPORT_SYMBOL(uv_undefined);
+
+static unsigned long __init uv_early_read_mmr(unsigned long addr)
+{
+ unsigned long val, *mmr;
+
+ mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr));
+ val = *mmr;
+ early_iounmap(mmr, sizeof(*mmr));
+
+ return val;
+}
+
+static inline bool is_GRU_range(u64 start, u64 end)
+{
+ if (gru_dist_base) {
+ u64 su = start & gru_dist_umask; /* Upper (incl pnode) bits */
+ u64 sl = start & gru_dist_lmask; /* Base offset bits */
+ u64 eu = end & gru_dist_umask;
+ u64 el = end & gru_dist_lmask;
+
+ /* Must reside completely within a single GRU range: */
+ return (sl == gru_dist_base && el == gru_dist_base &&
+ su >= gru_first_node_paddr &&
+ su <= gru_last_node_paddr &&
+ eu == su);
+ } else {
+ return start >= gru_start_paddr && end <= gru_end_paddr;
+ }
+}
+
+static bool uv_is_untracked_pat_range(u64 start, u64 end)
+{
+ return is_ISA_range(start, end) || is_GRU_range(start, end);
+}
+
+static int __init early_get_pnodeid(void)
+{
+ union uvh_node_id_u node_id;
+ union uvh_rh_gam_config_mmr_u m_n_config;
+ int pnode;
+
+ /* Currently, all blades have same revision number */
+ node_id.v = uv_early_read_mmr(UVH_NODE_ID);
+ m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
+ uv_min_hub_revision_id = node_id.s.revision;
+
+ switch (node_id.s.part_number) {
+ case UV2_HUB_PART_NUMBER:
+ case UV2_HUB_PART_NUMBER_X:
+ uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
+ break;
+ case UV3_HUB_PART_NUMBER:
+ case UV3_HUB_PART_NUMBER_X:
+ uv_min_hub_revision_id += UV3_HUB_REVISION_BASE;
+ break;
+
+ /* Update: UV4A has only a modified revision to indicate HUB fixes */
+ case UV4_HUB_PART_NUMBER:
+ uv_min_hub_revision_id += UV4_HUB_REVISION_BASE - 1;
+ uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */
+ break;
+ }
+
+ uv_hub_info->hub_revision = uv_min_hub_revision_id;
+ uv_cpuid.pnode_mask = (1 << m_n_config.s.n_skt) - 1;
+ pnode = (node_id.s.node_id >> 1) & uv_cpuid.pnode_mask;
+ uv_cpuid.gpa_shift = 46; /* Default unless changed */
+
+ pr_info("UV: rev:%d part#:%x nodeid:%04x n_skt:%d pnmsk:%x pn:%x\n",
+ node_id.s.revision, node_id.s.part_number, node_id.s.node_id,
+ m_n_config.s.n_skt, uv_cpuid.pnode_mask, pnode);
+ return pnode;
+}
+
+static void __init uv_tsc_check_sync(void)
+{
+ u64 mmr;
+ int sync_state;
+ int mmr_shift;
+ char *state;
+ bool valid;
+
+ /* Accommodate different UV arch BIOSes */
+ mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR);
+ mmr_shift =
+ is_uv1_hub() ? 0 :
+ is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT;
+ if (mmr_shift)
+ sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK;
+ else
+ sync_state = 0;
+
+ switch (sync_state) {
+ case UVH_TSC_SYNC_VALID:
+ state = "in sync";
+ valid = true;
+ break;
+
+ case UVH_TSC_SYNC_INVALID:
+ state = "unstable";
+ valid = false;
+ break;
+ default:
+ state = "unknown: assuming valid";
+ valid = true;
+ break;
+ }
+ pr_info("UV: TSC sync state from BIOS:0%d(%s)\n", sync_state, state);
+
+ /* Mark flag that says TSC != 0 is valid for socket 0 */
+ if (valid)
+ mark_tsc_async_resets("UV BIOS");
+ else
+ mark_tsc_unstable("UV BIOS");
+}
+
+/* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */
+
+#define SMT_LEVEL 0 /* Leaf 0xb SMT level */
+#define INVALID_TYPE 0 /* Leaf 0xb sub-leaf types */
+#define SMT_TYPE 1
+#define CORE_TYPE 2
+#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
+#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
+
+static void set_x2apic_bits(void)
+{
+ unsigned int eax, ebx, ecx, edx, sub_index;
+ unsigned int sid_shift;
+
+ cpuid(0, &eax, &ebx, &ecx, &edx);
+ if (eax < 0xb) {
+ pr_info("UV: CPU does not have CPUID.11\n");
+ return;
+ }
+
+ cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
+ if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) {
+ pr_info("UV: CPUID.11 not implemented\n");
+ return;
+ }
+
+ sid_shift = BITS_SHIFT_NEXT_LEVEL(eax);
+ sub_index = 1;
+ do {
+ cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
+ if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
+ sid_shift = BITS_SHIFT_NEXT_LEVEL(eax);
+ break;
+ }
+ sub_index++;
+ } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
+
+ uv_cpuid.apicid_shift = 0;
+ uv_cpuid.apicid_mask = (~(-1 << sid_shift));
+ uv_cpuid.socketid_shift = sid_shift;
+}
+
+static void __init early_get_apic_socketid_shift(void)
+{
+ if (is_uv2_hub() || is_uv3_hub())
+ uvh_apicid.v = uv_early_read_mmr(UVH_APICID);
+
+ set_x2apic_bits();
+
+ pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", uv_cpuid.apicid_shift, uv_cpuid.apicid_mask);
+ pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask);
+}
+
+/*
+ * Add an extra bit as dictated by bios to the destination apicid of
+ * interrupts potentially passing through the UV HUB. This prevents
+ * a deadlock between interrupts and IO port operations.
+ */
+static void __init uv_set_apicid_hibit(void)
+{
+ union uv1h_lb_target_physical_apic_id_mask_u apicid_mask;
+
+ if (is_uv1_hub()) {
+ apicid_mask.v = uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK);
+ uv_apicid_hibits = apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK;
+ }
+}
+
+static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ int pnodeid;
+ int uv_apic;
+
+ if (strncmp(oem_id, "SGI", 3) != 0) {
+ if (strncmp(oem_id, "NSGI", 4) == 0) {
+ uv_hubless_system = true;
+ pr_info("UV: OEM IDs %s/%s, HUBLESS\n",
+ oem_id, oem_table_id);
+ }
+ return 0;
+ }
+
+ if (numa_off) {
+ pr_err("UV: NUMA is off, disabling UV support\n");
+ return 0;
+ }
+
+ /* Set up early hub type field in uv_hub_info for Node 0 */
+ uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0;
+
+ /*
+ * Determine UV arch type.
+ * SGI: UV100/1000
+ * SGI2: UV2000/3000
+ * SGI3: UV300 (truncated to 4 chars because of different varieties)
+ * SGI4: UV400 (truncated to 4 chars because of different varieties)
+ */
+ uv_hub_info->hub_revision =
+ !strncmp(oem_id, "SGI4", 4) ? UV4_HUB_REVISION_BASE :
+ !strncmp(oem_id, "SGI3", 4) ? UV3_HUB_REVISION_BASE :
+ !strcmp(oem_id, "SGI2") ? UV2_HUB_REVISION_BASE :
+ !strcmp(oem_id, "SGI") ? UV1_HUB_REVISION_BASE : 0;
+
+ if (uv_hub_info->hub_revision == 0)
+ goto badbios;
+
+ pnodeid = early_get_pnodeid();
+ early_get_apic_socketid_shift();
+
+ x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
+ x86_platform.nmi_init = uv_nmi_init;
+
+ if (!strcmp(oem_table_id, "UVX")) {
+ /* This is the most common hardware variant: */
+ uv_system_type = UV_X2APIC;
+ uv_apic = 0;
+
+ } else if (!strcmp(oem_table_id, "UVH")) {
+ /* Only UV1 systems: */
+ uv_system_type = UV_NON_UNIQUE_APIC;
+ x86_platform.legacy.warm_reset = 0;
+ __this_cpu_write(x2apic_extra_bits, pnodeid << uvh_apicid.s.pnode_shift);
+ uv_set_apicid_hibit();
+ uv_apic = 1;
+
+ } else if (!strcmp(oem_table_id, "UVL")) {
+ /* Only used for very small systems: */
+ uv_system_type = UV_LEGACY_APIC;
+ uv_apic = 0;
+
+ } else {
+ goto badbios;
+ }
+
+ pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", oem_id, oem_table_id, uv_system_type, uv_min_hub_revision_id, uv_apic);
+ uv_tsc_check_sync();
+
+ return uv_apic;
+
+badbios:
+ pr_err("UV: OEM_ID:%s OEM_TABLE_ID:%s\n", oem_id, oem_table_id);
+ pr_err("Current BIOS not supported, update kernel and/or BIOS\n");
+ BUG();
+}
+
+enum uv_system_type get_uv_system_type(void)
+{
+ return uv_system_type;
+}
+
+int is_uv_system(void)
+{
+ return uv_system_type != UV_NONE;
+}
+EXPORT_SYMBOL_GPL(is_uv_system);
+
+int is_uv_hubless(void)
+{
+ return uv_hubless_system;
+}
+EXPORT_SYMBOL_GPL(is_uv_hubless);
+
+void **__uv_hub_info_list;
+EXPORT_SYMBOL_GPL(__uv_hub_info_list);
+
+DEFINE_PER_CPU(struct uv_cpu_info_s, __uv_cpu_info);
+EXPORT_PER_CPU_SYMBOL_GPL(__uv_cpu_info);
+
+short uv_possible_blades;
+EXPORT_SYMBOL_GPL(uv_possible_blades);
+
+unsigned long sn_rtc_cycles_per_second;
+EXPORT_SYMBOL(sn_rtc_cycles_per_second);
+
+/* The following values are used for the per node hub info struct */
+static __initdata unsigned short *_node_to_pnode;
+static __initdata unsigned short _min_socket, _max_socket;
+static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len;
+static __initdata struct uv_gam_range_entry *uv_gre_table;
+static __initdata struct uv_gam_parameters *uv_gp_table;
+static __initdata unsigned short *_socket_to_node;
+static __initdata unsigned short *_socket_to_pnode;
+static __initdata unsigned short *_pnode_to_socket;
+
+static __initdata struct uv_gam_range_s *_gr_table;
+
+#define SOCK_EMPTY ((unsigned short)~0)
+
+extern int uv_hub_info_version(void)
+{
+ return UV_HUB_INFO_VERSION;
+}
+EXPORT_SYMBOL(uv_hub_info_version);
+
+/* Default UV memory block size is 2GB */
+static unsigned long mem_block_size __initdata = (2UL << 30);
+
+/* Kernel parameter to specify UV mem block size */
+static int __init parse_mem_block_size(char *ptr)
+{
+ unsigned long size = memparse(ptr, NULL);
+
+ /* Size will be rounded down by set_block_size() below */
+ mem_block_size = size;
+ return 0;
+}
+early_param("uv_memblksize", parse_mem_block_size);
+
+static __init int adj_blksize(u32 lgre)
+{
+ unsigned long base = (unsigned long)lgre << UV_GAM_RANGE_SHFT;
+ unsigned long size;
+
+ for (size = mem_block_size; size > MIN_MEMORY_BLOCK_SIZE; size >>= 1)
+ if (IS_ALIGNED(base, size))
+ break;
+
+ if (size >= mem_block_size)
+ return 0;
+
+ mem_block_size = size;
+ return 1;
+}
+
+static __init void set_block_size(void)
+{
+ unsigned int order = ffs(mem_block_size);
+
+ if (order) {
+ /* adjust for ffs return of 1..64 */
+ set_memory_block_size_order(order - 1);
+ pr_info("UV: mem_block_size set to 0x%lx\n", mem_block_size);
+ } else {
+ /* bad or zero value, default to 1UL << 31 (2GB) */
+ pr_err("UV: mem_block_size error with 0x%lx\n", mem_block_size);
+ set_memory_block_size_order(31);
+ }
+}
+
+/* Build GAM range lookup table: */
+static __init void build_uv_gr_table(void)
+{
+ struct uv_gam_range_entry *gre = uv_gre_table;
+ struct uv_gam_range_s *grt;
+ unsigned long last_limit = 0, ram_limit = 0;
+ int bytes, i, sid, lsid = -1, indx = 0, lindx = -1;
+
+ if (!gre)
+ return;
+
+ bytes = _gr_table_len * sizeof(struct uv_gam_range_s);
+ grt = kzalloc(bytes, GFP_KERNEL);
+ BUG_ON(!grt);
+ _gr_table = grt;
+
+ for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
+ if (gre->type == UV_GAM_RANGE_TYPE_HOLE) {
+ if (!ram_limit) {
+ /* Mark hole between RAM/non-RAM: */
+ ram_limit = last_limit;
+ last_limit = gre->limit;
+ lsid++;
+ continue;
+ }
+ last_limit = gre->limit;
+ pr_info("UV: extra hole in GAM RE table @%d\n", (int)(gre - uv_gre_table));
+ continue;
+ }
+ if (_max_socket < gre->sockid) {
+ pr_err("UV: GAM table sockid(%d) too large(>%d) @%d\n", gre->sockid, _max_socket, (int)(gre - uv_gre_table));
+ continue;
+ }
+ sid = gre->sockid - _min_socket;
+ if (lsid < sid) {
+ /* New range: */
+ grt = &_gr_table[indx];
+ grt->base = lindx;
+ grt->nasid = gre->nasid;
+ grt->limit = last_limit = gre->limit;
+ lsid = sid;
+ lindx = indx++;
+ continue;
+ }
+ /* Update range: */
+ if (lsid == sid && !ram_limit) {
+ /* .. if contiguous: */
+ if (grt->limit == last_limit) {
+ grt->limit = last_limit = gre->limit;
+ continue;
+ }
+ }
+ /* Non-contiguous RAM range: */
+ if (!ram_limit) {
+ grt++;
+ grt->base = lindx;
+ grt->nasid = gre->nasid;
+ grt->limit = last_limit = gre->limit;
+ continue;
+ }
+ /* Non-contiguous/non-RAM: */
+ grt++;
+ /* base is this entry */
+ grt->base = grt - _gr_table;
+ grt->nasid = gre->nasid;
+ grt->limit = last_limit = gre->limit;
+ lsid++;
+ }
+
+ /* Shorten table if possible */
+ grt++;
+ i = grt - _gr_table;
+ if (i < _gr_table_len) {
+ void *ret;
+
+ bytes = i * sizeof(struct uv_gam_range_s);
+ ret = krealloc(_gr_table, bytes, GFP_KERNEL);
+ if (ret) {
+ _gr_table = ret;
+ _gr_table_len = i;
+ }
+ }
+
+ /* Display resultant GAM range table: */
+ for (i = 0, grt = _gr_table; i < _gr_table_len; i++, grt++) {
+ unsigned long start, end;
+ int gb = grt->base;
+
+ start = gb < 0 ? 0 : (unsigned long)_gr_table[gb].limit << UV_GAM_RANGE_SHFT;
+ end = (unsigned long)grt->limit << UV_GAM_RANGE_SHFT;
+
+ pr_info("UV: GAM Range %2d %04x 0x%013lx-0x%013lx (%d)\n", i, grt->nasid, start, end, gb);
+ }
+}
+
+static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+{
+ unsigned long val;
+ int pnode;
+
+ pnode = uv_apicid_to_pnode(phys_apicid);
+ phys_apicid |= uv_apicid_hibits;
+
+ val = (1UL << UVH_IPI_INT_SEND_SHFT) |
+ (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+ ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
+ APIC_DM_INIT;
+
+ uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
+
+ val = (1UL << UVH_IPI_INT_SEND_SHFT) |
+ (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+ ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
+ APIC_DM_STARTUP;
+
+ uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
+
+ return 0;
+}
+
+static void uv_send_IPI_one(int cpu, int vector)
+{
+ unsigned long apicid;
+ int pnode;
+
+ apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ pnode = uv_apicid_to_pnode(apicid);
+ uv_hub_send_ipi(pnode, apicid, vector);
+}
+
+static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask)
+ uv_send_IPI_one(cpu, vector);
+}
+
+static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask) {
+ if (cpu != this_cpu)
+ uv_send_IPI_one(cpu, vector);
+ }
+}
+
+static void uv_send_IPI_allbutself(int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ for_each_online_cpu(cpu) {
+ if (cpu != this_cpu)
+ uv_send_IPI_one(cpu, vector);
+ }
+}
+
+static void uv_send_IPI_all(int vector)
+{
+ uv_send_IPI_mask(cpu_online_mask, vector);
+}
+
+static int uv_apic_id_valid(u32 apicid)
+{
+ return 1;
+}
+
+static int uv_apic_id_registered(void)
+{
+ return 1;
+}
+
+static void uv_init_apic_ldr(void)
+{
+}
+
+static u32 apic_uv_calc_apicid(unsigned int cpu)
+{
+ return apic_default_calc_apicid(cpu) | uv_apicid_hibits;
+}
+
+static unsigned int x2apic_get_apic_id(unsigned long x)
+{
+ unsigned int id;
+
+ WARN_ON(preemptible() && num_online_cpus() > 1);
+ id = x | __this_cpu_read(x2apic_extra_bits);
+
+ return id;
+}
+
+static u32 set_apic_id(unsigned int id)
+{
+ /* CHECKME: Do we need to mask out the xapic extra bits? */
+ return id;
+}
+
+static unsigned int uv_read_apic_id(void)
+{
+ return x2apic_get_apic_id(apic_read(APIC_ID));
+}
+
+static int uv_phys_pkg_id(int initial_apicid, int index_msb)
+{
+ return uv_read_apic_id() >> index_msb;
+}
+
+static void uv_send_IPI_self(int vector)
+{
+ apic_write(APIC_SELF_IPI, vector);
+}
+
+static int uv_probe(void)
+{
+ return apic == &apic_x2apic_uv_x;
+}
+
+static struct apic apic_x2apic_uv_x __ro_after_init = {
+
+ .name = "UV large system",
+ .probe = uv_probe,
+ .acpi_madt_oem_check = uv_acpi_madt_oem_check,
+ .apic_id_valid = uv_apic_id_valid,
+ .apic_id_registered = uv_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* Physical */
+
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = NULL,
+
+ .init_apic_ldr = uv_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .phys_pkg_id = uv_phys_pkg_id,
+
+ .get_apic_id = x2apic_get_apic_id,
+ .set_apic_id = set_apic_id,
+
+ .calc_dest_apicid = apic_uv_calc_apicid,
+
+ .send_IPI = uv_send_IPI_one,
+ .send_IPI_mask = uv_send_IPI_mask,
+ .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = uv_send_IPI_allbutself,
+ .send_IPI_all = uv_send_IPI_all,
+ .send_IPI_self = uv_send_IPI_self,
+
+ .wakeup_secondary_cpu = uv_wakeup_secondary,
+ .inquire_remote_apic = NULL,
+
+ .read = native_apic_msr_read,
+ .write = native_apic_msr_write,
+ .eoi_write = native_apic_msr_eoi_write,
+ .icr_read = native_x2apic_icr_read,
+ .icr_write = native_x2apic_icr_write,
+ .wait_icr_idle = native_x2apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
+};
+
+static void set_x2apic_extra_bits(int pnode)
+{
+ __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift);
+}
+
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH 3
+#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
+
+static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
+{
+ union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias;
+ union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect;
+ unsigned long m_redirect;
+ unsigned long m_overlay;
+ int i;
+
+ for (i = 0; i < UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH; i++) {
+ switch (i) {
+ case 0:
+ m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR;
+ m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR;
+ break;
+ case 1:
+ m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR;
+ m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR;
+ break;
+ case 2:
+ m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR;
+ m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR;
+ break;
+ }
+ alias.v = uv_read_local_mmr(m_overlay);
+ if (alias.s.enable && alias.s.base == 0) {
+ *size = (1UL << alias.s.m_alias);
+ redirect.v = uv_read_local_mmr(m_redirect);
+ *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT;
+ return;
+ }
+ }
+ *base = *size = 0;
+}
+
+enum map_type {map_wb, map_uc};
+
+static __init void map_high(char *id, unsigned long base, int pshift, int bshift, int max_pnode, enum map_type map_type)
+{
+ unsigned long bytes, paddr;
+
+ paddr = base << pshift;
+ bytes = (1UL << bshift) * (max_pnode + 1);
+ if (!paddr) {
+ pr_info("UV: Map %s_HI base address NULL\n", id);
+ return;
+ }
+ pr_debug("UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes);
+ if (map_type == map_uc)
+ init_extra_mapping_uc(paddr, bytes);
+ else
+ init_extra_mapping_wb(paddr, bytes);
+}
+
+static __init void map_gru_distributed(unsigned long c)
+{
+ union uvh_rh_gam_gru_overlay_config_mmr_u gru;
+ u64 paddr;
+ unsigned long bytes;
+ int nid;
+
+ gru.v = c;
+
+ /* Only base bits 42:28 relevant in dist mode */
+ gru_dist_base = gru.v & 0x000007fff0000000UL;
+ if (!gru_dist_base) {
+ pr_info("UV: Map GRU_DIST base address NULL\n");
+ return;
+ }
+
+ bytes = 1UL << UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ gru_dist_lmask = ((1UL << uv_hub_info->m_val) - 1) & ~(bytes - 1);
+ gru_dist_umask = ~((1UL << uv_hub_info->m_val) - 1);
+ gru_dist_base &= gru_dist_lmask; /* Clear bits above M */
+
+ for_each_online_node(nid) {
+ paddr = ((u64)uv_node_to_pnode(nid) << uv_hub_info->m_val) |
+ gru_dist_base;
+ init_extra_mapping_wb(paddr, bytes);
+ gru_first_node_paddr = min(paddr, gru_first_node_paddr);
+ gru_last_node_paddr = max(paddr, gru_last_node_paddr);
+ }
+
+ /* Save upper (63:M) bits of address only for is_GRU_range */
+ gru_first_node_paddr &= gru_dist_umask;
+ gru_last_node_paddr &= gru_dist_umask;
+
+ pr_debug("UV: Map GRU_DIST base 0x%016llx 0x%016llx - 0x%016llx\n", gru_dist_base, gru_first_node_paddr, gru_last_node_paddr);
+}
+
+static __init void map_gru_high(int max_pnode)
+{
+ union uvh_rh_gam_gru_overlay_config_mmr_u gru;
+ int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ unsigned long mask = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK;
+ unsigned long base;
+
+ gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
+ if (!gru.s.enable) {
+ pr_info("UV: GRU disabled\n");
+ return;
+ }
+
+ /* Only UV3 has distributed GRU mode */
+ if (is_uv3_hub() && gru.s3.mode) {
+ map_gru_distributed(gru.v);
+ return;
+ }
+
+ base = (gru.v & mask) >> shift;
+ map_high("GRU", base, shift, shift, max_pnode, map_wb);
+ gru_start_paddr = ((u64)base << shift);
+ gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
+}
+
+static __init void map_mmr_high(int max_pnode)
+{
+ union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
+ int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
+
+ mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
+ if (mmr.s.enable)
+ map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc);
+ else
+ pr_info("UV: MMR disabled\n");
+}
+
+/* UV3/4 have identical MMIOH overlay configs, UV4A is slightly different */
+static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode)
+{
+ unsigned long overlay;
+ unsigned long mmr;
+ unsigned long base;
+ unsigned long nasid_mask;
+ unsigned long m_overlay;
+ int i, n, shift, m_io, max_io;
+ int nasid, lnasid, fi, li;
+ char *id;
+
+ if (index == 0) {
+ id = "MMIOH0";
+ m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR;
+ overlay = uv_read_local_mmr(m_overlay);
+ base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK;
+ mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR;
+ m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK)
+ >> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT;
+ shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT;
+ n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH;
+ nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK;
+ } else {
+ id = "MMIOH1";
+ m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR;
+ overlay = uv_read_local_mmr(m_overlay);
+ base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK;
+ mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR;
+ m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK)
+ >> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT;
+ shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT;
+ n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH;
+ nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK;
+ }
+ pr_info("UV: %s overlay 0x%lx base:0x%lx m_io:%d\n", id, overlay, base, m_io);
+ if (!(overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK)) {
+ pr_info("UV: %s disabled\n", id);
+ return;
+ }
+
+ /* Convert to NASID: */
+ min_pnode *= 2;
+ max_pnode *= 2;
+ max_io = lnasid = fi = li = -1;
+
+ for (i = 0; i < n; i++) {
+ unsigned long m_redirect = mmr + i * 8;
+ unsigned long redirect = uv_read_local_mmr(m_redirect);
+
+ nasid = redirect & nasid_mask;
+ if (i == 0)
+ pr_info("UV: %s redirect base 0x%lx(@0x%lx) 0x%04x\n",
+ id, redirect, m_redirect, nasid);
+
+ /* Invalid NASID: */
+ if (nasid < min_pnode || max_pnode < nasid)
+ nasid = -1;
+
+ if (nasid == lnasid) {
+ li = i;
+ /* Last entry check: */
+ if (i != n-1)
+ continue;
+ }
+
+ /* Check if we have a cached (or last) redirect to print: */
+ if (lnasid != -1 || (i == n-1 && nasid != -1)) {
+ unsigned long addr1, addr2;
+ int f, l;
+
+ if (lnasid == -1) {
+ f = l = i;
+ lnasid = nasid;
+ } else {
+ f = fi;
+ l = li;
+ }
+ addr1 = (base << shift) + f * (1ULL << m_io);
+ addr2 = (base << shift) + (l + 1) * (1ULL << m_io);
+ pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", id, fi, li, lnasid, addr1, addr2);
+ if (max_io < l)
+ max_io = l;
+ }
+ fi = li = i;
+ lnasid = nasid;
+ }
+
+ pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n", id, base, shift, m_io, max_io);
+
+ if (max_io >= 0)
+ map_high(id, base, shift, m_io, max_io, map_uc);
+}
+
+static __init void map_mmioh_high(int min_pnode, int max_pnode)
+{
+ union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
+ unsigned long mmr, base;
+ int shift, enable, m_io, n_io;
+
+ if (is_uv3_hub() || is_uv4_hub()) {
+ /* Map both MMIOH regions: */
+ map_mmioh_high_uv34(0, min_pnode, max_pnode);
+ map_mmioh_high_uv34(1, min_pnode, max_pnode);
+ return;
+ }
+
+ if (is_uv1_hub()) {
+ mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
+ shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ mmioh.v = uv_read_local_mmr(mmr);
+ enable = !!mmioh.s1.enable;
+ base = mmioh.s1.base;
+ m_io = mmioh.s1.m_io;
+ n_io = mmioh.s1.n_io;
+ } else if (is_uv2_hub()) {
+ mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
+ shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ mmioh.v = uv_read_local_mmr(mmr);
+ enable = !!mmioh.s2.enable;
+ base = mmioh.s2.base;
+ m_io = mmioh.s2.m_io;
+ n_io = mmioh.s2.n_io;
+ } else {
+ return;
+ }
+
+ if (enable) {
+ max_pnode &= (1 << n_io) - 1;
+ pr_info("UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", base, shift, m_io, n_io, max_pnode);
+ map_high("MMIOH", base, shift, m_io, max_pnode, map_uc);
+ } else {
+ pr_info("UV: MMIOH disabled\n");
+ }
+}
+
+static __init void map_low_mmrs(void)
+{
+ init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
+ init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
+}
+
+static __init void uv_rtc_init(void)
+{
+ long status;
+ u64 ticks_per_sec;
+
+ status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec);
+
+ if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) {
+ pr_warn("UV: unable to determine platform RTC clock frequency, guessing.\n");
+
+ /* BIOS gives wrong value for clock frequency, so guess: */
+ sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
+ } else {
+ sn_rtc_cycles_per_second = ticks_per_sec;
+ }
+}
+
+/*
+ * percpu heartbeat timer
+ */
+static void uv_heartbeat(struct timer_list *timer)
+{
+ unsigned char bits = uv_scir_info->state;
+
+ /* Flip heartbeat bit: */
+ bits ^= SCIR_CPU_HEARTBEAT;
+
+ /* Is this CPU idle? */
+ if (idle_cpu(raw_smp_processor_id()))
+ bits &= ~SCIR_CPU_ACTIVITY;
+ else
+ bits |= SCIR_CPU_ACTIVITY;
+
+ /* Update system controller interface reg: */
+ uv_set_scir_bits(bits);
+
+ /* Enable next timer period: */
+ mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
+}
+
+static int uv_heartbeat_enable(unsigned int cpu)
+{
+ while (!uv_cpu_scir_info(cpu)->enabled) {
+ struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer;
+
+ uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
+ timer_setup(timer, uv_heartbeat, TIMER_PINNED);
+ timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
+ add_timer_on(timer, cpu);
+ uv_cpu_scir_info(cpu)->enabled = 1;
+
+ /* Also ensure that boot CPU is enabled: */
+ cpu = 0;
+ }
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int uv_heartbeat_disable(unsigned int cpu)
+{
+ if (uv_cpu_scir_info(cpu)->enabled) {
+ uv_cpu_scir_info(cpu)->enabled = 0;
+ del_timer(&uv_cpu_scir_info(cpu)->timer);
+ }
+ uv_set_cpu_scir_bits(cpu, 0xff);
+ return 0;
+}
+
+static __init void uv_scir_register_cpu_notifier(void)
+{
+ cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/x2apic-uvx:online",
+ uv_heartbeat_enable, uv_heartbeat_disable);
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+
+static __init void uv_scir_register_cpu_notifier(void)
+{
+}
+
+static __init int uv_init_heartbeat(void)
+{
+ int cpu;
+
+ if (is_uv_system()) {
+ for_each_online_cpu(cpu)
+ uv_heartbeat_enable(cpu);
+ }
+
+ return 0;
+}
+
+late_initcall(uv_init_heartbeat);
+
+#endif /* !CONFIG_HOTPLUG_CPU */
+
+/* Direct Legacy VGA I/O traffic to designated IOH */
+int uv_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags)
+{
+ int domain, bus, rc;
+
+ if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE))
+ return 0;
+
+ if ((command_bits & PCI_COMMAND_IO) == 0)
+ return 0;
+
+ domain = pci_domain_nr(pdev->bus);
+ bus = pdev->bus->number;
+
+ rc = uv_bios_set_legacy_vga_target(decode, domain, bus);
+
+ return rc;
+}
+
+/*
+ * Called on each CPU to initialize the per_cpu UV data area.
+ * FIXME: hotplug not supported yet
+ */
+void uv_cpu_init(void)
+{
+ /* CPU 0 initialization will be done via uv_system_init. */
+ if (smp_processor_id() == 0)
+ return;
+
+ uv_hub_info->nr_online_cpus++;
+
+ if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
+ set_x2apic_extra_bits(uv_hub_info->pnode);
+}
+
+struct mn {
+ unsigned char m_val;
+ unsigned char n_val;
+ unsigned char m_shift;
+ unsigned char n_lshift;
+};
+
+static void get_mn(struct mn *mnp)
+{
+ union uvh_rh_gam_config_mmr_u m_n_config;
+ union uv3h_gr0_gam_gr_config_u m_gr_config;
+
+ /* Make sure the whole structure is well initialized: */
+ memset(mnp, 0, sizeof(*mnp));
+
+ m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR);
+ mnp->n_val = m_n_config.s.n_skt;
+
+ if (is_uv4_hub()) {
+ mnp->m_val = 0;
+ mnp->n_lshift = 0;
+ } else if (is_uv3_hub()) {
+ mnp->m_val = m_n_config.s3.m_skt;
+ m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG);
+ mnp->n_lshift = m_gr_config.s3.m_skt;
+ } else if (is_uv2_hub()) {
+ mnp->m_val = m_n_config.s2.m_skt;
+ mnp->n_lshift = mnp->m_val == 40 ? 40 : 39;
+ } else if (is_uv1_hub()) {
+ mnp->m_val = m_n_config.s1.m_skt;
+ mnp->n_lshift = mnp->m_val;
+ }
+ mnp->m_shift = mnp->m_val ? 64 - mnp->m_val : 0;
+}
+
+void __init uv_init_hub_info(struct uv_hub_info_s *hi)
+{
+ union uvh_node_id_u node_id;
+ struct mn mn;
+
+ get_mn(&mn);
+ hi->gpa_mask = mn.m_val ?
+ (1UL << (mn.m_val + mn.n_val)) - 1 :
+ (1UL << uv_cpuid.gpa_shift) - 1;
+
+ hi->m_val = mn.m_val;
+ hi->n_val = mn.n_val;
+ hi->m_shift = mn.m_shift;
+ hi->n_lshift = mn.n_lshift ? mn.n_lshift : 0;
+ hi->hub_revision = uv_hub_info->hub_revision;
+ hi->pnode_mask = uv_cpuid.pnode_mask;
+ hi->min_pnode = _min_pnode;
+ hi->min_socket = _min_socket;
+ hi->pnode_to_socket = _pnode_to_socket;
+ hi->socket_to_node = _socket_to_node;
+ hi->socket_to_pnode = _socket_to_pnode;
+ hi->gr_table_len = _gr_table_len;
+ hi->gr_table = _gr_table;
+
+ node_id.v = uv_read_local_mmr(UVH_NODE_ID);
+ uv_cpuid.gnode_shift = max_t(unsigned int, uv_cpuid.gnode_shift, mn.n_val);
+ hi->gnode_extra = (node_id.s.node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1;
+ if (mn.m_val)
+ hi->gnode_upper = (u64)hi->gnode_extra << mn.m_val;
+
+ if (uv_gp_table) {
+ hi->global_mmr_base = uv_gp_table->mmr_base;
+ hi->global_mmr_shift = uv_gp_table->mmr_shift;
+ hi->global_gru_base = uv_gp_table->gru_base;
+ hi->global_gru_shift = uv_gp_table->gru_shift;
+ hi->gpa_shift = uv_gp_table->gpa_shift;
+ hi->gpa_mask = (1UL << hi->gpa_shift) - 1;
+ } else {
+ hi->global_mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE;
+ hi->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT;
+ }
+
+ get_lowmem_redirect(&hi->lowmem_remap_base, &hi->lowmem_remap_top);
+
+ hi->apic_pnode_shift = uv_cpuid.socketid_shift;
+
+ /* Show system specific info: */
+ pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", hi->n_val, hi->m_val, hi->m_shift, hi->n_lshift);
+ pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", hi->gpa_mask, hi->gpa_shift, hi->pnode_mask, hi->apic_pnode_shift);
+ pr_info("UV: mmr_base/shift:0x%lx/%ld gru_base/shift:0x%lx/%ld\n", hi->global_mmr_base, hi->global_mmr_shift, hi->global_gru_base, hi->global_gru_shift);
+ pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", hi->gnode_upper, hi->gnode_extra);
+}
+
+static void __init decode_gam_params(unsigned long ptr)
+{
+ uv_gp_table = (struct uv_gam_parameters *)ptr;
+
+ pr_info("UV: GAM Params...\n");
+ pr_info("UV: mmr_base/shift:0x%llx/%d gru_base/shift:0x%llx/%d gpa_shift:%d\n",
+ uv_gp_table->mmr_base, uv_gp_table->mmr_shift,
+ uv_gp_table->gru_base, uv_gp_table->gru_shift,
+ uv_gp_table->gpa_shift);
+}
+
+static void __init decode_gam_rng_tbl(unsigned long ptr)
+{
+ struct uv_gam_range_entry *gre = (struct uv_gam_range_entry *)ptr;
+ unsigned long lgre = 0;
+ int index = 0;
+ int sock_min = 999999, pnode_min = 99999;
+ int sock_max = -1, pnode_max = -1;
+
+ uv_gre_table = gre;
+ for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
+ unsigned long size = ((unsigned long)(gre->limit - lgre)
+ << UV_GAM_RANGE_SHFT);
+ int order = 0;
+ char suffix[] = " KMGTPE";
+ int flag = ' ';
+
+ while (size > 9999 && order < sizeof(suffix)) {
+ size /= 1024;
+ order++;
+ }
+
+ /* adjust max block size to current range start */
+ if (gre->type == 1 || gre->type == 2)
+ if (adj_blksize(lgre))
+ flag = '*';
+
+ if (!index) {
+ pr_info("UV: GAM Range Table...\n");
+ pr_info("UV: # %20s %14s %6s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN");
+ }
+ pr_info("UV: %2d: 0x%014lx-0x%014lx%c %5lu%c %3d %04x %02x %02x\n",
+ index++,
+ (unsigned long)lgre << UV_GAM_RANGE_SHFT,
+ (unsigned long)gre->limit << UV_GAM_RANGE_SHFT,
+ flag, size, suffix[order],
+ gre->type, gre->nasid, gre->sockid, gre->pnode);
+
+ /* update to next range start */
+ lgre = gre->limit;
+ if (sock_min > gre->sockid)
+ sock_min = gre->sockid;
+ if (sock_max < gre->sockid)
+ sock_max = gre->sockid;
+ if (pnode_min > gre->pnode)
+ pnode_min = gre->pnode;
+ if (pnode_max < gre->pnode)
+ pnode_max = gre->pnode;
+ }
+ _min_socket = sock_min;
+ _max_socket = sock_max;
+ _min_pnode = pnode_min;
+ _max_pnode = pnode_max;
+ _gr_table_len = index;
+
+ pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", index, _min_socket, _max_socket, _min_pnode, _max_pnode);
+}
+
+static int __init decode_uv_systab(void)
+{
+ struct uv_systab *st;
+ int i;
+
+ if (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE)
+ return 0; /* No extended UVsystab required */
+
+ st = uv_systab;
+ if ((!st) || (st->revision < UV_SYSTAB_VERSION_UV4_LATEST)) {
+ int rev = st ? st->revision : 0;
+
+ pr_err("UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n", rev, UV_SYSTAB_VERSION_UV4_LATEST);
+ pr_err("UV: Cannot support UV operations, switching to generic PC\n");
+ uv_system_type = UV_NONE;
+
+ return -EINVAL;
+ }
+
+ for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) {
+ unsigned long ptr = st->entry[i].offset;
+
+ if (!ptr)
+ continue;
+
+ ptr = ptr + (unsigned long)st;
+
+ switch (st->entry[i].type) {
+ case UV_SYSTAB_TYPE_GAM_PARAMS:
+ decode_gam_params(ptr);
+ break;
+
+ case UV_SYSTAB_TYPE_GAM_RNG_TBL:
+ decode_gam_rng_tbl(ptr);
+ break;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Set up physical blade translations from UVH_NODE_PRESENT_TABLE
+ * .. NB: UVH_NODE_PRESENT_TABLE is going away,
+ * .. being replaced by GAM Range Table
+ */
+static __init void boot_init_possible_blades(struct uv_hub_info_s *hub_info)
+{
+ int i, uv_pb = 0;
+
+ pr_info("UV: NODE_PRESENT_DEPTH = %d\n", UVH_NODE_PRESENT_TABLE_DEPTH);
+ for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) {
+ unsigned long np;
+
+ np = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8);
+ if (np)
+ pr_info("UV: NODE_PRESENT(%d) = 0x%016lx\n", i, np);
+
+ uv_pb += hweight64(np);
+ }
+ if (uv_possible_blades != uv_pb)
+ uv_possible_blades = uv_pb;
+}
+
+static void __init build_socket_tables(void)
+{
+ struct uv_gam_range_entry *gre = uv_gre_table;
+ int num, nump;
+ int cpu, i, lnid;
+ int minsock = _min_socket;
+ int maxsock = _max_socket;
+ int minpnode = _min_pnode;
+ int maxpnode = _max_pnode;
+ size_t bytes;
+
+ if (!gre) {
+ if (is_uv1_hub() || is_uv2_hub() || is_uv3_hub()) {
+ pr_info("UV: No UVsystab socket table, ignoring\n");
+ return;
+ }
+ pr_crit("UV: Error: UVsystab address translations not available!\n");
+ BUG();
+ }
+
+ /* Build socket id -> node id, pnode */
+ num = maxsock - minsock + 1;
+ bytes = num * sizeof(_socket_to_node[0]);
+ _socket_to_node = kmalloc(bytes, GFP_KERNEL);
+ _socket_to_pnode = kmalloc(bytes, GFP_KERNEL);
+
+ nump = maxpnode - minpnode + 1;
+ bytes = nump * sizeof(_pnode_to_socket[0]);
+ _pnode_to_socket = kmalloc(bytes, GFP_KERNEL);
+ BUG_ON(!_socket_to_node || !_socket_to_pnode || !_pnode_to_socket);
+
+ for (i = 0; i < num; i++)
+ _socket_to_node[i] = _socket_to_pnode[i] = SOCK_EMPTY;
+
+ for (i = 0; i < nump; i++)
+ _pnode_to_socket[i] = SOCK_EMPTY;
+
+ /* Fill in pnode/node/addr conversion list values: */
+ pr_info("UV: GAM Building socket/pnode conversion tables\n");
+ for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
+ if (gre->type == UV_GAM_RANGE_TYPE_HOLE)
+ continue;
+ i = gre->sockid - minsock;
+ /* Duplicate: */
+ if (_socket_to_pnode[i] != SOCK_EMPTY)
+ continue;
+ _socket_to_pnode[i] = gre->pnode;
+
+ i = gre->pnode - minpnode;
+ _pnode_to_socket[i] = gre->sockid;
+
+ pr_info("UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n",
+ gre->sockid, gre->type, gre->nasid,
+ _socket_to_pnode[gre->sockid - minsock],
+ _pnode_to_socket[gre->pnode - minpnode]);
+ }
+
+ /* Set socket -> node values: */
+ lnid = -1;
+ for_each_present_cpu(cpu) {
+ int nid = cpu_to_node(cpu);
+ int apicid, sockid;
+
+ if (lnid == nid)
+ continue;
+ lnid = nid;
+ apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ sockid = apicid >> uv_cpuid.socketid_shift;
+ _socket_to_node[sockid - minsock] = nid;
+ pr_info("UV: sid:%02x: apicid:%04x node:%2d\n",
+ sockid, apicid, nid);
+ }
+
+ /* Set up physical blade to pnode translation from GAM Range Table: */
+ bytes = num_possible_nodes() * sizeof(_node_to_pnode[0]);
+ _node_to_pnode = kmalloc(bytes, GFP_KERNEL);
+ BUG_ON(!_node_to_pnode);
+
+ for (lnid = 0; lnid < num_possible_nodes(); lnid++) {
+ unsigned short sockid;
+
+ for (sockid = minsock; sockid <= maxsock; sockid++) {
+ if (lnid == _socket_to_node[sockid - minsock]) {
+ _node_to_pnode[lnid] = _socket_to_pnode[sockid - minsock];
+ break;
+ }
+ }
+ if (sockid > maxsock) {
+ pr_err("UV: socket for node %d not found!\n", lnid);
+ BUG();
+ }
+ }
+
+ /*
+ * If socket id == pnode or socket id == node for all nodes,
+ * system runs faster by removing corresponding conversion table.
+ */
+ pr_info("UV: Checking socket->node/pnode for identity maps\n");
+ if (minsock == 0) {
+ for (i = 0; i < num; i++)
+ if (_socket_to_node[i] == SOCK_EMPTY || i != _socket_to_node[i])
+ break;
+ if (i >= num) {
+ kfree(_socket_to_node);
+ _socket_to_node = NULL;
+ pr_info("UV: 1:1 socket_to_node table removed\n");
+ }
+ }
+ if (minsock == minpnode) {
+ for (i = 0; i < num; i++)
+ if (_socket_to_pnode[i] != SOCK_EMPTY &&
+ _socket_to_pnode[i] != i + minpnode)
+ break;
+ if (i >= num) {
+ kfree(_socket_to_pnode);
+ _socket_to_pnode = NULL;
+ pr_info("UV: 1:1 socket_to_pnode table removed\n");
+ }
+ }
+}
+
+static void __init uv_system_init_hub(void)
+{
+ struct uv_hub_info_s hub_info = {0};
+ int bytes, cpu, nodeid;
+ unsigned short min_pnode = 9999, max_pnode = 0;
+ char *hub = is_uv4_hub() ? "UV400" :
+ is_uv3_hub() ? "UV300" :
+ is_uv2_hub() ? "UV2000/3000" :
+ is_uv1_hub() ? "UV100/1000" : NULL;
+
+ if (!hub) {
+ pr_err("UV: Unknown/unsupported UV hub\n");
+ return;
+ }
+ pr_info("UV: Found %s hub\n", hub);
+
+ map_low_mmrs();
+
+ /* Get uv_systab for decoding: */
+ uv_bios_init();
+
+ /* If there's an UVsystab problem then abort UV init: */
+ if (decode_uv_systab() < 0)
+ return;
+
+ build_socket_tables();
+ build_uv_gr_table();
+ set_block_size();
+ uv_init_hub_info(&hub_info);
+ uv_possible_blades = num_possible_nodes();
+ if (!_node_to_pnode)
+ boot_init_possible_blades(&hub_info);
+
+ /* uv_num_possible_blades() is really the hub count: */
+ pr_info("UV: Found %d hubs, %d nodes, %d CPUs\n", uv_num_possible_blades(), num_possible_nodes(), num_possible_cpus());
+
+ uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size, &system_serial_number);
+ hub_info.coherency_domain_number = sn_coherency_id;
+ uv_rtc_init();
+
+ bytes = sizeof(void *) * uv_num_possible_blades();
+ __uv_hub_info_list = kzalloc(bytes, GFP_KERNEL);
+ BUG_ON(!__uv_hub_info_list);
+
+ bytes = sizeof(struct uv_hub_info_s);
+ for_each_node(nodeid) {
+ struct uv_hub_info_s *new_hub;
+
+ if (__uv_hub_info_list[nodeid]) {
+ pr_err("UV: Node %d UV HUB already initialized!?\n", nodeid);
+ BUG();
+ }
+
+ /* Allocate new per hub info list */
+ new_hub = (nodeid == 0) ? &uv_hub_info_node0 : kzalloc_node(bytes, GFP_KERNEL, nodeid);
+ BUG_ON(!new_hub);
+ __uv_hub_info_list[nodeid] = new_hub;
+ new_hub = uv_hub_info_list(nodeid);
+ BUG_ON(!new_hub);
+ *new_hub = hub_info;
+
+ /* Use information from GAM table if available: */
+ if (_node_to_pnode)
+ new_hub->pnode = _node_to_pnode[nodeid];
+ else /* Or fill in during CPU loop: */
+ new_hub->pnode = 0xffff;
+
+ new_hub->numa_blade_id = uv_node_to_blade_id(nodeid);
+ new_hub->memory_nid = -1;
+ new_hub->nr_possible_cpus = 0;
+ new_hub->nr_online_cpus = 0;
+ }
+
+ /* Initialize per CPU info: */
+ for_each_possible_cpu(cpu) {
+ int apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ int numa_node_id;
+ unsigned short pnode;
+
+ nodeid = cpu_to_node(cpu);
+ numa_node_id = numa_cpu_node(cpu);
+ pnode = uv_apicid_to_pnode(apicid);
+
+ uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid);
+ uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++;
+ if (uv_cpu_hub_info(cpu)->memory_nid == -1)
+ uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu);
+
+ /* Init memoryless node: */
+ if (nodeid != numa_node_id &&
+ uv_hub_info_list(numa_node_id)->pnode == 0xffff)
+ uv_hub_info_list(numa_node_id)->pnode = pnode;
+ else if (uv_cpu_hub_info(cpu)->pnode == 0xffff)
+ uv_cpu_hub_info(cpu)->pnode = pnode;
+
+ uv_cpu_scir_info(cpu)->offset = uv_scir_offset(apicid);
+ }
+
+ for_each_node(nodeid) {
+ unsigned short pnode = uv_hub_info_list(nodeid)->pnode;
+
+ /* Add pnode info for pre-GAM list nodes without CPUs: */
+ if (pnode == 0xffff) {
+ unsigned long paddr;
+
+ paddr = node_start_pfn(nodeid) << PAGE_SHIFT;
+ pnode = uv_gpa_to_pnode(uv_soc_phys_ram_to_gpa(paddr));
+ uv_hub_info_list(nodeid)->pnode = pnode;
+ }
+ min_pnode = min(pnode, min_pnode);
+ max_pnode = max(pnode, max_pnode);
+ pr_info("UV: UVHUB node:%2d pn:%02x nrcpus:%d\n",
+ nodeid,
+ uv_hub_info_list(nodeid)->pnode,
+ uv_hub_info_list(nodeid)->nr_possible_cpus);
+ }
+
+ pr_info("UV: min_pnode:%02x max_pnode:%02x\n", min_pnode, max_pnode);
+ map_gru_high(max_pnode);
+ map_mmr_high(max_pnode);
+ map_mmioh_high(min_pnode, max_pnode);
+
+ uv_nmi_setup();
+ uv_cpu_init();
+ uv_scir_register_cpu_notifier();
+ proc_mkdir("sgi_uv", NULL);
+
+ /* Register Legacy VGA I/O redirection handler: */
+ pci_register_set_vga_state(uv_set_vga_state);
+
+ /*
+ * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as
+ * EFI is not enabled in the kdump kernel:
+ */
+ if (is_kdump_kernel())
+ reboot_type = BOOT_ACPI;
+}
+
+/*
+ * There is a small amount of UV specific code needed to initialize a
+ * UV system that does not have a "UV HUB" (referred to as "hubless").
+ */
+void __init uv_system_init(void)
+{
+ if (likely(!is_uv_system() && !is_uv_hubless()))
+ return;
+
+ if (is_uv_system())
+ uv_system_init_hub();
+ else
+ uv_nmi_setup_hubless();
+}
+
+apic_driver(apic_x2apic_uv_x);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
new file mode 100644
index 000000000..f7151cd03
--- /dev/null
+++ b/arch/x86/kernel/apm_32.c
@@ -0,0 +1,2447 @@
+/* -*- linux-c -*-
+ * APM BIOS driver for Linux
+ * Copyright 1994-2001 Stephen Rothwell (sfr@canb.auug.org.au)
+ *
+ * Initial development of this driver was funded by NEC Australia P/L
+ * and NEC Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * October 1995, Rik Faith (faith@cs.unc.edu):
+ * Minor enhancements and updates (to the patch set) for 1.3.x
+ * Documentation
+ * January 1996, Rik Faith (faith@cs.unc.edu):
+ * Make /proc/apm easy to format (bump driver version)
+ * March 1996, Rik Faith (faith@cs.unc.edu):
+ * Prohibit APM BIOS calls unless apm_enabled.
+ * (Thanks to Ulrich Windl <Ulrich.Windl@rz.uni-regensburg.de>)
+ * April 1996, Stephen Rothwell (sfr@canb.auug.org.au)
+ * Version 1.0 and 1.1
+ * May 1996, Version 1.2
+ * Feb 1998, Version 1.3
+ * Feb 1998, Version 1.4
+ * Aug 1998, Version 1.5
+ * Sep 1998, Version 1.6
+ * Nov 1998, Version 1.7
+ * Jan 1999, Version 1.8
+ * Jan 1999, Version 1.9
+ * Oct 1999, Version 1.10
+ * Nov 1999, Version 1.11
+ * Jan 2000, Version 1.12
+ * Feb 2000, Version 1.13
+ * Nov 2000, Version 1.14
+ * Oct 2001, Version 1.15
+ * Jan 2002, Version 1.16
+ * Oct 2002, Version 1.16ac
+ *
+ * History:
+ * 0.6b: first version in official kernel, Linux 1.3.46
+ * 0.7: changed /proc/apm format, Linux 1.3.58
+ * 0.8: fixed gcc 2.7.[12] compilation problems, Linux 1.3.59
+ * 0.9: only call bios if bios is present, Linux 1.3.72
+ * 1.0: use fixed device number, consolidate /proc/apm into this file,
+ * Linux 1.3.85
+ * 1.1: support user-space standby and suspend, power off after system
+ * halted, Linux 1.3.98
+ * 1.2: When resetting RTC after resume, take care so that the time
+ * is only incorrect by 30-60mS (vs. 1S previously) (Gabor J. Toth
+ * <jtoth@princeton.edu>); improve interaction between
+ * screen-blanking and gpm (Stephen Rothwell); Linux 1.99.4
+ * 1.2a:Simple change to stop mysterious bug reports with SMP also added
+ * levels to the printk calls. APM is not defined for SMP machines.
+ * The new replacement for it is, but Linux doesn't yet support this.
+ * Alan Cox Linux 2.1.55
+ * 1.3: Set up a valid data descriptor 0x40 for buggy BIOS's
+ * 1.4: Upgraded to support APM 1.2. Integrated ThinkPad suspend patch by
+ * Dean Gaudet <dgaudet@arctic.org>.
+ * C. Scott Ananian <cananian@alumni.princeton.edu> Linux 2.1.87
+ * 1.5: Fix segment register reloading (in case of bad segments saved
+ * across BIOS call).
+ * Stephen Rothwell
+ * 1.6: Cope with compiler/assembler differences.
+ * Only try to turn off the first display device.
+ * Fix OOPS at power off with no APM BIOS by Jan Echternach
+ * <echter@informatik.uni-rostock.de>
+ * Stephen Rothwell
+ * 1.7: Modify driver's cached copy of the disabled/disengaged flags
+ * to reflect current state of APM BIOS.
+ * Chris Rankin <rankinc@bellsouth.net>
+ * Reset interrupt 0 timer to 100Hz after suspend
+ * Chad Miller <cmiller@surfsouth.com>
+ * Add CONFIG_APM_IGNORE_SUSPEND_BOUNCE
+ * Richard Gooch <rgooch@atnf.csiro.au>
+ * Allow boot time disabling of APM
+ * Make boot messages far less verbose by default
+ * Make asm safer
+ * Stephen Rothwell
+ * 1.8: Add CONFIG_APM_RTC_IS_GMT
+ * Richard Gooch <rgooch@atnf.csiro.au>
+ * change APM_NOINTS to CONFIG_APM_ALLOW_INTS
+ * remove dependency on CONFIG_PROC_FS
+ * Stephen Rothwell
+ * 1.9: Fix small typo. <laslo@wodip.opole.pl>
+ * Try to cope with BIOS's that need to have all display
+ * devices blanked and not just the first one.
+ * Ross Paterson <ross@soi.city.ac.uk>
+ * Fix segment limit setting it has always been wrong as
+ * the segments needed to have byte granularity.
+ * Mark a few things __init.
+ * Add hack to allow power off of SMP systems by popular request.
+ * Use CONFIG_SMP instead of __SMP__
+ * Ignore BOUNCES for three seconds.
+ * Stephen Rothwell
+ * 1.10: Fix for Thinkpad return code.
+ * Merge 2.2 and 2.3 drivers.
+ * Remove APM dependencies in arch/i386/kernel/process.c
+ * Remove APM dependencies in drivers/char/sysrq.c
+ * Reset time across standby.
+ * Allow more inititialisation on SMP.
+ * Remove CONFIG_APM_POWER_OFF and make it boot time
+ * configurable (default on).
+ * Make debug only a boot time parameter (remove APM_DEBUG).
+ * Try to blank all devices on any error.
+ * 1.11: Remove APM dependencies in drivers/char/console.c
+ * Check nr_running to detect if we are idle (from
+ * Borislav Deianov <borislav@lix.polytechnique.fr>)
+ * Fix for bioses that don't zero the top part of the
+ * entrypoint offset (Mario Sitta <sitta@al.unipmn.it>)
+ * (reported by Panos Katsaloulis <teras@writeme.com>).
+ * Real mode power off patch (Walter Hofmann
+ * <Walter.Hofmann@physik.stud.uni-erlangen.de>).
+ * 1.12: Remove CONFIG_SMP as the compiler will optimize
+ * the code away anyway (smp_num_cpus == 1 in UP)
+ * noted by Artur Skawina <skawina@geocities.com>.
+ * Make power off under SMP work again.
+ * Fix thinko with initial engaging of BIOS.
+ * Make sure power off only happens on CPU 0
+ * (Paul "Rusty" Russell <rusty@rustcorp.com.au>).
+ * Do error notification to user mode if BIOS calls fail.
+ * Move entrypoint offset fix to ...boot/setup.S
+ * where it belongs (Cosmos <gis88564@cis.nctu.edu.tw>).
+ * Remove smp-power-off. SMP users must now specify
+ * "apm=power-off" on the kernel command line. Suggested
+ * by Jim Avera <jima@hal.com>, modified by Alan Cox
+ * <alan@lxorguk.ukuu.org.uk>.
+ * Register the /proc/apm entry even on SMP so that
+ * scripts that check for it before doing power off
+ * work (Jim Avera <jima@hal.com>).
+ * 1.13: Changes for new pm_ interfaces (Andy Henroid
+ * <andy_henroid@yahoo.com>).
+ * Modularize the code.
+ * Fix the Thinkpad (again) :-( (CONFIG_APM_IGNORE_MULTIPLE_SUSPENDS
+ * is now the way life works).
+ * Fix thinko in suspend() (wrong return).
+ * Notify drivers on critical suspend.
+ * Make kapmd absorb more idle time (Pavel Machek <pavel@ucw.cz>
+ * modified by sfr).
+ * Disable interrupts while we are suspended (Andy Henroid
+ * <andy_henroid@yahoo.com> fixed by sfr).
+ * Make power off work on SMP again (Tony Hoyle
+ * <tmh@magenta-logic.com> and <zlatko@iskon.hr>) modified by sfr.
+ * Remove CONFIG_APM_SUSPEND_BOUNCE. The bounce ignore
+ * interval is now configurable.
+ * 1.14: Make connection version persist across module unload/load.
+ * Enable and engage power management earlier.
+ * Disengage power management on module unload.
+ * Changed to use the sysrq-register hack for registering the
+ * power off function called by magic sysrq based upon discussions
+ * in irc://irc.openprojects.net/#kernelnewbies
+ * (Crutcher Dunnavant <crutcher+kernel@datastacks.com>).
+ * Make CONFIG_APM_REAL_MODE_POWER_OFF run time configurable.
+ * (Arjan van de Ven <arjanv@redhat.com>) modified by sfr.
+ * Work around byte swap bug in one of the Vaio's BIOS's
+ * (Marc Boucher <marc@mbsi.ca>).
+ * Exposed the disable flag to dmi so that we can handle known
+ * broken APM (Alan Cox <alan@lxorguk.ukuu.org.uk>).
+ * 1.14ac: If the BIOS says "I slowed the CPU down" then don't spin
+ * calling it - instead idle. (Alan Cox <alan@lxorguk.ukuu.org.uk>)
+ * If an APM idle fails log it and idle sensibly
+ * 1.15: Don't queue events to clients who open the device O_WRONLY.
+ * Don't expect replies from clients who open the device O_RDONLY.
+ * (Idea from Thomas Hood)
+ * Minor waitqueue cleanups. (John Fremlin <chief@bandits.org>)
+ * 1.16: Fix idle calling. (Andreas Steinmetz <ast@domdv.de> et al.)
+ * Notify listeners of standby or suspend events before notifying
+ * drivers. Return EBUSY to ioctl() if suspend is rejected.
+ * (Russell King <rmk@arm.linux.org.uk> and Thomas Hood)
+ * Ignore first resume after we generate our own resume event
+ * after a suspend (Thomas Hood)
+ * Daemonize now gets rid of our controlling terminal (sfr).
+ * CONFIG_APM_CPU_IDLE now just affects the default value of
+ * idle_threshold (sfr).
+ * Change name of kernel apm daemon (as it no longer idles) (sfr).
+ * 1.16ac: Fix up SMP support somewhat. You can now force SMP on and we
+ * make _all_ APM calls on the CPU#0. Fix unsafe sign bug.
+ * TODO: determine if its "boot CPU" or "CPU0" we want to lock to.
+ *
+ * APM 1.1 Reference:
+ *
+ * Intel Corporation, Microsoft Corporation. Advanced Power Management
+ * (APM) BIOS Interface Specification, Revision 1.1, September 1993.
+ * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01.
+ *
+ * [This document is available free from Intel by calling 800.628.8686 (fax
+ * 916.356.6100) or 800.548.4725; or from
+ * http://www.microsoft.com/whdc/archive/amp_12.mspx It is also
+ * available from Microsoft by calling 206.882.8080.]
+ *
+ * APM 1.2 Reference:
+ * Intel Corporation, Microsoft Corporation. Advanced Power Management
+ * (APM) BIOS Interface Specification, Revision 1.2, February 1996.
+ *
+ * [This document is available from Microsoft at:
+ * http://www.microsoft.com/whdc/archive/amp_12.mspx]
+ */
+
+#define pr_fmt(fmt) "apm: " fmt
+
+#include <linux/module.h>
+
+#include <linux/poll.h>
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/timer.h>
+#include <linux/fcntl.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/miscdevice.h>
+#include <linux/apm_bios.h>
+#include <linux/init.h>
+#include <linux/time.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/cputime.h>
+#include <linux/pm.h>
+#include <linux/capability.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/freezer.h>
+#include <linux/smp.h>
+#include <linux/dmi.h>
+#include <linux/suspend.h>
+#include <linux/kthread.h>
+#include <linux/jiffies.h>
+#include <linux/acpi.h>
+#include <linux/syscore_ops.h>
+#include <linux/i8253.h>
+#include <linux/cpuidle.h>
+
+#include <linux/uaccess.h>
+#include <asm/desc.h>
+#include <asm/olpc.h>
+#include <asm/paravirt.h>
+#include <asm/reboot.h>
+#include <asm/nospec-branch.h>
+
+#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
+extern int (*console_blank_hook)(int);
+#endif
+
+/*
+ * The apm_bios device is one of the misc char devices.
+ * This is its minor number.
+ */
+#define APM_MINOR_DEV 134
+
+/*
+ * Various options can be changed at boot time as follows:
+ * (We allow underscores for compatibility with the modules code)
+ * apm=on/off enable/disable APM
+ * [no-]allow[-_]ints allow interrupts during BIOS calls
+ * [no-]broken[-_]psr BIOS has a broken GetPowerStatus call
+ * [no-]realmode[-_]power[-_]off switch to real mode before
+ * powering off
+ * [no-]debug log some debugging messages
+ * [no-]power[-_]off power off on shutdown
+ * [no-]smp Use apm even on an SMP box
+ * bounce[-_]interval=<n> number of ticks to ignore suspend
+ * bounces
+ * idle[-_]threshold=<n> System idle percentage above which to
+ * make APM BIOS idle calls. Set it to
+ * 100 to disable.
+ * idle[-_]period=<n> Period (in 1/100s of a second) over
+ * which the idle percentage is
+ * calculated.
+ */
+
+/* KNOWN PROBLEM MACHINES:
+ *
+ * U: TI 4000M TravelMate: BIOS is *NOT* APM compliant
+ * [Confirmed by TI representative]
+ * ?: ACER 486DX4/75: uses dseg 0040, in violation of APM specification
+ * [Confirmed by BIOS disassembly]
+ * [This may work now ...]
+ * P: Toshiba 1950S: battery life information only gets updated after resume
+ * P: Midwest Micro Soundbook Elite DX2/66 monochrome: screen blanking
+ * broken in BIOS [Reported by Garst R. Reese <reese@isn.net>]
+ * ?: AcerNote-950: oops on reading /proc/apm - workaround is a WIP
+ * Neale Banks <neale@lowendale.com.au> December 2000
+ *
+ * Legend: U = unusable with APM patches
+ * P = partially usable with APM patches
+ */
+
+/*
+ * Define as 1 to make the driver always call the APM BIOS busy
+ * routine even if the clock was not reported as slowed by the
+ * idle routine. Otherwise, define as 0.
+ */
+#define ALWAYS_CALL_BUSY 1
+
+/*
+ * Define to make the APM BIOS calls zero all data segment registers (so
+ * that an incorrect BIOS implementation will cause a kernel panic if it
+ * tries to write to arbitrary memory).
+ */
+#define APM_ZERO_SEGS
+
+#include <asm/apm.h>
+
+/*
+ * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend.
+ * This patched by Chad Miller <cmiller@surfsouth.com>, original code by
+ * David Chen <chen@ctpa04.mit.edu>
+ */
+#undef INIT_TIMER_AFTER_SUSPEND
+
+#ifdef INIT_TIMER_AFTER_SUSPEND
+#include <linux/timex.h>
+#include <asm/io.h>
+#include <linux/delay.h>
+#endif
+
+/*
+ * Need to poll the APM BIOS every second
+ */
+#define APM_CHECK_TIMEOUT (HZ)
+
+/*
+ * Ignore suspend events for this amount of time after a resume
+ */
+#define DEFAULT_BOUNCE_INTERVAL (3 * HZ)
+
+/*
+ * Maximum number of events stored
+ */
+#define APM_MAX_EVENTS 20
+
+/*
+ * The per-file APM data
+ */
+struct apm_user {
+ int magic;
+ struct apm_user *next;
+ unsigned int suser: 1;
+ unsigned int writer: 1;
+ unsigned int reader: 1;
+ unsigned int suspend_wait: 1;
+ int suspend_result;
+ int suspends_pending;
+ int standbys_pending;
+ int suspends_read;
+ int standbys_read;
+ int event_head;
+ int event_tail;
+ apm_event_t events[APM_MAX_EVENTS];
+};
+
+/*
+ * The magic number in apm_user
+ */
+#define APM_BIOS_MAGIC 0x4101
+
+/*
+ * idle percentage above which bios idle calls are done
+ */
+#ifdef CONFIG_APM_CPU_IDLE
+#define DEFAULT_IDLE_THRESHOLD 95
+#else
+#define DEFAULT_IDLE_THRESHOLD 100
+#endif
+#define DEFAULT_IDLE_PERIOD (100 / 3)
+
+static int apm_cpu_idle(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int index);
+
+static struct cpuidle_driver apm_idle_driver = {
+ .name = "apm_idle",
+ .owner = THIS_MODULE,
+ .states = {
+ { /* entry 0 is for polling */ },
+ { /* entry 1 is for APM idle */
+ .name = "APM",
+ .desc = "APM idle",
+ .exit_latency = 250, /* WAG */
+ .target_residency = 500, /* WAG */
+ .enter = &apm_cpu_idle
+ },
+ },
+ .state_count = 2,
+};
+
+static struct cpuidle_device apm_cpuidle_device;
+
+/*
+ * Local variables
+ */
+__visible struct {
+ unsigned long offset;
+ unsigned short segment;
+} apm_bios_entry;
+static int clock_slowed;
+static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
+static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
+static int suspends_pending;
+static int standbys_pending;
+static int ignore_sys_suspend;
+static int ignore_normal_resume;
+static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
+
+static bool debug __read_mostly;
+static bool smp __read_mostly;
+static int apm_disabled = -1;
+#ifdef CONFIG_SMP
+static bool power_off;
+#else
+static bool power_off = 1;
+#endif
+static bool realmode_power_off;
+#ifdef CONFIG_APM_ALLOW_INTS
+static bool allow_ints = 1;
+#else
+static bool allow_ints;
+#endif
+static bool broken_psr;
+
+static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
+static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
+static struct apm_user *user_list;
+static DEFINE_SPINLOCK(user_list_lock);
+static DEFINE_MUTEX(apm_mutex);
+
+/*
+ * Set up a segment that references the real mode segment 0x40
+ * that extends up to the end of page zero (that we have reserved).
+ * This is for buggy BIOS's that refer to (real mode) segment 0x40
+ * even though they are called in protected mode.
+ */
+static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092,
+ (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1);
+
+static const char driver_version[] = "1.16ac"; /* no spaces */
+
+static struct task_struct *kapmd_task;
+
+/*
+ * APM event names taken from the APM 1.2 specification. These are
+ * the message codes that the BIOS uses to tell us about events
+ */
+static const char * const apm_event_name[] = {
+ "system standby",
+ "system suspend",
+ "normal resume",
+ "critical resume",
+ "low battery",
+ "power status change",
+ "update time",
+ "critical suspend",
+ "user standby",
+ "user suspend",
+ "system standby resume",
+ "capabilities change"
+};
+#define NR_APM_EVENT_NAME ARRAY_SIZE(apm_event_name)
+
+typedef struct lookup_t {
+ int key;
+ char *msg;
+} lookup_t;
+
+/*
+ * The BIOS returns a set of standard error codes in AX when the
+ * carry flag is set.
+ */
+
+static const lookup_t error_table[] = {
+/* N/A { APM_SUCCESS, "Operation succeeded" }, */
+ { APM_DISABLED, "Power management disabled" },
+ { APM_CONNECTED, "Real mode interface already connected" },
+ { APM_NOT_CONNECTED, "Interface not connected" },
+ { APM_16_CONNECTED, "16 bit interface already connected" },
+/* N/A { APM_16_UNSUPPORTED, "16 bit interface not supported" }, */
+ { APM_32_CONNECTED, "32 bit interface already connected" },
+ { APM_32_UNSUPPORTED, "32 bit interface not supported" },
+ { APM_BAD_DEVICE, "Unrecognized device ID" },
+ { APM_BAD_PARAM, "Parameter out of range" },
+ { APM_NOT_ENGAGED, "Interface not engaged" },
+ { APM_BAD_FUNCTION, "Function not supported" },
+ { APM_RESUME_DISABLED, "Resume timer disabled" },
+ { APM_BAD_STATE, "Unable to enter requested state" },
+/* N/A { APM_NO_EVENTS, "No events pending" }, */
+ { APM_NO_ERROR, "BIOS did not set a return code" },
+ { APM_NOT_PRESENT, "No APM present" }
+};
+#define ERROR_COUNT ARRAY_SIZE(error_table)
+
+/**
+ * apm_error - display an APM error
+ * @str: information string
+ * @err: APM BIOS return code
+ *
+ * Write a meaningful log entry to the kernel log in the event of
+ * an APM error. Note that this also handles (negative) kernel errors.
+ */
+
+static void apm_error(char *str, int err)
+{
+ int i;
+
+ for (i = 0; i < ERROR_COUNT; i++)
+ if (error_table[i].key == err)
+ break;
+ if (i < ERROR_COUNT)
+ pr_notice("%s: %s\n", str, error_table[i].msg);
+ else if (err < 0)
+ pr_notice("%s: linux error code %i\n", str, err);
+ else
+ pr_notice("%s: unknown error code %#2.2x\n",
+ str, err);
+}
+
+/*
+ * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and
+ * apm_info.allow_ints, we are being really paranoid here! Not only
+ * are interrupts disabled, but all the segment registers (except SS)
+ * are saved and zeroed this means that if the BIOS tries to reference
+ * any data without explicitly loading the segment registers, the kernel
+ * will fault immediately rather than have some unforeseen circumstances
+ * for the rest of the kernel. And it will be very obvious! :-) Doing
+ * this depends on CS referring to the same physical memory as DS so that
+ * DS can be zeroed before the call. Unfortunately, we can't do anything
+ * about the stack segment/pointer. Also, we tell the compiler that
+ * everything could change.
+ *
+ * Also, we KNOW that for the non error case of apm_bios_call, there
+ * is no useful data returned in the low order 8 bits of eax.
+ */
+
+static inline unsigned long __apm_irq_save(void)
+{
+ unsigned long flags;
+ local_save_flags(flags);
+ if (apm_info.allow_ints) {
+ if (irqs_disabled_flags(flags))
+ local_irq_enable();
+ } else
+ local_irq_disable();
+
+ return flags;
+}
+
+#define apm_irq_save(flags) \
+ do { flags = __apm_irq_save(); } while (0)
+
+static inline void apm_irq_restore(unsigned long flags)
+{
+ if (irqs_disabled_flags(flags))
+ local_irq_disable();
+ else if (irqs_disabled())
+ local_irq_enable();
+}
+
+#ifdef APM_ZERO_SEGS
+# define APM_DECL_SEGS \
+ unsigned int saved_fs; unsigned int saved_gs;
+# define APM_DO_SAVE_SEGS \
+ savesegment(fs, saved_fs); savesegment(gs, saved_gs)
+# define APM_DO_RESTORE_SEGS \
+ loadsegment(fs, saved_fs); loadsegment(gs, saved_gs)
+#else
+# define APM_DECL_SEGS
+# define APM_DO_SAVE_SEGS
+# define APM_DO_RESTORE_SEGS
+#endif
+
+struct apm_bios_call {
+ u32 func;
+ /* In and out */
+ u32 ebx;
+ u32 ecx;
+ /* Out only */
+ u32 eax;
+ u32 edx;
+ u32 esi;
+
+ /* Error: -ENOMEM, or bits 8-15 of eax */
+ int err;
+};
+
+/**
+ * __apm_bios_call - Make an APM BIOS 32bit call
+ * @_call: pointer to struct apm_bios_call.
+ *
+ * Make an APM call using the 32bit protected mode interface. The
+ * caller is responsible for knowing if APM BIOS is configured and
+ * enabled. This call can disable interrupts for a long period of
+ * time on some laptops. The return value is in AH and the carry
+ * flag is loaded into AL. If there is an error, then the error
+ * code is returned in AH (bits 8-15 of eax) and this function
+ * returns non-zero.
+ *
+ * Note: this makes the call on the current CPU.
+ */
+static long __apm_bios_call(void *_call)
+{
+ APM_DECL_SEGS
+ unsigned long flags;
+ int cpu;
+ struct desc_struct save_desc_40;
+ struct desc_struct *gdt;
+ struct apm_bios_call *call = _call;
+
+ cpu = get_cpu();
+ BUG_ON(cpu != 0);
+ gdt = get_cpu_gdt_rw(cpu);
+ save_desc_40 = gdt[0x40 / 8];
+ gdt[0x40 / 8] = bad_bios_desc;
+
+ apm_irq_save(flags);
+ firmware_restrict_branch_speculation_start();
+ APM_DO_SAVE_SEGS;
+ apm_bios_call_asm(call->func, call->ebx, call->ecx,
+ &call->eax, &call->ebx, &call->ecx, &call->edx,
+ &call->esi);
+ APM_DO_RESTORE_SEGS;
+ firmware_restrict_branch_speculation_end();
+ apm_irq_restore(flags);
+ gdt[0x40 / 8] = save_desc_40;
+ put_cpu();
+
+ return call->eax & 0xff;
+}
+
+/* Run __apm_bios_call or __apm_bios_call_simple on CPU 0 */
+static int on_cpu0(long (*fn)(void *), struct apm_bios_call *call)
+{
+ int ret;
+
+ /* Don't bother with work_on_cpu in the common case, so we don't
+ * have to worry about OOM or overhead. */
+ if (get_cpu() == 0) {
+ ret = fn(call);
+ put_cpu();
+ } else {
+ put_cpu();
+ ret = work_on_cpu(0, fn, call);
+ }
+
+ /* work_on_cpu can fail with -ENOMEM */
+ if (ret < 0)
+ call->err = ret;
+ else
+ call->err = (call->eax >> 8) & 0xff;
+
+ return ret;
+}
+
+/**
+ * apm_bios_call - Make an APM BIOS 32bit call (on CPU 0)
+ * @call: the apm_bios_call registers.
+ *
+ * If there is an error, it is returned in @call.err.
+ */
+static int apm_bios_call(struct apm_bios_call *call)
+{
+ return on_cpu0(__apm_bios_call, call);
+}
+
+/**
+ * __apm_bios_call_simple - Make an APM BIOS 32bit call (on CPU 0)
+ * @_call: pointer to struct apm_bios_call.
+ *
+ * Make a BIOS call that returns one value only, or just status.
+ * If there is an error, then the error code is returned in AH
+ * (bits 8-15 of eax) and this function returns non-zero (it can
+ * also return -ENOMEM). This is used for simpler BIOS operations.
+ * This call may hold interrupts off for a long time on some laptops.
+ *
+ * Note: this makes the call on the current CPU.
+ */
+static long __apm_bios_call_simple(void *_call)
+{
+ u8 error;
+ APM_DECL_SEGS
+ unsigned long flags;
+ int cpu;
+ struct desc_struct save_desc_40;
+ struct desc_struct *gdt;
+ struct apm_bios_call *call = _call;
+
+ cpu = get_cpu();
+ BUG_ON(cpu != 0);
+ gdt = get_cpu_gdt_rw(cpu);
+ save_desc_40 = gdt[0x40 / 8];
+ gdt[0x40 / 8] = bad_bios_desc;
+
+ apm_irq_save(flags);
+ firmware_restrict_branch_speculation_start();
+ APM_DO_SAVE_SEGS;
+ error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx,
+ &call->eax);
+ APM_DO_RESTORE_SEGS;
+ firmware_restrict_branch_speculation_end();
+ apm_irq_restore(flags);
+ gdt[0x40 / 8] = save_desc_40;
+ put_cpu();
+ return error;
+}
+
+/**
+ * apm_bios_call_simple - make a simple APM BIOS 32bit call
+ * @func: APM function to invoke
+ * @ebx_in: EBX register value for BIOS call
+ * @ecx_in: ECX register value for BIOS call
+ * @eax: EAX register on return from the BIOS call
+ * @err: bits
+ *
+ * Make a BIOS call that returns one value only, or just status.
+ * If there is an error, then the error code is returned in @err
+ * and this function returns non-zero. This is used for simpler
+ * BIOS operations. This call may hold interrupts off for a long
+ * time on some laptops.
+ */
+static int apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax,
+ int *err)
+{
+ struct apm_bios_call call;
+ int ret;
+
+ call.func = func;
+ call.ebx = ebx_in;
+ call.ecx = ecx_in;
+
+ ret = on_cpu0(__apm_bios_call_simple, &call);
+ *eax = call.eax;
+ *err = call.err;
+ return ret;
+}
+
+/**
+ * apm_driver_version - APM driver version
+ * @val: loaded with the APM version on return
+ *
+ * Retrieve the APM version supported by the BIOS. This is only
+ * supported for APM 1.1 or higher. An error indicates APM 1.0 is
+ * probably present.
+ *
+ * On entry val should point to a value indicating the APM driver
+ * version with the high byte being the major and the low byte the
+ * minor number both in BCD
+ *
+ * On return it will hold the BIOS revision supported in the
+ * same format.
+ */
+
+static int apm_driver_version(u_short *val)
+{
+ u32 eax;
+ int err;
+
+ if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax, &err))
+ return err;
+ *val = eax;
+ return APM_SUCCESS;
+}
+
+/**
+ * apm_get_event - get an APM event from the BIOS
+ * @event: pointer to the event
+ * @info: point to the event information
+ *
+ * The APM BIOS provides a polled information for event
+ * reporting. The BIOS expects to be polled at least every second
+ * when events are pending. When a message is found the caller should
+ * poll until no more messages are present. However, this causes
+ * problems on some laptops where a suspend event notification is
+ * not cleared until it is acknowledged.
+ *
+ * Additional information is returned in the info pointer, providing
+ * that APM 1.2 is in use. If no messges are pending the value 0x80
+ * is returned (No power management events pending).
+ */
+static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
+{
+ struct apm_bios_call call;
+
+ call.func = APM_FUNC_GET_EVENT;
+ call.ebx = call.ecx = 0;
+
+ if (apm_bios_call(&call))
+ return call.err;
+
+ *event = call.ebx;
+ if (apm_info.connection_version < 0x0102)
+ *info = ~0; /* indicate info not valid */
+ else
+ *info = call.ecx;
+ return APM_SUCCESS;
+}
+
+/**
+ * set_power_state - set the power management state
+ * @what: which items to transition
+ * @state: state to transition to
+ *
+ * Request an APM change of state for one or more system devices. The
+ * processor state must be transitioned last of all. what holds the
+ * class of device in the upper byte and the device number (0xFF for
+ * all) for the object to be transitioned.
+ *
+ * The state holds the state to transition to, which may in fact
+ * be an acceptance of a BIOS requested state change.
+ */
+
+static int set_power_state(u_short what, u_short state)
+{
+ u32 eax;
+ int err;
+
+ if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax, &err))
+ return err;
+ return APM_SUCCESS;
+}
+
+/**
+ * set_system_power_state - set system wide power state
+ * @state: which state to enter
+ *
+ * Transition the entire system into a new APM power state.
+ */
+
+static int set_system_power_state(u_short state)
+{
+ return set_power_state(APM_DEVICE_ALL, state);
+}
+
+/**
+ * apm_do_idle - perform power saving
+ *
+ * This function notifies the BIOS that the processor is (in the view
+ * of the OS) idle. It returns -1 in the event that the BIOS refuses
+ * to handle the idle request. On a success the function returns 1
+ * if the BIOS did clock slowing or 0 otherwise.
+ */
+
+static int apm_do_idle(void)
+{
+ u32 eax;
+ u8 ret = 0;
+ int idled = 0;
+ int err = 0;
+
+ if (!need_resched()) {
+ idled = 1;
+ ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err);
+ }
+
+ if (!idled)
+ return 0;
+
+ if (ret) {
+ static unsigned long t;
+
+ /* This always fails on some SMP boards running UP kernels.
+ * Only report the failure the first 5 times.
+ */
+ if (++t < 5) {
+ printk(KERN_DEBUG "apm_do_idle failed (%d)\n", err);
+ t = jiffies;
+ }
+ return -1;
+ }
+ clock_slowed = (apm_info.bios.flags & APM_IDLE_SLOWS_CLOCK) != 0;
+ return clock_slowed;
+}
+
+/**
+ * apm_do_busy - inform the BIOS the CPU is busy
+ *
+ * Request that the BIOS brings the CPU back to full performance.
+ */
+
+static void apm_do_busy(void)
+{
+ u32 dummy;
+ int err;
+
+ if (clock_slowed || ALWAYS_CALL_BUSY) {
+ (void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy, &err);
+ clock_slowed = 0;
+ }
+}
+
+/*
+ * If no process has really been interested in
+ * the CPU for some time, we want to call BIOS
+ * power management - we probably want
+ * to conserve power.
+ */
+#define IDLE_CALC_LIMIT (HZ * 100)
+#define IDLE_LEAKY_MAX 16
+
+/**
+ * apm_cpu_idle - cpu idling for APM capable Linux
+ *
+ * This is the idling function the kernel executes when APM is available. It
+ * tries to do BIOS powermanagement based on the average system idle time.
+ * Furthermore it calls the system default idle routine.
+ */
+
+static int apm_cpu_idle(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int index)
+{
+ static int use_apm_idle; /* = 0 */
+ static unsigned int last_jiffies; /* = 0 */
+ static u64 last_stime; /* = 0 */
+ u64 stime, utime;
+
+ int apm_idle_done = 0;
+ unsigned int jiffies_since_last_check = jiffies - last_jiffies;
+ unsigned int bucket;
+
+recalc:
+ task_cputime(current, &utime, &stime);
+ if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
+ use_apm_idle = 0;
+ } else if (jiffies_since_last_check > idle_period) {
+ unsigned int idle_percentage;
+
+ idle_percentage = nsecs_to_jiffies(stime - last_stime);
+ idle_percentage *= 100;
+ idle_percentage /= jiffies_since_last_check;
+ use_apm_idle = (idle_percentage > idle_threshold);
+ if (apm_info.forbid_idle)
+ use_apm_idle = 0;
+ }
+
+ last_jiffies = jiffies;
+ last_stime = stime;
+
+ bucket = IDLE_LEAKY_MAX;
+
+ while (!need_resched()) {
+ if (use_apm_idle) {
+ unsigned int t;
+
+ t = jiffies;
+ switch (apm_do_idle()) {
+ case 0:
+ apm_idle_done = 1;
+ if (t != jiffies) {
+ if (bucket) {
+ bucket = IDLE_LEAKY_MAX;
+ continue;
+ }
+ } else if (bucket) {
+ bucket--;
+ continue;
+ }
+ break;
+ case 1:
+ apm_idle_done = 1;
+ break;
+ default: /* BIOS refused */
+ break;
+ }
+ }
+ default_idle();
+ local_irq_disable();
+ jiffies_since_last_check = jiffies - last_jiffies;
+ if (jiffies_since_last_check > idle_period)
+ goto recalc;
+ }
+
+ if (apm_idle_done)
+ apm_do_busy();
+
+ return index;
+}
+
+/**
+ * apm_power_off - ask the BIOS to power off
+ *
+ * Handle the power off sequence. This is the one piece of code we
+ * will execute even on SMP machines. In order to deal with BIOS
+ * bugs we support real mode APM BIOS power off calls. We also make
+ * the SMP call on CPU0 as some systems will only honour this call
+ * on their first cpu.
+ */
+
+static void apm_power_off(void)
+{
+ /* Some bioses don't like being called from CPU != 0 */
+ if (apm_info.realmode_power_off) {
+ set_cpus_allowed_ptr(current, cpumask_of(0));
+ machine_real_restart(MRR_APM);
+ } else {
+ (void)set_system_power_state(APM_STATE_OFF);
+ }
+}
+
+#ifdef CONFIG_APM_DO_ENABLE
+
+/**
+ * apm_enable_power_management - enable BIOS APM power management
+ * @enable: enable yes/no
+ *
+ * Enable or disable the APM BIOS power services.
+ */
+
+static int apm_enable_power_management(int enable)
+{
+ u32 eax;
+ int err;
+
+ if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED))
+ return APM_NOT_ENGAGED;
+ if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL,
+ enable, &eax, &err))
+ return err;
+ if (enable)
+ apm_info.bios.flags &= ~APM_BIOS_DISABLED;
+ else
+ apm_info.bios.flags |= APM_BIOS_DISABLED;
+ return APM_SUCCESS;
+}
+#endif
+
+/**
+ * apm_get_power_status - get current power state
+ * @status: returned status
+ * @bat: battery info
+ * @life: estimated life
+ *
+ * Obtain the current power status from the APM BIOS. We return a
+ * status which gives the rough battery status, and current power
+ * source. The bat value returned give an estimate as a percentage
+ * of life and a status value for the battery. The estimated life
+ * if reported is a lifetime in secodnds/minutes at current powwer
+ * consumption.
+ */
+
+static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
+{
+ struct apm_bios_call call;
+
+ call.func = APM_FUNC_GET_STATUS;
+ call.ebx = APM_DEVICE_ALL;
+ call.ecx = 0;
+
+ if (apm_info.get_power_status_broken)
+ return APM_32_UNSUPPORTED;
+ if (apm_bios_call(&call)) {
+ if (!call.err)
+ return APM_NO_ERROR;
+ return call.err;
+ }
+ *status = call.ebx;
+ *bat = call.ecx;
+ if (apm_info.get_power_status_swabinminutes) {
+ *life = swab16((u16)call.edx);
+ *life |= 0x8000;
+ } else
+ *life = call.edx;
+ return APM_SUCCESS;
+}
+
+#if 0
+static int apm_get_battery_status(u_short which, u_short *status,
+ u_short *bat, u_short *life, u_short *nbat)
+{
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+ u32 esi;
+
+ if (apm_info.connection_version < 0x0102) {
+ /* pretend we only have one battery. */
+ if (which != 1)
+ return APM_BAD_DEVICE;
+ *nbat = 1;
+ return apm_get_power_status(status, bat, life);
+ }
+
+ if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax,
+ &ebx, &ecx, &edx, &esi))
+ return (eax >> 8) & 0xff;
+ *status = ebx;
+ *bat = ecx;
+ *life = edx;
+ *nbat = esi;
+ return APM_SUCCESS;
+}
+#endif
+
+/**
+ * apm_engage_power_management - enable PM on a device
+ * @device: identity of device
+ * @enable: on/off
+ *
+ * Activate or deactivate power management on either a specific device
+ * or the entire system (%APM_DEVICE_ALL).
+ */
+
+static int apm_engage_power_management(u_short device, int enable)
+{
+ u32 eax;
+ int err;
+
+ if ((enable == 0) && (device == APM_DEVICE_ALL)
+ && (apm_info.bios.flags & APM_BIOS_DISABLED))
+ return APM_DISABLED;
+ if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable,
+ &eax, &err))
+ return err;
+ if (device == APM_DEVICE_ALL) {
+ if (enable)
+ apm_info.bios.flags &= ~APM_BIOS_DISENGAGED;
+ else
+ apm_info.bios.flags |= APM_BIOS_DISENGAGED;
+ }
+ return APM_SUCCESS;
+}
+
+#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
+
+/**
+ * apm_console_blank - blank the display
+ * @blank: on/off
+ *
+ * Attempt to blank the console, firstly by blanking just video device
+ * zero, and if that fails (some BIOSes don't support it) then it blanks
+ * all video devices. Typically the BIOS will do laptop backlight and
+ * monitor powerdown for us.
+ */
+
+static int apm_console_blank(int blank)
+{
+ int error = APM_NOT_ENGAGED; /* silence gcc */
+ int i;
+ u_short state;
+ static const u_short dev[3] = { 0x100, 0x1FF, 0x101 };
+
+ state = blank ? APM_STATE_STANDBY : APM_STATE_READY;
+
+ for (i = 0; i < ARRAY_SIZE(dev); i++) {
+ error = set_power_state(dev[i], state);
+
+ if ((error == APM_SUCCESS) || (error == APM_NO_ERROR))
+ return 1;
+
+ if (error == APM_NOT_ENGAGED)
+ break;
+ }
+
+ if (error == APM_NOT_ENGAGED) {
+ static int tried;
+ int eng_error;
+ if (tried++ == 0) {
+ eng_error = apm_engage_power_management(APM_DEVICE_ALL, 1);
+ if (eng_error) {
+ apm_error("set display", error);
+ apm_error("engage interface", eng_error);
+ return 0;
+ } else
+ return apm_console_blank(blank);
+ }
+ }
+ apm_error("set display", error);
+ return 0;
+}
+#endif
+
+static int queue_empty(struct apm_user *as)
+{
+ return as->event_head == as->event_tail;
+}
+
+static apm_event_t get_queued_event(struct apm_user *as)
+{
+ if (++as->event_tail >= APM_MAX_EVENTS)
+ as->event_tail = 0;
+ return as->events[as->event_tail];
+}
+
+static void queue_event(apm_event_t event, struct apm_user *sender)
+{
+ struct apm_user *as;
+
+ spin_lock(&user_list_lock);
+ if (user_list == NULL)
+ goto out;
+ for (as = user_list; as != NULL; as = as->next) {
+ if ((as == sender) || (!as->reader))
+ continue;
+ if (++as->event_head >= APM_MAX_EVENTS)
+ as->event_head = 0;
+
+ if (as->event_head == as->event_tail) {
+ static int notified;
+
+ if (notified++ == 0)
+ pr_err("an event queue overflowed\n");
+ if (++as->event_tail >= APM_MAX_EVENTS)
+ as->event_tail = 0;
+ }
+ as->events[as->event_head] = event;
+ if (!as->suser || !as->writer)
+ continue;
+ switch (event) {
+ case APM_SYS_SUSPEND:
+ case APM_USER_SUSPEND:
+ as->suspends_pending++;
+ suspends_pending++;
+ break;
+
+ case APM_SYS_STANDBY:
+ case APM_USER_STANDBY:
+ as->standbys_pending++;
+ standbys_pending++;
+ break;
+ }
+ }
+ wake_up_interruptible(&apm_waitqueue);
+out:
+ spin_unlock(&user_list_lock);
+}
+
+static void reinit_timer(void)
+{
+#ifdef INIT_TIMER_AFTER_SUSPEND
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&i8253_lock, flags);
+ /* set the clock to HZ */
+ outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
+ udelay(10);
+ outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
+ udelay(10);
+ outb_p(LATCH >> 8, PIT_CH0); /* MSB */
+ udelay(10);
+ raw_spin_unlock_irqrestore(&i8253_lock, flags);
+#endif
+}
+
+static int suspend(int vetoable)
+{
+ int err;
+ struct apm_user *as;
+
+ dpm_suspend_start(PMSG_SUSPEND);
+ dpm_suspend_end(PMSG_SUSPEND);
+
+ local_irq_disable();
+ syscore_suspend();
+
+ local_irq_enable();
+
+ save_processor_state();
+ err = set_system_power_state(APM_STATE_SUSPEND);
+ ignore_normal_resume = 1;
+ restore_processor_state();
+
+ local_irq_disable();
+ reinit_timer();
+
+ if (err == APM_NO_ERROR)
+ err = APM_SUCCESS;
+ if (err != APM_SUCCESS)
+ apm_error("suspend", err);
+ err = (err == APM_SUCCESS) ? 0 : -EIO;
+
+ syscore_resume();
+ local_irq_enable();
+
+ dpm_resume_start(PMSG_RESUME);
+ dpm_resume_end(PMSG_RESUME);
+
+ queue_event(APM_NORMAL_RESUME, NULL);
+ spin_lock(&user_list_lock);
+ for (as = user_list; as != NULL; as = as->next) {
+ as->suspend_wait = 0;
+ as->suspend_result = err;
+ }
+ spin_unlock(&user_list_lock);
+ wake_up_interruptible(&apm_suspend_waitqueue);
+ return err;
+}
+
+static void standby(void)
+{
+ int err;
+
+ dpm_suspend_end(PMSG_SUSPEND);
+
+ local_irq_disable();
+ syscore_suspend();
+ local_irq_enable();
+
+ err = set_system_power_state(APM_STATE_STANDBY);
+ if ((err != APM_SUCCESS) && (err != APM_NO_ERROR))
+ apm_error("standby", err);
+
+ local_irq_disable();
+ syscore_resume();
+ local_irq_enable();
+
+ dpm_resume_start(PMSG_RESUME);
+}
+
+static apm_event_t get_event(void)
+{
+ int error;
+ apm_event_t event = APM_NO_EVENTS; /* silence gcc */
+ apm_eventinfo_t info;
+
+ static int notified;
+
+ /* we don't use the eventinfo */
+ error = apm_get_event(&event, &info);
+ if (error == APM_SUCCESS)
+ return event;
+
+ if ((error != APM_NO_EVENTS) && (notified++ == 0))
+ apm_error("get_event", error);
+
+ return 0;
+}
+
+static void check_events(void)
+{
+ apm_event_t event;
+ static unsigned long last_resume;
+ static int ignore_bounce;
+
+ while ((event = get_event()) != 0) {
+ if (debug) {
+ if (event <= NR_APM_EVENT_NAME)
+ printk(KERN_DEBUG "apm: received %s notify\n",
+ apm_event_name[event - 1]);
+ else
+ printk(KERN_DEBUG "apm: received unknown "
+ "event 0x%02x\n", event);
+ }
+ if (ignore_bounce
+ && (time_after(jiffies, last_resume + bounce_interval)))
+ ignore_bounce = 0;
+
+ switch (event) {
+ case APM_SYS_STANDBY:
+ case APM_USER_STANDBY:
+ queue_event(event, NULL);
+ if (standbys_pending <= 0)
+ standby();
+ break;
+
+ case APM_USER_SUSPEND:
+#ifdef CONFIG_APM_IGNORE_USER_SUSPEND
+ if (apm_info.connection_version > 0x100)
+ set_system_power_state(APM_STATE_REJECT);
+ break;
+#endif
+ case APM_SYS_SUSPEND:
+ if (ignore_bounce) {
+ if (apm_info.connection_version > 0x100)
+ set_system_power_state(APM_STATE_REJECT);
+ break;
+ }
+ /*
+ * If we are already processing a SUSPEND,
+ * then further SUSPEND events from the BIOS
+ * will be ignored. We also return here to
+ * cope with the fact that the Thinkpads keep
+ * sending a SUSPEND event until something else
+ * happens!
+ */
+ if (ignore_sys_suspend)
+ return;
+ ignore_sys_suspend = 1;
+ queue_event(event, NULL);
+ if (suspends_pending <= 0)
+ (void) suspend(1);
+ break;
+
+ case APM_NORMAL_RESUME:
+ case APM_CRITICAL_RESUME:
+ case APM_STANDBY_RESUME:
+ ignore_sys_suspend = 0;
+ last_resume = jiffies;
+ ignore_bounce = 1;
+ if ((event != APM_NORMAL_RESUME)
+ || (ignore_normal_resume == 0)) {
+ dpm_resume_end(PMSG_RESUME);
+ queue_event(event, NULL);
+ }
+ ignore_normal_resume = 0;
+ break;
+
+ case APM_CAPABILITY_CHANGE:
+ case APM_LOW_BATTERY:
+ case APM_POWER_STATUS_CHANGE:
+ queue_event(event, NULL);
+ /* If needed, notify drivers here */
+ break;
+
+ case APM_UPDATE_TIME:
+ break;
+
+ case APM_CRITICAL_SUSPEND:
+ /*
+ * We are not allowed to reject a critical suspend.
+ */
+ (void)suspend(0);
+ break;
+ }
+ }
+}
+
+static void apm_event_handler(void)
+{
+ static int pending_count = 4;
+ int err;
+
+ if ((standbys_pending > 0) || (suspends_pending > 0)) {
+ if ((apm_info.connection_version > 0x100) &&
+ (pending_count-- <= 0)) {
+ pending_count = 4;
+ if (debug)
+ printk(KERN_DEBUG "apm: setting state busy\n");
+ err = set_system_power_state(APM_STATE_BUSY);
+ if (err)
+ apm_error("busy", err);
+ }
+ } else
+ pending_count = 4;
+ check_events();
+}
+
+/*
+ * This is the APM thread main loop.
+ */
+
+static void apm_mainloop(void)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue(&apm_waitqueue, &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+ for (;;) {
+ schedule_timeout(APM_CHECK_TIMEOUT);
+ if (kthread_should_stop())
+ break;
+ /*
+ * Ok, check all events, check for idle (and mark us sleeping
+ * so as not to count towards the load average)..
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+ apm_event_handler();
+ }
+ remove_wait_queue(&apm_waitqueue, &wait);
+}
+
+static int check_apm_user(struct apm_user *as, const char *func)
+{
+ if (as == NULL || as->magic != APM_BIOS_MAGIC) {
+ pr_err("%s passed bad filp\n", func);
+ return 1;
+ }
+ return 0;
+}
+
+static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos)
+{
+ struct apm_user *as;
+ int i;
+ apm_event_t event;
+
+ as = fp->private_data;
+ if (check_apm_user(as, "read"))
+ return -EIO;
+ if ((int)count < sizeof(apm_event_t))
+ return -EINVAL;
+ if ((queue_empty(as)) && (fp->f_flags & O_NONBLOCK))
+ return -EAGAIN;
+ wait_event_interruptible(apm_waitqueue, !queue_empty(as));
+ i = count;
+ while ((i >= sizeof(event)) && !queue_empty(as)) {
+ event = get_queued_event(as);
+ if (copy_to_user(buf, &event, sizeof(event))) {
+ if (i < count)
+ break;
+ return -EFAULT;
+ }
+ switch (event) {
+ case APM_SYS_SUSPEND:
+ case APM_USER_SUSPEND:
+ as->suspends_read++;
+ break;
+
+ case APM_SYS_STANDBY:
+ case APM_USER_STANDBY:
+ as->standbys_read++;
+ break;
+ }
+ buf += sizeof(event);
+ i -= sizeof(event);
+ }
+ if (i < count)
+ return count - i;
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+ return 0;
+}
+
+static __poll_t do_poll(struct file *fp, poll_table *wait)
+{
+ struct apm_user *as;
+
+ as = fp->private_data;
+ if (check_apm_user(as, "poll"))
+ return 0;
+ poll_wait(fp, &apm_waitqueue, wait);
+ if (!queue_empty(as))
+ return EPOLLIN | EPOLLRDNORM;
+ return 0;
+}
+
+static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
+{
+ struct apm_user *as;
+ int ret;
+
+ as = filp->private_data;
+ if (check_apm_user(as, "ioctl"))
+ return -EIO;
+ if (!as->suser || !as->writer)
+ return -EPERM;
+ switch (cmd) {
+ case APM_IOC_STANDBY:
+ mutex_lock(&apm_mutex);
+ if (as->standbys_read > 0) {
+ as->standbys_read--;
+ as->standbys_pending--;
+ standbys_pending--;
+ } else
+ queue_event(APM_USER_STANDBY, as);
+ if (standbys_pending <= 0)
+ standby();
+ mutex_unlock(&apm_mutex);
+ break;
+ case APM_IOC_SUSPEND:
+ mutex_lock(&apm_mutex);
+ if (as->suspends_read > 0) {
+ as->suspends_read--;
+ as->suspends_pending--;
+ suspends_pending--;
+ } else
+ queue_event(APM_USER_SUSPEND, as);
+ if (suspends_pending <= 0) {
+ ret = suspend(1);
+ mutex_unlock(&apm_mutex);
+ } else {
+ as->suspend_wait = 1;
+ mutex_unlock(&apm_mutex);
+ wait_event_interruptible(apm_suspend_waitqueue,
+ as->suspend_wait == 0);
+ ret = as->suspend_result;
+ }
+ return ret;
+ default:
+ return -ENOTTY;
+ }
+ return 0;
+}
+
+static int do_release(struct inode *inode, struct file *filp)
+{
+ struct apm_user *as;
+
+ as = filp->private_data;
+ if (check_apm_user(as, "release"))
+ return 0;
+ filp->private_data = NULL;
+ if (as->standbys_pending > 0) {
+ standbys_pending -= as->standbys_pending;
+ if (standbys_pending <= 0)
+ standby();
+ }
+ if (as->suspends_pending > 0) {
+ suspends_pending -= as->suspends_pending;
+ if (suspends_pending <= 0)
+ (void) suspend(1);
+ }
+ spin_lock(&user_list_lock);
+ if (user_list == as)
+ user_list = as->next;
+ else {
+ struct apm_user *as1;
+
+ for (as1 = user_list;
+ (as1 != NULL) && (as1->next != as);
+ as1 = as1->next)
+ ;
+ if (as1 == NULL)
+ pr_err("filp not in user list\n");
+ else
+ as1->next = as->next;
+ }
+ spin_unlock(&user_list_lock);
+ kfree(as);
+ return 0;
+}
+
+static int do_open(struct inode *inode, struct file *filp)
+{
+ struct apm_user *as;
+
+ as = kmalloc(sizeof(*as), GFP_KERNEL);
+ if (as == NULL)
+ return -ENOMEM;
+
+ as->magic = APM_BIOS_MAGIC;
+ as->event_tail = as->event_head = 0;
+ as->suspends_pending = as->standbys_pending = 0;
+ as->suspends_read = as->standbys_read = 0;
+ /*
+ * XXX - this is a tiny bit broken, when we consider BSD
+ * process accounting. If the device is opened by root, we
+ * instantly flag that we used superuser privs. Who knows,
+ * we might close the device immediately without doing a
+ * privileged operation -- cevans
+ */
+ as->suser = capable(CAP_SYS_ADMIN);
+ as->writer = (filp->f_mode & FMODE_WRITE) == FMODE_WRITE;
+ as->reader = (filp->f_mode & FMODE_READ) == FMODE_READ;
+ spin_lock(&user_list_lock);
+ as->next = user_list;
+ user_list = as;
+ spin_unlock(&user_list_lock);
+ filp->private_data = as;
+ return 0;
+}
+
+#ifdef CONFIG_PROC_FS
+static int proc_apm_show(struct seq_file *m, void *v)
+{
+ unsigned short bx;
+ unsigned short cx;
+ unsigned short dx;
+ int error;
+ unsigned short ac_line_status = 0xff;
+ unsigned short battery_status = 0xff;
+ unsigned short battery_flag = 0xff;
+ int percentage = -1;
+ int time_units = -1;
+ char *units = "?";
+
+ if ((num_online_cpus() == 1) &&
+ !(error = apm_get_power_status(&bx, &cx, &dx))) {
+ ac_line_status = (bx >> 8) & 0xff;
+ battery_status = bx & 0xff;
+ if ((cx & 0xff) != 0xff)
+ percentage = cx & 0xff;
+
+ if (apm_info.connection_version > 0x100) {
+ battery_flag = (cx >> 8) & 0xff;
+ if (dx != 0xffff) {
+ units = (dx & 0x8000) ? "min" : "sec";
+ time_units = dx & 0x7fff;
+ }
+ }
+ }
+ /* Arguments, with symbols from linux/apm_bios.h. Information is
+ from the Get Power Status (0x0a) call unless otherwise noted.
+
+ 0) Linux driver version (this will change if format changes)
+ 1) APM BIOS Version. Usually 1.0, 1.1 or 1.2.
+ 2) APM flags from APM Installation Check (0x00):
+ bit 0: APM_16_BIT_SUPPORT
+ bit 1: APM_32_BIT_SUPPORT
+ bit 2: APM_IDLE_SLOWS_CLOCK
+ bit 3: APM_BIOS_DISABLED
+ bit 4: APM_BIOS_DISENGAGED
+ 3) AC line status
+ 0x00: Off-line
+ 0x01: On-line
+ 0x02: On backup power (BIOS >= 1.1 only)
+ 0xff: Unknown
+ 4) Battery status
+ 0x00: High
+ 0x01: Low
+ 0x02: Critical
+ 0x03: Charging
+ 0x04: Selected battery not present (BIOS >= 1.2 only)
+ 0xff: Unknown
+ 5) Battery flag
+ bit 0: High
+ bit 1: Low
+ bit 2: Critical
+ bit 3: Charging
+ bit 7: No system battery
+ 0xff: Unknown
+ 6) Remaining battery life (percentage of charge):
+ 0-100: valid
+ -1: Unknown
+ 7) Remaining battery life (time units):
+ Number of remaining minutes or seconds
+ -1: Unknown
+ 8) min = minutes; sec = seconds */
+
+ seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
+ driver_version,
+ (apm_info.bios.version >> 8) & 0xff,
+ apm_info.bios.version & 0xff,
+ apm_info.bios.flags,
+ ac_line_status,
+ battery_status,
+ battery_flag,
+ percentage,
+ time_units,
+ units);
+ return 0;
+}
+#endif
+
+static int apm(void *unused)
+{
+ unsigned short bx;
+ unsigned short cx;
+ unsigned short dx;
+ int error;
+ char *power_stat;
+ char *bat_stat;
+
+ /* 2002/08/01 - WT
+ * This is to avoid random crashes at boot time during initialization
+ * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D.
+ * Some bioses don't like being called from CPU != 0.
+ * Method suggested by Ingo Molnar.
+ */
+ set_cpus_allowed_ptr(current, cpumask_of(0));
+ BUG_ON(smp_processor_id() != 0);
+
+ if (apm_info.connection_version == 0) {
+ apm_info.connection_version = apm_info.bios.version;
+ if (apm_info.connection_version > 0x100) {
+ /*
+ * We only support BIOSs up to version 1.2
+ */
+ if (apm_info.connection_version > 0x0102)
+ apm_info.connection_version = 0x0102;
+ error = apm_driver_version(&apm_info.connection_version);
+ if (error != APM_SUCCESS) {
+ apm_error("driver version", error);
+ /* Fall back to an APM 1.0 connection. */
+ apm_info.connection_version = 0x100;
+ }
+ }
+ }
+
+ if (debug)
+ printk(KERN_INFO "apm: Connection version %d.%d\n",
+ (apm_info.connection_version >> 8) & 0xff,
+ apm_info.connection_version & 0xff);
+
+#ifdef CONFIG_APM_DO_ENABLE
+ if (apm_info.bios.flags & APM_BIOS_DISABLED) {
+ /*
+ * This call causes my NEC UltraLite Versa 33/C to hang if it
+ * is booted with PM disabled but not in the docking station.
+ * Unfortunate ...
+ */
+ error = apm_enable_power_management(1);
+ if (error) {
+ apm_error("enable power management", error);
+ return -1;
+ }
+ }
+#endif
+
+ if ((apm_info.bios.flags & APM_BIOS_DISENGAGED)
+ && (apm_info.connection_version > 0x0100)) {
+ error = apm_engage_power_management(APM_DEVICE_ALL, 1);
+ if (error) {
+ apm_error("engage power management", error);
+ return -1;
+ }
+ }
+
+ if (debug && (num_online_cpus() == 1 || smp)) {
+ error = apm_get_power_status(&bx, &cx, &dx);
+ if (error)
+ printk(KERN_INFO "apm: power status not available\n");
+ else {
+ switch ((bx >> 8) & 0xff) {
+ case 0:
+ power_stat = "off line";
+ break;
+ case 1:
+ power_stat = "on line";
+ break;
+ case 2:
+ power_stat = "on backup power";
+ break;
+ default:
+ power_stat = "unknown";
+ break;
+ }
+ switch (bx & 0xff) {
+ case 0:
+ bat_stat = "high";
+ break;
+ case 1:
+ bat_stat = "low";
+ break;
+ case 2:
+ bat_stat = "critical";
+ break;
+ case 3:
+ bat_stat = "charging";
+ break;
+ default:
+ bat_stat = "unknown";
+ break;
+ }
+ printk(KERN_INFO
+ "apm: AC %s, battery status %s, battery life ",
+ power_stat, bat_stat);
+ if ((cx & 0xff) == 0xff)
+ printk("unknown\n");
+ else
+ printk("%d%%\n", cx & 0xff);
+ if (apm_info.connection_version > 0x100) {
+ printk(KERN_INFO
+ "apm: battery flag 0x%02x, battery life ",
+ (cx >> 8) & 0xff);
+ if (dx == 0xffff)
+ printk("unknown\n");
+ else
+ printk("%d %s\n", dx & 0x7fff,
+ (dx & 0x8000) ?
+ "minutes" : "seconds");
+ }
+ }
+ }
+
+ /* Install our power off handler.. */
+ if (power_off)
+ pm_power_off = apm_power_off;
+
+ if (num_online_cpus() == 1 || smp) {
+#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
+ console_blank_hook = apm_console_blank;
+#endif
+ apm_mainloop();
+#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
+ console_blank_hook = NULL;
+#endif
+ }
+
+ return 0;
+}
+
+#ifndef MODULE
+static int __init apm_setup(char *str)
+{
+ int invert;
+
+ while ((str != NULL) && (*str != '\0')) {
+ if (strncmp(str, "off", 3) == 0)
+ apm_disabled = 1;
+ if (strncmp(str, "on", 2) == 0)
+ apm_disabled = 0;
+ if ((strncmp(str, "bounce-interval=", 16) == 0) ||
+ (strncmp(str, "bounce_interval=", 16) == 0))
+ bounce_interval = simple_strtol(str + 16, NULL, 0);
+ if ((strncmp(str, "idle-threshold=", 15) == 0) ||
+ (strncmp(str, "idle_threshold=", 15) == 0))
+ idle_threshold = simple_strtol(str + 15, NULL, 0);
+ if ((strncmp(str, "idle-period=", 12) == 0) ||
+ (strncmp(str, "idle_period=", 12) == 0))
+ idle_period = simple_strtol(str + 12, NULL, 0);
+ invert = (strncmp(str, "no-", 3) == 0) ||
+ (strncmp(str, "no_", 3) == 0);
+ if (invert)
+ str += 3;
+ if (strncmp(str, "debug", 5) == 0)
+ debug = !invert;
+ if ((strncmp(str, "power-off", 9) == 0) ||
+ (strncmp(str, "power_off", 9) == 0))
+ power_off = !invert;
+ if (strncmp(str, "smp", 3) == 0) {
+ smp = !invert;
+ idle_threshold = 100;
+ }
+ if ((strncmp(str, "allow-ints", 10) == 0) ||
+ (strncmp(str, "allow_ints", 10) == 0))
+ apm_info.allow_ints = !invert;
+ if ((strncmp(str, "broken-psr", 10) == 0) ||
+ (strncmp(str, "broken_psr", 10) == 0))
+ apm_info.get_power_status_broken = !invert;
+ if ((strncmp(str, "realmode-power-off", 18) == 0) ||
+ (strncmp(str, "realmode_power_off", 18) == 0))
+ apm_info.realmode_power_off = !invert;
+ str = strchr(str, ',');
+ if (str != NULL)
+ str += strspn(str, ", \t");
+ }
+ return 1;
+}
+
+__setup("apm=", apm_setup);
+#endif
+
+static const struct file_operations apm_bios_fops = {
+ .owner = THIS_MODULE,
+ .read = do_read,
+ .poll = do_poll,
+ .unlocked_ioctl = do_ioctl,
+ .open = do_open,
+ .release = do_release,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice apm_device = {
+ APM_MINOR_DEV,
+ "apm_bios",
+ &apm_bios_fops
+};
+
+
+/* Simple "print if true" callback */
+static int __init print_if_true(const struct dmi_system_id *d)
+{
+ printk("%s\n", d->ident);
+ return 0;
+}
+
+/*
+ * Some Bioses enable the PS/2 mouse (touchpad) at resume, even if it was
+ * disabled before the suspend. Linux used to get terribly confused by that.
+ */
+static int __init broken_ps2_resume(const struct dmi_system_id *d)
+{
+ printk(KERN_INFO "%s machine detected. Mousepad Resume Bug "
+ "workaround hopefully not needed.\n", d->ident);
+ return 0;
+}
+
+/* Some bioses have a broken protected mode poweroff and need to use realmode */
+static int __init set_realmode_power_off(const struct dmi_system_id *d)
+{
+ if (apm_info.realmode_power_off == 0) {
+ apm_info.realmode_power_off = 1;
+ printk(KERN_INFO "%s bios detected. "
+ "Using realmode poweroff only.\n", d->ident);
+ }
+ return 0;
+}
+
+/* Some laptops require interrupts to be enabled during APM calls */
+static int __init set_apm_ints(const struct dmi_system_id *d)
+{
+ if (apm_info.allow_ints == 0) {
+ apm_info.allow_ints = 1;
+ printk(KERN_INFO "%s machine detected. "
+ "Enabling interrupts during APM calls.\n", d->ident);
+ }
+ return 0;
+}
+
+/* Some APM bioses corrupt memory or just plain do not work */
+static int __init apm_is_horked(const struct dmi_system_id *d)
+{
+ if (apm_info.disabled == 0) {
+ apm_info.disabled = 1;
+ printk(KERN_INFO "%s machine detected. "
+ "Disabling APM.\n", d->ident);
+ }
+ return 0;
+}
+
+static int __init apm_is_horked_d850md(const struct dmi_system_id *d)
+{
+ if (apm_info.disabled == 0) {
+ apm_info.disabled = 1;
+ printk(KERN_INFO "%s machine detected. "
+ "Disabling APM.\n", d->ident);
+ printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n");
+ printk(KERN_INFO "download from support.intel.com\n");
+ }
+ return 0;
+}
+
+/* Some APM bioses hang on APM idle calls */
+static int __init apm_likes_to_melt(const struct dmi_system_id *d)
+{
+ if (apm_info.forbid_idle == 0) {
+ apm_info.forbid_idle = 1;
+ printk(KERN_INFO "%s machine detected. "
+ "Disabling APM idle calls.\n", d->ident);
+ }
+ return 0;
+}
+
+/*
+ * Check for clue free BIOS implementations who use
+ * the following QA technique
+ *
+ * [ Write BIOS Code ]<------
+ * | ^
+ * < Does it Compile >----N--
+ * |Y ^
+ * < Does it Boot Win98 >-N--
+ * |Y
+ * [Ship It]
+ *
+ * Phoenix A04 08/24/2000 is known bad (Dell Inspiron 5000e)
+ * Phoenix A07 09/29/2000 is known good (Dell Inspiron 5000)
+ */
+static int __init broken_apm_power(const struct dmi_system_id *d)
+{
+ apm_info.get_power_status_broken = 1;
+ printk(KERN_WARNING "BIOS strings suggest APM bugs, "
+ "disabling power status reporting.\n");
+ return 0;
+}
+
+/*
+ * This bios swaps the APM minute reporting bytes over (Many sony laptops
+ * have this problem).
+ */
+static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d)
+{
+ apm_info.get_power_status_swabinminutes = 1;
+ printk(KERN_WARNING "BIOS strings suggest APM reports battery life "
+ "in minutes and wrong byte order.\n");
+ return 0;
+}
+
+static const struct dmi_system_id apm_dmi_table[] __initconst = {
+ {
+ print_if_true,
+ KERN_WARNING "IBM T23 - BIOS 1.03b+ and controller firmware 1.02+ may be needed for Linux APM.",
+ { DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+ DMI_MATCH(DMI_BIOS_VERSION, "1AET38WW (1.01b)"), },
+ },
+ { /* Handle problems with APM on the C600 */
+ broken_ps2_resume, "Dell Latitude C600",
+ { DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C600"), },
+ },
+ { /* Allow interrupts during suspend on Dell Latitude laptops*/
+ set_apm_ints, "Dell Latitude",
+ { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C510"), }
+ },
+ { /* APM crashes */
+ apm_is_horked, "Dell Inspiron 2500",
+ { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
+ DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
+ },
+ { /* Allow interrupts during suspend on Dell Inspiron laptops*/
+ set_apm_ints, "Dell Inspiron", {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 4000"), },
+ },
+ { /* Handle problems with APM on Inspiron 5000e */
+ broken_apm_power, "Dell Inspiron 5000e",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "A04"),
+ DMI_MATCH(DMI_BIOS_DATE, "08/24/2000"), },
+ },
+ { /* Handle problems with APM on Inspiron 2500 */
+ broken_apm_power, "Dell Inspiron 2500",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "A12"),
+ DMI_MATCH(DMI_BIOS_DATE, "02/04/2002"), },
+ },
+ { /* APM crashes */
+ apm_is_horked, "Dell Dimension 4100",
+ { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"),
+ DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
+ DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
+ },
+ { /* Allow interrupts during suspend on Compaq Laptops*/
+ set_apm_ints, "Compaq 12XL125",
+ { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"),
+ DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "4.06"), },
+ },
+ { /* Allow interrupts during APM or the clock goes slow */
+ set_apm_ints, "ASUSTeK",
+ { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "L8400K series Notebook PC"), },
+ },
+ { /* APM blows on shutdown */
+ apm_is_horked, "ABIT KX7-333[R]",
+ { DMI_MATCH(DMI_BOARD_VENDOR, "ABIT"),
+ DMI_MATCH(DMI_BOARD_NAME, "VT8367-8233A (KX7-333[R])"), },
+ },
+ { /* APM crashes */
+ apm_is_horked, "Trigem Delhi3",
+ { DMI_MATCH(DMI_SYS_VENDOR, "TriGem Computer, Inc"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Delhi3"), },
+ },
+ { /* APM crashes */
+ apm_is_horked, "Fujitsu-Siemens",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "hoenix/FUJITSU SIEMENS"),
+ DMI_MATCH(DMI_BIOS_VERSION, "Version1.01"), },
+ },
+ { /* APM crashes */
+ apm_is_horked_d850md, "Intel D850MD",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
+ DMI_MATCH(DMI_BIOS_VERSION, "MV85010A.86A.0016.P07.0201251536"), },
+ },
+ { /* APM crashes */
+ apm_is_horked, "Intel D810EMO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
+ DMI_MATCH(DMI_BIOS_VERSION, "MO81010A.86A.0008.P04.0004170800"), },
+ },
+ { /* APM crashes */
+ apm_is_horked, "Dell XPS-Z",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
+ DMI_MATCH(DMI_BIOS_VERSION, "A11"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), },
+ },
+ { /* APM crashes */
+ apm_is_horked, "Sharp PC-PJ/AX",
+ { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"),
+ DMI_MATCH(DMI_BIOS_VENDOR, "SystemSoft"),
+ DMI_MATCH(DMI_BIOS_VERSION, "Version R2.08"), },
+ },
+ { /* APM crashes */
+ apm_is_horked, "Dell Inspiron 2500",
+ { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
+ DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
+ },
+ { /* APM idle hangs */
+ apm_likes_to_melt, "Jabil AMD",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
+ DMI_MATCH(DMI_BIOS_VERSION, "0AASNP06"), },
+ },
+ { /* APM idle hangs */
+ apm_likes_to_melt, "AMI Bios",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
+ DMI_MATCH(DMI_BIOS_VERSION, "0AASNP05"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-N505X(DE) */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0206H"),
+ DMI_MATCH(DMI_BIOS_DATE, "08/23/99"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-N505VX */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "W2K06H0"),
+ DMI_MATCH(DMI_BIOS_DATE, "02/03/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-XG29 */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0117A0"),
+ DMI_MATCH(DMI_BIOS_DATE, "04/25/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-Z600NE */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0121Z1"),
+ DMI_MATCH(DMI_BIOS_DATE, "05/11/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-Z600NE */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "WME01Z1"),
+ DMI_MATCH(DMI_BIOS_DATE, "08/11/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-Z600LEK(DE) */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0206Z3"),
+ DMI_MATCH(DMI_BIOS_DATE, "12/25/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-Z505LS */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0203D0"),
+ DMI_MATCH(DMI_BIOS_DATE, "05/12/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-Z505LS */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0203Z3"),
+ DMI_MATCH(DMI_BIOS_DATE, "08/25/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-Z505LS (with updated BIOS) */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0209Z3"),
+ DMI_MATCH(DMI_BIOS_DATE, "05/12/01"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-F104K */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0204K2"),
+ DMI_MATCH(DMI_BIOS_DATE, "08/28/00"), },
+ },
+
+ { /* Handle problems with APM on Sony Vaio PCG-C1VN/C1VE */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0208P1"),
+ DMI_MATCH(DMI_BIOS_DATE, "11/09/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-C1VE */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "R0204P1"),
+ DMI_MATCH(DMI_BIOS_DATE, "09/12/00"), },
+ },
+ { /* Handle problems with APM on Sony Vaio PCG-C1VE */
+ swab_apm_power_in_minutes, "Sony VAIO",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
+ DMI_MATCH(DMI_BIOS_VERSION, "WXPO1Z3"),
+ DMI_MATCH(DMI_BIOS_DATE, "10/26/01"), },
+ },
+ { /* broken PM poweroff bios */
+ set_realmode_power_off, "Award Software v4.60 PGMA",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "Award Software International, Inc."),
+ DMI_MATCH(DMI_BIOS_VERSION, "4.60 PGMA"),
+ DMI_MATCH(DMI_BIOS_DATE, "134526184"), },
+ },
+
+ /* Generic per vendor APM settings */
+
+ { /* Allow interrupts during suspend on IBM laptops */
+ set_apm_ints, "IBM",
+ { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), },
+ },
+
+ { }
+};
+
+/*
+ * Just start the APM thread. We do NOT want to do APM BIOS
+ * calls from anything but the APM thread, if for no other reason
+ * than the fact that we don't trust the APM BIOS. This way,
+ * most common APM BIOS problems that lead to protection errors
+ * etc will have at least some level of being contained...
+ *
+ * In short, if something bad happens, at least we have a choice
+ * of just killing the apm thread..
+ */
+static int __init apm_init(void)
+{
+ struct desc_struct *gdt;
+ int err;
+
+ dmi_check_system(apm_dmi_table);
+
+ if (apm_info.bios.version == 0 || machine_is_olpc()) {
+ printk(KERN_INFO "apm: BIOS not found.\n");
+ return -ENODEV;
+ }
+ printk(KERN_INFO
+ "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n",
+ ((apm_info.bios.version >> 8) & 0xff),
+ (apm_info.bios.version & 0xff),
+ apm_info.bios.flags,
+ driver_version);
+ if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) {
+ printk(KERN_INFO "apm: no 32 bit BIOS support\n");
+ return -ENODEV;
+ }
+
+ if (allow_ints)
+ apm_info.allow_ints = 1;
+ if (broken_psr)
+ apm_info.get_power_status_broken = 1;
+ if (realmode_power_off)
+ apm_info.realmode_power_off = 1;
+ /* User can override, but default is to trust DMI */
+ if (apm_disabled != -1)
+ apm_info.disabled = apm_disabled;
+
+ /*
+ * Fix for the Compaq Contura 3/25c which reports BIOS version 0.1
+ * but is reportedly a 1.0 BIOS.
+ */
+ if (apm_info.bios.version == 0x001)
+ apm_info.bios.version = 0x100;
+
+ /* BIOS < 1.2 doesn't set cseg_16_len */
+ if (apm_info.bios.version < 0x102)
+ apm_info.bios.cseg_16_len = 0; /* 64k */
+
+ if (debug) {
+ printk(KERN_INFO "apm: entry %x:%x cseg16 %x dseg %x",
+ apm_info.bios.cseg, apm_info.bios.offset,
+ apm_info.bios.cseg_16, apm_info.bios.dseg);
+ if (apm_info.bios.version > 0x100)
+ printk(" cseg len %x, dseg len %x",
+ apm_info.bios.cseg_len,
+ apm_info.bios.dseg_len);
+ if (apm_info.bios.version > 0x101)
+ printk(" cseg16 len %x", apm_info.bios.cseg_16_len);
+ printk("\n");
+ }
+
+ if (apm_info.disabled) {
+ pr_notice("disabled on user request.\n");
+ return -ENODEV;
+ }
+ if ((num_online_cpus() > 1) && !power_off && !smp) {
+ pr_notice("disabled - APM is not SMP safe.\n");
+ apm_info.disabled = 1;
+ return -ENODEV;
+ }
+ if (!acpi_disabled) {
+ pr_notice("overridden by ACPI.\n");
+ apm_info.disabled = 1;
+ return -ENODEV;
+ }
+
+ /*
+ * Set up the long jump entry point to the APM BIOS, which is called
+ * from inline assembly.
+ */
+ apm_bios_entry.offset = apm_info.bios.offset;
+ apm_bios_entry.segment = APM_CS;
+
+ /*
+ * The APM 1.1 BIOS is supposed to provide limit information that it
+ * recognizes. Many machines do this correctly, but many others do
+ * not restrict themselves to their claimed limit. When this happens,
+ * they will cause a segmentation violation in the kernel at boot time.
+ * Most BIOS's, however, will respect a 64k limit, so we use that.
+ *
+ * Note we only set APM segments on CPU zero, since we pin the APM
+ * code to that CPU.
+ */
+ gdt = get_cpu_gdt_rw(0);
+ set_desc_base(&gdt[APM_CS >> 3],
+ (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4));
+ set_desc_base(&gdt[APM_CS_16 >> 3],
+ (unsigned long)__va((unsigned long)apm_info.bios.cseg_16 << 4));
+ set_desc_base(&gdt[APM_DS >> 3],
+ (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4));
+
+ proc_create_single("apm", 0, NULL, proc_apm_show);
+
+ kapmd_task = kthread_create(apm, NULL, "kapmd");
+ if (IS_ERR(kapmd_task)) {
+ pr_err("disabled - Unable to start kernel thread\n");
+ err = PTR_ERR(kapmd_task);
+ kapmd_task = NULL;
+ remove_proc_entry("apm", NULL);
+ return err;
+ }
+ wake_up_process(kapmd_task);
+
+ if (num_online_cpus() > 1 && !smp) {
+ printk(KERN_NOTICE
+ "apm: disabled - APM is not SMP safe (power off active).\n");
+ return 0;
+ }
+
+ /*
+ * Note we don't actually care if the misc_device cannot be registered.
+ * this driver can do its job without it, even if userspace can't
+ * control it. just log the error
+ */
+ if (misc_register(&apm_device))
+ printk(KERN_WARNING "apm: Could not register misc device.\n");
+
+ if (HZ != 100)
+ idle_period = (idle_period * HZ) / 100;
+ if (idle_threshold < 100) {
+ cpuidle_poll_state_init(&apm_idle_driver);
+ if (!cpuidle_register_driver(&apm_idle_driver))
+ if (cpuidle_register_device(&apm_cpuidle_device))
+ cpuidle_unregister_driver(&apm_idle_driver);
+ }
+
+ return 0;
+}
+
+static void __exit apm_exit(void)
+{
+ int error;
+
+ cpuidle_unregister_device(&apm_cpuidle_device);
+ cpuidle_unregister_driver(&apm_idle_driver);
+
+ if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
+ && (apm_info.connection_version > 0x0100)) {
+ error = apm_engage_power_management(APM_DEVICE_ALL, 0);
+ if (error)
+ apm_error("disengage power management", error);
+ }
+ misc_deregister(&apm_device);
+ remove_proc_entry("apm", NULL);
+ if (power_off)
+ pm_power_off = NULL;
+ if (kapmd_task) {
+ kthread_stop(kapmd_task);
+ kapmd_task = NULL;
+ }
+}
+
+module_init(apm_init);
+module_exit(apm_exit);
+
+MODULE_AUTHOR("Stephen Rothwell");
+MODULE_DESCRIPTION("Advanced Power Management");
+MODULE_LICENSE("GPL");
+module_param(debug, bool, 0644);
+MODULE_PARM_DESC(debug, "Enable debug mode");
+module_param(power_off, bool, 0444);
+MODULE_PARM_DESC(power_off, "Enable power off");
+module_param(bounce_interval, int, 0444);
+MODULE_PARM_DESC(bounce_interval,
+ "Set the number of ticks to ignore suspend bounces");
+module_param(allow_ints, bool, 0444);
+MODULE_PARM_DESC(allow_ints, "Allow interrupts during BIOS calls");
+module_param(broken_psr, bool, 0444);
+MODULE_PARM_DESC(broken_psr, "BIOS has a broken GetPowerStatus call");
+module_param(realmode_power_off, bool, 0444);
+MODULE_PARM_DESC(realmode_power_off,
+ "Switch to real mode before powering off");
+module_param(idle_threshold, int, 0444);
+MODULE_PARM_DESC(idle_threshold,
+ "System idle percentage above which to make APM BIOS idle calls");
+module_param(idle_period, int, 0444);
+MODULE_PARM_DESC(idle_period,
+ "Period (in sec/100) over which to calculate the idle percentage");
+module_param(smp, bool, 0444);
+MODULE_PARM_DESC(smp,
+ "Set this to enable APM use on an SMP platform. Use with caution on older systems");
+MODULE_ALIAS_MISCDEV(APM_MINOR_DEV);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
new file mode 100644
index 000000000..01de31db3
--- /dev/null
+++ b/arch/x86/kernel/asm-offsets.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed to extract
+ * and format the required data.
+ */
+#define COMPILE_OFFSETS
+
+#include <linux/crypto.h>
+#include <linux/sched.h>
+#include <linux/stddef.h>
+#include <linux/hardirq.h>
+#include <linux/suspend.h>
+#include <linux/kbuild.h>
+#include <asm/processor.h>
+#include <asm/thread_info.h>
+#include <asm/sigframe.h>
+#include <asm/bootparam.h>
+#include <asm/suspend.h>
+#include <asm/tlbflush.h>
+
+#ifdef CONFIG_XEN
+#include <xen/interface/xen.h>
+#endif
+
+#ifdef CONFIG_X86_32
+# include "asm-offsets_32.c"
+#else
+# include "asm-offsets_64.c"
+#endif
+
+void common(void) {
+ BLANK();
+ OFFSET(TASK_threadsp, task_struct, thread.sp);
+#ifdef CONFIG_STACKPROTECTOR
+ OFFSET(TASK_stack_canary, task_struct, stack_canary);
+#endif
+
+ BLANK();
+ OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
+ OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
+
+ BLANK();
+ OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
+
+ BLANK();
+ OFFSET(pbe_address, pbe, address);
+ OFFSET(pbe_orig_address, pbe, orig_address);
+ OFFSET(pbe_next, pbe, next);
+
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+ BLANK();
+ OFFSET(IA32_SIGCONTEXT_ax, sigcontext_32, ax);
+ OFFSET(IA32_SIGCONTEXT_bx, sigcontext_32, bx);
+ OFFSET(IA32_SIGCONTEXT_cx, sigcontext_32, cx);
+ OFFSET(IA32_SIGCONTEXT_dx, sigcontext_32, dx);
+ OFFSET(IA32_SIGCONTEXT_si, sigcontext_32, si);
+ OFFSET(IA32_SIGCONTEXT_di, sigcontext_32, di);
+ OFFSET(IA32_SIGCONTEXT_bp, sigcontext_32, bp);
+ OFFSET(IA32_SIGCONTEXT_sp, sigcontext_32, sp);
+ OFFSET(IA32_SIGCONTEXT_ip, sigcontext_32, ip);
+
+ BLANK();
+ OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
+#endif
+
+#ifdef CONFIG_PARAVIRT
+ BLANK();
+ OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
+ OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
+ OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
+ OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
+ OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
+ OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
+ OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
+#endif
+
+#ifdef CONFIG_XEN
+ BLANK();
+ OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+ OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+#endif
+
+ BLANK();
+ OFFSET(BP_scratch, boot_params, scratch);
+ OFFSET(BP_secure_boot, boot_params, secure_boot);
+ OFFSET(BP_loadflags, boot_params, hdr.loadflags);
+ OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
+ OFFSET(BP_version, boot_params, hdr.version);
+ OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
+ OFFSET(BP_init_size, boot_params, hdr.init_size);
+ OFFSET(BP_pref_address, boot_params, hdr.pref_address);
+ OFFSET(BP_code32_start, boot_params, hdr.code32_start);
+
+ BLANK();
+ DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+
+ /* TLB state for the entry code */
+ OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
+
+ /* Layout info for cpu_entry_area */
+ OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
+ OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
+ OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
+ DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
+ DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1)));
+
+ /* Offset for sp0 and sp1 into the tss_struct */
+ OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
+ OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
+}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
new file mode 100644
index 000000000..82826f227
--- /dev/null
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __LINUX_KBUILD_H
+# error "Please do not build this file directly, build asm-offsets.c instead"
+#endif
+
+#include <asm/ucontext.h>
+
+#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
+static char syscalls[] = {
+#include <asm/syscalls_32.h>
+};
+
+/* workaround for a warning with -Wmissing-prototypes */
+void foo(void);
+
+void foo(void)
+{
+ OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
+ OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
+ OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
+ OFFSET(CPUINFO_x86_stepping, cpuinfo_x86, x86_stepping);
+ OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
+ OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
+ OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
+ BLANK();
+
+ OFFSET(PT_EBX, pt_regs, bx);
+ OFFSET(PT_ECX, pt_regs, cx);
+ OFFSET(PT_EDX, pt_regs, dx);
+ OFFSET(PT_ESI, pt_regs, si);
+ OFFSET(PT_EDI, pt_regs, di);
+ OFFSET(PT_EBP, pt_regs, bp);
+ OFFSET(PT_EAX, pt_regs, ax);
+ OFFSET(PT_DS, pt_regs, ds);
+ OFFSET(PT_ES, pt_regs, es);
+ OFFSET(PT_FS, pt_regs, fs);
+ OFFSET(PT_GS, pt_regs, gs);
+ OFFSET(PT_ORIG_EAX, pt_regs, orig_ax);
+ OFFSET(PT_EIP, pt_regs, ip);
+ OFFSET(PT_CS, pt_regs, cs);
+ OFFSET(PT_EFLAGS, pt_regs, flags);
+ OFFSET(PT_OLDESP, pt_regs, sp);
+ OFFSET(PT_OLDSS, pt_regs, ss);
+ BLANK();
+
+ OFFSET(saved_context_gdt_desc, saved_context, gdt_desc);
+ BLANK();
+
+ /*
+ * Offset from the entry stack to task stack stored in TSS. Kernel entry
+ * happens on the per-cpu entry-stack, and the asm code switches to the
+ * task-stack pointer stored in x86_tss.sp1, which is a copy of
+ * task->thread.sp0 where entry code can find it.
+ */
+ DEFINE(TSS_entry2task_stack,
+ offsetof(struct cpu_entry_area, tss.x86_tss.sp1) -
+ offsetofend(struct cpu_entry_area, entry_stack_page.stack));
+
+#ifdef CONFIG_STACKPROTECTOR
+ BLANK();
+ OFFSET(stack_canary_offset, stack_canary, canary);
+#endif
+
+ BLANK();
+ DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
+ DEFINE(NR_syscalls, sizeof(syscalls));
+}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
new file mode 100644
index 000000000..3b9405e7b
--- /dev/null
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __LINUX_KBUILD_H
+# error "Please do not build this file directly, build asm-offsets.c instead"
+#endif
+
+#include <asm/ia32.h>
+
+#define __SYSCALL_64(nr, sym, qual) [nr] = 1,
+static char syscalls_64[] = {
+#include <asm/syscalls_64.h>
+};
+#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
+static char syscalls_ia32[] = {
+#include <asm/syscalls_32.h>
+};
+
+#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#include <asm/kvm_para.h>
+#endif
+
+int main(void)
+{
+#ifdef CONFIG_PARAVIRT
+ OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
+ OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
+#ifdef CONFIG_DEBUG_ENTRY
+ OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl);
+#endif
+ BLANK();
+#endif
+
+#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+ OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted);
+ BLANK();
+#endif
+
+#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
+ ENTRY(bx);
+ ENTRY(cx);
+ ENTRY(dx);
+ ENTRY(sp);
+ ENTRY(bp);
+ ENTRY(si);
+ ENTRY(di);
+ ENTRY(r8);
+ ENTRY(r9);
+ ENTRY(r10);
+ ENTRY(r11);
+ ENTRY(r12);
+ ENTRY(r13);
+ ENTRY(r14);
+ ENTRY(r15);
+ ENTRY(flags);
+ BLANK();
+#undef ENTRY
+
+#define ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry)
+ ENTRY(cr0);
+ ENTRY(cr2);
+ ENTRY(cr3);
+ ENTRY(cr4);
+ ENTRY(cr8);
+ ENTRY(gdt_desc);
+ BLANK();
+#undef ENTRY
+
+ OFFSET(TSS_ist, tss_struct, x86_tss.ist);
+ BLANK();
+
+#ifdef CONFIG_STACKPROTECTOR
+ DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary));
+ BLANK();
+#endif
+
+ DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
+ DEFINE(NR_syscalls, sizeof(syscalls_64));
+
+ DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1);
+ DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32));
+
+ return 0;
+}
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
new file mode 100644
index 000000000..e1efe44eb
--- /dev/null
+++ b/arch/x86/kernel/audit_64.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/audit.h>
+#include <asm/unistd.h>
+
+static unsigned dir_class[] = {
+#include <asm-generic/audit_dir_write.h>
+~0U
+};
+
+static unsigned read_class[] = {
+#include <asm-generic/audit_read.h>
+~0U
+};
+
+static unsigned write_class[] = {
+#include <asm-generic/audit_write.h>
+~0U
+};
+
+static unsigned chattr_class[] = {
+#include <asm-generic/audit_change_attr.h>
+~0U
+};
+
+static unsigned signal_class[] = {
+#include <asm-generic/audit_signal.h>
+~0U
+};
+
+int audit_classify_arch(int arch)
+{
+#ifdef CONFIG_IA32_EMULATION
+ if (arch == AUDIT_ARCH_I386)
+ return 1;
+#endif
+ return 0;
+}
+
+int audit_classify_syscall(int abi, unsigned syscall)
+{
+#ifdef CONFIG_IA32_EMULATION
+ extern int ia32_classify_syscall(unsigned);
+ if (abi == AUDIT_ARCH_I386)
+ return ia32_classify_syscall(syscall);
+#endif
+ switch(syscall) {
+ case __NR_open:
+ return 2;
+ case __NR_openat:
+ return 3;
+ case __NR_execve:
+ case __NR_execveat:
+ return 5;
+ default:
+ return 0;
+ }
+}
+
+static int __init audit_classes_init(void)
+{
+#ifdef CONFIG_IA32_EMULATION
+ extern __u32 ia32_dir_class[];
+ extern __u32 ia32_write_class[];
+ extern __u32 ia32_read_class[];
+ extern __u32 ia32_chattr_class[];
+ extern __u32 ia32_signal_class[];
+ audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class);
+ audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class);
+ audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class);
+ audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class);
+ audit_register_class(AUDIT_CLASS_SIGNAL_32, ia32_signal_class);
+#endif
+ audit_register_class(AUDIT_CLASS_WRITE, write_class);
+ audit_register_class(AUDIT_CLASS_READ, read_class);
+ audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class);
+ audit_register_class(AUDIT_CLASS_CHATTR, chattr_class);
+ audit_register_class(AUDIT_CLASS_SIGNAL, signal_class);
+ return 0;
+}
+
+__initcall(audit_classes_init);
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
new file mode 100644
index 000000000..3fed7ae58
--- /dev/null
+++ b/arch/x86/kernel/bootflag.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Implement 'Simple Boot Flag Specification 2.0'
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/acpi.h>
+#include <asm/io.h>
+
+#include <linux/mc146818rtc.h>
+
+#define SBF_RESERVED (0x78)
+#define SBF_PNPOS (1<<0)
+#define SBF_BOOTING (1<<1)
+#define SBF_DIAG (1<<2)
+#define SBF_PARITY (1<<7)
+
+int sbf_port __initdata = -1; /* set via acpi_boot_init() */
+
+static int __init parity(u8 v)
+{
+ int x = 0;
+ int i;
+
+ for (i = 0; i < 8; i++) {
+ x ^= (v & 1);
+ v >>= 1;
+ }
+
+ return x;
+}
+
+static void __init sbf_write(u8 v)
+{
+ unsigned long flags;
+
+ if (sbf_port != -1) {
+ v &= ~SBF_PARITY;
+ if (!parity(v))
+ v |= SBF_PARITY;
+
+ printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n",
+ sbf_port, v);
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ CMOS_WRITE(v, sbf_port);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ }
+}
+
+static u8 __init sbf_read(void)
+{
+ unsigned long flags;
+ u8 v;
+
+ if (sbf_port == -1)
+ return 0;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ v = CMOS_READ(sbf_port);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+
+ return v;
+}
+
+static int __init sbf_value_valid(u8 v)
+{
+ if (v & SBF_RESERVED) /* Reserved bits */
+ return 0;
+ if (!parity(v))
+ return 0;
+
+ return 1;
+}
+
+static int __init sbf_init(void)
+{
+ u8 v;
+
+ if (sbf_port == -1)
+ return 0;
+
+ v = sbf_read();
+ if (!sbf_value_valid(v)) {
+ printk(KERN_WARNING "Simple Boot Flag value 0x%x read from "
+ "CMOS RAM was invalid\n", v);
+ }
+
+ v &= ~SBF_RESERVED;
+ v &= ~SBF_BOOTING;
+ v &= ~SBF_DIAG;
+#if defined(CONFIG_ISAPNP)
+ v |= SBF_PNPOS;
+#endif
+ sbf_write(v);
+
+ return 0;
+}
+arch_initcall(sbf_init);
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
new file mode 100644
index 000000000..cc8258a53
--- /dev/null
+++ b/arch/x86/kernel/check.c
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/workqueue.h>
+#include <linux/memblock.h>
+
+#include <asm/proto.h>
+
+/*
+ * Some BIOSes seem to corrupt the low 64k of memory during events
+ * like suspend/resume and unplugging an HDMI cable. Reserve all
+ * remaining free memory in that area and fill it with a distinct
+ * pattern.
+ */
+#define MAX_SCAN_AREAS 8
+
+static int __read_mostly memory_corruption_check = -1;
+
+static unsigned __read_mostly corruption_check_size = 64*1024;
+static unsigned __read_mostly corruption_check_period = 60; /* seconds */
+
+static struct scan_area {
+ u64 addr;
+ u64 size;
+} scan_areas[MAX_SCAN_AREAS];
+static int num_scan_areas;
+
+static __init int set_corruption_check(char *arg)
+{
+ ssize_t ret;
+ unsigned long val;
+
+ if (!arg) {
+ pr_err("memory_corruption_check config string not provided\n");
+ return -EINVAL;
+ }
+
+ ret = kstrtoul(arg, 10, &val);
+ if (ret)
+ return ret;
+
+ memory_corruption_check = val;
+ return 0;
+}
+early_param("memory_corruption_check", set_corruption_check);
+
+static __init int set_corruption_check_period(char *arg)
+{
+ ssize_t ret;
+ unsigned long val;
+
+ if (!arg) {
+ pr_err("memory_corruption_check_period config string not provided\n");
+ return -EINVAL;
+ }
+
+ ret = kstrtoul(arg, 10, &val);
+ if (ret)
+ return ret;
+
+ corruption_check_period = val;
+ return 0;
+}
+early_param("memory_corruption_check_period", set_corruption_check_period);
+
+static __init int set_corruption_check_size(char *arg)
+{
+ char *end;
+ unsigned size;
+
+ if (!arg) {
+ pr_err("memory_corruption_check_size config string not provided\n");
+ return -EINVAL;
+ }
+
+ size = memparse(arg, &end);
+
+ if (*end == '\0')
+ corruption_check_size = size;
+
+ return (size == corruption_check_size) ? 0 : -EINVAL;
+}
+early_param("memory_corruption_check_size", set_corruption_check_size);
+
+
+void __init setup_bios_corruption_check(void)
+{
+ phys_addr_t start, end;
+ u64 i;
+
+ if (memory_corruption_check == -1) {
+ memory_corruption_check =
+#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
+ 1
+#else
+ 0
+#endif
+ ;
+ }
+
+ if (corruption_check_size == 0)
+ memory_corruption_check = 0;
+
+ if (!memory_corruption_check)
+ return;
+
+ corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
+
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+ NULL) {
+ start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
+ PAGE_SIZE, corruption_check_size);
+ end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
+ PAGE_SIZE, corruption_check_size);
+ if (start >= end)
+ continue;
+
+ memblock_reserve(start, end - start);
+ scan_areas[num_scan_areas].addr = start;
+ scan_areas[num_scan_areas].size = end - start;
+
+ /* Assume we've already mapped this early memory */
+ memset(__va(start), 0, end - start);
+
+ if (++num_scan_areas >= MAX_SCAN_AREAS)
+ break;
+ }
+
+ if (num_scan_areas)
+ printk(KERN_INFO "Scanning %d areas for low memory corruption\n", num_scan_areas);
+}
+
+
+void check_for_bios_corruption(void)
+{
+ int i;
+ int corruption = 0;
+
+ if (!memory_corruption_check)
+ return;
+
+ for (i = 0; i < num_scan_areas; i++) {
+ unsigned long *addr = __va(scan_areas[i].addr);
+ unsigned long size = scan_areas[i].size;
+
+ for (; size; addr++, size -= sizeof(unsigned long)) {
+ if (!*addr)
+ continue;
+ printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
+ addr, __pa(addr), *addr);
+ corruption = 1;
+ *addr = 0;
+ }
+ }
+
+ WARN_ONCE(corruption, KERN_ERR "Memory corruption detected in low memory\n");
+}
+
+static void check_corruption(struct work_struct *dummy);
+static DECLARE_DELAYED_WORK(bios_check_work, check_corruption);
+
+static void check_corruption(struct work_struct *dummy)
+{
+ check_for_bios_corruption();
+ schedule_delayed_work(&bios_check_work,
+ round_jiffies_relative(corruption_check_period*HZ));
+}
+
+static int start_periodic_check_for_corruption(void)
+{
+ if (!num_scan_areas || !memory_corruption_check || corruption_check_period == 0)
+ return 0;
+
+ printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
+ corruption_check_period);
+
+ /* First time we run the checks right away */
+ schedule_delayed_work(&bios_check_work, 0);
+ return 0;
+}
+device_initcall(start_periodic_check_for_corruption);
+
diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore
new file mode 100644
index 000000000..667df55a4
--- /dev/null
+++ b/arch/x86/kernel/cpu/.gitignore
@@ -0,0 +1 @@
+capflags.c
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
new file mode 100644
index 000000000..320769b48
--- /dev/null
+++ b/arch/x86/kernel/cpu/Makefile
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for x86-compatible CPU details, features and quirks
+#
+
+# Don't trace early stages of a secondary CPU boot
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_common.o = -pg
+CFLAGS_REMOVE_perf_event.o = -pg
+endif
+
+# If these files are instrumented, boot hangs during the first second.
+KCOV_INSTRUMENT_common.o := n
+KCOV_INSTRUMENT_perf_event.o := n
+
+# Make sure load_percpu_segment has no stackprotector
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_common.o := $(nostackp)
+
+obj-y := cacheinfo.o scattered.o topology.o
+obj-y += common.o
+obj-y += rdrand.o
+obj-y += match.o
+obj-y += bugs.o
+obj-y += aperfmperf.o
+obj-y += cpuid-deps.o
+
+obj-$(CONFIG_PROC_FS) += proc.o
+obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
+
+obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o tsx.o
+obj-$(CONFIG_CPU_SUP_AMD) += amd.o
+obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
+obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
+obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
+obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
+
+obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o
+obj-$(CONFIG_INTEL_RDT) += intel_rdt_ctrlmondata.o intel_rdt_pseudo_lock.o
+CFLAGS_intel_rdt_pseudo_lock.o = -I$(src)
+
+obj-$(CONFIG_X86_MCE) += mcheck/
+obj-$(CONFIG_MTRR) += mtrr/
+obj-$(CONFIG_MICROCODE) += microcode/
+
+obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
+
+obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o
+
+ifdef CONFIG_X86_FEATURE_NAMES
+quiet_cmd_mkcapflags = MKCAP $@
+ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@
+
+cpufeature = $(src)/../../include/asm/cpufeatures.h
+
+targets += capflags.c
+$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE
+ $(call if_changed,mkcapflags)
+endif
+clean-files += capflags.c
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
new file mode 100644
index 000000000..98c23126f
--- /dev/null
+++ b/arch/x86/kernel/cpu/amd.c
@@ -0,0 +1,1177 @@
+#include <linux/export.h>
+#include <linux/bitops.h>
+#include <linux/elf.h>
+#include <linux/mm.h>
+
+#include <linux/io.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/random.h>
+#include <asm/processor.h>
+#include <asm/apic.h>
+#include <asm/cacheinfo.h>
+#include <asm/cpu.h>
+#include <asm/spec-ctrl.h>
+#include <asm/smp.h>
+#include <asm/pci-direct.h>
+#include <asm/delay.h>
+
+#ifdef CONFIG_X86_64
+# include <asm/mmconfig.h>
+# include <asm/set_memory.h>
+#endif
+
+#include "cpu.h"
+
+static const int amd_erratum_383[];
+static const int amd_erratum_400[];
+static const int amd_erratum_1054[];
+static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
+
+/*
+ * nodes_per_socket: Stores the number of nodes per socket.
+ * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX
+ * Node Identifiers[10:8]
+ */
+static u32 nodes_per_socket = 1;
+
+static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
+{
+ u32 gprs[8] = { 0 };
+ int err;
+
+ WARN_ONCE((boot_cpu_data.x86 != 0xf),
+ "%s should only be used on K8!\n", __func__);
+
+ gprs[1] = msr;
+ gprs[7] = 0x9c5a203a;
+
+ err = rdmsr_safe_regs(gprs);
+
+ *p = gprs[0] | ((u64)gprs[2] << 32);
+
+ return err;
+}
+
+static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
+{
+ u32 gprs[8] = { 0 };
+
+ WARN_ONCE((boot_cpu_data.x86 != 0xf),
+ "%s should only be used on K8!\n", __func__);
+
+ gprs[0] = (u32)val;
+ gprs[1] = msr;
+ gprs[2] = val >> 32;
+ gprs[7] = 0x9c5a203a;
+
+ return wrmsr_safe_regs(gprs);
+}
+
+/*
+ * B step AMD K6 before B 9730xxxx have hardware bugs that can cause
+ * misexecution of code under Linux. Owners of such processors should
+ * contact AMD for precise details and a CPU swap.
+ *
+ * See http://www.multimania.com/poulot/k6bug.html
+ * and section 2.6.2 of "AMD-K6 Processor Revision Guide - Model 6"
+ * (Publication # 21266 Issue Date: August 1998)
+ *
+ * The following test is erm.. interesting. AMD neglected to up
+ * the chip setting when fixing the bug but they also tweaked some
+ * performance at the same time..
+ */
+
+extern __visible void vide(void);
+__asm__(".globl vide\n"
+ ".type vide, @function\n"
+ ".align 4\n"
+ "vide: ret\n");
+
+static void init_amd_k5(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_32
+/*
+ * General Systems BIOSen alias the cpu frequency registers
+ * of the Elan at 0x000df000. Unfortunately, one of the Linux
+ * drivers subsequently pokes it, and changes the CPU speed.
+ * Workaround : Remove the unneeded alias.
+ */
+#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
+#define CBAR_ENB (0x80000000)
+#define CBAR_KEY (0X000000CB)
+ if (c->x86_model == 9 || c->x86_model == 10) {
+ if (inl(CBAR) & CBAR_ENB)
+ outl(0 | CBAR_KEY, CBAR);
+ }
+#endif
+}
+
+static void init_amd_k6(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_32
+ u32 l, h;
+ int mbytes = get_num_physpages() >> (20-PAGE_SHIFT);
+
+ if (c->x86_model < 6) {
+ /* Based on AMD doc 20734R - June 2000 */
+ if (c->x86_model == 0) {
+ clear_cpu_cap(c, X86_FEATURE_APIC);
+ set_cpu_cap(c, X86_FEATURE_PGE);
+ }
+ return;
+ }
+
+ if (c->x86_model == 6 && c->x86_stepping == 1) {
+ const int K6_BUG_LOOP = 1000000;
+ int n;
+ void (*f_vide)(void);
+ u64 d, d2;
+
+ pr_info("AMD K6 stepping B detected - ");
+
+ /*
+ * It looks like AMD fixed the 2.6.2 bug and improved indirect
+ * calls at the same time.
+ */
+
+ n = K6_BUG_LOOP;
+ f_vide = vide;
+ OPTIMIZER_HIDE_VAR(f_vide);
+ d = rdtsc();
+ while (n--)
+ f_vide();
+ d2 = rdtsc();
+ d = d2-d;
+
+ if (d > 20*K6_BUG_LOOP)
+ pr_cont("system stability may be impaired when more than 32 MB are used.\n");
+ else
+ pr_cont("probably OK (after B9730xxxx).\n");
+ }
+
+ /* K6 with old style WHCR */
+ if (c->x86_model < 8 ||
+ (c->x86_model == 8 && c->x86_stepping < 8)) {
+ /* We can only write allocate on the low 508Mb */
+ if (mbytes > 508)
+ mbytes = 508;
+
+ rdmsr(MSR_K6_WHCR, l, h);
+ if ((l&0x0000FFFF) == 0) {
+ unsigned long flags;
+ l = (1<<0)|((mbytes/4)<<1);
+ local_irq_save(flags);
+ wbinvd();
+ wrmsr(MSR_K6_WHCR, l, h);
+ local_irq_restore(flags);
+ pr_info("Enabling old style K6 write allocation for %d Mb\n",
+ mbytes);
+ }
+ return;
+ }
+
+ if ((c->x86_model == 8 && c->x86_stepping > 7) ||
+ c->x86_model == 9 || c->x86_model == 13) {
+ /* The more serious chips .. */
+
+ if (mbytes > 4092)
+ mbytes = 4092;
+
+ rdmsr(MSR_K6_WHCR, l, h);
+ if ((l&0xFFFF0000) == 0) {
+ unsigned long flags;
+ l = ((mbytes>>2)<<22)|(1<<16);
+ local_irq_save(flags);
+ wbinvd();
+ wrmsr(MSR_K6_WHCR, l, h);
+ local_irq_restore(flags);
+ pr_info("Enabling new style K6 write allocation for %d Mb\n",
+ mbytes);
+ }
+
+ return;
+ }
+
+ if (c->x86_model == 10) {
+ /* AMD Geode LX is model 10 */
+ /* placeholder for any needed mods */
+ return;
+ }
+#endif
+}
+
+static void init_amd_k7(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_32
+ u32 l, h;
+
+ /*
+ * Bit 15 of Athlon specific MSR 15, needs to be 0
+ * to enable SSE on Palomino/Morgan/Barton CPU's.
+ * If the BIOS didn't enable it already, enable it here.
+ */
+ if (c->x86_model >= 6 && c->x86_model <= 10) {
+ if (!cpu_has(c, X86_FEATURE_XMM)) {
+ pr_info("Enabling disabled K7/SSE Support.\n");
+ msr_clear_bit(MSR_K7_HWCR, 15);
+ set_cpu_cap(c, X86_FEATURE_XMM);
+ }
+ }
+
+ /*
+ * It's been determined by AMD that Athlons since model 8 stepping 1
+ * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
+ * As per AMD technical note 27212 0.2
+ */
+ if ((c->x86_model == 8 && c->x86_stepping >= 1) || (c->x86_model > 8)) {
+ rdmsr(MSR_K7_CLK_CTL, l, h);
+ if ((l & 0xfff00000) != 0x20000000) {
+ pr_info("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
+ l, ((l & 0x000fffff)|0x20000000));
+ wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
+ }
+ }
+
+ /* calling is from identify_secondary_cpu() ? */
+ if (!c->cpu_index)
+ return;
+
+ /*
+ * Certain Athlons might work (for various values of 'work') in SMP
+ * but they are not certified as MP capable.
+ */
+ /* Athlon 660/661 is valid. */
+ if ((c->x86_model == 6) && ((c->x86_stepping == 0) ||
+ (c->x86_stepping == 1)))
+ return;
+
+ /* Duron 670 is valid */
+ if ((c->x86_model == 7) && (c->x86_stepping == 0))
+ return;
+
+ /*
+ * Athlon 662, Duron 671, and Athlon >model 7 have capability
+ * bit. It's worth noting that the A5 stepping (662) of some
+ * Athlon XP's have the MP bit set.
+ * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
+ * more.
+ */
+ if (((c->x86_model == 6) && (c->x86_stepping >= 2)) ||
+ ((c->x86_model == 7) && (c->x86_stepping >= 1)) ||
+ (c->x86_model > 7))
+ if (cpu_has(c, X86_FEATURE_MP))
+ return;
+
+ /* If we get here, not a certified SMP capable AMD system. */
+
+ /*
+ * Don't taint if we are running SMP kernel on a single non-MP
+ * approved Athlon
+ */
+ WARN_ONCE(1, "WARNING: This combination of AMD"
+ " processors is not suitable for SMP.\n");
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
+#endif
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * To workaround broken NUMA config. Read the comment in
+ * srat_detect_node().
+ */
+static int nearby_node(int apicid)
+{
+ int i, node;
+
+ for (i = apicid - 1; i >= 0; i--) {
+ node = __apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
+ node = __apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ return first_node(node_online_map); /* Shouldn't happen */
+}
+#endif
+
+/*
+ * Fix up cpu_core_id for pre-F17h systems to be in the
+ * [0 .. cores_per_node - 1] range. Not really needed but
+ * kept so as not to break existing setups.
+ */
+static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
+{
+ u32 cus_per_node;
+
+ if (c->x86 >= 0x17)
+ return;
+
+ cus_per_node = c->x86_max_cores / nodes_per_socket;
+ c->cpu_core_id %= cus_per_node;
+}
+
+
+static void amd_get_topology_early(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_TOPOEXT))
+ smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
+}
+
+/*
+ * Fixup core topology information for
+ * (1) AMD multi-node processors
+ * Assumption: Number of cores in each internal node is the same.
+ * (2) AMD processors supporting compute units
+ */
+static void amd_get_topology(struct cpuinfo_x86 *c)
+{
+ u8 node_id;
+ int cpu = smp_processor_id();
+
+ /* get information required for multi-node processors */
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ int err;
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
+
+ node_id = ecx & 0xff;
+
+ if (c->x86 == 0x15)
+ c->cu_id = ebx & 0xff;
+
+ if (c->x86 >= 0x17) {
+ c->cpu_core_id = ebx & 0xff;
+
+ if (smp_num_siblings > 1)
+ c->x86_max_cores /= smp_num_siblings;
+ }
+
+ /*
+ * In case leaf B is available, use it to derive
+ * topology information.
+ */
+ err = detect_extended_topology(c);
+ if (!err)
+ c->x86_coreid_bits = get_count_order(c->x86_max_cores);
+
+ cacheinfo_amd_init_llc_id(c, cpu, node_id);
+
+ } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
+ u64 value;
+
+ rdmsrl(MSR_FAM10H_NODE_ID, value);
+ node_id = value & 7;
+
+ per_cpu(cpu_llc_id, cpu) = node_id;
+ } else
+ return;
+
+ if (nodes_per_socket > 1) {
+ set_cpu_cap(c, X86_FEATURE_AMD_DCM);
+ legacy_fixup_core_id(c);
+ }
+}
+
+/*
+ * On a AMD dual core setup the lower bits of the APIC id distinguish the cores.
+ * Assumes number of cores is a power of two.
+ */
+static void amd_detect_cmp(struct cpuinfo_x86 *c)
+{
+ unsigned bits;
+ int cpu = smp_processor_id();
+
+ bits = c->x86_coreid_bits;
+ /* Low order bits define the core id (index of core in socket) */
+ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
+ /* Convert the initial APIC ID into the socket ID */
+ c->phys_proc_id = c->initial_apicid >> bits;
+ /* use socket ID also for last level cache */
+ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
+}
+
+u16 amd_get_nb_id(int cpu)
+{
+ return per_cpu(cpu_llc_id, cpu);
+}
+EXPORT_SYMBOL_GPL(amd_get_nb_id);
+
+u32 amd_get_nodes_per_socket(void)
+{
+ return nodes_per_socket;
+}
+EXPORT_SYMBOL_GPL(amd_get_nodes_per_socket);
+
+static void srat_detect_node(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_NUMA
+ int cpu = smp_processor_id();
+ int node;
+ unsigned apicid = c->apicid;
+
+ node = numa_cpu_node(cpu);
+ if (node == NUMA_NO_NODE)
+ node = per_cpu(cpu_llc_id, cpu);
+
+ /*
+ * On multi-fabric platform (e.g. Numascale NumaChip) a
+ * platform-specific handler needs to be called to fixup some
+ * IDs of the CPU.
+ */
+ if (x86_cpuinit.fixup_cpu_id)
+ x86_cpuinit.fixup_cpu_id(c, node);
+
+ if (!node_online(node)) {
+ /*
+ * Two possibilities here:
+ *
+ * - The CPU is missing memory and no node was created. In
+ * that case try picking one from a nearby CPU.
+ *
+ * - The APIC IDs differ from the HyperTransport node IDs
+ * which the K8 northbridge parsing fills in. Assume
+ * they are all increased by a constant offset, but in
+ * the same order as the HT nodeids. If that doesn't
+ * result in a usable node fall back to the path for the
+ * previous case.
+ *
+ * This workaround operates directly on the mapping between
+ * APIC ID and NUMA node, assuming certain relationship
+ * between APIC ID, HT node ID and NUMA topology. As going
+ * through CPU mapping may alter the outcome, directly
+ * access __apicid_to_node[].
+ */
+ int ht_nodeid = c->initial_apicid;
+
+ if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+ node = __apicid_to_node[ht_nodeid];
+ /* Pick a nearby node */
+ if (!node_online(node))
+ node = nearby_node(apicid);
+ }
+ numa_set_node(cpu, node);
+#endif
+}
+
+static void early_init_amd_mc(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ unsigned bits, ecx;
+
+ /* Multi core CPU? */
+ if (c->extended_cpuid_level < 0x80000008)
+ return;
+
+ ecx = cpuid_ecx(0x80000008);
+
+ c->x86_max_cores = (ecx & 0xff) + 1;
+
+ /* CPU telling us the core id bits shift? */
+ bits = (ecx >> 12) & 0xF;
+
+ /* Otherwise recompute */
+ if (bits == 0) {
+ while ((1 << bits) < c->x86_max_cores)
+ bits++;
+ }
+
+ c->x86_coreid_bits = bits;
+#endif
+}
+
+static void bsp_init_amd(struct cpuinfo_x86 *c)
+{
+
+#ifdef CONFIG_X86_64
+ if (c->x86 >= 0xf) {
+ unsigned long long tseg;
+
+ /*
+ * Split up direct mapping around the TSEG SMM area.
+ * Don't do it for gbpages because there seems very little
+ * benefit in doing so.
+ */
+ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+ unsigned long pfn = tseg >> PAGE_SHIFT;
+
+ pr_debug("tseg: %010llx\n", tseg);
+ if (pfn_range_is_mapped(pfn, pfn + 1))
+ set_memory_4k((unsigned long)__va(tseg), 1);
+ }
+ }
+#endif
+
+ if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
+
+ if (c->x86 > 0x10 ||
+ (c->x86 == 0x10 && c->x86_model >= 0x2)) {
+ u64 val;
+
+ rdmsrl(MSR_K7_HWCR, val);
+ if (!(val & BIT(24)))
+ pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n");
+ }
+ }
+
+ if (c->x86 == 0x15) {
+ unsigned long upperbit;
+ u32 cpuid, assoc;
+
+ cpuid = cpuid_edx(0x80000005);
+ assoc = cpuid >> 16 & 0xff;
+ upperbit = ((cpuid >> 24) << 10) / assoc;
+
+ va_align.mask = (upperbit - 1) & PAGE_MASK;
+ va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
+
+ /* A random value per boot for bit slice [12:upper_bit) */
+ va_align.bits = get_random_int() & va_align.mask;
+ }
+
+ if (cpu_has(c, X86_FEATURE_MWAITX))
+ use_mwaitx_delay();
+
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ u32 ecx;
+
+ ecx = cpuid_ecx(0x8000001e);
+ nodes_per_socket = ((ecx >> 8) & 7) + 1;
+ } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
+ u64 value;
+
+ rdmsrl(MSR_FAM10H_NODE_ID, value);
+ nodes_per_socket = ((value >> 3) & 7) + 1;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_VIRT_SSBD) &&
+ c->x86 >= 0x15 && c->x86 <= 0x17) {
+ unsigned int bit;
+
+ switch (c->x86) {
+ case 0x15: bit = 54; break;
+ case 0x16: bit = 33; break;
+ case 0x17: bit = 10; break;
+ default: return;
+ }
+ /*
+ * Try to cache the base value so further operations can
+ * avoid RMW. If that faults, do not enable SSBD.
+ */
+ if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
+ setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD);
+ setup_force_cpu_cap(X86_FEATURE_SSBD);
+ x86_amd_ls_cfg_ssbd_mask = 1ULL << bit;
+ }
+ }
+}
+
+static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
+{
+ u64 msr;
+
+ /*
+ * BIOS support is required for SME and SEV.
+ * For SME: If BIOS has enabled SME then adjust x86_phys_bits by
+ * the SME physical address space reduction value.
+ * If BIOS has not enabled SME then don't advertise the
+ * SME feature (set in scattered.c).
+ * For SEV: If BIOS has not enabled SEV then don't advertise the
+ * SEV feature (set in scattered.c).
+ *
+ * In all cases, since support for SME and SEV requires long mode,
+ * don't advertise the feature under CONFIG_X86_32.
+ */
+ if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) {
+ /* Check if memory encryption is enabled */
+ rdmsrl(MSR_K8_SYSCFG, msr);
+ if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ goto clear_all;
+
+ /*
+ * Always adjust physical address bits. Even though this
+ * will be a value above 32-bits this is still done for
+ * CONFIG_X86_32 so that accurate values are reported.
+ */
+ c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ goto clear_all;
+
+ rdmsrl(MSR_K7_HWCR, msr);
+ if (!(msr & MSR_K7_HWCR_SMMLOCK))
+ goto clear_sev;
+
+ return;
+
+clear_all:
+ setup_clear_cpu_cap(X86_FEATURE_SME);
+clear_sev:
+ setup_clear_cpu_cap(X86_FEATURE_SEV);
+ }
+}
+
+static void early_init_amd(struct cpuinfo_x86 *c)
+{
+ u64 value;
+ u32 dummy;
+
+ early_init_amd_mc(c);
+
+#ifdef CONFIG_X86_32
+ if (c->x86 == 6)
+ set_cpu_cap(c, X86_FEATURE_K7);
+#endif
+
+ if (c->x86 >= 0xf)
+ set_cpu_cap(c, X86_FEATURE_K8);
+
+ rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
+ /*
+ * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
+ * with P/T states and does not stop in deep C-states
+ */
+ if (c->x86_power & (1 << 8)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ }
+
+ /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */
+ if (c->x86_power & BIT(12))
+ set_cpu_cap(c, X86_FEATURE_ACC_POWER);
+
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_SYSCALL32);
+#else
+ /* Set MTRR capability flag if appropriate */
+ if (c->x86 == 5)
+ if (c->x86_model == 13 || c->x86_model == 9 ||
+ (c->x86_model == 8 && c->x86_stepping >= 8))
+ set_cpu_cap(c, X86_FEATURE_K6_MTRR);
+#endif
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
+ /*
+ * ApicID can always be treated as an 8-bit value for AMD APIC versions
+ * >= 0x10, but even old K8s came out of reset with version 0x10. So, we
+ * can safely set X86_FEATURE_EXTD_APICID unconditionally for families
+ * after 16h.
+ */
+ if (boot_cpu_has(X86_FEATURE_APIC)) {
+ if (c->x86 > 0x16)
+ set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+ else if (c->x86 >= 0xf) {
+ /* check CPU config space for extended APIC ID */
+ unsigned int val;
+
+ val = read_pci_config(0, 24, 0, 0x68);
+ if ((val >> 17 & 0x3) == 0x3)
+ set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+ }
+ }
+#endif
+
+ /*
+ * This is only needed to tell the kernel whether to use VMCALL
+ * and VMMCALL. VMMCALL is never executed except under virt, so
+ * we can set it unconditionally.
+ */
+ set_cpu_cap(c, X86_FEATURE_VMMCALL);
+
+ /* F16h erratum 793, CVE-2013-6885 */
+ if (c->x86 == 0x16 && c->x86_model <= 0xf)
+ msr_set_bit(MSR_AMD64_LS_CFG, 15);
+
+ /*
+ * Check whether the machine is affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the check
+ * whether the machine is affected in arch_post_acpi_init(), which
+ * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+ */
+ if (cpu_has_amd_erratum(c, amd_erratum_400))
+ set_cpu_bug(c, X86_BUG_AMD_E400);
+
+ early_detect_mem_encrypt(c);
+
+ /* Re-enable TopologyExtensions if switched off by BIOS */
+ if (c->x86 == 0x15 &&
+ (c->x86_model >= 0x10 && c->x86_model <= 0x6f) &&
+ !cpu_has(c, X86_FEATURE_TOPOEXT)) {
+
+ if (msr_set_bit(0xc0011005, 54) > 0) {
+ rdmsrl(0xc0011005, value);
+ if (value & BIT_64(54)) {
+ set_cpu_cap(c, X86_FEATURE_TOPOEXT);
+ pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
+ }
+ }
+ }
+
+ amd_get_topology_early(c);
+}
+
+static void init_amd_k8(struct cpuinfo_x86 *c)
+{
+ u32 level;
+ u64 value;
+
+ /* On C+ stepping K8 rep microcode works well for copy/memset */
+ level = cpuid_eax(1);
+ if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /*
+ * Some BIOSes incorrectly force this feature, but only K8 revision D
+ * (model = 0x14) and later actually support it.
+ * (AMD Erratum #110, docId: 25759).
+ */
+ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
+ clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
+ if (!rdmsrl_amd_safe(0xc001100d, &value)) {
+ value &= ~BIT_64(32);
+ wrmsrl_amd_safe(0xc001100d, value);
+ }
+ }
+
+ if (!c->x86_model_id[0])
+ strcpy(c->x86_model_id, "Hammer");
+
+#ifdef CONFIG_SMP
+ /*
+ * Disable TLB flush filter by setting HWCR.FFDIS on K8
+ * bit 6 of msr C001_0015
+ *
+ * Errata 63 for SH-B3 steppings
+ * Errata 122 for all steppings (F+ have it disabled by default)
+ */
+ msr_set_bit(MSR_K7_HWCR, 6);
+#endif
+ set_cpu_bug(c, X86_BUG_SWAPGS_FENCE);
+}
+
+static void init_amd_gh(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_MMCONF_FAM10H
+ /* do this for boot cpu */
+ if (c == &boot_cpu_data)
+ check_enable_amd_mmconf_dmi();
+
+ fam10h_check_enable_mmcfg();
+#endif
+
+ /*
+ * Disable GART TLB Walk Errors on Fam10h. We do this here because this
+ * is always needed when GART is enabled, even in a kernel which has no
+ * MCE support built in. BIOS should disable GartTlbWlk Errors already.
+ * If it doesn't, we do it here as suggested by the BKDG.
+ *
+ * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
+ */
+ msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);
+
+ /*
+ * On family 10h BIOS may not have properly enabled WC+ support, causing
+ * it to be converted to CD memtype. This may result in performance
+ * degradation for certain nested-paging guests. Prevent this conversion
+ * by clearing bit 24 in MSR_AMD64_BU_CFG2.
+ *
+ * NOTE: we want to use the _safe accessors so as not to #GP kvm
+ * guests on older kvm hosts.
+ */
+ msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
+
+ if (cpu_has_amd_erratum(c, amd_erratum_383))
+ set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
+}
+
+#define MSR_AMD64_DE_CFG 0xC0011029
+
+static void init_amd_ln(struct cpuinfo_x86 *c)
+{
+ /*
+ * Apply erratum 665 fix unconditionally so machines without a BIOS
+ * fix work.
+ */
+ msr_set_bit(MSR_AMD64_DE_CFG, 31);
+}
+
+static bool rdrand_force;
+
+static int __init rdrand_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "force"))
+ rdrand_force = true;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("rdrand", rdrand_cmdline);
+
+static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c)
+{
+ /*
+ * Saving of the MSR used to hide the RDRAND support during
+ * suspend/resume is done by arch/x86/power/cpu.c, which is
+ * dependent on CONFIG_PM_SLEEP.
+ */
+ if (!IS_ENABLED(CONFIG_PM_SLEEP))
+ return;
+
+ /*
+ * The nordrand option can clear X86_FEATURE_RDRAND, so check for
+ * RDRAND support using the CPUID function directly.
+ */
+ if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force)
+ return;
+
+ msr_clear_bit(MSR_AMD64_CPUID_FN_1, 62);
+
+ /*
+ * Verify that the CPUID change has occurred in case the kernel is
+ * running virtualized and the hypervisor doesn't support the MSR.
+ */
+ if (cpuid_ecx(1) & BIT(30)) {
+ pr_info_once("BIOS may not properly restore RDRAND after suspend, but hypervisor does not support hiding RDRAND via CPUID.\n");
+ return;
+ }
+
+ clear_cpu_cap(c, X86_FEATURE_RDRAND);
+ pr_info_once("BIOS may not properly restore RDRAND after suspend, hiding RDRAND via CPUID. Use rdrand=force to reenable.\n");
+}
+
+static void init_amd_jg(struct cpuinfo_x86 *c)
+{
+ /*
+ * Some BIOS implementations do not restore proper RDRAND support
+ * across suspend and resume. Check on whether to hide the RDRAND
+ * instruction support via CPUID.
+ */
+ clear_rdrand_cpuid_bit(c);
+}
+
+static void init_amd_bd(struct cpuinfo_x86 *c)
+{
+ u64 value;
+
+ /*
+ * The way access filter has a performance penalty on some workloads.
+ * Disable it on the affected CPUs.
+ */
+ if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
+ if (!rdmsrl_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) {
+ value |= 0x1E;
+ wrmsrl_safe(MSR_F15H_IC_CFG, value);
+ }
+ }
+
+ /*
+ * Some BIOS implementations do not restore proper RDRAND support
+ * across suspend and resume. Check on whether to hide the RDRAND
+ * instruction support via CPUID.
+ */
+ clear_rdrand_cpuid_bit(c);
+}
+
+static void init_amd_zn(struct cpuinfo_x86 *c)
+{
+ set_cpu_cap(c, X86_FEATURE_ZEN);
+
+ /*
+ * Fix erratum 1076: CPB feature bit not being set in CPUID.
+ * Always set it, except when running under a hypervisor.
+ */
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_CPB))
+ set_cpu_cap(c, X86_FEATURE_CPB);
+}
+
+static void init_amd(struct cpuinfo_x86 *c)
+{
+ early_init_amd(c);
+
+ /*
+ * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ */
+ clear_cpu_cap(c, 0*32+31);
+
+ if (c->x86 >= 0x10)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /* get apicid instead of initial apic id from cpuid */
+ c->apicid = hard_smp_processor_id();
+
+ /* K6s reports MCEs but don't actually have all the MSRs */
+ if (c->x86 < 6)
+ clear_cpu_cap(c, X86_FEATURE_MCE);
+
+ switch (c->x86) {
+ case 4: init_amd_k5(c); break;
+ case 5: init_amd_k6(c); break;
+ case 6: init_amd_k7(c); break;
+ case 0xf: init_amd_k8(c); break;
+ case 0x10: init_amd_gh(c); break;
+ case 0x12: init_amd_ln(c); break;
+ case 0x15: init_amd_bd(c); break;
+ case 0x16: init_amd_jg(c); break;
+ case 0x17: init_amd_zn(c); break;
+ }
+
+ /*
+ * Enable workaround for FXSAVE leak on CPUs
+ * without a XSaveErPtr feature
+ */
+ if ((c->x86 >= 6) && (!cpu_has(c, X86_FEATURE_XSAVEERPTR)))
+ set_cpu_bug(c, X86_BUG_FXSAVE_LEAK);
+
+ cpu_detect_cache_sizes(c);
+
+ amd_detect_cmp(c);
+ amd_get_topology(c);
+ srat_detect_node(c);
+
+ init_amd_cacheinfo(c);
+
+ if (cpu_has(c, X86_FEATURE_XMM2)) {
+ unsigned long long val;
+ int ret;
+
+ /*
+ * A serializing LFENCE has less overhead than MFENCE, so
+ * use it for execution serialization. On families which
+ * don't have that MSR, LFENCE is already serializing.
+ * msr_set_bit() uses the safe accessors, too, even if the MSR
+ * is not present.
+ */
+ msr_set_bit(MSR_F10H_DECFG,
+ MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);
+
+ /*
+ * Verify that the MSR write was successful (could be running
+ * under a hypervisor) and only then assume that LFENCE is
+ * serializing.
+ */
+ ret = rdmsrl_safe(MSR_F10H_DECFG, &val);
+ if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) {
+ /* A serializing LFENCE stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+ } else {
+ /* MFENCE stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+ }
+ }
+
+ /*
+ * Family 0x12 and above processors have APIC timer
+ * running in deep C states.
+ */
+ if (c->x86 > 0x11)
+ set_cpu_cap(c, X86_FEATURE_ARAT);
+
+ /* 3DNow or LM implies PREFETCHW */
+ if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
+ if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
+ set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
+
+ /* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */
+ if (!cpu_has(c, X86_FEATURE_XENPV))
+ set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+ /*
+ * Turn on the Instructions Retired free counter on machines not
+ * susceptible to erratum #1054 "Instructions Retired Performance
+ * Counter May Be Inaccurate".
+ */
+ if (cpu_has(c, X86_FEATURE_IRPERF) &&
+ !cpu_has_amd_erratum(c, amd_erratum_1054))
+ msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
+
+ check_null_seg_clears_base(c);
+}
+
+#ifdef CONFIG_X86_32
+static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
+{
+ /* AMD errata T13 (order #21922) */
+ if (c->x86 == 6) {
+ /* Duron Rev A0 */
+ if (c->x86_model == 3 && c->x86_stepping == 0)
+ size = 64;
+ /* Tbird rev A1/A2 */
+ if (c->x86_model == 4 &&
+ (c->x86_stepping == 0 || c->x86_stepping == 1))
+ size = 256;
+ }
+ return size;
+}
+#endif
+
+static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
+{
+ u32 ebx, eax, ecx, edx;
+ u16 mask = 0xfff;
+
+ if (c->x86 < 0xf)
+ return;
+
+ if (c->extended_cpuid_level < 0x80000006)
+ return;
+
+ cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
+
+ tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask;
+ tlb_lli_4k[ENTRIES] = ebx & mask;
+
+ /*
+ * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB
+ * characteristics from the CPUID function 0x80000005 instead.
+ */
+ if (c->x86 == 0xf) {
+ cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+ mask = 0xff;
+ }
+
+ /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!((eax >> 16) & mask))
+ tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
+ else
+ tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
+
+ /* a 4M entry uses two 2M entries */
+ tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
+
+ /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!(eax & mask)) {
+ /* Erratum 658 */
+ if (c->x86 == 0x15 && c->x86_model <= 0x1f) {
+ tlb_lli_2m[ENTRIES] = 1024;
+ } else {
+ cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+ tlb_lli_2m[ENTRIES] = eax & 0xff;
+ }
+ } else
+ tlb_lli_2m[ENTRIES] = eax & mask;
+
+ tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+}
+
+static const struct cpu_dev amd_cpu_dev = {
+ .c_vendor = "AMD",
+ .c_ident = { "AuthenticAMD" },
+#ifdef CONFIG_X86_32
+ .legacy_models = {
+ { .family = 4, .model_names =
+ {
+ [3] = "486 DX/2",
+ [7] = "486 DX/2-WB",
+ [8] = "486 DX/4",
+ [9] = "486 DX/4-WB",
+ [14] = "Am5x86-WT",
+ [15] = "Am5x86-WB"
+ }
+ },
+ },
+ .legacy_cache_size = amd_size_cache,
+#endif
+ .c_early_init = early_init_amd,
+ .c_detect_tlb = cpu_detect_tlb_amd,
+ .c_bsp_init = bsp_init_amd,
+ .c_init = init_amd,
+ .c_x86_vendor = X86_VENDOR_AMD,
+};
+
+cpu_dev_register(amd_cpu_dev);
+
+/*
+ * AMD errata checking
+ *
+ * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
+ * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
+ * have an OSVW id assigned, which it takes as first argument. Both take a
+ * variable number of family-specific model-stepping ranges created by
+ * AMD_MODEL_RANGE().
+ *
+ * Example:
+ *
+ * const int amd_erratum_319[] =
+ * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
+ * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
+ * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
+ */
+
+#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 }
+#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
+#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
+ ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
+#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)
+#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff)
+#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff)
+
+static const int amd_erratum_400[] =
+ AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
+ AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
+
+static const int amd_erratum_383[] =
+ AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
+
+/* #1054: Instructions Retired Performance Counter May Be Inaccurate */
+static const int amd_erratum_1054[] =
+ AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf));
+
+static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
+{
+ int osvw_id = *erratum++;
+ u32 range;
+ u32 ms;
+
+ if (osvw_id >= 0 && osvw_id < 65536 &&
+ cpu_has(cpu, X86_FEATURE_OSVW)) {
+ u64 osvw_len;
+
+ rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
+ if (osvw_id < osvw_len) {
+ u64 osvw_bits;
+
+ rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
+ osvw_bits);
+ return osvw_bits & (1ULL << (osvw_id & 0x3f));
+ }
+ }
+
+ /* OSVW unavailable or ID unknown, match family-model-stepping range */
+ ms = (cpu->x86_model << 4) | cpu->x86_stepping;
+ while ((range = *erratum++))
+ if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
+ (ms >= AMD_MODEL_RANGE_START(range)) &&
+ (ms <= AMD_MODEL_RANGE_END(range)))
+ return true;
+
+ return false;
+}
+
+void set_dr_addr_mask(unsigned long mask, int dr)
+{
+ if (!boot_cpu_has(X86_FEATURE_BPEXT))
+ return;
+
+ switch (dr) {
+ case 0:
+ wrmsr(MSR_F16H_DR0_ADDR_MASK, mask, 0);
+ break;
+ case 1:
+ case 2:
+ case 3:
+ wrmsr(MSR_F16H_DR1_ADDR_MASK - 1 + dr, mask, 0);
+ break;
+ default:
+ break;
+ }
+}
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
new file mode 100644
index 000000000..7eba34df5
--- /dev/null
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -0,0 +1,127 @@
+/*
+ * x86 APERF/MPERF KHz calculation for
+ * /sys/.../cpufreq/scaling_cur_freq
+ *
+ * Copyright (C) 2017 Intel Corp.
+ * Author: Len Brown <len.brown@intel.com>
+ *
+ * This file is licensed under GPLv2.
+ */
+
+#include <linux/delay.h>
+#include <linux/ktime.h>
+#include <linux/math64.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+
+#include "cpu.h"
+
+struct aperfmperf_sample {
+ unsigned int khz;
+ ktime_t time;
+ u64 aperf;
+ u64 mperf;
+};
+
+static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
+
+#define APERFMPERF_CACHE_THRESHOLD_MS 10
+#define APERFMPERF_REFRESH_DELAY_MS 10
+#define APERFMPERF_STALE_THRESHOLD_MS 1000
+
+/*
+ * aperfmperf_snapshot_khz()
+ * On the current CPU, snapshot APERF, MPERF, and jiffies
+ * unless we already did it within 10ms
+ * calculate kHz, save snapshot
+ */
+static void aperfmperf_snapshot_khz(void *dummy)
+{
+ u64 aperf, aperf_delta;
+ u64 mperf, mperf_delta;
+ struct aperfmperf_sample *s = this_cpu_ptr(&samples);
+ unsigned long flags;
+
+ local_irq_save(flags);
+ rdmsrl(MSR_IA32_APERF, aperf);
+ rdmsrl(MSR_IA32_MPERF, mperf);
+ local_irq_restore(flags);
+
+ aperf_delta = aperf - s->aperf;
+ mperf_delta = mperf - s->mperf;
+
+ /*
+ * There is no architectural guarantee that MPERF
+ * increments faster than we can read it.
+ */
+ if (mperf_delta == 0)
+ return;
+
+ s->time = ktime_get();
+ s->aperf = aperf;
+ s->mperf = mperf;
+ s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
+}
+
+static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait)
+{
+ s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu));
+
+ /* Don't bother re-computing within the cache threshold time. */
+ if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
+ return true;
+
+ smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait);
+
+ /* Return false if the previous iteration was too long ago. */
+ return time_delta <= APERFMPERF_STALE_THRESHOLD_MS;
+}
+
+unsigned int aperfmperf_get_khz(int cpu)
+{
+ if (!cpu_khz)
+ return 0;
+
+ if (!static_cpu_has(X86_FEATURE_APERFMPERF))
+ return 0;
+
+ aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
+ return per_cpu(samples.khz, cpu);
+}
+
+void arch_freq_prepare_all(void)
+{
+ ktime_t now = ktime_get();
+ bool wait = false;
+ int cpu;
+
+ if (!cpu_khz)
+ return;
+
+ if (!static_cpu_has(X86_FEATURE_APERFMPERF))
+ return;
+
+ for_each_online_cpu(cpu)
+ if (!aperfmperf_snapshot_cpu(cpu, now, false))
+ wait = true;
+
+ if (wait)
+ msleep(APERFMPERF_REFRESH_DELAY_MS);
+}
+
+unsigned int arch_freq_get_on_cpu(int cpu)
+{
+ if (!cpu_khz)
+ return 0;
+
+ if (!static_cpu_has(X86_FEATURE_APERFMPERF))
+ return 0;
+
+ if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true))
+ return per_cpu(samples.khz, cpu);
+
+ msleep(APERFMPERF_REFRESH_DELAY_MS);
+ smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
+
+ return per_cpu(samples.khz, cpu);
+}
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
new file mode 100644
index 000000000..058e92b93
--- /dev/null
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -0,0 +1,1997 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Cyrix stuff, June 1998 by:
+ * - Rafael R. Reilova (moved everything from head.S),
+ * <rreilova@ececs.uc.edu>
+ * - Channing Corn (tests & fixes),
+ * - Andrew D. Balsa (code cleanup).
+ */
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/nospec.h>
+#include <linux/prctl.h>
+#include <linux/sched/smt.h>
+
+#include <asm/spec-ctrl.h>
+#include <asm/cmdline.h>
+#include <asm/bugs.h>
+#include <asm/processor.h>
+#include <asm/processor-flags.h>
+#include <asm/fpu/internal.h>
+#include <asm/msr.h>
+#include <asm/vmx.h>
+#include <asm/paravirt.h>
+#include <asm/alternative.h>
+#include <asm/pgtable.h>
+#include <asm/set_memory.h>
+#include <asm/intel-family.h>
+#include <asm/e820/api.h>
+#include <asm/hypervisor.h>
+#include <linux/bpf.h>
+
+#include "cpu.h"
+
+static void __init spectre_v1_select_mitigation(void);
+static void __init spectre_v2_select_mitigation(void);
+static void __init ssb_select_mitigation(void);
+static void __init l1tf_select_mitigation(void);
+static void __init mds_select_mitigation(void);
+static void __init md_clear_update_mitigation(void);
+static void __init md_clear_select_mitigation(void);
+static void __init taa_select_mitigation(void);
+static void __init mmio_select_mitigation(void);
+static void __init srbds_select_mitigation(void);
+
+/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
+u64 x86_spec_ctrl_base;
+EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
+static DEFINE_MUTEX(spec_ctrl_mutex);
+
+/*
+ * The vendor and possibly platform specific bits which can be modified in
+ * x86_spec_ctrl_base.
+ */
+static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS;
+
+/*
+ * AMD specific MSR info for Speculative Store Bypass control.
+ * x86_amd_ls_cfg_ssbd_mask is initialized in identify_boot_cpu().
+ */
+u64 __ro_after_init x86_amd_ls_cfg_base;
+u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask;
+
+/* Control conditional STIBP in switch_to() */
+DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp);
+/* Control conditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+/* Control unconditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
+
+/* Control MDS CPU buffer clear before returning to user space */
+DEFINE_STATIC_KEY_FALSE(mds_user_clear);
+EXPORT_SYMBOL_GPL(mds_user_clear);
+/* Control MDS CPU buffer clear before idling (halt, mwait) */
+DEFINE_STATIC_KEY_FALSE(mds_idle_clear);
+EXPORT_SYMBOL_GPL(mds_idle_clear);
+
+/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */
+DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear);
+EXPORT_SYMBOL_GPL(mmio_stale_data_clear);
+
+void __init check_bugs(void)
+{
+ identify_boot_cpu();
+
+ /*
+ * identify_boot_cpu() initialized SMT support information, let the
+ * core code know.
+ */
+ cpu_smt_check_topology();
+
+ if (!IS_ENABLED(CONFIG_SMP)) {
+ pr_info("CPU: ");
+ print_cpu_info(&boot_cpu_data);
+ }
+
+ /*
+ * Read the SPEC_CTRL MSR to account for reserved bits which may
+ * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
+ * init code as it is not enumerated and depends on the family.
+ */
+ if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
+ rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+
+ /* Allow STIBP in MSR_SPEC_CTRL if supported */
+ if (boot_cpu_has(X86_FEATURE_STIBP))
+ x86_spec_ctrl_mask |= SPEC_CTRL_STIBP;
+
+ /* Select the proper CPU mitigations before patching alternatives: */
+ spectre_v1_select_mitigation();
+ spectre_v2_select_mitigation();
+ ssb_select_mitigation();
+ l1tf_select_mitigation();
+ md_clear_select_mitigation();
+ srbds_select_mitigation();
+
+ arch_smt_update();
+
+#ifdef CONFIG_X86_32
+ /*
+ * Check whether we are able to run this kernel safely on SMP.
+ *
+ * - i386 is no longer supported.
+ * - In order to run on anything without a TSC, we need to be
+ * compiled for a i486.
+ */
+ if (boot_cpu_data.x86 < 4)
+ panic("Kernel requires i486+ for 'invlpg' and other features");
+
+ init_utsname()->machine[1] =
+ '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
+ alternative_instructions();
+
+ fpu__init_check_bugs();
+#else /* CONFIG_X86_64 */
+ alternative_instructions();
+
+ /*
+ * Make sure the first 2MB area is not mapped by huge pages
+ * There are typically fixed size MTRRs in there and overlapping
+ * MTRRs into large pages causes slow downs.
+ *
+ * Right now we don't do that with gbpages because there seems
+ * very little benefit for that case.
+ */
+ if (!direct_gbpages)
+ set_memory_4k((unsigned long)__va(0), 1);
+#endif
+}
+
+void
+x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
+{
+ u64 msrval, guestval, hostval = x86_spec_ctrl_base;
+ struct thread_info *ti = current_thread_info();
+
+ /* Is MSR_SPEC_CTRL implemented ? */
+ if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
+ /*
+ * Restrict guest_spec_ctrl to supported values. Clear the
+ * modifiable bits in the host base value and or the
+ * modifiable bits from the guest value.
+ */
+ guestval = hostval & ~x86_spec_ctrl_mask;
+ guestval |= guest_spec_ctrl & x86_spec_ctrl_mask;
+
+ /* SSBD controlled in MSR_SPEC_CTRL */
+ if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ static_cpu_has(X86_FEATURE_AMD_SSBD))
+ hostval |= ssbd_tif_to_spec_ctrl(ti->flags);
+
+ /* Conditional STIBP enabled? */
+ if (static_branch_unlikely(&switch_to_cond_stibp))
+ hostval |= stibp_tif_to_spec_ctrl(ti->flags);
+
+ if (hostval != guestval) {
+ msrval = setguest ? guestval : hostval;
+ wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
+ }
+ }
+
+ /*
+ * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update
+ * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported.
+ */
+ if (!static_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
+ !static_cpu_has(X86_FEATURE_VIRT_SSBD))
+ return;
+
+ /*
+ * If the host has SSBD mitigation enabled, force it in the host's
+ * virtual MSR value. If its not permanently enabled, evaluate
+ * current's TIF_SSBD thread flag.
+ */
+ if (static_cpu_has(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE))
+ hostval = SPEC_CTRL_SSBD;
+ else
+ hostval = ssbd_tif_to_spec_ctrl(ti->flags);
+
+ /* Sanitize the guest value */
+ guestval = guest_virt_spec_ctrl & SPEC_CTRL_SSBD;
+
+ if (hostval != guestval) {
+ unsigned long tif;
+
+ tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) :
+ ssbd_spec_ctrl_to_tif(hostval);
+
+ speculation_ctrl_update(tif);
+ }
+}
+EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl);
+
+static void x86_amd_ssb_disable(void)
+{
+ u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask;
+
+ if (boot_cpu_has(X86_FEATURE_VIRT_SSBD))
+ wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD);
+ else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
+ wrmsrl(MSR_AMD64_LS_CFG, msrval);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "MDS: " fmt
+
+/* Default mitigation for MDS-affected CPUs */
+static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL;
+static bool mds_nosmt __ro_after_init = false;
+
+static const char * const mds_strings[] = {
+ [MDS_MITIGATION_OFF] = "Vulnerable",
+ [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers",
+ [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+};
+
+static void __init mds_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) {
+ mds_mitigation = MDS_MITIGATION_OFF;
+ return;
+ }
+
+ if (mds_mitigation == MDS_MITIGATION_FULL) {
+ if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
+ mds_mitigation = MDS_MITIGATION_VMWERV;
+
+ static_branch_enable(&mds_user_clear);
+
+ if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) &&
+ (mds_nosmt || cpu_mitigations_auto_nosmt()))
+ cpu_smt_disable(false);
+ }
+}
+
+static int __init mds_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ mds_mitigation = MDS_MITIGATION_OFF;
+ else if (!strcmp(str, "full"))
+ mds_mitigation = MDS_MITIGATION_FULL;
+ else if (!strcmp(str, "full,nosmt")) {
+ mds_mitigation = MDS_MITIGATION_FULL;
+ mds_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("mds", mds_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "TAA: " fmt
+
+/* Default mitigation for TAA-affected CPUs */
+static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW;
+static bool taa_nosmt __ro_after_init;
+
+static const char * const taa_strings[] = {
+ [TAA_MITIGATION_OFF] = "Vulnerable",
+ [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+ [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers",
+ [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled",
+};
+
+static void __init taa_select_mitigation(void)
+{
+ u64 ia32_cap;
+
+ if (!boot_cpu_has_bug(X86_BUG_TAA)) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ return;
+ }
+
+ /* TSX previously disabled by tsx=off */
+ if (!boot_cpu_has(X86_FEATURE_RTM)) {
+ taa_mitigation = TAA_MITIGATION_TSX_DISABLED;
+ return;
+ }
+
+ if (cpu_mitigations_off()) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ return;
+ }
+
+ /*
+ * TAA mitigation via VERW is turned off if both
+ * tsx_async_abort=off and mds=off are specified.
+ */
+ if (taa_mitigation == TAA_MITIGATION_OFF &&
+ mds_mitigation == MDS_MITIGATION_OFF)
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_MD_CLEAR))
+ taa_mitigation = TAA_MITIGATION_VERW;
+ else
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+
+ /*
+ * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1.
+ * A microcode update fixes this behavior to clear CPU buffers. It also
+ * adds support for MSR_IA32_TSX_CTRL which is enumerated by the
+ * ARCH_CAP_TSX_CTRL_MSR bit.
+ *
+ * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode
+ * update is required.
+ */
+ ia32_cap = x86_read_arch_cap_msr();
+ if ( (ia32_cap & ARCH_CAP_MDS_NO) &&
+ !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR))
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+
+ /*
+ * TSX is enabled, select alternate mitigation for TAA which is
+ * the same as MDS. Enable MDS static branch to clear CPU buffers.
+ *
+ * For guests that can't determine whether the correct microcode is
+ * present on host, enable the mitigation for UCODE_NEEDED as well.
+ */
+ static_branch_enable(&mds_user_clear);
+
+ if (taa_nosmt || cpu_mitigations_auto_nosmt())
+ cpu_smt_disable(false);
+}
+
+static int __init tsx_async_abort_parse_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_TAA))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ } else if (!strcmp(str, "full")) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ } else if (!strcmp(str, "full,nosmt")) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ taa_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "MMIO Stale Data: " fmt
+
+enum mmio_mitigations {
+ MMIO_MITIGATION_OFF,
+ MMIO_MITIGATION_UCODE_NEEDED,
+ MMIO_MITIGATION_VERW,
+};
+
+/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
+static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW;
+static bool mmio_nosmt __ro_after_init = false;
+
+static const char * const mmio_strings[] = {
+ [MMIO_MITIGATION_OFF] = "Vulnerable",
+ [MMIO_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+ [MMIO_MITIGATION_VERW] = "Mitigation: Clear CPU buffers",
+};
+
+static void __init mmio_select_mitigation(void)
+{
+ u64 ia32_cap;
+
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) ||
+ cpu_mitigations_off()) {
+ mmio_mitigation = MMIO_MITIGATION_OFF;
+ return;
+ }
+
+ if (mmio_mitigation == MMIO_MITIGATION_OFF)
+ return;
+
+ ia32_cap = x86_read_arch_cap_msr();
+
+ /*
+ * Enable CPU buffer clear mitigation for host and VMM, if also affected
+ * by MDS or TAA. Otherwise, enable mitigation for VMM only.
+ */
+ if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) &&
+ boot_cpu_has(X86_FEATURE_RTM)))
+ static_branch_enable(&mds_user_clear);
+ else
+ static_branch_enable(&mmio_stale_data_clear);
+
+ /*
+ * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can
+ * be propagated to uncore buffers, clearing the Fill buffers on idle
+ * is required irrespective of SMT state.
+ */
+ if (!(ia32_cap & ARCH_CAP_FBSDP_NO))
+ static_branch_enable(&mds_idle_clear);
+
+ /*
+ * Check if the system has the right microcode.
+ *
+ * CPU Fill buffer clear mitigation is enumerated by either an explicit
+ * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS
+ * affected systems.
+ */
+ if ((ia32_cap & ARCH_CAP_FB_CLEAR) ||
+ (boot_cpu_has(X86_FEATURE_MD_CLEAR) &&
+ boot_cpu_has(X86_FEATURE_FLUSH_L1D) &&
+ !(ia32_cap & ARCH_CAP_MDS_NO)))
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ else
+ mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED;
+
+ if (mmio_nosmt || cpu_mitigations_auto_nosmt())
+ cpu_smt_disable(false);
+}
+
+static int __init mmio_stale_data_parse_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ mmio_mitigation = MMIO_MITIGATION_OFF;
+ } else if (!strcmp(str, "full")) {
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ } else if (!strcmp(str, "full,nosmt")) {
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ mmio_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("mmio_stale_data", mmio_stale_data_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "" fmt
+
+static void __init md_clear_update_mitigation(void)
+{
+ if (cpu_mitigations_off())
+ return;
+
+ if (!static_key_enabled(&mds_user_clear))
+ goto out;
+
+ /*
+ * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data
+ * mitigation, if necessary.
+ */
+ if (mds_mitigation == MDS_MITIGATION_OFF &&
+ boot_cpu_has_bug(X86_BUG_MDS)) {
+ mds_mitigation = MDS_MITIGATION_FULL;
+ mds_select_mitigation();
+ }
+ if (taa_mitigation == TAA_MITIGATION_OFF &&
+ boot_cpu_has_bug(X86_BUG_TAA)) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ taa_select_mitigation();
+ }
+ if (mmio_mitigation == MMIO_MITIGATION_OFF &&
+ boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) {
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ mmio_select_mitigation();
+ }
+out:
+ if (boot_cpu_has_bug(X86_BUG_MDS))
+ pr_info("MDS: %s\n", mds_strings[mds_mitigation]);
+ if (boot_cpu_has_bug(X86_BUG_TAA))
+ pr_info("TAA: %s\n", taa_strings[taa_mitigation]);
+ if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
+ pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]);
+}
+
+static void __init md_clear_select_mitigation(void)
+{
+ mds_select_mitigation();
+ taa_select_mitigation();
+ mmio_select_mitigation();
+
+ /*
+ * As MDS, TAA and MMIO Stale Data mitigations are inter-related, update
+ * and print their mitigation after MDS, TAA and MMIO Stale Data
+ * mitigation selection is done.
+ */
+ md_clear_update_mitigation();
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "SRBDS: " fmt
+
+enum srbds_mitigations {
+ SRBDS_MITIGATION_OFF,
+ SRBDS_MITIGATION_UCODE_NEEDED,
+ SRBDS_MITIGATION_FULL,
+ SRBDS_MITIGATION_TSX_OFF,
+ SRBDS_MITIGATION_HYPERVISOR,
+};
+
+static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL;
+
+static const char * const srbds_strings[] = {
+ [SRBDS_MITIGATION_OFF] = "Vulnerable",
+ [SRBDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [SRBDS_MITIGATION_FULL] = "Mitigation: Microcode",
+ [SRBDS_MITIGATION_TSX_OFF] = "Mitigation: TSX disabled",
+ [SRBDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status",
+};
+
+static bool srbds_off;
+
+void update_srbds_msr(void)
+{
+ u64 mcu_ctrl;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return;
+
+ if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED)
+ return;
+
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+
+ switch (srbds_mitigation) {
+ case SRBDS_MITIGATION_OFF:
+ case SRBDS_MITIGATION_TSX_OFF:
+ mcu_ctrl |= RNGDS_MITG_DIS;
+ break;
+ case SRBDS_MITIGATION_FULL:
+ mcu_ctrl &= ~RNGDS_MITG_DIS;
+ break;
+ default:
+ break;
+ }
+
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+}
+
+static void __init srbds_select_mitigation(void)
+{
+ u64 ia32_cap;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ return;
+
+ /*
+ * Check to see if this is one of the MDS_NO systems supporting TSX that
+ * are only exposed to SRBDS when TSX is enabled or when CPU is affected
+ * by Processor MMIO Stale Data vulnerability.
+ */
+ ia32_cap = x86_read_arch_cap_msr();
+ if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) &&
+ !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
+ srbds_mitigation = SRBDS_MITIGATION_TSX_OFF;
+ else if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR;
+ else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL))
+ srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED;
+ else if (cpu_mitigations_off() || srbds_off)
+ srbds_mitigation = SRBDS_MITIGATION_OFF;
+
+ update_srbds_msr();
+ pr_info("%s\n", srbds_strings[srbds_mitigation]);
+}
+
+static int __init srbds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ return 0;
+
+ srbds_off = !strcmp(str, "off");
+ return 0;
+}
+early_param("srbds", srbds_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Spectre V1 : " fmt
+
+enum spectre_v1_mitigation {
+ SPECTRE_V1_MITIGATION_NONE,
+ SPECTRE_V1_MITIGATION_AUTO,
+};
+
+static enum spectre_v1_mitigation spectre_v1_mitigation __ro_after_init =
+ SPECTRE_V1_MITIGATION_AUTO;
+
+static const char * const spectre_v1_strings[] = {
+ [SPECTRE_V1_MITIGATION_NONE] = "Vulnerable: __user pointer sanitization and usercopy barriers only; no swapgs barriers",
+ [SPECTRE_V1_MITIGATION_AUTO] = "Mitigation: usercopy/swapgs barriers and __user pointer sanitization",
+};
+
+/*
+ * Does SMAP provide full mitigation against speculative kernel access to
+ * userspace?
+ */
+static bool smap_works_speculatively(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_SMAP))
+ return false;
+
+ /*
+ * On CPUs which are vulnerable to Meltdown, SMAP does not
+ * prevent speculative access to user data in the L1 cache.
+ * Consider SMAP to be non-functional as a mitigation on these
+ * CPUs.
+ */
+ if (boot_cpu_has(X86_BUG_CPU_MELTDOWN))
+ return false;
+
+ return true;
+}
+
+static void __init spectre_v1_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) {
+ spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
+ return;
+ }
+
+ if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) {
+ /*
+ * With Spectre v1, a user can speculatively control either
+ * path of a conditional swapgs with a user-controlled GS
+ * value. The mitigation is to add lfences to both code paths.
+ *
+ * If FSGSBASE is enabled, the user can put a kernel address in
+ * GS, in which case SMAP provides no protection.
+ *
+ * [ NOTE: Don't check for X86_FEATURE_FSGSBASE until the
+ * FSGSBASE enablement patches have been merged. ]
+ *
+ * If FSGSBASE is disabled, the user can only put a user space
+ * address in GS. That makes an attack harder, but still
+ * possible if there's no SMAP protection.
+ */
+ if (!smap_works_speculatively()) {
+ /*
+ * Mitigation can be provided from SWAPGS itself or
+ * PTI as the CR3 write in the Meltdown mitigation
+ * is serializing.
+ *
+ * If neither is there, mitigate with an LFENCE to
+ * stop speculation through swapgs.
+ */
+ if (boot_cpu_has_bug(X86_BUG_SWAPGS) &&
+ !boot_cpu_has(X86_FEATURE_PTI))
+ setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_USER);
+
+ /*
+ * Enable lfences in the kernel entry (non-swapgs)
+ * paths, to prevent user entry from speculatively
+ * skipping swapgs.
+ */
+ setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_KERNEL);
+ }
+ }
+
+ pr_info("%s\n", spectre_v1_strings[spectre_v1_mitigation]);
+}
+
+static int __init nospectre_v1_cmdline(char *str)
+{
+ spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
+ return 0;
+}
+early_param("nospectre_v1", nospectre_v1_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Spectre V2 : " fmt
+
+static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
+ SPECTRE_V2_NONE;
+
+static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init =
+ SPECTRE_V2_USER_NONE;
+static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init =
+ SPECTRE_V2_USER_NONE;
+
+#ifdef CONFIG_RETPOLINE
+static bool spectre_v2_bad_module;
+
+bool retpoline_module_ok(bool has_retpoline)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline)
+ return true;
+
+ pr_err("System may be vulnerable to spectre v2\n");
+ spectre_v2_bad_module = true;
+ return false;
+}
+
+static inline const char *spectre_v2_module_string(void)
+{
+ return spectre_v2_bad_module ? " - vulnerable module loaded" : "";
+}
+#else
+static inline const char *spectre_v2_module_string(void) { return ""; }
+#endif
+
+#define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n"
+#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n"
+#define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n"
+
+#ifdef CONFIG_BPF_SYSCALL
+void unpriv_ebpf_notify(int new_state)
+{
+ if (new_state)
+ return;
+
+ /* Unprivileged eBPF is enabled */
+
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_EIBRS:
+ pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
+ break;
+ case SPECTRE_V2_EIBRS_LFENCE:
+ if (sched_smt_active())
+ pr_err(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG);
+ break;
+ default:
+ break;
+ }
+}
+#endif
+
+static inline bool match_option(const char *arg, int arglen, const char *opt)
+{
+ int len = strlen(opt);
+
+ return len == arglen && !strncmp(arg, opt, len);
+}
+
+/* The kernel command line selection for spectre v2 */
+enum spectre_v2_mitigation_cmd {
+ SPECTRE_V2_CMD_NONE,
+ SPECTRE_V2_CMD_AUTO,
+ SPECTRE_V2_CMD_FORCE,
+ SPECTRE_V2_CMD_RETPOLINE,
+ SPECTRE_V2_CMD_RETPOLINE_GENERIC,
+ SPECTRE_V2_CMD_RETPOLINE_LFENCE,
+ SPECTRE_V2_CMD_EIBRS,
+ SPECTRE_V2_CMD_EIBRS_RETPOLINE,
+ SPECTRE_V2_CMD_EIBRS_LFENCE,
+};
+
+enum spectre_v2_user_cmd {
+ SPECTRE_V2_USER_CMD_NONE,
+ SPECTRE_V2_USER_CMD_AUTO,
+ SPECTRE_V2_USER_CMD_FORCE,
+ SPECTRE_V2_USER_CMD_PRCTL,
+ SPECTRE_V2_USER_CMD_PRCTL_IBPB,
+ SPECTRE_V2_USER_CMD_SECCOMP,
+ SPECTRE_V2_USER_CMD_SECCOMP_IBPB,
+};
+
+static const char * const spectre_v2_user_strings[] = {
+ [SPECTRE_V2_USER_NONE] = "User space: Vulnerable",
+ [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection",
+ [SPECTRE_V2_USER_STRICT_PREFERRED] = "User space: Mitigation: STIBP always-on protection",
+ [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl",
+ [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl",
+};
+
+static const struct {
+ const char *option;
+ enum spectre_v2_user_cmd cmd;
+ bool secure;
+} v2_user_options[] __initconst = {
+ { "auto", SPECTRE_V2_USER_CMD_AUTO, false },
+ { "off", SPECTRE_V2_USER_CMD_NONE, false },
+ { "on", SPECTRE_V2_USER_CMD_FORCE, true },
+ { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false },
+ { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false },
+ { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false },
+ { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false },
+};
+
+static void __init spec_v2_user_print_cond(const char *reason, bool secure)
+{
+ if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure)
+ pr_info("spectre_v2_user=%s forced on command line.\n", reason);
+}
+
+static enum spectre_v2_user_cmd __init
+spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd)
+{
+ char arg[20];
+ int ret, i;
+
+ switch (v2_cmd) {
+ case SPECTRE_V2_CMD_NONE:
+ return SPECTRE_V2_USER_CMD_NONE;
+ case SPECTRE_V2_CMD_FORCE:
+ return SPECTRE_V2_USER_CMD_FORCE;
+ default:
+ break;
+ }
+
+ ret = cmdline_find_option(boot_command_line, "spectre_v2_user",
+ arg, sizeof(arg));
+ if (ret < 0)
+ return SPECTRE_V2_USER_CMD_AUTO;
+
+ for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) {
+ if (match_option(arg, ret, v2_user_options[i].option)) {
+ spec_v2_user_print_cond(v2_user_options[i].option,
+ v2_user_options[i].secure);
+ return v2_user_options[i].cmd;
+ }
+ }
+
+ pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg);
+ return SPECTRE_V2_USER_CMD_AUTO;
+}
+
+static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode)
+{
+ return (mode == SPECTRE_V2_EIBRS ||
+ mode == SPECTRE_V2_EIBRS_RETPOLINE ||
+ mode == SPECTRE_V2_EIBRS_LFENCE);
+}
+
+static void __init
+spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
+{
+ enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE;
+ bool smt_possible = IS_ENABLED(CONFIG_SMP);
+ enum spectre_v2_user_cmd cmd;
+
+ if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
+ return;
+
+ if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
+ cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+ smt_possible = false;
+
+ cmd = spectre_v2_parse_user_cmdline(v2_cmd);
+ switch (cmd) {
+ case SPECTRE_V2_USER_CMD_NONE:
+ goto set_mode;
+ case SPECTRE_V2_USER_CMD_FORCE:
+ mode = SPECTRE_V2_USER_STRICT;
+ break;
+ case SPECTRE_V2_USER_CMD_PRCTL:
+ case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
+ mode = SPECTRE_V2_USER_PRCTL;
+ break;
+ case SPECTRE_V2_USER_CMD_AUTO:
+ case SPECTRE_V2_USER_CMD_SECCOMP:
+ case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
+ if (IS_ENABLED(CONFIG_SECCOMP))
+ mode = SPECTRE_V2_USER_SECCOMP;
+ else
+ mode = SPECTRE_V2_USER_PRCTL;
+ break;
+ }
+
+ /* Initialize Indirect Branch Prediction Barrier */
+ if (boot_cpu_has(X86_FEATURE_IBPB)) {
+ setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
+
+ spectre_v2_user_ibpb = mode;
+ switch (cmd) {
+ case SPECTRE_V2_USER_CMD_FORCE:
+ case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
+ case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
+ static_branch_enable(&switch_mm_always_ibpb);
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ break;
+ case SPECTRE_V2_USER_CMD_PRCTL:
+ case SPECTRE_V2_USER_CMD_AUTO:
+ case SPECTRE_V2_USER_CMD_SECCOMP:
+ static_branch_enable(&switch_mm_cond_ibpb);
+ break;
+ default:
+ break;
+ }
+
+ pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
+ static_key_enabled(&switch_mm_always_ibpb) ?
+ "always-on" : "conditional");
+ }
+
+ /*
+ * If no STIBP, enhanced IBRS is enabled or SMT impossible, STIBP is not
+ * required.
+ */
+ if (!boot_cpu_has(X86_FEATURE_STIBP) ||
+ !smt_possible ||
+ spectre_v2_in_eibrs_mode(spectre_v2_enabled))
+ return;
+
+ /*
+ * At this point, an STIBP mode other than "off" has been set.
+ * If STIBP support is not being forced, check if STIBP always-on
+ * is preferred.
+ */
+ if (mode != SPECTRE_V2_USER_STRICT &&
+ boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
+ mode = SPECTRE_V2_USER_STRICT_PREFERRED;
+
+ spectre_v2_user_stibp = mode;
+
+set_mode:
+ pr_info("%s\n", spectre_v2_user_strings[mode]);
+}
+
+static const char * const spectre_v2_strings[] = {
+ [SPECTRE_V2_NONE] = "Vulnerable",
+ [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines",
+ [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE",
+ [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS",
+ [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE",
+ [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines",
+};
+
+static const struct {
+ const char *option;
+ enum spectre_v2_mitigation_cmd cmd;
+ bool secure;
+} mitigation_options[] __initconst = {
+ { "off", SPECTRE_V2_CMD_NONE, false },
+ { "on", SPECTRE_V2_CMD_FORCE, true },
+ { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
+ { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false },
+ { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false },
+ { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
+ { "eibrs", SPECTRE_V2_CMD_EIBRS, false },
+ { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false },
+ { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false },
+ { "auto", SPECTRE_V2_CMD_AUTO, false },
+};
+
+static void __init spec_v2_print_cond(const char *reason, bool secure)
+{
+ if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure)
+ pr_info("%s selected on command line.\n", reason);
+}
+
+static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
+{
+ enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
+ char arg[20];
+ int ret, i;
+
+ if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") ||
+ cpu_mitigations_off())
+ return SPECTRE_V2_CMD_NONE;
+
+ ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
+ if (ret < 0)
+ return SPECTRE_V2_CMD_AUTO;
+
+ for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
+ if (!match_option(arg, ret, mitigation_options[i].option))
+ continue;
+ cmd = mitigation_options[i].cmd;
+ break;
+ }
+
+ if (i >= ARRAY_SIZE(mitigation_options)) {
+ pr_err("unknown option (%s). Switching to AUTO select\n", arg);
+ return SPECTRE_V2_CMD_AUTO;
+ }
+
+ if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
+ cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE ||
+ cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC ||
+ cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
+ cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
+ !IS_ENABLED(CONFIG_RETPOLINE)) {
+ pr_err("%s selected but not compiled in. Switching to AUTO select\n",
+ mitigation_options[i].option);
+ return SPECTRE_V2_CMD_AUTO;
+ }
+
+ if ((cmd == SPECTRE_V2_CMD_EIBRS ||
+ cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
+ cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
+ pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n",
+ mitigation_options[i].option);
+ return SPECTRE_V2_CMD_AUTO;
+ }
+
+ if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE ||
+ cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) &&
+ !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
+ pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n",
+ mitigation_options[i].option);
+ return SPECTRE_V2_CMD_AUTO;
+ }
+
+ spec_v2_print_cond(mitigation_options[i].option,
+ mitigation_options[i].secure);
+ return cmd;
+}
+
+static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void)
+{
+ if (!IS_ENABLED(CONFIG_RETPOLINE)) {
+ pr_err("Kernel not compiled with retpoline; no mitigation available!");
+ return SPECTRE_V2_NONE;
+ }
+
+ return SPECTRE_V2_RETPOLINE;
+}
+
+static void __init spectre_v2_select_mitigation(void)
+{
+ enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
+ enum spectre_v2_mitigation mode = SPECTRE_V2_NONE;
+
+ /*
+ * If the CPU is not affected and the command line mode is NONE or AUTO
+ * then nothing to do.
+ */
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) &&
+ (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO))
+ return;
+
+ switch (cmd) {
+ case SPECTRE_V2_CMD_NONE:
+ return;
+
+ case SPECTRE_V2_CMD_FORCE:
+ case SPECTRE_V2_CMD_AUTO:
+ if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
+ mode = SPECTRE_V2_EIBRS;
+ break;
+ }
+
+ mode = spectre_v2_select_retpoline();
+ break;
+
+ case SPECTRE_V2_CMD_RETPOLINE_LFENCE:
+ pr_err(SPECTRE_V2_LFENCE_MSG);
+ mode = SPECTRE_V2_LFENCE;
+ break;
+
+ case SPECTRE_V2_CMD_RETPOLINE_GENERIC:
+ mode = SPECTRE_V2_RETPOLINE;
+ break;
+
+ case SPECTRE_V2_CMD_RETPOLINE:
+ mode = spectre_v2_select_retpoline();
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS:
+ mode = SPECTRE_V2_EIBRS;
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS_LFENCE:
+ mode = SPECTRE_V2_EIBRS_LFENCE;
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS_RETPOLINE:
+ mode = SPECTRE_V2_EIBRS_RETPOLINE;
+ break;
+ }
+
+ if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+ pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
+
+ if (spectre_v2_in_eibrs_mode(mode)) {
+ /* Force it so VMEXIT will restore correctly */
+ x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
+ wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ }
+
+ switch (mode) {
+ case SPECTRE_V2_NONE:
+ case SPECTRE_V2_EIBRS:
+ break;
+
+ case SPECTRE_V2_LFENCE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE);
+ /* fallthrough */
+
+ case SPECTRE_V2_RETPOLINE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
+ break;
+ }
+
+ spectre_v2_enabled = mode;
+ pr_info("%s\n", spectre_v2_strings[mode]);
+
+ /*
+ * If spectre v2 protection has been enabled, unconditionally fill
+ * RSB during a context switch; this protects against two independent
+ * issues:
+ *
+ * - RSB underflow (and switch to BTB) on Skylake+
+ * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs
+ */
+ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
+ pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
+
+ /*
+ * Retpoline means the kernel is safe because it has no indirect
+ * branches. Enhanced IBRS protects firmware too, so, enable restricted
+ * speculation around firmware calls only when Enhanced IBRS isn't
+ * supported.
+ *
+ * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because
+ * the user might select retpoline on the kernel command line and if
+ * the CPU supports Enhanced IBRS, kernel might un-intentionally not
+ * enable IBRS around firmware calls.
+ */
+ if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_eibrs_mode(mode)) {
+ setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
+ pr_info("Enabling Restricted Speculation for firmware calls\n");
+ }
+
+ /* Set up IBPB and STIBP depending on the general spectre V2 command */
+ spectre_v2_user_select_mitigation(cmd);
+}
+
+static void update_stibp_msr(void * __unused)
+{
+ wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+}
+
+/* Update x86_spec_ctrl_base in case SMT state changed. */
+static void update_stibp_strict(void)
+{
+ u64 mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP;
+
+ if (sched_smt_active())
+ mask |= SPEC_CTRL_STIBP;
+
+ if (mask == x86_spec_ctrl_base)
+ return;
+
+ pr_info("Update user space SMT mitigation: STIBP %s\n",
+ mask & SPEC_CTRL_STIBP ? "always-on" : "off");
+ x86_spec_ctrl_base = mask;
+ on_each_cpu(update_stibp_msr, NULL, 1);
+}
+
+/* Update the static key controlling the evaluation of TIF_SPEC_IB */
+static void update_indir_branch_cond(void)
+{
+ if (sched_smt_active())
+ static_branch_enable(&switch_to_cond_stibp);
+ else
+ static_branch_disable(&switch_to_cond_stibp);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) fmt
+
+/* Update the static key controlling the MDS CPU buffer clear in idle */
+static void update_mds_branch_idle(void)
+{
+ u64 ia32_cap = x86_read_arch_cap_msr();
+
+ /*
+ * Enable the idle clearing if SMT is active on CPUs which are
+ * affected only by MSBDS and not any other MDS variant.
+ *
+ * The other variants cannot be mitigated when SMT is enabled, so
+ * clearing the buffers on idle just to prevent the Store Buffer
+ * repartitioning leak would be a window dressing exercise.
+ */
+ if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY))
+ return;
+
+ if (sched_smt_active()) {
+ static_branch_enable(&mds_idle_clear);
+ } else if (mmio_mitigation == MMIO_MITIGATION_OFF ||
+ (ia32_cap & ARCH_CAP_FBSDP_NO)) {
+ static_branch_disable(&mds_idle_clear);
+ }
+}
+
+#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
+#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
+#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n"
+
+void arch_smt_update(void)
+{
+ mutex_lock(&spec_ctrl_mutex);
+
+ if (sched_smt_active() && unprivileged_ebpf_enabled() &&
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
+ pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG);
+
+ switch (spectre_v2_user_stibp) {
+ case SPECTRE_V2_USER_NONE:
+ break;
+ case SPECTRE_V2_USER_STRICT:
+ case SPECTRE_V2_USER_STRICT_PREFERRED:
+ update_stibp_strict();
+ break;
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ update_indir_branch_cond();
+ break;
+ }
+
+ switch (mds_mitigation) {
+ case MDS_MITIGATION_FULL:
+ case MDS_MITIGATION_VMWERV:
+ if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
+ pr_warn_once(MDS_MSG_SMT);
+ update_mds_branch_idle();
+ break;
+ case MDS_MITIGATION_OFF:
+ break;
+ }
+
+ switch (taa_mitigation) {
+ case TAA_MITIGATION_VERW:
+ case TAA_MITIGATION_UCODE_NEEDED:
+ if (sched_smt_active())
+ pr_warn_once(TAA_MSG_SMT);
+ break;
+ case TAA_MITIGATION_TSX_DISABLED:
+ case TAA_MITIGATION_OFF:
+ break;
+ }
+
+ switch (mmio_mitigation) {
+ case MMIO_MITIGATION_VERW:
+ case MMIO_MITIGATION_UCODE_NEEDED:
+ if (sched_smt_active())
+ pr_warn_once(MMIO_MSG_SMT);
+ break;
+ case MMIO_MITIGATION_OFF:
+ break;
+ }
+
+ mutex_unlock(&spec_ctrl_mutex);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Speculative Store Bypass: " fmt
+
+static enum ssb_mitigation ssb_mode __ro_after_init = SPEC_STORE_BYPASS_NONE;
+
+/* The kernel command line selection */
+enum ssb_mitigation_cmd {
+ SPEC_STORE_BYPASS_CMD_NONE,
+ SPEC_STORE_BYPASS_CMD_AUTO,
+ SPEC_STORE_BYPASS_CMD_ON,
+ SPEC_STORE_BYPASS_CMD_PRCTL,
+ SPEC_STORE_BYPASS_CMD_SECCOMP,
+};
+
+static const char * const ssb_strings[] = {
+ [SPEC_STORE_BYPASS_NONE] = "Vulnerable",
+ [SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled",
+ [SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl",
+ [SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp",
+};
+
+static const struct {
+ const char *option;
+ enum ssb_mitigation_cmd cmd;
+} ssb_mitigation_options[] __initconst = {
+ { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */
+ { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */
+ { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */
+ { "prctl", SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */
+ { "seccomp", SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */
+};
+
+static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
+{
+ enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO;
+ char arg[20];
+ int ret, i;
+
+ if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") ||
+ cpu_mitigations_off()) {
+ return SPEC_STORE_BYPASS_CMD_NONE;
+ } else {
+ ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable",
+ arg, sizeof(arg));
+ if (ret < 0)
+ return SPEC_STORE_BYPASS_CMD_AUTO;
+
+ for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) {
+ if (!match_option(arg, ret, ssb_mitigation_options[i].option))
+ continue;
+
+ cmd = ssb_mitigation_options[i].cmd;
+ break;
+ }
+
+ if (i >= ARRAY_SIZE(ssb_mitigation_options)) {
+ pr_err("unknown option (%s). Switching to AUTO select\n", arg);
+ return SPEC_STORE_BYPASS_CMD_AUTO;
+ }
+ }
+
+ return cmd;
+}
+
+static enum ssb_mitigation __init __ssb_select_mitigation(void)
+{
+ enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE;
+ enum ssb_mitigation_cmd cmd;
+
+ if (!boot_cpu_has(X86_FEATURE_SSBD))
+ return mode;
+
+ cmd = ssb_parse_cmdline();
+ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) &&
+ (cmd == SPEC_STORE_BYPASS_CMD_NONE ||
+ cmd == SPEC_STORE_BYPASS_CMD_AUTO))
+ return mode;
+
+ switch (cmd) {
+ case SPEC_STORE_BYPASS_CMD_AUTO:
+ case SPEC_STORE_BYPASS_CMD_SECCOMP:
+ /*
+ * Choose prctl+seccomp as the default mode if seccomp is
+ * enabled.
+ */
+ if (IS_ENABLED(CONFIG_SECCOMP))
+ mode = SPEC_STORE_BYPASS_SECCOMP;
+ else
+ mode = SPEC_STORE_BYPASS_PRCTL;
+ break;
+ case SPEC_STORE_BYPASS_CMD_ON:
+ mode = SPEC_STORE_BYPASS_DISABLE;
+ break;
+ case SPEC_STORE_BYPASS_CMD_PRCTL:
+ mode = SPEC_STORE_BYPASS_PRCTL;
+ break;
+ case SPEC_STORE_BYPASS_CMD_NONE:
+ break;
+ }
+
+ /*
+ * If SSBD is controlled by the SPEC_CTRL MSR, then set the proper
+ * bit in the mask to allow guests to use the mitigation even in the
+ * case where the host does not enable it.
+ */
+ if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+ x86_spec_ctrl_mask |= SPEC_CTRL_SSBD;
+ }
+
+ /*
+ * We have three CPU feature flags that are in play here:
+ * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible.
+ * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass
+ * - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation
+ */
+ if (mode == SPEC_STORE_BYPASS_DISABLE) {
+ setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
+ /*
+ * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may
+ * use a completely different MSR and bit dependent on family.
+ */
+ if (!static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) &&
+ !static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+ x86_amd_ssb_disable();
+ } else {
+ x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
+ wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ }
+ }
+
+ return mode;
+}
+
+static void ssb_select_mitigation(void)
+{
+ ssb_mode = __ssb_select_mitigation();
+
+ if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ pr_info("%s\n", ssb_strings[ssb_mode]);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Speculation prctl: " fmt
+
+static void task_update_spec_tif(struct task_struct *tsk)
+{
+ /* Force the update of the real TIF bits */
+ set_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE);
+
+ /*
+ * Immediately update the speculation control MSRs for the current
+ * task, but for a non-current task delay setting the CPU
+ * mitigation until it is scheduled next.
+ *
+ * This can only happen for SECCOMP mitigation. For PRCTL it's
+ * always the current task.
+ */
+ if (tsk == current)
+ speculation_ctrl_update_current();
+}
+
+static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+ if (ssb_mode != SPEC_STORE_BYPASS_PRCTL &&
+ ssb_mode != SPEC_STORE_BYPASS_SECCOMP)
+ return -ENXIO;
+
+ switch (ctrl) {
+ case PR_SPEC_ENABLE:
+ /* If speculation is force disabled, enable is not allowed */
+ if (task_spec_ssb_force_disable(task))
+ return -EPERM;
+ task_clear_spec_ssb_disable(task);
+ task_update_spec_tif(task);
+ break;
+ case PR_SPEC_DISABLE:
+ task_set_spec_ssb_disable(task);
+ task_update_spec_tif(task);
+ break;
+ case PR_SPEC_FORCE_DISABLE:
+ task_set_spec_ssb_disable(task);
+ task_set_spec_ssb_force_disable(task);
+ task_update_spec_tif(task);
+ break;
+ default:
+ return -ERANGE;
+ }
+ return 0;
+}
+
+static bool is_spec_ib_user_controlled(void)
+{
+ return spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP;
+}
+
+static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+ switch (ctrl) {
+ case PR_SPEC_ENABLE:
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
+ return 0;
+ /*
+ * With strict mode for both IBPB and STIBP, the instruction
+ * code paths avoid checking this task flag and instead,
+ * unconditionally run the instruction. However, STIBP and IBPB
+ * are independent and either can be set to conditionally
+ * enabled regardless of the mode of the other.
+ *
+ * If either is set to conditional, allow the task flag to be
+ * updated, unless it was force-disabled by a previous prctl
+ * call. Currently, this is possible on an AMD CPU which has the
+ * feature X86_FEATURE_AMD_STIBP_ALWAYS_ON. In this case, if the
+ * kernel is booted with 'spectre_v2_user=seccomp', then
+ * spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP and
+ * spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED.
+ */
+ if (!is_spec_ib_user_controlled() ||
+ task_spec_ib_force_disable(task))
+ return -EPERM;
+
+ task_clear_spec_ib_disable(task);
+ task_update_spec_tif(task);
+ break;
+ case PR_SPEC_DISABLE:
+ case PR_SPEC_FORCE_DISABLE:
+ /*
+ * Indirect branch speculation is always allowed when
+ * mitigation is force disabled.
+ */
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
+ return -EPERM;
+
+ if (!is_spec_ib_user_controlled())
+ return 0;
+
+ task_set_spec_ib_disable(task);
+ if (ctrl == PR_SPEC_FORCE_DISABLE)
+ task_set_spec_ib_force_disable(task);
+ task_update_spec_tif(task);
+ break;
+ default:
+ return -ERANGE;
+ }
+ return 0;
+}
+
+int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
+ unsigned long ctrl)
+{
+ switch (which) {
+ case PR_SPEC_STORE_BYPASS:
+ return ssb_prctl_set(task, ctrl);
+ case PR_SPEC_INDIRECT_BRANCH:
+ return ib_prctl_set(task, ctrl);
+ default:
+ return -ENODEV;
+ }
+}
+
+#ifdef CONFIG_SECCOMP
+void arch_seccomp_spec_mitigate(struct task_struct *task)
+{
+ if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP)
+ ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE);
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP)
+ ib_prctl_set(task, PR_SPEC_FORCE_DISABLE);
+}
+#endif
+
+static int ssb_prctl_get(struct task_struct *task)
+{
+ switch (ssb_mode) {
+ case SPEC_STORE_BYPASS_DISABLE:
+ return PR_SPEC_DISABLE;
+ case SPEC_STORE_BYPASS_SECCOMP:
+ case SPEC_STORE_BYPASS_PRCTL:
+ if (task_spec_ssb_force_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+ if (task_spec_ssb_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+ return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+ default:
+ if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ return PR_SPEC_ENABLE;
+ return PR_SPEC_NOT_AFFECTED;
+ }
+}
+
+static int ib_prctl_get(struct task_struct *task)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
+ return PR_SPEC_NOT_AFFECTED;
+
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
+ return PR_SPEC_ENABLE;
+ else if (is_spec_ib_user_controlled()) {
+ if (task_spec_ib_force_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+ if (task_spec_ib_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+ return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+ } else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED)
+ return PR_SPEC_DISABLE;
+ else
+ return PR_SPEC_NOT_AFFECTED;
+}
+
+int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
+{
+ switch (which) {
+ case PR_SPEC_STORE_BYPASS:
+ return ssb_prctl_get(task);
+ case PR_SPEC_INDIRECT_BRANCH:
+ return ib_prctl_get(task);
+ default:
+ return -ENODEV;
+ }
+}
+
+void x86_spec_ctrl_setup_ap(void)
+{
+ if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
+ wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+
+ if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
+ x86_amd_ssb_disable();
+}
+
+bool itlb_multihit_kvm_mitigation;
+EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "L1TF: " fmt
+
+/* Default mitigation for L1TF-affected CPUs */
+enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH;
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+EXPORT_SYMBOL_GPL(l1tf_mitigation);
+#endif
+enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation);
+
+/*
+ * These CPUs all support 44bits physical address space internally in the
+ * cache but CPUID can report a smaller number of physical address bits.
+ *
+ * The L1TF mitigation uses the top most address bit for the inversion of
+ * non present PTEs. When the installed memory reaches into the top most
+ * address bit due to memory holes, which has been observed on machines
+ * which report 36bits physical address bits and have 32G RAM installed,
+ * then the mitigation range check in l1tf_select_mitigation() triggers.
+ * This is a false positive because the mitigation is still possible due to
+ * the fact that the cache uses 44bit internally. Use the cache bits
+ * instead of the reported physical bits and adjust them on the affected
+ * machines to 44bit if the reported bits are less than 44.
+ */
+static void override_cache_bits(struct cpuinfo_x86 *c)
+{
+ if (c->x86 != 6)
+ return;
+
+ switch (c->x86_model) {
+ case INTEL_FAM6_NEHALEM:
+ case INTEL_FAM6_WESTMERE:
+ case INTEL_FAM6_SANDYBRIDGE:
+ case INTEL_FAM6_IVYBRIDGE:
+ case INTEL_FAM6_HASWELL_CORE:
+ case INTEL_FAM6_HASWELL_ULT:
+ case INTEL_FAM6_HASWELL_GT3E:
+ case INTEL_FAM6_BROADWELL_CORE:
+ case INTEL_FAM6_BROADWELL_GT3E:
+ case INTEL_FAM6_SKYLAKE_MOBILE:
+ case INTEL_FAM6_SKYLAKE_DESKTOP:
+ case INTEL_FAM6_KABYLAKE_MOBILE:
+ case INTEL_FAM6_KABYLAKE_DESKTOP:
+ if (c->x86_cache_bits < 44)
+ c->x86_cache_bits = 44;
+ break;
+ }
+}
+
+static void __init l1tf_select_mitigation(void)
+{
+ u64 half_pa;
+
+ if (!boot_cpu_has_bug(X86_BUG_L1TF))
+ return;
+
+ if (cpu_mitigations_off())
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ else if (cpu_mitigations_auto_nosmt())
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
+
+ override_cache_bits(&boot_cpu_data);
+
+ switch (l1tf_mitigation) {
+ case L1TF_MITIGATION_OFF:
+ case L1TF_MITIGATION_FLUSH_NOWARN:
+ case L1TF_MITIGATION_FLUSH:
+ break;
+ case L1TF_MITIGATION_FLUSH_NOSMT:
+ case L1TF_MITIGATION_FULL:
+ cpu_smt_disable(false);
+ break;
+ case L1TF_MITIGATION_FULL_FORCE:
+ cpu_smt_disable(true);
+ break;
+ }
+
+#if CONFIG_PGTABLE_LEVELS == 2
+ pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n");
+ return;
+#endif
+
+ half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT;
+ if (l1tf_mitigation != L1TF_MITIGATION_OFF &&
+ e820__mapped_any(half_pa, ULLONG_MAX - half_pa, E820_TYPE_RAM)) {
+ pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n");
+ pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n",
+ half_pa);
+ pr_info("However, doing so will make a part of your RAM unusable.\n");
+ pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html might help you decide.\n");
+ return;
+ }
+
+ setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV);
+}
+
+static int __init l1tf_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_L1TF))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ else if (!strcmp(str, "flush,nowarn"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOWARN;
+ else if (!strcmp(str, "flush"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH;
+ else if (!strcmp(str, "flush,nosmt"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
+ else if (!strcmp(str, "full"))
+ l1tf_mitigation = L1TF_MITIGATION_FULL;
+ else if (!strcmp(str, "full,force"))
+ l1tf_mitigation = L1TF_MITIGATION_FULL_FORCE;
+
+ return 0;
+}
+early_param("l1tf", l1tf_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) fmt
+
+#ifdef CONFIG_SYSFS
+
+#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+static const char * const l1tf_vmx_states[] = {
+ [VMENTER_L1D_FLUSH_AUTO] = "auto",
+ [VMENTER_L1D_FLUSH_NEVER] = "vulnerable",
+ [VMENTER_L1D_FLUSH_COND] = "conditional cache flushes",
+ [VMENTER_L1D_FLUSH_ALWAYS] = "cache flushes",
+ [VMENTER_L1D_FLUSH_EPT_DISABLED] = "EPT disabled",
+ [VMENTER_L1D_FLUSH_NOT_REQUIRED] = "flush not necessary"
+};
+
+static ssize_t l1tf_show_state(char *buf)
+{
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO)
+ return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
+
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED ||
+ (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER &&
+ sched_smt_active())) {
+ return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG,
+ l1tf_vmx_states[l1tf_vmx_mitigation]);
+ }
+
+ return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG,
+ l1tf_vmx_states[l1tf_vmx_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ if (itlb_multihit_kvm_mitigation)
+ return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
+ else
+ return sprintf(buf, "KVM: Vulnerable\n");
+}
+#else
+static ssize_t l1tf_show_state(char *buf)
+{
+ return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
+}
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ return sprintf(buf, "Processor vulnerable\n");
+}
+#endif
+
+static ssize_t mds_show_state(char *buf)
+{
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sprintf(buf, "%s; SMT Host state unknown\n",
+ mds_strings[mds_mitigation]);
+ }
+
+ if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) {
+ return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
+ (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" :
+ sched_smt_active() ? "mitigated" : "disabled"));
+ }
+
+ return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
+static ssize_t tsx_async_abort_show_state(char *buf)
+{
+ if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) ||
+ (taa_mitigation == TAA_MITIGATION_OFF))
+ return sprintf(buf, "%s\n", taa_strings[taa_mitigation]);
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sprintf(buf, "%s; SMT Host state unknown\n",
+ taa_strings[taa_mitigation]);
+ }
+
+ return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
+static ssize_t mmio_stale_data_show_state(char *buf)
+{
+ if (mmio_mitigation == MMIO_MITIGATION_OFF)
+ return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]);
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sysfs_emit(buf, "%s; SMT Host state unknown\n",
+ mmio_strings[mmio_mitigation]);
+ }
+
+ return sysfs_emit(buf, "%s; SMT %s\n", mmio_strings[mmio_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
+static char *stibp_state(void)
+{
+ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled))
+ return "";
+
+ switch (spectre_v2_user_stibp) {
+ case SPECTRE_V2_USER_NONE:
+ return ", STIBP: disabled";
+ case SPECTRE_V2_USER_STRICT:
+ return ", STIBP: forced";
+ case SPECTRE_V2_USER_STRICT_PREFERRED:
+ return ", STIBP: always-on";
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ if (static_key_enabled(&switch_to_cond_stibp))
+ return ", STIBP: conditional";
+ }
+ return "";
+}
+
+static char *ibpb_state(void)
+{
+ if (boot_cpu_has(X86_FEATURE_IBPB)) {
+ if (static_key_enabled(&switch_mm_always_ibpb))
+ return ", IBPB: always-on";
+ if (static_key_enabled(&switch_mm_cond_ibpb))
+ return ", IBPB: conditional";
+ return ", IBPB: disabled";
+ }
+ return "";
+}
+
+static ssize_t spectre_v2_show_state(char *buf)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
+ return sprintf(buf, "Vulnerable: LFENCE\n");
+
+ if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+ return sprintf(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
+
+ if (sched_smt_active() && unprivileged_ebpf_enabled() &&
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
+ return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n");
+
+ return sprintf(buf, "%s%s%s%s%s%s\n",
+ spectre_v2_strings[spectre_v2_enabled],
+ ibpb_state(),
+ boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+ stibp_state(),
+ boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
+ spectre_v2_module_string());
+}
+
+static ssize_t srbds_show_state(char *buf)
+{
+ return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]);
+}
+
+static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
+ char *buf, unsigned int bug)
+{
+ if (!boot_cpu_has_bug(bug))
+ return sprintf(buf, "Not affected\n");
+
+ switch (bug) {
+ case X86_BUG_CPU_MELTDOWN:
+ if (boot_cpu_has(X86_FEATURE_PTI))
+ return sprintf(buf, "Mitigation: PTI\n");
+
+ if (hypervisor_is_type(X86_HYPER_XEN_PV))
+ return sprintf(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n");
+
+ break;
+
+ case X86_BUG_SPECTRE_V1:
+ return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]);
+
+ case X86_BUG_SPECTRE_V2:
+ return spectre_v2_show_state(buf);
+
+ case X86_BUG_SPEC_STORE_BYPASS:
+ return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
+
+ case X86_BUG_L1TF:
+ if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
+ return l1tf_show_state(buf);
+ break;
+
+ case X86_BUG_MDS:
+ return mds_show_state(buf);
+
+ case X86_BUG_TAA:
+ return tsx_async_abort_show_state(buf);
+
+ case X86_BUG_ITLB_MULTIHIT:
+ return itlb_multihit_show_state(buf);
+
+ case X86_BUG_SRBDS:
+ return srbds_show_state(buf);
+
+ case X86_BUG_MMIO_STALE_DATA:
+ return mmio_stale_data_show_state(buf);
+
+ default:
+ break;
+ }
+
+ return sprintf(buf, "Vulnerable\n");
+}
+
+ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_CPU_MELTDOWN);
+}
+
+ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V1);
+}
+
+ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V2);
+}
+
+ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS);
+}
+
+ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_L1TF);
+}
+
+ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_MDS);
+}
+
+ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_TAA);
+}
+
+ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT);
+}
+
+ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS);
+}
+
+ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA);
+}
+#endif
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
new file mode 100644
index 000000000..4a393023f
--- /dev/null
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -0,0 +1,1006 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Routines to identify caches on Intel CPU.
+ *
+ * Changes:
+ * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
+ * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
+ * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
+ */
+
+#include <linux/slab.h>
+#include <linux/cacheinfo.h>
+#include <linux/cpu.h>
+#include <linux/sched.h>
+#include <linux/capability.h>
+#include <linux/sysfs.h>
+#include <linux/pci.h>
+
+#include <asm/cpufeature.h>
+#include <asm/amd_nb.h>
+#include <asm/smp.h>
+
+#include "cpu.h"
+
+#define LVL_1_INST 1
+#define LVL_1_DATA 2
+#define LVL_2 3
+#define LVL_3 4
+#define LVL_TRACE 5
+
+struct _cache_table {
+ unsigned char descriptor;
+ char cache_type;
+ short size;
+};
+
+#define MB(x) ((x) * 1024)
+
+/* All the cache descriptor types we care about (no TLB or
+ trace cache entries) */
+
+static const struct _cache_table cache_table[] =
+{
+ { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
+ { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */
+ { 0x09, LVL_1_INST, 32 }, /* 4-way set assoc, 64 byte line size */
+ { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */
+ { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */
+ { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
+ { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */
+ { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
+ { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
+ { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */
+ { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */
+ { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */
+ { 0x3a, LVL_2, 192 }, /* 6-way set assoc, sectored cache, 64 byte line size */
+ { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */
+ { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */
+ { 0x3d, LVL_2, 384 }, /* 6-way set assoc, sectored cache, 64 byte line size */
+ { 0x3e, LVL_2, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
+ { 0x3f, LVL_2, 256 }, /* 2-way set assoc, 64 byte line size */
+ { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */
+ { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */
+ { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */
+ { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */
+ { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */
+ { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */
+ { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */
+ { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */
+ { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
+ { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */
+ { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
+ { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */
+ { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */
+ { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */
+ { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
+ { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
+ { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */
+ { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */
+ { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */
+ { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */
+ { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */
+ { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */
+ { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
+ { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */
+ { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
+ { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */
+ { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
+ { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
+ { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */
+ { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */
+ { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */
+ { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */
+ { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */
+ { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */
+ { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */
+ { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */
+ { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */
+ { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
+ { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */
+ { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
+ { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */
+ { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */
+ { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
+ { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
+ { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */
+ { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */
+ { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */
+ { 0x00, 0, 0}
+};
+
+
+enum _cache_type {
+ CTYPE_NULL = 0,
+ CTYPE_DATA = 1,
+ CTYPE_INST = 2,
+ CTYPE_UNIFIED = 3
+};
+
+union _cpuid4_leaf_eax {
+ struct {
+ enum _cache_type type:5;
+ unsigned int level:3;
+ unsigned int is_self_initializing:1;
+ unsigned int is_fully_associative:1;
+ unsigned int reserved:4;
+ unsigned int num_threads_sharing:12;
+ unsigned int num_cores_on_die:6;
+ } split;
+ u32 full;
+};
+
+union _cpuid4_leaf_ebx {
+ struct {
+ unsigned int coherency_line_size:12;
+ unsigned int physical_line_partition:10;
+ unsigned int ways_of_associativity:10;
+ } split;
+ u32 full;
+};
+
+union _cpuid4_leaf_ecx {
+ struct {
+ unsigned int number_of_sets:32;
+ } split;
+ u32 full;
+};
+
+struct _cpuid4_info_regs {
+ union _cpuid4_leaf_eax eax;
+ union _cpuid4_leaf_ebx ebx;
+ union _cpuid4_leaf_ecx ecx;
+ unsigned int id;
+ unsigned long size;
+ struct amd_northbridge *nb;
+};
+
+static unsigned short num_cache_leaves;
+
+/* AMD doesn't have CPUID4. Emulate it here to report the same
+ information to the user. This makes some assumptions about the machine:
+ L2 not shared, no SMT etc. that is currently true on AMD CPUs.
+
+ In theory the TLBs could be reported as fake type (they are in "dummy").
+ Maybe later */
+union l1_cache {
+ struct {
+ unsigned line_size:8;
+ unsigned lines_per_tag:8;
+ unsigned assoc:8;
+ unsigned size_in_kb:8;
+ };
+ unsigned val;
+};
+
+union l2_cache {
+ struct {
+ unsigned line_size:8;
+ unsigned lines_per_tag:4;
+ unsigned assoc:4;
+ unsigned size_in_kb:16;
+ };
+ unsigned val;
+};
+
+union l3_cache {
+ struct {
+ unsigned line_size:8;
+ unsigned lines_per_tag:4;
+ unsigned assoc:4;
+ unsigned res:2;
+ unsigned size_encoded:14;
+ };
+ unsigned val;
+};
+
+static const unsigned short assocs[] = {
+ [1] = 1,
+ [2] = 2,
+ [4] = 4,
+ [6] = 8,
+ [8] = 16,
+ [0xa] = 32,
+ [0xb] = 48,
+ [0xc] = 64,
+ [0xd] = 96,
+ [0xe] = 128,
+ [0xf] = 0xffff /* fully associative - no way to show this currently */
+};
+
+static const unsigned char levels[] = { 1, 1, 2, 3 };
+static const unsigned char types[] = { 1, 2, 3, 3 };
+
+static const enum cache_type cache_type_map[] = {
+ [CTYPE_NULL] = CACHE_TYPE_NOCACHE,
+ [CTYPE_DATA] = CACHE_TYPE_DATA,
+ [CTYPE_INST] = CACHE_TYPE_INST,
+ [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED,
+};
+
+static void
+amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
+ union _cpuid4_leaf_ebx *ebx,
+ union _cpuid4_leaf_ecx *ecx)
+{
+ unsigned dummy;
+ unsigned line_size, lines_per_tag, assoc, size_in_kb;
+ union l1_cache l1i, l1d;
+ union l2_cache l2;
+ union l3_cache l3;
+ union l1_cache *l1 = &l1d;
+
+ eax->full = 0;
+ ebx->full = 0;
+ ecx->full = 0;
+
+ cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val);
+ cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val);
+
+ switch (leaf) {
+ case 1:
+ l1 = &l1i;
+ case 0:
+ if (!l1->val)
+ return;
+ assoc = assocs[l1->assoc];
+ line_size = l1->line_size;
+ lines_per_tag = l1->lines_per_tag;
+ size_in_kb = l1->size_in_kb;
+ break;
+ case 2:
+ if (!l2.val)
+ return;
+ assoc = assocs[l2.assoc];
+ line_size = l2.line_size;
+ lines_per_tag = l2.lines_per_tag;
+ /* cpu_data has errata corrections for K7 applied */
+ size_in_kb = __this_cpu_read(cpu_info.x86_cache_size);
+ break;
+ case 3:
+ if (!l3.val)
+ return;
+ assoc = assocs[l3.assoc];
+ line_size = l3.line_size;
+ lines_per_tag = l3.lines_per_tag;
+ size_in_kb = l3.size_encoded * 512;
+ if (boot_cpu_has(X86_FEATURE_AMD_DCM)) {
+ size_in_kb = size_in_kb >> 1;
+ assoc = assoc >> 1;
+ }
+ break;
+ default:
+ return;
+ }
+
+ eax->split.is_self_initializing = 1;
+ eax->split.type = types[leaf];
+ eax->split.level = levels[leaf];
+ eax->split.num_threads_sharing = 0;
+ eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1;
+
+
+ if (assoc == 0xffff)
+ eax->split.is_fully_associative = 1;
+ ebx->split.coherency_line_size = line_size - 1;
+ ebx->split.ways_of_associativity = assoc - 1;
+ ebx->split.physical_line_partition = lines_per_tag - 1;
+ ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
+ (ebx->split.ways_of_associativity + 1) - 1;
+}
+
+#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS)
+
+/*
+ * L3 cache descriptors
+ */
+static void amd_calc_l3_indices(struct amd_northbridge *nb)
+{
+ struct amd_l3_cache *l3 = &nb->l3_cache;
+ unsigned int sc0, sc1, sc2, sc3;
+ u32 val = 0;
+
+ pci_read_config_dword(nb->misc, 0x1C4, &val);
+
+ /* calculate subcache sizes */
+ l3->subcaches[0] = sc0 = !(val & BIT(0));
+ l3->subcaches[1] = sc1 = !(val & BIT(4));
+
+ if (boot_cpu_data.x86 == 0x15) {
+ l3->subcaches[0] = sc0 += !(val & BIT(1));
+ l3->subcaches[1] = sc1 += !(val & BIT(5));
+ }
+
+ l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
+ l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
+
+ l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
+}
+
+/*
+ * check whether a slot used for disabling an L3 index is occupied.
+ * @l3: L3 cache descriptor
+ * @slot: slot number (0..1)
+ *
+ * @returns: the disabled index if used or negative value if slot free.
+ */
+static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot)
+{
+ unsigned int reg = 0;
+
+ pci_read_config_dword(nb->misc, 0x1BC + slot * 4, &reg);
+
+ /* check whether this slot is activated already */
+ if (reg & (3UL << 30))
+ return reg & 0xfff;
+
+ return -1;
+}
+
+static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf,
+ unsigned int slot)
+{
+ int index;
+ struct amd_northbridge *nb = this_leaf->priv;
+
+ index = amd_get_l3_disable_slot(nb, slot);
+ if (index >= 0)
+ return sprintf(buf, "%d\n", index);
+
+ return sprintf(buf, "FREE\n");
+}
+
+#define SHOW_CACHE_DISABLE(slot) \
+static ssize_t \
+cache_disable_##slot##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev); \
+ return show_cache_disable(this_leaf, buf, slot); \
+}
+SHOW_CACHE_DISABLE(0)
+SHOW_CACHE_DISABLE(1)
+
+static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu,
+ unsigned slot, unsigned long idx)
+{
+ int i;
+
+ idx |= BIT(30);
+
+ /*
+ * disable index in all 4 subcaches
+ */
+ for (i = 0; i < 4; i++) {
+ u32 reg = idx | (i << 20);
+
+ if (!nb->l3_cache.subcaches[i])
+ continue;
+
+ pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg);
+
+ /*
+ * We need to WBINVD on a core on the node containing the L3
+ * cache which indices we disable therefore a simple wbinvd()
+ * is not sufficient.
+ */
+ wbinvd_on_cpu(cpu);
+
+ reg |= BIT(31);
+ pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg);
+ }
+}
+
+/*
+ * disable a L3 cache index by using a disable-slot
+ *
+ * @l3: L3 cache descriptor
+ * @cpu: A CPU on the node containing the L3 cache
+ * @slot: slot number (0..1)
+ * @index: index to disable
+ *
+ * @return: 0 on success, error status on failure
+ */
+static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu,
+ unsigned slot, unsigned long index)
+{
+ int ret = 0;
+
+ /* check if @slot is already used or the index is already disabled */
+ ret = amd_get_l3_disable_slot(nb, slot);
+ if (ret >= 0)
+ return -EEXIST;
+
+ if (index > nb->l3_cache.indices)
+ return -EINVAL;
+
+ /* check whether the other slot has disabled the same index already */
+ if (index == amd_get_l3_disable_slot(nb, !slot))
+ return -EEXIST;
+
+ amd_l3_disable_index(nb, cpu, slot, index);
+
+ return 0;
+}
+
+static ssize_t store_cache_disable(struct cacheinfo *this_leaf,
+ const char *buf, size_t count,
+ unsigned int slot)
+{
+ unsigned long val = 0;
+ int cpu, err = 0;
+ struct amd_northbridge *nb = this_leaf->priv;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ cpu = cpumask_first(&this_leaf->shared_cpu_map);
+
+ if (kstrtoul(buf, 10, &val) < 0)
+ return -EINVAL;
+
+ err = amd_set_l3_disable_slot(nb, cpu, slot, val);
+ if (err) {
+ if (err == -EEXIST)
+ pr_warn("L3 slot %d in use/index already disabled!\n",
+ slot);
+ return err;
+ }
+ return count;
+}
+
+#define STORE_CACHE_DISABLE(slot) \
+static ssize_t \
+cache_disable_##slot##_store(struct device *dev, \
+ struct device_attribute *attr, \
+ const char *buf, size_t count) \
+{ \
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev); \
+ return store_cache_disable(this_leaf, buf, count, slot); \
+}
+STORE_CACHE_DISABLE(0)
+STORE_CACHE_DISABLE(1)
+
+static ssize_t subcaches_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&this_leaf->shared_cpu_map);
+
+ return sprintf(buf, "%x\n", amd_get_subcaches(cpu));
+}
+
+static ssize_t subcaches_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&this_leaf->shared_cpu_map);
+ unsigned long val;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (kstrtoul(buf, 16, &val) < 0)
+ return -EINVAL;
+
+ if (amd_set_subcaches(cpu, val))
+ return -EINVAL;
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(cache_disable_0);
+static DEVICE_ATTR_RW(cache_disable_1);
+static DEVICE_ATTR_RW(subcaches);
+
+static umode_t
+cache_private_attrs_is_visible(struct kobject *kobj,
+ struct attribute *attr, int unused)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+ umode_t mode = attr->mode;
+
+ if (!this_leaf->priv)
+ return 0;
+
+ if ((attr == &dev_attr_subcaches.attr) &&
+ amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ return mode;
+
+ if ((attr == &dev_attr_cache_disable_0.attr ||
+ attr == &dev_attr_cache_disable_1.attr) &&
+ amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ return mode;
+
+ return 0;
+}
+
+static struct attribute_group cache_private_group = {
+ .is_visible = cache_private_attrs_is_visible,
+};
+
+static void init_amd_l3_attrs(void)
+{
+ int n = 1;
+ static struct attribute **amd_l3_attrs;
+
+ if (amd_l3_attrs) /* already initialized */
+ return;
+
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ n += 2;
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ n += 1;
+
+ amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL);
+ if (!amd_l3_attrs)
+ return;
+
+ n = 0;
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr;
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr;
+ }
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ amd_l3_attrs[n++] = &dev_attr_subcaches.attr;
+
+ cache_private_group.attrs = amd_l3_attrs;
+}
+
+const struct attribute_group *
+cache_get_priv_group(struct cacheinfo *this_leaf)
+{
+ struct amd_northbridge *nb = this_leaf->priv;
+
+ if (this_leaf->level < 3 || !nb)
+ return NULL;
+
+ if (nb && nb->l3_cache.indices)
+ init_amd_l3_attrs();
+
+ return &cache_private_group;
+}
+
+static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
+{
+ int node;
+
+ /* only for L3, and not in virtualized environments */
+ if (index < 3)
+ return;
+
+ node = amd_get_nb_id(smp_processor_id());
+ this_leaf->nb = node_to_amd_nb(node);
+ if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
+ amd_calc_l3_indices(this_leaf->nb);
+}
+#else
+#define amd_init_l3_cache(x, y)
+#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */
+
+static int
+cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf)
+{
+ union _cpuid4_leaf_eax eax;
+ union _cpuid4_leaf_ebx ebx;
+ union _cpuid4_leaf_ecx ecx;
+ unsigned edx;
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT))
+ cpuid_count(0x8000001d, index, &eax.full,
+ &ebx.full, &ecx.full, &edx);
+ else
+ amd_cpuid4(index, &eax, &ebx, &ecx);
+ amd_init_l3_cache(this_leaf, index);
+ } else {
+ cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
+ }
+
+ if (eax.split.type == CTYPE_NULL)
+ return -EIO; /* better error ? */
+
+ this_leaf->eax = eax;
+ this_leaf->ebx = ebx;
+ this_leaf->ecx = ecx;
+ this_leaf->size = (ecx.split.number_of_sets + 1) *
+ (ebx.split.coherency_line_size + 1) *
+ (ebx.split.physical_line_partition + 1) *
+ (ebx.split.ways_of_associativity + 1);
+ return 0;
+}
+
+static int find_num_cache_leaves(struct cpuinfo_x86 *c)
+{
+ unsigned int eax, ebx, ecx, edx, op;
+ union _cpuid4_leaf_eax cache_eax;
+ int i = -1;
+
+ if (c->x86_vendor == X86_VENDOR_AMD)
+ op = 0x8000001d;
+ else
+ op = 4;
+
+ do {
+ ++i;
+ /* Do cpuid(op) loop to find out num_cache_leaves */
+ cpuid_count(op, i, &eax, &ebx, &ecx, &edx);
+ cache_eax.full = eax;
+ } while (cache_eax.split.type != CTYPE_NULL);
+ return i;
+}
+
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id)
+{
+ /*
+ * We may have multiple LLCs if L3 caches exist, so check if we
+ * have an L3 cache by looking at the L3 cache CPUID leaf.
+ */
+ if (!cpuid_edx(0x80000006))
+ return;
+
+ if (c->x86 < 0x17) {
+ /* LLC is at the node level. */
+ per_cpu(cpu_llc_id, cpu) = node_id;
+ } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) {
+ /*
+ * LLC is at the core complex level.
+ * Core complex ID is ApicId[3] for these processors.
+ */
+ per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+ } else {
+ /*
+ * LLC ID is calculated from the number of threads sharing the
+ * cache.
+ * */
+ u32 eax, ebx, ecx, edx, num_sharing_cache = 0;
+ u32 llc_index = find_num_cache_leaves(c) - 1;
+
+ cpuid_count(0x8000001d, llc_index, &eax, &ebx, &ecx, &edx);
+ if (eax)
+ num_sharing_cache = ((eax >> 14) & 0xfff) + 1;
+
+ if (num_sharing_cache) {
+ int bits = get_count_order(num_sharing_cache);
+
+ per_cpu(cpu_llc_id, cpu) = c->apicid >> bits;
+ }
+ }
+}
+
+void init_amd_cacheinfo(struct cpuinfo_x86 *c)
+{
+
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ num_cache_leaves = find_num_cache_leaves(c);
+ } else if (c->extended_cpuid_level >= 0x80000006) {
+ if (cpuid_edx(0x80000006) & 0xf000)
+ num_cache_leaves = 4;
+ else
+ num_cache_leaves = 3;
+ }
+}
+
+void init_intel_cacheinfo(struct cpuinfo_x86 *c)
+{
+ /* Cache sizes */
+ unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0;
+ unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
+ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
+ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
+#ifdef CONFIG_SMP
+ unsigned int cpu = c->cpu_index;
+#endif
+
+ if (c->cpuid_level > 3) {
+ static int is_initialized;
+
+ if (is_initialized == 0) {
+ /* Init num_cache_leaves from boot CPU */
+ num_cache_leaves = find_num_cache_leaves(c);
+ is_initialized++;
+ }
+
+ /*
+ * Whenever possible use cpuid(4), deterministic cache
+ * parameters cpuid leaf to find the cache details
+ */
+ for (i = 0; i < num_cache_leaves; i++) {
+ struct _cpuid4_info_regs this_leaf = {};
+ int retval;
+
+ retval = cpuid4_cache_lookup_regs(i, &this_leaf);
+ if (retval < 0)
+ continue;
+
+ switch (this_leaf.eax.split.level) {
+ case 1:
+ if (this_leaf.eax.split.type == CTYPE_DATA)
+ new_l1d = this_leaf.size/1024;
+ else if (this_leaf.eax.split.type == CTYPE_INST)
+ new_l1i = this_leaf.size/1024;
+ break;
+ case 2:
+ new_l2 = this_leaf.size/1024;
+ num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
+ index_msb = get_count_order(num_threads_sharing);
+ l2_id = c->apicid & ~((1 << index_msb) - 1);
+ break;
+ case 3:
+ new_l3 = this_leaf.size/1024;
+ num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
+ index_msb = get_count_order(num_threads_sharing);
+ l3_id = c->apicid & ~((1 << index_msb) - 1);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ /*
+ * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for
+ * trace cache
+ */
+ if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) {
+ /* supports eax=2 call */
+ int j, n;
+ unsigned int regs[4];
+ unsigned char *dp = (unsigned char *)regs;
+ int only_trace = 0;
+
+ if (num_cache_leaves != 0 && c->x86 == 15)
+ only_trace = 1;
+
+ /* Number of times to iterate */
+ n = cpuid_eax(2) & 0xFF;
+
+ for (i = 0 ; i < n ; i++) {
+ cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
+
+ /* If bit 31 is set, this is an unknown format */
+ for (j = 0 ; j < 3 ; j++)
+ if (regs[j] & (1 << 31))
+ regs[j] = 0;
+
+ /* Byte 0 is level count, not a descriptor */
+ for (j = 1 ; j < 16 ; j++) {
+ unsigned char des = dp[j];
+ unsigned char k = 0;
+
+ /* look up this descriptor in the table */
+ while (cache_table[k].descriptor != 0) {
+ if (cache_table[k].descriptor == des) {
+ if (only_trace && cache_table[k].cache_type != LVL_TRACE)
+ break;
+ switch (cache_table[k].cache_type) {
+ case LVL_1_INST:
+ l1i += cache_table[k].size;
+ break;
+ case LVL_1_DATA:
+ l1d += cache_table[k].size;
+ break;
+ case LVL_2:
+ l2 += cache_table[k].size;
+ break;
+ case LVL_3:
+ l3 += cache_table[k].size;
+ break;
+ case LVL_TRACE:
+ trace += cache_table[k].size;
+ break;
+ }
+
+ break;
+ }
+
+ k++;
+ }
+ }
+ }
+ }
+
+ if (new_l1d)
+ l1d = new_l1d;
+
+ if (new_l1i)
+ l1i = new_l1i;
+
+ if (new_l2) {
+ l2 = new_l2;
+#ifdef CONFIG_SMP
+ per_cpu(cpu_llc_id, cpu) = l2_id;
+#endif
+ }
+
+ if (new_l3) {
+ l3 = new_l3;
+#ifdef CONFIG_SMP
+ per_cpu(cpu_llc_id, cpu) = l3_id;
+#endif
+ }
+
+#ifdef CONFIG_SMP
+ /*
+ * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in
+ * turns means that the only possibility is SMT (as indicated in
+ * cpuid1). Since cpuid2 doesn't specify shared caches, and we know
+ * that SMT shares all caches, we can unconditionally set cpu_llc_id to
+ * c->phys_proc_id.
+ */
+ if (per_cpu(cpu_llc_id, cpu) == BAD_APICID)
+ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
+#endif
+
+ c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
+
+ if (!l2)
+ cpu_detect_cache_sizes(c);
+}
+
+static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
+ struct _cpuid4_info_regs *base)
+{
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cacheinfo *this_leaf;
+ int i, sibling;
+
+ /*
+ * For L3, always use the pre-calculated cpu_llc_shared_mask
+ * to derive shared_cpu_map.
+ */
+ if (index == 3) {
+ for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list)
+ continue;
+ this_leaf = this_cpu_ci->info_list + index;
+ for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
+ if (!cpu_online(sibling))
+ continue;
+ cpumask_set_cpu(sibling,
+ &this_leaf->shared_cpu_map);
+ }
+ }
+ } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ unsigned int apicid, nshared, first, last;
+
+ nshared = base->eax.split.num_threads_sharing + 1;
+ apicid = cpu_data(cpu).apicid;
+ first = apicid - (apicid % nshared);
+ last = first + nshared - 1;
+
+ for_each_online_cpu(i) {
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list)
+ continue;
+
+ apicid = cpu_data(i).apicid;
+ if ((apicid < first) || (apicid > last))
+ continue;
+
+ this_leaf = this_cpu_ci->info_list + index;
+
+ for_each_online_cpu(sibling) {
+ apicid = cpu_data(sibling).apicid;
+ if ((apicid < first) || (apicid > last))
+ continue;
+ cpumask_set_cpu(sibling,
+ &this_leaf->shared_cpu_map);
+ }
+ }
+ } else
+ return 0;
+
+ return 1;
+}
+
+static void __cache_cpumap_setup(unsigned int cpu, int index,
+ struct _cpuid4_info_regs *base)
+{
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cacheinfo *this_leaf, *sibling_leaf;
+ unsigned long num_threads_sharing;
+ int index_msb, i;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ if (__cache_amd_cpumap_setup(cpu, index, base))
+ return;
+ }
+
+ this_leaf = this_cpu_ci->info_list + index;
+ num_threads_sharing = 1 + base->eax.split.num_threads_sharing;
+
+ cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map);
+ if (num_threads_sharing == 1)
+ return;
+
+ index_msb = get_count_order(num_threads_sharing);
+
+ for_each_online_cpu(i)
+ if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) {
+ struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i);
+
+ if (i == cpu || !sib_cpu_ci->info_list)
+ continue;/* skip if itself or no cacheinfo */
+ sibling_leaf = sib_cpu_ci->info_list + index;
+ cpumask_set_cpu(i, &this_leaf->shared_cpu_map);
+ cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map);
+ }
+}
+
+static void ci_leaf_init(struct cacheinfo *this_leaf,
+ struct _cpuid4_info_regs *base)
+{
+ this_leaf->id = base->id;
+ this_leaf->attributes = CACHE_ID;
+ this_leaf->level = base->eax.split.level;
+ this_leaf->type = cache_type_map[base->eax.split.type];
+ this_leaf->coherency_line_size =
+ base->ebx.split.coherency_line_size + 1;
+ this_leaf->ways_of_associativity =
+ base->ebx.split.ways_of_associativity + 1;
+ this_leaf->size = base->size;
+ this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1;
+ this_leaf->physical_line_partition =
+ base->ebx.split.physical_line_partition + 1;
+ this_leaf->priv = base->nb;
+}
+
+int init_cache_level(unsigned int cpu)
+{
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+
+ if (!num_cache_leaves)
+ return -ENOENT;
+ if (!this_cpu_ci)
+ return -EINVAL;
+ this_cpu_ci->num_levels = 3;
+ this_cpu_ci->num_leaves = num_cache_leaves;
+ return 0;
+}
+
+/*
+ * The max shared threads number comes from CPUID.4:EAX[25-14] with input
+ * ECX as cache index. Then right shift apicid by the number's order to get
+ * cache id for this cache node.
+ */
+static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ unsigned long num_threads_sharing;
+ int index_msb;
+
+ num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing;
+ index_msb = get_count_order(num_threads_sharing);
+ id4_regs->id = c->apicid >> index_msb;
+}
+
+int populate_cache_leaves(unsigned int cpu)
+{
+ unsigned int idx, ret;
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cacheinfo *this_leaf = this_cpu_ci->info_list;
+ struct _cpuid4_info_regs id4_regs = {};
+
+ for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) {
+ ret = cpuid4_cache_lookup_regs(idx, &id4_regs);
+ if (ret)
+ return ret;
+ get_cache_id(cpu, &id4_regs);
+ ci_leaf_init(this_leaf++, &id4_regs);
+ __cache_cpumap_setup(cpu, idx, &id4_regs);
+ }
+ this_cpu_ci->cpu_map_populated = true;
+
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
new file mode 100644
index 000000000..14433ff5b
--- /dev/null
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+
+#include <asm/cpufeature.h>
+#include <asm/e820/api.h>
+#include <asm/mtrr.h>
+#include <asm/msr.h>
+
+#include "cpu.h"
+
+#define ACE_PRESENT (1 << 6)
+#define ACE_ENABLED (1 << 7)
+#define ACE_FCR (1 << 28) /* MSR_VIA_FCR */
+
+#define RNG_PRESENT (1 << 2)
+#define RNG_ENABLED (1 << 3)
+#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */
+
+#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
+#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
+#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
+#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
+#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
+#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
+
+static void init_c3(struct cpuinfo_x86 *c)
+{
+ u32 lo, hi;
+
+ /* Test for Centaur Extended Feature Flags presence */
+ if (cpuid_eax(0xC0000000) >= 0xC0000001) {
+ u32 tmp = cpuid_edx(0xC0000001);
+
+ /* enable ACE unit, if present and disabled */
+ if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) {
+ rdmsr(MSR_VIA_FCR, lo, hi);
+ lo |= ACE_FCR; /* enable ACE unit */
+ wrmsr(MSR_VIA_FCR, lo, hi);
+ pr_info("CPU: Enabled ACE h/w crypto\n");
+ }
+
+ /* enable RNG unit, if present and disabled */
+ if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) {
+ rdmsr(MSR_VIA_RNG, lo, hi);
+ lo |= RNG_ENABLE; /* enable RNG unit */
+ wrmsr(MSR_VIA_RNG, lo, hi);
+ pr_info("CPU: Enabled h/w RNG\n");
+ }
+
+ /* store Centaur Extended Feature Flags as
+ * word 5 of the CPU capability bit array
+ */
+ c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001);
+ }
+#ifdef CONFIG_X86_32
+ /* Cyrix III family needs CX8 & PGE explicitly enabled. */
+ if (c->x86_model >= 6 && c->x86_model <= 13) {
+ rdmsr(MSR_VIA_FCR, lo, hi);
+ lo |= (1<<1 | 1<<7);
+ wrmsr(MSR_VIA_FCR, lo, hi);
+ set_cpu_cap(c, X86_FEATURE_CX8);
+ }
+
+ /* Before Nehemiah, the C3's had 3dNOW! */
+ if (c->x86_model >= 6 && c->x86_model < 9)
+ set_cpu_cap(c, X86_FEATURE_3DNOW);
+#endif
+ if (c->x86 == 0x6 && c->x86_model >= 0xf) {
+ c->x86_cache_alignment = c->x86_clflush_size * 2;
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ }
+
+ cpu_detect_cache_sizes(c);
+}
+
+enum {
+ ECX8 = 1<<1,
+ EIERRINT = 1<<2,
+ DPM = 1<<3,
+ DMCE = 1<<4,
+ DSTPCLK = 1<<5,
+ ELINEAR = 1<<6,
+ DSMC = 1<<7,
+ DTLOCK = 1<<8,
+ EDCTLB = 1<<8,
+ EMMX = 1<<9,
+ DPDC = 1<<11,
+ EBRPRED = 1<<12,
+ DIC = 1<<13,
+ DDC = 1<<14,
+ DNA = 1<<15,
+ ERETSTK = 1<<16,
+ E2MMX = 1<<19,
+ EAMD3D = 1<<20,
+};
+
+static void early_init_centaur(struct cpuinfo_x86 *c)
+{
+ switch (c->x86) {
+#ifdef CONFIG_X86_32
+ case 5:
+ /* Emulate MTRRs using Centaur's MCR. */
+ set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
+ break;
+#endif
+ case 6:
+ if (c->x86_model >= 0xf)
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ break;
+ }
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_SYSENTER32);
+#endif
+ if (c->x86_power & (1 << 8)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ }
+}
+
+static void centaur_detect_vmx_virtcap(struct cpuinfo_x86 *c)
+{
+ u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
+
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
+ msr_ctl = vmx_msr_high | vmx_msr_low;
+
+ if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
+ set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+ if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
+ set_cpu_cap(c, X86_FEATURE_VNMI);
+ if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
+ vmx_msr_low, vmx_msr_high);
+ msr_ctl2 = vmx_msr_high | vmx_msr_low;
+ if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
+ (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
+ set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+ if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
+ set_cpu_cap(c, X86_FEATURE_EPT);
+ if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
+ set_cpu_cap(c, X86_FEATURE_VPID);
+ }
+}
+
+static void init_centaur(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_32
+ char *name;
+ u32 fcr_set = 0;
+ u32 fcr_clr = 0;
+ u32 lo, hi, newlo;
+ u32 aa, bb, cc, dd;
+
+ /*
+ * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ */
+ clear_cpu_cap(c, 0*32+31);
+#endif
+ early_init_centaur(c);
+ init_intel_cacheinfo(c);
+ detect_num_cpu_cores(c);
+#ifdef CONFIG_X86_32
+ detect_ht(c);
+#endif
+
+ if (c->cpuid_level > 9) {
+ unsigned int eax = cpuid_eax(10);
+
+ /*
+ * Check for version and the number of counters
+ * Version(eax[7:0]) can't be 0;
+ * Counters(eax[15:8]) should be greater than 1;
+ */
+ if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1))
+ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+ }
+
+ switch (c->x86) {
+#ifdef CONFIG_X86_32
+ case 5:
+ switch (c->x86_model) {
+ case 4:
+ name = "C6";
+ fcr_set = ECX8|DSMC|EDCTLB|EMMX|ERETSTK;
+ fcr_clr = DPDC;
+ pr_notice("Disabling bugged TSC.\n");
+ clear_cpu_cap(c, X86_FEATURE_TSC);
+ break;
+ case 8:
+ switch (c->x86_stepping) {
+ default:
+ name = "2";
+ break;
+ case 7 ... 9:
+ name = "2A";
+ break;
+ case 10 ... 15:
+ name = "2B";
+ break;
+ }
+ fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
+ E2MMX|EAMD3D;
+ fcr_clr = DPDC;
+ break;
+ case 9:
+ name = "3";
+ fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
+ E2MMX|EAMD3D;
+ fcr_clr = DPDC;
+ break;
+ default:
+ name = "??";
+ }
+
+ rdmsr(MSR_IDT_FCR1, lo, hi);
+ newlo = (lo|fcr_set) & (~fcr_clr);
+
+ if (newlo != lo) {
+ pr_info("Centaur FCR was 0x%X now 0x%X\n",
+ lo, newlo);
+ wrmsr(MSR_IDT_FCR1, newlo, hi);
+ } else {
+ pr_info("Centaur FCR is 0x%X\n", lo);
+ }
+ /* Emulate MTRRs using Centaur's MCR. */
+ set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
+ /* Report CX8 */
+ set_cpu_cap(c, X86_FEATURE_CX8);
+ /* Set 3DNow! on Winchip 2 and above. */
+ if (c->x86_model >= 8)
+ set_cpu_cap(c, X86_FEATURE_3DNOW);
+ /* See if we can find out some more. */
+ if (cpuid_eax(0x80000000) >= 0x80000005) {
+ /* Yes, we can. */
+ cpuid(0x80000005, &aa, &bb, &cc, &dd);
+ /* Add L1 data and code cache sizes. */
+ c->x86_cache_size = (cc>>24)+(dd>>24);
+ }
+ sprintf(c->x86_model_id, "WinChip %s", name);
+ break;
+#endif
+ case 6:
+ init_c3(c);
+ break;
+ }
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+#endif
+
+ if (cpu_has(c, X86_FEATURE_VMX))
+ centaur_detect_vmx_virtcap(c);
+}
+
+#ifdef CONFIG_X86_32
+static unsigned int
+centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
+{
+ /* VIA C3 CPUs (670-68F) need further shifting. */
+ if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
+ size >>= 8;
+
+ /*
+ * There's also an erratum in Nehemiah stepping 1, which
+ * returns '65KB' instead of '64KB'
+ * - Note, it seems this may only be in engineering samples.
+ */
+ if ((c->x86 == 6) && (c->x86_model == 9) &&
+ (c->x86_stepping == 1) && (size == 65))
+ size -= 1;
+ return size;
+}
+#endif
+
+static const struct cpu_dev centaur_cpu_dev = {
+ .c_vendor = "Centaur",
+ .c_ident = { "CentaurHauls" },
+ .c_early_init = early_init_centaur,
+ .c_init = init_centaur,
+#ifdef CONFIG_X86_32
+ .legacy_cache_size = centaur_size_cache,
+#endif
+ .c_x86_vendor = X86_VENDOR_CENTAUR,
+};
+
+cpu_dev_register(centaur_cpu_dev);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
new file mode 100644
index 000000000..389d11b2c
--- /dev/null
+++ b/arch/x86/kernel/cpu/common.c
@@ -0,0 +1,2065 @@
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
+
+#include <linux/bootmem.h>
+#include <linux/linkage.h>
+#include <linux/bitops.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/percpu.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/delay.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/task.h>
+#include <linux/init.h>
+#include <linux/kprobes.h>
+#include <linux/kgdb.h>
+#include <linux/smp.h>
+#include <linux/io.h>
+#include <linux/syscore_ops.h>
+
+#include <asm/stackprotector.h>
+#include <asm/perf_event.h>
+#include <asm/mmu_context.h>
+#include <asm/archrandom.h>
+#include <asm/hypervisor.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/debugreg.h>
+#include <asm/sections.h>
+#include <asm/vsyscall.h>
+#include <linux/topology.h>
+#include <linux/cpumask.h>
+#include <asm/pgtable.h>
+#include <linux/atomic.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+#include <asm/fpu/internal.h>
+#include <asm/mtrr.h>
+#include <asm/hwcap2.h>
+#include <linux/numa.h>
+#include <asm/asm.h>
+#include <asm/bugs.h>
+#include <asm/cpu.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/pat.h>
+#include <asm/microcode.h>
+#include <asm/microcode_intel.h>
+#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/uv/uv.h>
+#endif
+
+#include "cpu.h"
+
+u32 elf_hwcap2 __read_mostly;
+
+/* all of these masks are initialized in setup_cpu_local_masks() */
+cpumask_var_t cpu_initialized_mask;
+cpumask_var_t cpu_callout_mask;
+cpumask_var_t cpu_callin_mask;
+
+/* representing cpus for which sibling maps can be computed */
+cpumask_var_t cpu_sibling_setup_mask;
+
+/* Number of siblings per CPU package */
+int smp_num_siblings = 1;
+EXPORT_SYMBOL(smp_num_siblings);
+
+/* Last level cache ID of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
+
+/* correctly size the local cpu masks */
+void __init setup_cpu_local_masks(void)
+{
+ alloc_bootmem_cpumask_var(&cpu_initialized_mask);
+ alloc_bootmem_cpumask_var(&cpu_callin_mask);
+ alloc_bootmem_cpumask_var(&cpu_callout_mask);
+ alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
+}
+
+static void default_init(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_64
+ cpu_detect_cache_sizes(c);
+#else
+ /* Not much we can do here... */
+ /* Check if at least it has cpuid */
+ if (c->cpuid_level == -1) {
+ /* No cpuid. It must be an ancient CPU */
+ if (c->x86 == 4)
+ strcpy(c->x86_model_id, "486");
+ else if (c->x86 == 3)
+ strcpy(c->x86_model_id, "386");
+ }
+#endif
+}
+
+static const struct cpu_dev default_cpu = {
+ .c_init = default_init,
+ .c_vendor = "Unknown",
+ .c_x86_vendor = X86_VENDOR_UNKNOWN,
+};
+
+static const struct cpu_dev *this_cpu = &default_cpu;
+
+DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+#ifdef CONFIG_X86_64
+ /*
+ * We need valid kernel segments for data and code in long mode too
+ * IRET will check the segment types kkeil 2000/10/28
+ * Also sysret mandates a special GDT layout
+ *
+ * TLS descriptors are currently at a different place compared to i386.
+ * Hopefully nobody expects them at a fixed place (Wine?)
+ */
+ [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff),
+#else
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff),
+ /*
+ * Segments used for calling PnP BIOS have byte granularity.
+ * They code segments and data segments have fixed 64k limits,
+ * the transfer segment sizes are set at run time.
+ */
+ /* 32-bit code */
+ [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
+ /* 16-bit code */
+ [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
+ /* 16-bit data */
+ [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff),
+ /* 16-bit data */
+ [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0),
+ /* 16-bit data */
+ [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0),
+ /*
+ * The APM segments have byte granularity and their bases
+ * are set at run time. All have 64k limits.
+ */
+ /* 32-bit code */
+ [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
+ /* 16-bit code */
+ [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
+ /* data */
+ [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff),
+
+ [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+ [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+ GDT_STACK_CANARY_INIT
+#endif
+} };
+EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+
+static int __init x86_mpx_setup(char *s)
+{
+ /* require an exact match without trailing characters */
+ if (strlen(s))
+ return 0;
+
+ /* do not emit a message if the feature is not present */
+ if (!boot_cpu_has(X86_FEATURE_MPX))
+ return 1;
+
+ setup_clear_cpu_cap(X86_FEATURE_MPX);
+ pr_info("nompx: Intel Memory Protection Extensions (MPX) disabled\n");
+ return 1;
+}
+__setup("nompx", x86_mpx_setup);
+
+#ifdef CONFIG_X86_64
+static int __init x86_nopcid_setup(char *s)
+{
+ /* nopcid doesn't accept parameters */
+ if (s)
+ return -EINVAL;
+
+ /* do not emit a message if the feature is not present */
+ if (!boot_cpu_has(X86_FEATURE_PCID))
+ return 0;
+
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+ pr_info("nopcid: PCID feature disabled\n");
+ return 0;
+}
+early_param("nopcid", x86_nopcid_setup);
+#endif
+
+static int __init x86_noinvpcid_setup(char *s)
+{
+ /* noinvpcid doesn't accept parameters */
+ if (s)
+ return -EINVAL;
+
+ /* do not emit a message if the feature is not present */
+ if (!boot_cpu_has(X86_FEATURE_INVPCID))
+ return 0;
+
+ setup_clear_cpu_cap(X86_FEATURE_INVPCID);
+ pr_info("noinvpcid: INVPCID feature disabled\n");
+ return 0;
+}
+early_param("noinvpcid", x86_noinvpcid_setup);
+
+#ifdef CONFIG_X86_32
+static int cachesize_override = -1;
+static int disable_x86_serial_nr = 1;
+
+static int __init cachesize_setup(char *str)
+{
+ get_option(&str, &cachesize_override);
+ return 1;
+}
+__setup("cachesize=", cachesize_setup);
+
+static int __init x86_sep_setup(char *s)
+{
+ setup_clear_cpu_cap(X86_FEATURE_SEP);
+ return 1;
+}
+__setup("nosep", x86_sep_setup);
+
+/* Standard macro to see if a specific flag is changeable */
+static inline int flag_is_changeable_p(u32 flag)
+{
+ u32 f1, f2;
+
+ /*
+ * Cyrix and IDT cpus allow disabling of CPUID
+ * so the code below may return different results
+ * when it is executed before and after enabling
+ * the CPUID. Add "volatile" to not allow gcc to
+ * optimize the subsequent calls to this function.
+ */
+ asm volatile ("pushfl \n\t"
+ "pushfl \n\t"
+ "popl %0 \n\t"
+ "movl %0, %1 \n\t"
+ "xorl %2, %0 \n\t"
+ "pushl %0 \n\t"
+ "popfl \n\t"
+ "pushfl \n\t"
+ "popl %0 \n\t"
+ "popfl \n\t"
+
+ : "=&r" (f1), "=&r" (f2)
+ : "ir" (flag));
+
+ return ((f1^f2) & flag) != 0;
+}
+
+/* Probe for the CPUID instruction */
+int have_cpuid_p(void)
+{
+ return flag_is_changeable_p(X86_EFLAGS_ID);
+}
+
+static void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+{
+ unsigned long lo, hi;
+
+ if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr)
+ return;
+
+ /* Disable processor serial number: */
+
+ rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+ lo |= 0x200000;
+ wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+
+ pr_notice("CPU serial number disabled.\n");
+ clear_cpu_cap(c, X86_FEATURE_PN);
+
+ /* Disabling the serial number may affect the cpuid level */
+ c->cpuid_level = cpuid_eax(0);
+}
+
+static int __init x86_serial_nr_setup(char *s)
+{
+ disable_x86_serial_nr = 0;
+ return 1;
+}
+__setup("serialnumber", x86_serial_nr_setup);
+#else
+static inline int flag_is_changeable_p(u32 flag)
+{
+ return 1;
+}
+static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+{
+}
+#endif
+
+static __init int setup_disable_smep(char *arg)
+{
+ setup_clear_cpu_cap(X86_FEATURE_SMEP);
+ /* Check for things that depend on SMEP being enabled: */
+ check_mpx_erratum(&boot_cpu_data);
+ return 1;
+}
+__setup("nosmep", setup_disable_smep);
+
+static __always_inline void setup_smep(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_SMEP))
+ cr4_set_bits(X86_CR4_SMEP);
+}
+
+static __init int setup_disable_smap(char *arg)
+{
+ setup_clear_cpu_cap(X86_FEATURE_SMAP);
+ return 1;
+}
+__setup("nosmap", setup_disable_smap);
+
+static __always_inline void setup_smap(struct cpuinfo_x86 *c)
+{
+ unsigned long eflags = native_save_fl();
+
+ /* This should have been cleared long ago */
+ BUG_ON(eflags & X86_EFLAGS_AC);
+
+ if (cpu_has(c, X86_FEATURE_SMAP)) {
+#ifdef CONFIG_X86_SMAP
+ cr4_set_bits(X86_CR4_SMAP);
+#else
+ cr4_clear_bits(X86_CR4_SMAP);
+#endif
+ }
+}
+
+static __always_inline void setup_umip(struct cpuinfo_x86 *c)
+{
+ /* Check the boot processor, plus build option for UMIP. */
+ if (!cpu_feature_enabled(X86_FEATURE_UMIP))
+ goto out;
+
+ /* Check the current processor's cpuid bits. */
+ if (!cpu_has(c, X86_FEATURE_UMIP))
+ goto out;
+
+ cr4_set_bits(X86_CR4_UMIP);
+
+ pr_info("x86/cpu: Activated the Intel User Mode Instruction Prevention (UMIP) CPU feature\n");
+
+ return;
+
+out:
+ /*
+ * Make sure UMIP is disabled in case it was enabled in a
+ * previous boot (e.g., via kexec).
+ */
+ cr4_clear_bits(X86_CR4_UMIP);
+}
+
+/*
+ * Protection Keys are not available in 32-bit mode.
+ */
+static bool pku_disabled;
+
+static __always_inline void setup_pku(struct cpuinfo_x86 *c)
+{
+ /* check the boot processor, plus compile options for PKU: */
+ if (!cpu_feature_enabled(X86_FEATURE_PKU))
+ return;
+ /* checks the actual processor's cpuid bits: */
+ if (!cpu_has(c, X86_FEATURE_PKU))
+ return;
+ if (pku_disabled)
+ return;
+
+ cr4_set_bits(X86_CR4_PKE);
+ /*
+ * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE
+ * cpuid bit to be set. We need to ensure that we
+ * update that bit in this CPU's "cpu_info".
+ */
+ set_cpu_cap(c, X86_FEATURE_OSPKE);
+}
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+static __init int setup_disable_pku(char *arg)
+{
+ /*
+ * Do not clear the X86_FEATURE_PKU bit. All of the
+ * runtime checks are against OSPKE so clearing the
+ * bit does nothing.
+ *
+ * This way, we will see "pku" in cpuinfo, but not
+ * "ospke", which is exactly what we want. It shows
+ * that the CPU has PKU, but the OS has not enabled it.
+ * This happens to be exactly how a system would look
+ * if we disabled the config option.
+ */
+ pr_info("x86: 'nopku' specified, disabling Memory Protection Keys\n");
+ pku_disabled = true;
+ return 1;
+}
+__setup("nopku", setup_disable_pku);
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Some CPU features depend on higher CPUID levels, which may not always
+ * be available due to CPUID level capping or broken virtualization
+ * software. Add those features to this table to auto-disable them.
+ */
+struct cpuid_dependent_feature {
+ u32 feature;
+ u32 level;
+};
+
+static const struct cpuid_dependent_feature
+cpuid_dependent_features[] = {
+ { X86_FEATURE_MWAIT, 0x00000005 },
+ { X86_FEATURE_DCA, 0x00000009 },
+ { X86_FEATURE_XSAVE, 0x0000000d },
+ { 0, 0 }
+};
+
+static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
+{
+ const struct cpuid_dependent_feature *df;
+
+ for (df = cpuid_dependent_features; df->feature; df++) {
+
+ if (!cpu_has(c, df->feature))
+ continue;
+ /*
+ * Note: cpuid_level is set to -1 if unavailable, but
+ * extended_extended_level is set to 0 if unavailable
+ * and the legitimate extended levels are all negative
+ * when signed; hence the weird messing around with
+ * signs here...
+ */
+ if (!((s32)df->level < 0 ?
+ (u32)df->level > (u32)c->extended_cpuid_level :
+ (s32)df->level > (s32)c->cpuid_level))
+ continue;
+
+ clear_cpu_cap(c, df->feature);
+ if (!warn)
+ continue;
+
+ pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n",
+ x86_cap_flag(df->feature), df->level);
+ }
+}
+
+/*
+ * Naming convention should be: <Name> [(<Codename>)]
+ * This table only is used unless init_<vendor>() below doesn't set it;
+ * in particular, if CPUID levels 0x80000002..4 are supported, this
+ * isn't used
+ */
+
+/* Look up CPU names by table lookup. */
+static const char *table_lookup_model(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_32
+ const struct legacy_cpu_model_info *info;
+
+ if (c->x86_model >= 16)
+ return NULL; /* Range check */
+
+ if (!this_cpu)
+ return NULL;
+
+ info = this_cpu->legacy_models;
+
+ while (info->family) {
+ if (info->family == c->x86)
+ return info->model_names[c->x86_model];
+ info++;
+ }
+#endif
+ return NULL; /* Not found */
+}
+
+__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
+__u32 cpu_caps_set[NCAPINTS + NBUGINTS];
+
+void load_percpu_segment(int cpu)
+{
+#ifdef CONFIG_X86_32
+ loadsegment(fs, __KERNEL_PERCPU);
+#else
+ __loadsegment_simple(gs, 0);
+ wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
+#endif
+ load_stack_canary_segment();
+}
+
+#ifdef CONFIG_X86_32
+/* The 32-bit entry code needs to find cpu_entry_area. */
+DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
+#endif
+
+#ifdef CONFIG_X86_64
+/*
+ * Special IST stacks which the CPU switches to when it calls
+ * an IST-marked descriptor entry. Up to 7 stacks (hardware
+ * limit), all of them are 4K, except the debug stack which
+ * is 8K.
+ */
+static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+ [DEBUG_STACK - 1] = DEBUG_STKSZ
+};
+#endif
+
+/* Load the original GDT from the per-cpu structure */
+void load_direct_gdt(int cpu)
+{
+ struct desc_ptr gdt_descr;
+
+ gdt_descr.address = (long)get_cpu_gdt_rw(cpu);
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
+}
+EXPORT_SYMBOL_GPL(load_direct_gdt);
+
+/* Load a fixmap remapping of the per-cpu GDT */
+void load_fixmap_gdt(int cpu)
+{
+ struct desc_ptr gdt_descr;
+
+ gdt_descr.address = (long)get_cpu_gdt_ro(cpu);
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
+}
+EXPORT_SYMBOL_GPL(load_fixmap_gdt);
+
+/*
+ * Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one.
+ */
+void switch_to_new_gdt(int cpu)
+{
+ /* Load the original GDT */
+ load_direct_gdt(cpu);
+ /* Reload the per-cpu base */
+ load_percpu_segment(cpu);
+}
+
+static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+
+static void get_model_name(struct cpuinfo_x86 *c)
+{
+ unsigned int *v;
+ char *p, *q, *s;
+
+ if (c->extended_cpuid_level < 0x80000004)
+ return;
+
+ v = (unsigned int *)c->x86_model_id;
+ cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
+ cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
+ cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
+ c->x86_model_id[48] = 0;
+
+ /* Trim whitespace */
+ p = q = s = &c->x86_model_id[0];
+
+ while (*p == ' ')
+ p++;
+
+ while (*p) {
+ /* Note the last non-whitespace index */
+ if (!isspace(*p))
+ s = q;
+
+ *q++ = *p++;
+ }
+
+ *(s + 1) = '\0';
+}
+
+void detect_num_cpu_cores(struct cpuinfo_x86 *c)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ c->x86_max_cores = 1;
+ if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4)
+ return;
+
+ cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
+ if (eax & 0x1f)
+ c->x86_max_cores = (eax >> 26) + 1;
+}
+
+void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
+{
+ unsigned int n, dummy, ebx, ecx, edx, l2size;
+
+ n = c->extended_cpuid_level;
+
+ if (n >= 0x80000005) {
+ cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
+ c->x86_cache_size = (ecx>>24) + (edx>>24);
+#ifdef CONFIG_X86_64
+ /* On K8 L1 TLB is inclusive, so don't count it */
+ c->x86_tlbsize = 0;
+#endif
+ }
+
+ if (n < 0x80000006) /* Some chips just has a large L1. */
+ return;
+
+ cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
+ l2size = ecx >> 16;
+
+#ifdef CONFIG_X86_64
+ c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
+#else
+ /* do processor-specific cache resizing */
+ if (this_cpu->legacy_cache_size)
+ l2size = this_cpu->legacy_cache_size(c, l2size);
+
+ /* Allow user to override all this if necessary. */
+ if (cachesize_override != -1)
+ l2size = cachesize_override;
+
+ if (l2size == 0)
+ return; /* Again, no L2 cache is possible */
+#endif
+
+ c->x86_cache_size = l2size;
+}
+
+u16 __read_mostly tlb_lli_4k[NR_INFO];
+u16 __read_mostly tlb_lli_2m[NR_INFO];
+u16 __read_mostly tlb_lli_4m[NR_INFO];
+u16 __read_mostly tlb_lld_4k[NR_INFO];
+u16 __read_mostly tlb_lld_2m[NR_INFO];
+u16 __read_mostly tlb_lld_4m[NR_INFO];
+u16 __read_mostly tlb_lld_1g[NR_INFO];
+
+static void cpu_detect_tlb(struct cpuinfo_x86 *c)
+{
+ if (this_cpu->c_detect_tlb)
+ this_cpu->c_detect_tlb(c);
+
+ pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n",
+ tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
+ tlb_lli_4m[ENTRIES]);
+
+ pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
+ tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES],
+ tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]);
+}
+
+int detect_ht_early(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ u32 eax, ebx, ecx, edx;
+
+ if (!cpu_has(c, X86_FEATURE_HT))
+ return -1;
+
+ if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
+ return -1;
+
+ if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
+ return -1;
+
+ cpuid(1, &eax, &ebx, &ecx, &edx);
+
+ smp_num_siblings = (ebx & 0xff0000) >> 16;
+ if (smp_num_siblings == 1)
+ pr_info_once("CPU0: Hyper-Threading is disabled\n");
+#endif
+ return 0;
+}
+
+void detect_ht(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ int index_msb, core_bits;
+
+ if (detect_ht_early(c) < 0)
+ return;
+
+ index_msb = get_count_order(smp_num_siblings);
+ c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
+
+ smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+
+ index_msb = get_count_order(smp_num_siblings);
+
+ core_bits = get_count_order(c->x86_max_cores);
+
+ c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
+ ((1 << core_bits) - 1);
+#endif
+}
+
+static void get_cpu_vendor(struct cpuinfo_x86 *c)
+{
+ char *v = c->x86_vendor_id;
+ int i;
+
+ for (i = 0; i < X86_VENDOR_NUM; i++) {
+ if (!cpu_devs[i])
+ break;
+
+ if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
+ (cpu_devs[i]->c_ident[1] &&
+ !strcmp(v, cpu_devs[i]->c_ident[1]))) {
+
+ this_cpu = cpu_devs[i];
+ c->x86_vendor = this_cpu->c_x86_vendor;
+ return;
+ }
+ }
+
+ pr_err_once("CPU: vendor_id '%s' unknown, using generic init.\n" \
+ "CPU: Your system may be unstable.\n", v);
+
+ c->x86_vendor = X86_VENDOR_UNKNOWN;
+ this_cpu = &default_cpu;
+}
+
+void cpu_detect(struct cpuinfo_x86 *c)
+{
+ /* Get vendor name */
+ cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
+ (unsigned int *)&c->x86_vendor_id[0],
+ (unsigned int *)&c->x86_vendor_id[8],
+ (unsigned int *)&c->x86_vendor_id[4]);
+
+ c->x86 = 4;
+ /* Intel-defined flags: level 0x00000001 */
+ if (c->cpuid_level >= 0x00000001) {
+ u32 junk, tfms, cap0, misc;
+
+ cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
+ c->x86 = x86_family(tfms);
+ c->x86_model = x86_model(tfms);
+ c->x86_stepping = x86_stepping(tfms);
+
+ if (cap0 & (1<<19)) {
+ c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
+ c->x86_cache_alignment = c->x86_clflush_size;
+ }
+ }
+}
+
+static void apply_forced_caps(struct cpuinfo_x86 *c)
+{
+ int i;
+
+ for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
+ c->x86_capability[i] &= ~cpu_caps_cleared[i];
+ c->x86_capability[i] |= cpu_caps_set[i];
+ }
+}
+
+static void init_speculation_control(struct cpuinfo_x86 *c)
+{
+ /*
+ * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support,
+ * and they also have a different bit for STIBP support. Also,
+ * a hypervisor might have set the individual AMD bits even on
+ * Intel CPUs, for finer-grained selection of what's available.
+ */
+ if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
+ set_cpu_cap(c, X86_FEATURE_IBRS);
+ set_cpu_cap(c, X86_FEATURE_IBPB);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ }
+
+ if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
+ set_cpu_cap(c, X86_FEATURE_STIBP);
+
+ if (cpu_has(c, X86_FEATURE_SPEC_CTRL_SSBD) ||
+ cpu_has(c, X86_FEATURE_VIRT_SSBD))
+ set_cpu_cap(c, X86_FEATURE_SSBD);
+
+ if (cpu_has(c, X86_FEATURE_AMD_IBRS)) {
+ set_cpu_cap(c, X86_FEATURE_IBRS);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ }
+
+ if (cpu_has(c, X86_FEATURE_AMD_IBPB))
+ set_cpu_cap(c, X86_FEATURE_IBPB);
+
+ if (cpu_has(c, X86_FEATURE_AMD_STIBP)) {
+ set_cpu_cap(c, X86_FEATURE_STIBP);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ }
+
+ if (cpu_has(c, X86_FEATURE_AMD_SSBD)) {
+ set_cpu_cap(c, X86_FEATURE_SSBD);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ clear_cpu_cap(c, X86_FEATURE_VIRT_SSBD);
+ }
+}
+
+static void init_cqm(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has(c, X86_FEATURE_CQM_LLC)) {
+ c->x86_cache_max_rmid = -1;
+ c->x86_cache_occ_scale = -1;
+ return;
+ }
+
+ /* will be overridden if occupancy monitoring exists */
+ c->x86_cache_max_rmid = cpuid_ebx(0xf);
+
+ if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) ||
+ cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) ||
+ cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) {
+ u32 eax, ebx, ecx, edx;
+
+ /* QoS sub-leaf, EAX=0Fh, ECX=1 */
+ cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx);
+
+ c->x86_cache_max_rmid = ecx;
+ c->x86_cache_occ_scale = ebx;
+ }
+}
+
+void get_cpu_cap(struct cpuinfo_x86 *c)
+{
+ u32 eax, ebx, ecx, edx;
+
+ /* Intel-defined flags: level 0x00000001 */
+ if (c->cpuid_level >= 0x00000001) {
+ cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
+
+ c->x86_capability[CPUID_1_ECX] = ecx;
+ c->x86_capability[CPUID_1_EDX] = edx;
+ }
+
+ /* Thermal and Power Management Leaf: level 0x00000006 (eax) */
+ if (c->cpuid_level >= 0x00000006)
+ c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006);
+
+ /* Additional Intel-defined flags: level 0x00000007 */
+ if (c->cpuid_level >= 0x00000007) {
+ cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[CPUID_7_0_EBX] = ebx;
+ c->x86_capability[CPUID_7_ECX] = ecx;
+ c->x86_capability[CPUID_7_EDX] = edx;
+ }
+
+ /* Extended state features: level 0x0000000d */
+ if (c->cpuid_level >= 0x0000000d) {
+ cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx);
+
+ c->x86_capability[CPUID_D_1_EAX] = eax;
+ }
+
+ /* AMD-defined flags: level 0x80000001 */
+ eax = cpuid_eax(0x80000000);
+ c->extended_cpuid_level = eax;
+
+ if ((eax & 0xffff0000) == 0x80000000) {
+ if (eax >= 0x80000001) {
+ cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+
+ c->x86_capability[CPUID_8000_0001_ECX] = ecx;
+ c->x86_capability[CPUID_8000_0001_EDX] = edx;
+ }
+ }
+
+ if (c->extended_cpuid_level >= 0x80000007) {
+ cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
+
+ c->x86_capability[CPUID_8000_0007_EBX] = ebx;
+ c->x86_power = edx;
+ }
+
+ if (c->extended_cpuid_level >= 0x80000008) {
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[CPUID_8000_0008_EBX] = ebx;
+ }
+
+ if (c->extended_cpuid_level >= 0x8000000a)
+ c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
+
+ init_scattered_cpuid_features(c);
+ init_speculation_control(c);
+ init_cqm(c);
+
+ /*
+ * Clear/Set all flags overridden by options, after probe.
+ * This needs to happen each time we re-probe, which may happen
+ * several times during CPU initialization.
+ */
+ apply_forced_caps(c);
+}
+
+void get_cpu_address_sizes(struct cpuinfo_x86 *c)
+{
+ u32 eax, ebx, ecx, edx;
+
+ if (c->extended_cpuid_level >= 0x80000008) {
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
+ }
+#ifdef CONFIG_X86_32
+ else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
+ c->x86_phys_bits = 36;
+#endif
+ c->x86_cache_bits = c->x86_phys_bits;
+}
+
+static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_32
+ int i;
+
+ /*
+ * First of all, decide if this is a 486 or higher
+ * It's a 486 if we can modify the AC flag
+ */
+ if (flag_is_changeable_p(X86_EFLAGS_AC))
+ c->x86 = 4;
+ else
+ c->x86 = 3;
+
+ for (i = 0; i < X86_VENDOR_NUM; i++)
+ if (cpu_devs[i] && cpu_devs[i]->c_identify) {
+ c->x86_vendor_id[0] = 0;
+ cpu_devs[i]->c_identify(c);
+ if (c->x86_vendor_id[0]) {
+ get_cpu_vendor(c);
+ break;
+ }
+ }
+#endif
+}
+
+#define NO_SPECULATION BIT(0)
+#define NO_MELTDOWN BIT(1)
+#define NO_SSB BIT(2)
+#define NO_L1TF BIT(3)
+#define NO_MDS BIT(4)
+#define MSBDS_ONLY BIT(5)
+#define NO_SWAPGS BIT(6)
+#define NO_ITLB_MULTIHIT BIT(7)
+
+#define VULNWL(_vendor, _family, _model, _whitelist) \
+ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
+
+#define VULNWL_INTEL(model, whitelist) \
+ VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist)
+
+#define VULNWL_AMD(family, whitelist) \
+ VULNWL(AMD, family, X86_MODEL_ANY, whitelist)
+
+static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
+ VULNWL(ANY, 4, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION),
+
+ /* Intel Family 6 */
+ VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+
+ VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+
+ VULNWL_INTEL(CORE_YONAH, NO_SSB),
+
+ VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+
+ VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+
+ /*
+ * Technically, swapgs isn't serializing on AMD (despite it previously
+ * being documented as such in the APM). But according to AMD, %gs is
+ * updated non-speculatively, and the issuing of %gs-relative memory
+ * operands will be blocked until the %gs update completes, which is
+ * good enough for our purposes.
+ */
+
+ VULNWL_INTEL(ATOM_TREMONT_X, NO_ITLB_MULTIHIT),
+
+ /* AMD Family 0xf - 0x12 */
+ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+
+ /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
+ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ {}
+};
+
+#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \
+ X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \
+ INTEL_FAM6_##model, steppings, \
+ X86_FEATURE_ANY, issues)
+
+#define SRBDS BIT(0)
+/* CPU is affected by X86_BUG_MMIO_STALE_DATA */
+#define MMIO BIT(1)
+/* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */
+#define MMIO_SBDS BIT(2)
+
+static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
+ VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(HASWELL_CORE, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(HASWELL_ULT, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(HASWELL_GT3E, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(HASWELL_X, BIT(2) | BIT(4), MMIO),
+ VULNBL_INTEL_STEPPINGS(BROADWELL_XEON_D,X86_STEPPINGS(0x3, 0x5), MMIO),
+ VULNBL_INTEL_STEPPINGS(BROADWELL_GT3E, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO),
+ VULNBL_INTEL_STEPPINGS(BROADWELL_CORE, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_MOBILE, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_MOBILE, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_X, BIT(3) | BIT(4) | BIT(6) |
+ BIT(7) | BIT(0xB), MMIO),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_DESKTOP, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_DESKTOP, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_MOBILE, X86_STEPPINGS(0x9, 0xC), SRBDS | MMIO),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_MOBILE, X86_STEPPINGS(0x0, 0x8), SRBDS),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_DESKTOP,X86_STEPPINGS(0x9, 0xD), SRBDS | MMIO),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_DESKTOP,X86_STEPPINGS(0x0, 0x8), SRBDS),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_MOBILE, X86_STEPPINGS(0x5, 0x5), MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_XEON_D, X86_STEPPINGS(0x1, 0x1), MMIO),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x4, 0x6), MMIO),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE, BIT(2) | BIT(3) | BIT(5), MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO),
+ VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPINGS(0x1, 0x1), MMIO),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_X, X86_STEPPING_ANY, MMIO),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPINGS(0x0, 0x0), MMIO | MMIO_SBDS),
+ {}
+};
+
+static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long which)
+{
+ const struct x86_cpu_id *m = x86_match_cpu(table);
+
+ return m && !!(m->driver_data & which);
+}
+
+u64 x86_read_arch_cap_msr(void)
+{
+ u64 ia32_cap = 0;
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+
+ return ia32_cap;
+}
+
+static bool arch_cap_mmio_immune(u64 ia32_cap)
+{
+ return (ia32_cap & ARCH_CAP_FBSDP_NO &&
+ ia32_cap & ARCH_CAP_PSDP_NO &&
+ ia32_cap & ARCH_CAP_SBDR_SSDP_NO);
+}
+
+static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+{
+ u64 ia32_cap = x86_read_arch_cap_msr();
+
+ /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
+ if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) &&
+ !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
+ setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
+
+ if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION))
+ return;
+
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
+
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) &&
+ !(ia32_cap & ARCH_CAP_SSB_NO) &&
+ !cpu_has(c, X86_FEATURE_AMD_SSB_NO))
+ setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
+
+ if (ia32_cap & ARCH_CAP_IBRS_ALL)
+ setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
+
+ if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) &&
+ !(ia32_cap & ARCH_CAP_MDS_NO)) {
+ setup_force_cpu_bug(X86_BUG_MDS);
+ if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY))
+ setup_force_cpu_bug(X86_BUG_MSBDS_ONLY);
+ }
+
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SWAPGS))
+ setup_force_cpu_bug(X86_BUG_SWAPGS);
+
+ /*
+ * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when:
+ * - TSX is supported or
+ * - TSX_CTRL is present
+ *
+ * TSX_CTRL check is needed for cases when TSX could be disabled before
+ * the kernel boot e.g. kexec.
+ * TSX_CTRL check alone is not sufficient for cases when the microcode
+ * update is not present or running as guest that don't get TSX_CTRL.
+ */
+ if (!(ia32_cap & ARCH_CAP_TAA_NO) &&
+ (cpu_has(c, X86_FEATURE_RTM) ||
+ (ia32_cap & ARCH_CAP_TSX_CTRL_MSR)))
+ setup_force_cpu_bug(X86_BUG_TAA);
+
+ /*
+ * SRBDS affects CPUs which support RDRAND or RDSEED and are listed
+ * in the vulnerability blacklist.
+ *
+ * Some of the implications and mitigation of Shared Buffers Data
+ * Sampling (SBDS) are similar to SRBDS. Give SBDS same treatment as
+ * SRBDS.
+ */
+ if ((cpu_has(c, X86_FEATURE_RDRAND) ||
+ cpu_has(c, X86_FEATURE_RDSEED)) &&
+ cpu_matches(cpu_vuln_blacklist, SRBDS | MMIO_SBDS))
+ setup_force_cpu_bug(X86_BUG_SRBDS);
+
+ /*
+ * Processor MMIO Stale Data bug enumeration
+ *
+ * Affected CPU list is generally enough to enumerate the vulnerability,
+ * but for virtualization case check for ARCH_CAP MSR bits also, VMM may
+ * not want the guest to enumerate the bug.
+ */
+ if (cpu_matches(cpu_vuln_blacklist, MMIO) &&
+ !arch_cap_mmio_immune(ia32_cap))
+ setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA);
+
+ if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
+ return;
+
+ /* Rogue Data Cache Load? No! */
+ if (ia32_cap & ARCH_CAP_RDCL_NO)
+ return;
+
+ setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
+
+ if (cpu_matches(cpu_vuln_whitelist, NO_L1TF))
+ return;
+
+ setup_force_cpu_bug(X86_BUG_L1TF);
+}
+
+/*
+ * The NOPL instruction is supposed to exist on all CPUs of family >= 6;
+ * unfortunately, that's not true in practice because of early VIA
+ * chips and (more importantly) broken virtualizers that are not easy
+ * to detect. In the latter case it doesn't even *fail* reliably, so
+ * probing for it doesn't even work. Disable it completely on 32-bit
+ * unless we can find a reliable way to detect all the broken cases.
+ * Enable it explicitly on 64-bit for non-constant inputs of cpu_has().
+ */
+static void detect_nopl(void)
+{
+#ifdef CONFIG_X86_32
+ setup_clear_cpu_cap(X86_FEATURE_NOPL);
+#else
+ setup_force_cpu_cap(X86_FEATURE_NOPL);
+#endif
+}
+
+/*
+ * Do minimum CPU detection early.
+ * Fields really needed: vendor, cpuid_level, family, model, mask,
+ * cache alignment.
+ * The others are not touched to avoid unwanted side effects.
+ *
+ * WARNING: this function is only called on the boot CPU. Don't add code
+ * here that is supposed to run on all CPUs.
+ */
+static void __init early_identify_cpu(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_64
+ c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
+#else
+ c->x86_clflush_size = 32;
+ c->x86_phys_bits = 32;
+ c->x86_virt_bits = 32;
+#endif
+ c->x86_cache_alignment = c->x86_clflush_size;
+
+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
+ c->extended_cpuid_level = 0;
+
+ if (!have_cpuid_p())
+ identify_cpu_without_cpuid(c);
+
+ /* cyrix could have cpuid enabled via c_identify()*/
+ if (have_cpuid_p()) {
+ cpu_detect(c);
+ get_cpu_vendor(c);
+ get_cpu_cap(c);
+ get_cpu_address_sizes(c);
+ setup_force_cpu_cap(X86_FEATURE_CPUID);
+
+ if (this_cpu->c_early_init)
+ this_cpu->c_early_init(c);
+
+ c->cpu_index = 0;
+ filter_cpuid_features(c, false);
+
+ if (this_cpu->c_bsp_init)
+ this_cpu->c_bsp_init(c);
+ } else {
+ setup_clear_cpu_cap(X86_FEATURE_CPUID);
+ }
+
+ setup_force_cpu_cap(X86_FEATURE_ALWAYS);
+
+ cpu_set_bug_bits(c);
+
+ fpu__init_system(c);
+
+#ifdef CONFIG_X86_32
+ /*
+ * Regardless of whether PCID is enumerated, the SDM says
+ * that it can't be enabled in 32-bit mode.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+#endif
+
+ /*
+ * Later in the boot process pgtable_l5_enabled() relies on
+ * cpu_feature_enabled(X86_FEATURE_LA57). If 5-level paging is not
+ * enabled by this point we need to clear the feature bit to avoid
+ * false-positives at the later stage.
+ *
+ * pgtable_l5_enabled() can be false here for several reasons:
+ * - 5-level paging is disabled compile-time;
+ * - it's 32-bit kernel;
+ * - machine doesn't support 5-level paging;
+ * - user specified 'no5lvl' in kernel command line.
+ */
+ if (!pgtable_l5_enabled())
+ setup_clear_cpu_cap(X86_FEATURE_LA57);
+
+ detect_nopl();
+}
+
+void __init early_cpu_init(void)
+{
+ const struct cpu_dev *const *cdev;
+ int count = 0;
+
+#ifdef CONFIG_PROCESSOR_SELECT
+ pr_info("KERNEL supported cpus:\n");
+#endif
+
+ for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
+ const struct cpu_dev *cpudev = *cdev;
+
+ if (count >= X86_VENDOR_NUM)
+ break;
+ cpu_devs[count] = cpudev;
+ count++;
+
+#ifdef CONFIG_PROCESSOR_SELECT
+ {
+ unsigned int j;
+
+ for (j = 0; j < 2; j++) {
+ if (!cpudev->c_ident[j])
+ continue;
+ pr_info(" %s %s\n", cpudev->c_vendor,
+ cpudev->c_ident[j]);
+ }
+ }
+#endif
+ }
+ early_identify_cpu(&boot_cpu_data);
+}
+
+static bool detect_null_seg_behavior(void)
+{
+ /*
+ * Empirically, writing zero to a segment selector on AMD does
+ * not clear the base, whereas writing zero to a segment
+ * selector on Intel does clear the base. Intel's behavior
+ * allows slightly faster context switches in the common case
+ * where GS is unused by the prev and next threads.
+ *
+ * Since neither vendor documents this anywhere that I can see,
+ * detect it directly instead of hardcoding the choice by
+ * vendor.
+ *
+ * I've designated AMD's behavior as the "bug" because it's
+ * counterintuitive and less friendly.
+ */
+
+ unsigned long old_base, tmp;
+ rdmsrl(MSR_FS_BASE, old_base);
+ wrmsrl(MSR_FS_BASE, 1);
+ loadsegment(fs, 0);
+ rdmsrl(MSR_FS_BASE, tmp);
+ wrmsrl(MSR_FS_BASE, old_base);
+ return tmp == 0;
+}
+
+void check_null_seg_clears_base(struct cpuinfo_x86 *c)
+{
+ /* BUG_NULL_SEG is only relevant with 64bit userspace */
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */
+ if (c->extended_cpuid_level >= 0x80000021 &&
+ cpuid_eax(0x80000021) & BIT(6))
+ return;
+
+ /*
+ * CPUID bit above wasn't set. If this kernel is still running
+ * as a HV guest, then the HV has decided not to advertize
+ * that CPUID bit for whatever reason. For example, one
+ * member of the migration pool might be vulnerable. Which
+ * means, the bug is present: set the BUG flag and return.
+ */
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
+ return;
+ }
+
+ /*
+ * Zen2 CPUs also have this behaviour, but no CPUID bit.
+ * 0x18 is the respective family for Hygon.
+ */
+ if ((c->x86 == 0x17 || c->x86 == 0x18) &&
+ detect_null_seg_behavior())
+ return;
+
+ /* All the remaining ones are affected */
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
+}
+
+static void generic_identify(struct cpuinfo_x86 *c)
+{
+ c->extended_cpuid_level = 0;
+
+ if (!have_cpuid_p())
+ identify_cpu_without_cpuid(c);
+
+ /* cyrix could have cpuid enabled via c_identify()*/
+ if (!have_cpuid_p())
+ return;
+
+ cpu_detect(c);
+
+ get_cpu_vendor(c);
+
+ get_cpu_cap(c);
+
+ get_cpu_address_sizes(c);
+
+ if (c->cpuid_level >= 0x00000001) {
+ c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_SMP
+ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+# else
+ c->apicid = c->initial_apicid;
+# endif
+#endif
+ c->phys_proc_id = c->initial_apicid;
+ }
+
+ get_model_name(c); /* Default name */
+
+ /*
+ * ESPFIX is a strange bug. All real CPUs have it. Paravirt
+ * systems that run Linux at CPL > 0 may or may not have the
+ * issue, but, even if they have the issue, there's absolutely
+ * nothing we can do about it because we can't use the real IRET
+ * instruction.
+ *
+ * NB: For the time being, only 32-bit kernels support
+ * X86_BUG_ESPFIX as such. 64-bit kernels directly choose
+ * whether to apply espfix using paravirt hooks. If any
+ * non-paravirt system ever shows up that does *not* have the
+ * ESPFIX issue, we can change this.
+ */
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_PARAVIRT
+ do {
+ extern void native_iret(void);
+ if (pv_cpu_ops.iret == native_iret)
+ set_cpu_bug(c, X86_BUG_ESPFIX);
+ } while (0);
+# else
+ set_cpu_bug(c, X86_BUG_ESPFIX);
+# endif
+#endif
+}
+
+static void x86_init_cache_qos(struct cpuinfo_x86 *c)
+{
+ /*
+ * The heavy lifting of max_rmid and cache_occ_scale are handled
+ * in get_cpu_cap(). Here we just set the max_rmid for the boot_cpu
+ * in case CQM bits really aren't there in this CPU.
+ */
+ if (c != &boot_cpu_data) {
+ boot_cpu_data.x86_cache_max_rmid =
+ min(boot_cpu_data.x86_cache_max_rmid,
+ c->x86_cache_max_rmid);
+ }
+}
+
+/*
+ * Validate that ACPI/mptables have the same information about the
+ * effective APIC id and update the package map.
+ */
+static void validate_apic_and_package_id(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ unsigned int apicid, cpu = smp_processor_id();
+
+ apicid = apic->cpu_present_to_apicid(cpu);
+
+ if (apicid != c->apicid) {
+ pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n",
+ cpu, apicid, c->initial_apicid);
+ }
+ BUG_ON(topology_update_package_map(c->phys_proc_id, cpu));
+#else
+ c->logical_proc_id = 0;
+#endif
+}
+
+/*
+ * This does the hard work of actually picking apart the CPU stuff...
+ */
+static void identify_cpu(struct cpuinfo_x86 *c)
+{
+ int i;
+
+ c->loops_per_jiffy = loops_per_jiffy;
+ c->x86_cache_size = 0;
+ c->x86_vendor = X86_VENDOR_UNKNOWN;
+ c->x86_model = c->x86_stepping = 0; /* So far unknown... */
+ c->x86_vendor_id[0] = '\0'; /* Unset */
+ c->x86_model_id[0] = '\0'; /* Unset */
+ c->x86_max_cores = 1;
+ c->x86_coreid_bits = 0;
+ c->cu_id = 0xff;
+#ifdef CONFIG_X86_64
+ c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
+#else
+ c->cpuid_level = -1; /* CPUID not detected */
+ c->x86_clflush_size = 32;
+ c->x86_phys_bits = 32;
+ c->x86_virt_bits = 32;
+#endif
+ c->x86_cache_alignment = c->x86_clflush_size;
+ memset(&c->x86_capability, 0, sizeof c->x86_capability);
+
+ generic_identify(c);
+
+ if (this_cpu->c_identify)
+ this_cpu->c_identify(c);
+
+ /* Clear/Set all flags overridden by options, after probe */
+ apply_forced_caps(c);
+
+#ifdef CONFIG_X86_64
+ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+#endif
+
+ /*
+ * Vendor-specific initialization. In this section we
+ * canonicalize the feature flags, meaning if there are
+ * features a certain CPU supports which CPUID doesn't
+ * tell us, CPUID claiming incorrect flags, or other bugs,
+ * we handle them here.
+ *
+ * At the end of this section, c->x86_capability better
+ * indicate the features this CPU genuinely supports!
+ */
+ if (this_cpu->c_init)
+ this_cpu->c_init(c);
+
+ /* Disable the PN if appropriate */
+ squash_the_stupid_serial_number(c);
+
+ /* Set up SMEP/SMAP/UMIP */
+ setup_smep(c);
+ setup_smap(c);
+ setup_umip(c);
+
+ /*
+ * The vendor-specific functions might have changed features.
+ * Now we do "generic changes."
+ */
+
+ /* Filter out anything that depends on CPUID levels we don't have */
+ filter_cpuid_features(c, true);
+
+ /* If the model name is still unset, do table lookup. */
+ if (!c->x86_model_id[0]) {
+ const char *p;
+ p = table_lookup_model(c);
+ if (p)
+ strcpy(c->x86_model_id, p);
+ else
+ /* Last resort... */
+ sprintf(c->x86_model_id, "%02x/%02x",
+ c->x86, c->x86_model);
+ }
+
+#ifdef CONFIG_X86_64
+ detect_ht(c);
+#endif
+
+ x86_init_rdrand(c);
+ x86_init_cache_qos(c);
+ setup_pku(c);
+
+ /*
+ * Clear/Set all flags overridden by options, need do it
+ * before following smp all cpus cap AND.
+ */
+ apply_forced_caps(c);
+
+ /*
+ * On SMP, boot_cpu_data holds the common feature set between
+ * all CPUs; so make sure that we indicate which features are
+ * common between the CPUs. The first time this routine gets
+ * executed, c == &boot_cpu_data.
+ */
+ if (c != &boot_cpu_data) {
+ /* AND the already accumulated flags with these */
+ for (i = 0; i < NCAPINTS; i++)
+ boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
+
+ /* OR, i.e. replicate the bug flags */
+ for (i = NCAPINTS; i < NCAPINTS + NBUGINTS; i++)
+ c->x86_capability[i] |= boot_cpu_data.x86_capability[i];
+ }
+
+ /* Init Machine Check Exception if available. */
+ mcheck_cpu_init(c);
+
+ select_idle_routine(c);
+
+#ifdef CONFIG_NUMA
+ numa_add_cpu(smp_processor_id());
+#endif
+}
+
+/*
+ * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions
+ * on 32-bit kernels:
+ */
+#ifdef CONFIG_X86_32
+void enable_sep_cpu(void)
+{
+ struct tss_struct *tss;
+ int cpu;
+
+ if (!boot_cpu_has(X86_FEATURE_SEP))
+ return;
+
+ cpu = get_cpu();
+ tss = &per_cpu(cpu_tss_rw, cpu);
+
+ /*
+ * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
+ * see the big comment in struct x86_hw_tss's definition.
+ */
+
+ tss->x86_tss.ss1 = __KERNEL_CS;
+ wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
+ wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0);
+ wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
+
+ put_cpu();
+}
+#endif
+
+void __init identify_boot_cpu(void)
+{
+ identify_cpu(&boot_cpu_data);
+#ifdef CONFIG_X86_32
+ sysenter_setup();
+ enable_sep_cpu();
+#endif
+ cpu_detect_tlb(&boot_cpu_data);
+ tsx_init();
+}
+
+void identify_secondary_cpu(struct cpuinfo_x86 *c)
+{
+ BUG_ON(c == &boot_cpu_data);
+ identify_cpu(c);
+#ifdef CONFIG_X86_32
+ enable_sep_cpu();
+#endif
+ mtrr_ap_init();
+ validate_apic_and_package_id(c);
+ x86_spec_ctrl_setup_ap();
+ update_srbds_msr();
+}
+
+static __init int setup_noclflush(char *arg)
+{
+ setup_clear_cpu_cap(X86_FEATURE_CLFLUSH);
+ setup_clear_cpu_cap(X86_FEATURE_CLFLUSHOPT);
+ return 1;
+}
+__setup("noclflush", setup_noclflush);
+
+void print_cpu_info(struct cpuinfo_x86 *c)
+{
+ const char *vendor = NULL;
+
+ if (c->x86_vendor < X86_VENDOR_NUM) {
+ vendor = this_cpu->c_vendor;
+ } else {
+ if (c->cpuid_level >= 0)
+ vendor = c->x86_vendor_id;
+ }
+
+ if (vendor && !strstr(c->x86_model_id, vendor))
+ pr_cont("%s ", vendor);
+
+ if (c->x86_model_id[0])
+ pr_cont("%s", c->x86_model_id);
+ else
+ pr_cont("%d86", c->x86);
+
+ pr_cont(" (family: 0x%x, model: 0x%x", c->x86, c->x86_model);
+
+ if (c->x86_stepping || c->cpuid_level >= 0)
+ pr_cont(", stepping: 0x%x)\n", c->x86_stepping);
+ else
+ pr_cont(")\n");
+}
+
+/*
+ * clearcpuid= was already parsed in fpu__init_parse_early_param.
+ * But we need to keep a dummy __setup around otherwise it would
+ * show up as an environment variable for init.
+ */
+static __init int setup_clearcpuid(char *arg)
+{
+ return 1;
+}
+__setup("clearcpuid=", setup_clearcpuid);
+
+#ifdef CONFIG_X86_64
+DEFINE_PER_CPU_FIRST(union irq_stack_union,
+ irq_stack_union) __aligned(PAGE_SIZE) __visible;
+EXPORT_PER_CPU_SYMBOL_GPL(irq_stack_union);
+
+/*
+ * The following percpu variables are hot. Align current_task to
+ * cacheline size such that they fall in the same cacheline.
+ */
+DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
+ &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+
+DEFINE_PER_CPU(char *, irq_stack_ptr) =
+ init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE;
+
+DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
+
+DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
+EXPORT_PER_CPU_SYMBOL(__preempt_count);
+
+/* May not be marked __init: used by software suspend */
+void syscall_init(void)
+{
+ extern char _entry_trampoline[];
+ extern char entry_SYSCALL_64_trampoline[];
+
+ int cpu = smp_processor_id();
+ unsigned long SYSCALL64_entry_trampoline =
+ (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
+ (entry_SYSCALL_64_trampoline - _entry_trampoline);
+
+ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
+ if (static_cpu_has(X86_FEATURE_PTI))
+ wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
+ else
+ wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+
+#ifdef CONFIG_IA32_EMULATION
+ wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
+ /*
+ * This only works on Intel CPUs.
+ * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
+ * This does not cause SYSENTER to jump to the wrong location, because
+ * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
+ */
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
+#else
+ wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
+#endif
+
+ /* Flags to clear on syscall */
+ wrmsrl(MSR_SYSCALL_MASK,
+ X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
+ X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
+}
+
+/*
+ * Copies of the original ist values from the tss are only accessed during
+ * debugging, no special alignment required.
+ */
+DEFINE_PER_CPU(struct orig_ist, orig_ist);
+
+static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
+DEFINE_PER_CPU(int, debug_stack_usage);
+
+int is_debug_stack(unsigned long addr)
+{
+ return __this_cpu_read(debug_stack_usage) ||
+ (addr <= __this_cpu_read(debug_stack_addr) &&
+ addr > (__this_cpu_read(debug_stack_addr) - DEBUG_STKSZ));
+}
+NOKPROBE_SYMBOL(is_debug_stack);
+
+DEFINE_PER_CPU(u32, debug_idt_ctr);
+
+void debug_stack_set_zero(void)
+{
+ this_cpu_inc(debug_idt_ctr);
+ load_current_idt();
+}
+NOKPROBE_SYMBOL(debug_stack_set_zero);
+
+void debug_stack_reset(void)
+{
+ if (WARN_ON(!this_cpu_read(debug_idt_ctr)))
+ return;
+ if (this_cpu_dec_return(debug_idt_ctr) == 0)
+ load_current_idt();
+}
+NOKPROBE_SYMBOL(debug_stack_reset);
+
+#else /* CONFIG_X86_64 */
+
+DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
+EXPORT_PER_CPU_SYMBOL(__preempt_count);
+
+/*
+ * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
+ * the top of the kernel stack. Use an extra percpu variable to track the
+ * top of the kernel stack directly.
+ */
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
+ (unsigned long)&init_thread_union + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+
+#ifdef CONFIG_STACKPROTECTOR
+DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
+#endif
+
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Clear all 6 debug registers:
+ */
+static void clear_all_debug_regs(void)
+{
+ int i;
+
+ for (i = 0; i < 8; i++) {
+ /* Ignore db4, db5 */
+ if ((i == 4) || (i == 5))
+ continue;
+
+ set_debugreg(0, i);
+ }
+}
+
+#ifdef CONFIG_KGDB
+/*
+ * Restore debug regs if using kgdbwait and you have a kernel debugger
+ * connection established.
+ */
+static void dbg_restore_debug_regs(void)
+{
+ if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break))
+ arch_kgdb_ops.correct_hw_break();
+}
+#else /* ! CONFIG_KGDB */
+#define dbg_restore_debug_regs()
+#endif /* ! CONFIG_KGDB */
+
+static void wait_for_master_cpu(int cpu)
+{
+#ifdef CONFIG_SMP
+ /*
+ * wait for ACK from master CPU before continuing
+ * with AP initialization
+ */
+ WARN_ON(cpumask_test_and_set_cpu(cpu, cpu_initialized_mask));
+ while (!cpumask_test_cpu(cpu, cpu_callout_mask))
+ cpu_relax();
+#endif
+}
+
+/*
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ * A lot of state is already set up in PDA init for 64 bit
+ */
+#ifdef CONFIG_X86_64
+
+void cpu_init(void)
+{
+ struct orig_ist *oist;
+ struct task_struct *me;
+ struct tss_struct *t;
+ unsigned long v;
+ int cpu = raw_smp_processor_id();
+ int i;
+
+ wait_for_master_cpu(cpu);
+
+ /*
+ * Initialize the CR4 shadow before doing anything that could
+ * try to read it.
+ */
+ cr4_init_shadow();
+
+ if (cpu)
+ load_ucode_ap();
+
+ t = &per_cpu(cpu_tss_rw, cpu);
+ oist = &per_cpu(orig_ist, cpu);
+
+#ifdef CONFIG_NUMA
+ if (this_cpu_read(numa_node) == 0 &&
+ early_cpu_to_node(cpu) != NUMA_NO_NODE)
+ set_numa_node(early_cpu_to_node(cpu));
+#endif
+
+ me = current;
+
+ pr_debug("Initializing CPU#%d\n", cpu);
+
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+
+ /*
+ * Initialize the per-CPU GDT with the boot GDT,
+ * and set up the GDT descriptor:
+ */
+
+ switch_to_new_gdt(cpu);
+ loadsegment(fs, 0);
+
+ load_current_idt();
+
+ memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+ syscall_init();
+
+ wrmsrl(MSR_FS_BASE, 0);
+ wrmsrl(MSR_KERNEL_GS_BASE, 0);
+ barrier();
+
+ x86_configure_nx();
+ x2apic_setup();
+
+ /*
+ * set up and load the per-CPU TSS
+ */
+ if (!oist->ist[0]) {
+ char *estacks = get_cpu_entry_area(cpu)->exception_stacks;
+
+ for (v = 0; v < N_EXCEPTION_STACKS; v++) {
+ estacks += exception_stack_sizes[v];
+ oist->ist[v] = t->x86_tss.ist[v] =
+ (unsigned long)estacks;
+ if (v == DEBUG_STACK-1)
+ per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
+ }
+ }
+
+ t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
+
+ /*
+ * <= is required because the CPU will access up to
+ * 8 bits beyond the end of the IO permission bitmap.
+ */
+ for (i = 0; i <= IO_BITMAP_LONGS; i++)
+ t->io_bitmap[i] = ~0UL;
+
+ mmgrab(&init_mm);
+ me->active_mm = &init_mm;
+ BUG_ON(me->mm);
+ initialize_tlbstate_and_flush();
+ enter_lazy_tlb(&init_mm, me);
+
+ /*
+ * Initialize the TSS. sp0 points to the entry trampoline stack
+ * regardless of what task is running.
+ */
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
+ load_TR_desc();
+ load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
+
+ load_mm_ldt(&init_mm);
+
+ clear_all_debug_regs();
+ dbg_restore_debug_regs();
+
+ fpu__init_cpu();
+
+ if (is_uv_system())
+ uv_cpu_init();
+
+ load_fixmap_gdt(cpu);
+}
+
+#else
+
+void cpu_init(void)
+{
+ int cpu = smp_processor_id();
+ struct task_struct *curr = current;
+ struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
+
+ wait_for_master_cpu(cpu);
+
+ /*
+ * Initialize the CR4 shadow before doing anything that could
+ * try to read it.
+ */
+ cr4_init_shadow();
+
+ show_ucode_info_early();
+
+ pr_info("Initializing CPU#%d\n", cpu);
+
+ if (cpu_feature_enabled(X86_FEATURE_VME) ||
+ boot_cpu_has(X86_FEATURE_TSC) ||
+ boot_cpu_has(X86_FEATURE_DE))
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+
+ load_current_idt();
+ switch_to_new_gdt(cpu);
+
+ /*
+ * Set up and load the per-CPU TSS and LDT
+ */
+ mmgrab(&init_mm);
+ curr->active_mm = &init_mm;
+ BUG_ON(curr->mm);
+ initialize_tlbstate_and_flush();
+ enter_lazy_tlb(&init_mm, curr);
+
+ /*
+ * Initialize the TSS. sp0 points to the entry trampoline stack
+ * regardless of what task is running.
+ */
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
+ load_TR_desc();
+ load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
+
+ load_mm_ldt(&init_mm);
+
+ t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
+
+#ifdef CONFIG_DOUBLEFAULT
+ /* Set up doublefault TSS pointer in the GDT */
+ __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
+#endif
+
+ clear_all_debug_regs();
+ dbg_restore_debug_regs();
+
+ fpu__init_cpu();
+
+ load_fixmap_gdt(cpu);
+}
+#endif
+
+static void bsp_resume(void)
+{
+ if (this_cpu->c_bsp_resume)
+ this_cpu->c_bsp_resume(&boot_cpu_data);
+}
+
+static struct syscore_ops cpu_syscore_ops = {
+ .resume = bsp_resume,
+};
+
+static int __init init_cpu_syscore(void)
+{
+ register_syscore_ops(&cpu_syscore_ops);
+ return 0;
+}
+core_initcall(init_cpu_syscore);
+
+/*
+ * The microcode loader calls this upon late microcode load to recheck features,
+ * only when microcode has been updated. Caller holds microcode_mutex and CPU
+ * hotplug lock.
+ */
+void microcode_check(void)
+{
+ struct cpuinfo_x86 info;
+
+ perf_check_microcode();
+
+ /* Reload CPUID max function as it might've changed. */
+ info.cpuid_level = cpuid_eax(0);
+
+ /*
+ * Copy all capability leafs to pick up the synthetic ones so that
+ * memcmp() below doesn't fail on that. The ones coming from CPUID will
+ * get overwritten in get_cpu_cap().
+ */
+ memcpy(&info.x86_capability, &boot_cpu_data.x86_capability, sizeof(info.x86_capability));
+
+ get_cpu_cap(&info);
+
+ if (!memcmp(&info.x86_capability, &boot_cpu_data.x86_capability, sizeof(info.x86_capability)))
+ return;
+
+ pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n");
+ pr_warn("x86/CPU: Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
+}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
new file mode 100644
index 000000000..4eb9bf68b
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_CPU_H
+#define ARCH_X86_CPU_H
+
+/* attempt to consolidate cpu attributes */
+struct cpu_dev {
+ const char *c_vendor;
+
+ /* some have two possibilities for cpuid string */
+ const char *c_ident[2];
+
+ void (*c_early_init)(struct cpuinfo_x86 *);
+ void (*c_bsp_init)(struct cpuinfo_x86 *);
+ void (*c_init)(struct cpuinfo_x86 *);
+ void (*c_identify)(struct cpuinfo_x86 *);
+ void (*c_detect_tlb)(struct cpuinfo_x86 *);
+ void (*c_bsp_resume)(struct cpuinfo_x86 *);
+ int c_x86_vendor;
+#ifdef CONFIG_X86_32
+ /* Optional vendor specific routine to obtain the cache size. */
+ unsigned int (*legacy_cache_size)(struct cpuinfo_x86 *,
+ unsigned int);
+
+ /* Family/stepping-based lookup table for model names. */
+ struct legacy_cpu_model_info {
+ int family;
+ const char *model_names[16];
+ } legacy_models[5];
+#endif
+};
+
+struct _tlb_table {
+ unsigned char descriptor;
+ char tlb_type;
+ unsigned int entries;
+ /* unsigned int ways; */
+ char info[128];
+};
+
+#define cpu_dev_register(cpu_devX) \
+ static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
+ __attribute__((__section__(".x86_cpu_dev.init"))) = \
+ &cpu_devX;
+
+extern const struct cpu_dev *const __x86_cpu_dev_start[],
+ *const __x86_cpu_dev_end[];
+
+#ifdef CONFIG_CPU_SUP_INTEL
+enum tsx_ctrl_states {
+ TSX_CTRL_ENABLE,
+ TSX_CTRL_DISABLE,
+ TSX_CTRL_NOT_SUPPORTED,
+};
+
+extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
+
+extern void __init tsx_init(void);
+extern void tsx_enable(void);
+extern void tsx_disable(void);
+#else
+static inline void tsx_init(void) { }
+#endif /* CONFIG_CPU_SUP_INTEL */
+
+extern void get_cpu_cap(struct cpuinfo_x86 *c);
+extern void get_cpu_address_sizes(struct cpuinfo_x86 *c);
+extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern u32 get_scattered_cpuid_leaf(unsigned int level,
+ unsigned int sub_leaf,
+ enum cpuid_regs_idx reg);
+extern void init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
+
+extern void detect_num_cpu_cores(struct cpuinfo_x86 *c);
+extern int detect_extended_topology_early(struct cpuinfo_x86 *c);
+extern int detect_extended_topology(struct cpuinfo_x86 *c);
+extern int detect_ht_early(struct cpuinfo_x86 *c);
+extern void detect_ht(struct cpuinfo_x86 *c);
+extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
+
+unsigned int aperfmperf_get_khz(int cpu);
+
+extern void x86_spec_ctrl_setup_ap(void);
+extern void update_srbds_msr(void);
+
+extern u64 x86_read_arch_cap_msr(void);
+
+#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
new file mode 100644
index 000000000..fa07a224e
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -0,0 +1,124 @@
+/* Declare dependencies between CPUIDs */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/cpufeature.h>
+
+struct cpuid_dep {
+ unsigned int feature;
+ unsigned int depends;
+};
+
+/*
+ * Table of CPUID features that depend on others.
+ *
+ * This only includes dependencies that can be usefully disabled, not
+ * features part of the base set (like FPU).
+ *
+ * Note this all is not __init / __initdata because it can be
+ * called from cpu hotplug. It shouldn't do anything in this case,
+ * but it's difficult to tell that to the init reference checker.
+ */
+static const struct cpuid_dep cpuid_deps[] = {
+ { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE },
+ { X86_FEATURE_AVX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_PKU, X86_FEATURE_XSAVE },
+ { X86_FEATURE_MPX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE },
+ { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM2, X86_FEATURE_XMM },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, },
+ { X86_FEATURE_F16C, X86_FEATURE_XMM2, },
+ { X86_FEATURE_AES, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_FMA, X86_FEATURE_AVX },
+ { X86_FEATURE_AVX2, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512F, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_VAES, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F },
+ { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC },
+ {}
+};
+
+static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ /*
+ * Note: This could use the non atomic __*_bit() variants, but the
+ * rest of the cpufeature code uses atomics as well, so keep it for
+ * consistency. Cleanup all of it separately.
+ */
+ if (!c) {
+ clear_cpu_cap(&boot_cpu_data, feature);
+ set_bit(feature, (unsigned long *)cpu_caps_cleared);
+ } else {
+ clear_bit(feature, (unsigned long *)c->x86_capability);
+ }
+}
+
+/* Take the capabilities and the BUG bits into account */
+#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8)
+
+static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ DECLARE_BITMAP(disable, MAX_FEATURE_BITS);
+ const struct cpuid_dep *d;
+ bool changed;
+
+ if (WARN_ON(feature >= MAX_FEATURE_BITS))
+ return;
+
+ clear_feature(c, feature);
+
+ /* Collect all features to disable, handling dependencies */
+ memset(disable, 0, sizeof(disable));
+ __set_bit(feature, disable);
+
+ /* Loop until we get a stable state. */
+ do {
+ changed = false;
+ for (d = cpuid_deps; d->feature; d++) {
+ if (!test_bit(d->depends, disable))
+ continue;
+ if (__test_and_set_bit(d->feature, disable))
+ continue;
+
+ changed = true;
+ clear_feature(c, d->feature);
+ }
+ } while (changed);
+}
+
+void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ do_clear_cpu_cap(c, feature);
+}
+
+void setup_clear_cpu_cap(unsigned int feature)
+{
+ do_clear_cpu_cap(NULL, feature);
+}
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
new file mode 100644
index 000000000..1d9b8aaea
--- /dev/null
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -0,0 +1,466 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bitops.h>
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <asm/dma.h>
+#include <linux/io.h>
+#include <asm/processor-cyrix.h>
+#include <asm/processor-flags.h>
+#include <linux/timer.h>
+#include <asm/pci-direct.h>
+#include <asm/tsc.h>
+#include <asm/cpufeature.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+
+#include "cpu.h"
+
+/*
+ * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU
+ */
+static void __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
+{
+ unsigned char ccr2, ccr3;
+
+ /* we test for DEVID by checking whether CCR3 is writable */
+ ccr3 = getCx86(CX86_CCR3);
+ setCx86(CX86_CCR3, ccr3 ^ 0x80);
+ getCx86(0xc0); /* dummy to change bus */
+
+ if (getCx86(CX86_CCR3) == ccr3) { /* no DEVID regs. */
+ ccr2 = getCx86(CX86_CCR2);
+ setCx86(CX86_CCR2, ccr2 ^ 0x04);
+ getCx86(0xc0); /* dummy */
+
+ if (getCx86(CX86_CCR2) == ccr2) /* old Cx486SLC/DLC */
+ *dir0 = 0xfd;
+ else { /* Cx486S A step */
+ setCx86(CX86_CCR2, ccr2);
+ *dir0 = 0xfe;
+ }
+ } else {
+ setCx86(CX86_CCR3, ccr3); /* restore CCR3 */
+
+ /* read DIR0 and DIR1 CPU registers */
+ *dir0 = getCx86(CX86_DIR0);
+ *dir1 = getCx86(CX86_DIR1);
+ }
+}
+
+static void do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __do_cyrix_devid(dir0, dir1);
+ local_irq_restore(flags);
+}
+/*
+ * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in
+ * order to identify the Cyrix CPU model after we're out of setup.c
+ *
+ * Actually since bugs.h doesn't even reference this perhaps someone should
+ * fix the documentation ???
+ */
+static unsigned char Cx86_dir0_msb = 0;
+
+static const char Cx86_model[][9] = {
+ "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
+ "M II ", "Unknown"
+};
+static const char Cx486_name[][5] = {
+ "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
+ "SRx2", "DRx2"
+};
+static const char Cx486S_name[][4] = {
+ "S", "S2", "Se", "S2e"
+};
+static const char Cx486D_name[][4] = {
+ "DX", "DX2", "?", "?", "?", "DX4"
+};
+static char Cx86_cb[] = "?.5x Core/Bus Clock";
+static const char cyrix_model_mult1[] = "12??43";
+static const char cyrix_model_mult2[] = "12233445";
+
+/*
+ * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
+ * BIOSes for compatibility with DOS games. This makes the udelay loop
+ * work correctly, and improves performance.
+ *
+ * FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP
+ */
+
+static void check_cx686_slop(struct cpuinfo_x86 *c)
+{
+ unsigned long flags;
+
+ if (Cx86_dir0_msb == 3) {
+ unsigned char ccr3, ccr5;
+
+ local_irq_save(flags);
+ ccr3 = getCx86(CX86_CCR3);
+ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
+ ccr5 = getCx86(CX86_CCR5);
+ if (ccr5 & 2)
+ setCx86(CX86_CCR5, ccr5 & 0xfd); /* reset SLOP */
+ setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
+ local_irq_restore(flags);
+
+ if (ccr5 & 2) { /* possible wrong calibration done */
+ pr_info("Recalibrating delay loop with SLOP bit reset\n");
+ calibrate_delay();
+ c->loops_per_jiffy = loops_per_jiffy;
+ }
+ }
+}
+
+
+static void set_cx86_reorder(void)
+{
+ u8 ccr3;
+
+ pr_info("Enable Memory access reorder on Cyrix/NSC processor.\n");
+ ccr3 = getCx86(CX86_CCR3);
+ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
+
+ /* Load/Store Serialize to mem access disable (=reorder it) */
+ setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80);
+ /* set load/store serialize from 1GB to 4GB */
+ ccr3 |= 0xe0;
+ setCx86(CX86_CCR3, ccr3);
+}
+
+static void set_cx86_memwb(void)
+{
+ pr_info("Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
+
+ /* CCR2 bit 2: unlock NW bit */
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04);
+ /* set 'Not Write-through' */
+ write_cr0(read_cr0() | X86_CR0_NW);
+ /* CCR2 bit 2: lock NW bit and set WT1 */
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14);
+}
+
+/*
+ * Configure later MediaGX and/or Geode processor.
+ */
+
+static void geode_configure(void)
+{
+ unsigned long flags;
+ u8 ccr3;
+ local_irq_save(flags);
+
+ /* Suspend on halt power saving and enable #SUSP pin */
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
+
+ ccr3 = getCx86(CX86_CCR3);
+ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
+
+
+ /* FPU fast, DTE cache, Mem bypass */
+ setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38);
+ setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
+
+ set_cx86_memwb();
+ set_cx86_reorder();
+
+ local_irq_restore(flags);
+}
+
+static void early_init_cyrix(struct cpuinfo_x86 *c)
+{
+ unsigned char dir0, dir0_msn, dir1 = 0;
+
+ __do_cyrix_devid(&dir0, &dir1);
+ dir0_msn = dir0 >> 4; /* identifies CPU "family" */
+
+ switch (dir0_msn) {
+ case 3: /* 6x86/6x86L */
+ /* Emulate MTRRs using Cyrix's ARRs. */
+ set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
+ break;
+ case 5: /* 6x86MX/M II */
+ /* Emulate MTRRs using Cyrix's ARRs. */
+ set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
+ break;
+ }
+}
+
+static void init_cyrix(struct cpuinfo_x86 *c)
+{
+ unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0;
+ char *buf = c->x86_model_id;
+ const char *p = NULL;
+
+ /*
+ * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ */
+ clear_cpu_cap(c, 0*32+31);
+
+ /* Cyrix used bit 24 in extended (AMD) CPUID for Cyrix MMX extensions */
+ if (test_cpu_cap(c, 1*32+24)) {
+ clear_cpu_cap(c, 1*32+24);
+ set_cpu_cap(c, X86_FEATURE_CXMMX);
+ }
+
+ do_cyrix_devid(&dir0, &dir1);
+
+ check_cx686_slop(c);
+
+ Cx86_dir0_msb = dir0_msn = dir0 >> 4; /* identifies CPU "family" */
+ dir0_lsn = dir0 & 0xf; /* model or clock multiplier */
+
+ /* common case step number/rev -- exceptions handled below */
+ c->x86_model = (dir1 >> 4) + 1;
+ c->x86_stepping = dir1 & 0xf;
+
+ /* Now cook; the original recipe is by Channing Corn, from Cyrix.
+ * We do the same thing for each generation: we work out
+ * the model, multiplier and stepping. Black magic included,
+ * to make the silicon step/rev numbers match the printed ones.
+ */
+
+ switch (dir0_msn) {
+ unsigned char tmp;
+
+ case 0: /* Cx486SLC/DLC/SRx/DRx */
+ p = Cx486_name[dir0_lsn & 7];
+ break;
+
+ case 1: /* Cx486S/DX/DX2/DX4 */
+ p = (dir0_lsn & 8) ? Cx486D_name[dir0_lsn & 5]
+ : Cx486S_name[dir0_lsn & 3];
+ break;
+
+ case 2: /* 5x86 */
+ Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5];
+ p = Cx86_cb+2;
+ break;
+
+ case 3: /* 6x86/6x86L */
+ Cx86_cb[1] = ' ';
+ Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5];
+ if (dir1 > 0x21) { /* 686L */
+ Cx86_cb[0] = 'L';
+ p = Cx86_cb;
+ (c->x86_model)++;
+ } else /* 686 */
+ p = Cx86_cb+1;
+ /* Emulate MTRRs using Cyrix's ARRs. */
+ set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
+ /* 6x86's contain this bug */
+ set_cpu_bug(c, X86_BUG_COMA);
+ break;
+
+ case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
+ case 11: /* GX1 with inverted Device ID */
+#ifdef CONFIG_PCI
+ {
+ u32 vendor, device;
+ /*
+ * It isn't really a PCI quirk directly, but the cure is the
+ * same. The MediaGX has deep magic SMM stuff that handles the
+ * SB emulation. It throws away the fifo on disable_dma() which
+ * is wrong and ruins the audio.
+ *
+ * Bug2: VSA1 has a wrap bug so that using maximum sized DMA
+ * causes bad things. According to NatSemi VSA2 has another
+ * bug to do with 'hlt'. I've not seen any boards using VSA2
+ * and X doesn't seem to support it either so who cares 8).
+ * VSA1 we work around however.
+ */
+
+ pr_info("Working around Cyrix MediaGX virtual DMA bugs.\n");
+ isa_dma_bridge_buggy = 2;
+
+ /* We do this before the PCI layer is running. However we
+ are safe here as we know the bridge must be a Cyrix
+ companion and must be present */
+ vendor = read_pci_config_16(0, 0, 0x12, PCI_VENDOR_ID);
+ device = read_pci_config_16(0, 0, 0x12, PCI_DEVICE_ID);
+
+ /*
+ * The 5510/5520 companion chips have a funky PIT.
+ */
+ if (vendor == PCI_VENDOR_ID_CYRIX &&
+ (device == PCI_DEVICE_ID_CYRIX_5510 ||
+ device == PCI_DEVICE_ID_CYRIX_5520))
+ mark_tsc_unstable("cyrix 5510/5520 detected");
+ }
+#endif
+ c->x86_cache_size = 16; /* Yep 16K integrated cache thats it */
+
+ /* GXm supports extended cpuid levels 'ala' AMD */
+ if (c->cpuid_level == 2) {
+ /* Enable cxMMX extensions (GX1 Datasheet 54) */
+ setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1);
+
+ /*
+ * GXm : 0x30 ... 0x5f GXm datasheet 51
+ * GXlv: 0x6x GXlv datasheet 54
+ * ? : 0x7x
+ * GX1 : 0x8x GX1 datasheet 56
+ */
+ if ((0x30 <= dir1 && dir1 <= 0x6f) ||
+ (0x80 <= dir1 && dir1 <= 0x8f))
+ geode_configure();
+ return;
+ } else { /* MediaGX */
+ Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
+ p = Cx86_cb+2;
+ c->x86_model = (dir1 & 0x20) ? 1 : 2;
+ }
+ break;
+
+ case 5: /* 6x86MX/M II */
+ if (dir1 > 7) {
+ dir0_msn++; /* M II */
+ /* Enable MMX extensions (App note 108) */
+ setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1);
+ } else {
+ /* A 6x86MX - it has the bug. */
+ set_cpu_bug(c, X86_BUG_COMA);
+ }
+ tmp = (!(dir0_lsn & 7) || dir0_lsn & 1) ? 2 : 0;
+ Cx86_cb[tmp] = cyrix_model_mult2[dir0_lsn & 7];
+ p = Cx86_cb+tmp;
+ if (((dir1 & 0x0f) > 4) || ((dir1 & 0xf0) == 0x20))
+ (c->x86_model)++;
+ /* Emulate MTRRs using Cyrix's ARRs. */
+ set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
+ break;
+
+ case 0xf: /* Cyrix 486 without DEVID registers */
+ switch (dir0_lsn) {
+ case 0xd: /* either a 486SLC or DLC w/o DEVID */
+ dir0_msn = 0;
+ p = Cx486_name[!!boot_cpu_has(X86_FEATURE_FPU)];
+ break;
+
+ case 0xe: /* a 486S A step */
+ dir0_msn = 0;
+ p = Cx486S_name[0];
+ break;
+ }
+ break;
+
+ default: /* unknown (shouldn't happen, we know everyone ;-) */
+ dir0_msn = 7;
+ break;
+ }
+ strcpy(buf, Cx86_model[dir0_msn & 7]);
+ if (p)
+ strcat(buf, p);
+ return;
+}
+
+/*
+ * Handle National Semiconductor branded processors
+ */
+static void init_nsc(struct cpuinfo_x86 *c)
+{
+ /*
+ * There may be GX1 processors in the wild that are branded
+ * NSC and not Cyrix.
+ *
+ * This function only handles the GX processor, and kicks every
+ * thing else to the Cyrix init function above - that should
+ * cover any processors that might have been branded differently
+ * after NSC acquired Cyrix.
+ *
+ * If this breaks your GX1 horribly, please e-mail
+ * info-linux@ldcmail.amd.com to tell us.
+ */
+
+ /* Handle the GX (Formally known as the GX2) */
+
+ if (c->x86 == 5 && c->x86_model == 5)
+ cpu_detect_cache_sizes(c);
+ else
+ init_cyrix(c);
+}
+
+/*
+ * Cyrix CPUs without cpuid or with cpuid not yet enabled can be detected
+ * by the fact that they preserve the flags across the division of 5/2.
+ * PII and PPro exhibit this behavior too, but they have cpuid available.
+ */
+
+/*
+ * Perform the Cyrix 5/2 test. A Cyrix won't change
+ * the flags, while other 486 chips will.
+ */
+static inline int test_cyrix_52div(void)
+{
+ unsigned int test;
+
+ __asm__ __volatile__(
+ "sahf\n\t" /* clear flags (%eax = 0x0005) */
+ "div %b2\n\t" /* divide 5 by 2 */
+ "lahf" /* store flags into %ah */
+ : "=a" (test)
+ : "0" (5), "q" (2)
+ : "cc");
+
+ /* AH is 0x02 on Cyrix after the divide.. */
+ return (unsigned char) (test >> 8) == 0x02;
+}
+
+static void cyrix_identify(struct cpuinfo_x86 *c)
+{
+ /* Detect Cyrix with disabled CPUID */
+ if (c->x86 == 4 && test_cyrix_52div()) {
+ unsigned char dir0, dir1;
+
+ strcpy(c->x86_vendor_id, "CyrixInstead");
+ c->x86_vendor = X86_VENDOR_CYRIX;
+
+ /* Actually enable cpuid on the older cyrix */
+
+ /* Retrieve CPU revisions */
+
+ do_cyrix_devid(&dir0, &dir1);
+
+ dir0 >>= 4;
+
+ /* Check it is an affected model */
+
+ if (dir0 == 5 || dir0 == 3) {
+ unsigned char ccr3;
+ unsigned long flags;
+ pr_info("Enabling CPUID on Cyrix processor.\n");
+ local_irq_save(flags);
+ ccr3 = getCx86(CX86_CCR3);
+ /* enable MAPEN */
+ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
+ /* enable cpuid */
+ setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80);
+ /* disable MAPEN */
+ setCx86(CX86_CCR3, ccr3);
+ local_irq_restore(flags);
+ }
+ }
+}
+
+static const struct cpu_dev cyrix_cpu_dev = {
+ .c_vendor = "Cyrix",
+ .c_ident = { "CyrixInstead" },
+ .c_early_init = early_init_cyrix,
+ .c_init = init_cyrix,
+ .c_identify = cyrix_identify,
+ .c_x86_vendor = X86_VENDOR_CYRIX,
+};
+
+cpu_dev_register(cyrix_cpu_dev);
+
+static const struct cpu_dev nsc_cpu_dev = {
+ .c_vendor = "NSC",
+ .c_ident = { "Geode by NSC" },
+ .c_init = init_nsc,
+ .c_x86_vendor = X86_VENDOR_NSC,
+};
+
+cpu_dev_register(nsc_cpu_dev);
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
new file mode 100644
index 000000000..479ca4728
--- /dev/null
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -0,0 +1,102 @@
+/*
+ * Common hypervisor code
+ *
+ * Copyright (C) 2008, VMware, Inc.
+ * Author : Alok N Kataria <akataria@vmware.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/export.h>
+#include <asm/processor.h>
+#include <asm/hypervisor.h>
+
+extern const struct hypervisor_x86 x86_hyper_vmware;
+extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
+extern const struct hypervisor_x86 x86_hyper_xen_pv;
+extern const struct hypervisor_x86 x86_hyper_xen_hvm;
+extern const struct hypervisor_x86 x86_hyper_kvm;
+extern const struct hypervisor_x86 x86_hyper_jailhouse;
+
+static const __initconst struct hypervisor_x86 * const hypervisors[] =
+{
+#ifdef CONFIG_XEN_PV
+ &x86_hyper_xen_pv,
+#endif
+#ifdef CONFIG_XEN_PVHVM
+ &x86_hyper_xen_hvm,
+#endif
+ &x86_hyper_vmware,
+ &x86_hyper_ms_hyperv,
+#ifdef CONFIG_KVM_GUEST
+ &x86_hyper_kvm,
+#endif
+#ifdef CONFIG_JAILHOUSE_GUEST
+ &x86_hyper_jailhouse,
+#endif
+};
+
+enum x86_hypervisor_type x86_hyper_type;
+EXPORT_SYMBOL(x86_hyper_type);
+
+static inline const struct hypervisor_x86 * __init
+detect_hypervisor_vendor(void)
+{
+ const struct hypervisor_x86 *h = NULL, * const *p;
+ uint32_t pri, max_pri = 0;
+
+ for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
+ pri = (*p)->detect();
+ if (pri > max_pri) {
+ max_pri = pri;
+ h = *p;
+ }
+ }
+
+ if (h)
+ pr_info("Hypervisor detected: %s\n", h->name);
+
+ return h;
+}
+
+static void __init copy_array(const void *src, void *target, unsigned int size)
+{
+ unsigned int i, n = size / sizeof(void *);
+ const void * const *from = (const void * const *)src;
+ const void **to = (const void **)target;
+
+ for (i = 0; i < n; i++)
+ if (from[i])
+ to[i] = from[i];
+}
+
+void __init init_hypervisor_platform(void)
+{
+ const struct hypervisor_x86 *h;
+
+ h = detect_hypervisor_vendor();
+
+ if (!h)
+ return;
+
+ copy_array(&h->init, &x86_init.hyper, sizeof(h->init));
+ copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime));
+
+ x86_hyper_type = h->type;
+ x86_init.hyper.init_platform();
+}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
new file mode 100644
index 000000000..76692590e
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel.c
@@ -0,0 +1,1036 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+
+#include <linux/string.h>
+#include <linux/bitops.h>
+#include <linux/smp.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/thread_info.h>
+#include <linux/init.h>
+#include <linux/uaccess.h>
+
+#include <asm/cpufeature.h>
+#include <asm/pgtable.h>
+#include <asm/msr.h>
+#include <asm/bugs.h>
+#include <asm/cpu.h>
+#include <asm/intel-family.h>
+#include <asm/microcode_intel.h>
+#include <asm/hwcap2.h>
+#include <asm/elf.h>
+
+#ifdef CONFIG_X86_64
+#include <linux/topology.h>
+#endif
+
+#include "cpu.h"
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#endif
+
+/*
+ * Just in case our CPU detection goes bad, or you have a weird system,
+ * allow a way to override the automatic disabling of MPX.
+ */
+static int forcempx;
+
+static int __init forcempx_setup(char *__unused)
+{
+ forcempx = 1;
+
+ return 1;
+}
+__setup("intel-skd-046-workaround=disable", forcempx_setup);
+
+void check_mpx_erratum(struct cpuinfo_x86 *c)
+{
+ if (forcempx)
+ return;
+ /*
+ * Turn off the MPX feature on CPUs where SMEP is not
+ * available or disabled.
+ *
+ * Works around Intel Erratum SKD046: "Branch Instructions
+ * May Initialize MPX Bound Registers Incorrectly".
+ *
+ * This might falsely disable MPX on systems without
+ * SMEP, like Atom processors without SMEP. But there
+ * is no such hardware known at the moment.
+ */
+ if (cpu_has(c, X86_FEATURE_MPX) && !cpu_has(c, X86_FEATURE_SMEP)) {
+ setup_clear_cpu_cap(X86_FEATURE_MPX);
+ pr_warn("x86/mpx: Disabling MPX since SMEP not present\n");
+ }
+}
+
+static bool ring3mwait_disabled __read_mostly;
+
+static int __init ring3mwait_disable(char *__unused)
+{
+ ring3mwait_disabled = true;
+ return 1;
+}
+__setup("ring3mwait=disable", ring3mwait_disable);
+
+static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
+{
+ /*
+ * Ring 3 MONITOR/MWAIT feature cannot be detected without
+ * cpu model and family comparison.
+ */
+ if (c->x86 != 6)
+ return;
+ switch (c->x86_model) {
+ case INTEL_FAM6_XEON_PHI_KNL:
+ case INTEL_FAM6_XEON_PHI_KNM:
+ break;
+ default:
+ return;
+ }
+
+ if (ring3mwait_disabled)
+ return;
+
+ set_cpu_cap(c, X86_FEATURE_RING3MWAIT);
+ this_cpu_or(msr_misc_features_shadow,
+ 1UL << MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT);
+
+ if (c == &boot_cpu_data)
+ ELF_HWCAP2 |= HWCAP2_RING3MWAIT;
+}
+
+/*
+ * Early microcode releases for the Spectre v2 mitigation were broken.
+ * Information taken from;
+ * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/03/microcode-update-guidance.pdf
+ * - https://kb.vmware.com/s/article/52345
+ * - Microcode revisions observed in the wild
+ * - Release note from 20180108 microcode release
+ */
+struct sku_microcode {
+ u8 model;
+ u8 stepping;
+ u32 microcode;
+};
+static const struct sku_microcode spectre_bad_microcodes[] = {
+ { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x80 },
+ { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x80 },
+ { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x80 },
+ { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x80 },
+ { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x80 },
+ { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e },
+ { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c },
+ { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 },
+ { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b },
+ { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 },
+ { INTEL_FAM6_BROADWELL_XEON_D, 0x03, 0x07000011 },
+ { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 },
+ { INTEL_FAM6_HASWELL_ULT, 0x01, 0x21 },
+ { INTEL_FAM6_HASWELL_GT3E, 0x01, 0x18 },
+ { INTEL_FAM6_HASWELL_CORE, 0x03, 0x23 },
+ { INTEL_FAM6_HASWELL_X, 0x02, 0x3b },
+ { INTEL_FAM6_HASWELL_X, 0x04, 0x10 },
+ { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a },
+ /* Observed in the wild */
+ { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b },
+ { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 },
+};
+
+static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
+{
+ int i;
+
+ /*
+ * We know that the hypervisor lie to us on the microcode version so
+ * we may as well hope that it is running the correct version.
+ */
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR))
+ return false;
+
+ if (c->x86 != 6)
+ return false;
+
+ for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
+ if (c->x86_model == spectre_bad_microcodes[i].model &&
+ c->x86_stepping == spectre_bad_microcodes[i].stepping)
+ return (c->microcode <= spectre_bad_microcodes[i].microcode);
+ }
+ return false;
+}
+
+static void early_init_intel(struct cpuinfo_x86 *c)
+{
+ u64 misc_enable;
+
+ /* Unmask CPUID levels if masked: */
+ if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
+ if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) {
+ c->cpuid_level = cpuid_eax(0);
+ get_cpu_cap(c);
+ }
+ }
+
+ if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
+ (c->x86 == 0x6 && c->x86_model >= 0x0e))
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
+ if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
+ c->microcode = intel_get_microcode_revision();
+
+ /* Now if any of them are set, check the blacklist and clear the lot */
+ if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) ||
+ cpu_has(c, X86_FEATURE_INTEL_STIBP) ||
+ cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) ||
+ cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) {
+ pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n");
+ setup_clear_cpu_cap(X86_FEATURE_IBRS);
+ setup_clear_cpu_cap(X86_FEATURE_IBPB);
+ setup_clear_cpu_cap(X86_FEATURE_STIBP);
+ setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
+ setup_clear_cpu_cap(X86_FEATURE_MSR_SPEC_CTRL);
+ setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP);
+ setup_clear_cpu_cap(X86_FEATURE_SSBD);
+ setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL_SSBD);
+ }
+
+ /*
+ * Atom erratum AAE44/AAF40/AAG38/AAH41:
+ *
+ * A race condition between speculative fetches and invalidating
+ * a large page. This is worked around in microcode, but we
+ * need the microcode to have already been loaded... so if it is
+ * not, recommend a BIOS update and disable large pages.
+ */
+ if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_stepping <= 2 &&
+ c->microcode < 0x20e) {
+ pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n");
+ clear_cpu_cap(c, X86_FEATURE_PSE);
+ }
+
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_SYSENTER32);
+#else
+ /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
+ if (c->x86 == 15 && c->x86_cache_alignment == 64)
+ c->x86_cache_alignment = 128;
+#endif
+
+ /* CPUID workaround for 0F33/0F34 CPU */
+ if (c->x86 == 0xF && c->x86_model == 0x3
+ && (c->x86_stepping == 0x3 || c->x86_stepping == 0x4))
+ c->x86_phys_bits = 36;
+
+ /*
+ * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
+ * with P/T states and does not stop in deep C-states.
+ *
+ * It is also reliable across cores and sockets. (but not across
+ * cabinets - we turn it off in that case explicitly.)
+ */
+ if (c->x86_power & (1 << 8)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ }
+
+ /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
+ if (c->x86 == 6) {
+ switch (c->x86_model) {
+ case 0x27: /* Penwell */
+ case 0x35: /* Cloverview */
+ case 0x4a: /* Merrifield */
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
+ break;
+ default:
+ break;
+ }
+ }
+
+ /*
+ * There is a known erratum on Pentium III and Core Solo
+ * and Core Duo CPUs.
+ * " Page with PAT set to WC while associated MTRR is UC
+ * may consolidate to UC "
+ * Because of this erratum, it is better to stick with
+ * setting WC in MTRR rather than using PAT on these CPUs.
+ *
+ * Enable PAT WC only on P4, Core 2 or later CPUs.
+ */
+ if (c->x86 == 6 && c->x86_model < 15)
+ clear_cpu_cap(c, X86_FEATURE_PAT);
+
+ /*
+ * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
+ * clear the fast string and enhanced fast string CPU capabilities.
+ */
+ if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
+ rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+ if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
+ pr_info("Disabled fast string operations\n");
+ setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
+ setup_clear_cpu_cap(X86_FEATURE_ERMS);
+ }
+ }
+
+ /*
+ * Intel Quark Core DevMan_001.pdf section 6.4.11
+ * "The operating system also is required to invalidate (i.e., flush)
+ * the TLB when any changes are made to any of the page table entries.
+ * The operating system must reload CR3 to cause the TLB to be flushed"
+ *
+ * As a result, boot_cpu_has(X86_FEATURE_PGE) in arch/x86/include/asm/tlbflush.h
+ * should be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE
+ * to be modified.
+ */
+ if (c->x86 == 5 && c->x86_model == 9) {
+ pr_info("Disabling PGE capability bit\n");
+ setup_clear_cpu_cap(X86_FEATURE_PGE);
+ }
+
+ if (c->cpuid_level >= 0x00000001) {
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
+ /*
+ * If HTT (EDX[28]) is set EBX[16:23] contain the number of
+ * apicids which are reserved per package. Store the resulting
+ * shift value for the package management code.
+ */
+ if (edx & (1U << 28))
+ c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
+ }
+
+ check_mpx_erratum(c);
+
+ /*
+ * Get the number of SMT siblings early from the extended topology
+ * leaf, if available. Otherwise try the legacy SMT detection.
+ */
+ if (detect_extended_topology_early(c) < 0)
+ detect_ht_early(c);
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * Early probe support logic for ppro memory erratum #50
+ *
+ * This is called before we do cpu ident work
+ */
+
+int ppro_with_ram_bug(void)
+{
+ /* Uses data from early_cpu_detect now */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+ boot_cpu_data.x86 == 6 &&
+ boot_cpu_data.x86_model == 1 &&
+ boot_cpu_data.x86_stepping < 8) {
+ pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n");
+ return 1;
+ }
+ return 0;
+}
+
+static void intel_smp_check(struct cpuinfo_x86 *c)
+{
+ /* calling is from identify_secondary_cpu() ? */
+ if (!c->cpu_index)
+ return;
+
+ /*
+ * Mask B, Pentium, but not Pentium MMX
+ */
+ if (c->x86 == 5 &&
+ c->x86_stepping >= 1 && c->x86_stepping <= 4 &&
+ c->x86_model <= 3) {
+ /*
+ * Remember we have B step Pentia with bugs
+ */
+ WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
+ "with B stepping processors.\n");
+ }
+}
+
+static int forcepae;
+static int __init forcepae_setup(char *__unused)
+{
+ forcepae = 1;
+ return 1;
+}
+__setup("forcepae", forcepae_setup);
+
+static void intel_workarounds(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_F00F_BUG
+ /*
+ * All models of Pentium and Pentium with MMX technology CPUs
+ * have the F0 0F bug, which lets nonprivileged users lock up the
+ * system. Announce that the fault handler will be checking for it.
+ * The Quark is also family 5, but does not have the same bug.
+ */
+ clear_cpu_bug(c, X86_BUG_F00F);
+ if (c->x86 == 5 && c->x86_model < 9) {
+ static int f00f_workaround_enabled;
+
+ set_cpu_bug(c, X86_BUG_F00F);
+ if (!f00f_workaround_enabled) {
+ pr_notice("Intel Pentium with F0 0F bug - workaround enabled.\n");
+ f00f_workaround_enabled = 1;
+ }
+ }
+#endif
+
+ /*
+ * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
+ * model 3 mask 3
+ */
+ if ((c->x86<<8 | c->x86_model<<4 | c->x86_stepping) < 0x633)
+ clear_cpu_cap(c, X86_FEATURE_SEP);
+
+ /*
+ * PAE CPUID issue: many Pentium M report no PAE but may have a
+ * functionally usable PAE implementation.
+ * Forcefully enable PAE if kernel parameter "forcepae" is present.
+ */
+ if (forcepae) {
+ pr_warn("PAE forced!\n");
+ set_cpu_cap(c, X86_FEATURE_PAE);
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
+ }
+
+ /*
+ * P4 Xeon erratum 037 workaround.
+ * Hardware prefetcher may cause stale data to be loaded into the cache.
+ */
+ if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_stepping == 1)) {
+ if (msr_set_bit(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) {
+ pr_info("CPU: C0 stepping P4 Xeon detected.\n");
+ pr_info("CPU: Disabling hardware prefetching (Erratum 037)\n");
+ }
+ }
+
+ /*
+ * See if we have a good local APIC by checking for buggy Pentia,
+ * i.e. all B steppings and the C2 stepping of P54C when using their
+ * integrated APIC (see 11AP erratum in "Pentium Processor
+ * Specification Update").
+ */
+ if (boot_cpu_has(X86_FEATURE_APIC) && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
+ (c->x86_stepping < 0x6 || c->x86_stepping == 0xb))
+ set_cpu_bug(c, X86_BUG_11AP);
+
+
+#ifdef CONFIG_X86_INTEL_USERCOPY
+ /*
+ * Set up the preferred alignment for movsl bulk memory moves
+ */
+ switch (c->x86) {
+ case 4: /* 486: untested */
+ break;
+ case 5: /* Old Pentia: untested */
+ break;
+ case 6: /* PII/PIII only like movsl with 8-byte alignment */
+ movsl_mask.mask = 7;
+ break;
+ case 15: /* P4 is OK down to 8-byte alignment */
+ movsl_mask.mask = 7;
+ break;
+ }
+#endif
+
+ intel_smp_check(c);
+}
+#else
+static void intel_workarounds(struct cpuinfo_x86 *c)
+{
+}
+#endif
+
+static void srat_detect_node(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_NUMA
+ unsigned node;
+ int cpu = smp_processor_id();
+
+ /* Don't do the funky fallback heuristics the AMD version employs
+ for now. */
+ node = numa_cpu_node(cpu);
+ if (node == NUMA_NO_NODE || !node_online(node)) {
+ /* reuse the value from init_cpu_to_node() */
+ node = cpu_to_node(cpu);
+ }
+ numa_set_node(cpu, node);
+#endif
+}
+
+static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
+{
+ /* Intel VMX MSR indicated features */
+#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
+#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
+#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
+#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
+#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
+#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
+#define x86_VMX_FEATURE_EPT_CAP_AD 0x00200000
+
+ u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
+ u32 msr_vpid_cap, msr_ept_cap;
+
+ clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+ clear_cpu_cap(c, X86_FEATURE_VNMI);
+ clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+ clear_cpu_cap(c, X86_FEATURE_EPT);
+ clear_cpu_cap(c, X86_FEATURE_VPID);
+ clear_cpu_cap(c, X86_FEATURE_EPT_AD);
+
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
+ msr_ctl = vmx_msr_high | vmx_msr_low;
+ if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
+ set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+ if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
+ set_cpu_cap(c, X86_FEATURE_VNMI);
+ if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
+ vmx_msr_low, vmx_msr_high);
+ msr_ctl2 = vmx_msr_high | vmx_msr_low;
+ if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
+ (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
+ set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+ if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) {
+ set_cpu_cap(c, X86_FEATURE_EPT);
+ rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
+ msr_ept_cap, msr_vpid_cap);
+ if (msr_ept_cap & x86_VMX_FEATURE_EPT_CAP_AD)
+ set_cpu_cap(c, X86_FEATURE_EPT_AD);
+ }
+ if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
+ set_cpu_cap(c, X86_FEATURE_VPID);
+ }
+}
+
+#define MSR_IA32_TME_ACTIVATE 0x982
+
+/* Helpers to access TME_ACTIVATE MSR */
+#define TME_ACTIVATE_LOCKED(x) (x & 0x1)
+#define TME_ACTIVATE_ENABLED(x) (x & 0x2)
+
+#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */
+#define TME_ACTIVATE_POLICY_AES_XTS_128 0
+
+#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */
+
+#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */
+#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1
+
+/* Values for mktme_status (SW only construct) */
+#define MKTME_ENABLED 0
+#define MKTME_DISABLED 1
+#define MKTME_UNINITIALIZED 2
+static int mktme_status = MKTME_UNINITIALIZED;
+
+static void detect_tme(struct cpuinfo_x86 *c)
+{
+ u64 tme_activate, tme_policy, tme_crypto_algs;
+ int keyid_bits = 0, nr_keyids = 0;
+ static u64 tme_activate_cpu0 = 0;
+
+ rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate);
+
+ if (mktme_status != MKTME_UNINITIALIZED) {
+ if (tme_activate != tme_activate_cpu0) {
+ /* Broken BIOS? */
+ pr_err_once("x86/tme: configuration is inconsistent between CPUs\n");
+ pr_err_once("x86/tme: MKTME is not usable\n");
+ mktme_status = MKTME_DISABLED;
+
+ /* Proceed. We may need to exclude bits from x86_phys_bits. */
+ }
+ } else {
+ tme_activate_cpu0 = tme_activate;
+ }
+
+ if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
+ pr_info_once("x86/tme: not enabled by BIOS\n");
+ mktme_status = MKTME_DISABLED;
+ return;
+ }
+
+ if (mktme_status != MKTME_UNINITIALIZED)
+ goto detect_keyid_bits;
+
+ pr_info("x86/tme: enabled by BIOS\n");
+
+ tme_policy = TME_ACTIVATE_POLICY(tme_activate);
+ if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128)
+ pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy);
+
+ tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate);
+ if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) {
+ pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n",
+ tme_crypto_algs);
+ mktme_status = MKTME_DISABLED;
+ }
+detect_keyid_bits:
+ keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate);
+ nr_keyids = (1UL << keyid_bits) - 1;
+ if (nr_keyids) {
+ pr_info_once("x86/mktme: enabled by BIOS\n");
+ pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids);
+ } else {
+ pr_info_once("x86/mktme: disabled by BIOS\n");
+ }
+
+ if (mktme_status == MKTME_UNINITIALIZED) {
+ /* MKTME is usable */
+ mktme_status = MKTME_ENABLED;
+ }
+
+ /*
+ * KeyID bits effectively lower the number of physical address
+ * bits. Update cpuinfo_x86::x86_phys_bits accordingly.
+ */
+ c->x86_phys_bits -= keyid_bits;
+}
+
+static void init_intel_energy_perf(struct cpuinfo_x86 *c)
+{
+ u64 epb;
+
+ /*
+ * Initialize MSR_IA32_ENERGY_PERF_BIAS if not already initialized.
+ * (x86_energy_perf_policy(8) is available to change it at run-time.)
+ */
+ if (!cpu_has(c, X86_FEATURE_EPB))
+ return;
+
+ rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+ if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE)
+ return;
+
+ pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
+ pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n");
+ epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
+ wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+}
+
+static void intel_bsp_resume(struct cpuinfo_x86 *c)
+{
+ /*
+ * MSR_IA32_ENERGY_PERF_BIAS is lost across suspend/resume,
+ * so reinitialize it properly like during bootup:
+ */
+ init_intel_energy_perf(c);
+}
+
+static void init_cpuid_fault(struct cpuinfo_x86 *c)
+{
+ u64 msr;
+
+ if (!rdmsrl_safe(MSR_PLATFORM_INFO, &msr)) {
+ if (msr & MSR_PLATFORM_INFO_CPUID_FAULT)
+ set_cpu_cap(c, X86_FEATURE_CPUID_FAULT);
+ }
+}
+
+static void init_intel_misc_features(struct cpuinfo_x86 *c)
+{
+ u64 msr;
+
+ if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr))
+ return;
+
+ /* Clear all MISC features */
+ this_cpu_write(msr_misc_features_shadow, 0);
+
+ /* Check features and update capabilities and shadow control bits */
+ init_cpuid_fault(c);
+ probe_xeon_phi_r3mwait(c);
+
+ msr = this_cpu_read(msr_misc_features_shadow);
+ wrmsrl(MSR_MISC_FEATURES_ENABLES, msr);
+}
+
+static void init_intel(struct cpuinfo_x86 *c)
+{
+ early_init_intel(c);
+
+ intel_workarounds(c);
+
+ /*
+ * Detect the extended topology information if available. This
+ * will reinitialise the initial_apicid which will be used
+ * in init_intel_cacheinfo()
+ */
+ detect_extended_topology(c);
+
+ if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
+ /*
+ * let's use the legacy cpuid vector 0x1 and 0x4 for topology
+ * detection.
+ */
+ detect_num_cpu_cores(c);
+#ifdef CONFIG_X86_32
+ detect_ht(c);
+#endif
+ }
+
+ init_intel_cacheinfo(c);
+
+ if (c->cpuid_level > 9) {
+ unsigned eax = cpuid_eax(10);
+ /* Check for version and the number of counters */
+ if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
+ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+ }
+
+ if (cpu_has(c, X86_FEATURE_XMM2))
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+
+ if (boot_cpu_has(X86_FEATURE_DS)) {
+ unsigned int l1, l2;
+
+ rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
+ if (!(l1 & (1<<11)))
+ set_cpu_cap(c, X86_FEATURE_BTS);
+ if (!(l1 & (1<<12)))
+ set_cpu_cap(c, X86_FEATURE_PEBS);
+ }
+
+ if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_CLFLUSH) &&
+ (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
+ set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR);
+
+ if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_MWAIT) &&
+ ((c->x86_model == INTEL_FAM6_ATOM_GOLDMONT)))
+ set_cpu_bug(c, X86_BUG_MONITOR);
+
+#ifdef CONFIG_X86_64
+ if (c->x86 == 15)
+ c->x86_cache_alignment = c->x86_clflush_size * 2;
+ if (c->x86 == 6)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+#else
+ /*
+ * Names for the Pentium II/Celeron processors
+ * detectable only by also checking the cache size.
+ * Dixon is NOT a Celeron.
+ */
+ if (c->x86 == 6) {
+ unsigned int l2 = c->x86_cache_size;
+ char *p = NULL;
+
+ switch (c->x86_model) {
+ case 5:
+ if (l2 == 0)
+ p = "Celeron (Covington)";
+ else if (l2 == 256)
+ p = "Mobile Pentium II (Dixon)";
+ break;
+
+ case 6:
+ if (l2 == 128)
+ p = "Celeron (Mendocino)";
+ else if (c->x86_stepping == 0 || c->x86_stepping == 5)
+ p = "Celeron-A";
+ break;
+
+ case 8:
+ if (l2 == 128)
+ p = "Celeron (Coppermine)";
+ break;
+ }
+
+ if (p)
+ strcpy(c->x86_model_id, p);
+ }
+
+ if (c->x86 == 15)
+ set_cpu_cap(c, X86_FEATURE_P4);
+ if (c->x86 == 6)
+ set_cpu_cap(c, X86_FEATURE_P3);
+#endif
+
+ /* Work around errata */
+ srat_detect_node(c);
+
+ if (cpu_has(c, X86_FEATURE_VMX))
+ detect_vmx_virtcap(c);
+
+ if (cpu_has(c, X86_FEATURE_TME))
+ detect_tme(c);
+
+ init_intel_energy_perf(c);
+
+ init_intel_misc_features(c);
+
+ if (tsx_ctrl_state == TSX_CTRL_ENABLE)
+ tsx_enable();
+ if (tsx_ctrl_state == TSX_CTRL_DISABLE)
+ tsx_disable();
+}
+
+#ifdef CONFIG_X86_32
+static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
+{
+ /*
+ * Intel PIII Tualatin. This comes in two flavours.
+ * One has 256kb of cache, the other 512. We have no way
+ * to determine which, so we use a boottime override
+ * for the 512kb model, and assume 256 otherwise.
+ */
+ if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0))
+ size = 256;
+
+ /*
+ * Intel Quark SoC X1000 contains a 4-way set associative
+ * 16K cache with a 16 byte cache line and 256 lines per tag
+ */
+ if ((c->x86 == 5) && (c->x86_model == 9))
+ size = 16;
+ return size;
+}
+#endif
+
+#define TLB_INST_4K 0x01
+#define TLB_INST_4M 0x02
+#define TLB_INST_2M_4M 0x03
+
+#define TLB_INST_ALL 0x05
+#define TLB_INST_1G 0x06
+
+#define TLB_DATA_4K 0x11
+#define TLB_DATA_4M 0x12
+#define TLB_DATA_2M_4M 0x13
+#define TLB_DATA_4K_4M 0x14
+
+#define TLB_DATA_1G 0x16
+
+#define TLB_DATA0_4K 0x21
+#define TLB_DATA0_4M 0x22
+#define TLB_DATA0_2M_4M 0x23
+
+#define STLB_4K 0x41
+#define STLB_4K_2M 0x42
+
+static const struct _tlb_table intel_tlb_table[] = {
+ { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
+ { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" },
+ { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" },
+ { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" },
+ { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" },
+ { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" },
+ { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" },
+ { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
+ { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
+ { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
+ { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
+ { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" },
+ { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" },
+ { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" },
+ { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" },
+ { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" },
+ { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
+ { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
+ { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" },
+ { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" },
+ { 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" },
+ { 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" },
+ { 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" },
+ { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
+ { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" },
+ { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
+ { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" },
+ { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" },
+ { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" },
+ { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" },
+ { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" },
+ { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
+ { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
+ { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
+ { 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" },
+ { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
+ { 0x00, 0, 0 }
+};
+
+static void intel_tlb_lookup(const unsigned char desc)
+{
+ unsigned char k;
+ if (desc == 0)
+ return;
+
+ /* look up this descriptor in the table */
+ for (k = 0; intel_tlb_table[k].descriptor != desc && \
+ intel_tlb_table[k].descriptor != 0; k++)
+ ;
+
+ if (intel_tlb_table[k].tlb_type == 0)
+ return;
+
+ switch (intel_tlb_table[k].tlb_type) {
+ case STLB_4K:
+ if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case STLB_4K_2M:
+ if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_INST_ALL:
+ if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_INST_4K:
+ if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_INST_4M:
+ if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_INST_2M_4M:
+ if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_DATA_4K:
+ case TLB_DATA0_4K:
+ if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_DATA_4M:
+ case TLB_DATA0_4M:
+ if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_DATA_2M_4M:
+ case TLB_DATA0_2M_4M:
+ if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_DATA_4K_4M:
+ if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ case TLB_DATA_1G:
+ if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries;
+ break;
+ }
+}
+
+static void intel_detect_tlb(struct cpuinfo_x86 *c)
+{
+ int i, j, n;
+ unsigned int regs[4];
+ unsigned char *desc = (unsigned char *)regs;
+
+ if (c->cpuid_level < 2)
+ return;
+
+ /* Number of times to iterate */
+ n = cpuid_eax(2) & 0xFF;
+
+ for (i = 0 ; i < n ; i++) {
+ cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
+
+ /* If bit 31 is set, this is an unknown format */
+ for (j = 0 ; j < 3 ; j++)
+ if (regs[j] & (1 << 31))
+ regs[j] = 0;
+
+ /* Byte 0 is level count, not a descriptor */
+ for (j = 1 ; j < 16 ; j++)
+ intel_tlb_lookup(desc[j]);
+ }
+}
+
+static const struct cpu_dev intel_cpu_dev = {
+ .c_vendor = "Intel",
+ .c_ident = { "GenuineIntel" },
+#ifdef CONFIG_X86_32
+ .legacy_models = {
+ { .family = 4, .model_names =
+ {
+ [0] = "486 DX-25/33",
+ [1] = "486 DX-50",
+ [2] = "486 SX",
+ [3] = "486 DX/2",
+ [4] = "486 SL",
+ [5] = "486 SX/2",
+ [7] = "486 DX/2-WB",
+ [8] = "486 DX/4",
+ [9] = "486 DX/4-WB"
+ }
+ },
+ { .family = 5, .model_names =
+ {
+ [0] = "Pentium 60/66 A-step",
+ [1] = "Pentium 60/66",
+ [2] = "Pentium 75 - 200",
+ [3] = "OverDrive PODP5V83",
+ [4] = "Pentium MMX",
+ [7] = "Mobile Pentium 75 - 200",
+ [8] = "Mobile Pentium MMX",
+ [9] = "Quark SoC X1000",
+ }
+ },
+ { .family = 6, .model_names =
+ {
+ [0] = "Pentium Pro A-step",
+ [1] = "Pentium Pro",
+ [3] = "Pentium II (Klamath)",
+ [4] = "Pentium II (Deschutes)",
+ [5] = "Pentium II (Deschutes)",
+ [6] = "Mobile Pentium II",
+ [7] = "Pentium III (Katmai)",
+ [8] = "Pentium III (Coppermine)",
+ [10] = "Pentium III (Cascades)",
+ [11] = "Pentium III (Tualatin)",
+ }
+ },
+ { .family = 15, .model_names =
+ {
+ [0] = "Pentium 4 (Unknown)",
+ [1] = "Pentium 4 (Willamette)",
+ [2] = "Pentium 4 (Northwood)",
+ [4] = "Pentium 4 (Foster)",
+ [5] = "Pentium 4 (Foster)",
+ }
+ },
+ },
+ .legacy_cache_size = intel_size_cache,
+#endif
+ .c_detect_tlb = intel_detect_tlb,
+ .c_early_init = early_init_intel,
+ .c_init = init_intel,
+ .c_bsp_resume = intel_bsp_resume,
+ .c_x86_vendor = X86_VENDOR_INTEL,
+};
+
+cpu_dev_register(intel_cpu_dev);
+
diff --git a/arch/x86/kernel/cpu/intel_pconfig.c b/arch/x86/kernel/cpu/intel_pconfig.c
new file mode 100644
index 000000000..0771a905b
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_pconfig.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel PCONFIG instruction support.
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author:
+ * Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+ */
+
+#include <asm/cpufeature.h>
+#include <asm/intel_pconfig.h>
+
+#define PCONFIG_CPUID 0x1b
+
+#define PCONFIG_CPUID_SUBLEAF_MASK ((1 << 12) - 1)
+
+/* Subleaf type (EAX) for PCONFIG CPUID leaf (0x1B) */
+enum {
+ PCONFIG_CPUID_SUBLEAF_INVALID = 0,
+ PCONFIG_CPUID_SUBLEAF_TARGETID = 1,
+};
+
+/* Bitmask of supported targets */
+static u64 targets_supported __read_mostly;
+
+int pconfig_target_supported(enum pconfig_target target)
+{
+ /*
+ * We would need to re-think the implementation once we get > 64
+ * PCONFIG targets. Spec allows up to 2^32 targets.
+ */
+ BUILD_BUG_ON(PCONFIG_TARGET_NR >= 64);
+
+ if (WARN_ON_ONCE(target >= 64))
+ return 0;
+ return targets_supported & (1ULL << target);
+}
+
+static int __init intel_pconfig_init(void)
+{
+ int subleaf;
+
+ if (!boot_cpu_has(X86_FEATURE_PCONFIG))
+ return 0;
+
+ /*
+ * Scan subleafs of PCONFIG CPUID leaf.
+ *
+ * Subleafs of the same type need not to be consecutive.
+ *
+ * Stop on the first invalid subleaf type. All subleafs after the first
+ * invalid are invalid too.
+ */
+ for (subleaf = 0; subleaf < INT_MAX; subleaf++) {
+ struct cpuid_regs regs;
+
+ cpuid_count(PCONFIG_CPUID, subleaf,
+ &regs.eax, &regs.ebx, &regs.ecx, &regs.edx);
+
+ switch (regs.eax & PCONFIG_CPUID_SUBLEAF_MASK) {
+ case PCONFIG_CPUID_SUBLEAF_INVALID:
+ /* Stop on the first invalid subleaf */
+ goto out;
+ case PCONFIG_CPUID_SUBLEAF_TARGETID:
+ /* Mark supported PCONFIG targets */
+ if (regs.ebx < 64)
+ targets_supported |= (1ULL << regs.ebx);
+ if (regs.ecx < 64)
+ targets_supported |= (1ULL << regs.ecx);
+ if (regs.edx < 64)
+ targets_supported |= (1ULL << regs.edx);
+ break;
+ default:
+ /* Unknown CPUID.PCONFIG subleaf: ignore */
+ break;
+ }
+ }
+out:
+ return 0;
+}
+arch_initcall(intel_pconfig_init);
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
new file mode 100644
index 000000000..d4993d42b
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -0,0 +1,915 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Tony Luck <tony.luck@intel.com>
+ * Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/cacheinfo.h>
+#include <linux/cpuhotplug.h>
+
+#include <asm/intel-family.h>
+#include <asm/intel_rdt_sched.h>
+#include "intel_rdt.h"
+
+#define MBA_IS_LINEAR 0x4
+#define MBA_MAX_MBPS U32_MAX
+
+/* Mutex to protect rdtgroup access. */
+DEFINE_MUTEX(rdtgroup_mutex);
+
+/*
+ * The cached intel_pqr_state is strictly per CPU and can never be
+ * updated from a remote CPU. Functions which modify the state
+ * are called with interrupts disabled and no preemption, which
+ * is sufficient for the protection.
+ */
+DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
+
+/*
+ * Used to store the max resource name width and max resource data width
+ * to display the schemata in a tabular format
+ */
+int max_name_width, max_data_width;
+
+/*
+ * Global boolean for rdt_alloc which is true if any
+ * resource allocation is enabled.
+ */
+bool rdt_alloc_capable;
+
+static void
+mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
+static void
+cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
+
+#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains)
+
+struct rdt_resource rdt_resources_all[] = {
+ [RDT_RESOURCE_L3] =
+ {
+ .rid = RDT_RESOURCE_L3,
+ .name = "L3",
+ .domains = domain_init(RDT_RESOURCE_L3),
+ .msr_base = IA32_L3_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 3,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 1,
+ .cbm_idx_offset = 0,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
+ },
+ [RDT_RESOURCE_L3DATA] =
+ {
+ .rid = RDT_RESOURCE_L3DATA,
+ .name = "L3DATA",
+ .domains = domain_init(RDT_RESOURCE_L3DATA),
+ .msr_base = IA32_L3_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 3,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 2,
+ .cbm_idx_offset = 0,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
+ },
+ [RDT_RESOURCE_L3CODE] =
+ {
+ .rid = RDT_RESOURCE_L3CODE,
+ .name = "L3CODE",
+ .domains = domain_init(RDT_RESOURCE_L3CODE),
+ .msr_base = IA32_L3_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 3,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 2,
+ .cbm_idx_offset = 1,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
+ },
+ [RDT_RESOURCE_L2] =
+ {
+ .rid = RDT_RESOURCE_L2,
+ .name = "L2",
+ .domains = domain_init(RDT_RESOURCE_L2),
+ .msr_base = IA32_L2_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 2,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 1,
+ .cbm_idx_offset = 0,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
+ },
+ [RDT_RESOURCE_L2DATA] =
+ {
+ .rid = RDT_RESOURCE_L2DATA,
+ .name = "L2DATA",
+ .domains = domain_init(RDT_RESOURCE_L2DATA),
+ .msr_base = IA32_L2_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 2,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 2,
+ .cbm_idx_offset = 0,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
+ },
+ [RDT_RESOURCE_L2CODE] =
+ {
+ .rid = RDT_RESOURCE_L2CODE,
+ .name = "L2CODE",
+ .domains = domain_init(RDT_RESOURCE_L2CODE),
+ .msr_base = IA32_L2_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ .cache_level = 2,
+ .cache = {
+ .min_cbm_bits = 1,
+ .cbm_idx_mult = 2,
+ .cbm_idx_offset = 1,
+ },
+ .parse_ctrlval = parse_cbm,
+ .format_str = "%d=%0*x",
+ .fflags = RFTYPE_RES_CACHE,
+ },
+ [RDT_RESOURCE_MBA] =
+ {
+ .rid = RDT_RESOURCE_MBA,
+ .name = "MB",
+ .domains = domain_init(RDT_RESOURCE_MBA),
+ .msr_base = IA32_MBA_THRTL_BASE,
+ .msr_update = mba_wrmsr,
+ .cache_level = 3,
+ .parse_ctrlval = parse_bw,
+ .format_str = "%d=%*u",
+ .fflags = RFTYPE_RES_MB,
+ },
+};
+
+static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid)
+{
+ return closid * r->cache.cbm_idx_mult + r->cache.cbm_idx_offset;
+}
+
+/*
+ * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
+ * as they do not have CPUID enumeration support for Cache allocation.
+ * The check for Vendor/Family/Model is not enough to guarantee that
+ * the MSRs won't #GP fault because only the following SKUs support
+ * CAT:
+ * Intel(R) Xeon(R) CPU E5-2658 v3 @ 2.20GHz
+ * Intel(R) Xeon(R) CPU E5-2648L v3 @ 1.80GHz
+ * Intel(R) Xeon(R) CPU E5-2628L v3 @ 2.00GHz
+ * Intel(R) Xeon(R) CPU E5-2618L v3 @ 2.30GHz
+ * Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz
+ * Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz
+ *
+ * Probe by trying to write the first of the L3 cach mask registers
+ * and checking that the bits stick. Max CLOSids is always 4 and max cbm length
+ * is always 20 on hsw server parts. The minimum cache bitmask length
+ * allowed for HSW server is always 2 bits. Hardcode all of them.
+ */
+static inline void cache_alloc_hsw_probe(void)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
+ u32 l, h, max_cbm = BIT_MASK(20) - 1;
+
+ if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0))
+ return;
+ rdmsr(IA32_L3_CBM_BASE, l, h);
+
+ /* If all the bits were set in MSR, return success */
+ if (l != max_cbm)
+ return;
+
+ r->num_closid = 4;
+ r->default_ctrl = max_cbm;
+ r->cache.cbm_len = 20;
+ r->cache.shareable_bits = 0xc0000;
+ r->cache.min_cbm_bits = 2;
+ r->alloc_capable = true;
+ r->alloc_enabled = true;
+
+ rdt_alloc_capable = true;
+}
+
+bool is_mba_sc(struct rdt_resource *r)
+{
+ if (!r)
+ return rdt_resources_all[RDT_RESOURCE_MBA].membw.mba_sc;
+
+ return r->membw.mba_sc;
+}
+
+/*
+ * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values
+ * exposed to user interface and the h/w understandable delay values.
+ *
+ * The non-linear delay values have the granularity of power of two
+ * and also the h/w does not guarantee a curve for configured delay
+ * values vs. actual b/w enforced.
+ * Hence we need a mapping that is pre calibrated so the user can
+ * express the memory b/w as a percentage value.
+ */
+static inline bool rdt_get_mb_table(struct rdt_resource *r)
+{
+ /*
+ * There are no Intel SKUs as of now to support non-linear delay.
+ */
+ pr_info("MBA b/w map not implemented for cpu:%d, model:%d",
+ boot_cpu_data.x86, boot_cpu_data.x86_model);
+
+ return false;
+}
+
+static bool rdt_get_mem_config(struct rdt_resource *r)
+{
+ union cpuid_0x10_3_eax eax;
+ union cpuid_0x10_x_edx edx;
+ u32 ebx, ecx;
+
+ cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full);
+ r->num_closid = edx.split.cos_max + 1;
+ r->membw.max_delay = eax.split.max_delay + 1;
+ r->default_ctrl = MAX_MBA_BW;
+ if (ecx & MBA_IS_LINEAR) {
+ r->membw.delay_linear = true;
+ r->membw.min_bw = MAX_MBA_BW - r->membw.max_delay;
+ r->membw.bw_gran = MAX_MBA_BW - r->membw.max_delay;
+ } else {
+ if (!rdt_get_mb_table(r))
+ return false;
+ }
+ r->data_width = 3;
+
+ r->alloc_capable = true;
+ r->alloc_enabled = true;
+
+ return true;
+}
+
+static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
+{
+ union cpuid_0x10_1_eax eax;
+ union cpuid_0x10_x_edx edx;
+ u32 ebx, ecx;
+
+ cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full);
+ r->num_closid = edx.split.cos_max + 1;
+ r->cache.cbm_len = eax.split.cbm_len + 1;
+ r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
+ r->cache.shareable_bits = ebx & r->default_ctrl;
+ r->data_width = (r->cache.cbm_len + 3) / 4;
+ r->alloc_capable = true;
+ r->alloc_enabled = true;
+}
+
+static void rdt_get_cdp_config(int level, int type)
+{
+ struct rdt_resource *r_l = &rdt_resources_all[level];
+ struct rdt_resource *r = &rdt_resources_all[type];
+
+ r->num_closid = r_l->num_closid / 2;
+ r->cache.cbm_len = r_l->cache.cbm_len;
+ r->default_ctrl = r_l->default_ctrl;
+ r->cache.shareable_bits = r_l->cache.shareable_bits;
+ r->data_width = (r->cache.cbm_len + 3) / 4;
+ r->alloc_capable = true;
+ /*
+ * By default, CDP is disabled. CDP can be enabled by mount parameter
+ * "cdp" during resctrl file system mount time.
+ */
+ r->alloc_enabled = false;
+}
+
+static void rdt_get_cdp_l3_config(void)
+{
+ rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA);
+ rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3CODE);
+}
+
+static void rdt_get_cdp_l2_config(void)
+{
+ rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA);
+ rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2CODE);
+}
+
+static int get_cache_id(int cpu, int level)
+{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
+ int i;
+
+ for (i = 0; i < ci->num_leaves; i++) {
+ if (ci->info_list[i].level == level)
+ return ci->info_list[i].id;
+ }
+
+ return -1;
+}
+
+/*
+ * Map the memory b/w percentage value to delay values
+ * that can be written to QOS_MSRs.
+ * There are currently no SKUs which support non linear delay values.
+ */
+u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
+{
+ if (r->membw.delay_linear)
+ return MAX_MBA_BW - bw;
+
+ pr_warn_once("Non Linear delay-bw map not supported but queried\n");
+ return r->default_ctrl;
+}
+
+static void
+mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
+{
+ unsigned int i;
+
+ /* Write the delay values for mba. */
+ for (i = m->low; i < m->high; i++)
+ wrmsrl(r->msr_base + i, delay_bw_map(d->ctrl_val[i], r));
+}
+
+static void
+cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
+{
+ unsigned int i;
+
+ for (i = m->low; i < m->high; i++)
+ wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]);
+}
+
+struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r)
+{
+ struct rdt_domain *d;
+
+ list_for_each_entry(d, &r->domains, list) {
+ /* Find the domain that contains this CPU */
+ if (cpumask_test_cpu(cpu, &d->cpu_mask))
+ return d;
+ }
+
+ return NULL;
+}
+
+void rdt_ctrl_update(void *arg)
+{
+ struct msr_param *m = arg;
+ struct rdt_resource *r = m->res;
+ int cpu = smp_processor_id();
+ struct rdt_domain *d;
+
+ d = get_domain_from_cpu(cpu, r);
+ if (d) {
+ r->msr_update(d, m, r);
+ return;
+ }
+ pr_warn_once("cpu %d not found in any domain for resource %s\n",
+ cpu, r->name);
+}
+
+/*
+ * rdt_find_domain - Find a domain in a resource that matches input resource id
+ *
+ * Search resource r's domain list to find the resource id. If the resource
+ * id is found in a domain, return the domain. Otherwise, if requested by
+ * caller, return the first domain whose id is bigger than the input id.
+ * The domain list is sorted by id in ascending order.
+ */
+struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
+ struct list_head **pos)
+{
+ struct rdt_domain *d;
+ struct list_head *l;
+
+ if (id < 0)
+ return ERR_PTR(-ENODEV);
+
+ list_for_each(l, &r->domains) {
+ d = list_entry(l, struct rdt_domain, list);
+ /* When id is found, return its domain. */
+ if (id == d->id)
+ return d;
+ /* Stop searching when finding id's position in sorted list. */
+ if (id < d->id)
+ break;
+ }
+
+ if (pos)
+ *pos = l;
+
+ return NULL;
+}
+
+void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm)
+{
+ int i;
+
+ /*
+ * Initialize the Control MSRs to having no control.
+ * For Cache Allocation: Set all bits in cbm
+ * For Memory Allocation: Set b/w requested to 100%
+ * and the bandwidth in MBps to U32_MAX
+ */
+ for (i = 0; i < r->num_closid; i++, dc++, dm++) {
+ *dc = r->default_ctrl;
+ *dm = MBA_MAX_MBPS;
+ }
+}
+
+static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
+{
+ struct msr_param m;
+ u32 *dc, *dm;
+
+ dc = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL);
+ if (!dc)
+ return -ENOMEM;
+
+ dm = kmalloc_array(r->num_closid, sizeof(*d->mbps_val), GFP_KERNEL);
+ if (!dm) {
+ kfree(dc);
+ return -ENOMEM;
+ }
+
+ d->ctrl_val = dc;
+ d->mbps_val = dm;
+ setup_default_ctrlval(r, dc, dm);
+
+ m.low = 0;
+ m.high = r->num_closid;
+ r->msr_update(d, &m, r);
+ return 0;
+}
+
+static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
+{
+ size_t tsize;
+
+ if (is_llc_occupancy_enabled()) {
+ d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid),
+ sizeof(unsigned long),
+ GFP_KERNEL);
+ if (!d->rmid_busy_llc)
+ return -ENOMEM;
+ INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
+ }
+ if (is_mbm_total_enabled()) {
+ tsize = sizeof(*d->mbm_total);
+ d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
+ if (!d->mbm_total) {
+ kfree(d->rmid_busy_llc);
+ return -ENOMEM;
+ }
+ }
+ if (is_mbm_local_enabled()) {
+ tsize = sizeof(*d->mbm_local);
+ d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
+ if (!d->mbm_local) {
+ kfree(d->rmid_busy_llc);
+ kfree(d->mbm_total);
+ return -ENOMEM;
+ }
+ }
+
+ if (is_mbm_enabled()) {
+ INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
+ mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL);
+ }
+
+ return 0;
+}
+
+/*
+ * domain_add_cpu - Add a cpu to a resource's domain list.
+ *
+ * If an existing domain in the resource r's domain list matches the cpu's
+ * resource id, add the cpu in the domain.
+ *
+ * Otherwise, a new domain is allocated and inserted into the right position
+ * in the domain list sorted by id in ascending order.
+ *
+ * The order in the domain list is visible to users when we print entries
+ * in the schemata file and schemata input is validated to have the same order
+ * as this list.
+ */
+static void domain_add_cpu(int cpu, struct rdt_resource *r)
+{
+ int id = get_cache_id(cpu, r->cache_level);
+ struct list_head *add_pos = NULL;
+ struct rdt_domain *d;
+
+ d = rdt_find_domain(r, id, &add_pos);
+ if (IS_ERR(d)) {
+ pr_warn("Could't find cache id for cpu %d\n", cpu);
+ return;
+ }
+
+ if (d) {
+ cpumask_set_cpu(cpu, &d->cpu_mask);
+ return;
+ }
+
+ d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu));
+ if (!d)
+ return;
+
+ d->id = id;
+ cpumask_set_cpu(cpu, &d->cpu_mask);
+
+ rdt_domain_reconfigure_cdp(r);
+
+ if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
+ kfree(d);
+ return;
+ }
+
+ if (r->mon_capable && domain_setup_mon_state(r, d)) {
+ kfree(d->ctrl_val);
+ kfree(d->mbps_val);
+ kfree(d);
+ return;
+ }
+
+ list_add_tail(&d->list, add_pos);
+
+ /*
+ * If resctrl is mounted, add
+ * per domain monitor data directories.
+ */
+ if (static_branch_unlikely(&rdt_mon_enable_key))
+ mkdir_mondata_subdir_allrdtgrp(r, d);
+}
+
+static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+{
+ int id = get_cache_id(cpu, r->cache_level);
+ struct rdt_domain *d;
+
+ d = rdt_find_domain(r, id, NULL);
+ if (IS_ERR_OR_NULL(d)) {
+ pr_warn("Could't find cache id for cpu %d\n", cpu);
+ return;
+ }
+
+ cpumask_clear_cpu(cpu, &d->cpu_mask);
+ if (cpumask_empty(&d->cpu_mask)) {
+ /*
+ * If resctrl is mounted, remove all the
+ * per domain monitor data directories.
+ */
+ if (static_branch_unlikely(&rdt_mon_enable_key))
+ rmdir_mondata_subdir_allrdtgrp(r, d->id);
+ list_del(&d->list);
+ if (r->mon_capable && is_mbm_enabled())
+ cancel_delayed_work(&d->mbm_over);
+ if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) {
+ /*
+ * When a package is going down, forcefully
+ * decrement rmid->ebusy. There is no way to know
+ * that the L3 was flushed and hence may lead to
+ * incorrect counts in rare scenarios, but leaving
+ * the RMID as busy creates RMID leaks if the
+ * package never comes back.
+ */
+ __check_limbo(d, true);
+ cancel_delayed_work(&d->cqm_limbo);
+ }
+
+ /*
+ * rdt_domain "d" is going to be freed below, so clear
+ * its pointer from pseudo_lock_region struct.
+ */
+ if (d->plr)
+ d->plr->d = NULL;
+
+ kfree(d->ctrl_val);
+ kfree(d->mbps_val);
+ kfree(d->rmid_busy_llc);
+ kfree(d->mbm_total);
+ kfree(d->mbm_local);
+ kfree(d);
+ return;
+ }
+
+ if (r == &rdt_resources_all[RDT_RESOURCE_L3]) {
+ if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
+ cancel_delayed_work(&d->mbm_over);
+ mbm_setup_overflow_handler(d, 0);
+ }
+ if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
+ has_busy_rmid(r, d)) {
+ cancel_delayed_work(&d->cqm_limbo);
+ cqm_setup_limbo_handler(d, 0);
+ }
+ }
+}
+
+static void clear_closid_rmid(int cpu)
+{
+ struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+
+ state->default_closid = 0;
+ state->default_rmid = 0;
+ state->cur_closid = 0;
+ state->cur_rmid = 0;
+ wrmsr(IA32_PQR_ASSOC, 0, 0);
+}
+
+static int intel_rdt_online_cpu(unsigned int cpu)
+{
+ struct rdt_resource *r;
+
+ mutex_lock(&rdtgroup_mutex);
+ for_each_capable_rdt_resource(r)
+ domain_add_cpu(cpu, r);
+ /* The cpu is set in default rdtgroup after online. */
+ cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
+ clear_closid_rmid(cpu);
+ mutex_unlock(&rdtgroup_mutex);
+
+ return 0;
+}
+
+static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
+{
+ struct rdtgroup *cr;
+
+ list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
+ if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) {
+ break;
+ }
+ }
+}
+
+static int intel_rdt_offline_cpu(unsigned int cpu)
+{
+ struct rdtgroup *rdtgrp;
+ struct rdt_resource *r;
+
+ mutex_lock(&rdtgroup_mutex);
+ for_each_capable_rdt_resource(r)
+ domain_remove_cpu(cpu, r);
+ list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
+ if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
+ clear_childcpus(rdtgrp, cpu);
+ break;
+ }
+ }
+ clear_closid_rmid(cpu);
+ mutex_unlock(&rdtgroup_mutex);
+
+ return 0;
+}
+
+/*
+ * Choose a width for the resource name and resource data based on the
+ * resource that has widest name and cbm.
+ */
+static __init void rdt_init_padding(void)
+{
+ struct rdt_resource *r;
+ int cl;
+
+ for_each_alloc_capable_rdt_resource(r) {
+ cl = strlen(r->name);
+ if (cl > max_name_width)
+ max_name_width = cl;
+
+ if (r->data_width > max_data_width)
+ max_data_width = r->data_width;
+ }
+}
+
+enum {
+ RDT_FLAG_CMT,
+ RDT_FLAG_MBM_TOTAL,
+ RDT_FLAG_MBM_LOCAL,
+ RDT_FLAG_L3_CAT,
+ RDT_FLAG_L3_CDP,
+ RDT_FLAG_L2_CAT,
+ RDT_FLAG_L2_CDP,
+ RDT_FLAG_MBA,
+};
+
+#define RDT_OPT(idx, n, f) \
+[idx] = { \
+ .name = n, \
+ .flag = f \
+}
+
+struct rdt_options {
+ char *name;
+ int flag;
+ bool force_off, force_on;
+};
+
+static struct rdt_options rdt_options[] __initdata = {
+ RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC),
+ RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL),
+ RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL),
+ RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3),
+ RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3),
+ RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2),
+ RDT_OPT(RDT_FLAG_L2_CDP, "l2cdp", X86_FEATURE_CDP_L2),
+ RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA),
+};
+#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
+
+static int __init set_rdt_options(char *str)
+{
+ struct rdt_options *o;
+ bool force_off;
+ char *tok;
+
+ if (*str == '=')
+ str++;
+ while ((tok = strsep(&str, ",")) != NULL) {
+ force_off = *tok == '!';
+ if (force_off)
+ tok++;
+ for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
+ if (strcmp(tok, o->name) == 0) {
+ if (force_off)
+ o->force_off = true;
+ else
+ o->force_on = true;
+ break;
+ }
+ }
+ }
+ return 1;
+}
+__setup("rdt", set_rdt_options);
+
+static bool __init rdt_cpu_has(int flag)
+{
+ bool ret = boot_cpu_has(flag);
+ struct rdt_options *o;
+
+ if (!ret)
+ return ret;
+
+ for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
+ if (flag == o->flag) {
+ if (o->force_off)
+ ret = false;
+ if (o->force_on)
+ ret = true;
+ break;
+ }
+ }
+ return ret;
+}
+
+static __init bool get_rdt_alloc_resources(void)
+{
+ bool ret = false;
+
+ if (rdt_alloc_capable)
+ return true;
+
+ if (!boot_cpu_has(X86_FEATURE_RDT_A))
+ return false;
+
+ if (rdt_cpu_has(X86_FEATURE_CAT_L3)) {
+ rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]);
+ if (rdt_cpu_has(X86_FEATURE_CDP_L3))
+ rdt_get_cdp_l3_config();
+ ret = true;
+ }
+ if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
+ /* CPUID 0x10.2 fields are same format at 0x10.1 */
+ rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]);
+ if (rdt_cpu_has(X86_FEATURE_CDP_L2))
+ rdt_get_cdp_l2_config();
+ ret = true;
+ }
+
+ if (rdt_cpu_has(X86_FEATURE_MBA)) {
+ if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA]))
+ ret = true;
+ }
+ return ret;
+}
+
+static __init bool get_rdt_mon_resources(void)
+{
+ if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC))
+ rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID);
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL))
+ rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID);
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))
+ rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID);
+
+ if (!rdt_mon_features)
+ return false;
+
+ return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]);
+}
+
+static __init void rdt_quirks(void)
+{
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_HASWELL_X:
+ if (!rdt_options[RDT_FLAG_L3_CAT].force_off)
+ cache_alloc_hsw_probe();
+ break;
+ case INTEL_FAM6_SKYLAKE_X:
+ if (boot_cpu_data.x86_stepping <= 4)
+ set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
+ else
+ set_rdt_options("!l3cat");
+ }
+}
+
+static __init bool get_rdt_resources(void)
+{
+ rdt_quirks();
+ rdt_alloc_capable = get_rdt_alloc_resources();
+ rdt_mon_capable = get_rdt_mon_resources();
+
+ return (rdt_mon_capable || rdt_alloc_capable);
+}
+
+static enum cpuhp_state rdt_online;
+
+static int __init intel_rdt_late_init(void)
+{
+ struct rdt_resource *r;
+ int state, ret;
+
+ if (!get_rdt_resources())
+ return -ENODEV;
+
+ rdt_init_padding();
+
+ state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "x86/rdt/cat:online:",
+ intel_rdt_online_cpu, intel_rdt_offline_cpu);
+ if (state < 0)
+ return state;
+
+ ret = rdtgroup_init();
+ if (ret) {
+ cpuhp_remove_state(state);
+ return ret;
+ }
+ rdt_online = state;
+
+ for_each_alloc_capable_rdt_resource(r)
+ pr_info("Intel RDT %s allocation detected\n", r->name);
+
+ for_each_mon_capable_rdt_resource(r)
+ pr_info("Intel RDT %s monitoring detected\n", r->name);
+
+ return 0;
+}
+
+late_initcall(intel_rdt_late_init);
+
+static void __exit intel_rdt_exit(void)
+{
+ cpuhp_remove_state(rdt_online);
+ rdtgroup_exit();
+}
+
+__exitcall(intel_rdt_exit);
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
new file mode 100644
index 000000000..8412234ea
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -0,0 +1,570 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INTEL_RDT_H
+#define _ASM_X86_INTEL_RDT_H
+
+#include <linux/sched.h>
+#include <linux/kernfs.h>
+#include <linux/jump_label.h>
+
+#define IA32_L3_QOS_CFG 0xc81
+#define IA32_L2_QOS_CFG 0xc82
+#define IA32_L3_CBM_BASE 0xc90
+#define IA32_L2_CBM_BASE 0xd10
+#define IA32_MBA_THRTL_BASE 0xd50
+
+#define L3_QOS_CDP_ENABLE 0x01ULL
+
+#define L2_QOS_CDP_ENABLE 0x01ULL
+
+/*
+ * Event IDs are used to program IA32_QM_EVTSEL before reading event
+ * counter from IA32_QM_CTR
+ */
+#define QOS_L3_OCCUP_EVENT_ID 0x01
+#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02
+#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03
+
+#define CQM_LIMBOCHECK_INTERVAL 1000
+
+#define MBM_CNTR_WIDTH 24
+#define MBM_OVERFLOW_INTERVAL 1000
+#define MAX_MBA_BW 100u
+
+#define RMID_VAL_ERROR BIT_ULL(63)
+#define RMID_VAL_UNAVAIL BIT_ULL(62)
+
+DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
+
+/**
+ * struct mon_evt - Entry in the event list of a resource
+ * @evtid: event id
+ * @name: name of the event
+ */
+struct mon_evt {
+ u32 evtid;
+ char *name;
+ struct list_head list;
+};
+
+/**
+ * struct mon_data_bits - Monitoring details for each event file
+ * @rid: Resource id associated with the event file.
+ * @evtid: Event id associated with the event file
+ * @domid: The domain to which the event file belongs
+ */
+union mon_data_bits {
+ void *priv;
+ struct {
+ unsigned int rid : 10;
+ unsigned int evtid : 8;
+ unsigned int domid : 14;
+ } u;
+};
+
+struct rmid_read {
+ struct rdtgroup *rgrp;
+ struct rdt_domain *d;
+ int evtid;
+ bool first;
+ u64 val;
+};
+
+extern unsigned int intel_cqm_threshold;
+extern bool rdt_alloc_capable;
+extern bool rdt_mon_capable;
+extern unsigned int rdt_mon_features;
+
+enum rdt_group_type {
+ RDTCTRL_GROUP = 0,
+ RDTMON_GROUP,
+ RDT_NUM_GROUP,
+};
+
+/**
+ * enum rdtgrp_mode - Mode of a RDT resource group
+ * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations
+ * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed
+ * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking
+ * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations
+ * allowed AND the allocations are Cache Pseudo-Locked
+ *
+ * The mode of a resource group enables control over the allowed overlap
+ * between allocations associated with different resource groups (classes
+ * of service). User is able to modify the mode of a resource group by
+ * writing to the "mode" resctrl file associated with the resource group.
+ *
+ * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by
+ * writing the appropriate text to the "mode" file. A resource group enters
+ * "pseudo-locked" mode after the schemata is written while the resource
+ * group is in "pseudo-locksetup" mode.
+ */
+enum rdtgrp_mode {
+ RDT_MODE_SHAREABLE = 0,
+ RDT_MODE_EXCLUSIVE,
+ RDT_MODE_PSEUDO_LOCKSETUP,
+ RDT_MODE_PSEUDO_LOCKED,
+
+ /* Must be last */
+ RDT_NUM_MODES,
+};
+
+/**
+ * struct mongroup - store mon group's data in resctrl fs.
+ * @mon_data_kn kernlfs node for the mon_data directory
+ * @parent: parent rdtgrp
+ * @crdtgrp_list: child rdtgroup node list
+ * @rmid: rmid for this rdtgroup
+ */
+struct mongroup {
+ struct kernfs_node *mon_data_kn;
+ struct rdtgroup *parent;
+ struct list_head crdtgrp_list;
+ u32 rmid;
+};
+
+/**
+ * struct pseudo_lock_region - pseudo-lock region information
+ * @r: RDT resource to which this pseudo-locked region
+ * belongs
+ * @d: RDT domain to which this pseudo-locked region
+ * belongs
+ * @cbm: bitmask of the pseudo-locked region
+ * @lock_thread_wq: waitqueue used to wait on the pseudo-locking thread
+ * completion
+ * @thread_done: variable used by waitqueue to test if pseudo-locking
+ * thread completed
+ * @cpu: core associated with the cache on which the setup code
+ * will be run
+ * @line_size: size of the cache lines
+ * @size: size of pseudo-locked region in bytes
+ * @kmem: the kernel memory associated with pseudo-locked region
+ * @minor: minor number of character device associated with this
+ * region
+ * @debugfs_dir: pointer to this region's directory in the debugfs
+ * filesystem
+ * @pm_reqs: Power management QoS requests related to this region
+ */
+struct pseudo_lock_region {
+ struct rdt_resource *r;
+ struct rdt_domain *d;
+ u32 cbm;
+ wait_queue_head_t lock_thread_wq;
+ int thread_done;
+ int cpu;
+ unsigned int line_size;
+ unsigned int size;
+ void *kmem;
+ unsigned int minor;
+ struct dentry *debugfs_dir;
+ struct list_head pm_reqs;
+};
+
+/**
+ * struct rdtgroup - store rdtgroup's data in resctrl file system.
+ * @kn: kernfs node
+ * @rdtgroup_list: linked list for all rdtgroups
+ * @closid: closid for this rdtgroup
+ * @cpu_mask: CPUs assigned to this rdtgroup
+ * @flags: status bits
+ * @waitcount: how many cpus expect to find this
+ * group when they acquire rdtgroup_mutex
+ * @type: indicates type of this rdtgroup - either
+ * monitor only or ctrl_mon group
+ * @mon: mongroup related data
+ * @mode: mode of resource group
+ * @plr: pseudo-locked region
+ */
+struct rdtgroup {
+ struct kernfs_node *kn;
+ struct list_head rdtgroup_list;
+ u32 closid;
+ struct cpumask cpu_mask;
+ int flags;
+ atomic_t waitcount;
+ enum rdt_group_type type;
+ struct mongroup mon;
+ enum rdtgrp_mode mode;
+ struct pseudo_lock_region *plr;
+};
+
+/* rdtgroup.flags */
+#define RDT_DELETED 1
+
+/* rftype.flags */
+#define RFTYPE_FLAGS_CPUS_LIST 1
+
+/*
+ * Define the file type flags for base and info directories.
+ */
+#define RFTYPE_INFO BIT(0)
+#define RFTYPE_BASE BIT(1)
+#define RF_CTRLSHIFT 4
+#define RF_MONSHIFT 5
+#define RF_TOPSHIFT 6
+#define RFTYPE_CTRL BIT(RF_CTRLSHIFT)
+#define RFTYPE_MON BIT(RF_MONSHIFT)
+#define RFTYPE_TOP BIT(RF_TOPSHIFT)
+#define RFTYPE_RES_CACHE BIT(8)
+#define RFTYPE_RES_MB BIT(9)
+#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL)
+#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON)
+#define RF_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP)
+#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL)
+
+/* List of all resource groups */
+extern struct list_head rdt_all_groups;
+
+extern int max_name_width, max_data_width;
+
+int __init rdtgroup_init(void);
+void __exit rdtgroup_exit(void);
+
+/**
+ * struct rftype - describe each file in the resctrl file system
+ * @name: File name
+ * @mode: Access mode
+ * @kf_ops: File operations
+ * @flags: File specific RFTYPE_FLAGS_* flags
+ * @fflags: File specific RF_* or RFTYPE_* flags
+ * @seq_show: Show content of the file
+ * @write: Write to the file
+ */
+struct rftype {
+ char *name;
+ umode_t mode;
+ struct kernfs_ops *kf_ops;
+ unsigned long flags;
+ unsigned long fflags;
+
+ int (*seq_show)(struct kernfs_open_file *of,
+ struct seq_file *sf, void *v);
+ /*
+ * write() is the generic write callback which maps directly to
+ * kernfs write operation and overrides all other operations.
+ * Maximum write size is determined by ->max_write_len.
+ */
+ ssize_t (*write)(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+};
+
+/**
+ * struct mbm_state - status for each MBM counter in each domain
+ * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes)
+ * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it
+ * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting
+ * @prev_bw The most recent bandwidth in MBps
+ * @delta_bw Difference between the current and previous bandwidth
+ * @delta_comp Indicates whether to compute the delta_bw
+ */
+struct mbm_state {
+ u64 chunks;
+ u64 prev_msr;
+ u64 prev_bw_msr;
+ u32 prev_bw;
+ u32 delta_bw;
+ bool delta_comp;
+};
+
+/**
+ * struct rdt_domain - group of cpus sharing an RDT resource
+ * @list: all instances of this resource
+ * @id: unique id for this instance
+ * @cpu_mask: which cpus share this resource
+ * @rmid_busy_llc:
+ * bitmap of which limbo RMIDs are above threshold
+ * @mbm_total: saved state for MBM total bandwidth
+ * @mbm_local: saved state for MBM local bandwidth
+ * @mbm_over: worker to periodically read MBM h/w counters
+ * @cqm_limbo: worker to periodically read CQM h/w counters
+ * @mbm_work_cpu:
+ * worker cpu for MBM h/w counters
+ * @cqm_work_cpu:
+ * worker cpu for CQM h/w counters
+ * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID)
+ * @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps
+ * @new_ctrl: new ctrl value to be loaded
+ * @have_new_ctrl: did user provide new_ctrl for this domain
+ * @plr: pseudo-locked region (if any) associated with domain
+ */
+struct rdt_domain {
+ struct list_head list;
+ int id;
+ struct cpumask cpu_mask;
+ unsigned long *rmid_busy_llc;
+ struct mbm_state *mbm_total;
+ struct mbm_state *mbm_local;
+ struct delayed_work mbm_over;
+ struct delayed_work cqm_limbo;
+ int mbm_work_cpu;
+ int cqm_work_cpu;
+ u32 *ctrl_val;
+ u32 *mbps_val;
+ u32 new_ctrl;
+ bool have_new_ctrl;
+ struct pseudo_lock_region *plr;
+};
+
+/**
+ * struct msr_param - set a range of MSRs from a domain
+ * @res: The resource to use
+ * @low: Beginning index from base MSR
+ * @high: End index
+ */
+struct msr_param {
+ struct rdt_resource *res;
+ int low;
+ int high;
+};
+
+/**
+ * struct rdt_cache - Cache allocation related data
+ * @cbm_len: Length of the cache bit mask
+ * @min_cbm_bits: Minimum number of consecutive bits to be set
+ * @cbm_idx_mult: Multiplier of CBM index
+ * @cbm_idx_offset: Offset of CBM index. CBM index is computed by:
+ * closid * cbm_idx_multi + cbm_idx_offset
+ * in a cache bit mask
+ * @shareable_bits: Bitmask of shareable resource with other
+ * executing entities
+ */
+struct rdt_cache {
+ unsigned int cbm_len;
+ unsigned int min_cbm_bits;
+ unsigned int cbm_idx_mult;
+ unsigned int cbm_idx_offset;
+ unsigned int shareable_bits;
+};
+
+/**
+ * struct rdt_membw - Memory bandwidth allocation related data
+ * @max_delay: Max throttle delay. Delay is the hardware
+ * representation for memory bandwidth.
+ * @min_bw: Minimum memory bandwidth percentage user can request
+ * @bw_gran: Granularity at which the memory bandwidth is allocated
+ * @delay_linear: True if memory B/W delay is in linear scale
+ * @mba_sc: True if MBA software controller(mba_sc) is enabled
+ * @mb_map: Mapping of memory B/W percentage to memory B/W delay
+ */
+struct rdt_membw {
+ u32 max_delay;
+ u32 min_bw;
+ u32 bw_gran;
+ u32 delay_linear;
+ bool mba_sc;
+ u32 *mb_map;
+};
+
+static inline bool is_llc_occupancy_enabled(void)
+{
+ return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID));
+}
+
+static inline bool is_mbm_total_enabled(void)
+{
+ return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID));
+}
+
+static inline bool is_mbm_local_enabled(void)
+{
+ return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID));
+}
+
+static inline bool is_mbm_enabled(void)
+{
+ return (is_mbm_total_enabled() || is_mbm_local_enabled());
+}
+
+static inline bool is_mbm_event(int e)
+{
+ return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
+ e <= QOS_L3_MBM_LOCAL_EVENT_ID);
+}
+
+struct rdt_parse_data {
+ struct rdtgroup *rdtgrp;
+ char *buf;
+};
+
+/**
+ * struct rdt_resource - attributes of an RDT resource
+ * @rid: The index of the resource
+ * @alloc_enabled: Is allocation enabled on this machine
+ * @mon_enabled: Is monitoring enabled for this feature
+ * @alloc_capable: Is allocation available on this machine
+ * @mon_capable: Is monitor feature available on this machine
+ * @name: Name to use in "schemata" file
+ * @num_closid: Number of CLOSIDs available
+ * @cache_level: Which cache level defines scope of this resource
+ * @default_ctrl: Specifies default cache cbm or memory B/W percent.
+ * @msr_base: Base MSR address for CBMs
+ * @msr_update: Function pointer to update QOS MSRs
+ * @data_width: Character width of data when displaying
+ * @domains: All domains for this resource
+ * @cache: Cache allocation related data
+ * @format_str: Per resource format string to show domain value
+ * @parse_ctrlval: Per resource function pointer to parse control values
+ * @evt_list: List of monitoring events
+ * @num_rmid: Number of RMIDs available
+ * @mon_scale: cqm counter * mon_scale = occupancy in bytes
+ * @fflags: flags to choose base and info files
+ */
+struct rdt_resource {
+ int rid;
+ bool alloc_enabled;
+ bool mon_enabled;
+ bool alloc_capable;
+ bool mon_capable;
+ char *name;
+ int num_closid;
+ int cache_level;
+ u32 default_ctrl;
+ unsigned int msr_base;
+ void (*msr_update) (struct rdt_domain *d, struct msr_param *m,
+ struct rdt_resource *r);
+ int data_width;
+ struct list_head domains;
+ struct rdt_cache cache;
+ struct rdt_membw membw;
+ const char *format_str;
+ int (*parse_ctrlval)(struct rdt_parse_data *data,
+ struct rdt_resource *r,
+ struct rdt_domain *d);
+ struct list_head evt_list;
+ int num_rmid;
+ unsigned int mon_scale;
+ unsigned long fflags;
+};
+
+int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r,
+ struct rdt_domain *d);
+int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r,
+ struct rdt_domain *d);
+
+extern struct mutex rdtgroup_mutex;
+
+extern struct rdt_resource rdt_resources_all[];
+extern struct rdtgroup rdtgroup_default;
+DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+
+extern struct dentry *debugfs_resctrl;
+
+enum {
+ RDT_RESOURCE_L3,
+ RDT_RESOURCE_L3DATA,
+ RDT_RESOURCE_L3CODE,
+ RDT_RESOURCE_L2,
+ RDT_RESOURCE_L2DATA,
+ RDT_RESOURCE_L2CODE,
+ RDT_RESOURCE_MBA,
+
+ /* Must be the last */
+ RDT_NUM_RESOURCES,
+};
+
+#define for_each_capable_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->alloc_capable || r->mon_capable)
+
+#define for_each_alloc_capable_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->alloc_capable)
+
+#define for_each_mon_capable_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->mon_capable)
+
+#define for_each_alloc_enabled_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->alloc_enabled)
+
+#define for_each_mon_enabled_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->mon_enabled)
+
+/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
+union cpuid_0x10_1_eax {
+ struct {
+ unsigned int cbm_len:5;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID=3).EAX */
+union cpuid_0x10_3_eax {
+ struct {
+ unsigned int max_delay:12;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID).EDX */
+union cpuid_0x10_x_edx {
+ struct {
+ unsigned int cos_max:16;
+ } split;
+ unsigned int full;
+};
+
+void rdt_last_cmd_clear(void);
+void rdt_last_cmd_puts(const char *s);
+void rdt_last_cmd_printf(const char *fmt, ...);
+
+void rdt_ctrl_update(void *arg);
+struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
+void rdtgroup_kn_unlock(struct kernfs_node *kn);
+int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name);
+int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
+ umode_t mask);
+struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
+ struct list_head **pos);
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v);
+bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
+ unsigned long cbm, int closid, bool exclusive);
+unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d,
+ unsigned long cbm);
+enum rdtgrp_mode rdtgroup_mode_by_closid(int closid);
+int rdtgroup_tasks_assigned(struct rdtgroup *r);
+int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
+int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm);
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d);
+int rdt_pseudo_lock_init(void);
+void rdt_pseudo_lock_release(void);
+int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
+void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
+struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
+int update_domains(struct rdt_resource *r, int closid);
+int closids_supported(void);
+void closid_free(int closid);
+int alloc_rmid(void);
+void free_rmid(u32 rmid);
+int rdt_get_mon_l3_config(struct rdt_resource *r);
+void mon_event_count(void *info);
+int rdtgroup_mondata_show(struct seq_file *m, void *arg);
+void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+ unsigned int dom_id);
+void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+ struct rdt_domain *d);
+void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
+ struct rdtgroup *rdtgrp, int evtid, int first);
+void mbm_setup_overflow_handler(struct rdt_domain *dom,
+ unsigned long delay_ms);
+void mbm_handle_overflow(struct work_struct *work);
+bool is_mba_sc(struct rdt_resource *r);
+void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm);
+u32 delay_bw_map(unsigned long bw, struct rdt_resource *r);
+void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
+void cqm_handle_limbo(struct work_struct *work);
+bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
+void __check_limbo(struct rdt_domain *d, bool force_free);
+void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
+
+#endif /* _ASM_X86_INTEL_RDT_H */
diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
new file mode 100644
index 000000000..2052e1e6a
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -0,0 +1,491 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Tony Luck <tony.luck@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/kernfs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include "intel_rdt.h"
+
+/*
+ * Check whether MBA bandwidth percentage value is correct. The value is
+ * checked against the minimum and max bandwidth values specified by the
+ * hardware. The allocated bandwidth percentage is rounded to the next
+ * control step available on the hardware.
+ */
+static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
+{
+ unsigned long bw;
+ int ret;
+
+ /*
+ * Only linear delay values is supported for current Intel SKUs.
+ */
+ if (!r->membw.delay_linear) {
+ rdt_last_cmd_puts("No support for non-linear MB domains\n");
+ return false;
+ }
+
+ ret = kstrtoul(buf, 10, &bw);
+ if (ret) {
+ rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf);
+ return false;
+ }
+
+ if ((bw < r->membw.min_bw || bw > r->default_ctrl) &&
+ !is_mba_sc(r)) {
+ rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw,
+ r->membw.min_bw, r->default_ctrl);
+ return false;
+ }
+
+ *data = roundup(bw, (unsigned long)r->membw.bw_gran);
+ return true;
+}
+
+int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r,
+ struct rdt_domain *d)
+{
+ unsigned long bw_val;
+
+ if (d->have_new_ctrl) {
+ rdt_last_cmd_printf("duplicate domain %d\n", d->id);
+ return -EINVAL;
+ }
+
+ if (!bw_validate(data->buf, &bw_val, r))
+ return -EINVAL;
+ d->new_ctrl = bw_val;
+ d->have_new_ctrl = true;
+
+ return 0;
+}
+
+/*
+ * Check whether a cache bit mask is valid. The SDM says:
+ * Please note that all (and only) contiguous '1' combinations
+ * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
+ * Additionally Haswell requires at least two bits set.
+ */
+static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
+{
+ unsigned long first_bit, zero_bit, val;
+ unsigned int cbm_len = r->cache.cbm_len;
+ int ret;
+
+ ret = kstrtoul(buf, 16, &val);
+ if (ret) {
+ rdt_last_cmd_printf("non-hex character in mask %s\n", buf);
+ return false;
+ }
+
+ if (val == 0 || val > r->default_ctrl) {
+ rdt_last_cmd_puts("mask out of range\n");
+ return false;
+ }
+
+ first_bit = find_first_bit(&val, cbm_len);
+ zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
+
+ if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) {
+ rdt_last_cmd_printf("mask %lx has non-consecutive 1-bits\n", val);
+ return false;
+ }
+
+ if ((zero_bit - first_bit) < r->cache.min_cbm_bits) {
+ rdt_last_cmd_printf("Need at least %d bits in mask\n",
+ r->cache.min_cbm_bits);
+ return false;
+ }
+
+ *data = val;
+ return true;
+}
+
+/*
+ * Read one cache bit mask (hex). Check that it is valid for the current
+ * resource type.
+ */
+int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r,
+ struct rdt_domain *d)
+{
+ struct rdtgroup *rdtgrp = data->rdtgrp;
+ u32 cbm_val;
+
+ if (d->have_new_ctrl) {
+ rdt_last_cmd_printf("duplicate domain %d\n", d->id);
+ return -EINVAL;
+ }
+
+ /*
+ * Cannot set up more than one pseudo-locked region in a cache
+ * hierarchy.
+ */
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
+ rdtgroup_pseudo_locked_in_hierarchy(d)) {
+ rdt_last_cmd_printf("pseudo-locked region in hierarchy\n");
+ return -EINVAL;
+ }
+
+ if (!cbm_validate(data->buf, &cbm_val, r))
+ return -EINVAL;
+
+ if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
+ rdtgrp->mode == RDT_MODE_SHAREABLE) &&
+ rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) {
+ rdt_last_cmd_printf("CBM overlaps with pseudo-locked region\n");
+ return -EINVAL;
+ }
+
+ /*
+ * The CBM may not overlap with the CBM of another closid if
+ * either is exclusive.
+ */
+ if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, true)) {
+ rdt_last_cmd_printf("overlaps with exclusive group\n");
+ return -EINVAL;
+ }
+
+ if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, false)) {
+ if (rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ rdt_last_cmd_printf("overlaps with other group\n");
+ return -EINVAL;
+ }
+ }
+
+ d->new_ctrl = cbm_val;
+ d->have_new_ctrl = true;
+
+ return 0;
+}
+
+/*
+ * For each domain in this resource we expect to find a series of:
+ * id=mask
+ * separated by ";". The "id" is in decimal, and must match one of
+ * the "id"s for this resource.
+ */
+static int parse_line(char *line, struct rdt_resource *r,
+ struct rdtgroup *rdtgrp)
+{
+ struct rdt_parse_data data;
+ char *dom = NULL, *id;
+ struct rdt_domain *d;
+ unsigned long dom_id;
+
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
+ r->rid == RDT_RESOURCE_MBA) {
+ rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
+ return -EINVAL;
+ }
+
+next:
+ if (!line || line[0] == '\0')
+ return 0;
+ dom = strsep(&line, ";");
+ id = strsep(&dom, "=");
+ if (!dom || kstrtoul(id, 10, &dom_id)) {
+ rdt_last_cmd_puts("Missing '=' or non-numeric domain\n");
+ return -EINVAL;
+ }
+ dom = strim(dom);
+ list_for_each_entry(d, &r->domains, list) {
+ if (d->id == dom_id) {
+ data.buf = dom;
+ data.rdtgrp = rdtgrp;
+ if (r->parse_ctrlval(&data, r, d))
+ return -EINVAL;
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ /*
+ * In pseudo-locking setup mode and just
+ * parsed a valid CBM that should be
+ * pseudo-locked. Only one locked region per
+ * resource group and domain so just do
+ * the required initialization for single
+ * region and return.
+ */
+ rdtgrp->plr->r = r;
+ rdtgrp->plr->d = d;
+ rdtgrp->plr->cbm = d->new_ctrl;
+ d->plr = rdtgrp->plr;
+ return 0;
+ }
+ goto next;
+ }
+ }
+ return -EINVAL;
+}
+
+int update_domains(struct rdt_resource *r, int closid)
+{
+ struct msr_param msr_param;
+ cpumask_var_t cpu_mask;
+ struct rdt_domain *d;
+ bool mba_sc;
+ u32 *dc;
+ int cpu;
+
+ if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ msr_param.low = closid;
+ msr_param.high = msr_param.low + 1;
+ msr_param.res = r;
+
+ mba_sc = is_mba_sc(r);
+ list_for_each_entry(d, &r->domains, list) {
+ dc = !mba_sc ? d->ctrl_val : d->mbps_val;
+ if (d->have_new_ctrl && d->new_ctrl != dc[closid]) {
+ cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+ dc[closid] = d->new_ctrl;
+ }
+ }
+
+ /*
+ * Avoid writing the control msr with control values when
+ * MBA software controller is enabled
+ */
+ if (cpumask_empty(cpu_mask) || mba_sc)
+ goto done;
+ cpu = get_cpu();
+ /* Update CBM on this cpu if it's in cpu_mask. */
+ if (cpumask_test_cpu(cpu, cpu_mask))
+ rdt_ctrl_update(&msr_param);
+ /* Update CBM on other cpus. */
+ smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
+ put_cpu();
+
+done:
+ free_cpumask_var(cpu_mask);
+
+ return 0;
+}
+
+static int rdtgroup_parse_resource(char *resname, char *tok,
+ struct rdtgroup *rdtgrp)
+{
+ struct rdt_resource *r;
+
+ for_each_alloc_enabled_rdt_resource(r) {
+ if (!strcmp(resname, r->name) && rdtgrp->closid < r->num_closid)
+ return parse_line(tok, r, rdtgrp);
+ }
+ rdt_last_cmd_printf("unknown/unsupported resource name '%s'\n", resname);
+ return -EINVAL;
+}
+
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdtgroup *rdtgrp;
+ struct rdt_domain *dom;
+ struct rdt_resource *r;
+ char *tok, *resname;
+ int ret = 0;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+ buf[nbytes - 1] = '\0';
+
+ cpus_read_lock();
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ cpus_read_unlock();
+ return -ENOENT;
+ }
+ rdt_last_cmd_clear();
+
+ /*
+ * No changes to pseudo-locked region allowed. It has to be removed
+ * and re-created instead.
+ */
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+ ret = -EINVAL;
+ rdt_last_cmd_puts("resource group is pseudo-locked\n");
+ goto out;
+ }
+
+ for_each_alloc_enabled_rdt_resource(r) {
+ list_for_each_entry(dom, &r->domains, list)
+ dom->have_new_ctrl = false;
+ }
+
+ while ((tok = strsep(&buf, "\n")) != NULL) {
+ resname = strim(strsep(&tok, ":"));
+ if (!tok) {
+ rdt_last_cmd_puts("Missing ':'\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (tok[0] == '\0') {
+ rdt_last_cmd_printf("Missing '%s' value\n", resname);
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = rdtgroup_parse_resource(resname, tok, rdtgrp);
+ if (ret)
+ goto out;
+ }
+
+ for_each_alloc_enabled_rdt_resource(r) {
+ ret = update_domains(r, rdtgrp->closid);
+ if (ret)
+ goto out;
+ }
+
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ /*
+ * If pseudo-locking fails we keep the resource group in
+ * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service
+ * active and updated for just the domain the pseudo-locked
+ * region was requested for.
+ */
+ ret = rdtgroup_pseudo_lock_create(rdtgrp);
+ }
+
+out:
+ rdtgroup_kn_unlock(of->kn);
+ cpus_read_unlock();
+ return ret ?: nbytes;
+}
+
+static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid)
+{
+ struct rdt_domain *dom;
+ bool sep = false;
+ u32 ctrl_val;
+
+ seq_printf(s, "%*s:", max_name_width, r->name);
+ list_for_each_entry(dom, &r->domains, list) {
+ if (sep)
+ seq_puts(s, ";");
+
+ ctrl_val = (!is_mba_sc(r) ? dom->ctrl_val[closid] :
+ dom->mbps_val[closid]);
+ seq_printf(s, r->format_str, dom->id, max_data_width,
+ ctrl_val);
+ sep = true;
+ }
+ seq_puts(s, "\n");
+}
+
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ struct rdt_resource *r;
+ int ret = 0;
+ u32 closid;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (rdtgrp) {
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ for_each_alloc_enabled_rdt_resource(r)
+ seq_printf(s, "%s:uninitialized\n", r->name);
+ } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+ if (!rdtgrp->plr->d) {
+ rdt_last_cmd_clear();
+ rdt_last_cmd_puts("Cache domain offline\n");
+ ret = -ENODEV;
+ } else {
+ seq_printf(s, "%s:%d=%x\n",
+ rdtgrp->plr->r->name,
+ rdtgrp->plr->d->id,
+ rdtgrp->plr->cbm);
+ }
+ } else {
+ closid = rdtgrp->closid;
+ for_each_alloc_enabled_rdt_resource(r) {
+ if (closid < r->num_closid)
+ show_doms(s, r, closid);
+ }
+ }
+ } else {
+ ret = -ENOENT;
+ }
+ rdtgroup_kn_unlock(of->kn);
+ return ret;
+}
+
+void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
+ struct rdtgroup *rdtgrp, int evtid, int first)
+{
+ /*
+ * setup the parameters to send to the IPI to read the data.
+ */
+ rr->rgrp = rdtgrp;
+ rr->evtid = evtid;
+ rr->d = d;
+ rr->val = 0;
+ rr->first = first;
+
+ smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1);
+}
+
+int rdtgroup_mondata_show(struct seq_file *m, void *arg)
+{
+ struct kernfs_open_file *of = m->private;
+ u32 resid, evtid, domid;
+ struct rdtgroup *rdtgrp;
+ struct rdt_resource *r;
+ union mon_data_bits md;
+ struct rdt_domain *d;
+ struct rmid_read rr;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ md.priv = of->kn->priv;
+ resid = md.u.rid;
+ domid = md.u.domid;
+ evtid = md.u.evtid;
+
+ r = &rdt_resources_all[resid];
+ d = rdt_find_domain(r, domid, NULL);
+ if (IS_ERR_OR_NULL(d)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ mon_event_read(&rr, d, rdtgrp, evtid, false);
+
+ if (rr.val & RMID_VAL_ERROR)
+ seq_puts(m, "Error\n");
+ else if (rr.val & RMID_VAL_UNAVAIL)
+ seq_puts(m, "Unavailable\n");
+ else
+ seq_printf(m, "%llu\n", rr.val * r->mon_scale);
+
+out:
+ rdtgroup_kn_unlock(of->kn);
+ return ret;
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c
new file mode 100644
index 000000000..5fee674fe
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -0,0 +1,660 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Monitoring code
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author:
+ * Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * This replaces the cqm.c based on perf but we reuse a lot of
+ * code and datastructures originally from Peter Zijlstra and Matt Fleming.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/cpu_device_id.h>
+#include "intel_rdt.h"
+
+#define MSR_IA32_QM_CTR 0x0c8e
+#define MSR_IA32_QM_EVTSEL 0x0c8d
+
+struct rmid_entry {
+ u32 rmid;
+ int busy;
+ struct list_head list;
+};
+
+/**
+ * @rmid_free_lru A least recently used list of free RMIDs
+ * These RMIDs are guaranteed to have an occupancy less than the
+ * threshold occupancy
+ */
+static LIST_HEAD(rmid_free_lru);
+
+/**
+ * @rmid_limbo_count count of currently unused but (potentially)
+ * dirty RMIDs.
+ * This counts RMIDs that no one is currently using but that
+ * may have a occupancy value > intel_cqm_threshold. User can change
+ * the threshold occupancy value.
+ */
+static unsigned int rmid_limbo_count;
+
+/**
+ * @rmid_entry - The entry in the limbo and free lists.
+ */
+static struct rmid_entry *rmid_ptrs;
+
+/*
+ * Global boolean for rdt_monitor which is true if any
+ * resource monitoring is enabled.
+ */
+bool rdt_mon_capable;
+
+/*
+ * Global to indicate which monitoring events are enabled.
+ */
+unsigned int rdt_mon_features;
+
+/*
+ * This is the threshold cache occupancy at which we will consider an
+ * RMID available for re-allocation.
+ */
+unsigned int intel_cqm_threshold;
+
+static inline struct rmid_entry *__rmid_entry(u32 rmid)
+{
+ struct rmid_entry *entry;
+
+ entry = &rmid_ptrs[rmid];
+ WARN_ON(entry->rmid != rmid);
+
+ return entry;
+}
+
+static u64 __rmid_read(u32 rmid, u32 eventid)
+{
+ u64 val;
+
+ /*
+ * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
+ * with a valid event code for supported resource type and the bits
+ * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
+ * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
+ * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
+ * are error bits.
+ */
+ wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+ rdmsrl(MSR_IA32_QM_CTR, val);
+
+ return val;
+}
+
+static bool rmid_dirty(struct rmid_entry *entry)
+{
+ u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
+
+ return val >= intel_cqm_threshold;
+}
+
+/*
+ * Check the RMIDs that are marked as busy for this domain. If the
+ * reported LLC occupancy is below the threshold clear the busy bit and
+ * decrement the count. If the busy count gets to zero on an RMID, we
+ * free the RMID
+ */
+void __check_limbo(struct rdt_domain *d, bool force_free)
+{
+ struct rmid_entry *entry;
+ struct rdt_resource *r;
+ u32 crmid = 1, nrmid;
+
+ r = &rdt_resources_all[RDT_RESOURCE_L3];
+
+ /*
+ * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
+ * are marked as busy for occupancy < threshold. If the occupancy
+ * is less than the threshold decrement the busy counter of the
+ * RMID and move it to the free list when the counter reaches 0.
+ */
+ for (;;) {
+ nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid);
+ if (nrmid >= r->num_rmid)
+ break;
+
+ entry = __rmid_entry(nrmid);
+ if (force_free || !rmid_dirty(entry)) {
+ clear_bit(entry->rmid, d->rmid_busy_llc);
+ if (!--entry->busy) {
+ rmid_limbo_count--;
+ list_add_tail(&entry->list, &rmid_free_lru);
+ }
+ }
+ crmid = nrmid + 1;
+ }
+}
+
+bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d)
+{
+ return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid;
+}
+
+/*
+ * As of now the RMIDs allocation is global.
+ * However we keep track of which packages the RMIDs
+ * are used to optimize the limbo list management.
+ */
+int alloc_rmid(void)
+{
+ struct rmid_entry *entry;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ if (list_empty(&rmid_free_lru))
+ return rmid_limbo_count ? -EBUSY : -ENOSPC;
+
+ entry = list_first_entry(&rmid_free_lru,
+ struct rmid_entry, list);
+ list_del(&entry->list);
+
+ return entry->rmid;
+}
+
+static void add_rmid_to_limbo(struct rmid_entry *entry)
+{
+ struct rdt_resource *r;
+ struct rdt_domain *d;
+ int cpu;
+ u64 val;
+
+ r = &rdt_resources_all[RDT_RESOURCE_L3];
+
+ entry->busy = 0;
+ cpu = get_cpu();
+ list_for_each_entry(d, &r->domains, list) {
+ if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
+ val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
+ if (val <= intel_cqm_threshold)
+ continue;
+ }
+
+ /*
+ * For the first limbo RMID in the domain,
+ * setup up the limbo worker.
+ */
+ if (!has_busy_rmid(r, d))
+ cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL);
+ set_bit(entry->rmid, d->rmid_busy_llc);
+ entry->busy++;
+ }
+ put_cpu();
+
+ if (entry->busy)
+ rmid_limbo_count++;
+ else
+ list_add_tail(&entry->list, &rmid_free_lru);
+}
+
+void free_rmid(u32 rmid)
+{
+ struct rmid_entry *entry;
+
+ if (!rmid)
+ return;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ entry = __rmid_entry(rmid);
+
+ if (is_llc_occupancy_enabled())
+ add_rmid_to_limbo(entry);
+ else
+ list_add_tail(&entry->list, &rmid_free_lru);
+}
+
+static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr)
+{
+ u64 shift = 64 - MBM_CNTR_WIDTH, chunks;
+
+ chunks = (cur_msr << shift) - (prev_msr << shift);
+ return chunks >>= shift;
+}
+
+static u64 __mon_event_count(u32 rmid, struct rmid_read *rr)
+{
+ struct mbm_state *m;
+ u64 chunks, tval;
+
+ tval = __rmid_read(rmid, rr->evtid);
+ if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
+ return tval;
+ }
+ switch (rr->evtid) {
+ case QOS_L3_OCCUP_EVENT_ID:
+ rr->val += tval;
+ return 0;
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ m = &rr->d->mbm_total[rmid];
+ break;
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ m = &rr->d->mbm_local[rmid];
+ break;
+ default:
+ /*
+ * Code would never reach here because an invalid
+ * event id would fail the __rmid_read.
+ */
+ return RMID_VAL_ERROR;
+ }
+
+ if (rr->first) {
+ memset(m, 0, sizeof(struct mbm_state));
+ m->prev_bw_msr = m->prev_msr = tval;
+ return 0;
+ }
+
+ chunks = mbm_overflow_count(m->prev_msr, tval);
+ m->chunks += chunks;
+ m->prev_msr = tval;
+
+ rr->val += m->chunks;
+ return 0;
+}
+
+/*
+ * Supporting function to calculate the memory bandwidth
+ * and delta bandwidth in MBps.
+ */
+static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
+ struct mbm_state *m = &rr->d->mbm_local[rmid];
+ u64 tval, cur_bw, chunks;
+
+ tval = __rmid_read(rmid, rr->evtid);
+ if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return;
+
+ chunks = mbm_overflow_count(m->prev_bw_msr, tval);
+ cur_bw = (chunks * r->mon_scale) >> 20;
+
+ if (m->delta_comp)
+ m->delta_bw = abs(cur_bw - m->prev_bw);
+ m->delta_comp = false;
+ m->prev_bw = cur_bw;
+ m->prev_bw_msr = tval;
+}
+
+/*
+ * This is called via IPI to read the CQM/MBM counters
+ * on a domain.
+ */
+void mon_event_count(void *info)
+{
+ struct rdtgroup *rdtgrp, *entry;
+ struct rmid_read *rr = info;
+ struct list_head *head;
+ u64 ret_val;
+
+ rdtgrp = rr->rgrp;
+
+ ret_val = __mon_event_count(rdtgrp->mon.rmid, rr);
+
+ /*
+ * For Ctrl groups read data from child monitor groups and
+ * add them together. Count events which are read successfully.
+ * Discard the rmid_read's reporting errors.
+ */
+ head = &rdtgrp->mon.crdtgrp_list;
+
+ if (rdtgrp->type == RDTCTRL_GROUP) {
+ list_for_each_entry(entry, head, mon.crdtgrp_list) {
+ if (__mon_event_count(entry->mon.rmid, rr) == 0)
+ ret_val = 0;
+ }
+ }
+
+ /* Report error if none of rmid_reads are successful */
+ if (ret_val)
+ rr->val = ret_val;
+}
+
+/*
+ * Feedback loop for MBA software controller (mba_sc)
+ *
+ * mba_sc is a feedback loop where we periodically read MBM counters and
+ * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
+ * that:
+ *
+ * current bandwdith(cur_bw) < user specified bandwidth(user_bw)
+ *
+ * This uses the MBM counters to measure the bandwidth and MBA throttle
+ * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
+ * fact that resctrl rdtgroups have both monitoring and control.
+ *
+ * The frequency of the checks is 1s and we just tag along the MBM overflow
+ * timer. Having 1s interval makes the calculation of bandwidth simpler.
+ *
+ * Although MBA's goal is to restrict the bandwidth to a maximum, there may
+ * be a need to increase the bandwidth to avoid uncecessarily restricting
+ * the L2 <-> L3 traffic.
+ *
+ * Since MBA controls the L2 external bandwidth where as MBM measures the
+ * L3 external bandwidth the following sequence could lead to such a
+ * situation.
+ *
+ * Consider an rdtgroup which had high L3 <-> memory traffic in initial
+ * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but
+ * after some time rdtgroup has mostly L2 <-> L3 traffic.
+ *
+ * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
+ * throttle MSRs already have low percentage values. To avoid
+ * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
+ */
+static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
+{
+ u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val;
+ struct mbm_state *pmbm_data, *cmbm_data;
+ u32 cur_bw, delta_bw, user_bw;
+ struct rdt_resource *r_mba;
+ struct rdt_domain *dom_mba;
+ struct list_head *head;
+ struct rdtgroup *entry;
+
+ if (!is_mbm_local_enabled())
+ return;
+
+ r_mba = &rdt_resources_all[RDT_RESOURCE_MBA];
+ closid = rgrp->closid;
+ rmid = rgrp->mon.rmid;
+ pmbm_data = &dom_mbm->mbm_local[rmid];
+
+ dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
+ if (!dom_mba) {
+ pr_warn_once("Failure to get domain for MBA update\n");
+ return;
+ }
+
+ cur_bw = pmbm_data->prev_bw;
+ user_bw = dom_mba->mbps_val[closid];
+ delta_bw = pmbm_data->delta_bw;
+ cur_msr_val = dom_mba->ctrl_val[closid];
+
+ /*
+ * For Ctrl groups read data from child monitor groups.
+ */
+ head = &rgrp->mon.crdtgrp_list;
+ list_for_each_entry(entry, head, mon.crdtgrp_list) {
+ cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+ cur_bw += cmbm_data->prev_bw;
+ delta_bw += cmbm_data->delta_bw;
+ }
+
+ /*
+ * Scale up/down the bandwidth linearly for the ctrl group. The
+ * bandwidth step is the bandwidth granularity specified by the
+ * hardware.
+ *
+ * The delta_bw is used when increasing the bandwidth so that we
+ * dont alternately increase and decrease the control values
+ * continuously.
+ *
+ * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if
+ * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep
+ * switching between 90 and 110 continuously if we only check
+ * cur_bw < user_bw.
+ */
+ if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
+ new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
+ } else if (cur_msr_val < MAX_MBA_BW &&
+ (user_bw > (cur_bw + delta_bw))) {
+ new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
+ } else {
+ return;
+ }
+
+ cur_msr = r_mba->msr_base + closid;
+ wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba));
+ dom_mba->ctrl_val[closid] = new_msr_val;
+
+ /*
+ * Delta values are updated dynamically package wise for each
+ * rdtgrp everytime the throttle MSR changes value.
+ *
+ * This is because (1)the increase in bandwidth is not perfectly
+ * linear and only "approximately" linear even when the hardware
+ * says it is linear.(2)Also since MBA is a core specific
+ * mechanism, the delta values vary based on number of cores used
+ * by the rdtgrp.
+ */
+ pmbm_data->delta_comp = true;
+ list_for_each_entry(entry, head, mon.crdtgrp_list) {
+ cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+ cmbm_data->delta_comp = true;
+ }
+}
+
+static void mbm_update(struct rdt_domain *d, int rmid)
+{
+ struct rmid_read rr;
+
+ rr.first = false;
+ rr.d = d;
+
+ /*
+ * This is protected from concurrent reads from user
+ * as both the user and we hold the global mutex.
+ */
+ if (is_mbm_total_enabled()) {
+ rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
+ __mon_event_count(rmid, &rr);
+ }
+ if (is_mbm_local_enabled()) {
+ rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
+ __mon_event_count(rmid, &rr);
+
+ /*
+ * Call the MBA software controller only for the
+ * control groups and when user has enabled
+ * the software controller explicitly.
+ */
+ if (is_mba_sc(NULL))
+ mbm_bw_count(rmid, &rr);
+ }
+}
+
+/*
+ * Handler to scan the limbo list and move the RMIDs
+ * to free list whose occupancy < threshold_occupancy.
+ */
+void cqm_handle_limbo(struct work_struct *work)
+{
+ unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
+ int cpu = smp_processor_id();
+ struct rdt_resource *r;
+ struct rdt_domain *d;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ r = &rdt_resources_all[RDT_RESOURCE_L3];
+ d = get_domain_from_cpu(cpu, r);
+
+ if (!d) {
+ pr_warn_once("Failure to get domain for limbo worker\n");
+ goto out_unlock;
+ }
+
+ __check_limbo(d, false);
+
+ if (has_busy_rmid(r, d))
+ schedule_delayed_work_on(cpu, &d->cqm_limbo, delay);
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+}
+
+void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms)
+{
+ unsigned long delay = msecs_to_jiffies(delay_ms);
+ struct rdt_resource *r;
+ int cpu;
+
+ r = &rdt_resources_all[RDT_RESOURCE_L3];
+
+ cpu = cpumask_any(&dom->cpu_mask);
+ dom->cqm_work_cpu = cpu;
+
+ schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
+}
+
+void mbm_handle_overflow(struct work_struct *work)
+{
+ unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
+ struct rdtgroup *prgrp, *crgrp;
+ int cpu = smp_processor_id();
+ struct list_head *head;
+ struct rdt_domain *d;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ if (!static_branch_likely(&rdt_enable_key))
+ goto out_unlock;
+
+ d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]);
+ if (!d)
+ goto out_unlock;
+
+ list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+ mbm_update(d, prgrp->mon.rmid);
+
+ head = &prgrp->mon.crdtgrp_list;
+ list_for_each_entry(crgrp, head, mon.crdtgrp_list)
+ mbm_update(d, crgrp->mon.rmid);
+
+ if (is_mba_sc(NULL))
+ update_mba_bw(prgrp, d);
+ }
+
+ schedule_delayed_work_on(cpu, &d->mbm_over, delay);
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+}
+
+void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
+{
+ unsigned long delay = msecs_to_jiffies(delay_ms);
+ int cpu;
+
+ if (!static_branch_likely(&rdt_enable_key))
+ return;
+ cpu = cpumask_any(&dom->cpu_mask);
+ dom->mbm_work_cpu = cpu;
+ schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
+}
+
+static int dom_data_init(struct rdt_resource *r)
+{
+ struct rmid_entry *entry = NULL;
+ int i, nr_rmids;
+
+ nr_rmids = r->num_rmid;
+ rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL);
+ if (!rmid_ptrs)
+ return -ENOMEM;
+
+ for (i = 0; i < nr_rmids; i++) {
+ entry = &rmid_ptrs[i];
+ INIT_LIST_HEAD(&entry->list);
+
+ entry->rmid = i;
+ list_add_tail(&entry->list, &rmid_free_lru);
+ }
+
+ /*
+ * RMID 0 is special and is always allocated. It's used for all
+ * tasks that are not monitored.
+ */
+ entry = __rmid_entry(0);
+ list_del(&entry->list);
+
+ return 0;
+}
+
+static struct mon_evt llc_occupancy_event = {
+ .name = "llc_occupancy",
+ .evtid = QOS_L3_OCCUP_EVENT_ID,
+};
+
+static struct mon_evt mbm_total_event = {
+ .name = "mbm_total_bytes",
+ .evtid = QOS_L3_MBM_TOTAL_EVENT_ID,
+};
+
+static struct mon_evt mbm_local_event = {
+ .name = "mbm_local_bytes",
+ .evtid = QOS_L3_MBM_LOCAL_EVENT_ID,
+};
+
+/*
+ * Initialize the event list for the resource.
+ *
+ * Note that MBM events are also part of RDT_RESOURCE_L3 resource
+ * because as per the SDM the total and local memory bandwidth
+ * are enumerated as part of L3 monitoring.
+ */
+static void l3_mon_evt_init(struct rdt_resource *r)
+{
+ INIT_LIST_HEAD(&r->evt_list);
+
+ if (is_llc_occupancy_enabled())
+ list_add_tail(&llc_occupancy_event.list, &r->evt_list);
+ if (is_mbm_total_enabled())
+ list_add_tail(&mbm_total_event.list, &r->evt_list);
+ if (is_mbm_local_enabled())
+ list_add_tail(&mbm_local_event.list, &r->evt_list);
+}
+
+int rdt_get_mon_l3_config(struct rdt_resource *r)
+{
+ int ret;
+
+ r->mon_scale = boot_cpu_data.x86_cache_occ_scale;
+ r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
+
+ /*
+ * A reasonable upper limit on the max threshold is the number
+ * of lines tagged per RMID if all RMIDs have the same number of
+ * lines tagged in the LLC.
+ *
+ * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+ */
+ intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid;
+
+ /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
+ intel_cqm_threshold /= r->mon_scale;
+
+ ret = dom_data_init(r);
+ if (ret)
+ return ret;
+
+ l3_mon_evt_init(r);
+
+ r->mon_capable = true;
+ r->mon_enabled = true;
+
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c
new file mode 100644
index 000000000..a999a58ca
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c
@@ -0,0 +1,1534 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Resource Director Technology (RDT)
+ *
+ * Pseudo-locking support built on top of Cache Allocation Technology (CAT)
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Author: Reinette Chatre <reinette.chatre@intel.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cacheinfo.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/debugfs.h>
+#include <linux/kthread.h>
+#include <linux/mman.h>
+#include <linux/pm_qos.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include <asm/cacheflush.h>
+#include <asm/intel-family.h>
+#include <asm/intel_rdt_sched.h>
+#include <asm/perf_event.h>
+
+#include "intel_rdt.h"
+
+#define CREATE_TRACE_POINTS
+#include "intel_rdt_pseudo_lock_event.h"
+
+/*
+ * MSR_MISC_FEATURE_CONTROL register enables the modification of hardware
+ * prefetcher state. Details about this register can be found in the MSR
+ * tables for specific platforms found in Intel's SDM.
+ */
+#define MSR_MISC_FEATURE_CONTROL 0x000001a4
+
+/*
+ * The bits needed to disable hardware prefetching varies based on the
+ * platform. During initialization we will discover which bits to use.
+ */
+static u64 prefetch_disable_bits;
+
+/*
+ * Major number assigned to and shared by all devices exposing
+ * pseudo-locked regions.
+ */
+static unsigned int pseudo_lock_major;
+static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0);
+static struct class *pseudo_lock_class;
+
+/**
+ * get_prefetch_disable_bits - prefetch disable bits of supported platforms
+ *
+ * Capture the list of platforms that have been validated to support
+ * pseudo-locking. This includes testing to ensure pseudo-locked regions
+ * with low cache miss rates can be created under variety of load conditions
+ * as well as that these pseudo-locked regions can maintain their low cache
+ * miss rates under variety of load conditions for significant lengths of time.
+ *
+ * After a platform has been validated to support pseudo-locking its
+ * hardware prefetch disable bits are included here as they are documented
+ * in the SDM.
+ *
+ * When adding a platform here also add support for its cache events to
+ * measure_cycles_perf_fn()
+ *
+ * Return:
+ * If platform is supported, the bits to disable hardware prefetchers, 0
+ * if platform is not supported.
+ */
+static u64 get_prefetch_disable_bits(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ boot_cpu_data.x86 != 6)
+ return 0;
+
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_BROADWELL_X:
+ /*
+ * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
+ * as:
+ * 0 L2 Hardware Prefetcher Disable (R/W)
+ * 1 L2 Adjacent Cache Line Prefetcher Disable (R/W)
+ * 2 DCU Hardware Prefetcher Disable (R/W)
+ * 3 DCU IP Prefetcher Disable (R/W)
+ * 63:4 Reserved
+ */
+ return 0xF;
+ case INTEL_FAM6_ATOM_GOLDMONT:
+ case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+ /*
+ * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
+ * as:
+ * 0 L2 Hardware Prefetcher Disable (R/W)
+ * 1 Reserved
+ * 2 DCU Hardware Prefetcher Disable (R/W)
+ * 63:3 Reserved
+ */
+ return 0x5;
+ }
+
+ return 0;
+}
+
+/*
+ * Helper to write 64bit value to MSR without tracing. Used when
+ * use of the cache should be restricted and use of registers used
+ * for local variables avoided.
+ */
+static inline void pseudo_wrmsrl_notrace(unsigned int msr, u64 val)
+{
+ __wrmsr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32));
+}
+
+/**
+ * pseudo_lock_minor_get - Obtain available minor number
+ * @minor: Pointer to where new minor number will be stored
+ *
+ * A bitmask is used to track available minor numbers. Here the next free
+ * minor number is marked as unavailable and returned.
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+static int pseudo_lock_minor_get(unsigned int *minor)
+{
+ unsigned long first_bit;
+
+ first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS);
+
+ if (first_bit == MINORBITS)
+ return -ENOSPC;
+
+ __clear_bit(first_bit, &pseudo_lock_minor_avail);
+ *minor = first_bit;
+
+ return 0;
+}
+
+/**
+ * pseudo_lock_minor_release - Return minor number to available
+ * @minor: The minor number made available
+ */
+static void pseudo_lock_minor_release(unsigned int minor)
+{
+ __set_bit(minor, &pseudo_lock_minor_avail);
+}
+
+/**
+ * region_find_by_minor - Locate a pseudo-lock region by inode minor number
+ * @minor: The minor number of the device representing pseudo-locked region
+ *
+ * When the character device is accessed we need to determine which
+ * pseudo-locked region it belongs to. This is done by matching the minor
+ * number of the device to the pseudo-locked region it belongs.
+ *
+ * Minor numbers are assigned at the time a pseudo-locked region is associated
+ * with a cache instance.
+ *
+ * Return: On success return pointer to resource group owning the pseudo-locked
+ * region, NULL on failure.
+ */
+static struct rdtgroup *region_find_by_minor(unsigned int minor)
+{
+ struct rdtgroup *rdtgrp, *rdtgrp_match = NULL;
+
+ list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
+ if (rdtgrp->plr && rdtgrp->plr->minor == minor) {
+ rdtgrp_match = rdtgrp;
+ break;
+ }
+ }
+ return rdtgrp_match;
+}
+
+/**
+ * pseudo_lock_pm_req - A power management QoS request list entry
+ * @list: Entry within the @pm_reqs list for a pseudo-locked region
+ * @req: PM QoS request
+ */
+struct pseudo_lock_pm_req {
+ struct list_head list;
+ struct dev_pm_qos_request req;
+};
+
+static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
+{
+ struct pseudo_lock_pm_req *pm_req, *next;
+
+ list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) {
+ dev_pm_qos_remove_request(&pm_req->req);
+ list_del(&pm_req->list);
+ kfree(pm_req);
+ }
+}
+
+/**
+ * pseudo_lock_cstates_constrain - Restrict cores from entering C6
+ *
+ * To prevent the cache from being affected by power management entering
+ * C6 has to be avoided. This is accomplished by requesting a latency
+ * requirement lower than lowest C6 exit latency of all supported
+ * platforms as found in the cpuidle state tables in the intel_idle driver.
+ * At this time it is possible to do so with a single latency requirement
+ * for all supported platforms.
+ *
+ * Since Goldmont is supported, which is affected by X86_BUG_MONITOR,
+ * the ACPI latencies need to be considered while keeping in mind that C2
+ * may be set to map to deeper sleep states. In this case the latency
+ * requirement needs to prevent entering C2 also.
+ */
+static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
+{
+ struct pseudo_lock_pm_req *pm_req;
+ int cpu;
+ int ret;
+
+ for_each_cpu(cpu, &plr->d->cpu_mask) {
+ pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL);
+ if (!pm_req) {
+ rdt_last_cmd_puts("fail allocating mem for PM QoS\n");
+ ret = -ENOMEM;
+ goto out_err;
+ }
+ ret = dev_pm_qos_add_request(get_cpu_device(cpu),
+ &pm_req->req,
+ DEV_PM_QOS_RESUME_LATENCY,
+ 30);
+ if (ret < 0) {
+ rdt_last_cmd_printf("fail to add latency req cpu%d\n",
+ cpu);
+ kfree(pm_req);
+ ret = -1;
+ goto out_err;
+ }
+ list_add(&pm_req->list, &plr->pm_reqs);
+ }
+
+ return 0;
+
+out_err:
+ pseudo_lock_cstates_relax(plr);
+ return ret;
+}
+
+/**
+ * pseudo_lock_region_clear - Reset pseudo-lock region data
+ * @plr: pseudo-lock region
+ *
+ * All content of the pseudo-locked region is reset - any memory allocated
+ * freed.
+ *
+ * Return: void
+ */
+static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
+{
+ plr->size = 0;
+ plr->line_size = 0;
+ kfree(plr->kmem);
+ plr->kmem = NULL;
+ plr->r = NULL;
+ if (plr->d)
+ plr->d->plr = NULL;
+ plr->d = NULL;
+ plr->cbm = 0;
+ plr->debugfs_dir = NULL;
+}
+
+/**
+ * pseudo_lock_region_init - Initialize pseudo-lock region information
+ * @plr: pseudo-lock region
+ *
+ * Called after user provided a schemata to be pseudo-locked. From the
+ * schemata the &struct pseudo_lock_region is on entry already initialized
+ * with the resource, domain, and capacity bitmask. Here the information
+ * required for pseudo-locking is deduced from this data and &struct
+ * pseudo_lock_region initialized further. This information includes:
+ * - size in bytes of the region to be pseudo-locked
+ * - cache line size to know the stride with which data needs to be accessed
+ * to be pseudo-locked
+ * - a cpu associated with the cache instance on which the pseudo-locking
+ * flow can be executed
+ *
+ * Return: 0 on success, <0 on failure. Descriptive error will be written
+ * to last_cmd_status buffer.
+ */
+static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
+{
+ struct cpu_cacheinfo *ci;
+ int ret;
+ int i;
+
+ /* Pick the first cpu we find that is associated with the cache. */
+ plr->cpu = cpumask_first(&plr->d->cpu_mask);
+
+ if (!cpu_online(plr->cpu)) {
+ rdt_last_cmd_printf("cpu %u associated with cache not online\n",
+ plr->cpu);
+ ret = -ENODEV;
+ goto out_region;
+ }
+
+ ci = get_cpu_cacheinfo(plr->cpu);
+
+ plr->size = rdtgroup_cbm_to_size(plr->r, plr->d, plr->cbm);
+
+ for (i = 0; i < ci->num_leaves; i++) {
+ if (ci->info_list[i].level == plr->r->cache_level) {
+ plr->line_size = ci->info_list[i].coherency_line_size;
+ return 0;
+ }
+ }
+
+ ret = -1;
+ rdt_last_cmd_puts("unable to determine cache line size\n");
+out_region:
+ pseudo_lock_region_clear(plr);
+ return ret;
+}
+
+/**
+ * pseudo_lock_init - Initialize a pseudo-lock region
+ * @rdtgrp: resource group to which new pseudo-locked region will belong
+ *
+ * A pseudo-locked region is associated with a resource group. When this
+ * association is created the pseudo-locked region is initialized. The
+ * details of the pseudo-locked region are not known at this time so only
+ * allocation is done and association established.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static int pseudo_lock_init(struct rdtgroup *rdtgrp)
+{
+ struct pseudo_lock_region *plr;
+
+ plr = kzalloc(sizeof(*plr), GFP_KERNEL);
+ if (!plr)
+ return -ENOMEM;
+
+ init_waitqueue_head(&plr->lock_thread_wq);
+ INIT_LIST_HEAD(&plr->pm_reqs);
+ rdtgrp->plr = plr;
+ return 0;
+}
+
+/**
+ * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked
+ * @plr: pseudo-lock region
+ *
+ * Initialize the details required to set up the pseudo-locked region and
+ * allocate the contiguous memory that will be pseudo-locked to the cache.
+ *
+ * Return: 0 on success, <0 on failure. Descriptive error will be written
+ * to last_cmd_status buffer.
+ */
+static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr)
+{
+ int ret;
+
+ ret = pseudo_lock_region_init(plr);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * We do not yet support contiguous regions larger than
+ * KMALLOC_MAX_SIZE.
+ */
+ if (plr->size > KMALLOC_MAX_SIZE) {
+ rdt_last_cmd_puts("requested region exceeds maximum size\n");
+ ret = -E2BIG;
+ goto out_region;
+ }
+
+ plr->kmem = kzalloc(plr->size, GFP_KERNEL);
+ if (!plr->kmem) {
+ rdt_last_cmd_puts("unable to allocate memory\n");
+ ret = -ENOMEM;
+ goto out_region;
+ }
+
+ ret = 0;
+ goto out;
+out_region:
+ pseudo_lock_region_clear(plr);
+out:
+ return ret;
+}
+
+/**
+ * pseudo_lock_free - Free a pseudo-locked region
+ * @rdtgrp: resource group to which pseudo-locked region belonged
+ *
+ * The pseudo-locked region's resources have already been released, or not
+ * yet created at this point. Now it can be freed and disassociated from the
+ * resource group.
+ *
+ * Return: void
+ */
+static void pseudo_lock_free(struct rdtgroup *rdtgrp)
+{
+ pseudo_lock_region_clear(rdtgrp->plr);
+ kfree(rdtgrp->plr);
+ rdtgrp->plr = NULL;
+}
+
+/**
+ * pseudo_lock_fn - Load kernel memory into cache
+ * @_rdtgrp: resource group to which pseudo-lock region belongs
+ *
+ * This is the core pseudo-locking flow.
+ *
+ * First we ensure that the kernel memory cannot be found in the cache.
+ * Then, while taking care that there will be as little interference as
+ * possible, the memory to be loaded is accessed while core is running
+ * with class of service set to the bitmask of the pseudo-locked region.
+ * After this is complete no future CAT allocations will be allowed to
+ * overlap with this bitmask.
+ *
+ * Local register variables are utilized to ensure that the memory region
+ * to be locked is the only memory access made during the critical locking
+ * loop.
+ *
+ * Return: 0. Waiter on waitqueue will be woken on completion.
+ */
+static int pseudo_lock_fn(void *_rdtgrp)
+{
+ struct rdtgroup *rdtgrp = _rdtgrp;
+ struct pseudo_lock_region *plr = rdtgrp->plr;
+ u32 rmid_p, closid_p;
+ unsigned long i;
+#ifdef CONFIG_KASAN
+ /*
+ * The registers used for local register variables are also used
+ * when KASAN is active. When KASAN is active we use a regular
+ * variable to ensure we always use a valid pointer, but the cost
+ * is that this variable will enter the cache through evicting the
+ * memory we are trying to lock into the cache. Thus expect lower
+ * pseudo-locking success rate when KASAN is active.
+ */
+ unsigned int line_size;
+ unsigned int size;
+ void *mem_r;
+#else
+ register unsigned int line_size asm("esi");
+ register unsigned int size asm("edi");
+#ifdef CONFIG_X86_64
+ register void *mem_r asm("rbx");
+#else
+ register void *mem_r asm("ebx");
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_KASAN */
+
+ /*
+ * Make sure none of the allocated memory is cached. If it is we
+ * will get a cache hit in below loop from outside of pseudo-locked
+ * region.
+ * wbinvd (as opposed to clflush/clflushopt) is required to
+ * increase likelihood that allocated cache portion will be filled
+ * with associated memory.
+ */
+ native_wbinvd();
+
+ /*
+ * Always called with interrupts enabled. By disabling interrupts
+ * ensure that we will not be preempted during this critical section.
+ */
+ local_irq_disable();
+
+ /*
+ * Call wrmsr and rdmsr as directly as possible to avoid tracing
+ * clobbering local register variables or affecting cache accesses.
+ *
+ * Disable the hardware prefetcher so that when the end of the memory
+ * being pseudo-locked is reached the hardware will not read beyond
+ * the buffer and evict pseudo-locked memory read earlier from the
+ * cache.
+ */
+ __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
+ closid_p = this_cpu_read(pqr_state.cur_closid);
+ rmid_p = this_cpu_read(pqr_state.cur_rmid);
+ mem_r = plr->kmem;
+ size = plr->size;
+ line_size = plr->line_size;
+ /*
+ * Critical section begin: start by writing the closid associated
+ * with the capacity bitmask of the cache region being
+ * pseudo-locked followed by reading of kernel memory to load it
+ * into the cache.
+ */
+ __wrmsr(IA32_PQR_ASSOC, rmid_p, rdtgrp->closid);
+ /*
+ * Cache was flushed earlier. Now access kernel memory to read it
+ * into cache region associated with just activated plr->closid.
+ * Loop over data twice:
+ * - In first loop the cache region is shared with the page walker
+ * as it populates the paging structure caches (including TLB).
+ * - In the second loop the paging structure caches are used and
+ * cache region is populated with the memory being referenced.
+ */
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ /*
+ * Add a barrier to prevent speculative execution of this
+ * loop reading beyond the end of the buffer.
+ */
+ rmb();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ for (i = 0; i < size; i += line_size) {
+ /*
+ * Add a barrier to prevent speculative execution of this
+ * loop reading beyond the end of the buffer.
+ */
+ rmb();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ /*
+ * Critical section end: restore closid with capacity bitmask that
+ * does not overlap with pseudo-locked region.
+ */
+ __wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p);
+
+ /* Re-enable the hardware prefetcher(s) */
+ wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
+ local_irq_enable();
+
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+/**
+ * rdtgroup_monitor_in_progress - Test if monitoring in progress
+ * @r: resource group being queried
+ *
+ * Return: 1 if monitor groups have been created for this resource
+ * group, 0 otherwise.
+ */
+static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp)
+{
+ return !list_empty(&rdtgrp->mon.crdtgrp_list);
+}
+
+/**
+ * rdtgroup_locksetup_user_restrict - Restrict user access to group
+ * @rdtgrp: resource group needing access restricted
+ *
+ * A resource group used for cache pseudo-locking cannot have cpus or tasks
+ * assigned to it. This is communicated to the user by restricting access
+ * to all the files that can be used to make such changes.
+ *
+ * Permissions restored with rdtgroup_locksetup_user_restore()
+ *
+ * Return: 0 on success, <0 on failure. If a failure occurs during the
+ * restriction of access an attempt will be made to restore permissions but
+ * the state of the mode of these files will be uncertain when a failure
+ * occurs.
+ */
+static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp)
+{
+ int ret;
+
+ ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
+ if (ret)
+ return ret;
+
+ ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
+ if (ret)
+ goto err_tasks;
+
+ ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
+ if (ret)
+ goto err_cpus;
+
+ if (rdt_mon_capable) {
+ ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups");
+ if (ret)
+ goto err_cpus_list;
+ }
+
+ ret = 0;
+ goto out;
+
+err_cpus_list:
+ rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
+err_cpus:
+ rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
+err_tasks:
+ rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
+out:
+ return ret;
+}
+
+/**
+ * rdtgroup_locksetup_user_restore - Restore user access to group
+ * @rdtgrp: resource group needing access restored
+ *
+ * Restore all file access previously removed using
+ * rdtgroup_locksetup_user_restrict()
+ *
+ * Return: 0 on success, <0 on failure. If a failure occurs during the
+ * restoration of access an attempt will be made to restrict permissions
+ * again but the state of the mode of these files will be uncertain when
+ * a failure occurs.
+ */
+static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp)
+{
+ int ret;
+
+ ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
+ if (ret)
+ return ret;
+
+ ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
+ if (ret)
+ goto err_tasks;
+
+ ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
+ if (ret)
+ goto err_cpus;
+
+ if (rdt_mon_capable) {
+ ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777);
+ if (ret)
+ goto err_cpus_list;
+ }
+
+ ret = 0;
+ goto out;
+
+err_cpus_list:
+ rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
+err_cpus:
+ rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
+err_tasks:
+ rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
+out:
+ return ret;
+}
+
+/**
+ * rdtgroup_locksetup_enter - Resource group enters locksetup mode
+ * @rdtgrp: resource group requested to enter locksetup mode
+ *
+ * A resource group enters locksetup mode to reflect that it would be used
+ * to represent a pseudo-locked region and is in the process of being set
+ * up to do so. A resource group used for a pseudo-locked region would
+ * lose the closid associated with it so we cannot allow it to have any
+ * tasks or cpus assigned nor permit tasks or cpus to be assigned in the
+ * future. Monitoring of a pseudo-locked region is not allowed either.
+ *
+ * The above and more restrictions on a pseudo-locked region are checked
+ * for and enforced before the resource group enters the locksetup mode.
+ *
+ * Returns: 0 if the resource group successfully entered locksetup mode, <0
+ * on failure. On failure the last_cmd_status buffer is updated with text to
+ * communicate details of failure to the user.
+ */
+int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
+{
+ int ret;
+
+ /*
+ * The default resource group can neither be removed nor lose the
+ * default closid associated with it.
+ */
+ if (rdtgrp == &rdtgroup_default) {
+ rdt_last_cmd_puts("cannot pseudo-lock default group\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Cache Pseudo-locking not supported when CDP is enabled.
+ *
+ * Some things to consider if you would like to enable this
+ * support (using L3 CDP as example):
+ * - When CDP is enabled two separate resources are exposed,
+ * L3DATA and L3CODE, but they are actually on the same cache.
+ * The implication for pseudo-locking is that if a
+ * pseudo-locked region is created on a domain of one
+ * resource (eg. L3CODE), then a pseudo-locked region cannot
+ * be created on that same domain of the other resource
+ * (eg. L3DATA). This is because the creation of a
+ * pseudo-locked region involves a call to wbinvd that will
+ * affect all cache allocations on particular domain.
+ * - Considering the previous, it may be possible to only
+ * expose one of the CDP resources to pseudo-locking and
+ * hide the other. For example, we could consider to only
+ * expose L3DATA and since the L3 cache is unified it is
+ * still possible to place instructions there are execute it.
+ * - If only one region is exposed to pseudo-locking we should
+ * still keep in mind that availability of a portion of cache
+ * for pseudo-locking should take into account both resources.
+ * Similarly, if a pseudo-locked region is created in one
+ * resource, the portion of cache used by it should be made
+ * unavailable to all future allocations from both resources.
+ */
+ if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled ||
+ rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) {
+ rdt_last_cmd_puts("CDP enabled\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Not knowing the bits to disable prefetching implies that this
+ * platform does not support Cache Pseudo-Locking.
+ */
+ prefetch_disable_bits = get_prefetch_disable_bits();
+ if (prefetch_disable_bits == 0) {
+ rdt_last_cmd_puts("pseudo-locking not supported\n");
+ return -EINVAL;
+ }
+
+ if (rdtgroup_monitor_in_progress(rdtgrp)) {
+ rdt_last_cmd_puts("monitoring in progress\n");
+ return -EINVAL;
+ }
+
+ if (rdtgroup_tasks_assigned(rdtgrp)) {
+ rdt_last_cmd_puts("tasks assigned to resource group\n");
+ return -EINVAL;
+ }
+
+ if (!cpumask_empty(&rdtgrp->cpu_mask)) {
+ rdt_last_cmd_puts("CPUs assigned to resource group\n");
+ return -EINVAL;
+ }
+
+ if (rdtgroup_locksetup_user_restrict(rdtgrp)) {
+ rdt_last_cmd_puts("unable to modify resctrl permissions\n");
+ return -EIO;
+ }
+
+ ret = pseudo_lock_init(rdtgrp);
+ if (ret) {
+ rdt_last_cmd_puts("unable to init pseudo-lock region\n");
+ goto out_release;
+ }
+
+ /*
+ * If this system is capable of monitoring a rmid would have been
+ * allocated when the control group was created. This is not needed
+ * anymore when this group would be used for pseudo-locking. This
+ * is safe to call on platforms not capable of monitoring.
+ */
+ free_rmid(rdtgrp->mon.rmid);
+
+ ret = 0;
+ goto out;
+
+out_release:
+ rdtgroup_locksetup_user_restore(rdtgrp);
+out:
+ return ret;
+}
+
+/**
+ * rdtgroup_locksetup_exit - resource group exist locksetup mode
+ * @rdtgrp: resource group
+ *
+ * When a resource group exits locksetup mode the earlier restrictions are
+ * lifted.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
+{
+ int ret;
+
+ if (rdt_mon_capable) {
+ ret = alloc_rmid();
+ if (ret < 0) {
+ rdt_last_cmd_puts("out of RMIDs\n");
+ return ret;
+ }
+ rdtgrp->mon.rmid = ret;
+ }
+
+ ret = rdtgroup_locksetup_user_restore(rdtgrp);
+ if (ret) {
+ free_rmid(rdtgrp->mon.rmid);
+ return ret;
+ }
+
+ pseudo_lock_free(rdtgrp);
+ return 0;
+}
+
+/**
+ * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked
+ * @d: RDT domain
+ * @cbm: CBM to test
+ *
+ * @d represents a cache instance and @cbm a capacity bitmask that is
+ * considered for it. Determine if @cbm overlaps with any existing
+ * pseudo-locked region on @d.
+ *
+ * @cbm is unsigned long, even if only 32 bits are used, to make the
+ * bitmap functions work correctly.
+ *
+ * Return: true if @cbm overlaps with pseudo-locked region on @d, false
+ * otherwise.
+ */
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm)
+{
+ unsigned int cbm_len;
+ unsigned long cbm_b;
+
+ if (d->plr) {
+ cbm_len = d->plr->r->cache.cbm_len;
+ cbm_b = d->plr->cbm;
+ if (bitmap_intersects(&cbm, &cbm_b, cbm_len))
+ return true;
+ }
+ return false;
+}
+
+/**
+ * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy
+ * @d: RDT domain under test
+ *
+ * The setup of a pseudo-locked region affects all cache instances within
+ * the hierarchy of the region. It is thus essential to know if any
+ * pseudo-locked regions exist within a cache hierarchy to prevent any
+ * attempts to create new pseudo-locked regions in the same hierarchy.
+ *
+ * Return: true if a pseudo-locked region exists in the hierarchy of @d or
+ * if it is not possible to test due to memory allocation issue,
+ * false otherwise.
+ */
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
+{
+ cpumask_var_t cpu_with_psl;
+ struct rdt_resource *r;
+ struct rdt_domain *d_i;
+ bool ret = false;
+
+ if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL))
+ return true;
+
+ /*
+ * First determine which cpus have pseudo-locked regions
+ * associated with them.
+ */
+ for_each_alloc_enabled_rdt_resource(r) {
+ list_for_each_entry(d_i, &r->domains, list) {
+ if (d_i->plr)
+ cpumask_or(cpu_with_psl, cpu_with_psl,
+ &d_i->cpu_mask);
+ }
+ }
+
+ /*
+ * Next test if new pseudo-locked region would intersect with
+ * existing region.
+ */
+ if (cpumask_intersects(&d->cpu_mask, cpu_with_psl))
+ ret = true;
+
+ free_cpumask_var(cpu_with_psl);
+ return ret;
+}
+
+/**
+ * measure_cycles_lat_fn - Measure cycle latency to read pseudo-locked memory
+ * @_plr: pseudo-lock region to measure
+ *
+ * There is no deterministic way to test if a memory region is cached. One
+ * way is to measure how long it takes to read the memory, the speed of
+ * access is a good way to learn how close to the cpu the data was. Even
+ * more, if the prefetcher is disabled and the memory is read at a stride
+ * of half the cache line, then a cache miss will be easy to spot since the
+ * read of the first half would be significantly slower than the read of
+ * the second half.
+ *
+ * Return: 0. Waiter on waitqueue will be woken on completion.
+ */
+static int measure_cycles_lat_fn(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ unsigned long i;
+ u64 start, end;
+#ifdef CONFIG_KASAN
+ /*
+ * The registers used for local register variables are also used
+ * when KASAN is active. When KASAN is active we use a regular
+ * variable to ensure we always use a valid pointer to access memory.
+ * The cost is that accessing this pointer, which could be in
+ * cache, will be included in the measurement of memory read latency.
+ */
+ void *mem_r;
+#else
+#ifdef CONFIG_X86_64
+ register void *mem_r asm("rbx");
+#else
+ register void *mem_r asm("ebx");
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_KASAN */
+
+ local_irq_disable();
+ /*
+ * The wrmsr call may be reordered with the assignment below it.
+ * Call wrmsr as directly as possible to avoid tracing clobbering
+ * local register variable used for memory pointer.
+ */
+ __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
+ mem_r = plr->kmem;
+ /*
+ * Dummy execute of the time measurement to load the needed
+ * instructions into the L1 instruction cache.
+ */
+ start = rdtsc_ordered();
+ for (i = 0; i < plr->size; i += 32) {
+ start = rdtsc_ordered();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ end = rdtsc_ordered();
+ trace_pseudo_lock_mem_latency((u32)(end - start));
+ }
+ wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
+ local_irq_enable();
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+static int measure_cycles_perf_fn(void *_plr)
+{
+ unsigned long long l3_hits = 0, l3_miss = 0;
+ u64 l3_hit_bits = 0, l3_miss_bits = 0;
+ struct pseudo_lock_region *plr = _plr;
+ unsigned long long l2_hits, l2_miss;
+ u64 l2_hit_bits, l2_miss_bits;
+ unsigned long i;
+#ifdef CONFIG_KASAN
+ /*
+ * The registers used for local register variables are also used
+ * when KASAN is active. When KASAN is active we use regular variables
+ * at the cost of including cache access latency to these variables
+ * in the measurements.
+ */
+ unsigned int line_size;
+ unsigned int size;
+ void *mem_r;
+#else
+ register unsigned int line_size asm("esi");
+ register unsigned int size asm("edi");
+#ifdef CONFIG_X86_64
+ register void *mem_r asm("rbx");
+#else
+ register void *mem_r asm("ebx");
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_KASAN */
+
+ /*
+ * Non-architectural event for the Goldmont Microarchitecture
+ * from Intel x86 Architecture Software Developer Manual (SDM):
+ * MEM_LOAD_UOPS_RETIRED D1H (event number)
+ * Umask values:
+ * L1_HIT 01H
+ * L2_HIT 02H
+ * L1_MISS 08H
+ * L2_MISS 10H
+ *
+ * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
+ * has two "no fix" errata associated with it: BDM35 and BDM100. On
+ * this platform we use the following events instead:
+ * L2_RQSTS 24H (Documented in https://download.01.org/perfmon/BDW/)
+ * REFERENCES FFH
+ * MISS 3FH
+ * LONGEST_LAT_CACHE 2EH (Documented in SDM)
+ * REFERENCE 4FH
+ * MISS 41H
+ */
+
+ /*
+ * Start by setting flags for IA32_PERFEVTSELx:
+ * OS (Operating system mode) 0x2
+ * INT (APIC interrupt enable) 0x10
+ * EN (Enable counter) 0x40
+ *
+ * Then add the Umask value and event number to select performance
+ * event.
+ */
+
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_ATOM_GOLDMONT:
+ case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+ l2_hit_bits = (0x52ULL << 16) | (0x2 << 8) | 0xd1;
+ l2_miss_bits = (0x52ULL << 16) | (0x10 << 8) | 0xd1;
+ break;
+ case INTEL_FAM6_BROADWELL_X:
+ /* On BDW the l2_hit_bits count references, not hits */
+ l2_hit_bits = (0x52ULL << 16) | (0xff << 8) | 0x24;
+ l2_miss_bits = (0x52ULL << 16) | (0x3f << 8) | 0x24;
+ /* On BDW the l3_hit_bits count references, not hits */
+ l3_hit_bits = (0x52ULL << 16) | (0x4f << 8) | 0x2e;
+ l3_miss_bits = (0x52ULL << 16) | (0x41 << 8) | 0x2e;
+ break;
+ default:
+ goto out;
+ }
+
+ local_irq_disable();
+ /*
+ * Call wrmsr direcly to avoid the local register variables from
+ * being overwritten due to reordering of their assignment with
+ * the wrmsr calls.
+ */
+ __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
+ /* Disable events and reset counters */
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, 0x0);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x0);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0, 0x0);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 1, 0x0);
+ if (l3_hit_bits > 0) {
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x0);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, 0x0);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 2, 0x0);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 3, 0x0);
+ }
+ /* Set and enable the L2 counters */
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, l2_hit_bits);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, l2_miss_bits);
+ if (l3_hit_bits > 0) {
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2,
+ l3_hit_bits);
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3,
+ l3_miss_bits);
+ }
+ mem_r = plr->kmem;
+ size = plr->size;
+ line_size = plr->line_size;
+ for (i = 0; i < size; i += line_size) {
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ /*
+ * Call wrmsr directly (no tracing) to not influence
+ * the cache access counters as they are disabled.
+ */
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0,
+ l2_hit_bits & ~(0x40ULL << 16));
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1,
+ l2_miss_bits & ~(0x40ULL << 16));
+ if (l3_hit_bits > 0) {
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2,
+ l3_hit_bits & ~(0x40ULL << 16));
+ pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3,
+ l3_miss_bits & ~(0x40ULL << 16));
+ }
+ l2_hits = native_read_pmc(0);
+ l2_miss = native_read_pmc(1);
+ if (l3_hit_bits > 0) {
+ l3_hits = native_read_pmc(2);
+ l3_miss = native_read_pmc(3);
+ }
+ wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
+ local_irq_enable();
+ /*
+ * On BDW we count references and misses, need to adjust. Sometimes
+ * the "hits" counter is a bit more than the references, for
+ * example, x references but x + 1 hits. To not report invalid
+ * hit values in this case we treat that as misses eaqual to
+ * references.
+ */
+ if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X)
+ l2_hits -= (l2_miss > l2_hits ? l2_hits : l2_miss);
+ trace_pseudo_lock_l2(l2_hits, l2_miss);
+ if (l3_hit_bits > 0) {
+ if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X)
+ l3_hits -= (l3_miss > l3_hits ? l3_hits : l3_miss);
+ trace_pseudo_lock_l3(l3_hits, l3_miss);
+ }
+
+out:
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+/**
+ * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region
+ *
+ * The measurement of latency to access a pseudo-locked region should be
+ * done from a cpu that is associated with that pseudo-locked region.
+ * Determine which cpu is associated with this region and start a thread on
+ * that cpu to perform the measurement, wait for that thread to complete.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
+{
+ struct pseudo_lock_region *plr = rdtgrp->plr;
+ struct task_struct *thread;
+ unsigned int cpu;
+ int ret = -1;
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ if (rdtgrp->flags & RDT_DELETED) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (!plr->d) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ plr->thread_done = 0;
+ cpu = cpumask_first(&plr->d->cpu_mask);
+ if (!cpu_online(cpu)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (sel == 1)
+ thread = kthread_create_on_node(measure_cycles_lat_fn, plr,
+ cpu_to_node(cpu),
+ "pseudo_lock_measure/%u",
+ cpu);
+ else if (sel == 2)
+ thread = kthread_create_on_node(measure_cycles_perf_fn, plr,
+ cpu_to_node(cpu),
+ "pseudo_lock_measure/%u",
+ cpu);
+ else
+ goto out;
+
+ if (IS_ERR(thread)) {
+ ret = PTR_ERR(thread);
+ goto out;
+ }
+ kthread_bind(thread, cpu);
+ wake_up_process(thread);
+
+ ret = wait_event_interruptible(plr->lock_thread_wq,
+ plr->thread_done == 1);
+ if (ret < 0)
+ goto out;
+
+ ret = 0;
+
+out:
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+ return ret;
+}
+
+static ssize_t pseudo_lock_measure_trigger(struct file *file,
+ const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct rdtgroup *rdtgrp = file->private_data;
+ size_t buf_size;
+ char buf[32];
+ int ret;
+ int sel;
+
+ buf_size = min(count, (sizeof(buf) - 1));
+ if (copy_from_user(buf, user_buf, buf_size))
+ return -EFAULT;
+
+ buf[buf_size] = '\0';
+ ret = kstrtoint(buf, 10, &sel);
+ if (ret == 0) {
+ if (sel != 1)
+ return -EINVAL;
+ ret = debugfs_file_get(file->f_path.dentry);
+ if (ret)
+ return ret;
+ ret = pseudo_lock_measure_cycles(rdtgrp, sel);
+ if (ret == 0)
+ ret = count;
+ debugfs_file_put(file->f_path.dentry);
+ }
+
+ return ret;
+}
+
+static const struct file_operations pseudo_measure_fops = {
+ .write = pseudo_lock_measure_trigger,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+/**
+ * rdtgroup_pseudo_lock_create - Create a pseudo-locked region
+ * @rdtgrp: resource group to which pseudo-lock region belongs
+ *
+ * Called when a resource group in the pseudo-locksetup mode receives a
+ * valid schemata that should be pseudo-locked. Since the resource group is
+ * in pseudo-locksetup mode the &struct pseudo_lock_region has already been
+ * allocated and initialized with the essential information. If a failure
+ * occurs the resource group remains in the pseudo-locksetup mode with the
+ * &struct pseudo_lock_region associated with it, but cleared from all
+ * information and ready for the user to re-attempt pseudo-locking by
+ * writing the schemata again.
+ *
+ * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0
+ * on failure. Descriptive error will be written to last_cmd_status buffer.
+ */
+int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
+{
+ struct pseudo_lock_region *plr = rdtgrp->plr;
+ struct task_struct *thread;
+ unsigned int new_minor;
+ struct device *dev;
+ int ret;
+
+ ret = pseudo_lock_region_alloc(plr);
+ if (ret < 0)
+ return ret;
+
+ ret = pseudo_lock_cstates_constrain(plr);
+ if (ret < 0) {
+ ret = -EINVAL;
+ goto out_region;
+ }
+
+ plr->thread_done = 0;
+
+ thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp,
+ cpu_to_node(plr->cpu),
+ "pseudo_lock/%u", plr->cpu);
+ if (IS_ERR(thread)) {
+ ret = PTR_ERR(thread);
+ rdt_last_cmd_printf("locking thread returned error %d\n", ret);
+ goto out_cstates;
+ }
+
+ kthread_bind(thread, plr->cpu);
+ wake_up_process(thread);
+
+ ret = wait_event_interruptible(plr->lock_thread_wq,
+ plr->thread_done == 1);
+ if (ret < 0) {
+ /*
+ * If the thread does not get on the CPU for whatever
+ * reason and the process which sets up the region is
+ * interrupted then this will leave the thread in runnable
+ * state and once it gets on the CPU it will derefence
+ * the cleared, but not freed, plr struct resulting in an
+ * empty pseudo-locking loop.
+ */
+ rdt_last_cmd_puts("locking thread interrupted\n");
+ goto out_cstates;
+ }
+
+ ret = pseudo_lock_minor_get(&new_minor);
+ if (ret < 0) {
+ rdt_last_cmd_puts("unable to obtain a new minor number\n");
+ goto out_cstates;
+ }
+
+ /*
+ * Unlock access but do not release the reference. The
+ * pseudo-locked region will still be here on return.
+ *
+ * The mutex has to be released temporarily to avoid a potential
+ * deadlock with the mm->mmap_sem semaphore which is obtained in
+ * the device_create() and debugfs_create_dir() callpath below
+ * as well as before the mmap() callback is called.
+ */
+ mutex_unlock(&rdtgroup_mutex);
+
+ if (!IS_ERR_OR_NULL(debugfs_resctrl)) {
+ plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name,
+ debugfs_resctrl);
+ if (!IS_ERR_OR_NULL(plr->debugfs_dir))
+ debugfs_create_file("pseudo_lock_measure", 0200,
+ plr->debugfs_dir, rdtgrp,
+ &pseudo_measure_fops);
+ }
+
+ dev = device_create(pseudo_lock_class, NULL,
+ MKDEV(pseudo_lock_major, new_minor),
+ rdtgrp, "%s", rdtgrp->kn->name);
+
+ mutex_lock(&rdtgroup_mutex);
+
+ if (IS_ERR(dev)) {
+ ret = PTR_ERR(dev);
+ rdt_last_cmd_printf("failed to create character device: %d\n",
+ ret);
+ goto out_debugfs;
+ }
+
+ /* We released the mutex - check if group was removed while we did so */
+ if (rdtgrp->flags & RDT_DELETED) {
+ ret = -ENODEV;
+ goto out_device;
+ }
+
+ plr->minor = new_minor;
+
+ rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED;
+ closid_free(rdtgrp->closid);
+ rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444);
+ rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444);
+
+ ret = 0;
+ goto out;
+
+out_device:
+ device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor));
+out_debugfs:
+ debugfs_remove_recursive(plr->debugfs_dir);
+ pseudo_lock_minor_release(new_minor);
+out_cstates:
+ pseudo_lock_cstates_relax(plr);
+out_region:
+ pseudo_lock_region_clear(plr);
+out:
+ return ret;
+}
+
+/**
+ * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region
+ * @rdtgrp: resource group to which the pseudo-locked region belongs
+ *
+ * The removal of a pseudo-locked region can be initiated when the resource
+ * group is removed from user space via a "rmdir" from userspace or the
+ * unmount of the resctrl filesystem. On removal the resource group does
+ * not go back to pseudo-locksetup mode before it is removed, instead it is
+ * removed directly. There is thus assymmetry with the creation where the
+ * &struct pseudo_lock_region is removed here while it was not created in
+ * rdtgroup_pseudo_lock_create().
+ *
+ * Return: void
+ */
+void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp)
+{
+ struct pseudo_lock_region *plr = rdtgrp->plr;
+
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ /*
+ * Default group cannot be a pseudo-locked region so we can
+ * free closid here.
+ */
+ closid_free(rdtgrp->closid);
+ goto free;
+ }
+
+ pseudo_lock_cstates_relax(plr);
+ debugfs_remove_recursive(rdtgrp->plr->debugfs_dir);
+ device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor));
+ pseudo_lock_minor_release(plr->minor);
+
+free:
+ pseudo_lock_free(rdtgrp);
+}
+
+static int pseudo_lock_dev_open(struct inode *inode, struct file *filp)
+{
+ struct rdtgroup *rdtgrp;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ rdtgrp = region_find_by_minor(iminor(inode));
+ if (!rdtgrp) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -ENODEV;
+ }
+
+ filp->private_data = rdtgrp;
+ atomic_inc(&rdtgrp->waitcount);
+ /* Perform a non-seekable open - llseek is not supported */
+ filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
+
+ mutex_unlock(&rdtgroup_mutex);
+
+ return 0;
+}
+
+static int pseudo_lock_dev_release(struct inode *inode, struct file *filp)
+{
+ struct rdtgroup *rdtgrp;
+
+ mutex_lock(&rdtgroup_mutex);
+ rdtgrp = filp->private_data;
+ WARN_ON(!rdtgrp);
+ if (!rdtgrp) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -ENODEV;
+ }
+ filp->private_data = NULL;
+ atomic_dec(&rdtgrp->waitcount);
+ mutex_unlock(&rdtgroup_mutex);
+ return 0;
+}
+
+static int pseudo_lock_dev_mremap(struct vm_area_struct *area)
+{
+ /* Not supported */
+ return -EINVAL;
+}
+
+static const struct vm_operations_struct pseudo_mmap_ops = {
+ .mremap = pseudo_lock_dev_mremap,
+};
+
+static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ unsigned long vsize = vma->vm_end - vma->vm_start;
+ unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
+ struct pseudo_lock_region *plr;
+ struct rdtgroup *rdtgrp;
+ unsigned long physical;
+ unsigned long psize;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ rdtgrp = filp->private_data;
+ WARN_ON(!rdtgrp);
+ if (!rdtgrp) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -ENODEV;
+ }
+
+ plr = rdtgrp->plr;
+
+ if (!plr->d) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -ENODEV;
+ }
+
+ /*
+ * Task is required to run with affinity to the cpus associated
+ * with the pseudo-locked region. If this is not the case the task
+ * may be scheduled elsewhere and invalidate entries in the
+ * pseudo-locked region.
+ */
+ if (!cpumask_subset(&current->cpus_allowed, &plr->d->cpu_mask)) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -EINVAL;
+ }
+
+ physical = __pa(plr->kmem) >> PAGE_SHIFT;
+ psize = plr->size - off;
+
+ if (off > plr->size) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -ENOSPC;
+ }
+
+ /*
+ * Ensure changes are carried directly to the memory being mapped,
+ * do not allow copy-on-write mapping.
+ */
+ if (!(vma->vm_flags & VM_SHARED)) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -EINVAL;
+ }
+
+ if (vsize > psize) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -ENOSPC;
+ }
+
+ memset(plr->kmem + off, 0, vsize);
+
+ if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff,
+ vsize, vma->vm_page_prot)) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -EAGAIN;
+ }
+ vma->vm_ops = &pseudo_mmap_ops;
+ mutex_unlock(&rdtgroup_mutex);
+ return 0;
+}
+
+static const struct file_operations pseudo_lock_dev_fops = {
+ .owner = THIS_MODULE,
+ .llseek = no_llseek,
+ .read = NULL,
+ .write = NULL,
+ .open = pseudo_lock_dev_open,
+ .release = pseudo_lock_dev_release,
+ .mmap = pseudo_lock_dev_mmap,
+};
+
+static char *pseudo_lock_devnode(struct device *dev, umode_t *mode)
+{
+ struct rdtgroup *rdtgrp;
+
+ rdtgrp = dev_get_drvdata(dev);
+ if (mode)
+ *mode = 0600;
+ return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name);
+}
+
+int rdt_pseudo_lock_init(void)
+{
+ int ret;
+
+ ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops);
+ if (ret < 0)
+ return ret;
+
+ pseudo_lock_major = ret;
+
+ pseudo_lock_class = class_create(THIS_MODULE, "pseudo_lock");
+ if (IS_ERR(pseudo_lock_class)) {
+ ret = PTR_ERR(pseudo_lock_class);
+ unregister_chrdev(pseudo_lock_major, "pseudo_lock");
+ return ret;
+ }
+
+ pseudo_lock_class->devnode = pseudo_lock_devnode;
+ return 0;
+}
+
+void rdt_pseudo_lock_release(void)
+{
+ class_destroy(pseudo_lock_class);
+ pseudo_lock_class = NULL;
+ unregister_chrdev(pseudo_lock_major, "pseudo_lock");
+ pseudo_lock_major = 0;
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h
new file mode 100644
index 000000000..2c041e6d9
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM resctrl
+
+#if !defined(_TRACE_PSEUDO_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PSEUDO_LOCK_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(pseudo_lock_mem_latency,
+ TP_PROTO(u32 latency),
+ TP_ARGS(latency),
+ TP_STRUCT__entry(__field(u32, latency)),
+ TP_fast_assign(__entry->latency = latency),
+ TP_printk("latency=%u", __entry->latency)
+ );
+
+TRACE_EVENT(pseudo_lock_l2,
+ TP_PROTO(u64 l2_hits, u64 l2_miss),
+ TP_ARGS(l2_hits, l2_miss),
+ TP_STRUCT__entry(__field(u64, l2_hits)
+ __field(u64, l2_miss)),
+ TP_fast_assign(__entry->l2_hits = l2_hits;
+ __entry->l2_miss = l2_miss;),
+ TP_printk("hits=%llu miss=%llu",
+ __entry->l2_hits, __entry->l2_miss));
+
+TRACE_EVENT(pseudo_lock_l3,
+ TP_PROTO(u64 l3_hits, u64 l3_miss),
+ TP_ARGS(l3_hits, l3_miss),
+ TP_STRUCT__entry(__field(u64, l3_hits)
+ __field(u64, l3_miss)),
+ TP_fast_assign(__entry->l3_hits = l3_hits;
+ __entry->l3_miss = l3_miss;),
+ TP_printk("hits=%llu miss=%llu",
+ __entry->l3_hits, __entry->l3_miss));
+
+#endif /* _TRACE_PSEUDO_LOCK_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE intel_rdt_pseudo_lock_event
+#include <trace/define_trace.h>
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
new file mode 100644
index 000000000..f406e3b85
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -0,0 +1,3041 @@
+/*
+ * User interface for Resource Alloction in Resource Director Technology(RDT)
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Fenghua Yu <fenghua.yu@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cacheinfo.h>
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/kernfs.h>
+#include <linux/seq_buf.h>
+#include <linux/seq_file.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/task_work.h>
+
+#include <uapi/linux/magic.h>
+
+#include <asm/intel_rdt_sched.h>
+#include "intel_rdt.h"
+
+DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
+DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
+DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+static struct kernfs_root *rdt_root;
+struct rdtgroup rdtgroup_default;
+LIST_HEAD(rdt_all_groups);
+
+/* Kernel fs node for "info" directory under root */
+static struct kernfs_node *kn_info;
+
+/* Kernel fs node for "mon_groups" directory under root */
+static struct kernfs_node *kn_mongrp;
+
+/* Kernel fs node for "mon_data" directory under root */
+static struct kernfs_node *kn_mondata;
+
+static struct seq_buf last_cmd_status;
+static char last_cmd_status_buf[512];
+
+struct dentry *debugfs_resctrl;
+
+void rdt_last_cmd_clear(void)
+{
+ lockdep_assert_held(&rdtgroup_mutex);
+ seq_buf_clear(&last_cmd_status);
+}
+
+void rdt_last_cmd_puts(const char *s)
+{
+ lockdep_assert_held(&rdtgroup_mutex);
+ seq_buf_puts(&last_cmd_status, s);
+}
+
+void rdt_last_cmd_printf(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ lockdep_assert_held(&rdtgroup_mutex);
+ seq_buf_vprintf(&last_cmd_status, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
+ * we can keep a bitmap of free CLOSIDs in a single integer.
+ *
+ * Using a global CLOSID across all resources has some advantages and
+ * some drawbacks:
+ * + We can simply set "current->closid" to assign a task to a resource
+ * group.
+ * + Context switch code can avoid extra memory references deciding which
+ * CLOSID to load into the PQR_ASSOC MSR
+ * - We give up some options in configuring resource groups across multi-socket
+ * systems.
+ * - Our choices on how to configure each resource become progressively more
+ * limited as the number of resources grows.
+ */
+static int closid_free_map;
+static int closid_free_map_len;
+
+int closids_supported(void)
+{
+ return closid_free_map_len;
+}
+
+static void closid_init(void)
+{
+ struct rdt_resource *r;
+ int rdt_min_closid = 32;
+
+ /* Compute rdt_min_closid across all resources */
+ for_each_alloc_enabled_rdt_resource(r)
+ rdt_min_closid = min(rdt_min_closid, r->num_closid);
+
+ closid_free_map = BIT_MASK(rdt_min_closid) - 1;
+
+ /* CLOSID 0 is always reserved for the default group */
+ closid_free_map &= ~1;
+ closid_free_map_len = rdt_min_closid;
+}
+
+static int closid_alloc(void)
+{
+ u32 closid = ffs(closid_free_map);
+
+ if (closid == 0)
+ return -ENOSPC;
+ closid--;
+ closid_free_map &= ~(1 << closid);
+
+ return closid;
+}
+
+void closid_free(int closid)
+{
+ closid_free_map |= 1 << closid;
+}
+
+/**
+ * closid_allocated - test if provided closid is in use
+ * @closid: closid to be tested
+ *
+ * Return: true if @closid is currently associated with a resource group,
+ * false if @closid is free
+ */
+static bool closid_allocated(unsigned int closid)
+{
+ return (closid_free_map & (1 << closid)) == 0;
+}
+
+/**
+ * rdtgroup_mode_by_closid - Return mode of resource group with closid
+ * @closid: closid if the resource group
+ *
+ * Each resource group is associated with a @closid. Here the mode
+ * of a resource group can be queried by searching for it using its closid.
+ *
+ * Return: mode as &enum rdtgrp_mode of resource group with closid @closid
+ */
+enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
+{
+ struct rdtgroup *rdtgrp;
+
+ list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
+ if (rdtgrp->closid == closid)
+ return rdtgrp->mode;
+ }
+
+ return RDT_NUM_MODES;
+}
+
+static const char * const rdt_mode_str[] = {
+ [RDT_MODE_SHAREABLE] = "shareable",
+ [RDT_MODE_EXCLUSIVE] = "exclusive",
+ [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup",
+ [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked",
+};
+
+/**
+ * rdtgroup_mode_str - Return the string representation of mode
+ * @mode: the resource group mode as &enum rdtgroup_mode
+ *
+ * Return: string representation of valid mode, "unknown" otherwise
+ */
+static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
+{
+ if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES)
+ return "unknown";
+
+ return rdt_mode_str[mode];
+}
+
+/* set uid and gid of rdtgroup dirs and files to that of the creator */
+static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
+{
+ struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = current_fsuid(),
+ .ia_gid = current_fsgid(), };
+
+ if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
+ gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
+ return 0;
+
+ return kernfs_setattr(kn, &iattr);
+}
+
+static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
+{
+ struct kernfs_node *kn;
+ int ret;
+
+ kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+ 0, rft->kf_ops, rft, NULL, NULL);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret) {
+ kernfs_remove(kn);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+ struct kernfs_open_file *of = m->private;
+ struct rftype *rft = of->kn->priv;
+
+ if (rft->seq_show)
+ return rft->seq_show(of, m, arg);
+ return 0;
+}
+
+static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct rftype *rft = of->kn->priv;
+
+ if (rft->write)
+ return rft->write(of, buf, nbytes, off);
+
+ return -EINVAL;
+}
+
+static struct kernfs_ops rdtgroup_kf_single_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = rdtgroup_file_write,
+ .seq_show = rdtgroup_seqfile_show,
+};
+
+static struct kernfs_ops kf_mondata_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .seq_show = rdtgroup_mondata_show,
+};
+
+static bool is_cpu_list(struct kernfs_open_file *of)
+{
+ struct rftype *rft = of->kn->priv;
+
+ return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
+}
+
+static int rdtgroup_cpus_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ struct cpumask *mask;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+ if (rdtgrp) {
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+ if (!rdtgrp->plr->d) {
+ rdt_last_cmd_clear();
+ rdt_last_cmd_puts("Cache domain offline\n");
+ ret = -ENODEV;
+ } else {
+ mask = &rdtgrp->plr->d->cpu_mask;
+ seq_printf(s, is_cpu_list(of) ?
+ "%*pbl\n" : "%*pb\n",
+ cpumask_pr_args(mask));
+ }
+ } else {
+ seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
+ cpumask_pr_args(&rdtgrp->cpu_mask));
+ }
+ } else {
+ ret = -ENOENT;
+ }
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+/*
+ * This is safe against intel_rdt_sched_in() called from __switch_to()
+ * because __switch_to() is executed with interrupts disabled. A local call
+ * from update_closid_rmid() is proteced against __switch_to() because
+ * preemption is disabled.
+ */
+static void update_cpu_closid_rmid(void *info)
+{
+ struct rdtgroup *r = info;
+
+ if (r) {
+ this_cpu_write(pqr_state.default_closid, r->closid);
+ this_cpu_write(pqr_state.default_rmid, r->mon.rmid);
+ }
+
+ /*
+ * We cannot unconditionally write the MSR because the current
+ * executing task might have its own closid selected. Just reuse
+ * the context switch code.
+ */
+ intel_rdt_sched_in();
+}
+
+/*
+ * Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
+ *
+ * Per task closids/rmids must have been set up before calling this function.
+ */
+static void
+update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
+{
+ int cpu = get_cpu();
+
+ if (cpumask_test_cpu(cpu, cpu_mask))
+ update_cpu_closid_rmid(r);
+ smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1);
+ put_cpu();
+}
+
+static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
+ cpumask_var_t tmpmask)
+{
+ struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
+ struct list_head *head;
+
+ /* Check whether cpus belong to parent ctrl group */
+ cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
+ if (cpumask_weight(tmpmask)) {
+ rdt_last_cmd_puts("can only add CPUs to mongroup that belong to parent\n");
+ return -EINVAL;
+ }
+
+ /* Check whether cpus are dropped from this group */
+ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
+ if (cpumask_weight(tmpmask)) {
+ /* Give any dropped cpus to parent rdtgroup */
+ cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
+ update_closid_rmid(tmpmask, prgrp);
+ }
+
+ /*
+ * If we added cpus, remove them from previous group that owned them
+ * and update per-cpu rmid
+ */
+ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
+ if (cpumask_weight(tmpmask)) {
+ head = &prgrp->mon.crdtgrp_list;
+ list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
+ if (crgrp == rdtgrp)
+ continue;
+ cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
+ tmpmask);
+ }
+ update_closid_rmid(tmpmask, rdtgrp);
+ }
+
+ /* Done pushing/pulling - update this group with new mask */
+ cpumask_copy(&rdtgrp->cpu_mask, newmask);
+
+ return 0;
+}
+
+static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
+{
+ struct rdtgroup *crgrp;
+
+ cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
+ /* update the child mon group masks as well*/
+ list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
+ cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
+}
+
+static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
+ cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
+{
+ struct rdtgroup *r, *crgrp;
+ struct list_head *head;
+
+ /* Check whether cpus are dropped from this group */
+ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
+ if (cpumask_weight(tmpmask)) {
+ /* Can't drop from default group */
+ if (rdtgrp == &rdtgroup_default) {
+ rdt_last_cmd_puts("Can't drop CPUs from default group\n");
+ return -EINVAL;
+ }
+
+ /* Give any dropped cpus to rdtgroup_default */
+ cpumask_or(&rdtgroup_default.cpu_mask,
+ &rdtgroup_default.cpu_mask, tmpmask);
+ update_closid_rmid(tmpmask, &rdtgroup_default);
+ }
+
+ /*
+ * If we added cpus, remove them from previous group and
+ * the prev group's child groups that owned them
+ * and update per-cpu closid/rmid.
+ */
+ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
+ if (cpumask_weight(tmpmask)) {
+ list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
+ if (r == rdtgrp)
+ continue;
+ cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
+ if (cpumask_weight(tmpmask1))
+ cpumask_rdtgrp_clear(r, tmpmask1);
+ }
+ update_closid_rmid(tmpmask, rdtgrp);
+ }
+
+ /* Done pushing/pulling - update this group with new mask */
+ cpumask_copy(&rdtgrp->cpu_mask, newmask);
+
+ /*
+ * Clear child mon group masks since there is a new parent mask
+ * now and update the rmid for the cpus the child lost.
+ */
+ head = &rdtgrp->mon.crdtgrp_list;
+ list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
+ cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
+ update_closid_rmid(tmpmask, rdtgrp);
+ cpumask_clear(&crgrp->cpu_mask);
+ }
+
+ return 0;
+}
+
+static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ cpumask_var_t tmpmask, newmask, tmpmask1;
+ struct rdtgroup *rdtgrp;
+ int ret;
+
+ if (!buf)
+ return -EINVAL;
+
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
+ if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
+ free_cpumask_var(tmpmask);
+ return -ENOMEM;
+ }
+ if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
+ free_cpumask_var(tmpmask);
+ free_cpumask_var(newmask);
+ return -ENOMEM;
+ }
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ rdt_last_cmd_clear();
+ if (!rdtgrp) {
+ ret = -ENOENT;
+ rdt_last_cmd_puts("directory was removed\n");
+ goto unlock;
+ }
+
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ ret = -EINVAL;
+ rdt_last_cmd_puts("pseudo-locking in progress\n");
+ goto unlock;
+ }
+
+ if (is_cpu_list(of))
+ ret = cpulist_parse(buf, newmask);
+ else
+ ret = cpumask_parse(buf, newmask);
+
+ if (ret) {
+ rdt_last_cmd_puts("bad cpu list/mask\n");
+ goto unlock;
+ }
+
+ /* check that user didn't specify any offline cpus */
+ cpumask_andnot(tmpmask, newmask, cpu_online_mask);
+ if (cpumask_weight(tmpmask)) {
+ ret = -EINVAL;
+ rdt_last_cmd_puts("can only assign online cpus\n");
+ goto unlock;
+ }
+
+ if (rdtgrp->type == RDTCTRL_GROUP)
+ ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
+ else if (rdtgrp->type == RDTMON_GROUP)
+ ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
+ else
+ ret = -EINVAL;
+
+unlock:
+ rdtgroup_kn_unlock(of->kn);
+ free_cpumask_var(tmpmask);
+ free_cpumask_var(newmask);
+ free_cpumask_var(tmpmask1);
+
+ return ret ?: nbytes;
+}
+
+/**
+ * rdtgroup_remove - the helper to remove resource group safely
+ * @rdtgrp: resource group to remove
+ *
+ * On resource group creation via a mkdir, an extra kernfs_node reference is
+ * taken to ensure that the rdtgroup structure remains accessible for the
+ * rdtgroup_kn_unlock() calls where it is removed.
+ *
+ * Drop the extra reference here, then free the rdtgroup structure.
+ *
+ * Return: void
+ */
+static void rdtgroup_remove(struct rdtgroup *rdtgrp)
+{
+ kernfs_put(rdtgrp->kn);
+ kfree(rdtgrp);
+}
+
+static void _update_task_closid_rmid(void *task)
+{
+ /*
+ * If the task is still current on this CPU, update PQR_ASSOC MSR.
+ * Otherwise, the MSR is updated when the task is scheduled in.
+ */
+ if (task == current)
+ intel_rdt_sched_in();
+}
+
+static void update_task_closid_rmid(struct task_struct *t)
+{
+ if (IS_ENABLED(CONFIG_SMP) && task_curr(t))
+ smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1);
+ else
+ _update_task_closid_rmid(t);
+}
+
+static int __rdtgroup_move_task(struct task_struct *tsk,
+ struct rdtgroup *rdtgrp)
+{
+ /* If the task is already in rdtgrp, no need to move the task. */
+ if ((rdtgrp->type == RDTCTRL_GROUP && tsk->closid == rdtgrp->closid &&
+ tsk->rmid == rdtgrp->mon.rmid) ||
+ (rdtgrp->type == RDTMON_GROUP && tsk->rmid == rdtgrp->mon.rmid &&
+ tsk->closid == rdtgrp->mon.parent->closid))
+ return 0;
+
+ /*
+ * Set the task's closid/rmid before the PQR_ASSOC MSR can be
+ * updated by them.
+ *
+ * For ctrl_mon groups, move both closid and rmid.
+ * For monitor groups, can move the tasks only from
+ * their parent CTRL group.
+ */
+
+ if (rdtgrp->type == RDTCTRL_GROUP) {
+ tsk->closid = rdtgrp->closid;
+ tsk->rmid = rdtgrp->mon.rmid;
+ } else if (rdtgrp->type == RDTMON_GROUP) {
+ if (rdtgrp->mon.parent->closid == tsk->closid) {
+ tsk->rmid = rdtgrp->mon.rmid;
+ } else {
+ rdt_last_cmd_puts("Can't move task to different control group\n");
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Ensure the task's closid and rmid are written before determining if
+ * the task is current that will decide if it will be interrupted.
+ */
+ barrier();
+
+ /*
+ * By now, the task's closid and rmid are set. If the task is current
+ * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
+ * group go into effect. If the task is not current, the MSR will be
+ * updated when the task is scheduled in.
+ */
+ update_task_closid_rmid(tsk);
+
+ return 0;
+}
+
+/**
+ * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
+ * @r: Resource group
+ *
+ * Return: 1 if tasks have been assigned to @r, 0 otherwise
+ */
+int rdtgroup_tasks_assigned(struct rdtgroup *r)
+{
+ struct task_struct *p, *t;
+ int ret = 0;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ rcu_read_lock();
+ for_each_process_thread(p, t) {
+ if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) ||
+ (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) {
+ ret = 1;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int rdtgroup_task_write_permission(struct task_struct *task,
+ struct kernfs_open_file *of)
+{
+ const struct cred *tcred = get_task_cred(task);
+ const struct cred *cred = current_cred();
+ int ret = 0;
+
+ /*
+ * Even if we're attaching all tasks in the thread group, we only
+ * need to check permissions on one of them.
+ */
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid)) {
+ rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
+ ret = -EPERM;
+ }
+
+ put_cred(tcred);
+ return ret;
+}
+
+static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
+ struct kernfs_open_file *of)
+{
+ struct task_struct *tsk;
+ int ret;
+
+ rcu_read_lock();
+ if (pid) {
+ tsk = find_task_by_vpid(pid);
+ if (!tsk) {
+ rcu_read_unlock();
+ rdt_last_cmd_printf("No task %d\n", pid);
+ return -ESRCH;
+ }
+ } else {
+ tsk = current;
+ }
+
+ get_task_struct(tsk);
+ rcu_read_unlock();
+
+ ret = rdtgroup_task_write_permission(tsk, of);
+ if (!ret)
+ ret = __rdtgroup_move_task(tsk, rdtgrp);
+
+ put_task_struct(tsk);
+ return ret;
+}
+
+static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+ pid_t pid;
+
+ if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
+ return -EINVAL;
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+ rdt_last_cmd_clear();
+
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ ret = -EINVAL;
+ rdt_last_cmd_puts("pseudo-locking in progress\n");
+ goto unlock;
+ }
+
+ ret = rdtgroup_move_task(pid, rdtgrp, of);
+
+unlock:
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
+static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
+{
+ struct task_struct *p, *t;
+
+ rcu_read_lock();
+ for_each_process_thread(p, t) {
+ if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) ||
+ (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid))
+ seq_printf(s, "%d\n", t->pid);
+ }
+ rcu_read_unlock();
+}
+
+static int rdtgroup_tasks_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (rdtgrp)
+ show_rdt_tasks(rdtgrp, s);
+ else
+ ret = -ENOENT;
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ int len;
+
+ mutex_lock(&rdtgroup_mutex);
+ len = seq_buf_used(&last_cmd_status);
+ if (len)
+ seq_printf(seq, "%.*s", len, last_cmd_status_buf);
+ else
+ seq_puts(seq, "ok\n");
+ mutex_unlock(&rdtgroup_mutex);
+ return 0;
+}
+
+static int rdt_num_closids_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%d\n", r->num_closid);
+ return 0;
+}
+
+static int rdt_default_ctrl_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%x\n", r->default_ctrl);
+ return 0;
+}
+
+static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
+ return 0;
+}
+
+static int rdt_shareable_bits_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%x\n", r->cache.shareable_bits);
+ return 0;
+}
+
+/**
+ * rdt_bit_usage_show - Display current usage of resources
+ *
+ * A domain is a shared resource that can now be allocated differently. Here
+ * we display the current regions of the domain as an annotated bitmask.
+ * For each domain of this resource its allocation bitmask
+ * is annotated as below to indicate the current usage of the corresponding bit:
+ * 0 - currently unused
+ * X - currently available for sharing and used by software and hardware
+ * H - currently used by hardware only but available for software use
+ * S - currently used and shareable by software only
+ * E - currently used exclusively by one resource group
+ * P - currently pseudo-locked by one resource group
+ */
+static int rdt_bit_usage_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+ /*
+ * Use unsigned long even though only 32 bits are used to ensure
+ * test_bit() is used safely.
+ */
+ unsigned long sw_shareable = 0, hw_shareable = 0;
+ unsigned long exclusive = 0, pseudo_locked = 0;
+ struct rdt_domain *dom;
+ int i, hwb, swb, excl, psl;
+ enum rdtgrp_mode mode;
+ bool sep = false;
+ u32 *ctrl;
+
+ mutex_lock(&rdtgroup_mutex);
+ hw_shareable = r->cache.shareable_bits;
+ list_for_each_entry(dom, &r->domains, list) {
+ if (sep)
+ seq_putc(seq, ';');
+ ctrl = dom->ctrl_val;
+ sw_shareable = 0;
+ exclusive = 0;
+ seq_printf(seq, "%d=", dom->id);
+ for (i = 0; i < closids_supported(); i++, ctrl++) {
+ if (!closid_allocated(i))
+ continue;
+ mode = rdtgroup_mode_by_closid(i);
+ switch (mode) {
+ case RDT_MODE_SHAREABLE:
+ sw_shareable |= *ctrl;
+ break;
+ case RDT_MODE_EXCLUSIVE:
+ exclusive |= *ctrl;
+ break;
+ case RDT_MODE_PSEUDO_LOCKSETUP:
+ /*
+ * RDT_MODE_PSEUDO_LOCKSETUP is possible
+ * here but not included since the CBM
+ * associated with this CLOSID in this mode
+ * is not initialized and no task or cpu can be
+ * assigned this CLOSID.
+ */
+ break;
+ case RDT_MODE_PSEUDO_LOCKED:
+ case RDT_NUM_MODES:
+ WARN(1,
+ "invalid mode for closid %d\n", i);
+ break;
+ }
+ }
+ for (i = r->cache.cbm_len - 1; i >= 0; i--) {
+ pseudo_locked = dom->plr ? dom->plr->cbm : 0;
+ hwb = test_bit(i, &hw_shareable);
+ swb = test_bit(i, &sw_shareable);
+ excl = test_bit(i, &exclusive);
+ psl = test_bit(i, &pseudo_locked);
+ if (hwb && swb)
+ seq_putc(seq, 'X');
+ else if (hwb && !swb)
+ seq_putc(seq, 'H');
+ else if (!hwb && swb)
+ seq_putc(seq, 'S');
+ else if (excl)
+ seq_putc(seq, 'E');
+ else if (psl)
+ seq_putc(seq, 'P');
+ else /* Unused bits remain */
+ seq_putc(seq, '0');
+ }
+ sep = true;
+ }
+ seq_putc(seq, '\n');
+ mutex_unlock(&rdtgroup_mutex);
+ return 0;
+}
+
+static int rdt_min_bw_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%u\n", r->membw.min_bw);
+ return 0;
+}
+
+static int rdt_num_rmids_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%d\n", r->num_rmid);
+
+ return 0;
+}
+
+static int rdt_mon_features_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+ struct mon_evt *mevt;
+
+ list_for_each_entry(mevt, &r->evt_list, list)
+ seq_printf(seq, "%s\n", mevt->name);
+
+ return 0;
+}
+
+static int rdt_bw_gran_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%u\n", r->membw.bw_gran);
+ return 0;
+}
+
+static int rdt_delay_linear_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%u\n", r->membw.delay_linear);
+ return 0;
+}
+
+static int max_threshold_occ_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale);
+
+ return 0;
+}
+
+static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+ unsigned int bytes;
+ int ret;
+
+ ret = kstrtouint(buf, 0, &bytes);
+ if (ret)
+ return ret;
+
+ if (bytes > (boot_cpu_data.x86_cache_size * 1024))
+ return -EINVAL;
+
+ intel_cqm_threshold = bytes / r->mon_scale;
+
+ return nbytes;
+}
+
+/*
+ * rdtgroup_mode_show - Display mode of this resource group
+ */
+static int rdtgroup_mode_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+
+ seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
+
+ rdtgroup_kn_unlock(of->kn);
+ return 0;
+}
+
+/**
+ * rdt_cdp_peer_get - Retrieve CDP peer if it exists
+ * @r: RDT resource to which RDT domain @d belongs
+ * @d: Cache instance for which a CDP peer is requested
+ * @r_cdp: RDT resource that shares hardware with @r (RDT resource peer)
+ * Used to return the result.
+ * @d_cdp: RDT domain that shares hardware with @d (RDT domain peer)
+ * Used to return the result.
+ *
+ * RDT resources are managed independently and by extension the RDT domains
+ * (RDT resource instances) are managed independently also. The Code and
+ * Data Prioritization (CDP) RDT resources, while managed independently,
+ * could refer to the same underlying hardware. For example,
+ * RDT_RESOURCE_L2CODE and RDT_RESOURCE_L2DATA both refer to the L2 cache.
+ *
+ * When provided with an RDT resource @r and an instance of that RDT
+ * resource @d rdt_cdp_peer_get() will return if there is a peer RDT
+ * resource and the exact instance that shares the same hardware.
+ *
+ * Return: 0 if a CDP peer was found, <0 on error or if no CDP peer exists.
+ * If a CDP peer was found, @r_cdp will point to the peer RDT resource
+ * and @d_cdp will point to the peer RDT domain.
+ */
+static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d,
+ struct rdt_resource **r_cdp,
+ struct rdt_domain **d_cdp)
+{
+ struct rdt_resource *_r_cdp = NULL;
+ struct rdt_domain *_d_cdp = NULL;
+ int ret = 0;
+
+ switch (r->rid) {
+ case RDT_RESOURCE_L3DATA:
+ _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3CODE];
+ break;
+ case RDT_RESOURCE_L3CODE:
+ _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3DATA];
+ break;
+ case RDT_RESOURCE_L2DATA:
+ _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2CODE];
+ break;
+ case RDT_RESOURCE_L2CODE:
+ _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2DATA];
+ break;
+ default:
+ ret = -ENOENT;
+ goto out;
+ }
+
+ /*
+ * When a new CPU comes online and CDP is enabled then the new
+ * RDT domains (if any) associated with both CDP RDT resources
+ * are added in the same CPU online routine while the
+ * rdtgroup_mutex is held. It should thus not happen for one
+ * RDT domain to exist and be associated with its RDT CDP
+ * resource but there is no RDT domain associated with the
+ * peer RDT CDP resource. Hence the WARN.
+ */
+ _d_cdp = rdt_find_domain(_r_cdp, d->id, NULL);
+ if (WARN_ON(IS_ERR_OR_NULL(_d_cdp))) {
+ _r_cdp = NULL;
+ _d_cdp = NULL;
+ ret = -EINVAL;
+ }
+
+out:
+ *r_cdp = _r_cdp;
+ *d_cdp = _d_cdp;
+
+ return ret;
+}
+
+/**
+ * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
+ * @r: Resource to which domain instance @d belongs.
+ * @d: The domain instance for which @closid is being tested.
+ * @cbm: Capacity bitmask being tested.
+ * @closid: Intended closid for @cbm.
+ * @exclusive: Only check if overlaps with exclusive resource groups
+ *
+ * Checks if provided @cbm intended to be used for @closid on domain
+ * @d overlaps with any other closids or other hardware usage associated
+ * with this domain. If @exclusive is true then only overlaps with
+ * resource groups in exclusive mode will be considered. If @exclusive
+ * is false then overlaps with any resource group or hardware entities
+ * will be considered.
+ *
+ * @cbm is unsigned long, even if only 32 bits are used, to make the
+ * bitmap functions work correctly.
+ *
+ * Return: false if CBM does not overlap, true if it does.
+ */
+static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
+ unsigned long cbm, int closid, bool exclusive)
+{
+ enum rdtgrp_mode mode;
+ unsigned long ctrl_b;
+ u32 *ctrl;
+ int i;
+
+ /* Check for any overlap with regions used by hardware directly */
+ if (!exclusive) {
+ ctrl_b = r->cache.shareable_bits;
+ if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len))
+ return true;
+ }
+
+ /* Check for overlap with other resource groups */
+ ctrl = d->ctrl_val;
+ for (i = 0; i < closids_supported(); i++, ctrl++) {
+ ctrl_b = *ctrl;
+ mode = rdtgroup_mode_by_closid(i);
+ if (closid_allocated(i) && i != closid &&
+ mode != RDT_MODE_PSEUDO_LOCKSETUP) {
+ if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) {
+ if (exclusive) {
+ if (mode == RDT_MODE_EXCLUSIVE)
+ return true;
+ continue;
+ }
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/**
+ * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware
+ * @r: Resource to which domain instance @d belongs.
+ * @d: The domain instance for which @closid is being tested.
+ * @cbm: Capacity bitmask being tested.
+ * @closid: Intended closid for @cbm.
+ * @exclusive: Only check if overlaps with exclusive resource groups
+ *
+ * Resources that can be allocated using a CBM can use the CBM to control
+ * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test
+ * for overlap. Overlap test is not limited to the specific resource for
+ * which the CBM is intended though - when dealing with CDP resources that
+ * share the underlying hardware the overlap check should be performed on
+ * the CDP resource sharing the hardware also.
+ *
+ * Refer to description of __rdtgroup_cbm_overlaps() for the details of the
+ * overlap test.
+ *
+ * Return: true if CBM overlap detected, false if there is no overlap
+ */
+bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
+ unsigned long cbm, int closid, bool exclusive)
+{
+ struct rdt_resource *r_cdp;
+ struct rdt_domain *d_cdp;
+
+ if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, exclusive))
+ return true;
+
+ if (rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp) < 0)
+ return false;
+
+ return __rdtgroup_cbm_overlaps(r_cdp, d_cdp, cbm, closid, exclusive);
+}
+
+/**
+ * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
+ *
+ * An exclusive resource group implies that there should be no sharing of
+ * its allocated resources. At the time this group is considered to be
+ * exclusive this test can determine if its current schemata supports this
+ * setting by testing for overlap with all other resource groups.
+ *
+ * Return: true if resource group can be exclusive, false if there is overlap
+ * with allocations of other resource groups and thus this resource group
+ * cannot be exclusive.
+ */
+static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
+{
+ int closid = rdtgrp->closid;
+ struct rdt_resource *r;
+ bool has_cache = false;
+ struct rdt_domain *d;
+
+ for_each_alloc_enabled_rdt_resource(r) {
+ if (r->rid == RDT_RESOURCE_MBA)
+ continue;
+ has_cache = true;
+ list_for_each_entry(d, &r->domains, list) {
+ if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid],
+ rdtgrp->closid, false)) {
+ rdt_last_cmd_puts("schemata overlaps\n");
+ return false;
+ }
+ }
+ }
+
+ if (!has_cache) {
+ rdt_last_cmd_puts("cannot be exclusive without CAT/CDP\n");
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * rdtgroup_mode_write - Modify the resource group's mode
+ *
+ */
+static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdtgroup *rdtgrp;
+ enum rdtgrp_mode mode;
+ int ret = 0;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+ buf[nbytes - 1] = '\0';
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+
+ rdt_last_cmd_clear();
+
+ mode = rdtgrp->mode;
+
+ if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
+ (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
+ (!strcmp(buf, "pseudo-locksetup") &&
+ mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
+ (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
+ goto out;
+
+ if (mode == RDT_MODE_PSEUDO_LOCKED) {
+ rdt_last_cmd_printf("cannot change pseudo-locked group\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!strcmp(buf, "shareable")) {
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ ret = rdtgroup_locksetup_exit(rdtgrp);
+ if (ret)
+ goto out;
+ }
+ rdtgrp->mode = RDT_MODE_SHAREABLE;
+ } else if (!strcmp(buf, "exclusive")) {
+ if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ ret = rdtgroup_locksetup_exit(rdtgrp);
+ if (ret)
+ goto out;
+ }
+ rdtgrp->mode = RDT_MODE_EXCLUSIVE;
+ } else if (!strcmp(buf, "pseudo-locksetup")) {
+ ret = rdtgroup_locksetup_enter(rdtgrp);
+ if (ret)
+ goto out;
+ rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
+ } else {
+ rdt_last_cmd_printf("unknown/unsupported mode\n");
+ ret = -EINVAL;
+ }
+
+out:
+ rdtgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+}
+
+/**
+ * rdtgroup_cbm_to_size - Translate CBM to size in bytes
+ * @r: RDT resource to which @d belongs.
+ * @d: RDT domain instance.
+ * @cbm: bitmask for which the size should be computed.
+ *
+ * The bitmask provided associated with the RDT domain instance @d will be
+ * translated into how many bytes it represents. The size in bytes is
+ * computed by first dividing the total cache size by the CBM length to
+ * determine how many bytes each bit in the bitmask represents. The result
+ * is multiplied with the number of bits set in the bitmask.
+ *
+ * @cbm is unsigned long, even if only 32 bits are used to make the
+ * bitmap functions work correctly.
+ */
+unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
+ struct rdt_domain *d, unsigned long cbm)
+{
+ struct cpu_cacheinfo *ci;
+ unsigned int size = 0;
+ int num_b, i;
+
+ num_b = bitmap_weight(&cbm, r->cache.cbm_len);
+ ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
+ for (i = 0; i < ci->num_leaves; i++) {
+ if (ci->info_list[i].level == r->cache_level) {
+ size = ci->info_list[i].size / r->cache.cbm_len * num_b;
+ break;
+ }
+ }
+
+ return size;
+}
+
+/**
+ * rdtgroup_size_show - Display size in bytes of allocated regions
+ *
+ * The "size" file mirrors the layout of the "schemata" file, printing the
+ * size in bytes of each region instead of the capacity bitmask.
+ *
+ */
+static int rdtgroup_size_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ struct rdt_resource *r;
+ struct rdt_domain *d;
+ unsigned int size;
+ int ret = 0;
+ bool sep;
+ u32 ctrl;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+ if (!rdtgrp->plr->d) {
+ rdt_last_cmd_clear();
+ rdt_last_cmd_puts("Cache domain offline\n");
+ ret = -ENODEV;
+ } else {
+ seq_printf(s, "%*s:", max_name_width,
+ rdtgrp->plr->r->name);
+ size = rdtgroup_cbm_to_size(rdtgrp->plr->r,
+ rdtgrp->plr->d,
+ rdtgrp->plr->cbm);
+ seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size);
+ }
+ goto out;
+ }
+
+ for_each_alloc_enabled_rdt_resource(r) {
+ sep = false;
+ seq_printf(s, "%*s:", max_name_width, r->name);
+ list_for_each_entry(d, &r->domains, list) {
+ if (sep)
+ seq_putc(s, ';');
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ size = 0;
+ } else {
+ ctrl = (!is_mba_sc(r) ?
+ d->ctrl_val[rdtgrp->closid] :
+ d->mbps_val[rdtgrp->closid]);
+ if (r->rid == RDT_RESOURCE_MBA)
+ size = ctrl;
+ else
+ size = rdtgroup_cbm_to_size(r, d, ctrl);
+ }
+ seq_printf(s, "%d=%u", d->id, size);
+ sep = true;
+ }
+ seq_putc(s, '\n');
+ }
+
+out:
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+/* rdtgroup information files for one cache resource. */
+static struct rftype res_common_files[] = {
+ {
+ .name = "last_cmd_status",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_last_cmd_status_show,
+ .fflags = RF_TOP_INFO,
+ },
+ {
+ .name = "num_closids",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_num_closids_show,
+ .fflags = RF_CTRL_INFO,
+ },
+ {
+ .name = "mon_features",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_mon_features_show,
+ .fflags = RF_MON_INFO,
+ },
+ {
+ .name = "num_rmids",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_num_rmids_show,
+ .fflags = RF_MON_INFO,
+ },
+ {
+ .name = "cbm_mask",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_default_ctrl_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
+ },
+ {
+ .name = "min_cbm_bits",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_min_cbm_bits_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
+ },
+ {
+ .name = "shareable_bits",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_shareable_bits_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
+ },
+ {
+ .name = "bit_usage",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_bit_usage_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
+ },
+ {
+ .name = "min_bandwidth",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_min_bw_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
+ },
+ {
+ .name = "bandwidth_gran",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_bw_gran_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
+ },
+ {
+ .name = "delay_linear",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_delay_linear_show,
+ .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
+ },
+ {
+ .name = "max_threshold_occupancy",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = max_threshold_occ_write,
+ .seq_show = max_threshold_occ_show,
+ .fflags = RF_MON_INFO | RFTYPE_RES_CACHE,
+ },
+ {
+ .name = "cpus",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_cpus_write,
+ .seq_show = rdtgroup_cpus_show,
+ .fflags = RFTYPE_BASE,
+ },
+ {
+ .name = "cpus_list",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_cpus_write,
+ .seq_show = rdtgroup_cpus_show,
+ .flags = RFTYPE_FLAGS_CPUS_LIST,
+ .fflags = RFTYPE_BASE,
+ },
+ {
+ .name = "tasks",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_tasks_write,
+ .seq_show = rdtgroup_tasks_show,
+ .fflags = RFTYPE_BASE,
+ },
+ {
+ .name = "schemata",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_schemata_write,
+ .seq_show = rdtgroup_schemata_show,
+ .fflags = RF_CTRL_BASE,
+ },
+ {
+ .name = "mode",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_mode_write,
+ .seq_show = rdtgroup_mode_show,
+ .fflags = RF_CTRL_BASE,
+ },
+ {
+ .name = "size",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdtgroup_size_show,
+ .fflags = RF_CTRL_BASE,
+ },
+
+};
+
+static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
+{
+ struct rftype *rfts, *rft;
+ int ret, len;
+
+ rfts = res_common_files;
+ len = ARRAY_SIZE(res_common_files);
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ for (rft = rfts; rft < rfts + len; rft++) {
+ if ((fflags & rft->fflags) == rft->fflags) {
+ ret = rdtgroup_add_file(kn, rft);
+ if (ret)
+ goto error;
+ }
+ }
+
+ return 0;
+error:
+ pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
+ while (--rft >= rfts) {
+ if ((fflags & rft->fflags) == rft->fflags)
+ kernfs_remove_by_name(kn, rft->name);
+ }
+ return ret;
+}
+
+/**
+ * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
+ * @r: The resource group with which the file is associated.
+ * @name: Name of the file
+ *
+ * The permissions of named resctrl file, directory, or link are modified
+ * to not allow read, write, or execute by any user.
+ *
+ * WARNING: This function is intended to communicate to the user that the
+ * resctrl file has been locked down - that it is not relevant to the
+ * particular state the system finds itself in. It should not be relied
+ * on to protect from user access because after the file's permissions
+ * are restricted the user can still change the permissions using chmod
+ * from the command line.
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
+{
+ struct iattr iattr = {.ia_valid = ATTR_MODE,};
+ struct kernfs_node *kn;
+ int ret = 0;
+
+ kn = kernfs_find_and_get_ns(r->kn, name, NULL);
+ if (!kn)
+ return -ENOENT;
+
+ switch (kernfs_type(kn)) {
+ case KERNFS_DIR:
+ iattr.ia_mode = S_IFDIR;
+ break;
+ case KERNFS_FILE:
+ iattr.ia_mode = S_IFREG;
+ break;
+ case KERNFS_LINK:
+ iattr.ia_mode = S_IFLNK;
+ break;
+ }
+
+ ret = kernfs_setattr(kn, &iattr);
+ kernfs_put(kn);
+ return ret;
+}
+
+/**
+ * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
+ * @r: The resource group with which the file is associated.
+ * @name: Name of the file
+ * @mask: Mask of permissions that should be restored
+ *
+ * Restore the permissions of the named file. If @name is a directory the
+ * permissions of its parent will be used.
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
+ umode_t mask)
+{
+ struct iattr iattr = {.ia_valid = ATTR_MODE,};
+ struct kernfs_node *kn, *parent;
+ struct rftype *rfts, *rft;
+ int ret, len;
+
+ rfts = res_common_files;
+ len = ARRAY_SIZE(res_common_files);
+
+ for (rft = rfts; rft < rfts + len; rft++) {
+ if (!strcmp(rft->name, name))
+ iattr.ia_mode = rft->mode & mask;
+ }
+
+ kn = kernfs_find_and_get_ns(r->kn, name, NULL);
+ if (!kn)
+ return -ENOENT;
+
+ switch (kernfs_type(kn)) {
+ case KERNFS_DIR:
+ parent = kernfs_get_parent(kn);
+ if (parent) {
+ iattr.ia_mode |= parent->mode;
+ kernfs_put(parent);
+ }
+ iattr.ia_mode |= S_IFDIR;
+ break;
+ case KERNFS_FILE:
+ iattr.ia_mode |= S_IFREG;
+ break;
+ case KERNFS_LINK:
+ iattr.ia_mode |= S_IFLNK;
+ break;
+ }
+
+ ret = kernfs_setattr(kn, &iattr);
+ kernfs_put(kn);
+ return ret;
+}
+
+static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,
+ unsigned long fflags)
+{
+ struct kernfs_node *kn_subdir;
+ int ret;
+
+ kn_subdir = kernfs_create_dir(kn_info, name,
+ kn_info->mode, r);
+ if (IS_ERR(kn_subdir))
+ return PTR_ERR(kn_subdir);
+
+ ret = rdtgroup_kn_set_ugid(kn_subdir);
+ if (ret)
+ return ret;
+
+ ret = rdtgroup_add_files(kn_subdir, fflags);
+ if (!ret)
+ kernfs_activate(kn_subdir);
+
+ return ret;
+}
+
+static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
+{
+ struct rdt_resource *r;
+ unsigned long fflags;
+ char name[32];
+ int ret;
+
+ /* create the directory */
+ kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
+ if (IS_ERR(kn_info))
+ return PTR_ERR(kn_info);
+
+ ret = rdtgroup_add_files(kn_info, RF_TOP_INFO);
+ if (ret)
+ goto out_destroy;
+
+ for_each_alloc_enabled_rdt_resource(r) {
+ fflags = r->fflags | RF_CTRL_INFO;
+ ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags);
+ if (ret)
+ goto out_destroy;
+ }
+
+ for_each_mon_enabled_rdt_resource(r) {
+ fflags = r->fflags | RF_MON_INFO;
+ sprintf(name, "%s_MON", r->name);
+ ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
+ if (ret)
+ goto out_destroy;
+ }
+
+ ret = rdtgroup_kn_set_ugid(kn_info);
+ if (ret)
+ goto out_destroy;
+
+ kernfs_activate(kn_info);
+
+ return 0;
+
+out_destroy:
+ kernfs_remove(kn_info);
+ return ret;
+}
+
+static int
+mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
+ char *name, struct kernfs_node **dest_kn)
+{
+ struct kernfs_node *kn;
+ int ret;
+
+ /* create the directory */
+ kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ if (dest_kn)
+ *dest_kn = kn;
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret)
+ goto out_destroy;
+
+ kernfs_activate(kn);
+
+ return 0;
+
+out_destroy:
+ kernfs_remove(kn);
+ return ret;
+}
+
+static void l3_qos_cfg_update(void *arg)
+{
+ bool *enable = arg;
+
+ wrmsrl(IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
+}
+
+static void l2_qos_cfg_update(void *arg)
+{
+ bool *enable = arg;
+
+ wrmsrl(IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
+}
+
+static inline bool is_mba_linear(void)
+{
+ return rdt_resources_all[RDT_RESOURCE_MBA].membw.delay_linear;
+}
+
+static int set_cache_qos_cfg(int level, bool enable)
+{
+ void (*update)(void *arg);
+ struct rdt_resource *r_l;
+ cpumask_var_t cpu_mask;
+ struct rdt_domain *d;
+ int cpu;
+
+ if (level == RDT_RESOURCE_L3)
+ update = l3_qos_cfg_update;
+ else if (level == RDT_RESOURCE_L2)
+ update = l2_qos_cfg_update;
+ else
+ return -EINVAL;
+
+ if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ r_l = &rdt_resources_all[level];
+ list_for_each_entry(d, &r_l->domains, list) {
+ /* Pick one CPU from each domain instance to update MSR */
+ cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+ }
+ cpu = get_cpu();
+ /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */
+ if (cpumask_test_cpu(cpu, cpu_mask))
+ update(&enable);
+ /* Update QOS_CFG MSR on all other cpus in cpu_mask. */
+ smp_call_function_many(cpu_mask, update, &enable, 1);
+ put_cpu();
+
+ free_cpumask_var(cpu_mask);
+
+ return 0;
+}
+
+/* Restore the qos cfg state when a domain comes online */
+void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
+{
+ if (!r->alloc_capable)
+ return;
+
+ if (r == &rdt_resources_all[RDT_RESOURCE_L2DATA])
+ l2_qos_cfg_update(&r->alloc_enabled);
+
+ if (r == &rdt_resources_all[RDT_RESOURCE_L3DATA])
+ l3_qos_cfg_update(&r->alloc_enabled);
+}
+
+/*
+ * Enable or disable the MBA software controller
+ * which helps user specify bandwidth in MBps.
+ * MBA software controller is supported only if
+ * MBM is supported and MBA is in linear scale.
+ */
+static int set_mba_sc(bool mba_sc)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA];
+ struct rdt_domain *d;
+
+ if (!is_mbm_enabled() || !is_mba_linear() ||
+ mba_sc == is_mba_sc(r))
+ return -EINVAL;
+
+ r->membw.mba_sc = mba_sc;
+ list_for_each_entry(d, &r->domains, list)
+ setup_default_ctrlval(r, d->ctrl_val, d->mbps_val);
+
+ return 0;
+}
+
+static int cdp_enable(int level, int data_type, int code_type)
+{
+ struct rdt_resource *r_ldata = &rdt_resources_all[data_type];
+ struct rdt_resource *r_lcode = &rdt_resources_all[code_type];
+ struct rdt_resource *r_l = &rdt_resources_all[level];
+ int ret;
+
+ if (!r_l->alloc_capable || !r_ldata->alloc_capable ||
+ !r_lcode->alloc_capable)
+ return -EINVAL;
+
+ ret = set_cache_qos_cfg(level, true);
+ if (!ret) {
+ r_l->alloc_enabled = false;
+ r_ldata->alloc_enabled = true;
+ r_lcode->alloc_enabled = true;
+ }
+ return ret;
+}
+
+static int cdpl3_enable(void)
+{
+ return cdp_enable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA,
+ RDT_RESOURCE_L3CODE);
+}
+
+static int cdpl2_enable(void)
+{
+ return cdp_enable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA,
+ RDT_RESOURCE_L2CODE);
+}
+
+static void cdp_disable(int level, int data_type, int code_type)
+{
+ struct rdt_resource *r = &rdt_resources_all[level];
+
+ r->alloc_enabled = r->alloc_capable;
+
+ if (rdt_resources_all[data_type].alloc_enabled) {
+ rdt_resources_all[data_type].alloc_enabled = false;
+ rdt_resources_all[code_type].alloc_enabled = false;
+ set_cache_qos_cfg(level, false);
+ }
+}
+
+static void cdpl3_disable(void)
+{
+ cdp_disable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE);
+}
+
+static void cdpl2_disable(void)
+{
+ cdp_disable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, RDT_RESOURCE_L2CODE);
+}
+
+static void cdp_disable_all(void)
+{
+ if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
+ cdpl3_disable();
+ if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled)
+ cdpl2_disable();
+}
+
+static int parse_rdtgroupfs_options(char *data)
+{
+ char *token, *o = data;
+ int ret = 0;
+
+ while ((token = strsep(&o, ",")) != NULL) {
+ if (!*token) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!strcmp(token, "cdp")) {
+ ret = cdpl3_enable();
+ if (ret)
+ goto out;
+ } else if (!strcmp(token, "cdpl2")) {
+ ret = cdpl2_enable();
+ if (ret)
+ goto out;
+ } else if (!strcmp(token, "mba_MBps")) {
+ ret = set_mba_sc(true);
+ if (ret)
+ goto out;
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ return 0;
+
+out:
+ pr_err("Invalid mount option \"%s\"\n", token);
+
+ return ret;
+}
+
+/*
+ * We don't allow rdtgroup directories to be created anywhere
+ * except the root directory. Thus when looking for the rdtgroup
+ * structure for a kernfs node we are either looking at a directory,
+ * in which case the rdtgroup structure is pointed at by the "priv"
+ * field, otherwise we have a file, and need only look to the parent
+ * to find the rdtgroup.
+ */
+static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
+{
+ if (kernfs_type(kn) == KERNFS_DIR) {
+ /*
+ * All the resource directories use "kn->priv"
+ * to point to the "struct rdtgroup" for the
+ * resource. "info" and its subdirectories don't
+ * have rdtgroup structures, so return NULL here.
+ */
+ if (kn == kn_info || kn->parent == kn_info)
+ return NULL;
+ else
+ return kn->priv;
+ } else {
+ return kn->parent->priv;
+ }
+}
+
+struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
+{
+ struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
+
+ if (!rdtgrp)
+ return NULL;
+
+ atomic_inc(&rdtgrp->waitcount);
+ kernfs_break_active_protection(kn);
+
+ mutex_lock(&rdtgroup_mutex);
+
+ /* Was this group deleted while we waited? */
+ if (rdtgrp->flags & RDT_DELETED)
+ return NULL;
+
+ return rdtgrp;
+}
+
+void rdtgroup_kn_unlock(struct kernfs_node *kn)
+{
+ struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
+
+ if (!rdtgrp)
+ return;
+
+ mutex_unlock(&rdtgroup_mutex);
+
+ if (atomic_dec_and_test(&rdtgrp->waitcount) &&
+ (rdtgrp->flags & RDT_DELETED)) {
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
+ rdtgroup_pseudo_lock_remove(rdtgrp);
+ kernfs_unbreak_active_protection(kn);
+ rdtgroup_remove(rdtgrp);
+ } else {
+ kernfs_unbreak_active_protection(kn);
+ }
+}
+
+static int mkdir_mondata_all(struct kernfs_node *parent_kn,
+ struct rdtgroup *prgrp,
+ struct kernfs_node **mon_data_kn);
+
+static struct dentry *rdt_mount(struct file_system_type *fs_type,
+ int flags, const char *unused_dev_name,
+ void *data)
+{
+ struct rdt_domain *dom;
+ struct rdt_resource *r;
+ struct dentry *dentry;
+ int ret;
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+ /*
+ * resctrl file system can only be mounted once.
+ */
+ if (static_branch_unlikely(&rdt_enable_key)) {
+ dentry = ERR_PTR(-EBUSY);
+ goto out;
+ }
+
+ ret = parse_rdtgroupfs_options(data);
+ if (ret) {
+ dentry = ERR_PTR(ret);
+ goto out_cdp;
+ }
+
+ closid_init();
+
+ ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
+ if (ret) {
+ dentry = ERR_PTR(ret);
+ goto out_cdp;
+ }
+
+ if (rdt_mon_capable) {
+ ret = mongroup_create_dir(rdtgroup_default.kn,
+ &rdtgroup_default, "mon_groups",
+ &kn_mongrp);
+ if (ret) {
+ dentry = ERR_PTR(ret);
+ goto out_info;
+ }
+
+ ret = mkdir_mondata_all(rdtgroup_default.kn,
+ &rdtgroup_default, &kn_mondata);
+ if (ret) {
+ dentry = ERR_PTR(ret);
+ goto out_mongrp;
+ }
+ rdtgroup_default.mon.mon_data_kn = kn_mondata;
+ }
+
+ ret = rdt_pseudo_lock_init();
+ if (ret) {
+ dentry = ERR_PTR(ret);
+ goto out_mondata;
+ }
+
+ dentry = kernfs_mount(fs_type, flags, rdt_root,
+ RDTGROUP_SUPER_MAGIC, NULL);
+ if (IS_ERR(dentry))
+ goto out_psl;
+
+ if (rdt_alloc_capable)
+ static_branch_enable_cpuslocked(&rdt_alloc_enable_key);
+ if (rdt_mon_capable)
+ static_branch_enable_cpuslocked(&rdt_mon_enable_key);
+
+ if (rdt_alloc_capable || rdt_mon_capable)
+ static_branch_enable_cpuslocked(&rdt_enable_key);
+
+ if (is_mbm_enabled()) {
+ r = &rdt_resources_all[RDT_RESOURCE_L3];
+ list_for_each_entry(dom, &r->domains, list)
+ mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL);
+ }
+
+ goto out;
+
+out_psl:
+ rdt_pseudo_lock_release();
+out_mondata:
+ if (rdt_mon_capable)
+ kernfs_remove(kn_mondata);
+out_mongrp:
+ if (rdt_mon_capable)
+ kernfs_remove(kn_mongrp);
+out_info:
+ kernfs_remove(kn_info);
+out_cdp:
+ cdp_disable_all();
+out:
+ rdt_last_cmd_clear();
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+
+ return dentry;
+}
+
+static int reset_all_ctrls(struct rdt_resource *r)
+{
+ struct msr_param msr_param;
+ cpumask_var_t cpu_mask;
+ struct rdt_domain *d;
+ int i, cpu;
+
+ if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ msr_param.res = r;
+ msr_param.low = 0;
+ msr_param.high = r->num_closid;
+
+ /*
+ * Disable resource control for this resource by setting all
+ * CBMs in all domains to the maximum mask value. Pick one CPU
+ * from each domain to update the MSRs below.
+ */
+ list_for_each_entry(d, &r->domains, list) {
+ cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+
+ for (i = 0; i < r->num_closid; i++)
+ d->ctrl_val[i] = r->default_ctrl;
+ }
+ cpu = get_cpu();
+ /* Update CBM on this cpu if it's in cpu_mask. */
+ if (cpumask_test_cpu(cpu, cpu_mask))
+ rdt_ctrl_update(&msr_param);
+ /* Update CBM on all other cpus in cpu_mask. */
+ smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
+ put_cpu();
+
+ free_cpumask_var(cpu_mask);
+
+ return 0;
+}
+
+static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
+{
+ return (rdt_alloc_capable &&
+ (r->type == RDTCTRL_GROUP) && (t->closid == r->closid));
+}
+
+static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
+{
+ return (rdt_mon_capable &&
+ (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid));
+}
+
+/*
+ * Move tasks from one to the other group. If @from is NULL, then all tasks
+ * in the systems are moved unconditionally (used for teardown).
+ *
+ * If @mask is not NULL the cpus on which moved tasks are running are set
+ * in that mask so the update smp function call is restricted to affected
+ * cpus.
+ */
+static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
+ struct cpumask *mask)
+{
+ struct task_struct *p, *t;
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(p, t) {
+ if (!from || is_closid_match(t, from) ||
+ is_rmid_match(t, from)) {
+ t->closid = to->closid;
+ t->rmid = to->mon.rmid;
+
+#ifdef CONFIG_SMP
+ /*
+ * This is safe on x86 w/o barriers as the ordering
+ * of writing to task_cpu() and t->on_cpu is
+ * reverse to the reading here. The detection is
+ * inaccurate as tasks might move or schedule
+ * before the smp function call takes place. In
+ * such a case the function call is pointless, but
+ * there is no other side effect.
+ */
+ if (mask && t->on_cpu)
+ cpumask_set_cpu(task_cpu(t), mask);
+#endif
+ }
+ }
+ read_unlock(&tasklist_lock);
+}
+
+static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
+{
+ struct rdtgroup *sentry, *stmp;
+ struct list_head *head;
+
+ head = &rdtgrp->mon.crdtgrp_list;
+ list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
+ free_rmid(sentry->mon.rmid);
+ list_del(&sentry->mon.crdtgrp_list);
+
+ if (atomic_read(&sentry->waitcount) != 0)
+ sentry->flags = RDT_DELETED;
+ else
+ rdtgroup_remove(sentry);
+ }
+}
+
+/*
+ * Forcibly remove all of subdirectories under root.
+ */
+static void rmdir_all_sub(void)
+{
+ struct rdtgroup *rdtgrp, *tmp;
+
+ /* Move all tasks to the default resource group */
+ rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
+
+ list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
+ /* Free any child rmids */
+ free_all_child_rdtgrp(rdtgrp);
+
+ /* Remove each rdtgroup other than root */
+ if (rdtgrp == &rdtgroup_default)
+ continue;
+
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
+ rdtgroup_pseudo_lock_remove(rdtgrp);
+
+ /*
+ * Give any CPUs back to the default group. We cannot copy
+ * cpu_online_mask because a CPU might have executed the
+ * offline callback already, but is still marked online.
+ */
+ cpumask_or(&rdtgroup_default.cpu_mask,
+ &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+
+ free_rmid(rdtgrp->mon.rmid);
+
+ kernfs_remove(rdtgrp->kn);
+ list_del(&rdtgrp->rdtgroup_list);
+
+ if (atomic_read(&rdtgrp->waitcount) != 0)
+ rdtgrp->flags = RDT_DELETED;
+ else
+ rdtgroup_remove(rdtgrp);
+ }
+ /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
+ update_closid_rmid(cpu_online_mask, &rdtgroup_default);
+
+ kernfs_remove(kn_info);
+ kernfs_remove(kn_mongrp);
+ kernfs_remove(kn_mondata);
+}
+
+static void rdt_kill_sb(struct super_block *sb)
+{
+ struct rdt_resource *r;
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ set_mba_sc(false);
+
+ /*Put everything back to default values. */
+ for_each_alloc_enabled_rdt_resource(r)
+ reset_all_ctrls(r);
+ cdp_disable_all();
+ rmdir_all_sub();
+ rdt_pseudo_lock_release();
+ rdtgroup_default.mode = RDT_MODE_SHAREABLE;
+ static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
+ static_branch_disable_cpuslocked(&rdt_mon_enable_key);
+ static_branch_disable_cpuslocked(&rdt_enable_key);
+ kernfs_kill_sb(sb);
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+}
+
+static struct file_system_type rdt_fs_type = {
+ .name = "resctrl",
+ .mount = rdt_mount,
+ .kill_sb = rdt_kill_sb,
+};
+
+static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
+ void *priv)
+{
+ struct kernfs_node *kn;
+ int ret = 0;
+
+ kn = __kernfs_create_file(parent_kn, name, 0444,
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
+ &kf_mondata_ops, priv, NULL, NULL);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret) {
+ kernfs_remove(kn);
+ return ret;
+ }
+
+ return ret;
+}
+
+/*
+ * Remove all subdirectories of mon_data of ctrl_mon groups
+ * and monitor groups with given domain id.
+ */
+void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id)
+{
+ struct rdtgroup *prgrp, *crgrp;
+ char name[32];
+
+ if (!r->mon_enabled)
+ return;
+
+ list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+ sprintf(name, "mon_%s_%02d", r->name, dom_id);
+ kernfs_remove_by_name(prgrp->mon.mon_data_kn, name);
+
+ list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
+ kernfs_remove_by_name(crgrp->mon.mon_data_kn, name);
+ }
+}
+
+static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
+ struct rdt_domain *d,
+ struct rdt_resource *r, struct rdtgroup *prgrp)
+{
+ union mon_data_bits priv;
+ struct kernfs_node *kn;
+ struct mon_evt *mevt;
+ struct rmid_read rr;
+ char name[32];
+ int ret;
+
+ sprintf(name, "mon_%s_%02d", r->name, d->id);
+ /* create the directory */
+ kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret)
+ goto out_destroy;
+
+ if (WARN_ON(list_empty(&r->evt_list))) {
+ ret = -EPERM;
+ goto out_destroy;
+ }
+
+ priv.u.rid = r->rid;
+ priv.u.domid = d->id;
+ list_for_each_entry(mevt, &r->evt_list, list) {
+ priv.u.evtid = mevt->evtid;
+ ret = mon_addfile(kn, mevt->name, priv.priv);
+ if (ret)
+ goto out_destroy;
+
+ if (is_mbm_event(mevt->evtid))
+ mon_event_read(&rr, d, prgrp, mevt->evtid, true);
+ }
+ kernfs_activate(kn);
+ return 0;
+
+out_destroy:
+ kernfs_remove(kn);
+ return ret;
+}
+
+/*
+ * Add all subdirectories of mon_data for "ctrl_mon" groups
+ * and "monitor" groups with given domain id.
+ */
+void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+ struct rdt_domain *d)
+{
+ struct kernfs_node *parent_kn;
+ struct rdtgroup *prgrp, *crgrp;
+ struct list_head *head;
+
+ if (!r->mon_enabled)
+ return;
+
+ list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+ parent_kn = prgrp->mon.mon_data_kn;
+ mkdir_mondata_subdir(parent_kn, d, r, prgrp);
+
+ head = &prgrp->mon.crdtgrp_list;
+ list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
+ parent_kn = crgrp->mon.mon_data_kn;
+ mkdir_mondata_subdir(parent_kn, d, r, crgrp);
+ }
+ }
+}
+
+static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
+ struct rdt_resource *r,
+ struct rdtgroup *prgrp)
+{
+ struct rdt_domain *dom;
+ int ret;
+
+ list_for_each_entry(dom, &r->domains, list) {
+ ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * This creates a directory mon_data which contains the monitored data.
+ *
+ * mon_data has one directory for each domain whic are named
+ * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data
+ * with L3 domain looks as below:
+ * ./mon_data:
+ * mon_L3_00
+ * mon_L3_01
+ * mon_L3_02
+ * ...
+ *
+ * Each domain directory has one file per event:
+ * ./mon_L3_00/:
+ * llc_occupancy
+ *
+ */
+static int mkdir_mondata_all(struct kernfs_node *parent_kn,
+ struct rdtgroup *prgrp,
+ struct kernfs_node **dest_kn)
+{
+ struct rdt_resource *r;
+ struct kernfs_node *kn;
+ int ret;
+
+ /*
+ * Create the mon_data directory first.
+ */
+ ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn);
+ if (ret)
+ return ret;
+
+ if (dest_kn)
+ *dest_kn = kn;
+
+ /*
+ * Create the subdirectories for each domain. Note that all events
+ * in a domain like L3 are grouped into a resource whose domain is L3
+ */
+ for_each_mon_enabled_rdt_resource(r) {
+ ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
+ if (ret)
+ goto out_destroy;
+ }
+
+ return 0;
+
+out_destroy:
+ kernfs_remove(kn);
+ return ret;
+}
+
+/**
+ * cbm_ensure_valid - Enforce validity on provided CBM
+ * @_val: Candidate CBM
+ * @r: RDT resource to which the CBM belongs
+ *
+ * The provided CBM represents all cache portions available for use. This
+ * may be represented by a bitmap that does not consist of contiguous ones
+ * and thus be an invalid CBM.
+ * Here the provided CBM is forced to be a valid CBM by only considering
+ * the first set of contiguous bits as valid and clearing all bits.
+ * The intention here is to provide a valid default CBM with which a new
+ * resource group is initialized. The user can follow this with a
+ * modification to the CBM if the default does not satisfy the
+ * requirements.
+ */
+static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r)
+{
+ unsigned long val = *_val;
+ unsigned int cbm_len = r->cache.cbm_len;
+ unsigned long first_bit, zero_bit;
+
+ if (val == 0)
+ return;
+
+ first_bit = find_first_bit(&val, cbm_len);
+ zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
+
+ /* Clear any remaining bits to ensure contiguous region */
+ bitmap_clear(&val, zero_bit, cbm_len - zero_bit);
+ *_val = (u32)val;
+}
+
+/**
+ * rdtgroup_init_alloc - Initialize the new RDT group's allocations
+ *
+ * A new RDT group is being created on an allocation capable (CAT)
+ * supporting system. Set this group up to start off with all usable
+ * allocations. That is, all shareable and unused bits.
+ *
+ * All-zero CBM is invalid. If there are no more shareable bits available
+ * on any domain then the entire allocation will fail.
+ */
+static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
+{
+ u32 used_b = 0, unused_b = 0;
+ u32 closid = rdtgrp->closid;
+ struct rdt_resource *r;
+ unsigned long tmp_cbm;
+ enum rdtgrp_mode mode;
+ struct rdt_domain *d;
+ int i, ret;
+ u32 *ctrl;
+
+ for_each_alloc_enabled_rdt_resource(r) {
+ /*
+ * Only initialize default allocations for CBM cache
+ * resources
+ */
+ if (r->rid == RDT_RESOURCE_MBA)
+ continue;
+ list_for_each_entry(d, &r->domains, list) {
+ d->have_new_ctrl = false;
+ d->new_ctrl = r->cache.shareable_bits;
+ used_b = r->cache.shareable_bits;
+ ctrl = d->ctrl_val;
+ for (i = 0; i < closids_supported(); i++, ctrl++) {
+ if (closid_allocated(i) && i != closid) {
+ mode = rdtgroup_mode_by_closid(i);
+ if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
+ continue;
+ used_b |= *ctrl;
+ if (mode == RDT_MODE_SHAREABLE)
+ d->new_ctrl |= *ctrl;
+ }
+ }
+ if (d->plr && d->plr->cbm > 0)
+ used_b |= d->plr->cbm;
+ unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
+ unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
+ d->new_ctrl |= unused_b;
+ /*
+ * Force the initial CBM to be valid, user can
+ * modify the CBM based on system availability.
+ */
+ cbm_ensure_valid(&d->new_ctrl, r);
+ /*
+ * Assign the u32 CBM to an unsigned long to ensure
+ * that bitmap_weight() does not access out-of-bound
+ * memory.
+ */
+ tmp_cbm = d->new_ctrl;
+ if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) <
+ r->cache.min_cbm_bits) {
+ rdt_last_cmd_printf("no space on %s:%d\n",
+ r->name, d->id);
+ return -ENOSPC;
+ }
+ d->have_new_ctrl = true;
+ }
+ }
+
+ for_each_alloc_enabled_rdt_resource(r) {
+ /*
+ * Only initialize default allocations for CBM cache
+ * resources
+ */
+ if (r->rid == RDT_RESOURCE_MBA)
+ continue;
+ ret = update_domains(r, rdtgrp->closid);
+ if (ret < 0) {
+ rdt_last_cmd_puts("failed to initialize allocations\n");
+ return ret;
+ }
+ rdtgrp->mode = RDT_MODE_SHAREABLE;
+ }
+
+ return 0;
+}
+
+static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
+ struct kernfs_node *prgrp_kn,
+ const char *name, umode_t mode,
+ enum rdt_group_type rtype, struct rdtgroup **r)
+{
+ struct rdtgroup *prdtgrp, *rdtgrp;
+ struct kernfs_node *kn;
+ uint files = 0;
+ int ret;
+
+ prdtgrp = rdtgroup_kn_lock_live(parent_kn);
+ rdt_last_cmd_clear();
+ if (!prdtgrp) {
+ ret = -ENODEV;
+ rdt_last_cmd_puts("directory was removed\n");
+ goto out_unlock;
+ }
+
+ if (rtype == RDTMON_GROUP &&
+ (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+ prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
+ ret = -EINVAL;
+ rdt_last_cmd_puts("pseudo-locking in progress\n");
+ goto out_unlock;
+ }
+
+ /* allocate the rdtgroup. */
+ rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
+ if (!rdtgrp) {
+ ret = -ENOSPC;
+ rdt_last_cmd_puts("kernel out of memory\n");
+ goto out_unlock;
+ }
+ *r = rdtgrp;
+ rdtgrp->mon.parent = prdtgrp;
+ rdtgrp->type = rtype;
+ INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
+
+ /* kernfs creates the directory for rdtgrp */
+ kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
+ if (IS_ERR(kn)) {
+ ret = PTR_ERR(kn);
+ rdt_last_cmd_puts("kernfs create error\n");
+ goto out_free_rgrp;
+ }
+ rdtgrp->kn = kn;
+
+ /*
+ * kernfs_remove() will drop the reference count on "kn" which
+ * will free it. But we still need it to stick around for the
+ * rdtgroup_kn_unlock(kn) call. Take one extra reference here,
+ * which will be dropped by kernfs_put() in rdtgroup_remove().
+ */
+ kernfs_get(kn);
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret) {
+ rdt_last_cmd_puts("kernfs perm error\n");
+ goto out_destroy;
+ }
+
+ files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype);
+ ret = rdtgroup_add_files(kn, files);
+ if (ret) {
+ rdt_last_cmd_puts("kernfs fill error\n");
+ goto out_destroy;
+ }
+
+ if (rdt_mon_capable) {
+ ret = alloc_rmid();
+ if (ret < 0) {
+ rdt_last_cmd_puts("out of RMIDs\n");
+ goto out_destroy;
+ }
+ rdtgrp->mon.rmid = ret;
+
+ ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
+ if (ret) {
+ rdt_last_cmd_puts("kernfs subdir error\n");
+ goto out_idfree;
+ }
+ }
+ kernfs_activate(kn);
+
+ /*
+ * The caller unlocks the parent_kn upon success.
+ */
+ return 0;
+
+out_idfree:
+ free_rmid(rdtgrp->mon.rmid);
+out_destroy:
+ kernfs_put(rdtgrp->kn);
+ kernfs_remove(rdtgrp->kn);
+out_free_rgrp:
+ kfree(rdtgrp);
+out_unlock:
+ rdtgroup_kn_unlock(parent_kn);
+ return ret;
+}
+
+static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
+{
+ kernfs_remove(rgrp->kn);
+ free_rmid(rgrp->mon.rmid);
+ rdtgroup_remove(rgrp);
+}
+
+/*
+ * Create a monitor group under "mon_groups" directory of a control
+ * and monitor group(ctrl_mon). This is a resource group
+ * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
+ */
+static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
+ struct kernfs_node *prgrp_kn,
+ const char *name,
+ umode_t mode)
+{
+ struct rdtgroup *rdtgrp, *prgrp;
+ int ret;
+
+ ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP,
+ &rdtgrp);
+ if (ret)
+ return ret;
+
+ prgrp = rdtgrp->mon.parent;
+ rdtgrp->closid = prgrp->closid;
+
+ /*
+ * Add the rdtgrp to the list of rdtgrps the parent
+ * ctrl_mon group has to track.
+ */
+ list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
+
+ rdtgroup_kn_unlock(parent_kn);
+ return ret;
+}
+
+/*
+ * These are rdtgroups created under the root directory. Can be used
+ * to allocate and monitor resources.
+ */
+static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
+ struct kernfs_node *prgrp_kn,
+ const char *name, umode_t mode)
+{
+ struct rdtgroup *rdtgrp;
+ struct kernfs_node *kn;
+ u32 closid;
+ int ret;
+
+ ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP,
+ &rdtgrp);
+ if (ret)
+ return ret;
+
+ kn = rdtgrp->kn;
+ ret = closid_alloc();
+ if (ret < 0) {
+ rdt_last_cmd_puts("out of CLOSIDs\n");
+ goto out_common_fail;
+ }
+ closid = ret;
+ ret = 0;
+
+ rdtgrp->closid = closid;
+ ret = rdtgroup_init_alloc(rdtgrp);
+ if (ret < 0)
+ goto out_id_free;
+
+ list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
+
+ if (rdt_mon_capable) {
+ /*
+ * Create an empty mon_groups directory to hold the subset
+ * of tasks and cpus to monitor.
+ */
+ ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL);
+ if (ret) {
+ rdt_last_cmd_puts("kernfs subdir error\n");
+ goto out_del_list;
+ }
+ }
+
+ goto out_unlock;
+
+out_del_list:
+ list_del(&rdtgrp->rdtgroup_list);
+out_id_free:
+ closid_free(closid);
+out_common_fail:
+ mkdir_rdt_prepare_clean(rdtgrp);
+out_unlock:
+ rdtgroup_kn_unlock(parent_kn);
+ return ret;
+}
+
+/*
+ * We allow creating mon groups only with in a directory called "mon_groups"
+ * which is present in every ctrl_mon group. Check if this is a valid
+ * "mon_groups" directory.
+ *
+ * 1. The directory should be named "mon_groups".
+ * 2. The mon group itself should "not" be named "mon_groups".
+ * This makes sure "mon_groups" directory always has a ctrl_mon group
+ * as parent.
+ */
+static bool is_mon_groups(struct kernfs_node *kn, const char *name)
+{
+ return (!strcmp(kn->name, "mon_groups") &&
+ strcmp(name, "mon_groups"));
+}
+
+static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+ umode_t mode)
+{
+ /* Do not accept '\n' to avoid unparsable situation. */
+ if (strchr(name, '\n'))
+ return -EINVAL;
+
+ /*
+ * If the parent directory is the root directory and RDT
+ * allocation is supported, add a control and monitoring
+ * subdirectory
+ */
+ if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn)
+ return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode);
+
+ /*
+ * If RDT monitoring is supported and the parent directory is a valid
+ * "mon_groups" directory, add a monitoring subdirectory.
+ */
+ if (rdt_mon_capable && is_mon_groups(parent_kn, name))
+ return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode);
+
+ return -EPERM;
+}
+
+static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
+ cpumask_var_t tmpmask)
+{
+ struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
+ int cpu;
+
+ /* Give any tasks back to the parent group */
+ rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
+
+ /* Update per cpu rmid of the moved CPUs first */
+ for_each_cpu(cpu, &rdtgrp->cpu_mask)
+ per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid;
+ /*
+ * Update the MSR on moved CPUs and CPUs which have moved
+ * task running on them.
+ */
+ cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
+ update_closid_rmid(tmpmask, NULL);
+
+ rdtgrp->flags = RDT_DELETED;
+ free_rmid(rdtgrp->mon.rmid);
+
+ /*
+ * Remove the rdtgrp from the parent ctrl_mon group's list
+ */
+ WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
+ list_del(&rdtgrp->mon.crdtgrp_list);
+
+ kernfs_remove(rdtgrp->kn);
+
+ return 0;
+}
+
+static int rdtgroup_ctrl_remove(struct kernfs_node *kn,
+ struct rdtgroup *rdtgrp)
+{
+ rdtgrp->flags = RDT_DELETED;
+ list_del(&rdtgrp->rdtgroup_list);
+
+ kernfs_remove(rdtgrp->kn);
+ return 0;
+}
+
+static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
+ cpumask_var_t tmpmask)
+{
+ int cpu;
+
+ /* Give any tasks back to the default group */
+ rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
+
+ /* Give any CPUs back to the default group */
+ cpumask_or(&rdtgroup_default.cpu_mask,
+ &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+
+ /* Update per cpu closid and rmid of the moved CPUs first */
+ for_each_cpu(cpu, &rdtgrp->cpu_mask) {
+ per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid;
+ per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid;
+ }
+
+ /*
+ * Update the MSR on moved CPUs and CPUs which have moved
+ * task running on them.
+ */
+ cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
+ update_closid_rmid(tmpmask, NULL);
+
+ closid_free(rdtgrp->closid);
+ free_rmid(rdtgrp->mon.rmid);
+
+ rdtgroup_ctrl_remove(kn, rdtgrp);
+
+ /*
+ * Free all the child monitor group rmids.
+ */
+ free_all_child_rdtgrp(rdtgrp);
+
+ return 0;
+}
+
+static int rdtgroup_rmdir(struct kernfs_node *kn)
+{
+ struct kernfs_node *parent_kn = kn->parent;
+ struct rdtgroup *rdtgrp;
+ cpumask_var_t tmpmask;
+ int ret = 0;
+
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ rdtgrp = rdtgroup_kn_lock_live(kn);
+ if (!rdtgrp) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ /*
+ * If the rdtgroup is a ctrl_mon group and parent directory
+ * is the root directory, remove the ctrl_mon group.
+ *
+ * If the rdtgroup is a mon group and parent directory
+ * is a valid "mon_groups" directory, remove the mon group.
+ */
+ if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
+ rdtgrp != &rdtgroup_default) {
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+ ret = rdtgroup_ctrl_remove(kn, rdtgrp);
+ } else {
+ ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask);
+ }
+ } else if (rdtgrp->type == RDTMON_GROUP &&
+ is_mon_groups(parent_kn, kn->name)) {
+ ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask);
+ } else {
+ ret = -EPERM;
+ }
+
+out:
+ rdtgroup_kn_unlock(kn);
+ free_cpumask_var(tmpmask);
+ return ret;
+}
+
+static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
+{
+ if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
+ seq_puts(seq, ",cdp");
+
+ if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled)
+ seq_puts(seq, ",cdpl2");
+
+ if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA]))
+ seq_puts(seq, ",mba_MBps");
+
+ return 0;
+}
+
+static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
+ .mkdir = rdtgroup_mkdir,
+ .rmdir = rdtgroup_rmdir,
+ .show_options = rdtgroup_show_options,
+};
+
+static int __init rdtgroup_setup_root(void)
+{
+ int ret;
+
+ rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
+ KERNFS_ROOT_CREATE_DEACTIVATED |
+ KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
+ &rdtgroup_default);
+ if (IS_ERR(rdt_root))
+ return PTR_ERR(rdt_root);
+
+ mutex_lock(&rdtgroup_mutex);
+
+ rdtgroup_default.closid = 0;
+ rdtgroup_default.mon.rmid = 0;
+ rdtgroup_default.type = RDTCTRL_GROUP;
+ INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
+
+ list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
+
+ ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE);
+ if (ret) {
+ kernfs_destroy_root(rdt_root);
+ goto out;
+ }
+
+ rdtgroup_default.kn = rdt_root->kn;
+ kernfs_activate(rdtgroup_default.kn);
+
+out:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return ret;
+}
+
+/*
+ * rdtgroup_init - rdtgroup initialization
+ *
+ * Setup resctrl file system including set up root, create mount point,
+ * register rdtgroup filesystem, and initialize files under root directory.
+ *
+ * Return: 0 on success or -errno
+ */
+int __init rdtgroup_init(void)
+{
+ int ret = 0;
+
+ seq_buf_init(&last_cmd_status, last_cmd_status_buf,
+ sizeof(last_cmd_status_buf));
+
+ ret = rdtgroup_setup_root();
+ if (ret)
+ return ret;
+
+ ret = sysfs_create_mount_point(fs_kobj, "resctrl");
+ if (ret)
+ goto cleanup_root;
+
+ ret = register_filesystem(&rdt_fs_type);
+ if (ret)
+ goto cleanup_mountpoint;
+
+ /*
+ * Adding the resctrl debugfs directory here may not be ideal since
+ * it would let the resctrl debugfs directory appear on the debugfs
+ * filesystem before the resctrl filesystem is mounted.
+ * It may also be ok since that would enable debugging of RDT before
+ * resctrl is mounted.
+ * The reason why the debugfs directory is created here and not in
+ * rdt_mount() is because rdt_mount() takes rdtgroup_mutex and
+ * during the debugfs directory creation also &sb->s_type->i_mutex_key
+ * (the lockdep class of inode->i_rwsem). Other filesystem
+ * interactions (eg. SyS_getdents) have the lock ordering:
+ * &sb->s_type->i_mutex_key --> &mm->mmap_sem
+ * During mmap(), called with &mm->mmap_sem, the rdtgroup_mutex
+ * is taken, thus creating dependency:
+ * &mm->mmap_sem --> rdtgroup_mutex for the latter that can cause
+ * issues considering the other two lock dependencies.
+ * By creating the debugfs directory here we avoid a dependency
+ * that may cause deadlock (even though file operations cannot
+ * occur until the filesystem is mounted, but I do not know how to
+ * tell lockdep that).
+ */
+ debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
+
+ return 0;
+
+cleanup_mountpoint:
+ sysfs_remove_mount_point(fs_kobj, "resctrl");
+cleanup_root:
+ kernfs_destroy_root(rdt_root);
+
+ return ret;
+}
+
+void __exit rdtgroup_exit(void)
+{
+ debugfs_remove_recursive(debugfs_resctrl);
+ unregister_filesystem(&rdt_fs_type);
+ sysfs_remove_mount_point(fs_kobj, "resctrl");
+ kernfs_destroy_root(rdt_root);
+}
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
new file mode 100644
index 000000000..751e59057
--- /dev/null
+++ b/arch/x86/kernel/cpu/match.c
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/cpu_device_id.h>
+#include <asm/cpufeature.h>
+#include <linux/cpu.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+
+/**
+ * x86_match_cpu - match current CPU again an array of x86_cpu_ids
+ * @match: Pointer to array of x86_cpu_ids. Last entry terminated with
+ * {}.
+ *
+ * Return the entry if the current CPU matches the entries in the
+ * passed x86_cpu_id match table. Otherwise NULL. The match table
+ * contains vendor (X86_VENDOR_*), family, model and feature bits or
+ * respective wildcard entries.
+ *
+ * A typical table entry would be to match a specific CPU
+ * { X86_VENDOR_INTEL, 6, 0x12 }
+ * or to match a specific CPU feature
+ * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) }
+ *
+ * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
+ * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
+ *
+ * Arrays used to match for this should also be declared using
+ * MODULE_DEVICE_TABLE(x86cpu, ...)
+ *
+ * This always matches against the boot cpu, assuming models and features are
+ * consistent over all CPUs.
+ */
+const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
+{
+ const struct x86_cpu_id *m;
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ for (m = match;
+ m->vendor | m->family | m->model | m->steppings | m->feature;
+ m++) {
+ if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor)
+ continue;
+ if (m->family != X86_FAMILY_ANY && c->x86 != m->family)
+ continue;
+ if (m->model != X86_MODEL_ANY && c->x86_model != m->model)
+ continue;
+ if (m->steppings != X86_STEPPING_ANY &&
+ !(BIT(c->x86_stepping) & m->steppings))
+ continue;
+ if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature))
+ continue;
+ return m;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(x86_match_cpu);
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
new file mode 100644
index 000000000..bcc7c54c7
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y = mce.o mce-severity.o mce-genpool.o
+
+obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
+obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
+obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
+obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
+obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
+
+obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
+
+obj-$(CONFIG_ACPI_APEI) += mce-apei.o
+
+obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o
diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
new file mode 100644
index 000000000..97685a0c3
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
@@ -0,0 +1,363 @@
+/*
+ * /dev/mcelog driver
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+
+#include "mce-internal.h"
+
+static BLOCKING_NOTIFIER_HEAD(mce_injector_chain);
+
+static DEFINE_MUTEX(mce_chrdev_read_mutex);
+
+static char mce_helper[128];
+static char *mce_helper_argv[2] = { mce_helper, NULL };
+
+/*
+ * Lockless MCE logging infrastructure.
+ * This avoids deadlocks on printk locks without having to break locks. Also
+ * separate MCEs from kernel messages to avoid bogus bug reports.
+ */
+
+static struct mce_log_buffer mcelog = {
+ .signature = MCE_LOG_SIGNATURE,
+ .len = MCE_LOG_LEN,
+ .recordlen = sizeof(struct mce),
+};
+
+static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+
+/* User mode helper program triggered by machine check event */
+extern char mce_helper[128];
+
+static int dev_mce_log(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *mce = (struct mce *)data;
+ unsigned int entry;
+
+ mutex_lock(&mce_chrdev_read_mutex);
+
+ entry = mcelog.next;
+
+ /*
+ * When the buffer fills up discard new entries. Assume that the
+ * earlier errors are the more interesting ones:
+ */
+ if (entry >= MCE_LOG_LEN) {
+ set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
+ goto unlock;
+ }
+
+ mcelog.next = entry + 1;
+
+ memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+ mcelog.entry[entry].finished = 1;
+
+ /* wake processes polling /dev/mcelog */
+ wake_up_interruptible(&mce_chrdev_wait);
+
+unlock:
+ mutex_unlock(&mce_chrdev_read_mutex);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block dev_mcelog_nb = {
+ .notifier_call = dev_mce_log,
+ .priority = MCE_PRIO_MCELOG,
+};
+
+static void mce_do_trigger(struct work_struct *work)
+{
+ call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
+}
+
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
+
+void mce_work_trigger(void)
+{
+ if (mce_helper[0])
+ schedule_work(&mce_trigger_work);
+}
+
+static ssize_t
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
+{
+ strcpy(buf, mce_helper);
+ strcat(buf, "\n");
+ return strlen(mce_helper) + 1;
+}
+
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
+ const char *buf, size_t siz)
+{
+ char *p;
+
+ strncpy(mce_helper, buf, sizeof(mce_helper));
+ mce_helper[sizeof(mce_helper)-1] = 0;
+ p = strchr(mce_helper, '\n');
+
+ if (p)
+ *p = 0;
+
+ return strlen(mce_helper) + !!p;
+}
+
+DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+
+/*
+ * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
+ */
+
+static DEFINE_SPINLOCK(mce_chrdev_state_lock);
+static int mce_chrdev_open_count; /* #times opened */
+static int mce_chrdev_open_exclu; /* already open exclusive? */
+
+static int mce_chrdev_open(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_chrdev_state_lock);
+
+ if (mce_chrdev_open_exclu ||
+ (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return -EBUSY;
+ }
+
+ if (file->f_flags & O_EXCL)
+ mce_chrdev_open_exclu = 1;
+ mce_chrdev_open_count++;
+
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return nonseekable_open(inode, file);
+}
+
+static int mce_chrdev_release(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_chrdev_state_lock);
+
+ mce_chrdev_open_count--;
+ mce_chrdev_open_exclu = 0;
+
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return 0;
+}
+
+static int mce_apei_read_done;
+
+/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
+static int __mce_read_apei(char __user **ubuf, size_t usize)
+{
+ int rc;
+ u64 record_id;
+ struct mce m;
+
+ if (usize < sizeof(struct mce))
+ return -EINVAL;
+
+ rc = apei_read_mce(&m, &record_id);
+ /* Error or no more MCE record */
+ if (rc <= 0) {
+ mce_apei_read_done = 1;
+ /*
+ * When ERST is disabled, mce_chrdev_read() should return
+ * "no record" instead of "no device."
+ */
+ if (rc == -ENODEV)
+ return 0;
+ return rc;
+ }
+ rc = -EFAULT;
+ if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
+ return rc;
+ /*
+ * In fact, we should have cleared the record after that has
+ * been flushed to the disk or sent to network in
+ * /sbin/mcelog, but we have no interface to support that now,
+ * so just clear it to avoid duplication.
+ */
+ rc = apei_clear_mce(record_id);
+ if (rc) {
+ mce_apei_read_done = 1;
+ return rc;
+ }
+ *ubuf += sizeof(struct mce);
+
+ return 0;
+}
+
+static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ char __user *buf = ubuf;
+ unsigned next;
+ int i, err;
+
+ mutex_lock(&mce_chrdev_read_mutex);
+
+ if (!mce_apei_read_done) {
+ err = __mce_read_apei(&buf, usize);
+ if (err || buf != ubuf)
+ goto out;
+ }
+
+ /* Only supports full reads right now */
+ err = -EINVAL;
+ if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
+ goto out;
+
+ next = mcelog.next;
+ err = 0;
+
+ for (i = 0; i < next; i++) {
+ struct mce *m = &mcelog.entry[i];
+
+ err |= copy_to_user(buf, m, sizeof(*m));
+ buf += sizeof(*m);
+ }
+
+ memset(mcelog.entry, 0, next * sizeof(struct mce));
+ mcelog.next = 0;
+
+ if (err)
+ err = -EFAULT;
+
+out:
+ mutex_unlock(&mce_chrdev_read_mutex);
+
+ return err ? err : buf - ubuf;
+}
+
+static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait)
+{
+ poll_wait(file, &mce_chrdev_wait, wait);
+ if (READ_ONCE(mcelog.next))
+ return EPOLLIN | EPOLLRDNORM;
+ if (!mce_apei_read_done && apei_check_mce())
+ return EPOLLIN | EPOLLRDNORM;
+ return 0;
+}
+
+static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+ unsigned long arg)
+{
+ int __user *p = (int __user *)arg;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case MCE_GET_RECORD_LEN:
+ return put_user(sizeof(struct mce), p);
+ case MCE_GET_LOG_LEN:
+ return put_user(MCE_LOG_LEN, p);
+ case MCE_GETCLEAR_FLAGS: {
+ unsigned flags;
+
+ do {
+ flags = mcelog.flags;
+ } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
+
+ return put_user(flags, p);
+ }
+ default:
+ return -ENOTTY;
+ }
+}
+
+void mce_register_injector_chain(struct notifier_block *nb)
+{
+ blocking_notifier_chain_register(&mce_injector_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_register_injector_chain);
+
+void mce_unregister_injector_chain(struct notifier_block *nb)
+{
+ blocking_notifier_chain_unregister(&mce_injector_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_unregister_injector_chain);
+
+static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ struct mce m;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ /*
+ * There are some cases where real MSR reads could slip
+ * through.
+ */
+ if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
+ return -EIO;
+
+ if ((unsigned long)usize > sizeof(struct mce))
+ usize = sizeof(struct mce);
+ if (copy_from_user(&m, ubuf, usize))
+ return -EFAULT;
+
+ if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
+ return -EINVAL;
+
+ /*
+ * Need to give user space some time to set everything up,
+ * so do it a jiffie or two later everywhere.
+ */
+ schedule_timeout(2);
+
+ blocking_notifier_call_chain(&mce_injector_chain, 0, &m);
+
+ return usize;
+}
+
+static const struct file_operations mce_chrdev_ops = {
+ .open = mce_chrdev_open,
+ .release = mce_chrdev_release,
+ .read = mce_chrdev_read,
+ .write = mce_chrdev_write,
+ .poll = mce_chrdev_poll,
+ .unlocked_ioctl = mce_chrdev_ioctl,
+ .llseek = no_llseek,
+};
+
+static struct miscdevice mce_chrdev_device = {
+ MISC_MCELOG_MINOR,
+ "mcelog",
+ &mce_chrdev_ops,
+};
+
+static __init int dev_mcelog_init_device(void)
+{
+ int err;
+
+ /* register character device /dev/mcelog */
+ err = misc_register(&mce_chrdev_device);
+ if (err) {
+ if (err == -EBUSY)
+ /* Xen dom0 might have registered the device already. */
+ pr_info("Unable to init device /dev/mcelog, already registered");
+ else
+ pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+
+ return err;
+ }
+
+ mce_register_decode_chain(&dev_mcelog_nb);
+ return 0;
+}
+device_initcall_sync(dev_mcelog_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
new file mode 100644
index 000000000..2eee85379
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -0,0 +1,157 @@
+/*
+ * Bridge between MCE and APEI
+ *
+ * On some machine, corrected memory errors are reported via APEI
+ * generic hardware error source (GHES) instead of corrected Machine
+ * Check. These corrected memory errors can be reported to user space
+ * through /dev/mcelog via faking a corrected Machine Check, so that
+ * the error memory page can be offlined by /sbin/mcelog if the error
+ * count for one page is beyond the threshold.
+ *
+ * For fatal MCE, save MCE record into persistent storage via ERST, so
+ * that the MCE record can be logged after reboot via ERST.
+ *
+ * Copyright 2010 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/cper.h>
+#include <acpi/apei.h>
+#include <acpi/ghes.h>
+#include <asm/mce.h>
+
+#include "mce-internal.h"
+
+void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
+{
+ struct mce m;
+
+ if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
+ return;
+
+ mce_setup(&m);
+ m.bank = -1;
+ /* Fake a memory read error with unknown channel */
+ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
+
+ if (severity >= GHES_SEV_RECOVERABLE)
+ m.status |= MCI_STATUS_UC;
+
+ if (severity >= GHES_SEV_PANIC) {
+ m.status |= MCI_STATUS_PCC;
+ m.tsc = rdtsc();
+ }
+
+ m.addr = mem_err->physical_addr;
+ mce_log(&m);
+}
+EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
+
+#define CPER_CREATOR_MCE \
+ UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
+ 0x64, 0x90, 0xb8, 0x9d)
+#define CPER_SECTION_TYPE_MCE \
+ UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
+ 0x04, 0x4a, 0x38, 0xfc)
+
+/*
+ * CPER specification (in UEFI specification 2.3 appendix N) requires
+ * byte-packed.
+ */
+struct cper_mce_record {
+ struct cper_record_header hdr;
+ struct cper_section_descriptor sec_hdr;
+ struct mce mce;
+} __packed;
+
+int apei_write_mce(struct mce *m)
+{
+ struct cper_mce_record rcd;
+
+ memset(&rcd, 0, sizeof(rcd));
+ memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
+ rcd.hdr.revision = CPER_RECORD_REV;
+ rcd.hdr.signature_end = CPER_SIG_END;
+ rcd.hdr.section_count = 1;
+ rcd.hdr.error_severity = CPER_SEV_FATAL;
+ /* timestamp, platform_id, partition_id are all invalid */
+ rcd.hdr.validation_bits = 0;
+ rcd.hdr.record_length = sizeof(rcd);
+ rcd.hdr.creator_id = CPER_CREATOR_MCE;
+ rcd.hdr.notification_type = CPER_NOTIFY_MCE;
+ rcd.hdr.record_id = cper_next_record_id();
+ rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
+
+ rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
+ rcd.sec_hdr.section_length = sizeof(rcd.mce);
+ rcd.sec_hdr.revision = CPER_SEC_REV;
+ /* fru_id and fru_text is invalid */
+ rcd.sec_hdr.validation_bits = 0;
+ rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
+ rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
+ rcd.sec_hdr.section_severity = CPER_SEV_FATAL;
+
+ memcpy(&rcd.mce, m, sizeof(*m));
+
+ return erst_write(&rcd.hdr);
+}
+
+ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+ struct cper_mce_record rcd;
+ int rc, pos;
+
+ rc = erst_get_record_id_begin(&pos);
+ if (rc)
+ return rc;
+retry:
+ rc = erst_get_record_id_next(&pos, record_id);
+ if (rc)
+ goto out;
+ /* no more record */
+ if (*record_id == APEI_ERST_INVALID_RECORD_ID)
+ goto out;
+ rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd));
+ /* someone else has cleared the record, try next one */
+ if (rc == -ENOENT)
+ goto retry;
+ else if (rc < 0)
+ goto out;
+ /* try to skip other type records in storage */
+ else if (rc != sizeof(rcd) ||
+ uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE))
+ goto retry;
+ memcpy(m, &rcd.mce, sizeof(*m));
+ rc = sizeof(*m);
+out:
+ erst_get_record_id_end();
+
+ return rc;
+}
+
+/* Check whether there is record in ERST */
+int apei_check_mce(void)
+{
+ return erst_get_record_count();
+}
+
+int apei_clear_mce(u64 record_id)
+{
+ return erst_clear(record_id);
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
new file mode 100644
index 000000000..217cd4449
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
@@ -0,0 +1,145 @@
+/*
+ * MCE event pool management in MCE context
+ *
+ * Copyright (C) 2015 Intel Corp.
+ * Author: Chen, Gong <gong.chen@linux.intel.com>
+ *
+ * This file is licensed under GPLv2.
+ */
+#include <linux/smp.h>
+#include <linux/mm.h>
+#include <linux/genalloc.h>
+#include <linux/llist.h>
+#include "mce-internal.h"
+
+/*
+ * printk() is not safe in MCE context. This is a lock-less memory allocator
+ * used to save error information organized in a lock-less list.
+ *
+ * This memory pool is only to be used to save MCE records in MCE context.
+ * MCE events are rare, so a fixed size memory pool should be enough. Use
+ * 2 pages to save MCE events for now (~80 MCE records at most).
+ */
+#define MCE_POOLSZ (2 * PAGE_SIZE)
+
+static struct gen_pool *mce_evt_pool;
+static LLIST_HEAD(mce_event_llist);
+static char gen_pool_buf[MCE_POOLSZ];
+
+/*
+ * Compare the record "t" with each of the records on list "l" to see if
+ * an equivalent one is present in the list.
+ */
+static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l)
+{
+ struct mce_evt_llist *node;
+ struct mce *m1, *m2;
+
+ m1 = &t->mce;
+
+ llist_for_each_entry(node, &l->llnode, llnode) {
+ m2 = &node->mce;
+
+ if (!mce_cmp(m1, m2))
+ return true;
+ }
+ return false;
+}
+
+/*
+ * The system has panicked - we'd like to peruse the list of MCE records
+ * that have been queued, but not seen by anyone yet. The list is in
+ * reverse time order, so we need to reverse it. While doing that we can
+ * also drop duplicate records (these were logged because some banks are
+ * shared between cores or by all threads on a socket).
+ */
+struct llist_node *mce_gen_pool_prepare_records(void)
+{
+ struct llist_node *head;
+ LLIST_HEAD(new_head);
+ struct mce_evt_llist *node, *t;
+
+ head = llist_del_all(&mce_event_llist);
+ if (!head)
+ return NULL;
+
+ /* squeeze out duplicates while reversing order */
+ llist_for_each_entry_safe(node, t, head, llnode) {
+ if (!is_duplicate_mce_record(node, t))
+ llist_add(&node->llnode, &new_head);
+ }
+
+ return new_head.first;
+}
+
+void mce_gen_pool_process(struct work_struct *__unused)
+{
+ struct llist_node *head;
+ struct mce_evt_llist *node, *tmp;
+ struct mce *mce;
+
+ head = llist_del_all(&mce_event_llist);
+ if (!head)
+ return;
+
+ head = llist_reverse_order(head);
+ llist_for_each_entry_safe(node, tmp, head, llnode) {
+ mce = &node->mce;
+ blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
+ gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
+ }
+}
+
+bool mce_gen_pool_empty(void)
+{
+ return llist_empty(&mce_event_llist);
+}
+
+int mce_gen_pool_add(struct mce *mce)
+{
+ struct mce_evt_llist *node;
+
+ if (!mce_evt_pool)
+ return -EINVAL;
+
+ node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node));
+ if (!node) {
+ pr_warn_ratelimited("MCE records pool full!\n");
+ return -ENOMEM;
+ }
+
+ memcpy(&node->mce, mce, sizeof(*mce));
+ llist_add(&node->llnode, &mce_event_llist);
+
+ return 0;
+}
+
+static int mce_gen_pool_create(void)
+{
+ struct gen_pool *tmpp;
+ int ret = -ENOMEM;
+
+ tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1);
+ if (!tmpp)
+ goto out;
+
+ ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1);
+ if (ret) {
+ gen_pool_destroy(tmpp);
+ goto out;
+ }
+
+ mce_evt_pool = tmpp;
+
+out:
+ return ret;
+}
+
+int mce_gen_pool_init(void)
+{
+ /* Just init mce_gen_pool once. */
+ if (mce_evt_pool)
+ return 0;
+
+ return mce_gen_pool_create();
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
new file mode 100644
index 000000000..14dc3c1f7
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -0,0 +1,739 @@
+/*
+ * Machine check injection support.
+ * Copyright 2008 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Authors:
+ * Andi Kleen
+ * Ying Huang
+ *
+ * The AMD part (from mce_amd_inj.c): a simple MCE injection facility
+ * for testing different aspects of the RAS code. This driver should be
+ * built as module so that it can be loaded on production kernels for
+ * testing purposes.
+ *
+ * This file may be distributed under the terms of the GNU General Public
+ * License version 2.
+ *
+ * Copyright (c) 2010-17: Borislav Petkov <bp@alien8.de>
+ * Advanced Micro Devices Inc.
+ */
+
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+
+#include <asm/amd_nb.h>
+#include <asm/apic.h>
+#include <asm/irq_vectors.h>
+#include <asm/mce.h>
+#include <asm/nmi.h>
+#include <asm/smp.h>
+
+#include "mce-internal.h"
+
+/*
+ * Collect all the MCi_XXX settings
+ */
+static struct mce i_mce;
+static struct dentry *dfs_inj;
+
+#define MAX_FLAG_OPT_SIZE 4
+#define NBCFG 0x44
+
+enum injection_type {
+ SW_INJ = 0, /* SW injection, simply decode the error */
+ HW_INJ, /* Trigger a #MC */
+ DFR_INT_INJ, /* Trigger Deferred error interrupt */
+ THR_INT_INJ, /* Trigger threshold interrupt */
+ N_INJ_TYPES,
+};
+
+static const char * const flags_options[] = {
+ [SW_INJ] = "sw",
+ [HW_INJ] = "hw",
+ [DFR_INT_INJ] = "df",
+ [THR_INT_INJ] = "th",
+ NULL
+};
+
+/* Set default injection to SW_INJ */
+static enum injection_type inj_type = SW_INJ;
+
+#define MCE_INJECT_SET(reg) \
+static int inj_##reg##_set(void *data, u64 val) \
+{ \
+ struct mce *m = (struct mce *)data; \
+ \
+ m->reg = val; \
+ return 0; \
+}
+
+MCE_INJECT_SET(status);
+MCE_INJECT_SET(misc);
+MCE_INJECT_SET(addr);
+MCE_INJECT_SET(synd);
+
+#define MCE_INJECT_GET(reg) \
+static int inj_##reg##_get(void *data, u64 *val) \
+{ \
+ struct mce *m = (struct mce *)data; \
+ \
+ *val = m->reg; \
+ return 0; \
+}
+
+MCE_INJECT_GET(status);
+MCE_INJECT_GET(misc);
+MCE_INJECT_GET(addr);
+MCE_INJECT_GET(synd);
+
+DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
+
+static void setup_inj_struct(struct mce *m)
+{
+ memset(m, 0, sizeof(struct mce));
+
+ m->cpuvendor = boot_cpu_data.x86_vendor;
+ m->time = ktime_get_real_seconds();
+ m->cpuid = cpuid_eax(1);
+ m->microcode = boot_cpu_data.microcode;
+}
+
+/* Update fake mce registers on current CPU. */
+static void inject_mce(struct mce *m)
+{
+ struct mce *i = &per_cpu(injectm, m->extcpu);
+
+ /* Make sure no one reads partially written injectm */
+ i->finished = 0;
+ mb();
+ m->finished = 0;
+ /* First set the fields after finished */
+ i->extcpu = m->extcpu;
+ mb();
+ /* Now write record in order, finished last (except above) */
+ memcpy(i, m, sizeof(struct mce));
+ /* Finally activate it */
+ mb();
+ i->finished = 1;
+}
+
+static void raise_poll(struct mce *m)
+{
+ unsigned long flags;
+ mce_banks_t b;
+
+ memset(&b, 0xff, sizeof(mce_banks_t));
+ local_irq_save(flags);
+ machine_check_poll(0, &b);
+ local_irq_restore(flags);
+ m->finished = 0;
+}
+
+static void raise_exception(struct mce *m, struct pt_regs *pregs)
+{
+ struct pt_regs regs;
+ unsigned long flags;
+
+ if (!pregs) {
+ memset(&regs, 0, sizeof(struct pt_regs));
+ regs.ip = m->ip;
+ regs.cs = m->cs;
+ pregs = &regs;
+ }
+ /* in mcheck exeception handler, irq will be disabled */
+ local_irq_save(flags);
+ do_machine_check(pregs, 0);
+ local_irq_restore(flags);
+ m->finished = 0;
+}
+
+static cpumask_var_t mce_inject_cpumask;
+static DEFINE_MUTEX(mce_inject_mutex);
+
+static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
+{
+ int cpu = smp_processor_id();
+ struct mce *m = this_cpu_ptr(&injectm);
+ if (!cpumask_test_cpu(cpu, mce_inject_cpumask))
+ return NMI_DONE;
+ cpumask_clear_cpu(cpu, mce_inject_cpumask);
+ if (m->inject_flags & MCJ_EXCEPTION)
+ raise_exception(m, regs);
+ else if (m->status)
+ raise_poll(m);
+ return NMI_HANDLED;
+}
+
+static void mce_irq_ipi(void *info)
+{
+ int cpu = smp_processor_id();
+ struct mce *m = this_cpu_ptr(&injectm);
+
+ if (cpumask_test_cpu(cpu, mce_inject_cpumask) &&
+ m->inject_flags & MCJ_EXCEPTION) {
+ cpumask_clear_cpu(cpu, mce_inject_cpumask);
+ raise_exception(m, NULL);
+ }
+}
+
+/* Inject mce on current CPU */
+static int raise_local(void)
+{
+ struct mce *m = this_cpu_ptr(&injectm);
+ int context = MCJ_CTX(m->inject_flags);
+ int ret = 0;
+ int cpu = m->extcpu;
+
+ if (m->inject_flags & MCJ_EXCEPTION) {
+ pr_info("Triggering MCE exception on CPU %d\n", cpu);
+ switch (context) {
+ case MCJ_CTX_IRQ:
+ /*
+ * Could do more to fake interrupts like
+ * calling irq_enter, but the necessary
+ * machinery isn't exported currently.
+ */
+ /*FALL THROUGH*/
+ case MCJ_CTX_PROCESS:
+ raise_exception(m, NULL);
+ break;
+ default:
+ pr_info("Invalid MCE context\n");
+ ret = -EINVAL;
+ }
+ pr_info("MCE exception done on CPU %d\n", cpu);
+ } else if (m->status) {
+ pr_info("Starting machine check poll CPU %d\n", cpu);
+ raise_poll(m);
+ mce_notify_irq();
+ pr_info("Machine check poll done on CPU %d\n", cpu);
+ } else
+ m->finished = 0;
+
+ return ret;
+}
+
+static void __maybe_unused raise_mce(struct mce *m)
+{
+ int context = MCJ_CTX(m->inject_flags);
+
+ inject_mce(m);
+
+ if (context == MCJ_CTX_RANDOM)
+ return;
+
+ if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) {
+ unsigned long start;
+ int cpu;
+
+ get_online_cpus();
+ cpumask_copy(mce_inject_cpumask, cpu_online_mask);
+ cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
+ for_each_online_cpu(cpu) {
+ struct mce *mcpu = &per_cpu(injectm, cpu);
+ if (!mcpu->finished ||
+ MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
+ cpumask_clear_cpu(cpu, mce_inject_cpumask);
+ }
+ if (!cpumask_empty(mce_inject_cpumask)) {
+ if (m->inject_flags & MCJ_IRQ_BROADCAST) {
+ /*
+ * don't wait because mce_irq_ipi is necessary
+ * to be sync with following raise_local
+ */
+ preempt_disable();
+ smp_call_function_many(mce_inject_cpumask,
+ mce_irq_ipi, NULL, 0);
+ preempt_enable();
+ } else if (m->inject_flags & MCJ_NMI_BROADCAST)
+ apic->send_IPI_mask(mce_inject_cpumask,
+ NMI_VECTOR);
+ }
+ start = jiffies;
+ while (!cpumask_empty(mce_inject_cpumask)) {
+ if (!time_before(jiffies, start + 2*HZ)) {
+ pr_err("Timeout waiting for mce inject %lx\n",
+ *cpumask_bits(mce_inject_cpumask));
+ break;
+ }
+ cpu_relax();
+ }
+ raise_local();
+ put_cpu();
+ put_online_cpus();
+ } else {
+ preempt_disable();
+ raise_local();
+ preempt_enable();
+ }
+}
+
+static int mce_inject_raise(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (!m)
+ return NOTIFY_DONE;
+
+ mutex_lock(&mce_inject_mutex);
+ raise_mce(m);
+ mutex_unlock(&mce_inject_mutex);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block inject_nb = {
+ .notifier_call = mce_inject_raise,
+};
+
+/*
+ * Caller needs to be make sure this cpu doesn't disappear
+ * from under us, i.e.: get_cpu/put_cpu.
+ */
+static int toggle_hw_mce_inject(unsigned int cpu, bool enable)
+{
+ u32 l, h;
+ int err;
+
+ err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h);
+ if (err) {
+ pr_err("%s: error reading HWCR\n", __func__);
+ return err;
+ }
+
+ enable ? (l |= BIT(18)) : (l &= ~BIT(18));
+
+ err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h);
+ if (err)
+ pr_err("%s: error writing HWCR\n", __func__);
+
+ return err;
+}
+
+static int __set_inj(const char *buf)
+{
+ int i;
+
+ for (i = 0; i < N_INJ_TYPES; i++) {
+ if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) {
+ inj_type = i;
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+static ssize_t flags_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[MAX_FLAG_OPT_SIZE];
+ int n;
+
+ n = sprintf(buf, "%s\n", flags_options[inj_type]);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
+}
+
+static ssize_t flags_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[MAX_FLAG_OPT_SIZE], *__buf;
+ int err;
+
+ if (!cnt || cnt > MAX_FLAG_OPT_SIZE)
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt - 1] = 0;
+
+ /* strip whitespace */
+ __buf = strstrip(buf);
+
+ err = __set_inj(__buf);
+ if (err) {
+ pr_err("%s: Invalid flags value: %s\n", __func__, __buf);
+ return err;
+ }
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static const struct file_operations flags_fops = {
+ .read = flags_read,
+ .write = flags_write,
+ .llseek = generic_file_llseek,
+};
+
+/*
+ * On which CPU to inject?
+ */
+MCE_INJECT_GET(extcpu);
+
+static int inj_extcpu_set(void *data, u64 val)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (val >= nr_cpu_ids || !cpu_online(val)) {
+ pr_err("%s: Invalid CPU: %llu\n", __func__, val);
+ return -EINVAL;
+ }
+ m->extcpu = val;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n");
+
+static void trigger_mce(void *info)
+{
+ asm volatile("int $18");
+}
+
+static void trigger_dfr_int(void *info)
+{
+ asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR));
+}
+
+static void trigger_thr_int(void *info)
+{
+ asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR));
+}
+
+static u32 get_nbc_for_node(int node_id)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ u32 cores_per_node;
+
+ cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket();
+
+ return cores_per_node * node_id;
+}
+
+static void toggle_nb_mca_mst_cpu(u16 nid)
+{
+ struct amd_northbridge *nb;
+ struct pci_dev *F3;
+ u32 val;
+ int err;
+
+ nb = node_to_amd_nb(nid);
+ if (!nb)
+ return;
+
+ F3 = nb->misc;
+ if (!F3)
+ return;
+
+ err = pci_read_config_dword(F3, NBCFG, &val);
+ if (err) {
+ pr_err("%s: Error reading F%dx%03x.\n",
+ __func__, PCI_FUNC(F3->devfn), NBCFG);
+ return;
+ }
+
+ if (val & BIT(27))
+ return;
+
+ pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n",
+ __func__);
+
+ val |= BIT(27);
+ err = pci_write_config_dword(F3, NBCFG, val);
+ if (err)
+ pr_err("%s: Error writing F%dx%03x.\n",
+ __func__, PCI_FUNC(F3->devfn), NBCFG);
+}
+
+static void prepare_msrs(void *info)
+{
+ struct mce m = *(struct mce *)info;
+ u8 b = m.bank;
+
+ wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+
+ if (boot_cpu_has(X86_FEATURE_SMCA)) {
+ if (m.inject_flags == DFR_INT_INJ) {
+ wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
+ wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
+ } else {
+ wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
+ wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
+ }
+
+ wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
+ wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
+ } else {
+ wrmsrl(MSR_IA32_MCx_STATUS(b), m.status);
+ wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr);
+ wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
+ }
+}
+
+static void do_inject(void)
+{
+ u64 mcg_status = 0;
+ unsigned int cpu = i_mce.extcpu;
+ u8 b = i_mce.bank;
+
+ i_mce.tsc = rdtsc_ordered();
+
+ if (i_mce.misc)
+ i_mce.status |= MCI_STATUS_MISCV;
+
+ if (i_mce.synd)
+ i_mce.status |= MCI_STATUS_SYNDV;
+
+ if (inj_type == SW_INJ) {
+ mce_inject_log(&i_mce);
+ return;
+ }
+
+ /* prep MCE global settings for the injection */
+ mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+
+ if (!(i_mce.status & MCI_STATUS_PCC))
+ mcg_status |= MCG_STATUS_RIPV;
+
+ /*
+ * Ensure necessary status bits for deferred errors:
+ * - MCx_STATUS[Deferred]: make sure it is a deferred error
+ * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC
+ */
+ if (inj_type == DFR_INT_INJ) {
+ i_mce.status |= MCI_STATUS_DEFERRED;
+ i_mce.status &= ~MCI_STATUS_UC;
+ }
+
+ /*
+ * For multi node CPUs, logging and reporting of bank 4 errors happens
+ * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for
+ * Fam10h and later BKDGs.
+ */
+ if (static_cpu_has(X86_FEATURE_AMD_DCM) &&
+ b == 4 &&
+ boot_cpu_data.x86 < 0x17) {
+ toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu));
+ cpu = get_nbc_for_node(amd_get_nb_id(cpu));
+ }
+
+ get_online_cpus();
+ if (!cpu_online(cpu))
+ goto err;
+
+ toggle_hw_mce_inject(cpu, true);
+
+ i_mce.mcgstatus = mcg_status;
+ i_mce.inject_flags = inj_type;
+ smp_call_function_single(cpu, prepare_msrs, &i_mce, 0);
+
+ toggle_hw_mce_inject(cpu, false);
+
+ switch (inj_type) {
+ case DFR_INT_INJ:
+ smp_call_function_single(cpu, trigger_dfr_int, NULL, 0);
+ break;
+ case THR_INT_INJ:
+ smp_call_function_single(cpu, trigger_thr_int, NULL, 0);
+ break;
+ default:
+ smp_call_function_single(cpu, trigger_mce, NULL, 0);
+ }
+
+err:
+ put_online_cpus();
+
+}
+
+/*
+ * This denotes into which bank we're injecting and triggers
+ * the injection, at the same time.
+ */
+static int inj_bank_set(void *data, u64 val)
+{
+ struct mce *m = (struct mce *)data;
+ u8 n_banks;
+ u64 cap;
+
+ /* Get bank count on target CPU so we can handle non-uniform values. */
+ rdmsrl_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap);
+ n_banks = cap & MCG_BANKCNT_MASK;
+
+ if (val >= n_banks) {
+ pr_err("MCA bank %llu non-existent on CPU%d\n", val, m->extcpu);
+ return -EINVAL;
+ }
+
+ m->bank = val;
+ do_inject();
+
+ /* Reset injection struct */
+ setup_inj_struct(&i_mce);
+
+ return 0;
+}
+
+MCE_INJECT_GET(bank);
+
+DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n");
+
+static const char readme_msg[] =
+"Description of the files and their usages:\n"
+"\n"
+"Note1: i refers to the bank number below.\n"
+"Note2: See respective BKDGs for the exact bit definitions of the files below\n"
+"as they mirror the hardware registers.\n"
+"\n"
+"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n"
+"\t attributes of the error which caused the MCE.\n"
+"\n"
+"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n"
+"\t used for error thresholding purposes and its validity is indicated by\n"
+"\t MCi_STATUS[MiscV].\n"
+"\n"
+"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n"
+"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n"
+"\n"
+"addr:\t Error address value to be written to MCi_ADDR. Log address information\n"
+"\t associated with the error.\n"
+"\n"
+"cpu:\t The CPU to inject the error on.\n"
+"\n"
+"bank:\t Specify the bank you want to inject the error into: the number of\n"
+"\t banks in a processor varies and is family/model-specific, therefore, the\n"
+"\t supplied value is sanity-checked. Setting the bank value also triggers the\n"
+"\t injection.\n"
+"\n"
+"flags:\t Injection type to be performed. Writing to this file will trigger a\n"
+"\t real machine check, an APIC interrupt or invoke the error decoder routines\n"
+"\t for AMD processors.\n"
+"\n"
+"\t Allowed error injection types:\n"
+"\t - \"sw\": Software error injection. Decode error to a human-readable \n"
+"\t format only. Safe to use.\n"
+"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n"
+"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n"
+"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n"
+"\t before injecting.\n"
+"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n"
+"\t error APIC interrupt handler to handle the error if the feature is \n"
+"\t is present in hardware. \n"
+"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n"
+"\t APIC interrupt handler to handle the error. \n"
+"\n";
+
+static ssize_t
+inj_readme_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return simple_read_from_buffer(ubuf, cnt, ppos,
+ readme_msg, strlen(readme_msg));
+}
+
+static const struct file_operations readme_fops = {
+ .read = inj_readme_read,
+};
+
+static struct dfs_node {
+ char *name;
+ struct dentry *d;
+ const struct file_operations *fops;
+ umode_t perm;
+} dfs_fls[] = {
+ { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH },
+};
+
+static int __init debugfs_init(void)
+{
+ unsigned int i;
+
+ dfs_inj = debugfs_create_dir("mce-inject", NULL);
+ if (!dfs_inj)
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) {
+ dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name,
+ dfs_fls[i].perm,
+ dfs_inj,
+ &i_mce,
+ dfs_fls[i].fops);
+
+ if (!dfs_fls[i].d)
+ goto err_dfs_add;
+ }
+
+ return 0;
+
+err_dfs_add:
+ while (i-- > 0)
+ debugfs_remove(dfs_fls[i].d);
+
+ debugfs_remove(dfs_inj);
+ dfs_inj = NULL;
+
+ return -ENODEV;
+}
+
+static int __init inject_init(void)
+{
+ int err;
+
+ if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = debugfs_init();
+ if (err) {
+ free_cpumask_var(mce_inject_cpumask);
+ return err;
+ }
+
+ register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify");
+ mce_register_injector_chain(&inject_nb);
+
+ setup_inj_struct(&i_mce);
+
+ pr_info("Machine check injector initialized\n");
+
+ return 0;
+}
+
+static void __exit inject_exit(void)
+{
+
+ mce_unregister_injector_chain(&inject_nb);
+ unregister_nmi_handler(NMI_LOCAL, "mce_notify");
+
+ debugfs_remove_recursive(dfs_inj);
+ dfs_inj = NULL;
+
+ memset(&dfs_fls, 0, sizeof(dfs_fls));
+
+ free_cpumask_var(mce_inject_cpumask);
+}
+
+module_init(inject_init);
+module_exit(inject_exit);
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
new file mode 100644
index 000000000..ceb67cd59
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -0,0 +1,173 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_MCE_INTERNAL_H__
+#define __X86_MCE_INTERNAL_H__
+
+#include <linux/device.h>
+#include <asm/mce.h>
+
+enum severity_level {
+ MCE_NO_SEVERITY,
+ MCE_DEFERRED_SEVERITY,
+ MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY,
+ MCE_KEEP_SEVERITY,
+ MCE_SOME_SEVERITY,
+ MCE_AO_SEVERITY,
+ MCE_UC_SEVERITY,
+ MCE_AR_SEVERITY,
+ MCE_PANIC_SEVERITY,
+};
+
+extern struct blocking_notifier_head x86_mce_decoder_chain;
+
+#define ATTR_LEN 16
+#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
+
+/* One object for each MCE bank, shared by all CPUs */
+struct mce_bank {
+ u64 ctl; /* subevents to enable */
+ unsigned char init; /* initialise bank? */
+ struct device_attribute attr; /* device attribute */
+ char attrname[ATTR_LEN]; /* attribute name */
+};
+
+struct mce_evt_llist {
+ struct llist_node llnode;
+ struct mce mce;
+};
+
+void mce_gen_pool_process(struct work_struct *__unused);
+bool mce_gen_pool_empty(void);
+int mce_gen_pool_add(struct mce *mce);
+int mce_gen_pool_init(void);
+struct llist_node *mce_gen_pool_prepare_records(void);
+
+extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
+struct dentry *mce_get_debugfs_dir(void);
+
+extern struct mce_bank *mce_banks;
+extern mce_banks_t mce_banks_ce_disabled;
+
+#ifdef CONFIG_X86_MCE_INTEL
+unsigned long cmci_intel_adjust_timer(unsigned long interval);
+bool mce_intel_cmci_poll(void);
+void mce_intel_hcpu_update(unsigned long cpu);
+void cmci_disable_bank(int bank);
+#else
+# define cmci_intel_adjust_timer mce_adjust_timer_default
+static inline bool mce_intel_cmci_poll(void) { return false; }
+static inline void mce_intel_hcpu_update(unsigned long cpu) { }
+static inline void cmci_disable_bank(int bank) { }
+#endif
+
+void mce_timer_kick(unsigned long interval);
+
+#ifdef CONFIG_ACPI_APEI
+int apei_write_mce(struct mce *m);
+ssize_t apei_read_mce(struct mce *m, u64 *record_id);
+int apei_check_mce(void);
+int apei_clear_mce(u64 record_id);
+#else
+static inline int apei_write_mce(struct mce *m)
+{
+ return -EINVAL;
+}
+static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+ return 0;
+}
+static inline int apei_check_mce(void)
+{
+ return 0;
+}
+static inline int apei_clear_mce(u64 record_id)
+{
+ return -EINVAL;
+}
+#endif
+
+void mce_inject_log(struct mce *m);
+
+/*
+ * We consider records to be equivalent if bank+status+addr+misc all match.
+ * This is only used when the system is going down because of a fatal error
+ * to avoid cluttering the console log with essentially repeated information.
+ * In normal processing all errors seen are logged.
+ */
+static inline bool mce_cmp(struct mce *m1, struct mce *m2)
+{
+ return m1->bank != m2->bank ||
+ m1->status != m2->status ||
+ m1->addr != m2->addr ||
+ m1->misc != m2->misc;
+}
+
+extern struct device_attribute dev_attr_trigger;
+
+#ifdef CONFIG_X86_MCELOG_LEGACY
+void mce_work_trigger(void);
+void mce_register_injector_chain(struct notifier_block *nb);
+void mce_unregister_injector_chain(struct notifier_block *nb);
+#else
+static inline void mce_work_trigger(void) { }
+static inline void mce_register_injector_chain(struct notifier_block *nb) { }
+static inline void mce_unregister_injector_chain(struct notifier_block *nb) { }
+#endif
+
+struct mca_config {
+ bool dont_log_ce;
+ bool cmci_disabled;
+ bool ignore_ce;
+
+ __u64 lmce_disabled : 1,
+ disabled : 1,
+ ser : 1,
+ recovery : 1,
+ bios_cmci_threshold : 1,
+ __reserved : 59;
+
+ u8 banks;
+ s8 bootlog;
+ int tolerant;
+ int monarch_timeout;
+ int panic_timeout;
+ u32 rip_msr;
+};
+
+extern struct mca_config mca_cfg;
+
+struct mce_vendor_flags {
+ /*
+ * Indicates that overflow conditions are not fatal, when set.
+ */
+ __u64 overflow_recov : 1,
+
+ /*
+ * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and
+ * Recovery. It indicates support for data poisoning in HW and deferred
+ * error interrupts.
+ */
+ succor : 1,
+
+ /*
+ * (AMD) SMCA: This bit indicates support for Scalable MCA which expands
+ * the register space for each MCA bank and also increases number of
+ * banks. Also, to accommodate the new banks and registers, the MCA
+ * register space is moved to a new MSR range.
+ */
+ smca : 1,
+
+ __reserved_0 : 61;
+};
+
+extern struct mce_vendor_flags mce_flags;
+
+struct mca_msr_regs {
+ u32 (*ctl) (int bank);
+ u32 (*status) (int bank);
+ u32 (*addr) (int bank);
+ u32 (*misc) (int bank);
+};
+
+extern struct mca_msr_regs msr_ops;
+
+#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
new file mode 100644
index 000000000..06f7c04a4
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -0,0 +1,423 @@
+/*
+ * MCE grading rules.
+ * Copyright 2008, 2009 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Author: Andi Kleen
+ */
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <asm/mce.h>
+#include <linux/uaccess.h>
+
+#include "mce-internal.h"
+
+/*
+ * Grade an mce by severity. In general the most severe ones are processed
+ * first. Since there are quite a lot of combinations test the bits in a
+ * table-driven way. The rules are simply processed in order, first
+ * match wins.
+ *
+ * Note this is only used for machine check exceptions, the corrected
+ * errors use much simpler rules. The exceptions still check for the corrected
+ * errors, but only to leave them alone for the CMCI handler (except for
+ * panic situations)
+ */
+
+enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 };
+enum ser { SER_REQUIRED = 1, NO_SER = 2 };
+enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
+
+static struct severity {
+ u64 mask;
+ u64 result;
+ unsigned char sev;
+ unsigned char mcgmask;
+ unsigned char mcgres;
+ unsigned char ser;
+ unsigned char context;
+ unsigned char excp;
+ unsigned char covered;
+ char *msg;
+} severities[] = {
+#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
+#define KERNEL .context = IN_KERNEL
+#define USER .context = IN_USER
+#define KERNEL_RECOV .context = IN_KERNEL_RECOV
+#define SER .ser = SER_REQUIRED
+#define NOSER .ser = NO_SER
+#define EXCP .excp = EXCP_CONTEXT
+#define NOEXCP .excp = NO_EXCP
+#define BITCLR(x) .mask = x, .result = 0
+#define BITSET(x) .mask = x, .result = x
+#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
+#define MASK(x, y) .mask = x, .result = y
+#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
+#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
+#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
+#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
+
+ MCESEV(
+ NO, "Invalid",
+ BITCLR(MCI_STATUS_VAL)
+ ),
+ MCESEV(
+ NO, "Not enabled",
+ EXCP, BITCLR(MCI_STATUS_EN)
+ ),
+ MCESEV(
+ PANIC, "Processor context corrupt",
+ BITSET(MCI_STATUS_PCC)
+ ),
+ /* When MCIP is not set something is very confused */
+ MCESEV(
+ PANIC, "MCIP not set in MCA handler",
+ EXCP, MCGMASK(MCG_STATUS_MCIP, 0)
+ ),
+ /* Neither return not error IP -- no chance to recover -> PANIC */
+ MCESEV(
+ PANIC, "Neither restart nor error IP",
+ EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
+ ),
+ MCESEV(
+ PANIC, "In kernel and no restart IP",
+ EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
+ ),
+ MCESEV(
+ PANIC, "In kernel and no restart IP",
+ EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
+ ),
+ MCESEV(
+ DEFERRED, "Deferred error",
+ NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
+ ),
+ MCESEV(
+ KEEP, "Corrected error",
+ NOSER, BITCLR(MCI_STATUS_UC)
+ ),
+
+ /*
+ * known AO MCACODs reported via MCE or CMC:
+ *
+ * SRAO could be signaled either via a machine check exception or
+ * CMCI with the corresponding bit S 1 or 0. So we don't need to
+ * check bit S for SRAO.
+ */
+ MCESEV(
+ AO, "Action optional: memory scrubbing error",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB)
+ ),
+ MCESEV(
+ AO, "Action optional: last level cache writeback error",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
+ ),
+
+ /* ignore OVER for UCNA */
+ MCESEV(
+ UCNA, "Uncorrected no action required",
+ SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
+ ),
+ MCESEV(
+ PANIC, "Illegal combination (UCNA with AR=1)",
+ SER,
+ MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
+ ),
+ MCESEV(
+ KEEP, "Non signalled machine check",
+ SER, BITCLR(MCI_STATUS_S)
+ ),
+
+ MCESEV(
+ PANIC, "Action required with lost events",
+ SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
+ ),
+
+ /* known AR MCACODs: */
+#ifdef CONFIG_MEMORY_FAILURE
+ MCESEV(
+ KEEP, "Action required but unaffected thread is continuable",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR),
+ MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
+ ),
+ MCESEV(
+ AR, "Action required: data load in error recoverable area of kernel",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ KERNEL_RECOV
+ ),
+ MCESEV(
+ AR, "Action required: data load error in a user process",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ USER
+ ),
+ MCESEV(
+ AR, "Action required: instruction fetch error in a user process",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ USER
+ ),
+ MCESEV(
+ PANIC, "Data load in unrecoverable area of kernel",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ KERNEL
+ ),
+ MCESEV(
+ PANIC, "Instruction fetch error in kernel",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ KERNEL
+ ),
+#endif
+ MCESEV(
+ PANIC, "Action required: unknown MCACOD",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
+ ),
+
+ MCESEV(
+ SOME, "Action optional: unknown MCACOD",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
+ ),
+ MCESEV(
+ SOME, "Action optional with lost events",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
+ ),
+
+ MCESEV(
+ PANIC, "Overflowed uncorrected",
+ BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
+ ),
+ MCESEV(
+ UC, "Uncorrected",
+ BITSET(MCI_STATUS_UC)
+ ),
+ MCESEV(
+ SOME, "No match",
+ BITSET(0)
+ ) /* always matches. keep at end */
+};
+
+#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \
+ (MCG_STATUS_RIPV|MCG_STATUS_EIPV))
+
+/*
+ * If mcgstatus indicated that ip/cs on the stack were
+ * no good, then "m->cs" will be zero and we will have
+ * to assume the worst case (IN_KERNEL) as we actually
+ * have no idea what we were executing when the machine
+ * check hit.
+ * If we do have a good "m->cs" (or a faked one in the
+ * case we were executing in VM86 mode) we can use it to
+ * distinguish an exception taken in user from from one
+ * taken in the kernel.
+ */
+static int error_context(struct mce *m)
+{
+ if ((m->cs & 3) == 3)
+ return IN_USER;
+ if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip))
+ return IN_KERNEL_RECOV;
+ return IN_KERNEL;
+}
+
+static int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
+{
+ u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
+ u32 low, high;
+
+ /*
+ * We need to look at the following bits:
+ * - "succor" bit (data poisoning support), and
+ * - TCC bit (Task Context Corrupt)
+ * in MCi_STATUS to determine error severity.
+ */
+ if (!mce_flags.succor)
+ return MCE_PANIC_SEVERITY;
+
+ if (rdmsr_safe(addr, &low, &high))
+ return MCE_PANIC_SEVERITY;
+
+ /* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */
+ if ((low & MCI_CONFIG_MCAX) &&
+ (m->status & MCI_STATUS_TCC) &&
+ (err_ctx == IN_KERNEL))
+ return MCE_PANIC_SEVERITY;
+
+ /* ...otherwise invoke hwpoison handler. */
+ return MCE_AR_SEVERITY;
+}
+
+/*
+ * See AMD Error Scope Hierarchy table in a newer BKDG. For example
+ * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
+ */
+static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp)
+{
+ enum context ctx = error_context(m);
+
+ /* Processor Context Corrupt, no need to fumble too much, die! */
+ if (m->status & MCI_STATUS_PCC)
+ return MCE_PANIC_SEVERITY;
+
+ if (m->status & MCI_STATUS_UC) {
+
+ if (ctx == IN_KERNEL)
+ return MCE_PANIC_SEVERITY;
+
+ /*
+ * On older systems where overflow_recov flag is not present, we
+ * should simply panic if an error overflow occurs. If
+ * overflow_recov flag is present and set, then software can try
+ * to at least kill process to prolong system operation.
+ */
+ if (mce_flags.overflow_recov) {
+ if (mce_flags.smca)
+ return mce_severity_amd_smca(m, ctx);
+
+ /* kill current process */
+ return MCE_AR_SEVERITY;
+ } else {
+ /* at least one error was not logged */
+ if (m->status & MCI_STATUS_OVER)
+ return MCE_PANIC_SEVERITY;
+ }
+
+ /*
+ * For any other case, return MCE_UC_SEVERITY so that we log the
+ * error and exit #MC handler.
+ */
+ return MCE_UC_SEVERITY;
+ }
+
+ /*
+ * deferred error: poll handler catches these and adds to mce_ring so
+ * memory-failure can take recovery actions.
+ */
+ if (m->status & MCI_STATUS_DEFERRED)
+ return MCE_DEFERRED_SEVERITY;
+
+ /*
+ * corrected error: poll handler catches these and passes responsibility
+ * of decoding the error to EDAC
+ */
+ return MCE_KEEP_SEVERITY;
+}
+
+static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp)
+{
+ enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
+ enum context ctx = error_context(m);
+ struct severity *s;
+
+ for (s = severities;; s++) {
+ if ((m->status & s->mask) != s->result)
+ continue;
+ if ((m->mcgstatus & s->mcgmask) != s->mcgres)
+ continue;
+ if (s->ser == SER_REQUIRED && !mca_cfg.ser)
+ continue;
+ if (s->ser == NO_SER && mca_cfg.ser)
+ continue;
+ if (s->context && ctx != s->context)
+ continue;
+ if (s->excp && excp != s->excp)
+ continue;
+ if (msg)
+ *msg = s->msg;
+ s->covered = 1;
+ if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
+ if (tolerant < 1)
+ return MCE_PANIC_SEVERITY;
+ }
+ return s->sev;
+ }
+}
+
+/* Default to mce_severity_intel */
+int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
+ mce_severity_intel;
+
+void __init mcheck_vendor_init_severity(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ mce_severity = mce_severity_amd;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static void *s_start(struct seq_file *f, loff_t *pos)
+{
+ if (*pos >= ARRAY_SIZE(severities))
+ return NULL;
+ return &severities[*pos];
+}
+
+static void *s_next(struct seq_file *f, void *data, loff_t *pos)
+{
+ if (++(*pos) >= ARRAY_SIZE(severities))
+ return NULL;
+ return &severities[*pos];
+}
+
+static void s_stop(struct seq_file *f, void *data)
+{
+}
+
+static int s_show(struct seq_file *f, void *data)
+{
+ struct severity *ser = data;
+ seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
+ return 0;
+}
+
+static const struct seq_operations severities_seq_ops = {
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show,
+};
+
+static int severities_coverage_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &severities_seq_ops);
+}
+
+static ssize_t severities_coverage_write(struct file *file,
+ const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(severities); i++)
+ severities[i].covered = 0;
+ return count;
+}
+
+static const struct file_operations severities_coverage_fops = {
+ .open = severities_coverage_open,
+ .release = seq_release,
+ .read = seq_read,
+ .write = severities_coverage_write,
+ .llseek = seq_lseek,
+};
+
+static int __init severities_debugfs_init(void)
+{
+ struct dentry *dmce, *fsev;
+
+ dmce = mce_get_debugfs_dir();
+ if (!dmce)
+ goto err_out;
+
+ fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
+ &severities_coverage_fops);
+ if (!fsev)
+ goto err_out;
+
+ return 0;
+
+err_out:
+ return -ENOMEM;
+}
+late_initcall(severities_debugfs_init);
+#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
new file mode 100644
index 000000000..8f36ccf26
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -0,0 +1,2506 @@
+/*
+ * Machine check handler.
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/thread_info.h>
+#include <linux/capability.h>
+#include <linux/miscdevice.h>
+#include <linux/ratelimit.h>
+#include <linux/rcupdate.h>
+#include <linux/kobject.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/string.h>
+#include <linux/device.h>
+#include <linux/syscore_ops.h>
+#include <linux/delay.h>
+#include <linux/ctype.h>
+#include <linux/sched.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+#include <linux/nmi.h>
+#include <linux/cpu.h>
+#include <linux/ras.h>
+#include <linux/smp.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/debugfs.h>
+#include <linux/irq_work.h>
+#include <linux/export.h>
+#include <linux/jump_label.h>
+#include <linux/set_memory.h>
+
+#include <asm/intel-family.h>
+#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/reboot.h>
+
+#include "mce-internal.h"
+
+static DEFINE_MUTEX(mce_log_mutex);
+
+/* sysfs synchronization */
+static DEFINE_MUTEX(mce_sysfs_mutex);
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/mce.h>
+
+#define SPINUNIT 100 /* 100ns */
+
+DEFINE_PER_CPU(unsigned, mce_exception_count);
+
+struct mce_bank *mce_banks __read_mostly;
+struct mce_vendor_flags mce_flags __read_mostly;
+
+struct mca_config mca_cfg __read_mostly = {
+ .bootlog = -1,
+ /*
+ * Tolerant levels:
+ * 0: always panic on uncorrected errors, log corrected errors
+ * 1: panic or SIGBUS on uncorrected errors, log corrected errors
+ * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
+ * 3: never panic or SIGBUS, log all errors (for testing only)
+ */
+ .tolerant = 1,
+ .monarch_timeout = -1
+};
+
+static DEFINE_PER_CPU(struct mce, mces_seen);
+static unsigned long mce_need_notify;
+static int cpu_missing;
+
+/*
+ * MCA banks polled by the period polling timer for corrected events.
+ * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
+ */
+DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
+ [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
+};
+
+/*
+ * MCA banks controlled through firmware first for corrected errors.
+ * This is a global list of banks for which we won't enable CMCI and we
+ * won't poll. Firmware controls these banks and is responsible for
+ * reporting corrected errors through GHES. Uncorrected/recoverable
+ * errors are still notified through a machine check.
+ */
+mce_banks_t mce_banks_ce_disabled;
+
+static struct work_struct mce_work;
+static struct irq_work mce_irq_work;
+
+static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
+
+/*
+ * CPU/chipset specific EDAC code can register a notifier call here to print
+ * MCE errors in a human-readable form.
+ */
+BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
+
+/* Do initial initialization of a struct mce */
+void mce_setup(struct mce *m)
+{
+ memset(m, 0, sizeof(struct mce));
+ m->cpu = m->extcpu = smp_processor_id();
+ /* need the internal __ version to avoid deadlocks */
+ m->time = __ktime_get_real_seconds();
+ m->cpuvendor = boot_cpu_data.x86_vendor;
+ m->cpuid = cpuid_eax(1);
+ m->socketid = cpu_data(m->extcpu).phys_proc_id;
+ m->apicid = cpu_data(m->extcpu).initial_apicid;
+ rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
+
+ if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
+ rdmsrl(MSR_PPIN, m->ppin);
+
+ m->microcode = boot_cpu_data.microcode;
+}
+
+DEFINE_PER_CPU(struct mce, injectm);
+EXPORT_PER_CPU_SYMBOL_GPL(injectm);
+
+void mce_log(struct mce *m)
+{
+ if (!mce_gen_pool_add(m))
+ irq_work_queue(&mce_irq_work);
+}
+
+void mce_inject_log(struct mce *m)
+{
+ mutex_lock(&mce_log_mutex);
+ mce_log(m);
+ mutex_unlock(&mce_log_mutex);
+}
+EXPORT_SYMBOL_GPL(mce_inject_log);
+
+static struct notifier_block mce_srao_nb;
+
+/*
+ * We run the default notifier if we have only the SRAO, the first and the
+ * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
+ * notifiers registered on the chain.
+ */
+#define NUM_DEFAULT_NOTIFIERS 3
+static atomic_t num_notifiers;
+
+void mce_register_decode_chain(struct notifier_block *nb)
+{
+ if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
+ return;
+
+ atomic_inc(&num_notifiers);
+
+ blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_register_decode_chain);
+
+void mce_unregister_decode_chain(struct notifier_block *nb)
+{
+ atomic_dec(&num_notifiers);
+
+ blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
+
+static inline u32 ctl_reg(int bank)
+{
+ return MSR_IA32_MCx_CTL(bank);
+}
+
+static inline u32 status_reg(int bank)
+{
+ return MSR_IA32_MCx_STATUS(bank);
+}
+
+static inline u32 addr_reg(int bank)
+{
+ return MSR_IA32_MCx_ADDR(bank);
+}
+
+static inline u32 misc_reg(int bank)
+{
+ return MSR_IA32_MCx_MISC(bank);
+}
+
+static inline u32 smca_ctl_reg(int bank)
+{
+ return MSR_AMD64_SMCA_MCx_CTL(bank);
+}
+
+static inline u32 smca_status_reg(int bank)
+{
+ return MSR_AMD64_SMCA_MCx_STATUS(bank);
+}
+
+static inline u32 smca_addr_reg(int bank)
+{
+ return MSR_AMD64_SMCA_MCx_ADDR(bank);
+}
+
+static inline u32 smca_misc_reg(int bank)
+{
+ return MSR_AMD64_SMCA_MCx_MISC(bank);
+}
+
+struct mca_msr_regs msr_ops = {
+ .ctl = ctl_reg,
+ .status = status_reg,
+ .addr = addr_reg,
+ .misc = misc_reg
+};
+
+static void __print_mce(struct mce *m)
+{
+ pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
+ m->extcpu,
+ (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
+ m->mcgstatus, m->bank, m->status);
+
+ if (m->ip) {
+ pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
+ !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
+ m->cs, m->ip);
+
+ if (m->cs == __KERNEL_CS)
+ pr_cont("{%pS}", (void *)(unsigned long)m->ip);
+ pr_cont("\n");
+ }
+
+ pr_emerg(HW_ERR "TSC %llx ", m->tsc);
+ if (m->addr)
+ pr_cont("ADDR %llx ", m->addr);
+ if (m->misc)
+ pr_cont("MISC %llx ", m->misc);
+
+ if (mce_flags.smca) {
+ if (m->synd)
+ pr_cont("SYND %llx ", m->synd);
+ if (m->ipid)
+ pr_cont("IPID %llx ", m->ipid);
+ }
+
+ pr_cont("\n");
+ /*
+ * Note this output is parsed by external tools and old fields
+ * should not be changed.
+ */
+ pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
+ m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
+ m->microcode);
+}
+
+static void print_mce(struct mce *m)
+{
+ __print_mce(m);
+
+ if (m->cpuvendor != X86_VENDOR_AMD)
+ pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
+}
+
+#define PANIC_TIMEOUT 5 /* 5 seconds */
+
+static atomic_t mce_panicked;
+
+static int fake_panic;
+static atomic_t mce_fake_panicked;
+
+/* Panic in progress. Enable interrupts and wait for final IPI */
+static void wait_for_panic(void)
+{
+ long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
+
+ preempt_disable();
+ local_irq_enable();
+ while (timeout-- > 0)
+ udelay(1);
+ if (panic_timeout == 0)
+ panic_timeout = mca_cfg.panic_timeout;
+ panic("Panicing machine check CPU died");
+}
+
+static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
+{
+ struct llist_node *pending;
+ struct mce_evt_llist *l;
+ int apei_err = 0;
+
+ /*
+ * Allow instrumentation around external facilities usage. Not that it
+ * matters a whole lot since the machine is going to panic anyway.
+ */
+ instrumentation_begin();
+
+ if (!fake_panic) {
+ /*
+ * Make sure only one CPU runs in machine check panic
+ */
+ if (atomic_inc_return(&mce_panicked) > 1)
+ wait_for_panic();
+ barrier();
+
+ bust_spinlocks(1);
+ console_verbose();
+ } else {
+ /* Don't log too much for fake panic */
+ if (atomic_inc_return(&mce_fake_panicked) > 1)
+ goto out;
+ }
+ pending = mce_gen_pool_prepare_records();
+ /* First print corrected ones that are still unlogged */
+ llist_for_each_entry(l, pending, llnode) {
+ struct mce *m = &l->mce;
+ if (!(m->status & MCI_STATUS_UC)) {
+ print_mce(m);
+ if (!apei_err)
+ apei_err = apei_write_mce(m);
+ }
+ }
+ /* Now print uncorrected but with the final one last */
+ llist_for_each_entry(l, pending, llnode) {
+ struct mce *m = &l->mce;
+ if (!(m->status & MCI_STATUS_UC))
+ continue;
+ if (!final || mce_cmp(m, final)) {
+ print_mce(m);
+ if (!apei_err)
+ apei_err = apei_write_mce(m);
+ }
+ }
+ if (final) {
+ print_mce(final);
+ if (!apei_err)
+ apei_err = apei_write_mce(final);
+ }
+ if (cpu_missing)
+ pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
+ if (exp)
+ pr_emerg(HW_ERR "Machine check: %s\n", exp);
+ if (!fake_panic) {
+ if (panic_timeout == 0)
+ panic_timeout = mca_cfg.panic_timeout;
+ panic(msg);
+ } else
+ pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
+
+out:
+ instrumentation_end();
+}
+
+/* Support code for software error injection */
+
+static int msr_to_offset(u32 msr)
+{
+ unsigned bank = __this_cpu_read(injectm.bank);
+
+ if (msr == mca_cfg.rip_msr)
+ return offsetof(struct mce, ip);
+ if (msr == msr_ops.status(bank))
+ return offsetof(struct mce, status);
+ if (msr == msr_ops.addr(bank))
+ return offsetof(struct mce, addr);
+ if (msr == msr_ops.misc(bank))
+ return offsetof(struct mce, misc);
+ if (msr == MSR_IA32_MCG_STATUS)
+ return offsetof(struct mce, mcgstatus);
+ return -1;
+}
+
+/* MSR access wrappers used for error injection */
+static u64 mce_rdmsrl(u32 msr)
+{
+ u64 v;
+
+ if (__this_cpu_read(injectm.finished)) {
+ int offset = msr_to_offset(msr);
+
+ if (offset < 0)
+ return 0;
+ return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
+ }
+
+ if (rdmsrl_safe(msr, &v)) {
+ WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
+ /*
+ * Return zero in case the access faulted. This should
+ * not happen normally but can happen if the CPU does
+ * something weird, or if the code is buggy.
+ */
+ v = 0;
+ }
+
+ return v;
+}
+
+static void mce_wrmsrl(u32 msr, u64 v)
+{
+ if (__this_cpu_read(injectm.finished)) {
+ int offset = msr_to_offset(msr);
+
+ if (offset >= 0)
+ *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
+ return;
+ }
+ wrmsrl(msr, v);
+}
+
+/*
+ * Collect all global (w.r.t. this processor) status about this machine
+ * check into our "mce" struct so that we can use it later to assess
+ * the severity of the problem as we read per-bank specific details.
+ */
+static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
+{
+ mce_setup(m);
+
+ m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ if (regs) {
+ /*
+ * Get the address of the instruction at the time of
+ * the machine check error.
+ */
+ if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
+ m->ip = regs->ip;
+ m->cs = regs->cs;
+
+ /*
+ * When in VM86 mode make the cs look like ring 3
+ * always. This is a lie, but it's better than passing
+ * the additional vm86 bit around everywhere.
+ */
+ if (v8086_mode(regs))
+ m->cs |= 3;
+ }
+ /* Use accurate RIP reporting if available. */
+ if (mca_cfg.rip_msr)
+ m->ip = mce_rdmsrl(mca_cfg.rip_msr);
+ }
+}
+
+int mce_available(struct cpuinfo_x86 *c)
+{
+ if (mca_cfg.disabled)
+ return 0;
+ return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
+}
+
+static void mce_schedule_work(void)
+{
+ if (!mce_gen_pool_empty())
+ schedule_work(&mce_work);
+}
+
+static void mce_irq_work_cb(struct irq_work *entry)
+{
+ mce_schedule_work();
+}
+
+static void mce_report_event(struct pt_regs *regs)
+{
+ if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
+ mce_notify_irq();
+ /*
+ * Triggering the work queue here is just an insurance
+ * policy in case the syscall exit notify handler
+ * doesn't run soon enough or ends up running on the
+ * wrong CPU (can happen when audit sleeps)
+ */
+ mce_schedule_work();
+ return;
+ }
+
+ irq_work_queue(&mce_irq_work);
+}
+
+/*
+ * Check if the address reported by the CPU is in a format we can parse.
+ * It would be possible to add code for most other cases, but all would
+ * be somewhat complicated (e.g. segment offset would require an instruction
+ * parser). So only support physical addresses up to page granuality for now.
+ */
+int mce_usable_address(struct mce *m)
+{
+ if (!(m->status & MCI_STATUS_ADDRV))
+ return 0;
+
+ /* Checks after this one are Intel-specific: */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 1;
+
+ if (!(m->status & MCI_STATUS_MISCV))
+ return 0;
+
+ if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
+ return 0;
+
+ if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
+ return 0;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(mce_usable_address);
+
+bool mce_is_memory_error(struct mce *m)
+{
+ if (m->cpuvendor == X86_VENDOR_AMD) {
+ return amd_mce_is_memory_error(m);
+
+ } else if (m->cpuvendor == X86_VENDOR_INTEL) {
+ /*
+ * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
+ *
+ * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
+ * indicating a memory error. Bit 8 is used for indicating a
+ * cache hierarchy error. The combination of bit 2 and bit 3
+ * is used for indicating a `generic' cache hierarchy error
+ * But we can't just blindly check the above bits, because if
+ * bit 11 is set, then it is a bus/interconnect error - and
+ * either way the above bits just gives more detail on what
+ * bus/interconnect error happened. Note that bit 12 can be
+ * ignored, as it's the "filter" bit.
+ */
+ return (m->status & 0xef80) == BIT(7) ||
+ (m->status & 0xef00) == BIT(8) ||
+ (m->status & 0xeffc) == 0xc;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(mce_is_memory_error);
+
+static bool whole_page(struct mce *m)
+{
+ if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
+ return true;
+ return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
+}
+
+bool mce_is_correctable(struct mce *m)
+{
+ if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
+ return false;
+
+ if (m->status & MCI_STATUS_UC)
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(mce_is_correctable);
+
+static bool cec_add_mce(struct mce *m)
+{
+ if (!m)
+ return false;
+
+ /* We eat only correctable DRAM errors with usable addresses. */
+ if (mce_is_memory_error(m) &&
+ mce_is_correctable(m) &&
+ mce_usable_address(m))
+ if (!cec_add_elem(m->addr >> PAGE_SHIFT))
+ return true;
+
+ return false;
+}
+
+static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (!m)
+ return NOTIFY_DONE;
+
+ if (cec_add_mce(m))
+ return NOTIFY_STOP;
+
+ /* Emit the trace record: */
+ trace_mce_record(m);
+
+ set_bit(0, &mce_need_notify);
+
+ mce_notify_irq();
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block first_nb = {
+ .notifier_call = mce_first_notifier,
+ .priority = MCE_PRIO_FIRST,
+};
+
+static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *mce = (struct mce *)data;
+ unsigned long pfn;
+
+ if (!mce)
+ return NOTIFY_DONE;
+
+ if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
+ pfn = mce->addr >> PAGE_SHIFT;
+ if (!memory_failure(pfn, 0))
+ set_mce_nospec(pfn, whole_page(mce));
+ }
+
+ return NOTIFY_OK;
+}
+static struct notifier_block mce_srao_nb = {
+ .notifier_call = srao_decode_notifier,
+ .priority = MCE_PRIO_SRAO,
+};
+
+static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (!m)
+ return NOTIFY_DONE;
+
+ if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
+ return NOTIFY_DONE;
+
+ __print_mce(m);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block mce_default_nb = {
+ .notifier_call = mce_default_notifier,
+ /* lowest prio, we want it to run last. */
+ .priority = MCE_PRIO_LOWEST,
+};
+
+/*
+ * Read ADDR and MISC registers.
+ */
+static noinstr void mce_read_aux(struct mce *m, int i)
+{
+ if (m->status & MCI_STATUS_MISCV)
+ m->misc = mce_rdmsrl(msr_ops.misc(i));
+
+ if (m->status & MCI_STATUS_ADDRV) {
+ m->addr = mce_rdmsrl(msr_ops.addr(i));
+
+ /*
+ * Mask the reported address by the reported granularity.
+ */
+ if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
+ u8 shift = MCI_MISC_ADDR_LSB(m->misc);
+ m->addr >>= shift;
+ m->addr <<= shift;
+ }
+
+ /*
+ * Extract [55:<lsb>] where lsb is the least significant
+ * *valid* bit of the address bits.
+ */
+ if (mce_flags.smca) {
+ u8 lsb = (m->addr >> 56) & 0x3f;
+
+ m->addr &= GENMASK_ULL(55, lsb);
+ }
+ }
+
+ if (mce_flags.smca) {
+ m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
+
+ if (m->status & MCI_STATUS_SYNDV)
+ m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
+ }
+}
+
+DEFINE_PER_CPU(unsigned, mce_poll_count);
+
+/*
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ *
+ * Note: spec recommends to panic for fatal unsignalled
+ * errors here. However this would be quite problematic --
+ * we would need to reimplement the Monarch handling and
+ * it would mess up the exclusion between exception handler
+ * and poll hander -- * so we skip this for now.
+ * These cases should not happen anyways, or only when the CPU
+ * is already totally * confused. In this case it's likely it will
+ * not fully execute the machine check handler either.
+ */
+bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+{
+ bool error_seen = false;
+ struct mce m;
+ int i;
+
+ this_cpu_inc(mce_poll_count);
+
+ mce_gather_info(&m, NULL);
+
+ if (flags & MCP_TIMESTAMP)
+ m.tsc = rdtsc();
+
+ for (i = 0; i < mca_cfg.banks; i++) {
+ if (!mce_banks[i].ctl || !test_bit(i, *b))
+ continue;
+
+ m.misc = 0;
+ m.addr = 0;
+ m.bank = i;
+
+ barrier();
+ m.status = mce_rdmsrl(msr_ops.status(i));
+
+ /* If this entry is not valid, ignore it */
+ if (!(m.status & MCI_STATUS_VAL))
+ continue;
+
+ /*
+ * If we are logging everything (at CPU online) or this
+ * is a corrected error, then we must log it.
+ */
+ if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
+ goto log_it;
+
+ /*
+ * Newer Intel systems that support software error
+ * recovery need to make additional checks. Other
+ * CPUs should skip over uncorrected errors, but log
+ * everything else.
+ */
+ if (!mca_cfg.ser) {
+ if (m.status & MCI_STATUS_UC)
+ continue;
+ goto log_it;
+ }
+
+ /* Log "not enabled" (speculative) errors */
+ if (!(m.status & MCI_STATUS_EN))
+ goto log_it;
+
+ /*
+ * Log UCNA (SDM: 15.6.3 "UCR Error Classification")
+ * UC == 1 && PCC == 0 && S == 0
+ */
+ if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
+ goto log_it;
+
+ /*
+ * Skip anything else. Presumption is that our read of this
+ * bank is racing with a machine check. Leave the log alone
+ * for do_machine_check() to deal with it.
+ */
+ continue;
+
+log_it:
+ error_seen = true;
+
+ mce_read_aux(&m, i);
+
+ m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
+
+ /*
+ * Don't get the IP here because it's unlikely to
+ * have anything to do with the actual error location.
+ */
+ if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
+ mce_log(&m);
+ else if (mce_usable_address(&m)) {
+ /*
+ * Although we skipped logging this, we still want
+ * to take action. Add to the pool so the registered
+ * notifiers will see it.
+ */
+ if (!mce_gen_pool_add(&m))
+ mce_schedule_work();
+ }
+
+ /*
+ * Clear state for this bank.
+ */
+ mce_wrmsrl(msr_ops.status(i), 0);
+ }
+
+ /*
+ * Don't clear MCG_STATUS here because it's only defined for
+ * exceptions.
+ */
+
+ sync_core();
+
+ return error_seen;
+}
+EXPORT_SYMBOL_GPL(machine_check_poll);
+
+/*
+ * Do a quick check if any of the events requires a panic.
+ * This decides if we keep the events around or clear them.
+ */
+static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
+ struct pt_regs *regs)
+{
+ char *tmp;
+ int i;
+
+ for (i = 0; i < mca_cfg.banks; i++) {
+ m->status = mce_rdmsrl(msr_ops.status(i));
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+
+ __set_bit(i, validp);
+ if (quirk_no_way_out)
+ quirk_no_way_out(i, m, regs);
+
+ m->bank = i;
+ if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
+ mce_read_aux(m, i);
+ *msg = tmp;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Variable to establish order between CPUs while scanning.
+ * Each CPU spins initially until executing is equal its number.
+ */
+static atomic_t mce_executing;
+
+/*
+ * Defines order of CPUs on entry. First CPU becomes Monarch.
+ */
+static atomic_t mce_callin;
+
+/*
+ * Check if a timeout waiting for other CPUs happened.
+ */
+static int mce_timed_out(u64 *t, const char *msg)
+{
+ /*
+ * The others already did panic for some reason.
+ * Bail out like in a timeout.
+ * rmb() to tell the compiler that system_state
+ * might have been modified by someone else.
+ */
+ rmb();
+ if (atomic_read(&mce_panicked))
+ wait_for_panic();
+ if (!mca_cfg.monarch_timeout)
+ goto out;
+ if ((s64)*t < SPINUNIT) {
+ if (mca_cfg.tolerant <= 1)
+ mce_panic(msg, NULL, NULL);
+ cpu_missing = 1;
+ return 1;
+ }
+ *t -= SPINUNIT;
+out:
+ touch_nmi_watchdog();
+ return 0;
+}
+
+/*
+ * The Monarch's reign. The Monarch is the CPU who entered
+ * the machine check handler first. It waits for the others to
+ * raise the exception too and then grades them. When any
+ * error is fatal panic. Only then let the others continue.
+ *
+ * The other CPUs entering the MCE handler will be controlled by the
+ * Monarch. They are called Subjects.
+ *
+ * This way we prevent any potential data corruption in a unrecoverable case
+ * and also makes sure always all CPU's errors are examined.
+ *
+ * Also this detects the case of a machine check event coming from outer
+ * space (not detected by any CPUs) In this case some external agent wants
+ * us to shut down, so panic too.
+ *
+ * The other CPUs might still decide to panic if the handler happens
+ * in a unrecoverable place, but in this case the system is in a semi-stable
+ * state and won't corrupt anything by itself. It's ok to let the others
+ * continue for a bit first.
+ *
+ * All the spin loops have timeouts; when a timeout happens a CPU
+ * typically elects itself to be Monarch.
+ */
+static void mce_reign(void)
+{
+ int cpu;
+ struct mce *m = NULL;
+ int global_worst = 0;
+ char *msg = NULL;
+ char *nmsg = NULL;
+
+ /*
+ * This CPU is the Monarch and the other CPUs have run
+ * through their handlers.
+ * Grade the severity of the errors of all the CPUs.
+ */
+ for_each_possible_cpu(cpu) {
+ int severity = mce_severity(&per_cpu(mces_seen, cpu),
+ mca_cfg.tolerant,
+ &nmsg, true);
+ if (severity > global_worst) {
+ msg = nmsg;
+ global_worst = severity;
+ m = &per_cpu(mces_seen, cpu);
+ }
+ }
+
+ /*
+ * Cannot recover? Panic here then.
+ * This dumps all the mces in the log buffer and stops the
+ * other CPUs.
+ */
+ if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
+ mce_panic("Fatal machine check", m, msg);
+
+ /*
+ * For UC somewhere we let the CPU who detects it handle it.
+ * Also must let continue the others, otherwise the handling
+ * CPU could deadlock on a lock.
+ */
+
+ /*
+ * No machine check event found. Must be some external
+ * source or one CPU is hung. Panic.
+ */
+ if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
+ mce_panic("Fatal machine check from unknown source", NULL, NULL);
+
+ /*
+ * Now clear all the mces_seen so that they don't reappear on
+ * the next mce.
+ */
+ for_each_possible_cpu(cpu)
+ memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
+}
+
+static atomic_t global_nwo;
+
+/*
+ * Start of Monarch synchronization. This waits until all CPUs have
+ * entered the exception handler and then determines if any of them
+ * saw a fatal event that requires panic. Then it executes them
+ * in the entry order.
+ * TBD double check parallel CPU hotunplug
+ */
+static int mce_start(int *no_way_out)
+{
+ int order;
+ int cpus = num_online_cpus();
+ u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+
+ if (!timeout)
+ return -1;
+
+ atomic_add(*no_way_out, &global_nwo);
+ /*
+ * Rely on the implied barrier below, such that global_nwo
+ * is updated before mce_callin.
+ */
+ order = atomic_inc_return(&mce_callin);
+
+ /*
+ * Wait for everyone.
+ */
+ while (atomic_read(&mce_callin) != cpus) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Not all CPUs entered broadcast exception handler")) {
+ atomic_set(&global_nwo, 0);
+ return -1;
+ }
+ ndelay(SPINUNIT);
+ }
+
+ /*
+ * mce_callin should be read before global_nwo
+ */
+ smp_rmb();
+
+ if (order == 1) {
+ /*
+ * Monarch: Starts executing now, the others wait.
+ */
+ atomic_set(&mce_executing, 1);
+ } else {
+ /*
+ * Subject: Now start the scanning loop one by one in
+ * the original callin order.
+ * This way when there are any shared banks it will be
+ * only seen by one CPU before cleared, avoiding duplicates.
+ */
+ while (atomic_read(&mce_executing) < order) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Subject CPUs unable to finish machine check processing")) {
+ atomic_set(&global_nwo, 0);
+ return -1;
+ }
+ ndelay(SPINUNIT);
+ }
+ }
+
+ /*
+ * Cache the global no_way_out state.
+ */
+ *no_way_out = atomic_read(&global_nwo);
+
+ return order;
+}
+
+/*
+ * Synchronize between CPUs after main scanning loop.
+ * This invokes the bulk of the Monarch processing.
+ */
+static noinstr int mce_end(int order)
+{
+ u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+ int ret = -1;
+
+ /* Allow instrumentation around external facilities. */
+ instrumentation_begin();
+
+ if (!timeout)
+ goto reset;
+ if (order < 0)
+ goto reset;
+
+ /*
+ * Allow others to run.
+ */
+ atomic_inc(&mce_executing);
+
+ if (order == 1) {
+ /* CHECKME: Can this race with a parallel hotplug? */
+ int cpus = num_online_cpus();
+
+ /*
+ * Monarch: Wait for everyone to go through their scanning
+ * loops.
+ */
+ while (atomic_read(&mce_executing) <= cpus) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Monarch CPU unable to finish machine check processing"))
+ goto reset;
+ ndelay(SPINUNIT);
+ }
+
+ mce_reign();
+ barrier();
+ ret = 0;
+ } else {
+ /*
+ * Subject: Wait for Monarch to finish.
+ */
+ while (atomic_read(&mce_executing) != 0) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Monarch CPU did not finish machine check processing"))
+ goto reset;
+ ndelay(SPINUNIT);
+ }
+
+ /*
+ * Don't reset anything. That's done by the Monarch.
+ */
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * Reset all global state.
+ */
+reset:
+ atomic_set(&global_nwo, 0);
+ atomic_set(&mce_callin, 0);
+ barrier();
+
+ /*
+ * Let others run again.
+ */
+ atomic_set(&mce_executing, 0);
+
+out:
+ instrumentation_end();
+
+ return ret;
+}
+
+static void mce_clear_state(unsigned long *toclear)
+{
+ int i;
+
+ for (i = 0; i < mca_cfg.banks; i++) {
+ if (test_bit(i, toclear))
+ mce_wrmsrl(msr_ops.status(i), 0);
+ }
+}
+
+static int do_memory_failure(struct mce *m)
+{
+ int flags = MF_ACTION_REQUIRED;
+ int ret;
+
+ pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
+ if (!(m->mcgstatus & MCG_STATUS_RIPV))
+ flags |= MF_MUST_KILL;
+ ret = memory_failure(m->addr >> PAGE_SHIFT, flags);
+ if (ret)
+ pr_err("Memory error not recovered");
+ else
+ set_mce_nospec(m->addr >> PAGE_SHIFT, whole_page(m));
+ return ret;
+}
+
+
+/*
+ * Cases where we avoid rendezvous handler timeout:
+ * 1) If this CPU is offline.
+ *
+ * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
+ * skip those CPUs which remain looping in the 1st kernel - see
+ * crash_nmi_callback().
+ *
+ * Note: there still is a small window between kexec-ing and the new,
+ * kdump kernel establishing a new #MC handler where a broadcasted MCE
+ * might not get handled properly.
+ */
+static bool __mc_check_crashing_cpu(int cpu)
+{
+ if (cpu_is_offline(cpu) ||
+ (crashing_cpu != -1 && crashing_cpu != cpu)) {
+ u64 mcgstatus;
+
+ mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ if (mcgstatus & MCG_STATUS_RIPV) {
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ return true;
+ }
+ }
+ return false;
+}
+
+static void __mc_scan_banks(struct mce *m, struct mce *final,
+ unsigned long *toclear, unsigned long *valid_banks,
+ int no_way_out, int *worst)
+{
+ struct mca_config *cfg = &mca_cfg;
+ int severity, i;
+
+ for (i = 0; i < cfg->banks; i++) {
+ __clear_bit(i, toclear);
+ if (!test_bit(i, valid_banks))
+ continue;
+
+ if (!mce_banks[i].ctl)
+ continue;
+
+ m->misc = 0;
+ m->addr = 0;
+ m->bank = i;
+
+ m->status = mce_rdmsrl(msr_ops.status(i));
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+
+ /*
+ * Corrected or non-signaled errors are handled by
+ * machine_check_poll(). Leave them alone, unless this panics.
+ */
+ if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
+ !no_way_out)
+ continue;
+
+ /* Set taint even when machine check was not enabled. */
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+ severity = mce_severity(m, cfg->tolerant, NULL, true);
+
+ /*
+ * When machine check was for corrected/deferred handler don't
+ * touch, unless we're panicking.
+ */
+ if ((severity == MCE_KEEP_SEVERITY ||
+ severity == MCE_UCNA_SEVERITY) && !no_way_out)
+ continue;
+
+ __set_bit(i, toclear);
+
+ /* Machine check event was not enabled. Clear, but ignore. */
+ if (severity == MCE_NO_SEVERITY)
+ continue;
+
+ mce_read_aux(m, i);
+
+ /* assuming valid severity level != 0 */
+ m->severity = severity;
+
+ mce_log(m);
+
+ if (severity > *worst) {
+ *final = *m;
+ *worst = severity;
+ }
+ }
+
+ /* mce_clear_state will clear *final, save locally for use later */
+ *m = *final;
+}
+
+/*
+ * The actual machine check handler. This only handles real
+ * exceptions when something got corrupted coming in through int 18.
+ *
+ * This is executed in NMI context not subject to normal locking rules. This
+ * implies that most kernel services cannot be safely used. Don't even
+ * think about putting a printk in there!
+ *
+ * On Intel systems this is entered on all CPUs in parallel through
+ * MCE broadcast. However some CPUs might be broken beyond repair,
+ * so be always careful when synchronizing with others.
+ */
+void do_machine_check(struct pt_regs *regs, long error_code)
+{
+ DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
+ DECLARE_BITMAP(toclear, MAX_NR_BANKS);
+ struct mca_config *cfg = &mca_cfg;
+ int cpu = smp_processor_id();
+ char *msg = "Unknown";
+ struct mce m, *final;
+ int worst = 0;
+
+ /*
+ * Establish sequential order between the CPUs entering the machine
+ * check handler.
+ */
+ int order = -1;
+
+ /*
+ * If no_way_out gets set, there is no safe way to recover from this
+ * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
+ */
+ int no_way_out = 0;
+
+ /*
+ * If kill_it gets set, there might be a way to recover from this
+ * error.
+ */
+ int kill_it = 0;
+
+ /*
+ * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
+ * on Intel.
+ */
+ int lmce = 1;
+
+ if (__mc_check_crashing_cpu(cpu))
+ return;
+
+ ist_enter(regs);
+
+ this_cpu_inc(mce_exception_count);
+
+ mce_gather_info(&m, regs);
+ m.tsc = rdtsc();
+
+ final = this_cpu_ptr(&mces_seen);
+ *final = m;
+
+ memset(valid_banks, 0, sizeof(valid_banks));
+ no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
+
+ barrier();
+
+ /*
+ * When no restart IP might need to kill or panic.
+ * Assume the worst for now, but if we find the
+ * severity is MCE_AR_SEVERITY we have other options.
+ */
+ if (!(m.mcgstatus & MCG_STATUS_RIPV))
+ kill_it = 1;
+
+ /*
+ * Check if this MCE is signaled to only this logical processor,
+ * on Intel only.
+ */
+ if (m.cpuvendor == X86_VENDOR_INTEL)
+ lmce = m.mcgstatus & MCG_STATUS_LMCES;
+
+ /*
+ * Local machine check may already know that we have to panic.
+ * Broadcast machine check begins rendezvous in mce_start()
+ * Go through all banks in exclusion of the other CPUs. This way we
+ * don't report duplicated events on shared banks because the first one
+ * to see it will clear it.
+ */
+ if (lmce) {
+ if (no_way_out)
+ mce_panic("Fatal local machine check", &m, msg);
+ } else {
+ order = mce_start(&no_way_out);
+ }
+
+ __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst);
+
+ if (!no_way_out)
+ mce_clear_state(toclear);
+
+ /*
+ * Do most of the synchronization with other CPUs.
+ * When there's any problem use only local no_way_out state.
+ */
+ if (!lmce) {
+ if (mce_end(order) < 0)
+ no_way_out = worst >= MCE_PANIC_SEVERITY;
+ } else {
+ /*
+ * If there was a fatal machine check we should have
+ * already called mce_panic earlier in this function.
+ * Since we re-read the banks, we might have found
+ * something new. Check again to see if we found a
+ * fatal error. We call "mce_severity()" again to
+ * make sure we have the right "msg".
+ */
+ if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
+ mce_severity(&m, cfg->tolerant, &msg, true);
+ mce_panic("Local fatal machine check!", &m, msg);
+ }
+ }
+
+ /*
+ * If tolerant is at an insane level we drop requests to kill
+ * processes and continue even when there is no way out.
+ */
+ if (cfg->tolerant == 3)
+ kill_it = 0;
+ else if (no_way_out)
+ mce_panic("Fatal machine check on current CPU", &m, msg);
+
+ if (worst > 0)
+ mce_report_event(regs);
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+
+ sync_core();
+
+ if (worst != MCE_AR_SEVERITY && !kill_it)
+ goto out_ist;
+
+ /* Fault was in user mode and we need to take some action */
+ if ((m.cs & 3) == 3) {
+ ist_begin_non_atomic(regs);
+ local_irq_enable();
+
+ if (kill_it || do_memory_failure(&m))
+ force_sig(SIGBUS, current);
+ local_irq_disable();
+ ist_end_non_atomic();
+ } else {
+ if (!fixup_exception(regs, X86_TRAP_MC))
+ mce_panic("Failed kernel mode recovery", &m, NULL);
+ }
+
+out_ist:
+ ist_exit(regs);
+}
+EXPORT_SYMBOL_GPL(do_machine_check);
+
+#ifndef CONFIG_MEMORY_FAILURE
+int memory_failure(unsigned long pfn, int flags)
+{
+ /* mce_severity() should not hand us an ACTION_REQUIRED error */
+ BUG_ON(flags & MF_ACTION_REQUIRED);
+ pr_err("Uncorrected memory error in page 0x%lx ignored\n"
+ "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
+ pfn);
+
+ return 0;
+}
+#endif
+
+/*
+ * Periodic polling timer for "silent" machine check errors. If the
+ * poller finds an MCE, poll 2x faster. When the poller finds no more
+ * errors, poll 2x slower (up to check_interval seconds).
+ */
+static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
+
+static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
+
+static unsigned long mce_adjust_timer_default(unsigned long interval)
+{
+ return interval;
+}
+
+static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
+
+static void __start_timer(struct timer_list *t, unsigned long interval)
+{
+ unsigned long when = jiffies + interval;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ if (!timer_pending(t) || time_before(when, t->expires))
+ mod_timer(t, round_jiffies(when));
+
+ local_irq_restore(flags);
+}
+
+static void mce_timer_fn(struct timer_list *t)
+{
+ struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
+ unsigned long iv;
+
+ WARN_ON(cpu_t != t);
+
+ iv = __this_cpu_read(mce_next_interval);
+
+ if (mce_available(this_cpu_ptr(&cpu_info))) {
+ machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
+
+ if (mce_intel_cmci_poll()) {
+ iv = mce_adjust_timer(iv);
+ goto done;
+ }
+ }
+
+ /*
+ * Alert userspace if needed. If we logged an MCE, reduce the polling
+ * interval, otherwise increase the polling interval.
+ */
+ if (mce_notify_irq())
+ iv = max(iv / 2, (unsigned long) HZ/100);
+ else
+ iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+
+done:
+ __this_cpu_write(mce_next_interval, iv);
+ __start_timer(t, iv);
+}
+
+/*
+ * Ensure that the timer is firing in @interval from now.
+ */
+void mce_timer_kick(unsigned long interval)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+ unsigned long iv = __this_cpu_read(mce_next_interval);
+
+ __start_timer(t, interval);
+
+ if (interval < iv)
+ __this_cpu_write(mce_next_interval, interval);
+}
+
+/* Must not be called in IRQ context where del_timer_sync() can deadlock */
+static void mce_timer_delete_all(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ del_timer_sync(&per_cpu(mce_timer, cpu));
+}
+
+/*
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
+ */
+int mce_notify_irq(void)
+{
+ /* Not more than two messages every minute */
+ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
+ if (test_and_clear_bit(0, &mce_need_notify)) {
+ mce_work_trigger();
+
+ if (__ratelimit(&ratelimit))
+ pr_info(HW_ERR "Machine check events logged\n");
+
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mce_notify_irq);
+
+static int __mcheck_cpu_mce_banks_init(void)
+{
+ int i;
+
+ mce_banks = kcalloc(MAX_NR_BANKS, sizeof(struct mce_bank), GFP_KERNEL);
+ if (!mce_banks)
+ return -ENOMEM;
+
+ for (i = 0; i < MAX_NR_BANKS; i++) {
+ struct mce_bank *b = &mce_banks[i];
+
+ b->ctl = -1ULL;
+ b->init = 1;
+ }
+ return 0;
+}
+
+/*
+ * Initialize Machine Checks for a CPU.
+ */
+static int __mcheck_cpu_cap_init(void)
+{
+ u64 cap;
+ u8 b;
+
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
+
+ b = cap & MCG_BANKCNT_MASK;
+ if (WARN_ON_ONCE(b > MAX_NR_BANKS))
+ b = MAX_NR_BANKS;
+
+ mca_cfg.banks = max(mca_cfg.banks, b);
+
+ if (!mce_banks) {
+ int err = __mcheck_cpu_mce_banks_init();
+ if (err)
+ return err;
+ }
+
+ /* Use accurate RIP reporting if available. */
+ if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
+ mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
+
+ if (cap & MCG_SER_P)
+ mca_cfg.ser = 1;
+
+ return 0;
+}
+
+static void __mcheck_cpu_init_generic(void)
+{
+ enum mcp_flags m_fl = 0;
+ mce_banks_t all_banks;
+ u64 cap;
+
+ if (!mca_cfg.bootlog)
+ m_fl = MCP_DONTLOG;
+
+ /*
+ * Log the machine checks left over from the previous reset.
+ */
+ bitmap_fill(all_banks, MAX_NR_BANKS);
+ machine_check_poll(MCP_UC | m_fl, &all_banks);
+
+ cr4_set_bits(X86_CR4_MCE);
+
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
+ if (cap & MCG_CTL_P)
+ wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+}
+
+static void __mcheck_cpu_init_clear_banks(void)
+{
+ int i;
+
+ for (i = 0; i < mca_cfg.banks; i++) {
+ struct mce_bank *b = &mce_banks[i];
+
+ if (!b->init)
+ continue;
+ wrmsrl(msr_ops.ctl(i), b->ctl);
+ wrmsrl(msr_ops.status(i), 0);
+ }
+}
+
+/*
+ * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
+ * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
+ * Vol 3B Table 15-20). But this confuses both the code that determines
+ * whether the machine check occurred in kernel or user mode, and also
+ * the severity assessment code. Pretend that EIPV was set, and take the
+ * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
+ */
+static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+ if (bank != 0)
+ return;
+ if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
+ return;
+ if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
+ MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
+ MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
+ MCACOD)) !=
+ (MCI_STATUS_UC|MCI_STATUS_EN|
+ MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
+ MCI_STATUS_AR|MCACOD_INSTR))
+ return;
+
+ m->mcgstatus |= MCG_STATUS_EIPV;
+ m->ip = regs->ip;
+ m->cs = regs->cs;
+}
+
+/* Add per CPU specific workarounds here */
+static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
+{
+ struct mca_config *cfg = &mca_cfg;
+
+ if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
+ pr_info("unknown CPU type - not enabling MCE support\n");
+ return -EOPNOTSUPP;
+ }
+
+ /* This should be disabled by the BIOS, but isn't always */
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ if (c->x86 == 15 && cfg->banks > 4) {
+ /*
+ * disable GART TBL walk error reporting, which
+ * trips off incorrectly with the IOMMU & 3ware
+ * & Cerberus:
+ */
+ clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
+ }
+ if (c->x86 < 0x11 && cfg->bootlog < 0) {
+ /*
+ * Lots of broken BIOS around that don't clear them
+ * by default and leave crap in there. Don't log:
+ */
+ cfg->bootlog = 0;
+ }
+ /*
+ * Various K7s with broken bank 0 around. Always disable
+ * by default.
+ */
+ if (c->x86 == 6 && cfg->banks > 0)
+ mce_banks[0].ctl = 0;
+
+ /*
+ * overflow_recov is supported for F15h Models 00h-0fh
+ * even though we don't have a CPUID bit for it.
+ */
+ if (c->x86 == 0x15 && c->x86_model <= 0xf)
+ mce_flags.overflow_recov = 1;
+
+ }
+
+ if (c->x86_vendor == X86_VENDOR_INTEL) {
+ /*
+ * SDM documents that on family 6 bank 0 should not be written
+ * because it aliases to another special BIOS controlled
+ * register.
+ * But it's not aliased anymore on model 0x1a+
+ * Don't ignore bank 0 completely because there could be a
+ * valid event later, merely don't write CTL0.
+ */
+
+ if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
+ mce_banks[0].init = 0;
+
+ /*
+ * All newer Intel systems support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
+ cfg->monarch_timeout < 0)
+ cfg->monarch_timeout = USEC_PER_SEC;
+
+ /*
+ * There are also broken BIOSes on some Pentium M and
+ * earlier systems:
+ */
+ if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
+ cfg->bootlog = 0;
+
+ if (c->x86 == 6 && c->x86_model == 45)
+ quirk_no_way_out = quirk_sandybridge_ifu;
+ }
+ if (cfg->monarch_timeout < 0)
+ cfg->monarch_timeout = 0;
+ if (cfg->bootlog != 0)
+ cfg->panic_timeout = 30;
+
+ return 0;
+}
+
+static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
+{
+ if (c->x86 != 5)
+ return 0;
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ intel_p5_mcheck_init(c);
+ return 1;
+ break;
+ case X86_VENDOR_CENTAUR:
+ winchip_mcheck_init(c);
+ return 1;
+ break;
+ default:
+ return 0;
+ }
+
+ return 0;
+}
+
+/*
+ * Init basic CPU features needed for early decoding of MCEs.
+ */
+static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
+{
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
+ mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
+ mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
+
+ if (mce_flags.smca) {
+ msr_ops.ctl = smca_ctl_reg;
+ msr_ops.status = smca_status_reg;
+ msr_ops.addr = smca_addr_reg;
+ msr_ops.misc = smca_misc_reg;
+ }
+ }
+}
+
+static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
+{
+ struct mca_config *cfg = &mca_cfg;
+
+ /*
+ * All newer Centaur CPUs support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) ||
+ c->x86 > 6) {
+ if (cfg->monarch_timeout < 0)
+ cfg->monarch_timeout = USEC_PER_SEC;
+ }
+}
+
+static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_feature_init(c);
+ mce_adjust_timer = cmci_intel_adjust_timer;
+ break;
+
+ case X86_VENDOR_AMD: {
+ mce_amd_feature_init(c);
+ break;
+ }
+ case X86_VENDOR_CENTAUR:
+ mce_centaur_feature_init(c);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_feature_clear(c);
+ break;
+ default:
+ break;
+ }
+}
+
+static void mce_start_timer(struct timer_list *t)
+{
+ unsigned long iv = check_interval * HZ;
+
+ if (mca_cfg.ignore_ce || !iv)
+ return;
+
+ this_cpu_write(mce_next_interval, iv);
+ __start_timer(t, iv);
+}
+
+static void __mcheck_cpu_setup_timer(void)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ timer_setup(t, mce_timer_fn, TIMER_PINNED);
+}
+
+static void __mcheck_cpu_init_timer(void)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ timer_setup(t, mce_timer_fn, TIMER_PINNED);
+ mce_start_timer(t);
+}
+
+/* Handle unconfigured int18 (should never happen) */
+static void unexpected_machine_check(struct pt_regs *regs, long error_code)
+{
+ pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
+ smp_processor_id());
+}
+
+/* Call the installed machine check handler for this CPU setup. */
+void (*machine_check_vector)(struct pt_regs *, long error_code) =
+ unexpected_machine_check;
+
+dotraplinkage void do_mce(struct pt_regs *regs, long error_code)
+{
+ machine_check_vector(regs, error_code);
+}
+
+/*
+ * Called for each booted CPU to set up machine checks.
+ * Must be called with preempt off:
+ */
+void mcheck_cpu_init(struct cpuinfo_x86 *c)
+{
+ if (mca_cfg.disabled)
+ return;
+
+ if (__mcheck_cpu_ancient_init(c))
+ return;
+
+ if (!mce_available(c))
+ return;
+
+ if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
+ mca_cfg.disabled = 1;
+ return;
+ }
+
+ if (mce_gen_pool_init()) {
+ mca_cfg.disabled = 1;
+ pr_emerg("Couldn't allocate MCE records pool!\n");
+ return;
+ }
+
+ machine_check_vector = do_machine_check;
+
+ __mcheck_cpu_init_early(c);
+ __mcheck_cpu_init_generic();
+ __mcheck_cpu_init_vendor(c);
+ __mcheck_cpu_init_clear_banks();
+ __mcheck_cpu_setup_timer();
+}
+
+/*
+ * Called for each booted CPU to clear some machine checks opt-ins
+ */
+void mcheck_cpu_clear(struct cpuinfo_x86 *c)
+{
+ if (mca_cfg.disabled)
+ return;
+
+ if (!mce_available(c))
+ return;
+
+ /*
+ * Possibly to clear general settings generic to x86
+ * __mcheck_cpu_clear_generic(c);
+ */
+ __mcheck_cpu_clear_vendor(c);
+
+}
+
+static void __mce_disable_bank(void *arg)
+{
+ int bank = *((int *)arg);
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ cmci_disable_bank(bank);
+}
+
+void mce_disable_bank(int bank)
+{
+ if (bank >= mca_cfg.banks) {
+ pr_warn(FW_BUG
+ "Ignoring request to disable invalid MCA bank %d.\n",
+ bank);
+ return;
+ }
+ set_bit(bank, mce_banks_ce_disabled);
+ on_each_cpu(__mce_disable_bank, &bank, 1);
+}
+
+/*
+ * mce=off Disables machine check
+ * mce=no_cmci Disables CMCI
+ * mce=no_lmce Disables LMCE
+ * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
+ * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
+ * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
+ * monarchtimeout is how long to wait for other CPUs on machine
+ * check, or 0 to not wait
+ * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
+ and older.
+ * mce=nobootlog Don't log MCEs from before booting.
+ * mce=bios_cmci_threshold Don't program the CMCI threshold
+ * mce=recovery force enable memcpy_mcsafe()
+ */
+static int __init mcheck_enable(char *str)
+{
+ struct mca_config *cfg = &mca_cfg;
+
+ if (*str == 0) {
+ enable_p5_mce();
+ return 1;
+ }
+ if (*str == '=')
+ str++;
+ if (!strcmp(str, "off"))
+ cfg->disabled = 1;
+ else if (!strcmp(str, "no_cmci"))
+ cfg->cmci_disabled = true;
+ else if (!strcmp(str, "no_lmce"))
+ cfg->lmce_disabled = 1;
+ else if (!strcmp(str, "dont_log_ce"))
+ cfg->dont_log_ce = true;
+ else if (!strcmp(str, "ignore_ce"))
+ cfg->ignore_ce = true;
+ else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
+ cfg->bootlog = (str[0] == 'b');
+ else if (!strcmp(str, "bios_cmci_threshold"))
+ cfg->bios_cmci_threshold = 1;
+ else if (!strcmp(str, "recovery"))
+ cfg->recovery = 1;
+ else if (isdigit(str[0])) {
+ if (get_option(&str, &cfg->tolerant) == 2)
+ get_option(&str, &(cfg->monarch_timeout));
+ } else {
+ pr_info("mce argument %s ignored. Please use /sys\n", str);
+ return 0;
+ }
+ return 1;
+}
+__setup("mce", mcheck_enable);
+
+int __init mcheck_init(void)
+{
+ mcheck_intel_therm_init();
+ mce_register_decode_chain(&first_nb);
+ mce_register_decode_chain(&mce_srao_nb);
+ mce_register_decode_chain(&mce_default_nb);
+ mcheck_vendor_init_severity();
+
+ INIT_WORK(&mce_work, mce_gen_pool_process);
+ init_irq_work(&mce_irq_work, mce_irq_work_cb);
+
+ return 0;
+}
+
+/*
+ * mce_syscore: PM support
+ */
+
+/*
+ * Disable machine checks on suspend and shutdown. We can't really handle
+ * them later.
+ */
+static void mce_disable_error_reporting(void)
+{
+ int i;
+
+ for (i = 0; i < mca_cfg.banks; i++) {
+ struct mce_bank *b = &mce_banks[i];
+
+ if (b->init)
+ wrmsrl(msr_ops.ctl(i), 0);
+ }
+ return;
+}
+
+static void vendor_disable_error_reporting(void)
+{
+ /*
+ * Don't clear on Intel or AMD CPUs. Some of these MSRs are socket-wide.
+ * Disabling them for just a single offlined CPU is bad, since it will
+ * inhibit reporting for all shared resources on the socket like the
+ * last level cache (LLC), the integrated memory controller (iMC), etc.
+ */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return;
+
+ mce_disable_error_reporting();
+}
+
+static int mce_syscore_suspend(void)
+{
+ vendor_disable_error_reporting();
+ return 0;
+}
+
+static void mce_syscore_shutdown(void)
+{
+ vendor_disable_error_reporting();
+}
+
+/*
+ * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
+ * Only one CPU is active at this time, the others get re-added later using
+ * CPU hotplug:
+ */
+static void mce_syscore_resume(void)
+{
+ __mcheck_cpu_init_generic();
+ __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
+ __mcheck_cpu_init_clear_banks();
+}
+
+static struct syscore_ops mce_syscore_ops = {
+ .suspend = mce_syscore_suspend,
+ .shutdown = mce_syscore_shutdown,
+ .resume = mce_syscore_resume,
+};
+
+/*
+ * mce_device: Sysfs support
+ */
+
+static void mce_cpu_restart(void *data)
+{
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ return;
+ __mcheck_cpu_init_generic();
+ __mcheck_cpu_init_clear_banks();
+ __mcheck_cpu_init_timer();
+}
+
+/* Reinit MCEs after user configuration changes */
+static void mce_restart(void)
+{
+ mce_timer_delete_all();
+ on_each_cpu(mce_cpu_restart, NULL, 1);
+}
+
+/* Toggle features for corrected errors */
+static void mce_disable_cmci(void *data)
+{
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ return;
+ cmci_clear();
+}
+
+static void mce_enable_ce(void *all)
+{
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ return;
+ cmci_reenable();
+ cmci_recheck();
+ if (all)
+ __mcheck_cpu_init_timer();
+}
+
+static struct bus_type mce_subsys = {
+ .name = "machinecheck",
+ .dev_name = "machinecheck",
+};
+
+DEFINE_PER_CPU(struct device *, mce_device);
+
+static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
+{
+ return container_of(attr, struct mce_bank, attr);
+}
+
+static ssize_t show_bank(struct device *s, struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
+}
+
+static ssize_t set_bank(struct device *s, struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ u64 new;
+
+ if (kstrtou64(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ attr_to_bank(attr)->ctl = new;
+ mce_restart();
+
+ return size;
+}
+
+static ssize_t set_ignore_ce(struct device *s,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ u64 new;
+
+ if (kstrtou64(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ mutex_lock(&mce_sysfs_mutex);
+ if (mca_cfg.ignore_ce ^ !!new) {
+ if (new) {
+ /* disable ce features */
+ mce_timer_delete_all();
+ on_each_cpu(mce_disable_cmci, NULL, 1);
+ mca_cfg.ignore_ce = true;
+ } else {
+ /* enable ce features */
+ mca_cfg.ignore_ce = false;
+ on_each_cpu(mce_enable_ce, (void *)1, 1);
+ }
+ }
+ mutex_unlock(&mce_sysfs_mutex);
+
+ return size;
+}
+
+static ssize_t set_cmci_disabled(struct device *s,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ u64 new;
+
+ if (kstrtou64(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ mutex_lock(&mce_sysfs_mutex);
+ if (mca_cfg.cmci_disabled ^ !!new) {
+ if (new) {
+ /* disable cmci */
+ on_each_cpu(mce_disable_cmci, NULL, 1);
+ mca_cfg.cmci_disabled = true;
+ } else {
+ /* enable cmci */
+ mca_cfg.cmci_disabled = false;
+ on_each_cpu(mce_enable_ce, NULL, 1);
+ }
+ }
+ mutex_unlock(&mce_sysfs_mutex);
+
+ return size;
+}
+
+static ssize_t store_int_with_restart(struct device *s,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ unsigned long old_check_interval = check_interval;
+ ssize_t ret = device_store_ulong(s, attr, buf, size);
+
+ if (check_interval == old_check_interval)
+ return ret;
+
+ mutex_lock(&mce_sysfs_mutex);
+ mce_restart();
+ mutex_unlock(&mce_sysfs_mutex);
+
+ return ret;
+}
+
+static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
+static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
+static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
+
+static struct dev_ext_attribute dev_attr_check_interval = {
+ __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
+ &check_interval
+};
+
+static struct dev_ext_attribute dev_attr_ignore_ce = {
+ __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
+ &mca_cfg.ignore_ce
+};
+
+static struct dev_ext_attribute dev_attr_cmci_disabled = {
+ __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
+ &mca_cfg.cmci_disabled
+};
+
+static struct device_attribute *mce_device_attrs[] = {
+ &dev_attr_tolerant.attr,
+ &dev_attr_check_interval.attr,
+#ifdef CONFIG_X86_MCELOG_LEGACY
+ &dev_attr_trigger,
+#endif
+ &dev_attr_monarch_timeout.attr,
+ &dev_attr_dont_log_ce.attr,
+ &dev_attr_ignore_ce.attr,
+ &dev_attr_cmci_disabled.attr,
+ NULL
+};
+
+static cpumask_var_t mce_device_initialized;
+
+static void mce_device_release(struct device *dev)
+{
+ kfree(dev);
+}
+
+/* Per cpu device init. All of the cpus still share the same ctrl bank: */
+static int mce_device_create(unsigned int cpu)
+{
+ struct device *dev;
+ int err;
+ int i, j;
+
+ if (!mce_available(&boot_cpu_data))
+ return -EIO;
+
+ dev = per_cpu(mce_device, cpu);
+ if (dev)
+ return 0;
+
+ dev = kzalloc(sizeof *dev, GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+ dev->id = cpu;
+ dev->bus = &mce_subsys;
+ dev->release = &mce_device_release;
+
+ err = device_register(dev);
+ if (err) {
+ put_device(dev);
+ return err;
+ }
+
+ for (i = 0; mce_device_attrs[i]; i++) {
+ err = device_create_file(dev, mce_device_attrs[i]);
+ if (err)
+ goto error;
+ }
+ for (j = 0; j < mca_cfg.banks; j++) {
+ err = device_create_file(dev, &mce_banks[j].attr);
+ if (err)
+ goto error2;
+ }
+ cpumask_set_cpu(cpu, mce_device_initialized);
+ per_cpu(mce_device, cpu) = dev;
+
+ return 0;
+error2:
+ while (--j >= 0)
+ device_remove_file(dev, &mce_banks[j].attr);
+error:
+ while (--i >= 0)
+ device_remove_file(dev, mce_device_attrs[i]);
+
+ device_unregister(dev);
+
+ return err;
+}
+
+static void mce_device_remove(unsigned int cpu)
+{
+ struct device *dev = per_cpu(mce_device, cpu);
+ int i;
+
+ if (!cpumask_test_cpu(cpu, mce_device_initialized))
+ return;
+
+ for (i = 0; mce_device_attrs[i]; i++)
+ device_remove_file(dev, mce_device_attrs[i]);
+
+ for (i = 0; i < mca_cfg.banks; i++)
+ device_remove_file(dev, &mce_banks[i].attr);
+
+ device_unregister(dev);
+ cpumask_clear_cpu(cpu, mce_device_initialized);
+ per_cpu(mce_device, cpu) = NULL;
+}
+
+/* Make sure there are no machine checks on offlined CPUs. */
+static void mce_disable_cpu(void)
+{
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ return;
+
+ if (!cpuhp_tasks_frozen)
+ cmci_clear();
+
+ vendor_disable_error_reporting();
+}
+
+static void mce_reenable_cpu(void)
+{
+ int i;
+
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ return;
+
+ if (!cpuhp_tasks_frozen)
+ cmci_reenable();
+ for (i = 0; i < mca_cfg.banks; i++) {
+ struct mce_bank *b = &mce_banks[i];
+
+ if (b->init)
+ wrmsrl(msr_ops.ctl(i), b->ctl);
+ }
+}
+
+static int mce_cpu_dead(unsigned int cpu)
+{
+ mce_intel_hcpu_update(cpu);
+
+ /* intentionally ignoring frozen here */
+ if (!cpuhp_tasks_frozen)
+ cmci_rediscover();
+ return 0;
+}
+
+static int mce_cpu_online(unsigned int cpu)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+ int ret;
+
+ mce_device_create(cpu);
+
+ ret = mce_threshold_create_device(cpu);
+ if (ret) {
+ mce_device_remove(cpu);
+ return ret;
+ }
+ mce_reenable_cpu();
+ mce_start_timer(t);
+ return 0;
+}
+
+static int mce_cpu_pre_down(unsigned int cpu)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ mce_disable_cpu();
+ del_timer_sync(t);
+ mce_threshold_remove_device(cpu);
+ mce_device_remove(cpu);
+ return 0;
+}
+
+static __init void mce_init_banks(void)
+{
+ int i;
+
+ for (i = 0; i < mca_cfg.banks; i++) {
+ struct mce_bank *b = &mce_banks[i];
+ struct device_attribute *a = &b->attr;
+
+ sysfs_attr_init(&a->attr);
+ a->attr.name = b->attrname;
+ snprintf(b->attrname, ATTR_LEN, "bank%d", i);
+
+ a->attr.mode = 0644;
+ a->show = show_bank;
+ a->store = set_bank;
+ }
+}
+
+static __init int mcheck_init_device(void)
+{
+ int err;
+
+ /*
+ * Check if we have a spare virtual bit. This will only become
+ * a problem if/when we move beyond 5-level page tables.
+ */
+ MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
+
+ if (!mce_available(&boot_cpu_data)) {
+ err = -EIO;
+ goto err_out;
+ }
+
+ if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ mce_init_banks();
+
+ err = subsys_system_register(&mce_subsys, NULL);
+ if (err)
+ goto err_out_mem;
+
+ err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
+ mce_cpu_dead);
+ if (err)
+ goto err_out_mem;
+
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
+ mce_cpu_online, mce_cpu_pre_down);
+ if (err < 0)
+ goto err_out_online;
+
+ register_syscore_ops(&mce_syscore_ops);
+
+ return 0;
+
+err_out_online:
+ cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
+
+err_out_mem:
+ free_cpumask_var(mce_device_initialized);
+
+err_out:
+ pr_err("Unable to init MCE device (rc: %d)\n", err);
+
+ return err;
+}
+device_initcall_sync(mcheck_init_device);
+
+/*
+ * Old style boot options parsing. Only for compatibility.
+ */
+static int __init mcheck_disable(char *str)
+{
+ mca_cfg.disabled = 1;
+ return 1;
+}
+__setup("nomce", mcheck_disable);
+
+#ifdef CONFIG_DEBUG_FS
+struct dentry *mce_get_debugfs_dir(void)
+{
+ static struct dentry *dmce;
+
+ if (!dmce)
+ dmce = debugfs_create_dir("mce", NULL);
+
+ return dmce;
+}
+
+static void mce_reset(void)
+{
+ cpu_missing = 0;
+ atomic_set(&mce_fake_panicked, 0);
+ atomic_set(&mce_executing, 0);
+ atomic_set(&mce_callin, 0);
+ atomic_set(&global_nwo, 0);
+}
+
+static int fake_panic_get(void *data, u64 *val)
+{
+ *val = fake_panic;
+ return 0;
+}
+
+static int fake_panic_set(void *data, u64 val)
+{
+ mce_reset();
+ fake_panic = val;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
+ fake_panic_set, "%llu\n");
+
+static int __init mcheck_debugfs_init(void)
+{
+ struct dentry *dmce, *ffake_panic;
+
+ dmce = mce_get_debugfs_dir();
+ if (!dmce)
+ return -ENOMEM;
+ ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
+ &fake_panic_fops);
+ if (!ffake_panic)
+ return -ENOMEM;
+
+ return 0;
+}
+#else
+static int __init mcheck_debugfs_init(void) { return -EINVAL; }
+#endif
+
+DEFINE_STATIC_KEY_FALSE(mcsafe_key);
+EXPORT_SYMBOL_GPL(mcsafe_key);
+
+static int __init mcheck_late_init(void)
+{
+ pr_info("Using %d MCE banks\n", mca_cfg.banks);
+
+ if (mca_cfg.recovery)
+ static_branch_inc(&mcsafe_key);
+
+ mcheck_debugfs_init();
+ cec_init();
+
+ /*
+ * Flush out everything that has been logged during early boot, now that
+ * everything has been initialized (workqueues, decoders, ...).
+ */
+ mce_schedule_work();
+
+ return 0;
+}
+late_initcall(mcheck_late_init);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
new file mode 100644
index 000000000..f878d24ff
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -0,0 +1,1478 @@
+/*
+ * (c) 2005-2016 Advanced Micro Devices, Inc.
+ * Your use of this code is subject to the terms and conditions of the
+ * GNU general public license version 2. See "COPYING" or
+ * http://www.gnu.org/licenses/gpl.html
+ *
+ * Written by Jacob Shin - AMD, Inc.
+ * Maintained by: Borislav Petkov <bp@alien8.de>
+ *
+ * All MC4_MISCi registers are shared between cores on a node.
+ */
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/kobject.h>
+#include <linux/percpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/string.h>
+
+#include <asm/amd_nb.h>
+#include <asm/traps.h>
+#include <asm/apic.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/trace/irq_vectors.h>
+
+#include "mce-internal.h"
+
+#define NR_BLOCKS 5
+#define THRESHOLD_MAX 0xFFF
+#define INT_TYPE_APIC 0x00020000
+#define MASK_VALID_HI 0x80000000
+#define MASK_CNTP_HI 0x40000000
+#define MASK_LOCKED_HI 0x20000000
+#define MASK_LVTOFF_HI 0x00F00000
+#define MASK_COUNT_EN_HI 0x00080000
+#define MASK_INT_TYPE_HI 0x00060000
+#define MASK_OVERFLOW_HI 0x00010000
+#define MASK_ERR_COUNT_HI 0x00000FFF
+#define MASK_BLKPTR_LO 0xFF000000
+#define MCG_XBLK_ADDR 0xC0000400
+
+/* Deferred error settings */
+#define MSR_CU_DEF_ERR 0xC0000410
+#define MASK_DEF_LVTOFF 0x000000F0
+#define MASK_DEF_INT_TYPE 0x00000006
+#define DEF_LVT_OFF 0x2
+#define DEF_INT_TYPE_APIC 0x2
+
+/* Scalable MCA: */
+
+/* Threshold LVT offset is at MSR0xC0000410[15:12] */
+#define SMCA_THR_LVT_OFF 0xF000
+
+static bool thresholding_irq_en;
+
+static const char * const th_names[] = {
+ "load_store",
+ "insn_fetch",
+ "combined_unit",
+ "decode_unit",
+ "northbridge",
+ "execution_unit",
+};
+
+static const char * const smca_umc_block_names[] = {
+ "dram_ecc",
+ "misc_umc"
+};
+
+struct smca_bank_name {
+ const char *name; /* Short name for sysfs */
+ const char *long_name; /* Long name for pretty-printing */
+};
+
+static struct smca_bank_name smca_names[] = {
+ [SMCA_LS] = { "load_store", "Load Store Unit" },
+ [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
+ [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
+ [SMCA_DE] = { "decode_unit", "Decode Unit" },
+ [SMCA_RESERVED] = { "reserved", "Reserved" },
+ [SMCA_EX] = { "execution_unit", "Execution Unit" },
+ [SMCA_FP] = { "floating_point", "Floating Point Unit" },
+ [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
+ [SMCA_CS] = { "coherent_slave", "Coherent Slave" },
+ [SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
+ [SMCA_UMC] = { "umc", "Unified Memory Controller" },
+ [SMCA_PB] = { "param_block", "Parameter Block" },
+ [SMCA_PSP] = { "psp", "Platform Security Processor" },
+ [SMCA_SMU] = { "smu", "System Management Unit" },
+};
+
+static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init =
+{
+ [0 ... MAX_NR_BANKS - 1] = { [0 ... NR_BLOCKS - 1] = -1 }
+};
+
+static const char *smca_get_name(enum smca_bank_types t)
+{
+ if (t >= N_SMCA_BANK_TYPES)
+ return NULL;
+
+ return smca_names[t].name;
+}
+
+const char *smca_get_long_name(enum smca_bank_types t)
+{
+ if (t >= N_SMCA_BANK_TYPES)
+ return NULL;
+
+ return smca_names[t].long_name;
+}
+EXPORT_SYMBOL_GPL(smca_get_long_name);
+
+static enum smca_bank_types smca_get_bank_type(unsigned int bank)
+{
+ struct smca_bank *b;
+
+ if (bank >= MAX_NR_BANKS)
+ return N_SMCA_BANK_TYPES;
+
+ b = &smca_banks[bank];
+ if (!b->hwid)
+ return N_SMCA_BANK_TYPES;
+
+ return b->hwid->bank_type;
+}
+
+static struct smca_hwid smca_hwid_mcatypes[] = {
+ /* { bank_type, hwid_mcatype, xec_bitmap } */
+
+ /* Reserved type */
+ { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0), 0x0 },
+
+ /* ZN Core (HWID=0xB0) MCA types */
+ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF },
+ { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF },
+ { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF },
+ { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF },
+ /* HWID 0xB0 MCATYPE 0x4 is Reserved */
+ { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0x7FF },
+ { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F },
+ { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF },
+
+ /* Data Fabric MCA types */
+ { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF },
+ { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0xF },
+
+ /* Unified Memory Controller MCA type */
+ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0x3F },
+
+ /* Parameter Block MCA type */
+ { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 },
+
+ /* Platform Security Processor MCA type */
+ { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 },
+
+ /* System Management Unit MCA type */
+ { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 },
+};
+
+struct smca_bank smca_banks[MAX_NR_BANKS];
+EXPORT_SYMBOL_GPL(smca_banks);
+
+/*
+ * In SMCA enabled processors, we can have multiple banks for a given IP type.
+ * So to define a unique name for each bank, we use a temp c-string to append
+ * the MCA_IPID[InstanceId] to type's name in get_name().
+ *
+ * InstanceId is 32 bits which is 8 characters. Make sure MAX_MCATYPE_NAME_LEN
+ * is greater than 8 plus 1 (for underscore) plus length of longest type name.
+ */
+#define MAX_MCATYPE_NAME_LEN 30
+static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
+
+static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
+static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */
+
+static void amd_threshold_interrupt(void);
+static void amd_deferred_error_interrupt(void);
+
+static void default_deferred_error_interrupt(void)
+{
+ pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR);
+}
+void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
+
+static void smca_configure(unsigned int bank, unsigned int cpu)
+{
+ unsigned int i, hwid_mcatype;
+ struct smca_hwid *s_hwid;
+ u32 high, low;
+ u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank);
+
+ /* Set appropriate bits in MCA_CONFIG */
+ if (!rdmsr_safe(smca_config, &low, &high)) {
+ /*
+ * OS is required to set the MCAX bit to acknowledge that it is
+ * now using the new MSR ranges and new registers under each
+ * bank. It also means that the OS will configure deferred
+ * errors in the new MCx_CONFIG register. If the bit is not set,
+ * uncorrectable errors will cause a system panic.
+ *
+ * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.)
+ */
+ high |= BIT(0);
+
+ /*
+ * SMCA sets the Deferred Error Interrupt type per bank.
+ *
+ * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
+ * if the DeferredIntType bit field is available.
+ *
+ * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
+ * high portion of the MSR). OS should set this to 0x1 to enable
+ * APIC based interrupt. First, check that no interrupt has been
+ * set.
+ */
+ if ((low & BIT(5)) && !((high >> 5) & 0x3))
+ high |= BIT(5);
+
+ wrmsr(smca_config, low, high);
+ }
+
+ /* Return early if this bank was already initialized. */
+ if (smca_banks[bank].hwid && smca_banks[bank].hwid->hwid_mcatype != 0)
+ return;
+
+ if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
+ pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
+ return;
+ }
+
+ hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID,
+ (high & MCI_IPID_MCATYPE) >> 16);
+
+ for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
+ s_hwid = &smca_hwid_mcatypes[i];
+ if (hwid_mcatype == s_hwid->hwid_mcatype) {
+ smca_banks[bank].hwid = s_hwid;
+ smca_banks[bank].id = low;
+ smca_banks[bank].sysfs_id = s_hwid->count++;
+ break;
+ }
+ }
+}
+
+struct thresh_restart {
+ struct threshold_block *b;
+ int reset;
+ int set_lvt_off;
+ int lvt_off;
+ u16 old_limit;
+};
+
+static inline bool is_shared_bank(int bank)
+{
+ /*
+ * Scalable MCA provides for only one core to have access to the MSRs of
+ * a shared bank.
+ */
+ if (mce_flags.smca)
+ return false;
+
+ /* Bank 4 is for northbridge reporting and is thus shared */
+ return (bank == 4);
+}
+
+static const char *bank4_names(const struct threshold_block *b)
+{
+ switch (b->address) {
+ /* MSR4_MISC0 */
+ case 0x00000413:
+ return "dram";
+
+ case 0xc0000408:
+ return "ht_links";
+
+ case 0xc0000409:
+ return "l3_cache";
+
+ default:
+ WARN(1, "Funny MSR: 0x%08x\n", b->address);
+ return "";
+ }
+};
+
+
+static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
+{
+ /*
+ * bank 4 supports APIC LVT interrupts implicitly since forever.
+ */
+ if (bank == 4)
+ return true;
+
+ /*
+ * IntP: interrupt present; if this bit is set, the thresholding
+ * bank can generate APIC LVT interrupts
+ */
+ return msr_high_bits & BIT(28);
+}
+
+static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
+{
+ int msr = (hi & MASK_LVTOFF_HI) >> 20;
+
+ if (apic < 0) {
+ pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
+ "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
+ b->bank, b->block, b->address, hi, lo);
+ return 0;
+ }
+
+ if (apic != msr) {
+ /*
+ * On SMCA CPUs, LVT offset is programmed at a different MSR, and
+ * the BIOS provides the value. The original field where LVT offset
+ * was set is reserved. Return early here:
+ */
+ if (mce_flags.smca)
+ return 0;
+
+ pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
+ "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
+ b->cpu, apic, b->bank, b->block, b->address, hi, lo);
+ return 0;
+ }
+
+ return 1;
+};
+
+/* Reprogram MCx_MISC MSR behind this threshold bank. */
+static void threshold_restart_bank(void *_tr)
+{
+ struct thresh_restart *tr = _tr;
+ u32 hi, lo;
+
+ rdmsr(tr->b->address, lo, hi);
+
+ if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
+ tr->reset = 1; /* limit cannot be lower than err count */
+
+ if (tr->reset) { /* reset err count and overflow bit */
+ hi =
+ (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
+ (THRESHOLD_MAX - tr->b->threshold_limit);
+ } else if (tr->old_limit) { /* change limit w/o reset */
+ int new_count = (hi & THRESHOLD_MAX) +
+ (tr->old_limit - tr->b->threshold_limit);
+
+ hi = (hi & ~MASK_ERR_COUNT_HI) |
+ (new_count & THRESHOLD_MAX);
+ }
+
+ /* clear IntType */
+ hi &= ~MASK_INT_TYPE_HI;
+
+ if (!tr->b->interrupt_capable)
+ goto done;
+
+ if (tr->set_lvt_off) {
+ if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
+ /* set new lvt offset */
+ hi &= ~MASK_LVTOFF_HI;
+ hi |= tr->lvt_off << 20;
+ }
+ }
+
+ if (tr->b->interrupt_enable)
+ hi |= INT_TYPE_APIC;
+
+ done:
+
+ hi |= MASK_COUNT_EN_HI;
+ wrmsr(tr->b->address, lo, hi);
+}
+
+static void mce_threshold_block_init(struct threshold_block *b, int offset)
+{
+ struct thresh_restart tr = {
+ .b = b,
+ .set_lvt_off = 1,
+ .lvt_off = offset,
+ };
+
+ b->threshold_limit = THRESHOLD_MAX;
+ threshold_restart_bank(&tr);
+};
+
+static int setup_APIC_mce_threshold(int reserved, int new)
+{
+ if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
+ APIC_EILVT_MSG_FIX, 0))
+ return new;
+
+ return reserved;
+}
+
+static int setup_APIC_deferred_error(int reserved, int new)
+{
+ if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR,
+ APIC_EILVT_MSG_FIX, 0))
+ return new;
+
+ return reserved;
+}
+
+static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
+{
+ u32 low = 0, high = 0;
+ int def_offset = -1, def_new;
+
+ if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high))
+ return;
+
+ def_new = (low & MASK_DEF_LVTOFF) >> 4;
+ if (!(low & MASK_DEF_LVTOFF)) {
+ pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n");
+ def_new = DEF_LVT_OFF;
+ low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4);
+ }
+
+ def_offset = setup_APIC_deferred_error(def_offset, def_new);
+ if ((def_offset == def_new) &&
+ (deferred_error_int_vector != amd_deferred_error_interrupt))
+ deferred_error_int_vector = amd_deferred_error_interrupt;
+
+ if (!mce_flags.smca)
+ low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
+
+ wrmsr(MSR_CU_DEF_ERR, low, high);
+}
+
+static u32 smca_get_block_address(unsigned int bank, unsigned int block)
+{
+ u32 low, high;
+ u32 addr = 0;
+
+ if (smca_get_bank_type(bank) == SMCA_RESERVED)
+ return addr;
+
+ if (!block)
+ return MSR_AMD64_SMCA_MCx_MISC(bank);
+
+ /* Check our cache first: */
+ if (smca_bank_addrs[bank][block] != -1)
+ return smca_bank_addrs[bank][block];
+
+ /*
+ * For SMCA enabled processors, BLKPTR field of the first MISC register
+ * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4).
+ */
+ if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
+ goto out;
+
+ if (!(low & MCI_CONFIG_MCAX))
+ goto out;
+
+ if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
+ (low & MASK_BLKPTR_LO))
+ addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
+
+out:
+ smca_bank_addrs[bank][block] = addr;
+ return addr;
+}
+
+static u32 get_block_address(u32 current_addr, u32 low, u32 high,
+ unsigned int bank, unsigned int block)
+{
+ u32 addr = 0, offset = 0;
+
+ if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
+ return addr;
+
+ if (mce_flags.smca)
+ return smca_get_block_address(bank, block);
+
+ /* Fall back to method we used for older processors: */
+ switch (block) {
+ case 0:
+ addr = msr_ops.misc(bank);
+ break;
+ case 1:
+ offset = ((low & MASK_BLKPTR_LO) >> 21);
+ if (offset)
+ addr = MCG_XBLK_ADDR + offset;
+ break;
+ default:
+ addr = ++current_addr;
+ }
+ return addr;
+}
+
+static int
+prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
+ int offset, u32 misc_high)
+{
+ unsigned int cpu = smp_processor_id();
+ u32 smca_low, smca_high;
+ struct threshold_block b;
+ int new;
+
+ if (!block)
+ per_cpu(bank_map, cpu) |= (1 << bank);
+
+ memset(&b, 0, sizeof(b));
+ b.cpu = cpu;
+ b.bank = bank;
+ b.block = block;
+ b.address = addr;
+ b.interrupt_capable = lvt_interrupt_supported(bank, misc_high);
+
+ if (!b.interrupt_capable)
+ goto done;
+
+ b.interrupt_enable = 1;
+
+ if (!mce_flags.smca) {
+ new = (misc_high & MASK_LVTOFF_HI) >> 20;
+ goto set_offset;
+ }
+
+ /* Gather LVT offset for thresholding: */
+ if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high))
+ goto out;
+
+ new = (smca_low & SMCA_THR_LVT_OFF) >> 12;
+
+set_offset:
+ offset = setup_APIC_mce_threshold(offset, new);
+ if (offset == new)
+ thresholding_irq_en = true;
+
+done:
+ mce_threshold_block_init(&b, offset);
+
+out:
+ return offset;
+}
+
+/*
+ * Turn off MC4_MISC thresholding banks on all family 0x15 models since
+ * they're not supported there.
+ */
+void disable_err_thresholding(struct cpuinfo_x86 *c)
+{
+ int i;
+ u64 hwcr;
+ bool need_toggle;
+ u32 msrs[] = {
+ 0x00000413, /* MC4_MISC0 */
+ 0xc0000408, /* MC4_MISC1 */
+ };
+
+ if (c->x86 != 0x15)
+ return;
+
+ rdmsrl(MSR_K7_HWCR, hwcr);
+
+ /* McStatusWrEn has to be set */
+ need_toggle = !(hwcr & BIT(18));
+
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+
+ /* Clear CntP bit safely */
+ for (i = 0; i < ARRAY_SIZE(msrs); i++)
+ msr_clear_bit(msrs[i], 62);
+
+ /* restore old settings */
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr);
+}
+
+/* cpu init entry point, called from mce.c with preempt off */
+void mce_amd_feature_init(struct cpuinfo_x86 *c)
+{
+ u32 low = 0, high = 0, address = 0;
+ unsigned int bank, block, cpu = smp_processor_id();
+ int offset = -1;
+
+ disable_err_thresholding(c);
+
+ for (bank = 0; bank < mca_cfg.banks; ++bank) {
+ if (mce_flags.smca)
+ smca_configure(bank, cpu);
+
+ for (block = 0; block < NR_BLOCKS; ++block) {
+ address = get_block_address(address, low, high, bank, block);
+ if (!address)
+ break;
+
+ if (rdmsr_safe(address, &low, &high))
+ break;
+
+ if (!(high & MASK_VALID_HI))
+ continue;
+
+ if (!(high & MASK_CNTP_HI) ||
+ (high & MASK_LOCKED_HI))
+ continue;
+
+ offset = prepare_threshold_block(bank, block, address, offset, high);
+ }
+ }
+
+ if (mce_flags.succor)
+ deferred_error_interrupt_enable(c);
+}
+
+int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
+{
+ u64 dram_base_addr, dram_limit_addr, dram_hole_base;
+ /* We start from the normalized address */
+ u64 ret_addr = norm_addr;
+
+ u32 tmp;
+
+ u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask;
+ u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets;
+ u8 intlv_addr_sel, intlv_addr_bit;
+ u8 num_intlv_bits, hashed_bit;
+ u8 lgcy_mmio_hole_en, base = 0;
+ u8 cs_mask, cs_id = 0;
+ bool hash_enabled = false;
+
+ /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */
+ if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp))
+ goto out_err;
+
+ /* Remove HiAddrOffset from normalized address, if enabled: */
+ if (tmp & BIT(0)) {
+ u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8;
+
+ if (norm_addr >= hi_addr_offset) {
+ ret_addr -= hi_addr_offset;
+ base = 1;
+ }
+ }
+
+ /* Read D18F0x110 (DramBaseAddress). */
+ if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp))
+ goto out_err;
+
+ /* Check if address range is valid. */
+ if (!(tmp & BIT(0))) {
+ pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n",
+ __func__, tmp);
+ goto out_err;
+ }
+
+ lgcy_mmio_hole_en = tmp & BIT(1);
+ intlv_num_chan = (tmp >> 4) & 0xF;
+ intlv_addr_sel = (tmp >> 8) & 0x7;
+ dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16;
+
+ /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */
+ if (intlv_addr_sel > 3) {
+ pr_err("%s: Invalid interleave address select %d.\n",
+ __func__, intlv_addr_sel);
+ goto out_err;
+ }
+
+ /* Read D18F0x114 (DramLimitAddress). */
+ if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp))
+ goto out_err;
+
+ intlv_num_sockets = (tmp >> 8) & 0x1;
+ intlv_num_dies = (tmp >> 10) & 0x3;
+ dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0);
+
+ intlv_addr_bit = intlv_addr_sel + 8;
+
+ /* Re-use intlv_num_chan by setting it equal to log2(#channels) */
+ switch (intlv_num_chan) {
+ case 0: intlv_num_chan = 0; break;
+ case 1: intlv_num_chan = 1; break;
+ case 3: intlv_num_chan = 2; break;
+ case 5: intlv_num_chan = 3; break;
+ case 7: intlv_num_chan = 4; break;
+
+ case 8: intlv_num_chan = 1;
+ hash_enabled = true;
+ break;
+ default:
+ pr_err("%s: Invalid number of interleaved channels %d.\n",
+ __func__, intlv_num_chan);
+ goto out_err;
+ }
+
+ num_intlv_bits = intlv_num_chan;
+
+ if (intlv_num_dies > 2) {
+ pr_err("%s: Invalid number of interleaved nodes/dies %d.\n",
+ __func__, intlv_num_dies);
+ goto out_err;
+ }
+
+ num_intlv_bits += intlv_num_dies;
+
+ /* Add a bit if sockets are interleaved. */
+ num_intlv_bits += intlv_num_sockets;
+
+ /* Assert num_intlv_bits <= 4 */
+ if (num_intlv_bits > 4) {
+ pr_err("%s: Invalid interleave bits %d.\n",
+ __func__, num_intlv_bits);
+ goto out_err;
+ }
+
+ if (num_intlv_bits > 0) {
+ u64 temp_addr_x, temp_addr_i, temp_addr_y;
+ u8 die_id_bit, sock_id_bit, cs_fabric_id;
+
+ /*
+ * Read FabricBlockInstanceInformation3_CS[BlockFabricID].
+ * This is the fabric id for this coherent slave. Use
+ * umc/channel# as instance id of the coherent slave
+ * for FICAA.
+ */
+ if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp))
+ goto out_err;
+
+ cs_fabric_id = (tmp >> 8) & 0xFF;
+ die_id_bit = 0;
+
+ /* If interleaved over more than 1 channel: */
+ if (intlv_num_chan) {
+ die_id_bit = intlv_num_chan;
+ cs_mask = (1 << die_id_bit) - 1;
+ cs_id = cs_fabric_id & cs_mask;
+ }
+
+ sock_id_bit = die_id_bit;
+
+ /* Read D18F1x208 (SystemFabricIdMask). */
+ if (intlv_num_dies || intlv_num_sockets)
+ if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp))
+ goto out_err;
+
+ /* If interleaved over more than 1 die. */
+ if (intlv_num_dies) {
+ sock_id_bit = die_id_bit + intlv_num_dies;
+ die_id_shift = (tmp >> 24) & 0xF;
+ die_id_mask = (tmp >> 8) & 0xFF;
+
+ cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit;
+ }
+
+ /* If interleaved over more than 1 socket. */
+ if (intlv_num_sockets) {
+ socket_id_shift = (tmp >> 28) & 0xF;
+ socket_id_mask = (tmp >> 16) & 0xFF;
+
+ cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit;
+ }
+
+ /*
+ * The pre-interleaved address consists of XXXXXXIIIYYYYY
+ * where III is the ID for this CS, and XXXXXXYYYYY are the
+ * address bits from the post-interleaved address.
+ * "num_intlv_bits" has been calculated to tell us how many "I"
+ * bits there are. "intlv_addr_bit" tells us how many "Y" bits
+ * there are (where "I" starts).
+ */
+ temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0);
+ temp_addr_i = (cs_id << intlv_addr_bit);
+ temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits;
+ ret_addr = temp_addr_x | temp_addr_i | temp_addr_y;
+ }
+
+ /* Add dram base address */
+ ret_addr += dram_base_addr;
+
+ /* If legacy MMIO hole enabled */
+ if (lgcy_mmio_hole_en) {
+ if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp))
+ goto out_err;
+
+ dram_hole_base = tmp & GENMASK(31, 24);
+ if (ret_addr >= dram_hole_base)
+ ret_addr += (BIT_ULL(32) - dram_hole_base);
+ }
+
+ if (hash_enabled) {
+ /* Save some parentheses and grab ls-bit at the end. */
+ hashed_bit = (ret_addr >> 12) ^
+ (ret_addr >> 18) ^
+ (ret_addr >> 21) ^
+ (ret_addr >> 30) ^
+ cs_id;
+
+ hashed_bit &= BIT(0);
+
+ if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0)))
+ ret_addr ^= BIT(intlv_addr_bit);
+ }
+
+ /* Is calculated system address is above DRAM limit address? */
+ if (ret_addr > dram_limit_addr)
+ goto out_err;
+
+ *sys_addr = ret_addr;
+ return 0;
+
+out_err:
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
+
+bool amd_mce_is_memory_error(struct mce *m)
+{
+ /* ErrCodeExt[20:16] */
+ u8 xec = (m->status >> 16) & 0x1f;
+
+ if (mce_flags.smca)
+ return smca_get_bank_type(m->bank) == SMCA_UMC && xec == 0x0;
+
+ return m->bank == 4 && xec == 0x8;
+}
+
+static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
+{
+ struct mce m;
+
+ mce_setup(&m);
+
+ m.status = status;
+ m.misc = misc;
+ m.bank = bank;
+ m.tsc = rdtsc();
+
+ if (m.status & MCI_STATUS_ADDRV) {
+ m.addr = addr;
+
+ /*
+ * Extract [55:<lsb>] where lsb is the least significant
+ * *valid* bit of the address bits.
+ */
+ if (mce_flags.smca) {
+ u8 lsb = (m.addr >> 56) & 0x3f;
+
+ m.addr &= GENMASK_ULL(55, lsb);
+ }
+ }
+
+ if (mce_flags.smca) {
+ rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid);
+
+ if (m.status & MCI_STATUS_SYNDV)
+ rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd);
+ }
+
+ mce_log(&m);
+}
+
+asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(struct pt_regs *regs)
+{
+ entering_irq();
+ trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
+ inc_irq_stat(irq_deferred_error_count);
+ deferred_error_int_vector();
+ trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
+ exiting_ack_irq();
+}
+
+/*
+ * Returns true if the logged error is deferred. False, otherwise.
+ */
+static inline bool
+_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
+{
+ u64 status, addr = 0;
+
+ rdmsrl(msr_stat, status);
+ if (!(status & MCI_STATUS_VAL))
+ return false;
+
+ if (status & MCI_STATUS_ADDRV)
+ rdmsrl(msr_addr, addr);
+
+ __log_error(bank, status, addr, misc);
+
+ wrmsrl(msr_stat, 0);
+
+ return status & MCI_STATUS_DEFERRED;
+}
+
+/*
+ * We have three scenarios for checking for Deferred errors:
+ *
+ * 1) Non-SMCA systems check MCA_STATUS and log error if found.
+ * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
+ * clear MCA_DESTAT.
+ * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
+ * log it.
+ */
+static void log_error_deferred(unsigned int bank)
+{
+ bool defrd;
+
+ defrd = _log_error_bank(bank, msr_ops.status(bank),
+ msr_ops.addr(bank), 0);
+
+ if (!mce_flags.smca)
+ return;
+
+ /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */
+ if (defrd) {
+ wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
+ return;
+ }
+
+ /*
+ * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check
+ * for a valid error.
+ */
+ _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank),
+ MSR_AMD64_SMCA_MCx_DEADDR(bank), 0);
+}
+
+/* APIC interrupt handler for deferred errors */
+static void amd_deferred_error_interrupt(void)
+{
+ unsigned int bank;
+
+ for (bank = 0; bank < mca_cfg.banks; ++bank)
+ log_error_deferred(bank);
+}
+
+static void log_error_thresholding(unsigned int bank, u64 misc)
+{
+ _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc);
+}
+
+static void log_and_reset_block(struct threshold_block *block)
+{
+ struct thresh_restart tr;
+ u32 low = 0, high = 0;
+
+ if (!block)
+ return;
+
+ if (rdmsr_safe(block->address, &low, &high))
+ return;
+
+ if (!(high & MASK_OVERFLOW_HI))
+ return;
+
+ /* Log the MCE which caused the threshold event. */
+ log_error_thresholding(block->bank, ((u64)high << 32) | low);
+
+ /* Reset threshold block after logging error. */
+ memset(&tr, 0, sizeof(tr));
+ tr.b = block;
+ threshold_restart_bank(&tr);
+}
+
+/*
+ * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt
+ * goes off when error_count reaches threshold_limit.
+ */
+static void amd_threshold_interrupt(void)
+{
+ struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL;
+ unsigned int bank, cpu = smp_processor_id();
+
+ for (bank = 0; bank < mca_cfg.banks; ++bank) {
+ if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+ continue;
+
+ first_block = per_cpu(threshold_banks, cpu)[bank]->blocks;
+ if (!first_block)
+ continue;
+
+ /*
+ * The first block is also the head of the list. Check it first
+ * before iterating over the rest.
+ */
+ log_and_reset_block(first_block);
+ list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj)
+ log_and_reset_block(block);
+ }
+}
+
+/*
+ * Sysfs Interface
+ */
+
+struct threshold_attr {
+ struct attribute attr;
+ ssize_t (*show) (struct threshold_block *, char *);
+ ssize_t (*store) (struct threshold_block *, const char *, size_t count);
+};
+
+#define SHOW_FIELDS(name) \
+static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
+{ \
+ return sprintf(buf, "%lu\n", (unsigned long) b->name); \
+}
+SHOW_FIELDS(interrupt_enable)
+SHOW_FIELDS(threshold_limit)
+
+static ssize_t
+store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
+{
+ struct thresh_restart tr;
+ unsigned long new;
+
+ if (!b->interrupt_capable)
+ return -EINVAL;
+
+ if (kstrtoul(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ b->interrupt_enable = !!new;
+
+ memset(&tr, 0, sizeof(tr));
+ tr.b = b;
+
+ smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
+
+ return size;
+}
+
+static ssize_t
+store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
+{
+ struct thresh_restart tr;
+ unsigned long new;
+
+ if (kstrtoul(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ if (new > THRESHOLD_MAX)
+ new = THRESHOLD_MAX;
+ if (new < 1)
+ new = 1;
+
+ memset(&tr, 0, sizeof(tr));
+ tr.old_limit = b->threshold_limit;
+ b->threshold_limit = new;
+ tr.b = b;
+
+ smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
+
+ return size;
+}
+
+static ssize_t show_error_count(struct threshold_block *b, char *buf)
+{
+ u32 lo, hi;
+
+ rdmsr_on_cpu(b->cpu, b->address, &lo, &hi);
+
+ return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) -
+ (THRESHOLD_MAX - b->threshold_limit)));
+}
+
+static struct threshold_attr error_count = {
+ .attr = {.name = __stringify(error_count), .mode = 0444 },
+ .show = show_error_count,
+};
+
+#define RW_ATTR(val) \
+static struct threshold_attr val = { \
+ .attr = {.name = __stringify(val), .mode = 0644 }, \
+ .show = show_## val, \
+ .store = store_## val, \
+};
+
+RW_ATTR(interrupt_enable);
+RW_ATTR(threshold_limit);
+
+static struct attribute *default_attrs[] = {
+ &threshold_limit.attr,
+ &error_count.attr,
+ NULL, /* possibly interrupt_enable if supported, see below */
+ NULL,
+};
+
+#define to_block(k) container_of(k, struct threshold_block, kobj)
+#define to_attr(a) container_of(a, struct threshold_attr, attr)
+
+static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ struct threshold_block *b = to_block(kobj);
+ struct threshold_attr *a = to_attr(attr);
+ ssize_t ret;
+
+ ret = a->show ? a->show(b, buf) : -EIO;
+
+ return ret;
+}
+
+static ssize_t store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct threshold_block *b = to_block(kobj);
+ struct threshold_attr *a = to_attr(attr);
+ ssize_t ret;
+
+ ret = a->store ? a->store(b, buf, count) : -EIO;
+
+ return ret;
+}
+
+static const struct sysfs_ops threshold_ops = {
+ .show = show,
+ .store = store,
+};
+
+static void threshold_block_release(struct kobject *kobj);
+
+static struct kobj_type threshold_ktype = {
+ .sysfs_ops = &threshold_ops,
+ .default_attrs = default_attrs,
+ .release = threshold_block_release,
+};
+
+static const char *get_name(unsigned int bank, struct threshold_block *b)
+{
+ enum smca_bank_types bank_type;
+
+ if (!mce_flags.smca) {
+ if (b && bank == 4)
+ return bank4_names(b);
+
+ return th_names[bank];
+ }
+
+ bank_type = smca_get_bank_type(bank);
+ if (bank_type >= N_SMCA_BANK_TYPES)
+ return NULL;
+
+ if (b && bank_type == SMCA_UMC) {
+ if (b->block < ARRAY_SIZE(smca_umc_block_names))
+ return smca_umc_block_names[b->block];
+ return NULL;
+ }
+
+ if (smca_banks[bank].hwid->count == 1)
+ return smca_get_name(bank_type);
+
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
+ "%s_%x", smca_get_name(bank_type),
+ smca_banks[bank].sysfs_id);
+ return buf_mcatype;
+}
+
+static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb,
+ unsigned int bank, unsigned int block,
+ u32 address)
+{
+ struct threshold_block *b = NULL;
+ u32 low, high;
+ int err;
+
+ if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
+ return 0;
+
+ if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
+ return 0;
+
+ if (!(high & MASK_VALID_HI)) {
+ if (block)
+ goto recurse;
+ else
+ return 0;
+ }
+
+ if (!(high & MASK_CNTP_HI) ||
+ (high & MASK_LOCKED_HI))
+ goto recurse;
+
+ b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
+ if (!b)
+ return -ENOMEM;
+
+ b->block = block;
+ b->bank = bank;
+ b->cpu = cpu;
+ b->address = address;
+ b->interrupt_enable = 0;
+ b->interrupt_capable = lvt_interrupt_supported(bank, high);
+ b->threshold_limit = THRESHOLD_MAX;
+
+ if (b->interrupt_capable) {
+ threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
+ b->interrupt_enable = 1;
+ } else {
+ threshold_ktype.default_attrs[2] = NULL;
+ }
+
+ INIT_LIST_HEAD(&b->miscj);
+
+ if (tb->blocks)
+ list_add(&b->miscj, &tb->blocks->miscj);
+ else
+ tb->blocks = b;
+
+ err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(bank, b));
+ if (err)
+ goto out_free;
+recurse:
+ address = get_block_address(address, low, high, bank, ++block);
+ if (!address)
+ return 0;
+
+ err = allocate_threshold_blocks(cpu, tb, bank, block, address);
+ if (err)
+ goto out_free;
+
+ if (b)
+ kobject_uevent(&b->kobj, KOBJ_ADD);
+
+ return err;
+
+out_free:
+ if (b) {
+ kobject_put(&b->kobj);
+ list_del(&b->miscj);
+ kfree(b);
+ }
+ return err;
+}
+
+static int __threshold_add_blocks(struct threshold_bank *b)
+{
+ struct list_head *head = &b->blocks->miscj;
+ struct threshold_block *pos = NULL;
+ struct threshold_block *tmp = NULL;
+ int err = 0;
+
+ err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name);
+ if (err)
+ return err;
+
+ list_for_each_entry_safe(pos, tmp, head, miscj) {
+
+ err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name);
+ if (err) {
+ list_for_each_entry_safe_reverse(pos, tmp, head, miscj)
+ kobject_del(&pos->kobj);
+
+ return err;
+ }
+ }
+ return err;
+}
+
+static int threshold_create_bank(unsigned int cpu, unsigned int bank)
+{
+ struct device *dev = per_cpu(mce_device, cpu);
+ struct amd_northbridge *nb = NULL;
+ struct threshold_bank *b = NULL;
+ const char *name = get_name(bank, NULL);
+ int err = 0;
+
+ if (!dev)
+ return -ENODEV;
+
+ if (is_shared_bank(bank)) {
+ nb = node_to_amd_nb(amd_get_nb_id(cpu));
+
+ /* threshold descriptor already initialized on this node? */
+ if (nb && nb->bank4) {
+ /* yes, use it */
+ b = nb->bank4;
+ err = kobject_add(b->kobj, &dev->kobj, name);
+ if (err)
+ goto out;
+
+ per_cpu(threshold_banks, cpu)[bank] = b;
+ refcount_inc(&b->cpus);
+
+ err = __threshold_add_blocks(b);
+
+ goto out;
+ }
+ }
+
+ b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
+ if (!b) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ b->kobj = kobject_create_and_add(name, &dev->kobj);
+ if (!b->kobj) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ if (is_shared_bank(bank)) {
+ refcount_set(&b->cpus, 1);
+
+ /* nb is already initialized, see above */
+ if (nb) {
+ WARN_ON(nb->bank4);
+ nb->bank4 = b;
+ }
+ }
+
+ err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank));
+ if (err)
+ goto out_free;
+
+ per_cpu(threshold_banks, cpu)[bank] = b;
+
+ return 0;
+
+ out_free:
+ kfree(b);
+
+ out:
+ return err;
+}
+
+static void threshold_block_release(struct kobject *kobj)
+{
+ kfree(to_block(kobj));
+}
+
+static void deallocate_threshold_block(unsigned int cpu, unsigned int bank)
+{
+ struct threshold_block *pos = NULL;
+ struct threshold_block *tmp = NULL;
+ struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
+
+ if (!head)
+ return;
+
+ list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
+ list_del(&pos->miscj);
+ kobject_put(&pos->kobj);
+ }
+
+ kobject_put(&head->blocks->kobj);
+}
+
+static void __threshold_remove_blocks(struct threshold_bank *b)
+{
+ struct threshold_block *pos = NULL;
+ struct threshold_block *tmp = NULL;
+
+ kobject_del(b->kobj);
+
+ list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj)
+ kobject_del(&pos->kobj);
+}
+
+static void threshold_remove_bank(unsigned int cpu, int bank)
+{
+ struct amd_northbridge *nb;
+ struct threshold_bank *b;
+
+ b = per_cpu(threshold_banks, cpu)[bank];
+ if (!b)
+ return;
+
+ if (!b->blocks)
+ goto free_out;
+
+ if (is_shared_bank(bank)) {
+ if (!refcount_dec_and_test(&b->cpus)) {
+ __threshold_remove_blocks(b);
+ per_cpu(threshold_banks, cpu)[bank] = NULL;
+ return;
+ } else {
+ /*
+ * the last CPU on this node using the shared bank is
+ * going away, remove that bank now.
+ */
+ nb = node_to_amd_nb(amd_get_nb_id(cpu));
+ nb->bank4 = NULL;
+ }
+ }
+
+ deallocate_threshold_block(cpu, bank);
+
+free_out:
+ kobject_del(b->kobj);
+ kobject_put(b->kobj);
+ kfree(b);
+ per_cpu(threshold_banks, cpu)[bank] = NULL;
+}
+
+int mce_threshold_remove_device(unsigned int cpu)
+{
+ unsigned int bank;
+
+ for (bank = 0; bank < mca_cfg.banks; ++bank) {
+ if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+ continue;
+ threshold_remove_bank(cpu, bank);
+ }
+ kfree(per_cpu(threshold_banks, cpu));
+ per_cpu(threshold_banks, cpu) = NULL;
+ return 0;
+}
+
+/* create dir/files for all valid threshold banks */
+int mce_threshold_create_device(unsigned int cpu)
+{
+ unsigned int bank;
+ struct threshold_bank **bp;
+ int err = 0;
+
+ bp = per_cpu(threshold_banks, cpu);
+ if (bp)
+ return 0;
+
+ bp = kcalloc(mca_cfg.banks, sizeof(struct threshold_bank *),
+ GFP_KERNEL);
+ if (!bp)
+ return -ENOMEM;
+
+ per_cpu(threshold_banks, cpu) = bp;
+
+ for (bank = 0; bank < mca_cfg.banks; ++bank) {
+ if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+ continue;
+ err = threshold_create_bank(cpu, bank);
+ if (err)
+ goto err;
+ }
+ return err;
+err:
+ mce_threshold_remove_device(cpu);
+ return err;
+}
+
+static __init int threshold_init_device(void)
+{
+ unsigned lcpu = 0;
+
+ /* to hit CPUs online before the notifier is up */
+ for_each_online_cpu(lcpu) {
+ int err = mce_threshold_create_device(lcpu);
+
+ if (err)
+ return err;
+ }
+
+ if (thresholding_irq_en)
+ mce_threshold_vector = amd_threshold_interrupt;
+
+ return 0;
+}
+/*
+ * there are 3 funcs which need to be _initcalled in a logic sequence:
+ * 1. xen_late_init_mcelog
+ * 2. mcheck_init_device
+ * 3. threshold_init_device
+ *
+ * xen_late_init_mcelog must register xen_mce_chrdev_device before
+ * native mce_chrdev_device registration if running under xen platform;
+ *
+ * mcheck_init_device should be inited before threshold_init_device to
+ * initialize mce_device, otherwise a NULL ptr dereference will cause panic.
+ *
+ * so we use following _initcalls
+ * 1. device_initcall(xen_late_init_mcelog);
+ * 2. device_initcall_sync(mcheck_init_device);
+ * 3. late_initcall(threshold_init_device);
+ *
+ * when running under xen, the initcall order is 1,2,3;
+ * on baremetal, we skip 1 and we do only 2 and 3.
+ */
+late_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
new file mode 100644
index 000000000..1d87b8515
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -0,0 +1,519 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel specific MCE features.
+ * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#include <linux/gfp.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/intel-family.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/mce.h>
+
+#include "mce-internal.h"
+
+/*
+ * Support for Intel Correct Machine Check Interrupts. This allows
+ * the CPU to raise an interrupt when a corrected machine check happened.
+ * Normally we pick those up using a regular polling timer.
+ * Also supports reliable discovery of shared banks.
+ */
+
+/*
+ * CMCI can be delivered to multiple cpus that share a machine check bank
+ * so we need to designate a single cpu to process errors logged in each bank
+ * in the interrupt handler (otherwise we would have many races and potential
+ * double reporting of the same error).
+ * Note that this can change when a cpu is offlined or brought online since
+ * some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear()
+ * disables CMCI on all banks owned by the cpu and clears this bitfield. At
+ * this point, cmci_rediscover() kicks in and a different cpu may end up
+ * taking ownership of some of the shared MCA banks that were previously
+ * owned by the offlined cpu.
+ */
+static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
+
+/*
+ * CMCI storm detection backoff counter
+ *
+ * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
+ * encountered an error. If not, we decrement it by one. We signal the end of
+ * the CMCI storm when it reaches 0.
+ */
+static DEFINE_PER_CPU(int, cmci_backoff_cnt);
+
+/*
+ * cmci_discover_lock protects against parallel discovery attempts
+ * which could race against each other.
+ */
+static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
+
+#define CMCI_THRESHOLD 1
+#define CMCI_POLL_INTERVAL (30 * HZ)
+#define CMCI_STORM_INTERVAL (HZ)
+#define CMCI_STORM_THRESHOLD 15
+
+static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
+static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
+static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
+
+enum {
+ CMCI_STORM_NONE,
+ CMCI_STORM_ACTIVE,
+ CMCI_STORM_SUBSIDED,
+};
+
+static atomic_t cmci_storm_on_cpus;
+
+static int cmci_supported(int *banks)
+{
+ u64 cap;
+
+ if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce)
+ return 0;
+
+ /*
+ * Vendor check is not strictly needed, but the initial
+ * initialization is vendor keyed and this
+ * makes sure none of the backdoors are entered otherwise.
+ */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 0;
+ if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6)
+ return 0;
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
+ *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
+ return !!(cap & MCG_CMCI_P);
+}
+
+static bool lmce_supported(void)
+{
+ u64 tmp;
+
+ if (mca_cfg.lmce_disabled)
+ return false;
+
+ rdmsrl(MSR_IA32_MCG_CAP, tmp);
+
+ /*
+ * LMCE depends on recovery support in the processor. Hence both
+ * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP.
+ */
+ if ((tmp & (MCG_SER_P | MCG_LMCE_P)) !=
+ (MCG_SER_P | MCG_LMCE_P))
+ return false;
+
+ /*
+ * BIOS should indicate support for LMCE by setting bit 20 in
+ * IA32_FEATURE_CONTROL without which touching MCG_EXT_CTL will
+ * generate a #GP fault.
+ */
+ rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp);
+ if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) ==
+ (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE))
+ return true;
+
+ return false;
+}
+
+bool mce_intel_cmci_poll(void)
+{
+ if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
+ return false;
+
+ /*
+ * Reset the counter if we've logged an error in the last poll
+ * during the storm.
+ */
+ if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)))
+ this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
+ else
+ this_cpu_dec(cmci_backoff_cnt);
+
+ return true;
+}
+
+void mce_intel_hcpu_update(unsigned long cpu)
+{
+ if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
+ atomic_dec(&cmci_storm_on_cpus);
+
+ per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
+}
+
+static void cmci_toggle_interrupt_mode(bool on)
+{
+ unsigned long flags, *owned;
+ int bank;
+ u64 val;
+
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ owned = this_cpu_ptr(mce_banks_owned);
+ for_each_set_bit(bank, owned, MAX_NR_BANKS) {
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+
+ if (on)
+ val |= MCI_CTL2_CMCI_EN;
+ else
+ val &= ~MCI_CTL2_CMCI_EN;
+
+ wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ }
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
+unsigned long cmci_intel_adjust_timer(unsigned long interval)
+{
+ if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
+ (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
+ mce_notify_irq();
+ return CMCI_STORM_INTERVAL;
+ }
+
+ switch (__this_cpu_read(cmci_storm_state)) {
+ case CMCI_STORM_ACTIVE:
+
+ /*
+ * We switch back to interrupt mode once the poll timer has
+ * silenced itself. That means no events recorded and the timer
+ * interval is back to our poll interval.
+ */
+ __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
+ if (!atomic_sub_return(1, &cmci_storm_on_cpus))
+ pr_notice("CMCI storm subsided: switching to interrupt mode\n");
+
+ /* FALLTHROUGH */
+
+ case CMCI_STORM_SUBSIDED:
+ /*
+ * We wait for all CPUs to go back to SUBSIDED state. When that
+ * happens we switch back to interrupt mode.
+ */
+ if (!atomic_read(&cmci_storm_on_cpus)) {
+ __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
+ cmci_toggle_interrupt_mode(true);
+ cmci_recheck();
+ }
+ return CMCI_POLL_INTERVAL;
+ default:
+
+ /* We have shiny weather. Let the poll do whatever it thinks. */
+ return interval;
+ }
+}
+
+static bool cmci_storm_detect(void)
+{
+ unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
+ unsigned long ts = __this_cpu_read(cmci_time_stamp);
+ unsigned long now = jiffies;
+ int r;
+
+ if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
+ return true;
+
+ if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
+ cnt++;
+ } else {
+ cnt = 1;
+ __this_cpu_write(cmci_time_stamp, now);
+ }
+ __this_cpu_write(cmci_storm_cnt, cnt);
+
+ if (cnt <= CMCI_STORM_THRESHOLD)
+ return false;
+
+ cmci_toggle_interrupt_mode(false);
+ __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
+ r = atomic_add_return(1, &cmci_storm_on_cpus);
+ mce_timer_kick(CMCI_STORM_INTERVAL);
+ this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
+
+ if (r == 1)
+ pr_notice("CMCI storm detected: switching to poll mode\n");
+ return true;
+}
+
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+ if (cmci_storm_detect())
+ return;
+
+ machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+}
+
+/*
+ * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
+ * on this CPU. Use the algorithm recommended in the SDM to discover shared
+ * banks.
+ */
+static void cmci_discover(int banks)
+{
+ unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
+ unsigned long flags;
+ int i;
+ int bios_wrong_thresh = 0;
+
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ for (i = 0; i < banks; i++) {
+ u64 val;
+ int bios_zero_thresh = 0;
+
+ if (test_bit(i, owned))
+ continue;
+
+ /* Skip banks in firmware first mode */
+ if (test_bit(i, mce_banks_ce_disabled))
+ continue;
+
+ rdmsrl(MSR_IA32_MCx_CTL2(i), val);
+
+ /* Already owned by someone else? */
+ if (val & MCI_CTL2_CMCI_EN) {
+ clear_bit(i, owned);
+ __clear_bit(i, this_cpu_ptr(mce_poll_banks));
+ continue;
+ }
+
+ if (!mca_cfg.bios_cmci_threshold) {
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ val |= CMCI_THRESHOLD;
+ } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
+ /*
+ * If bios_cmci_threshold boot option was specified
+ * but the threshold is zero, we'll try to initialize
+ * it to 1.
+ */
+ bios_zero_thresh = 1;
+ val |= CMCI_THRESHOLD;
+ }
+
+ val |= MCI_CTL2_CMCI_EN;
+ wrmsrl(MSR_IA32_MCx_CTL2(i), val);
+ rdmsrl(MSR_IA32_MCx_CTL2(i), val);
+
+ /* Did the enable bit stick? -- the bank supports CMCI */
+ if (val & MCI_CTL2_CMCI_EN) {
+ set_bit(i, owned);
+ __clear_bit(i, this_cpu_ptr(mce_poll_banks));
+ /*
+ * We are able to set thresholds for some banks that
+ * had a threshold of 0. This means the BIOS has not
+ * set the thresholds properly or does not work with
+ * this boot option. Note down now and report later.
+ */
+ if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
+ (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
+ bios_wrong_thresh = 1;
+ } else {
+ WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks)));
+ }
+ }
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
+ pr_info_once(
+ "bios_cmci_threshold: Some banks do not have valid thresholds set\n");
+ pr_info_once(
+ "bios_cmci_threshold: Make sure your BIOS supports this boot option\n");
+ }
+}
+
+/*
+ * Just in case we missed an event during initialization check
+ * all the CMCI owned banks.
+ */
+void cmci_recheck(void)
+{
+ unsigned long flags;
+ int banks;
+
+ if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
+ return;
+
+ local_irq_save(flags);
+ machine_check_poll(0, this_cpu_ptr(&mce_banks_owned));
+ local_irq_restore(flags);
+}
+
+/* Caller must hold the lock on cmci_discover_lock */
+static void __cmci_disable_bank(int bank)
+{
+ u64 val;
+
+ if (!test_bit(bank, this_cpu_ptr(mce_banks_owned)))
+ return;
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ val &= ~MCI_CTL2_CMCI_EN;
+ wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ __clear_bit(bank, this_cpu_ptr(mce_banks_owned));
+}
+
+/*
+ * Disable CMCI on this CPU for all banks it owns when it goes down.
+ * This allows other CPUs to claim the banks on rediscovery.
+ */
+void cmci_clear(void)
+{
+ unsigned long flags;
+ int i;
+ int banks;
+
+ if (!cmci_supported(&banks))
+ return;
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ for (i = 0; i < banks; i++)
+ __cmci_disable_bank(i);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
+static void cmci_rediscover_work_func(void *arg)
+{
+ int banks;
+
+ /* Recheck banks in case CPUs don't all have the same */
+ if (cmci_supported(&banks))
+ cmci_discover(banks);
+}
+
+/* After a CPU went down cycle through all the others and rediscover */
+void cmci_rediscover(void)
+{
+ int banks;
+
+ if (!cmci_supported(&banks))
+ return;
+
+ on_each_cpu(cmci_rediscover_work_func, NULL, 1);
+}
+
+/*
+ * Reenable CMCI on this CPU in case a CPU down failed.
+ */
+void cmci_reenable(void)
+{
+ int banks;
+ if (cmci_supported(&banks))
+ cmci_discover(banks);
+}
+
+void cmci_disable_bank(int bank)
+{
+ int banks;
+ unsigned long flags;
+
+ if (!cmci_supported(&banks))
+ return;
+
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ __cmci_disable_bank(bank);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
+static void intel_init_cmci(void)
+{
+ int banks;
+
+ if (!cmci_supported(&banks))
+ return;
+
+ mce_threshold_vector = intel_threshold_interrupt;
+ cmci_discover(banks);
+ /*
+ * For CPU #0 this runs with still disabled APIC, but that's
+ * ok because only the vector is set up. We still do another
+ * check for the banks later for CPU #0 just to make sure
+ * to not miss any events.
+ */
+ apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
+ cmci_recheck();
+}
+
+static void intel_init_lmce(void)
+{
+ u64 val;
+
+ if (!lmce_supported())
+ return;
+
+ rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
+
+ if (!(val & MCG_EXT_CTL_LMCE_EN))
+ wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
+}
+
+static void intel_clear_lmce(void)
+{
+ u64 val;
+
+ if (!lmce_supported())
+ return;
+
+ rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
+ val &= ~MCG_EXT_CTL_LMCE_EN;
+ wrmsrl(MSR_IA32_MCG_EXT_CTL, val);
+}
+
+static void intel_ppin_init(struct cpuinfo_x86 *c)
+{
+ unsigned long long val;
+
+ /*
+ * Even if testing the presence of the MSR would be enough, we don't
+ * want to risk the situation where other models reuse this MSR for
+ * other purposes.
+ */
+ switch (c->x86_model) {
+ case INTEL_FAM6_IVYBRIDGE_X:
+ case INTEL_FAM6_HASWELL_X:
+ case INTEL_FAM6_BROADWELL_XEON_D:
+ case INTEL_FAM6_BROADWELL_X:
+ case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_FAM6_XEON_PHI_KNL:
+ case INTEL_FAM6_XEON_PHI_KNM:
+
+ if (rdmsrl_safe(MSR_PPIN_CTL, &val))
+ return;
+
+ if ((val & 3UL) == 1UL) {
+ /* PPIN locked in disabled mode */
+ return;
+ }
+
+ /* If PPIN is disabled, try to enable */
+ if (!(val & 2UL)) {
+ wrmsrl_safe(MSR_PPIN_CTL, val | 2UL);
+ rdmsrl_safe(MSR_PPIN_CTL, &val);
+ }
+
+ /* Is the enable bit set? */
+ if (val & 2UL)
+ set_cpu_cap(c, X86_FEATURE_INTEL_PPIN);
+ }
+}
+
+void mce_intel_feature_init(struct cpuinfo_x86 *c)
+{
+ intel_init_thermal(c);
+ intel_init_cmci();
+ intel_init_lmce();
+ intel_ppin_init(c);
+}
+
+void mce_intel_feature_clear(struct cpuinfo_x86 *c)
+{
+ intel_clear_lmce();
+}
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
new file mode 100644
index 000000000..5cddf8317
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * P5 specific Machine Check Exception Reporting
+ * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/smp.h>
+
+#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+
+/* By default disabled */
+int mce_p5_enabled __read_mostly;
+
+/* Machine check handler for Pentium class Intel CPUs: */
+static void pentium_machine_check(struct pt_regs *regs, long error_code)
+{
+ u32 loaddr, hi, lotype;
+
+ ist_enter(regs);
+
+ rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
+ rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
+
+ pr_emerg("CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
+ smp_processor_id(), loaddr, lotype);
+
+ if (lotype & (1<<5)) {
+ pr_emerg("CPU#%d: Possible thermal failure (CPU on fire ?).\n",
+ smp_processor_id());
+ }
+
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+ ist_exit(regs);
+}
+
+/* Set up machine check reporting for processors with Intel style MCE: */
+void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
+{
+ u32 l, h;
+
+ /* Default P5 to off as its often misconnected: */
+ if (!mce_p5_enabled)
+ return;
+
+ /* Check for MCE support: */
+ if (!cpu_has(c, X86_FEATURE_MCE))
+ return;
+
+ machine_check_vector = pentium_machine_check;
+ /* Make sure the vector pointer is visible before we enable MCEs: */
+ wmb();
+
+ /* Read registers before enabling: */
+ rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
+ rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
+ pr_info("Intel old style machine check architecture supported.\n");
+
+ /* Enable MCE: */
+ cr4_set_bits(X86_CR4_MCE);
+ pr_info("Intel old style machine check reporting enabled on CPU#%d.\n",
+ smp_processor_id());
+}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
new file mode 100644
index 000000000..ec6a07b04
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -0,0 +1,521 @@
+/*
+ * Thermal throttle event support code (such as syslog messaging and rate
+ * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
+ *
+ * This allows consistent reporting of CPU thermal throttle events.
+ *
+ * Maintains a counter in /sys that keeps track of the number of thermal
+ * events, such that the user knows how bad the thermal problem might be
+ * (since the logging to syslog is rate limited).
+ *
+ * Author: Dmitriy Zavin (dmitriyz@google.com)
+ *
+ * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
+ * Inspired by Ross Biro's and Al Borchers' counter code.
+ */
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+
+#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/apic.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/trace/irq_vectors.h>
+
+/* How long to wait between reporting thermal events */
+#define CHECK_INTERVAL (300 * HZ)
+
+#define THERMAL_THROTTLING_EVENT 0
+#define POWER_LIMIT_EVENT 1
+
+/*
+ * Current thermal event state:
+ */
+struct _thermal_state {
+ bool new_event;
+ int event;
+ u64 next_check;
+ unsigned long count;
+ unsigned long last_count;
+};
+
+struct thermal_state {
+ struct _thermal_state core_throttle;
+ struct _thermal_state core_power_limit;
+ struct _thermal_state package_throttle;
+ struct _thermal_state package_power_limit;
+ struct _thermal_state core_thresh0;
+ struct _thermal_state core_thresh1;
+ struct _thermal_state pkg_thresh0;
+ struct _thermal_state pkg_thresh1;
+};
+
+/* Callback to handle core threshold interrupts */
+int (*platform_thermal_notify)(__u64 msr_val);
+EXPORT_SYMBOL(platform_thermal_notify);
+
+/* Callback to handle core package threshold_interrupts */
+int (*platform_thermal_package_notify)(__u64 msr_val);
+EXPORT_SYMBOL_GPL(platform_thermal_package_notify);
+
+/* Callback support of rate control, return true, if
+ * callback has rate control */
+bool (*platform_thermal_package_rate_control)(void);
+EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control);
+
+
+static DEFINE_PER_CPU(struct thermal_state, thermal_state);
+
+static atomic_t therm_throt_en = ATOMIC_INIT(0);
+
+static u32 lvtthmr_init __read_mostly;
+
+#ifdef CONFIG_SYSFS
+#define define_therm_throt_device_one_ro(_name) \
+ static DEVICE_ATTR(_name, 0444, \
+ therm_throt_device_show_##_name, \
+ NULL) \
+
+#define define_therm_throt_device_show_func(event, name) \
+ \
+static ssize_t therm_throt_device_show_##event##_##name( \
+ struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+{ \
+ unsigned int cpu = dev->id; \
+ ssize_t ret; \
+ \
+ preempt_disable(); /* CPU hotplug */ \
+ if (cpu_online(cpu)) { \
+ ret = sprintf(buf, "%lu\n", \
+ per_cpu(thermal_state, cpu).event.name); \
+ } else \
+ ret = 0; \
+ preempt_enable(); \
+ \
+ return ret; \
+}
+
+define_therm_throt_device_show_func(core_throttle, count);
+define_therm_throt_device_one_ro(core_throttle_count);
+
+define_therm_throt_device_show_func(core_power_limit, count);
+define_therm_throt_device_one_ro(core_power_limit_count);
+
+define_therm_throt_device_show_func(package_throttle, count);
+define_therm_throt_device_one_ro(package_throttle_count);
+
+define_therm_throt_device_show_func(package_power_limit, count);
+define_therm_throt_device_one_ro(package_power_limit_count);
+
+static struct attribute *thermal_throttle_attrs[] = {
+ &dev_attr_core_throttle_count.attr,
+ NULL
+};
+
+static const struct attribute_group thermal_attr_group = {
+ .attrs = thermal_throttle_attrs,
+ .name = "thermal_throttle"
+};
+#endif /* CONFIG_SYSFS */
+
+#define CORE_LEVEL 0
+#define PACKAGE_LEVEL 1
+
+/***
+ * therm_throt_process - Process thermal throttling event from interrupt
+ * @curr: Whether the condition is current or not (boolean), since the
+ * thermal interrupt normally gets called both when the thermal
+ * event begins and once the event has ended.
+ *
+ * This function is called by the thermal interrupt after the
+ * IRQ has been acknowledged.
+ *
+ * It will take care of rate limiting and printing messages to the syslog.
+ */
+static void therm_throt_process(bool new_event, int event, int level)
+{
+ struct _thermal_state *state;
+ unsigned int this_cpu = smp_processor_id();
+ bool old_event;
+ u64 now;
+ struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
+
+ now = get_jiffies_64();
+ if (level == CORE_LEVEL) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ state = &pstate->core_throttle;
+ else if (event == POWER_LIMIT_EVENT)
+ state = &pstate->core_power_limit;
+ else
+ return;
+ } else if (level == PACKAGE_LEVEL) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ state = &pstate->package_throttle;
+ else if (event == POWER_LIMIT_EVENT)
+ state = &pstate->package_power_limit;
+ else
+ return;
+ } else
+ return;
+
+ old_event = state->new_event;
+ state->new_event = new_event;
+
+ if (new_event)
+ state->count++;
+
+ if (time_before64(now, state->next_check) &&
+ state->count != state->last_count)
+ return;
+
+ state->next_check = now + CHECK_INTERVAL;
+ state->last_count = state->count;
+
+ /* if we just entered the thermal event */
+ if (new_event) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ pr_warn("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package",
+ state->count);
+ return;
+ }
+ if (old_event) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ pr_info("CPU%d: %s temperature/speed normal\n", this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package");
+ return;
+ }
+}
+
+static int thresh_event_valid(int level, int event)
+{
+ struct _thermal_state *state;
+ unsigned int this_cpu = smp_processor_id();
+ struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
+ u64 now = get_jiffies_64();
+
+ if (level == PACKAGE_LEVEL)
+ state = (event == 0) ? &pstate->pkg_thresh0 :
+ &pstate->pkg_thresh1;
+ else
+ state = (event == 0) ? &pstate->core_thresh0 :
+ &pstate->core_thresh1;
+
+ if (time_before64(now, state->next_check))
+ return 0;
+
+ state->next_check = now + CHECK_INTERVAL;
+
+ return 1;
+}
+
+static bool int_pln_enable;
+static int __init int_pln_enable_setup(char *s)
+{
+ int_pln_enable = true;
+
+ return 1;
+}
+__setup("int_pln_enable", int_pln_enable_setup);
+
+#ifdef CONFIG_SYSFS
+/* Add/Remove thermal_throttle interface for CPU device: */
+static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu)
+{
+ int err;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
+ if (err)
+ return err;
+
+ if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_core_power_limit_count.attr,
+ thermal_attr_group.name);
+ if (cpu_has(c, X86_FEATURE_PTS)) {
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_package_throttle_count.attr,
+ thermal_attr_group.name);
+ if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_package_power_limit_count.attr,
+ thermal_attr_group.name);
+ }
+
+ return err;
+}
+
+static void thermal_throttle_remove_dev(struct device *dev)
+{
+ sysfs_remove_group(&dev->kobj, &thermal_attr_group);
+}
+
+/* Get notified when a cpu comes on/off. Be hotplug friendly. */
+static int thermal_throttle_online(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ return thermal_throttle_add_dev(dev, cpu);
+}
+
+static int thermal_throttle_offline(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ thermal_throttle_remove_dev(dev);
+ return 0;
+}
+
+static __init int thermal_throttle_init_device(void)
+{
+ int ret;
+
+ if (!atomic_read(&therm_throt_en))
+ return 0;
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online",
+ thermal_throttle_online,
+ thermal_throttle_offline);
+ return ret < 0 ? ret : 0;
+}
+device_initcall(thermal_throttle_init_device);
+
+#endif /* CONFIG_SYSFS */
+
+static void notify_package_thresholds(__u64 msr_val)
+{
+ bool notify_thres_0 = false;
+ bool notify_thres_1 = false;
+
+ if (!platform_thermal_package_notify)
+ return;
+
+ /* lower threshold check */
+ if (msr_val & THERM_LOG_THRESHOLD0)
+ notify_thres_0 = true;
+ /* higher threshold check */
+ if (msr_val & THERM_LOG_THRESHOLD1)
+ notify_thres_1 = true;
+
+ if (!notify_thres_0 && !notify_thres_1)
+ return;
+
+ if (platform_thermal_package_rate_control &&
+ platform_thermal_package_rate_control()) {
+ /* Rate control is implemented in callback */
+ platform_thermal_package_notify(msr_val);
+ return;
+ }
+
+ /* lower threshold reached */
+ if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0))
+ platform_thermal_package_notify(msr_val);
+ /* higher threshold reached */
+ if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1))
+ platform_thermal_package_notify(msr_val);
+}
+
+static void notify_thresholds(__u64 msr_val)
+{
+ /* check whether the interrupt handler is defined;
+ * otherwise simply return
+ */
+ if (!platform_thermal_notify)
+ return;
+
+ /* lower threshold reached */
+ if ((msr_val & THERM_LOG_THRESHOLD0) &&
+ thresh_event_valid(CORE_LEVEL, 0))
+ platform_thermal_notify(msr_val);
+ /* higher threshold reached */
+ if ((msr_val & THERM_LOG_THRESHOLD1) &&
+ thresh_event_valid(CORE_LEVEL, 1))
+ platform_thermal_notify(msr_val);
+}
+
+/* Thermal transition interrupt handler */
+static void intel_thermal_interrupt(void)
+{
+ __u64 msr_val;
+
+ if (static_cpu_has(X86_FEATURE_HWP))
+ wrmsrl_safe(MSR_HWP_STATUS, 0);
+
+ rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
+
+ /* Check for violation of core thermal thresholds*/
+ notify_thresholds(msr_val);
+
+ therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
+ THERMAL_THROTTLING_EVENT,
+ CORE_LEVEL);
+
+ if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
+ therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
+ POWER_LIMIT_EVENT,
+ CORE_LEVEL);
+
+ if (this_cpu_has(X86_FEATURE_PTS)) {
+ rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
+ /* check violations of package thermal thresholds */
+ notify_package_thresholds(msr_val);
+ therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
+ THERMAL_THROTTLING_EVENT,
+ PACKAGE_LEVEL);
+ if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
+ therm_throt_process(msr_val &
+ PACKAGE_THERM_STATUS_POWER_LIMIT,
+ POWER_LIMIT_EVENT,
+ PACKAGE_LEVEL);
+ }
+}
+
+static void unexpected_thermal_interrupt(void)
+{
+ pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
+ smp_processor_id());
+}
+
+static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
+
+asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *regs)
+{
+ entering_irq();
+ trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
+ inc_irq_stat(irq_thermal_count);
+ smp_thermal_vector();
+ trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
+ exiting_ack_irq();
+}
+
+/* Thermal monitoring depends on APIC, ACPI and clock modulation */
+static int intel_thermal_supported(struct cpuinfo_x86 *c)
+{
+ if (!boot_cpu_has(X86_FEATURE_APIC))
+ return 0;
+ if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
+ return 0;
+ return 1;
+}
+
+void __init mcheck_intel_therm_init(void)
+{
+ /*
+ * This function is only called on boot CPU. Save the init thermal
+ * LVT value on BSP and use that value to restore APs' thermal LVT
+ * entry BIOS programmed later
+ */
+ if (intel_thermal_supported(&boot_cpu_data))
+ lvtthmr_init = apic_read(APIC_LVTTHMR);
+}
+
+void intel_init_thermal(struct cpuinfo_x86 *c)
+{
+ unsigned int cpu = smp_processor_id();
+ int tm2 = 0;
+ u32 l, h;
+
+ if (!intel_thermal_supported(c))
+ return;
+
+ /*
+ * First check if its enabled already, in which case there might
+ * be some SMM goo which handles it, so we can't even put a handler
+ * since it might be delivered via SMI already:
+ */
+ rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+
+ h = lvtthmr_init;
+ /*
+ * The initial value of thermal LVT entries on all APs always reads
+ * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
+ * sequence to them and LVT registers are reset to 0s except for
+ * the mask bits which are set to 1s when APs receive INIT IPI.
+ * If BIOS takes over the thermal interrupt and sets its interrupt
+ * delivery mode to SMI (not fixed), it restores the value that the
+ * BIOS has programmed on AP based on BSP's info we saved since BIOS
+ * is always setting the same value for all threads/cores.
+ */
+ if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
+ apic_write(APIC_LVTTHMR, lvtthmr_init);
+
+
+ if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
+ if (system_state == SYSTEM_BOOTING)
+ pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu);
+ return;
+ }
+
+ /* early Pentium M models use different method for enabling TM2 */
+ if (cpu_has(c, X86_FEATURE_TM2)) {
+ if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
+ rdmsr(MSR_THERM2_CTL, l, h);
+ if (l & MSR_THERM2_CTL_TM_SELECT)
+ tm2 = 1;
+ } else if (l & MSR_IA32_MISC_ENABLE_TM2)
+ tm2 = 1;
+ }
+
+ /* We'll mask the thermal vector in the lapic till we're ready: */
+ h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
+ apic_write(APIC_LVTTHMR, h);
+
+ rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
+ if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
+ wrmsr(MSR_IA32_THERM_INTERRUPT,
+ (l | (THERM_INT_LOW_ENABLE
+ | THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h);
+ else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+ wrmsr(MSR_IA32_THERM_INTERRUPT,
+ l | (THERM_INT_LOW_ENABLE
+ | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
+ else
+ wrmsr(MSR_IA32_THERM_INTERRUPT,
+ l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
+
+ if (cpu_has(c, X86_FEATURE_PTS)) {
+ rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
+ if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
+ wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+ (l | (PACKAGE_THERM_INT_LOW_ENABLE
+ | PACKAGE_THERM_INT_HIGH_ENABLE))
+ & ~PACKAGE_THERM_INT_PLN_ENABLE, h);
+ else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
+ wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+ l | (PACKAGE_THERM_INT_LOW_ENABLE
+ | PACKAGE_THERM_INT_HIGH_ENABLE
+ | PACKAGE_THERM_INT_PLN_ENABLE), h);
+ else
+ wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+ l | (PACKAGE_THERM_INT_LOW_ENABLE
+ | PACKAGE_THERM_INT_HIGH_ENABLE), h);
+ }
+
+ smp_thermal_vector = intel_thermal_interrupt;
+
+ rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+ wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
+
+ /* Unmask the thermal vector: */
+ l = apic_read(APIC_LVTTHMR);
+ apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+
+ pr_info_once("CPU0: Thermal monitoring enabled (%s)\n",
+ tm2 ? "TM2" : "TM1");
+
+ /* enable thermal throttle processing */
+ atomic_set(&therm_throt_en, 1);
+}
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
new file mode 100644
index 000000000..c21e0a1ef
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common corrected MCE threshold handler code:
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+
+#include <asm/irq_vectors.h>
+#include <asm/traps.h>
+#include <asm/apic.h>
+#include <asm/mce.h>
+#include <asm/trace/irq_vectors.h>
+
+static void default_threshold_interrupt(void)
+{
+ pr_err("Unexpected threshold interrupt at vector %x\n",
+ THRESHOLD_APIC_VECTOR);
+}
+
+void (*mce_threshold_vector)(void) = default_threshold_interrupt;
+
+asmlinkage __visible void __irq_entry smp_threshold_interrupt(struct pt_regs *regs)
+{
+ entering_irq();
+ trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
+ inc_irq_stat(irq_threshold_count);
+ mce_threshold_vector();
+ trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
+ exiting_ack_irq();
+}
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
new file mode 100644
index 000000000..3b45b270a
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * IDT Winchip specific Machine Check Exception Reporting
+ * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+
+/* Machine check handler for WinChip C6: */
+static void winchip_machine_check(struct pt_regs *regs, long error_code)
+{
+ ist_enter(regs);
+
+ pr_emerg("CPU0: Machine Check Exception.\n");
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+ ist_exit(regs);
+}
+
+/* Set up machine check reporting on the Winchip C6 series */
+void winchip_mcheck_init(struct cpuinfo_x86 *c)
+{
+ u32 lo, hi;
+
+ machine_check_vector = winchip_machine_check;
+ /* Make sure the vector pointer is visible before we enable MCEs: */
+ wmb();
+
+ rdmsr(MSR_IDT_FCR1, lo, hi);
+ lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */
+ lo &= ~(1<<4); /* Enable MCE */
+ wrmsr(MSR_IDT_FCR1, lo, hi);
+
+ cr4_set_bits(X86_CR4_MCE);
+
+ pr_info("Winchip machine check reporting enabled on CPU#0.\n");
+}
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile
new file mode 100644
index 000000000..ba12e8aa4
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/Makefile
@@ -0,0 +1,4 @@
+microcode-y := core.o
+obj-$(CONFIG_MICROCODE) += microcode.o
+microcode-$(CONFIG_MICROCODE_INTEL) += intel.o
+microcode-$(CONFIG_MICROCODE_AMD) += amd.o
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
new file mode 100644
index 000000000..a4e7e100e
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -0,0 +1,818 @@
+/*
+ * AMD CPU Microcode Update Driver for Linux
+ *
+ * This driver allows to upgrade microcode on F10h AMD
+ * CPUs and later.
+ *
+ * Copyright (C) 2008-2011 Advanced Micro Devices Inc.
+ * 2013-2016 Borislav Petkov <bp@alien8.de>
+ *
+ * Author: Peter Oruba <peter.oruba@amd.com>
+ *
+ * Based on work by:
+ * Tigran Aivazian <aivazian.tigran@gmail.com>
+ *
+ * early loader:
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Jacob Shin <jacob.shin@amd.com>
+ * Fixes: Borislav Petkov <bp@suse.de>
+ *
+ * Licensed under the terms of the GNU General Public
+ * License version 2. See file COPYING for details.
+ */
+#define pr_fmt(fmt) "microcode: " fmt
+
+#include <linux/earlycpio.h>
+#include <linux/firmware.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/initrd.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+
+#include <asm/microcode_amd.h>
+#include <asm/microcode.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/cpu.h>
+#include <asm/msr.h>
+
+static struct equiv_cpu_entry *equiv_cpu_table;
+
+/*
+ * This points to the current valid container of microcode patches which we will
+ * save from the initrd/builtin before jettisoning its contents. @mc is the
+ * microcode patch we found to match.
+ */
+struct cont_desc {
+ struct microcode_amd *mc;
+ u32 cpuid_1_eax;
+ u32 psize;
+ u8 *data;
+ size_t size;
+};
+
+static u32 ucode_new_rev;
+static u8 amd_ucode_patch[PATCH_MAX_SIZE];
+
+/*
+ * Microcode patch container file is prepended to the initrd in cpio
+ * format. See Documentation/x86/microcode.txt
+ */
+static const char
+ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin";
+
+static u16 find_equiv_id(struct equiv_cpu_entry *equiv_table, u32 sig)
+{
+ for (; equiv_table && equiv_table->installed_cpu; equiv_table++) {
+ if (sig == equiv_table->installed_cpu)
+ return equiv_table->equiv_cpu;
+ }
+
+ return 0;
+}
+
+/*
+ * This scans the ucode blob for the proper container as we can have multiple
+ * containers glued together. Returns the equivalence ID from the equivalence
+ * table or 0 if none found.
+ * Returns the amount of bytes consumed while scanning. @desc contains all the
+ * data we're going to use in later stages of the application.
+ */
+static ssize_t parse_container(u8 *ucode, ssize_t size, struct cont_desc *desc)
+{
+ struct equiv_cpu_entry *eq;
+ ssize_t orig_size = size;
+ u32 *hdr = (u32 *)ucode;
+ u16 eq_id;
+ u8 *buf;
+
+ /* Am I looking at an equivalence table header? */
+ if (hdr[0] != UCODE_MAGIC ||
+ hdr[1] != UCODE_EQUIV_CPU_TABLE_TYPE ||
+ hdr[2] == 0)
+ return CONTAINER_HDR_SZ;
+
+ buf = ucode;
+
+ eq = (struct equiv_cpu_entry *)(buf + CONTAINER_HDR_SZ);
+
+ /* Find the equivalence ID of our CPU in this table: */
+ eq_id = find_equiv_id(eq, desc->cpuid_1_eax);
+
+ buf += hdr[2] + CONTAINER_HDR_SZ;
+ size -= hdr[2] + CONTAINER_HDR_SZ;
+
+ /*
+ * Scan through the rest of the container to find where it ends. We do
+ * some basic sanity-checking too.
+ */
+ while (size > 0) {
+ struct microcode_amd *mc;
+ u32 patch_size;
+
+ hdr = (u32 *)buf;
+
+ if (hdr[0] != UCODE_UCODE_TYPE)
+ break;
+
+ /* Sanity-check patch size. */
+ patch_size = hdr[1];
+ if (patch_size > PATCH_MAX_SIZE)
+ break;
+
+ /* Skip patch section header: */
+ buf += SECTION_HDR_SIZE;
+ size -= SECTION_HDR_SIZE;
+
+ mc = (struct microcode_amd *)buf;
+ if (eq_id == mc->hdr.processor_rev_id) {
+ desc->psize = patch_size;
+ desc->mc = mc;
+ }
+
+ buf += patch_size;
+ size -= patch_size;
+ }
+
+ /*
+ * If we have found a patch (desc->mc), it means we're looking at the
+ * container which has a patch for this CPU so return 0 to mean, @ucode
+ * already points to the proper container. Otherwise, we return the size
+ * we scanned so that we can advance to the next container in the
+ * buffer.
+ */
+ if (desc->mc) {
+ desc->data = ucode;
+ desc->size = orig_size - size;
+
+ return 0;
+ }
+
+ return orig_size - size;
+}
+
+/*
+ * Scan the ucode blob for the proper container as we can have multiple
+ * containers glued together.
+ */
+static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc)
+{
+ ssize_t rem = size;
+
+ while (rem >= 0) {
+ ssize_t s = parse_container(ucode, rem, desc);
+ if (!s)
+ return;
+
+ ucode += s;
+ rem -= s;
+ }
+}
+
+static int __apply_microcode_amd(struct microcode_amd *mc)
+{
+ u32 rev, dummy;
+
+ native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc->hdr.data_code);
+
+ /* verify patch application was successful */
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+ if (rev != mc->hdr.patch_id)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Early load occurs before we can vmalloc(). So we look for the microcode
+ * patch container file in initrd, traverse equivalent cpu table, look for a
+ * matching microcode patch, and update, all in initrd memory in place.
+ * When vmalloc() is available for use later -- on 64-bit during first AP load,
+ * and on 32-bit during save_microcode_in_initrd_amd() -- we can call
+ * load_microcode_amd() to save equivalent cpu table and microcode patches in
+ * kernel heap memory.
+ *
+ * Returns true if container found (sets @desc), false otherwise.
+ */
+static bool
+apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch)
+{
+ struct cont_desc desc = { 0 };
+ u8 (*patch)[PATCH_MAX_SIZE];
+ struct microcode_amd *mc;
+ u32 rev, dummy, *new_rev;
+ bool ret = false;
+
+#ifdef CONFIG_X86_32
+ new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
+ patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
+#else
+ new_rev = &ucode_new_rev;
+ patch = &amd_ucode_patch;
+#endif
+
+ desc.cpuid_1_eax = cpuid_1_eax;
+
+ scan_containers(ucode, size, &desc);
+
+ mc = desc.mc;
+ if (!mc)
+ return ret;
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+ if (rev >= mc->hdr.patch_id)
+ return ret;
+
+ if (!__apply_microcode_amd(mc)) {
+ *new_rev = mc->hdr.patch_id;
+ ret = true;
+
+ if (save_patch)
+ memcpy(patch, mc, min_t(u32, desc.psize, PATCH_MAX_SIZE));
+ }
+
+ return ret;
+}
+
+static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
+{
+#ifdef CONFIG_X86_64
+ char fw_name[36] = "amd-ucode/microcode_amd.bin";
+
+ if (family >= 0x15)
+ snprintf(fw_name, sizeof(fw_name),
+ "amd-ucode/microcode_amd_fam%.2xh.bin", family);
+
+ return get_builtin_firmware(cp, fw_name);
+#else
+ return false;
+#endif
+}
+
+static void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret)
+{
+ struct ucode_cpu_info *uci;
+ struct cpio_data cp;
+ const char *path;
+ bool use_pa;
+
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ uci = (struct ucode_cpu_info *)__pa_nodebug(ucode_cpu_info);
+ path = (const char *)__pa_nodebug(ucode_path);
+ use_pa = true;
+ } else {
+ uci = ucode_cpu_info;
+ path = ucode_path;
+ use_pa = false;
+ }
+
+ if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax)))
+ cp = find_microcode_in_initrd(path, use_pa);
+
+ /* Needed in load_microcode_amd() */
+ uci->cpu_sig.sig = cpuid_1_eax;
+
+ *ret = cp;
+}
+
+void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax)
+{
+ struct cpio_data cp = { };
+
+ __load_ucode_amd(cpuid_1_eax, &cp);
+ if (!(cp.data && cp.size))
+ return;
+
+ apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true);
+}
+
+void load_ucode_amd_ap(unsigned int cpuid_1_eax)
+{
+ struct microcode_amd *mc;
+ struct cpio_data cp;
+ u32 *new_rev, rev, dummy;
+
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch);
+ new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
+ } else {
+ mc = (struct microcode_amd *)amd_ucode_patch;
+ new_rev = &ucode_new_rev;
+ }
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+
+ /* Check whether we have saved a new patch already: */
+ if (*new_rev && rev < mc->hdr.patch_id) {
+ if (!__apply_microcode_amd(mc)) {
+ *new_rev = mc->hdr.patch_id;
+ return;
+ }
+ }
+
+ __load_ucode_amd(cpuid_1_eax, &cp);
+ if (!(cp.data && cp.size))
+ return;
+
+ apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false);
+}
+
+static enum ucode_state
+load_microcode_amd(bool save, u8 family, const u8 *data, size_t size);
+
+int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
+{
+ struct cont_desc desc = { 0 };
+ enum ucode_state ret;
+ struct cpio_data cp;
+
+ cp = find_microcode_in_initrd(ucode_path, false);
+ if (!(cp.data && cp.size))
+ return -EINVAL;
+
+ desc.cpuid_1_eax = cpuid_1_eax;
+
+ scan_containers(cp.data, cp.size, &desc);
+ if (!desc.mc)
+ return -EINVAL;
+
+ ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size);
+ if (ret > UCODE_UPDATED)
+ return -EINVAL;
+
+ return 0;
+}
+
+void reload_ucode_amd(void)
+{
+ struct microcode_amd *mc;
+ u32 rev, dummy;
+
+ mc = (struct microcode_amd *)amd_ucode_patch;
+
+ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+
+ if (rev < mc->hdr.patch_id) {
+ if (!__apply_microcode_amd(mc)) {
+ ucode_new_rev = mc->hdr.patch_id;
+ pr_info("reload patch_level=0x%08x\n", ucode_new_rev);
+ }
+ }
+}
+static u16 __find_equiv_id(unsigned int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ return find_equiv_id(equiv_cpu_table, uci->cpu_sig.sig);
+}
+
+static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu)
+{
+ int i = 0;
+
+ BUG_ON(!equiv_cpu_table);
+
+ while (equiv_cpu_table[i].equiv_cpu != 0) {
+ if (equiv_cpu == equiv_cpu_table[i].equiv_cpu)
+ return equiv_cpu_table[i].installed_cpu;
+ i++;
+ }
+ return 0;
+}
+
+/*
+ * a small, trivial cache of per-family ucode patches
+ */
+static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
+{
+ struct ucode_patch *p;
+
+ list_for_each_entry(p, &microcode_cache, plist)
+ if (p->equiv_cpu == equiv_cpu)
+ return p;
+ return NULL;
+}
+
+static void update_cache(struct ucode_patch *new_patch)
+{
+ struct ucode_patch *p;
+
+ list_for_each_entry(p, &microcode_cache, plist) {
+ if (p->equiv_cpu == new_patch->equiv_cpu) {
+ if (p->patch_id >= new_patch->patch_id) {
+ /* we already have the latest patch */
+ kfree(new_patch->data);
+ kfree(new_patch);
+ return;
+ }
+
+ list_replace(&p->plist, &new_patch->plist);
+ kfree(p->data);
+ kfree(p);
+ return;
+ }
+ }
+ /* no patch found, add it */
+ list_add_tail(&new_patch->plist, &microcode_cache);
+}
+
+static void free_cache(void)
+{
+ struct ucode_patch *p, *tmp;
+
+ list_for_each_entry_safe(p, tmp, &microcode_cache, plist) {
+ __list_del(p->plist.prev, p->plist.next);
+ kfree(p->data);
+ kfree(p);
+ }
+}
+
+static struct ucode_patch *find_patch(unsigned int cpu)
+{
+ u16 equiv_id;
+
+ equiv_id = __find_equiv_id(cpu);
+ if (!equiv_id)
+ return NULL;
+
+ return cache_find_patch(equiv_id);
+}
+
+static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct ucode_patch *p;
+
+ csig->sig = cpuid_eax(0x00000001);
+ csig->rev = c->microcode;
+
+ /*
+ * a patch could have been loaded early, set uci->mc so that
+ * mc_bp_resume() can call apply_microcode()
+ */
+ p = find_patch(cpu);
+ if (p && (p->patch_id == csig->rev))
+ uci->mc = p->data;
+
+ pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
+
+ return 0;
+}
+
+static unsigned int verify_patch_size(u8 family, u32 patch_size,
+ unsigned int size)
+{
+ u32 max_size;
+
+#define F1XH_MPB_MAX_SIZE 2048
+#define F14H_MPB_MAX_SIZE 1824
+#define F15H_MPB_MAX_SIZE 4096
+#define F16H_MPB_MAX_SIZE 3458
+#define F17H_MPB_MAX_SIZE 3200
+
+ switch (family) {
+ case 0x14:
+ max_size = F14H_MPB_MAX_SIZE;
+ break;
+ case 0x15:
+ max_size = F15H_MPB_MAX_SIZE;
+ break;
+ case 0x16:
+ max_size = F16H_MPB_MAX_SIZE;
+ break;
+ case 0x17:
+ max_size = F17H_MPB_MAX_SIZE;
+ break;
+ default:
+ max_size = F1XH_MPB_MAX_SIZE;
+ break;
+ }
+
+ if (patch_size > min_t(u32, size, max_size)) {
+ pr_err("patch size mismatch\n");
+ return 0;
+ }
+
+ return patch_size;
+}
+
+static enum ucode_state apply_microcode_amd(int cpu)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct microcode_amd *mc_amd;
+ struct ucode_cpu_info *uci;
+ struct ucode_patch *p;
+ enum ucode_state ret;
+ u32 rev, dummy;
+
+ BUG_ON(raw_smp_processor_id() != cpu);
+
+ uci = ucode_cpu_info + cpu;
+
+ p = find_patch(cpu);
+ if (!p)
+ return UCODE_NFOUND;
+
+ mc_amd = p->data;
+ uci->mc = p->data;
+
+ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+
+ /* need to apply patch? */
+ if (rev >= mc_amd->hdr.patch_id) {
+ ret = UCODE_OK;
+ goto out;
+ }
+
+ if (__apply_microcode_amd(mc_amd)) {
+ pr_err("CPU%d: update failed for patch_level=0x%08x\n",
+ cpu, mc_amd->hdr.patch_id);
+ return UCODE_ERROR;
+ }
+
+ rev = mc_amd->hdr.patch_id;
+ ret = UCODE_UPDATED;
+
+ pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
+
+out:
+ uci->cpu_sig.rev = rev;
+ c->microcode = rev;
+
+ /* Update boot_cpu_data's revision too, if we're on the BSP: */
+ if (c->cpu_index == boot_cpu_data.cpu_index)
+ boot_cpu_data.microcode = rev;
+
+ return ret;
+}
+
+static int install_equiv_cpu_table(const u8 *buf)
+{
+ unsigned int *ibuf = (unsigned int *)buf;
+ unsigned int type = ibuf[1];
+ unsigned int size = ibuf[2];
+
+ if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
+ pr_err("empty section/"
+ "invalid type field in container file section header\n");
+ return -EINVAL;
+ }
+
+ equiv_cpu_table = vmalloc(size);
+ if (!equiv_cpu_table) {
+ pr_err("failed to allocate equivalent CPU table\n");
+ return -ENOMEM;
+ }
+
+ memcpy(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
+
+ /* add header length */
+ return size + CONTAINER_HDR_SZ;
+}
+
+static void free_equiv_cpu_table(void)
+{
+ vfree(equiv_cpu_table);
+ equiv_cpu_table = NULL;
+}
+
+static void cleanup(void)
+{
+ free_equiv_cpu_table();
+ free_cache();
+}
+
+/*
+ * We return the current size even if some of the checks failed so that
+ * we can skip over the next patch. If we return a negative value, we
+ * signal a grave error like a memory allocation has failed and the
+ * driver cannot continue functioning normally. In such cases, we tear
+ * down everything we've used up so far and exit.
+ */
+static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover)
+{
+ struct microcode_header_amd *mc_hdr;
+ struct ucode_patch *patch;
+ unsigned int patch_size, crnt_size, ret;
+ u32 proc_fam;
+ u16 proc_id;
+
+ patch_size = *(u32 *)(fw + 4);
+ crnt_size = patch_size + SECTION_HDR_SIZE;
+ mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE);
+ proc_id = mc_hdr->processor_rev_id;
+
+ proc_fam = find_cpu_family_by_equiv_cpu(proc_id);
+ if (!proc_fam) {
+ pr_err("No patch family for equiv ID: 0x%04x\n", proc_id);
+ return crnt_size;
+ }
+
+ /* check if patch is for the current family */
+ proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff);
+ if (proc_fam != family)
+ return crnt_size;
+
+ if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
+ pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n",
+ mc_hdr->patch_id);
+ return crnt_size;
+ }
+
+ ret = verify_patch_size(family, patch_size, leftover);
+ if (!ret) {
+ pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id);
+ return crnt_size;
+ }
+
+ patch = kzalloc(sizeof(*patch), GFP_KERNEL);
+ if (!patch) {
+ pr_err("Patch allocation failure.\n");
+ return -EINVAL;
+ }
+
+ patch->data = kmemdup(fw + SECTION_HDR_SIZE, patch_size, GFP_KERNEL);
+ if (!patch->data) {
+ pr_err("Patch data allocation failure.\n");
+ kfree(patch);
+ return -EINVAL;
+ }
+
+ INIT_LIST_HEAD(&patch->plist);
+ patch->patch_id = mc_hdr->patch_id;
+ patch->equiv_cpu = proc_id;
+
+ pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n",
+ __func__, patch->patch_id, proc_id);
+
+ /* ... and add to cache. */
+ update_cache(patch);
+
+ return crnt_size;
+}
+
+static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
+ size_t size)
+{
+ enum ucode_state ret = UCODE_ERROR;
+ unsigned int leftover;
+ u8 *fw = (u8 *)data;
+ int crnt_size = 0;
+ int offset;
+
+ offset = install_equiv_cpu_table(data);
+ if (offset < 0) {
+ pr_err("failed to create equivalent cpu table\n");
+ return ret;
+ }
+ fw += offset;
+ leftover = size - offset;
+
+ if (*(u32 *)fw != UCODE_UCODE_TYPE) {
+ pr_err("invalid type field in container file section header\n");
+ free_equiv_cpu_table();
+ return ret;
+ }
+
+ while (leftover) {
+ crnt_size = verify_and_add_patch(family, fw, leftover);
+ if (crnt_size < 0)
+ return ret;
+
+ fw += crnt_size;
+ leftover -= crnt_size;
+ }
+
+ return UCODE_OK;
+}
+
+static enum ucode_state
+load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
+{
+ struct ucode_patch *p;
+ enum ucode_state ret;
+
+ /* free old equiv table */
+ free_equiv_cpu_table();
+
+ ret = __load_microcode_amd(family, data, size);
+ if (ret != UCODE_OK) {
+ cleanup();
+ return ret;
+ }
+
+ p = find_patch(0);
+ if (!p) {
+ return ret;
+ } else {
+ if (boot_cpu_data.microcode >= p->patch_id)
+ return ret;
+
+ ret = UCODE_NEW;
+ }
+
+ /* save BSP's matching patch for early load */
+ if (!save)
+ return ret;
+
+ memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
+ memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), PATCH_MAX_SIZE));
+
+ return ret;
+}
+
+/*
+ * AMD microcode firmware naming convention, up to family 15h they are in
+ * the legacy file:
+ *
+ * amd-ucode/microcode_amd.bin
+ *
+ * This legacy file is always smaller than 2K in size.
+ *
+ * Beginning with family 15h, they are in family-specific firmware files:
+ *
+ * amd-ucode/microcode_amd_fam15h.bin
+ * amd-ucode/microcode_amd_fam16h.bin
+ * ...
+ *
+ * These might be larger than 2K.
+ */
+static enum ucode_state request_microcode_amd(int cpu, struct device *device,
+ bool refresh_fw)
+{
+ char fw_name[36] = "amd-ucode/microcode_amd.bin";
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ bool bsp = c->cpu_index == boot_cpu_data.cpu_index;
+ enum ucode_state ret = UCODE_NFOUND;
+ const struct firmware *fw;
+
+ /* reload ucode container only on the boot cpu */
+ if (!refresh_fw || !bsp)
+ return UCODE_OK;
+
+ if (c->x86 >= 0x15)
+ snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
+
+ if (request_firmware_direct(&fw, (const char *)fw_name, device)) {
+ pr_debug("failed to load file %s\n", fw_name);
+ goto out;
+ }
+
+ ret = UCODE_ERROR;
+ if (*(u32 *)fw->data != UCODE_MAGIC) {
+ pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data);
+ goto fw_release;
+ }
+
+ ret = load_microcode_amd(bsp, c->x86, fw->data, fw->size);
+
+ fw_release:
+ release_firmware(fw);
+
+ out:
+ return ret;
+}
+
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
+{
+ return UCODE_ERROR;
+}
+
+static void microcode_fini_cpu_amd(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ uci->mc = NULL;
+}
+
+static struct microcode_ops microcode_amd_ops = {
+ .request_microcode_user = request_microcode_user,
+ .request_microcode_fw = request_microcode_amd,
+ .collect_cpu_info = collect_cpu_info_amd,
+ .apply_microcode = apply_microcode_amd,
+ .microcode_fini_cpu = microcode_fini_cpu_amd,
+};
+
+struct microcode_ops * __init init_amd_microcode(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
+ pr_warn("AMD CPU family 0x%x not supported\n", c->x86);
+ return NULL;
+ }
+
+ if (ucode_new_rev)
+ pr_info_once("microcode updated early to new patch_level=0x%08x\n",
+ ucode_new_rev);
+
+ return &microcode_amd_ops;
+}
+
+void __exit exit_amd_microcode(void)
+{
+ cleanup();
+}
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
new file mode 100644
index 000000000..985ef98c8
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -0,0 +1,907 @@
+/*
+ * CPU Microcode Update Driver for Linux
+ *
+ * Copyright (C) 2000-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
+ * 2006 Shaohua Li <shaohua.li@intel.com>
+ * 2013-2016 Borislav Petkov <bp@alien8.de>
+ *
+ * X86 CPU microcode early update for Linux:
+ *
+ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ * H Peter Anvin" <hpa@zytor.com>
+ * (C) 2015 Borislav Petkov <bp@alien8.de>
+ *
+ * This driver allows to upgrade microcode on x86 processors.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "microcode: " fmt
+
+#include <linux/platform_device.h>
+#include <linux/stop_machine.h>
+#include <linux/syscore_ops.h>
+#include <linux/miscdevice.h>
+#include <linux/capability.h>
+#include <linux/firmware.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+#include <asm/microcode_intel.h>
+#include <asm/cpu_device_id.h>
+#include <asm/microcode_amd.h>
+#include <asm/perf_event.h>
+#include <asm/microcode.h>
+#include <asm/processor.h>
+#include <asm/cmdline.h>
+#include <asm/setup.h>
+
+#define DRIVER_VERSION "2.2"
+
+static struct microcode_ops *microcode_ops;
+static bool dis_ucode_ldr = true;
+
+bool initrd_gone;
+
+LIST_HEAD(microcode_cache);
+
+/*
+ * Synchronization.
+ *
+ * All non cpu-hotplug-callback call sites use:
+ *
+ * - microcode_mutex to synchronize with each other;
+ * - get/put_online_cpus() to synchronize with
+ * the cpu-hotplug-callback call sites.
+ *
+ * We guarantee that only a single cpu is being
+ * updated at any particular moment of time.
+ */
+static DEFINE_MUTEX(microcode_mutex);
+
+/*
+ * Serialize late loading so that CPUs get updated one-by-one.
+ */
+static DEFINE_RAW_SPINLOCK(update_lock);
+
+struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
+
+struct cpu_info_ctx {
+ struct cpu_signature *cpu_sig;
+ int err;
+};
+
+/*
+ * Those patch levels cannot be updated to newer ones and thus should be final.
+ */
+static u32 final_levels[] = {
+ 0x01000098,
+ 0x0100009f,
+ 0x010000af,
+ 0, /* T-101 terminator */
+};
+
+/*
+ * Check the current patch level on this CPU.
+ *
+ * Returns:
+ * - true: if update should stop
+ * - false: otherwise
+ */
+static bool amd_check_current_patch_level(void)
+{
+ u32 lvl, dummy, i;
+ u32 *levels;
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy);
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ levels = (u32 *)__pa_nodebug(&final_levels);
+ else
+ levels = final_levels;
+
+ for (i = 0; levels[i]; i++) {
+ if (lvl == levels[i])
+ return true;
+ }
+ return false;
+}
+
+static bool __init check_loader_disabled_bsp(void)
+{
+ static const char *__dis_opt_str = "dis_ucode_ldr";
+
+#ifdef CONFIG_X86_32
+ const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
+ const char *option = (const char *)__pa_nodebug(__dis_opt_str);
+ bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr);
+
+#else /* CONFIG_X86_64 */
+ const char *cmdline = boot_command_line;
+ const char *option = __dis_opt_str;
+ bool *res = &dis_ucode_ldr;
+#endif
+
+ /*
+ * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
+ * completely accurate as xen pv guests don't see that CPUID bit set but
+ * that's good enough as they don't land on the BSP path anyway.
+ */
+ if (native_cpuid_ecx(1) & BIT(31))
+ return *res;
+
+ if (x86_cpuid_vendor() == X86_VENDOR_AMD) {
+ if (amd_check_current_patch_level())
+ return *res;
+ }
+
+ if (cmdline_find_option_bool(cmdline, option) <= 0)
+ *res = false;
+
+ return *res;
+}
+
+extern struct builtin_fw __start_builtin_fw[];
+extern struct builtin_fw __end_builtin_fw[];
+
+bool get_builtin_firmware(struct cpio_data *cd, const char *name)
+{
+#ifdef CONFIG_FW_LOADER
+ struct builtin_fw *b_fw;
+
+ for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) {
+ if (!strcmp(name, b_fw->name)) {
+ cd->size = b_fw->size;
+ cd->data = b_fw->data;
+ return true;
+ }
+ }
+#endif
+ return false;
+}
+
+void __init load_ucode_bsp(void)
+{
+ unsigned int cpuid_1_eax;
+ bool intel = true;
+
+ if (!have_cpuid_p())
+ return;
+
+ cpuid_1_eax = native_cpuid_eax(1);
+
+ switch (x86_cpuid_vendor()) {
+ case X86_VENDOR_INTEL:
+ if (x86_family(cpuid_1_eax) < 6)
+ return;
+ break;
+
+ case X86_VENDOR_AMD:
+ if (x86_family(cpuid_1_eax) < 0x10)
+ return;
+ intel = false;
+ break;
+
+ default:
+ return;
+ }
+
+ if (check_loader_disabled_bsp())
+ return;
+
+ if (intel)
+ load_ucode_intel_bsp();
+ else
+ load_ucode_amd_bsp(cpuid_1_eax);
+}
+
+static bool check_loader_disabled_ap(void)
+{
+#ifdef CONFIG_X86_32
+ return *((bool *)__pa_nodebug(&dis_ucode_ldr));
+#else
+ return dis_ucode_ldr;
+#endif
+}
+
+void load_ucode_ap(void)
+{
+ unsigned int cpuid_1_eax;
+
+ if (check_loader_disabled_ap())
+ return;
+
+ cpuid_1_eax = native_cpuid_eax(1);
+
+ switch (x86_cpuid_vendor()) {
+ case X86_VENDOR_INTEL:
+ if (x86_family(cpuid_1_eax) >= 6)
+ load_ucode_intel_ap();
+ break;
+ case X86_VENDOR_AMD:
+ if (x86_family(cpuid_1_eax) >= 0x10)
+ load_ucode_amd_ap(cpuid_1_eax);
+ break;
+ default:
+ break;
+ }
+}
+
+static int __init save_microcode_in_initrd(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ int ret = -EINVAL;
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ if (c->x86 >= 6)
+ ret = save_microcode_in_initrd_intel();
+ break;
+ case X86_VENDOR_AMD:
+ if (c->x86 >= 0x10)
+ ret = save_microcode_in_initrd_amd(cpuid_eax(1));
+ break;
+ default:
+ break;
+ }
+
+ initrd_gone = true;
+
+ return ret;
+}
+
+struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+ unsigned long start = 0;
+ size_t size;
+
+#ifdef CONFIG_X86_32
+ struct boot_params *params;
+
+ if (use_pa)
+ params = (struct boot_params *)__pa_nodebug(&boot_params);
+ else
+ params = &boot_params;
+
+ size = params->hdr.ramdisk_size;
+
+ /*
+ * Set start only if we have an initrd image. We cannot use initrd_start
+ * because it is not set that early yet.
+ */
+ if (size)
+ start = params->hdr.ramdisk_image;
+
+# else /* CONFIG_X86_64 */
+ size = (unsigned long)boot_params.ext_ramdisk_size << 32;
+ size |= boot_params.hdr.ramdisk_size;
+
+ if (size) {
+ start = (unsigned long)boot_params.ext_ramdisk_image << 32;
+ start |= boot_params.hdr.ramdisk_image;
+
+ start += PAGE_OFFSET;
+ }
+# endif
+
+ /*
+ * Fixup the start address: after reserve_initrd() runs, initrd_start
+ * has the virtual address of the beginning of the initrd. It also
+ * possibly relocates the ramdisk. In either case, initrd_start contains
+ * the updated address so use that instead.
+ *
+ * initrd_gone is for the hotplug case where we've thrown out initrd
+ * already.
+ */
+ if (!use_pa) {
+ if (initrd_gone)
+ return (struct cpio_data){ NULL, 0, "" };
+ if (initrd_start)
+ start = initrd_start;
+ } else {
+ /*
+ * The picture with physical addresses is a bit different: we
+ * need to get the *physical* address to which the ramdisk was
+ * relocated, i.e., relocated_ramdisk (not initrd_start) and
+ * since we're running from physical addresses, we need to access
+ * relocated_ramdisk through its *physical* address too.
+ */
+ u64 *rr = (u64 *)__pa_nodebug(&relocated_ramdisk);
+ if (*rr)
+ start = *rr;
+ }
+
+ return find_cpio_data(path, (void *)start, size, NULL);
+#else /* !CONFIG_BLK_DEV_INITRD */
+ return (struct cpio_data){ NULL, 0, "" };
+#endif
+}
+
+void reload_early_microcode(void)
+{
+ int vendor, family;
+
+ vendor = x86_cpuid_vendor();
+ family = x86_cpuid_family();
+
+ switch (vendor) {
+ case X86_VENDOR_INTEL:
+ if (family >= 6)
+ reload_ucode_intel();
+ break;
+ case X86_VENDOR_AMD:
+ if (family >= 0x10)
+ reload_ucode_amd();
+ break;
+ default:
+ break;
+ }
+}
+
+static void collect_cpu_info_local(void *arg)
+{
+ struct cpu_info_ctx *ctx = arg;
+
+ ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
+ ctx->cpu_sig);
+}
+
+static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
+{
+ struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
+ int ret;
+
+ ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
+ if (!ret)
+ ret = ctx.err;
+
+ return ret;
+}
+
+static int collect_cpu_info(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ int ret;
+
+ memset(uci, 0, sizeof(*uci));
+
+ ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
+ if (!ret)
+ uci->valid = 1;
+
+ return ret;
+}
+
+static void apply_microcode_local(void *arg)
+{
+ enum ucode_state *err = arg;
+
+ *err = microcode_ops->apply_microcode(smp_processor_id());
+}
+
+static int apply_microcode_on_target(int cpu)
+{
+ enum ucode_state err;
+ int ret;
+
+ ret = smp_call_function_single(cpu, apply_microcode_local, &err, 1);
+ if (!ret) {
+ if (err == UCODE_ERROR)
+ ret = 1;
+ }
+ return ret;
+}
+
+#ifdef CONFIG_MICROCODE_OLD_INTERFACE
+static int do_microcode_update(const void __user *buf, size_t size)
+{
+ int error = 0;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ enum ucode_state ustate;
+
+ if (!uci->valid)
+ continue;
+
+ ustate = microcode_ops->request_microcode_user(cpu, buf, size);
+ if (ustate == UCODE_ERROR) {
+ error = -1;
+ break;
+ } else if (ustate == UCODE_NEW) {
+ apply_microcode_on_target(cpu);
+ }
+ }
+
+ return error;
+}
+
+static int microcode_open(struct inode *inode, struct file *file)
+{
+ return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM;
+}
+
+static ssize_t microcode_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ ssize_t ret = -EINVAL;
+
+ if ((len >> PAGE_SHIFT) > totalram_pages) {
+ pr_err("too much data (max %ld pages)\n", totalram_pages);
+ return ret;
+ }
+
+ get_online_cpus();
+ mutex_lock(&microcode_mutex);
+
+ if (do_microcode_update(buf, len) == 0)
+ ret = (ssize_t)len;
+
+ if (ret > 0)
+ perf_check_microcode();
+
+ mutex_unlock(&microcode_mutex);
+ put_online_cpus();
+
+ return ret;
+}
+
+static const struct file_operations microcode_fops = {
+ .owner = THIS_MODULE,
+ .write = microcode_write,
+ .open = microcode_open,
+ .llseek = no_llseek,
+};
+
+static struct miscdevice microcode_dev = {
+ .minor = MICROCODE_MINOR,
+ .name = "microcode",
+ .nodename = "cpu/microcode",
+ .fops = &microcode_fops,
+};
+
+static int __init microcode_dev_init(void)
+{
+ int error;
+
+ error = misc_register(&microcode_dev);
+ if (error) {
+ pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR);
+ return error;
+ }
+
+ return 0;
+}
+
+static void __exit microcode_dev_exit(void)
+{
+ misc_deregister(&microcode_dev);
+}
+#else
+#define microcode_dev_init() 0
+#define microcode_dev_exit() do { } while (0)
+#endif
+
+/* fake device for request_firmware */
+static struct platform_device *microcode_pdev;
+
+/*
+ * Late loading dance. Why the heavy-handed stomp_machine effort?
+ *
+ * - HT siblings must be idle and not execute other code while the other sibling
+ * is loading microcode in order to avoid any negative interactions caused by
+ * the loading.
+ *
+ * - In addition, microcode update on the cores must be serialized until this
+ * requirement can be relaxed in the future. Right now, this is conservative
+ * and good.
+ */
+#define SPINUNIT 100 /* 100 nsec */
+
+static int check_online_cpus(void)
+{
+ unsigned int cpu;
+
+ /*
+ * Make sure all CPUs are online. It's fine for SMT to be disabled if
+ * all the primary threads are still online.
+ */
+ for_each_present_cpu(cpu) {
+ if (topology_is_primary_thread(cpu) && !cpu_online(cpu)) {
+ pr_err("Not all CPUs online, aborting microcode update.\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static atomic_t late_cpus_in;
+static atomic_t late_cpus_out;
+
+static int __wait_for_cpus(atomic_t *t, long long timeout)
+{
+ int all_cpus = num_online_cpus();
+
+ atomic_inc(t);
+
+ while (atomic_read(t) < all_cpus) {
+ if (timeout < SPINUNIT) {
+ pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n",
+ all_cpus - atomic_read(t));
+ return 1;
+ }
+
+ ndelay(SPINUNIT);
+ timeout -= SPINUNIT;
+
+ touch_nmi_watchdog();
+ }
+ return 0;
+}
+
+/*
+ * Returns:
+ * < 0 - on error
+ * 0 - no update done
+ * 1 - microcode was updated
+ */
+static int __reload_late(void *info)
+{
+ int cpu = smp_processor_id();
+ enum ucode_state err;
+ int ret = 0;
+
+ /*
+ * Wait for all CPUs to arrive. A load will not be attempted unless all
+ * CPUs show up.
+ * */
+ if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC))
+ return -1;
+
+ raw_spin_lock(&update_lock);
+ apply_microcode_local(&err);
+ raw_spin_unlock(&update_lock);
+
+ /* siblings return UCODE_OK because their engine got updated already */
+ if (err > UCODE_NFOUND) {
+ pr_warn("Error reloading microcode on CPU %d\n", cpu);
+ ret = -1;
+ } else if (err == UCODE_UPDATED || err == UCODE_OK) {
+ ret = 1;
+ }
+
+ /*
+ * Increase the wait timeout to a safe value here since we're
+ * serializing the microcode update and that could take a while on a
+ * large number of CPUs. And that is fine as the *actual* timeout will
+ * be determined by the last CPU finished updating and thus cut short.
+ */
+ if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC * num_online_cpus()))
+ panic("Timeout during microcode update!\n");
+
+ return ret;
+}
+
+/*
+ * Reload microcode late on all CPUs. Wait for a sec until they
+ * all gather together.
+ */
+static int microcode_reload_late(void)
+{
+ int ret;
+
+ atomic_set(&late_cpus_in, 0);
+ atomic_set(&late_cpus_out, 0);
+
+ ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask);
+ if (ret > 0)
+ microcode_check();
+
+ return ret;
+}
+
+static ssize_t reload_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ enum ucode_state tmp_ret = UCODE_OK;
+ int bsp = boot_cpu_data.cpu_index;
+ unsigned long val;
+ ssize_t ret = 0;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val != 1)
+ return size;
+
+ get_online_cpus();
+
+ ret = check_online_cpus();
+ if (ret)
+ goto put;
+
+ tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
+ if (tmp_ret != UCODE_NEW)
+ goto put;
+
+ mutex_lock(&microcode_mutex);
+ ret = microcode_reload_late();
+ mutex_unlock(&microcode_mutex);
+
+put:
+ put_online_cpus();
+
+ if (ret >= 0)
+ ret = size;
+
+ return ret;
+}
+
+static ssize_t version_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+ return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
+}
+
+static ssize_t pf_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+ return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
+}
+
+static DEVICE_ATTR_WO(reload);
+static DEVICE_ATTR(version, 0400, version_show, NULL);
+static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
+
+static struct attribute *mc_default_attrs[] = {
+ &dev_attr_version.attr,
+ &dev_attr_processor_flags.attr,
+ NULL
+};
+
+static const struct attribute_group mc_attr_group = {
+ .attrs = mc_default_attrs,
+ .name = "microcode",
+};
+
+static void microcode_fini_cpu(int cpu)
+{
+ if (microcode_ops->microcode_fini_cpu)
+ microcode_ops->microcode_fini_cpu(cpu);
+}
+
+static enum ucode_state microcode_resume_cpu(int cpu)
+{
+ if (apply_microcode_on_target(cpu))
+ return UCODE_ERROR;
+
+ pr_debug("CPU%d updated upon resume\n", cpu);
+
+ return UCODE_OK;
+}
+
+static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
+{
+ enum ucode_state ustate;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ if (uci->valid)
+ return UCODE_OK;
+
+ if (collect_cpu_info(cpu))
+ return UCODE_ERROR;
+
+ /* --dimm. Trigger a delayed update? */
+ if (system_state != SYSTEM_RUNNING)
+ return UCODE_NFOUND;
+
+ ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, refresh_fw);
+ if (ustate == UCODE_NEW) {
+ pr_debug("CPU%d updated upon init\n", cpu);
+ apply_microcode_on_target(cpu);
+ }
+
+ return ustate;
+}
+
+static enum ucode_state microcode_update_cpu(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ /* Refresh CPU microcode revision after resume. */
+ collect_cpu_info(cpu);
+
+ if (uci->valid)
+ return microcode_resume_cpu(cpu);
+
+ return microcode_init_cpu(cpu, false);
+}
+
+static int mc_device_add(struct device *dev, struct subsys_interface *sif)
+{
+ int err, cpu = dev->id;
+
+ if (!cpu_online(cpu))
+ return 0;
+
+ pr_debug("CPU%d added\n", cpu);
+
+ err = sysfs_create_group(&dev->kobj, &mc_attr_group);
+ if (err)
+ return err;
+
+ if (microcode_init_cpu(cpu, true) == UCODE_ERROR)
+ return -EINVAL;
+
+ return err;
+}
+
+static void mc_device_remove(struct device *dev, struct subsys_interface *sif)
+{
+ int cpu = dev->id;
+
+ if (!cpu_online(cpu))
+ return;
+
+ pr_debug("CPU%d removed\n", cpu);
+ microcode_fini_cpu(cpu);
+ sysfs_remove_group(&dev->kobj, &mc_attr_group);
+}
+
+static struct subsys_interface mc_cpu_interface = {
+ .name = "microcode",
+ .subsys = &cpu_subsys,
+ .add_dev = mc_device_add,
+ .remove_dev = mc_device_remove,
+};
+
+/**
+ * microcode_bsp_resume - Update boot CPU microcode during resume.
+ */
+void microcode_bsp_resume(void)
+{
+ int cpu = smp_processor_id();
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ if (uci->valid && uci->mc)
+ microcode_ops->apply_microcode(cpu);
+ else if (!uci->mc)
+ reload_early_microcode();
+}
+
+static struct syscore_ops mc_syscore_ops = {
+ .resume = microcode_bsp_resume,
+};
+
+static int mc_cpu_starting(unsigned int cpu)
+{
+ microcode_update_cpu(cpu);
+ pr_debug("CPU%d added\n", cpu);
+ return 0;
+}
+
+static int mc_cpu_online(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ if (sysfs_create_group(&dev->kobj, &mc_attr_group))
+ pr_err("Failed to create group for CPU%d\n", cpu);
+ return 0;
+}
+
+static int mc_cpu_down_prep(unsigned int cpu)
+{
+ struct device *dev;
+
+ dev = get_cpu_device(cpu);
+ /* Suspend is in progress, only remove the interface */
+ sysfs_remove_group(&dev->kobj, &mc_attr_group);
+ pr_debug("CPU%d removed\n", cpu);
+
+ return 0;
+}
+
+static struct attribute *cpu_root_microcode_attrs[] = {
+ &dev_attr_reload.attr,
+ NULL
+};
+
+static const struct attribute_group cpu_root_microcode_group = {
+ .name = "microcode",
+ .attrs = cpu_root_microcode_attrs,
+};
+
+int __init microcode_init(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ int error;
+
+ if (dis_ucode_ldr)
+ return -EINVAL;
+
+ if (c->x86_vendor == X86_VENDOR_INTEL)
+ microcode_ops = init_intel_microcode();
+ else if (c->x86_vendor == X86_VENDOR_AMD)
+ microcode_ops = init_amd_microcode();
+ else
+ pr_err("no support for this CPU vendor\n");
+
+ if (!microcode_ops)
+ return -ENODEV;
+
+ microcode_pdev = platform_device_register_simple("microcode", -1,
+ NULL, 0);
+ if (IS_ERR(microcode_pdev))
+ return PTR_ERR(microcode_pdev);
+
+ get_online_cpus();
+ mutex_lock(&microcode_mutex);
+
+ error = subsys_interface_register(&mc_cpu_interface);
+ if (!error)
+ perf_check_microcode();
+ mutex_unlock(&microcode_mutex);
+ put_online_cpus();
+
+ if (error)
+ goto out_pdev;
+
+ error = sysfs_create_group(&cpu_subsys.dev_root->kobj,
+ &cpu_root_microcode_group);
+
+ if (error) {
+ pr_err("Error creating microcode group!\n");
+ goto out_driver;
+ }
+
+ error = microcode_dev_init();
+ if (error)
+ goto out_ucode_group;
+
+ register_syscore_ops(&mc_syscore_ops);
+ cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:starting",
+ mc_cpu_starting, NULL);
+ cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
+ mc_cpu_online, mc_cpu_down_prep);
+
+ pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION);
+
+ return 0;
+
+ out_ucode_group:
+ sysfs_remove_group(&cpu_subsys.dev_root->kobj,
+ &cpu_root_microcode_group);
+
+ out_driver:
+ get_online_cpus();
+ mutex_lock(&microcode_mutex);
+
+ subsys_interface_unregister(&mc_cpu_interface);
+
+ mutex_unlock(&microcode_mutex);
+ put_online_cpus();
+
+ out_pdev:
+ platform_device_unregister(microcode_pdev);
+ return error;
+
+}
+fs_initcall(save_microcode_in_initrd);
+late_initcall(microcode_init);
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
new file mode 100644
index 000000000..3aa0e5a45
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -0,0 +1,1004 @@
+/*
+ * Intel CPU Microcode Update Driver for Linux
+ *
+ * Copyright (C) 2000-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
+ * 2006 Shaohua Li <shaohua.li@intel.com>
+ *
+ * Intel CPU microcode early update for Linux
+ *
+ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ * H Peter Anvin" <hpa@zytor.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * This needs to be before all headers so that pr_debug in printk.h doesn't turn
+ * printk calls into no_printk().
+ *
+ *#define DEBUG
+ */
+#define pr_fmt(fmt) "microcode: " fmt
+
+#include <linux/earlycpio.h>
+#include <linux/firmware.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/initrd.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/mm.h>
+
+#include <asm/microcode_intel.h>
+#include <asm/intel-family.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/setup.h>
+#include <asm/msr.h>
+
+static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
+
+/* Current microcode patch used in early patching on the APs. */
+static struct microcode_intel *intel_ucode_patch;
+
+/* last level cache size per core */
+static int llc_size_per_core;
+
+static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
+ unsigned int s2, unsigned int p2)
+{
+ if (s1 != s2)
+ return false;
+
+ /* Processor flags are either both 0 ... */
+ if (!p1 && !p2)
+ return true;
+
+ /* ... or they intersect. */
+ return p1 & p2;
+}
+
+/*
+ * Returns 1 if update has been found, 0 otherwise.
+ */
+static int find_matching_signature(void *mc, unsigned int csig, int cpf)
+{
+ struct microcode_header_intel *mc_hdr = mc;
+ struct extended_sigtable *ext_hdr;
+ struct extended_signature *ext_sig;
+ int i;
+
+ if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
+ return 1;
+
+ /* Look for ext. headers: */
+ if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
+ return 0;
+
+ ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
+ ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
+
+ for (i = 0; i < ext_hdr->count; i++) {
+ if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
+ return 1;
+ ext_sig++;
+ }
+ return 0;
+}
+
+/*
+ * Returns 1 if update has been found, 0 otherwise.
+ */
+static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
+{
+ struct microcode_header_intel *mc_hdr = mc;
+
+ if (mc_hdr->rev <= new_rev)
+ return 0;
+
+ return find_matching_signature(mc, csig, cpf);
+}
+
+static struct ucode_patch *memdup_patch(void *data, unsigned int size)
+{
+ struct ucode_patch *p;
+
+ p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL);
+ if (!p)
+ return NULL;
+
+ p->data = kmemdup(data, size, GFP_KERNEL);
+ if (!p->data) {
+ kfree(p);
+ return NULL;
+ }
+
+ return p;
+}
+
+static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigned int size)
+{
+ struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
+ struct ucode_patch *iter, *tmp, *p = NULL;
+ bool prev_found = false;
+ unsigned int sig, pf;
+
+ mc_hdr = (struct microcode_header_intel *)data;
+
+ list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) {
+ mc_saved_hdr = (struct microcode_header_intel *)iter->data;
+ sig = mc_saved_hdr->sig;
+ pf = mc_saved_hdr->pf;
+
+ if (find_matching_signature(data, sig, pf)) {
+ prev_found = true;
+
+ if (mc_hdr->rev <= mc_saved_hdr->rev)
+ continue;
+
+ p = memdup_patch(data, size);
+ if (!p)
+ pr_err("Error allocating buffer %p\n", data);
+ else {
+ list_replace(&iter->plist, &p->plist);
+ kfree(iter->data);
+ kfree(iter);
+ }
+ }
+ }
+
+ /*
+ * There weren't any previous patches found in the list cache; save the
+ * newly found.
+ */
+ if (!prev_found) {
+ p = memdup_patch(data, size);
+ if (!p)
+ pr_err("Error allocating buffer for %p\n", data);
+ else
+ list_add_tail(&p->plist, &microcode_cache);
+ }
+
+ if (!p)
+ return;
+
+ if (!find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf))
+ return;
+
+ /*
+ * Save for early loading. On 32-bit, that needs to be a physical
+ * address as the APs are running from physical addresses, before
+ * paging has been enabled.
+ */
+ if (IS_ENABLED(CONFIG_X86_32))
+ intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data);
+ else
+ intel_ucode_patch = p->data;
+}
+
+static int microcode_sanity_check(void *mc, int print_err)
+{
+ unsigned long total_size, data_size, ext_table_size;
+ struct microcode_header_intel *mc_header = mc;
+ struct extended_sigtable *ext_header = NULL;
+ u32 sum, orig_sum, ext_sigcount = 0, i;
+ struct extended_signature *ext_sig;
+
+ total_size = get_totalsize(mc_header);
+ data_size = get_datasize(mc_header);
+
+ if (data_size + MC_HEADER_SIZE > total_size) {
+ if (print_err)
+ pr_err("Error: bad microcode data file size.\n");
+ return -EINVAL;
+ }
+
+ if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
+ if (print_err)
+ pr_err("Error: invalid/unknown microcode update format.\n");
+ return -EINVAL;
+ }
+
+ ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+ if (ext_table_size) {
+ u32 ext_table_sum = 0;
+ u32 *ext_tablep;
+
+ if ((ext_table_size < EXT_HEADER_SIZE)
+ || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+ if (print_err)
+ pr_err("Error: truncated extended signature table.\n");
+ return -EINVAL;
+ }
+
+ ext_header = mc + MC_HEADER_SIZE + data_size;
+ if (ext_table_size != exttable_size(ext_header)) {
+ if (print_err)
+ pr_err("Error: extended signature table size mismatch.\n");
+ return -EFAULT;
+ }
+
+ ext_sigcount = ext_header->count;
+
+ /*
+ * Check extended table checksum: the sum of all dwords that
+ * comprise a valid table must be 0.
+ */
+ ext_tablep = (u32 *)ext_header;
+
+ i = ext_table_size / sizeof(u32);
+ while (i--)
+ ext_table_sum += ext_tablep[i];
+
+ if (ext_table_sum) {
+ if (print_err)
+ pr_warn("Bad extended signature table checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Calculate the checksum of update data and header. The checksum of
+ * valid update data and header including the extended signature table
+ * must be 0.
+ */
+ orig_sum = 0;
+ i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
+ while (i--)
+ orig_sum += ((u32 *)mc)[i];
+
+ if (orig_sum) {
+ if (print_err)
+ pr_err("Bad microcode data checksum, aborting.\n");
+ return -EINVAL;
+ }
+
+ if (!ext_table_size)
+ return 0;
+
+ /*
+ * Check extended signature checksum: 0 => valid.
+ */
+ for (i = 0; i < ext_sigcount; i++) {
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+ EXT_SIGNATURE_SIZE * i;
+
+ sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+ if (sum) {
+ if (print_err)
+ pr_err("Bad extended signature checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Get microcode matching with BSP's model. Only CPUs with the same model as
+ * BSP can stay in the platform.
+ */
+static struct microcode_intel *
+scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save)
+{
+ struct microcode_header_intel *mc_header;
+ struct microcode_intel *patch = NULL;
+ unsigned int mc_size;
+
+ while (size) {
+ if (size < sizeof(struct microcode_header_intel))
+ break;
+
+ mc_header = (struct microcode_header_intel *)data;
+
+ mc_size = get_totalsize(mc_header);
+ if (!mc_size ||
+ mc_size > size ||
+ microcode_sanity_check(data, 0) < 0)
+ break;
+
+ size -= mc_size;
+
+ if (!find_matching_signature(data, uci->cpu_sig.sig,
+ uci->cpu_sig.pf)) {
+ data += mc_size;
+ continue;
+ }
+
+ if (save) {
+ save_microcode_patch(uci, data, mc_size);
+ goto next;
+ }
+
+
+ if (!patch) {
+ if (!has_newer_microcode(data,
+ uci->cpu_sig.sig,
+ uci->cpu_sig.pf,
+ uci->cpu_sig.rev))
+ goto next;
+
+ } else {
+ struct microcode_header_intel *phdr = &patch->hdr;
+
+ if (!has_newer_microcode(data,
+ phdr->sig,
+ phdr->pf,
+ phdr->rev))
+ goto next;
+ }
+
+ /* We have a newer patch, save it. */
+ patch = data;
+
+next:
+ data += mc_size;
+ }
+
+ if (size)
+ return NULL;
+
+ return patch;
+}
+
+static int collect_cpu_info_early(struct ucode_cpu_info *uci)
+{
+ unsigned int val[2];
+ unsigned int family, model;
+ struct cpu_signature csig = { 0 };
+ unsigned int eax, ebx, ecx, edx;
+
+ memset(uci, 0, sizeof(*uci));
+
+ eax = 0x00000001;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ csig.sig = eax;
+
+ family = x86_family(eax);
+ model = x86_model(eax);
+
+ if ((model >= 5) || (family > 6)) {
+ /* get processor flags from MSR 0x17 */
+ native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ csig.pf = 1 << ((val[1] >> 18) & 7);
+ }
+
+ csig.rev = intel_get_microcode_revision();
+
+ uci->cpu_sig = csig;
+ uci->valid = 1;
+
+ return 0;
+}
+
+static void show_saved_mc(void)
+{
+#ifdef DEBUG
+ int i = 0, j;
+ unsigned int sig, pf, rev, total_size, data_size, date;
+ struct ucode_cpu_info uci;
+ struct ucode_patch *p;
+
+ if (list_empty(&microcode_cache)) {
+ pr_debug("no microcode data saved.\n");
+ return;
+ }
+
+ collect_cpu_info_early(&uci);
+
+ sig = uci.cpu_sig.sig;
+ pf = uci.cpu_sig.pf;
+ rev = uci.cpu_sig.rev;
+ pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
+
+ list_for_each_entry(p, &microcode_cache, plist) {
+ struct microcode_header_intel *mc_saved_header;
+ struct extended_sigtable *ext_header;
+ struct extended_signature *ext_sig;
+ int ext_sigcount;
+
+ mc_saved_header = (struct microcode_header_intel *)p->data;
+
+ sig = mc_saved_header->sig;
+ pf = mc_saved_header->pf;
+ rev = mc_saved_header->rev;
+ date = mc_saved_header->date;
+
+ total_size = get_totalsize(mc_saved_header);
+ data_size = get_datasize(mc_saved_header);
+
+ pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, total size=0x%x, date = %04x-%02x-%02x\n",
+ i++, sig, pf, rev, total_size,
+ date & 0xffff,
+ date >> 24,
+ (date >> 16) & 0xff);
+
+ /* Look for ext. headers: */
+ if (total_size <= data_size + MC_HEADER_SIZE)
+ continue;
+
+ ext_header = (void *)mc_saved_header + data_size + MC_HEADER_SIZE;
+ ext_sigcount = ext_header->count;
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+
+ for (j = 0; j < ext_sigcount; j++) {
+ sig = ext_sig->sig;
+ pf = ext_sig->pf;
+
+ pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n",
+ j, sig, pf);
+
+ ext_sig++;
+ }
+ }
+#endif
+}
+
+/*
+ * Save this microcode patch. It will be loaded early when a CPU is
+ * hot-added or resumes.
+ */
+static void save_mc_for_early(struct ucode_cpu_info *uci, u8 *mc, unsigned int size)
+{
+ /* Synchronization during CPU hotplug. */
+ static DEFINE_MUTEX(x86_cpu_microcode_mutex);
+
+ mutex_lock(&x86_cpu_microcode_mutex);
+
+ save_microcode_patch(uci, mc, size);
+ show_saved_mc();
+
+ mutex_unlock(&x86_cpu_microcode_mutex);
+}
+
+static bool load_builtin_intel_microcode(struct cpio_data *cp)
+{
+ unsigned int eax = 1, ebx, ecx = 0, edx;
+ char name[30];
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ sprintf(name, "intel-ucode/%02x-%02x-%02x",
+ x86_family(eax), x86_model(eax), x86_stepping(eax));
+
+ return get_builtin_firmware(cp, name);
+}
+
+/*
+ * Print ucode update info.
+ */
+static void
+print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
+{
+ pr_info_once("microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
+ uci->cpu_sig.rev,
+ date & 0xffff,
+ date >> 24,
+ (date >> 16) & 0xff);
+}
+
+#ifdef CONFIG_X86_32
+
+static int delay_ucode_info;
+static int current_mc_date;
+
+/*
+ * Print early updated ucode info after printk works. This is delayed info dump.
+ */
+void show_ucode_info_early(void)
+{
+ struct ucode_cpu_info uci;
+
+ if (delay_ucode_info) {
+ collect_cpu_info_early(&uci);
+ print_ucode_info(&uci, current_mc_date);
+ delay_ucode_info = 0;
+ }
+}
+
+/*
+ * At this point, we can not call printk() yet. Delay printing microcode info in
+ * show_ucode_info_early() until printk() works.
+ */
+static void print_ucode(struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc;
+ int *delay_ucode_info_p;
+ int *current_mc_date_p;
+
+ mc = uci->mc;
+ if (!mc)
+ return;
+
+ delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info);
+ current_mc_date_p = (int *)__pa_nodebug(&current_mc_date);
+
+ *delay_ucode_info_p = 1;
+ *current_mc_date_p = mc->hdr.date;
+}
+#else
+
+static inline void print_ucode(struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc;
+
+ mc = uci->mc;
+ if (!mc)
+ return;
+
+ print_ucode_info(uci, mc->hdr.date);
+}
+#endif
+
+static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
+{
+ struct microcode_intel *mc;
+ u32 rev;
+
+ mc = uci->mc;
+ if (!mc)
+ return 0;
+
+ /*
+ * Save us the MSR write below - which is a particular expensive
+ * operation - when the other hyperthread has updated the microcode
+ * already.
+ */
+ rev = intel_get_microcode_revision();
+ if (rev >= mc->hdr.rev) {
+ uci->cpu_sig.rev = rev;
+ return UCODE_OK;
+ }
+
+ /*
+ * Writeback and invalidate caches before updating microcode to avoid
+ * internal issues depending on what the microcode is updating.
+ */
+ native_wbinvd();
+
+ /* write microcode via MSR 0x79 */
+ native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
+
+ rev = intel_get_microcode_revision();
+ if (rev != mc->hdr.rev)
+ return -1;
+
+ uci->cpu_sig.rev = rev;
+
+ if (early)
+ print_ucode(uci);
+ else
+ print_ucode_info(uci, mc->hdr.date);
+
+ return 0;
+}
+
+int __init save_microcode_in_initrd_intel(void)
+{
+ struct ucode_cpu_info uci;
+ struct cpio_data cp;
+
+ /*
+ * initrd is going away, clear patch ptr. We will scan the microcode one
+ * last time before jettisoning and save a patch, if found. Then we will
+ * update that pointer too, with a stable patch address to use when
+ * resuming the cores.
+ */
+ intel_ucode_patch = NULL;
+
+ if (!load_builtin_intel_microcode(&cp))
+ cp = find_microcode_in_initrd(ucode_path, false);
+
+ if (!(cp.data && cp.size))
+ return 0;
+
+ collect_cpu_info_early(&uci);
+
+ scan_microcode(cp.data, cp.size, &uci, true);
+
+ show_saved_mc();
+
+ return 0;
+}
+
+/*
+ * @res_patch, output: a pointer to the patch we found.
+ */
+static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci)
+{
+ static const char *path;
+ struct cpio_data cp;
+ bool use_pa;
+
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ path = (const char *)__pa_nodebug(ucode_path);
+ use_pa = true;
+ } else {
+ path = ucode_path;
+ use_pa = false;
+ }
+
+ /* try built-in microcode first */
+ if (!load_builtin_intel_microcode(&cp))
+ cp = find_microcode_in_initrd(path, use_pa);
+
+ if (!(cp.data && cp.size))
+ return NULL;
+
+ collect_cpu_info_early(uci);
+
+ return scan_microcode(cp.data, cp.size, uci, false);
+}
+
+void __init load_ucode_intel_bsp(void)
+{
+ struct microcode_intel *patch;
+ struct ucode_cpu_info uci;
+
+ patch = __load_ucode_intel(&uci);
+ if (!patch)
+ return;
+
+ uci.mc = patch;
+
+ apply_microcode_early(&uci, true);
+}
+
+void load_ucode_intel_ap(void)
+{
+ struct microcode_intel *patch, **iup;
+ struct ucode_cpu_info uci;
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ iup = (struct microcode_intel **) __pa_nodebug(&intel_ucode_patch);
+ else
+ iup = &intel_ucode_patch;
+
+reget:
+ if (!*iup) {
+ patch = __load_ucode_intel(&uci);
+ if (!patch)
+ return;
+
+ *iup = patch;
+ }
+
+ uci.mc = *iup;
+
+ if (apply_microcode_early(&uci, true)) {
+ /* Mixed-silicon system? Try to refetch the proper patch: */
+ *iup = NULL;
+
+ goto reget;
+ }
+}
+
+static struct microcode_intel *find_patch(struct ucode_cpu_info *uci)
+{
+ struct microcode_header_intel *phdr;
+ struct ucode_patch *iter, *tmp;
+
+ list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) {
+
+ phdr = (struct microcode_header_intel *)iter->data;
+
+ if (phdr->rev <= uci->cpu_sig.rev)
+ continue;
+
+ if (!find_matching_signature(phdr,
+ uci->cpu_sig.sig,
+ uci->cpu_sig.pf))
+ continue;
+
+ return iter->data;
+ }
+ return NULL;
+}
+
+void reload_ucode_intel(void)
+{
+ struct microcode_intel *p;
+ struct ucode_cpu_info uci;
+
+ collect_cpu_info_early(&uci);
+
+ p = find_patch(&uci);
+ if (!p)
+ return;
+
+ uci.mc = p;
+
+ apply_microcode_early(&uci, false);
+}
+
+static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
+{
+ static struct cpu_signature prev;
+ struct cpuinfo_x86 *c = &cpu_data(cpu_num);
+ unsigned int val[2];
+
+ memset(csig, 0, sizeof(*csig));
+
+ csig->sig = cpuid_eax(0x00000001);
+
+ if ((c->x86_model >= 5) || (c->x86 > 6)) {
+ /* get processor flags from MSR 0x17 */
+ rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ csig->pf = 1 << ((val[1] >> 18) & 7);
+ }
+
+ csig->rev = c->microcode;
+
+ /* No extra locking on prev, races are harmless. */
+ if (csig->sig != prev.sig || csig->pf != prev.pf || csig->rev != prev.rev) {
+ pr_info("sig=0x%x, pf=0x%x, revision=0x%x\n",
+ csig->sig, csig->pf, csig->rev);
+ prev = *csig;
+ }
+
+ return 0;
+}
+
+static enum ucode_state apply_microcode_intel(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct microcode_intel *mc;
+ enum ucode_state ret;
+ static int prev_rev;
+ u32 rev;
+
+ /* We should bind the task to the CPU */
+ if (WARN_ON(raw_smp_processor_id() != cpu))
+ return UCODE_ERROR;
+
+ /* Look for a newer patch in our cache: */
+ mc = find_patch(uci);
+ if (!mc) {
+ mc = uci->mc;
+ if (!mc)
+ return UCODE_NFOUND;
+ }
+
+ /*
+ * Save us the MSR write below - which is a particular expensive
+ * operation - when the other hyperthread has updated the microcode
+ * already.
+ */
+ rev = intel_get_microcode_revision();
+ if (rev >= mc->hdr.rev) {
+ ret = UCODE_OK;
+ goto out;
+ }
+
+ /*
+ * Writeback and invalidate caches before updating microcode to avoid
+ * internal issues depending on what the microcode is updating.
+ */
+ native_wbinvd();
+
+ /* write microcode via MSR 0x79 */
+ wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
+
+ rev = intel_get_microcode_revision();
+
+ if (rev != mc->hdr.rev) {
+ pr_err("CPU%d update to revision 0x%x failed\n",
+ cpu, mc->hdr.rev);
+ return UCODE_ERROR;
+ }
+
+ if (rev != prev_rev) {
+ pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n",
+ rev,
+ mc->hdr.date & 0xffff,
+ mc->hdr.date >> 24,
+ (mc->hdr.date >> 16) & 0xff);
+ prev_rev = rev;
+ }
+
+ ret = UCODE_UPDATED;
+
+out:
+ uci->cpu_sig.rev = rev;
+ c->microcode = rev;
+
+ /* Update boot_cpu_data's revision too, if we're on the BSP: */
+ if (c->cpu_index == boot_cpu_data.cpu_index)
+ boot_cpu_data.microcode = rev;
+
+ return ret;
+}
+
+static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
+ int (*get_ucode_data)(void *, const void *, size_t))
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
+ int new_rev = uci->cpu_sig.rev;
+ unsigned int leftover = size;
+ unsigned int curr_mc_size = 0, new_mc_size = 0;
+ unsigned int csig, cpf;
+ enum ucode_state ret = UCODE_OK;
+
+ while (leftover) {
+ struct microcode_header_intel mc_header;
+ unsigned int mc_size;
+
+ if (leftover < sizeof(mc_header)) {
+ pr_err("error! Truncated header in microcode data file\n");
+ break;
+ }
+
+ if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
+ break;
+
+ mc_size = get_totalsize(&mc_header);
+ if (!mc_size || mc_size > leftover) {
+ pr_err("error! Bad data in microcode data file\n");
+ break;
+ }
+
+ /* For performance reasons, reuse mc area when possible */
+ if (!mc || mc_size > curr_mc_size) {
+ vfree(mc);
+ mc = vmalloc(mc_size);
+ if (!mc)
+ break;
+ curr_mc_size = mc_size;
+ }
+
+ if (get_ucode_data(mc, ucode_ptr, mc_size) ||
+ microcode_sanity_check(mc, 1) < 0) {
+ break;
+ }
+
+ csig = uci->cpu_sig.sig;
+ cpf = uci->cpu_sig.pf;
+ if (has_newer_microcode(mc, csig, cpf, new_rev)) {
+ vfree(new_mc);
+ new_rev = mc_header.rev;
+ new_mc = mc;
+ new_mc_size = mc_size;
+ mc = NULL; /* trigger new vmalloc */
+ ret = UCODE_NEW;
+ }
+
+ ucode_ptr += mc_size;
+ leftover -= mc_size;
+ }
+
+ vfree(mc);
+
+ if (leftover) {
+ vfree(new_mc);
+ return UCODE_ERROR;
+ }
+
+ if (!new_mc)
+ return UCODE_NFOUND;
+
+ vfree(uci->mc);
+ uci->mc = (struct microcode_intel *)new_mc;
+
+ /*
+ * If early loading microcode is supported, save this mc into
+ * permanent memory. So it will be loaded early when a CPU is hot added
+ * or resumes.
+ */
+ save_mc_for_early(uci, new_mc, new_mc_size);
+
+ pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
+ cpu, new_rev, uci->cpu_sig.rev);
+
+ return ret;
+}
+
+static int get_ucode_fw(void *to, const void *from, size_t n)
+{
+ memcpy(to, from, n);
+ return 0;
+}
+
+static bool is_blacklisted(unsigned int cpu)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ /*
+ * Late loading on model 79 with microcode revision less than 0x0b000021
+ * and LLC size per core bigger than 2.5MB may result in a system hang.
+ * This behavior is documented in item BDF90, #334165 (Intel Xeon
+ * Processor E7-8800/4800 v4 Product Family).
+ */
+ if (c->x86 == 6 &&
+ c->x86_model == INTEL_FAM6_BROADWELL_X &&
+ c->x86_stepping == 0x01 &&
+ llc_size_per_core > 2621440 &&
+ c->microcode < 0x0b000021) {
+ pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
+ pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
+ return true;
+ }
+
+ return false;
+}
+
+static enum ucode_state request_microcode_fw(int cpu, struct device *device,
+ bool refresh_fw)
+{
+ char name[30];
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ const struct firmware *firmware;
+ enum ucode_state ret;
+
+ if (is_blacklisted(cpu))
+ return UCODE_NFOUND;
+
+ sprintf(name, "intel-ucode/%02x-%02x-%02x",
+ c->x86, c->x86_model, c->x86_stepping);
+
+ if (request_firmware_direct(&firmware, name, device)) {
+ pr_debug("data file %s load failed\n", name);
+ return UCODE_NFOUND;
+ }
+
+ ret = generic_load_microcode(cpu, (void *)firmware->data,
+ firmware->size, &get_ucode_fw);
+
+ release_firmware(firmware);
+
+ return ret;
+}
+
+static int get_ucode_user(void *to, const void *from, size_t n)
+{
+ return copy_from_user(to, from, n);
+}
+
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
+{
+ if (is_blacklisted(cpu))
+ return UCODE_NFOUND;
+
+ return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
+}
+
+static struct microcode_ops microcode_intel_ops = {
+ .request_microcode_user = request_microcode_user,
+ .request_microcode_fw = request_microcode_fw,
+ .collect_cpu_info = collect_cpu_info,
+ .apply_microcode = apply_microcode_intel,
+};
+
+static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c)
+{
+ u64 llc_size = c->x86_cache_size * 1024ULL;
+
+ do_div(llc_size, c->x86_max_cores);
+
+ return (int)llc_size;
+}
+
+struct microcode_ops * __init init_intel_microcode(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
+ cpu_has(c, X86_FEATURE_IA64)) {
+ pr_err("Intel CPU family 0x%x not supported\n", c->x86);
+ return NULL;
+ }
+
+ llc_size_per_core = calc_llc_size_per_core(c);
+
+ return &microcode_intel_ops;
+}
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
new file mode 100644
index 000000000..aed45b889
--- /dev/null
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -0,0 +1,67 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h
+#
+
+set -e
+
+IN=$1
+OUT=$2
+
+dump_array()
+{
+ ARRAY=$1
+ SIZE=$2
+ PFX=$3
+ POSTFIX=$4
+
+ PFX_SZ=$(echo $PFX | wc -c)
+ TABS="$(printf '\t\t\t\t\t')"
+
+ echo "const char * const $ARRAY[$SIZE] = {"
+
+ # Iterate through any input lines starting with #define $PFX
+ sed -n -e 's/\t/ /g' -e "s/^ *# *define *$PFX//p" $IN |
+ while read i
+ do
+ # Name is everything up to the first whitespace
+ NAME="$(echo "$i" | sed 's/ .*//')"
+
+ # If the /* comment */ starts with a quote string, grab that.
+ VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')"
+ [ -z "$VALUE" ] && VALUE="\"$NAME\""
+ [ "$VALUE" = '""' ] && continue
+
+ # Name is uppercase, VALUE is all lowercase
+ VALUE="$(echo "$VALUE" | tr A-Z a-z)"
+
+ if [ -n "$POSTFIX" ]; then
+ T=$(( $PFX_SZ + $(echo $POSTFIX | wc -c) + 2 ))
+ TABS="$(printf '\t\t\t\t\t\t')"
+ TABCOUNT=$(( ( 6*8 - ($T + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
+ printf "\t[%s - %s]%.*s = %s,\n" "$PFX$NAME" "$POSTFIX" "$TABCOUNT" "$TABS" "$VALUE"
+ else
+ TABCOUNT=$(( ( 5*8 - ($PFX_SZ + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
+ printf "\t[%s]%.*s = %s,\n" "$PFX$NAME" "$TABCOUNT" "$TABS" "$VALUE"
+ fi
+ done
+ echo "};"
+}
+
+trap 'rm "$OUT"' EXIT
+
+(
+ echo "#ifndef _ASM_X86_CPUFEATURES_H"
+ echo "#include <asm/cpufeatures.h>"
+ echo "#endif"
+ echo ""
+
+ dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" ""
+ echo ""
+
+ dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32"
+
+) > $OUT
+
+trap - EXIT
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
new file mode 100644
index 000000000..f8b0fa2db
--- /dev/null
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -0,0 +1,335 @@
+/*
+ * HyperV Detection code.
+ *
+ * Copyright (C) 2010, Novell, Inc.
+ * Author : K. Y. Srinivasan <ksrinivasan@novell.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/clocksource.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/hardirq.h>
+#include <linux/efi.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kexec.h>
+#include <linux/i8253.h>
+#include <asm/processor.h>
+#include <asm/hypervisor.h>
+#include <asm/hyperv-tlfs.h>
+#include <asm/mshyperv.h>
+#include <asm/desc.h>
+#include <asm/irq_regs.h>
+#include <asm/i8259.h>
+#include <asm/apic.h>
+#include <asm/timer.h>
+#include <asm/reboot.h>
+#include <asm/nmi.h>
+
+struct ms_hyperv_info ms_hyperv;
+EXPORT_SYMBOL_GPL(ms_hyperv);
+
+#if IS_ENABLED(CONFIG_HYPERV)
+static void (*vmbus_handler)(void);
+static void (*hv_stimer0_handler)(void);
+static void (*hv_kexec_handler)(void);
+static void (*hv_crash_handler)(struct pt_regs *regs);
+
+__visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ entering_irq();
+ inc_irq_stat(irq_hv_callback_count);
+ if (vmbus_handler)
+ vmbus_handler();
+
+ if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED)
+ ack_APIC_irq();
+
+ exiting_irq();
+ set_irq_regs(old_regs);
+}
+
+void hv_setup_vmbus_irq(void (*handler)(void))
+{
+ vmbus_handler = handler;
+}
+
+void hv_remove_vmbus_irq(void)
+{
+ /* We have no way to deallocate the interrupt gate */
+ vmbus_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq);
+EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq);
+
+/*
+ * Routines to do per-architecture handling of stimer0
+ * interrupts when in Direct Mode
+ */
+
+__visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ entering_irq();
+ inc_irq_stat(hyperv_stimer0_count);
+ if (hv_stimer0_handler)
+ hv_stimer0_handler();
+ ack_APIC_irq();
+
+ exiting_irq();
+ set_irq_regs(old_regs);
+}
+
+int hv_setup_stimer0_irq(int *irq, int *vector, void (*handler)(void))
+{
+ *vector = HYPERV_STIMER0_VECTOR;
+ *irq = 0; /* Unused on x86/x64 */
+ hv_stimer0_handler = handler;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(hv_setup_stimer0_irq);
+
+void hv_remove_stimer0_irq(int irq)
+{
+ /* We have no way to deallocate the interrupt gate */
+ hv_stimer0_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(hv_remove_stimer0_irq);
+
+void hv_setup_kexec_handler(void (*handler)(void))
+{
+ hv_kexec_handler = handler;
+}
+EXPORT_SYMBOL_GPL(hv_setup_kexec_handler);
+
+void hv_remove_kexec_handler(void)
+{
+ hv_kexec_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(hv_remove_kexec_handler);
+
+void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs))
+{
+ hv_crash_handler = handler;
+}
+EXPORT_SYMBOL_GPL(hv_setup_crash_handler);
+
+void hv_remove_crash_handler(void)
+{
+ hv_crash_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(hv_remove_crash_handler);
+
+#ifdef CONFIG_KEXEC_CORE
+static void hv_machine_shutdown(void)
+{
+ if (kexec_in_progress && hv_kexec_handler)
+ hv_kexec_handler();
+ native_machine_shutdown();
+}
+
+static void hv_machine_crash_shutdown(struct pt_regs *regs)
+{
+ if (hv_crash_handler)
+ hv_crash_handler(regs);
+ native_machine_crash_shutdown(regs);
+}
+#endif /* CONFIG_KEXEC_CORE */
+#endif /* CONFIG_HYPERV */
+
+static uint32_t __init ms_hyperv_platform(void)
+{
+ u32 eax;
+ u32 hyp_signature[3];
+
+ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return 0;
+
+ cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
+ &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
+
+ if (eax >= HYPERV_CPUID_MIN &&
+ eax <= HYPERV_CPUID_MAX &&
+ !memcmp("Microsoft Hv", hyp_signature, 12))
+ return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
+
+ return 0;
+}
+
+static unsigned char hv_get_nmi_reason(void)
+{
+ return 0;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes
+ * it dificult to process CHANNELMSG_UNLOAD in case of crash. Handle
+ * unknown NMI on the first CPU which gets it.
+ */
+static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
+{
+ static atomic_t nmi_cpu = ATOMIC_INIT(-1);
+
+ if (!unknown_nmi_panic)
+ return NMI_DONE;
+
+ if (atomic_cmpxchg(&nmi_cpu, -1, raw_smp_processor_id()) != -1)
+ return NMI_HANDLED;
+
+ return NMI_DONE;
+}
+#endif
+
+static unsigned long hv_get_tsc_khz(void)
+{
+ unsigned long freq;
+
+ rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq);
+
+ return freq / 1000;
+}
+
+static void __init ms_hyperv_init_platform(void)
+{
+ int hv_host_info_eax;
+ int hv_host_info_ebx;
+ int hv_host_info_ecx;
+ int hv_host_info_edx;
+
+ /*
+ * Extract the features and hints
+ */
+ ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+ ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
+ ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
+
+ pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
+ ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
+
+ ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
+ ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
+
+ pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n",
+ ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
+
+ /*
+ * Extract host information.
+ */
+ if (cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS) >=
+ HYPERV_CPUID_VERSION) {
+ hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION);
+ hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION);
+ hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION);
+ hv_host_info_edx = cpuid_edx(HYPERV_CPUID_VERSION);
+
+ pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n",
+ hv_host_info_eax, hv_host_info_ebx >> 16,
+ hv_host_info_ebx & 0xFFFF, hv_host_info_ecx,
+ hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF);
+ }
+
+ if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
+ x86_platform.calibrate_tsc = hv_get_tsc_khz;
+ x86_platform.calibrate_cpu = hv_get_tsc_khz;
+ }
+
+ if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) {
+ ms_hyperv.nested_features =
+ cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
+ }
+
+ /*
+ * Hyper-V expects to get crash register data or kmsg when
+ * crash enlightment is available and system crashes. Set
+ * crash_kexec_post_notifiers to be true to make sure that
+ * calling crash enlightment interface before running kdump
+ * kernel.
+ */
+ if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE)
+ crash_kexec_post_notifiers = true;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
+ /*
+ * Get the APIC frequency.
+ */
+ u64 hv_lapic_frequency;
+
+ rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
+ hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
+ lapic_timer_frequency = hv_lapic_frequency;
+ pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n",
+ lapic_timer_frequency);
+ }
+
+ register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST,
+ "hv_nmi_unknown");
+#endif
+
+#ifdef CONFIG_X86_IO_APIC
+ no_timer_check = 1;
+#endif
+
+#if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE)
+ machine_ops.shutdown = hv_machine_shutdown;
+ machine_ops.crash_shutdown = hv_machine_crash_shutdown;
+#endif
+ mark_tsc_unstable("running on Hyper-V");
+
+ /*
+ * Generation 2 instances don't support reading the NMI status from
+ * 0x61 port.
+ */
+ if (efi_enabled(EFI_BOOT))
+ x86_platform.get_nmi_reason = hv_get_nmi_reason;
+
+ /*
+ * Hyper-V VMs have a PIT emulation quirk such that zeroing the
+ * counter register during PIT shutdown restarts the PIT. So it
+ * continues to interrupt @18.2 HZ. Setting i8253_clear_counter
+ * to false tells pit_shutdown() not to zero the counter so that
+ * the PIT really is shutdown. Generation 2 VMs don't have a PIT,
+ * and setting this value has no effect.
+ */
+ i8253_clear_counter_on_shutdown = false;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ /*
+ * Setup the hook to get control post apic initialization.
+ */
+ x86_platform.apic_post_init = hyperv_init;
+ hyperv_setup_mmu_ops();
+ /* Setup the IDT for hypervisor callback */
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
+
+ /* Setup the IDT for reenlightenment notifications */
+ if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT)
+ alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR,
+ hyperv_reenlightenment_vector);
+
+ /* Setup the IDT for stimer0 */
+ if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE)
+ alloc_intr_gate(HYPERV_STIMER0_VECTOR,
+ hv_stimer0_callback_vector);
+#endif
+}
+
+const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
+ .name = "Microsoft Hyper-V",
+ .detect = ms_hyperv_platform,
+ .type = X86_HYPER_MS_HYPERV,
+ .init.init_platform = ms_hyperv_init_platform,
+};
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
new file mode 100644
index 000000000..2ad9107ee
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -0,0 +1,3 @@
+obj-y := mtrr.o if.o generic.o cleanup.o
+obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
+
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
new file mode 100644
index 000000000..a65a02720
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <asm/mtrr.h>
+#include <asm/msr.h>
+
+#include "mtrr.h"
+
+static void
+amd_get_mtrr(unsigned int reg, unsigned long *base,
+ unsigned long *size, mtrr_type *type)
+{
+ unsigned long low, high;
+
+ rdmsr(MSR_K6_UWCCR, low, high);
+ /* Upper dword is region 1, lower is region 0 */
+ if (reg == 1)
+ low = high;
+ /* The base masks off on the right alignment */
+ *base = (low & 0xFFFE0000) >> PAGE_SHIFT;
+ *type = 0;
+ if (low & 1)
+ *type = MTRR_TYPE_UNCACHABLE;
+ if (low & 2)
+ *type = MTRR_TYPE_WRCOMB;
+ if (!(low & 3)) {
+ *size = 0;
+ return;
+ }
+ /*
+ * This needs a little explaining. The size is stored as an
+ * inverted mask of bits of 128K granularity 15 bits long offset
+ * 2 bits.
+ *
+ * So to get a size we do invert the mask and add 1 to the lowest
+ * mask bit (4 as its 2 bits in). This gives us a size we then shift
+ * to turn into 128K blocks.
+ *
+ * eg 111 1111 1111 1100 is 512K
+ *
+ * invert 000 0000 0000 0011
+ * +1 000 0000 0000 0100
+ * *128K ...
+ */
+ low = (~low) & 0x1FFFC;
+ *size = (low + 4) << (15 - PAGE_SHIFT);
+}
+
+/**
+ * amd_set_mtrr - Set variable MTRR register on the local CPU.
+ *
+ * @reg The register to set.
+ * @base The base address of the region.
+ * @size The size of the region. If this is 0 the region is disabled.
+ * @type The type of the region.
+ *
+ * Returns nothing.
+ */
+static void
+amd_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
+{
+ u32 regs[2];
+
+ /*
+ * Low is MTRR0, High MTRR 1
+ */
+ rdmsr(MSR_K6_UWCCR, regs[0], regs[1]);
+ /*
+ * Blank to disable
+ */
+ if (size == 0) {
+ regs[reg] = 0;
+ } else {
+ /*
+ * Set the register to the base, the type (off by one) and an
+ * inverted bitmask of the size The size is the only odd
+ * bit. We are fed say 512K We invert this and we get 111 1111
+ * 1111 1011 but if you subtract one and invert you get the
+ * desired 111 1111 1111 1100 mask
+ *
+ * But ~(x - 1) == ~x + 1 == -x. Two's complement rocks!
+ */
+ regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC)
+ | (base << PAGE_SHIFT) | (type + 1);
+ }
+
+ /*
+ * The writeback rule is quite specific. See the manual. Its
+ * disable local interrupts, write back the cache, set the mtrr
+ */
+ wbinvd();
+ wrmsr(MSR_K6_UWCCR, regs[0], regs[1]);
+}
+
+static int
+amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
+{
+ /*
+ * Apply the K6 block alignment and size rules
+ * In order
+ * o Uncached or gathering only
+ * o 128K or bigger block
+ * o Power of 2 block
+ * o base suitably aligned to the power
+ */
+ if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT))
+ || (size & ~(size - 1)) - size || (base & (size - 1)))
+ return -EINVAL;
+ return 0;
+}
+
+static const struct mtrr_ops amd_mtrr_ops = {
+ .vendor = X86_VENDOR_AMD,
+ .set = amd_set_mtrr,
+ .get = amd_get_mtrr,
+ .get_free_region = generic_get_free_region,
+ .validate_add_page = amd_validate_add_page,
+ .have_wrcomb = positive_have_wrcomb,
+};
+
+int __init amd_init_mtrr(void)
+{
+ set_mtrr_ops(&amd_mtrr_ops);
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
new file mode 100644
index 000000000..f27177816
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/mm.h>
+
+#include <asm/mtrr.h>
+#include <asm/msr.h>
+
+#include "mtrr.h"
+
+static struct {
+ unsigned long high;
+ unsigned long low;
+} centaur_mcr[8];
+
+static u8 centaur_mcr_reserved;
+static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */
+
+/**
+ * centaur_get_free_region - Get a free MTRR.
+ *
+ * @base: The starting (base) address of the region.
+ * @size: The size (in bytes) of the region.
+ *
+ * Returns: the index of the region on success, else -1 on error.
+ */
+static int
+centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
+{
+ unsigned long lbase, lsize;
+ mtrr_type ltype;
+ int i, max;
+
+ max = num_var_ranges;
+ if (replace_reg >= 0 && replace_reg < max)
+ return replace_reg;
+
+ for (i = 0; i < max; ++i) {
+ if (centaur_mcr_reserved & (1 << i))
+ continue;
+ mtrr_if->get(i, &lbase, &lsize, &ltype);
+ if (lsize == 0)
+ return i;
+ }
+
+ return -ENOSPC;
+}
+
+/*
+ * Report boot time MCR setups
+ */
+void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
+{
+ centaur_mcr[mcr].low = lo;
+ centaur_mcr[mcr].high = hi;
+}
+
+static void
+centaur_get_mcr(unsigned int reg, unsigned long *base,
+ unsigned long *size, mtrr_type * type)
+{
+ *base = centaur_mcr[reg].high >> PAGE_SHIFT;
+ *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT;
+ *type = MTRR_TYPE_WRCOMB; /* write-combining */
+
+ if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2))
+ *type = MTRR_TYPE_UNCACHABLE;
+ if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25)
+ *type = MTRR_TYPE_WRBACK;
+ if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31)
+ *type = MTRR_TYPE_WRBACK;
+}
+
+static void
+centaur_set_mcr(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type)
+{
+ unsigned long low, high;
+
+ if (size == 0) {
+ /* Disable */
+ high = low = 0;
+ } else {
+ high = base << PAGE_SHIFT;
+ if (centaur_mcr_type == 0) {
+ /* Only support write-combining... */
+ low = -size << PAGE_SHIFT | 0x1f;
+ } else {
+ if (type == MTRR_TYPE_UNCACHABLE)
+ low = -size << PAGE_SHIFT | 0x02; /* NC */
+ else
+ low = -size << PAGE_SHIFT | 0x09; /* WWO, WC */
+ }
+ }
+ centaur_mcr[reg].high = high;
+ centaur_mcr[reg].low = low;
+ wrmsr(MSR_IDT_MCR0 + reg, low, high);
+}
+
+static int
+centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
+{
+ /*
+ * FIXME: Winchip2 supports uncached
+ */
+ if (type != MTRR_TYPE_WRCOMB &&
+ (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) {
+ pr_warn("mtrr: only write-combining%s supported\n",
+ centaur_mcr_type ? " and uncacheable are" : " is");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static const struct mtrr_ops centaur_mtrr_ops = {
+ .vendor = X86_VENDOR_CENTAUR,
+ .set = centaur_set_mcr,
+ .get = centaur_get_mcr,
+ .get_free_region = centaur_get_free_region,
+ .validate_add_page = centaur_validate_add_page,
+ .have_wrcomb = positive_have_wrcomb,
+};
+
+int __init centaur_init_mtrr(void)
+{
+ set_mtrr_ops(&centaur_mtrr_ops);
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
new file mode 100644
index 000000000..765afd599
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -0,0 +1,987 @@
+/*
+ * MTRR (Memory Type Range Register) cleanup
+ *
+ * Copyright (C) 2009 Yinghai Lu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/uaccess.h>
+#include <linux/kvm_para.h>
+#include <linux/range.h>
+
+#include <asm/processor.h>
+#include <asm/e820/api.h>
+#include <asm/mtrr.h>
+#include <asm/msr.h>
+
+#include "mtrr.h"
+
+struct var_mtrr_range_state {
+ unsigned long base_pfn;
+ unsigned long size_pfn;
+ mtrr_type type;
+};
+
+struct var_mtrr_state {
+ unsigned long range_startk;
+ unsigned long range_sizek;
+ unsigned long chunk_sizek;
+ unsigned long gran_sizek;
+ unsigned int reg;
+};
+
+/* Should be related to MTRR_VAR_RANGES nums */
+#define RANGE_NUM 256
+
+static struct range __initdata range[RANGE_NUM];
+static int __initdata nr_range;
+
+static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
+
+static int __initdata debug_print;
+#define Dprintk(x...) do { if (debug_print) pr_debug(x); } while (0)
+
+#define BIOS_BUG_MSG \
+ "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
+
+static int __init
+x86_get_mtrr_mem_range(struct range *range, int nr_range,
+ unsigned long extra_remove_base,
+ unsigned long extra_remove_size)
+{
+ unsigned long base, size;
+ mtrr_type type;
+ int i;
+
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type != MTRR_TYPE_WRBACK)
+ continue;
+ base = range_state[i].base_pfn;
+ size = range_state[i].size_pfn;
+ nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
+ base, base + size);
+ }
+ if (debug_print) {
+ pr_debug("After WB checking\n");
+ for (i = 0; i < nr_range; i++)
+ pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
+ }
+
+ /* Take out UC ranges: */
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type != MTRR_TYPE_UNCACHABLE &&
+ type != MTRR_TYPE_WRPROT)
+ continue;
+ size = range_state[i].size_pfn;
+ if (!size)
+ continue;
+ base = range_state[i].base_pfn;
+ if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
+ /* Var MTRR contains UC entry below 1M? Skip it: */
+ pr_warn(BIOS_BUG_MSG, i);
+ if (base + size <= (1<<(20-PAGE_SHIFT)))
+ continue;
+ size -= (1<<(20-PAGE_SHIFT)) - base;
+ base = 1<<(20-PAGE_SHIFT);
+ }
+ subtract_range(range, RANGE_NUM, base, base + size);
+ }
+ if (extra_remove_size)
+ subtract_range(range, RANGE_NUM, extra_remove_base,
+ extra_remove_base + extra_remove_size);
+
+ if (debug_print) {
+ pr_debug("After UC checking\n");
+ for (i = 0; i < RANGE_NUM; i++) {
+ if (!range[i].end)
+ continue;
+ pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
+ }
+ }
+
+ /* sort the ranges */
+ nr_range = clean_sort_range(range, RANGE_NUM);
+ if (debug_print) {
+ pr_debug("After sorting\n");
+ for (i = 0; i < nr_range; i++)
+ pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
+ }
+
+ return nr_range;
+}
+
+#ifdef CONFIG_MTRR_SANITIZER
+
+static unsigned long __init sum_ranges(struct range *range, int nr_range)
+{
+ unsigned long sum = 0;
+ int i;
+
+ for (i = 0; i < nr_range; i++)
+ sum += range[i].end - range[i].start;
+
+ return sum;
+}
+
+static int enable_mtrr_cleanup __initdata =
+ CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
+
+static int __init disable_mtrr_cleanup_setup(char *str)
+{
+ enable_mtrr_cleanup = 0;
+ return 0;
+}
+early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
+
+static int __init enable_mtrr_cleanup_setup(char *str)
+{
+ enable_mtrr_cleanup = 1;
+ return 0;
+}
+early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
+
+static int __init mtrr_cleanup_debug_setup(char *str)
+{
+ debug_print = 1;
+ return 0;
+}
+early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
+
+static void __init
+set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+ unsigned char type, unsigned int address_bits)
+{
+ u32 base_lo, base_hi, mask_lo, mask_hi;
+ u64 base, mask;
+
+ if (!sizek) {
+ fill_mtrr_var_range(reg, 0, 0, 0, 0);
+ return;
+ }
+
+ mask = (1ULL << address_bits) - 1;
+ mask &= ~((((u64)sizek) << 10) - 1);
+
+ base = ((u64)basek) << 10;
+
+ base |= type;
+ mask |= 0x800;
+
+ base_lo = base & ((1ULL<<32) - 1);
+ base_hi = base >> 32;
+
+ mask_lo = mask & ((1ULL<<32) - 1);
+ mask_hi = mask >> 32;
+
+ fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
+}
+
+static void __init
+save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+ unsigned char type)
+{
+ range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
+ range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
+ range_state[reg].type = type;
+}
+
+static void __init set_var_mtrr_all(unsigned int address_bits)
+{
+ unsigned long basek, sizek;
+ unsigned char type;
+ unsigned int reg;
+
+ for (reg = 0; reg < num_var_ranges; reg++) {
+ basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
+ sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
+ type = range_state[reg].type;
+
+ set_var_mtrr(reg, basek, sizek, type, address_bits);
+ }
+}
+
+static unsigned long to_size_factor(unsigned long sizek, char *factorp)
+{
+ unsigned long base = sizek;
+ char factor;
+
+ if (base & ((1<<10) - 1)) {
+ /* Not MB-aligned: */
+ factor = 'K';
+ } else if (base & ((1<<20) - 1)) {
+ factor = 'M';
+ base >>= 10;
+ } else {
+ factor = 'G';
+ base >>= 20;
+ }
+
+ *factorp = factor;
+
+ return base;
+}
+
+static unsigned int __init
+range_to_mtrr(unsigned int reg, unsigned long range_startk,
+ unsigned long range_sizek, unsigned char type)
+{
+ if (!range_sizek || (reg >= num_var_ranges))
+ return reg;
+
+ while (range_sizek) {
+ unsigned long max_align, align;
+ unsigned long sizek;
+
+ /* Compute the maximum size with which we can make a range: */
+ if (range_startk)
+ max_align = __ffs(range_startk);
+ else
+ max_align = BITS_PER_LONG - 1;
+
+ align = __fls(range_sizek);
+ if (align > max_align)
+ align = max_align;
+
+ sizek = 1UL << align;
+ if (debug_print) {
+ char start_factor = 'K', size_factor = 'K';
+ unsigned long start_base, size_base;
+
+ start_base = to_size_factor(range_startk, &start_factor);
+ size_base = to_size_factor(sizek, &size_factor);
+
+ Dprintk("Setting variable MTRR %d, "
+ "base: %ld%cB, range: %ld%cB, type %s\n",
+ reg, start_base, start_factor,
+ size_base, size_factor,
+ (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+ ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")
+ );
+ }
+ save_var_mtrr(reg++, range_startk, sizek, type);
+ range_startk += sizek;
+ range_sizek -= sizek;
+ if (reg >= num_var_ranges)
+ break;
+ }
+ return reg;
+}
+
+static unsigned __init
+range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
+ unsigned long sizek)
+{
+ unsigned long hole_basek, hole_sizek;
+ unsigned long second_basek, second_sizek;
+ unsigned long range0_basek, range0_sizek;
+ unsigned long range_basek, range_sizek;
+ unsigned long chunk_sizek;
+ unsigned long gran_sizek;
+
+ hole_basek = 0;
+ hole_sizek = 0;
+ second_basek = 0;
+ second_sizek = 0;
+ chunk_sizek = state->chunk_sizek;
+ gran_sizek = state->gran_sizek;
+
+ /* Align with gran size, prevent small block used up MTRRs: */
+ range_basek = ALIGN(state->range_startk, gran_sizek);
+ if ((range_basek > basek) && basek)
+ return second_sizek;
+
+ state->range_sizek -= (range_basek - state->range_startk);
+ range_sizek = ALIGN(state->range_sizek, gran_sizek);
+
+ while (range_sizek > state->range_sizek) {
+ range_sizek -= gran_sizek;
+ if (!range_sizek)
+ return 0;
+ }
+ state->range_sizek = range_sizek;
+
+ /* Try to append some small hole: */
+ range0_basek = state->range_startk;
+ range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
+
+ /* No increase: */
+ if (range0_sizek == state->range_sizek) {
+ Dprintk("rangeX: %016lx - %016lx\n",
+ range0_basek<<10,
+ (range0_basek + state->range_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range0_basek,
+ state->range_sizek, MTRR_TYPE_WRBACK);
+ return 0;
+ }
+
+ /* Only cut back when it is not the last: */
+ if (sizek) {
+ while (range0_basek + range0_sizek > (basek + sizek)) {
+ if (range0_sizek >= chunk_sizek)
+ range0_sizek -= chunk_sizek;
+ else
+ range0_sizek = 0;
+
+ if (!range0_sizek)
+ break;
+ }
+ }
+
+second_try:
+ range_basek = range0_basek + range0_sizek;
+
+ /* One hole in the middle: */
+ if (range_basek > basek && range_basek <= (basek + sizek))
+ second_sizek = range_basek - basek;
+
+ if (range0_sizek > state->range_sizek) {
+
+ /* One hole in middle or at the end: */
+ hole_sizek = range0_sizek - state->range_sizek - second_sizek;
+
+ /* Hole size should be less than half of range0 size: */
+ if (hole_sizek >= (range0_sizek >> 1) &&
+ range0_sizek >= chunk_sizek) {
+ range0_sizek -= chunk_sizek;
+ second_sizek = 0;
+ hole_sizek = 0;
+
+ goto second_try;
+ }
+ }
+
+ if (range0_sizek) {
+ Dprintk("range0: %016lx - %016lx\n",
+ range0_basek<<10,
+ (range0_basek + range0_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range0_basek,
+ range0_sizek, MTRR_TYPE_WRBACK);
+ }
+
+ if (range0_sizek < state->range_sizek) {
+ /* Need to handle left over range: */
+ range_sizek = state->range_sizek - range0_sizek;
+
+ Dprintk("range: %016lx - %016lx\n",
+ range_basek<<10,
+ (range_basek + range_sizek)<<10);
+
+ state->reg = range_to_mtrr(state->reg, range_basek,
+ range_sizek, MTRR_TYPE_WRBACK);
+ }
+
+ if (hole_sizek) {
+ hole_basek = range_basek - hole_sizek - second_sizek;
+ Dprintk("hole: %016lx - %016lx\n",
+ hole_basek<<10,
+ (hole_basek + hole_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, hole_basek,
+ hole_sizek, MTRR_TYPE_UNCACHABLE);
+ }
+
+ return second_sizek;
+}
+
+static void __init
+set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
+ unsigned long size_pfn)
+{
+ unsigned long basek, sizek;
+ unsigned long second_sizek = 0;
+
+ if (state->reg >= num_var_ranges)
+ return;
+
+ basek = base_pfn << (PAGE_SHIFT - 10);
+ sizek = size_pfn << (PAGE_SHIFT - 10);
+
+ /* See if I can merge with the last range: */
+ if ((basek <= 1024) ||
+ (state->range_startk + state->range_sizek == basek)) {
+ unsigned long endk = basek + sizek;
+ state->range_sizek = endk - state->range_startk;
+ return;
+ }
+ /* Write the range mtrrs: */
+ if (state->range_sizek != 0)
+ second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
+
+ /* Allocate an msr: */
+ state->range_startk = basek + second_sizek;
+ state->range_sizek = sizek - second_sizek;
+}
+
+/* Mininum size of mtrr block that can take hole: */
+static u64 mtrr_chunk_size __initdata = (256ULL<<20);
+
+static int __init parse_mtrr_chunk_size_opt(char *p)
+{
+ if (!p)
+ return -EINVAL;
+ mtrr_chunk_size = memparse(p, &p);
+ return 0;
+}
+early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
+
+/* Granularity of mtrr of block: */
+static u64 mtrr_gran_size __initdata;
+
+static int __init parse_mtrr_gran_size_opt(char *p)
+{
+ if (!p)
+ return -EINVAL;
+ mtrr_gran_size = memparse(p, &p);
+ return 0;
+}
+early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
+
+static unsigned long nr_mtrr_spare_reg __initdata =
+ CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
+
+static int __init parse_mtrr_spare_reg(char *arg)
+{
+ if (arg)
+ nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
+ return 0;
+}
+early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
+
+static int __init
+x86_setup_var_mtrrs(struct range *range, int nr_range,
+ u64 chunk_size, u64 gran_size)
+{
+ struct var_mtrr_state var_state;
+ int num_reg;
+ int i;
+
+ var_state.range_startk = 0;
+ var_state.range_sizek = 0;
+ var_state.reg = 0;
+ var_state.chunk_sizek = chunk_size >> 10;
+ var_state.gran_sizek = gran_size >> 10;
+
+ memset(range_state, 0, sizeof(range_state));
+
+ /* Write the range: */
+ for (i = 0; i < nr_range; i++) {
+ set_var_mtrr_range(&var_state, range[i].start,
+ range[i].end - range[i].start);
+ }
+
+ /* Write the last range: */
+ if (var_state.range_sizek != 0)
+ range_to_mtrr_with_hole(&var_state, 0, 0);
+
+ num_reg = var_state.reg;
+ /* Clear out the extra MTRR's: */
+ while (var_state.reg < num_var_ranges) {
+ save_var_mtrr(var_state.reg, 0, 0, 0);
+ var_state.reg++;
+ }
+
+ return num_reg;
+}
+
+struct mtrr_cleanup_result {
+ unsigned long gran_sizek;
+ unsigned long chunk_sizek;
+ unsigned long lose_cover_sizek;
+ unsigned int num_reg;
+ int bad;
+};
+
+/*
+ * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
+ * chunk size: gran_size, ..., 2G
+ * so we need (1+16)*8
+ */
+#define NUM_RESULT 136
+#define PSHIFT (PAGE_SHIFT - 10)
+
+static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
+static unsigned long __initdata min_loss_pfn[RANGE_NUM];
+
+static void __init print_out_mtrr_range_state(void)
+{
+ char start_factor = 'K', size_factor = 'K';
+ unsigned long start_base, size_base;
+ mtrr_type type;
+ int i;
+
+ for (i = 0; i < num_var_ranges; i++) {
+
+ size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
+ if (!size_base)
+ continue;
+
+ size_base = to_size_factor(size_base, &size_factor),
+ start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
+ start_base = to_size_factor(start_base, &start_factor),
+ type = range_state[i].type;
+
+ pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+ i, start_base, start_factor,
+ size_base, size_factor,
+ (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+ ((type == MTRR_TYPE_WRPROT) ? "WP" :
+ ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
+ );
+ }
+}
+
+static int __init mtrr_need_cleanup(void)
+{
+ int i;
+ mtrr_type type;
+ unsigned long size;
+ /* Extra one for all 0: */
+ int num[MTRR_NUM_TYPES + 1];
+
+ /* Check entries number: */
+ memset(num, 0, sizeof(num));
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ size = range_state[i].size_pfn;
+ if (type >= MTRR_NUM_TYPES)
+ continue;
+ if (!size)
+ type = MTRR_NUM_TYPES;
+ num[type]++;
+ }
+
+ /* Check if we got UC entries: */
+ if (!num[MTRR_TYPE_UNCACHABLE])
+ return 0;
+
+ /* Check if we only had WB and UC */
+ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+ num_var_ranges - num[MTRR_NUM_TYPES])
+ return 0;
+
+ return 1;
+}
+
+static unsigned long __initdata range_sums;
+
+static void __init
+mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
+ unsigned long x_remove_base,
+ unsigned long x_remove_size, int i)
+{
+ /*
+ * range_new should really be an automatic variable, but
+ * putting 4096 bytes on the stack is frowned upon, to put it
+ * mildly. It is safe to make it a static __initdata variable,
+ * since mtrr_calc_range_state is only called during init and
+ * there's no way it will call itself recursively.
+ */
+ static struct range range_new[RANGE_NUM] __initdata;
+ unsigned long range_sums_new;
+ int nr_range_new;
+ int num_reg;
+
+ /* Convert ranges to var ranges state: */
+ num_reg = x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
+
+ /* We got new setting in range_state, check it: */
+ memset(range_new, 0, sizeof(range_new));
+ nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
+ x_remove_base, x_remove_size);
+ range_sums_new = sum_ranges(range_new, nr_range_new);
+
+ result[i].chunk_sizek = chunk_size >> 10;
+ result[i].gran_sizek = gran_size >> 10;
+ result[i].num_reg = num_reg;
+
+ if (range_sums < range_sums_new) {
+ result[i].lose_cover_sizek = (range_sums_new - range_sums) << PSHIFT;
+ result[i].bad = 1;
+ } else {
+ result[i].lose_cover_sizek = (range_sums - range_sums_new) << PSHIFT;
+ }
+
+ /* Double check it: */
+ if (!result[i].bad && !result[i].lose_cover_sizek) {
+ if (nr_range_new != nr_range || memcmp(range, range_new, sizeof(range)))
+ result[i].bad = 1;
+ }
+
+ if (!result[i].bad && (range_sums - range_sums_new < min_loss_pfn[num_reg]))
+ min_loss_pfn[num_reg] = range_sums - range_sums_new;
+}
+
+static void __init mtrr_print_out_one_result(int i)
+{
+ unsigned long gran_base, chunk_base, lose_base;
+ char gran_factor, chunk_factor, lose_factor;
+
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor);
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor);
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor);
+
+ pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t",
+ result[i].bad ? "*BAD*" : " ",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ pr_cont("num_reg: %d \tlose cover RAM: %s%ld%c\n",
+ result[i].num_reg, result[i].bad ? "-" : "",
+ lose_base, lose_factor);
+}
+
+static int __init mtrr_search_optimal_index(void)
+{
+ int num_reg_good;
+ int index_good;
+ int i;
+
+ if (nr_mtrr_spare_reg >= num_var_ranges)
+ nr_mtrr_spare_reg = num_var_ranges - 1;
+
+ num_reg_good = -1;
+ for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
+ if (!min_loss_pfn[i])
+ num_reg_good = i;
+ }
+
+ index_good = -1;
+ if (num_reg_good != -1) {
+ for (i = 0; i < NUM_RESULT; i++) {
+ if (!result[i].bad &&
+ result[i].num_reg == num_reg_good &&
+ !result[i].lose_cover_sizek) {
+ index_good = i;
+ break;
+ }
+ }
+ }
+
+ return index_good;
+}
+
+int __init mtrr_cleanup(unsigned address_bits)
+{
+ unsigned long x_remove_base, x_remove_size;
+ unsigned long base, size, def, dummy;
+ u64 chunk_size, gran_size;
+ mtrr_type type;
+ int index_good;
+ int i;
+
+ if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
+ return 0;
+
+ rdmsr(MSR_MTRRdefType, def, dummy);
+ def &= 0xff;
+ if (def != MTRR_TYPE_UNCACHABLE)
+ return 0;
+
+ /* Get it and store it aside: */
+ memset(range_state, 0, sizeof(range_state));
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &base, &size, &type);
+ range_state[i].base_pfn = base;
+ range_state[i].size_pfn = size;
+ range_state[i].type = type;
+ }
+
+ /* Check if we need handle it and can handle it: */
+ if (!mtrr_need_cleanup())
+ return 0;
+
+ /* Print original var MTRRs at first, for debugging: */
+ pr_debug("original variable MTRRs\n");
+ print_out_mtrr_range_state();
+
+ memset(range, 0, sizeof(range));
+ x_remove_size = 0;
+ x_remove_base = 1 << (32 - PAGE_SHIFT);
+ if (mtrr_tom2)
+ x_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - x_remove_base;
+
+ /*
+ * [0, 1M) should always be covered by var mtrr with WB
+ * and fixed mtrrs should take effect before var mtrr for it:
+ */
+ nr_range = add_range_with_merge(range, RANGE_NUM, 0, 0,
+ 1ULL<<(20 - PAGE_SHIFT));
+ /* add from var mtrr at last */
+ nr_range = x86_get_mtrr_mem_range(range, nr_range,
+ x_remove_base, x_remove_size);
+
+ range_sums = sum_ranges(range, nr_range);
+ pr_info("total RAM covered: %ldM\n",
+ range_sums >> (20 - PAGE_SHIFT));
+
+ if (mtrr_chunk_size && mtrr_gran_size) {
+ i = 0;
+ mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
+ x_remove_base, x_remove_size, i);
+
+ mtrr_print_out_one_result(i);
+
+ if (!result[i].bad) {
+ set_var_mtrr_all(address_bits);
+ pr_debug("New variable MTRRs\n");
+ print_out_mtrr_range_state();
+ return 1;
+ }
+ pr_info("invalid mtrr_gran_size or mtrr_chunk_size, will find optimal one\n");
+ }
+
+ i = 0;
+ memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
+ memset(result, 0, sizeof(result));
+ for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
+
+ for (chunk_size = gran_size; chunk_size < (1ULL<<32);
+ chunk_size <<= 1) {
+
+ if (i >= NUM_RESULT)
+ continue;
+
+ mtrr_calc_range_state(chunk_size, gran_size,
+ x_remove_base, x_remove_size, i);
+ if (debug_print) {
+ mtrr_print_out_one_result(i);
+ pr_info("\n");
+ }
+
+ i++;
+ }
+ }
+
+ /* Try to find the optimal index: */
+ index_good = mtrr_search_optimal_index();
+
+ if (index_good != -1) {
+ pr_info("Found optimal setting for mtrr clean up\n");
+ i = index_good;
+ mtrr_print_out_one_result(i);
+
+ /* Convert ranges to var ranges state: */
+ chunk_size = result[i].chunk_sizek;
+ chunk_size <<= 10;
+ gran_size = result[i].gran_sizek;
+ gran_size <<= 10;
+ x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
+ set_var_mtrr_all(address_bits);
+ pr_debug("New variable MTRRs\n");
+ print_out_mtrr_range_state();
+ return 1;
+ } else {
+ /* print out all */
+ for (i = 0; i < NUM_RESULT; i++)
+ mtrr_print_out_one_result(i);
+ }
+
+ pr_info("mtrr_cleanup: can not find optimal value\n");
+ pr_info("please specify mtrr_gran_size/mtrr_chunk_size\n");
+
+ return 0;
+}
+#else
+int __init mtrr_cleanup(unsigned address_bits)
+{
+ return 0;
+}
+#endif
+
+static int disable_mtrr_trim;
+
+static int __init disable_mtrr_trim_setup(char *str)
+{
+ disable_mtrr_trim = 1;
+ return 0;
+}
+early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
+
+/*
+ * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
+ * for memory >4GB. Check for that here.
+ * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
+ * apply to are wrong, but so far we don't know of any such case in the wild.
+ */
+#define Tom2Enabled (1U << 21)
+#define Tom2ForceMemTypeWB (1U << 22)
+
+int __init amd_special_default_mtrr(void)
+{
+ u32 l, h;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return 0;
+ if (boot_cpu_data.x86 < 0xf)
+ return 0;
+ /* In case some hypervisor doesn't pass SYSCFG through: */
+ if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
+ return 0;
+ /*
+ * Memory between 4GB and top of mem is forced WB by this magic bit.
+ * Reserved before K8RevF, but should be zero there.
+ */
+ if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
+ (Tom2Enabled | Tom2ForceMemTypeWB))
+ return 1;
+ return 0;
+}
+
+static u64 __init
+real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn)
+{
+ u64 trim_start, trim_size;
+
+ trim_start = start_pfn;
+ trim_start <<= PAGE_SHIFT;
+
+ trim_size = limit_pfn;
+ trim_size <<= PAGE_SHIFT;
+ trim_size -= trim_start;
+
+ return e820__range_update(trim_start, trim_size, E820_TYPE_RAM, E820_TYPE_RESERVED);
+}
+
+/**
+ * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
+ * @end_pfn: ending page frame number
+ *
+ * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
+ * memory configurations. This routine checks that the highest MTRR matches
+ * the end of memory, to make sure the MTRRs having a write back type cover
+ * all of the memory the kernel is intending to use. If not, it'll trim any
+ * memory off the end by adjusting end_pfn, removing it from the kernel's
+ * allocation pools, warning the user with an obnoxious message.
+ */
+int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
+{
+ unsigned long i, base, size, highest_pfn = 0, def, dummy;
+ mtrr_type type;
+ u64 total_trim_size;
+ /* extra one for all 0 */
+ int num[MTRR_NUM_TYPES + 1];
+
+ /*
+ * Make sure we only trim uncachable memory on machines that
+ * support the Intel MTRR architecture:
+ */
+ if (!is_cpu(INTEL) || disable_mtrr_trim)
+ return 0;
+
+ rdmsr(MSR_MTRRdefType, def, dummy);
+ def &= 0xff;
+ if (def != MTRR_TYPE_UNCACHABLE)
+ return 0;
+
+ /* Get it and store it aside: */
+ memset(range_state, 0, sizeof(range_state));
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &base, &size, &type);
+ range_state[i].base_pfn = base;
+ range_state[i].size_pfn = size;
+ range_state[i].type = type;
+ }
+
+ /* Find highest cached pfn: */
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type != MTRR_TYPE_WRBACK)
+ continue;
+ base = range_state[i].base_pfn;
+ size = range_state[i].size_pfn;
+ if (highest_pfn < base + size)
+ highest_pfn = base + size;
+ }
+
+ /* kvm/qemu doesn't have mtrr set right, don't trim them all: */
+ if (!highest_pfn) {
+ pr_info("CPU MTRRs all blank - virtualized system.\n");
+ return 0;
+ }
+
+ /* Check entries number: */
+ memset(num, 0, sizeof(num));
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type >= MTRR_NUM_TYPES)
+ continue;
+ size = range_state[i].size_pfn;
+ if (!size)
+ type = MTRR_NUM_TYPES;
+ num[type]++;
+ }
+
+ /* No entry for WB? */
+ if (!num[MTRR_TYPE_WRBACK])
+ return 0;
+
+ /* Check if we only had WB and UC: */
+ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+ num_var_ranges - num[MTRR_NUM_TYPES])
+ return 0;
+
+ memset(range, 0, sizeof(range));
+ nr_range = 0;
+ if (mtrr_tom2) {
+ range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
+ range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT;
+ if (highest_pfn < range[nr_range].end)
+ highest_pfn = range[nr_range].end;
+ nr_range++;
+ }
+ nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
+
+ /* Check the head: */
+ total_trim_size = 0;
+ if (range[0].start)
+ total_trim_size += real_trim_memory(0, range[0].start);
+
+ /* Check the holes: */
+ for (i = 0; i < nr_range - 1; i++) {
+ if (range[i].end < range[i+1].start)
+ total_trim_size += real_trim_memory(range[i].end,
+ range[i+1].start);
+ }
+
+ /* Check the top: */
+ i = nr_range - 1;
+ if (range[i].end < end_pfn)
+ total_trim_size += real_trim_memory(range[i].end,
+ end_pfn);
+
+ if (total_trim_size) {
+ pr_warn("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n",
+ total_trim_size >> 20);
+
+ if (!changed_by_mtrr_cleanup)
+ WARN_ON(1);
+
+ pr_info("update e820 for mtrr\n");
+ e820__update_table_print();
+
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
new file mode 100644
index 000000000..4296c702a
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -0,0 +1,283 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+
+#include <asm/processor-cyrix.h>
+#include <asm/processor-flags.h>
+#include <asm/mtrr.h>
+#include <asm/msr.h>
+
+#include "mtrr.h"
+
+static void
+cyrix_get_arr(unsigned int reg, unsigned long *base,
+ unsigned long *size, mtrr_type * type)
+{
+ unsigned char arr, ccr3, rcr, shift;
+ unsigned long flags;
+
+ arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */
+
+ local_irq_save(flags);
+
+ ccr3 = getCx86(CX86_CCR3);
+ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
+ ((unsigned char *)base)[3] = getCx86(arr);
+ ((unsigned char *)base)[2] = getCx86(arr + 1);
+ ((unsigned char *)base)[1] = getCx86(arr + 2);
+ rcr = getCx86(CX86_RCR_BASE + reg);
+ setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
+
+ local_irq_restore(flags);
+
+ shift = ((unsigned char *) base)[1] & 0x0f;
+ *base >>= PAGE_SHIFT;
+
+ /*
+ * Power of two, at least 4K on ARR0-ARR6, 256K on ARR7
+ * Note: shift==0xf means 4G, this is unsupported.
+ */
+ if (shift)
+ *size = (reg < 7 ? 0x1UL : 0x40UL) << (shift - 1);
+ else
+ *size = 0;
+
+ /* Bit 0 is Cache Enable on ARR7, Cache Disable on ARR0-ARR6 */
+ if (reg < 7) {
+ switch (rcr) {
+ case 1:
+ *type = MTRR_TYPE_UNCACHABLE;
+ break;
+ case 8:
+ *type = MTRR_TYPE_WRBACK;
+ break;
+ case 9:
+ *type = MTRR_TYPE_WRCOMB;
+ break;
+ case 24:
+ default:
+ *type = MTRR_TYPE_WRTHROUGH;
+ break;
+ }
+ } else {
+ switch (rcr) {
+ case 0:
+ *type = MTRR_TYPE_UNCACHABLE;
+ break;
+ case 8:
+ *type = MTRR_TYPE_WRCOMB;
+ break;
+ case 9:
+ *type = MTRR_TYPE_WRBACK;
+ break;
+ case 25:
+ default:
+ *type = MTRR_TYPE_WRTHROUGH;
+ break;
+ }
+ }
+}
+
+/*
+ * cyrix_get_free_region - get a free ARR.
+ *
+ * @base: the starting (base) address of the region.
+ * @size: the size (in bytes) of the region.
+ *
+ * Returns: the index of the region on success, else -1 on error.
+*/
+static int
+cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
+{
+ unsigned long lbase, lsize;
+ mtrr_type ltype;
+ int i;
+
+ switch (replace_reg) {
+ case 7:
+ if (size < 0x40)
+ break;
+ case 6:
+ case 5:
+ case 4:
+ return replace_reg;
+ case 3:
+ case 2:
+ case 1:
+ case 0:
+ return replace_reg;
+ }
+ /* If we are to set up a region >32M then look at ARR7 immediately */
+ if (size > 0x2000) {
+ cyrix_get_arr(7, &lbase, &lsize, &ltype);
+ if (lsize == 0)
+ return 7;
+ /* Else try ARR0-ARR6 first */
+ } else {
+ for (i = 0; i < 7; i++) {
+ cyrix_get_arr(i, &lbase, &lsize, &ltype);
+ if (lsize == 0)
+ return i;
+ }
+ /*
+ * ARR0-ARR6 isn't free
+ * try ARR7 but its size must be at least 256K
+ */
+ cyrix_get_arr(i, &lbase, &lsize, &ltype);
+ if ((lsize == 0) && (size >= 0x40))
+ return i;
+ }
+ return -ENOSPC;
+}
+
+static u32 cr4, ccr3;
+
+static void prepare_set(void)
+{
+ u32 cr0;
+
+ /* Save value of CR4 and clear Page Global Enable (bit 7) */
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ cr4 = __read_cr4();
+ __write_cr4(cr4 & ~X86_CR4_PGE);
+ }
+
+ /*
+ * Disable and flush caches.
+ * Note that wbinvd flushes the TLBs as a side-effect
+ */
+ cr0 = read_cr0() | X86_CR0_CD;
+ wbinvd();
+ write_cr0(cr0);
+ wbinvd();
+
+ /* Cyrix ARRs - everything else was excluded at the top */
+ ccr3 = getCx86(CX86_CCR3);
+
+ /* Cyrix ARRs - everything else was excluded at the top */
+ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
+}
+
+static void post_set(void)
+{
+ /* Flush caches and TLBs */
+ wbinvd();
+
+ /* Cyrix ARRs - everything else was excluded at the top */
+ setCx86(CX86_CCR3, ccr3);
+
+ /* Enable caches */
+ write_cr0(read_cr0() & ~X86_CR0_CD);
+
+ /* Restore value of CR4 */
+ if (boot_cpu_has(X86_FEATURE_PGE))
+ __write_cr4(cr4);
+}
+
+static void cyrix_set_arr(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type)
+{
+ unsigned char arr, arr_type, arr_size;
+
+ arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */
+
+ /* count down from 32M (ARR0-ARR6) or from 2G (ARR7) */
+ if (reg >= 7)
+ size >>= 6;
+
+ size &= 0x7fff; /* make sure arr_size <= 14 */
+ for (arr_size = 0; size; arr_size++, size >>= 1)
+ ;
+
+ if (reg < 7) {
+ switch (type) {
+ case MTRR_TYPE_UNCACHABLE:
+ arr_type = 1;
+ break;
+ case MTRR_TYPE_WRCOMB:
+ arr_type = 9;
+ break;
+ case MTRR_TYPE_WRTHROUGH:
+ arr_type = 24;
+ break;
+ default:
+ arr_type = 8;
+ break;
+ }
+ } else {
+ switch (type) {
+ case MTRR_TYPE_UNCACHABLE:
+ arr_type = 0;
+ break;
+ case MTRR_TYPE_WRCOMB:
+ arr_type = 8;
+ break;
+ case MTRR_TYPE_WRTHROUGH:
+ arr_type = 25;
+ break;
+ default:
+ arr_type = 9;
+ break;
+ }
+ }
+
+ prepare_set();
+
+ base <<= PAGE_SHIFT;
+ setCx86(arr + 0, ((unsigned char *)&base)[3]);
+ setCx86(arr + 1, ((unsigned char *)&base)[2]);
+ setCx86(arr + 2, (((unsigned char *)&base)[1]) | arr_size);
+ setCx86(CX86_RCR_BASE + reg, arr_type);
+
+ post_set();
+}
+
+typedef struct {
+ unsigned long base;
+ unsigned long size;
+ mtrr_type type;
+} arr_state_t;
+
+static arr_state_t arr_state[8] = {
+ {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL},
+ {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}
+};
+
+static unsigned char ccr_state[7] = { 0, 0, 0, 0, 0, 0, 0 };
+
+static void cyrix_set_all(void)
+{
+ int i;
+
+ prepare_set();
+
+ /* the CCRs are not contiguous */
+ for (i = 0; i < 4; i++)
+ setCx86(CX86_CCR0 + i, ccr_state[i]);
+ for (; i < 7; i++)
+ setCx86(CX86_CCR4 + i, ccr_state[i]);
+
+ for (i = 0; i < 8; i++) {
+ cyrix_set_arr(i, arr_state[i].base,
+ arr_state[i].size, arr_state[i].type);
+ }
+
+ post_set();
+}
+
+static const struct mtrr_ops cyrix_mtrr_ops = {
+ .vendor = X86_VENDOR_CYRIX,
+ .set_all = cyrix_set_all,
+ .set = cyrix_set_arr,
+ .get = cyrix_get_arr,
+ .get_free_region = cyrix_get_free_region,
+ .validate_add_page = generic_validate_add_page,
+ .have_wrcomb = positive_have_wrcomb,
+};
+
+int __init cyrix_init_mtrr(void)
+{
+ set_mtrr_ops(&cyrix_mtrr_ops);
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
new file mode 100644
index 000000000..9436f3452
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -0,0 +1,913 @@
+/*
+ * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
+ * because MTRRs can span up to 40 bits (36bits on most modern x86)
+ */
+#define DEBUG
+
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+
+#include <asm/processor-flags.h>
+#include <asm/cpufeature.h>
+#include <asm/tlbflush.h>
+#include <asm/mtrr.h>
+#include <asm/msr.h>
+#include <asm/pat.h>
+
+#include "mtrr.h"
+
+struct fixed_range_block {
+ int base_msr; /* start address of an MTRR block */
+ int ranges; /* number of MTRRs in this block */
+};
+
+static struct fixed_range_block fixed_range_blocks[] = {
+ { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */
+ { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */
+ { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */
+ {}
+};
+
+static unsigned long smp_changes_mask;
+static int mtrr_state_set;
+u64 mtrr_tom2;
+
+struct mtrr_state_type mtrr_state;
+EXPORT_SYMBOL_GPL(mtrr_state);
+
+/*
+ * BIOS is expected to clear MtrrFixDramModEn bit, see for example
+ * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
+ * Opteron Processors" (26094 Rev. 3.30 February 2006), section
+ * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set
+ * to 1 during BIOS initialization of the fixed MTRRs, then cleared to
+ * 0 for operation."
+ */
+static inline void k8_check_syscfg_dram_mod_en(void)
+{
+ u32 lo, hi;
+
+ if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
+ (boot_cpu_data.x86 >= 0x0f)))
+ return;
+
+ rdmsr(MSR_K8_SYSCFG, lo, hi);
+ if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
+ pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
+ " not cleared by BIOS, clearing this bit\n",
+ smp_processor_id());
+ lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
+ mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi);
+ }
+}
+
+/* Get the size of contiguous MTRR range */
+static u64 get_mtrr_size(u64 mask)
+{
+ u64 size;
+
+ mask >>= PAGE_SHIFT;
+ mask |= size_or_mask;
+ size = -mask;
+ size <<= PAGE_SHIFT;
+ return size;
+}
+
+/*
+ * Check and return the effective type for MTRR-MTRR type overlap.
+ * Returns 1 if the effective type is UNCACHEABLE, else returns 0
+ */
+static int check_type_overlap(u8 *prev, u8 *curr)
+{
+ if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) {
+ *prev = MTRR_TYPE_UNCACHABLE;
+ *curr = MTRR_TYPE_UNCACHABLE;
+ return 1;
+ }
+
+ if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) ||
+ (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) {
+ *prev = MTRR_TYPE_WRTHROUGH;
+ *curr = MTRR_TYPE_WRTHROUGH;
+ }
+
+ if (*prev != *curr) {
+ *prev = MTRR_TYPE_UNCACHABLE;
+ *curr = MTRR_TYPE_UNCACHABLE;
+ return 1;
+ }
+
+ return 0;
+}
+
+/**
+ * mtrr_type_lookup_fixed - look up memory type in MTRR fixed entries
+ *
+ * Return the MTRR fixed memory type of 'start'.
+ *
+ * MTRR fixed entries are divided into the following ways:
+ * 0x00000 - 0x7FFFF : This range is divided into eight 64KB sub-ranges
+ * 0x80000 - 0xBFFFF : This range is divided into sixteen 16KB sub-ranges
+ * 0xC0000 - 0xFFFFF : This range is divided into sixty-four 4KB sub-ranges
+ *
+ * Return Values:
+ * MTRR_TYPE_(type) - Matched memory type
+ * MTRR_TYPE_INVALID - Unmatched
+ */
+static u8 mtrr_type_lookup_fixed(u64 start, u64 end)
+{
+ int idx;
+
+ if (start >= 0x100000)
+ return MTRR_TYPE_INVALID;
+
+ /* 0x0 - 0x7FFFF */
+ if (start < 0x80000) {
+ idx = 0;
+ idx += (start >> 16);
+ return mtrr_state.fixed_ranges[idx];
+ /* 0x80000 - 0xBFFFF */
+ } else if (start < 0xC0000) {
+ idx = 1 * 8;
+ idx += ((start - 0x80000) >> 14);
+ return mtrr_state.fixed_ranges[idx];
+ }
+
+ /* 0xC0000 - 0xFFFFF */
+ idx = 3 * 8;
+ idx += ((start - 0xC0000) >> 12);
+ return mtrr_state.fixed_ranges[idx];
+}
+
+/**
+ * mtrr_type_lookup_variable - look up memory type in MTRR variable entries
+ *
+ * Return Value:
+ * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched)
+ *
+ * Output Arguments:
+ * repeat - Set to 1 when [start:end] spanned across MTRR range and type
+ * returned corresponds only to [start:*partial_end]. Caller has
+ * to lookup again for [*partial_end:end].
+ *
+ * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the
+ * region is fully covered by a single MTRR entry or the default
+ * type.
+ */
+static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end,
+ int *repeat, u8 *uniform)
+{
+ int i;
+ u64 base, mask;
+ u8 prev_match, curr_match;
+
+ *repeat = 0;
+ *uniform = 1;
+
+ prev_match = MTRR_TYPE_INVALID;
+ for (i = 0; i < num_var_ranges; ++i) {
+ unsigned short start_state, end_state, inclusive;
+
+ if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11)))
+ continue;
+
+ base = (((u64)mtrr_state.var_ranges[i].base_hi) << 32) +
+ (mtrr_state.var_ranges[i].base_lo & PAGE_MASK);
+ mask = (((u64)mtrr_state.var_ranges[i].mask_hi) << 32) +
+ (mtrr_state.var_ranges[i].mask_lo & PAGE_MASK);
+
+ start_state = ((start & mask) == (base & mask));
+ end_state = ((end & mask) == (base & mask));
+ inclusive = ((start < base) && (end > base));
+
+ if ((start_state != end_state) || inclusive) {
+ /*
+ * We have start:end spanning across an MTRR.
+ * We split the region into either
+ *
+ * - start_state:1
+ * (start:mtrr_end)(mtrr_end:end)
+ * - end_state:1
+ * (start:mtrr_start)(mtrr_start:end)
+ * - inclusive:1
+ * (start:mtrr_start)(mtrr_start:mtrr_end)(mtrr_end:end)
+ *
+ * depending on kind of overlap.
+ *
+ * Return the type of the first region and a pointer
+ * to the start of next region so that caller will be
+ * advised to lookup again after having adjusted start
+ * and end.
+ *
+ * Note: This way we handle overlaps with multiple
+ * entries and the default type properly.
+ */
+ if (start_state)
+ *partial_end = base + get_mtrr_size(mask);
+ else
+ *partial_end = base;
+
+ if (unlikely(*partial_end <= start)) {
+ WARN_ON(1);
+ *partial_end = start + PAGE_SIZE;
+ }
+
+ end = *partial_end - 1; /* end is inclusive */
+ *repeat = 1;
+ *uniform = 0;
+ }
+
+ if ((start & mask) != (base & mask))
+ continue;
+
+ curr_match = mtrr_state.var_ranges[i].base_lo & 0xff;
+ if (prev_match == MTRR_TYPE_INVALID) {
+ prev_match = curr_match;
+ continue;
+ }
+
+ *uniform = 0;
+ if (check_type_overlap(&prev_match, &curr_match))
+ return curr_match;
+ }
+
+ if (prev_match != MTRR_TYPE_INVALID)
+ return prev_match;
+
+ return mtrr_state.def_type;
+}
+
+/**
+ * mtrr_type_lookup - look up memory type in MTRR
+ *
+ * Return Values:
+ * MTRR_TYPE_(type) - The effective MTRR type for the region
+ * MTRR_TYPE_INVALID - MTRR is disabled
+ *
+ * Output Argument:
+ * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the
+ * region is fully covered by a single MTRR entry or the default
+ * type.
+ */
+u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
+{
+ u8 type, prev_type, is_uniform = 1, dummy;
+ int repeat;
+ u64 partial_end;
+
+ /* Make end inclusive instead of exclusive */
+ end--;
+
+ if (!mtrr_state_set)
+ return MTRR_TYPE_INVALID;
+
+ if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED))
+ return MTRR_TYPE_INVALID;
+
+ /*
+ * Look up the fixed ranges first, which take priority over
+ * the variable ranges.
+ */
+ if ((start < 0x100000) &&
+ (mtrr_state.have_fixed) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
+ is_uniform = 0;
+ type = mtrr_type_lookup_fixed(start, end);
+ goto out;
+ }
+
+ /*
+ * Look up the variable ranges. Look of multiple ranges matching
+ * this address and pick type as per MTRR precedence.
+ */
+ type = mtrr_type_lookup_variable(start, end, &partial_end,
+ &repeat, &is_uniform);
+
+ /*
+ * Common path is with repeat = 0.
+ * However, we can have cases where [start:end] spans across some
+ * MTRR ranges and/or the default type. Do repeated lookups for
+ * that case here.
+ */
+ while (repeat) {
+ prev_type = type;
+ start = partial_end;
+ is_uniform = 0;
+ type = mtrr_type_lookup_variable(start, end, &partial_end,
+ &repeat, &dummy);
+
+ if (check_type_overlap(&prev_type, &type))
+ goto out;
+ }
+
+ if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2))
+ type = MTRR_TYPE_WRBACK;
+
+out:
+ *uniform = is_uniform;
+ return type;
+}
+
+/* Get the MSR pair relating to a var range */
+static void
+get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
+{
+ rdmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
+ rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
+}
+
+/* Fill the MSR pair relating to a var range */
+void fill_mtrr_var_range(unsigned int index,
+ u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi)
+{
+ struct mtrr_var_range *vr;
+
+ vr = mtrr_state.var_ranges;
+
+ vr[index].base_lo = base_lo;
+ vr[index].base_hi = base_hi;
+ vr[index].mask_lo = mask_lo;
+ vr[index].mask_hi = mask_hi;
+}
+
+static void get_fixed_ranges(mtrr_type *frs)
+{
+ unsigned int *p = (unsigned int *)frs;
+ int i;
+
+ k8_check_syscfg_dram_mod_en();
+
+ rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]);
+
+ for (i = 0; i < 2; i++)
+ rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);
+ for (i = 0; i < 8; i++)
+ rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]);
+}
+
+void mtrr_save_fixed_ranges(void *info)
+{
+ if (boot_cpu_has(X86_FEATURE_MTRR))
+ get_fixed_ranges(mtrr_state.fixed_ranges);
+}
+
+static unsigned __initdata last_fixed_start;
+static unsigned __initdata last_fixed_end;
+static mtrr_type __initdata last_fixed_type;
+
+static void __init print_fixed_last(void)
+{
+ if (!last_fixed_end)
+ return;
+
+ pr_debug(" %05X-%05X %s\n", last_fixed_start,
+ last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
+
+ last_fixed_end = 0;
+}
+
+static void __init update_fixed_last(unsigned base, unsigned end,
+ mtrr_type type)
+{
+ last_fixed_start = base;
+ last_fixed_end = end;
+ last_fixed_type = type;
+}
+
+static void __init
+print_fixed(unsigned base, unsigned step, const mtrr_type *types)
+{
+ unsigned i;
+
+ for (i = 0; i < 8; ++i, ++types, base += step) {
+ if (last_fixed_end == 0) {
+ update_fixed_last(base, base + step, *types);
+ continue;
+ }
+ if (last_fixed_end == base && last_fixed_type == *types) {
+ last_fixed_end = base + step;
+ continue;
+ }
+ /* new segments: gap or different type */
+ print_fixed_last();
+ update_fixed_last(base, base + step, *types);
+ }
+}
+
+static void prepare_set(void);
+static void post_set(void);
+
+static void __init print_mtrr_state(void)
+{
+ unsigned int i;
+ int high_width;
+
+ pr_debug("MTRR default type: %s\n",
+ mtrr_attrib_to_str(mtrr_state.def_type));
+ if (mtrr_state.have_fixed) {
+ pr_debug("MTRR fixed ranges %sabled:\n",
+ ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ?
+ "en" : "dis");
+ print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
+ for (i = 0; i < 2; ++i)
+ print_fixed(0x80000 + i * 0x20000, 0x04000,
+ mtrr_state.fixed_ranges + (i + 1) * 8);
+ for (i = 0; i < 8; ++i)
+ print_fixed(0xC0000 + i * 0x08000, 0x01000,
+ mtrr_state.fixed_ranges + (i + 3) * 8);
+
+ /* tail */
+ print_fixed_last();
+ }
+ pr_debug("MTRR variable ranges %sabled:\n",
+ mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis");
+ high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4;
+
+ for (i = 0; i < num_var_ranges; ++i) {
+ if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
+ pr_debug(" %u base %0*X%05X000 mask %0*X%05X000 %s\n",
+ i,
+ high_width,
+ mtrr_state.var_ranges[i].base_hi,
+ mtrr_state.var_ranges[i].base_lo >> 12,
+ high_width,
+ mtrr_state.var_ranges[i].mask_hi,
+ mtrr_state.var_ranges[i].mask_lo >> 12,
+ mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
+ else
+ pr_debug(" %u disabled\n", i);
+ }
+ if (mtrr_tom2)
+ pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20);
+}
+
+/* PAT setup for BP. We need to go through sync steps here */
+void __init mtrr_bp_pat_init(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ prepare_set();
+
+ pat_init();
+
+ post_set();
+ local_irq_restore(flags);
+}
+
+/* Grab all of the MTRR state for this CPU into *state */
+bool __init get_mtrr_state(void)
+{
+ struct mtrr_var_range *vrs;
+ unsigned lo, dummy;
+ unsigned int i;
+
+ vrs = mtrr_state.var_ranges;
+
+ rdmsr(MSR_MTRRcap, lo, dummy);
+ mtrr_state.have_fixed = (lo >> 8) & 1;
+
+ for (i = 0; i < num_var_ranges; i++)
+ get_mtrr_var_range(i, &vrs[i]);
+ if (mtrr_state.have_fixed)
+ get_fixed_ranges(mtrr_state.fixed_ranges);
+
+ rdmsr(MSR_MTRRdefType, lo, dummy);
+ mtrr_state.def_type = (lo & 0xff);
+ mtrr_state.enabled = (lo & 0xc00) >> 10;
+
+ if (amd_special_default_mtrr()) {
+ unsigned low, high;
+
+ /* TOP_MEM2 */
+ rdmsr(MSR_K8_TOP_MEM2, low, high);
+ mtrr_tom2 = high;
+ mtrr_tom2 <<= 32;
+ mtrr_tom2 |= low;
+ mtrr_tom2 &= 0xffffff800000ULL;
+ }
+
+ print_mtrr_state();
+
+ mtrr_state_set = 1;
+
+ return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED);
+}
+
+/* Some BIOS's are messed up and don't set all MTRRs the same! */
+void __init mtrr_state_warn(void)
+{
+ unsigned long mask = smp_changes_mask;
+
+ if (!mask)
+ return;
+ if (mask & MTRR_CHANGE_MASK_FIXED)
+ pr_warn("mtrr: your CPUs had inconsistent fixed MTRR settings\n");
+ if (mask & MTRR_CHANGE_MASK_VARIABLE)
+ pr_warn("mtrr: your CPUs had inconsistent variable MTRR settings\n");
+ if (mask & MTRR_CHANGE_MASK_DEFTYPE)
+ pr_warn("mtrr: your CPUs had inconsistent MTRRdefType settings\n");
+
+ pr_info("mtrr: probably your BIOS does not setup all CPUs.\n");
+ pr_info("mtrr: corrected configuration.\n");
+}
+
+/*
+ * Doesn't attempt to pass an error out to MTRR users
+ * because it's quite complicated in some cases and probably not
+ * worth it because the best error handling is to ignore it.
+ */
+void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
+{
+ if (wrmsr_safe(msr, a, b) < 0) {
+ pr_err("MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
+ smp_processor_id(), msr, a, b);
+ }
+}
+
+/**
+ * set_fixed_range - checks & updates a fixed-range MTRR if it
+ * differs from the value it should have
+ * @msr: MSR address of the MTTR which should be checked and updated
+ * @changed: pointer which indicates whether the MTRR needed to be changed
+ * @msrwords: pointer to the MSR values which the MSR should have
+ */
+static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
+{
+ unsigned lo, hi;
+
+ rdmsr(msr, lo, hi);
+
+ if (lo != msrwords[0] || hi != msrwords[1]) {
+ mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
+ *changed = true;
+ }
+}
+
+/**
+ * generic_get_free_region - Get a free MTRR.
+ * @base: The starting (base) address of the region.
+ * @size: The size (in bytes) of the region.
+ * @replace_reg: mtrr index to be replaced; set to invalid value if none.
+ *
+ * Returns: The index of the region on success, else negative on error.
+ */
+int
+generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
+{
+ unsigned long lbase, lsize;
+ mtrr_type ltype;
+ int i, max;
+
+ max = num_var_ranges;
+ if (replace_reg >= 0 && replace_reg < max)
+ return replace_reg;
+
+ for (i = 0; i < max; ++i) {
+ mtrr_if->get(i, &lbase, &lsize, &ltype);
+ if (lsize == 0)
+ return i;
+ }
+
+ return -ENOSPC;
+}
+
+static void generic_get_mtrr(unsigned int reg, unsigned long *base,
+ unsigned long *size, mtrr_type *type)
+{
+ u32 mask_lo, mask_hi, base_lo, base_hi;
+ unsigned int hi;
+ u64 tmp, mask;
+
+ /*
+ * get_mtrr doesn't need to update mtrr_state, also it could be called
+ * from any cpu, so try to print it out directly.
+ */
+ get_cpu();
+
+ rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
+
+ if ((mask_lo & 0x800) == 0) {
+ /* Invalid (i.e. free) range */
+ *base = 0;
+ *size = 0;
+ *type = 0;
+ goto out_put_cpu;
+ }
+
+ rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
+
+ /* Work out the shifted address mask: */
+ tmp = (u64)mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
+ mask = size_or_mask | tmp;
+
+ /* Expand tmp with high bits to all 1s: */
+ hi = fls64(tmp);
+ if (hi > 0) {
+ tmp |= ~((1ULL<<(hi - 1)) - 1);
+
+ if (tmp != mask) {
+ pr_warn("mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+ mask = tmp;
+ }
+ }
+
+ /*
+ * This works correctly if size is a power of two, i.e. a
+ * contiguous range:
+ */
+ *size = -mask;
+ *base = (u64)base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
+ *type = base_lo & 0xff;
+
+out_put_cpu:
+ put_cpu();
+}
+
+/**
+ * set_fixed_ranges - checks & updates the fixed-range MTRRs if they
+ * differ from the saved set
+ * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges()
+ */
+static int set_fixed_ranges(mtrr_type *frs)
+{
+ unsigned long long *saved = (unsigned long long *)frs;
+ bool changed = false;
+ int block = -1, range;
+
+ k8_check_syscfg_dram_mod_en();
+
+ while (fixed_range_blocks[++block].ranges) {
+ for (range = 0; range < fixed_range_blocks[block].ranges; range++)
+ set_fixed_range(fixed_range_blocks[block].base_msr + range,
+ &changed, (unsigned int *)saved++);
+ }
+
+ return changed;
+}
+
+/*
+ * Set the MSR pair relating to a var range.
+ * Returns true if changes are made.
+ */
+static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
+{
+ unsigned int lo, hi;
+ bool changed = false;
+
+ rdmsr(MTRRphysBase_MSR(index), lo, hi);
+ if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL)
+ || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
+ (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
+
+ mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
+ changed = true;
+ }
+
+ rdmsr(MTRRphysMask_MSR(index), lo, hi);
+
+ if ((vr->mask_lo & 0xfffff800UL) != (lo & 0xfffff800UL)
+ || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
+ (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
+ mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
+ changed = true;
+ }
+ return changed;
+}
+
+static u32 deftype_lo, deftype_hi;
+
+/**
+ * set_mtrr_state - Set the MTRR state for this CPU.
+ *
+ * NOTE: The CPU must already be in a safe state for MTRR changes.
+ * RETURNS: 0 if no changes made, else a mask indicating what was changed.
+ */
+static unsigned long set_mtrr_state(void)
+{
+ unsigned long change_mask = 0;
+ unsigned int i;
+
+ for (i = 0; i < num_var_ranges; i++) {
+ if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i]))
+ change_mask |= MTRR_CHANGE_MASK_VARIABLE;
+ }
+
+ if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges))
+ change_mask |= MTRR_CHANGE_MASK_FIXED;
+
+ /*
+ * Set_mtrr_restore restores the old value of MTRRdefType,
+ * so to set it we fiddle with the saved value:
+ */
+ if ((deftype_lo & 0xff) != mtrr_state.def_type
+ || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
+
+ deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type |
+ (mtrr_state.enabled << 10);
+ change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
+ }
+
+ return change_mask;
+}
+
+
+static unsigned long cr4;
+static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
+
+/*
+ * Since we are disabling the cache don't allow any interrupts,
+ * they would run extremely slow and would only increase the pain.
+ *
+ * The caller must ensure that local interrupts are disabled and
+ * are reenabled after post_set() has been called.
+ */
+static void prepare_set(void) __acquires(set_atomicity_lock)
+{
+ unsigned long cr0;
+
+ /*
+ * Note that this is not ideal
+ * since the cache is only flushed/disabled for this CPU while the
+ * MTRRs are changed, but changing this requires more invasive
+ * changes to the way the kernel boots
+ */
+
+ raw_spin_lock(&set_atomicity_lock);
+
+ /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
+ cr0 = read_cr0() | X86_CR0_CD;
+ write_cr0(cr0);
+ wbinvd();
+
+ /* Save value of CR4 and clear Page Global Enable (bit 7) */
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ cr4 = __read_cr4();
+ __write_cr4(cr4 & ~X86_CR4_PGE);
+ }
+
+ /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ __flush_tlb();
+
+ /* Save MTRR state */
+ rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
+
+ /* Disable MTRRs, and set the default type to uncached */
+ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
+ wbinvd();
+}
+
+static void post_set(void) __releases(set_atomicity_lock)
+{
+ /* Flush TLBs (no need to flush caches - they are disabled) */
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ __flush_tlb();
+
+ /* Intel (P6) standard MTRRs */
+ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
+
+ /* Enable caches */
+ write_cr0(read_cr0() & ~X86_CR0_CD);
+
+ /* Restore value of CR4 */
+ if (boot_cpu_has(X86_FEATURE_PGE))
+ __write_cr4(cr4);
+ raw_spin_unlock(&set_atomicity_lock);
+}
+
+static void generic_set_all(void)
+{
+ unsigned long mask, count;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ prepare_set();
+
+ /* Actually set the state */
+ mask = set_mtrr_state();
+
+ /* also set PAT */
+ pat_init();
+
+ post_set();
+ local_irq_restore(flags);
+
+ /* Use the atomic bitops to update the global mask */
+ for (count = 0; count < sizeof mask * 8; ++count) {
+ if (mask & 0x01)
+ set_bit(count, &smp_changes_mask);
+ mask >>= 1;
+ }
+
+}
+
+/**
+ * generic_set_mtrr - set variable MTRR register on the local CPU.
+ *
+ * @reg: The register to set.
+ * @base: The base address of the region.
+ * @size: The size of the region. If this is 0 the region is disabled.
+ * @type: The type of the region.
+ *
+ * Returns nothing.
+ */
+static void generic_set_mtrr(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type)
+{
+ unsigned long flags;
+ struct mtrr_var_range *vr;
+
+ vr = &mtrr_state.var_ranges[reg];
+
+ local_irq_save(flags);
+ prepare_set();
+
+ if (size == 0) {
+ /*
+ * The invalid bit is kept in the mask, so we simply
+ * clear the relevant mask register to disable a range.
+ */
+ mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0);
+ memset(vr, 0, sizeof(struct mtrr_var_range));
+ } else {
+ vr->base_lo = base << PAGE_SHIFT | type;
+ vr->base_hi = (base & size_and_mask) >> (32 - PAGE_SHIFT);
+ vr->mask_lo = -size << PAGE_SHIFT | 0x800;
+ vr->mask_hi = (-size & size_and_mask) >> (32 - PAGE_SHIFT);
+
+ mtrr_wrmsr(MTRRphysBase_MSR(reg), vr->base_lo, vr->base_hi);
+ mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi);
+ }
+
+ post_set();
+ local_irq_restore(flags);
+}
+
+int generic_validate_add_page(unsigned long base, unsigned long size,
+ unsigned int type)
+{
+ unsigned long lbase, last;
+
+ /*
+ * For Intel PPro stepping <= 7
+ * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF
+ */
+ if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 &&
+ boot_cpu_data.x86_model == 1 &&
+ boot_cpu_data.x86_stepping <= 7) {
+ if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
+ pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
+ return -EINVAL;
+ }
+ if (!(base + size < 0x70000 || base > 0x7003F) &&
+ (type == MTRR_TYPE_WRCOMB
+ || type == MTRR_TYPE_WRBACK)) {
+ pr_warn("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Check upper bits of base and last are equal and lower bits are 0
+ * for base and 1 for last
+ */
+ last = base + size - 1;
+ for (lbase = base; !(lbase & 1) && (last & 1);
+ lbase = lbase >> 1, last = last >> 1)
+ ;
+ if (lbase != last) {
+ pr_warn("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int generic_have_wrcomb(void)
+{
+ unsigned long config, dummy;
+ rdmsr(MSR_MTRRcap, config, dummy);
+ return config & (1 << 10);
+}
+
+int positive_have_wrcomb(void)
+{
+ return 1;
+}
+
+/*
+ * Generic structure...
+ */
+const struct mtrr_ops generic_mtrr_ops = {
+ .use_intel_if = 1,
+ .set_all = generic_set_all,
+ .get = generic_get_mtrr,
+ .get_free_region = generic_get_free_region,
+ .set = generic_set_mtrr,
+ .validate_add_page = generic_validate_add_page,
+ .have_wrcomb = generic_have_wrcomb,
+};
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
new file mode 100644
index 000000000..254683b50
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -0,0 +1,443 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/capability.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/proc_fs.h>
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+
+#define LINE_SIZE 80
+
+#include <asm/mtrr.h>
+
+#include "mtrr.h"
+
+#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
+
+static const char *const mtrr_strings[MTRR_NUM_TYPES] =
+{
+ "uncachable", /* 0 */
+ "write-combining", /* 1 */
+ "?", /* 2 */
+ "?", /* 3 */
+ "write-through", /* 4 */
+ "write-protect", /* 5 */
+ "write-back", /* 6 */
+};
+
+const char *mtrr_attrib_to_str(int x)
+{
+ return (x <= 6) ? mtrr_strings[x] : "?";
+}
+
+#ifdef CONFIG_PROC_FS
+
+static int
+mtrr_file_add(unsigned long base, unsigned long size,
+ unsigned int type, bool increment, struct file *file, int page)
+{
+ unsigned int *fcount = FILE_FCOUNT(file);
+ int reg, max;
+
+ max = num_var_ranges;
+ if (fcount == NULL) {
+ fcount = kcalloc(max, sizeof(*fcount), GFP_KERNEL);
+ if (!fcount)
+ return -ENOMEM;
+ FILE_FCOUNT(file) = fcount;
+ }
+ if (!page) {
+ if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1)))
+ return -EINVAL;
+ base >>= PAGE_SHIFT;
+ size >>= PAGE_SHIFT;
+ }
+ reg = mtrr_add_page(base, size, type, true);
+ if (reg >= 0)
+ ++fcount[reg];
+ return reg;
+}
+
+static int
+mtrr_file_del(unsigned long base, unsigned long size,
+ struct file *file, int page)
+{
+ unsigned int *fcount = FILE_FCOUNT(file);
+ int reg;
+
+ if (!page) {
+ if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1)))
+ return -EINVAL;
+ base >>= PAGE_SHIFT;
+ size >>= PAGE_SHIFT;
+ }
+ reg = mtrr_del_page(-1, base, size);
+ if (reg < 0)
+ return reg;
+ if (fcount == NULL)
+ return reg;
+ if (fcount[reg] < 1)
+ return -EINVAL;
+ --fcount[reg];
+ return reg;
+}
+
+/*
+ * seq_file can seek but we ignore it.
+ *
+ * Format of control line:
+ * "base=%Lx size=%Lx type=%s" or "disable=%d"
+ */
+static ssize_t
+mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
+{
+ int i, err;
+ unsigned long reg;
+ unsigned long long base, size;
+ char *ptr;
+ char line[LINE_SIZE];
+ int length;
+ size_t linelen;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ memset(line, 0, LINE_SIZE);
+
+ len = min_t(size_t, len, LINE_SIZE - 1);
+ length = strncpy_from_user(line, buf, len);
+ if (length < 0)
+ return length;
+
+ linelen = strlen(line);
+ ptr = line + linelen - 1;
+ if (linelen && *ptr == '\n')
+ *ptr = '\0';
+
+ if (!strncmp(line, "disable=", 8)) {
+ reg = simple_strtoul(line + 8, &ptr, 0);
+ err = mtrr_del_page(reg, 0, 0);
+ if (err < 0)
+ return err;
+ return len;
+ }
+
+ if (strncmp(line, "base=", 5))
+ return -EINVAL;
+
+ base = simple_strtoull(line + 5, &ptr, 0);
+ ptr = skip_spaces(ptr);
+
+ if (strncmp(ptr, "size=", 5))
+ return -EINVAL;
+
+ size = simple_strtoull(ptr + 5, &ptr, 0);
+ if ((base & 0xfff) || (size & 0xfff))
+ return -EINVAL;
+ ptr = skip_spaces(ptr);
+
+ if (strncmp(ptr, "type=", 5))
+ return -EINVAL;
+ ptr = skip_spaces(ptr + 5);
+
+ i = match_string(mtrr_strings, MTRR_NUM_TYPES, ptr);
+ if (i < 0)
+ return i;
+
+ base >>= PAGE_SHIFT;
+ size >>= PAGE_SHIFT;
+ err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true);
+ if (err < 0)
+ return err;
+ return len;
+}
+
+static long
+mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
+{
+ int err = 0;
+ mtrr_type type;
+ unsigned long base;
+ unsigned long size;
+ struct mtrr_sentry sentry;
+ struct mtrr_gentry gentry;
+ void __user *arg = (void __user *) __arg;
+
+ memset(&gentry, 0, sizeof(gentry));
+
+ switch (cmd) {
+ case MTRRIOC_ADD_ENTRY:
+ case MTRRIOC_SET_ENTRY:
+ case MTRRIOC_DEL_ENTRY:
+ case MTRRIOC_KILL_ENTRY:
+ case MTRRIOC_ADD_PAGE_ENTRY:
+ case MTRRIOC_SET_PAGE_ENTRY:
+ case MTRRIOC_DEL_PAGE_ENTRY:
+ case MTRRIOC_KILL_PAGE_ENTRY:
+ if (copy_from_user(&sentry, arg, sizeof sentry))
+ return -EFAULT;
+ break;
+ case MTRRIOC_GET_ENTRY:
+ case MTRRIOC_GET_PAGE_ENTRY:
+ if (copy_from_user(&gentry, arg, sizeof gentry))
+ return -EFAULT;
+ break;
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_ADD_ENTRY:
+ case MTRRIOC32_SET_ENTRY:
+ case MTRRIOC32_DEL_ENTRY:
+ case MTRRIOC32_KILL_ENTRY:
+ case MTRRIOC32_ADD_PAGE_ENTRY:
+ case MTRRIOC32_SET_PAGE_ENTRY:
+ case MTRRIOC32_DEL_PAGE_ENTRY:
+ case MTRRIOC32_KILL_PAGE_ENTRY: {
+ struct mtrr_sentry32 __user *s32;
+
+ s32 = (struct mtrr_sentry32 __user *)__arg;
+ err = get_user(sentry.base, &s32->base);
+ err |= get_user(sentry.size, &s32->size);
+ err |= get_user(sentry.type, &s32->type);
+ if (err)
+ return err;
+ break;
+ }
+ case MTRRIOC32_GET_ENTRY:
+ case MTRRIOC32_GET_PAGE_ENTRY: {
+ struct mtrr_gentry32 __user *g32;
+
+ g32 = (struct mtrr_gentry32 __user *)__arg;
+ err = get_user(gentry.regnum, &g32->regnum);
+ err |= get_user(gentry.base, &g32->base);
+ err |= get_user(gentry.size, &g32->size);
+ err |= get_user(gentry.type, &g32->type);
+ if (err)
+ return err;
+ break;
+ }
+#endif
+ }
+
+ switch (cmd) {
+ default:
+ return -ENOTTY;
+ case MTRRIOC_ADD_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_ADD_ENTRY:
+#endif
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err =
+ mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
+ file, 0);
+ break;
+ case MTRRIOC_SET_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_SET_ENTRY:
+#endif
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err = mtrr_add(sentry.base, sentry.size, sentry.type, false);
+ break;
+ case MTRRIOC_DEL_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_DEL_ENTRY:
+#endif
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err = mtrr_file_del(sentry.base, sentry.size, file, 0);
+ break;
+ case MTRRIOC_KILL_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_KILL_ENTRY:
+#endif
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err = mtrr_del(-1, sentry.base, sentry.size);
+ break;
+ case MTRRIOC_GET_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_GET_ENTRY:
+#endif
+ if (gentry.regnum >= num_var_ranges)
+ return -EINVAL;
+ mtrr_if->get(gentry.regnum, &base, &size, &type);
+
+ /* Hide entries that go above 4GB */
+ if (base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
+ || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)))
+ gentry.base = gentry.size = gentry.type = 0;
+ else {
+ gentry.base = base << PAGE_SHIFT;
+ gentry.size = size << PAGE_SHIFT;
+ gentry.type = type;
+ }
+
+ break;
+ case MTRRIOC_ADD_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_ADD_PAGE_ENTRY:
+#endif
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err =
+ mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
+ file, 1);
+ break;
+ case MTRRIOC_SET_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_SET_PAGE_ENTRY:
+#endif
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err =
+ mtrr_add_page(sentry.base, sentry.size, sentry.type, false);
+ break;
+ case MTRRIOC_DEL_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_DEL_PAGE_ENTRY:
+#endif
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err = mtrr_file_del(sentry.base, sentry.size, file, 1);
+ break;
+ case MTRRIOC_KILL_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_KILL_PAGE_ENTRY:
+#endif
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ err = mtrr_del_page(-1, sentry.base, sentry.size);
+ break;
+ case MTRRIOC_GET_PAGE_ENTRY:
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_GET_PAGE_ENTRY:
+#endif
+ if (gentry.regnum >= num_var_ranges)
+ return -EINVAL;
+ mtrr_if->get(gentry.regnum, &base, &size, &type);
+ /* Hide entries that would overflow */
+ if (size != (__typeof__(gentry.size))size)
+ gentry.base = gentry.size = gentry.type = 0;
+ else {
+ gentry.base = base;
+ gentry.size = size;
+ gentry.type = type;
+ }
+ break;
+ }
+
+ if (err)
+ return err;
+
+ switch (cmd) {
+ case MTRRIOC_GET_ENTRY:
+ case MTRRIOC_GET_PAGE_ENTRY:
+ if (copy_to_user(arg, &gentry, sizeof gentry))
+ err = -EFAULT;
+ break;
+#ifdef CONFIG_COMPAT
+ case MTRRIOC32_GET_ENTRY:
+ case MTRRIOC32_GET_PAGE_ENTRY: {
+ struct mtrr_gentry32 __user *g32;
+
+ g32 = (struct mtrr_gentry32 __user *)__arg;
+ err = put_user(gentry.base, &g32->base);
+ err |= put_user(gentry.size, &g32->size);
+ err |= put_user(gentry.regnum, &g32->regnum);
+ err |= put_user(gentry.type, &g32->type);
+ break;
+ }
+#endif
+ }
+ return err;
+}
+
+static int mtrr_close(struct inode *ino, struct file *file)
+{
+ unsigned int *fcount = FILE_FCOUNT(file);
+ int i, max;
+
+ if (fcount != NULL) {
+ max = num_var_ranges;
+ for (i = 0; i < max; ++i) {
+ while (fcount[i] > 0) {
+ mtrr_del(i, 0, 0);
+ --fcount[i];
+ }
+ }
+ kfree(fcount);
+ FILE_FCOUNT(file) = NULL;
+ }
+ return single_release(ino, file);
+}
+
+static int mtrr_seq_show(struct seq_file *seq, void *offset);
+
+static int mtrr_open(struct inode *inode, struct file *file)
+{
+ if (!mtrr_if)
+ return -EIO;
+ if (!mtrr_if->get)
+ return -ENXIO;
+ return single_open(file, mtrr_seq_show, NULL);
+}
+
+static const struct file_operations mtrr_fops = {
+ .owner = THIS_MODULE,
+ .open = mtrr_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = mtrr_write,
+ .unlocked_ioctl = mtrr_ioctl,
+ .compat_ioctl = mtrr_ioctl,
+ .release = mtrr_close,
+};
+
+static int mtrr_seq_show(struct seq_file *seq, void *offset)
+{
+ char factor;
+ int i, max;
+ mtrr_type type;
+ unsigned long base, size;
+
+ max = num_var_ranges;
+ for (i = 0; i < max; i++) {
+ mtrr_if->get(i, &base, &size, &type);
+ if (size == 0) {
+ mtrr_usage_table[i] = 0;
+ continue;
+ }
+ if (size < (0x100000 >> PAGE_SHIFT)) {
+ /* less than 1MB */
+ factor = 'K';
+ size <<= PAGE_SHIFT - 10;
+ } else {
+ factor = 'M';
+ size >>= 20 - PAGE_SHIFT;
+ }
+ /* Base can be > 32bit */
+ seq_printf(seq, "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
+ i, base, base >> (20 - PAGE_SHIFT),
+ size, factor,
+ mtrr_usage_table[i], mtrr_attrib_to_str(type));
+ }
+ return 0;
+}
+
+static int __init mtrr_if_init(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
+ (!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
+ (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
+ (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
+ return -ENODEV;
+
+ proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
+ return 0;
+}
+arch_initcall(mtrr_if_init);
+#endif /* CONFIG_PROC_FS */
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
new file mode 100644
index 000000000..9a19c800f
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -0,0 +1,890 @@
+/* Generic MTRR (Memory Type Range Register) driver.
+
+ Copyright (C) 1997-2000 Richard Gooch
+ Copyright (c) 2002 Patrick Mochel
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+
+ You should have received a copy of the GNU Library General Public
+ License along with this library; if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ Richard Gooch may be reached by email at rgooch@atnf.csiro.au
+ The postal address is:
+ Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
+
+ Source: "Pentium Pro Family Developer's Manual, Volume 3:
+ Operating System Writer's Guide" (Intel document number 242692),
+ section 11.11.7
+
+ This was cleaned and made readable by Patrick Mochel <mochel@osdl.org>
+ on 6-7 March 2002.
+ Source: Intel Architecture Software Developers Manual, Volume 3:
+ System Programming Guide; Section 9.11. (1997 edition - PPro).
+*/
+
+#define DEBUG
+
+#include <linux/types.h> /* FIXME: kvm_para.h needs this */
+
+#include <linux/stop_machine.h>
+#include <linux/kvm_para.h>
+#include <linux/uaccess.h>
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/init.h>
+#include <linux/sort.h>
+#include <linux/cpu.h>
+#include <linux/pci.h>
+#include <linux/smp.h>
+#include <linux/syscore_ops.h>
+#include <linux/rcupdate.h>
+
+#include <asm/cpufeature.h>
+#include <asm/e820/api.h>
+#include <asm/mtrr.h>
+#include <asm/msr.h>
+#include <asm/pat.h>
+
+#include "mtrr.h"
+
+/* arch_phys_wc_add returns an MTRR register index plus this offset. */
+#define MTRR_TO_PHYS_WC_OFFSET 1000
+
+u32 num_var_ranges;
+static bool __mtrr_enabled;
+
+static bool mtrr_enabled(void)
+{
+ return __mtrr_enabled;
+}
+
+unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
+static DEFINE_MUTEX(mtrr_mutex);
+
+u64 size_or_mask, size_and_mask;
+static bool mtrr_aps_delayed_init;
+
+static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init;
+
+const struct mtrr_ops *mtrr_if;
+
+static void set_mtrr(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type);
+
+void __init set_mtrr_ops(const struct mtrr_ops *ops)
+{
+ if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
+ mtrr_ops[ops->vendor] = ops;
+}
+
+/* Returns non-zero if we have the write-combining memory type */
+static int have_wrcomb(void)
+{
+ struct pci_dev *dev;
+
+ dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL);
+ if (dev != NULL) {
+ /*
+ * ServerWorks LE chipsets < rev 6 have problems with
+ * write-combining. Don't allow it and leave room for other
+ * chipsets to be tagged
+ */
+ if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
+ dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
+ dev->revision <= 5) {
+ pr_info("Serverworks LE rev < 6 detected. Write-combining disabled.\n");
+ pci_dev_put(dev);
+ return 0;
+ }
+ /*
+ * Intel 450NX errata # 23. Non ascending cacheline evictions to
+ * write combining memory may resulting in data corruption
+ */
+ if (dev->vendor == PCI_VENDOR_ID_INTEL &&
+ dev->device == PCI_DEVICE_ID_INTEL_82451NX) {
+ pr_info("Intel 450NX MMC detected. Write-combining disabled.\n");
+ pci_dev_put(dev);
+ return 0;
+ }
+ pci_dev_put(dev);
+ }
+ return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0;
+}
+
+/* This function returns the number of variable MTRRs */
+static void __init set_num_var_ranges(void)
+{
+ unsigned long config = 0, dummy;
+
+ if (use_intel())
+ rdmsr(MSR_MTRRcap, config, dummy);
+ else if (is_cpu(AMD))
+ config = 2;
+ else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
+ config = 8;
+
+ num_var_ranges = config & 0xff;
+}
+
+static void __init init_table(void)
+{
+ int i, max;
+
+ max = num_var_ranges;
+ for (i = 0; i < max; i++)
+ mtrr_usage_table[i] = 1;
+}
+
+struct set_mtrr_data {
+ unsigned long smp_base;
+ unsigned long smp_size;
+ unsigned int smp_reg;
+ mtrr_type smp_type;
+};
+
+/**
+ * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed
+ * by all the CPUs.
+ * @info: pointer to mtrr configuration data
+ *
+ * Returns nothing.
+ */
+static int mtrr_rendezvous_handler(void *info)
+{
+ struct set_mtrr_data *data = info;
+
+ /*
+ * We use this same function to initialize the mtrrs during boot,
+ * resume, runtime cpu online and on an explicit request to set a
+ * specific MTRR.
+ *
+ * During boot or suspend, the state of the boot cpu's mtrrs has been
+ * saved, and we want to replicate that across all the cpus that come
+ * online (either at the end of boot or resume or during a runtime cpu
+ * online). If we're doing that, @reg is set to something special and on
+ * all the cpu's we do mtrr_if->set_all() (On the logical cpu that
+ * started the boot/resume sequence, this might be a duplicate
+ * set_all()).
+ */
+ if (data->smp_reg != ~0U) {
+ mtrr_if->set(data->smp_reg, data->smp_base,
+ data->smp_size, data->smp_type);
+ } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) {
+ mtrr_if->set_all();
+ }
+ return 0;
+}
+
+static inline int types_compatible(mtrr_type type1, mtrr_type type2)
+{
+ return type1 == MTRR_TYPE_UNCACHABLE ||
+ type2 == MTRR_TYPE_UNCACHABLE ||
+ (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) ||
+ (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH);
+}
+
+/**
+ * set_mtrr - update mtrrs on all processors
+ * @reg: mtrr in question
+ * @base: mtrr base
+ * @size: mtrr size
+ * @type: mtrr type
+ *
+ * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
+ *
+ * 1. Queue work to do the following on all processors:
+ * 2. Disable Interrupts
+ * 3. Wait for all procs to do so
+ * 4. Enter no-fill cache mode
+ * 5. Flush caches
+ * 6. Clear PGE bit
+ * 7. Flush all TLBs
+ * 8. Disable all range registers
+ * 9. Update the MTRRs
+ * 10. Enable all range registers
+ * 11. Flush all TLBs and caches again
+ * 12. Enter normal cache mode and reenable caching
+ * 13. Set PGE
+ * 14. Wait for buddies to catch up
+ * 15. Enable interrupts.
+ *
+ * What does that mean for us? Well, stop_machine() will ensure that
+ * the rendezvous handler is started on each CPU. And in lockstep they
+ * do the state transition of disabling interrupts, updating MTRR's
+ * (the CPU vendors may each do it differently, so we call mtrr_if->set()
+ * callback and let them take care of it.) and enabling interrupts.
+ *
+ * Note that the mechanism is the same for UP systems, too; all the SMP stuff
+ * becomes nops.
+ */
+static void
+set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
+{
+ struct set_mtrr_data data = { .smp_reg = reg,
+ .smp_base = base,
+ .smp_size = size,
+ .smp_type = type
+ };
+
+ stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
+}
+
+static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type)
+{
+ struct set_mtrr_data data = { .smp_reg = reg,
+ .smp_base = base,
+ .smp_size = size,
+ .smp_type = type
+ };
+
+ stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask);
+}
+
+static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type)
+{
+ struct set_mtrr_data data = { .smp_reg = reg,
+ .smp_base = base,
+ .smp_size = size,
+ .smp_type = type
+ };
+
+ stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data,
+ cpu_callout_mask);
+}
+
+/**
+ * mtrr_add_page - Add a memory type region
+ * @base: Physical base address of region in pages (in units of 4 kB!)
+ * @size: Physical size of region in pages (4 kB)
+ * @type: Type of MTRR desired
+ * @increment: If this is true do usage counting on the region
+ *
+ * Memory type region registers control the caching on newer Intel and
+ * non Intel processors. This function allows drivers to request an
+ * MTRR is added. The details and hardware specifics of each processor's
+ * implementation are hidden from the caller, but nevertheless the
+ * caller should expect to need to provide a power of two size on an
+ * equivalent power of two boundary.
+ *
+ * If the region cannot be added either because all regions are in use
+ * or the CPU cannot support it a negative value is returned. On success
+ * the register number for this entry is returned, but should be treated
+ * as a cookie only.
+ *
+ * On a multiprocessor machine the changes are made to all processors.
+ * This is required on x86 by the Intel processors.
+ *
+ * The available types are
+ *
+ * %MTRR_TYPE_UNCACHABLE - No caching
+ *
+ * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
+ *
+ * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
+ *
+ * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
+ *
+ * BUGS: Needs a quiet flag for the cases where drivers do not mind
+ * failures and do not wish system log messages to be sent.
+ */
+int mtrr_add_page(unsigned long base, unsigned long size,
+ unsigned int type, bool increment)
+{
+ unsigned long lbase, lsize;
+ int i, replace, error;
+ mtrr_type ltype;
+
+ if (!mtrr_enabled())
+ return -ENXIO;
+
+ error = mtrr_if->validate_add_page(base, size, type);
+ if (error)
+ return error;
+
+ if (type >= MTRR_NUM_TYPES) {
+ pr_warn("type: %u invalid\n", type);
+ return -EINVAL;
+ }
+
+ /* If the type is WC, check that this processor supports it */
+ if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
+ pr_warn("your processor doesn't support write-combining\n");
+ return -ENOSYS;
+ }
+
+ if (!size) {
+ pr_warn("zero sized request\n");
+ return -EINVAL;
+ }
+
+ if ((base | (base + size - 1)) >>
+ (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) {
+ pr_warn("base or size exceeds the MTRR width\n");
+ return -EINVAL;
+ }
+
+ error = -EINVAL;
+ replace = -1;
+
+ /* No CPU hotplug when we change MTRR entries */
+ get_online_cpus();
+
+ /* Search for existing MTRR */
+ mutex_lock(&mtrr_mutex);
+ for (i = 0; i < num_var_ranges; ++i) {
+ mtrr_if->get(i, &lbase, &lsize, &ltype);
+ if (!lsize || base > lbase + lsize - 1 ||
+ base + size - 1 < lbase)
+ continue;
+ /*
+ * At this point we know there is some kind of
+ * overlap/enclosure
+ */
+ if (base < lbase || base + size - 1 > lbase + lsize - 1) {
+ if (base <= lbase &&
+ base + size - 1 >= lbase + lsize - 1) {
+ /* New region encloses an existing region */
+ if (type == ltype) {
+ replace = replace == -1 ? i : -2;
+ continue;
+ } else if (types_compatible(type, ltype))
+ continue;
+ }
+ pr_warn("0x%lx000,0x%lx000 overlaps existing 0x%lx000,0x%lx000\n", base, size, lbase,
+ lsize);
+ goto out;
+ }
+ /* New region is enclosed by an existing region */
+ if (ltype != type) {
+ if (types_compatible(type, ltype))
+ continue;
+ pr_warn("type mismatch for %lx000,%lx000 old: %s new: %s\n",
+ base, size, mtrr_attrib_to_str(ltype),
+ mtrr_attrib_to_str(type));
+ goto out;
+ }
+ if (increment)
+ ++mtrr_usage_table[i];
+ error = i;
+ goto out;
+ }
+ /* Search for an empty MTRR */
+ i = mtrr_if->get_free_region(base, size, replace);
+ if (i >= 0) {
+ set_mtrr_cpuslocked(i, base, size, type);
+ if (likely(replace < 0)) {
+ mtrr_usage_table[i] = 1;
+ } else {
+ mtrr_usage_table[i] = mtrr_usage_table[replace];
+ if (increment)
+ mtrr_usage_table[i]++;
+ if (unlikely(replace != i)) {
+ set_mtrr_cpuslocked(replace, 0, 0, 0);
+ mtrr_usage_table[replace] = 0;
+ }
+ }
+ } else {
+ pr_info("no more MTRRs available\n");
+ }
+ error = i;
+ out:
+ mutex_unlock(&mtrr_mutex);
+ put_online_cpus();
+ return error;
+}
+
+static int mtrr_check(unsigned long base, unsigned long size)
+{
+ if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
+ pr_warn("size and base must be multiples of 4 kiB\n");
+ pr_debug("size: 0x%lx base: 0x%lx\n", size, base);
+ dump_stack();
+ return -1;
+ }
+ return 0;
+}
+
+/**
+ * mtrr_add - Add a memory type region
+ * @base: Physical base address of region
+ * @size: Physical size of region
+ * @type: Type of MTRR desired
+ * @increment: If this is true do usage counting on the region
+ *
+ * Memory type region registers control the caching on newer Intel and
+ * non Intel processors. This function allows drivers to request an
+ * MTRR is added. The details and hardware specifics of each processor's
+ * implementation are hidden from the caller, but nevertheless the
+ * caller should expect to need to provide a power of two size on an
+ * equivalent power of two boundary.
+ *
+ * If the region cannot be added either because all regions are in use
+ * or the CPU cannot support it a negative value is returned. On success
+ * the register number for this entry is returned, but should be treated
+ * as a cookie only.
+ *
+ * On a multiprocessor machine the changes are made to all processors.
+ * This is required on x86 by the Intel processors.
+ *
+ * The available types are
+ *
+ * %MTRR_TYPE_UNCACHABLE - No caching
+ *
+ * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
+ *
+ * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
+ *
+ * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
+ *
+ * BUGS: Needs a quiet flag for the cases where drivers do not mind
+ * failures and do not wish system log messages to be sent.
+ */
+int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
+ bool increment)
+{
+ if (!mtrr_enabled())
+ return -ENODEV;
+ if (mtrr_check(base, size))
+ return -EINVAL;
+ return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
+ increment);
+}
+
+/**
+ * mtrr_del_page - delete a memory type region
+ * @reg: Register returned by mtrr_add
+ * @base: Physical base address
+ * @size: Size of region
+ *
+ * If register is supplied then base and size are ignored. This is
+ * how drivers should call it.
+ *
+ * Releases an MTRR region. If the usage count drops to zero the
+ * register is freed and the region returns to default state.
+ * On success the register is returned, on failure a negative error
+ * code.
+ */
+int mtrr_del_page(int reg, unsigned long base, unsigned long size)
+{
+ int i, max;
+ mtrr_type ltype;
+ unsigned long lbase, lsize;
+ int error = -EINVAL;
+
+ if (!mtrr_enabled())
+ return -ENODEV;
+
+ max = num_var_ranges;
+ /* No CPU hotplug when we change MTRR entries */
+ get_online_cpus();
+ mutex_lock(&mtrr_mutex);
+ if (reg < 0) {
+ /* Search for existing MTRR */
+ for (i = 0; i < max; ++i) {
+ mtrr_if->get(i, &lbase, &lsize, &ltype);
+ if (lbase == base && lsize == size) {
+ reg = i;
+ break;
+ }
+ }
+ if (reg < 0) {
+ pr_debug("no MTRR for %lx000,%lx000 found\n",
+ base, size);
+ goto out;
+ }
+ }
+ if (reg >= max) {
+ pr_warn("register: %d too big\n", reg);
+ goto out;
+ }
+ mtrr_if->get(reg, &lbase, &lsize, &ltype);
+ if (lsize < 1) {
+ pr_warn("MTRR %d not used\n", reg);
+ goto out;
+ }
+ if (mtrr_usage_table[reg] < 1) {
+ pr_warn("reg: %d has count=0\n", reg);
+ goto out;
+ }
+ if (--mtrr_usage_table[reg] < 1)
+ set_mtrr_cpuslocked(reg, 0, 0, 0);
+ error = reg;
+ out:
+ mutex_unlock(&mtrr_mutex);
+ put_online_cpus();
+ return error;
+}
+
+/**
+ * mtrr_del - delete a memory type region
+ * @reg: Register returned by mtrr_add
+ * @base: Physical base address
+ * @size: Size of region
+ *
+ * If register is supplied then base and size are ignored. This is
+ * how drivers should call it.
+ *
+ * Releases an MTRR region. If the usage count drops to zero the
+ * register is freed and the region returns to default state.
+ * On success the register is returned, on failure a negative error
+ * code.
+ */
+int mtrr_del(int reg, unsigned long base, unsigned long size)
+{
+ if (!mtrr_enabled())
+ return -ENODEV;
+ if (mtrr_check(base, size))
+ return -EINVAL;
+ return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
+}
+
+/**
+ * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable
+ * @base: Physical base address
+ * @size: Size of region
+ *
+ * If PAT is available, this does nothing. If PAT is unavailable, it
+ * attempts to add a WC MTRR covering size bytes starting at base and
+ * logs an error if this fails.
+ *
+ * The called should provide a power of two size on an equivalent
+ * power of two boundary.
+ *
+ * Drivers must store the return value to pass to mtrr_del_wc_if_needed,
+ * but drivers should not try to interpret that return value.
+ */
+int arch_phys_wc_add(unsigned long base, unsigned long size)
+{
+ int ret;
+
+ if (pat_enabled() || !mtrr_enabled())
+ return 0; /* Success! (We don't need to do anything.) */
+
+ ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true);
+ if (ret < 0) {
+ pr_warn("Failed to add WC MTRR for [%p-%p]; performance may suffer.",
+ (void *)base, (void *)(base + size - 1));
+ return ret;
+ }
+ return ret + MTRR_TO_PHYS_WC_OFFSET;
+}
+EXPORT_SYMBOL(arch_phys_wc_add);
+
+/*
+ * arch_phys_wc_del - undoes arch_phys_wc_add
+ * @handle: Return value from arch_phys_wc_add
+ *
+ * This cleans up after mtrr_add_wc_if_needed.
+ *
+ * The API guarantees that mtrr_del_wc_if_needed(error code) and
+ * mtrr_del_wc_if_needed(0) do nothing.
+ */
+void arch_phys_wc_del(int handle)
+{
+ if (handle >= 1) {
+ WARN_ON(handle < MTRR_TO_PHYS_WC_OFFSET);
+ mtrr_del(handle - MTRR_TO_PHYS_WC_OFFSET, 0, 0);
+ }
+}
+EXPORT_SYMBOL(arch_phys_wc_del);
+
+/*
+ * arch_phys_wc_index - translates arch_phys_wc_add's return value
+ * @handle: Return value from arch_phys_wc_add
+ *
+ * This will turn the return value from arch_phys_wc_add into an mtrr
+ * index suitable for debugging.
+ *
+ * Note: There is no legitimate use for this function, except possibly
+ * in printk line. Alas there is an illegitimate use in some ancient
+ * drm ioctls.
+ */
+int arch_phys_wc_index(int handle)
+{
+ if (handle < MTRR_TO_PHYS_WC_OFFSET)
+ return -1;
+ else
+ return handle - MTRR_TO_PHYS_WC_OFFSET;
+}
+EXPORT_SYMBOL_GPL(arch_phys_wc_index);
+
+/*
+ * HACK ALERT!
+ * These should be called implicitly, but we can't yet until all the initcall
+ * stuff is done...
+ */
+static void __init init_ifs(void)
+{
+#ifndef CONFIG_X86_64
+ amd_init_mtrr();
+ cyrix_init_mtrr();
+ centaur_init_mtrr();
+#endif
+}
+
+/* The suspend/resume methods are only for CPU without MTRR. CPU using generic
+ * MTRR driver doesn't require this
+ */
+struct mtrr_value {
+ mtrr_type ltype;
+ unsigned long lbase;
+ unsigned long lsize;
+};
+
+static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
+
+static int mtrr_save(void)
+{
+ int i;
+
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &mtrr_value[i].lbase,
+ &mtrr_value[i].lsize,
+ &mtrr_value[i].ltype);
+ }
+ return 0;
+}
+
+static void mtrr_restore(void)
+{
+ int i;
+
+ for (i = 0; i < num_var_ranges; i++) {
+ if (mtrr_value[i].lsize) {
+ set_mtrr(i, mtrr_value[i].lbase,
+ mtrr_value[i].lsize,
+ mtrr_value[i].ltype);
+ }
+ }
+}
+
+
+
+static struct syscore_ops mtrr_syscore_ops = {
+ .suspend = mtrr_save,
+ .resume = mtrr_restore,
+};
+
+int __initdata changed_by_mtrr_cleanup;
+
+#define SIZE_OR_MASK_BITS(n) (~((1ULL << ((n) - PAGE_SHIFT)) - 1))
+/**
+ * mtrr_bp_init - initialize mtrrs on the boot CPU
+ *
+ * This needs to be called early; before any of the other CPUs are
+ * initialized (i.e. before smp_init()).
+ *
+ */
+void __init mtrr_bp_init(void)
+{
+ u32 phys_addr;
+
+ init_ifs();
+
+ phys_addr = 32;
+
+ if (boot_cpu_has(X86_FEATURE_MTRR)) {
+ mtrr_if = &generic_mtrr_ops;
+ size_or_mask = SIZE_OR_MASK_BITS(36);
+ size_and_mask = 0x00f00000;
+ phys_addr = 36;
+
+ /*
+ * This is an AMD specific MSR, but we assume(hope?) that
+ * Intel will implement it too when they extend the address
+ * bus of the Xeon.
+ */
+ if (cpuid_eax(0x80000000) >= 0x80000008) {
+ phys_addr = cpuid_eax(0x80000008) & 0xff;
+ /* CPUID workaround for Intel 0F33/0F34 CPU */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+ boot_cpu_data.x86 == 0xF &&
+ boot_cpu_data.x86_model == 0x3 &&
+ (boot_cpu_data.x86_stepping == 0x3 ||
+ boot_cpu_data.x86_stepping == 0x4))
+ phys_addr = 36;
+
+ size_or_mask = SIZE_OR_MASK_BITS(phys_addr);
+ size_and_mask = ~size_or_mask & 0xfffff00000ULL;
+ } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
+ boot_cpu_data.x86 == 6) {
+ /*
+ * VIA C* family have Intel style MTRRs,
+ * but don't support PAE
+ */
+ size_or_mask = SIZE_OR_MASK_BITS(32);
+ size_and_mask = 0;
+ phys_addr = 32;
+ }
+ } else {
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (cpu_feature_enabled(X86_FEATURE_K6_MTRR)) {
+ /* Pre-Athlon (K6) AMD CPU MTRRs */
+ mtrr_if = mtrr_ops[X86_VENDOR_AMD];
+ size_or_mask = SIZE_OR_MASK_BITS(32);
+ size_and_mask = 0;
+ }
+ break;
+ case X86_VENDOR_CENTAUR:
+ if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR)) {
+ mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR];
+ size_or_mask = SIZE_OR_MASK_BITS(32);
+ size_and_mask = 0;
+ }
+ break;
+ case X86_VENDOR_CYRIX:
+ if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR)) {
+ mtrr_if = mtrr_ops[X86_VENDOR_CYRIX];
+ size_or_mask = SIZE_OR_MASK_BITS(32);
+ size_and_mask = 0;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (mtrr_if) {
+ __mtrr_enabled = true;
+ set_num_var_ranges();
+ init_table();
+ if (use_intel()) {
+ /* BIOS may override */
+ __mtrr_enabled = get_mtrr_state();
+
+ if (mtrr_enabled())
+ mtrr_bp_pat_init();
+
+ if (mtrr_cleanup(phys_addr)) {
+ changed_by_mtrr_cleanup = 1;
+ mtrr_if->set_all();
+ }
+ }
+ }
+
+ if (!mtrr_enabled()) {
+ pr_info("Disabled\n");
+
+ /*
+ * PAT initialization relies on MTRR's rendezvous handler.
+ * Skip PAT init until the handler can initialize both
+ * features independently.
+ */
+ pat_disable("MTRRs disabled, skipping PAT initialization too.");
+ }
+}
+
+void mtrr_ap_init(void)
+{
+ if (!mtrr_enabled())
+ return;
+
+ if (!use_intel() || mtrr_aps_delayed_init)
+ return;
+
+ rcu_cpu_starting(smp_processor_id());
+
+ /*
+ * Ideally we should hold mtrr_mutex here to avoid mtrr entries
+ * changed, but this routine will be called in cpu boot time,
+ * holding the lock breaks it.
+ *
+ * This routine is called in two cases:
+ *
+ * 1. very earily time of software resume, when there absolutely
+ * isn't mtrr entry changes;
+ *
+ * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
+ * lock to prevent mtrr entry changes
+ */
+ set_mtrr_from_inactive_cpu(~0U, 0, 0, 0);
+}
+
+/**
+ * Save current fixed-range MTRR state of the first cpu in cpu_online_mask.
+ */
+void mtrr_save_state(void)
+{
+ int first_cpu;
+
+ if (!mtrr_enabled())
+ return;
+
+ first_cpu = cpumask_first(cpu_online_mask);
+ smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
+}
+
+void set_mtrr_aps_delayed_init(void)
+{
+ if (!mtrr_enabled())
+ return;
+ if (!use_intel())
+ return;
+
+ mtrr_aps_delayed_init = true;
+}
+
+/*
+ * Delayed MTRR initialization for all AP's
+ */
+void mtrr_aps_init(void)
+{
+ if (!use_intel() || !mtrr_enabled())
+ return;
+
+ /*
+ * Check if someone has requested the delay of AP MTRR initialization,
+ * by doing set_mtrr_aps_delayed_init(), prior to this point. If not,
+ * then we are done.
+ */
+ if (!mtrr_aps_delayed_init)
+ return;
+
+ set_mtrr(~0U, 0, 0, 0);
+ mtrr_aps_delayed_init = false;
+}
+
+void mtrr_bp_restore(void)
+{
+ if (!use_intel() || !mtrr_enabled())
+ return;
+
+ mtrr_if->set_all();
+}
+
+static int __init mtrr_init_finialize(void)
+{
+ if (!mtrr_enabled())
+ return 0;
+
+ if (use_intel()) {
+ if (!changed_by_mtrr_cleanup)
+ mtrr_state_warn();
+ return 0;
+ }
+
+ /*
+ * The CPU has no MTRR and seems to not support SMP. They have
+ * specific drivers, we use a tricky method to support
+ * suspend/resume for them.
+ *
+ * TBD: is there any system with such CPU which supports
+ * suspend/resume? If no, we should remove the code.
+ */
+ register_syscore_ops(&mtrr_syscore_ops);
+
+ return 0;
+}
+subsys_initcall(mtrr_init_finialize);
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
new file mode 100644
index 000000000..2ac99e561
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * local MTRR defines.
+ */
+
+#include <linux/types.h>
+#include <linux/stddef.h>
+
+#define MTRR_CHANGE_MASK_FIXED 0x01
+#define MTRR_CHANGE_MASK_VARIABLE 0x02
+#define MTRR_CHANGE_MASK_DEFTYPE 0x04
+
+extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
+
+struct mtrr_ops {
+ u32 vendor;
+ u32 use_intel_if;
+ void (*set)(unsigned int reg, unsigned long base,
+ unsigned long size, mtrr_type type);
+ void (*set_all)(void);
+
+ void (*get)(unsigned int reg, unsigned long *base,
+ unsigned long *size, mtrr_type *type);
+ int (*get_free_region)(unsigned long base, unsigned long size,
+ int replace_reg);
+ int (*validate_add_page)(unsigned long base, unsigned long size,
+ unsigned int type);
+ int (*have_wrcomb)(void);
+};
+
+extern int generic_get_free_region(unsigned long base, unsigned long size,
+ int replace_reg);
+extern int generic_validate_add_page(unsigned long base, unsigned long size,
+ unsigned int type);
+
+extern const struct mtrr_ops generic_mtrr_ops;
+
+extern int positive_have_wrcomb(void);
+
+/* library functions for processor-specific routines */
+struct set_mtrr_context {
+ unsigned long flags;
+ unsigned long cr4val;
+ u32 deftype_lo;
+ u32 deftype_hi;
+ u32 ccr3;
+};
+
+void set_mtrr_done(struct set_mtrr_context *ctxt);
+void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
+void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
+
+void fill_mtrr_var_range(unsigned int index,
+ u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
+bool get_mtrr_state(void);
+void mtrr_bp_pat_init(void);
+
+extern void __init set_mtrr_ops(const struct mtrr_ops *ops);
+
+extern u64 size_or_mask, size_and_mask;
+extern const struct mtrr_ops *mtrr_if;
+
+#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
+#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
+
+extern unsigned int num_var_ranges;
+extern u64 mtrr_tom2;
+extern struct mtrr_state_type mtrr_state;
+
+void mtrr_state_warn(void);
+const char *mtrr_attrib_to_str(int x);
+void mtrr_wrmsr(unsigned, unsigned, unsigned);
+
+/* CPU specific mtrr init functions */
+int amd_init_mtrr(void);
+int cyrix_init_mtrr(void);
+int centaur_init_mtrr(void);
+
+extern int changed_by_mtrr_cleanup;
+extern int mtrr_cleanup(unsigned address_bits);
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
new file mode 100644
index 000000000..d38908333
--- /dev/null
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * local apic based NMI watchdog for various CPUs.
+ *
+ * This file also handles reservation of performance counters for coordination
+ * with other users (like oprofile).
+ *
+ * Note that these events normally don't tick when the CPU idles. This means
+ * the frequency varies with CPU load.
+ *
+ * Original code for K7/P6 written by Keith Owens
+ *
+ */
+
+#include <linux/percpu.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/smp.h>
+#include <asm/nmi.h>
+#include <linux/kprobes.h>
+
+#include <asm/apic.h>
+#include <asm/perf_event.h>
+
+/*
+ * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
+ * offset from MSR_P4_BSU_ESCR0.
+ *
+ * It will be the max for all platforms (for now)
+ */
+#define NMI_MAX_COUNTER_BITS 66
+
+/*
+ * perfctr_nmi_owner tracks the ownership of the perfctr registers:
+ * evtsel_nmi_owner tracks the ownership of the event selection
+ * - different performance counters/ event selection may be reserved for
+ * different subsystems this reservation system just tries to coordinate
+ * things a little
+ */
+static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
+static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
+
+/* converts an msr to an appropriate reservation bit */
+static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
+{
+ /* returns the bit offset of the performance counter register */
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (msr >= MSR_F15H_PERF_CTR)
+ return (msr - MSR_F15H_PERF_CTR) >> 1;
+ return msr - MSR_K7_PERFCTR0;
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+ return msr - MSR_ARCH_PERFMON_PERFCTR0;
+
+ switch (boot_cpu_data.x86) {
+ case 6:
+ return msr - MSR_P6_PERFCTR0;
+ case 11:
+ return msr - MSR_KNC_PERFCTR0;
+ case 15:
+ return msr - MSR_P4_BPU_PERFCTR0;
+ }
+ }
+ return 0;
+}
+
+/*
+ * converts an msr to an appropriate reservation bit
+ * returns the bit offset of the event selection register
+ */
+static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
+{
+ /* returns the bit offset of the event selection register */
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (msr >= MSR_F15H_PERF_CTL)
+ return (msr - MSR_F15H_PERF_CTL) >> 1;
+ return msr - MSR_K7_EVNTSEL0;
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+ return msr - MSR_ARCH_PERFMON_EVENTSEL0;
+
+ switch (boot_cpu_data.x86) {
+ case 6:
+ return msr - MSR_P6_EVNTSEL0;
+ case 11:
+ return msr - MSR_KNC_EVNTSEL0;
+ case 15:
+ return msr - MSR_P4_BSU_ESCR0;
+ }
+ }
+ return 0;
+
+}
+
+/* checks for a bit availability (hack for oprofile) */
+int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
+{
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ return !test_bit(counter, perfctr_nmi_owner);
+}
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
+
+int reserve_perfctr_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_perfctr_msr_to_bit(msr);
+ /* register not managed by the allocator? */
+ if (counter > NMI_MAX_COUNTER_BITS)
+ return 1;
+
+ if (!test_and_set_bit(counter, perfctr_nmi_owner))
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL(reserve_perfctr_nmi);
+
+void release_perfctr_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_perfctr_msr_to_bit(msr);
+ /* register not managed by the allocator? */
+ if (counter > NMI_MAX_COUNTER_BITS)
+ return;
+
+ clear_bit(counter, perfctr_nmi_owner);
+}
+EXPORT_SYMBOL(release_perfctr_nmi);
+
+int reserve_evntsel_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_evntsel_msr_to_bit(msr);
+ /* register not managed by the allocator? */
+ if (counter > NMI_MAX_COUNTER_BITS)
+ return 1;
+
+ if (!test_and_set_bit(counter, evntsel_nmi_owner))
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL(reserve_evntsel_nmi);
+
+void release_evntsel_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_evntsel_msr_to_bit(msr);
+ /* register not managed by the allocator? */
+ if (counter > NMI_MAX_COUNTER_BITS)
+ return;
+
+ clear_bit(counter, evntsel_nmi_owner);
+}
+EXPORT_SYMBOL(release_evntsel_nmi);
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
new file mode 100644
index 000000000..fd6ec2aa0
--- /dev/null
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Strings for the various x86 power flags
+ *
+ * This file must not contain any executable code.
+ */
+
+#include <asm/cpufeature.h>
+
+const char *const x86_power_flags[32] = {
+ "ts", /* temperature sensor */
+ "fid", /* frequency id control */
+ "vid", /* voltage id control */
+ "ttp", /* thermal trip */
+ "tm", /* hardware thermal control */
+ "stc", /* software thermal control */
+ "100mhzsteps", /* 100 MHz multiplier control */
+ "hwpstate", /* hardware P-state control */
+ "", /* tsc invariant mapped to constant_tsc */
+ "cpb", /* core performance boost */
+ "eff_freq_ro", /* Readonly aperf/mperf */
+ "proc_feedback", /* processor feedback interface */
+ "acc_power", /* accumulated power mechanism */
+};
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
new file mode 100644
index 000000000..2c8522a39
--- /dev/null
+++ b/arch/x86/kernel/cpu/proc.c
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/smp.h>
+#include <linux/timex.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/cpufreq.h>
+
+#include "cpu.h"
+
+/*
+ * Get CPU information for use by the procfs.
+ */
+static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
+ unsigned int cpu)
+{
+#ifdef CONFIG_SMP
+ seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
+ seq_printf(m, "siblings\t: %d\n",
+ cpumask_weight(topology_core_cpumask(cpu)));
+ seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
+ seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
+ seq_printf(m, "apicid\t\t: %d\n", c->apicid);
+ seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
+#endif
+}
+
+#ifdef CONFIG_X86_32
+static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
+{
+ seq_printf(m,
+ "fdiv_bug\t: %s\n"
+ "f00f_bug\t: %s\n"
+ "coma_bug\t: %s\n"
+ "fpu\t\t: %s\n"
+ "fpu_exception\t: %s\n"
+ "cpuid level\t: %d\n"
+ "wp\t\t: yes\n",
+ static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no",
+ static_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no",
+ static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no",
+ static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
+ static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
+ c->cpuid_level);
+}
+#else
+static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
+{
+ seq_printf(m,
+ "fpu\t\t: yes\n"
+ "fpu_exception\t: yes\n"
+ "cpuid level\t: %d\n"
+ "wp\t\t: yes\n",
+ c->cpuid_level);
+}
+#endif
+
+static int show_cpuinfo(struct seq_file *m, void *v)
+{
+ struct cpuinfo_x86 *c = v;
+ unsigned int cpu;
+ int i;
+
+ cpu = c->cpu_index;
+ seq_printf(m, "processor\t: %u\n"
+ "vendor_id\t: %s\n"
+ "cpu family\t: %d\n"
+ "model\t\t: %u\n"
+ "model name\t: %s\n",
+ cpu,
+ c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
+ c->x86,
+ c->x86_model,
+ c->x86_model_id[0] ? c->x86_model_id : "unknown");
+
+ if (c->x86_stepping || c->cpuid_level >= 0)
+ seq_printf(m, "stepping\t: %d\n", c->x86_stepping);
+ else
+ seq_puts(m, "stepping\t: unknown\n");
+ if (c->microcode)
+ seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
+
+ if (cpu_has(c, X86_FEATURE_TSC)) {
+ unsigned int freq = aperfmperf_get_khz(cpu);
+
+ if (!freq)
+ freq = cpufreq_quick_get(cpu);
+ if (!freq)
+ freq = cpu_khz;
+ seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
+ freq / 1000, (freq % 1000));
+ }
+
+ /* Cache size */
+ if (c->x86_cache_size)
+ seq_printf(m, "cache size\t: %u KB\n", c->x86_cache_size);
+
+ show_cpuinfo_core(m, c, cpu);
+ show_cpuinfo_misc(m, c);
+
+ seq_puts(m, "flags\t\t:");
+ for (i = 0; i < 32*NCAPINTS; i++)
+ if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
+ seq_printf(m, " %s", x86_cap_flags[i]);
+
+ seq_puts(m, "\nbugs\t\t:");
+ for (i = 0; i < 32*NBUGINTS; i++) {
+ unsigned int bug_bit = 32*NCAPINTS + i;
+
+ if (cpu_has_bug(c, bug_bit) && x86_bug_flags[i])
+ seq_printf(m, " %s", x86_bug_flags[i]);
+ }
+
+ seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
+ c->loops_per_jiffy/(500000/HZ),
+ (c->loops_per_jiffy/(5000/HZ)) % 100);
+
+#ifdef CONFIG_X86_64
+ if (c->x86_tlbsize > 0)
+ seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
+#endif
+ seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size);
+ seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
+ seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
+ c->x86_phys_bits, c->x86_virt_bits);
+
+ seq_puts(m, "power management:");
+ for (i = 0; i < 32; i++) {
+ if (c->x86_power & (1 << i)) {
+ if (i < ARRAY_SIZE(x86_power_flags) &&
+ x86_power_flags[i])
+ seq_printf(m, "%s%s",
+ x86_power_flags[i][0] ? " " : "",
+ x86_power_flags[i]);
+ else
+ seq_printf(m, " [%d]", i);
+ }
+ }
+
+ seq_puts(m, "\n\n");
+
+ return 0;
+}
+
+static void *c_start(struct seq_file *m, loff_t *pos)
+{
+ *pos = cpumask_next(*pos - 1, cpu_online_mask);
+ if ((*pos) < nr_cpu_ids)
+ return &cpu_data(*pos);
+ return NULL;
+}
+
+static void *c_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return c_start(m, pos);
+}
+
+static void c_stop(struct seq_file *m, void *v)
+{
+}
+
+const struct seq_operations cpuinfo_op = {
+ .start = c_start,
+ .next = c_next,
+ .stop = c_stop,
+ .show = show_cpuinfo,
+};
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
new file mode 100644
index 000000000..cfa97ff67
--- /dev/null
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -0,0 +1,59 @@
+/*
+ * This file is part of the Linux kernel.
+ *
+ * Copyright (c) 2011, Intel Corporation
+ * Authors: Fenghua Yu <fenghua.yu@intel.com>,
+ * H. Peter Anvin <hpa@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <asm/processor.h>
+#include <asm/archrandom.h>
+#include <asm/sections.h>
+
+static int __init x86_rdrand_setup(char *s)
+{
+ setup_clear_cpu_cap(X86_FEATURE_RDRAND);
+ setup_clear_cpu_cap(X86_FEATURE_RDSEED);
+ return 1;
+}
+__setup("nordrand", x86_rdrand_setup);
+
+/*
+ * RDRAND has Built-In-Self-Test (BIST) that runs on every invocation.
+ * Run the instruction a few times as a sanity check.
+ * If it fails, it is simple to disable RDRAND here.
+ */
+#define SANITY_CHECK_LOOPS 8
+
+#ifdef CONFIG_ARCH_RANDOM
+void x86_init_rdrand(struct cpuinfo_x86 *c)
+{
+ unsigned long tmp;
+ int i;
+
+ if (!cpu_has(c, X86_FEATURE_RDRAND))
+ return;
+
+ for (i = 0; i < SANITY_CHECK_LOOPS; i++) {
+ if (!rdrand_long(&tmp)) {
+ clear_cpu_cap(c, X86_FEATURE_RDRAND);
+ pr_warn_once("rdrand: disabled\n");
+ return;
+ }
+ }
+}
+#endif
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
new file mode 100644
index 000000000..5a52672e3
--- /dev/null
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -0,0 +1,86 @@
+/*
+ * Routines to identify additional cpu features that are scattered in
+ * cpuid space.
+ */
+#include <linux/cpu.h>
+
+#include <asm/pat.h>
+#include <asm/processor.h>
+
+#include <asm/apic.h>
+
+struct cpuid_bit {
+ u16 feature;
+ u8 reg;
+ u8 bit;
+ u32 level;
+ u32 sub_leaf;
+};
+
+/* Please keep the leaf sorted by cpuid_bit.level for faster search. */
+static const struct cpuid_bit cpuid_bits[] = {
+ { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
+ { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
+ { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 },
+ { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 },
+ { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 },
+ { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
+ { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
+ { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
+ { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
+ { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
+ { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
+ { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
+ { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
+ { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 },
+ { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 },
+ { 0, 0, 0, 0, 0 }
+};
+
+void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
+{
+ u32 max_level;
+ u32 regs[4];
+ const struct cpuid_bit *cb;
+
+ for (cb = cpuid_bits; cb->feature; cb++) {
+
+ /* Verify that the level is valid */
+ max_level = cpuid_eax(cb->level & 0xffff0000);
+ if (max_level < cb->level ||
+ max_level > (cb->level | 0xffff))
+ continue;
+
+ cpuid_count(cb->level, cb->sub_leaf, &regs[CPUID_EAX],
+ &regs[CPUID_EBX], &regs[CPUID_ECX],
+ &regs[CPUID_EDX]);
+
+ if (regs[cb->reg] & (1 << cb->bit))
+ set_cpu_cap(c, cb->feature);
+ }
+}
+
+u32 get_scattered_cpuid_leaf(unsigned int level, unsigned int sub_leaf,
+ enum cpuid_regs_idx reg)
+{
+ const struct cpuid_bit *cb;
+ u32 cpuid_val = 0;
+
+ for (cb = cpuid_bits; cb->feature; cb++) {
+
+ if (level > cb->level)
+ continue;
+
+ if (level < cb->level)
+ break;
+
+ if (reg == cb->reg && sub_leaf == cb->sub_leaf) {
+ if (cpu_has(&boot_cpu_data, cb->feature))
+ cpuid_val |= BIT(cb->bit);
+ }
+ }
+
+ return cpuid_val;
+}
+EXPORT_SYMBOL_GPL(get_scattered_cpuid_leaf);
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
new file mode 100644
index 000000000..71ca064e3
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Check for extended topology enumeration cpuid leaf 0xb and if it
+ * exists, use it for populating initial_apicid and cpu topology
+ * detection.
+ */
+
+#include <linux/cpu.h>
+#include <asm/apic.h>
+#include <asm/pat.h>
+#include <asm/processor.h>
+
+/* leaf 0xb SMT level */
+#define SMT_LEVEL 0
+
+/* leaf 0xb sub-leaf types */
+#define INVALID_TYPE 0
+#define SMT_TYPE 1
+#define CORE_TYPE 2
+
+#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
+#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
+#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
+
+int detect_extended_topology_early(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ unsigned int eax, ebx, ecx, edx;
+
+ if (c->cpuid_level < 0xb)
+ return -1;
+
+ cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
+
+ /*
+ * check if the cpuid leaf 0xb is actually implemented.
+ */
+ if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
+ return -1;
+
+ set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
+
+ /*
+ * initial apic id, which also represents 32-bit extended x2apic id.
+ */
+ c->initial_apicid = edx;
+ smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
+#endif
+ return 0;
+}
+
+/*
+ * Check for extended topology enumeration cpuid leaf 0xb and if it
+ * exists, use it for populating initial_apicid and cpu topology
+ * detection.
+ */
+int detect_extended_topology(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ unsigned int eax, ebx, ecx, edx, sub_index;
+ unsigned int ht_mask_width, core_plus_mask_width;
+ unsigned int core_select_mask, core_level_siblings;
+
+ if (detect_extended_topology_early(c) < 0)
+ return -1;
+
+ /*
+ * Populate HT related information from sub-leaf level 0.
+ */
+ cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
+ core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
+ core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
+
+ sub_index = 1;
+ do {
+ cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
+
+ /*
+ * Check for the Core type in the implemented sub leaves.
+ */
+ if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
+ core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
+ core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
+ break;
+ }
+
+ sub_index++;
+ } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
+
+ core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
+
+ c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width)
+ & core_select_mask;
+ c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width);
+ /*
+ * Reinit the apicid, now that we have extended initial_apicid.
+ */
+ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+
+ c->x86_max_cores = (core_level_siblings / smp_num_siblings);
+#endif
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
new file mode 100644
index 000000000..42c939827
--- /dev/null
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/mm.h>
+#include <asm/cpufeature.h>
+#include <asm/msr.h>
+#include "cpu.h"
+
+static void early_init_transmeta(struct cpuinfo_x86 *c)
+{
+ u32 xlvl;
+
+ /* Transmeta-defined flags: level 0x80860001 */
+ xlvl = cpuid_eax(0x80860000);
+ if ((xlvl & 0xffff0000) == 0x80860000) {
+ if (xlvl >= 0x80860001)
+ c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001);
+ }
+}
+
+static void init_transmeta(struct cpuinfo_x86 *c)
+{
+ unsigned int cap_mask, uk, max, dummy;
+ unsigned int cms_rev1, cms_rev2;
+ unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
+ char cpu_info[65];
+
+ early_init_transmeta(c);
+
+ cpu_detect_cache_sizes(c);
+
+ /* Print CMS and CPU revision */
+ max = cpuid_eax(0x80860000);
+ cpu_rev = 0;
+ if (max >= 0x80860001) {
+ cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags);
+ if (cpu_rev != 0x02000000) {
+ pr_info("CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
+ (cpu_rev >> 24) & 0xff,
+ (cpu_rev >> 16) & 0xff,
+ (cpu_rev >> 8) & 0xff,
+ cpu_rev & 0xff,
+ cpu_freq);
+ }
+ }
+ if (max >= 0x80860002) {
+ cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy);
+ if (cpu_rev == 0x02000000) {
+ pr_info("CPU: Processor revision %08X, %u MHz\n",
+ new_cpu_rev, cpu_freq);
+ }
+ pr_info("CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n",
+ (cms_rev1 >> 24) & 0xff,
+ (cms_rev1 >> 16) & 0xff,
+ (cms_rev1 >> 8) & 0xff,
+ cms_rev1 & 0xff,
+ cms_rev2);
+ }
+ if (max >= 0x80860006) {
+ cpuid(0x80860003,
+ (void *)&cpu_info[0],
+ (void *)&cpu_info[4],
+ (void *)&cpu_info[8],
+ (void *)&cpu_info[12]);
+ cpuid(0x80860004,
+ (void *)&cpu_info[16],
+ (void *)&cpu_info[20],
+ (void *)&cpu_info[24],
+ (void *)&cpu_info[28]);
+ cpuid(0x80860005,
+ (void *)&cpu_info[32],
+ (void *)&cpu_info[36],
+ (void *)&cpu_info[40],
+ (void *)&cpu_info[44]);
+ cpuid(0x80860006,
+ (void *)&cpu_info[48],
+ (void *)&cpu_info[52],
+ (void *)&cpu_info[56],
+ (void *)&cpu_info[60]);
+ cpu_info[64] = '\0';
+ pr_info("CPU: %s\n", cpu_info);
+ }
+
+ /* Unhide possibly hidden capability flags */
+ rdmsr(0x80860004, cap_mask, uk);
+ wrmsr(0x80860004, ~0, uk);
+ c->x86_capability[CPUID_1_EDX] = cpuid_edx(0x00000001);
+ wrmsr(0x80860004, cap_mask, uk);
+
+ /* All Transmeta CPUs have a constant TSC */
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
+#ifdef CONFIG_SYSCTL
+ /*
+ * randomize_va_space slows us down enormously;
+ * it probably triggers retranslation of x86->native bytecode
+ */
+ randomize_va_space = 0;
+#endif
+}
+
+static const struct cpu_dev transmeta_cpu_dev = {
+ .c_vendor = "Transmeta",
+ .c_ident = { "GenuineTMx86", "TransmetaCPU" },
+ .c_early_init = early_init_transmeta,
+ .c_init = init_transmeta,
+ .c_x86_vendor = X86_VENDOR_TRANSMETA,
+};
+
+cpu_dev_register(transmeta_cpu_dev);
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
new file mode 100644
index 000000000..032509adf
--- /dev/null
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Transactional Synchronization Extensions (TSX) control.
+ *
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Author:
+ * Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
+ */
+
+#include <linux/cpufeature.h>
+
+#include <asm/cmdline.h>
+
+#include "cpu.h"
+
+enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED;
+
+void tsx_disable(void)
+{
+ u64 tsx;
+
+ rdmsrl(MSR_IA32_TSX_CTRL, tsx);
+
+ /* Force all transactions to immediately abort */
+ tsx |= TSX_CTRL_RTM_DISABLE;
+
+ /*
+ * Ensure TSX support is not enumerated in CPUID.
+ * This is visible to userspace and will ensure they
+ * do not waste resources trying TSX transactions that
+ * will always abort.
+ */
+ tsx |= TSX_CTRL_CPUID_CLEAR;
+
+ wrmsrl(MSR_IA32_TSX_CTRL, tsx);
+}
+
+void tsx_enable(void)
+{
+ u64 tsx;
+
+ rdmsrl(MSR_IA32_TSX_CTRL, tsx);
+
+ /* Enable the RTM feature in the cpu */
+ tsx &= ~TSX_CTRL_RTM_DISABLE;
+
+ /*
+ * Ensure TSX support is enumerated in CPUID.
+ * This is visible to userspace and will ensure they
+ * can enumerate and use the TSX feature.
+ */
+ tsx &= ~TSX_CTRL_CPUID_CLEAR;
+
+ wrmsrl(MSR_IA32_TSX_CTRL, tsx);
+}
+
+static bool __init tsx_ctrl_is_supported(void)
+{
+ u64 ia32_cap = x86_read_arch_cap_msr();
+
+ /*
+ * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this
+ * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES.
+ *
+ * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a
+ * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES
+ * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get
+ * MSR_IA32_TSX_CTRL support even after a microcode update. Thus,
+ * tsx= cmdline requests will do nothing on CPUs without
+ * MSR_IA32_TSX_CTRL support.
+ */
+ return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR);
+}
+
+static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
+{
+ if (boot_cpu_has_bug(X86_BUG_TAA))
+ return TSX_CTRL_DISABLE;
+
+ return TSX_CTRL_ENABLE;
+}
+
+void __init tsx_init(void)
+{
+ char arg[5] = {};
+ int ret;
+
+ if (!tsx_ctrl_is_supported())
+ return;
+
+ ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg));
+ if (ret >= 0) {
+ if (!strcmp(arg, "on")) {
+ tsx_ctrl_state = TSX_CTRL_ENABLE;
+ } else if (!strcmp(arg, "off")) {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ } else if (!strcmp(arg, "auto")) {
+ tsx_ctrl_state = x86_get_tsx_auto_mode();
+ } else {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ pr_err("tsx: invalid option, defaulting to off\n");
+ }
+ } else {
+ /* tsx= not provided */
+ if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO))
+ tsx_ctrl_state = x86_get_tsx_auto_mode();
+ else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF))
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ else
+ tsx_ctrl_state = TSX_CTRL_ENABLE;
+ }
+
+ if (tsx_ctrl_state == TSX_CTRL_DISABLE) {
+ tsx_disable();
+
+ /*
+ * tsx_disable() will change the state of the RTM and HLE CPUID
+ * bits. Clear them here since they are now expected to be not
+ * set.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RTM);
+ setup_clear_cpu_cap(X86_FEATURE_HLE);
+ } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) {
+
+ /*
+ * HW defaults TSX to be enabled at bootup.
+ * We may still need the TSX enable support
+ * during init for special cases like
+ * kexec after TSX is disabled.
+ */
+ tsx_enable();
+
+ /*
+ * tsx_enable() will change the state of the RTM and HLE CPUID
+ * bits. Force them here since they are now expected to be set.
+ */
+ setup_force_cpu_cap(X86_FEATURE_RTM);
+ setup_force_cpu_cap(X86_FEATURE_HLE);
+ }
+}
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
new file mode 100644
index 000000000..65a58a390
--- /dev/null
+++ b/arch/x86/kernel/cpu/umc.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <asm/processor.h>
+#include "cpu.h"
+
+/*
+ * UMC chips appear to be only either 386 or 486,
+ * so no special init takes place.
+ */
+
+static const struct cpu_dev umc_cpu_dev = {
+ .c_vendor = "UMC",
+ .c_ident = { "UMC UMC UMC" },
+ .legacy_models = {
+ { .family = 4, .model_names =
+ {
+ [1] = "U5D",
+ [2] = "U5S",
+ }
+ },
+ },
+ .c_x86_vendor = X86_VENDOR_UMC,
+};
+
+cpu_dev_register(umc_cpu_dev);
+
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
new file mode 100644
index 000000000..d805202c6
--- /dev/null
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -0,0 +1,214 @@
+/*
+ * VMware Detection code.
+ *
+ * Copyright (C) 2008, VMware, Inc.
+ * Author : Alok N Kataria <akataria@vmware.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <linux/dmi.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/clocksource.h>
+#include <asm/div64.h>
+#include <asm/x86_init.h>
+#include <asm/hypervisor.h>
+#include <asm/timer.h>
+#include <asm/apic.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "vmware: " fmt
+
+#define CPUID_VMWARE_INFO_LEAF 0x40000000
+#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
+#define VMWARE_HYPERVISOR_PORT 0x5658
+
+#define VMWARE_PORT_CMD_GETVERSION 10
+#define VMWARE_PORT_CMD_GETHZ 45
+#define VMWARE_PORT_CMD_GETVCPU_INFO 68
+#define VMWARE_PORT_CMD_LEGACY_X2APIC 3
+#define VMWARE_PORT_CMD_VCPU_RESERVED 31
+
+#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
+ __asm__("inl (%%dx)" : \
+ "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
+ "0"(VMWARE_HYPERVISOR_MAGIC), \
+ "1"(VMWARE_PORT_CMD_##cmd), \
+ "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \
+ "memory");
+
+static unsigned long vmware_tsc_khz __ro_after_init;
+
+static inline int __vmware_platform(void)
+{
+ uint32_t eax, ebx, ecx, edx;
+ VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx);
+ return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
+}
+
+static unsigned long vmware_get_tsc_khz(void)
+{
+ return vmware_tsc_khz;
+}
+
+#ifdef CONFIG_PARAVIRT
+static struct cyc2ns_data vmware_cyc2ns __ro_after_init;
+static int vmw_sched_clock __initdata = 1;
+
+static __init int setup_vmw_sched_clock(char *s)
+{
+ vmw_sched_clock = 0;
+ return 0;
+}
+early_param("no-vmw-sched-clock", setup_vmw_sched_clock);
+
+static unsigned long long notrace vmware_sched_clock(void)
+{
+ unsigned long long ns;
+
+ ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul,
+ vmware_cyc2ns.cyc2ns_shift);
+ ns -= vmware_cyc2ns.cyc2ns_offset;
+ return ns;
+}
+
+static void __init vmware_sched_clock_setup(void)
+{
+ struct cyc2ns_data *d = &vmware_cyc2ns;
+ unsigned long long tsc_now = rdtsc();
+
+ clocks_calc_mult_shift(&d->cyc2ns_mul, &d->cyc2ns_shift,
+ vmware_tsc_khz, NSEC_PER_MSEC, 0);
+ d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul,
+ d->cyc2ns_shift);
+
+ pv_time_ops.sched_clock = vmware_sched_clock;
+ pr_info("using sched offset of %llu ns\n", d->cyc2ns_offset);
+}
+
+static void __init vmware_paravirt_ops_setup(void)
+{
+ pv_info.name = "VMware hypervisor";
+ pv_cpu_ops.io_delay = paravirt_nop;
+
+ if (vmware_tsc_khz && vmw_sched_clock)
+ vmware_sched_clock_setup();
+}
+#else
+#define vmware_paravirt_ops_setup() do {} while (0)
+#endif
+
+/*
+ * VMware hypervisor takes care of exporting a reliable TSC to the guest.
+ * Still, due to timing difference when running on virtual cpus, the TSC can
+ * be marked as unstable in some cases. For example, the TSC sync check at
+ * bootup can fail due to a marginal offset between vcpus' TSCs (though the
+ * TSCs do not drift from each other). Also, the ACPI PM timer clocksource
+ * is not suitable as a watchdog when running on a hypervisor because the
+ * kernel may miss a wrap of the counter if the vcpu is descheduled for a
+ * long time. To skip these checks at runtime we set these capability bits,
+ * so that the kernel could just trust the hypervisor with providing a
+ * reliable virtual TSC that is suitable for timekeeping.
+ */
+static void __init vmware_set_capabilities(void)
+{
+ setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC);
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+}
+
+static void __init vmware_platform_setup(void)
+{
+ uint32_t eax, ebx, ecx, edx;
+ uint64_t lpj, tsc_khz;
+
+ VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
+
+ if (ebx != UINT_MAX) {
+ lpj = tsc_khz = eax | (((uint64_t)ebx) << 32);
+ do_div(tsc_khz, 1000);
+ WARN_ON(tsc_khz >> 32);
+ pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
+ (unsigned long) tsc_khz / 1000,
+ (unsigned long) tsc_khz % 1000);
+
+ if (!preset_lpj) {
+ do_div(lpj, HZ);
+ preset_lpj = lpj;
+ }
+
+ vmware_tsc_khz = tsc_khz;
+ x86_platform.calibrate_tsc = vmware_get_tsc_khz;
+ x86_platform.calibrate_cpu = vmware_get_tsc_khz;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ /* Skip lapic calibration since we know the bus frequency. */
+ lapic_timer_frequency = ecx / HZ;
+ pr_info("Host bus clock speed read from hypervisor : %u Hz\n",
+ ecx);
+#endif
+ } else {
+ pr_warn("Failed to get TSC freq from the hypervisor\n");
+ }
+
+ vmware_paravirt_ops_setup();
+
+#ifdef CONFIG_X86_IO_APIC
+ no_timer_check = 1;
+#endif
+
+ vmware_set_capabilities();
+}
+
+/*
+ * While checking the dmi string information, just checking the product
+ * serial key should be enough, as this will always have a VMware
+ * specific string when running under VMware hypervisor.
+ */
+static uint32_t __init vmware_platform(void)
+{
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ unsigned int eax;
+ unsigned int hyper_vendor_id[3];
+
+ cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
+ &hyper_vendor_id[1], &hyper_vendor_id[2]);
+ if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
+ return CPUID_VMWARE_INFO_LEAF;
+ } else if (dmi_available && dmi_name_in_serial("VMware") &&
+ __vmware_platform())
+ return 1;
+
+ return 0;
+}
+
+/* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */
+static bool __init vmware_legacy_x2apic_available(void)
+{
+ uint32_t eax, ebx, ecx, edx;
+ VMWARE_PORT(GETVCPU_INFO, eax, ebx, ecx, edx);
+ return (eax & (1 << VMWARE_PORT_CMD_VCPU_RESERVED)) == 0 &&
+ (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0;
+}
+
+const __initconst struct hypervisor_x86 x86_hyper_vmware = {
+ .name = "VMware",
+ .detect = vmware_platform,
+ .type = X86_HYPER_VMWARE,
+ .init.init_platform = vmware_platform_setup,
+ .init.x2apic_available = vmware_legacy_x2apic_available,
+};
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
new file mode 100644
index 000000000..1d300f96d
--- /dev/null
+++ b/arch/x86/kernel/cpuid.c
@@ -0,0 +1,196 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
+ * USA; either version 2 of the License, or (at your option) any later
+ * version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * x86 CPUID access device
+ *
+ * This device is accessed by lseek() to the appropriate CPUID level
+ * and then read in chunks of 16 bytes. A larger size means multiple
+ * reads of consecutive levels.
+ *
+ * The lower 32 bits of the file position is used as the incoming %eax,
+ * and the upper 32 bits of the file position as the incoming %ecx,
+ * the latter intended for "counting" eax levels like eax=4.
+ *
+ * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on
+ * an SMP box will direct the access to CPU %d.
+ */
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/smp.h>
+#include <linux/major.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/uaccess.h>
+#include <linux/gfp.h>
+#include <linux/completion.h>
+
+#include <asm/processor.h>
+#include <asm/msr.h>
+
+static struct class *cpuid_class;
+static enum cpuhp_state cpuhp_cpuid_state;
+
+struct cpuid_regs_done {
+ struct cpuid_regs regs;
+ struct completion done;
+};
+
+static void cpuid_smp_cpuid(void *cmd_block)
+{
+ struct cpuid_regs_done *cmd = cmd_block;
+
+ cpuid_count(cmd->regs.eax, cmd->regs.ecx,
+ &cmd->regs.eax, &cmd->regs.ebx,
+ &cmd->regs.ecx, &cmd->regs.edx);
+
+ complete(&cmd->done);
+}
+
+static ssize_t cpuid_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char __user *tmp = buf;
+ struct cpuid_regs_done cmd;
+ int cpu = iminor(file_inode(file));
+ u64 pos = *ppos;
+ ssize_t bytes = 0;
+ int err = 0;
+
+ if (count % 16)
+ return -EINVAL; /* Invalid chunk size */
+
+ init_completion(&cmd.done);
+ for (; count; count -= 16) {
+ call_single_data_t csd = {
+ .func = cpuid_smp_cpuid,
+ .info = &cmd,
+ };
+
+ cmd.regs.eax = pos;
+ cmd.regs.ecx = pos >> 32;
+
+ err = smp_call_function_single_async(cpu, &csd);
+ if (err)
+ break;
+ wait_for_completion(&cmd.done);
+ if (copy_to_user(tmp, &cmd.regs, 16)) {
+ err = -EFAULT;
+ break;
+ }
+ tmp += 16;
+ bytes += 16;
+ *ppos = ++pos;
+ reinit_completion(&cmd.done);
+ }
+
+ return bytes ? bytes : err;
+}
+
+static int cpuid_open(struct inode *inode, struct file *file)
+{
+ unsigned int cpu;
+ struct cpuinfo_x86 *c;
+
+ cpu = iminor(file_inode(file));
+ if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+ return -ENXIO; /* No such CPU */
+
+ c = &cpu_data(cpu);
+ if (c->cpuid_level < 0)
+ return -EIO; /* CPUID not supported */
+
+ return 0;
+}
+
+/*
+ * File operations we support
+ */
+static const struct file_operations cpuid_fops = {
+ .owner = THIS_MODULE,
+ .llseek = no_seek_end_llseek,
+ .read = cpuid_read,
+ .open = cpuid_open,
+};
+
+static int cpuid_device_create(unsigned int cpu)
+{
+ struct device *dev;
+
+ dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL,
+ "cpu%d", cpu);
+ return PTR_ERR_OR_ZERO(dev);
+}
+
+static int cpuid_device_destroy(unsigned int cpu)
+{
+ device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
+ return 0;
+}
+
+static char *cpuid_devnode(struct device *dev, umode_t *mode)
+{
+ return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
+}
+
+static int __init cpuid_init(void)
+{
+ int err;
+
+ if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS,
+ "cpu/cpuid", &cpuid_fops)) {
+ printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
+ CPUID_MAJOR);
+ return -EBUSY;
+ }
+ cpuid_class = class_create(THIS_MODULE, "cpuid");
+ if (IS_ERR(cpuid_class)) {
+ err = PTR_ERR(cpuid_class);
+ goto out_chrdev;
+ }
+ cpuid_class->devnode = cpuid_devnode;
+
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/cpuid:online",
+ cpuid_device_create, cpuid_device_destroy);
+ if (err < 0)
+ goto out_class;
+
+ cpuhp_cpuid_state = err;
+ return 0;
+
+out_class:
+ class_destroy(cpuid_class);
+out_chrdev:
+ __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
+ return err;
+}
+module_init(cpuid_init);
+
+static void __exit cpuid_exit(void)
+{
+ cpuhp_remove_state(cpuhp_cpuid_state);
+ class_destroy(cpuid_class);
+ __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
+}
+module_exit(cpuid_exit);
+
+MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
+MODULE_DESCRIPTION("x86 generic CPUID driver");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
new file mode 100644
index 000000000..91b3483e5
--- /dev/null
+++ b/arch/x86/kernel/crash.c
@@ -0,0 +1,483 @@
+/*
+ * Architecture specific (i386/x86_64) functions for kexec based crash dumps.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *
+ * Copyright (C) IBM Corporation, 2004. All rights reserved.
+ * Copyright (C) Red Hat Inc., 2014. All rights reserved.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ */
+
+#define pr_fmt(fmt) "kexec: " fmt
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/reboot.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <asm/processor.h>
+#include <asm/hardirq.h>
+#include <asm/nmi.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/e820/types.h>
+#include <asm/io_apic.h>
+#include <asm/hpet.h>
+#include <linux/kdebug.h>
+#include <asm/cpu.h>
+#include <asm/reboot.h>
+#include <asm/virtext.h>
+#include <asm/intel_pt.h>
+
+/* Used while preparing memory map entries for second kernel */
+struct crash_memmap_data {
+ struct boot_params *params;
+ /* Type of memory */
+ unsigned int type;
+};
+
+/*
+ * This is used to VMCLEAR all VMCSs loaded on the
+ * processor. And when loading kvm_intel module, the
+ * callback function pointer will be assigned.
+ *
+ * protected by rcu.
+ */
+crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
+EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
+unsigned long crash_zero_bytes;
+
+static inline void cpu_crash_vmclear_loaded_vmcss(void)
+{
+ crash_vmclear_fn *do_vmclear_operation = NULL;
+
+ rcu_read_lock();
+ do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
+ if (do_vmclear_operation)
+ do_vmclear_operation();
+ rcu_read_unlock();
+}
+
+#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
+
+static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_32
+ struct pt_regs fixed_regs;
+
+ if (!user_mode(regs)) {
+ crash_fixup_ss_esp(&fixed_regs, regs);
+ regs = &fixed_regs;
+ }
+#endif
+ crash_save_cpu(regs, cpu);
+
+ /*
+ * VMCLEAR VMCSs loaded on all cpus if needed.
+ */
+ cpu_crash_vmclear_loaded_vmcss();
+
+ /* Disable VMX or SVM if needed.
+ *
+ * We need to disable virtualization on all CPUs.
+ * Having VMX or SVM enabled on any CPU may break rebooting
+ * after the kdump kernel has finished its task.
+ */
+ cpu_emergency_vmxoff();
+ cpu_emergency_svm_disable();
+
+ /*
+ * Disable Intel PT to stop its logging
+ */
+ cpu_emergency_stop_pt();
+
+ disable_local_APIC();
+}
+
+void kdump_nmi_shootdown_cpus(void)
+{
+ nmi_shootdown_cpus(kdump_nmi_callback);
+
+ disable_local_APIC();
+}
+
+/* Override the weak function in kernel/panic.c */
+void crash_smp_send_stop(void)
+{
+ static int cpus_stopped;
+
+ if (cpus_stopped)
+ return;
+
+ if (smp_ops.crash_stop_other_cpus)
+ smp_ops.crash_stop_other_cpus();
+ else
+ smp_send_stop();
+
+ cpus_stopped = 1;
+}
+
+#else
+void crash_smp_send_stop(void)
+{
+ /* There are no cpus to shootdown */
+}
+#endif
+
+void native_machine_crash_shutdown(struct pt_regs *regs)
+{
+ /* This function is only called after the system
+ * has panicked or is otherwise in a critical state.
+ * The minimum amount of code to allow a kexec'd kernel
+ * to run successfully needs to happen here.
+ *
+ * In practice this means shooting down the other cpus in
+ * an SMP system.
+ */
+ /* The kernel is broken so disable interrupts */
+ local_irq_disable();
+
+ crash_smp_send_stop();
+
+ /*
+ * VMCLEAR VMCSs loaded on this cpu if needed.
+ */
+ cpu_crash_vmclear_loaded_vmcss();
+
+ /* Booting kdump kernel with VMX or SVM enabled won't work,
+ * because (among other limitations) we can't disable paging
+ * with the virt flags.
+ */
+ cpu_emergency_vmxoff();
+ cpu_emergency_svm_disable();
+
+ /*
+ * Disable Intel PT to stop its logging
+ */
+ cpu_emergency_stop_pt();
+
+#ifdef CONFIG_X86_IO_APIC
+ /* Prevent crash_kexec() from deadlocking on ioapic_lock. */
+ ioapic_zap_locks();
+ clear_IO_APIC();
+#endif
+ lapic_shutdown();
+ restore_boot_irq_mode();
+#ifdef CONFIG_HPET_TIMER
+ hpet_disable();
+#endif
+ crash_save_cpu(regs, safe_smp_processor_id());
+}
+
+#ifdef CONFIG_KEXEC_FILE
+static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
+{
+ unsigned int *nr_ranges = arg;
+
+ (*nr_ranges)++;
+ return 0;
+}
+
+/* Gather all the required information to prepare elf headers for ram regions */
+static struct crash_mem *fill_up_crash_elf_data(void)
+{
+ unsigned int nr_ranges = 0;
+ struct crash_mem *cmem;
+
+ walk_system_ram_res(0, -1, &nr_ranges,
+ get_nr_ram_ranges_callback);
+ if (!nr_ranges)
+ return NULL;
+
+ /*
+ * Exclusion of crash region and/or crashk_low_res may cause
+ * another range split. So add extra two slots here.
+ */
+ nr_ranges += 2;
+ cmem = vzalloc(sizeof(struct crash_mem) +
+ sizeof(struct crash_mem_range) * nr_ranges);
+ if (!cmem)
+ return NULL;
+
+ cmem->max_nr_ranges = nr_ranges;
+ cmem->nr_ranges = 0;
+
+ return cmem;
+}
+
+/*
+ * Look for any unwanted ranges between mstart, mend and remove them. This
+ * might lead to split and split ranges are put in cmem->ranges[] array
+ */
+static int elf_header_exclude_ranges(struct crash_mem *cmem)
+{
+ int ret = 0;
+
+ /* Exclude crashkernel region */
+ ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
+ if (ret)
+ return ret;
+
+ if (crashk_low_res.end) {
+ ret = crash_exclude_mem_range(cmem, crashk_low_res.start,
+ crashk_low_res.end);
+ if (ret)
+ return ret;
+ }
+
+ return ret;
+}
+
+static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
+{
+ struct crash_mem *cmem = arg;
+
+ cmem->ranges[cmem->nr_ranges].start = res->start;
+ cmem->ranges[cmem->nr_ranges].end = res->end;
+ cmem->nr_ranges++;
+
+ return 0;
+}
+
+/* Prepare elf headers. Return addr and size */
+static int prepare_elf_headers(struct kimage *image, void **addr,
+ unsigned long *sz)
+{
+ struct crash_mem *cmem;
+ Elf64_Ehdr *ehdr;
+ Elf64_Phdr *phdr;
+ int ret, i;
+
+ cmem = fill_up_crash_elf_data();
+ if (!cmem)
+ return -ENOMEM;
+
+ ret = walk_system_ram_res(0, -1, cmem,
+ prepare_elf64_ram_headers_callback);
+ if (ret)
+ goto out;
+
+ /* Exclude unwanted mem ranges */
+ ret = elf_header_exclude_ranges(cmem);
+ if (ret)
+ goto out;
+
+ /* By default prepare 64bit headers */
+ ret = crash_prepare_elf64_headers(cmem,
+ IS_ENABLED(CONFIG_X86_64), addr, sz);
+ if (ret)
+ goto out;
+
+ /*
+ * If a range matches backup region, adjust offset to backup
+ * segment.
+ */
+ ehdr = (Elf64_Ehdr *)*addr;
+ phdr = (Elf64_Phdr *)(ehdr + 1);
+ for (i = 0; i < ehdr->e_phnum; phdr++, i++)
+ if (phdr->p_type == PT_LOAD &&
+ phdr->p_paddr == image->arch.backup_src_start &&
+ phdr->p_memsz == image->arch.backup_src_sz) {
+ phdr->p_offset = image->arch.backup_load_addr;
+ break;
+ }
+out:
+ vfree(cmem);
+ return ret;
+}
+
+static int add_e820_entry(struct boot_params *params, struct e820_entry *entry)
+{
+ unsigned int nr_e820_entries;
+
+ nr_e820_entries = params->e820_entries;
+ if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE)
+ return 1;
+
+ memcpy(&params->e820_table[nr_e820_entries], entry,
+ sizeof(struct e820_entry));
+ params->e820_entries++;
+ return 0;
+}
+
+static int memmap_entry_callback(struct resource *res, void *arg)
+{
+ struct crash_memmap_data *cmd = arg;
+ struct boot_params *params = cmd->params;
+ struct e820_entry ei;
+
+ ei.addr = res->start;
+ ei.size = resource_size(res);
+ ei.type = cmd->type;
+ add_e820_entry(params, &ei);
+
+ return 0;
+}
+
+static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
+ unsigned long long mstart,
+ unsigned long long mend)
+{
+ unsigned long start, end;
+ int ret = 0;
+
+ cmem->ranges[0].start = mstart;
+ cmem->ranges[0].end = mend;
+ cmem->nr_ranges = 1;
+
+ /* Exclude Backup region */
+ start = image->arch.backup_load_addr;
+ end = start + image->arch.backup_src_sz - 1;
+ ret = crash_exclude_mem_range(cmem, start, end);
+ if (ret)
+ return ret;
+
+ /* Exclude elf header region */
+ start = image->arch.elf_load_addr;
+ end = start + image->arch.elf_headers_sz - 1;
+ return crash_exclude_mem_range(cmem, start, end);
+}
+
+/* Prepare memory map for crash dump kernel */
+int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
+{
+ int i, ret = 0;
+ unsigned long flags;
+ struct e820_entry ei;
+ struct crash_memmap_data cmd;
+ struct crash_mem *cmem;
+
+ cmem = vzalloc(struct_size(cmem, ranges, 1));
+ if (!cmem)
+ return -ENOMEM;
+
+ memset(&cmd, 0, sizeof(struct crash_memmap_data));
+ cmd.params = params;
+
+ /* Add first 640K segment */
+ ei.addr = image->arch.backup_src_start;
+ ei.size = image->arch.backup_src_sz;
+ ei.type = E820_TYPE_RAM;
+ add_e820_entry(params, &ei);
+
+ /* Add ACPI tables */
+ cmd.type = E820_TYPE_ACPI;
+ flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd,
+ memmap_entry_callback);
+
+ /* Add ACPI Non-volatile Storage */
+ cmd.type = E820_TYPE_NVS;
+ walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
+ memmap_entry_callback);
+
+ /* Add crashk_low_res region */
+ if (crashk_low_res.end) {
+ ei.addr = crashk_low_res.start;
+ ei.size = crashk_low_res.end - crashk_low_res.start + 1;
+ ei.type = E820_TYPE_RAM;
+ add_e820_entry(params, &ei);
+ }
+
+ /* Exclude some ranges from crashk_res and add rest to memmap */
+ ret = memmap_exclude_ranges(image, cmem, crashk_res.start,
+ crashk_res.end);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < cmem->nr_ranges; i++) {
+ ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1;
+
+ /* If entry is less than a page, skip it */
+ if (ei.size < PAGE_SIZE)
+ continue;
+ ei.addr = cmem->ranges[i].start;
+ ei.type = E820_TYPE_RAM;
+ add_e820_entry(params, &ei);
+ }
+
+out:
+ vfree(cmem);
+ return ret;
+}
+
+static int determine_backup_region(struct resource *res, void *arg)
+{
+ struct kimage *image = arg;
+
+ image->arch.backup_src_start = res->start;
+ image->arch.backup_src_sz = resource_size(res);
+
+ /* Expecting only one range for backup region */
+ return 1;
+}
+
+int crash_load_segments(struct kimage *image)
+{
+ int ret;
+ struct kexec_buf kbuf = { .image = image, .buf_min = 0,
+ .buf_max = ULONG_MAX, .top_down = false };
+
+ /*
+ * Determine and load a segment for backup area. First 640K RAM
+ * region is backup source
+ */
+
+ ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END,
+ image, determine_backup_region);
+
+ /* Zero or postive return values are ok */
+ if (ret < 0)
+ return ret;
+
+ /* Add backup segment. */
+ if (image->arch.backup_src_sz) {
+ kbuf.buffer = &crash_zero_bytes;
+ kbuf.bufsz = sizeof(crash_zero_bytes);
+ kbuf.memsz = image->arch.backup_src_sz;
+ kbuf.buf_align = PAGE_SIZE;
+ /*
+ * Ideally there is no source for backup segment. This is
+ * copied in purgatory after crash. Just add a zero filled
+ * segment for now to make sure checksum logic works fine.
+ */
+ ret = kexec_add_buffer(&kbuf);
+ if (ret)
+ return ret;
+ image->arch.backup_load_addr = kbuf.mem;
+ pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n",
+ image->arch.backup_load_addr,
+ image->arch.backup_src_start, kbuf.memsz);
+ }
+
+ /* Prepare elf headers and add a segment */
+ ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz);
+ if (ret)
+ return ret;
+
+ image->arch.elf_headers = kbuf.buffer;
+ image->arch.elf_headers_sz = kbuf.bufsz;
+
+ kbuf.memsz = kbuf.bufsz;
+ kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
+ ret = kexec_add_buffer(&kbuf);
+ if (ret) {
+ vfree((void *)image->arch.elf_headers);
+ return ret;
+ }
+ image->arch.elf_load_addr = kbuf.mem;
+ pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ image->arch.elf_load_addr, kbuf.bufsz, kbuf.bufsz);
+
+ return ret;
+}
+#endif /* CONFIG_KEXEC_FILE */
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
new file mode 100644
index 000000000..33ee47670
--- /dev/null
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory preserving reboot related code.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ * Copyright (C) IBM Corporation, 2004. All rights reserved
+ */
+
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/highmem.h>
+#include <linux/crash_dump.h>
+
+#include <linux/uaccess.h>
+
+static void *kdump_buf_page;
+
+static inline bool is_crashed_pfn_valid(unsigned long pfn)
+{
+#ifndef CONFIG_X86_PAE
+ /*
+ * non-PAE kdump kernel executed from a PAE one will crop high pte
+ * bits and poke unwanted space counting again from address 0, we
+ * don't want that. pte must fit into unsigned long. In fact the
+ * test checks high 12 bits for being zero (pfn will be shifted left
+ * by PAGE_SHIFT).
+ */
+ return pte_pfn(pfn_pte(pfn, __pgprot(0))) == pfn;
+#else
+ return true;
+#endif
+}
+
+/**
+ * copy_oldmem_page - copy one page from "oldmem"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ * space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ * otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from "oldmem". For this page, there is no pte mapped
+ * in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ *
+ * Calling copy_to_user() in atomic context is not desirable. Hence first
+ * copying the data to a pre-allocated kernel page and then copying to user
+ * space in non-atomic context.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
+ size_t csize, unsigned long offset, int userbuf)
+{
+ void *vaddr;
+
+ if (!csize)
+ return 0;
+
+ if (!is_crashed_pfn_valid(pfn))
+ return -EFAULT;
+
+ vaddr = kmap_atomic_pfn(pfn);
+
+ if (!userbuf) {
+ memcpy(buf, (vaddr + offset), csize);
+ kunmap_atomic(vaddr);
+ } else {
+ if (!kdump_buf_page) {
+ printk(KERN_WARNING "Kdump: Kdump buffer page not"
+ " allocated\n");
+ kunmap_atomic(vaddr);
+ return -EFAULT;
+ }
+ copy_page(kdump_buf_page, vaddr);
+ kunmap_atomic(vaddr);
+ if (copy_to_user(buf, (kdump_buf_page + offset), csize))
+ return -EFAULT;
+ }
+
+ return csize;
+}
+
+static int __init kdump_buf_page_init(void)
+{
+ int ret = 0;
+
+ kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!kdump_buf_page) {
+ printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer"
+ " page\n");
+ ret = -ENOMEM;
+ }
+
+ return ret;
+}
+arch_initcall(kdump_buf_page_init);
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
new file mode 100644
index 000000000..4f2e0778f
--- /dev/null
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory preserving reboot related code.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ * Copyright (C) IBM Corporation, 2004. All rights reserved
+ */
+
+#include <linux/errno.h>
+#include <linux/crash_dump.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+
+/**
+ * copy_oldmem_page - copy one page from "oldmem"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ * space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ * otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from "oldmem". For this page, there is no pte mapped
+ * in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
+ size_t csize, unsigned long offset, int userbuf)
+{
+ void *vaddr;
+
+ if (!csize)
+ return 0;
+
+ vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
+ if (!vaddr)
+ return -ENOMEM;
+
+ if (userbuf) {
+ if (copy_to_user(buf, vaddr + offset, csize)) {
+ iounmap(vaddr);
+ return -EFAULT;
+ }
+ } else
+ memcpy(buf, vaddr + offset, csize);
+
+ set_iounmap_nonlazy();
+ iounmap(vaddr);
+ return csize;
+}
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
new file mode 100644
index 000000000..f39f3a06c
--- /dev/null
+++ b/arch/x86/kernel/devicetree.c
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Architecture specific OF callbacks.
+ */
+#include <linux/export.h>
+#include <linux/io.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+#include <linux/of_irq.h>
+#include <linux/libfdt.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/of_pci.h>
+#include <linux/initrd.h>
+
+#include <asm/irqdomain.h>
+#include <asm/hpet.h>
+#include <asm/apic.h>
+#include <asm/pci_x86.h>
+#include <asm/setup.h>
+#include <asm/i8259.h>
+
+__initdata u64 initial_dtb;
+char __initdata cmd_line[COMMAND_LINE_SIZE];
+
+int __initdata of_ioapic;
+
+void __init early_init_dt_scan_chosen_arch(unsigned long node)
+{
+ BUG();
+}
+
+void __init early_init_dt_add_memory_arch(u64 base, u64 size)
+{
+ BUG();
+}
+
+void __init add_dtb(u64 data)
+{
+ initial_dtb = data + offsetof(struct setup_data, data);
+}
+
+/*
+ * CE4100 ids. Will be moved to machine_device_initcall() once we have it.
+ */
+static struct of_device_id __initdata ce4100_ids[] = {
+ { .compatible = "intel,ce4100-cp", },
+ { .compatible = "isa", },
+ { .compatible = "pci", },
+ {},
+};
+
+static int __init add_bus_probe(void)
+{
+ if (!of_have_populated_dt())
+ return 0;
+
+ return of_platform_bus_probe(NULL, ce4100_ids, NULL);
+}
+device_initcall(add_bus_probe);
+
+#ifdef CONFIG_PCI
+struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
+{
+ struct device_node *np;
+
+ for_each_node_by_type(np, "pci") {
+ const void *prop;
+ unsigned int bus_min;
+
+ prop = of_get_property(np, "bus-range", NULL);
+ if (!prop)
+ continue;
+ bus_min = be32_to_cpup(prop);
+ if (bus->number == bus_min)
+ return np;
+ }
+ return NULL;
+}
+
+static int x86_of_pci_irq_enable(struct pci_dev *dev)
+{
+ u32 virq;
+ int ret;
+ u8 pin;
+
+ ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+ if (ret)
+ return ret;
+ if (!pin)
+ return 0;
+
+ virq = of_irq_parse_and_map_pci(dev, 0, 0);
+ if (virq == 0)
+ return -EINVAL;
+ dev->irq = virq;
+ return 0;
+}
+
+static void x86_of_pci_irq_disable(struct pci_dev *dev)
+{
+}
+
+void x86_of_pci_init(void)
+{
+ pcibios_enable_irq = x86_of_pci_irq_enable;
+ pcibios_disable_irq = x86_of_pci_irq_disable;
+}
+#endif
+
+static void __init dtb_setup_hpet(void)
+{
+#ifdef CONFIG_HPET_TIMER
+ struct device_node *dn;
+ struct resource r;
+ int ret;
+
+ dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-hpet");
+ if (!dn)
+ return;
+ ret = of_address_to_resource(dn, 0, &r);
+ if (ret) {
+ WARN_ON(1);
+ return;
+ }
+ hpet_address = r.start;
+#endif
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+
+static void __init dtb_cpu_setup(void)
+{
+ struct device_node *dn;
+ u32 apic_id, version;
+ int ret;
+
+ version = GET_APIC_VERSION(apic_read(APIC_LVR));
+ for_each_node_by_type(dn, "cpu") {
+ ret = of_property_read_u32(dn, "reg", &apic_id);
+ if (ret < 0) {
+ pr_warn("%pOF: missing local APIC ID\n", dn);
+ continue;
+ }
+ generic_processor_info(apic_id, version);
+ }
+}
+
+static void __init dtb_lapic_setup(void)
+{
+ struct device_node *dn;
+ struct resource r;
+ unsigned long lapic_addr = APIC_DEFAULT_PHYS_BASE;
+ int ret;
+
+ dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-lapic");
+ if (dn) {
+ ret = of_address_to_resource(dn, 0, &r);
+ if (WARN_ON(ret))
+ return;
+ lapic_addr = r.start;
+ }
+
+ /* Did the boot loader setup the local APIC ? */
+ if (!boot_cpu_has(X86_FEATURE_APIC)) {
+ if (apic_force_enable(lapic_addr))
+ return;
+ }
+ smp_found_config = 1;
+ pic_mode = 1;
+ register_lapic_address(lapic_addr);
+}
+
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_X86_IO_APIC
+static unsigned int ioapic_id;
+
+struct of_ioapic_type {
+ u32 out_type;
+ u32 trigger;
+ u32 polarity;
+};
+
+static struct of_ioapic_type of_ioapic_type[] =
+{
+ {
+ .out_type = IRQ_TYPE_EDGE_RISING,
+ .trigger = IOAPIC_EDGE,
+ .polarity = 1,
+ },
+ {
+ .out_type = IRQ_TYPE_LEVEL_LOW,
+ .trigger = IOAPIC_LEVEL,
+ .polarity = 0,
+ },
+ {
+ .out_type = IRQ_TYPE_LEVEL_HIGH,
+ .trigger = IOAPIC_LEVEL,
+ .polarity = 1,
+ },
+ {
+ .out_type = IRQ_TYPE_EDGE_FALLING,
+ .trigger = IOAPIC_EDGE,
+ .polarity = 0,
+ },
+};
+
+static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct irq_fwspec *fwspec = (struct irq_fwspec *)arg;
+ struct of_ioapic_type *it;
+ struct irq_alloc_info tmp;
+ int type_index;
+
+ if (WARN_ON(fwspec->param_count < 2))
+ return -EINVAL;
+
+ type_index = fwspec->param[1];
+ if (type_index >= ARRAY_SIZE(of_ioapic_type))
+ return -EINVAL;
+
+ it = &of_ioapic_type[type_index];
+ ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity);
+ tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain));
+ tmp.ioapic_pin = fwspec->param[0];
+
+ return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp);
+}
+
+static const struct irq_domain_ops ioapic_irq_domain_ops = {
+ .alloc = dt_irqdomain_alloc,
+ .free = mp_irqdomain_free,
+ .activate = mp_irqdomain_activate,
+ .deactivate = mp_irqdomain_deactivate,
+};
+
+static void __init dtb_add_ioapic(struct device_node *dn)
+{
+ struct resource r;
+ int ret;
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_DYNAMIC,
+ .ops = &ioapic_irq_domain_ops,
+ .dev = dn,
+ };
+
+ ret = of_address_to_resource(dn, 0, &r);
+ if (ret) {
+ printk(KERN_ERR "Can't obtain address from device node %pOF.\n", dn);
+ return;
+ }
+ mp_register_ioapic(++ioapic_id, r.start, gsi_top, &cfg);
+}
+
+static void __init dtb_ioapic_setup(void)
+{
+ struct device_node *dn;
+
+ for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic")
+ dtb_add_ioapic(dn);
+
+ if (nr_ioapics) {
+ of_ioapic = 1;
+ return;
+ }
+ printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
+}
+#else
+static void __init dtb_ioapic_setup(void) {}
+#endif
+
+static void __init dtb_apic_setup(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ dtb_lapic_setup();
+ dtb_cpu_setup();
+#endif
+ dtb_ioapic_setup();
+}
+
+#ifdef CONFIG_OF_EARLY_FLATTREE
+static void __init x86_flattree_get_config(void)
+{
+ u32 size, map_len;
+ void *dt;
+
+ if (!initial_dtb)
+ return;
+
+ map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
+
+ dt = early_memremap(initial_dtb, map_len);
+ size = fdt_totalsize(dt);
+ if (map_len < size) {
+ early_memunmap(dt, map_len);
+ dt = early_memremap(initial_dtb, size);
+ map_len = size;
+ }
+
+ early_init_dt_verify(dt);
+ unflatten_and_copy_device_tree();
+ early_memunmap(dt, map_len);
+}
+#else
+static inline void x86_flattree_get_config(void) { }
+#endif
+
+void __init x86_dtb_init(void)
+{
+ x86_flattree_get_config();
+
+ if (!of_have_populated_dt())
+ return;
+
+ dtb_setup_hpet();
+ dtb_apic_setup();
+}
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
new file mode 100644
index 000000000..0b8cedb20
--- /dev/null
+++ b/arch/x86/kernel/doublefault.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/init_task.h>
+#include <linux/fs.h>
+
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+
+#ifdef CONFIG_X86_32
+
+#define DOUBLEFAULT_STACKSIZE (1024)
+static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
+#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
+
+#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
+
+static void doublefault_fn(void)
+{
+ struct desc_ptr gdt_desc = {0, 0};
+ unsigned long gdt, tss;
+
+ native_store_gdt(&gdt_desc);
+ gdt = gdt_desc.address;
+
+ printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
+
+ if (ptr_ok(gdt)) {
+ gdt += GDT_ENTRY_TSS << 3;
+ tss = get_desc_base((struct desc_struct *)gdt);
+ printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
+
+ if (ptr_ok(tss)) {
+ struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+
+ printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
+ t->ip, t->sp);
+
+ printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
+ t->ax, t->bx, t->cx, t->dx);
+ printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
+ t->si, t->di);
+ }
+ }
+
+ for (;;)
+ cpu_relax();
+}
+
+struct x86_hw_tss doublefault_tss __cacheline_aligned = {
+ .sp0 = STACK_START,
+ .ss0 = __KERNEL_DS,
+ .ldt = 0,
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+
+ .ip = (unsigned long) doublefault_fn,
+ /* 0x2 bit is always set */
+ .flags = X86_EFLAGS_SF | 0x2,
+ .sp = STACK_START,
+ .es = __USER_DS,
+ .cs = __KERNEL_CS,
+ .ss = __KERNEL_DS,
+ .ds = __USER_DS,
+ .fs = __KERNEL_PERCPU,
+
+ .__cr3 = __pa_nodebug(swapper_pg_dir),
+};
+
+/* dummy for do_double_fault() call */
+void df_debug(struct pt_regs *regs, long error_code) {}
+
+#else /* !CONFIG_X86_32 */
+
+void df_debug(struct pt_regs *regs, long error_code)
+{
+ pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
+ show_regs(regs);
+ panic("Machine halted.");
+}
+#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
new file mode 100644
index 000000000..2b5886401
--- /dev/null
+++ b/arch/x86/kernel/dumpstack.c
@@ -0,0 +1,419 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+#include <linux/hardirq.h>
+#include <linux/kdebug.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
+#include <linux/ftrace.h>
+#include <linux/kexec.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+#include <linux/sysfs.h>
+#include <linux/kasan.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+int panic_on_unrecovered_nmi;
+int panic_on_io_nmi;
+static int die_counter;
+
+static struct pt_regs exec_summary_regs;
+
+bool in_task_stack(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info)
+{
+ unsigned long *begin = task_stack_page(task);
+ unsigned long *end = task_stack_page(task) + THREAD_SIZE;
+
+ if (stack < begin || stack >= end)
+ return false;
+
+ info->type = STACK_TYPE_TASK;
+ info->begin = begin;
+ info->end = end;
+ info->next_sp = NULL;
+
+ return true;
+}
+
+bool in_entry_stack(unsigned long *stack, struct stack_info *info)
+{
+ struct entry_stack *ss = cpu_entry_stack(smp_processor_id());
+
+ void *begin = ss;
+ void *end = ss + 1;
+
+ if ((void *)stack < begin || (void *)stack >= end)
+ return false;
+
+ info->type = STACK_TYPE_ENTRY;
+ info->begin = begin;
+ info->end = end;
+ info->next_sp = NULL;
+
+ return true;
+}
+
+static void printk_stack_address(unsigned long address, int reliable,
+ char *log_lvl)
+{
+ touch_nmi_watchdog();
+ printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
+}
+
+/*
+ * There are a couple of reasons for the 2/3rd prologue, courtesy of Linus:
+ *
+ * In case where we don't have the exact kernel image (which, if we did, we can
+ * simply disassemble and navigate to the RIP), the purpose of the bigger
+ * prologue is to have more context and to be able to correlate the code from
+ * the different toolchains better.
+ *
+ * In addition, it helps in recreating the register allocation of the failing
+ * kernel and thus make sense of the register dump.
+ *
+ * What is more, the additional complication of a variable length insn arch like
+ * x86 warrants having longer byte sequence before rIP so that the disassembler
+ * can "sync" up properly and find instruction boundaries when decoding the
+ * opcode bytes.
+ *
+ * Thus, the 2/3rds prologue and 64 byte OPCODE_BUFSIZE is just a random
+ * guesstimate in attempt to achieve all of the above.
+ */
+void show_opcodes(struct pt_regs *regs, const char *loglvl)
+{
+#define PROLOGUE_SIZE 42
+#define EPILOGUE_SIZE 21
+#define OPCODE_BUFSIZE (PROLOGUE_SIZE + 1 + EPILOGUE_SIZE)
+ u8 opcodes[OPCODE_BUFSIZE];
+ unsigned long prologue = regs->ip - PROLOGUE_SIZE;
+ bool bad_ip;
+
+ /*
+ * Make sure userspace isn't trying to trick us into dumping kernel
+ * memory by pointing the userspace instruction pointer at it.
+ */
+ bad_ip = user_mode(regs) &&
+ __chk_range_not_ok(prologue, OPCODE_BUFSIZE, TASK_SIZE_MAX);
+
+ if (bad_ip || probe_kernel_read(opcodes, (u8 *)prologue,
+ OPCODE_BUFSIZE)) {
+ printk("%sCode: Bad RIP value.\n", loglvl);
+ } else {
+ printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %"
+ __stringify(EPILOGUE_SIZE) "ph\n", loglvl, opcodes,
+ opcodes[PROLOGUE_SIZE], opcodes + PROLOGUE_SIZE + 1);
+ }
+}
+
+void show_ip(struct pt_regs *regs, const char *loglvl)
+{
+#ifdef CONFIG_X86_32
+ printk("%sEIP: %pS\n", loglvl, (void *)regs->ip);
+#else
+ printk("%sRIP: %04x:%pS\n", loglvl, (int)regs->cs, (void *)regs->ip);
+#endif
+ show_opcodes(regs, loglvl);
+}
+
+void show_iret_regs(struct pt_regs *regs)
+{
+ show_ip(regs, KERN_DEFAULT);
+ printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
+ regs->sp, regs->flags);
+}
+
+static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
+ bool partial)
+{
+ /*
+ * These on_stack() checks aren't strictly necessary: the unwind code
+ * has already validated the 'regs' pointer. The checks are done for
+ * ordering reasons: if the registers are on the next stack, we don't
+ * want to print them out yet. Otherwise they'll be shown as part of
+ * the wrong stack. Later, when show_trace_log_lvl() switches to the
+ * next stack, this function will be called again with the same regs so
+ * they can be printed in the right context.
+ */
+ if (!partial && on_stack(info, regs, sizeof(*regs))) {
+ __show_regs(regs, SHOW_REGS_SHORT);
+
+ } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
+ IRET_FRAME_SIZE)) {
+ /*
+ * When an interrupt or exception occurs in entry code, the
+ * full pt_regs might not have been saved yet. In that case
+ * just print the iret frame.
+ */
+ show_iret_regs(regs);
+ }
+}
+
+void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, char *log_lvl)
+{
+ struct unwind_state state;
+ struct stack_info stack_info = {0};
+ unsigned long visit_mask = 0;
+ int graph_idx = 0;
+ bool partial = false;
+
+ printk("%sCall Trace:\n", log_lvl);
+
+ unwind_start(&state, task, regs, stack);
+ stack = stack ? : get_stack_pointer(task, regs);
+ regs = unwind_get_entry_regs(&state, &partial);
+
+ /*
+ * Iterate through the stacks, starting with the current stack pointer.
+ * Each stack has a pointer to the next one.
+ *
+ * x86-64 can have several stacks:
+ * - task stack
+ * - interrupt stack
+ * - HW exception stacks (double fault, nmi, debug, mce)
+ * - entry stack
+ *
+ * x86-32 can have up to four stacks:
+ * - task stack
+ * - softirq stack
+ * - hardirq stack
+ * - entry stack
+ */
+ for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+ const char *stack_name;
+
+ if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
+ /*
+ * We weren't on a valid stack. It's possible that
+ * we overflowed a valid stack into a guard page.
+ * See if the next page up is valid so that we can
+ * generate some kind of backtrace if this happens.
+ */
+ stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
+ if (get_stack_info(stack, task, &stack_info, &visit_mask))
+ break;
+ }
+
+ stack_name = stack_type_name(stack_info.type);
+ if (stack_name)
+ printk("%s <%s>\n", log_lvl, stack_name);
+
+ if (regs)
+ show_regs_if_on_stack(&stack_info, regs, partial);
+
+ /*
+ * Scan the stack, printing any text addresses we find. At the
+ * same time, follow proper stack frames with the unwinder.
+ *
+ * Addresses found during the scan which are not reported by
+ * the unwinder are considered to be additional clues which are
+ * sometimes useful for debugging and are prefixed with '?'.
+ * This also serves as a failsafe option in case the unwinder
+ * goes off in the weeds.
+ */
+ for (; stack < stack_info.end; stack++) {
+ unsigned long real_addr;
+ int reliable = 0;
+ unsigned long addr = READ_ONCE_NOCHECK(*stack);
+ unsigned long *ret_addr_p =
+ unwind_get_return_address_ptr(&state);
+
+ if (!__kernel_text_address(addr))
+ continue;
+
+ /*
+ * Don't print regs->ip again if it was already printed
+ * by show_regs_if_on_stack().
+ */
+ if (regs && stack == &regs->ip)
+ goto next;
+
+ if (stack == ret_addr_p)
+ reliable = 1;
+
+ /*
+ * When function graph tracing is enabled for a
+ * function, its return address on the stack is
+ * replaced with the address of an ftrace handler
+ * (return_to_handler). In that case, before printing
+ * the "real" address, we want to print the handler
+ * address as an "unreliable" hint that function graph
+ * tracing was involved.
+ */
+ real_addr = ftrace_graph_ret_addr(task, &graph_idx,
+ addr, stack);
+ if (real_addr != addr)
+ printk_stack_address(addr, 0, log_lvl);
+ printk_stack_address(real_addr, reliable, log_lvl);
+
+ if (!reliable)
+ continue;
+
+next:
+ /*
+ * Get the next frame from the unwinder. No need to
+ * check for an error: if anything goes wrong, the rest
+ * of the addresses will just be printed as unreliable.
+ */
+ unwind_next_frame(&state);
+
+ /* if the frame has entry regs, print them */
+ regs = unwind_get_entry_regs(&state, &partial);
+ if (regs)
+ show_regs_if_on_stack(&stack_info, regs, partial);
+ }
+
+ if (stack_name)
+ printk("%s </%s>\n", log_lvl, stack_name);
+ }
+}
+
+void show_stack(struct task_struct *task, unsigned long *sp)
+{
+ task = task ? : current;
+
+ /*
+ * Stack frames below this one aren't interesting. Don't show them
+ * if we're printing for %current.
+ */
+ if (!sp && task == current)
+ sp = get_stack_pointer(current, NULL);
+
+ show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT);
+}
+
+void show_stack_regs(struct pt_regs *regs)
+{
+ show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
+}
+
+static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
+static int die_owner = -1;
+static unsigned int die_nest_count;
+
+unsigned long oops_begin(void)
+{
+ int cpu;
+ unsigned long flags;
+
+ oops_enter();
+
+ /* racy, but better than risking deadlock. */
+ raw_local_irq_save(flags);
+ cpu = smp_processor_id();
+ if (!arch_spin_trylock(&die_lock)) {
+ if (cpu == die_owner)
+ /* nested oops. should stop eventually */;
+ else
+ arch_spin_lock(&die_lock);
+ }
+ die_nest_count++;
+ die_owner = cpu;
+ console_verbose();
+ bust_spinlocks(1);
+ return flags;
+}
+NOKPROBE_SYMBOL(oops_begin);
+
+void __noreturn rewind_stack_do_exit(int signr);
+
+void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+{
+ if (regs && kexec_should_crash(current))
+ crash_kexec(regs);
+
+ bust_spinlocks(0);
+ die_owner = -1;
+ add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
+ die_nest_count--;
+ if (!die_nest_count)
+ /* Nest count reaches zero, release the lock. */
+ arch_spin_unlock(&die_lock);
+ raw_local_irq_restore(flags);
+ oops_exit();
+
+ /* Executive summary in case the oops scrolled away */
+ __show_regs(&exec_summary_regs, SHOW_REGS_ALL);
+
+ if (!signr)
+ return;
+ if (in_interrupt())
+ panic("Fatal exception in interrupt");
+ if (panic_on_oops)
+ panic("Fatal exception");
+
+ /*
+ * We're not going to return, but we might be on an IST stack or
+ * have very little stack space left. Rewind the stack and kill
+ * the task.
+ * Before we rewind the stack, we have to tell KASAN that we're going to
+ * reuse the task stack and that existing poisons are invalid.
+ */
+ kasan_unpoison_task_stack(current);
+ rewind_stack_do_exit(signr);
+}
+NOKPROBE_SYMBOL(oops_end);
+
+int __die(const char *str, struct pt_regs *regs, long err)
+{
+ /* Save the regs of the first oops for the executive summary later. */
+ if (!die_counter)
+ exec_summary_regs = *regs;
+
+ printk(KERN_DEFAULT
+ "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
+ IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
+ IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
+ debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
+ IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
+ IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
+ (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
+
+ show_regs(regs);
+ print_modules();
+
+ if (notify_die(DIE_OOPS, str, regs, err,
+ current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
+ return 1;
+
+ return 0;
+}
+NOKPROBE_SYMBOL(__die);
+
+/*
+ * This is gone through when something in the kernel has done something bad
+ * and is about to be terminated:
+ */
+void die(const char *str, struct pt_regs *regs, long err)
+{
+ unsigned long flags = oops_begin();
+ int sig = SIGSEGV;
+
+ if (__die(str, regs, err))
+ sig = 0;
+ oops_end(flags, regs, sig);
+}
+
+void show_regs(struct pt_regs *regs)
+{
+ show_regs_print_info(KERN_DEFAULT);
+
+ __show_regs(regs, user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL);
+
+ /*
+ * When in-kernel, we also print out the stack at the time of the fault..
+ */
+ if (!user_mode(regs))
+ show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
+}
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
new file mode 100644
index 000000000..cd53f3030
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+#include <linux/sched/debug.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/hardirq.h>
+#include <linux/kdebug.h>
+#include <linux/export.h>
+#include <linux/ptrace.h>
+#include <linux/kexec.h>
+#include <linux/sysfs.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+
+#include <asm/stacktrace.h>
+
+const char *stack_type_name(enum stack_type type)
+{
+ if (type == STACK_TYPE_IRQ)
+ return "IRQ";
+
+ if (type == STACK_TYPE_SOFTIRQ)
+ return "SOFTIRQ";
+
+ if (type == STACK_TYPE_ENTRY)
+ return "ENTRY_TRAMPOLINE";
+
+ return NULL;
+}
+
+static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
+{
+ unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack);
+ unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
+
+ /*
+ * This is a software stack, so 'end' can be a valid stack pointer.
+ * It just means the stack is empty.
+ */
+ if (stack <= begin || stack > end)
+ return false;
+
+ info->type = STACK_TYPE_IRQ;
+ info->begin = begin;
+ info->end = end;
+
+ /*
+ * See irq_32.c -- the next stack pointer is stored at the beginning of
+ * the stack.
+ */
+ info->next_sp = (unsigned long *)*begin;
+
+ return true;
+}
+
+static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
+{
+ unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack);
+ unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
+
+ /*
+ * This is a software stack, so 'end' can be a valid stack pointer.
+ * It just means the stack is empty.
+ */
+ if (stack <= begin || stack > end)
+ return false;
+
+ info->type = STACK_TYPE_SOFTIRQ;
+ info->begin = begin;
+ info->end = end;
+
+ /*
+ * The next stack pointer is stored at the beginning of the stack.
+ * See irq_32.c.
+ */
+ info->next_sp = (unsigned long *)*begin;
+
+ return true;
+}
+
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info, unsigned long *visit_mask)
+{
+ if (!stack)
+ goto unknown;
+
+ task = task ? : current;
+
+ if (in_task_stack(stack, task, info))
+ goto recursion_check;
+
+ if (task != current)
+ goto unknown;
+
+ if (in_entry_stack(stack, info))
+ goto recursion_check;
+
+ if (in_hardirq_stack(stack, info))
+ goto recursion_check;
+
+ if (in_softirq_stack(stack, info))
+ goto recursion_check;
+
+ goto unknown;
+
+recursion_check:
+ /*
+ * Make sure we don't iterate through any given stack more than once.
+ * If it comes up a second time then there's something wrong going on:
+ * just break out and report an unknown stack type.
+ */
+ if (visit_mask) {
+ if (*visit_mask & (1UL << info->type)) {
+ printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
+ goto unknown;
+ }
+ *visit_mask |= 1UL << info->type;
+ }
+
+ return 0;
+
+unknown:
+ info->type = STACK_TYPE_UNKNOWN;
+ return -EINVAL;
+}
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
new file mode 100644
index 000000000..5cdb9e84d
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+#include <linux/sched/debug.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/hardirq.h>
+#include <linux/kdebug.h>
+#include <linux/export.h>
+#include <linux/ptrace.h>
+#include <linux/kexec.h>
+#include <linux/sysfs.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+
+#include <asm/stacktrace.h>
+
+static char *exception_stack_names[N_EXCEPTION_STACKS] = {
+ [ DOUBLEFAULT_STACK-1 ] = "#DF",
+ [ NMI_STACK-1 ] = "NMI",
+ [ DEBUG_STACK-1 ] = "#DB",
+ [ MCE_STACK-1 ] = "#MC",
+};
+
+static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+ [DEBUG_STACK - 1] = DEBUG_STKSZ
+};
+
+const char *stack_type_name(enum stack_type type)
+{
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
+
+ if (type == STACK_TYPE_IRQ)
+ return "IRQ";
+
+ if (type == STACK_TYPE_ENTRY) {
+ /*
+ * On 64-bit, we have a generic entry stack that we
+ * use for all the kernel entry points, including
+ * SYSENTER.
+ */
+ return "ENTRY_TRAMPOLINE";
+ }
+
+ if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
+ return exception_stack_names[type - STACK_TYPE_EXCEPTION];
+
+ return NULL;
+}
+
+static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
+{
+ unsigned long *begin, *end;
+ struct pt_regs *regs;
+ unsigned k;
+
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
+
+ for (k = 0; k < N_EXCEPTION_STACKS; k++) {
+ end = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[k];
+ begin = end - (exception_stack_sizes[k] / sizeof(long));
+ regs = (struct pt_regs *)end - 1;
+
+ if (stack <= begin || stack >= end)
+ continue;
+
+ info->type = STACK_TYPE_EXCEPTION + k;
+ info->begin = begin;
+ info->end = end;
+ info->next_sp = (unsigned long *)regs->sp;
+
+ return true;
+ }
+
+ return false;
+}
+
+static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
+{
+ unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr);
+ unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long));
+
+ /*
+ * This is a software stack, so 'end' can be a valid stack pointer.
+ * It just means the stack is empty.
+ */
+ if (stack <= begin || stack > end)
+ return false;
+
+ info->type = STACK_TYPE_IRQ;
+ info->begin = begin;
+ info->end = end;
+
+ /*
+ * The next stack pointer is the first thing pushed by the entry code
+ * after switching to the irq stack.
+ */
+ info->next_sp = (unsigned long *)*(end - 1);
+
+ return true;
+}
+
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info, unsigned long *visit_mask)
+{
+ if (!stack)
+ goto unknown;
+
+ task = task ? : current;
+
+ if (in_task_stack(stack, task, info))
+ goto recursion_check;
+
+ if (task != current)
+ goto unknown;
+
+ if (in_exception_stack(stack, info))
+ goto recursion_check;
+
+ if (in_irq_stack(stack, info))
+ goto recursion_check;
+
+ if (in_entry_stack(stack, info))
+ goto recursion_check;
+
+ goto unknown;
+
+recursion_check:
+ /*
+ * Make sure we don't iterate through any given stack more than once.
+ * If it comes up a second time then there's something wrong going on:
+ * just break out and report an unknown stack type.
+ */
+ if (visit_mask) {
+ if (*visit_mask & (1UL << info->type)) {
+ printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
+ goto unknown;
+ }
+ *visit_mask |= 1UL << info->type;
+ }
+
+ return 0;
+
+unknown:
+ info->type = STACK_TYPE_UNKNOWN;
+ return -EINVAL;
+}
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
new file mode 100644
index 000000000..d1f25c831
--- /dev/null
+++ b/arch/x86/kernel/e820.c
@@ -0,0 +1,1280 @@
+/*
+ * Low level x86 E820 memory map handling functions.
+ *
+ * The firmware and bootloader passes us the "E820 table", which is the primary
+ * physical memory layout description available about x86 systems.
+ *
+ * The kernel takes the E820 memory layout and optionally modifies it with
+ * quirks and other tweaks, and feeds that into the generic Linux memory
+ * allocation code routines via a platform independent interface (memblock, etc.).
+ */
+#include <linux/crash_dump.h>
+#include <linux/bootmem.h>
+#include <linux/suspend.h>
+#include <linux/acpi.h>
+#include <linux/firmware-map.h>
+#include <linux/memblock.h>
+#include <linux/sort.h>
+
+#include <asm/e820/api.h>
+#include <asm/setup.h>
+
+/*
+ * We organize the E820 table into three main data structures:
+ *
+ * - 'e820_table_firmware': the original firmware version passed to us by the
+ * bootloader - not modified by the kernel. It is composed of two parts:
+ * the first 128 E820 memory entries in boot_params.e820_table and the remaining
+ * (if any) entries of the SETUP_E820_EXT nodes. We use this to:
+ *
+ * - inform the user about the firmware's notion of memory layout
+ * via /sys/firmware/memmap
+ *
+ * - the hibernation code uses it to generate a kernel-independent MD5
+ * fingerprint of the physical memory layout of a system.
+ *
+ * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version
+ * passed to us by the bootloader - the major difference between
+ * e820_table_firmware[] and this one is that, the latter marks the setup_data
+ * list created by the EFI boot stub as reserved, so that kexec can reuse the
+ * setup_data information in the second kernel. Besides, e820_table_kexec[]
+ * might also be modified by the kexec itself to fake a mptable.
+ * We use this to:
+ *
+ * - kexec, which is a bootloader in disguise, uses the original E820
+ * layout to pass to the kexec-ed kernel. This way the original kernel
+ * can have a restricted E820 map while the kexec()-ed kexec-kernel
+ * can have access to full memory - etc.
+ *
+ * - 'e820_table': this is the main E820 table that is massaged by the
+ * low level x86 platform code, or modified by boot parameters, before
+ * passed on to higher level MM layers.
+ *
+ * Once the E820 map has been converted to the standard Linux memory layout
+ * information its role stops - modifying it has no effect and does not get
+ * re-propagated. So itsmain role is a temporary bootstrap storage of firmware
+ * specific memory layout data during early bootup.
+ */
+static struct e820_table e820_table_init __initdata;
+static struct e820_table e820_table_kexec_init __initdata;
+static struct e820_table e820_table_firmware_init __initdata;
+
+struct e820_table *e820_table __refdata = &e820_table_init;
+struct e820_table *e820_table_kexec __refdata = &e820_table_kexec_init;
+struct e820_table *e820_table_firmware __refdata = &e820_table_firmware_init;
+
+/* For PCI or other memory-mapped resources */
+unsigned long pci_mem_start = 0xaeedbabe;
+#ifdef CONFIG_PCI
+EXPORT_SYMBOL(pci_mem_start);
+#endif
+
+/*
+ * This function checks if any part of the range <start,end> is mapped
+ * with type.
+ */
+bool e820__mapped_any(u64 start, u64 end, enum e820_type type)
+{
+ int i;
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
+
+ if (type && entry->type != type)
+ continue;
+ if (entry->addr >= end || entry->addr + entry->size <= start)
+ continue;
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(e820__mapped_any);
+
+/*
+ * This function checks if the entire <start,end> range is mapped with 'type'.
+ *
+ * Note: this function only works correctly once the E820 table is sorted and
+ * not-overlapping (at least for the range specified), which is the case normally.
+ */
+static struct e820_entry *__e820__mapped_all(u64 start, u64 end,
+ enum e820_type type)
+{
+ int i;
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
+
+ if (type && entry->type != type)
+ continue;
+
+ /* Is the region (part) in overlap with the current region? */
+ if (entry->addr >= end || entry->addr + entry->size <= start)
+ continue;
+
+ /*
+ * If the region is at the beginning of <start,end> we move
+ * 'start' to the end of the region since it's ok until there
+ */
+ if (entry->addr <= start)
+ start = entry->addr + entry->size;
+
+ /*
+ * If 'start' is now at or beyond 'end', we're done, full
+ * coverage of the desired range exists:
+ */
+ if (start >= end)
+ return entry;
+ }
+
+ return NULL;
+}
+
+/*
+ * This function checks if the entire range <start,end> is mapped with type.
+ */
+bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
+{
+ return __e820__mapped_all(start, end, type);
+}
+
+/*
+ * This function returns the type associated with the range <start,end>.
+ */
+int e820__get_entry_type(u64 start, u64 end)
+{
+ struct e820_entry *entry = __e820__mapped_all(start, end, 0);
+
+ return entry ? entry->type : -EINVAL;
+}
+
+/*
+ * Add a memory region to the kernel E820 map.
+ */
+static void __init __e820__range_add(struct e820_table *table, u64 start, u64 size, enum e820_type type)
+{
+ int x = table->nr_entries;
+
+ if (x >= ARRAY_SIZE(table->entries)) {
+ pr_err("too many entries; ignoring [mem %#010llx-%#010llx]\n",
+ start, start + size - 1);
+ return;
+ }
+
+ table->entries[x].addr = start;
+ table->entries[x].size = size;
+ table->entries[x].type = type;
+ table->nr_entries++;
+}
+
+void __init e820__range_add(u64 start, u64 size, enum e820_type type)
+{
+ __e820__range_add(e820_table, start, size, type);
+}
+
+static void __init e820_print_type(enum e820_type type)
+{
+ switch (type) {
+ case E820_TYPE_RAM: /* Fall through: */
+ case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break;
+ case E820_TYPE_RESERVED: pr_cont("reserved"); break;
+ case E820_TYPE_ACPI: pr_cont("ACPI data"); break;
+ case E820_TYPE_NVS: pr_cont("ACPI NVS"); break;
+ case E820_TYPE_UNUSABLE: pr_cont("unusable"); break;
+ case E820_TYPE_PMEM: /* Fall through: */
+ case E820_TYPE_PRAM: pr_cont("persistent (type %u)", type); break;
+ default: pr_cont("type %u", type); break;
+ }
+}
+
+void __init e820__print_table(char *who)
+{
+ int i;
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ pr_info("%s: [mem %#018Lx-%#018Lx] ",
+ who,
+ e820_table->entries[i].addr,
+ e820_table->entries[i].addr + e820_table->entries[i].size - 1);
+
+ e820_print_type(e820_table->entries[i].type);
+ pr_cont("\n");
+ }
+}
+
+/*
+ * Sanitize an E820 map.
+ *
+ * Some E820 layouts include overlapping entries. The following
+ * replaces the original E820 map with a new one, removing overlaps,
+ * and resolving conflicting memory types in favor of highest
+ * numbered type.
+ *
+ * The input parameter 'entries' points to an array of 'struct
+ * e820_entry' which on entry has elements in the range [0, *nr_entries)
+ * valid, and which has space for up to max_nr_entries entries.
+ * On return, the resulting sanitized E820 map entries will be in
+ * overwritten in the same location, starting at 'entries'.
+ *
+ * The integer pointed to by nr_entries must be valid on entry (the
+ * current number of valid entries located at 'entries'). If the
+ * sanitizing succeeds the *nr_entries will be updated with the new
+ * number of valid entries (something no more than max_nr_entries).
+ *
+ * The return value from e820__update_table() is zero if it
+ * successfully 'sanitized' the map entries passed in, and is -1
+ * if it did nothing, which can happen if either of (1) it was
+ * only passed one map entry, or (2) any of the input map entries
+ * were invalid (start + size < start, meaning that the size was
+ * so big the described memory range wrapped around through zero.)
+ *
+ * Visually we're performing the following
+ * (1,2,3,4 = memory types)...
+ *
+ * Sample memory map (w/overlaps):
+ * ____22__________________
+ * ______________________4_
+ * ____1111________________
+ * _44_____________________
+ * 11111111________________
+ * ____________________33__
+ * ___________44___________
+ * __________33333_________
+ * ______________22________
+ * ___________________2222_
+ * _________111111111______
+ * _____________________11_
+ * _________________4______
+ *
+ * Sanitized equivalent (no overlap):
+ * 1_______________________
+ * _44_____________________
+ * ___1____________________
+ * ____22__________________
+ * ______11________________
+ * _________1______________
+ * __________3_____________
+ * ___________44___________
+ * _____________33_________
+ * _______________2________
+ * ________________1_______
+ * _________________4______
+ * ___________________2____
+ * ____________________33__
+ * ______________________4_
+ */
+struct change_member {
+ /* Pointer to the original entry: */
+ struct e820_entry *entry;
+ /* Address for this change point: */
+ unsigned long long addr;
+};
+
+static struct change_member change_point_list[2*E820_MAX_ENTRIES] __initdata;
+static struct change_member *change_point[2*E820_MAX_ENTRIES] __initdata;
+static struct e820_entry *overlap_list[E820_MAX_ENTRIES] __initdata;
+static struct e820_entry new_entries[E820_MAX_ENTRIES] __initdata;
+
+static int __init cpcompare(const void *a, const void *b)
+{
+ struct change_member * const *app = a, * const *bpp = b;
+ const struct change_member *ap = *app, *bp = *bpp;
+
+ /*
+ * Inputs are pointers to two elements of change_point[]. If their
+ * addresses are not equal, their difference dominates. If the addresses
+ * are equal, then consider one that represents the end of its region
+ * to be greater than one that does not.
+ */
+ if (ap->addr != bp->addr)
+ return ap->addr > bp->addr ? 1 : -1;
+
+ return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr);
+}
+
+int __init e820__update_table(struct e820_table *table)
+{
+ struct e820_entry *entries = table->entries;
+ u32 max_nr_entries = ARRAY_SIZE(table->entries);
+ enum e820_type current_type, last_type;
+ unsigned long long last_addr;
+ u32 new_nr_entries, overlap_entries;
+ u32 i, chg_idx, chg_nr;
+
+ /* If there's only one memory region, don't bother: */
+ if (table->nr_entries < 2)
+ return -1;
+
+ BUG_ON(table->nr_entries > max_nr_entries);
+
+ /* Bail out if we find any unreasonable addresses in the map: */
+ for (i = 0; i < table->nr_entries; i++) {
+ if (entries[i].addr + entries[i].size < entries[i].addr)
+ return -1;
+ }
+
+ /* Create pointers for initial change-point information (for sorting): */
+ for (i = 0; i < 2 * table->nr_entries; i++)
+ change_point[i] = &change_point_list[i];
+
+ /*
+ * Record all known change-points (starting and ending addresses),
+ * omitting empty memory regions:
+ */
+ chg_idx = 0;
+ for (i = 0; i < table->nr_entries; i++) {
+ if (entries[i].size != 0) {
+ change_point[chg_idx]->addr = entries[i].addr;
+ change_point[chg_idx++]->entry = &entries[i];
+ change_point[chg_idx]->addr = entries[i].addr + entries[i].size;
+ change_point[chg_idx++]->entry = &entries[i];
+ }
+ }
+ chg_nr = chg_idx;
+
+ /* Sort change-point list by memory addresses (low -> high): */
+ sort(change_point, chg_nr, sizeof(*change_point), cpcompare, NULL);
+
+ /* Create a new memory map, removing overlaps: */
+ overlap_entries = 0; /* Number of entries in the overlap table */
+ new_nr_entries = 0; /* Index for creating new map entries */
+ last_type = 0; /* Start with undefined memory type */
+ last_addr = 0; /* Start with 0 as last starting address */
+
+ /* Loop through change-points, determining effect on the new map: */
+ for (chg_idx = 0; chg_idx < chg_nr; chg_idx++) {
+ /* Keep track of all overlapping entries */
+ if (change_point[chg_idx]->addr == change_point[chg_idx]->entry->addr) {
+ /* Add map entry to overlap list (> 1 entry implies an overlap) */
+ overlap_list[overlap_entries++] = change_point[chg_idx]->entry;
+ } else {
+ /* Remove entry from list (order independent, so swap with last): */
+ for (i = 0; i < overlap_entries; i++) {
+ if (overlap_list[i] == change_point[chg_idx]->entry)
+ overlap_list[i] = overlap_list[overlap_entries-1];
+ }
+ overlap_entries--;
+ }
+ /*
+ * If there are overlapping entries, decide which
+ * "type" to use (larger value takes precedence --
+ * 1=usable, 2,3,4,4+=unusable)
+ */
+ current_type = 0;
+ for (i = 0; i < overlap_entries; i++) {
+ if (overlap_list[i]->type > current_type)
+ current_type = overlap_list[i]->type;
+ }
+
+ /* Continue building up new map based on this information: */
+ if (current_type != last_type || current_type == E820_TYPE_PRAM) {
+ if (last_type != 0) {
+ new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr;
+ /* Move forward only if the new size was non-zero: */
+ if (new_entries[new_nr_entries].size != 0)
+ /* No more space left for new entries? */
+ if (++new_nr_entries >= max_nr_entries)
+ break;
+ }
+ if (current_type != 0) {
+ new_entries[new_nr_entries].addr = change_point[chg_idx]->addr;
+ new_entries[new_nr_entries].type = current_type;
+ last_addr = change_point[chg_idx]->addr;
+ }
+ last_type = current_type;
+ }
+ }
+
+ /* Copy the new entries into the original location: */
+ memcpy(entries, new_entries, new_nr_entries*sizeof(*entries));
+ table->nr_entries = new_nr_entries;
+
+ return 0;
+}
+
+static int __init __append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
+{
+ struct boot_e820_entry *entry = entries;
+
+ while (nr_entries) {
+ u64 start = entry->addr;
+ u64 size = entry->size;
+ u64 end = start + size - 1;
+ u32 type = entry->type;
+
+ /* Ignore the entry on 64-bit overflow: */
+ if (start > end && likely(size))
+ return -1;
+
+ e820__range_add(start, size, type);
+
+ entry++;
+ nr_entries--;
+ }
+ return 0;
+}
+
+/*
+ * Copy the BIOS E820 map into a safe place.
+ *
+ * Sanity-check it while we're at it..
+ *
+ * If we're lucky and live on a modern system, the setup code
+ * will have given us a memory map that we can use to properly
+ * set up memory. If we aren't, we'll fake a memory map.
+ */
+static int __init append_e820_table(struct boot_e820_entry *entries, u32 nr_entries)
+{
+ /* Only one memory region (or negative)? Ignore it */
+ if (nr_entries < 2)
+ return -1;
+
+ return __append_e820_table(entries, nr_entries);
+}
+
+static u64 __init
+__e820__range_update(struct e820_table *table, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
+{
+ u64 end;
+ unsigned int i;
+ u64 real_updated_size = 0;
+
+ BUG_ON(old_type == new_type);
+
+ if (size > (ULLONG_MAX - start))
+ size = ULLONG_MAX - start;
+
+ end = start + size;
+ printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", start, end - 1);
+ e820_print_type(old_type);
+ pr_cont(" ==> ");
+ e820_print_type(new_type);
+ pr_cont("\n");
+
+ for (i = 0; i < table->nr_entries; i++) {
+ struct e820_entry *entry = &table->entries[i];
+ u64 final_start, final_end;
+ u64 entry_end;
+
+ if (entry->type != old_type)
+ continue;
+
+ entry_end = entry->addr + entry->size;
+
+ /* Completely covered by new range? */
+ if (entry->addr >= start && entry_end <= end) {
+ entry->type = new_type;
+ real_updated_size += entry->size;
+ continue;
+ }
+
+ /* New range is completely covered? */
+ if (entry->addr < start && entry_end > end) {
+ __e820__range_add(table, start, size, new_type);
+ __e820__range_add(table, end, entry_end - end, entry->type);
+ entry->size = start - entry->addr;
+ real_updated_size += size;
+ continue;
+ }
+
+ /* Partially covered: */
+ final_start = max(start, entry->addr);
+ final_end = min(end, entry_end);
+ if (final_start >= final_end)
+ continue;
+
+ __e820__range_add(table, final_start, final_end - final_start, new_type);
+
+ real_updated_size += final_end - final_start;
+
+ /*
+ * Left range could be head or tail, so need to update
+ * its size first:
+ */
+ entry->size -= final_end - final_start;
+ if (entry->addr < final_start)
+ continue;
+
+ entry->addr = final_end;
+ }
+ return real_updated_size;
+}
+
+u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
+{
+ return __e820__range_update(e820_table, start, size, old_type, new_type);
+}
+
+static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
+{
+ return __e820__range_update(e820_table_kexec, start, size, old_type, new_type);
+}
+
+/* Remove a range of memory from the E820 table: */
+u64 __init e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type)
+{
+ int i;
+ u64 end;
+ u64 real_removed_size = 0;
+
+ if (size > (ULLONG_MAX - start))
+ size = ULLONG_MAX - start;
+
+ end = start + size;
+ printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", start, end - 1);
+ if (check_type)
+ e820_print_type(old_type);
+ pr_cont("\n");
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
+ u64 final_start, final_end;
+ u64 entry_end;
+
+ if (check_type && entry->type != old_type)
+ continue;
+
+ entry_end = entry->addr + entry->size;
+
+ /* Completely covered? */
+ if (entry->addr >= start && entry_end <= end) {
+ real_removed_size += entry->size;
+ memset(entry, 0, sizeof(*entry));
+ continue;
+ }
+
+ /* Is the new range completely covered? */
+ if (entry->addr < start && entry_end > end) {
+ e820__range_add(end, entry_end - end, entry->type);
+ entry->size = start - entry->addr;
+ real_removed_size += size;
+ continue;
+ }
+
+ /* Partially covered: */
+ final_start = max(start, entry->addr);
+ final_end = min(end, entry_end);
+ if (final_start >= final_end)
+ continue;
+
+ real_removed_size += final_end - final_start;
+
+ /*
+ * Left range could be head or tail, so need to update
+ * the size first:
+ */
+ entry->size -= final_end - final_start;
+ if (entry->addr < final_start)
+ continue;
+
+ entry->addr = final_end;
+ }
+ return real_removed_size;
+}
+
+void __init e820__update_table_print(void)
+{
+ if (e820__update_table(e820_table))
+ return;
+
+ pr_info("modified physical RAM map:\n");
+ e820__print_table("modified");
+}
+
+static void __init e820__update_table_kexec(void)
+{
+ e820__update_table(e820_table_kexec);
+}
+
+#define MAX_GAP_END 0x100000000ull
+
+/*
+ * Search for a gap in the E820 memory space from 0 to MAX_GAP_END (4GB).
+ */
+static int __init e820_search_gap(unsigned long *gapstart, unsigned long *gapsize)
+{
+ unsigned long long last = MAX_GAP_END;
+ int i = e820_table->nr_entries;
+ int found = 0;
+
+ while (--i >= 0) {
+ unsigned long long start = e820_table->entries[i].addr;
+ unsigned long long end = start + e820_table->entries[i].size;
+
+ /*
+ * Since "last" is at most 4GB, we know we'll
+ * fit in 32 bits if this condition is true:
+ */
+ if (last > end) {
+ unsigned long gap = last - end;
+
+ if (gap >= *gapsize) {
+ *gapsize = gap;
+ *gapstart = end;
+ found = 1;
+ }
+ }
+ if (start < last)
+ last = start;
+ }
+ return found;
+}
+
+/*
+ * Search for the biggest gap in the low 32 bits of the E820
+ * memory space. We pass this space to the PCI subsystem, so
+ * that it can assign MMIO resources for hotplug or
+ * unconfigured devices in.
+ *
+ * Hopefully the BIOS let enough space left.
+ */
+__init void e820__setup_pci_gap(void)
+{
+ unsigned long gapstart, gapsize;
+ int found;
+
+ gapsize = 0x400000;
+ found = e820_search_gap(&gapstart, &gapsize);
+
+ if (!found) {
+#ifdef CONFIG_X86_64
+ gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
+ pr_err("Cannot find an available gap in the 32-bit address range\n");
+ pr_err("PCI devices with unassigned 32-bit BARs may not work!\n");
+#else
+ gapstart = 0x10000000;
+#endif
+ }
+
+ /*
+ * e820__reserve_resources_late() protects stolen RAM already:
+ */
+ pci_mem_start = gapstart;
+
+ pr_info("[mem %#010lx-%#010lx] available for PCI devices\n",
+ gapstart, gapstart + gapsize - 1);
+}
+
+/*
+ * Called late during init, in free_initmem().
+ *
+ * Initial e820_table and e820_table_kexec are largish __initdata arrays.
+ *
+ * Copy them to a (usually much smaller) dynamically allocated area that is
+ * sized precisely after the number of e820 entries.
+ *
+ * This is done after we've performed all the fixes and tweaks to the tables.
+ * All functions which modify them are __init functions, which won't exist
+ * after free_initmem().
+ */
+__init void e820__reallocate_tables(void)
+{
+ struct e820_table *n;
+ int size;
+
+ size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries;
+ n = kmalloc(size, GFP_KERNEL);
+ BUG_ON(!n);
+ memcpy(n, e820_table, size);
+ e820_table = n;
+
+ size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_kexec->nr_entries;
+ n = kmalloc(size, GFP_KERNEL);
+ BUG_ON(!n);
+ memcpy(n, e820_table_kexec, size);
+ e820_table_kexec = n;
+
+ size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries;
+ n = kmalloc(size, GFP_KERNEL);
+ BUG_ON(!n);
+ memcpy(n, e820_table_firmware, size);
+ e820_table_firmware = n;
+}
+
+/*
+ * Because of the small fixed size of struct boot_params, only the first
+ * 128 E820 memory entries are passed to the kernel via boot_params.e820_table,
+ * the remaining (if any) entries are passed via the SETUP_E820_EXT node of
+ * struct setup_data, which is parsed here.
+ */
+void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len)
+{
+ int entries;
+ struct boot_e820_entry *extmap;
+ struct setup_data *sdata;
+
+ sdata = early_memremap(phys_addr, data_len);
+ entries = sdata->len / sizeof(*extmap);
+ extmap = (struct boot_e820_entry *)(sdata->data);
+
+ __append_e820_table(extmap, entries);
+ e820__update_table(e820_table);
+
+ memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
+ memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
+
+ early_memunmap(sdata, data_len);
+ pr_info("extended physical RAM map:\n");
+ e820__print_table("extended");
+}
+
+/*
+ * Find the ranges of physical addresses that do not correspond to
+ * E820 RAM areas and register the corresponding pages as 'nosave' for
+ * hibernation (32-bit) or software suspend and suspend to RAM (64-bit).
+ *
+ * This function requires the E820 map to be sorted and without any
+ * overlapping entries.
+ */
+void __init e820__register_nosave_regions(unsigned long limit_pfn)
+{
+ int i;
+ unsigned long pfn = 0;
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
+
+ if (pfn < PFN_UP(entry->addr))
+ register_nosave_region(pfn, PFN_UP(entry->addr));
+
+ pfn = PFN_DOWN(entry->addr + entry->size);
+
+ if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
+ register_nosave_region(PFN_UP(entry->addr), pfn);
+
+ if (pfn >= limit_pfn)
+ break;
+ }
+}
+
+#ifdef CONFIG_ACPI
+/*
+ * Register ACPI NVS memory regions, so that we can save/restore them during
+ * hibernation and the subsequent resume:
+ */
+static int __init e820__register_nvs_regions(void)
+{
+ int i;
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
+
+ if (entry->type == E820_TYPE_NVS)
+ acpi_nvs_register(entry->addr, entry->size);
+ }
+
+ return 0;
+}
+core_initcall(e820__register_nvs_regions);
+#endif
+
+/*
+ * Allocate the requested number of bytes with the requsted alignment
+ * and return (the physical address) to the caller. Also register this
+ * range in the 'kexec' E820 table as a reserved range.
+ *
+ * This allows kexec to fake a new mptable, as if it came from the real
+ * system.
+ */
+u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
+{
+ u64 addr;
+
+ addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
+ if (addr) {
+ e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
+ pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n");
+ e820__update_table_kexec();
+ }
+
+ return addr;
+}
+
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_X86_PAE
+# define MAX_ARCH_PFN (1ULL<<(36-PAGE_SHIFT))
+# else
+# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT))
+# endif
+#else /* CONFIG_X86_32 */
+# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
+#endif
+
+/*
+ * Find the highest page frame number we have available
+ */
+static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type)
+{
+ int i;
+ unsigned long last_pfn = 0;
+ unsigned long max_arch_pfn = MAX_ARCH_PFN;
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+
+ if (entry->type != type)
+ continue;
+
+ start_pfn = entry->addr >> PAGE_SHIFT;
+ end_pfn = (entry->addr + entry->size) >> PAGE_SHIFT;
+
+ if (start_pfn >= limit_pfn)
+ continue;
+ if (end_pfn > limit_pfn) {
+ last_pfn = limit_pfn;
+ break;
+ }
+ if (end_pfn > last_pfn)
+ last_pfn = end_pfn;
+ }
+
+ if (last_pfn > max_arch_pfn)
+ last_pfn = max_arch_pfn;
+
+ pr_info("last_pfn = %#lx max_arch_pfn = %#lx\n",
+ last_pfn, max_arch_pfn);
+ return last_pfn;
+}
+
+unsigned long __init e820__end_of_ram_pfn(void)
+{
+ return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM);
+}
+
+unsigned long __init e820__end_of_low_ram_pfn(void)
+{
+ return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM);
+}
+
+static void __init early_panic(char *msg)
+{
+ early_printk(msg);
+ panic(msg);
+}
+
+static int userdef __initdata;
+
+/* The "mem=nopentium" boot option disables 4MB page tables on 32-bit kernels: */
+static int __init parse_memopt(char *p)
+{
+ u64 mem_size;
+
+ if (!p)
+ return -EINVAL;
+
+ if (!strcmp(p, "nopentium")) {
+#ifdef CONFIG_X86_32
+ setup_clear_cpu_cap(X86_FEATURE_PSE);
+ return 0;
+#else
+ pr_warn("mem=nopentium ignored! (only supported on x86_32)\n");
+ return -EINVAL;
+#endif
+ }
+
+ userdef = 1;
+ mem_size = memparse(p, &p);
+
+ /* Don't remove all memory when getting "mem={invalid}" parameter: */
+ if (mem_size == 0)
+ return -EINVAL;
+
+ e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
+
+ return 0;
+}
+early_param("mem", parse_memopt);
+
+static int __init parse_memmap_one(char *p)
+{
+ char *oldp;
+ u64 start_at, mem_size;
+
+ if (!p)
+ return -EINVAL;
+
+ if (!strncmp(p, "exactmap", 8)) {
+#ifdef CONFIG_CRASH_DUMP
+ /*
+ * If we are doing a crash dump, we still need to know
+ * the real memory size before the original memory map is
+ * reset.
+ */
+ saved_max_pfn = e820__end_of_ram_pfn();
+#endif
+ e820_table->nr_entries = 0;
+ userdef = 1;
+ return 0;
+ }
+
+ oldp = p;
+ mem_size = memparse(p, &p);
+ if (p == oldp)
+ return -EINVAL;
+
+ userdef = 1;
+ if (*p == '@') {
+ start_at = memparse(p+1, &p);
+ e820__range_add(start_at, mem_size, E820_TYPE_RAM);
+ } else if (*p == '#') {
+ start_at = memparse(p+1, &p);
+ e820__range_add(start_at, mem_size, E820_TYPE_ACPI);
+ } else if (*p == '$') {
+ start_at = memparse(p+1, &p);
+ e820__range_add(start_at, mem_size, E820_TYPE_RESERVED);
+ } else if (*p == '!') {
+ start_at = memparse(p+1, &p);
+ e820__range_add(start_at, mem_size, E820_TYPE_PRAM);
+ } else if (*p == '%') {
+ enum e820_type from = 0, to = 0;
+
+ start_at = memparse(p + 1, &p);
+ if (*p == '-')
+ from = simple_strtoull(p + 1, &p, 0);
+ if (*p == '+')
+ to = simple_strtoull(p + 1, &p, 0);
+ if (*p != '\0')
+ return -EINVAL;
+ if (from && to)
+ e820__range_update(start_at, mem_size, from, to);
+ else if (to)
+ e820__range_add(start_at, mem_size, to);
+ else if (from)
+ e820__range_remove(start_at, mem_size, from, 1);
+ else
+ e820__range_remove(start_at, mem_size, 0, 0);
+ } else {
+ e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
+ }
+
+ return *p == '\0' ? 0 : -EINVAL;
+}
+
+static int __init parse_memmap_opt(char *str)
+{
+ while (str) {
+ char *k = strchr(str, ',');
+
+ if (k)
+ *k++ = 0;
+
+ parse_memmap_one(str);
+ str = k;
+ }
+
+ return 0;
+}
+early_param("memmap", parse_memmap_opt);
+
+/*
+ * Reserve all entries from the bootloader's extensible data nodes list,
+ * because if present we are going to use it later on to fetch e820
+ * entries from it:
+ */
+void __init e820__reserve_setup_data(void)
+{
+ struct setup_data *data;
+ u64 pa_data;
+
+ pa_data = boot_params.hdr.setup_data;
+ if (!pa_data)
+ return;
+
+ while (pa_data) {
+ data = early_memremap(pa_data, sizeof(*data));
+ e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
+ e820__range_update_kexec(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
+ pa_data = data->next;
+ early_memunmap(data, sizeof(*data));
+ }
+
+ e820__update_table(e820_table);
+ e820__update_table(e820_table_kexec);
+
+ pr_info("extended physical RAM map:\n");
+ e820__print_table("reserve setup_data");
+}
+
+/*
+ * Called after parse_early_param(), after early parameters (such as mem=)
+ * have been processed, in which case we already have an E820 table filled in
+ * via the parameter callback function(s), but it's not sorted and printed yet:
+ */
+void __init e820__finish_early_params(void)
+{
+ if (userdef) {
+ if (e820__update_table(e820_table) < 0)
+ early_panic("Invalid user supplied memory map");
+
+ pr_info("user-defined physical RAM map:\n");
+ e820__print_table("user");
+ }
+}
+
+static const char *__init e820_type_to_string(struct e820_entry *entry)
+{
+ switch (entry->type) {
+ case E820_TYPE_RESERVED_KERN: /* Fall-through: */
+ case E820_TYPE_RAM: return "System RAM";
+ case E820_TYPE_ACPI: return "ACPI Tables";
+ case E820_TYPE_NVS: return "ACPI Non-volatile Storage";
+ case E820_TYPE_UNUSABLE: return "Unusable memory";
+ case E820_TYPE_PRAM: return "Persistent Memory (legacy)";
+ case E820_TYPE_PMEM: return "Persistent Memory";
+ case E820_TYPE_RESERVED: return "Reserved";
+ default: return "Unknown E820 type";
+ }
+}
+
+static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry)
+{
+ switch (entry->type) {
+ case E820_TYPE_RESERVED_KERN: /* Fall-through: */
+ case E820_TYPE_RAM: return IORESOURCE_SYSTEM_RAM;
+ case E820_TYPE_ACPI: /* Fall-through: */
+ case E820_TYPE_NVS: /* Fall-through: */
+ case E820_TYPE_UNUSABLE: /* Fall-through: */
+ case E820_TYPE_PRAM: /* Fall-through: */
+ case E820_TYPE_PMEM: /* Fall-through: */
+ case E820_TYPE_RESERVED: /* Fall-through: */
+ default: return IORESOURCE_MEM;
+ }
+}
+
+static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry)
+{
+ switch (entry->type) {
+ case E820_TYPE_ACPI: return IORES_DESC_ACPI_TABLES;
+ case E820_TYPE_NVS: return IORES_DESC_ACPI_NV_STORAGE;
+ case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY;
+ case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY;
+ case E820_TYPE_RESERVED_KERN: /* Fall-through: */
+ case E820_TYPE_RAM: /* Fall-through: */
+ case E820_TYPE_UNUSABLE: /* Fall-through: */
+ case E820_TYPE_RESERVED: /* Fall-through: */
+ default: return IORES_DESC_NONE;
+ }
+}
+
+static bool __init do_mark_busy(enum e820_type type, struct resource *res)
+{
+ /* this is the legacy bios/dos rom-shadow + mmio region */
+ if (res->start < (1ULL<<20))
+ return true;
+
+ /*
+ * Treat persistent memory like device memory, i.e. reserve it
+ * for exclusive use of a driver
+ */
+ switch (type) {
+ case E820_TYPE_RESERVED:
+ case E820_TYPE_PRAM:
+ case E820_TYPE_PMEM:
+ return false;
+ case E820_TYPE_RESERVED_KERN:
+ case E820_TYPE_RAM:
+ case E820_TYPE_ACPI:
+ case E820_TYPE_NVS:
+ case E820_TYPE_UNUSABLE:
+ default:
+ return true;
+ }
+}
+
+/*
+ * Mark E820 reserved areas as busy for the resource manager:
+ */
+
+static struct resource __initdata *e820_res;
+
+void __init e820__reserve_resources(void)
+{
+ int i;
+ struct resource *res;
+ u64 end;
+
+ res = alloc_bootmem(sizeof(*res) * e820_table->nr_entries);
+ e820_res = res;
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = e820_table->entries + i;
+
+ end = entry->addr + entry->size - 1;
+ if (end != (resource_size_t)end) {
+ res++;
+ continue;
+ }
+ res->start = entry->addr;
+ res->end = end;
+ res->name = e820_type_to_string(entry);
+ res->flags = e820_type_to_iomem_type(entry);
+ res->desc = e820_type_to_iores_desc(entry);
+
+ /*
+ * Don't register the region that could be conflicted with
+ * PCI device BAR resources and insert them later in
+ * pcibios_resource_survey():
+ */
+ if (do_mark_busy(entry->type, res)) {
+ res->flags |= IORESOURCE_BUSY;
+ insert_resource(&iomem_resource, res);
+ }
+ res++;
+ }
+
+ /* Expose the bootloader-provided memory layout to the sysfs. */
+ for (i = 0; i < e820_table_firmware->nr_entries; i++) {
+ struct e820_entry *entry = e820_table_firmware->entries + i;
+
+ firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry));
+ }
+}
+
+/*
+ * How much should we pad the end of RAM, depending on where it is?
+ */
+static unsigned long __init ram_alignment(resource_size_t pos)
+{
+ unsigned long mb = pos >> 20;
+
+ /* To 64kB in the first megabyte */
+ if (!mb)
+ return 64*1024;
+
+ /* To 1MB in the first 16MB */
+ if (mb < 16)
+ return 1024*1024;
+
+ /* To 64MB for anything above that */
+ return 64*1024*1024;
+}
+
+#define MAX_RESOURCE_SIZE ((resource_size_t)-1)
+
+void __init e820__reserve_resources_late(void)
+{
+ int i;
+ struct resource *res;
+
+ res = e820_res;
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ if (!res->parent && res->end)
+ insert_resource_expand_to_fit(&iomem_resource, res);
+ res++;
+ }
+
+ /*
+ * Try to bump up RAM regions to reasonable boundaries, to
+ * avoid stolen RAM:
+ */
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
+ u64 start, end;
+
+ if (entry->type != E820_TYPE_RAM)
+ continue;
+
+ start = entry->addr + entry->size;
+ end = round_up(start, ram_alignment(start)) - 1;
+ if (end > MAX_RESOURCE_SIZE)
+ end = MAX_RESOURCE_SIZE;
+ if (start >= end)
+ continue;
+
+ printk(KERN_DEBUG "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", start, end);
+ reserve_region_with_split(&iomem_resource, start, end, "RAM buffer");
+ }
+}
+
+/*
+ * Pass the firmware (bootloader) E820 map to the kernel and process it:
+ */
+char *__init e820__memory_setup_default(void)
+{
+ char *who = "BIOS-e820";
+
+ /*
+ * Try to copy the BIOS-supplied E820-map.
+ *
+ * Otherwise fake a memory map; one section from 0k->640k,
+ * the next section from 1mb->appropriate_mem_k
+ */
+ if (append_e820_table(boot_params.e820_table, boot_params.e820_entries) < 0) {
+ u64 mem_size;
+
+ /* Compare results from other methods and take the one that gives more RAM: */
+ if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) {
+ mem_size = boot_params.screen_info.ext_mem_k;
+ who = "BIOS-88";
+ } else {
+ mem_size = boot_params.alt_mem_k;
+ who = "BIOS-e801";
+ }
+
+ e820_table->nr_entries = 0;
+ e820__range_add(0, LOWMEMSIZE(), E820_TYPE_RAM);
+ e820__range_add(HIGH_MEMORY, mem_size << 10, E820_TYPE_RAM);
+ }
+
+ /* We just appended a lot of ranges, sanitize the table: */
+ e820__update_table(e820_table);
+
+ return who;
+}
+
+/*
+ * Calls e820__memory_setup_default() in essence to pick up the firmware/bootloader
+ * E820 map - with an optional platform quirk available for virtual platforms
+ * to override this method of boot environment processing:
+ */
+void __init e820__memory_setup(void)
+{
+ char *who;
+
+ /* This is a firmware interface ABI - make sure we don't break it: */
+ BUILD_BUG_ON(sizeof(struct boot_e820_entry) != 20);
+
+ who = x86_init.resources.memory_setup();
+
+ memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec));
+ memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware));
+
+ pr_info("BIOS-provided physical RAM map:\n");
+ e820__print_table(who);
+}
+
+void __init e820__memblock_setup(void)
+{
+ int i;
+ u64 end;
+
+ /*
+ * The bootstrap memblock region count maximum is 128 entries
+ * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries
+ * than that - so allow memblock resizing.
+ *
+ * This is safe, because this call happens pretty late during x86 setup,
+ * so we know about reserved memory regions already. (This is important
+ * so that memblock resizing does no stomp over reserved areas.)
+ */
+ memblock_allow_resize();
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ struct e820_entry *entry = &e820_table->entries[i];
+
+ end = entry->addr + entry->size;
+ if (end != (resource_size_t)end)
+ continue;
+
+ if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN)
+ continue;
+
+ memblock_add(entry->addr, entry->size);
+ }
+
+ /* Throw away partial pages: */
+ memblock_trim_memory(PAGE_SIZE);
+
+ memblock_dump_all();
+}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
new file mode 100644
index 000000000..d5352b0ae
--- /dev/null
+++ b/arch/x86/kernel/early-quirks.c
@@ -0,0 +1,802 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Various workarounds for chipset bugs.
+ This code runs very early and can't use the regular PCI subsystem
+ The entries are keyed to PCI bridges which usually identify chipsets
+ uniquely.
+ This is only for whole classes of chipsets with specific problems which
+ need early invasive action (e.g. before the timers are initialized).
+ Most PCI device specific workarounds can be done later and should be
+ in standard PCI quirks
+ Mainboard specific bugs should be handled by DMI entries.
+ CPU specific bugs in setup.c */
+
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/delay.h>
+#include <linux/pci_ids.h>
+#include <linux/bcma/bcma.h>
+#include <linux/bcma/bcma_regs.h>
+#include <linux/platform_data/x86/apple.h>
+#include <drm/i915_drm.h>
+#include <asm/pci-direct.h>
+#include <asm/dma.h>
+#include <asm/io_apic.h>
+#include <asm/apic.h>
+#include <asm/hpet.h>
+#include <asm/iommu.h>
+#include <asm/gart.h>
+#include <asm/irq_remapping.h>
+#include <asm/early_ioremap.h>
+
+static void __init fix_hypertransport_config(int num, int slot, int func)
+{
+ u32 htcfg;
+ /*
+ * we found a hypertransport bus
+ * make sure that we are broadcasting
+ * interrupts to all cpus on the ht bus
+ * if we're using extended apic ids
+ */
+ htcfg = read_pci_config(num, slot, func, 0x68);
+ if (htcfg & (1 << 18)) {
+ printk(KERN_INFO "Detected use of extended apic ids "
+ "on hypertransport bus\n");
+ if ((htcfg & (1 << 17)) == 0) {
+ printk(KERN_INFO "Enabling hypertransport extended "
+ "apic interrupt broadcast\n");
+ printk(KERN_INFO "Note this is a bios bug, "
+ "please contact your hw vendor\n");
+ htcfg |= (1 << 17);
+ write_pci_config(num, slot, func, 0x68, htcfg);
+ }
+ }
+
+
+}
+
+static void __init via_bugs(int num, int slot, int func)
+{
+#ifdef CONFIG_GART_IOMMU
+ if ((max_pfn > MAX_DMA32_PFN || force_iommu) &&
+ !gart_iommu_aperture_allowed) {
+ printk(KERN_INFO
+ "Looks like a VIA chipset. Disabling IOMMU."
+ " Override with iommu=allowed\n");
+ gart_iommu_aperture_disabled = 1;
+ }
+#endif
+}
+
+#ifdef CONFIG_ACPI
+#ifdef CONFIG_X86_IO_APIC
+
+static int __init nvidia_hpet_check(struct acpi_table_header *header)
+{
+ return 0;
+}
+#endif /* CONFIG_X86_IO_APIC */
+#endif /* CONFIG_ACPI */
+
+static void __init nvidia_bugs(int num, int slot, int func)
+{
+#ifdef CONFIG_ACPI
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * Only applies to Nvidia root ports (bus 0) and not to
+ * Nvidia graphics cards with PCI ports on secondary buses.
+ */
+ if (num)
+ return;
+
+ /*
+ * All timer overrides on Nvidia are
+ * wrong unless HPET is enabled.
+ * Unfortunately that's not true on many Asus boards.
+ * We don't know yet how to detect this automatically, but
+ * at least allow a command line override.
+ */
+ if (acpi_use_timer_override)
+ return;
+
+ if (acpi_table_parse(ACPI_SIG_HPET, nvidia_hpet_check)) {
+ acpi_skip_timer_override = 1;
+ printk(KERN_INFO "Nvidia board "
+ "detected. Ignoring ACPI "
+ "timer override.\n");
+ printk(KERN_INFO "If you got timer trouble "
+ "try acpi_use_timer_override\n");
+ }
+#endif
+#endif
+ /* RED-PEN skip them on mptables too? */
+
+}
+
+#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
+static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
+{
+ u32 d;
+ u8 b;
+
+ b = read_pci_config_byte(num, slot, func, 0xac);
+ b &= ~(1<<5);
+ write_pci_config_byte(num, slot, func, 0xac, b);
+
+ d = read_pci_config(num, slot, func, 0x70);
+ d |= 1<<8;
+ write_pci_config(num, slot, func, 0x70, d);
+
+ d = read_pci_config(num, slot, func, 0x8);
+ d &= 0xff;
+ return d;
+}
+
+static void __init ati_bugs(int num, int slot, int func)
+{
+ u32 d;
+ u8 b;
+
+ if (acpi_use_timer_override)
+ return;
+
+ d = ati_ixp4x0_rev(num, slot, func);
+ if (d < 0x82)
+ acpi_skip_timer_override = 1;
+ else {
+ /* check for IRQ0 interrupt swap */
+ outb(0x72, 0xcd6); b = inb(0xcd7);
+ if (!(b & 0x2))
+ acpi_skip_timer_override = 1;
+ }
+
+ if (acpi_skip_timer_override) {
+ printk(KERN_INFO "SB4X0 revision 0x%x\n", d);
+ printk(KERN_INFO "Ignoring ACPI timer override.\n");
+ printk(KERN_INFO "If you got timer trouble "
+ "try acpi_use_timer_override\n");
+ }
+}
+
+static u32 __init ati_sbx00_rev(int num, int slot, int func)
+{
+ u32 d;
+
+ d = read_pci_config(num, slot, func, 0x8);
+ d &= 0xff;
+
+ return d;
+}
+
+static void __init ati_bugs_contd(int num, int slot, int func)
+{
+ u32 d, rev;
+
+ rev = ati_sbx00_rev(num, slot, func);
+ if (rev >= 0x40)
+ acpi_fix_pin2_polarity = 1;
+
+ /*
+ * SB600: revisions 0x11, 0x12, 0x13, 0x14, ...
+ * SB700: revisions 0x39, 0x3a, ...
+ * SB800: revisions 0x40, 0x41, ...
+ */
+ if (rev >= 0x39)
+ return;
+
+ if (acpi_use_timer_override)
+ return;
+
+ /* check for IRQ0 interrupt swap */
+ d = read_pci_config(num, slot, func, 0x64);
+ if (!(d & (1<<14)))
+ acpi_skip_timer_override = 1;
+
+ if (acpi_skip_timer_override) {
+ printk(KERN_INFO "SB600 revision 0x%x\n", rev);
+ printk(KERN_INFO "Ignoring ACPI timer override.\n");
+ printk(KERN_INFO "If you got timer trouble "
+ "try acpi_use_timer_override\n");
+ }
+}
+#else
+static void __init ati_bugs(int num, int slot, int func)
+{
+}
+
+static void __init ati_bugs_contd(int num, int slot, int func)
+{
+}
+#endif
+
+static void __init intel_remapping_check(int num, int slot, int func)
+{
+ u8 revision;
+ u16 device;
+
+ device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
+ revision = read_pci_config_byte(num, slot, func, PCI_REVISION_ID);
+
+ /*
+ * Revision <= 13 of all triggering devices id in this quirk
+ * have a problem draining interrupts when irq remapping is
+ * enabled, and should be flagged as broken. Additionally
+ * revision 0x22 of device id 0x3405 has this problem.
+ */
+ if (revision <= 0x13)
+ set_irq_remapping_broken();
+ else if (device == 0x3405 && revision == 0x22)
+ set_irq_remapping_broken();
+}
+
+/*
+ * Systems with Intel graphics controllers set aside memory exclusively
+ * for gfx driver use. This memory is not marked in the E820 as reserved
+ * or as RAM, and so is subject to overlap from E820 manipulation later
+ * in the boot process. On some systems, MMIO space is allocated on top,
+ * despite the efforts of the "RAM buffer" approach, which simply rounds
+ * memory boundaries up to 64M to try to catch space that may decode
+ * as RAM and so is not suitable for MMIO.
+ */
+
+#define KB(x) ((x) * 1024UL)
+#define MB(x) (KB (KB (x)))
+
+static resource_size_t __init i830_tseg_size(void)
+{
+ u8 esmramc = read_pci_config_byte(0, 0, 0, I830_ESMRAMC);
+
+ if (!(esmramc & TSEG_ENABLE))
+ return 0;
+
+ if (esmramc & I830_TSEG_SIZE_1M)
+ return MB(1);
+ else
+ return KB(512);
+}
+
+static resource_size_t __init i845_tseg_size(void)
+{
+ u8 esmramc = read_pci_config_byte(0, 0, 0, I845_ESMRAMC);
+ u8 tseg_size = esmramc & I845_TSEG_SIZE_MASK;
+
+ if (!(esmramc & TSEG_ENABLE))
+ return 0;
+
+ switch (tseg_size) {
+ case I845_TSEG_SIZE_512K: return KB(512);
+ case I845_TSEG_SIZE_1M: return MB(1);
+ default:
+ WARN(1, "Unknown ESMRAMC value: %x!\n", esmramc);
+ }
+ return 0;
+}
+
+static resource_size_t __init i85x_tseg_size(void)
+{
+ u8 esmramc = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC);
+
+ if (!(esmramc & TSEG_ENABLE))
+ return 0;
+
+ return MB(1);
+}
+
+static resource_size_t __init i830_mem_size(void)
+{
+ return read_pci_config_byte(0, 0, 0, I830_DRB3) * MB(32);
+}
+
+static resource_size_t __init i85x_mem_size(void)
+{
+ return read_pci_config_byte(0, 0, 1, I85X_DRB3) * MB(32);
+}
+
+/*
+ * On 830/845/85x the stolen memory base isn't available in any
+ * register. We need to calculate it as TOM-TSEG_SIZE-stolen_size.
+ */
+static resource_size_t __init i830_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ return i830_mem_size() - i830_tseg_size() - stolen_size;
+}
+
+static resource_size_t __init i845_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ return i830_mem_size() - i845_tseg_size() - stolen_size;
+}
+
+static resource_size_t __init i85x_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ return i85x_mem_size() - i85x_tseg_size() - stolen_size;
+}
+
+static resource_size_t __init i865_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ u16 toud = 0;
+
+ toud = read_pci_config_16(0, 0, 0, I865_TOUD);
+
+ return toud * KB(64) + i845_tseg_size();
+}
+
+static resource_size_t __init gen3_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ u32 bsm;
+
+ /* Almost universally we can find the Graphics Base of Stolen Memory
+ * at register BSM (0x5c) in the igfx configuration space. On a few
+ * (desktop) machines this is also mirrored in the bridge device at
+ * different locations, or in the MCHBAR.
+ */
+ bsm = read_pci_config(num, slot, func, INTEL_BSM);
+
+ return bsm & INTEL_BSM_MASK;
+}
+
+static resource_size_t __init gen11_stolen_base(int num, int slot, int func,
+ resource_size_t stolen_size)
+{
+ u64 bsm;
+
+ bsm = read_pci_config(num, slot, func, INTEL_GEN11_BSM_DW0);
+ bsm &= INTEL_BSM_MASK;
+ bsm |= (u64)read_pci_config(num, slot, func, INTEL_GEN11_BSM_DW1) << 32;
+
+ return bsm;
+}
+
+static resource_size_t __init i830_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+ u16 gms;
+
+ gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL);
+ gms = gmch_ctrl & I830_GMCH_GMS_MASK;
+
+ switch (gms) {
+ case I830_GMCH_GMS_STOLEN_512: return KB(512);
+ case I830_GMCH_GMS_STOLEN_1024: return MB(1);
+ case I830_GMCH_GMS_STOLEN_8192: return MB(8);
+ /* local memory isn't part of the normal address space */
+ case I830_GMCH_GMS_LOCAL: return 0;
+ default:
+ WARN(1, "Unknown GMCH_CTRL value: %x!\n", gmch_ctrl);
+ }
+
+ return 0;
+}
+
+static resource_size_t __init gen3_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+ u16 gms;
+
+ gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL);
+ gms = gmch_ctrl & I855_GMCH_GMS_MASK;
+
+ switch (gms) {
+ case I855_GMCH_GMS_STOLEN_1M: return MB(1);
+ case I855_GMCH_GMS_STOLEN_4M: return MB(4);
+ case I855_GMCH_GMS_STOLEN_8M: return MB(8);
+ case I855_GMCH_GMS_STOLEN_16M: return MB(16);
+ case I855_GMCH_GMS_STOLEN_32M: return MB(32);
+ case I915_GMCH_GMS_STOLEN_48M: return MB(48);
+ case I915_GMCH_GMS_STOLEN_64M: return MB(64);
+ case G33_GMCH_GMS_STOLEN_128M: return MB(128);
+ case G33_GMCH_GMS_STOLEN_256M: return MB(256);
+ case INTEL_GMCH_GMS_STOLEN_96M: return MB(96);
+ case INTEL_GMCH_GMS_STOLEN_160M:return MB(160);
+ case INTEL_GMCH_GMS_STOLEN_224M:return MB(224);
+ case INTEL_GMCH_GMS_STOLEN_352M:return MB(352);
+ default:
+ WARN(1, "Unknown GMCH_CTRL value: %x!\n", gmch_ctrl);
+ }
+
+ return 0;
+}
+
+static resource_size_t __init gen6_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+ u16 gms;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK;
+
+ return gms * MB(32);
+}
+
+static resource_size_t __init gen8_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+ u16 gms;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK;
+
+ return gms * MB(32);
+}
+
+static resource_size_t __init chv_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+ u16 gms;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK;
+
+ /*
+ * 0x0 to 0x10: 32MB increments starting at 0MB
+ * 0x11 to 0x16: 4MB increments starting at 8MB
+ * 0x17 to 0x1d: 4MB increments start at 36MB
+ */
+ if (gms < 0x11)
+ return gms * MB(32);
+ else if (gms < 0x17)
+ return (gms - 0x11) * MB(4) + MB(8);
+ else
+ return (gms - 0x17) * MB(4) + MB(36);
+}
+
+static resource_size_t __init gen9_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+ u16 gms;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK;
+
+ /* 0x0 to 0xef: 32MB increments starting at 0MB */
+ /* 0xf0 to 0xfe: 4MB increments starting at 4MB */
+ if (gms < 0xf0)
+ return gms * MB(32);
+ else
+ return (gms - 0xf0) * MB(4) + MB(4);
+}
+
+struct intel_early_ops {
+ resource_size_t (*stolen_size)(int num, int slot, int func);
+ resource_size_t (*stolen_base)(int num, int slot, int func,
+ resource_size_t size);
+};
+
+static const struct intel_early_ops i830_early_ops __initconst = {
+ .stolen_base = i830_stolen_base,
+ .stolen_size = i830_stolen_size,
+};
+
+static const struct intel_early_ops i845_early_ops __initconst = {
+ .stolen_base = i845_stolen_base,
+ .stolen_size = i830_stolen_size,
+};
+
+static const struct intel_early_ops i85x_early_ops __initconst = {
+ .stolen_base = i85x_stolen_base,
+ .stolen_size = gen3_stolen_size,
+};
+
+static const struct intel_early_ops i865_early_ops __initconst = {
+ .stolen_base = i865_stolen_base,
+ .stolen_size = gen3_stolen_size,
+};
+
+static const struct intel_early_ops gen3_early_ops __initconst = {
+ .stolen_base = gen3_stolen_base,
+ .stolen_size = gen3_stolen_size,
+};
+
+static const struct intel_early_ops gen6_early_ops __initconst = {
+ .stolen_base = gen3_stolen_base,
+ .stolen_size = gen6_stolen_size,
+};
+
+static const struct intel_early_ops gen8_early_ops __initconst = {
+ .stolen_base = gen3_stolen_base,
+ .stolen_size = gen8_stolen_size,
+};
+
+static const struct intel_early_ops gen9_early_ops __initconst = {
+ .stolen_base = gen3_stolen_base,
+ .stolen_size = gen9_stolen_size,
+};
+
+static const struct intel_early_ops chv_early_ops __initconst = {
+ .stolen_base = gen3_stolen_base,
+ .stolen_size = chv_stolen_size,
+};
+
+static const struct intel_early_ops gen11_early_ops __initconst = {
+ .stolen_base = gen11_stolen_base,
+ .stolen_size = gen9_stolen_size,
+};
+
+/* Intel integrated GPUs for which we need to reserve "stolen memory" */
+static const struct pci_device_id intel_early_ids[] __initconst = {
+ INTEL_I830_IDS(&i830_early_ops),
+ INTEL_I845G_IDS(&i845_early_ops),
+ INTEL_I85X_IDS(&i85x_early_ops),
+ INTEL_I865G_IDS(&i865_early_ops),
+ INTEL_I915G_IDS(&gen3_early_ops),
+ INTEL_I915GM_IDS(&gen3_early_ops),
+ INTEL_I945G_IDS(&gen3_early_ops),
+ INTEL_I945GM_IDS(&gen3_early_ops),
+ INTEL_VLV_IDS(&gen6_early_ops),
+ INTEL_PINEVIEW_IDS(&gen3_early_ops),
+ INTEL_I965G_IDS(&gen3_early_ops),
+ INTEL_G33_IDS(&gen3_early_ops),
+ INTEL_I965GM_IDS(&gen3_early_ops),
+ INTEL_GM45_IDS(&gen3_early_ops),
+ INTEL_G45_IDS(&gen3_early_ops),
+ INTEL_IRONLAKE_D_IDS(&gen3_early_ops),
+ INTEL_IRONLAKE_M_IDS(&gen3_early_ops),
+ INTEL_SNB_D_IDS(&gen6_early_ops),
+ INTEL_SNB_M_IDS(&gen6_early_ops),
+ INTEL_IVB_M_IDS(&gen6_early_ops),
+ INTEL_IVB_D_IDS(&gen6_early_ops),
+ INTEL_HSW_IDS(&gen6_early_ops),
+ INTEL_BDW_IDS(&gen8_early_ops),
+ INTEL_CHV_IDS(&chv_early_ops),
+ INTEL_SKL_IDS(&gen9_early_ops),
+ INTEL_BXT_IDS(&gen9_early_ops),
+ INTEL_KBL_IDS(&gen9_early_ops),
+ INTEL_CFL_IDS(&gen9_early_ops),
+ INTEL_GLK_IDS(&gen9_early_ops),
+ INTEL_CNL_IDS(&gen9_early_ops),
+ INTEL_ICL_11_IDS(&gen11_early_ops),
+};
+
+struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0);
+EXPORT_SYMBOL(intel_graphics_stolen_res);
+
+static void __init
+intel_graphics_stolen(int num, int slot, int func,
+ const struct intel_early_ops *early_ops)
+{
+ resource_size_t base, size;
+ resource_size_t end;
+
+ size = early_ops->stolen_size(num, slot, func);
+ base = early_ops->stolen_base(num, slot, func, size);
+
+ if (!size || !base)
+ return;
+
+ end = base + size - 1;
+
+ intel_graphics_stolen_res.start = base;
+ intel_graphics_stolen_res.end = end;
+
+ printk(KERN_INFO "Reserving Intel graphics memory at %pR\n",
+ &intel_graphics_stolen_res);
+
+ /* Mark this space as reserved */
+ e820__range_add(base, size, E820_TYPE_RESERVED);
+ e820__update_table(e820_table);
+}
+
+static void __init intel_graphics_quirks(int num, int slot, int func)
+{
+ const struct intel_early_ops *early_ops;
+ u16 device;
+ int i;
+
+ /*
+ * Reserve "stolen memory" for an integrated GPU. If we've already
+ * found one, there's nothing to do for other (discrete) GPUs.
+ */
+ if (resource_size(&intel_graphics_stolen_res))
+ return;
+
+ device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
+
+ for (i = 0; i < ARRAY_SIZE(intel_early_ids); i++) {
+ kernel_ulong_t driver_data = intel_early_ids[i].driver_data;
+
+ if (intel_early_ids[i].device != device)
+ continue;
+
+ early_ops = (typeof(early_ops))driver_data;
+
+ intel_graphics_stolen(num, slot, func, early_ops);
+
+ return;
+ }
+}
+
+static void __init force_disable_hpet(int num, int slot, int func)
+{
+#ifdef CONFIG_HPET_TIMER
+ boot_hpet_disable = true;
+ pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n");
+#endif
+}
+
+#define BCM4331_MMIO_SIZE 16384
+#define BCM4331_PM_CAP 0x40
+#define bcma_aread32(reg) ioread32(mmio + 1 * BCMA_CORE_SIZE + reg)
+#define bcma_awrite32(reg, val) iowrite32(val, mmio + 1 * BCMA_CORE_SIZE + reg)
+
+static void __init apple_airport_reset(int bus, int slot, int func)
+{
+ void __iomem *mmio;
+ u16 pmcsr;
+ u64 addr;
+ int i;
+
+ if (!x86_apple_machine)
+ return;
+
+ /* Card may have been put into PCI_D3hot by grub quirk */
+ pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
+
+ if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
+ pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
+ write_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL, pmcsr);
+ mdelay(10);
+
+ pmcsr = read_pci_config_16(bus, slot, func, BCM4331_PM_CAP + PCI_PM_CTRL);
+ if ((pmcsr & PCI_PM_CTRL_STATE_MASK) != PCI_D0) {
+ pr_err("pci 0000:%02x:%02x.%d: Cannot power up Apple AirPort card\n",
+ bus, slot, func);
+ return;
+ }
+ }
+
+ addr = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
+ addr |= (u64)read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_1) << 32;
+ addr &= PCI_BASE_ADDRESS_MEM_MASK;
+
+ mmio = early_ioremap(addr, BCM4331_MMIO_SIZE);
+ if (!mmio) {
+ pr_err("pci 0000:%02x:%02x.%d: Cannot iomap Apple AirPort card\n",
+ bus, slot, func);
+ return;
+ }
+
+ pr_info("Resetting Apple AirPort card (left enabled by EFI)\n");
+
+ for (i = 0; bcma_aread32(BCMA_RESET_ST) && i < 30; i++)
+ udelay(10);
+
+ bcma_awrite32(BCMA_RESET_CTL, BCMA_RESET_CTL_RESET);
+ bcma_aread32(BCMA_RESET_CTL);
+ udelay(1);
+
+ bcma_awrite32(BCMA_RESET_CTL, 0);
+ bcma_aread32(BCMA_RESET_CTL);
+ udelay(10);
+
+ early_iounmap(mmio, BCM4331_MMIO_SIZE);
+}
+
+#define QFLAG_APPLY_ONCE 0x1
+#define QFLAG_APPLIED 0x2
+#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
+struct chipset {
+ u32 vendor;
+ u32 device;
+ u32 class;
+ u32 class_mask;
+ u32 flags;
+ void (*f)(int num, int slot, int func);
+};
+
+static struct chipset early_qrk[] __initdata = {
+ { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
+ PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
+ { PCI_VENDOR_ID_VIA, PCI_ANY_ID,
+ PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
+ { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
+ PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
+ { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
+ PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
+ { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
+ PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
+ { PCI_VENDOR_ID_INTEL, 0x3403, PCI_CLASS_BRIDGE_HOST,
+ PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
+ { PCI_VENDOR_ID_INTEL, 0x3405, PCI_CLASS_BRIDGE_HOST,
+ PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
+ { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST,
+ PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
+ { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
+ 0, intel_graphics_quirks },
+ /*
+ * HPET on the current version of the Baytrail platform has accuracy
+ * problems: it will halt in deep idle state - so we disable it.
+ *
+ * More details can be found in section 18.10.1.3 of the datasheet:
+ *
+ * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf
+ */
+ { PCI_VENDOR_ID_INTEL, 0x0f00,
+ PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
+ { PCI_VENDOR_ID_BROADCOM, 0x4331,
+ PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset},
+ {}
+};
+
+static void __init early_pci_scan_bus(int bus);
+
+/**
+ * check_dev_quirk - apply early quirks to a given PCI device
+ * @num: bus number
+ * @slot: slot number
+ * @func: PCI function
+ *
+ * Check the vendor & device ID against the early quirks table.
+ *
+ * If the device is single function, let early_pci_scan_bus() know so we don't
+ * poke at this device again.
+ */
+static int __init check_dev_quirk(int num, int slot, int func)
+{
+ u16 class;
+ u16 vendor;
+ u16 device;
+ u8 type;
+ u8 sec;
+ int i;
+
+ class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE);
+
+ if (class == 0xffff)
+ return -1; /* no class, treat as single function */
+
+ vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID);
+
+ device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
+
+ for (i = 0; early_qrk[i].f != NULL; i++) {
+ if (((early_qrk[i].vendor == PCI_ANY_ID) ||
+ (early_qrk[i].vendor == vendor)) &&
+ ((early_qrk[i].device == PCI_ANY_ID) ||
+ (early_qrk[i].device == device)) &&
+ (!((early_qrk[i].class ^ class) &
+ early_qrk[i].class_mask))) {
+ if ((early_qrk[i].flags &
+ QFLAG_DONE) != QFLAG_DONE)
+ early_qrk[i].f(num, slot, func);
+ early_qrk[i].flags |= QFLAG_APPLIED;
+ }
+ }
+
+ type = read_pci_config_byte(num, slot, func,
+ PCI_HEADER_TYPE);
+
+ if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) {
+ sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS);
+ if (sec > num)
+ early_pci_scan_bus(sec);
+ }
+
+ if (!(type & 0x80))
+ return -1;
+
+ return 0;
+}
+
+static void __init early_pci_scan_bus(int bus)
+{
+ int slot, func;
+
+ /* Poor man's PCI discovery */
+ for (slot = 0; slot < 32; slot++)
+ for (func = 0; func < 8; func++) {
+ /* Only probe function 0 on single fn devices */
+ if (check_dev_quirk(bus, slot, func))
+ break;
+ }
+}
+
+void __init early_quirks(void)
+{
+ if (!early_pci_allowed())
+ return;
+
+ early_pci_scan_bus(0);
+}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
new file mode 100644
index 000000000..374a52fa5
--- /dev/null
+++ b/arch/x86/kernel/early_printk.c
@@ -0,0 +1,405 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/screen_info.h>
+#include <linux/usb/ch9.h>
+#include <linux/pci_regs.h>
+#include <linux/pci_ids.h>
+#include <linux/errno.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+#include <asm/fcntl.h>
+#include <asm/setup.h>
+#include <xen/hvc-console.h>
+#include <asm/pci-direct.h>
+#include <asm/fixmap.h>
+#include <asm/intel-mid.h>
+#include <asm/pgtable.h>
+#include <linux/usb/ehci_def.h>
+#include <linux/usb/xhci-dbgp.h>
+#include <linux/efi.h>
+#include <asm/efi.h>
+#include <asm/pci_x86.h>
+
+/* Simple VGA output */
+#define VGABASE (__ISA_IO_base + 0xb8000)
+
+static int max_ypos = 25, max_xpos = 80;
+static int current_ypos = 25, current_xpos;
+
+static void early_vga_write(struct console *con, const char *str, unsigned n)
+{
+ char c;
+ int i, k, j;
+
+ while ((c = *str++) != '\0' && n-- > 0) {
+ if (current_ypos >= max_ypos) {
+ /* scroll 1 line up */
+ for (k = 1, j = 0; k < max_ypos; k++, j++) {
+ for (i = 0; i < max_xpos; i++) {
+ writew(readw(VGABASE+2*(max_xpos*k+i)),
+ VGABASE + 2*(max_xpos*j + i));
+ }
+ }
+ for (i = 0; i < max_xpos; i++)
+ writew(0x720, VGABASE + 2*(max_xpos*j + i));
+ current_ypos = max_ypos-1;
+ }
+#ifdef CONFIG_KGDB_KDB
+ if (c == '\b') {
+ if (current_xpos > 0)
+ current_xpos--;
+ } else if (c == '\r') {
+ current_xpos = 0;
+ } else
+#endif
+ if (c == '\n') {
+ current_xpos = 0;
+ current_ypos++;
+ } else if (c != '\r') {
+ writew(((0x7 << 8) | (unsigned short) c),
+ VGABASE + 2*(max_xpos*current_ypos +
+ current_xpos++));
+ if (current_xpos >= max_xpos) {
+ current_xpos = 0;
+ current_ypos++;
+ }
+ }
+ }
+}
+
+static struct console early_vga_console = {
+ .name = "earlyvga",
+ .write = early_vga_write,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+};
+
+/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
+
+static unsigned long early_serial_base = 0x3f8; /* ttyS0 */
+
+#define XMTRDY 0x20
+
+#define DLAB 0x80
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define RXR 0 /* Receive register (READ) */
+#define IER 1 /* Interrupt Enable */
+#define IIR 2 /* Interrupt ID */
+#define FCR 2 /* FIFO control */
+#define LCR 3 /* Line control */
+#define MCR 4 /* Modem control */
+#define LSR 5 /* Line Status */
+#define MSR 6 /* Modem Status */
+#define DLL 0 /* Divisor Latch Low */
+#define DLH 1 /* Divisor latch High */
+
+static unsigned int io_serial_in(unsigned long addr, int offset)
+{
+ return inb(addr + offset);
+}
+
+static void io_serial_out(unsigned long addr, int offset, int value)
+{
+ outb(value, addr + offset);
+}
+
+static unsigned int (*serial_in)(unsigned long addr, int offset) = io_serial_in;
+static void (*serial_out)(unsigned long addr, int offset, int value) = io_serial_out;
+
+static int early_serial_putc(unsigned char ch)
+{
+ unsigned timeout = 0xffff;
+
+ while ((serial_in(early_serial_base, LSR) & XMTRDY) == 0 && --timeout)
+ cpu_relax();
+ serial_out(early_serial_base, TXR, ch);
+ return timeout ? 0 : -1;
+}
+
+static void early_serial_write(struct console *con, const char *s, unsigned n)
+{
+ while (*s && n-- > 0) {
+ if (*s == '\n')
+ early_serial_putc('\r');
+ early_serial_putc(*s);
+ s++;
+ }
+}
+
+static __init void early_serial_hw_init(unsigned divisor)
+{
+ unsigned char c;
+
+ serial_out(early_serial_base, LCR, 0x3); /* 8n1 */
+ serial_out(early_serial_base, IER, 0); /* no interrupt */
+ serial_out(early_serial_base, FCR, 0); /* no fifo */
+ serial_out(early_serial_base, MCR, 0x3); /* DTR + RTS */
+
+ c = serial_in(early_serial_base, LCR);
+ serial_out(early_serial_base, LCR, c | DLAB);
+ serial_out(early_serial_base, DLL, divisor & 0xff);
+ serial_out(early_serial_base, DLH, (divisor >> 8) & 0xff);
+ serial_out(early_serial_base, LCR, c & ~DLAB);
+}
+
+#define DEFAULT_BAUD 9600
+
+static __init void early_serial_init(char *s)
+{
+ unsigned divisor;
+ unsigned long baud = DEFAULT_BAUD;
+ char *e;
+
+ if (*s == ',')
+ ++s;
+
+ if (*s) {
+ unsigned port;
+ if (!strncmp(s, "0x", 2)) {
+ early_serial_base = simple_strtoul(s, &e, 16);
+ } else {
+ static const int __initconst bases[] = { 0x3f8, 0x2f8 };
+
+ if (!strncmp(s, "ttyS", 4))
+ s += 4;
+ port = simple_strtoul(s, &e, 10);
+ if (port > 1 || s == e)
+ port = 0;
+ early_serial_base = bases[port];
+ }
+ s += strcspn(s, ",");
+ if (*s == ',')
+ s++;
+ }
+
+ if (*s) {
+ baud = simple_strtoull(s, &e, 0);
+
+ if (baud == 0 || s == e)
+ baud = DEFAULT_BAUD;
+ }
+
+ /* Convert from baud to divisor value */
+ divisor = 115200 / baud;
+
+ /* These will always be IO based ports */
+ serial_in = io_serial_in;
+ serial_out = io_serial_out;
+
+ /* Set up the HW */
+ early_serial_hw_init(divisor);
+}
+
+#ifdef CONFIG_PCI
+static void mem32_serial_out(unsigned long addr, int offset, int value)
+{
+ u32 __iomem *vaddr = (u32 __iomem *)addr;
+ /* shift implied by pointer type */
+ writel(value, vaddr + offset);
+}
+
+static unsigned int mem32_serial_in(unsigned long addr, int offset)
+{
+ u32 __iomem *vaddr = (u32 __iomem *)addr;
+ /* shift implied by pointer type */
+ return readl(vaddr + offset);
+}
+
+/*
+ * early_pci_serial_init()
+ *
+ * This function is invoked when the early_printk param starts with "pciserial"
+ * The rest of the param should be "[force],B:D.F,baud", where B, D & F describe
+ * the location of a PCI device that must be a UART device. "force" is optional
+ * and overrides the use of an UART device with a wrong PCI class code.
+ */
+static __init void early_pci_serial_init(char *s)
+{
+ unsigned divisor;
+ unsigned long baud = DEFAULT_BAUD;
+ u8 bus, slot, func;
+ u32 classcode, bar0;
+ u16 cmdreg;
+ char *e;
+ int force = 0;
+
+ if (*s == ',')
+ ++s;
+
+ if (*s == 0)
+ return;
+
+ /* Force the use of an UART device with wrong class code */
+ if (!strncmp(s, "force,", 6)) {
+ force = 1;
+ s += 6;
+ }
+
+ /*
+ * Part the param to get the BDF values
+ */
+ bus = (u8)simple_strtoul(s, &e, 16);
+ s = e;
+ if (*s != ':')
+ return;
+ ++s;
+ slot = (u8)simple_strtoul(s, &e, 16);
+ s = e;
+ if (*s != '.')
+ return;
+ ++s;
+ func = (u8)simple_strtoul(s, &e, 16);
+ s = e;
+
+ /* A baud might be following */
+ if (*s == ',')
+ s++;
+
+ /*
+ * Find the device from the BDF
+ */
+ cmdreg = read_pci_config(bus, slot, func, PCI_COMMAND);
+ classcode = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
+ bar0 = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
+
+ /*
+ * Verify it is a UART type device
+ */
+ if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) &&
+ (classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) ||
+ (((classcode >> 8) & 0xff) != 0x02)) /* 16550 I/F at BAR0 */ {
+ if (!force)
+ return;
+ }
+
+ /*
+ * Determine if it is IO or memory mapped
+ */
+ if (bar0 & 0x01) {
+ /* it is IO mapped */
+ serial_in = io_serial_in;
+ serial_out = io_serial_out;
+ early_serial_base = bar0&0xfffffffc;
+ write_pci_config(bus, slot, func, PCI_COMMAND,
+ cmdreg|PCI_COMMAND_IO);
+ } else {
+ /* It is memory mapped - assume 32-bit alignment */
+ serial_in = mem32_serial_in;
+ serial_out = mem32_serial_out;
+ /* WARNING! assuming the address is always in the first 4G */
+ early_serial_base =
+ (unsigned long)early_ioremap(bar0 & 0xfffffff0, 0x10);
+ write_pci_config(bus, slot, func, PCI_COMMAND,
+ cmdreg|PCI_COMMAND_MEMORY);
+ }
+
+ /*
+ * Initialize the hardware
+ */
+ if (*s) {
+ if (strcmp(s, "nocfg") == 0)
+ /* Sometimes, we want to leave the UART alone
+ * and assume the BIOS has set it up correctly.
+ * "nocfg" tells us this is the case, and we
+ * should do no more setup.
+ */
+ return;
+ if (kstrtoul(s, 0, &baud) < 0 || baud == 0)
+ baud = DEFAULT_BAUD;
+ }
+
+ /* Convert from baud to divisor value */
+ divisor = 115200 / baud;
+
+ /* Set up the HW */
+ early_serial_hw_init(divisor);
+}
+#endif
+
+static struct console early_serial_console = {
+ .name = "earlyser",
+ .write = early_serial_write,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+};
+
+static void early_console_register(struct console *con, int keep_early)
+{
+ if (con->index != -1) {
+ printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n",
+ con->name);
+ return;
+ }
+ early_console = con;
+ if (keep_early)
+ early_console->flags &= ~CON_BOOT;
+ else
+ early_console->flags |= CON_BOOT;
+ register_console(early_console);
+}
+
+static int __init setup_early_printk(char *buf)
+{
+ int keep;
+
+ if (!buf)
+ return 0;
+
+ if (early_console)
+ return 0;
+
+ keep = (strstr(buf, "keep") != NULL);
+
+ while (*buf != '\0') {
+ if (!strncmp(buf, "serial", 6)) {
+ buf += 6;
+ early_serial_init(buf);
+ early_console_register(&early_serial_console, keep);
+ if (!strncmp(buf, ",ttyS", 5))
+ buf += 5;
+ }
+ if (!strncmp(buf, "ttyS", 4)) {
+ early_serial_init(buf + 4);
+ early_console_register(&early_serial_console, keep);
+ }
+#ifdef CONFIG_PCI
+ if (!strncmp(buf, "pciserial", 9)) {
+ early_pci_serial_init(buf + 9);
+ early_console_register(&early_serial_console, keep);
+ buf += 9; /* Keep from match the above "serial" */
+ }
+#endif
+ if (!strncmp(buf, "vga", 3) &&
+ boot_params.screen_info.orig_video_isVGA == 1) {
+ max_xpos = boot_params.screen_info.orig_video_cols;
+ max_ypos = boot_params.screen_info.orig_video_lines;
+ current_ypos = boot_params.screen_info.orig_y;
+ early_console_register(&early_vga_console, keep);
+ }
+#ifdef CONFIG_EARLY_PRINTK_DBGP
+ if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4))
+ early_console_register(&early_dbgp_console, keep);
+#endif
+#ifdef CONFIG_HVC_XEN
+ if (!strncmp(buf, "xen", 3))
+ early_console_register(&xenboot_console, keep);
+#endif
+#ifdef CONFIG_EARLY_PRINTK_EFI
+ if (!strncmp(buf, "efi", 3))
+ early_console_register(&early_efi_console, keep);
+#endif
+#ifdef CONFIG_EARLY_PRINTK_USB_XDBC
+ if (!strncmp(buf, "xdbc", 4))
+ early_xdbc_parse_parameter(buf + 4);
+#endif
+
+ buf++;
+ }
+ return 0;
+}
+
+early_param("earlyprintk", setup_early_printk);
diff --git a/arch/x86/kernel/ebda.c b/arch/x86/kernel/ebda.c
new file mode 100644
index 000000000..38e7d597b
--- /dev/null
+++ b/arch/x86/kernel/ebda.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+
+#include <asm/setup.h>
+#include <asm/bios_ebda.h>
+
+/*
+ * This function reserves all conventional PC system BIOS related
+ * firmware memory areas (some of which are data, some of which
+ * are code), that must not be used by the kernel as available
+ * RAM.
+ *
+ * The BIOS places the EBDA/XBDA at the top of conventional
+ * memory, and usually decreases the reported amount of
+ * conventional memory (int 0x12) too.
+ *
+ * This means that as a first approximation on most systems we can
+ * guess the reserved BIOS area by looking at the low BIOS RAM size
+ * value and assume that everything above that value (up to 1MB) is
+ * reserved.
+ *
+ * But life in firmware country is not that simple:
+ *
+ * - This code also contains a quirk for Dell systems that neglect
+ * to reserve the EBDA area in the 'RAM size' value ...
+ *
+ * - The same quirk also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). (Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.)
+ *
+ * - Plus paravirt systems don't have a reliable value in the
+ * 'BIOS RAM size' pointer we can rely on, so we must quirk
+ * them too.
+ *
+ * Due to those various problems this function is deliberately
+ * very conservative and tries to err on the side of reserving
+ * too much, to not risk reserving too little.
+ *
+ * Losing a small amount of memory in the bottom megabyte is
+ * rarely a problem, as long as we have enough memory to install
+ * the SMP bootup trampoline which *must* be in this area.
+ *
+ * Using memory that is in use by the BIOS or by some DMA device
+ * the BIOS didn't shut down *is* a big problem to the kernel,
+ * obviously.
+ */
+
+#define BIOS_RAM_SIZE_KB_PTR 0x413
+
+#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */
+#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */
+
+void __init reserve_bios_regions(void)
+{
+ unsigned int bios_start, ebda_start;
+
+ /*
+ * NOTE: In a paravirtual environment the BIOS reserved
+ * area is absent. We'll just have to assume that the
+ * paravirt case can handle memory setup correctly,
+ * without our help.
+ */
+ if (!x86_platform.legacy.reserve_bios_regions)
+ return;
+
+ /*
+ * BIOS RAM size is encoded in kilobytes, convert it
+ * to bytes to get a first guess at where the BIOS
+ * firmware area starts:
+ */
+ bios_start = *(unsigned short *)__va(BIOS_RAM_SIZE_KB_PTR);
+ bios_start <<= 10;
+
+ /*
+ * If bios_start is less than 128K, assume it is bogus
+ * and bump it up to 640K. Similarly, if bios_start is above 640K,
+ * don't trust it.
+ */
+ if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
+ bios_start = BIOS_START_MAX;
+
+ /* Get the start address of the EBDA page: */
+ ebda_start = get_bios_ebda();
+
+ /*
+ * If the EBDA start address is sane and is below the BIOS region,
+ * then also reserve everything from the EBDA start address up to
+ * the BIOS region.
+ */
+ if (ebda_start >= BIOS_START_MIN && ebda_start < bios_start)
+ bios_start = ebda_start;
+
+ /* Reserve all memory between bios_start and the 1MB mark: */
+ memblock_reserve(bios_start, 0x100000 - bios_start);
+}
diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c
new file mode 100644
index 000000000..e8c8c5d78
--- /dev/null
+++ b/arch/x86/kernel/eisa.c
@@ -0,0 +1,25 @@
+/*
+ * EISA specific code
+ *
+ * This file is licensed under the GPL V2
+ */
+#include <linux/ioport.h>
+#include <linux/eisa.h>
+#include <linux/io.h>
+
+#include <xen/xen.h>
+
+static __init int eisa_bus_probe(void)
+{
+ void __iomem *p;
+
+ if (xen_pv_domain() && !xen_initial_domain())
+ return 0;
+
+ p = ioremap(0x0FFFD9, 4);
+ if (p && readl(p) == 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24))
+ EISA_bus = 1;
+ iounmap(p);
+ return 0;
+}
+subsys_initcall(eisa_bus_probe);
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
new file mode 100644
index 000000000..aebd0d5bc
--- /dev/null
+++ b/arch/x86/kernel/espfix_64.c
@@ -0,0 +1,215 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2014 Intel Corporation; author: H. Peter Anvin
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * The IRET instruction, when returning to a 16-bit segment, only
+ * restores the bottom 16 bits of the user space stack pointer. This
+ * causes some 16-bit software to break, but it also leaks kernel state
+ * to user space.
+ *
+ * This works around this by creating percpu "ministacks", each of which
+ * is mapped 2^16 times 64K apart. When we detect that the return SS is
+ * on the LDT, we copy the IRET frame to the ministack and use the
+ * relevant alias to return to userspace. The ministacks are mapped
+ * readonly, so if the IRET fault we promote #GP to #DF which is an IST
+ * vector and thus has its own stack; we then do the fixup in the #DF
+ * handler.
+ *
+ * This file sets up the ministacks and the related page tables. The
+ * actual ministack invocation is in entry_64.S.
+ */
+
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/random.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/setup.h>
+#include <asm/espfix.h>
+
+/*
+ * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
+ * it up to a cache line to avoid unnecessary sharing.
+ */
+#define ESPFIX_STACK_SIZE (8*8UL)
+#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE)
+
+/* There is address space for how many espfix pages? */
+#define ESPFIX_PAGE_SPACE (1UL << (P4D_SHIFT-PAGE_SHIFT-16))
+
+#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
+#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
+# error "Need more virtual address space for the ESPFIX hack"
+#endif
+
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO)
+
+/* This contains the *bottom* address of the espfix stack */
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+/* Initialization mutex - should this be a spinlock? */
+static DEFINE_MUTEX(espfix_init_mutex);
+
+/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
+#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
+static void *espfix_pages[ESPFIX_MAX_PAGES];
+
+static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
+ __aligned(PAGE_SIZE);
+
+static unsigned int page_random, slot_random;
+
+/*
+ * This returns the bottom address of the espfix stack for a specific CPU.
+ * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case
+ * we have to account for some amount of padding at the end of each page.
+ */
+static inline unsigned long espfix_base_addr(unsigned int cpu)
+{
+ unsigned long page, slot;
+ unsigned long addr;
+
+ page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random;
+ slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE;
+ addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);
+ addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
+ addr += ESPFIX_BASE_ADDR;
+ return addr;
+}
+
+#define PTE_STRIDE (65536/PAGE_SIZE)
+#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE)
+#define ESPFIX_PMD_CLONES PTRS_PER_PMD
+#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))
+
+#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX)
+
+static void init_espfix_random(void)
+{
+ unsigned long rand;
+
+ /*
+ * This is run before the entropy pools are initialized,
+ * but this is hopefully better than nothing.
+ */
+ if (!arch_get_random_long(&rand)) {
+ /* The constant is an arbitrary large prime */
+ rand = rdtsc();
+ rand *= 0xc345c6b72fd16123UL;
+ }
+
+ slot_random = rand % ESPFIX_STACKS_PER_PAGE;
+ page_random = (rand / ESPFIX_STACKS_PER_PAGE)
+ & (ESPFIX_PAGE_SPACE - 1);
+}
+
+void __init init_espfix_bsp(void)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+
+ /* Install the espfix pud into the kernel page directory */
+ pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+ p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
+ p4d_populate(&init_mm, p4d, espfix_pud_page);
+
+ /* Randomize the locations */
+ init_espfix_random();
+
+ /* The rest is the same as for any other processor */
+ init_espfix_ap(0);
+}
+
+void init_espfix_ap(int cpu)
+{
+ unsigned int page;
+ unsigned long addr;
+ pud_t pud, *pud_p;
+ pmd_t pmd, *pmd_p;
+ pte_t pte, *pte_p;
+ int n, node;
+ void *stack_page;
+ pteval_t ptemask;
+
+ /* We only have to do this once... */
+ if (likely(per_cpu(espfix_stack, cpu)))
+ return; /* Already initialized */
+
+ addr = espfix_base_addr(cpu);
+ page = cpu/ESPFIX_STACKS_PER_PAGE;
+
+ /* Did another CPU already set this up? */
+ stack_page = READ_ONCE(espfix_pages[page]);
+ if (likely(stack_page))
+ goto done;
+
+ mutex_lock(&espfix_init_mutex);
+
+ /* Did we race on the lock? */
+ stack_page = READ_ONCE(espfix_pages[page]);
+ if (stack_page)
+ goto unlock_done;
+
+ node = cpu_to_node(cpu);
+ ptemask = __supported_pte_mask;
+
+ pud_p = &espfix_pud_page[pud_index(addr)];
+ pud = *pud_p;
+ if (!pud_present(pud)) {
+ struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
+
+ pmd_p = (pmd_t *)page_address(page);
+ pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
+ paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PUD_CLONES; n++)
+ set_pud(&pud_p[n], pud);
+ }
+
+ pmd_p = pmd_offset(&pud, addr);
+ pmd = *pmd_p;
+ if (!pmd_present(pmd)) {
+ struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
+
+ pte_p = (pte_t *)page_address(page);
+ pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
+ paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PMD_CLONES; n++)
+ set_pmd(&pmd_p[n], pmd);
+ }
+
+ pte_p = pte_offset_kernel(&pmd, addr);
+ stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0));
+ /*
+ * __PAGE_KERNEL_* includes _PAGE_GLOBAL, which we want since
+ * this is mapped to userspace.
+ */
+ pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask));
+ for (n = 0; n < ESPFIX_PTE_CLONES; n++)
+ set_pte(&pte_p[n*PTE_STRIDE], pte);
+
+ /* Job is done for this CPU and any CPU which shares this page */
+ WRITE_ONCE(espfix_pages[page], stack_page);
+
+unlock_done:
+ mutex_unlock(&espfix_init_mutex);
+done:
+ per_cpu(espfix_stack, cpu) = addr;
+ per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page
+ + (addr & ~PAGE_MASK);
+}
diff --git a/arch/x86/kernel/fpu/Makefile b/arch/x86/kernel/fpu/Makefile
new file mode 100644
index 000000000..68279efb8
--- /dev/null
+++ b/arch/x86/kernel/fpu/Makefile
@@ -0,0 +1,5 @@
+#
+# Build rules for the FPU support code:
+#
+
+obj-y += init.o bugs.o core.o regset.o signal.o xstate.o
diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c
new file mode 100644
index 000000000..2954fab15
--- /dev/null
+++ b/arch/x86/kernel/fpu/bugs.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * x86 FPU bug checks:
+ */
+#include <asm/fpu/internal.h>
+
+/*
+ * Boot time CPU/FPU FDIV bug detection code:
+ */
+
+static double __initdata x = 4195835.0;
+static double __initdata y = 3145727.0;
+
+/*
+ * This used to check for exceptions..
+ * However, it turns out that to support that,
+ * the XMM trap handlers basically had to
+ * be buggy. So let's have a correct XMM trap
+ * handler, and forget about printing out
+ * some status at boot.
+ *
+ * We should really only care about bugs here
+ * anyway. Not features.
+ */
+void __init fpu__init_check_bugs(void)
+{
+ s32 fdiv_bug;
+
+ /* kernel_fpu_begin/end() relies on patched alternative instructions. */
+ if (!boot_cpu_has(X86_FEATURE_FPU))
+ return;
+
+ kernel_fpu_begin();
+
+ /*
+ * trap_init() enabled FXSR and company _before_ testing for FP
+ * problems here.
+ *
+ * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug
+ */
+ __asm__("fninit\n\t"
+ "fldl %1\n\t"
+ "fdivl %2\n\t"
+ "fmull %2\n\t"
+ "fldl %1\n\t"
+ "fsubp %%st,%%st(1)\n\t"
+ "fistpl %0\n\t"
+ "fwait\n\t"
+ "fninit"
+ : "=m" (*&fdiv_bug)
+ : "m" (*&x), "m" (*&y));
+
+ kernel_fpu_end();
+
+ if (fdiv_bug) {
+ set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV);
+ pr_warn("Hmm, FPU with FDIV bug\n");
+ }
+}
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
new file mode 100644
index 000000000..2e5003fef
--- /dev/null
+++ b/arch/x86/kernel/fpu/core.c
@@ -0,0 +1,468 @@
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * General FPU state handling cleanups
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+#include <asm/fpu/internal.h>
+#include <asm/fpu/regset.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/types.h>
+#include <asm/traps.h>
+#include <asm/irq_regs.h>
+
+#include <linux/hardirq.h>
+#include <linux/pkeys.h>
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/fpu.h>
+
+/*
+ * Represents the initial FPU state. It's mostly (but not completely) zeroes,
+ * depending on the FPU hardware format:
+ */
+union fpregs_state init_fpstate __read_mostly;
+
+/*
+ * Track whether the kernel is using the FPU state
+ * currently.
+ *
+ * This flag is used:
+ *
+ * - by IRQ context code to potentially use the FPU
+ * if it's unused.
+ *
+ * - to debug kernel_fpu_begin()/end() correctness
+ */
+static DEFINE_PER_CPU(bool, in_kernel_fpu);
+
+/*
+ * Track which context is using the FPU on the CPU:
+ */
+DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
+
+static void kernel_fpu_disable(void)
+{
+ WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
+ this_cpu_write(in_kernel_fpu, true);
+}
+
+static void kernel_fpu_enable(void)
+{
+ WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
+ this_cpu_write(in_kernel_fpu, false);
+}
+
+static bool kernel_fpu_disabled(void)
+{
+ return this_cpu_read(in_kernel_fpu);
+}
+
+static bool interrupted_kernel_fpu_idle(void)
+{
+ return !kernel_fpu_disabled();
+}
+
+/*
+ * Were we in user mode (or vm86 mode) when we were
+ * interrupted?
+ *
+ * Doing kernel_fpu_begin/end() is ok if we are running
+ * in an interrupt context from user mode - we'll just
+ * save the FPU state as required.
+ */
+static bool interrupted_user_mode(void)
+{
+ struct pt_regs *regs = get_irq_regs();
+ return regs && user_mode(regs);
+}
+
+/*
+ * Can we use the FPU in kernel mode with the
+ * whole "kernel_fpu_begin/end()" sequence?
+ *
+ * It's always ok in process context (ie "not interrupt")
+ * but it is sometimes ok even from an irq.
+ */
+bool irq_fpu_usable(void)
+{
+ return !in_interrupt() ||
+ interrupted_user_mode() ||
+ interrupted_kernel_fpu_idle();
+}
+EXPORT_SYMBOL(irq_fpu_usable);
+
+static void __kernel_fpu_begin(void)
+{
+ struct fpu *fpu = &current->thread.fpu;
+
+ WARN_ON_FPU(!irq_fpu_usable());
+
+ kernel_fpu_disable();
+
+ if (fpu->initialized) {
+ /*
+ * Ignore return value -- we don't care if reg state
+ * is clobbered.
+ */
+ copy_fpregs_to_fpstate(fpu);
+ } else {
+ __cpu_invalidate_fpregs_state();
+ }
+}
+
+static void __kernel_fpu_end(void)
+{
+ struct fpu *fpu = &current->thread.fpu;
+
+ if (fpu->initialized)
+ copy_kernel_to_fpregs(&fpu->state);
+
+ kernel_fpu_enable();
+}
+
+void kernel_fpu_begin(void)
+{
+ preempt_disable();
+ __kernel_fpu_begin();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_begin);
+
+void kernel_fpu_end(void)
+{
+ __kernel_fpu_end();
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_end);
+
+/*
+ * Save the FPU state (mark it for reload if necessary):
+ *
+ * This only ever gets called for the current task.
+ */
+void fpu__save(struct fpu *fpu)
+{
+ WARN_ON_FPU(fpu != &current->thread.fpu);
+
+ preempt_disable();
+ trace_x86_fpu_before_save(fpu);
+ if (fpu->initialized) {
+ if (!copy_fpregs_to_fpstate(fpu)) {
+ copy_kernel_to_fpregs(&fpu->state);
+ }
+ }
+ trace_x86_fpu_after_save(fpu);
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(fpu__save);
+
+/*
+ * Legacy x87 fpstate state init:
+ */
+static inline void fpstate_init_fstate(struct fregs_state *fp)
+{
+ fp->cwd = 0xffff037fu;
+ fp->swd = 0xffff0000u;
+ fp->twd = 0xffffffffu;
+ fp->fos = 0xffff0000u;
+}
+
+void fpstate_init(union fpregs_state *state)
+{
+ if (!static_cpu_has(X86_FEATURE_FPU)) {
+ fpstate_init_soft(&state->soft);
+ return;
+ }
+
+ memset(state, 0, fpu_kernel_xstate_size);
+
+ if (static_cpu_has(X86_FEATURE_XSAVES))
+ fpstate_init_xstate(&state->xsave);
+ if (static_cpu_has(X86_FEATURE_FXSR))
+ fpstate_init_fxstate(&state->fxsave);
+ else
+ fpstate_init_fstate(&state->fsave);
+}
+EXPORT_SYMBOL_GPL(fpstate_init);
+
+int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
+{
+ dst_fpu->last_cpu = -1;
+
+ if (!src_fpu->initialized || !static_cpu_has(X86_FEATURE_FPU))
+ return 0;
+
+ WARN_ON_FPU(src_fpu != &current->thread.fpu);
+
+ /*
+ * Don't let 'init optimized' areas of the XSAVE area
+ * leak into the child task:
+ */
+ memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);
+
+ /*
+ * Save current FPU registers directly into the child
+ * FPU context, without any memory-to-memory copying.
+ *
+ * ( The function 'fails' in the FNSAVE case, which destroys
+ * register contents so we have to copy them back. )
+ */
+ if (!copy_fpregs_to_fpstate(dst_fpu)) {
+ memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size);
+ copy_kernel_to_fpregs(&src_fpu->state);
+ }
+
+ trace_x86_fpu_copy_src(src_fpu);
+ trace_x86_fpu_copy_dst(dst_fpu);
+
+ return 0;
+}
+
+/*
+ * Activate the current task's in-memory FPU context,
+ * if it has not been used before:
+ */
+void fpu__initialize(struct fpu *fpu)
+{
+ WARN_ON_FPU(fpu != &current->thread.fpu);
+
+ if (!fpu->initialized) {
+ fpstate_init(&fpu->state);
+ trace_x86_fpu_init_state(fpu);
+
+ trace_x86_fpu_activate_state(fpu);
+ /* Safe to do for the current task: */
+ fpu->initialized = 1;
+ }
+}
+EXPORT_SYMBOL_GPL(fpu__initialize);
+
+/*
+ * This function must be called before we read a task's fpstate.
+ *
+ * There's two cases where this gets called:
+ *
+ * - for the current task (when coredumping), in which case we have
+ * to save the latest FPU registers into the fpstate,
+ *
+ * - or it's called for stopped tasks (ptrace), in which case the
+ * registers were already saved by the context-switch code when
+ * the task scheduled out - we only have to initialize the registers
+ * if they've never been initialized.
+ *
+ * If the task has used the FPU before then save it.
+ */
+void fpu__prepare_read(struct fpu *fpu)
+{
+ if (fpu == &current->thread.fpu) {
+ fpu__save(fpu);
+ } else {
+ if (!fpu->initialized) {
+ fpstate_init(&fpu->state);
+ trace_x86_fpu_init_state(fpu);
+
+ trace_x86_fpu_activate_state(fpu);
+ /* Safe to do for current and for stopped child tasks: */
+ fpu->initialized = 1;
+ }
+ }
+}
+
+/*
+ * This function must be called before we write a task's fpstate.
+ *
+ * If the task has used the FPU before then invalidate any cached FPU registers.
+ * If the task has not used the FPU before then initialize its fpstate.
+ *
+ * After this function call, after registers in the fpstate are
+ * modified and the child task has woken up, the child task will
+ * restore the modified FPU state from the modified context. If we
+ * didn't clear its cached status here then the cached in-registers
+ * state pending on its former CPU could be restored, corrupting
+ * the modifications.
+ */
+void fpu__prepare_write(struct fpu *fpu)
+{
+ /*
+ * Only stopped child tasks can be used to modify the FPU
+ * state in the fpstate buffer:
+ */
+ WARN_ON_FPU(fpu == &current->thread.fpu);
+
+ if (fpu->initialized) {
+ /* Invalidate any cached state: */
+ __fpu_invalidate_fpregs_state(fpu);
+ } else {
+ fpstate_init(&fpu->state);
+ trace_x86_fpu_init_state(fpu);
+
+ trace_x86_fpu_activate_state(fpu);
+ /* Safe to do for stopped child tasks: */
+ fpu->initialized = 1;
+ }
+}
+
+/*
+ * 'fpu__restore()' is called to copy FPU registers from
+ * the FPU fpstate to the live hw registers and to activate
+ * access to the hardware registers, so that FPU instructions
+ * can be used afterwards.
+ *
+ * Must be called with kernel preemption disabled (for example
+ * with local interrupts disabled, as it is in the case of
+ * do_device_not_available()).
+ */
+void fpu__restore(struct fpu *fpu)
+{
+ fpu__initialize(fpu);
+
+ /* Avoid __kernel_fpu_begin() right after fpregs_activate() */
+ kernel_fpu_disable();
+ trace_x86_fpu_before_restore(fpu);
+ fpregs_activate(fpu);
+ copy_kernel_to_fpregs(&fpu->state);
+ trace_x86_fpu_after_restore(fpu);
+ kernel_fpu_enable();
+}
+EXPORT_SYMBOL_GPL(fpu__restore);
+
+/*
+ * Drops current FPU state: deactivates the fpregs and
+ * the fpstate. NOTE: it still leaves previous contents
+ * in the fpregs in the eager-FPU case.
+ *
+ * This function can be used in cases where we know that
+ * a state-restore is coming: either an explicit one,
+ * or a reschedule.
+ */
+void fpu__drop(struct fpu *fpu)
+{
+ preempt_disable();
+
+ if (fpu == &current->thread.fpu) {
+ if (fpu->initialized) {
+ /* Ignore delayed exceptions from user space */
+ asm volatile("1: fwait\n"
+ "2:\n"
+ _ASM_EXTABLE(1b, 2b));
+ fpregs_deactivate(fpu);
+ }
+ }
+
+ fpu->initialized = 0;
+
+ trace_x86_fpu_dropped(fpu);
+
+ preempt_enable();
+}
+
+/*
+ * Clear FPU registers by setting them up from
+ * the init fpstate:
+ */
+static inline void copy_init_fpstate_to_fpregs(void)
+{
+ if (use_xsave())
+ copy_kernel_to_xregs(&init_fpstate.xsave, -1);
+ else if (static_cpu_has(X86_FEATURE_FXSR))
+ copy_kernel_to_fxregs(&init_fpstate.fxsave);
+ else
+ copy_kernel_to_fregs(&init_fpstate.fsave);
+
+ if (boot_cpu_has(X86_FEATURE_OSPKE))
+ copy_init_pkru_to_fpregs();
+}
+
+/*
+ * Clear the FPU state back to init state.
+ *
+ * Called by sys_execve(), by the signal handler code and by various
+ * error paths.
+ */
+void fpu__clear(struct fpu *fpu)
+{
+ WARN_ON_FPU(fpu != &current->thread.fpu); /* Almost certainly an anomaly */
+
+ fpu__drop(fpu);
+
+ /*
+ * Make sure fpstate is cleared and initialized.
+ */
+ if (static_cpu_has(X86_FEATURE_FPU)) {
+ preempt_disable();
+ fpu__initialize(fpu);
+ user_fpu_begin();
+ copy_init_fpstate_to_fpregs();
+ preempt_enable();
+ }
+}
+
+/*
+ * x87 math exception handling:
+ */
+
+int fpu__exception_code(struct fpu *fpu, int trap_nr)
+{
+ int err;
+
+ if (trap_nr == X86_TRAP_MF) {
+ unsigned short cwd, swd;
+ /*
+ * (~cwd & swd) will mask out exceptions that are not set to unmasked
+ * status. 0x3f is the exception bits in these regs, 0x200 is the
+ * C1 reg you need in case of a stack fault, 0x040 is the stack
+ * fault bit. We should only be taking one exception at a time,
+ * so if this combination doesn't produce any single exception,
+ * then we have a bad program that isn't synchronizing its FPU usage
+ * and it will suffer the consequences since we won't be able to
+ * fully reproduce the context of the exception.
+ */
+ if (boot_cpu_has(X86_FEATURE_FXSR)) {
+ cwd = fpu->state.fxsave.cwd;
+ swd = fpu->state.fxsave.swd;
+ } else {
+ cwd = (unsigned short)fpu->state.fsave.cwd;
+ swd = (unsigned short)fpu->state.fsave.swd;
+ }
+
+ err = swd & ~cwd;
+ } else {
+ /*
+ * The SIMD FPU exceptions are handled a little differently, as there
+ * is only a single status/control register. Thus, to determine which
+ * unmasked exception was caught we must mask the exception mask bits
+ * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+ */
+ unsigned short mxcsr = MXCSR_DEFAULT;
+
+ if (boot_cpu_has(X86_FEATURE_XMM))
+ mxcsr = fpu->state.fxsave.mxcsr;
+
+ err = ~(mxcsr >> 7) & mxcsr;
+ }
+
+ if (err & 0x001) { /* Invalid op */
+ /*
+ * swd & 0x240 == 0x040: Stack Underflow
+ * swd & 0x240 == 0x240: Stack Overflow
+ * User must clear the SF bit (0x40) if set
+ */
+ return FPE_FLTINV;
+ } else if (err & 0x004) { /* Divide by Zero */
+ return FPE_FLTDIV;
+ } else if (err & 0x008) { /* Overflow */
+ return FPE_FLTOVF;
+ } else if (err & 0x012) { /* Denormal, Underflow */
+ return FPE_FLTUND;
+ } else if (err & 0x020) { /* Precision */
+ return FPE_FLTRES;
+ }
+
+ /*
+ * If we're using IRQ 13, or supposedly even some trap
+ * X86_TRAP_MF implementations, it's possible
+ * we get a spurious trap, which is not an error.
+ */
+ return 0;
+}
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
new file mode 100644
index 000000000..9692ccc58
--- /dev/null
+++ b/arch/x86/kernel/fpu/init.c
@@ -0,0 +1,317 @@
+/*
+ * x86 FPU boot time init code:
+ */
+#include <asm/fpu/internal.h>
+#include <asm/tlbflush.h>
+#include <asm/setup.h>
+#include <asm/cmdline.h>
+
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/init.h>
+
+/*
+ * Initialize the registers found in all CPUs, CR0 and CR4:
+ */
+static void fpu__init_cpu_generic(void)
+{
+ unsigned long cr0;
+ unsigned long cr4_mask = 0;
+
+ if (boot_cpu_has(X86_FEATURE_FXSR))
+ cr4_mask |= X86_CR4_OSFXSR;
+ if (boot_cpu_has(X86_FEATURE_XMM))
+ cr4_mask |= X86_CR4_OSXMMEXCPT;
+ if (cr4_mask)
+ cr4_set_bits(cr4_mask);
+
+ cr0 = read_cr0();
+ cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
+ if (!boot_cpu_has(X86_FEATURE_FPU))
+ cr0 |= X86_CR0_EM;
+ write_cr0(cr0);
+
+ /* Flush out any pending x87 state: */
+#ifdef CONFIG_MATH_EMULATION
+ if (!boot_cpu_has(X86_FEATURE_FPU))
+ fpstate_init_soft(&current->thread.fpu.state.soft);
+ else
+#endif
+ asm volatile ("fninit");
+}
+
+/*
+ * Enable all supported FPU features. Called when a CPU is brought online:
+ */
+void fpu__init_cpu(void)
+{
+ fpu__init_cpu_generic();
+ fpu__init_cpu_xstate();
+}
+
+static bool fpu__probe_without_cpuid(void)
+{
+ unsigned long cr0;
+ u16 fsw, fcw;
+
+ fsw = fcw = 0xffff;
+
+ cr0 = read_cr0();
+ cr0 &= ~(X86_CR0_TS | X86_CR0_EM);
+ write_cr0(cr0);
+
+ asm volatile("fninit ; fnstsw %0 ; fnstcw %1" : "+m" (fsw), "+m" (fcw));
+
+ pr_info("x86/fpu: Probing for FPU: FSW=0x%04hx FCW=0x%04hx\n", fsw, fcw);
+
+ return fsw == 0 && (fcw & 0x103f) == 0x003f;
+}
+
+static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
+{
+ if (!boot_cpu_has(X86_FEATURE_CPUID) &&
+ !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) {
+ if (fpu__probe_without_cpuid())
+ setup_force_cpu_cap(X86_FEATURE_FPU);
+ else
+ setup_clear_cpu_cap(X86_FEATURE_FPU);
+ }
+
+#ifndef CONFIG_MATH_EMULATION
+ if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_FPU)) {
+ pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n");
+ for (;;)
+ asm volatile("hlt");
+ }
+#endif
+}
+
+/*
+ * Boot time FPU feature detection code:
+ */
+unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
+EXPORT_SYMBOL_GPL(mxcsr_feature_mask);
+
+static void __init fpu__init_system_mxcsr(void)
+{
+ unsigned int mask = 0;
+
+ if (boot_cpu_has(X86_FEATURE_FXSR)) {
+ /* Static because GCC does not get 16-byte stack alignment right: */
+ static struct fxregs_state fxregs __initdata;
+
+ asm volatile("fxsave %0" : "+m" (fxregs));
+
+ mask = fxregs.mxcsr_mask;
+
+ /*
+ * If zero then use the default features mask,
+ * which has all features set, except the
+ * denormals-are-zero feature bit:
+ */
+ if (mask == 0)
+ mask = 0x0000ffbf;
+ }
+ mxcsr_feature_mask &= mask;
+}
+
+/*
+ * Once per bootup FPU initialization sequences that will run on most x86 CPUs:
+ */
+static void __init fpu__init_system_generic(void)
+{
+ /*
+ * Set up the legacy init FPU context. (xstate init might overwrite this
+ * with a more modern format, if the CPU supports it.)
+ */
+ fpstate_init(&init_fpstate);
+
+ fpu__init_system_mxcsr();
+}
+
+/*
+ * Size of the FPU context state. All tasks in the system use the
+ * same context size, regardless of what portion they use.
+ * This is inherent to the XSAVE architecture which puts all state
+ * components into a single, continuous memory block:
+ */
+unsigned int fpu_kernel_xstate_size;
+EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size);
+
+/* Get alignment of the TYPE. */
+#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
+
+/*
+ * Enforce that 'MEMBER' is the last field of 'TYPE'.
+ *
+ * Align the computed size with alignment of the TYPE,
+ * because that's how C aligns structs.
+ */
+#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
+ BUILD_BUG_ON(sizeof(TYPE) != ALIGN(offsetofend(TYPE, MEMBER), \
+ TYPE_ALIGN(TYPE)))
+
+/*
+ * We append the 'struct fpu' to the task_struct:
+ */
+static void __init fpu__init_task_struct_size(void)
+{
+ int task_size = sizeof(struct task_struct);
+
+ /*
+ * Subtract off the static size of the register state.
+ * It potentially has a bunch of padding.
+ */
+ task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state);
+
+ /*
+ * Add back the dynamically-calculated register state
+ * size.
+ */
+ task_size += fpu_kernel_xstate_size;
+
+ /*
+ * We dynamically size 'struct fpu', so we require that
+ * it be at the end of 'thread_struct' and that
+ * 'thread_struct' be at the end of 'task_struct'. If
+ * you hit a compile error here, check the structure to
+ * see if something got added to the end.
+ */
+ CHECK_MEMBER_AT_END_OF(struct fpu, state);
+ CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
+ CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
+
+ arch_task_struct_size = task_size;
+}
+
+/*
+ * Set up the user and kernel xstate sizes based on the legacy FPU context size.
+ *
+ * We set this up first, and later it will be overwritten by
+ * fpu__init_system_xstate() if the CPU knows about xstates.
+ */
+static void __init fpu__init_system_xstate_size_legacy(void)
+{
+ static int on_boot_cpu __initdata = 1;
+
+ WARN_ON_FPU(!on_boot_cpu);
+ on_boot_cpu = 0;
+
+ /*
+ * Note that xstate sizes might be overwritten later during
+ * fpu__init_system_xstate().
+ */
+
+ if (!boot_cpu_has(X86_FEATURE_FPU)) {
+ /*
+ * Disable xsave as we do not support it if i387
+ * emulation is enabled.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+ fpu_kernel_xstate_size = sizeof(struct swregs_state);
+ } else {
+ if (boot_cpu_has(X86_FEATURE_FXSR))
+ fpu_kernel_xstate_size =
+ sizeof(struct fxregs_state);
+ else
+ fpu_kernel_xstate_size =
+ sizeof(struct fregs_state);
+ }
+
+ fpu_user_xstate_size = fpu_kernel_xstate_size;
+}
+
+/*
+ * Find supported xfeatures based on cpu features and command-line input.
+ * This must be called after fpu__init_parse_early_param() is called and
+ * xfeatures_mask is enumerated.
+ */
+u64 __init fpu__get_supported_xfeatures_mask(void)
+{
+ return XCNTXT_MASK;
+}
+
+/* Legacy code to initialize eager fpu mode. */
+static void __init fpu__init_system_ctx_switch(void)
+{
+ static bool on_boot_cpu __initdata = 1;
+
+ WARN_ON_FPU(!on_boot_cpu);
+ on_boot_cpu = 0;
+
+ WARN_ON_FPU(current->thread.fpu.initialized);
+}
+
+/*
+ * We parse fpu parameters early because fpu__init_system() is executed
+ * before parse_early_param().
+ */
+static void __init fpu__init_parse_early_param(void)
+{
+ char arg[128];
+ char *argptr = arg;
+ int arglen, res, bit;
+
+ if (cmdline_find_option_bool(boot_command_line, "no387"))
+ setup_clear_cpu_cap(X86_FEATURE_FPU);
+
+ if (cmdline_find_option_bool(boot_command_line, "nofxsr")) {
+ setup_clear_cpu_cap(X86_FEATURE_FXSR);
+ setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
+ setup_clear_cpu_cap(X86_FEATURE_XMM);
+ }
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsave"))
+ fpu__xstate_clear_all_cpu_caps();
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+
+ arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
+ if (arglen <= 0)
+ return;
+
+ pr_info("Clearing CPUID bits:");
+ do {
+ res = get_option(&argptr, &bit);
+ if (res == 0 || res == 3)
+ break;
+
+ /* If the argument was too long, the last bit may be cut off */
+ if (res == 1 && arglen >= sizeof(arg))
+ break;
+
+ if (bit >= 0 && bit < NCAPINTS * 32) {
+ pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit));
+ setup_clear_cpu_cap(bit);
+ }
+ } while (res == 2);
+ pr_cont("\n");
+}
+
+/*
+ * Called on the boot CPU once per system bootup, to set up the initial
+ * FPU state that is later cloned into all processes:
+ */
+void __init fpu__init_system(struct cpuinfo_x86 *c)
+{
+ fpu__init_parse_early_param();
+ fpu__init_system_early_generic(c);
+
+ /*
+ * The FPU has to be operational for some of the
+ * later FPU init activities:
+ */
+ fpu__init_cpu();
+
+ fpu__init_system_generic();
+ fpu__init_system_xstate_size_legacy();
+ fpu__init_system_xstate();
+ fpu__init_task_struct_size();
+
+ fpu__init_system_ctx_switch();
+}
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
new file mode 100644
index 000000000..621d249de
--- /dev/null
+++ b/arch/x86/kernel/fpu/regset.c
@@ -0,0 +1,387 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FPU register's regset abstraction, for ptrace, core dumps, etc.
+ */
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+#include <asm/fpu/xstate.h>
+#include <linux/sched/task_stack.h>
+
+/*
+ * The xstateregs_active() routine is the same as the regset_fpregs_active() routine,
+ * as the "regset->n" for the xstate regset will be updated based on the feature
+ * capabilities supported by the xsave.
+ */
+int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
+{
+ struct fpu *target_fpu = &target->thread.fpu;
+
+ return target_fpu->initialized ? regset->n : 0;
+}
+
+int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
+{
+ struct fpu *target_fpu = &target->thread.fpu;
+
+ if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->initialized)
+ return regset->n;
+ else
+ return 0;
+}
+
+int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ void *kbuf, void __user *ubuf)
+{
+ struct fpu *fpu = &target->thread.fpu;
+
+ if (!boot_cpu_has(X86_FEATURE_FXSR))
+ return -ENODEV;
+
+ fpu__prepare_read(fpu);
+ fpstate_sanitize_xstate(fpu);
+
+ return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+ &fpu->state.fxsave, 0, -1);
+}
+
+int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ struct fpu *fpu = &target->thread.fpu;
+ int ret;
+
+ if (!boot_cpu_has(X86_FEATURE_FXSR))
+ return -ENODEV;
+
+ fpu__prepare_write(fpu);
+ fpstate_sanitize_xstate(fpu);
+
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+ &fpu->state.fxsave, 0, -1);
+
+ /*
+ * mxcsr reserved bits must be masked to zero for security reasons.
+ */
+ fpu->state.fxsave.mxcsr &= mxcsr_feature_mask;
+
+ /*
+ * update the header bits in the xsave header, indicating the
+ * presence of FP and SSE state.
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVE))
+ fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
+
+ return ret;
+}
+
+int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ void *kbuf, void __user *ubuf)
+{
+ struct fpu *fpu = &target->thread.fpu;
+ struct xregs_state *xsave;
+ int ret;
+
+ if (!boot_cpu_has(X86_FEATURE_XSAVE))
+ return -ENODEV;
+
+ xsave = &fpu->state.xsave;
+
+ fpu__prepare_read(fpu);
+
+ if (using_compacted_format()) {
+ if (kbuf)
+ ret = copy_xstate_to_kernel(kbuf, xsave, pos, count);
+ else
+ ret = copy_xstate_to_user(ubuf, xsave, pos, count);
+ } else {
+ fpstate_sanitize_xstate(fpu);
+ /*
+ * Copy the 48 bytes defined by the software into the xsave
+ * area in the thread struct, so that we can copy the whole
+ * area to user using one user_regset_copyout().
+ */
+ memcpy(&xsave->i387.sw_reserved, xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
+
+ /*
+ * Copy the xstate memory layout.
+ */
+ ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
+ }
+ return ret;
+}
+
+int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ struct fpu *fpu = &target->thread.fpu;
+ struct xregs_state *xsave;
+ int ret;
+
+ if (!boot_cpu_has(X86_FEATURE_XSAVE))
+ return -ENODEV;
+
+ /*
+ * A whole standard-format XSAVE buffer is needed:
+ */
+ if (pos != 0 || count != fpu_user_xstate_size)
+ return -EFAULT;
+
+ xsave = &fpu->state.xsave;
+
+ fpu__prepare_write(fpu);
+
+ if (using_compacted_format()) {
+ if (kbuf)
+ ret = copy_kernel_to_xstate(xsave, kbuf);
+ else
+ ret = copy_user_to_xstate(xsave, ubuf);
+ } else {
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
+ if (!ret)
+ ret = validate_xstate_header(&xsave->header);
+ }
+
+ /*
+ * mxcsr reserved bits must be masked to zero for security reasons.
+ */
+ xsave->i387.mxcsr &= mxcsr_feature_mask;
+
+ /*
+ * In case of failure, mark all states as init:
+ */
+ if (ret)
+ fpstate_init(&fpu->state);
+
+ return ret;
+}
+
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+
+/*
+ * FPU tag word conversions.
+ */
+
+static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
+{
+ unsigned int tmp; /* to avoid 16 bit prefixes in the code */
+
+ /* Transform each pair of bits into 01 (valid) or 00 (empty) */
+ tmp = ~twd;
+ tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
+ /* and move the valid bits to the lower byte. */
+ tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
+ tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
+ tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
+
+ return tmp;
+}
+
+#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16)
+#define FP_EXP_TAG_VALID 0
+#define FP_EXP_TAG_ZERO 1
+#define FP_EXP_TAG_SPECIAL 2
+#define FP_EXP_TAG_EMPTY 3
+
+static inline u32 twd_fxsr_to_i387(struct fxregs_state *fxsave)
+{
+ struct _fpxreg *st;
+ u32 tos = (fxsave->swd >> 11) & 7;
+ u32 twd = (unsigned long) fxsave->twd;
+ u32 tag;
+ u32 ret = 0xffff0000u;
+ int i;
+
+ for (i = 0; i < 8; i++, twd >>= 1) {
+ if (twd & 0x1) {
+ st = FPREG_ADDR(fxsave, (i - tos) & 7);
+
+ switch (st->exponent & 0x7fff) {
+ case 0x7fff:
+ tag = FP_EXP_TAG_SPECIAL;
+ break;
+ case 0x0000:
+ if (!st->significand[0] &&
+ !st->significand[1] &&
+ !st->significand[2] &&
+ !st->significand[3])
+ tag = FP_EXP_TAG_ZERO;
+ else
+ tag = FP_EXP_TAG_SPECIAL;
+ break;
+ default:
+ if (st->significand[3] & 0x8000)
+ tag = FP_EXP_TAG_VALID;
+ else
+ tag = FP_EXP_TAG_SPECIAL;
+ break;
+ }
+ } else {
+ tag = FP_EXP_TAG_EMPTY;
+ }
+ ret |= tag << (2 * i);
+ }
+ return ret;
+}
+
+/*
+ * FXSR floating point environment conversions.
+ */
+
+void
+convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
+{
+ struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave;
+ struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
+ struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
+ int i;
+
+ env->cwd = fxsave->cwd | 0xffff0000u;
+ env->swd = fxsave->swd | 0xffff0000u;
+ env->twd = twd_fxsr_to_i387(fxsave);
+
+#ifdef CONFIG_X86_64
+ env->fip = fxsave->rip;
+ env->foo = fxsave->rdp;
+ /*
+ * should be actually ds/cs at fpu exception time, but
+ * that information is not available in 64bit mode.
+ */
+ env->fcs = task_pt_regs(tsk)->cs;
+ if (tsk == current) {
+ savesegment(ds, env->fos);
+ } else {
+ env->fos = tsk->thread.ds;
+ }
+ env->fos |= 0xffff0000;
+#else
+ env->fip = fxsave->fip;
+ env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
+ env->foo = fxsave->foo;
+ env->fos = fxsave->fos;
+#endif
+
+ for (i = 0; i < 8; ++i)
+ memcpy(&to[i], &from[i], sizeof(to[0]));
+}
+
+void convert_to_fxsr(struct task_struct *tsk,
+ const struct user_i387_ia32_struct *env)
+
+{
+ struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave;
+ struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
+ struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
+ int i;
+
+ fxsave->cwd = env->cwd;
+ fxsave->swd = env->swd;
+ fxsave->twd = twd_i387_to_fxsr(env->twd);
+ fxsave->fop = (u16) ((u32) env->fcs >> 16);
+#ifdef CONFIG_X86_64
+ fxsave->rip = env->fip;
+ fxsave->rdp = env->foo;
+ /* cs and ds ignored */
+#else
+ fxsave->fip = env->fip;
+ fxsave->fcs = (env->fcs & 0xffff);
+ fxsave->foo = env->foo;
+ fxsave->fos = env->fos;
+#endif
+
+ for (i = 0; i < 8; ++i)
+ memcpy(&to[i], &from[i], sizeof(from[0]));
+}
+
+int fpregs_get(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ void *kbuf, void __user *ubuf)
+{
+ struct fpu *fpu = &target->thread.fpu;
+ struct user_i387_ia32_struct env;
+
+ fpu__prepare_read(fpu);
+
+ if (!boot_cpu_has(X86_FEATURE_FPU))
+ return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
+
+ if (!boot_cpu_has(X86_FEATURE_FXSR))
+ return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+ &fpu->state.fsave, 0,
+ -1);
+
+ fpstate_sanitize_xstate(fpu);
+
+ if (kbuf && pos == 0 && count == sizeof(env)) {
+ convert_from_fxsr(kbuf, target);
+ return 0;
+ }
+
+ convert_from_fxsr(&env, target);
+
+ return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
+}
+
+int fpregs_set(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ struct fpu *fpu = &target->thread.fpu;
+ struct user_i387_ia32_struct env;
+ int ret;
+
+ fpu__prepare_write(fpu);
+ fpstate_sanitize_xstate(fpu);
+
+ if (!boot_cpu_has(X86_FEATURE_FPU))
+ return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
+
+ if (!boot_cpu_has(X86_FEATURE_FXSR))
+ return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+ &fpu->state.fsave, 0,
+ -1);
+
+ if (pos > 0 || count < sizeof(env))
+ convert_from_fxsr(&env, target);
+
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
+ if (!ret)
+ convert_to_fxsr(target, &env);
+
+ /*
+ * update the header bit in the xsave header, indicating the
+ * presence of FP.
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVE))
+ fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP;
+ return ret;
+}
+
+/*
+ * FPU state for core dumps.
+ * This is only used for a.out dumps now.
+ * It is declared generically using elf_fpregset_t (which is
+ * struct user_i387_struct) but is in fact only used for 32-bit
+ * dumps, so on 64-bit it is really struct user_i387_ia32_struct.
+ */
+int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu)
+{
+ struct task_struct *tsk = current;
+ struct fpu *fpu = &tsk->thread.fpu;
+ int fpvalid;
+
+ fpvalid = fpu->initialized;
+ if (fpvalid)
+ fpvalid = !fpregs_get(tsk, NULL,
+ 0, sizeof(struct user_i387_ia32_struct),
+ ufpu, NULL);
+
+ return fpvalid;
+}
+EXPORT_SYMBOL(dump_fpu);
+
+#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
new file mode 100644
index 000000000..86a231338
--- /dev/null
+++ b/arch/x86/kernel/fpu/signal.c
@@ -0,0 +1,439 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FPU signal frame handling routines.
+ */
+
+#include <linux/compat.h>
+#include <linux/cpu.h>
+
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+#include <asm/fpu/xstate.h>
+
+#include <asm/sigframe.h>
+#include <asm/trace/fpu.h>
+
+static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;
+
+/*
+ * Check for the presence of extended state information in the
+ * user fpstate pointer in the sigcontext.
+ */
+static inline int check_for_xstate(struct fxregs_state __user *buf,
+ void __user *fpstate,
+ struct _fpx_sw_bytes *fx_sw)
+{
+ int min_xstate_size = sizeof(struct fxregs_state) +
+ sizeof(struct xstate_header);
+ unsigned int magic2;
+
+ if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw)))
+ return -1;
+
+ /* Check for the first magic field and other error scenarios. */
+ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 ||
+ fx_sw->xstate_size < min_xstate_size ||
+ fx_sw->xstate_size > fpu_user_xstate_size ||
+ fx_sw->xstate_size > fx_sw->extended_size)
+ return -1;
+
+ /*
+ * Check for the presence of second magic word at the end of memory
+ * layout. This detects the case where the user just copied the legacy
+ * fpstate layout with out copying the extended state information
+ * in the memory layout.
+ */
+ if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))
+ || magic2 != FP_XSTATE_MAGIC2)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Signal frame handlers.
+ */
+static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
+{
+ if (use_fxsr()) {
+ struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
+ struct user_i387_ia32_struct env;
+ struct _fpstate_32 __user *fp = buf;
+
+ convert_from_fxsr(&env, tsk);
+
+ if (__copy_to_user(buf, &env, sizeof(env)) ||
+ __put_user(xsave->i387.swd, &fp->status) ||
+ __put_user(X86_FXSR_MAGIC, &fp->magic))
+ return -1;
+ } else {
+ struct fregs_state __user *fp = buf;
+ u32 swd;
+ if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status))
+ return -1;
+ }
+
+ return 0;
+}
+
+static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
+{
+ struct xregs_state __user *x = buf;
+ struct _fpx_sw_bytes *sw_bytes;
+ u32 xfeatures;
+ int err;
+
+ /* Setup the bytes not touched by the [f]xsave and reserved for SW. */
+ sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved;
+ err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes));
+
+ if (!use_xsave())
+ return err;
+
+ err |= __put_user(FP_XSTATE_MAGIC2,
+ (__u32 *)(buf + fpu_user_xstate_size));
+
+ /*
+ * Read the xfeatures which we copied (directly from the cpu or
+ * from the state in task struct) to the user buffers.
+ */
+ err |= __get_user(xfeatures, (__u32 *)&x->header.xfeatures);
+
+ /*
+ * For legacy compatible, we always set FP/SSE bits in the bit
+ * vector while saving the state to the user context. This will
+ * enable us capturing any changes(during sigreturn) to
+ * the FP/SSE bits by the legacy applications which don't touch
+ * xfeatures in the xsave header.
+ *
+ * xsave aware apps can change the xfeatures in the xsave
+ * header as well as change any contents in the memory layout.
+ * xrestore as part of sigreturn will capture all the changes.
+ */
+ xfeatures |= XFEATURE_MASK_FPSSE;
+
+ err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures);
+
+ return err;
+}
+
+static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
+{
+ int err;
+
+ if (use_xsave())
+ err = copy_xregs_to_user(buf);
+ else if (use_fxsr())
+ err = copy_fxregs_to_user((struct fxregs_state __user *) buf);
+ else
+ err = copy_fregs_to_user((struct fregs_state __user *) buf);
+
+ if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size))
+ err = -EFAULT;
+ return err;
+}
+
+/*
+ * Save the fpu, extended register state to the user signal frame.
+ *
+ * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save
+ * state is copied.
+ * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'.
+ *
+ * buf == buf_fx for 64-bit frames and 32-bit fsave frame.
+ * buf != buf_fx for 32-bit frames with fxstate.
+ *
+ * If the fpu, extended register state is live, save the state directly
+ * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise,
+ * copy the thread's fpu state to the user frame starting at 'buf_fx'.
+ *
+ * If this is a 32-bit frame with fxstate, put a fsave header before
+ * the aligned state at 'buf_fx'.
+ *
+ * For [f]xsave state, update the SW reserved fields in the [f]xsave frame
+ * indicating the absence/presence of the extended state to the user.
+ */
+int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
+{
+ struct fpu *fpu = &current->thread.fpu;
+ struct xregs_state *xsave = &fpu->state.xsave;
+ struct task_struct *tsk = current;
+ int ia32_fxstate = (buf != buf_fx);
+
+ ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
+ IS_ENABLED(CONFIG_IA32_EMULATION));
+
+ if (!access_ok(VERIFY_WRITE, buf, size))
+ return -EACCES;
+
+ if (!static_cpu_has(X86_FEATURE_FPU))
+ return fpregs_soft_get(current, NULL, 0,
+ sizeof(struct user_i387_ia32_struct), NULL,
+ (struct _fpstate_32 __user *) buf) ? -1 : 1;
+
+ if (fpu->initialized || using_compacted_format()) {
+ /* Save the live register state to the user directly. */
+ if (copy_fpregs_to_sigframe(buf_fx))
+ return -1;
+ /* Update the thread's fxstate to save the fsave header. */
+ if (ia32_fxstate)
+ copy_fxregs_to_kernel(fpu);
+ } else {
+ /*
+ * It is a *bug* if kernel uses compacted-format for xsave
+ * area and we copy it out directly to a signal frame. It
+ * should have been handled above by saving the registers
+ * directly.
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVES)) {
+ WARN_ONCE(1, "x86/fpu: saving compacted-format xsave area to a signal frame!\n");
+ return -1;
+ }
+
+ fpstate_sanitize_xstate(fpu);
+ if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size))
+ return -1;
+ }
+
+ /* Save the fsave header for the 32-bit frames. */
+ if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))
+ return -1;
+
+ if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate))
+ return -1;
+
+ return 0;
+}
+
+static inline void
+sanitize_restored_xstate(struct task_struct *tsk,
+ struct user_i387_ia32_struct *ia32_env,
+ u64 xfeatures, int fx_only)
+{
+ struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
+ struct xstate_header *header = &xsave->header;
+
+ if (use_xsave()) {
+ /*
+ * Note: we don't need to zero the reserved bits in the
+ * xstate_header here because we either didn't copy them at all,
+ * or we checked earlier that they aren't set.
+ */
+
+ /*
+ * Init the state that is not present in the memory
+ * layout and not enabled by the OS.
+ */
+ if (fx_only)
+ header->xfeatures = XFEATURE_MASK_FPSSE;
+ else
+ header->xfeatures &= xfeatures;
+ }
+
+ if (use_fxsr()) {
+ /*
+ * mscsr reserved bits must be masked to zero for security
+ * reasons.
+ */
+ xsave->i387.mxcsr &= mxcsr_feature_mask;
+
+ convert_to_fxsr(tsk, ia32_env);
+ }
+}
+
+/*
+ * Restore the extended state if present. Otherwise, restore the FP/SSE state.
+ */
+static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only)
+{
+ if (use_xsave()) {
+ if ((unsigned long)buf % 64 || fx_only) {
+ u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE;
+ copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
+ return copy_user_to_fxregs(buf);
+ } else {
+ u64 init_bv = xfeatures_mask & ~xbv;
+ if (unlikely(init_bv))
+ copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
+ return copy_user_to_xregs(buf, xbv);
+ }
+ } else if (use_fxsr()) {
+ return copy_user_to_fxregs(buf);
+ } else
+ return copy_user_to_fregs(buf);
+}
+
+static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
+{
+ int ia32_fxstate = (buf != buf_fx);
+ struct task_struct *tsk = current;
+ struct fpu *fpu = &tsk->thread.fpu;
+ int state_size = fpu_kernel_xstate_size;
+ u64 xfeatures = 0;
+ int fx_only = 0;
+ int ret = 0;
+
+ ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
+ IS_ENABLED(CONFIG_IA32_EMULATION));
+
+ if (!buf) {
+ fpu__clear(fpu);
+ return 0;
+ }
+
+ if (!access_ok(VERIFY_READ, buf, size)) {
+ ret = -EACCES;
+ goto out_err;
+ }
+
+ fpu__initialize(fpu);
+
+ if (!static_cpu_has(X86_FEATURE_FPU)) {
+ ret = fpregs_soft_set(current, NULL,
+ 0, sizeof(struct user_i387_ia32_struct),
+ NULL, buf) != 0;
+ if (ret)
+ goto out_err;
+ return 0;
+ }
+
+ if (use_xsave()) {
+ struct _fpx_sw_bytes fx_sw_user;
+ if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) {
+ /*
+ * Couldn't find the extended state information in the
+ * memory layout. Restore just the FP/SSE and init all
+ * the other extended state.
+ */
+ state_size = sizeof(struct fxregs_state);
+ fx_only = 1;
+ trace_x86_fpu_xstate_check_failed(fpu);
+ } else {
+ state_size = fx_sw_user.xstate_size;
+ xfeatures = fx_sw_user.xfeatures;
+ }
+ }
+
+ if (ia32_fxstate) {
+ /*
+ * For 32-bit frames with fxstate, copy the user state to the
+ * thread's fpu state, reconstruct fxstate from the fsave
+ * header. Validate and sanitize the copied state.
+ */
+ struct user_i387_ia32_struct env;
+ int err = 0;
+
+ /*
+ * Drop the current fpu which clears fpu->initialized. This ensures
+ * that any context-switch during the copy of the new state,
+ * avoids the intermediate state from getting restored/saved.
+ * Thus avoiding the new restored state from getting corrupted.
+ * We will be ready to restore/save the state only after
+ * fpu->initialized is again set.
+ */
+ fpu__drop(fpu);
+
+ if (using_compacted_format()) {
+ err = copy_user_to_xstate(&fpu->state.xsave, buf_fx);
+ } else {
+ err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size);
+
+ if (!err && state_size > offsetof(struct xregs_state, header))
+ err = validate_xstate_header(&fpu->state.xsave.header);
+ }
+
+ if (err || __copy_from_user(&env, buf, sizeof(env))) {
+ fpstate_init(&fpu->state);
+ trace_x86_fpu_init_state(fpu);
+ err = -1;
+ } else {
+ sanitize_restored_xstate(tsk, &env, xfeatures, fx_only);
+ }
+
+ local_bh_disable();
+ fpu->initialized = 1;
+ fpu__restore(fpu);
+ local_bh_enable();
+
+ /* Failure is already handled */
+ return err;
+ } else {
+ /*
+ * For 64-bit frames and 32-bit fsave frames, restore the user
+ * state to the registers directly (with exceptions handled).
+ */
+ user_fpu_begin();
+ if (!copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only))
+ return 0;
+ ret = -1;
+ }
+
+out_err:
+ fpu__clear(fpu);
+ return ret;
+}
+
+static inline int xstate_sigframe_size(void)
+{
+ return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE :
+ fpu_user_xstate_size;
+}
+
+/*
+ * Restore FPU state from a sigframe:
+ */
+int fpu__restore_sig(void __user *buf, int ia32_frame)
+{
+ void __user *buf_fx = buf;
+ int size = xstate_sigframe_size();
+
+ if (ia32_frame && use_fxsr()) {
+ buf_fx = buf + sizeof(struct fregs_state);
+ size += sizeof(struct fregs_state);
+ }
+
+ return __fpu__restore_sig(buf, buf_fx, size);
+}
+
+unsigned long
+fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
+ unsigned long *buf_fx, unsigned long *size)
+{
+ unsigned long frame_size = xstate_sigframe_size();
+
+ *buf_fx = sp = round_down(sp - frame_size, 64);
+ if (ia32_frame && use_fxsr()) {
+ frame_size += sizeof(struct fregs_state);
+ sp -= sizeof(struct fregs_state);
+ }
+
+ *size = frame_size;
+
+ return sp;
+}
+/*
+ * Prepare the SW reserved portion of the fxsave memory layout, indicating
+ * the presence of the extended state information in the memory layout
+ * pointed by the fpstate pointer in the sigcontext.
+ * This will be saved when ever the FP and extended state context is
+ * saved on the user stack during the signal handler delivery to the user.
+ */
+void fpu__init_prepare_fx_sw_frame(void)
+{
+ int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE;
+
+ fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
+ fx_sw_reserved.extended_size = size;
+ fx_sw_reserved.xfeatures = xfeatures_mask;
+ fx_sw_reserved.xstate_size = fpu_user_xstate_size;
+
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) ||
+ IS_ENABLED(CONFIG_X86_32)) {
+ int fsave_header_size = sizeof(struct fregs_state);
+
+ fx_sw_reserved_ia32 = fx_sw_reserved;
+ fx_sw_reserved_ia32.extended_size = size + fsave_header_size;
+ }
+}
+
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
new file mode 100644
index 000000000..7d372db8b
--- /dev/null
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -0,0 +1,1294 @@
+/*
+ * xsave/xrstor support.
+ *
+ * Author: Suresh Siddha <suresh.b.siddha@intel.com>
+ */
+#include <linux/compat.h>
+#include <linux/cpu.h>
+#include <linux/mman.h>
+#include <linux/pkeys.h>
+
+#include <asm/fpu/api.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+#include <asm/fpu/xstate.h>
+
+#include <asm/tlbflush.h>
+#include <asm/cpufeature.h>
+
+/*
+ * Although we spell it out in here, the Processor Trace
+ * xfeature is completely unused. We use other mechanisms
+ * to save/restore PT state in Linux.
+ */
+static const char *xfeature_names[] =
+{
+ "x87 floating point registers" ,
+ "SSE registers" ,
+ "AVX registers" ,
+ "MPX bounds registers" ,
+ "MPX CSR" ,
+ "AVX-512 opmask" ,
+ "AVX-512 Hi256" ,
+ "AVX-512 ZMM_Hi256" ,
+ "Processor Trace (unused)" ,
+ "Protection Keys User registers",
+ "unknown xstate feature" ,
+};
+
+static short xsave_cpuid_features[] __initdata = {
+ X86_FEATURE_FPU,
+ X86_FEATURE_XMM,
+ X86_FEATURE_AVX,
+ X86_FEATURE_MPX,
+ X86_FEATURE_MPX,
+ X86_FEATURE_AVX512F,
+ X86_FEATURE_AVX512F,
+ X86_FEATURE_AVX512F,
+ X86_FEATURE_INTEL_PT,
+ X86_FEATURE_PKU,
+};
+
+/*
+ * Mask of xstate features supported by the CPU and the kernel:
+ */
+u64 xfeatures_mask __read_mostly;
+
+static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
+static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
+static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
+
+/*
+ * The XSAVE area of kernel can be in standard or compacted format;
+ * it is always in standard format for user mode. This is the user
+ * mode standard format size used for signal and ptrace frames.
+ */
+unsigned int fpu_user_xstate_size;
+
+/*
+ * Clear all of the X86_FEATURE_* bits that are unavailable
+ * when the CPU has no XSAVE support.
+ */
+void fpu__xstate_clear_all_cpu_caps(void)
+{
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+}
+
+/*
+ * Return whether the system supports a given xfeature.
+ *
+ * Also return the name of the (most advanced) feature that the caller requested:
+ */
+int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
+{
+ u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask;
+
+ if (unlikely(feature_name)) {
+ long xfeature_idx, max_idx;
+ u64 xfeatures_print;
+ /*
+ * So we use FLS here to be able to print the most advanced
+ * feature that was requested but is missing. So if a driver
+ * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
+ * missing AVX feature - this is the most informative message
+ * to users:
+ */
+ if (xfeatures_missing)
+ xfeatures_print = xfeatures_missing;
+ else
+ xfeatures_print = xfeatures_needed;
+
+ xfeature_idx = fls64(xfeatures_print)-1;
+ max_idx = ARRAY_SIZE(xfeature_names)-1;
+ xfeature_idx = min(xfeature_idx, max_idx);
+
+ *feature_name = xfeature_names[xfeature_idx];
+ }
+
+ if (xfeatures_missing)
+ return 0;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
+
+static int xfeature_is_supervisor(int xfeature_nr)
+{
+ /*
+ * We currently do not support supervisor states, but if
+ * we did, we could find out like this.
+ *
+ * SDM says: If state component 'i' is a user state component,
+ * ECX[0] return 0; if state component i is a supervisor
+ * state component, ECX[0] returns 1.
+ */
+ u32 eax, ebx, ecx, edx;
+
+ cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
+ return !!(ecx & 1);
+}
+
+static int xfeature_is_user(int xfeature_nr)
+{
+ return !xfeature_is_supervisor(xfeature_nr);
+}
+
+/*
+ * When executing XSAVEOPT (or other optimized XSAVE instructions), if
+ * a processor implementation detects that an FPU state component is still
+ * (or is again) in its initialized state, it may clear the corresponding
+ * bit in the header.xfeatures field, and can skip the writeout of registers
+ * to the corresponding memory layout.
+ *
+ * This means that when the bit is zero, the state component might still contain
+ * some previous - non-initialized register state.
+ *
+ * Before writing xstate information to user-space we sanitize those components,
+ * to always ensure that the memory layout of a feature will be in the init state
+ * if the corresponding header bit is zero. This is to ensure that user-space doesn't
+ * see some stale state in the memory layout during signal handling, debugging etc.
+ */
+void fpstate_sanitize_xstate(struct fpu *fpu)
+{
+ struct fxregs_state *fx = &fpu->state.fxsave;
+ int feature_bit;
+ u64 xfeatures;
+
+ if (!use_xsaveopt())
+ return;
+
+ xfeatures = fpu->state.xsave.header.xfeatures;
+
+ /*
+ * None of the feature bits are in init state. So nothing else
+ * to do for us, as the memory layout is up to date.
+ */
+ if ((xfeatures & xfeatures_mask) == xfeatures_mask)
+ return;
+
+ /*
+ * FP is in init state
+ */
+ if (!(xfeatures & XFEATURE_MASK_FP)) {
+ fx->cwd = 0x37f;
+ fx->swd = 0;
+ fx->twd = 0;
+ fx->fop = 0;
+ fx->rip = 0;
+ fx->rdp = 0;
+ memset(&fx->st_space[0], 0, 128);
+ }
+
+ /*
+ * SSE is in init state
+ */
+ if (!(xfeatures & XFEATURE_MASK_SSE))
+ memset(&fx->xmm_space[0], 0, 256);
+
+ /*
+ * First two features are FPU and SSE, which above we handled
+ * in a special way already:
+ */
+ feature_bit = 0x2;
+ xfeatures = (xfeatures_mask & ~xfeatures) >> 2;
+
+ /*
+ * Update all the remaining memory layouts according to their
+ * standard xstate layout, if their header bit is in the init
+ * state:
+ */
+ while (xfeatures) {
+ if (xfeatures & 0x1) {
+ int offset = xstate_comp_offsets[feature_bit];
+ int size = xstate_sizes[feature_bit];
+
+ memcpy((void *)fx + offset,
+ (void *)&init_fpstate.xsave + offset,
+ size);
+ }
+
+ xfeatures >>= 1;
+ feature_bit++;
+ }
+}
+
+/*
+ * Enable the extended processor state save/restore feature.
+ * Called once per CPU onlining.
+ */
+void fpu__init_cpu_xstate(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask)
+ return;
+ /*
+ * Make it clear that XSAVES supervisor states are not yet
+ * implemented should anyone expect it to work by changing
+ * bits in XFEATURE_MASK_* macros and XCR0.
+ */
+ WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR),
+ "x86/fpu: XSAVES supervisor states are not yet implemented.\n");
+
+ xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR;
+
+ cr4_set_bits(X86_CR4_OSXSAVE);
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
+}
+
+/*
+ * Note that in the future we will likely need a pair of
+ * functions here: one for user xstates and the other for
+ * system xstates. For now, they are the same.
+ */
+static int xfeature_enabled(enum xfeature xfeature)
+{
+ return !!(xfeatures_mask & (1UL << xfeature));
+}
+
+/*
+ * Record the offsets and sizes of various xstates contained
+ * in the XSAVE state memory layout.
+ */
+static void __init setup_xstate_features(void)
+{
+ u32 eax, ebx, ecx, edx, i;
+ /* start at the beginnning of the "extended state" */
+ unsigned int last_good_offset = offsetof(struct xregs_state,
+ extended_state_area);
+ /*
+ * The FP xstates and SSE xstates are legacy states. They are always
+ * in the fixed offsets in the xsave area in either compacted form
+ * or standard form.
+ */
+ xstate_offsets[0] = 0;
+ xstate_sizes[0] = offsetof(struct fxregs_state, xmm_space);
+ xstate_offsets[1] = xstate_sizes[0];
+ xstate_sizes[1] = FIELD_SIZEOF(struct fxregs_state, xmm_space);
+
+ for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+ if (!xfeature_enabled(i))
+ continue;
+
+ cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+
+ /*
+ * If an xfeature is supervisor state, the offset
+ * in EBX is invalid. We leave it to -1.
+ */
+ if (xfeature_is_user(i))
+ xstate_offsets[i] = ebx;
+
+ xstate_sizes[i] = eax;
+ /*
+ * In our xstate size checks, we assume that the
+ * highest-numbered xstate feature has the
+ * highest offset in the buffer. Ensure it does.
+ */
+ WARN_ONCE(last_good_offset > xstate_offsets[i],
+ "x86/fpu: misordered xstate at %d\n", last_good_offset);
+ last_good_offset = xstate_offsets[i];
+ }
+}
+
+static void __init print_xstate_feature(u64 xstate_mask)
+{
+ const char *feature_name;
+
+ if (cpu_has_xfeatures(xstate_mask, &feature_name))
+ pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
+}
+
+/*
+ * Print out all the supported xstate features:
+ */
+static void __init print_xstate_features(void)
+{
+ print_xstate_feature(XFEATURE_MASK_FP);
+ print_xstate_feature(XFEATURE_MASK_SSE);
+ print_xstate_feature(XFEATURE_MASK_YMM);
+ print_xstate_feature(XFEATURE_MASK_BNDREGS);
+ print_xstate_feature(XFEATURE_MASK_BNDCSR);
+ print_xstate_feature(XFEATURE_MASK_OPMASK);
+ print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
+ print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
+ print_xstate_feature(XFEATURE_MASK_PKRU);
+}
+
+/*
+ * This check is important because it is easy to get XSTATE_*
+ * confused with XSTATE_BIT_*.
+ */
+#define CHECK_XFEATURE(nr) do { \
+ WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \
+ WARN_ON(nr >= XFEATURE_MAX); \
+} while (0)
+
+/*
+ * We could cache this like xstate_size[], but we only use
+ * it here, so it would be a waste of space.
+ */
+static int xfeature_is_aligned(int xfeature_nr)
+{
+ u32 eax, ebx, ecx, edx;
+
+ CHECK_XFEATURE(xfeature_nr);
+ cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
+ /*
+ * The value returned by ECX[1] indicates the alignment
+ * of state component 'i' when the compacted format
+ * of the extended region of an XSAVE area is used:
+ */
+ return !!(ecx & 2);
+}
+
+/*
+ * This function sets up offsets and sizes of all extended states in
+ * xsave area. This supports both standard format and compacted format
+ * of the xsave aread.
+ */
+static void __init setup_xstate_comp(void)
+{
+ unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8];
+ int i;
+
+ /*
+ * The FP xstates and SSE xstates are legacy states. They are always
+ * in the fixed offsets in the xsave area in either compacted form
+ * or standard form.
+ */
+ xstate_comp_offsets[0] = 0;
+ xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space);
+
+ if (!boot_cpu_has(X86_FEATURE_XSAVES)) {
+ for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+ if (xfeature_enabled(i)) {
+ xstate_comp_offsets[i] = xstate_offsets[i];
+ xstate_comp_sizes[i] = xstate_sizes[i];
+ }
+ }
+ return;
+ }
+
+ xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] =
+ FXSAVE_SIZE + XSAVE_HDR_SIZE;
+
+ for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+ if (xfeature_enabled(i))
+ xstate_comp_sizes[i] = xstate_sizes[i];
+ else
+ xstate_comp_sizes[i] = 0;
+
+ if (i > FIRST_EXTENDED_XFEATURE) {
+ xstate_comp_offsets[i] = xstate_comp_offsets[i-1]
+ + xstate_comp_sizes[i-1];
+
+ if (xfeature_is_aligned(i))
+ xstate_comp_offsets[i] =
+ ALIGN(xstate_comp_offsets[i], 64);
+ }
+ }
+}
+
+/*
+ * Print out xstate component offsets and sizes
+ */
+static void __init print_xstate_offset_size(void)
+{
+ int i;
+
+ for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+ if (!xfeature_enabled(i))
+ continue;
+ pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
+ i, xstate_comp_offsets[i], i, xstate_sizes[i]);
+ }
+}
+
+/*
+ * All supported features have either init state all zeros or are
+ * handled in setup_init_fpu() individually. This is an explicit
+ * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
+ * newly added supported features at build time and make people
+ * actually look at the init state for the new feature.
+ */
+#define XFEATURES_INIT_FPSTATE_HANDLED \
+ (XFEATURE_MASK_FP | \
+ XFEATURE_MASK_SSE | \
+ XFEATURE_MASK_YMM | \
+ XFEATURE_MASK_OPMASK | \
+ XFEATURE_MASK_ZMM_Hi256 | \
+ XFEATURE_MASK_Hi16_ZMM | \
+ XFEATURE_MASK_PKRU | \
+ XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR)
+
+/*
+ * setup the xstate image representing the init state
+ */
+static void __init setup_init_fpu_buf(void)
+{
+ static int on_boot_cpu __initdata = 1;
+
+ BUILD_BUG_ON(XCNTXT_MASK != XFEATURES_INIT_FPSTATE_HANDLED);
+
+ WARN_ON_FPU(!on_boot_cpu);
+ on_boot_cpu = 0;
+
+ if (!boot_cpu_has(X86_FEATURE_XSAVE))
+ return;
+
+ setup_xstate_features();
+ print_xstate_features();
+
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask;
+
+ /*
+ * Init all the features state with header.xfeatures being 0x0
+ */
+ copy_kernel_to_xregs_booting(&init_fpstate.xsave);
+
+ /*
+ * All components are now in init state. Read the state back so
+ * that init_fpstate contains all non-zero init state. This only
+ * works with XSAVE, but not with XSAVEOPT and XSAVES because
+ * those use the init optimization which skips writing data for
+ * components in init state.
+ *
+ * XSAVE could be used, but that would require to reshuffle the
+ * data when XSAVES is available because XSAVES uses xstate
+ * compaction. But doing so is a pointless exercise because most
+ * components have an all zeros init state except for the legacy
+ * ones (FP and SSE). Those can be saved with FXSAVE into the
+ * legacy area. Adding new features requires to ensure that init
+ * state is all zeroes or if not to add the necessary handling
+ * here.
+ */
+ fxsave(&init_fpstate.fxsave);
+}
+
+static int xfeature_uncompacted_offset(int xfeature_nr)
+{
+ u32 eax, ebx, ecx, edx;
+
+ /*
+ * Only XSAVES supports supervisor states and it uses compacted
+ * format. Checking a supervisor state's uncompacted offset is
+ * an error.
+ */
+ if (XFEATURE_MASK_SUPERVISOR & (1 << xfeature_nr)) {
+ WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr);
+ return -1;
+ }
+
+ CHECK_XFEATURE(xfeature_nr);
+ cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
+ return ebx;
+}
+
+static int xfeature_size(int xfeature_nr)
+{
+ u32 eax, ebx, ecx, edx;
+
+ CHECK_XFEATURE(xfeature_nr);
+ cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
+ return eax;
+}
+
+/*
+ * 'XSAVES' implies two different things:
+ * 1. saving of supervisor/system state
+ * 2. using the compacted format
+ *
+ * Use this function when dealing with the compacted format so
+ * that it is obvious which aspect of 'XSAVES' is being handled
+ * by the calling code.
+ */
+int using_compacted_format(void)
+{
+ return boot_cpu_has(X86_FEATURE_XSAVES);
+}
+
+/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
+int validate_xstate_header(const struct xstate_header *hdr)
+{
+ /* No unknown or supervisor features may be set */
+ if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR))
+ return -EINVAL;
+
+ /* Userspace must use the uncompacted format */
+ if (hdr->xcomp_bv)
+ return -EINVAL;
+
+ /*
+ * If 'reserved' is shrunken to add a new field, make sure to validate
+ * that new field here!
+ */
+ BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
+
+ /* No reserved bits may be set */
+ if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void __xstate_dump_leaves(void)
+{
+ int i;
+ u32 eax, ebx, ecx, edx;
+ static int should_dump = 1;
+
+ if (!should_dump)
+ return;
+ should_dump = 0;
+ /*
+ * Dump out a few leaves past the ones that we support
+ * just in case there are some goodies up there
+ */
+ for (i = 0; i < XFEATURE_MAX + 10; i++) {
+ cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+ pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
+ XSTATE_CPUID, i, eax, ebx, ecx, edx);
+ }
+}
+
+#define XSTATE_WARN_ON(x) do { \
+ if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) { \
+ __xstate_dump_leaves(); \
+ } \
+} while (0)
+
+#define XCHECK_SZ(sz, nr, nr_macro, __struct) do { \
+ if ((nr == nr_macro) && \
+ WARN_ONCE(sz != sizeof(__struct), \
+ "%s: struct is %zu bytes, cpu state %d bytes\n", \
+ __stringify(nr_macro), sizeof(__struct), sz)) { \
+ __xstate_dump_leaves(); \
+ } \
+} while (0)
+
+/*
+ * We have a C struct for each 'xstate'. We need to ensure
+ * that our software representation matches what the CPU
+ * tells us about the state's size.
+ */
+static void check_xstate_against_struct(int nr)
+{
+ /*
+ * Ask the CPU for the size of the state.
+ */
+ int sz = xfeature_size(nr);
+ /*
+ * Match each CPU state with the corresponding software
+ * structure.
+ */
+ XCHECK_SZ(sz, nr, XFEATURE_YMM, struct ymmh_struct);
+ XCHECK_SZ(sz, nr, XFEATURE_BNDREGS, struct mpx_bndreg_state);
+ XCHECK_SZ(sz, nr, XFEATURE_BNDCSR, struct mpx_bndcsr_state);
+ XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state);
+ XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
+ XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state);
+ XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state);
+
+ /*
+ * Make *SURE* to add any feature numbers in below if
+ * there are "holes" in the xsave state component
+ * numbers.
+ */
+ if ((nr < XFEATURE_YMM) ||
+ (nr >= XFEATURE_MAX) ||
+ (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) {
+ WARN_ONCE(1, "no structure for xstate: %d\n", nr);
+ XSTATE_WARN_ON(1);
+ }
+}
+
+/*
+ * This essentially double-checks what the cpu told us about
+ * how large the XSAVE buffer needs to be. We are recalculating
+ * it to be safe.
+ */
+static void do_extra_xstate_size_checks(void)
+{
+ int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+ int i;
+
+ for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+ if (!xfeature_enabled(i))
+ continue;
+
+ check_xstate_against_struct(i);
+ /*
+ * Supervisor state components can be managed only by
+ * XSAVES, which is compacted-format only.
+ */
+ if (!using_compacted_format())
+ XSTATE_WARN_ON(xfeature_is_supervisor(i));
+
+ /* Align from the end of the previous feature */
+ if (xfeature_is_aligned(i))
+ paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64);
+ /*
+ * The offset of a given state in the non-compacted
+ * format is given to us in a CPUID leaf. We check
+ * them for being ordered (increasing offsets) in
+ * setup_xstate_features().
+ */
+ if (!using_compacted_format())
+ paranoid_xstate_size = xfeature_uncompacted_offset(i);
+ /*
+ * The compacted-format offset always depends on where
+ * the previous state ended.
+ */
+ paranoid_xstate_size += xfeature_size(i);
+ }
+ XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size);
+}
+
+
+/*
+ * Get total size of enabled xstates in XCR0/xfeatures_mask.
+ *
+ * Note the SDM's wording here. "sub-function 0" only enumerates
+ * the size of the *user* states. If we use it to size a buffer
+ * that we use 'XSAVES' on, we could potentially overflow the
+ * buffer because 'XSAVES' saves system states too.
+ *
+ * Note that we do not currently set any bits on IA32_XSS so
+ * 'XCR0 | IA32_XSS == XCR0' for now.
+ */
+static unsigned int __init get_xsaves_size(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ /*
+ * - CPUID function 0DH, sub-function 1:
+ * EBX enumerates the size (in bytes) required by
+ * the XSAVES instruction for an XSAVE area
+ * containing all the state components
+ * corresponding to bits currently set in
+ * XCR0 | IA32_XSS.
+ */
+ cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
+ return ebx;
+}
+
+static unsigned int __init get_xsave_size(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ /*
+ * - CPUID function 0DH, sub-function 0:
+ * EBX enumerates the size (in bytes) required by
+ * the XSAVE instruction for an XSAVE area
+ * containing all the *user* state components
+ * corresponding to bits currently set in XCR0.
+ */
+ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+ return ebx;
+}
+
+/*
+ * Will the runtime-enumerated 'xstate_size' fit in the init
+ * task's statically-allocated buffer?
+ */
+static bool is_supported_xstate_size(unsigned int test_xstate_size)
+{
+ if (test_xstate_size <= sizeof(union fpregs_state))
+ return true;
+
+ pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n",
+ sizeof(union fpregs_state), test_xstate_size);
+ return false;
+}
+
+static int init_xstate_size(void)
+{
+ /* Recompute the context size for enabled features: */
+ unsigned int possible_xstate_size;
+ unsigned int xsave_size;
+
+ xsave_size = get_xsave_size();
+
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ possible_xstate_size = get_xsaves_size();
+ else
+ possible_xstate_size = xsave_size;
+
+ /* Ensure we have the space to store all enabled: */
+ if (!is_supported_xstate_size(possible_xstate_size))
+ return -EINVAL;
+
+ /*
+ * The size is OK, we are definitely going to use xsave,
+ * make it known to the world that we need more space.
+ */
+ fpu_kernel_xstate_size = possible_xstate_size;
+ do_extra_xstate_size_checks();
+
+ /*
+ * User space is always in standard format.
+ */
+ fpu_user_xstate_size = xsave_size;
+ return 0;
+}
+
+/*
+ * We enabled the XSAVE hardware, but something went wrong and
+ * we can not use it. Disable it.
+ */
+static void fpu__init_disable_system_xstate(void)
+{
+ xfeatures_mask = 0;
+ cr4_clear_bits(X86_CR4_OSXSAVE);
+ fpu__xstate_clear_all_cpu_caps();
+}
+
+/*
+ * Enable and initialize the xsave feature.
+ * Called once per system bootup.
+ */
+void __init fpu__init_system_xstate(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ static int on_boot_cpu __initdata = 1;
+ int err;
+ int i;
+
+ WARN_ON_FPU(!on_boot_cpu);
+ on_boot_cpu = 0;
+
+ if (!boot_cpu_has(X86_FEATURE_FPU)) {
+ pr_info("x86/fpu: No FPU detected\n");
+ return;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
+ pr_info("x86/fpu: x87 FPU will use %s\n",
+ boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
+ return;
+ }
+
+ if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
+ WARN_ON_FPU(1);
+ return;
+ }
+
+ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+ xfeatures_mask = eax + ((u64)edx << 32);
+
+ if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
+ /*
+ * This indicates that something really unexpected happened
+ * with the enumeration. Disable XSAVE and try to continue
+ * booting without it. This is too early to BUG().
+ */
+ pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask);
+ goto out_disable;
+ }
+
+ /*
+ * Clear XSAVE features that are disabled in the normal CPUID.
+ */
+ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
+ if (!boot_cpu_has(xsave_cpuid_features[i]))
+ xfeatures_mask &= ~BIT(i);
+ }
+
+ xfeatures_mask &= fpu__get_supported_xfeatures_mask();
+
+ /* Enable xstate instructions to be able to continue with initialization: */
+ fpu__init_cpu_xstate();
+ err = init_xstate_size();
+ if (err)
+ goto out_disable;
+
+ /*
+ * Update info used for ptrace frames; use standard-format size and no
+ * supervisor xstates:
+ */
+ update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR);
+
+ fpu__init_prepare_fx_sw_frame();
+ setup_init_fpu_buf();
+ setup_xstate_comp();
+ print_xstate_offset_size();
+
+ pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
+ xfeatures_mask,
+ fpu_kernel_xstate_size,
+ boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard");
+ return;
+
+out_disable:
+ /* something went wrong, try to boot without any XSAVE support */
+ fpu__init_disable_system_xstate();
+}
+
+/*
+ * Restore minimal FPU state after suspend:
+ */
+void fpu__resume_cpu(void)
+{
+ /*
+ * Restore XCR0 on xsave capable CPUs:
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVE))
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
+}
+
+/*
+ * Given an xstate feature mask, calculate where in the xsave
+ * buffer the state is. Callers should ensure that the buffer
+ * is valid.
+ *
+ * Note: does not work for compacted buffers.
+ */
+void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask)
+{
+ int feature_nr = fls64(xstate_feature_mask) - 1;
+
+ if (!xfeature_enabled(feature_nr)) {
+ WARN_ON_FPU(1);
+ return NULL;
+ }
+
+ return (void *)xsave + xstate_comp_offsets[feature_nr];
+}
+/*
+ * Given the xsave area and a state inside, this function returns the
+ * address of the state.
+ *
+ * This is the API that is called to get xstate address in either
+ * standard format or compacted format of xsave area.
+ *
+ * Note that if there is no data for the field in the xsave buffer
+ * this will return NULL.
+ *
+ * Inputs:
+ * xstate: the thread's storage area for all FPU data
+ * xstate_feature: state which is defined in xsave.h (e.g.
+ * XFEATURE_MASK_FP, XFEATURE_MASK_SSE, etc...)
+ * Output:
+ * address of the state in the xsave area, or NULL if the
+ * field is not present in the xsave buffer.
+ */
+void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
+{
+ /*
+ * Do we even *have* xsave state?
+ */
+ if (!boot_cpu_has(X86_FEATURE_XSAVE))
+ return NULL;
+
+ /*
+ * We should not ever be requesting features that we
+ * have not enabled. Remember that pcntxt_mask is
+ * what we write to the XCR0 register.
+ */
+ WARN_ONCE(!(xfeatures_mask & xstate_feature),
+ "get of unsupported state");
+ /*
+ * This assumes the last 'xsave*' instruction to
+ * have requested that 'xstate_feature' be saved.
+ * If it did not, we might be seeing and old value
+ * of the field in the buffer.
+ *
+ * This can happen because the last 'xsave' did not
+ * request that this feature be saved (unlikely)
+ * or because the "init optimization" caused it
+ * to not be saved.
+ */
+ if (!(xsave->header.xfeatures & xstate_feature))
+ return NULL;
+
+ return __raw_xsave_addr(xsave, xstate_feature);
+}
+EXPORT_SYMBOL_GPL(get_xsave_addr);
+
+/*
+ * This wraps up the common operations that need to occur when retrieving
+ * data from xsave state. It first ensures that the current task was
+ * using the FPU and retrieves the data in to a buffer. It then calculates
+ * the offset of the requested field in the buffer.
+ *
+ * This function is safe to call whether the FPU is in use or not.
+ *
+ * Note that this only works on the current task.
+ *
+ * Inputs:
+ * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP,
+ * XFEATURE_MASK_SSE, etc...)
+ * Output:
+ * address of the state in the xsave area or NULL if the state
+ * is not present or is in its 'init state'.
+ */
+const void *get_xsave_field_ptr(int xsave_state)
+{
+ struct fpu *fpu = &current->thread.fpu;
+
+ if (!fpu->initialized)
+ return NULL;
+ /*
+ * fpu__save() takes the CPU's xstate registers
+ * and saves them off to the 'fpu memory buffer.
+ */
+ fpu__save(fpu);
+
+ return get_xsave_addr(&fpu->state.xsave, xsave_state);
+}
+
+#ifdef CONFIG_ARCH_HAS_PKEYS
+
+/*
+ * This will go out and modify PKRU register to set the access
+ * rights for @pkey to @init_val.
+ */
+int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+ unsigned long init_val)
+{
+ u32 old_pkru;
+ int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
+ u32 new_pkru_bits = 0;
+
+ /*
+ * This check implies XSAVE support. OSPKE only gets
+ * set if we enable XSAVE and we enable PKU in XCR0.
+ */
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return -EINVAL;
+
+ /*
+ * This code should only be called with valid 'pkey'
+ * values originating from in-kernel users. Complain
+ * if a bad value is observed.
+ */
+ WARN_ON_ONCE(pkey >= arch_max_pkey());
+
+ /* Set the bits we need in PKRU: */
+ if (init_val & PKEY_DISABLE_ACCESS)
+ new_pkru_bits |= PKRU_AD_BIT;
+ if (init_val & PKEY_DISABLE_WRITE)
+ new_pkru_bits |= PKRU_WD_BIT;
+
+ /* Shift the bits in to the correct place in PKRU for pkey: */
+ new_pkru_bits <<= pkey_shift;
+
+ /* Get old PKRU and mask off any old bits in place: */
+ old_pkru = read_pkru();
+ old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
+
+ /* Write old part along with new part: */
+ write_pkru(old_pkru | new_pkru_bits);
+
+ return 0;
+}
+#endif /* ! CONFIG_ARCH_HAS_PKEYS */
+
+/*
+ * Weird legacy quirk: SSE and YMM states store information in the
+ * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP
+ * area is marked as unused in the xfeatures header, we need to copy
+ * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use.
+ */
+static inline bool xfeatures_mxcsr_quirk(u64 xfeatures)
+{
+ if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM)))
+ return false;
+
+ if (xfeatures & XFEATURE_MASK_FP)
+ return false;
+
+ return true;
+}
+
+static void fill_gap(unsigned to, void **kbuf, unsigned *pos, unsigned *count)
+{
+ if (*pos < to) {
+ unsigned size = to - *pos;
+
+ if (size > *count)
+ size = *count;
+ memcpy(*kbuf, (void *)&init_fpstate.xsave + *pos, size);
+ *kbuf += size;
+ *pos += size;
+ *count -= size;
+ }
+}
+
+static void copy_part(unsigned offset, unsigned size, void *from,
+ void **kbuf, unsigned *pos, unsigned *count)
+{
+ fill_gap(offset, kbuf, pos, count);
+ if (size > *count)
+ size = *count;
+ if (size) {
+ memcpy(*kbuf, from, size);
+ *kbuf += size;
+ *pos += size;
+ *count -= size;
+ }
+}
+
+/*
+ * Convert from kernel XSAVES compacted format to standard format and copy
+ * to a kernel-space ptrace buffer.
+ *
+ * It supports partial copy but pos always starts from zero. This is called
+ * from xstateregs_get() and there we check the CPU has XSAVES.
+ */
+int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total)
+{
+ struct xstate_header header;
+ const unsigned off_mxcsr = offsetof(struct fxregs_state, mxcsr);
+ unsigned count = size_total;
+ int i;
+
+ /*
+ * Currently copy_regset_to_user() starts from pos 0:
+ */
+ if (unlikely(offset_start != 0))
+ return -EFAULT;
+
+ /*
+ * The destination is a ptrace buffer; we put in only user xstates:
+ */
+ memset(&header, 0, sizeof(header));
+ header.xfeatures = xsave->header.xfeatures;
+ header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
+
+ if (header.xfeatures & XFEATURE_MASK_FP)
+ copy_part(0, off_mxcsr,
+ &xsave->i387, &kbuf, &offset_start, &count);
+ if (header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM))
+ copy_part(off_mxcsr, MXCSR_AND_FLAGS_SIZE,
+ &xsave->i387.mxcsr, &kbuf, &offset_start, &count);
+ if (header.xfeatures & XFEATURE_MASK_FP)
+ copy_part(offsetof(struct fxregs_state, st_space), 128,
+ &xsave->i387.st_space, &kbuf, &offset_start, &count);
+ if (header.xfeatures & XFEATURE_MASK_SSE)
+ copy_part(xstate_offsets[XFEATURE_SSE], 256,
+ &xsave->i387.xmm_space, &kbuf, &offset_start, &count);
+ /*
+ * Fill xsave->i387.sw_reserved value for ptrace frame:
+ */
+ copy_part(offsetof(struct fxregs_state, sw_reserved), 48,
+ xstate_fx_sw_bytes, &kbuf, &offset_start, &count);
+ /*
+ * Copy xregs_state->header:
+ */
+ copy_part(offsetof(struct xregs_state, header), sizeof(header),
+ &header, &kbuf, &offset_start, &count);
+
+ for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+ /*
+ * Copy only in-use xstates:
+ */
+ if ((header.xfeatures >> i) & 1) {
+ void *src = __raw_xsave_addr(xsave, 1 << i);
+
+ copy_part(xstate_offsets[i], xstate_sizes[i],
+ src, &kbuf, &offset_start, &count);
+ }
+
+ }
+ fill_gap(size_total, &kbuf, &offset_start, &count);
+
+ return 0;
+}
+
+static inline int
+__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, unsigned int size_total)
+{
+ if (!size)
+ return 0;
+
+ if (offset < size_total) {
+ unsigned int copy = min(size, size_total - offset);
+
+ if (__copy_to_user(ubuf + offset, data, copy))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+/*
+ * Convert from kernel XSAVES compacted format to standard format and copy
+ * to a user-space buffer. It supports partial copy but pos always starts from
+ * zero. This is called from xstateregs_get() and there we check the CPU
+ * has XSAVES.
+ */
+int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total)
+{
+ unsigned int offset, size;
+ int ret, i;
+ struct xstate_header header;
+
+ /*
+ * Currently copy_regset_to_user() starts from pos 0:
+ */
+ if (unlikely(offset_start != 0))
+ return -EFAULT;
+
+ /*
+ * The destination is a ptrace buffer; we put in only user xstates:
+ */
+ memset(&header, 0, sizeof(header));
+ header.xfeatures = xsave->header.xfeatures;
+ header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
+
+ /*
+ * Copy xregs_state->header:
+ */
+ offset = offsetof(struct xregs_state, header);
+ size = sizeof(header);
+
+ ret = __copy_xstate_to_user(ubuf, &header, offset, size, size_total);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < XFEATURE_MAX; i++) {
+ /*
+ * Copy only in-use xstates:
+ */
+ if ((header.xfeatures >> i) & 1) {
+ void *src = __raw_xsave_addr(xsave, 1 << i);
+
+ offset = xstate_offsets[i];
+ size = xstate_sizes[i];
+
+ /* The next component has to fit fully into the output buffer: */
+ if (offset + size > size_total)
+ break;
+
+ ret = __copy_xstate_to_user(ubuf, src, offset, size, size_total);
+ if (ret)
+ return ret;
+ }
+
+ }
+
+ if (xfeatures_mxcsr_quirk(header.xfeatures)) {
+ offset = offsetof(struct fxregs_state, mxcsr);
+ size = MXCSR_AND_FLAGS_SIZE;
+ __copy_xstate_to_user(ubuf, &xsave->i387.mxcsr, offset, size, size_total);
+ }
+
+ /*
+ * Fill xsave->i387.sw_reserved value for ptrace frame:
+ */
+ offset = offsetof(struct fxregs_state, sw_reserved);
+ size = sizeof(xstate_fx_sw_bytes);
+
+ ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, size_total);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+/*
+ * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format
+ * and copy to the target thread. This is called from xstateregs_set().
+ */
+int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
+{
+ unsigned int offset, size;
+ int i;
+ struct xstate_header hdr;
+
+ offset = offsetof(struct xregs_state, header);
+ size = sizeof(hdr);
+
+ memcpy(&hdr, kbuf + offset, size);
+
+ if (validate_xstate_header(&hdr))
+ return -EINVAL;
+
+ for (i = 0; i < XFEATURE_MAX; i++) {
+ u64 mask = ((u64)1 << i);
+
+ if (hdr.xfeatures & mask) {
+ void *dst = __raw_xsave_addr(xsave, 1 << i);
+
+ offset = xstate_offsets[i];
+ size = xstate_sizes[i];
+
+ memcpy(dst, kbuf + offset, size);
+ }
+ }
+
+ if (xfeatures_mxcsr_quirk(hdr.xfeatures)) {
+ offset = offsetof(struct fxregs_state, mxcsr);
+ size = MXCSR_AND_FLAGS_SIZE;
+ memcpy(&xsave->i387.mxcsr, kbuf + offset, size);
+ }
+
+ /*
+ * The state that came in from userspace was user-state only.
+ * Mask all the user states out of 'xfeatures':
+ */
+ xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR;
+
+ /*
+ * Add back in the features that came in from userspace:
+ */
+ xsave->header.xfeatures |= hdr.xfeatures;
+
+ return 0;
+}
+
+/*
+ * Convert from a ptrace or sigreturn standard-format user-space buffer to
+ * kernel XSAVES format and copy to the target thread. This is called from
+ * xstateregs_set(), as well as potentially from the sigreturn() and
+ * rt_sigreturn() system calls.
+ */
+int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
+{
+ unsigned int offset, size;
+ int i;
+ struct xstate_header hdr;
+
+ offset = offsetof(struct xregs_state, header);
+ size = sizeof(hdr);
+
+ if (__copy_from_user(&hdr, ubuf + offset, size))
+ return -EFAULT;
+
+ if (validate_xstate_header(&hdr))
+ return -EINVAL;
+
+ for (i = 0; i < XFEATURE_MAX; i++) {
+ u64 mask = ((u64)1 << i);
+
+ if (hdr.xfeatures & mask) {
+ void *dst = __raw_xsave_addr(xsave, 1 << i);
+
+ offset = xstate_offsets[i];
+ size = xstate_sizes[i];
+
+ if (__copy_from_user(dst, ubuf + offset, size))
+ return -EFAULT;
+ }
+ }
+
+ if (xfeatures_mxcsr_quirk(hdr.xfeatures)) {
+ offset = offsetof(struct fxregs_state, mxcsr);
+ size = MXCSR_AND_FLAGS_SIZE;
+ if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size))
+ return -EFAULT;
+ }
+
+ /*
+ * The state that came in from userspace was user-state only.
+ * Mask all the user states out of 'xfeatures':
+ */
+ xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR;
+
+ /*
+ * Add back in the features that came in from userspace:
+ */
+ xsave->header.xfeatures |= hdr.xfeatures;
+
+ return 0;
+}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
new file mode 100644
index 000000000..32b63b30e
--- /dev/null
+++ b/arch/x86/kernel/ftrace.c
@@ -0,0 +1,1081 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Dynamic function tracing support.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ *
+ * Thanks goes to Ingo Molnar, for suggesting the idea.
+ * Mathieu Desnoyers, for suggesting postponing the modifications.
+ * Arjan van de Ven, for keeping me straight, and explaining to me
+ * the dangers of modifying code on the run.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/memory.h>
+
+#include <trace/syscall.h>
+
+#include <asm/set_memory.h>
+#include <asm/kprobes.h>
+#include <asm/ftrace.h>
+#include <asm/nops.h>
+#include <asm/text-patching.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+int ftrace_arch_code_modify_prepare(void)
+ __acquires(&text_mutex)
+{
+ mutex_lock(&text_mutex);
+ set_kernel_text_rw();
+ set_all_modules_text_rw();
+ return 0;
+}
+
+int ftrace_arch_code_modify_post_process(void)
+ __releases(&text_mutex)
+{
+ set_all_modules_text_ro();
+ set_kernel_text_ro();
+ mutex_unlock(&text_mutex);
+ return 0;
+}
+
+union ftrace_code_union {
+ char code[MCOUNT_INSN_SIZE];
+ struct {
+ unsigned char op;
+ int offset;
+ } __attribute__((packed));
+};
+
+static int ftrace_calc_offset(long ip, long addr)
+{
+ return (int)(addr - ip);
+}
+
+static unsigned char *
+ftrace_text_replace(unsigned char op, unsigned long ip, unsigned long addr)
+{
+ static union ftrace_code_union calc;
+
+ calc.op = op;
+ calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
+
+ return calc.code;
+}
+
+static unsigned char *
+ftrace_call_replace(unsigned long ip, unsigned long addr)
+{
+ return ftrace_text_replace(0xe8, ip, addr);
+}
+
+static inline int
+within(unsigned long addr, unsigned long start, unsigned long end)
+{
+ return addr >= start && addr < end;
+}
+
+static unsigned long text_ip_addr(unsigned long ip)
+{
+ /*
+ * On x86_64, kernel text mappings are mapped read-only, so we use
+ * the kernel identity mapping instead of the kernel text mapping
+ * to modify the kernel text.
+ *
+ * For 32bit kernels, these mappings are same and we can use
+ * kernel identity mapping to modify code.
+ */
+ if (within(ip, (unsigned long)_text, (unsigned long)_etext))
+ ip = (unsigned long)__va(__pa_symbol(ip));
+
+ return ip;
+}
+
+static const unsigned char *ftrace_nop_replace(void)
+{
+ return ideal_nops[NOP_ATOMIC5];
+}
+
+static int
+ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
+ unsigned const char *new_code)
+{
+ unsigned char replaced[MCOUNT_INSN_SIZE];
+
+ ftrace_expected = old_code;
+
+ /*
+ * Note:
+ * We are paranoid about modifying text, as if a bug was to happen, it
+ * could cause us to read or write to someplace that could cause harm.
+ * Carefully read and modify the code with probe_kernel_*(), and make
+ * sure what we read is what we expected it to be before modifying it.
+ */
+
+ /* read the text we want to modify */
+ if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+ return -EFAULT;
+
+ /* Make sure it is what we expect it to be */
+ if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
+ return -EINVAL;
+
+ ip = text_ip_addr(ip);
+
+ /* replace the text with the new text */
+ if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
+ return -EPERM;
+
+ sync_core();
+
+ return 0;
+}
+
+int ftrace_make_nop(struct module *mod,
+ struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned const char *new, *old;
+ unsigned long ip = rec->ip;
+
+ old = ftrace_call_replace(ip, addr);
+ new = ftrace_nop_replace();
+
+ /*
+ * On boot up, and when modules are loaded, the MCOUNT_ADDR
+ * is converted to a nop, and will never become MCOUNT_ADDR
+ * again. This code is either running before SMP (on boot up)
+ * or before the code will ever be executed (module load).
+ * We do not want to use the breakpoint version in this case,
+ * just modify the code directly.
+ */
+ if (addr == MCOUNT_ADDR)
+ return ftrace_modify_code_direct(rec->ip, old, new);
+
+ ftrace_expected = NULL;
+
+ /* Normal cases use add_brk_on_nop */
+ WARN_ONCE(1, "invalid use of ftrace_make_nop");
+ return -EINVAL;
+}
+
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned const char *new, *old;
+ unsigned long ip = rec->ip;
+
+ old = ftrace_nop_replace();
+ new = ftrace_call_replace(ip, addr);
+
+ /* Should only be called when module is loaded */
+ return ftrace_modify_code_direct(rec->ip, old, new);
+}
+
+/*
+ * The modifying_ftrace_code is used to tell the breakpoint
+ * handler to call ftrace_int3_handler(). If it fails to
+ * call this handler for a breakpoint added by ftrace, then
+ * the kernel may crash.
+ *
+ * As atomic_writes on x86 do not need a barrier, we do not
+ * need to add smp_mb()s for this to work. It is also considered
+ * that we can not read the modifying_ftrace_code before
+ * executing the breakpoint. That would be quite remarkable if
+ * it could do that. Here's the flow that is required:
+ *
+ * CPU-0 CPU-1
+ *
+ * atomic_inc(mfc);
+ * write int3s
+ * <trap-int3> // implicit (r)mb
+ * if (atomic_read(mfc))
+ * call ftrace_int3_handler()
+ *
+ * Then when we are finished:
+ *
+ * atomic_dec(mfc);
+ *
+ * If we hit a breakpoint that was not set by ftrace, it does not
+ * matter if ftrace_int3_handler() is called or not. It will
+ * simply be ignored. But it is crucial that a ftrace nop/caller
+ * breakpoint is handled. No other user should ever place a
+ * breakpoint on an ftrace nop/caller location. It must only
+ * be done by this code.
+ */
+atomic_t modifying_ftrace_code __read_mostly;
+
+static int
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+ unsigned const char *new_code);
+
+/*
+ * Should never be called:
+ * As it is only called by __ftrace_replace_code() which is called by
+ * ftrace_replace_code() that x86 overrides, and by ftrace_update_code()
+ * which is called to turn mcount into nops or nops into function calls
+ * but not to convert a function from not using regs to one that uses
+ * regs, which ftrace_modify_call() is for.
+ */
+int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
+ unsigned long addr)
+{
+ WARN_ON(1);
+ ftrace_expected = NULL;
+ return -EINVAL;
+}
+
+static unsigned long ftrace_update_func;
+static unsigned long ftrace_update_func_call;
+
+static int update_ftrace_func(unsigned long ip, void *new)
+{
+ unsigned char old[MCOUNT_INSN_SIZE];
+ int ret;
+
+ memcpy(old, (void *)ip, MCOUNT_INSN_SIZE);
+
+ ftrace_update_func = ip;
+ /* Make sure the breakpoints see the ftrace_update_func update */
+ smp_wmb();
+
+ /* See comment above by declaration of modifying_ftrace_code */
+ atomic_inc(&modifying_ftrace_code);
+
+ ret = ftrace_modify_code(ip, old, new);
+
+ atomic_dec(&modifying_ftrace_code);
+
+ return ret;
+}
+
+int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+ unsigned long ip = (unsigned long)(&ftrace_call);
+ unsigned char *new;
+ int ret;
+
+ ftrace_update_func_call = (unsigned long)func;
+
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ ret = update_ftrace_func(ip, new);
+
+ /* Also update the regs callback function */
+ if (!ret) {
+ ip = (unsigned long)(&ftrace_regs_call);
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ ret = update_ftrace_func(ip, new);
+ }
+
+ return ret;
+}
+
+static int is_ftrace_caller(unsigned long ip)
+{
+ if (ip == ftrace_update_func)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * A breakpoint was added to the code address we are about to
+ * modify, and this is the handle that will just skip over it.
+ * We are either changing a nop into a trace call, or a trace
+ * call to a nop. While the change is taking place, we treat
+ * it just like it was a nop.
+ */
+int ftrace_int3_handler(struct pt_regs *regs)
+{
+ unsigned long ip;
+
+ if (WARN_ON_ONCE(!regs))
+ return 0;
+
+ ip = regs->ip - INT3_INSN_SIZE;
+
+#ifdef CONFIG_X86_64
+ if (ftrace_location(ip)) {
+ int3_emulate_call(regs, (unsigned long)ftrace_regs_caller);
+ return 1;
+ } else if (is_ftrace_caller(ip)) {
+ if (!ftrace_update_func_call) {
+ int3_emulate_jmp(regs, ip + CALL_INSN_SIZE);
+ return 1;
+ }
+ int3_emulate_call(regs, ftrace_update_func_call);
+ return 1;
+ }
+#else
+ if (ftrace_location(ip) || is_ftrace_caller(ip)) {
+ int3_emulate_jmp(regs, ip + CALL_INSN_SIZE);
+ return 1;
+ }
+#endif
+
+ return 0;
+}
+
+static int ftrace_write(unsigned long ip, const char *val, int size)
+{
+ ip = text_ip_addr(ip);
+
+ if (probe_kernel_write((void *)ip, val, size))
+ return -EPERM;
+
+ return 0;
+}
+
+static int add_break(unsigned long ip, const char *old)
+{
+ unsigned char replaced[MCOUNT_INSN_SIZE];
+ unsigned char brk = BREAKPOINT_INSTRUCTION;
+
+ if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+ return -EFAULT;
+
+ ftrace_expected = old;
+
+ /* Make sure it is what we expect it to be */
+ if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
+ return -EINVAL;
+
+ return ftrace_write(ip, &brk, 1);
+}
+
+static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned const char *old;
+ unsigned long ip = rec->ip;
+
+ old = ftrace_call_replace(ip, addr);
+
+ return add_break(rec->ip, old);
+}
+
+
+static int add_brk_on_nop(struct dyn_ftrace *rec)
+{
+ unsigned const char *old;
+
+ old = ftrace_nop_replace();
+
+ return add_break(rec->ip, old);
+}
+
+static int add_breakpoints(struct dyn_ftrace *rec, int enable)
+{
+ unsigned long ftrace_addr;
+ int ret;
+
+ ftrace_addr = ftrace_get_addr_curr(rec);
+
+ ret = ftrace_test_record(rec, enable);
+
+ switch (ret) {
+ case FTRACE_UPDATE_IGNORE:
+ return 0;
+
+ case FTRACE_UPDATE_MAKE_CALL:
+ /* converting nop to call */
+ return add_brk_on_nop(rec);
+
+ case FTRACE_UPDATE_MODIFY_CALL:
+ case FTRACE_UPDATE_MAKE_NOP:
+ /* converting a call to a nop */
+ return add_brk_on_call(rec, ftrace_addr);
+ }
+ return 0;
+}
+
+/*
+ * On error, we need to remove breakpoints. This needs to
+ * be done caefully. If the address does not currently have a
+ * breakpoint, we know we are done. Otherwise, we look at the
+ * remaining 4 bytes of the instruction. If it matches a nop
+ * we replace the breakpoint with the nop. Otherwise we replace
+ * it with the call instruction.
+ */
+static int remove_breakpoint(struct dyn_ftrace *rec)
+{
+ unsigned char ins[MCOUNT_INSN_SIZE];
+ unsigned char brk = BREAKPOINT_INSTRUCTION;
+ const unsigned char *nop;
+ unsigned long ftrace_addr;
+ unsigned long ip = rec->ip;
+
+ /* If we fail the read, just give up */
+ if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE))
+ return -EFAULT;
+
+ /* If this does not have a breakpoint, we are done */
+ if (ins[0] != brk)
+ return 0;
+
+ nop = ftrace_nop_replace();
+
+ /*
+ * If the last 4 bytes of the instruction do not match
+ * a nop, then we assume that this is a call to ftrace_addr.
+ */
+ if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) {
+ /*
+ * For extra paranoidism, we check if the breakpoint is on
+ * a call that would actually jump to the ftrace_addr.
+ * If not, don't touch the breakpoint, we make just create
+ * a disaster.
+ */
+ ftrace_addr = ftrace_get_addr_new(rec);
+ nop = ftrace_call_replace(ip, ftrace_addr);
+
+ if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)
+ goto update;
+
+ /* Check both ftrace_addr and ftrace_old_addr */
+ ftrace_addr = ftrace_get_addr_curr(rec);
+ nop = ftrace_call_replace(ip, ftrace_addr);
+
+ ftrace_expected = nop;
+
+ if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
+ return -EINVAL;
+ }
+
+ update:
+ return ftrace_write(ip, nop, 1);
+}
+
+static int add_update_code(unsigned long ip, unsigned const char *new)
+{
+ /* skip breakpoint */
+ ip++;
+ new++;
+ return ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1);
+}
+
+static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned long ip = rec->ip;
+ unsigned const char *new;
+
+ new = ftrace_call_replace(ip, addr);
+ return add_update_code(ip, new);
+}
+
+static int add_update_nop(struct dyn_ftrace *rec)
+{
+ unsigned long ip = rec->ip;
+ unsigned const char *new;
+
+ new = ftrace_nop_replace();
+ return add_update_code(ip, new);
+}
+
+static int add_update(struct dyn_ftrace *rec, int enable)
+{
+ unsigned long ftrace_addr;
+ int ret;
+
+ ret = ftrace_test_record(rec, enable);
+
+ ftrace_addr = ftrace_get_addr_new(rec);
+
+ switch (ret) {
+ case FTRACE_UPDATE_IGNORE:
+ return 0;
+
+ case FTRACE_UPDATE_MODIFY_CALL:
+ case FTRACE_UPDATE_MAKE_CALL:
+ /* converting nop to call */
+ return add_update_call(rec, ftrace_addr);
+
+ case FTRACE_UPDATE_MAKE_NOP:
+ /* converting a call to a nop */
+ return add_update_nop(rec);
+ }
+
+ return 0;
+}
+
+static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned long ip = rec->ip;
+ unsigned const char *new;
+
+ new = ftrace_call_replace(ip, addr);
+
+ return ftrace_write(ip, new, 1);
+}
+
+static int finish_update_nop(struct dyn_ftrace *rec)
+{
+ unsigned long ip = rec->ip;
+ unsigned const char *new;
+
+ new = ftrace_nop_replace();
+
+ return ftrace_write(ip, new, 1);
+}
+
+static int finish_update(struct dyn_ftrace *rec, int enable)
+{
+ unsigned long ftrace_addr;
+ int ret;
+
+ ret = ftrace_update_record(rec, enable);
+
+ ftrace_addr = ftrace_get_addr_new(rec);
+
+ switch (ret) {
+ case FTRACE_UPDATE_IGNORE:
+ return 0;
+
+ case FTRACE_UPDATE_MODIFY_CALL:
+ case FTRACE_UPDATE_MAKE_CALL:
+ /* converting nop to call */
+ return finish_update_call(rec, ftrace_addr);
+
+ case FTRACE_UPDATE_MAKE_NOP:
+ /* converting a call to a nop */
+ return finish_update_nop(rec);
+ }
+
+ return 0;
+}
+
+static void do_sync_core(void *data)
+{
+ sync_core();
+}
+
+static void run_sync(void)
+{
+ int enable_irqs;
+
+ /* No need to sync if there's only one CPU */
+ if (num_online_cpus() == 1)
+ return;
+
+ enable_irqs = irqs_disabled();
+
+ /* We may be called with interrupts disabled (on bootup). */
+ if (enable_irqs)
+ local_irq_enable();
+ on_each_cpu(do_sync_core, NULL, 1);
+ if (enable_irqs)
+ local_irq_disable();
+}
+
+void ftrace_replace_code(int enable)
+{
+ struct ftrace_rec_iter *iter;
+ struct dyn_ftrace *rec;
+ const char *report = "adding breakpoints";
+ int count = 0;
+ int ret;
+
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+
+ ret = add_breakpoints(rec, enable);
+ if (ret)
+ goto remove_breakpoints;
+ count++;
+ }
+
+ run_sync();
+
+ report = "updating code";
+ count = 0;
+
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+
+ ret = add_update(rec, enable);
+ if (ret)
+ goto remove_breakpoints;
+ count++;
+ }
+
+ run_sync();
+
+ report = "removing breakpoints";
+ count = 0;
+
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+
+ ret = finish_update(rec, enable);
+ if (ret)
+ goto remove_breakpoints;
+ count++;
+ }
+
+ run_sync();
+
+ return;
+
+ remove_breakpoints:
+ pr_warn("Failed on %s (%d):\n", report, count);
+ ftrace_bug(ret, rec);
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+ /*
+ * Breakpoints are handled only when this function is in
+ * progress. The system could not work with them.
+ */
+ if (remove_breakpoint(rec))
+ BUG();
+ }
+ run_sync();
+}
+
+static int
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+ unsigned const char *new_code)
+{
+ int ret;
+
+ ret = add_break(ip, old_code);
+ if (ret)
+ goto out;
+
+ run_sync();
+
+ ret = add_update_code(ip, new_code);
+ if (ret)
+ goto fail_update;
+
+ run_sync();
+
+ ret = ftrace_write(ip, new_code, 1);
+ /*
+ * The breakpoint is handled only when this function is in progress.
+ * The system could not work if we could not remove it.
+ */
+ BUG_ON(ret);
+ out:
+ run_sync();
+ return ret;
+
+ fail_update:
+ /* Also here the system could not work with the breakpoint */
+ if (ftrace_write(ip, old_code, 1))
+ BUG();
+ goto out;
+}
+
+void arch_ftrace_update_code(int command)
+{
+ /* See comment above by declaration of modifying_ftrace_code */
+ atomic_inc(&modifying_ftrace_code);
+
+ ftrace_modify_all_code(command);
+
+ atomic_dec(&modifying_ftrace_code);
+}
+
+int __init ftrace_dyn_arch_init(void)
+{
+ return 0;
+}
+
+/* Currently only x86_64 supports dynamic trampolines */
+#ifdef CONFIG_X86_64
+
+#ifdef CONFIG_MODULES
+#include <linux/moduleloader.h>
+/* Module allocation simplifies allocating memory for code */
+static inline void *alloc_tramp(unsigned long size)
+{
+ return module_alloc(size);
+}
+static inline void tramp_free(void *tramp, int size)
+{
+ int npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ set_memory_nx((unsigned long)tramp, npages);
+ set_memory_rw((unsigned long)tramp, npages);
+ module_memfree(tramp);
+}
+#else
+/* Trampolines can only be created if modules are supported */
+static inline void *alloc_tramp(unsigned long size)
+{
+ return NULL;
+}
+static inline void tramp_free(void *tramp, int size) { }
+#endif
+
+/* Defined as markers to the end of the ftrace default trampolines */
+extern void ftrace_regs_caller_end(void);
+extern void ftrace_epilogue(void);
+extern void ftrace_caller_op_ptr(void);
+extern void ftrace_regs_caller_op_ptr(void);
+
+/* movq function_trace_op(%rip), %rdx */
+/* 0x48 0x8b 0x15 <offset-to-ftrace_trace_op (4 bytes)> */
+#define OP_REF_SIZE 7
+
+/*
+ * The ftrace_ops is passed to the function callback. Since the
+ * trampoline only services a single ftrace_ops, we can pass in
+ * that ops directly.
+ *
+ * The ftrace_op_code_union is used to create a pointer to the
+ * ftrace_ops that will be passed to the callback function.
+ */
+union ftrace_op_code_union {
+ char code[OP_REF_SIZE];
+ struct {
+ char op[3];
+ int offset;
+ } __attribute__((packed));
+};
+
+#define RET_SIZE 1
+
+static unsigned long
+create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
+{
+ unsigned long start_offset;
+ unsigned long end_offset;
+ unsigned long op_offset;
+ unsigned long offset;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long retq;
+ unsigned long *ptr;
+ void *trampoline;
+ void *ip;
+ /* 48 8b 15 <offset> is movq <offset>(%rip), %rdx */
+ unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 };
+ union ftrace_op_code_union op_ptr;
+ int ret;
+
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+ start_offset = (unsigned long)ftrace_regs_caller;
+ end_offset = (unsigned long)ftrace_regs_caller_end;
+ op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
+ } else {
+ start_offset = (unsigned long)ftrace_caller;
+ end_offset = (unsigned long)ftrace_epilogue;
+ op_offset = (unsigned long)ftrace_caller_op_ptr;
+ }
+
+ size = end_offset - start_offset;
+
+ /*
+ * Allocate enough size to store the ftrace_caller code,
+ * the iret , as well as the address of the ftrace_ops this
+ * trampoline is used for.
+ */
+ trampoline = alloc_tramp(size + RET_SIZE + sizeof(void *));
+ if (!trampoline)
+ return 0;
+
+ *tramp_size = size + RET_SIZE + sizeof(void *);
+ npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE);
+
+ /* Copy ftrace_caller onto the trampoline memory */
+ ret = probe_kernel_read(trampoline, (void *)start_offset, size);
+ if (WARN_ON(ret < 0))
+ goto fail;
+
+ ip = trampoline + size;
+
+ /* The trampoline ends with ret(q) */
+ retq = (unsigned long)ftrace_stub;
+ ret = probe_kernel_read(ip, (void *)retq, RET_SIZE);
+ if (WARN_ON(ret < 0))
+ goto fail;
+
+ /*
+ * The address of the ftrace_ops that is used for this trampoline
+ * is stored at the end of the trampoline. This will be used to
+ * load the third parameter for the callback. Basically, that
+ * location at the end of the trampoline takes the place of
+ * the global function_trace_op variable.
+ */
+
+ ptr = (unsigned long *)(trampoline + size + RET_SIZE);
+ *ptr = (unsigned long)ops;
+
+ op_offset -= start_offset;
+ memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE);
+
+ /* Are we pointing to the reference? */
+ if (WARN_ON(memcmp(op_ptr.op, op_ref, 3) != 0))
+ goto fail;
+
+ /* Load the contents of ptr into the callback parameter */
+ offset = (unsigned long)ptr;
+ offset -= (unsigned long)trampoline + op_offset + OP_REF_SIZE;
+
+ op_ptr.offset = offset;
+
+ /* put in the new offset to the ftrace_ops */
+ memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
+
+ /* ALLOC_TRAMP flags lets us know we created it */
+ ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
+
+ /*
+ * Module allocation needs to be completed by making the page
+ * executable. The page is still writable, which is a security hazard,
+ * but anyhow ftrace breaks W^X completely.
+ */
+ set_memory_x((unsigned long)trampoline, npages);
+ return (unsigned long)trampoline;
+fail:
+ tramp_free(trampoline, *tramp_size);
+ return 0;
+}
+
+static unsigned long calc_trampoline_call_offset(bool save_regs)
+{
+ unsigned long start_offset;
+ unsigned long call_offset;
+
+ if (save_regs) {
+ start_offset = (unsigned long)ftrace_regs_caller;
+ call_offset = (unsigned long)ftrace_regs_call;
+ } else {
+ start_offset = (unsigned long)ftrace_caller;
+ call_offset = (unsigned long)ftrace_call;
+ }
+
+ return call_offset - start_offset;
+}
+
+void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
+{
+ ftrace_func_t func;
+ unsigned char *new;
+ unsigned long offset;
+ unsigned long ip;
+ unsigned int size;
+ int ret, npages;
+
+ if (ops->trampoline) {
+ /*
+ * The ftrace_ops caller may set up its own trampoline.
+ * In such a case, this code must not modify it.
+ */
+ if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ return;
+ npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT;
+ set_memory_rw(ops->trampoline, npages);
+ } else {
+ ops->trampoline = create_trampoline(ops, &size);
+ if (!ops->trampoline)
+ return;
+ ops->trampoline_size = size;
+ npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ }
+
+ offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
+ ip = ops->trampoline + offset;
+
+ func = ftrace_ops_get_func(ops);
+
+ ftrace_update_func_call = (unsigned long)func;
+
+ /* Do a safe modify in case the trampoline is executing */
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ ret = update_ftrace_func(ip, new);
+ set_memory_ro(ops->trampoline, npages);
+
+ /* The update should never fail */
+ WARN_ON(ret);
+}
+
+/* Return the address of the function the trampoline calls */
+static void *addr_from_call(void *ptr)
+{
+ union ftrace_code_union calc;
+ int ret;
+
+ ret = probe_kernel_read(&calc, ptr, MCOUNT_INSN_SIZE);
+ if (WARN_ON_ONCE(ret < 0))
+ return NULL;
+
+ /* Make sure this is a call */
+ if (WARN_ON_ONCE(calc.op != 0xe8)) {
+ pr_warn("Expected e8, got %x\n", calc.op);
+ return NULL;
+ }
+
+ return ptr + MCOUNT_INSN_SIZE + calc.offset;
+}
+
+void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
+ unsigned long frame_pointer);
+
+/*
+ * If the ops->trampoline was not allocated, then it probably
+ * has a static trampoline func, or is the ftrace caller itself.
+ */
+static void *static_tramp_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+ unsigned long offset;
+ bool save_regs = rec->flags & FTRACE_FL_REGS_EN;
+ void *ptr;
+
+ if (ops && ops->trampoline) {
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ /*
+ * We only know about function graph tracer setting as static
+ * trampoline.
+ */
+ if (ops->trampoline == FTRACE_GRAPH_ADDR)
+ return (void *)prepare_ftrace_return;
+#endif
+ return NULL;
+ }
+
+ offset = calc_trampoline_call_offset(save_regs);
+
+ if (save_regs)
+ ptr = (void *)FTRACE_REGS_ADDR + offset;
+ else
+ ptr = (void *)FTRACE_ADDR + offset;
+
+ return addr_from_call(ptr);
+}
+
+void *arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+ unsigned long offset;
+
+ /* If we didn't allocate this trampoline, consider it static */
+ if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ return static_tramp_func(ops, rec);
+
+ offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
+ return addr_from_call((void *)ops->trampoline + offset);
+}
+
+void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
+{
+ if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ return;
+
+ tramp_free((void *)ops->trampoline, ops->trampoline_size);
+ ops->trampoline = 0;
+}
+
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern void ftrace_graph_call(void);
+
+static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
+{
+ return ftrace_text_replace(0xe9, ip, addr);
+}
+
+static int ftrace_mod_jmp(unsigned long ip, void *func)
+{
+ unsigned char *new;
+
+ ftrace_update_func_call = 0UL;
+ new = ftrace_jmp_replace(ip, (unsigned long)func);
+
+ return update_ftrace_func(ip, new);
+}
+
+int ftrace_enable_ftrace_graph_caller(void)
+{
+ unsigned long ip = (unsigned long)(&ftrace_graph_call);
+
+ return ftrace_mod_jmp(ip, &ftrace_graph_caller);
+}
+
+int ftrace_disable_ftrace_graph_caller(void)
+{
+ unsigned long ip = (unsigned long)(&ftrace_graph_call);
+
+ return ftrace_mod_jmp(ip, &ftrace_stub);
+}
+
+#endif /* !CONFIG_DYNAMIC_FTRACE */
+
+/*
+ * Hook the return address and push it in the stack of return addrs
+ * in current thread info.
+ */
+void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
+ unsigned long frame_pointer)
+{
+ unsigned long old;
+ int faulted;
+ unsigned long return_hooker = (unsigned long)
+ &return_to_handler;
+
+ /*
+ * When resuming from suspend-to-ram, this function can be indirectly
+ * called from early CPU startup code while the CPU is in real mode,
+ * which would fail miserably. Make sure the stack pointer is a
+ * virtual address.
+ *
+ * This check isn't as accurate as virt_addr_valid(), but it should be
+ * good enough for this purpose, and it's fast.
+ */
+ if (unlikely((long)__builtin_frame_address(0) >= 0))
+ return;
+
+ if (unlikely(ftrace_graph_is_dead()))
+ return;
+
+ if (unlikely(atomic_read(&current->tracing_graph_pause)))
+ return;
+
+ /*
+ * Protect against fault, even if it shouldn't
+ * happen. This tool is too much intrusive to
+ * ignore such a protection.
+ */
+ asm volatile(
+ "1: " _ASM_MOV " (%[parent]), %[old]\n"
+ "2: " _ASM_MOV " %[return_hooker], (%[parent])\n"
+ " movl $0, %[faulted]\n"
+ "3:\n"
+
+ ".section .fixup, \"ax\"\n"
+ "4: movl $1, %[faulted]\n"
+ " jmp 3b\n"
+ ".previous\n"
+
+ _ASM_EXTABLE(1b, 4b)
+ _ASM_EXTABLE(2b, 4b)
+
+ : [old] "=&r" (old), [faulted] "=r" (faulted)
+ : [parent] "r" (parent), [return_hooker] "r" (return_hooker)
+ : "memory"
+ );
+
+ if (unlikely(faulted)) {
+ ftrace_graph_stop();
+ WARN_ON(1);
+ return;
+ }
+
+ if (function_graph_enter(old, self_addr, frame_pointer, parent))
+ *parent = old;
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
new file mode 100644
index 000000000..83f18e829
--- /dev/null
+++ b/arch/x86/kernel/ftrace_32.S
@@ -0,0 +1,250 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2017 Steven Rostedt, VMware Inc.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+#include <asm/segment.h>
+#include <asm/export.h>
+#include <asm/ftrace.h>
+#include <asm/nospec-branch.h>
+#include <asm/frame.h>
+
+#ifdef CC_USING_FENTRY
+# define function_hook __fentry__
+EXPORT_SYMBOL(__fentry__)
+#else
+# define function_hook mcount
+EXPORT_SYMBOL(mcount)
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+/* mcount uses a frame pointer even if CONFIG_FRAME_POINTER is not set */
+#if !defined(CC_USING_FENTRY) || defined(CONFIG_FRAME_POINTER)
+# define USING_FRAME_POINTER
+#endif
+
+#ifdef USING_FRAME_POINTER
+# define MCOUNT_FRAME 1 /* using frame = true */
+#else
+# define MCOUNT_FRAME 0 /* using frame = false */
+#endif
+
+ENTRY(function_hook)
+ ret
+END(function_hook)
+
+ENTRY(ftrace_caller)
+
+#ifdef USING_FRAME_POINTER
+# ifdef CC_USING_FENTRY
+ /*
+ * Frame pointers are of ip followed by bp.
+ * Since fentry is an immediate jump, we are left with
+ * parent-ip, function-ip. We need to add a frame with
+ * parent-ip followed by ebp.
+ */
+ pushl 4(%esp) /* parent ip */
+ pushl %ebp
+ movl %esp, %ebp
+ pushl 2*4(%esp) /* function ip */
+# endif
+ /* For mcount, the function ip is directly above */
+ pushl %ebp
+ movl %esp, %ebp
+#endif
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ pushl $0 /* Pass NULL as regs pointer */
+
+#ifdef USING_FRAME_POINTER
+ /* Load parent ebp into edx */
+ movl 4*4(%esp), %edx
+#else
+ /* There's no frame pointer, load the appropriate stack addr instead */
+ lea 4*4(%esp), %edx
+#endif
+
+ movl (MCOUNT_FRAME+4)*4(%esp), %eax /* load the rip */
+ /* Get the parent ip */
+ movl 4(%edx), %edx /* edx has ebp */
+
+ movl function_trace_op, %ecx
+ subl $MCOUNT_INSN_SIZE, %eax
+
+.globl ftrace_call
+ftrace_call:
+ call ftrace_stub
+
+ addl $4, %esp /* skip NULL pointer */
+ popl %edx
+ popl %ecx
+ popl %eax
+#ifdef USING_FRAME_POINTER
+ popl %ebp
+# ifdef CC_USING_FENTRY
+ addl $4,%esp /* skip function ip */
+ popl %ebp /* this is the orig bp */
+ addl $4, %esp /* skip parent ip */
+# endif
+#endif
+.Lftrace_ret:
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+ jmp ftrace_stub
+#endif
+
+/* This is weak to keep gas from relaxing the jumps */
+WEAK(ftrace_stub)
+ ret
+END(ftrace_caller)
+
+ENTRY(ftrace_regs_caller)
+ /*
+ * i386 does not save SS and ESP when coming from kernel.
+ * Instead, to get sp, &regs->sp is used (see ptrace.h).
+ * Unfortunately, that means eflags must be at the same location
+ * as the current return ip is. We move the return ip into the
+ * regs->ip location, and move flags into the return ip location.
+ */
+ pushl $__KERNEL_CS
+ pushl 4(%esp) /* Save the return ip */
+ pushl $0 /* Load 0 into orig_ax */
+ pushl %gs
+ pushl %fs
+ pushl %es
+ pushl %ds
+ pushl %eax
+
+ /* Get flags and place them into the return ip slot */
+ pushf
+ popl %eax
+ movl %eax, 8*4(%esp)
+
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+
+ ENCODE_FRAME_POINTER
+
+ movl 12*4(%esp), %eax /* Load ip (1st parameter) */
+ subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
+#ifdef CC_USING_FENTRY
+ movl 15*4(%esp), %edx /* Load parent ip (2nd parameter) */
+#else
+ movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
+#endif
+ movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
+ pushl %esp /* Save pt_regs as 4th parameter */
+
+GLOBAL(ftrace_regs_call)
+ call ftrace_stub
+
+ addl $4, %esp /* Skip pt_regs */
+
+ /* restore flags */
+ push 14*4(%esp)
+ popf
+
+ /* Move return ip back to its original location */
+ movl 12*4(%esp), %eax
+ movl %eax, 14*4(%esp)
+
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %eax
+ popl %ds
+ popl %es
+ popl %fs
+ popl %gs
+
+ /* use lea to not affect flags */
+ lea 3*4(%esp), %esp /* Skip orig_ax, ip and cs */
+
+ jmp .Lftrace_ret
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+ENTRY(function_hook)
+ cmpl $__PAGE_OFFSET, %esp
+ jb ftrace_stub /* Paging not enabled yet? */
+
+ cmpl $ftrace_stub, ftrace_trace_function
+ jnz .Ltrace
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpl $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
+
+ cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
+#endif
+.globl ftrace_stub
+ftrace_stub:
+ ret
+
+ /* taken from glibc */
+.Ltrace:
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %eax
+ movl 0x4(%ebp), %edx
+ subl $MCOUNT_INSN_SIZE, %eax
+
+ movl ftrace_trace_function, %ecx
+ CALL_NOSPEC %ecx
+
+ popl %edx
+ popl %ecx
+ popl %eax
+ jmp ftrace_stub
+END(function_hook)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 3*4(%esp), %eax
+ /* Even with frame pointers, fentry doesn't have one here */
+#ifdef CC_USING_FENTRY
+ lea 4*4(%esp), %edx
+ movl $0, %ecx
+#else
+ lea 0x4(%ebp), %edx
+ movl (%ebp), %ecx
+#endif
+ subl $MCOUNT_INSN_SIZE, %eax
+ call prepare_ftrace_return
+ popl %edx
+ popl %ecx
+ popl %eax
+ ret
+END(ftrace_graph_caller)
+
+.globl return_to_handler
+return_to_handler:
+ pushl %eax
+ pushl %edx
+#ifdef CC_USING_FENTRY
+ movl $0, %eax
+#else
+ movl %ebp, %eax
+#endif
+ call ftrace_return_to_handler
+ movl %eax, %ecx
+ popl %edx
+ popl %eax
+ JMP_NOSPEC %ecx
+#endif
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
new file mode 100644
index 000000000..24b9abf71
--- /dev/null
+++ b/arch/x86/kernel/ftrace_64.S
@@ -0,0 +1,342 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2014 Steven Rostedt, Red Hat Inc
+ */
+
+#include <linux/linkage.h>
+#include <asm/ptrace.h>
+#include <asm/ftrace.h>
+#include <asm/export.h>
+#include <asm/nospec-branch.h>
+#include <asm/unwind_hints.h>
+#include <asm/frame.h>
+
+ .code64
+ .section .entry.text, "ax"
+
+#ifdef CC_USING_FENTRY
+# define function_hook __fentry__
+EXPORT_SYMBOL(__fentry__)
+#else
+# define function_hook mcount
+EXPORT_SYMBOL(mcount)
+#endif
+
+#ifdef CONFIG_FRAME_POINTER
+# ifdef CC_USING_FENTRY
+/* Save parent and function stack frames (rip and rbp) */
+# define MCOUNT_FRAME_SIZE (8+16*2)
+# else
+/* Save just function stack frame (rip and rbp) */
+# define MCOUNT_FRAME_SIZE (8+16)
+# endif
+#else
+/* No need to save a stack frame */
+# define MCOUNT_FRAME_SIZE 0
+#endif /* CONFIG_FRAME_POINTER */
+
+/* Size of stack used to save mcount regs in save_mcount_regs */
+#define MCOUNT_REG_SIZE (SS+8 + MCOUNT_FRAME_SIZE)
+
+/*
+ * gcc -pg option adds a call to 'mcount' in most functions.
+ * When -mfentry is used, the call is to 'fentry' and not 'mcount'
+ * and is done before the function's stack frame is set up.
+ * They both require a set of regs to be saved before calling
+ * any C code and restored before returning back to the function.
+ *
+ * On boot up, all these calls are converted into nops. When tracing
+ * is enabled, the call can jump to either ftrace_caller or
+ * ftrace_regs_caller. Callbacks (tracing functions) that require
+ * ftrace_regs_caller (like kprobes) need to have pt_regs passed to
+ * it. For this reason, the size of the pt_regs structure will be
+ * allocated on the stack and the required mcount registers will
+ * be saved in the locations that pt_regs has them in.
+ */
+
+/*
+ * @added: the amount of stack added before calling this
+ *
+ * After this is called, the following registers contain:
+ *
+ * %rdi - holds the address that called the trampoline
+ * %rsi - holds the parent function (traced function's return address)
+ * %rdx - holds the original %rbp
+ */
+.macro save_mcount_regs added=0
+
+#ifdef CONFIG_FRAME_POINTER
+ /* Save the original rbp */
+ pushq %rbp
+
+ /*
+ * Stack traces will stop at the ftrace trampoline if the frame pointer
+ * is not set up properly. If fentry is used, we need to save a frame
+ * pointer for the parent as well as the function traced, because the
+ * fentry is called before the stack frame is set up, where as mcount
+ * is called afterward.
+ */
+#ifdef CC_USING_FENTRY
+ /* Save the parent pointer (skip orig rbp and our return address) */
+ pushq \added+8*2(%rsp)
+ pushq %rbp
+ movq %rsp, %rbp
+ /* Save the return address (now skip orig rbp, rbp and parent) */
+ pushq \added+8*3(%rsp)
+#else
+ /* Can't assume that rip is before this (unless added was zero) */
+ pushq \added+8(%rsp)
+#endif
+ pushq %rbp
+ movq %rsp, %rbp
+#endif /* CONFIG_FRAME_POINTER */
+
+ /*
+ * We add enough stack to save all regs.
+ */
+ subq $(MCOUNT_REG_SIZE - MCOUNT_FRAME_SIZE), %rsp
+ movq %rax, RAX(%rsp)
+ movq %rcx, RCX(%rsp)
+ movq %rdx, RDX(%rsp)
+ movq %rsi, RSI(%rsp)
+ movq %rdi, RDI(%rsp)
+ movq %r8, R8(%rsp)
+ movq %r9, R9(%rsp)
+ /*
+ * Save the original RBP. Even though the mcount ABI does not
+ * require this, it helps out callers.
+ */
+#ifdef CONFIG_FRAME_POINTER
+ movq MCOUNT_REG_SIZE-8(%rsp), %rdx
+#else
+ movq %rbp, %rdx
+#endif
+ movq %rdx, RBP(%rsp)
+
+ /* Copy the parent address into %rsi (second parameter) */
+#ifdef CC_USING_FENTRY
+ movq MCOUNT_REG_SIZE+8+\added(%rsp), %rsi
+#else
+ /* %rdx contains original %rbp */
+ movq 8(%rdx), %rsi
+#endif
+
+ /* Move RIP to its proper location */
+ movq MCOUNT_REG_SIZE+\added(%rsp), %rdi
+ movq %rdi, RIP(%rsp)
+
+ /*
+ * Now %rdi (the first parameter) has the return address of
+ * where ftrace_call returns. But the callbacks expect the
+ * address of the call itself.
+ */
+ subq $MCOUNT_INSN_SIZE, %rdi
+ .endm
+
+.macro restore_mcount_regs
+ movq R9(%rsp), %r9
+ movq R8(%rsp), %r8
+ movq RDI(%rsp), %rdi
+ movq RSI(%rsp), %rsi
+ movq RDX(%rsp), %rdx
+ movq RCX(%rsp), %rcx
+ movq RAX(%rsp), %rax
+
+ /* ftrace_regs_caller can modify %rbp */
+ movq RBP(%rsp), %rbp
+
+ addq $MCOUNT_REG_SIZE, %rsp
+
+ .endm
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+ENTRY(function_hook)
+ retq
+ENDPROC(function_hook)
+
+ENTRY(ftrace_caller)
+ /* save_mcount_regs fills in first two parameters */
+ save_mcount_regs
+
+GLOBAL(ftrace_caller_op_ptr)
+ /* Load the ftrace_ops into the 3rd parameter */
+ movq function_trace_op(%rip), %rdx
+
+ /* regs go into 4th parameter (but make it NULL) */
+ movq $0, %rcx
+
+GLOBAL(ftrace_call)
+ call ftrace_stub
+
+ restore_mcount_regs
+
+ /*
+ * The code up to this label is copied into trampolines so
+ * think twice before adding any new code or changing the
+ * layout here.
+ */
+GLOBAL(ftrace_epilogue)
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+GLOBAL(ftrace_graph_call)
+ jmp ftrace_stub
+#endif
+
+/*
+ * This is weak to keep gas from relaxing the jumps.
+ * It is also used to copy the retq for trampolines.
+ */
+WEAK(ftrace_stub)
+ retq
+ENDPROC(ftrace_caller)
+
+ENTRY(ftrace_regs_caller)
+ /* Save the current flags before any operations that can change them */
+ pushfq
+
+ /* added 8 bytes to save flags */
+ save_mcount_regs 8
+ /* save_mcount_regs fills in first two parameters */
+
+GLOBAL(ftrace_regs_caller_op_ptr)
+ /* Load the ftrace_ops into the 3rd parameter */
+ movq function_trace_op(%rip), %rdx
+
+ /* Save the rest of pt_regs */
+ movq %r15, R15(%rsp)
+ movq %r14, R14(%rsp)
+ movq %r13, R13(%rsp)
+ movq %r12, R12(%rsp)
+ movq %r11, R11(%rsp)
+ movq %r10, R10(%rsp)
+ movq %rbx, RBX(%rsp)
+ /* Copy saved flags */
+ movq MCOUNT_REG_SIZE(%rsp), %rcx
+ movq %rcx, EFLAGS(%rsp)
+ /* Kernel segments */
+ movq $__KERNEL_DS, %rcx
+ movq %rcx, SS(%rsp)
+ movq $__KERNEL_CS, %rcx
+ movq %rcx, CS(%rsp)
+ /* Stack - skipping return address and flags */
+ leaq MCOUNT_REG_SIZE+8*2(%rsp), %rcx
+ movq %rcx, RSP(%rsp)
+
+ ENCODE_FRAME_POINTER
+
+ /* regs go into 4th parameter */
+ leaq (%rsp), %rcx
+
+GLOBAL(ftrace_regs_call)
+ call ftrace_stub
+
+ /* Copy flags back to SS, to restore them */
+ movq EFLAGS(%rsp), %rax
+ movq %rax, MCOUNT_REG_SIZE(%rsp)
+
+ /* Handlers can change the RIP */
+ movq RIP(%rsp), %rax
+ movq %rax, MCOUNT_REG_SIZE+8(%rsp)
+
+ /* restore the rest of pt_regs */
+ movq R15(%rsp), %r15
+ movq R14(%rsp), %r14
+ movq R13(%rsp), %r13
+ movq R12(%rsp), %r12
+ movq R10(%rsp), %r10
+ movq RBX(%rsp), %rbx
+
+ restore_mcount_regs
+
+ /* Restore flags */
+ popfq
+
+ /*
+ * As this jmp to ftrace_epilogue can be a short jump
+ * it must not be copied into the trampoline.
+ * The trampoline will add the code to jump
+ * to the return.
+ */
+GLOBAL(ftrace_regs_caller_end)
+
+ jmp ftrace_epilogue
+
+ENDPROC(ftrace_regs_caller)
+
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+ENTRY(function_hook)
+ cmpq $ftrace_stub, ftrace_trace_function
+ jnz trace
+
+fgraph_trace:
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpq $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
+
+ cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
+#endif
+
+GLOBAL(ftrace_stub)
+ retq
+
+trace:
+ /* save_mcount_regs fills in first two parameters */
+ save_mcount_regs
+
+ /*
+ * When DYNAMIC_FTRACE is not defined, ARCH_SUPPORTS_FTRACE_OPS is not
+ * set (see include/asm/ftrace.h and include/linux/ftrace.h). Only the
+ * ip and parent ip are used and the list function is called when
+ * function tracing is enabled.
+ */
+ movq ftrace_trace_function, %r8
+ CALL_NOSPEC %r8
+ restore_mcount_regs
+
+ jmp fgraph_trace
+ENDPROC(function_hook)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ /* Saves rbp into %rdx and fills first parameter */
+ save_mcount_regs
+
+#ifdef CC_USING_FENTRY
+ leaq MCOUNT_REG_SIZE+8(%rsp), %rsi
+ movq $0, %rdx /* No framepointers needed */
+#else
+ /* Save address of the return address of traced function */
+ leaq 8(%rdx), %rsi
+ /* ftrace does sanity checks against frame pointers */
+ movq (%rdx), %rdx
+#endif
+ call prepare_ftrace_return
+
+ restore_mcount_regs
+
+ retq
+ENDPROC(ftrace_graph_caller)
+
+ENTRY(return_to_handler)
+ UNWIND_HINT_EMPTY
+ subq $24, %rsp
+
+ /* Save the return values */
+ movq %rax, (%rsp)
+ movq %rdx, 8(%rsp)
+ movq %rbp, %rdi
+
+ call ftrace_return_to_handler
+
+ movq %rax, %rdi
+ movq 8(%rsp), %rdx
+ movq (%rsp), %rax
+ addq $24, %rsp
+ JMP_NOSPEC %rdi
+END(return_to_handler)
+#endif
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
new file mode 100644
index 000000000..ec6fefbfd
--- /dev/null
+++ b/arch/x86/kernel/head32.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/arch/i386/kernel/head32.c -- prepare to run common code
+ *
+ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright (C) 2007 Eric Biederman <ebiederm@xmission.com>
+ */
+
+#include <linux/init.h>
+#include <linux/start_kernel.h>
+#include <linux/mm.h>
+#include <linux/memblock.h>
+
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/e820/api.h>
+#include <asm/page.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/bios_ebda.h>
+#include <asm/tlbflush.h>
+#include <asm/bootparam_utils.h>
+
+static void __init i386_default_early_setup(void)
+{
+ /* Initialize 32bit specific setup functions */
+ x86_init.resources.reserve_resources = i386_reserve_resources;
+ x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
+}
+
+asmlinkage __visible void __init i386_start_kernel(void)
+{
+ /* Make sure IDT is set up before any exception happens */
+ idt_setup_early_handler();
+
+ cr4_init_shadow();
+
+ sanitize_boot_params(&boot_params);
+
+ x86_early_init_platform_quirks();
+
+ /* Call the subarch specific early setup function */
+ switch (boot_params.hdr.hardware_subarch) {
+ case X86_SUBARCH_INTEL_MID:
+ x86_intel_mid_early_setup();
+ break;
+ case X86_SUBARCH_CE4100:
+ x86_ce4100_early_setup();
+ break;
+ default:
+ i386_default_early_setup();
+ break;
+ }
+
+ start_kernel();
+}
+
+/*
+ * Initialize page tables. This creates a PDE and a set of page
+ * tables, which are located immediately beyond __brk_base. The variable
+ * _brk_end is set up to point to the first "safe" location.
+ * Mappings are created both at virtual address 0 (identity mapping)
+ * and PAGE_OFFSET for up to _end.
+ *
+ * In PAE mode initial_page_table is statically defined to contain
+ * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
+ * entries). The identity mapping is handled by pointing two PGD entries
+ * to the first kernel PMD. Note the upper half of each PMD or PTE are
+ * always zero at this stage.
+ */
+void __init mk_early_pgtbl_32(void)
+{
+#ifdef __pa
+#undef __pa
+#endif
+#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET)
+ pte_t pte, *ptep;
+ int i;
+ unsigned long *ptr;
+ /* Enough space to fit pagetables for the low memory linear map */
+ const unsigned long limit = __pa(_end) +
+ (PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT);
+#ifdef CONFIG_X86_PAE
+ pmd_t pl2, *pl2p = (pmd_t *)__pa(initial_pg_pmd);
+#define SET_PL2(pl2, val) { (pl2).pmd = (val); }
+#else
+ pgd_t pl2, *pl2p = (pgd_t *)__pa(initial_page_table);
+#define SET_PL2(pl2, val) { (pl2).pgd = (val); }
+#endif
+
+ ptep = (pte_t *)__pa(__brk_base);
+ pte.pte = PTE_IDENT_ATTR;
+
+ while ((pte.pte & PTE_PFN_MASK) < limit) {
+
+ SET_PL2(pl2, (unsigned long)ptep | PDE_IDENT_ATTR);
+ *pl2p = pl2;
+#ifndef CONFIG_X86_PAE
+ /* Kernel PDE entry */
+ *(pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = pl2;
+#endif
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ *ptep = pte;
+ pte.pte += PAGE_SIZE;
+ ptep++;
+ }
+
+ pl2p++;
+ }
+
+ ptr = (unsigned long *)__pa(&max_pfn_mapped);
+ /* Can't use pte_pfn() since it's a call with CONFIG_PARAVIRT */
+ *ptr = (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
+
+ ptr = (unsigned long *)__pa(&_brk_end);
+ *ptr = (unsigned long)ptep + PAGE_OFFSET;
+}
+
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
new file mode 100644
index 000000000..88dc38b4a
--- /dev/null
+++ b/arch/x86/kernel/head64.c
@@ -0,0 +1,491 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * prepare to run common code
+ *
+ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ */
+
+#define DISABLE_BRANCH_PROFILING
+
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/percpu.h>
+#include <linux/start_kernel.h>
+#include <linux/io.h>
+#include <linux/memblock.h>
+#include <linux/mem_encrypt.h>
+
+#include <asm/processor.h>
+#include <asm/proto.h>
+#include <asm/smp.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+#include <asm/kdebug.h>
+#include <asm/e820/api.h>
+#include <asm/bios_ebda.h>
+#include <asm/bootparam_utils.h>
+#include <asm/microcode.h>
+#include <asm/kasan.h>
+#include <asm/fixmap.h>
+
+/*
+ * Manage page tables very early on.
+ */
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+static unsigned int __initdata next_early_pgt;
+pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
+
+#ifdef CONFIG_X86_5LEVEL
+unsigned int __pgtable_l5_enabled __ro_after_init;
+unsigned int pgdir_shift __ro_after_init = 39;
+EXPORT_SYMBOL(pgdir_shift);
+unsigned int ptrs_per_p4d __ro_after_init = 1;
+EXPORT_SYMBOL(ptrs_per_p4d);
+#endif
+
+#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT
+unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4;
+EXPORT_SYMBOL(page_offset_base);
+unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4;
+EXPORT_SYMBOL(vmalloc_base);
+unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4;
+EXPORT_SYMBOL(vmemmap_base);
+#endif
+
+#define __head __section(.head.text)
+
+static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
+{
+ return ptr - (void *)_text + (void *)physaddr;
+}
+
+static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr)
+{
+ return fixup_pointer(ptr, physaddr);
+}
+
+#ifdef CONFIG_X86_5LEVEL
+static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr)
+{
+ return fixup_pointer(ptr, physaddr);
+}
+
+static bool __head check_la57_support(unsigned long physaddr)
+{
+ /*
+ * 5-level paging is detected and enabled at kernel decomression
+ * stage. Only check if it has been enabled there.
+ */
+ if (!(native_read_cr4() & X86_CR4_LA57))
+ return false;
+
+ *fixup_int(&__pgtable_l5_enabled, physaddr) = 1;
+ *fixup_int(&pgdir_shift, physaddr) = 48;
+ *fixup_int(&ptrs_per_p4d, physaddr) = 512;
+ *fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5;
+ *fixup_long(&vmalloc_base, physaddr) = __VMALLOC_BASE_L5;
+ *fixup_long(&vmemmap_base, physaddr) = __VMEMMAP_BASE_L5;
+
+ return true;
+}
+#else
+static bool __head check_la57_support(unsigned long physaddr)
+{
+ return false;
+}
+#endif
+
+/* Code in __startup_64() can be relocated during execution, but the compiler
+ * doesn't have to generate PC-relative relocations when accessing globals from
+ * that function. Clang actually does not generate them, which leads to
+ * boot-time crashes. To work around this problem, every global pointer must
+ * be adjusted using fixup_pointer().
+ */
+unsigned long __head __startup_64(unsigned long physaddr,
+ struct boot_params *bp)
+{
+ unsigned long vaddr, vaddr_end;
+ unsigned long load_delta, *p;
+ unsigned long pgtable_flags;
+ pgdval_t *pgd;
+ p4dval_t *p4d;
+ pudval_t *pud;
+ pmdval_t *pmd, pmd_entry;
+ pteval_t *mask_ptr;
+ bool la57;
+ int i;
+ unsigned int *next_pgt_ptr;
+
+ la57 = check_la57_support(physaddr);
+
+ /* Is the address too large? */
+ if (physaddr >> MAX_PHYSMEM_BITS)
+ for (;;);
+
+ /*
+ * Compute the delta between the address I am compiled to run at
+ * and the address I am actually running at.
+ */
+ load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
+
+ /* Is the address not 2M aligned? */
+ if (load_delta & ~PMD_PAGE_MASK)
+ for (;;);
+
+ /* Activate Secure Memory Encryption (SME) if supported and enabled */
+ sme_enable(bp);
+
+ /* Include the SME encryption mask in the fixup value */
+ load_delta += sme_get_me_mask();
+
+ /* Fixup the physical addresses in the page table */
+
+ pgd = fixup_pointer(&early_top_pgt, physaddr);
+ p = pgd + pgd_index(__START_KERNEL_map);
+ if (la57)
+ *p = (unsigned long)level4_kernel_pgt;
+ else
+ *p = (unsigned long)level3_kernel_pgt;
+ *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
+
+ if (la57) {
+ p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
+ p4d[511] += load_delta;
+ }
+
+ pud = fixup_pointer(&level3_kernel_pgt, physaddr);
+ pud[510] += load_delta;
+ pud[511] += load_delta;
+
+ pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
+ for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
+ pmd[i] += load_delta;
+
+ /*
+ * Set up the identity mapping for the switchover. These
+ * entries should *NOT* have the global bit set! This also
+ * creates a bunch of nonsense entries but that is fine --
+ * it avoids problems around wraparound.
+ */
+
+ next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr);
+ pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
+ pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
+
+ pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
+
+ if (la57) {
+ p4d = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++],
+ physaddr);
+
+ i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+ pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
+ pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
+
+ i = physaddr >> P4D_SHIFT;
+ p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
+ p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
+ } else {
+ i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+ pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
+ pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
+ }
+
+ i = physaddr >> PUD_SHIFT;
+ pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
+ pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
+
+ pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
+ /* Filter out unsupported __PAGE_KERNEL_* bits: */
+ mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr);
+ pmd_entry &= *mask_ptr;
+ pmd_entry += sme_get_me_mask();
+ pmd_entry += physaddr;
+
+ for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
+ int idx = i + (physaddr >> PMD_SHIFT);
+
+ pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
+ }
+
+ /*
+ * Fixup the kernel text+data virtual addresses. Note that
+ * we might write invalid pmds, when the kernel is relocated
+ * cleanup_highmap() fixes this up along with the mappings
+ * beyond _end.
+ *
+ * Only the region occupied by the kernel image has so far
+ * been checked against the table of usable memory regions
+ * provided by the firmware, so invalidate pages outside that
+ * region. A page table entry that maps to a reserved area of
+ * memory would allow processor speculation into that area,
+ * and on some hardware (particularly the UV platform) even
+ * speculative access to some reserved areas is caught as an
+ * error, causing the BIOS to halt the system.
+ */
+
+ pmd = fixup_pointer(level2_kernel_pgt, physaddr);
+
+ /* invalidate pages before the kernel image */
+ for (i = 0; i < pmd_index((unsigned long)_text); i++)
+ pmd[i] &= ~_PAGE_PRESENT;
+
+ /* fixup pages that are part of the kernel image */
+ for (; i <= pmd_index((unsigned long)_end); i++)
+ if (pmd[i] & _PAGE_PRESENT)
+ pmd[i] += load_delta;
+
+ /* invalidate pages after the kernel image */
+ for (; i < PTRS_PER_PMD; i++)
+ pmd[i] &= ~_PAGE_PRESENT;
+
+ /*
+ * Fixup phys_base - remove the memory encryption mask to obtain
+ * the true physical address.
+ */
+ *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask();
+
+ /* Encrypt the kernel and related (if SME is active) */
+ sme_encrypt_kernel(bp);
+
+ /*
+ * Clear the memory encryption mask from the .bss..decrypted section.
+ * The bss section will be memset to zero later in the initialization so
+ * there is no need to zero it after changing the memory encryption
+ * attribute.
+ */
+ if (mem_encrypt_active()) {
+ vaddr = (unsigned long)__start_bss_decrypted;
+ vaddr_end = (unsigned long)__end_bss_decrypted;
+ for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
+ i = pmd_index(vaddr);
+ pmd[i] -= sme_get_me_mask();
+ }
+ }
+
+ /*
+ * Return the SME encryption mask (if SME is active) to be used as a
+ * modifier for the initial pgdir entry programmed into CR3.
+ */
+ return sme_get_me_mask();
+}
+
+unsigned long __startup_secondary_64(void)
+{
+ /*
+ * Return the SME encryption mask (if SME is active) to be used as a
+ * modifier for the initial pgdir entry programmed into CR3.
+ */
+ return sme_get_me_mask();
+}
+
+/* Wipe all early page tables except for the kernel symbol map */
+static void __init reset_early_page_tables(void)
+{
+ memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
+ next_early_pgt = 0;
+ write_cr3(__sme_pa_nodebug(early_top_pgt));
+}
+
+/* Create a new PMD entry */
+int __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
+{
+ unsigned long physaddr = address - __PAGE_OFFSET;
+ pgdval_t pgd, *pgd_p;
+ p4dval_t p4d, *p4d_p;
+ pudval_t pud, *pud_p;
+ pmdval_t *pmd_p;
+
+ /* Invalid address or early pgt is done ? */
+ if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
+ return -1;
+
+again:
+ pgd_p = &early_top_pgt[pgd_index(address)].pgd;
+ pgd = *pgd_p;
+
+ /*
+ * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
+ * critical -- __PAGE_OFFSET would point us back into the dynamic
+ * range and we might end up looping forever...
+ */
+ if (!pgtable_l5_enabled())
+ p4d_p = pgd_p;
+ else if (pgd)
+ p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++];
+ memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
+ *pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ p4d_p += p4d_index(address);
+ p4d = *p4d_p;
+
+ if (p4d)
+ pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
+ memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
+ *p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ pud_p += pud_index(address);
+ pud = *pud_p;
+
+ if (pud)
+ pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
+ else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+ reset_early_page_tables();
+ goto again;
+ }
+
+ pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
+ memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
+ *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
+ }
+ pmd_p[pmd_index(address)] = pmd;
+
+ return 0;
+}
+
+int __init early_make_pgtable(unsigned long address)
+{
+ unsigned long physaddr = address - __PAGE_OFFSET;
+ pmdval_t pmd;
+
+ pmd = (physaddr & PMD_MASK) + early_pmd_flags;
+
+ return __early_make_pgtable(address, pmd);
+}
+
+/* Don't add a printk in there. printk relies on the PDA which is not initialized
+ yet. */
+static void __init clear_bss(void)
+{
+ memset(__bss_start, 0,
+ (unsigned long) __bss_stop - (unsigned long) __bss_start);
+}
+
+static unsigned long get_cmd_line_ptr(void)
+{
+ unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+ cmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32;
+
+ return cmd_line_ptr;
+}
+
+static void __init copy_bootdata(char *real_mode_data)
+{
+ char * command_line;
+ unsigned long cmd_line_ptr;
+
+ /*
+ * If SME is active, this will create decrypted mappings of the
+ * boot data in advance of the copy operations.
+ */
+ sme_map_bootdata(real_mode_data);
+
+ memcpy(&boot_params, real_mode_data, sizeof boot_params);
+ sanitize_boot_params(&boot_params);
+ cmd_line_ptr = get_cmd_line_ptr();
+ if (cmd_line_ptr) {
+ command_line = __va(cmd_line_ptr);
+ memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
+ }
+
+ /*
+ * The old boot data is no longer needed and won't be reserved,
+ * freeing up that memory for use by the system. If SME is active,
+ * we need to remove the mappings that were created so that the
+ * memory doesn't remain mapped as decrypted.
+ */
+ sme_unmap_bootdata(real_mode_data);
+}
+
+asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
+{
+ /*
+ * Build-time sanity checks on the kernel image and module
+ * area mappings. (these are purely build-time and produce no code)
+ */
+ BUILD_BUG_ON(MODULES_VADDR < __START_KERNEL_map);
+ BUILD_BUG_ON(MODULES_VADDR - __START_KERNEL_map < KERNEL_IMAGE_SIZE);
+ BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
+ BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0);
+ BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
+ BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
+ MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
+ (__START_KERNEL & PGDIR_MASK)));
+ BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
+
+ cr4_init_shadow();
+
+ /* Kill off the identity-map trampoline */
+ reset_early_page_tables();
+
+ clear_bss();
+
+ clear_page(init_top_pgt);
+
+ /*
+ * SME support may update early_pmd_flags to include the memory
+ * encryption mask, so it needs to be called before anything
+ * that may generate a page fault.
+ */
+ sme_early_init();
+
+ kasan_early_init();
+
+ idt_setup_early_handler();
+
+ copy_bootdata(__va(real_mode_data));
+
+ /*
+ * Load microcode early on BSP.
+ */
+ load_ucode_bsp();
+
+ /* set init_top_pgt kernel high mapping*/
+ init_top_pgt[511] = early_top_pgt[511];
+
+ x86_64_start_reservations(real_mode_data);
+}
+
+void __init x86_64_start_reservations(char *real_mode_data)
+{
+ /* version is always not zero if it is copied */
+ if (!boot_params.hdr.version)
+ copy_bootdata(__va(real_mode_data));
+
+ x86_early_init_platform_quirks();
+
+ switch (boot_params.hdr.hardware_subarch) {
+ case X86_SUBARCH_INTEL_MID:
+ x86_intel_mid_early_setup();
+ break;
+ default:
+ break;
+ }
+
+ start_kernel();
+}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
new file mode 100644
index 000000000..30f9cb2c0
--- /dev/null
+++ b/arch/x86/kernel/head_32.S
@@ -0,0 +1,623 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Enhanced CPU detection and feature setting code by Mike Jagdis
+ * and Martin Mares, November 1997.
+ */
+
+.text
+#include <linux/threads.h>
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
+#include <asm/cache.h>
+#include <asm/thread_info.h>
+#include <asm/asm-offsets.h>
+#include <asm/setup.h>
+#include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+#include <asm/cpufeatures.h>
+#include <asm/percpu.h>
+#include <asm/nops.h>
+#include <asm/bootparam.h>
+#include <asm/export.h>
+#include <asm/pgtable_32.h>
+
+/* Physical address */
+#define pa(X) ((X) - __PAGE_OFFSET)
+
+/*
+ * References to members of the new_cpu_data structure.
+ */
+
+#define X86 new_cpu_data+CPUINFO_x86
+#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor
+#define X86_MODEL new_cpu_data+CPUINFO_x86_model
+#define X86_STEPPING new_cpu_data+CPUINFO_x86_stepping
+#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math
+#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level
+#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
+#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
+
+
+#define SIZEOF_PTREGS 17*4
+
+/*
+ * Worst-case size of the kernel mapping we need to make:
+ * a relocatable kernel can live anywhere in lowmem, so we need to be able
+ * to map all of lowmem.
+ */
+KERNEL_PAGES = LOWMEM_PAGES
+
+INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
+RESERVE_BRK(pagetables, INIT_MAP_SIZE)
+
+/*
+ * 32-bit kernel entrypoint; only used by the boot CPU. On entry,
+ * %esi points to the real-mode code as a 32-bit pointer.
+ * CS and DS must be 4 GB flat segments, but we don't depend on
+ * any particular GDT layout, because we load our own as soon as we
+ * can.
+ */
+__HEAD
+ENTRY(startup_32)
+ movl pa(initial_stack),%ecx
+
+ /* test KEEP_SEGMENTS flag to see if the bootloader is asking
+ us to not reload segments */
+ testb $KEEP_SEGMENTS, BP_loadflags(%esi)
+ jnz 2f
+
+/*
+ * Set segments to known values.
+ */
+ lgdt pa(boot_gdt_descr)
+ movl $(__BOOT_DS),%eax
+ movl %eax,%ds
+ movl %eax,%es
+ movl %eax,%fs
+ movl %eax,%gs
+ movl %eax,%ss
+2:
+ leal -__PAGE_OFFSET(%ecx),%esp
+
+/*
+ * Clear BSS first so that there are no surprises...
+ */
+ cld
+ xorl %eax,%eax
+ movl $pa(__bss_start),%edi
+ movl $pa(__bss_stop),%ecx
+ subl %edi,%ecx
+ shrl $2,%ecx
+ rep ; stosl
+/*
+ * Copy bootup parameters out of the way.
+ * Note: %esi still has the pointer to the real-mode data.
+ * With the kexec as boot loader, parameter segment might be loaded beyond
+ * kernel image and might not even be addressable by early boot page tables.
+ * (kexec on panic case). Hence copy out the parameters before initializing
+ * page tables.
+ */
+ movl $pa(boot_params),%edi
+ movl $(PARAM_SIZE/4),%ecx
+ cld
+ rep
+ movsl
+ movl pa(boot_params) + NEW_CL_POINTER,%esi
+ andl %esi,%esi
+ jz 1f # No command line
+ movl $pa(boot_command_line),%edi
+ movl $(COMMAND_LINE_SIZE/4),%ecx
+ rep
+ movsl
+1:
+
+#ifdef CONFIG_OLPC
+ /* save OFW's pgdir table for later use when calling into OFW */
+ movl %cr3, %eax
+ movl %eax, pa(olpc_ofw_pgd)
+#endif
+
+#ifdef CONFIG_MICROCODE
+ /* Early load ucode on BSP. */
+ call load_ucode_bsp
+#endif
+
+ /* Create early pagetables. */
+ call mk_early_pgtbl_32
+
+ /* Do early initialization of the fixmap area */
+ movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
+#ifdef CONFIG_X86_PAE
+#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
+ movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
+#else
+ movl %eax,pa(initial_page_table+0xffc)
+#endif
+
+#ifdef CONFIG_PARAVIRT
+ /* This is can only trip for a broken bootloader... */
+ cmpw $0x207, pa(boot_params + BP_version)
+ jb .Ldefault_entry
+
+ /* Paravirt-compatible boot parameters. Look to see what architecture
+ we're booting under. */
+ movl pa(boot_params + BP_hardware_subarch), %eax
+ cmpl $num_subarch_entries, %eax
+ jae .Lbad_subarch
+
+ movl pa(subarch_entries)(,%eax,4), %eax
+ subl $__PAGE_OFFSET, %eax
+ jmp *%eax
+
+.Lbad_subarch:
+WEAK(xen_entry)
+ /* Unknown implementation; there's really
+ nothing we can do at this point. */
+ ud2a
+
+ __INITDATA
+
+subarch_entries:
+ .long .Ldefault_entry /* normal x86/PC */
+ .long xen_entry /* Xen hypervisor */
+ .long .Ldefault_entry /* Moorestown MID */
+num_subarch_entries = (. - subarch_entries) / 4
+.previous
+#else
+ jmp .Ldefault_entry
+#endif /* CONFIG_PARAVIRT */
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Boot CPU0 entry point. It's called from play_dead(). Everything has been set
+ * up already except stack. We just set up stack here. Then call
+ * start_secondary().
+ */
+ENTRY(start_cpu0)
+ movl initial_stack, %ecx
+ movl %ecx, %esp
+ call *(initial_code)
+1: jmp 1b
+ENDPROC(start_cpu0)
+#endif
+
+/*
+ * Non-boot CPU entry point; entered from trampoline.S
+ * We can't lgdt here, because lgdt itself uses a data segment, but
+ * we know the trampoline has already loaded the boot_gdt for us.
+ *
+ * If cpu hotplug is not supported then this code can go in init section
+ * which will be freed later
+ */
+ENTRY(startup_32_smp)
+ cld
+ movl $(__BOOT_DS),%eax
+ movl %eax,%ds
+ movl %eax,%es
+ movl %eax,%fs
+ movl %eax,%gs
+ movl pa(initial_stack),%ecx
+ movl %eax,%ss
+ leal -__PAGE_OFFSET(%ecx),%esp
+
+#ifdef CONFIG_MICROCODE
+ /* Early load ucode on AP. */
+ call load_ucode_ap
+#endif
+
+.Ldefault_entry:
+ movl $(CR0_STATE & ~X86_CR0_PG),%eax
+ movl %eax,%cr0
+
+/*
+ * We want to start out with EFLAGS unambiguously cleared. Some BIOSes leave
+ * bits like NT set. This would confuse the debugger if this code is traced. So
+ * initialize them properly now before switching to protected mode. That means
+ * DF in particular (even though we have cleared it earlier after copying the
+ * command line) because GCC expects it.
+ */
+ pushl $0
+ popfl
+
+/*
+ * New page tables may be in 4Mbyte page mode and may be using the global pages.
+ *
+ * NOTE! If we are on a 486 we may have no cr4 at all! Specifically, cr4 exists
+ * if and only if CPUID exists and has flags other than the FPU flag set.
+ */
+ movl $-1,pa(X86_CPUID) # preset CPUID level
+ movl $X86_EFLAGS_ID,%ecx
+ pushl %ecx
+ popfl # set EFLAGS=ID
+ pushfl
+ popl %eax # get EFLAGS
+ testl $X86_EFLAGS_ID,%eax # did EFLAGS.ID remained set?
+ jz .Lenable_paging # hw disallowed setting of ID bit
+ # which means no CPUID and no CR4
+
+ xorl %eax,%eax
+ cpuid
+ movl %eax,pa(X86_CPUID) # save largest std CPUID function
+
+ movl $1,%eax
+ cpuid
+ andl $~1,%edx # Ignore CPUID.FPU
+ jz .Lenable_paging # No flags or only CPUID.FPU = no CR4
+
+ movl pa(mmu_cr4_features),%eax
+ movl %eax,%cr4
+
+ testb $X86_CR4_PAE, %al # check if PAE is enabled
+ jz .Lenable_paging
+
+ /* Check if extended functions are implemented */
+ movl $0x80000000, %eax
+ cpuid
+ /* Value must be in the range 0x80000001 to 0x8000ffff */
+ subl $0x80000001, %eax
+ cmpl $(0x8000ffff-0x80000001), %eax
+ ja .Lenable_paging
+
+ /* Clear bogus XD_DISABLE bits */
+ call verify_cpu
+
+ mov $0x80000001, %eax
+ cpuid
+ /* Execute Disable bit supported? */
+ btl $(X86_FEATURE_NX & 31), %edx
+ jnc .Lenable_paging
+
+ /* Setup EFER (Extended Feature Enable Register) */
+ movl $MSR_EFER, %ecx
+ rdmsr
+
+ btsl $_EFER_NX, %eax
+ /* Make changes effective */
+ wrmsr
+
+.Lenable_paging:
+
+/*
+ * Enable paging
+ */
+ movl $pa(initial_page_table), %eax
+ movl %eax,%cr3 /* set the page table pointer.. */
+ movl $CR0_STATE,%eax
+ movl %eax,%cr0 /* ..and set paging (PG) bit */
+ ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
+1:
+ /* Shift the stack pointer to a virtual address */
+ addl $__PAGE_OFFSET, %esp
+
+/*
+ * start system 32-bit setup. We need to re-do some of the things done
+ * in 16-bit mode for the "real" operations.
+ */
+ movl setup_once_ref,%eax
+ andl %eax,%eax
+ jz 1f # Did we do this already?
+ call *%eax
+1:
+
+/*
+ * Check if it is 486
+ */
+ movb $4,X86 # at least 486
+ cmpl $-1,X86_CPUID
+ je .Lis486
+
+ /* get vendor info */
+ xorl %eax,%eax # call CPUID with 0 -> return vendor ID
+ cpuid
+ movl %eax,X86_CPUID # save CPUID level
+ movl %ebx,X86_VENDOR_ID # lo 4 chars
+ movl %edx,X86_VENDOR_ID+4 # next 4 chars
+ movl %ecx,X86_VENDOR_ID+8 # last 4 chars
+
+ orl %eax,%eax # do we have processor info as well?
+ je .Lis486
+
+ movl $1,%eax # Use the CPUID instruction to get CPU type
+ cpuid
+ movb %al,%cl # save reg for future use
+ andb $0x0f,%ah # mask processor family
+ movb %ah,X86
+ andb $0xf0,%al # mask model
+ shrb $4,%al
+ movb %al,X86_MODEL
+ andb $0x0f,%cl # mask mask revision
+ movb %cl,X86_STEPPING
+ movl %edx,X86_CAPABILITY
+
+.Lis486:
+ movl $0x50022,%ecx # set AM, WP, NE and MP
+ movl %cr0,%eax
+ andl $0x80000011,%eax # Save PG,PE,ET
+ orl %ecx,%eax
+ movl %eax,%cr0
+
+ lgdt early_gdt_descr
+ ljmp $(__KERNEL_CS),$1f
+1: movl $(__KERNEL_DS),%eax # reload all the segment registers
+ movl %eax,%ss # after changing gdt.
+
+ movl $(__USER_DS),%eax # DS/ES contains default USER segment
+ movl %eax,%ds
+ movl %eax,%es
+
+ movl $(__KERNEL_PERCPU), %eax
+ movl %eax,%fs # set this cpu's percpu
+
+ movl $(__KERNEL_STACK_CANARY),%eax
+ movl %eax,%gs
+
+ xorl %eax,%eax # Clear LDT
+ lldt %ax
+
+ call *(initial_code)
+1: jmp 1b
+ENDPROC(startup_32_smp)
+
+#include "verify_cpu.S"
+
+/*
+ * setup_once
+ *
+ * The setup work we only want to run on the BSP.
+ *
+ * Warning: %esi is live across this function.
+ */
+__INIT
+setup_once:
+#ifdef CONFIG_STACKPROTECTOR
+ /*
+ * Configure the stack canary. The linker can't handle this by
+ * relocation. Manually set base address in stack canary
+ * segment descriptor.
+ */
+ movl $gdt_page,%eax
+ movl $stack_canary,%ecx
+ movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
+ shrl $16, %ecx
+ movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
+ movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
+#endif
+
+ andl $0,setup_once_ref /* Once is enough, thanks */
+ ret
+
+ENTRY(early_idt_handler_array)
+ # 36(%esp) %eflags
+ # 32(%esp) %cs
+ # 28(%esp) %eip
+ # 24(%rsp) error code
+ i = 0
+ .rept NUM_EXCEPTION_VECTORS
+ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
+ pushl $0 # Dummy error code, to make stack frame uniform
+ .endif
+ pushl $i # 20(%esp) Vector number
+ jmp early_idt_handler_common
+ i = i + 1
+ .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
+ .endr
+ENDPROC(early_idt_handler_array)
+
+early_idt_handler_common:
+ /*
+ * The stack is the hardware frame, an error code or zero, and the
+ * vector number.
+ */
+ cld
+
+ incl %ss:early_recursion_flag
+
+ /* The vector number is in pt_regs->gs */
+
+ cld
+ pushl %fs /* pt_regs->fs (__fsh varies by model) */
+ pushl %es /* pt_regs->es (__esh varies by model) */
+ pushl %ds /* pt_regs->ds (__dsh varies by model) */
+ pushl %eax /* pt_regs->ax */
+ pushl %ebp /* pt_regs->bp */
+ pushl %edi /* pt_regs->di */
+ pushl %esi /* pt_regs->si */
+ pushl %edx /* pt_regs->dx */
+ pushl %ecx /* pt_regs->cx */
+ pushl %ebx /* pt_regs->bx */
+
+ /* Fix up DS and ES */
+ movl $(__KERNEL_DS), %ecx
+ movl %ecx, %ds
+ movl %ecx, %es
+
+ /* Load the vector number into EDX */
+ movl PT_GS(%esp), %edx
+
+ /* Load GS into pt_regs->gs (and maybe clobber __gsh) */
+ movw %gs, PT_GS(%esp)
+
+ movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */
+ call early_fixup_exception
+
+ popl %ebx /* pt_regs->bx */
+ popl %ecx /* pt_regs->cx */
+ popl %edx /* pt_regs->dx */
+ popl %esi /* pt_regs->si */
+ popl %edi /* pt_regs->di */
+ popl %ebp /* pt_regs->bp */
+ popl %eax /* pt_regs->ax */
+ popl %ds /* pt_regs->ds (always ignores __dsh) */
+ popl %es /* pt_regs->es (always ignores __esh) */
+ popl %fs /* pt_regs->fs (always ignores __fsh) */
+ popl %gs /* pt_regs->gs (always ignores __gsh) */
+ decl %ss:early_recursion_flag
+ addl $4, %esp /* pop pt_regs->orig_ax */
+ iret
+ENDPROC(early_idt_handler_common)
+
+/* This is the default interrupt "handler" :-) */
+ENTRY(early_ignore_irq)
+ cld
+#ifdef CONFIG_PRINTK
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ pushl %es
+ pushl %ds
+ movl $(__KERNEL_DS),%eax
+ movl %eax,%ds
+ movl %eax,%es
+ cmpl $2,early_recursion_flag
+ je hlt_loop
+ incl early_recursion_flag
+ pushl 16(%esp)
+ pushl 24(%esp)
+ pushl 32(%esp)
+ pushl 40(%esp)
+ pushl $int_msg
+ call printk
+
+ call dump_stack
+
+ addl $(5*4),%esp
+ popl %ds
+ popl %es
+ popl %edx
+ popl %ecx
+ popl %eax
+#endif
+ iret
+
+hlt_loop:
+ hlt
+ jmp hlt_loop
+ENDPROC(early_ignore_irq)
+
+__INITDATA
+ .align 4
+GLOBAL(early_recursion_flag)
+ .long 0
+
+__REFDATA
+ .align 4
+ENTRY(initial_code)
+ .long i386_start_kernel
+ENTRY(setup_once_ref)
+ .long setup_once
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#define PGD_ALIGN (2 * PAGE_SIZE)
+#define PTI_USER_PGD_FILL 1024
+#else
+#define PGD_ALIGN (PAGE_SIZE)
+#define PTI_USER_PGD_FILL 0
+#endif
+/*
+ * BSS section
+ */
+__PAGE_ALIGNED_BSS
+ .align PGD_ALIGN
+#ifdef CONFIG_X86_PAE
+.globl initial_pg_pmd
+initial_pg_pmd:
+ .fill 1024*KPMDS,4,0
+#else
+.globl initial_page_table
+initial_page_table:
+ .fill 1024,4,0
+#endif
+ .align PGD_ALIGN
+initial_pg_fixmap:
+ .fill 1024,4,0
+.globl swapper_pg_dir
+ .align PGD_ALIGN
+swapper_pg_dir:
+ .fill 1024,4,0
+ .fill PTI_USER_PGD_FILL,4,0
+.globl empty_zero_page
+empty_zero_page:
+ .fill 4096,1,0
+EXPORT_SYMBOL(empty_zero_page)
+
+/*
+ * This starts the data section.
+ */
+#ifdef CONFIG_X86_PAE
+__PAGE_ALIGNED_DATA
+ /* Page-aligned for the benefit of paravirt? */
+ .align PGD_ALIGN
+ENTRY(initial_page_table)
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
+# if KPMDS == 3
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
+# elif KPMDS == 2
+ .long 0,0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
+# elif KPMDS == 1
+ .long 0,0
+ .long 0,0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
+# else
+# error "Kernel PMDs should be 1, 2 or 3"
+# endif
+ .align PAGE_SIZE /* needs to be page-sized too */
+#endif
+
+.data
+.balign 4
+ENTRY(initial_stack)
+ /*
+ * The SIZEOF_PTREGS gap is a convention which helps the in-kernel
+ * unwinder reliably detect the end of the stack.
+ */
+ .long init_thread_union + THREAD_SIZE - SIZEOF_PTREGS - \
+ TOP_OF_KERNEL_STACK_PADDING;
+
+__INITRODATA
+int_msg:
+ .asciz "Unknown interrupt or fault at: %p %p %p\n"
+
+#include "../../x86/xen/xen-head.S"
+
+/*
+ * The IDT and GDT 'descriptors' are a strange 48-bit object
+ * only used by the lidt and lgdt instructions. They are not
+ * like usual segment descriptors - they consist of a 16-bit
+ * segment size, and 32-bit linear address value:
+ */
+
+ .data
+.globl boot_gdt_descr
+
+ ALIGN
+# early boot GDT descriptor (must use 1:1 address mapping)
+ .word 0 # 32 bit align gdt_desc.address
+boot_gdt_descr:
+ .word __BOOT_DS+7
+ .long boot_gdt - __PAGE_OFFSET
+
+# boot GDT descriptor (later on used by CPU#0):
+ .word 0 # 32 bit align gdt_desc.address
+ENTRY(early_gdt_descr)
+ .word GDT_ENTRIES*8-1
+ .long gdt_page /* Overwritten for secondary CPUs */
+
+/*
+ * The boot_gdt must mirror the equivalent in setup.S and is
+ * used only for booting.
+ */
+ .align L1_CACHE_BYTES
+ENTRY(boot_gdt)
+ .fill GDT_ENTRY_BOOT_CS,8,0
+ .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
+ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
new file mode 100644
index 000000000..a3618cf04
--- /dev/null
+++ b/arch/x86/kernel/head_64.S
@@ -0,0 +1,485 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit
+ *
+ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
+ * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
+ * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
+ */
+
+
+#include <linux/linkage.h>
+#include <linux/threads.h>
+#include <linux/init.h>
+#include <asm/segment.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/msr.h>
+#include <asm/cache.h>
+#include <asm/processor-flags.h>
+#include <asm/percpu.h>
+#include <asm/nops.h>
+#include "../entry/calling.h"
+#include <asm/export.h>
+#include <asm/nospec-branch.h>
+#include <asm/fixmap.h>
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/asm-offsets.h>
+#include <asm/paravirt.h>
+#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg
+#else
+#define GET_CR2_INTO(reg) movq %cr2, reg
+#define INTERRUPT_RETURN iretq
+#endif
+
+/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
+ * because we need identity-mapped pages.
+ *
+ */
+
+#define l4_index(x) (((x) >> 39) & 511)
+#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
+
+L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4)
+L4_START_KERNEL = l4_index(__START_KERNEL_map)
+
+L3_START_KERNEL = pud_index(__START_KERNEL_map)
+
+ .text
+ __HEAD
+ .code64
+ .globl startup_64
+startup_64:
+ UNWIND_HINT_EMPTY
+ /*
+ * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
+ * and someone has loaded an identity mapped page table
+ * for us. These identity mapped page tables map all of the
+ * kernel pages and possibly all of memory.
+ *
+ * %rsi holds a physical pointer to real_mode_data.
+ *
+ * We come here either directly from a 64bit bootloader, or from
+ * arch/x86/boot/compressed/head_64.S.
+ *
+ * We only come here initially at boot nothing else comes here.
+ *
+ * Since we may be loaded at an address different from what we were
+ * compiled to run at we first fixup the physical addresses in our page
+ * tables and then reload them.
+ */
+
+ /* Set up the stack for verify_cpu(), similar to initial_stack below */
+ leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp
+
+ /* Sanitize CPU configuration */
+ call verify_cpu
+
+ /*
+ * Perform pagetable fixups. Additionally, if SME is active, encrypt
+ * the kernel and retrieve the modifier (SME encryption mask if SME
+ * is active) to be added to the initial pgdir entry that will be
+ * programmed into CR3.
+ */
+ leaq _text(%rip), %rdi
+ pushq %rsi
+ call __startup_64
+ popq %rsi
+
+ /* Form the CR3 value being sure to include the CR3 modifier */
+ addq $(early_top_pgt - __START_KERNEL_map), %rax
+ jmp 1f
+ENTRY(secondary_startup_64)
+ UNWIND_HINT_EMPTY
+ /*
+ * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
+ * and someone has loaded a mapped page table.
+ *
+ * %rsi holds a physical pointer to real_mode_data.
+ *
+ * We come here either from startup_64 (using physical addresses)
+ * or from trampoline.S (using virtual addresses).
+ *
+ * Using virtual addresses from trampoline.S removes the need
+ * to have any identity mapped pages in the kernel page table
+ * after the boot processor executes this code.
+ */
+
+ /* Sanitize CPU configuration */
+ call verify_cpu
+
+ /*
+ * Retrieve the modifier (SME encryption mask if SME is active) to be
+ * added to the initial pgdir entry that will be programmed into CR3.
+ */
+ pushq %rsi
+ call __startup_secondary_64
+ popq %rsi
+
+ /* Form the CR3 value being sure to include the CR3 modifier */
+ addq $(init_top_pgt - __START_KERNEL_map), %rax
+1:
+
+ /* Enable PAE mode, PGE and LA57 */
+ movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+#ifdef CONFIG_X86_5LEVEL
+ testl $1, __pgtable_l5_enabled(%rip)
+ jz 1f
+ orl $X86_CR4_LA57, %ecx
+1:
+#endif
+ movq %rcx, %cr4
+
+ /* Setup early boot stage 4-/5-level pagetables. */
+ addq phys_base(%rip), %rax
+ movq %rax, %cr3
+
+ /* Ensure I am executing from virtual addresses */
+ movq $1f, %rax
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rax
+1:
+ UNWIND_HINT_EMPTY
+
+ /* Check if nx is implemented */
+ movl $0x80000001, %eax
+ cpuid
+ movl %edx,%edi
+
+ /* Setup EFER (Extended Feature Enable Register) */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_SCE, %eax /* Enable System Call */
+ btl $20,%edi /* No Execute supported? */
+ jnc 1f
+ btsl $_EFER_NX, %eax
+ btsq $_PAGE_BIT_NX,early_pmd_flags(%rip)
+1: wrmsr /* Make changes effective */
+
+ /* Setup cr0 */
+ movl $CR0_STATE, %eax
+ /* Make changes effective */
+ movq %rax, %cr0
+
+ /* Setup a boot time stack */
+ movq initial_stack(%rip), %rsp
+
+ /* zero EFLAGS after setting rsp */
+ pushq $0
+ popfq
+
+ /*
+ * We must switch to a new descriptor in kernel space for the GDT
+ * because soon the kernel won't have access anymore to the userspace
+ * addresses where we're currently running on. We have to do that here
+ * because in 32bit we couldn't load a 64bit linear address.
+ */
+ lgdt early_gdt_descr(%rip)
+
+ /* set up data segments */
+ xorl %eax,%eax
+ movl %eax,%ds
+ movl %eax,%ss
+ movl %eax,%es
+
+ /*
+ * We don't really need to load %fs or %gs, but load them anyway
+ * to kill any stale realmode selectors. This allows execution
+ * under VT hardware.
+ */
+ movl %eax,%fs
+ movl %eax,%gs
+
+ /* Set up %gs.
+ *
+ * The base of %gs always points to the bottom of the irqstack
+ * union. If the stack protector canary is enabled, it is
+ * located at %gs:40. Note that, on SMP, the boot cpu uses
+ * init data section till per cpu areas are set up.
+ */
+ movl $MSR_GS_BASE,%ecx
+ movl initial_gs(%rip),%eax
+ movl initial_gs+4(%rip),%edx
+ wrmsr
+
+ /* rsi is pointer to real mode structure with interesting info.
+ pass it to C */
+ movq %rsi, %rdi
+
+.Ljump_to_C_code:
+ /*
+ * Jump to run C code and to be on a real kernel address.
+ * Since we are running on identity-mapped space we have to jump
+ * to the full 64bit address, this is only possible as indirect
+ * jump. In addition we need to ensure %cs is set so we make this
+ * a far return.
+ *
+ * Note: do not change to far jump indirect with 64bit offset.
+ *
+ * AMD does not support far jump indirect with 64bit offset.
+ * AMD64 Architecture Programmer's Manual, Volume 3: states only
+ * JMP FAR mem16:16 FF /5 Far jump indirect,
+ * with the target specified by a far pointer in memory.
+ * JMP FAR mem16:32 FF /5 Far jump indirect,
+ * with the target specified by a far pointer in memory.
+ *
+ * Intel64 does support 64bit offset.
+ * Software Developer Manual Vol 2: states:
+ * FF /5 JMP m16:16 Jump far, absolute indirect,
+ * address given in m16:16
+ * FF /5 JMP m16:32 Jump far, absolute indirect,
+ * address given in m16:32.
+ * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
+ * address given in m16:64.
+ */
+ pushq $.Lafter_lret # put return address on stack for unwinder
+ xorl %ebp, %ebp # clear frame pointer
+ movq initial_code(%rip), %rax
+ pushq $__KERNEL_CS # set correct cs
+ pushq %rax # target address in negative space
+ lretq
+.Lafter_lret:
+END(secondary_startup_64)
+
+#include "verify_cpu.S"
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Boot CPU0 entry point. It's called from play_dead(). Everything has been set
+ * up already except stack. We just set up stack here. Then call
+ * start_secondary() via .Ljump_to_C_code.
+ */
+ENTRY(start_cpu0)
+ movq initial_stack(%rip), %rsp
+ UNWIND_HINT_EMPTY
+ jmp .Ljump_to_C_code
+ENDPROC(start_cpu0)
+#endif
+
+ /* Both SMP bootup and ACPI suspend change these variables */
+ __REFDATA
+ .balign 8
+ GLOBAL(initial_code)
+ .quad x86_64_start_kernel
+ GLOBAL(initial_gs)
+ .quad INIT_PER_CPU_VAR(irq_stack_union)
+ GLOBAL(initial_stack)
+ /*
+ * The SIZEOF_PTREGS gap is a convention which helps the in-kernel
+ * unwinder reliably detect the end of the stack.
+ */
+ .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
+ __FINITDATA
+
+ __INIT
+ENTRY(early_idt_handler_array)
+ i = 0
+ .rept NUM_EXCEPTION_VECTORS
+ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
+ UNWIND_HINT_IRET_REGS
+ pushq $0 # Dummy error code, to make stack frame uniform
+ .else
+ UNWIND_HINT_IRET_REGS offset=8
+ .endif
+ pushq $i # 72(%rsp) Vector number
+ jmp early_idt_handler_common
+ UNWIND_HINT_IRET_REGS
+ i = i + 1
+ .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
+ .endr
+ UNWIND_HINT_IRET_REGS offset=16
+END(early_idt_handler_array)
+
+early_idt_handler_common:
+ /*
+ * The stack is the hardware frame, an error code or zero, and the
+ * vector number.
+ */
+ cld
+
+ incl early_recursion_flag(%rip)
+
+ /* The vector number is currently in the pt_regs->di slot. */
+ pushq %rsi /* pt_regs->si */
+ movq 8(%rsp), %rsi /* RSI = vector number */
+ movq %rdi, 8(%rsp) /* pt_regs->di = RDI */
+ pushq %rdx /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq %rax /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 */
+ pushq %r9 /* pt_regs->r9 */
+ pushq %r10 /* pt_regs->r10 */
+ pushq %r11 /* pt_regs->r11 */
+ pushq %rbx /* pt_regs->bx */
+ pushq %rbp /* pt_regs->bp */
+ pushq %r12 /* pt_regs->r12 */
+ pushq %r13 /* pt_regs->r13 */
+ pushq %r14 /* pt_regs->r14 */
+ pushq %r15 /* pt_regs->r15 */
+ UNWIND_HINT_REGS
+
+ cmpq $14,%rsi /* Page fault? */
+ jnz 10f
+ GET_CR2_INTO(%rdi) /* Can clobber any volatile register if pv */
+ call early_make_pgtable
+ andl %eax,%eax
+ jz 20f /* All good */
+
+10:
+ movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */
+ call early_fixup_exception
+
+20:
+ decl early_recursion_flag(%rip)
+ jmp restore_regs_and_return_to_kernel
+END(early_idt_handler_common)
+
+ __INITDATA
+
+ .balign 4
+GLOBAL(early_recursion_flag)
+ .long 0
+
+#define NEXT_PAGE(name) \
+ .balign PAGE_SIZE; \
+GLOBAL(name)
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Each PGD needs to be 8k long and 8k aligned. We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define PTI_USER_PGD_FILL 512
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+ .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define PTI_USER_PGD_FILL 0
+#endif
+
+/* Automate the creation of 1 to 1 mapping pmd entries */
+#define PMDS(START, PERM, COUNT) \
+ i = 0 ; \
+ .rept (COUNT) ; \
+ .quad (START) + (i << PMD_SHIFT) + (PERM) ; \
+ i = i + 1 ; \
+ .endr
+
+ __INITDATA
+NEXT_PGD_PAGE(early_top_pgt)
+ .fill 512,8,0
+ .fill PTI_USER_PGD_FILL,8,0
+
+NEXT_PAGE(early_dynamic_pgts)
+ .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
+
+ .data
+
+#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
+NEXT_PGD_PAGE(init_top_pgt)
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .org init_top_pgt + L4_PAGE_OFFSET*8, 0
+ .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .org init_top_pgt + L4_START_KERNEL*8, 0
+ /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+ .fill PTI_USER_PGD_FILL,8,0
+
+NEXT_PAGE(level3_ident_pgt)
+ .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .fill 511, 8, 0
+NEXT_PAGE(level2_ident_pgt)
+ /*
+ * Since I easily can, map the first 1G.
+ * Don't set NX because code runs from these pages.
+ *
+ * Note: This sets _PAGE_GLOBAL despite whether
+ * the CPU supports it or it is enabled. But,
+ * the CPU should ignore the bit.
+ */
+ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+#else
+NEXT_PGD_PAGE(init_top_pgt)
+ .fill 512,8,0
+ .fill PTI_USER_PGD_FILL,8,0
+#endif
+
+#ifdef CONFIG_X86_5LEVEL
+NEXT_PAGE(level4_kernel_pgt)
+ .fill 511,8,0
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+#endif
+
+NEXT_PAGE(level3_kernel_pgt)
+ .fill L3_START_KERNEL,8,0
+ /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
+ .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+ .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+
+NEXT_PAGE(level2_kernel_pgt)
+ /*
+ * 512 MB kernel mapping. We spend a full page on this pagetable
+ * anyway.
+ *
+ * The kernel code+data+bss must not be bigger than that.
+ *
+ * (NOTE: at +512MB starts the module area, see MODULES_VADDR.
+ * If you want to increase this then increase MODULES_VADDR
+ * too.)
+ *
+ * This table is eventually used by the kernel during normal
+ * runtime. Care must be taken to clear out undesired bits
+ * later, like _PAGE_RW or _PAGE_GLOBAL in some cases.
+ */
+ PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
+ KERNEL_IMAGE_SIZE/PMD_SIZE)
+
+NEXT_PAGE(level2_fixmap_pgt)
+ .fill (512 - 4 - FIXMAP_PMD_NUM),8,0
+ pgtno = 0
+ .rept (FIXMAP_PMD_NUM)
+ .quad level1_fixmap_pgt + (pgtno << PAGE_SHIFT) - __START_KERNEL_map \
+ + _PAGE_TABLE_NOENC;
+ pgtno = pgtno + 1
+ .endr
+ /* 6 MB reserved space + a 2MB hole */
+ .fill 4,8,0
+
+NEXT_PAGE(level1_fixmap_pgt)
+ .rept (FIXMAP_PMD_NUM)
+ .fill 512,8,0
+ .endr
+
+#undef PMDS
+
+ .data
+ .align 16
+ .globl early_gdt_descr
+early_gdt_descr:
+ .word GDT_ENTRIES*8-1
+early_gdt_descr_base:
+ .quad INIT_PER_CPU_VAR(gdt_page)
+
+ENTRY(phys_base)
+ /* This must match the first entry in level2_kernel_pgt */
+ .quad 0x0000000000000000
+EXPORT_SYMBOL(phys_base)
+
+#include "../../x86/xen/xen-head.S"
+
+ __PAGE_ALIGNED_BSS
+NEXT_PAGE(empty_zero_page)
+ .skip PAGE_SIZE
+EXPORT_SYMBOL(empty_zero_page)
+
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
new file mode 100644
index 000000000..1e3f1f140
--- /dev/null
+++ b/arch/x86/kernel/hpet.c
@@ -0,0 +1,1366 @@
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/i8253.h>
+#include <linux/slab.h>
+#include <linux/hpet.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/pm.h>
+#include <linux/io.h>
+
+#include <asm/cpufeature.h>
+#include <asm/irqdomain.h>
+#include <asm/fixmap.h>
+#include <asm/hpet.h>
+#include <asm/time.h>
+
+#define HPET_MASK CLOCKSOURCE_MASK(32)
+
+/* FSEC = 10^-15
+ NSEC = 10^-9 */
+#define FSEC_PER_NSEC 1000000L
+
+#define HPET_DEV_USED_BIT 2
+#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT)
+#define HPET_DEV_VALID 0x8
+#define HPET_DEV_FSB_CAP 0x1000
+#define HPET_DEV_PERI_CAP 0x2000
+
+#define HPET_MIN_CYCLES 128
+#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
+
+/*
+ * HPET address is set in acpi/boot.c, when an ACPI entry exists
+ */
+unsigned long hpet_address;
+u8 hpet_blockid; /* OS timer block num */
+bool hpet_msi_disable;
+
+#ifdef CONFIG_PCI_MSI
+static unsigned int hpet_num_timers;
+#endif
+static void __iomem *hpet_virt_address;
+
+struct hpet_dev {
+ struct clock_event_device evt;
+ unsigned int num;
+ int cpu;
+ unsigned int irq;
+ unsigned int flags;
+ char name[10];
+};
+
+static inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev)
+{
+ return container_of(evtdev, struct hpet_dev, evt);
+}
+
+inline unsigned int hpet_readl(unsigned int a)
+{
+ return readl(hpet_virt_address + a);
+}
+
+static inline void hpet_writel(unsigned int d, unsigned int a)
+{
+ writel(d, hpet_virt_address + a);
+}
+
+#ifdef CONFIG_X86_64
+#include <asm/pgtable.h>
+#endif
+
+static inline void hpet_set_mapping(void)
+{
+ hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
+}
+
+static inline void hpet_clear_mapping(void)
+{
+ iounmap(hpet_virt_address);
+ hpet_virt_address = NULL;
+}
+
+/*
+ * HPET command line enable / disable
+ */
+bool boot_hpet_disable;
+bool hpet_force_user;
+static bool hpet_verbose;
+
+static int __init hpet_setup(char *str)
+{
+ while (str) {
+ char *next = strchr(str, ',');
+
+ if (next)
+ *next++ = 0;
+ if (!strncmp("disable", str, 7))
+ boot_hpet_disable = true;
+ if (!strncmp("force", str, 5))
+ hpet_force_user = true;
+ if (!strncmp("verbose", str, 7))
+ hpet_verbose = true;
+ str = next;
+ }
+ return 1;
+}
+__setup("hpet=", hpet_setup);
+
+static int __init disable_hpet(char *str)
+{
+ boot_hpet_disable = true;
+ return 1;
+}
+__setup("nohpet", disable_hpet);
+
+static inline int is_hpet_capable(void)
+{
+ return !boot_hpet_disable && hpet_address;
+}
+
+/*
+ * HPET timer interrupt enable / disable
+ */
+static bool hpet_legacy_int_enabled;
+
+/**
+ * is_hpet_enabled - check whether the hpet timer interrupt is enabled
+ */
+int is_hpet_enabled(void)
+{
+ return is_hpet_capable() && hpet_legacy_int_enabled;
+}
+EXPORT_SYMBOL_GPL(is_hpet_enabled);
+
+static void _hpet_print_config(const char *function, int line)
+{
+ u32 i, timers, l, h;
+ printk(KERN_INFO "hpet: %s(%d):\n", function, line);
+ l = hpet_readl(HPET_ID);
+ h = hpet_readl(HPET_PERIOD);
+ timers = ((l & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
+ printk(KERN_INFO "hpet: ID: 0x%x, PERIOD: 0x%x\n", l, h);
+ l = hpet_readl(HPET_CFG);
+ h = hpet_readl(HPET_STATUS);
+ printk(KERN_INFO "hpet: CFG: 0x%x, STATUS: 0x%x\n", l, h);
+ l = hpet_readl(HPET_COUNTER);
+ h = hpet_readl(HPET_COUNTER+4);
+ printk(KERN_INFO "hpet: COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", l, h);
+
+ for (i = 0; i < timers; i++) {
+ l = hpet_readl(HPET_Tn_CFG(i));
+ h = hpet_readl(HPET_Tn_CFG(i)+4);
+ printk(KERN_INFO "hpet: T%d: CFG_l: 0x%x, CFG_h: 0x%x\n",
+ i, l, h);
+ l = hpet_readl(HPET_Tn_CMP(i));
+ h = hpet_readl(HPET_Tn_CMP(i)+4);
+ printk(KERN_INFO "hpet: T%d: CMP_l: 0x%x, CMP_h: 0x%x\n",
+ i, l, h);
+ l = hpet_readl(HPET_Tn_ROUTE(i));
+ h = hpet_readl(HPET_Tn_ROUTE(i)+4);
+ printk(KERN_INFO "hpet: T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n",
+ i, l, h);
+ }
+}
+
+#define hpet_print_config() \
+do { \
+ if (hpet_verbose) \
+ _hpet_print_config(__func__, __LINE__); \
+} while (0)
+
+/*
+ * When the hpet driver (/dev/hpet) is enabled, we need to reserve
+ * timer 0 and timer 1 in case of RTC emulation.
+ */
+#ifdef CONFIG_HPET
+
+static void hpet_reserve_msi_timers(struct hpet_data *hd);
+
+static void hpet_reserve_platform_timers(unsigned int id)
+{
+ struct hpet __iomem *hpet = hpet_virt_address;
+ struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
+ unsigned int nrtimers, i;
+ struct hpet_data hd;
+
+ nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
+
+ memset(&hd, 0, sizeof(hd));
+ hd.hd_phys_address = hpet_address;
+ hd.hd_address = hpet;
+ hd.hd_nirqs = nrtimers;
+ hpet_reserve_timer(&hd, 0);
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+ hpet_reserve_timer(&hd, 1);
+#endif
+
+ /*
+ * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254
+ * is wrong for i8259!) not the output IRQ. Many BIOS writers
+ * don't bother configuring *any* comparator interrupts.
+ */
+ hd.hd_irq[0] = HPET_LEGACY_8254;
+ hd.hd_irq[1] = HPET_LEGACY_RTC;
+
+ for (i = 2; i < nrtimers; timer++, i++) {
+ hd.hd_irq[i] = (readl(&timer->hpet_config) &
+ Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT;
+ }
+
+ hpet_reserve_msi_timers(&hd);
+
+ hpet_alloc(&hd);
+
+}
+#else
+static void hpet_reserve_platform_timers(unsigned int id) { }
+#endif
+
+/*
+ * Common hpet info
+ */
+static unsigned long hpet_freq;
+
+static struct clock_event_device hpet_clockevent;
+
+static void hpet_stop_counter(void)
+{
+ u32 cfg = hpet_readl(HPET_CFG);
+ cfg &= ~HPET_CFG_ENABLE;
+ hpet_writel(cfg, HPET_CFG);
+}
+
+static void hpet_reset_counter(void)
+{
+ hpet_writel(0, HPET_COUNTER);
+ hpet_writel(0, HPET_COUNTER + 4);
+}
+
+static void hpet_start_counter(void)
+{
+ unsigned int cfg = hpet_readl(HPET_CFG);
+ cfg |= HPET_CFG_ENABLE;
+ hpet_writel(cfg, HPET_CFG);
+}
+
+static void hpet_restart_counter(void)
+{
+ hpet_stop_counter();
+ hpet_reset_counter();
+ hpet_start_counter();
+}
+
+static void hpet_resume_device(void)
+{
+ force_hpet_resume();
+}
+
+static void hpet_resume_counter(struct clocksource *cs)
+{
+ hpet_resume_device();
+ hpet_restart_counter();
+}
+
+static void hpet_enable_legacy_int(void)
+{
+ unsigned int cfg = hpet_readl(HPET_CFG);
+
+ cfg |= HPET_CFG_LEGACY;
+ hpet_writel(cfg, HPET_CFG);
+ hpet_legacy_int_enabled = true;
+}
+
+static void hpet_legacy_clockevent_register(void)
+{
+ /* Start HPET legacy interrupts */
+ hpet_enable_legacy_int();
+
+ /*
+ * Start hpet with the boot cpu mask and make it
+ * global after the IO_APIC has been initialized.
+ */
+ hpet_clockevent.cpumask = cpumask_of(boot_cpu_data.cpu_index);
+ clockevents_config_and_register(&hpet_clockevent, hpet_freq,
+ HPET_MIN_PROG_DELTA, 0x7FFFFFFF);
+ global_clock_event = &hpet_clockevent;
+ printk(KERN_DEBUG "hpet clockevent registered\n");
+}
+
+static int hpet_set_periodic(struct clock_event_device *evt, int timer)
+{
+ unsigned int cfg, cmp, now;
+ uint64_t delta;
+
+ hpet_stop_counter();
+ delta = ((uint64_t)(NSEC_PER_SEC / HZ)) * evt->mult;
+ delta >>= evt->shift;
+ now = hpet_readl(HPET_COUNTER);
+ cmp = now + (unsigned int)delta;
+ cfg = hpet_readl(HPET_Tn_CFG(timer));
+ cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
+ HPET_TN_32BIT;
+ hpet_writel(cfg, HPET_Tn_CFG(timer));
+ hpet_writel(cmp, HPET_Tn_CMP(timer));
+ udelay(1);
+ /*
+ * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
+ * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL
+ * bit is automatically cleared after the first write.
+ * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
+ * Publication # 24674)
+ */
+ hpet_writel((unsigned int)delta, HPET_Tn_CMP(timer));
+ hpet_start_counter();
+ hpet_print_config();
+
+ return 0;
+}
+
+static int hpet_set_oneshot(struct clock_event_device *evt, int timer)
+{
+ unsigned int cfg;
+
+ cfg = hpet_readl(HPET_Tn_CFG(timer));
+ cfg &= ~HPET_TN_PERIODIC;
+ cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+ hpet_writel(cfg, HPET_Tn_CFG(timer));
+
+ return 0;
+}
+
+static int hpet_shutdown(struct clock_event_device *evt, int timer)
+{
+ unsigned int cfg;
+
+ cfg = hpet_readl(HPET_Tn_CFG(timer));
+ cfg &= ~HPET_TN_ENABLE;
+ hpet_writel(cfg, HPET_Tn_CFG(timer));
+
+ return 0;
+}
+
+static int hpet_resume(struct clock_event_device *evt)
+{
+ hpet_enable_legacy_int();
+ hpet_print_config();
+ return 0;
+}
+
+static int hpet_next_event(unsigned long delta,
+ struct clock_event_device *evt, int timer)
+{
+ u32 cnt;
+ s32 res;
+
+ cnt = hpet_readl(HPET_COUNTER);
+ cnt += (u32) delta;
+ hpet_writel(cnt, HPET_Tn_CMP(timer));
+
+ /*
+ * HPETs are a complete disaster. The compare register is
+ * based on a equal comparison and neither provides a less
+ * than or equal functionality (which would require to take
+ * the wraparound into account) nor a simple count down event
+ * mode. Further the write to the comparator register is
+ * delayed internally up to two HPET clock cycles in certain
+ * chipsets (ATI, ICH9,10). Some newer AMD chipsets have even
+ * longer delays. We worked around that by reading back the
+ * compare register, but that required another workaround for
+ * ICH9,10 chips where the first readout after write can
+ * return the old stale value. We already had a minimum
+ * programming delta of 5us enforced, but a NMI or SMI hitting
+ * between the counter readout and the comparator write can
+ * move us behind that point easily. Now instead of reading
+ * the compare register back several times, we make the ETIME
+ * decision based on the following: Return ETIME if the
+ * counter value after the write is less than HPET_MIN_CYCLES
+ * away from the event or if the counter is already ahead of
+ * the event. The minimum programming delta for the generic
+ * clockevents code is set to 1.5 * HPET_MIN_CYCLES.
+ */
+ res = (s32)(cnt - hpet_readl(HPET_COUNTER));
+
+ return res < HPET_MIN_CYCLES ? -ETIME : 0;
+}
+
+static int hpet_legacy_shutdown(struct clock_event_device *evt)
+{
+ return hpet_shutdown(evt, 0);
+}
+
+static int hpet_legacy_set_oneshot(struct clock_event_device *evt)
+{
+ return hpet_set_oneshot(evt, 0);
+}
+
+static int hpet_legacy_set_periodic(struct clock_event_device *evt)
+{
+ return hpet_set_periodic(evt, 0);
+}
+
+static int hpet_legacy_resume(struct clock_event_device *evt)
+{
+ return hpet_resume(evt);
+}
+
+static int hpet_legacy_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ return hpet_next_event(delta, evt, 0);
+}
+
+/*
+ * The hpet clock event device
+ */
+static struct clock_event_device hpet_clockevent = {
+ .name = "hpet",
+ .features = CLOCK_EVT_FEAT_PERIODIC |
+ CLOCK_EVT_FEAT_ONESHOT,
+ .set_state_periodic = hpet_legacy_set_periodic,
+ .set_state_oneshot = hpet_legacy_set_oneshot,
+ .set_state_shutdown = hpet_legacy_shutdown,
+ .tick_resume = hpet_legacy_resume,
+ .set_next_event = hpet_legacy_next_event,
+ .irq = 0,
+ .rating = 50,
+};
+
+/*
+ * HPET MSI Support
+ */
+#ifdef CONFIG_PCI_MSI
+
+static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
+static struct hpet_dev *hpet_devs;
+static struct irq_domain *hpet_domain;
+
+void hpet_msi_unmask(struct irq_data *data)
+{
+ struct hpet_dev *hdev = irq_data_get_irq_handler_data(data);
+ unsigned int cfg;
+
+ /* unmask it */
+ cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
+ cfg |= HPET_TN_ENABLE | HPET_TN_FSB;
+ hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
+}
+
+void hpet_msi_mask(struct irq_data *data)
+{
+ struct hpet_dev *hdev = irq_data_get_irq_handler_data(data);
+ unsigned int cfg;
+
+ /* mask it */
+ cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
+ cfg &= ~(HPET_TN_ENABLE | HPET_TN_FSB);
+ hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
+}
+
+void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg)
+{
+ hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num));
+ hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4);
+}
+
+void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg)
+{
+ msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num));
+ msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4);
+ msg->address_hi = 0;
+}
+
+static int hpet_msi_shutdown(struct clock_event_device *evt)
+{
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+ return hpet_shutdown(evt, hdev->num);
+}
+
+static int hpet_msi_set_oneshot(struct clock_event_device *evt)
+{
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+ return hpet_set_oneshot(evt, hdev->num);
+}
+
+static int hpet_msi_set_periodic(struct clock_event_device *evt)
+{
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+ return hpet_set_periodic(evt, hdev->num);
+}
+
+static int hpet_msi_resume(struct clock_event_device *evt)
+{
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+ struct irq_data *data = irq_get_irq_data(hdev->irq);
+ struct msi_msg msg;
+
+ /* Restore the MSI msg and unmask the interrupt */
+ irq_chip_compose_msi_msg(data, &msg);
+ hpet_msi_write(hdev, &msg);
+ hpet_msi_unmask(data);
+ return 0;
+}
+
+static int hpet_msi_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+ return hpet_next_event(delta, evt, hdev->num);
+}
+
+static irqreturn_t hpet_interrupt_handler(int irq, void *data)
+{
+ struct hpet_dev *dev = (struct hpet_dev *)data;
+ struct clock_event_device *hevt = &dev->evt;
+
+ if (!hevt->event_handler) {
+ printk(KERN_INFO "Spurious HPET timer interrupt on HPET timer %d\n",
+ dev->num);
+ return IRQ_HANDLED;
+ }
+
+ hevt->event_handler(hevt);
+ return IRQ_HANDLED;
+}
+
+static int hpet_setup_irq(struct hpet_dev *dev)
+{
+
+ if (request_irq(dev->irq, hpet_interrupt_handler,
+ IRQF_TIMER | IRQF_NOBALANCING,
+ dev->name, dev))
+ return -1;
+
+ disable_irq(dev->irq);
+ irq_set_affinity(dev->irq, cpumask_of(dev->cpu));
+ enable_irq(dev->irq);
+
+ printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
+ dev->name, dev->irq);
+
+ return 0;
+}
+
+/* This should be called in specific @cpu */
+static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
+{
+ struct clock_event_device *evt = &hdev->evt;
+
+ WARN_ON(cpu != smp_processor_id());
+ if (!(hdev->flags & HPET_DEV_VALID))
+ return;
+
+ hdev->cpu = cpu;
+ per_cpu(cpu_hpet_dev, cpu) = hdev;
+ evt->name = hdev->name;
+ hpet_setup_irq(hdev);
+ evt->irq = hdev->irq;
+
+ evt->rating = 110;
+ evt->features = CLOCK_EVT_FEAT_ONESHOT;
+ if (hdev->flags & HPET_DEV_PERI_CAP) {
+ evt->features |= CLOCK_EVT_FEAT_PERIODIC;
+ evt->set_state_periodic = hpet_msi_set_periodic;
+ }
+
+ evt->set_state_shutdown = hpet_msi_shutdown;
+ evt->set_state_oneshot = hpet_msi_set_oneshot;
+ evt->tick_resume = hpet_msi_resume;
+ evt->set_next_event = hpet_msi_next_event;
+ evt->cpumask = cpumask_of(hdev->cpu);
+
+ clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA,
+ 0x7FFFFFFF);
+}
+
+#ifdef CONFIG_HPET
+/* Reserve at least one timer for userspace (/dev/hpet) */
+#define RESERVE_TIMERS 1
+#else
+#define RESERVE_TIMERS 0
+#endif
+
+static void hpet_msi_capability_lookup(unsigned int start_timer)
+{
+ unsigned int id;
+ unsigned int num_timers;
+ unsigned int num_timers_used = 0;
+ int i, irq;
+
+ if (hpet_msi_disable)
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_ARAT))
+ return;
+ id = hpet_readl(HPET_ID);
+
+ num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
+ num_timers++; /* Value read out starts from 0 */
+ hpet_print_config();
+
+ hpet_domain = hpet_create_irq_domain(hpet_blockid);
+ if (!hpet_domain)
+ return;
+
+ hpet_devs = kcalloc(num_timers, sizeof(struct hpet_dev), GFP_KERNEL);
+ if (!hpet_devs)
+ return;
+
+ hpet_num_timers = num_timers;
+
+ for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
+ struct hpet_dev *hdev = &hpet_devs[num_timers_used];
+ unsigned int cfg = hpet_readl(HPET_Tn_CFG(i));
+
+ /* Only consider HPET timer with MSI support */
+ if (!(cfg & HPET_TN_FSB_CAP))
+ continue;
+
+ hdev->flags = 0;
+ if (cfg & HPET_TN_PERIODIC_CAP)
+ hdev->flags |= HPET_DEV_PERI_CAP;
+ sprintf(hdev->name, "hpet%d", i);
+ hdev->num = i;
+
+ irq = hpet_assign_irq(hpet_domain, hdev, hdev->num);
+ if (irq <= 0)
+ continue;
+
+ hdev->irq = irq;
+ hdev->flags |= HPET_DEV_FSB_CAP;
+ hdev->flags |= HPET_DEV_VALID;
+ num_timers_used++;
+ if (num_timers_used == num_possible_cpus())
+ break;
+ }
+
+ printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timer\n",
+ num_timers, num_timers_used);
+}
+
+#ifdef CONFIG_HPET
+static void hpet_reserve_msi_timers(struct hpet_data *hd)
+{
+ int i;
+
+ if (!hpet_devs)
+ return;
+
+ for (i = 0; i < hpet_num_timers; i++) {
+ struct hpet_dev *hdev = &hpet_devs[i];
+
+ if (!(hdev->flags & HPET_DEV_VALID))
+ continue;
+
+ hd->hd_irq[hdev->num] = hdev->irq;
+ hpet_reserve_timer(hd, hdev->num);
+ }
+}
+#endif
+
+static struct hpet_dev *hpet_get_unused_timer(void)
+{
+ int i;
+
+ if (!hpet_devs)
+ return NULL;
+
+ for (i = 0; i < hpet_num_timers; i++) {
+ struct hpet_dev *hdev = &hpet_devs[i];
+
+ if (!(hdev->flags & HPET_DEV_VALID))
+ continue;
+ if (test_and_set_bit(HPET_DEV_USED_BIT,
+ (unsigned long *)&hdev->flags))
+ continue;
+ return hdev;
+ }
+ return NULL;
+}
+
+struct hpet_work_struct {
+ struct delayed_work work;
+ struct completion complete;
+};
+
+static void hpet_work(struct work_struct *w)
+{
+ struct hpet_dev *hdev;
+ int cpu = smp_processor_id();
+ struct hpet_work_struct *hpet_work;
+
+ hpet_work = container_of(w, struct hpet_work_struct, work.work);
+
+ hdev = hpet_get_unused_timer();
+ if (hdev)
+ init_one_hpet_msi_clockevent(hdev, cpu);
+
+ complete(&hpet_work->complete);
+}
+
+static int hpet_cpuhp_online(unsigned int cpu)
+{
+ struct hpet_work_struct work;
+
+ INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work);
+ init_completion(&work.complete);
+ /* FIXME: add schedule_work_on() */
+ schedule_delayed_work_on(cpu, &work.work, 0);
+ wait_for_completion(&work.complete);
+ destroy_delayed_work_on_stack(&work.work);
+ return 0;
+}
+
+static int hpet_cpuhp_dead(unsigned int cpu)
+{
+ struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu);
+
+ if (!hdev)
+ return 0;
+ free_irq(hdev->irq, hdev);
+ hdev->flags &= ~HPET_DEV_USED;
+ per_cpu(cpu_hpet_dev, cpu) = NULL;
+ return 0;
+}
+#else
+
+static void hpet_msi_capability_lookup(unsigned int start_timer)
+{
+ return;
+}
+
+#ifdef CONFIG_HPET
+static void hpet_reserve_msi_timers(struct hpet_data *hd)
+{
+ return;
+}
+#endif
+
+#define hpet_cpuhp_online NULL
+#define hpet_cpuhp_dead NULL
+
+#endif
+
+/*
+ * Clock source related code
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
+/*
+ * Reading the HPET counter is a very slow operation. If a large number of
+ * CPUs are trying to access the HPET counter simultaneously, it can cause
+ * massive delay and slow down system performance dramatically. This may
+ * happen when HPET is the default clock source instead of TSC. For a
+ * really large system with hundreds of CPUs, the slowdown may be so
+ * severe that it may actually crash the system because of a NMI watchdog
+ * soft lockup, for example.
+ *
+ * If multiple CPUs are trying to access the HPET counter at the same time,
+ * we don't actually need to read the counter multiple times. Instead, the
+ * other CPUs can use the counter value read by the first CPU in the group.
+ *
+ * This special feature is only enabled on x86-64 systems. It is unlikely
+ * that 32-bit x86 systems will have enough CPUs to require this feature
+ * with its associated locking overhead. And we also need 64-bit atomic
+ * read.
+ *
+ * The lock and the hpet value are stored together and can be read in a
+ * single atomic 64-bit read. It is explicitly assumed that arch_spinlock_t
+ * is 32 bits in size.
+ */
+union hpet_lock {
+ struct {
+ arch_spinlock_t lock;
+ u32 value;
+ };
+ u64 lockval;
+};
+
+static union hpet_lock hpet __cacheline_aligned = {
+ { .lock = __ARCH_SPIN_LOCK_UNLOCKED, },
+};
+
+static u64 read_hpet(struct clocksource *cs)
+{
+ unsigned long flags;
+ union hpet_lock old, new;
+
+ BUILD_BUG_ON(sizeof(union hpet_lock) != 8);
+
+ /*
+ * Read HPET directly if in NMI.
+ */
+ if (in_nmi())
+ return (u64)hpet_readl(HPET_COUNTER);
+
+ /*
+ * Read the current state of the lock and HPET value atomically.
+ */
+ old.lockval = READ_ONCE(hpet.lockval);
+
+ if (arch_spin_is_locked(&old.lock))
+ goto contended;
+
+ local_irq_save(flags);
+ if (arch_spin_trylock(&hpet.lock)) {
+ new.value = hpet_readl(HPET_COUNTER);
+ /*
+ * Use WRITE_ONCE() to prevent store tearing.
+ */
+ WRITE_ONCE(hpet.value, new.value);
+ arch_spin_unlock(&hpet.lock);
+ local_irq_restore(flags);
+ return (u64)new.value;
+ }
+ local_irq_restore(flags);
+
+contended:
+ /*
+ * Contended case
+ * --------------
+ * Wait until the HPET value change or the lock is free to indicate
+ * its value is up-to-date.
+ *
+ * It is possible that old.value has already contained the latest
+ * HPET value while the lock holder was in the process of releasing
+ * the lock. Checking for lock state change will enable us to return
+ * the value immediately instead of waiting for the next HPET reader
+ * to come along.
+ */
+ do {
+ cpu_relax();
+ new.lockval = READ_ONCE(hpet.lockval);
+ } while ((new.value == old.value) && arch_spin_is_locked(&new.lock));
+
+ return (u64)new.value;
+}
+#else
+/*
+ * For UP or 32-bit.
+ */
+static u64 read_hpet(struct clocksource *cs)
+{
+ return (u64)hpet_readl(HPET_COUNTER);
+}
+#endif
+
+static struct clocksource clocksource_hpet = {
+ .name = "hpet",
+ .rating = 250,
+ .read = read_hpet,
+ .mask = HPET_MASK,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+ .resume = hpet_resume_counter,
+};
+
+static int hpet_clocksource_register(void)
+{
+ u64 start, now;
+ u64 t1;
+
+ /* Start the counter */
+ hpet_restart_counter();
+
+ /* Verify whether hpet counter works */
+ t1 = hpet_readl(HPET_COUNTER);
+ start = rdtsc();
+
+ /*
+ * We don't know the TSC frequency yet, but waiting for
+ * 200000 TSC cycles is safe:
+ * 4 GHz == 50us
+ * 1 GHz == 200us
+ */
+ do {
+ rep_nop();
+ now = rdtsc();
+ } while ((now - start) < 200000UL);
+
+ if (t1 == hpet_readl(HPET_COUNTER)) {
+ printk(KERN_WARNING
+ "HPET counter not counting. HPET disabled\n");
+ return -ENODEV;
+ }
+
+ clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
+ return 0;
+}
+
+static u32 *hpet_boot_cfg;
+
+/**
+ * hpet_enable - Try to setup the HPET timer. Returns 1 on success.
+ */
+int __init hpet_enable(void)
+{
+ u32 hpet_period, cfg, id;
+ u64 freq;
+ unsigned int i, last;
+
+ if (!is_hpet_capable())
+ return 0;
+
+ hpet_set_mapping();
+ if (!hpet_virt_address)
+ return 0;
+
+ /*
+ * Read the period and check for a sane value:
+ */
+ hpet_period = hpet_readl(HPET_PERIOD);
+
+ /*
+ * AMD SB700 based systems with spread spectrum enabled use a
+ * SMM based HPET emulation to provide proper frequency
+ * setting. The SMM code is initialized with the first HPET
+ * register access and takes some time to complete. During
+ * this time the config register reads 0xffffffff. We check
+ * for max. 1000 loops whether the config register reads a non
+ * 0xffffffff value to make sure that HPET is up and running
+ * before we go further. A counting loop is safe, as the HPET
+ * access takes thousands of CPU cycles. On non SB700 based
+ * machines this check is only done once and has no side
+ * effects.
+ */
+ for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) {
+ if (i == 1000) {
+ printk(KERN_WARNING
+ "HPET config register value = 0xFFFFFFFF. "
+ "Disabling HPET\n");
+ goto out_nohpet;
+ }
+ }
+
+ if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
+ goto out_nohpet;
+
+ /*
+ * The period is a femto seconds value. Convert it to a
+ * frequency.
+ */
+ freq = FSEC_PER_SEC;
+ do_div(freq, hpet_period);
+ hpet_freq = freq;
+
+ /*
+ * Read the HPET ID register to retrieve the IRQ routing
+ * information and the number of channels
+ */
+ id = hpet_readl(HPET_ID);
+ hpet_print_config();
+
+ last = (id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+ /*
+ * The legacy routing mode needs at least two channels, tick timer
+ * and the rtc emulation channel.
+ */
+ if (!last)
+ goto out_nohpet;
+#endif
+
+ cfg = hpet_readl(HPET_CFG);
+ hpet_boot_cfg = kmalloc_array(last + 2, sizeof(*hpet_boot_cfg),
+ GFP_KERNEL);
+ if (hpet_boot_cfg)
+ *hpet_boot_cfg = cfg;
+ else
+ pr_warn("HPET initial state will not be saved\n");
+ cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
+ hpet_writel(cfg, HPET_CFG);
+ if (cfg)
+ pr_warn("Unrecognized bits %#x set in global cfg\n", cfg);
+
+ for (i = 0; i <= last; ++i) {
+ cfg = hpet_readl(HPET_Tn_CFG(i));
+ if (hpet_boot_cfg)
+ hpet_boot_cfg[i + 1] = cfg;
+ cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB);
+ hpet_writel(cfg, HPET_Tn_CFG(i));
+ cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP
+ | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE
+ | HPET_TN_FSB | HPET_TN_FSB_CAP);
+ if (cfg)
+ pr_warn("Unrecognized bits %#x set in cfg#%u\n",
+ cfg, i);
+ }
+ hpet_print_config();
+
+ if (hpet_clocksource_register())
+ goto out_nohpet;
+
+ if (id & HPET_ID_LEGSUP) {
+ hpet_legacy_clockevent_register();
+ return 1;
+ }
+ return 0;
+
+out_nohpet:
+ hpet_clear_mapping();
+ hpet_address = 0;
+ return 0;
+}
+
+/*
+ * Needs to be late, as the reserve_timer code calls kalloc !
+ *
+ * Not a problem on i386 as hpet_enable is called from late_time_init,
+ * but on x86_64 it is necessary !
+ */
+static __init int hpet_late_init(void)
+{
+ int ret;
+
+ if (boot_hpet_disable)
+ return -ENODEV;
+
+ if (!hpet_address) {
+ if (!force_hpet_address)
+ return -ENODEV;
+
+ hpet_address = force_hpet_address;
+ hpet_enable();
+ }
+
+ if (!hpet_virt_address)
+ return -ENODEV;
+
+ if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP)
+ hpet_msi_capability_lookup(2);
+ else
+ hpet_msi_capability_lookup(0);
+
+ hpet_reserve_platform_timers(hpet_readl(HPET_ID));
+ hpet_print_config();
+
+ if (hpet_msi_disable)
+ return 0;
+
+ if (boot_cpu_has(X86_FEATURE_ARAT))
+ return 0;
+
+ /* This notifier should be called after workqueue is ready */
+ ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "x86/hpet:online",
+ hpet_cpuhp_online, NULL);
+ if (ret)
+ return ret;
+ ret = cpuhp_setup_state(CPUHP_X86_HPET_DEAD, "x86/hpet:dead", NULL,
+ hpet_cpuhp_dead);
+ if (ret)
+ goto err_cpuhp;
+ return 0;
+
+err_cpuhp:
+ cpuhp_remove_state(CPUHP_AP_X86_HPET_ONLINE);
+ return ret;
+}
+fs_initcall(hpet_late_init);
+
+void hpet_disable(void)
+{
+ if (is_hpet_capable() && hpet_virt_address) {
+ unsigned int cfg = hpet_readl(HPET_CFG), id, last;
+
+ if (hpet_boot_cfg)
+ cfg = *hpet_boot_cfg;
+ else if (hpet_legacy_int_enabled) {
+ cfg &= ~HPET_CFG_LEGACY;
+ hpet_legacy_int_enabled = false;
+ }
+ cfg &= ~HPET_CFG_ENABLE;
+ hpet_writel(cfg, HPET_CFG);
+
+ if (!hpet_boot_cfg)
+ return;
+
+ id = hpet_readl(HPET_ID);
+ last = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
+
+ for (id = 0; id <= last; ++id)
+ hpet_writel(hpet_boot_cfg[id + 1], HPET_Tn_CFG(id));
+
+ if (*hpet_boot_cfg & HPET_CFG_ENABLE)
+ hpet_writel(*hpet_boot_cfg, HPET_CFG);
+ }
+}
+
+#ifdef CONFIG_HPET_EMULATE_RTC
+
+/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET
+ * is enabled, we support RTC interrupt functionality in software.
+ * RTC has 3 kinds of interrupts:
+ * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
+ * is updated
+ * 2) Alarm Interrupt - generate an interrupt at a specific time of day
+ * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
+ * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
+ * (1) and (2) above are implemented using polling at a frequency of
+ * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
+ * overhead. (DEFAULT_RTC_INT_FREQ)
+ * For (3), we use interrupts at 64Hz or user specified periodic
+ * frequency, whichever is higher.
+ */
+#include <linux/mc146818rtc.h>
+#include <linux/rtc.h>
+
+#define DEFAULT_RTC_INT_FREQ 64
+#define DEFAULT_RTC_SHIFT 6
+#define RTC_NUM_INTS 1
+
+static unsigned long hpet_rtc_flags;
+static int hpet_prev_update_sec;
+static struct rtc_time hpet_alarm_time;
+static unsigned long hpet_pie_count;
+static u32 hpet_t1_cmp;
+static u32 hpet_default_delta;
+static u32 hpet_pie_delta;
+static unsigned long hpet_pie_limit;
+
+static rtc_irq_handler irq_handler;
+
+/*
+ * Check that the hpet counter c1 is ahead of the c2
+ */
+static inline int hpet_cnt_ahead(u32 c1, u32 c2)
+{
+ return (s32)(c2 - c1) < 0;
+}
+
+/*
+ * Registers a IRQ handler.
+ */
+int hpet_register_irq_handler(rtc_irq_handler handler)
+{
+ if (!is_hpet_enabled())
+ return -ENODEV;
+ if (irq_handler)
+ return -EBUSY;
+
+ irq_handler = handler;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(hpet_register_irq_handler);
+
+/*
+ * Deregisters the IRQ handler registered with hpet_register_irq_handler()
+ * and does cleanup.
+ */
+void hpet_unregister_irq_handler(rtc_irq_handler handler)
+{
+ if (!is_hpet_enabled())
+ return;
+
+ irq_handler = NULL;
+ hpet_rtc_flags = 0;
+}
+EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler);
+
+/*
+ * Timer 1 for RTC emulation. We use one shot mode, as periodic mode
+ * is not supported by all HPET implementations for timer 1.
+ *
+ * hpet_rtc_timer_init() is called when the rtc is initialized.
+ */
+int hpet_rtc_timer_init(void)
+{
+ unsigned int cfg, cnt, delta;
+ unsigned long flags;
+
+ if (!is_hpet_enabled())
+ return 0;
+
+ if (!hpet_default_delta) {
+ uint64_t clc;
+
+ clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
+ clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
+ hpet_default_delta = clc;
+ }
+
+ if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
+ delta = hpet_default_delta;
+ else
+ delta = hpet_pie_delta;
+
+ local_irq_save(flags);
+
+ cnt = delta + hpet_readl(HPET_COUNTER);
+ hpet_writel(cnt, HPET_T1_CMP);
+ hpet_t1_cmp = cnt;
+
+ cfg = hpet_readl(HPET_T1_CFG);
+ cfg &= ~HPET_TN_PERIODIC;
+ cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+ hpet_writel(cfg, HPET_T1_CFG);
+
+ local_irq_restore(flags);
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(hpet_rtc_timer_init);
+
+static void hpet_disable_rtc_channel(void)
+{
+ u32 cfg = hpet_readl(HPET_T1_CFG);
+ cfg &= ~HPET_TN_ENABLE;
+ hpet_writel(cfg, HPET_T1_CFG);
+}
+
+/*
+ * The functions below are called from rtc driver.
+ * Return 0 if HPET is not being used.
+ * Otherwise do the necessary changes and return 1.
+ */
+int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
+{
+ if (!is_hpet_enabled())
+ return 0;
+
+ hpet_rtc_flags &= ~bit_mask;
+ if (unlikely(!hpet_rtc_flags))
+ hpet_disable_rtc_channel();
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit);
+
+int hpet_set_rtc_irq_bit(unsigned long bit_mask)
+{
+ unsigned long oldbits = hpet_rtc_flags;
+
+ if (!is_hpet_enabled())
+ return 0;
+
+ hpet_rtc_flags |= bit_mask;
+
+ if ((bit_mask & RTC_UIE) && !(oldbits & RTC_UIE))
+ hpet_prev_update_sec = -1;
+
+ if (!oldbits)
+ hpet_rtc_timer_init();
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(hpet_set_rtc_irq_bit);
+
+int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
+ unsigned char sec)
+{
+ if (!is_hpet_enabled())
+ return 0;
+
+ hpet_alarm_time.tm_hour = hrs;
+ hpet_alarm_time.tm_min = min;
+ hpet_alarm_time.tm_sec = sec;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(hpet_set_alarm_time);
+
+int hpet_set_periodic_freq(unsigned long freq)
+{
+ uint64_t clc;
+
+ if (!is_hpet_enabled())
+ return 0;
+
+ if (freq <= DEFAULT_RTC_INT_FREQ)
+ hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq;
+ else {
+ clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
+ do_div(clc, freq);
+ clc >>= hpet_clockevent.shift;
+ hpet_pie_delta = clc;
+ hpet_pie_limit = 0;
+ }
+ return 1;
+}
+EXPORT_SYMBOL_GPL(hpet_set_periodic_freq);
+
+int hpet_rtc_dropped_irq(void)
+{
+ return is_hpet_enabled();
+}
+EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
+
+static void hpet_rtc_timer_reinit(void)
+{
+ unsigned int delta;
+ int lost_ints = -1;
+
+ if (unlikely(!hpet_rtc_flags))
+ hpet_disable_rtc_channel();
+
+ if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
+ delta = hpet_default_delta;
+ else
+ delta = hpet_pie_delta;
+
+ /*
+ * Increment the comparator value until we are ahead of the
+ * current count.
+ */
+ do {
+ hpet_t1_cmp += delta;
+ hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+ lost_ints++;
+ } while (!hpet_cnt_ahead(hpet_t1_cmp, hpet_readl(HPET_COUNTER)));
+
+ if (lost_ints) {
+ if (hpet_rtc_flags & RTC_PIE)
+ hpet_pie_count += lost_ints;
+ if (printk_ratelimit())
+ printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n",
+ lost_ints);
+ }
+}
+
+irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
+{
+ struct rtc_time curr_time;
+ unsigned long rtc_int_flag = 0;
+
+ hpet_rtc_timer_reinit();
+ memset(&curr_time, 0, sizeof(struct rtc_time));
+
+ if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
+ mc146818_get_time(&curr_time);
+
+ if (hpet_rtc_flags & RTC_UIE &&
+ curr_time.tm_sec != hpet_prev_update_sec) {
+ if (hpet_prev_update_sec >= 0)
+ rtc_int_flag = RTC_UF;
+ hpet_prev_update_sec = curr_time.tm_sec;
+ }
+
+ if (hpet_rtc_flags & RTC_PIE &&
+ ++hpet_pie_count >= hpet_pie_limit) {
+ rtc_int_flag |= RTC_PF;
+ hpet_pie_count = 0;
+ }
+
+ if (hpet_rtc_flags & RTC_AIE &&
+ (curr_time.tm_sec == hpet_alarm_time.tm_sec) &&
+ (curr_time.tm_min == hpet_alarm_time.tm_min) &&
+ (curr_time.tm_hour == hpet_alarm_time.tm_hour))
+ rtc_int_flag |= RTC_AF;
+
+ if (rtc_int_flag) {
+ rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
+ if (irq_handler)
+ irq_handler(rtc_int_flag, dev_id);
+ }
+ return IRQ_HANDLED;
+}
+EXPORT_SYMBOL_GPL(hpet_rtc_interrupt);
+#endif
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
new file mode 100644
index 000000000..2882fe1d2
--- /dev/null
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,552 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ * K.Prasad <prasad@linux.vnet.ibm.com>
+ * Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/irqflags.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+#include <asm/user.h>
+
+/* Per cpu debug control register value */
+DEFINE_PER_CPU(unsigned long, cpu_dr7);
+EXPORT_PER_CPU_SYMBOL(cpu_dr7);
+
+/* Per cpu debug address registers values */
+static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
+
+/*
+ * Stores the breakpoints currently in use on each breakpoint address
+ * register for each cpus
+ */
+static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
+
+
+static inline unsigned long
+__encode_dr7(int drnum, unsigned int len, unsigned int type)
+{
+ unsigned long bp_info;
+
+ bp_info = (len | type) & 0xf;
+ bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
+ bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE));
+
+ return bp_info;
+}
+
+/*
+ * Encode the length, type, Exact, and Enable bits for a particular breakpoint
+ * as stored in debug register 7.
+ */
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
+{
+ return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN;
+}
+
+/*
+ * Decode the length and type bits for a particular breakpoint as
+ * stored in debug register 7. Return the "enabled" status.
+ */
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
+{
+ int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
+
+ *len = (bp_info & 0xc) | 0x40;
+ *type = (bp_info & 0x3) | 0x80;
+
+ return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
+}
+
+/*
+ * Install a perf counter breakpoint.
+ *
+ * We seek a free debug address register and use it for this
+ * breakpoint. Eventually we enable it in the debug control register.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+int arch_install_hw_breakpoint(struct perf_event *bp)
+{
+ struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+ unsigned long *dr7;
+ int i;
+
+ for (i = 0; i < HBP_NUM; i++) {
+ struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);
+
+ if (!*slot) {
+ *slot = bp;
+ break;
+ }
+ }
+
+ if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+ return -EBUSY;
+
+ set_debugreg(info->address, i);
+ __this_cpu_write(cpu_debugreg[i], info->address);
+
+ dr7 = this_cpu_ptr(&cpu_dr7);
+ *dr7 |= encode_dr7(i, info->len, info->type);
+
+ set_debugreg(*dr7, 7);
+ if (info->mask)
+ set_dr_addr_mask(info->mask, i);
+
+ return 0;
+}
+
+/*
+ * Uninstall the breakpoint contained in the given counter.
+ *
+ * First we search the debug address register it uses and then we disable
+ * it.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+void arch_uninstall_hw_breakpoint(struct perf_event *bp)
+{
+ struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+ unsigned long *dr7;
+ int i;
+
+ for (i = 0; i < HBP_NUM; i++) {
+ struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);
+
+ if (*slot == bp) {
+ *slot = NULL;
+ break;
+ }
+ }
+
+ if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+ return;
+
+ dr7 = this_cpu_ptr(&cpu_dr7);
+ *dr7 &= ~__encode_dr7(i, info->len, info->type);
+
+ set_debugreg(*dr7, 7);
+ if (info->mask)
+ set_dr_addr_mask(0, i);
+}
+
+static int arch_bp_generic_len(int x86_len)
+{
+ switch (x86_len) {
+ case X86_BREAKPOINT_LEN_1:
+ return HW_BREAKPOINT_LEN_1;
+ case X86_BREAKPOINT_LEN_2:
+ return HW_BREAKPOINT_LEN_2;
+ case X86_BREAKPOINT_LEN_4:
+ return HW_BREAKPOINT_LEN_4;
+#ifdef CONFIG_X86_64
+ case X86_BREAKPOINT_LEN_8:
+ return HW_BREAKPOINT_LEN_8;
+#endif
+ default:
+ return -EINVAL;
+ }
+}
+
+int arch_bp_generic_fields(int x86_len, int x86_type,
+ int *gen_len, int *gen_type)
+{
+ int len;
+
+ /* Type */
+ switch (x86_type) {
+ case X86_BREAKPOINT_EXECUTE:
+ if (x86_len != X86_BREAKPOINT_LEN_X)
+ return -EINVAL;
+
+ *gen_type = HW_BREAKPOINT_X;
+ *gen_len = sizeof(long);
+ return 0;
+ case X86_BREAKPOINT_WRITE:
+ *gen_type = HW_BREAKPOINT_W;
+ break;
+ case X86_BREAKPOINT_RW:
+ *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* Len */
+ len = arch_bp_generic_len(x86_len);
+ if (len < 0)
+ return -EINVAL;
+ *gen_len = len;
+
+ return 0;
+}
+
+/*
+ * Check for virtual address in kernel space.
+ */
+int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw)
+{
+ unsigned long va;
+ int len;
+
+ va = hw->address;
+ len = arch_bp_generic_len(hw->len);
+ WARN_ON_ONCE(len < 0);
+
+ /*
+ * We don't need to worry about va + len - 1 overflowing:
+ * we already require that va is aligned to a multiple of len.
+ */
+ return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX);
+}
+
+static int arch_build_bp_info(struct perf_event *bp,
+ const struct perf_event_attr *attr,
+ struct arch_hw_breakpoint *hw)
+{
+ hw->address = attr->bp_addr;
+ hw->mask = 0;
+
+ /* Type */
+ switch (attr->bp_type) {
+ case HW_BREAKPOINT_W:
+ hw->type = X86_BREAKPOINT_WRITE;
+ break;
+ case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+ hw->type = X86_BREAKPOINT_RW;
+ break;
+ case HW_BREAKPOINT_X:
+ /*
+ * We don't allow kernel breakpoints in places that are not
+ * acceptable for kprobes. On non-kprobes kernels, we don't
+ * allow kernel breakpoints at all.
+ */
+ if (attr->bp_addr >= TASK_SIZE_MAX) {
+#ifdef CONFIG_KPROBES
+ if (within_kprobe_blacklist(attr->bp_addr))
+ return -EINVAL;
+#else
+ return -EINVAL;
+#endif
+ }
+
+ hw->type = X86_BREAKPOINT_EXECUTE;
+ /*
+ * x86 inst breakpoints need to have a specific undefined len.
+ * But we still need to check userspace is not trying to setup
+ * an unsupported length, to get a range breakpoint for example.
+ */
+ if (attr->bp_len == sizeof(long)) {
+ hw->len = X86_BREAKPOINT_LEN_X;
+ return 0;
+ }
+ default:
+ return -EINVAL;
+ }
+
+ /* Len */
+ switch (attr->bp_len) {
+ case HW_BREAKPOINT_LEN_1:
+ hw->len = X86_BREAKPOINT_LEN_1;
+ break;
+ case HW_BREAKPOINT_LEN_2:
+ hw->len = X86_BREAKPOINT_LEN_2;
+ break;
+ case HW_BREAKPOINT_LEN_4:
+ hw->len = X86_BREAKPOINT_LEN_4;
+ break;
+#ifdef CONFIG_X86_64
+ case HW_BREAKPOINT_LEN_8:
+ hw->len = X86_BREAKPOINT_LEN_8;
+ break;
+#endif
+ default:
+ /* AMD range breakpoint */
+ if (!is_power_of_2(attr->bp_len))
+ return -EINVAL;
+ if (attr->bp_addr & (attr->bp_len - 1))
+ return -EINVAL;
+
+ if (!boot_cpu_has(X86_FEATURE_BPEXT))
+ return -EOPNOTSUPP;
+
+ /*
+ * It's impossible to use a range breakpoint to fake out
+ * user vs kernel detection because bp_len - 1 can't
+ * have the high bit set. If we ever allow range instruction
+ * breakpoints, then we'll have to check for kprobe-blacklisted
+ * addresses anywhere in the range.
+ */
+ hw->mask = attr->bp_len - 1;
+ hw->len = X86_BREAKPOINT_LEN_1;
+ }
+
+ return 0;
+}
+
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int hw_breakpoint_arch_parse(struct perf_event *bp,
+ const struct perf_event_attr *attr,
+ struct arch_hw_breakpoint *hw)
+{
+ unsigned int align;
+ int ret;
+
+
+ ret = arch_build_bp_info(bp, attr, hw);
+ if (ret)
+ return ret;
+
+ switch (hw->len) {
+ case X86_BREAKPOINT_LEN_1:
+ align = 0;
+ if (hw->mask)
+ align = hw->mask;
+ break;
+ case X86_BREAKPOINT_LEN_2:
+ align = 1;
+ break;
+ case X86_BREAKPOINT_LEN_4:
+ align = 3;
+ break;
+#ifdef CONFIG_X86_64
+ case X86_BREAKPOINT_LEN_8:
+ align = 7;
+ break;
+#endif
+ default:
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+
+ /*
+ * Check that the low-order bits of the address are appropriate
+ * for the alignment implied by len.
+ */
+ if (hw->address & align)
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Dump the debug register contents to the user.
+ * We can't dump our per cpu values because it
+ * may contain cpu wide breakpoint, something that
+ * doesn't belong to the current task.
+ *
+ * TODO: include non-ptrace user breakpoints (perf)
+ */
+void aout_dump_debugregs(struct user *dump)
+{
+ int i;
+ int dr7 = 0;
+ struct perf_event *bp;
+ struct arch_hw_breakpoint *info;
+ struct thread_struct *thread = &current->thread;
+
+ for (i = 0; i < HBP_NUM; i++) {
+ bp = thread->ptrace_bps[i];
+
+ if (bp && !bp->attr.disabled) {
+ dump->u_debugreg[i] = bp->attr.bp_addr;
+ info = counter_arch_bp(bp);
+ dr7 |= encode_dr7(i, info->len, info->type);
+ } else {
+ dump->u_debugreg[i] = 0;
+ }
+ }
+
+ dump->u_debugreg[4] = 0;
+ dump->u_debugreg[5] = 0;
+ dump->u_debugreg[6] = current->thread.debugreg6;
+
+ dump->u_debugreg[7] = dr7;
+}
+EXPORT_SYMBOL_GPL(aout_dump_debugregs);
+
+/*
+ * Release the user breakpoints used by ptrace
+ */
+void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
+{
+ int i;
+ struct thread_struct *t = &tsk->thread;
+
+ for (i = 0; i < HBP_NUM; i++) {
+ unregister_hw_breakpoint(t->ptrace_bps[i]);
+ t->ptrace_bps[i] = NULL;
+ }
+
+ t->debugreg6 = 0;
+ t->ptrace_dr7 = 0;
+}
+
+void hw_breakpoint_restore(void)
+{
+ set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0);
+ set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1);
+ set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2);
+ set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3);
+ set_debugreg(current->thread.debugreg6, 6);
+ set_debugreg(__this_cpu_read(cpu_dr7), 7);
+}
+EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
+
+/*
+ * Handle debug exception notifications.
+ *
+ * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
+ *
+ * NOTIFY_DONE returned if one of the following conditions is true.
+ * i) When the causative address is from user-space and the exception
+ * is a valid one, i.e. not triggered as a result of lazy debug register
+ * switching
+ * ii) When there are more bits than trap<n> set in DR6 register (such
+ * as BD, BS or BT) indicating that more than one debug condition is
+ * met and requires some more action in do_debug().
+ *
+ * NOTIFY_STOP returned for all other cases
+ *
+ */
+static int hw_breakpoint_handler(struct die_args *args)
+{
+ int i, cpu, rc = NOTIFY_STOP;
+ struct perf_event *bp;
+ unsigned long dr7, dr6;
+ unsigned long *dr6_p;
+
+ /* The DR6 value is pointed by args->err */
+ dr6_p = (unsigned long *)ERR_PTR(args->err);
+ dr6 = *dr6_p;
+
+ /* If it's a single step, TRAP bits are random */
+ if (dr6 & DR_STEP)
+ return NOTIFY_DONE;
+
+ /* Do an early return if no trap bits are set in DR6 */
+ if ((dr6 & DR_TRAP_BITS) == 0)
+ return NOTIFY_DONE;
+
+ get_debugreg(dr7, 7);
+ /* Disable breakpoints during exception handling */
+ set_debugreg(0UL, 7);
+ /*
+ * Assert that local interrupts are disabled
+ * Reset the DRn bits in the virtualized register value.
+ * The ptrace trigger routine will add in whatever is needed.
+ */
+ current->thread.debugreg6 &= ~DR_TRAP_BITS;
+ cpu = get_cpu();
+
+ /* Handle all the breakpoints that were triggered */
+ for (i = 0; i < HBP_NUM; ++i) {
+ if (likely(!(dr6 & (DR_TRAP0 << i))))
+ continue;
+
+ /*
+ * The counter may be concurrently released but that can only
+ * occur from a call_rcu() path. We can then safely fetch
+ * the breakpoint, use its callback, touch its counter
+ * while we are in an rcu_read_lock() path.
+ */
+ rcu_read_lock();
+
+ bp = per_cpu(bp_per_reg[i], cpu);
+ /*
+ * Reset the 'i'th TRAP bit in dr6 to denote completion of
+ * exception handling
+ */
+ (*dr6_p) &= ~(DR_TRAP0 << i);
+ /*
+ * bp can be NULL due to lazy debug register switching
+ * or due to concurrent perf counter removing.
+ */
+ if (!bp) {
+ rcu_read_unlock();
+ break;
+ }
+
+ perf_bp_event(bp, args->regs);
+
+ /*
+ * Set up resume flag to avoid breakpoint recursion when
+ * returning back to origin.
+ */
+ if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE)
+ args->regs->flags |= X86_EFLAGS_RF;
+
+ rcu_read_unlock();
+ }
+ /*
+ * Further processing in do_debug() is needed for a) user-space
+ * breakpoints (to generate signals) and b) when the system has
+ * taken exception due to multiple causes
+ */
+ if ((current->thread.debugreg6 & DR_TRAP_BITS) ||
+ (dr6 & (~DR_TRAP_BITS)))
+ rc = NOTIFY_DONE;
+
+ set_debugreg(dr7, 7);
+ put_cpu();
+
+ return rc;
+}
+
+/*
+ * Handle debug exception notifications.
+ */
+int hw_breakpoint_exceptions_notify(
+ struct notifier_block *unused, unsigned long val, void *data)
+{
+ if (val != DIE_DEBUG)
+ return NOTIFY_DONE;
+
+ return hw_breakpoint_handler(data);
+}
+
+void hw_breakpoint_pmu_read(struct perf_event *bp)
+{
+ /* TODO */
+}
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
new file mode 100644
index 000000000..0a3e70fd0
--- /dev/null
+++ b/arch/x86/kernel/i8237.c
@@ -0,0 +1,80 @@
+/*
+ * 8237A DMA controller suspend functions.
+ *
+ * Written by Pierre Ossman, 2005.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ */
+
+#include <linux/dmi.h>
+#include <linux/init.h>
+#include <linux/syscore_ops.h>
+
+#include <asm/dma.h>
+#include <asm/x86_init.h>
+
+/*
+ * This module just handles suspend/resume issues with the
+ * 8237A DMA controller (used for ISA and LPC).
+ * Allocation is handled in kernel/dma.c and normal usage is
+ * in asm/dma.h.
+ */
+
+static void i8237A_resume(void)
+{
+ unsigned long flags;
+ int i;
+
+ flags = claim_dma_lock();
+
+ dma_outb(0, DMA1_RESET_REG);
+ dma_outb(0, DMA2_RESET_REG);
+
+ for (i = 0; i < 8; i++) {
+ set_dma_addr(i, 0x000000);
+ /* DMA count is a bit weird so this is not 0 */
+ set_dma_count(i, 1);
+ }
+
+ /* Enable cascade DMA or channel 0-3 won't work */
+ enable_dma(4);
+
+ release_dma_lock(flags);
+}
+
+static struct syscore_ops i8237_syscore_ops = {
+ .resume = i8237A_resume,
+};
+
+static int __init i8237A_init_ops(void)
+{
+ /*
+ * From SKL PCH onwards, the legacy DMA device is removed in which the
+ * I/O ports (81h-83h, 87h, 89h-8Bh, 8Fh) related to it are removed
+ * as well. All removed ports must return 0xff for a inb() request.
+ *
+ * Note: DMA_PAGE_2 (port 0x81) should not be checked for detecting
+ * the presence of DMA device since it may be used by BIOS to decode
+ * LPC traffic for POST codes. Original LPC only decodes one byte of
+ * port 0x80 but some BIOS may choose to enhance PCH LPC port 0x8x
+ * decoding.
+ */
+ if (dma_inb(DMA_PAGE_0) == 0xFF)
+ return -ENODEV;
+
+ /*
+ * It is not required to load this driver as newer SoC may not
+ * support 8237 DMA or bus mastering from LPC. Platform firmware
+ * must announce the support for such legacy devices via
+ * ACPI_FADT_LEGACY_DEVICES field in FADT table.
+ */
+ if (x86_pnpbios_disabled() && dmi_get_bios_year() >= 2017)
+ return -ENODEV;
+
+ register_syscore_ops(&i8237_syscore_ops);
+ return 0;
+}
+device_initcall(i8237A_init_ops);
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
new file mode 100644
index 000000000..0d307a657
--- /dev/null
+++ b/arch/x86/kernel/i8253.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * 8253/PIT functions
+ *
+ */
+#include <linux/clockchips.h>
+#include <linux/init.h>
+#include <linux/timex.h>
+#include <linux/i8253.h>
+
+#include <asm/hpet.h>
+#include <asm/time.h>
+#include <asm/smp.h>
+
+/*
+ * HPET replaces the PIT, when enabled. So we need to know, which of
+ * the two timers is used
+ */
+struct clock_event_device *global_clock_event;
+
+void __init setup_pit_timer(void)
+{
+ clockevent_i8253_init(true);
+ global_clock_event = &i8253_clockevent;
+}
+
+#ifndef CONFIG_X86_64
+static int __init init_pit_clocksource(void)
+{
+ /*
+ * Several reasons not to register PIT as a clocksource:
+ *
+ * - On SMP PIT does not scale due to i8253_lock
+ * - when HPET is enabled
+ * - when local APIC timer is active (PIT is switched off)
+ */
+ if (num_possible_cpus() > 1 || is_hpet_enabled() ||
+ !clockevent_state_periodic(&i8253_clockevent))
+ return 0;
+
+ return clocksource_i8253_init();
+}
+arch_initcall(init_pit_clocksource);
+#endif /* !CONFIG_X86_64 */
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
new file mode 100644
index 000000000..fe522691a
--- /dev/null
+++ b/arch/x86/kernel/i8259.c
@@ -0,0 +1,434 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/linkage.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/timex.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/syscore_ops.h>
+#include <linux/bitops.h>
+#include <linux/acpi.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+
+#include <linux/atomic.h>
+#include <asm/timer.h>
+#include <asm/hw_irq.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+#include <asm/i8259.h>
+
+/*
+ * This is the 'legacy' 8259A Programmable Interrupt Controller,
+ * present in the majority of PC/AT boxes.
+ * plus some generic x86 specific things if generic specifics makes
+ * any sense at all.
+ */
+static void init_8259A(int auto_eoi);
+
+static int i8259A_auto_eoi;
+DEFINE_RAW_SPINLOCK(i8259A_lock);
+
+/*
+ * 8259A PIC functions to handle ISA devices:
+ */
+
+/*
+ * This contains the irq mask for both 8259A irq controllers,
+ */
+unsigned int cached_irq_mask = 0xffff;
+
+/*
+ * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
+ * boards the timer interrupt is not really connected to any IO-APIC pin,
+ * it's fed to the master 8259A's IR0 line only.
+ *
+ * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
+ * this 'mixed mode' IRQ handling costs nothing because it's only used
+ * at IRQ setup time.
+ */
+unsigned long io_apic_irqs;
+
+static void mask_8259A_irq(unsigned int irq)
+{
+ unsigned int mask = 1 << irq;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+ cached_irq_mask |= mask;
+ if (irq & 8)
+ outb(cached_slave_mask, PIC_SLAVE_IMR);
+ else
+ outb(cached_master_mask, PIC_MASTER_IMR);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static void disable_8259A_irq(struct irq_data *data)
+{
+ mask_8259A_irq(data->irq);
+}
+
+static void unmask_8259A_irq(unsigned int irq)
+{
+ unsigned int mask = ~(1 << irq);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+ cached_irq_mask &= mask;
+ if (irq & 8)
+ outb(cached_slave_mask, PIC_SLAVE_IMR);
+ else
+ outb(cached_master_mask, PIC_MASTER_IMR);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static void enable_8259A_irq(struct irq_data *data)
+{
+ unmask_8259A_irq(data->irq);
+}
+
+static int i8259A_irq_pending(unsigned int irq)
+{
+ unsigned int mask = 1<<irq;
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+ if (irq < 8)
+ ret = inb(PIC_MASTER_CMD) & mask;
+ else
+ ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+
+ return ret;
+}
+
+static void make_8259A_irq(unsigned int irq)
+{
+ disable_irq_nosync(irq);
+ io_apic_irqs &= ~(1<<irq);
+ irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq);
+ enable_irq(irq);
+ lapic_assign_legacy_vector(irq, true);
+}
+
+/*
+ * This function assumes to be called rarely. Switching between
+ * 8259A registers is slow.
+ * This has to be protected by the irq controller spinlock
+ * before being called.
+ */
+static inline int i8259A_irq_real(unsigned int irq)
+{
+ int value;
+ int irqmask = 1<<irq;
+
+ if (irq < 8) {
+ outb(0x0B, PIC_MASTER_CMD); /* ISR register */
+ value = inb(PIC_MASTER_CMD) & irqmask;
+ outb(0x0A, PIC_MASTER_CMD); /* back to the IRR register */
+ return value;
+ }
+ outb(0x0B, PIC_SLAVE_CMD); /* ISR register */
+ value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
+ outb(0x0A, PIC_SLAVE_CMD); /* back to the IRR register */
+ return value;
+}
+
+/*
+ * Careful! The 8259A is a fragile beast, it pretty
+ * much _has_ to be done exactly like this (mask it
+ * first, _then_ send the EOI, and the order of EOI
+ * to the two 8259s is important!
+ */
+static void mask_and_ack_8259A(struct irq_data *data)
+{
+ unsigned int irq = data->irq;
+ unsigned int irqmask = 1 << irq;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+ /*
+ * Lightweight spurious IRQ detection. We do not want
+ * to overdo spurious IRQ handling - it's usually a sign
+ * of hardware problems, so we only do the checks we can
+ * do without slowing down good hardware unnecessarily.
+ *
+ * Note that IRQ7 and IRQ15 (the two spurious IRQs
+ * usually resulting from the 8259A-1|2 PICs) occur
+ * even if the IRQ is masked in the 8259A. Thus we
+ * can check spurious 8259A IRQs without doing the
+ * quite slow i8259A_irq_real() call for every IRQ.
+ * This does not cover 100% of spurious interrupts,
+ * but should be enough to warn the user that there
+ * is something bad going on ...
+ */
+ if (cached_irq_mask & irqmask)
+ goto spurious_8259A_irq;
+ cached_irq_mask |= irqmask;
+
+handle_real_irq:
+ if (irq & 8) {
+ inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */
+ outb(cached_slave_mask, PIC_SLAVE_IMR);
+ /* 'Specific EOI' to slave */
+ outb(0x60+(irq&7), PIC_SLAVE_CMD);
+ /* 'Specific EOI' to master-IRQ2 */
+ outb(0x60+PIC_CASCADE_IR, PIC_MASTER_CMD);
+ } else {
+ inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */
+ outb(cached_master_mask, PIC_MASTER_IMR);
+ outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */
+ }
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+ return;
+
+spurious_8259A_irq:
+ /*
+ * this is the slow path - should happen rarely.
+ */
+ if (i8259A_irq_real(irq))
+ /*
+ * oops, the IRQ _is_ in service according to the
+ * 8259A - not spurious, go handle it.
+ */
+ goto handle_real_irq;
+
+ {
+ static int spurious_irq_mask;
+ /*
+ * At this point we can be sure the IRQ is spurious,
+ * lets ACK and report it. [once per IRQ]
+ */
+ if (!(spurious_irq_mask & irqmask)) {
+ printk_deferred(KERN_DEBUG
+ "spurious 8259A interrupt: IRQ%d.\n", irq);
+ spurious_irq_mask |= irqmask;
+ }
+ atomic_inc(&irq_err_count);
+ /*
+ * Theoretically we do not have to handle this IRQ,
+ * but in Linux this does not cause problems and is
+ * simpler for us.
+ */
+ goto handle_real_irq;
+ }
+}
+
+struct irq_chip i8259A_chip = {
+ .name = "XT-PIC",
+ .irq_mask = disable_8259A_irq,
+ .irq_disable = disable_8259A_irq,
+ .irq_unmask = enable_8259A_irq,
+ .irq_mask_ack = mask_and_ack_8259A,
+};
+
+static char irq_trigger[2];
+/**
+ * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
+ */
+static void restore_ELCR(char *trigger)
+{
+ outb(trigger[0], 0x4d0);
+ outb(trigger[1], 0x4d1);
+}
+
+static void save_ELCR(char *trigger)
+{
+ /* IRQ 0,1,2,8,13 are marked as reserved */
+ trigger[0] = inb(0x4d0) & 0xF8;
+ trigger[1] = inb(0x4d1) & 0xDE;
+}
+
+static void i8259A_resume(void)
+{
+ init_8259A(i8259A_auto_eoi);
+ restore_ELCR(irq_trigger);
+}
+
+static int i8259A_suspend(void)
+{
+ save_ELCR(irq_trigger);
+ return 0;
+}
+
+static void i8259A_shutdown(void)
+{
+ /* Put the i8259A into a quiescent state that
+ * the kernel initialization code can get it
+ * out of.
+ */
+ outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
+ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
+}
+
+static struct syscore_ops i8259_syscore_ops = {
+ .suspend = i8259A_suspend,
+ .resume = i8259A_resume,
+ .shutdown = i8259A_shutdown,
+};
+
+static void mask_8259A(void)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+
+ outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
+ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
+
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static void unmask_8259A(void)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+
+ outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
+ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
+
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static int probe_8259A(void)
+{
+ unsigned long flags;
+ unsigned char probe_val = ~(1 << PIC_CASCADE_IR);
+ unsigned char new_val;
+ /*
+ * Check to see if we have a PIC.
+ * Mask all except the cascade and read
+ * back the value we just wrote. If we don't
+ * have a PIC, we will read 0xff as opposed to the
+ * value we wrote.
+ */
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+
+ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
+ outb(probe_val, PIC_MASTER_IMR);
+ new_val = inb(PIC_MASTER_IMR);
+ if (new_val != probe_val) {
+ printk(KERN_INFO "Using NULL legacy PIC\n");
+ legacy_pic = &null_legacy_pic;
+ }
+
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+ return nr_legacy_irqs();
+}
+
+static void init_8259A(int auto_eoi)
+{
+ unsigned long flags;
+
+ i8259A_auto_eoi = auto_eoi;
+
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+
+ outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
+
+ /*
+ * outb_pic - this has to work on a wide range of PC hardware.
+ */
+ outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
+
+ /* ICW2: 8259A-1 IR0-7 mapped to ISA_IRQ_VECTOR(0) */
+ outb_pic(ISA_IRQ_VECTOR(0), PIC_MASTER_IMR);
+
+ /* 8259A-1 (the master) has a slave on IR2 */
+ outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);
+
+ if (auto_eoi) /* master does Auto EOI */
+ outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
+ else /* master expects normal EOI */
+ outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
+
+ outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
+
+ /* ICW2: 8259A-2 IR0-7 mapped to ISA_IRQ_VECTOR(8) */
+ outb_pic(ISA_IRQ_VECTOR(8), PIC_SLAVE_IMR);
+ /* 8259A-2 is a slave on master's IR2 */
+ outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
+ /* (slave's support for AEOI in flat mode is to be investigated) */
+ outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
+
+ if (auto_eoi)
+ /*
+ * In AEOI mode we just have to mask the interrupt
+ * when acking.
+ */
+ i8259A_chip.irq_mask_ack = disable_8259A_irq;
+ else
+ i8259A_chip.irq_mask_ack = mask_and_ack_8259A;
+
+ udelay(100); /* wait for 8259A to initialize */
+
+ outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
+ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
+
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+/*
+ * make i8259 a driver so that we can select pic functions at run time. the goal
+ * is to make x86 binary compatible among pc compatible and non-pc compatible
+ * platforms, such as x86 MID.
+ */
+
+static void legacy_pic_noop(void) { };
+static void legacy_pic_uint_noop(unsigned int unused) { };
+static void legacy_pic_int_noop(int unused) { };
+static int legacy_pic_irq_pending_noop(unsigned int irq)
+{
+ return 0;
+}
+static int legacy_pic_probe(void)
+{
+ return 0;
+}
+
+struct legacy_pic null_legacy_pic = {
+ .nr_legacy_irqs = 0,
+ .chip = &dummy_irq_chip,
+ .mask = legacy_pic_uint_noop,
+ .unmask = legacy_pic_uint_noop,
+ .mask_all = legacy_pic_noop,
+ .restore_mask = legacy_pic_noop,
+ .init = legacy_pic_int_noop,
+ .probe = legacy_pic_probe,
+ .irq_pending = legacy_pic_irq_pending_noop,
+ .make_irq = legacy_pic_uint_noop,
+};
+
+struct legacy_pic default_legacy_pic = {
+ .nr_legacy_irqs = NR_IRQS_LEGACY,
+ .chip = &i8259A_chip,
+ .mask = mask_8259A_irq,
+ .unmask = unmask_8259A_irq,
+ .mask_all = mask_8259A,
+ .restore_mask = unmask_8259A,
+ .init = init_8259A,
+ .probe = probe_8259A,
+ .irq_pending = i8259A_irq_pending,
+ .make_irq = make_8259A_irq,
+};
+
+struct legacy_pic *legacy_pic = &default_legacy_pic;
+EXPORT_SYMBOL(legacy_pic);
+
+static int __init i8259A_init_ops(void)
+{
+ if (legacy_pic == &default_legacy_pic)
+ register_syscore_ops(&i8259_syscore_ops);
+
+ return 0;
+}
+
+device_initcall(i8259A_init_ops);
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
new file mode 100644
index 000000000..e1f935530
--- /dev/null
+++ b/arch/x86/kernel/idt.c
@@ -0,0 +1,373 @@
+/*
+ * Interrupt descriptor table related code
+ *
+ * This file is licensed under the GPL V2
+ */
+#include <linux/interrupt.h>
+
+#include <asm/traps.h>
+#include <asm/proto.h>
+#include <asm/desc.h>
+#include <asm/hw_irq.h>
+
+struct idt_data {
+ unsigned int vector;
+ unsigned int segment;
+ struct idt_bits bits;
+ const void *addr;
+};
+
+#define DPL0 0x0
+#define DPL3 0x3
+
+#define DEFAULT_STACK 0
+
+#define G(_vector, _addr, _ist, _type, _dpl, _segment) \
+ { \
+ .vector = _vector, \
+ .bits.ist = _ist, \
+ .bits.type = _type, \
+ .bits.dpl = _dpl, \
+ .bits.p = 1, \
+ .addr = _addr, \
+ .segment = _segment, \
+ }
+
+/* Interrupt gate */
+#define INTG(_vector, _addr) \
+ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+
+/* System interrupt gate */
+#define SYSG(_vector, _addr) \
+ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS)
+
+/* Interrupt gate with interrupt stack */
+#define ISTG(_vector, _addr, _ist) \
+ G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+
+/* System interrupt gate with interrupt stack */
+#define SISTG(_vector, _addr, _ist) \
+ G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS)
+
+/* Task gate */
+#define TSKG(_vector, _gdt) \
+ G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3)
+
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initconst struct idt_data early_idts[] = {
+ INTG(X86_TRAP_DB, debug),
+ SYSG(X86_TRAP_BP, int3),
+#ifdef CONFIG_X86_32
+ INTG(X86_TRAP_PF, page_fault),
+#endif
+};
+
+/*
+ * The default IDT entries which are set up in trap_init() before
+ * cpu_init() is invoked. Interrupt stacks cannot be used at that point and
+ * the traps which use them are reinitialized with IST after cpu_init() has
+ * set up TSS.
+ */
+static const __initconst struct idt_data def_idts[] = {
+ INTG(X86_TRAP_DE, divide_error),
+ INTG(X86_TRAP_NMI, nmi),
+ INTG(X86_TRAP_BR, bounds),
+ INTG(X86_TRAP_UD, invalid_op),
+ INTG(X86_TRAP_NM, device_not_available),
+ INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun),
+ INTG(X86_TRAP_TS, invalid_TSS),
+ INTG(X86_TRAP_NP, segment_not_present),
+ INTG(X86_TRAP_SS, stack_segment),
+ INTG(X86_TRAP_GP, general_protection),
+ INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug),
+ INTG(X86_TRAP_MF, coprocessor_error),
+ INTG(X86_TRAP_AC, alignment_check),
+ INTG(X86_TRAP_XF, simd_coprocessor_error),
+
+#ifdef CONFIG_X86_32
+ TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS),
+#else
+ INTG(X86_TRAP_DF, double_fault),
+#endif
+ INTG(X86_TRAP_DB, debug),
+
+#ifdef CONFIG_X86_MCE
+ INTG(X86_TRAP_MC, &machine_check),
+#endif
+
+ SYSG(X86_TRAP_OF, overflow),
+#if defined(CONFIG_IA32_EMULATION)
+ SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat),
+#elif defined(CONFIG_X86_32)
+ SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32),
+#endif
+};
+
+/*
+ * The APIC and SMP idt entries
+ */
+static const __initconst struct idt_data apic_idts[] = {
+#ifdef CONFIG_SMP
+ INTG(RESCHEDULE_VECTOR, reschedule_interrupt),
+ INTG(CALL_FUNCTION_VECTOR, call_function_interrupt),
+ INTG(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt),
+ INTG(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt),
+ INTG(REBOOT_VECTOR, reboot_interrupt),
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ INTG(THERMAL_APIC_VECTOR, thermal_interrupt),
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ INTG(THRESHOLD_APIC_VECTOR, threshold_interrupt),
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+ INTG(DEFERRED_ERROR_VECTOR, deferred_error_interrupt),
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ INTG(LOCAL_TIMER_VECTOR, apic_timer_interrupt),
+ INTG(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi),
+# ifdef CONFIG_HAVE_KVM
+ INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi),
+ INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi),
+ INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi),
+# endif
+# ifdef CONFIG_IRQ_WORK
+ INTG(IRQ_WORK_VECTOR, irq_work_interrupt),
+# endif
+#ifdef CONFIG_X86_UV
+ INTG(UV_BAU_MESSAGE, uv_bau_message_intr1),
+#endif
+ INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt),
+ INTG(ERROR_APIC_VECTOR, error_interrupt),
+#endif
+};
+
+#ifdef CONFIG_X86_64
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initconst struct idt_data early_pf_idts[] = {
+ INTG(X86_TRAP_PF, page_fault),
+};
+
+/*
+ * Override for the debug_idt. Same as the default, but with interrupt
+ * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
+ */
+static const __initconst struct idt_data dbg_idts[] = {
+ INTG(X86_TRAP_DB, debug),
+};
+#endif
+
+/* Must be page-aligned because the real IDT is used in a fixmap. */
+gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
+
+struct desc_ptr idt_descr __ro_after_init = {
+ .size = (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1,
+ .address = (unsigned long) idt_table,
+};
+
+#ifdef CONFIG_X86_64
+/* No need to be aligned, but done to keep all IDTs defined the same way. */
+gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
+
+/*
+ * The exceptions which use Interrupt stacks. They are setup after
+ * cpu_init() when the TSS has been initialized.
+ */
+static const __initconst struct idt_data ist_idts[] = {
+ ISTG(X86_TRAP_DB, debug, DEBUG_STACK),
+ ISTG(X86_TRAP_NMI, nmi, NMI_STACK),
+ ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK),
+#ifdef CONFIG_X86_MCE
+ ISTG(X86_TRAP_MC, &machine_check, MCE_STACK),
+#endif
+};
+
+/*
+ * Override for the debug_idt. Same as the default, but with interrupt
+ * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
+ */
+const struct desc_ptr debug_idt_descr = {
+ .size = IDT_ENTRIES * 16 - 1,
+ .address = (unsigned long) debug_idt_table,
+};
+#endif
+
+static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
+{
+ unsigned long addr = (unsigned long) d->addr;
+
+ gate->offset_low = (u16) addr;
+ gate->segment = (u16) d->segment;
+ gate->bits = d->bits;
+ gate->offset_middle = (u16) (addr >> 16);
+#ifdef CONFIG_X86_64
+ gate->offset_high = (u32) (addr >> 32);
+ gate->reserved = 0;
+#endif
+}
+
+static void
+idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys)
+{
+ gate_desc desc;
+
+ for (; size > 0; t++, size--) {
+ idt_init_desc(&desc, t);
+ write_idt_entry(idt, t->vector, &desc);
+ if (sys)
+ set_bit(t->vector, system_vectors);
+ }
+}
+
+static void set_intr_gate(unsigned int n, const void *addr)
+{
+ struct idt_data data;
+
+ BUG_ON(n > 0xFF);
+
+ memset(&data, 0, sizeof(data));
+ data.vector = n;
+ data.addr = addr;
+ data.segment = __KERNEL_CS;
+ data.bits.type = GATE_INTERRUPT;
+ data.bits.p = 1;
+
+ idt_setup_from_table(idt_table, &data, 1, false);
+}
+
+/**
+ * idt_setup_early_traps - Initialize the idt table with early traps
+ *
+ * On X8664 these traps do not use interrupt stacks as they can't work
+ * before cpu_init() is invoked and sets up TSS. The IST variants are
+ * installed after that.
+ */
+void __init idt_setup_early_traps(void)
+{
+ idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts),
+ true);
+ load_idt(&idt_descr);
+}
+
+/**
+ * idt_setup_traps - Initialize the idt table with default traps
+ */
+void __init idt_setup_traps(void)
+{
+ idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true);
+}
+
+#ifdef CONFIG_X86_64
+/**
+ * idt_setup_early_pf - Initialize the idt table with early pagefault handler
+ *
+ * On X8664 this does not use interrupt stacks as they can't work before
+ * cpu_init() is invoked and sets up TSS. The IST variant is installed
+ * after that.
+ *
+ * FIXME: Why is 32bit and 64bit installing the PF handler at different
+ * places in the early setup code?
+ */
+void __init idt_setup_early_pf(void)
+{
+ idt_setup_from_table(idt_table, early_pf_idts,
+ ARRAY_SIZE(early_pf_idts), true);
+}
+
+/**
+ * idt_setup_ist_traps - Initialize the idt table with traps using IST
+ */
+void __init idt_setup_ist_traps(void)
+{
+ idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true);
+}
+
+/**
+ * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps
+ */
+void __init idt_setup_debugidt_traps(void)
+{
+ memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16);
+
+ idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts), false);
+}
+#endif
+
+/**
+ * idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates
+ */
+void __init idt_setup_apic_and_irq_gates(void)
+{
+ int i = FIRST_EXTERNAL_VECTOR;
+ void *entry;
+
+ idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true);
+
+ for_each_clear_bit_from(i, system_vectors, FIRST_SYSTEM_VECTOR) {
+ entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR);
+ set_intr_gate(i, entry);
+ }
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ for_each_clear_bit_from(i, system_vectors, NR_VECTORS) {
+ /*
+ * Don't set the non assigned system vectors in the
+ * system_vectors bitmap. Otherwise they show up in
+ * /proc/interrupts.
+ */
+ entry = spurious_entries_start + 8 * (i - FIRST_SYSTEM_VECTOR);
+ set_intr_gate(i, entry);
+ }
+#endif
+}
+
+/**
+ * idt_setup_early_handler - Initializes the idt table with early handlers
+ */
+void __init idt_setup_early_handler(void)
+{
+ int i;
+
+ for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
+ set_intr_gate(i, early_idt_handler_array[i]);
+#ifdef CONFIG_X86_32
+ for ( ; i < NR_VECTORS; i++)
+ set_intr_gate(i, early_ignore_irq);
+#endif
+ load_idt(&idt_descr);
+}
+
+/**
+ * idt_invalidate - Invalidate interrupt descriptor table
+ * @addr: The virtual address of the 'invalid' IDT
+ */
+void idt_invalidate(void *addr)
+{
+ struct desc_ptr idt = { .address = (unsigned long) addr, .size = 0 };
+
+ load_idt(&idt);
+}
+
+void __init update_intr_gate(unsigned int n, const void *addr)
+{
+ if (WARN_ON_ONCE(!test_bit(n, system_vectors)))
+ return;
+ set_intr_gate(n, addr);
+}
+
+void alloc_intr_gate(unsigned int n, const void *addr)
+{
+ BUG_ON(n < FIRST_SYSTEM_VECTOR);
+ if (!test_and_set_bit(n, system_vectors))
+ set_intr_gate(n, addr);
+}
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
new file mode 100644
index 000000000..805b7a341
--- /dev/null
+++ b/arch/x86/kernel/io_delay.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * I/O delay strategies for inb_p/outb_p
+ *
+ * Allow for a DMI based override of port 0x80, needed for certain HP laptops
+ * and possibly other systems. Also allow for the gradual elimination of
+ * outb_p/inb_p API uses.
+ */
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/dmi.h>
+#include <linux/io.h>
+
+int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE;
+
+static int __initdata io_delay_override;
+
+/*
+ * Paravirt wants native_io_delay to be a constant.
+ */
+void native_io_delay(void)
+{
+ switch (io_delay_type) {
+ default:
+ case CONFIG_IO_DELAY_TYPE_0X80:
+ asm volatile ("outb %al, $0x80");
+ break;
+ case CONFIG_IO_DELAY_TYPE_0XED:
+ asm volatile ("outb %al, $0xed");
+ break;
+ case CONFIG_IO_DELAY_TYPE_UDELAY:
+ /*
+ * 2 usecs is an upper-bound for the outb delay but
+ * note that udelay doesn't have the bus-level
+ * side-effects that outb does, nor does udelay() have
+ * precise timings during very early bootup (the delays
+ * are shorter until calibrated):
+ */
+ udelay(2);
+ case CONFIG_IO_DELAY_TYPE_NONE:
+ break;
+ }
+}
+EXPORT_SYMBOL(native_io_delay);
+
+static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id)
+{
+ if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) {
+ pr_notice("%s: using 0xed I/O delay port\n", id->ident);
+ io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
+ }
+
+ return 0;
+}
+
+/*
+ * Quirk table for systems that misbehave (lock up, etc.) if port
+ * 0x80 is used:
+ */
+static const struct dmi_system_id io_delay_0xed_port_dmi_table[] __initconst = {
+ {
+ .callback = dmi_io_delay_0xed_port,
+ .ident = "Compaq Presario V6000",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+ DMI_MATCH(DMI_BOARD_NAME, "30B7")
+ }
+ },
+ {
+ .callback = dmi_io_delay_0xed_port,
+ .ident = "HP Pavilion dv9000z",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+ DMI_MATCH(DMI_BOARD_NAME, "30B9")
+ }
+ },
+ {
+ .callback = dmi_io_delay_0xed_port,
+ .ident = "HP Pavilion dv6000",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+ DMI_MATCH(DMI_BOARD_NAME, "30B8")
+ }
+ },
+ {
+ .callback = dmi_io_delay_0xed_port,
+ .ident = "HP Pavilion tx1000",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+ DMI_MATCH(DMI_BOARD_NAME, "30BF")
+ }
+ },
+ {
+ .callback = dmi_io_delay_0xed_port,
+ .ident = "Presario F700",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+ DMI_MATCH(DMI_BOARD_NAME, "30D3")
+ }
+ },
+ { }
+};
+
+void __init io_delay_init(void)
+{
+ if (!io_delay_override)
+ dmi_check_system(io_delay_0xed_port_dmi_table);
+}
+
+static int __init io_delay_param(char *s)
+{
+ if (!s)
+ return -EINVAL;
+
+ if (!strcmp(s, "0x80"))
+ io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
+ else if (!strcmp(s, "0xed"))
+ io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
+ else if (!strcmp(s, "udelay"))
+ io_delay_type = CONFIG_IO_DELAY_TYPE_UDELAY;
+ else if (!strcmp(s, "none"))
+ io_delay_type = CONFIG_IO_DELAY_TYPE_NONE;
+ else
+ return -EINVAL;
+
+ io_delay_override = 1;
+ return 0;
+}
+
+early_param("io_delay", io_delay_param);
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
new file mode 100644
index 000000000..0fe1c8782
--- /dev/null
+++ b/arch/x86/kernel/ioport.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This contains the io-permission bitmap code - written by obz, with changes
+ * by Linus. 32/64 bits code unification by Miguel Botón.
+ */
+
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/kernel.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/ioport.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+#include <linux/syscalls.h>
+#include <linux/bitmap.h>
+#include <asm/syscalls.h>
+#include <asm/desc.h>
+
+/*
+ * this changes the io permissions bitmap in the current task.
+ */
+long ksys_ioperm(unsigned long from, unsigned long num, int turn_on)
+{
+ struct thread_struct *t = &current->thread;
+ struct tss_struct *tss;
+ unsigned int i, max_long, bytes, bytes_updated;
+
+ if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
+ return -EINVAL;
+ if (turn_on && !capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
+ /*
+ * If it's the first ioperm() call in this thread's lifetime, set the
+ * IO bitmap up. ioperm() is much less timing critical than clone(),
+ * this is why we delay this operation until now:
+ */
+ if (!t->io_bitmap_ptr) {
+ unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+
+ if (!bitmap)
+ return -ENOMEM;
+
+ memset(bitmap, 0xff, IO_BITMAP_BYTES);
+ t->io_bitmap_ptr = bitmap;
+ set_thread_flag(TIF_IO_BITMAP);
+
+ /*
+ * Now that we have an IO bitmap, we need our TSS limit to be
+ * correct. It's fine if we are preempted after doing this:
+ * with TIF_IO_BITMAP set, context switches will keep our TSS
+ * limit correct.
+ */
+ preempt_disable();
+ refresh_tss_limit();
+ preempt_enable();
+ }
+
+ /*
+ * do it in the per-thread copy and in the TSS ...
+ *
+ * Disable preemption via get_cpu() - we must not switch away
+ * because the ->io_bitmap_max value must match the bitmap
+ * contents:
+ */
+ tss = &per_cpu(cpu_tss_rw, get_cpu());
+
+ if (turn_on)
+ bitmap_clear(t->io_bitmap_ptr, from, num);
+ else
+ bitmap_set(t->io_bitmap_ptr, from, num);
+
+ /*
+ * Search for a (possibly new) maximum. This is simple and stupid,
+ * to keep it obviously correct:
+ */
+ max_long = 0;
+ for (i = 0; i < IO_BITMAP_LONGS; i++)
+ if (t->io_bitmap_ptr[i] != ~0UL)
+ max_long = i;
+
+ bytes = (max_long + 1) * sizeof(unsigned long);
+ bytes_updated = max(bytes, t->io_bitmap_max);
+
+ t->io_bitmap_max = bytes;
+
+ /* Update the TSS: */
+ memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
+
+ put_cpu();
+
+ return 0;
+}
+
+SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on)
+{
+ return ksys_ioperm(from, num, turn_on);
+}
+
+/*
+ * sys_iopl has to be used when you want to access the IO ports
+ * beyond the 0x3ff range: to get the full 65536 ports bitmapped
+ * you'd need 8kB of bitmaps/process, which is a bit excessive.
+ *
+ * Here we just change the flags value on the stack: we allow
+ * only the super-user to do it. This depends on the stack-layout
+ * on system-call entry - see also fork() and the signal handling
+ * code.
+ */
+SYSCALL_DEFINE1(iopl, unsigned int, level)
+{
+ struct pt_regs *regs = current_pt_regs();
+ struct thread_struct *t = &current->thread;
+
+ /*
+ * Careful: the IOPL bits in regs->flags are undefined under Xen PV
+ * and changing them has no effect.
+ */
+ unsigned int old = t->iopl >> X86_EFLAGS_IOPL_BIT;
+
+ if (level > 3)
+ return -EINVAL;
+ /* Trying to gain more privileges? */
+ if (level > old) {
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+ }
+ regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) |
+ (level << X86_EFLAGS_IOPL_BIT);
+ t->iopl = level << X86_EFLAGS_IOPL_BIT;
+ set_iopl_mask(t->iopl);
+
+ return 0;
+}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
new file mode 100644
index 000000000..ef3317c6d
--- /dev/null
+++ b/arch/x86/kernel/irq.c
@@ -0,0 +1,390 @@
+/*
+ * Common interrupt code for 32 and 64 bit
+ */
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/of.h>
+#include <linux/seq_file.h>
+#include <linux/smp.h>
+#include <linux/ftrace.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/irq.h>
+
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/irq.h>
+#include <asm/mce.h>
+#include <asm/hw_irq.h>
+#include <asm/desc.h>
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/irq_vectors.h>
+
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
+
+DEFINE_PER_CPU(struct pt_regs *, irq_regs);
+EXPORT_PER_CPU_SYMBOL(irq_regs);
+
+atomic_t irq_err_count;
+
+/*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themselves.
+ */
+void ack_bad_irq(unsigned int irq)
+{
+ if (printk_ratelimit())
+ pr_err("unexpected IRQ trap at vector %02x\n", irq);
+
+ /*
+ * Currently unexpected vectors happen only on SMP and APIC.
+ * We _must_ ack these because every local APIC has only N
+ * irq slots per priority level, and a 'hanging, unacked' IRQ
+ * holds up an irq slot - in excessive cases (when multiple
+ * unexpected vectors occur) that might lock up the APIC
+ * completely.
+ * But only ack when the APIC is enabled -AK
+ */
+ ack_APIC_irq();
+}
+
+#define irq_stats(x) (&per_cpu(irq_stat, x))
+/*
+ * /proc/interrupts printing for arch specific interrupts
+ */
+int arch_show_interrupts(struct seq_file *p, int prec)
+{
+ int j;
+
+ seq_printf(p, "%*s: ", prec, "NMI");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
+ seq_puts(p, " Non-maskable interrupts\n");
+#ifdef CONFIG_X86_LOCAL_APIC
+ seq_printf(p, "%*s: ", prec, "LOC");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
+ seq_puts(p, " Local timer interrupts\n");
+
+ seq_printf(p, "%*s: ", prec, "SPU");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
+ seq_puts(p, " Spurious interrupts\n");
+ seq_printf(p, "%*s: ", prec, "PMI");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
+ seq_puts(p, " Performance monitoring interrupts\n");
+ seq_printf(p, "%*s: ", prec, "IWI");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
+ seq_puts(p, " IRQ work interrupts\n");
+ seq_printf(p, "%*s: ", prec, "RTR");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count);
+ seq_puts(p, " APIC ICR read retries\n");
+ if (x86_platform_ipi_callback) {
+ seq_printf(p, "%*s: ", prec, "PLT");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
+ seq_puts(p, " Platform interrupts\n");
+ }
+#endif
+#ifdef CONFIG_SMP
+ seq_printf(p, "%*s: ", prec, "RES");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
+ seq_puts(p, " Rescheduling interrupts\n");
+ seq_printf(p, "%*s: ", prec, "CAL");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
+ seq_puts(p, " Function call interrupts\n");
+ seq_printf(p, "%*s: ", prec, "TLB");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
+ seq_puts(p, " TLB shootdowns\n");
+#endif
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ seq_printf(p, "%*s: ", prec, "TRM");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
+ seq_puts(p, " Thermal event interrupts\n");
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ seq_printf(p, "%*s: ", prec, "THR");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
+ seq_puts(p, " Threshold APIC interrupts\n");
+#endif
+#ifdef CONFIG_X86_MCE_AMD
+ seq_printf(p, "%*s: ", prec, "DFR");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_deferred_error_count);
+ seq_puts(p, " Deferred Error APIC interrupts\n");
+#endif
+#ifdef CONFIG_X86_MCE
+ seq_printf(p, "%*s: ", prec, "MCE");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
+ seq_puts(p, " Machine check exceptions\n");
+ seq_printf(p, "%*s: ", prec, "MCP");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
+ seq_puts(p, " Machine check polls\n");
+#endif
+#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
+ if (test_bit(HYPERVISOR_CALLBACK_VECTOR, system_vectors)) {
+ seq_printf(p, "%*s: ", prec, "HYP");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->irq_hv_callback_count);
+ seq_puts(p, " Hypervisor callback interrupts\n");
+ }
+#endif
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (test_bit(HYPERV_REENLIGHTENMENT_VECTOR, system_vectors)) {
+ seq_printf(p, "%*s: ", prec, "HRE");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->irq_hv_reenlightenment_count);
+ seq_puts(p, " Hyper-V reenlightenment interrupts\n");
+ }
+ if (test_bit(HYPERV_STIMER0_VECTOR, system_vectors)) {
+ seq_printf(p, "%*s: ", prec, "HVS");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->hyperv_stimer0_count);
+ seq_puts(p, " Hyper-V stimer0 interrupts\n");
+ }
+#endif
+ seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
+#if defined(CONFIG_X86_IO_APIC)
+ seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
+#endif
+#ifdef CONFIG_HAVE_KVM
+ seq_printf(p, "%*s: ", prec, "PIN");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis);
+ seq_puts(p, " Posted-interrupt notification event\n");
+
+ seq_printf(p, "%*s: ", prec, "NPI");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->kvm_posted_intr_nested_ipis);
+ seq_puts(p, " Nested posted-interrupt event\n");
+
+ seq_printf(p, "%*s: ", prec, "PIW");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->kvm_posted_intr_wakeup_ipis);
+ seq_puts(p, " Posted-interrupt wakeup event\n");
+#endif
+ return 0;
+}
+
+/*
+ * /proc/stat helpers
+ */
+u64 arch_irq_stat_cpu(unsigned int cpu)
+{
+ u64 sum = irq_stats(cpu)->__nmi_count;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ sum += irq_stats(cpu)->apic_timer_irqs;
+ sum += irq_stats(cpu)->irq_spurious_count;
+ sum += irq_stats(cpu)->apic_perf_irqs;
+ sum += irq_stats(cpu)->apic_irq_work_irqs;
+ sum += irq_stats(cpu)->icr_read_retry_count;
+ if (x86_platform_ipi_callback)
+ sum += irq_stats(cpu)->x86_platform_ipis;
+#endif
+#ifdef CONFIG_SMP
+ sum += irq_stats(cpu)->irq_resched_count;
+ sum += irq_stats(cpu)->irq_call_count;
+#endif
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ sum += irq_stats(cpu)->irq_thermal_count;
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ sum += irq_stats(cpu)->irq_threshold_count;
+#endif
+#ifdef CONFIG_X86_MCE
+ sum += per_cpu(mce_exception_count, cpu);
+ sum += per_cpu(mce_poll_count, cpu);
+#endif
+ return sum;
+}
+
+u64 arch_irq_stat(void)
+{
+ u64 sum = atomic_read(&irq_err_count);
+ return sum;
+}
+
+
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ struct irq_desc * desc;
+ /* high bit used in ret_from_ code */
+ unsigned vector = ~regs->orig_ax;
+
+ entering_irq();
+
+ /* entering_irq() tells RCU that we're not quiescent. Check it. */
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
+
+ desc = __this_cpu_read(vector_irq[vector]);
+
+ if (!handle_irq(desc, regs)) {
+ ack_APIC_irq();
+
+ if (desc != VECTOR_RETRIGGERED && desc != VECTOR_SHUTDOWN) {
+ pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n",
+ __func__, smp_processor_id(),
+ vector);
+ } else {
+ __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+ }
+ }
+
+ exiting_irq();
+
+ set_irq_regs(old_regs);
+ return 1;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/* Function pointer for generic interrupt vector handling */
+void (*x86_platform_ipi_callback)(void) = NULL;
+/*
+ * Handler for X86_PLATFORM_IPI_VECTOR.
+ */
+__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ entering_ack_irq();
+ trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
+ inc_irq_stat(x86_platform_ipis);
+ if (x86_platform_ipi_callback)
+ x86_platform_ipi_callback();
+ trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
+ exiting_irq();
+ set_irq_regs(old_regs);
+}
+#endif
+
+#ifdef CONFIG_HAVE_KVM
+static void dummy_handler(void) {}
+static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler;
+
+void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
+{
+ if (handler)
+ kvm_posted_intr_wakeup_handler = handler;
+ else {
+ kvm_posted_intr_wakeup_handler = dummy_handler;
+ synchronize_rcu();
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);
+
+/*
+ * Handler for POSTED_INTERRUPT_VECTOR.
+ */
+__visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ entering_ack_irq();
+ inc_irq_stat(kvm_posted_intr_ipis);
+ exiting_irq();
+ set_irq_regs(old_regs);
+}
+
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ entering_ack_irq();
+ inc_irq_stat(kvm_posted_intr_wakeup_ipis);
+ kvm_posted_intr_wakeup_handler();
+ exiting_irq();
+ set_irq_regs(old_regs);
+}
+
+/*
+ * Handler for POSTED_INTERRUPT_NESTED_VECTOR.
+ */
+__visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ entering_ack_irq();
+ inc_irq_stat(kvm_posted_intr_nested_ipis);
+ exiting_irq();
+ set_irq_regs(old_regs);
+}
+#endif
+
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
+void fixup_irqs(void)
+{
+ unsigned int irr, vector;
+ struct irq_desc *desc;
+ struct irq_data *data;
+ struct irq_chip *chip;
+
+ irq_migrate_all_off_this_cpu();
+
+ /*
+ * We can remove mdelay() and then send spuriuous interrupts to
+ * new cpu targets for all the irqs that were handled previously by
+ * this cpu. While it works, I have seen spurious interrupt messages
+ * (nothing wrong but still...).
+ *
+ * So for now, retain mdelay(1) and check the IRR and then send those
+ * interrupts to new targets as this cpu is already offlined...
+ */
+ mdelay(1);
+
+ /*
+ * We can walk the vector array of this cpu without holding
+ * vector_lock because the cpu is already marked !online, so
+ * nothing else will touch it.
+ */
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+ if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
+ continue;
+
+ irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+ if (irr & (1 << (vector % 32))) {
+ desc = __this_cpu_read(vector_irq[vector]);
+
+ raw_spin_lock(&desc->lock);
+ data = irq_desc_get_irq_data(desc);
+ chip = irq_data_get_irq_chip(data);
+ if (chip->irq_retrigger) {
+ chip->irq_retrigger(data);
+ __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
+ }
+ raw_spin_unlock(&desc->lock);
+ }
+ if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
+ __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
+ }
+}
+#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
new file mode 100644
index 000000000..95600a99a
--- /dev/null
+++ b/arch/x86/kernel/irq_32.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the lowest level x86-specific interrupt
+ * entry, irq-stacks and irq statistics code. All the remaining
+ * irq logic is done by the generic kernel/irq/ code and
+ * by the x86-specific irq controller code. (e.g. i8259.c and
+ * io_apic.c.)
+ */
+
+#include <linux/seq_file.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kernel_stat.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/uaccess.h>
+#include <linux/percpu.h>
+#include <linux/mm.h>
+
+#include <asm/apic.h>
+#include <asm/nospec-branch.h>
+
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+
+int sysctl_panic_on_stackoverflow __read_mostly;
+
+/* Debugging check for stack overflow: is there less than 1KB free? */
+static int check_stack_overflow(void)
+{
+ long sp;
+
+ __asm__ __volatile__("andl %%esp,%0" :
+ "=r" (sp) : "0" (THREAD_SIZE - 1));
+
+ return sp < (sizeof(struct thread_info) + STACK_WARN);
+}
+
+static void print_stack_overflow(void)
+{
+ printk(KERN_WARNING "low stack detected by irq handler\n");
+ dump_stack();
+ if (sysctl_panic_on_stackoverflow)
+ panic("low stack detected by irq handler - check messages\n");
+}
+
+#else
+static inline int check_stack_overflow(void) { return 0; }
+static inline void print_stack_overflow(void) { }
+#endif
+
+DEFINE_PER_CPU(struct irq_stack *, hardirq_stack);
+DEFINE_PER_CPU(struct irq_stack *, softirq_stack);
+
+static void call_on_stack(void *func, void *stack)
+{
+ asm volatile("xchgl %%ebx,%%esp \n"
+ CALL_NOSPEC
+ "movl %%ebx,%%esp \n"
+ : "=b" (stack)
+ : "0" (stack),
+ [thunk_target] "D"(func)
+ : "memory", "cc", "edx", "ecx", "eax");
+}
+
+static inline void *current_stack(void)
+{
+ return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1));
+}
+
+static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
+{
+ struct irq_stack *curstk, *irqstk;
+ u32 *isp, *prev_esp, arg1;
+
+ curstk = (struct irq_stack *) current_stack();
+ irqstk = __this_cpu_read(hardirq_stack);
+
+ /*
+ * this is where we switch to the IRQ stack. However, if we are
+ * already using the IRQ stack (because we interrupted a hardirq
+ * handler) we can't do that and just have to keep using the
+ * current stack (which is the irq stack already after all)
+ */
+ if (unlikely(curstk == irqstk))
+ return 0;
+
+ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));
+
+ /* Save the next esp at the bottom of the stack */
+ prev_esp = (u32 *)irqstk;
+ *prev_esp = current_stack_pointer;
+
+ if (unlikely(overflow))
+ call_on_stack(print_stack_overflow, isp);
+
+ asm volatile("xchgl %%ebx,%%esp \n"
+ CALL_NOSPEC
+ "movl %%ebx,%%esp \n"
+ : "=a" (arg1), "=b" (isp)
+ : "0" (desc), "1" (isp),
+ [thunk_target] "D" (desc->handle_irq)
+ : "memory", "cc", "ecx");
+ return 1;
+}
+
+/*
+ * allocate per-cpu stacks for hardirq and for softirq processing
+ */
+void irq_ctx_init(int cpu)
+{
+ struct irq_stack *irqstk;
+
+ if (per_cpu(hardirq_stack, cpu))
+ return;
+
+ irqstk = page_address(alloc_pages_node(cpu_to_node(cpu),
+ THREADINFO_GFP,
+ THREAD_SIZE_ORDER));
+ per_cpu(hardirq_stack, cpu) = irqstk;
+
+ irqstk = page_address(alloc_pages_node(cpu_to_node(cpu),
+ THREADINFO_GFP,
+ THREAD_SIZE_ORDER));
+ per_cpu(softirq_stack, cpu) = irqstk;
+
+ printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
+ cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu));
+}
+
+void do_softirq_own_stack(void)
+{
+ struct irq_stack *irqstk;
+ u32 *isp, *prev_esp;
+
+ irqstk = __this_cpu_read(softirq_stack);
+
+ /* build the stack frame on the softirq stack */
+ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));
+
+ /* Push the previous esp onto the stack */
+ prev_esp = (u32 *)irqstk;
+ *prev_esp = current_stack_pointer;
+
+ call_on_stack(__do_softirq, isp);
+}
+
+bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
+{
+ int overflow = check_stack_overflow();
+
+ if (IS_ERR_OR_NULL(desc))
+ return false;
+
+ if (user_mode(regs) || !execute_on_irq_stack(overflow, desc)) {
+ if (unlikely(overflow))
+ print_stack_overflow();
+ generic_handle_irq_desc(desc);
+ }
+
+ return true;
+}
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
new file mode 100644
index 000000000..b50ac9c73
--- /dev/null
+++ b/arch/x86/kernel/irq_64.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the lowest level x86_64-specific interrupt
+ * entry and irq statistics code. All the remaining irq logic is
+ * done by the generic kernel/irq/ code and in the
+ * x86_64-specific irq controller code. (e.g. i8259.c and
+ * io_apic.c.)
+ */
+
+#include <linux/kernel_stat.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/delay.h>
+#include <linux/ftrace.h>
+#include <linux/uaccess.h>
+#include <linux/smp.h>
+#include <linux/sched/task_stack.h>
+#include <asm/io_apic.h>
+#include <asm/apic.h>
+
+int sysctl_panic_on_stackoverflow;
+
+/*
+ * Probabilistic stack overflow check:
+ *
+ * Regular device interrupts can enter on the following stacks:
+ *
+ * - User stack
+ *
+ * - Kernel task stack
+ *
+ * - Interrupt stack if a device driver reenables interrupts
+ * which should only happen in really old drivers.
+ *
+ * - Debug IST stack
+ *
+ * All other contexts are invalid.
+ */
+static inline void stack_overflow_check(struct pt_regs *regs)
+{
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+#define STACK_TOP_MARGIN 128
+ struct orig_ist *oist;
+ u64 irq_stack_top, irq_stack_bottom;
+ u64 estack_top, estack_bottom;
+ u64 curbase = (u64)task_stack_page(current);
+
+ if (user_mode(regs))
+ return;
+
+ if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
+ regs->sp <= curbase + THREAD_SIZE)
+ return;
+
+ irq_stack_top = (u64)this_cpu_ptr(irq_stack_union.irq_stack) +
+ STACK_TOP_MARGIN;
+ irq_stack_bottom = (u64)__this_cpu_read(irq_stack_ptr);
+ if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom)
+ return;
+
+ oist = this_cpu_ptr(&orig_ist);
+ estack_bottom = (u64)oist->ist[DEBUG_STACK];
+ estack_top = estack_bottom - DEBUG_STKSZ + STACK_TOP_MARGIN;
+ if (regs->sp >= estack_top && regs->sp <= estack_bottom)
+ return;
+
+ WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n",
+ current->comm, curbase, regs->sp,
+ irq_stack_top, irq_stack_bottom,
+ estack_top, estack_bottom, (void *)regs->ip);
+
+ if (sysctl_panic_on_stackoverflow)
+ panic("low stack detected by irq handler - check messages\n");
+#endif
+}
+
+bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
+{
+ stack_overflow_check(regs);
+
+ if (IS_ERR_OR_NULL(desc))
+ return false;
+
+ generic_handle_irq_desc(desc);
+ return true;
+}
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
new file mode 100644
index 000000000..80bee7695
--- /dev/null
+++ b/arch/x86/kernel/irq_work.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * x86 specific code for irq_work
+ *
+ * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
+ */
+
+#include <linux/kernel.h>
+#include <linux/irq_work.h>
+#include <linux/hardirq.h>
+#include <asm/apic.h>
+#include <asm/trace/irq_vectors.h>
+#include <linux/interrupt.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs)
+{
+ ipi_entering_ack_irq();
+ trace_irq_work_entry(IRQ_WORK_VECTOR);
+ inc_irq_stat(apic_irq_work_irqs);
+ irq_work_run();
+ trace_irq_work_exit(IRQ_WORK_VECTOR);
+ exiting_irq();
+}
+
+void arch_irq_work_raise(void)
+{
+ if (!arch_irq_work_has_interrupt())
+ return;
+
+ apic->send_IPI_self(IRQ_WORK_VECTOR);
+ apic_wait_icr_idle();
+}
+#endif
diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
new file mode 100644
index 000000000..ddeeaac8a
--- /dev/null
+++ b/arch/x86/kernel/irqflags.S
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <asm/asm.h>
+#include <asm/export.h>
+#include <linux/linkage.h>
+
+/*
+ * unsigned long native_save_fl(void)
+ */
+ENTRY(native_save_fl)
+ pushf
+ pop %_ASM_AX
+ ret
+ENDPROC(native_save_fl)
+EXPORT_SYMBOL(native_save_fl)
+
+/*
+ * void native_restore_fl(unsigned long flags)
+ * %eax/%rdi: flags
+ */
+ENTRY(native_restore_fl)
+ push %_ASM_ARG1
+ popf
+ ret
+ENDPROC(native_restore_fl)
+EXPORT_SYMBOL(native_restore_fl)
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
new file mode 100644
index 000000000..a0693b71c
--- /dev/null
+++ b/arch/x86/kernel/irqinit.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/linkage.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/timex.h>
+#include <linux/random.h>
+#include <linux/kprobes.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/device.h>
+#include <linux/bitops.h>
+#include <linux/acpi.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+
+#include <linux/atomic.h>
+#include <asm/timer.h>
+#include <asm/hw_irq.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+#include <asm/setup.h>
+#include <asm/i8259.h>
+#include <asm/traps.h>
+#include <asm/prom.h>
+
+/*
+ * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
+ * (these are usually mapped to vectors 0x30-0x3f)
+ */
+
+/*
+ * The IO-APIC gives us many more interrupt sources. Most of these
+ * are unused but an SMP system is supposed to have enough memory ...
+ * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
+ * across the spectrum, so we really want to be prepared to get all
+ * of these. Plus, more powerful systems might have more than 64
+ * IO-APIC registers.
+ *
+ * (these are usually mapped into the 0x30-0xff vector range)
+ */
+
+/*
+ * IRQ2 is cascade interrupt to second interrupt controller
+ */
+static struct irqaction irq2 = {
+ .handler = no_action,
+ .name = "cascade",
+ .flags = IRQF_NO_THREAD,
+};
+
+DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+ [0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
+};
+
+void __init init_ISA_irqs(void)
+{
+ struct irq_chip *chip = legacy_pic->chip;
+ int i;
+
+ /*
+ * Try to set up the through-local-APIC virtual wire mode earlier.
+ *
+ * On some 32-bit UP machines, whose APIC has been disabled by BIOS
+ * and then got re-enabled by "lapic", it hangs at boot time without this.
+ */
+ init_bsp_APIC();
+
+ legacy_pic->init(0);
+
+ for (i = 0; i < nr_legacy_irqs(); i++)
+ irq_set_chip_and_handler(i, chip, handle_level_irq);
+}
+
+void __init init_IRQ(void)
+{
+ int i;
+
+ /*
+ * On cpu 0, Assign ISA_IRQ_VECTOR(irq) to IRQ 0..15.
+ * If these IRQ's are handled by legacy interrupt-controllers like PIC,
+ * then this configuration will likely be static after the boot. If
+ * these IRQ's are handled by more mordern controllers like IO-APIC,
+ * then this vector space can be freed and re-used dynamically as the
+ * irq's migrate etc.
+ */
+ for (i = 0; i < nr_legacy_irqs(); i++)
+ per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i);
+
+ x86_init.irqs.intr_init();
+}
+
+void __init native_init_IRQ(void)
+{
+ /* Execute any quirks before the call gates are initialised: */
+ x86_init.irqs.pre_vector_init();
+
+ idt_setup_apic_and_irq_gates();
+ lapic_assign_system_vectors();
+
+ if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs())
+ setup_irq(2, &irq2);
+
+ irq_ctx_init(smp_processor_id());
+}
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
new file mode 100644
index 000000000..d177940aa
--- /dev/null
+++ b/arch/x86/kernel/itmt.c
@@ -0,0 +1,212 @@
+/*
+ * itmt.c: Support Intel Turbo Boost Max Technology 3.0
+ *
+ * (C) Copyright 2016 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * On platforms supporting Intel Turbo Boost Max Technology 3.0, (ITMT),
+ * the maximum turbo frequencies of some cores in a CPU package may be
+ * higher than for the other cores in the same package. In that case,
+ * better performance can be achieved by making the scheduler prefer
+ * to run tasks on the CPUs with higher max turbo frequencies.
+ *
+ * This file provides functions and data structures for enabling the
+ * scheduler to favor scheduling on cores can be boosted to a higher
+ * frequency under ITMT.
+ */
+
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/mutex.h>
+#include <linux/sysctl.h>
+#include <linux/nodemask.h>
+
+static DEFINE_MUTEX(itmt_update_mutex);
+DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
+
+/* Boolean to track if system has ITMT capabilities */
+static bool __read_mostly sched_itmt_capable;
+
+/*
+ * Boolean to control whether we want to move processes to cpu capable
+ * of higher turbo frequency for cpus supporting Intel Turbo Boost Max
+ * Technology 3.0.
+ *
+ * It can be set via /proc/sys/kernel/sched_itmt_enabled
+ */
+unsigned int __read_mostly sysctl_sched_itmt_enabled;
+
+static int sched_itmt_update_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ unsigned int old_sysctl;
+ int ret;
+
+ mutex_lock(&itmt_update_mutex);
+
+ if (!sched_itmt_capable) {
+ mutex_unlock(&itmt_update_mutex);
+ return -EINVAL;
+ }
+
+ old_sysctl = sysctl_sched_itmt_enabled;
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) {
+ x86_topology_update = true;
+ rebuild_sched_domains();
+ }
+
+ mutex_unlock(&itmt_update_mutex);
+
+ return ret;
+}
+
+static unsigned int zero;
+static unsigned int one = 1;
+static struct ctl_table itmt_kern_table[] = {
+ {
+ .procname = "sched_itmt_enabled",
+ .data = &sysctl_sched_itmt_enabled,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_itmt_update_handler,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {}
+};
+
+static struct ctl_table itmt_root_table[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = itmt_kern_table,
+ },
+ {}
+};
+
+static struct ctl_table_header *itmt_sysctl_header;
+
+/**
+ * sched_set_itmt_support() - Indicate platform supports ITMT
+ *
+ * This function is used by the OS to indicate to scheduler that the platform
+ * is capable of supporting the ITMT feature.
+ *
+ * The current scheme has the pstate driver detects if the system
+ * is ITMT capable and call sched_set_itmt_support.
+ *
+ * This must be done only after sched_set_itmt_core_prio
+ * has been called to set the cpus' priorities.
+ * It must not be called with cpu hot plug lock
+ * held as we need to acquire the lock to rebuild sched domains
+ * later.
+ *
+ * Return: 0 on success
+ */
+int sched_set_itmt_support(void)
+{
+ mutex_lock(&itmt_update_mutex);
+
+ if (sched_itmt_capable) {
+ mutex_unlock(&itmt_update_mutex);
+ return 0;
+ }
+
+ itmt_sysctl_header = register_sysctl_table(itmt_root_table);
+ if (!itmt_sysctl_header) {
+ mutex_unlock(&itmt_update_mutex);
+ return -ENOMEM;
+ }
+
+ sched_itmt_capable = true;
+
+ sysctl_sched_itmt_enabled = 1;
+
+ x86_topology_update = true;
+ rebuild_sched_domains();
+
+ mutex_unlock(&itmt_update_mutex);
+
+ return 0;
+}
+
+/**
+ * sched_clear_itmt_support() - Revoke platform's support of ITMT
+ *
+ * This function is used by the OS to indicate that it has
+ * revoked the platform's support of ITMT feature.
+ *
+ * It must not be called with cpu hot plug lock
+ * held as we need to acquire the lock to rebuild sched domains
+ * later.
+ */
+void sched_clear_itmt_support(void)
+{
+ mutex_lock(&itmt_update_mutex);
+
+ if (!sched_itmt_capable) {
+ mutex_unlock(&itmt_update_mutex);
+ return;
+ }
+ sched_itmt_capable = false;
+
+ if (itmt_sysctl_header) {
+ unregister_sysctl_table(itmt_sysctl_header);
+ itmt_sysctl_header = NULL;
+ }
+
+ if (sysctl_sched_itmt_enabled) {
+ /* disable sched_itmt if we are no longer ITMT capable */
+ sysctl_sched_itmt_enabled = 0;
+ x86_topology_update = true;
+ rebuild_sched_domains();
+ }
+
+ mutex_unlock(&itmt_update_mutex);
+}
+
+int arch_asym_cpu_priority(int cpu)
+{
+ return per_cpu(sched_core_priority, cpu);
+}
+
+/**
+ * sched_set_itmt_core_prio() - Set CPU priority based on ITMT
+ * @prio: Priority of cpu core
+ * @core_cpu: The cpu number associated with the core
+ *
+ * The pstate driver will find out the max boost frequency
+ * and call this function to set a priority proportional
+ * to the max boost frequency. CPU with higher boost
+ * frequency will receive higher priority.
+ *
+ * No need to rebuild sched domain after updating
+ * the CPU priorities. The sched domains have no
+ * dependency on CPU priorities.
+ */
+void sched_set_itmt_core_prio(int prio, int core_cpu)
+{
+ int cpu, i = 1;
+
+ for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) {
+ int smt_prio;
+
+ /*
+ * Ensure that the siblings are moved to the end
+ * of the priority chain and only used when
+ * all other high priority cpus are out of capacity.
+ */
+ smt_prio = prio * smp_num_siblings / i;
+ per_cpu(sched_core_priority, cpu) = smt_prio;
+ i++;
+ }
+}
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
new file mode 100644
index 000000000..108c48d0d
--- /dev/null
+++ b/arch/x86/kernel/jailhouse.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Jailhouse paravirt_ops implementation
+ *
+ * Copyright (c) Siemens AG, 2015-2017
+ *
+ * Authors:
+ * Jan Kiszka <jan.kiszka@siemens.com>
+ */
+
+#include <linux/acpi_pmtmr.h>
+#include <linux/kernel.h>
+#include <linux/reboot.h>
+#include <asm/apic.h>
+#include <asm/cpu.h>
+#include <asm/hypervisor.h>
+#include <asm/i8259.h>
+#include <asm/irqdomain.h>
+#include <asm/pci_x86.h>
+#include <asm/reboot.h>
+#include <asm/setup.h>
+
+static __initdata struct jailhouse_setup_data setup_data;
+static unsigned int precalibrated_tsc_khz;
+
+static uint32_t jailhouse_cpuid_base(void)
+{
+ if (boot_cpu_data.cpuid_level < 0 ||
+ !boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return 0;
+
+ return hypervisor_cpuid_base("Jailhouse\0\0\0", 0);
+}
+
+static uint32_t __init jailhouse_detect(void)
+{
+ return jailhouse_cpuid_base();
+}
+
+static void jailhouse_get_wallclock(struct timespec64 *now)
+{
+ memset(now, 0, sizeof(*now));
+}
+
+static void __init jailhouse_timer_init(void)
+{
+ lapic_timer_frequency = setup_data.apic_khz * (1000 / HZ);
+}
+
+static unsigned long jailhouse_get_tsc(void)
+{
+ return precalibrated_tsc_khz;
+}
+
+static void __init jailhouse_x2apic_init(void)
+{
+#ifdef CONFIG_X86_X2APIC
+ if (!x2apic_enabled())
+ return;
+ /*
+ * We do not have access to IR inside Jailhouse non-root cells. So
+ * we have to run in physical mode.
+ */
+ x2apic_phys = 1;
+ /*
+ * This will trigger the switch to apic_x2apic_phys. Empty OEM IDs
+ * ensure that only this APIC driver picks up the call.
+ */
+ default_acpi_madt_oem_check("", "");
+#endif
+}
+
+static void __init jailhouse_get_smp_config(unsigned int early)
+{
+ struct ioapic_domain_cfg ioapic_cfg = {
+ .type = IOAPIC_DOMAIN_STRICT,
+ .ops = &mp_ioapic_irqdomain_ops,
+ };
+ struct mpc_intsrc mp_irq = {
+ .type = MP_INTSRC,
+ .irqtype = mp_INT,
+ .irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE,
+ };
+ unsigned int cpu;
+
+ jailhouse_x2apic_init();
+
+ register_lapic_address(0xfee00000);
+
+ for (cpu = 0; cpu < setup_data.num_cpus; cpu++) {
+ generic_processor_info(setup_data.cpu_ids[cpu],
+ boot_cpu_apic_version);
+ }
+
+ smp_found_config = 1;
+
+ if (setup_data.standard_ioapic) {
+ mp_register_ioapic(0, 0xfec00000, gsi_top, &ioapic_cfg);
+
+ /* Register 1:1 mapping for legacy UART IRQs 3 and 4 */
+ mp_irq.srcbusirq = mp_irq.dstirq = 3;
+ mp_save_irq(&mp_irq);
+
+ mp_irq.srcbusirq = mp_irq.dstirq = 4;
+ mp_save_irq(&mp_irq);
+ }
+}
+
+static void jailhouse_no_restart(void)
+{
+ pr_notice("Jailhouse: Restart not supported, halting\n");
+ machine_halt();
+}
+
+static int __init jailhouse_pci_arch_init(void)
+{
+ pci_direct_init(1);
+
+ /*
+ * There are no bridges on the virtual PCI root bus under Jailhouse,
+ * thus no other way to discover all devices than a full scan.
+ * Respect any overrides via the command line, though.
+ */
+ if (pcibios_last_bus < 0)
+ pcibios_last_bus = 0xff;
+
+#ifdef CONFIG_PCI_MMCONFIG
+ if (setup_data.pci_mmconfig_base) {
+ pci_mmconfig_add(0, 0, pcibios_last_bus,
+ setup_data.pci_mmconfig_base);
+ pci_mmcfg_arch_init();
+ }
+#endif
+
+ return 0;
+}
+
+static void __init jailhouse_init_platform(void)
+{
+ u64 pa_data = boot_params.hdr.setup_data;
+ struct setup_data header;
+ void *mapping;
+
+ x86_init.irqs.pre_vector_init = x86_init_noop;
+ x86_init.timers.timer_init = jailhouse_timer_init;
+ x86_init.mpparse.get_smp_config = jailhouse_get_smp_config;
+ x86_init.pci.arch_init = jailhouse_pci_arch_init;
+
+ x86_platform.calibrate_cpu = jailhouse_get_tsc;
+ x86_platform.calibrate_tsc = jailhouse_get_tsc;
+ x86_platform.get_wallclock = jailhouse_get_wallclock;
+ x86_platform.legacy.rtc = 0;
+ x86_platform.legacy.warm_reset = 0;
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
+
+ legacy_pic = &null_legacy_pic;
+
+ machine_ops.emergency_restart = jailhouse_no_restart;
+
+ while (pa_data) {
+ mapping = early_memremap(pa_data, sizeof(header));
+ memcpy(&header, mapping, sizeof(header));
+ early_memunmap(mapping, sizeof(header));
+
+ if (header.type == SETUP_JAILHOUSE &&
+ header.len >= sizeof(setup_data)) {
+ pa_data += offsetof(struct setup_data, data);
+
+ mapping = early_memremap(pa_data, sizeof(setup_data));
+ memcpy(&setup_data, mapping, sizeof(setup_data));
+ early_memunmap(mapping, sizeof(setup_data));
+
+ break;
+ }
+
+ pa_data = header.next;
+ }
+
+ if (!pa_data)
+ panic("Jailhouse: No valid setup data found");
+
+ if (setup_data.compatible_version > JAILHOUSE_SETUP_REQUIRED_VERSION)
+ panic("Jailhouse: Unsupported setup data structure");
+
+ pmtmr_ioport = setup_data.pm_timer_address;
+ pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport);
+
+ precalibrated_tsc_khz = setup_data.tsc_khz;
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+ pci_probe = 0;
+
+ /*
+ * Avoid that the kernel complains about missing ACPI tables - there
+ * are none in a non-root cell.
+ */
+ disable_acpi();
+}
+
+bool jailhouse_paravirt(void)
+{
+ return jailhouse_cpuid_base() != 0;
+}
+
+static bool jailhouse_x2apic_available(void)
+{
+ /*
+ * The x2APIC is only available if the root cell enabled it. Jailhouse
+ * does not support switching between xAPIC and x2APIC.
+ */
+ return x2apic_enabled();
+}
+
+const struct hypervisor_x86 x86_hyper_jailhouse __refconst = {
+ .name = "Jailhouse",
+ .detect = jailhouse_detect,
+ .init.init_platform = jailhouse_init_platform,
+ .init.x2apic_available = jailhouse_x2apic_available,
+};
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
new file mode 100644
index 000000000..4c3d9a3d4
--- /dev/null
+++ b/arch/x86/kernel/jump_label.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * jump label x86 support
+ *
+ * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
+ *
+ */
+#include <linux/jump_label.h>
+#include <linux/memory.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/cpu.h>
+#include <asm/kprobes.h>
+#include <asm/alternative.h>
+#include <asm/text-patching.h>
+
+union jump_code_union {
+ char code[JUMP_LABEL_NOP_SIZE];
+ struct {
+ char jump;
+ int offset;
+ } __attribute__((packed));
+};
+
+static void bug_at(unsigned char *ip, int line)
+{
+ /*
+ * The location is not an op that we were expecting.
+ * Something went wrong. Crash the box, as something could be
+ * corrupting the kernel.
+ */
+ pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph) %d\n", ip, ip, ip, line);
+ BUG();
+}
+
+static void __ref __jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type,
+ void *(*poker)(void *, const void *, size_t),
+ int init)
+{
+ union jump_code_union code;
+ const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
+ const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
+
+ if (early_boot_irqs_disabled)
+ poker = text_poke_early;
+
+ if (type == JUMP_LABEL_JMP) {
+ if (init) {
+ /*
+ * Jump label is enabled for the first time.
+ * So we expect a default_nop...
+ */
+ if (unlikely(memcmp((void *)entry->code, default_nop, 5)
+ != 0))
+ bug_at((void *)entry->code, __LINE__);
+ } else {
+ /*
+ * ...otherwise expect an ideal_nop. Otherwise
+ * something went horribly wrong.
+ */
+ if (unlikely(memcmp((void *)entry->code, ideal_nop, 5)
+ != 0))
+ bug_at((void *)entry->code, __LINE__);
+ }
+
+ code.jump = 0xe9;
+ code.offset = entry->target -
+ (entry->code + JUMP_LABEL_NOP_SIZE);
+ } else {
+ /*
+ * We are disabling this jump label. If it is not what
+ * we think it is, then something must have gone wrong.
+ * If this is the first initialization call, then we
+ * are converting the default nop to the ideal nop.
+ */
+ if (init) {
+ if (unlikely(memcmp((void *)entry->code, default_nop, 5) != 0))
+ bug_at((void *)entry->code, __LINE__);
+ } else {
+ code.jump = 0xe9;
+ code.offset = entry->target -
+ (entry->code + JUMP_LABEL_NOP_SIZE);
+ if (unlikely(memcmp((void *)entry->code, &code, 5) != 0))
+ bug_at((void *)entry->code, __LINE__);
+ }
+ memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE);
+ }
+
+ /*
+ * Make text_poke_bp() a default fallback poker.
+ *
+ * At the time the change is being done, just ignore whether we
+ * are doing nop -> jump or jump -> nop transition, and assume
+ * always nop being the 'currently valid' instruction
+ *
+ */
+ if (poker)
+ (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
+ else
+ text_poke_bp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE,
+ (void *)entry->code + JUMP_LABEL_NOP_SIZE);
+}
+
+void arch_jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type)
+{
+ mutex_lock(&text_mutex);
+ __jump_label_transform(entry, type, NULL, 0);
+ mutex_unlock(&text_mutex);
+}
+
+static enum {
+ JL_STATE_START,
+ JL_STATE_NO_UPDATE,
+ JL_STATE_UPDATE,
+} jlstate __initdata_or_module = JL_STATE_START;
+
+__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
+ enum jump_label_type type)
+{
+ /*
+ * This function is called at boot up and when modules are
+ * first loaded. Check if the default nop, the one that is
+ * inserted at compile time, is the ideal nop. If it is, then
+ * we do not need to update the nop, and we can leave it as is.
+ * If it is not, then we need to update the nop to the ideal nop.
+ */
+ if (jlstate == JL_STATE_START) {
+ const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
+ const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
+
+ if (memcmp(ideal_nop, default_nop, 5) != 0)
+ jlstate = JL_STATE_UPDATE;
+ else
+ jlstate = JL_STATE_NO_UPDATE;
+ }
+ if (jlstate == JL_STATE_UPDATE)
+ __jump_label_transform(entry, type, text_poke_early, 1);
+}
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
new file mode 100644
index 000000000..fd6f8fbbe
--- /dev/null
+++ b/arch/x86/kernel/kdebugfs.c
@@ -0,0 +1,202 @@
+/*
+ * Architecture specific debugfs files
+ *
+ * Copyright (C) 2007, Intel Corp.
+ * Huang Ying <ying.huang@intel.com>
+ *
+ * This file is released under the GPLv2.
+ */
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/stat.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+
+#include <asm/setup.h>
+
+struct dentry *arch_debugfs_dir;
+EXPORT_SYMBOL(arch_debugfs_dir);
+
+#ifdef CONFIG_DEBUG_BOOT_PARAMS
+struct setup_data_node {
+ u64 paddr;
+ u32 type;
+ u32 len;
+};
+
+static ssize_t setup_data_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct setup_data_node *node = file->private_data;
+ unsigned long remain;
+ loff_t pos = *ppos;
+ void *p;
+ u64 pa;
+
+ if (pos < 0)
+ return -EINVAL;
+
+ if (pos >= node->len)
+ return 0;
+
+ if (count > node->len - pos)
+ count = node->len - pos;
+
+ pa = node->paddr + sizeof(struct setup_data) + pos;
+ p = memremap(pa, count, MEMREMAP_WB);
+ if (!p)
+ return -ENOMEM;
+
+ remain = copy_to_user(user_buf, p, count);
+
+ memunmap(p);
+
+ if (remain)
+ return -EFAULT;
+
+ *ppos = pos + count;
+
+ return count;
+}
+
+static const struct file_operations fops_setup_data = {
+ .read = setup_data_read,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static int __init
+create_setup_data_node(struct dentry *parent, int no,
+ struct setup_data_node *node)
+{
+ struct dentry *d, *type, *data;
+ char buf[16];
+
+ sprintf(buf, "%d", no);
+ d = debugfs_create_dir(buf, parent);
+ if (!d)
+ return -ENOMEM;
+
+ type = debugfs_create_x32("type", S_IRUGO, d, &node->type);
+ if (!type)
+ goto err_dir;
+
+ data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data);
+ if (!data)
+ goto err_type;
+
+ return 0;
+
+err_type:
+ debugfs_remove(type);
+err_dir:
+ debugfs_remove(d);
+ return -ENOMEM;
+}
+
+static int __init create_setup_data_nodes(struct dentry *parent)
+{
+ struct setup_data_node *node;
+ struct setup_data *data;
+ int error;
+ struct dentry *d;
+ u64 pa_data;
+ int no = 0;
+
+ d = debugfs_create_dir("setup_data", parent);
+ if (!d)
+ return -ENOMEM;
+
+ pa_data = boot_params.hdr.setup_data;
+
+ while (pa_data) {
+ node = kmalloc(sizeof(*node), GFP_KERNEL);
+ if (!node) {
+ error = -ENOMEM;
+ goto err_dir;
+ }
+
+ data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
+ if (!data) {
+ kfree(node);
+ error = -ENOMEM;
+ goto err_dir;
+ }
+
+ node->paddr = pa_data;
+ node->type = data->type;
+ node->len = data->len;
+ error = create_setup_data_node(d, no, node);
+ pa_data = data->next;
+
+ memunmap(data);
+ if (error)
+ goto err_dir;
+ no++;
+ }
+
+ return 0;
+
+err_dir:
+ debugfs_remove(d);
+ return error;
+}
+
+static struct debugfs_blob_wrapper boot_params_blob = {
+ .data = &boot_params,
+ .size = sizeof(boot_params),
+};
+
+static int __init boot_params_kdebugfs_init(void)
+{
+ struct dentry *dbp, *version, *data;
+ int error = -ENOMEM;
+
+ dbp = debugfs_create_dir("boot_params", arch_debugfs_dir);
+ if (!dbp)
+ return -ENOMEM;
+
+ version = debugfs_create_x16("version", S_IRUGO, dbp,
+ &boot_params.hdr.version);
+ if (!version)
+ goto err_dir;
+
+ data = debugfs_create_blob("data", S_IRUGO, dbp,
+ &boot_params_blob);
+ if (!data)
+ goto err_version;
+
+ error = create_setup_data_nodes(dbp);
+ if (error)
+ goto err_data;
+
+ return 0;
+
+err_data:
+ debugfs_remove(data);
+err_version:
+ debugfs_remove(version);
+err_dir:
+ debugfs_remove(dbp);
+ return error;
+}
+#endif /* CONFIG_DEBUG_BOOT_PARAMS */
+
+static int __init arch_kdebugfs_init(void)
+{
+ int error = 0;
+
+ arch_debugfs_dir = debugfs_create_dir("x86", NULL);
+ if (!arch_debugfs_dir)
+ return -ENOMEM;
+
+#ifdef CONFIG_DEBUG_BOOT_PARAMS
+ error = boot_params_kdebugfs_init();
+#endif
+
+ return error;
+}
+arch_initcall(arch_kdebugfs_init);
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
new file mode 100644
index 000000000..273687986
--- /dev/null
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -0,0 +1,549 @@
+/*
+ * Kexec bzImage loader
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#define pr_fmt(fmt) "kexec-bzImage64: " fmt
+
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/efi.h>
+#include <linux/verification.h>
+
+#include <asm/bootparam.h>
+#include <asm/setup.h>
+#include <asm/crash.h>
+#include <asm/efi.h>
+#include <asm/e820/api.h>
+#include <asm/kexec-bzimage64.h>
+
+#define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */
+
+/*
+ * Defines lowest physical address for various segments. Not sure where
+ * exactly these limits came from. Current bzimage64 loader in kexec-tools
+ * uses these so I am retaining it. It can be changed over time as we gain
+ * more insight.
+ */
+#define MIN_PURGATORY_ADDR 0x3000
+#define MIN_BOOTPARAM_ADDR 0x3000
+#define MIN_KERNEL_LOAD_ADDR 0x100000
+#define MIN_INITRD_LOAD_ADDR 0x1000000
+
+/*
+ * This is a place holder for all boot loader specific data structure which
+ * gets allocated in one call but gets freed much later during cleanup
+ * time. Right now there is only one field but it can grow as need be.
+ */
+struct bzimage64_data {
+ /*
+ * Temporary buffer to hold bootparams buffer. This should be
+ * freed once the bootparam segment has been loaded.
+ */
+ void *bootparams_buf;
+};
+
+static int setup_initrd(struct boot_params *params,
+ unsigned long initrd_load_addr, unsigned long initrd_len)
+{
+ params->hdr.ramdisk_image = initrd_load_addr & 0xffffffffUL;
+ params->hdr.ramdisk_size = initrd_len & 0xffffffffUL;
+
+ params->ext_ramdisk_image = initrd_load_addr >> 32;
+ params->ext_ramdisk_size = initrd_len >> 32;
+
+ return 0;
+}
+
+static int setup_cmdline(struct kimage *image, struct boot_params *params,
+ unsigned long bootparams_load_addr,
+ unsigned long cmdline_offset, char *cmdline,
+ unsigned long cmdline_len)
+{
+ char *cmdline_ptr = ((char *)params) + cmdline_offset;
+ unsigned long cmdline_ptr_phys, len = 0;
+ uint32_t cmdline_low_32, cmdline_ext_32;
+
+ if (image->type == KEXEC_TYPE_CRASH) {
+ len = sprintf(cmdline_ptr,
+ "elfcorehdr=0x%lx ", image->arch.elf_load_addr);
+ }
+ memcpy(cmdline_ptr + len, cmdline, cmdline_len);
+ cmdline_len += len;
+
+ cmdline_ptr[cmdline_len - 1] = '\0';
+
+ pr_debug("Final command line is: %s\n", cmdline_ptr);
+ cmdline_ptr_phys = bootparams_load_addr + cmdline_offset;
+ cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL;
+ cmdline_ext_32 = cmdline_ptr_phys >> 32;
+
+ params->hdr.cmd_line_ptr = cmdline_low_32;
+ if (cmdline_ext_32)
+ params->ext_cmd_line_ptr = cmdline_ext_32;
+
+ return 0;
+}
+
+static int setup_e820_entries(struct boot_params *params)
+{
+ unsigned int nr_e820_entries;
+
+ nr_e820_entries = e820_table_kexec->nr_entries;
+
+ /* TODO: Pass entries more than E820_MAX_ENTRIES_ZEROPAGE in bootparams setup data */
+ if (nr_e820_entries > E820_MAX_ENTRIES_ZEROPAGE)
+ nr_e820_entries = E820_MAX_ENTRIES_ZEROPAGE;
+
+ params->e820_entries = nr_e820_entries;
+ memcpy(&params->e820_table, &e820_table_kexec->entries, nr_e820_entries*sizeof(struct e820_entry));
+
+ return 0;
+}
+
+#ifdef CONFIG_EFI
+static int setup_efi_info_memmap(struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_map_offset,
+ unsigned int efi_map_sz)
+{
+ void *efi_map = (void *)params + efi_map_offset;
+ unsigned long efi_map_phys_addr = params_load_addr + efi_map_offset;
+ struct efi_info *ei = &params->efi_info;
+
+ if (!efi_map_sz)
+ return 0;
+
+ efi_runtime_map_copy(efi_map, efi_map_sz);
+
+ ei->efi_memmap = efi_map_phys_addr & 0xffffffff;
+ ei->efi_memmap_hi = efi_map_phys_addr >> 32;
+ ei->efi_memmap_size = efi_map_sz;
+
+ return 0;
+}
+
+static int
+prepare_add_efi_setup_data(struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_setup_data_offset)
+{
+ unsigned long setup_data_phys;
+ struct setup_data *sd = (void *)params + efi_setup_data_offset;
+ struct efi_setup_data *esd = (void *)sd + sizeof(struct setup_data);
+
+ esd->fw_vendor = efi.fw_vendor;
+ esd->runtime = efi.runtime;
+ esd->tables = efi.config_table;
+ esd->smbios = efi.smbios;
+
+ sd->type = SETUP_EFI;
+ sd->len = sizeof(struct efi_setup_data);
+
+ /* Add setup data */
+ setup_data_phys = params_load_addr + efi_setup_data_offset;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = setup_data_phys;
+
+ return 0;
+}
+
+static int
+setup_efi_state(struct boot_params *params, unsigned long params_load_addr,
+ unsigned int efi_map_offset, unsigned int efi_map_sz,
+ unsigned int efi_setup_data_offset)
+{
+ struct efi_info *current_ei = &boot_params.efi_info;
+ struct efi_info *ei = &params->efi_info;
+
+ if (!efi_enabled(EFI_RUNTIME_SERVICES))
+ return 0;
+
+ if (!current_ei->efi_memmap_size)
+ return 0;
+
+ /*
+ * If 1:1 mapping is not enabled, second kernel can not setup EFI
+ * and use EFI run time services. User space will have to pass
+ * acpi_rsdp=<addr> on kernel command line to make second kernel boot
+ * without efi.
+ */
+ if (efi_enabled(EFI_OLD_MEMMAP))
+ return 0;
+
+ ei->efi_loader_signature = current_ei->efi_loader_signature;
+ ei->efi_systab = current_ei->efi_systab;
+ ei->efi_systab_hi = current_ei->efi_systab_hi;
+
+ ei->efi_memdesc_version = current_ei->efi_memdesc_version;
+ ei->efi_memdesc_size = efi_get_runtime_map_desc_size();
+
+ setup_efi_info_memmap(params, params_load_addr, efi_map_offset,
+ efi_map_sz);
+ prepare_add_efi_setup_data(params, params_load_addr,
+ efi_setup_data_offset);
+ return 0;
+}
+#endif /* CONFIG_EFI */
+
+static int
+setup_boot_parameters(struct kimage *image, struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_map_offset, unsigned int efi_map_sz,
+ unsigned int efi_setup_data_offset)
+{
+ unsigned int nr_e820_entries;
+ unsigned long long mem_k, start, end;
+ int i, ret = 0;
+
+ /* Get subarch from existing bootparams */
+ params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch;
+
+ /* Copying screen_info will do? */
+ memcpy(&params->screen_info, &screen_info, sizeof(struct screen_info));
+
+ /* Fill in memsize later */
+ params->screen_info.ext_mem_k = 0;
+ params->alt_mem_k = 0;
+
+ /* Default APM info */
+ memset(&params->apm_bios_info, 0, sizeof(params->apm_bios_info));
+
+ /* Default drive info */
+ memset(&params->hd0_info, 0, sizeof(params->hd0_info));
+ memset(&params->hd1_info, 0, sizeof(params->hd1_info));
+
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = crash_setup_memmap_entries(image, params);
+ if (ret)
+ return ret;
+ } else
+ setup_e820_entries(params);
+
+ nr_e820_entries = params->e820_entries;
+
+ for (i = 0; i < nr_e820_entries; i++) {
+ if (params->e820_table[i].type != E820_TYPE_RAM)
+ continue;
+ start = params->e820_table[i].addr;
+ end = params->e820_table[i].addr + params->e820_table[i].size - 1;
+
+ if ((start <= 0x100000) && end > 0x100000) {
+ mem_k = (end >> 10) - (0x100000 >> 10);
+ params->screen_info.ext_mem_k = mem_k;
+ params->alt_mem_k = mem_k;
+ if (mem_k > 0xfc00)
+ params->screen_info.ext_mem_k = 0xfc00; /* 64M*/
+ if (mem_k > 0xffffffff)
+ params->alt_mem_k = 0xffffffff;
+ }
+ }
+
+#ifdef CONFIG_EFI
+ /* Setup EFI state */
+ setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz,
+ efi_setup_data_offset);
+#endif
+
+ /* Setup EDD info */
+ memcpy(params->eddbuf, boot_params.eddbuf,
+ EDDMAXNR * sizeof(struct edd_info));
+ params->eddbuf_entries = boot_params.eddbuf_entries;
+
+ memcpy(params->edd_mbr_sig_buffer, boot_params.edd_mbr_sig_buffer,
+ EDD_MBR_SIG_MAX * sizeof(unsigned int));
+
+ return ret;
+}
+
+static int bzImage64_probe(const char *buf, unsigned long len)
+{
+ int ret = -ENOEXEC;
+ struct setup_header *header;
+
+ /* kernel should be at least two sectors long */
+ if (len < 2 * 512) {
+ pr_err("File is too short to be a bzImage\n");
+ return ret;
+ }
+
+ header = (struct setup_header *)(buf + offsetof(struct boot_params, hdr));
+ if (memcmp((char *)&header->header, "HdrS", 4) != 0) {
+ pr_err("Not a bzImage\n");
+ return ret;
+ }
+
+ if (header->boot_flag != 0xAA55) {
+ pr_err("No x86 boot sector present\n");
+ return ret;
+ }
+
+ if (header->version < 0x020C) {
+ pr_err("Must be at least protocol version 2.12\n");
+ return ret;
+ }
+
+ if (!(header->loadflags & LOADED_HIGH)) {
+ pr_err("zImage not a bzImage\n");
+ return ret;
+ }
+
+ if (!(header->xloadflags & XLF_KERNEL_64)) {
+ pr_err("Not a bzImage64. XLF_KERNEL_64 is not set.\n");
+ return ret;
+ }
+
+ if (!(header->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G)) {
+ pr_err("XLF_CAN_BE_LOADED_ABOVE_4G is not set.\n");
+ return ret;
+ }
+
+ /*
+ * Can't handle 32bit EFI as it does not allow loading kernel
+ * above 4G. This should be handled by 32bit bzImage loader
+ */
+ if (efi_enabled(EFI_RUNTIME_SERVICES) && !efi_enabled(EFI_64BIT)) {
+ pr_debug("EFI is 32 bit. Can't load kernel above 4G.\n");
+ return ret;
+ }
+
+ /* I've got a bzImage */
+ pr_debug("It's a relocatable bzImage64\n");
+ ret = 0;
+
+ return ret;
+}
+
+static void *bzImage64_load(struct kimage *image, char *kernel,
+ unsigned long kernel_len, char *initrd,
+ unsigned long initrd_len, char *cmdline,
+ unsigned long cmdline_len)
+{
+
+ struct setup_header *header;
+ int setup_sects, kern16_size, ret = 0;
+ unsigned long setup_header_size, params_cmdline_sz;
+ struct boot_params *params;
+ unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr;
+ struct bzimage64_data *ldata;
+ struct kexec_entry64_regs regs64;
+ void *stack;
+ unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr);
+ unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset;
+ struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX,
+ .top_down = true };
+ struct kexec_buf pbuf = { .image = image, .buf_min = MIN_PURGATORY_ADDR,
+ .buf_max = ULONG_MAX, .top_down = true };
+
+ header = (struct setup_header *)(kernel + setup_hdr_offset);
+ setup_sects = header->setup_sects;
+ if (setup_sects == 0)
+ setup_sects = 4;
+
+ kern16_size = (setup_sects + 1) * 512;
+ if (kernel_len < kern16_size) {
+ pr_err("bzImage truncated\n");
+ return ERR_PTR(-ENOEXEC);
+ }
+
+ if (cmdline_len > header->cmdline_size) {
+ pr_err("Kernel command line too long\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ /*
+ * In case of crash dump, we will append elfcorehdr=<addr> to
+ * command line. Make sure it does not overflow
+ */
+ if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) {
+ pr_debug("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* Allocate and load backup region */
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = crash_load_segments(image);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ /*
+ * Load purgatory. For 64bit entry point, purgatory code can be
+ * anywhere.
+ */
+ ret = kexec_load_purgatory(image, &pbuf);
+ if (ret) {
+ pr_err("Loading purgatory failed\n");
+ return ERR_PTR(ret);
+ }
+
+ pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem);
+
+
+ /*
+ * Load Bootparams and cmdline and space for efi stuff.
+ *
+ * Allocate memory together for multiple data structures so
+ * that they all can go in single area/segment and we don't
+ * have to create separate segment for each. Keeps things
+ * little bit simple
+ */
+ efi_map_sz = efi_get_runtime_map_size();
+ params_cmdline_sz = sizeof(struct boot_params) + cmdline_len +
+ MAX_ELFCOREHDR_STR_LEN;
+ params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
+ kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) +
+ sizeof(struct setup_data) +
+ sizeof(struct efi_setup_data);
+
+ params = kzalloc(kbuf.bufsz, GFP_KERNEL);
+ if (!params)
+ return ERR_PTR(-ENOMEM);
+ efi_map_offset = params_cmdline_sz;
+ efi_setup_data_offset = efi_map_offset + ALIGN(efi_map_sz, 16);
+
+ /* Copy setup header onto bootparams. Documentation/x86/boot.txt */
+ setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset;
+
+ /* Is there a limit on setup header size? */
+ memcpy(&params->hdr, (kernel + setup_hdr_offset), setup_header_size);
+
+ kbuf.buffer = params;
+ kbuf.memsz = kbuf.bufsz;
+ kbuf.buf_align = 16;
+ kbuf.buf_min = MIN_BOOTPARAM_ADDR;
+ ret = kexec_add_buffer(&kbuf);
+ if (ret)
+ goto out_free_params;
+ bootparam_load_addr = kbuf.mem;
+ pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ bootparam_load_addr, kbuf.bufsz, kbuf.bufsz);
+
+ /* Load kernel */
+ kbuf.buffer = kernel + kern16_size;
+ kbuf.bufsz = kernel_len - kern16_size;
+ kbuf.memsz = PAGE_ALIGN(header->init_size);
+ kbuf.buf_align = header->kernel_alignment;
+ kbuf.buf_min = MIN_KERNEL_LOAD_ADDR;
+ ret = kexec_add_buffer(&kbuf);
+ if (ret)
+ goto out_free_params;
+ kernel_load_addr = kbuf.mem;
+
+ pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ kernel_load_addr, kbuf.bufsz, kbuf.memsz);
+
+ /* Load initrd high */
+ if (initrd) {
+ kbuf.buffer = initrd;
+ kbuf.bufsz = kbuf.memsz = initrd_len;
+ kbuf.buf_align = PAGE_SIZE;
+ kbuf.buf_min = MIN_INITRD_LOAD_ADDR;
+ ret = kexec_add_buffer(&kbuf);
+ if (ret)
+ goto out_free_params;
+ initrd_load_addr = kbuf.mem;
+
+ pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ initrd_load_addr, initrd_len, initrd_len);
+
+ setup_initrd(params, initrd_load_addr, initrd_len);
+ }
+
+ setup_cmdline(image, params, bootparam_load_addr,
+ sizeof(struct boot_params), cmdline, cmdline_len);
+
+ /* bootloader info. Do we need a separate ID for kexec kernel loader? */
+ params->hdr.type_of_loader = 0x0D << 4;
+ params->hdr.loadflags = 0;
+
+ /* Setup purgatory regs for entry */
+ ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+ sizeof(regs64), 1);
+ if (ret)
+ goto out_free_params;
+
+ regs64.rbx = 0; /* Bootstrap Processor */
+ regs64.rsi = bootparam_load_addr;
+ regs64.rip = kernel_load_addr + 0x200;
+ stack = kexec_purgatory_get_symbol_addr(image, "stack_end");
+ if (IS_ERR(stack)) {
+ pr_err("Could not find address of symbol stack_end\n");
+ ret = -EINVAL;
+ goto out_free_params;
+ }
+
+ regs64.rsp = (unsigned long)stack;
+ ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+ sizeof(regs64), 0);
+ if (ret)
+ goto out_free_params;
+
+ ret = setup_boot_parameters(image, params, bootparam_load_addr,
+ efi_map_offset, efi_map_sz,
+ efi_setup_data_offset);
+ if (ret)
+ goto out_free_params;
+
+ /* Allocate loader specific data */
+ ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL);
+ if (!ldata) {
+ ret = -ENOMEM;
+ goto out_free_params;
+ }
+
+ /*
+ * Store pointer to params so that it could be freed after loading
+ * params segment has been loaded and contents have been copied
+ * somewhere else.
+ */
+ ldata->bootparams_buf = params;
+ return ldata;
+
+out_free_params:
+ kfree(params);
+ return ERR_PTR(ret);
+}
+
+/* This cleanup function is called after various segments have been loaded */
+static int bzImage64_cleanup(void *loader_data)
+{
+ struct bzimage64_data *ldata = loader_data;
+
+ if (!ldata)
+ return 0;
+
+ kfree(ldata->bootparams_buf);
+ ldata->bootparams_buf = NULL;
+
+ return 0;
+}
+
+#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
+static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
+{
+ return verify_pefile_signature(kernel, kernel_len,
+ VERIFY_USE_SECONDARY_KEYRING,
+ VERIFYING_KEXEC_PE_SIGNATURE);
+}
+#endif
+
+const struct kexec_file_ops kexec_bzImage64_ops = {
+ .probe = bzImage64_probe,
+ .load = bzImage64_load,
+ .cleanup = bzImage64_cleanup,
+#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
+ .verify_sig = bzImage64_verify_sig,
+#endif
+};
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
new file mode 100644
index 000000000..904e18bb3
--- /dev/null
+++ b/arch/x86/kernel/kgdb.c
@@ -0,0 +1,816 @@
+/*
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ */
+
+/*
+ * Copyright (C) 2004 Amit S. Kale <amitkale@linsyssoft.com>
+ * Copyright (C) 2000-2001 VERITAS Software Corporation.
+ * Copyright (C) 2002 Andi Kleen, SuSE Labs
+ * Copyright (C) 2004 LinSysSoft Technologies Pvt. Ltd.
+ * Copyright (C) 2007 MontaVista Software, Inc.
+ * Copyright (C) 2007-2008 Jason Wessel, Wind River Systems, Inc.
+ */
+/****************************************************************************
+ * Contributor: Lake Stevens Instrument Division$
+ * Written by: Glenn Engel $
+ * Updated by: Amit Kale<akale@veritas.com>
+ * Updated by: Tom Rini <trini@kernel.crashing.org>
+ * Updated by: Jason Wessel <jason.wessel@windriver.com>
+ * Modified for 386 by Jim Kingdon, Cygnus Support.
+ * Origianl kgdb, compatibility with 2.1.xx kernel by
+ * David Grothe <dave@gcom.com>
+ * Integrated into 2.2.5 kernel by Tigran Aivazian <tigran@sco.com>
+ * X86_64 changes from Andi Kleen's patch merged by Jim Houston
+ */
+#include <linux/spinlock.h>
+#include <linux/kdebug.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ptrace.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/kgdb.h>
+#include <linux/smp.h>
+#include <linux/nmi.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/uaccess.h>
+#include <linux/memory.h>
+
+#include <asm/text-patching.h>
+#include <asm/debugreg.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/nmi.h>
+#include <asm/switch_to.h>
+
+struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
+{
+#ifdef CONFIG_X86_32
+ { "ax", 4, offsetof(struct pt_regs, ax) },
+ { "cx", 4, offsetof(struct pt_regs, cx) },
+ { "dx", 4, offsetof(struct pt_regs, dx) },
+ { "bx", 4, offsetof(struct pt_regs, bx) },
+ { "sp", 4, offsetof(struct pt_regs, sp) },
+ { "bp", 4, offsetof(struct pt_regs, bp) },
+ { "si", 4, offsetof(struct pt_regs, si) },
+ { "di", 4, offsetof(struct pt_regs, di) },
+ { "ip", 4, offsetof(struct pt_regs, ip) },
+ { "flags", 4, offsetof(struct pt_regs, flags) },
+ { "cs", 4, offsetof(struct pt_regs, cs) },
+ { "ss", 4, offsetof(struct pt_regs, ss) },
+ { "ds", 4, offsetof(struct pt_regs, ds) },
+ { "es", 4, offsetof(struct pt_regs, es) },
+#else
+ { "ax", 8, offsetof(struct pt_regs, ax) },
+ { "bx", 8, offsetof(struct pt_regs, bx) },
+ { "cx", 8, offsetof(struct pt_regs, cx) },
+ { "dx", 8, offsetof(struct pt_regs, dx) },
+ { "si", 8, offsetof(struct pt_regs, si) },
+ { "di", 8, offsetof(struct pt_regs, di) },
+ { "bp", 8, offsetof(struct pt_regs, bp) },
+ { "sp", 8, offsetof(struct pt_regs, sp) },
+ { "r8", 8, offsetof(struct pt_regs, r8) },
+ { "r9", 8, offsetof(struct pt_regs, r9) },
+ { "r10", 8, offsetof(struct pt_regs, r10) },
+ { "r11", 8, offsetof(struct pt_regs, r11) },
+ { "r12", 8, offsetof(struct pt_regs, r12) },
+ { "r13", 8, offsetof(struct pt_regs, r13) },
+ { "r14", 8, offsetof(struct pt_regs, r14) },
+ { "r15", 8, offsetof(struct pt_regs, r15) },
+ { "ip", 8, offsetof(struct pt_regs, ip) },
+ { "flags", 4, offsetof(struct pt_regs, flags) },
+ { "cs", 4, offsetof(struct pt_regs, cs) },
+ { "ss", 4, offsetof(struct pt_regs, ss) },
+ { "ds", 4, -1 },
+ { "es", 4, -1 },
+#endif
+ { "fs", 4, -1 },
+ { "gs", 4, -1 },
+};
+
+int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
+{
+ if (
+#ifdef CONFIG_X86_32
+ regno == GDB_SS || regno == GDB_FS || regno == GDB_GS ||
+#endif
+ regno == GDB_SP || regno == GDB_ORIG_AX)
+ return 0;
+
+ if (dbg_reg_def[regno].offset != -1)
+ memcpy((void *)regs + dbg_reg_def[regno].offset, mem,
+ dbg_reg_def[regno].size);
+ return 0;
+}
+
+char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
+{
+ if (regno == GDB_ORIG_AX) {
+ memcpy(mem, &regs->orig_ax, sizeof(regs->orig_ax));
+ return "orig_ax";
+ }
+ if (regno >= DBG_MAX_REG_NUM || regno < 0)
+ return NULL;
+
+ if (dbg_reg_def[regno].offset != -1)
+ memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
+ dbg_reg_def[regno].size);
+
+#ifdef CONFIG_X86_32
+ switch (regno) {
+ case GDB_SS:
+ if (!user_mode(regs))
+ *(unsigned long *)mem = __KERNEL_DS;
+ break;
+ case GDB_SP:
+ if (!user_mode(regs))
+ *(unsigned long *)mem = kernel_stack_pointer(regs);
+ break;
+ case GDB_GS:
+ case GDB_FS:
+ *(unsigned long *)mem = 0xFFFF;
+ break;
+ }
+#endif
+ return dbg_reg_def[regno].name;
+}
+
+/**
+ * sleeping_thread_to_gdb_regs - Convert ptrace regs to GDB regs
+ * @gdb_regs: A pointer to hold the registers in the order GDB wants.
+ * @p: The &struct task_struct of the desired process.
+ *
+ * Convert the register values of the sleeping process in @p to
+ * the format that GDB expects.
+ * This function is called when kgdb does not have access to the
+ * &struct pt_regs and therefore it should fill the gdb registers
+ * @gdb_regs with what has been saved in &struct thread_struct
+ * thread field during switch_to.
+ */
+void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
+{
+#ifndef CONFIG_X86_32
+ u32 *gdb_regs32 = (u32 *)gdb_regs;
+#endif
+ gdb_regs[GDB_AX] = 0;
+ gdb_regs[GDB_BX] = 0;
+ gdb_regs[GDB_CX] = 0;
+ gdb_regs[GDB_DX] = 0;
+ gdb_regs[GDB_SI] = 0;
+ gdb_regs[GDB_DI] = 0;
+ gdb_regs[GDB_BP] = ((struct inactive_task_frame *)p->thread.sp)->bp;
+#ifdef CONFIG_X86_32
+ gdb_regs[GDB_DS] = __KERNEL_DS;
+ gdb_regs[GDB_ES] = __KERNEL_DS;
+ gdb_regs[GDB_PS] = 0;
+ gdb_regs[GDB_CS] = __KERNEL_CS;
+ gdb_regs[GDB_SS] = __KERNEL_DS;
+ gdb_regs[GDB_FS] = 0xFFFF;
+ gdb_regs[GDB_GS] = 0xFFFF;
+#else
+ gdb_regs32[GDB_PS] = 0;
+ gdb_regs32[GDB_CS] = __KERNEL_CS;
+ gdb_regs32[GDB_SS] = __KERNEL_DS;
+ gdb_regs[GDB_R8] = 0;
+ gdb_regs[GDB_R9] = 0;
+ gdb_regs[GDB_R10] = 0;
+ gdb_regs[GDB_R11] = 0;
+ gdb_regs[GDB_R12] = 0;
+ gdb_regs[GDB_R13] = 0;
+ gdb_regs[GDB_R14] = 0;
+ gdb_regs[GDB_R15] = 0;
+#endif
+ gdb_regs[GDB_PC] = 0;
+ gdb_regs[GDB_SP] = p->thread.sp;
+}
+
+static struct hw_breakpoint {
+ unsigned enabled;
+ unsigned long addr;
+ int len;
+ int type;
+ struct perf_event * __percpu *pev;
+} breakinfo[HBP_NUM];
+
+static unsigned long early_dr7;
+
+static void kgdb_correct_hw_break(void)
+{
+ int breakno;
+
+ for (breakno = 0; breakno < HBP_NUM; breakno++) {
+ struct perf_event *bp;
+ struct arch_hw_breakpoint *info;
+ int val;
+ int cpu = raw_smp_processor_id();
+ if (!breakinfo[breakno].enabled)
+ continue;
+ if (dbg_is_early) {
+ set_debugreg(breakinfo[breakno].addr, breakno);
+ early_dr7 |= encode_dr7(breakno,
+ breakinfo[breakno].len,
+ breakinfo[breakno].type);
+ set_debugreg(early_dr7, 7);
+ continue;
+ }
+ bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu);
+ info = counter_arch_bp(bp);
+ if (bp->attr.disabled != 1)
+ continue;
+ bp->attr.bp_addr = breakinfo[breakno].addr;
+ bp->attr.bp_len = breakinfo[breakno].len;
+ bp->attr.bp_type = breakinfo[breakno].type;
+ info->address = breakinfo[breakno].addr;
+ info->len = breakinfo[breakno].len;
+ info->type = breakinfo[breakno].type;
+ val = arch_install_hw_breakpoint(bp);
+ if (!val)
+ bp->attr.disabled = 0;
+ }
+ if (!dbg_is_early)
+ hw_breakpoint_restore();
+}
+
+static int hw_break_reserve_slot(int breakno)
+{
+ int cpu;
+ int cnt = 0;
+ struct perf_event **pevent;
+
+ if (dbg_is_early)
+ return 0;
+
+ for_each_online_cpu(cpu) {
+ cnt++;
+ pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
+ if (dbg_reserve_bp_slot(*pevent))
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ for_each_online_cpu(cpu) {
+ cnt--;
+ if (!cnt)
+ break;
+ pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
+ dbg_release_bp_slot(*pevent);
+ }
+ return -1;
+}
+
+static int hw_break_release_slot(int breakno)
+{
+ struct perf_event **pevent;
+ int cpu;
+
+ if (dbg_is_early)
+ return 0;
+
+ for_each_online_cpu(cpu) {
+ pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
+ if (dbg_release_bp_slot(*pevent))
+ /*
+ * The debugger is responsible for handing the retry on
+ * remove failure.
+ */
+ return -1;
+ }
+ return 0;
+}
+
+static int
+kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
+{
+ int i;
+
+ for (i = 0; i < HBP_NUM; i++)
+ if (breakinfo[i].addr == addr && breakinfo[i].enabled)
+ break;
+ if (i == HBP_NUM)
+ return -1;
+
+ if (hw_break_release_slot(i)) {
+ printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr);
+ return -1;
+ }
+ breakinfo[i].enabled = 0;
+
+ return 0;
+}
+
+static void kgdb_remove_all_hw_break(void)
+{
+ int i;
+ int cpu = raw_smp_processor_id();
+ struct perf_event *bp;
+
+ for (i = 0; i < HBP_NUM; i++) {
+ if (!breakinfo[i].enabled)
+ continue;
+ bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
+ if (!bp->attr.disabled) {
+ arch_uninstall_hw_breakpoint(bp);
+ bp->attr.disabled = 1;
+ continue;
+ }
+ if (dbg_is_early)
+ early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
+ breakinfo[i].type);
+ else if (hw_break_release_slot(i))
+ printk(KERN_ERR "KGDB: hw bpt remove failed %lx\n",
+ breakinfo[i].addr);
+ breakinfo[i].enabled = 0;
+ }
+}
+
+static int
+kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
+{
+ int i;
+
+ for (i = 0; i < HBP_NUM; i++)
+ if (!breakinfo[i].enabled)
+ break;
+ if (i == HBP_NUM)
+ return -1;
+
+ switch (bptype) {
+ case BP_HARDWARE_BREAKPOINT:
+ len = 1;
+ breakinfo[i].type = X86_BREAKPOINT_EXECUTE;
+ break;
+ case BP_WRITE_WATCHPOINT:
+ breakinfo[i].type = X86_BREAKPOINT_WRITE;
+ break;
+ case BP_ACCESS_WATCHPOINT:
+ breakinfo[i].type = X86_BREAKPOINT_RW;
+ break;
+ default:
+ return -1;
+ }
+ switch (len) {
+ case 1:
+ breakinfo[i].len = X86_BREAKPOINT_LEN_1;
+ break;
+ case 2:
+ breakinfo[i].len = X86_BREAKPOINT_LEN_2;
+ break;
+ case 4:
+ breakinfo[i].len = X86_BREAKPOINT_LEN_4;
+ break;
+#ifdef CONFIG_X86_64
+ case 8:
+ breakinfo[i].len = X86_BREAKPOINT_LEN_8;
+ break;
+#endif
+ default:
+ return -1;
+ }
+ breakinfo[i].addr = addr;
+ if (hw_break_reserve_slot(i)) {
+ breakinfo[i].addr = 0;
+ return -1;
+ }
+ breakinfo[i].enabled = 1;
+
+ return 0;
+}
+
+/**
+ * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
+ * @regs: Current &struct pt_regs.
+ *
+ * This function will be called if the particular architecture must
+ * disable hardware debugging while it is processing gdb packets or
+ * handling exception.
+ */
+static void kgdb_disable_hw_debug(struct pt_regs *regs)
+{
+ int i;
+ int cpu = raw_smp_processor_id();
+ struct perf_event *bp;
+
+ /* Disable hardware debugging while we are in kgdb: */
+ set_debugreg(0UL, 7);
+ for (i = 0; i < HBP_NUM; i++) {
+ if (!breakinfo[i].enabled)
+ continue;
+ if (dbg_is_early) {
+ early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
+ breakinfo[i].type);
+ continue;
+ }
+ bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
+ if (bp->attr.disabled == 1)
+ continue;
+ arch_uninstall_hw_breakpoint(bp);
+ bp->attr.disabled = 1;
+ }
+}
+
+#ifdef CONFIG_SMP
+/**
+ * kgdb_roundup_cpus - Get other CPUs into a holding pattern
+ * @flags: Current IRQ state
+ *
+ * On SMP systems, we need to get the attention of the other CPUs
+ * and get them be in a known state. This should do what is needed
+ * to get the other CPUs to call kgdb_wait(). Note that on some arches,
+ * the NMI approach is not used for rounding up all the CPUs. For example,
+ * in case of MIPS, smp_call_function() is used to roundup CPUs. In
+ * this case, we have to make sure that interrupts are enabled before
+ * calling smp_call_function(). The argument to this function is
+ * the flags that will be used when restoring the interrupts. There is
+ * local_irq_save() call before kgdb_roundup_cpus().
+ *
+ * On non-SMP systems, this is not called.
+ */
+void kgdb_roundup_cpus(unsigned long flags)
+{
+ apic->send_IPI_allbutself(NMI_VECTOR);
+}
+#endif
+
+/**
+ * kgdb_arch_handle_exception - Handle architecture specific GDB packets.
+ * @e_vector: The error vector of the exception that happened.
+ * @signo: The signal number of the exception that happened.
+ * @err_code: The error code of the exception that happened.
+ * @remcomInBuffer: The buffer of the packet we have read.
+ * @remcomOutBuffer: The buffer of %BUFMAX bytes to write a packet into.
+ * @linux_regs: The &struct pt_regs of the current process.
+ *
+ * This function MUST handle the 'c' and 's' command packets,
+ * as well packets to set / remove a hardware breakpoint, if used.
+ * If there are additional packets which the hardware needs to handle,
+ * they are handled here. The code should return -1 if it wants to
+ * process more packets, and a %0 or %1 if it wants to exit from the
+ * kgdb callback.
+ */
+int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
+ char *remcomInBuffer, char *remcomOutBuffer,
+ struct pt_regs *linux_regs)
+{
+ unsigned long addr;
+ char *ptr;
+
+ switch (remcomInBuffer[0]) {
+ case 'c':
+ case 's':
+ /* try to read optional parameter, pc unchanged if no parm */
+ ptr = &remcomInBuffer[1];
+ if (kgdb_hex2long(&ptr, &addr))
+ linux_regs->ip = addr;
+ case 'D':
+ case 'k':
+ /* clear the trace bit */
+ linux_regs->flags &= ~X86_EFLAGS_TF;
+ atomic_set(&kgdb_cpu_doing_single_step, -1);
+
+ /* set the trace bit if we're stepping */
+ if (remcomInBuffer[0] == 's') {
+ linux_regs->flags |= X86_EFLAGS_TF;
+ atomic_set(&kgdb_cpu_doing_single_step,
+ raw_smp_processor_id());
+ }
+
+ return 0;
+ }
+
+ /* this means that we do not want to exit from the handler: */
+ return -1;
+}
+
+static inline int
+single_step_cont(struct pt_regs *regs, struct die_args *args)
+{
+ /*
+ * Single step exception from kernel space to user space so
+ * eat the exception and continue the process:
+ */
+ printk(KERN_ERR "KGDB: trap/step from kernel to user space, "
+ "resuming...\n");
+ kgdb_arch_handle_exception(args->trapnr, args->signr,
+ args->err, "c", "", regs);
+ /*
+ * Reset the BS bit in dr6 (pointed by args->err) to
+ * denote completion of processing
+ */
+ (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
+
+ return NOTIFY_STOP;
+}
+
+static DECLARE_BITMAP(was_in_debug_nmi, NR_CPUS);
+
+static int kgdb_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+{
+ int cpu;
+
+ switch (cmd) {
+ case NMI_LOCAL:
+ if (atomic_read(&kgdb_active) != -1) {
+ /* KGDB CPU roundup */
+ cpu = raw_smp_processor_id();
+ kgdb_nmicallback(cpu, regs);
+ set_bit(cpu, was_in_debug_nmi);
+ touch_nmi_watchdog();
+
+ return NMI_HANDLED;
+ }
+ break;
+
+ case NMI_UNKNOWN:
+ cpu = raw_smp_processor_id();
+
+ if (__test_and_clear_bit(cpu, was_in_debug_nmi))
+ return NMI_HANDLED;
+
+ break;
+ default:
+ /* do nothing */
+ break;
+ }
+ return NMI_DONE;
+}
+
+static int __kgdb_notify(struct die_args *args, unsigned long cmd)
+{
+ struct pt_regs *regs = args->regs;
+
+ switch (cmd) {
+ case DIE_DEBUG:
+ if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
+ if (user_mode(regs))
+ return single_step_cont(regs, args);
+ break;
+ } else if (test_thread_flag(TIF_SINGLESTEP))
+ /* This means a user thread is single stepping
+ * a system call which should be ignored
+ */
+ return NOTIFY_DONE;
+ /* fall through */
+ default:
+ if (user_mode(regs))
+ return NOTIFY_DONE;
+ }
+
+ if (kgdb_handle_exception(args->trapnr, args->signr, cmd, regs))
+ return NOTIFY_DONE;
+
+ /* Must touch watchdog before return to normal operation */
+ touch_nmi_watchdog();
+ return NOTIFY_STOP;
+}
+
+int kgdb_ll_trap(int cmd, const char *str,
+ struct pt_regs *regs, long err, int trap, int sig)
+{
+ struct die_args args = {
+ .regs = regs,
+ .str = str,
+ .err = err,
+ .trapnr = trap,
+ .signr = sig,
+
+ };
+
+ if (!kgdb_io_module_registered)
+ return NOTIFY_DONE;
+
+ return __kgdb_notify(&args, cmd);
+}
+
+static int
+kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr)
+{
+ unsigned long flags;
+ int ret;
+
+ local_irq_save(flags);
+ ret = __kgdb_notify(ptr, cmd);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static struct notifier_block kgdb_notifier = {
+ .notifier_call = kgdb_notify,
+};
+
+/**
+ * kgdb_arch_init - Perform any architecture specific initialization.
+ *
+ * This function will handle the initialization of any architecture
+ * specific callbacks.
+ */
+int kgdb_arch_init(void)
+{
+ int retval;
+
+ retval = register_die_notifier(&kgdb_notifier);
+ if (retval)
+ goto out;
+
+ retval = register_nmi_handler(NMI_LOCAL, kgdb_nmi_handler,
+ 0, "kgdb");
+ if (retval)
+ goto out1;
+
+ retval = register_nmi_handler(NMI_UNKNOWN, kgdb_nmi_handler,
+ 0, "kgdb");
+
+ if (retval)
+ goto out2;
+
+ return retval;
+
+out2:
+ unregister_nmi_handler(NMI_LOCAL, "kgdb");
+out1:
+ unregister_die_notifier(&kgdb_notifier);
+out:
+ return retval;
+}
+
+static void kgdb_hw_overflow_handler(struct perf_event *event,
+ struct perf_sample_data *data, struct pt_regs *regs)
+{
+ struct task_struct *tsk = current;
+ int i;
+
+ for (i = 0; i < 4; i++)
+ if (breakinfo[i].enabled)
+ tsk->thread.debugreg6 |= (DR_TRAP0 << i);
+}
+
+void kgdb_arch_late(void)
+{
+ int i, cpu;
+ struct perf_event_attr attr;
+ struct perf_event **pevent;
+
+ /*
+ * Pre-allocate the hw breakpoint structions in the non-atomic
+ * portion of kgdb because this operation requires mutexs to
+ * complete.
+ */
+ hw_breakpoint_init(&attr);
+ attr.bp_addr = (unsigned long)kgdb_arch_init;
+ attr.bp_len = HW_BREAKPOINT_LEN_1;
+ attr.bp_type = HW_BREAKPOINT_W;
+ attr.disabled = 1;
+ for (i = 0; i < HBP_NUM; i++) {
+ if (breakinfo[i].pev)
+ continue;
+ breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL);
+ if (IS_ERR((void * __force)breakinfo[i].pev)) {
+ printk(KERN_ERR "kgdb: Could not allocate hw"
+ "breakpoints\nDisabling the kernel debugger\n");
+ breakinfo[i].pev = NULL;
+ kgdb_arch_exit();
+ return;
+ }
+ for_each_online_cpu(cpu) {
+ pevent = per_cpu_ptr(breakinfo[i].pev, cpu);
+ pevent[0]->hw.sample_period = 1;
+ pevent[0]->overflow_handler = kgdb_hw_overflow_handler;
+ if (pevent[0]->destroy != NULL) {
+ pevent[0]->destroy = NULL;
+ release_bp_slot(*pevent);
+ }
+ }
+ }
+}
+
+/**
+ * kgdb_arch_exit - Perform any architecture specific uninitalization.
+ *
+ * This function will handle the uninitalization of any architecture
+ * specific callbacks, for dynamic registration and unregistration.
+ */
+void kgdb_arch_exit(void)
+{
+ int i;
+ for (i = 0; i < 4; i++) {
+ if (breakinfo[i].pev) {
+ unregister_wide_hw_breakpoint(breakinfo[i].pev);
+ breakinfo[i].pev = NULL;
+ }
+ }
+ unregister_nmi_handler(NMI_UNKNOWN, "kgdb");
+ unregister_nmi_handler(NMI_LOCAL, "kgdb");
+ unregister_die_notifier(&kgdb_notifier);
+}
+
+/**
+ *
+ * kgdb_skipexception - Bail out of KGDB when we've been triggered.
+ * @exception: Exception vector number
+ * @regs: Current &struct pt_regs.
+ *
+ * On some architectures we need to skip a breakpoint exception when
+ * it occurs after a breakpoint has been removed.
+ *
+ * Skip an int3 exception when it occurs after a breakpoint has been
+ * removed. Backtrack eip by 1 since the int3 would have caused it to
+ * increment by 1.
+ */
+int kgdb_skipexception(int exception, struct pt_regs *regs)
+{
+ if (exception == 3 && kgdb_isremovedbreak(regs->ip - 1)) {
+ regs->ip -= 1;
+ return 1;
+ }
+ return 0;
+}
+
+unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs)
+{
+ if (exception == 3)
+ return instruction_pointer(regs) - 1;
+ return instruction_pointer(regs);
+}
+
+void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
+{
+ regs->ip = ip;
+}
+
+int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
+{
+ int err;
+ char opc[BREAK_INSTR_SIZE];
+
+ bpt->type = BP_BREAKPOINT;
+ err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+ BREAK_INSTR_SIZE);
+ if (err)
+ return err;
+ err = probe_kernel_write((char *)bpt->bpt_addr,
+ arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
+ if (!err)
+ return err;
+ /*
+ * It is safe to call text_poke() because normal kernel execution
+ * is stopped on all cores, so long as the text_mutex is not locked.
+ */
+ if (mutex_is_locked(&text_mutex))
+ return -EBUSY;
+ text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
+ BREAK_INSTR_SIZE);
+ err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
+ if (err)
+ return err;
+ if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE))
+ return -EINVAL;
+ bpt->type = BP_POKE_BREAKPOINT;
+
+ return err;
+}
+
+int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
+{
+ int err;
+ char opc[BREAK_INSTR_SIZE];
+
+ if (bpt->type != BP_POKE_BREAKPOINT)
+ goto knl_write;
+ /*
+ * It is safe to call text_poke() because normal kernel execution
+ * is stopped on all cores, so long as the text_mutex is not locked.
+ */
+ if (mutex_is_locked(&text_mutex))
+ goto knl_write;
+ text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE);
+ err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
+ if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
+ goto knl_write;
+ return err;
+
+knl_write:
+ return probe_kernel_write((char *)bpt->bpt_addr,
+ (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
+}
+
+struct kgdb_arch arch_kgdb_ops = {
+ /* Breakpoint instruction: */
+ .gdb_bpt_instr = { 0xcc },
+ .flags = KGDB_HW_BREAKPOINT,
+ .set_hw_breakpoint = kgdb_set_hw_break,
+ .remove_hw_breakpoint = kgdb_remove_hw_break,
+ .disable_hw_break = kgdb_disable_hw_debug,
+ .remove_all_hw_break = kgdb_remove_all_hw_break,
+ .correct_hw_break = kgdb_correct_hw_break,
+};
diff --git a/arch/x86/kernel/kprobes/Makefile b/arch/x86/kernel/kprobes/Makefile
new file mode 100644
index 000000000..0d33169cc
--- /dev/null
+++ b/arch/x86/kernel/kprobes/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for kernel probes
+#
+
+obj-$(CONFIG_KPROBES) += core.o
+obj-$(CONFIG_OPTPROBES) += opt.o
+obj-$(CONFIG_KPROBES_ON_FTRACE) += ftrace.o
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
new file mode 100644
index 000000000..2b949f4fd
--- /dev/null
+++ b/arch/x86/kernel/kprobes/common.h
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_KERNEL_KPROBES_COMMON_H
+#define __X86_KERNEL_KPROBES_COMMON_H
+
+/* Kprobes and Optprobes common header */
+
+#include <asm/asm.h>
+
+#ifdef CONFIG_FRAME_POINTER
+# define SAVE_RBP_STRING " push %" _ASM_BP "\n" \
+ " mov %" _ASM_SP ", %" _ASM_BP "\n"
+#else
+# define SAVE_RBP_STRING " push %" _ASM_BP "\n"
+#endif
+
+#ifdef CONFIG_X86_64
+#define SAVE_REGS_STRING \
+ /* Skip cs, ip, orig_ax. */ \
+ " subq $24, %rsp\n" \
+ " pushq %rdi\n" \
+ " pushq %rsi\n" \
+ " pushq %rdx\n" \
+ " pushq %rcx\n" \
+ " pushq %rax\n" \
+ " pushq %r8\n" \
+ " pushq %r9\n" \
+ " pushq %r10\n" \
+ " pushq %r11\n" \
+ " pushq %rbx\n" \
+ SAVE_RBP_STRING \
+ " pushq %r12\n" \
+ " pushq %r13\n" \
+ " pushq %r14\n" \
+ " pushq %r15\n"
+#define RESTORE_REGS_STRING \
+ " popq %r15\n" \
+ " popq %r14\n" \
+ " popq %r13\n" \
+ " popq %r12\n" \
+ " popq %rbp\n" \
+ " popq %rbx\n" \
+ " popq %r11\n" \
+ " popq %r10\n" \
+ " popq %r9\n" \
+ " popq %r8\n" \
+ " popq %rax\n" \
+ " popq %rcx\n" \
+ " popq %rdx\n" \
+ " popq %rsi\n" \
+ " popq %rdi\n" \
+ /* Skip orig_ax, ip, cs */ \
+ " addq $24, %rsp\n"
+#else
+#define SAVE_REGS_STRING \
+ /* Skip cs, ip, orig_ax and gs. */ \
+ " subl $16, %esp\n" \
+ " pushl %fs\n" \
+ " pushl %es\n" \
+ " pushl %ds\n" \
+ " pushl %eax\n" \
+ SAVE_RBP_STRING \
+ " pushl %edi\n" \
+ " pushl %esi\n" \
+ " pushl %edx\n" \
+ " pushl %ecx\n" \
+ " pushl %ebx\n"
+#define RESTORE_REGS_STRING \
+ " popl %ebx\n" \
+ " popl %ecx\n" \
+ " popl %edx\n" \
+ " popl %esi\n" \
+ " popl %edi\n" \
+ " popl %ebp\n" \
+ " popl %eax\n" \
+ /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
+ " addl $24, %esp\n"
+#endif
+
+/* Ensure if the instruction can be boostable */
+extern int can_boost(struct insn *insn, void *orig_addr);
+/* Recover instruction if given address is probed */
+extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
+ unsigned long addr);
+/*
+ * Copy an instruction and adjust the displacement if the instruction
+ * uses the %rip-relative addressing mode.
+ */
+extern int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn);
+
+/* Generate a relative-jump/call instruction */
+extern void synthesize_reljump(void *dest, void *from, void *to);
+extern void synthesize_relcall(void *dest, void *from, void *to);
+
+#ifdef CONFIG_OPTPROBES
+extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter);
+extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr);
+#else /* !CONFIG_OPTPROBES */
+static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
+{
+ return 0;
+}
+static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
+{
+ return addr;
+}
+#endif
+
+#endif
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
new file mode 100644
index 000000000..3334e1400
--- /dev/null
+++ b/arch/x86/kernel/kprobes/core.c
@@ -0,0 +1,1159 @@
+/*
+ * Kernel Probes (KProbes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
+ * Probes initial implementation ( includes contributions from
+ * Rusty Russell).
+ * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
+ * interface to access function arguments.
+ * 2004-Oct Jim Keniston <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
+ * <prasanna@in.ibm.com> adapted for x86_64 from i386.
+ * 2005-Mar Roland McGrath <roland@redhat.com>
+ * Fixed to handle %rip-relative addressing mode correctly.
+ * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
+ * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
+ * <prasanna@in.ibm.com> added function-return probes.
+ * 2005-May Rusty Lynch <rusty.lynch@intel.com>
+ * Added function return probes functionality
+ * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
+ * kprobe-booster and kretprobe-booster for i386.
+ * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
+ * and kretprobe-booster for x86-64
+ * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
+ * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
+ * unified x86 kprobes code.
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/sched/debug.h>
+#include <linux/extable.h>
+#include <linux/kdebug.h>
+#include <linux/kallsyms.h>
+#include <linux/ftrace.h>
+#include <linux/frame.h>
+#include <linux/kasan.h>
+#include <linux/moduleloader.h>
+
+#include <asm/text-patching.h>
+#include <asm/cacheflush.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <linux/uaccess.h>
+#include <asm/alternative.h>
+#include <asm/insn.h>
+#include <asm/debugreg.h>
+#include <asm/set_memory.h>
+
+#include "common.h"
+
+DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
+DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
+
+#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
+
+#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
+ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
+ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
+ (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
+ (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
+ << (row % 32))
+ /*
+ * Undefined/reserved opcodes, conditional jump, Opcode Extension
+ * Groups, and some special opcodes can not boost.
+ * This is non-const and volatile to keep gcc from statically
+ * optimizing it out, as variable_test_bit makes gcc think only
+ * *(unsigned long*) is used.
+ */
+static volatile u32 twobyte_is_boostable[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
+ W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */
+ W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
+ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */
+ W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
+ W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
+ W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */
+ W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */
+ W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0) /* f0 */
+ /* ----------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+#undef W
+
+struct kretprobe_blackpoint kretprobe_blacklist[] = {
+ {"__switch_to", }, /* This function switches only current task, but
+ doesn't switch kernel stack.*/
+ {NULL, NULL} /* Terminator */
+};
+
+const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
+
+static nokprobe_inline void
+__synthesize_relative_insn(void *dest, void *from, void *to, u8 op)
+{
+ struct __arch_relative_insn {
+ u8 op;
+ s32 raddr;
+ } __packed *insn;
+
+ insn = (struct __arch_relative_insn *)dest;
+ insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
+ insn->op = op;
+}
+
+/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
+void synthesize_reljump(void *dest, void *from, void *to)
+{
+ __synthesize_relative_insn(dest, from, to, RELATIVEJUMP_OPCODE);
+}
+NOKPROBE_SYMBOL(synthesize_reljump);
+
+/* Insert a call instruction at address 'from', which calls address 'to'.*/
+void synthesize_relcall(void *dest, void *from, void *to)
+{
+ __synthesize_relative_insn(dest, from, to, RELATIVECALL_OPCODE);
+}
+NOKPROBE_SYMBOL(synthesize_relcall);
+
+/*
+ * Skip the prefixes of the instruction.
+ */
+static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn)
+{
+ insn_attr_t attr;
+
+ attr = inat_get_opcode_attribute((insn_byte_t)*insn);
+ while (inat_is_legacy_prefix(attr)) {
+ insn++;
+ attr = inat_get_opcode_attribute((insn_byte_t)*insn);
+ }
+#ifdef CONFIG_X86_64
+ if (inat_is_rex_prefix(attr))
+ insn++;
+#endif
+ return insn;
+}
+NOKPROBE_SYMBOL(skip_prefixes);
+
+/*
+ * Returns non-zero if INSN is boostable.
+ * RIP relative instructions are adjusted at copying time in 64 bits mode
+ */
+int can_boost(struct insn *insn, void *addr)
+{
+ kprobe_opcode_t opcode;
+ insn_byte_t prefix;
+ int i;
+
+ if (search_exception_tables((unsigned long)addr))
+ return 0; /* Page fault may occur on this address. */
+
+ /* 2nd-byte opcode */
+ if (insn->opcode.nbytes == 2)
+ return test_bit(insn->opcode.bytes[1],
+ (unsigned long *)twobyte_is_boostable);
+
+ if (insn->opcode.nbytes != 1)
+ return 0;
+
+ for_each_insn_prefix(insn, i, prefix) {
+ insn_attr_t attr;
+
+ attr = inat_get_opcode_attribute(prefix);
+ /* Can't boost Address-size override prefix and CS override prefix */
+ if (prefix == 0x2e || inat_is_address_size_prefix(attr))
+ return 0;
+ }
+
+ opcode = insn->opcode.bytes[0];
+
+ switch (opcode & 0xf0) {
+ case 0x60:
+ /* can't boost "bound" */
+ return (opcode != 0x62);
+ case 0x70:
+ return 0; /* can't boost conditional jump */
+ case 0x90:
+ return opcode != 0x9a; /* can't boost call far */
+ case 0xc0:
+ /* can't boost software-interruptions */
+ return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
+ case 0xd0:
+ /* can boost AA* and XLAT */
+ return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
+ case 0xe0:
+ /* can boost in/out and absolute jmps */
+ return ((opcode & 0x04) || opcode == 0xea);
+ case 0xf0:
+ /* clear and set flags are boostable */
+ return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
+ default:
+ /* call is not boostable */
+ return opcode != 0x9a;
+ }
+}
+
+static unsigned long
+__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
+{
+ struct kprobe *kp;
+ unsigned long faddr;
+
+ kp = get_kprobe((void *)addr);
+ faddr = ftrace_location(addr);
+ /*
+ * Addresses inside the ftrace location are refused by
+ * arch_check_ftrace_location(). Something went terribly wrong
+ * if such an address is checked here.
+ */
+ if (WARN_ON(faddr && faddr != addr))
+ return 0UL;
+ /*
+ * Use the current code if it is not modified by Kprobe
+ * and it cannot be modified by ftrace.
+ */
+ if (!kp && !faddr)
+ return addr;
+
+ /*
+ * Basically, kp->ainsn.insn has an original instruction.
+ * However, RIP-relative instruction can not do single-stepping
+ * at different place, __copy_instruction() tweaks the displacement of
+ * that instruction. In that case, we can't recover the instruction
+ * from the kp->ainsn.insn.
+ *
+ * On the other hand, in case on normal Kprobe, kp->opcode has a copy
+ * of the first byte of the probed instruction, which is overwritten
+ * by int3. And the instruction at kp->addr is not modified by kprobes
+ * except for the first byte, we can recover the original instruction
+ * from it and kp->opcode.
+ *
+ * In case of Kprobes using ftrace, we do not have a copy of
+ * the original instruction. In fact, the ftrace location might
+ * be modified at anytime and even could be in an inconsistent state.
+ * Fortunately, we know that the original code is the ideal 5-byte
+ * long NOP.
+ */
+ if (probe_kernel_read(buf, (void *)addr,
+ MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
+ return 0UL;
+
+ if (faddr)
+ memcpy(buf, ideal_nops[NOP_ATOMIC5], 5);
+ else
+ buf[0] = kp->opcode;
+ return (unsigned long)buf;
+}
+
+/*
+ * Recover the probed instruction at addr for further analysis.
+ * Caller must lock kprobes by kprobe_mutex, or disable preemption
+ * for preventing to release referencing kprobes.
+ * Returns zero if the instruction can not get recovered (or access failed).
+ */
+unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
+{
+ unsigned long __addr;
+
+ __addr = __recover_optprobed_insn(buf, addr);
+ if (__addr != addr)
+ return __addr;
+
+ return __recover_probed_insn(buf, addr);
+}
+
+/* Check if paddr is at an instruction boundary */
+static int can_probe(unsigned long paddr)
+{
+ unsigned long addr, __addr, offset = 0;
+ struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+ if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
+ return 0;
+
+ /* Decode instructions */
+ addr = paddr - offset;
+ while (addr < paddr) {
+ /*
+ * Check if the instruction has been modified by another
+ * kprobe, in which case we replace the breakpoint by the
+ * original instruction in our buffer.
+ * Also, jump optimization will change the breakpoint to
+ * relative-jump. Since the relative-jump itself is
+ * normally used, we just go through if there is no kprobe.
+ */
+ __addr = recover_probed_instruction(buf, addr);
+ if (!__addr)
+ return 0;
+ kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE);
+ insn_get_length(&insn);
+
+ /*
+ * Another debugging subsystem might insert this breakpoint.
+ * In that case, we can't recover it.
+ */
+ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ return 0;
+ addr += insn.length;
+ }
+
+ return (addr == paddr);
+}
+
+/*
+ * Returns non-zero if opcode modifies the interrupt flag.
+ */
+static int is_IF_modifier(kprobe_opcode_t *insn)
+{
+ /* Skip prefixes */
+ insn = skip_prefixes(insn);
+
+ switch (*insn) {
+ case 0xfa: /* cli */
+ case 0xfb: /* sti */
+ case 0xcf: /* iret/iretd */
+ case 0x9d: /* popf/popfd */
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Copy an instruction with recovering modified instruction by kprobes
+ * and adjust the displacement if the instruction uses the %rip-relative
+ * addressing mode. Note that since @real will be the final place of copied
+ * instruction, displacement must be adjust by @real, not @dest.
+ * This returns the length of copied instruction, or 0 if it has an error.
+ */
+int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
+{
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+ unsigned long recovered_insn =
+ recover_probed_instruction(buf, (unsigned long)src);
+
+ if (!recovered_insn || !insn)
+ return 0;
+
+ /* This can access kernel text if given address is not recovered */
+ if (probe_kernel_read(dest, (void *)recovered_insn, MAX_INSN_SIZE))
+ return 0;
+
+ kernel_insn_init(insn, dest, MAX_INSN_SIZE);
+ insn_get_length(insn);
+
+ /* Another subsystem puts a breakpoint, failed to recover */
+ if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ return 0;
+
+ /* We should not singlestep on the exception masking instructions */
+ if (insn_masking_exception(insn))
+ return 0;
+
+#ifdef CONFIG_X86_64
+ /* Only x86_64 has RIP relative instructions */
+ if (insn_rip_relative(insn)) {
+ s64 newdisp;
+ u8 *disp;
+ /*
+ * The copied instruction uses the %rip-relative addressing
+ * mode. Adjust the displacement for the difference between
+ * the original location of this instruction and the location
+ * of the copy that will actually be run. The tricky bit here
+ * is making sure that the sign extension happens correctly in
+ * this calculation, since we need a signed 32-bit result to
+ * be sign-extended to 64 bits when it's added to the %rip
+ * value and yield the same 64-bit result that the sign-
+ * extension of the original signed 32-bit displacement would
+ * have given.
+ */
+ newdisp = (u8 *) src + (s64) insn->displacement.value
+ - (u8 *) real;
+ if ((s64) (s32) newdisp != newdisp) {
+ pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
+ return 0;
+ }
+ disp = (u8 *) dest + insn_offset_displacement(insn);
+ *(s32 *) disp = (s32) newdisp;
+ }
+#endif
+ return insn->length;
+}
+
+/* Prepare reljump right after instruction to boost */
+static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p,
+ struct insn *insn)
+{
+ int len = insn->length;
+
+ if (can_boost(insn, p->addr) &&
+ MAX_INSN_SIZE - len >= RELATIVEJUMP_SIZE) {
+ /*
+ * These instructions can be executed directly if it
+ * jumps back to correct address.
+ */
+ synthesize_reljump(buf + len, p->ainsn.insn + len,
+ p->addr + insn->length);
+ len += RELATIVEJUMP_SIZE;
+ p->ainsn.boostable = true;
+ } else {
+ p->ainsn.boostable = false;
+ }
+
+ return len;
+}
+
+/* Make page to RO mode when allocate it */
+void *alloc_insn_page(void)
+{
+ void *page;
+
+ page = module_alloc(PAGE_SIZE);
+ if (!page)
+ return NULL;
+
+ /*
+ * First make the page read-only, and only then make it executable to
+ * prevent it from being W+X in between.
+ */
+ set_memory_ro((unsigned long)page, 1);
+
+ /*
+ * TODO: Once additional kernel code protection mechanisms are set, ensure
+ * that the page was not maliciously altered and it is still zeroed.
+ */
+ set_memory_x((unsigned long)page, 1);
+
+ return page;
+}
+
+/* Recover page to RW mode before releasing it */
+void free_insn_page(void *page)
+{
+ /*
+ * First make the page non-executable, and only then make it writable to
+ * prevent it from being W+X in between.
+ */
+ set_memory_nx((unsigned long)page, 1);
+ set_memory_rw((unsigned long)page, 1);
+ module_memfree(page);
+}
+
+static int arch_copy_kprobe(struct kprobe *p)
+{
+ struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+ int len;
+
+ /* Copy an instruction with recovering if other optprobe modifies it.*/
+ len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn);
+ if (!len)
+ return -EINVAL;
+
+ /*
+ * __copy_instruction can modify the displacement of the instruction,
+ * but it doesn't affect boostable check.
+ */
+ len = prepare_boost(buf, p, &insn);
+
+ /* Check whether the instruction modifies Interrupt Flag or not */
+ p->ainsn.if_modifier = is_IF_modifier(buf);
+
+ /* Also, displacement change doesn't affect the first byte */
+ p->opcode = buf[0];
+
+ /* OK, write back the instruction(s) into ROX insn buffer */
+ text_poke(p->ainsn.insn, buf, len);
+
+ return 0;
+}
+
+int arch_prepare_kprobe(struct kprobe *p)
+{
+ int ret;
+
+ if (alternatives_text_reserved(p->addr, p->addr))
+ return -EINVAL;
+
+ if (!can_probe((unsigned long)p->addr))
+ return -EILSEQ;
+ /* insn: must be on special executable page on x86. */
+ p->ainsn.insn = get_insn_slot();
+ if (!p->ainsn.insn)
+ return -ENOMEM;
+
+ ret = arch_copy_kprobe(p);
+ if (ret) {
+ free_insn_slot(p->ainsn.insn, 0);
+ p->ainsn.insn = NULL;
+ }
+
+ return ret;
+}
+
+void arch_arm_kprobe(struct kprobe *p)
+{
+ text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
+}
+
+void arch_disarm_kprobe(struct kprobe *p)
+{
+ text_poke(p->addr, &p->opcode, 1);
+}
+
+void arch_remove_kprobe(struct kprobe *p)
+{
+ if (p->ainsn.insn) {
+ free_insn_slot(p->ainsn.insn, p->ainsn.boostable);
+ p->ainsn.insn = NULL;
+ }
+}
+
+static nokprobe_inline void
+save_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+ kcb->prev_kprobe.kp = kprobe_running();
+ kcb->prev_kprobe.status = kcb->kprobe_status;
+ kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags;
+ kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
+}
+
+static nokprobe_inline void
+restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+ __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
+ kcb->kprobe_status = kcb->prev_kprobe.status;
+ kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
+ kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
+}
+
+static nokprobe_inline void
+set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ __this_cpu_write(current_kprobe, p);
+ kcb->kprobe_saved_flags = kcb->kprobe_old_flags
+ = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
+ if (p->ainsn.if_modifier)
+ kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;
+}
+
+static nokprobe_inline void clear_btf(void)
+{
+ if (test_thread_flag(TIF_BLOCKSTEP)) {
+ unsigned long debugctl = get_debugctlmsr();
+
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ update_debugctlmsr(debugctl);
+ }
+}
+
+static nokprobe_inline void restore_btf(void)
+{
+ if (test_thread_flag(TIF_BLOCKSTEP)) {
+ unsigned long debugctl = get_debugctlmsr();
+
+ debugctl |= DEBUGCTLMSR_BTF;
+ update_debugctlmsr(debugctl);
+ }
+}
+
+void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+ unsigned long *sara = stack_addr(regs);
+
+ ri->ret_addr = (kprobe_opcode_t *) *sara;
+ ri->fp = sara;
+
+ /* Replace the return addr with trampoline addr */
+ *sara = (unsigned long) &kretprobe_trampoline;
+}
+NOKPROBE_SYMBOL(arch_prepare_kretprobe);
+
+static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb, int reenter)
+{
+ if (setup_detour_execution(p, regs, reenter))
+ return;
+
+#if !defined(CONFIG_PREEMPT)
+ if (p->ainsn.boostable && !p->post_handler) {
+ /* Boost up -- we can execute copied instructions directly */
+ if (!reenter)
+ reset_current_kprobe();
+ /*
+ * Reentering boosted probe doesn't reset current_kprobe,
+ * nor set current_kprobe, because it doesn't use single
+ * stepping.
+ */
+ regs->ip = (unsigned long)p->ainsn.insn;
+ return;
+ }
+#endif
+ if (reenter) {
+ save_previous_kprobe(kcb);
+ set_current_kprobe(p, regs, kcb);
+ kcb->kprobe_status = KPROBE_REENTER;
+ } else
+ kcb->kprobe_status = KPROBE_HIT_SS;
+ /* Prepare real single stepping */
+ clear_btf();
+ regs->flags |= X86_EFLAGS_TF;
+ regs->flags &= ~X86_EFLAGS_IF;
+ /* single step inline if the instruction is an int3 */
+ if (p->opcode == BREAKPOINT_INSTRUCTION)
+ regs->ip = (unsigned long)p->addr;
+ else
+ regs->ip = (unsigned long)p->ainsn.insn;
+}
+NOKPROBE_SYMBOL(setup_singlestep);
+
+/*
+ * We have reentered the kprobe_handler(), since another probe was hit while
+ * within the handler. We save the original kprobes variables and just single
+ * step on the instruction of the new probe without calling any user handlers.
+ */
+static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ switch (kcb->kprobe_status) {
+ case KPROBE_HIT_SSDONE:
+ case KPROBE_HIT_ACTIVE:
+ case KPROBE_HIT_SS:
+ kprobes_inc_nmissed_count(p);
+ setup_singlestep(p, regs, kcb, 1);
+ break;
+ case KPROBE_REENTER:
+ /* A probe has been hit in the codepath leading up to, or just
+ * after, single-stepping of a probed instruction. This entire
+ * codepath should strictly reside in .kprobes.text section.
+ * Raise a BUG or we'll continue in an endless reentering loop
+ * and eventually a stack overflow.
+ */
+ pr_err("Unrecoverable kprobe detected.\n");
+ dump_kprobe(p);
+ BUG();
+ default:
+ /* impossible cases */
+ WARN_ON(1);
+ return 0;
+ }
+
+ return 1;
+}
+NOKPROBE_SYMBOL(reenter_kprobe);
+
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate and they
+ * remain disabled throughout this function.
+ */
+int kprobe_int3_handler(struct pt_regs *regs)
+{
+ kprobe_opcode_t *addr;
+ struct kprobe *p;
+ struct kprobe_ctlblk *kcb;
+
+ if (user_mode(regs))
+ return 0;
+
+ addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
+ /*
+ * We don't want to be preempted for the entire duration of kprobe
+ * processing. Since int3 and debug trap disables irqs and we clear
+ * IF while singlestepping, it must be no preemptible.
+ */
+
+ kcb = get_kprobe_ctlblk();
+ p = get_kprobe(addr);
+
+ if (p) {
+ if (kprobe_running()) {
+ if (reenter_kprobe(p, regs, kcb))
+ return 1;
+ } else {
+ set_current_kprobe(p, regs, kcb);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+ /*
+ * If we have no pre-handler or it returned 0, we
+ * continue with normal processing. If we have a
+ * pre-handler and it returned non-zero, that means
+ * user handler setup registers to exit to another
+ * instruction, we must skip the single stepping.
+ */
+ if (!p->pre_handler || !p->pre_handler(p, regs))
+ setup_singlestep(p, regs, kcb, 0);
+ else
+ reset_current_kprobe();
+ return 1;
+ }
+ } else if (*addr != BREAKPOINT_INSTRUCTION) {
+ /*
+ * The breakpoint instruction was removed right
+ * after we hit it. Another cpu has removed
+ * either a probepoint or a debugger breakpoint
+ * at this address. In either case, no further
+ * handling of this interrupt is appropriate.
+ * Back up over the (now missing) int3 and run
+ * the original instruction.
+ */
+ regs->ip = (unsigned long)addr;
+ return 1;
+ } /* else: not a kprobe fault; let the kernel handle it */
+
+ return 0;
+}
+NOKPROBE_SYMBOL(kprobe_int3_handler);
+
+/*
+ * When a retprobed function returns, this code saves registers and
+ * calls trampoline_handler() runs, which calls the kretprobe's handler.
+ */
+asm(
+ ".global kretprobe_trampoline\n"
+ ".type kretprobe_trampoline, @function\n"
+ "kretprobe_trampoline:\n"
+#ifdef CONFIG_X86_64
+ /* We don't bother saving the ss register */
+ " pushq %rsp\n"
+ " pushfq\n"
+ SAVE_REGS_STRING
+ " movq %rsp, %rdi\n"
+ " call trampoline_handler\n"
+ /* Replace saved sp with true return address. */
+ " movq %rax, 152(%rsp)\n"
+ RESTORE_REGS_STRING
+ " popfq\n"
+#else
+ " pushf\n"
+ SAVE_REGS_STRING
+ " movl %esp, %eax\n"
+ " call trampoline_handler\n"
+ /* Move flags to cs */
+ " movl 56(%esp), %edx\n"
+ " movl %edx, 52(%esp)\n"
+ /* Replace saved flags with true return address. */
+ " movl %eax, 56(%esp)\n"
+ RESTORE_REGS_STRING
+ " popf\n"
+#endif
+ " ret\n"
+ ".size kretprobe_trampoline, .-kretprobe_trampoline\n"
+);
+NOKPROBE_SYMBOL(kretprobe_trampoline);
+STACK_FRAME_NON_STANDARD(kretprobe_trampoline);
+
+/*
+ * Called from kretprobe_trampoline
+ */
+__visible __used void *trampoline_handler(struct pt_regs *regs)
+{
+ struct kretprobe_instance *ri = NULL;
+ struct hlist_head *head, empty_rp;
+ struct hlist_node *tmp;
+ unsigned long flags, orig_ret_address = 0;
+ unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
+ kprobe_opcode_t *correct_ret_addr = NULL;
+ void *frame_pointer;
+ bool skipped = false;
+
+ /*
+ * Set a dummy kprobe for avoiding kretprobe recursion.
+ * Since kretprobe never run in kprobe handler, kprobe must not
+ * be running at this point.
+ */
+ kprobe_busy_begin();
+
+ INIT_HLIST_HEAD(&empty_rp);
+ kretprobe_hash_lock(current, &head, &flags);
+ /* fixup registers */
+#ifdef CONFIG_X86_64
+ regs->cs = __KERNEL_CS;
+ /* On x86-64, we use pt_regs->sp for return address holder. */
+ frame_pointer = &regs->sp;
+#else
+ regs->cs = __KERNEL_CS | get_kernel_rpl();
+ regs->gs = 0;
+ /* On x86-32, we use pt_regs->flags for return address holder. */
+ frame_pointer = &regs->flags;
+#endif
+ regs->ip = trampoline_address;
+ regs->orig_ax = ~0UL;
+
+ /*
+ * It is possible to have multiple instances associated with a given
+ * task either because multiple functions in the call path have
+ * return probes installed on them, and/or more than one
+ * return probe was registered for a target function.
+ *
+ * We can handle this because:
+ * - instances are always pushed into the head of the list
+ * - when multiple return probes are registered for the same
+ * function, the (chronologically) first instance's ret_addr
+ * will be the real return address, and all the rest will
+ * point to kretprobe_trampoline.
+ */
+ hlist_for_each_entry(ri, head, hlist) {
+ if (ri->task != current)
+ /* another task is sharing our hash bucket */
+ continue;
+ /*
+ * Return probes must be pushed on this hash list correct
+ * order (same as return order) so that it can be poped
+ * correctly. However, if we find it is pushed it incorrect
+ * order, this means we find a function which should not be
+ * probed, because the wrong order entry is pushed on the
+ * path of processing other kretprobe itself.
+ */
+ if (ri->fp != frame_pointer) {
+ if (!skipped)
+ pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n");
+ skipped = true;
+ continue;
+ }
+
+ orig_ret_address = (unsigned long)ri->ret_addr;
+ if (skipped)
+ pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n",
+ ri->rp->kp.addr);
+
+ if (orig_ret_address != trampoline_address)
+ /*
+ * This is the real return address. Any other
+ * instances associated with this task are for
+ * other calls deeper on the call stack
+ */
+ break;
+ }
+
+ kretprobe_assert(ri, orig_ret_address, trampoline_address);
+
+ correct_ret_addr = ri->ret_addr;
+ hlist_for_each_entry_safe(ri, tmp, head, hlist) {
+ if (ri->task != current)
+ /* another task is sharing our hash bucket */
+ continue;
+ if (ri->fp != frame_pointer)
+ continue;
+
+ orig_ret_address = (unsigned long)ri->ret_addr;
+ if (ri->rp && ri->rp->handler) {
+ __this_cpu_write(current_kprobe, &ri->rp->kp);
+ ri->ret_addr = correct_ret_addr;
+ ri->rp->handler(ri, regs);
+ __this_cpu_write(current_kprobe, &kprobe_busy);
+ }
+
+ recycle_rp_inst(ri, &empty_rp);
+
+ if (orig_ret_address != trampoline_address)
+ /*
+ * This is the real return address. Any other
+ * instances associated with this task are for
+ * other calls deeper on the call stack
+ */
+ break;
+ }
+
+ kretprobe_hash_unlock(current, &flags);
+
+ kprobe_busy_end();
+
+ hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
+ hlist_del(&ri->hlist);
+ kfree(ri);
+ }
+ return (void *)orig_ret_address;
+}
+NOKPROBE_SYMBOL(trampoline_handler);
+
+/*
+ * Called after single-stepping. p->addr is the address of the
+ * instruction whose first byte has been replaced by the "int 3"
+ * instruction. To avoid the SMP problems that can occur when we
+ * temporarily put back the original opcode to single-step, we
+ * single-stepped a copy of the instruction. The address of this
+ * copy is p->ainsn.insn.
+ *
+ * This function prepares to return from the post-single-step
+ * interrupt. We have to fix up the stack as follows:
+ *
+ * 0) Except in the case of absolute or indirect jump or call instructions,
+ * the new ip is relative to the copied instruction. We need to make
+ * it relative to the original instruction.
+ *
+ * 1) If the single-stepped instruction was pushfl, then the TF and IF
+ * flags are set in the just-pushed flags, and may need to be cleared.
+ *
+ * 2) If the single-stepped instruction was a call, the return address
+ * that is atop the stack is the address following the copied instruction.
+ * We need to make it the address following the original instruction.
+ *
+ * If this is the first time we've single-stepped the instruction at
+ * this probepoint, and the instruction is boostable, boost it: add a
+ * jump instruction after the copied instruction, that jumps to the next
+ * instruction after the probepoint.
+ */
+static void resume_execution(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ unsigned long *tos = stack_addr(regs);
+ unsigned long copy_ip = (unsigned long)p->ainsn.insn;
+ unsigned long orig_ip = (unsigned long)p->addr;
+ kprobe_opcode_t *insn = p->ainsn.insn;
+
+ /* Skip prefixes */
+ insn = skip_prefixes(insn);
+
+ regs->flags &= ~X86_EFLAGS_TF;
+ switch (*insn) {
+ case 0x9c: /* pushfl */
+ *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF);
+ *tos |= kcb->kprobe_old_flags;
+ break;
+ case 0xc2: /* iret/ret/lret */
+ case 0xc3:
+ case 0xca:
+ case 0xcb:
+ case 0xcf:
+ case 0xea: /* jmp absolute -- ip is correct */
+ /* ip is already adjusted, no more changes required */
+ p->ainsn.boostable = true;
+ goto no_change;
+ case 0xe8: /* call relative - Fix return addr */
+ *tos = orig_ip + (*tos - copy_ip);
+ break;
+#ifdef CONFIG_X86_32
+ case 0x9a: /* call absolute -- same as call absolute, indirect */
+ *tos = orig_ip + (*tos - copy_ip);
+ goto no_change;
+#endif
+ case 0xff:
+ if ((insn[1] & 0x30) == 0x10) {
+ /*
+ * call absolute, indirect
+ * Fix return addr; ip is correct.
+ * But this is not boostable
+ */
+ *tos = orig_ip + (*tos - copy_ip);
+ goto no_change;
+ } else if (((insn[1] & 0x31) == 0x20) ||
+ ((insn[1] & 0x31) == 0x21)) {
+ /*
+ * jmp near and far, absolute indirect
+ * ip is correct. And this is boostable
+ */
+ p->ainsn.boostable = true;
+ goto no_change;
+ }
+ default:
+ break;
+ }
+
+ regs->ip += orig_ip - copy_ip;
+
+no_change:
+ restore_btf();
+}
+NOKPROBE_SYMBOL(resume_execution);
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate and they
+ * remain disabled throughout this function.
+ */
+int kprobe_debug_handler(struct pt_regs *regs)
+{
+ struct kprobe *cur = kprobe_running();
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ if (!cur)
+ return 0;
+
+ resume_execution(cur, regs, kcb);
+ regs->flags |= kcb->kprobe_saved_flags;
+
+ if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ cur->post_handler(cur, regs, 0);
+ }
+
+ /* Restore back the original saved kprobes variables and continue. */
+ if (kcb->kprobe_status == KPROBE_REENTER) {
+ restore_previous_kprobe(kcb);
+ goto out;
+ }
+ reset_current_kprobe();
+out:
+ /*
+ * if somebody else is singlestepping across a probe point, flags
+ * will have TF set, in which case, continue the remaining processing
+ * of do_debug, as if this is not a probe hit.
+ */
+ if (regs->flags & X86_EFLAGS_TF)
+ return 0;
+
+ return 1;
+}
+NOKPROBE_SYMBOL(kprobe_debug_handler);
+
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+ struct kprobe *cur = kprobe_running();
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) {
+ /* This must happen on single-stepping */
+ WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS &&
+ kcb->kprobe_status != KPROBE_REENTER);
+ /*
+ * We are here because the instruction being single
+ * stepped caused a page fault. We reset the current
+ * kprobe and the ip points back to the probe address
+ * and allow the page fault handler to continue as a
+ * normal page fault.
+ */
+ regs->ip = (unsigned long)cur->addr;
+ /*
+ * Trap flag (TF) has been set here because this fault
+ * happened where the single stepping will be done.
+ * So clear it by resetting the current kprobe:
+ */
+ regs->flags &= ~X86_EFLAGS_TF;
+ /*
+ * Since the single step (trap) has been cancelled,
+ * we need to restore BTF here.
+ */
+ restore_btf();
+
+ /*
+ * If the TF flag was set before the kprobe hit,
+ * don't touch it:
+ */
+ regs->flags |= kcb->kprobe_old_flags;
+
+ if (kcb->kprobe_status == KPROBE_REENTER)
+ restore_previous_kprobe(kcb);
+ else
+ reset_current_kprobe();
+ } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE ||
+ kcb->kprobe_status == KPROBE_HIT_SSDONE) {
+ /*
+ * We increment the nmissed count for accounting,
+ * we can also use npre/npostfault count for accounting
+ * these specific fault cases.
+ */
+ kprobes_inc_nmissed_count(cur);
+
+ /*
+ * We come here because instructions in the pre/post
+ * handler caused the page_fault, this could happen
+ * if handler tries to access user space by
+ * copy_from_user(), get_user() etc. Let the
+ * user-specified handler try to fix it first.
+ */
+ if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
+ return 1;
+
+ /*
+ * In case the user-specified fault handler returned
+ * zero, try to fix up.
+ */
+ if (fixup_exception(regs, trapnr))
+ return 1;
+
+ /*
+ * fixup routine could not handle it,
+ * Let do_page_fault() fix it.
+ */
+ }
+
+ return 0;
+}
+NOKPROBE_SYMBOL(kprobe_fault_handler);
+
+/*
+ * Wrapper routine for handling exceptions.
+ */
+int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
+ void *data)
+{
+ struct die_args *args = data;
+ int ret = NOTIFY_DONE;
+
+ if (args->regs && user_mode(args->regs))
+ return ret;
+
+ if (val == DIE_GPF) {
+ /*
+ * To be potentially processing a kprobe fault and to
+ * trust the result from kprobe_running(), we have
+ * be non-preemptible.
+ */
+ if (!preemptible() && kprobe_running() &&
+ kprobe_fault_handler(args->regs, args->trapnr))
+ ret = NOTIFY_STOP;
+ }
+ return ret;
+}
+NOKPROBE_SYMBOL(kprobe_exceptions_notify);
+
+bool arch_within_kprobe_blacklist(unsigned long addr)
+{
+ bool is_in_entry_trampoline_section = false;
+
+#ifdef CONFIG_X86_64
+ is_in_entry_trampoline_section =
+ (addr >= (unsigned long)__entry_trampoline_start &&
+ addr < (unsigned long)__entry_trampoline_end);
+#endif
+ return (addr >= (unsigned long)__kprobes_text_start &&
+ addr < (unsigned long)__kprobes_text_end) ||
+ (addr >= (unsigned long)__entry_text_start &&
+ addr < (unsigned long)__entry_text_end) ||
+ is_in_entry_trampoline_section;
+}
+
+int __init arch_populate_kprobe_blacklist(void)
+{
+ return kprobe_add_area_blacklist((unsigned long)__entry_text_start,
+ (unsigned long)__entry_text_end);
+}
+
+int __init arch_init_kprobes(void)
+{
+ return 0;
+}
+
+int arch_trampoline_kprobe(struct kprobe *p)
+{
+ return 0;
+}
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
new file mode 100644
index 000000000..ef819e196
--- /dev/null
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -0,0 +1,76 @@
+/*
+ * Dynamic Ftrace based Kprobes Optimization
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) Hitachi Ltd., 2012
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/ftrace.h>
+
+#include "common.h"
+
+/* Ftrace callback handler for kprobes -- called under preepmt disabed */
+void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct pt_regs *regs)
+{
+ struct kprobe *p;
+ struct kprobe_ctlblk *kcb;
+
+ /* Preempt is disabled by ftrace */
+ p = get_kprobe((kprobe_opcode_t *)ip);
+ if (unlikely(!p) || kprobe_disabled(p))
+ return;
+
+ kcb = get_kprobe_ctlblk();
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(p);
+ } else {
+ unsigned long orig_ip = regs->ip;
+ /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */
+ regs->ip = ip + sizeof(kprobe_opcode_t);
+
+ __this_cpu_write(current_kprobe, p);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ if (!p->pre_handler || !p->pre_handler(p, regs)) {
+ /*
+ * Emulate singlestep (and also recover regs->ip)
+ * as if there is a 5byte nop
+ */
+ regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
+ if (unlikely(p->post_handler)) {
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ p->post_handler(p, regs, 0);
+ }
+ regs->ip = orig_ip;
+ }
+ /*
+ * If pre_handler returns !0, it changes regs->ip. We have to
+ * skip emulating post_handler.
+ */
+ __this_cpu_write(current_kprobe, NULL);
+ }
+}
+NOKPROBE_SYMBOL(kprobe_ftrace_handler);
+
+int arch_prepare_kprobe_ftrace(struct kprobe *p)
+{
+ p->ainsn.insn = NULL;
+ p->ainsn.boostable = false;
+ return 0;
+}
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
new file mode 100644
index 000000000..544bd41a5
--- /dev/null
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -0,0 +1,503 @@
+/*
+ * Kernel Probes Jump Optimization (Optprobes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ * Copyright (C) Hitachi Ltd., 2012
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/extable.h>
+#include <linux/kdebug.h>
+#include <linux/kallsyms.h>
+#include <linux/ftrace.h>
+#include <linux/frame.h>
+
+#include <asm/text-patching.h>
+#include <asm/cacheflush.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <linux/uaccess.h>
+#include <asm/alternative.h>
+#include <asm/insn.h>
+#include <asm/debugreg.h>
+#include <asm/set_memory.h>
+#include <asm/sections.h>
+#include <asm/nospec-branch.h>
+
+#include "common.h"
+
+unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
+{
+ struct optimized_kprobe *op;
+ struct kprobe *kp;
+ long offs;
+ int i;
+
+ for (i = 0; i < RELATIVEJUMP_SIZE; i++) {
+ kp = get_kprobe((void *)addr - i);
+ /* This function only handles jump-optimized kprobe */
+ if (kp && kprobe_optimized(kp)) {
+ op = container_of(kp, struct optimized_kprobe, kp);
+ /* If op->list is not empty, op is under optimizing */
+ if (list_empty(&op->list))
+ goto found;
+ }
+ }
+
+ return addr;
+found:
+ /*
+ * If the kprobe can be optimized, original bytes which can be
+ * overwritten by jump destination address. In this case, original
+ * bytes must be recovered from op->optinsn.copied_insn buffer.
+ */
+ if (probe_kernel_read(buf, (void *)addr,
+ MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
+ return 0UL;
+
+ if (addr == (unsigned long)kp->addr) {
+ buf[0] = kp->opcode;
+ memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+ } else {
+ offs = addr - (unsigned long)kp->addr - 1;
+ memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs);
+ }
+
+ return (unsigned long)buf;
+}
+
+/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
+static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
+{
+#ifdef CONFIG_X86_64
+ *addr++ = 0x48;
+ *addr++ = 0xbf;
+#else
+ *addr++ = 0xb8;
+#endif
+ *(unsigned long *)addr = val;
+}
+
+asm (
+ "optprobe_template_func:\n"
+ ".global optprobe_template_entry\n"
+ "optprobe_template_entry:\n"
+#ifdef CONFIG_X86_64
+ /* We don't bother saving the ss register */
+ " pushq %rsp\n"
+ " pushfq\n"
+ SAVE_REGS_STRING
+ " movq %rsp, %rsi\n"
+ ".global optprobe_template_val\n"
+ "optprobe_template_val:\n"
+ ASM_NOP5
+ ASM_NOP5
+ ".global optprobe_template_call\n"
+ "optprobe_template_call:\n"
+ ASM_NOP5
+ /* Move flags to rsp */
+ " movq 144(%rsp), %rdx\n"
+ " movq %rdx, 152(%rsp)\n"
+ RESTORE_REGS_STRING
+ /* Skip flags entry */
+ " addq $8, %rsp\n"
+ " popfq\n"
+#else /* CONFIG_X86_32 */
+ " pushf\n"
+ SAVE_REGS_STRING
+ " movl %esp, %edx\n"
+ ".global optprobe_template_val\n"
+ "optprobe_template_val:\n"
+ ASM_NOP5
+ ".global optprobe_template_call\n"
+ "optprobe_template_call:\n"
+ ASM_NOP5
+ RESTORE_REGS_STRING
+ " addl $4, %esp\n" /* skip cs */
+ " popf\n"
+#endif
+ ".global optprobe_template_end\n"
+ "optprobe_template_end:\n"
+ ".type optprobe_template_func, @function\n"
+ ".size optprobe_template_func, .-optprobe_template_func\n");
+
+void optprobe_template_func(void);
+STACK_FRAME_NON_STANDARD(optprobe_template_func);
+NOKPROBE_SYMBOL(optprobe_template_func);
+NOKPROBE_SYMBOL(optprobe_template_entry);
+NOKPROBE_SYMBOL(optprobe_template_val);
+NOKPROBE_SYMBOL(optprobe_template_call);
+NOKPROBE_SYMBOL(optprobe_template_end);
+
+#define TMPL_MOVE_IDX \
+ ((long)optprobe_template_val - (long)optprobe_template_entry)
+#define TMPL_CALL_IDX \
+ ((long)optprobe_template_call - (long)optprobe_template_entry)
+#define TMPL_END_IDX \
+ ((long)optprobe_template_end - (long)optprobe_template_entry)
+
+#define INT3_SIZE sizeof(kprobe_opcode_t)
+
+/* Optimized kprobe call back function: called from optinsn */
+static void
+optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
+{
+ /* This is possible if op is under delayed unoptimizing */
+ if (kprobe_disabled(&op->kp))
+ return;
+
+ preempt_disable();
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(&op->kp);
+ } else {
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+ /* Save skipped registers */
+#ifdef CONFIG_X86_64
+ regs->cs = __KERNEL_CS;
+#else
+ regs->cs = __KERNEL_CS | get_kernel_rpl();
+ regs->gs = 0;
+#endif
+ regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
+ regs->orig_ax = ~0UL;
+
+ __this_cpu_write(current_kprobe, &op->kp);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ opt_pre_handler(&op->kp, regs);
+ __this_cpu_write(current_kprobe, NULL);
+ }
+ preempt_enable();
+}
+NOKPROBE_SYMBOL(optimized_callback);
+
+static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real)
+{
+ struct insn insn;
+ int len = 0, ret;
+
+ while (len < RELATIVEJUMP_SIZE) {
+ ret = __copy_instruction(dest + len, src + len, real + len, &insn);
+ if (!ret || !can_boost(&insn, src + len))
+ return -EINVAL;
+ len += ret;
+ }
+ /* Check whether the address range is reserved */
+ if (ftrace_text_reserved(src, src + len - 1) ||
+ alternatives_text_reserved(src, src + len - 1) ||
+ jump_label_text_reserved(src, src + len - 1))
+ return -EBUSY;
+
+ return len;
+}
+
+/* Check whether insn is indirect jump */
+static int __insn_is_indirect_jump(struct insn *insn)
+{
+ return ((insn->opcode.bytes[0] == 0xff &&
+ (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
+ insn->opcode.bytes[0] == 0xea); /* Segment based jump */
+}
+
+/* Check whether insn jumps into specified address range */
+static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
+{
+ unsigned long target = 0;
+
+ switch (insn->opcode.bytes[0]) {
+ case 0xe0: /* loopne */
+ case 0xe1: /* loope */
+ case 0xe2: /* loop */
+ case 0xe3: /* jcxz */
+ case 0xe9: /* near relative jump */
+ case 0xeb: /* short relative jump */
+ break;
+ case 0x0f:
+ if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
+ break;
+ return 0;
+ default:
+ if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
+ break;
+ return 0;
+ }
+ target = (unsigned long)insn->next_byte + insn->immediate.value;
+
+ return (start <= target && target <= start + len);
+}
+
+static int insn_is_indirect_jump(struct insn *insn)
+{
+ int ret = __insn_is_indirect_jump(insn);
+
+#ifdef CONFIG_RETPOLINE
+ /*
+ * Jump to x86_indirect_thunk_* is treated as an indirect jump.
+ * Note that even with CONFIG_RETPOLINE=y, the kernel compiled with
+ * older gcc may use indirect jump. So we add this check instead of
+ * replace indirect-jump check.
+ */
+ if (!ret)
+ ret = insn_jump_into_range(insn,
+ (unsigned long)__indirect_thunk_start,
+ (unsigned long)__indirect_thunk_end -
+ (unsigned long)__indirect_thunk_start);
+#endif
+ return ret;
+}
+
+/* Decode whole function to ensure any instructions don't jump into target */
+static int can_optimize(unsigned long paddr)
+{
+ unsigned long addr, size = 0, offset = 0;
+ struct insn insn;
+ kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+ /* Lookup symbol including addr */
+ if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
+ return 0;
+
+ /*
+ * Do not optimize in the entry code due to the unstable
+ * stack handling and registers setup.
+ */
+ if (((paddr >= (unsigned long)__entry_text_start) &&
+ (paddr < (unsigned long)__entry_text_end)) ||
+ ((paddr >= (unsigned long)__irqentry_text_start) &&
+ (paddr < (unsigned long)__irqentry_text_end)))
+ return 0;
+
+ /* Check there is enough space for a relative jump. */
+ if (size - offset < RELATIVEJUMP_SIZE)
+ return 0;
+
+ /* Decode instructions */
+ addr = paddr - offset;
+ while (addr < paddr - offset + size) { /* Decode until function end */
+ unsigned long recovered_insn;
+ if (search_exception_tables(addr))
+ /*
+ * Since some fixup code will jumps into this function,
+ * we can't optimize kprobe in this function.
+ */
+ return 0;
+ recovered_insn = recover_probed_instruction(buf, addr);
+ if (!recovered_insn)
+ return 0;
+ kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
+ insn_get_length(&insn);
+ /* Another subsystem puts a breakpoint */
+ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ return 0;
+ /* Recover address */
+ insn.kaddr = (void *)addr;
+ insn.next_byte = (void *)(addr + insn.length);
+ /* Check any instructions don't jump into target */
+ if (insn_is_indirect_jump(&insn) ||
+ insn_jump_into_range(&insn, paddr + INT3_SIZE,
+ RELATIVE_ADDR_SIZE))
+ return 0;
+ addr += insn.length;
+ }
+
+ return 1;
+}
+
+/* Check optimized_kprobe can actually be optimized. */
+int arch_check_optimized_kprobe(struct optimized_kprobe *op)
+{
+ int i;
+ struct kprobe *p;
+
+ for (i = 1; i < op->optinsn.size; i++) {
+ p = get_kprobe(op->kp.addr + i);
+ if (p && !kprobe_disabled(p))
+ return -EEXIST;
+ }
+
+ return 0;
+}
+
+/* Check the addr is within the optimized instructions. */
+int arch_within_optimized_kprobe(struct optimized_kprobe *op,
+ unsigned long addr)
+{
+ return ((unsigned long)op->kp.addr <= addr &&
+ (unsigned long)op->kp.addr + op->optinsn.size > addr);
+}
+
+/* Free optimized instruction slot */
+static
+void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
+{
+ if (op->optinsn.insn) {
+ free_optinsn_slot(op->optinsn.insn, dirty);
+ op->optinsn.insn = NULL;
+ op->optinsn.size = 0;
+ }
+}
+
+void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+{
+ __arch_remove_optimized_kprobe(op, 1);
+}
+
+/*
+ * Copy replacing target instructions
+ * Target instructions MUST be relocatable (checked inside)
+ * This is called when new aggr(opt)probe is allocated or reused.
+ */
+int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
+ struct kprobe *__unused)
+{
+ u8 *buf = NULL, *slot;
+ int ret, len;
+ long rel;
+
+ if (!can_optimize((unsigned long)op->kp.addr))
+ return -EILSEQ;
+
+ buf = kzalloc(MAX_OPTINSN_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ op->optinsn.insn = slot = get_optinsn_slot();
+ if (!slot) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Verify if the address gap is in 2GB range, because this uses
+ * a relative jump.
+ */
+ rel = (long)slot - (long)op->kp.addr + RELATIVEJUMP_SIZE;
+ if (abs(rel) > 0x7fffffff) {
+ ret = -ERANGE;
+ goto err;
+ }
+
+ /* Copy arch-dep-instance from template */
+ memcpy(buf, optprobe_template_entry, TMPL_END_IDX);
+
+ /* Copy instructions into the out-of-line buffer */
+ ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr,
+ slot + TMPL_END_IDX);
+ if (ret < 0)
+ goto err;
+ op->optinsn.size = ret;
+ len = TMPL_END_IDX + op->optinsn.size;
+
+ /* Set probe information */
+ synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
+
+ /* Set probe function call */
+ synthesize_relcall(buf + TMPL_CALL_IDX,
+ slot + TMPL_CALL_IDX, optimized_callback);
+
+ /* Set returning jmp instruction at the tail of out-of-line buffer */
+ synthesize_reljump(buf + len, slot + len,
+ (u8 *)op->kp.addr + op->optinsn.size);
+ len += RELATIVEJUMP_SIZE;
+
+ /* We have to use text_poke for instuction buffer because it is RO */
+ text_poke(slot, buf, len);
+ ret = 0;
+out:
+ kfree(buf);
+ return ret;
+
+err:
+ __arch_remove_optimized_kprobe(op, 0);
+ goto out;
+}
+
+/*
+ * Replace breakpoints (int3) with relative jumps.
+ * Caller must call with locking kprobe_mutex and text_mutex.
+ */
+void arch_optimize_kprobes(struct list_head *oplist)
+{
+ struct optimized_kprobe *op, *tmp;
+ u8 insn_buf[RELATIVEJUMP_SIZE];
+
+ list_for_each_entry_safe(op, tmp, oplist, list) {
+ s32 rel = (s32)((long)op->optinsn.insn -
+ ((long)op->kp.addr + RELATIVEJUMP_SIZE));
+
+ WARN_ON(kprobe_disabled(&op->kp));
+
+ /* Backup instructions which will be replaced by jump address */
+ memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
+ RELATIVE_ADDR_SIZE);
+
+ insn_buf[0] = RELATIVEJUMP_OPCODE;
+ *(s32 *)(&insn_buf[1]) = rel;
+
+ text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
+ op->optinsn.insn);
+
+ list_del_init(&op->list);
+ }
+}
+
+/* Replace a relative jump with a breakpoint (int3). */
+void arch_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+ u8 insn_buf[RELATIVEJUMP_SIZE];
+
+ /* Set int3 to first byte for kprobes */
+ insn_buf[0] = BREAKPOINT_INSTRUCTION;
+ memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+ text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE,
+ op->optinsn.insn);
+}
+
+/*
+ * Recover original instructions and breakpoints from relative jumps.
+ * Caller must call with locking kprobe_mutex.
+ */
+extern void arch_unoptimize_kprobes(struct list_head *oplist,
+ struct list_head *done_list)
+{
+ struct optimized_kprobe *op, *tmp;
+
+ list_for_each_entry_safe(op, tmp, oplist, list) {
+ arch_unoptimize_kprobe(op);
+ list_move(&op->list, done_list);
+ }
+}
+
+int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
+{
+ struct optimized_kprobe *op;
+
+ if (p->flags & KPROBE_FLAG_OPTIMIZED) {
+ /* This kprobe is really able to run optimized path. */
+ op = container_of(p, struct optimized_kprobe, kp);
+ /* Detour through copied instructions */
+ regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
+ if (!reenter)
+ reset_current_kprobe();
+ return 1;
+ }
+ return 0;
+}
+NOKPROBE_SYMBOL(setup_detour_execution);
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
new file mode 100644
index 000000000..163ae706a
--- /dev/null
+++ b/arch/x86/kernel/ksysfs.c
@@ -0,0 +1,340 @@
+/*
+ * Architecture specific sysfs attributes in /sys/kernel
+ *
+ * Copyright (C) 2007, Intel Corp.
+ * Huang Ying <ying.huang@intel.com>
+ * Copyright (C) 2013, 2013 Red Hat, Inc.
+ * Dave Young <dyoung@redhat.com>
+ *
+ * This file is released under the GPLv2
+ */
+
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+
+#include <asm/setup.h>
+
+static ssize_t version_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "0x%04x\n", boot_params.hdr.version);
+}
+
+static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version);
+
+static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t count)
+{
+ memcpy(buf, (void *)&boot_params + off, count);
+ return count;
+}
+
+static struct bin_attribute boot_params_data_attr = {
+ .attr = {
+ .name = "data",
+ .mode = S_IRUGO,
+ },
+ .read = boot_params_data_read,
+ .size = sizeof(boot_params),
+};
+
+static struct attribute *boot_params_version_attrs[] = {
+ &boot_params_version_attr.attr,
+ NULL,
+};
+
+static struct bin_attribute *boot_params_data_attrs[] = {
+ &boot_params_data_attr,
+ NULL,
+};
+
+static const struct attribute_group boot_params_attr_group = {
+ .attrs = boot_params_version_attrs,
+ .bin_attrs = boot_params_data_attrs,
+};
+
+static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr)
+{
+ const char *name;
+
+ name = kobject_name(kobj);
+ return kstrtoint(name, 10, nr);
+}
+
+static int get_setup_data_paddr(int nr, u64 *paddr)
+{
+ int i = 0;
+ struct setup_data *data;
+ u64 pa_data = boot_params.hdr.setup_data;
+
+ while (pa_data) {
+ if (nr == i) {
+ *paddr = pa_data;
+ return 0;
+ }
+ data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ pa_data = data->next;
+ memunmap(data);
+ i++;
+ }
+ return -EINVAL;
+}
+
+static int __init get_setup_data_size(int nr, size_t *size)
+{
+ int i = 0;
+ struct setup_data *data;
+ u64 pa_data = boot_params.hdr.setup_data;
+
+ while (pa_data) {
+ data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+ if (nr == i) {
+ *size = data->len;
+ memunmap(data);
+ return 0;
+ }
+
+ pa_data = data->next;
+ memunmap(data);
+ i++;
+ }
+ return -EINVAL;
+}
+
+static ssize_t type_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int nr, ret;
+ u64 paddr;
+ struct setup_data *data;
+
+ ret = kobj_to_setup_data_nr(kobj, &nr);
+ if (ret)
+ return ret;
+
+ ret = get_setup_data_paddr(nr, &paddr);
+ if (ret)
+ return ret;
+ data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ ret = sprintf(buf, "0x%x\n", data->type);
+ memunmap(data);
+ return ret;
+}
+
+static ssize_t setup_data_data_read(struct file *fp,
+ struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf,
+ loff_t off, size_t count)
+{
+ int nr, ret = 0;
+ u64 paddr;
+ struct setup_data *data;
+ void *p;
+
+ ret = kobj_to_setup_data_nr(kobj, &nr);
+ if (ret)
+ return ret;
+
+ ret = get_setup_data_paddr(nr, &paddr);
+ if (ret)
+ return ret;
+ data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ if (off > data->len) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (count > data->len - off)
+ count = data->len - off;
+
+ if (!count)
+ goto out;
+
+ ret = count;
+ p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB);
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memcpy(buf, p + off, count);
+ memunmap(p);
+out:
+ memunmap(data);
+ return ret;
+}
+
+static struct kobj_attribute type_attr = __ATTR_RO(type);
+
+static struct bin_attribute data_attr __ro_after_init = {
+ .attr = {
+ .name = "data",
+ .mode = S_IRUGO,
+ },
+ .read = setup_data_data_read,
+};
+
+static struct attribute *setup_data_type_attrs[] = {
+ &type_attr.attr,
+ NULL,
+};
+
+static struct bin_attribute *setup_data_data_attrs[] = {
+ &data_attr,
+ NULL,
+};
+
+static const struct attribute_group setup_data_attr_group = {
+ .attrs = setup_data_type_attrs,
+ .bin_attrs = setup_data_data_attrs,
+};
+
+static int __init create_setup_data_node(struct kobject *parent,
+ struct kobject **kobjp, int nr)
+{
+ int ret = 0;
+ size_t size;
+ struct kobject *kobj;
+ char name[16]; /* should be enough for setup_data nodes numbers */
+ snprintf(name, 16, "%d", nr);
+
+ kobj = kobject_create_and_add(name, parent);
+ if (!kobj)
+ return -ENOMEM;
+
+ ret = get_setup_data_size(nr, &size);
+ if (ret)
+ goto out_kobj;
+
+ data_attr.size = size;
+ ret = sysfs_create_group(kobj, &setup_data_attr_group);
+ if (ret)
+ goto out_kobj;
+ *kobjp = kobj;
+
+ return 0;
+out_kobj:
+ kobject_put(kobj);
+ return ret;
+}
+
+static void __init cleanup_setup_data_node(struct kobject *kobj)
+{
+ sysfs_remove_group(kobj, &setup_data_attr_group);
+ kobject_put(kobj);
+}
+
+static int __init get_setup_data_total_num(u64 pa_data, int *nr)
+{
+ int ret = 0;
+ struct setup_data *data;
+
+ *nr = 0;
+ while (pa_data) {
+ *nr += 1;
+ data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
+ if (!data) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ pa_data = data->next;
+ memunmap(data);
+ }
+
+out:
+ return ret;
+}
+
+static int __init create_setup_data_nodes(struct kobject *parent)
+{
+ struct kobject *setup_data_kobj, **kobjp;
+ u64 pa_data;
+ int i, j, nr, ret = 0;
+
+ pa_data = boot_params.hdr.setup_data;
+ if (!pa_data)
+ return 0;
+
+ setup_data_kobj = kobject_create_and_add("setup_data", parent);
+ if (!setup_data_kobj) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = get_setup_data_total_num(pa_data, &nr);
+ if (ret)
+ goto out_setup_data_kobj;
+
+ kobjp = kmalloc_array(nr, sizeof(*kobjp), GFP_KERNEL);
+ if (!kobjp) {
+ ret = -ENOMEM;
+ goto out_setup_data_kobj;
+ }
+
+ for (i = 0; i < nr; i++) {
+ ret = create_setup_data_node(setup_data_kobj, kobjp + i, i);
+ if (ret)
+ goto out_clean_nodes;
+ }
+
+ kfree(kobjp);
+ return 0;
+
+out_clean_nodes:
+ for (j = i - 1; j >= 0; j--)
+ cleanup_setup_data_node(*(kobjp + j));
+ kfree(kobjp);
+out_setup_data_kobj:
+ kobject_put(setup_data_kobj);
+out:
+ return ret;
+}
+
+static int __init boot_params_ksysfs_init(void)
+{
+ int ret;
+ struct kobject *boot_params_kobj;
+
+ boot_params_kobj = kobject_create_and_add("boot_params",
+ kernel_kobj);
+ if (!boot_params_kobj) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = sysfs_create_group(boot_params_kobj, &boot_params_attr_group);
+ if (ret)
+ goto out_boot_params_kobj;
+
+ ret = create_setup_data_nodes(boot_params_kobj);
+ if (ret)
+ goto out_create_group;
+
+ return 0;
+out_create_group:
+ sysfs_remove_group(boot_params_kobj, &boot_params_attr_group);
+out_boot_params_kobj:
+ kobject_put(boot_params_kobj);
+out:
+ return ret;
+}
+
+arch_initcall(boot_params_ksysfs_init);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
new file mode 100644
index 000000000..bd6ee0bfa
--- /dev/null
+++ b/arch/x86/kernel/kvm.c
@@ -0,0 +1,868 @@
+/*
+ * KVM paravirt_ops implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright IBM Corporation, 2007
+ * Authors: Anthony Liguori <aliguori@us.ibm.com>
+ */
+
+#include <linux/context_tracking.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kvm_para.h>
+#include <linux/cpu.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/hardirq.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/kprobes.h>
+#include <linux/debugfs.h>
+#include <linux/nmi.h>
+#include <linux/swait.h>
+#include <asm/timer.h>
+#include <asm/cpu.h>
+#include <asm/traps.h>
+#include <asm/desc.h>
+#include <asm/tlbflush.h>
+#include <asm/apic.h>
+#include <asm/apicdef.h>
+#include <asm/hypervisor.h>
+#include <asm/tlb.h>
+
+static int kvmapf = 1;
+
+static int __init parse_no_kvmapf(char *arg)
+{
+ kvmapf = 0;
+ return 0;
+}
+
+early_param("no-kvmapf", parse_no_kvmapf);
+
+static int steal_acc = 1;
+static int __init parse_no_stealacc(char *arg)
+{
+ steal_acc = 0;
+ return 0;
+}
+
+early_param("no-steal-acc", parse_no_stealacc);
+
+static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
+static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64);
+static int has_steal_clock = 0;
+
+/*
+ * No need for any "IO delay" on KVM
+ */
+static void kvm_io_delay(void)
+{
+}
+
+#define KVM_TASK_SLEEP_HASHBITS 8
+#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
+
+struct kvm_task_sleep_node {
+ struct hlist_node link;
+ struct swait_queue_head wq;
+ u32 token;
+ int cpu;
+ bool halted;
+};
+
+static struct kvm_task_sleep_head {
+ raw_spinlock_t lock;
+ struct hlist_head list;
+} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
+
+static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
+ u32 token)
+{
+ struct hlist_node *p;
+
+ hlist_for_each(p, &b->list) {
+ struct kvm_task_sleep_node *n =
+ hlist_entry(p, typeof(*n), link);
+ if (n->token == token)
+ return n;
+ }
+
+ return NULL;
+}
+
+/*
+ * @interrupt_kernel: Is this called from a routine which interrupts the kernel
+ * (other than user space)?
+ */
+void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
+{
+ u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+ struct kvm_task_sleep_node n, *e;
+ DECLARE_SWAITQUEUE(wait);
+
+ rcu_irq_enter();
+
+ raw_spin_lock(&b->lock);
+ e = _find_apf_task(b, token);
+ if (e) {
+ /* dummy entry exist -> wake up was delivered ahead of PF */
+ hlist_del(&e->link);
+ kfree(e);
+ raw_spin_unlock(&b->lock);
+
+ rcu_irq_exit();
+ return;
+ }
+
+ n.token = token;
+ n.cpu = smp_processor_id();
+ n.halted = is_idle_task(current) ||
+ (IS_ENABLED(CONFIG_PREEMPT_COUNT)
+ ? preempt_count() > 1 || rcu_preempt_depth()
+ : interrupt_kernel);
+ init_swait_queue_head(&n.wq);
+ hlist_add_head(&n.link, &b->list);
+ raw_spin_unlock(&b->lock);
+
+ for (;;) {
+ if (!n.halted)
+ prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+ if (hlist_unhashed(&n.link))
+ break;
+
+ rcu_irq_exit();
+
+ if (!n.halted) {
+ local_irq_enable();
+ schedule();
+ local_irq_disable();
+ } else {
+ /*
+ * We cannot reschedule. So halt.
+ */
+ native_safe_halt();
+ local_irq_disable();
+ }
+
+ rcu_irq_enter();
+ }
+ if (!n.halted)
+ finish_swait(&n.wq, &wait);
+
+ rcu_irq_exit();
+ return;
+}
+EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
+
+static void apf_task_wake_one(struct kvm_task_sleep_node *n)
+{
+ hlist_del_init(&n->link);
+ if (n->halted)
+ smp_send_reschedule(n->cpu);
+ else if (swq_has_sleeper(&n->wq))
+ swake_up_one(&n->wq);
+}
+
+static void apf_task_wake_all(void)
+{
+ int i;
+
+ for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
+ struct hlist_node *p, *next;
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
+ raw_spin_lock(&b->lock);
+ hlist_for_each_safe(p, next, &b->list) {
+ struct kvm_task_sleep_node *n =
+ hlist_entry(p, typeof(*n), link);
+ if (n->cpu == smp_processor_id())
+ apf_task_wake_one(n);
+ }
+ raw_spin_unlock(&b->lock);
+ }
+}
+
+void kvm_async_pf_task_wake(u32 token)
+{
+ u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+ struct kvm_task_sleep_node *n;
+
+ if (token == ~0) {
+ apf_task_wake_all();
+ return;
+ }
+
+again:
+ raw_spin_lock(&b->lock);
+ n = _find_apf_task(b, token);
+ if (!n) {
+ /*
+ * async PF was not yet handled.
+ * Add dummy entry for the token.
+ */
+ n = kzalloc(sizeof(*n), GFP_ATOMIC);
+ if (!n) {
+ /*
+ * Allocation failed! Busy wait while other cpu
+ * handles async PF.
+ */
+ raw_spin_unlock(&b->lock);
+ cpu_relax();
+ goto again;
+ }
+ n->token = token;
+ n->cpu = smp_processor_id();
+ init_swait_queue_head(&n->wq);
+ hlist_add_head(&n->link, &b->list);
+ } else
+ apf_task_wake_one(n);
+ raw_spin_unlock(&b->lock);
+ return;
+}
+EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
+
+u32 kvm_read_and_reset_pf_reason(void)
+{
+ u32 reason = 0;
+
+ if (__this_cpu_read(apf_reason.enabled)) {
+ reason = __this_cpu_read(apf_reason.reason);
+ __this_cpu_write(apf_reason.reason, 0);
+ }
+
+ return reason;
+}
+EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
+NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
+
+dotraplinkage void
+do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ enum ctx_state prev_state;
+
+ switch (kvm_read_and_reset_pf_reason()) {
+ default:
+ do_page_fault(regs, error_code);
+ break;
+ case KVM_PV_REASON_PAGE_NOT_PRESENT:
+ /* page is swapped out by the host. */
+ prev_state = exception_enter();
+ kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs));
+ exception_exit(prev_state);
+ break;
+ case KVM_PV_REASON_PAGE_READY:
+ rcu_irq_enter();
+ kvm_async_pf_task_wake((u32)read_cr2());
+ rcu_irq_exit();
+ break;
+ }
+}
+NOKPROBE_SYMBOL(do_async_page_fault);
+
+static void __init paravirt_ops_setup(void)
+{
+ pv_info.name = "KVM";
+
+ if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
+ pv_cpu_ops.io_delay = kvm_io_delay;
+
+#ifdef CONFIG_X86_IO_APIC
+ no_timer_check = 1;
+#endif
+}
+
+static void kvm_register_steal_time(void)
+{
+ int cpu = smp_processor_id();
+ struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
+
+ if (!has_steal_clock)
+ return;
+
+ wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
+ pr_info("kvm-stealtime: cpu %d, msr %llx\n",
+ cpu, (unsigned long long) slow_virt_to_phys(st));
+}
+
+static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
+
+static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
+{
+ /**
+ * This relies on __test_and_clear_bit to modify the memory
+ * in a way that is atomic with respect to the local CPU.
+ * The hypervisor only accesses this memory from the local CPU so
+ * there's no need for lock or memory barriers.
+ * An optimization barrier is implied in apic write.
+ */
+ if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
+ return;
+ apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK);
+}
+
+static void kvm_guest_cpu_init(void)
+{
+ if (!kvm_para_available())
+ return;
+
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
+ u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
+
+#ifdef CONFIG_PREEMPT
+ pa |= KVM_ASYNC_PF_SEND_ALWAYS;
+#endif
+ pa |= KVM_ASYNC_PF_ENABLED;
+
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
+ pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
+
+ wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
+ __this_cpu_write(apf_reason.enabled, 1);
+ printk(KERN_INFO"KVM setup async PF for cpu %d\n",
+ smp_processor_id());
+ }
+
+ if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
+ unsigned long pa;
+ /* Size alignment is implied but just to make it explicit. */
+ BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
+ __this_cpu_write(kvm_apic_eoi, 0);
+ pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
+ | KVM_MSR_ENABLED;
+ wrmsrl(MSR_KVM_PV_EOI_EN, pa);
+ }
+
+ if (has_steal_clock)
+ kvm_register_steal_time();
+}
+
+static void kvm_pv_disable_apf(void)
+{
+ if (!__this_cpu_read(apf_reason.enabled))
+ return;
+
+ wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
+ __this_cpu_write(apf_reason.enabled, 0);
+
+ printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
+ smp_processor_id());
+}
+
+static void kvm_pv_guest_cpu_reboot(void *unused)
+{
+ /*
+ * We disable PV EOI before we load a new kernel by kexec,
+ * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
+ * New kernel can re-enable when it boots.
+ */
+ if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
+ wrmsrl(MSR_KVM_PV_EOI_EN, 0);
+ kvm_pv_disable_apf();
+ kvm_disable_steal_time();
+}
+
+static int kvm_pv_reboot_notify(struct notifier_block *nb,
+ unsigned long code, void *unused)
+{
+ if (code == SYS_RESTART)
+ on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block kvm_pv_reboot_nb = {
+ .notifier_call = kvm_pv_reboot_notify,
+};
+
+static u64 kvm_steal_clock(int cpu)
+{
+ u64 steal;
+ struct kvm_steal_time *src;
+ int version;
+
+ src = &per_cpu(steal_time, cpu);
+ do {
+ version = src->version;
+ virt_rmb();
+ steal = src->steal;
+ virt_rmb();
+ } while ((version & 1) || (version != src->version));
+
+ return steal;
+}
+
+void kvm_disable_steal_time(void)
+{
+ if (!has_steal_clock)
+ return;
+
+ wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
+}
+
+static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
+{
+ early_set_memory_decrypted((unsigned long) ptr, size);
+}
+
+/*
+ * Iterate through all possible CPUs and map the memory region pointed
+ * by apf_reason, steal_time and kvm_apic_eoi as decrypted at once.
+ *
+ * Note: we iterate through all possible CPUs to ensure that CPUs
+ * hotplugged will have their per-cpu variable already mapped as
+ * decrypted.
+ */
+static void __init sev_map_percpu_data(void)
+{
+ int cpu;
+
+ if (!sev_active())
+ return;
+
+ for_each_possible_cpu(cpu) {
+ __set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason));
+ __set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time));
+ __set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi));
+ }
+}
+
+#ifdef CONFIG_SMP
+#define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG)
+
+static void __send_ipi_mask(const struct cpumask *mask, int vector)
+{
+ unsigned long flags;
+ int cpu, apic_id, icr;
+ int min = 0, max = 0;
+#ifdef CONFIG_X86_64
+ __uint128_t ipi_bitmap = 0;
+#else
+ u64 ipi_bitmap = 0;
+#endif
+ long ret;
+
+ if (cpumask_empty(mask))
+ return;
+
+ local_irq_save(flags);
+
+ switch (vector) {
+ default:
+ icr = APIC_DM_FIXED | vector;
+ break;
+ case NMI_VECTOR:
+ icr = APIC_DM_NMI;
+ break;
+ }
+
+ for_each_cpu(cpu, mask) {
+ apic_id = per_cpu(x86_cpu_to_apicid, cpu);
+ if (!ipi_bitmap) {
+ min = max = apic_id;
+ } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
+ ipi_bitmap <<= min - apic_id;
+ min = apic_id;
+ } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) {
+ max = apic_id < max ? max : apic_id;
+ } else {
+ ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
+ (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
+ WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret);
+ min = max = apic_id;
+ ipi_bitmap = 0;
+ }
+ __set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
+ }
+
+ if (ipi_bitmap) {
+ ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
+ (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
+ WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret);
+ }
+
+ local_irq_restore(flags);
+}
+
+static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
+{
+ __send_ipi_mask(mask, vector);
+}
+
+static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ struct cpumask new_mask;
+ const struct cpumask *local_mask;
+
+ cpumask_copy(&new_mask, mask);
+ cpumask_clear_cpu(this_cpu, &new_mask);
+ local_mask = &new_mask;
+ __send_ipi_mask(local_mask, vector);
+}
+
+static void kvm_send_ipi_allbutself(int vector)
+{
+ kvm_send_ipi_mask_allbutself(cpu_online_mask, vector);
+}
+
+static void kvm_send_ipi_all(int vector)
+{
+ __send_ipi_mask(cpu_online_mask, vector);
+}
+
+/*
+ * Set the IPI entry points
+ */
+static void kvm_setup_pv_ipi(void)
+{
+ apic->send_IPI_mask = kvm_send_ipi_mask;
+ apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself;
+ apic->send_IPI_allbutself = kvm_send_ipi_allbutself;
+ apic->send_IPI_all = kvm_send_ipi_all;
+ pr_info("KVM setup pv IPIs\n");
+}
+
+static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
+{
+ native_smp_prepare_cpus(max_cpus);
+ if (kvm_para_has_hint(KVM_HINTS_REALTIME))
+ static_branch_disable(&virt_spin_lock_key);
+}
+
+static void __init kvm_smp_prepare_boot_cpu(void)
+{
+ /*
+ * Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
+ * shares the guest physical address with the hypervisor.
+ */
+ sev_map_percpu_data();
+
+ kvm_guest_cpu_init();
+ native_smp_prepare_boot_cpu();
+ kvm_spinlock_init();
+}
+
+static void kvm_guest_cpu_offline(void)
+{
+ kvm_disable_steal_time();
+ if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
+ wrmsrl(MSR_KVM_PV_EOI_EN, 0);
+ kvm_pv_disable_apf();
+ apf_task_wake_all();
+}
+
+static int kvm_cpu_online(unsigned int cpu)
+{
+ local_irq_disable();
+ kvm_guest_cpu_init();
+ local_irq_enable();
+ return 0;
+}
+
+static int kvm_cpu_down_prepare(unsigned int cpu)
+{
+ local_irq_disable();
+ kvm_guest_cpu_offline();
+ local_irq_enable();
+ return 0;
+}
+#endif
+
+static void __init kvm_apf_trap_init(void)
+{
+ update_intr_gate(X86_TRAP_PF, async_page_fault);
+}
+
+static DEFINE_PER_CPU(cpumask_var_t, __pv_tlb_mask);
+
+static void kvm_flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
+{
+ u8 state;
+ int cpu;
+ struct kvm_steal_time *src;
+ struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_tlb_mask);
+
+ cpumask_copy(flushmask, cpumask);
+ /*
+ * We have to call flush only on online vCPUs. And
+ * queue flush_on_enter for pre-empted vCPUs
+ */
+ for_each_cpu(cpu, flushmask) {
+ src = &per_cpu(steal_time, cpu);
+ state = READ_ONCE(src->preempted);
+ if ((state & KVM_VCPU_PREEMPTED)) {
+ if (try_cmpxchg(&src->preempted, &state,
+ state | KVM_VCPU_FLUSH_TLB))
+ __cpumask_clear_cpu(cpu, flushmask);
+ }
+ }
+
+ native_flush_tlb_others(flushmask, info);
+}
+
+static void __init kvm_guest_init(void)
+{
+ int i;
+
+ if (!kvm_para_available())
+ return;
+
+ paravirt_ops_setup();
+ register_reboot_notifier(&kvm_pv_reboot_nb);
+ for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
+ raw_spin_lock_init(&async_pf_sleepers[i].lock);
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
+ x86_init.irqs.trap_init = kvm_apf_trap_init;
+
+ if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+ has_steal_clock = 1;
+ pv_time_ops.steal_clock = kvm_steal_clock;
+ }
+
+ if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
+ !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
+ kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+ pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
+ pv_mmu_ops.tlb_remove_table = tlb_remove_table;
+ }
+
+ if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
+ apic_set_eoi_write(kvm_guest_apic_eoi_write);
+
+#ifdef CONFIG_SMP
+ smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
+ smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+ if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
+ kvm_cpu_online, kvm_cpu_down_prepare) < 0)
+ pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n");
+#else
+ sev_map_percpu_data();
+ kvm_guest_cpu_init();
+#endif
+
+ /*
+ * Hard lockup detection is enabled by default. Disable it, as guests
+ * can get false positives too easily, for example if the host is
+ * overcommitted.
+ */
+ hardlockup_detector_disable();
+}
+
+static noinline uint32_t __kvm_cpuid_base(void)
+{
+ if (boot_cpu_data.cpuid_level < 0)
+ return 0; /* So we don't blow up on old processors */
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0);
+
+ return 0;
+}
+
+static inline uint32_t kvm_cpuid_base(void)
+{
+ static int kvm_cpuid_base = -1;
+
+ if (kvm_cpuid_base == -1)
+ kvm_cpuid_base = __kvm_cpuid_base();
+
+ return kvm_cpuid_base;
+}
+
+bool kvm_para_available(void)
+{
+ return kvm_cpuid_base() != 0;
+}
+EXPORT_SYMBOL_GPL(kvm_para_available);
+
+unsigned int kvm_arch_para_features(void)
+{
+ return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
+}
+
+unsigned int kvm_arch_para_hints(void)
+{
+ return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES);
+}
+
+static uint32_t __init kvm_detect(void)
+{
+ return kvm_cpuid_base();
+}
+
+static void __init kvm_apic_init(void)
+{
+#if defined(CONFIG_SMP)
+ if (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI))
+ kvm_setup_pv_ipi();
+#endif
+}
+
+static void __init kvm_init_platform(void)
+{
+ kvmclock_init();
+ x86_platform.apic_post_init = kvm_apic_init;
+}
+
+const __initconst struct hypervisor_x86 x86_hyper_kvm = {
+ .name = "KVM",
+ .detect = kvm_detect,
+ .type = X86_HYPER_KVM,
+ .init.guest_late_init = kvm_guest_init,
+ .init.x2apic_available = kvm_para_available,
+ .init.init_platform = kvm_init_platform,
+};
+
+static __init int activate_jump_labels(void)
+{
+ if (has_steal_clock) {
+ static_key_slow_inc(&paravirt_steal_enabled);
+ if (steal_acc)
+ static_key_slow_inc(&paravirt_steal_rq_enabled);
+ }
+
+ return 0;
+}
+arch_initcall(activate_jump_labels);
+
+static __init int kvm_setup_pv_tlb_flush(void)
+{
+ int cpu;
+
+ if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
+ !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
+ kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+ for_each_possible_cpu(cpu) {
+ zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu),
+ GFP_KERNEL, cpu_to_node(cpu));
+ }
+ pr_info("KVM setup pv remote TLB flush\n");
+ }
+
+ return 0;
+}
+arch_initcall(kvm_setup_pv_tlb_flush);
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+
+/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
+static void kvm_kick_cpu(int cpu)
+{
+ int apicid;
+ unsigned long flags = 0;
+
+ apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
+}
+
+#include <asm/qspinlock.h>
+
+static void kvm_wait(u8 *ptr, u8 val)
+{
+ unsigned long flags;
+
+ if (in_nmi())
+ return;
+
+ local_irq_save(flags);
+
+ if (READ_ONCE(*ptr) != val)
+ goto out;
+
+ /*
+ * halt until it's our turn and kicked. Note that we do safe halt
+ * for irq enabled case to avoid hang when lock info is overwritten
+ * in irq spinlock slowpath and no spurious interrupt occur to save us.
+ */
+ if (arch_irqs_disabled_flags(flags))
+ halt();
+ else
+ safe_halt();
+
+out:
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_X86_32
+__visible bool __kvm_vcpu_is_preempted(long cpu)
+{
+ struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
+
+ return !!(src->preempted & KVM_VCPU_PREEMPTED);
+}
+PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
+
+#else
+
+#include <asm/asm-offsets.h>
+
+extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
+
+/*
+ * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
+ * restoring to/from the stack.
+ */
+asm(
+".pushsection .text;"
+".global __raw_callee_save___kvm_vcpu_is_preempted;"
+".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
+"__raw_callee_save___kvm_vcpu_is_preempted:"
+"movq __per_cpu_offset(,%rdi,8), %rax;"
+"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
+"setne %al;"
+"ret;"
+".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
+".popsection");
+
+#endif
+
+/*
+ * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
+ */
+void __init kvm_spinlock_init(void)
+{
+ if (!kvm_para_available())
+ return;
+ /* Does host kernel support KVM_FEATURE_PV_UNHALT? */
+ if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
+ return;
+
+ if (kvm_para_has_hint(KVM_HINTS_REALTIME))
+ return;
+
+ /* Don't use the pvqspinlock code if there is only 1 vCPU. */
+ if (num_possible_cpus() == 1)
+ return;
+
+ __pv_init_lock_hash();
+ pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+ pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+ pv_lock_ops.wait = kvm_wait;
+ pv_lock_ops.kick = kvm_kick_cpu;
+
+ if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+ pv_lock_ops.vcpu_is_preempted =
+ PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
+ }
+}
+
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
new file mode 100644
index 000000000..6752d3768
--- /dev/null
+++ b/arch/x86/kernel/kvmclock.c
@@ -0,0 +1,360 @@
+/* KVM paravirtual clock driver. A clocksource implementation
+ Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include <linux/clocksource.h>
+#include <linux/kvm_para.h>
+#include <asm/pvclock.h>
+#include <asm/msr.h>
+#include <asm/apic.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/cpuhotplug.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/set_memory.h>
+
+#include <asm/hypervisor.h>
+#include <asm/mem_encrypt.h>
+#include <asm/x86_init.h>
+#include <asm/reboot.h>
+#include <asm/kvmclock.h>
+
+static int kvmclock __initdata = 1;
+static int kvmclock_vsyscall __initdata = 1;
+static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME;
+static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK;
+static u64 kvm_sched_clock_offset __ro_after_init;
+
+static int __init parse_no_kvmclock(char *arg)
+{
+ kvmclock = 0;
+ return 0;
+}
+early_param("no-kvmclock", parse_no_kvmclock);
+
+static int __init parse_no_kvmclock_vsyscall(char *arg)
+{
+ kvmclock_vsyscall = 0;
+ return 0;
+}
+early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
+
+/* Aligned to page sizes to match whats mapped via vsyscalls to userspace */
+#define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS)
+#define HVC_BOOT_ARRAY_SIZE \
+ (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
+
+static struct pvclock_vsyscall_time_info
+ hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
+static struct pvclock_wall_clock wall_clock __bss_decrypted;
+static struct pvclock_vsyscall_time_info *hvclock_mem;
+DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
+EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu);
+
+/*
+ * The wallclock is the time of day when we booted. Since then, some time may
+ * have elapsed since the hypervisor wrote the data. So we try to account for
+ * that with system time
+ */
+static void kvm_get_wallclock(struct timespec64 *now)
+{
+ wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));
+ preempt_disable();
+ pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now);
+ preempt_enable();
+}
+
+static int kvm_set_wallclock(const struct timespec64 *now)
+{
+ return -ENODEV;
+}
+
+static u64 kvm_clock_read(void)
+{
+ u64 ret;
+
+ preempt_disable_notrace();
+ ret = pvclock_clocksource_read(this_cpu_pvti());
+ preempt_enable_notrace();
+ return ret;
+}
+
+static u64 kvm_clock_get_cycles(struct clocksource *cs)
+{
+ return kvm_clock_read();
+}
+
+static u64 kvm_sched_clock_read(void)
+{
+ return kvm_clock_read() - kvm_sched_clock_offset;
+}
+
+static inline void kvm_sched_clock_init(bool stable)
+{
+ if (!stable)
+ clear_sched_clock_stable();
+ kvm_sched_clock_offset = kvm_clock_read();
+ pv_time_ops.sched_clock = kvm_sched_clock_read;
+
+ pr_info("kvm-clock: using sched offset of %llu cycles",
+ kvm_sched_clock_offset);
+
+ BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
+ sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
+}
+
+/*
+ * If we don't do that, there is the possibility that the guest
+ * will calibrate under heavy load - thus, getting a lower lpj -
+ * and execute the delays themselves without load. This is wrong,
+ * because no delay loop can finish beforehand.
+ * Any heuristics is subject to fail, because ultimately, a large
+ * poll of guests can be running and trouble each other. So we preset
+ * lpj here
+ */
+static unsigned long kvm_get_tsc_khz(void)
+{
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+ return pvclock_tsc_khz(this_cpu_pvti());
+}
+
+static void __init kvm_get_preset_lpj(void)
+{
+ unsigned long khz;
+ u64 lpj;
+
+ khz = kvm_get_tsc_khz();
+
+ lpj = ((u64)khz * 1000);
+ do_div(lpj, HZ);
+ preset_lpj = lpj;
+}
+
+bool kvm_check_and_clear_guest_paused(void)
+{
+ struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
+ bool ret = false;
+
+ if (!src)
+ return ret;
+
+ if ((src->pvti.flags & PVCLOCK_GUEST_STOPPED) != 0) {
+ src->pvti.flags &= ~PVCLOCK_GUEST_STOPPED;
+ pvclock_touch_watchdogs();
+ ret = true;
+ }
+ return ret;
+}
+
+struct clocksource kvm_clock = {
+ .name = "kvm-clock",
+ .read = kvm_clock_get_cycles,
+ .rating = 400,
+ .mask = CLOCKSOURCE_MASK(64),
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+EXPORT_SYMBOL_GPL(kvm_clock);
+
+static void kvm_register_clock(char *txt)
+{
+ struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
+ u64 pa;
+
+ if (!src)
+ return;
+
+ pa = slow_virt_to_phys(&src->pvti) | 0x01ULL;
+ wrmsrl(msr_kvm_system_time, pa);
+ pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
+}
+
+static void kvm_save_sched_clock_state(void)
+{
+}
+
+static void kvm_restore_sched_clock_state(void)
+{
+ kvm_register_clock("primary cpu clock, resume");
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static void kvm_setup_secondary_clock(void)
+{
+ kvm_register_clock("secondary cpu clock");
+}
+#endif
+
+/*
+ * After the clock is registered, the host will keep writing to the
+ * registered memory location. If the guest happens to shutdown, this memory
+ * won't be valid. In cases like kexec, in which you install a new kernel, this
+ * means a random memory location will be kept being written. So before any
+ * kind of shutdown from our side, we unregister the clock by writing anything
+ * that does not have the 'enable' bit set in the msr
+ */
+#ifdef CONFIG_KEXEC_CORE
+static void kvm_crash_shutdown(struct pt_regs *regs)
+{
+ native_write_msr(msr_kvm_system_time, 0, 0);
+ kvm_disable_steal_time();
+ native_machine_crash_shutdown(regs);
+}
+#endif
+
+static void kvm_shutdown(void)
+{
+ native_write_msr(msr_kvm_system_time, 0, 0);
+ kvm_disable_steal_time();
+ native_machine_shutdown();
+}
+
+static void __init kvmclock_init_mem(void)
+{
+ unsigned long ncpus;
+ unsigned int order;
+ struct page *p;
+ int r;
+
+ if (HVC_BOOT_ARRAY_SIZE >= num_possible_cpus())
+ return;
+
+ ncpus = num_possible_cpus() - HVC_BOOT_ARRAY_SIZE;
+ order = get_order(ncpus * sizeof(*hvclock_mem));
+
+ p = alloc_pages(GFP_KERNEL, order);
+ if (!p) {
+ pr_warn("%s: failed to alloc %d pages", __func__, (1U << order));
+ return;
+ }
+
+ hvclock_mem = page_address(p);
+
+ /*
+ * hvclock is shared between the guest and the hypervisor, must
+ * be mapped decrypted.
+ */
+ if (sev_active()) {
+ r = set_memory_decrypted((unsigned long) hvclock_mem,
+ 1UL << order);
+ if (r) {
+ __free_pages(p, order);
+ hvclock_mem = NULL;
+ pr_warn("kvmclock: set_memory_decrypted() failed. Disabling\n");
+ return;
+ }
+ }
+
+ memset(hvclock_mem, 0, PAGE_SIZE << order);
+}
+
+static int __init kvm_setup_vsyscall_timeinfo(void)
+{
+#ifdef CONFIG_X86_64
+ u8 flags;
+
+ if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall)
+ return 0;
+
+ flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
+ if (!(flags & PVCLOCK_TSC_STABLE_BIT))
+ return 0;
+
+ kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
+#endif
+
+ kvmclock_init_mem();
+
+ return 0;
+}
+early_initcall(kvm_setup_vsyscall_timeinfo);
+
+static int kvmclock_setup_percpu(unsigned int cpu)
+{
+ struct pvclock_vsyscall_time_info *p = per_cpu(hv_clock_per_cpu, cpu);
+
+ /*
+ * The per cpu area setup replicates CPU0 data to all cpu
+ * pointers. So carefully check. CPU0 has been set up in init
+ * already.
+ */
+ if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0)))
+ return 0;
+
+ /* Use the static page for the first CPUs, allocate otherwise */
+ if (cpu < HVC_BOOT_ARRAY_SIZE)
+ p = &hv_clock_boot[cpu];
+ else if (hvclock_mem)
+ p = hvclock_mem + cpu - HVC_BOOT_ARRAY_SIZE;
+ else
+ return -ENOMEM;
+
+ per_cpu(hv_clock_per_cpu, cpu) = p;
+ return p ? 0 : -ENOMEM;
+}
+
+void __init kvmclock_init(void)
+{
+ u8 flags;
+
+ if (!kvm_para_available() || !kvmclock)
+ return;
+
+ if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
+ msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
+ msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
+ } else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
+ return;
+ }
+
+ if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
+ kvmclock_setup_percpu, NULL) < 0) {
+ return;
+ }
+
+ pr_info("kvm-clock: Using msrs %x and %x",
+ msr_kvm_system_time, msr_kvm_wall_clock);
+
+ this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]);
+ kvm_register_clock("primary cpu clock");
+ pvclock_set_pvti_cpu0_va(hv_clock_boot);
+
+ if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
+ pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+
+ flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
+ kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
+
+ x86_platform.calibrate_tsc = kvm_get_tsc_khz;
+ x86_platform.calibrate_cpu = kvm_get_tsc_khz;
+ x86_platform.get_wallclock = kvm_get_wallclock;
+ x86_platform.set_wallclock = kvm_set_wallclock;
+#ifdef CONFIG_X86_LOCAL_APIC
+ x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;
+#endif
+ x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
+ x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
+ machine_ops.shutdown = kvm_shutdown;
+#ifdef CONFIG_KEXEC_CORE
+ machine_ops.crash_shutdown = kvm_crash_shutdown;
+#endif
+ kvm_get_preset_lpj();
+ clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
+ pv_info.name = "KVM";
+}
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
new file mode 100644
index 000000000..65590eee6
--- /dev/null
+++ b/arch/x86/kernel/ldt.c
@@ -0,0 +1,582 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2002 Andi Kleen
+ *
+ * This handles calls from both 32bit and 64bit mode.
+ *
+ * Lock order:
+ * contex.ldt_usr_sem
+ * mmap_sem
+ * context.lock
+ */
+
+#include <linux/errno.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
+
+#include <asm/ldt.h>
+#include <asm/tlb.h>
+#include <asm/desc.h>
+#include <asm/mmu_context.h>
+#include <asm/syscalls.h>
+
+static void refresh_ldt_segments(void)
+{
+#ifdef CONFIG_X86_64
+ unsigned short sel;
+
+ /*
+ * Make sure that the cached DS and ES descriptors match the updated
+ * LDT.
+ */
+ savesegment(ds, sel);
+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
+ loadsegment(ds, sel);
+
+ savesegment(es, sel);
+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT)
+ loadsegment(es, sel);
+#endif
+}
+
+/* context.lock is held by the task which issued the smp function call */
+static void flush_ldt(void *__mm)
+{
+ struct mm_struct *mm = __mm;
+
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
+ return;
+
+ load_mm_ldt(mm);
+
+ refresh_ldt_segments();
+}
+
+/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
+static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
+{
+ struct ldt_struct *new_ldt;
+ unsigned int alloc_size;
+
+ if (num_entries > LDT_ENTRIES)
+ return NULL;
+
+ new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
+ if (!new_ldt)
+ return NULL;
+
+ BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
+ alloc_size = num_entries * LDT_ENTRY_SIZE;
+
+ /*
+ * Xen is very picky: it requires a page-aligned LDT that has no
+ * trailing nonzero bytes in any page that contains LDT descriptors.
+ * Keep it simple: zero the whole allocation and never allocate less
+ * than PAGE_SIZE.
+ */
+ if (alloc_size > PAGE_SIZE)
+ new_ldt->entries = vzalloc(alloc_size);
+ else
+ new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL);
+
+ if (!new_ldt->entries) {
+ kfree(new_ldt);
+ return NULL;
+ }
+
+ /* The new LDT isn't aliased for PTI yet. */
+ new_ldt->slot = -1;
+
+ new_ldt->nr_entries = num_entries;
+ return new_ldt;
+}
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+
+static void do_sanity_check(struct mm_struct *mm,
+ bool had_kernel_mapping,
+ bool had_user_mapping)
+{
+ if (mm->context.ldt) {
+ /*
+ * We already had an LDT. The top-level entry should already
+ * have been allocated and synchronized with the usermode
+ * tables.
+ */
+ WARN_ON(!had_kernel_mapping);
+ if (static_cpu_has(X86_FEATURE_PTI))
+ WARN_ON(!had_user_mapping);
+ } else {
+ /*
+ * This is the first time we're mapping an LDT for this process.
+ * Sync the pgd to the usermode tables.
+ */
+ WARN_ON(had_kernel_mapping);
+ if (static_cpu_has(X86_FEATURE_PTI))
+ WARN_ON(had_user_mapping);
+ }
+}
+
+#ifdef CONFIG_X86_PAE
+
+static pmd_t *pgd_to_pmd_walk(pgd_t *pgd, unsigned long va)
+{
+ p4d_t *p4d;
+ pud_t *pud;
+
+ if (pgd->pgd == 0)
+ return NULL;
+
+ p4d = p4d_offset(pgd, va);
+ if (p4d_none(*p4d))
+ return NULL;
+
+ pud = pud_offset(p4d, va);
+ if (pud_none(*pud))
+ return NULL;
+
+ return pmd_offset(pud, va);
+}
+
+static void map_ldt_struct_to_user(struct mm_struct *mm)
+{
+ pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
+ pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
+ pmd_t *k_pmd, *u_pmd;
+
+ k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
+ u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);
+
+ if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
+ set_pmd(u_pmd, *k_pmd);
+}
+
+static void sanity_check_ldt_mapping(struct mm_struct *mm)
+{
+ pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR);
+ pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
+ bool had_kernel, had_user;
+ pmd_t *k_pmd, *u_pmd;
+
+ k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR);
+ u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR);
+ had_kernel = (k_pmd->pmd != 0);
+ had_user = (u_pmd->pmd != 0);
+
+ do_sanity_check(mm, had_kernel, had_user);
+}
+
+#else /* !CONFIG_X86_PAE */
+
+static void map_ldt_struct_to_user(struct mm_struct *mm)
+{
+ pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR);
+
+ if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt)
+ set_pgd(kernel_to_user_pgdp(pgd), *pgd);
+}
+
+static void sanity_check_ldt_mapping(struct mm_struct *mm)
+{
+ pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR);
+ bool had_kernel = (pgd->pgd != 0);
+ bool had_user = (kernel_to_user_pgdp(pgd)->pgd != 0);
+
+ do_sanity_check(mm, had_kernel, had_user);
+}
+
+#endif /* CONFIG_X86_PAE */
+
+/*
+ * If PTI is enabled, this maps the LDT into the kernelmode and
+ * usermode tables for the given mm.
+ */
+static int
+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+ unsigned long va;
+ bool is_vmalloc;
+ spinlock_t *ptl;
+ int i, nr_pages;
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return 0;
+
+ /*
+ * Any given ldt_struct should have map_ldt_struct() called at most
+ * once.
+ */
+ WARN_ON(ldt->slot != -1);
+
+ /* Check if the current mappings are sane */
+ sanity_check_ldt_mapping(mm);
+
+ is_vmalloc = is_vmalloc_addr(ldt->entries);
+
+ nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE);
+
+ for (i = 0; i < nr_pages; i++) {
+ unsigned long offset = i << PAGE_SHIFT;
+ const void *src = (char *)ldt->entries + offset;
+ unsigned long pfn;
+ pgprot_t pte_prot;
+ pte_t pte, *ptep;
+
+ va = (unsigned long)ldt_slot_va(slot) + offset;
+ pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+ page_to_pfn(virt_to_page(src));
+ /*
+ * Treat the PTI LDT range as a *userspace* range.
+ * get_locked_pte() will allocate all needed pagetables
+ * and account for them in this mm.
+ */
+ ptep = get_locked_pte(mm, va, &ptl);
+ if (!ptep)
+ return -ENOMEM;
+ /*
+ * Map it RO so the easy to find address is not a primary
+ * target via some kernel interface which misses a
+ * permission check.
+ */
+ pte_prot = __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL);
+ /* Filter out unsuppored __PAGE_KERNEL* bits: */
+ pgprot_val(pte_prot) &= __supported_pte_mask;
+ pte = pfn_pte(pfn, pte_prot);
+ set_pte_at(mm, va, ptep, pte);
+ pte_unmap_unlock(ptep, ptl);
+ }
+
+ /* Propagate LDT mapping to the user page-table */
+ map_ldt_struct_to_user(mm);
+
+ ldt->slot = slot;
+ return 0;
+}
+
+static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
+{
+ unsigned long va;
+ int i, nr_pages;
+
+ if (!ldt)
+ return;
+
+ /* LDT map/unmap is only required for PTI */
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE);
+
+ for (i = 0; i < nr_pages; i++) {
+ unsigned long offset = i << PAGE_SHIFT;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+ va = (unsigned long)ldt_slot_va(ldt->slot) + offset;
+ ptep = get_locked_pte(mm, va, &ptl);
+ pte_clear(mm, va, ptep);
+ pte_unmap_unlock(ptep, ptl);
+ }
+
+ va = (unsigned long)ldt_slot_va(ldt->slot);
+ flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, 0);
+}
+
+#else /* !CONFIG_PAGE_TABLE_ISOLATION */
+
+static int
+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+ return 0;
+}
+
+static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
+{
+}
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ struct mmu_gather tlb;
+ unsigned long start = LDT_BASE_ADDR;
+ unsigned long end = LDT_END_ADDR;
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ tlb_gather_mmu(&tlb, mm, start, end);
+ free_pgd_range(&tlb, start, end, start, end);
+ tlb_finish_mmu(&tlb, start, end);
+#endif
+}
+
+/* After calling this, the LDT is immutable. */
+static void finalize_ldt_struct(struct ldt_struct *ldt)
+{
+ paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
+}
+
+static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)
+{
+ mutex_lock(&mm->context.lock);
+
+ /* Synchronizes with READ_ONCE in load_mm_ldt. */
+ smp_store_release(&mm->context.ldt, ldt);
+
+ /* Activate the LDT for all CPUs using currents mm. */
+ on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true);
+
+ mutex_unlock(&mm->context.lock);
+}
+
+static void free_ldt_struct(struct ldt_struct *ldt)
+{
+ if (likely(!ldt))
+ return;
+
+ paravirt_free_ldt(ldt->entries, ldt->nr_entries);
+ if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree_atomic(ldt->entries);
+ else
+ free_page((unsigned long)ldt->entries);
+ kfree(ldt);
+}
+
+/*
+ * Called on fork from arch_dup_mmap(). Just copy the current LDT state,
+ * the new task is not running, so nothing can be installed.
+ */
+int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
+{
+ struct ldt_struct *new_ldt;
+ int retval = 0;
+
+ if (!old_mm)
+ return 0;
+
+ mutex_lock(&old_mm->context.lock);
+ if (!old_mm->context.ldt)
+ goto out_unlock;
+
+ new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
+ if (!new_ldt) {
+ retval = -ENOMEM;
+ goto out_unlock;
+ }
+
+ memcpy(new_ldt->entries, old_mm->context.ldt->entries,
+ new_ldt->nr_entries * LDT_ENTRY_SIZE);
+ finalize_ldt_struct(new_ldt);
+
+ retval = map_ldt_struct(mm, new_ldt, 0);
+ if (retval) {
+ free_ldt_pgtables(mm);
+ free_ldt_struct(new_ldt);
+ goto out_unlock;
+ }
+ mm->context.ldt = new_ldt;
+
+out_unlock:
+ mutex_unlock(&old_mm->context.lock);
+ return retval;
+}
+
+/*
+ * No need to lock the MM as we are the last user
+ *
+ * 64bit: Don't touch the LDT register - we're already in the next thread.
+ */
+void destroy_context_ldt(struct mm_struct *mm)
+{
+ free_ldt_struct(mm->context.ldt);
+ mm->context.ldt = NULL;
+}
+
+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+ free_ldt_pgtables(mm);
+}
+
+static int read_ldt(void __user *ptr, unsigned long bytecount)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long entries_size;
+ int retval;
+
+ down_read(&mm->context.ldt_usr_sem);
+
+ if (!mm->context.ldt) {
+ retval = 0;
+ goto out_unlock;
+ }
+
+ if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
+ bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
+
+ entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE;
+ if (entries_size > bytecount)
+ entries_size = bytecount;
+
+ if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) {
+ retval = -EFAULT;
+ goto out_unlock;
+ }
+
+ if (entries_size != bytecount) {
+ /* Zero-fill the rest and pretend we read bytecount bytes. */
+ if (clear_user(ptr + entries_size, bytecount - entries_size)) {
+ retval = -EFAULT;
+ goto out_unlock;
+ }
+ }
+ retval = bytecount;
+
+out_unlock:
+ up_read(&mm->context.ldt_usr_sem);
+ return retval;
+}
+
+static int read_default_ldt(void __user *ptr, unsigned long bytecount)
+{
+ /* CHECKME: Can we use _one_ random number ? */
+#ifdef CONFIG_X86_32
+ unsigned long size = 5 * sizeof(struct desc_struct);
+#else
+ unsigned long size = 128;
+#endif
+ if (bytecount > size)
+ bytecount = size;
+ if (clear_user(ptr, bytecount))
+ return -EFAULT;
+ return bytecount;
+}
+
+static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
+{
+ struct mm_struct *mm = current->mm;
+ struct ldt_struct *new_ldt, *old_ldt;
+ unsigned int old_nr_entries, new_nr_entries;
+ struct user_desc ldt_info;
+ struct desc_struct ldt;
+ int error;
+
+ error = -EINVAL;
+ if (bytecount != sizeof(ldt_info))
+ goto out;
+ error = -EFAULT;
+ if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
+ goto out;
+
+ error = -EINVAL;
+ if (ldt_info.entry_number >= LDT_ENTRIES)
+ goto out;
+ if (ldt_info.contents == 3) {
+ if (oldmode)
+ goto out;
+ if (ldt_info.seg_not_present == 0)
+ goto out;
+ }
+
+ if ((oldmode && !ldt_info.base_addr && !ldt_info.limit) ||
+ LDT_empty(&ldt_info)) {
+ /* The user wants to clear the entry. */
+ memset(&ldt, 0, sizeof(ldt));
+ } else {
+ if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
+ error = -EINVAL;
+ goto out;
+ }
+
+ fill_ldt(&ldt, &ldt_info);
+ if (oldmode)
+ ldt.avl = 0;
+ }
+
+ if (down_write_killable(&mm->context.ldt_usr_sem))
+ return -EINTR;
+
+ old_ldt = mm->context.ldt;
+ old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
+ new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries);
+
+ error = -ENOMEM;
+ new_ldt = alloc_ldt_struct(new_nr_entries);
+ if (!new_ldt)
+ goto out_unlock;
+
+ if (old_ldt)
+ memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE);
+
+ new_ldt->entries[ldt_info.entry_number] = ldt;
+ finalize_ldt_struct(new_ldt);
+
+ /*
+ * If we are using PTI, map the new LDT into the userspace pagetables.
+ * If there is already an LDT, use the other slot so that other CPUs
+ * will continue to use the old LDT until install_ldt() switches
+ * them over to the new LDT.
+ */
+ error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+ if (error) {
+ /*
+ * This only can fail for the first LDT setup. If an LDT is
+ * already installed then the PTE page is already
+ * populated. Mop up a half populated page table.
+ */
+ if (!WARN_ON_ONCE(old_ldt))
+ free_ldt_pgtables(mm);
+ free_ldt_struct(new_ldt);
+ goto out_unlock;
+ }
+
+ install_ldt(mm, new_ldt);
+ unmap_ldt_struct(mm, old_ldt);
+ free_ldt_struct(old_ldt);
+ error = 0;
+
+out_unlock:
+ up_write(&mm->context.ldt_usr_sem);
+out:
+ return error;
+}
+
+SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
+ unsigned long , bytecount)
+{
+ int ret = -ENOSYS;
+
+ switch (func) {
+ case 0:
+ ret = read_ldt(ptr, bytecount);
+ break;
+ case 1:
+ ret = write_ldt(ptr, bytecount, 1);
+ break;
+ case 2:
+ ret = read_default_ldt(ptr, bytecount);
+ break;
+ case 0x11:
+ ret = write_ldt(ptr, bytecount, 0);
+ break;
+ }
+ /*
+ * The SYSCALL_DEFINE() macros give us an 'unsigned long'
+ * return type, but tht ABI for sys_modify_ldt() expects
+ * 'int'. This cast gives us an int-sized value in %rax
+ * for the return code. The 'unsigned' is necessary so
+ * the compiler does not try to sign-extend the negative
+ * return codes into the high half of the register when
+ * taking the value from int->long.
+ */
+ return (unsigned int)ret;
+}
diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c
new file mode 100644
index 000000000..e9d252d87
--- /dev/null
+++ b/arch/x86/kernel/livepatch.c
@@ -0,0 +1,65 @@
+/*
+ * livepatch.c - x86-specific Kernel Live Patching Core
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/livepatch.h>
+#include <asm/text-patching.h>
+
+/* Apply per-object alternatives. Based on x86 module_finalize() */
+void arch_klp_init_object_loaded(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ int cnt;
+ struct klp_modinfo *info;
+ Elf_Shdr *s, *alt = NULL, *para = NULL;
+ void *aseg, *pseg;
+ const char *objname;
+ char sec_objname[MODULE_NAME_LEN];
+ char secname[KSYM_NAME_LEN];
+
+ info = patch->mod->klp_info;
+ objname = obj->name ? obj->name : "vmlinux";
+
+ /* See livepatch core code for BUILD_BUG_ON() explanation */
+ BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128);
+
+ for (s = info->sechdrs; s < info->sechdrs + info->hdr.e_shnum; s++) {
+ /* Apply per-object .klp.arch sections */
+ cnt = sscanf(info->secstrings + s->sh_name,
+ ".klp.arch.%55[^.].%127s",
+ sec_objname, secname);
+ if (cnt != 2)
+ continue;
+ if (strcmp(sec_objname, objname))
+ continue;
+ if (!strcmp(".altinstructions", secname))
+ alt = s;
+ if (!strcmp(".parainstructions", secname))
+ para = s;
+ }
+
+ if (alt) {
+ aseg = (void *) alt->sh_addr;
+ apply_alternatives(aseg, aseg + alt->sh_size);
+ }
+
+ if (para) {
+ pseg = (void *) para->sh_addr;
+ apply_paravirt(pseg, pseg + para->sh_size);
+ }
+}
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
new file mode 100644
index 000000000..5409c2800
--- /dev/null
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -0,0 +1,266 @@
+/*
+ * handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/numa.h>
+#include <linux/ftrace.h>
+#include <linux/suspend.h>
+#include <linux/gfp.h>
+#include <linux/io.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/cpufeature.h>
+#include <asm/desc.h>
+#include <asm/set_memory.h>
+#include <asm/debugreg.h>
+
+static void set_gdt(void *newgdt, __u16 limit)
+{
+ struct desc_ptr curgdt;
+
+ /* ia32 supports unaligned loads & stores */
+ curgdt.size = limit;
+ curgdt.address = (unsigned long)newgdt;
+
+ load_gdt(&curgdt);
+}
+
+static void load_segments(void)
+{
+#define __STR(X) #X
+#define STR(X) __STR(X)
+
+ __asm__ __volatile__ (
+ "\tljmp $"STR(__KERNEL_CS)",$1f\n"
+ "\t1:\n"
+ "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
+ "\tmovl %%eax,%%ds\n"
+ "\tmovl %%eax,%%es\n"
+ "\tmovl %%eax,%%ss\n"
+ : : : "eax", "memory");
+#undef STR
+#undef __STR
+}
+
+static void machine_kexec_free_page_tables(struct kimage *image)
+{
+ free_pages((unsigned long)image->arch.pgd, PGD_ALLOCATION_ORDER);
+ image->arch.pgd = NULL;
+#ifdef CONFIG_X86_PAE
+ free_page((unsigned long)image->arch.pmd0);
+ image->arch.pmd0 = NULL;
+ free_page((unsigned long)image->arch.pmd1);
+ image->arch.pmd1 = NULL;
+#endif
+ free_page((unsigned long)image->arch.pte0);
+ image->arch.pte0 = NULL;
+ free_page((unsigned long)image->arch.pte1);
+ image->arch.pte1 = NULL;
+}
+
+static int machine_kexec_alloc_page_tables(struct kimage *image)
+{
+ image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ PGD_ALLOCATION_ORDER);
+#ifdef CONFIG_X86_PAE
+ image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+ image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+#endif
+ image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL);
+ image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL);
+ if (!image->arch.pgd ||
+#ifdef CONFIG_X86_PAE
+ !image->arch.pmd0 || !image->arch.pmd1 ||
+#endif
+ !image->arch.pte0 || !image->arch.pte1) {
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static void machine_kexec_page_table_set_one(
+ pgd_t *pgd, pmd_t *pmd, pte_t *pte,
+ unsigned long vaddr, unsigned long paddr)
+{
+ p4d_t *p4d;
+ pud_t *pud;
+
+ pgd += pgd_index(vaddr);
+#ifdef CONFIG_X86_PAE
+ if (!(pgd_val(*pgd) & _PAGE_PRESENT))
+ set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT));
+#endif
+ p4d = p4d_offset(pgd, vaddr);
+ pud = pud_offset(p4d, vaddr);
+ pmd = pmd_offset(pud, vaddr);
+ if (!(pmd_val(*pmd) & _PAGE_PRESENT))
+ set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+ pte = pte_offset_kernel(pmd, vaddr);
+ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
+}
+
+static void machine_kexec_prepare_page_tables(struct kimage *image)
+{
+ void *control_page;
+ pmd_t *pmd = NULL;
+
+ control_page = page_address(image->control_code_page);
+#ifdef CONFIG_X86_PAE
+ pmd = image->arch.pmd0;
+#endif
+ machine_kexec_page_table_set_one(
+ image->arch.pgd, pmd, image->arch.pte0,
+ (unsigned long)control_page, __pa(control_page));
+#ifdef CONFIG_X86_PAE
+ pmd = image->arch.pmd1;
+#endif
+ machine_kexec_page_table_set_one(
+ image->arch.pgd, pmd, image->arch.pte1,
+ __pa(control_page), __pa(control_page));
+}
+
+/*
+ * A architecture hook called to validate the
+ * proposed image and prepare the control pages
+ * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE
+ * have been allocated, but the segments have yet
+ * been copied into the kernel.
+ *
+ * Do what every setup is needed on image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ *
+ * - Make control page executable.
+ * - Allocate page tables
+ * - Setup page tables
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+ int error;
+
+ set_pages_x(image->control_code_page, 1);
+ error = machine_kexec_alloc_page_tables(image);
+ if (error)
+ return error;
+ machine_kexec_prepare_page_tables(image);
+ return 0;
+}
+
+/*
+ * Undo anything leftover by machine_kexec_prepare
+ * when an image is freed.
+ */
+void machine_kexec_cleanup(struct kimage *image)
+{
+ set_pages_nx(image->control_code_page, 1);
+ machine_kexec_free_page_tables(image);
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+void machine_kexec(struct kimage *image)
+{
+ unsigned long page_list[PAGES_NR];
+ void *control_page;
+ int save_ftrace_enabled;
+ asmlinkage unsigned long
+ (*relocate_kernel_ptr)(unsigned long indirection_page,
+ unsigned long control_page,
+ unsigned long start_address,
+ unsigned int has_pae,
+ unsigned int preserve_context);
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (image->preserve_context)
+ save_processor_state();
+#endif
+
+ save_ftrace_enabled = __ftrace_enabled_save();
+
+ /* Interrupts aren't acceptable while we reboot */
+ local_irq_disable();
+ hw_breakpoint_disable();
+
+ if (image->preserve_context) {
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * We need to put APICs in legacy mode so that we can
+ * get timer interrupts in second kernel. kexec/kdump
+ * paths already have calls to restore_boot_irq_mode()
+ * in one form or other. kexec jump path also need one.
+ */
+ clear_IO_APIC();
+ restore_boot_irq_mode();
+#endif
+ }
+
+ control_page = page_address(image->control_code_page);
+ memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
+
+ relocate_kernel_ptr = control_page;
+ page_list[PA_CONTROL_PAGE] = __pa(control_page);
+ page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
+ page_list[PA_PGD] = __pa(image->arch.pgd);
+
+ if (image->type == KEXEC_TYPE_DEFAULT)
+ page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
+ << PAGE_SHIFT);
+
+ /*
+ * The segment registers are funny things, they have both a
+ * visible and an invisible part. Whenever the visible part is
+ * set to a specific selector, the invisible part is loaded
+ * with from a table in memory. At no other time is the
+ * descriptor table in memory accessed.
+ *
+ * I take advantage of this here by force loading the
+ * segments, before I zap the gdt with an invalid value.
+ */
+ load_segments();
+ /*
+ * The gdt & idt are now invalid.
+ * If you want to load them you must set up your own idt & gdt.
+ */
+ idt_invalidate(phys_to_virt(0));
+ set_gdt(phys_to_virt(0), 0);
+
+ /* now call it */
+ image->start = relocate_kernel_ptr((unsigned long)image->head,
+ (unsigned long)page_list,
+ image->start,
+ boot_cpu_has(X86_FEATURE_PAE),
+ image->preserve_context);
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (image->preserve_context)
+ restore_processor_state();
+#endif
+
+ __ftrace_enabled_restore(save_ftrace_enabled);
+}
+
+void arch_crash_save_vmcoreinfo(void)
+{
+#ifdef CONFIG_NUMA
+ VMCOREINFO_SYMBOL(node_data);
+ VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
+#endif
+#ifdef CONFIG_X86_PAE
+ VMCOREINFO_CONFIG(X86_PAE);
+#endif
+}
+
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
new file mode 100644
index 000000000..4c8acdfdc
--- /dev/null
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -0,0 +1,576 @@
+/*
+ * handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#define pr_fmt(fmt) "kexec: " fmt
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/reboot.h>
+#include <linux/numa.h>
+#include <linux/ftrace.h>
+#include <linux/io.h>
+#include <linux/suspend.h>
+#include <linux/vmalloc.h>
+
+#include <asm/init.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/io_apic.h>
+#include <asm/debugreg.h>
+#include <asm/kexec-bzimage64.h>
+#include <asm/setup.h>
+#include <asm/set_memory.h>
+
+#ifdef CONFIG_KEXEC_FILE
+const struct kexec_file_ops * const kexec_file_loaders[] = {
+ &kexec_bzImage64_ops,
+ NULL
+};
+#endif
+
+static void free_transition_pgtable(struct kimage *image)
+{
+ free_page((unsigned long)image->arch.p4d);
+ image->arch.p4d = NULL;
+ free_page((unsigned long)image->arch.pud);
+ image->arch.pud = NULL;
+ free_page((unsigned long)image->arch.pmd);
+ image->arch.pmd = NULL;
+ free_page((unsigned long)image->arch.pte);
+ image->arch.pte = NULL;
+}
+
+static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
+{
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long vaddr, paddr;
+ int result = -ENOMEM;
+
+ vaddr = (unsigned long)relocate_kernel;
+ paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
+ pgd += pgd_index(vaddr);
+ if (!pgd_present(*pgd)) {
+ p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
+ if (!p4d)
+ goto err;
+ image->arch.p4d = p4d;
+ set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+ }
+ p4d = p4d_offset(pgd, vaddr);
+ if (!p4d_present(*p4d)) {
+ pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
+ if (!pud)
+ goto err;
+ image->arch.pud = pud;
+ set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+ }
+ pud = pud_offset(p4d, vaddr);
+ if (!pud_present(*pud)) {
+ pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+ if (!pmd)
+ goto err;
+ image->arch.pmd = pmd;
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (!pmd_present(*pmd)) {
+ pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
+ if (!pte)
+ goto err;
+ image->arch.pte = pte;
+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
+ }
+ pte = pte_offset_kernel(pmd, vaddr);
+ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
+ return 0;
+err:
+ return result;
+}
+
+static void *alloc_pgt_page(void *data)
+{
+ struct kimage *image = (struct kimage *)data;
+ struct page *page;
+ void *p = NULL;
+
+ page = kimage_alloc_control_pages(image, 0);
+ if (page) {
+ p = page_address(page);
+ clear_page(p);
+ }
+
+ return p;
+}
+
+static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+{
+ struct x86_mapping_info info = {
+ .alloc_pgt_page = alloc_pgt_page,
+ .context = image,
+ .page_flag = __PAGE_KERNEL_LARGE_EXEC,
+ .kernpg_flag = _KERNPG_TABLE_NOENC,
+ };
+ unsigned long mstart, mend;
+ pgd_t *level4p;
+ int result;
+ int i;
+
+ level4p = (pgd_t *)__va(start_pgtable);
+ clear_page(level4p);
+
+ if (direct_gbpages)
+ info.direct_gbpages = true;
+
+ for (i = 0; i < nr_pfn_mapped; i++) {
+ mstart = pfn_mapped[i].start << PAGE_SHIFT;
+ mend = pfn_mapped[i].end << PAGE_SHIFT;
+
+ result = kernel_ident_mapping_init(&info,
+ level4p, mstart, mend);
+ if (result)
+ return result;
+ }
+
+ /*
+ * segments's mem ranges could be outside 0 ~ max_pfn,
+ * for example when jump back to original kernel from kexeced kernel.
+ * or first kernel is booted with user mem map, and second kernel
+ * could be loaded out of that range.
+ */
+ for (i = 0; i < image->nr_segments; i++) {
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+
+ result = kernel_ident_mapping_init(&info,
+ level4p, mstart, mend);
+
+ if (result)
+ return result;
+ }
+
+ return init_transition_pgtable(image, level4p);
+}
+
+static void set_idt(void *newidt, u16 limit)
+{
+ struct desc_ptr curidt;
+
+ /* x86-64 supports unaliged loads & stores */
+ curidt.size = limit;
+ curidt.address = (unsigned long)newidt;
+
+ __asm__ __volatile__ (
+ "lidtq %0\n"
+ : : "m" (curidt)
+ );
+};
+
+
+static void set_gdt(void *newgdt, u16 limit)
+{
+ struct desc_ptr curgdt;
+
+ /* x86-64 supports unaligned loads & stores */
+ curgdt.size = limit;
+ curgdt.address = (unsigned long)newgdt;
+
+ __asm__ __volatile__ (
+ "lgdtq %0\n"
+ : : "m" (curgdt)
+ );
+};
+
+static void load_segments(void)
+{
+ __asm__ __volatile__ (
+ "\tmovl %0,%%ds\n"
+ "\tmovl %0,%%es\n"
+ "\tmovl %0,%%ss\n"
+ "\tmovl %0,%%fs\n"
+ "\tmovl %0,%%gs\n"
+ : : "a" (__KERNEL_DS) : "memory"
+ );
+}
+
+#ifdef CONFIG_KEXEC_FILE
+/* Update purgatory as needed after various image segments have been prepared */
+static int arch_update_purgatory(struct kimage *image)
+{
+ int ret = 0;
+
+ if (!image->file_mode)
+ return 0;
+
+ /* Setup copying of backup region */
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = kexec_purgatory_get_set_symbol(image,
+ "purgatory_backup_dest",
+ &image->arch.backup_load_addr,
+ sizeof(image->arch.backup_load_addr), 0);
+ if (ret)
+ return ret;
+
+ ret = kexec_purgatory_get_set_symbol(image,
+ "purgatory_backup_src",
+ &image->arch.backup_src_start,
+ sizeof(image->arch.backup_src_start), 0);
+ if (ret)
+ return ret;
+
+ ret = kexec_purgatory_get_set_symbol(image,
+ "purgatory_backup_sz",
+ &image->arch.backup_src_sz,
+ sizeof(image->arch.backup_src_sz), 0);
+ if (ret)
+ return ret;
+ }
+
+ return ret;
+}
+#else /* !CONFIG_KEXEC_FILE */
+static inline int arch_update_purgatory(struct kimage *image)
+{
+ return 0;
+}
+#endif /* CONFIG_KEXEC_FILE */
+
+int machine_kexec_prepare(struct kimage *image)
+{
+ unsigned long start_pgtable;
+ int result;
+
+ /* Calculate the offsets */
+ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+
+ /* Setup the identity mapped 64bit page table */
+ result = init_pgtable(image, start_pgtable);
+ if (result)
+ return result;
+
+ /* update purgatory as needed */
+ result = arch_update_purgatory(image);
+ if (result)
+ return result;
+
+ return 0;
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+ free_transition_pgtable(image);
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+void machine_kexec(struct kimage *image)
+{
+ unsigned long page_list[PAGES_NR];
+ void *control_page;
+ int save_ftrace_enabled;
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (image->preserve_context)
+ save_processor_state();
+#endif
+
+ save_ftrace_enabled = __ftrace_enabled_save();
+
+ /* Interrupts aren't acceptable while we reboot */
+ local_irq_disable();
+ hw_breakpoint_disable();
+
+ if (image->preserve_context) {
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * We need to put APICs in legacy mode so that we can
+ * get timer interrupts in second kernel. kexec/kdump
+ * paths already have calls to restore_boot_irq_mode()
+ * in one form or other. kexec jump path also need one.
+ */
+ clear_IO_APIC();
+ restore_boot_irq_mode();
+#endif
+ }
+
+ control_page = page_address(image->control_code_page) + PAGE_SIZE;
+ memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
+
+ page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
+ page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
+ page_list[PA_TABLE_PAGE] =
+ (unsigned long)__pa(page_address(image->control_code_page));
+
+ if (image->type == KEXEC_TYPE_DEFAULT)
+ page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
+ << PAGE_SHIFT);
+
+ /*
+ * The segment registers are funny things, they have both a
+ * visible and an invisible part. Whenever the visible part is
+ * set to a specific selector, the invisible part is loaded
+ * with from a table in memory. At no other time is the
+ * descriptor table in memory accessed.
+ *
+ * I take advantage of this here by force loading the
+ * segments, before I zap the gdt with an invalid value.
+ */
+ load_segments();
+ /*
+ * The gdt & idt are now invalid.
+ * If you want to load them you must set up your own idt & gdt.
+ */
+ set_gdt(phys_to_virt(0), 0);
+ set_idt(phys_to_virt(0), 0);
+
+ /* now call it */
+ image->start = relocate_kernel((unsigned long)image->head,
+ (unsigned long)page_list,
+ image->start,
+ image->preserve_context,
+ sme_active());
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (image->preserve_context)
+ restore_processor_state();
+#endif
+
+ __ftrace_enabled_restore(save_ftrace_enabled);
+}
+
+void arch_crash_save_vmcoreinfo(void)
+{
+ VMCOREINFO_NUMBER(phys_base);
+ VMCOREINFO_SYMBOL(init_top_pgt);
+ vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
+ pgtable_l5_enabled());
+
+#ifdef CONFIG_NUMA
+ VMCOREINFO_SYMBOL(node_data);
+ VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
+#endif
+ vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
+ kaslr_offset());
+ VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
+}
+
+/* arch-dependent functionality related to kexec file-based syscall */
+
+#ifdef CONFIG_KEXEC_FILE
+void *arch_kexec_kernel_image_load(struct kimage *image)
+{
+ vfree(image->arch.elf_headers);
+ image->arch.elf_headers = NULL;
+
+ if (!image->fops || !image->fops->load)
+ return ERR_PTR(-ENOEXEC);
+
+ return image->fops->load(image, image->kernel_buf,
+ image->kernel_buf_len, image->initrd_buf,
+ image->initrd_buf_len, image->cmdline_buf,
+ image->cmdline_buf_len);
+}
+
+/*
+ * Apply purgatory relocations.
+ *
+ * @pi: Purgatory to be relocated.
+ * @section: Section relocations applying to.
+ * @relsec: Section containing RELAs.
+ * @symtabsec: Corresponding symtab.
+ *
+ * TODO: Some of the code belongs to generic code. Move that in kexec.c.
+ */
+int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
+ Elf_Shdr *section, const Elf_Shdr *relsec,
+ const Elf_Shdr *symtabsec)
+{
+ unsigned int i;
+ Elf64_Rela *rel;
+ Elf64_Sym *sym;
+ void *location;
+ unsigned long address, sec_base, value;
+ const char *strtab, *name, *shstrtab;
+ const Elf_Shdr *sechdrs;
+
+ /* String & section header string table */
+ sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
+ strtab = (char *)pi->ehdr + sechdrs[symtabsec->sh_link].sh_offset;
+ shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset;
+
+ rel = (void *)pi->ehdr + relsec->sh_offset;
+
+ pr_debug("Applying relocate section %s to %u\n",
+ shstrtab + relsec->sh_name, relsec->sh_info);
+
+ for (i = 0; i < relsec->sh_size / sizeof(*rel); i++) {
+
+ /*
+ * rel[i].r_offset contains byte offset from beginning
+ * of section to the storage unit affected.
+ *
+ * This is location to update. This is temporary buffer
+ * where section is currently loaded. This will finally be
+ * loaded to a different address later, pointed to by
+ * ->sh_addr. kexec takes care of moving it
+ * (kexec_load_segment()).
+ */
+ location = pi->purgatory_buf;
+ location += section->sh_offset;
+ location += rel[i].r_offset;
+
+ /* Final address of the location */
+ address = section->sh_addr + rel[i].r_offset;
+
+ /*
+ * rel[i].r_info contains information about symbol table index
+ * w.r.t which relocation must be made and type of relocation
+ * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get
+ * these respectively.
+ */
+ sym = (void *)pi->ehdr + symtabsec->sh_offset;
+ sym += ELF64_R_SYM(rel[i].r_info);
+
+ if (sym->st_name)
+ name = strtab + sym->st_name;
+ else
+ name = shstrtab + sechdrs[sym->st_shndx].sh_name;
+
+ pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n",
+ name, sym->st_info, sym->st_shndx, sym->st_value,
+ sym->st_size);
+
+ if (sym->st_shndx == SHN_UNDEF) {
+ pr_err("Undefined symbol: %s\n", name);
+ return -ENOEXEC;
+ }
+
+ if (sym->st_shndx == SHN_COMMON) {
+ pr_err("symbol '%s' in common section\n", name);
+ return -ENOEXEC;
+ }
+
+ if (sym->st_shndx == SHN_ABS)
+ sec_base = 0;
+ else if (sym->st_shndx >= pi->ehdr->e_shnum) {
+ pr_err("Invalid section %d for symbol %s\n",
+ sym->st_shndx, name);
+ return -ENOEXEC;
+ } else
+ sec_base = pi->sechdrs[sym->st_shndx].sh_addr;
+
+ value = sym->st_value;
+ value += sec_base;
+ value += rel[i].r_addend;
+
+ switch (ELF64_R_TYPE(rel[i].r_info)) {
+ case R_X86_64_NONE:
+ break;
+ case R_X86_64_64:
+ *(u64 *)location = value;
+ break;
+ case R_X86_64_32:
+ *(u32 *)location = value;
+ if (value != *(u32 *)location)
+ goto overflow;
+ break;
+ case R_X86_64_32S:
+ *(s32 *)location = value;
+ if ((s64)value != *(s32 *)location)
+ goto overflow;
+ break;
+ case R_X86_64_PC32:
+ case R_X86_64_PLT32:
+ value -= (u64)address;
+ *(u32 *)location = value;
+ break;
+ default:
+ pr_err("Unknown rela relocation: %llu\n",
+ ELF64_R_TYPE(rel[i].r_info));
+ return -ENOEXEC;
+ }
+ }
+ return 0;
+
+overflow:
+ pr_err("Overflow in relocation type %d value 0x%lx\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), value);
+ return -ENOEXEC;
+}
+#endif /* CONFIG_KEXEC_FILE */
+
+static int
+kexec_mark_range(unsigned long start, unsigned long end, bool protect)
+{
+ struct page *page;
+ unsigned int nr_pages;
+
+ /*
+ * For physical range: [start, end]. We must skip the unassigned
+ * crashk resource with zero-valued "end" member.
+ */
+ if (!end || start > end)
+ return 0;
+
+ page = pfn_to_page(start >> PAGE_SHIFT);
+ nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
+ if (protect)
+ return set_pages_ro(page, nr_pages);
+ else
+ return set_pages_rw(page, nr_pages);
+}
+
+static void kexec_mark_crashkres(bool protect)
+{
+ unsigned long control;
+
+ kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);
+
+ /* Don't touch the control code page used in crash_kexec().*/
+ control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
+ /* Control code page is located in the 2nd page. */
+ kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect);
+ control += KEXEC_CONTROL_PAGE_SIZE;
+ kexec_mark_range(control, crashk_res.end, protect);
+}
+
+void arch_kexec_protect_crashkres(void)
+{
+ kexec_mark_crashkres(true);
+}
+
+void arch_kexec_unprotect_crashkres(void)
+{
+ kexec_mark_crashkres(false);
+}
+
+int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
+{
+ /*
+ * If SME is active we need to be sure that kexec pages are
+ * not encrypted because when we boot to the new kernel the
+ * pages won't be accessed encrypted (initially).
+ */
+ return set_memory_decrypted((unsigned long)vaddr, pages);
+}
+
+void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
+{
+ /*
+ * If SME is active we need to reset the pages back to being
+ * an encrypted mapping before freeing them.
+ */
+ set_memory_encrypted((unsigned long)vaddr, pages);
+}
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
new file mode 100644
index 000000000..b5cb49e57
--- /dev/null
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AMD Family 10h mmconfig enablement
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/dmi.h>
+#include <linux/range.h>
+
+#include <asm/pci-direct.h>
+#include <linux/sort.h>
+#include <asm/io.h>
+#include <asm/msr.h>
+#include <asm/acpi.h>
+#include <asm/mmconfig.h>
+#include <asm/pci_x86.h>
+
+struct pci_hostbridge_probe {
+ u32 bus;
+ u32 slot;
+ u32 vendor;
+ u32 device;
+};
+
+static u64 fam10h_pci_mmconf_base;
+
+static struct pci_hostbridge_probe pci_probes[] = {
+ { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 },
+ { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 },
+};
+
+static int cmp_range(const void *x1, const void *x2)
+{
+ const struct range *r1 = x1;
+ const struct range *r2 = x2;
+ int start1, start2;
+
+ start1 = r1->start >> 32;
+ start2 = r2->start >> 32;
+
+ return start1 - start2;
+}
+
+#define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT)
+#define MMCONF_MASK (~(MMCONF_UNIT - 1))
+#define MMCONF_SIZE (MMCONF_UNIT << 8)
+/* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */
+#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32)
+#define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40))
+static void get_fam10h_pci_mmconf_base(void)
+{
+ int i;
+ unsigned bus;
+ unsigned slot;
+ int found;
+
+ u64 val;
+ u32 address;
+ u64 tom2;
+ u64 base = FAM10H_PCI_MMCONF_BASE;
+
+ int hi_mmio_num;
+ struct range range[8];
+
+ /* only try to get setting from BSP */
+ if (fam10h_pci_mmconf_base)
+ return;
+
+ if (!early_pci_allowed())
+ return;
+
+ found = 0;
+ for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
+ u32 id;
+ u16 device;
+ u16 vendor;
+
+ bus = pci_probes[i].bus;
+ slot = pci_probes[i].slot;
+ id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID);
+
+ vendor = id & 0xffff;
+ device = (id>>16) & 0xffff;
+ if (pci_probes[i].vendor == vendor &&
+ pci_probes[i].device == device) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found)
+ return;
+
+ /* SYS_CFG */
+ address = MSR_K8_SYSCFG;
+ rdmsrl(address, val);
+
+ /* TOP_MEM2 is not enabled? */
+ if (!(val & (1<<21))) {
+ tom2 = 1ULL << 32;
+ } else {
+ /* TOP_MEM2 */
+ address = MSR_K8_TOP_MEM2;
+ rdmsrl(address, val);
+ tom2 = max(val & 0xffffff800000ULL, 1ULL << 32);
+ }
+
+ if (base <= tom2)
+ base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK;
+
+ /*
+ * need to check if the range is in the high mmio range that is
+ * above 4G
+ */
+ hi_mmio_num = 0;
+ for (i = 0; i < 8; i++) {
+ u32 reg;
+ u64 start;
+ u64 end;
+ reg = read_pci_config(bus, slot, 1, 0x80 + (i << 3));
+ if (!(reg & 3))
+ continue;
+
+ start = (u64)(reg & 0xffffff00) << 8; /* 39:16 on 31:8*/
+ reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3));
+ end = ((u64)(reg & 0xffffff00) << 8) | 0xffff; /* 39:16 on 31:8*/
+
+ if (end < tom2)
+ continue;
+
+ range[hi_mmio_num].start = start;
+ range[hi_mmio_num].end = end;
+ hi_mmio_num++;
+ }
+
+ if (!hi_mmio_num)
+ goto out;
+
+ /* sort the range */
+ sort(range, hi_mmio_num, sizeof(struct range), cmp_range, NULL);
+
+ if (range[hi_mmio_num - 1].end < base)
+ goto out;
+ if (range[0].start > base + MMCONF_SIZE)
+ goto out;
+
+ /* need to find one window */
+ base = (range[0].start & MMCONF_MASK) - MMCONF_UNIT;
+ if ((base > tom2) && BASE_VALID(base))
+ goto out;
+ base = (range[hi_mmio_num - 1].end + MMCONF_UNIT) & MMCONF_MASK;
+ if (BASE_VALID(base))
+ goto out;
+ /* need to find window between ranges */
+ for (i = 1; i < hi_mmio_num; i++) {
+ base = (range[i - 1].end + MMCONF_UNIT) & MMCONF_MASK;
+ val = range[i].start & MMCONF_MASK;
+ if (val >= base + MMCONF_SIZE && BASE_VALID(base))
+ goto out;
+ }
+ return;
+
+out:
+ fam10h_pci_mmconf_base = base;
+}
+
+void fam10h_check_enable_mmcfg(void)
+{
+ u64 val;
+ u32 address;
+
+ if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF))
+ return;
+
+ address = MSR_FAM10H_MMIO_CONF_BASE;
+ rdmsrl(address, val);
+
+ /* try to make sure that AP's setting is identical to BSP setting */
+ if (val & FAM10H_MMIO_CONF_ENABLE) {
+ unsigned busnbits;
+ busnbits = (val >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
+ FAM10H_MMIO_CONF_BUSRANGE_MASK;
+
+ /* only trust the one handle 256 buses, if acpi=off */
+ if (!acpi_pci_disabled || busnbits >= 8) {
+ u64 base = val & MMCONF_MASK;
+
+ if (!fam10h_pci_mmconf_base) {
+ fam10h_pci_mmconf_base = base;
+ return;
+ } else if (fam10h_pci_mmconf_base == base)
+ return;
+ }
+ }
+
+ /*
+ * if it is not enabled, try to enable it and assume only one segment
+ * with 256 buses
+ */
+ get_fam10h_pci_mmconf_base();
+ if (!fam10h_pci_mmconf_base) {
+ pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF;
+ return;
+ }
+
+ printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n");
+ val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) |
+ (FAM10H_MMIO_CONF_BUSRANGE_MASK<<FAM10H_MMIO_CONF_BUSRANGE_SHIFT));
+ val |= fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
+ FAM10H_MMIO_CONF_ENABLE;
+ wrmsrl(address, val);
+}
+
+static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d)
+{
+ pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF;
+ return 0;
+}
+
+static const struct dmi_system_id __initconst mmconf_dmi_table[] = {
+ {
+ .callback = set_check_enable_amd_mmconf,
+ .ident = "Sun Microsystems Machine",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Sun Microsystems"),
+ },
+ },
+ {}
+};
+
+/* Called from a non __init function, but only on the BSP. */
+void __ref check_enable_amd_mmconf_dmi(void)
+{
+ dmi_check_system(mmconf_dmi_table);
+}
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
new file mode 100644
index 000000000..9f0be2c7e
--- /dev/null
+++ b/arch/x86/kernel/module.c
@@ -0,0 +1,281 @@
+/* Kernel module help for x86.
+ Copyright (C) 2001 Rusty Russell.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/moduleloader.h>
+#include <linux/elf.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/kasan.h>
+#include <linux/bug.h>
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/jump_label.h>
+#include <linux/random.h>
+
+#include <asm/text-patching.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/setup.h>
+#include <asm/unwind.h>
+
+#if 0
+#define DEBUGP(fmt, ...) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__)
+#else
+#define DEBUGP(fmt, ...) \
+do { \
+ if (0) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+} while (0)
+#endif
+
+#ifdef CONFIG_RANDOMIZE_BASE
+static unsigned long module_load_offset;
+
+/* Mutex protects the module_load_offset. */
+static DEFINE_MUTEX(module_kaslr_mutex);
+
+static unsigned long int get_module_load_offset(void)
+{
+ if (kaslr_enabled()) {
+ mutex_lock(&module_kaslr_mutex);
+ /*
+ * Calculate the module_load_offset the first time this
+ * code is called. Once calculated it stays the same until
+ * reboot.
+ */
+ if (module_load_offset == 0)
+ module_load_offset =
+ (get_random_int() % 1024 + 1) * PAGE_SIZE;
+ mutex_unlock(&module_kaslr_mutex);
+ }
+ return module_load_offset;
+}
+#else
+static unsigned long int get_module_load_offset(void)
+{
+ return 0;
+}
+#endif
+
+void *module_alloc(unsigned long size)
+{
+ void *p;
+
+ if (PAGE_ALIGN(size) > MODULES_LEN)
+ return NULL;
+
+ p = __vmalloc_node_range(size, MODULE_ALIGN,
+ MODULES_VADDR + get_module_load_offset(),
+ MODULES_END, GFP_KERNEL,
+ PAGE_KERNEL, 0, NUMA_NO_NODE,
+ __builtin_return_address(0));
+ if (p && (kasan_module_alloc(p, size) < 0)) {
+ vfree(p);
+ return NULL;
+ }
+
+ return p;
+}
+
+#ifdef CONFIG_X86_32
+int apply_relocate(Elf32_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+ unsigned int i;
+ Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr;
+ Elf32_Sym *sym;
+ uint32_t *location;
+
+ DEBUGP("Applying relocate section %u to %u\n",
+ relsec, sechdrs[relsec].sh_info);
+ for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
+ /* This is where to make the change */
+ location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+ + rel[i].r_offset;
+ /* This is the symbol it is referring to. Note that all
+ undefined symbols have been resolved. */
+ sym = (Elf32_Sym *)sechdrs[symindex].sh_addr
+ + ELF32_R_SYM(rel[i].r_info);
+
+ switch (ELF32_R_TYPE(rel[i].r_info)) {
+ case R_386_32:
+ /* We add the value into the location given */
+ *location += sym->st_value;
+ break;
+ case R_386_PC32:
+ case R_386_PLT32:
+ /* Add the value, subtract its position */
+ *location += sym->st_value - (uint32_t)location;
+ break;
+ default:
+ pr_err("%s: Unknown relocation: %u\n",
+ me->name, ELF32_R_TYPE(rel[i].r_info));
+ return -ENOEXEC;
+ }
+ }
+ return 0;
+}
+#else /*X86_64*/
+int apply_relocate_add(Elf64_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+ unsigned int i;
+ Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
+ Elf64_Sym *sym;
+ void *loc;
+ u64 val;
+
+ DEBUGP("Applying relocate section %u to %u\n",
+ relsec, sechdrs[relsec].sh_info);
+ for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
+ /* This is where to make the change */
+ loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+ + rel[i].r_offset;
+
+ /* This is the symbol it is referring to. Note that all
+ undefined symbols have been resolved. */
+ sym = (Elf64_Sym *)sechdrs[symindex].sh_addr
+ + ELF64_R_SYM(rel[i].r_info);
+
+ DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
+ (int)ELF64_R_TYPE(rel[i].r_info),
+ sym->st_value, rel[i].r_addend, (u64)loc);
+
+ val = sym->st_value + rel[i].r_addend;
+
+ switch (ELF64_R_TYPE(rel[i].r_info)) {
+ case R_X86_64_NONE:
+ break;
+ case R_X86_64_64:
+ if (*(u64 *)loc != 0)
+ goto invalid_relocation;
+ *(u64 *)loc = val;
+ break;
+ case R_X86_64_32:
+ if (*(u32 *)loc != 0)
+ goto invalid_relocation;
+ *(u32 *)loc = val;
+ if (val != *(u32 *)loc)
+ goto overflow;
+ break;
+ case R_X86_64_32S:
+ if (*(s32 *)loc != 0)
+ goto invalid_relocation;
+ *(s32 *)loc = val;
+ if ((s64)val != *(s32 *)loc)
+ goto overflow;
+ break;
+ case R_X86_64_PC32:
+ case R_X86_64_PLT32:
+ if (*(u32 *)loc != 0)
+ goto invalid_relocation;
+ val -= (u64)loc;
+ *(u32 *)loc = val;
+#if 0
+ if ((s64)val != *(s32 *)loc)
+ goto overflow;
+#endif
+ break;
+ default:
+ pr_err("%s: Unknown rela relocation: %llu\n",
+ me->name, ELF64_R_TYPE(rel[i].r_info));
+ return -ENOEXEC;
+ }
+ }
+ return 0;
+
+invalid_relocation:
+ pr_err("x86/modules: Skipping invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), loc, val);
+ return -ENOEXEC;
+
+overflow:
+ pr_err("overflow in relocation type %d val %Lx\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), val);
+ pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
+ me->name);
+ return -ENOEXEC;
+}
+#endif
+
+int module_finalize(const Elf_Ehdr *hdr,
+ const Elf_Shdr *sechdrs,
+ struct module *me)
+{
+ const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
+ *para = NULL, *orc = NULL, *orc_ip = NULL;
+ char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+ for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
+ if (!strcmp(".text", secstrings + s->sh_name))
+ text = s;
+ if (!strcmp(".altinstructions", secstrings + s->sh_name))
+ alt = s;
+ if (!strcmp(".smp_locks", secstrings + s->sh_name))
+ locks = s;
+ if (!strcmp(".parainstructions", secstrings + s->sh_name))
+ para = s;
+ if (!strcmp(".orc_unwind", secstrings + s->sh_name))
+ orc = s;
+ if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
+ orc_ip = s;
+ }
+
+ if (alt) {
+ /* patch .altinstructions */
+ void *aseg = (void *)alt->sh_addr;
+ apply_alternatives(aseg, aseg + alt->sh_size);
+ }
+ if (locks && text) {
+ void *lseg = (void *)locks->sh_addr;
+ void *tseg = (void *)text->sh_addr;
+ alternatives_smp_module_add(me, me->name,
+ lseg, lseg + locks->sh_size,
+ tseg, tseg + text->sh_size);
+ }
+
+ if (para) {
+ void *pseg = (void *)para->sh_addr;
+ apply_paravirt(pseg, pseg + para->sh_size);
+ }
+
+ /* make jump label nops */
+ jump_label_apply_nops(me);
+
+ if (orc && orc_ip)
+ unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
+ (void *)orc->sh_addr, orc->sh_size);
+
+ return 0;
+}
+
+void module_arch_cleanup(struct module *mod)
+{
+ alternatives_smp_module_del(mod);
+}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
new file mode 100644
index 000000000..5b4c32799
--- /dev/null
+++ b/arch/x86/kernel/mpparse.c
@@ -0,0 +1,963 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Multiprocessor Specification 1.1 and 1.4
+ * compliant MP-table parsing routines.
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
+ * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
+ * (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/bitops.h>
+#include <linux/acpi.h>
+#include <linux/smp.h>
+#include <linux/pci.h>
+
+#include <asm/irqdomain.h>
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/pgalloc.h>
+#include <asm/io_apic.h>
+#include <asm/proto.h>
+#include <asm/bios_ebda.h>
+#include <asm/e820/api.h>
+#include <asm/setup.h>
+#include <asm/smp.h>
+
+#include <asm/apic.h>
+/*
+ * Checksum an MP configuration block.
+ */
+
+static int __init mpf_checksum(unsigned char *mp, int len)
+{
+ int sum = 0;
+
+ while (len--)
+ sum += *mp++;
+
+ return sum & 0xFF;
+}
+
+int __init default_mpc_apic_id(struct mpc_cpu *m)
+{
+ return m->apicid;
+}
+
+static void __init MP_processor_info(struct mpc_cpu *m)
+{
+ int apicid;
+ char *bootup_cpu = "";
+
+ if (!(m->cpuflag & CPU_ENABLED)) {
+ disabled_cpus++;
+ return;
+ }
+
+ apicid = x86_init.mpparse.mpc_apic_id(m);
+
+ if (m->cpuflag & CPU_BOOTPROCESSOR) {
+ bootup_cpu = " (Bootup-CPU)";
+ boot_cpu_physical_apicid = m->apicid;
+ }
+
+ pr_info("Processor #%d%s\n", m->apicid, bootup_cpu);
+ generic_processor_info(apicid, m->apicver);
+}
+
+#ifdef CONFIG_X86_IO_APIC
+void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str)
+{
+ memcpy(str, m->bustype, 6);
+ str[6] = 0;
+ apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str);
+}
+
+static void __init MP_bus_info(struct mpc_bus *m)
+{
+ char str[7];
+
+ x86_init.mpparse.mpc_oem_bus_info(m, str);
+
+#if MAX_MP_BUSSES < 256
+ if (m->busid >= MAX_MP_BUSSES) {
+ pr_warn("MP table busid value (%d) for bustype %s is too large, max. supported is %d\n",
+ m->busid, str, MAX_MP_BUSSES - 1);
+ return;
+ }
+#endif
+
+ set_bit(m->busid, mp_bus_not_pci);
+ if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
+#ifdef CONFIG_EISA
+ mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
+#endif
+ } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
+ if (x86_init.mpparse.mpc_oem_pci_bus)
+ x86_init.mpparse.mpc_oem_pci_bus(m);
+
+ clear_bit(m->busid, mp_bus_not_pci);
+#ifdef CONFIG_EISA
+ mp_bus_id_to_type[m->busid] = MP_BUS_PCI;
+ } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
+ mp_bus_id_to_type[m->busid] = MP_BUS_EISA;
+#endif
+ } else
+ pr_warn("Unknown bustype %s - ignoring\n", str);
+}
+
+static void __init MP_ioapic_info(struct mpc_ioapic *m)
+{
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_LEGACY,
+ .ops = &mp_ioapic_irqdomain_ops,
+ };
+
+ if (m->flags & MPC_APIC_USABLE)
+ mp_register_ioapic(m->apicid, m->apicaddr, gsi_top, &cfg);
+}
+
+static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
+{
+ apic_printk(APIC_VERBOSE,
+ "Int: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ mp_irq->irqtype, mp_irq->irqflag & 3,
+ (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus,
+ mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
+}
+
+#else /* CONFIG_X86_IO_APIC */
+static inline void __init MP_bus_info(struct mpc_bus *m) {}
+static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
+#endif /* CONFIG_X86_IO_APIC */
+
+static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
+{
+ apic_printk(APIC_VERBOSE,
+ "Lint: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC LINT %02x\n",
+ m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid,
+ m->srcbusirq, m->destapic, m->destapiclint);
+}
+
+/*
+ * Read/parse the MPC
+ */
+static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
+{
+
+ if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) {
+ pr_err("MPTABLE: bad signature [%c%c%c%c]!\n",
+ mpc->signature[0], mpc->signature[1],
+ mpc->signature[2], mpc->signature[3]);
+ return 0;
+ }
+ if (mpf_checksum((unsigned char *)mpc, mpc->length)) {
+ pr_err("MPTABLE: checksum error!\n");
+ return 0;
+ }
+ if (mpc->spec != 0x01 && mpc->spec != 0x04) {
+ pr_err("MPTABLE: bad table version (%d)!!\n", mpc->spec);
+ return 0;
+ }
+ if (!mpc->lapic) {
+ pr_err("MPTABLE: null local APIC address!\n");
+ return 0;
+ }
+ memcpy(oem, mpc->oem, 8);
+ oem[8] = 0;
+ pr_info("MPTABLE: OEM ID: %s\n", oem);
+
+ memcpy(str, mpc->productid, 12);
+ str[12] = 0;
+
+ pr_info("MPTABLE: Product ID: %s\n", str);
+
+ pr_info("MPTABLE: APIC at: 0x%X\n", mpc->lapic);
+
+ return 1;
+}
+
+static void skip_entry(unsigned char **ptr, int *count, int size)
+{
+ *ptr += size;
+ *count += size;
+}
+
+static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
+{
+ pr_err("Your mptable is wrong, contact your HW vendor!\n");
+ pr_cont("type %x\n", *mpt);
+ print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
+ 1, mpc, mpc->length, 1);
+}
+
+void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
+
+static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
+{
+ char str[16];
+ char oem[10];
+
+ int count = sizeof(*mpc);
+ unsigned char *mpt = ((unsigned char *)mpc) + count;
+
+ if (!smp_check_mpc(mpc, oem, str))
+ return 0;
+
+ /* Initialize the lapic mapping */
+ if (!acpi_lapic)
+ register_lapic_address(mpc->lapic);
+
+ if (early)
+ return 1;
+
+ if (mpc->oemptr)
+ x86_init.mpparse.smp_read_mpc_oem(mpc);
+
+ /*
+ * Now process the configuration blocks.
+ */
+ x86_init.mpparse.mpc_record(0);
+
+ while (count < mpc->length) {
+ switch (*mpt) {
+ case MP_PROCESSOR:
+ /* ACPI may have already provided this data */
+ if (!acpi_lapic)
+ MP_processor_info((struct mpc_cpu *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
+ break;
+ case MP_BUS:
+ MP_bus_info((struct mpc_bus *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_bus));
+ break;
+ case MP_IOAPIC:
+ MP_ioapic_info((struct mpc_ioapic *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
+ break;
+ case MP_INTSRC:
+ mp_save_irq((struct mpc_intsrc *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
+ break;
+ case MP_LINTSRC:
+ MP_lintsrc_info((struct mpc_lintsrc *)mpt);
+ skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
+ break;
+ default:
+ /* wrong mptable */
+ smp_dump_mptable(mpc, mpt);
+ count = mpc->length;
+ break;
+ }
+ x86_init.mpparse.mpc_record(1);
+ }
+
+ if (!num_processors)
+ pr_err("MPTABLE: no processors registered!\n");
+ return num_processors;
+}
+
+#ifdef CONFIG_X86_IO_APIC
+
+static int __init ELCR_trigger(unsigned int irq)
+{
+ unsigned int port;
+
+ port = 0x4d0 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
+}
+
+static void __init construct_default_ioirq_mptable(int mpc_default_type)
+{
+ struct mpc_intsrc intsrc;
+ int i;
+ int ELCR_fallback = 0;
+
+ intsrc.type = MP_INTSRC;
+ intsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT;
+ intsrc.srcbus = 0;
+ intsrc.dstapic = mpc_ioapic_id(0);
+
+ intsrc.irqtype = mp_INT;
+
+ /*
+ * If true, we have an ISA/PCI system with no IRQ entries
+ * in the MP table. To prevent the PCI interrupts from being set up
+ * incorrectly, we try to use the ELCR. The sanity check to see if
+ * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
+ * never be level sensitive, so we simply see if the ELCR agrees.
+ * If it does, we assume it's valid.
+ */
+ if (mpc_default_type == 5) {
+ pr_info("ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
+
+ if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) ||
+ ELCR_trigger(13))
+ pr_err("ELCR contains invalid data... not using ELCR\n");
+ else {
+ pr_info("Using ELCR to identify PCI interrupts\n");
+ ELCR_fallback = 1;
+ }
+ }
+
+ for (i = 0; i < 16; i++) {
+ switch (mpc_default_type) {
+ case 2:
+ if (i == 0 || i == 13)
+ continue; /* IRQ0 & IRQ13 not connected */
+ /* fall through */
+ default:
+ if (i == 2)
+ continue; /* IRQ2 is never connected */
+ }
+
+ if (ELCR_fallback) {
+ /*
+ * If the ELCR indicates a level-sensitive interrupt, we
+ * copy that information over to the MP table in the
+ * irqflag field (level sensitive, active high polarity).
+ */
+ if (ELCR_trigger(i)) {
+ intsrc.irqflag = MP_IRQTRIG_LEVEL |
+ MP_IRQPOL_ACTIVE_HIGH;
+ } else {
+ intsrc.irqflag = MP_IRQTRIG_DEFAULT |
+ MP_IRQPOL_DEFAULT;
+ }
+ }
+
+ intsrc.srcbusirq = i;
+ intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
+ mp_save_irq(&intsrc);
+ }
+
+ intsrc.irqtype = mp_ExtINT;
+ intsrc.srcbusirq = 0;
+ intsrc.dstirq = 0; /* 8259A to INTIN0 */
+ mp_save_irq(&intsrc);
+}
+
+
+static void __init construct_ioapic_table(int mpc_default_type)
+{
+ struct mpc_ioapic ioapic;
+ struct mpc_bus bus;
+
+ bus.type = MP_BUS;
+ bus.busid = 0;
+ switch (mpc_default_type) {
+ default:
+ pr_err("???\nUnknown standard configuration %d\n",
+ mpc_default_type);
+ /* fall through */
+ case 1:
+ case 5:
+ memcpy(bus.bustype, "ISA ", 6);
+ break;
+ case 2:
+ case 6:
+ case 3:
+ memcpy(bus.bustype, "EISA ", 6);
+ break;
+ }
+ MP_bus_info(&bus);
+ if (mpc_default_type > 4) {
+ bus.busid = 1;
+ memcpy(bus.bustype, "PCI ", 6);
+ MP_bus_info(&bus);
+ }
+
+ ioapic.type = MP_IOAPIC;
+ ioapic.apicid = 2;
+ ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ ioapic.flags = MPC_APIC_USABLE;
+ ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE;
+ MP_ioapic_info(&ioapic);
+
+ /*
+ * We set up most of the low 16 IO-APIC pins according to MPS rules.
+ */
+ construct_default_ioirq_mptable(mpc_default_type);
+}
+#else
+static inline void __init construct_ioapic_table(int mpc_default_type) { }
+#endif
+
+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+{
+ struct mpc_cpu processor;
+ struct mpc_lintsrc lintsrc;
+ int linttypes[2] = { mp_ExtINT, mp_NMI };
+ int i;
+
+ /*
+ * local APIC has default address
+ */
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+ /*
+ * 2 CPUs, numbered 0 & 1.
+ */
+ processor.type = MP_PROCESSOR;
+ /* Either an integrated APIC or a discrete 82489DX. */
+ processor.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ processor.cpuflag = CPU_ENABLED;
+ processor.cpufeature = (boot_cpu_data.x86 << 8) |
+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_stepping;
+ processor.featureflag = boot_cpu_data.x86_capability[CPUID_1_EDX];
+ processor.reserved[0] = 0;
+ processor.reserved[1] = 0;
+ for (i = 0; i < 2; i++) {
+ processor.apicid = i;
+ MP_processor_info(&processor);
+ }
+
+ construct_ioapic_table(mpc_default_type);
+
+ lintsrc.type = MP_LINTSRC;
+ lintsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT;
+ lintsrc.srcbusid = 0;
+ lintsrc.srcbusirq = 0;
+ lintsrc.destapic = MP_APIC_ALL;
+ for (i = 0; i < 2; i++) {
+ lintsrc.irqtype = linttypes[i];
+ lintsrc.destapiclint = i;
+ MP_lintsrc_info(&lintsrc);
+ }
+}
+
+static unsigned long mpf_base;
+static bool mpf_found;
+
+static unsigned long __init get_mpc_size(unsigned long physptr)
+{
+ struct mpc_table *mpc;
+ unsigned long size;
+
+ mpc = early_memremap(physptr, PAGE_SIZE);
+ size = mpc->length;
+ early_memunmap(mpc, PAGE_SIZE);
+ apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size);
+
+ return size;
+}
+
+static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
+{
+ struct mpc_table *mpc;
+ unsigned long size;
+
+ size = get_mpc_size(mpf->physptr);
+ mpc = early_memremap(mpf->physptr, size);
+
+ /*
+ * Read the physical hardware table. Anything here will
+ * override the defaults.
+ */
+ if (!smp_read_mpc(mpc, early)) {
+#ifdef CONFIG_X86_LOCAL_APIC
+ smp_found_config = 0;
+#endif
+ pr_err("BIOS bug, MP table errors detected!...\n");
+ pr_cont("... disabling SMP support. (tell your hw vendor)\n");
+ early_memunmap(mpc, size);
+ return -1;
+ }
+ early_memunmap(mpc, size);
+
+ if (early)
+ return -1;
+
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * If there are no explicit MP IRQ entries, then we are
+ * broken. We set up most of the low 16 IO-APIC pins to
+ * ISA defaults and hope it will work.
+ */
+ if (!mp_irq_entries) {
+ struct mpc_bus bus;
+
+ pr_err("BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
+
+ bus.type = MP_BUS;
+ bus.busid = 0;
+ memcpy(bus.bustype, "ISA ", 6);
+ MP_bus_info(&bus);
+
+ construct_default_ioirq_mptable(0);
+ }
+#endif
+
+ return 0;
+}
+
+/*
+ * Scan the memory blocks for an SMP configuration block.
+ */
+void __init default_get_smp_config(unsigned int early)
+{
+ struct mpf_intel *mpf;
+
+ if (!smp_found_config)
+ return;
+
+ if (!mpf_found)
+ return;
+
+ if (acpi_lapic && early)
+ return;
+
+ /*
+ * MPS doesn't support hyperthreading, aka only have
+ * thread 0 apic id in MPS table
+ */
+ if (acpi_lapic && acpi_ioapic)
+ return;
+
+ mpf = early_memremap(mpf_base, sizeof(*mpf));
+ if (!mpf) {
+ pr_err("MPTABLE: error mapping MP table\n");
+ return;
+ }
+
+ pr_info("Intel MultiProcessor Specification v1.%d\n",
+ mpf->specification);
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
+ if (mpf->feature2 & (1 << 7)) {
+ pr_info(" IMCR and PIC compatibility mode.\n");
+ pic_mode = 1;
+ } else {
+ pr_info(" Virtual Wire compatibility mode.\n");
+ pic_mode = 0;
+ }
+#endif
+ /*
+ * Now see if we need to read further.
+ */
+ if (mpf->feature1) {
+ if (early) {
+ /*
+ * local APIC has default address
+ */
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+ goto out;
+ }
+
+ pr_info("Default MP configuration #%d\n", mpf->feature1);
+ construct_default_ISA_mptable(mpf->feature1);
+
+ } else if (mpf->physptr) {
+ if (check_physptr(mpf, early))
+ goto out;
+ } else
+ BUG();
+
+ if (!early)
+ pr_info("Processors: %d\n", num_processors);
+ /*
+ * Only use the first configuration found.
+ */
+out:
+ early_memunmap(mpf, sizeof(*mpf));
+}
+
+static void __init smp_reserve_memory(struct mpf_intel *mpf)
+{
+ memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr));
+}
+
+static int __init smp_scan_config(unsigned long base, unsigned long length)
+{
+ unsigned int *bp;
+ struct mpf_intel *mpf;
+ int ret = 0;
+
+ apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n",
+ base, base + length - 1);
+ BUILD_BUG_ON(sizeof(*mpf) != 16);
+
+ while (length > 0) {
+ bp = early_memremap(base, length);
+ mpf = (struct mpf_intel *)bp;
+ if ((*bp == SMP_MAGIC_IDENT) &&
+ (mpf->length == 1) &&
+ !mpf_checksum((unsigned char *)bp, 16) &&
+ ((mpf->specification == 1)
+ || (mpf->specification == 4))) {
+#ifdef CONFIG_X86_LOCAL_APIC
+ smp_found_config = 1;
+#endif
+ mpf_base = base;
+ mpf_found = true;
+
+ pr_info("found SMP MP-table at [mem %#010lx-%#010lx]\n",
+ base, base + sizeof(*mpf) - 1);
+
+ memblock_reserve(base, sizeof(*mpf));
+ if (mpf->physptr)
+ smp_reserve_memory(mpf);
+
+ ret = 1;
+ }
+ early_memunmap(bp, length);
+
+ if (ret)
+ break;
+
+ base += 16;
+ length -= 16;
+ }
+ return ret;
+}
+
+void __init default_find_smp_config(void)
+{
+ unsigned int address;
+
+ /*
+ * FIXME: Linux assumes you have 640K of base ram..
+ * this continues the error...
+ *
+ * 1) Scan the bottom 1K for a signature
+ * 2) Scan the top 1K of base RAM
+ * 3) Scan the 64K of bios
+ */
+ if (smp_scan_config(0x0, 0x400) ||
+ smp_scan_config(639 * 0x400, 0x400) ||
+ smp_scan_config(0xF0000, 0x10000))
+ return;
+ /*
+ * If it is an SMP machine we should know now, unless the
+ * configuration is in an EISA bus machine with an
+ * extended bios data area.
+ *
+ * there is a real-mode segmented pointer pointing to the
+ * 4K EBDA area at 0x40E, calculate and scan it here.
+ *
+ * NOTE! There are Linux loaders that will corrupt the EBDA
+ * area, and as such this kind of SMP config may be less
+ * trustworthy, simply because the SMP table may have been
+ * stomped on during early boot. These loaders are buggy and
+ * should be fixed.
+ *
+ * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
+ */
+
+ address = get_bios_ebda();
+ if (address)
+ smp_scan_config(address, 0x400);
+}
+
+#ifdef CONFIG_X86_IO_APIC
+static u8 __initdata irq_used[MAX_IRQ_SOURCES];
+
+static int __init get_MP_intsrc_index(struct mpc_intsrc *m)
+{
+ int i;
+
+ if (m->irqtype != mp_INT)
+ return 0;
+
+ if (m->irqflag != (MP_IRQTRIG_LEVEL | MP_IRQPOL_ACTIVE_LOW))
+ return 0;
+
+ /* not legacy */
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (mp_irqs[i].irqtype != mp_INT)
+ continue;
+
+ if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL |
+ MP_IRQPOL_ACTIVE_LOW))
+ continue;
+
+ if (mp_irqs[i].srcbus != m->srcbus)
+ continue;
+ if (mp_irqs[i].srcbusirq != m->srcbusirq)
+ continue;
+ if (irq_used[i]) {
+ /* already claimed */
+ return -2;
+ }
+ irq_used[i] = 1;
+ return i;
+ }
+
+ /* not found */
+ return -1;
+}
+
+#define SPARE_SLOT_NUM 20
+
+static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
+
+static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
+{
+ int i;
+
+ apic_printk(APIC_VERBOSE, "OLD ");
+ print_mp_irq_info(m);
+
+ i = get_MP_intsrc_index(m);
+ if (i > 0) {
+ memcpy(m, &mp_irqs[i], sizeof(*m));
+ apic_printk(APIC_VERBOSE, "NEW ");
+ print_mp_irq_info(&mp_irqs[i]);
+ return;
+ }
+ if (!i) {
+ /* legacy, do nothing */
+ return;
+ }
+ if (*nr_m_spare < SPARE_SLOT_NUM) {
+ /*
+ * not found (-1), or duplicated (-2) are invalid entries,
+ * we need to use the slot later
+ */
+ m_spare[*nr_m_spare] = m;
+ *nr_m_spare += 1;
+ }
+}
+
+static int __init
+check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
+{
+ if (!mpc_new_phys || count <= mpc_new_length) {
+ WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
+ return -1;
+ }
+
+ return 0;
+}
+#else /* CONFIG_X86_IO_APIC */
+static
+inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
+#endif /* CONFIG_X86_IO_APIC */
+
+static int __init replace_intsrc_all(struct mpc_table *mpc,
+ unsigned long mpc_new_phys,
+ unsigned long mpc_new_length)
+{
+#ifdef CONFIG_X86_IO_APIC
+ int i;
+#endif
+ int count = sizeof(*mpc);
+ int nr_m_spare = 0;
+ unsigned char *mpt = ((unsigned char *)mpc) + count;
+
+ pr_info("mpc_length %x\n", mpc->length);
+ while (count < mpc->length) {
+ switch (*mpt) {
+ case MP_PROCESSOR:
+ skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
+ break;
+ case MP_BUS:
+ skip_entry(&mpt, &count, sizeof(struct mpc_bus));
+ break;
+ case MP_IOAPIC:
+ skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
+ break;
+ case MP_INTSRC:
+ check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare);
+ skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
+ break;
+ case MP_LINTSRC:
+ skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
+ break;
+ default:
+ /* wrong mptable */
+ smp_dump_mptable(mpc, mpt);
+ goto out;
+ }
+ }
+
+#ifdef CONFIG_X86_IO_APIC
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (irq_used[i])
+ continue;
+
+ if (mp_irqs[i].irqtype != mp_INT)
+ continue;
+
+ if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL |
+ MP_IRQPOL_ACTIVE_LOW))
+ continue;
+
+ if (nr_m_spare > 0) {
+ apic_printk(APIC_VERBOSE, "*NEW* found\n");
+ nr_m_spare--;
+ memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i]));
+ m_spare[nr_m_spare] = NULL;
+ } else {
+ struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
+ count += sizeof(struct mpc_intsrc);
+ if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
+ goto out;
+ memcpy(m, &mp_irqs[i], sizeof(*m));
+ mpc->length = count;
+ mpt += sizeof(struct mpc_intsrc);
+ }
+ print_mp_irq_info(&mp_irqs[i]);
+ }
+#endif
+out:
+ /* update checksum */
+ mpc->checksum = 0;
+ mpc->checksum -= mpf_checksum((unsigned char *)mpc, mpc->length);
+
+ return 0;
+}
+
+int enable_update_mptable;
+
+static int __init update_mptable_setup(char *str)
+{
+ enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+ pci_routeirq = 1;
+#endif
+ return 0;
+}
+early_param("update_mptable", update_mptable_setup);
+
+static unsigned long __initdata mpc_new_phys;
+static unsigned long mpc_new_length __initdata = 4096;
+
+/* alloc_mptable or alloc_mptable=4k */
+static int __initdata alloc_mptable;
+static int __init parse_alloc_mptable_opt(char *p)
+{
+ enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+ pci_routeirq = 1;
+#endif
+ alloc_mptable = 1;
+ if (!p)
+ return 0;
+ mpc_new_length = memparse(p, &p);
+ return 0;
+}
+early_param("alloc_mptable", parse_alloc_mptable_opt);
+
+void __init e820__memblock_alloc_reserved_mpc_new(void)
+{
+ if (enable_update_mptable && alloc_mptable)
+ mpc_new_phys = e820__memblock_alloc_reserved(mpc_new_length, 4);
+}
+
+static int __init update_mp_table(void)
+{
+ char str[16];
+ char oem[10];
+ struct mpf_intel *mpf;
+ struct mpc_table *mpc, *mpc_new;
+ unsigned long size;
+
+ if (!enable_update_mptable)
+ return 0;
+
+ if (!mpf_found)
+ return 0;
+
+ mpf = early_memremap(mpf_base, sizeof(*mpf));
+ if (!mpf) {
+ pr_err("MPTABLE: mpf early_memremap() failed\n");
+ return 0;
+ }
+
+ /*
+ * Now see if we need to go further.
+ */
+ if (mpf->feature1)
+ goto do_unmap_mpf;
+
+ if (!mpf->physptr)
+ goto do_unmap_mpf;
+
+ size = get_mpc_size(mpf->physptr);
+ mpc = early_memremap(mpf->physptr, size);
+ if (!mpc) {
+ pr_err("MPTABLE: mpc early_memremap() failed\n");
+ goto do_unmap_mpf;
+ }
+
+ if (!smp_check_mpc(mpc, oem, str))
+ goto do_unmap_mpc;
+
+ pr_info("mpf: %llx\n", (u64)mpf_base);
+ pr_info("physptr: %x\n", mpf->physptr);
+
+ if (mpc_new_phys && mpc->length > mpc_new_length) {
+ mpc_new_phys = 0;
+ pr_info("mpc_new_length is %ld, please use alloc_mptable=8k\n",
+ mpc_new_length);
+ }
+
+ if (!mpc_new_phys) {
+ unsigned char old, new;
+ /* check if we can change the position */
+ mpc->checksum = 0;
+ old = mpf_checksum((unsigned char *)mpc, mpc->length);
+ mpc->checksum = 0xff;
+ new = mpf_checksum((unsigned char *)mpc, mpc->length);
+ if (old == new) {
+ pr_info("mpc is readonly, please try alloc_mptable instead\n");
+ goto do_unmap_mpc;
+ }
+ pr_info("use in-position replacing\n");
+ } else {
+ mpc_new = early_memremap(mpc_new_phys, mpc_new_length);
+ if (!mpc_new) {
+ pr_err("MPTABLE: new mpc early_memremap() failed\n");
+ goto do_unmap_mpc;
+ }
+ mpf->physptr = mpc_new_phys;
+ memcpy(mpc_new, mpc, mpc->length);
+ early_memunmap(mpc, size);
+ mpc = mpc_new;
+ size = mpc_new_length;
+ /* check if we can modify that */
+ if (mpc_new_phys - mpf->physptr) {
+ struct mpf_intel *mpf_new;
+ /* steal 16 bytes from [0, 1k) */
+ mpf_new = early_memremap(0x400 - 16, sizeof(*mpf_new));
+ if (!mpf_new) {
+ pr_err("MPTABLE: new mpf early_memremap() failed\n");
+ goto do_unmap_mpc;
+ }
+ pr_info("mpf new: %x\n", 0x400 - 16);
+ memcpy(mpf_new, mpf, 16);
+ early_memunmap(mpf, sizeof(*mpf));
+ mpf = mpf_new;
+ mpf->physptr = mpc_new_phys;
+ }
+ mpf->checksum = 0;
+ mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
+ pr_info("physptr new: %x\n", mpf->physptr);
+ }
+
+ /*
+ * only replace the one with mp_INT and
+ * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
+ * already in mp_irqs , stored by ... and mp_config_acpi_gsi,
+ * may need pci=routeirq for all coverage
+ */
+ replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
+
+do_unmap_mpc:
+ early_memunmap(mpc, size);
+
+do_unmap_mpf:
+ early_memunmap(mpf, sizeof(*mpf));
+
+ return 0;
+}
+
+late_initcall(update_mp_table);
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
new file mode 100644
index 000000000..ef688804f
--- /dev/null
+++ b/arch/x86/kernel/msr.c
@@ -0,0 +1,244 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author: H. Peter Anvin
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
+ * USA; either version 2 of the License, or (at your option) any later
+ * version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * x86 MSR access device
+ *
+ * This device is accessed by lseek() to the appropriate register number
+ * and then read/write in chunks of 8 bytes. A larger size means multiple
+ * reads or writes of the same register.
+ *
+ * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on
+ * an SMP box will direct the access to CPU %d.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/smp.h>
+#include <linux/major.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/uaccess.h>
+#include <linux/gfp.h>
+
+#include <asm/cpufeature.h>
+#include <asm/msr.h>
+
+static struct class *msr_class;
+static enum cpuhp_state cpuhp_msr_state;
+
+static ssize_t msr_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ u32 __user *tmp = (u32 __user *) buf;
+ u32 data[2];
+ u32 reg = *ppos;
+ int cpu = iminor(file_inode(file));
+ int err = 0;
+ ssize_t bytes = 0;
+
+ if (count % 8)
+ return -EINVAL; /* Invalid chunk size */
+
+ for (; count; count -= 8) {
+ err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
+ if (err)
+ break;
+ if (copy_to_user(tmp, &data, 8)) {
+ err = -EFAULT;
+ break;
+ }
+ tmp += 2;
+ bytes += 8;
+ }
+
+ return bytes ? bytes : err;
+}
+
+static ssize_t msr_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ const u32 __user *tmp = (const u32 __user *)buf;
+ u32 data[2];
+ u32 reg = *ppos;
+ int cpu = iminor(file_inode(file));
+ int err = 0;
+ ssize_t bytes = 0;
+
+ if (count % 8)
+ return -EINVAL; /* Invalid chunk size */
+
+ for (; count; count -= 8) {
+ if (copy_from_user(&data, tmp, 8)) {
+ err = -EFAULT;
+ break;
+ }
+ err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
+ if (err)
+ break;
+ tmp += 2;
+ bytes += 8;
+ }
+
+ return bytes ? bytes : err;
+}
+
+static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
+{
+ u32 __user *uregs = (u32 __user *)arg;
+ u32 regs[8];
+ int cpu = iminor(file_inode(file));
+ int err;
+
+ switch (ioc) {
+ case X86_IOC_RDMSR_REGS:
+ if (!(file->f_mode & FMODE_READ)) {
+ err = -EBADF;
+ break;
+ }
+ if (copy_from_user(&regs, uregs, sizeof regs)) {
+ err = -EFAULT;
+ break;
+ }
+ err = rdmsr_safe_regs_on_cpu(cpu, regs);
+ if (err)
+ break;
+ if (copy_to_user(uregs, &regs, sizeof regs))
+ err = -EFAULT;
+ break;
+
+ case X86_IOC_WRMSR_REGS:
+ if (!(file->f_mode & FMODE_WRITE)) {
+ err = -EBADF;
+ break;
+ }
+ if (copy_from_user(&regs, uregs, sizeof regs)) {
+ err = -EFAULT;
+ break;
+ }
+ err = wrmsr_safe_regs_on_cpu(cpu, regs);
+ if (err)
+ break;
+ if (copy_to_user(uregs, &regs, sizeof regs))
+ err = -EFAULT;
+ break;
+
+ default:
+ err = -ENOTTY;
+ break;
+ }
+
+ return err;
+}
+
+static int msr_open(struct inode *inode, struct file *file)
+{
+ unsigned int cpu = iminor(file_inode(file));
+ struct cpuinfo_x86 *c;
+
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
+ if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+ return -ENXIO; /* No such CPU */
+
+ c = &cpu_data(cpu);
+ if (!cpu_has(c, X86_FEATURE_MSR))
+ return -EIO; /* MSR not supported */
+
+ return 0;
+}
+
+/*
+ * File operations we support
+ */
+static const struct file_operations msr_fops = {
+ .owner = THIS_MODULE,
+ .llseek = no_seek_end_llseek,
+ .read = msr_read,
+ .write = msr_write,
+ .open = msr_open,
+ .unlocked_ioctl = msr_ioctl,
+ .compat_ioctl = msr_ioctl,
+};
+
+static int msr_device_create(unsigned int cpu)
+{
+ struct device *dev;
+
+ dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL,
+ "msr%d", cpu);
+ return PTR_ERR_OR_ZERO(dev);
+}
+
+static int msr_device_destroy(unsigned int cpu)
+{
+ device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
+ return 0;
+}
+
+static char *msr_devnode(struct device *dev, umode_t *mode)
+{
+ return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
+}
+
+static int __init msr_init(void)
+{
+ int err;
+
+ if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) {
+ pr_err("unable to get major %d for msr\n", MSR_MAJOR);
+ return -EBUSY;
+ }
+ msr_class = class_create(THIS_MODULE, "msr");
+ if (IS_ERR(msr_class)) {
+ err = PTR_ERR(msr_class);
+ goto out_chrdev;
+ }
+ msr_class->devnode = msr_devnode;
+
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/msr:online",
+ msr_device_create, msr_device_destroy);
+ if (err < 0)
+ goto out_class;
+ cpuhp_msr_state = err;
+ return 0;
+
+out_class:
+ class_destroy(msr_class);
+out_chrdev:
+ __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
+ return err;
+}
+module_init(msr_init);
+
+static void __exit msr_exit(void)
+{
+ cpuhp_remove_state(cpuhp_msr_state);
+ class_destroy(msr_class);
+ __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
+}
+module_exit(msr_exit)
+
+MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
+MODULE_DESCRIPTION("x86 generic MSR driver");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
new file mode 100644
index 000000000..996eb53f8
--- /dev/null
+++ b/arch/x86/kernel/nmi.c
@@ -0,0 +1,555 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ * Copyright (C) 2011 Don Zickus Red Hat, Inc.
+ *
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+/*
+ * Handle hardware traps and faults.
+ */
+#include <linux/spinlock.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/sched/debug.h>
+#include <linux/nmi.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/hardirq.h>
+#include <linux/ratelimit.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/sched/clock.h>
+
+#if defined(CONFIG_EDAC)
+#include <linux/edac.h>
+#endif
+
+#include <linux/atomic.h>
+#include <asm/traps.h>
+#include <asm/mach_traps.h>
+#include <asm/nmi.h>
+#include <asm/x86_init.h>
+#include <asm/reboot.h>
+#include <asm/cache.h>
+#include <asm/nospec-branch.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/nmi.h>
+
+struct nmi_desc {
+ raw_spinlock_t lock;
+ struct list_head head;
+};
+
+static struct nmi_desc nmi_desc[NMI_MAX] =
+{
+ {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
+ .head = LIST_HEAD_INIT(nmi_desc[0].head),
+ },
+ {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
+ .head = LIST_HEAD_INIT(nmi_desc[1].head),
+ },
+ {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
+ .head = LIST_HEAD_INIT(nmi_desc[2].head),
+ },
+ {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
+ .head = LIST_HEAD_INIT(nmi_desc[3].head),
+ },
+
+};
+
+struct nmi_stats {
+ unsigned int normal;
+ unsigned int unknown;
+ unsigned int external;
+ unsigned int swallow;
+};
+
+static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
+
+static int ignore_nmis __read_mostly;
+
+int unknown_nmi_panic;
+/*
+ * Prevent NMI reason port (0x61) being accessed simultaneously, can
+ * only be used in NMI handler.
+ */
+static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
+
+static int __init setup_unknown_nmi_panic(char *str)
+{
+ unknown_nmi_panic = 1;
+ return 1;
+}
+__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
+
+#define nmi_to_desc(type) (&nmi_desc[type])
+
+static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC;
+
+static int __init nmi_warning_debugfs(void)
+{
+ debugfs_create_u64("nmi_longest_ns", 0644,
+ arch_debugfs_dir, &nmi_longest_ns);
+ return 0;
+}
+fs_initcall(nmi_warning_debugfs);
+
+static void nmi_check_duration(struct nmiaction *action, u64 duration)
+{
+ int remainder_ns, decimal_msecs;
+
+ if (duration < nmi_longest_ns || duration < action->max_duration)
+ return;
+
+ action->max_duration = duration;
+
+ remainder_ns = do_div(duration, (1000 * 1000));
+ decimal_msecs = remainder_ns / 1000;
+
+ printk_ratelimited(KERN_INFO
+ "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
+ action->handler, duration, decimal_msecs);
+}
+
+static int nmi_handle(unsigned int type, struct pt_regs *regs)
+{
+ struct nmi_desc *desc = nmi_to_desc(type);
+ struct nmiaction *a;
+ int handled=0;
+
+ rcu_read_lock();
+
+ /*
+ * NMIs are edge-triggered, which means if you have enough
+ * of them concurrently, you can lose some because only one
+ * can be latched at any given time. Walk the whole list
+ * to handle those situations.
+ */
+ list_for_each_entry_rcu(a, &desc->head, list) {
+ int thishandled;
+ u64 delta;
+
+ delta = sched_clock();
+ thishandled = a->handler(type, regs);
+ handled += thishandled;
+ delta = sched_clock() - delta;
+ trace_nmi_handler(a->handler, (int)delta, thishandled);
+
+ nmi_check_duration(a, delta);
+ }
+
+ rcu_read_unlock();
+
+ /* return total number of NMI events handled */
+ return handled;
+}
+NOKPROBE_SYMBOL(nmi_handle);
+
+int __register_nmi_handler(unsigned int type, struct nmiaction *action)
+{
+ struct nmi_desc *desc = nmi_to_desc(type);
+ unsigned long flags;
+
+ if (!action->handler)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ /*
+ * Indicate if there are multiple registrations on the
+ * internal NMI handler call chains (SERR and IO_CHECK).
+ */
+ WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
+ WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));
+
+ /*
+ * some handlers need to be executed first otherwise a fake
+ * event confuses some handlers (kdump uses this flag)
+ */
+ if (action->flags & NMI_FLAG_FIRST)
+ list_add_rcu(&action->list, &desc->head);
+ else
+ list_add_tail_rcu(&action->list, &desc->head);
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ return 0;
+}
+EXPORT_SYMBOL(__register_nmi_handler);
+
+void unregister_nmi_handler(unsigned int type, const char *name)
+{
+ struct nmi_desc *desc = nmi_to_desc(type);
+ struct nmiaction *n;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ list_for_each_entry_rcu(n, &desc->head, list) {
+ /*
+ * the name passed in to describe the nmi handler
+ * is used as the lookup key
+ */
+ if (!strcmp(n->name, name)) {
+ WARN(in_nmi(),
+ "Trying to free NMI (%s) from NMI context!\n", n->name);
+ list_del_rcu(&n->list);
+ break;
+ }
+ }
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(unregister_nmi_handler);
+
+static void
+pci_serr_error(unsigned char reason, struct pt_regs *regs)
+{
+ /* check to see if anyone registered against these types of errors */
+ if (nmi_handle(NMI_SERR, regs))
+ return;
+
+ pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+
+ if (panic_on_unrecovered_nmi)
+ nmi_panic(regs, "NMI: Not continuing");
+
+ pr_emerg("Dazed and confused, but trying to continue\n");
+
+ /* Clear and disable the PCI SERR error line. */
+ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
+ outb(reason, NMI_REASON_PORT);
+}
+NOKPROBE_SYMBOL(pci_serr_error);
+
+static void
+io_check_error(unsigned char reason, struct pt_regs *regs)
+{
+ unsigned long i;
+
+ /* check to see if anyone registered against these types of errors */
+ if (nmi_handle(NMI_IO_CHECK, regs))
+ return;
+
+ pr_emerg(
+ "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+ show_regs(regs);
+
+ if (panic_on_io_nmi) {
+ nmi_panic(regs, "NMI IOCK error: Not continuing");
+
+ /*
+ * If we end up here, it means we have received an NMI while
+ * processing panic(). Simply return without delaying and
+ * re-enabling NMIs.
+ */
+ return;
+ }
+
+ /* Re-enable the IOCK line, wait for a few seconds */
+ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
+ outb(reason, NMI_REASON_PORT);
+
+ i = 20000;
+ while (--i) {
+ touch_nmi_watchdog();
+ udelay(100);
+ }
+
+ reason &= ~NMI_REASON_CLEAR_IOCHK;
+ outb(reason, NMI_REASON_PORT);
+}
+NOKPROBE_SYMBOL(io_check_error);
+
+static void
+unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
+{
+ int handled;
+
+ /*
+ * Use 'false' as back-to-back NMIs are dealt with one level up.
+ * Of course this makes having multiple 'unknown' handlers useless
+ * as only the first one is ever run (unless it can actually determine
+ * if it caused the NMI)
+ */
+ handled = nmi_handle(NMI_UNKNOWN, regs);
+ if (handled) {
+ __this_cpu_add(nmi_stats.unknown, handled);
+ return;
+ }
+
+ __this_cpu_add(nmi_stats.unknown, 1);
+
+ pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+
+ pr_emerg("Do you have a strange power saving mode enabled?\n");
+ if (unknown_nmi_panic || panic_on_unrecovered_nmi)
+ nmi_panic(regs, "NMI: Not continuing");
+
+ pr_emerg("Dazed and confused, but trying to continue\n");
+}
+NOKPROBE_SYMBOL(unknown_nmi_error);
+
+static DEFINE_PER_CPU(bool, swallow_nmi);
+static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
+
+static void default_do_nmi(struct pt_regs *regs)
+{
+ unsigned char reason = 0;
+ int handled;
+ bool b2b = false;
+
+ /*
+ * CPU-specific NMI must be processed before non-CPU-specific
+ * NMI, otherwise we may lose it, because the CPU-specific
+ * NMI can not be detected/processed on other CPUs.
+ */
+
+ /*
+ * Back-to-back NMIs are interesting because they can either
+ * be two NMI or more than two NMIs (any thing over two is dropped
+ * due to NMI being edge-triggered). If this is the second half
+ * of the back-to-back NMI, assume we dropped things and process
+ * more handlers. Otherwise reset the 'swallow' NMI behaviour
+ */
+ if (regs->ip == __this_cpu_read(last_nmi_rip))
+ b2b = true;
+ else
+ __this_cpu_write(swallow_nmi, false);
+
+ __this_cpu_write(last_nmi_rip, regs->ip);
+
+ handled = nmi_handle(NMI_LOCAL, regs);
+ __this_cpu_add(nmi_stats.normal, handled);
+ if (handled) {
+ /*
+ * There are cases when a NMI handler handles multiple
+ * events in the current NMI. One of these events may
+ * be queued for in the next NMI. Because the event is
+ * already handled, the next NMI will result in an unknown
+ * NMI. Instead lets flag this for a potential NMI to
+ * swallow.
+ */
+ if (handled > 1)
+ __this_cpu_write(swallow_nmi, true);
+ return;
+ }
+
+ /*
+ * Non-CPU-specific NMI: NMI sources can be processed on any CPU.
+ *
+ * Another CPU may be processing panic routines while holding
+ * nmi_reason_lock. Check if the CPU issued the IPI for crash dumping,
+ * and if so, call its callback directly. If there is no CPU preparing
+ * crash dump, we simply loop here.
+ */
+ while (!raw_spin_trylock(&nmi_reason_lock)) {
+ run_crash_ipi_callback(regs);
+ cpu_relax();
+ }
+
+ reason = x86_platform.get_nmi_reason();
+
+ if (reason & NMI_REASON_MASK) {
+ if (reason & NMI_REASON_SERR)
+ pci_serr_error(reason, regs);
+ else if (reason & NMI_REASON_IOCHK)
+ io_check_error(reason, regs);
+#ifdef CONFIG_X86_32
+ /*
+ * Reassert NMI in case it became active
+ * meanwhile as it's edge-triggered:
+ */
+ reassert_nmi();
+#endif
+ __this_cpu_add(nmi_stats.external, 1);
+ raw_spin_unlock(&nmi_reason_lock);
+ return;
+ }
+ raw_spin_unlock(&nmi_reason_lock);
+
+ /*
+ * Only one NMI can be latched at a time. To handle
+ * this we may process multiple nmi handlers at once to
+ * cover the case where an NMI is dropped. The downside
+ * to this approach is we may process an NMI prematurely,
+ * while its real NMI is sitting latched. This will cause
+ * an unknown NMI on the next run of the NMI processing.
+ *
+ * We tried to flag that condition above, by setting the
+ * swallow_nmi flag when we process more than one event.
+ * This condition is also only present on the second half
+ * of a back-to-back NMI, so we flag that condition too.
+ *
+ * If both are true, we assume we already processed this
+ * NMI previously and we swallow it. Otherwise we reset
+ * the logic.
+ *
+ * There are scenarios where we may accidentally swallow
+ * a 'real' unknown NMI. For example, while processing
+ * a perf NMI another perf NMI comes in along with a
+ * 'real' unknown NMI. These two NMIs get combined into
+ * one (as descibed above). When the next NMI gets
+ * processed, it will be flagged by perf as handled, but
+ * noone will know that there was a 'real' unknown NMI sent
+ * also. As a result it gets swallowed. Or if the first
+ * perf NMI returns two events handled then the second
+ * NMI will get eaten by the logic below, again losing a
+ * 'real' unknown NMI. But this is the best we can do
+ * for now.
+ */
+ if (b2b && __this_cpu_read(swallow_nmi))
+ __this_cpu_add(nmi_stats.swallow, 1);
+ else
+ unknown_nmi_error(reason, regs);
+}
+NOKPROBE_SYMBOL(default_do_nmi);
+
+/*
+ * NMIs can page fault or hit breakpoints which will cause it to lose
+ * its NMI context with the CPU when the breakpoint or page fault does an IRET.
+ *
+ * As a result, NMIs can nest if NMIs get unmasked due an IRET during
+ * NMI processing. On x86_64, the asm glue protects us from nested NMIs
+ * if the outer NMI came from kernel mode, but we can still nest if the
+ * outer NMI came from user mode.
+ *
+ * To handle these nested NMIs, we have three states:
+ *
+ * 1) not running
+ * 2) executing
+ * 3) latched
+ *
+ * When no NMI is in progress, it is in the "not running" state.
+ * When an NMI comes in, it goes into the "executing" state.
+ * Normally, if another NMI is triggered, it does not interrupt
+ * the running NMI and the HW will simply latch it so that when
+ * the first NMI finishes, it will restart the second NMI.
+ * (Note, the latch is binary, thus multiple NMIs triggering,
+ * when one is running, are ignored. Only one NMI is restarted.)
+ *
+ * If an NMI executes an iret, another NMI can preempt it. We do not
+ * want to allow this new NMI to run, but we want to execute it when the
+ * first one finishes. We set the state to "latched", and the exit of
+ * the first NMI will perform a dec_return, if the result is zero
+ * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
+ * dec_return would have set the state to NMI_EXECUTING (what we want it
+ * to be when we are running). In this case, we simply jump back to
+ * rerun the NMI handler again, and restart the 'latched' NMI.
+ *
+ * No trap (breakpoint or page fault) should be hit before nmi_restart,
+ * thus there is no race between the first check of state for NOT_RUNNING
+ * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs
+ * at this point.
+ *
+ * In case the NMI takes a page fault, we need to save off the CR2
+ * because the NMI could have preempted another page fault and corrupt
+ * the CR2 that is about to be read. As nested NMIs must be restarted
+ * and they can not take breakpoints or page faults, the update of the
+ * CR2 must be done before converting the nmi state back to NOT_RUNNING.
+ * Otherwise, there would be a race of another nested NMI coming in
+ * after setting state to NOT_RUNNING but before updating the nmi_cr2.
+ */
+enum nmi_states {
+ NMI_NOT_RUNNING = 0,
+ NMI_EXECUTING,
+ NMI_LATCHED,
+};
+static DEFINE_PER_CPU(enum nmi_states, nmi_state);
+static DEFINE_PER_CPU(unsigned long, nmi_cr2);
+
+#ifdef CONFIG_X86_64
+/*
+ * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without
+ * some care, the inner breakpoint will clobber the outer breakpoint's
+ * stack.
+ *
+ * If a breakpoint is being processed, and the debug stack is being
+ * used, if an NMI comes in and also hits a breakpoint, the stack
+ * pointer will be set to the same fixed address as the breakpoint that
+ * was interrupted, causing that stack to be corrupted. To handle this
+ * case, check if the stack that was interrupted is the debug stack, and
+ * if so, change the IDT so that new breakpoints will use the current
+ * stack and not switch to the fixed address. On return of the NMI,
+ * switch back to the original IDT.
+ */
+static DEFINE_PER_CPU(int, update_debug_stack);
+#endif
+
+dotraplinkage notrace void
+do_nmi(struct pt_regs *regs, long error_code)
+{
+ if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
+ this_cpu_write(nmi_state, NMI_LATCHED);
+ return;
+ }
+ this_cpu_write(nmi_state, NMI_EXECUTING);
+ this_cpu_write(nmi_cr2, read_cr2());
+nmi_restart:
+
+#ifdef CONFIG_X86_64
+ /*
+ * If we interrupted a breakpoint, it is possible that
+ * the nmi handler will have breakpoints too. We need to
+ * change the IDT such that breakpoints that happen here
+ * continue to use the NMI stack.
+ */
+ if (unlikely(is_debug_stack(regs->sp))) {
+ debug_stack_set_zero();
+ this_cpu_write(update_debug_stack, 1);
+ }
+#endif
+
+ nmi_enter();
+
+ inc_irq_stat(__nmi_count);
+
+ if (!ignore_nmis)
+ default_do_nmi(regs);
+
+ nmi_exit();
+
+#ifdef CONFIG_X86_64
+ if (unlikely(this_cpu_read(update_debug_stack))) {
+ debug_stack_reset();
+ this_cpu_write(update_debug_stack, 0);
+ }
+#endif
+
+ if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
+ write_cr2(this_cpu_read(nmi_cr2));
+ if (this_cpu_dec_return(nmi_state))
+ goto nmi_restart;
+
+ if (user_mode(regs))
+ mds_user_clear_cpu_buffers();
+}
+NOKPROBE_SYMBOL(do_nmi);
+
+void stop_nmi(void)
+{
+ ignore_nmis++;
+}
+
+void restart_nmi(void)
+{
+ ignore_nmis--;
+}
+
+/* reset the back-to-back NMI logic */
+void local_touch_nmi(void)
+{
+ __this_cpu_write(last_nmi_rip, 0);
+}
+EXPORT_SYMBOL_GPL(local_touch_nmi);
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
new file mode 100644
index 000000000..a1a96df3d
--- /dev/null
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -0,0 +1,184 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * arch/x86/kernel/nmi-selftest.c
+ *
+ * Testsuite for NMI: IPIs
+ *
+ * Started by Don Zickus:
+ * (using lib/locking-selftest.c as a guide)
+ *
+ * Copyright (C) 2011 Red Hat, Inc., Don Zickus <dzickus@redhat.com>
+ */
+
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/percpu.h>
+
+#include <asm/apic.h>
+#include <asm/nmi.h>
+
+#define SUCCESS 0
+#define FAILURE 1
+#define TIMEOUT 2
+
+static int __initdata nmi_fail;
+
+/* check to see if NMI IPIs work on this machine */
+static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata;
+
+static int __initdata testcase_total;
+static int __initdata testcase_successes;
+static int __initdata expected_testcase_failures;
+static int __initdata unexpected_testcase_failures;
+static int __initdata unexpected_testcase_unknowns;
+
+static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs)
+{
+ unexpected_testcase_unknowns++;
+ return NMI_HANDLED;
+}
+
+static void __init init_nmi_testsuite(void)
+{
+ /* trap all the unknown NMIs we may generate */
+ register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk",
+ __initdata);
+}
+
+static void __init cleanup_nmi_testsuite(void)
+{
+ unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk");
+}
+
+static int __init test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
+{
+ int cpu = raw_smp_processor_id();
+
+ if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask)))
+ return NMI_HANDLED;
+
+ return NMI_DONE;
+}
+
+static void __init test_nmi_ipi(struct cpumask *mask)
+{
+ unsigned long timeout;
+
+ if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
+ NMI_FLAG_FIRST, "nmi_selftest", __initdata)) {
+ nmi_fail = FAILURE;
+ return;
+ }
+
+ /* sync above data before sending NMI */
+ wmb();
+
+ apic->send_IPI_mask(mask, NMI_VECTOR);
+
+ /* Don't wait longer than a second */
+ timeout = USEC_PER_SEC;
+ while (!cpumask_empty(mask) && --timeout)
+ udelay(1);
+
+ /* What happens if we timeout, do we still unregister?? */
+ unregister_nmi_handler(NMI_LOCAL, "nmi_selftest");
+
+ if (!timeout)
+ nmi_fail = TIMEOUT;
+ return;
+}
+
+static void __init remote_ipi(void)
+{
+ cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
+ if (!cpumask_empty(to_cpumask(nmi_ipi_mask)))
+ test_nmi_ipi(to_cpumask(nmi_ipi_mask));
+}
+
+static void __init local_ipi(void)
+{
+ cpumask_clear(to_cpumask(nmi_ipi_mask));
+ cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
+ test_nmi_ipi(to_cpumask(nmi_ipi_mask));
+}
+
+static void __init reset_nmi(void)
+{
+ nmi_fail = 0;
+}
+
+static void __init dotest(void (*testcase_fn)(void), int expected)
+{
+ testcase_fn();
+ /*
+ * Filter out expected failures:
+ */
+ if (nmi_fail != expected) {
+ unexpected_testcase_failures++;
+
+ if (nmi_fail == FAILURE)
+ printk(KERN_CONT "FAILED |");
+ else if (nmi_fail == TIMEOUT)
+ printk(KERN_CONT "TIMEOUT|");
+ else
+ printk(KERN_CONT "ERROR |");
+ dump_stack();
+ } else {
+ testcase_successes++;
+ printk(KERN_CONT " ok |");
+ }
+ testcase_total++;
+
+ reset_nmi();
+}
+
+static inline void __init print_testname(const char *testname)
+{
+ printk("%12s:", testname);
+}
+
+void __init nmi_selftest(void)
+{
+ init_nmi_testsuite();
+
+ /*
+ * Run the testsuite:
+ */
+ printk("----------------\n");
+ printk("| NMI testsuite:\n");
+ printk("--------------------\n");
+
+ print_testname("remote IPI");
+ dotest(remote_ipi, SUCCESS);
+ printk(KERN_CONT "\n");
+ print_testname("local IPI");
+ dotest(local_ipi, SUCCESS);
+ printk(KERN_CONT "\n");
+
+ cleanup_nmi_testsuite();
+
+ if (unexpected_testcase_failures) {
+ printk("--------------------\n");
+ printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n",
+ unexpected_testcase_failures, testcase_total);
+ printk("-----------------------------------------------------------------\n");
+ } else if (expected_testcase_failures && testcase_successes) {
+ printk("--------------------\n");
+ printk("%3d out of %3d testcases failed, as expected. |\n",
+ expected_testcase_failures, testcase_total);
+ printk("----------------------------------------------------\n");
+ } else if (expected_testcase_failures && !testcase_successes) {
+ printk("--------------------\n");
+ printk("All %3d testcases failed, as expected. |\n",
+ expected_testcase_failures);
+ printk("----------------------------------------\n");
+ } else {
+ printk("--------------------\n");
+ printk("Good, all %3d testcases passed! |\n",
+ testcase_successes);
+ printk("---------------------------------\n");
+ }
+}
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
new file mode 100644
index 000000000..71f2d1125
--- /dev/null
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Split spinlock implementation out into its own file, so it can be
+ * compiled in a FTRACE-compatible way.
+ */
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/jump_label.h>
+
+#include <asm/paravirt.h>
+
+__visible void __native_queued_spin_unlock(struct qspinlock *lock)
+{
+ native_queued_spin_unlock(lock);
+}
+PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock);
+
+bool pv_is_native_spin_unlock(void)
+{
+ return pv_lock_ops.queued_spin_unlock.func ==
+ __raw_callee_save___native_queued_spin_unlock;
+}
+
+__visible bool __native_vcpu_is_preempted(long cpu)
+{
+ return false;
+}
+PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted);
+
+bool pv_is_native_vcpu_is_preempted(void)
+{
+ return pv_lock_ops.vcpu_is_preempted.func ==
+ __raw_callee_save___native_vcpu_is_preempted;
+}
+
+struct pv_lock_ops pv_lock_ops = {
+#ifdef CONFIG_SMP
+ .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
+ .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
+ .wait = paravirt_nop,
+ .kick = paravirt_nop,
+ .vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted),
+#endif /* SMP */
+};
+EXPORT_SYMBOL(pv_lock_ops);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
new file mode 100644
index 000000000..8dc69d825
--- /dev/null
+++ b/arch/x86/kernel/paravirt.c
@@ -0,0 +1,483 @@
+/* Paravirtualization interfaces
+ Copyright (C) 2006 Rusty Russell IBM Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+ 2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc
+*/
+
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/efi.h>
+#include <linux/bcd.h>
+#include <linux/highmem.h>
+#include <linux/kprobes.h>
+
+#include <asm/bug.h>
+#include <asm/paravirt.h>
+#include <asm/debugreg.h>
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/pgtable.h>
+#include <asm/time.h>
+#include <asm/pgalloc.h>
+#include <asm/irq.h>
+#include <asm/delay.h>
+#include <asm/fixmap.h>
+#include <asm/apic.h>
+#include <asm/tlbflush.h>
+#include <asm/timer.h>
+#include <asm/special_insns.h>
+#include <asm/tlb.h>
+
+/*
+ * nop stub, which must not clobber anything *including the stack* to
+ * avoid confusing the entry prologues.
+ */
+extern void _paravirt_nop(void);
+asm (".pushsection .entry.text, \"ax\"\n"
+ ".global _paravirt_nop\n"
+ "_paravirt_nop:\n\t"
+ "ret\n\t"
+ ".size _paravirt_nop, . - _paravirt_nop\n\t"
+ ".type _paravirt_nop, @function\n\t"
+ ".popsection");
+
+/* identity function, which can be inlined */
+u32 notrace _paravirt_ident_32(u32 x)
+{
+ return x;
+}
+
+u64 notrace _paravirt_ident_64(u64 x)
+{
+ return x;
+}
+
+void __init default_banner(void)
+{
+ printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
+ pv_info.name);
+}
+
+/* Undefined instruction for dealing with missing ops pointers. */
+static const unsigned char ud2a[] = { 0x0f, 0x0b };
+
+struct branch {
+ unsigned char opcode;
+ u32 delta;
+} __attribute__((packed));
+
+unsigned paravirt_patch_call(void *insnbuf,
+ const void *target, u16 tgt_clobbers,
+ unsigned long addr, u16 site_clobbers,
+ unsigned len)
+{
+ struct branch *b = insnbuf;
+ unsigned long delta = (unsigned long)target - (addr+5);
+
+ if (len < 5) {
+#ifdef CONFIG_RETPOLINE
+ WARN_ONCE(1, "Failing to patch indirect CALL in %ps\n", (void *)addr);
+#endif
+ return len; /* call too long for patch site */
+ }
+
+ b->opcode = 0xe8; /* call */
+ b->delta = delta;
+ BUILD_BUG_ON(sizeof(*b) != 5);
+
+ return 5;
+}
+
+unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
+ unsigned long addr, unsigned len)
+{
+ struct branch *b = insnbuf;
+ unsigned long delta = (unsigned long)target - (addr+5);
+
+ if (len < 5) {
+#ifdef CONFIG_RETPOLINE
+ WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr);
+#endif
+ return len; /* call too long for patch site */
+ }
+
+ b->opcode = 0xe9; /* jmp */
+ b->delta = delta;
+
+ return 5;
+}
+
+DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
+
+void __init native_pv_lock_init(void)
+{
+ if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
+ static_branch_disable(&virt_spin_lock_key);
+}
+
+/*
+ * Neat trick to map patch type back to the call within the
+ * corresponding structure.
+ */
+static void *get_call_destination(u8 type)
+{
+ struct paravirt_patch_template tmpl = {
+ .pv_init_ops = pv_init_ops,
+ .pv_time_ops = pv_time_ops,
+ .pv_cpu_ops = pv_cpu_ops,
+ .pv_irq_ops = pv_irq_ops,
+ .pv_mmu_ops = pv_mmu_ops,
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+ .pv_lock_ops = pv_lock_ops,
+#endif
+ };
+ return *((void **)&tmpl + type);
+}
+
+unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
+ unsigned long addr, unsigned len)
+{
+ void *opfunc = get_call_destination(type);
+ unsigned ret;
+
+ if (opfunc == NULL)
+ /* If there's no function, patch it with a ud2a (BUG) */
+ ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
+ else if (opfunc == _paravirt_nop)
+ ret = 0;
+
+ /* identity functions just return their single argument */
+ else if (opfunc == _paravirt_ident_32)
+ ret = paravirt_patch_ident_32(insnbuf, len);
+ else if (opfunc == _paravirt_ident_64)
+ ret = paravirt_patch_ident_64(insnbuf, len);
+
+ else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
+ type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64))
+ /* If operation requires a jmp, then jmp */
+ ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
+ else
+ /* Otherwise call the function; assume target could
+ clobber any caller-save reg */
+ ret = paravirt_patch_call(insnbuf, opfunc, CLBR_ANY,
+ addr, clobbers, len);
+
+ return ret;
+}
+
+unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
+ const char *start, const char *end)
+{
+ unsigned insn_len = end - start;
+
+ if (insn_len > len || start == NULL)
+ insn_len = len;
+ else
+ memcpy(insnbuf, start, insn_len);
+
+ return insn_len;
+}
+
+static void native_flush_tlb(void)
+{
+ __native_flush_tlb();
+}
+
+/*
+ * Global pages have to be flushed a bit differently. Not a real
+ * performance problem because this does not happen often.
+ */
+static void native_flush_tlb_global(void)
+{
+ __native_flush_tlb_global();
+}
+
+static void native_flush_tlb_one_user(unsigned long addr)
+{
+ __native_flush_tlb_one_user(addr);
+}
+
+struct static_key paravirt_steal_enabled;
+struct static_key paravirt_steal_rq_enabled;
+
+static u64 native_steal_clock(int cpu)
+{
+ return 0;
+}
+
+/* These are in entry.S */
+extern void native_iret(void);
+extern void native_usergs_sysret64(void);
+
+static struct resource reserve_ioports = {
+ .start = 0,
+ .end = IO_SPACE_LIMIT,
+ .name = "paravirt-ioport",
+ .flags = IORESOURCE_IO | IORESOURCE_BUSY,
+};
+
+/*
+ * Reserve the whole legacy IO space to prevent any legacy drivers
+ * from wasting time probing for their hardware. This is a fairly
+ * brute-force approach to disabling all non-virtual drivers.
+ *
+ * Note that this must be called very early to have any effect.
+ */
+int paravirt_disable_iospace(void)
+{
+ return request_resource(&ioport_resource, &reserve_ioports);
+}
+
+static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
+
+static inline void enter_lazy(enum paravirt_lazy_mode mode)
+{
+ BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
+
+ this_cpu_write(paravirt_lazy_mode, mode);
+}
+
+static void leave_lazy(enum paravirt_lazy_mode mode)
+{
+ BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode);
+
+ this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
+}
+
+void paravirt_enter_lazy_mmu(void)
+{
+ enter_lazy(PARAVIRT_LAZY_MMU);
+}
+
+void paravirt_leave_lazy_mmu(void)
+{
+ leave_lazy(PARAVIRT_LAZY_MMU);
+}
+
+void paravirt_flush_lazy_mmu(void)
+{
+ preempt_disable();
+
+ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
+ arch_leave_lazy_mmu_mode();
+ arch_enter_lazy_mmu_mode();
+ }
+
+ preempt_enable();
+}
+
+void paravirt_start_context_switch(struct task_struct *prev)
+{
+ BUG_ON(preemptible());
+
+ if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
+ arch_leave_lazy_mmu_mode();
+ set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
+ }
+ enter_lazy(PARAVIRT_LAZY_CPU);
+}
+
+void paravirt_end_context_switch(struct task_struct *next)
+{
+ BUG_ON(preemptible());
+
+ leave_lazy(PARAVIRT_LAZY_CPU);
+
+ if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
+ arch_enter_lazy_mmu_mode();
+}
+
+enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
+{
+ if (in_interrupt())
+ return PARAVIRT_LAZY_NONE;
+
+ return this_cpu_read(paravirt_lazy_mode);
+}
+
+struct pv_info pv_info = {
+ .name = "bare hardware",
+ .kernel_rpl = 0,
+ .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
+
+#ifdef CONFIG_X86_64
+ .extra_user_64bit_cs = __USER_CS,
+#endif
+};
+
+struct pv_init_ops pv_init_ops = {
+ .patch = native_patch,
+};
+
+struct pv_time_ops pv_time_ops = {
+ .sched_clock = native_sched_clock,
+ .steal_clock = native_steal_clock,
+};
+
+__visible struct pv_irq_ops pv_irq_ops = {
+ .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
+ .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
+ .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
+ .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
+ .safe_halt = native_safe_halt,
+ .halt = native_halt,
+};
+
+__visible struct pv_cpu_ops pv_cpu_ops = {
+ .cpuid = native_cpuid,
+ .get_debugreg = native_get_debugreg,
+ .set_debugreg = native_set_debugreg,
+ .read_cr0 = native_read_cr0,
+ .write_cr0 = native_write_cr0,
+ .write_cr4 = native_write_cr4,
+#ifdef CONFIG_X86_64
+ .read_cr8 = native_read_cr8,
+ .write_cr8 = native_write_cr8,
+#endif
+ .wbinvd = native_wbinvd,
+ .read_msr = native_read_msr,
+ .write_msr = native_write_msr,
+ .read_msr_safe = native_read_msr_safe,
+ .write_msr_safe = native_write_msr_safe,
+ .read_pmc = native_read_pmc,
+ .load_tr_desc = native_load_tr_desc,
+ .set_ldt = native_set_ldt,
+ .load_gdt = native_load_gdt,
+ .load_idt = native_load_idt,
+ .store_tr = native_store_tr,
+ .load_tls = native_load_tls,
+#ifdef CONFIG_X86_64
+ .load_gs_index = native_load_gs_index,
+#endif
+ .write_ldt_entry = native_write_ldt_entry,
+ .write_gdt_entry = native_write_gdt_entry,
+ .write_idt_entry = native_write_idt_entry,
+
+ .alloc_ldt = paravirt_nop,
+ .free_ldt = paravirt_nop,
+
+ .load_sp0 = native_load_sp0,
+
+#ifdef CONFIG_X86_64
+ .usergs_sysret64 = native_usergs_sysret64,
+#endif
+ .iret = native_iret,
+ .swapgs = native_swapgs,
+
+ .set_iopl_mask = native_set_iopl_mask,
+ .io_delay = native_io_delay,
+
+ .start_context_switch = paravirt_nop,
+ .end_context_switch = paravirt_nop,
+};
+
+/* At this point, native_get/set_debugreg has real function entries */
+NOKPROBE_SYMBOL(native_get_debugreg);
+NOKPROBE_SYMBOL(native_set_debugreg);
+NOKPROBE_SYMBOL(native_load_idt);
+
+#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
+/* 32-bit pagetable entries */
+#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32)
+#else
+/* 64-bit pagetable entries */
+#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
+#endif
+
+struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
+
+ .read_cr2 = native_read_cr2,
+ .write_cr2 = native_write_cr2,
+ .read_cr3 = __native_read_cr3,
+ .write_cr3 = native_write_cr3,
+
+ .flush_tlb_user = native_flush_tlb,
+ .flush_tlb_kernel = native_flush_tlb_global,
+ .flush_tlb_one_user = native_flush_tlb_one_user,
+ .flush_tlb_others = native_flush_tlb_others,
+ .tlb_remove_table = (void (*)(struct mmu_gather *, void *))tlb_remove_page,
+
+ .pgd_alloc = __paravirt_pgd_alloc,
+ .pgd_free = paravirt_nop,
+
+ .alloc_pte = paravirt_nop,
+ .alloc_pmd = paravirt_nop,
+ .alloc_pud = paravirt_nop,
+ .alloc_p4d = paravirt_nop,
+ .release_pte = paravirt_nop,
+ .release_pmd = paravirt_nop,
+ .release_pud = paravirt_nop,
+ .release_p4d = paravirt_nop,
+
+ .set_pte = native_set_pte,
+ .set_pte_at = native_set_pte_at,
+ .set_pmd = native_set_pmd,
+
+ .ptep_modify_prot_start = __ptep_modify_prot_start,
+ .ptep_modify_prot_commit = __ptep_modify_prot_commit,
+
+#if CONFIG_PGTABLE_LEVELS >= 3
+#ifdef CONFIG_X86_PAE
+ .set_pte_atomic = native_set_pte_atomic,
+ .pte_clear = native_pte_clear,
+ .pmd_clear = native_pmd_clear,
+#endif
+ .set_pud = native_set_pud,
+
+ .pmd_val = PTE_IDENT,
+ .make_pmd = PTE_IDENT,
+
+#if CONFIG_PGTABLE_LEVELS >= 4
+ .pud_val = PTE_IDENT,
+ .make_pud = PTE_IDENT,
+
+ .set_p4d = native_set_p4d,
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+ .p4d_val = PTE_IDENT,
+ .make_p4d = PTE_IDENT,
+
+ .set_pgd = native_set_pgd,
+#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
+#endif /* CONFIG_PGTABLE_LEVELS >= 4 */
+#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
+
+ .pte_val = PTE_IDENT,
+ .pgd_val = PTE_IDENT,
+
+ .make_pte = PTE_IDENT,
+ .make_pgd = PTE_IDENT,
+
+ .dup_mmap = paravirt_nop,
+ .exit_mmap = paravirt_nop,
+ .activate_mm = paravirt_nop,
+
+ .lazy_mode = {
+ .enter = paravirt_nop,
+ .leave = paravirt_nop,
+ .flush = paravirt_nop,
+ },
+
+ .set_fixmap = native_set_fixmap,
+};
+
+EXPORT_SYMBOL_GPL(pv_time_ops);
+EXPORT_SYMBOL (pv_cpu_ops);
+EXPORT_SYMBOL (pv_mmu_ops);
+EXPORT_SYMBOL_GPL(pv_info);
+EXPORT_SYMBOL (pv_irq_ops);
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
new file mode 100644
index 000000000..758e69d72
--- /dev/null
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/paravirt.h>
+
+DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
+DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
+DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
+DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
+DEF_NATIVE(pv_cpu_ops, iret, "iret");
+DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
+DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
+DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
+
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
+DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax");
+#endif
+
+unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
+{
+ /* arg in %eax, return in %eax */
+ return 0;
+}
+
+unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
+{
+ /* arg in %edx:%eax, return in %edx:%eax */
+ return 0;
+}
+
+extern bool pv_is_native_spin_unlock(void);
+extern bool pv_is_native_vcpu_is_preempted(void);
+
+unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
+ unsigned long addr, unsigned len)
+{
+ const unsigned char *start, *end;
+ unsigned ret;
+
+#define PATCH_SITE(ops, x) \
+ case PARAVIRT_PATCH(ops.x): \
+ start = start_##ops##_##x; \
+ end = end_##ops##_##x; \
+ goto patch_site
+ switch (type) {
+ PATCH_SITE(pv_irq_ops, irq_disable);
+ PATCH_SITE(pv_irq_ops, irq_enable);
+ PATCH_SITE(pv_irq_ops, restore_fl);
+ PATCH_SITE(pv_irq_ops, save_fl);
+ PATCH_SITE(pv_cpu_ops, iret);
+ PATCH_SITE(pv_mmu_ops, read_cr2);
+ PATCH_SITE(pv_mmu_ops, read_cr3);
+ PATCH_SITE(pv_mmu_ops, write_cr3);
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
+ case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
+ if (pv_is_native_spin_unlock()) {
+ start = start_pv_lock_ops_queued_spin_unlock;
+ end = end_pv_lock_ops_queued_spin_unlock;
+ goto patch_site;
+ }
+ goto patch_default;
+
+ case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted):
+ if (pv_is_native_vcpu_is_preempted()) {
+ start = start_pv_lock_ops_vcpu_is_preempted;
+ end = end_pv_lock_ops_vcpu_is_preempted;
+ goto patch_site;
+ }
+ goto patch_default;
+#endif
+
+ default:
+patch_default: __maybe_unused
+ ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
+ break;
+
+patch_site:
+ ret = paravirt_patch_insns(ibuf, len, start, end);
+ break;
+ }
+#undef PATCH_SITE
+ return ret;
+}
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
new file mode 100644
index 000000000..9cb98f7b0
--- /dev/null
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/paravirt.h>
+#include <asm/asm-offsets.h>
+#include <linux/stringify.h>
+
+DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
+DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
+DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
+DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
+DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
+DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
+DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
+DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
+
+DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
+DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
+
+DEF_NATIVE(, mov32, "mov %edi, %eax");
+DEF_NATIVE(, mov64, "mov %rdi, %rax");
+
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
+DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax");
+#endif
+
+unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
+{
+ return paravirt_patch_insns(insnbuf, len,
+ start__mov32, end__mov32);
+}
+
+unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
+{
+ return paravirt_patch_insns(insnbuf, len,
+ start__mov64, end__mov64);
+}
+
+extern bool pv_is_native_spin_unlock(void);
+extern bool pv_is_native_vcpu_is_preempted(void);
+
+unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
+ unsigned long addr, unsigned len)
+{
+ const unsigned char *start, *end;
+ unsigned ret;
+
+#define PATCH_SITE(ops, x) \
+ case PARAVIRT_PATCH(ops.x): \
+ start = start_##ops##_##x; \
+ end = end_##ops##_##x; \
+ goto patch_site
+ switch(type) {
+ PATCH_SITE(pv_irq_ops, restore_fl);
+ PATCH_SITE(pv_irq_ops, save_fl);
+ PATCH_SITE(pv_irq_ops, irq_enable);
+ PATCH_SITE(pv_irq_ops, irq_disable);
+ PATCH_SITE(pv_cpu_ops, usergs_sysret64);
+ PATCH_SITE(pv_cpu_ops, swapgs);
+ PATCH_SITE(pv_mmu_ops, read_cr2);
+ PATCH_SITE(pv_mmu_ops, read_cr3);
+ PATCH_SITE(pv_mmu_ops, write_cr3);
+ PATCH_SITE(pv_cpu_ops, wbinvd);
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
+ case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
+ if (pv_is_native_spin_unlock()) {
+ start = start_pv_lock_ops_queued_spin_unlock;
+ end = end_pv_lock_ops_queued_spin_unlock;
+ goto patch_site;
+ }
+ goto patch_default;
+
+ case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted):
+ if (pv_is_native_vcpu_is_preempted()) {
+ start = start_pv_lock_ops_vcpu_is_preempted;
+ end = end_pv_lock_ops_vcpu_is_preempted;
+ goto patch_site;
+ }
+ goto patch_default;
+#endif
+
+ default:
+patch_default: __maybe_unused
+ ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
+ break;
+
+patch_site:
+ ret = paravirt_patch_insns(ibuf, len, start, end);
+ break;
+ }
+#undef PATCH_SITE
+ return ret;
+}
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
new file mode 100644
index 000000000..bbfc8b1e9
--- /dev/null
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -0,0 +1,1612 @@
+/*
+ * Derived from arch/powerpc/kernel/iommu.c
+ *
+ * Copyright IBM Corporation, 2006-2007
+ * Copyright (C) 2006 Jon Mason <jdmason@kudzu.us>
+ *
+ * Author: Jon Mason <jdmason@kudzu.us>
+ * Author: Muli Ben-Yehuda <muli@il.ibm.com>
+
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#define pr_fmt(fmt) "Calgary: " fmt
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/crash_dump.h>
+#include <linux/dma-mapping.h>
+#include <linux/dma-direct.h>
+#include <linux/bitmap.h>
+#include <linux/pci_ids.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/scatterlist.h>
+#include <linux/iommu-helper.h>
+
+#include <asm/iommu.h>
+#include <asm/calgary.h>
+#include <asm/tce.h>
+#include <asm/pci-direct.h>
+#include <asm/dma.h>
+#include <asm/rio.h>
+#include <asm/bios_ebda.h>
+#include <asm/x86_init.h>
+#include <asm/iommu_table.h>
+
+#define CALGARY_MAPPING_ERROR 0
+
+#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
+int use_calgary __read_mostly = 1;
+#else
+int use_calgary __read_mostly = 0;
+#endif /* CONFIG_CALGARY_DEFAULT_ENABLED */
+
+#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1
+#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308
+
+/* register offsets inside the host bridge space */
+#define CALGARY_CONFIG_REG 0x0108
+#define PHB_CSR_OFFSET 0x0110 /* Channel Status */
+#define PHB_PLSSR_OFFSET 0x0120
+#define PHB_CONFIG_RW_OFFSET 0x0160
+#define PHB_IOBASE_BAR_LOW 0x0170
+#define PHB_IOBASE_BAR_HIGH 0x0180
+#define PHB_MEM_1_LOW 0x0190
+#define PHB_MEM_1_HIGH 0x01A0
+#define PHB_IO_ADDR_SIZE 0x01B0
+#define PHB_MEM_1_SIZE 0x01C0
+#define PHB_MEM_ST_OFFSET 0x01D0
+#define PHB_AER_OFFSET 0x0200
+#define PHB_CONFIG_0_HIGH 0x0220
+#define PHB_CONFIG_0_LOW 0x0230
+#define PHB_CONFIG_0_END 0x0240
+#define PHB_MEM_2_LOW 0x02B0
+#define PHB_MEM_2_HIGH 0x02C0
+#define PHB_MEM_2_SIZE_HIGH 0x02D0
+#define PHB_MEM_2_SIZE_LOW 0x02E0
+#define PHB_DOSHOLE_OFFSET 0x08E0
+
+/* CalIOC2 specific */
+#define PHB_SAVIOR_L2 0x0DB0
+#define PHB_PAGE_MIG_CTRL 0x0DA8
+#define PHB_PAGE_MIG_DEBUG 0x0DA0
+#define PHB_ROOT_COMPLEX_STATUS 0x0CB0
+
+/* PHB_CONFIG_RW */
+#define PHB_TCE_ENABLE 0x20000000
+#define PHB_SLOT_DISABLE 0x1C000000
+#define PHB_DAC_DISABLE 0x01000000
+#define PHB_MEM2_ENABLE 0x00400000
+#define PHB_MCSR_ENABLE 0x00100000
+/* TAR (Table Address Register) */
+#define TAR_SW_BITS 0x0000ffffffff800fUL
+#define TAR_VALID 0x0000000000000008UL
+/* CSR (Channel/DMA Status Register) */
+#define CSR_AGENT_MASK 0xffe0ffff
+/* CCR (Calgary Configuration Register) */
+#define CCR_2SEC_TIMEOUT 0x000000000000000EUL
+/* PMCR/PMDR (Page Migration Control/Debug Registers */
+#define PMR_SOFTSTOP 0x80000000
+#define PMR_SOFTSTOPFAULT 0x40000000
+#define PMR_HARDSTOP 0x20000000
+
+/*
+ * The maximum PHB bus number.
+ * x3950M2 (rare): 8 chassis, 48 PHBs per chassis = 384
+ * x3950M2: 4 chassis, 48 PHBs per chassis = 192
+ * x3950 (PCIE): 8 chassis, 32 PHBs per chassis = 256
+ * x3950 (PCIX): 8 chassis, 16 PHBs per chassis = 128
+ */
+#define MAX_PHB_BUS_NUM 256
+
+#define PHBS_PER_CALGARY 4
+
+/* register offsets in Calgary's internal register space */
+static const unsigned long tar_offsets[] = {
+ 0x0580 /* TAR0 */,
+ 0x0588 /* TAR1 */,
+ 0x0590 /* TAR2 */,
+ 0x0598 /* TAR3 */
+};
+
+static const unsigned long split_queue_offsets[] = {
+ 0x4870 /* SPLIT QUEUE 0 */,
+ 0x5870 /* SPLIT QUEUE 1 */,
+ 0x6870 /* SPLIT QUEUE 2 */,
+ 0x7870 /* SPLIT QUEUE 3 */
+};
+
+static const unsigned long phb_offsets[] = {
+ 0x8000 /* PHB0 */,
+ 0x9000 /* PHB1 */,
+ 0xA000 /* PHB2 */,
+ 0xB000 /* PHB3 */
+};
+
+/* PHB debug registers */
+
+static const unsigned long phb_debug_offsets[] = {
+ 0x4000 /* PHB 0 DEBUG */,
+ 0x5000 /* PHB 1 DEBUG */,
+ 0x6000 /* PHB 2 DEBUG */,
+ 0x7000 /* PHB 3 DEBUG */
+};
+
+/*
+ * STUFF register for each debug PHB,
+ * byte 1 = start bus number, byte 2 = end bus number
+ */
+
+#define PHB_DEBUG_STUFF_OFFSET 0x0020
+
+#define EMERGENCY_PAGES 32 /* = 128KB */
+
+unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
+static int translate_empty_slots __read_mostly = 0;
+static int calgary_detected __read_mostly = 0;
+
+static struct rio_table_hdr *rio_table_hdr __initdata;
+static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
+static struct rio_detail *rio_devs[MAX_NUMNODES * 4] __initdata;
+
+struct calgary_bus_info {
+ void *tce_space;
+ unsigned char translation_disabled;
+ signed char phbid;
+ void __iomem *bbar;
+};
+
+static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
+static void calgary_tce_cache_blast(struct iommu_table *tbl);
+static void calgary_dump_error_regs(struct iommu_table *tbl);
+static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
+static void calioc2_tce_cache_blast(struct iommu_table *tbl);
+static void calioc2_dump_error_regs(struct iommu_table *tbl);
+static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl);
+static void get_tce_space_from_tar(void);
+
+static const struct cal_chipset_ops calgary_chip_ops = {
+ .handle_quirks = calgary_handle_quirks,
+ .tce_cache_blast = calgary_tce_cache_blast,
+ .dump_error_regs = calgary_dump_error_regs
+};
+
+static const struct cal_chipset_ops calioc2_chip_ops = {
+ .handle_quirks = calioc2_handle_quirks,
+ .tce_cache_blast = calioc2_tce_cache_blast,
+ .dump_error_regs = calioc2_dump_error_regs
+};
+
+static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
+
+static inline int translation_enabled(struct iommu_table *tbl)
+{
+ /* only PHBs with translation enabled have an IOMMU table */
+ return (tbl != NULL);
+}
+
+static void iommu_range_reserve(struct iommu_table *tbl,
+ unsigned long start_addr, unsigned int npages)
+{
+ unsigned long index;
+ unsigned long end;
+ unsigned long flags;
+
+ index = start_addr >> PAGE_SHIFT;
+
+ /* bail out if we're asked to reserve a region we don't cover */
+ if (index >= tbl->it_size)
+ return;
+
+ end = index + npages;
+ if (end > tbl->it_size) /* don't go off the table */
+ end = tbl->it_size;
+
+ spin_lock_irqsave(&tbl->it_lock, flags);
+
+ bitmap_set(tbl->it_map, index, npages);
+
+ spin_unlock_irqrestore(&tbl->it_lock, flags);
+}
+
+static unsigned long iommu_range_alloc(struct device *dev,
+ struct iommu_table *tbl,
+ unsigned int npages)
+{
+ unsigned long flags;
+ unsigned long offset;
+ unsigned long boundary_size;
+
+ boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+ PAGE_SIZE) >> PAGE_SHIFT;
+
+ BUG_ON(npages == 0);
+
+ spin_lock_irqsave(&tbl->it_lock, flags);
+
+ offset = iommu_area_alloc(tbl->it_map, tbl->it_size, tbl->it_hint,
+ npages, 0, boundary_size, 0);
+ if (offset == ~0UL) {
+ tbl->chip_ops->tce_cache_blast(tbl);
+
+ offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0,
+ npages, 0, boundary_size, 0);
+ if (offset == ~0UL) {
+ pr_warn("IOMMU full\n");
+ spin_unlock_irqrestore(&tbl->it_lock, flags);
+ if (panic_on_overflow)
+ panic("Calgary: fix the allocator.\n");
+ else
+ return CALGARY_MAPPING_ERROR;
+ }
+ }
+
+ tbl->it_hint = offset + npages;
+ BUG_ON(tbl->it_hint > tbl->it_size);
+
+ spin_unlock_irqrestore(&tbl->it_lock, flags);
+
+ return offset;
+}
+
+static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
+ void *vaddr, unsigned int npages, int direction)
+{
+ unsigned long entry;
+ dma_addr_t ret;
+
+ entry = iommu_range_alloc(dev, tbl, npages);
+
+ if (unlikely(entry == CALGARY_MAPPING_ERROR)) {
+ pr_warn("failed to allocate %u pages in iommu %p\n",
+ npages, tbl);
+ return CALGARY_MAPPING_ERROR;
+ }
+
+ /* set the return dma address */
+ ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
+
+ /* put the TCEs in the HW table */
+ tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
+ direction);
+ return ret;
+}
+
+static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+ unsigned int npages)
+{
+ unsigned long entry;
+ unsigned long badend;
+ unsigned long flags;
+
+ /* were we called with bad_dma_address? */
+ badend = CALGARY_MAPPING_ERROR + (EMERGENCY_PAGES * PAGE_SIZE);
+ if (unlikely(dma_addr < badend)) {
+ WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
+ "address 0x%Lx\n", dma_addr);
+ return;
+ }
+
+ entry = dma_addr >> PAGE_SHIFT;
+
+ BUG_ON(entry + npages > tbl->it_size);
+
+ tce_free(tbl, entry, npages);
+
+ spin_lock_irqsave(&tbl->it_lock, flags);
+
+ bitmap_clear(tbl->it_map, entry, npages);
+
+ spin_unlock_irqrestore(&tbl->it_lock, flags);
+}
+
+static inline struct iommu_table *find_iommu_table(struct device *dev)
+{
+ struct pci_dev *pdev;
+ struct pci_bus *pbus;
+ struct iommu_table *tbl;
+
+ pdev = to_pci_dev(dev);
+
+ /* search up the device tree for an iommu */
+ pbus = pdev->bus;
+ do {
+ tbl = pci_iommu(pbus);
+ if (tbl && tbl->it_busno == pbus->number)
+ break;
+ tbl = NULL;
+ pbus = pbus->parent;
+ } while (pbus);
+
+ BUG_ON(tbl && (tbl->it_busno != pbus->number));
+
+ return tbl;
+}
+
+static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist,
+ int nelems,enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ struct iommu_table *tbl = find_iommu_table(dev);
+ struct scatterlist *s;
+ int i;
+
+ if (!translation_enabled(tbl))
+ return;
+
+ for_each_sg(sglist, s, nelems, i) {
+ unsigned int npages;
+ dma_addr_t dma = s->dma_address;
+ unsigned int dmalen = s->dma_length;
+
+ if (dmalen == 0)
+ break;
+
+ npages = iommu_num_pages(dma, dmalen, PAGE_SIZE);
+ iommu_free(tbl, dma, npages);
+ }
+}
+
+static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ struct iommu_table *tbl = find_iommu_table(dev);
+ struct scatterlist *s;
+ unsigned long vaddr;
+ unsigned int npages;
+ unsigned long entry;
+ int i;
+
+ for_each_sg(sg, s, nelems, i) {
+ BUG_ON(!sg_page(s));
+
+ vaddr = (unsigned long) sg_virt(s);
+ npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
+
+ entry = iommu_range_alloc(dev, tbl, npages);
+ if (entry == CALGARY_MAPPING_ERROR) {
+ /* makes sure unmap knows to stop */
+ s->dma_length = 0;
+ goto error;
+ }
+
+ s->dma_address = (entry << PAGE_SHIFT) | s->offset;
+
+ /* insert into HW table */
+ tce_build(tbl, entry, npages, vaddr & PAGE_MASK, dir);
+
+ s->dma_length = s->length;
+ }
+
+ return nelems;
+error:
+ calgary_unmap_sg(dev, sg, nelems, dir, 0);
+ for_each_sg(sg, s, nelems, i) {
+ sg->dma_address = CALGARY_MAPPING_ERROR;
+ sg->dma_length = 0;
+ }
+ return 0;
+}
+
+static dma_addr_t calgary_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ void *vaddr = page_address(page) + offset;
+ unsigned long uaddr;
+ unsigned int npages;
+ struct iommu_table *tbl = find_iommu_table(dev);
+
+ uaddr = (unsigned long)vaddr;
+ npages = iommu_num_pages(uaddr, size, PAGE_SIZE);
+
+ return iommu_alloc(dev, tbl, vaddr, npages, dir);
+}
+
+static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr,
+ size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ struct iommu_table *tbl = find_iommu_table(dev);
+ unsigned int npages;
+
+ npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
+ iommu_free(tbl, dma_addr, npages);
+}
+
+static void* calgary_alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs)
+{
+ void *ret = NULL;
+ dma_addr_t mapping;
+ unsigned int npages, order;
+ struct iommu_table *tbl = find_iommu_table(dev);
+
+ size = PAGE_ALIGN(size); /* size rounded up to full pages */
+ npages = size >> PAGE_SHIFT;
+ order = get_order(size);
+
+ /* alloc enough pages (and possibly more) */
+ ret = (void *)__get_free_pages(flag, order);
+ if (!ret)
+ goto error;
+ memset(ret, 0, size);
+
+ /* set up tces to cover the allocated range */
+ mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
+ if (mapping == CALGARY_MAPPING_ERROR)
+ goto free;
+ *dma_handle = mapping;
+ return ret;
+free:
+ free_pages((unsigned long)ret, get_order(size));
+ ret = NULL;
+error:
+ return ret;
+}
+
+static void calgary_free_coherent(struct device *dev, size_t size,
+ void *vaddr, dma_addr_t dma_handle,
+ unsigned long attrs)
+{
+ unsigned int npages;
+ struct iommu_table *tbl = find_iommu_table(dev);
+
+ size = PAGE_ALIGN(size);
+ npages = size >> PAGE_SHIFT;
+
+ iommu_free(tbl, dma_handle, npages);
+ free_pages((unsigned long)vaddr, get_order(size));
+}
+
+static int calgary_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+ return dma_addr == CALGARY_MAPPING_ERROR;
+}
+
+static const struct dma_map_ops calgary_dma_ops = {
+ .alloc = calgary_alloc_coherent,
+ .free = calgary_free_coherent,
+ .map_sg = calgary_map_sg,
+ .unmap_sg = calgary_unmap_sg,
+ .map_page = calgary_map_page,
+ .unmap_page = calgary_unmap_page,
+ .mapping_error = calgary_mapping_error,
+ .dma_supported = dma_direct_supported,
+};
+
+static inline void __iomem * busno_to_bbar(unsigned char num)
+{
+ return bus_info[num].bbar;
+}
+
+static inline int busno_to_phbid(unsigned char num)
+{
+ return bus_info[num].phbid;
+}
+
+static inline unsigned long split_queue_offset(unsigned char num)
+{
+ size_t idx = busno_to_phbid(num);
+
+ return split_queue_offsets[idx];
+}
+
+static inline unsigned long tar_offset(unsigned char num)
+{
+ size_t idx = busno_to_phbid(num);
+
+ return tar_offsets[idx];
+}
+
+static inline unsigned long phb_offset(unsigned char num)
+{
+ size_t idx = busno_to_phbid(num);
+
+ return phb_offsets[idx];
+}
+
+static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset)
+{
+ unsigned long target = ((unsigned long)bar) | offset;
+ return (void __iomem*)target;
+}
+
+static inline int is_calioc2(unsigned short device)
+{
+ return (device == PCI_DEVICE_ID_IBM_CALIOC2);
+}
+
+static inline int is_calgary(unsigned short device)
+{
+ return (device == PCI_DEVICE_ID_IBM_CALGARY);
+}
+
+static inline int is_cal_pci_dev(unsigned short device)
+{
+ return (is_calgary(device) || is_calioc2(device));
+}
+
+static void calgary_tce_cache_blast(struct iommu_table *tbl)
+{
+ u64 val;
+ u32 aer;
+ int i = 0;
+ void __iomem *bbar = tbl->bbar;
+ void __iomem *target;
+
+ /* disable arbitration on the bus */
+ target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
+ aer = readl(target);
+ writel(0, target);
+
+ /* read plssr to ensure it got there */
+ target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
+ val = readl(target);
+
+ /* poll split queues until all DMA activity is done */
+ target = calgary_reg(bbar, split_queue_offset(tbl->it_busno));
+ do {
+ val = readq(target);
+ i++;
+ } while ((val & 0xff) != 0xff && i < 100);
+ if (i == 100)
+ pr_warn("PCI bus not quiesced, continuing anyway\n");
+
+ /* invalidate TCE cache */
+ target = calgary_reg(bbar, tar_offset(tbl->it_busno));
+ writeq(tbl->tar_val, target);
+
+ /* enable arbitration */
+ target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
+ writel(aer, target);
+ (void)readl(target); /* flush */
+}
+
+static void calioc2_tce_cache_blast(struct iommu_table *tbl)
+{
+ void __iomem *bbar = tbl->bbar;
+ void __iomem *target;
+ u64 val64;
+ u32 val;
+ int i = 0;
+ int count = 1;
+ unsigned char bus = tbl->it_busno;
+
+begin:
+ printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast "
+ "sequence - count %d\n", bus, count);
+
+ /* 1. using the Page Migration Control reg set SoftStop */
+ target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
+ val = be32_to_cpu(readl(target));
+ printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target);
+ val |= PMR_SOFTSTOP;
+ printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target);
+ writel(cpu_to_be32(val), target);
+
+ /* 2. poll split queues until all DMA activity is done */
+ printk(KERN_DEBUG "2a. starting to poll split queues\n");
+ target = calgary_reg(bbar, split_queue_offset(bus));
+ do {
+ val64 = readq(target);
+ i++;
+ } while ((val64 & 0xff) != 0xff && i < 100);
+ if (i == 100)
+ pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n");
+
+ /* 3. poll Page Migration DEBUG for SoftStopFault */
+ target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
+ val = be32_to_cpu(readl(target));
+ printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target);
+
+ /* 4. if SoftStopFault - goto (1) */
+ if (val & PMR_SOFTSTOPFAULT) {
+ if (++count < 100)
+ goto begin;
+ else {
+ pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n");
+ return; /* pray for the best */
+ }
+ }
+
+ /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */
+ target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
+ printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target);
+ val = be32_to_cpu(readl(target));
+ printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target);
+ target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
+ val = be32_to_cpu(readl(target));
+ printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target);
+
+ /* 6. invalidate TCE cache */
+ printk(KERN_DEBUG "6. invalidating TCE cache\n");
+ target = calgary_reg(bbar, tar_offset(bus));
+ writeq(tbl->tar_val, target);
+
+ /* 7. Re-read PMCR */
+ printk(KERN_DEBUG "7a. Re-reading PMCR\n");
+ target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
+ val = be32_to_cpu(readl(target));
+ printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target);
+
+ /* 8. Remove HardStop */
+ printk(KERN_DEBUG "8a. removing HardStop from PMCR\n");
+ target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL);
+ val = 0;
+ printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target);
+ writel(cpu_to_be32(val), target);
+ val = be32_to_cpu(readl(target));
+ printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target);
+}
+
+static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start,
+ u64 limit)
+{
+ unsigned int numpages;
+
+ limit = limit | 0xfffff;
+ limit++;
+
+ numpages = ((limit - start) >> PAGE_SHIFT);
+ iommu_range_reserve(pci_iommu(dev->bus), start, numpages);
+}
+
+static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev)
+{
+ void __iomem *target;
+ u64 low, high, sizelow;
+ u64 start, limit;
+ struct iommu_table *tbl = pci_iommu(dev->bus);
+ unsigned char busnum = dev->bus->number;
+ void __iomem *bbar = tbl->bbar;
+
+ /* peripheral MEM_1 region */
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW);
+ low = be32_to_cpu(readl(target));
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH);
+ high = be32_to_cpu(readl(target));
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE);
+ sizelow = be32_to_cpu(readl(target));
+
+ start = (high << 32) | low;
+ limit = sizelow;
+
+ calgary_reserve_mem_region(dev, start, limit);
+}
+
+static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev)
+{
+ void __iomem *target;
+ u32 val32;
+ u64 low, high, sizelow, sizehigh;
+ u64 start, limit;
+ struct iommu_table *tbl = pci_iommu(dev->bus);
+ unsigned char busnum = dev->bus->number;
+ void __iomem *bbar = tbl->bbar;
+
+ /* is it enabled? */
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
+ val32 = be32_to_cpu(readl(target));
+ if (!(val32 & PHB_MEM2_ENABLE))
+ return;
+
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW);
+ low = be32_to_cpu(readl(target));
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH);
+ high = be32_to_cpu(readl(target));
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW);
+ sizelow = be32_to_cpu(readl(target));
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH);
+ sizehigh = be32_to_cpu(readl(target));
+
+ start = (high << 32) | low;
+ limit = (sizehigh << 32) | sizelow;
+
+ calgary_reserve_mem_region(dev, start, limit);
+}
+
+/*
+ * some regions of the IO address space do not get translated, so we
+ * must not give devices IO addresses in those regions. The regions
+ * are the 640KB-1MB region and the two PCI peripheral memory holes.
+ * Reserve all of them in the IOMMU bitmap to avoid giving them out
+ * later.
+ */
+static void __init calgary_reserve_regions(struct pci_dev *dev)
+{
+ unsigned int npages;
+ u64 start;
+ struct iommu_table *tbl = pci_iommu(dev->bus);
+
+ /* reserve EMERGENCY_PAGES from bad_dma_address and up */
+ iommu_range_reserve(tbl, CALGARY_MAPPING_ERROR, EMERGENCY_PAGES);
+
+ /* avoid the BIOS/VGA first 640KB-1MB region */
+ /* for CalIOC2 - avoid the entire first MB */
+ if (is_calgary(dev->device)) {
+ start = (640 * 1024);
+ npages = ((1024 - 640) * 1024) >> PAGE_SHIFT;
+ } else { /* calioc2 */
+ start = 0;
+ npages = (1 * 1024 * 1024) >> PAGE_SHIFT;
+ }
+ iommu_range_reserve(tbl, start, npages);
+
+ /* reserve the two PCI peripheral memory regions in IO space */
+ calgary_reserve_peripheral_mem_1(dev);
+ calgary_reserve_peripheral_mem_2(dev);
+}
+
+static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
+{
+ u64 val64;
+ u64 table_phys;
+ void __iomem *target;
+ int ret;
+ struct iommu_table *tbl;
+
+ /* build TCE tables for each PHB */
+ ret = build_tce_table(dev, bbar);
+ if (ret)
+ return ret;
+
+ tbl = pci_iommu(dev->bus);
+ tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
+
+ if (is_kdump_kernel())
+ calgary_init_bitmap_from_tce_table(tbl);
+ else
+ tce_free(tbl, 0, tbl->it_size);
+
+ if (is_calgary(dev->device))
+ tbl->chip_ops = &calgary_chip_ops;
+ else if (is_calioc2(dev->device))
+ tbl->chip_ops = &calioc2_chip_ops;
+ else
+ BUG();
+
+ calgary_reserve_regions(dev);
+
+ /* set TARs for each PHB */
+ target = calgary_reg(bbar, tar_offset(dev->bus->number));
+ val64 = be64_to_cpu(readq(target));
+
+ /* zero out all TAR bits under sw control */
+ val64 &= ~TAR_SW_BITS;
+ table_phys = (u64)__pa(tbl->it_base);
+
+ val64 |= table_phys;
+
+ BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M);
+ val64 |= (u64) specified_table_size;
+
+ tbl->tar_val = cpu_to_be64(val64);
+
+ writeq(tbl->tar_val, target);
+ readq(target); /* flush */
+
+ return 0;
+}
+
+static void __init calgary_free_bus(struct pci_dev *dev)
+{
+ u64 val64;
+ struct iommu_table *tbl = pci_iommu(dev->bus);
+ void __iomem *target;
+ unsigned int bitmapsz;
+
+ target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number));
+ val64 = be64_to_cpu(readq(target));
+ val64 &= ~TAR_SW_BITS;
+ writeq(cpu_to_be64(val64), target);
+ readq(target); /* flush */
+
+ bitmapsz = tbl->it_size / BITS_PER_BYTE;
+ free_pages((unsigned long)tbl->it_map, get_order(bitmapsz));
+ tbl->it_map = NULL;
+
+ kfree(tbl);
+
+ set_pci_iommu(dev->bus, NULL);
+
+ /* Can't free bootmem allocated memory after system is up :-( */
+ bus_info[dev->bus->number].tce_space = NULL;
+}
+
+static void calgary_dump_error_regs(struct iommu_table *tbl)
+{
+ void __iomem *bbar = tbl->bbar;
+ void __iomem *target;
+ u32 csr, plssr;
+
+ target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
+ csr = be32_to_cpu(readl(target));
+
+ target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
+ plssr = be32_to_cpu(readl(target));
+
+ /* If no error, the agent ID in the CSR is not valid */
+ pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n",
+ tbl->it_busno, csr, plssr);
+}
+
+static void calioc2_dump_error_regs(struct iommu_table *tbl)
+{
+ void __iomem *bbar = tbl->bbar;
+ u32 csr, csmr, plssr, mck, rcstat;
+ void __iomem *target;
+ unsigned long phboff = phb_offset(tbl->it_busno);
+ unsigned long erroff;
+ u32 errregs[7];
+ int i;
+
+ /* dump CSR */
+ target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET);
+ csr = be32_to_cpu(readl(target));
+ /* dump PLSSR */
+ target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET);
+ plssr = be32_to_cpu(readl(target));
+ /* dump CSMR */
+ target = calgary_reg(bbar, phboff | 0x290);
+ csmr = be32_to_cpu(readl(target));
+ /* dump mck */
+ target = calgary_reg(bbar, phboff | 0x800);
+ mck = be32_to_cpu(readl(target));
+
+ pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno);
+
+ pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n",
+ csr, plssr, csmr, mck);
+
+ /* dump rest of error regs */
+ pr_emerg("");
+ for (i = 0; i < ARRAY_SIZE(errregs); i++) {
+ /* err regs are at 0x810 - 0x870 */
+ erroff = (0x810 + (i * 0x10));
+ target = calgary_reg(bbar, phboff | erroff);
+ errregs[i] = be32_to_cpu(readl(target));
+ pr_cont("0x%08x@0x%lx ", errregs[i], erroff);
+ }
+ pr_cont("\n");
+
+ /* root complex status */
+ target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS);
+ rcstat = be32_to_cpu(readl(target));
+ printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat,
+ PHB_ROOT_COMPLEX_STATUS);
+}
+
+static void calgary_watchdog(struct timer_list *t)
+{
+ struct iommu_table *tbl = from_timer(tbl, t, watchdog_timer);
+ void __iomem *bbar = tbl->bbar;
+ u32 val32;
+ void __iomem *target;
+
+ target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
+ val32 = be32_to_cpu(readl(target));
+
+ /* If no error, the agent ID in the CSR is not valid */
+ if (val32 & CSR_AGENT_MASK) {
+ tbl->chip_ops->dump_error_regs(tbl);
+
+ /* reset error */
+ writel(0, target);
+
+ /* Disable bus that caused the error */
+ target = calgary_reg(bbar, phb_offset(tbl->it_busno) |
+ PHB_CONFIG_RW_OFFSET);
+ val32 = be32_to_cpu(readl(target));
+ val32 |= PHB_SLOT_DISABLE;
+ writel(cpu_to_be32(val32), target);
+ readl(target); /* flush */
+ } else {
+ /* Reset the timer */
+ mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ);
+ }
+}
+
+static void __init calgary_set_split_completion_timeout(void __iomem *bbar,
+ unsigned char busnum, unsigned long timeout)
+{
+ u64 val64;
+ void __iomem *target;
+ unsigned int phb_shift = ~0; /* silence gcc */
+ u64 mask;
+
+ switch (busno_to_phbid(busnum)) {
+ case 0: phb_shift = (63 - 19);
+ break;
+ case 1: phb_shift = (63 - 23);
+ break;
+ case 2: phb_shift = (63 - 27);
+ break;
+ case 3: phb_shift = (63 - 35);
+ break;
+ default:
+ BUG_ON(busno_to_phbid(busnum));
+ }
+
+ target = calgary_reg(bbar, CALGARY_CONFIG_REG);
+ val64 = be64_to_cpu(readq(target));
+
+ /* zero out this PHB's timer bits */
+ mask = ~(0xFUL << phb_shift);
+ val64 &= mask;
+ val64 |= (timeout << phb_shift);
+ writeq(cpu_to_be64(val64), target);
+ readq(target); /* flush */
+}
+
+static void __init calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
+{
+ unsigned char busnum = dev->bus->number;
+ void __iomem *bbar = tbl->bbar;
+ void __iomem *target;
+ u32 val;
+
+ /*
+ * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1
+ */
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2);
+ val = cpu_to_be32(readl(target));
+ val |= 0x00800000;
+ writel(cpu_to_be32(val), target);
+}
+
+static void __init calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
+{
+ unsigned char busnum = dev->bus->number;
+
+ /*
+ * Give split completion a longer timeout on bus 1 for aic94xx
+ * http://bugzilla.kernel.org/show_bug.cgi?id=7180
+ */
+ if (is_calgary(dev->device) && (busnum == 1))
+ calgary_set_split_completion_timeout(tbl->bbar, busnum,
+ CCR_2SEC_TIMEOUT);
+}
+
+static void __init calgary_enable_translation(struct pci_dev *dev)
+{
+ u32 val32;
+ unsigned char busnum;
+ void __iomem *target;
+ void __iomem *bbar;
+ struct iommu_table *tbl;
+
+ busnum = dev->bus->number;
+ tbl = pci_iommu(dev->bus);
+ bbar = tbl->bbar;
+
+ /* enable TCE in PHB Config Register */
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
+ val32 = be32_to_cpu(readl(target));
+ val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE;
+
+ printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n",
+ (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ?
+ "Calgary" : "CalIOC2", busnum);
+ printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this "
+ "bus.\n");
+
+ writel(cpu_to_be32(val32), target);
+ readl(target); /* flush */
+
+ timer_setup(&tbl->watchdog_timer, calgary_watchdog, 0);
+ mod_timer(&tbl->watchdog_timer, jiffies);
+}
+
+static void __init calgary_disable_translation(struct pci_dev *dev)
+{
+ u32 val32;
+ unsigned char busnum;
+ void __iomem *target;
+ void __iomem *bbar;
+ struct iommu_table *tbl;
+
+ busnum = dev->bus->number;
+ tbl = pci_iommu(dev->bus);
+ bbar = tbl->bbar;
+
+ /* disable TCE in PHB Config Register */
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
+ val32 = be32_to_cpu(readl(target));
+ val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE);
+
+ printk(KERN_INFO "Calgary: disabling translation on PHB %#x!\n", busnum);
+ writel(cpu_to_be32(val32), target);
+ readl(target); /* flush */
+
+ del_timer_sync(&tbl->watchdog_timer);
+}
+
+static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
+{
+ pci_dev_get(dev);
+ set_pci_iommu(dev->bus, NULL);
+
+ /* is the device behind a bridge? */
+ if (dev->bus->parent)
+ dev->bus->parent->self = dev;
+ else
+ dev->bus->self = dev;
+}
+
+static int __init calgary_init_one(struct pci_dev *dev)
+{
+ void __iomem *bbar;
+ struct iommu_table *tbl;
+ int ret;
+
+ bbar = busno_to_bbar(dev->bus->number);
+ ret = calgary_setup_tar(dev, bbar);
+ if (ret)
+ goto done;
+
+ pci_dev_get(dev);
+
+ if (dev->bus->parent) {
+ if (dev->bus->parent->self)
+ printk(KERN_WARNING "Calgary: IEEEE, dev %p has "
+ "bus->parent->self!\n", dev);
+ dev->bus->parent->self = dev;
+ } else
+ dev->bus->self = dev;
+
+ tbl = pci_iommu(dev->bus);
+ tbl->chip_ops->handle_quirks(tbl, dev);
+
+ calgary_enable_translation(dev);
+
+ return 0;
+
+done:
+ return ret;
+}
+
+static int __init calgary_locate_bbars(void)
+{
+ int ret;
+ int rioidx, phb, bus;
+ void __iomem *bbar;
+ void __iomem *target;
+ unsigned long offset;
+ u8 start_bus, end_bus;
+ u32 val;
+
+ ret = -ENODATA;
+ for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) {
+ struct rio_detail *rio = rio_devs[rioidx];
+
+ if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY))
+ continue;
+
+ /* map entire 1MB of Calgary config space */
+ bbar = ioremap_nocache(rio->BBAR, 1024 * 1024);
+ if (!bbar)
+ goto error;
+
+ for (phb = 0; phb < PHBS_PER_CALGARY; phb++) {
+ offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET;
+ target = calgary_reg(bbar, offset);
+
+ val = be32_to_cpu(readl(target));
+
+ start_bus = (u8)((val & 0x00FF0000) >> 16);
+ end_bus = (u8)((val & 0x0000FF00) >> 8);
+
+ if (end_bus) {
+ for (bus = start_bus; bus <= end_bus; bus++) {
+ bus_info[bus].bbar = bbar;
+ bus_info[bus].phbid = phb;
+ }
+ } else {
+ bus_info[start_bus].bbar = bbar;
+ bus_info[start_bus].phbid = phb;
+ }
+ }
+ }
+
+ return 0;
+
+error:
+ /* scan bus_info and iounmap any bbars we previously ioremap'd */
+ for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++)
+ if (bus_info[bus].bbar)
+ iounmap(bus_info[bus].bbar);
+
+ return ret;
+}
+
+static int __init calgary_init(void)
+{
+ int ret;
+ struct pci_dev *dev = NULL;
+ struct calgary_bus_info *info;
+
+ ret = calgary_locate_bbars();
+ if (ret)
+ return ret;
+
+ /* Purely for kdump kernel case */
+ if (is_kdump_kernel())
+ get_tce_space_from_tar();
+
+ do {
+ dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
+ if (!dev)
+ break;
+ if (!is_cal_pci_dev(dev->device))
+ continue;
+
+ info = &bus_info[dev->bus->number];
+ if (info->translation_disabled) {
+ calgary_init_one_nontraslated(dev);
+ continue;
+ }
+
+ if (!info->tce_space && !translate_empty_slots)
+ continue;
+
+ ret = calgary_init_one(dev);
+ if (ret)
+ goto error;
+ } while (1);
+
+ dev = NULL;
+ for_each_pci_dev(dev) {
+ struct iommu_table *tbl;
+
+ tbl = find_iommu_table(&dev->dev);
+
+ if (translation_enabled(tbl))
+ dev->dev.dma_ops = &calgary_dma_ops;
+ }
+
+ return ret;
+
+error:
+ do {
+ dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
+ if (!dev)
+ break;
+ if (!is_cal_pci_dev(dev->device))
+ continue;
+
+ info = &bus_info[dev->bus->number];
+ if (info->translation_disabled) {
+ pci_dev_put(dev);
+ continue;
+ }
+ if (!info->tce_space && !translate_empty_slots)
+ continue;
+
+ calgary_disable_translation(dev);
+ calgary_free_bus(dev);
+ pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
+ dev->dev.dma_ops = NULL;
+ } while (1);
+
+ return ret;
+}
+
+static inline int __init determine_tce_table_size(void)
+{
+ int ret;
+
+ if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED)
+ return specified_table_size;
+
+ if (is_kdump_kernel() && saved_max_pfn) {
+ /*
+ * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to
+ * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each
+ * larger table size has twice as many entries, so shift the
+ * max ram address by 13 to divide by 8K and then look at the
+ * order of the result to choose between 0-7.
+ */
+ ret = get_order((saved_max_pfn * PAGE_SIZE) >> 13);
+ if (ret > TCE_TABLE_SIZE_8M)
+ ret = TCE_TABLE_SIZE_8M;
+ } else {
+ /*
+ * Use 8M by default (suggested by Muli) if it's not
+ * kdump kernel and saved_max_pfn isn't set.
+ */
+ ret = TCE_TABLE_SIZE_8M;
+ }
+
+ return ret;
+}
+
+static int __init build_detail_arrays(void)
+{
+ unsigned long ptr;
+ unsigned numnodes, i;
+ int scal_detail_size, rio_detail_size;
+
+ numnodes = rio_table_hdr->num_scal_dev;
+ if (numnodes > MAX_NUMNODES){
+ printk(KERN_WARNING
+ "Calgary: MAX_NUMNODES too low! Defined as %d, "
+ "but system has %d nodes.\n",
+ MAX_NUMNODES, numnodes);
+ return -ENODEV;
+ }
+
+ switch (rio_table_hdr->version){
+ case 2:
+ scal_detail_size = 11;
+ rio_detail_size = 13;
+ break;
+ case 3:
+ scal_detail_size = 12;
+ rio_detail_size = 15;
+ break;
+ default:
+ printk(KERN_WARNING
+ "Calgary: Invalid Rio Grande Table Version: %d\n",
+ rio_table_hdr->version);
+ return -EPROTO;
+ }
+
+ ptr = ((unsigned long)rio_table_hdr) + 3;
+ for (i = 0; i < numnodes; i++, ptr += scal_detail_size)
+ scal_devs[i] = (struct scal_detail *)ptr;
+
+ for (i = 0; i < rio_table_hdr->num_rio_dev;
+ i++, ptr += rio_detail_size)
+ rio_devs[i] = (struct rio_detail *)ptr;
+
+ return 0;
+}
+
+static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
+{
+ int dev;
+ u32 val;
+
+ if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) {
+ /*
+ * FIXME: properly scan for devices across the
+ * PCI-to-PCI bridge on every CalIOC2 port.
+ */
+ return 1;
+ }
+
+ for (dev = 1; dev < 8; dev++) {
+ val = read_pci_config(bus, dev, 0, 0);
+ if (val != 0xffffffff)
+ break;
+ }
+ return (val != 0xffffffff);
+}
+
+/*
+ * calgary_init_bitmap_from_tce_table():
+ * Function for kdump case. In the second/kdump kernel initialize
+ * the bitmap based on the tce table entries obtained from first kernel
+ */
+static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
+{
+ u64 *tp;
+ unsigned int index;
+ tp = ((u64 *)tbl->it_base);
+ for (index = 0 ; index < tbl->it_size; index++) {
+ if (*tp != 0x0)
+ set_bit(index, tbl->it_map);
+ tp++;
+ }
+}
+
+/*
+ * get_tce_space_from_tar():
+ * Function for kdump case. Get the tce tables from first kernel
+ * by reading the contents of the base address register of calgary iommu
+ */
+static void __init get_tce_space_from_tar(void)
+{
+ int bus;
+ void __iomem *target;
+ unsigned long tce_space;
+
+ for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
+ struct calgary_bus_info *info = &bus_info[bus];
+ unsigned short pci_device;
+ u32 val;
+
+ val = read_pci_config(bus, 0, 0, 0);
+ pci_device = (val & 0xFFFF0000) >> 16;
+
+ if (!is_cal_pci_dev(pci_device))
+ continue;
+ if (info->translation_disabled)
+ continue;
+
+ if (calgary_bus_has_devices(bus, pci_device) ||
+ translate_empty_slots) {
+ target = calgary_reg(bus_info[bus].bbar,
+ tar_offset(bus));
+ tce_space = be64_to_cpu(readq(target));
+ tce_space = tce_space & TAR_SW_BITS;
+
+ tce_space = tce_space & (~specified_table_size);
+ info->tce_space = (u64 *)__va(tce_space);
+ }
+ }
+ return;
+}
+
+static int __init calgary_iommu_init(void)
+{
+ int ret;
+
+ /* ok, we're trying to use Calgary - let's roll */
+ printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
+
+ ret = calgary_init();
+ if (ret) {
+ printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
+ "falling back to no_iommu\n", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+int __init detect_calgary(void)
+{
+ int bus;
+ void *tbl;
+ int calgary_found = 0;
+ unsigned long ptr;
+ unsigned int offset, prev_offset;
+ int ret;
+
+ /*
+ * if the user specified iommu=off or iommu=soft or we found
+ * another HW IOMMU already, bail out.
+ */
+ if (no_iommu || iommu_detected)
+ return -ENODEV;
+
+ if (!use_calgary)
+ return -ENODEV;
+
+ if (!early_pci_allowed())
+ return -ENODEV;
+
+ printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n");
+
+ ptr = (unsigned long)phys_to_virt(get_bios_ebda());
+
+ rio_table_hdr = NULL;
+ prev_offset = 0;
+ offset = 0x180;
+ /*
+ * The next offset is stored in the 1st word.
+ * Only parse up until the offset increases:
+ */
+ while (offset > prev_offset) {
+ /* The block id is stored in the 2nd word */
+ if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
+ /* set the pointer past the offset & block id */
+ rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
+ break;
+ }
+ prev_offset = offset;
+ offset = *((unsigned short *)(ptr + offset));
+ }
+ if (!rio_table_hdr) {
+ printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table "
+ "in EBDA - bailing!\n");
+ return -ENODEV;
+ }
+
+ ret = build_detail_arrays();
+ if (ret) {
+ printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret);
+ return -ENOMEM;
+ }
+
+ specified_table_size = determine_tce_table_size();
+
+ for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
+ struct calgary_bus_info *info = &bus_info[bus];
+ unsigned short pci_device;
+ u32 val;
+
+ val = read_pci_config(bus, 0, 0, 0);
+ pci_device = (val & 0xFFFF0000) >> 16;
+
+ if (!is_cal_pci_dev(pci_device))
+ continue;
+
+ if (info->translation_disabled)
+ continue;
+
+ if (calgary_bus_has_devices(bus, pci_device) ||
+ translate_empty_slots) {
+ /*
+ * If it is kdump kernel, find and use tce tables
+ * from first kernel, else allocate tce tables here
+ */
+ if (!is_kdump_kernel()) {
+ tbl = alloc_tce_table();
+ if (!tbl)
+ goto cleanup;
+ info->tce_space = tbl;
+ }
+ calgary_found = 1;
+ }
+ }
+
+ printk(KERN_DEBUG "Calgary: finished detection, Calgary %s\n",
+ calgary_found ? "found" : "not found");
+
+ if (calgary_found) {
+ iommu_detected = 1;
+ calgary_detected = 1;
+ printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
+ printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
+ specified_table_size);
+
+ x86_init.iommu.iommu_init = calgary_iommu_init;
+ }
+ return calgary_found;
+
+cleanup:
+ for (--bus; bus >= 0; --bus) {
+ struct calgary_bus_info *info = &bus_info[bus];
+
+ if (info->tce_space)
+ free_tce_table(info->tce_space);
+ }
+ return -ENOMEM;
+}
+
+static int __init calgary_parse_options(char *p)
+{
+ unsigned int bridge;
+ unsigned long val;
+ size_t len;
+ ssize_t ret;
+
+ while (*p) {
+ if (!strncmp(p, "64k", 3))
+ specified_table_size = TCE_TABLE_SIZE_64K;
+ else if (!strncmp(p, "128k", 4))
+ specified_table_size = TCE_TABLE_SIZE_128K;
+ else if (!strncmp(p, "256k", 4))
+ specified_table_size = TCE_TABLE_SIZE_256K;
+ else if (!strncmp(p, "512k", 4))
+ specified_table_size = TCE_TABLE_SIZE_512K;
+ else if (!strncmp(p, "1M", 2))
+ specified_table_size = TCE_TABLE_SIZE_1M;
+ else if (!strncmp(p, "2M", 2))
+ specified_table_size = TCE_TABLE_SIZE_2M;
+ else if (!strncmp(p, "4M", 2))
+ specified_table_size = TCE_TABLE_SIZE_4M;
+ else if (!strncmp(p, "8M", 2))
+ specified_table_size = TCE_TABLE_SIZE_8M;
+
+ len = strlen("translate_empty_slots");
+ if (!strncmp(p, "translate_empty_slots", len))
+ translate_empty_slots = 1;
+
+ len = strlen("disable");
+ if (!strncmp(p, "disable", len)) {
+ p += len;
+ if (*p == '=')
+ ++p;
+ if (*p == '\0')
+ break;
+ ret = kstrtoul(p, 0, &val);
+ if (ret)
+ break;
+
+ bridge = val;
+ if (bridge < MAX_PHB_BUS_NUM) {
+ printk(KERN_INFO "Calgary: disabling "
+ "translation for PHB %#x\n", bridge);
+ bus_info[bridge].translation_disabled = 1;
+ }
+ }
+
+ p = strpbrk(p, ",");
+ if (!p)
+ break;
+
+ p++; /* skip ',' */
+ }
+ return 1;
+}
+__setup("calgary=", calgary_parse_options);
+
+static void __init calgary_fixup_one_tce_space(struct pci_dev *dev)
+{
+ struct iommu_table *tbl;
+ unsigned int npages;
+ int i;
+
+ tbl = pci_iommu(dev->bus);
+
+ for (i = 0; i < 4; i++) {
+ struct resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i];
+
+ /* Don't give out TCEs that map MEM resources */
+ if (!(r->flags & IORESOURCE_MEM))
+ continue;
+
+ /* 0-based? we reserve the whole 1st MB anyway */
+ if (!r->start)
+ continue;
+
+ /* cover the whole region */
+ npages = resource_size(r) >> PAGE_SHIFT;
+ npages++;
+
+ iommu_range_reserve(tbl, r->start, npages);
+ }
+}
+
+static int __init calgary_fixup_tce_spaces(void)
+{
+ struct pci_dev *dev = NULL;
+ struct calgary_bus_info *info;
+
+ if (no_iommu || swiotlb || !calgary_detected)
+ return -ENODEV;
+
+ printk(KERN_DEBUG "Calgary: fixing up tce spaces\n");
+
+ do {
+ dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
+ if (!dev)
+ break;
+ if (!is_cal_pci_dev(dev->device))
+ continue;
+
+ info = &bus_info[dev->bus->number];
+ if (info->translation_disabled)
+ continue;
+
+ if (!info->tce_space)
+ continue;
+
+ calgary_fixup_one_tce_space(dev);
+
+ } while (1);
+
+ return 0;
+}
+
+/*
+ * We need to be call after pcibios_assign_resources (fs_initcall level)
+ * and before device_initcall.
+ */
+rootfs_initcall(calgary_fixup_tce_spaces);
+
+IOMMU_INIT_POST(detect_calgary);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
new file mode 100644
index 000000000..7ba73fe0d
--- /dev/null
+++ b/arch/x86/kernel/pci-dma.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/dma-direct.h>
+#include <linux/dma-debug.h>
+#include <linux/dmar.h>
+#include <linux/export.h>
+#include <linux/bootmem.h>
+#include <linux/gfp.h>
+#include <linux/pci.h>
+
+#include <asm/proto.h>
+#include <asm/dma.h>
+#include <asm/iommu.h>
+#include <asm/gart.h>
+#include <asm/calgary.h>
+#include <asm/x86_init.h>
+#include <asm/iommu_table.h>
+
+static bool disable_dac_quirk __read_mostly;
+
+const struct dma_map_ops *dma_ops = &dma_direct_ops;
+EXPORT_SYMBOL(dma_ops);
+
+#ifdef CONFIG_IOMMU_DEBUG
+int panic_on_overflow __read_mostly = 1;
+int force_iommu __read_mostly = 1;
+#else
+int panic_on_overflow __read_mostly = 0;
+int force_iommu __read_mostly = 0;
+#endif
+
+int iommu_merge __read_mostly = 0;
+
+int no_iommu __read_mostly;
+/* Set this to 1 if there is a HW IOMMU in the system */
+int iommu_detected __read_mostly = 0;
+
+/*
+ * This variable becomes 1 if iommu=pt is passed on the kernel command line.
+ * If this variable is 1, IOMMU implementations do no DMA translation for
+ * devices and allow every device to access to whole physical memory. This is
+ * useful if a user wants to use an IOMMU only for KVM device assignment to
+ * guests and not for driver dma translation.
+ * It is also possible to disable by default in kernel config, and enable with
+ * iommu=nopt at boot time.
+ */
+#ifdef CONFIG_IOMMU_DEFAULT_PASSTHROUGH
+int iommu_pass_through __read_mostly = 1;
+#else
+int iommu_pass_through __read_mostly;
+#endif
+
+extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
+
+/* Dummy device used for NULL arguments (normally ISA). */
+struct device x86_dma_fallback_dev = {
+ .init_name = "fallback device",
+ .coherent_dma_mask = ISA_DMA_BIT_MASK,
+ .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
+};
+EXPORT_SYMBOL(x86_dma_fallback_dev);
+
+void __init pci_iommu_alloc(void)
+{
+ struct iommu_table_entry *p;
+
+ sort_iommu_table(__iommu_table, __iommu_table_end);
+ check_iommu_entries(__iommu_table, __iommu_table_end);
+
+ for (p = __iommu_table; p < __iommu_table_end; p++) {
+ if (p && p->detect && p->detect() > 0) {
+ p->flags |= IOMMU_DETECTED;
+ if (p->early_init)
+ p->early_init();
+ if (p->flags & IOMMU_FINISH_IF_DETECTED)
+ break;
+ }
+ }
+}
+
+bool arch_dma_alloc_attrs(struct device **dev)
+{
+ if (!*dev)
+ *dev = &x86_dma_fallback_dev;
+
+ if (!is_device_dma_capable(*dev))
+ return false;
+ return true;
+
+}
+EXPORT_SYMBOL(arch_dma_alloc_attrs);
+
+/*
+ * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
+ * parameter documentation.
+ */
+static __init int iommu_setup(char *p)
+{
+ iommu_merge = 1;
+
+ if (!p)
+ return -EINVAL;
+
+ while (*p) {
+ if (!strncmp(p, "off", 3))
+ no_iommu = 1;
+ /* gart_parse_options has more force support */
+ if (!strncmp(p, "force", 5))
+ force_iommu = 1;
+ if (!strncmp(p, "noforce", 7)) {
+ iommu_merge = 0;
+ force_iommu = 0;
+ }
+
+ if (!strncmp(p, "biomerge", 8)) {
+ iommu_merge = 1;
+ force_iommu = 1;
+ }
+ if (!strncmp(p, "panic", 5))
+ panic_on_overflow = 1;
+ if (!strncmp(p, "nopanic", 7))
+ panic_on_overflow = 0;
+ if (!strncmp(p, "merge", 5)) {
+ iommu_merge = 1;
+ force_iommu = 1;
+ }
+ if (!strncmp(p, "nomerge", 7))
+ iommu_merge = 0;
+ if (!strncmp(p, "forcesac", 8))
+ pr_warn("forcesac option ignored.\n");
+ if (!strncmp(p, "allowdac", 8))
+ pr_warn("allowdac option ignored.\n");
+ if (!strncmp(p, "nodac", 5))
+ pr_warn("nodac option ignored.\n");
+ if (!strncmp(p, "usedac", 6)) {
+ disable_dac_quirk = true;
+ return 1;
+ }
+#ifdef CONFIG_SWIOTLB
+ if (!strncmp(p, "soft", 4))
+ swiotlb = 1;
+#endif
+ if (!strncmp(p, "pt", 2))
+ iommu_pass_through = 1;
+ if (!strncmp(p, "nopt", 4))
+ iommu_pass_through = 0;
+
+ gart_parse_options(p);
+
+#ifdef CONFIG_CALGARY_IOMMU
+ if (!strncmp(p, "calgary", 7))
+ use_calgary = 1;
+#endif /* CONFIG_CALGARY_IOMMU */
+
+ p += strcspn(p, ",");
+ if (*p == ',')
+ ++p;
+ }
+ return 0;
+}
+early_param("iommu", iommu_setup);
+
+static int __init pci_iommu_init(void)
+{
+ struct iommu_table_entry *p;
+
+ x86_init.iommu.iommu_init();
+
+ for (p = __iommu_table; p < __iommu_table_end; p++) {
+ if (p && (p->flags & IOMMU_DETECTED) && p->late_init)
+ p->late_init();
+ }
+
+ return 0;
+}
+/* Must execute after PCI subsystem */
+rootfs_initcall(pci_iommu_init);
+
+#ifdef CONFIG_PCI
+/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
+
+static int via_no_dac_cb(struct pci_dev *pdev, void *data)
+{
+ pdev->dev.bus_dma_mask = DMA_BIT_MASK(32);
+ return 0;
+}
+
+static void via_no_dac(struct pci_dev *dev)
+{
+ if (!disable_dac_quirk) {
+ dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
+ pci_walk_bus(dev->subordinate, via_no_dac_cb, NULL);
+ }
+}
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID,
+ PCI_CLASS_BRIDGE_PCI, 8, via_no_dac);
+#endif
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
new file mode 100644
index 000000000..2e9006c1e
--- /dev/null
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/dma-mapping.h>
+#include <asm/iommu_table.h>
+#include <linux/string.h>
+#include <linux/kallsyms.h>
+
+
+#define DEBUG 1
+
+static struct iommu_table_entry * __init
+find_dependents_of(struct iommu_table_entry *start,
+ struct iommu_table_entry *finish,
+ struct iommu_table_entry *q)
+{
+ struct iommu_table_entry *p;
+
+ if (!q)
+ return NULL;
+
+ for (p = start; p < finish; p++)
+ if (p->detect == q->depend)
+ return p;
+
+ return NULL;
+}
+
+
+void __init sort_iommu_table(struct iommu_table_entry *start,
+ struct iommu_table_entry *finish) {
+
+ struct iommu_table_entry *p, *q, tmp;
+
+ for (p = start; p < finish; p++) {
+again:
+ q = find_dependents_of(start, finish, p);
+ /* We are bit sneaky here. We use the memory address to figure
+ * out if the node we depend on is past our point, if so, swap.
+ */
+ if (q > p) {
+ tmp = *p;
+ memmove(p, q, sizeof(*p));
+ *q = tmp;
+ goto again;
+ }
+ }
+
+}
+
+#ifdef DEBUG
+void __init check_iommu_entries(struct iommu_table_entry *start,
+ struct iommu_table_entry *finish)
+{
+ struct iommu_table_entry *p, *q, *x;
+
+ /* Simple cyclic dependency checker. */
+ for (p = start; p < finish; p++) {
+ q = find_dependents_of(start, finish, p);
+ x = find_dependents_of(start, finish, q);
+ if (p == x) {
+ printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n",
+ p->detect, q->detect);
+ /* Heavy handed way..*/
+ x->depend = NULL;
+ }
+ }
+
+ for (p = start; p < finish; p++) {
+ q = find_dependents_of(p, finish, p);
+ if (q && q > p) {
+ printk(KERN_ERR "EXECUTION ORDER INVALID! %pS should be called before %pS!\n",
+ p->detect, q->detect);
+ }
+ }
+}
+#else
+void __init check_iommu_entries(struct iommu_table_entry *start,
+ struct iommu_table_entry *finish)
+{
+}
+#endif
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
new file mode 100644
index 000000000..71c0b01d9
--- /dev/null
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Glue code to lib/swiotlb.c */
+
+#include <linux/pci.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/swiotlb.h>
+#include <linux/bootmem.h>
+#include <linux/dma-direct.h>
+#include <linux/mem_encrypt.h>
+
+#include <asm/iommu.h>
+#include <asm/swiotlb.h>
+#include <asm/dma.h>
+#include <asm/xen/swiotlb-xen.h>
+#include <asm/iommu_table.h>
+
+int swiotlb __read_mostly;
+
+/*
+ * pci_swiotlb_detect_override - set swiotlb to 1 if necessary
+ *
+ * This returns non-zero if we are forced to use swiotlb (by the boot
+ * option).
+ */
+int __init pci_swiotlb_detect_override(void)
+{
+ if (swiotlb_force == SWIOTLB_FORCE)
+ swiotlb = 1;
+
+ return swiotlb;
+}
+IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
+ pci_xen_swiotlb_detect,
+ pci_swiotlb_init,
+ pci_swiotlb_late_init);
+
+/*
+ * If 4GB or more detected (and iommu=off not set) or if SME is active
+ * then set swiotlb to 1 and return 1.
+ */
+int __init pci_swiotlb_detect_4gb(void)
+{
+ /* don't initialize swiotlb if iommu=off (no_iommu=1) */
+ if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN)
+ swiotlb = 1;
+
+ /*
+ * If SME is active then swiotlb will be set to 1 so that bounce
+ * buffers are allocated and used for devices that do not support
+ * the addressing range required for the encryption mask.
+ */
+ if (sme_active())
+ swiotlb = 1;
+
+ return swiotlb;
+}
+IOMMU_INIT(pci_swiotlb_detect_4gb,
+ pci_swiotlb_detect_override,
+ pci_swiotlb_init,
+ pci_swiotlb_late_init);
+
+void __init pci_swiotlb_init(void)
+{
+ if (swiotlb) {
+ swiotlb_init(0);
+ dma_ops = &swiotlb_dma_ops;
+ }
+}
+
+void __init pci_swiotlb_late_init(void)
+{
+ /* An IOMMU turned us off. */
+ if (!swiotlb)
+ swiotlb_exit();
+ else {
+ printk(KERN_INFO "PCI-DMA: "
+ "Using software bounce buffering for IO (SWIOTLB)\n");
+ swiotlb_print_info();
+ }
+}
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
new file mode 100644
index 000000000..4a710ffff
--- /dev/null
+++ b/arch/x86/kernel/pcspeaker.c
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/platform_device.h>
+#include <linux/err.h>
+#include <linux/init.h>
+
+static __init int add_pcspkr(void)
+{
+ struct platform_device *pd;
+
+ pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
+
+ return PTR_ERR_OR_ZERO(pd);
+}
+device_initcall(add_pcspkr);
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
new file mode 100644
index 000000000..c06c4c16c
--- /dev/null
+++ b/arch/x86/kernel/perf_regs.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/perf_event.h>
+#include <linux/bug.h>
+#include <linux/stddef.h>
+#include <asm/perf_regs.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_X86_32
+#define PERF_REG_X86_MAX PERF_REG_X86_32_MAX
+#else
+#define PERF_REG_X86_MAX PERF_REG_X86_64_MAX
+#endif
+
+#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r)
+
+static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = {
+ PT_REGS_OFFSET(PERF_REG_X86_AX, ax),
+ PT_REGS_OFFSET(PERF_REG_X86_BX, bx),
+ PT_REGS_OFFSET(PERF_REG_X86_CX, cx),
+ PT_REGS_OFFSET(PERF_REG_X86_DX, dx),
+ PT_REGS_OFFSET(PERF_REG_X86_SI, si),
+ PT_REGS_OFFSET(PERF_REG_X86_DI, di),
+ PT_REGS_OFFSET(PERF_REG_X86_BP, bp),
+ PT_REGS_OFFSET(PERF_REG_X86_SP, sp),
+ PT_REGS_OFFSET(PERF_REG_X86_IP, ip),
+ PT_REGS_OFFSET(PERF_REG_X86_FLAGS, flags),
+ PT_REGS_OFFSET(PERF_REG_X86_CS, cs),
+ PT_REGS_OFFSET(PERF_REG_X86_SS, ss),
+#ifdef CONFIG_X86_32
+ PT_REGS_OFFSET(PERF_REG_X86_DS, ds),
+ PT_REGS_OFFSET(PERF_REG_X86_ES, es),
+ PT_REGS_OFFSET(PERF_REG_X86_FS, fs),
+ PT_REGS_OFFSET(PERF_REG_X86_GS, gs),
+#else
+ /*
+ * The pt_regs struct does not store
+ * ds, es, fs, gs in 64 bit mode.
+ */
+ (unsigned int) -1,
+ (unsigned int) -1,
+ (unsigned int) -1,
+ (unsigned int) -1,
+#endif
+#ifdef CONFIG_X86_64
+ PT_REGS_OFFSET(PERF_REG_X86_R8, r8),
+ PT_REGS_OFFSET(PERF_REG_X86_R9, r9),
+ PT_REGS_OFFSET(PERF_REG_X86_R10, r10),
+ PT_REGS_OFFSET(PERF_REG_X86_R11, r11),
+ PT_REGS_OFFSET(PERF_REG_X86_R12, r12),
+ PT_REGS_OFFSET(PERF_REG_X86_R13, r13),
+ PT_REGS_OFFSET(PERF_REG_X86_R14, r14),
+ PT_REGS_OFFSET(PERF_REG_X86_R15, r15),
+#endif
+};
+
+u64 perf_reg_value(struct pt_regs *regs, int idx)
+{
+ if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset)))
+ return 0;
+
+ return regs_get_register(regs, pt_regs_offset[idx]);
+}
+
+#define REG_RESERVED (~((1ULL << PERF_REG_X86_MAX) - 1ULL))
+
+#ifdef CONFIG_X86_32
+int perf_reg_validate(u64 mask)
+{
+ if (!mask || mask & REG_RESERVED)
+ return -EINVAL;
+
+ return 0;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+ return PERF_SAMPLE_REGS_ABI_32;
+}
+
+void perf_get_regs_user(struct perf_regs *regs_user,
+ struct pt_regs *regs,
+ struct pt_regs *regs_user_copy)
+{
+ regs_user->regs = task_pt_regs(current);
+ regs_user->abi = perf_reg_abi(current);
+}
+#else /* CONFIG_X86_64 */
+#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \
+ (1ULL << PERF_REG_X86_ES) | \
+ (1ULL << PERF_REG_X86_FS) | \
+ (1ULL << PERF_REG_X86_GS))
+
+int perf_reg_validate(u64 mask)
+{
+ if (!mask || mask & REG_RESERVED)
+ return -EINVAL;
+
+ if (mask & REG_NOSUPPORT)
+ return -EINVAL;
+
+ return 0;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+ if (test_tsk_thread_flag(task, TIF_IA32))
+ return PERF_SAMPLE_REGS_ABI_32;
+ else
+ return PERF_SAMPLE_REGS_ABI_64;
+}
+
+void perf_get_regs_user(struct perf_regs *regs_user,
+ struct pt_regs *regs,
+ struct pt_regs *regs_user_copy)
+{
+ struct pt_regs *user_regs = task_pt_regs(current);
+
+ /*
+ * If we're in an NMI that interrupted task_pt_regs setup, then
+ * we can't sample user regs at all. This check isn't really
+ * sufficient, though, as we could be in an NMI inside an interrupt
+ * that happened during task_pt_regs setup.
+ */
+ if (regs->sp > (unsigned long)&user_regs->r11 &&
+ regs->sp <= (unsigned long)(user_regs + 1)) {
+ regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
+ regs_user->regs = NULL;
+ return;
+ }
+
+ /*
+ * These registers are always saved on 64-bit syscall entry.
+ * On 32-bit entry points, they are saved too except r8..r11.
+ */
+ regs_user_copy->ip = user_regs->ip;
+ regs_user_copy->ax = user_regs->ax;
+ regs_user_copy->cx = user_regs->cx;
+ regs_user_copy->dx = user_regs->dx;
+ regs_user_copy->si = user_regs->si;
+ regs_user_copy->di = user_regs->di;
+ regs_user_copy->r8 = user_regs->r8;
+ regs_user_copy->r9 = user_regs->r9;
+ regs_user_copy->r10 = user_regs->r10;
+ regs_user_copy->r11 = user_regs->r11;
+ regs_user_copy->orig_ax = user_regs->orig_ax;
+ regs_user_copy->flags = user_regs->flags;
+ regs_user_copy->sp = user_regs->sp;
+ regs_user_copy->cs = user_regs->cs;
+ regs_user_copy->ss = user_regs->ss;
+ /*
+ * Store user space frame-pointer value on sample
+ * to facilitate stack unwinding for cases when
+ * user space executable code has such support
+ * enabled at compile time:
+ */
+ regs_user_copy->bp = user_regs->bp;
+
+ regs_user_copy->bx = -1;
+ regs_user_copy->r12 = -1;
+ regs_user_copy->r13 = -1;
+ regs_user_copy->r14 = -1;
+ regs_user_copy->r15 = -1;
+ /*
+ * For this to be at all useful, we need a reasonable guess for
+ * the ABI. Be careful: we're in NMI context, and we're
+ * considering current to be the current task, so we should
+ * be careful not to look at any other percpu variables that might
+ * change during context switches.
+ */
+ regs_user->abi = user_64bit_mode(user_regs) ?
+ PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
+
+ regs_user->regs = regs_user_copy;
+}
+#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c
new file mode 100644
index 000000000..b348a672f
--- /dev/null
+++ b/arch/x86/kernel/platform-quirks.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/init.h>
+
+#include <asm/setup.h>
+#include <asm/bios_ebda.h>
+
+void __init x86_early_init_platform_quirks(void)
+{
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_EXPECTED_PRESENT;
+ x86_platform.legacy.rtc = 1;
+ x86_platform.legacy.warm_reset = 1;
+ x86_platform.legacy.reserve_bios_regions = 0;
+ x86_platform.legacy.devices.pnpbios = 1;
+
+ switch (boot_params.hdr.hardware_subarch) {
+ case X86_SUBARCH_PC:
+ x86_platform.legacy.reserve_bios_regions = 1;
+ break;
+ case X86_SUBARCH_XEN:
+ x86_platform.legacy.devices.pnpbios = 0;
+ x86_platform.legacy.rtc = 0;
+ break;
+ case X86_SUBARCH_INTEL_MID:
+ case X86_SUBARCH_CE4100:
+ x86_platform.legacy.devices.pnpbios = 0;
+ x86_platform.legacy.rtc = 0;
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
+ break;
+ }
+
+ if (x86_platform.set_legacy_features)
+ x86_platform.set_legacy_features();
+}
+
+bool __init x86_pnpbios_disabled(void)
+{
+ return x86_platform.legacy.devices.pnpbios == 0;
+}
+
+#if defined(CONFIG_PNPBIOS)
+bool __init arch_pnpbios_disabled(void)
+{
+ return x86_pnpbios_disabled();
+}
+#endif
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
new file mode 100644
index 000000000..6b07faaa1
--- /dev/null
+++ b/arch/x86/kernel/pmem.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2015, Christoph Hellwig.
+ * Copyright (c) 2015, Intel Corporation.
+ */
+#include <linux/platform_device.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+
+static int found(struct resource *res, void *data)
+{
+ return 1;
+}
+
+static __init int register_e820_pmem(void)
+{
+ struct platform_device *pdev;
+ int rc;
+
+ rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY,
+ IORESOURCE_MEM, 0, -1, NULL, found);
+ if (rc <= 0)
+ return 0;
+
+ /*
+ * See drivers/nvdimm/e820.c for the implementation, this is
+ * simply here to trigger the module to load on demand.
+ */
+ pdev = platform_device_alloc("e820_pmem", -1);
+ return platform_device_add(pdev);
+}
+device_initcall(register_e820_pmem);
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
new file mode 100644
index 000000000..ee0286390
--- /dev/null
+++ b/arch/x86/kernel/probe_roms.c
@@ -0,0 +1,269 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/mmzone.h>
+#include <linux/ioport.h>
+#include <linux/seq_file.h>
+#include <linux/console.h>
+#include <linux/init.h>
+#include <linux/edd.h>
+#include <linux/dmi.h>
+#include <linux/pfn.h>
+#include <linux/pci.h>
+#include <linux/export.h>
+
+#include <asm/probe_roms.h>
+#include <asm/pci-direct.h>
+#include <asm/e820/api.h>
+#include <asm/mmzone.h>
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/io.h>
+#include <asm/setup_arch.h>
+
+static struct resource system_rom_resource = {
+ .name = "System ROM",
+ .start = 0xf0000,
+ .end = 0xfffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource extension_rom_resource = {
+ .name = "Extension ROM",
+ .start = 0xe0000,
+ .end = 0xeffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource adapter_rom_resources[] = { {
+ .name = "Adapter ROM",
+ .start = 0xc8000,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+} };
+
+static struct resource video_rom_resource = {
+ .name = "Video ROM",
+ .start = 0xc0000,
+ .end = 0xc7fff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+/* does this oprom support the given pci device, or any of the devices
+ * that the driver supports?
+ */
+static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device)
+{
+ struct pci_driver *drv = pdev->driver;
+ const struct pci_device_id *id;
+
+ if (pdev->vendor == vendor && pdev->device == device)
+ return true;
+
+ for (id = drv ? drv->id_table : NULL; id && id->vendor; id++)
+ if (id->vendor == vendor && id->device == device)
+ break;
+
+ return id && id->vendor;
+}
+
+static bool probe_list(struct pci_dev *pdev, unsigned short vendor,
+ const unsigned char *rom_list)
+{
+ unsigned short device;
+
+ do {
+ if (probe_kernel_address(rom_list, device) != 0)
+ device = 0;
+
+ if (device && match_id(pdev, vendor, device))
+ break;
+
+ rom_list += 2;
+ } while (device);
+
+ return !!device;
+}
+
+static struct resource *find_oprom(struct pci_dev *pdev)
+{
+ struct resource *oprom = NULL;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) {
+ struct resource *res = &adapter_rom_resources[i];
+ unsigned short offset, vendor, device, list, rev;
+ const unsigned char *rom;
+
+ if (res->end == 0)
+ break;
+
+ rom = isa_bus_to_virt(res->start);
+ if (probe_kernel_address(rom + 0x18, offset) != 0)
+ continue;
+
+ if (probe_kernel_address(rom + offset + 0x4, vendor) != 0)
+ continue;
+
+ if (probe_kernel_address(rom + offset + 0x6, device) != 0)
+ continue;
+
+ if (match_id(pdev, vendor, device)) {
+ oprom = res;
+ break;
+ }
+
+ if (probe_kernel_address(rom + offset + 0x8, list) == 0 &&
+ probe_kernel_address(rom + offset + 0xc, rev) == 0 &&
+ rev >= 3 && list &&
+ probe_list(pdev, vendor, rom + offset + list)) {
+ oprom = res;
+ break;
+ }
+ }
+
+ return oprom;
+}
+
+void __iomem *pci_map_biosrom(struct pci_dev *pdev)
+{
+ struct resource *oprom = find_oprom(pdev);
+
+ if (!oprom)
+ return NULL;
+
+ return ioremap(oprom->start, resource_size(oprom));
+}
+EXPORT_SYMBOL(pci_map_biosrom);
+
+void pci_unmap_biosrom(void __iomem *image)
+{
+ iounmap(image);
+}
+EXPORT_SYMBOL(pci_unmap_biosrom);
+
+size_t pci_biosrom_size(struct pci_dev *pdev)
+{
+ struct resource *oprom = find_oprom(pdev);
+
+ return oprom ? resource_size(oprom) : 0;
+}
+EXPORT_SYMBOL(pci_biosrom_size);
+
+#define ROMSIGNATURE 0xaa55
+
+static int __init romsignature(const unsigned char *rom)
+{
+ const unsigned short * const ptr = (const unsigned short *)rom;
+ unsigned short sig;
+
+ return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
+}
+
+static int __init romchecksum(const unsigned char *rom, unsigned long length)
+{
+ unsigned char sum, c;
+
+ for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
+ sum += c;
+ return !length && !sum;
+}
+
+void __init probe_roms(void)
+{
+ const unsigned char *rom;
+ unsigned long start, length, upper;
+ unsigned char c;
+ int i;
+
+ /* video rom */
+ upper = adapter_rom_resources[0].start;
+ for (start = video_rom_resource.start; start < upper; start += 2048) {
+ rom = isa_bus_to_virt(start);
+ if (!romsignature(rom))
+ continue;
+
+ video_rom_resource.start = start;
+
+ if (probe_kernel_address(rom + 2, c) != 0)
+ continue;
+
+ /* 0 < length <= 0x7f * 512, historically */
+ length = c * 512;
+
+ /* if checksum okay, trust length byte */
+ if (length && romchecksum(rom, length))
+ video_rom_resource.end = start + length - 1;
+
+ request_resource(&iomem_resource, &video_rom_resource);
+ break;
+ }
+
+ start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
+ if (start < upper)
+ start = upper;
+
+ /* system rom */
+ request_resource(&iomem_resource, &system_rom_resource);
+ upper = system_rom_resource.start;
+
+ /* check for extension rom (ignore length byte!) */
+ rom = isa_bus_to_virt(extension_rom_resource.start);
+ if (romsignature(rom)) {
+ length = resource_size(&extension_rom_resource);
+ if (romchecksum(rom, length)) {
+ request_resource(&iomem_resource, &extension_rom_resource);
+ upper = extension_rom_resource.start;
+ }
+ }
+
+ /* check for adapter roms on 2k boundaries */
+ for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
+ rom = isa_bus_to_virt(start);
+ if (!romsignature(rom))
+ continue;
+
+ if (probe_kernel_address(rom + 2, c) != 0)
+ continue;
+
+ /* 0 < length <= 0x7f * 512, historically */
+ length = c * 512;
+
+ /* but accept any length that fits if checksum okay */
+ if (!length || start + length > upper || !romchecksum(rom, length))
+ continue;
+
+ adapter_rom_resources[i].start = start;
+ adapter_rom_resources[i].end = start + length - 1;
+ request_resource(&iomem_resource, &adapter_rom_resources[i]);
+
+ start = adapter_rom_resources[i++].end & ~2047UL;
+ }
+}
+
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
new file mode 100644
index 000000000..cd138bfd9
--- /dev/null
+++ b/arch/x86/kernel/process.c
@@ -0,0 +1,854 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/prctl.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/sched/idle.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/pm.h>
+#include <linux/tick.h>
+#include <linux/random.h>
+#include <linux/user-return-notifier.h>
+#include <linux/dmi.h>
+#include <linux/utsname.h>
+#include <linux/stackprotector.h>
+#include <linux/cpuidle.h>
+#include <trace/events/power.h>
+#include <linux/hw_breakpoint.h>
+#include <asm/cpu.h>
+#include <asm/apic.h>
+#include <asm/syscalls.h>
+#include <linux/uaccess.h>
+#include <asm/mwait.h>
+#include <asm/fpu/internal.h>
+#include <asm/debugreg.h>
+#include <asm/nmi.h>
+#include <asm/tlbflush.h>
+#include <asm/mce.h>
+#include <asm/vm86.h>
+#include <asm/switch_to.h>
+#include <asm/desc.h>
+#include <asm/prctl.h>
+#include <asm/spec-ctrl.h>
+
+#include "process.h"
+
+/*
+ * per-CPU TSS segments. Threads are completely 'soft' on Linux,
+ * no more per-task TSS's. The TSS size is kept cacheline-aligned
+ * so they are allowed to end up in the .data..cacheline_aligned
+ * section. Since TSS's are completely CPU-local, we want them
+ * on exact cacheline boundaries, to eliminate cacheline ping-pong.
+ */
+__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
+ .x86_tss = {
+ /*
+ * .sp0 is only used when entering ring 0 from a lower
+ * privilege level. Since the init task never runs anything
+ * but ring 0 code, there is no need for a valid value here.
+ * Poison it.
+ */
+ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
+
+ /*
+ * .sp1 is cpu_current_top_of_stack. The init task never
+ * runs user code, but cpu_current_top_of_stack should still
+ * be well defined before the first context switch.
+ */
+ .sp1 = TOP_OF_INIT_STACK,
+
+#ifdef CONFIG_X86_32
+ .ss0 = __KERNEL_DS,
+ .ss1 = __KERNEL_CS,
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+#endif
+ },
+#ifdef CONFIG_X86_32
+ /*
+ * Note that the .io_bitmap member must be extra-big. This is because
+ * the CPU will access an additional byte beyond the end of the IO
+ * permission bitmap. The extra byte must be all 1 bits, and must
+ * be within the limit.
+ */
+ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
+#endif
+};
+EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
+
+DEFINE_PER_CPU(bool, __tss_limit_invalid);
+EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
+
+/*
+ * this gets called so that we can store lazy state into memory and copy the
+ * current task into the new thread.
+ */
+int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
+{
+ memcpy(dst, src, arch_task_struct_size);
+#ifdef CONFIG_VM86
+ dst->thread.vm86 = NULL;
+#endif
+
+ return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
+}
+
+/*
+ * Free current thread data structures etc..
+ */
+void exit_thread(struct task_struct *tsk)
+{
+ struct thread_struct *t = &tsk->thread;
+ unsigned long *bp = t->io_bitmap_ptr;
+ struct fpu *fpu = &t->fpu;
+
+ if (bp) {
+ struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
+
+ t->io_bitmap_ptr = NULL;
+ clear_thread_flag(TIF_IO_BITMAP);
+ /*
+ * Careful, clear this in the TSS too:
+ */
+ memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
+ t->io_bitmap_max = 0;
+ put_cpu();
+ kfree(bp);
+ }
+
+ free_vm86(t);
+
+ fpu__drop(fpu);
+}
+
+void flush_thread(void)
+{
+ struct task_struct *tsk = current;
+
+ flush_ptrace_hw_breakpoint(tsk);
+ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+
+ fpu__clear(&tsk->thread.fpu);
+}
+
+void disable_TSC(void)
+{
+ preempt_disable();
+ if (!test_and_set_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ cr4_set_bits(X86_CR4_TSD);
+ preempt_enable();
+}
+
+static void enable_TSC(void)
+{
+ preempt_disable();
+ if (test_and_clear_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ cr4_clear_bits(X86_CR4_TSD);
+ preempt_enable();
+}
+
+int get_tsc_mode(unsigned long adr)
+{
+ unsigned int val;
+
+ if (test_thread_flag(TIF_NOTSC))
+ val = PR_TSC_SIGSEGV;
+ else
+ val = PR_TSC_ENABLE;
+
+ return put_user(val, (unsigned int __user *)adr);
+}
+
+int set_tsc_mode(unsigned int val)
+{
+ if (val == PR_TSC_SIGSEGV)
+ disable_TSC();
+ else if (val == PR_TSC_ENABLE)
+ enable_TSC();
+ else
+ return -EINVAL;
+
+ return 0;
+}
+
+DEFINE_PER_CPU(u64, msr_misc_features_shadow);
+
+static void set_cpuid_faulting(bool on)
+{
+ u64 msrval;
+
+ msrval = this_cpu_read(msr_misc_features_shadow);
+ msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
+ msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT);
+ this_cpu_write(msr_misc_features_shadow, msrval);
+ wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval);
+}
+
+static void disable_cpuid(void)
+{
+ preempt_disable();
+ if (!test_and_set_thread_flag(TIF_NOCPUID)) {
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOCPUID in the current running context.
+ */
+ set_cpuid_faulting(true);
+ }
+ preempt_enable();
+}
+
+static void enable_cpuid(void)
+{
+ preempt_disable();
+ if (test_and_clear_thread_flag(TIF_NOCPUID)) {
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOCPUID in the current running context.
+ */
+ set_cpuid_faulting(false);
+ }
+ preempt_enable();
+}
+
+static int get_cpuid_mode(void)
+{
+ return !test_thread_flag(TIF_NOCPUID);
+}
+
+static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled)
+{
+ if (!static_cpu_has(X86_FEATURE_CPUID_FAULT))
+ return -ENODEV;
+
+ if (cpuid_enabled)
+ enable_cpuid();
+ else
+ disable_cpuid();
+
+ return 0;
+}
+
+/*
+ * Called immediately after a successful exec.
+ */
+void arch_setup_new_exec(void)
+{
+ /* If cpuid was previously disabled for this task, re-enable it. */
+ if (test_thread_flag(TIF_NOCPUID))
+ enable_cpuid();
+}
+
+static inline void switch_to_bitmap(struct thread_struct *prev,
+ struct thread_struct *next,
+ unsigned long tifp, unsigned long tifn)
+{
+ struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+
+ if (tifn & _TIF_IO_BITMAP) {
+ /*
+ * Copy the relevant range of the IO bitmap.
+ * Normally this is 128 bytes or less:
+ */
+ memcpy(tss->io_bitmap, next->io_bitmap_ptr,
+ max(prev->io_bitmap_max, next->io_bitmap_max));
+ /*
+ * Make sure that the TSS limit is correct for the CPU
+ * to notice the IO bitmap.
+ */
+ refresh_tss_limit();
+ } else if (tifp & _TIF_IO_BITMAP) {
+ /*
+ * Clear any possible leftover bits:
+ */
+ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
+ }
+}
+
+#ifdef CONFIG_SMP
+
+struct ssb_state {
+ struct ssb_state *shared_state;
+ raw_spinlock_t lock;
+ unsigned int disable_state;
+ unsigned long local_state;
+};
+
+#define LSTATE_SSB 0
+
+static DEFINE_PER_CPU(struct ssb_state, ssb_state);
+
+void speculative_store_bypass_ht_init(void)
+{
+ struct ssb_state *st = this_cpu_ptr(&ssb_state);
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ st->local_state = 0;
+
+ /*
+ * Shared state setup happens once on the first bringup
+ * of the CPU. It's not destroyed on CPU hotunplug.
+ */
+ if (st->shared_state)
+ return;
+
+ raw_spin_lock_init(&st->lock);
+
+ /*
+ * Go over HT siblings and check whether one of them has set up the
+ * shared state pointer already.
+ */
+ for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) {
+ if (cpu == this_cpu)
+ continue;
+
+ if (!per_cpu(ssb_state, cpu).shared_state)
+ continue;
+
+ /* Link it to the state of the sibling: */
+ st->shared_state = per_cpu(ssb_state, cpu).shared_state;
+ return;
+ }
+
+ /*
+ * First HT sibling to come up on the core. Link shared state of
+ * the first HT sibling to itself. The siblings on the same core
+ * which come up later will see the shared state pointer and link
+ * themself to the state of this CPU.
+ */
+ st->shared_state = st;
+}
+
+/*
+ * Logic is: First HT sibling enables SSBD for both siblings in the core
+ * and last sibling to disable it, disables it for the whole core. This how
+ * MSR_SPEC_CTRL works in "hardware":
+ *
+ * CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL
+ */
+static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
+{
+ struct ssb_state *st = this_cpu_ptr(&ssb_state);
+ u64 msr = x86_amd_ls_cfg_base;
+
+ if (!static_cpu_has(X86_FEATURE_ZEN)) {
+ msr |= ssbd_tif_to_amd_ls_cfg(tifn);
+ wrmsrl(MSR_AMD64_LS_CFG, msr);
+ return;
+ }
+
+ if (tifn & _TIF_SSBD) {
+ /*
+ * Since this can race with prctl(), block reentry on the
+ * same CPU.
+ */
+ if (__test_and_set_bit(LSTATE_SSB, &st->local_state))
+ return;
+
+ msr |= x86_amd_ls_cfg_ssbd_mask;
+
+ raw_spin_lock(&st->shared_state->lock);
+ /* First sibling enables SSBD: */
+ if (!st->shared_state->disable_state)
+ wrmsrl(MSR_AMD64_LS_CFG, msr);
+ st->shared_state->disable_state++;
+ raw_spin_unlock(&st->shared_state->lock);
+ } else {
+ if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state))
+ return;
+
+ raw_spin_lock(&st->shared_state->lock);
+ st->shared_state->disable_state--;
+ if (!st->shared_state->disable_state)
+ wrmsrl(MSR_AMD64_LS_CFG, msr);
+ raw_spin_unlock(&st->shared_state->lock);
+ }
+}
+#else
+static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
+{
+ u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn);
+
+ wrmsrl(MSR_AMD64_LS_CFG, msr);
+}
+#endif
+
+static __always_inline void amd_set_ssb_virt_state(unsigned long tifn)
+{
+ /*
+ * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL,
+ * so ssbd_tif_to_spec_ctrl() just works.
+ */
+ wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
+}
+
+/*
+ * Update the MSRs managing speculation control, during context switch.
+ *
+ * tifp: Previous task's thread flags
+ * tifn: Next task's thread flags
+ */
+static __always_inline void __speculation_ctrl_update(unsigned long tifp,
+ unsigned long tifn)
+{
+ unsigned long tif_diff = tifp ^ tifn;
+ u64 msr = x86_spec_ctrl_base;
+ bool updmsr = false;
+
+ lockdep_assert_irqs_disabled();
+
+ /* Handle change of TIF_SSBD depending on the mitigation method. */
+ if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
+ if (tif_diff & _TIF_SSBD)
+ amd_set_ssb_virt_state(tifn);
+ } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
+ if (tif_diff & _TIF_SSBD)
+ amd_set_core_ssb_state(tifn);
+ } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+ updmsr |= !!(tif_diff & _TIF_SSBD);
+ msr |= ssbd_tif_to_spec_ctrl(tifn);
+ }
+
+ /* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */
+ if (IS_ENABLED(CONFIG_SMP) &&
+ static_branch_unlikely(&switch_to_cond_stibp)) {
+ updmsr |= !!(tif_diff & _TIF_SPEC_IB);
+ msr |= stibp_tif_to_spec_ctrl(tifn);
+ }
+
+ if (updmsr)
+ wrmsrl(MSR_IA32_SPEC_CTRL, msr);
+}
+
+static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
+{
+ if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) {
+ if (task_spec_ssb_disable(tsk))
+ set_tsk_thread_flag(tsk, TIF_SSBD);
+ else
+ clear_tsk_thread_flag(tsk, TIF_SSBD);
+
+ if (task_spec_ib_disable(tsk))
+ set_tsk_thread_flag(tsk, TIF_SPEC_IB);
+ else
+ clear_tsk_thread_flag(tsk, TIF_SPEC_IB);
+ }
+ /* Return the updated threadinfo flags*/
+ return task_thread_info(tsk)->flags;
+}
+
+void speculation_ctrl_update(unsigned long tif)
+{
+ unsigned long flags;
+
+ /* Forced update. Make sure all relevant TIF flags are different */
+ local_irq_save(flags);
+ __speculation_ctrl_update(~tif, tif);
+ local_irq_restore(flags);
+}
+
+/* Called from seccomp/prctl update */
+void speculation_ctrl_update_current(void)
+{
+ preempt_disable();
+ speculation_ctrl_update(speculation_ctrl_update_tif(current));
+ preempt_enable();
+}
+
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
+{
+ struct thread_struct *prev, *next;
+ unsigned long tifp, tifn;
+
+ prev = &prev_p->thread;
+ next = &next_p->thread;
+
+ tifn = READ_ONCE(task_thread_info(next_p)->flags);
+ tifp = READ_ONCE(task_thread_info(prev_p)->flags);
+ switch_to_bitmap(prev, next, tifp, tifn);
+
+ propagate_user_return_notify(prev_p, next_p);
+
+ if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) &&
+ arch_has_block_step()) {
+ unsigned long debugctl, msk;
+
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ msk = tifn & _TIF_BLOCKSTEP;
+ debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT;
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ }
+
+ if ((tifp ^ tifn) & _TIF_NOTSC)
+ cr4_toggle_bits_irqsoff(X86_CR4_TSD);
+
+ if ((tifp ^ tifn) & _TIF_NOCPUID)
+ set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));
+
+ if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) {
+ __speculation_ctrl_update(tifp, tifn);
+ } else {
+ speculation_ctrl_update_tif(prev_p);
+ tifn = speculation_ctrl_update_tif(next_p);
+
+ /* Enforce MSR update to ensure consistent state */
+ __speculation_ctrl_update(~tifn, tifn);
+ }
+}
+
+/*
+ * Idle related variables and functions
+ */
+unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
+EXPORT_SYMBOL(boot_option_idle_override);
+
+static void (*x86_idle)(void);
+
+#ifndef CONFIG_SMP
+static inline void play_dead(void)
+{
+ BUG();
+}
+#endif
+
+void arch_cpu_idle_enter(void)
+{
+ tsc_verify_tsc_adjust(false);
+ local_touch_nmi();
+}
+
+void arch_cpu_idle_dead(void)
+{
+ play_dead();
+}
+
+/*
+ * Called from the generic idle code.
+ */
+void arch_cpu_idle(void)
+{
+ x86_idle();
+}
+
+/*
+ * We use this if we don't have any better idle routine..
+ */
+void __cpuidle default_idle(void)
+{
+ trace_cpu_idle_rcuidle(1, smp_processor_id());
+ safe_halt();
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(default_idle);
+#endif
+
+#ifdef CONFIG_XEN
+bool xen_set_default_idle(void)
+{
+ bool ret = !!x86_idle;
+
+ x86_idle = default_idle;
+
+ return ret;
+}
+#endif
+
+void stop_this_cpu(void *dummy)
+{
+ local_irq_disable();
+ /*
+ * Remove this CPU:
+ */
+ set_cpu_online(smp_processor_id(), false);
+ disable_local_APIC();
+ mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
+
+ /*
+ * Use wbinvd on processors that support SME. This provides support
+ * for performing a successful kexec when going from SME inactive
+ * to SME active (or vice-versa). The cache must be cleared so that
+ * if there are entries with the same physical address, both with and
+ * without the encryption bit, they don't race each other when flushed
+ * and potentially end up with the wrong entry being committed to
+ * memory.
+ */
+ if (boot_cpu_has(X86_FEATURE_SME))
+ native_wbinvd();
+ for (;;) {
+ /*
+ * Use native_halt() so that memory contents don't change
+ * (stack usage and variables) after possibly issuing the
+ * native_wbinvd() above.
+ */
+ native_halt();
+ }
+}
+
+/*
+ * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power
+ * states (local apic timer and TSC stop).
+ */
+static void amd_e400_idle(void)
+{
+ /*
+ * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E
+ * gets set after static_cpu_has() places have been converted via
+ * alternatives.
+ */
+ if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
+ default_idle();
+ return;
+ }
+
+ tick_broadcast_enter();
+
+ default_idle();
+
+ /*
+ * The switch back from broadcast mode needs to be called with
+ * interrupts disabled.
+ */
+ local_irq_disable();
+ tick_broadcast_exit();
+ local_irq_enable();
+}
+
+/*
+ * Intel Core2 and older machines prefer MWAIT over HALT for C1.
+ * We can't rely on cpuidle installing MWAIT, because it will not load
+ * on systems that support only C1 -- so the boot default must be MWAIT.
+ *
+ * Some AMD machines are the opposite, they depend on using HALT.
+ *
+ * So for default C1, which is used during boot until cpuidle loads,
+ * use MWAIT-C1 on Intel HW that has it, else use HALT.
+ */
+static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
+{
+ if (c->x86_vendor != X86_VENDOR_INTEL)
+ return 0;
+
+ if (!cpu_has(c, X86_FEATURE_MWAIT) || static_cpu_has_bug(X86_BUG_MONITOR))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
+ * with interrupts enabled and no flags, which is backwards compatible with the
+ * original MWAIT implementation.
+ */
+static __cpuidle void mwait_idle(void)
+{
+ if (!current_set_polling_and_test()) {
+ trace_cpu_idle_rcuidle(1, smp_processor_id());
+ if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
+ mb(); /* quirk */
+ clflush((void *)&current_thread_info()->flags);
+ mb(); /* quirk */
+ }
+
+ __monitor((void *)&current_thread_info()->flags, 0, 0);
+ if (!need_resched())
+ __sti_mwait(0, 0);
+ else
+ local_irq_enable();
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+ } else {
+ local_irq_enable();
+ }
+ __current_clr_polling();
+}
+
+void select_idle_routine(const struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
+ pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
+#endif
+ if (x86_idle || boot_option_idle_override == IDLE_POLL)
+ return;
+
+ if (boot_cpu_has_bug(X86_BUG_AMD_E400)) {
+ pr_info("using AMD E400 aware idle routine\n");
+ x86_idle = amd_e400_idle;
+ } else if (prefer_mwait_c1_over_halt(c)) {
+ pr_info("using mwait in idle threads\n");
+ x86_idle = mwait_idle;
+ } else
+ x86_idle = default_idle;
+}
+
+void amd_e400_c1e_apic_setup(void)
+{
+ if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
+ pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id());
+ local_irq_disable();
+ tick_broadcast_force();
+ local_irq_enable();
+ }
+}
+
+void __init arch_post_acpi_subsys_init(void)
+{
+ u32 lo, hi;
+
+ if (!boot_cpu_has_bug(X86_BUG_AMD_E400))
+ return;
+
+ /*
+ * AMD E400 detection needs to happen after ACPI has been enabled. If
+ * the machine is affected K8_INTP_C1E_ACTIVE_MASK bits are set in
+ * MSR_K8_INT_PENDING_MSG.
+ */
+ rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+ if (!(lo & K8_INTP_C1E_ACTIVE_MASK))
+ return;
+
+ boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E);
+
+ if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+ mark_tsc_unstable("TSC halt in AMD C1E");
+ pr_info("System has AMD C1E enabled\n");
+}
+
+static int __init idle_setup(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "poll")) {
+ pr_info("using polling idle threads\n");
+ boot_option_idle_override = IDLE_POLL;
+ cpu_idle_poll_ctrl(true);
+ } else if (!strcmp(str, "halt")) {
+ /*
+ * When the boot option of idle=halt is added, halt is
+ * forced to be used for CPU idle. In such case CPU C2/C3
+ * won't be used again.
+ * To continue to load the CPU idle driver, don't touch
+ * the boot_option_idle_override.
+ */
+ x86_idle = default_idle;
+ boot_option_idle_override = IDLE_HALT;
+ } else if (!strcmp(str, "nomwait")) {
+ /*
+ * If the boot option of "idle=nomwait" is added,
+ * it means that mwait will be disabled for CPU C2/C3
+ * states. In such case it won't touch the variable
+ * of boot_option_idle_override.
+ */
+ boot_option_idle_override = IDLE_NOMWAIT;
+ } else
+ return -1;
+
+ return 0;
+}
+early_param("idle", idle_setup);
+
+unsigned long arch_align_stack(unsigned long sp)
+{
+ if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+ sp -= get_random_int() % 8192;
+ return sp & ~0xf;
+}
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+ return randomize_page(mm->brk, 0x02000000);
+}
+
+/*
+ * Called from fs/proc with a reference on @p to find the function
+ * which called into schedule(). This needs to be done carefully
+ * because the task might wake up and we might look at a stack
+ * changing under us.
+ */
+unsigned long get_wchan(struct task_struct *p)
+{
+ unsigned long start, bottom, top, sp, fp, ip, ret = 0;
+ int count = 0;
+
+ if (!p || p == current || p->state == TASK_RUNNING)
+ return 0;
+
+ if (!try_get_task_stack(p))
+ return 0;
+
+ start = (unsigned long)task_stack_page(p);
+ if (!start)
+ goto out;
+
+ /*
+ * Layout of the stack page:
+ *
+ * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
+ * PADDING
+ * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
+ * stack
+ * ----------- bottom = start
+ *
+ * The tasks stack pointer points at the location where the
+ * framepointer is stored. The data on the stack is:
+ * ... IP FP ... IP FP
+ *
+ * We need to read FP and IP, so we need to adjust the upper
+ * bound by another unsigned long.
+ */
+ top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
+ top -= 2 * sizeof(unsigned long);
+ bottom = start;
+
+ sp = READ_ONCE(p->thread.sp);
+ if (sp < bottom || sp > top)
+ goto out;
+
+ fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp);
+ do {
+ if (fp < bottom || fp > top)
+ goto out;
+ ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
+ if (!in_sched_functions(ip)) {
+ ret = ip;
+ goto out;
+ }
+ fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
+ } while (count++ < 16 && p->state != TASK_RUNNING);
+
+out:
+ put_task_stack(p);
+ return ret;
+}
+
+long do_arch_prctl_common(struct task_struct *task, int option,
+ unsigned long cpuid_enabled)
+{
+ switch (option) {
+ case ARCH_GET_CPUID:
+ return get_cpuid_mode();
+ case ARCH_SET_CPUID:
+ return set_cpuid_mode(task, cpuid_enabled);
+ }
+
+ return -EINVAL;
+}
diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h
new file mode 100644
index 000000000..320ab978f
--- /dev/null
+++ b/arch/x86/kernel/process.h
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// Code shared between 32 and 64 bit
+
+#include <asm/spec-ctrl.h>
+
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p);
+
+/*
+ * This needs to be inline to optimize for the common case where no extra
+ * work needs to be done.
+ */
+static inline void switch_to_extra(struct task_struct *prev,
+ struct task_struct *next)
+{
+ unsigned long next_tif = task_thread_info(next)->flags;
+ unsigned long prev_tif = task_thread_info(prev)->flags;
+
+ if (IS_ENABLED(CONFIG_SMP)) {
+ /*
+ * Avoid __switch_to_xtra() invocation when conditional
+ * STIBP is disabled and the only different bit is
+ * TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not
+ * in the TIF_WORK_CTXSW masks.
+ */
+ if (!static_branch_likely(&switch_to_cond_stibp)) {
+ prev_tif &= ~_TIF_SPEC_IB;
+ next_tif &= ~_TIF_SPEC_IB;
+ }
+ }
+
+ /*
+ * __switch_to_xtra() handles debug registers, i/o bitmaps,
+ * speculation mitigations etc.
+ */
+ if (unlikely(next_tif & _TIF_WORK_CTXSW_NEXT ||
+ prev_tif & _TIF_WORK_CTXSW_PREV))
+ __switch_to_xtra(prev, next);
+}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
new file mode 100644
index 000000000..020efe0f9
--- /dev/null
+++ b/arch/x86/kernel/process_32.c
@@ -0,0 +1,316 @@
+/*
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+/*
+ * This file handles the architecture-dependent parts of process handling..
+ */
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/elfcore.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/user.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/mc146818rtc.h>
+#include <linux/export.h>
+#include <linux/kallsyms.h>
+#include <linux/ptrace.h>
+#include <linux/personality.h>
+#include <linux/percpu.h>
+#include <linux/prctl.h>
+#include <linux/ftrace.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/kdebug.h>
+#include <linux/syscalls.h>
+
+#include <asm/pgtable.h>
+#include <asm/ldt.h>
+#include <asm/processor.h>
+#include <asm/fpu/internal.h>
+#include <asm/desc.h>
+#ifdef CONFIG_MATH_EMULATION
+#include <asm/math_emu.h>
+#endif
+
+#include <linux/err.h>
+
+#include <asm/tlbflush.h>
+#include <asm/cpu.h>
+#include <asm/syscalls.h>
+#include <asm/debugreg.h>
+#include <asm/switch_to.h>
+#include <asm/vm86.h>
+#include <asm/intel_rdt_sched.h>
+#include <asm/proto.h>
+
+#include "process.h"
+
+void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
+{
+ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
+ unsigned long d0, d1, d2, d3, d6, d7;
+ unsigned long sp;
+ unsigned short ss, gs;
+
+ if (user_mode(regs)) {
+ sp = regs->sp;
+ ss = regs->ss;
+ gs = get_user_gs(regs);
+ } else {
+ sp = kernel_stack_pointer(regs);
+ savesegment(ss, ss);
+ savesegment(gs, gs);
+ }
+
+ show_ip(regs, KERN_DEFAULT);
+
+ printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
+ regs->ax, regs->bx, regs->cx, regs->dx);
+ printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
+ regs->si, regs->di, regs->bp, sp);
+ printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n",
+ (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss, regs->flags);
+
+ if (mode != SHOW_REGS_ALL)
+ return;
+
+ cr0 = read_cr0();
+ cr2 = read_cr2();
+ cr3 = __read_cr3();
+ cr4 = __read_cr4();
+ printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
+ cr0, cr2, cr3, cr4);
+
+ get_debugreg(d0, 0);
+ get_debugreg(d1, 1);
+ get_debugreg(d2, 2);
+ get_debugreg(d3, 3);
+ get_debugreg(d6, 6);
+ get_debugreg(d7, 7);
+
+ /* Only print out debug registers if they are in their non-default state. */
+ if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
+ (d6 == DR6_RESERVED) && (d7 == 0x400))
+ return;
+
+ printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
+ d0, d1, d2, d3);
+ printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n",
+ d6, d7);
+}
+
+void release_thread(struct task_struct *dead_task)
+{
+ BUG_ON(dead_task->mm);
+ release_vm86_irqs(dead_task);
+}
+
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+ unsigned long arg, struct task_struct *p, unsigned long tls)
+{
+ struct pt_regs *childregs = task_pt_regs(p);
+ struct fork_frame *fork_frame = container_of(childregs, struct fork_frame, regs);
+ struct inactive_task_frame *frame = &fork_frame->frame;
+ struct task_struct *tsk;
+ int err;
+
+ /*
+ * For a new task use the RESET flags value since there is no before.
+ * All the status flags are zero; DF and all the system flags must also
+ * be 0, specifically IF must be 0 because we context switch to the new
+ * task with interrupts disabled.
+ */
+ frame->flags = X86_EFLAGS_FIXED;
+ frame->bp = 0;
+ frame->ret_addr = (unsigned long) ret_from_fork;
+ p->thread.sp = (unsigned long) fork_frame;
+ p->thread.sp0 = (unsigned long) (childregs+1);
+ memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
+ if (unlikely(p->flags & PF_KTHREAD)) {
+ /* kernel thread */
+ memset(childregs, 0, sizeof(struct pt_regs));
+ frame->bx = sp; /* function */
+ frame->di = arg;
+ p->thread.io_bitmap_ptr = NULL;
+ return 0;
+ }
+ frame->bx = 0;
+ *childregs = *current_pt_regs();
+ childregs->ax = 0;
+ if (sp)
+ childregs->sp = sp;
+
+ task_user_gs(p) = get_user_gs(current_pt_regs());
+
+ p->thread.io_bitmap_ptr = NULL;
+ tsk = current;
+ err = -ENOMEM;
+
+ if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
+ p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
+ IO_BITMAP_BYTES, GFP_KERNEL);
+ if (!p->thread.io_bitmap_ptr) {
+ p->thread.io_bitmap_max = 0;
+ return -ENOMEM;
+ }
+ set_tsk_thread_flag(p, TIF_IO_BITMAP);
+ }
+
+ err = 0;
+
+ /*
+ * Set a new TLS for the child thread?
+ */
+ if (clone_flags & CLONE_SETTLS)
+ err = do_set_thread_area(p, -1,
+ (struct user_desc __user *)tls, 0);
+
+ if (err && p->thread.io_bitmap_ptr) {
+ kfree(p->thread.io_bitmap_ptr);
+ p->thread.io_bitmap_max = 0;
+ }
+ return err;
+}
+
+void
+start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
+{
+ set_user_gs(regs, 0);
+ regs->fs = 0;
+ regs->ds = __USER_DS;
+ regs->es = __USER_DS;
+ regs->ss = __USER_DS;
+ regs->cs = __USER_CS;
+ regs->ip = new_ip;
+ regs->sp = new_sp;
+ regs->flags = X86_EFLAGS_IF;
+ force_iret();
+}
+EXPORT_SYMBOL_GPL(start_thread);
+
+
+/*
+ * switch_to(x,y) should switch tasks from x to y.
+ *
+ * We fsave/fwait so that an exception goes off at the right time
+ * (as a call from the fsave or fwait in effect) rather than to
+ * the wrong process. Lazy FP saving no longer makes any sense
+ * with modern CPU's, and this simplifies a lot of things (SMP
+ * and UP become the same).
+ *
+ * NOTE! We used to use the x86 hardware context switching. The
+ * reason for not using it any more becomes apparent when you
+ * try to recover gracefully from saved state that is no longer
+ * valid (stale segment register values in particular). With the
+ * hardware task-switch, there is no way to fix up bad state in
+ * a reasonable manner.
+ *
+ * The fact that Intel documents the hardware task-switching to
+ * be slow is a fairly red herring - this code is not noticeably
+ * faster. However, there _is_ some room for improvement here,
+ * so the performance issues may eventually be a valid point.
+ * More important, however, is the fact that this allows us much
+ * more flexibility.
+ *
+ * The return value (in %ax) will be the "prev" task after
+ * the task-switch, and shows up in ret_from_fork in entry.S,
+ * for example.
+ */
+__visible __notrace_funcgraph struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+{
+ struct thread_struct *prev = &prev_p->thread,
+ *next = &next_p->thread;
+ struct fpu *prev_fpu = &prev->fpu;
+ struct fpu *next_fpu = &next->fpu;
+ int cpu = smp_processor_id();
+
+ /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
+
+ switch_fpu_prepare(prev_fpu, cpu);
+
+ /*
+ * Save away %gs. No need to save %fs, as it was saved on the
+ * stack on entry. No need to save %es and %ds, as those are
+ * always kernel segments while inside the kernel. Doing this
+ * before setting the new TLS descriptors avoids the situation
+ * where we temporarily have non-reloadable segments in %fs
+ * and %gs. This could be an issue if the NMI handler ever
+ * used %fs or %gs (it does not today), or if the kernel is
+ * running inside of a hypervisor layer.
+ */
+ lazy_save_gs(prev->gs);
+
+ /*
+ * Load the per-thread Thread-Local Storage descriptor.
+ */
+ load_TLS(next, cpu);
+
+ /*
+ * Restore IOPL if needed. In normal use, the flags restore
+ * in the switch assembly will handle this. But if the kernel
+ * is running virtualized at a non-zero CPL, the popf will
+ * not restore flags, so it must be done in a separate step.
+ */
+ if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
+ set_iopl_mask(next->iopl);
+
+ switch_to_extra(prev_p, next_p);
+
+ /*
+ * Leave lazy mode, flushing any hypercalls made here.
+ * This must be done before restoring TLS segments so
+ * the GDT and LDT are properly updated, and must be
+ * done before fpu__restore(), so the TS bit is up
+ * to date.
+ */
+ arch_end_context_switch(next_p);
+
+ /*
+ * Reload esp0 and cpu_current_top_of_stack. This changes
+ * current_thread_info(). Refresh the SYSENTER configuration in
+ * case prev or next is vm86.
+ */
+ update_task_stack(next_p);
+ refresh_sysenter_cs(next);
+ this_cpu_write(cpu_current_top_of_stack,
+ (unsigned long)task_stack_page(next_p) +
+ THREAD_SIZE);
+
+ /*
+ * Restore %gs if needed (which is common)
+ */
+ if (prev->gs | next->gs)
+ lazy_load_gs(next->gs);
+
+ switch_fpu_finish(next_fpu, cpu);
+
+ this_cpu_write(current_task, next_p);
+
+ /* Load the Intel cache allocation PQR MSR. */
+ intel_rdt_sched_in();
+
+ return prev_p;
+}
+
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+ return do_arch_prctl_common(current, option, arg2);
+}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
new file mode 100644
index 000000000..59f71d0f2
--- /dev/null
+++ b/arch/x86/kernel/process_64.c
@@ -0,0 +1,728 @@
+/*
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ *
+ * X86-64 port
+ * Andi Kleen.
+ *
+ * CPU hotplug support - ashok.raj@intel.com
+ */
+
+/*
+ * This file handles the architecture-dependent parts of process handling..
+ */
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/elfcore.h>
+#include <linux/smp.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/ptrace.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/prctl.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/ftrace.h>
+#include <linux/syscalls.h>
+
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/fpu/internal.h>
+#include <asm/mmu_context.h>
+#include <asm/prctl.h>
+#include <asm/desc.h>
+#include <asm/proto.h>
+#include <asm/ia32.h>
+#include <asm/syscalls.h>
+#include <asm/debugreg.h>
+#include <asm/switch_to.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/vdso.h>
+#include <asm/intel_rdt_sched.h>
+#include <asm/unistd.h>
+#ifdef CONFIG_IA32_EMULATION
+/* Not included via unistd.h */
+#include <asm/unistd_32_ia32.h>
+#endif
+
+#include "process.h"
+
+__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
+
+/* Prints also some state that isn't saved in the pt_regs */
+void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
+{
+ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
+ unsigned long d0, d1, d2, d3, d6, d7;
+ unsigned int fsindex, gsindex;
+ unsigned int ds, cs, es;
+
+ show_iret_regs(regs);
+
+ if (regs->orig_ax != -1)
+ pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
+ else
+ pr_cont("\n");
+
+ printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
+ regs->ax, regs->bx, regs->cx);
+ printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
+ regs->dx, regs->si, regs->di);
+ printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
+ regs->bp, regs->r8, regs->r9);
+ printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
+ regs->r10, regs->r11, regs->r12);
+ printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
+ regs->r13, regs->r14, regs->r15);
+
+ if (mode == SHOW_REGS_SHORT)
+ return;
+
+ if (mode == SHOW_REGS_USER) {
+ rdmsrl(MSR_FS_BASE, fs);
+ rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
+ printk(KERN_DEFAULT "FS: %016lx GS: %016lx\n",
+ fs, shadowgs);
+ return;
+ }
+
+ asm("movl %%ds,%0" : "=r" (ds));
+ asm("movl %%cs,%0" : "=r" (cs));
+ asm("movl %%es,%0" : "=r" (es));
+ asm("movl %%fs,%0" : "=r" (fsindex));
+ asm("movl %%gs,%0" : "=r" (gsindex));
+
+ rdmsrl(MSR_FS_BASE, fs);
+ rdmsrl(MSR_GS_BASE, gs);
+ rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
+
+ cr0 = read_cr0();
+ cr2 = read_cr2();
+ cr3 = __read_cr3();
+ cr4 = __read_cr4();
+
+ printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
+ fs, fsindex, gs, gsindex, shadowgs);
+ printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
+ es, cr0);
+ printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
+ cr4);
+
+ get_debugreg(d0, 0);
+ get_debugreg(d1, 1);
+ get_debugreg(d2, 2);
+ get_debugreg(d3, 3);
+ get_debugreg(d6, 6);
+ get_debugreg(d7, 7);
+
+ /* Only print out debug registers if they are in their non-default state. */
+ if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
+ (d6 == DR6_RESERVED) && (d7 == 0x400))) {
+ printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n",
+ d0, d1, d2);
+ printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n",
+ d3, d6, d7);
+ }
+
+ if (boot_cpu_has(X86_FEATURE_OSPKE))
+ printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
+}
+
+void release_thread(struct task_struct *dead_task)
+{
+ if (dead_task->mm) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ if (dead_task->mm->context.ldt) {
+ pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
+ dead_task->comm,
+ dead_task->mm->context.ldt->entries,
+ dead_task->mm->context.ldt->nr_entries);
+ BUG();
+ }
+#endif
+ }
+}
+
+enum which_selector {
+ FS,
+ GS
+};
+
+/*
+ * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
+ * not available. The goal is to be reasonably fast on non-FSGSBASE systems.
+ * It's forcibly inlined because it'll generate better code and this function
+ * is hot.
+ */
+static __always_inline void save_base_legacy(struct task_struct *prev_p,
+ unsigned short selector,
+ enum which_selector which)
+{
+ if (likely(selector == 0)) {
+ /*
+ * On Intel (without X86_BUG_NULL_SEG), the segment base could
+ * be the pre-existing saved base or it could be zero. On AMD
+ * (with X86_BUG_NULL_SEG), the segment base could be almost
+ * anything.
+ *
+ * This branch is very hot (it's hit twice on almost every
+ * context switch between 64-bit programs), and avoiding
+ * the RDMSR helps a lot, so we just assume that whatever
+ * value is already saved is correct. This matches historical
+ * Linux behavior, so it won't break existing applications.
+ *
+ * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
+ * report that the base is zero, it needs to actually be zero:
+ * see the corresponding logic in load_seg_legacy.
+ */
+ } else {
+ /*
+ * If the selector is 1, 2, or 3, then the base is zero on
+ * !X86_BUG_NULL_SEG CPUs and could be anything on
+ * X86_BUG_NULL_SEG CPUs. In the latter case, Linux
+ * has never attempted to preserve the base across context
+ * switches.
+ *
+ * If selector > 3, then it refers to a real segment, and
+ * saving the base isn't necessary.
+ */
+ if (which == FS)
+ prev_p->thread.fsbase = 0;
+ else
+ prev_p->thread.gsbase = 0;
+ }
+}
+
+static __always_inline void save_fsgs(struct task_struct *task)
+{
+ savesegment(fs, task->thread.fsindex);
+ savesegment(gs, task->thread.gsindex);
+ save_base_legacy(task, task->thread.fsindex, FS);
+ save_base_legacy(task, task->thread.gsindex, GS);
+}
+
+#if IS_ENABLED(CONFIG_KVM)
+/*
+ * While a process is running,current->thread.fsbase and current->thread.gsbase
+ * may not match the corresponding CPU registers (see save_base_legacy()). KVM
+ * wants an efficient way to save and restore FSBASE and GSBASE.
+ * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE.
+ */
+void save_fsgs_for_kvm(void)
+{
+ save_fsgs(current);
+}
+EXPORT_SYMBOL_GPL(save_fsgs_for_kvm);
+#endif
+
+static __always_inline void loadseg(enum which_selector which,
+ unsigned short sel)
+{
+ if (which == FS)
+ loadsegment(fs, sel);
+ else
+ load_gs_index(sel);
+}
+
+static __always_inline void load_seg_legacy(unsigned short prev_index,
+ unsigned long prev_base,
+ unsigned short next_index,
+ unsigned long next_base,
+ enum which_selector which)
+{
+ if (likely(next_index <= 3)) {
+ /*
+ * The next task is using 64-bit TLS, is not using this
+ * segment at all, or is having fun with arcane CPU features.
+ */
+ if (next_base == 0) {
+ /*
+ * Nasty case: on AMD CPUs, we need to forcibly zero
+ * the base.
+ */
+ if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
+ loadseg(which, __USER_DS);
+ loadseg(which, next_index);
+ } else {
+ /*
+ * We could try to exhaustively detect cases
+ * under which we can skip the segment load,
+ * but there's really only one case that matters
+ * for performance: if both the previous and
+ * next states are fully zeroed, we can skip
+ * the load.
+ *
+ * (This assumes that prev_base == 0 has no
+ * false positives. This is the case on
+ * Intel-style CPUs.)
+ */
+ if (likely(prev_index | next_index | prev_base))
+ loadseg(which, next_index);
+ }
+ } else {
+ if (prev_index != next_index)
+ loadseg(which, next_index);
+ wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
+ next_base);
+ }
+ } else {
+ /*
+ * The next task is using a real segment. Loading the selector
+ * is sufficient.
+ */
+ loadseg(which, next_index);
+ }
+}
+
+int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+ unsigned long arg, struct task_struct *p, unsigned long tls)
+{
+ int err;
+ struct pt_regs *childregs;
+ struct fork_frame *fork_frame;
+ struct inactive_task_frame *frame;
+ struct task_struct *me = current;
+
+ childregs = task_pt_regs(p);
+ fork_frame = container_of(childregs, struct fork_frame, regs);
+ frame = &fork_frame->frame;
+
+ /*
+ * For a new task use the RESET flags value since there is no before.
+ * All the status flags are zero; DF and all the system flags must also
+ * be 0, specifically IF must be 0 because we context switch to the new
+ * task with interrupts disabled.
+ */
+ frame->flags = X86_EFLAGS_FIXED;
+ frame->bp = 0;
+ frame->ret_addr = (unsigned long) ret_from_fork;
+ p->thread.sp = (unsigned long) fork_frame;
+ p->thread.io_bitmap_ptr = NULL;
+
+ savesegment(gs, p->thread.gsindex);
+ p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase;
+ savesegment(fs, p->thread.fsindex);
+ p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase;
+ savesegment(es, p->thread.es);
+ savesegment(ds, p->thread.ds);
+ memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
+ if (unlikely(p->flags & PF_KTHREAD)) {
+ /* kernel thread */
+ memset(childregs, 0, sizeof(struct pt_regs));
+ frame->bx = sp; /* function */
+ frame->r12 = arg;
+ return 0;
+ }
+ frame->bx = 0;
+ *childregs = *current_pt_regs();
+
+ childregs->ax = 0;
+ if (sp)
+ childregs->sp = sp;
+
+ err = -ENOMEM;
+ if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
+ p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
+ IO_BITMAP_BYTES, GFP_KERNEL);
+ if (!p->thread.io_bitmap_ptr) {
+ p->thread.io_bitmap_max = 0;
+ return -ENOMEM;
+ }
+ set_tsk_thread_flag(p, TIF_IO_BITMAP);
+ }
+
+ /*
+ * Set a new TLS for the child thread?
+ */
+ if (clone_flags & CLONE_SETTLS) {
+#ifdef CONFIG_IA32_EMULATION
+ if (in_ia32_syscall())
+ err = do_set_thread_area(p, -1,
+ (struct user_desc __user *)tls, 0);
+ else
+#endif
+ err = do_arch_prctl_64(p, ARCH_SET_FS, tls);
+ if (err)
+ goto out;
+ }
+ err = 0;
+out:
+ if (err && p->thread.io_bitmap_ptr) {
+ kfree(p->thread.io_bitmap_ptr);
+ p->thread.io_bitmap_max = 0;
+ }
+
+ return err;
+}
+
+static void
+start_thread_common(struct pt_regs *regs, unsigned long new_ip,
+ unsigned long new_sp,
+ unsigned int _cs, unsigned int _ss, unsigned int _ds)
+{
+ WARN_ON_ONCE(regs != current_pt_regs());
+
+ if (static_cpu_has(X86_BUG_NULL_SEG)) {
+ /* Loading zero below won't clear the base. */
+ loadsegment(fs, __USER_DS);
+ load_gs_index(__USER_DS);
+ }
+
+ loadsegment(fs, 0);
+ loadsegment(es, _ds);
+ loadsegment(ds, _ds);
+ load_gs_index(0);
+
+ regs->ip = new_ip;
+ regs->sp = new_sp;
+ regs->cs = _cs;
+ regs->ss = _ss;
+ regs->flags = X86_EFLAGS_IF;
+ force_iret();
+}
+
+void
+start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
+{
+ start_thread_common(regs, new_ip, new_sp,
+ __USER_CS, __USER_DS, 0);
+}
+EXPORT_SYMBOL_GPL(start_thread);
+
+#ifdef CONFIG_COMPAT
+void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
+{
+ start_thread_common(regs, new_ip, new_sp,
+ test_thread_flag(TIF_X32)
+ ? __USER_CS : __USER32_CS,
+ __USER_DS, __USER_DS);
+}
+#endif
+
+/*
+ * switch_to(x,y) should switch tasks from x to y.
+ *
+ * This could still be optimized:
+ * - fold all the options into a flag word and test it with a single test.
+ * - could test fs/gs bitsliced
+ *
+ * Kprobes not supported here. Set the probe on schedule instead.
+ * Function graph tracer not supported too.
+ */
+__visible __notrace_funcgraph struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+{
+ struct thread_struct *prev = &prev_p->thread;
+ struct thread_struct *next = &next_p->thread;
+ struct fpu *prev_fpu = &prev->fpu;
+ struct fpu *next_fpu = &next->fpu;
+ int cpu = smp_processor_id();
+
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
+ this_cpu_read(irq_count) != -1);
+
+ switch_fpu_prepare(prev_fpu, cpu);
+
+ /* We must save %fs and %gs before load_TLS() because
+ * %fs and %gs may be cleared by load_TLS().
+ *
+ * (e.g. xen_load_tls())
+ */
+ save_fsgs(prev_p);
+
+ /*
+ * Load TLS before restoring any segments so that segment loads
+ * reference the correct GDT entries.
+ */
+ load_TLS(next, cpu);
+
+ /*
+ * Leave lazy mode, flushing any hypercalls made here. This
+ * must be done after loading TLS entries in the GDT but before
+ * loading segments that might reference them, and and it must
+ * be done before fpu__restore(), so the TS bit is up to
+ * date.
+ */
+ arch_end_context_switch(next_p);
+
+ /* Switch DS and ES.
+ *
+ * Reading them only returns the selectors, but writing them (if
+ * nonzero) loads the full descriptor from the GDT or LDT. The
+ * LDT for next is loaded in switch_mm, and the GDT is loaded
+ * above.
+ *
+ * We therefore need to write new values to the segment
+ * registers on every context switch unless both the new and old
+ * values are zero.
+ *
+ * Note that we don't need to do anything for CS and SS, as
+ * those are saved and restored as part of pt_regs.
+ */
+ savesegment(es, prev->es);
+ if (unlikely(next->es | prev->es))
+ loadsegment(es, next->es);
+
+ savesegment(ds, prev->ds);
+ if (unlikely(next->ds | prev->ds))
+ loadsegment(ds, next->ds);
+
+ load_seg_legacy(prev->fsindex, prev->fsbase,
+ next->fsindex, next->fsbase, FS);
+ load_seg_legacy(prev->gsindex, prev->gsbase,
+ next->gsindex, next->gsbase, GS);
+
+ switch_fpu_finish(next_fpu, cpu);
+
+ /*
+ * Switch the PDA and FPU contexts.
+ */
+ this_cpu_write(current_task, next_p);
+ this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
+
+ /* Reload sp0. */
+ update_task_stack(next_p);
+
+ switch_to_extra(prev_p, next_p);
+
+#ifdef CONFIG_XEN_PV
+ /*
+ * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
+ * current_pt_regs()->flags may not match the current task's
+ * intended IOPL. We need to switch it manually.
+ */
+ if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
+ prev->iopl != next->iopl))
+ xen_set_iopl_mask(next->iopl);
+#endif
+
+ if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
+ /*
+ * AMD CPUs have a misfeature: SYSRET sets the SS selector but
+ * does not update the cached descriptor. As a result, if we
+ * do SYSRET while SS is NULL, we'll end up in user mode with
+ * SS apparently equal to __USER_DS but actually unusable.
+ *
+ * The straightforward workaround would be to fix it up just
+ * before SYSRET, but that would slow down the system call
+ * fast paths. Instead, we ensure that SS is never NULL in
+ * system call context. We do this by replacing NULL SS
+ * selectors at every context switch. SYSCALL sets up a valid
+ * SS, so the only way to get NULL is to re-enter the kernel
+ * from CPL 3 through an interrupt. Since that can't happen
+ * in the same task as a running syscall, we are guaranteed to
+ * context switch between every interrupt vector entry and a
+ * subsequent SYSRET.
+ *
+ * We read SS first because SS reads are much faster than
+ * writes. Out of caution, we force SS to __KERNEL_DS even if
+ * it previously had a different non-NULL value.
+ */
+ unsigned short ss_sel;
+ savesegment(ss, ss_sel);
+ if (ss_sel != __KERNEL_DS)
+ loadsegment(ss, __KERNEL_DS);
+ }
+
+ /* Load the Intel cache allocation PQR MSR. */
+ intel_rdt_sched_in();
+
+ return prev_p;
+}
+
+void set_personality_64bit(void)
+{
+ /* inherit personality from parent */
+
+ /* Make sure to be in 64bit mode */
+ clear_thread_flag(TIF_IA32);
+ clear_thread_flag(TIF_ADDR32);
+ clear_thread_flag(TIF_X32);
+ /* Pretend that this comes from a 64bit execve */
+ task_pt_regs(current)->orig_ax = __NR_execve;
+ current_thread_info()->status &= ~TS_COMPAT;
+
+ /* Ensure the corresponding mm is not marked. */
+ if (current->mm)
+ current->mm->context.ia32_compat = 0;
+
+ /* TBD: overwrites user setup. Should have two bits.
+ But 64bit processes have always behaved this way,
+ so it's not too bad. The main problem is just that
+ 32bit childs are affected again. */
+ current->personality &= ~READ_IMPLIES_EXEC;
+}
+
+static void __set_personality_x32(void)
+{
+#ifdef CONFIG_X86_X32
+ clear_thread_flag(TIF_IA32);
+ set_thread_flag(TIF_X32);
+ if (current->mm)
+ current->mm->context.ia32_compat = TIF_X32;
+ current->personality &= ~READ_IMPLIES_EXEC;
+ /*
+ * in_compat_syscall() uses the presence of the x32 syscall bit
+ * flag to determine compat status. The x86 mmap() code relies on
+ * the syscall bitness so set x32 syscall bit right here to make
+ * in_compat_syscall() work during exec().
+ *
+ * Pretend to come from a x32 execve.
+ */
+ task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
+ current_thread_info()->status &= ~TS_COMPAT;
+#endif
+}
+
+static void __set_personality_ia32(void)
+{
+#ifdef CONFIG_IA32_EMULATION
+ set_thread_flag(TIF_IA32);
+ clear_thread_flag(TIF_X32);
+ if (current->mm)
+ current->mm->context.ia32_compat = TIF_IA32;
+ current->personality |= force_personality32;
+ /* Prepare the first "return" to user space */
+ task_pt_regs(current)->orig_ax = __NR_ia32_execve;
+ current_thread_info()->status |= TS_COMPAT;
+#endif
+}
+
+void set_personality_ia32(bool x32)
+{
+ /* Make sure to be in 32bit mode */
+ set_thread_flag(TIF_ADDR32);
+
+ if (x32)
+ __set_personality_x32();
+ else
+ __set_personality_ia32();
+}
+EXPORT_SYMBOL_GPL(set_personality_ia32);
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
+{
+ int ret;
+
+ ret = map_vdso_once(image, addr);
+ if (ret)
+ return ret;
+
+ return (long)image->size;
+}
+#endif
+
+long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
+{
+ int ret = 0;
+ int doit = task == current;
+ int cpu;
+
+ switch (option) {
+ case ARCH_SET_GS:
+ if (arg2 >= TASK_SIZE_MAX)
+ return -EPERM;
+ cpu = get_cpu();
+ task->thread.gsindex = 0;
+ task->thread.gsbase = arg2;
+ if (doit) {
+ load_gs_index(0);
+ ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2);
+ }
+ put_cpu();
+ break;
+ case ARCH_SET_FS:
+ /* Not strictly needed for fs, but do it for symmetry
+ with gs */
+ if (arg2 >= TASK_SIZE_MAX)
+ return -EPERM;
+ cpu = get_cpu();
+ task->thread.fsindex = 0;
+ task->thread.fsbase = arg2;
+ if (doit) {
+ /* set the selector to 0 to not confuse __switch_to */
+ loadsegment(fs, 0);
+ ret = wrmsrl_safe(MSR_FS_BASE, arg2);
+ }
+ put_cpu();
+ break;
+ case ARCH_GET_FS: {
+ unsigned long base;
+
+ if (doit)
+ rdmsrl(MSR_FS_BASE, base);
+ else
+ base = task->thread.fsbase;
+ ret = put_user(base, (unsigned long __user *)arg2);
+ break;
+ }
+ case ARCH_GET_GS: {
+ unsigned long base;
+
+ if (doit)
+ rdmsrl(MSR_KERNEL_GS_BASE, base);
+ else
+ base = task->thread.gsbase;
+ ret = put_user(base, (unsigned long __user *)arg2);
+ break;
+ }
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+# ifdef CONFIG_X86_X32_ABI
+ case ARCH_MAP_VDSO_X32:
+ return prctl_map_vdso(&vdso_image_x32, arg2);
+# endif
+# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+ case ARCH_MAP_VDSO_32:
+ return prctl_map_vdso(&vdso_image_32, arg2);
+# endif
+ case ARCH_MAP_VDSO_64:
+ return prctl_map_vdso(&vdso_image_64, arg2);
+#endif
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+ long ret;
+
+ ret = do_arch_prctl_64(current, option, arg2);
+ if (ret == -EINVAL)
+ ret = do_arch_prctl_common(current, option, arg2);
+
+ return ret;
+}
+
+#ifdef CONFIG_IA32_EMULATION
+COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+ return do_arch_prctl_common(current, option, arg2);
+}
+#endif
+
+unsigned long KSTK_ESP(struct task_struct *task)
+{
+ return task_pt_regs(task)->sp;
+}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
new file mode 100644
index 000000000..1401f86e4
--- /dev/null
+++ b/arch/x86/kernel/ptrace.c
@@ -0,0 +1,1445 @@
+/* By Ross Biro 1/23/92 */
+/*
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/ptrace.h>
+#include <linux/tracehook.h>
+#include <linux/user.h>
+#include <linux/elf.h>
+#include <linux/security.h>
+#include <linux/audit.h>
+#include <linux/seccomp.h>
+#include <linux/signal.h>
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/rcupdate.h>
+#include <linux/export.h>
+#include <linux/context_tracking.h>
+#include <linux/nospec.h>
+
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+#include <asm/debugreg.h>
+#include <asm/ldt.h>
+#include <asm/desc.h>
+#include <asm/prctl.h>
+#include <asm/proto.h>
+#include <asm/hw_breakpoint.h>
+#include <asm/traps.h>
+#include <asm/syscall.h>
+#include <asm/mmu_context.h>
+
+#include "tls.h"
+
+enum x86_regset {
+ REGSET_GENERAL,
+ REGSET_FP,
+ REGSET_XFP,
+ REGSET_IOPERM64 = REGSET_XFP,
+ REGSET_XSTATE,
+ REGSET_TLS,
+ REGSET_IOPERM32,
+};
+
+struct pt_regs_offset {
+ const char *name;
+ int offset;
+};
+
+#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
+#define REG_OFFSET_END {.name = NULL, .offset = 0}
+
+static const struct pt_regs_offset regoffset_table[] = {
+#ifdef CONFIG_X86_64
+ REG_OFFSET_NAME(r15),
+ REG_OFFSET_NAME(r14),
+ REG_OFFSET_NAME(r13),
+ REG_OFFSET_NAME(r12),
+ REG_OFFSET_NAME(r11),
+ REG_OFFSET_NAME(r10),
+ REG_OFFSET_NAME(r9),
+ REG_OFFSET_NAME(r8),
+#endif
+ REG_OFFSET_NAME(bx),
+ REG_OFFSET_NAME(cx),
+ REG_OFFSET_NAME(dx),
+ REG_OFFSET_NAME(si),
+ REG_OFFSET_NAME(di),
+ REG_OFFSET_NAME(bp),
+ REG_OFFSET_NAME(ax),
+#ifdef CONFIG_X86_32
+ REG_OFFSET_NAME(ds),
+ REG_OFFSET_NAME(es),
+ REG_OFFSET_NAME(fs),
+ REG_OFFSET_NAME(gs),
+#endif
+ REG_OFFSET_NAME(orig_ax),
+ REG_OFFSET_NAME(ip),
+ REG_OFFSET_NAME(cs),
+ REG_OFFSET_NAME(flags),
+ REG_OFFSET_NAME(sp),
+ REG_OFFSET_NAME(ss),
+ REG_OFFSET_END,
+};
+
+/**
+ * regs_query_register_offset() - query register offset from its name
+ * @name: the name of a register
+ *
+ * regs_query_register_offset() returns the offset of a register in struct
+ * pt_regs from its name. If the name is invalid, this returns -EINVAL;
+ */
+int regs_query_register_offset(const char *name)
+{
+ const struct pt_regs_offset *roff;
+ for (roff = regoffset_table; roff->name != NULL; roff++)
+ if (!strcmp(roff->name, name))
+ return roff->offset;
+ return -EINVAL;
+}
+
+/**
+ * regs_query_register_name() - query register name from its offset
+ * @offset: the offset of a register in struct pt_regs.
+ *
+ * regs_query_register_name() returns the name of a register from its
+ * offset in struct pt_regs. If the @offset is invalid, this returns NULL;
+ */
+const char *regs_query_register_name(unsigned int offset)
+{
+ const struct pt_regs_offset *roff;
+ for (roff = regoffset_table; roff->name != NULL; roff++)
+ if (roff->offset == offset)
+ return roff->name;
+ return NULL;
+}
+
+/*
+ * does not yet catch signals sent when the child dies.
+ * in exit.c or in signal.c.
+ */
+
+/*
+ * Determines which flags the user has access to [1 = access, 0 = no access].
+ */
+#define FLAG_MASK_32 ((unsigned long) \
+ (X86_EFLAGS_CF | X86_EFLAGS_PF | \
+ X86_EFLAGS_AF | X86_EFLAGS_ZF | \
+ X86_EFLAGS_SF | X86_EFLAGS_TF | \
+ X86_EFLAGS_DF | X86_EFLAGS_OF | \
+ X86_EFLAGS_RF | X86_EFLAGS_AC))
+
+/*
+ * Determines whether a value may be installed in a segment register.
+ */
+static inline bool invalid_selector(u16 value)
+{
+ return unlikely(value != 0 && (value & SEGMENT_RPL_MASK) != USER_RPL);
+}
+
+#ifdef CONFIG_X86_32
+
+#define FLAG_MASK FLAG_MASK_32
+
+/*
+ * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
+ * when it traps. The previous stack will be directly underneath the saved
+ * registers, and 'sp/ss' won't even have been saved. Thus the '&regs->sp'.
+ *
+ * Now, if the stack is empty, '&regs->sp' is out of range. In this
+ * case we try to take the previous stack. To always return a non-null
+ * stack pointer we fall back to regs as stack if no previous stack
+ * exists.
+ *
+ * This is valid only for kernel mode traps.
+ */
+unsigned long kernel_stack_pointer(struct pt_regs *regs)
+{
+ unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1);
+ unsigned long sp = (unsigned long)&regs->sp;
+ u32 *prev_esp;
+
+ if (context == (sp & ~(THREAD_SIZE - 1)))
+ return sp;
+
+ prev_esp = (u32 *)(context);
+ if (*prev_esp)
+ return (unsigned long)*prev_esp;
+
+ return (unsigned long)regs;
+}
+EXPORT_SYMBOL_GPL(kernel_stack_pointer);
+
+static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
+{
+ BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
+ return &regs->bx + (regno >> 2);
+}
+
+static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
+{
+ /*
+ * Returning the value truncates it to 16 bits.
+ */
+ unsigned int retval;
+ if (offset != offsetof(struct user_regs_struct, gs))
+ retval = *pt_regs_access(task_pt_regs(task), offset);
+ else {
+ if (task == current)
+ retval = get_user_gs(task_pt_regs(task));
+ else
+ retval = task_user_gs(task);
+ }
+ return retval;
+}
+
+static int set_segment_reg(struct task_struct *task,
+ unsigned long offset, u16 value)
+{
+ /*
+ * The value argument was already truncated to 16 bits.
+ */
+ if (invalid_selector(value))
+ return -EIO;
+
+ /*
+ * For %cs and %ss we cannot permit a null selector.
+ * We can permit a bogus selector as long as it has USER_RPL.
+ * Null selectors are fine for other segment registers, but
+ * we will never get back to user mode with invalid %cs or %ss
+ * and will take the trap in iret instead. Much code relies
+ * on user_mode() to distinguish a user trap frame (which can
+ * safely use invalid selectors) from a kernel trap frame.
+ */
+ switch (offset) {
+ case offsetof(struct user_regs_struct, cs):
+ case offsetof(struct user_regs_struct, ss):
+ if (unlikely(value == 0))
+ return -EIO;
+
+ default:
+ *pt_regs_access(task_pt_regs(task), offset) = value;
+ break;
+
+ case offsetof(struct user_regs_struct, gs):
+ if (task == current)
+ set_user_gs(task_pt_regs(task), value);
+ else
+ task_user_gs(task) = value;
+ }
+
+ return 0;
+}
+
+#else /* CONFIG_X86_64 */
+
+#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT)
+
+static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long offset)
+{
+ BUILD_BUG_ON(offsetof(struct pt_regs, r15) != 0);
+ return &regs->r15 + (offset / sizeof(regs->r15));
+}
+
+static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
+{
+ /*
+ * Returning the value truncates it to 16 bits.
+ */
+ unsigned int seg;
+
+ switch (offset) {
+ case offsetof(struct user_regs_struct, fs):
+ if (task == current) {
+ /* Older gas can't assemble movq %?s,%r?? */
+ asm("movl %%fs,%0" : "=r" (seg));
+ return seg;
+ }
+ return task->thread.fsindex;
+ case offsetof(struct user_regs_struct, gs):
+ if (task == current) {
+ asm("movl %%gs,%0" : "=r" (seg));
+ return seg;
+ }
+ return task->thread.gsindex;
+ case offsetof(struct user_regs_struct, ds):
+ if (task == current) {
+ asm("movl %%ds,%0" : "=r" (seg));
+ return seg;
+ }
+ return task->thread.ds;
+ case offsetof(struct user_regs_struct, es):
+ if (task == current) {
+ asm("movl %%es,%0" : "=r" (seg));
+ return seg;
+ }
+ return task->thread.es;
+
+ case offsetof(struct user_regs_struct, cs):
+ case offsetof(struct user_regs_struct, ss):
+ break;
+ }
+ return *pt_regs_access(task_pt_regs(task), offset);
+}
+
+static int set_segment_reg(struct task_struct *task,
+ unsigned long offset, u16 value)
+{
+ /*
+ * The value argument was already truncated to 16 bits.
+ */
+ if (invalid_selector(value))
+ return -EIO;
+
+ switch (offset) {
+ case offsetof(struct user_regs_struct,fs):
+ task->thread.fsindex = value;
+ if (task == current)
+ loadsegment(fs, task->thread.fsindex);
+ break;
+ case offsetof(struct user_regs_struct,gs):
+ task->thread.gsindex = value;
+ if (task == current)
+ load_gs_index(task->thread.gsindex);
+ break;
+ case offsetof(struct user_regs_struct,ds):
+ task->thread.ds = value;
+ if (task == current)
+ loadsegment(ds, task->thread.ds);
+ break;
+ case offsetof(struct user_regs_struct,es):
+ task->thread.es = value;
+ if (task == current)
+ loadsegment(es, task->thread.es);
+ break;
+
+ /*
+ * Can't actually change these in 64-bit mode.
+ */
+ case offsetof(struct user_regs_struct,cs):
+ if (unlikely(value == 0))
+ return -EIO;
+ task_pt_regs(task)->cs = value;
+ break;
+ case offsetof(struct user_regs_struct,ss):
+ if (unlikely(value == 0))
+ return -EIO;
+ task_pt_regs(task)->ss = value;
+ break;
+ }
+
+ return 0;
+}
+
+static unsigned long task_seg_base(struct task_struct *task,
+ unsigned short selector)
+{
+ unsigned short idx = selector >> 3;
+ unsigned long base;
+
+ if (likely((selector & SEGMENT_TI_MASK) == 0)) {
+ if (unlikely(idx >= GDT_ENTRIES))
+ return 0;
+
+ /*
+ * There are no user segments in the GDT with nonzero bases
+ * other than the TLS segments.
+ */
+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+ return 0;
+
+ idx -= GDT_ENTRY_TLS_MIN;
+ base = get_desc_base(&task->thread.tls_array[idx]);
+ } else {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ struct ldt_struct *ldt;
+
+ /*
+ * If performance here mattered, we could protect the LDT
+ * with RCU. This is a slow path, though, so we can just
+ * take the mutex.
+ */
+ mutex_lock(&task->mm->context.lock);
+ ldt = task->mm->context.ldt;
+ if (unlikely(!ldt || idx >= ldt->nr_entries))
+ base = 0;
+ else
+ base = get_desc_base(ldt->entries + idx);
+ mutex_unlock(&task->mm->context.lock);
+#else
+ base = 0;
+#endif
+ }
+
+ return base;
+}
+
+#endif /* CONFIG_X86_32 */
+
+static unsigned long get_flags(struct task_struct *task)
+{
+ unsigned long retval = task_pt_regs(task)->flags;
+
+ /*
+ * If the debugger set TF, hide it from the readout.
+ */
+ if (test_tsk_thread_flag(task, TIF_FORCED_TF))
+ retval &= ~X86_EFLAGS_TF;
+
+ return retval;
+}
+
+static int set_flags(struct task_struct *task, unsigned long value)
+{
+ struct pt_regs *regs = task_pt_regs(task);
+
+ /*
+ * If the user value contains TF, mark that
+ * it was not "us" (the debugger) that set it.
+ * If not, make sure it stays set if we had.
+ */
+ if (value & X86_EFLAGS_TF)
+ clear_tsk_thread_flag(task, TIF_FORCED_TF);
+ else if (test_tsk_thread_flag(task, TIF_FORCED_TF))
+ value |= X86_EFLAGS_TF;
+
+ regs->flags = (regs->flags & ~FLAG_MASK) | (value & FLAG_MASK);
+
+ return 0;
+}
+
+static int putreg(struct task_struct *child,
+ unsigned long offset, unsigned long value)
+{
+ switch (offset) {
+ case offsetof(struct user_regs_struct, cs):
+ case offsetof(struct user_regs_struct, ds):
+ case offsetof(struct user_regs_struct, es):
+ case offsetof(struct user_regs_struct, fs):
+ case offsetof(struct user_regs_struct, gs):
+ case offsetof(struct user_regs_struct, ss):
+ return set_segment_reg(child, offset, value);
+
+ case offsetof(struct user_regs_struct, flags):
+ return set_flags(child, value);
+
+#ifdef CONFIG_X86_64
+ case offsetof(struct user_regs_struct,fs_base):
+ if (value >= TASK_SIZE_MAX)
+ return -EIO;
+ /*
+ * When changing the segment base, use do_arch_prctl_64
+ * to set either thread.fs or thread.fsindex and the
+ * corresponding GDT slot.
+ */
+ if (child->thread.fsbase != value)
+ return do_arch_prctl_64(child, ARCH_SET_FS, value);
+ return 0;
+ case offsetof(struct user_regs_struct,gs_base):
+ /*
+ * Exactly the same here as the %fs handling above.
+ */
+ if (value >= TASK_SIZE_MAX)
+ return -EIO;
+ if (child->thread.gsbase != value)
+ return do_arch_prctl_64(child, ARCH_SET_GS, value);
+ return 0;
+#endif
+ }
+
+ *pt_regs_access(task_pt_regs(child), offset) = value;
+ return 0;
+}
+
+static unsigned long getreg(struct task_struct *task, unsigned long offset)
+{
+ switch (offset) {
+ case offsetof(struct user_regs_struct, cs):
+ case offsetof(struct user_regs_struct, ds):
+ case offsetof(struct user_regs_struct, es):
+ case offsetof(struct user_regs_struct, fs):
+ case offsetof(struct user_regs_struct, gs):
+ case offsetof(struct user_regs_struct, ss):
+ return get_segment_reg(task, offset);
+
+ case offsetof(struct user_regs_struct, flags):
+ return get_flags(task);
+
+#ifdef CONFIG_X86_64
+ case offsetof(struct user_regs_struct, fs_base): {
+ if (task->thread.fsindex == 0)
+ return task->thread.fsbase;
+ else
+ return task_seg_base(task, task->thread.fsindex);
+ }
+ case offsetof(struct user_regs_struct, gs_base): {
+ if (task->thread.gsindex == 0)
+ return task->thread.gsbase;
+ else
+ return task_seg_base(task, task->thread.gsindex);
+ }
+#endif
+ }
+
+ return *pt_regs_access(task_pt_regs(task), offset);
+}
+
+static int genregs_get(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ void *kbuf, void __user *ubuf)
+{
+ if (kbuf) {
+ unsigned long *k = kbuf;
+ while (count >= sizeof(*k)) {
+ *k++ = getreg(target, pos);
+ count -= sizeof(*k);
+ pos += sizeof(*k);
+ }
+ } else {
+ unsigned long __user *u = ubuf;
+ while (count >= sizeof(*u)) {
+ if (__put_user(getreg(target, pos), u++))
+ return -EFAULT;
+ count -= sizeof(*u);
+ pos += sizeof(*u);
+ }
+ }
+
+ return 0;
+}
+
+static int genregs_set(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ int ret = 0;
+ if (kbuf) {
+ const unsigned long *k = kbuf;
+ while (count >= sizeof(*k) && !ret) {
+ ret = putreg(target, pos, *k++);
+ count -= sizeof(*k);
+ pos += sizeof(*k);
+ }
+ } else {
+ const unsigned long __user *u = ubuf;
+ while (count >= sizeof(*u) && !ret) {
+ unsigned long word;
+ ret = __get_user(word, u++);
+ if (ret)
+ break;
+ ret = putreg(target, pos, word);
+ count -= sizeof(*u);
+ pos += sizeof(*u);
+ }
+ }
+ return ret;
+}
+
+static void ptrace_triggered(struct perf_event *bp,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ int i;
+ struct thread_struct *thread = &(current->thread);
+
+ /*
+ * Store in the virtual DR6 register the fact that the breakpoint
+ * was hit so the thread's debugger will see it.
+ */
+ for (i = 0; i < HBP_NUM; i++) {
+ if (thread->ptrace_bps[i] == bp)
+ break;
+ }
+
+ thread->debugreg6 |= (DR_TRAP0 << i);
+}
+
+/*
+ * Walk through every ptrace breakpoints for this thread and
+ * build the dr7 value on top of their attributes.
+ *
+ */
+static unsigned long ptrace_get_dr7(struct perf_event *bp[])
+{
+ int i;
+ int dr7 = 0;
+ struct arch_hw_breakpoint *info;
+
+ for (i = 0; i < HBP_NUM; i++) {
+ if (bp[i] && !bp[i]->attr.disabled) {
+ info = counter_arch_bp(bp[i]);
+ dr7 |= encode_dr7(i, info->len, info->type);
+ }
+ }
+
+ return dr7;
+}
+
+static int ptrace_fill_bp_fields(struct perf_event_attr *attr,
+ int len, int type, bool disabled)
+{
+ int err, bp_len, bp_type;
+
+ err = arch_bp_generic_fields(len, type, &bp_len, &bp_type);
+ if (!err) {
+ attr->bp_len = bp_len;
+ attr->bp_type = bp_type;
+ attr->disabled = disabled;
+ }
+
+ return err;
+}
+
+static struct perf_event *
+ptrace_register_breakpoint(struct task_struct *tsk, int len, int type,
+ unsigned long addr, bool disabled)
+{
+ struct perf_event_attr attr;
+ int err;
+
+ ptrace_breakpoint_init(&attr);
+ attr.bp_addr = addr;
+
+ err = ptrace_fill_bp_fields(&attr, len, type, disabled);
+ if (err)
+ return ERR_PTR(err);
+
+ return register_user_hw_breakpoint(&attr, ptrace_triggered,
+ NULL, tsk);
+}
+
+static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
+ int disabled)
+{
+ struct perf_event_attr attr = bp->attr;
+ int err;
+
+ err = ptrace_fill_bp_fields(&attr, len, type, disabled);
+ if (err)
+ return err;
+
+ return modify_user_hw_breakpoint(bp, &attr);
+}
+
+/*
+ * Handle ptrace writes to debug register 7.
+ */
+static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
+{
+ struct thread_struct *thread = &tsk->thread;
+ unsigned long old_dr7;
+ bool second_pass = false;
+ int i, rc, ret = 0;
+
+ data &= ~DR_CONTROL_RESERVED;
+ old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
+
+restore:
+ rc = 0;
+ for (i = 0; i < HBP_NUM; i++) {
+ unsigned len, type;
+ bool disabled = !decode_dr7(data, i, &len, &type);
+ struct perf_event *bp = thread->ptrace_bps[i];
+
+ if (!bp) {
+ if (disabled)
+ continue;
+
+ bp = ptrace_register_breakpoint(tsk,
+ len, type, 0, disabled);
+ if (IS_ERR(bp)) {
+ rc = PTR_ERR(bp);
+ break;
+ }
+
+ thread->ptrace_bps[i] = bp;
+ continue;
+ }
+
+ rc = ptrace_modify_breakpoint(bp, len, type, disabled);
+ if (rc)
+ break;
+ }
+
+ /* Restore if the first pass failed, second_pass shouldn't fail. */
+ if (rc && !WARN_ON(second_pass)) {
+ ret = rc;
+ data = old_dr7;
+ second_pass = true;
+ goto restore;
+ }
+
+ return ret;
+}
+
+/*
+ * Handle PTRACE_PEEKUSR calls for the debug register area.
+ */
+static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
+{
+ struct thread_struct *thread = &tsk->thread;
+ unsigned long val = 0;
+
+ if (n < HBP_NUM) {
+ int index = array_index_nospec(n, HBP_NUM);
+ struct perf_event *bp = thread->ptrace_bps[index];
+
+ if (bp)
+ val = bp->hw.info.address;
+ } else if (n == 6) {
+ val = thread->debugreg6;
+ } else if (n == 7) {
+ val = thread->ptrace_dr7;
+ }
+ return val;
+}
+
+static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
+ unsigned long addr)
+{
+ struct thread_struct *t = &tsk->thread;
+ struct perf_event *bp = t->ptrace_bps[nr];
+ int err = 0;
+
+ if (!bp) {
+ /*
+ * Put stub len and type to create an inactive but correct bp.
+ *
+ * CHECKME: the previous code returned -EIO if the addr wasn't
+ * a valid task virtual addr. The new one will return -EINVAL in
+ * this case.
+ * -EINVAL may be what we want for in-kernel breakpoints users,
+ * but -EIO looks better for ptrace, since we refuse a register
+ * writing for the user. And anyway this is the previous
+ * behaviour.
+ */
+ bp = ptrace_register_breakpoint(tsk,
+ X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE,
+ addr, true);
+ if (IS_ERR(bp))
+ err = PTR_ERR(bp);
+ else
+ t->ptrace_bps[nr] = bp;
+ } else {
+ struct perf_event_attr attr = bp->attr;
+
+ attr.bp_addr = addr;
+ err = modify_user_hw_breakpoint(bp, &attr);
+ }
+
+ return err;
+}
+
+/*
+ * Handle PTRACE_POKEUSR calls for the debug register area.
+ */
+static int ptrace_set_debugreg(struct task_struct *tsk, int n,
+ unsigned long val)
+{
+ struct thread_struct *thread = &tsk->thread;
+ /* There are no DR4 or DR5 registers */
+ int rc = -EIO;
+
+ if (n < HBP_NUM) {
+ rc = ptrace_set_breakpoint_addr(tsk, n, val);
+ } else if (n == 6) {
+ thread->debugreg6 = val;
+ rc = 0;
+ } else if (n == 7) {
+ rc = ptrace_write_dr7(tsk, val);
+ if (!rc)
+ thread->ptrace_dr7 = val;
+ }
+ return rc;
+}
+
+/*
+ * These access the current or another (stopped) task's io permission
+ * bitmap for debugging or core dump.
+ */
+static int ioperm_active(struct task_struct *target,
+ const struct user_regset *regset)
+{
+ return target->thread.io_bitmap_max / regset->size;
+}
+
+static int ioperm_get(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ void *kbuf, void __user *ubuf)
+{
+ if (!target->thread.io_bitmap_ptr)
+ return -ENXIO;
+
+ return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+ target->thread.io_bitmap_ptr,
+ 0, IO_BITMAP_BYTES);
+}
+
+/*
+ * Called by kernel/ptrace.c when detaching..
+ *
+ * Make sure the single step bit is not set.
+ */
+void ptrace_disable(struct task_struct *child)
+{
+ user_disable_single_step(child);
+#ifdef TIF_SYSCALL_EMU
+ clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+#endif
+}
+
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+static const struct user_regset_view user_x86_32_view; /* Initialized below. */
+#endif
+
+long arch_ptrace(struct task_struct *child, long request,
+ unsigned long addr, unsigned long data)
+{
+ int ret;
+ unsigned long __user *datap = (unsigned long __user *)data;
+
+ switch (request) {
+ /* read the word at location addr in the USER area. */
+ case PTRACE_PEEKUSR: {
+ unsigned long tmp;
+
+ ret = -EIO;
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user))
+ break;
+
+ tmp = 0; /* Default return condition */
+ if (addr < sizeof(struct user_regs_struct))
+ tmp = getreg(child, addr);
+ else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+ addr <= offsetof(struct user, u_debugreg[7])) {
+ addr -= offsetof(struct user, u_debugreg[0]);
+ tmp = ptrace_get_debugreg(child, addr / sizeof(data));
+ }
+ ret = put_user(tmp, datap);
+ break;
+ }
+
+ case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
+ ret = -EIO;
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user))
+ break;
+
+ if (addr < sizeof(struct user_regs_struct))
+ ret = putreg(child, addr, data);
+ else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+ addr <= offsetof(struct user, u_debugreg[7])) {
+ addr -= offsetof(struct user, u_debugreg[0]);
+ ret = ptrace_set_debugreg(child,
+ addr / sizeof(data), data);
+ }
+ break;
+
+ case PTRACE_GETREGS: /* Get all gp regs from the child. */
+ return copy_regset_to_user(child,
+ task_user_regset_view(current),
+ REGSET_GENERAL,
+ 0, sizeof(struct user_regs_struct),
+ datap);
+
+ case PTRACE_SETREGS: /* Set all gp regs in the child. */
+ return copy_regset_from_user(child,
+ task_user_regset_view(current),
+ REGSET_GENERAL,
+ 0, sizeof(struct user_regs_struct),
+ datap);
+
+ case PTRACE_GETFPREGS: /* Get the child FPU state. */
+ return copy_regset_to_user(child,
+ task_user_regset_view(current),
+ REGSET_FP,
+ 0, sizeof(struct user_i387_struct),
+ datap);
+
+ case PTRACE_SETFPREGS: /* Set the child FPU state. */
+ return copy_regset_from_user(child,
+ task_user_regset_view(current),
+ REGSET_FP,
+ 0, sizeof(struct user_i387_struct),
+ datap);
+
+#ifdef CONFIG_X86_32
+ case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
+ return copy_regset_to_user(child, &user_x86_32_view,
+ REGSET_XFP,
+ 0, sizeof(struct user_fxsr_struct),
+ datap) ? -EIO : 0;
+
+ case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
+ return copy_regset_from_user(child, &user_x86_32_view,
+ REGSET_XFP,
+ 0, sizeof(struct user_fxsr_struct),
+ datap) ? -EIO : 0;
+#endif
+
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+ case PTRACE_GET_THREAD_AREA:
+ if ((int) addr < 0)
+ return -EIO;
+ ret = do_get_thread_area(child, addr,
+ (struct user_desc __user *)data);
+ break;
+
+ case PTRACE_SET_THREAD_AREA:
+ if ((int) addr < 0)
+ return -EIO;
+ ret = do_set_thread_area(child, addr,
+ (struct user_desc __user *)data, 0);
+ break;
+#endif
+
+#ifdef CONFIG_X86_64
+ /* normal 64bit interface to access TLS data.
+ Works just like arch_prctl, except that the arguments
+ are reversed. */
+ case PTRACE_ARCH_PRCTL:
+ ret = do_arch_prctl_64(child, data, addr);
+ break;
+#endif
+
+ default:
+ ret = ptrace_request(child, request, addr, data);
+ break;
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_IA32_EMULATION
+
+#include <linux/compat.h>
+#include <linux/syscalls.h>
+#include <asm/ia32.h>
+#include <asm/user32.h>
+
+#define R32(l,q) \
+ case offsetof(struct user32, regs.l): \
+ regs->q = value; break
+
+#define SEG32(rs) \
+ case offsetof(struct user32, regs.rs): \
+ return set_segment_reg(child, \
+ offsetof(struct user_regs_struct, rs), \
+ value); \
+ break
+
+static int putreg32(struct task_struct *child, unsigned regno, u32 value)
+{
+ struct pt_regs *regs = task_pt_regs(child);
+
+ switch (regno) {
+
+ SEG32(cs);
+ SEG32(ds);
+ SEG32(es);
+ SEG32(fs);
+ SEG32(gs);
+ SEG32(ss);
+
+ R32(ebx, bx);
+ R32(ecx, cx);
+ R32(edx, dx);
+ R32(edi, di);
+ R32(esi, si);
+ R32(ebp, bp);
+ R32(eax, ax);
+ R32(eip, ip);
+ R32(esp, sp);
+
+ case offsetof(struct user32, regs.orig_eax):
+ /*
+ * Warning: bizarre corner case fixup here. A 32-bit
+ * debugger setting orig_eax to -1 wants to disable
+ * syscall restart. Make sure that the syscall
+ * restart code sign-extends orig_ax. Also make sure
+ * we interpret the -ERESTART* codes correctly if
+ * loaded into regs->ax in case the task is not
+ * actually still sitting at the exit from a 32-bit
+ * syscall with TS_COMPAT still set.
+ */
+ regs->orig_ax = value;
+ if (syscall_get_nr(child, regs) >= 0)
+ child->thread_info.status |= TS_I386_REGS_POKED;
+ break;
+
+ case offsetof(struct user32, regs.eflags):
+ return set_flags(child, value);
+
+ case offsetof(struct user32, u_debugreg[0]) ...
+ offsetof(struct user32, u_debugreg[7]):
+ regno -= offsetof(struct user32, u_debugreg[0]);
+ return ptrace_set_debugreg(child, regno / 4, value);
+
+ default:
+ if (regno > sizeof(struct user32) || (regno & 3))
+ return -EIO;
+
+ /*
+ * Other dummy fields in the virtual user structure
+ * are ignored
+ */
+ break;
+ }
+ return 0;
+}
+
+#undef R32
+#undef SEG32
+
+#define R32(l,q) \
+ case offsetof(struct user32, regs.l): \
+ *val = regs->q; break
+
+#define SEG32(rs) \
+ case offsetof(struct user32, regs.rs): \
+ *val = get_segment_reg(child, \
+ offsetof(struct user_regs_struct, rs)); \
+ break
+
+static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
+{
+ struct pt_regs *regs = task_pt_regs(child);
+
+ switch (regno) {
+
+ SEG32(ds);
+ SEG32(es);
+ SEG32(fs);
+ SEG32(gs);
+
+ R32(cs, cs);
+ R32(ss, ss);
+ R32(ebx, bx);
+ R32(ecx, cx);
+ R32(edx, dx);
+ R32(edi, di);
+ R32(esi, si);
+ R32(ebp, bp);
+ R32(eax, ax);
+ R32(orig_eax, orig_ax);
+ R32(eip, ip);
+ R32(esp, sp);
+
+ case offsetof(struct user32, regs.eflags):
+ *val = get_flags(child);
+ break;
+
+ case offsetof(struct user32, u_debugreg[0]) ...
+ offsetof(struct user32, u_debugreg[7]):
+ regno -= offsetof(struct user32, u_debugreg[0]);
+ *val = ptrace_get_debugreg(child, regno / 4);
+ break;
+
+ default:
+ if (regno > sizeof(struct user32) || (regno & 3))
+ return -EIO;
+
+ /*
+ * Other dummy fields in the virtual user structure
+ * are ignored
+ */
+ *val = 0;
+ break;
+ }
+ return 0;
+}
+
+#undef R32
+#undef SEG32
+
+static int genregs32_get(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ void *kbuf, void __user *ubuf)
+{
+ if (kbuf) {
+ compat_ulong_t *k = kbuf;
+ while (count >= sizeof(*k)) {
+ getreg32(target, pos, k++);
+ count -= sizeof(*k);
+ pos += sizeof(*k);
+ }
+ } else {
+ compat_ulong_t __user *u = ubuf;
+ while (count >= sizeof(*u)) {
+ compat_ulong_t word;
+ getreg32(target, pos, &word);
+ if (__put_user(word, u++))
+ return -EFAULT;
+ count -= sizeof(*u);
+ pos += sizeof(*u);
+ }
+ }
+
+ return 0;
+}
+
+static int genregs32_set(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ int ret = 0;
+ if (kbuf) {
+ const compat_ulong_t *k = kbuf;
+ while (count >= sizeof(*k) && !ret) {
+ ret = putreg32(target, pos, *k++);
+ count -= sizeof(*k);
+ pos += sizeof(*k);
+ }
+ } else {
+ const compat_ulong_t __user *u = ubuf;
+ while (count >= sizeof(*u) && !ret) {
+ compat_ulong_t word;
+ ret = __get_user(word, u++);
+ if (ret)
+ break;
+ ret = putreg32(target, pos, word);
+ count -= sizeof(*u);
+ pos += sizeof(*u);
+ }
+ }
+ return ret;
+}
+
+static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request,
+ compat_ulong_t caddr, compat_ulong_t cdata)
+{
+ unsigned long addr = caddr;
+ unsigned long data = cdata;
+ void __user *datap = compat_ptr(data);
+ int ret;
+ __u32 val;
+
+ switch (request) {
+ case PTRACE_PEEKUSR:
+ ret = getreg32(child, addr, &val);
+ if (ret == 0)
+ ret = put_user(val, (__u32 __user *)datap);
+ break;
+
+ case PTRACE_POKEUSR:
+ ret = putreg32(child, addr, data);
+ break;
+
+ case PTRACE_GETREGS: /* Get all gp regs from the child. */
+ return copy_regset_to_user(child, &user_x86_32_view,
+ REGSET_GENERAL,
+ 0, sizeof(struct user_regs_struct32),
+ datap);
+
+ case PTRACE_SETREGS: /* Set all gp regs in the child. */
+ return copy_regset_from_user(child, &user_x86_32_view,
+ REGSET_GENERAL, 0,
+ sizeof(struct user_regs_struct32),
+ datap);
+
+ case PTRACE_GETFPREGS: /* Get the child FPU state. */
+ return copy_regset_to_user(child, &user_x86_32_view,
+ REGSET_FP, 0,
+ sizeof(struct user_i387_ia32_struct),
+ datap);
+
+ case PTRACE_SETFPREGS: /* Set the child FPU state. */
+ return copy_regset_from_user(
+ child, &user_x86_32_view, REGSET_FP,
+ 0, sizeof(struct user_i387_ia32_struct), datap);
+
+ case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
+ return copy_regset_to_user(child, &user_x86_32_view,
+ REGSET_XFP, 0,
+ sizeof(struct user32_fxsr_struct),
+ datap);
+
+ case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
+ return copy_regset_from_user(child, &user_x86_32_view,
+ REGSET_XFP, 0,
+ sizeof(struct user32_fxsr_struct),
+ datap);
+
+ case PTRACE_GET_THREAD_AREA:
+ case PTRACE_SET_THREAD_AREA:
+ return arch_ptrace(child, request, addr, data);
+
+ default:
+ return compat_ptrace_request(child, request, addr, data);
+ }
+
+ return ret;
+}
+#endif /* CONFIG_IA32_EMULATION */
+
+#ifdef CONFIG_X86_X32_ABI
+static long x32_arch_ptrace(struct task_struct *child,
+ compat_long_t request, compat_ulong_t caddr,
+ compat_ulong_t cdata)
+{
+ unsigned long addr = caddr;
+ unsigned long data = cdata;
+ void __user *datap = compat_ptr(data);
+ int ret;
+
+ switch (request) {
+ /* Read 32bits at location addr in the USER area. Only allow
+ to return the lower 32bits of segment and debug registers. */
+ case PTRACE_PEEKUSR: {
+ u32 tmp;
+
+ ret = -EIO;
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) ||
+ addr < offsetof(struct user_regs_struct, cs))
+ break;
+
+ tmp = 0; /* Default return condition */
+ if (addr < sizeof(struct user_regs_struct))
+ tmp = getreg(child, addr);
+ else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+ addr <= offsetof(struct user, u_debugreg[7])) {
+ addr -= offsetof(struct user, u_debugreg[0]);
+ tmp = ptrace_get_debugreg(child, addr / sizeof(data));
+ }
+ ret = put_user(tmp, (__u32 __user *)datap);
+ break;
+ }
+
+ /* Write the word at location addr in the USER area. Only allow
+ to update segment and debug registers with the upper 32bits
+ zero-extended. */
+ case PTRACE_POKEUSR:
+ ret = -EIO;
+ if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) ||
+ addr < offsetof(struct user_regs_struct, cs))
+ break;
+
+ if (addr < sizeof(struct user_regs_struct))
+ ret = putreg(child, addr, data);
+ else if (addr >= offsetof(struct user, u_debugreg[0]) &&
+ addr <= offsetof(struct user, u_debugreg[7])) {
+ addr -= offsetof(struct user, u_debugreg[0]);
+ ret = ptrace_set_debugreg(child,
+ addr / sizeof(data), data);
+ }
+ break;
+
+ case PTRACE_GETREGS: /* Get all gp regs from the child. */
+ return copy_regset_to_user(child,
+ task_user_regset_view(current),
+ REGSET_GENERAL,
+ 0, sizeof(struct user_regs_struct),
+ datap);
+
+ case PTRACE_SETREGS: /* Set all gp regs in the child. */
+ return copy_regset_from_user(child,
+ task_user_regset_view(current),
+ REGSET_GENERAL,
+ 0, sizeof(struct user_regs_struct),
+ datap);
+
+ case PTRACE_GETFPREGS: /* Get the child FPU state. */
+ return copy_regset_to_user(child,
+ task_user_regset_view(current),
+ REGSET_FP,
+ 0, sizeof(struct user_i387_struct),
+ datap);
+
+ case PTRACE_SETFPREGS: /* Set the child FPU state. */
+ return copy_regset_from_user(child,
+ task_user_regset_view(current),
+ REGSET_FP,
+ 0, sizeof(struct user_i387_struct),
+ datap);
+
+ default:
+ return compat_ptrace_request(child, request, addr, data);
+ }
+
+ return ret;
+}
+#endif
+
+#ifdef CONFIG_COMPAT
+long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
+ compat_ulong_t caddr, compat_ulong_t cdata)
+{
+#ifdef CONFIG_X86_X32_ABI
+ if (!in_ia32_syscall())
+ return x32_arch_ptrace(child, request, caddr, cdata);
+#endif
+#ifdef CONFIG_IA32_EMULATION
+ return ia32_arch_ptrace(child, request, caddr, cdata);
+#else
+ return 0;
+#endif
+}
+#endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_X86_64
+
+static struct user_regset x86_64_regsets[] __ro_after_init = {
+ [REGSET_GENERAL] = {
+ .core_note_type = NT_PRSTATUS,
+ .n = sizeof(struct user_regs_struct) / sizeof(long),
+ .size = sizeof(long), .align = sizeof(long),
+ .get = genregs_get, .set = genregs_set
+ },
+ [REGSET_FP] = {
+ .core_note_type = NT_PRFPREG,
+ .n = sizeof(struct user_i387_struct) / sizeof(long),
+ .size = sizeof(long), .align = sizeof(long),
+ .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set
+ },
+ [REGSET_XSTATE] = {
+ .core_note_type = NT_X86_XSTATE,
+ .size = sizeof(u64), .align = sizeof(u64),
+ .active = xstateregs_active, .get = xstateregs_get,
+ .set = xstateregs_set
+ },
+ [REGSET_IOPERM64] = {
+ .core_note_type = NT_386_IOPERM,
+ .n = IO_BITMAP_LONGS,
+ .size = sizeof(long), .align = sizeof(long),
+ .active = ioperm_active, .get = ioperm_get
+ },
+};
+
+static const struct user_regset_view user_x86_64_view = {
+ .name = "x86_64", .e_machine = EM_X86_64,
+ .regsets = x86_64_regsets, .n = ARRAY_SIZE(x86_64_regsets)
+};
+
+#else /* CONFIG_X86_32 */
+
+#define user_regs_struct32 user_regs_struct
+#define genregs32_get genregs_get
+#define genregs32_set genregs_set
+
+#endif /* CONFIG_X86_64 */
+
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+static struct user_regset x86_32_regsets[] __ro_after_init = {
+ [REGSET_GENERAL] = {
+ .core_note_type = NT_PRSTATUS,
+ .n = sizeof(struct user_regs_struct32) / sizeof(u32),
+ .size = sizeof(u32), .align = sizeof(u32),
+ .get = genregs32_get, .set = genregs32_set
+ },
+ [REGSET_FP] = {
+ .core_note_type = NT_PRFPREG,
+ .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32),
+ .size = sizeof(u32), .align = sizeof(u32),
+ .active = regset_fpregs_active, .get = fpregs_get, .set = fpregs_set
+ },
+ [REGSET_XFP] = {
+ .core_note_type = NT_PRXFPREG,
+ .n = sizeof(struct user32_fxsr_struct) / sizeof(u32),
+ .size = sizeof(u32), .align = sizeof(u32),
+ .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set
+ },
+ [REGSET_XSTATE] = {
+ .core_note_type = NT_X86_XSTATE,
+ .size = sizeof(u64), .align = sizeof(u64),
+ .active = xstateregs_active, .get = xstateregs_get,
+ .set = xstateregs_set
+ },
+ [REGSET_TLS] = {
+ .core_note_type = NT_386_TLS,
+ .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN,
+ .size = sizeof(struct user_desc),
+ .align = sizeof(struct user_desc),
+ .active = regset_tls_active,
+ .get = regset_tls_get, .set = regset_tls_set
+ },
+ [REGSET_IOPERM32] = {
+ .core_note_type = NT_386_IOPERM,
+ .n = IO_BITMAP_BYTES / sizeof(u32),
+ .size = sizeof(u32), .align = sizeof(u32),
+ .active = ioperm_active, .get = ioperm_get
+ },
+};
+
+static const struct user_regset_view user_x86_32_view = {
+ .name = "i386", .e_machine = EM_386,
+ .regsets = x86_32_regsets, .n = ARRAY_SIZE(x86_32_regsets)
+};
+#endif
+
+/*
+ * This represents bytes 464..511 in the memory layout exported through
+ * the REGSET_XSTATE interface.
+ */
+u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
+
+void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask)
+{
+#ifdef CONFIG_X86_64
+ x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64);
+#endif
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+ x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64);
+#endif
+ xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask;
+}
+
+const struct user_regset_view *task_user_regset_view(struct task_struct *task)
+{
+#ifdef CONFIG_IA32_EMULATION
+ if (!user_64bit_mode(task_pt_regs(task)))
+#endif
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+ return &user_x86_32_view;
+#endif
+#ifdef CONFIG_X86_64
+ return &user_x86_64_view;
+#endif
+}
+
+static void fill_sigtrap_info(struct task_struct *tsk,
+ struct pt_regs *regs,
+ int error_code, int si_code,
+ struct siginfo *info)
+{
+ tsk->thread.trap_nr = X86_TRAP_DB;
+ tsk->thread.error_code = error_code;
+
+ info->si_signo = SIGTRAP;
+ info->si_code = si_code;
+ info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
+}
+
+void user_single_step_siginfo(struct task_struct *tsk,
+ struct pt_regs *regs,
+ struct siginfo *info)
+{
+ fill_sigtrap_info(tsk, regs, 0, TRAP_BRKPT, info);
+}
+
+void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
+ int error_code, int si_code)
+{
+ struct siginfo info;
+
+ clear_siginfo(&info);
+ fill_sigtrap_info(tsk, regs, error_code, si_code, &info);
+ /* Send us the fake SIGTRAP */
+ force_sig_info(SIGTRAP, &info, tsk);
+}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
new file mode 100644
index 000000000..637982efe
--- /dev/null
+++ b/arch/x86/kernel/pvclock.c
@@ -0,0 +1,167 @@
+/* paravirtual clock -- common code used by kvm/xen
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/bootmem.h>
+#include <linux/nmi.h>
+
+#include <asm/fixmap.h>
+#include <asm/pvclock.h>
+#include <asm/vgtod.h>
+
+static u8 valid_flags __read_mostly = 0;
+static struct pvclock_vsyscall_time_info *pvti_cpu0_va __read_mostly;
+
+void pvclock_set_flags(u8 flags)
+{
+ valid_flags = flags;
+}
+
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
+{
+ u64 pv_tsc_khz = 1000000ULL << 32;
+
+ do_div(pv_tsc_khz, src->tsc_to_system_mul);
+ if (src->tsc_shift < 0)
+ pv_tsc_khz <<= -src->tsc_shift;
+ else
+ pv_tsc_khz >>= src->tsc_shift;
+ return pv_tsc_khz;
+}
+
+void pvclock_touch_watchdogs(void)
+{
+ touch_softlockup_watchdog_sync();
+ clocksource_touch_watchdog();
+ rcu_cpu_stall_reset();
+ reset_hung_task_detector();
+}
+
+static atomic64_t last_value = ATOMIC64_INIT(0);
+
+void pvclock_resume(void)
+{
+ atomic64_set(&last_value, 0);
+}
+
+u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
+{
+ unsigned version;
+ u8 flags;
+
+ do {
+ version = pvclock_read_begin(src);
+ flags = src->flags;
+ } while (pvclock_read_retry(src, version));
+
+ return flags & valid_flags;
+}
+
+u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+{
+ unsigned version;
+ u64 ret;
+ u64 last;
+ u8 flags;
+
+ do {
+ version = pvclock_read_begin(src);
+ ret = __pvclock_read_cycles(src, rdtsc_ordered());
+ flags = src->flags;
+ } while (pvclock_read_retry(src, version));
+
+ if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
+ src->flags &= ~PVCLOCK_GUEST_STOPPED;
+ pvclock_touch_watchdogs();
+ }
+
+ if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
+ (flags & PVCLOCK_TSC_STABLE_BIT))
+ return ret;
+
+ /*
+ * Assumption here is that last_value, a global accumulator, always goes
+ * forward. If we are less than that, we should not be much smaller.
+ * We assume there is an error marging we're inside, and then the correction
+ * does not sacrifice accuracy.
+ *
+ * For reads: global may have changed between test and return,
+ * but this means someone else updated poked the clock at a later time.
+ * We just need to make sure we are not seeing a backwards event.
+ *
+ * For updates: last_value = ret is not enough, since two vcpus could be
+ * updating at the same time, and one of them could be slightly behind,
+ * making the assumption that last_value always go forward fail to hold.
+ */
+ last = atomic64_read(&last_value);
+ do {
+ if (ret < last)
+ return last;
+ last = atomic64_cmpxchg(&last_value, last, ret);
+ } while (unlikely(last != ret));
+
+ return ret;
+}
+
+void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
+ struct pvclock_vcpu_time_info *vcpu_time,
+ struct timespec64 *ts)
+{
+ u32 version;
+ u64 delta;
+ struct timespec64 now;
+
+ /* get wallclock at system boot */
+ do {
+ version = wall_clock->version;
+ rmb(); /* fetch version before time */
+ /*
+ * Note: wall_clock->sec is a u32 value, so it can
+ * only store dates between 1970 and 2106. To allow
+ * times beyond that, we need to create a new hypercall
+ * interface with an extended pvclock_wall_clock structure
+ * like ARM has.
+ */
+ now.tv_sec = wall_clock->sec;
+ now.tv_nsec = wall_clock->nsec;
+ rmb(); /* fetch time before checking version */
+ } while ((wall_clock->version & 1) || (version != wall_clock->version));
+
+ delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */
+ delta += now.tv_sec * NSEC_PER_SEC + now.tv_nsec;
+
+ now.tv_nsec = do_div(delta, NSEC_PER_SEC);
+ now.tv_sec = delta;
+
+ set_normalized_timespec64(ts, now.tv_sec, now.tv_nsec);
+}
+
+void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti)
+{
+ WARN_ON(vclock_was_used(VCLOCK_PVCLOCK));
+ pvti_cpu0_va = pvti;
+}
+
+struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void)
+{
+ return pvti_cpu0_va;
+}
+EXPORT_SYMBOL_GPL(pvclock_get_pvti_cpu0_va);
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
new file mode 100644
index 000000000..736348ead
--- /dev/null
+++ b/arch/x86/kernel/quirks.c
@@ -0,0 +1,676 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains work-arounds for x86 and x86_64 platform bugs.
+ */
+#include <linux/dmi.h>
+#include <linux/pci.h>
+#include <linux/irq.h>
+
+#include <asm/hpet.h>
+
+#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
+
+static void quirk_intel_irqbalance(struct pci_dev *dev)
+{
+ u8 config;
+ u16 word;
+
+ /* BIOS may enable hardware IRQ balancing for
+ * E7520/E7320/E7525(revision ID 0x9 and below)
+ * based platforms.
+ * Disable SW irqbalance/affinity on those platforms.
+ */
+ if (dev->revision > 0x9)
+ return;
+
+ /* enable access to config space*/
+ pci_read_config_byte(dev, 0xf4, &config);
+ pci_write_config_byte(dev, 0xf4, config|0x2);
+
+ /*
+ * read xTPR register. We may not have a pci_dev for device 8
+ * because it might be hidden until the above write.
+ */
+ pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word);
+
+ if (!(word & (1 << 13))) {
+ dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
+ "disabling irq balancing and affinity\n");
+ noirqdebug_setup("");
+#ifdef CONFIG_PROC_FS
+ no_irq_affinity = 1;
+#endif
+ }
+
+ /* put back the original value for config space*/
+ if (!(config & 0x2))
+ pci_write_config_byte(dev, 0xf4, config);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH,
+ quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH,
+ quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH,
+ quirk_intel_irqbalance);
+#endif
+
+#if defined(CONFIG_HPET_TIMER)
+unsigned long force_hpet_address;
+
+static enum {
+ NONE_FORCE_HPET_RESUME,
+ OLD_ICH_FORCE_HPET_RESUME,
+ ICH_FORCE_HPET_RESUME,
+ VT8237_FORCE_HPET_RESUME,
+ NVIDIA_FORCE_HPET_RESUME,
+ ATI_FORCE_HPET_RESUME,
+} force_hpet_resume_type;
+
+static void __iomem *rcba_base;
+
+static void ich_force_hpet_resume(void)
+{
+ u32 val;
+
+ if (!force_hpet_address)
+ return;
+
+ BUG_ON(rcba_base == NULL);
+
+ /* read the Function Disable register, dword mode only */
+ val = readl(rcba_base + 0x3404);
+ if (!(val & 0x80)) {
+ /* HPET disabled in HPTC. Trying to enable */
+ writel(val | 0x80, rcba_base + 0x3404);
+ }
+
+ val = readl(rcba_base + 0x3404);
+ if (!(val & 0x80))
+ BUG();
+ else
+ printk(KERN_DEBUG "Force enabled HPET at resume\n");
+
+ return;
+}
+
+static void ich_force_enable_hpet(struct pci_dev *dev)
+{
+ u32 val;
+ u32 uninitialized_var(rcba);
+ int err = 0;
+
+ if (hpet_address || force_hpet_address)
+ return;
+
+ pci_read_config_dword(dev, 0xF0, &rcba);
+ rcba &= 0xFFFFC000;
+ if (rcba == 0) {
+ dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; "
+ "cannot force enable HPET\n");
+ return;
+ }
+
+ /* use bits 31:14, 16 kB aligned */
+ rcba_base = ioremap_nocache(rcba, 0x4000);
+ if (rcba_base == NULL) {
+ dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; "
+ "cannot force enable HPET\n");
+ return;
+ }
+
+ /* read the Function Disable register, dword mode only */
+ val = readl(rcba_base + 0x3404);
+
+ if (val & 0x80) {
+ /* HPET is enabled in HPTC. Just not reported by BIOS */
+ val = val & 0x3;
+ force_hpet_address = 0xFED00000 | (val << 12);
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+ "0x%lx\n", force_hpet_address);
+ iounmap(rcba_base);
+ return;
+ }
+
+ /* HPET disabled in HPTC. Trying to enable */
+ writel(val | 0x80, rcba_base + 0x3404);
+
+ val = readl(rcba_base + 0x3404);
+ if (!(val & 0x80)) {
+ err = 1;
+ } else {
+ val = val & 0x3;
+ force_hpet_address = 0xFED00000 | (val << 12);
+ }
+
+ if (err) {
+ force_hpet_address = 0;
+ iounmap(rcba_base);
+ dev_printk(KERN_DEBUG, &dev->dev,
+ "Failed to force enable HPET\n");
+ } else {
+ force_hpet_resume_type = ICH_FORCE_HPET_RESUME;
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+ "0x%lx\n", force_hpet_address);
+ }
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_1,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
+ ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x3a16, /* ICH10 */
+ ich_force_enable_hpet);
+
+static struct pci_dev *cached_dev;
+
+static void hpet_print_force_info(void)
+{
+ printk(KERN_INFO "HPET not enabled in BIOS. "
+ "You might try hpet=force boot option\n");
+}
+
+static void old_ich_force_hpet_resume(void)
+{
+ u32 val;
+ u32 uninitialized_var(gen_cntl);
+
+ if (!force_hpet_address || !cached_dev)
+ return;
+
+ pci_read_config_dword(cached_dev, 0xD0, &gen_cntl);
+ gen_cntl &= (~(0x7 << 15));
+ gen_cntl |= (0x4 << 15);
+
+ pci_write_config_dword(cached_dev, 0xD0, gen_cntl);
+ pci_read_config_dword(cached_dev, 0xD0, &gen_cntl);
+ val = gen_cntl >> 15;
+ val &= 0x7;
+ if (val == 0x4)
+ printk(KERN_DEBUG "Force enabled HPET at resume\n");
+ else
+ BUG();
+}
+
+static void old_ich_force_enable_hpet(struct pci_dev *dev)
+{
+ u32 val;
+ u32 uninitialized_var(gen_cntl);
+
+ if (hpet_address || force_hpet_address)
+ return;
+
+ pci_read_config_dword(dev, 0xD0, &gen_cntl);
+ /*
+ * Bit 17 is HPET enable bit.
+ * Bit 16:15 control the HPET base address.
+ */
+ val = gen_cntl >> 15;
+ val &= 0x7;
+ if (val & 0x4) {
+ val &= 0x3;
+ force_hpet_address = 0xFED00000 | (val << 12);
+ dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
+ force_hpet_address);
+ return;
+ }
+
+ /*
+ * HPET is disabled. Trying enabling at FED00000 and check
+ * whether it sticks
+ */
+ gen_cntl &= (~(0x7 << 15));
+ gen_cntl |= (0x4 << 15);
+ pci_write_config_dword(dev, 0xD0, gen_cntl);
+
+ pci_read_config_dword(dev, 0xD0, &gen_cntl);
+
+ val = gen_cntl >> 15;
+ val &= 0x7;
+ if (val & 0x4) {
+ /* HPET is enabled in HPTC. Just not reported by BIOS */
+ val &= 0x3;
+ force_hpet_address = 0xFED00000 | (val << 12);
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+ "0x%lx\n", force_hpet_address);
+ cached_dev = dev;
+ force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME;
+ return;
+ }
+
+ dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
+}
+
+/*
+ * Undocumented chipset features. Make sure that the user enforced
+ * this.
+ */
+static void old_ich_force_enable_hpet_user(struct pci_dev *dev)
+{
+ if (hpet_force_user)
+ old_ich_force_enable_hpet(dev);
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1,
+ old_ich_force_enable_hpet_user);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0,
+ old_ich_force_enable_hpet_user);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12,
+ old_ich_force_enable_hpet_user);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_0,
+ old_ich_force_enable_hpet_user);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_12,
+ old_ich_force_enable_hpet_user);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_0,
+ old_ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_12,
+ old_ich_force_enable_hpet);
+
+
+static void vt8237_force_hpet_resume(void)
+{
+ u32 val;
+
+ if (!force_hpet_address || !cached_dev)
+ return;
+
+ val = 0xfed00000 | 0x80;
+ pci_write_config_dword(cached_dev, 0x68, val);
+
+ pci_read_config_dword(cached_dev, 0x68, &val);
+ if (val & 0x80)
+ printk(KERN_DEBUG "Force enabled HPET at resume\n");
+ else
+ BUG();
+}
+
+static void vt8237_force_enable_hpet(struct pci_dev *dev)
+{
+ u32 uninitialized_var(val);
+
+ if (hpet_address || force_hpet_address)
+ return;
+
+ if (!hpet_force_user) {
+ hpet_print_force_info();
+ return;
+ }
+
+ pci_read_config_dword(dev, 0x68, &val);
+ /*
+ * Bit 7 is HPET enable bit.
+ * Bit 31:10 is HPET base address (contrary to what datasheet claims)
+ */
+ if (val & 0x80) {
+ force_hpet_address = (val & ~0x3ff);
+ dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
+ force_hpet_address);
+ return;
+ }
+
+ /*
+ * HPET is disabled. Trying enabling at FED00000 and check
+ * whether it sticks
+ */
+ val = 0xfed00000 | 0x80;
+ pci_write_config_dword(dev, 0x68, val);
+
+ pci_read_config_dword(dev, 0x68, &val);
+ if (val & 0x80) {
+ force_hpet_address = (val & ~0x3ff);
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+ "0x%lx\n", force_hpet_address);
+ cached_dev = dev;
+ force_hpet_resume_type = VT8237_FORCE_HPET_RESUME;
+ return;
+ }
+
+ dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
+ vt8237_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
+ vt8237_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_CX700,
+ vt8237_force_enable_hpet);
+
+static void ati_force_hpet_resume(void)
+{
+ pci_write_config_dword(cached_dev, 0x14, 0xfed00000);
+ printk(KERN_DEBUG "Force enabled HPET at resume\n");
+}
+
+static u32 ati_ixp4x0_rev(struct pci_dev *dev)
+{
+ int err = 0;
+ u32 d = 0;
+ u8 b = 0;
+
+ err = pci_read_config_byte(dev, 0xac, &b);
+ b &= ~(1<<5);
+ err |= pci_write_config_byte(dev, 0xac, b);
+ err |= pci_read_config_dword(dev, 0x70, &d);
+ d |= 1<<8;
+ err |= pci_write_config_dword(dev, 0x70, d);
+ err |= pci_read_config_dword(dev, 0x8, &d);
+ d &= 0xff;
+ dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d);
+
+ WARN_ON_ONCE(err);
+
+ return d;
+}
+
+static void ati_force_enable_hpet(struct pci_dev *dev)
+{
+ u32 d, val;
+ u8 b;
+
+ if (hpet_address || force_hpet_address)
+ return;
+
+ if (!hpet_force_user) {
+ hpet_print_force_info();
+ return;
+ }
+
+ d = ati_ixp4x0_rev(dev);
+ if (d < 0x82)
+ return;
+
+ /* base address */
+ pci_write_config_dword(dev, 0x14, 0xfed00000);
+ pci_read_config_dword(dev, 0x14, &val);
+
+ /* enable interrupt */
+ outb(0x72, 0xcd6); b = inb(0xcd7);
+ b |= 0x1;
+ outb(0x72, 0xcd6); outb(b, 0xcd7);
+ outb(0x72, 0xcd6); b = inb(0xcd7);
+ if (!(b & 0x1))
+ return;
+ pci_read_config_dword(dev, 0x64, &d);
+ d |= (1<<10);
+ pci_write_config_dword(dev, 0x64, d);
+ pci_read_config_dword(dev, 0x64, &d);
+ if (!(d & (1<<10)))
+ return;
+
+ force_hpet_address = val;
+ force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
+ force_hpet_address);
+ cached_dev = dev;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
+ ati_force_enable_hpet);
+
+/*
+ * Undocumented chipset feature taken from LinuxBIOS.
+ */
+static void nvidia_force_hpet_resume(void)
+{
+ pci_write_config_dword(cached_dev, 0x44, 0xfed00001);
+ printk(KERN_DEBUG "Force enabled HPET at resume\n");
+}
+
+static void nvidia_force_enable_hpet(struct pci_dev *dev)
+{
+ u32 uninitialized_var(val);
+
+ if (hpet_address || force_hpet_address)
+ return;
+
+ if (!hpet_force_user) {
+ hpet_print_force_info();
+ return;
+ }
+
+ pci_write_config_dword(dev, 0x44, 0xfed00001);
+ pci_read_config_dword(dev, 0x44, &val);
+ force_hpet_address = val & 0xfffffffe;
+ force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME;
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
+ force_hpet_address);
+ cached_dev = dev;
+ return;
+}
+
+/* ISA Bridges */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0050,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0051,
+ nvidia_force_enable_hpet);
+
+/* LPC bridges */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0362,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0363,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0364,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0365,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0366,
+ nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0367,
+ nvidia_force_enable_hpet);
+
+void force_hpet_resume(void)
+{
+ switch (force_hpet_resume_type) {
+ case ICH_FORCE_HPET_RESUME:
+ ich_force_hpet_resume();
+ return;
+ case OLD_ICH_FORCE_HPET_RESUME:
+ old_ich_force_hpet_resume();
+ return;
+ case VT8237_FORCE_HPET_RESUME:
+ vt8237_force_hpet_resume();
+ return;
+ case NVIDIA_FORCE_HPET_RESUME:
+ nvidia_force_hpet_resume();
+ return;
+ case ATI_FORCE_HPET_RESUME:
+ ati_force_hpet_resume();
+ return;
+ default:
+ break;
+ }
+}
+
+/*
+ * According to the datasheet e6xx systems have the HPET hardwired to
+ * 0xfed00000
+ */
+static void e6xx_force_enable_hpet(struct pci_dev *dev)
+{
+ if (hpet_address || force_hpet_address)
+ return;
+
+ force_hpet_address = 0xFED00000;
+ force_hpet_resume_type = NONE_FORCE_HPET_RESUME;
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
+ "0x%lx\n", force_hpet_address);
+ return;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E6XX_CU,
+ e6xx_force_enable_hpet);
+
+/*
+ * HPET MSI on some boards (ATI SB700/SB800) has side effect on
+ * floppy DMA. Disable HPET MSI on such platforms.
+ * See erratum #27 (Misinterpreted MSI Requests May Result in
+ * Corrupted LPC DMA Data) in AMD Publication #46837,
+ * "SB700 Family Product Errata", Rev. 1.0, March 2010.
+ */
+static void force_disable_hpet_msi(struct pci_dev *unused)
+{
+ hpet_msi_disable = true;
+}
+
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
+ force_disable_hpet_msi);
+
+#endif
+
+#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
+/* Set correct numa_node information for AMD NB functions */
+static void quirk_amd_nb_node(struct pci_dev *dev)
+{
+ struct pci_dev *nb_ht;
+ unsigned int devfn;
+ u32 node;
+ u32 val;
+
+ devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
+ nb_ht = pci_get_slot(dev->bus, devfn);
+ if (!nb_ht)
+ return;
+
+ pci_read_config_dword(nb_ht, 0x60, &val);
+ node = pcibus_to_node(dev->bus) | (val & 7);
+ /*
+ * Some hardware may return an invalid node ID,
+ * so check it first:
+ */
+ if (node_online(node))
+ set_dev_node(&dev->dev, node);
+ pci_dev_put(nb_ht);
+}
+
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F0,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F1,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F2,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5,
+ quirk_amd_nb_node);
+
+#endif
+
+#ifdef CONFIG_PCI
+/*
+ * Processor does not ensure DRAM scrub read/write sequence
+ * is atomic wrt accesses to CC6 save state area. Therefore
+ * if a concurrent scrub read/write access is to same address
+ * the entry may appear as if it is not written. This quirk
+ * applies to Fam16h models 00h-0Fh
+ *
+ * See "Revision Guide" for AMD F16h models 00h-0fh,
+ * document 51810 rev. 3.04, Nov 2013
+ */
+static void amd_disable_seq_and_redirect_scrub(struct pci_dev *dev)
+{
+ u32 val;
+
+ /*
+ * Suggested workaround:
+ * set D18F3x58[4:0] = 00h and set D18F3x5C[0] = 0b
+ */
+ pci_read_config_dword(dev, 0x58, &val);
+ if (val & 0x1F) {
+ val &= ~(0x1F);
+ pci_write_config_dword(dev, 0x58, val);
+ }
+
+ pci_read_config_dword(dev, 0x5C, &val);
+ if (val & BIT(0)) {
+ val &= ~BIT(0);
+ pci_write_config_dword(dev, 0x5c, val);
+ }
+}
+
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3,
+ amd_disable_seq_and_redirect_scrub);
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+#include <linux/jump_label.h>
+#include <asm/string_64.h>
+
+/* Ivy Bridge, Haswell, Broadwell */
+static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev)
+{
+ u32 capid0;
+
+ pci_read_config_dword(pdev, 0x84, &capid0);
+
+ if (capid0 & 0x10)
+ static_branch_inc(&mcsafe_key);
+}
+
+/* Skylake */
+static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev)
+{
+ u32 capid0, capid5;
+
+ pci_read_config_dword(pdev, 0x84, &capid0);
+ pci_read_config_dword(pdev, 0x98, &capid5);
+
+ /*
+ * CAPID0{7:6} indicate whether this is an advanced RAS SKU
+ * CAPID5{8:5} indicate that various NVDIMM usage modes are
+ * enabled, so memory machine check recovery is also enabled.
+ */
+ if ((capid0 & 0xc0) == 0xc0 || (capid5 & 0x1e0))
+ static_branch_inc(&mcsafe_key);
+
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap);
+#endif
+#endif
+
+bool x86_apple_machine;
+EXPORT_SYMBOL(x86_apple_machine);
+
+void __init early_platform_quirks(void)
+{
+ x86_apple_machine = dmi_match(DMI_SYS_VENDOR, "Apple Inc.") ||
+ dmi_match(DMI_SYS_VENDOR, "Apple Computer, Inc.");
+}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
new file mode 100644
index 000000000..b0f3a996d
--- /dev/null
+++ b/arch/x86/kernel/reboot.c
@@ -0,0 +1,917 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/export.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/pm.h>
+#include <linux/efi.h>
+#include <linux/dmi.h>
+#include <linux/sched.h>
+#include <linux/tboot.h>
+#include <linux/delay.h>
+#include <linux/frame.h>
+#include <acpi/reboot.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/desc.h>
+#include <asm/hpet.h>
+#include <asm/pgtable.h>
+#include <asm/proto.h>
+#include <asm/reboot_fixups.h>
+#include <asm/reboot.h>
+#include <asm/pci_x86.h>
+#include <asm/virtext.h>
+#include <asm/cpu.h>
+#include <asm/nmi.h>
+#include <asm/smp.h>
+
+#include <linux/ctype.h>
+#include <linux/mc146818rtc.h>
+#include <asm/realmode.h>
+#include <asm/x86_init.h>
+#include <asm/efi.h>
+
+/*
+ * Power off function, if any
+ */
+void (*pm_power_off)(void);
+EXPORT_SYMBOL(pm_power_off);
+
+/*
+ * This is set if we need to go through the 'emergency' path.
+ * When machine_emergency_restart() is called, we may be on
+ * an inconsistent state and won't be able to do a clean cleanup
+ */
+static int reboot_emergency;
+
+/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
+bool port_cf9_safe = false;
+
+/*
+ * Reboot options and system auto-detection code provided by
+ * Dell Inc. so their systems "just work". :-)
+ */
+
+/*
+ * Some machines require the "reboot=a" commandline options
+ */
+static int __init set_acpi_reboot(const struct dmi_system_id *d)
+{
+ if (reboot_type != BOOT_ACPI) {
+ reboot_type = BOOT_ACPI;
+ pr_info("%s series board detected. Selecting %s-method for reboots.\n",
+ d->ident, "ACPI");
+ }
+ return 0;
+}
+
+/*
+ * Some machines require the "reboot=b" or "reboot=k" commandline options,
+ * this quirk makes that automatic.
+ */
+static int __init set_bios_reboot(const struct dmi_system_id *d)
+{
+ if (reboot_type != BOOT_BIOS) {
+ reboot_type = BOOT_BIOS;
+ pr_info("%s series board detected. Selecting %s-method for reboots.\n",
+ d->ident, "BIOS");
+ }
+ return 0;
+}
+
+/*
+ * Some machines don't handle the default ACPI reboot method and
+ * require the EFI reboot method:
+ */
+static int __init set_efi_reboot(const struct dmi_system_id *d)
+{
+ if (reboot_type != BOOT_EFI && !efi_runtime_disabled()) {
+ reboot_type = BOOT_EFI;
+ pr_info("%s series board detected. Selecting EFI-method for reboot.\n", d->ident);
+ }
+ return 0;
+}
+
+void __noreturn machine_real_restart(unsigned int type)
+{
+ local_irq_disable();
+
+ /*
+ * Write zero to CMOS register number 0x0f, which the BIOS POST
+ * routine will recognize as telling it to do a proper reboot. (Well
+ * that's what this book in front of me says -- it may only apply to
+ * the Phoenix BIOS though, it's not clear). At the same time,
+ * disable NMIs by setting the top bit in the CMOS address register,
+ * as we're about to do peculiar things to the CPU. I'm not sure if
+ * `outb_p' is needed instead of just `outb'. Use it to be on the
+ * safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
+ */
+ spin_lock(&rtc_lock);
+ CMOS_WRITE(0x00, 0x8f);
+ spin_unlock(&rtc_lock);
+
+ /*
+ * Switch back to the initial page table.
+ */
+#ifdef CONFIG_X86_32
+ load_cr3(initial_page_table);
+#else
+ write_cr3(real_mode_header->trampoline_pgd);
+
+ /* Exiting long mode will fail if CR4.PCIDE is set. */
+ if (static_cpu_has(X86_FEATURE_PCID))
+ cr4_clear_bits(X86_CR4_PCIDE);
+#endif
+
+ /* Jump to the identity-mapped low memory code */
+#ifdef CONFIG_X86_32
+ asm volatile("jmpl *%0" : :
+ "rm" (real_mode_header->machine_real_restart_asm),
+ "a" (type));
+#else
+ asm volatile("ljmpl *%0" : :
+ "m" (real_mode_header->machine_real_restart_asm),
+ "D" (type));
+#endif
+ unreachable();
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(machine_real_restart);
+#endif
+STACK_FRAME_NON_STANDARD(machine_real_restart);
+
+/*
+ * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
+ */
+static int __init set_pci_reboot(const struct dmi_system_id *d)
+{
+ if (reboot_type != BOOT_CF9_FORCE) {
+ reboot_type = BOOT_CF9_FORCE;
+ pr_info("%s series board detected. Selecting %s-method for reboots.\n",
+ d->ident, "PCI");
+ }
+ return 0;
+}
+
+static int __init set_kbd_reboot(const struct dmi_system_id *d)
+{
+ if (reboot_type != BOOT_KBD) {
+ reboot_type = BOOT_KBD;
+ pr_info("%s series board detected. Selecting %s-method for reboot.\n",
+ d->ident, "KBD");
+ }
+ return 0;
+}
+
+/*
+ * This is a single dmi_table handling all reboot quirks.
+ */
+static const struct dmi_system_id reboot_dmi_table[] __initconst = {
+
+ /* Acer */
+ { /* Handle reboot issue on Acer Aspire one */
+ .callback = set_kbd_reboot,
+ .ident = "Acer Aspire One A110",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
+ },
+ },
+ { /* Handle reboot issue on Acer TravelMate X514-51T */
+ .callback = set_efi_reboot,
+ .ident = "Acer TravelMate X514-51T",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate X514-51T"),
+ },
+ },
+
+ /* Apple */
+ { /* Handle problems with rebooting on Apple MacBook5 */
+ .callback = set_pci_reboot,
+ .ident = "Apple MacBook5",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
+ },
+ },
+ { /* Handle problems with rebooting on Apple MacBook6,1 */
+ .callback = set_pci_reboot,
+ .ident = "Apple MacBook6,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MacBook6,1"),
+ },
+ },
+ { /* Handle problems with rebooting on Apple MacBookPro5 */
+ .callback = set_pci_reboot,
+ .ident = "Apple MacBookPro5",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
+ },
+ },
+ { /* Handle problems with rebooting on Apple Macmini3,1 */
+ .callback = set_pci_reboot,
+ .ident = "Apple Macmini3,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
+ },
+ },
+ { /* Handle problems with rebooting on the iMac9,1. */
+ .callback = set_pci_reboot,
+ .ident = "Apple iMac9,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
+ },
+ },
+ { /* Handle problems with rebooting on the iMac10,1. */
+ .callback = set_pci_reboot,
+ .ident = "Apple iMac10,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"),
+ },
+ },
+
+ /* ASRock */
+ { /* Handle problems with rebooting on ASRock Q1900DC-ITX */
+ .callback = set_pci_reboot,
+ .ident = "ASRock Q1900DC-ITX",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASRock"),
+ DMI_MATCH(DMI_BOARD_NAME, "Q1900DC-ITX"),
+ },
+ },
+
+ /* ASUS */
+ { /* Handle problems with rebooting on ASUS P4S800 */
+ .callback = set_bios_reboot,
+ .ident = "ASUS P4S800",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
+ DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
+ },
+ },
+ { /* Handle problems with rebooting on ASUS EeeBook X205TA */
+ .callback = set_acpi_reboot,
+ .ident = "ASUS EeeBook X205TA",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "X205TA"),
+ },
+ },
+ { /* Handle problems with rebooting on ASUS EeeBook X205TAW */
+ .callback = set_acpi_reboot,
+ .ident = "ASUS EeeBook X205TAW",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "X205TAW"),
+ },
+ },
+
+ /* Certec */
+ { /* Handle problems with rebooting on Certec BPC600 */
+ .callback = set_pci_reboot,
+ .ident = "Certec BPC600",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Certec"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "BPC600"),
+ },
+ },
+
+ /* Dell */
+ { /* Handle problems with rebooting on Dell DXP061 */
+ .callback = set_bios_reboot,
+ .ident = "Dell DXP061",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell E520's */
+ .callback = set_bios_reboot,
+ .ident = "Dell E520",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"),
+ },
+ },
+ { /* Handle problems with rebooting on the Latitude E5410. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E5410",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5410"),
+ },
+ },
+ { /* Handle problems with rebooting on the Latitude E5420. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E5420",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
+ },
+ },
+ { /* Handle problems with rebooting on the Latitude E6320. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E6320",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
+ },
+ },
+ { /* Handle problems with rebooting on the Latitude E6420. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E6420",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
+ .callback = set_bios_reboot,
+ .ident = "Dell OptiPlex 330",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"),
+ DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
+ .callback = set_bios_reboot,
+ .ident = "Dell OptiPlex 360",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
+ DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell Optiplex 745's SFF */
+ .callback = set_bios_reboot,
+ .ident = "Dell OptiPlex 745",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell Optiplex 745's DFF */
+ .callback = set_bios_reboot,
+ .ident = "Dell OptiPlex 745",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+ DMI_MATCH(DMI_BOARD_NAME, "0MM599"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */
+ .callback = set_bios_reboot,
+ .ident = "Dell OptiPlex 745",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+ DMI_MATCH(DMI_BOARD_NAME, "0KW626"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */
+ .callback = set_bios_reboot,
+ .ident = "Dell OptiPlex 760",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"),
+ DMI_MATCH(DMI_BOARD_NAME, "0G919G"),
+ },
+ },
+ { /* Handle problems with rebooting on the OptiPlex 990. */
+ .callback = set_pci_reboot,
+ .ident = "Dell OptiPlex 990 BIOS A0x",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
+ DMI_MATCH(DMI_BIOS_VERSION, "A0"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell 300's */
+ .callback = set_bios_reboot,
+ .ident = "Dell PowerEdge 300",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell 1300's */
+ .callback = set_bios_reboot,
+ .ident = "Dell PowerEdge 1300",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell 2400's */
+ .callback = set_bios_reboot,
+ .ident = "Dell PowerEdge 2400",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
+ },
+ },
+ { /* Handle problems with rebooting on the Dell PowerEdge C6100. */
+ .callback = set_pci_reboot,
+ .ident = "Dell PowerEdge C6100",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "C6100"),
+ },
+ },
+ { /* Handle problems with rebooting on the Precision M6600. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Precision M6600",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell T5400's */
+ .callback = set_bios_reboot,
+ .ident = "Dell Precision T5400",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell T7400's */
+ .callback = set_bios_reboot,
+ .ident = "Dell Precision T7400",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell XPS710 */
+ .callback = set_bios_reboot,
+ .ident = "Dell XPS710",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
+ },
+ },
+ { /* Handle problems with rebooting on Dell Optiplex 7450 AIO */
+ .callback = set_acpi_reboot,
+ .ident = "Dell OptiPlex 7450 AIO",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 7450 AIO"),
+ },
+ },
+
+ /* Hewlett-Packard */
+ { /* Handle problems with rebooting on HP laptops */
+ .callback = set_bios_reboot,
+ .ident = "HP Compaq Laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
+ },
+ },
+
+ { /* PCIe Wifi card isn't detected after reboot otherwise */
+ .callback = set_pci_reboot,
+ .ident = "Zotac ZBOX CI327 nano",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "NA"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ZBOX-CI327NANO-GS-01"),
+ },
+ },
+
+ /* Sony */
+ { /* Handle problems with rebooting on Sony VGN-Z540N */
+ .callback = set_bios_reboot,
+ .ident = "Sony VGN-Z540N",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
+ },
+ },
+
+ { }
+};
+
+static int __init reboot_init(void)
+{
+ int rv;
+
+ /*
+ * Only do the DMI check if reboot_type hasn't been overridden
+ * on the command line
+ */
+ if (!reboot_default)
+ return 0;
+
+ /*
+ * The DMI quirks table takes precedence. If no quirks entry
+ * matches and the ACPI Hardware Reduced bit is set and EFI
+ * runtime services are enabled, force EFI reboot.
+ */
+ rv = dmi_check_system(reboot_dmi_table);
+
+ if (!rv && efi_reboot_required() && !efi_runtime_disabled())
+ reboot_type = BOOT_EFI;
+
+ return 0;
+}
+core_initcall(reboot_init);
+
+static inline void kb_wait(void)
+{
+ int i;
+
+ for (i = 0; i < 0x10000; i++) {
+ if ((inb(0x64) & 0x02) == 0)
+ break;
+ udelay(2);
+ }
+}
+
+static void vmxoff_nmi(int cpu, struct pt_regs *regs)
+{
+ cpu_emergency_vmxoff();
+}
+
+/* Use NMIs as IPIs to tell all CPUs to disable virtualization */
+static void emergency_vmx_disable_all(void)
+{
+ /* Just make sure we won't change CPUs while doing this */
+ local_irq_disable();
+
+ /*
+ * Disable VMX on all CPUs before rebooting, otherwise we risk hanging
+ * the machine, because the CPU blocks INIT when it's in VMX root.
+ *
+ * We can't take any locks and we may be on an inconsistent state, so
+ * use NMIs as IPIs to tell the other CPUs to exit VMX root and halt.
+ *
+ * Do the NMI shootdown even if VMX if off on _this_ CPU, as that
+ * doesn't prevent a different CPU from being in VMX root operation.
+ */
+ if (cpu_has_vmx()) {
+ /* Safely force _this_ CPU out of VMX root operation. */
+ __cpu_emergency_vmxoff();
+
+ /* Halt and exit VMX root operation on the other CPUs. */
+ nmi_shootdown_cpus(vmxoff_nmi);
+
+ }
+}
+
+
+void __attribute__((weak)) mach_reboot_fixups(void)
+{
+}
+
+/*
+ * To the best of our knowledge Windows compatible x86 hardware expects
+ * the following on reboot:
+ *
+ * 1) If the FADT has the ACPI reboot register flag set, try it
+ * 2) If still alive, write to the keyboard controller
+ * 3) If still alive, write to the ACPI reboot register again
+ * 4) If still alive, write to the keyboard controller again
+ * 5) If still alive, call the EFI runtime service to reboot
+ * 6) If no EFI runtime service, call the BIOS to do a reboot
+ *
+ * We default to following the same pattern. We also have
+ * two other reboot methods: 'triple fault' and 'PCI', which
+ * can be triggered via the reboot= kernel boot option or
+ * via quirks.
+ *
+ * This means that this function can never return, it can misbehave
+ * by not rebooting properly and hanging.
+ */
+static void native_machine_emergency_restart(void)
+{
+ int i;
+ int attempt = 0;
+ int orig_reboot_type = reboot_type;
+ unsigned short mode;
+
+ if (reboot_emergency)
+ emergency_vmx_disable_all();
+
+ tboot_shutdown(TB_SHUTDOWN_REBOOT);
+
+ /* Tell the BIOS if we want cold or warm reboot */
+ mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0;
+ *((unsigned short *)__va(0x472)) = mode;
+
+ /*
+ * If an EFI capsule has been registered with the firmware then
+ * override the reboot= parameter.
+ */
+ if (efi_capsule_pending(NULL)) {
+ pr_info("EFI capsule is pending, forcing EFI reboot.\n");
+ reboot_type = BOOT_EFI;
+ }
+
+ for (;;) {
+ /* Could also try the reset bit in the Hammer NB */
+ switch (reboot_type) {
+ case BOOT_ACPI:
+ acpi_reboot();
+ reboot_type = BOOT_KBD;
+ break;
+
+ case BOOT_KBD:
+ mach_reboot_fixups(); /* For board specific fixups */
+
+ for (i = 0; i < 10; i++) {
+ kb_wait();
+ udelay(50);
+ outb(0xfe, 0x64); /* Pulse reset low */
+ udelay(50);
+ }
+ if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
+ attempt = 1;
+ reboot_type = BOOT_ACPI;
+ } else {
+ reboot_type = BOOT_EFI;
+ }
+ break;
+
+ case BOOT_EFI:
+ efi_reboot(reboot_mode, NULL);
+ reboot_type = BOOT_BIOS;
+ break;
+
+ case BOOT_BIOS:
+ machine_real_restart(MRR_BIOS);
+
+ /* We're probably dead after this, but... */
+ reboot_type = BOOT_CF9_SAFE;
+ break;
+
+ case BOOT_CF9_FORCE:
+ port_cf9_safe = true;
+ /* Fall through */
+
+ case BOOT_CF9_SAFE:
+ if (port_cf9_safe) {
+ u8 reboot_code = reboot_mode == REBOOT_WARM ? 0x06 : 0x0E;
+ u8 cf9 = inb(0xcf9) & ~reboot_code;
+ outb(cf9|2, 0xcf9); /* Request hard reset */
+ udelay(50);
+ /* Actually do the reset */
+ outb(cf9|reboot_code, 0xcf9);
+ udelay(50);
+ }
+ reboot_type = BOOT_TRIPLE;
+ break;
+
+ case BOOT_TRIPLE:
+ idt_invalidate(NULL);
+ __asm__ __volatile__("int3");
+
+ /* We're probably dead after this, but... */
+ reboot_type = BOOT_KBD;
+ break;
+ }
+ }
+}
+
+void native_machine_shutdown(void)
+{
+ /* Stop the cpus and apics */
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * Disabling IO APIC before local APIC is a workaround for
+ * erratum AVR31 in "Intel Atom Processor C2000 Product Family
+ * Specification Update". In this situation, interrupts that target
+ * a Logical Processor whose Local APIC is either in the process of
+ * being hardware disabled or software disabled are neither delivered
+ * nor discarded. When this erratum occurs, the processor may hang.
+ *
+ * Even without the erratum, it still makes sense to quiet IO APIC
+ * before disabling Local APIC.
+ */
+ clear_IO_APIC();
+#endif
+
+#ifdef CONFIG_SMP
+ /*
+ * Stop all of the others. Also disable the local irq to
+ * not receive the per-cpu timer interrupt which may trigger
+ * scheduler's load balance.
+ */
+ local_irq_disable();
+ stop_other_cpus();
+#endif
+
+ lapic_shutdown();
+ restore_boot_irq_mode();
+
+#ifdef CONFIG_HPET_TIMER
+ hpet_disable();
+#endif
+
+#ifdef CONFIG_X86_64
+ x86_platform.iommu_shutdown();
+#endif
+}
+
+static void __machine_emergency_restart(int emergency)
+{
+ reboot_emergency = emergency;
+ machine_ops.emergency_restart();
+}
+
+static void native_machine_restart(char *__unused)
+{
+ pr_notice("machine restart\n");
+
+ if (!reboot_force)
+ machine_shutdown();
+ __machine_emergency_restart(0);
+}
+
+static void native_machine_halt(void)
+{
+ /* Stop other cpus and apics */
+ machine_shutdown();
+
+ tboot_shutdown(TB_SHUTDOWN_HALT);
+
+ stop_this_cpu(NULL);
+}
+
+static void native_machine_power_off(void)
+{
+ if (pm_power_off) {
+ if (!reboot_force)
+ machine_shutdown();
+ pm_power_off();
+ }
+ /* A fallback in case there is no PM info available */
+ tboot_shutdown(TB_SHUTDOWN_HALT);
+}
+
+struct machine_ops machine_ops __ro_after_init = {
+ .power_off = native_machine_power_off,
+ .shutdown = native_machine_shutdown,
+ .emergency_restart = native_machine_emergency_restart,
+ .restart = native_machine_restart,
+ .halt = native_machine_halt,
+#ifdef CONFIG_KEXEC_CORE
+ .crash_shutdown = native_machine_crash_shutdown,
+#endif
+};
+
+void machine_power_off(void)
+{
+ machine_ops.power_off();
+}
+
+void machine_shutdown(void)
+{
+ machine_ops.shutdown();
+}
+
+void machine_emergency_restart(void)
+{
+ __machine_emergency_restart(1);
+}
+
+void machine_restart(char *cmd)
+{
+ machine_ops.restart(cmd);
+}
+
+void machine_halt(void)
+{
+ machine_ops.halt();
+}
+
+#ifdef CONFIG_KEXEC_CORE
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+ machine_ops.crash_shutdown(regs);
+}
+#endif
+
+
+/* This is the CPU performing the emergency shutdown work. */
+int crashing_cpu = -1;
+
+#if defined(CONFIG_SMP)
+
+static nmi_shootdown_cb shootdown_callback;
+
+static atomic_t waiting_for_crash_ipi;
+static int crash_ipi_issued;
+
+static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
+{
+ int cpu;
+
+ cpu = raw_smp_processor_id();
+
+ /*
+ * Don't do anything if this handler is invoked on crashing cpu.
+ * Otherwise, system will completely hang. Crashing cpu can get
+ * an NMI if system was initially booted with nmi_watchdog parameter.
+ */
+ if (cpu == crashing_cpu)
+ return NMI_HANDLED;
+ local_irq_disable();
+
+ shootdown_callback(cpu, regs);
+
+ atomic_dec(&waiting_for_crash_ipi);
+ /* Assume hlt works */
+ halt();
+ for (;;)
+ cpu_relax();
+
+ return NMI_HANDLED;
+}
+
+static void smp_send_nmi_allbutself(void)
+{
+ apic->send_IPI_allbutself(NMI_VECTOR);
+}
+
+/*
+ * Halt all other CPUs, calling the specified function on each of them
+ *
+ * This function can be used to halt all other CPUs on crash
+ * or emergency reboot time. The function passed as parameter
+ * will be called inside a NMI handler on all CPUs.
+ */
+void nmi_shootdown_cpus(nmi_shootdown_cb callback)
+{
+ unsigned long msecs;
+ local_irq_disable();
+
+ /* Make a note of crashing cpu. Will be used in NMI callback. */
+ crashing_cpu = safe_smp_processor_id();
+
+ shootdown_callback = callback;
+
+ atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+ /* Would it be better to replace the trap vector here? */
+ if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback,
+ NMI_FLAG_FIRST, "crash"))
+ return; /* Return what? */
+ /*
+ * Ensure the new callback function is set before sending
+ * out the NMI
+ */
+ wmb();
+
+ smp_send_nmi_allbutself();
+
+ /* Kick CPUs looping in NMI context. */
+ WRITE_ONCE(crash_ipi_issued, 1);
+
+ msecs = 1000; /* Wait at most a second for the other cpus to stop */
+ while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
+ mdelay(1);
+ msecs--;
+ }
+
+ /* Leave the nmi callback set */
+}
+
+/*
+ * Check if the crash dumping IPI got issued and if so, call its callback
+ * directly. This function is used when we have already been in NMI handler.
+ * It doesn't return.
+ */
+void run_crash_ipi_callback(struct pt_regs *regs)
+{
+ if (crash_ipi_issued)
+ crash_nmi_callback(0, regs);
+}
+
+/* Override the weak function in kernel/panic.c */
+void nmi_panic_self_stop(struct pt_regs *regs)
+{
+ while (1) {
+ /* If no CPU is preparing crash dump, we simply loop here. */
+ run_crash_ipi_callback(regs);
+ cpu_relax();
+ }
+}
+
+#else /* !CONFIG_SMP */
+void nmi_shootdown_cpus(nmi_shootdown_cb callback)
+{
+ /* No other CPUs to shoot down */
+}
+
+void run_crash_ipi_callback(struct pt_regs *regs)
+{
+}
+#endif
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
new file mode 100644
index 000000000..b7c0f142d
--- /dev/null
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This is a good place to put board specific reboot fixups.
+ *
+ * List of supported fixups:
+ * geode-gx1/cs5530a - Jaya Kumar <jayalk@intworks.biz>
+ * geode-gx/lx/cs5536 - Andres Salomon <dilinger@debian.org>
+ *
+ */
+
+#include <asm/delay.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <asm/reboot_fixups.h>
+#include <asm/msr.h>
+#include <linux/cs5535.h>
+
+static void cs5530a_warm_reset(struct pci_dev *dev)
+{
+ /* writing 1 to the reset control register, 0x44 causes the
+ cs5530a to perform a system warm reset */
+ pci_write_config_byte(dev, 0x44, 0x1);
+ udelay(50); /* shouldn't get here but be safe and spin-a-while */
+ return;
+}
+
+static void cs5536_warm_reset(struct pci_dev *dev)
+{
+ /* writing 1 to the LSB of this MSR causes a hard reset */
+ wrmsrl(MSR_DIVIL_SOFT_RESET, 1ULL);
+ udelay(50); /* shouldn't get here but be safe and spin a while */
+}
+
+static void rdc321x_reset(struct pci_dev *dev)
+{
+ unsigned i;
+ /* Voluntary reset the watchdog timer */
+ outl(0x80003840, 0xCF8);
+ /* Generate a CPU reset on next tick */
+ i = inl(0xCFC);
+ /* Use the minimum timer resolution */
+ i |= 0x1600;
+ outl(i, 0xCFC);
+ outb(1, 0x92);
+}
+
+static void ce4100_reset(struct pci_dev *dev)
+{
+ int i;
+
+ for (i = 0; i < 10; i++) {
+ outb(0x2, 0xcf9);
+ udelay(50);
+ }
+}
+
+struct device_fixup {
+ unsigned int vendor;
+ unsigned int device;
+ void (*reboot_fixup)(struct pci_dev *);
+};
+
+/*
+ * PCI ids solely used for fixups_table go here
+ */
+#define PCI_DEVICE_ID_INTEL_CE4100 0x0708
+
+static const struct device_fixup fixups_table[] = {
+{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
+{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
+{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
+{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset },
+{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CE4100, ce4100_reset },
+};
+
+/*
+ * we see if any fixup is available for our current hardware. if there
+ * is a fixup, we call it and we expect to never return from it. if we
+ * do return, we keep looking and then eventually fall back to the
+ * standard mach_reboot on return.
+ */
+void mach_reboot_fixups(void)
+{
+ const struct device_fixup *cur;
+ struct pci_dev *dev;
+ int i;
+
+ /* we can be called from sysrq-B code. In such a case it is
+ * prohibited to dig PCI */
+ if (in_interrupt())
+ return;
+
+ for (i=0; i < ARRAY_SIZE(fixups_table); i++) {
+ cur = &(fixups_table[i]);
+ dev = pci_get_device(cur->vendor, cur->device, NULL);
+ if (!dev)
+ continue;
+
+ cur->reboot_fixup(dev);
+ pci_dev_put(dev);
+ }
+}
+
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
new file mode 100644
index 000000000..77630d57e
--- /dev/null
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -0,0 +1,277 @@
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+#include <asm/kexec.h>
+#include <asm/processor-flags.h>
+
+/*
+ * Must be relocatable PIC code callable as a C function
+ */
+
+#define PTR(x) (x << 2)
+
+/*
+ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+ * ~ control_page + PAGE_SIZE are used as data storage and stack for
+ * jumping back
+ */
+#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
+
+/* Minimal CPU state */
+#define ESP DATA(0x0)
+#define CR0 DATA(0x4)
+#define CR3 DATA(0x8)
+#define CR4 DATA(0xc)
+
+/* other data */
+#define CP_VA_CONTROL_PAGE DATA(0x10)
+#define CP_PA_PGD DATA(0x14)
+#define CP_PA_SWAP_PAGE DATA(0x18)
+#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
+
+ .text
+ .globl relocate_kernel
+relocate_kernel:
+ /* Save the CPU context, used for jumping back */
+
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushf
+
+ movl 20+8(%esp), %ebp /* list of pages */
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
+ movl %esp, ESP(%edi)
+ movl %cr0, %eax
+ movl %eax, CR0(%edi)
+ movl %cr3, %eax
+ movl %eax, CR3(%edi)
+ movl %cr4, %eax
+ movl %eax, CR4(%edi)
+
+ /* read the arguments and say goodbye to the stack */
+ movl 20+4(%esp), %ebx /* page_list */
+ movl 20+8(%esp), %ebp /* list of pages */
+ movl 20+12(%esp), %edx /* start address */
+ movl 20+16(%esp), %ecx /* cpu_has_pae */
+ movl 20+20(%esp), %esi /* preserve_context */
+
+ /* zero out flags, and disable interrupts */
+ pushl $0
+ popfl
+
+ /* save some information for jumping back */
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
+ movl %edi, CP_VA_CONTROL_PAGE(%edi)
+ movl PTR(PA_PGD)(%ebp), %eax
+ movl %eax, CP_PA_PGD(%edi)
+ movl PTR(PA_SWAP_PAGE)(%ebp), %eax
+ movl %eax, CP_PA_SWAP_PAGE(%edi)
+ movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
+
+ /*
+ * get physical address of control page now
+ * this is impossible after page table switch
+ */
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
+
+ /* switch to new set of page tables */
+ movl PTR(PA_PGD)(%ebp), %eax
+ movl %eax, %cr3
+
+ /* setup a new stack at the end of the physical control page */
+ lea PAGE_SIZE(%edi), %esp
+
+ /* jump to identity mapped page */
+ movl %edi, %eax
+ addl $(identity_mapped - relocate_kernel), %eax
+ pushl %eax
+ ret
+
+identity_mapped:
+ /* set return address to 0 if not preserving context */
+ pushl $0
+ /* store the start address on the stack */
+ pushl %edx
+
+ /*
+ * Set cr0 to a known state:
+ * - Paging disabled
+ * - Alignment check disabled
+ * - Write protect disabled
+ * - No task switch
+ * - Don't do FP software emulation.
+ * - Proctected mode enabled
+ */
+ movl %cr0, %eax
+ andl $~(X86_CR0_PG | X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %eax
+ orl $(X86_CR0_PE), %eax
+ movl %eax, %cr0
+
+ /* clear cr4 if applicable */
+ testl %ecx, %ecx
+ jz 1f
+ /*
+ * Set cr4 to a known state:
+ * Setting everything to zero seems safe.
+ */
+ xorl %eax, %eax
+ movl %eax, %cr4
+
+ jmp 1f
+1:
+
+ /* Flush the TLB (needed?) */
+ xorl %eax, %eax
+ movl %eax, %cr3
+
+ movl CP_PA_SWAP_PAGE(%edi), %eax
+ pushl %eax
+ pushl %ebx
+ call swap_pages
+ addl $8, %esp
+
+ /*
+ * To be certain of avoiding problems with self-modifying code
+ * I need to execute a serializing instruction here.
+ * So I flush the TLB, it's handy, and not processor dependent.
+ */
+ xorl %eax, %eax
+ movl %eax, %cr3
+
+ /*
+ * set all of the registers to known values
+ * leave %esp alone
+ */
+
+ testl %esi, %esi
+ jnz 1f
+ xorl %edi, %edi
+ xorl %eax, %eax
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ xorl %edx, %edx
+ xorl %esi, %esi
+ xorl %ebp, %ebp
+ ret
+1:
+ popl %edx
+ movl CP_PA_SWAP_PAGE(%edi), %esp
+ addl $PAGE_SIZE, %esp
+2:
+ call *%edx
+
+ /* get the re-entry point of the peer system */
+ movl 0(%esp), %ebp
+ call 1f
+1:
+ popl %ebx
+ subl $(1b - relocate_kernel), %ebx
+ movl CP_VA_CONTROL_PAGE(%ebx), %edi
+ lea PAGE_SIZE(%ebx), %esp
+ movl CP_PA_SWAP_PAGE(%ebx), %eax
+ movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx
+ pushl %eax
+ pushl %edx
+ call swap_pages
+ addl $8, %esp
+ movl CP_PA_PGD(%ebx), %eax
+ movl %eax, %cr3
+ movl %cr0, %eax
+ orl $X86_CR0_PG, %eax
+ movl %eax, %cr0
+ lea PAGE_SIZE(%edi), %esp
+ movl %edi, %eax
+ addl $(virtual_mapped - relocate_kernel), %eax
+ pushl %eax
+ ret
+
+virtual_mapped:
+ movl CR4(%edi), %eax
+ movl %eax, %cr4
+ movl CR3(%edi), %eax
+ movl %eax, %cr3
+ movl CR0(%edi), %eax
+ movl %eax, %cr0
+ movl ESP(%edi), %esp
+ movl %ebp, %eax
+
+ popf
+ popl %ebp
+ popl %edi
+ popl %esi
+ popl %ebx
+ ret
+
+ /* Do the copies */
+swap_pages:
+ movl 8(%esp), %edx
+ movl 4(%esp), %ecx
+ pushl %ebp
+ pushl %ebx
+ pushl %edi
+ pushl %esi
+ movl %ecx, %ebx
+ jmp 1f
+
+0: /* top, read another word from the indirection page */
+ movl (%ebx), %ecx
+ addl $4, %ebx
+1:
+ testb $0x1, %cl /* is it a destination page */
+ jz 2f
+ movl %ecx, %edi
+ andl $0xfffff000, %edi
+ jmp 0b
+2:
+ testb $0x2, %cl /* is it an indirection page */
+ jz 2f
+ movl %ecx, %ebx
+ andl $0xfffff000, %ebx
+ jmp 0b
+2:
+ testb $0x4, %cl /* is it the done indicator */
+ jz 2f
+ jmp 3f
+2:
+ testb $0x8, %cl /* is it the source indicator */
+ jz 0b /* Ignore it otherwise */
+ movl %ecx, %esi /* For every source page do a copy */
+ andl $0xfffff000, %esi
+
+ movl %edi, %eax
+ movl %esi, %ebp
+
+ movl %edx, %edi
+ movl $1024, %ecx
+ rep ; movsl
+
+ movl %ebp, %edi
+ movl %eax, %esi
+ movl $1024, %ecx
+ rep ; movsl
+
+ movl %eax, %edi
+ movl %edx, %esi
+ movl $1024, %ecx
+ rep ; movsl
+
+ lea PAGE_SIZE(%ebp), %esi
+ jmp 0b
+3:
+ popl %esi
+ popl %edi
+ popl %ebx
+ popl %ebp
+ ret
+
+ .globl kexec_control_code_size
+.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
new file mode 100644
index 000000000..11eda21eb
--- /dev/null
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -0,0 +1,290 @@
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+#include <asm/kexec.h>
+#include <asm/processor-flags.h>
+#include <asm/pgtable_types.h>
+
+/*
+ * Must be relocatable PIC code callable as a C function
+ */
+
+#define PTR(x) (x << 3)
+#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+
+/*
+ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+ * ~ control_page + PAGE_SIZE are used as data storage and stack for
+ * jumping back
+ */
+#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
+
+/* Minimal CPU state */
+#define RSP DATA(0x0)
+#define CR0 DATA(0x8)
+#define CR3 DATA(0x10)
+#define CR4 DATA(0x18)
+
+/* other data */
+#define CP_PA_TABLE_PAGE DATA(0x20)
+#define CP_PA_SWAP_PAGE DATA(0x28)
+#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
+
+ .text
+ .align PAGE_SIZE
+ .code64
+ .globl relocate_kernel
+relocate_kernel:
+ /*
+ * %rdi indirection_page
+ * %rsi page_list
+ * %rdx start address
+ * %rcx preserve_context
+ * %r8 sme_active
+ */
+
+ /* Save the CPU context, used for jumping back */
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushf
+
+ movq PTR(VA_CONTROL_PAGE)(%rsi), %r11
+ movq %rsp, RSP(%r11)
+ movq %cr0, %rax
+ movq %rax, CR0(%r11)
+ movq %cr3, %rax
+ movq %rax, CR3(%r11)
+ movq %cr4, %rax
+ movq %rax, CR4(%r11)
+
+ /* Save CR4. Required to enable the right paging mode later. */
+ movq %rax, %r13
+
+ /* zero out flags, and disable interrupts */
+ pushq $0
+ popfq
+
+ /* Save SME active flag */
+ movq %r8, %r12
+
+ /*
+ * get physical address of control page now
+ * this is impossible after page table switch
+ */
+ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
+
+ /* get physical address of page table now too */
+ movq PTR(PA_TABLE_PAGE)(%rsi), %r9
+
+ /* get physical address of swap page now */
+ movq PTR(PA_SWAP_PAGE)(%rsi), %r10
+
+ /* save some information for jumping back */
+ movq %r9, CP_PA_TABLE_PAGE(%r11)
+ movq %r10, CP_PA_SWAP_PAGE(%r11)
+ movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
+
+ /* Switch to the identity mapped page tables */
+ movq %r9, %cr3
+
+ /* setup a new stack at the end of the physical control page */
+ lea PAGE_SIZE(%r8), %rsp
+
+ /* jump to identity mapped page */
+ addq $(identity_mapped - relocate_kernel), %r8
+ pushq %r8
+ ret
+
+identity_mapped:
+ /* set return address to 0 if not preserving context */
+ pushq $0
+ /* store the start address on the stack */
+ pushq %rdx
+
+ /*
+ * Set cr0 to a known state:
+ * - Paging enabled
+ * - Alignment check disabled
+ * - Write protect disabled
+ * - No task switch
+ * - Don't do FP software emulation.
+ * - Proctected mode enabled
+ */
+ movq %cr0, %rax
+ andq $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax
+ orl $(X86_CR0_PG | X86_CR0_PE), %eax
+ movq %rax, %cr0
+
+ /*
+ * Set cr4 to a known state:
+ * - physical address extension enabled
+ * - 5-level paging, if it was enabled before
+ */
+ movl $X86_CR4_PAE, %eax
+ testq $X86_CR4_LA57, %r13
+ jz 1f
+ orl $X86_CR4_LA57, %eax
+1:
+ movq %rax, %cr4
+
+ jmp 1f
+1:
+
+ /* Flush the TLB (needed?) */
+ movq %r9, %cr3
+
+ /*
+ * If SME is active, there could be old encrypted cache line
+ * entries that will conflict with the now unencrypted memory
+ * used by kexec. Flush the caches before copying the kernel.
+ */
+ testq %r12, %r12
+ jz 1f
+ wbinvd
+1:
+
+ movq %rcx, %r11
+ call swap_pages
+
+ /*
+ * To be certain of avoiding problems with self-modifying code
+ * I need to execute a serializing instruction here.
+ * So I flush the TLB by reloading %cr3 here, it's handy,
+ * and not processor dependent.
+ */
+ movq %cr3, %rax
+ movq %rax, %cr3
+
+ /*
+ * set all of the registers to known values
+ * leave %rsp alone
+ */
+
+ testq %r11, %r11
+ jnz 1f
+ xorl %eax, %eax
+ xorl %ebx, %ebx
+ xorl %ecx, %ecx
+ xorl %edx, %edx
+ xorl %esi, %esi
+ xorl %edi, %edi
+ xorl %ebp, %ebp
+ xorl %r8d, %r8d
+ xorl %r9d, %r9d
+ xorl %r10d, %r10d
+ xorl %r11d, %r11d
+ xorl %r12d, %r12d
+ xorl %r13d, %r13d
+ xorl %r14d, %r14d
+ xorl %r15d, %r15d
+
+ ret
+
+1:
+ popq %rdx
+ leaq PAGE_SIZE(%r10), %rsp
+ call *%rdx
+
+ /* get the re-entry point of the peer system */
+ movq 0(%rsp), %rbp
+ call 1f
+1:
+ popq %r8
+ subq $(1b - relocate_kernel), %r8
+ movq CP_PA_SWAP_PAGE(%r8), %r10
+ movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
+ movq CP_PA_TABLE_PAGE(%r8), %rax
+ movq %rax, %cr3
+ lea PAGE_SIZE(%r8), %rsp
+ call swap_pages
+ movq $virtual_mapped, %rax
+ pushq %rax
+ ret
+
+virtual_mapped:
+ movq RSP(%r8), %rsp
+ movq CR4(%r8), %rax
+ movq %rax, %cr4
+ movq CR3(%r8), %rax
+ movq CR0(%r8), %r8
+ movq %rax, %cr3
+ movq %r8, %cr0
+ movq %rbp, %rax
+
+ popf
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+ ret
+
+ /* Do the copies */
+swap_pages:
+ movq %rdi, %rcx /* Put the page_list in %rcx */
+ xorl %edi, %edi
+ xorl %esi, %esi
+ jmp 1f
+
+0: /* top, read another word for the indirection page */
+
+ movq (%rbx), %rcx
+ addq $8, %rbx
+1:
+ testb $0x1, %cl /* is it a destination page? */
+ jz 2f
+ movq %rcx, %rdi
+ andq $0xfffffffffffff000, %rdi
+ jmp 0b
+2:
+ testb $0x2, %cl /* is it an indirection page? */
+ jz 2f
+ movq %rcx, %rbx
+ andq $0xfffffffffffff000, %rbx
+ jmp 0b
+2:
+ testb $0x4, %cl /* is it the done indicator? */
+ jz 2f
+ jmp 3f
+2:
+ testb $0x8, %cl /* is it the source indicator? */
+ jz 0b /* Ignore it otherwise */
+ movq %rcx, %rsi /* For ever source page do a copy */
+ andq $0xfffffffffffff000, %rsi
+
+ movq %rdi, %rdx
+ movq %rsi, %rax
+
+ movq %r10, %rdi
+ movl $512, %ecx
+ rep ; movsq
+
+ movq %rax, %rdi
+ movq %rdx, %rsi
+ movl $512, %ecx
+ rep ; movsq
+
+ movq %rdx, %rdi
+ movq %r10, %rsi
+ movl $512, %ecx
+ rep ; movsq
+
+ lea PAGE_SIZE(%rax), %rsi
+ jmp 0b
+3:
+ ret
+
+ .globl kexec_control_code_size
+.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
new file mode 100644
index 000000000..9b9fb7882
--- /dev/null
+++ b/arch/x86/kernel/resource.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ioport.h>
+#include <asm/e820/api.h>
+
+static void resource_clip(struct resource *res, resource_size_t start,
+ resource_size_t end)
+{
+ resource_size_t low = 0, high = 0;
+
+ if (res->end < start || res->start > end)
+ return; /* no conflict */
+
+ if (res->start < start)
+ low = start - res->start;
+
+ if (res->end > end)
+ high = res->end - end;
+
+ /* Keep the area above or below the conflict, whichever is larger */
+ if (low > high)
+ res->end = start - 1;
+ else
+ res->start = end + 1;
+}
+
+static void remove_e820_regions(struct resource *avail)
+{
+ int i;
+ struct e820_entry *entry;
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ entry = &e820_table->entries[i];
+
+ resource_clip(avail, entry->addr,
+ entry->addr + entry->size - 1);
+ }
+}
+
+void arch_remove_reservations(struct resource *avail)
+{
+ /*
+ * Trim out BIOS area (high 2MB) and E820 regions. We do not remove
+ * the low 1MB unconditionally, as this area is needed for some ISA
+ * cards requiring a memory range, e.g. the i82365 PCMCIA controller.
+ */
+ if (avail->flags & IORESOURCE_MEM) {
+ resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END);
+
+ remove_e820_regions(avail);
+ }
+}
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
new file mode 100644
index 000000000..586f718b8
--- /dev/null
+++ b/arch/x86/kernel/rtc.c
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * RTC related functions
+ */
+#include <linux/platform_device.h>
+#include <linux/mc146818rtc.h>
+#include <linux/acpi.h>
+#include <linux/bcd.h>
+#include <linux/export.h>
+#include <linux/pnp.h>
+#include <linux/of.h>
+
+#include <asm/vsyscall.h>
+#include <asm/x86_init.h>
+#include <asm/time.h>
+#include <asm/intel-mid.h>
+#include <asm/setup.h>
+
+#ifdef CONFIG_X86_32
+/*
+ * This is a special lock that is owned by the CPU and holds the index
+ * register we are working with. It is required for NMI access to the
+ * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
+ */
+volatile unsigned long cmos_lock;
+EXPORT_SYMBOL(cmos_lock);
+#endif /* CONFIG_X86_32 */
+
+/* For two digit years assume time is always after that */
+#define CMOS_YEARS_OFFS 2000
+
+DEFINE_SPINLOCK(rtc_lock);
+EXPORT_SYMBOL(rtc_lock);
+
+/*
+ * In order to set the CMOS clock precisely, set_rtc_mmss has to be
+ * called 500 ms after the second nowtime has started, because when
+ * nowtime is written into the registers of the CMOS clock, it will
+ * jump to the next second precisely 500 ms later. Check the Motorola
+ * MC146818A or Dallas DS12887 data sheet for details.
+ */
+int mach_set_rtc_mmss(const struct timespec64 *now)
+{
+ unsigned long long nowtime = now->tv_sec;
+ struct rtc_time tm;
+ int retval = 0;
+
+ rtc_time64_to_tm(nowtime, &tm);
+ if (!rtc_valid_tm(&tm)) {
+ retval = mc146818_set_time(&tm);
+ if (retval)
+ printk(KERN_ERR "%s: RTC write failed with error %d\n",
+ __func__, retval);
+ } else {
+ printk(KERN_ERR
+ "%s: Invalid RTC value: write of %llx to RTC failed\n",
+ __func__, nowtime);
+ retval = -EINVAL;
+ }
+ return retval;
+}
+
+void mach_get_cmos_time(struct timespec64 *now)
+{
+ unsigned int status, year, mon, day, hour, min, sec, century = 0;
+ unsigned long flags;
+
+ /*
+ * If pm_trace abused the RTC as storage, set the timespec to 0,
+ * which tells the caller that this RTC value is unusable.
+ */
+ if (!pm_trace_rtc_valid()) {
+ now->tv_sec = now->tv_nsec = 0;
+ return;
+ }
+
+ spin_lock_irqsave(&rtc_lock, flags);
+
+ /*
+ * If UIP is clear, then we have >= 244 microseconds before
+ * RTC registers will be updated. Spec sheet says that this
+ * is the reliable way to read RTC - registers. If UIP is set
+ * then the register access might be invalid.
+ */
+ while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
+ cpu_relax();
+
+ sec = CMOS_READ(RTC_SECONDS);
+ min = CMOS_READ(RTC_MINUTES);
+ hour = CMOS_READ(RTC_HOURS);
+ day = CMOS_READ(RTC_DAY_OF_MONTH);
+ mon = CMOS_READ(RTC_MONTH);
+ year = CMOS_READ(RTC_YEAR);
+
+#ifdef CONFIG_ACPI
+ if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
+ acpi_gbl_FADT.century)
+ century = CMOS_READ(acpi_gbl_FADT.century);
+#endif
+
+ status = CMOS_READ(RTC_CONTROL);
+ WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY));
+
+ spin_unlock_irqrestore(&rtc_lock, flags);
+
+ if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) {
+ sec = bcd2bin(sec);
+ min = bcd2bin(min);
+ hour = bcd2bin(hour);
+ day = bcd2bin(day);
+ mon = bcd2bin(mon);
+ year = bcd2bin(year);
+ }
+
+ if (century) {
+ century = bcd2bin(century);
+ year += century * 100;
+ } else
+ year += CMOS_YEARS_OFFS;
+
+ now->tv_sec = mktime64(year, mon, day, hour, min, sec);
+ now->tv_nsec = 0;
+}
+
+/* Routines for accessing the CMOS RAM/RTC. */
+unsigned char rtc_cmos_read(unsigned char addr)
+{
+ unsigned char val;
+
+ lock_cmos_prefix(addr);
+ outb(addr, RTC_PORT(0));
+ val = inb(RTC_PORT(1));
+ lock_cmos_suffix(addr);
+
+ return val;
+}
+EXPORT_SYMBOL(rtc_cmos_read);
+
+void rtc_cmos_write(unsigned char val, unsigned char addr)
+{
+ lock_cmos_prefix(addr);
+ outb(addr, RTC_PORT(0));
+ outb(val, RTC_PORT(1));
+ lock_cmos_suffix(addr);
+}
+EXPORT_SYMBOL(rtc_cmos_write);
+
+int update_persistent_clock64(struct timespec64 now)
+{
+ return x86_platform.set_wallclock(&now);
+}
+
+/* not static: needed by APM */
+void read_persistent_clock64(struct timespec64 *ts)
+{
+ x86_platform.get_wallclock(ts);
+}
+
+
+static struct resource rtc_resources[] = {
+ [0] = {
+ .start = RTC_PORT(0),
+ .end = RTC_PORT(1),
+ .flags = IORESOURCE_IO,
+ },
+ [1] = {
+ .start = RTC_IRQ,
+ .end = RTC_IRQ,
+ .flags = IORESOURCE_IRQ,
+ }
+};
+
+static struct platform_device rtc_device = {
+ .name = "rtc_cmos",
+ .id = -1,
+ .resource = rtc_resources,
+ .num_resources = ARRAY_SIZE(rtc_resources),
+};
+
+static __init int add_rtc_cmos(void)
+{
+#ifdef CONFIG_PNP
+ static const char * const ids[] __initconst =
+ { "PNP0b00", "PNP0b01", "PNP0b02", };
+ struct pnp_dev *dev;
+ struct pnp_id *id;
+ int i;
+
+ pnp_for_each_dev(dev) {
+ for (id = dev->id; id; id = id->next) {
+ for (i = 0; i < ARRAY_SIZE(ids); i++) {
+ if (compare_pnp_id(id, ids[i]) != 0)
+ return 0;
+ }
+ }
+ }
+#endif
+ if (!x86_platform.legacy.rtc)
+ return -ENODEV;
+
+ platform_device_register(&rtc_device);
+ dev_info(&rtc_device.dev,
+ "registered platform RTC device (no PNP device found)\n");
+
+ return 0;
+}
+device_initcall(add_rtc_cmos);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
new file mode 100644
index 000000000..652a10a32
--- /dev/null
+++ b/arch/x86/kernel/setup.c
@@ -0,0 +1,1308 @@
+/*
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *
+ * Memory region support
+ * David Parsons <orc@pell.chi.il.us>, July-August 1999
+ *
+ * Added E820 sanitization routine (removes overlapping memory regions);
+ * Brian Moyle <bmoyle@mvista.com>, February 2001
+ *
+ * Moved CPU detection code to cpu/${cpu}.c
+ * Patrick Mochel <mochel@osdl.org>, March 2002
+ *
+ * Provisions for empty E820 memory regions (reported by certain BIOSes).
+ * Alex Achenbach <xela@slit.de>, December 2002.
+ *
+ */
+
+/*
+ * This file handles the architecture-dependent parts of initialization
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/screen_info.h>
+#include <linux/ioport.h>
+#include <linux/acpi.h>
+#include <linux/sfi.h>
+#include <linux/apm_bios.h>
+#include <linux/initrd.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/seq_file.h>
+#include <linux/console.h>
+#include <linux/root_dev.h>
+#include <linux/highmem.h>
+#include <linux/export.h>
+#include <linux/efi.h>
+#include <linux/init.h>
+#include <linux/edd.h>
+#include <linux/iscsi_ibft.h>
+#include <linux/nodemask.h>
+#include <linux/kexec.h>
+#include <linux/dmi.h>
+#include <linux/pfn.h>
+#include <linux/pci.h>
+#include <asm/pci-direct.h>
+#include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
+#include <linux/dma-contiguous.h>
+#include <xen/xen.h>
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/ptrace.h>
+#include <linux/user.h>
+#include <linux/delay.h>
+
+#include <linux/kallsyms.h>
+#include <linux/cpufreq.h>
+#include <linux/dma-mapping.h>
+#include <linux/ctype.h>
+#include <linux/uaccess.h>
+
+#include <linux/percpu.h>
+#include <linux/crash_dump.h>
+#include <linux/tboot.h>
+#include <linux/jiffies.h>
+#include <linux/mem_encrypt.h>
+
+#include <linux/usb/xhci-dbgp.h>
+#include <video/edid.h>
+
+#include <asm/mtrr.h>
+#include <asm/apic.h>
+#include <asm/realmode.h>
+#include <asm/e820/api.h>
+#include <asm/mpspec.h>
+#include <asm/setup.h>
+#include <asm/efi.h>
+#include <asm/timer.h>
+#include <asm/i8259.h>
+#include <asm/sections.h>
+#include <asm/io_apic.h>
+#include <asm/ist.h>
+#include <asm/setup_arch.h>
+#include <asm/bios_ebda.h>
+#include <asm/cacheflush.h>
+#include <asm/processor.h>
+#include <asm/bugs.h>
+#include <asm/kasan.h>
+
+#include <asm/vsyscall.h>
+#include <asm/cpu.h>
+#include <asm/desc.h>
+#include <asm/dma.h>
+#include <asm/iommu.h>
+#include <asm/gart.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+
+#include <asm/paravirt.h>
+#include <asm/hypervisor.h>
+#include <asm/olpc_ofw.h>
+
+#include <asm/percpu.h>
+#include <asm/topology.h>
+#include <asm/apicdef.h>
+#include <asm/amd_nb.h>
+#include <asm/mce.h>
+#include <asm/alternative.h>
+#include <asm/prom.h>
+#include <asm/microcode.h>
+#include <asm/kaslr.h>
+#include <asm/unwind.h>
+
+/*
+ * max_low_pfn_mapped: highest direct mapped pfn under 4GB
+ * max_pfn_mapped: highest direct mapped pfn over 4GB
+ *
+ * The direct mapping only covers E820_TYPE_RAM regions, so the ranges and gaps are
+ * represented by pfn_mapped
+ */
+unsigned long max_low_pfn_mapped;
+unsigned long max_pfn_mapped;
+
+#ifdef CONFIG_DMI
+RESERVE_BRK(dmi_alloc, 65536);
+#endif
+
+
+static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
+unsigned long _brk_end = (unsigned long)__brk_base;
+
+struct boot_params boot_params;
+
+/*
+ * Machine setup..
+ */
+static struct resource data_resource = {
+ .name = "Kernel data",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
+};
+
+static struct resource code_resource = {
+ .name = "Kernel code",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
+};
+
+static struct resource bss_resource = {
+ .name = "Kernel bss",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM
+};
+
+
+#ifdef CONFIG_X86_32
+/* cpu data as detected by the assembly code in head_32.S */
+struct cpuinfo_x86 new_cpu_data;
+
+/* common cpu data for all cpus */
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
+EXPORT_SYMBOL(boot_cpu_data);
+
+unsigned int def_to_bigsmp;
+
+/* for MCA, but anyone else can use it if they want */
+unsigned int machine_id;
+unsigned int machine_submodel_id;
+unsigned int BIOS_revision;
+
+struct apm_info apm_info;
+EXPORT_SYMBOL(apm_info);
+
+#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
+ defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
+struct ist_info ist_info;
+EXPORT_SYMBOL(ist_info);
+#else
+struct ist_info ist_info;
+#endif
+
+#else
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
+EXPORT_SYMBOL(boot_cpu_data);
+#endif
+
+
+#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
+__visible unsigned long mmu_cr4_features __ro_after_init;
+#else
+__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE;
+#endif
+
+/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
+int bootloader_type, bootloader_version;
+
+/*
+ * Setup options
+ */
+struct screen_info screen_info;
+EXPORT_SYMBOL(screen_info);
+struct edid_info edid_info;
+EXPORT_SYMBOL_GPL(edid_info);
+
+extern int root_mountflags;
+
+unsigned long saved_video_mode;
+
+#define RAMDISK_IMAGE_START_MASK 0x07FF
+#define RAMDISK_PROMPT_FLAG 0x8000
+#define RAMDISK_LOAD_FLAG 0x4000
+
+static char __initdata command_line[COMMAND_LINE_SIZE];
+#ifdef CONFIG_CMDLINE_BOOL
+static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
+#endif
+
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+struct edd edd;
+#ifdef CONFIG_EDD_MODULE
+EXPORT_SYMBOL(edd);
+#endif
+/**
+ * copy_edd() - Copy the BIOS EDD information
+ * from boot_params into a safe place.
+ *
+ */
+static inline void __init copy_edd(void)
+{
+ memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
+ sizeof(edd.mbr_signature));
+ memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
+ edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
+ edd.edd_info_nr = boot_params.eddbuf_entries;
+}
+#else
+static inline void __init copy_edd(void)
+{
+}
+#endif
+
+void * __init extend_brk(size_t size, size_t align)
+{
+ size_t mask = align - 1;
+ void *ret;
+
+ BUG_ON(_brk_start == 0);
+ BUG_ON(align & mask);
+
+ _brk_end = (_brk_end + mask) & ~mask;
+ BUG_ON((char *)(_brk_end + size) > __brk_limit);
+
+ ret = (void *)_brk_end;
+ _brk_end += size;
+
+ memset(ret, 0, size);
+
+ return ret;
+}
+
+#ifdef CONFIG_X86_32
+static void __init cleanup_highmap(void)
+{
+}
+#endif
+
+static void __init reserve_brk(void)
+{
+ if (_brk_end > _brk_start)
+ memblock_reserve(__pa_symbol(_brk_start),
+ _brk_end - _brk_start);
+
+ /* Mark brk area as locked down and no longer taking any
+ new allocations */
+ _brk_start = 0;
+}
+
+u64 relocated_ramdisk;
+
+#ifdef CONFIG_BLK_DEV_INITRD
+
+static u64 __init get_ramdisk_image(void)
+{
+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+
+ ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32;
+
+ return ramdisk_image;
+}
+static u64 __init get_ramdisk_size(void)
+{
+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+
+ ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32;
+
+ return ramdisk_size;
+}
+
+static void __init relocate_initrd(void)
+{
+ /* Assume only end is not page aligned */
+ u64 ramdisk_image = get_ramdisk_image();
+ u64 ramdisk_size = get_ramdisk_size();
+ u64 area_size = PAGE_ALIGN(ramdisk_size);
+
+ /* We need to move the initrd down into directly mapped mem */
+ relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
+ area_size, PAGE_SIZE);
+
+ if (!relocated_ramdisk)
+ panic("Cannot find place for new RAMDISK of size %lld\n",
+ ramdisk_size);
+
+ /* Note: this includes all the mem currently occupied by
+ the initrd, we rely on that fact to keep the data intact. */
+ memblock_reserve(relocated_ramdisk, area_size);
+ initrd_start = relocated_ramdisk + PAGE_OFFSET;
+ initrd_end = initrd_start + ramdisk_size;
+ printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
+ relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
+
+ copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size);
+
+ printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
+ " [mem %#010llx-%#010llx]\n",
+ ramdisk_image, ramdisk_image + ramdisk_size - 1,
+ relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
+}
+
+static void __init early_reserve_initrd(void)
+{
+ /* Assume only end is not page aligned */
+ u64 ramdisk_image = get_ramdisk_image();
+ u64 ramdisk_size = get_ramdisk_size();
+ u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
+
+ if (!boot_params.hdr.type_of_loader ||
+ !ramdisk_image || !ramdisk_size)
+ return; /* No initrd provided by bootloader */
+
+ memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
+}
+static void __init reserve_initrd(void)
+{
+ /* Assume only end is not page aligned */
+ u64 ramdisk_image = get_ramdisk_image();
+ u64 ramdisk_size = get_ramdisk_size();
+ u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
+ u64 mapped_size;
+
+ if (!boot_params.hdr.type_of_loader ||
+ !ramdisk_image || !ramdisk_size)
+ return; /* No initrd provided by bootloader */
+
+ initrd_start = 0;
+
+ mapped_size = memblock_mem_size(max_pfn_mapped);
+ if (ramdisk_size >= (mapped_size>>1))
+ panic("initrd too large to handle, "
+ "disabling initrd (%lld needed, %lld available)\n",
+ ramdisk_size, mapped_size>>1);
+
+ printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
+ ramdisk_end - 1);
+
+ if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image),
+ PFN_DOWN(ramdisk_end))) {
+ /* All are mapped, easy case */
+ initrd_start = ramdisk_image + PAGE_OFFSET;
+ initrd_end = initrd_start + ramdisk_size;
+ return;
+ }
+
+ relocate_initrd();
+
+ memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
+}
+
+#else
+static void __init early_reserve_initrd(void)
+{
+}
+static void __init reserve_initrd(void)
+{
+}
+#endif /* CONFIG_BLK_DEV_INITRD */
+
+static void __init parse_setup_data(void)
+{
+ struct setup_data *data;
+ u64 pa_data, pa_next;
+
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ u32 data_len, data_type;
+
+ data = early_memremap(pa_data, sizeof(*data));
+ data_len = data->len + sizeof(struct setup_data);
+ data_type = data->type;
+ pa_next = data->next;
+ early_memunmap(data, sizeof(*data));
+
+ switch (data_type) {
+ case SETUP_E820_EXT:
+ e820__memory_setup_extended(pa_data, data_len);
+ break;
+ case SETUP_DTB:
+ add_dtb(pa_data);
+ break;
+ case SETUP_EFI:
+ parse_efi_setup(pa_data, data_len);
+ break;
+ default:
+ break;
+ }
+ pa_data = pa_next;
+ }
+}
+
+static void __init memblock_x86_reserve_range_setup_data(void)
+{
+ struct setup_data *data;
+ u64 pa_data;
+
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_memremap(pa_data, sizeof(*data));
+ memblock_reserve(pa_data, sizeof(*data) + data->len);
+ pa_data = data->next;
+ early_memunmap(data, sizeof(*data));
+ }
+}
+
+/*
+ * --------- Crashkernel reservation ------------------------------
+ */
+
+#ifdef CONFIG_KEXEC_CORE
+
+/* 16M alignment for crash kernel regions */
+#define CRASH_ALIGN (16 << 20)
+
+/*
+ * Keep the crash kernel below this limit. On 32 bits earlier kernels
+ * would limit the kernel to the low 512 MiB due to mapping restrictions.
+ * On 64bit, old kexec-tools need to under 896MiB.
+ */
+#ifdef CONFIG_X86_32
+# define CRASH_ADDR_LOW_MAX (512 << 20)
+# define CRASH_ADDR_HIGH_MAX (512 << 20)
+#else
+# define CRASH_ADDR_LOW_MAX (896UL << 20)
+# define CRASH_ADDR_HIGH_MAX MAXMEM
+#endif
+
+static int __init reserve_crashkernel_low(void)
+{
+#ifdef CONFIG_X86_64
+ unsigned long long base, low_base = 0, low_size = 0;
+ unsigned long total_low_mem;
+ int ret;
+
+ total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT));
+
+ /* crashkernel=Y,low */
+ ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base);
+ if (ret) {
+ /*
+ * two parts from lib/swiotlb.c:
+ * -swiotlb size: user-specified with swiotlb= or default.
+ *
+ * -swiotlb overflow buffer: now hardcoded to 32k. We round it
+ * to 8M for other buffers that may need to stay low too. Also
+ * make sure we allocate enough extra low memory so that we
+ * don't run out of DMA buffers for 32-bit devices.
+ */
+ low_size = max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20);
+ } else {
+ /* passed with crashkernel=0,low ? */
+ if (!low_size)
+ return 0;
+ }
+
+ low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN);
+ if (!low_base) {
+ pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n",
+ (unsigned long)(low_size >> 20));
+ return -ENOMEM;
+ }
+
+ ret = memblock_reserve(low_base, low_size);
+ if (ret) {
+ pr_err("%s: Error reserving crashkernel low memblock.\n", __func__);
+ return ret;
+ }
+
+ pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n",
+ (unsigned long)(low_size >> 20),
+ (unsigned long)(low_base >> 20),
+ (unsigned long)(total_low_mem >> 20));
+
+ crashk_low_res.start = low_base;
+ crashk_low_res.end = low_base + low_size - 1;
+ insert_resource(&iomem_resource, &crashk_low_res);
+#endif
+ return 0;
+}
+
+static void __init reserve_crashkernel(void)
+{
+ unsigned long long crash_size, crash_base, total_mem;
+ bool high = false;
+ int ret;
+
+ total_mem = memblock_phys_mem_size();
+
+ /* crashkernel=XM */
+ ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base);
+ if (ret != 0 || crash_size <= 0) {
+ /* crashkernel=X,high */
+ ret = parse_crashkernel_high(boot_command_line, total_mem,
+ &crash_size, &crash_base);
+ if (ret != 0 || crash_size <= 0)
+ return;
+ high = true;
+ }
+
+ if (xen_pv_domain()) {
+ pr_info("Ignoring crashkernel for a Xen PV domain\n");
+ return;
+ }
+
+ /* 0 means: find the address automatically */
+ if (crash_base <= 0) {
+ /*
+ * Set CRASH_ADDR_LOW_MAX upper bound for crash memory,
+ * as old kexec-tools loads bzImage below that, unless
+ * "crashkernel=size[KMG],high" is specified.
+ */
+ crash_base = memblock_find_in_range(CRASH_ALIGN,
+ high ? CRASH_ADDR_HIGH_MAX
+ : CRASH_ADDR_LOW_MAX,
+ crash_size, CRASH_ALIGN);
+ if (!crash_base) {
+ pr_info("crashkernel reservation failed - No suitable area found.\n");
+ return;
+ }
+
+ } else {
+ unsigned long long start;
+
+ start = memblock_find_in_range(crash_base,
+ crash_base + crash_size,
+ crash_size, 1 << 20);
+ if (start != crash_base) {
+ pr_info("crashkernel reservation failed - memory is in use.\n");
+ return;
+ }
+ }
+ ret = memblock_reserve(crash_base, crash_size);
+ if (ret) {
+ pr_err("%s: Error reserving crashkernel memblock.\n", __func__);
+ return;
+ }
+
+ if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) {
+ memblock_free(crash_base, crash_size);
+ return;
+ }
+
+ pr_info("Reserving %ldMB of memory at %ldMB for crashkernel (System RAM: %ldMB)\n",
+ (unsigned long)(crash_size >> 20),
+ (unsigned long)(crash_base >> 20),
+ (unsigned long)(total_mem >> 20));
+
+ crashk_res.start = crash_base;
+ crashk_res.end = crash_base + crash_size - 1;
+ insert_resource(&iomem_resource, &crashk_res);
+}
+#else
+static void __init reserve_crashkernel(void)
+{
+}
+#endif
+
+static struct resource standard_io_resources[] = {
+ { .name = "dma1", .start = 0x00, .end = 0x1f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "pic1", .start = 0x20, .end = 0x21,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "timer0", .start = 0x40, .end = 0x43,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "timer1", .start = 0x50, .end = 0x53,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "keyboard", .start = 0x60, .end = 0x60,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "keyboard", .start = 0x64, .end = 0x64,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "dma page reg", .start = 0x80, .end = 0x8f,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "pic2", .start = 0xa0, .end = 0xa1,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "dma2", .start = 0xc0, .end = 0xdf,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+ { .name = "fpu", .start = 0xf0, .end = 0xff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_IO }
+};
+
+void __init reserve_standard_io_resources(void)
+{
+ int i;
+
+ /* request I/O space for devices used on all i[345]86 PCs */
+ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+ request_resource(&ioport_resource, &standard_io_resources[i]);
+
+}
+
+static __init void reserve_ibft_region(void)
+{
+ unsigned long addr, size = 0;
+
+ addr = find_ibft_region(&size);
+
+ if (size)
+ memblock_reserve(addr, size);
+}
+
+static bool __init snb_gfx_workaround_needed(void)
+{
+#ifdef CONFIG_PCI
+ int i;
+ u16 vendor, devid;
+ static const __initconst u16 snb_ids[] = {
+ 0x0102,
+ 0x0112,
+ 0x0122,
+ 0x0106,
+ 0x0116,
+ 0x0126,
+ 0x010a,
+ };
+
+ /* Assume no if something weird is going on with PCI */
+ if (!early_pci_allowed())
+ return false;
+
+ vendor = read_pci_config_16(0, 2, 0, PCI_VENDOR_ID);
+ if (vendor != 0x8086)
+ return false;
+
+ devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID);
+ for (i = 0; i < ARRAY_SIZE(snb_ids); i++)
+ if (devid == snb_ids[i])
+ return true;
+#endif
+
+ return false;
+}
+
+/*
+ * Sandy Bridge graphics has trouble with certain ranges, exclude
+ * them from allocation.
+ */
+static void __init trim_snb_memory(void)
+{
+ static const __initconst unsigned long bad_pages[] = {
+ 0x20050000,
+ 0x20110000,
+ 0x20130000,
+ 0x20138000,
+ 0x40004000,
+ };
+ int i;
+
+ if (!snb_gfx_workaround_needed())
+ return;
+
+ printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n");
+
+ /*
+ * Reserve all memory below the 1 MB mark that has not
+ * already been reserved.
+ */
+ memblock_reserve(0, 1<<20);
+
+ for (i = 0; i < ARRAY_SIZE(bad_pages); i++) {
+ if (memblock_reserve(bad_pages[i], PAGE_SIZE))
+ printk(KERN_WARNING "failed to reserve 0x%08lx\n",
+ bad_pages[i]);
+ }
+}
+
+/*
+ * Here we put platform-specific memory range workarounds, i.e.
+ * memory known to be corrupt or otherwise in need to be reserved on
+ * specific platforms.
+ *
+ * If this gets used more widely it could use a real dispatch mechanism.
+ */
+static void __init trim_platform_memory_ranges(void)
+{
+ trim_snb_memory();
+}
+
+static void __init trim_bios_range(void)
+{
+ /*
+ * A special case is the first 4Kb of memory;
+ * This is a BIOS owned area, not kernel ram, but generally
+ * not listed as such in the E820 table.
+ *
+ * This typically reserves additional memory (64KiB by default)
+ * since some BIOSes are known to corrupt low memory. See the
+ * Kconfig help text for X86_RESERVE_LOW.
+ */
+ e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
+
+ /*
+ * special case: Some BIOSen report the PC BIOS
+ * area (640->1Mb) as ram even though it is not.
+ * take them out.
+ */
+ e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1);
+
+ e820__update_table(e820_table);
+}
+
+/* called before trim_bios_range() to spare extra sanitize */
+static void __init e820_add_kernel_range(void)
+{
+ u64 start = __pa_symbol(_text);
+ u64 size = __pa_symbol(_end) - start;
+
+ /*
+ * Complain if .text .data and .bss are not marked as E820_TYPE_RAM and
+ * attempt to fix it by adding the range. We may have a confused BIOS,
+ * or the user may have used memmap=exactmap or memmap=xxM$yyM to
+ * exclude kernel range. If we really are running on top non-RAM,
+ * we will crash later anyways.
+ */
+ if (e820__mapped_all(start, start + size, E820_TYPE_RAM))
+ return;
+
+ pr_warn(".text .data .bss are not marked as E820_TYPE_RAM!\n");
+ e820__range_remove(start, size, E820_TYPE_RAM, 0);
+ e820__range_add(start, size, E820_TYPE_RAM);
+}
+
+static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
+
+static int __init parse_reservelow(char *p)
+{
+ unsigned long long size;
+
+ if (!p)
+ return -EINVAL;
+
+ size = memparse(p, &p);
+
+ if (size < 4096)
+ size = 4096;
+
+ if (size > 640*1024)
+ size = 640*1024;
+
+ reserve_low = size;
+
+ return 0;
+}
+
+early_param("reservelow", parse_reservelow);
+
+static void __init trim_low_memory_range(void)
+{
+ memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
+}
+
+/*
+ * Dump out kernel offset information on panic.
+ */
+static int
+dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
+{
+ if (kaslr_enabled()) {
+ pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
+ kaslr_offset(),
+ __START_KERNEL,
+ __START_KERNEL_map,
+ MODULES_VADDR-1);
+ } else {
+ pr_emerg("Kernel Offset: disabled\n");
+ }
+
+ return 0;
+}
+
+/*
+ * Determine if we were loaded by an EFI loader. If so, then we have also been
+ * passed the efi memmap, systab, etc., so we should use these data structures
+ * for initialization. Note, the efi init code path is determined by the
+ * global efi_enabled. This allows the same kernel image to be used on existing
+ * systems (with a traditional BIOS) as well as on EFI systems.
+ */
+/*
+ * setup_arch - architecture-specific boot-time initializations
+ *
+ * Note: On x86_64, fixmaps are ready for use even before this is called.
+ */
+
+void __init setup_arch(char **cmdline_p)
+{
+ memblock_reserve(__pa_symbol(_text),
+ (unsigned long)__bss_stop - (unsigned long)_text);
+
+ /*
+ * Make sure page 0 is always reserved because on systems with
+ * L1TF its contents can be leaked to user processes.
+ */
+ memblock_reserve(0, PAGE_SIZE);
+
+ early_reserve_initrd();
+
+ /*
+ * At this point everything still needed from the boot loader
+ * or BIOS or kernel text should be early reserved or marked not
+ * RAM in e820. All other memory is free game.
+ */
+
+#ifdef CONFIG_X86_32
+ memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
+
+ /*
+ * copy kernel address range established so far and switch
+ * to the proper swapper page table
+ */
+ clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ initial_page_table + KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+
+ load_cr3(swapper_pg_dir);
+ /*
+ * Note: Quark X1000 CPUs advertise PGE incorrectly and require
+ * a cr3 based tlb flush, so the following __flush_tlb_all()
+ * will not flush anything because the cpu quirk which clears
+ * X86_FEATURE_PGE has not been invoked yet. Though due to the
+ * load_cr3() above the TLB has been flushed already. The
+ * quirk is invoked before subsequent calls to __flush_tlb_all()
+ * so proper operation is guaranteed.
+ */
+ __flush_tlb_all();
+#else
+ printk(KERN_INFO "Command line: %s\n", boot_command_line);
+ boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS;
+#endif
+
+ /*
+ * If we have OLPC OFW, we might end up relocating the fixmap due to
+ * reserve_top(), so do this before touching the ioremap area.
+ */
+ olpc_ofw_detect();
+
+ idt_setup_early_traps();
+ early_cpu_init();
+ arch_init_ideal_nops();
+ jump_label_init();
+ early_ioremap_init();
+
+ setup_olpc_ofw_pgd();
+
+ ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
+ screen_info = boot_params.screen_info;
+ edid_info = boot_params.edid_info;
+#ifdef CONFIG_X86_32
+ apm_info.bios = boot_params.apm_bios_info;
+ ist_info = boot_params.ist_info;
+#endif
+ saved_video_mode = boot_params.hdr.vid_mode;
+ bootloader_type = boot_params.hdr.type_of_loader;
+ if ((bootloader_type >> 4) == 0xe) {
+ bootloader_type &= 0xf;
+ bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
+ }
+ bootloader_version = bootloader_type & 0xf;
+ bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
+
+#ifdef CONFIG_BLK_DEV_RAM
+ rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
+ rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
+ rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
+#endif
+#ifdef CONFIG_EFI
+ if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+ EFI32_LOADER_SIGNATURE, 4)) {
+ set_bit(EFI_BOOT, &efi.flags);
+ } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+ EFI64_LOADER_SIGNATURE, 4)) {
+ set_bit(EFI_BOOT, &efi.flags);
+ set_bit(EFI_64BIT, &efi.flags);
+ }
+#endif
+
+ x86_init.oem.arch_setup();
+
+ iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
+ e820__memory_setup();
+ parse_setup_data();
+
+ copy_edd();
+
+ if (!boot_params.hdr.root_flags)
+ root_mountflags &= ~MS_RDONLY;
+ init_mm.start_code = (unsigned long) _text;
+ init_mm.end_code = (unsigned long) _etext;
+ init_mm.end_data = (unsigned long) _edata;
+ init_mm.brk = _brk_end;
+
+ mpx_mm_init(&init_mm);
+
+ code_resource.start = __pa_symbol(_text);
+ code_resource.end = __pa_symbol(_etext)-1;
+ data_resource.start = __pa_symbol(_etext);
+ data_resource.end = __pa_symbol(_edata)-1;
+ bss_resource.start = __pa_symbol(__bss_start);
+ bss_resource.end = __pa_symbol(__bss_stop)-1;
+
+#ifdef CONFIG_CMDLINE_BOOL
+#ifdef CONFIG_CMDLINE_OVERRIDE
+ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+#else
+ if (builtin_cmdline[0]) {
+ /* append boot loader cmdline to builtin */
+ strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
+ strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
+ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+ }
+#endif
+#endif
+
+ strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
+ /*
+ * x86_configure_nx() is called before parse_early_param() to detect
+ * whether hardware doesn't support NX (so that the early EHCI debug
+ * console setup can safely call set_fixmap()). It may then be called
+ * again from within noexec_setup() during parsing early parameters
+ * to honor the respective command line option.
+ */
+ x86_configure_nx();
+
+ parse_early_param();
+
+ if (efi_enabled(EFI_BOOT))
+ efi_memblock_x86_reserve_range();
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /*
+ * Memory used by the kernel cannot be hot-removed because Linux
+ * cannot migrate the kernel pages. When memory hotplug is
+ * enabled, we should prevent memblock from allocating memory
+ * for the kernel.
+ *
+ * ACPI SRAT records all hotpluggable memory ranges. But before
+ * SRAT is parsed, we don't know about it.
+ *
+ * The kernel image is loaded into memory at very early time. We
+ * cannot prevent this anyway. So on NUMA system, we set any
+ * node the kernel resides in as un-hotpluggable.
+ *
+ * Since on modern servers, one node could have double-digit
+ * gigabytes memory, we can assume the memory around the kernel
+ * image is also un-hotpluggable. So before SRAT is parsed, just
+ * allocate memory near the kernel image to try the best to keep
+ * the kernel away from hotpluggable memory.
+ */
+ if (movable_node_is_enabled())
+ memblock_set_bottom_up(true);
+#endif
+
+ x86_report_nx();
+
+ /* after early param, so could get panic from serial */
+ memblock_x86_reserve_range_setup_data();
+
+ if (acpi_mps_check()) {
+#ifdef CONFIG_X86_LOCAL_APIC
+ disable_apic = 1;
+#endif
+ setup_clear_cpu_cap(X86_FEATURE_APIC);
+ }
+
+ e820__reserve_setup_data();
+ e820__finish_early_params();
+
+ if (efi_enabled(EFI_BOOT))
+ efi_init();
+
+ dmi_scan_machine();
+ dmi_memdev_walk();
+ dmi_set_dump_stack_arch_desc();
+
+ /*
+ * VMware detection requires dmi to be available, so this
+ * needs to be done after dmi_scan_machine(), for the boot CPU.
+ */
+ init_hypervisor_platform();
+
+ tsc_early_init();
+ x86_init.resources.probe_roms();
+
+ /* after parse_early_param, so could debug it */
+ insert_resource(&iomem_resource, &code_resource);
+ insert_resource(&iomem_resource, &data_resource);
+ insert_resource(&iomem_resource, &bss_resource);
+
+ e820_add_kernel_range();
+ trim_bios_range();
+#ifdef CONFIG_X86_32
+ if (ppro_with_ram_bug()) {
+ e820__range_update(0x70000000ULL, 0x40000ULL, E820_TYPE_RAM,
+ E820_TYPE_RESERVED);
+ e820__update_table(e820_table);
+ printk(KERN_INFO "fixed physical RAM map:\n");
+ e820__print_table("bad_ppro");
+ }
+#else
+ early_gart_iommu_check();
+#endif
+
+ /*
+ * partially used pages are not usable - thus
+ * we are rounding upwards:
+ */
+ max_pfn = e820__end_of_ram_pfn();
+
+ /* update e820 for memory not covered by WB MTRRs */
+ mtrr_bp_init();
+ if (mtrr_trim_uncached_memory(max_pfn))
+ max_pfn = e820__end_of_ram_pfn();
+
+ max_possible_pfn = max_pfn;
+
+ /*
+ * This call is required when the CPU does not support PAT. If
+ * mtrr_bp_init() invoked it already via pat_init() the call has no
+ * effect.
+ */
+ init_cache_modes();
+
+ /*
+ * Define random base addresses for memory sections after max_pfn is
+ * defined and before each memory section base is used.
+ */
+ kernel_randomize_memory();
+
+#ifdef CONFIG_X86_32
+ /* max_low_pfn get updated here */
+ find_low_pfn_range();
+#else
+ check_x2apic();
+
+ /* How many end-of-memory variables you have, grandma! */
+ /* need this before calling reserve_initrd */
+ if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
+ max_low_pfn = e820__end_of_low_ram_pfn();
+ else
+ max_low_pfn = max_pfn;
+
+ high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+#endif
+
+ /*
+ * Find and reserve possible boot-time SMP configuration:
+ */
+ find_smp_config();
+
+ reserve_ibft_region();
+
+ early_alloc_pgt_buf();
+
+ /*
+ * Need to conclude brk, before e820__memblock_setup()
+ * it could use memblock_find_in_range, could overlap with
+ * brk area.
+ */
+ reserve_brk();
+
+ cleanup_highmap();
+
+ memblock_set_current_limit(ISA_END_ADDRESS);
+ e820__memblock_setup();
+
+ reserve_bios_regions();
+
+ if (efi_enabled(EFI_MEMMAP)) {
+ efi_fake_memmap();
+ efi_find_mirror();
+ efi_esrt_init();
+
+ /*
+ * The EFI specification says that boot service code won't be
+ * called after ExitBootServices(). This is, in fact, a lie.
+ */
+ efi_reserve_boot_services();
+ }
+
+ /* preallocate 4k for mptable mpc */
+ e820__memblock_alloc_reserved_mpc_new();
+
+#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
+ setup_bios_corruption_check();
+#endif
+
+#ifdef CONFIG_X86_32
+ printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
+ (max_pfn_mapped<<PAGE_SHIFT) - 1);
+#endif
+
+ reserve_real_mode();
+
+ trim_platform_memory_ranges();
+ trim_low_memory_range();
+
+ init_mem_mapping();
+
+ idt_setup_early_pf();
+
+ /*
+ * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
+ * with the current CR4 value. This may not be necessary, but
+ * auditing all the early-boot CR4 manipulation would be needed to
+ * rule it out.
+ *
+ * Mask off features that don't work outside long mode (just
+ * PCIDE for now).
+ */
+ mmu_cr4_features = __read_cr4() & ~X86_CR4_PCIDE;
+
+ memblock_set_current_limit(get_max_mapped());
+
+ /*
+ * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
+ */
+
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ if (init_ohci1394_dma_early)
+ init_ohci1394_dma_on_all_controllers();
+#endif
+ /* Allocate bigger log buffer */
+ setup_log_buf(1);
+
+ if (efi_enabled(EFI_BOOT)) {
+ switch (boot_params.secure_boot) {
+ case efi_secureboot_mode_disabled:
+ pr_info("Secure boot disabled\n");
+ break;
+ case efi_secureboot_mode_enabled:
+ pr_info("Secure boot enabled\n");
+ break;
+ default:
+ pr_info("Secure boot could not be determined\n");
+ break;
+ }
+ }
+
+ reserve_initrd();
+
+ acpi_table_upgrade();
+ /* Look for ACPI tables and reserve memory occupied by them. */
+ acpi_boot_table_init();
+
+ vsmp_init();
+
+ io_delay_init();
+
+ early_platform_quirks();
+
+ early_acpi_boot_init();
+
+ initmem_init();
+ dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
+
+ /*
+ * Reserve memory for crash kernel after SRAT is parsed so that it
+ * won't consume hotpluggable memory.
+ */
+ reserve_crashkernel();
+
+ memblock_find_dma_reserve();
+
+ if (!early_xdbc_setup_hardware())
+ early_xdbc_register_console();
+
+ x86_init.paging.pagetable_init();
+
+ kasan_init();
+
+ /*
+ * Sync back kernel address range.
+ *
+ * FIXME: Can the later sync in setup_cpu_entry_areas() replace
+ * this call?
+ */
+ sync_initial_page_table();
+
+ tboot_probe();
+
+ map_vsyscall();
+
+ generic_apic_probe();
+
+ early_quirks();
+
+ /*
+ * Read APIC and some other early information from ACPI tables.
+ */
+ acpi_boot_init();
+ sfi_init();
+ x86_dtb_init();
+
+ /*
+ * get boot-time SMP configuration:
+ */
+ get_smp_config();
+
+ /*
+ * Systems w/o ACPI and mptables might not have it mapped the local
+ * APIC yet, but prefill_possible_map() might need to access it.
+ */
+ init_apic_mappings();
+
+ prefill_possible_map();
+
+ init_cpu_to_node();
+
+ io_apic_init_mappings();
+
+ x86_init.hyper.guest_late_init();
+
+ e820__reserve_resources();
+ e820__register_nosave_regions(max_pfn);
+
+ x86_init.resources.reserve_resources();
+
+ e820__setup_pci_gap();
+
+#ifdef CONFIG_VT
+#if defined(CONFIG_VGA_CONSOLE)
+ if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
+ conswitchp = &vga_con;
+#elif defined(CONFIG_DUMMY_CONSOLE)
+ conswitchp = &dummy_con;
+#endif
+#endif
+ x86_init.oem.banner();
+
+ x86_init.timers.wallclock_init();
+
+ mcheck_init();
+
+ register_refined_jiffies(CLOCK_TICK_RATE);
+
+#ifdef CONFIG_EFI
+ if (efi_enabled(EFI_BOOT))
+ efi_apply_memmap_quirks();
+#endif
+
+ unwind_init();
+}
+
+#ifdef CONFIG_X86_32
+
+static struct resource video_ram_resource = {
+ .name = "Video RAM area",
+ .start = 0xa0000,
+ .end = 0xbffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+void __init i386_reserve_resources(void)
+{
+ request_resource(&iomem_resource, &video_ram_resource);
+ reserve_standard_io_resources();
+}
+
+#endif /* CONFIG_X86_32 */
+
+static struct notifier_block kernel_offset_notifier = {
+ .notifier_call = dump_kernel_offset
+};
+
+static int __init register_kernel_offset_dumper(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &kernel_offset_notifier);
+ return 0;
+}
+__initcall(register_kernel_offset_dumper);
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
new file mode 100644
index 000000000..ea554f812
--- /dev/null
+++ b/arch/x86/kernel/setup_percpu.c
@@ -0,0 +1,301 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/percpu.h>
+#include <linux/kexec.h>
+#include <linux/crash_dump.h>
+#include <linux/smp.h>
+#include <linux/topology.h>
+#include <linux/pfn.h>
+#include <asm/sections.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/highmem.h>
+#include <asm/proto.h>
+#include <asm/cpumask.h>
+#include <asm/cpu.h>
+#include <asm/stackprotector.h>
+
+DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
+EXPORT_PER_CPU_SYMBOL(cpu_number);
+
+#ifdef CONFIG_X86_64
+#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
+#else
+#define BOOT_PERCPU_OFFSET 0
+#endif
+
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
+EXPORT_PER_CPU_SYMBOL(this_cpu_off);
+
+unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = {
+ [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
+};
+EXPORT_SYMBOL(__per_cpu_offset);
+
+/*
+ * On x86_64 symbols referenced from code should be reachable using
+ * 32bit relocations. Reserve space for static percpu variables in
+ * modules so that they are always served from the first chunk which
+ * is located at the percpu segment base. On x86_32, anything can
+ * address anywhere. No need to reserve space in the first chunk.
+ */
+#ifdef CONFIG_X86_64
+#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE
+#else
+#define PERCPU_FIRST_CHUNK_RESERVE 0
+#endif
+
+#ifdef CONFIG_X86_32
+/**
+ * pcpu_need_numa - determine percpu allocation needs to consider NUMA
+ *
+ * If NUMA is not configured or there is only one NUMA node available,
+ * there is no reason to consider NUMA. This function determines
+ * whether percpu allocation should consider NUMA or not.
+ *
+ * RETURNS:
+ * true if NUMA should be considered; otherwise, false.
+ */
+static bool __init pcpu_need_numa(void)
+{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ pg_data_t *last = NULL;
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ int node = early_cpu_to_node(cpu);
+
+ if (node_online(node) && NODE_DATA(node) &&
+ last && last != NODE_DATA(node))
+ return true;
+
+ last = NODE_DATA(node);
+ }
+#endif
+ return false;
+}
+#endif
+
+/**
+ * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
+ * @cpu: cpu to allocate for
+ * @size: size allocation in bytes
+ * @align: alignment
+ *
+ * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
+ * does the right thing for NUMA regardless of the current
+ * configuration.
+ *
+ * RETURNS:
+ * Pointer to the allocated area on success, NULL on failure.
+ */
+static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
+ unsigned long align)
+{
+ const unsigned long goal = __pa(MAX_DMA_ADDRESS);
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ int node = early_cpu_to_node(cpu);
+ void *ptr;
+
+ if (!node_online(node) || !NODE_DATA(node)) {
+ ptr = __alloc_bootmem_nopanic(size, align, goal);
+ pr_info("cpu %d has no node %d or node-local memory\n",
+ cpu, node);
+ pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
+ cpu, size, __pa(ptr));
+ } else {
+ ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
+ size, align, goal);
+ pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
+ cpu, size, node, __pa(ptr));
+ }
+ return ptr;
+#else
+ return __alloc_bootmem_nopanic(size, align, goal);
+#endif
+}
+
+/*
+ * Helpers for first chunk memory allocation
+ */
+static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
+{
+ return pcpu_alloc_bootmem(cpu, size, align);
+}
+
+static void __init pcpu_fc_free(void *ptr, size_t size)
+{
+ free_bootmem(__pa(ptr), size);
+}
+
+static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
+{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+ if (early_cpu_to_node(from) == early_cpu_to_node(to))
+ return LOCAL_DISTANCE;
+ else
+ return REMOTE_DISTANCE;
+#else
+ return LOCAL_DISTANCE;
+#endif
+}
+
+static void __init pcpup_populate_pte(unsigned long addr)
+{
+ populate_extra_pte(addr);
+}
+
+static inline void setup_percpu_segment(int cpu)
+{
+#ifdef CONFIG_X86_32
+ struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu),
+ 0xFFFFF);
+
+ write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S);
+#endif
+}
+
+void __init setup_per_cpu_areas(void)
+{
+ unsigned int cpu;
+ unsigned long delta;
+ int rc;
+
+ pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%d\n",
+ NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
+
+ /*
+ * Allocate percpu area. Embedding allocator is our favorite;
+ * however, on NUMA configurations, it can result in very
+ * sparse unit mapping and vmalloc area isn't spacious enough
+ * on 32bit. Use page in that case.
+ */
+#ifdef CONFIG_X86_32
+ if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
+ pcpu_chosen_fc = PCPU_FC_PAGE;
+#endif
+ rc = -EINVAL;
+ if (pcpu_chosen_fc != PCPU_FC_PAGE) {
+ const size_t dyn_size = PERCPU_MODULE_RESERVE +
+ PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
+ size_t atom_size;
+
+ /*
+ * On 64bit, use PMD_SIZE for atom_size so that embedded
+ * percpu areas are aligned to PMD. This, in the future,
+ * can also allow using PMD mappings in vmalloc area. Use
+ * PAGE_SIZE on 32bit as vmalloc space is highly contended
+ * and large vmalloc area allocs can easily fail.
+ */
+#ifdef CONFIG_X86_64
+ atom_size = PMD_SIZE;
+#else
+ atom_size = PAGE_SIZE;
+#endif
+ rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
+ dyn_size, atom_size,
+ pcpu_cpu_distance,
+ pcpu_fc_alloc, pcpu_fc_free);
+ if (rc < 0)
+ pr_warning("%s allocator failed (%d), falling back to page size\n",
+ pcpu_fc_names[pcpu_chosen_fc], rc);
+ }
+ if (rc < 0)
+ rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
+ pcpu_fc_alloc, pcpu_fc_free,
+ pcpup_populate_pte);
+ if (rc < 0)
+ panic("cannot initialize percpu area (err=%d)", rc);
+
+ /* alrighty, percpu areas up and running */
+ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
+ for_each_possible_cpu(cpu) {
+ per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
+ per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
+ per_cpu(cpu_number, cpu) = cpu;
+ setup_percpu_segment(cpu);
+ setup_stack_canary_segment(cpu);
+ /*
+ * Copy data used in early init routines from the
+ * initial arrays to the per cpu data areas. These
+ * arrays then become expendable and the *_early_ptr's
+ * are zeroed indicating that the static arrays are
+ * gone.
+ */
+#ifdef CONFIG_X86_LOCAL_APIC
+ per_cpu(x86_cpu_to_apicid, cpu) =
+ early_per_cpu_map(x86_cpu_to_apicid, cpu);
+ per_cpu(x86_bios_cpu_apicid, cpu) =
+ early_per_cpu_map(x86_bios_cpu_apicid, cpu);
+ per_cpu(x86_cpu_to_acpiid, cpu) =
+ early_per_cpu_map(x86_cpu_to_acpiid, cpu);
+#endif
+#ifdef CONFIG_X86_32
+ per_cpu(x86_cpu_to_logical_apicid, cpu) =
+ early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
+#endif
+#ifdef CONFIG_X86_64
+ per_cpu(irq_stack_ptr, cpu) =
+ per_cpu(irq_stack_union.irq_stack, cpu) +
+ IRQ_STACK_SIZE;
+#endif
+#ifdef CONFIG_NUMA
+ per_cpu(x86_cpu_to_node_map, cpu) =
+ early_per_cpu_map(x86_cpu_to_node_map, cpu);
+ /*
+ * Ensure that the boot cpu numa_node is correct when the boot
+ * cpu is on a node that doesn't have memory installed.
+ * Also cpu_up() will call cpu_to_node() for APs when
+ * MEMORY_HOTPLUG is defined, before per_cpu(numa_node) is set
+ * up later with c_init aka intel_init/amd_init.
+ * So set them all (boot cpu and all APs).
+ */
+ set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
+#endif
+ /*
+ * Up to this point, the boot CPU has been using .init.data
+ * area. Reload any changed state for the boot CPU.
+ */
+ if (!cpu)
+ switch_to_new_gdt(cpu);
+ }
+
+ /* indicate the early static arrays will soon be gone */
+#ifdef CONFIG_X86_LOCAL_APIC
+ early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
+ early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
+ early_per_cpu_ptr(x86_cpu_to_acpiid) = NULL;
+#endif
+#ifdef CONFIG_X86_32
+ early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL;
+#endif
+#ifdef CONFIG_NUMA
+ early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
+#endif
+
+ /* Setup node to cpumask map */
+ setup_node_to_cpumask_map();
+
+ /* Setup cpu initialized, callin, callout masks */
+ setup_cpu_local_masks();
+
+ /*
+ * Sync back kernel address range again. We already did this in
+ * setup_arch(), but percpu data also needs to be available in
+ * the smpboot asm. We can't reliably pick up percpu mappings
+ * using vmalloc_fault(), because exception dispatch needs
+ * percpu data.
+ *
+ * FIXME: Can the later sync in setup_cpu_entry_areas() replace
+ * this call?
+ */
+ sync_initial_page_table();
+}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
new file mode 100644
index 000000000..16e908000
--- /dev/null
+++ b/arch/x86/kernel/signal.c
@@ -0,0 +1,878 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
+ *
+ * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
+ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
+ * 2000-2002 x86-64 support by Andi Kleen
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/wait.h>
+#include <linux/tracehook.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/personality.h>
+#include <linux/uaccess.h>
+#include <linux/user-return-notifier.h>
+#include <linux/uprobes.h>
+#include <linux/context_tracking.h>
+#include <linux/syscalls.h>
+
+#include <asm/processor.h>
+#include <asm/ucontext.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/vdso.h>
+#include <asm/mce.h>
+#include <asm/sighandling.h>
+#include <asm/vm86.h>
+
+#ifdef CONFIG_X86_64
+#include <asm/proto.h>
+#include <asm/ia32_unistd.h>
+#endif /* CONFIG_X86_64 */
+
+#include <asm/syscall.h>
+#include <asm/syscalls.h>
+
+#include <asm/sigframe.h>
+#include <asm/signal.h>
+
+#define COPY(x) do { \
+ get_user_ex(regs->x, &sc->x); \
+} while (0)
+
+#define GET_SEG(seg) ({ \
+ unsigned short tmp; \
+ get_user_ex(tmp, &sc->seg); \
+ tmp; \
+})
+
+#define COPY_SEG(seg) do { \
+ regs->seg = GET_SEG(seg); \
+} while (0)
+
+#define COPY_SEG_CPL3(seg) do { \
+ regs->seg = GET_SEG(seg) | 3; \
+} while (0)
+
+#ifdef CONFIG_X86_64
+/*
+ * If regs->ss will cause an IRET fault, change it. Otherwise leave it
+ * alone. Using this generally makes no sense unless
+ * user_64bit_mode(regs) would return true.
+ */
+static void force_valid_ss(struct pt_regs *regs)
+{
+ u32 ar;
+ asm volatile ("lar %[old_ss], %[ar]\n\t"
+ "jz 1f\n\t" /* If invalid: */
+ "xorl %[ar], %[ar]\n\t" /* set ar = 0 */
+ "1:"
+ : [ar] "=r" (ar)
+ : [old_ss] "rm" ((u16)regs->ss));
+
+ /*
+ * For a valid 64-bit user context, we need DPL 3, type
+ * read-write data or read-write exp-down data, and S and P
+ * set. We can't use VERW because VERW doesn't check the
+ * P bit.
+ */
+ ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK;
+ if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) &&
+ ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN))
+ regs->ss = __USER_DS;
+}
+#endif
+
+static int restore_sigcontext(struct pt_regs *regs,
+ struct sigcontext __user *sc,
+ unsigned long uc_flags)
+{
+ unsigned long buf_val;
+ void __user *buf;
+ unsigned int tmpflags;
+ unsigned int err = 0;
+
+ /* Always make any pending restarted system calls return -EINTR */
+ current->restart_block.fn = do_no_restart_syscall;
+
+ get_user_try {
+
+#ifdef CONFIG_X86_32
+ set_user_gs(regs, GET_SEG(gs));
+ COPY_SEG(fs);
+ COPY_SEG(es);
+ COPY_SEG(ds);
+#endif /* CONFIG_X86_32 */
+
+ COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
+ COPY(dx); COPY(cx); COPY(ip); COPY(ax);
+
+#ifdef CONFIG_X86_64
+ COPY(r8);
+ COPY(r9);
+ COPY(r10);
+ COPY(r11);
+ COPY(r12);
+ COPY(r13);
+ COPY(r14);
+ COPY(r15);
+#endif /* CONFIG_X86_64 */
+
+ COPY_SEG_CPL3(cs);
+ COPY_SEG_CPL3(ss);
+
+ get_user_ex(tmpflags, &sc->flags);
+ regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+ regs->orig_ax = -1; /* disable syscall checks */
+
+ get_user_ex(buf_val, &sc->fpstate);
+ buf = (void __user *)buf_val;
+ } get_user_catch(err);
+
+#ifdef CONFIG_X86_64
+ /*
+ * Fix up SS if needed for the benefit of old DOSEMU and
+ * CRIU.
+ */
+ if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs)))
+ force_valid_ss(regs);
+#endif
+
+ err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32));
+
+ force_iret();
+
+ return err;
+}
+
+int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
+ struct pt_regs *regs, unsigned long mask)
+{
+ int err = 0;
+
+ put_user_try {
+
+#ifdef CONFIG_X86_32
+ put_user_ex(get_user_gs(regs), (unsigned int __user *)&sc->gs);
+ put_user_ex(regs->fs, (unsigned int __user *)&sc->fs);
+ put_user_ex(regs->es, (unsigned int __user *)&sc->es);
+ put_user_ex(regs->ds, (unsigned int __user *)&sc->ds);
+#endif /* CONFIG_X86_32 */
+
+ put_user_ex(regs->di, &sc->di);
+ put_user_ex(regs->si, &sc->si);
+ put_user_ex(regs->bp, &sc->bp);
+ put_user_ex(regs->sp, &sc->sp);
+ put_user_ex(regs->bx, &sc->bx);
+ put_user_ex(regs->dx, &sc->dx);
+ put_user_ex(regs->cx, &sc->cx);
+ put_user_ex(regs->ax, &sc->ax);
+#ifdef CONFIG_X86_64
+ put_user_ex(regs->r8, &sc->r8);
+ put_user_ex(regs->r9, &sc->r9);
+ put_user_ex(regs->r10, &sc->r10);
+ put_user_ex(regs->r11, &sc->r11);
+ put_user_ex(regs->r12, &sc->r12);
+ put_user_ex(regs->r13, &sc->r13);
+ put_user_ex(regs->r14, &sc->r14);
+ put_user_ex(regs->r15, &sc->r15);
+#endif /* CONFIG_X86_64 */
+
+ put_user_ex(current->thread.trap_nr, &sc->trapno);
+ put_user_ex(current->thread.error_code, &sc->err);
+ put_user_ex(regs->ip, &sc->ip);
+#ifdef CONFIG_X86_32
+ put_user_ex(regs->cs, (unsigned int __user *)&sc->cs);
+ put_user_ex(regs->flags, &sc->flags);
+ put_user_ex(regs->sp, &sc->sp_at_signal);
+ put_user_ex(regs->ss, (unsigned int __user *)&sc->ss);
+#else /* !CONFIG_X86_32 */
+ put_user_ex(regs->flags, &sc->flags);
+ put_user_ex(regs->cs, &sc->cs);
+ put_user_ex(0, &sc->gs);
+ put_user_ex(0, &sc->fs);
+ put_user_ex(regs->ss, &sc->ss);
+#endif /* CONFIG_X86_32 */
+
+ put_user_ex(fpstate, &sc->fpstate);
+
+ /* non-iBCS2 extensions.. */
+ put_user_ex(mask, &sc->oldmask);
+ put_user_ex(current->thread.cr2, &sc->cr2);
+ } put_user_catch(err);
+
+ return err;
+}
+
+/*
+ * Set up a signal frame.
+ */
+
+/*
+ * Determine which stack to use..
+ */
+static unsigned long align_sigframe(unsigned long sp)
+{
+#ifdef CONFIG_X86_32
+ /*
+ * Align the stack pointer according to the i386 ABI,
+ * i.e. so that on function entry ((sp + 4) & 15) == 0.
+ */
+ sp = ((sp + 4) & -16ul) - 4;
+#else /* !CONFIG_X86_32 */
+ sp = round_down(sp, 16) - 8;
+#endif
+ return sp;
+}
+
+static void __user *
+get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
+ void __user **fpstate)
+{
+ /* Default to using normal stack */
+ unsigned long math_size = 0;
+ unsigned long sp = regs->sp;
+ unsigned long buf_fx = 0;
+ int onsigstack = on_sig_stack(sp);
+ struct fpu *fpu = &current->thread.fpu;
+
+ /* redzone */
+ if (IS_ENABLED(CONFIG_X86_64))
+ sp -= 128;
+
+ /* This is the X/Open sanctioned signal stack switching. */
+ if (ka->sa.sa_flags & SA_ONSTACK) {
+ if (sas_ss_flags(sp) == 0)
+ sp = current->sas_ss_sp + current->sas_ss_size;
+ } else if (IS_ENABLED(CONFIG_X86_32) &&
+ !onsigstack &&
+ regs->ss != __USER_DS &&
+ !(ka->sa.sa_flags & SA_RESTORER) &&
+ ka->sa.sa_restorer) {
+ /* This is the legacy signal stack switching. */
+ sp = (unsigned long) ka->sa.sa_restorer;
+ }
+
+ if (fpu->initialized) {
+ sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32),
+ &buf_fx, &math_size);
+ *fpstate = (void __user *)sp;
+ }
+
+ sp = align_sigframe(sp - frame_size);
+
+ /*
+ * If we are on the alternate signal stack and would overflow it, don't.
+ * Return an always-bogus address instead so we will die with SIGSEGV.
+ */
+ if (onsigstack && !likely(on_sig_stack(sp)))
+ return (void __user *)-1L;
+
+ /* save i387 and extended state */
+ if (fpu->initialized &&
+ copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size) < 0)
+ return (void __user *)-1L;
+
+ return (void __user *)sp;
+}
+
+#ifdef CONFIG_X86_32
+static const struct {
+ u16 poplmovl;
+ u32 val;
+ u16 int80;
+} __attribute__((packed)) retcode = {
+ 0xb858, /* popl %eax; movl $..., %eax */
+ __NR_sigreturn,
+ 0x80cd, /* int $0x80 */
+};
+
+static const struct {
+ u8 movl;
+ u32 val;
+ u16 int80;
+ u8 pad;
+} __attribute__((packed)) rt_retcode = {
+ 0xb8, /* movl $..., %eax */
+ __NR_rt_sigreturn,
+ 0x80cd, /* int $0x80 */
+ 0
+};
+
+static int
+__setup_frame(int sig, struct ksignal *ksig, sigset_t *set,
+ struct pt_regs *regs)
+{
+ struct sigframe __user *frame;
+ void __user *restorer;
+ int err = 0;
+ void __user *fpstate = NULL;
+
+ frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ return -EFAULT;
+
+ if (__put_user(sig, &frame->sig))
+ return -EFAULT;
+
+ if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
+ return -EFAULT;
+
+ if (_NSIG_WORDS > 1) {
+ if (__copy_to_user(&frame->extramask, &set->sig[1],
+ sizeof(frame->extramask)))
+ return -EFAULT;
+ }
+
+ if (current->mm->context.vdso)
+ restorer = current->mm->context.vdso +
+ vdso_image_32.sym___kernel_sigreturn;
+ else
+ restorer = &frame->retcode;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ restorer = ksig->ka.sa.sa_restorer;
+
+ /* Set up to return from userspace. */
+ err |= __put_user(restorer, &frame->pretcode);
+
+ /*
+ * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80
+ *
+ * WE DO NOT USE IT ANY MORE! It's only left here for historical
+ * reasons and because gdb uses it as a signature to notice
+ * signal handler stack frames.
+ */
+ err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode);
+
+ if (err)
+ return -EFAULT;
+
+ /* Set up registers for signal handler */
+ regs->sp = (unsigned long)frame;
+ regs->ip = (unsigned long)ksig->ka.sa.sa_handler;
+ regs->ax = (unsigned long)sig;
+ regs->dx = 0;
+ regs->cx = 0;
+
+ regs->ds = __USER_DS;
+ regs->es = __USER_DS;
+ regs->ss = __USER_DS;
+ regs->cs = __USER_CS;
+
+ return 0;
+}
+
+static int __setup_rt_frame(int sig, struct ksignal *ksig,
+ sigset_t *set, struct pt_regs *regs)
+{
+ struct rt_sigframe __user *frame;
+ void __user *restorer;
+ int err = 0;
+ void __user *fpstate = NULL;
+
+ frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ return -EFAULT;
+
+ put_user_try {
+ put_user_ex(sig, &frame->sig);
+ put_user_ex(&frame->info, &frame->pinfo);
+ put_user_ex(&frame->uc, &frame->puc);
+
+ /* Create the ucontext. */
+ if (boot_cpu_has(X86_FEATURE_XSAVE))
+ put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
+ else
+ put_user_ex(0, &frame->uc.uc_flags);
+ put_user_ex(0, &frame->uc.uc_link);
+ save_altstack_ex(&frame->uc.uc_stack, regs->sp);
+
+ /* Set up to return from userspace. */
+ restorer = current->mm->context.vdso +
+ vdso_image_32.sym___kernel_rt_sigreturn;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ restorer = ksig->ka.sa.sa_restorer;
+ put_user_ex(restorer, &frame->pretcode);
+
+ /*
+ * This is movl $__NR_rt_sigreturn, %ax ; int $0x80
+ *
+ * WE DO NOT USE IT ANY MORE! It's only left here for historical
+ * reasons and because gdb uses it as a signature to notice
+ * signal handler stack frames.
+ */
+ put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
+ } put_user_catch(err);
+
+ err |= copy_siginfo_to_user(&frame->info, &ksig->info);
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
+ regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
+ if (err)
+ return -EFAULT;
+
+ /* Set up registers for signal handler */
+ regs->sp = (unsigned long)frame;
+ regs->ip = (unsigned long)ksig->ka.sa.sa_handler;
+ regs->ax = (unsigned long)sig;
+ regs->dx = (unsigned long)&frame->info;
+ regs->cx = (unsigned long)&frame->uc;
+
+ regs->ds = __USER_DS;
+ regs->es = __USER_DS;
+ regs->ss = __USER_DS;
+ regs->cs = __USER_CS;
+
+ return 0;
+}
+#else /* !CONFIG_X86_32 */
+static unsigned long frame_uc_flags(struct pt_regs *regs)
+{
+ unsigned long flags;
+
+ if (boot_cpu_has(X86_FEATURE_XSAVE))
+ flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS;
+ else
+ flags = UC_SIGCONTEXT_SS;
+
+ if (likely(user_64bit_mode(regs)))
+ flags |= UC_STRICT_RESTORE_SS;
+
+ return flags;
+}
+
+static int __setup_rt_frame(int sig, struct ksignal *ksig,
+ sigset_t *set, struct pt_regs *regs)
+{
+ struct rt_sigframe __user *frame;
+ void __user *fp = NULL;
+ unsigned long uc_flags;
+ int err = 0;
+
+ frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp);
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ return -EFAULT;
+
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+ if (copy_siginfo_to_user(&frame->info, &ksig->info))
+ return -EFAULT;
+ }
+
+ uc_flags = frame_uc_flags(regs);
+
+ put_user_try {
+ /* Create the ucontext. */
+ put_user_ex(uc_flags, &frame->uc.uc_flags);
+ put_user_ex(0, &frame->uc.uc_link);
+ save_altstack_ex(&frame->uc.uc_stack, regs->sp);
+
+ /* Set up to return from userspace. If provided, use a stub
+ already in userspace. */
+ /* x86-64 should always use SA_RESTORER. */
+ if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+ put_user_ex(ksig->ka.sa.sa_restorer, &frame->pretcode);
+ } else {
+ /* could use a vstub here */
+ err |= -EFAULT;
+ }
+ } put_user_catch(err);
+
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
+ if (err)
+ return -EFAULT;
+
+ /* Set up registers for signal handler */
+ regs->di = sig;
+ /* In case the signal handler was declared without prototypes */
+ regs->ax = 0;
+
+ /* This also works for non SA_SIGINFO handlers because they expect the
+ next argument after the signal number on the stack. */
+ regs->si = (unsigned long)&frame->info;
+ regs->dx = (unsigned long)&frame->uc;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
+
+ regs->sp = (unsigned long)frame;
+
+ /*
+ * Set up the CS and SS registers to run signal handlers in
+ * 64-bit mode, even if the handler happens to be interrupting
+ * 32-bit or 16-bit code.
+ *
+ * SS is subtle. In 64-bit mode, we don't need any particular
+ * SS descriptor, but we do need SS to be valid. It's possible
+ * that the old SS is entirely bogus -- this can happen if the
+ * signal we're trying to deliver is #GP or #SS caused by a bad
+ * SS value. We also have a compatbility issue here: DOSEMU
+ * relies on the contents of the SS register indicating the
+ * SS value at the time of the signal, even though that code in
+ * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU
+ * avoids relying on sigreturn to restore SS; instead it uses
+ * a trampoline.) So we do our best: if the old SS was valid,
+ * we keep it. Otherwise we replace it.
+ */
+ regs->cs = __USER_CS;
+
+ if (unlikely(regs->ss != __USER_DS))
+ force_valid_ss(regs);
+
+ return 0;
+}
+#endif /* CONFIG_X86_32 */
+
+static int x32_setup_rt_frame(struct ksignal *ksig,
+ compat_sigset_t *set,
+ struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_X32_ABI
+ struct rt_sigframe_x32 __user *frame;
+ unsigned long uc_flags;
+ void __user *restorer;
+ int err = 0;
+ void __user *fpstate = NULL;
+
+ frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ return -EFAULT;
+
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+ if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true))
+ return -EFAULT;
+ }
+
+ uc_flags = frame_uc_flags(regs);
+
+ put_user_try {
+ /* Create the ucontext. */
+ put_user_ex(uc_flags, &frame->uc.uc_flags);
+ put_user_ex(0, &frame->uc.uc_link);
+ compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp);
+ put_user_ex(0, &frame->uc.uc__pad0);
+
+ if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+ restorer = ksig->ka.sa.sa_restorer;
+ } else {
+ /* could use a vstub here */
+ restorer = NULL;
+ err |= -EFAULT;
+ }
+ put_user_ex(restorer, &frame->pretcode);
+ } put_user_catch(err);
+
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
+ regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+
+ if (err)
+ return -EFAULT;
+
+ /* Set up registers for signal handler */
+ regs->sp = (unsigned long) frame;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
+
+ /* We use the x32 calling convention here... */
+ regs->di = ksig->sig;
+ regs->si = (unsigned long) &frame->info;
+ regs->dx = (unsigned long) &frame->uc;
+
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+
+ regs->cs = __USER_CS;
+ regs->ss = __USER_DS;
+#endif /* CONFIG_X86_X32_ABI */
+
+ return 0;
+}
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+#ifdef CONFIG_X86_32
+SYSCALL_DEFINE0(sigreturn)
+{
+ struct pt_regs *regs = current_pt_regs();
+ struct sigframe __user *frame;
+ sigset_t set;
+
+ frame = (struct sigframe __user *)(regs->sp - 8);
+
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+ goto badframe;
+ if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1
+ && __copy_from_user(&set.sig[1], &frame->extramask,
+ sizeof(frame->extramask))))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ /*
+ * x86_32 has no uc_flags bits relevant to restore_sigcontext.
+ * Save a few cycles by skipping the __get_user.
+ */
+ if (restore_sigcontext(regs, &frame->sc, 0))
+ goto badframe;
+ return regs->ax;
+
+badframe:
+ signal_fault(regs, frame, "sigreturn");
+
+ return 0;
+}
+#endif /* CONFIG_X86_32 */
+
+SYSCALL_DEFINE0(rt_sigreturn)
+{
+ struct pt_regs *regs = current_pt_regs();
+ struct rt_sigframe __user *frame;
+ sigset_t set;
+ unsigned long uc_flags;
+
+ frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+ goto badframe;
+ if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
+ goto badframe;
+ if (__get_user(uc_flags, &frame->uc.uc_flags))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ goto badframe;
+
+ if (restore_altstack(&frame->uc.uc_stack))
+ goto badframe;
+
+ return regs->ax;
+
+badframe:
+ signal_fault(regs, frame, "rt_sigreturn");
+ return 0;
+}
+
+static inline int is_ia32_compat_frame(struct ksignal *ksig)
+{
+ return IS_ENABLED(CONFIG_IA32_EMULATION) &&
+ ksig->ka.sa.sa_flags & SA_IA32_ABI;
+}
+
+static inline int is_ia32_frame(struct ksignal *ksig)
+{
+ return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig);
+}
+
+static inline int is_x32_frame(struct ksignal *ksig)
+{
+ return IS_ENABLED(CONFIG_X86_X32_ABI) &&
+ ksig->ka.sa.sa_flags & SA_X32_ABI;
+}
+
+static int
+setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
+{
+ int usig = ksig->sig;
+ sigset_t *set = sigmask_to_save();
+ compat_sigset_t *cset = (compat_sigset_t *) set;
+
+ /*
+ * Increment event counter and perform fixup for the pre-signal
+ * frame.
+ */
+ rseq_signal_deliver(ksig, regs);
+
+ /* Set up the stack frame */
+ if (is_ia32_frame(ksig)) {
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO)
+ return ia32_setup_rt_frame(usig, ksig, cset, regs);
+ else
+ return ia32_setup_frame(usig, ksig, cset, regs);
+ } else if (is_x32_frame(ksig)) {
+ return x32_setup_rt_frame(ksig, cset, regs);
+ } else {
+ return __setup_rt_frame(ksig->sig, ksig, set, regs);
+ }
+}
+
+static void
+handle_signal(struct ksignal *ksig, struct pt_regs *regs)
+{
+ bool stepping, failed;
+ struct fpu *fpu = &current->thread.fpu;
+
+ if (v8086_mode(regs))
+ save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);
+
+ /* Are we from a system call? */
+ if (syscall_get_nr(current, regs) >= 0) {
+ /* If so, check system call restarting.. */
+ switch (syscall_get_error(current, regs)) {
+ case -ERESTART_RESTARTBLOCK:
+ case -ERESTARTNOHAND:
+ regs->ax = -EINTR;
+ break;
+
+ case -ERESTARTSYS:
+ if (!(ksig->ka.sa.sa_flags & SA_RESTART)) {
+ regs->ax = -EINTR;
+ break;
+ }
+ /* fallthrough */
+ case -ERESTARTNOINTR:
+ regs->ax = regs->orig_ax;
+ regs->ip -= 2;
+ break;
+ }
+ }
+
+ /*
+ * If TF is set due to a debugger (TIF_FORCED_TF), clear TF now
+ * so that register information in the sigcontext is correct and
+ * then notify the tracer before entering the signal handler.
+ */
+ stepping = test_thread_flag(TIF_SINGLESTEP);
+ if (stepping)
+ user_disable_single_step(current);
+
+ failed = (setup_rt_frame(ksig, regs) < 0);
+ if (!failed) {
+ /*
+ * Clear the direction flag as per the ABI for function entry.
+ *
+ * Clear RF when entering the signal handler, because
+ * it might disable possible debug exception from the
+ * signal handler.
+ *
+ * Clear TF for the case when it wasn't set by debugger to
+ * avoid the recursive send_sigtrap() in SIGTRAP handler.
+ */
+ regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF);
+ /*
+ * Ensure the signal handler starts with the new fpu state.
+ */
+ if (fpu->initialized)
+ fpu__clear(fpu);
+ }
+ signal_setup_done(failed, ksig, stepping);
+}
+
+static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
+{
+#ifdef CONFIG_IA32_EMULATION
+ if (current_thread_info()->status & TS_COMPAT_RESTART)
+ return __NR_ia32_restart_syscall;
+#endif
+#ifdef CONFIG_X86_X32_ABI
+ return __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT);
+#else
+ return __NR_restart_syscall;
+#endif
+}
+
+/*
+ * Note that 'init' is a special process: it doesn't get signals it doesn't
+ * want to handle. Thus you cannot kill init even with a SIGKILL even by
+ * mistake.
+ */
+void do_signal(struct pt_regs *regs)
+{
+ struct ksignal ksig;
+
+ if (get_signal(&ksig)) {
+ /* Whee! Actually deliver the signal. */
+ handle_signal(&ksig, regs);
+ return;
+ }
+
+ /* Did we come from a system call? */
+ if (syscall_get_nr(current, regs) >= 0) {
+ /* Restart the system call - no handlers present */
+ switch (syscall_get_error(current, regs)) {
+ case -ERESTARTNOHAND:
+ case -ERESTARTSYS:
+ case -ERESTARTNOINTR:
+ regs->ax = regs->orig_ax;
+ regs->ip -= 2;
+ break;
+
+ case -ERESTART_RESTARTBLOCK:
+ regs->ax = get_nr_restart_syscall(regs);
+ regs->ip -= 2;
+ break;
+ }
+ }
+
+ /*
+ * If there's no signal to deliver, we just put the saved sigmask
+ * back.
+ */
+ restore_saved_sigmask();
+}
+
+void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
+{
+ struct task_struct *me = current;
+
+ if (show_unhandled_signals && printk_ratelimit()) {
+ printk("%s"
+ "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
+ task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
+ me->comm, me->pid, where, frame,
+ regs->ip, regs->sp, regs->orig_ax);
+ print_vma_addr(KERN_CONT " in ", regs->ip);
+ pr_cont("\n");
+ }
+
+ force_sig(SIGSEGV, me);
+}
+
+#ifdef CONFIG_X86_X32_ABI
+asmlinkage long sys32_x32_rt_sigreturn(void)
+{
+ struct pt_regs *regs = current_pt_regs();
+ struct rt_sigframe_x32 __user *frame;
+ sigset_t set;
+ unsigned long uc_flags;
+
+ frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
+
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+ goto badframe;
+ if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
+ goto badframe;
+ if (__get_user(uc_flags, &frame->uc.uc_flags))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ goto badframe;
+
+ if (compat_restore_altstack(&frame->uc.uc_stack))
+ goto badframe;
+
+ return regs->ax;
+
+badframe:
+ signal_fault(regs, frame, "x32 rt_sigreturn");
+ return 0;
+}
+#endif
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
new file mode 100644
index 000000000..9ccbf0576
--- /dev/null
+++ b/arch/x86/kernel/signal_compat.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/compat.h>
+#include <linux/uaccess.h>
+#include <linux/ptrace.h>
+
+/*
+ * The compat_siginfo_t structure and handing code is very easy
+ * to break in several ways. It must always be updated when new
+ * updates are made to the main siginfo_t, and
+ * copy_siginfo_to_user32() must be updated when the
+ * (arch-independent) copy_siginfo_to_user() is updated.
+ *
+ * It is also easy to put a new member in the compat_siginfo_t
+ * which has implicit alignment which can move internal structure
+ * alignment around breaking the ABI. This can happen if you,
+ * for instance, put a plain 64-bit value in there.
+ */
+static inline void signal_compat_build_tests(void)
+{
+ int _sifields_offset = offsetof(compat_siginfo_t, _sifields);
+
+ /*
+ * If adding a new si_code, there is probably new data in
+ * the siginfo. Make sure folks bumping the si_code
+ * limits also have to look at this code. Make sure any
+ * new fields are handled in copy_siginfo_to_user32()!
+ */
+ BUILD_BUG_ON(NSIGILL != 11);
+ BUILD_BUG_ON(NSIGFPE != 15);
+ BUILD_BUG_ON(NSIGSEGV != 7);
+ BUILD_BUG_ON(NSIGBUS != 5);
+ BUILD_BUG_ON(NSIGTRAP != 5);
+ BUILD_BUG_ON(NSIGCHLD != 6);
+ BUILD_BUG_ON(NSIGSYS != 1);
+
+ /* This is part of the ABI and can never change in size: */
+ BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128);
+ /*
+ * The offsets of all the (unioned) si_fields are fixed
+ * in the ABI, of course. Make sure none of them ever
+ * move and are always at the beginning:
+ */
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields) != 3 * sizeof(int));
+#define CHECK_CSI_OFFSET(name) BUILD_BUG_ON(_sifields_offset != offsetof(compat_siginfo_t, _sifields.name))
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_signo) != 0);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_errno) != 4);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_code) != 8);
+
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_signo) != 0);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_errno) != 4);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_code) != 8);
+ /*
+ * Ensure that the size of each si_field never changes.
+ * If it does, it is a sign that the
+ * copy_siginfo_to_user32() code below needs to updated
+ * along with the size in the CHECK_SI_SIZE().
+ *
+ * We repeat this check for both the generic and compat
+ * siginfos.
+ *
+ * Note: it is OK for these to grow as long as the whole
+ * structure stays within the padding size (checked
+ * above).
+ */
+#define CHECK_CSI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((compat_siginfo_t *)0)->_sifields.name))
+#define CHECK_SI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((siginfo_t *)0)->_sifields.name))
+
+ CHECK_CSI_OFFSET(_kill);
+ CHECK_CSI_SIZE (_kill, 2*sizeof(int));
+ CHECK_SI_SIZE (_kill, 2*sizeof(int));
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0xC);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10);
+
+ CHECK_CSI_OFFSET(_timer);
+ CHECK_CSI_SIZE (_timer, 3*sizeof(int));
+ CHECK_SI_SIZE (_timer, 6*sizeof(int));
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_tid) != 0x10);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_overrun) != 0x14);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_value) != 0x18);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_tid) != 0x0C);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_overrun) != 0x10);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_value) != 0x14);
+
+ CHECK_CSI_OFFSET(_rt);
+ CHECK_CSI_SIZE (_rt, 3*sizeof(int));
+ CHECK_SI_SIZE (_rt, 4*sizeof(int));
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_value) != 0x18);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0x0C);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_value) != 0x14);
+
+ CHECK_CSI_OFFSET(_sigchld);
+ CHECK_CSI_SIZE (_sigchld, 5*sizeof(int));
+ CHECK_SI_SIZE (_sigchld, 8*sizeof(int));
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_status) != 0x18);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_utime) != 0x20);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_stime) != 0x28);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0x0C);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_status) != 0x14);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_utime) != 0x18);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_stime) != 0x1C);
+
+#ifdef CONFIG_X86_X32_ABI
+ CHECK_CSI_OFFSET(_sigchld_x32);
+ CHECK_CSI_SIZE (_sigchld_x32, 7*sizeof(int));
+ /* no _sigchld_x32 in the generic siginfo_t */
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields._sigchld_x32._utime) != 0x18);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields._sigchld_x32._stime) != 0x20);
+#endif
+
+ CHECK_CSI_OFFSET(_sigfault);
+ CHECK_CSI_SIZE (_sigfault, 4*sizeof(int));
+ CHECK_SI_SIZE (_sigfault, 8*sizeof(int));
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_addr) != 0x10);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr) != 0x0C);
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_addr_lsb) != 0x18);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr_lsb) != 0x10);
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_lower) != 0x20);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_upper) != 0x28);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_lower) != 0x14);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_upper) != 0x18);
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14);
+
+ CHECK_CSI_OFFSET(_sigpoll);
+ CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int));
+ CHECK_SI_SIZE (_sigpoll, 4*sizeof(int));
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_band) != 0x10);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_fd) != 0x18);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_band) != 0x0C);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_fd) != 0x10);
+
+ CHECK_CSI_OFFSET(_sigsys);
+ CHECK_CSI_SIZE (_sigsys, 3*sizeof(int));
+ CHECK_SI_SIZE (_sigsys, 4*sizeof(int));
+
+ BUILD_BUG_ON(offsetof(siginfo_t, si_call_addr) != 0x10);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_syscall) != 0x18);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_arch) != 0x1C);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_call_addr) != 0x0C);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_syscall) != 0x10);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_arch) != 0x14);
+
+ /* any new si_fields should be added here */
+}
+
+void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
+{
+ signal_compat_build_tests();
+
+ /* Don't leak in-kernel non-uapi flags to user-space */
+ if (oact)
+ oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI);
+
+ if (!act)
+ return;
+
+ /* Don't let flags to be set from userspace */
+ act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI);
+
+ if (in_ia32_syscall())
+ act->sa.sa_flags |= SA_IA32_ABI;
+ if (in_x32_syscall())
+ act->sa.sa_flags |= SA_X32_ABI;
+}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
new file mode 100644
index 000000000..b2b87b91f
--- /dev/null
+++ b/arch/x86/kernel/smp.c
@@ -0,0 +1,336 @@
+/*
+ * Intel SMP support routines.
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
+ * (c) 1998-99, 2000, 2009 Ingo Molnar <mingo@redhat.com>
+ * (c) 2002,2003 Andi Kleen, SuSE Labs.
+ *
+ * i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
+ *
+ * This code is released under the GNU General Public License version 2 or
+ * later.
+ */
+
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/gfp.h>
+
+#include <asm/mtrr.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+#include <asm/apic.h>
+#include <asm/nmi.h>
+#include <asm/mce.h>
+#include <asm/trace/irq_vectors.h>
+#include <asm/kexec.h>
+#include <asm/virtext.h>
+
+/*
+ * Some notes on x86 processor bugs affecting SMP operation:
+ *
+ * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
+ * The Linux implications for SMP are handled as follows:
+ *
+ * Pentium III / [Xeon]
+ * None of the E1AP-E3AP errata are visible to the user.
+ *
+ * E1AP. see PII A1AP
+ * E2AP. see PII A2AP
+ * E3AP. see PII A3AP
+ *
+ * Pentium II / [Xeon]
+ * None of the A1AP-A3AP errata are visible to the user.
+ *
+ * A1AP. see PPro 1AP
+ * A2AP. see PPro 2AP
+ * A3AP. see PPro 7AP
+ *
+ * Pentium Pro
+ * None of 1AP-9AP errata are visible to the normal user,
+ * except occasional delivery of 'spurious interrupt' as trap #15.
+ * This is very rare and a non-problem.
+ *
+ * 1AP. Linux maps APIC as non-cacheable
+ * 2AP. worked around in hardware
+ * 3AP. fixed in C0 and above steppings microcode update.
+ * Linux does not use excessive STARTUP_IPIs.
+ * 4AP. worked around in hardware
+ * 5AP. symmetric IO mode (normal Linux operation) not affected.
+ * 'noapic' mode has vector 0xf filled out properly.
+ * 6AP. 'noapic' mode might be affected - fixed in later steppings
+ * 7AP. We do not assume writes to the LVT deassering IRQs
+ * 8AP. We do not enable low power mode (deep sleep) during MP bootup
+ * 9AP. We do not use mixed mode
+ *
+ * Pentium
+ * There is a marginal case where REP MOVS on 100MHz SMP
+ * machines with B stepping processors can fail. XXX should provide
+ * an L1cache=Writethrough or L1cache=off option.
+ *
+ * B stepping CPUs may hang. There are hardware work arounds
+ * for this. We warn about it in case your board doesn't have the work
+ * arounds. Basically that's so I can tell anyone with a B stepping
+ * CPU and SMP problems "tough".
+ *
+ * Specific items [From Pentium Processor Specification Update]
+ *
+ * 1AP. Linux doesn't use remote read
+ * 2AP. Linux doesn't trust APIC errors
+ * 3AP. We work around this
+ * 4AP. Linux never generated 3 interrupts of the same priority
+ * to cause a lost local interrupt.
+ * 5AP. Remote read is never used
+ * 6AP. not affected - worked around in hardware
+ * 7AP. not affected - worked around in hardware
+ * 8AP. worked around in hardware - we get explicit CS errors if not
+ * 9AP. only 'noapic' mode affected. Might generate spurious
+ * interrupts, we log only the first one and count the
+ * rest silently.
+ * 10AP. not affected - worked around in hardware
+ * 11AP. Linux reads the APIC between writes to avoid this, as per
+ * the documentation. Make sure you preserve this as it affects
+ * the C stepping chips too.
+ * 12AP. not affected - worked around in hardware
+ * 13AP. not affected - worked around in hardware
+ * 14AP. we always deassert INIT during bootup
+ * 15AP. not affected - worked around in hardware
+ * 16AP. not affected - worked around in hardware
+ * 17AP. not affected - worked around in hardware
+ * 18AP. not affected - worked around in hardware
+ * 19AP. not affected - worked around in BIOS
+ *
+ * If this sounds worrying believe me these bugs are either ___RARE___,
+ * or are signal timing bugs worked around in hardware and there's
+ * about nothing of note with C stepping upwards.
+ */
+
+static atomic_t stopping_cpu = ATOMIC_INIT(-1);
+static bool smp_no_nmi_ipi = false;
+
+/*
+ * this function sends a 'reschedule' IPI to another CPU.
+ * it goes straight through and wastes no time serializing
+ * anything. Worst case is that we lose a reschedule ...
+ */
+static void native_smp_send_reschedule(int cpu)
+{
+ if (unlikely(cpu_is_offline(cpu))) {
+ WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu);
+ return;
+ }
+ apic->send_IPI(cpu, RESCHEDULE_VECTOR);
+}
+
+void native_send_call_func_single_ipi(int cpu)
+{
+ apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR);
+}
+
+void native_send_call_func_ipi(const struct cpumask *mask)
+{
+ cpumask_var_t allbutself;
+
+ if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) {
+ apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+ return;
+ }
+
+ cpumask_copy(allbutself, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), allbutself);
+
+ if (cpumask_equal(mask, allbutself) &&
+ cpumask_equal(cpu_online_mask, cpu_callout_mask))
+ apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+ else
+ apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+
+ free_cpumask_var(allbutself);
+}
+
+static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
+{
+ /* We are registered on stopping cpu too, avoid spurious NMI */
+ if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
+ return NMI_HANDLED;
+
+ cpu_emergency_vmxoff();
+ stop_this_cpu(NULL);
+
+ return NMI_HANDLED;
+}
+
+/*
+ * this function calls the 'stop' function on all other CPUs in the system.
+ */
+
+asmlinkage __visible void smp_reboot_interrupt(void)
+{
+ ipi_entering_ack_irq();
+ cpu_emergency_vmxoff();
+ stop_this_cpu(NULL);
+ irq_exit();
+}
+
+static int register_stop_handler(void)
+{
+ return register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
+ NMI_FLAG_FIRST, "smp_stop");
+}
+
+static void native_stop_other_cpus(int wait)
+{
+ unsigned long flags;
+ unsigned long timeout;
+
+ if (reboot_force)
+ return;
+
+ /*
+ * Use an own vector here because smp_call_function
+ * does lots of things not suitable in a panic situation.
+ */
+
+ /*
+ * We start by using the REBOOT_VECTOR irq.
+ * The irq is treated as a sync point to allow critical
+ * regions of code on other cpus to release their spin locks
+ * and re-enable irqs. Jumping straight to an NMI might
+ * accidentally cause deadlocks with further shutdown/panic
+ * code. By syncing, we give the cpus up to one second to
+ * finish their work before we force them off with the NMI.
+ */
+ if (num_online_cpus() > 1) {
+ /* did someone beat us here? */
+ if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
+ return;
+
+ /* sync above data before sending IRQ */
+ wmb();
+
+ apic->send_IPI_allbutself(REBOOT_VECTOR);
+
+ /*
+ * Don't wait longer than a second for IPI completion. The
+ * wait request is not checked here because that would
+ * prevent an NMI shutdown attempt in case that not all
+ * CPUs reach shutdown state.
+ */
+ timeout = USEC_PER_SEC;
+ while (num_online_cpus() > 1 && timeout--)
+ udelay(1);
+ }
+
+ /* if the REBOOT_VECTOR didn't work, try with the NMI */
+ if (num_online_cpus() > 1) {
+ /*
+ * If NMI IPI is enabled, try to register the stop handler
+ * and send the IPI. In any case try to wait for the other
+ * CPUs to stop.
+ */
+ if (!smp_no_nmi_ipi && !register_stop_handler()) {
+ /* Sync above data before sending IRQ */
+ wmb();
+
+ pr_emerg("Shutting down cpus with NMI\n");
+
+ apic->send_IPI_allbutself(NMI_VECTOR);
+ }
+ /*
+ * Don't wait longer than 10 ms if the caller didn't
+ * reqeust it. If wait is true, the machine hangs here if
+ * one or more CPUs do not reach shutdown state.
+ */
+ timeout = USEC_PER_MSEC * 10;
+ while (num_online_cpus() > 1 && (wait || timeout--))
+ udelay(1);
+ }
+
+ local_irq_save(flags);
+ disable_local_APIC();
+ mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
+ local_irq_restore(flags);
+}
+
+/*
+ * Reschedule call back. KVM uses this interrupt to force a cpu out of
+ * guest mode
+ */
+__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs)
+{
+ ack_APIC_irq();
+ inc_irq_stat(irq_resched_count);
+ kvm_set_cpu_l1tf_flush_l1d();
+
+ if (trace_resched_ipi_enabled()) {
+ /*
+ * scheduler_ipi() might call irq_enter() as well, but
+ * nested calls are fine.
+ */
+ irq_enter();
+ trace_reschedule_entry(RESCHEDULE_VECTOR);
+ scheduler_ipi();
+ trace_reschedule_exit(RESCHEDULE_VECTOR);
+ irq_exit();
+ return;
+ }
+ scheduler_ipi();
+}
+
+__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs)
+{
+ ipi_entering_ack_irq();
+ trace_call_function_entry(CALL_FUNCTION_VECTOR);
+ inc_irq_stat(irq_call_count);
+ generic_smp_call_function_interrupt();
+ trace_call_function_exit(CALL_FUNCTION_VECTOR);
+ exiting_irq();
+}
+
+__visible void __irq_entry smp_call_function_single_interrupt(struct pt_regs *r)
+{
+ ipi_entering_ack_irq();
+ trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
+ inc_irq_stat(irq_call_count);
+ generic_smp_call_function_single_interrupt();
+ trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
+ exiting_irq();
+}
+
+static int __init nonmi_ipi_setup(char *str)
+{
+ smp_no_nmi_ipi = true;
+ return 1;
+}
+
+__setup("nonmi_ipi", nonmi_ipi_setup);
+
+struct smp_ops smp_ops = {
+ .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
+ .smp_prepare_cpus = native_smp_prepare_cpus,
+ .smp_cpus_done = native_smp_cpus_done,
+
+ .stop_other_cpus = native_stop_other_cpus,
+#if defined(CONFIG_KEXEC_CORE)
+ .crash_stop_other_cpus = kdump_nmi_shootdown_cpus,
+#endif
+ .smp_send_reschedule = native_smp_send_reschedule,
+
+ .cpu_up = native_cpu_up,
+ .cpu_die = native_cpu_die,
+ .cpu_disable = native_cpu_disable,
+ .play_dead = native_play_dead,
+
+ .send_call_func_ipi = native_send_call_func_ipi,
+ .send_call_func_single_ipi = native_send_call_func_single_ipi,
+};
+EXPORT_SYMBOL_GPL(smp_ops);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
new file mode 100644
index 000000000..8783d065f
--- /dev/null
+++ b/arch/x86/kernel/smpboot.c
@@ -0,0 +1,1707 @@
+ /*
+ * x86 SMP booting functions
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
+ * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
+ * Copyright 2001 Andi Kleen, SuSE Labs.
+ *
+ * Much of the core SMP work is based on previous work by Thomas Radke, to
+ * whom a great many thanks are extended.
+ *
+ * Thanks to Intel for making available several different Pentium,
+ * Pentium Pro and Pentium-II/Xeon MP machines.
+ * Original development of Linux SMP code supported by Caldera.
+ *
+ * This code is released under the GNU General Public License version 2 or
+ * later.
+ *
+ * Fixes
+ * Felix Koop : NR_CPUS used properly
+ * Jose Renau : Handle single CPU case.
+ * Alan Cox : By repeated request 8) - Total BogoMIPS report.
+ * Greg Wright : Fix for kernel stacks panic.
+ * Erich Boleyn : MP v1.4 and additional changes.
+ * Matthias Sattler : Changes for 2.1 kernel map.
+ * Michel Lespinasse : Changes for 2.1 kernel map.
+ * Michael Chastain : Change trampoline.S to gnu as.
+ * Alan Cox : Dumb bug: 'B' step PPro's are fine
+ * Ingo Molnar : Added APIC timers, based on code
+ * from Jose Renau
+ * Ingo Molnar : various cleanups and rewrites
+ * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs
+ * Andi Kleen : Changed for SMP boot into long mode.
+ * Martin J. Bligh : Added support for multi-quad systems
+ * Dave Jones : Report invalid combinations of Athlon CPUs.
+ * Rusty Russell : Hacked into shape for new "hotplug" boot process.
+ * Andi Kleen : Converted to new state machine.
+ * Ashok Raj : CPU hotplug support
+ * Glauber Costa : i386 and x86_64 integration
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/sched/topology.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/task_stack.h>
+#include <linux/percpu.h>
+#include <linux/bootmem.h>
+#include <linux/err.h>
+#include <linux/nmi.h>
+#include <linux/tboot.h>
+#include <linux/stackprotector.h>
+#include <linux/gfp.h>
+#include <linux/cpuidle.h>
+
+#include <asm/acpi.h>
+#include <asm/desc.h>
+#include <asm/nmi.h>
+#include <asm/irq.h>
+#include <asm/realmode.h>
+#include <asm/cpu.h>
+#include <asm/numa.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/mtrr.h>
+#include <asm/mwait.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/fpu/internal.h>
+#include <asm/setup.h>
+#include <asm/uv/uv.h>
+#include <linux/mc146818rtc.h>
+#include <asm/i8259.h>
+#include <asm/misc.h>
+#include <asm/qspinlock.h>
+#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
+#include <asm/spec-ctrl.h>
+#include <asm/hw_irq.h>
+
+/* representing HT siblings of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
+EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
+
+/* representing HT and core siblings of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
+EXPORT_PER_CPU_SYMBOL(cpu_core_map);
+
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+
+/* Per CPU bogomips and other parameters */
+DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
+EXPORT_PER_CPU_SYMBOL(cpu_info);
+
+/* Logical package management. We might want to allocate that dynamically */
+unsigned int __max_logical_packages __read_mostly;
+EXPORT_SYMBOL(__max_logical_packages);
+static unsigned int logical_packages __read_mostly;
+
+/* Maximum number of SMT threads on any online core */
+int __read_mostly __max_smt_threads = 1;
+
+/* Flag to indicate if a complete sched domain rebuild is required */
+bool x86_topology_update;
+
+int arch_update_cpu_topology(void)
+{
+ int retval = x86_topology_update;
+
+ x86_topology_update = false;
+ return retval;
+}
+
+static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ CMOS_WRITE(0xa, 0xf);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
+ start_eip >> 4;
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
+ start_eip & 0xf;
+}
+
+static inline void smpboot_restore_warm_reset_vector(void)
+{
+ unsigned long flags;
+
+ /*
+ * Paranoid: Set warm reset code and vector here back
+ * to default values.
+ */
+ spin_lock_irqsave(&rtc_lock, flags);
+ CMOS_WRITE(0, 0xf);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+
+ *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
+}
+
+/*
+ * Report back to the Boot Processor during boot time or to the caller processor
+ * during CPU online.
+ */
+static void smp_callin(void)
+{
+ int cpuid, phys_id;
+
+ /*
+ * If waken up by an INIT in an 82489DX configuration
+ * cpu_callout_mask guarantees we don't get here before
+ * an INIT_deassert IPI reaches our local APIC, so it is
+ * now safe to touch our local APIC.
+ */
+ cpuid = smp_processor_id();
+
+ /*
+ * (This works even if the APIC is not enabled.)
+ */
+ phys_id = read_apic_id();
+
+ /*
+ * the boot CPU has finished the init stage and is spinning
+ * on callin_map until we finish. We are free to set up this
+ * CPU, first the APIC. (this is probably redundant on most
+ * boards)
+ */
+ apic_ap_setup();
+
+ /*
+ * Save our processor parameters. Note: this information
+ * is needed for clock calibration.
+ */
+ smp_store_cpu_info(cpuid);
+
+ /*
+ * The topology information must be up to date before
+ * calibrate_delay() and notify_cpu_starting().
+ */
+ set_cpu_sibling_map(raw_smp_processor_id());
+
+ /*
+ * Get our bogomips.
+ * Update loops_per_jiffy in cpu_data. Previous call to
+ * smp_store_cpu_info() stored a value that is close but not as
+ * accurate as the value just calculated.
+ */
+ calibrate_delay();
+ cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
+ pr_debug("Stack at about %p\n", &cpuid);
+
+ wmb();
+
+ notify_cpu_starting(cpuid);
+
+ /*
+ * Allow the master to continue.
+ */
+ cpumask_set_cpu(cpuid, cpu_callin_mask);
+}
+
+static int cpu0_logical_apicid;
+static int enable_start_cpu0;
+/*
+ * Activate a secondary processor.
+ */
+static void notrace start_secondary(void *unused)
+{
+ /*
+ * Don't put *anything* except direct CPU state initialization
+ * before cpu_init(), SMP booting is too fragile that we want to
+ * limit the things done here to the most necessary things.
+ */
+ if (boot_cpu_has(X86_FEATURE_PCID))
+ __write_cr4(__read_cr4() | X86_CR4_PCIDE);
+
+#ifdef CONFIG_X86_32
+ /* switch away from the initial page table */
+ load_cr3(swapper_pg_dir);
+ /*
+ * Initialize the CR4 shadow before doing anything that could
+ * try to read it.
+ */
+ cr4_init_shadow();
+ __flush_tlb_all();
+#endif
+ load_current_idt();
+ cpu_init();
+ x86_cpuinit.early_percpu_clock_init();
+ preempt_disable();
+ smp_callin();
+
+ enable_start_cpu0 = 0;
+
+ /* otherwise gcc will move up smp_processor_id before the cpu_init */
+ barrier();
+ /*
+ * Check TSC synchronization with the boot CPU:
+ */
+ check_tsc_sync_target();
+
+ speculative_store_bypass_ht_init();
+
+ /*
+ * Lock vector_lock, set CPU online and bring the vector
+ * allocator online. Online must be set with vector_lock held
+ * to prevent a concurrent irq setup/teardown from seeing a
+ * half valid vector space.
+ */
+ lock_vector_lock();
+ set_cpu_online(smp_processor_id(), true);
+ lapic_online();
+ unlock_vector_lock();
+ cpu_set_state_online(smp_processor_id());
+ x86_platform.nmi_init();
+
+ /* enable local interrupts */
+ local_irq_enable();
+
+ /* to prevent fake stack check failure in clock setup */
+ boot_init_stack_canary();
+
+ x86_cpuinit.setup_percpu_clockev();
+
+ wmb();
+ cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
+
+ /*
+ * Prevent tail call to cpu_startup_entry() because the stack protector
+ * guard has been changed a couple of function calls up, in
+ * boot_init_stack_canary() and must not be checked before tail calling
+ * another function.
+ */
+ prevent_tail_call_optimization();
+}
+
+/**
+ * topology_is_primary_thread - Check whether CPU is the primary SMT thread
+ * @cpu: CPU to check
+ */
+bool topology_is_primary_thread(unsigned int cpu)
+{
+ return apic_id_is_primary_thread(per_cpu(x86_cpu_to_apicid, cpu));
+}
+
+/**
+ * topology_smt_supported - Check whether SMT is supported by the CPUs
+ */
+bool topology_smt_supported(void)
+{
+ return smp_num_siblings > 1;
+}
+
+/**
+ * topology_phys_to_logical_pkg - Map a physical package id to a logical
+ *
+ * Returns logical package id or -1 if not found
+ */
+int topology_phys_to_logical_pkg(unsigned int phys_pkg)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->initialized && c->phys_proc_id == phys_pkg)
+ return c->logical_proc_id;
+ }
+ return -1;
+}
+EXPORT_SYMBOL(topology_phys_to_logical_pkg);
+
+/**
+ * topology_update_package_map - Update the physical to logical package map
+ * @pkg: The physical package id as retrieved via CPUID
+ * @cpu: The cpu for which this is updated
+ */
+int topology_update_package_map(unsigned int pkg, unsigned int cpu)
+{
+ int new;
+
+ /* Already available somewhere? */
+ new = topology_phys_to_logical_pkg(pkg);
+ if (new >= 0)
+ goto found;
+
+ new = logical_packages++;
+ if (new != pkg) {
+ pr_info("CPU %u Converting physical %u to logical package %u\n",
+ cpu, pkg, new);
+ }
+found:
+ cpu_data(cpu).logical_proc_id = new;
+ return 0;
+}
+
+void __init smp_store_boot_cpu_info(void)
+{
+ int id = 0; /* CPU 0 */
+ struct cpuinfo_x86 *c = &cpu_data(id);
+
+ *c = boot_cpu_data;
+ c->cpu_index = id;
+ topology_update_package_map(c->phys_proc_id, id);
+ c->initialized = true;
+}
+
+/*
+ * The bootstrap kernel entry code has set these up. Save them for
+ * a given CPU
+ */
+void smp_store_cpu_info(int id)
+{
+ struct cpuinfo_x86 *c = &cpu_data(id);
+
+ /* Copy boot_cpu_data only on the first bringup */
+ if (!c->initialized)
+ *c = boot_cpu_data;
+ c->cpu_index = id;
+ /*
+ * During boot time, CPU0 has this setup already. Save the info when
+ * bringing up AP or offlined CPU0.
+ */
+ identify_secondary_cpu(c);
+ c->initialized = true;
+}
+
+static bool
+topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ return (cpu_to_node(cpu1) == cpu_to_node(cpu2));
+}
+
+static bool
+topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
+{
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ return !WARN_ONCE(!topology_same_node(c, o),
+ "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
+ "[node: %d != %d]. Ignoring dependency.\n",
+ cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
+}
+
+#define link_mask(mfunc, c1, c2) \
+do { \
+ cpumask_set_cpu((c1), mfunc(c2)); \
+ cpumask_set_cpu((c2), mfunc(c1)); \
+} while (0)
+
+static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ if (c->phys_proc_id == o->phys_proc_id &&
+ per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) {
+ if (c->cpu_core_id == o->cpu_core_id)
+ return topology_sane(c, o, "smt");
+
+ if ((c->cu_id != 0xff) &&
+ (o->cu_id != 0xff) &&
+ (c->cu_id == o->cu_id))
+ return topology_sane(c, o, "smt");
+ }
+
+ } else if (c->phys_proc_id == o->phys_proc_id &&
+ c->cpu_core_id == o->cpu_core_id) {
+ return topology_sane(c, o, "smt");
+ }
+
+ return false;
+}
+
+/*
+ * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs.
+ *
+ * These are Intel CPUs that enumerate an LLC that is shared by
+ * multiple NUMA nodes. The LLC on these systems is shared for
+ * off-package data access but private to the NUMA node (half
+ * of the package) for on-package access.
+ *
+ * CPUID (the source of the information about the LLC) can only
+ * enumerate the cache as being shared *or* unshared, but not
+ * this particular configuration. The CPU in this case enumerates
+ * the cache to be shared across the entire package (spanning both
+ * NUMA nodes).
+ */
+
+static const struct x86_cpu_id snc_cpu[] = {
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X },
+ {}
+};
+
+static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ /* Do not match if we do not have a valid APICID for cpu: */
+ if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID)
+ return false;
+
+ /* Do not match if LLC id does not match: */
+ if (per_cpu(cpu_llc_id, cpu1) != per_cpu(cpu_llc_id, cpu2))
+ return false;
+
+ /*
+ * Allow the SNC topology without warning. Return of false
+ * means 'c' does not share the LLC of 'o'. This will be
+ * reflected to userspace.
+ */
+ if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu))
+ return false;
+
+ return topology_sane(c, o, "llc");
+}
+
+/*
+ * Unlike the other levels, we do not enforce keeping a
+ * multicore group inside a NUMA node. If this happens, we will
+ * discard the MC level of the topology later.
+ */
+static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ if (c->phys_proc_id == o->phys_proc_id)
+ return true;
+ return false;
+}
+
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+static inline int x86_sched_itmt_flags(void)
+{
+ return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
+}
+
+#ifdef CONFIG_SCHED_MC
+static int x86_core_flags(void)
+{
+ return cpu_core_flags() | x86_sched_itmt_flags();
+}
+#endif
+#ifdef CONFIG_SCHED_SMT
+static int x86_smt_flags(void)
+{
+ return cpu_smt_flags() | x86_sched_itmt_flags();
+}
+#endif
+#endif
+
+static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { NULL, },
+};
+
+static struct sched_domain_topology_level x86_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
+/*
+ * Set if a package/die has multiple NUMA nodes inside.
+ * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
+ * Sub-NUMA Clustering have this.
+ */
+static bool x86_has_numa_in_package;
+
+void set_cpu_sibling_map(int cpu)
+{
+ bool has_smt = smp_num_siblings > 1;
+ bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct cpuinfo_x86 *o;
+ int i, threads;
+
+ cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
+
+ if (!has_mp) {
+ cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
+ cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+ cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
+ c->booted_cores = 1;
+ return;
+ }
+
+ for_each_cpu(i, cpu_sibling_setup_mask) {
+ o = &cpu_data(i);
+
+ if ((i == cpu) || (has_smt && match_smt(c, o)))
+ link_mask(topology_sibling_cpumask, cpu, i);
+
+ if ((i == cpu) || (has_mp && match_llc(c, o)))
+ link_mask(cpu_llc_shared_mask, cpu, i);
+
+ }
+
+ /*
+ * This needs a separate iteration over the cpus because we rely on all
+ * topology_sibling_cpumask links to be set-up.
+ */
+ for_each_cpu(i, cpu_sibling_setup_mask) {
+ o = &cpu_data(i);
+
+ if ((i == cpu) || (has_mp && match_die(c, o))) {
+ link_mask(topology_core_cpumask, cpu, i);
+
+ /*
+ * Does this new cpu bringup a new core?
+ */
+ if (cpumask_weight(
+ topology_sibling_cpumask(cpu)) == 1) {
+ /*
+ * for each core in package, increment
+ * the booted_cores for this new cpu
+ */
+ if (cpumask_first(
+ topology_sibling_cpumask(i)) == i)
+ c->booted_cores++;
+ /*
+ * increment the core count for all
+ * the other cpus in this package
+ */
+ if (i != cpu)
+ cpu_data(i).booted_cores++;
+ } else if (i != cpu && !c->booted_cores)
+ c->booted_cores = cpu_data(i).booted_cores;
+ }
+ if (match_die(c, o) && !topology_same_node(c, o))
+ x86_has_numa_in_package = true;
+ }
+
+ threads = cpumask_weight(topology_sibling_cpumask(cpu));
+ if (threads > __max_smt_threads)
+ __max_smt_threads = threads;
+}
+
+/* maps the cpu to the sched domain representing multi-core */
+const struct cpumask *cpu_coregroup_mask(int cpu)
+{
+ return cpu_llc_shared_mask(cpu);
+}
+
+static void impress_friends(void)
+{
+ int cpu;
+ unsigned long bogosum = 0;
+ /*
+ * Allow the user to impress friends.
+ */
+ pr_debug("Before bogomips\n");
+ for_each_possible_cpu(cpu)
+ if (cpumask_test_cpu(cpu, cpu_callout_mask))
+ bogosum += cpu_data(cpu).loops_per_jiffy;
+ pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
+ num_online_cpus(),
+ bogosum/(500000/HZ),
+ (bogosum/(5000/HZ))%100);
+
+ pr_debug("Before bogocount - setting activated=1\n");
+}
+
+void __inquire_remote_apic(int apicid)
+{
+ unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
+ const char * const names[] = { "ID", "VERSION", "SPIV" };
+ int timeout;
+ u32 status;
+
+ pr_info("Inquiring remote APIC 0x%x...\n", apicid);
+
+ for (i = 0; i < ARRAY_SIZE(regs); i++) {
+ pr_info("... APIC 0x%x %s: ", apicid, names[i]);
+
+ /*
+ * Wait for idle.
+ */
+ status = safe_apic_wait_icr_idle();
+ if (status)
+ pr_cont("a previous APIC delivery may have failed\n");
+
+ apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
+
+ timeout = 0;
+ do {
+ udelay(100);
+ status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
+ } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
+
+ switch (status) {
+ case APIC_ICR_RR_VALID:
+ status = apic_read(APIC_RRR);
+ pr_cont("%08x\n", status);
+ break;
+ default:
+ pr_cont("failed\n");
+ }
+ }
+}
+
+/*
+ * The Multiprocessor Specification 1.4 (1997) example code suggests
+ * that there should be a 10ms delay between the BSP asserting INIT
+ * and de-asserting INIT, when starting a remote processor.
+ * But that slows boot and resume on modern processors, which include
+ * many cores and don't require that delay.
+ *
+ * Cmdline "init_cpu_udelay=" is available to over-ride this delay.
+ * Modern processor families are quirked to remove the delay entirely.
+ */
+#define UDELAY_10MS_DEFAULT 10000
+
+static unsigned int init_udelay = UINT_MAX;
+
+static int __init cpu_init_udelay(char *str)
+{
+ get_option(&str, &init_udelay);
+
+ return 0;
+}
+early_param("cpu_init_udelay", cpu_init_udelay);
+
+static void __init smp_quirk_init_udelay(void)
+{
+ /* if cmdline changed it from default, leave it alone */
+ if (init_udelay != UINT_MAX)
+ return;
+
+ /* if modern processor, use no delay */
+ if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
+ ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) {
+ init_udelay = 0;
+ return;
+ }
+ /* else, use legacy delay */
+ init_udelay = UDELAY_10MS_DEFAULT;
+}
+
+/*
+ * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
+ * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
+ * won't ... remember to clear down the APIC, etc later.
+ */
+int
+wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
+{
+ unsigned long send_status, accept_status = 0;
+ int maxlvt;
+
+ /* Target chip */
+ /* Boot on the stack */
+ /* Kick the second */
+ apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid);
+
+ pr_debug("Waiting for send to finish...\n");
+ send_status = safe_apic_wait_icr_idle();
+
+ /*
+ * Give the other CPU some time to accept the IPI.
+ */
+ udelay(200);
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
+ maxlvt = lapic_get_maxlvt();
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ accept_status = (apic_read(APIC_ESR) & 0xEF);
+ }
+ pr_debug("NMI sent\n");
+
+ if (send_status)
+ pr_err("APIC never delivered???\n");
+ if (accept_status)
+ pr_err("APIC delivery error (%lx)\n", accept_status);
+
+ return (send_status | accept_status);
+}
+
+static int
+wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
+{
+ unsigned long send_status = 0, accept_status = 0;
+ int maxlvt, num_starts, j;
+
+ maxlvt = lapic_get_maxlvt();
+
+ /*
+ * Be paranoid about clearing APIC errors.
+ */
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ }
+
+ pr_debug("Asserting INIT\n");
+
+ /*
+ * Turn INIT on target chip
+ */
+ /*
+ * Send IPI
+ */
+ apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
+ phys_apicid);
+
+ pr_debug("Waiting for send to finish...\n");
+ send_status = safe_apic_wait_icr_idle();
+
+ udelay(init_udelay);
+
+ pr_debug("Deasserting INIT\n");
+
+ /* Target chip */
+ /* Send IPI */
+ apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
+
+ pr_debug("Waiting for send to finish...\n");
+ send_status = safe_apic_wait_icr_idle();
+
+ mb();
+
+ /*
+ * Should we send STARTUP IPIs ?
+ *
+ * Determine this based on the APIC version.
+ * If we don't have an integrated APIC, don't send the STARTUP IPIs.
+ */
+ if (APIC_INTEGRATED(boot_cpu_apic_version))
+ num_starts = 2;
+ else
+ num_starts = 0;
+
+ /*
+ * Run STARTUP IPI loop.
+ */
+ pr_debug("#startup loops: %d\n", num_starts);
+
+ for (j = 1; j <= num_starts; j++) {
+ pr_debug("Sending STARTUP #%d\n", j);
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ pr_debug("After apic_write\n");
+
+ /*
+ * STARTUP IPI
+ */
+
+ /* Target chip */
+ /* Boot on the stack */
+ /* Kick the second */
+ apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
+ phys_apicid);
+
+ /*
+ * Give the other CPU some time to accept the IPI.
+ */
+ if (init_udelay == 0)
+ udelay(10);
+ else
+ udelay(300);
+
+ pr_debug("Startup point 1\n");
+
+ pr_debug("Waiting for send to finish...\n");
+ send_status = safe_apic_wait_icr_idle();
+
+ /*
+ * Give the other CPU some time to accept the IPI.
+ */
+ if (init_udelay == 0)
+ udelay(10);
+ else
+ udelay(200);
+
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ accept_status = (apic_read(APIC_ESR) & 0xEF);
+ if (send_status || accept_status)
+ break;
+ }
+ pr_debug("After Startup\n");
+
+ if (send_status)
+ pr_err("APIC never delivered???\n");
+ if (accept_status)
+ pr_err("APIC delivery error (%lx)\n", accept_status);
+
+ return (send_status | accept_status);
+}
+
+/* reduce the number of lines printed when booting a large cpu count system */
+static void announce_cpu(int cpu, int apicid)
+{
+ static int current_node = -1;
+ int node = early_cpu_to_node(cpu);
+ static int width, node_width;
+
+ if (!width)
+ width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */
+
+ if (!node_width)
+ node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */
+
+ if (cpu == 1)
+ printk(KERN_INFO "x86: Booting SMP configuration:\n");
+
+ if (system_state < SYSTEM_RUNNING) {
+ if (node != current_node) {
+ if (current_node > (-1))
+ pr_cont("\n");
+ current_node = node;
+
+ printk(KERN_INFO ".... node %*s#%d, CPUs: ",
+ node_width - num_digits(node), " ", node);
+ }
+
+ /* Add padding for the BSP */
+ if (cpu == 1)
+ pr_cont("%*s", width + 1, " ");
+
+ pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu);
+
+ } else
+ pr_info("Booting Node %d Processor %d APIC 0x%x\n",
+ node, cpu, apicid);
+}
+
+static int wakeup_cpu0_nmi(unsigned int cmd, struct pt_regs *regs)
+{
+ int cpu;
+
+ cpu = smp_processor_id();
+ if (cpu == 0 && !cpu_online(cpu) && enable_start_cpu0)
+ return NMI_HANDLED;
+
+ return NMI_DONE;
+}
+
+/*
+ * Wake up AP by INIT, INIT, STARTUP sequence.
+ *
+ * Instead of waiting for STARTUP after INITs, BSP will execute the BIOS
+ * boot-strap code which is not a desired behavior for waking up BSP. To
+ * void the boot-strap code, wake up CPU0 by NMI instead.
+ *
+ * This works to wake up soft offlined CPU0 only. If CPU0 is hard offlined
+ * (i.e. physically hot removed and then hot added), NMI won't wake it up.
+ * We'll change this code in the future to wake up hard offlined CPU0 if
+ * real platform and request are available.
+ */
+static int
+wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,
+ int *cpu0_nmi_registered)
+{
+ int id;
+ int boot_error;
+
+ preempt_disable();
+
+ /*
+ * Wake up AP by INIT, INIT, STARTUP sequence.
+ */
+ if (cpu) {
+ boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
+ goto out;
+ }
+
+ /*
+ * Wake up BSP by nmi.
+ *
+ * Register a NMI handler to help wake up CPU0.
+ */
+ boot_error = register_nmi_handler(NMI_LOCAL,
+ wakeup_cpu0_nmi, 0, "wake_cpu0");
+
+ if (!boot_error) {
+ enable_start_cpu0 = 1;
+ *cpu0_nmi_registered = 1;
+ if (apic->dest_logical == APIC_DEST_LOGICAL)
+ id = cpu0_logical_apicid;
+ else
+ id = apicid;
+ boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip);
+ }
+
+out:
+ preempt_enable();
+
+ return boot_error;
+}
+
+void common_cpu_up(unsigned int cpu, struct task_struct *idle)
+{
+ /* Just in case we booted with a single CPU. */
+ alternatives_enable_smp();
+
+ per_cpu(current_task, cpu) = idle;
+
+#ifdef CONFIG_X86_32
+ /* Stack for startup_32 can be just as for start_secondary onwards */
+ irq_ctx_init(cpu);
+ per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
+#else
+ initial_gs = per_cpu_offset(cpu);
+#endif
+}
+
+/*
+ * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
+ * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
+ * Returns zero if CPU booted OK, else error code from
+ * ->wakeup_secondary_cpu.
+ */
+static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
+ int *cpu0_nmi_registered)
+{
+ volatile u32 *trampoline_status =
+ (volatile u32 *) __va(real_mode_header->trampoline_status);
+ /* start_ip had better be page-aligned! */
+ unsigned long start_ip = real_mode_header->trampoline_start;
+
+ unsigned long boot_error = 0;
+ unsigned long timeout;
+
+ idle->thread.sp = (unsigned long)task_pt_regs(idle);
+ early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
+ initial_code = (unsigned long)start_secondary;
+ initial_stack = idle->thread.sp;
+
+ /* Enable the espfix hack for this CPU */
+ init_espfix_ap(cpu);
+
+ /* So we see what's up */
+ announce_cpu(cpu, apicid);
+
+ /*
+ * This grunge runs the startup process for
+ * the targeted processor.
+ */
+
+ if (x86_platform.legacy.warm_reset) {
+
+ pr_debug("Setting warm reset code and vector.\n");
+
+ smpboot_setup_warm_reset_vector(start_ip);
+ /*
+ * Be paranoid about clearing APIC errors.
+ */
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ }
+ }
+
+ /*
+ * AP might wait on cpu_callout_mask in cpu_init() with
+ * cpu_initialized_mask set if previous attempt to online
+ * it timed-out. Clear cpu_initialized_mask so that after
+ * INIT/SIPI it could start with a clean state.
+ */
+ cpumask_clear_cpu(cpu, cpu_initialized_mask);
+ smp_mb();
+
+ /*
+ * Wake up a CPU in difference cases:
+ * - Use the method in the APIC driver if it's defined
+ * Otherwise,
+ * - Use an INIT boot APIC message for APs or NMI for BSP.
+ */
+ if (apic->wakeup_secondary_cpu)
+ boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
+ else
+ boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
+ cpu0_nmi_registered);
+
+ if (!boot_error) {
+ /*
+ * Wait 10s total for first sign of life from AP
+ */
+ boot_error = -1;
+ timeout = jiffies + 10*HZ;
+ while (time_before(jiffies, timeout)) {
+ if (cpumask_test_cpu(cpu, cpu_initialized_mask)) {
+ /*
+ * Tell AP to proceed with initialization
+ */
+ cpumask_set_cpu(cpu, cpu_callout_mask);
+ boot_error = 0;
+ break;
+ }
+ schedule();
+ }
+ }
+
+ if (!boot_error) {
+ /*
+ * Wait till AP completes initial initialization
+ */
+ while (!cpumask_test_cpu(cpu, cpu_callin_mask)) {
+ /*
+ * Allow other tasks to run while we wait for the
+ * AP to come online. This also gives a chance
+ * for the MTRR work(triggered by the AP coming online)
+ * to be completed in the stop machine context.
+ */
+ schedule();
+ }
+ }
+
+ /* mark "stuck" area as not stuck */
+ *trampoline_status = 0;
+
+ if (x86_platform.legacy.warm_reset) {
+ /*
+ * Cleanup possible dangling ends...
+ */
+ smpboot_restore_warm_reset_vector();
+ }
+
+ return boot_error;
+}
+
+int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
+{
+ int apicid = apic->cpu_present_to_apicid(cpu);
+ int cpu0_nmi_registered = 0;
+ unsigned long flags;
+ int err, ret = 0;
+
+ lockdep_assert_irqs_enabled();
+
+ pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
+
+ if (apicid == BAD_APICID ||
+ !physid_isset(apicid, phys_cpu_present_map) ||
+ !apic->apic_id_valid(apicid)) {
+ pr_err("%s: bad cpu %d\n", __func__, cpu);
+ return -EINVAL;
+ }
+
+ /*
+ * Already booted CPU?
+ */
+ if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
+ pr_debug("do_boot_cpu %d Already started\n", cpu);
+ return -ENOSYS;
+ }
+
+ /*
+ * Save current MTRR state in case it was changed since early boot
+ * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
+ */
+ mtrr_save_state();
+
+ /* x86 CPUs take themselves offline, so delayed offline is OK. */
+ err = cpu_check_up_prepare(cpu);
+ if (err && err != -EBUSY)
+ return err;
+
+ /* the FPU context is blank, nobody can own it */
+ per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
+
+ common_cpu_up(cpu, tidle);
+
+ err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered);
+ if (err) {
+ pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
+ ret = -EIO;
+ goto unreg_nmi;
+ }
+
+ /*
+ * Check TSC synchronization with the AP (keep irqs disabled
+ * while doing so):
+ */
+ local_irq_save(flags);
+ check_tsc_sync_source(cpu);
+ local_irq_restore(flags);
+
+ while (!cpu_online(cpu)) {
+ cpu_relax();
+ touch_nmi_watchdog();
+ }
+
+unreg_nmi:
+ /*
+ * Clean up the nmi handler. Do this after the callin and callout sync
+ * to avoid impact of possible long unregister time.
+ */
+ if (cpu0_nmi_registered)
+ unregister_nmi_handler(NMI_LOCAL, "wake_cpu0");
+
+ return ret;
+}
+
+/**
+ * arch_disable_smp_support() - disables SMP support for x86 at runtime
+ */
+void arch_disable_smp_support(void)
+{
+ disable_ioapic_support();
+}
+
+/*
+ * Fall back to non SMP mode after errors.
+ *
+ * RED-PEN audit/test this more. I bet there is more state messed up here.
+ */
+static __init void disable_smp(void)
+{
+ pr_info("SMP disabled\n");
+
+ disable_ioapic_support();
+
+ init_cpu_present(cpumask_of(0));
+ init_cpu_possible(cpumask_of(0));
+
+ if (smp_found_config)
+ physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
+ else
+ physid_set_mask_of_physid(0, &phys_cpu_present_map);
+ cpumask_set_cpu(0, topology_sibling_cpumask(0));
+ cpumask_set_cpu(0, topology_core_cpumask(0));
+}
+
+/*
+ * Various sanity checks.
+ */
+static void __init smp_sanity_check(void)
+{
+ preempt_disable();
+
+#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32)
+ if (def_to_bigsmp && nr_cpu_ids > 8) {
+ unsigned int cpu;
+ unsigned nr;
+
+ pr_warn("More than 8 CPUs detected - skipping them\n"
+ "Use CONFIG_X86_BIGSMP\n");
+
+ nr = 0;
+ for_each_present_cpu(cpu) {
+ if (nr >= 8)
+ set_cpu_present(cpu, false);
+ nr++;
+ }
+
+ nr = 0;
+ for_each_possible_cpu(cpu) {
+ if (nr >= 8)
+ set_cpu_possible(cpu, false);
+ nr++;
+ }
+
+ nr_cpu_ids = 8;
+ }
+#endif
+
+ if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
+ pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n",
+ hard_smp_processor_id());
+
+ physid_set(hard_smp_processor_id(), phys_cpu_present_map);
+ }
+
+ /*
+ * Should not be necessary because the MP table should list the boot
+ * CPU too, but we do it for the sake of robustness anyway.
+ */
+ if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
+ pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n",
+ boot_cpu_physical_apicid);
+ physid_set(hard_smp_processor_id(), phys_cpu_present_map);
+ }
+ preempt_enable();
+}
+
+static void __init smp_cpu_index_default(void)
+{
+ int i;
+ struct cpuinfo_x86 *c;
+
+ for_each_possible_cpu(i) {
+ c = &cpu_data(i);
+ /* mark all to hotplug */
+ c->cpu_index = nr_cpu_ids;
+ }
+}
+
+static void __init smp_get_logical_apicid(void)
+{
+ if (x2apic_mode)
+ cpu0_logical_apicid = apic_read(APIC_LDR);
+ else
+ cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
+}
+
+/*
+ * Prepare for SMP bootup.
+ * @max_cpus: configured maximum number of CPUs, It is a legacy parameter
+ * for common interface support.
+ */
+void __init native_smp_prepare_cpus(unsigned int max_cpus)
+{
+ unsigned int i;
+
+ smp_cpu_index_default();
+
+ /*
+ * Setup boot CPU information
+ */
+ smp_store_boot_cpu_info(); /* Final full version of the data */
+ cpumask_copy(cpu_callin_mask, cpumask_of(0));
+ mb();
+
+ for_each_possible_cpu(i) {
+ zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
+ }
+
+ /*
+ * Set 'default' x86 topology, this matches default_topology() in that
+ * it has NUMA nodes as a topology level. See also
+ * native_smp_cpus_done().
+ *
+ * Must be done before set_cpus_sibling_map() is ran.
+ */
+ set_sched_topology(x86_topology);
+
+ set_cpu_sibling_map(0);
+
+ smp_sanity_check();
+
+ switch (apic_intr_mode) {
+ case APIC_PIC:
+ case APIC_VIRTUAL_WIRE_NO_CONFIG:
+ disable_smp();
+ return;
+ case APIC_SYMMETRIC_IO_NO_ROUTING:
+ disable_smp();
+ /* Setup local timer */
+ x86_init.timers.setup_percpu_clockev();
+ return;
+ case APIC_VIRTUAL_WIRE:
+ case APIC_SYMMETRIC_IO:
+ break;
+ }
+
+ /* Setup local timer */
+ x86_init.timers.setup_percpu_clockev();
+
+ smp_get_logical_apicid();
+
+ pr_info("CPU0: ");
+ print_cpu_info(&cpu_data(0));
+
+ native_pv_lock_init();
+
+ uv_system_init();
+
+ set_mtrr_aps_delayed_init();
+
+ smp_quirk_init_udelay();
+
+ speculative_store_bypass_ht_init();
+}
+
+void arch_enable_nonboot_cpus_begin(void)
+{
+ set_mtrr_aps_delayed_init();
+}
+
+void arch_enable_nonboot_cpus_end(void)
+{
+ mtrr_aps_init();
+}
+
+/*
+ * Early setup to make printk work.
+ */
+void __init native_smp_prepare_boot_cpu(void)
+{
+ int me = smp_processor_id();
+ switch_to_new_gdt(me);
+ /* already set me in cpu_online_mask in boot_cpu_init() */
+ cpumask_set_cpu(me, cpu_callout_mask);
+ cpu_set_state_online(me);
+}
+
+void __init calculate_max_logical_packages(void)
+{
+ int ncpus;
+
+ /*
+ * Today neither Intel nor AMD support heterogenous systems so
+ * extrapolate the boot cpu's data to all packages.
+ */
+ ncpus = cpu_data(0).booted_cores * topology_max_smt_threads();
+ __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus);
+ pr_info("Max logical packages: %u\n", __max_logical_packages);
+}
+
+void __init native_smp_cpus_done(unsigned int max_cpus)
+{
+ pr_debug("Boot done\n");
+
+ calculate_max_logical_packages();
+
+ if (x86_has_numa_in_package)
+ set_sched_topology(x86_numa_in_package_topology);
+
+ nmi_selftest();
+ impress_friends();
+ mtrr_aps_init();
+}
+
+static int __initdata setup_possible_cpus = -1;
+static int __init _setup_possible_cpus(char *str)
+{
+ get_option(&str, &setup_possible_cpus);
+ return 0;
+}
+early_param("possible_cpus", _setup_possible_cpus);
+
+
+/*
+ * cpu_possible_mask should be static, it cannot change as cpu's
+ * are onlined, or offlined. The reason is per-cpu data-structures
+ * are allocated by some modules at init time, and dont expect to
+ * do this dynamically on cpu arrival/departure.
+ * cpu_present_mask on the other hand can change dynamically.
+ * In case when cpu_hotplug is not compiled, then we resort to current
+ * behaviour, which is cpu_possible == cpu_present.
+ * - Ashok Raj
+ *
+ * Three ways to find out the number of additional hotplug CPUs:
+ * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
+ * - The user can overwrite it with possible_cpus=NUM
+ * - Otherwise don't reserve additional CPUs.
+ * We do this because additional CPUs waste a lot of memory.
+ * -AK
+ */
+__init void prefill_possible_map(void)
+{
+ int i, possible;
+
+ /* No boot processor was found in mptable or ACPI MADT */
+ if (!num_processors) {
+ if (boot_cpu_has(X86_FEATURE_APIC)) {
+ int apicid = boot_cpu_physical_apicid;
+ int cpu = hard_smp_processor_id();
+
+ pr_warn("Boot CPU (id %d) not listed by BIOS\n", cpu);
+
+ /* Make sure boot cpu is enumerated */
+ if (apic->cpu_present_to_apicid(0) == BAD_APICID &&
+ apic->apic_id_valid(apicid))
+ generic_processor_info(apicid, boot_cpu_apic_version);
+ }
+
+ if (!num_processors)
+ num_processors = 1;
+ }
+
+ i = setup_max_cpus ?: 1;
+ if (setup_possible_cpus == -1) {
+ possible = num_processors;
+#ifdef CONFIG_HOTPLUG_CPU
+ if (setup_max_cpus)
+ possible += disabled_cpus;
+#else
+ if (possible > i)
+ possible = i;
+#endif
+ } else
+ possible = setup_possible_cpus;
+
+ total_cpus = max_t(int, possible, num_processors + disabled_cpus);
+
+ /* nr_cpu_ids could be reduced via nr_cpus= */
+ if (possible > nr_cpu_ids) {
+ pr_warn("%d Processors exceeds NR_CPUS limit of %u\n",
+ possible, nr_cpu_ids);
+ possible = nr_cpu_ids;
+ }
+
+#ifdef CONFIG_HOTPLUG_CPU
+ if (!setup_max_cpus)
+#endif
+ if (possible > i) {
+ pr_warn("%d Processors exceeds max_cpus limit of %u\n",
+ possible, setup_max_cpus);
+ possible = i;
+ }
+
+ nr_cpu_ids = possible;
+
+ pr_info("Allowing %d CPUs, %d hotplug CPUs\n",
+ possible, max_t(int, possible - num_processors, 0));
+
+ reset_cpu_possible_mask();
+
+ for (i = 0; i < possible; i++)
+ set_cpu_possible(i, true);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/* Recompute SMT state for all CPUs on offline */
+static void recompute_smt_state(void)
+{
+ int max_threads, cpu;
+
+ max_threads = 0;
+ for_each_online_cpu (cpu) {
+ int threads = cpumask_weight(topology_sibling_cpumask(cpu));
+
+ if (threads > max_threads)
+ max_threads = threads;
+ }
+ __max_smt_threads = max_threads;
+}
+
+static void remove_siblinginfo(int cpu)
+{
+ int sibling;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ for_each_cpu(sibling, topology_core_cpumask(cpu)) {
+ cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
+ /*/
+ * last thread sibling in this cpu core going down
+ */
+ if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1)
+ cpu_data(sibling).booted_cores--;
+ }
+
+ for_each_cpu(sibling, topology_sibling_cpumask(cpu))
+ cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
+ for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
+ cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
+ cpumask_clear(cpu_llc_shared_mask(cpu));
+ cpumask_clear(topology_sibling_cpumask(cpu));
+ cpumask_clear(topology_core_cpumask(cpu));
+ c->cpu_core_id = 0;
+ c->booted_cores = 0;
+ cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
+ recompute_smt_state();
+}
+
+static void remove_cpu_from_maps(int cpu)
+{
+ set_cpu_online(cpu, false);
+ cpumask_clear_cpu(cpu, cpu_callout_mask);
+ cpumask_clear_cpu(cpu, cpu_callin_mask);
+ /* was set by cpu_init() */
+ cpumask_clear_cpu(cpu, cpu_initialized_mask);
+ numa_remove_cpu(cpu);
+}
+
+void cpu_disable_common(void)
+{
+ int cpu = smp_processor_id();
+
+ remove_siblinginfo(cpu);
+
+ /* It's now safe to remove this processor from the online map */
+ lock_vector_lock();
+ remove_cpu_from_maps(cpu);
+ unlock_vector_lock();
+ fixup_irqs();
+ lapic_offline();
+}
+
+int native_cpu_disable(void)
+{
+ int ret;
+
+ ret = lapic_can_unplug_cpu();
+ if (ret)
+ return ret;
+
+ clear_local_APIC();
+ cpu_disable_common();
+
+ return 0;
+}
+
+int common_cpu_die(unsigned int cpu)
+{
+ int ret = 0;
+
+ /* We don't do anything here: idle task is faking death itself. */
+
+ /* They ack this in play_dead() by setting CPU_DEAD */
+ if (cpu_wait_death(cpu, 5)) {
+ if (system_state == SYSTEM_RUNNING)
+ pr_info("CPU %u is now offline\n", cpu);
+ } else {
+ pr_err("CPU %u didn't die...\n", cpu);
+ ret = -1;
+ }
+
+ return ret;
+}
+
+void native_cpu_die(unsigned int cpu)
+{
+ common_cpu_die(cpu);
+}
+
+void play_dead_common(void)
+{
+ idle_task_exit();
+
+ /* Ack it */
+ (void)cpu_report_death();
+
+ /*
+ * With physical CPU hotplug, we should halt the cpu
+ */
+ local_irq_disable();
+}
+
+static bool wakeup_cpu0(void)
+{
+ if (smp_processor_id() == 0 && enable_start_cpu0)
+ return true;
+
+ return false;
+}
+
+/*
+ * We need to flush the caches before going to sleep, lest we have
+ * dirty data in our caches when we come back up.
+ */
+static inline void mwait_play_dead(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ unsigned int highest_cstate = 0;
+ unsigned int highest_subcstate = 0;
+ void *mwait_ptr;
+ int i;
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return;
+ if (!this_cpu_has(X86_FEATURE_MWAIT))
+ return;
+ if (!this_cpu_has(X86_FEATURE_CLFLUSH))
+ return;
+ if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
+ return;
+
+ eax = CPUID_MWAIT_LEAF;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ /*
+ * eax will be 0 if EDX enumeration is not valid.
+ * Initialized below to cstate, sub_cstate value when EDX is valid.
+ */
+ if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
+ eax = 0;
+ } else {
+ edx >>= MWAIT_SUBSTATE_SIZE;
+ for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
+ if (edx & MWAIT_SUBSTATE_MASK) {
+ highest_cstate = i;
+ highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
+ }
+ }
+ eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
+ (highest_subcstate - 1);
+ }
+
+ /*
+ * This should be a memory location in a cache line which is
+ * unlikely to be touched by other processors. The actual
+ * content is immaterial as it is not actually modified in any way.
+ */
+ mwait_ptr = &current_thread_info()->flags;
+
+ wbinvd();
+
+ while (1) {
+ /*
+ * The CLFLUSH is a workaround for erratum AAI65 for
+ * the Xeon 7400 series. It's not clear it is actually
+ * needed, but it should be harmless in either case.
+ * The WBINVD is insufficient due to the spurious-wakeup
+ * case where we return around the loop.
+ */
+ mb();
+ clflush(mwait_ptr);
+ mb();
+ __monitor(mwait_ptr, 0, 0);
+ mb();
+ __mwait(eax, 0);
+ /*
+ * If NMI wants to wake up CPU0, start CPU0.
+ */
+ if (wakeup_cpu0())
+ start_cpu0();
+ }
+}
+
+void hlt_play_dead(void)
+{
+ if (__this_cpu_read(cpu_info.x86) >= 4)
+ wbinvd();
+
+ while (1) {
+ native_halt();
+ /*
+ * If NMI wants to wake up CPU0, start CPU0.
+ */
+ if (wakeup_cpu0())
+ start_cpu0();
+ }
+}
+
+void native_play_dead(void)
+{
+ play_dead_common();
+ tboot_shutdown(TB_SHUTDOWN_WFS);
+
+ mwait_play_dead(); /* Only returns on failure */
+ if (cpuidle_play_dead())
+ hlt_play_dead();
+}
+
+#else /* ... !CONFIG_HOTPLUG_CPU */
+int native_cpu_disable(void)
+{
+ return -ENOSYS;
+}
+
+void native_cpu_die(unsigned int cpu)
+{
+ /* We said "no" in __cpu_disable */
+ BUG();
+}
+
+void native_play_dead(void)
+{
+ BUG();
+}
+
+#endif
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
new file mode 100644
index 000000000..762745504
--- /dev/null
+++ b/arch/x86/kernel/stacktrace.c
@@ -0,0 +1,229 @@
+/*
+ * Stack trace management functions
+ *
+ * Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ */
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
+#include <linux/stacktrace.h>
+#include <linux/export.h>
+#include <linux/uaccess.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+static int save_stack_address(struct stack_trace *trace, unsigned long addr,
+ bool nosched)
+{
+ if (nosched && in_sched_functions(addr))
+ return 0;
+
+ if (trace->skip > 0) {
+ trace->skip--;
+ return 0;
+ }
+
+ if (trace->nr_entries >= trace->max_entries)
+ return -1;
+
+ trace->entries[trace->nr_entries++] = addr;
+ return 0;
+}
+
+static void noinline __save_stack_trace(struct stack_trace *trace,
+ struct task_struct *task, struct pt_regs *regs,
+ bool nosched)
+{
+ struct unwind_state state;
+ unsigned long addr;
+
+ if (regs)
+ save_stack_address(trace, regs->ip, nosched);
+
+ for (unwind_start(&state, task, regs, NULL); !unwind_done(&state);
+ unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr || save_stack_address(trace, addr, nosched))
+ break;
+ }
+
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+
+/*
+ * Save stack-backtrace addresses into a stack_trace buffer.
+ */
+void save_stack_trace(struct stack_trace *trace)
+{
+ trace->skip++;
+ __save_stack_trace(trace, current, NULL, false);
+}
+EXPORT_SYMBOL_GPL(save_stack_trace);
+
+void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
+{
+ __save_stack_trace(trace, current, regs, false);
+}
+
+void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
+{
+ if (!try_get_task_stack(tsk))
+ return;
+
+ if (tsk == current)
+ trace->skip++;
+ __save_stack_trace(trace, tsk, NULL, true);
+
+ put_task_stack(tsk);
+}
+EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
+
+#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE
+
+static int __always_inline
+__save_stack_trace_reliable(struct stack_trace *trace,
+ struct task_struct *task)
+{
+ struct unwind_state state;
+ struct pt_regs *regs;
+ unsigned long addr;
+
+ for (unwind_start(&state, task, NULL, NULL);
+ !unwind_done(&state) && !unwind_error(&state);
+ unwind_next_frame(&state)) {
+
+ regs = unwind_get_entry_regs(&state, NULL);
+ if (regs) {
+ /* Success path for user tasks */
+ if (user_mode(regs))
+ goto success;
+
+ /*
+ * Kernel mode registers on the stack indicate an
+ * in-kernel interrupt or exception (e.g., preemption
+ * or a page fault), which can make frame pointers
+ * unreliable.
+ */
+
+ if (IS_ENABLED(CONFIG_FRAME_POINTER))
+ return -EINVAL;
+ }
+
+ addr = unwind_get_return_address(&state);
+
+ /*
+ * A NULL or invalid return address probably means there's some
+ * generated code which __kernel_text_address() doesn't know
+ * about.
+ */
+ if (!addr)
+ return -EINVAL;
+
+ if (save_stack_address(trace, addr, false))
+ return -EINVAL;
+ }
+
+ /* Check for stack corruption */
+ if (unwind_error(&state))
+ return -EINVAL;
+
+ /* Success path for non-user tasks, i.e. kthreads and idle tasks */
+ if (!(task->flags & (PF_KTHREAD | PF_IDLE)))
+ return -EINVAL;
+
+success:
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
+
+ return 0;
+}
+
+/*
+ * This function returns an error if it detects any unreliable features of the
+ * stack. Otherwise it guarantees that the stack trace is reliable.
+ *
+ * If the task is not 'current', the caller *must* ensure the task is inactive.
+ */
+int save_stack_trace_tsk_reliable(struct task_struct *tsk,
+ struct stack_trace *trace)
+{
+ int ret;
+
+ /*
+ * If the task doesn't have a stack (e.g., a zombie), the stack is
+ * "reliably" empty.
+ */
+ if (!try_get_task_stack(tsk))
+ return 0;
+
+ ret = __save_stack_trace_reliable(trace, tsk);
+
+ put_task_stack(tsk);
+
+ return ret;
+}
+#endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */
+
+/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
+
+struct stack_frame_user {
+ const void __user *next_fp;
+ unsigned long ret_addr;
+};
+
+static int
+copy_stack_frame(const void __user *fp, struct stack_frame_user *frame)
+{
+ int ret;
+
+ if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+ return 0;
+
+ ret = 1;
+ pagefault_disable();
+ if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+ ret = 0;
+ pagefault_enable();
+
+ return ret;
+}
+
+static inline void __save_stack_trace_user(struct stack_trace *trace)
+{
+ const struct pt_regs *regs = task_pt_regs(current);
+ const void __user *fp = (const void __user *)regs->bp;
+
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = regs->ip;
+
+ while (trace->nr_entries < trace->max_entries) {
+ struct stack_frame_user frame;
+
+ frame.next_fp = NULL;
+ frame.ret_addr = 0;
+ if (!copy_stack_frame(fp, &frame))
+ break;
+ if ((unsigned long)fp < regs->sp)
+ break;
+ if (frame.ret_addr) {
+ trace->entries[trace->nr_entries++] =
+ frame.ret_addr;
+ }
+ if (fp == frame.next_fp)
+ break;
+ fp = frame.next_fp;
+ }
+}
+
+void save_stack_trace_user(struct stack_trace *trace)
+{
+ /*
+ * Trace user stack if we are not a kernel thread
+ */
+ if (current->mm) {
+ __save_stack_trace_user(trace);
+ }
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
new file mode 100644
index 000000000..2f97d1a10
--- /dev/null
+++ b/arch/x86/kernel/step.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * x86 single-step support code, common to 32-bit and 64-bit.
+ */
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/mm.h>
+#include <linux/ptrace.h>
+#include <asm/desc.h>
+#include <asm/mmu_context.h>
+
+unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
+{
+ unsigned long addr, seg;
+
+ addr = regs->ip;
+ seg = regs->cs;
+ if (v8086_mode(regs)) {
+ addr = (addr & 0xffff) + (seg << 4);
+ return addr;
+ }
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ /*
+ * We'll assume that the code segments in the GDT
+ * are all zero-based. That is largely true: the
+ * TLS segments are used for data, and the PNPBIOS
+ * and APM bios ones we just ignore here.
+ */
+ if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+ struct desc_struct *desc;
+ unsigned long base;
+
+ seg >>= 3;
+
+ mutex_lock(&child->mm->context.lock);
+ if (unlikely(!child->mm->context.ldt ||
+ seg >= child->mm->context.ldt->nr_entries))
+ addr = -1L; /* bogus selector, access would fault */
+ else {
+ desc = &child->mm->context.ldt->entries[seg];
+ base = get_desc_base(desc);
+
+ /* 16-bit code segment? */
+ if (!desc->d)
+ addr &= 0xffff;
+ addr += base;
+ }
+ mutex_unlock(&child->mm->context.lock);
+ }
+#endif
+
+ return addr;
+}
+
+static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
+{
+ int i, copied;
+ unsigned char opcode[15];
+ unsigned long addr = convert_ip_to_linear(child, regs);
+
+ copied = access_process_vm(child, addr, opcode, sizeof(opcode),
+ FOLL_FORCE);
+ for (i = 0; i < copied; i++) {
+ switch (opcode[i]) {
+ /* popf and iret */
+ case 0x9d: case 0xcf:
+ return 1;
+
+ /* CHECKME: 64 65 */
+
+ /* opcode and address size prefixes */
+ case 0x66: case 0x67:
+ continue;
+ /* irrelevant prefixes (segment overrides and repeats) */
+ case 0x26: case 0x2e:
+ case 0x36: case 0x3e:
+ case 0x64: case 0x65:
+ case 0xf0: case 0xf2: case 0xf3:
+ continue;
+
+#ifdef CONFIG_X86_64
+ case 0x40 ... 0x4f:
+ if (!user_64bit_mode(regs))
+ /* 32-bit mode: register increment */
+ return 0;
+ /* 64-bit mode: REX prefix */
+ continue;
+#endif
+
+ /* CHECKME: f2, f3 */
+
+ /*
+ * pushf: NOTE! We should probably not let
+ * the user see the TF bit being set. But
+ * it's more pain than it's worth to avoid
+ * it, and a debugger could emulate this
+ * all in user space if it _really_ cares.
+ */
+ case 0x9c:
+ default:
+ return 0;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Enable single-stepping. Return nonzero if user mode is not using TF itself.
+ */
+static int enable_single_step(struct task_struct *child)
+{
+ struct pt_regs *regs = task_pt_regs(child);
+ unsigned long oflags;
+
+ /*
+ * If we stepped into a sysenter/syscall insn, it trapped in
+ * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
+ * If user-mode had set TF itself, then it's still clear from
+ * do_debug() and we need to set it again to restore the user
+ * state so we don't wrongly set TIF_FORCED_TF below.
+ * If enable_single_step() was used last and that is what
+ * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are
+ * already set and our bookkeeping is fine.
+ */
+ if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP)))
+ regs->flags |= X86_EFLAGS_TF;
+
+ /*
+ * Always set TIF_SINGLESTEP - this guarantees that
+ * we single-step system calls etc.. This will also
+ * cause us to set TF when returning to user mode.
+ */
+ set_tsk_thread_flag(child, TIF_SINGLESTEP);
+
+ oflags = regs->flags;
+
+ /* Set TF on the kernel stack.. */
+ regs->flags |= X86_EFLAGS_TF;
+
+ /*
+ * ..but if TF is changed by the instruction we will trace,
+ * don't mark it as being "us" that set it, so that we
+ * won't clear it by hand later.
+ *
+ * Note that if we don't actually execute the popf because
+ * of a signal arriving right now or suchlike, we will lose
+ * track of the fact that it really was "us" that set it.
+ */
+ if (is_setting_trap_flag(child, regs)) {
+ clear_tsk_thread_flag(child, TIF_FORCED_TF);
+ return 0;
+ }
+
+ /*
+ * If TF was already set, check whether it was us who set it.
+ * If not, we should never attempt a block step.
+ */
+ if (oflags & X86_EFLAGS_TF)
+ return test_tsk_thread_flag(child, TIF_FORCED_TF);
+
+ set_tsk_thread_flag(child, TIF_FORCED_TF);
+
+ return 1;
+}
+
+void set_task_blockstep(struct task_struct *task, bool on)
+{
+ unsigned long debugctl;
+
+ /*
+ * Ensure irq/preemption can't change debugctl in between.
+ * Note also that both TIF_BLOCKSTEP and debugctl should
+ * be changed atomically wrt preemption.
+ *
+ * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if
+ * task is current or it can't be running, otherwise we can race
+ * with __switch_to_xtra(). We rely on ptrace_freeze_traced().
+ */
+ local_irq_disable();
+ debugctl = get_debugctlmsr();
+ if (on) {
+ debugctl |= DEBUGCTLMSR_BTF;
+ set_tsk_thread_flag(task, TIF_BLOCKSTEP);
+ } else {
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ clear_tsk_thread_flag(task, TIF_BLOCKSTEP);
+ }
+ if (task == current)
+ update_debugctlmsr(debugctl);
+ local_irq_enable();
+}
+
+/*
+ * Enable single or block step.
+ */
+static void enable_step(struct task_struct *child, bool block)
+{
+ /*
+ * Make sure block stepping (BTF) is not enabled unless it should be.
+ * Note that we don't try to worry about any is_setting_trap_flag()
+ * instructions after the first when using block stepping.
+ * So no one should try to use debugger block stepping in a program
+ * that uses user-mode single stepping itself.
+ */
+ if (enable_single_step(child) && block)
+ set_task_blockstep(child, true);
+ else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP))
+ set_task_blockstep(child, false);
+}
+
+void user_enable_single_step(struct task_struct *child)
+{
+ enable_step(child, 0);
+}
+
+void user_enable_block_step(struct task_struct *child)
+{
+ enable_step(child, 1);
+}
+
+void user_disable_single_step(struct task_struct *child)
+{
+ /*
+ * Make sure block stepping (BTF) is disabled.
+ */
+ if (test_tsk_thread_flag(child, TIF_BLOCKSTEP))
+ set_task_blockstep(child, false);
+
+ /* Always clear TIF_SINGLESTEP... */
+ clear_tsk_thread_flag(child, TIF_SINGLESTEP);
+
+ /* But touch TF only if it was set by us.. */
+ if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF))
+ task_pt_regs(child)->flags &= ~X86_EFLAGS_TF;
+}
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
new file mode 100644
index 000000000..9fb266c79
--- /dev/null
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -0,0 +1,241 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/compat.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/syscalls.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/smp.h>
+#include <linux/sem.h>
+#include <linux/msg.h>
+#include <linux/shm.h>
+#include <linux/stat.h>
+#include <linux/mman.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+#include <linux/personality.h>
+#include <linux/random.h>
+#include <linux/uaccess.h>
+#include <linux/elf.h>
+
+#include <asm/elf.h>
+#include <asm/ia32.h>
+#include <asm/syscalls.h>
+#include <asm/mpx.h>
+
+/*
+ * Align a virtual address to avoid aliasing in the I$ on AMD F15h.
+ */
+static unsigned long get_align_mask(void)
+{
+ /* handle 32- and 64-bit case with a single conditional */
+ if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32())))
+ return 0;
+
+ if (!(current->flags & PF_RANDOMIZE))
+ return 0;
+
+ return va_align.mask;
+}
+
+/*
+ * To avoid aliasing in the I$ on AMD F15h, the bits defined by the
+ * va_align.bits, [12:upper_bit), are set to a random value instead of
+ * zeroing them. This random value is computed once per boot. This form
+ * of ASLR is known as "per-boot ASLR".
+ *
+ * To achieve this, the random value is added to the info.align_offset
+ * value before calling vm_unmapped_area() or ORed directly to the
+ * address.
+ */
+static unsigned long get_align_bits(void)
+{
+ return va_align.bits & get_align_mask();
+}
+
+unsigned long align_vdso_addr(unsigned long addr)
+{
+ unsigned long align_mask = get_align_mask();
+ addr = (addr + align_mask) & ~align_mask;
+ return addr | get_align_bits();
+}
+
+static int __init control_va_addr_alignment(char *str)
+{
+ /* guard against enabling this on other CPU families */
+ if (va_align.flags < 0)
+ return 1;
+
+ if (*str == 0)
+ return 1;
+
+ if (!strcmp(str, "32"))
+ va_align.flags = ALIGN_VA_32;
+ else if (!strcmp(str, "64"))
+ va_align.flags = ALIGN_VA_64;
+ else if (!strcmp(str, "off"))
+ va_align.flags = 0;
+ else if (!strcmp(str, "on"))
+ va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
+ else
+ pr_warn("invalid option value: 'align_va_addr=%s'\n", str);
+
+ return 1;
+}
+__setup("align_va_addr=", control_va_addr_alignment);
+
+SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
+ unsigned long, prot, unsigned long, flags,
+ unsigned long, fd, unsigned long, off)
+{
+ long error;
+ error = -EINVAL;
+ if (off & ~PAGE_MASK)
+ goto out;
+
+ error = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
+out:
+ return error;
+}
+
+static void find_start_end(unsigned long addr, unsigned long flags,
+ unsigned long *begin, unsigned long *end)
+{
+ if (!in_compat_syscall() && (flags & MAP_32BIT)) {
+ /* This is usually used needed to map code in small
+ model, so it needs to be in the first 31bit. Limit
+ it to that. This means we need to move the
+ unmapped base down for this case. This can give
+ conflicts with the heap, but we assume that glibc
+ malloc knows how to fall back to mmap. Give it 1GB
+ of playground for now. -AK */
+ *begin = 0x40000000;
+ *end = 0x80000000;
+ if (current->flags & PF_RANDOMIZE) {
+ *begin = randomize_page(*begin, 0x02000000);
+ }
+ return;
+ }
+
+ *begin = get_mmap_base(1);
+ if (in_compat_syscall())
+ *end = task_size_32bit();
+ else
+ *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW);
+}
+
+unsigned long
+arch_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ struct vm_unmapped_area_info info;
+ unsigned long begin, end;
+
+ addr = mpx_unmapped_area_check(addr, len, flags);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
+ if (flags & MAP_FIXED)
+ return addr;
+
+ find_start_end(addr, flags, &begin, &end);
+
+ if (len > end)
+ return -ENOMEM;
+
+ if (addr) {
+ addr = PAGE_ALIGN(addr);
+ vma = find_vma(mm, addr);
+ if (end - len >= addr &&
+ (!vma || addr + len <= vm_start_gap(vma)))
+ return addr;
+ }
+
+ info.flags = 0;
+ info.length = len;
+ info.low_limit = begin;
+ info.high_limit = end;
+ info.align_mask = 0;
+ info.align_offset = pgoff << PAGE_SHIFT;
+ if (filp) {
+ info.align_mask = get_align_mask();
+ info.align_offset += get_align_bits();
+ }
+ return vm_unmapped_area(&info);
+}
+
+unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
+ const unsigned long len, const unsigned long pgoff,
+ const unsigned long flags)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm = current->mm;
+ unsigned long addr = addr0;
+ struct vm_unmapped_area_info info;
+
+ addr = mpx_unmapped_area_check(addr, len, flags);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
+ /* requested length too big for entire address space */
+ if (len > TASK_SIZE)
+ return -ENOMEM;
+
+ /* No address checking. See comment at mmap_address_hint_valid() */
+ if (flags & MAP_FIXED)
+ return addr;
+
+ /* for MAP_32BIT mappings we force the legacy mmap base */
+ if (!in_compat_syscall() && (flags & MAP_32BIT))
+ goto bottomup;
+
+ /* requesting a specific address */
+ if (addr) {
+ addr &= PAGE_MASK;
+ if (!mmap_address_hint_valid(addr, len))
+ goto get_unmapped_area;
+
+ vma = find_vma(mm, addr);
+ if (!vma || addr + len <= vm_start_gap(vma))
+ return addr;
+ }
+get_unmapped_area:
+
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = PAGE_SIZE;
+ info.high_limit = get_mmap_base(0);
+
+ /*
+ * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+ * in the full address space.
+ *
+ * !in_compat_syscall() check to avoid high addresses for x32.
+ */
+ if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
+ info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
+
+ info.align_mask = 0;
+ info.align_offset = pgoff << PAGE_SHIFT;
+ if (filp) {
+ info.align_mask = get_align_mask();
+ info.align_offset += get_align_bits();
+ }
+ addr = vm_unmapped_area(&info);
+ if (!(addr & ~PAGE_MASK))
+ return addr;
+ VM_BUG_ON(addr != -ENOMEM);
+
+bottomup:
+ /*
+ * A failed mmap() very likely causes application failure,
+ * so fall back to the bottom-up function here. This scenario
+ * can happen with large stack limits and large mmap()
+ * allocations.
+ */
+ return arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
+}
diff --git a/arch/x86/kernel/sysfb.c b/arch/x86/kernel/sysfb.c
new file mode 100644
index 000000000..160386e9f
--- /dev/null
+++ b/arch/x86/kernel/sysfb.c
@@ -0,0 +1,74 @@
+/*
+ * Generic System Framebuffers on x86
+ * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+/*
+ * Simple-Framebuffer support for x86 systems
+ * Create a platform-device for any available boot framebuffer. The
+ * simple-framebuffer platform device is already available on DT systems, so
+ * this module parses the global "screen_info" object and creates a suitable
+ * platform device compatible with the "simple-framebuffer" DT object. If
+ * the framebuffer is incompatible, we instead create a legacy
+ * "vesa-framebuffer", "efi-framebuffer" or "platform-framebuffer" device and
+ * pass the screen_info as platform_data. This allows legacy drivers
+ * to pick these devices up without messing with simple-framebuffer drivers.
+ * The global "screen_info" is still valid at all times.
+ *
+ * If CONFIG_X86_SYSFB is not selected, we never register "simple-framebuffer"
+ * platform devices, but only use legacy framebuffer devices for
+ * backwards compatibility.
+ *
+ * TODO: We set the dev_id field of all platform-devices to 0. This allows
+ * other x86 OF/DT parsers to create such devices, too. However, they must
+ * start at offset 1 for this to work.
+ */
+
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/platform_data/simplefb.h>
+#include <linux/platform_device.h>
+#include <linux/screen_info.h>
+#include <asm/sysfb.h>
+
+static __init int sysfb_init(void)
+{
+ struct screen_info *si = &screen_info;
+ struct simplefb_platform_data mode;
+ struct platform_device *pd;
+ const char *name;
+ bool compatible;
+ int ret;
+
+ sysfb_apply_efi_quirks();
+
+ /* try to create a simple-framebuffer device */
+ compatible = parse_mode(si, &mode);
+ if (compatible) {
+ ret = create_simplefb(si, &mode);
+ if (!ret)
+ return 0;
+ }
+
+ /* if the FB is incompatible, create a legacy framebuffer device */
+ if (si->orig_video_isVGA == VIDEO_TYPE_EFI)
+ name = "efi-framebuffer";
+ else if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
+ name = "vesa-framebuffer";
+ else
+ name = "platform-framebuffer";
+
+ pd = platform_device_register_resndata(NULL, name, 0,
+ NULL, 0, si, sizeof(*si));
+ return PTR_ERR_OR_ZERO(pd);
+}
+
+/* must execute after PCI subsystem for EFI quirks */
+device_initcall(sysfb_init);
diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c
new file mode 100644
index 000000000..897da526e
--- /dev/null
+++ b/arch/x86/kernel/sysfb_efi.c
@@ -0,0 +1,285 @@
+/*
+ * Generic System Framebuffers on x86
+ * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
+ *
+ * EFI Quirks Copyright (c) 2006 Edgar Hucek <gimli@dark-green.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+/*
+ * EFI Quirks
+ * Several EFI systems do not correctly advertise their boot framebuffers.
+ * Hence, we use this static table of known broken machines and fix up the
+ * information so framebuffer drivers can load corectly.
+ */
+
+#include <linux/dmi.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/pci.h>
+#include <linux/screen_info.h>
+#include <video/vga.h>
+#include <asm/sysfb.h>
+
+enum {
+ OVERRIDE_NONE = 0x0,
+ OVERRIDE_BASE = 0x1,
+ OVERRIDE_STRIDE = 0x2,
+ OVERRIDE_HEIGHT = 0x4,
+ OVERRIDE_WIDTH = 0x8,
+};
+
+struct efifb_dmi_info efifb_dmi_list[] = {
+ [M_I17] = { "i17", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_I20] = { "i20", 0x80010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE }, /* guess */
+ [M_I20_SR] = { "imac7", 0x40010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE },
+ [M_I24] = { "i24", 0x80010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, /* guess */
+ [M_I24_8_1] = { "imac8", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+ [M_I24_10_1] = { "imac10", 0xc0010000, 2048 * 4, 1920, 1080, OVERRIDE_NONE },
+ [M_I27_11_1] = { "imac11", 0xc0010000, 2560 * 4, 2560, 1440, OVERRIDE_NONE },
+ [M_MINI]= { "mini", 0x80000000, 2048 * 4, 1024, 768, OVERRIDE_NONE },
+ [M_MINI_3_1] = { "mini31", 0x40010000, 1024 * 4, 1024, 768, OVERRIDE_NONE },
+ [M_MINI_4_1] = { "mini41", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+ [M_MB] = { "macbook", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ [M_MB_5_1] = { "macbook51", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ [M_MB_6_1] = { "macbook61", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ [M_MB_7_1] = { "macbook71", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ [M_MBA] = { "mba", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ /* 11" Macbook Air 3,1 passes the wrong stride */
+ [M_MBA_3] = { "mba3", 0, 2048 * 4, 0, 0, OVERRIDE_STRIDE },
+ [M_MBP] = { "mbp", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_MBP_2] = { "mbp2", 0, 0, 0, 0, OVERRIDE_NONE }, /* placeholder */
+ [M_MBP_2_2] = { "mbp22", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_MBP_SR] = { "mbp3", 0x80030000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_MBP_4] = { "mbp4", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+ [M_MBP_5_1] = { "mbp51", 0xc0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_MBP_5_2] = { "mbp52", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+ [M_MBP_5_3] = { "mbp53", 0xd0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_MBP_6_1] = { "mbp61", 0x90030000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+ [M_MBP_6_2] = { "mbp62", 0x90030000, 2048 * 4, 1680, 1050, OVERRIDE_NONE },
+ [M_MBP_7_1] = { "mbp71", 0xc0010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+ [M_MBP_8_2] = { "mbp82", 0x90010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
+ [M_UNKNOWN] = { NULL, 0, 0, 0, 0, OVERRIDE_NONE }
+};
+
+void efifb_setup_from_dmi(struct screen_info *si, const char *opt)
+{
+ int i;
+
+ for (i = 0; i < M_UNKNOWN; i++) {
+ if (efifb_dmi_list[i].base != 0 &&
+ !strcmp(opt, efifb_dmi_list[i].optname)) {
+ si->lfb_base = efifb_dmi_list[i].base;
+ si->lfb_linelength = efifb_dmi_list[i].stride;
+ si->lfb_width = efifb_dmi_list[i].width;
+ si->lfb_height = efifb_dmi_list[i].height;
+ }
+ }
+}
+
+#define choose_value(dmivalue, fwvalue, field, flags) ({ \
+ typeof(fwvalue) _ret_ = fwvalue; \
+ if ((flags) & (field)) \
+ _ret_ = dmivalue; \
+ else if ((fwvalue) == 0) \
+ _ret_ = dmivalue; \
+ _ret_; \
+ })
+
+static int __init efifb_set_system(const struct dmi_system_id *id)
+{
+ struct efifb_dmi_info *info = id->driver_data;
+
+ if (info->base == 0 && info->height == 0 && info->width == 0 &&
+ info->stride == 0)
+ return 0;
+
+ /* Trust the bootloader over the DMI tables */
+ if (screen_info.lfb_base == 0) {
+#if defined(CONFIG_PCI)
+ struct pci_dev *dev = NULL;
+ int found_bar = 0;
+#endif
+ if (info->base) {
+ screen_info.lfb_base = choose_value(info->base,
+ screen_info.lfb_base, OVERRIDE_BASE,
+ info->flags);
+
+#if defined(CONFIG_PCI)
+ /* make sure that the address in the table is actually
+ * on a VGA device's PCI BAR */
+
+ for_each_pci_dev(dev) {
+ int i;
+ if ((dev->class >> 8) != PCI_CLASS_DISPLAY_VGA)
+ continue;
+ for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+ resource_size_t start, end;
+ unsigned long flags;
+
+ flags = pci_resource_flags(dev, i);
+ if (!(flags & IORESOURCE_MEM))
+ continue;
+
+ if (flags & IORESOURCE_UNSET)
+ continue;
+
+ if (pci_resource_len(dev, i) == 0)
+ continue;
+
+ start = pci_resource_start(dev, i);
+ end = pci_resource_end(dev, i);
+ if (screen_info.lfb_base >= start &&
+ screen_info.lfb_base < end) {
+ found_bar = 1;
+ break;
+ }
+ }
+ }
+ if (!found_bar)
+ screen_info.lfb_base = 0;
+#endif
+ }
+ }
+ if (screen_info.lfb_base) {
+ screen_info.lfb_linelength = choose_value(info->stride,
+ screen_info.lfb_linelength, OVERRIDE_STRIDE,
+ info->flags);
+ screen_info.lfb_width = choose_value(info->width,
+ screen_info.lfb_width, OVERRIDE_WIDTH,
+ info->flags);
+ screen_info.lfb_height = choose_value(info->height,
+ screen_info.lfb_height, OVERRIDE_HEIGHT,
+ info->flags);
+ if (screen_info.orig_video_isVGA == 0)
+ screen_info.orig_video_isVGA = VIDEO_TYPE_EFI;
+ } else {
+ screen_info.lfb_linelength = 0;
+ screen_info.lfb_width = 0;
+ screen_info.lfb_height = 0;
+ screen_info.orig_video_isVGA = 0;
+ return 0;
+ }
+
+ printk(KERN_INFO "efifb: dmi detected %s - framebuffer at 0x%08x "
+ "(%dx%d, stride %d)\n", id->ident,
+ screen_info.lfb_base, screen_info.lfb_width,
+ screen_info.lfb_height, screen_info.lfb_linelength);
+
+ return 1;
+}
+
+#define EFIFB_DMI_SYSTEM_ID(vendor, name, enumid) \
+ { \
+ efifb_set_system, \
+ name, \
+ { \
+ DMI_MATCH(DMI_BIOS_VENDOR, vendor), \
+ DMI_MATCH(DMI_PRODUCT_NAME, name) \
+ }, \
+ &efifb_dmi_list[enumid] \
+ }
+
+static const struct dmi_system_id efifb_dmi_system_table[] __initconst = {
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac4,1", M_I17),
+ /* At least one of these two will be right; maybe both? */
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac5,1", M_I20),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac5,1", M_I20),
+ /* At least one of these two will be right; maybe both? */
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac6,1", M_I24),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac6,1", M_I24),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac7,1", M_I20_SR),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac8,1", M_I24_8_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac10,1", M_I24_10_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac11,1", M_I27_11_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "Macmini1,1", M_MINI),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini3,1", M_MINI_3_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini4,1", M_MINI_4_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook1,1", M_MB),
+ /* At least one of these two will be right; maybe both? */
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook2,1", M_MB),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook2,1", M_MB),
+ /* At least one of these two will be right; maybe both? */
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook3,1", M_MB),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook3,1", M_MB),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook4,1", M_MB),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook5,1", M_MB_5_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook6,1", M_MB_6_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook7,1", M_MB_7_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir1,1", M_MBA),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir3,1", M_MBA_3),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro1,1", M_MBP),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,1", M_MBP_2),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,2", M_MBP_2_2),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro2,1", M_MBP_2),
+ EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro3,1", M_MBP_SR),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro3,1", M_MBP_SR),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro4,1", M_MBP_4),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,1", M_MBP_5_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,2", M_MBP_5_2),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,3", M_MBP_5_3),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,1", M_MBP_6_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,2", M_MBP_6_2),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro7,1", M_MBP_7_1),
+ EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro8,2", M_MBP_8_2),
+ {},
+};
+
+/*
+ * Some devices have a portrait LCD but advertise a landscape resolution (and
+ * pitch). We simply swap width and height for these devices so that we can
+ * correctly deal with some of them coming with multiple resolutions.
+ */
+static const struct dmi_system_id efifb_dmi_swap_width_height[] __initconst = {
+ {
+ /*
+ * Lenovo MIIX310-10ICR, only some batches have the troublesome
+ * 800x1280 portrait screen. Luckily the portrait version has
+ * its own BIOS version, so we match on that.
+ */
+ .matches = {
+ DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "MIIX 310-10ICR"),
+ DMI_EXACT_MATCH(DMI_BIOS_VERSION, "1HCN44WW"),
+ },
+ },
+ {
+ /* Lenovo MIIX 320-10ICR with 800x1280 portrait screen */
+ .matches = {
+ DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ DMI_EXACT_MATCH(DMI_PRODUCT_VERSION,
+ "Lenovo MIIX 320-10ICR"),
+ },
+ },
+ {
+ /* Lenovo D330 with 800x1280 or 1200x1920 portrait screen */
+ .matches = {
+ DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"),
+ DMI_EXACT_MATCH(DMI_PRODUCT_VERSION,
+ "Lenovo ideapad D330-10IGM"),
+ },
+ },
+ {},
+};
+
+__init void sysfb_apply_efi_quirks(void)
+{
+ if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI ||
+ !(screen_info.capabilities & VIDEO_CAPABILITY_SKIP_QUIRKS))
+ dmi_check_system(efifb_dmi_system_table);
+
+ if (screen_info.orig_video_isVGA == VIDEO_TYPE_EFI &&
+ dmi_check_system(efifb_dmi_swap_width_height)) {
+ u16 temp = screen_info.lfb_width;
+
+ screen_info.lfb_width = screen_info.lfb_height;
+ screen_info.lfb_height = temp;
+ screen_info.lfb_linelength = 4 * screen_info.lfb_width;
+ }
+}
diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c
new file mode 100644
index 000000000..f3215346e
--- /dev/null
+++ b/arch/x86/kernel/sysfb_simplefb.c
@@ -0,0 +1,115 @@
+/*
+ * Generic System Framebuffers on x86
+ * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+/*
+ * simple-framebuffer probing
+ * Try to convert "screen_info" into a "simple-framebuffer" compatible mode.
+ * If the mode is incompatible, we return "false" and let the caller create
+ * legacy nodes instead.
+ */
+
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/platform_data/simplefb.h>
+#include <linux/platform_device.h>
+#include <linux/screen_info.h>
+#include <asm/sysfb.h>
+
+static const char simplefb_resname[] = "BOOTFB";
+static const struct simplefb_format formats[] = SIMPLEFB_FORMATS;
+
+/* try parsing x86 screen_info into a simple-framebuffer mode struct */
+__init bool parse_mode(const struct screen_info *si,
+ struct simplefb_platform_data *mode)
+{
+ const struct simplefb_format *f;
+ __u8 type;
+ unsigned int i;
+
+ type = si->orig_video_isVGA;
+ if (type != VIDEO_TYPE_VLFB && type != VIDEO_TYPE_EFI)
+ return false;
+
+ for (i = 0; i < ARRAY_SIZE(formats); ++i) {
+ f = &formats[i];
+ if (si->lfb_depth == f->bits_per_pixel &&
+ si->red_size == f->red.length &&
+ si->red_pos == f->red.offset &&
+ si->green_size == f->green.length &&
+ si->green_pos == f->green.offset &&
+ si->blue_size == f->blue.length &&
+ si->blue_pos == f->blue.offset &&
+ si->rsvd_size == f->transp.length &&
+ si->rsvd_pos == f->transp.offset) {
+ mode->format = f->name;
+ mode->width = si->lfb_width;
+ mode->height = si->lfb_height;
+ mode->stride = si->lfb_linelength;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+__init int create_simplefb(const struct screen_info *si,
+ const struct simplefb_platform_data *mode)
+{
+ struct platform_device *pd;
+ struct resource res;
+ u64 base, size;
+ u32 length;
+
+ /*
+ * If the 64BIT_BASE capability is set, ext_lfb_base will contain the
+ * upper half of the base address. Assemble the address, then make sure
+ * it is valid and we can actually access it.
+ */
+ base = si->lfb_base;
+ if (si->capabilities & VIDEO_CAPABILITY_64BIT_BASE)
+ base |= (u64)si->ext_lfb_base << 32;
+ if (!base || (u64)(resource_size_t)base != base) {
+ printk(KERN_DEBUG "sysfb: inaccessible VRAM base\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Don't use lfb_size as IORESOURCE size, since it may contain the
+ * entire VMEM, and thus require huge mappings. Use just the part we
+ * need, that is, the part where the framebuffer is located. But verify
+ * that it does not exceed the advertised VMEM.
+ * Note that in case of VBE, the lfb_size is shifted by 16 bits for
+ * historical reasons.
+ */
+ size = si->lfb_size;
+ if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
+ size <<= 16;
+ length = mode->height * mode->stride;
+ if (length > size) {
+ printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n");
+ return -EINVAL;
+ }
+ length = PAGE_ALIGN(length);
+
+ /* setup IORESOURCE_MEM as framebuffer memory */
+ memset(&res, 0, sizeof(res));
+ res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ res.name = simplefb_resname;
+ res.start = base;
+ res.end = res.start + length - 1;
+ if (res.end <= res.start)
+ return -EINVAL;
+
+ pd = platform_device_register_resndata(NULL, "simple-framebuffer", 0,
+ &res, 1, mode, sizeof(*mode));
+ return PTR_ERR_OR_ZERO(pd);
+}
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
new file mode 100644
index 000000000..a2486f444
--- /dev/null
+++ b/arch/x86/kernel/tboot.c
@@ -0,0 +1,541 @@
+/*
+ * tboot.c: main implementation of helper functions used by kernel for
+ * runtime support of Intel(R) Trusted Execution Technology
+ *
+ * Copyright (c) 2006-2009, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <linux/dma_remapping.h>
+#include <linux/init_task.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/dmar.h>
+#include <linux/cpu.h>
+#include <linux/pfn.h>
+#include <linux/mm.h>
+#include <linux/tboot.h>
+#include <linux/debugfs.h>
+
+#include <asm/realmode.h>
+#include <asm/processor.h>
+#include <asm/bootparam.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/swiotlb.h>
+#include <asm/fixmap.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/e820/api.h>
+#include <asm/io.h>
+
+#include "../realmode/rm/wakeup.h"
+
+/* Global pointer to shared data; NULL means no measured launch. */
+struct tboot *tboot __read_mostly;
+EXPORT_SYMBOL(tboot);
+
+/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
+#define AP_WAIT_TIMEOUT 1
+
+#undef pr_fmt
+#define pr_fmt(fmt) "tboot: " fmt
+
+static u8 tboot_uuid[16] __initdata = TBOOT_UUID;
+
+void __init tboot_probe(void)
+{
+ /* Look for valid page-aligned address for shared page. */
+ if (!boot_params.tboot_addr)
+ return;
+ /*
+ * also verify that it is mapped as we expect it before calling
+ * set_fixmap(), to reduce chance of garbage value causing crash
+ */
+ if (!e820__mapped_any(boot_params.tboot_addr,
+ boot_params.tboot_addr, E820_TYPE_RESERVED)) {
+ pr_warning("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n");
+ return;
+ }
+
+ /* Map and check for tboot UUID. */
+ set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr);
+ tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE);
+ if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
+ pr_warning("tboot at 0x%llx is invalid\n",
+ boot_params.tboot_addr);
+ tboot = NULL;
+ return;
+ }
+ if (tboot->version < 5) {
+ pr_warning("tboot version is invalid: %u\n", tboot->version);
+ tboot = NULL;
+ return;
+ }
+
+ pr_info("found shared page at phys addr 0x%llx:\n",
+ boot_params.tboot_addr);
+ pr_debug("version: %d\n", tboot->version);
+ pr_debug("log_addr: 0x%08x\n", tboot->log_addr);
+ pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry);
+ pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base);
+ pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
+}
+
+static pgd_t *tboot_pg_dir;
+static struct mm_struct tboot_mm = {
+ .mm_rb = RB_ROOT,
+ .pgd = swapper_pg_dir,
+ .mm_users = ATOMIC_INIT(2),
+ .mm_count = ATOMIC_INIT(1),
+ .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
+ .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
+ .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
+};
+
+static inline void switch_to_tboot_pt(void)
+{
+ write_cr3(virt_to_phys(tboot_pg_dir));
+}
+
+static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
+ pgprot_t prot)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset(&tboot_mm, vaddr);
+ p4d = p4d_alloc(&tboot_mm, pgd, vaddr);
+ if (!p4d)
+ return -1;
+ pud = pud_alloc(&tboot_mm, p4d, vaddr);
+ if (!pud)
+ return -1;
+ pmd = pmd_alloc(&tboot_mm, pud, vaddr);
+ if (!pmd)
+ return -1;
+ pte = pte_alloc_map(&tboot_mm, pmd, vaddr);
+ if (!pte)
+ return -1;
+ set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
+ pte_unmap(pte);
+
+ /*
+ * PTI poisons low addresses in the kernel page tables in the
+ * name of making them unusable for userspace. To execute
+ * code at such a low address, the poison must be cleared.
+ *
+ * Note: 'pgd' actually gets set in p4d_alloc() _or_
+ * pud_alloc() depending on 4/5-level paging.
+ */
+ pgd->pgd &= ~_PAGE_NX;
+
+ return 0;
+}
+
+static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn,
+ unsigned long nr)
+{
+ /* Reuse the original kernel mapping */
+ tboot_pg_dir = pgd_alloc(&tboot_mm);
+ if (!tboot_pg_dir)
+ return -1;
+
+ for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) {
+ if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC))
+ return -1;
+ }
+
+ return 0;
+}
+
+static void tboot_create_trampoline(void)
+{
+ u32 map_base, map_size;
+
+ /* Create identity map for tboot shutdown code. */
+ map_base = PFN_DOWN(tboot->tboot_base);
+ map_size = PFN_UP(tboot->tboot_size);
+ if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size))
+ panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n",
+ map_base, map_size);
+}
+
+#ifdef CONFIG_ACPI_SLEEP
+
+static void add_mac_region(phys_addr_t start, unsigned long size)
+{
+ struct tboot_mac_region *mr;
+ phys_addr_t end = start + size;
+
+ if (tboot->num_mac_regions >= MAX_TB_MAC_REGIONS)
+ panic("tboot: Too many MAC regions\n");
+
+ if (start && size) {
+ mr = &tboot->mac_regions[tboot->num_mac_regions++];
+ mr->start = round_down(start, PAGE_SIZE);
+ mr->size = round_up(end, PAGE_SIZE) - mr->start;
+ }
+}
+
+static int tboot_setup_sleep(void)
+{
+ int i;
+
+ tboot->num_mac_regions = 0;
+
+ for (i = 0; i < e820_table->nr_entries; i++) {
+ if ((e820_table->entries[i].type != E820_TYPE_RAM)
+ && (e820_table->entries[i].type != E820_TYPE_RESERVED_KERN))
+ continue;
+
+ add_mac_region(e820_table->entries[i].addr, e820_table->entries[i].size);
+ }
+
+ tboot->acpi_sinfo.kernel_s3_resume_vector =
+ real_mode_header->wakeup_start;
+
+ return 0;
+}
+
+#else /* no CONFIG_ACPI_SLEEP */
+
+static int tboot_setup_sleep(void)
+{
+ /* S3 shutdown requested, but S3 not supported by the kernel... */
+ BUG();
+ return -1;
+}
+
+#endif
+
+void tboot_shutdown(u32 shutdown_type)
+{
+ void (*shutdown)(void);
+
+ if (!tboot_enabled())
+ return;
+
+ /*
+ * if we're being called before the 1:1 mapping is set up then just
+ * return and let the normal shutdown happen; this should only be
+ * due to very early panic()
+ */
+ if (!tboot_pg_dir)
+ return;
+
+ /* if this is S3 then set regions to MAC */
+ if (shutdown_type == TB_SHUTDOWN_S3)
+ if (tboot_setup_sleep())
+ return;
+
+ tboot->shutdown_type = shutdown_type;
+
+ switch_to_tboot_pt();
+
+ shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry;
+ shutdown();
+
+ /* should not reach here */
+ while (1)
+ halt();
+}
+
+static void tboot_copy_fadt(const struct acpi_table_fadt *fadt)
+{
+#define TB_COPY_GAS(tbg, g) \
+ tbg.space_id = g.space_id; \
+ tbg.bit_width = g.bit_width; \
+ tbg.bit_offset = g.bit_offset; \
+ tbg.access_width = g.access_width; \
+ tbg.address = g.address;
+
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block);
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block);
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block);
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block);
+
+ /*
+ * We need phys addr of waking vector, but can't use virt_to_phys() on
+ * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys
+ * addr.
+ */
+ tboot->acpi_sinfo.wakeup_vector = fadt->facs +
+ offsetof(struct acpi_table_facs, firmware_waking_vector);
+}
+
+static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
+{
+ static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = {
+ /* S0,1,2: */ -1, -1, -1,
+ /* S3: */ TB_SHUTDOWN_S3,
+ /* S4: */ TB_SHUTDOWN_S4,
+ /* S5: */ TB_SHUTDOWN_S5 };
+
+ if (!tboot_enabled())
+ return 0;
+
+ tboot_copy_fadt(&acpi_gbl_FADT);
+ tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control;
+ tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control;
+ /* we always use the 32b wakeup vector */
+ tboot->acpi_sinfo.vector_width = 32;
+
+ if (sleep_state >= ACPI_S_STATE_COUNT ||
+ acpi_shutdown_map[sleep_state] == -1) {
+ pr_warning("unsupported sleep state 0x%x\n", sleep_state);
+ return -1;
+ }
+
+ tboot_shutdown(acpi_shutdown_map[sleep_state]);
+ return 0;
+}
+
+static int tboot_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b)
+{
+ if (!tboot_enabled())
+ return 0;
+
+ pr_warning("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)");
+ return -ENODEV;
+}
+
+static atomic_t ap_wfs_count;
+
+static int tboot_wait_for_aps(int num_aps)
+{
+ unsigned long timeout;
+
+ timeout = AP_WAIT_TIMEOUT*HZ;
+ while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps &&
+ timeout) {
+ mdelay(1);
+ timeout--;
+ }
+
+ if (timeout)
+ pr_warning("tboot wait for APs timeout\n");
+
+ return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps);
+}
+
+static int tboot_dying_cpu(unsigned int cpu)
+{
+ atomic_inc(&ap_wfs_count);
+ if (num_online_cpus() == 1) {
+ if (tboot_wait_for_aps(atomic_read(&ap_wfs_count)))
+ return -EBUSY;
+ }
+ return 0;
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+#define TBOOT_LOG_UUID { 0x26, 0x25, 0x19, 0xc0, 0x30, 0x6b, 0xb4, 0x4d, \
+ 0x4c, 0x84, 0xa3, 0xe9, 0x53, 0xb8, 0x81, 0x74 }
+
+#define TBOOT_SERIAL_LOG_ADDR 0x60000
+#define TBOOT_SERIAL_LOG_SIZE 0x08000
+#define LOG_MAX_SIZE_OFF 16
+#define LOG_BUF_OFF 24
+
+static uint8_t tboot_log_uuid[16] = TBOOT_LOG_UUID;
+
+static ssize_t tboot_log_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos)
+{
+ void __iomem *log_base;
+ u8 log_uuid[16];
+ u32 max_size;
+ void *kbuf;
+ int ret = -EFAULT;
+
+ log_base = ioremap_nocache(TBOOT_SERIAL_LOG_ADDR, TBOOT_SERIAL_LOG_SIZE);
+ if (!log_base)
+ return ret;
+
+ memcpy_fromio(log_uuid, log_base, sizeof(log_uuid));
+ if (memcmp(&tboot_log_uuid, log_uuid, sizeof(log_uuid)))
+ goto err_iounmap;
+
+ max_size = readl(log_base + LOG_MAX_SIZE_OFF);
+ if (*ppos >= max_size) {
+ ret = 0;
+ goto err_iounmap;
+ }
+
+ if (*ppos + count > max_size)
+ count = max_size - *ppos;
+
+ kbuf = kmalloc(count, GFP_KERNEL);
+ if (!kbuf) {
+ ret = -ENOMEM;
+ goto err_iounmap;
+ }
+
+ memcpy_fromio(kbuf, log_base + LOG_BUF_OFF + *ppos, count);
+ if (copy_to_user(user_buf, kbuf, count))
+ goto err_kfree;
+
+ *ppos += count;
+
+ ret = count;
+
+err_kfree:
+ kfree(kbuf);
+
+err_iounmap:
+ iounmap(log_base);
+
+ return ret;
+}
+
+static const struct file_operations tboot_log_fops = {
+ .read = tboot_log_read,
+ .llseek = default_llseek,
+};
+
+#endif /* CONFIG_DEBUG_FS */
+
+static __init int tboot_late_init(void)
+{
+ if (!tboot_enabled())
+ return 0;
+
+ tboot_create_trampoline();
+
+ atomic_set(&ap_wfs_count, 0);
+ cpuhp_setup_state(CPUHP_AP_X86_TBOOT_DYING, "x86/tboot:dying", NULL,
+ tboot_dying_cpu);
+#ifdef CONFIG_DEBUG_FS
+ debugfs_create_file("tboot_log", S_IRUSR,
+ arch_debugfs_dir, NULL, &tboot_log_fops);
+#endif
+
+ acpi_os_set_prepare_sleep(&tboot_sleep);
+ acpi_os_set_prepare_extended_sleep(&tboot_extended_sleep);
+ return 0;
+}
+
+late_initcall(tboot_late_init);
+
+/*
+ * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE)
+ */
+
+#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000
+#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000
+
+/* # pages for each config regs space - used by fixmap */
+#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \
+ TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT)
+
+/* offsets from pub/priv config space */
+#define TXTCR_HEAP_BASE 0x0300
+#define TXTCR_HEAP_SIZE 0x0308
+
+#define SHA1_SIZE 20
+
+struct sha1_hash {
+ u8 hash[SHA1_SIZE];
+};
+
+struct sinit_mle_data {
+ u32 version; /* currently 6 */
+ struct sha1_hash bios_acm_id;
+ u32 edx_senter_flags;
+ u64 mseg_valid;
+ struct sha1_hash sinit_hash;
+ struct sha1_hash mle_hash;
+ struct sha1_hash stm_hash;
+ struct sha1_hash lcp_policy_hash;
+ u32 lcp_policy_control;
+ u32 rlp_wakeup_addr;
+ u32 reserved;
+ u32 num_mdrs;
+ u32 mdrs_off;
+ u32 num_vtd_dmars;
+ u32 vtd_dmars_off;
+} __packed;
+
+struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl)
+{
+ void *heap_base, *heap_ptr, *config;
+
+ if (!tboot_enabled())
+ return dmar_tbl;
+
+ /*
+ * ACPI tables may not be DMA protected by tboot, so use DMAR copy
+ * SINIT saved in SinitMleData in TXT heap (which is DMA protected)
+ */
+
+ /* map config space in order to get heap addr */
+ config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES *
+ PAGE_SIZE);
+ if (!config)
+ return NULL;
+
+ /* now map TXT heap */
+ heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE),
+ *(u64 *)(config + TXTCR_HEAP_SIZE));
+ iounmap(config);
+ if (!heap_base)
+ return NULL;
+
+ /* walk heap to SinitMleData */
+ /* skip BiosData */
+ heap_ptr = heap_base + *(u64 *)heap_base;
+ /* skip OsMleData */
+ heap_ptr += *(u64 *)heap_ptr;
+ /* skip OsSinitData */
+ heap_ptr += *(u64 *)heap_ptr;
+ /* now points to SinitMleDataSize; set to SinitMleData */
+ heap_ptr += sizeof(u64);
+ /* get addr of DMAR table */
+ dmar_tbl = (struct acpi_table_header *)(heap_ptr +
+ ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off -
+ sizeof(u64));
+
+ /* don't unmap heap because dmar.c needs access to this */
+
+ return dmar_tbl;
+}
+
+int tboot_force_iommu(void)
+{
+ if (!tboot_enabled())
+ return 0;
+
+ if (intel_iommu_tboot_noforce)
+ return 1;
+
+ if (no_iommu || swiotlb || dmar_disabled)
+ pr_warning("Forcing Intel-IOMMU to enabled\n");
+
+ dmar_disabled = 0;
+#ifdef CONFIG_SWIOTLB
+ swiotlb = 0;
+#endif
+ no_iommu = 0;
+
+ return 1;
+}
diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c
new file mode 100644
index 000000000..f386bad09
--- /dev/null
+++ b/arch/x86/kernel/tce_64.c
@@ -0,0 +1,190 @@
+/*
+ * This file manages the translation entries for the IBM Calgary IOMMU.
+ *
+ * Derived from arch/powerpc/platforms/pseries/iommu.c
+ *
+ * Copyright (C) IBM Corporation, 2006
+ *
+ * Author: Jon Mason <jdmason@us.ibm.com>
+ * Author: Muli Ben-Yehuda <muli@il.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/bootmem.h>
+#include <asm/tce.h>
+#include <asm/calgary.h>
+#include <asm/proto.h>
+#include <asm/cacheflush.h>
+
+/* flush a tce at 'tceaddr' to main memory */
+static inline void flush_tce(void* tceaddr)
+{
+ /* a single tce can't cross a cache line */
+ if (boot_cpu_has(X86_FEATURE_CLFLUSH))
+ clflush(tceaddr);
+ else
+ wbinvd();
+}
+
+void tce_build(struct iommu_table *tbl, unsigned long index,
+ unsigned int npages, unsigned long uaddr, int direction)
+{
+ u64* tp;
+ u64 t;
+ u64 rpn;
+
+ t = (1 << TCE_READ_SHIFT);
+ if (direction != DMA_TO_DEVICE)
+ t |= (1 << TCE_WRITE_SHIFT);
+
+ tp = ((u64*)tbl->it_base) + index;
+
+ while (npages--) {
+ rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT;
+ t &= ~TCE_RPN_MASK;
+ t |= (rpn << TCE_RPN_SHIFT);
+
+ *tp = cpu_to_be64(t);
+ flush_tce(tp);
+
+ uaddr += PAGE_SIZE;
+ tp++;
+ }
+}
+
+void tce_free(struct iommu_table *tbl, long index, unsigned int npages)
+{
+ u64* tp;
+
+ tp = ((u64*)tbl->it_base) + index;
+
+ while (npages--) {
+ *tp = cpu_to_be64(0);
+ flush_tce(tp);
+ tp++;
+ }
+}
+
+static inline unsigned int table_size_to_number_of_entries(unsigned char size)
+{
+ /*
+ * size is the order of the table, 0-7
+ * smallest table is 8K entries, so shift result by 13 to
+ * multiply by 8K
+ */
+ return (1 << size) << 13;
+}
+
+static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
+{
+ unsigned int bitmapsz;
+ unsigned long bmppages;
+ int ret;
+
+ tbl->it_busno = dev->bus->number;
+
+ /* set the tce table size - measured in entries */
+ tbl->it_size = table_size_to_number_of_entries(specified_table_size);
+
+ /*
+ * number of bytes needed for the bitmap size in number of
+ * entries; we need one bit per entry
+ */
+ bitmapsz = tbl->it_size / BITS_PER_BYTE;
+ bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz));
+ if (!bmppages) {
+ printk(KERN_ERR "Calgary: cannot allocate bitmap\n");
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ tbl->it_map = (unsigned long*)bmppages;
+
+ memset(tbl->it_map, 0, bitmapsz);
+
+ tbl->it_hint = 0;
+
+ spin_lock_init(&tbl->it_lock);
+
+ return 0;
+
+done:
+ return ret;
+}
+
+int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar)
+{
+ struct iommu_table *tbl;
+ int ret;
+
+ if (pci_iommu(dev->bus)) {
+ printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n",
+ dev, pci_iommu(dev->bus));
+ BUG();
+ }
+
+ tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL);
+ if (!tbl) {
+ printk(KERN_ERR "Calgary: error allocating iommu_table\n");
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ ret = tce_table_setparms(dev, tbl);
+ if (ret)
+ goto free_tbl;
+
+ tbl->bbar = bbar;
+
+ set_pci_iommu(dev->bus, tbl);
+
+ return 0;
+
+free_tbl:
+ kfree(tbl);
+done:
+ return ret;
+}
+
+void * __init alloc_tce_table(void)
+{
+ unsigned int size;
+
+ size = table_size_to_number_of_entries(specified_table_size);
+ size *= TCE_ENTRY_SIZE;
+
+ return __alloc_bootmem_low(size, size, 0);
+}
+
+void __init free_tce_table(void *tbl)
+{
+ unsigned int size;
+
+ if (!tbl)
+ return;
+
+ size = table_size_to_number_of_entries(specified_table_size);
+ size *= TCE_ENTRY_SIZE;
+
+ free_bootmem(__pa(tbl), size);
+}
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
new file mode 100644
index 000000000..0680a2e9e
--- /dev/null
+++ b/arch/x86/kernel/time.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 1991,1992,1995 Linus Torvalds
+ * Copyright (c) 1994 Alan Modra
+ * Copyright (c) 1995 Markus Kuhn
+ * Copyright (c) 1996 Ingo Molnar
+ * Copyright (c) 1998 Andrea Arcangeli
+ * Copyright (c) 2002,2006 Vojtech Pavlik
+ * Copyright (c) 2003 Andi Kleen
+ *
+ */
+
+#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/i8253.h>
+#include <linux/time.h>
+#include <linux/export.h>
+
+#include <asm/vsyscall.h>
+#include <asm/x86_init.h>
+#include <asm/i8259.h>
+#include <asm/timer.h>
+#include <asm/hpet.h>
+#include <asm/time.h>
+
+unsigned long profile_pc(struct pt_regs *regs)
+{
+ unsigned long pc = instruction_pointer(regs);
+
+ if (!user_mode(regs) && in_lock_functions(pc)) {
+#ifdef CONFIG_FRAME_POINTER
+ return *(unsigned long *)(regs->bp + sizeof(long));
+#else
+ unsigned long *sp =
+ (unsigned long *)kernel_stack_pointer(regs);
+ /*
+ * Return address is either directly at stack pointer
+ * or above a saved flags. Eflags has bits 22-31 zero,
+ * kernel addresses don't.
+ */
+ if (sp[0] >> 22)
+ return sp[0];
+ if (sp[1] >> 22)
+ return sp[1];
+#endif
+ }
+ return pc;
+}
+EXPORT_SYMBOL(profile_pc);
+
+/*
+ * Default timer interrupt handler for PIT/HPET
+ */
+static irqreturn_t timer_interrupt(int irq, void *dev_id)
+{
+ global_clock_event->event_handler(global_clock_event);
+ return IRQ_HANDLED;
+}
+
+static struct irqaction irq0 = {
+ .handler = timer_interrupt,
+ .flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
+ .name = "timer"
+};
+
+static void __init setup_default_timer_irq(void)
+{
+ /*
+ * Unconditionally register the legacy timer; even without legacy
+ * PIC/PIT we need this for the HPET0 in legacy replacement mode.
+ */
+ if (setup_irq(0, &irq0))
+ pr_info("Failed to register legacy timer interrupt\n");
+}
+
+/* Default timer init function */
+void __init hpet_time_init(void)
+{
+ if (!hpet_enable())
+ setup_pit_timer();
+ setup_default_timer_irq();
+}
+
+static __init void x86_late_time_init(void)
+{
+ x86_init.timers.timer_init();
+ /*
+ * After PIT/HPET timers init, select and setup
+ * the final interrupt mode for delivering IRQs.
+ */
+ x86_init.irqs.intr_mode_init();
+ tsc_init();
+}
+
+/*
+ * Initialize TSC and delay the periodic timer init to
+ * late x86_late_time_init() so ioremap works.
+ */
+void __init time_init(void)
+{
+ late_time_init = x86_late_time_init;
+}
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
new file mode 100644
index 000000000..71d3fef1e
--- /dev/null
+++ b/arch/x86/kernel/tls.c
@@ -0,0 +1,321 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/user.h>
+#include <linux/regset.h>
+#include <linux/syscalls.h>
+#include <linux/nospec.h>
+
+#include <linux/uaccess.h>
+#include <asm/desc.h>
+#include <asm/ldt.h>
+#include <asm/processor.h>
+#include <asm/proto.h>
+
+#include "tls.h"
+
+/*
+ * sys_alloc_thread_area: get a yet unused TLS descriptor index.
+ */
+static int get_free_idx(void)
+{
+ struct thread_struct *t = &current->thread;
+ int idx;
+
+ for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
+ if (desc_empty(&t->tls_array[idx]))
+ return idx + GDT_ENTRY_TLS_MIN;
+ return -ESRCH;
+}
+
+static bool tls_desc_okay(const struct user_desc *info)
+{
+ /*
+ * For historical reasons (i.e. no one ever documented how any
+ * of the segmentation APIs work), user programs can and do
+ * assume that a struct user_desc that's all zeros except for
+ * entry_number means "no segment at all". This never actually
+ * worked. In fact, up to Linux 3.19, a struct user_desc like
+ * this would create a 16-bit read-write segment with base and
+ * limit both equal to zero.
+ *
+ * That was close enough to "no segment at all" until we
+ * hardened this function to disallow 16-bit TLS segments. Fix
+ * it up by interpreting these zeroed segments the way that they
+ * were almost certainly intended to be interpreted.
+ *
+ * The correct way to ask for "no segment at all" is to specify
+ * a user_desc that satisfies LDT_empty. To keep everything
+ * working, we accept both.
+ *
+ * Note that there's a similar kludge in modify_ldt -- look at
+ * the distinction between modes 1 and 0x11.
+ */
+ if (LDT_empty(info) || LDT_zero(info))
+ return true;
+
+ /*
+ * espfix is required for 16-bit data segments, but espfix
+ * only works for LDT segments.
+ */
+ if (!info->seg_32bit)
+ return false;
+
+ /* Only allow data segments in the TLS array. */
+ if (info->contents > 1)
+ return false;
+
+ /*
+ * Non-present segments with DPL 3 present an interesting attack
+ * surface. The kernel should handle such segments correctly,
+ * but TLS is very difficult to protect in a sandbox, so prevent
+ * such segments from being created.
+ *
+ * If userspace needs to remove a TLS entry, it can still delete
+ * it outright.
+ */
+ if (info->seg_not_present)
+ return false;
+
+ return true;
+}
+
+static void set_tls_desc(struct task_struct *p, int idx,
+ const struct user_desc *info, int n)
+{
+ struct thread_struct *t = &p->thread;
+ struct desc_struct *desc = &t->tls_array[idx - GDT_ENTRY_TLS_MIN];
+ int cpu;
+
+ /*
+ * We must not get preempted while modifying the TLS.
+ */
+ cpu = get_cpu();
+
+ while (n-- > 0) {
+ if (LDT_empty(info) || LDT_zero(info))
+ memset(desc, 0, sizeof(*desc));
+ else
+ fill_ldt(desc, info);
+ ++info;
+ ++desc;
+ }
+
+ if (t == &current->thread)
+ load_TLS(t, cpu);
+
+ put_cpu();
+}
+
+/*
+ * Set a given TLS descriptor:
+ */
+int do_set_thread_area(struct task_struct *p, int idx,
+ struct user_desc __user *u_info,
+ int can_allocate)
+{
+ struct user_desc info;
+ unsigned short __maybe_unused sel, modified_sel;
+
+ if (copy_from_user(&info, u_info, sizeof(info)))
+ return -EFAULT;
+
+ if (!tls_desc_okay(&info))
+ return -EINVAL;
+
+ if (idx == -1)
+ idx = info.entry_number;
+
+ /*
+ * index -1 means the kernel should try to find and
+ * allocate an empty descriptor:
+ */
+ if (idx == -1 && can_allocate) {
+ idx = get_free_idx();
+ if (idx < 0)
+ return idx;
+ if (put_user(idx, &u_info->entry_number))
+ return -EFAULT;
+ }
+
+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+ return -EINVAL;
+
+ set_tls_desc(p, idx, &info, 1);
+
+ /*
+ * If DS, ES, FS, or GS points to the modified segment, forcibly
+ * refresh it. Only needed on x86_64 because x86_32 reloads them
+ * on return to user mode.
+ */
+ modified_sel = (idx << 3) | 3;
+
+ if (p == current) {
+#ifdef CONFIG_X86_64
+ savesegment(ds, sel);
+ if (sel == modified_sel)
+ loadsegment(ds, sel);
+
+ savesegment(es, sel);
+ if (sel == modified_sel)
+ loadsegment(es, sel);
+
+ savesegment(fs, sel);
+ if (sel == modified_sel)
+ loadsegment(fs, sel);
+
+ savesegment(gs, sel);
+ if (sel == modified_sel)
+ load_gs_index(sel);
+#endif
+
+#ifdef CONFIG_X86_32_LAZY_GS
+ savesegment(gs, sel);
+ if (sel == modified_sel)
+ loadsegment(gs, sel);
+#endif
+ } else {
+#ifdef CONFIG_X86_64
+ if (p->thread.fsindex == modified_sel)
+ p->thread.fsbase = info.base_addr;
+
+ if (p->thread.gsindex == modified_sel)
+ p->thread.gsbase = info.base_addr;
+#endif
+ }
+
+ return 0;
+}
+
+SYSCALL_DEFINE1(set_thread_area, struct user_desc __user *, u_info)
+{
+ return do_set_thread_area(current, -1, u_info, 1);
+}
+
+
+/*
+ * Get the current Thread-Local Storage area:
+ */
+
+static void fill_user_desc(struct user_desc *info, int idx,
+ const struct desc_struct *desc)
+
+{
+ memset(info, 0, sizeof(*info));
+ info->entry_number = idx;
+ info->base_addr = get_desc_base(desc);
+ info->limit = get_desc_limit(desc);
+ info->seg_32bit = desc->d;
+ info->contents = desc->type >> 2;
+ info->read_exec_only = !(desc->type & 2);
+ info->limit_in_pages = desc->g;
+ info->seg_not_present = !desc->p;
+ info->useable = desc->avl;
+#ifdef CONFIG_X86_64
+ info->lm = desc->l;
+#endif
+}
+
+int do_get_thread_area(struct task_struct *p, int idx,
+ struct user_desc __user *u_info)
+{
+ struct user_desc info;
+ int index;
+
+ if (idx == -1 && get_user(idx, &u_info->entry_number))
+ return -EFAULT;
+
+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+ return -EINVAL;
+
+ index = idx - GDT_ENTRY_TLS_MIN;
+ index = array_index_nospec(index,
+ GDT_ENTRY_TLS_MAX - GDT_ENTRY_TLS_MIN + 1);
+
+ fill_user_desc(&info, idx, &p->thread.tls_array[index]);
+
+ if (copy_to_user(u_info, &info, sizeof(info)))
+ return -EFAULT;
+ return 0;
+}
+
+SYSCALL_DEFINE1(get_thread_area, struct user_desc __user *, u_info)
+{
+ return do_get_thread_area(current, -1, u_info);
+}
+
+int regset_tls_active(struct task_struct *target,
+ const struct user_regset *regset)
+{
+ struct thread_struct *t = &target->thread;
+ int n = GDT_ENTRY_TLS_ENTRIES;
+ while (n > 0 && desc_empty(&t->tls_array[n - 1]))
+ --n;
+ return n;
+}
+
+int regset_tls_get(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ void *kbuf, void __user *ubuf)
+{
+ const struct desc_struct *tls;
+
+ if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
+ (pos % sizeof(struct user_desc)) != 0 ||
+ (count % sizeof(struct user_desc)) != 0)
+ return -EINVAL;
+
+ pos /= sizeof(struct user_desc);
+ count /= sizeof(struct user_desc);
+
+ tls = &target->thread.tls_array[pos];
+
+ if (kbuf) {
+ struct user_desc *info = kbuf;
+ while (count-- > 0)
+ fill_user_desc(info++, GDT_ENTRY_TLS_MIN + pos++,
+ tls++);
+ } else {
+ struct user_desc __user *u_info = ubuf;
+ while (count-- > 0) {
+ struct user_desc info;
+ fill_user_desc(&info, GDT_ENTRY_TLS_MIN + pos++, tls++);
+ if (__copy_to_user(u_info++, &info, sizeof(info)))
+ return -EFAULT;
+ }
+ }
+
+ return 0;
+}
+
+int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES];
+ const struct user_desc *info;
+ int i;
+
+ if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
+ (pos % sizeof(struct user_desc)) != 0 ||
+ (count % sizeof(struct user_desc)) != 0)
+ return -EINVAL;
+
+ if (kbuf)
+ info = kbuf;
+ else if (__copy_from_user(infobuf, ubuf, count))
+ return -EFAULT;
+ else
+ info = infobuf;
+
+ for (i = 0; i < count / sizeof(struct user_desc); i++)
+ if (!tls_desc_okay(info + i))
+ return -EINVAL;
+
+ set_tls_desc(target,
+ GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)),
+ info, count / sizeof(struct user_desc));
+
+ return 0;
+}
diff --git a/arch/x86/kernel/tls.h b/arch/x86/kernel/tls.h
new file mode 100644
index 000000000..2f083a2fe
--- /dev/null
+++ b/arch/x86/kernel/tls.h
@@ -0,0 +1,21 @@
+/*
+ * Internal declarations for x86 TLS implementation functions.
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * Red Hat Author: Roland McGrath.
+ */
+
+#ifndef _ARCH_X86_KERNEL_TLS_H
+
+#include <linux/regset.h>
+
+extern user_regset_active_fn regset_tls_active;
+extern user_regset_get_fn regset_tls_get;
+extern user_regset_set_fn regset_tls_set;
+
+#endif /* _ARCH_X86_KERNEL_TLS_H */
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
new file mode 100644
index 000000000..738bf42b0
--- /dev/null
+++ b/arch/x86/kernel/topology.c
@@ -0,0 +1,175 @@
+/*
+ * Populate sysfs with topology information
+ *
+ * Written by: Matthew Dobson, IBM Corporation
+ * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ */
+#include <linux/nodemask.h>
+#include <linux/export.h>
+#include <linux/mmzone.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/irq.h>
+#include <asm/cpu.h>
+
+static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+#ifdef CONFIG_BOOTPARAM_HOTPLUG_CPU0
+static int cpu0_hotpluggable = 1;
+#else
+static int cpu0_hotpluggable;
+static int __init enable_cpu0_hotplug(char *str)
+{
+ cpu0_hotpluggable = 1;
+ return 1;
+}
+
+__setup("cpu0_hotplug", enable_cpu0_hotplug);
+#endif
+
+#ifdef CONFIG_DEBUG_HOTPLUG_CPU0
+/*
+ * This function offlines a CPU as early as possible and allows userspace to
+ * boot up without the CPU. The CPU can be onlined back by user after boot.
+ *
+ * This is only called for debugging CPU offline/online feature.
+ */
+int _debug_hotplug_cpu(int cpu, int action)
+{
+ struct device *dev = get_cpu_device(cpu);
+ int ret;
+
+ if (!cpu_is_hotpluggable(cpu))
+ return -EINVAL;
+
+ lock_device_hotplug();
+
+ switch (action) {
+ case 0:
+ ret = cpu_down(cpu);
+ if (!ret) {
+ pr_info("CPU %u is now offline\n", cpu);
+ dev->offline = true;
+ kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
+ } else
+ pr_debug("Can't offline CPU%d.\n", cpu);
+ break;
+ case 1:
+ ret = cpu_up(cpu);
+ if (!ret) {
+ dev->offline = false;
+ kobject_uevent(&dev->kobj, KOBJ_ONLINE);
+ } else {
+ pr_debug("Can't online CPU%d.\n", cpu);
+ }
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ unlock_device_hotplug();
+
+ return ret;
+}
+
+static int __init debug_hotplug_cpu(void)
+{
+ _debug_hotplug_cpu(0, 0);
+ return 0;
+}
+
+late_initcall_sync(debug_hotplug_cpu);
+#endif /* CONFIG_DEBUG_HOTPLUG_CPU0 */
+
+int arch_register_cpu(int num)
+{
+ struct cpuinfo_x86 *c = &cpu_data(num);
+
+ /*
+ * Currently CPU0 is only hotpluggable on Intel platforms. Other
+ * vendors can add hotplug support later.
+ * Xen PV guests don't support CPU0 hotplug at all.
+ */
+ if (c->x86_vendor != X86_VENDOR_INTEL ||
+ boot_cpu_has(X86_FEATURE_XENPV))
+ cpu0_hotpluggable = 0;
+
+ /*
+ * Two known BSP/CPU0 dependencies: Resume from suspend/hibernate
+ * depends on BSP. PIC interrupts depend on BSP.
+ *
+ * If the BSP depencies are under control, one can tell kernel to
+ * enable BSP hotplug. This basically adds a control file and
+ * one can attempt to offline BSP.
+ */
+ if (num == 0 && cpu0_hotpluggable) {
+ unsigned int irq;
+ /*
+ * We won't take down the boot processor on i386 if some
+ * interrupts only are able to be serviced by the BSP in PIC.
+ */
+ for_each_active_irq(irq) {
+ if (!IO_APIC_IRQ(irq) && irq_has_action(irq)) {
+ cpu0_hotpluggable = 0;
+ break;
+ }
+ }
+ }
+ if (num || cpu0_hotpluggable)
+ per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
+
+ return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
+}
+EXPORT_SYMBOL(arch_register_cpu);
+
+void arch_unregister_cpu(int num)
+{
+ unregister_cpu(&per_cpu(cpu_devices, num).cpu);
+}
+EXPORT_SYMBOL(arch_unregister_cpu);
+#else /* CONFIG_HOTPLUG_CPU */
+
+static int __init arch_register_cpu(int num)
+{
+ return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static int __init topology_init(void)
+{
+ int i;
+
+#ifdef CONFIG_NUMA
+ for_each_online_node(i)
+ register_one_node(i);
+#endif
+
+ for_each_present_cpu(i)
+ arch_register_cpu(i);
+
+ return 0;
+}
+subsys_initcall(topology_init);
diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c
new file mode 100644
index 000000000..b8e7abe00
--- /dev/null
+++ b/arch/x86/kernel/trace_clock.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * X86 trace clocks
+ */
+#include <asm/trace_clock.h>
+#include <asm/barrier.h>
+#include <asm/msr.h>
+
+/*
+ * trace_clock_x86_tsc(): A clock that is just the cycle counter.
+ *
+ * Unlike the other clocks, this is not in nanoseconds.
+ */
+u64 notrace trace_clock_x86_tsc(void)
+{
+ return rdtsc_ordered();
+}
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
new file mode 100644
index 000000000..5bd30c442
--- /dev/null
+++ b/arch/x86/kernel/tracepoint.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Code for supporting irq vector tracepoints.
+ *
+ * Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com>
+ *
+ */
+#include <linux/jump_label.h>
+#include <linux/atomic.h>
+
+#include <asm/hw_irq.h>
+#include <asm/desc.h>
+
+DEFINE_STATIC_KEY_FALSE(trace_pagefault_key);
+
+int trace_pagefault_reg(void)
+{
+ static_branch_inc(&trace_pagefault_key);
+ return 0;
+}
+
+void trace_pagefault_unreg(void)
+{
+ static_branch_dec(&trace_pagefault_key);
+}
+
+#ifdef CONFIG_SMP
+
+DEFINE_STATIC_KEY_FALSE(trace_resched_ipi_key);
+
+int trace_resched_ipi_reg(void)
+{
+ static_branch_inc(&trace_resched_ipi_key);
+ return 0;
+}
+
+void trace_resched_ipi_unreg(void)
+{
+ static_branch_dec(&trace_resched_ipi_key);
+}
+
+#endif
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
new file mode 100644
index 000000000..e6db47516
--- /dev/null
+++ b/arch/x86/kernel/traps.c
@@ -0,0 +1,975 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ *
+ * Pentium III FXSR, SSE support
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+/*
+ * Handle hardware traps and faults.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/context_tracking.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/spinlock.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+#include <linux/kgdb.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/ptrace.h>
+#include <linux/uprobes.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/kexec.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/timer.h>
+#include <linux/init.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/io.h>
+
+#if defined(CONFIG_EDAC)
+#include <linux/edac.h>
+#endif
+
+#include <asm/stacktrace.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+#include <linux/atomic.h>
+#include <asm/text-patching.h>
+#include <asm/ftrace.h>
+#include <asm/traps.h>
+#include <asm/desc.h>
+#include <asm/fpu/internal.h>
+#include <asm/cpu_entry_area.h>
+#include <asm/mce.h>
+#include <asm/fixmap.h>
+#include <asm/mach_traps.h>
+#include <asm/alternative.h>
+#include <asm/fpu/xstate.h>
+#include <asm/trace/mpx.h>
+#include <asm/mpx.h>
+#include <asm/vm86.h>
+#include <asm/umip.h>
+
+#ifdef CONFIG_X86_64
+#include <asm/x86_init.h>
+#include <asm/pgalloc.h>
+#include <asm/proto.h>
+#else
+#include <asm/processor-flags.h>
+#include <asm/setup.h>
+#include <asm/proto.h>
+#endif
+
+DECLARE_BITMAP(system_vectors, NR_VECTORS);
+
+static inline void cond_local_irq_enable(struct pt_regs *regs)
+{
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_enable();
+}
+
+static inline void cond_local_irq_disable(struct pt_regs *regs)
+{
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_disable();
+}
+
+/*
+ * In IST context, we explicitly disable preemption. This serves two
+ * purposes: it makes it much less likely that we would accidentally
+ * schedule in IST context and it will force a warning if we somehow
+ * manage to schedule by accident.
+ */
+void ist_enter(struct pt_regs *regs)
+{
+ if (user_mode(regs)) {
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+ } else {
+ /*
+ * We might have interrupted pretty much anything. In
+ * fact, if we're a machine check, we can even interrupt
+ * NMI processing. We don't want in_nmi() to return true,
+ * but we need to notify RCU.
+ */
+ rcu_nmi_enter();
+ }
+
+ preempt_disable();
+
+ /* This code is a bit fragile. Test it. */
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work");
+}
+
+void ist_exit(struct pt_regs *regs)
+{
+ preempt_enable_no_resched();
+
+ if (!user_mode(regs))
+ rcu_nmi_exit();
+}
+
+/**
+ * ist_begin_non_atomic() - begin a non-atomic section in an IST exception
+ * @regs: regs passed to the IST exception handler
+ *
+ * IST exception handlers normally cannot schedule. As a special
+ * exception, if the exception interrupted userspace code (i.e.
+ * user_mode(regs) would return true) and the exception was not
+ * a double fault, it can be safe to schedule. ist_begin_non_atomic()
+ * begins a non-atomic section within an ist_enter()/ist_exit() region.
+ * Callers are responsible for enabling interrupts themselves inside
+ * the non-atomic section, and callers must call ist_end_non_atomic()
+ * before ist_exit().
+ */
+void ist_begin_non_atomic(struct pt_regs *regs)
+{
+ BUG_ON(!user_mode(regs));
+
+ /*
+ * Sanity check: we need to be on the normal thread stack. This
+ * will catch asm bugs and any attempt to use ist_preempt_enable
+ * from double_fault.
+ */
+ BUG_ON(!on_thread_stack());
+
+ preempt_enable_no_resched();
+}
+
+/**
+ * ist_end_non_atomic() - begin a non-atomic section in an IST exception
+ *
+ * Ends a non-atomic section started with ist_begin_non_atomic().
+ */
+void ist_end_non_atomic(void)
+{
+ preempt_disable();
+}
+
+int is_valid_bugaddr(unsigned long addr)
+{
+ unsigned short ud;
+
+ if (addr < TASK_SIZE_MAX)
+ return 0;
+
+ if (probe_kernel_address((unsigned short *)addr, ud))
+ return 0;
+
+ return ud == INSN_UD0 || ud == INSN_UD2;
+}
+
+int fixup_bug(struct pt_regs *regs, int trapnr)
+{
+ if (trapnr != X86_TRAP_UD)
+ return 0;
+
+ switch (report_bug(regs->ip, regs)) {
+ case BUG_TRAP_TYPE_NONE:
+ case BUG_TRAP_TYPE_BUG:
+ break;
+
+ case BUG_TRAP_TYPE_WARN:
+ regs->ip += LEN_UD2;
+ return 1;
+ }
+
+ return 0;
+}
+
+static nokprobe_inline int
+do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
+ struct pt_regs *regs, long error_code)
+{
+ if (v8086_mode(regs)) {
+ /*
+ * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
+ * On nmi (interrupt 2), do_trap should not be called.
+ */
+ if (trapnr < X86_TRAP_UD) {
+ if (!handle_vm86_trap((struct kernel_vm86_regs *) regs,
+ error_code, trapnr))
+ return 0;
+ }
+ return -1;
+ }
+
+ if (!user_mode(regs)) {
+ if (fixup_exception(regs, trapnr))
+ return 0;
+
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = trapnr;
+ die(str, regs, error_code);
+ }
+
+ return -1;
+}
+
+static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr,
+ siginfo_t *info)
+{
+ unsigned long siaddr;
+ int sicode;
+
+ switch (trapnr) {
+ default:
+ return SEND_SIG_PRIV;
+
+ case X86_TRAP_DE:
+ sicode = FPE_INTDIV;
+ siaddr = uprobe_get_trap_addr(regs);
+ break;
+ case X86_TRAP_UD:
+ sicode = ILL_ILLOPN;
+ siaddr = uprobe_get_trap_addr(regs);
+ break;
+ case X86_TRAP_AC:
+ sicode = BUS_ADRALN;
+ siaddr = 0;
+ break;
+ }
+
+ info->si_signo = signr;
+ info->si_errno = 0;
+ info->si_code = sicode;
+ info->si_addr = (void __user *)siaddr;
+ return info;
+}
+
+static void
+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
+ long error_code, siginfo_t *info)
+{
+ struct task_struct *tsk = current;
+
+
+ if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
+ return;
+ /*
+ * We want error_code and trap_nr set for userspace faults and
+ * kernelspace faults which result in die(), but not
+ * kernelspace faults which are fixed up. die() gives the
+ * process no chance to handle the signal and notice the
+ * kernel fault information, so that won't result in polluting
+ * the information about previously queued, but not yet
+ * delivered, faults. See also do_general_protection below.
+ */
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = trapnr;
+
+ if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
+ printk_ratelimit()) {
+ pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
+ tsk->comm, tsk->pid, str,
+ regs->ip, regs->sp, error_code);
+ print_vma_addr(KERN_CONT " in ", regs->ip);
+ pr_cont("\n");
+ }
+
+ force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);
+}
+NOKPROBE_SYMBOL(do_trap);
+
+static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
+ unsigned long trapnr, int signr)
+{
+ siginfo_t info;
+
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+
+ /*
+ * WARN*()s end up here; fix them up before we call the
+ * notifier chain.
+ */
+ if (!user_mode(regs) && fixup_bug(regs, trapnr))
+ return;
+
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
+ NOTIFY_STOP) {
+ cond_local_irq_enable(regs);
+ clear_siginfo(&info);
+ do_trap(trapnr, signr, str, regs, error_code,
+ fill_trap_info(regs, signr, trapnr, &info));
+ }
+}
+
+#define DO_ERROR(trapnr, signr, str, name) \
+dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
+{ \
+ do_error_trap(regs, error_code, str, trapnr, signr); \
+}
+
+DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error)
+DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow)
+DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op)
+DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun)
+DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
+DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
+DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
+DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check)
+
+#ifdef CONFIG_VMAP_STACK
+__visible void __noreturn handle_stack_overflow(const char *message,
+ struct pt_regs *regs,
+ unsigned long fault_address)
+{
+ printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
+ (void *)fault_address, current->stack,
+ (char *)current->stack + THREAD_SIZE - 1);
+ die(message, regs, 0);
+
+ /* Be absolutely certain we don't return. */
+ panic(message);
+}
+#endif
+
+#ifdef CONFIG_X86_64
+/* Runs on IST stack */
+dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
+{
+ static const char str[] = "double fault";
+ struct task_struct *tsk = current;
+#ifdef CONFIG_VMAP_STACK
+ unsigned long cr2;
+#endif
+
+#ifdef CONFIG_X86_ESPFIX64
+ extern unsigned char native_irq_return_iret[];
+
+ /*
+ * If IRET takes a non-IST fault on the espfix64 stack, then we
+ * end up promoting it to a doublefault. In that case, take
+ * advantage of the fact that we're not using the normal (TSS.sp0)
+ * stack right now. We can write a fake #GP(0) frame at TSS.sp0
+ * and then modify our own IRET frame so that, when we return,
+ * we land directly at the #GP(0) vector with the stack already
+ * set up according to its expectations.
+ *
+ * The net result is that our #GP handler will think that we
+ * entered from usermode with the bad user context.
+ *
+ * No need for ist_enter here because we don't use RCU.
+ */
+ if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
+ regs->cs == __KERNEL_CS &&
+ regs->ip == (unsigned long)native_irq_return_iret)
+ {
+ struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
+
+ /*
+ * regs->sp points to the failing IRET frame on the
+ * ESPFIX64 stack. Copy it to the entry stack. This fills
+ * in gpregs->ss through gpregs->ip.
+ *
+ */
+ memmove(&gpregs->ip, (void *)regs->sp, 5*8);
+ gpregs->orig_ax = 0; /* Missing (lost) #GP error code */
+
+ /*
+ * Adjust our frame so that we return straight to the #GP
+ * vector with the expected RSP value. This is safe because
+ * we won't enable interupts or schedule before we invoke
+ * general_protection, so nothing will clobber the stack
+ * frame we just set up.
+ */
+ regs->ip = (unsigned long)general_protection;
+ regs->sp = (unsigned long)&gpregs->orig_ax;
+
+ return;
+ }
+#endif
+
+ ist_enter(regs);
+ notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
+
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_DF;
+
+#ifdef CONFIG_VMAP_STACK
+ /*
+ * If we overflow the stack into a guard page, the CPU will fail
+ * to deliver #PF and will send #DF instead. Similarly, if we
+ * take any non-IST exception while too close to the bottom of
+ * the stack, the processor will get a page fault while
+ * delivering the exception and will generate a double fault.
+ *
+ * According to the SDM (footnote in 6.15 under "Interrupt 14 -
+ * Page-Fault Exception (#PF):
+ *
+ * Processors update CR2 whenever a page fault is detected. If a
+ * second page fault occurs while an earlier page fault is being
+ * delivered, the faulting linear address of the second fault will
+ * overwrite the contents of CR2 (replacing the previous
+ * address). These updates to CR2 occur even if the page fault
+ * results in a double fault or occurs during the delivery of a
+ * double fault.
+ *
+ * The logic below has a small possibility of incorrectly diagnosing
+ * some errors as stack overflows. For example, if the IDT or GDT
+ * gets corrupted such that #GP delivery fails due to a bad descriptor
+ * causing #GP and we hit this condition while CR2 coincidentally
+ * points to the stack guard page, we'll think we overflowed the
+ * stack. Given that we're going to panic one way or another
+ * if this happens, this isn't necessarily worth fixing.
+ *
+ * If necessary, we could improve the test by only diagnosing
+ * a stack overflow if the saved RSP points within 47 bytes of
+ * the bottom of the stack: if RSP == tsk_stack + 48 and we
+ * take an exception, the stack is already aligned and there
+ * will be enough room SS, RSP, RFLAGS, CS, RIP, and a
+ * possible error code, so a stack overflow would *not* double
+ * fault. With any less space left, exception delivery could
+ * fail, and, as a practical matter, we've overflowed the
+ * stack even if the actual trigger for the double fault was
+ * something else.
+ */
+ cr2 = read_cr2();
+ if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
+ handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
+#endif
+
+#ifdef CONFIG_DOUBLEFAULT
+ df_debug(regs, error_code);
+#endif
+ /*
+ * This is always a kernel trap and never fixable (and thus must
+ * never return).
+ */
+ for (;;)
+ die(str, regs, error_code);
+}
+#endif
+
+dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
+{
+ const struct mpx_bndcsr *bndcsr;
+ siginfo_t *info;
+
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+ if (notify_die(DIE_TRAP, "bounds", regs, error_code,
+ X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
+ return;
+ cond_local_irq_enable(regs);
+
+ if (!user_mode(regs))
+ die("bounds", regs, error_code);
+
+ if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
+ /* The exception is not from Intel MPX */
+ goto exit_trap;
+ }
+
+ /*
+ * We need to look at BNDSTATUS to resolve this exception.
+ * A NULL here might mean that it is in its 'init state',
+ * which is all zeros which indicates MPX was not
+ * responsible for the exception.
+ */
+ bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
+ if (!bndcsr)
+ goto exit_trap;
+
+ trace_bounds_exception_mpx(bndcsr);
+ /*
+ * The error code field of the BNDSTATUS register communicates status
+ * information of a bound range exception #BR or operation involving
+ * bound directory.
+ */
+ switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) {
+ case 2: /* Bound directory has invalid entry. */
+ if (mpx_handle_bd_fault())
+ goto exit_trap;
+ break; /* Success, it was handled */
+ case 1: /* Bound violation. */
+ info = mpx_generate_siginfo(regs);
+ if (IS_ERR(info)) {
+ /*
+ * We failed to decode the MPX instruction. Act as if
+ * the exception was not caused by MPX.
+ */
+ goto exit_trap;
+ }
+ /*
+ * Success, we decoded the instruction and retrieved
+ * an 'info' containing the address being accessed
+ * which caused the exception. This information
+ * allows and application to possibly handle the
+ * #BR exception itself.
+ */
+ do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, info);
+ kfree(info);
+ break;
+ case 0: /* No exception caused by Intel MPX operations. */
+ goto exit_trap;
+ default:
+ die("bounds", regs, error_code);
+ }
+
+ return;
+
+exit_trap:
+ /*
+ * This path out is for all the cases where we could not
+ * handle the exception in some way (like allocating a
+ * table or telling userspace about it. We will also end
+ * up here if the kernel has MPX turned off at compile
+ * time..
+ */
+ do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL);
+}
+
+dotraplinkage void
+do_general_protection(struct pt_regs *regs, long error_code)
+{
+ struct task_struct *tsk;
+
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+ cond_local_irq_enable(regs);
+
+ if (static_cpu_has(X86_FEATURE_UMIP)) {
+ if (user_mode(regs) && fixup_umip_exception(regs))
+ return;
+ }
+
+ if (v8086_mode(regs)) {
+ local_irq_enable();
+ handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
+ return;
+ }
+
+ tsk = current;
+ if (!user_mode(regs)) {
+ if (fixup_exception(regs, X86_TRAP_GP))
+ return;
+
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_GP;
+ if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
+ X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
+ die("general protection fault", regs, error_code);
+ return;
+ }
+
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_GP;
+
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit()) {
+ pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx",
+ tsk->comm, task_pid_nr(tsk),
+ regs->ip, regs->sp, error_code);
+ print_vma_addr(KERN_CONT " in ", regs->ip);
+ pr_cont("\n");
+ }
+
+ force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
+}
+NOKPROBE_SYMBOL(do_general_protection);
+
+dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
+{
+#ifdef CONFIG_DYNAMIC_FTRACE
+ /*
+ * ftrace must be first, everything else may cause a recursive crash.
+ * See note by declaration of modifying_ftrace_code in ftrace.c
+ */
+ if (unlikely(atomic_read(&modifying_ftrace_code)) &&
+ ftrace_int3_handler(regs))
+ return;
+#endif
+ if (poke_int3_handler(regs))
+ return;
+
+ /*
+ * Use ist_enter despite the fact that we don't use an IST stack.
+ * We can be called from a kprobe in non-CONTEXT_KERNEL kernel
+ * mode or even during context tracking state changes.
+ *
+ * This means that we can't schedule. That's okay.
+ */
+ ist_enter(regs);
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
+ if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
+ SIGTRAP) == NOTIFY_STOP)
+ goto exit;
+#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
+
+#ifdef CONFIG_KPROBES
+ if (kprobe_int3_handler(regs))
+ goto exit;
+#endif
+
+ if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
+ SIGTRAP) == NOTIFY_STOP)
+ goto exit;
+
+ cond_local_irq_enable(regs);
+ do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
+ cond_local_irq_disable(regs);
+
+exit:
+ ist_exit(regs);
+}
+NOKPROBE_SYMBOL(do_int3);
+
+#ifdef CONFIG_X86_64
+/*
+ * Help handler running on a per-cpu (IST or entry trampoline) stack
+ * to switch to the normal thread stack if the interrupted code was in
+ * user mode. The actual stack switch is done in entry_64.S
+ */
+asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
+{
+ struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
+ if (regs != eregs)
+ *regs = *eregs;
+ return regs;
+}
+NOKPROBE_SYMBOL(sync_regs);
+
+struct bad_iret_stack {
+ void *error_entry_ret;
+ struct pt_regs regs;
+};
+
+asmlinkage __visible notrace
+struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
+{
+ /*
+ * This is called from entry_64.S early in handling a fault
+ * caused by a bad iret to user mode. To handle the fault
+ * correctly, we want to move our stack frame to where it would
+ * be had we entered directly on the entry stack (rather than
+ * just below the IRET frame) and we want to pretend that the
+ * exception came from the IRET target.
+ */
+ struct bad_iret_stack *new_stack =
+ (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
+
+ /* Copy the IRET target to the new stack. */
+ memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
+
+ /* Copy the remainder of the stack from the current stack. */
+ memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
+
+ BUG_ON(!user_mode(&new_stack->regs));
+ return new_stack;
+}
+NOKPROBE_SYMBOL(fixup_bad_iret);
+#endif
+
+static bool is_sysenter_singlestep(struct pt_regs *regs)
+{
+ /*
+ * We don't try for precision here. If we're anywhere in the region of
+ * code that can be single-stepped in the SYSENTER entry path, then
+ * assume that this is a useless single-step trap due to SYSENTER
+ * being invoked with TF set. (We don't know in advance exactly
+ * which instructions will be hit because BTF could plausibly
+ * be set.)
+ */
+#ifdef CONFIG_X86_32
+ return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) <
+ (unsigned long)__end_SYSENTER_singlestep_region -
+ (unsigned long)__begin_SYSENTER_singlestep_region;
+#elif defined(CONFIG_IA32_EMULATION)
+ return (regs->ip - (unsigned long)entry_SYSENTER_compat) <
+ (unsigned long)__end_entry_SYSENTER_compat -
+ (unsigned long)entry_SYSENTER_compat;
+#else
+ return false;
+#endif
+}
+
+/*
+ * Our handling of the processor debug registers is non-trivial.
+ * We do not clear them on entry and exit from the kernel. Therefore
+ * it is possible to get a watchpoint trap here from inside the kernel.
+ * However, the code in ./ptrace.c has ensured that the user can
+ * only set watchpoints on userspace addresses. Therefore the in-kernel
+ * watchpoint trap can only occur in code which is reading/writing
+ * from user space. Such code must not hold kernel locks (since it
+ * can equally take a page fault), therefore it is safe to call
+ * force_sig_info even though that claims and releases locks.
+ *
+ * Code in ./signal.c ensures that the debug control register
+ * is restored before we deliver any signal, and therefore that
+ * user code runs with the correct debug control register even though
+ * we clear it here.
+ *
+ * Being careful here means that we don't have to be as careful in a
+ * lot of more complicated places (task switching can be a bit lazy
+ * about restoring all the debug state, and ptrace doesn't have to
+ * find every occurrence of the TF bit that could be saved away even
+ * by user code)
+ *
+ * May run on IST stack.
+ */
+dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
+{
+ struct task_struct *tsk = current;
+ int user_icebp = 0;
+ unsigned long dr6;
+ int si_code;
+
+ ist_enter(regs);
+
+ get_debugreg(dr6, 6);
+ /*
+ * The Intel SDM says:
+ *
+ * Certain debug exceptions may clear bits 0-3. The remaining
+ * contents of the DR6 register are never cleared by the
+ * processor. To avoid confusion in identifying debug
+ * exceptions, debug handlers should clear the register before
+ * returning to the interrupted task.
+ *
+ * Keep it simple: clear DR6 immediately.
+ */
+ set_debugreg(0, 6);
+
+ /* Filter out all the reserved bits which are preset to 1 */
+ dr6 &= ~DR6_RESERVED;
+
+ /*
+ * The SDM says "The processor clears the BTF flag when it
+ * generates a debug exception." Clear TIF_BLOCKSTEP to keep
+ * TIF_BLOCKSTEP in sync with the hardware BTF flag.
+ */
+ clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
+
+ if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) &&
+ is_sysenter_singlestep(regs))) {
+ dr6 &= ~DR_STEP;
+ if (!dr6)
+ goto exit;
+ /*
+ * else we might have gotten a single-step trap and hit a
+ * watchpoint at the same time, in which case we should fall
+ * through and handle the watchpoint.
+ */
+ }
+
+ /*
+ * If dr6 has no reason to give us about the origin of this trap,
+ * then it's very likely the result of an icebp/int01 trap.
+ * User wants a sigtrap for that.
+ */
+ if (!dr6 && user_mode(regs))
+ user_icebp = 1;
+
+ /* Store the virtualized DR6 value */
+ tsk->thread.debugreg6 = dr6;
+
+#ifdef CONFIG_KPROBES
+ if (kprobe_debug_handler(regs))
+ goto exit;
+#endif
+
+ if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,
+ SIGTRAP) == NOTIFY_STOP)
+ goto exit;
+
+ /*
+ * Let others (NMI) know that the debug stack is in use
+ * as we may switch to the interrupt stack.
+ */
+ debug_stack_usage_inc();
+
+ /* It's safe to allow irq's after DR6 has been saved */
+ cond_local_irq_enable(regs);
+
+ if (v8086_mode(regs)) {
+ handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
+ X86_TRAP_DB);
+ cond_local_irq_disable(regs);
+ debug_stack_usage_dec();
+ goto exit;
+ }
+
+ if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) {
+ /*
+ * Historical junk that used to handle SYSENTER single-stepping.
+ * This should be unreachable now. If we survive for a while
+ * without anyone hitting this warning, we'll turn this into
+ * an oops.
+ */
+ tsk->thread.debugreg6 &= ~DR_STEP;
+ set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+ regs->flags &= ~X86_EFLAGS_TF;
+ }
+ si_code = get_si_code(tsk->thread.debugreg6);
+ if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
+ send_sigtrap(tsk, regs, error_code, si_code);
+ cond_local_irq_disable(regs);
+ debug_stack_usage_dec();
+
+exit:
+ ist_exit(regs);
+}
+NOKPROBE_SYMBOL(do_debug);
+
+/*
+ * Note that we play around with the 'TS' bit in an attempt to get
+ * the correct behaviour even in the presence of the asynchronous
+ * IRQ13 behaviour
+ */
+static void math_error(struct pt_regs *regs, int error_code, int trapnr)
+{
+ struct task_struct *task = current;
+ struct fpu *fpu = &task->thread.fpu;
+ siginfo_t info;
+ char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
+ "simd exception";
+
+ cond_local_irq_enable(regs);
+
+ if (!user_mode(regs)) {
+ if (fixup_exception(regs, trapnr))
+ return;
+
+ task->thread.error_code = error_code;
+ task->thread.trap_nr = trapnr;
+
+ if (notify_die(DIE_TRAP, str, regs, error_code,
+ trapnr, SIGFPE) != NOTIFY_STOP)
+ die(str, regs, error_code);
+ return;
+ }
+
+ /*
+ * Save the info for the exception handler and clear the error.
+ */
+ fpu__save(fpu);
+
+ task->thread.trap_nr = trapnr;
+ task->thread.error_code = error_code;
+ clear_siginfo(&info);
+ info.si_signo = SIGFPE;
+ info.si_errno = 0;
+ info.si_addr = (void __user *)uprobe_get_trap_addr(regs);
+
+ info.si_code = fpu__exception_code(fpu, trapnr);
+
+ /* Retry when we get spurious exceptions: */
+ if (!info.si_code)
+ return;
+
+ force_sig_info(SIGFPE, &info, task);
+}
+
+dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
+{
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+ math_error(regs, error_code, X86_TRAP_MF);
+}
+
+dotraplinkage void
+do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
+{
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+ math_error(regs, error_code, X86_TRAP_XF);
+}
+
+dotraplinkage void
+do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
+{
+ cond_local_irq_enable(regs);
+}
+
+dotraplinkage void
+do_device_not_available(struct pt_regs *regs, long error_code)
+{
+ unsigned long cr0;
+
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+
+#ifdef CONFIG_MATH_EMULATION
+ if (!boot_cpu_has(X86_FEATURE_FPU) && (read_cr0() & X86_CR0_EM)) {
+ struct math_emu_info info = { };
+
+ cond_local_irq_enable(regs);
+
+ info.regs = regs;
+ math_emulate(&info);
+ return;
+ }
+#endif
+
+ /* This should not happen. */
+ cr0 = read_cr0();
+ if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) {
+ /* Try to fix it up and carry on. */
+ write_cr0(cr0 & ~X86_CR0_TS);
+ } else {
+ /*
+ * Something terrible happened, and we're better off trying
+ * to kill the task than getting stuck in a never-ending
+ * loop of #NM faults.
+ */
+ die("unexpected #NM exception", regs, error_code);
+ }
+}
+NOKPROBE_SYMBOL(do_device_not_available);
+
+#ifdef CONFIG_X86_32
+dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
+{
+ siginfo_t info;
+
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+ local_irq_enable();
+
+ clear_siginfo(&info);
+ info.si_signo = SIGILL;
+ info.si_errno = 0;
+ info.si_code = ILL_BADSTK;
+ info.si_addr = NULL;
+ if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
+ X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
+ do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
+ &info);
+ }
+}
+#endif
+
+void __init trap_init(void)
+{
+ /* Init cpu_entry_area before IST entries are set up */
+ setup_cpu_entry_areas();
+
+ idt_setup_traps();
+
+ /*
+ * Set the IDT descriptor to a fixed read-only location, so that the
+ * "sidt" instruction will not leak the location of the kernel, and
+ * to defend the IDT against arbitrary memory write vulnerabilities.
+ * It will be reloaded in cpu_init() */
+ cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
+ PAGE_KERNEL_RO);
+ idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
+
+ /*
+ * Should be a barrier for any external CPU state:
+ */
+ cpu_init();
+
+ idt_setup_ist_traps();
+
+ x86_init.irqs.trap_init();
+
+ idt_setup_debugidt_traps();
+}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
new file mode 100644
index 000000000..03b752933
--- /dev/null
+++ b/arch/x86/kernel/tsc.c
@@ -0,0 +1,1511 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/timer.h>
+#include <linux/acpi_pmtmr.h>
+#include <linux/cpufreq.h>
+#include <linux/delay.h>
+#include <linux/clocksource.h>
+#include <linux/percpu.h>
+#include <linux/timex.h>
+#include <linux/static_key.h>
+
+#include <asm/hpet.h>
+#include <asm/timer.h>
+#include <asm/vgtod.h>
+#include <asm/time.h>
+#include <asm/delay.h>
+#include <asm/hypervisor.h>
+#include <asm/nmi.h>
+#include <asm/x86_init.h>
+#include <asm/geode.h>
+#include <asm/apic.h>
+#include <asm/intel-family.h>
+#include <asm/i8259.h>
+#include <asm/uv/uv.h>
+
+unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
+EXPORT_SYMBOL(cpu_khz);
+
+unsigned int __read_mostly tsc_khz;
+EXPORT_SYMBOL(tsc_khz);
+
+#define KHZ 1000
+
+/*
+ * TSC can be unstable due to cpufreq or due to unsynced TSCs
+ */
+static int __read_mostly tsc_unstable;
+
+static DEFINE_STATIC_KEY_FALSE(__use_tsc);
+
+int tsc_clocksource_reliable;
+
+static u32 art_to_tsc_numerator;
+static u32 art_to_tsc_denominator;
+static u64 art_to_tsc_offset;
+struct clocksource *art_related_clocksource;
+
+struct cyc2ns {
+ struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */
+ seqcount_t seq; /* 32 + 4 = 36 */
+
+}; /* fits one cacheline */
+
+static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
+
+void __always_inline cyc2ns_read_begin(struct cyc2ns_data *data)
+{
+ int seq, idx;
+
+ preempt_disable_notrace();
+
+ do {
+ seq = this_cpu_read(cyc2ns.seq.sequence);
+ idx = seq & 1;
+
+ data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
+ data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
+ data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
+
+ } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
+}
+
+void __always_inline cyc2ns_read_end(void)
+{
+ preempt_enable_notrace();
+}
+
+/*
+ * Accelerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ * basic equation:
+ * ns = cycles / (freq / ns_per_sec)
+ * ns = cycles * (ns_per_sec / freq)
+ * ns = cycles * (10^9 / (cpu_khz * 10^3))
+ * ns = cycles * (10^6 / cpu_khz)
+ *
+ * Then we use scaling math (suggested by george@mvista.com) to get:
+ * ns = cycles * (10^6 * SC / cpu_khz) / SC
+ * ns = cycles * cyc2ns_scale / SC
+ *
+ * And since SC is a constant power of two, we can convert the div
+ * into a shift. The larger SC is, the more accurate the conversion, but
+ * cyc2ns_scale needs to be a 32-bit value so that 32-bit multiplication
+ * (64-bit result) can be used.
+ *
+ * We can use khz divisor instead of mhz to keep a better precision.
+ * (mathieu.desnoyers@polymtl.ca)
+ *
+ * -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+
+static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+ struct cyc2ns_data data;
+ unsigned long long ns;
+
+ cyc2ns_read_begin(&data);
+
+ ns = data.cyc2ns_offset;
+ ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
+
+ cyc2ns_read_end();
+
+ return ns;
+}
+
+static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
+{
+ unsigned long long ns_now;
+ struct cyc2ns_data data;
+ struct cyc2ns *c2n;
+
+ ns_now = cycles_2_ns(tsc_now);
+
+ /*
+ * Compute a new multiplier as per the above comment and ensure our
+ * time function is continuous; see the comment near struct
+ * cyc2ns_data.
+ */
+ clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
+ NSEC_PER_MSEC, 0);
+
+ /*
+ * cyc2ns_shift is exported via arch_perf_update_userpage() where it is
+ * not expected to be greater than 31 due to the original published
+ * conversion algorithm shifting a 32-bit value (now specifies a 64-bit
+ * value) - refer perf_event_mmap_page documentation in perf_event.h.
+ */
+ if (data.cyc2ns_shift == 32) {
+ data.cyc2ns_shift = 31;
+ data.cyc2ns_mul >>= 1;
+ }
+
+ data.cyc2ns_offset = ns_now -
+ mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift);
+
+ c2n = per_cpu_ptr(&cyc2ns, cpu);
+
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[0] = data;
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[1] = data;
+}
+
+static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ sched_clock_idle_sleep_event();
+
+ if (khz)
+ __set_cyc2ns_scale(khz, cpu, tsc_now);
+
+ sched_clock_idle_wakeup_event();
+ local_irq_restore(flags);
+}
+
+/*
+ * Initialize cyc2ns for boot cpu
+ */
+static void __init cyc2ns_init_boot_cpu(void)
+{
+ struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns);
+
+ seqcount_init(&c2n->seq);
+ __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc());
+}
+
+/*
+ * Secondary CPUs do not run through tsc_init(), so set up
+ * all the scale factors for all CPUs, assuming the same
+ * speed as the bootup CPU. (cpufreq notifiers will fix this
+ * up if their speed diverges)
+ */
+static void __init cyc2ns_init_secondary_cpus(void)
+{
+ unsigned int cpu, this_cpu = smp_processor_id();
+ struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns);
+ struct cyc2ns_data *data = c2n->data;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu != this_cpu) {
+ seqcount_init(&c2n->seq);
+ c2n = per_cpu_ptr(&cyc2ns, cpu);
+ c2n->data[0] = data[0];
+ c2n->data[1] = data[1];
+ }
+ }
+}
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ */
+u64 native_sched_clock(void)
+{
+ if (static_branch_likely(&__use_tsc)) {
+ u64 tsc_now = rdtsc();
+
+ /* return the value in ns */
+ return cycles_2_ns(tsc_now);
+ }
+
+ /*
+ * Fall back to jiffies if there's no TSC available:
+ * ( But note that we still use it if the TSC is marked
+ * unstable. We do this because unlike Time Of Day,
+ * the scheduler clock tolerates small errors and it's
+ * very important for it to be as fast as the platform
+ * can achieve it. )
+ */
+
+ /* No locking but a rare wrong value is not a big deal: */
+ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
+}
+
+/*
+ * Generate a sched_clock if you already have a TSC value.
+ */
+u64 native_sched_clock_from_tsc(u64 tsc)
+{
+ return cycles_2_ns(tsc);
+}
+
+/* We need to define a real function for sched_clock, to override the
+ weak default version */
+#ifdef CONFIG_PARAVIRT
+unsigned long long sched_clock(void)
+{
+ return paravirt_sched_clock();
+}
+
+bool using_native_sched_clock(void)
+{
+ return pv_time_ops.sched_clock == native_sched_clock;
+}
+#else
+unsigned long long
+sched_clock(void) __attribute__((alias("native_sched_clock")));
+
+bool using_native_sched_clock(void) { return true; }
+#endif
+
+int check_tsc_unstable(void)
+{
+ return tsc_unstable;
+}
+EXPORT_SYMBOL_GPL(check_tsc_unstable);
+
+#ifdef CONFIG_X86_TSC
+int __init notsc_setup(char *str)
+{
+ mark_tsc_unstable("boot parameter notsc");
+ return 1;
+}
+#else
+/*
+ * disable flag for tsc. Takes effect by clearing the TSC cpu flag
+ * in cpu/common.c
+ */
+int __init notsc_setup(char *str)
+{
+ setup_clear_cpu_cap(X86_FEATURE_TSC);
+ return 1;
+}
+#endif
+
+__setup("notsc", notsc_setup);
+
+static int no_sched_irq_time;
+
+static int __init tsc_setup(char *str)
+{
+ if (!strcmp(str, "reliable"))
+ tsc_clocksource_reliable = 1;
+ if (!strncmp(str, "noirqtime", 9))
+ no_sched_irq_time = 1;
+ if (!strcmp(str, "unstable"))
+ mark_tsc_unstable("boot parameter");
+ return 1;
+}
+
+__setup("tsc=", tsc_setup);
+
+#define MAX_RETRIES 5
+#define SMI_TRESHOLD 50000
+
+/*
+ * Read TSC and the reference counters. Take care of SMI disturbance
+ */
+static u64 tsc_read_refs(u64 *p, int hpet)
+{
+ u64 t1, t2;
+ int i;
+
+ for (i = 0; i < MAX_RETRIES; i++) {
+ t1 = get_cycles();
+ if (hpet)
+ *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
+ else
+ *p = acpi_pm_read_early();
+ t2 = get_cycles();
+ if ((t2 - t1) < SMI_TRESHOLD)
+ return t2;
+ }
+ return ULLONG_MAX;
+}
+
+/*
+ * Calculate the TSC frequency from HPET reference
+ */
+static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
+{
+ u64 tmp;
+
+ if (hpet2 < hpet1)
+ hpet2 += 0x100000000ULL;
+ hpet2 -= hpet1;
+ tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
+ do_div(tmp, 1000000);
+ deltatsc = div64_u64(deltatsc, tmp);
+
+ return (unsigned long) deltatsc;
+}
+
+/*
+ * Calculate the TSC frequency from PMTimer reference
+ */
+static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
+{
+ u64 tmp;
+
+ if (!pm1 && !pm2)
+ return ULONG_MAX;
+
+ if (pm2 < pm1)
+ pm2 += (u64)ACPI_PM_OVRRUN;
+ pm2 -= pm1;
+ tmp = pm2 * 1000000000LL;
+ do_div(tmp, PMTMR_TICKS_PER_SEC);
+ do_div(deltatsc, tmp);
+
+ return (unsigned long) deltatsc;
+}
+
+#define CAL_MS 10
+#define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS))
+#define CAL_PIT_LOOPS 1000
+
+#define CAL2_MS 50
+#define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS))
+#define CAL2_PIT_LOOPS 5000
+
+
+/*
+ * Try to calibrate the TSC against the Programmable
+ * Interrupt Timer and return the frequency of the TSC
+ * in kHz.
+ *
+ * Return ULONG_MAX on failure to calibrate.
+ */
+static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
+{
+ u64 tsc, t1, t2, delta;
+ unsigned long tscmin, tscmax;
+ int pitcnt;
+
+ if (!has_legacy_pic()) {
+ /*
+ * Relies on tsc_early_delay_calibrate() to have given us semi
+ * usable udelay(), wait for the same 50ms we would have with
+ * the PIT loop below.
+ */
+ udelay(10 * USEC_PER_MSEC);
+ udelay(10 * USEC_PER_MSEC);
+ udelay(10 * USEC_PER_MSEC);
+ udelay(10 * USEC_PER_MSEC);
+ udelay(10 * USEC_PER_MSEC);
+ return ULONG_MAX;
+ }
+
+ /* Set the Gate high, disable speaker */
+ outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+
+ /*
+ * Setup CTC channel 2* for mode 0, (interrupt on terminal
+ * count mode), binary count. Set the latch register to 50ms
+ * (LSB then MSB) to begin countdown.
+ */
+ outb(0xb0, 0x43);
+ outb(latch & 0xff, 0x42);
+ outb(latch >> 8, 0x42);
+
+ tsc = t1 = t2 = get_cycles();
+
+ pitcnt = 0;
+ tscmax = 0;
+ tscmin = ULONG_MAX;
+ while ((inb(0x61) & 0x20) == 0) {
+ t2 = get_cycles();
+ delta = t2 - tsc;
+ tsc = t2;
+ if ((unsigned long) delta < tscmin)
+ tscmin = (unsigned int) delta;
+ if ((unsigned long) delta > tscmax)
+ tscmax = (unsigned int) delta;
+ pitcnt++;
+ }
+
+ /*
+ * Sanity checks:
+ *
+ * If we were not able to read the PIT more than loopmin
+ * times, then we have been hit by a massive SMI
+ *
+ * If the maximum is 10 times larger than the minimum,
+ * then we got hit by an SMI as well.
+ */
+ if (pitcnt < loopmin || tscmax > 10 * tscmin)
+ return ULONG_MAX;
+
+ /* Calculate the PIT value */
+ delta = t2 - t1;
+ do_div(delta, ms);
+ return delta;
+}
+
+/*
+ * This reads the current MSB of the PIT counter, and
+ * checks if we are running on sufficiently fast and
+ * non-virtualized hardware.
+ *
+ * Our expectations are:
+ *
+ * - the PIT is running at roughly 1.19MHz
+ *
+ * - each IO is going to take about 1us on real hardware,
+ * but we allow it to be much faster (by a factor of 10) or
+ * _slightly_ slower (ie we allow up to a 2us read+counter
+ * update - anything else implies a unacceptably slow CPU
+ * or PIT for the fast calibration to work.
+ *
+ * - with 256 PIT ticks to read the value, we have 214us to
+ * see the same MSB (and overhead like doing a single TSC
+ * read per MSB value etc).
+ *
+ * - We're doing 2 reads per loop (LSB, MSB), and we expect
+ * them each to take about a microsecond on real hardware.
+ * So we expect a count value of around 100. But we'll be
+ * generous, and accept anything over 50.
+ *
+ * - if the PIT is stuck, and we see *many* more reads, we
+ * return early (and the next caller of pit_expect_msb()
+ * then consider it a failure when they don't see the
+ * next expected value).
+ *
+ * These expectations mean that we know that we have seen the
+ * transition from one expected value to another with a fairly
+ * high accuracy, and we didn't miss any events. We can thus
+ * use the TSC value at the transitions to calculate a pretty
+ * good value for the TSC frequencty.
+ */
+static inline int pit_verify_msb(unsigned char val)
+{
+ /* Ignore LSB */
+ inb(0x42);
+ return inb(0x42) == val;
+}
+
+static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
+{
+ int count;
+ u64 tsc = 0, prev_tsc = 0;
+
+ for (count = 0; count < 50000; count++) {
+ if (!pit_verify_msb(val))
+ break;
+ prev_tsc = tsc;
+ tsc = get_cycles();
+ }
+ *deltap = get_cycles() - prev_tsc;
+ *tscp = tsc;
+
+ /*
+ * We require _some_ success, but the quality control
+ * will be based on the error terms on the TSC values.
+ */
+ return count > 5;
+}
+
+/*
+ * How many MSB values do we want to see? We aim for
+ * a maximum error rate of 500ppm (in practice the
+ * real error is much smaller), but refuse to spend
+ * more than 50ms on it.
+ */
+#define MAX_QUICK_PIT_MS 50
+#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
+
+static unsigned long quick_pit_calibrate(void)
+{
+ int i;
+ u64 tsc, delta;
+ unsigned long d1, d2;
+
+ if (!has_legacy_pic())
+ return 0;
+
+ /* Set the Gate high, disable speaker */
+ outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+
+ /*
+ * Counter 2, mode 0 (one-shot), binary count
+ *
+ * NOTE! Mode 2 decrements by two (and then the
+ * output is flipped each time, giving the same
+ * final output frequency as a decrement-by-one),
+ * so mode 0 is much better when looking at the
+ * individual counts.
+ */
+ outb(0xb0, 0x43);
+
+ /* Start at 0xffff */
+ outb(0xff, 0x42);
+ outb(0xff, 0x42);
+
+ /*
+ * The PIT starts counting at the next edge, so we
+ * need to delay for a microsecond. The easiest way
+ * to do that is to just read back the 16-bit counter
+ * once from the PIT.
+ */
+ pit_verify_msb(0);
+
+ if (pit_expect_msb(0xff, &tsc, &d1)) {
+ for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
+ if (!pit_expect_msb(0xff-i, &delta, &d2))
+ break;
+
+ delta -= tsc;
+
+ /*
+ * Extrapolate the error and fail fast if the error will
+ * never be below 500 ppm.
+ */
+ if (i == 1 &&
+ d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11)
+ return 0;
+
+ /*
+ * Iterate until the error is less than 500 ppm
+ */
+ if (d1+d2 >= delta >> 11)
+ continue;
+
+ /*
+ * Check the PIT one more time to verify that
+ * all TSC reads were stable wrt the PIT.
+ *
+ * This also guarantees serialization of the
+ * last cycle read ('d2') in pit_expect_msb.
+ */
+ if (!pit_verify_msb(0xfe - i))
+ break;
+ goto success;
+ }
+ }
+ pr_info("Fast TSC calibration failed\n");
+ return 0;
+
+success:
+ /*
+ * Ok, if we get here, then we've seen the
+ * MSB of the PIT decrement 'i' times, and the
+ * error has shrunk to less than 500 ppm.
+ *
+ * As a result, we can depend on there not being
+ * any odd delays anywhere, and the TSC reads are
+ * reliable (within the error).
+ *
+ * kHz = ticks / time-in-seconds / 1000;
+ * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
+ * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
+ */
+ delta *= PIT_TICK_RATE;
+ do_div(delta, i*256*1000);
+ pr_info("Fast TSC calibration using PIT\n");
+ return delta;
+}
+
+/**
+ * native_calibrate_tsc
+ * Determine TSC frequency via CPUID, else return 0.
+ */
+unsigned long native_calibrate_tsc(void)
+{
+ unsigned int eax_denominator, ebx_numerator, ecx_hz, edx;
+ unsigned int crystal_khz;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 0;
+
+ if (boot_cpu_data.cpuid_level < 0x15)
+ return 0;
+
+ eax_denominator = ebx_numerator = ecx_hz = edx = 0;
+
+ /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
+ cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);
+
+ if (ebx_numerator == 0 || eax_denominator == 0)
+ return 0;
+
+ crystal_khz = ecx_hz / 1000;
+
+ if (crystal_khz == 0) {
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_SKYLAKE_MOBILE:
+ case INTEL_FAM6_SKYLAKE_DESKTOP:
+ case INTEL_FAM6_KABYLAKE_MOBILE:
+ case INTEL_FAM6_KABYLAKE_DESKTOP:
+ crystal_khz = 24000; /* 24.0 MHz */
+ break;
+ case INTEL_FAM6_ATOM_GOLDMONT_X:
+ crystal_khz = 25000; /* 25.0 MHz */
+ break;
+ case INTEL_FAM6_ATOM_GOLDMONT:
+ crystal_khz = 19200; /* 19.2 MHz */
+ break;
+ }
+ }
+
+ if (crystal_khz == 0)
+ return 0;
+ /*
+ * TSC frequency determined by CPUID is a "hardware reported"
+ * frequency and is the most accurate one so far we have. This
+ * is considered a known frequency.
+ */
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+ /*
+ * For Atom SoCs TSC is the only reliable clocksource.
+ * Mark TSC reliable so no watchdog on it.
+ */
+ if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT)
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
+ return crystal_khz * ebx_numerator / eax_denominator;
+}
+
+static unsigned long cpu_khz_from_cpuid(void)
+{
+ unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 0;
+
+ if (boot_cpu_data.cpuid_level < 0x16)
+ return 0;
+
+ eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0;
+
+ cpuid(0x16, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx);
+
+ return eax_base_mhz * 1000;
+}
+
+/*
+ * calibrate cpu using pit, hpet, and ptimer methods. They are available
+ * later in boot after acpi is initialized.
+ */
+static unsigned long pit_hpet_ptimer_calibrate_cpu(void)
+{
+ u64 tsc1, tsc2, delta, ref1, ref2;
+ unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
+ unsigned long flags, latch, ms;
+ int hpet = is_hpet_enabled(), i, loopmin;
+
+ /*
+ * Run 5 calibration loops to get the lowest frequency value
+ * (the best estimate). We use two different calibration modes
+ * here:
+ *
+ * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and
+ * load a timeout of 50ms. We read the time right after we
+ * started the timer and wait until the PIT count down reaches
+ * zero. In each wait loop iteration we read the TSC and check
+ * the delta to the previous read. We keep track of the min
+ * and max values of that delta. The delta is mostly defined
+ * by the IO time of the PIT access, so we can detect when a
+ * SMI/SMM disturbance happened between the two reads. If the
+ * maximum time is significantly larger than the minimum time,
+ * then we discard the result and have another try.
+ *
+ * 2) Reference counter. If available we use the HPET or the
+ * PMTIMER as a reference to check the sanity of that value.
+ * We use separate TSC readouts and check inside of the
+ * reference read for a SMI/SMM disturbance. We dicard
+ * disturbed values here as well. We do that around the PIT
+ * calibration delay loop as we have to wait for a certain
+ * amount of time anyway.
+ */
+
+ /* Preset PIT loop values */
+ latch = CAL_LATCH;
+ ms = CAL_MS;
+ loopmin = CAL_PIT_LOOPS;
+
+ for (i = 0; i < 3; i++) {
+ unsigned long tsc_pit_khz;
+
+ /*
+ * Read the start value and the reference count of
+ * hpet/pmtimer when available. Then do the PIT
+ * calibration, which will take at least 50ms, and
+ * read the end value.
+ */
+ local_irq_save(flags);
+ tsc1 = tsc_read_refs(&ref1, hpet);
+ tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
+ tsc2 = tsc_read_refs(&ref2, hpet);
+ local_irq_restore(flags);
+
+ /* Pick the lowest PIT TSC calibration so far */
+ tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
+
+ /* hpet or pmtimer available ? */
+ if (ref1 == ref2)
+ continue;
+
+ /* Check, whether the sampling was disturbed by an SMI */
+ if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
+ continue;
+
+ tsc2 = (tsc2 - tsc1) * 1000000LL;
+ if (hpet)
+ tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
+ else
+ tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
+
+ tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
+
+ /* Check the reference deviation */
+ delta = ((u64) tsc_pit_min) * 100;
+ do_div(delta, tsc_ref_min);
+
+ /*
+ * If both calibration results are inside a 10% window
+ * then we can be sure, that the calibration
+ * succeeded. We break out of the loop right away. We
+ * use the reference value, as it is more precise.
+ */
+ if (delta >= 90 && delta <= 110) {
+ pr_info("PIT calibration matches %s. %d loops\n",
+ hpet ? "HPET" : "PMTIMER", i + 1);
+ return tsc_ref_min;
+ }
+
+ /*
+ * Check whether PIT failed more than once. This
+ * happens in virtualized environments. We need to
+ * give the virtual PC a slightly longer timeframe for
+ * the HPET/PMTIMER to make the result precise.
+ */
+ if (i == 1 && tsc_pit_min == ULONG_MAX) {
+ latch = CAL2_LATCH;
+ ms = CAL2_MS;
+ loopmin = CAL2_PIT_LOOPS;
+ }
+ }
+
+ /*
+ * Now check the results.
+ */
+ if (tsc_pit_min == ULONG_MAX) {
+ /* PIT gave no useful value */
+ pr_warn("Unable to calibrate against PIT\n");
+
+ /* We don't have an alternative source, disable TSC */
+ if (!hpet && !ref1 && !ref2) {
+ pr_notice("No reference (HPET/PMTIMER) available\n");
+ return 0;
+ }
+
+ /* The alternative source failed as well, disable TSC */
+ if (tsc_ref_min == ULONG_MAX) {
+ pr_warn("HPET/PMTIMER calibration failed\n");
+ return 0;
+ }
+
+ /* Use the alternative source */
+ pr_info("using %s reference calibration\n",
+ hpet ? "HPET" : "PMTIMER");
+
+ return tsc_ref_min;
+ }
+
+ /* We don't have an alternative source, use the PIT calibration value */
+ if (!hpet && !ref1 && !ref2) {
+ pr_info("Using PIT calibration value\n");
+ return tsc_pit_min;
+ }
+
+ /* The alternative source failed, use the PIT calibration value */
+ if (tsc_ref_min == ULONG_MAX) {
+ pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n");
+ return tsc_pit_min;
+ }
+
+ /*
+ * The calibration values differ too much. In doubt, we use
+ * the PIT value as we know that there are PMTIMERs around
+ * running at double speed. At least we let the user know:
+ */
+ pr_warn("PIT calibration deviates from %s: %lu %lu\n",
+ hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
+ pr_info("Using PIT calibration value\n");
+ return tsc_pit_min;
+}
+
+/**
+ * native_calibrate_cpu_early - can calibrate the cpu early in boot
+ */
+unsigned long native_calibrate_cpu_early(void)
+{
+ unsigned long flags, fast_calibrate = cpu_khz_from_cpuid();
+
+ if (!fast_calibrate)
+ fast_calibrate = cpu_khz_from_msr();
+ if (!fast_calibrate) {
+ local_irq_save(flags);
+ fast_calibrate = quick_pit_calibrate();
+ local_irq_restore(flags);
+ }
+ return fast_calibrate;
+}
+
+
+/**
+ * native_calibrate_cpu - calibrate the cpu
+ */
+static unsigned long native_calibrate_cpu(void)
+{
+ unsigned long tsc_freq = native_calibrate_cpu_early();
+
+ if (!tsc_freq)
+ tsc_freq = pit_hpet_ptimer_calibrate_cpu();
+
+ return tsc_freq;
+}
+
+void recalibrate_cpu_khz(void)
+{
+#ifndef CONFIG_SMP
+ unsigned long cpu_khz_old = cpu_khz;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC))
+ return;
+
+ cpu_khz = x86_platform.calibrate_cpu();
+ tsc_khz = x86_platform.calibrate_tsc();
+ if (tsc_khz == 0)
+ tsc_khz = cpu_khz;
+ else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
+ cpu_khz = tsc_khz;
+ cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy,
+ cpu_khz_old, cpu_khz);
+#endif
+}
+
+EXPORT_SYMBOL(recalibrate_cpu_khz);
+
+
+static unsigned long long cyc2ns_suspend;
+
+void tsc_save_sched_clock_state(void)
+{
+ if (!sched_clock_stable())
+ return;
+
+ cyc2ns_suspend = sched_clock();
+}
+
+/*
+ * Even on processors with invariant TSC, TSC gets reset in some the
+ * ACPI system sleep states. And in some systems BIOS seem to reinit TSC to
+ * arbitrary value (still sync'd across cpu's) during resume from such sleep
+ * states. To cope up with this, recompute the cyc2ns_offset for each cpu so
+ * that sched_clock() continues from the point where it was left off during
+ * suspend.
+ */
+void tsc_restore_sched_clock_state(void)
+{
+ unsigned long long offset;
+ unsigned long flags;
+ int cpu;
+
+ if (!sched_clock_stable())
+ return;
+
+ local_irq_save(flags);
+
+ /*
+ * We're coming out of suspend, there's no concurrency yet; don't
+ * bother being nice about the RCU stuff, just write to both
+ * data fields.
+ */
+
+ this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
+ this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);
+
+ offset = cyc2ns_suspend - sched_clock();
+
+ for_each_possible_cpu(cpu) {
+ per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset;
+ per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset;
+ }
+
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_CPU_FREQ
+/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
+ * changes.
+ *
+ * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
+ * not that important because current Opteron setups do not support
+ * scaling on SMP anyroads.
+ *
+ * Should fix up last_tsc too. Currently gettimeofday in the
+ * first tick after the change will be slightly wrong.
+ */
+
+static unsigned int ref_freq;
+static unsigned long loops_per_jiffy_ref;
+static unsigned long tsc_khz_ref;
+
+static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ unsigned long *lpj;
+
+ lpj = &boot_cpu_data.loops_per_jiffy;
+#ifdef CONFIG_SMP
+ if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+ lpj = &cpu_data(freq->cpu).loops_per_jiffy;
+#endif
+
+ if (!ref_freq) {
+ ref_freq = freq->old;
+ loops_per_jiffy_ref = *lpj;
+ tsc_khz_ref = tsc_khz;
+ }
+ if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
+ (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
+ *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
+
+ tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
+ if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+ mark_tsc_unstable("cpufreq changes");
+
+ set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
+ }
+
+ return 0;
+}
+
+static struct notifier_block time_cpufreq_notifier_block = {
+ .notifier_call = time_cpufreq_notifier
+};
+
+static int __init cpufreq_register_tsc_scaling(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_TSC))
+ return 0;
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ return 0;
+ cpufreq_register_notifier(&time_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ return 0;
+}
+
+core_initcall(cpufreq_register_tsc_scaling);
+
+#endif /* CONFIG_CPU_FREQ */
+
+#define ART_CPUID_LEAF (0x15)
+#define ART_MIN_DENOMINATOR (1)
+
+
+/*
+ * If ART is present detect the numerator:denominator to convert to TSC
+ */
+static void __init detect_art(void)
+{
+ unsigned int unused[2];
+
+ if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF)
+ return;
+
+ /*
+ * Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required,
+ * and the TSC counter resets must not occur asynchronously.
+ */
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR) ||
+ !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
+ !boot_cpu_has(X86_FEATURE_TSC_ADJUST) ||
+ tsc_async_resets)
+ return;
+
+ cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
+ &art_to_tsc_numerator, unused, unused+1);
+
+ if (art_to_tsc_denominator < ART_MIN_DENOMINATOR)
+ return;
+
+ rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset);
+
+ /* Make this sticky over multiple CPU init calls */
+ setup_force_cpu_cap(X86_FEATURE_ART);
+}
+
+
+/* clocksource code */
+
+static void tsc_resume(struct clocksource *cs)
+{
+ tsc_verify_tsc_adjust(true);
+}
+
+/*
+ * We used to compare the TSC to the cycle_last value in the clocksource
+ * structure to avoid a nasty time-warp. This can be observed in a
+ * very small window right after one CPU updated cycle_last under
+ * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
+ * is smaller than the cycle_last reference value due to a TSC which
+ * is slighty behind. This delta is nowhere else observable, but in
+ * that case it results in a forward time jump in the range of hours
+ * due to the unsigned delta calculation of the time keeping core
+ * code, which is necessary to support wrapping clocksources like pm
+ * timer.
+ *
+ * This sanity check is now done in the core timekeeping code.
+ * checking the result of read_tsc() - cycle_last for being negative.
+ * That works because CLOCKSOURCE_MASK(64) does not mask out any bit.
+ */
+static u64 read_tsc(struct clocksource *cs)
+{
+ return (u64)rdtsc_ordered();
+}
+
+static void tsc_cs_mark_unstable(struct clocksource *cs)
+{
+ if (tsc_unstable)
+ return;
+
+ tsc_unstable = 1;
+ if (using_native_sched_clock())
+ clear_sched_clock_stable();
+ disable_sched_clock_irqtime();
+ pr_info("Marking TSC unstable due to clocksource watchdog\n");
+}
+
+static void tsc_cs_tick_stable(struct clocksource *cs)
+{
+ if (tsc_unstable)
+ return;
+
+ if (using_native_sched_clock())
+ sched_clock_tick_stable();
+}
+
+/*
+ * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
+ */
+static struct clocksource clocksource_tsc_early = {
+ .name = "tsc-early",
+ .rating = 299,
+ .read = read_tsc,
+ .mask = CLOCKSOURCE_MASK(64),
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS |
+ CLOCK_SOURCE_MUST_VERIFY,
+ .archdata = { .vclock_mode = VCLOCK_TSC },
+ .resume = tsc_resume,
+ .mark_unstable = tsc_cs_mark_unstable,
+ .tick_stable = tsc_cs_tick_stable,
+ .list = LIST_HEAD_INIT(clocksource_tsc_early.list),
+};
+
+/*
+ * Must mark VALID_FOR_HRES early such that when we unregister tsc_early
+ * this one will immediately take over. We will only register if TSC has
+ * been found good.
+ */
+static struct clocksource clocksource_tsc = {
+ .name = "tsc",
+ .rating = 300,
+ .read = read_tsc,
+ .mask = CLOCKSOURCE_MASK(64),
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS |
+ CLOCK_SOURCE_VALID_FOR_HRES |
+ CLOCK_SOURCE_MUST_VERIFY,
+ .archdata = { .vclock_mode = VCLOCK_TSC },
+ .resume = tsc_resume,
+ .mark_unstable = tsc_cs_mark_unstable,
+ .tick_stable = tsc_cs_tick_stable,
+ .list = LIST_HEAD_INIT(clocksource_tsc.list),
+};
+
+void mark_tsc_unstable(char *reason)
+{
+ if (tsc_unstable)
+ return;
+
+ tsc_unstable = 1;
+ if (using_native_sched_clock())
+ clear_sched_clock_stable();
+ disable_sched_clock_irqtime();
+ pr_info("Marking TSC unstable due to %s\n", reason);
+
+ clocksource_mark_unstable(&clocksource_tsc_early);
+ clocksource_mark_unstable(&clocksource_tsc);
+}
+
+EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+
+static void __init check_system_tsc_reliable(void)
+{
+#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
+ if (is_geode_lx()) {
+ /* RTSC counts during suspend */
+#define RTSC_SUSP 0x100
+ unsigned long res_low, res_high;
+
+ rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
+ /* Geode_LX - the OLPC CPU has a very reliable TSC */
+ if (res_low & RTSC_SUSP)
+ tsc_clocksource_reliable = 1;
+ }
+#endif
+ if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
+ tsc_clocksource_reliable = 1;
+}
+
+/*
+ * Make an educated guess if the TSC is trustworthy and synchronized
+ * over all CPUs.
+ */
+int unsynchronized_tsc(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_unstable)
+ return 1;
+
+#ifdef CONFIG_SMP
+ if (apic_is_clustered_box())
+ return 1;
+#endif
+
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ return 0;
+
+ if (tsc_clocksource_reliable)
+ return 0;
+ /*
+ * Intel systems are normally all synchronized.
+ * Exceptions must mark TSC as unstable:
+ */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
+ /* assume multi socket systems are not synchronized: */
+ if (num_possible_cpus() > 1)
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Convert ART to TSC given numerator/denominator found in detect_art()
+ */
+struct system_counterval_t convert_art_to_tsc(u64 art)
+{
+ u64 tmp, res, rem;
+
+ rem = do_div(art, art_to_tsc_denominator);
+
+ res = art * art_to_tsc_numerator;
+ tmp = rem * art_to_tsc_numerator;
+
+ do_div(tmp, art_to_tsc_denominator);
+ res += tmp + art_to_tsc_offset;
+
+ return (struct system_counterval_t) {.cs = art_related_clocksource,
+ .cycles = res};
+}
+EXPORT_SYMBOL(convert_art_to_tsc);
+
+/**
+ * convert_art_ns_to_tsc() - Convert ART in nanoseconds to TSC.
+ * @art_ns: ART (Always Running Timer) in unit of nanoseconds
+ *
+ * PTM requires all timestamps to be in units of nanoseconds. When user
+ * software requests a cross-timestamp, this function converts system timestamp
+ * to TSC.
+ *
+ * This is valid when CPU feature flag X86_FEATURE_TSC_KNOWN_FREQ is set
+ * indicating the tsc_khz is derived from CPUID[15H]. Drivers should check
+ * that this flag is set before conversion to TSC is attempted.
+ *
+ * Return:
+ * struct system_counterval_t - system counter value with the pointer to the
+ * corresponding clocksource
+ * @cycles: System counter value
+ * @cs: Clocksource corresponding to system counter value. Used
+ * by timekeeping code to verify comparibility of two cycle
+ * values.
+ */
+
+struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns)
+{
+ u64 tmp, res, rem;
+
+ rem = do_div(art_ns, USEC_PER_SEC);
+
+ res = art_ns * tsc_khz;
+ tmp = rem * tsc_khz;
+
+ do_div(tmp, USEC_PER_SEC);
+ res += tmp;
+
+ return (struct system_counterval_t) { .cs = art_related_clocksource,
+ .cycles = res};
+}
+EXPORT_SYMBOL(convert_art_ns_to_tsc);
+
+
+static void tsc_refine_calibration_work(struct work_struct *work);
+static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
+/**
+ * tsc_refine_calibration_work - Further refine tsc freq calibration
+ * @work - ignored.
+ *
+ * This functions uses delayed work over a period of a
+ * second to further refine the TSC freq value. Since this is
+ * timer based, instead of loop based, we don't block the boot
+ * process while this longer calibration is done.
+ *
+ * If there are any calibration anomalies (too many SMIs, etc),
+ * or the refined calibration is off by 1% of the fast early
+ * calibration, we throw out the new calibration and use the
+ * early calibration.
+ */
+static void tsc_refine_calibration_work(struct work_struct *work)
+{
+ static u64 tsc_start = -1, ref_start;
+ static int hpet;
+ u64 tsc_stop, ref_stop, delta;
+ unsigned long freq;
+ int cpu;
+
+ /* Don't bother refining TSC on unstable systems */
+ if (tsc_unstable)
+ goto unreg;
+
+ /*
+ * Since the work is started early in boot, we may be
+ * delayed the first time we expire. So set the workqueue
+ * again once we know timers are working.
+ */
+ if (tsc_start == -1) {
+ /*
+ * Only set hpet once, to avoid mixing hardware
+ * if the hpet becomes enabled later.
+ */
+ hpet = is_hpet_enabled();
+ schedule_delayed_work(&tsc_irqwork, HZ);
+ tsc_start = tsc_read_refs(&ref_start, hpet);
+ return;
+ }
+
+ tsc_stop = tsc_read_refs(&ref_stop, hpet);
+
+ /* hpet or pmtimer available ? */
+ if (ref_start == ref_stop)
+ goto out;
+
+ /* Check, whether the sampling was disturbed by an SMI */
+ if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
+ goto out;
+
+ delta = tsc_stop - tsc_start;
+ delta *= 1000000LL;
+ if (hpet)
+ freq = calc_hpet_ref(delta, ref_start, ref_stop);
+ else
+ freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
+
+ /* Make sure we're within 1% */
+ if (abs(tsc_khz - freq) > tsc_khz/100)
+ goto out;
+
+ tsc_khz = freq;
+ pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n",
+ (unsigned long)tsc_khz / 1000,
+ (unsigned long)tsc_khz % 1000);
+
+ /* Inform the TSC deadline clockevent devices about the recalibration */
+ lapic_update_tsc_freq();
+
+ /* Update the sched_clock() rate to match the clocksource one */
+ for_each_possible_cpu(cpu)
+ set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);
+
+out:
+ if (tsc_unstable)
+ goto unreg;
+
+ if (boot_cpu_has(X86_FEATURE_ART))
+ art_related_clocksource = &clocksource_tsc;
+ clocksource_register_khz(&clocksource_tsc, tsc_khz);
+unreg:
+ clocksource_unregister(&clocksource_tsc_early);
+}
+
+
+static int __init init_tsc_clocksource(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz)
+ return 0;
+
+ if (tsc_unstable)
+ goto unreg;
+
+ if (tsc_clocksource_reliable)
+ clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+
+ if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
+ clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
+
+ /*
+ * When TSC frequency is known (retrieved via MSR or CPUID), we skip
+ * the refined calibration and directly register it as a clocksource.
+ */
+ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {
+ if (boot_cpu_has(X86_FEATURE_ART))
+ art_related_clocksource = &clocksource_tsc;
+ clocksource_register_khz(&clocksource_tsc, tsc_khz);
+unreg:
+ clocksource_unregister(&clocksource_tsc_early);
+ return 0;
+ }
+
+ schedule_delayed_work(&tsc_irqwork, 0);
+ return 0;
+}
+/*
+ * We use device_initcall here, to ensure we run after the hpet
+ * is fully initialized, which may occur at fs_initcall time.
+ */
+device_initcall(init_tsc_clocksource);
+
+static bool __init determine_cpu_tsc_frequencies(bool early)
+{
+ /* Make sure that cpu and tsc are not already calibrated */
+ WARN_ON(cpu_khz || tsc_khz);
+
+ if (early) {
+ cpu_khz = x86_platform.calibrate_cpu();
+ tsc_khz = x86_platform.calibrate_tsc();
+ } else {
+ /* We should not be here with non-native cpu calibration */
+ WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu);
+ cpu_khz = pit_hpet_ptimer_calibrate_cpu();
+ }
+
+ /*
+ * Trust non-zero tsc_khz as authoritative,
+ * and use it to sanity check cpu_khz,
+ * which will be off if system timer is off.
+ */
+ if (tsc_khz == 0)
+ tsc_khz = cpu_khz;
+ else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz)
+ cpu_khz = tsc_khz;
+
+ if (tsc_khz == 0)
+ return false;
+
+ pr_info("Detected %lu.%03lu MHz processor\n",
+ (unsigned long)cpu_khz / KHZ,
+ (unsigned long)cpu_khz % KHZ);
+
+ if (cpu_khz != tsc_khz) {
+ pr_info("Detected %lu.%03lu MHz TSC",
+ (unsigned long)tsc_khz / KHZ,
+ (unsigned long)tsc_khz % KHZ);
+ }
+ return true;
+}
+
+static unsigned long __init get_loops_per_jiffy(void)
+{
+ u64 lpj = (u64)tsc_khz * KHZ;
+
+ do_div(lpj, HZ);
+ return lpj;
+}
+
+static void __init tsc_enable_sched_clock(void)
+{
+ /* Sanitize TSC ADJUST before cyc2ns gets initialized */
+ tsc_store_and_check_tsc_adjust(true);
+ cyc2ns_init_boot_cpu();
+ static_branch_enable(&__use_tsc);
+}
+
+void __init tsc_early_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_TSC))
+ return;
+ /* Don't change UV TSC multi-chassis synchronization */
+ if (is_early_uv_system())
+ return;
+ if (!determine_cpu_tsc_frequencies(true))
+ return;
+ loops_per_jiffy = get_loops_per_jiffy();
+
+ tsc_enable_sched_clock();
+}
+
+void __init tsc_init(void)
+{
+ /*
+ * native_calibrate_cpu_early can only calibrate using methods that are
+ * available early in boot.
+ */
+ if (x86_platform.calibrate_cpu == native_calibrate_cpu_early)
+ x86_platform.calibrate_cpu = native_calibrate_cpu;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC)) {
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+ return;
+ }
+
+ if (!tsc_khz) {
+ /* We failed to determine frequencies earlier, try again */
+ if (!determine_cpu_tsc_frequencies(false)) {
+ mark_tsc_unstable("could not calculate TSC khz");
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+ return;
+ }
+ tsc_enable_sched_clock();
+ }
+
+ cyc2ns_init_secondary_cpus();
+
+ if (!no_sched_irq_time)
+ enable_sched_clock_irqtime();
+
+ lpj_fine = get_loops_per_jiffy();
+ use_tsc_delay();
+
+ check_system_tsc_reliable();
+
+ if (unsynchronized_tsc()) {
+ mark_tsc_unstable("TSCs unsynchronized");
+ return;
+ }
+
+ clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
+ detect_art();
+}
+
+#ifdef CONFIG_SMP
+/*
+ * If we have a constant TSC and are using the TSC for the delay loop,
+ * we can skip clock calibration if another cpu in the same socket has already
+ * been calibrated. This assumes that CONSTANT_TSC applies to all
+ * cpus in the socket - this should be a safe assumption.
+ */
+unsigned long calibrate_delay_is_known(void)
+{
+ int sibling, cpu = smp_processor_id();
+ int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC);
+ const struct cpumask *mask = topology_core_cpumask(cpu);
+
+ if (!constant_tsc || !mask)
+ return 0;
+
+ sibling = cpumask_any_but(mask, cpu);
+ if (sibling < nr_cpu_ids)
+ return cpu_data(sibling).loops_per_jiffy;
+ return 0;
+}
+#endif
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
new file mode 100644
index 000000000..3d0e9aeea
--- /dev/null
+++ b/arch/x86/kernel/tsc_msr.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * TSC frequency enumeration via MSR
+ *
+ * Copyright (C) 2013, 2018 Intel Corporation
+ * Author: Bin Gao <bin.gao@intel.com>
+ */
+
+#include <linux/kernel.h>
+
+#include <asm/apic.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include <asm/msr.h>
+#include <asm/param.h>
+#include <asm/tsc.h>
+
+#define MAX_NUM_FREQS 9
+
+/*
+ * If MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be
+ * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40].
+ * Unfortunately some Intel Atom SoCs aren't quite compliant to this,
+ * so we need manually differentiate SoC families. This is what the
+ * field msr_plat does.
+ */
+struct freq_desc {
+ u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */
+ u32 freqs[MAX_NUM_FREQS];
+};
+
+/*
+ * Penwell and Clovertrail use spread spectrum clock,
+ * so the freq number is not exactly the same as reported
+ * by MSR based on SDM.
+ */
+static const struct freq_desc freq_desc_pnw = {
+ 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 }
+};
+
+static const struct freq_desc freq_desc_clv = {
+ 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 }
+};
+
+static const struct freq_desc freq_desc_byt = {
+ 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 }
+};
+
+static const struct freq_desc freq_desc_cht = {
+ 1, { 83300, 100000, 133300, 116700, 80000, 93300, 90000, 88900, 87500 }
+};
+
+static const struct freq_desc freq_desc_tng = {
+ 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 }
+};
+
+static const struct freq_desc freq_desc_ann = {
+ 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 }
+};
+
+static const struct x86_cpu_id tsc_msr_cpu_ids[] = {
+ INTEL_CPU_FAM6(ATOM_SALTWELL_MID, freq_desc_pnw),
+ INTEL_CPU_FAM6(ATOM_SALTWELL_TABLET, freq_desc_clv),
+ INTEL_CPU_FAM6(ATOM_SILVERMONT, freq_desc_byt),
+ INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, freq_desc_tng),
+ INTEL_CPU_FAM6(ATOM_AIRMONT, freq_desc_cht),
+ INTEL_CPU_FAM6(ATOM_AIRMONT_MID, freq_desc_ann),
+ {}
+};
+
+/*
+ * MSR-based CPU/TSC frequency discovery for certain CPUs.
+ *
+ * Set global "lapic_timer_frequency" to bus_clock_cycles/jiffy
+ * Return processor base frequency in KHz, or 0 on failure.
+ */
+unsigned long cpu_khz_from_msr(void)
+{
+ u32 lo, hi, ratio, freq;
+ const struct freq_desc *freq_desc;
+ const struct x86_cpu_id *id;
+ unsigned long res;
+
+ id = x86_match_cpu(tsc_msr_cpu_ids);
+ if (!id)
+ return 0;
+
+ freq_desc = (struct freq_desc *)id->driver_data;
+ if (freq_desc->msr_plat) {
+ rdmsr(MSR_PLATFORM_INFO, lo, hi);
+ ratio = (lo >> 8) & 0xff;
+ } else {
+ rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
+ ratio = (hi >> 8) & 0x1f;
+ }
+
+ /* Get FSB FREQ ID */
+ rdmsr(MSR_FSB_FREQ, lo, hi);
+
+ /* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */
+ freq = freq_desc->freqs[lo & 0x7];
+
+ /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */
+ res = freq * ratio;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ lapic_timer_frequency = (freq * 1000) / HZ;
+#endif
+
+ /*
+ * TSC frequency determined by MSR is always considered "known"
+ * because it is reported by HW.
+ * Another fact is that on MSR capable platforms, PIT/HPET is
+ * generally not available so calibration won't work at all.
+ */
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+ /*
+ * Unfortunately there is no way for hardware to tell whether the
+ * TSC is reliable. We were told by silicon design team that TSC
+ * on Atom SoCs are always "reliable". TSC is also the only
+ * reliable clocksource on these SoCs (HPET is either not present
+ * or not functional) so mark TSC reliable which removes the
+ * requirement for a watchdog clocksource.
+ */
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
+ return res;
+}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
new file mode 100644
index 000000000..ec534f978
--- /dev/null
+++ b/arch/x86/kernel/tsc_sync.c
@@ -0,0 +1,494 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * check TSC synchronization.
+ *
+ * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
+ *
+ * We check whether all boot CPUs have their TSC's synchronized,
+ * print a warning if not and turn off the TSC clock-source.
+ *
+ * The warp-check is point-to-point between two CPUs, the CPU
+ * initiating the bootup is the 'source CPU', the freshly booting
+ * CPU is the 'target CPU'.
+ *
+ * Only two CPUs may participate - they can enter in any order.
+ * ( The serial nature of the boot logic and the CPU hotplug lock
+ * protects against more than 2 CPUs entering this code. )
+ */
+#include <linux/topology.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/nmi.h>
+#include <asm/tsc.h>
+
+struct tsc_adjust {
+ s64 bootval;
+ s64 adjusted;
+ unsigned long nextcheck;
+ bool warned;
+};
+
+static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
+
+/*
+ * TSC's on different sockets may be reset asynchronously.
+ * This may cause the TSC ADJUST value on socket 0 to be NOT 0.
+ */
+bool __read_mostly tsc_async_resets;
+
+void mark_tsc_async_resets(char *reason)
+{
+ if (tsc_async_resets)
+ return;
+ tsc_async_resets = true;
+ pr_info("tsc: Marking TSC async resets true due to %s\n", reason);
+}
+
+void tsc_verify_tsc_adjust(bool resume)
+{
+ struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust);
+ s64 curval;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ return;
+
+ /* Skip unnecessary error messages if TSC already unstable */
+ if (check_tsc_unstable())
+ return;
+
+ /* Rate limit the MSR check */
+ if (!resume && time_before(jiffies, adj->nextcheck))
+ return;
+
+ adj->nextcheck = jiffies + HZ;
+
+ rdmsrl(MSR_IA32_TSC_ADJUST, curval);
+ if (adj->adjusted == curval)
+ return;
+
+ /* Restore the original value */
+ wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted);
+
+ if (!adj->warned || resume) {
+ pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n",
+ smp_processor_id(), adj->adjusted, curval);
+ adj->warned = true;
+ }
+}
+
+static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
+ unsigned int cpu, bool bootcpu)
+{
+ /*
+ * First online CPU in a package stores the boot value in the
+ * adjustment value. This value might change later via the sync
+ * mechanism. If that fails we still can yell about boot values not
+ * being consistent.
+ *
+ * On the boot cpu we just force set the ADJUST value to 0 if it's
+ * non zero. We don't do that on non boot cpus because physical
+ * hotplug should have set the ADJUST register to a value > 0 so
+ * the TSC is in sync with the already running cpus.
+ *
+ * Also don't force the ADJUST value to zero if that is a valid value
+ * for socket 0 as determined by the system arch. This is required
+ * when multiple sockets are reset asynchronously with each other
+ * and socket 0 may not have an TSC ADJUST value of 0.
+ */
+ if (bootcpu && bootval != 0) {
+ if (likely(!tsc_async_resets)) {
+ pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n",
+ cpu, bootval);
+ wrmsrl(MSR_IA32_TSC_ADJUST, 0);
+ bootval = 0;
+ } else {
+ pr_info("TSC ADJUST: CPU%u: %lld NOT forced to 0\n",
+ cpu, bootval);
+ }
+ }
+ cur->adjusted = bootval;
+}
+
+#ifndef CONFIG_SMP
+bool __init tsc_store_and_check_tsc_adjust(bool bootcpu)
+{
+ struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
+ s64 bootval;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ return false;
+
+ /* Skip unnecessary error messages if TSC already unstable */
+ if (check_tsc_unstable())
+ return false;
+
+ rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
+ cur->bootval = bootval;
+ cur->nextcheck = jiffies + HZ;
+ tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu);
+ return false;
+}
+
+#else /* !CONFIG_SMP */
+
+/*
+ * Store and check the TSC ADJUST MSR if available
+ */
+bool tsc_store_and_check_tsc_adjust(bool bootcpu)
+{
+ struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust);
+ unsigned int refcpu, cpu = smp_processor_id();
+ struct cpumask *mask;
+ s64 bootval;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ return false;
+
+ rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
+ cur->bootval = bootval;
+ cur->nextcheck = jiffies + HZ;
+ cur->warned = false;
+
+ /*
+ * If a non-zero TSC value for socket 0 may be valid then the default
+ * adjusted value cannot assumed to be zero either.
+ */
+ if (tsc_async_resets)
+ cur->adjusted = bootval;
+
+ /*
+ * Check whether this CPU is the first in a package to come up. In
+ * this case do not check the boot value against another package
+ * because the new package might have been physically hotplugged,
+ * where TSC_ADJUST is expected to be different. When called on the
+ * boot CPU topology_core_cpumask() might not be available yet.
+ */
+ mask = topology_core_cpumask(cpu);
+ refcpu = mask ? cpumask_any_but(mask, cpu) : nr_cpu_ids;
+
+ if (refcpu >= nr_cpu_ids) {
+ tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(),
+ bootcpu);
+ return false;
+ }
+
+ ref = per_cpu_ptr(&tsc_adjust, refcpu);
+ /*
+ * Compare the boot value and complain if it differs in the
+ * package.
+ */
+ if (bootval != ref->bootval)
+ printk_once(FW_BUG "TSC ADJUST differs within socket(s), fixing all errors\n");
+
+ /*
+ * The TSC_ADJUST values in a package must be the same. If the boot
+ * value on this newly upcoming CPU differs from the adjustment
+ * value of the already online CPU in this package, set it to that
+ * adjusted value.
+ */
+ if (bootval != ref->adjusted) {
+ cur->adjusted = ref->adjusted;
+ wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted);
+ }
+ /*
+ * We have the TSCs forced to be in sync on this package. Skip sync
+ * test:
+ */
+ return true;
+}
+
+/*
+ * Entry/exit counters that make sure that both CPUs
+ * run the measurement code at once:
+ */
+static atomic_t start_count;
+static atomic_t stop_count;
+static atomic_t skip_test;
+static atomic_t test_runs;
+
+/*
+ * We use a raw spinlock in this exceptional case, because
+ * we want to have the fastest, inlined, non-debug version
+ * of a critical section, to be able to prove TSC time-warps:
+ */
+static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
+
+static cycles_t last_tsc;
+static cycles_t max_warp;
+static int nr_warps;
+static int random_warps;
+
+/*
+ * TSC-warp measurement loop running on both CPUs. This is not called
+ * if there is no TSC.
+ */
+static cycles_t check_tsc_warp(unsigned int timeout)
+{
+ cycles_t start, now, prev, end, cur_max_warp = 0;
+ int i, cur_warps = 0;
+
+ start = rdtsc_ordered();
+ /*
+ * The measurement runs for 'timeout' msecs:
+ */
+ end = start + (cycles_t) tsc_khz * timeout;
+ now = start;
+
+ for (i = 0; ; i++) {
+ /*
+ * We take the global lock, measure TSC, save the
+ * previous TSC that was measured (possibly on
+ * another CPU) and update the previous TSC timestamp.
+ */
+ arch_spin_lock(&sync_lock);
+ prev = last_tsc;
+ now = rdtsc_ordered();
+ last_tsc = now;
+ arch_spin_unlock(&sync_lock);
+
+ /*
+ * Be nice every now and then (and also check whether
+ * measurement is done [we also insert a 10 million
+ * loops safety exit, so we dont lock up in case the
+ * TSC readout is totally broken]):
+ */
+ if (unlikely(!(i & 7))) {
+ if (now > end || i > 10000000)
+ break;
+ cpu_relax();
+ touch_nmi_watchdog();
+ }
+ /*
+ * Outside the critical section we can now see whether
+ * we saw a time-warp of the TSC going backwards:
+ */
+ if (unlikely(prev > now)) {
+ arch_spin_lock(&sync_lock);
+ max_warp = max(max_warp, prev - now);
+ cur_max_warp = max_warp;
+ /*
+ * Check whether this bounces back and forth. Only
+ * one CPU should observe time going backwards.
+ */
+ if (cur_warps != nr_warps)
+ random_warps++;
+ nr_warps++;
+ cur_warps = nr_warps;
+ arch_spin_unlock(&sync_lock);
+ }
+ }
+ WARN(!(now-start),
+ "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
+ now-start, end-start);
+ return cur_max_warp;
+}
+
+/*
+ * If the target CPU coming online doesn't have any of its core-siblings
+ * online, a timeout of 20msec will be used for the TSC-warp measurement
+ * loop. Otherwise a smaller timeout of 2msec will be used, as we have some
+ * information about this socket already (and this information grows as we
+ * have more and more logical-siblings in that socket).
+ *
+ * Ideally we should be able to skip the TSC sync check on the other
+ * core-siblings, if the first logical CPU in a socket passed the sync test.
+ * But as the TSC is per-logical CPU and can potentially be modified wrongly
+ * by the bios, TSC sync test for smaller duration should be able
+ * to catch such errors. Also this will catch the condition where all the
+ * cores in the socket doesn't get reset at the same time.
+ */
+static inline unsigned int loop_timeout(int cpu)
+{
+ return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20;
+}
+
+/*
+ * Source CPU calls into this - it waits for the freshly booted
+ * target CPU to arrive and then starts the measurement:
+ */
+void check_tsc_sync_source(int cpu)
+{
+ int cpus = 2;
+
+ /*
+ * No need to check if we already know that the TSC is not
+ * synchronized or if we have no TSC.
+ */
+ if (unsynchronized_tsc())
+ return;
+
+ /*
+ * Set the maximum number of test runs to
+ * 1 if the CPU does not provide the TSC_ADJUST MSR
+ * 3 if the MSR is available, so the target can try to adjust
+ */
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ atomic_set(&test_runs, 1);
+ else
+ atomic_set(&test_runs, 3);
+retry:
+ /*
+ * Wait for the target to start or to skip the test:
+ */
+ while (atomic_read(&start_count) != cpus - 1) {
+ if (atomic_read(&skip_test) > 0) {
+ atomic_set(&skip_test, 0);
+ return;
+ }
+ cpu_relax();
+ }
+
+ /*
+ * Trigger the target to continue into the measurement too:
+ */
+ atomic_inc(&start_count);
+
+ check_tsc_warp(loop_timeout(cpu));
+
+ while (atomic_read(&stop_count) != cpus-1)
+ cpu_relax();
+
+ /*
+ * If the test was successful set the number of runs to zero and
+ * stop. If not, decrement the number of runs an check if we can
+ * retry. In case of random warps no retry is attempted.
+ */
+ if (!nr_warps) {
+ atomic_set(&test_runs, 0);
+
+ pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
+ smp_processor_id(), cpu);
+
+ } else if (atomic_dec_and_test(&test_runs) || random_warps) {
+ /* Force it to 0 if random warps brought us here */
+ atomic_set(&test_runs, 0);
+
+ pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
+ smp_processor_id(), cpu);
+ pr_warning("Measured %Ld cycles TSC warp between CPUs, "
+ "turning off TSC clock.\n", max_warp);
+ if (random_warps)
+ pr_warning("TSC warped randomly between CPUs\n");
+ mark_tsc_unstable("check_tsc_sync_source failed");
+ }
+
+ /*
+ * Reset it - just in case we boot another CPU later:
+ */
+ atomic_set(&start_count, 0);
+ random_warps = 0;
+ nr_warps = 0;
+ max_warp = 0;
+ last_tsc = 0;
+
+ /*
+ * Let the target continue with the bootup:
+ */
+ atomic_inc(&stop_count);
+
+ /*
+ * Retry, if there is a chance to do so.
+ */
+ if (atomic_read(&test_runs) > 0)
+ goto retry;
+}
+
+/*
+ * Freshly booted CPUs call into this:
+ */
+void check_tsc_sync_target(void)
+{
+ struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
+ unsigned int cpu = smp_processor_id();
+ cycles_t cur_max_warp, gbl_max_warp;
+ int cpus = 2;
+
+ /* Also aborts if there is no TSC. */
+ if (unsynchronized_tsc())
+ return;
+
+ /*
+ * Store, verify and sanitize the TSC adjust register. If
+ * successful skip the test.
+ *
+ * The test is also skipped when the TSC is marked reliable. This
+ * is true for SoCs which have no fallback clocksource. On these
+ * SoCs the TSC is frequency synchronized, but still the TSC ADJUST
+ * register might have been wreckaged by the BIOS..
+ */
+ if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable) {
+ atomic_inc(&skip_test);
+ return;
+ }
+
+retry:
+ /*
+ * Register this CPU's participation and wait for the
+ * source CPU to start the measurement:
+ */
+ atomic_inc(&start_count);
+ while (atomic_read(&start_count) != cpus)
+ cpu_relax();
+
+ cur_max_warp = check_tsc_warp(loop_timeout(cpu));
+
+ /*
+ * Store the maximum observed warp value for a potential retry:
+ */
+ gbl_max_warp = max_warp;
+
+ /*
+ * Ok, we are done:
+ */
+ atomic_inc(&stop_count);
+
+ /*
+ * Wait for the source CPU to print stuff:
+ */
+ while (atomic_read(&stop_count) != cpus)
+ cpu_relax();
+
+ /*
+ * Reset it for the next sync test:
+ */
+ atomic_set(&stop_count, 0);
+
+ /*
+ * Check the number of remaining test runs. If not zero, the test
+ * failed and a retry with adjusted TSC is possible. If zero the
+ * test was either successful or failed terminally.
+ */
+ if (!atomic_read(&test_runs))
+ return;
+
+ /*
+ * If the warp value of this CPU is 0, then the other CPU
+ * observed time going backwards so this TSC was ahead and
+ * needs to move backwards.
+ */
+ if (!cur_max_warp)
+ cur_max_warp = -gbl_max_warp;
+
+ /*
+ * Add the result to the previous adjustment value.
+ *
+ * The adjustement value is slightly off by the overhead of the
+ * sync mechanism (observed values are ~200 TSC cycles), but this
+ * really depends on CPU, node distance and frequency. So
+ * compensating for this is hard to get right. Experiments show
+ * that the warp is not longer detectable when the observed warp
+ * value is used. In the worst case the adjustment needs to go
+ * through a 3rd run for fine tuning.
+ */
+ cur->adjusted += cur_max_warp;
+
+ pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
+ cpu, cur_max_warp, cur->adjusted);
+
+ wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted);
+ goto retry;
+
+}
+
+#endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
new file mode 100644
index 000000000..ff20b35e9
--- /dev/null
+++ b/arch/x86/kernel/umip.c
@@ -0,0 +1,429 @@
+/*
+ * umip.c Emulation for instruction protected by the Intel User-Mode
+ * Instruction Prevention feature
+ *
+ * Copyright (c) 2017, Intel Corporation.
+ * Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+ */
+
+#include <linux/uaccess.h>
+#include <asm/umip.h>
+#include <asm/traps.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <linux/ratelimit.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "umip: " fmt
+
+/** DOC: Emulation for User-Mode Instruction Prevention (UMIP)
+ *
+ * The feature User-Mode Instruction Prevention present in recent Intel
+ * processor prevents a group of instructions (sgdt, sidt, sldt, smsw, and str)
+ * from being executed with CPL > 0. Otherwise, a general protection fault is
+ * issued.
+ *
+ * Rather than relaying to the user space the general protection fault caused by
+ * the UMIP-protected instructions (in the form of a SIGSEGV signal), it can be
+ * trapped and emulate the result of such instructions to provide dummy values.
+ * This allows to both conserve the current kernel behavior and not reveal the
+ * system resources that UMIP intends to protect (i.e., the locations of the
+ * global descriptor and interrupt descriptor tables, the segment selectors of
+ * the local descriptor table, the value of the task state register and the
+ * contents of the CR0 register).
+ *
+ * This emulation is needed because certain applications (e.g., WineHQ and
+ * DOSEMU2) rely on this subset of instructions to function.
+ *
+ * The instructions protected by UMIP can be split in two groups. Those which
+ * return a kernel memory address (sgdt and sidt) and those which return a
+ * value (sldt, str and smsw).
+ *
+ * For the instructions that return a kernel memory address, applications
+ * such as WineHQ rely on the result being located in the kernel memory space,
+ * not the actual location of the table. The result is emulated as a hard-coded
+ * value that, lies close to the top of the kernel memory. The limit for the GDT
+ * and the IDT are set to zero.
+ *
+ * Given that sldt and str are not commonly used in programs that run on WineHQ
+ * or DOSEMU2, they are not emulated.
+ *
+ * The instruction smsw is emulated to return the value that the register CR0
+ * has at boot time as set in the head_32.
+ *
+ * Also, emulation is provided only for 32-bit processes; 64-bit processes
+ * that attempt to use the instructions that UMIP protects will receive the
+ * SIGSEGV signal issued as a consequence of the general protection fault.
+ *
+ * Care is taken to appropriately emulate the results when segmentation is
+ * used. That is, rather than relying on USER_DS and USER_CS, the function
+ * insn_get_addr_ref() inspects the segment descriptor pointed by the
+ * registers in pt_regs. This ensures that we correctly obtain the segment
+ * base address and the address and operand sizes even if the user space
+ * application uses a local descriptor table.
+ */
+
+#define UMIP_DUMMY_GDT_BASE 0xfffe0000
+#define UMIP_DUMMY_IDT_BASE 0xffff0000
+
+/*
+ * The SGDT and SIDT instructions store the contents of the global descriptor
+ * table and interrupt table registers, respectively. The destination is a
+ * memory operand of X+2 bytes. X bytes are used to store the base address of
+ * the table and 2 bytes are used to store the limit. In 32-bit processes, the
+ * only processes for which emulation is provided, X has a value of 4.
+ */
+#define UMIP_GDT_IDT_BASE_SIZE 4
+#define UMIP_GDT_IDT_LIMIT_SIZE 2
+
+#define UMIP_INST_SGDT 0 /* 0F 01 /0 */
+#define UMIP_INST_SIDT 1 /* 0F 01 /1 */
+#define UMIP_INST_SMSW 2 /* 0F 01 /4 */
+#define UMIP_INST_SLDT 3 /* 0F 00 /0 */
+#define UMIP_INST_STR 4 /* 0F 00 /1 */
+
+const char * const umip_insns[5] = {
+ [UMIP_INST_SGDT] = "SGDT",
+ [UMIP_INST_SIDT] = "SIDT",
+ [UMIP_INST_SMSW] = "SMSW",
+ [UMIP_INST_SLDT] = "SLDT",
+ [UMIP_INST_STR] = "STR",
+};
+
+#define umip_pr_err(regs, fmt, ...) \
+ umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__)
+#define umip_pr_warning(regs, fmt, ...) \
+ umip_printk(regs, KERN_WARNING, fmt, ##__VA_ARGS__)
+
+/**
+ * umip_printk() - Print a rate-limited message
+ * @regs: Register set with the context in which the warning is printed
+ * @log_level: Kernel log level to print the message
+ * @fmt: The text string to print
+ *
+ * Print the text contained in @fmt. The print rate is limited to bursts of 5
+ * messages every two minutes. The purpose of this customized version of
+ * printk() is to print messages when user space processes use any of the
+ * UMIP-protected instructions. Thus, the printed text is prepended with the
+ * task name and process ID number of the current task as well as the
+ * instruction and stack pointers in @regs as seen when entering kernel mode.
+ *
+ * Returns:
+ *
+ * None.
+ */
+static __printf(3, 4)
+void umip_printk(const struct pt_regs *regs, const char *log_level,
+ const char *fmt, ...)
+{
+ /* Bursts of 5 messages every two minutes */
+ static DEFINE_RATELIMIT_STATE(ratelimit, 2 * 60 * HZ, 5);
+ struct task_struct *tsk = current;
+ struct va_format vaf;
+ va_list args;
+
+ if (!__ratelimit(&ratelimit))
+ return;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk("%s" pr_fmt("%s[%d] ip:%lx sp:%lx: %pV"), log_level, tsk->comm,
+ task_pid_nr(tsk), regs->ip, regs->sp, &vaf);
+ va_end(args);
+}
+
+/**
+ * identify_insn() - Identify a UMIP-protected instruction
+ * @insn: Instruction structure with opcode and ModRM byte.
+ *
+ * From the opcode and ModRM.reg in @insn identify, if any, a UMIP-protected
+ * instruction that can be emulated.
+ *
+ * Returns:
+ *
+ * On success, a constant identifying a specific UMIP-protected instruction that
+ * can be emulated.
+ *
+ * -EINVAL on error or when not an UMIP-protected instruction that can be
+ * emulated.
+ */
+static int identify_insn(struct insn *insn)
+{
+ /* By getting modrm we also get the opcode. */
+ insn_get_modrm(insn);
+
+ if (!insn->modrm.nbytes)
+ return -EINVAL;
+
+ /* All the instructions of interest start with 0x0f. */
+ if (insn->opcode.bytes[0] != 0xf)
+ return -EINVAL;
+
+ if (insn->opcode.bytes[1] == 0x1) {
+ switch (X86_MODRM_REG(insn->modrm.value)) {
+ case 0:
+ return UMIP_INST_SGDT;
+ case 1:
+ return UMIP_INST_SIDT;
+ case 4:
+ return UMIP_INST_SMSW;
+ default:
+ return -EINVAL;
+ }
+ } else if (insn->opcode.bytes[1] == 0x0) {
+ if (X86_MODRM_REG(insn->modrm.value) == 0)
+ return UMIP_INST_SLDT;
+ else if (X86_MODRM_REG(insn->modrm.value) == 1)
+ return UMIP_INST_STR;
+ else
+ return -EINVAL;
+ } else {
+ return -EINVAL;
+ }
+}
+
+/**
+ * emulate_umip_insn() - Emulate UMIP instructions and return dummy values
+ * @insn: Instruction structure with operands
+ * @umip_inst: A constant indicating the instruction to emulate
+ * @data: Buffer into which the dummy result is stored
+ * @data_size: Size of the emulated result
+ *
+ * Emulate an instruction protected by UMIP and provide a dummy result. The
+ * result of the emulation is saved in @data. The size of the results depends
+ * on both the instruction and type of operand (register vs memory address).
+ * The size of the result is updated in @data_size. Caller is responsible
+ * of providing a @data buffer of at least UMIP_GDT_IDT_BASE_SIZE +
+ * UMIP_GDT_IDT_LIMIT_SIZE bytes.
+ *
+ * Returns:
+ *
+ * 0 on success, -EINVAL on error while emulating.
+ */
+static int emulate_umip_insn(struct insn *insn, int umip_inst,
+ unsigned char *data, int *data_size)
+{
+ unsigned long dummy_base_addr, dummy_value;
+ unsigned short dummy_limit = 0;
+
+ if (!data || !data_size || !insn)
+ return -EINVAL;
+ /*
+ * These two instructions return the base address and limit of the
+ * global and interrupt descriptor table, respectively. According to the
+ * Intel Software Development manual, the base address can be 24-bit,
+ * 32-bit or 64-bit. Limit is always 16-bit. If the operand size is
+ * 16-bit, the returned value of the base address is supposed to be a
+ * zero-extended 24-byte number. However, it seems that a 32-byte number
+ * is always returned irrespective of the operand size.
+ */
+ if (umip_inst == UMIP_INST_SGDT || umip_inst == UMIP_INST_SIDT) {
+ /* SGDT and SIDT do not use registers operands. */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3)
+ return -EINVAL;
+
+ if (umip_inst == UMIP_INST_SGDT)
+ dummy_base_addr = UMIP_DUMMY_GDT_BASE;
+ else
+ dummy_base_addr = UMIP_DUMMY_IDT_BASE;
+
+ *data_size = UMIP_GDT_IDT_LIMIT_SIZE + UMIP_GDT_IDT_BASE_SIZE;
+
+ memcpy(data + 2, &dummy_base_addr, UMIP_GDT_IDT_BASE_SIZE);
+ memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE);
+
+ } else if (umip_inst == UMIP_INST_SMSW) {
+ dummy_value = CR0_STATE;
+
+ /*
+ * Even though the CR0 register has 4 bytes, the number
+ * of bytes to be copied in the result buffer is determined
+ * by whether the operand is a register or a memory location.
+ * If operand is a register, return as many bytes as the operand
+ * size. If operand is memory, return only the two least
+ * siginificant bytes of CR0.
+ */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3)
+ *data_size = insn->opnd_bytes;
+ else
+ *data_size = 2;
+
+ memcpy(data, &dummy_value, *data_size);
+ /* STR and SLDT are not emulated */
+ } else {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * force_sig_info_umip_fault() - Force a SIGSEGV with SEGV_MAPERR
+ * @addr: Address that caused the signal
+ * @regs: Register set containing the instruction pointer
+ *
+ * Force a SIGSEGV signal with SEGV_MAPERR as the error code. This function is
+ * intended to be used to provide a segmentation fault when the result of the
+ * UMIP emulation could not be copied to the user space memory.
+ *
+ * Returns: none
+ */
+static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs)
+{
+ siginfo_t info;
+ struct task_struct *tsk = current;
+
+ tsk->thread.cr2 = (unsigned long)addr;
+ tsk->thread.error_code = X86_PF_USER | X86_PF_WRITE;
+ tsk->thread.trap_nr = X86_TRAP_PF;
+
+ clear_siginfo(&info);
+ info.si_signo = SIGSEGV;
+ info.si_errno = 0;
+ info.si_code = SEGV_MAPERR;
+ info.si_addr = addr;
+ force_sig_info(SIGSEGV, &info, tsk);
+
+ if (!(show_unhandled_signals && unhandled_signal(tsk, SIGSEGV)))
+ return;
+
+ umip_pr_err(regs, "segfault in emulation. error%x\n",
+ X86_PF_USER | X86_PF_WRITE);
+}
+
+/**
+ * fixup_umip_exception() - Fixup a general protection fault caused by UMIP
+ * @regs: Registers as saved when entering the #GP handler
+ *
+ * The instructions sgdt, sidt, str, smsw, sldt cause a general protection
+ * fault if executed with CPL > 0 (i.e., from user space). If the offending
+ * user-space process is not in long mode, this function fixes the exception
+ * up and provides dummy results for sgdt, sidt and smsw; str and sldt are not
+ * fixed up. Also long mode user-space processes are not fixed up.
+ *
+ * If operands are memory addresses, results are copied to user-space memory as
+ * indicated by the instruction pointed by eIP using the registers indicated in
+ * the instruction operands. If operands are registers, results are copied into
+ * the context that was saved when entering kernel mode.
+ *
+ * Returns:
+ *
+ * True if emulation was successful; false if not.
+ */
+bool fixup_umip_exception(struct pt_regs *regs)
+{
+ int not_copied, nr_copied, reg_offset, dummy_data_size, umip_inst;
+ unsigned long seg_base = 0, *reg_addr;
+ /* 10 bytes is the maximum size of the result of UMIP instructions */
+ unsigned char dummy_data[10] = { 0 };
+ unsigned char buf[MAX_INSN_SIZE];
+ void __user *uaddr;
+ struct insn insn;
+ int seg_defs;
+
+ if (!regs)
+ return false;
+
+ /*
+ * If not in user-space long mode, a custom code segment could be in
+ * use. This is true in protected mode (if the process defined a local
+ * descriptor table), or virtual-8086 mode. In most of the cases
+ * seg_base will be zero as in USER_CS.
+ */
+ if (!user_64bit_mode(regs))
+ seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS);
+
+ if (seg_base == -1L)
+ return false;
+
+ not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip),
+ sizeof(buf));
+ nr_copied = sizeof(buf) - not_copied;
+
+ /*
+ * The copy_from_user above could have failed if user code is protected
+ * by a memory protection key. Give up on emulation in such a case.
+ * Should we issue a page fault?
+ */
+ if (!nr_copied)
+ return false;
+
+ insn_init(&insn, buf, nr_copied, user_64bit_mode(regs));
+
+ /*
+ * Override the default operand and address sizes with what is specified
+ * in the code segment descriptor. The instruction decoder only sets
+ * the address size it to either 4 or 8 address bytes and does nothing
+ * for the operand bytes. This OK for most of the cases, but we could
+ * have special cases where, for instance, a 16-bit code segment
+ * descriptor is used.
+ * If there is an address override prefix, the instruction decoder
+ * correctly updates these values, even for 16-bit defaults.
+ */
+ seg_defs = insn_get_code_seg_params(regs);
+ if (seg_defs == -EINVAL)
+ return false;
+
+ insn.addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs);
+ insn.opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs);
+
+ insn_get_length(&insn);
+ if (nr_copied < insn.length)
+ return false;
+
+ umip_inst = identify_insn(&insn);
+ if (umip_inst < 0)
+ return false;
+
+ umip_pr_warning(regs, "%s instruction cannot be used by applications.\n",
+ umip_insns[umip_inst]);
+
+ /* Do not emulate SLDT, STR or user long mode processes. */
+ if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT || user_64bit_mode(regs))
+ return false;
+
+ umip_pr_warning(regs, "For now, expensive software emulation returns the result.\n");
+
+ if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size))
+ return false;
+
+ /*
+ * If operand is a register, write result to the copy of the register
+ * value that was pushed to the stack when entering into kernel mode.
+ * Upon exit, the value we write will be restored to the actual hardware
+ * register.
+ */
+ if (X86_MODRM_MOD(insn.modrm.value) == 3) {
+ reg_offset = insn_get_modrm_rm_off(&insn, regs);
+
+ /*
+ * Negative values are usually errors. In memory addressing,
+ * the exception is -EDOM. Since we expect a register operand,
+ * all negative values are errors.
+ */
+ if (reg_offset < 0)
+ return false;
+
+ reg_addr = (unsigned long *)((unsigned long)regs + reg_offset);
+ memcpy(reg_addr, dummy_data, dummy_data_size);
+ } else {
+ uaddr = insn_get_addr_ref(&insn, regs);
+ if ((unsigned long)uaddr == -1L)
+ return false;
+
+ nr_copied = copy_to_user(uaddr, dummy_data, dummy_data_size);
+ if (nr_copied > 0) {
+ /*
+ * If copy fails, send a signal and tell caller that
+ * fault was fixed up.
+ */
+ force_sig_info_umip_fault(uaddr, regs);
+ return true;
+ }
+ }
+
+ /* increase IP to let the program keep going */
+ regs->ip += insn.length;
+ return true;
+}
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
new file mode 100644
index 000000000..9b9fd4826
--- /dev/null
+++ b/arch/x86/kernel/unwind_frame.c
@@ -0,0 +1,435 @@
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/interrupt.h>
+#include <asm/sections.h>
+#include <asm/ptrace.h>
+#include <asm/bitops.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+#define FRAME_HEADER_SIZE (sizeof(long) * 2)
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return 0;
+
+ return __kernel_text_address(state->ip) ? state->ip : 0;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return NULL;
+
+ return state->regs ? &state->regs->ip : state->bp + 1;
+}
+
+static void unwind_dump(struct unwind_state *state)
+{
+ static bool dumped_before = false;
+ bool prev_zero, zero = false;
+ unsigned long word, *sp;
+ struct stack_info stack_info = {0};
+ unsigned long visit_mask = 0;
+
+ if (dumped_before)
+ return;
+
+ dumped_before = true;
+
+ printk_deferred("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n",
+ state->stack_info.type, state->stack_info.next_sp,
+ state->stack_mask, state->graph_idx);
+
+ for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp;
+ sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+ if (get_stack_info(sp, state->task, &stack_info, &visit_mask))
+ break;
+
+ for (; sp < stack_info.end; sp++) {
+
+ word = READ_ONCE_NOCHECK(*sp);
+
+ prev_zero = zero;
+ zero = word == 0;
+
+ if (zero) {
+ if (!prev_zero)
+ printk_deferred("%p: %0*x ...\n",
+ sp, BITS_PER_LONG/4, 0);
+ continue;
+ }
+
+ printk_deferred("%p: %0*lx (%pB)\n",
+ sp, BITS_PER_LONG/4, word, (void *)word);
+ }
+ }
+}
+
+static size_t regs_size(struct pt_regs *regs)
+{
+ /* x86_32 regs from kernel mode are two words shorter: */
+ if (IS_ENABLED(CONFIG_X86_32) && !user_mode(regs))
+ return sizeof(*regs) - 2*sizeof(long);
+
+ return sizeof(*regs);
+}
+
+static bool in_entry_code(unsigned long ip)
+{
+ char *addr = (char *)ip;
+
+ if (addr >= __entry_text_start && addr < __entry_text_end)
+ return true;
+
+ if (addr >= __irqentry_text_start && addr < __irqentry_text_end)
+ return true;
+
+ return false;
+}
+
+static inline unsigned long *last_frame(struct unwind_state *state)
+{
+ return (unsigned long *)task_pt_regs(state->task) - 2;
+}
+
+static bool is_last_frame(struct unwind_state *state)
+{
+ return state->bp == last_frame(state);
+}
+
+#ifdef CONFIG_X86_32
+#define GCC_REALIGN_WORDS 3
+#else
+#define GCC_REALIGN_WORDS 1
+#endif
+
+static inline unsigned long *last_aligned_frame(struct unwind_state *state)
+{
+ return last_frame(state) - GCC_REALIGN_WORDS;
+}
+
+static bool is_last_aligned_frame(struct unwind_state *state)
+{
+ unsigned long *last_bp = last_frame(state);
+ unsigned long *aligned_bp = last_aligned_frame(state);
+
+ /*
+ * GCC can occasionally decide to realign the stack pointer and change
+ * the offset of the stack frame in the prologue of a function called
+ * by head/entry code. Examples:
+ *
+ * <start_secondary>:
+ * push %edi
+ * lea 0x8(%esp),%edi
+ * and $0xfffffff8,%esp
+ * pushl -0x4(%edi)
+ * push %ebp
+ * mov %esp,%ebp
+ *
+ * <x86_64_start_kernel>:
+ * lea 0x8(%rsp),%r10
+ * and $0xfffffffffffffff0,%rsp
+ * pushq -0x8(%r10)
+ * push %rbp
+ * mov %rsp,%rbp
+ *
+ * After aligning the stack, it pushes a duplicate copy of the return
+ * address before pushing the frame pointer.
+ */
+ return (state->bp == aligned_bp && *(aligned_bp + 1) == *(last_bp + 1));
+}
+
+static bool is_last_ftrace_frame(struct unwind_state *state)
+{
+ unsigned long *last_bp = last_frame(state);
+ unsigned long *last_ftrace_bp = last_bp - 3;
+
+ /*
+ * When unwinding from an ftrace handler of a function called by entry
+ * code, the stack layout of the last frame is:
+ *
+ * bp
+ * parent ret addr
+ * bp
+ * function ret addr
+ * parent ret addr
+ * pt_regs
+ * -----------------
+ */
+ return (state->bp == last_ftrace_bp &&
+ *state->bp == *(state->bp + 2) &&
+ *(state->bp + 1) == *(state->bp + 4));
+}
+
+static bool is_last_task_frame(struct unwind_state *state)
+{
+ return is_last_frame(state) || is_last_aligned_frame(state) ||
+ is_last_ftrace_frame(state);
+}
+
+/*
+ * This determines if the frame pointer actually contains an encoded pointer to
+ * pt_regs on the stack. See ENCODE_FRAME_POINTER.
+ */
+#ifdef CONFIG_X86_64
+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
+{
+ unsigned long regs = (unsigned long)bp;
+
+ if (!(regs & 0x1))
+ return NULL;
+
+ return (struct pt_regs *)(regs & ~0x1);
+}
+#else
+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
+{
+ unsigned long regs = (unsigned long)bp;
+
+ if (regs & 0x80000000)
+ return NULL;
+
+ return (struct pt_regs *)(regs | 0x80000000);
+}
+#endif
+
+#ifdef CONFIG_X86_32
+#define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long))
+#else
+#define KERNEL_REGS_SIZE (sizeof(struct pt_regs))
+#endif
+
+static bool update_stack_state(struct unwind_state *state,
+ unsigned long *next_bp)
+{
+ struct stack_info *info = &state->stack_info;
+ enum stack_type prev_type = info->type;
+ struct pt_regs *regs;
+ unsigned long *frame, *prev_frame_end, *addr_p, addr;
+ size_t len;
+
+ if (state->regs)
+ prev_frame_end = (void *)state->regs + regs_size(state->regs);
+ else
+ prev_frame_end = (void *)state->bp + FRAME_HEADER_SIZE;
+
+ /* Is the next frame pointer an encoded pointer to pt_regs? */
+ regs = decode_frame_pointer(next_bp);
+ if (regs) {
+ frame = (unsigned long *)regs;
+ len = KERNEL_REGS_SIZE;
+ state->got_irq = true;
+ } else {
+ frame = next_bp;
+ len = FRAME_HEADER_SIZE;
+ }
+
+ /*
+ * If the next bp isn't on the current stack, switch to the next one.
+ *
+ * We may have to traverse multiple stacks to deal with the possibility
+ * that info->next_sp could point to an empty stack and the next bp
+ * could be on a subsequent stack.
+ */
+ while (!on_stack(info, frame, len))
+ if (get_stack_info(info->next_sp, state->task, info,
+ &state->stack_mask))
+ return false;
+
+ /* Make sure it only unwinds up and doesn't overlap the prev frame: */
+ if (state->orig_sp && state->stack_info.type == prev_type &&
+ frame < prev_frame_end)
+ return false;
+
+ /*
+ * On 32-bit with user mode regs, make sure the last two regs are safe
+ * to access:
+ */
+ if (IS_ENABLED(CONFIG_X86_32) && regs && user_mode(regs) &&
+ !on_stack(info, frame, len + 2*sizeof(long)))
+ return false;
+
+ /* Move state to the next frame: */
+ if (regs) {
+ state->regs = regs;
+ state->bp = NULL;
+ } else {
+ state->bp = next_bp;
+ state->regs = NULL;
+ }
+
+ /* Save the return address: */
+ if (state->regs && user_mode(state->regs))
+ state->ip = 0;
+ else {
+ addr_p = unwind_get_return_address_ptr(state);
+ addr = READ_ONCE_TASK_STACK(state->task, *addr_p);
+ state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx,
+ addr, addr_p);
+ }
+
+ /* Save the original stack pointer for unwind_dump(): */
+ if (!state->orig_sp)
+ state->orig_sp = frame;
+
+ return true;
+}
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+ struct pt_regs *regs;
+ unsigned long *next_bp;
+
+ if (unwind_done(state))
+ return false;
+
+ /* Have we reached the end? */
+ if (state->regs && user_mode(state->regs))
+ goto the_end;
+
+ if (is_last_task_frame(state)) {
+ regs = task_pt_regs(state->task);
+
+ /*
+ * kthreads (other than the boot CPU's idle thread) have some
+ * partial regs at the end of their stack which were placed
+ * there by copy_thread_tls(). But the regs don't have any
+ * useful information, so we can skip them.
+ *
+ * This user_mode() check is slightly broader than a PF_KTHREAD
+ * check because it also catches the awkward situation where a
+ * newly forked kthread transitions into a user task by calling
+ * do_execve(), which eventually clears PF_KTHREAD.
+ */
+ if (!user_mode(regs))
+ goto the_end;
+
+ /*
+ * We're almost at the end, but not quite: there's still the
+ * syscall regs frame. Entry code doesn't encode the regs
+ * pointer for syscalls, so we have to set it manually.
+ */
+ state->regs = regs;
+ state->bp = NULL;
+ state->ip = 0;
+ return true;
+ }
+
+ /* Get the next frame pointer: */
+ if (state->next_bp) {
+ next_bp = state->next_bp;
+ state->next_bp = NULL;
+ } else if (state->regs) {
+ next_bp = (unsigned long *)state->regs->bp;
+ } else {
+ next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp);
+ }
+
+ /* Move to the next frame if it's safe: */
+ if (!update_stack_state(state, next_bp))
+ goto bad_address;
+
+ return true;
+
+bad_address:
+ state->error = true;
+
+ /*
+ * When unwinding a non-current task, the task might actually be
+ * running on another CPU, in which case it could be modifying its
+ * stack while we're reading it. This is generally not a problem and
+ * can be ignored as long as the caller understands that unwinding
+ * another task will not always succeed.
+ */
+ if (state->task != current)
+ goto the_end;
+
+ /*
+ * Don't warn if the unwinder got lost due to an interrupt in entry
+ * code or in the C handler before the first frame pointer got set up:
+ */
+ if (state->got_irq && in_entry_code(state->ip))
+ goto the_end;
+ if (state->regs &&
+ state->regs->sp >= (unsigned long)last_aligned_frame(state) &&
+ state->regs->sp < (unsigned long)task_pt_regs(state->task))
+ goto the_end;
+
+ /*
+ * There are some known frame pointer issues on 32-bit. Disable
+ * unwinder warnings on 32-bit until it gets objtool support.
+ */
+ if (IS_ENABLED(CONFIG_X86_32))
+ goto the_end;
+
+ if (state->regs) {
+ printk_deferred_once(KERN_WARNING
+ "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n",
+ state->regs, state->task->comm,
+ state->task->pid, next_bp);
+ unwind_dump(state);
+ } else {
+ printk_deferred_once(KERN_WARNING
+ "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n",
+ state->bp, state->task->comm,
+ state->task->pid, next_bp);
+ unwind_dump(state);
+ }
+the_end:
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ unsigned long *bp;
+
+ memset(state, 0, sizeof(*state));
+ state->task = task;
+ state->got_irq = (regs);
+
+ /* Don't even attempt to start from user mode regs: */
+ if (regs && user_mode(regs)) {
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return;
+ }
+
+ bp = get_frame_pointer(task, regs);
+
+ /*
+ * If we crash with IP==0, the last successfully executed instruction
+ * was probably an indirect function call with a NULL function pointer.
+ * That means that SP points into the middle of an incomplete frame:
+ * *SP is a return pointer, and *(SP-sizeof(unsigned long)) is where we
+ * would have written a frame pointer if we hadn't crashed.
+ * Pretend that the frame is complete and that BP points to it, but save
+ * the real BP so that we can use it when looking for the next frame.
+ */
+ if (regs && regs->ip == 0 &&
+ (unsigned long *)kernel_stack_pointer(regs) >= first_frame) {
+ state->next_bp = bp;
+ bp = ((unsigned long *)kernel_stack_pointer(regs)) - 1;
+ }
+
+ /* Initialize stack info and make sure the frame data is accessible: */
+ get_stack_info(bp, state->task, &state->stack_info,
+ &state->stack_mask);
+ update_stack_state(state, bp);
+
+ /*
+ * The caller can provide the address of the first frame directly
+ * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+ * to start unwinding at. Skip ahead until we reach it.
+ */
+ while (!unwind_done(state) &&
+ (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+ (state->next_bp == NULL && state->bp < first_frame)))
+ unwind_next_frame(state);
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c
new file mode 100644
index 000000000..4f0e17b90
--- /dev/null
+++ b/arch/x86/kernel/unwind_guess.c
@@ -0,0 +1,72 @@
+#include <linux/sched.h>
+#include <linux/ftrace.h>
+#include <asm/ptrace.h>
+#include <asm/bitops.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ unsigned long addr;
+
+ if (unwind_done(state))
+ return 0;
+
+ addr = READ_ONCE_NOCHECK(*state->sp);
+
+ return ftrace_graph_ret_addr(state->task, &state->graph_idx,
+ addr, state->sp);
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ return NULL;
+}
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+ struct stack_info *info = &state->stack_info;
+
+ if (unwind_done(state))
+ return false;
+
+ do {
+ for (state->sp++; state->sp < info->end; state->sp++) {
+ unsigned long addr = READ_ONCE_NOCHECK(*state->sp);
+
+ if (__kernel_text_address(addr))
+ return true;
+ }
+
+ state->sp = PTR_ALIGN(info->next_sp, sizeof(long));
+
+ } while (!get_stack_info(state->sp, state->task, info,
+ &state->stack_mask));
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ memset(state, 0, sizeof(*state));
+
+ state->task = task;
+ state->sp = PTR_ALIGN(first_frame, sizeof(long));
+
+ get_stack_info(first_frame, state->task, &state->stack_info,
+ &state->stack_mask);
+
+ /*
+ * The caller can provide the address of the first frame directly
+ * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+ * to start unwinding at. Skip ahead until we reach it.
+ */
+ if (!unwind_done(state) &&
+ (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+ !__kernel_text_address(*first_frame)))
+ unwind_next_frame(state);
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
new file mode 100644
index 000000000..4f17c1c94
--- /dev/null
+++ b/arch/x86/kernel/unwind_orc.c
@@ -0,0 +1,676 @@
+#include <linux/module.h>
+#include <linux/sort.h>
+#include <asm/ptrace.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+#include <asm/orc_types.h>
+#include <asm/orc_lookup.h>
+
+#define orc_warn(fmt, ...) \
+ printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__)
+
+extern int __start_orc_unwind_ip[];
+extern int __stop_orc_unwind_ip[];
+extern struct orc_entry __start_orc_unwind[];
+extern struct orc_entry __stop_orc_unwind[];
+
+static DEFINE_MUTEX(sort_mutex);
+int *cur_orc_ip_table = __start_orc_unwind_ip;
+struct orc_entry *cur_orc_table = __start_orc_unwind;
+
+unsigned int lookup_num_blocks;
+bool orc_init;
+
+static inline unsigned long orc_ip(const int *ip)
+{
+ return (unsigned long)ip + *ip;
+}
+
+static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table,
+ unsigned int num_entries, unsigned long ip)
+{
+ int *first = ip_table;
+ int *last = ip_table + num_entries - 1;
+ int *mid = first, *found = first;
+
+ if (!num_entries)
+ return NULL;
+
+ /*
+ * Do a binary range search to find the rightmost duplicate of a given
+ * starting address. Some entries are section terminators which are
+ * "weak" entries for ensuring there are no gaps. They should be
+ * ignored when they conflict with a real entry.
+ */
+ while (first <= last) {
+ mid = first + ((last - first) / 2);
+
+ if (orc_ip(mid) <= ip) {
+ found = mid;
+ first = mid + 1;
+ } else
+ last = mid - 1;
+ }
+
+ return u_table + (found - ip_table);
+}
+
+#ifdef CONFIG_MODULES
+static struct orc_entry *orc_module_find(unsigned long ip)
+{
+ struct module *mod;
+
+ mod = __module_address(ip);
+ if (!mod || !mod->arch.orc_unwind || !mod->arch.orc_unwind_ip)
+ return NULL;
+ return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind,
+ mod->arch.num_orcs, ip);
+}
+#else
+static struct orc_entry *orc_module_find(unsigned long ip)
+{
+ return NULL;
+}
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static struct orc_entry *orc_find(unsigned long ip);
+
+/*
+ * Ftrace dynamic trampolines do not have orc entries of their own.
+ * But they are copies of the ftrace entries that are static and
+ * defined in ftrace_*.S, which do have orc entries.
+ *
+ * If the undwinder comes across a ftrace trampoline, then find the
+ * ftrace function that was used to create it, and use that ftrace
+ * function's orc entrie, as the placement of the return code in
+ * the stack will be identical.
+ */
+static struct orc_entry *orc_ftrace_find(unsigned long ip)
+{
+ struct ftrace_ops *ops;
+ unsigned long caller;
+
+ ops = ftrace_ops_trampoline(ip);
+ if (!ops)
+ return NULL;
+
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
+ caller = (unsigned long)ftrace_regs_call;
+ else
+ caller = (unsigned long)ftrace_call;
+
+ /* Prevent unlikely recursion */
+ if (ip == caller)
+ return NULL;
+
+ return orc_find(caller);
+}
+#else
+static struct orc_entry *orc_ftrace_find(unsigned long ip)
+{
+ return NULL;
+}
+#endif
+
+/*
+ * If we crash with IP==0, the last successfully executed instruction
+ * was probably an indirect function call with a NULL function pointer,
+ * and we don't have unwind information for NULL.
+ * This hardcoded ORC entry for IP==0 allows us to unwind from a NULL function
+ * pointer into its parent and then continue normally from there.
+ */
+static struct orc_entry null_orc_entry = {
+ .sp_offset = sizeof(long),
+ .sp_reg = ORC_REG_SP,
+ .bp_reg = ORC_REG_UNDEFINED,
+ .type = ORC_TYPE_CALL
+};
+
+static struct orc_entry *orc_find(unsigned long ip)
+{
+ static struct orc_entry *orc;
+
+ if (ip == 0)
+ return &null_orc_entry;
+
+ /* For non-init vmlinux addresses, use the fast lookup table: */
+ if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) {
+ unsigned int idx, start, stop;
+
+ idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE;
+
+ if (unlikely((idx >= lookup_num_blocks-1))) {
+ orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%pB\n",
+ idx, lookup_num_blocks, (void *)ip);
+ return NULL;
+ }
+
+ start = orc_lookup[idx];
+ stop = orc_lookup[idx + 1] + 1;
+
+ if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) ||
+ (__start_orc_unwind + stop > __stop_orc_unwind))) {
+ orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%pB\n",
+ idx, lookup_num_blocks, start, stop, (void *)ip);
+ return NULL;
+ }
+
+ return __orc_find(__start_orc_unwind_ip + start,
+ __start_orc_unwind + start, stop - start, ip);
+ }
+
+ /* vmlinux .init slow lookup: */
+ if (init_kernel_text(ip))
+ return __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
+ __stop_orc_unwind_ip - __start_orc_unwind_ip, ip);
+
+ /* Module lookup: */
+ orc = orc_module_find(ip);
+ if (orc)
+ return orc;
+
+ return orc_ftrace_find(ip);
+}
+
+static void orc_sort_swap(void *_a, void *_b, int size)
+{
+ struct orc_entry *orc_a, *orc_b;
+ struct orc_entry orc_tmp;
+ int *a = _a, *b = _b, tmp;
+ int delta = _b - _a;
+
+ /* Swap the .orc_unwind_ip entries: */
+ tmp = *a;
+ *a = *b + delta;
+ *b = tmp - delta;
+
+ /* Swap the corresponding .orc_unwind entries: */
+ orc_a = cur_orc_table + (a - cur_orc_ip_table);
+ orc_b = cur_orc_table + (b - cur_orc_ip_table);
+ orc_tmp = *orc_a;
+ *orc_a = *orc_b;
+ *orc_b = orc_tmp;
+}
+
+static int orc_sort_cmp(const void *_a, const void *_b)
+{
+ struct orc_entry *orc_a;
+ const int *a = _a, *b = _b;
+ unsigned long a_val = orc_ip(a);
+ unsigned long b_val = orc_ip(b);
+
+ if (a_val > b_val)
+ return 1;
+ if (a_val < b_val)
+ return -1;
+
+ /*
+ * The "weak" section terminator entries need to always be on the left
+ * to ensure the lookup code skips them in favor of real entries.
+ * These terminator entries exist to handle any gaps created by
+ * whitelisted .o files which didn't get objtool generation.
+ */
+ orc_a = cur_orc_table + (a - cur_orc_ip_table);
+ return orc_a->sp_reg == ORC_REG_UNDEFINED && !orc_a->end ? -1 : 1;
+}
+
+#ifdef CONFIG_MODULES
+void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size,
+ void *_orc, size_t orc_size)
+{
+ int *orc_ip = _orc_ip;
+ struct orc_entry *orc = _orc;
+ unsigned int num_entries = orc_ip_size / sizeof(int);
+
+ WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 ||
+ orc_size % sizeof(*orc) != 0 ||
+ num_entries != orc_size / sizeof(*orc));
+
+ /*
+ * The 'cur_orc_*' globals allow the orc_sort_swap() callback to
+ * associate an .orc_unwind_ip table entry with its corresponding
+ * .orc_unwind entry so they can both be swapped.
+ */
+ mutex_lock(&sort_mutex);
+ cur_orc_ip_table = orc_ip;
+ cur_orc_table = orc;
+ sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap);
+ mutex_unlock(&sort_mutex);
+
+ mod->arch.orc_unwind_ip = orc_ip;
+ mod->arch.orc_unwind = orc;
+ mod->arch.num_orcs = num_entries;
+}
+#endif
+
+void __init unwind_init(void)
+{
+ size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip;
+ size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind;
+ size_t num_entries = orc_ip_size / sizeof(int);
+ struct orc_entry *orc;
+ int i;
+
+ if (!num_entries || orc_ip_size % sizeof(int) != 0 ||
+ orc_size % sizeof(struct orc_entry) != 0 ||
+ num_entries != orc_size / sizeof(struct orc_entry)) {
+ orc_warn("WARNING: Bad or missing .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+
+ /* Sort the .orc_unwind and .orc_unwind_ip tables: */
+ sort(__start_orc_unwind_ip, num_entries, sizeof(int), orc_sort_cmp,
+ orc_sort_swap);
+
+ /* Initialize the fast lookup table: */
+ lookup_num_blocks = orc_lookup_end - orc_lookup;
+ for (i = 0; i < lookup_num_blocks-1; i++) {
+ orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
+ num_entries,
+ LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i));
+ if (!orc) {
+ orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+
+ orc_lookup[i] = orc - __start_orc_unwind;
+ }
+
+ /* Initialize the ending block: */
+ orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries,
+ LOOKUP_STOP_IP);
+ if (!orc) {
+ orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n");
+ return;
+ }
+ orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind;
+
+ orc_init = true;
+}
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return 0;
+
+ return __kernel_text_address(state->ip) ? state->ip : 0;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return NULL;
+
+ if (state->regs)
+ return &state->regs->ip;
+
+ if (state->sp)
+ return (unsigned long *)state->sp - 1;
+
+ return NULL;
+}
+
+static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
+ size_t len)
+{
+ struct stack_info *info = &state->stack_info;
+ void *addr = (void *)_addr;
+
+ if (!on_stack(info, addr, len) &&
+ (get_stack_info(addr, state->task, info, &state->stack_mask)))
+ return false;
+
+ return true;
+}
+
+static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
+ unsigned long *val)
+{
+ if (!stack_access_ok(state, addr, sizeof(long)))
+ return false;
+
+ *val = READ_ONCE_NOCHECK(*(unsigned long *)addr);
+ return true;
+}
+
+static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
+ unsigned long *ip, unsigned long *sp)
+{
+ struct pt_regs *regs = (struct pt_regs *)addr;
+
+ /* x86-32 support will be more complicated due to the &regs->sp hack */
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));
+
+ if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
+ return false;
+
+ *ip = READ_ONCE_NOCHECK(regs->ip);
+ *sp = READ_ONCE_NOCHECK(regs->sp);
+ return true;
+}
+
+static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr,
+ unsigned long *ip, unsigned long *sp)
+{
+ struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET;
+
+ if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
+ return false;
+
+ *ip = READ_ONCE_NOCHECK(regs->ip);
+ *sp = READ_ONCE_NOCHECK(regs->sp);
+ return true;
+}
+
+/*
+ * If state->regs is non-NULL, and points to a full pt_regs, just get the reg
+ * value from state->regs.
+ *
+ * Otherwise, if state->regs just points to IRET regs, and the previous frame
+ * had full regs, it's safe to get the value from the previous regs. This can
+ * happen when early/late IRQ entry code gets interrupted by an NMI.
+ */
+static bool get_reg(struct unwind_state *state, unsigned int reg_off,
+ unsigned long *val)
+{
+ unsigned int reg = reg_off/8;
+
+ if (!state->regs)
+ return false;
+
+ if (state->full_regs) {
+ *val = READ_ONCE_NOCHECK(((unsigned long *)state->regs)[reg]);
+ return true;
+ }
+
+ if (state->prev_regs) {
+ *val = READ_ONCE_NOCHECK(((unsigned long *)state->prev_regs)[reg]);
+ return true;
+ }
+
+ return false;
+}
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+ unsigned long ip_p, sp, tmp, orig_ip = state->ip, prev_sp = state->sp;
+ enum stack_type prev_type = state->stack_info.type;
+ struct orc_entry *orc;
+ bool indirect = false;
+
+ if (unwind_done(state))
+ return false;
+
+ /* Don't let modules unload while we're reading their ORC data. */
+ preempt_disable();
+
+ /* End-of-stack check for user tasks: */
+ if (state->regs && user_mode(state->regs))
+ goto the_end;
+
+ /*
+ * Find the orc_entry associated with the text address.
+ *
+ * For a call frame (as opposed to a signal frame), state->ip points to
+ * the instruction after the call. That instruction's stack layout
+ * could be different from the call instruction's layout, for example
+ * if the call was to a noreturn function. So get the ORC data for the
+ * call instruction itself.
+ */
+ orc = orc_find(state->signal ? state->ip : state->ip - 1);
+ if (!orc)
+ goto err;
+
+ /* End-of-stack check for kernel threads: */
+ if (orc->sp_reg == ORC_REG_UNDEFINED) {
+ if (!orc->end)
+ goto err;
+
+ goto the_end;
+ }
+
+ /* Find the previous frame's stack: */
+ switch (orc->sp_reg) {
+ case ORC_REG_SP:
+ sp = state->sp + orc->sp_offset;
+ break;
+
+ case ORC_REG_BP:
+ sp = state->bp + orc->sp_offset;
+ break;
+
+ case ORC_REG_SP_INDIRECT:
+ sp = state->sp + orc->sp_offset;
+ indirect = true;
+ break;
+
+ case ORC_REG_BP_INDIRECT:
+ sp = state->bp + orc->sp_offset;
+ indirect = true;
+ break;
+
+ case ORC_REG_R10:
+ if (!get_reg(state, offsetof(struct pt_regs, r10), &sp)) {
+ orc_warn("missing regs for base reg R10 at ip %pB\n",
+ (void *)state->ip);
+ goto err;
+ }
+ break;
+
+ case ORC_REG_R13:
+ if (!get_reg(state, offsetof(struct pt_regs, r13), &sp)) {
+ orc_warn("missing regs for base reg R13 at ip %pB\n",
+ (void *)state->ip);
+ goto err;
+ }
+ break;
+
+ case ORC_REG_DI:
+ if (!get_reg(state, offsetof(struct pt_regs, di), &sp)) {
+ orc_warn("missing regs for base reg DI at ip %pB\n",
+ (void *)state->ip);
+ goto err;
+ }
+ break;
+
+ case ORC_REG_DX:
+ if (!get_reg(state, offsetof(struct pt_regs, dx), &sp)) {
+ orc_warn("missing regs for base reg DX at ip %pB\n",
+ (void *)state->ip);
+ goto err;
+ }
+ break;
+
+ default:
+ orc_warn("unknown SP base reg %d for ip %pB\n",
+ orc->sp_reg, (void *)state->ip);
+ goto err;
+ }
+
+ if (indirect) {
+ if (!deref_stack_reg(state, sp, &sp))
+ goto err;
+ }
+
+ /* Find IP, SP and possibly regs: */
+ switch (orc->type) {
+ case ORC_TYPE_CALL:
+ ip_p = sp - sizeof(long);
+
+ if (!deref_stack_reg(state, ip_p, &state->ip))
+ goto err;
+
+ state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx,
+ state->ip, (void *)ip_p);
+
+ state->sp = sp;
+ state->regs = NULL;
+ state->prev_regs = NULL;
+ state->signal = false;
+ break;
+
+ case ORC_TYPE_REGS:
+ if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
+ orc_warn("can't dereference registers at %p for ip %pB\n",
+ (void *)sp, (void *)orig_ip);
+ goto err;
+ }
+
+ state->regs = (struct pt_regs *)sp;
+ state->prev_regs = NULL;
+ state->full_regs = true;
+ state->signal = true;
+ break;
+
+ case ORC_TYPE_REGS_IRET:
+ if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
+ orc_warn("can't dereference iret registers at %p for ip %pB\n",
+ (void *)sp, (void *)orig_ip);
+ goto err;
+ }
+
+ if (state->full_regs)
+ state->prev_regs = state->regs;
+ state->regs = (void *)sp - IRET_FRAME_OFFSET;
+ state->full_regs = false;
+ state->signal = true;
+ break;
+
+ default:
+ orc_warn("unknown .orc_unwind entry type %d for ip %pB\n",
+ orc->type, (void *)orig_ip);
+ goto err;
+ }
+
+ /* Find BP: */
+ switch (orc->bp_reg) {
+ case ORC_REG_UNDEFINED:
+ if (get_reg(state, offsetof(struct pt_regs, bp), &tmp))
+ state->bp = tmp;
+ break;
+
+ case ORC_REG_PREV_SP:
+ if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp))
+ goto err;
+ break;
+
+ case ORC_REG_BP:
+ if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp))
+ goto err;
+ break;
+
+ default:
+ orc_warn("unknown BP base reg %d for ip %pB\n",
+ orc->bp_reg, (void *)orig_ip);
+ goto err;
+ }
+
+ /* Prevent a recursive loop due to bad ORC data: */
+ if (state->stack_info.type == prev_type &&
+ on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) &&
+ state->sp <= prev_sp) {
+ orc_warn("stack going in the wrong direction? ip=%pB\n",
+ (void *)orig_ip);
+ goto err;
+ }
+
+ preempt_enable();
+ return true;
+
+err:
+ state->error = true;
+
+the_end:
+ preempt_enable();
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ memset(state, 0, sizeof(*state));
+ state->task = task;
+
+ if (!orc_init)
+ goto err;
+
+ /*
+ * Refuse to unwind the stack of a task while it's executing on another
+ * CPU. This check is racy, but that's ok: the unwinder has other
+ * checks to prevent it from going off the rails.
+ */
+ if (task_on_another_cpu(task))
+ goto err;
+
+ if (regs) {
+ if (user_mode(regs))
+ goto the_end;
+
+ state->ip = regs->ip;
+ state->sp = kernel_stack_pointer(regs);
+ state->bp = regs->bp;
+ state->regs = regs;
+ state->full_regs = true;
+ state->signal = true;
+
+ } else if (task == current) {
+ asm volatile("lea (%%rip), %0\n\t"
+ "mov %%rsp, %1\n\t"
+ "mov %%rbp, %2\n\t"
+ : "=r" (state->ip), "=r" (state->sp),
+ "=r" (state->bp));
+
+ } else {
+ struct inactive_task_frame *frame = (void *)task->thread.sp;
+
+ state->sp = task->thread.sp + sizeof(*frame);
+ state->bp = READ_ONCE_NOCHECK(frame->bp);
+ state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
+ state->signal = (void *)state->ip == ret_from_fork;
+ }
+
+ if (get_stack_info((unsigned long *)state->sp, state->task,
+ &state->stack_info, &state->stack_mask)) {
+ /*
+ * We weren't on a valid stack. It's possible that
+ * we overflowed a valid stack into a guard page.
+ * See if the next page up is valid so that we can
+ * generate some kind of backtrace if this happens.
+ */
+ void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
+ state->error = true;
+ if (get_stack_info(next_page, state->task, &state->stack_info,
+ &state->stack_mask))
+ return;
+ }
+
+ /*
+ * The caller can provide the address of the first frame directly
+ * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+ * to start unwinding at. Skip ahead until we reach it.
+ */
+
+ /* When starting from regs, skip the regs frame: */
+ if (regs) {
+ unwind_next_frame(state);
+ return;
+ }
+
+ /* Otherwise, skip ahead to the user-specified starting frame: */
+ while (!unwind_done(state) &&
+ (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+ state->sp < (unsigned long)first_frame))
+ unwind_next_frame(state);
+
+ return;
+
+err:
+ state->error = true;
+the_end:
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
new file mode 100644
index 000000000..ae9e806a1
--- /dev/null
+++ b/arch/x86/kernel/uprobes.c
@@ -0,0 +1,1107 @@
+/*
+ * User-space Probes (UProbes) for x86
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2008-2011
+ * Authors:
+ * Srikar Dronamraju
+ * Jim Keniston
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <linux/uprobes.h>
+#include <linux/uaccess.h>
+
+#include <linux/kdebug.h>
+#include <asm/processor.h>
+#include <asm/insn.h>
+#include <asm/mmu_context.h>
+
+/* Post-execution fixups. */
+
+/* Adjust IP back to vicinity of actual insn */
+#define UPROBE_FIX_IP 0x01
+
+/* Adjust the return address of a call insn */
+#define UPROBE_FIX_CALL 0x02
+
+/* Instruction will modify TF, don't change it */
+#define UPROBE_FIX_SETF 0x04
+
+#define UPROBE_FIX_RIP_SI 0x08
+#define UPROBE_FIX_RIP_DI 0x10
+#define UPROBE_FIX_RIP_BX 0x20
+#define UPROBE_FIX_RIP_MASK \
+ (UPROBE_FIX_RIP_SI | UPROBE_FIX_RIP_DI | UPROBE_FIX_RIP_BX)
+
+#define UPROBE_TRAP_NR UINT_MAX
+
+/* Adaptations for mhiramat x86 decoder v14. */
+#define OPCODE1(insn) ((insn)->opcode.bytes[0])
+#define OPCODE2(insn) ((insn)->opcode.bytes[1])
+#define OPCODE3(insn) ((insn)->opcode.bytes[2])
+#define MODRM_REG(insn) X86_MODRM_REG((insn)->modrm.value)
+
+#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
+ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
+ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
+ (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
+ (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
+ << (row % 32))
+
+/*
+ * Good-instruction tables for 32-bit apps. This is non-const and volatile
+ * to keep gcc from statically optimizing it out, as variable_test_bit makes
+ * some versions of gcc to think only *(unsigned long*) is used.
+ *
+ * Opcodes we'll probably never support:
+ * 6c-6f - ins,outs. SEGVs if used in userspace
+ * e4-e7 - in,out imm. SEGVs if used in userspace
+ * ec-ef - in,out acc. SEGVs if used in userspace
+ * cc - int3. SIGTRAP if used in userspace
+ * ce - into. Not used in userspace - no kernel support to make it useful. SEGVs
+ * (why we support bound (62) then? it's similar, and similarly unused...)
+ * f1 - int1. SIGTRAP if used in userspace
+ * f4 - hlt. SEGVs if used in userspace
+ * fa - cli. SEGVs if used in userspace
+ * fb - sti. SEGVs if used in userspace
+ *
+ * Opcodes which need some work to be supported:
+ * 07,17,1f - pop es/ss/ds
+ * Normally not used in userspace, but would execute if used.
+ * Can cause GP or stack exception if tries to load wrong segment descriptor.
+ * We hesitate to run them under single step since kernel's handling
+ * of userspace single-stepping (TF flag) is fragile.
+ * We can easily refuse to support push es/cs/ss/ds (06/0e/16/1e)
+ * on the same grounds that they are never used.
+ * cd - int N.
+ * Used by userspace for "int 80" syscall entry. (Other "int N"
+ * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3).
+ * Not supported since kernel's handling of userspace single-stepping
+ * (TF flag) is fragile.
+ * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad
+ */
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+static volatile u32 good_insns_32[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
+ W(0x30, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
+ W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
+ W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+#else
+#define good_insns_32 NULL
+#endif
+
+/* Good-instruction tables for 64-bit apps.
+ *
+ * Genuinely invalid opcodes:
+ * 06,07 - formerly push/pop es
+ * 0e - formerly push cs
+ * 16,17 - formerly push/pop ss
+ * 1e,1f - formerly push/pop ds
+ * 27,2f,37,3f - formerly daa/das/aaa/aas
+ * 60,61 - formerly pusha/popa
+ * 62 - formerly bound. EVEX prefix for AVX512 (not yet supported)
+ * 82 - formerly redundant encoding of Group1
+ * 9a - formerly call seg:ofs
+ * ce - formerly into
+ * d4,d5 - formerly aam/aad
+ * d6 - formerly undocumented salc
+ * ea - formerly jmp seg:ofs
+ *
+ * Opcodes we'll probably never support:
+ * 6c-6f - ins,outs. SEGVs if used in userspace
+ * e4-e7 - in,out imm. SEGVs if used in userspace
+ * ec-ef - in,out acc. SEGVs if used in userspace
+ * cc - int3. SIGTRAP if used in userspace
+ * f1 - int1. SIGTRAP if used in userspace
+ * f4 - hlt. SEGVs if used in userspace
+ * fa - cli. SEGVs if used in userspace
+ * fb - sti. SEGVs if used in userspace
+ *
+ * Opcodes which need some work to be supported:
+ * cd - int N.
+ * Used by userspace for "int 80" syscall entry. (Other "int N"
+ * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3).
+ * Not supported since kernel's handling of userspace single-stepping
+ * (TF flag) is fragile.
+ * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad
+ */
+#if defined(CONFIG_X86_64)
+static volatile u32 good_insns_64[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 20 */
+ W(0x30, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
+ W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0) | /* e0 */
+ W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+#else
+#define good_insns_64 NULL
+#endif
+
+/* Using this for both 64-bit and 32-bit apps.
+ * Opcodes we don't support:
+ * 0f 00 - SLDT/STR/LLDT/LTR/VERR/VERW/-/- group. System insns
+ * 0f 01 - SGDT/SIDT/LGDT/LIDT/SMSW/-/LMSW/INVLPG group.
+ * Also encodes tons of other system insns if mod=11.
+ * Some are in fact non-system: xend, xtest, rdtscp, maybe more
+ * 0f 05 - syscall
+ * 0f 06 - clts (CPL0 insn)
+ * 0f 07 - sysret
+ * 0f 08 - invd (CPL0 insn)
+ * 0f 09 - wbinvd (CPL0 insn)
+ * 0f 0b - ud2
+ * 0f 30 - wrmsr (CPL0 insn) (then why rdmsr is allowed, it's also CPL0 insn?)
+ * 0f 34 - sysenter
+ * 0f 35 - sysexit
+ * 0f 37 - getsec
+ * 0f 78 - vmread (Intel VMX. CPL0 insn)
+ * 0f 79 - vmwrite (Intel VMX. CPL0 insn)
+ * Note: with prefixes, these two opcodes are
+ * extrq/insertq/AVX512 convert vector ops.
+ * 0f ae - group15: [f]xsave,[f]xrstor,[v]{ld,st}mxcsr,clflush[opt],
+ * {rd,wr}{fs,gs}base,{s,l,m}fence.
+ * Why? They are all user-executable.
+ */
+static volatile u32 good_2byte_insns[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
+ W(0x30, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
+ W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
+ W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
+#undef W
+
+/*
+ * opcodes we may need to refine support for:
+ *
+ * 0f - 2-byte instructions: For many of these instructions, the validity
+ * depends on the prefix and/or the reg field. On such instructions, we
+ * just consider the opcode combination valid if it corresponds to any
+ * valid instruction.
+ *
+ * 8f - Group 1 - only reg = 0 is OK
+ * c6-c7 - Group 11 - only reg = 0 is OK
+ * d9-df - fpu insns with some illegal encodings
+ * f2, f3 - repnz, repz prefixes. These are also the first byte for
+ * certain floating-point instructions, such as addsd.
+ *
+ * fe - Group 4 - only reg = 0 or 1 is OK
+ * ff - Group 5 - only reg = 0-6 is OK
+ *
+ * others -- Do we need to support these?
+ *
+ * 0f - (floating-point?) prefetch instructions
+ * 07, 17, 1f - pop es, pop ss, pop ds
+ * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
+ * but 64 and 65 (fs: and gs:) seem to be used, so we support them
+ * 67 - addr16 prefix
+ * ce - into
+ * f0 - lock prefix
+ */
+
+/*
+ * TODO:
+ * - Where necessary, examine the modrm byte and allow only valid instructions
+ * in the different Groups and fpu instructions.
+ */
+
+static bool is_prefix_bad(struct insn *insn)
+{
+ insn_byte_t p;
+ int i;
+
+ for_each_insn_prefix(insn, i, p) {
+ insn_attr_t attr;
+
+ attr = inat_get_opcode_attribute(p);
+ switch (attr) {
+ case INAT_MAKE_PREFIX(INAT_PFX_ES):
+ case INAT_MAKE_PREFIX(INAT_PFX_CS):
+ case INAT_MAKE_PREFIX(INAT_PFX_DS):
+ case INAT_MAKE_PREFIX(INAT_PFX_SS):
+ case INAT_MAKE_PREFIX(INAT_PFX_LOCK):
+ return true;
+ }
+ }
+ return false;
+}
+
+static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64)
+{
+ u32 volatile *good_insns;
+
+ insn_init(insn, auprobe->insn, sizeof(auprobe->insn), x86_64);
+ /* has the side-effect of processing the entire instruction */
+ insn_get_length(insn);
+ if (!insn_complete(insn))
+ return -ENOEXEC;
+
+ if (is_prefix_bad(insn))
+ return -ENOTSUPP;
+
+ /* We should not singlestep on the exception masking instructions */
+ if (insn_masking_exception(insn))
+ return -ENOTSUPP;
+
+ if (x86_64)
+ good_insns = good_insns_64;
+ else
+ good_insns = good_insns_32;
+
+ if (test_bit(OPCODE1(insn), (unsigned long *)good_insns))
+ return 0;
+
+ if (insn->opcode.nbytes == 2) {
+ if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
+ return 0;
+ }
+
+ return -ENOTSUPP;
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * If arch_uprobe->insn doesn't use rip-relative addressing, return
+ * immediately. Otherwise, rewrite the instruction so that it accesses
+ * its memory operand indirectly through a scratch register. Set
+ * defparam->fixups accordingly. (The contents of the scratch register
+ * will be saved before we single-step the modified instruction,
+ * and restored afterward).
+ *
+ * We do this because a rip-relative instruction can access only a
+ * relatively small area (+/- 2 GB from the instruction), and the XOL
+ * area typically lies beyond that area. At least for instructions
+ * that store to memory, we can't execute the original instruction
+ * and "fix things up" later, because the misdirected store could be
+ * disastrous.
+ *
+ * Some useful facts about rip-relative instructions:
+ *
+ * - There's always a modrm byte with bit layout "00 reg 101".
+ * - There's never a SIB byte.
+ * - The displacement is always 4 bytes.
+ * - REX.B=1 bit in REX prefix, which normally extends r/m field,
+ * has no effect on rip-relative mode. It doesn't make modrm byte
+ * with r/m=101 refer to register 1101 = R13.
+ */
+static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ u8 *cursor;
+ u8 reg;
+ u8 reg2;
+
+ if (!insn_rip_relative(insn))
+ return;
+
+ /*
+ * insn_rip_relative() would have decoded rex_prefix, vex_prefix, modrm.
+ * Clear REX.b bit (extension of MODRM.rm field):
+ * we want to encode low numbered reg, not r8+.
+ */
+ if (insn->rex_prefix.nbytes) {
+ cursor = auprobe->insn + insn_offset_rex_prefix(insn);
+ /* REX byte has 0100wrxb layout, clearing REX.b bit */
+ *cursor &= 0xfe;
+ }
+ /*
+ * Similar treatment for VEX3/EVEX prefix.
+ * TODO: add XOP treatment when insn decoder supports them
+ */
+ if (insn->vex_prefix.nbytes >= 3) {
+ /*
+ * vex2: c5 rvvvvLpp (has no b bit)
+ * vex3/xop: c4/8f rxbmmmmm wvvvvLpp
+ * evex: 62 rxbR00mm wvvvv1pp zllBVaaa
+ * Setting VEX3.b (setting because it has inverted meaning).
+ * Setting EVEX.x since (in non-SIB encoding) EVEX.x
+ * is the 4th bit of MODRM.rm, and needs the same treatment.
+ * For VEX3-encoded insns, VEX3.x value has no effect in
+ * non-SIB encoding, the change is superfluous but harmless.
+ */
+ cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1;
+ *cursor |= 0x60;
+ }
+
+ /*
+ * Convert from rip-relative addressing to register-relative addressing
+ * via a scratch register.
+ *
+ * This is tricky since there are insns with modrm byte
+ * which also use registers not encoded in modrm byte:
+ * [i]div/[i]mul: implicitly use dx:ax
+ * shift ops: implicitly use cx
+ * cmpxchg: implicitly uses ax
+ * cmpxchg8/16b: implicitly uses dx:ax and bx:cx
+ * Encoding: 0f c7/1 modrm
+ * The code below thinks that reg=1 (cx), chooses si as scratch.
+ * mulx: implicitly uses dx: mulx r/m,r1,r2 does r1:r2 = dx * r/m.
+ * First appeared in Haswell (BMI2 insn). It is vex-encoded.
+ * Example where none of bx,cx,dx can be used as scratch reg:
+ * c4 e2 63 f6 0d disp32 mulx disp32(%rip),%ebx,%ecx
+ * [v]pcmpistri: implicitly uses cx, xmm0
+ * [v]pcmpistrm: implicitly uses xmm0
+ * [v]pcmpestri: implicitly uses ax, dx, cx, xmm0
+ * [v]pcmpestrm: implicitly uses ax, dx, xmm0
+ * Evil SSE4.2 string comparison ops from hell.
+ * maskmovq/[v]maskmovdqu: implicitly uses (ds:rdi) as destination.
+ * Encoding: 0f f7 modrm, 66 0f f7 modrm, vex-encoded: c5 f9 f7 modrm.
+ * Store op1, byte-masked by op2 msb's in each byte, to (ds:rdi).
+ * AMD says it has no 3-operand form (vex.vvvv must be 1111)
+ * and that it can have only register operands, not mem
+ * (its modrm byte must have mode=11).
+ * If these restrictions will ever be lifted,
+ * we'll need code to prevent selection of di as scratch reg!
+ *
+ * Summary: I don't know any insns with modrm byte which
+ * use SI register implicitly. DI register is used only
+ * by one insn (maskmovq) and BX register is used
+ * only by one too (cmpxchg8b).
+ * BP is stack-segment based (may be a problem?).
+ * AX, DX, CX are off-limits (many implicit users).
+ * SP is unusable (it's stack pointer - think about "pop mem";
+ * also, rsp+disp32 needs sib encoding -> insn length change).
+ */
+
+ reg = MODRM_REG(insn); /* Fetch modrm.reg */
+ reg2 = 0xff; /* Fetch vex.vvvv */
+ if (insn->vex_prefix.nbytes)
+ reg2 = insn->vex_prefix.bytes[2];
+ /*
+ * TODO: add XOP vvvv reading.
+ *
+ * vex.vvvv field is in bits 6-3, bits are inverted.
+ * But in 32-bit mode, high-order bit may be ignored.
+ * Therefore, let's consider only 3 low-order bits.
+ */
+ reg2 = ((reg2 >> 3) & 0x7) ^ 0x7;
+ /*
+ * Register numbering is ax,cx,dx,bx, sp,bp,si,di, r8..r15.
+ *
+ * Choose scratch reg. Order is important: must not select bx
+ * if we can use si (cmpxchg8b case!)
+ */
+ if (reg != 6 && reg2 != 6) {
+ reg2 = 6;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_SI;
+ } else if (reg != 7 && reg2 != 7) {
+ reg2 = 7;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_DI;
+ /* TODO (paranoia): force maskmovq to not use di */
+ } else {
+ reg2 = 3;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_BX;
+ }
+ /*
+ * Point cursor at the modrm byte. The next 4 bytes are the
+ * displacement. Beyond the displacement, for some instructions,
+ * is the immediate operand.
+ */
+ cursor = auprobe->insn + insn_offset_modrm(insn);
+ /*
+ * Change modrm from "00 reg 101" to "10 reg reg2". Example:
+ * 89 05 disp32 mov %eax,disp32(%rip) becomes
+ * 89 86 disp32 mov %eax,disp32(%rsi)
+ */
+ *cursor = 0x80 | (reg << 3) | reg2;
+}
+
+static inline unsigned long *
+scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_SI)
+ return &regs->si;
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_DI)
+ return &regs->di;
+ return &regs->bx;
+}
+
+/*
+ * If we're emulating a rip-relative instruction, save the contents
+ * of the scratch register and store the target address in that register.
+ */
+static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) {
+ struct uprobe_task *utask = current->utask;
+ unsigned long *sr = scratch_reg(auprobe, regs);
+
+ utask->autask.saved_scratch_register = *sr;
+ *sr = utask->vaddr + auprobe->defparam.ilen;
+ }
+}
+
+static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) {
+ struct uprobe_task *utask = current->utask;
+ unsigned long *sr = scratch_reg(auprobe, regs);
+
+ *sr = utask->autask.saved_scratch_register;
+ }
+}
+#else /* 32-bit: */
+/*
+ * No RIP-relative addressing on 32-bit
+ */
+static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
+{
+}
+static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+}
+static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+}
+#endif /* CONFIG_X86_64 */
+
+struct uprobe_xol_ops {
+ bool (*emulate)(struct arch_uprobe *, struct pt_regs *);
+ int (*pre_xol)(struct arch_uprobe *, struct pt_regs *);
+ int (*post_xol)(struct arch_uprobe *, struct pt_regs *);
+ void (*abort)(struct arch_uprobe *, struct pt_regs *);
+};
+
+static inline int sizeof_long(struct pt_regs *regs)
+{
+ /*
+ * Check registers for mode as in_xxx_syscall() does not apply here.
+ */
+ return user_64bit_mode(regs) ? 8 : 4;
+}
+
+static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ riprel_pre_xol(auprobe, regs);
+ return 0;
+}
+
+static int emulate_push_stack(struct pt_regs *regs, unsigned long val)
+{
+ unsigned long new_sp = regs->sp - sizeof_long(regs);
+
+ if (copy_to_user((void __user *)new_sp, &val, sizeof_long(regs)))
+ return -EFAULT;
+
+ regs->sp = new_sp;
+ return 0;
+}
+
+/*
+ * We have to fix things up as follows:
+ *
+ * Typically, the new ip is relative to the copied instruction. We need
+ * to make it relative to the original instruction (FIX_IP). Exceptions
+ * are return instructions and absolute or indirect jump or call instructions.
+ *
+ * If the single-stepped instruction was a call, the return address that
+ * is atop the stack is the address following the copied instruction. We
+ * need to make it the address following the original instruction (FIX_CALL).
+ *
+ * If the original instruction was a rip-relative instruction such as
+ * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
+ * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rsi)".
+ * We need to restore the contents of the scratch register
+ * (FIX_RIP_reg).
+ */
+static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ riprel_post_xol(auprobe, regs);
+ if (auprobe->defparam.fixups & UPROBE_FIX_IP) {
+ long correction = utask->vaddr - utask->xol_vaddr;
+ regs->ip += correction;
+ } else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
+ regs->sp += sizeof_long(regs); /* Pop incorrect return address */
+ if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen))
+ return -ERESTART;
+ }
+ /* popf; tell the caller to not touch TF */
+ if (auprobe->defparam.fixups & UPROBE_FIX_SETF)
+ utask->autask.saved_tf = true;
+
+ return 0;
+}
+
+static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ riprel_post_xol(auprobe, regs);
+}
+
+static const struct uprobe_xol_ops default_xol_ops = {
+ .pre_xol = default_pre_xol_op,
+ .post_xol = default_post_xol_op,
+ .abort = default_abort_op,
+};
+
+static bool branch_is_call(struct arch_uprobe *auprobe)
+{
+ return auprobe->branch.opc1 == 0xe8;
+}
+
+#define CASE_COND \
+ COND(70, 71, XF(OF)) \
+ COND(72, 73, XF(CF)) \
+ COND(74, 75, XF(ZF)) \
+ COND(78, 79, XF(SF)) \
+ COND(7a, 7b, XF(PF)) \
+ COND(76, 77, XF(CF) || XF(ZF)) \
+ COND(7c, 7d, XF(SF) != XF(OF)) \
+ COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF))
+
+#define COND(op_y, op_n, expr) \
+ case 0x ## op_y: DO((expr) != 0) \
+ case 0x ## op_n: DO((expr) == 0)
+
+#define XF(xf) (!!(flags & X86_EFLAGS_ ## xf))
+
+static bool is_cond_jmp_opcode(u8 opcode)
+{
+ switch (opcode) {
+ #define DO(expr) \
+ return true;
+ CASE_COND
+ #undef DO
+
+ default:
+ return false;
+ }
+}
+
+static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ unsigned long flags = regs->flags;
+
+ switch (auprobe->branch.opc1) {
+ #define DO(expr) \
+ return expr;
+ CASE_COND
+ #undef DO
+
+ default: /* not a conditional jmp */
+ return true;
+ }
+}
+
+#undef XF
+#undef COND
+#undef CASE_COND
+
+static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ unsigned long new_ip = regs->ip += auprobe->branch.ilen;
+ unsigned long offs = (long)auprobe->branch.offs;
+
+ if (branch_is_call(auprobe)) {
+ /*
+ * If it fails we execute this (mangled, see the comment in
+ * branch_clear_offset) insn out-of-line. In the likely case
+ * this should trigger the trap, and the probed application
+ * should die or restart the same insn after it handles the
+ * signal, arch_uprobe_post_xol() won't be even called.
+ *
+ * But there is corner case, see the comment in ->post_xol().
+ */
+ if (emulate_push_stack(regs, new_ip))
+ return false;
+ } else if (!check_jmp_cond(auprobe, regs)) {
+ offs = 0;
+ }
+
+ regs->ip = new_ip + offs;
+ return true;
+}
+
+static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ unsigned long *src_ptr = (void *)regs + auprobe->push.reg_offset;
+
+ if (emulate_push_stack(regs, *src_ptr))
+ return false;
+ regs->ip += auprobe->push.ilen;
+ return true;
+}
+
+static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ BUG_ON(!branch_is_call(auprobe));
+ /*
+ * We can only get here if branch_emulate_op() failed to push the ret
+ * address _and_ another thread expanded our stack before the (mangled)
+ * "call" insn was executed out-of-line. Just restore ->sp and restart.
+ * We could also restore ->ip and try to call branch_emulate_op() again.
+ */
+ regs->sp += sizeof_long(regs);
+ return -ERESTART;
+}
+
+static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ /*
+ * Turn this insn into "call 1f; 1:", this is what we will execute
+ * out-of-line if ->emulate() fails. We only need this to generate
+ * a trap, so that the probed task receives the correct signal with
+ * the properly filled siginfo.
+ *
+ * But see the comment in ->post_xol(), in the unlikely case it can
+ * succeed. So we need to ensure that the new ->ip can not fall into
+ * the non-canonical area and trigger #GP.
+ *
+ * We could turn it into (say) "pushf", but then we would need to
+ * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte
+ * of ->insn[] for set_orig_insn().
+ */
+ memset(auprobe->insn + insn_offset_immediate(insn),
+ 0, insn->immediate.nbytes);
+}
+
+static const struct uprobe_xol_ops branch_xol_ops = {
+ .emulate = branch_emulate_op,
+ .post_xol = branch_post_xol_op,
+};
+
+static const struct uprobe_xol_ops push_xol_ops = {
+ .emulate = push_emulate_op,
+};
+
+/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
+static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ u8 opc1 = OPCODE1(insn);
+ insn_byte_t p;
+ int i;
+
+ switch (opc1) {
+ case 0xeb: /* jmp 8 */
+ case 0xe9: /* jmp 32 */
+ case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */
+ break;
+
+ case 0xe8: /* call relative */
+ branch_clear_offset(auprobe, insn);
+ break;
+
+ case 0x0f:
+ if (insn->opcode.nbytes != 2)
+ return -ENOSYS;
+ /*
+ * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches
+ * OPCODE1() of the "short" jmp which checks the same condition.
+ */
+ opc1 = OPCODE2(insn) - 0x10;
+ default:
+ if (!is_cond_jmp_opcode(opc1))
+ return -ENOSYS;
+ }
+
+ /*
+ * 16-bit overrides such as CALLW (66 e8 nn nn) are not supported.
+ * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix.
+ * No one uses these insns, reject any branch insns with such prefix.
+ */
+ for_each_insn_prefix(insn, i, p) {
+ if (p == 0x66)
+ return -ENOTSUPP;
+ }
+
+ auprobe->branch.opc1 = opc1;
+ auprobe->branch.ilen = insn->length;
+ auprobe->branch.offs = insn->immediate.value;
+
+ auprobe->ops = &branch_xol_ops;
+ return 0;
+}
+
+/* Returns -ENOSYS if push_xol_ops doesn't handle this insn */
+static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ u8 opc1 = OPCODE1(insn), reg_offset = 0;
+
+ if (opc1 < 0x50 || opc1 > 0x57)
+ return -ENOSYS;
+
+ if (insn->length > 2)
+ return -ENOSYS;
+ if (insn->length == 2) {
+ /* only support rex_prefix 0x41 (x64 only) */
+#ifdef CONFIG_X86_64
+ if (insn->rex_prefix.nbytes != 1 ||
+ insn->rex_prefix.bytes[0] != 0x41)
+ return -ENOSYS;
+
+ switch (opc1) {
+ case 0x50:
+ reg_offset = offsetof(struct pt_regs, r8);
+ break;
+ case 0x51:
+ reg_offset = offsetof(struct pt_regs, r9);
+ break;
+ case 0x52:
+ reg_offset = offsetof(struct pt_regs, r10);
+ break;
+ case 0x53:
+ reg_offset = offsetof(struct pt_regs, r11);
+ break;
+ case 0x54:
+ reg_offset = offsetof(struct pt_regs, r12);
+ break;
+ case 0x55:
+ reg_offset = offsetof(struct pt_regs, r13);
+ break;
+ case 0x56:
+ reg_offset = offsetof(struct pt_regs, r14);
+ break;
+ case 0x57:
+ reg_offset = offsetof(struct pt_regs, r15);
+ break;
+ }
+#else
+ return -ENOSYS;
+#endif
+ } else {
+ switch (opc1) {
+ case 0x50:
+ reg_offset = offsetof(struct pt_regs, ax);
+ break;
+ case 0x51:
+ reg_offset = offsetof(struct pt_regs, cx);
+ break;
+ case 0x52:
+ reg_offset = offsetof(struct pt_regs, dx);
+ break;
+ case 0x53:
+ reg_offset = offsetof(struct pt_regs, bx);
+ break;
+ case 0x54:
+ reg_offset = offsetof(struct pt_regs, sp);
+ break;
+ case 0x55:
+ reg_offset = offsetof(struct pt_regs, bp);
+ break;
+ case 0x56:
+ reg_offset = offsetof(struct pt_regs, si);
+ break;
+ case 0x57:
+ reg_offset = offsetof(struct pt_regs, di);
+ break;
+ }
+ }
+
+ auprobe->push.reg_offset = reg_offset;
+ auprobe->push.ilen = insn->length;
+ auprobe->ops = &push_xol_ops;
+ return 0;
+}
+
+/**
+ * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
+ * @mm: the probed address space.
+ * @arch_uprobe: the probepoint information.
+ * @addr: virtual address at which to install the probepoint
+ * Return 0 on success or a -ve number on error.
+ */
+int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
+{
+ struct insn insn;
+ u8 fix_ip_or_call = UPROBE_FIX_IP;
+ int ret;
+
+ ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm));
+ if (ret)
+ return ret;
+
+ ret = branch_setup_xol_ops(auprobe, &insn);
+ if (ret != -ENOSYS)
+ return ret;
+
+ ret = push_setup_xol_ops(auprobe, &insn);
+ if (ret != -ENOSYS)
+ return ret;
+
+ /*
+ * Figure out which fixups default_post_xol_op() will need to perform,
+ * and annotate defparam->fixups accordingly.
+ */
+ switch (OPCODE1(&insn)) {
+ case 0x9d: /* popf */
+ auprobe->defparam.fixups |= UPROBE_FIX_SETF;
+ break;
+ case 0xc3: /* ret or lret -- ip is correct */
+ case 0xcb:
+ case 0xc2:
+ case 0xca:
+ case 0xea: /* jmp absolute -- ip is correct */
+ fix_ip_or_call = 0;
+ break;
+ case 0x9a: /* call absolute - Fix return addr, not ip */
+ fix_ip_or_call = UPROBE_FIX_CALL;
+ break;
+ case 0xff:
+ switch (MODRM_REG(&insn)) {
+ case 2: case 3: /* call or lcall, indirect */
+ fix_ip_or_call = UPROBE_FIX_CALL;
+ break;
+ case 4: case 5: /* jmp or ljmp, indirect */
+ fix_ip_or_call = 0;
+ break;
+ }
+ /* fall through */
+ default:
+ riprel_analyze(auprobe, &insn);
+ }
+
+ auprobe->defparam.ilen = insn.length;
+ auprobe->defparam.fixups |= fix_ip_or_call;
+
+ auprobe->ops = &default_xol_ops;
+ return 0;
+}
+
+/*
+ * arch_uprobe_pre_xol - prepare to execute out of line.
+ * @auprobe: the probepoint information.
+ * @regs: reflects the saved user state of current task.
+ */
+int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (auprobe->ops->pre_xol) {
+ int err = auprobe->ops->pre_xol(auprobe, regs);
+ if (err)
+ return err;
+ }
+
+ regs->ip = utask->xol_vaddr;
+ utask->autask.saved_trap_nr = current->thread.trap_nr;
+ current->thread.trap_nr = UPROBE_TRAP_NR;
+
+ utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF);
+ regs->flags |= X86_EFLAGS_TF;
+ if (test_tsk_thread_flag(current, TIF_BLOCKSTEP))
+ set_task_blockstep(current, false);
+
+ return 0;
+}
+
+/*
+ * If xol insn itself traps and generates a signal(Say,
+ * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped
+ * instruction jumps back to its own address. It is assumed that anything
+ * like do_page_fault/do_trap/etc sets thread.trap_nr != -1.
+ *
+ * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr,
+ * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to
+ * UPROBE_TRAP_NR == -1 set by arch_uprobe_pre_xol().
+ */
+bool arch_uprobe_xol_was_trapped(struct task_struct *t)
+{
+ if (t->thread.trap_nr != UPROBE_TRAP_NR)
+ return true;
+
+ return false;
+}
+
+/*
+ * Called after single-stepping. To avoid the SMP problems that can
+ * occur when we temporarily put back the original opcode to
+ * single-step, we single-stepped a copy of the instruction.
+ *
+ * This function prepares to resume execution after the single-step.
+ */
+int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+ bool send_sigtrap = utask->autask.saved_tf;
+ int err = 0;
+
+ WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
+ current->thread.trap_nr = utask->autask.saved_trap_nr;
+
+ if (auprobe->ops->post_xol) {
+ err = auprobe->ops->post_xol(auprobe, regs);
+ if (err) {
+ /*
+ * Restore ->ip for restart or post mortem analysis.
+ * ->post_xol() must not return -ERESTART unless this
+ * is really possible.
+ */
+ regs->ip = utask->vaddr;
+ if (err == -ERESTART)
+ err = 0;
+ send_sigtrap = false;
+ }
+ }
+ /*
+ * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP
+ * so we can get an extra SIGTRAP if we do not clear TF. We need
+ * to examine the opcode to make it right.
+ */
+ if (send_sigtrap)
+ send_sig(SIGTRAP, current, 0);
+
+ if (!utask->autask.saved_tf)
+ regs->flags &= ~X86_EFLAGS_TF;
+
+ return err;
+}
+
+/* callback routine for handling exceptions. */
+int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data)
+{
+ struct die_args *args = data;
+ struct pt_regs *regs = args->regs;
+ int ret = NOTIFY_DONE;
+
+ /* We are only interested in userspace traps */
+ if (regs && !user_mode(regs))
+ return NOTIFY_DONE;
+
+ switch (val) {
+ case DIE_INT3:
+ if (uprobe_pre_sstep_notifier(regs))
+ ret = NOTIFY_STOP;
+
+ break;
+
+ case DIE_DEBUG:
+ if (uprobe_post_sstep_notifier(regs))
+ ret = NOTIFY_STOP;
+
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * This function gets called when XOL instruction either gets trapped or
+ * the thread has a fatal signal. Reset the instruction pointer to its
+ * probed address for the potential restart or for post mortem analysis.
+ */
+void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (auprobe->ops->abort)
+ auprobe->ops->abort(auprobe, regs);
+
+ current->thread.trap_nr = utask->autask.saved_trap_nr;
+ regs->ip = utask->vaddr;
+ /* clear TF if it was set by us in arch_uprobe_pre_xol() */
+ if (!utask->autask.saved_tf)
+ regs->flags &= ~X86_EFLAGS_TF;
+}
+
+static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ if (auprobe->ops->emulate)
+ return auprobe->ops->emulate(auprobe, regs);
+ return false;
+}
+
+bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ bool ret = __skip_sstep(auprobe, regs);
+ if (ret && (regs->flags & X86_EFLAGS_TF))
+ send_sig(SIGTRAP, current, 0);
+ return ret;
+}
+
+unsigned long
+arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
+{
+ int rasize = sizeof_long(regs), nleft;
+ unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */
+
+ if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize))
+ return -1;
+
+ /* check whether address has been already hijacked */
+ if (orig_ret_vaddr == trampoline_vaddr)
+ return orig_ret_vaddr;
+
+ nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
+ if (likely(!nleft))
+ return orig_ret_vaddr;
+
+ if (nleft != rasize) {
+ pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n",
+ current->pid, regs->sp, regs->ip);
+
+ force_sig(SIGSEGV, current);
+ }
+
+ return -1;
+}
+
+bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
+ struct pt_regs *regs)
+{
+ if (ctx == RP_CHECK_CALL) /* sp was just decremented by "call" insn */
+ return regs->sp < ret->stack;
+ else
+ return regs->sp <= ret->stack;
+}
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
new file mode 100644
index 000000000..3d3c2f71f
--- /dev/null
+++ b/arch/x86/kernel/verify_cpu.S
@@ -0,0 +1,142 @@
+/*
+ *
+ * verify_cpu.S - Code for cpu long mode and SSE verification. This
+ * code has been borrowed from boot/setup.S and was introduced by
+ * Andi Kleen.
+ *
+ * Copyright (c) 2007 Andi Kleen (ak@suse.de)
+ * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
+ * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
+ * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com)
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ *
+ * This is a common code for verification whether CPU supports
+ * long mode and SSE or not. It is not called directly instead this
+ * file is included at various places and compiled in that context.
+ * This file is expected to run in 32bit code. Currently:
+ *
+ * arch/x86/boot/compressed/head_64.S: Boot cpu verification
+ * arch/x86/kernel/trampoline_64.S: secondary processor verification
+ * arch/x86/kernel/head_32.S: processor startup
+ *
+ * verify_cpu, returns the status of longmode and SSE in register %eax.
+ * 0: Success 1: Failure
+ *
+ * On Intel, the XD_DISABLE flag will be cleared as a side-effect.
+ *
+ * The caller needs to check for the error code and take the action
+ * appropriately. Either display a message or halt.
+ */
+
+#include <asm/cpufeatures.h>
+#include <asm/msr-index.h>
+
+ENTRY(verify_cpu)
+ pushf # Save caller passed flags
+ push $0 # Kill any dangerous flags
+ popf
+
+#ifndef __x86_64__
+ pushfl # standard way to check for cpuid
+ popl %eax
+ movl %eax,%ebx
+ xorl $0x200000,%eax
+ pushl %eax
+ popfl
+ pushfl
+ popl %eax
+ cmpl %eax,%ebx
+ jz .Lverify_cpu_no_longmode # cpu has no cpuid
+#endif
+
+ movl $0x0,%eax # See if cpuid 1 is implemented
+ cpuid
+ cmpl $0x1,%eax
+ jb .Lverify_cpu_no_longmode # no cpuid 1
+
+ xor %di,%di
+ cmpl $0x68747541,%ebx # AuthenticAMD
+ jnz .Lverify_cpu_noamd
+ cmpl $0x69746e65,%edx
+ jnz .Lverify_cpu_noamd
+ cmpl $0x444d4163,%ecx
+ jnz .Lverify_cpu_noamd
+ mov $1,%di # cpu is from AMD
+ jmp .Lverify_cpu_check
+
+.Lverify_cpu_noamd:
+ cmpl $0x756e6547,%ebx # GenuineIntel?
+ jnz .Lverify_cpu_check
+ cmpl $0x49656e69,%edx
+ jnz .Lverify_cpu_check
+ cmpl $0x6c65746e,%ecx
+ jnz .Lverify_cpu_check
+
+ # only call IA32_MISC_ENABLE when:
+ # family > 6 || (family == 6 && model >= 0xd)
+ movl $0x1, %eax # check CPU family and model
+ cpuid
+ movl %eax, %ecx
+
+ andl $0x0ff00f00, %eax # mask family and extended family
+ shrl $8, %eax
+ cmpl $6, %eax
+ ja .Lverify_cpu_clear_xd # family > 6, ok
+ jb .Lverify_cpu_check # family < 6, skip
+
+ andl $0x000f00f0, %ecx # mask model and extended model
+ shrl $4, %ecx
+ cmpl $0xd, %ecx
+ jb .Lverify_cpu_check # family == 6, model < 0xd, skip
+
+.Lverify_cpu_clear_xd:
+ movl $MSR_IA32_MISC_ENABLE, %ecx
+ rdmsr
+ btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE
+ jnc .Lverify_cpu_check # only write MSR if bit was changed
+ wrmsr
+
+.Lverify_cpu_check:
+ movl $0x1,%eax # Does the cpu have what it takes
+ cpuid
+ andl $REQUIRED_MASK0,%edx
+ xorl $REQUIRED_MASK0,%edx
+ jnz .Lverify_cpu_no_longmode
+
+ movl $0x80000000,%eax # See if extended cpuid is implemented
+ cpuid
+ cmpl $0x80000001,%eax
+ jb .Lverify_cpu_no_longmode # no extended cpuid
+
+ movl $0x80000001,%eax # Does the cpu have what it takes
+ cpuid
+ andl $REQUIRED_MASK1,%edx
+ xorl $REQUIRED_MASK1,%edx
+ jnz .Lverify_cpu_no_longmode
+
+.Lverify_cpu_sse_test:
+ movl $1,%eax
+ cpuid
+ andl $SSE_MASK,%edx
+ cmpl $SSE_MASK,%edx
+ je .Lverify_cpu_sse_ok
+ test %di,%di
+ jz .Lverify_cpu_no_longmode # only try to force SSE on AMD
+ movl $MSR_K7_HWCR,%ecx
+ rdmsr
+ btr $15,%eax # enable SSE
+ wrmsr
+ xor %di,%di # don't loop
+ jmp .Lverify_cpu_sse_test # try again
+
+.Lverify_cpu_no_longmode:
+ popf # Restore caller passed flags
+ movl $1,%eax
+ ret
+.Lverify_cpu_sse_ok:
+ popf # Restore caller passed flags
+ xorl %eax, %eax
+ ret
+ENDPROC(verify_cpu)
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
new file mode 100644
index 000000000..1c03e4aa6
--- /dev/null
+++ b/arch/x86/kernel/vm86_32.c
@@ -0,0 +1,874 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * 29 dec 2001 - Fixed oopses caused by unchecked access to the vm86
+ * stack - Manfred Spraul <manfred@colorfullife.com>
+ *
+ * 22 mar 2002 - Manfred detected the stackfaults, but didn't handle
+ * them correctly. Now the emulation will be in a
+ * consistent state after stackfaults - Kasper Dupont
+ * <kasperd@daimi.au.dk>
+ *
+ * 22 mar 2002 - Added missing clear_IF in set_vflags_* Kasper Dupont
+ * <kasperd@daimi.au.dk>
+ *
+ * ?? ??? 2002 - Fixed premature returns from handle_vm86_fault
+ * caused by Kasper Dupont's changes - Stas Sergeev
+ *
+ * 4 apr 2002 - Fixed CHECK_IF_IN_TRAP broken by Stas' changes.
+ * Kasper Dupont <kasperd@daimi.au.dk>
+ *
+ * 9 apr 2002 - Changed syntax of macros in handle_vm86_fault.
+ * Kasper Dupont <kasperd@daimi.au.dk>
+ *
+ * 9 apr 2002 - Changed stack access macros to jump to a label
+ * instead of returning to userspace. This simplifies
+ * do_int, and is needed by handle_vm6_fault. Kasper
+ * Dupont <kasperd@daimi.au.dk>
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/highmem.h>
+#include <linux/ptrace.h>
+#include <linux/audit.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+
+#include <linux/uaccess.h>
+#include <asm/io.h>
+#include <asm/tlbflush.h>
+#include <asm/irq.h>
+#include <asm/traps.h>
+#include <asm/vm86.h>
+#include <asm/switch_to.h>
+
+/*
+ * Known problems:
+ *
+ * Interrupt handling is not guaranteed:
+ * - a real x86 will disable all interrupts for one instruction
+ * after a "mov ss,xx" to make stack handling atomic even without
+ * the 'lss' instruction. We can't guarantee this in v86 mode,
+ * as the next instruction might result in a page fault or similar.
+ * - a real x86 will have interrupts disabled for one instruction
+ * past the 'sti' that enables them. We don't bother with all the
+ * details yet.
+ *
+ * Let's hope these problems do not actually matter for anything.
+ */
+
+
+/*
+ * 8- and 16-bit register defines..
+ */
+#define AL(regs) (((unsigned char *)&((regs)->pt.ax))[0])
+#define AH(regs) (((unsigned char *)&((regs)->pt.ax))[1])
+#define IP(regs) (*(unsigned short *)&((regs)->pt.ip))
+#define SP(regs) (*(unsigned short *)&((regs)->pt.sp))
+
+/*
+ * virtual flags (16 and 32-bit versions)
+ */
+#define VFLAGS (*(unsigned short *)&(current->thread.vm86->veflags))
+#define VEFLAGS (current->thread.vm86->veflags)
+
+#define set_flags(X, new, mask) \
+((X) = ((X) & ~(mask)) | ((new) & (mask)))
+
+#define SAFE_MASK (0xDD5)
+#define RETURN_MASK (0xDFF)
+
+void save_v86_state(struct kernel_vm86_regs *regs, int retval)
+{
+ struct task_struct *tsk = current;
+ struct vm86plus_struct __user *user;
+ struct vm86 *vm86 = current->thread.vm86;
+ long err = 0;
+
+ /*
+ * This gets called from entry.S with interrupts disabled, but
+ * from process context. Enable interrupts here, before trying
+ * to access user space.
+ */
+ local_irq_enable();
+
+ if (!vm86 || !vm86->user_vm86) {
+ pr_alert("no user_vm86: BAD\n");
+ do_exit(SIGSEGV);
+ }
+ set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask);
+ user = vm86->user_vm86;
+
+ if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ?
+ sizeof(struct vm86plus_struct) :
+ sizeof(struct vm86_struct))) {
+ pr_alert("could not access userspace vm86 info\n");
+ do_exit(SIGSEGV);
+ }
+
+ put_user_try {
+ put_user_ex(regs->pt.bx, &user->regs.ebx);
+ put_user_ex(regs->pt.cx, &user->regs.ecx);
+ put_user_ex(regs->pt.dx, &user->regs.edx);
+ put_user_ex(regs->pt.si, &user->regs.esi);
+ put_user_ex(regs->pt.di, &user->regs.edi);
+ put_user_ex(regs->pt.bp, &user->regs.ebp);
+ put_user_ex(regs->pt.ax, &user->regs.eax);
+ put_user_ex(regs->pt.ip, &user->regs.eip);
+ put_user_ex(regs->pt.cs, &user->regs.cs);
+ put_user_ex(regs->pt.flags, &user->regs.eflags);
+ put_user_ex(regs->pt.sp, &user->regs.esp);
+ put_user_ex(regs->pt.ss, &user->regs.ss);
+ put_user_ex(regs->es, &user->regs.es);
+ put_user_ex(regs->ds, &user->regs.ds);
+ put_user_ex(regs->fs, &user->regs.fs);
+ put_user_ex(regs->gs, &user->regs.gs);
+
+ put_user_ex(vm86->screen_bitmap, &user->screen_bitmap);
+ } put_user_catch(err);
+ if (err) {
+ pr_alert("could not access userspace vm86 info\n");
+ do_exit(SIGSEGV);
+ }
+
+ preempt_disable();
+ tsk->thread.sp0 = vm86->saved_sp0;
+ tsk->thread.sysenter_cs = __KERNEL_CS;
+ update_task_stack(tsk);
+ refresh_sysenter_cs(&tsk->thread);
+ vm86->saved_sp0 = 0;
+ preempt_enable();
+
+ memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));
+
+ lazy_load_gs(vm86->regs32.gs);
+
+ regs->pt.ax = retval;
+}
+
+static void mark_screen_rdonly(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ spinlock_t *ptl;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ int i;
+
+ down_write(&mm->mmap_sem);
+ pgd = pgd_offset(mm, 0xA0000);
+ if (pgd_none_or_clear_bad(pgd))
+ goto out;
+ p4d = p4d_offset(pgd, 0xA0000);
+ if (p4d_none_or_clear_bad(p4d))
+ goto out;
+ pud = pud_offset(p4d, 0xA0000);
+ if (pud_none_or_clear_bad(pud))
+ goto out;
+ pmd = pmd_offset(pud, 0xA0000);
+
+ if (pmd_trans_huge(*pmd)) {
+ vma = find_vma(mm, 0xA0000);
+ split_huge_pmd(vma, pmd, 0xA0000);
+ }
+ if (pmd_none_or_clear_bad(pmd))
+ goto out;
+ pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
+ for (i = 0; i < 32; i++) {
+ if (pte_present(*pte))
+ set_pte(pte, pte_wrprotect(*pte));
+ pte++;
+ }
+ pte_unmap_unlock(pte, ptl);
+out:
+ up_write(&mm->mmap_sem);
+ flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL);
+}
+
+
+
+static int do_vm86_irq_handling(int subfunction, int irqnumber);
+static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);
+
+SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, user_vm86)
+{
+ return do_sys_vm86((struct vm86plus_struct __user *) user_vm86, false);
+}
+
+
+SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
+{
+ switch (cmd) {
+ case VM86_REQUEST_IRQ:
+ case VM86_FREE_IRQ:
+ case VM86_GET_IRQ_BITS:
+ case VM86_GET_AND_RESET_IRQ:
+ return do_vm86_irq_handling(cmd, (int)arg);
+ case VM86_PLUS_INSTALL_CHECK:
+ /*
+ * NOTE: on old vm86 stuff this will return the error
+ * from access_ok(), because the subfunction is
+ * interpreted as (invalid) address to vm86_struct.
+ * So the installation check works.
+ */
+ return 0;
+ }
+
+ /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
+ return do_sys_vm86((struct vm86plus_struct __user *) arg, true);
+}
+
+
+static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
+{
+ struct task_struct *tsk = current;
+ struct vm86 *vm86 = tsk->thread.vm86;
+ struct kernel_vm86_regs vm86regs;
+ struct pt_regs *regs = current_pt_regs();
+ unsigned long err = 0;
+
+ err = security_mmap_addr(0);
+ if (err) {
+ /*
+ * vm86 cannot virtualize the address space, so vm86 users
+ * need to manage the low 1MB themselves using mmap. Given
+ * that BIOS places important data in the first page, vm86
+ * is essentially useless if mmap_min_addr != 0. DOSEMU,
+ * for example, won't even bother trying to use vm86 if it
+ * can't map a page at virtual address 0.
+ *
+ * To reduce the available kernel attack surface, simply
+ * disallow vm86(old) for users who cannot mmap at va 0.
+ *
+ * The implementation of security_mmap_addr will allow
+ * suitably privileged users to map va 0 even if
+ * vm.mmap_min_addr is set above 0, and we want this
+ * behavior for vm86 as well, as it ensures that legacy
+ * tools like vbetool will not fail just because of
+ * vm.mmap_min_addr.
+ */
+ pr_info_once("Denied a call to vm86(old) from %s[%d] (uid: %d). Set the vm.mmap_min_addr sysctl to 0 and/or adjust LSM mmap_min_addr policy to enable vm86 if you are using a vm86-based DOS emulator.\n",
+ current->comm, task_pid_nr(current),
+ from_kuid_munged(&init_user_ns, current_uid()));
+ return -EPERM;
+ }
+
+ if (!vm86) {
+ if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL)))
+ return -ENOMEM;
+ tsk->thread.vm86 = vm86;
+ }
+ if (vm86->saved_sp0)
+ return -EPERM;
+
+ if (!access_ok(VERIFY_READ, user_vm86, plus ?
+ sizeof(struct vm86_struct) :
+ sizeof(struct vm86plus_struct)))
+ return -EFAULT;
+
+ memset(&vm86regs, 0, sizeof(vm86regs));
+ get_user_try {
+ unsigned short seg;
+ get_user_ex(vm86regs.pt.bx, &user_vm86->regs.ebx);
+ get_user_ex(vm86regs.pt.cx, &user_vm86->regs.ecx);
+ get_user_ex(vm86regs.pt.dx, &user_vm86->regs.edx);
+ get_user_ex(vm86regs.pt.si, &user_vm86->regs.esi);
+ get_user_ex(vm86regs.pt.di, &user_vm86->regs.edi);
+ get_user_ex(vm86regs.pt.bp, &user_vm86->regs.ebp);
+ get_user_ex(vm86regs.pt.ax, &user_vm86->regs.eax);
+ get_user_ex(vm86regs.pt.ip, &user_vm86->regs.eip);
+ get_user_ex(seg, &user_vm86->regs.cs);
+ vm86regs.pt.cs = seg;
+ get_user_ex(vm86regs.pt.flags, &user_vm86->regs.eflags);
+ get_user_ex(vm86regs.pt.sp, &user_vm86->regs.esp);
+ get_user_ex(seg, &user_vm86->regs.ss);
+ vm86regs.pt.ss = seg;
+ get_user_ex(vm86regs.es, &user_vm86->regs.es);
+ get_user_ex(vm86regs.ds, &user_vm86->regs.ds);
+ get_user_ex(vm86regs.fs, &user_vm86->regs.fs);
+ get_user_ex(vm86regs.gs, &user_vm86->regs.gs);
+
+ get_user_ex(vm86->flags, &user_vm86->flags);
+ get_user_ex(vm86->screen_bitmap, &user_vm86->screen_bitmap);
+ get_user_ex(vm86->cpu_type, &user_vm86->cpu_type);
+ } get_user_catch(err);
+ if (err)
+ return err;
+
+ if (copy_from_user(&vm86->int_revectored,
+ &user_vm86->int_revectored,
+ sizeof(struct revectored_struct)))
+ return -EFAULT;
+ if (copy_from_user(&vm86->int21_revectored,
+ &user_vm86->int21_revectored,
+ sizeof(struct revectored_struct)))
+ return -EFAULT;
+ if (plus) {
+ if (copy_from_user(&vm86->vm86plus, &user_vm86->vm86plus,
+ sizeof(struct vm86plus_info_struct)))
+ return -EFAULT;
+ vm86->vm86plus.is_vm86pus = 1;
+ } else
+ memset(&vm86->vm86plus, 0,
+ sizeof(struct vm86plus_info_struct));
+
+ memcpy(&vm86->regs32, regs, sizeof(struct pt_regs));
+ vm86->user_vm86 = user_vm86;
+
+/*
+ * The flags register is also special: we cannot trust that the user
+ * has set it up safely, so this makes sure interrupt etc flags are
+ * inherited from protected mode.
+ */
+ VEFLAGS = vm86regs.pt.flags;
+ vm86regs.pt.flags &= SAFE_MASK;
+ vm86regs.pt.flags |= regs->flags & ~SAFE_MASK;
+ vm86regs.pt.flags |= X86_VM_MASK;
+
+ vm86regs.pt.orig_ax = regs->orig_ax;
+
+ switch (vm86->cpu_type) {
+ case CPU_286:
+ vm86->veflags_mask = 0;
+ break;
+ case CPU_386:
+ vm86->veflags_mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+ break;
+ case CPU_486:
+ vm86->veflags_mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+ break;
+ default:
+ vm86->veflags_mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+ break;
+ }
+
+/*
+ * Save old state
+ */
+ vm86->saved_sp0 = tsk->thread.sp0;
+ lazy_save_gs(vm86->regs32.gs);
+
+ /* make room for real-mode segments */
+ preempt_disable();
+ tsk->thread.sp0 += 16;
+
+ if (static_cpu_has(X86_FEATURE_SEP)) {
+ tsk->thread.sysenter_cs = 0;
+ refresh_sysenter_cs(&tsk->thread);
+ }
+
+ update_task_stack(tsk);
+ preempt_enable();
+
+ if (vm86->flags & VM86_SCREEN_BITMAP)
+ mark_screen_rdonly(tsk->mm);
+
+ memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
+ force_iret();
+ return regs->ax;
+}
+
+static inline void set_IF(struct kernel_vm86_regs *regs)
+{
+ VEFLAGS |= X86_EFLAGS_VIF;
+}
+
+static inline void clear_IF(struct kernel_vm86_regs *regs)
+{
+ VEFLAGS &= ~X86_EFLAGS_VIF;
+}
+
+static inline void clear_TF(struct kernel_vm86_regs *regs)
+{
+ regs->pt.flags &= ~X86_EFLAGS_TF;
+}
+
+static inline void clear_AC(struct kernel_vm86_regs *regs)
+{
+ regs->pt.flags &= ~X86_EFLAGS_AC;
+}
+
+/*
+ * It is correct to call set_IF(regs) from the set_vflags_*
+ * functions. However someone forgot to call clear_IF(regs)
+ * in the opposite case.
+ * After the command sequence CLI PUSHF STI POPF you should
+ * end up with interrupts disabled, but you ended up with
+ * interrupts enabled.
+ * ( I was testing my own changes, but the only bug I
+ * could find was in a function I had not changed. )
+ * [KD]
+ */
+
+static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs)
+{
+ set_flags(VEFLAGS, flags, current->thread.vm86->veflags_mask);
+ set_flags(regs->pt.flags, flags, SAFE_MASK);
+ if (flags & X86_EFLAGS_IF)
+ set_IF(regs);
+ else
+ clear_IF(regs);
+}
+
+static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs)
+{
+ set_flags(VFLAGS, flags, current->thread.vm86->veflags_mask);
+ set_flags(regs->pt.flags, flags, SAFE_MASK);
+ if (flags & X86_EFLAGS_IF)
+ set_IF(regs);
+ else
+ clear_IF(regs);
+}
+
+static inline unsigned long get_vflags(struct kernel_vm86_regs *regs)
+{
+ unsigned long flags = regs->pt.flags & RETURN_MASK;
+
+ if (VEFLAGS & X86_EFLAGS_VIF)
+ flags |= X86_EFLAGS_IF;
+ flags |= X86_EFLAGS_IOPL;
+ return flags | (VEFLAGS & current->thread.vm86->veflags_mask);
+}
+
+static inline int is_revectored(int nr, struct revectored_struct *bitmap)
+{
+ return test_bit(nr, bitmap->__map);
+}
+
+#define val_byte(val, n) (((__u8 *)&val)[n])
+
+#define pushb(base, ptr, val, err_label) \
+ do { \
+ __u8 __val = val; \
+ ptr--; \
+ if (put_user(__val, base + ptr) < 0) \
+ goto err_label; \
+ } while (0)
+
+#define pushw(base, ptr, val, err_label) \
+ do { \
+ __u16 __val = val; \
+ ptr--; \
+ if (put_user(val_byte(__val, 1), base + ptr) < 0) \
+ goto err_label; \
+ ptr--; \
+ if (put_user(val_byte(__val, 0), base + ptr) < 0) \
+ goto err_label; \
+ } while (0)
+
+#define pushl(base, ptr, val, err_label) \
+ do { \
+ __u32 __val = val; \
+ ptr--; \
+ if (put_user(val_byte(__val, 3), base + ptr) < 0) \
+ goto err_label; \
+ ptr--; \
+ if (put_user(val_byte(__val, 2), base + ptr) < 0) \
+ goto err_label; \
+ ptr--; \
+ if (put_user(val_byte(__val, 1), base + ptr) < 0) \
+ goto err_label; \
+ ptr--; \
+ if (put_user(val_byte(__val, 0), base + ptr) < 0) \
+ goto err_label; \
+ } while (0)
+
+#define popb(base, ptr, err_label) \
+ ({ \
+ __u8 __res; \
+ if (get_user(__res, base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ __res; \
+ })
+
+#define popw(base, ptr, err_label) \
+ ({ \
+ __u16 __res; \
+ if (get_user(val_byte(__res, 0), base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ if (get_user(val_byte(__res, 1), base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ __res; \
+ })
+
+#define popl(base, ptr, err_label) \
+ ({ \
+ __u32 __res; \
+ if (get_user(val_byte(__res, 0), base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ if (get_user(val_byte(__res, 1), base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ if (get_user(val_byte(__res, 2), base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ if (get_user(val_byte(__res, 3), base + ptr) < 0) \
+ goto err_label; \
+ ptr++; \
+ __res; \
+ })
+
+/* There are so many possible reasons for this function to return
+ * VM86_INTx, so adding another doesn't bother me. We can expect
+ * userspace programs to be able to handle it. (Getting a problem
+ * in userspace is always better than an Oops anyway.) [KD]
+ */
+static void do_int(struct kernel_vm86_regs *regs, int i,
+ unsigned char __user *ssp, unsigned short sp)
+{
+ unsigned long __user *intr_ptr;
+ unsigned long segoffs;
+ struct vm86 *vm86 = current->thread.vm86;
+
+ if (regs->pt.cs == BIOSSEG)
+ goto cannot_handle;
+ if (is_revectored(i, &vm86->int_revectored))
+ goto cannot_handle;
+ if (i == 0x21 && is_revectored(AH(regs), &vm86->int21_revectored))
+ goto cannot_handle;
+ intr_ptr = (unsigned long __user *) (i << 2);
+ if (get_user(segoffs, intr_ptr))
+ goto cannot_handle;
+ if ((segoffs >> 16) == BIOSSEG)
+ goto cannot_handle;
+ pushw(ssp, sp, get_vflags(regs), cannot_handle);
+ pushw(ssp, sp, regs->pt.cs, cannot_handle);
+ pushw(ssp, sp, IP(regs), cannot_handle);
+ regs->pt.cs = segoffs >> 16;
+ SP(regs) -= 6;
+ IP(regs) = segoffs & 0xffff;
+ clear_TF(regs);
+ clear_IF(regs);
+ clear_AC(regs);
+ return;
+
+cannot_handle:
+ save_v86_state(regs, VM86_INTx + (i << 8));
+}
+
+int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
+{
+ struct vm86 *vm86 = current->thread.vm86;
+
+ if (vm86->vm86plus.is_vm86pus) {
+ if ((trapno == 3) || (trapno == 1)) {
+ save_v86_state(regs, VM86_TRAP + (trapno << 8));
+ return 0;
+ }
+ do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
+ return 0;
+ }
+ if (trapno != 1)
+ return 1; /* we let this handle by the calling routine */
+ current->thread.trap_nr = trapno;
+ current->thread.error_code = error_code;
+ force_sig(SIGTRAP, current);
+ return 0;
+}
+
+void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
+{
+ unsigned char opcode;
+ unsigned char __user *csp;
+ unsigned char __user *ssp;
+ unsigned short ip, sp, orig_flags;
+ int data32, pref_done;
+ struct vm86plus_info_struct *vmpi = &current->thread.vm86->vm86plus;
+
+#define CHECK_IF_IN_TRAP \
+ if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \
+ newflags |= X86_EFLAGS_TF
+
+ orig_flags = *(unsigned short *)&regs->pt.flags;
+
+ csp = (unsigned char __user *) (regs->pt.cs << 4);
+ ssp = (unsigned char __user *) (regs->pt.ss << 4);
+ sp = SP(regs);
+ ip = IP(regs);
+
+ data32 = 0;
+ pref_done = 0;
+ do {
+ switch (opcode = popb(csp, ip, simulate_sigsegv)) {
+ case 0x66: /* 32-bit data */ data32 = 1; break;
+ case 0x67: /* 32-bit address */ break;
+ case 0x2e: /* CS */ break;
+ case 0x3e: /* DS */ break;
+ case 0x26: /* ES */ break;
+ case 0x36: /* SS */ break;
+ case 0x65: /* GS */ break;
+ case 0x64: /* FS */ break;
+ case 0xf2: /* repnz */ break;
+ case 0xf3: /* rep */ break;
+ default: pref_done = 1;
+ }
+ } while (!pref_done);
+
+ switch (opcode) {
+
+ /* pushf */
+ case 0x9c:
+ if (data32) {
+ pushl(ssp, sp, get_vflags(regs), simulate_sigsegv);
+ SP(regs) -= 4;
+ } else {
+ pushw(ssp, sp, get_vflags(regs), simulate_sigsegv);
+ SP(regs) -= 2;
+ }
+ IP(regs) = ip;
+ goto vm86_fault_return;
+
+ /* popf */
+ case 0x9d:
+ {
+ unsigned long newflags;
+ if (data32) {
+ newflags = popl(ssp, sp, simulate_sigsegv);
+ SP(regs) += 4;
+ } else {
+ newflags = popw(ssp, sp, simulate_sigsegv);
+ SP(regs) += 2;
+ }
+ IP(regs) = ip;
+ CHECK_IF_IN_TRAP;
+ if (data32)
+ set_vflags_long(newflags, regs);
+ else
+ set_vflags_short(newflags, regs);
+
+ goto check_vip;
+ }
+
+ /* int xx */
+ case 0xcd: {
+ int intno = popb(csp, ip, simulate_sigsegv);
+ IP(regs) = ip;
+ if (vmpi->vm86dbg_active) {
+ if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3]) {
+ save_v86_state(regs, VM86_INTx + (intno << 8));
+ return;
+ }
+ }
+ do_int(regs, intno, ssp, sp);
+ return;
+ }
+
+ /* iret */
+ case 0xcf:
+ {
+ unsigned long newip;
+ unsigned long newcs;
+ unsigned long newflags;
+ if (data32) {
+ newip = popl(ssp, sp, simulate_sigsegv);
+ newcs = popl(ssp, sp, simulate_sigsegv);
+ newflags = popl(ssp, sp, simulate_sigsegv);
+ SP(regs) += 12;
+ } else {
+ newip = popw(ssp, sp, simulate_sigsegv);
+ newcs = popw(ssp, sp, simulate_sigsegv);
+ newflags = popw(ssp, sp, simulate_sigsegv);
+ SP(regs) += 6;
+ }
+ IP(regs) = newip;
+ regs->pt.cs = newcs;
+ CHECK_IF_IN_TRAP;
+ if (data32) {
+ set_vflags_long(newflags, regs);
+ } else {
+ set_vflags_short(newflags, regs);
+ }
+ goto check_vip;
+ }
+
+ /* cli */
+ case 0xfa:
+ IP(regs) = ip;
+ clear_IF(regs);
+ goto vm86_fault_return;
+
+ /* sti */
+ /*
+ * Damn. This is incorrect: the 'sti' instruction should actually
+ * enable interrupts after the /next/ instruction. Not good.
+ *
+ * Probably needs some horsing around with the TF flag. Aiee..
+ */
+ case 0xfb:
+ IP(regs) = ip;
+ set_IF(regs);
+ goto check_vip;
+
+ default:
+ save_v86_state(regs, VM86_UNKNOWN);
+ }
+
+ return;
+
+check_vip:
+ if ((VEFLAGS & (X86_EFLAGS_VIP | X86_EFLAGS_VIF)) ==
+ (X86_EFLAGS_VIP | X86_EFLAGS_VIF)) {
+ save_v86_state(regs, VM86_STI);
+ return;
+ }
+
+vm86_fault_return:
+ if (vmpi->force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) {
+ save_v86_state(regs, VM86_PICRETURN);
+ return;
+ }
+ if (orig_flags & X86_EFLAGS_TF)
+ handle_vm86_trap(regs, 0, X86_TRAP_DB);
+ return;
+
+simulate_sigsegv:
+ /* FIXME: After a long discussion with Stas we finally
+ * agreed, that this is wrong. Here we should
+ * really send a SIGSEGV to the user program.
+ * But how do we create the correct context? We
+ * are inside a general protection fault handler
+ * and has just returned from a page fault handler.
+ * The correct context for the signal handler
+ * should be a mixture of the two, but how do we
+ * get the information? [KD]
+ */
+ save_v86_state(regs, VM86_UNKNOWN);
+}
+
+/* ---------------- vm86 special IRQ passing stuff ----------------- */
+
+#define VM86_IRQNAME "vm86irq"
+
+static struct vm86_irqs {
+ struct task_struct *tsk;
+ int sig;
+} vm86_irqs[16];
+
+static DEFINE_SPINLOCK(irqbits_lock);
+static int irqbits;
+
+#define ALLOWED_SIGS (1 /* 0 = don't send a signal */ \
+ | (1 << SIGUSR1) | (1 << SIGUSR2) | (1 << SIGIO) | (1 << SIGURG) \
+ | (1 << SIGUNUSED))
+
+static irqreturn_t irq_handler(int intno, void *dev_id)
+{
+ int irq_bit;
+ unsigned long flags;
+
+ spin_lock_irqsave(&irqbits_lock, flags);
+ irq_bit = 1 << intno;
+ if ((irqbits & irq_bit) || !vm86_irqs[intno].tsk)
+ goto out;
+ irqbits |= irq_bit;
+ if (vm86_irqs[intno].sig)
+ send_sig(vm86_irqs[intno].sig, vm86_irqs[intno].tsk, 1);
+ /*
+ * IRQ will be re-enabled when user asks for the irq (whether
+ * polling or as a result of the signal)
+ */
+ disable_irq_nosync(intno);
+ spin_unlock_irqrestore(&irqbits_lock, flags);
+ return IRQ_HANDLED;
+
+out:
+ spin_unlock_irqrestore(&irqbits_lock, flags);
+ return IRQ_NONE;
+}
+
+static inline void free_vm86_irq(int irqnumber)
+{
+ unsigned long flags;
+
+ free_irq(irqnumber, NULL);
+ vm86_irqs[irqnumber].tsk = NULL;
+
+ spin_lock_irqsave(&irqbits_lock, flags);
+ irqbits &= ~(1 << irqnumber);
+ spin_unlock_irqrestore(&irqbits_lock, flags);
+}
+
+void release_vm86_irqs(struct task_struct *task)
+{
+ int i;
+ for (i = FIRST_VM86_IRQ ; i <= LAST_VM86_IRQ; i++)
+ if (vm86_irqs[i].tsk == task)
+ free_vm86_irq(i);
+}
+
+static inline int get_and_reset_irq(int irqnumber)
+{
+ int bit;
+ unsigned long flags;
+ int ret = 0;
+
+ if (invalid_vm86_irq(irqnumber)) return 0;
+ if (vm86_irqs[irqnumber].tsk != current) return 0;
+ spin_lock_irqsave(&irqbits_lock, flags);
+ bit = irqbits & (1 << irqnumber);
+ irqbits &= ~bit;
+ if (bit) {
+ enable_irq(irqnumber);
+ ret = 1;
+ }
+
+ spin_unlock_irqrestore(&irqbits_lock, flags);
+ return ret;
+}
+
+
+static int do_vm86_irq_handling(int subfunction, int irqnumber)
+{
+ int ret;
+ switch (subfunction) {
+ case VM86_GET_AND_RESET_IRQ: {
+ return get_and_reset_irq(irqnumber);
+ }
+ case VM86_GET_IRQ_BITS: {
+ return irqbits;
+ }
+ case VM86_REQUEST_IRQ: {
+ int sig = irqnumber >> 8;
+ int irq = irqnumber & 255;
+ if (!capable(CAP_SYS_ADMIN)) return -EPERM;
+ if (!((1 << sig) & ALLOWED_SIGS)) return -EPERM;
+ if (invalid_vm86_irq(irq)) return -EPERM;
+ if (vm86_irqs[irq].tsk) return -EPERM;
+ ret = request_irq(irq, &irq_handler, 0, VM86_IRQNAME, NULL);
+ if (ret) return ret;
+ vm86_irqs[irq].sig = sig;
+ vm86_irqs[irq].tsk = current;
+ return irq;
+ }
+ case VM86_FREE_IRQ: {
+ if (invalid_vm86_irq(irqnumber)) return -EPERM;
+ if (!vm86_irqs[irqnumber].tsk) return 0;
+ if (vm86_irqs[irqnumber].tsk != current) return -EPERM;
+ free_vm86_irq(irqnumber);
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
new file mode 100644
index 000000000..34c0652ca
--- /dev/null
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -0,0 +1,438 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ld script for the x86 kernel
+ *
+ * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ * Modernisation, unification and other changes and fixes:
+ * Copyright (C) 2007-2009 Sam Ravnborg <sam@ravnborg.org>
+ *
+ *
+ * Don't define absolute symbols until and unless you know that symbol
+ * value is should remain constant even if kernel image is relocated
+ * at run time. Absolute symbols are not relocated. If symbol value should
+ * change if kernel is relocated, make the symbol section relative and
+ * put it inside the section definition.
+ */
+
+#ifdef CONFIG_X86_32
+#define LOAD_OFFSET __PAGE_OFFSET
+#else
+#define LOAD_OFFSET __START_KERNEL_map
+#endif
+
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/page_types.h>
+#include <asm/orc_lookup.h>
+#include <asm/cache.h>
+#include <asm/boot.h>
+
+#undef i386 /* in case the preprocessor is a 32bit one */
+
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+
+#ifdef CONFIG_X86_32
+OUTPUT_ARCH(i386)
+ENTRY(phys_startup_32)
+#else
+OUTPUT_ARCH(i386:x86-64)
+ENTRY(phys_startup_64)
+#endif
+
+jiffies = jiffies_64;
+
+#if defined(CONFIG_X86_64)
+/*
+ * On 64-bit, align RODATA to 2MB so we retain large page mappings for
+ * boundaries spanning kernel text, rodata and data sections.
+ *
+ * However, kernel identity mappings will have different RWX permissions
+ * to the pages mapping to text and to the pages padding (which are freed) the
+ * text section. Hence kernel identity mappings will be broken to smaller
+ * pages. For 64-bit, kernel text and kernel identity mappings are different,
+ * so we can enable protection checks as well as retain 2MB large page
+ * mappings for kernel text.
+ */
+#define X86_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
+
+#define X86_ALIGN_RODATA_END \
+ . = ALIGN(HPAGE_SIZE); \
+ __end_rodata_hpage_align = .; \
+ __end_rodata_aligned = .;
+
+#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
+#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE);
+
+/*
+ * This section contains data which will be mapped as decrypted. Memory
+ * encryption operates on a page basis. Make this section PMD-aligned
+ * to avoid splitting the pages while mapping the section early.
+ *
+ * Note: We use a separate section so that only this section gets
+ * decrypted to avoid exposing more than we wish.
+ */
+#define BSS_DECRYPTED \
+ . = ALIGN(PMD_SIZE); \
+ __start_bss_decrypted = .; \
+ *(.bss..decrypted); \
+ . = ALIGN(PAGE_SIZE); \
+ __start_bss_decrypted_unused = .; \
+ . = ALIGN(PMD_SIZE); \
+ __end_bss_decrypted = .; \
+
+#else
+
+#define X86_ALIGN_RODATA_BEGIN
+#define X86_ALIGN_RODATA_END \
+ . = ALIGN(PAGE_SIZE); \
+ __end_rodata_aligned = .;
+
+#define ALIGN_ENTRY_TEXT_BEGIN
+#define ALIGN_ENTRY_TEXT_END
+#define BSS_DECRYPTED
+
+#endif
+
+PHDRS {
+ text PT_LOAD FLAGS(5); /* R_E */
+ data PT_LOAD FLAGS(6); /* RW_ */
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_SMP
+ percpu PT_LOAD FLAGS(6); /* RW_ */
+#endif
+ init PT_LOAD FLAGS(7); /* RWE */
+#endif
+ note PT_NOTE FLAGS(0); /* ___ */
+}
+
+SECTIONS
+{
+#ifdef CONFIG_X86_32
+ . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
+ phys_startup_32 = ABSOLUTE(startup_32 - LOAD_OFFSET);
+#else
+ . = __START_KERNEL;
+ phys_startup_64 = ABSOLUTE(startup_64 - LOAD_OFFSET);
+#endif
+
+ /* Text and read-only data */
+ .text : AT(ADDR(.text) - LOAD_OFFSET) {
+ _text = .;
+ _stext = .;
+ /* bootstrapping code */
+ HEAD_TEXT
+ TEXT_TEXT
+ SCHED_TEXT
+ CPUIDLE_TEXT
+ LOCK_TEXT
+ KPROBES_TEXT
+ ALIGN_ENTRY_TEXT_BEGIN
+ ENTRY_TEXT
+ IRQENTRY_TEXT
+ ALIGN_ENTRY_TEXT_END
+ SOFTIRQENTRY_TEXT
+ *(.fixup)
+ *(.gnu.warning)
+
+#ifdef CONFIG_X86_64
+ . = ALIGN(PAGE_SIZE);
+ __entry_trampoline_start = .;
+ _entry_trampoline = .;
+ *(.entry_trampoline)
+ . = ALIGN(PAGE_SIZE);
+ __entry_trampoline_end = .;
+ ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
+#endif
+
+#ifdef CONFIG_RETPOLINE
+ __indirect_thunk_start = .;
+ *(.text.__x86.indirect_thunk)
+ __indirect_thunk_end = .;
+#endif
+
+ /* End of text section */
+ _etext = .;
+ } :text = 0x9090
+
+ NOTES :text :note
+
+ EXCEPTION_TABLE(16) :text = 0x9090
+
+ /* .text should occupy whole number of pages */
+ . = ALIGN(PAGE_SIZE);
+ X86_ALIGN_RODATA_BEGIN
+ RO_DATA(PAGE_SIZE)
+ X86_ALIGN_RODATA_END
+
+ /* Data */
+ .data : AT(ADDR(.data) - LOAD_OFFSET) {
+ /* Start of data section */
+ _sdata = .;
+
+ /* init_task */
+ INIT_TASK_DATA(THREAD_SIZE)
+
+#ifdef CONFIG_X86_32
+ /* 32 bit has nosave before _edata */
+ NOSAVE_DATA
+#endif
+
+ PAGE_ALIGNED_DATA(PAGE_SIZE)
+
+ CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
+
+ DATA_DATA
+ CONSTRUCTORS
+
+ /* rarely changed data like cpu maps */
+ READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
+
+ /* End of data section */
+ _edata = .;
+ } :data
+
+ BUG_TABLE
+
+ ORC_UNWIND_TABLE
+
+ . = ALIGN(PAGE_SIZE);
+ __vvar_page = .;
+
+ .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) {
+ /* work around gold bug 13023 */
+ __vvar_beginning_hack = .;
+
+ /* Place all vvars at the offsets in asm/vvar.h. */
+#define EMIT_VVAR(name, offset) \
+ . = __vvar_beginning_hack + offset; \
+ *(.vvar_ ## name)
+#define __VVAR_KERNEL_LDS
+#include <asm/vvar.h>
+#undef __VVAR_KERNEL_LDS
+#undef EMIT_VVAR
+
+ /*
+ * Pad the rest of the page with zeros. Otherwise the loader
+ * can leave garbage here.
+ */
+ . = __vvar_beginning_hack + PAGE_SIZE;
+ } :data
+
+ . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
+
+ /* Init code and data - will be freed after init */
+ . = ALIGN(PAGE_SIZE);
+ .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
+ __init_begin = .; /* paired with __init_end */
+ }
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
+ /*
+ * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
+ * output PHDR, so the next output section - .init.text - should
+ * start another segment - init.
+ */
+ PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu)
+ ASSERT(SIZEOF(.data..percpu) < CONFIG_PHYSICAL_START,
+ "per-CPU data too large - increase CONFIG_PHYSICAL_START")
+#endif
+
+ INIT_TEXT_SECTION(PAGE_SIZE)
+#ifdef CONFIG_X86_64
+ :init
+#endif
+
+ /*
+ * Section for code used exclusively before alternatives are run. All
+ * references to such code must be patched out by alternatives, normally
+ * by using X86_FEATURE_ALWAYS CPU feature bit.
+ *
+ * See static_cpu_has() for an example.
+ */
+ .altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) {
+ *(.altinstr_aux)
+ }
+
+ INIT_DATA_SECTION(16)
+
+ .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+ __x86_cpu_dev_start = .;
+ *(.x86_cpu_dev.init)
+ __x86_cpu_dev_end = .;
+ }
+
+#ifdef CONFIG_X86_INTEL_MID
+ .x86_intel_mid_dev.init : AT(ADDR(.x86_intel_mid_dev.init) - \
+ LOAD_OFFSET) {
+ __x86_intel_mid_dev_start = .;
+ *(.x86_intel_mid_dev.init)
+ __x86_intel_mid_dev_end = .;
+ }
+#endif
+
+ /*
+ * start address and size of operations which during runtime
+ * can be patched with virtualization friendly instructions or
+ * baremetal native ones. Think page table operations.
+ * Details in paravirt_types.h
+ */
+ . = ALIGN(8);
+ .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+ __parainstructions = .;
+ *(.parainstructions)
+ __parainstructions_end = .;
+ }
+
+ /*
+ * struct alt_inst entries. From the header (alternative.h):
+ * "Alternative instructions for different CPU types or capabilities"
+ * Think locking instructions on spinlocks.
+ */
+ . = ALIGN(8);
+ .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+ __alt_instructions = .;
+ *(.altinstructions)
+ __alt_instructions_end = .;
+ }
+
+ /*
+ * And here are the replacement instructions. The linker sticks
+ * them as binary blobs. The .altinstructions has enough data to
+ * get the address and the length of them to patch the kernel safely.
+ */
+ .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+ *(.altinstr_replacement)
+ }
+
+ /*
+ * struct iommu_table_entry entries are injected in this section.
+ * It is an array of IOMMUs which during run time gets sorted depending
+ * on its dependency order. After rootfs_initcall is complete
+ * this section can be safely removed.
+ */
+ .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) {
+ __iommu_table = .;
+ *(.iommu_table)
+ __iommu_table_end = .;
+ }
+
+ . = ALIGN(8);
+ .apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) {
+ __apicdrivers = .;
+ *(.apicdrivers);
+ __apicdrivers_end = .;
+ }
+
+ . = ALIGN(8);
+ /*
+ * .exit.text is discard at runtime, not link time, to deal with
+ * references from .altinstructions and .eh_frame
+ */
+ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
+ EXIT_TEXT
+ }
+
+ .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
+ EXIT_DATA
+ }
+
+#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
+ PERCPU_SECTION(INTERNODE_CACHE_BYTES)
+#endif
+
+ . = ALIGN(PAGE_SIZE);
+
+ /* freed after init ends here */
+ .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) {
+ __init_end = .;
+ }
+
+ /*
+ * smp_locks might be freed after init
+ * start/end must be page aligned
+ */
+ . = ALIGN(PAGE_SIZE);
+ .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+ __smp_locks = .;
+ *(.smp_locks)
+ . = ALIGN(PAGE_SIZE);
+ __smp_locks_end = .;
+ }
+
+#ifdef CONFIG_X86_64
+ .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+ NOSAVE_DATA
+ }
+#endif
+
+ /* BSS */
+ . = ALIGN(PAGE_SIZE);
+ .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+ __bss_start = .;
+ *(.bss..page_aligned)
+ . = ALIGN(PAGE_SIZE);
+ *(BSS_MAIN)
+ BSS_DECRYPTED
+ . = ALIGN(PAGE_SIZE);
+ __bss_stop = .;
+ }
+
+ . = ALIGN(PAGE_SIZE);
+ .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+ __brk_base = .;
+ . += 64 * 1024; /* 64k alignment slop space */
+ *(.brk_reservation) /* areas brk users have reserved */
+ __brk_limit = .;
+ }
+
+ . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */
+ _end = .;
+
+ STABS_DEBUG
+ DWARF_DEBUG
+
+ /* Sections to be discarded */
+ DISCARDS
+ /DISCARD/ : {
+ *(.eh_frame)
+ }
+}
+
+
+#ifdef CONFIG_X86_32
+/*
+ * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
+ */
+. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+ "kernel image bigger than KERNEL_IMAGE_SIZE");
+#else
+/*
+ * Per-cpu symbols which need to be offset from __per_cpu_load
+ * for the boot processor.
+ */
+#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load
+INIT_PER_CPU(gdt_page);
+INIT_PER_CPU(irq_stack_union);
+
+/*
+ * Build-time check on the image size:
+ */
+. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
+ "kernel image bigger than KERNEL_IMAGE_SIZE");
+
+#ifdef CONFIG_SMP
+. = ASSERT((irq_stack_union == 0),
+ "irq_stack_union is not at start of per-cpu area");
+#endif
+
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_KEXEC_CORE
+#include <asm/kexec.h>
+
+. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+ "kexec control code size is too big");
+#endif
+
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
new file mode 100644
index 000000000..44685fb2a
--- /dev/null
+++ b/arch/x86/kernel/vsmp_64.c
@@ -0,0 +1,226 @@
+/*
+ * vSMPowered(tm) systems specific initialization
+ * Copyright (C) 2005 ScaleMP Inc.
+ *
+ * Use of this code is subject to the terms and conditions of the
+ * GNU general public license version 2. See "COPYING" or
+ * http://www.gnu.org/licenses/gpl.html
+ *
+ * Ravikiran Thirumalai <kiran@scalemp.com>,
+ * Shai Fultheim <shai@scalemp.com>
+ * Paravirt ops integration: Glauber de Oliveira Costa <gcosta@redhat.com>,
+ * Ravikiran Thirumalai <kiran@scalemp.com>
+ */
+
+#include <linux/init.h>
+#include <linux/pci_ids.h>
+#include <linux/pci_regs.h>
+#include <linux/smp.h>
+#include <linux/irq.h>
+
+#include <asm/apic.h>
+#include <asm/pci-direct.h>
+#include <asm/io.h>
+#include <asm/paravirt.h>
+#include <asm/setup.h>
+
+#define TOPOLOGY_REGISTER_OFFSET 0x10
+
+#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
+/*
+ * Interrupt control on vSMPowered systems:
+ * ~AC is a shadow of IF. If IF is 'on' AC should be 'off'
+ * and vice versa.
+ */
+
+asmlinkage __visible unsigned long vsmp_save_fl(void)
+{
+ unsigned long flags = native_save_fl();
+
+ if (!(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC))
+ flags &= ~X86_EFLAGS_IF;
+ return flags;
+}
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);
+
+__visible void vsmp_restore_fl(unsigned long flags)
+{
+ if (flags & X86_EFLAGS_IF)
+ flags &= ~X86_EFLAGS_AC;
+ else
+ flags |= X86_EFLAGS_AC;
+ native_restore_fl(flags);
+}
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);
+
+asmlinkage __visible void vsmp_irq_disable(void)
+{
+ unsigned long flags = native_save_fl();
+
+ native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
+}
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);
+
+asmlinkage __visible void vsmp_irq_enable(void)
+{
+ unsigned long flags = native_save_fl();
+
+ native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
+}
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable);
+
+static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf,
+ unsigned long addr, unsigned len)
+{
+ switch (type) {
+ case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
+ case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
+ case PARAVIRT_PATCH(pv_irq_ops.save_fl):
+ case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
+ return paravirt_patch_default(type, clobbers, ibuf, addr, len);
+ default:
+ return native_patch(type, clobbers, ibuf, addr, len);
+ }
+
+}
+
+static void __init set_vsmp_pv_ops(void)
+{
+ void __iomem *address;
+ unsigned int cap, ctl, cfg;
+
+ /* set vSMP magic bits to indicate vSMP capable kernel */
+ cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0);
+ address = early_ioremap(cfg, 8);
+ cap = readl(address);
+ ctl = readl(address + 4);
+ printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n",
+ cap, ctl);
+
+ /* If possible, let the vSMP foundation route the interrupt optimally */
+#ifdef CONFIG_SMP
+ if (cap & ctl & BIT(8)) {
+ ctl &= ~BIT(8);
+
+#ifdef CONFIG_PROC_FS
+ /* Don't let users change irq affinity via procfs */
+ no_irq_affinity = 1;
+#endif
+ }
+#endif
+
+ if (cap & ctl & (1 << 4)) {
+ /* Setup irq ops and turn on vSMP IRQ fastpath handling */
+ pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
+ pv_irq_ops.irq_enable = PV_CALLEE_SAVE(vsmp_irq_enable);
+ pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl);
+ pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl);
+ pv_init_ops.patch = vsmp_patch;
+ ctl &= ~(1 << 4);
+ }
+ writel(ctl, address + 4);
+ ctl = readl(address + 4);
+ pr_info("vSMP CTL: control set to:0x%08x\n", ctl);
+
+ early_iounmap(address, 8);
+}
+#else
+static void __init set_vsmp_pv_ops(void)
+{
+}
+#endif
+
+#ifdef CONFIG_PCI
+static int is_vsmp = -1;
+
+static void __init detect_vsmp_box(void)
+{
+ is_vsmp = 0;
+
+ if (!early_pci_allowed())
+ return;
+
+ /* Check if we are running on a ScaleMP vSMPowered box */
+ if (read_pci_config(0, 0x1f, 0, PCI_VENDOR_ID) ==
+ (PCI_VENDOR_ID_SCALEMP | (PCI_DEVICE_ID_SCALEMP_VSMP_CTL << 16)))
+ is_vsmp = 1;
+}
+
+static int is_vsmp_box(void)
+{
+ if (is_vsmp != -1)
+ return is_vsmp;
+ else {
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+}
+
+#else
+static void __init detect_vsmp_box(void)
+{
+}
+static int is_vsmp_box(void)
+{
+ return 0;
+}
+#endif
+
+static void __init vsmp_cap_cpus(void)
+{
+#if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP)
+ void __iomem *address;
+ unsigned int cfg, topology, node_shift, maxcpus;
+
+ /*
+ * CONFIG_X86_VSMP is not configured, so limit the number CPUs to the
+ * ones present in the first board, unless explicitly overridden by
+ * setup_max_cpus
+ */
+ if (setup_max_cpus != NR_CPUS)
+ return;
+
+ /* Read the vSMP Foundation topology register */
+ cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0);
+ address = early_ioremap(cfg + TOPOLOGY_REGISTER_OFFSET, 4);
+ if (WARN_ON(!address))
+ return;
+
+ topology = readl(address);
+ node_shift = (topology >> 16) & 0x7;
+ if (!node_shift)
+ /* The value 0 should be decoded as 8 */
+ node_shift = 8;
+ maxcpus = (topology & ((1 << node_shift) - 1)) + 1;
+
+ pr_info("vSMP CTL: Capping CPUs to %d (CONFIG_X86_VSMP is unset)\n",
+ maxcpus);
+ setup_max_cpus = maxcpus;
+ early_iounmap(address, 4);
+#endif
+}
+
+static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+ return hard_smp_processor_id() >> index_msb;
+}
+
+static void vsmp_apic_post_init(void)
+{
+ /* need to update phys_pkg_id */
+ apic->phys_pkg_id = apicid_phys_pkg_id;
+}
+
+void __init vsmp_init(void)
+{
+ detect_vsmp_box();
+ if (!is_vsmp_box())
+ return;
+
+ x86_platform.apic_post_init = vsmp_apic_post_init;
+
+ vsmp_cap_cpus();
+
+ set_vsmp_pv_ops();
+ return;
+}
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
new file mode 100644
index 000000000..2792b5573
--- /dev/null
+++ b/arch/x86/kernel/x86_init.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (C) 2009 Thomas Gleixner <tglx@linutronix.de>
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/export.h>
+#include <linux/pci.h>
+
+#include <asm/acpi.h>
+#include <asm/bios_ebda.h>
+#include <asm/paravirt.h>
+#include <asm/pci_x86.h>
+#include <asm/mpspec.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/e820/api.h>
+#include <asm/time.h>
+#include <asm/irq.h>
+#include <asm/io_apic.h>
+#include <asm/hpet.h>
+#include <asm/pat.h>
+#include <asm/tsc.h>
+#include <asm/iommu.h>
+#include <asm/mach_traps.h>
+
+void x86_init_noop(void) { }
+void __init x86_init_uint_noop(unsigned int unused) { }
+static int __init iommu_init_noop(void) { return 0; }
+static void iommu_shutdown_noop(void) { }
+static bool __init bool_x86_init_noop(void) { return false; }
+static void x86_op_int_noop(int cpu) { }
+static u64 u64_x86_init_noop(void) { return 0; }
+
+/*
+ * The platform setup functions are preset with the default functions
+ * for standard PC hardware.
+ */
+struct x86_init_ops x86_init __initdata = {
+
+ .resources = {
+ .probe_roms = probe_roms,
+ .reserve_resources = reserve_standard_io_resources,
+ .memory_setup = e820__memory_setup_default,
+ },
+
+ .mpparse = {
+ .mpc_record = x86_init_uint_noop,
+ .setup_ioapic_ids = x86_init_noop,
+ .mpc_apic_id = default_mpc_apic_id,
+ .smp_read_mpc_oem = default_smp_read_mpc_oem,
+ .mpc_oem_bus_info = default_mpc_oem_bus_info,
+ .find_smp_config = default_find_smp_config,
+ .get_smp_config = default_get_smp_config,
+ },
+
+ .irqs = {
+ .pre_vector_init = init_ISA_irqs,
+ .intr_init = native_init_IRQ,
+ .trap_init = x86_init_noop,
+ .intr_mode_init = apic_intr_mode_init
+ },
+
+ .oem = {
+ .arch_setup = x86_init_noop,
+ .banner = default_banner,
+ },
+
+ .paging = {
+ .pagetable_init = native_pagetable_init,
+ },
+
+ .timers = {
+ .setup_percpu_clockev = setup_boot_APIC_clock,
+ .timer_init = hpet_time_init,
+ .wallclock_init = x86_init_noop,
+ },
+
+ .iommu = {
+ .iommu_init = iommu_init_noop,
+ },
+
+ .pci = {
+ .init = x86_default_pci_init,
+ .init_irq = x86_default_pci_init_irq,
+ .fixup_irqs = x86_default_pci_fixup_irqs,
+ },
+
+ .hyper = {
+ .init_platform = x86_init_noop,
+ .guest_late_init = x86_init_noop,
+ .x2apic_available = bool_x86_init_noop,
+ .init_mem_mapping = x86_init_noop,
+ .init_after_bootmem = x86_init_noop,
+ },
+
+ .acpi = {
+ .get_root_pointer = u64_x86_init_noop,
+ .reduced_hw_early_init = acpi_generic_reduced_hw_init,
+ },
+};
+
+struct x86_cpuinit_ops x86_cpuinit = {
+ .early_percpu_clock_init = x86_init_noop,
+ .setup_percpu_clockev = setup_secondary_APIC_clock,
+};
+
+static void default_nmi_init(void) { };
+
+struct x86_platform_ops x86_platform __ro_after_init = {
+ .calibrate_cpu = native_calibrate_cpu_early,
+ .calibrate_tsc = native_calibrate_tsc,
+ .get_wallclock = mach_get_cmos_time,
+ .set_wallclock = mach_set_rtc_mmss,
+ .iommu_shutdown = iommu_shutdown_noop,
+ .is_untracked_pat_range = is_ISA_range,
+ .nmi_init = default_nmi_init,
+ .get_nmi_reason = default_get_nmi_reason,
+ .save_sched_clock_state = tsc_save_sched_clock_state,
+ .restore_sched_clock_state = tsc_restore_sched_clock_state,
+ .hyper.pin_vcpu = x86_op_int_noop,
+};
+
+EXPORT_SYMBOL_GPL(x86_platform);
+
+#if defined(CONFIG_PCI_MSI)
+struct x86_msi_ops x86_msi __ro_after_init = {
+ .setup_msi_irqs = native_setup_msi_irqs,
+ .teardown_msi_irq = native_teardown_msi_irq,
+ .teardown_msi_irqs = default_teardown_msi_irqs,
+ .restore_msi_irqs = default_restore_msi_irqs,
+};
+
+/* MSI arch specific hooks */
+int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ return x86_msi.setup_msi_irqs(dev, nvec, type);
+}
+
+void arch_teardown_msi_irqs(struct pci_dev *dev)
+{
+ x86_msi.teardown_msi_irqs(dev);
+}
+
+void arch_teardown_msi_irq(unsigned int irq)
+{
+ x86_msi.teardown_msi_irq(irq);
+}
+
+void arch_restore_msi_irqs(struct pci_dev *dev)
+{
+ x86_msi.restore_msi_irqs(dev);
+}
+#endif
+
+struct x86_apic_ops x86_apic_ops __ro_after_init = {
+ .io_apic_read = native_io_apic_read,
+ .restore = native_restore_boot_irq_mode,
+};
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
new file mode 100644
index 000000000..1bbec387d
--- /dev/null
+++ b/arch/x86/kvm/Kconfig
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# KVM configuration
+#
+
+source "virt/kvm/Kconfig"
+
+menuconfig VIRTUALIZATION
+ bool "Virtualization"
+ depends on HAVE_KVM || X86
+ default y
+ ---help---
+ Say Y here to get to see options for using your Linux host to run other
+ operating systems inside virtual machines (guests).
+ This option alone does not add any kernel code.
+
+ If you say N, all options in this submenu will be skipped and disabled.
+
+if VIRTUALIZATION
+
+config KVM
+ tristate "Kernel-based Virtual Machine (KVM) support"
+ depends on HAVE_KVM
+ depends on HIGH_RES_TIMERS
+ # for TASKSTATS/TASK_DELAY_ACCT:
+ depends on NET && MULTIUSER
+ depends on X86_LOCAL_APIC
+ select PREEMPT_NOTIFIERS
+ select MMU_NOTIFIER
+ select ANON_INODES
+ select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_IRQFD
+ select IRQ_BYPASS_MANAGER
+ select HAVE_KVM_IRQ_BYPASS
+ select HAVE_KVM_IRQ_ROUTING
+ select HAVE_KVM_EVENTFD
+ select KVM_ASYNC_PF
+ select USER_RETURN_NOTIFIER
+ select KVM_MMIO
+ select TASKSTATS
+ select TASK_DELAY_ACCT
+ select PERF_EVENTS
+ select HAVE_KVM_MSI
+ select HAVE_KVM_CPU_RELAX_INTERCEPT
+ select KVM_GENERIC_DIRTYLOG_READ_PROTECT
+ select KVM_VFIO
+ select SRCU
+ ---help---
+ Support hosting fully virtualized guest machines using hardware
+ virtualization extensions. You will need a fairly recent
+ processor equipped with virtualization extensions. You will also
+ need to select one or more of the processor modules below.
+
+ This module provides access to the hardware capabilities through
+ a character device node named /dev/kvm.
+
+ To compile this as a module, choose M here: the module
+ will be called kvm.
+
+ If unsure, say N.
+
+config KVM_INTEL
+ tristate "KVM for Intel processors support"
+ depends on KVM
+ # for perf_guest_get_msrs():
+ depends on CPU_SUP_INTEL
+ ---help---
+ Provides support for KVM on Intel processors equipped with the VT
+ extensions.
+
+ To compile this as a module, choose M here: the module
+ will be called kvm-intel.
+
+config KVM_AMD
+ tristate "KVM for AMD processors support"
+ depends on KVM
+ ---help---
+ Provides support for KVM on AMD processors equipped with the AMD-V
+ (SVM) extensions.
+
+ To compile this as a module, choose M here: the module
+ will be called kvm-amd.
+
+config KVM_AMD_SEV
+ def_bool y
+ bool "AMD Secure Encrypted Virtualization (SEV) support"
+ depends on KVM_AMD && X86_64
+ depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
+ ---help---
+ Provides support for launching Encrypted VMs on AMD processors.
+
+config KVM_MMU_AUDIT
+ bool "Audit KVM MMU"
+ depends on KVM && TRACEPOINTS
+ ---help---
+ This option adds a R/W kVM module parameter 'mmu_audit', which allows
+ auditing of KVM MMU events at runtime.
+
+# OK, it's a little counter-intuitive to do this, but it puts it neatly under
+# the virtualization menu.
+source drivers/vhost/Kconfig
+
+endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
new file mode 100644
index 000000000..dc4f2fdf5
--- /dev/null
+++ b/arch/x86/kvm/Makefile
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0
+
+ccflags-y += -Iarch/x86/kvm
+
+CFLAGS_x86.o := -I.
+CFLAGS_svm.o := -I.
+CFLAGS_vmx.o := -I.
+
+KVM := ../../../virt/kvm
+
+kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
+ $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
+kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
+
+kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
+ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
+ hyperv.o page_track.o debugfs.o
+
+kvm-intel-y += vmx.o pmu_intel.o
+kvm-amd-y += svm.o pmu_amd.o
+
+obj-$(CONFIG_KVM) += kvm.o
+obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
+obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
new file mode 100644
index 000000000..0489ffc3d
--- /dev/null
+++ b/arch/x86/kvm/cpuid.c
@@ -0,0 +1,983 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ * cpuid support routines
+ *
+ * derived from arch/x86/kvm/x86.c
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ * Copyright IBM Corporation, 2008
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/export.h>
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
+#include <linux/sched/stat.h>
+
+#include <asm/processor.h>
+#include <asm/user.h>
+#include <asm/fpu/xstate.h>
+#include "cpuid.h"
+#include "lapic.h"
+#include "mmu.h"
+#include "trace.h"
+#include "pmu.h"
+
+static u32 xstate_required_size(u64 xstate_bv, bool compacted)
+{
+ int feature_bit = 0;
+ u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+
+ xstate_bv &= XFEATURE_MASK_EXTEND;
+ while (xstate_bv) {
+ if (xstate_bv & 0x1) {
+ u32 eax, ebx, ecx, edx, offset;
+ cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx);
+ offset = compacted ? ret : ebx;
+ ret = max(ret, offset + eax);
+ }
+
+ xstate_bv >>= 1;
+ feature_bit++;
+ }
+
+ return ret;
+}
+
+bool kvm_mpx_supported(void)
+{
+ return ((host_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR))
+ && kvm_x86_ops->mpx_supported());
+}
+EXPORT_SYMBOL_GPL(kvm_mpx_supported);
+
+u64 kvm_supported_xcr0(void)
+{
+ u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0;
+
+ if (!kvm_mpx_supported())
+ xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
+
+ return xcr0;
+}
+
+#define F(x) bit(X86_FEATURE_##x)
+
+/* For scattered features from cpufeatures.h; we currently expose none */
+#define KF(x) bit(KVM_CPUID_BIT_##x)
+
+int kvm_update_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ if (!best)
+ return 0;
+
+ /* Update OSXSAVE bit */
+ if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1) {
+ best->ecx &= ~F(OSXSAVE);
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
+ best->ecx |= F(OSXSAVE);
+ }
+
+ best->edx &= ~F(APIC);
+ if (vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)
+ best->edx |= F(APIC);
+
+ if (apic) {
+ if (best->ecx & F(TSC_DEADLINE_TIMER))
+ apic->lapic_timer.timer_mode_mask = 3 << 17;
+ else
+ apic->lapic_timer.timer_mode_mask = 1 << 17;
+ }
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ if (best) {
+ /* Update OSPKE bit */
+ if (boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) {
+ best->ecx &= ~F(OSPKE);
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_PKE))
+ best->ecx |= F(OSPKE);
+ }
+ }
+
+ best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
+ if (!best) {
+ vcpu->arch.guest_supported_xcr0 = 0;
+ vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+ } else {
+ vcpu->arch.guest_supported_xcr0 =
+ (best->eax | ((u64)best->edx << 32)) &
+ kvm_supported_xcr0();
+ vcpu->arch.guest_xstate_size = best->ebx =
+ xstate_required_size(vcpu->arch.xcr0, false);
+ }
+
+ best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
+ if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
+ best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
+
+ /*
+ * The existing code assumes virtual address is 48-bit or 57-bit in the
+ * canonical address checks; exit if it is ever changed.
+ */
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+ if (best) {
+ int vaddr_bits = (best->eax & 0xff00) >> 8;
+
+ if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0)
+ return -EINVAL;
+ }
+
+ best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);
+ if (kvm_hlt_in_guest(vcpu->kvm) && best &&
+ (best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
+ best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
+
+ /* Update physical-address width */
+ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+ kvm_mmu_reset_context(vcpu);
+
+ kvm_pmu_refresh(vcpu);
+ return 0;
+}
+
+static int is_efer_nx(void)
+{
+ unsigned long long efer = 0;
+
+ rdmsrl_safe(MSR_EFER, &efer);
+ return efer & EFER_NX;
+}
+
+static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_cpuid_entry2 *e, *entry;
+
+ entry = NULL;
+ for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+ e = &vcpu->arch.cpuid_entries[i];
+ if (e->function == 0x80000001) {
+ entry = e;
+ break;
+ }
+ }
+ if (entry && (entry->edx & F(NX)) && !is_efer_nx()) {
+ entry->edx &= ~F(NX);
+ printk(KERN_INFO "kvm: guest NX capability removed\n");
+ }
+}
+
+int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
+ if (!best || best->eax < 0x80000008)
+ goto not_found;
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+ if (best)
+ return best->eax & 0xff;
+not_found:
+ return 36;
+}
+EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr);
+
+/* when an old userspace process fills a new kernel module */
+int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid *cpuid,
+ struct kvm_cpuid_entry __user *entries)
+{
+ int r, i;
+ struct kvm_cpuid_entry *cpuid_entries = NULL;
+
+ r = -E2BIG;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ goto out;
+ r = -ENOMEM;
+ if (cpuid->nent) {
+ cpuid_entries =
+ vmalloc(array_size(sizeof(struct kvm_cpuid_entry),
+ cpuid->nent));
+ if (!cpuid_entries)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(cpuid_entries, entries,
+ cpuid->nent * sizeof(struct kvm_cpuid_entry)))
+ goto out;
+ }
+ for (i = 0; i < cpuid->nent; i++) {
+ vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
+ vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
+ vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
+ vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
+ vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
+ vcpu->arch.cpuid_entries[i].index = 0;
+ vcpu->arch.cpuid_entries[i].flags = 0;
+ vcpu->arch.cpuid_entries[i].padding[0] = 0;
+ vcpu->arch.cpuid_entries[i].padding[1] = 0;
+ vcpu->arch.cpuid_entries[i].padding[2] = 0;
+ }
+ vcpu->arch.cpuid_nent = cpuid->nent;
+ cpuid_fix_nx_cap(vcpu);
+ kvm_apic_set_version(vcpu);
+ kvm_x86_ops->cpuid_update(vcpu);
+ r = kvm_update_cpuid(vcpu);
+
+out:
+ vfree(cpuid_entries);
+ return r;
+}
+
+int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ int r;
+
+ r = -E2BIG;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
+ cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
+ goto out;
+ vcpu->arch.cpuid_nent = cpuid->nent;
+ kvm_apic_set_version(vcpu);
+ kvm_x86_ops->cpuid_update(vcpu);
+ r = kvm_update_cpuid(vcpu);
+out:
+ return r;
+}
+
+int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ int r;
+
+ r = -E2BIG;
+ if (cpuid->nent < vcpu->arch.cpuid_nent)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
+ vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+ goto out;
+ return 0;
+
+out:
+ cpuid->nent = vcpu->arch.cpuid_nent;
+ return r;
+}
+
+static void cpuid_mask(u32 *word, int wordnum)
+{
+ *word &= boot_cpu_data.x86_capability[wordnum];
+}
+
+static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+ u32 index)
+{
+ entry->function = function;
+ entry->index = index;
+ cpuid_count(entry->function, entry->index,
+ &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+ entry->flags = 0;
+}
+
+static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
+ u32 func, u32 index, int *nent, int maxnent)
+{
+ switch (func) {
+ case 0:
+ entry->eax = 7;
+ ++*nent;
+ break;
+ case 1:
+ entry->ecx = F(MOVBE);
+ ++*nent;
+ break;
+ case 7:
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ if (index == 0)
+ entry->ecx = F(RDPID);
+ ++*nent;
+ default:
+ break;
+ }
+
+ entry->function = func;
+ entry->index = index;
+
+ return 0;
+}
+
+static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+ u32 index, int *nent, int maxnent)
+{
+ int r;
+ unsigned f_nx = is_efer_nx() ? F(NX) : 0;
+#ifdef CONFIG_X86_64
+ unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
+ ? F(GBPAGES) : 0;
+ unsigned f_lm = F(LM);
+#else
+ unsigned f_gbpages = 0;
+ unsigned f_lm = 0;
+#endif
+ unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
+ unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
+ unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0;
+ unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
+ unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
+ unsigned f_la57 = 0;
+
+ /* cpuid 1.edx */
+ const u32 kvm_cpuid_1_edx_x86_features =
+ F(FPU) | F(VME) | F(DE) | F(PSE) |
+ F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+ F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
+ F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+ F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) |
+ 0 /* Reserved, DS, ACPI */ | F(MMX) |
+ F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
+ 0 /* HTT, TM, Reserved, PBE */;
+ /* cpuid 0x80000001.edx */
+ const u32 kvm_cpuid_8000_0001_edx_x86_features =
+ F(FPU) | F(VME) | F(DE) | F(PSE) |
+ F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+ F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
+ F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+ F(PAT) | F(PSE36) | 0 /* Reserved */ |
+ f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
+ F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
+ 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
+ /* cpuid 1.ecx */
+ const u32 kvm_cpuid_1_ecx_x86_features =
+ /* NOTE: MONITOR (and MWAIT) are emulated as NOP,
+ * but *not* advertised to guests via CPUID ! */
+ F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
+ 0 /* DS-CPL, VMX, SMX, EST */ |
+ 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
+ F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
+ F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
+ F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
+ 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
+ F(F16C) | F(RDRAND);
+ /* cpuid 0x80000001.ecx */
+ const u32 kvm_cpuid_8000_0001_ecx_x86_features =
+ F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
+ F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
+ F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
+ 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
+ F(TOPOEXT) | F(PERFCTR_CORE);
+
+ /* cpuid 0x80000008.ebx */
+ const u32 kvm_cpuid_8000_0008_ebx_x86_features =
+ F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
+ F(AMD_SSB_NO) | F(AMD_STIBP);
+
+ /* cpuid 0xC0000001.edx */
+ const u32 kvm_cpuid_C000_0001_edx_x86_features =
+ F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
+ F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
+ F(PMM) | F(PMM_EN);
+
+ /* cpuid 7.0.ebx */
+ const u32 kvm_cpuid_7_0_ebx_x86_features =
+ F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
+ F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
+ F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
+ F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
+ F(SHA_NI) | F(AVX512BW) | F(AVX512VL);
+
+ /* cpuid 0xD.1.eax */
+ const u32 kvm_cpuid_D_1_eax_x86_features =
+ F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves;
+
+ /* cpuid 7.0.ecx*/
+ const u32 kvm_cpuid_7_0_ecx_x86_features =
+ F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
+ F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
+ F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
+ F(CLDEMOTE);
+
+ /* cpuid 7.0.edx*/
+ const u32 kvm_cpuid_7_0_edx_x86_features =
+ F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
+ F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
+ F(MD_CLEAR);
+
+ /* all calls to cpuid_count() should be made on the same cpu */
+ get_cpu();
+
+ r = -E2BIG;
+
+ if (WARN_ON(*nent >= maxnent))
+ goto out;
+
+ do_cpuid_1_ent(entry, function, index);
+ ++*nent;
+
+ switch (function) {
+ case 0:
+ entry->eax = min(entry->eax, (u32)0xd);
+ break;
+ case 1:
+ entry->edx &= kvm_cpuid_1_edx_x86_features;
+ cpuid_mask(&entry->edx, CPUID_1_EDX);
+ entry->ecx &= kvm_cpuid_1_ecx_x86_features;
+ cpuid_mask(&entry->ecx, CPUID_1_ECX);
+ /* we support x2apic emulation even if host does not support
+ * it since we emulate x2apic in software */
+ entry->ecx |= F(X2APIC);
+ break;
+ /* function 2 entries are STATEFUL. That is, repeated cpuid commands
+ * may return different values. This forces us to get_cpu() before
+ * issuing the first command, and also to emulate this annoying behavior
+ * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
+ case 2: {
+ int t, times = entry->eax & 0xff;
+
+ entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+ entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+ for (t = 1; t < times; ++t) {
+ if (*nent >= maxnent)
+ goto out;
+
+ do_cpuid_1_ent(&entry[t], function, 0);
+ entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+ ++*nent;
+ }
+ break;
+ }
+ /* function 4 has additional index. */
+ case 4: {
+ int i, cache_type;
+
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ /* read more entries until cache_type is zero */
+ for (i = 1; ; ++i) {
+ if (*nent >= maxnent)
+ goto out;
+
+ cache_type = entry[i - 1].eax & 0x1f;
+ if (!cache_type)
+ break;
+ do_cpuid_1_ent(&entry[i], function, i);
+ entry[i].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ ++*nent;
+ }
+ break;
+ }
+ case 6: /* Thermal management */
+ entry->eax = 0x4; /* allow ARAT */
+ entry->ebx = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
+ case 7: {
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ /* Mask ebx against host capability word 9 */
+ if (index == 0) {
+ entry->ebx &= kvm_cpuid_7_0_ebx_x86_features;
+ cpuid_mask(&entry->ebx, CPUID_7_0_EBX);
+ // TSC_ADJUST is emulated
+ entry->ebx |= F(TSC_ADJUST);
+ entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
+ f_la57 = entry->ecx & F(LA57);
+ cpuid_mask(&entry->ecx, CPUID_7_ECX);
+ /* Set LA57 based on hardware capability. */
+ entry->ecx |= f_la57;
+ entry->ecx |= f_umip;
+ /* PKU is not yet implemented for shadow paging. */
+ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
+ entry->ecx &= ~F(PKU);
+
+ entry->edx &= kvm_cpuid_7_0_edx_x86_features;
+ cpuid_mask(&entry->edx, CPUID_7_EDX);
+ if (boot_cpu_has(X86_FEATURE_IBPB) &&
+ boot_cpu_has(X86_FEATURE_IBRS))
+ entry->edx |= F(SPEC_CTRL);
+ if (boot_cpu_has(X86_FEATURE_STIBP))
+ entry->edx |= F(INTEL_STIBP);
+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ entry->edx |= F(SPEC_CTRL_SSBD);
+ /*
+ * We emulate ARCH_CAPABILITIES in software even
+ * if the host doesn't support it.
+ */
+ entry->edx |= F(ARCH_CAPABILITIES);
+ } else {
+ entry->ebx = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ }
+ entry->eax = 0;
+ break;
+ }
+ case 9:
+ break;
+ case 0xa: { /* Architectural Performance Monitoring */
+ struct x86_pmu_capability cap;
+ union cpuid10_eax eax;
+ union cpuid10_edx edx;
+
+ if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ perf_get_x86_pmu_capability(&cap);
+
+ /*
+ * Only support guest architectural pmu on a host
+ * with architectural pmu.
+ */
+ if (!cap.version)
+ memset(&cap, 0, sizeof(cap));
+
+ eax.split.version_id = min(cap.version, 2);
+ eax.split.num_counters = cap.num_counters_gp;
+ eax.split.bit_width = cap.bit_width_gp;
+ eax.split.mask_length = cap.events_mask_len;
+
+ edx.split.num_counters_fixed = cap.num_counters_fixed;
+ edx.split.bit_width_fixed = cap.bit_width_fixed;
+ edx.split.reserved = 0;
+
+ entry->eax = eax.full;
+ entry->ebx = cap.events_mask;
+ entry->ecx = 0;
+ entry->edx = edx.full;
+ break;
+ }
+ /* function 0xb has additional index. */
+ case 0xb: {
+ int i, level_type;
+
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ /* read more entries until level_type is zero */
+ for (i = 1; ; ++i) {
+ if (*nent >= maxnent)
+ goto out;
+
+ level_type = entry[i - 1].ecx & 0xff00;
+ if (!level_type)
+ break;
+ do_cpuid_1_ent(&entry[i], function, i);
+ entry[i].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ ++*nent;
+ }
+ break;
+ }
+ case 0xd: {
+ int idx, i;
+ u64 supported = kvm_supported_xcr0();
+
+ entry->eax &= supported;
+ entry->ebx = xstate_required_size(supported, false);
+ entry->ecx = entry->ebx;
+ entry->edx &= supported >> 32;
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ if (!supported)
+ break;
+
+ for (idx = 1, i = 1; idx < 64; ++idx) {
+ u64 mask = ((u64)1 << idx);
+ if (*nent >= maxnent)
+ goto out;
+
+ do_cpuid_1_ent(&entry[i], function, idx);
+ if (idx == 1) {
+ entry[i].eax &= kvm_cpuid_D_1_eax_x86_features;
+ cpuid_mask(&entry[i].eax, CPUID_D_1_EAX);
+ entry[i].ebx = 0;
+ if (entry[i].eax & (F(XSAVES)|F(XSAVEC)))
+ entry[i].ebx =
+ xstate_required_size(supported,
+ true);
+ } else {
+ if (entry[i].eax == 0 || !(supported & mask))
+ continue;
+ if (WARN_ON_ONCE(entry[i].ecx & 1))
+ continue;
+ }
+ entry[i].ecx = 0;
+ entry[i].edx = 0;
+ entry[i].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ ++*nent;
+ ++i;
+ }
+ break;
+ }
+ case KVM_CPUID_SIGNATURE: {
+ static const char signature[12] = "KVMKVMKVM\0\0";
+ const u32 *sigptr = (const u32 *)signature;
+ entry->eax = KVM_CPUID_FEATURES;
+ entry->ebx = sigptr[0];
+ entry->ecx = sigptr[1];
+ entry->edx = sigptr[2];
+ break;
+ }
+ case KVM_CPUID_FEATURES:
+ entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
+ (1 << KVM_FEATURE_NOP_IO_DELAY) |
+ (1 << KVM_FEATURE_CLOCKSOURCE2) |
+ (1 << KVM_FEATURE_ASYNC_PF) |
+ (1 << KVM_FEATURE_PV_EOI) |
+ (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
+ (1 << KVM_FEATURE_PV_UNHALT) |
+ (1 << KVM_FEATURE_PV_TLB_FLUSH) |
+ (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) |
+ (1 << KVM_FEATURE_PV_SEND_IPI);
+
+ if (sched_info_on())
+ entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
+
+ entry->ebx = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
+ case 0x80000000:
+ entry->eax = min(entry->eax, 0x8000001f);
+ break;
+ case 0x80000001:
+ entry->edx &= kvm_cpuid_8000_0001_edx_x86_features;
+ cpuid_mask(&entry->edx, CPUID_8000_0001_EDX);
+ entry->ecx &= kvm_cpuid_8000_0001_ecx_x86_features;
+ cpuid_mask(&entry->ecx, CPUID_8000_0001_ECX);
+ break;
+ case 0x80000007: /* Advanced power management */
+ /* invariant TSC is CPUID.80000007H:EDX[8] */
+ entry->edx &= (1 << 8);
+ /* mask against host */
+ entry->edx &= boot_cpu_data.x86_power;
+ entry->eax = entry->ebx = entry->ecx = 0;
+ break;
+ case 0x80000008: {
+ unsigned g_phys_as = (entry->eax >> 16) & 0xff;
+ unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
+ unsigned phys_as = entry->eax & 0xff;
+
+ /*
+ * Use bare metal's MAXPHADDR if the CPU doesn't report guest
+ * MAXPHYADDR separately, or if TDP (NPT) is disabled, as the
+ * guest version "applies only to guests using nested paging".
+ */
+ if (!g_phys_as || !tdp_enabled)
+ g_phys_as = phys_as;
+
+ entry->eax = g_phys_as | (virt_as << 8);
+ entry->edx = 0;
+ /*
+ * IBRS, IBPB and VIRT_SSBD aren't necessarily present in
+ * hardware cpuid
+ */
+ if (boot_cpu_has(X86_FEATURE_AMD_IBPB))
+ entry->ebx |= F(AMD_IBPB);
+ if (boot_cpu_has(X86_FEATURE_AMD_IBRS))
+ entry->ebx |= F(AMD_IBRS);
+ if (boot_cpu_has(X86_FEATURE_VIRT_SSBD))
+ entry->ebx |= F(VIRT_SSBD);
+ entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
+ cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
+ /*
+ * The preference is to use SPEC CTRL MSR instead of the
+ * VIRT_SPEC MSR.
+ */
+ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ entry->ebx |= F(VIRT_SSBD);
+ break;
+ }
+ case 0x80000019:
+ entry->ecx = entry->edx = 0;
+ break;
+ case 0x8000001a:
+ break;
+ case 0x8000001d:
+ break;
+ /*Add support for Centaur's CPUID instruction*/
+ case 0xC0000000:
+ /*Just support up to 0xC0000004 now*/
+ entry->eax = min(entry->eax, 0xC0000004);
+ break;
+ case 0xC0000001:
+ entry->edx &= kvm_cpuid_C000_0001_edx_x86_features;
+ cpuid_mask(&entry->edx, CPUID_C000_0001_EDX);
+ break;
+ case 3: /* Processor serial number */
+ case 5: /* MONITOR/MWAIT */
+ case 0xC0000002:
+ case 0xC0000003:
+ case 0xC0000004:
+ default:
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ kvm_x86_ops->set_supported_cpuid(function, entry);
+
+ r = 0;
+
+out:
+ put_cpu();
+
+ return r;
+}
+
+static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func,
+ u32 idx, int *nent, int maxnent, unsigned int type)
+{
+ if (*nent >= maxnent)
+ return -E2BIG;
+
+ if (type == KVM_GET_EMULATED_CPUID)
+ return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent);
+
+ return __do_cpuid_ent(entry, func, idx, nent, maxnent);
+}
+
+#undef F
+
+struct kvm_cpuid_param {
+ u32 func;
+ u32 idx;
+ bool has_leaf_count;
+ bool (*qualifier)(const struct kvm_cpuid_param *param);
+};
+
+static bool is_centaur_cpu(const struct kvm_cpuid_param *param)
+{
+ return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR;
+}
+
+static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries,
+ __u32 num_entries, unsigned int ioctl_type)
+{
+ int i;
+ __u32 pad[3];
+
+ if (ioctl_type != KVM_GET_EMULATED_CPUID)
+ return false;
+
+ /*
+ * We want to make sure that ->padding is being passed clean from
+ * userspace in case we want to use it for something in the future.
+ *
+ * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we
+ * have to give ourselves satisfied only with the emulated side. /me
+ * sheds a tear.
+ */
+ for (i = 0; i < num_entries; i++) {
+ if (copy_from_user(pad, entries[i].padding, sizeof(pad)))
+ return true;
+
+ if (pad[0] || pad[1] || pad[2])
+ return true;
+ }
+ return false;
+}
+
+int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries,
+ unsigned int type)
+{
+ struct kvm_cpuid_entry2 *cpuid_entries;
+ int limit, nent = 0, r = -E2BIG, i;
+ u32 func;
+ static const struct kvm_cpuid_param param[] = {
+ { .func = 0, .has_leaf_count = true },
+ { .func = 0x80000000, .has_leaf_count = true },
+ { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true },
+ { .func = KVM_CPUID_SIGNATURE },
+ { .func = KVM_CPUID_FEATURES },
+ };
+
+ if (cpuid->nent < 1)
+ goto out;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ cpuid->nent = KVM_MAX_CPUID_ENTRIES;
+
+ if (sanity_check_entries(entries, cpuid->nent, type))
+ return -EINVAL;
+
+ r = -ENOMEM;
+ cpuid_entries = vzalloc(array_size(sizeof(struct kvm_cpuid_entry2),
+ cpuid->nent));
+ if (!cpuid_entries)
+ goto out;
+
+ r = 0;
+ for (i = 0; i < ARRAY_SIZE(param); i++) {
+ const struct kvm_cpuid_param *ent = &param[i];
+
+ if (ent->qualifier && !ent->qualifier(ent))
+ continue;
+
+ r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx,
+ &nent, cpuid->nent, type);
+
+ if (r)
+ goto out_free;
+
+ if (!ent->has_leaf_count)
+ continue;
+
+ limit = cpuid_entries[nent - 1].eax;
+ for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func)
+ r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx,
+ &nent, cpuid->nent, type);
+
+ if (r)
+ goto out_free;
+ }
+
+ r = -EFAULT;
+ if (copy_to_user(entries, cpuid_entries,
+ nent * sizeof(struct kvm_cpuid_entry2)))
+ goto out_free;
+ cpuid->nent = nent;
+ r = 0;
+
+out_free:
+ vfree(cpuid_entries);
+out:
+ return r;
+}
+
+static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
+{
+ struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
+ struct kvm_cpuid_entry2 *ej;
+ int j = i;
+ int nent = vcpu->arch.cpuid_nent;
+
+ e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
+ /* when no next entry is found, the current entry[i] is reselected */
+ do {
+ j = (j + 1) % nent;
+ ej = &vcpu->arch.cpuid_entries[j];
+ } while (ej->function != e->function);
+
+ ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+
+ return j;
+}
+
+/* find an entry with matching function, matching index (if needed), and that
+ * should be read next (if it's stateful) */
+static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
+ u32 function, u32 index)
+{
+ if (e->function != function)
+ return 0;
+ if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
+ return 0;
+ if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
+ !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
+ return 0;
+ return 1;
+}
+
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+ u32 function, u32 index)
+{
+ int i;
+ struct kvm_cpuid_entry2 *best = NULL;
+
+ for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+ struct kvm_cpuid_entry2 *e;
+
+ e = &vcpu->arch.cpuid_entries[i];
+ if (is_matching_cpuid_entry(e, function, index)) {
+ if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
+ move_to_next_stateful_cpuid_entry(vcpu, i);
+ best = e;
+ break;
+ }
+ }
+ return best;
+}
+EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
+
+/*
+ * If no match is found, check whether we exceed the vCPU's limit
+ * and return the content of the highest valid _standard_ leaf instead.
+ * This is to satisfy the CPUID specification.
+ */
+static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
+ u32 function, u32 index)
+{
+ struct kvm_cpuid_entry2 *maxlevel;
+
+ maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
+ if (!maxlevel || maxlevel->eax >= function)
+ return NULL;
+ if (function & 0x80000000) {
+ maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
+ if (!maxlevel)
+ return NULL;
+ }
+ return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
+}
+
+bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, bool check_limit)
+{
+ u32 function = *eax, index = *ecx;
+ struct kvm_cpuid_entry2 *best;
+ bool entry_found = true;
+
+ best = kvm_find_cpuid_entry(vcpu, function, index);
+
+ if (!best) {
+ entry_found = false;
+ if (!check_limit)
+ goto out;
+
+ best = check_cpuid_limit(vcpu, function, index);
+ }
+
+out:
+ if (best) {
+ *eax = best->eax;
+ *ebx = best->ebx;
+ *ecx = best->ecx;
+ *edx = best->edx;
+ } else
+ *eax = *ebx = *ecx = *edx = 0;
+ trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found);
+ return entry_found;
+}
+EXPORT_SYMBOL_GPL(kvm_cpuid);
+
+int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+ u32 eax, ebx, ecx, edx;
+
+ if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0))
+ return 1;
+
+ eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, true);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, eax);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, ebx);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, edx);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
new file mode 100644
index 000000000..7dec43b2c
--- /dev/null
+++ b/arch/x86/kvm/cpuid.h
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_KVM_CPUID_H
+#define ARCH_X86_KVM_CPUID_H
+
+#include "x86.h"
+#include <asm/cpu.h>
+#include <asm/processor.h>
+
+int kvm_update_cpuid(struct kvm_vcpu *vcpu);
+bool kvm_mpx_supported(void);
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+ u32 function, u32 index);
+int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries,
+ unsigned int type);
+int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid *cpuid,
+ struct kvm_cpuid_entry __user *entries);
+int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries);
+int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries);
+bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
+ u32 *ecx, u32 *edx, bool check_limit);
+
+int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
+
+static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.maxphyaddr;
+}
+
+struct cpuid_reg {
+ u32 function;
+ u32 index;
+ int reg;
+};
+
+static const struct cpuid_reg reverse_cpuid[] = {
+ [CPUID_1_EDX] = { 1, 0, CPUID_EDX},
+ [CPUID_8000_0001_EDX] = {0x80000001, 0, CPUID_EDX},
+ [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX},
+ [CPUID_1_ECX] = { 1, 0, CPUID_ECX},
+ [CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX},
+ [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX},
+ [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX},
+ [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX},
+ [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX},
+ [CPUID_6_EAX] = { 6, 0, CPUID_EAX},
+ [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
+ [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
+ [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
+ [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
+};
+
+static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
+{
+ unsigned x86_leaf = x86_feature / 32;
+
+ BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
+ BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
+
+ return reverse_cpuid[x86_leaf];
+}
+
+static __always_inline int *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsigned x86_feature)
+{
+ struct kvm_cpuid_entry2 *entry;
+ const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature);
+
+ entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index);
+ if (!entry)
+ return NULL;
+
+ switch (cpuid.reg) {
+ case CPUID_EAX:
+ return &entry->eax;
+ case CPUID_EBX:
+ return &entry->ebx;
+ case CPUID_ECX:
+ return &entry->ecx;
+ case CPUID_EDX:
+ return &entry->edx;
+ default:
+ BUILD_BUG();
+ return NULL;
+ }
+}
+
+static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_feature)
+{
+ int *reg;
+
+ if (x86_feature == X86_FEATURE_XSAVE &&
+ !static_cpu_has(X86_FEATURE_XSAVE))
+ return false;
+
+ reg = guest_cpuid_get_register(vcpu, x86_feature);
+ if (!reg)
+ return false;
+
+ return *reg & bit(x86_feature);
+}
+
+static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x86_feature)
+{
+ int *reg;
+
+ reg = guest_cpuid_get_register(vcpu, x86_feature);
+ if (reg)
+ *reg &= ~bit(x86_feature);
+}
+
+static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0, 0);
+ return best && best->ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx;
+}
+
+static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ if (!best)
+ return -1;
+
+ return x86_family(best->eax);
+}
+
+static inline int guest_cpuid_model(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ if (!best)
+ return -1;
+
+ return x86_model(best->eax);
+}
+
+static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ if (!best)
+ return -1;
+
+ return x86_stepping(best->eax);
+}
+
+static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu)
+{
+ return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD));
+}
+
+static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu)
+{
+ return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB));
+}
+
+static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT;
+}
+
+static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.msr_misc_features_enables &
+ MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
+}
+
+#endif
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
new file mode 100644
index 000000000..c19c7ede9
--- /dev/null
+++ b/arch/x86/kvm/debugfs.c
@@ -0,0 +1,69 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * Copyright 2016 Red Hat, Inc. and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+#include <linux/kvm_host.h>
+#include <linux/debugfs.h>
+
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+ return true;
+}
+
+static int vcpu_get_tsc_offset(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->arch.tsc_offset;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_offset_fops, vcpu_get_tsc_offset, NULL, "%lld\n");
+
+static int vcpu_get_tsc_scaling_ratio(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->arch.tsc_scaling_ratio;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, "%llu\n");
+
+static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val)
+{
+ *val = kvm_tsc_scaling_ratio_frac_bits;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n");
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+ struct dentry *ret;
+
+ ret = debugfs_create_file("tsc-offset", 0444,
+ vcpu->debugfs_dentry,
+ vcpu, &vcpu_tsc_offset_fops);
+ if (!ret)
+ return -ENOMEM;
+
+ if (kvm_has_tsc_control) {
+ ret = debugfs_create_file("tsc-scaling-ratio", 0444,
+ vcpu->debugfs_dentry,
+ vcpu, &vcpu_tsc_scaling_fops);
+ if (!ret)
+ return -ENOMEM;
+ ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444,
+ vcpu->debugfs_dentry,
+ vcpu, &vcpu_tsc_scaling_frac_fops);
+ if (!ret)
+ return -ENOMEM;
+
+ }
+
+ return 0;
+}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
new file mode 100644
index 000000000..63754d248
--- /dev/null
+++ b/arch/x86/kvm/emulate.c
@@ -0,0 +1,5849 @@
+/******************************************************************************
+ * emulate.c
+ *
+ * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
+ *
+ * Copyright (c) 2005 Keir Fraser
+ *
+ * Linux coding style, mod r/m decoder, segment base fixes, real-mode
+ * privileged instructions:
+ *
+ * Copyright (C) 2006 Qumranet
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
+ */
+
+#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
+#include <asm/kvm_emulate.h>
+#include <linux/stringify.h>
+#include <asm/debugreg.h>
+#include <asm/nospec-branch.h>
+
+#include "x86.h"
+#include "tss.h"
+#include "mmu.h"
+#include "pmu.h"
+
+/*
+ * Operand types
+ */
+#define OpNone 0ull
+#define OpImplicit 1ull /* No generic decode */
+#define OpReg 2ull /* Register */
+#define OpMem 3ull /* Memory */
+#define OpAcc 4ull /* Accumulator: AL/AX/EAX/RAX */
+#define OpDI 5ull /* ES:DI/EDI/RDI */
+#define OpMem64 6ull /* Memory, 64-bit */
+#define OpImmUByte 7ull /* Zero-extended 8-bit immediate */
+#define OpDX 8ull /* DX register */
+#define OpCL 9ull /* CL register (for shifts) */
+#define OpImmByte 10ull /* 8-bit sign extended immediate */
+#define OpOne 11ull /* Implied 1 */
+#define OpImm 12ull /* Sign extended up to 32-bit immediate */
+#define OpMem16 13ull /* Memory operand (16-bit). */
+#define OpMem32 14ull /* Memory operand (32-bit). */
+#define OpImmU 15ull /* Immediate operand, zero extended */
+#define OpSI 16ull /* SI/ESI/RSI */
+#define OpImmFAddr 17ull /* Immediate far address */
+#define OpMemFAddr 18ull /* Far address in memory */
+#define OpImmU16 19ull /* Immediate operand, 16 bits, zero extended */
+#define OpES 20ull /* ES */
+#define OpCS 21ull /* CS */
+#define OpSS 22ull /* SS */
+#define OpDS 23ull /* DS */
+#define OpFS 24ull /* FS */
+#define OpGS 25ull /* GS */
+#define OpMem8 26ull /* 8-bit zero extended memory operand */
+#define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */
+#define OpXLat 28ull /* memory at BX/EBX/RBX + zero-extended AL */
+#define OpAccLo 29ull /* Low part of extended acc (AX/AX/EAX/RAX) */
+#define OpAccHi 30ull /* High part of extended acc (-/DX/EDX/RDX) */
+
+#define OpBits 5 /* Width of operand field */
+#define OpMask ((1ull << OpBits) - 1)
+
+/*
+ * Opcode effective-address decode tables.
+ * Note that we only emulate instructions that have at least one memory
+ * operand (excluding implicit stack references). We assume that stack
+ * references and instruction fetches will never occur in special memory
+ * areas that require emulation. So, for example, 'mov <imm>,<reg>' need
+ * not be handled.
+ */
+
+/* Operand sizes: 8-bit operands or specified/overridden size. */
+#define ByteOp (1<<0) /* 8-bit operands. */
+/* Destination operand type. */
+#define DstShift 1
+#define ImplicitOps (OpImplicit << DstShift)
+#define DstReg (OpReg << DstShift)
+#define DstMem (OpMem << DstShift)
+#define DstAcc (OpAcc << DstShift)
+#define DstDI (OpDI << DstShift)
+#define DstMem64 (OpMem64 << DstShift)
+#define DstMem16 (OpMem16 << DstShift)
+#define DstImmUByte (OpImmUByte << DstShift)
+#define DstDX (OpDX << DstShift)
+#define DstAccLo (OpAccLo << DstShift)
+#define DstMask (OpMask << DstShift)
+/* Source operand type. */
+#define SrcShift 6
+#define SrcNone (OpNone << SrcShift)
+#define SrcReg (OpReg << SrcShift)
+#define SrcMem (OpMem << SrcShift)
+#define SrcMem16 (OpMem16 << SrcShift)
+#define SrcMem32 (OpMem32 << SrcShift)
+#define SrcImm (OpImm << SrcShift)
+#define SrcImmByte (OpImmByte << SrcShift)
+#define SrcOne (OpOne << SrcShift)
+#define SrcImmUByte (OpImmUByte << SrcShift)
+#define SrcImmU (OpImmU << SrcShift)
+#define SrcSI (OpSI << SrcShift)
+#define SrcXLat (OpXLat << SrcShift)
+#define SrcImmFAddr (OpImmFAddr << SrcShift)
+#define SrcMemFAddr (OpMemFAddr << SrcShift)
+#define SrcAcc (OpAcc << SrcShift)
+#define SrcImmU16 (OpImmU16 << SrcShift)
+#define SrcImm64 (OpImm64 << SrcShift)
+#define SrcDX (OpDX << SrcShift)
+#define SrcMem8 (OpMem8 << SrcShift)
+#define SrcAccHi (OpAccHi << SrcShift)
+#define SrcMask (OpMask << SrcShift)
+#define BitOp (1<<11)
+#define MemAbs (1<<12) /* Memory operand is absolute displacement */
+#define String (1<<13) /* String instruction (rep capable) */
+#define Stack (1<<14) /* Stack instruction (push/pop) */
+#define GroupMask (7<<15) /* Opcode uses one of the group mechanisms */
+#define Group (1<<15) /* Bits 3:5 of modrm byte extend opcode */
+#define GroupDual (2<<15) /* Alternate decoding of mod == 3 */
+#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
+#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
+#define Escape (5<<15) /* Escape to coprocessor instruction */
+#define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */
+#define ModeDual (7<<15) /* Different instruction for 32/64 bit */
+#define Sse (1<<18) /* SSE Vector instruction */
+/* Generic ModRM decode. */
+#define ModRM (1<<19)
+/* Destination is only written; never read. */
+#define Mov (1<<20)
+/* Misc flags */
+#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */
+#define EmulateOnUD (1<<22) /* Emulate if unsupported by the host */
+#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
+#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
+#define Undefined (1<<25) /* No Such Instruction */
+#define Lock (1<<26) /* lock prefix is allowed for the instruction */
+#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
+#define No64 (1<<28)
+#define PageTable (1 << 29) /* instruction used to write page table */
+#define NotImpl (1 << 30) /* instruction is not implemented */
+/* Source 2 operand type */
+#define Src2Shift (31)
+#define Src2None (OpNone << Src2Shift)
+#define Src2Mem (OpMem << Src2Shift)
+#define Src2CL (OpCL << Src2Shift)
+#define Src2ImmByte (OpImmByte << Src2Shift)
+#define Src2One (OpOne << Src2Shift)
+#define Src2Imm (OpImm << Src2Shift)
+#define Src2ES (OpES << Src2Shift)
+#define Src2CS (OpCS << Src2Shift)
+#define Src2SS (OpSS << Src2Shift)
+#define Src2DS (OpDS << Src2Shift)
+#define Src2FS (OpFS << Src2Shift)
+#define Src2GS (OpGS << Src2Shift)
+#define Src2Mask (OpMask << Src2Shift)
+#define Mmx ((u64)1 << 40) /* MMX Vector instruction */
+#define AlignMask ((u64)7 << 41)
+#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */
+#define Unaligned ((u64)2 << 41) /* Explicitly unaligned (e.g. MOVDQU) */
+#define Avx ((u64)3 << 41) /* Advanced Vector Extensions */
+#define Aligned16 ((u64)4 << 41) /* Aligned to 16 byte boundary (e.g. FXSAVE) */
+#define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */
+#define NoWrite ((u64)1 << 45) /* No writeback */
+#define SrcWrite ((u64)1 << 46) /* Write back src operand */
+#define NoMod ((u64)1 << 47) /* Mod field is ignored */
+#define Intercept ((u64)1 << 48) /* Has valid intercept field */
+#define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */
+#define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */
+#define NearBranch ((u64)1 << 52) /* Near branches */
+#define No16 ((u64)1 << 53) /* No 16 bit operand */
+#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */
+#define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */
+
+#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
+
+#define X2(x...) x, x
+#define X3(x...) X2(x), x
+#define X4(x...) X2(x), X2(x)
+#define X5(x...) X4(x), x
+#define X6(x...) X4(x), X2(x)
+#define X7(x...) X4(x), X3(x)
+#define X8(x...) X4(x), X4(x)
+#define X16(x...) X8(x), X8(x)
+
+#define NR_FASTOP (ilog2(sizeof(ulong)) + 1)
+#define FASTOP_SIZE 8
+
+/*
+ * fastop functions have a special calling convention:
+ *
+ * dst: rax (in/out)
+ * src: rdx (in/out)
+ * src2: rcx (in)
+ * flags: rflags (in/out)
+ * ex: rsi (in:fastop pointer, out:zero if exception)
+ *
+ * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
+ * different operand sizes can be reached by calculation, rather than a jump
+ * table (which would be bigger than the code).
+ *
+ * fastop functions are declared as taking a never-defined fastop parameter,
+ * so they can't be called from C directly.
+ */
+
+struct fastop;
+
+struct opcode {
+ u64 flags : 56;
+ u64 intercept : 8;
+ union {
+ int (*execute)(struct x86_emulate_ctxt *ctxt);
+ const struct opcode *group;
+ const struct group_dual *gdual;
+ const struct gprefix *gprefix;
+ const struct escape *esc;
+ const struct instr_dual *idual;
+ const struct mode_dual *mdual;
+ void (*fastop)(struct fastop *fake);
+ } u;
+ int (*check_perm)(struct x86_emulate_ctxt *ctxt);
+};
+
+struct group_dual {
+ struct opcode mod012[8];
+ struct opcode mod3[8];
+};
+
+struct gprefix {
+ struct opcode pfx_no;
+ struct opcode pfx_66;
+ struct opcode pfx_f2;
+ struct opcode pfx_f3;
+};
+
+struct escape {
+ struct opcode op[8];
+ struct opcode high[64];
+};
+
+struct instr_dual {
+ struct opcode mod012;
+ struct opcode mod3;
+};
+
+struct mode_dual {
+ struct opcode mode32;
+ struct opcode mode64;
+};
+
+#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
+
+enum x86_transfer_type {
+ X86_TRANSFER_NONE,
+ X86_TRANSFER_CALL_JMP,
+ X86_TRANSFER_RET,
+ X86_TRANSFER_TASK_SWITCH,
+};
+
+static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ if (!(ctxt->regs_valid & (1 << nr))) {
+ ctxt->regs_valid |= 1 << nr;
+ ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr);
+ }
+ return ctxt->_regs[nr];
+}
+
+static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ ctxt->regs_valid |= 1 << nr;
+ ctxt->regs_dirty |= 1 << nr;
+ return &ctxt->_regs[nr];
+}
+
+static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ reg_read(ctxt, nr);
+ return reg_write(ctxt, nr);
+}
+
+static void writeback_registers(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned reg;
+
+ for_each_set_bit(reg, (ulong *)&ctxt->regs_dirty, 16)
+ ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]);
+}
+
+static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->regs_dirty = 0;
+ ctxt->regs_valid = 0;
+}
+
+/*
+ * These EFLAGS bits are restored from saved value during emulation, and
+ * any changes are written back to the saved value after emulation.
+ */
+#define EFLAGS_MASK (X86_EFLAGS_OF|X86_EFLAGS_SF|X86_EFLAGS_ZF|X86_EFLAGS_AF|\
+ X86_EFLAGS_PF|X86_EFLAGS_CF)
+
+#ifdef CONFIG_X86_64
+#define ON64(x) x
+#else
+#define ON64(x)
+#endif
+
+static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
+
+#define FOP_FUNC(name) \
+ ".align " __stringify(FASTOP_SIZE) " \n\t" \
+ ".type " name ", @function \n\t" \
+ name ":\n\t"
+
+#define FOP_RET "ret \n\t"
+
+#define FOP_START(op) \
+ extern void em_##op(struct fastop *fake); \
+ asm(".pushsection .text, \"ax\" \n\t" \
+ ".global em_" #op " \n\t" \
+ FOP_FUNC("em_" #op)
+
+#define FOP_END \
+ ".popsection")
+
+#define FOPNOP() \
+ FOP_FUNC(__stringify(__UNIQUE_ID(nop))) \
+ FOP_RET
+
+#define FOP1E(op, dst) \
+ FOP_FUNC(#op "_" #dst) \
+ "10: " #op " %" #dst " \n\t" FOP_RET
+
+#define FOP1EEX(op, dst) \
+ FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception)
+
+#define FASTOP1(op) \
+ FOP_START(op) \
+ FOP1E(op##b, al) \
+ FOP1E(op##w, ax) \
+ FOP1E(op##l, eax) \
+ ON64(FOP1E(op##q, rax)) \
+ FOP_END
+
+/* 1-operand, using src2 (for MUL/DIV r/m) */
+#define FASTOP1SRC2(op, name) \
+ FOP_START(name) \
+ FOP1E(op, cl) \
+ FOP1E(op, cx) \
+ FOP1E(op, ecx) \
+ ON64(FOP1E(op, rcx)) \
+ FOP_END
+
+/* 1-operand, using src2 (for MUL/DIV r/m), with exceptions */
+#define FASTOP1SRC2EX(op, name) \
+ FOP_START(name) \
+ FOP1EEX(op, cl) \
+ FOP1EEX(op, cx) \
+ FOP1EEX(op, ecx) \
+ ON64(FOP1EEX(op, rcx)) \
+ FOP_END
+
+#define FOP2E(op, dst, src) \
+ FOP_FUNC(#op "_" #dst "_" #src) \
+ #op " %" #src ", %" #dst " \n\t" FOP_RET
+
+#define FASTOP2(op) \
+ FOP_START(op) \
+ FOP2E(op##b, al, dl) \
+ FOP2E(op##w, ax, dx) \
+ FOP2E(op##l, eax, edx) \
+ ON64(FOP2E(op##q, rax, rdx)) \
+ FOP_END
+
+/* 2 operand, word only */
+#define FASTOP2W(op) \
+ FOP_START(op) \
+ FOPNOP() \
+ FOP2E(op##w, ax, dx) \
+ FOP2E(op##l, eax, edx) \
+ ON64(FOP2E(op##q, rax, rdx)) \
+ FOP_END
+
+/* 2 operand, src is CL */
+#define FASTOP2CL(op) \
+ FOP_START(op) \
+ FOP2E(op##b, al, cl) \
+ FOP2E(op##w, ax, cl) \
+ FOP2E(op##l, eax, cl) \
+ ON64(FOP2E(op##q, rax, cl)) \
+ FOP_END
+
+/* 2 operand, src and dest are reversed */
+#define FASTOP2R(op, name) \
+ FOP_START(name) \
+ FOP2E(op##b, dl, al) \
+ FOP2E(op##w, dx, ax) \
+ FOP2E(op##l, edx, eax) \
+ ON64(FOP2E(op##q, rdx, rax)) \
+ FOP_END
+
+#define FOP3E(op, dst, src, src2) \
+ FOP_FUNC(#op "_" #dst "_" #src "_" #src2) \
+ #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET
+
+/* 3-operand, word-only, src2=cl */
+#define FASTOP3WCL(op) \
+ FOP_START(op) \
+ FOPNOP() \
+ FOP3E(op##w, ax, dx, cl) \
+ FOP3E(op##l, eax, edx, cl) \
+ ON64(FOP3E(op##q, rax, rdx, cl)) \
+ FOP_END
+
+/* Special case for SETcc - 1 instruction per cc */
+#define FOP_SETCC(op) \
+ ".align 4 \n\t" \
+ ".type " #op ", @function \n\t" \
+ #op ": \n\t" \
+ #op " %al \n\t" \
+ FOP_RET
+
+asm(".pushsection .fixup, \"ax\"\n"
+ ".global kvm_fastop_exception \n"
+ "kvm_fastop_exception: xor %esi, %esi; ret\n"
+ ".popsection");
+
+FOP_START(setcc)
+FOP_SETCC(seto)
+FOP_SETCC(setno)
+FOP_SETCC(setc)
+FOP_SETCC(setnc)
+FOP_SETCC(setz)
+FOP_SETCC(setnz)
+FOP_SETCC(setbe)
+FOP_SETCC(setnbe)
+FOP_SETCC(sets)
+FOP_SETCC(setns)
+FOP_SETCC(setp)
+FOP_SETCC(setnp)
+FOP_SETCC(setl)
+FOP_SETCC(setnl)
+FOP_SETCC(setle)
+FOP_SETCC(setnle)
+FOP_END;
+
+FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET
+FOP_END;
+
+/*
+ * XXX: inoutclob user must know where the argument is being expanded.
+ * Relying on CONFIG_CC_HAS_ASM_GOTO would allow us to remove _fault.
+ */
+#define asm_safe(insn, inoutclob...) \
+({ \
+ int _fault = 0; \
+ \
+ asm volatile("1:" insn "\n" \
+ "2:\n" \
+ ".pushsection .fixup, \"ax\"\n" \
+ "3: movl $1, %[_fault]\n" \
+ " jmp 2b\n" \
+ ".popsection\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : [_fault] "+qm"(_fault) inoutclob ); \
+ \
+ _fault ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; \
+})
+
+static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
+ enum x86_intercept intercept,
+ enum x86_intercept_stage stage)
+{
+ struct x86_instruction_info info = {
+ .intercept = intercept,
+ .rep_prefix = ctxt->rep_prefix,
+ .modrm_mod = ctxt->modrm_mod,
+ .modrm_reg = ctxt->modrm_reg,
+ .modrm_rm = ctxt->modrm_rm,
+ .src_val = ctxt->src.val64,
+ .dst_val = ctxt->dst.val64,
+ .src_bytes = ctxt->src.bytes,
+ .dst_bytes = ctxt->dst.bytes,
+ .ad_bytes = ctxt->ad_bytes,
+ .next_rip = ctxt->eip,
+ };
+
+ return ctxt->ops->intercept(ctxt, &info, stage);
+}
+
+static void assign_masked(ulong *dest, ulong src, ulong mask)
+{
+ *dest = (*dest & ~mask) | (src & mask);
+}
+
+static void assign_register(unsigned long *reg, u64 val, int bytes)
+{
+ /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
+ switch (bytes) {
+ case 1:
+ *(u8 *)reg = (u8)val;
+ break;
+ case 2:
+ *(u16 *)reg = (u16)val;
+ break;
+ case 4:
+ *reg = (u32)val;
+ break; /* 64b: zero-extend */
+ case 8:
+ *reg = val;
+ break;
+ }
+}
+
+static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
+{
+ return (1UL << (ctxt->ad_bytes << 3)) - 1;
+}
+
+static ulong stack_mask(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel;
+ struct desc_struct ss;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return ~0UL;
+ ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS);
+ return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */
+}
+
+static int stack_size(struct x86_emulate_ctxt *ctxt)
+{
+ return (__fls(stack_mask(ctxt)) + 1) >> 3;
+}
+
+/* Access/update address held in a register, based on addressing mode. */
+static inline unsigned long
+address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
+{
+ if (ctxt->ad_bytes == sizeof(unsigned long))
+ return reg;
+ else
+ return reg & ad_mask(ctxt);
+}
+
+static inline unsigned long
+register_address(struct x86_emulate_ctxt *ctxt, int reg)
+{
+ return address_mask(ctxt, reg_read(ctxt, reg));
+}
+
+static void masked_increment(ulong *reg, ulong mask, int inc)
+{
+ assign_masked(reg, *reg + inc, mask);
+}
+
+static inline void
+register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc)
+{
+ ulong *preg = reg_rmw(ctxt, reg);
+
+ assign_register(preg, *preg + inc, ctxt->ad_bytes);
+}
+
+static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
+{
+ masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc);
+}
+
+static u32 desc_limit_scaled(struct desc_struct *desc)
+{
+ u32 limit = get_desc_limit(desc);
+
+ return desc->g ? (limit << 12) | 0xfff : limit;
+}
+
+static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
+{
+ if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
+ return 0;
+
+ return ctxt->ops->get_cached_segment_base(ctxt, seg);
+}
+
+static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
+ u32 error, bool valid)
+{
+ WARN_ON(vec > 0x1f);
+ ctxt->exception.vector = vec;
+ ctxt->exception.error_code = error;
+ ctxt->exception.error_code_valid = valid;
+ return X86EMUL_PROPAGATE_FAULT;
+}
+
+static int emulate_db(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, DB_VECTOR, 0, false);
+}
+
+static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
+{
+ return emulate_exception(ctxt, GP_VECTOR, err, true);
+}
+
+static int emulate_ss(struct x86_emulate_ctxt *ctxt, int err)
+{
+ return emulate_exception(ctxt, SS_VECTOR, err, true);
+}
+
+static int emulate_ud(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, UD_VECTOR, 0, false);
+}
+
+static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
+{
+ return emulate_exception(ctxt, TS_VECTOR, err, true);
+}
+
+static int emulate_de(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, DE_VECTOR, 0, false);
+}
+
+static int emulate_nm(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_exception(ctxt, NM_VECTOR, 0, false);
+}
+
+static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg)
+{
+ u16 selector;
+ struct desc_struct desc;
+
+ ctxt->ops->get_segment(ctxt, &selector, &desc, NULL, seg);
+ return selector;
+}
+
+static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
+ unsigned seg)
+{
+ u16 dummy;
+ u32 base3;
+ struct desc_struct desc;
+
+ ctxt->ops->get_segment(ctxt, &dummy, &desc, &base3, seg);
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg);
+}
+
+/*
+ * x86 defines three classes of vector instructions: explicitly
+ * aligned, explicitly unaligned, and the rest, which change behaviour
+ * depending on whether they're AVX encoded or not.
+ *
+ * Also included is CMPXCHG16B which is not a vector instruction, yet it is
+ * subject to the same check. FXSAVE and FXRSTOR are checked here too as their
+ * 512 bytes of data must be aligned to a 16 byte boundary.
+ */
+static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size)
+{
+ u64 alignment = ctxt->d & AlignMask;
+
+ if (likely(size < 16))
+ return 1;
+
+ switch (alignment) {
+ case Unaligned:
+ case Avx:
+ return 1;
+ case Aligned16:
+ return 16;
+ case Aligned:
+ default:
+ return size;
+ }
+}
+
+static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ unsigned *max_size, unsigned size,
+ bool write, bool fetch,
+ enum x86emul_mode mode, ulong *linear)
+{
+ struct desc_struct desc;
+ bool usable;
+ ulong la;
+ u32 lim;
+ u16 sel;
+ u8 va_bits;
+
+ la = seg_base(ctxt, addr.seg) + addr.ea;
+ *max_size = 0;
+ switch (mode) {
+ case X86EMUL_MODE_PROT64:
+ *linear = la;
+ va_bits = ctxt_virt_addr_bits(ctxt);
+ if (get_canonical(la, va_bits) != la)
+ goto bad;
+
+ *max_size = min_t(u64, ~0u, (1ull << va_bits) - la);
+ if (size > *max_size)
+ goto bad;
+ break;
+ default:
+ *linear = la = (u32)la;
+ usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL,
+ addr.seg);
+ if (!usable)
+ goto bad;
+ /* code segment in protected mode or read-only data segment */
+ if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8))
+ || !(desc.type & 2)) && write)
+ goto bad;
+ /* unreadable code segment */
+ if (!fetch && (desc.type & 8) && !(desc.type & 2))
+ goto bad;
+ lim = desc_limit_scaled(&desc);
+ if (!(desc.type & 8) && (desc.type & 4)) {
+ /* expand-down segment */
+ if (addr.ea <= lim)
+ goto bad;
+ lim = desc.d ? 0xffffffff : 0xffff;
+ }
+ if (addr.ea > lim)
+ goto bad;
+ if (lim == 0xffffffff)
+ *max_size = ~0u;
+ else {
+ *max_size = (u64)lim + 1 - addr.ea;
+ if (size > *max_size)
+ goto bad;
+ }
+ break;
+ }
+ if (la & (insn_alignment(ctxt, size) - 1))
+ return emulate_gp(ctxt, 0);
+ return X86EMUL_CONTINUE;
+bad:
+ if (addr.seg == VCPU_SREG_SS)
+ return emulate_ss(ctxt, 0);
+ else
+ return emulate_gp(ctxt, 0);
+}
+
+static int linearize(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ unsigned size, bool write,
+ ulong *linear)
+{
+ unsigned max_size;
+ return __linearize(ctxt, addr, &max_size, size, write, false,
+ ctxt->mode, linear);
+}
+
+static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst,
+ enum x86emul_mode mode)
+{
+ ulong linear;
+ int rc;
+ unsigned max_size;
+ struct segmented_address addr = { .seg = VCPU_SREG_CS,
+ .ea = dst };
+
+ if (ctxt->op_bytes != sizeof(unsigned long))
+ addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1);
+ rc = __linearize(ctxt, addr, &max_size, 1, false, true, mode, &linear);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->_eip = addr.ea;
+ return rc;
+}
+
+static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst)
+{
+ return assign_eip(ctxt, dst, ctxt->mode);
+}
+
+static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst,
+ const struct desc_struct *cs_desc)
+{
+ enum x86emul_mode mode = ctxt->mode;
+ int rc;
+
+#ifdef CONFIG_X86_64
+ if (ctxt->mode >= X86EMUL_MODE_PROT16) {
+ if (cs_desc->l) {
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (efer & EFER_LMA)
+ mode = X86EMUL_MODE_PROT64;
+ } else
+ mode = X86EMUL_MODE_PROT32; /* temporary value */
+ }
+#endif
+ if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32)
+ mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+ rc = assign_eip(ctxt, dst, mode);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->mode = mode;
+ return rc;
+}
+
+static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
+{
+ return assign_eip_near(ctxt, ctxt->_eip + rel);
+}
+
+static int linear_read_system(struct x86_emulate_ctxt *ctxt, ulong linear,
+ void *data, unsigned size)
+{
+ return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception, true);
+}
+
+static int linear_write_system(struct x86_emulate_ctxt *ctxt,
+ ulong linear, void *data,
+ unsigned int size)
+{
+ return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception, true);
+}
+
+static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, false, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception, false);
+}
+
+static int segmented_write_std(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ void *data,
+ unsigned int size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception, false);
+}
+
+/*
+ * Prefetch the remaining bytes of the instruction without crossing page
+ * boundary if they are not in fetch_cache yet.
+ */
+static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
+{
+ int rc;
+ unsigned size, max_size;
+ unsigned long linear;
+ int cur_size = ctxt->fetch.end - ctxt->fetch.data;
+ struct segmented_address addr = { .seg = VCPU_SREG_CS,
+ .ea = ctxt->eip + cur_size };
+
+ /*
+ * We do not know exactly how many bytes will be needed, and
+ * __linearize is expensive, so fetch as much as possible. We
+ * just have to avoid going beyond the 15 byte limit, the end
+ * of the segment, or the end of the page.
+ *
+ * __linearize is called with size 0 so that it does not do any
+ * boundary check itself. Instead, we use max_size to check
+ * against op_size.
+ */
+ rc = __linearize(ctxt, addr, &max_size, 0, false, true, ctxt->mode,
+ &linear);
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return rc;
+
+ size = min_t(unsigned, 15UL ^ cur_size, max_size);
+ size = min_t(unsigned, size, PAGE_SIZE - offset_in_page(linear));
+
+ /*
+ * One instruction can only straddle two pages,
+ * and one has been loaded at the beginning of
+ * x86_decode_insn. So, if not enough bytes
+ * still, we must have hit the 15-byte boundary.
+ */
+ if (unlikely(size < op_size))
+ return emulate_gp(ctxt, 0);
+
+ rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end,
+ size, &ctxt->exception);
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return rc;
+ ctxt->fetch.end += size;
+ return X86EMUL_CONTINUE;
+}
+
+static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt,
+ unsigned size)
+{
+ unsigned done_size = ctxt->fetch.end - ctxt->fetch.ptr;
+
+ if (unlikely(done_size < size))
+ return __do_insn_fetch_bytes(ctxt, size - done_size);
+ else
+ return X86EMUL_CONTINUE;
+}
+
+/* Fetch next part of the instruction being emulated. */
+#define insn_fetch(_type, _ctxt) \
+({ _type _x; \
+ \
+ rc = do_insn_fetch_bytes(_ctxt, sizeof(_type)); \
+ if (rc != X86EMUL_CONTINUE) \
+ goto done; \
+ ctxt->_eip += sizeof(_type); \
+ memcpy(&_x, ctxt->fetch.ptr, sizeof(_type)); \
+ ctxt->fetch.ptr += sizeof(_type); \
+ _x; \
+})
+
+#define insn_fetch_arr(_arr, _size, _ctxt) \
+({ \
+ rc = do_insn_fetch_bytes(_ctxt, _size); \
+ if (rc != X86EMUL_CONTINUE) \
+ goto done; \
+ ctxt->_eip += (_size); \
+ memcpy(_arr, ctxt->fetch.ptr, _size); \
+ ctxt->fetch.ptr += (_size); \
+})
+
+/*
+ * Given the 'reg' portion of a ModRM byte, and a register block, return a
+ * pointer into the block that addresses the relevant register.
+ * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
+ */
+static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg,
+ int byteop)
+{
+ void *p;
+ int highbyte_regs = (ctxt->rex_prefix == 0) && byteop;
+
+ if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
+ p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1;
+ else
+ p = reg_rmw(ctxt, modrm_reg);
+ return p;
+}
+
+static int read_descriptor(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ u16 *size, unsigned long *address, int op_bytes)
+{
+ int rc;
+
+ if (op_bytes == 2)
+ op_bytes = 3;
+ *address = 0;
+ rc = segmented_read_std(ctxt, addr, size, 2);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ addr.ea += 2;
+ rc = segmented_read_std(ctxt, addr, address, op_bytes);
+ return rc;
+}
+
+FASTOP2(add);
+FASTOP2(or);
+FASTOP2(adc);
+FASTOP2(sbb);
+FASTOP2(and);
+FASTOP2(sub);
+FASTOP2(xor);
+FASTOP2(cmp);
+FASTOP2(test);
+
+FASTOP1SRC2(mul, mul_ex);
+FASTOP1SRC2(imul, imul_ex);
+FASTOP1SRC2EX(div, div_ex);
+FASTOP1SRC2EX(idiv, idiv_ex);
+
+FASTOP3WCL(shld);
+FASTOP3WCL(shrd);
+
+FASTOP2W(imul);
+
+FASTOP1(not);
+FASTOP1(neg);
+FASTOP1(inc);
+FASTOP1(dec);
+
+FASTOP2CL(rol);
+FASTOP2CL(ror);
+FASTOP2CL(rcl);
+FASTOP2CL(rcr);
+FASTOP2CL(shl);
+FASTOP2CL(shr);
+FASTOP2CL(sar);
+
+FASTOP2W(bsf);
+FASTOP2W(bsr);
+FASTOP2W(bt);
+FASTOP2W(bts);
+FASTOP2W(btr);
+FASTOP2W(btc);
+
+FASTOP2(xadd);
+
+FASTOP2R(cmp, cmp_r);
+
+static int em_bsf_c(struct x86_emulate_ctxt *ctxt)
+{
+ /* If src is zero, do not writeback, but update flags */
+ if (ctxt->src.val == 0)
+ ctxt->dst.type = OP_NONE;
+ return fastop(ctxt, em_bsf);
+}
+
+static int em_bsr_c(struct x86_emulate_ctxt *ctxt)
+{
+ /* If src is zero, do not writeback, but update flags */
+ if (ctxt->src.val == 0)
+ ctxt->dst.type = OP_NONE;
+ return fastop(ctxt, em_bsr);
+}
+
+static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)
+{
+ u8 rc;
+ void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);
+
+ flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
+ asm("push %[flags]; popf; " CALL_NOSPEC
+ : "=a"(rc) : [thunk_target]"r"(fop), [flags]"r"(flags));
+ return rc;
+}
+
+static void fetch_register_operand(struct operand *op)
+{
+ switch (op->bytes) {
+ case 1:
+ op->val = *(u8 *)op->addr.reg;
+ break;
+ case 2:
+ op->val = *(u16 *)op->addr.reg;
+ break;
+ case 4:
+ op->val = *(u32 *)op->addr.reg;
+ break;
+ case 8:
+ op->val = *(u64 *)op->addr.reg;
+ break;
+ }
+}
+
+static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
+{
+ switch (reg) {
+ case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;
+ case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break;
+ case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break;
+ case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break;
+ case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break;
+ case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break;
+ case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break;
+ case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break;
+ case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break;
+ case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break;
+ case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break;
+ case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break;
+ case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break;
+ case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break;
+ case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
+static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
+ int reg)
+{
+ switch (reg) {
+ case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;
+ case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break;
+ case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break;
+ case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break;
+ case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break;
+ case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break;
+ case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break;
+ case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break;
+ case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break;
+ case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break;
+ case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break;
+ case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break;
+ case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break;
+ case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break;
+ case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
+static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
+{
+ switch (reg) {
+ case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
+ case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
+ case 2: asm("movq %%mm2, %0" : "=m"(*data)); break;
+ case 3: asm("movq %%mm3, %0" : "=m"(*data)); break;
+ case 4: asm("movq %%mm4, %0" : "=m"(*data)); break;
+ case 5: asm("movq %%mm5, %0" : "=m"(*data)); break;
+ case 6: asm("movq %%mm6, %0" : "=m"(*data)); break;
+ case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
+ default: BUG();
+ }
+}
+
+static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
+{
+ switch (reg) {
+ case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
+ case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
+ case 2: asm("movq %0, %%mm2" : : "m"(*data)); break;
+ case 3: asm("movq %0, %%mm3" : : "m"(*data)); break;
+ case 4: asm("movq %0, %%mm4" : : "m"(*data)); break;
+ case 5: asm("movq %0, %%mm5" : : "m"(*data)); break;
+ case 6: asm("movq %0, %%mm6" : : "m"(*data)); break;
+ case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
+ default: BUG();
+ }
+}
+
+static int em_fninit(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ asm volatile("fninit");
+ return X86EMUL_CONTINUE;
+}
+
+static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
+{
+ u16 fcw;
+
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ asm volatile("fnstcw %0": "+m"(fcw));
+
+ ctxt->dst.val = fcw;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
+{
+ u16 fsw;
+
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ asm volatile("fnstsw %0": "+m"(fsw));
+
+ ctxt->dst.val = fsw;
+
+ return X86EMUL_CONTINUE;
+}
+
+static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
+ struct operand *op)
+{
+ unsigned reg = ctxt->modrm_reg;
+
+ if (!(ctxt->d & ModRM))
+ reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
+
+ if (ctxt->d & Sse) {
+ op->type = OP_XMM;
+ op->bytes = 16;
+ op->addr.xmm = reg;
+ read_sse_reg(ctxt, &op->vec_val, reg);
+ return;
+ }
+ if (ctxt->d & Mmx) {
+ reg &= 7;
+ op->type = OP_MM;
+ op->bytes = 8;
+ op->addr.mm = reg;
+ return;
+ }
+
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.reg = decode_register(ctxt, reg, ctxt->d & ByteOp);
+
+ fetch_register_operand(op);
+ op->orig_val = op->val;
+}
+
+static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg)
+{
+ if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP)
+ ctxt->modrm_seg = VCPU_SREG_SS;
+}
+
+static int decode_modrm(struct x86_emulate_ctxt *ctxt,
+ struct operand *op)
+{
+ u8 sib;
+ int index_reg, base_reg, scale;
+ int rc = X86EMUL_CONTINUE;
+ ulong modrm_ea = 0;
+
+ ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */
+ index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */
+ base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */
+
+ ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6;
+ ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
+ ctxt->modrm_rm = base_reg | (ctxt->modrm & 0x07);
+ ctxt->modrm_seg = VCPU_SREG_DS;
+
+ if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) {
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.reg = decode_register(ctxt, ctxt->modrm_rm,
+ ctxt->d & ByteOp);
+ if (ctxt->d & Sse) {
+ op->type = OP_XMM;
+ op->bytes = 16;
+ op->addr.xmm = ctxt->modrm_rm;
+ read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm);
+ return rc;
+ }
+ if (ctxt->d & Mmx) {
+ op->type = OP_MM;
+ op->bytes = 8;
+ op->addr.mm = ctxt->modrm_rm & 7;
+ return rc;
+ }
+ fetch_register_operand(op);
+ return rc;
+ }
+
+ op->type = OP_MEM;
+
+ if (ctxt->ad_bytes == 2) {
+ unsigned bx = reg_read(ctxt, VCPU_REGS_RBX);
+ unsigned bp = reg_read(ctxt, VCPU_REGS_RBP);
+ unsigned si = reg_read(ctxt, VCPU_REGS_RSI);
+ unsigned di = reg_read(ctxt, VCPU_REGS_RDI);
+
+ /* 16-bit ModR/M decode. */
+ switch (ctxt->modrm_mod) {
+ case 0:
+ if (ctxt->modrm_rm == 6)
+ modrm_ea += insn_fetch(u16, ctxt);
+ break;
+ case 1:
+ modrm_ea += insn_fetch(s8, ctxt);
+ break;
+ case 2:
+ modrm_ea += insn_fetch(u16, ctxt);
+ break;
+ }
+ switch (ctxt->modrm_rm) {
+ case 0:
+ modrm_ea += bx + si;
+ break;
+ case 1:
+ modrm_ea += bx + di;
+ break;
+ case 2:
+ modrm_ea += bp + si;
+ break;
+ case 3:
+ modrm_ea += bp + di;
+ break;
+ case 4:
+ modrm_ea += si;
+ break;
+ case 5:
+ modrm_ea += di;
+ break;
+ case 6:
+ if (ctxt->modrm_mod != 0)
+ modrm_ea += bp;
+ break;
+ case 7:
+ modrm_ea += bx;
+ break;
+ }
+ if (ctxt->modrm_rm == 2 || ctxt->modrm_rm == 3 ||
+ (ctxt->modrm_rm == 6 && ctxt->modrm_mod != 0))
+ ctxt->modrm_seg = VCPU_SREG_SS;
+ modrm_ea = (u16)modrm_ea;
+ } else {
+ /* 32/64-bit ModR/M decode. */
+ if ((ctxt->modrm_rm & 7) == 4) {
+ sib = insn_fetch(u8, ctxt);
+ index_reg |= (sib >> 3) & 7;
+ base_reg |= sib & 7;
+ scale = sib >> 6;
+
+ if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
+ modrm_ea += insn_fetch(s32, ctxt);
+ else {
+ modrm_ea += reg_read(ctxt, base_reg);
+ adjust_modrm_seg(ctxt, base_reg);
+ /* Increment ESP on POP [ESP] */
+ if ((ctxt->d & IncSP) &&
+ base_reg == VCPU_REGS_RSP)
+ modrm_ea += ctxt->op_bytes;
+ }
+ if (index_reg != 4)
+ modrm_ea += reg_read(ctxt, index_reg) << scale;
+ } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
+ modrm_ea += insn_fetch(s32, ctxt);
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ctxt->rip_relative = 1;
+ } else {
+ base_reg = ctxt->modrm_rm;
+ modrm_ea += reg_read(ctxt, base_reg);
+ adjust_modrm_seg(ctxt, base_reg);
+ }
+ switch (ctxt->modrm_mod) {
+ case 1:
+ modrm_ea += insn_fetch(s8, ctxt);
+ break;
+ case 2:
+ modrm_ea += insn_fetch(s32, ctxt);
+ break;
+ }
+ }
+ op->addr.mem.ea = modrm_ea;
+ if (ctxt->ad_bytes != 8)
+ ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea;
+
+done:
+ return rc;
+}
+
+static int decode_abs(struct x86_emulate_ctxt *ctxt,
+ struct operand *op)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ op->type = OP_MEM;
+ switch (ctxt->ad_bytes) {
+ case 2:
+ op->addr.mem.ea = insn_fetch(u16, ctxt);
+ break;
+ case 4:
+ op->addr.mem.ea = insn_fetch(u32, ctxt);
+ break;
+ case 8:
+ op->addr.mem.ea = insn_fetch(u64, ctxt);
+ break;
+ }
+done:
+ return rc;
+}
+
+static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt)
+{
+ long sv = 0, mask;
+
+ if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) {
+ mask = ~((long)ctxt->dst.bytes * 8 - 1);
+
+ if (ctxt->src.bytes == 2)
+ sv = (s16)ctxt->src.val & (s16)mask;
+ else if (ctxt->src.bytes == 4)
+ sv = (s32)ctxt->src.val & (s32)mask;
+ else
+ sv = (s64)ctxt->src.val & (s64)mask;
+
+ ctxt->dst.addr.mem.ea = address_mask(ctxt,
+ ctxt->dst.addr.mem.ea + (sv >> 3));
+ }
+
+ /* only subword offset */
+ ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
+}
+
+static int read_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *dest, unsigned size)
+{
+ int rc;
+ struct read_cache *mc = &ctxt->mem_read;
+
+ if (mc->pos < mc->end)
+ goto read_cached;
+
+ WARN_ON((mc->end + size) >= sizeof(mc->data));
+
+ rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size,
+ &ctxt->exception);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ mc->end += size;
+
+read_cached:
+ memcpy(dest, mc->data + mc->pos, size);
+ mc->pos += size;
+ return X86EMUL_CONTINUE;
+}
+
+static int segmented_read(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, false, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return read_emulated(ctxt, linear, data, size);
+}
+
+static int segmented_write(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ const void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->write_emulated(ctxt, linear, data, size,
+ &ctxt->exception);
+}
+
+static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ const void *orig_data, const void *data,
+ unsigned size)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, addr, size, true, &linear);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data,
+ size, &ctxt->exception);
+}
+
+static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned int size, unsigned short port,
+ void *dest)
+{
+ struct read_cache *rc = &ctxt->io_read;
+
+ if (rc->pos == rc->end) { /* refill pio read ahead */
+ unsigned int in_page, n;
+ unsigned int count = ctxt->rep_prefix ?
+ address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1;
+ in_page = (ctxt->eflags & X86_EFLAGS_DF) ?
+ offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
+ PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
+ n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count);
+ if (n == 0)
+ n = 1;
+ rc->pos = rc->end = 0;
+ if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n))
+ return 0;
+ rc->end = n * size;
+ }
+
+ if (ctxt->rep_prefix && (ctxt->d & String) &&
+ !(ctxt->eflags & X86_EFLAGS_DF)) {
+ ctxt->dst.data = rc->data + rc->pos;
+ ctxt->dst.type = OP_MEM_STR;
+ ctxt->dst.count = (rc->end - rc->pos) / size;
+ rc->pos = rc->end;
+ } else {
+ memcpy(dest, rc->data + rc->pos, size);
+ rc->pos += size;
+ }
+ return 1;
+}
+
+static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 index, struct desc_struct *desc)
+{
+ struct desc_ptr dt;
+ ulong addr;
+
+ ctxt->ops->get_idt(ctxt, &dt);
+
+ if (dt.size < index * 8 + 7)
+ return emulate_gp(ctxt, index << 3 | 0x2);
+
+ addr = dt.address + index * 8;
+ return linear_read_system(ctxt, addr, desc, sizeof *desc);
+}
+
+static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
+ u16 selector, struct desc_ptr *dt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ u32 base3 = 0;
+
+ if (selector & 1 << 2) {
+ struct desc_struct desc;
+ u16 sel;
+
+ memset (dt, 0, sizeof *dt);
+ if (!ops->get_segment(ctxt, &sel, &desc, &base3,
+ VCPU_SREG_LDTR))
+ return;
+
+ dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
+ dt->address = get_desc_base(&desc) | ((u64)base3 << 32);
+ } else
+ ops->get_gdt(ctxt, dt);
+}
+
+static int get_descriptor_ptr(struct x86_emulate_ctxt *ctxt,
+ u16 selector, ulong *desc_addr_p)
+{
+ struct desc_ptr dt;
+ u16 index = selector >> 3;
+ ulong addr;
+
+ get_descriptor_table_ptr(ctxt, selector, &dt);
+
+ if (dt.size < index * 8 + 7)
+ return emulate_gp(ctxt, selector & 0xfffc);
+
+ addr = dt.address + index * 8;
+
+#ifdef CONFIG_X86_64
+ if (addr >> 32 != 0) {
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (!(efer & EFER_LMA))
+ addr &= (u32)-1;
+ }
+#endif
+
+ *desc_addr_p = addr;
+ return X86EMUL_CONTINUE;
+}
+
+/* allowed just for 8 bytes segments */
+static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, struct desc_struct *desc,
+ ulong *desc_addr_p)
+{
+ int rc;
+
+ rc = get_descriptor_ptr(ctxt, selector, desc_addr_p);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return linear_read_system(ctxt, *desc_addr_p, desc, sizeof(*desc));
+}
+
+/* allowed just for 8 bytes segments */
+static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, struct desc_struct *desc)
+{
+ int rc;
+ ulong addr;
+
+ rc = get_descriptor_ptr(ctxt, selector, &addr);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return linear_write_system(ctxt, addr, desc, sizeof *desc);
+}
+
+static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, int seg, u8 cpl,
+ enum x86_transfer_type transfer,
+ struct desc_struct *desc)
+{
+ struct desc_struct seg_desc, old_desc;
+ u8 dpl, rpl;
+ unsigned err_vec = GP_VECTOR;
+ u32 err_code = 0;
+ bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+ ulong desc_addr;
+ int ret;
+ u16 dummy;
+ u32 base3 = 0;
+
+ memset(&seg_desc, 0, sizeof seg_desc);
+
+ if (ctxt->mode == X86EMUL_MODE_REAL) {
+ /* set real mode segment descriptor (keep limit etc. for
+ * unreal mode) */
+ ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg);
+ set_desc_base(&seg_desc, selector << 4);
+ goto load;
+ } else if (seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) {
+ /* VM86 needs a clean new segment descriptor */
+ set_desc_base(&seg_desc, selector << 4);
+ set_desc_limit(&seg_desc, 0xffff);
+ seg_desc.type = 3;
+ seg_desc.p = 1;
+ seg_desc.s = 1;
+ seg_desc.dpl = 3;
+ goto load;
+ }
+
+ rpl = selector & 3;
+
+ /* TR should be in GDT only */
+ if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
+ goto exception;
+
+ /* NULL selector is not valid for TR, CS and (except for long mode) SS */
+ if (null_selector) {
+ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_TR)
+ goto exception;
+
+ if (seg == VCPU_SREG_SS) {
+ if (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)
+ goto exception;
+
+ /*
+ * ctxt->ops->set_segment expects the CPL to be in
+ * SS.DPL, so fake an expand-up 32-bit data segment.
+ */
+ seg_desc.type = 3;
+ seg_desc.p = 1;
+ seg_desc.s = 1;
+ seg_desc.dpl = cpl;
+ seg_desc.d = 1;
+ seg_desc.g = 1;
+ }
+
+ /* Skip all following checks */
+ goto load;
+ }
+
+ ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ err_code = selector & 0xfffc;
+ err_vec = (transfer == X86_TRANSFER_TASK_SWITCH) ? TS_VECTOR :
+ GP_VECTOR;
+
+ /* can't load system descriptor into segment selector */
+ if (seg <= VCPU_SREG_GS && !seg_desc.s) {
+ if (transfer == X86_TRANSFER_CALL_JMP)
+ return X86EMUL_UNHANDLEABLE;
+ goto exception;
+ }
+
+ dpl = seg_desc.dpl;
+
+ switch (seg) {
+ case VCPU_SREG_SS:
+ /*
+ * segment is not a writable data segment or segment
+ * selector's RPL != CPL or segment selector's RPL != CPL
+ */
+ if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
+ goto exception;
+ break;
+ case VCPU_SREG_CS:
+ if (!(seg_desc.type & 8))
+ goto exception;
+
+ if (seg_desc.type & 4) {
+ /* conforming */
+ if (dpl > cpl)
+ goto exception;
+ } else {
+ /* nonconforming */
+ if (rpl > cpl || dpl != cpl)
+ goto exception;
+ }
+ /* in long-mode d/b must be clear if l is set */
+ if (seg_desc.d && seg_desc.l) {
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (efer & EFER_LMA)
+ goto exception;
+ }
+
+ /* CS(RPL) <- CPL */
+ selector = (selector & 0xfffc) | cpl;
+ break;
+ case VCPU_SREG_TR:
+ if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
+ goto exception;
+ if (!seg_desc.p) {
+ err_vec = NP_VECTOR;
+ goto exception;
+ }
+ old_desc = seg_desc;
+ seg_desc.type |= 2; /* busy */
+ ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
+ sizeof(seg_desc), &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ break;
+ case VCPU_SREG_LDTR:
+ if (seg_desc.s || seg_desc.type != 2)
+ goto exception;
+ break;
+ default: /* DS, ES, FS, or GS */
+ /*
+ * segment is not a data or readable code segment or
+ * ((segment is a data or nonconforming code segment)
+ * and (both RPL and CPL > DPL))
+ */
+ if ((seg_desc.type & 0xa) == 0x8 ||
+ (((seg_desc.type & 0xc) != 0xc) &&
+ (rpl > dpl && cpl > dpl)))
+ goto exception;
+ break;
+ }
+
+ if (!seg_desc.p) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
+ if (seg_desc.s) {
+ /* mark segment as accessed */
+ if (!(seg_desc.type & 1)) {
+ seg_desc.type |= 1;
+ ret = write_segment_descriptor(ctxt, selector,
+ &seg_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
+ } else if (ctxt->mode == X86EMUL_MODE_PROT64) {
+ ret = linear_read_system(ctxt, desc_addr+8, &base3, sizeof(base3));
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ if (emul_is_noncanonical_address(get_desc_base(&seg_desc) |
+ ((u64)base3 << 32), ctxt))
+ return emulate_gp(ctxt, 0);
+ }
+load:
+ ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg);
+ if (desc)
+ *desc = seg_desc;
+ return X86EMUL_CONTINUE;
+exception:
+ return emulate_exception(ctxt, err_vec, err_code, true);
+}
+
+static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, int seg)
+{
+ u8 cpl = ctxt->ops->cpl(ctxt);
+
+ /*
+ * None of MOV, POP and LSS can load a NULL selector in CPL=3, but
+ * they can load it at CPL<3 (Intel's manual says only LSS can,
+ * but it's wrong).
+ *
+ * However, the Intel manual says that putting IST=1/DPL=3 in
+ * an interrupt gate will result in SS=3 (the AMD manual instead
+ * says it doesn't), so allow SS=3 in __load_segment_descriptor
+ * and only forbid it here.
+ */
+ if (seg == VCPU_SREG_SS && selector == 3 &&
+ ctxt->mode == X86EMUL_MODE_PROT64)
+ return emulate_exception(ctxt, GP_VECTOR, 0, true);
+
+ return __load_segment_descriptor(ctxt, selector, seg, cpl,
+ X86_TRANSFER_NONE, NULL);
+}
+
+static void write_register_operand(struct operand *op)
+{
+ return assign_register(op->addr.reg, op->val, op->bytes);
+}
+
+static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
+{
+ switch (op->type) {
+ case OP_REG:
+ write_register_operand(op);
+ break;
+ case OP_MEM:
+ if (ctxt->lock_prefix)
+ return segmented_cmpxchg(ctxt,
+ op->addr.mem,
+ &op->orig_val,
+ &op->val,
+ op->bytes);
+ else
+ return segmented_write(ctxt,
+ op->addr.mem,
+ &op->val,
+ op->bytes);
+ break;
+ case OP_MEM_STR:
+ return segmented_write(ctxt,
+ op->addr.mem,
+ op->data,
+ op->bytes * op->count);
+ break;
+ case OP_XMM:
+ write_sse_reg(ctxt, &op->vec_val, op->addr.xmm);
+ break;
+ case OP_MM:
+ write_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
+ break;
+ case OP_NONE:
+ /* no writeback */
+ break;
+ default:
+ break;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
+{
+ struct segmented_address addr;
+
+ rsp_increment(ctxt, -bytes);
+ addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
+ addr.seg = VCPU_SREG_SS;
+
+ return segmented_write(ctxt, addr, data, bytes);
+}
+
+static int em_push(struct x86_emulate_ctxt *ctxt)
+{
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return push(ctxt, &ctxt->src.val, ctxt->op_bytes);
+}
+
+static int emulate_pop(struct x86_emulate_ctxt *ctxt,
+ void *dest, int len)
+{
+ int rc;
+ struct segmented_address addr;
+
+ addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
+ addr.seg = VCPU_SREG_SS;
+ rc = segmented_read(ctxt, addr, dest, len);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rsp_increment(ctxt, len);
+ return rc;
+}
+
+static int em_pop(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
+}
+
+static int emulate_popf(struct x86_emulate_ctxt *ctxt,
+ void *dest, int len)
+{
+ int rc;
+ unsigned long val, change_mask;
+ int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
+ int cpl = ctxt->ops->cpl(ctxt);
+
+ rc = emulate_pop(ctxt, &val, len);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ change_mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF |
+ X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_NT |
+ X86_EFLAGS_AC | X86_EFLAGS_ID;
+
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_PROT64:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT16:
+ if (cpl == 0)
+ change_mask |= X86_EFLAGS_IOPL;
+ if (cpl <= iopl)
+ change_mask |= X86_EFLAGS_IF;
+ break;
+ case X86EMUL_MODE_VM86:
+ if (iopl < 3)
+ return emulate_gp(ctxt, 0);
+ change_mask |= X86_EFLAGS_IF;
+ break;
+ default: /* real mode */
+ change_mask |= (X86_EFLAGS_IOPL | X86_EFLAGS_IF);
+ break;
+ }
+
+ *(unsigned long *)dest =
+ (ctxt->eflags & ~change_mask) | (val & change_mask);
+
+ return rc;
+}
+
+static int em_popf(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.type = OP_REG;
+ ctxt->dst.addr.reg = &ctxt->eflags;
+ ctxt->dst.bytes = ctxt->op_bytes;
+ return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
+}
+
+static int em_enter(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned frame_size = ctxt->src.val;
+ unsigned nesting_level = ctxt->src2.val & 31;
+ ulong rbp;
+
+ if (nesting_level)
+ return X86EMUL_UNHANDLEABLE;
+
+ rbp = reg_read(ctxt, VCPU_REGS_RBP);
+ rc = push(ctxt, &rbp, stack_size(ctxt));
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP),
+ stack_mask(ctxt));
+ assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP),
+ reg_read(ctxt, VCPU_REGS_RSP) - frame_size,
+ stack_mask(ctxt));
+ return X86EMUL_CONTINUE;
+}
+
+static int em_leave(struct x86_emulate_ctxt *ctxt)
+{
+ assign_masked(reg_rmw(ctxt, VCPU_REGS_RSP), reg_read(ctxt, VCPU_REGS_RBP),
+ stack_mask(ctxt));
+ return emulate_pop(ctxt, reg_rmw(ctxt, VCPU_REGS_RBP), ctxt->op_bytes);
+}
+
+static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
+{
+ int seg = ctxt->src2.val;
+
+ ctxt->src.val = get_segment_selector(ctxt, seg);
+ if (ctxt->op_bytes == 4) {
+ rsp_increment(ctxt, -2);
+ ctxt->op_bytes = 2;
+ }
+
+ return em_push(ctxt);
+}
+
+static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
+{
+ int seg = ctxt->src2.val;
+ unsigned long selector;
+ int rc;
+
+ rc = emulate_pop(ctxt, &selector, 2);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (ctxt->modrm_reg == VCPU_SREG_SS)
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+ if (ctxt->op_bytes > 2)
+ rsp_increment(ctxt, ctxt->op_bytes - 2);
+
+ rc = load_segment_descriptor(ctxt, (u16)selector, seg);
+ return rc;
+}
+
+static int em_pusha(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long old_esp = reg_read(ctxt, VCPU_REGS_RSP);
+ int rc = X86EMUL_CONTINUE;
+ int reg = VCPU_REGS_RAX;
+
+ while (reg <= VCPU_REGS_RDI) {
+ (reg == VCPU_REGS_RSP) ?
+ (ctxt->src.val = old_esp) : (ctxt->src.val = reg_read(ctxt, reg));
+
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ++reg;
+ }
+
+ return rc;
+}
+
+static int em_pushf(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->src.val = (unsigned long)ctxt->eflags & ~X86_EFLAGS_VM;
+ return em_push(ctxt);
+}
+
+static int em_popa(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = X86EMUL_CONTINUE;
+ int reg = VCPU_REGS_RDI;
+ u32 val;
+
+ while (reg >= VCPU_REGS_RAX) {
+ if (reg == VCPU_REGS_RSP) {
+ rsp_increment(ctxt, ctxt->op_bytes);
+ --reg;
+ }
+
+ rc = emulate_pop(ctxt, &val, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ break;
+ assign_register(reg_rmw(ctxt, reg), val, ctxt->op_bytes);
+ --reg;
+ }
+ return rc;
+}
+
+static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ int rc;
+ struct desc_ptr dt;
+ gva_t cs_addr;
+ gva_t eip_addr;
+ u16 cs, eip;
+
+ /* TODO: Add limit checks */
+ ctxt->src.val = ctxt->eflags;
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->eflags &= ~(X86_EFLAGS_IF | X86_EFLAGS_TF | X86_EFLAGS_AC);
+
+ ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->src.val = ctxt->_eip;
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ops->get_idt(ctxt, &dt);
+
+ eip_addr = dt.address + (irq << 2);
+ cs_addr = dt.address + (irq << 2) + 2;
+
+ rc = linear_read_system(ctxt, cs_addr, &cs, 2);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = linear_read_system(ctxt, eip_addr, &eip, 2);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = load_segment_descriptor(ctxt, cs, VCPU_SREG_CS);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->_eip = eip;
+
+ return rc;
+}
+
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
+{
+ int rc;
+
+ invalidate_registers(ctxt);
+ rc = __emulate_int_real(ctxt, irq);
+ if (rc == X86EMUL_CONTINUE)
+ writeback_registers(ctxt);
+ return rc;
+}
+
+static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq)
+{
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_REAL:
+ return __emulate_int_real(ctxt, irq);
+ case X86EMUL_MODE_VM86:
+ case X86EMUL_MODE_PROT16:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT64:
+ default:
+ /* Protected mode interrupts unimplemented yet */
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = X86EMUL_CONTINUE;
+ unsigned long temp_eip = 0;
+ unsigned long temp_eflags = 0;
+ unsigned long cs = 0;
+ unsigned long mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_TF |
+ X86_EFLAGS_IF | X86_EFLAGS_DF | X86_EFLAGS_OF |
+ X86_EFLAGS_IOPL | X86_EFLAGS_NT | X86_EFLAGS_RF |
+ X86_EFLAGS_AC | X86_EFLAGS_ID |
+ X86_EFLAGS_FIXED;
+ unsigned long vm86_mask = X86_EFLAGS_VM | X86_EFLAGS_VIF |
+ X86_EFLAGS_VIP;
+
+ /* TODO: Add stack limit check */
+
+ rc = emulate_pop(ctxt, &temp_eip, ctxt->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (temp_eip & ~0xffff)
+ return emulate_gp(ctxt, 0);
+
+ rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = emulate_pop(ctxt, &temp_eflags, ctxt->op_bytes);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->_eip = temp_eip;
+
+ if (ctxt->op_bytes == 4)
+ ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
+ else if (ctxt->op_bytes == 2) {
+ ctxt->eflags &= ~0xffff;
+ ctxt->eflags |= temp_eflags;
+ }
+
+ ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
+ ctxt->eflags |= X86_EFLAGS_FIXED;
+ ctxt->ops->set_nmi_mask(ctxt, false);
+
+ return rc;
+}
+
+static int em_iret(struct x86_emulate_ctxt *ctxt)
+{
+ switch(ctxt->mode) {
+ case X86EMUL_MODE_REAL:
+ return emulate_iret_real(ctxt);
+ case X86EMUL_MODE_VM86:
+ case X86EMUL_MODE_PROT16:
+ case X86EMUL_MODE_PROT32:
+ case X86EMUL_MODE_PROT64:
+ default:
+ /* iret from protected mode unimplemented yet */
+ return X86EMUL_UNHANDLEABLE;
+ }
+}
+
+static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned short sel;
+ struct desc_struct new_desc;
+ u8 cpl = ctxt->ops->cpl(ctxt);
+
+ memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
+
+ rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_CALL_JMP,
+ &new_desc);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
+ /* Error handling is not implemented. */
+ if (rc != X86EMUL_CONTINUE)
+ return X86EMUL_UNHANDLEABLE;
+
+ return rc;
+}
+
+static int em_jmp_abs(struct x86_emulate_ctxt *ctxt)
+{
+ return assign_eip_near(ctxt, ctxt->src.val);
+}
+
+static int em_call_near_abs(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ long int old_eip;
+
+ old_eip = ctxt->_eip;
+ rc = assign_eip_near(ctxt, ctxt->src.val);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ ctxt->src.val = old_eip;
+ rc = em_push(ctxt);
+ return rc;
+}
+
+static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
+{
+ u64 old = ctxt->dst.orig_val64;
+
+ if (ctxt->dst.bytes == 16)
+ return X86EMUL_UNHANDLEABLE;
+
+ if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
+ ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
+ *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
+ ctxt->eflags &= ~X86_EFLAGS_ZF;
+ } else {
+ ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
+ (u32) reg_read(ctxt, VCPU_REGS_RBX);
+
+ ctxt->eflags |= X86_EFLAGS_ZF;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int em_ret(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned long eip;
+
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return assign_eip_near(ctxt, eip);
+}
+
+static int em_ret_far(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned long eip, cs;
+ int cpl = ctxt->ops->cpl(ctxt);
+ struct desc_struct new_desc;
+
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ /* Outer-privilege level return is not implemented */
+ if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
+ return X86EMUL_UNHANDLEABLE;
+ rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_RET,
+ &new_desc);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = assign_eip_far(ctxt, eip, &new_desc);
+ /* Error handling is not implemented. */
+ if (rc != X86EMUL_CONTINUE)
+ return X86EMUL_UNHANDLEABLE;
+
+ return rc;
+}
+
+static int em_ret_far_imm(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+
+ rc = em_ret_far(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rsp_increment(ctxt, ctxt->src.val);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
+{
+ /* Save real source value, then compare EAX against destination. */
+ ctxt->dst.orig_val = ctxt->dst.val;
+ ctxt->dst.val = reg_read(ctxt, VCPU_REGS_RAX);
+ ctxt->src.orig_val = ctxt->src.val;
+ ctxt->src.val = ctxt->dst.orig_val;
+ fastop(ctxt, em_cmp);
+
+ if (ctxt->eflags & X86_EFLAGS_ZF) {
+ /* Success: write back to memory; no update of EAX */
+ ctxt->src.type = OP_NONE;
+ ctxt->dst.val = ctxt->src.orig_val;
+ } else {
+ /* Failure: write the value we saw to EAX. */
+ ctxt->src.type = OP_REG;
+ ctxt->src.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ ctxt->src.val = ctxt->dst.orig_val;
+ /* Create write-cycle to dest by writing the same value */
+ ctxt->dst.val = ctxt->dst.orig_val;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lseg(struct x86_emulate_ctxt *ctxt)
+{
+ int seg = ctxt->src2.val;
+ unsigned short sel;
+ int rc;
+
+ memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
+
+ rc = load_segment_descriptor(ctxt, sel, seg);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->dst.val = ctxt->src.val;
+ return rc;
+}
+
+static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
+{
+#ifdef CONFIG_X86_64
+ u32 eax, ebx, ecx, edx;
+
+ eax = 0x80000001;
+ ecx = 0;
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
+ return edx & bit(X86_FEATURE_LM);
+#else
+ return false;
+#endif
+}
+
+#define GET_SMSTATE(type, smbase, offset) \
+ ({ \
+ type __val; \
+ int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \
+ sizeof(__val)); \
+ if (r != X86EMUL_CONTINUE) \
+ return X86EMUL_UNHANDLEABLE; \
+ __val; \
+ })
+
+static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
+{
+ desc->g = (flags >> 23) & 1;
+ desc->d = (flags >> 22) & 1;
+ desc->l = (flags >> 21) & 1;
+ desc->avl = (flags >> 20) & 1;
+ desc->p = (flags >> 15) & 1;
+ desc->dpl = (flags >> 13) & 3;
+ desc->s = (flags >> 12) & 1;
+ desc->type = (flags >> 8) & 15;
+}
+
+static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
+{
+ struct desc_struct desc;
+ int offset;
+ u16 selector;
+
+ selector = GET_SMSTATE(u32, smbase, 0x7fa8 + n * 4);
+
+ if (n < 3)
+ offset = 0x7f84 + n * 12;
+ else
+ offset = 0x7f2c + (n - 3) * 12;
+
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8));
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4));
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, offset));
+ ctxt->ops->set_segment(ctxt, selector, &desc, 0, n);
+ return X86EMUL_CONTINUE;
+}
+
+#ifdef CONFIG_X86_64
+static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
+{
+ struct desc_struct desc;
+ int offset;
+ u16 selector;
+ u32 base3;
+
+ offset = 0x7e00 + n * 16;
+
+ selector = GET_SMSTATE(u16, smbase, offset);
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smbase, offset + 2) << 8);
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4));
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8));
+ base3 = GET_SMSTATE(u32, smbase, offset + 12);
+
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
+ return X86EMUL_CONTINUE;
+}
+#endif
+
+static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
+ u64 cr0, u64 cr3, u64 cr4)
+{
+ int bad;
+ u64 pcid;
+
+ /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */
+ pcid = 0;
+ if (cr4 & X86_CR4_PCIDE) {
+ pcid = cr3 & 0xfff;
+ cr3 &= ~0xfff;
+ }
+
+ bad = ctxt->ops->set_cr(ctxt, 3, cr3);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ /*
+ * First enable PAE, long mode needs it before CR0.PG = 1 is set.
+ * Then enable protected mode. However, PCID cannot be enabled
+ * if EFER.LMA=0, so set it separately.
+ */
+ bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ bad = ctxt->ops->set_cr(ctxt, 0, cr0);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ if (cr4 & X86_CR4_PCIDE) {
+ bad = ctxt->ops->set_cr(ctxt, 4, cr4);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ if (pcid) {
+ bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ struct desc_struct desc;
+ struct desc_ptr dt;
+ u16 selector;
+ u32 val, cr0, cr3, cr4;
+ int i;
+
+ cr0 = GET_SMSTATE(u32, smbase, 0x7ffc);
+ cr3 = GET_SMSTATE(u32, smbase, 0x7ff8);
+ ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED;
+ ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0);
+
+ for (i = 0; i < 8; i++)
+ *reg_write(ctxt, i) = GET_SMSTATE(u32, smbase, 0x7fd0 + i * 4);
+
+ val = GET_SMSTATE(u32, smbase, 0x7fcc);
+ ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
+ val = GET_SMSTATE(u32, smbase, 0x7fc8);
+ ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+ selector = GET_SMSTATE(u32, smbase, 0x7fc4);
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f64));
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f60));
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f5c));
+ ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR);
+
+ selector = GET_SMSTATE(u32, smbase, 0x7fc0);
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f80));
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f7c));
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f78));
+ ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR);
+
+ dt.address = GET_SMSTATE(u32, smbase, 0x7f74);
+ dt.size = GET_SMSTATE(u32, smbase, 0x7f70);
+ ctxt->ops->set_gdt(ctxt, &dt);
+
+ dt.address = GET_SMSTATE(u32, smbase, 0x7f58);
+ dt.size = GET_SMSTATE(u32, smbase, 0x7f54);
+ ctxt->ops->set_idt(ctxt, &dt);
+
+ for (i = 0; i < 6; i++) {
+ int r = rsm_load_seg_32(ctxt, smbase, i);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+ }
+
+ cr4 = GET_SMSTATE(u32, smbase, 0x7f14);
+
+ ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8));
+
+ return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
+}
+
+#ifdef CONFIG_X86_64
+static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ struct desc_struct desc;
+ struct desc_ptr dt;
+ u64 val, cr0, cr3, cr4;
+ u32 base3;
+ u16 selector;
+ int i, r;
+
+ for (i = 0; i < 16; i++)
+ *reg_write(ctxt, i) = GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8);
+
+ ctxt->_eip = GET_SMSTATE(u64, smbase, 0x7f78);
+ ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7f70) | X86_EFLAGS_FIXED;
+
+ val = GET_SMSTATE(u32, smbase, 0x7f68);
+ ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
+ val = GET_SMSTATE(u32, smbase, 0x7f60);
+ ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+ cr0 = GET_SMSTATE(u64, smbase, 0x7f58);
+ cr3 = GET_SMSTATE(u64, smbase, 0x7f50);
+ cr4 = GET_SMSTATE(u64, smbase, 0x7f48);
+ ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00));
+ val = GET_SMSTATE(u64, smbase, 0x7ed0);
+ ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA);
+
+ selector = GET_SMSTATE(u32, smbase, 0x7e90);
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e92) << 8);
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e94));
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e98));
+ base3 = GET_SMSTATE(u32, smbase, 0x7e9c);
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR);
+
+ dt.size = GET_SMSTATE(u32, smbase, 0x7e84);
+ dt.address = GET_SMSTATE(u64, smbase, 0x7e88);
+ ctxt->ops->set_idt(ctxt, &dt);
+
+ selector = GET_SMSTATE(u32, smbase, 0x7e70);
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e72) << 8);
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e74));
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e78));
+ base3 = GET_SMSTATE(u32, smbase, 0x7e7c);
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR);
+
+ dt.size = GET_SMSTATE(u32, smbase, 0x7e64);
+ dt.address = GET_SMSTATE(u64, smbase, 0x7e68);
+ ctxt->ops->set_gdt(ctxt, &dt);
+
+ r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+
+ for (i = 0; i < 6; i++) {
+ r = rsm_load_seg_64(ctxt, smbase, i);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+ }
+
+ return X86EMUL_CONTINUE;
+}
+#endif
+
+static int em_rsm(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long cr0, cr4, efer;
+ u64 smbase;
+ int ret;
+
+ if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0)
+ return emulate_ud(ctxt);
+
+ /*
+ * Get back to real mode, to prepare a safe state in which to load
+ * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
+ * supports long mode.
+ */
+ if (emulator_has_longmode(ctxt)) {
+ struct desc_struct cs_desc;
+
+ /* Zero CR4.PCIDE before CR0.PG. */
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if (cr4 & X86_CR4_PCIDE)
+ ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
+
+ /* A 32-bit code segment is required to clear EFER.LMA. */
+ memset(&cs_desc, 0, sizeof(cs_desc));
+ cs_desc.type = 0xb;
+ cs_desc.s = cs_desc.g = cs_desc.p = 1;
+ ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS);
+ }
+
+ /* For the 64-bit case, this will clear EFER.LMA. */
+ cr0 = ctxt->ops->get_cr(ctxt, 0);
+ if (cr0 & X86_CR0_PE)
+ ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
+
+ if (emulator_has_longmode(ctxt)) {
+ /* Clear CR4.PAE before clearing EFER.LME. */
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if (cr4 & X86_CR4_PAE)
+ ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
+
+ /* And finally go back to 32-bit mode. */
+ efer = 0;
+ ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
+ }
+
+ smbase = ctxt->ops->get_smbase(ctxt);
+
+ /*
+ * Give pre_leave_smm() a chance to make ISA-specific changes to the
+ * vCPU state (e.g. enter guest mode) before loading state from the SMM
+ * state-save area.
+ */
+ if (ctxt->ops->pre_leave_smm(ctxt, smbase))
+ return X86EMUL_UNHANDLEABLE;
+
+#ifdef CONFIG_X86_64
+ if (emulator_has_longmode(ctxt))
+ ret = rsm_load_state_64(ctxt, smbase + 0x8000);
+ else
+#endif
+ ret = rsm_load_state_32(ctxt, smbase + 0x8000);
+
+ if (ret != X86EMUL_CONTINUE) {
+ /* FIXME: should triple fault */
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
+ ctxt->ops->set_nmi_mask(ctxt, false);
+
+ ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) &
+ ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK));
+ return X86EMUL_CONTINUE;
+}
+
+static void
+setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
+ struct desc_struct *cs, struct desc_struct *ss)
+{
+ cs->l = 0; /* will be adjusted later */
+ set_desc_base(cs, 0); /* flat segment */
+ cs->g = 1; /* 4kb granularity */
+ set_desc_limit(cs, 0xfffff); /* 4GB limit */
+ cs->type = 0x0b; /* Read, Execute, Accessed */
+ cs->s = 1;
+ cs->dpl = 0; /* will be adjusted later */
+ cs->p = 1;
+ cs->d = 1;
+ cs->avl = 0;
+
+ set_desc_base(ss, 0); /* flat segment */
+ set_desc_limit(ss, 0xfffff); /* 4GB limit */
+ ss->g = 1; /* 4kb granularity */
+ ss->s = 1;
+ ss->type = 0x03; /* Read/Write, Accessed */
+ ss->d = 1; /* 32bit stack segment */
+ ss->dpl = 0;
+ ss->p = 1;
+ ss->l = 0;
+ ss->avl = 0;
+}
+
+static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax, ebx, ecx, edx;
+
+ eax = ecx = 0;
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
+ return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
+ && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
+ && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
+}
+
+static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ u32 eax, ebx, ecx, edx;
+
+ /*
+ * syscall should always be enabled in longmode - so only become
+ * vendor specific (cpuid) if other modes are active...
+ */
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return true;
+
+ eax = 0x00000000;
+ ecx = 0x00000000;
+ ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
+ /*
+ * Intel ("GenuineIntel")
+ * remark: Intel CPUs only support "syscall" in 64bit
+ * longmode. Also an 64bit guest with a
+ * 32bit compat-app running will #UD !! While this
+ * behaviour can be fixed (by emulating) into AMD
+ * response - CPUs of AMD can't behave like Intel.
+ */
+ if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx)
+ return false;
+
+ /* AMD ("AuthenticAMD") */
+ if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
+ return true;
+
+ /* AMD ("AMDisbetter!") */
+ if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx)
+ return true;
+
+ /* default: (not Intel, not AMD), apply Intel's stricter rules... */
+ return false;
+}
+
+static int em_syscall(struct x86_emulate_ctxt *ctxt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct cs, ss;
+ u64 msr_data;
+ u16 cs_sel, ss_sel;
+ u64 efer = 0;
+
+ /* syscall is not available in real mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL ||
+ ctxt->mode == X86EMUL_MODE_VM86)
+ return emulate_ud(ctxt);
+
+ if (!(em_syscall_is_enabled(ctxt)))
+ return emulate_ud(ctxt);
+
+ ops->get_msr(ctxt, MSR_EFER, &efer);
+ setup_syscalls_segments(ctxt, &cs, &ss);
+
+ if (!(efer & EFER_SCE))
+ return emulate_ud(ctxt);
+
+ ops->get_msr(ctxt, MSR_STAR, &msr_data);
+ msr_data >>= 32;
+ cs_sel = (u16)(msr_data & 0xfffc);
+ ss_sel = (u16)(msr_data + 8);
+
+ if (efer & EFER_LMA) {
+ cs.d = 0;
+ cs.l = 1;
+ }
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
+
+ *reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip;
+ if (efer & EFER_LMA) {
+#ifdef CONFIG_X86_64
+ *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags;
+
+ ops->get_msr(ctxt,
+ ctxt->mode == X86EMUL_MODE_PROT64 ?
+ MSR_LSTAR : MSR_CSTAR, &msr_data);
+ ctxt->_eip = msr_data;
+
+ ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
+ ctxt->eflags &= ~msr_data;
+ ctxt->eflags |= X86_EFLAGS_FIXED;
+#endif
+ } else {
+ /* legacy mode */
+ ops->get_msr(ctxt, MSR_STAR, &msr_data);
+ ctxt->_eip = (u32)msr_data;
+
+ ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
+ }
+
+ ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_sysenter(struct x86_emulate_ctxt *ctxt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct cs, ss;
+ u64 msr_data;
+ u16 cs_sel, ss_sel;
+ u64 efer = 0;
+
+ ops->get_msr(ctxt, MSR_EFER, &efer);
+ /* inject #GP if in real mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL)
+ return emulate_gp(ctxt, 0);
+
+ /*
+ * Not recognized on AMD in compat mode (but is recognized in legacy
+ * mode).
+ */
+ if ((ctxt->mode != X86EMUL_MODE_PROT64) && (efer & EFER_LMA)
+ && !vendor_intel(ctxt))
+ return emulate_ud(ctxt);
+
+ /* sysenter/sysexit have not been tested in 64bit mode. */
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return X86EMUL_UNHANDLEABLE;
+
+ setup_syscalls_segments(ctxt, &cs, &ss);
+
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
+ if ((msr_data & 0xfffc) == 0x0)
+ return emulate_gp(ctxt, 0);
+
+ ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
+ cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK;
+ ss_sel = cs_sel + 8;
+ if (efer & EFER_LMA) {
+ cs.d = 0;
+ cs.l = 1;
+ }
+
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
+
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data);
+ ctxt->_eip = (efer & EFER_LMA) ? msr_data : (u32)msr_data;
+
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
+ *reg_write(ctxt, VCPU_REGS_RSP) = (efer & EFER_LMA) ? msr_data :
+ (u32)msr_data;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_sysexit(struct x86_emulate_ctxt *ctxt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct cs, ss;
+ u64 msr_data, rcx, rdx;
+ int usermode;
+ u16 cs_sel = 0, ss_sel = 0;
+
+ /* inject #GP if in real mode or Virtual 8086 mode */
+ if (ctxt->mode == X86EMUL_MODE_REAL ||
+ ctxt->mode == X86EMUL_MODE_VM86)
+ return emulate_gp(ctxt, 0);
+
+ setup_syscalls_segments(ctxt, &cs, &ss);
+
+ if ((ctxt->rex_prefix & 0x8) != 0x0)
+ usermode = X86EMUL_MODE_PROT64;
+ else
+ usermode = X86EMUL_MODE_PROT32;
+
+ rcx = reg_read(ctxt, VCPU_REGS_RCX);
+ rdx = reg_read(ctxt, VCPU_REGS_RDX);
+
+ cs.dpl = 3;
+ ss.dpl = 3;
+ ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
+ switch (usermode) {
+ case X86EMUL_MODE_PROT32:
+ cs_sel = (u16)(msr_data + 16);
+ if ((msr_data & 0xfffc) == 0x0)
+ return emulate_gp(ctxt, 0);
+ ss_sel = (u16)(msr_data + 24);
+ rcx = (u32)rcx;
+ rdx = (u32)rdx;
+ break;
+ case X86EMUL_MODE_PROT64:
+ cs_sel = (u16)(msr_data + 32);
+ if (msr_data == 0x0)
+ return emulate_gp(ctxt, 0);
+ ss_sel = cs_sel + 8;
+ cs.d = 0;
+ cs.l = 1;
+ if (emul_is_noncanonical_address(rcx, ctxt) ||
+ emul_is_noncanonical_address(rdx, ctxt))
+ return emulate_gp(ctxt, 0);
+ break;
+ }
+ cs_sel |= SEGMENT_RPL_MASK;
+ ss_sel |= SEGMENT_RPL_MASK;
+
+ ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
+ ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
+
+ ctxt->_eip = rdx;
+ *reg_write(ctxt, VCPU_REGS_RSP) = rcx;
+
+ return X86EMUL_CONTINUE;
+}
+
+static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
+{
+ int iopl;
+ if (ctxt->mode == X86EMUL_MODE_REAL)
+ return false;
+ if (ctxt->mode == X86EMUL_MODE_VM86)
+ return true;
+ iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
+ return ctxt->ops->cpl(ctxt) > iopl;
+}
+
+#define VMWARE_PORT_VMPORT (0x5658)
+#define VMWARE_PORT_VMRPC (0x5659)
+
+static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
+ u16 port, u16 len)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct tr_seg;
+ u32 base3;
+ int r;
+ u16 tr, io_bitmap_ptr, perm, bit_idx = port & 0x7;
+ unsigned mask = (1 << len) - 1;
+ unsigned long base;
+
+ /*
+ * VMware allows access to these ports even if denied
+ * by TSS I/O permission bitmap. Mimic behavior.
+ */
+ if (enable_vmware_backdoor &&
+ ((port == VMWARE_PORT_VMPORT) || (port == VMWARE_PORT_VMRPC)))
+ return true;
+
+ ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR);
+ if (!tr_seg.p)
+ return false;
+ if (desc_limit_scaled(&tr_seg) < 103)
+ return false;
+ base = get_desc_base(&tr_seg);
+#ifdef CONFIG_X86_64
+ base |= ((u64)base3) << 32;
+#endif
+ r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL, true);
+ if (r != X86EMUL_CONTINUE)
+ return false;
+ if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
+ return false;
+ r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL, true);
+ if (r != X86EMUL_CONTINUE)
+ return false;
+ if ((perm >> bit_idx) & mask)
+ return false;
+ return true;
+}
+
+static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
+ u16 port, u16 len)
+{
+ if (ctxt->perm_ok)
+ return true;
+
+ if (emulator_bad_iopl(ctxt))
+ if (!emulator_io_port_access_allowed(ctxt, port, len))
+ return false;
+
+ ctxt->perm_ok = true;
+
+ return true;
+}
+
+static void string_registers_quirk(struct x86_emulate_ctxt *ctxt)
+{
+ /*
+ * Intel CPUs mask the counter and pointers in quite strange
+ * manner when ECX is zero due to REP-string optimizations.
+ */
+#ifdef CONFIG_X86_64
+ if (ctxt->ad_bytes != 4 || !vendor_intel(ctxt))
+ return;
+
+ *reg_write(ctxt, VCPU_REGS_RCX) = 0;
+
+ switch (ctxt->b) {
+ case 0xa4: /* movsb */
+ case 0xa5: /* movsd/w */
+ *reg_rmw(ctxt, VCPU_REGS_RSI) &= (u32)-1;
+ /* fall through */
+ case 0xaa: /* stosb */
+ case 0xab: /* stosd/w */
+ *reg_rmw(ctxt, VCPU_REGS_RDI) &= (u32)-1;
+ }
+#endif
+}
+
+static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
+ struct tss_segment_16 *tss)
+{
+ tss->ip = ctxt->_eip;
+ tss->flag = ctxt->eflags;
+ tss->ax = reg_read(ctxt, VCPU_REGS_RAX);
+ tss->cx = reg_read(ctxt, VCPU_REGS_RCX);
+ tss->dx = reg_read(ctxt, VCPU_REGS_RDX);
+ tss->bx = reg_read(ctxt, VCPU_REGS_RBX);
+ tss->sp = reg_read(ctxt, VCPU_REGS_RSP);
+ tss->bp = reg_read(ctxt, VCPU_REGS_RBP);
+ tss->si = reg_read(ctxt, VCPU_REGS_RSI);
+ tss->di = reg_read(ctxt, VCPU_REGS_RDI);
+
+ tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
+ tss->ldt = get_segment_selector(ctxt, VCPU_SREG_LDTR);
+}
+
+static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
+ struct tss_segment_16 *tss)
+{
+ int ret;
+ u8 cpl;
+
+ ctxt->_eip = tss->ip;
+ ctxt->eflags = tss->flag | 2;
+ *reg_write(ctxt, VCPU_REGS_RAX) = tss->ax;
+ *reg_write(ctxt, VCPU_REGS_RCX) = tss->cx;
+ *reg_write(ctxt, VCPU_REGS_RDX) = tss->dx;
+ *reg_write(ctxt, VCPU_REGS_RBX) = tss->bx;
+ *reg_write(ctxt, VCPU_REGS_RSP) = tss->sp;
+ *reg_write(ctxt, VCPU_REGS_RBP) = tss->bp;
+ *reg_write(ctxt, VCPU_REGS_RSI) = tss->si;
+ *reg_write(ctxt, VCPU_REGS_RDI) = tss->di;
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors
+ */
+ set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR);
+ set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
+ set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
+ set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
+ set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
+
+ cpl = tss->cs & 3;
+
+ /*
+ * Now load segment descriptors. If fault happens at this stage
+ * it is handled in a context of new task
+ */
+ ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int task_switch_16(struct x86_emulate_ctxt *ctxt,
+ u16 tss_selector, u16 old_tss_sel,
+ ulong old_tss_base, struct desc_struct *new_desc)
+{
+ struct tss_segment_16 tss_seg;
+ int ret;
+ u32 new_tss_base = get_desc_base(new_desc);
+
+ ret = linear_read_system(ctxt, old_tss_base, &tss_seg, sizeof tss_seg);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ save_state_to_tss16(ctxt, &tss_seg);
+
+ ret = linear_write_system(ctxt, old_tss_base, &tss_seg, sizeof tss_seg);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ ret = linear_read_system(ctxt, new_tss_base, &tss_seg, sizeof tss_seg);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ if (old_tss_sel != 0xffff) {
+ tss_seg.prev_task_link = old_tss_sel;
+
+ ret = linear_write_system(ctxt, new_tss_base,
+ &tss_seg.prev_task_link,
+ sizeof tss_seg.prev_task_link);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
+
+ return load_state_from_tss16(ctxt, &tss_seg);
+}
+
+static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
+ struct tss_segment_32 *tss)
+{
+ /* CR3 and ldt selector are not saved intentionally */
+ tss->eip = ctxt->_eip;
+ tss->eflags = ctxt->eflags;
+ tss->eax = reg_read(ctxt, VCPU_REGS_RAX);
+ tss->ecx = reg_read(ctxt, VCPU_REGS_RCX);
+ tss->edx = reg_read(ctxt, VCPU_REGS_RDX);
+ tss->ebx = reg_read(ctxt, VCPU_REGS_RBX);
+ tss->esp = reg_read(ctxt, VCPU_REGS_RSP);
+ tss->ebp = reg_read(ctxt, VCPU_REGS_RBP);
+ tss->esi = reg_read(ctxt, VCPU_REGS_RSI);
+ tss->edi = reg_read(ctxt, VCPU_REGS_RDI);
+
+ tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
+ tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS);
+ tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS);
+}
+
+static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
+ struct tss_segment_32 *tss)
+{
+ int ret;
+ u8 cpl;
+
+ if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
+ return emulate_gp(ctxt, 0);
+ ctxt->_eip = tss->eip;
+ ctxt->eflags = tss->eflags | 2;
+
+ /* General purpose registers */
+ *reg_write(ctxt, VCPU_REGS_RAX) = tss->eax;
+ *reg_write(ctxt, VCPU_REGS_RCX) = tss->ecx;
+ *reg_write(ctxt, VCPU_REGS_RDX) = tss->edx;
+ *reg_write(ctxt, VCPU_REGS_RBX) = tss->ebx;
+ *reg_write(ctxt, VCPU_REGS_RSP) = tss->esp;
+ *reg_write(ctxt, VCPU_REGS_RBP) = tss->ebp;
+ *reg_write(ctxt, VCPU_REGS_RSI) = tss->esi;
+ *reg_write(ctxt, VCPU_REGS_RDI) = tss->edi;
+
+ /*
+ * SDM says that segment selectors are loaded before segment
+ * descriptors. This is important because CPL checks will
+ * use CS.RPL.
+ */
+ set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
+ set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
+ set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
+ set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
+ set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
+ set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS);
+ set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
+
+ /*
+ * If we're switching between Protected Mode and VM86, we need to make
+ * sure to update the mode before loading the segment descriptors so
+ * that the selectors are interpreted correctly.
+ */
+ if (ctxt->eflags & X86_EFLAGS_VM) {
+ ctxt->mode = X86EMUL_MODE_VM86;
+ cpl = 3;
+ } else {
+ ctxt->mode = X86EMUL_MODE_PROT32;
+ cpl = tss->cs & 3;
+ }
+
+ /*
+ * Now load segment descriptors. If fault happenes at this stage
+ * it is handled in a context of new task
+ */
+ ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR,
+ cpl, X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl,
+ X86_TRANSFER_TASK_SWITCH, NULL);
+
+ return ret;
+}
+
+static int task_switch_32(struct x86_emulate_ctxt *ctxt,
+ u16 tss_selector, u16 old_tss_sel,
+ ulong old_tss_base, struct desc_struct *new_desc)
+{
+ struct tss_segment_32 tss_seg;
+ int ret;
+ u32 new_tss_base = get_desc_base(new_desc);
+ u32 eip_offset = offsetof(struct tss_segment_32, eip);
+ u32 ldt_sel_offset = offsetof(struct tss_segment_32, ldt_selector);
+
+ ret = linear_read_system(ctxt, old_tss_base, &tss_seg, sizeof tss_seg);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ save_state_to_tss32(ctxt, &tss_seg);
+
+ /* Only GP registers and segment selectors are saved */
+ ret = linear_write_system(ctxt, old_tss_base + eip_offset, &tss_seg.eip,
+ ldt_sel_offset - eip_offset);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ ret = linear_read_system(ctxt, new_tss_base, &tss_seg, sizeof tss_seg);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ if (old_tss_sel != 0xffff) {
+ tss_seg.prev_task_link = old_tss_sel;
+
+ ret = linear_write_system(ctxt, new_tss_base,
+ &tss_seg.prev_task_link,
+ sizeof tss_seg.prev_task_link);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
+
+ return load_state_from_tss32(ctxt, &tss_seg);
+}
+
+static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
+ u16 tss_selector, int idt_index, int reason,
+ bool has_error_code, u32 error_code)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ struct desc_struct curr_tss_desc, next_tss_desc;
+ int ret;
+ u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
+ ulong old_tss_base =
+ ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
+ u32 desc_limit;
+ ulong desc_addr, dr7;
+
+ /* FIXME: old_tss_base == ~0 ? */
+
+ ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ /* FIXME: check that next_tss_desc is tss */
+
+ /*
+ * Check privileges. The three cases are task switch caused by...
+ *
+ * 1. jmp/call/int to task gate: Check against DPL of the task gate
+ * 2. Exception/IRQ/iret: No check is performed
+ * 3. jmp/call to TSS/task-gate: No check is performed since the
+ * hardware checks it before exiting.
+ */
+ if (reason == TASK_SWITCH_GATE) {
+ if (idt_index != -1) {
+ /* Software interrupts */
+ struct desc_struct task_gate_desc;
+ int dpl;
+
+ ret = read_interrupt_descriptor(ctxt, idt_index,
+ &task_gate_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ dpl = task_gate_desc.dpl;
+ if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
+ return emulate_gp(ctxt, (idt_index << 3) | 0x2);
+ }
+ }
+
+ desc_limit = desc_limit_scaled(&next_tss_desc);
+ if (!next_tss_desc.p ||
+ ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
+ desc_limit < 0x2b)) {
+ return emulate_ts(ctxt, tss_selector & 0xfffc);
+ }
+
+ if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
+ curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
+ write_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
+ }
+
+ if (reason == TASK_SWITCH_IRET)
+ ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
+
+ /* set back link to prev task only if NT bit is set in eflags
+ note that old_tss_sel is not used after this point */
+ if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+ old_tss_sel = 0xffff;
+
+ if (next_tss_desc.type & 8)
+ ret = task_switch_32(ctxt, tss_selector, old_tss_sel,
+ old_tss_base, &next_tss_desc);
+ else
+ ret = task_switch_16(ctxt, tss_selector, old_tss_sel,
+ old_tss_base, &next_tss_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+
+ if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
+ ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT;
+
+ if (reason != TASK_SWITCH_IRET) {
+ next_tss_desc.type |= (1 << 1); /* set busy flag */
+ write_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
+ }
+
+ ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS);
+ ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
+
+ if (has_error_code) {
+ ctxt->op_bytes = ctxt->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
+ ctxt->lock_prefix = 0;
+ ctxt->src.val = (unsigned long) error_code;
+ ret = em_push(ctxt);
+ }
+
+ ops->get_dr(ctxt, 7, &dr7);
+ ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN));
+
+ return ret;
+}
+
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+ u16 tss_selector, int idt_index, int reason,
+ bool has_error_code, u32 error_code)
+{
+ int rc;
+
+ invalidate_registers(ctxt);
+ ctxt->_eip = ctxt->eip;
+ ctxt->dst.type = OP_NONE;
+
+ rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason,
+ has_error_code, error_code);
+
+ if (rc == X86EMUL_CONTINUE) {
+ ctxt->eip = ctxt->_eip;
+ writeback_registers(ctxt);
+ }
+
+ return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
+}
+
+static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
+ struct operand *op)
+{
+ int df = (ctxt->eflags & X86_EFLAGS_DF) ? -op->count : op->count;
+
+ register_address_increment(ctxt, reg, df * op->bytes);
+ op->addr.mem.ea = register_address(ctxt, reg);
+}
+
+static int em_das(struct x86_emulate_ctxt *ctxt)
+{
+ u8 al, old_al;
+ bool af, cf, old_cf;
+
+ cf = ctxt->eflags & X86_EFLAGS_CF;
+ al = ctxt->dst.val;
+
+ old_al = al;
+ old_cf = cf;
+ cf = false;
+ af = ctxt->eflags & X86_EFLAGS_AF;
+ if ((al & 0x0f) > 9 || af) {
+ al -= 6;
+ cf = old_cf | (al >= 250);
+ af = true;
+ } else {
+ af = false;
+ }
+ if (old_al > 0x99 || old_cf) {
+ al -= 0x60;
+ cf = true;
+ }
+
+ ctxt->dst.val = al;
+ /* Set PF, ZF, SF */
+ ctxt->src.type = OP_IMM;
+ ctxt->src.val = 0;
+ ctxt->src.bytes = 1;
+ fastop(ctxt, em_or);
+ ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
+ if (cf)
+ ctxt->eflags |= X86_EFLAGS_CF;
+ if (af)
+ ctxt->eflags |= X86_EFLAGS_AF;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_aam(struct x86_emulate_ctxt *ctxt)
+{
+ u8 al, ah;
+
+ if (ctxt->src.val == 0)
+ return emulate_de(ctxt);
+
+ al = ctxt->dst.val & 0xff;
+ ah = al / ctxt->src.val;
+ al %= ctxt->src.val;
+
+ ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al | (ah << 8);
+
+ /* Set PF, ZF, SF */
+ ctxt->src.type = OP_IMM;
+ ctxt->src.val = 0;
+ ctxt->src.bytes = 1;
+ fastop(ctxt, em_or);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_aad(struct x86_emulate_ctxt *ctxt)
+{
+ u8 al = ctxt->dst.val & 0xff;
+ u8 ah = (ctxt->dst.val >> 8) & 0xff;
+
+ al = (al + (ah * ctxt->src.val)) & 0xff;
+
+ ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al;
+
+ /* Set PF, ZF, SF */
+ ctxt->src.type = OP_IMM;
+ ctxt->src.val = 0;
+ ctxt->src.bytes = 1;
+ fastop(ctxt, em_or);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_call(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ long rel = ctxt->src.val;
+
+ ctxt->src.val = (unsigned long)ctxt->_eip;
+ rc = jmp_rel(ctxt, rel);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ return em_push(ctxt);
+}
+
+static int em_call_far(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel, old_cs;
+ ulong old_eip;
+ int rc;
+ struct desc_struct old_desc, new_desc;
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ int cpl = ctxt->ops->cpl(ctxt);
+ enum x86emul_mode prev_mode = ctxt->mode;
+
+ old_eip = ctxt->_eip;
+ ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS);
+
+ memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
+ rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_CALL_JMP, &new_desc);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
+ if (rc != X86EMUL_CONTINUE)
+ goto fail;
+
+ ctxt->src.val = old_cs;
+ rc = em_push(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto fail;
+
+ ctxt->src.val = old_eip;
+ rc = em_push(ctxt);
+ /* If we failed, we tainted the memory, but the very least we should
+ restore cs */
+ if (rc != X86EMUL_CONTINUE) {
+ pr_warn_once("faulting far call emulation tainted memory\n");
+ goto fail;
+ }
+ return rc;
+fail:
+ ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS);
+ ctxt->mode = prev_mode;
+ return rc;
+
+}
+
+static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned long eip;
+
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = assign_eip_near(ctxt, eip);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rsp_increment(ctxt, ctxt->src.val);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_xchg(struct x86_emulate_ctxt *ctxt)
+{
+ /* Write back the register source. */
+ ctxt->src.val = ctxt->dst.val;
+ write_register_operand(&ctxt->src);
+
+ /* Write back the memory destination with implicit LOCK prefix. */
+ ctxt->dst.val = ctxt->src.orig_val;
+ ctxt->lock_prefix = 1;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.val = ctxt->src2.val;
+ return fastop(ctxt, em_imul);
+}
+
+static int em_cwd(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.type = OP_REG;
+ ctxt->dst.bytes = ctxt->src.bytes;
+ ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
+ ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_rdpid(struct x86_emulate_ctxt *ctxt)
+{
+ u64 tsc_aux = 0;
+
+ if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux))
+ return emulate_ud(ctxt);
+ ctxt->dst.val = tsc_aux;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 tsc = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32)tsc;
+ *reg_write(ctxt, VCPU_REGS_RDX) = tsc >> 32;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 pmc;
+
+ if (ctxt->ops->read_pmc(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &pmc))
+ return emulate_gp(ctxt, 0);
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32)pmc;
+ *reg_write(ctxt, VCPU_REGS_RDX) = pmc >> 32;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_mov(struct x86_emulate_ctxt *ctxt)
+{
+ memcpy(ctxt->dst.valptr, ctxt->src.valptr, sizeof(ctxt->src.valptr));
+ return X86EMUL_CONTINUE;
+}
+
+#define FFL(x) bit(X86_FEATURE_##x)
+
+static int em_movbe(struct x86_emulate_ctxt *ctxt)
+{
+ u32 ebx, ecx, edx, eax = 1;
+ u16 tmp;
+
+ /*
+ * Check MOVBE is set in the guest-visible CPUID leaf.
+ */
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
+ if (!(ecx & FFL(MOVBE)))
+ return emulate_ud(ctxt);
+
+ switch (ctxt->op_bytes) {
+ case 2:
+ /*
+ * From MOVBE definition: "...When the operand size is 16 bits,
+ * the upper word of the destination register remains unchanged
+ * ..."
+ *
+ * Both casting ->valptr and ->val to u16 breaks strict aliasing
+ * rules so we have to do the operation almost per hand.
+ */
+ tmp = (u16)ctxt->src.val;
+ ctxt->dst.val &= ~0xffffUL;
+ ctxt->dst.val |= (unsigned long)swab16(tmp);
+ break;
+ case 4:
+ ctxt->dst.val = swab32((u32)ctxt->src.val);
+ break;
+ case 8:
+ ctxt->dst.val = swab64(ctxt->src.val);
+ break;
+ default:
+ BUG();
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cr_write(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val))
+ return emulate_gp(ctxt, 0);
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_dr_write(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long val;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ val = ctxt->src.val & ~0ULL;
+ else
+ val = ctxt->src.val & ~0U;
+
+ /* #UD condition is already handled. */
+ if (ctxt->ops->set_dr(ctxt, ctxt->modrm_reg, val) < 0)
+ return emulate_gp(ctxt, 0);
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
+{
+ u64 msr_data;
+
+ msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX)
+ | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
+ if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
+{
+ u64 msr_data;
+
+ if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data))
+ return emulate_gp(ctxt, 0);
+
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
+ *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_store_sreg(struct x86_emulate_ctxt *ctxt, int segment)
+{
+ if (segment > VCPU_SREG_GS &&
+ (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+ ctxt->ops->cpl(ctxt) > 0)
+ return emulate_gp(ctxt, 0);
+
+ ctxt->dst.val = get_segment_selector(ctxt, segment);
+ if (ctxt->dst.bytes == 4 && ctxt->dst.type == OP_MEM)
+ ctxt->dst.bytes = 2;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->modrm_reg > VCPU_SREG_GS)
+ return emulate_ud(ctxt);
+
+ return em_store_sreg(ctxt, ctxt->modrm_reg);
+}
+
+static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel = ctxt->src.val;
+
+ if (ctxt->modrm_reg == VCPU_SREG_CS || ctxt->modrm_reg > VCPU_SREG_GS)
+ return emulate_ud(ctxt);
+
+ if (ctxt->modrm_reg == VCPU_SREG_SS)
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
+}
+
+static int em_sldt(struct x86_emulate_ctxt *ctxt)
+{
+ return em_store_sreg(ctxt, VCPU_SREG_LDTR);
+}
+
+static int em_lldt(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel = ctxt->src.val;
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR);
+}
+
+static int em_str(struct x86_emulate_ctxt *ctxt)
+{
+ return em_store_sreg(ctxt, VCPU_SREG_TR);
+}
+
+static int em_ltr(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel = ctxt->src.val;
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR);
+}
+
+static int em_invlpg(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ ulong linear;
+
+ rc = linearize(ctxt, ctxt->src.addr.mem, 1, false, &linear);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->ops->invlpg(ctxt, linear);
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_clts(struct x86_emulate_ctxt *ctxt)
+{
+ ulong cr0;
+
+ cr0 = ctxt->ops->get_cr(ctxt, 0);
+ cr0 &= ~X86_CR0_TS;
+ ctxt->ops->set_cr(ctxt, 0, cr0);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_hypercall(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = ctxt->ops->fix_hypercall(ctxt);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ /* Let the processor re-execute the fixed hypercall */
+ ctxt->_eip = ctxt->eip;
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt,
+ void (*get)(struct x86_emulate_ctxt *ctxt,
+ struct desc_ptr *ptr))
+{
+ struct desc_ptr desc_ptr;
+
+ if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+ ctxt->ops->cpl(ctxt) > 0)
+ return emulate_gp(ctxt, 0);
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
+ get(ctxt, &desc_ptr);
+ if (ctxt->op_bytes == 2) {
+ ctxt->op_bytes = 4;
+ desc_ptr.address &= 0x00ffffff;
+ }
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return segmented_write_std(ctxt, ctxt->dst.addr.mem,
+ &desc_ptr, 2 + ctxt->op_bytes);
+}
+
+static int em_sgdt(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt);
+}
+
+static int em_sidt(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt);
+}
+
+static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt)
+{
+ struct desc_ptr desc_ptr;
+ int rc;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
+ rc = read_descriptor(ctxt, ctxt->src.addr.mem,
+ &desc_ptr.size, &desc_ptr.address,
+ ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ if (ctxt->mode == X86EMUL_MODE_PROT64 &&
+ emul_is_noncanonical_address(desc_ptr.address, ctxt))
+ return emulate_gp(ctxt, 0);
+ if (lgdt)
+ ctxt->ops->set_gdt(ctxt, &desc_ptr);
+ else
+ ctxt->ops->set_idt(ctxt, &desc_ptr);
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lgdt(struct x86_emulate_ctxt *ctxt)
+{
+ return em_lgdt_lidt(ctxt, true);
+}
+
+static int em_lidt(struct x86_emulate_ctxt *ctxt)
+{
+ return em_lgdt_lidt(ctxt, false);
+}
+
+static int em_smsw(struct x86_emulate_ctxt *ctxt)
+{
+ if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+ ctxt->ops->cpl(ctxt) > 0)
+ return emulate_gp(ctxt, 0);
+
+ if (ctxt->dst.type == OP_MEM)
+ ctxt->dst.bytes = 2;
+ ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lmsw(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
+ | (ctxt->src.val & 0x0f));
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_loop(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ register_address_increment(ctxt, VCPU_REGS_RCX, -1);
+ if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) &&
+ (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
+ rc = jmp_rel(ctxt, ctxt->src.val);
+
+ return rc;
+}
+
+static int em_jcxz(struct x86_emulate_ctxt *ctxt)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0)
+ rc = jmp_rel(ctxt, ctxt->src.val);
+
+ return rc;
+}
+
+static int em_in(struct x86_emulate_ctxt *ctxt)
+{
+ if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
+ &ctxt->dst.val))
+ return X86EMUL_IO_NEEDED;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_out(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
+ &ctxt->src.val, 1);
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cli(struct x86_emulate_ctxt *ctxt)
+{
+ if (emulator_bad_iopl(ctxt))
+ return emulate_gp(ctxt, 0);
+
+ ctxt->eflags &= ~X86_EFLAGS_IF;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_sti(struct x86_emulate_ctxt *ctxt)
+{
+ if (emulator_bad_iopl(ctxt))
+ return emulate_gp(ctxt, 0);
+
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
+ ctxt->eflags |= X86_EFLAGS_IF;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cpuid(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax, ebx, ecx, edx;
+ u64 msr = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_MISC_FEATURES_ENABLES, &msr);
+ if (msr & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
+ ctxt->ops->cpl(ctxt)) {
+ return emulate_gp(ctxt, 0);
+ }
+
+ eax = reg_read(ctxt, VCPU_REGS_RAX);
+ ecx = reg_read(ctxt, VCPU_REGS_RCX);
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true);
+ *reg_write(ctxt, VCPU_REGS_RAX) = eax;
+ *reg_write(ctxt, VCPU_REGS_RBX) = ebx;
+ *reg_write(ctxt, VCPU_REGS_RCX) = ecx;
+ *reg_write(ctxt, VCPU_REGS_RDX) = edx;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_sahf(struct x86_emulate_ctxt *ctxt)
+{
+ u32 flags;
+
+ flags = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+ X86_EFLAGS_SF;
+ flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8;
+
+ ctxt->eflags &= ~0xffUL;
+ ctxt->eflags |= flags | X86_EFLAGS_FIXED;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_lahf(struct x86_emulate_ctxt *ctxt)
+{
+ *reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL;
+ *reg_rmw(ctxt, VCPU_REGS_RAX) |= (ctxt->eflags & 0xff) << 8;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_bswap(struct x86_emulate_ctxt *ctxt)
+{
+ switch (ctxt->op_bytes) {
+#ifdef CONFIG_X86_64
+ case 8:
+ asm("bswap %0" : "+r"(ctxt->dst.val));
+ break;
+#endif
+ default:
+ asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val));
+ break;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int em_clflush(struct x86_emulate_ctxt *ctxt)
+{
+ /* emulating clflush regardless of cpuid */
+ return X86EMUL_CONTINUE;
+}
+
+static int em_clflushopt(struct x86_emulate_ctxt *ctxt)
+{
+ /* emulating clflushopt regardless of cpuid */
+ return X86EMUL_CONTINUE;
+}
+
+static int em_movsxd(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.val = (s32) ctxt->src.val;
+ return X86EMUL_CONTINUE;
+}
+
+static int check_fxsr(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax = 1, ebx, ecx = 0, edx;
+
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
+ if (!(edx & FFL(FXSR)))
+ return emulate_ud(ctxt);
+
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ /*
+ * Don't emulate a case that should never be hit, instead of working
+ * around a lack of fxsave64/fxrstor64 on old compilers.
+ */
+ if (ctxt->mode >= X86EMUL_MODE_PROT64)
+ return X86EMUL_UNHANDLEABLE;
+
+ return X86EMUL_CONTINUE;
+}
+
+/*
+ * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but does save
+ * and restore MXCSR.
+ */
+static size_t __fxstate_size(int nregs)
+{
+ return offsetof(struct fxregs_state, xmm_space[0]) + nregs * 16;
+}
+
+static inline size_t fxstate_size(struct x86_emulate_ctxt *ctxt)
+{
+ bool cr4_osfxsr;
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return __fxstate_size(16);
+
+ cr4_osfxsr = ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR;
+ return __fxstate_size(cr4_osfxsr ? 8 : 0);
+}
+
+/*
+ * FXSAVE and FXRSTOR have 4 different formats depending on execution mode,
+ * 1) 16 bit mode
+ * 2) 32 bit mode
+ * - like (1), but FIP and FDP (foo) are only 16 bit. At least Intel CPUs
+ * preserve whole 32 bit values, though, so (1) and (2) are the same wrt.
+ * save and restore
+ * 3) 64-bit mode with REX.W prefix
+ * - like (2), but XMM 8-15 are being saved and restored
+ * 4) 64-bit mode without REX.W prefix
+ * - like (3), but FIP and FDP are 64 bit
+ *
+ * Emulation uses (3) for (1) and (2) and preserves XMM 8-15 to reach the
+ * desired result. (4) is not emulated.
+ *
+ * Note: Guest and host CPUID.(EAX=07H,ECX=0H):EBX[bit 13] (deprecate FPU CS
+ * and FPU DS) should match.
+ */
+static int em_fxsave(struct x86_emulate_ctxt *ctxt)
+{
+ struct fxregs_state fx_state;
+ int rc;
+
+ rc = check_fxsr(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state,
+ fxstate_size(ctxt));
+}
+
+/*
+ * FXRSTOR might restore XMM registers not provided by the guest. Fill
+ * in the host registers (via FXSAVE) instead, so they won't be modified.
+ * (preemption has to stay disabled until FXRSTOR).
+ *
+ * Use noinline to keep the stack for other functions called by callers small.
+ */
+static noinline int fxregs_fixup(struct fxregs_state *fx_state,
+ const size_t used_size)
+{
+ struct fxregs_state fx_tmp;
+ int rc;
+
+ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_tmp));
+ memcpy((void *)fx_state + used_size, (void *)&fx_tmp + used_size,
+ __fxstate_size(16) - used_size);
+
+ return rc;
+}
+
+static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
+{
+ struct fxregs_state fx_state;
+ int rc;
+ size_t size;
+
+ rc = check_fxsr(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ size = fxstate_size(ctxt);
+ rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (size < __fxstate_size(16)) {
+ rc = fxregs_fixup(&fx_state, size);
+ if (rc != X86EMUL_CONTINUE)
+ goto out;
+ }
+
+ if (fx_state.mxcsr >> 16) {
+ rc = emulate_gp(ctxt, 0);
+ goto out;
+ }
+
+ if (rc == X86EMUL_CONTINUE)
+ rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
+
+out:
+ return rc;
+}
+
+static bool valid_cr(int nr)
+{
+ switch (nr) {
+ case 0:
+ case 2 ... 4:
+ case 8:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int check_cr_read(struct x86_emulate_ctxt *ctxt)
+{
+ if (!valid_cr(ctxt->modrm_reg))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_cr_write(struct x86_emulate_ctxt *ctxt)
+{
+ u64 new_val = ctxt->src.val64;
+ int cr = ctxt->modrm_reg;
+ u64 efer = 0;
+
+ static u64 cr_reserved_bits[] = {
+ 0xffffffff00000000ULL,
+ 0, 0, 0, /* CR3 checked later */
+ CR4_RESERVED_BITS,
+ 0, 0, 0,
+ CR8_RESERVED_BITS,
+ };
+
+ if (!valid_cr(cr))
+ return emulate_ud(ctxt);
+
+ if (new_val & cr_reserved_bits[cr])
+ return emulate_gp(ctxt, 0);
+
+ switch (cr) {
+ case 0: {
+ u64 cr4;
+ if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) ||
+ ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD)))
+ return emulate_gp(ctxt, 0);
+
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if ((new_val & X86_CR0_PG) && (efer & EFER_LME) &&
+ !(cr4 & X86_CR4_PAE))
+ return emulate_gp(ctxt, 0);
+
+ break;
+ }
+ case 3: {
+ u64 rsvd = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (efer & EFER_LMA) {
+ u64 maxphyaddr;
+ u32 eax, ebx, ecx, edx;
+
+ eax = 0x80000008;
+ ecx = 0;
+ if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx,
+ &edx, false))
+ maxphyaddr = eax & 0xff;
+ else
+ maxphyaddr = 36;
+ rsvd = rsvd_bits(maxphyaddr, 63);
+ if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PCIDE)
+ rsvd &= ~X86_CR3_PCID_NOFLUSH;
+ }
+
+ if (new_val & rsvd)
+ return emulate_gp(ctxt, 0);
+
+ break;
+ }
+ case 4: {
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE))
+ return emulate_gp(ctxt, 0);
+
+ break;
+ }
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long dr7;
+
+ ctxt->ops->get_dr(ctxt, 7, &dr7);
+
+ /* Check if DR7.Global_Enable is set */
+ return dr7 & (1 << 13);
+}
+
+static int check_dr_read(struct x86_emulate_ctxt *ctxt)
+{
+ int dr = ctxt->modrm_reg;
+ u64 cr4;
+
+ if (dr > 7)
+ return emulate_ud(ctxt);
+
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
+ return emulate_ud(ctxt);
+
+ if (check_dr7_gd(ctxt)) {
+ ulong dr6;
+
+ ctxt->ops->get_dr(ctxt, 6, &dr6);
+ dr6 &= ~15;
+ dr6 |= DR6_BD | DR6_RTM;
+ ctxt->ops->set_dr(ctxt, 6, dr6);
+ return emulate_db(ctxt);
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_dr_write(struct x86_emulate_ctxt *ctxt)
+{
+ u64 new_val = ctxt->src.val64;
+ int dr = ctxt->modrm_reg;
+
+ if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL))
+ return emulate_gp(ctxt, 0);
+
+ return check_dr_read(ctxt);
+}
+
+static int check_svme(struct x86_emulate_ctxt *ctxt)
+{
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if (!(efer & EFER_SVME))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
+{
+ u64 rax = reg_read(ctxt, VCPU_REGS_RAX);
+
+ /* Valid physical address? */
+ if (rax & 0xffff000000000000ULL)
+ return emulate_gp(ctxt, 0);
+
+ return check_svme(ctxt);
+}
+
+static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
+
+ if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt))
+ return emulate_ud(ctxt);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
+ u64 rcx = reg_read(ctxt, VCPU_REGS_RCX);
+
+ /*
+ * VMware allows access to these Pseduo-PMCs even when read via RDPMC
+ * in Ring3 when CR4.PCE=0.
+ */
+ if (enable_vmware_backdoor && is_vmware_backdoor_pmc(rcx))
+ return X86EMUL_CONTINUE;
+
+ if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
+ ctxt->ops->check_pmc(ctxt, rcx))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_perm_in(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.bytes = min(ctxt->dst.bytes, 4u);
+ if (!emulator_io_permited(ctxt, ctxt->src.val, ctxt->dst.bytes))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int check_perm_out(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->src.bytes = min(ctxt->src.bytes, 4u);
+ if (!emulator_io_permited(ctxt, ctxt->dst.val, ctxt->src.bytes))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+#define D(_y) { .flags = (_y) }
+#define DI(_y, _i) { .flags = (_y)|Intercept, .intercept = x86_intercept_##_i }
+#define DIP(_y, _i, _p) { .flags = (_y)|Intercept|CheckPerm, \
+ .intercept = x86_intercept_##_i, .check_perm = (_p) }
+#define N D(NotImpl)
+#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
+#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
+#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
+#define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) }
+#define MD(_f, _m) { .flags = ((_f) | ModeDual), .u.mdual = (_m) }
+#define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) }
+#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
+#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
+#define II(_f, _e, _i) \
+ { .flags = (_f)|Intercept, .u.execute = (_e), .intercept = x86_intercept_##_i }
+#define IIP(_f, _e, _i, _p) \
+ { .flags = (_f)|Intercept|CheckPerm, .u.execute = (_e), \
+ .intercept = x86_intercept_##_i, .check_perm = (_p) }
+#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) }
+
+#define D2bv(_f) D((_f) | ByteOp), D(_f)
+#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
+#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
+#define F2bv(_f, _e) F((_f) | ByteOp, _e), F(_f, _e)
+#define I2bvIP(_f, _e, _i, _p) \
+ IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p)
+
+#define F6ALU(_f, _e) F2bv((_f) | DstMem | SrcReg | ModRM, _e), \
+ F2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
+ F2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
+
+static const struct opcode group7_rm0[] = {
+ N,
+ I(SrcNone | Priv | EmulateOnUD, em_hypercall),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group7_rm1[] = {
+ DI(SrcNone | Priv, monitor),
+ DI(SrcNone | Priv, mwait),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group7_rm3[] = {
+ DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa),
+ II(SrcNone | Prot | EmulateOnUD, em_hypercall, vmmcall),
+ DIP(SrcNone | Prot | Priv, vmload, check_svme_pa),
+ DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa),
+ DIP(SrcNone | Prot | Priv, stgi, check_svme),
+ DIP(SrcNone | Prot | Priv, clgi, check_svme),
+ DIP(SrcNone | Prot | Priv, skinit, check_svme),
+ DIP(SrcNone | Prot | Priv, invlpga, check_svme),
+};
+
+static const struct opcode group7_rm7[] = {
+ N,
+ DIP(SrcNone, rdtscp, check_rdtsc),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group1[] = {
+ F(Lock, em_add),
+ F(Lock | PageTable, em_or),
+ F(Lock, em_adc),
+ F(Lock, em_sbb),
+ F(Lock | PageTable, em_and),
+ F(Lock, em_sub),
+ F(Lock, em_xor),
+ F(NoWrite, em_cmp),
+};
+
+static const struct opcode group1A[] = {
+ I(DstMem | SrcNone | Mov | Stack | IncSP | TwoMemOp, em_pop), N, N, N, N, N, N, N,
+};
+
+static const struct opcode group2[] = {
+ F(DstMem | ModRM, em_rol),
+ F(DstMem | ModRM, em_ror),
+ F(DstMem | ModRM, em_rcl),
+ F(DstMem | ModRM, em_rcr),
+ F(DstMem | ModRM, em_shl),
+ F(DstMem | ModRM, em_shr),
+ F(DstMem | ModRM, em_shl),
+ F(DstMem | ModRM, em_sar),
+};
+
+static const struct opcode group3[] = {
+ F(DstMem | SrcImm | NoWrite, em_test),
+ F(DstMem | SrcImm | NoWrite, em_test),
+ F(DstMem | SrcNone | Lock, em_not),
+ F(DstMem | SrcNone | Lock, em_neg),
+ F(DstXacc | Src2Mem, em_mul_ex),
+ F(DstXacc | Src2Mem, em_imul_ex),
+ F(DstXacc | Src2Mem, em_div_ex),
+ F(DstXacc | Src2Mem, em_idiv_ex),
+};
+
+static const struct opcode group4[] = {
+ F(ByteOp | DstMem | SrcNone | Lock, em_inc),
+ F(ByteOp | DstMem | SrcNone | Lock, em_dec),
+ N, N, N, N, N, N,
+};
+
+static const struct opcode group5[] = {
+ F(DstMem | SrcNone | Lock, em_inc),
+ F(DstMem | SrcNone | Lock, em_dec),
+ I(SrcMem | NearBranch, em_call_near_abs),
+ I(SrcMemFAddr | ImplicitOps, em_call_far),
+ I(SrcMem | NearBranch, em_jmp_abs),
+ I(SrcMemFAddr | ImplicitOps, em_jmp_far),
+ I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined),
+};
+
+static const struct opcode group6[] = {
+ II(Prot | DstMem, em_sldt, sldt),
+ II(Prot | DstMem, em_str, str),
+ II(Prot | Priv | SrcMem16, em_lldt, lldt),
+ II(Prot | Priv | SrcMem16, em_ltr, ltr),
+ N, N, N, N,
+};
+
+static const struct group_dual group7 = { {
+ II(Mov | DstMem, em_sgdt, sgdt),
+ II(Mov | DstMem, em_sidt, sidt),
+ II(SrcMem | Priv, em_lgdt, lgdt),
+ II(SrcMem | Priv, em_lidt, lidt),
+ II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
+ II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
+ II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
+}, {
+ EXT(0, group7_rm0),
+ EXT(0, group7_rm1),
+ N, EXT(0, group7_rm3),
+ II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
+ II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
+ EXT(0, group7_rm7),
+} };
+
+static const struct opcode group8[] = {
+ N, N, N, N,
+ F(DstMem | SrcImmByte | NoWrite, em_bt),
+ F(DstMem | SrcImmByte | Lock | PageTable, em_bts),
+ F(DstMem | SrcImmByte | Lock, em_btr),
+ F(DstMem | SrcImmByte | Lock | PageTable, em_btc),
+};
+
+/*
+ * The "memory" destination is actually always a register, since we come
+ * from the register case of group9.
+ */
+static const struct gprefix pfx_0f_c7_7 = {
+ N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdtscp),
+};
+
+
+static const struct group_dual group9 = { {
+ N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
+}, {
+ N, N, N, N, N, N, N,
+ GP(0, &pfx_0f_c7_7),
+} };
+
+static const struct opcode group11[] = {
+ I(DstMem | SrcImm | Mov | PageTable, em_mov),
+ X7(D(Undefined)),
+};
+
+static const struct gprefix pfx_0f_ae_7 = {
+ I(SrcMem | ByteOp, em_clflush), I(SrcMem | ByteOp, em_clflushopt), N, N,
+};
+
+static const struct group_dual group15 = { {
+ I(ModRM | Aligned16, em_fxsave),
+ I(ModRM | Aligned16, em_fxrstor),
+ N, N, N, N, N, GP(0, &pfx_0f_ae_7),
+}, {
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct gprefix pfx_0f_6f_0f_7f = {
+ I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
+};
+
+static const struct instr_dual instr_dual_0f_2b = {
+ I(0, em_mov), N
+};
+
+static const struct gprefix pfx_0f_2b = {
+ ID(0, &instr_dual_0f_2b), ID(0, &instr_dual_0f_2b), N, N,
+};
+
+static const struct gprefix pfx_0f_10_0f_11 = {
+ I(Unaligned, em_mov), I(Unaligned, em_mov), N, N,
+};
+
+static const struct gprefix pfx_0f_28_0f_29 = {
+ I(Aligned, em_mov), I(Aligned, em_mov), N, N,
+};
+
+static const struct gprefix pfx_0f_e7 = {
+ N, I(Sse, em_mov), N, N,
+};
+
+static const struct escape escape_d9 = { {
+ N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstcw),
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+ /* 0xD0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct escape escape_db = { {
+ N, N, N, N, N, N, N, N,
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+ /* 0xD0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, I(ImplicitOps, em_fninit), N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct escape escape_dd = { {
+ N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstsw),
+}, {
+ /* 0xC0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ N, N, N, N, N, N, N, N,
+ /* 0xD0 - 0xC7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xD8 - 0xDF */
+ N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xE7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xE8 - 0xEF */
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xF7 */
+ N, N, N, N, N, N, N, N,
+ /* 0xF8 - 0xFF */
+ N, N, N, N, N, N, N, N,
+} };
+
+static const struct instr_dual instr_dual_0f_c3 = {
+ I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N
+};
+
+static const struct mode_dual mode_dual_63 = {
+ N, I(DstReg | SrcMem32 | ModRM | Mov, em_movsxd)
+};
+
+static const struct opcode opcode_table[256] = {
+ /* 0x00 - 0x07 */
+ F6ALU(Lock, em_add),
+ I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
+ I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
+ /* 0x08 - 0x0F */
+ F6ALU(Lock | PageTable, em_or),
+ I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
+ N,
+ /* 0x10 - 0x17 */
+ F6ALU(Lock, em_adc),
+ I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg),
+ I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg),
+ /* 0x18 - 0x1F */
+ F6ALU(Lock, em_sbb),
+ I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
+ I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
+ /* 0x20 - 0x27 */
+ F6ALU(Lock | PageTable, em_and), N, N,
+ /* 0x28 - 0x2F */
+ F6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
+ /* 0x30 - 0x37 */
+ F6ALU(Lock, em_xor), N, N,
+ /* 0x38 - 0x3F */
+ F6ALU(NoWrite, em_cmp), N, N,
+ /* 0x40 - 0x4F */
+ X8(F(DstReg, em_inc)), X8(F(DstReg, em_dec)),
+ /* 0x50 - 0x57 */
+ X8(I(SrcReg | Stack, em_push)),
+ /* 0x58 - 0x5F */
+ X8(I(DstReg | Stack, em_pop)),
+ /* 0x60 - 0x67 */
+ I(ImplicitOps | Stack | No64, em_pusha),
+ I(ImplicitOps | Stack | No64, em_popa),
+ N, MD(ModRM, &mode_dual_63),
+ N, N, N, N,
+ /* 0x68 - 0x6F */
+ I(SrcImm | Mov | Stack, em_push),
+ I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
+ I(SrcImmByte | Mov | Stack, em_push),
+ I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
+ I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
+ I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
+ /* 0x70 - 0x7F */
+ X16(D(SrcImmByte | NearBranch)),
+ /* 0x80 - 0x87 */
+ G(ByteOp | DstMem | SrcImm, group1),
+ G(DstMem | SrcImm, group1),
+ G(ByteOp | DstMem | SrcImm | No64, group1),
+ G(DstMem | SrcImmByte, group1),
+ F2bv(DstMem | SrcReg | ModRM | NoWrite, em_test),
+ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
+ /* 0x88 - 0x8F */
+ I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov),
+ I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
+ I(DstMem | SrcNone | ModRM | Mov | PageTable, em_mov_rm_sreg),
+ D(ModRM | SrcMem | NoAccess | DstReg),
+ I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm),
+ G(0, group1A),
+ /* 0x90 - 0x97 */
+ DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
+ /* 0x98 - 0x9F */
+ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
+ I(SrcImmFAddr | No64, em_call_far), N,
+ II(ImplicitOps | Stack, em_pushf, pushf),
+ II(ImplicitOps | Stack, em_popf, popf),
+ I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf),
+ /* 0xA0 - 0xA7 */
+ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
+ I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
+ I2bv(SrcSI | DstDI | Mov | String | TwoMemOp, em_mov),
+ F2bv(SrcSI | DstDI | String | NoWrite | TwoMemOp, em_cmp_r),
+ /* 0xA8 - 0xAF */
+ F2bv(DstAcc | SrcImm | NoWrite, em_test),
+ I2bv(SrcAcc | DstDI | Mov | String, em_mov),
+ I2bv(SrcSI | DstAcc | Mov | String, em_mov),
+ F2bv(SrcAcc | DstDI | String | NoWrite, em_cmp_r),
+ /* 0xB0 - 0xB7 */
+ X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
+ /* 0xB8 - 0xBF */
+ X8(I(DstReg | SrcImm64 | Mov, em_mov)),
+ /* 0xC0 - 0xC7 */
+ G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2),
+ I(ImplicitOps | NearBranch | SrcImmU16, em_ret_near_imm),
+ I(ImplicitOps | NearBranch, em_ret),
+ I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
+ I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
+ G(ByteOp, group11), G(0, group11),
+ /* 0xC8 - 0xCF */
+ I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave),
+ I(ImplicitOps | SrcImmU16, em_ret_far_imm),
+ I(ImplicitOps, em_ret_far),
+ D(ImplicitOps), DI(SrcImmByte, intn),
+ D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
+ /* 0xD0 - 0xD7 */
+ G(Src2One | ByteOp, group2), G(Src2One, group2),
+ G(Src2CL | ByteOp, group2), G(Src2CL, group2),
+ I(DstAcc | SrcImmUByte | No64, em_aam),
+ I(DstAcc | SrcImmUByte | No64, em_aad),
+ F(DstAcc | ByteOp | No64, em_salc),
+ I(DstAcc | SrcXLat | ByteOp, em_mov),
+ /* 0xD8 - 0xDF */
+ N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N,
+ /* 0xE0 - 0xE7 */
+ X3(I(SrcImmByte | NearBranch, em_loop)),
+ I(SrcImmByte | NearBranch, em_jcxz),
+ I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in),
+ I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out),
+ /* 0xE8 - 0xEF */
+ I(SrcImm | NearBranch, em_call), D(SrcImm | ImplicitOps | NearBranch),
+ I(SrcImmFAddr | No64, em_jmp_far),
+ D(SrcImmByte | ImplicitOps | NearBranch),
+ I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in),
+ I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out),
+ /* 0xF0 - 0xF7 */
+ N, DI(ImplicitOps, icebp), N, N,
+ DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
+ G(ByteOp, group3), G(0, group3),
+ /* 0xF8 - 0xFF */
+ D(ImplicitOps), D(ImplicitOps),
+ I(ImplicitOps, em_cli), I(ImplicitOps, em_sti),
+ D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
+};
+
+static const struct opcode twobyte_table[256] = {
+ /* 0x00 - 0x0F */
+ G(0, group6), GD(0, &group7), N, N,
+ N, I(ImplicitOps | EmulateOnUD, em_syscall),
+ II(ImplicitOps | Priv, em_clts, clts), N,
+ DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
+ N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
+ /* 0x10 - 0x1F */
+ GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_10_0f_11),
+ GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_10_0f_11),
+ N, N, N, N, N, N,
+ D(ImplicitOps | ModRM | SrcMem | NoAccess),
+ N, N, N, N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess),
+ /* 0x20 - 0x2F */
+ DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read),
+ DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read),
+ IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_cr_write, cr_write,
+ check_cr_write),
+ IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write,
+ check_dr_write),
+ N, N, N, N,
+ GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29),
+ GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29),
+ N, GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_2b),
+ N, N, N, N,
+ /* 0x30 - 0x3F */
+ II(ImplicitOps | Priv, em_wrmsr, wrmsr),
+ IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
+ II(ImplicitOps | Priv, em_rdmsr, rdmsr),
+ IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc),
+ I(ImplicitOps | EmulateOnUD, em_sysenter),
+ I(ImplicitOps | Priv | EmulateOnUD, em_sysexit),
+ N, N,
+ N, N, N, N, N, N, N, N,
+ /* 0x40 - 0x4F */
+ X16(D(DstReg | SrcMem | ModRM)),
+ /* 0x50 - 0x5F */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ /* 0x60 - 0x6F */
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f),
+ /* 0x70 - 0x7F */
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, N,
+ N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f),
+ /* 0x80 - 0x8F */
+ X16(D(SrcImm | NearBranch)),
+ /* 0x90 - 0x9F */
+ X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
+ /* 0xA0 - 0xA7 */
+ I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
+ II(ImplicitOps, em_cpuid, cpuid),
+ F(DstMem | SrcReg | ModRM | BitOp | NoWrite, em_bt),
+ F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shld),
+ F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
+ /* 0xA8 - 0xAF */
+ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
+ II(EmulateOnUD | ImplicitOps, em_rsm, rsm),
+ F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
+ F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
+ F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
+ GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul),
+ /* 0xB0 - 0xB7 */
+ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable | SrcWrite, em_cmpxchg),
+ I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
+ F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
+ I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
+ I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
+ D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+ /* 0xB8 - 0xBF */
+ N, N,
+ G(BitOp, group8),
+ F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
+ I(DstReg | SrcMem | ModRM, em_bsf_c),
+ I(DstReg | SrcMem | ModRM, em_bsr_c),
+ D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
+ /* 0xC0 - 0xC7 */
+ F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd),
+ N, ID(0, &instr_dual_0f_c3),
+ N, N, N, GD(0, &group9),
+ /* 0xC8 - 0xCF */
+ X8(I(DstReg, em_bswap)),
+ /* 0xD0 - 0xDF */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ /* 0xE0 - 0xEF */
+ N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7),
+ N, N, N, N, N, N, N, N,
+ /* 0xF0 - 0xFF */
+ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
+};
+
+static const struct instr_dual instr_dual_0f_38_f0 = {
+ I(DstReg | SrcMem | Mov, em_movbe), N
+};
+
+static const struct instr_dual instr_dual_0f_38_f1 = {
+ I(DstMem | SrcReg | Mov, em_movbe), N
+};
+
+static const struct gprefix three_byte_0f_38_f0 = {
+ ID(0, &instr_dual_0f_38_f0), N, N, N
+};
+
+static const struct gprefix three_byte_0f_38_f1 = {
+ ID(0, &instr_dual_0f_38_f1), N, N, N
+};
+
+/*
+ * Insns below are selected by the prefix which indexed by the third opcode
+ * byte.
+ */
+static const struct opcode opcode_map_0f_38[256] = {
+ /* 0x00 - 0x7f */
+ X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
+ /* 0x80 - 0xef */
+ X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
+ /* 0xf0 - 0xf1 */
+ GP(EmulateOnUD | ModRM, &three_byte_0f_38_f0),
+ GP(EmulateOnUD | ModRM, &three_byte_0f_38_f1),
+ /* 0xf2 - 0xff */
+ N, N, X4(N), X8(N)
+};
+
+#undef D
+#undef N
+#undef G
+#undef GD
+#undef I
+#undef GP
+#undef EXT
+#undef MD
+#undef ID
+
+#undef D2bv
+#undef D2bvIP
+#undef I2bv
+#undef I2bvIP
+#undef I6ALU
+
+static unsigned imm_size(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned size;
+
+ size = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ if (size == 8)
+ size = 4;
+ return size;
+}
+
+static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
+ unsigned size, bool sign_extension)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ op->type = OP_IMM;
+ op->bytes = size;
+ op->addr.mem.ea = ctxt->_eip;
+ /* NB. Immediates are sign-extended as necessary. */
+ switch (op->bytes) {
+ case 1:
+ op->val = insn_fetch(s8, ctxt);
+ break;
+ case 2:
+ op->val = insn_fetch(s16, ctxt);
+ break;
+ case 4:
+ op->val = insn_fetch(s32, ctxt);
+ break;
+ case 8:
+ op->val = insn_fetch(s64, ctxt);
+ break;
+ }
+ if (!sign_extension) {
+ switch (op->bytes) {
+ case 1:
+ op->val &= 0xff;
+ break;
+ case 2:
+ op->val &= 0xffff;
+ break;
+ case 4:
+ op->val &= 0xffffffff;
+ break;
+ }
+ }
+done:
+ return rc;
+}
+
+static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
+ unsigned d)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ switch (d) {
+ case OpReg:
+ decode_register_operand(ctxt, op);
+ break;
+ case OpImmUByte:
+ rc = decode_imm(ctxt, op, 1, false);
+ break;
+ case OpMem:
+ ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ mem_common:
+ *op = ctxt->memop;
+ ctxt->memopp = op;
+ if (ctxt->d & BitOp)
+ fetch_bit_operand(ctxt);
+ op->orig_val = op->val;
+ break;
+ case OpMem64:
+ ctxt->memop.bytes = (ctxt->op_bytes == 8) ? 16 : 8;
+ goto mem_common;
+ case OpAcc:
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ fetch_register_operand(op);
+ op->orig_val = op->val;
+ break;
+ case OpAccLo:
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ fetch_register_operand(op);
+ op->orig_val = op->val;
+ break;
+ case OpAccHi:
+ if (ctxt->d & ByteOp) {
+ op->type = OP_NONE;
+ break;
+ }
+ op->type = OP_REG;
+ op->bytes = ctxt->op_bytes;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
+ fetch_register_operand(op);
+ op->orig_val = op->val;
+ break;
+ case OpDI:
+ op->type = OP_MEM;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.mem.ea =
+ register_address(ctxt, VCPU_REGS_RDI);
+ op->addr.mem.seg = VCPU_SREG_ES;
+ op->val = 0;
+ op->count = 1;
+ break;
+ case OpDX:
+ op->type = OP_REG;
+ op->bytes = 2;
+ op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX);
+ fetch_register_operand(op);
+ break;
+ case OpCL:
+ op->type = OP_IMM;
+ op->bytes = 1;
+ op->val = reg_read(ctxt, VCPU_REGS_RCX) & 0xff;
+ break;
+ case OpImmByte:
+ rc = decode_imm(ctxt, op, 1, true);
+ break;
+ case OpOne:
+ op->type = OP_IMM;
+ op->bytes = 1;
+ op->val = 1;
+ break;
+ case OpImm:
+ rc = decode_imm(ctxt, op, imm_size(ctxt), true);
+ break;
+ case OpImm64:
+ rc = decode_imm(ctxt, op, ctxt->op_bytes, true);
+ break;
+ case OpMem8:
+ ctxt->memop.bytes = 1;
+ if (ctxt->memop.type == OP_REG) {
+ ctxt->memop.addr.reg = decode_register(ctxt,
+ ctxt->modrm_rm, true);
+ fetch_register_operand(&ctxt->memop);
+ }
+ goto mem_common;
+ case OpMem16:
+ ctxt->memop.bytes = 2;
+ goto mem_common;
+ case OpMem32:
+ ctxt->memop.bytes = 4;
+ goto mem_common;
+ case OpImmU16:
+ rc = decode_imm(ctxt, op, 2, false);
+ break;
+ case OpImmU:
+ rc = decode_imm(ctxt, op, imm_size(ctxt), false);
+ break;
+ case OpSI:
+ op->type = OP_MEM;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.mem.ea =
+ register_address(ctxt, VCPU_REGS_RSI);
+ op->addr.mem.seg = ctxt->seg_override;
+ op->val = 0;
+ op->count = 1;
+ break;
+ case OpXLat:
+ op->type = OP_MEM;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.mem.ea =
+ address_mask(ctxt,
+ reg_read(ctxt, VCPU_REGS_RBX) +
+ (reg_read(ctxt, VCPU_REGS_RAX) & 0xff));
+ op->addr.mem.seg = ctxt->seg_override;
+ op->val = 0;
+ break;
+ case OpImmFAddr:
+ op->type = OP_IMM;
+ op->addr.mem.ea = ctxt->_eip;
+ op->bytes = ctxt->op_bytes + 2;
+ insn_fetch_arr(op->valptr, op->bytes, ctxt);
+ break;
+ case OpMemFAddr:
+ ctxt->memop.bytes = ctxt->op_bytes + 2;
+ goto mem_common;
+ case OpES:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_ES;
+ break;
+ case OpCS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_CS;
+ break;
+ case OpSS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_SS;
+ break;
+ case OpDS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_DS;
+ break;
+ case OpFS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_FS;
+ break;
+ case OpGS:
+ op->type = OP_IMM;
+ op->val = VCPU_SREG_GS;
+ break;
+ case OpImplicit:
+ /* Special instructions do their own operand decoding. */
+ default:
+ op->type = OP_NONE; /* Disable writeback. */
+ break;
+ }
+
+done:
+ return rc;
+}
+
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
+{
+ int rc = X86EMUL_CONTINUE;
+ int mode = ctxt->mode;
+ int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
+ bool op_prefix = false;
+ bool has_seg_override = false;
+ struct opcode opcode;
+ u16 dummy;
+ struct desc_struct desc;
+
+ ctxt->memop.type = OP_NONE;
+ ctxt->memopp = NULL;
+ ctxt->_eip = ctxt->eip;
+ ctxt->fetch.ptr = ctxt->fetch.data;
+ ctxt->fetch.end = ctxt->fetch.data + insn_len;
+ ctxt->opcode_len = 1;
+ ctxt->intercept = x86_intercept_none;
+ if (insn_len > 0)
+ memcpy(ctxt->fetch.data, insn, insn_len);
+ else {
+ rc = __do_insn_fetch_bytes(ctxt, 1);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ }
+
+ switch (mode) {
+ case X86EMUL_MODE_REAL:
+ case X86EMUL_MODE_VM86:
+ def_op_bytes = def_ad_bytes = 2;
+ ctxt->ops->get_segment(ctxt, &dummy, &desc, NULL, VCPU_SREG_CS);
+ if (desc.d)
+ def_op_bytes = def_ad_bytes = 4;
+ break;
+ case X86EMUL_MODE_PROT16:
+ def_op_bytes = def_ad_bytes = 2;
+ break;
+ case X86EMUL_MODE_PROT32:
+ def_op_bytes = def_ad_bytes = 4;
+ break;
+#ifdef CONFIG_X86_64
+ case X86EMUL_MODE_PROT64:
+ def_op_bytes = 4;
+ def_ad_bytes = 8;
+ break;
+#endif
+ default:
+ return EMULATION_FAILED;
+ }
+
+ ctxt->op_bytes = def_op_bytes;
+ ctxt->ad_bytes = def_ad_bytes;
+
+ /* Legacy prefixes. */
+ for (;;) {
+ switch (ctxt->b = insn_fetch(u8, ctxt)) {
+ case 0x66: /* operand-size override */
+ op_prefix = true;
+ /* switch between 2/4 bytes */
+ ctxt->op_bytes = def_op_bytes ^ 6;
+ break;
+ case 0x67: /* address-size override */
+ if (mode == X86EMUL_MODE_PROT64)
+ /* switch between 4/8 bytes */
+ ctxt->ad_bytes = def_ad_bytes ^ 12;
+ else
+ /* switch between 2/4 bytes */
+ ctxt->ad_bytes = def_ad_bytes ^ 6;
+ break;
+ case 0x26: /* ES override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_ES;
+ break;
+ case 0x2e: /* CS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_CS;
+ break;
+ case 0x36: /* SS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_SS;
+ break;
+ case 0x3e: /* DS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_DS;
+ break;
+ case 0x64: /* FS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_FS;
+ break;
+ case 0x65: /* GS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_GS;
+ break;
+ case 0x40 ... 0x4f: /* REX */
+ if (mode != X86EMUL_MODE_PROT64)
+ goto done_prefixes;
+ ctxt->rex_prefix = ctxt->b;
+ continue;
+ case 0xf0: /* LOCK */
+ ctxt->lock_prefix = 1;
+ break;
+ case 0xf2: /* REPNE/REPNZ */
+ case 0xf3: /* REP/REPE/REPZ */
+ ctxt->rep_prefix = ctxt->b;
+ break;
+ default:
+ goto done_prefixes;
+ }
+
+ /* Any legacy prefix after a REX prefix nullifies its effect. */
+
+ ctxt->rex_prefix = 0;
+ }
+
+done_prefixes:
+
+ /* REX prefix. */
+ if (ctxt->rex_prefix & 8)
+ ctxt->op_bytes = 8; /* REX.W */
+
+ /* Opcode byte(s). */
+ opcode = opcode_table[ctxt->b];
+ /* Two-byte opcode? */
+ if (ctxt->b == 0x0f) {
+ ctxt->opcode_len = 2;
+ ctxt->b = insn_fetch(u8, ctxt);
+ opcode = twobyte_table[ctxt->b];
+
+ /* 0F_38 opcode map */
+ if (ctxt->b == 0x38) {
+ ctxt->opcode_len = 3;
+ ctxt->b = insn_fetch(u8, ctxt);
+ opcode = opcode_map_0f_38[ctxt->b];
+ }
+ }
+ ctxt->d = opcode.flags;
+
+ if (ctxt->d & ModRM)
+ ctxt->modrm = insn_fetch(u8, ctxt);
+
+ /* vex-prefix instructions are not implemented */
+ if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) &&
+ (mode == X86EMUL_MODE_PROT64 || (ctxt->modrm & 0xc0) == 0xc0)) {
+ ctxt->d = NotImpl;
+ }
+
+ while (ctxt->d & GroupMask) {
+ switch (ctxt->d & GroupMask) {
+ case Group:
+ goffset = (ctxt->modrm >> 3) & 7;
+ opcode = opcode.u.group[goffset];
+ break;
+ case GroupDual:
+ goffset = (ctxt->modrm >> 3) & 7;
+ if ((ctxt->modrm >> 6) == 3)
+ opcode = opcode.u.gdual->mod3[goffset];
+ else
+ opcode = opcode.u.gdual->mod012[goffset];
+ break;
+ case RMExt:
+ goffset = ctxt->modrm & 7;
+ opcode = opcode.u.group[goffset];
+ break;
+ case Prefix:
+ if (ctxt->rep_prefix && op_prefix)
+ return EMULATION_FAILED;
+ simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix;
+ switch (simd_prefix) {
+ case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
+ case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
+ case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break;
+ case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break;
+ }
+ break;
+ case Escape:
+ if (ctxt->modrm > 0xbf) {
+ size_t size = ARRAY_SIZE(opcode.u.esc->high);
+ u32 index = array_index_nospec(
+ ctxt->modrm - 0xc0, size);
+
+ opcode = opcode.u.esc->high[index];
+ } else {
+ opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
+ }
+ break;
+ case InstrDual:
+ if ((ctxt->modrm >> 6) == 3)
+ opcode = opcode.u.idual->mod3;
+ else
+ opcode = opcode.u.idual->mod012;
+ break;
+ case ModeDual:
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ opcode = opcode.u.mdual->mode64;
+ else
+ opcode = opcode.u.mdual->mode32;
+ break;
+ default:
+ return EMULATION_FAILED;
+ }
+
+ ctxt->d &= ~(u64)GroupMask;
+ ctxt->d |= opcode.flags;
+ }
+
+ /* Unrecognised? */
+ if (ctxt->d == 0)
+ return EMULATION_FAILED;
+
+ ctxt->execute = opcode.u.execute;
+
+ if (unlikely(ctxt->ud) && likely(!(ctxt->d & EmulateOnUD)))
+ return EMULATION_FAILED;
+
+ if (unlikely(ctxt->d &
+ (NotImpl|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm|NearBranch|
+ No16))) {
+ /*
+ * These are copied unconditionally here, and checked unconditionally
+ * in x86_emulate_insn.
+ */
+ ctxt->check_perm = opcode.check_perm;
+ ctxt->intercept = opcode.intercept;
+
+ if (ctxt->d & NotImpl)
+ return EMULATION_FAILED;
+
+ if (mode == X86EMUL_MODE_PROT64) {
+ if (ctxt->op_bytes == 4 && (ctxt->d & Stack))
+ ctxt->op_bytes = 8;
+ else if (ctxt->d & NearBranch)
+ ctxt->op_bytes = 8;
+ }
+
+ if (ctxt->d & Op3264) {
+ if (mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
+ else
+ ctxt->op_bytes = 4;
+ }
+
+ if ((ctxt->d & No16) && ctxt->op_bytes == 2)
+ ctxt->op_bytes = 4;
+
+ if (ctxt->d & Sse)
+ ctxt->op_bytes = 16;
+ else if (ctxt->d & Mmx)
+ ctxt->op_bytes = 8;
+ }
+
+ /* ModRM and SIB bytes. */
+ if (ctxt->d & ModRM) {
+ rc = decode_modrm(ctxt, &ctxt->memop);
+ if (!has_seg_override) {
+ has_seg_override = true;
+ ctxt->seg_override = ctxt->modrm_seg;
+ }
+ } else if (ctxt->d & MemAbs)
+ rc = decode_abs(ctxt, &ctxt->memop);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ if (!has_seg_override)
+ ctxt->seg_override = VCPU_SREG_DS;
+
+ ctxt->memop.addr.mem.seg = ctxt->seg_override;
+
+ /*
+ * Decode and fetch the source operand: register, memory
+ * or immediate.
+ */
+ rc = decode_operand(ctxt, &ctxt->src, (ctxt->d >> SrcShift) & OpMask);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ /*
+ * Decode and fetch the second source operand: register, memory
+ * or immediate.
+ */
+ rc = decode_operand(ctxt, &ctxt->src2, (ctxt->d >> Src2Shift) & OpMask);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ /* Decode and fetch the destination operand: register or memory. */
+ rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask);
+
+ if (ctxt->rip_relative && likely(ctxt->memopp))
+ ctxt->memopp->addr.mem.ea = address_mask(ctxt,
+ ctxt->memopp->addr.mem.ea + ctxt->_eip);
+
+done:
+ if (rc == X86EMUL_PROPAGATE_FAULT)
+ ctxt->have_exception = true;
+ return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
+}
+
+bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt)
+{
+ return ctxt->d & PageTable;
+}
+
+static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
+{
+ /* The second termination condition only applies for REPE
+ * and REPNE. Test if the repeat string operation prefix is
+ * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
+ * corresponding termination condition according to:
+ * - if REPE/REPZ and ZF = 0 then done
+ * - if REPNE/REPNZ and ZF = 1 then done
+ */
+ if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) ||
+ (ctxt->b == 0xae) || (ctxt->b == 0xaf))
+ && (((ctxt->rep_prefix == REPE_PREFIX) &&
+ ((ctxt->eflags & X86_EFLAGS_ZF) == 0))
+ || ((ctxt->rep_prefix == REPNE_PREFIX) &&
+ ((ctxt->eflags & X86_EFLAGS_ZF) == X86_EFLAGS_ZF))))
+ return true;
+
+ return false;
+}
+
+static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+
+ rc = asm_safe("fwait");
+
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return emulate_exception(ctxt, MF_VECTOR, 0, false);
+
+ return X86EMUL_CONTINUE;
+}
+
+static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
+ struct operand *op)
+{
+ if (op->type == OP_MM)
+ read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
+}
+
+static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
+{
+ ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
+
+ if (!(ctxt->d & ByteOp))
+ fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
+
+ asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n"
+ : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
+ [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT
+ : "c"(ctxt->src2.val));
+
+ ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
+ if (!fop) /* exception is returned in fop variable */
+ return emulate_de(ctxt);
+ return X86EMUL_CONTINUE;
+}
+
+void init_decode_cache(struct x86_emulate_ctxt *ctxt)
+{
+ memset(&ctxt->rip_relative, 0,
+ (void *)&ctxt->modrm - (void *)&ctxt->rip_relative);
+
+ ctxt->io_read.pos = 0;
+ ctxt->io_read.end = 0;
+ ctxt->mem_read.end = 0;
+}
+
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
+{
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ int rc = X86EMUL_CONTINUE;
+ int saved_dst_type = ctxt->dst.type;
+ unsigned emul_flags;
+
+ ctxt->mem_read.pos = 0;
+
+ /* LOCK prefix is allowed only with some instructions */
+ if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if ((ctxt->d & SrcMask) == SrcMemFAddr && ctxt->src.type != OP_MEM) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ emul_flags = ctxt->ops->get_hflags(ctxt);
+ if (unlikely(ctxt->d &
+ (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) {
+ if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
+ (ctxt->d & Undefined)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)))
+ || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
+ rc = emulate_nm(ctxt);
+ goto done;
+ }
+
+ if (ctxt->d & Mmx) {
+ rc = flush_pending_x87_faults(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ /*
+ * Now that we know the fpu is exception safe, we can fetch
+ * operands from it.
+ */
+ fetch_possible_mmx_operand(ctxt, &ctxt->src);
+ fetch_possible_mmx_operand(ctxt, &ctxt->src2);
+ if (!(ctxt->d & Mov))
+ fetch_possible_mmx_operand(ctxt, &ctxt->dst);
+ }
+
+ if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_PRE_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ /* Instruction can only be executed in protected mode */
+ if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
+ /* Privileged instruction can be executed only in CPL=0 */
+ if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
+ if (ctxt->d & PrivUD)
+ rc = emulate_ud(ctxt);
+ else
+ rc = emulate_gp(ctxt, 0);
+ goto done;
+ }
+
+ /* Do instruction specific permission checks */
+ if (ctxt->d & CheckPerm) {
+ rc = ctxt->check_perm(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_POST_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (ctxt->rep_prefix && (ctxt->d & String)) {
+ /* All REP prefixes have the same first termination condition */
+ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
+ string_registers_quirk(ctxt);
+ ctxt->eip = ctxt->_eip;
+ ctxt->eflags &= ~X86_EFLAGS_RF;
+ goto done;
+ }
+ }
+ }
+
+ if ((ctxt->src.type == OP_MEM) && !(ctxt->d & NoAccess)) {
+ rc = segmented_read(ctxt, ctxt->src.addr.mem,
+ ctxt->src.valptr, ctxt->src.bytes);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ ctxt->src.orig_val64 = ctxt->src.val64;
+ }
+
+ if (ctxt->src2.type == OP_MEM) {
+ rc = segmented_read(ctxt, ctxt->src2.addr.mem,
+ &ctxt->src2.val, ctxt->src2.bytes);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if ((ctxt->d & DstMask) == ImplicitOps)
+ goto special_insn;
+
+
+ if ((ctxt->dst.type == OP_MEM) && !(ctxt->d & Mov)) {
+ /* optimisation - avoid slow emulated read if Mov */
+ rc = segmented_read(ctxt, ctxt->dst.addr.mem,
+ &ctxt->dst.val, ctxt->dst.bytes);
+ if (rc != X86EMUL_CONTINUE) {
+ if (!(ctxt->d & NoWrite) &&
+ rc == X86EMUL_PROPAGATE_FAULT &&
+ ctxt->exception.vector == PF_VECTOR)
+ ctxt->exception.error_code |= PFERR_WRITE_MASK;
+ goto done;
+ }
+ }
+ /* Copy full 64-bit value for CMPXCHG8B. */
+ ctxt->dst.orig_val64 = ctxt->dst.val64;
+
+special_insn:
+
+ if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_POST_MEMACCESS);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (ctxt->rep_prefix && (ctxt->d & String))
+ ctxt->eflags |= X86_EFLAGS_RF;
+ else
+ ctxt->eflags &= ~X86_EFLAGS_RF;
+
+ if (ctxt->execute) {
+ if (ctxt->d & Fastop) {
+ void (*fop)(struct fastop *) = (void *)ctxt->execute;
+ rc = fastop(ctxt, fop);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ goto writeback;
+ }
+ rc = ctxt->execute(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ goto writeback;
+ }
+
+ if (ctxt->opcode_len == 2)
+ goto twobyte_insn;
+ else if (ctxt->opcode_len == 3)
+ goto threebyte_insn;
+
+ switch (ctxt->b) {
+ case 0x70 ... 0x7f: /* jcc (short) */
+ if (test_cc(ctxt->b, ctxt->eflags))
+ rc = jmp_rel(ctxt, ctxt->src.val);
+ break;
+ case 0x8d: /* lea r16/r32, m */
+ ctxt->dst.val = ctxt->src.addr.mem.ea;
+ break;
+ case 0x90 ... 0x97: /* nop / xchg reg, rax */
+ if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX))
+ ctxt->dst.type = OP_NONE;
+ else
+ rc = em_xchg(ctxt);
+ break;
+ case 0x98: /* cbw/cwde/cdqe */
+ switch (ctxt->op_bytes) {
+ case 2: ctxt->dst.val = (s8)ctxt->dst.val; break;
+ case 4: ctxt->dst.val = (s16)ctxt->dst.val; break;
+ case 8: ctxt->dst.val = (s32)ctxt->dst.val; break;
+ }
+ break;
+ case 0xcc: /* int3 */
+ rc = emulate_int(ctxt, 3);
+ break;
+ case 0xcd: /* int n */
+ rc = emulate_int(ctxt, ctxt->src.val);
+ break;
+ case 0xce: /* into */
+ if (ctxt->eflags & X86_EFLAGS_OF)
+ rc = emulate_int(ctxt, 4);
+ break;
+ case 0xe9: /* jmp rel */
+ case 0xeb: /* jmp rel short */
+ rc = jmp_rel(ctxt, ctxt->src.val);
+ ctxt->dst.type = OP_NONE; /* Disable writeback. */
+ break;
+ case 0xf4: /* hlt */
+ ctxt->ops->halt(ctxt);
+ break;
+ case 0xf5: /* cmc */
+ /* complement carry flag from eflags reg */
+ ctxt->eflags ^= X86_EFLAGS_CF;
+ break;
+ case 0xf8: /* clc */
+ ctxt->eflags &= ~X86_EFLAGS_CF;
+ break;
+ case 0xf9: /* stc */
+ ctxt->eflags |= X86_EFLAGS_CF;
+ break;
+ case 0xfc: /* cld */
+ ctxt->eflags &= ~X86_EFLAGS_DF;
+ break;
+ case 0xfd: /* std */
+ ctxt->eflags |= X86_EFLAGS_DF;
+ break;
+ default:
+ goto cannot_emulate;
+ }
+
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+writeback:
+ if (ctxt->d & SrcWrite) {
+ BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR);
+ rc = writeback(ctxt, &ctxt->src);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+ if (!(ctxt->d & NoWrite)) {
+ rc = writeback(ctxt, &ctxt->dst);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ /*
+ * restore dst type in case the decoding will be reused
+ * (happens for string instruction )
+ */
+ ctxt->dst.type = saved_dst_type;
+
+ if ((ctxt->d & SrcMask) == SrcSI)
+ string_addr_inc(ctxt, VCPU_REGS_RSI, &ctxt->src);
+
+ if ((ctxt->d & DstMask) == DstDI)
+ string_addr_inc(ctxt, VCPU_REGS_RDI, &ctxt->dst);
+
+ if (ctxt->rep_prefix && (ctxt->d & String)) {
+ unsigned int count;
+ struct read_cache *r = &ctxt->io_read;
+ if ((ctxt->d & SrcMask) == SrcSI)
+ count = ctxt->src.count;
+ else
+ count = ctxt->dst.count;
+ register_address_increment(ctxt, VCPU_REGS_RCX, -count);
+
+ if (!string_insn_completed(ctxt)) {
+ /*
+ * Re-enter guest when pio read ahead buffer is empty
+ * or, if it is not used, after each 1024 iteration.
+ */
+ if ((r->end != 0 || reg_read(ctxt, VCPU_REGS_RCX) & 0x3ff) &&
+ (r->end == 0 || r->end != r->pos)) {
+ /*
+ * Reset read cache. Usually happens before
+ * decode, but since instruction is restarted
+ * we have to do it here.
+ */
+ ctxt->mem_read.end = 0;
+ writeback_registers(ctxt);
+ return EMULATION_RESTART;
+ }
+ goto done; /* skip rip writeback */
+ }
+ ctxt->eflags &= ~X86_EFLAGS_RF;
+ }
+
+ ctxt->eip = ctxt->_eip;
+
+done:
+ if (rc == X86EMUL_PROPAGATE_FAULT) {
+ WARN_ON(ctxt->exception.vector > 0x1f);
+ ctxt->have_exception = true;
+ }
+ if (rc == X86EMUL_INTERCEPTED)
+ return EMULATION_INTERCEPTED;
+
+ if (rc == X86EMUL_CONTINUE)
+ writeback_registers(ctxt);
+
+ return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
+
+twobyte_insn:
+ switch (ctxt->b) {
+ case 0x09: /* wbinvd */
+ (ctxt->ops->wbinvd)(ctxt);
+ break;
+ case 0x08: /* invd */
+ case 0x0d: /* GrpP (prefetch) */
+ case 0x18: /* Grp16 (prefetch/nop) */
+ case 0x1f: /* nop */
+ break;
+ case 0x20: /* mov cr, reg */
+ ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg);
+ break;
+ case 0x21: /* mov from dr to reg */
+ ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
+ break;
+ case 0x40 ... 0x4f: /* cmov */
+ if (test_cc(ctxt->b, ctxt->eflags))
+ ctxt->dst.val = ctxt->src.val;
+ else if (ctxt->op_bytes != 4)
+ ctxt->dst.type = OP_NONE; /* no writeback */
+ break;
+ case 0x80 ... 0x8f: /* jnz rel, etc*/
+ if (test_cc(ctxt->b, ctxt->eflags))
+ rc = jmp_rel(ctxt, ctxt->src.val);
+ break;
+ case 0x90 ... 0x9f: /* setcc r/m8 */
+ ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
+ break;
+ case 0xb6 ... 0xb7: /* movzx */
+ ctxt->dst.bytes = ctxt->op_bytes;
+ ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val
+ : (u16) ctxt->src.val;
+ break;
+ case 0xbe ... 0xbf: /* movsx */
+ ctxt->dst.bytes = ctxt->op_bytes;
+ ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :
+ (s16) ctxt->src.val;
+ break;
+ default:
+ goto cannot_emulate;
+ }
+
+threebyte_insn:
+
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+
+ goto writeback;
+
+cannot_emulate:
+ return EMULATION_FAILED;
+}
+
+void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt)
+{
+ invalidate_registers(ctxt);
+}
+
+void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt)
+{
+ writeback_registers(ctxt);
+}
+
+bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->rep_prefix && (ctxt->d & String))
+ return false;
+
+ if (ctxt->d & TwoMemOp)
+ return false;
+
+ return true;
+}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
new file mode 100644
index 000000000..ca5a6c3f8
--- /dev/null
+++ b/arch/x86/kvm/hyperv.c
@@ -0,0 +1,1683 @@
+/*
+ * KVM Microsoft Hyper-V emulation
+ *
+ * derived from arch/x86/kvm/x86.c
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com>
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Amit Shah <amit.shah@qumranet.com>
+ * Ben-Ami Yassour <benami@il.ibm.com>
+ * Andrey Smetanin <asmetanin@virtuozzo.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "x86.h"
+#include "lapic.h"
+#include "ioapic.h"
+#include "hyperv.h"
+
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/sched/cputime.h>
+#include <linux/eventfd.h>
+
+#include <asm/apicdef.h>
+#include <trace/events/kvm.h>
+
+#include "trace.h"
+
+static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint)
+{
+ return atomic64_read(&synic->sint[sint]);
+}
+
+static inline int synic_get_sint_vector(u64 sint_value)
+{
+ if (sint_value & HV_SYNIC_SINT_MASKED)
+ return -1;
+ return sint_value & HV_SYNIC_SINT_VECTOR_MASK;
+}
+
+static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic,
+ int vector)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
+ return true;
+ }
+ return false;
+}
+
+static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
+ int vector)
+{
+ int i;
+ u64 sint_value;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ sint_value = synic_read_sint(synic, i);
+ if (synic_get_sint_vector(sint_value) == vector &&
+ sint_value & HV_SYNIC_SINT_AUTO_EOI)
+ return true;
+ }
+ return false;
+}
+
+static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
+ int vector)
+{
+ if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
+ return;
+
+ if (synic_has_vector_connected(synic, vector))
+ __set_bit(vector, synic->vec_bitmap);
+ else
+ __clear_bit(vector, synic->vec_bitmap);
+
+ if (synic_has_vector_auto_eoi(synic, vector))
+ __set_bit(vector, synic->auto_eoi_bitmap);
+ else
+ __clear_bit(vector, synic->auto_eoi_bitmap);
+}
+
+static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
+ u64 data, bool host)
+{
+ int vector, old_vector;
+ bool masked;
+
+ vector = data & HV_SYNIC_SINT_VECTOR_MASK;
+ masked = data & HV_SYNIC_SINT_MASKED;
+
+ /*
+ * Valid vectors are 16-255, however, nested Hyper-V attempts to write
+ * default '0x10000' value on boot and this should not #GP. We need to
+ * allow zero-initing the register from host as well.
+ */
+ if (vector < HV_SYNIC_FIRST_VALID_VECTOR && !host && !masked)
+ return 1;
+ /*
+ * Guest may configure multiple SINTs to use the same vector, so
+ * we maintain a bitmap of vectors handled by synic, and a
+ * bitmap of vectors with auto-eoi behavior. The bitmaps are
+ * updated here, and atomically queried on fast paths.
+ */
+ old_vector = synic_read_sint(synic, sint) & HV_SYNIC_SINT_VECTOR_MASK;
+
+ atomic64_set(&synic->sint[sint], data);
+
+ synic_update_vector(synic, old_vector);
+
+ synic_update_vector(synic, vector);
+
+ /* Load SynIC vectors into EOI exit bitmap */
+ kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic));
+ return 0;
+}
+
+static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
+{
+ struct kvm_vcpu *vcpu = NULL;
+ int i;
+
+ if (vpidx >= KVM_MAX_VCPUS)
+ return NULL;
+
+ vcpu = kvm_get_vcpu(kvm, vpidx);
+ if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
+ return vcpu;
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ if (vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
+ return vcpu;
+ return NULL;
+}
+
+static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vcpu_hv_synic *synic;
+
+ vcpu = get_vcpu_by_vpidx(kvm, vpidx);
+ if (!vcpu)
+ return NULL;
+ synic = vcpu_to_synic(vcpu);
+ return (synic->active) ? synic : NULL;
+}
+
+static void synic_clear_sint_msg_pending(struct kvm_vcpu_hv_synic *synic,
+ u32 sint)
+{
+ struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct page *page;
+ gpa_t gpa;
+ struct hv_message *msg;
+ struct hv_message_page *msg_page;
+
+ gpa = synic->msg_page & PAGE_MASK;
+ page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
+ if (is_error_page(page)) {
+ vcpu_err(vcpu, "Hyper-V SynIC can't get msg page, gpa 0x%llx\n",
+ gpa);
+ return;
+ }
+ msg_page = kmap_atomic(page);
+
+ msg = &msg_page->sint_message[sint];
+ msg->header.message_flags.msg_pending = 0;
+
+ kunmap_atomic(msg_page);
+ kvm_release_page_dirty(page);
+ kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+}
+
+static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv_stimer *stimer;
+ int gsi, idx, stimers_pending;
+
+ trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint);
+
+ if (synic->msg_page & HV_SYNIC_SIMP_ENABLE)
+ synic_clear_sint_msg_pending(synic, sint);
+
+ /* Try to deliver pending Hyper-V SynIC timers messages */
+ stimers_pending = 0;
+ for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) {
+ stimer = &hv_vcpu->stimer[idx];
+ if (stimer->msg_pending &&
+ (stimer->config & HV_STIMER_ENABLE) &&
+ HV_STIMER_SINT(stimer->config) == sint) {
+ set_bit(stimer->index,
+ hv_vcpu->stimer_pending_bitmap);
+ stimers_pending++;
+ }
+ }
+ if (stimers_pending)
+ kvm_make_request(KVM_REQ_HV_STIMER, vcpu);
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = atomic_read(&synic->sint_to_gsi[sint]);
+ if (gsi != -1)
+ kvm_notify_acked_gsi(kvm, gsi);
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
+{
+ struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+
+ hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC;
+ hv_vcpu->exit.u.synic.msr = msr;
+ hv_vcpu->exit.u.synic.control = synic->control;
+ hv_vcpu->exit.u.synic.evt_page = synic->evt_page;
+ hv_vcpu->exit.u.synic.msg_page = synic->msg_page;
+
+ kvm_make_request(KVM_REQ_HV_EXIT, vcpu);
+}
+
+static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
+ u32 msr, u64 data, bool host)
+{
+ struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ int ret;
+
+ if (!synic->active && (!host || data))
+ return 1;
+
+ trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host);
+
+ ret = 0;
+ switch (msr) {
+ case HV_X64_MSR_SCONTROL:
+ synic->control = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_SVERSION:
+ if (!host) {
+ ret = 1;
+ break;
+ }
+ synic->version = data;
+ break;
+ case HV_X64_MSR_SIEFP:
+ if ((data & HV_SYNIC_SIEFP_ENABLE) && !host &&
+ !synic->dont_zero_synic_pages)
+ if (kvm_clear_guest(vcpu->kvm,
+ data & PAGE_MASK, PAGE_SIZE)) {
+ ret = 1;
+ break;
+ }
+ synic->evt_page = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_SIMP:
+ if ((data & HV_SYNIC_SIMP_ENABLE) && !host &&
+ !synic->dont_zero_synic_pages)
+ if (kvm_clear_guest(vcpu->kvm,
+ data & PAGE_MASK, PAGE_SIZE)) {
+ ret = 1;
+ break;
+ }
+ synic->msg_page = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_EOM: {
+ int i;
+
+ if (!synic->active)
+ break;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
+ kvm_hv_notify_acked_sint(vcpu, i);
+ break;
+ }
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ ret = synic_set_sint(synic, msr - HV_X64_MSR_SINT0, data, host);
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ return ret;
+}
+
+static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata,
+ bool host)
+{
+ int ret;
+
+ if (!synic->active && !host)
+ return 1;
+
+ ret = 0;
+ switch (msr) {
+ case HV_X64_MSR_SCONTROL:
+ *pdata = synic->control;
+ break;
+ case HV_X64_MSR_SVERSION:
+ *pdata = synic->version;
+ break;
+ case HV_X64_MSR_SIEFP:
+ *pdata = synic->evt_page;
+ break;
+ case HV_X64_MSR_SIMP:
+ *pdata = synic->msg_page;
+ break;
+ case HV_X64_MSR_EOM:
+ *pdata = 0;
+ break;
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ *pdata = atomic64_read(&synic->sint[msr - HV_X64_MSR_SINT0]);
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ return ret;
+}
+
+static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
+{
+ struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct kvm_lapic_irq irq;
+ int ret, vector;
+
+ if (sint >= ARRAY_SIZE(synic->sint))
+ return -EINVAL;
+
+ vector = synic_get_sint_vector(synic_read_sint(synic, sint));
+ if (vector < 0)
+ return -ENOENT;
+
+ memset(&irq, 0, sizeof(irq));
+ irq.shorthand = APIC_DEST_SELF;
+ irq.dest_mode = APIC_DEST_PHYSICAL;
+ irq.delivery_mode = APIC_DM_FIXED;
+ irq.vector = vector;
+ irq.level = 1;
+
+ ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL);
+ trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret);
+ return ret;
+}
+
+int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint)
+{
+ struct kvm_vcpu_hv_synic *synic;
+
+ synic = synic_get(kvm, vpidx);
+ if (!synic)
+ return -EINVAL;
+
+ return synic_set_irq(synic, sint);
+}
+
+void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
+{
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+ int i;
+
+ trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector);
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
+ if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
+ kvm_hv_notify_acked_sint(vcpu, i);
+}
+
+static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vpidx, u32 sint, int gsi)
+{
+ struct kvm_vcpu_hv_synic *synic;
+
+ synic = synic_get(kvm, vpidx);
+ if (!synic)
+ return -EINVAL;
+
+ if (sint >= ARRAY_SIZE(synic->sint_to_gsi))
+ return -EINVAL;
+
+ atomic_set(&synic->sint_to_gsi[sint], gsi);
+ return 0;
+}
+
+void kvm_hv_irq_routing_update(struct kvm *kvm)
+{
+ struct kvm_irq_routing_table *irq_rt;
+ struct kvm_kernel_irq_routing_entry *e;
+ u32 gsi;
+
+ irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu,
+ lockdep_is_held(&kvm->irq_lock));
+
+ for (gsi = 0; gsi < irq_rt->nr_rt_entries; gsi++) {
+ hlist_for_each_entry(e, &irq_rt->map[gsi], link) {
+ if (e->type == KVM_IRQ_ROUTING_HV_SINT)
+ kvm_hv_set_sint_gsi(kvm, e->hv_sint.vcpu,
+ e->hv_sint.sint, gsi);
+ }
+ }
+}
+
+static void synic_init(struct kvm_vcpu_hv_synic *synic)
+{
+ int i;
+
+ memset(synic, 0, sizeof(*synic));
+ synic->version = HV_SYNIC_VERSION_1;
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ atomic64_set(&synic->sint[i], HV_SYNIC_SINT_MASKED);
+ atomic_set(&synic->sint_to_gsi[i], -1);
+ }
+}
+
+static u64 get_time_ref_counter(struct kvm *kvm)
+{
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_vcpu *vcpu;
+ u64 tsc;
+
+ /*
+ * The guest has not set up the TSC page or the clock isn't
+ * stable, fall back to get_kvmclock_ns.
+ */
+ if (!hv->tsc_ref.tsc_sequence)
+ return div_u64(get_kvmclock_ns(kvm), 100);
+
+ vcpu = kvm_get_vcpu(kvm, 0);
+ tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64)
+ + hv->tsc_ref.tsc_offset;
+}
+
+static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
+ bool vcpu_kick)
+{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+
+ set_bit(stimer->index,
+ vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+ kvm_make_request(KVM_REQ_HV_STIMER, vcpu);
+ if (vcpu_kick)
+ kvm_vcpu_kick(vcpu);
+}
+
+static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+
+ trace_kvm_hv_stimer_cleanup(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index);
+
+ hrtimer_cancel(&stimer->timer);
+ clear_bit(stimer->index,
+ vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+ stimer->msg_pending = false;
+ stimer->exp_time = 0;
+}
+
+static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer)
+{
+ struct kvm_vcpu_hv_stimer *stimer;
+
+ stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer);
+ trace_kvm_hv_stimer_callback(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index);
+ stimer_mark_pending(stimer, true);
+
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * stimer_start() assumptions:
+ * a) stimer->count is not equal to 0
+ * b) stimer->config has HV_STIMER_ENABLE flag
+ */
+static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
+{
+ u64 time_now;
+ ktime_t ktime_now;
+
+ time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm);
+ ktime_now = ktime_get();
+
+ if (stimer->config & HV_STIMER_PERIODIC) {
+ if (stimer->exp_time) {
+ if (time_now >= stimer->exp_time) {
+ u64 remainder;
+
+ div64_u64_rem(time_now - stimer->exp_time,
+ stimer->count, &remainder);
+ stimer->exp_time =
+ time_now + (stimer->count - remainder);
+ }
+ } else
+ stimer->exp_time = time_now + stimer->count;
+
+ trace_kvm_hv_stimer_start_periodic(
+ stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index,
+ time_now, stimer->exp_time);
+
+ hrtimer_start(&stimer->timer,
+ ktime_add_ns(ktime_now,
+ 100 * (stimer->exp_time - time_now)),
+ HRTIMER_MODE_ABS);
+ return 0;
+ }
+ stimer->exp_time = stimer->count;
+ if (time_now >= stimer->count) {
+ /*
+ * Expire timer according to Hypervisor Top-Level Functional
+ * specification v4(15.3.1):
+ * "If a one shot is enabled and the specified count is in
+ * the past, it will expire immediately."
+ */
+ stimer_mark_pending(stimer, false);
+ return 0;
+ }
+
+ trace_kvm_hv_stimer_start_one_shot(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index,
+ time_now, stimer->count);
+
+ hrtimer_start(&stimer->timer,
+ ktime_add_ns(ktime_now, 100 * (stimer->count - time_now)),
+ HRTIMER_MODE_ABS);
+ return 0;
+}
+
+static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
+ bool host)
+{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+
+ if (!synic->active && (!host || config))
+ return 1;
+
+ trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, config, host);
+
+ stimer_cleanup(stimer);
+ if ((stimer->config & HV_STIMER_ENABLE) && HV_STIMER_SINT(config) == 0)
+ config &= ~HV_STIMER_ENABLE;
+ stimer->config = config;
+ stimer_mark_pending(stimer, false);
+ return 0;
+}
+
+static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
+ bool host)
+{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+
+ if (!synic->active && (!host || count))
+ return 1;
+
+ trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, count, host);
+
+ stimer_cleanup(stimer);
+ stimer->count = count;
+ if (stimer->count == 0)
+ stimer->config &= ~HV_STIMER_ENABLE;
+ else if (stimer->config & HV_STIMER_AUTOENABLE)
+ stimer->config |= HV_STIMER_ENABLE;
+ stimer_mark_pending(stimer, false);
+ return 0;
+}
+
+static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig)
+{
+ *pconfig = stimer->config;
+ return 0;
+}
+
+static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount)
+{
+ *pcount = stimer->count;
+ return 0;
+}
+
+static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint,
+ struct hv_message *src_msg)
+{
+ struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct page *page;
+ gpa_t gpa;
+ struct hv_message *dst_msg;
+ int r;
+ struct hv_message_page *msg_page;
+
+ if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE))
+ return -ENOENT;
+
+ gpa = synic->msg_page & PAGE_MASK;
+ page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
+ if (is_error_page(page))
+ return -EFAULT;
+
+ msg_page = kmap_atomic(page);
+ dst_msg = &msg_page->sint_message[sint];
+ if (sync_cmpxchg(&dst_msg->header.message_type, HVMSG_NONE,
+ src_msg->header.message_type) != HVMSG_NONE) {
+ dst_msg->header.message_flags.msg_pending = 1;
+ r = -EAGAIN;
+ } else {
+ memcpy(&dst_msg->u.payload, &src_msg->u.payload,
+ src_msg->header.payload_size);
+ dst_msg->header.message_type = src_msg->header.message_type;
+ dst_msg->header.payload_size = src_msg->header.payload_size;
+ r = synic_set_irq(synic, sint);
+ if (r >= 1)
+ r = 0;
+ else if (r == 0)
+ r = -EFAULT;
+ }
+ kunmap_atomic(msg_page);
+ kvm_release_page_dirty(page);
+ kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+ return r;
+}
+
+static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct hv_message *msg = &stimer->msg;
+ struct hv_timer_message_payload *payload =
+ (struct hv_timer_message_payload *)&msg->u.payload;
+
+ payload->expiration_time = stimer->exp_time;
+ payload->delivery_time = get_time_ref_counter(vcpu->kvm);
+ return synic_deliver_msg(vcpu_to_synic(vcpu),
+ HV_STIMER_SINT(stimer->config), msg);
+}
+
+static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
+{
+ int r;
+
+ stimer->msg_pending = true;
+ r = stimer_send_msg(stimer);
+ trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, r);
+ if (!r) {
+ stimer->msg_pending = false;
+ if (!(stimer->config & HV_STIMER_PERIODIC))
+ stimer->config &= ~HV_STIMER_ENABLE;
+ }
+}
+
+void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv_stimer *stimer;
+ u64 time_now, exp_time;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) {
+ stimer = &hv_vcpu->stimer[i];
+ if (stimer->config & HV_STIMER_ENABLE) {
+ exp_time = stimer->exp_time;
+
+ if (exp_time) {
+ time_now =
+ get_time_ref_counter(vcpu->kvm);
+ if (time_now >= exp_time)
+ stimer_expiration(stimer);
+ }
+
+ if ((stimer->config & HV_STIMER_ENABLE) &&
+ stimer->count) {
+ if (!stimer->msg_pending)
+ stimer_start(stimer);
+ } else
+ stimer_cleanup(stimer);
+ }
+ }
+}
+
+void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ stimer_cleanup(&hv_vcpu->stimer[i]);
+}
+
+bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE))
+ return false;
+ return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
+}
+EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled);
+
+bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
+ struct hv_vp_assist_page *assist_page)
+{
+ if (!kvm_hv_assist_page_enabled(vcpu))
+ return false;
+ return !kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data,
+ assist_page, sizeof(*assist_page));
+}
+EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page);
+
+static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct hv_message *msg = &stimer->msg;
+ struct hv_timer_message_payload *payload =
+ (struct hv_timer_message_payload *)&msg->u.payload;
+
+ memset(&msg->header, 0, sizeof(msg->header));
+ msg->header.message_type = HVMSG_TIMER_EXPIRED;
+ msg->header.payload_size = sizeof(*payload);
+
+ payload->timer_index = stimer->index;
+ payload->expiration_time = 0;
+ payload->delivery_time = 0;
+}
+
+static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index)
+{
+ memset(stimer, 0, sizeof(*stimer));
+ stimer->index = timer_index;
+ hrtimer_init(&stimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ stimer->timer.function = stimer_timer_callback;
+ stimer_prepare_msg(stimer);
+}
+
+void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ int i;
+
+ synic_init(&hv_vcpu->synic);
+
+ bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ stimer_init(&hv_vcpu->stimer[i], i);
+}
+
+void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+
+ hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu);
+}
+
+int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
+{
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+
+ /*
+ * Hyper-V SynIC auto EOI SINT's are
+ * not compatible with APICV, so deactivate APICV
+ */
+ kvm_vcpu_deactivate_apicv(vcpu);
+ synic->active = true;
+ synic->dont_zero_synic_pages = dont_zero_synic_pages;
+ return 0;
+}
+
+static bool kvm_hv_msr_partition_wide(u32 msr)
+{
+ bool r = false;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ case HV_X64_MSR_HYPERCALL:
+ case HV_X64_MSR_REFERENCE_TSC:
+ case HV_X64_MSR_TIME_REF_COUNT:
+ case HV_X64_MSR_CRASH_CTL:
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_RESET:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ r = true;
+ break;
+ }
+
+ return r;
+}
+
+static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu,
+ u32 index, u64 *pdata)
+{
+ struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ size_t size = ARRAY_SIZE(hv->hv_crash_param);
+
+ if (WARN_ON_ONCE(index >= size))
+ return -EINVAL;
+
+ *pdata = hv->hv_crash_param[array_index_nospec(index, size)];
+ return 0;
+}
+
+static int kvm_hv_msr_get_crash_ctl(struct kvm_vcpu *vcpu, u64 *pdata)
+{
+ struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+
+ *pdata = hv->hv_crash_ctl;
+ return 0;
+}
+
+static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host)
+{
+ struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+
+ if (host)
+ hv->hv_crash_ctl = data & HV_X64_MSR_CRASH_CTL_NOTIFY;
+
+ if (!host && (data & HV_X64_MSR_CRASH_CTL_NOTIFY)) {
+
+ vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n",
+ hv->hv_crash_param[0],
+ hv->hv_crash_param[1],
+ hv->hv_crash_param[2],
+ hv->hv_crash_param[3],
+ hv->hv_crash_param[4]);
+
+ /* Send notification about crash to user space */
+ kvm_make_request(KVM_REQ_HV_CRASH, vcpu);
+ }
+
+ return 0;
+}
+
+static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
+ u32 index, u64 data)
+{
+ struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ size_t size = ARRAY_SIZE(hv->hv_crash_param);
+
+ if (WARN_ON_ONCE(index >= size))
+ return -EINVAL;
+
+ hv->hv_crash_param[array_index_nospec(index, size)] = data;
+ return 0;
+}
+
+/*
+ * The kvmclock and Hyper-V TSC page use similar formulas, and converting
+ * between them is possible:
+ *
+ * kvmclock formula:
+ * nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32)
+ * + system_time
+ *
+ * Hyper-V formula:
+ * nsec/100 = ticks * scale / 2^64 + offset
+ *
+ * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula.
+ * By dividing the kvmclock formula by 100 and equating what's left we get:
+ * ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * scale / 2^64 = tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100
+ *
+ * Now expand the kvmclock formula and divide by 100:
+ * nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32)
+ * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32)
+ * + system_time
+ * nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * + system_time / 100
+ *
+ * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64:
+ * nsec/100 = ticks * scale / 2^64
+ * - tsc_timestamp * scale / 2^64
+ * + system_time / 100
+ *
+ * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out:
+ * offset = system_time / 100 - tsc_timestamp * scale / 2^64
+ *
+ * These two equivalencies are implemented in this function.
+ */
+static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
+ HV_REFERENCE_TSC_PAGE *tsc_ref)
+{
+ u64 max_mul;
+
+ if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
+ return false;
+
+ /*
+ * check if scale would overflow, if so we use the time ref counter
+ * tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64
+ * tsc_to_system_mul / 100 >= 2^(32-tsc_shift)
+ * tsc_to_system_mul >= 100 * 2^(32-tsc_shift)
+ */
+ max_mul = 100ull << (32 - hv_clock->tsc_shift);
+ if (hv_clock->tsc_to_system_mul >= max_mul)
+ return false;
+
+ /*
+ * Otherwise compute the scale and offset according to the formulas
+ * derived above.
+ */
+ tsc_ref->tsc_scale =
+ mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift),
+ hv_clock->tsc_to_system_mul,
+ 100);
+
+ tsc_ref->tsc_offset = hv_clock->system_time;
+ do_div(tsc_ref->tsc_offset, 100);
+ tsc_ref->tsc_offset -=
+ mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64);
+ return true;
+}
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+ struct pvclock_vcpu_time_info *hv_clock)
+{
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+ u32 tsc_seq;
+ u64 gfn;
+
+ BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
+ BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0);
+
+ if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+ return;
+
+ mutex_lock(&kvm->arch.hyperv.hv_lock);
+ if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+ goto out_unlock;
+
+ gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+ /*
+ * Because the TSC parameters only vary when there is a
+ * change in the master clock, do not bother with caching.
+ */
+ if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
+ &tsc_seq, sizeof(tsc_seq))))
+ goto out_unlock;
+
+ /*
+ * While we're computing and writing the parameters, force the
+ * guest to use the time reference count MSR.
+ */
+ hv->tsc_ref.tsc_sequence = 0;
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+ &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+ goto out_unlock;
+
+ if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
+ goto out_unlock;
+
+ /* Ensure sequence is zero before writing the rest of the struct. */
+ smp_wmb();
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+ goto out_unlock;
+
+ /*
+ * Now switch to the TSC page mechanism by writing the sequence.
+ */
+ tsc_seq++;
+ if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0)
+ tsc_seq = 1;
+
+ /* Write the struct entirely before the non-zero sequence. */
+ smp_wmb();
+
+ hv->tsc_ref.tsc_sequence = tsc_seq;
+ kvm_write_guest(kvm, gfn_to_gpa(gfn),
+ &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
+out_unlock:
+ mutex_unlock(&kvm->arch.hyperv.hv_lock);
+}
+
+static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
+ bool host)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ hv->hv_guest_os_id = data;
+ /* setting guest os id to zero disables hypercall page */
+ if (!hv->hv_guest_os_id)
+ hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
+ break;
+ case HV_X64_MSR_HYPERCALL: {
+ u64 gfn;
+ unsigned long addr;
+ u8 instructions[4];
+
+ /* if guest os id is not set hypercall should remain disabled */
+ if (!hv->hv_guest_os_id)
+ break;
+ if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
+ hv->hv_hypercall = data;
+ break;
+ }
+ gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return 1;
+ kvm_x86_ops->patch_hypercall(vcpu, instructions);
+ ((unsigned char *)instructions)[3] = 0xc3; /* ret */
+ if (__copy_to_user((void __user *)addr, instructions, 4))
+ return 1;
+ hv->hv_hypercall = data;
+ mark_page_dirty(kvm, gfn);
+ break;
+ }
+ case HV_X64_MSR_REFERENCE_TSC:
+ hv->hv_tsc_page = data;
+ if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ break;
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ return kvm_hv_msr_set_crash_data(vcpu,
+ msr - HV_X64_MSR_CRASH_P0,
+ data);
+ case HV_X64_MSR_CRASH_CTL:
+ return kvm_hv_msr_set_crash_ctl(vcpu, data, host);
+ case HV_X64_MSR_RESET:
+ if (data == 1) {
+ vcpu_debug(vcpu, "hyper-v reset requested\n");
+ kvm_make_request(KVM_REQ_HV_RESET, vcpu);
+ }
+ break;
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ hv->hv_reenlightenment_control = data;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ hv->hv_tsc_emulation_control = data;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ hv->hv_tsc_emulation_status = data;
+ break;
+ case HV_X64_MSR_TIME_REF_COUNT:
+ /* read-only, but still ignore it if host-initiated */
+ if (!host)
+ return 1;
+ break;
+ default:
+ vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
+ msr, data);
+ return 1;
+ }
+ return 0;
+}
+
+/* Calculate cpu time spent by current task in 100ns units */
+static u64 current_task_runtime_100ns(void)
+{
+ u64 utime, stime;
+
+ task_cputime_adjusted(current, &utime, &stime);
+
+ return div_u64(utime + stime, 100);
+}
+
+static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
+{
+ struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+
+ switch (msr) {
+ case HV_X64_MSR_VP_INDEX: {
+ struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ int vcpu_idx = kvm_vcpu_get_idx(vcpu);
+ u32 new_vp_index = (u32)data;
+
+ if (!host || new_vp_index >= KVM_MAX_VCPUS)
+ return 1;
+
+ if (new_vp_index == hv_vcpu->vp_index)
+ return 0;
+
+ /*
+ * The VP index is initialized to vcpu_index by
+ * kvm_hv_vcpu_postcreate so they initially match. Now the
+ * VP index is changing, adjust num_mismatched_vp_indexes if
+ * it now matches or no longer matches vcpu_idx.
+ */
+ if (hv_vcpu->vp_index == vcpu_idx)
+ atomic_inc(&hv->num_mismatched_vp_indexes);
+ else if (new_vp_index == vcpu_idx)
+ atomic_dec(&hv->num_mismatched_vp_indexes);
+
+ hv_vcpu->vp_index = new_vp_index;
+ break;
+ }
+ case HV_X64_MSR_VP_ASSIST_PAGE: {
+ u64 gfn;
+ unsigned long addr;
+
+ if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) {
+ hv_vcpu->hv_vapic = data;
+ if (kvm_lapic_enable_pv_eoi(vcpu, 0, 0))
+ return 1;
+ break;
+ }
+ gfn = data >> HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT;
+ addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
+ if (kvm_is_error_hva(addr))
+ return 1;
+ if (__clear_user((void __user *)addr, PAGE_SIZE))
+ return 1;
+ hv_vcpu->hv_vapic = data;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ if (kvm_lapic_enable_pv_eoi(vcpu,
+ gfn_to_gpa(gfn) | KVM_MSR_ENABLED,
+ sizeof(struct hv_vp_assist_page)))
+ return 1;
+ break;
+ }
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
+ case HV_X64_MSR_VP_RUNTIME:
+ if (!host)
+ return 1;
+ hv_vcpu->runtime_offset = data - current_task_runtime_100ns();
+ break;
+ case HV_X64_MSR_SCONTROL:
+ case HV_X64_MSR_SVERSION:
+ case HV_X64_MSR_SIEFP:
+ case HV_X64_MSR_SIMP:
+ case HV_X64_MSR_EOM:
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ return synic_set_msr(vcpu_to_synic(vcpu), msr, data, host);
+ case HV_X64_MSR_STIMER0_CONFIG:
+ case HV_X64_MSR_STIMER1_CONFIG:
+ case HV_X64_MSR_STIMER2_CONFIG:
+ case HV_X64_MSR_STIMER3_CONFIG: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
+
+ return stimer_set_config(vcpu_to_stimer(vcpu, timer_index),
+ data, host);
+ }
+ case HV_X64_MSR_STIMER0_COUNT:
+ case HV_X64_MSR_STIMER1_COUNT:
+ case HV_X64_MSR_STIMER2_COUNT:
+ case HV_X64_MSR_STIMER3_COUNT: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
+
+ return stimer_set_count(vcpu_to_stimer(vcpu, timer_index),
+ data, host);
+ }
+ case HV_X64_MSR_TSC_FREQUENCY:
+ case HV_X64_MSR_APIC_FREQUENCY:
+ /* read-only, but still ignore it if host-initiated */
+ if (!host)
+ return 1;
+ break;
+ default:
+ vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
+ msr, data);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 data = 0;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ data = hv->hv_guest_os_id;
+ break;
+ case HV_X64_MSR_HYPERCALL:
+ data = hv->hv_hypercall;
+ break;
+ case HV_X64_MSR_TIME_REF_COUNT:
+ data = get_time_ref_counter(kvm);
+ break;
+ case HV_X64_MSR_REFERENCE_TSC:
+ data = hv->hv_tsc_page;
+ break;
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ return kvm_hv_msr_get_crash_data(vcpu,
+ msr - HV_X64_MSR_CRASH_P0,
+ pdata);
+ case HV_X64_MSR_CRASH_CTL:
+ return kvm_hv_msr_get_crash_ctl(vcpu, pdata);
+ case HV_X64_MSR_RESET:
+ data = 0;
+ break;
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ data = hv->hv_reenlightenment_control;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ data = hv->hv_tsc_emulation_control;
+ break;
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ data = hv->hv_tsc_emulation_status;
+ break;
+ default:
+ vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ }
+
+ *pdata = data;
+ return 0;
+}
+
+static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
+ bool host)
+{
+ u64 data = 0;
+ struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+
+ switch (msr) {
+ case HV_X64_MSR_VP_INDEX:
+ data = hv_vcpu->vp_index;
+ break;
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
+ case HV_X64_MSR_VP_ASSIST_PAGE:
+ data = hv_vcpu->hv_vapic;
+ break;
+ case HV_X64_MSR_VP_RUNTIME:
+ data = current_task_runtime_100ns() + hv_vcpu->runtime_offset;
+ break;
+ case HV_X64_MSR_SCONTROL:
+ case HV_X64_MSR_SVERSION:
+ case HV_X64_MSR_SIEFP:
+ case HV_X64_MSR_SIMP:
+ case HV_X64_MSR_EOM:
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata, host);
+ case HV_X64_MSR_STIMER0_CONFIG:
+ case HV_X64_MSR_STIMER1_CONFIG:
+ case HV_X64_MSR_STIMER2_CONFIG:
+ case HV_X64_MSR_STIMER3_CONFIG: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
+
+ return stimer_get_config(vcpu_to_stimer(vcpu, timer_index),
+ pdata);
+ }
+ case HV_X64_MSR_STIMER0_COUNT:
+ case HV_X64_MSR_STIMER1_COUNT:
+ case HV_X64_MSR_STIMER2_COUNT:
+ case HV_X64_MSR_STIMER3_COUNT: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
+
+ return stimer_get_count(vcpu_to_stimer(vcpu, timer_index),
+ pdata);
+ }
+ case HV_X64_MSR_TSC_FREQUENCY:
+ data = (u64)vcpu->arch.virtual_tsc_khz * 1000;
+ break;
+ case HV_X64_MSR_APIC_FREQUENCY:
+ data = APIC_BUS_FREQUENCY;
+ break;
+ default:
+ vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
+int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
+{
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+
+ mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock);
+ r = kvm_hv_set_msr_pw(vcpu, msr, data, host);
+ mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock);
+ return r;
+ } else
+ return kvm_hv_set_msr(vcpu, msr, data, host);
+}
+
+int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
+{
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+
+ mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock);
+ r = kvm_hv_get_msr_pw(vcpu, msr, pdata);
+ mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock);
+ return r;
+ } else
+ return kvm_hv_get_msr(vcpu, msr, pdata, host);
+}
+
+static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no)
+{
+ int i = 0, j;
+
+ if (!(valid_bank_mask & BIT_ULL(bank_no)))
+ return -1;
+
+ for (j = 0; j < bank_no; j++)
+ if (valid_bank_mask & BIT_ULL(j))
+ i++;
+
+ return i;
+}
+
+static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
+ u16 rep_cnt, bool ex)
+{
+ struct kvm *kvm = current_vcpu->kvm;
+ struct kvm_vcpu_hv *hv_current = &current_vcpu->arch.hyperv;
+ struct hv_tlb_flush_ex flush_ex;
+ struct hv_tlb_flush flush;
+ struct kvm_vcpu *vcpu;
+ unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)] = {0};
+ unsigned long valid_bank_mask = 0;
+ u64 sparse_banks[64];
+ int sparse_banks_len, i;
+ bool all_cpus;
+
+ if (!ex) {
+ if (unlikely(kvm_read_guest(kvm, ingpa, &flush, sizeof(flush))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ trace_kvm_hv_flush_tlb(flush.processor_mask,
+ flush.address_space, flush.flags);
+
+ sparse_banks[0] = flush.processor_mask;
+
+ /*
+ * Work around possible WS2012 bug: it sends hypercalls
+ * with processor_mask = 0x0 and HV_FLUSH_ALL_PROCESSORS clear,
+ * while also expecting us to flush something and crashing if
+ * we don't. Let's treat processor_mask == 0 same as
+ * HV_FLUSH_ALL_PROCESSORS.
+ */
+ all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) ||
+ flush.processor_mask == 0;
+ } else {
+ if (unlikely(kvm_read_guest(kvm, ingpa, &flush_ex,
+ sizeof(flush_ex))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask,
+ flush_ex.hv_vp_set.format,
+ flush_ex.address_space,
+ flush_ex.flags);
+
+ valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask;
+ all_cpus = flush_ex.hv_vp_set.format !=
+ HV_GENERIC_SET_SPARSE_4K;
+
+ sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) *
+ sizeof(sparse_banks[0]);
+
+ if (!sparse_banks_len && !all_cpus)
+ goto ret_success;
+
+ if (!all_cpus &&
+ kvm_read_guest(kvm,
+ ingpa + offsetof(struct hv_tlb_flush_ex,
+ hv_vp_set.bank_contents),
+ sparse_banks,
+ sparse_banks_len))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
+
+ cpumask_clear(&hv_current->tlb_lush);
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
+ int bank = hv->vp_index / 64, sbank = 0;
+
+ if (!all_cpus) {
+ /* Banks >64 can't be represented */
+ if (bank >= 64)
+ continue;
+
+ /* Non-ex hypercalls can only address first 64 vCPUs */
+ if (!ex && bank)
+ continue;
+
+ if (ex) {
+ /*
+ * Check is the bank of this vCPU is in sparse
+ * set and get the sparse bank number.
+ */
+ sbank = get_sparse_bank_no(valid_bank_mask,
+ bank);
+
+ if (sbank < 0)
+ continue;
+ }
+
+ if (!(sparse_banks[sbank] & BIT_ULL(hv->vp_index % 64)))
+ continue;
+ }
+
+ /*
+ * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we
+ * can't analyze it here, flush TLB regardless of the specified
+ * address space.
+ */
+ __set_bit(i, vcpu_bitmap);
+ }
+
+ kvm_make_vcpus_request_mask(kvm,
+ KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP,
+ vcpu_bitmap, &hv_current->tlb_lush);
+
+ret_success:
+ /* We always do full TLB flush, set rep_done = rep_cnt. */
+ return (u64)HV_STATUS_SUCCESS |
+ ((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
+}
+
+bool kvm_hv_hypercall_enabled(struct kvm *kvm)
+{
+ return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE;
+}
+
+static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
+{
+ bool longmode;
+
+ longmode = is_64_bit_mode(vcpu);
+ if (longmode)
+ kvm_register_write(vcpu, VCPU_REGS_RAX, result);
+ else {
+ kvm_register_write(vcpu, VCPU_REGS_RDX, result >> 32);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, result & 0xffffffff);
+ }
+}
+
+static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
+{
+ kvm_hv_hypercall_set_result(vcpu, result);
+ ++vcpu->stat.hypercalls;
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+{
+ return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result);
+}
+
+static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
+{
+ struct eventfd_ctx *eventfd;
+
+ if (unlikely(!fast)) {
+ int ret;
+ gpa_t gpa = param;
+
+ if ((gpa & (__alignof__(param) - 1)) ||
+ offset_in_page(gpa) + sizeof(param) > PAGE_SIZE)
+ return HV_STATUS_INVALID_ALIGNMENT;
+
+ ret = kvm_vcpu_read_guest(vcpu, gpa, &param, sizeof(param));
+ if (ret < 0)
+ return HV_STATUS_INVALID_ALIGNMENT;
+ }
+
+ /*
+ * Per spec, bits 32-47 contain the extra "flag number". However, we
+ * have no use for it, and in all known usecases it is zero, so just
+ * report lookup failure if it isn't.
+ */
+ if (param & 0xffff00000000ULL)
+ return HV_STATUS_INVALID_PORT_ID;
+ /* remaining bits are reserved-zero */
+ if (param & ~KVM_HYPERV_CONN_ID_MASK)
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+
+ /* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */
+ rcu_read_lock();
+ eventfd = idr_find(&vcpu->kvm->arch.hyperv.conn_to_evt, param);
+ rcu_read_unlock();
+ if (!eventfd)
+ return HV_STATUS_INVALID_PORT_ID;
+
+ eventfd_signal(eventfd, 1);
+ return HV_STATUS_SUCCESS;
+}
+
+int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
+{
+ u64 param, ingpa, outgpa, ret = HV_STATUS_SUCCESS;
+ uint16_t code, rep_idx, rep_cnt;
+ bool fast, longmode, rep;
+
+ /*
+ * hypercall generates UD from non zero cpl and real mode
+ * per HYPER-V spec
+ */
+ if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ longmode = is_64_bit_mode(vcpu);
+
+ if (!longmode) {
+ param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
+ ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
+ outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
+ }
+#ifdef CONFIG_X86_64
+ else {
+ param = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
+ }
+#endif
+
+ code = param & 0xffff;
+ fast = !!(param & HV_HYPERCALL_FAST_BIT);
+ rep_cnt = (param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff;
+ rep_idx = (param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff;
+ rep = !!(rep_cnt || rep_idx);
+
+ trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
+
+ switch (code) {
+ case HVCALL_NOTIFY_LONG_SPIN_WAIT:
+ if (unlikely(rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ kvm_vcpu_on_spin(vcpu, true);
+ break;
+ case HVCALL_SIGNAL_EVENT:
+ if (unlikely(rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hvcall_signal_event(vcpu, fast, ingpa);
+ if (ret != HV_STATUS_INVALID_PORT_ID)
+ break;
+ /* maybe userspace knows this conn_id: fall through */
+ case HVCALL_POST_MESSAGE:
+ /* don't bother userspace if it has no way to handle it */
+ if (unlikely(rep || !vcpu_to_synic(vcpu)->active)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+ vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
+ vcpu->run->hyperv.u.hcall.input = param;
+ vcpu->run->hyperv.u.hcall.params[0] = ingpa;
+ vcpu->run->hyperv.u.hcall.params[1] = outgpa;
+ vcpu->arch.complete_userspace_io =
+ kvm_hv_hypercall_complete_userspace;
+ return 0;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
+ if (unlikely(fast || !rep_cnt || rep_idx)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, false);
+ break;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
+ if (unlikely(fast || rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, false);
+ break;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
+ if (unlikely(fast || !rep_cnt || rep_idx)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true);
+ break;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
+ if (unlikely(fast || rep)) {
+ ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ }
+ ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true);
+ break;
+ default:
+ ret = HV_STATUS_INVALID_HYPERCALL_CODE;
+ break;
+ }
+
+ return kvm_hv_hypercall_complete(vcpu, ret);
+}
+
+void kvm_hv_init_vm(struct kvm *kvm)
+{
+ mutex_init(&kvm->arch.hyperv.hv_lock);
+ idr_init(&kvm->arch.hyperv.conn_to_evt);
+}
+
+void kvm_hv_destroy_vm(struct kvm *kvm)
+{
+ struct eventfd_ctx *eventfd;
+ int i;
+
+ idr_for_each_entry(&kvm->arch.hyperv.conn_to_evt, eventfd, i)
+ eventfd_ctx_put(eventfd);
+ idr_destroy(&kvm->arch.hyperv.conn_to_evt);
+}
+
+static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
+{
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct eventfd_ctx *eventfd;
+ int ret;
+
+ eventfd = eventfd_ctx_fdget(fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ mutex_lock(&hv->hv_lock);
+ ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1,
+ GFP_KERNEL);
+ mutex_unlock(&hv->hv_lock);
+
+ if (ret >= 0)
+ return 0;
+
+ if (ret == -ENOSPC)
+ ret = -EEXIST;
+ eventfd_ctx_put(eventfd);
+ return ret;
+}
+
+static int kvm_hv_eventfd_deassign(struct kvm *kvm, u32 conn_id)
+{
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct eventfd_ctx *eventfd;
+
+ mutex_lock(&hv->hv_lock);
+ eventfd = idr_remove(&hv->conn_to_evt, conn_id);
+ mutex_unlock(&hv->hv_lock);
+
+ if (!eventfd)
+ return -ENOENT;
+
+ synchronize_srcu(&kvm->srcu);
+ eventfd_ctx_put(eventfd);
+ return 0;
+}
+
+int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args)
+{
+ if ((args->flags & ~KVM_HYPERV_EVENTFD_DEASSIGN) ||
+ (args->conn_id & ~KVM_HYPERV_CONN_ID_MASK))
+ return -EINVAL;
+
+ if (args->flags == KVM_HYPERV_EVENTFD_DEASSIGN)
+ return kvm_hv_eventfd_deassign(kvm, args->conn_id);
+ return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd);
+}
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
new file mode 100644
index 000000000..0e66c12ed
--- /dev/null
+++ b/arch/x86/kvm/hyperv.h
@@ -0,0 +1,99 @@
+/*
+ * KVM Microsoft Hyper-V emulation
+ *
+ * derived from arch/x86/kvm/x86.c
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com>
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Amit Shah <amit.shah@qumranet.com>
+ * Ben-Ami Yassour <benami@il.ibm.com>
+ * Andrey Smetanin <asmetanin@virtuozzo.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef __ARCH_X86_KVM_HYPERV_H__
+#define __ARCH_X86_KVM_HYPERV_H__
+
+static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu)
+{
+ return &vcpu->arch.hyperv;
+}
+
+static inline struct kvm_vcpu *hv_vcpu_to_vcpu(struct kvm_vcpu_hv *hv_vcpu)
+{
+ struct kvm_vcpu_arch *arch;
+
+ arch = container_of(hv_vcpu, struct kvm_vcpu_arch, hyperv);
+ return container_of(arch, struct kvm_vcpu, arch);
+}
+
+static inline struct kvm_vcpu_hv_synic *vcpu_to_synic(struct kvm_vcpu *vcpu)
+{
+ return &vcpu->arch.hyperv.synic;
+}
+
+static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
+{
+ return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic));
+}
+
+int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
+int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host);
+
+bool kvm_hv_hypercall_enabled(struct kvm *kvm);
+int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
+
+void kvm_hv_irq_routing_update(struct kvm *kvm);
+int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint);
+void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
+int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
+
+void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu);
+void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu);
+void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu);
+bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
+ struct hv_vp_assist_page *assist_page);
+
+static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu,
+ int timer_index)
+{
+ return &vcpu_to_hv_vcpu(vcpu)->stimer[timer_index];
+}
+
+static inline struct kvm_vcpu *stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu_hv *hv_vcpu;
+
+ hv_vcpu = container_of(stimer - stimer->index, struct kvm_vcpu_hv,
+ stimer[0]);
+ return hv_vcpu_to_vcpu(hv_vcpu);
+}
+
+static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
+{
+ return !bitmap_empty(vcpu->arch.hyperv.stimer_pending_bitmap,
+ HV_SYNIC_STIMER_COUNT);
+}
+
+void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+ struct pvclock_vcpu_time_info *hv_clock);
+
+void kvm_hv_init_vm(struct kvm *kvm);
+void kvm_hv_destroy_vm(struct kvm *kvm);
+int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
+
+#endif
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
new file mode 100644
index 000000000..af192895b
--- /dev/null
+++ b/arch/x86/kvm/i8254.c
@@ -0,0 +1,737 @@
+/*
+ * 8253/8254 interval timer emulation
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ * Copyright (c) 2006 Intel Corporation
+ * Copyright (c) 2007 Keir Fraser, XenSource Inc
+ * Copyright (c) 2008 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * Authors:
+ * Sheng Yang <sheng.yang@intel.com>
+ * Based on QEMU and Xen.
+ */
+
+#define pr_fmt(fmt) "pit: " fmt
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+
+#include "ioapic.h"
+#include "irq.h"
+#include "i8254.h"
+#include "x86.h"
+
+#ifndef CONFIG_X86_64
+#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
+#else
+#define mod_64(x, y) ((x) % (y))
+#endif
+
+#define RW_STATE_LSB 1
+#define RW_STATE_MSB 2
+#define RW_STATE_WORD0 3
+#define RW_STATE_WORD1 4
+
+static void pit_set_gate(struct kvm_pit *pit, int channel, u32 val)
+{
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
+
+ switch (c->mode) {
+ default:
+ case 0:
+ case 4:
+ /* XXX: just disable/enable counting */
+ break;
+ case 1:
+ case 2:
+ case 3:
+ case 5:
+ /* Restart counting on rising edge. */
+ if (c->gate < val)
+ c->count_load_time = ktime_get();
+ break;
+ }
+
+ c->gate = val;
+}
+
+static int pit_get_gate(struct kvm_pit *pit, int channel)
+{
+ return pit->pit_state.channels[channel].gate;
+}
+
+static s64 __kpit_elapsed(struct kvm_pit *pit)
+{
+ s64 elapsed;
+ ktime_t remaining;
+ struct kvm_kpit_state *ps = &pit->pit_state;
+
+ if (!ps->period)
+ return 0;
+
+ /*
+ * The Counter does not stop when it reaches zero. In
+ * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to
+ * the highest count, either FFFF hex for binary counting
+ * or 9999 for BCD counting, and continues counting.
+ * Modes 2 and 3 are periodic; the Counter reloads
+ * itself with the initial count and continues counting
+ * from there.
+ */
+ remaining = hrtimer_get_remaining(&ps->timer);
+ elapsed = ps->period - ktime_to_ns(remaining);
+
+ return elapsed;
+}
+
+static s64 kpit_elapsed(struct kvm_pit *pit, struct kvm_kpit_channel_state *c,
+ int channel)
+{
+ if (channel == 0)
+ return __kpit_elapsed(pit);
+
+ return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+}
+
+static int pit_get_count(struct kvm_pit *pit, int channel)
+{
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
+ s64 d, t;
+ int counter;
+
+ t = kpit_elapsed(pit, c, channel);
+ d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+
+ switch (c->mode) {
+ case 0:
+ case 1:
+ case 4:
+ case 5:
+ counter = (c->count - d) & 0xffff;
+ break;
+ case 3:
+ /* XXX: may be incorrect for odd counts */
+ counter = c->count - (mod_64((2 * d), c->count));
+ break;
+ default:
+ counter = c->count - mod_64(d, c->count);
+ break;
+ }
+ return counter;
+}
+
+static int pit_get_out(struct kvm_pit *pit, int channel)
+{
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
+ s64 d, t;
+ int out;
+
+ t = kpit_elapsed(pit, c, channel);
+ d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+
+ switch (c->mode) {
+ default:
+ case 0:
+ out = (d >= c->count);
+ break;
+ case 1:
+ out = (d < c->count);
+ break;
+ case 2:
+ out = ((mod_64(d, c->count) == 0) && (d != 0));
+ break;
+ case 3:
+ out = (mod_64(d, c->count) < ((c->count + 1) >> 1));
+ break;
+ case 4:
+ case 5:
+ out = (d == c->count);
+ break;
+ }
+
+ return out;
+}
+
+static void pit_latch_count(struct kvm_pit *pit, int channel)
+{
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
+
+ if (!c->count_latched) {
+ c->latched_count = pit_get_count(pit, channel);
+ c->count_latched = c->rw_mode;
+ }
+}
+
+static void pit_latch_status(struct kvm_pit *pit, int channel)
+{
+ struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
+
+ if (!c->status_latched) {
+ /* TODO: Return NULL COUNT (bit 6). */
+ c->status = ((pit_get_out(pit, channel) << 7) |
+ (c->rw_mode << 4) |
+ (c->mode << 1) |
+ c->bcd);
+ c->status_latched = 1;
+ }
+}
+
+static inline struct kvm_pit *pit_state_to_pit(struct kvm_kpit_state *ps)
+{
+ return container_of(ps, struct kvm_pit, pit_state);
+}
+
+static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+ struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
+ irq_ack_notifier);
+ struct kvm_pit *pit = pit_state_to_pit(ps);
+
+ atomic_set(&ps->irq_ack, 1);
+ /* irq_ack should be set before pending is read. Order accesses with
+ * inc(pending) in pit_timer_fn and xchg(irq_ack, 0) in pit_do_work.
+ */
+ smp_mb();
+ if (atomic_dec_if_positive(&ps->pending) > 0)
+ kthread_queue_work(pit->worker, &pit->expired);
+}
+
+void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pit *pit = vcpu->kvm->arch.vpit;
+ struct hrtimer *timer;
+
+ if (!kvm_vcpu_is_bsp(vcpu) || !pit)
+ return;
+
+ timer = &pit->pit_state.timer;
+ mutex_lock(&pit->pit_state.lock);
+ if (hrtimer_cancel(timer))
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+ mutex_unlock(&pit->pit_state.lock);
+}
+
+static void destroy_pit_timer(struct kvm_pit *pit)
+{
+ hrtimer_cancel(&pit->pit_state.timer);
+ kthread_flush_work(&pit->expired);
+}
+
+static void pit_do_work(struct kthread_work *work)
+{
+ struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
+ struct kvm *kvm = pit->kvm;
+ struct kvm_vcpu *vcpu;
+ int i;
+ struct kvm_kpit_state *ps = &pit->pit_state;
+
+ if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0))
+ return;
+
+ kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false);
+ kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false);
+
+ /*
+ * Provides NMI watchdog support via Virtual Wire mode.
+ * The route is: PIT -> LVT0 in NMI mode.
+ *
+ * Note: Our Virtual Wire implementation does not follow
+ * the MP specification. We propagate a PIT interrupt to all
+ * VCPUs and only when LVT0 is in NMI mode. The interrupt can
+ * also be simultaneously delivered through PIC and IOAPIC.
+ */
+ if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_apic_nmi_wd_deliver(vcpu);
+}
+
+static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
+{
+ struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
+ struct kvm_pit *pt = pit_state_to_pit(ps);
+
+ if (atomic_read(&ps->reinject))
+ atomic_inc(&ps->pending);
+
+ kthread_queue_work(pt->worker, &pt->expired);
+
+ if (ps->is_periodic) {
+ hrtimer_add_expires_ns(&ps->timer, ps->period);
+ return HRTIMER_RESTART;
+ } else
+ return HRTIMER_NORESTART;
+}
+
+static inline void kvm_pit_reset_reinject(struct kvm_pit *pit)
+{
+ atomic_set(&pit->pit_state.pending, 0);
+ atomic_set(&pit->pit_state.irq_ack, 1);
+}
+
+void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
+{
+ struct kvm_kpit_state *ps = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+
+ if (atomic_read(&ps->reinject) == reinject)
+ return;
+
+ if (reinject) {
+ /* The initial state is preserved while ps->reinject == 0. */
+ kvm_pit_reset_reinject(pit);
+ kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
+ kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ } else {
+ kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
+ kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+ }
+
+ atomic_set(&ps->reinject, reinject);
+}
+
+static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period)
+{
+ struct kvm_kpit_state *ps = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+ s64 interval;
+
+ if (!ioapic_in_kernel(kvm) ||
+ ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
+ return;
+
+ interval = mul_u64_u32_div(val, NSEC_PER_SEC, KVM_PIT_FREQ);
+
+ pr_debug("create pit timer, interval is %llu nsec\n", interval);
+
+ /* TODO The new value only affected after the retriggered */
+ hrtimer_cancel(&ps->timer);
+ kthread_flush_work(&pit->expired);
+ ps->period = interval;
+ ps->is_periodic = is_period;
+
+ kvm_pit_reset_reinject(pit);
+
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (ps->is_periodic) {
+ s64 min_period = min_timer_period_us * 1000LL;
+
+ if (ps->period < min_period) {
+ pr_info_ratelimited(
+ "kvm: requested %lld ns "
+ "i8254 timer period limited to %lld ns\n",
+ ps->period, min_period);
+ ps->period = min_period;
+ }
+ }
+
+ hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval),
+ HRTIMER_MODE_ABS);
+}
+
+static void pit_load_count(struct kvm_pit *pit, int channel, u32 val)
+{
+ struct kvm_kpit_state *ps = &pit->pit_state;
+
+ pr_debug("load_count val is %d, channel is %d\n", val, channel);
+
+ /*
+ * The largest possible initial count is 0; this is equivalent
+ * to 216 for binary counting and 104 for BCD counting.
+ */
+ if (val == 0)
+ val = 0x10000;
+
+ ps->channels[channel].count = val;
+
+ if (channel != 0) {
+ ps->channels[channel].count_load_time = ktime_get();
+ return;
+ }
+
+ /* Two types of timer
+ * mode 1 is one shot, mode 2 is period, otherwise del timer */
+ switch (ps->channels[0].mode) {
+ case 0:
+ case 1:
+ /* FIXME: enhance mode 4 precision */
+ case 4:
+ create_pit_timer(pit, val, 0);
+ break;
+ case 2:
+ case 3:
+ create_pit_timer(pit, val, 1);
+ break;
+ default:
+ destroy_pit_timer(pit);
+ }
+}
+
+void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+ int hpet_legacy_start)
+{
+ u8 saved_mode;
+
+ WARN_ON_ONCE(!mutex_is_locked(&pit->pit_state.lock));
+
+ if (hpet_legacy_start) {
+ /* save existing mode for later reenablement */
+ WARN_ON(channel != 0);
+ saved_mode = pit->pit_state.channels[0].mode;
+ pit->pit_state.channels[0].mode = 0xff; /* disable timer */
+ pit_load_count(pit, channel, val);
+ pit->pit_state.channels[0].mode = saved_mode;
+ } else {
+ pit_load_count(pit, channel, val);
+ }
+}
+
+static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_pit, dev);
+}
+
+static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_pit, speaker_dev);
+}
+
+static inline int pit_in_range(gpa_t addr)
+{
+ return ((addr >= KVM_PIT_BASE_ADDRESS) &&
+ (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
+}
+
+static int pit_ioport_write(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
+ gpa_t addr, int len, const void *data)
+{
+ struct kvm_pit *pit = dev_to_pit(this);
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ int channel, access;
+ struct kvm_kpit_channel_state *s;
+ u32 val = *(u32 *) data;
+ if (!pit_in_range(addr))
+ return -EOPNOTSUPP;
+
+ val &= 0xff;
+ addr &= KVM_PIT_CHANNEL_MASK;
+
+ mutex_lock(&pit_state->lock);
+
+ if (val != 0)
+ pr_debug("write addr is 0x%x, len is %d, val is 0x%x\n",
+ (unsigned int)addr, len, val);
+
+ if (addr == 3) {
+ channel = val >> 6;
+ if (channel == 3) {
+ /* Read-Back Command. */
+ for (channel = 0; channel < 3; channel++) {
+ s = &pit_state->channels[channel];
+ if (val & (2 << channel)) {
+ if (!(val & 0x20))
+ pit_latch_count(pit, channel);
+ if (!(val & 0x10))
+ pit_latch_status(pit, channel);
+ }
+ }
+ } else {
+ /* Select Counter <channel>. */
+ s = &pit_state->channels[channel];
+ access = (val >> 4) & KVM_PIT_CHANNEL_MASK;
+ if (access == 0) {
+ pit_latch_count(pit, channel);
+ } else {
+ s->rw_mode = access;
+ s->read_state = access;
+ s->write_state = access;
+ s->mode = (val >> 1) & 7;
+ if (s->mode > 5)
+ s->mode -= 4;
+ s->bcd = val & 1;
+ }
+ }
+ } else {
+ /* Write Count. */
+ s = &pit_state->channels[addr];
+ switch (s->write_state) {
+ default:
+ case RW_STATE_LSB:
+ pit_load_count(pit, addr, val);
+ break;
+ case RW_STATE_MSB:
+ pit_load_count(pit, addr, val << 8);
+ break;
+ case RW_STATE_WORD0:
+ s->write_latch = val;
+ s->write_state = RW_STATE_WORD1;
+ break;
+ case RW_STATE_WORD1:
+ pit_load_count(pit, addr, s->write_latch | (val << 8));
+ s->write_state = RW_STATE_WORD0;
+ break;
+ }
+ }
+
+ mutex_unlock(&pit_state->lock);
+ return 0;
+}
+
+static int pit_ioport_read(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
+{
+ struct kvm_pit *pit = dev_to_pit(this);
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ int ret, count;
+ struct kvm_kpit_channel_state *s;
+ if (!pit_in_range(addr))
+ return -EOPNOTSUPP;
+
+ addr &= KVM_PIT_CHANNEL_MASK;
+ if (addr == 3)
+ return 0;
+
+ s = &pit_state->channels[addr];
+
+ mutex_lock(&pit_state->lock);
+
+ if (s->status_latched) {
+ s->status_latched = 0;
+ ret = s->status;
+ } else if (s->count_latched) {
+ switch (s->count_latched) {
+ default:
+ case RW_STATE_LSB:
+ ret = s->latched_count & 0xff;
+ s->count_latched = 0;
+ break;
+ case RW_STATE_MSB:
+ ret = s->latched_count >> 8;
+ s->count_latched = 0;
+ break;
+ case RW_STATE_WORD0:
+ ret = s->latched_count & 0xff;
+ s->count_latched = RW_STATE_MSB;
+ break;
+ }
+ } else {
+ switch (s->read_state) {
+ default:
+ case RW_STATE_LSB:
+ count = pit_get_count(pit, addr);
+ ret = count & 0xff;
+ break;
+ case RW_STATE_MSB:
+ count = pit_get_count(pit, addr);
+ ret = (count >> 8) & 0xff;
+ break;
+ case RW_STATE_WORD0:
+ count = pit_get_count(pit, addr);
+ ret = count & 0xff;
+ s->read_state = RW_STATE_WORD1;
+ break;
+ case RW_STATE_WORD1:
+ count = pit_get_count(pit, addr);
+ ret = (count >> 8) & 0xff;
+ s->read_state = RW_STATE_WORD0;
+ break;
+ }
+ }
+
+ if (len > sizeof(ret))
+ len = sizeof(ret);
+ memcpy(data, (char *)&ret, len);
+
+ mutex_unlock(&pit_state->lock);
+ return 0;
+}
+
+static int speaker_ioport_write(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
+ gpa_t addr, int len, const void *data)
+{
+ struct kvm_pit *pit = speaker_to_pit(this);
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ u32 val = *(u32 *) data;
+ if (addr != KVM_SPEAKER_BASE_ADDRESS)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&pit_state->lock);
+ pit_state->speaker_data_on = (val >> 1) & 1;
+ pit_set_gate(pit, 2, val & 1);
+ mutex_unlock(&pit_state->lock);
+ return 0;
+}
+
+static int speaker_ioport_read(struct kvm_vcpu *vcpu,
+ struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
+{
+ struct kvm_pit *pit = speaker_to_pit(this);
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ unsigned int refresh_clock;
+ int ret;
+ if (addr != KVM_SPEAKER_BASE_ADDRESS)
+ return -EOPNOTSUPP;
+
+ /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */
+ refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
+
+ mutex_lock(&pit_state->lock);
+ ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(pit, 2) |
+ (pit_get_out(pit, 2) << 5) | (refresh_clock << 4));
+ if (len > sizeof(ret))
+ len = sizeof(ret);
+ memcpy(data, (char *)&ret, len);
+ mutex_unlock(&pit_state->lock);
+ return 0;
+}
+
+static void kvm_pit_reset(struct kvm_pit *pit)
+{
+ int i;
+ struct kvm_kpit_channel_state *c;
+
+ pit->pit_state.flags = 0;
+ for (i = 0; i < 3; i++) {
+ c = &pit->pit_state.channels[i];
+ c->mode = 0xff;
+ c->gate = (i != 2);
+ pit_load_count(pit, i, 0);
+ }
+
+ kvm_pit_reset_reinject(pit);
+}
+
+static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
+{
+ struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
+
+ if (!mask)
+ kvm_pit_reset_reinject(pit);
+}
+
+static const struct kvm_io_device_ops pit_dev_ops = {
+ .read = pit_ioport_read,
+ .write = pit_ioport_write,
+};
+
+static const struct kvm_io_device_ops speaker_dev_ops = {
+ .read = speaker_ioport_read,
+ .write = speaker_ioport_write,
+};
+
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
+{
+ struct kvm_pit *pit;
+ struct kvm_kpit_state *pit_state;
+ struct pid *pid;
+ pid_t pid_nr;
+ int ret;
+
+ pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
+ if (!pit)
+ return NULL;
+
+ pit->irq_source_id = kvm_request_irq_source_id(kvm);
+ if (pit->irq_source_id < 0)
+ goto fail_request;
+
+ mutex_init(&pit->pit_state.lock);
+
+ pid = get_pid(task_tgid(current));
+ pid_nr = pid_vnr(pid);
+ put_pid(pid);
+
+ pit->worker = kthread_create_worker(0, "kvm-pit/%d", pid_nr);
+ if (IS_ERR(pit->worker))
+ goto fail_kthread;
+
+ kthread_init_work(&pit->expired, pit_do_work);
+
+ pit->kvm = kvm;
+
+ pit_state = &pit->pit_state;
+ hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ pit_state->timer.function = pit_timer_fn;
+
+ pit_state->irq_ack_notifier.gsi = 0;
+ pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
+ pit->mask_notifier.func = pit_mask_notifer;
+
+ kvm_pit_reset(pit);
+
+ kvm_pit_set_reinject(pit, true);
+
+ mutex_lock(&kvm->slots_lock);
+ kvm_iodevice_init(&pit->dev, &pit_dev_ops);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
+ KVM_PIT_MEM_LENGTH, &pit->dev);
+ if (ret < 0)
+ goto fail_register_pit;
+
+ if (flags & KVM_PIT_SPEAKER_DUMMY) {
+ kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
+ KVM_SPEAKER_BASE_ADDRESS, 4,
+ &pit->speaker_dev);
+ if (ret < 0)
+ goto fail_register_speaker;
+ }
+ mutex_unlock(&kvm->slots_lock);
+
+ return pit;
+
+fail_register_speaker:
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
+fail_register_pit:
+ mutex_unlock(&kvm->slots_lock);
+ kvm_pit_set_reinject(pit, false);
+ kthread_destroy_worker(pit->worker);
+fail_kthread:
+ kvm_free_irq_source_id(kvm, pit->irq_source_id);
+fail_request:
+ kfree(pit);
+ return NULL;
+}
+
+void kvm_free_pit(struct kvm *kvm)
+{
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ if (pit) {
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev);
+ mutex_unlock(&kvm->slots_lock);
+ kvm_pit_set_reinject(pit, false);
+ hrtimer_cancel(&pit->pit_state.timer);
+ kthread_destroy_worker(pit->worker);
+ kvm_free_irq_source_id(kvm, pit->irq_source_id);
+ kfree(pit);
+ }
+}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
new file mode 100644
index 000000000..394d9527d
--- /dev/null
+++ b/arch/x86/kvm/i8254.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __I8254_H
+#define __I8254_H
+
+#include <linux/kthread.h>
+
+#include <kvm/iodev.h>
+
+struct kvm_kpit_channel_state {
+ u32 count; /* can be 65536 */
+ u16 latched_count;
+ u8 count_latched;
+ u8 status_latched;
+ u8 status;
+ u8 read_state;
+ u8 write_state;
+ u8 write_latch;
+ u8 rw_mode;
+ u8 mode;
+ u8 bcd; /* not supported */
+ u8 gate; /* timer start */
+ ktime_t count_load_time;
+};
+
+struct kvm_kpit_state {
+ /* All members before "struct mutex lock" are protected by the lock. */
+ struct kvm_kpit_channel_state channels[3];
+ u32 flags;
+ bool is_periodic;
+ s64 period; /* unit: ns */
+ struct hrtimer timer;
+ u32 speaker_data_on;
+
+ struct mutex lock;
+ atomic_t reinject;
+ atomic_t pending; /* accumulated triggered timers */
+ atomic_t irq_ack;
+ struct kvm_irq_ack_notifier irq_ack_notifier;
+};
+
+struct kvm_pit {
+ struct kvm_io_device dev;
+ struct kvm_io_device speaker_dev;
+ struct kvm *kvm;
+ struct kvm_kpit_state pit_state;
+ int irq_source_id;
+ struct kvm_irq_mask_notifier mask_notifier;
+ struct kthread_worker *worker;
+ struct kthread_work expired;
+};
+
+#define KVM_PIT_BASE_ADDRESS 0x40
+#define KVM_SPEAKER_BASE_ADDRESS 0x61
+#define KVM_PIT_MEM_LENGTH 4
+#define KVM_PIT_FREQ 1193181
+#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
+#define KVM_PIT_CHANNEL_MASK 0x3
+
+struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
+void kvm_free_pit(struct kvm *kvm);
+
+void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+ int hpet_legacy_start);
+void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject);
+
+#endif
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
new file mode 100644
index 000000000..38a36a1cc
--- /dev/null
+++ b/arch/x86/kvm/i8259.c
@@ -0,0 +1,655 @@
+/*
+ * 8259 interrupt controller emulation
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ * Copyright (c) 2007 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ * Port from Qemu.
+ */
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include "irq.h"
+
+#include <linux/kvm_host.h>
+#include "trace.h"
+
+#define pr_pic_unimpl(fmt, ...) \
+ pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__)
+
+static void pic_irq_request(struct kvm *kvm, int level);
+
+static void pic_lock(struct kvm_pic *s)
+ __acquires(&s->lock)
+{
+ spin_lock(&s->lock);
+}
+
+static void pic_unlock(struct kvm_pic *s)
+ __releases(&s->lock)
+{
+ bool wakeup = s->wakeup_needed;
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ s->wakeup_needed = false;
+
+ spin_unlock(&s->lock);
+
+ if (wakeup) {
+ kvm_for_each_vcpu(i, vcpu, s->kvm) {
+ if (kvm_apic_accept_pic_intr(vcpu)) {
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ return;
+ }
+ }
+ }
+}
+
+static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
+{
+ s->isr &= ~(1 << irq);
+ if (s != &s->pics_state->pics[0])
+ irq += 8;
+ /*
+ * We are dropping lock while calling ack notifiers since ack
+ * notifier callbacks for assigned devices call into PIC recursively.
+ * Other interrupt may be delivered to PIC while lock is dropped but
+ * it should be safe since PIC state is already updated at this stage.
+ */
+ pic_unlock(s->pics_state);
+ kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
+ pic_lock(s->pics_state);
+}
+
+/*
+ * set irq level. If an edge is detected, then the IRR is set to 1
+ */
+static inline int pic_set_irq1(struct kvm_kpic_state *s, int irq, int level)
+{
+ int mask, ret = 1;
+ mask = 1 << irq;
+ if (s->elcr & mask) /* level triggered */
+ if (level) {
+ ret = !(s->irr & mask);
+ s->irr |= mask;
+ s->last_irr |= mask;
+ } else {
+ s->irr &= ~mask;
+ s->last_irr &= ~mask;
+ }
+ else /* edge triggered */
+ if (level) {
+ if ((s->last_irr & mask) == 0) {
+ ret = !(s->irr & mask);
+ s->irr |= mask;
+ }
+ s->last_irr |= mask;
+ } else
+ s->last_irr &= ~mask;
+
+ return (s->imr & mask) ? -1 : ret;
+}
+
+/*
+ * return the highest priority found in mask (highest = smallest
+ * number). Return 8 if no irq
+ */
+static inline int get_priority(struct kvm_kpic_state *s, int mask)
+{
+ int priority;
+ if (mask == 0)
+ return 8;
+ priority = 0;
+ while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0)
+ priority++;
+ return priority;
+}
+
+/*
+ * return the pic wanted interrupt. return -1 if none
+ */
+static int pic_get_irq(struct kvm_kpic_state *s)
+{
+ int mask, cur_priority, priority;
+
+ mask = s->irr & ~s->imr;
+ priority = get_priority(s, mask);
+ if (priority == 8)
+ return -1;
+ /*
+ * compute current priority. If special fully nested mode on the
+ * master, the IRQ coming from the slave is not taken into account
+ * for the priority computation.
+ */
+ mask = s->isr;
+ if (s->special_fully_nested_mode && s == &s->pics_state->pics[0])
+ mask &= ~(1 << 2);
+ cur_priority = get_priority(s, mask);
+ if (priority < cur_priority)
+ /*
+ * higher priority found: an irq should be generated
+ */
+ return (priority + s->priority_add) & 7;
+ else
+ return -1;
+}
+
+/*
+ * raise irq to CPU if necessary. must be called every time the active
+ * irq may change
+ */
+static void pic_update_irq(struct kvm_pic *s)
+{
+ int irq2, irq;
+
+ irq2 = pic_get_irq(&s->pics[1]);
+ if (irq2 >= 0) {
+ /*
+ * if irq request by slave pic, signal master PIC
+ */
+ pic_set_irq1(&s->pics[0], 2, 1);
+ pic_set_irq1(&s->pics[0], 2, 0);
+ }
+ irq = pic_get_irq(&s->pics[0]);
+ pic_irq_request(s->kvm, irq >= 0);
+}
+
+void kvm_pic_update_irq(struct kvm_pic *s)
+{
+ pic_lock(s);
+ pic_update_irq(s);
+ pic_unlock(s);
+}
+
+int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
+{
+ int ret, irq_level;
+
+ BUG_ON(irq < 0 || irq >= PIC_NUM_PINS);
+
+ pic_lock(s);
+ irq_level = __kvm_irq_line_state(&s->irq_states[irq],
+ irq_source_id, level);
+ ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
+ pic_update_irq(s);
+ trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
+ s->pics[irq >> 3].imr, ret == 0);
+ pic_unlock(s);
+
+ return ret;
+}
+
+void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id)
+{
+ int i;
+
+ pic_lock(s);
+ for (i = 0; i < PIC_NUM_PINS; i++)
+ __clear_bit(irq_source_id, &s->irq_states[i]);
+ pic_unlock(s);
+}
+
+/*
+ * acknowledge interrupt 'irq'
+ */
+static inline void pic_intack(struct kvm_kpic_state *s, int irq)
+{
+ s->isr |= 1 << irq;
+ /*
+ * We don't clear a level sensitive interrupt here
+ */
+ if (!(s->elcr & (1 << irq)))
+ s->irr &= ~(1 << irq);
+
+ if (s->auto_eoi) {
+ if (s->rotate_on_auto_eoi)
+ s->priority_add = (irq + 1) & 7;
+ pic_clear_isr(s, irq);
+ }
+
+}
+
+int kvm_pic_read_irq(struct kvm *kvm)
+{
+ int irq, irq2, intno;
+ struct kvm_pic *s = kvm->arch.vpic;
+
+ s->output = 0;
+
+ pic_lock(s);
+ irq = pic_get_irq(&s->pics[0]);
+ if (irq >= 0) {
+ pic_intack(&s->pics[0], irq);
+ if (irq == 2) {
+ irq2 = pic_get_irq(&s->pics[1]);
+ if (irq2 >= 0)
+ pic_intack(&s->pics[1], irq2);
+ else
+ /*
+ * spurious IRQ on slave controller
+ */
+ irq2 = 7;
+ intno = s->pics[1].irq_base + irq2;
+ irq = irq2 + 8;
+ } else
+ intno = s->pics[0].irq_base + irq;
+ } else {
+ /*
+ * spurious IRQ on host controller
+ */
+ irq = 7;
+ intno = s->pics[0].irq_base + irq;
+ }
+ pic_update_irq(s);
+ pic_unlock(s);
+
+ return intno;
+}
+
+static void kvm_pic_reset(struct kvm_kpic_state *s)
+{
+ int irq, i;
+ struct kvm_vcpu *vcpu;
+ u8 edge_irr = s->irr & ~s->elcr;
+ bool found = false;
+
+ s->last_irr = 0;
+ s->irr &= s->elcr;
+ s->imr = 0;
+ s->priority_add = 0;
+ s->special_mask = 0;
+ s->read_reg_select = 0;
+ if (!s->init4) {
+ s->special_fully_nested_mode = 0;
+ s->auto_eoi = 0;
+ }
+ s->init_state = 1;
+
+ kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
+ if (kvm_apic_accept_pic_intr(vcpu)) {
+ found = true;
+ break;
+ }
+
+
+ if (!found)
+ return;
+
+ for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+ if (edge_irr & (1 << irq))
+ pic_clear_isr(s, irq);
+}
+
+static void pic_ioport_write(void *opaque, u32 addr, u32 val)
+{
+ struct kvm_kpic_state *s = opaque;
+ int priority, cmd, irq;
+
+ addr &= 1;
+ if (addr == 0) {
+ if (val & 0x10) {
+ s->init4 = val & 1;
+ if (val & 0x02)
+ pr_pic_unimpl("single mode not supported");
+ if (val & 0x08)
+ pr_pic_unimpl(
+ "level sensitive irq not supported");
+ kvm_pic_reset(s);
+ } else if (val & 0x08) {
+ if (val & 0x04)
+ s->poll = 1;
+ if (val & 0x02)
+ s->read_reg_select = val & 1;
+ if (val & 0x40)
+ s->special_mask = (val >> 5) & 1;
+ } else {
+ cmd = val >> 5;
+ switch (cmd) {
+ case 0:
+ case 4:
+ s->rotate_on_auto_eoi = cmd >> 2;
+ break;
+ case 1: /* end of interrupt */
+ case 5:
+ priority = get_priority(s, s->isr);
+ if (priority != 8) {
+ irq = (priority + s->priority_add) & 7;
+ if (cmd == 5)
+ s->priority_add = (irq + 1) & 7;
+ pic_clear_isr(s, irq);
+ pic_update_irq(s->pics_state);
+ }
+ break;
+ case 3:
+ irq = val & 7;
+ pic_clear_isr(s, irq);
+ pic_update_irq(s->pics_state);
+ break;
+ case 6:
+ s->priority_add = (val + 1) & 7;
+ pic_update_irq(s->pics_state);
+ break;
+ case 7:
+ irq = val & 7;
+ s->priority_add = (irq + 1) & 7;
+ pic_clear_isr(s, irq);
+ pic_update_irq(s->pics_state);
+ break;
+ default:
+ break; /* no operation */
+ }
+ }
+ } else
+ switch (s->init_state) {
+ case 0: { /* normal mode */
+ u8 imr_diff = s->imr ^ val,
+ off = (s == &s->pics_state->pics[0]) ? 0 : 8;
+ s->imr = val;
+ for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+ if (imr_diff & (1 << irq))
+ kvm_fire_mask_notifiers(
+ s->pics_state->kvm,
+ SELECT_PIC(irq + off),
+ irq + off,
+ !!(s->imr & (1 << irq)));
+ pic_update_irq(s->pics_state);
+ break;
+ }
+ case 1:
+ s->irq_base = val & 0xf8;
+ s->init_state = 2;
+ break;
+ case 2:
+ if (s->init4)
+ s->init_state = 3;
+ else
+ s->init_state = 0;
+ break;
+ case 3:
+ s->special_fully_nested_mode = (val >> 4) & 1;
+ s->auto_eoi = (val >> 1) & 1;
+ s->init_state = 0;
+ break;
+ }
+}
+
+static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
+{
+ int ret;
+
+ ret = pic_get_irq(s);
+ if (ret >= 0) {
+ if (addr1 >> 7) {
+ s->pics_state->pics[0].isr &= ~(1 << 2);
+ s->pics_state->pics[0].irr &= ~(1 << 2);
+ }
+ s->irr &= ~(1 << ret);
+ pic_clear_isr(s, ret);
+ if (addr1 >> 7 || ret != 2)
+ pic_update_irq(s->pics_state);
+ } else {
+ ret = 0x07;
+ pic_update_irq(s->pics_state);
+ }
+
+ return ret;
+}
+
+static u32 pic_ioport_read(void *opaque, u32 addr)
+{
+ struct kvm_kpic_state *s = opaque;
+ int ret;
+
+ if (s->poll) {
+ ret = pic_poll_read(s, addr);
+ s->poll = 0;
+ } else
+ if ((addr & 1) == 0)
+ if (s->read_reg_select)
+ ret = s->isr;
+ else
+ ret = s->irr;
+ else
+ ret = s->imr;
+ return ret;
+}
+
+static void elcr_ioport_write(void *opaque, u32 addr, u32 val)
+{
+ struct kvm_kpic_state *s = opaque;
+ s->elcr = val & s->elcr_mask;
+}
+
+static u32 elcr_ioport_read(void *opaque, u32 addr1)
+{
+ struct kvm_kpic_state *s = opaque;
+ return s->elcr;
+}
+
+static int picdev_write(struct kvm_pic *s,
+ gpa_t addr, int len, const void *val)
+{
+ unsigned char data = *(unsigned char *)val;
+
+ if (len != 1) {
+ pr_pic_unimpl("non byte write\n");
+ return 0;
+ }
+ switch (addr) {
+ case 0x20:
+ case 0x21:
+ pic_lock(s);
+ pic_ioport_write(&s->pics[0], addr, data);
+ pic_unlock(s);
+ break;
+ case 0xa0:
+ case 0xa1:
+ pic_lock(s);
+ pic_ioport_write(&s->pics[1], addr, data);
+ pic_unlock(s);
+ break;
+ case 0x4d0:
+ case 0x4d1:
+ pic_lock(s);
+ elcr_ioport_write(&s->pics[addr & 1], addr, data);
+ pic_unlock(s);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
+static int picdev_read(struct kvm_pic *s,
+ gpa_t addr, int len, void *val)
+{
+ unsigned char *data = (unsigned char *)val;
+
+ if (len != 1) {
+ memset(val, 0, len);
+ pr_pic_unimpl("non byte read\n");
+ return 0;
+ }
+ switch (addr) {
+ case 0x20:
+ case 0x21:
+ case 0xa0:
+ case 0xa1:
+ pic_lock(s);
+ *data = pic_ioport_read(&s->pics[addr >> 7], addr);
+ pic_unlock(s);
+ break;
+ case 0x4d0:
+ case 0x4d1:
+ pic_lock(s);
+ *data = elcr_ioport_read(&s->pics[addr & 1], addr);
+ pic_unlock(s);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
+static int picdev_master_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ return picdev_write(container_of(dev, struct kvm_pic, dev_master),
+ addr, len, val);
+}
+
+static int picdev_master_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ return picdev_read(container_of(dev, struct kvm_pic, dev_master),
+ addr, len, val);
+}
+
+static int picdev_slave_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ return picdev_write(container_of(dev, struct kvm_pic, dev_slave),
+ addr, len, val);
+}
+
+static int picdev_slave_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ return picdev_read(container_of(dev, struct kvm_pic, dev_slave),
+ addr, len, val);
+}
+
+static int picdev_eclr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ return picdev_write(container_of(dev, struct kvm_pic, dev_eclr),
+ addr, len, val);
+}
+
+static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ return picdev_read(container_of(dev, struct kvm_pic, dev_eclr),
+ addr, len, val);
+}
+
+/*
+ * callback when PIC0 irq status changed
+ */
+static void pic_irq_request(struct kvm *kvm, int level)
+{
+ struct kvm_pic *s = kvm->arch.vpic;
+
+ if (!s->output)
+ s->wakeup_needed = true;
+ s->output = level;
+}
+
+static const struct kvm_io_device_ops picdev_master_ops = {
+ .read = picdev_master_read,
+ .write = picdev_master_write,
+};
+
+static const struct kvm_io_device_ops picdev_slave_ops = {
+ .read = picdev_slave_read,
+ .write = picdev_slave_write,
+};
+
+static const struct kvm_io_device_ops picdev_eclr_ops = {
+ .read = picdev_eclr_read,
+ .write = picdev_eclr_write,
+};
+
+int kvm_pic_init(struct kvm *kvm)
+{
+ struct kvm_pic *s;
+ int ret;
+
+ s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+ spin_lock_init(&s->lock);
+ s->kvm = kvm;
+ s->pics[0].elcr_mask = 0xf8;
+ s->pics[1].elcr_mask = 0xde;
+ s->pics[0].pics_state = s;
+ s->pics[1].pics_state = s;
+
+ /*
+ * Initialize PIO device
+ */
+ kvm_iodevice_init(&s->dev_master, &picdev_master_ops);
+ kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops);
+ kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops);
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2,
+ &s->dev_master);
+ if (ret < 0)
+ goto fail_unlock;
+
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0xa0, 2, &s->dev_slave);
+ if (ret < 0)
+ goto fail_unreg_2;
+
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr);
+ if (ret < 0)
+ goto fail_unreg_1;
+
+ mutex_unlock(&kvm->slots_lock);
+
+ kvm->arch.vpic = s;
+
+ return 0;
+
+fail_unreg_1:
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave);
+
+fail_unreg_2:
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_master);
+
+fail_unlock:
+ mutex_unlock(&kvm->slots_lock);
+
+ kfree(s);
+
+ return ret;
+}
+
+void kvm_pic_destroy(struct kvm *kvm)
+{
+ struct kvm_pic *vpic = kvm->arch.vpic;
+
+ if (!vpic)
+ return;
+
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master);
+ kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave);
+ kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr);
+ mutex_unlock(&kvm->slots_lock);
+
+ kvm->arch.vpic = NULL;
+ kfree(vpic);
+}
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
new file mode 100644
index 000000000..bac2ec9b4
--- /dev/null
+++ b/arch/x86/kvm/ioapic.c
@@ -0,0 +1,685 @@
+/*
+ * Copyright (C) 2001 MandrakeSoft S.A.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * MandrakeSoft S.A.
+ * 43, rue d'Aboukir
+ * 75002 Paris - France
+ * http://www.linux-mandrake.com/
+ * http://www.mandrakesoft.com/
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Yunhong Jiang <yunhong.jiang@intel.com>
+ * Yaozu (Eddie) Dong <eddie.dong@intel.com>
+ * Based on Xen 3.1 code.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/nospec.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <trace/events/kvm.h>
+
+#include "ioapic.h"
+#include "lapic.h"
+#include "irq.h"
+
+#if 0
+#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
+#else
+#define ioapic_debug(fmt, arg...)
+#endif
+static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
+ bool line_status);
+
+static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
+ unsigned long addr,
+ unsigned long length)
+{
+ unsigned long result = 0;
+
+ switch (ioapic->ioregsel) {
+ case IOAPIC_REG_VERSION:
+ result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
+ | (IOAPIC_VERSION_ID & 0xff));
+ break;
+
+ case IOAPIC_REG_APIC_ID:
+ case IOAPIC_REG_ARB_ID:
+ result = ((ioapic->id & 0xf) << 24);
+ break;
+
+ default:
+ {
+ u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
+ u64 redir_content = ~0ULL;
+
+ if (redir_index < IOAPIC_NUM_PINS) {
+ u32 index = array_index_nospec(
+ redir_index, IOAPIC_NUM_PINS);
+
+ redir_content = ioapic->redirtbl[index].bits;
+ }
+
+ result = (ioapic->ioregsel & 0x1) ?
+ (redir_content >> 32) & 0xffffffff :
+ redir_content & 0xffffffff;
+ break;
+ }
+ }
+
+ return result;
+}
+
+static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
+{
+ ioapic->rtc_status.pending_eoi = 0;
+ bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID);
+}
+
+static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic);
+
+static void rtc_status_pending_eoi_check_valid(struct kvm_ioapic *ioapic)
+{
+ if (WARN_ON(ioapic->rtc_status.pending_eoi < 0))
+ kvm_rtc_eoi_tracking_restore_all(ioapic);
+}
+
+static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
+{
+ bool new_val, old_val;
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
+ union kvm_ioapic_redirect_entry *e;
+
+ e = &ioapic->redirtbl[RTC_GSI];
+ if (!kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id,
+ e->fields.dest_mode))
+ return;
+
+ new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector);
+ old_val = test_bit(vcpu->vcpu_id, dest_map->map);
+
+ if (new_val == old_val)
+ return;
+
+ if (new_val) {
+ __set_bit(vcpu->vcpu_id, dest_map->map);
+ dest_map->vectors[vcpu->vcpu_id] = e->fields.vector;
+ ioapic->rtc_status.pending_eoi++;
+ } else {
+ __clear_bit(vcpu->vcpu_id, dest_map->map);
+ ioapic->rtc_status.pending_eoi--;
+ rtc_status_pending_eoi_check_valid(ioapic);
+ }
+}
+
+void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
+{
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ __rtc_irq_eoi_tracking_restore_one(vcpu);
+ spin_unlock(&ioapic->lock);
+}
+
+static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ if (RTC_GSI >= IOAPIC_NUM_PINS)
+ return;
+
+ rtc_irq_eoi_tracking_reset(ioapic);
+ kvm_for_each_vcpu(i, vcpu, ioapic->kvm)
+ __rtc_irq_eoi_tracking_restore_one(vcpu);
+}
+
+static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu)
+{
+ if (test_and_clear_bit(vcpu->vcpu_id,
+ ioapic->rtc_status.dest_map.map)) {
+ --ioapic->rtc_status.pending_eoi;
+ rtc_status_pending_eoi_check_valid(ioapic);
+ }
+}
+
+static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic)
+{
+ if (ioapic->rtc_status.pending_eoi > 0)
+ return true; /* coalesced */
+
+ return false;
+}
+
+static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
+ int irq_level, bool line_status)
+{
+ union kvm_ioapic_redirect_entry entry;
+ u32 mask = 1 << irq;
+ u32 old_irr;
+ int edge, ret;
+
+ entry = ioapic->redirtbl[irq];
+ edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+
+ if (!irq_level) {
+ ioapic->irr &= ~mask;
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * Return 0 for coalesced interrupts; for edge-triggered interrupts,
+ * this only happens if a previous edge has not been delivered due
+ * do masking. For level interrupts, the remote_irr field tells
+ * us if the interrupt is waiting for an EOI.
+ *
+ * RTC is special: it is edge-triggered, but userspace likes to know
+ * if it has been already ack-ed via EOI because coalesced RTC
+ * interrupts lead to time drift in Windows guests. So we track
+ * EOI manually for the RTC interrupt.
+ */
+ if (irq == RTC_GSI && line_status &&
+ rtc_irq_check_coalesced(ioapic)) {
+ ret = 0;
+ goto out;
+ }
+
+ old_irr = ioapic->irr;
+ ioapic->irr |= mask;
+ if (edge) {
+ ioapic->irr_delivered &= ~mask;
+ if (old_irr == ioapic->irr) {
+ ret = 0;
+ goto out;
+ }
+ }
+
+ ret = ioapic_service(ioapic, irq, line_status);
+
+out:
+ trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
+ return ret;
+}
+
+static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
+{
+ u32 idx;
+
+ rtc_irq_eoi_tracking_reset(ioapic);
+ for_each_set_bit(idx, &irr, IOAPIC_NUM_PINS)
+ ioapic_set_irq(ioapic, idx, 1, true);
+
+ kvm_rtc_eoi_tracking_restore_all(ioapic);
+}
+
+
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
+{
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
+ union kvm_ioapic_redirect_entry *e;
+ int index;
+
+ spin_lock(&ioapic->lock);
+
+ /* Make sure we see any missing RTC EOI */
+ if (test_bit(vcpu->vcpu_id, dest_map->map))
+ __set_bit(dest_map->vectors[vcpu->vcpu_id],
+ ioapic_handled_vectors);
+
+ for (index = 0; index < IOAPIC_NUM_PINS; index++) {
+ e = &ioapic->redirtbl[index];
+ if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
+ kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) ||
+ index == RTC_GSI) {
+ if (kvm_apic_match_dest(vcpu, NULL, 0,
+ e->fields.dest_id, e->fields.dest_mode) ||
+ kvm_apic_pending_eoi(vcpu, e->fields.vector))
+ __set_bit(e->fields.vector,
+ ioapic_handled_vectors);
+ }
+ }
+ spin_unlock(&ioapic->lock);
+}
+
+void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
+{
+ if (!ioapic_in_kernel(kvm))
+ return;
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
+{
+ unsigned index;
+ bool mask_before, mask_after;
+ int old_remote_irr, old_delivery_status;
+ union kvm_ioapic_redirect_entry *e;
+
+ switch (ioapic->ioregsel) {
+ case IOAPIC_REG_VERSION:
+ /* Writes are ignored. */
+ break;
+
+ case IOAPIC_REG_APIC_ID:
+ ioapic->id = (val >> 24) & 0xf;
+ break;
+
+ case IOAPIC_REG_ARB_ID:
+ break;
+
+ default:
+ index = (ioapic->ioregsel - 0x10) >> 1;
+
+ ioapic_debug("change redir index %x val %x\n", index, val);
+ if (index >= IOAPIC_NUM_PINS)
+ return;
+ index = array_index_nospec(index, IOAPIC_NUM_PINS);
+ e = &ioapic->redirtbl[index];
+ mask_before = e->fields.mask;
+ /* Preserve read-only fields */
+ old_remote_irr = e->fields.remote_irr;
+ old_delivery_status = e->fields.delivery_status;
+ if (ioapic->ioregsel & 1) {
+ e->bits &= 0xffffffff;
+ e->bits |= (u64) val << 32;
+ } else {
+ e->bits &= ~0xffffffffULL;
+ e->bits |= (u32) val;
+ }
+ e->fields.remote_irr = old_remote_irr;
+ e->fields.delivery_status = old_delivery_status;
+
+ /*
+ * Some OSes (Linux, Xen) assume that Remote IRR bit will
+ * be cleared by IOAPIC hardware when the entry is configured
+ * as edge-triggered. This behavior is used to simulate an
+ * explicit EOI on IOAPICs that don't have the EOI register.
+ */
+ if (e->fields.trig_mode == IOAPIC_EDGE_TRIG)
+ e->fields.remote_irr = 0;
+
+ mask_after = e->fields.mask;
+ if (mask_before != mask_after)
+ kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
+ if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
+ && ioapic->irr & (1 << index))
+ ioapic_service(ioapic, index, false);
+ kvm_make_scan_ioapic_request(ioapic->kvm);
+ break;
+ }
+}
+
+static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
+{
+ union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
+ struct kvm_lapic_irq irqe;
+ int ret;
+
+ if (entry->fields.mask ||
+ (entry->fields.trig_mode == IOAPIC_LEVEL_TRIG &&
+ entry->fields.remote_irr))
+ return -1;
+
+ ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
+ "vector=%x trig_mode=%x\n",
+ entry->fields.dest_id, entry->fields.dest_mode,
+ entry->fields.delivery_mode, entry->fields.vector,
+ entry->fields.trig_mode);
+
+ irqe.dest_id = entry->fields.dest_id;
+ irqe.vector = entry->fields.vector;
+ irqe.dest_mode = entry->fields.dest_mode;
+ irqe.trig_mode = entry->fields.trig_mode;
+ irqe.delivery_mode = entry->fields.delivery_mode << 8;
+ irqe.level = 1;
+ irqe.shorthand = 0;
+ irqe.msi_redir_hint = false;
+
+ if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
+ ioapic->irr_delivered |= 1 << irq;
+
+ if (irq == RTC_GSI && line_status) {
+ /*
+ * pending_eoi cannot ever become negative (see
+ * rtc_status_pending_eoi_check_valid) and the caller
+ * ensures that it is only called if it is >= zero, namely
+ * if rtc_irq_check_coalesced returns false).
+ */
+ BUG_ON(ioapic->rtc_status.pending_eoi != 0);
+ ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
+ &ioapic->rtc_status.dest_map);
+ ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret);
+ } else
+ ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL);
+
+ if (ret && irqe.trig_mode == IOAPIC_LEVEL_TRIG)
+ entry->fields.remote_irr = 1;
+
+ return ret;
+}
+
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
+ int level, bool line_status)
+{
+ int ret, irq_level;
+
+ BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
+
+ spin_lock(&ioapic->lock);
+ irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
+ irq_source_id, level);
+ ret = ioapic_set_irq(ioapic, irq, irq_level, line_status);
+
+ spin_unlock(&ioapic->lock);
+
+ return ret;
+}
+
+void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
+{
+ int i;
+
+ spin_lock(&ioapic->lock);
+ for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
+ __clear_bit(irq_source_id, &ioapic->irq_states[i]);
+ spin_unlock(&ioapic->lock);
+}
+
+static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
+{
+ int i;
+ struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
+ eoi_inject.work);
+ spin_lock(&ioapic->lock);
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+ if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG)
+ continue;
+
+ if (ioapic->irr & (1 << i) && !ent->fields.remote_irr)
+ ioapic_service(ioapic, i, false);
+ }
+ spin_unlock(&ioapic->lock);
+}
+
+#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000
+
+static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
+ struct kvm_ioapic *ioapic, int vector, int trigger_mode)
+{
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ int i;
+
+ /* RTC special handling */
+ if (test_bit(vcpu->vcpu_id, dest_map->map) &&
+ vector == dest_map->vectors[vcpu->vcpu_id])
+ rtc_irq_eoi(ioapic, vcpu);
+
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+ if (ent->fields.vector != vector)
+ continue;
+
+ /*
+ * We are dropping lock while calling ack notifiers because ack
+ * notifier callbacks for assigned devices call into IOAPIC
+ * recursively. Since remote_irr is cleared only after call
+ * to notifiers if the same vector will be delivered while lock
+ * is dropped it will be put into irr and will be delivered
+ * after ack notifier returns.
+ */
+ spin_unlock(&ioapic->lock);
+ kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i);
+ spin_lock(&ioapic->lock);
+
+ if (trigger_mode != IOAPIC_LEVEL_TRIG ||
+ kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)
+ continue;
+
+ ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
+ ent->fields.remote_irr = 0;
+ if (!ent->fields.mask && (ioapic->irr & (1 << i))) {
+ ++ioapic->irq_eoi[i];
+ if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
+ /*
+ * Real hardware does not deliver the interrupt
+ * immediately during eoi broadcast, and this
+ * lets a buggy guest make slow progress
+ * even if it does not correctly handle a
+ * level-triggered interrupt. Emulate this
+ * behavior if we detect an interrupt storm.
+ */
+ schedule_delayed_work(&ioapic->eoi_inject, HZ / 100);
+ ioapic->irq_eoi[i] = 0;
+ trace_kvm_ioapic_delayed_eoi_inj(ent->bits);
+ } else {
+ ioapic_service(ioapic, i, false);
+ }
+ } else {
+ ioapic->irq_eoi[i] = 0;
+ }
+ }
+}
+
+void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode)
+{
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ __kvm_ioapic_update_eoi(vcpu, ioapic, vector, trigger_mode);
+ spin_unlock(&ioapic->lock);
+}
+
+static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_ioapic, dev);
+}
+
+static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr)
+{
+ return ((addr >= ioapic->base_address &&
+ (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
+}
+
+static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+ gpa_t addr, int len, void *val)
+{
+ struct kvm_ioapic *ioapic = to_ioapic(this);
+ u32 result;
+ if (!ioapic_in_range(ioapic, addr))
+ return -EOPNOTSUPP;
+
+ ioapic_debug("addr %lx\n", (unsigned long)addr);
+ ASSERT(!(addr & 0xf)); /* check alignment */
+
+ addr &= 0xff;
+ spin_lock(&ioapic->lock);
+ switch (addr) {
+ case IOAPIC_REG_SELECT:
+ result = ioapic->ioregsel;
+ break;
+
+ case IOAPIC_REG_WINDOW:
+ result = ioapic_read_indirect(ioapic, addr, len);
+ break;
+
+ default:
+ result = 0;
+ break;
+ }
+ spin_unlock(&ioapic->lock);
+
+ switch (len) {
+ case 8:
+ *(u64 *) val = result;
+ break;
+ case 1:
+ case 2:
+ case 4:
+ memcpy(val, (char *)&result, len);
+ break;
+ default:
+ printk(KERN_WARNING "ioapic: wrong length %d\n", len);
+ }
+ return 0;
+}
+
+static int ioapic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+ gpa_t addr, int len, const void *val)
+{
+ struct kvm_ioapic *ioapic = to_ioapic(this);
+ u32 data;
+ if (!ioapic_in_range(ioapic, addr))
+ return -EOPNOTSUPP;
+
+ ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
+ (void*)addr, len, val);
+ ASSERT(!(addr & 0xf)); /* check alignment */
+
+ switch (len) {
+ case 8:
+ case 4:
+ data = *(u32 *) val;
+ break;
+ case 2:
+ data = *(u16 *) val;
+ break;
+ case 1:
+ data = *(u8 *) val;
+ break;
+ default:
+ printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
+ return 0;
+ }
+
+ addr &= 0xff;
+ spin_lock(&ioapic->lock);
+ switch (addr) {
+ case IOAPIC_REG_SELECT:
+ ioapic->ioregsel = data & 0xFF; /* 8-bit register */
+ break;
+
+ case IOAPIC_REG_WINDOW:
+ ioapic_write_indirect(ioapic, data);
+ break;
+
+ default:
+ break;
+ }
+ spin_unlock(&ioapic->lock);
+ return 0;
+}
+
+static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
+{
+ int i;
+
+ cancel_delayed_work_sync(&ioapic->eoi_inject);
+ for (i = 0; i < IOAPIC_NUM_PINS; i++)
+ ioapic->redirtbl[i].fields.mask = 1;
+ ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
+ ioapic->ioregsel = 0;
+ ioapic->irr = 0;
+ ioapic->irr_delivered = 0;
+ ioapic->id = 0;
+ memset(ioapic->irq_eoi, 0x00, sizeof(ioapic->irq_eoi));
+ rtc_irq_eoi_tracking_reset(ioapic);
+}
+
+static const struct kvm_io_device_ops ioapic_mmio_ops = {
+ .read = ioapic_mmio_read,
+ .write = ioapic_mmio_write,
+};
+
+int kvm_ioapic_init(struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic;
+ int ret;
+
+ ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
+ if (!ioapic)
+ return -ENOMEM;
+ spin_lock_init(&ioapic->lock);
+ INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
+ kvm->arch.vioapic = ioapic;
+ kvm_ioapic_reset(ioapic);
+ kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
+ ioapic->kvm = kvm;
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address,
+ IOAPIC_MEM_LENGTH, &ioapic->dev);
+ mutex_unlock(&kvm->slots_lock);
+ if (ret < 0) {
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
+ }
+
+ return ret;
+}
+
+void kvm_ioapic_destroy(struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ if (!ioapic)
+ return;
+
+ cancel_delayed_work_sync(&ioapic->eoi_inject);
+ mutex_lock(&kvm->slots_lock);
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+ mutex_unlock(&kvm->slots_lock);
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
+}
+
+void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
+ state->irr &= ~ioapic->irr_delivered;
+ spin_unlock(&ioapic->lock);
+}
+
+void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
+ ioapic->irr = 0;
+ ioapic->irr_delivered = 0;
+ kvm_make_scan_ioapic_request(kvm);
+ kvm_ioapic_inject_all(ioapic, state->irr);
+ spin_unlock(&ioapic->lock);
+}
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
new file mode 100644
index 000000000..ea1a4e029
--- /dev/null
+++ b/arch/x86/kvm/ioapic.h
@@ -0,0 +1,138 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_IO_APIC_H
+#define __KVM_IO_APIC_H
+
+#include <linux/kvm_host.h>
+
+#include <kvm/iodev.h>
+
+struct kvm;
+struct kvm_vcpu;
+
+#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
+#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES
+#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
+#define IOAPIC_EDGE_TRIG 0
+#define IOAPIC_LEVEL_TRIG 1
+
+#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
+#define IOAPIC_MEM_LENGTH 0x100
+
+/* Direct registers. */
+#define IOAPIC_REG_SELECT 0x00
+#define IOAPIC_REG_WINDOW 0x10
+
+/* Indirect registers. */
+#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
+#define IOAPIC_REG_VERSION 0x01
+#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
+
+/*ioapic delivery mode*/
+#define IOAPIC_FIXED 0x0
+#define IOAPIC_LOWEST_PRIORITY 0x1
+#define IOAPIC_PMI 0x2
+#define IOAPIC_NMI 0x4
+#define IOAPIC_INIT 0x5
+#define IOAPIC_EXTINT 0x7
+
+#ifdef CONFIG_X86
+#define RTC_GSI 8
+#else
+#define RTC_GSI -1U
+#endif
+
+struct dest_map {
+ /* vcpu bitmap where IRQ has been sent */
+ DECLARE_BITMAP(map, KVM_MAX_VCPU_ID);
+
+ /*
+ * Vector sent to a given vcpu, only valid when
+ * the vcpu's bit in map is set
+ */
+ u8 vectors[KVM_MAX_VCPU_ID];
+};
+
+
+struct rtc_status {
+ int pending_eoi;
+ struct dest_map dest_map;
+};
+
+union kvm_ioapic_redirect_entry {
+ u64 bits;
+ struct {
+ u8 vector;
+ u8 delivery_mode:3;
+ u8 dest_mode:1;
+ u8 delivery_status:1;
+ u8 polarity:1;
+ u8 remote_irr:1;
+ u8 trig_mode:1;
+ u8 mask:1;
+ u8 reserve:7;
+ u8 reserved[4];
+ u8 dest_id;
+ } fields;
+};
+
+struct kvm_ioapic {
+ u64 base_address;
+ u32 ioregsel;
+ u32 id;
+ u32 irr;
+ u32 pad;
+ union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS];
+ unsigned long irq_states[IOAPIC_NUM_PINS];
+ struct kvm_io_device dev;
+ struct kvm *kvm;
+ void (*ack_notifier)(void *opaque, int irq);
+ spinlock_t lock;
+ struct rtc_status rtc_status;
+ struct delayed_work eoi_inject;
+ u32 irq_eoi[IOAPIC_NUM_PINS];
+ u32 irr_delivered;
+};
+
+#ifdef DEBUG
+#define ASSERT(x) \
+do { \
+ if (!(x)) { \
+ printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
+ __FILE__, __LINE__, #x); \
+ BUG(); \
+ } \
+} while (0)
+#else
+#define ASSERT(x) do { } while (0)
+#endif
+
+static inline int ioapic_in_kernel(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_KERNEL;
+}
+
+void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+ int short_hand, unsigned int dest, int dest_mode);
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
+void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
+ int trigger_mode);
+int kvm_ioapic_init(struct kvm *kvm);
+void kvm_ioapic_destroy(struct kvm *kvm);
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
+ int level, bool line_status);
+void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq,
+ struct dest_map *dest_map);
+void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors);
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors);
+#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
new file mode 100644
index 000000000..295ebadb8
--- /dev/null
+++ b/arch/x86/kvm/irq.c
@@ -0,0 +1,164 @@
+/*
+ * irq.c: API for in kernel interrupt controller
+ * Copyright (c) 2007, Intel Corporation.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ */
+
+#include <linux/export.h>
+#include <linux/kvm_host.h>
+
+#include "irq.h"
+#include "i8254.h"
+#include "x86.h"
+
+/*
+ * check if there are pending timer events
+ * to be processed.
+ */
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ if (lapic_in_kernel(vcpu))
+ return apic_has_pending_timer(vcpu);
+
+ return 0;
+}
+EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
+
+/*
+ * check if there is a pending userspace external interrupt
+ */
+static int pending_userspace_extint(struct kvm_vcpu *v)
+{
+ return v->arch.pending_external_vector != -1;
+}
+
+/*
+ * check if there is pending interrupt from
+ * non-APIC source without intack.
+ */
+int kvm_cpu_has_extint(struct kvm_vcpu *v)
+{
+ /*
+ * FIXME: interrupt.injected represents an interrupt whose
+ * side-effects have already been applied (e.g. bit from IRR
+ * already moved to ISR). Therefore, it is incorrect to rely
+ * on interrupt.injected to know if there is a pending
+ * interrupt in the user-mode LAPIC.
+ * This leads to nVMX/nSVM not be able to distinguish
+ * if it should exit from L2 to L1 on EXTERNAL_INTERRUPT on
+ * pending interrupt or should re-inject an injected
+ * interrupt.
+ */
+ if (!lapic_in_kernel(v))
+ return v->arch.interrupt.injected;
+
+ if (!kvm_apic_accept_pic_intr(v))
+ return 0;
+
+ if (irqchip_split(v->kvm))
+ return pending_userspace_extint(v);
+ else
+ return v->kvm->arch.vpic->output;
+}
+
+/*
+ * check if there is injectable interrupt:
+ * when virtual interrupt delivery enabled,
+ * interrupt from apic will handled by hardware,
+ * we don't need to check it here.
+ */
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
+{
+ if (kvm_cpu_has_extint(v))
+ return 1;
+
+ if (!is_guest_mode(v) && kvm_vcpu_apicv_active(v))
+ return 0;
+
+ return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
+}
+
+/*
+ * check if there is pending interrupt without
+ * intack.
+ */
+int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
+{
+ if (kvm_cpu_has_extint(v))
+ return 1;
+
+ return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
+}
+EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
+
+/*
+ * Read pending interrupt(from non-APIC source)
+ * vector and intack.
+ */
+static int kvm_cpu_get_extint(struct kvm_vcpu *v)
+{
+ if (!kvm_cpu_has_extint(v)) {
+ WARN_ON(!lapic_in_kernel(v));
+ return -1;
+ }
+
+ if (!lapic_in_kernel(v))
+ return v->arch.interrupt.nr;
+
+ if (irqchip_split(v->kvm)) {
+ int vector = v->arch.pending_external_vector;
+
+ v->arch.pending_external_vector = -1;
+ return vector;
+ } else
+ return kvm_pic_read_irq(v->kvm); /* PIC */
+}
+
+/*
+ * Read pending interrupt vector and intack.
+ */
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
+{
+ int vector = kvm_cpu_get_extint(v);
+ if (vector != -1)
+ return vector; /* PIC */
+
+ return kvm_get_apic_interrupt(v); /* APIC */
+}
+EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
+
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
+{
+ if (lapic_in_kernel(vcpu))
+ kvm_inject_apic_timer_irqs(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
+
+void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
+{
+ __kvm_migrate_apic_timer(vcpu);
+ __kvm_migrate_pit_timer(vcpu);
+}
+
+bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
+{
+ bool resample = args->flags & KVM_IRQFD_FLAG_RESAMPLE;
+
+ return resample ? irqchip_kernel(kvm) : irqchip_in_kernel(kvm);
+}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
new file mode 100644
index 000000000..fd210cdd4
--- /dev/null
+++ b/arch/x86/kvm/irq.h
@@ -0,0 +1,130 @@
+/*
+ * irq.h: in kernel interrupt controller related definitions
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ */
+
+#ifndef __IRQ_H
+#define __IRQ_H
+
+#include <linux/mm_types.h>
+#include <linux/hrtimer.h>
+#include <linux/kvm_host.h>
+#include <linux/spinlock.h>
+
+#include <kvm/iodev.h>
+#include "ioapic.h"
+#include "lapic.h"
+
+#define PIC_NUM_PINS 16
+#define SELECT_PIC(irq) \
+ ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE)
+
+struct kvm;
+struct kvm_vcpu;
+
+struct kvm_kpic_state {
+ u8 last_irr; /* edge detection */
+ u8 irr; /* interrupt request register */
+ u8 imr; /* interrupt mask register */
+ u8 isr; /* interrupt service register */
+ u8 priority_add; /* highest irq priority */
+ u8 irq_base;
+ u8 read_reg_select;
+ u8 poll;
+ u8 special_mask;
+ u8 init_state;
+ u8 auto_eoi;
+ u8 rotate_on_auto_eoi;
+ u8 special_fully_nested_mode;
+ u8 init4; /* true if 4 byte init */
+ u8 elcr; /* PIIX edge/trigger selection */
+ u8 elcr_mask;
+ u8 isr_ack; /* interrupt ack detection */
+ struct kvm_pic *pics_state;
+};
+
+struct kvm_pic {
+ spinlock_t lock;
+ bool wakeup_needed;
+ unsigned pending_acks;
+ struct kvm *kvm;
+ struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
+ int output; /* intr from master PIC */
+ struct kvm_io_device dev_master;
+ struct kvm_io_device dev_slave;
+ struct kvm_io_device dev_eclr;
+ void (*ack_notifier)(void *opaque, int irq);
+ unsigned long irq_states[PIC_NUM_PINS];
+};
+
+int kvm_pic_init(struct kvm *kvm);
+void kvm_pic_destroy(struct kvm *kvm);
+int kvm_pic_read_irq(struct kvm *kvm);
+void kvm_pic_update_irq(struct kvm_pic *s);
+
+static inline int pic_in_kernel(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_KERNEL;
+}
+
+static inline int irqchip_split(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_SPLIT;
+}
+
+static inline int irqchip_kernel(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode == KVM_IRQCHIP_KERNEL;
+}
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+ int mode = kvm->arch.irqchip_mode;
+
+ /* Matches smp_wmb() when setting irqchip_mode */
+ smp_rmb();
+ return mode != KVM_IRQCHIP_NONE;
+}
+
+bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);
+void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
+void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
+void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
+
+int apic_has_pending_timer(struct kvm_vcpu *vcpu);
+
+int kvm_setup_default_irq_routing(struct kvm *kvm);
+int kvm_setup_empty_irq_routing(struct kvm *kvm);
+
+#endif
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
new file mode 100644
index 000000000..4d000aea0
--- /dev/null
+++ b/arch/x86/kvm/irq_comm.c
@@ -0,0 +1,441 @@
+/*
+ * irq_comm.c: Common API for in kernel interrupt controller
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/rculist.h>
+
+#include <trace/events/kvm.h>
+
+#include <asm/msidef.h>
+
+#include "irq.h"
+
+#include "ioapic.h"
+
+#include "lapic.h"
+
+#include "hyperv.h"
+#include "x86.h"
+
+static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
+}
+
+static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level,
+ line_status);
+}
+
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, struct dest_map *dest_map)
+{
+ int i, r = -1;
+ struct kvm_vcpu *vcpu, *lowest = NULL;
+ unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+ unsigned int dest_vcpus = 0;
+
+ if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
+ kvm_lowest_prio_delivery(irq)) {
+ printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
+ irq->delivery_mode = APIC_DM_FIXED;
+ }
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
+ return r;
+
+ memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (!kvm_lowest_prio_delivery(irq)) {
+ if (r < 0)
+ r = 0;
+ r += kvm_apic_set_irq(vcpu, irq, dest_map);
+ } else if (kvm_lapic_enabled(vcpu)) {
+ if (!kvm_vector_hashing_enabled()) {
+ if (!lowest)
+ lowest = vcpu;
+ else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
+ lowest = vcpu;
+ } else {
+ __set_bit(i, dest_vcpu_bitmap);
+ dest_vcpus++;
+ }
+ }
+ }
+
+ if (dest_vcpus != 0) {
+ int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+ dest_vcpu_bitmap, KVM_MAX_VCPUS);
+
+ lowest = kvm_get_vcpu(kvm, idx);
+ }
+
+ if (lowest)
+ r = kvm_apic_set_irq(lowest, irq, dest_map);
+
+ return r;
+}
+
+void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq)
+{
+ trace_kvm_msi_set_irq(e->msi.address_lo | (kvm->arch.x2apic_format ?
+ (u64)e->msi.address_hi << 32 : 0),
+ e->msi.data);
+
+ irq->dest_id = (e->msi.address_lo &
+ MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
+ if (kvm->arch.x2apic_format)
+ irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi);
+ irq->vector = (e->msi.data &
+ MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
+ irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
+ irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
+ irq->delivery_mode = e->msi.data & 0x700;
+ irq->msi_redir_hint = ((e->msi.address_lo
+ & MSI_ADDR_REDIRECTION_LOWPRI) > 0);
+ irq->level = 1;
+ irq->shorthand = 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
+
+static inline bool kvm_msi_route_invalid(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e)
+{
+ return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level, bool line_status)
+{
+ struct kvm_lapic_irq irq;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ if (!level)
+ return -1;
+
+ kvm_set_msi_irq(kvm, e, &irq);
+
+ return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
+}
+
+
+static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ if (!level)
+ return -1;
+
+ return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint);
+}
+
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ struct kvm_lapic_irq irq;
+ int r;
+
+ switch (e->type) {
+ case KVM_IRQ_ROUTING_HV_SINT:
+ return kvm_hv_set_sint(e, kvm, irq_source_id, level,
+ line_status);
+
+ case KVM_IRQ_ROUTING_MSI:
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ kvm_set_msi_irq(kvm, e, &irq);
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
+ return r;
+ break;
+
+ default:
+ break;
+ }
+
+ return -EWOULDBLOCK;
+}
+
+int kvm_request_irq_source_id(struct kvm *kvm)
+{
+ unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
+ int irq_source_id;
+
+ mutex_lock(&kvm->irq_lock);
+ irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
+
+ if (irq_source_id >= BITS_PER_LONG) {
+ printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n");
+ irq_source_id = -EFAULT;
+ goto unlock;
+ }
+
+ ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+ ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+ set_bit(irq_source_id, bitmap);
+unlock:
+ mutex_unlock(&kvm->irq_lock);
+
+ return irq_source_id;
+}
+
+void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
+{
+ ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+ ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+
+ mutex_lock(&kvm->irq_lock);
+ if (irq_source_id < 0 ||
+ irq_source_id >= BITS_PER_LONG) {
+ printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
+ goto unlock;
+ }
+ clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
+ if (!irqchip_kernel(kvm))
+ goto unlock;
+
+ kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
+ kvm_pic_clear_all(kvm->arch.vpic, irq_source_id);
+unlock:
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ kimn->irq = irq;
+ hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list);
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ hlist_del_rcu(&kimn->link);
+ mutex_unlock(&kvm->irq_lock);
+ synchronize_srcu(&kvm->irq_srcu);
+}
+
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask)
+{
+ struct kvm_irq_mask_notifier *kimn;
+ int idx, gsi;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
+ if (gsi != -1)
+ hlist_for_each_entry_rcu(kimn, &kvm->arch.mask_notifier_list, link)
+ if (kimn->irq == gsi)
+ kimn->func(kimn, mask);
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
+
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ /* We can't check irqchip_in_kernel() here as some callers are
+ * currently inititalizing the irqchip. Other callers should therefore
+ * check kvm_arch_can_set_irq_routing() before calling this function.
+ */
+ switch (ue->type) {
+ case KVM_IRQ_ROUTING_IRQCHIP:
+ if (irqchip_split(kvm))
+ return -EINVAL;
+ e->irqchip.pin = ue->u.irqchip.pin;
+ switch (ue->u.irqchip.irqchip) {
+ case KVM_IRQCHIP_PIC_SLAVE:
+ e->irqchip.pin += PIC_NUM_PINS / 2;
+ /* fall through */
+ case KVM_IRQCHIP_PIC_MASTER:
+ if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
+ return -EINVAL;
+ e->set = kvm_set_pic_irq;
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
+ return -EINVAL;
+ e->set = kvm_set_ioapic_irq;
+ break;
+ default:
+ return -EINVAL;
+ }
+ e->irqchip.irqchip = ue->u.irqchip.irqchip;
+ break;
+ case KVM_IRQ_ROUTING_MSI:
+ e->set = kvm_set_msi;
+ e->msi.address_lo = ue->u.msi.address_lo;
+ e->msi.address_hi = ue->u.msi.address_hi;
+ e->msi.data = ue->u.msi.data;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+ break;
+ case KVM_IRQ_ROUTING_HV_SINT:
+ e->set = kvm_hv_set_sint;
+ e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
+ e->hv_sint.sint = ue->u.hv_sint.sint;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ int i, r = 0;
+ struct kvm_vcpu *vcpu;
+
+ if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
+ return true;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (++r == 2)
+ return false;
+
+ *dest_vcpu = vcpu;
+ }
+
+ return r == 1;
+}
+EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
+
+#define IOAPIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
+#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
+
+#define PIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
+#define ROUTING_ENTRY2(irq) \
+ IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
+
+static const struct kvm_irq_routing_entry default_routing[] = {
+ ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
+ ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
+ ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
+ ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
+ ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
+ ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
+ ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
+ ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
+ ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
+ ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
+ ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
+ ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
+};
+
+int kvm_setup_default_irq_routing(struct kvm *kvm)
+{
+ return kvm_set_irq_routing(kvm, default_routing,
+ ARRAY_SIZE(default_routing), 0);
+}
+
+static const struct kvm_irq_routing_entry empty_routing[] = {};
+
+int kvm_setup_empty_irq_routing(struct kvm *kvm)
+{
+ return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
+}
+
+void kvm_arch_post_irq_routing_update(struct kvm *kvm)
+{
+ if (!irqchip_split(kvm))
+ return;
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_kernel_irq_routing_entry *entry;
+ struct kvm_irq_routing_table *table;
+ u32 i, nr_ioapic_pins;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
+ kvm->arch.nr_reserved_ioapic_pins);
+ for (i = 0; i < nr_ioapic_pins; ++i) {
+ hlist_for_each_entry(entry, &table->map[i], link) {
+ struct kvm_lapic_irq irq;
+
+ if (entry->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+
+ kvm_set_msi_irq(vcpu->kvm, entry, &irq);
+
+ if (irq.trig_mode && kvm_apic_match_dest(vcpu, NULL, 0,
+ irq.dest_id, irq.dest_mode))
+ __set_bit(irq.vector, ioapic_handled_vectors);
+ }
+ }
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+void kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+ kvm_hv_irq_routing_update(kvm);
+}
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
new file mode 100644
index 000000000..7df600410
--- /dev/null
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_KVM_CACHE_REGS_H
+#define ASM_KVM_CACHE_REGS_H
+
+#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
+#define KVM_POSSIBLE_CR4_GUEST_BITS \
+ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE | X86_CR4_TSD)
+
+static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
+ kvm_x86_ops->cache_reg(vcpu, reg);
+
+ return vcpu->arch.regs[reg];
+}
+
+static inline void kvm_register_write(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg,
+ unsigned long val)
+{
+ vcpu->arch.regs[reg] = val;
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
+{
+ return kvm_register_read(vcpu, VCPU_REGS_RIP);
+}
+
+static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ kvm_register_write(vcpu, VCPU_REGS_RIP, val);
+}
+
+static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
+{
+ might_sleep(); /* on svm */
+
+ if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail))
+ kvm_x86_ops->cache_reg(vcpu, (enum kvm_reg)VCPU_EXREG_PDPTR);
+
+ return vcpu->arch.walk_mmu->pdptrs[index];
+}
+
+static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
+{
+ ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
+ if (tmask & vcpu->arch.cr0_guest_owned_bits)
+ kvm_x86_ops->decache_cr0_guest_bits(vcpu);
+ return vcpu->arch.cr0 & mask;
+}
+
+static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, ~0UL);
+}
+
+static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
+{
+ ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
+ if (tmask & vcpu->arch.cr4_guest_owned_bits)
+ kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+ return vcpu->arch.cr4 & mask;
+}
+
+static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
+{
+ if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ kvm_x86_ops->decache_cr3(vcpu);
+ return vcpu->arch.cr3;
+}
+
+static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, ~0UL);
+}
+
+static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
+{
+ return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u)
+ | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
+}
+
+static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hflags |= HF_GUEST_MASK;
+}
+
+static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hflags &= ~HF_GUEST_MASK;
+
+ if (vcpu->arch.load_eoi_exitmap_pending) {
+ vcpu->arch.load_eoi_exitmap_pending = false;
+ kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
+ }
+}
+
+static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_GUEST_MASK;
+}
+
+static inline bool is_smm(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_SMM_MASK;
+}
+
+#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
new file mode 100644
index 000000000..89d07312e
--- /dev/null
+++ b/arch/x86/kvm/lapic.c
@@ -0,0 +1,2710 @@
+
+/*
+ * Local APIC virtualization
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2007 Novell
+ * Copyright (C) 2007 Intel
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Dor Laor <dor.laor@qumranet.com>
+ * Gregory Haskins <ghaskins@novell.com>
+ * Yaozu (Eddie) Dong <eddie.dong@intel.com>
+ *
+ * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <linux/export.h>
+#include <linux/math64.h>
+#include <linux/slab.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/apicdef.h>
+#include <asm/delay.h>
+#include <linux/atomic.h>
+#include <linux/jump_label.h>
+#include "kvm_cache_regs.h"
+#include "irq.h"
+#include "trace.h"
+#include "x86.h"
+#include "cpuid.h"
+#include "hyperv.h"
+
+#ifndef CONFIG_X86_64
+#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
+#else
+#define mod_64(x, y) ((x) % (y))
+#endif
+
+#define PRId64 "d"
+#define PRIx64 "llx"
+#define PRIu64 "u"
+#define PRIo64 "o"
+
+/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
+#define apic_debug(fmt, arg...) do {} while (0)
+
+/* 14 is the version for Xeon and Pentium 8.4.8*/
+#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16))
+#define LAPIC_MMIO_LENGTH (1 << 12)
+/* followed define is not in apicdef.h */
+#define APIC_SHORT_MASK 0xc0000
+#define APIC_DEST_NOSHORT 0x0
+#define APIC_DEST_MASK 0x800
+#define MAX_APIC_VECTOR 256
+#define APIC_VECTORS_PER_REG 32
+
+#define APIC_BROADCAST 0xFF
+#define X2APIC_BROADCAST 0xFFFFFFFFul
+
+static inline int apic_test_vector(int vec, void *bitmap)
+{
+ return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ return apic_test_vector(vector, apic->regs + APIC_ISR) ||
+ apic_test_vector(vector, apic->regs + APIC_IRR);
+}
+
+static inline void apic_clear_vector(int vec, void *bitmap)
+{
+ clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline int __apic_test_and_set_vector(int vec, void *bitmap)
+{
+ return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
+{
+ return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+struct static_key_deferred apic_hw_disabled __read_mostly;
+struct static_key_deferred apic_sw_disabled __read_mostly;
+
+static inline int apic_enabled(struct kvm_lapic *apic)
+{
+ return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic);
+}
+
+#define LVT_MASK \
+ (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK)
+
+#define LINT_MASK \
+ (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
+ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
+
+static inline u8 kvm_xapic_id(struct kvm_lapic *apic)
+{
+ return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
+}
+
+static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
+{
+ return apic->vcpu->vcpu_id;
+}
+
+static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
+ u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
+ switch (map->mode) {
+ case KVM_APIC_MODE_X2APIC: {
+ u32 offset = (dest_id >> 16) * 16;
+ u32 max_apic_id = map->max_apic_id;
+
+ if (offset <= max_apic_id) {
+ u8 cluster_size = min(max_apic_id - offset + 1, 16U);
+
+ offset = array_index_nospec(offset, map->max_apic_id + 1);
+ *cluster = &map->phys_map[offset];
+ *mask = dest_id & (0xffff >> (16 - cluster_size));
+ } else {
+ *mask = 0;
+ }
+
+ return true;
+ }
+ case KVM_APIC_MODE_XAPIC_FLAT:
+ *cluster = map->xapic_flat_map;
+ *mask = dest_id & 0xff;
+ return true;
+ case KVM_APIC_MODE_XAPIC_CLUSTER:
+ *cluster = map->xapic_cluster_map[(dest_id >> 4) & 0xf];
+ *mask = dest_id & 0xf;
+ return true;
+ default:
+ /* Not optimized. */
+ return false;
+ }
+}
+
+static void kvm_apic_map_free(struct rcu_head *rcu)
+{
+ struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu);
+
+ kvfree(map);
+}
+
+static void recalculate_apic_map(struct kvm *kvm)
+{
+ struct kvm_apic_map *new, *old = NULL;
+ struct kvm_vcpu *vcpu;
+ int i;
+ u32 max_id = 255; /* enough space for any xAPIC ID */
+
+ mutex_lock(&kvm->arch.apic_map_lock);
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ if (kvm_apic_present(vcpu))
+ max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
+
+ new = kvzalloc(sizeof(struct kvm_apic_map) +
+ sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL);
+
+ if (!new)
+ goto out;
+
+ new->max_apic_id = max_id;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct kvm_lapic **cluster;
+ u16 mask;
+ u32 ldr;
+ u8 xapic_id;
+ u32 x2apic_id;
+
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ xapic_id = kvm_xapic_id(apic);
+ x2apic_id = kvm_x2apic_id(apic);
+
+ /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */
+ if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) &&
+ x2apic_id <= new->max_apic_id)
+ new->phys_map[x2apic_id] = apic;
+ /*
+ * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around,
+ * prevent them from masking VCPUs with APIC ID <= 0xff.
+ */
+ if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
+ new->phys_map[xapic_id] = apic;
+
+ if (!kvm_apic_sw_enabled(apic))
+ continue;
+
+ ldr = kvm_lapic_get_reg(apic, APIC_LDR);
+
+ if (apic_x2apic_mode(apic)) {
+ new->mode |= KVM_APIC_MODE_X2APIC;
+ } else if (ldr) {
+ ldr = GET_APIC_LOGICAL_ID(ldr);
+ if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
+ new->mode |= KVM_APIC_MODE_XAPIC_FLAT;
+ else
+ new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
+ }
+
+ if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))
+ continue;
+
+ if (mask)
+ cluster[ffs(mask) - 1] = apic;
+ }
+out:
+ old = rcu_dereference_protected(kvm->arch.apic_map,
+ lockdep_is_held(&kvm->arch.apic_map_lock));
+ rcu_assign_pointer(kvm->arch.apic_map, new);
+ mutex_unlock(&kvm->arch.apic_map_lock);
+
+ if (old)
+ call_rcu(&old->rcu, kvm_apic_map_free);
+
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
+{
+ bool enabled = val & APIC_SPIV_APIC_ENABLED;
+
+ kvm_lapic_set_reg(apic, APIC_SPIV, val);
+
+ if (enabled != apic->sw_enabled) {
+ apic->sw_enabled = enabled;
+ if (enabled) {
+ static_key_slow_dec_deferred(&apic_sw_disabled);
+ recalculate_apic_map(apic->vcpu->kvm);
+ } else
+ static_key_slow_inc(&apic_sw_disabled.key);
+
+ recalculate_apic_map(apic->vcpu->kvm);
+ }
+}
+
+static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
+{
+ kvm_lapic_set_reg(apic, APIC_ID, id << 24);
+ recalculate_apic_map(apic->vcpu->kvm);
+}
+
+static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
+{
+ kvm_lapic_set_reg(apic, APIC_LDR, id);
+ recalculate_apic_map(apic->vcpu->kvm);
+}
+
+static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
+{
+ return ((id >> 4) << 16) | (1 << (id & 0xf));
+}
+
+static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
+{
+ u32 ldr = kvm_apic_calc_x2apic_ldr(id);
+
+ WARN_ON_ONCE(id != apic->vcpu->vcpu_id);
+
+ kvm_lapic_set_reg(apic, APIC_ID, id);
+ kvm_lapic_set_reg(apic, APIC_LDR, ldr);
+ recalculate_apic_map(apic->vcpu->kvm);
+}
+
+static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
+{
+ return !(kvm_lapic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
+}
+
+static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
+{
+ return kvm_lapic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
+}
+
+static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
+{
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_ONESHOT;
+}
+
+static inline int apic_lvtt_period(struct kvm_lapic *apic)
+{
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_PERIODIC;
+}
+
+static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
+{
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_TSCDEADLINE;
+}
+
+static inline int apic_lvt_nmi_mode(u32 lvt_val)
+{
+ return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
+}
+
+void kvm_apic_set_version(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ struct kvm_cpuid_entry2 *feat;
+ u32 v = APIC_VERSION;
+
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ /*
+ * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation)
+ * which doesn't have EOI register; Some buggy OSes (e.g. Windows with
+ * Hyper-V role) disable EOI broadcast in lapic not checking for IOAPIC
+ * version first and level-triggered interrupts never get EOIed in
+ * IOAPIC.
+ */
+ feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
+ if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))) &&
+ !ioapic_in_kernel(vcpu->kvm))
+ v |= APIC_LVR_DIRECTED_EOI;
+ kvm_lapic_set_reg(apic, APIC_LVR, v);
+}
+
+static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = {
+ LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */
+ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
+ LVT_MASK | APIC_MODE_MASK, /* LVTPC */
+ LINT_MASK, LINT_MASK, /* LVT0-1 */
+ LVT_MASK /* LVTERR */
+};
+
+static int find_highest_vector(void *bitmap)
+{
+ int vec;
+ u32 *reg;
+
+ for (vec = MAX_APIC_VECTOR - APIC_VECTORS_PER_REG;
+ vec >= 0; vec -= APIC_VECTORS_PER_REG) {
+ reg = bitmap + REG_POS(vec);
+ if (*reg)
+ return __fls(*reg) + vec;
+ }
+
+ return -1;
+}
+
+static u8 count_vectors(void *bitmap)
+{
+ int vec;
+ u32 *reg;
+ u8 count = 0;
+
+ for (vec = 0; vec < MAX_APIC_VECTOR; vec += APIC_VECTORS_PER_REG) {
+ reg = bitmap + REG_POS(vec);
+ count += hweight32(*reg);
+ }
+
+ return count;
+}
+
+bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr)
+{
+ u32 i, vec;
+ u32 pir_val, irr_val, prev_irr_val;
+ int max_updated_irr;
+
+ max_updated_irr = -1;
+ *max_irr = -1;
+
+ for (i = vec = 0; i <= 7; i++, vec += 32) {
+ pir_val = READ_ONCE(pir[i]);
+ irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10));
+ if (pir_val) {
+ prev_irr_val = irr_val;
+ irr_val |= xchg(&pir[i], 0);
+ *((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val;
+ if (prev_irr_val != irr_val) {
+ max_updated_irr =
+ __fls(irr_val ^ prev_irr_val) + vec;
+ }
+ }
+ if (irr_val)
+ *max_irr = __fls(irr_val) + vec;
+ }
+
+ return ((max_updated_irr != -1) &&
+ (max_updated_irr == *max_irr));
+}
+EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
+
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ return __kvm_apic_update_irr(pir, apic->regs, max_irr);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
+
+static inline int apic_search_irr(struct kvm_lapic *apic)
+{
+ return find_highest_vector(apic->regs + APIC_IRR);
+}
+
+static inline int apic_find_highest_irr(struct kvm_lapic *apic)
+{
+ int result;
+
+ /*
+ * Note that irr_pending is just a hint. It will be always
+ * true with virtual interrupt delivery enabled.
+ */
+ if (!apic->irr_pending)
+ return -1;
+
+ result = apic_search_irr(apic);
+ ASSERT(result == -1 || result >= 16);
+
+ return result;
+}
+
+static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
+{
+ struct kvm_vcpu *vcpu;
+
+ vcpu = apic->vcpu;
+
+ if (unlikely(vcpu->arch.apicv_active)) {
+ /* need to update RVI */
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
+ kvm_x86_ops->hwapic_irr_update(vcpu,
+ apic_find_highest_irr(apic));
+ } else {
+ apic->irr_pending = false;
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
+ if (apic_search_irr(apic) != -1)
+ apic->irr_pending = true;
+ }
+}
+
+static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
+ return;
+
+ vcpu = apic->vcpu;
+
+ /*
+ * With APIC virtualization enabled, all caching is disabled
+ * because the processor can modify ISR under the hood. Instead
+ * just set SVI.
+ */
+ if (unlikely(vcpu->arch.apicv_active))
+ kvm_x86_ops->hwapic_isr_update(vcpu, vec);
+ else {
+ ++apic->isr_count;
+ BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
+ /*
+ * ISR (in service register) bit is set when injecting an interrupt.
+ * The highest vector is injected. Thus the latest bit set matches
+ * the highest bit in ISR.
+ */
+ apic->highest_isr_cache = vec;
+ }
+}
+
+static inline int apic_find_highest_isr(struct kvm_lapic *apic)
+{
+ int result;
+
+ /*
+ * Note that isr_count is always 1, and highest_isr_cache
+ * is always -1, with APIC virtualization enabled.
+ */
+ if (!apic->isr_count)
+ return -1;
+ if (likely(apic->highest_isr_cache != -1))
+ return apic->highest_isr_cache;
+
+ result = find_highest_vector(apic->regs + APIC_ISR);
+ ASSERT(result == -1 || result >= 16);
+
+ return result;
+}
+
+static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
+{
+ struct kvm_vcpu *vcpu;
+ if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+ return;
+
+ vcpu = apic->vcpu;
+
+ /*
+ * We do get here for APIC virtualization enabled if the guest
+ * uses the Hyper-V APIC enlightenment. In this case we may need
+ * to trigger a new interrupt delivery by writing the SVI field;
+ * on the other hand isr_count and highest_isr_cache are unused
+ * and must be left alone.
+ */
+ if (unlikely(vcpu->arch.apicv_active))
+ kvm_x86_ops->hwapic_isr_update(vcpu,
+ apic_find_highest_isr(apic));
+ else {
+ --apic->isr_count;
+ BUG_ON(apic->isr_count < 0);
+ apic->highest_isr_cache = -1;
+ }
+}
+
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
+{
+ /* This may race with setting of irr in __apic_accept_irq() and
+ * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq
+ * will cause vmexit immediately and the value will be recalculated
+ * on the next vmentry.
+ */
+ return apic_find_highest_irr(vcpu->arch.apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
+
+static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
+ int vector, int level, int trig_mode,
+ struct dest_map *dest_map);
+
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
+ struct dest_map *dest_map)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
+ irq->level, irq->trig_mode, dest_map);
+}
+
+int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
+ unsigned long ipi_bitmap_high, u32 min,
+ unsigned long icr, int op_64_bit)
+{
+ int i;
+ struct kvm_apic_map *map;
+ struct kvm_vcpu *vcpu;
+ struct kvm_lapic_irq irq = {0};
+ int cluster_size = op_64_bit ? 64 : 32;
+ int count = 0;
+
+ irq.vector = icr & APIC_VECTOR_MASK;
+ irq.delivery_mode = icr & APIC_MODE_MASK;
+ irq.level = (icr & APIC_INT_ASSERT) != 0;
+ irq.trig_mode = icr & APIC_INT_LEVELTRIG;
+
+ if (icr & APIC_DEST_MASK)
+ return -KVM_EINVAL;
+ if (icr & APIC_SHORT_MASK)
+ return -KVM_EINVAL;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ if (unlikely(!map)) {
+ count = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (min > map->max_apic_id)
+ goto out;
+ /* Bits above cluster_size are masked in the caller. */
+ for_each_set_bit(i, &ipi_bitmap_low,
+ min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
+ if (map->phys_map[min + i]) {
+ vcpu = map->phys_map[min + i]->vcpu;
+ count += kvm_apic_set_irq(vcpu, &irq, NULL);
+ }
+ }
+
+ min += cluster_size;
+
+ if (min > map->max_apic_id)
+ goto out;
+
+ for_each_set_bit(i, &ipi_bitmap_high,
+ min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
+ if (map->phys_map[min + i]) {
+ vcpu = map->phys_map[min + i]->vcpu;
+ count += kvm_apic_set_irq(vcpu, &irq, NULL);
+ }
+ }
+
+out:
+ rcu_read_unlock();
+ return count;
+}
+
+static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
+{
+
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
+ sizeof(val));
+}
+
+static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
+{
+
+ return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val,
+ sizeof(*val));
+}
+
+static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
+}
+
+static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
+{
+ u8 val;
+ if (pv_eoi_get_user(vcpu, &val) < 0) {
+ apic_debug("Can't read EOI MSR value: 0x%llx\n",
+ (unsigned long long)vcpu->arch.pv_eoi.msr_val);
+ return false;
+ }
+ return val & 0x1;
+}
+
+static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
+{
+ if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) {
+ apic_debug("Can't set EOI MSR value: 0x%llx\n",
+ (unsigned long long)vcpu->arch.pv_eoi.msr_val);
+ return;
+ }
+ __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
+}
+
+static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
+{
+ if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) {
+ apic_debug("Can't clear EOI MSR value: 0x%llx\n",
+ (unsigned long long)vcpu->arch.pv_eoi.msr_val);
+ return;
+ }
+ __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
+}
+
+static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
+{
+ int highest_irr;
+ if (apic->vcpu->arch.apicv_active)
+ highest_irr = kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
+ else
+ highest_irr = apic_find_highest_irr(apic);
+ if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
+ return -1;
+ return highest_irr;
+}
+
+static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr)
+{
+ u32 tpr, isrv, ppr, old_ppr;
+ int isr;
+
+ old_ppr = kvm_lapic_get_reg(apic, APIC_PROCPRI);
+ tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI);
+ isr = apic_find_highest_isr(apic);
+ isrv = (isr != -1) ? isr : 0;
+
+ if ((tpr & 0xf0) >= (isrv & 0xf0))
+ ppr = tpr & 0xff;
+ else
+ ppr = isrv & 0xf0;
+
+ apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
+ apic, ppr, isr, isrv);
+
+ *new_ppr = ppr;
+ if (old_ppr != ppr)
+ kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr);
+
+ return ppr < old_ppr;
+}
+
+static void apic_update_ppr(struct kvm_lapic *apic)
+{
+ u32 ppr;
+
+ if (__apic_update_ppr(apic, &ppr) &&
+ apic_has_interrupt_for_ppr(apic, ppr) != -1)
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+}
+
+void kvm_apic_update_ppr(struct kvm_vcpu *vcpu)
+{
+ apic_update_ppr(vcpu->arch.apic);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_update_ppr);
+
+static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
+{
+ kvm_lapic_set_reg(apic, APIC_TASKPRI, tpr);
+ apic_update_ppr(apic);
+}
+
+static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda)
+{
+ return mda == (apic_x2apic_mode(apic) ?
+ X2APIC_BROADCAST : APIC_BROADCAST);
+}
+
+static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
+{
+ if (kvm_apic_broadcast(apic, mda))
+ return true;
+
+ if (apic_x2apic_mode(apic))
+ return mda == kvm_x2apic_id(apic);
+
+ /*
+ * Hotplug hack: Make LAPIC in xAPIC mode also accept interrupts as if
+ * it were in x2APIC mode. Hotplugged VCPUs start in xAPIC mode and
+ * this allows unique addressing of VCPUs with APIC ID over 0xff.
+ * The 0xff condition is needed because writeable xAPIC ID.
+ */
+ if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic))
+ return true;
+
+ return mda == kvm_xapic_id(apic);
+}
+
+static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
+{
+ u32 logical_id;
+
+ if (kvm_apic_broadcast(apic, mda))
+ return true;
+
+ logical_id = kvm_lapic_get_reg(apic, APIC_LDR);
+
+ if (apic_x2apic_mode(apic))
+ return ((logical_id >> 16) == (mda >> 16))
+ && (logical_id & mda & 0xffff) != 0;
+
+ logical_id = GET_APIC_LOGICAL_ID(logical_id);
+
+ switch (kvm_lapic_get_reg(apic, APIC_DFR)) {
+ case APIC_DFR_FLAT:
+ return (logical_id & mda) != 0;
+ case APIC_DFR_CLUSTER:
+ return ((logical_id >> 4) == (mda >> 4))
+ && (logical_id & mda & 0xf) != 0;
+ default:
+ apic_debug("Bad DFR vcpu %d: %08x\n",
+ apic->vcpu->vcpu_id, kvm_lapic_get_reg(apic, APIC_DFR));
+ return false;
+ }
+}
+
+/* The KVM local APIC implementation has two quirks:
+ *
+ * - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs
+ * in xAPIC mode if the "destination & 0xff" matches its xAPIC ID.
+ * KVM doesn't do that aliasing.
+ *
+ * - in-kernel IOAPIC messages have to be delivered directly to
+ * x2APIC, because the kernel does not support interrupt remapping.
+ * In order to support broadcast without interrupt remapping, x2APIC
+ * rewrites the destination of non-IPI messages from APIC_BROADCAST
+ * to X2APIC_BROADCAST.
+ *
+ * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API. This is
+ * important when userspace wants to use x2APIC-format MSIs, because
+ * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7".
+ */
+static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
+ struct kvm_lapic *source, struct kvm_lapic *target)
+{
+ bool ipi = source != NULL;
+
+ if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled &&
+ !ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target))
+ return X2APIC_BROADCAST;
+
+ return dest_id;
+}
+
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+ int short_hand, unsigned int dest, int dest_mode)
+{
+ struct kvm_lapic *target = vcpu->arch.apic;
+ u32 mda = kvm_apic_mda(vcpu, dest, source, target);
+
+ apic_debug("target %p, source %p, dest 0x%x, "
+ "dest_mode 0x%x, short_hand 0x%x\n",
+ target, source, dest, dest_mode, short_hand);
+
+ ASSERT(target);
+ switch (short_hand) {
+ case APIC_DEST_NOSHORT:
+ if (dest_mode == APIC_DEST_PHYSICAL)
+ return kvm_apic_match_physical_addr(target, mda);
+ else
+ return kvm_apic_match_logical_addr(target, mda);
+ case APIC_DEST_SELF:
+ return target == source;
+ case APIC_DEST_ALLINC:
+ return true;
+ case APIC_DEST_ALLBUT:
+ return target != source;
+ default:
+ apic_debug("kvm: apic: Bad dest shorthand value %x\n",
+ short_hand);
+ return false;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_apic_match_dest);
+
+int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
+ const unsigned long *bitmap, u32 bitmap_size)
+{
+ u32 mod;
+ int i, idx = -1;
+
+ mod = vector % dest_vcpus;
+
+ for (i = 0; i <= mod; i++) {
+ idx = find_next_bit(bitmap, bitmap_size, idx + 1);
+ BUG_ON(idx == bitmap_size);
+ }
+
+ return idx;
+}
+
+static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
+{
+ if (!kvm->arch.disabled_lapic_found) {
+ kvm->arch.disabled_lapic_found = true;
+ printk(KERN_INFO
+ "Disabled LAPIC found during irq injection\n");
+ }
+}
+
+static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src,
+ struct kvm_lapic_irq *irq, struct kvm_apic_map *map)
+{
+ if (kvm->arch.x2apic_broadcast_quirk_disabled) {
+ if ((irq->dest_id == APIC_BROADCAST &&
+ map->mode != KVM_APIC_MODE_X2APIC))
+ return true;
+ if (irq->dest_id == X2APIC_BROADCAST)
+ return true;
+ } else {
+ bool x2apic_ipi = src && *src && apic_x2apic_mode(*src);
+ if (irq->dest_id == (x2apic_ipi ?
+ X2APIC_BROADCAST : APIC_BROADCAST))
+ return true;
+ }
+
+ return false;
+}
+
+/* Return true if the interrupt can be handled by using *bitmap as index mask
+ * for valid destinations in *dst array.
+ * Return false if kvm_apic_map_get_dest_lapic did nothing useful.
+ * Note: we may have zero kvm_lapic destinations when we return true, which
+ * means that the interrupt should be dropped. In this case, *bitmap would be
+ * zero and *dst undefined.
+ */
+static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm,
+ struct kvm_lapic **src, struct kvm_lapic_irq *irq,
+ struct kvm_apic_map *map, struct kvm_lapic ***dst,
+ unsigned long *bitmap)
+{
+ int i, lowest;
+
+ if (irq->shorthand == APIC_DEST_SELF && src) {
+ *dst = src;
+ *bitmap = 1;
+ return true;
+ } else if (irq->shorthand)
+ return false;
+
+ if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map))
+ return false;
+
+ if (irq->dest_mode == APIC_DEST_PHYSICAL) {
+ if (irq->dest_id > map->max_apic_id) {
+ *bitmap = 0;
+ } else {
+ u32 dest_id = array_index_nospec(irq->dest_id, map->max_apic_id + 1);
+ *dst = &map->phys_map[dest_id];
+ *bitmap = 1;
+ }
+ return true;
+ }
+
+ *bitmap = 0;
+ if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst,
+ (u16 *)bitmap))
+ return false;
+
+ if (!kvm_lowest_prio_delivery(irq))
+ return true;
+
+ if (!kvm_vector_hashing_enabled()) {
+ lowest = -1;
+ for_each_set_bit(i, bitmap, 16) {
+ if (!(*dst)[i])
+ continue;
+ if (lowest < 0)
+ lowest = i;
+ else if (kvm_apic_compare_prio((*dst)[i]->vcpu,
+ (*dst)[lowest]->vcpu) < 0)
+ lowest = i;
+ }
+ } else {
+ if (!*bitmap)
+ return true;
+
+ lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap),
+ bitmap, 16);
+
+ if (!(*dst)[lowest]) {
+ kvm_apic_disabled_lapic_found(kvm);
+ *bitmap = 0;
+ return true;
+ }
+ }
+
+ *bitmap = (lowest >= 0) ? 1 << lowest : 0;
+
+ return true;
+}
+
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
+{
+ struct kvm_apic_map *map;
+ unsigned long bitmap;
+ struct kvm_lapic **dst = NULL;
+ int i;
+ bool ret;
+
+ *r = -1;
+
+ if (irq->shorthand == APIC_DEST_SELF) {
+ *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
+ return true;
+ }
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap);
+ if (ret)
+ for_each_set_bit(i, &bitmap, 16) {
+ if (!dst[i])
+ continue;
+ if (*r < 0)
+ *r = 0;
+ *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * This routine tries to handler interrupts in posted mode, here is how
+ * it deals with different cases:
+ * - For single-destination interrupts, handle it in posted mode
+ * - Else if vector hashing is enabled and it is a lowest-priority
+ * interrupt, handle it in posted mode and use the following mechanism
+ * to find the destinaiton vCPU.
+ * 1. For lowest-priority interrupts, store all the possible
+ * destination vCPUs in an array.
+ * 2. Use "guest vector % max number of destination vCPUs" to find
+ * the right destination vCPU in the array for the lowest-priority
+ * interrupt.
+ * - Otherwise, use remapped mode to inject the interrupt.
+ */
+bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu)
+{
+ struct kvm_apic_map *map;
+ unsigned long bitmap;
+ struct kvm_lapic **dst = NULL;
+ bool ret = false;
+
+ if (irq->shorthand)
+ return false;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) &&
+ hweight16(bitmap) == 1) {
+ unsigned long i = find_first_bit(&bitmap, 16);
+
+ if (dst[i]) {
+ *dest_vcpu = dst[i]->vcpu;
+ ret = true;
+ }
+ }
+
+ rcu_read_unlock();
+ return ret;
+}
+
+/*
+ * Add a pending IRQ into lapic.
+ * Return 1 if successfully added and 0 if discarded.
+ */
+static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
+ int vector, int level, int trig_mode,
+ struct dest_map *dest_map)
+{
+ int result = 0;
+ struct kvm_vcpu *vcpu = apic->vcpu;
+
+ trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
+ trig_mode, vector);
+ switch (delivery_mode) {
+ case APIC_DM_LOWEST:
+ vcpu->arch.apic_arb_prio++;
+ case APIC_DM_FIXED:
+ if (unlikely(trig_mode && !level))
+ break;
+
+ /* FIXME add logic for vcpu on reset */
+ if (unlikely(!apic_enabled(apic)))
+ break;
+
+ result = 1;
+
+ if (dest_map) {
+ __set_bit(vcpu->vcpu_id, dest_map->map);
+ dest_map->vectors[vcpu->vcpu_id] = vector;
+ }
+
+ if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
+ if (trig_mode)
+ kvm_lapic_set_vector(vector, apic->regs + APIC_TMR);
+ else
+ apic_clear_vector(vector, apic->regs + APIC_TMR);
+ }
+
+ if (kvm_x86_ops->deliver_posted_interrupt(vcpu, vector)) {
+ kvm_lapic_set_irr(vector, apic);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+ break;
+
+ case APIC_DM_REMRD:
+ result = 1;
+ vcpu->arch.pv.pv_unhalted = 1;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ break;
+
+ case APIC_DM_SMI:
+ result = 1;
+ kvm_make_request(KVM_REQ_SMI, vcpu);
+ kvm_vcpu_kick(vcpu);
+ break;
+
+ case APIC_DM_NMI:
+ result = 1;
+ kvm_inject_nmi(vcpu);
+ kvm_vcpu_kick(vcpu);
+ break;
+
+ case APIC_DM_INIT:
+ if (!trig_mode || level) {
+ result = 1;
+ /* assumes that there are only KVM_APIC_INIT/SIPI */
+ apic->pending_events = (1UL << KVM_APIC_INIT);
+ /* make sure pending_events is visible before sending
+ * the request */
+ smp_wmb();
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ } else {
+ apic_debug("Ignoring de-assert INIT to vcpu %d\n",
+ vcpu->vcpu_id);
+ }
+ break;
+
+ case APIC_DM_STARTUP:
+ apic_debug("SIPI to vcpu %d vector 0x%02x\n",
+ vcpu->vcpu_id, vector);
+ result = 1;
+ apic->sipi_vector = vector;
+ /* make sure sipi_vector is visible for the receiver */
+ smp_wmb();
+ set_bit(KVM_APIC_SIPI, &apic->pending_events);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ break;
+
+ case APIC_DM_EXTINT:
+ /*
+ * Should only be called by kvm_apic_local_deliver() with LVT0,
+ * before NMI watchdog was enabled. Already handled by
+ * kvm_apic_accept_pic_intr().
+ */
+ break;
+
+ default:
+ printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
+ delivery_mode);
+ break;
+ }
+ return result;
+}
+
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
+{
+ return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
+}
+
+static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
+{
+ return test_bit(vector, apic->vcpu->arch.ioapic_handled_vectors);
+}
+
+static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
+{
+ int trigger_mode;
+
+ /* Eoi the ioapic only if the ioapic doesn't own the vector. */
+ if (!kvm_ioapic_handles_vector(apic, vector))
+ return;
+
+ /* Request a KVM exit to inform the userspace IOAPIC. */
+ if (irqchip_split(apic->vcpu->kvm)) {
+ apic->vcpu->arch.pending_ioapic_eoi = vector;
+ kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
+ return;
+ }
+
+ if (apic_test_vector(vector, apic->regs + APIC_TMR))
+ trigger_mode = IOAPIC_LEVEL_TRIG;
+ else
+ trigger_mode = IOAPIC_EDGE_TRIG;
+
+ kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
+}
+
+static int apic_set_eoi(struct kvm_lapic *apic)
+{
+ int vector = apic_find_highest_isr(apic);
+
+ trace_kvm_eoi(apic, vector);
+
+ /*
+ * Not every write EOI will has corresponding ISR,
+ * one example is when Kernel check timer on setup_IO_APIC
+ */
+ if (vector == -1)
+ return vector;
+
+ apic_clear_isr(vector, apic);
+ apic_update_ppr(apic);
+
+ if (test_bit(vector, vcpu_to_synic(apic->vcpu)->vec_bitmap))
+ kvm_hv_synic_send_eoi(apic->vcpu, vector);
+
+ kvm_ioapic_send_eoi(apic, vector);
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+ return vector;
+}
+
+/*
+ * this interface assumes a trap-like exit, which has already finished
+ * desired side effect including vISR and vPPR update.
+ */
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ trace_kvm_eoi(apic, vector);
+
+ kvm_ioapic_send_eoi(apic, vector);
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated);
+
+static void apic_send_ipi(struct kvm_lapic *apic)
+{
+ u32 icr_low = kvm_lapic_get_reg(apic, APIC_ICR);
+ u32 icr_high = kvm_lapic_get_reg(apic, APIC_ICR2);
+ struct kvm_lapic_irq irq;
+
+ irq.vector = icr_low & APIC_VECTOR_MASK;
+ irq.delivery_mode = icr_low & APIC_MODE_MASK;
+ irq.dest_mode = icr_low & APIC_DEST_MASK;
+ irq.level = (icr_low & APIC_INT_ASSERT) != 0;
+ irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
+ irq.shorthand = icr_low & APIC_SHORT_MASK;
+ irq.msi_redir_hint = false;
+ if (apic_x2apic_mode(apic))
+ irq.dest_id = icr_high;
+ else
+ irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
+
+ trace_kvm_apic_ipi(icr_low, irq.dest_id);
+
+ apic_debug("icr_high 0x%x, icr_low 0x%x, "
+ "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
+ "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x, "
+ "msi_redir_hint 0x%x\n",
+ icr_high, icr_low, irq.shorthand, irq.dest_id,
+ irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
+ irq.vector, irq.msi_redir_hint);
+
+ kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
+}
+
+static u32 apic_get_tmcct(struct kvm_lapic *apic)
+{
+ ktime_t remaining, now;
+ s64 ns;
+ u32 tmcct;
+
+ ASSERT(apic != NULL);
+
+ /* if initial count is 0, current count should also be 0 */
+ if (kvm_lapic_get_reg(apic, APIC_TMICT) == 0 ||
+ apic->lapic_timer.period == 0)
+ return 0;
+
+ now = ktime_get();
+ remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
+ if (ktime_to_ns(remaining) < 0)
+ remaining = 0;
+
+ ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
+ tmcct = div64_u64(ns,
+ (APIC_BUS_CYCLE_NS * apic->divide_count));
+
+ return tmcct;
+}
+
+static void __report_tpr_access(struct kvm_lapic *apic, bool write)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ struct kvm_run *run = vcpu->run;
+
+ kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
+ run->tpr_access.rip = kvm_rip_read(vcpu);
+ run->tpr_access.is_write = write;
+}
+
+static inline void report_tpr_access(struct kvm_lapic *apic, bool write)
+{
+ if (apic->vcpu->arch.tpr_access_reporting)
+ __report_tpr_access(apic, write);
+}
+
+static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
+{
+ u32 val = 0;
+
+ if (offset >= LAPIC_MMIO_LENGTH)
+ return 0;
+
+ switch (offset) {
+ case APIC_ARBPRI:
+ apic_debug("Access APIC ARBPRI register which is for P6\n");
+ break;
+
+ case APIC_TMCCT: /* Timer CCR */
+ if (apic_lvtt_tscdeadline(apic))
+ return 0;
+
+ val = apic_get_tmcct(apic);
+ break;
+ case APIC_PROCPRI:
+ apic_update_ppr(apic);
+ val = kvm_lapic_get_reg(apic, offset);
+ break;
+ case APIC_TASKPRI:
+ report_tpr_access(apic, false);
+ /* fall thru */
+ default:
+ val = kvm_lapic_get_reg(apic, offset);
+ break;
+ }
+
+ return val;
+}
+
+static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_lapic, dev);
+}
+
+int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
+ void *data)
+{
+ unsigned char alignment = offset & 0xf;
+ u32 result;
+ /* this bitmask has a bit cleared for each reserved register */
+ static const u64 rmask = 0x43ff01ffffffe70cULL;
+
+ if ((alignment + len) > 4) {
+ apic_debug("KVM_APIC_READ: alignment error %x %d\n",
+ offset, len);
+ return 1;
+ }
+
+ if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) {
+ apic_debug("KVM_APIC_READ: read reserved register %x\n",
+ offset);
+ return 1;
+ }
+
+ result = __apic_read(apic, offset & ~0xf);
+
+ trace_kvm_apic_read(offset, result);
+
+ switch (len) {
+ case 1:
+ case 2:
+ case 4:
+ memcpy(data, (char *)&result + alignment, len);
+ break;
+ default:
+ printk(KERN_ERR "Local APIC read with len = %x, "
+ "should be 1,2, or 4 instead\n", len);
+ break;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_reg_read);
+
+static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
+{
+ return addr >= apic->base_address &&
+ addr < apic->base_address + LAPIC_MMIO_LENGTH;
+}
+
+static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+ gpa_t address, int len, void *data)
+{
+ struct kvm_lapic *apic = to_lapic(this);
+ u32 offset = address - apic->base_address;
+
+ if (!apic_mmio_in_range(apic, address))
+ return -EOPNOTSUPP;
+
+ if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) {
+ if (!kvm_check_has_quirk(vcpu->kvm,
+ KVM_X86_QUIRK_LAPIC_MMIO_HOLE))
+ return -EOPNOTSUPP;
+
+ memset(data, 0xff, len);
+ return 0;
+ }
+
+ kvm_lapic_reg_read(apic, offset, len, data);
+
+ return 0;
+}
+
+static void update_divide_count(struct kvm_lapic *apic)
+{
+ u32 tmp1, tmp2, tdcr;
+
+ tdcr = kvm_lapic_get_reg(apic, APIC_TDCR);
+ tmp1 = tdcr & 0xf;
+ tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
+ apic->divide_count = 0x1 << (tmp2 & 0x7);
+
+ apic_debug("timer divide count is 0x%x\n",
+ apic->divide_count);
+}
+
+static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
+{
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
+ s64 min_period = min_timer_period_us * 1000LL;
+
+ if (apic->lapic_timer.period < min_period) {
+ pr_info_ratelimited(
+ "kvm: vcpu %i: requested %lld ns "
+ "lapic timer period limited to %lld ns\n",
+ apic->vcpu->vcpu_id,
+ apic->lapic_timer.period, min_period);
+ apic->lapic_timer.period = min_period;
+ }
+ }
+}
+
+static void apic_update_lvtt(struct kvm_lapic *apic)
+{
+ u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) &
+ apic->lapic_timer.timer_mode_mask;
+
+ if (apic->lapic_timer.timer_mode != timer_mode) {
+ if (apic_lvtt_tscdeadline(apic) != (timer_mode ==
+ APIC_LVT_TIMER_TSCDEADLINE)) {
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ kvm_lapic_set_reg(apic, APIC_TMICT, 0);
+ apic->lapic_timer.period = 0;
+ apic->lapic_timer.tscdeadline = 0;
+ }
+ apic->lapic_timer.timer_mode = timer_mode;
+ limit_periodic_timer_frequency(apic);
+ }
+}
+
+static void apic_timer_expired(struct kvm_lapic *apic)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ struct swait_queue_head *q = &vcpu->wq;
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+
+ if (atomic_read(&apic->lapic_timer.pending))
+ return;
+
+ atomic_inc(&apic->lapic_timer.pending);
+ kvm_set_pending_timer(vcpu);
+
+ /*
+ * For x86, the atomic_inc() is serialized, thus
+ * using swait_active() is safe.
+ */
+ if (swait_active(q))
+ swake_up_one(q);
+
+ if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use)
+ ktimer->expired_tscdeadline = ktimer->tscdeadline;
+}
+
+/*
+ * On APICv, this test will cause a busy wait
+ * during a higher-priority task.
+ */
+
+static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = kvm_lapic_get_reg(apic, APIC_LVTT);
+
+ if (kvm_apic_hw_enabled(apic)) {
+ int vec = reg & APIC_VECTOR_MASK;
+ void *bitmap = apic->regs + APIC_ISR;
+
+ if (vcpu->arch.apicv_active)
+ bitmap = apic->regs + APIC_IRR;
+
+ if (apic_test_vector(vec, bitmap))
+ return true;
+ }
+ return false;
+}
+
+void wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 guest_tsc, tsc_deadline;
+
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ if (apic->lapic_timer.expired_tscdeadline == 0)
+ return;
+
+ if (!lapic_timer_int_injected(vcpu))
+ return;
+
+ tsc_deadline = apic->lapic_timer.expired_tscdeadline;
+ apic->lapic_timer.expired_tscdeadline = 0;
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
+
+ /* __delay is delay_tsc whenever the hardware has TSC, thus always. */
+ if (guest_tsc < tsc_deadline)
+ __delay(min(tsc_deadline - guest_tsc,
+ nsec_to_cycles(vcpu, lapic_timer_advance_ns)));
+}
+
+static void start_sw_tscdeadline(struct kvm_lapic *apic)
+{
+ u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+ u64 ns = 0;
+ ktime_t expire;
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+ unsigned long flags;
+ ktime_t now;
+
+ if (unlikely(!tscdeadline || !this_tsc_khz))
+ return;
+
+ local_irq_save(flags);
+
+ now = ktime_get();
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ if (likely(tscdeadline > guest_tsc)) {
+ ns = (tscdeadline - guest_tsc) * 1000000ULL;
+ do_div(ns, this_tsc_khz);
+ expire = ktime_add_ns(now, ns);
+ expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
+ hrtimer_start(&apic->lapic_timer.timer,
+ expire, HRTIMER_MODE_ABS_PINNED);
+ } else
+ apic_timer_expired(apic);
+
+ local_irq_restore(flags);
+}
+
+static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
+{
+ ktime_t now, remaining;
+ u64 ns_remaining_old, ns_remaining_new;
+
+ apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
+ * APIC_BUS_CYCLE_NS * apic->divide_count;
+ limit_periodic_timer_frequency(apic);
+
+ now = ktime_get();
+ remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
+ if (ktime_to_ns(remaining) < 0)
+ remaining = 0;
+
+ ns_remaining_old = ktime_to_ns(remaining);
+ ns_remaining_new = mul_u64_u32_div(ns_remaining_old,
+ apic->divide_count, old_divisor);
+
+ apic->lapic_timer.tscdeadline +=
+ nsec_to_cycles(apic->vcpu, ns_remaining_new) -
+ nsec_to_cycles(apic->vcpu, ns_remaining_old);
+ apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new);
+}
+
+static bool set_target_expiration(struct kvm_lapic *apic)
+{
+ ktime_t now;
+ u64 tscl = rdtsc();
+
+ now = ktime_get();
+ apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
+ * APIC_BUS_CYCLE_NS * apic->divide_count;
+
+ if (!apic->lapic_timer.period) {
+ apic->lapic_timer.tscdeadline = 0;
+ return false;
+ }
+
+ limit_periodic_timer_frequency(apic);
+
+ apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
+ PRIx64 ", "
+ "timer initial count 0x%x, period %lldns, "
+ "expire @ 0x%016" PRIx64 ".\n", __func__,
+ APIC_BUS_CYCLE_NS, ktime_to_ns(now),
+ kvm_lapic_get_reg(apic, APIC_TMICT),
+ apic->lapic_timer.period,
+ ktime_to_ns(ktime_add_ns(now,
+ apic->lapic_timer.period)));
+
+ apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
+ nsec_to_cycles(apic->vcpu, apic->lapic_timer.period);
+ apic->lapic_timer.target_expiration = ktime_add_ns(now, apic->lapic_timer.period);
+
+ return true;
+}
+
+static void advance_periodic_target_expiration(struct kvm_lapic *apic)
+{
+ ktime_t now = ktime_get();
+ u64 tscl = rdtsc();
+ ktime_t delta;
+
+ /*
+ * Synchronize both deadlines to the same time source or
+ * differences in the periods (caused by differences in the
+ * underlying clocks or numerical approximation errors) will
+ * cause the two to drift apart over time as the errors
+ * accumulate.
+ */
+ apic->lapic_timer.target_expiration =
+ ktime_add_ns(apic->lapic_timer.target_expiration,
+ apic->lapic_timer.period);
+ delta = ktime_sub(apic->lapic_timer.target_expiration, now);
+ apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
+ nsec_to_cycles(apic->vcpu, delta);
+}
+
+static void start_sw_period(struct kvm_lapic *apic)
+{
+ if (!apic->lapic_timer.period)
+ return;
+
+ if (ktime_after(ktime_get(),
+ apic->lapic_timer.target_expiration)) {
+ apic_timer_expired(apic);
+
+ if (apic_lvtt_oneshot(apic))
+ return;
+
+ advance_periodic_target_expiration(apic);
+ }
+
+ hrtimer_start(&apic->lapic_timer.timer,
+ apic->lapic_timer.target_expiration,
+ HRTIMER_MODE_ABS_PINNED);
+}
+
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
+{
+ if (!lapic_in_kernel(vcpu))
+ return false;
+
+ return vcpu->arch.apic->lapic_timer.hv_timer_in_use;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
+
+static void cancel_hv_timer(struct kvm_lapic *apic)
+{
+ WARN_ON(preemptible());
+ WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+ kvm_x86_ops->cancel_hv_timer(apic->vcpu);
+ apic->lapic_timer.hv_timer_in_use = false;
+}
+
+static bool start_hv_timer(struct kvm_lapic *apic)
+{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+ int r;
+
+ WARN_ON(preemptible());
+ if (!kvm_x86_ops->set_hv_timer)
+ return false;
+
+ if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
+ return false;
+
+ if (!ktimer->tscdeadline)
+ return false;
+
+ r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline);
+ if (r < 0)
+ return false;
+
+ ktimer->hv_timer_in_use = true;
+ hrtimer_cancel(&ktimer->timer);
+
+ /*
+ * Also recheck ktimer->pending, in case the sw timer triggered in
+ * the window. For periodic timer, leave the hv timer running for
+ * simplicity, and the deadline will be recomputed on the next vmexit.
+ */
+ if (!apic_lvtt_period(apic) && (r || atomic_read(&ktimer->pending))) {
+ if (r)
+ apic_timer_expired(apic);
+ return false;
+ }
+
+ trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, true);
+ return true;
+}
+
+static void start_sw_timer(struct kvm_lapic *apic)
+{
+ struct kvm_timer *ktimer = &apic->lapic_timer;
+
+ WARN_ON(preemptible());
+ if (apic->lapic_timer.hv_timer_in_use)
+ cancel_hv_timer(apic);
+ if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
+ return;
+
+ if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+ start_sw_period(apic);
+ else if (apic_lvtt_tscdeadline(apic))
+ start_sw_tscdeadline(apic);
+ trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false);
+}
+
+static void restart_apic_timer(struct kvm_lapic *apic)
+{
+ preempt_disable();
+ if (!start_hv_timer(apic))
+ start_sw_timer(apic);
+ preempt_enable();
+}
+
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ preempt_disable();
+ /* If the preempt notifier has already run, it also called apic_timer_expired */
+ if (!apic->lapic_timer.hv_timer_in_use)
+ goto out;
+ WARN_ON(swait_active(&vcpu->wq));
+ cancel_hv_timer(apic);
+ apic_timer_expired(apic);
+
+ if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
+ advance_periodic_target_expiration(apic);
+ restart_apic_timer(apic);
+ }
+out:
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
+
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
+{
+ restart_apic_timer(vcpu->arch.apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
+
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ preempt_disable();
+ /* Possibly the TSC deadline timer is not enabled yet */
+ if (apic->lapic_timer.hv_timer_in_use)
+ start_sw_timer(apic);
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
+
+void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+ restart_apic_timer(apic);
+}
+
+static void start_apic_timer(struct kvm_lapic *apic)
+{
+ atomic_set(&apic->lapic_timer.pending, 0);
+
+ if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+ && !set_target_expiration(apic))
+ return;
+
+ restart_apic_timer(apic);
+}
+
+static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
+{
+ bool lvt0_in_nmi_mode = apic_lvt_nmi_mode(lvt0_val);
+
+ if (apic->lvt0_in_nmi_mode != lvt0_in_nmi_mode) {
+ apic->lvt0_in_nmi_mode = lvt0_in_nmi_mode;
+ if (lvt0_in_nmi_mode) {
+ apic_debug("Receive NMI setting on APIC_LVT0 "
+ "for cpu %d\n", apic->vcpu->vcpu_id);
+ atomic_inc(&apic->vcpu->kvm->arch.vapics_in_nmi_mode);
+ } else
+ atomic_dec(&apic->vcpu->kvm->arch.vapics_in_nmi_mode);
+ }
+}
+
+int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
+{
+ int ret = 0;
+
+ trace_kvm_apic_write(reg, val);
+
+ switch (reg) {
+ case APIC_ID: /* Local APIC ID */
+ if (!apic_x2apic_mode(apic))
+ kvm_apic_set_xapic_id(apic, val >> 24);
+ else
+ ret = 1;
+ break;
+
+ case APIC_TASKPRI:
+ report_tpr_access(apic, true);
+ apic_set_tpr(apic, val & 0xff);
+ break;
+
+ case APIC_EOI:
+ apic_set_eoi(apic);
+ break;
+
+ case APIC_LDR:
+ if (!apic_x2apic_mode(apic))
+ kvm_apic_set_ldr(apic, val & APIC_LDR_MASK);
+ else
+ ret = 1;
+ break;
+
+ case APIC_DFR:
+ if (!apic_x2apic_mode(apic)) {
+ kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF);
+ recalculate_apic_map(apic->vcpu->kvm);
+ } else
+ ret = 1;
+ break;
+
+ case APIC_SPIV: {
+ u32 mask = 0x3ff;
+ if (kvm_lapic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
+ mask |= APIC_SPIV_DIRECTED_EOI;
+ apic_set_spiv(apic, val & mask);
+ if (!(val & APIC_SPIV_APIC_ENABLED)) {
+ int i;
+ u32 lvt_val;
+
+ for (i = 0; i < KVM_APIC_LVT_NUM; i++) {
+ lvt_val = kvm_lapic_get_reg(apic,
+ APIC_LVTT + 0x10 * i);
+ kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i,
+ lvt_val | APIC_LVT_MASKED);
+ }
+ apic_update_lvtt(apic);
+ atomic_set(&apic->lapic_timer.pending, 0);
+
+ }
+ break;
+ }
+ case APIC_ICR:
+ /* No delay here, so we always clear the pending bit */
+ kvm_lapic_set_reg(apic, APIC_ICR, val & ~(1 << 12));
+ apic_send_ipi(apic);
+ break;
+
+ case APIC_ICR2:
+ if (!apic_x2apic_mode(apic))
+ val &= 0xff000000;
+ kvm_lapic_set_reg(apic, APIC_ICR2, val);
+ break;
+
+ case APIC_LVT0:
+ apic_manage_nmi_watchdog(apic, val);
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVT1:
+ case APIC_LVTERR: {
+ /* TODO: Check vector */
+ size_t size;
+ u32 index;
+
+ if (!kvm_apic_sw_enabled(apic))
+ val |= APIC_LVT_MASKED;
+ size = ARRAY_SIZE(apic_lvt_mask);
+ index = array_index_nospec(
+ (reg - APIC_LVTT) >> 4, size);
+ val &= apic_lvt_mask[index];
+ kvm_lapic_set_reg(apic, reg, val);
+ break;
+ }
+
+ case APIC_LVTT:
+ if (!kvm_apic_sw_enabled(apic))
+ val |= APIC_LVT_MASKED;
+ val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
+ kvm_lapic_set_reg(apic, APIC_LVTT, val);
+ apic_update_lvtt(apic);
+ break;
+
+ case APIC_TMICT:
+ if (apic_lvtt_tscdeadline(apic))
+ break;
+
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ kvm_lapic_set_reg(apic, APIC_TMICT, val);
+ start_apic_timer(apic);
+ break;
+
+ case APIC_TDCR: {
+ uint32_t old_divisor = apic->divide_count;
+
+ if (val & 4)
+ apic_debug("KVM_WRITE:TDCR %x\n", val);
+ kvm_lapic_set_reg(apic, APIC_TDCR, val);
+ update_divide_count(apic);
+ if (apic->divide_count != old_divisor &&
+ apic->lapic_timer.period) {
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ update_target_expiration(apic, old_divisor);
+ restart_apic_timer(apic);
+ }
+ break;
+ }
+ case APIC_ESR:
+ if (apic_x2apic_mode(apic) && val != 0) {
+ apic_debug("KVM_WRITE:ESR not zero %x\n", val);
+ ret = 1;
+ }
+ break;
+
+ case APIC_SELF_IPI:
+ if (apic_x2apic_mode(apic)) {
+ kvm_lapic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff));
+ } else
+ ret = 1;
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ if (ret)
+ apic_debug("Local APIC Write to read-only register %x\n", reg);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_reg_write);
+
+static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+ gpa_t address, int len, const void *data)
+{
+ struct kvm_lapic *apic = to_lapic(this);
+ unsigned int offset = address - apic->base_address;
+ u32 val;
+
+ if (!apic_mmio_in_range(apic, address))
+ return -EOPNOTSUPP;
+
+ if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) {
+ if (!kvm_check_has_quirk(vcpu->kvm,
+ KVM_X86_QUIRK_LAPIC_MMIO_HOLE))
+ return -EOPNOTSUPP;
+
+ return 0;
+ }
+
+ /*
+ * APIC register must be aligned on 128-bits boundary.
+ * 32/64/128 bits registers must be accessed thru 32 bits.
+ * Refer SDM 8.4.1
+ */
+ if (len != 4 || (offset & 0xf)) {
+ /* Don't shout loud, $infamous_os would cause only noise. */
+ apic_debug("apic write: bad size=%d %lx\n", len, (long)address);
+ return 0;
+ }
+
+ val = *(u32*)data;
+
+ /* too common printing */
+ if (offset != APIC_EOI)
+ apic_debug("%s: offset 0x%x with length 0x%x, and value is "
+ "0x%x\n", __func__, offset, len, val);
+
+ kvm_lapic_reg_write(apic, offset & 0xff0, val);
+
+ return 0;
+}
+
+void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
+{
+ kvm_lapic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
+
+/* emulate APIC access in a trap manner */
+void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
+{
+ u32 val = 0;
+
+ /* hw has done the conditional check and inst decode */
+ offset &= 0xff0;
+
+ kvm_lapic_reg_read(vcpu->arch.apic, offset, 4, &val);
+
+ /* TODO: optimize to just emulate side effect w/o one more write */
+ kvm_lapic_reg_write(vcpu->arch.apic, offset, val);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode);
+
+void kvm_free_lapic(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!vcpu->arch.apic)
+ return;
+
+ hrtimer_cancel(&apic->lapic_timer.timer);
+
+ if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE))
+ static_key_slow_dec_deferred(&apic_hw_disabled);
+
+ if (!apic->sw_enabled)
+ static_key_slow_dec_deferred(&apic_sw_disabled);
+
+ if (apic->regs)
+ free_page((unsigned long)apic->regs);
+
+ kfree(apic);
+}
+
+/*
+ *----------------------------------------------------------------------
+ * LAPIC interface
+ *----------------------------------------------------------------------
+ */
+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!lapic_in_kernel(vcpu) ||
+ !apic_lvtt_tscdeadline(apic))
+ return 0;
+
+ return apic->lapic_timer.tscdeadline;
+}
+
+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) ||
+ apic_lvtt_period(apic))
+ return;
+
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ apic->lapic_timer.tscdeadline = data;
+ start_apic_timer(apic);
+}
+
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+ apic_set_tpr(vcpu->arch.apic, (cr8 & 0x0f) << 4);
+}
+
+u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
+{
+ u64 tpr;
+
+ tpr = (u64) kvm_lapic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
+
+ return (tpr & 0xf0) >> 4;
+}
+
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
+{
+ u64 old_value = vcpu->arch.apic_base;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!apic)
+ value |= MSR_IA32_APICBASE_BSP;
+
+ vcpu->arch.apic_base = value;
+
+ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE)
+ kvm_update_cpuid(vcpu);
+
+ if (!apic)
+ return;
+
+ /* update jump label if enable bit changes */
+ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
+ if (value & MSR_IA32_APICBASE_ENABLE) {
+ kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+ static_key_slow_dec_deferred(&apic_hw_disabled);
+ } else {
+ static_key_slow_inc(&apic_hw_disabled.key);
+ recalculate_apic_map(vcpu->kvm);
+ }
+ }
+
+ if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE))
+ kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
+
+ if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE))
+ kvm_x86_ops->set_virtual_apic_mode(vcpu);
+
+ apic->base_address = apic->vcpu->arch.apic_base &
+ MSR_IA32_APICBASE_BASE;
+
+ if ((value & MSR_IA32_APICBASE_ENABLE) &&
+ apic->base_address != APIC_DEFAULT_PHYS_BASE)
+ pr_warn_once("APIC base relocation is unsupported by KVM");
+
+ /* with FSB delivery interrupt, we can restart APIC functionality */
+ apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
+ "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
+
+}
+
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ int i;
+
+ if (!apic)
+ return;
+
+ apic_debug("%s\n", __func__);
+
+ /* Stop the timer in case it's a reset to an active apic */
+ hrtimer_cancel(&apic->lapic_timer.timer);
+
+ if (!init_event) {
+ kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE);
+ kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+ }
+ kvm_apic_set_version(apic->vcpu);
+
+ for (i = 0; i < KVM_APIC_LVT_NUM; i++)
+ kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
+ apic_update_lvtt(apic);
+ if (kvm_vcpu_is_reset_bsp(vcpu) &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
+ kvm_lapic_set_reg(apic, APIC_LVT0,
+ SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
+ apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
+
+ kvm_lapic_set_reg(apic, APIC_DFR, 0xffffffffU);
+ apic_set_spiv(apic, 0xff);
+ kvm_lapic_set_reg(apic, APIC_TASKPRI, 0);
+ if (!apic_x2apic_mode(apic))
+ kvm_apic_set_ldr(apic, 0);
+ kvm_lapic_set_reg(apic, APIC_ESR, 0);
+ kvm_lapic_set_reg(apic, APIC_ICR, 0);
+ kvm_lapic_set_reg(apic, APIC_ICR2, 0);
+ kvm_lapic_set_reg(apic, APIC_TDCR, 0);
+ kvm_lapic_set_reg(apic, APIC_TMICT, 0);
+ for (i = 0; i < 8; i++) {
+ kvm_lapic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
+ kvm_lapic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
+ kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
+ }
+ apic->irr_pending = vcpu->arch.apicv_active;
+ apic->isr_count = vcpu->arch.apicv_active ? 1 : 0;
+ apic->highest_isr_cache = -1;
+ update_divide_count(apic);
+ atomic_set(&apic->lapic_timer.pending, 0);
+ if (kvm_vcpu_is_bsp(vcpu))
+ kvm_lapic_set_base(vcpu,
+ vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP);
+ vcpu->arch.pv_eoi.msr_val = 0;
+ apic_update_ppr(apic);
+ if (vcpu->arch.apicv_active) {
+ kvm_x86_ops->apicv_post_state_restore(vcpu);
+ kvm_x86_ops->hwapic_irr_update(vcpu, -1);
+ kvm_x86_ops->hwapic_isr_update(vcpu, -1);
+ }
+
+ vcpu->arch.apic_arb_prio = 0;
+ vcpu->arch.apic_attention = 0;
+
+ apic_debug("%s: vcpu=%p, id=0x%x, base_msr="
+ "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
+ vcpu, kvm_lapic_get_reg(apic, APIC_ID),
+ vcpu->arch.apic_base, apic->base_address);
+}
+
+/*
+ *----------------------------------------------------------------------
+ * timer interface
+ *----------------------------------------------------------------------
+ */
+
+static bool lapic_is_periodic(struct kvm_lapic *apic)
+{
+ return apic_lvtt_period(apic);
+}
+
+int apic_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT))
+ return atomic_read(&apic->lapic_timer.pending);
+
+ return 0;
+}
+
+int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
+{
+ u32 reg = kvm_lapic_get_reg(apic, lvt_type);
+ int vector, mode, trig_mode;
+
+ if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
+ vector = reg & APIC_VECTOR_MASK;
+ mode = reg & APIC_MODE_MASK;
+ trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
+ return __apic_accept_irq(apic, mode, vector, 1, trig_mode,
+ NULL);
+ }
+ return 0;
+}
+
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (apic)
+ kvm_apic_local_deliver(apic, APIC_LVT0);
+}
+
+static const struct kvm_io_device_ops apic_mmio_ops = {
+ .read = apic_mmio_read,
+ .write = apic_mmio_write,
+};
+
+static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
+{
+ struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+ struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
+
+ apic_timer_expired(apic);
+
+ if (lapic_is_periodic(apic)) {
+ advance_periodic_target_expiration(apic);
+ hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+ return HRTIMER_RESTART;
+ } else
+ return HRTIMER_NORESTART;
+}
+
+int kvm_create_lapic(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic;
+
+ ASSERT(vcpu != NULL);
+ apic_debug("apic_init %d\n", vcpu->vcpu_id);
+
+ apic = kzalloc(sizeof(*apic), GFP_KERNEL);
+ if (!apic)
+ goto nomem;
+
+ vcpu->arch.apic = apic;
+
+ apic->regs = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!apic->regs) {
+ printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
+ vcpu->vcpu_id);
+ goto nomem_free_apic;
+ }
+ apic->vcpu = vcpu;
+
+ hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED);
+ apic->lapic_timer.timer.function = apic_timer_fn;
+
+ /*
+ * APIC is created enabled. This will prevent kvm_lapic_set_base from
+ * thinking that APIC satet has changed.
+ */
+ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
+ static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
+ kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
+
+ return 0;
+nomem_free_apic:
+ kfree(apic);
+nomem:
+ return -ENOMEM;
+}
+
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 ppr;
+
+ if (!kvm_apic_present(vcpu))
+ return -1;
+
+ __apic_update_ppr(apic, &ppr);
+ return apic_has_interrupt_for_ppr(apic, ppr);
+}
+
+int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
+{
+ u32 lvt0 = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVT0);
+ int r = 0;
+
+ if (!kvm_apic_hw_enabled(vcpu->arch.apic))
+ r = 1;
+ if ((lvt0 & APIC_LVT_MASKED) == 0 &&
+ GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
+ r = 1;
+ return r;
+}
+
+void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (atomic_read(&apic->lapic_timer.pending) > 0) {
+ kvm_apic_local_deliver(apic, APIC_LVTT);
+ if (apic_lvtt_tscdeadline(apic))
+ apic->lapic_timer.tscdeadline = 0;
+ if (apic_lvtt_oneshot(apic)) {
+ apic->lapic_timer.tscdeadline = 0;
+ apic->lapic_timer.target_expiration = 0;
+ }
+ atomic_set(&apic->lapic_timer.pending, 0);
+ }
+}
+
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
+{
+ int vector = kvm_apic_has_interrupt(vcpu);
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 ppr;
+
+ if (vector == -1)
+ return -1;
+
+ /*
+ * We get here even with APIC virtualization enabled, if doing
+ * nested virtualization and L1 runs with the "acknowledge interrupt
+ * on exit" mode. Then we cannot inject the interrupt via RVI,
+ * because the process would deliver it through the IDT.
+ */
+
+ apic_clear_irr(vector, apic);
+ if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) {
+ /*
+ * For auto-EOI interrupts, there might be another pending
+ * interrupt above PPR, so check whether to raise another
+ * KVM_REQ_EVENT.
+ */
+ apic_update_ppr(apic);
+ } else {
+ /*
+ * For normal interrupts, PPR has been raised and there cannot
+ * be a higher-priority pending interrupt---except if there was
+ * a concurrent interrupt injection, but that would have
+ * triggered KVM_REQ_EVENT already.
+ */
+ apic_set_isr(vector, apic);
+ __apic_update_ppr(apic, &ppr);
+ }
+
+ return vector;
+}
+
+static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s, bool set)
+{
+ if (apic_x2apic_mode(vcpu->arch.apic)) {
+ u32 *id = (u32 *)(s->regs + APIC_ID);
+ u32 *ldr = (u32 *)(s->regs + APIC_LDR);
+
+ if (vcpu->kvm->arch.x2apic_format) {
+ if (*id != vcpu->vcpu_id)
+ return -EINVAL;
+ } else {
+ if (set)
+ *id >>= 24;
+ else
+ *id <<= 24;
+ }
+
+ /* In x2APIC mode, the LDR is fixed and based on the id */
+ if (set)
+ *ldr = kvm_apic_calc_x2apic_ldr(*id);
+ }
+
+ return 0;
+}
+
+int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
+{
+ memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s));
+ return kvm_apic_state_fixup(vcpu, s, false);
+}
+
+int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ int r;
+
+
+ kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
+ /* set SPIV separately to get count of SW disabled APICs right */
+ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
+
+ r = kvm_apic_state_fixup(vcpu, s, true);
+ if (r)
+ return r;
+ memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
+
+ recalculate_apic_map(vcpu->kvm);
+ kvm_apic_set_version(vcpu);
+
+ apic_update_ppr(apic);
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ apic_update_lvtt(apic);
+ apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
+ update_divide_count(apic);
+ start_apic_timer(apic);
+ apic->irr_pending = true;
+ apic->isr_count = vcpu->arch.apicv_active ?
+ 1 : count_vectors(apic->regs + APIC_ISR);
+ apic->highest_isr_cache = -1;
+ if (vcpu->arch.apicv_active) {
+ kvm_x86_ops->apicv_post_state_restore(vcpu);
+ kvm_x86_ops->hwapic_irr_update(vcpu,
+ apic_find_highest_irr(apic));
+ kvm_x86_ops->hwapic_isr_update(vcpu,
+ apic_find_highest_isr(apic));
+ }
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ if (ioapic_in_kernel(vcpu->kvm))
+ kvm_rtc_eoi_tracking_restore_one(vcpu);
+
+ vcpu->arch.apic_arb_prio = 0;
+
+ return 0;
+}
+
+void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
+{
+ struct hrtimer *timer;
+
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ timer = &vcpu->arch.apic->lapic_timer.timer;
+ if (hrtimer_cancel(timer))
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+}
+
+/*
+ * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt
+ *
+ * Detect whether guest triggered PV EOI since the
+ * last entry. If yes, set EOI on guests's behalf.
+ * Clear PV EOI in guest memory in any case.
+ */
+static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
+ struct kvm_lapic *apic)
+{
+ bool pending;
+ int vector;
+ /*
+ * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host
+ * and KVM_PV_EOI_ENABLED in guest memory as follows:
+ *
+ * KVM_APIC_PV_EOI_PENDING is unset:
+ * -> host disabled PV EOI.
+ * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set:
+ * -> host enabled PV EOI, guest did not execute EOI yet.
+ * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset:
+ * -> host enabled PV EOI, guest executed EOI.
+ */
+ BUG_ON(!pv_eoi_enabled(vcpu));
+ pending = pv_eoi_get_pending(vcpu);
+ /*
+ * Clear pending bit in any case: it will be set again on vmentry.
+ * While this might not be ideal from performance point of view,
+ * this makes sure pv eoi is only enabled when we know it's safe.
+ */
+ pv_eoi_clr_pending(vcpu);
+ if (pending)
+ return;
+ vector = apic_set_eoi(apic);
+ trace_kvm_pv_eoi(apic, vector);
+}
+
+void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
+{
+ u32 data;
+
+ if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention))
+ apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic);
+
+ if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
+ return;
+
+ if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32)))
+ return;
+
+ apic_set_tpr(vcpu->arch.apic, data & 0xff);
+}
+
+/*
+ * apic_sync_pv_eoi_to_guest - called before vmentry
+ *
+ * Detect whether it's safe to enable PV EOI and
+ * if yes do so.
+ */
+static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
+ struct kvm_lapic *apic)
+{
+ if (!pv_eoi_enabled(vcpu) ||
+ /* IRR set or many bits in ISR: could be nested. */
+ apic->irr_pending ||
+ /* Cache not set: could be safe but we don't bother. */
+ apic->highest_isr_cache == -1 ||
+ /* Need EOI to update ioapic. */
+ kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) {
+ /*
+ * PV EOI was disabled by apic_sync_pv_eoi_from_guest
+ * so we need not do anything here.
+ */
+ return;
+ }
+
+ pv_eoi_set_pending(apic->vcpu);
+}
+
+void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
+{
+ u32 data, tpr;
+ int max_irr, max_isr;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ apic_sync_pv_eoi_to_guest(vcpu, apic);
+
+ if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
+ return;
+
+ tpr = kvm_lapic_get_reg(apic, APIC_TASKPRI) & 0xff;
+ max_irr = apic_find_highest_irr(apic);
+ if (max_irr < 0)
+ max_irr = 0;
+ max_isr = apic_find_highest_isr(apic);
+ if (max_isr < 0)
+ max_isr = 0;
+ data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
+
+ kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32));
+}
+
+int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
+{
+ if (vapic_addr) {
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->arch.apic->vapic_cache,
+ vapic_addr, sizeof(u32)))
+ return -EINVAL;
+ __set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
+ } else {
+ __clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
+ }
+
+ vcpu->arch.apic->vapic_addr = vapic_addr;
+ return 0;
+}
+
+int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = (msr - APIC_BASE_MSR) << 4;
+
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
+ return 1;
+
+ if (reg == APIC_ICR2)
+ return 1;
+
+ /* if this is ICR write vector before command */
+ if (reg == APIC_ICR)
+ kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
+ return kvm_lapic_reg_write(apic, reg, (u32)data);
+}
+
+int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
+
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
+ return 1;
+
+ if (reg == APIC_DFR || reg == APIC_ICR2) {
+ apic_debug("KVM_APIC_READ: read x2apic reserved register %x\n",
+ reg);
+ return 1;
+ }
+
+ if (kvm_lapic_reg_read(apic, reg, 4, &low))
+ return 1;
+ if (reg == APIC_ICR)
+ kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high);
+
+ *data = (((u64)high) << 32) | low;
+
+ return 0;
+}
+
+int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!lapic_in_kernel(vcpu))
+ return 1;
+
+ /* if this is ICR write vector before command */
+ if (reg == APIC_ICR)
+ kvm_lapic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
+ return kvm_lapic_reg_write(apic, reg, (u32)data);
+}
+
+int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 low, high = 0;
+
+ if (!lapic_in_kernel(vcpu))
+ return 1;
+
+ if (kvm_lapic_reg_read(apic, reg, 4, &low))
+ return 1;
+ if (reg == APIC_ICR)
+ kvm_lapic_reg_read(apic, APIC_ICR2, 4, &high);
+
+ *data = (((u64)high) << 32) | low;
+
+ return 0;
+}
+
+int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len)
+{
+ u64 addr = data & ~KVM_MSR_ENABLED;
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data;
+ unsigned long new_len;
+
+ if (!IS_ALIGNED(addr, 4))
+ return 1;
+
+ vcpu->arch.pv_eoi.msr_val = data;
+ if (!pv_eoi_enabled(vcpu))
+ return 0;
+
+ if (addr == ghc->gpa && len <= ghc->len)
+ new_len = ghc->len;
+ else
+ new_len = len;
+
+ return kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len);
+}
+
+void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u8 sipi_vector;
+ unsigned long pe;
+
+ if (!lapic_in_kernel(vcpu) || !apic->pending_events)
+ return;
+
+ /*
+ * INITs are latched while in SMM. Because an SMM CPU cannot
+ * be in KVM_MP_STATE_INIT_RECEIVED state, just eat SIPIs
+ * and delay processing of INIT until the next RSM.
+ */
+ if (is_smm(vcpu)) {
+ WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED);
+ if (test_bit(KVM_APIC_SIPI, &apic->pending_events))
+ clear_bit(KVM_APIC_SIPI, &apic->pending_events);
+ return;
+ }
+
+ pe = xchg(&apic->pending_events, 0);
+ if (test_bit(KVM_APIC_INIT, &pe)) {
+ kvm_vcpu_reset(vcpu, true);
+ if (kvm_vcpu_is_bsp(apic->vcpu))
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ else
+ vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+ }
+ if (test_bit(KVM_APIC_SIPI, &pe) &&
+ vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+ /* evaluate pending_events before reading the vector */
+ smp_rmb();
+ sipi_vector = apic->sipi_vector;
+ apic_debug("vcpu %d received sipi with vector # %x\n",
+ vcpu->vcpu_id, sipi_vector);
+ kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector);
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ }
+}
+
+void kvm_lapic_init(void)
+{
+ /* do not patch jump label more than once per second */
+ jump_label_rate_limit(&apic_hw_disabled, HZ);
+ jump_label_rate_limit(&apic_sw_disabled, HZ);
+}
+
+void kvm_lapic_exit(void)
+{
+ static_key_deferred_flush(&apic_hw_disabled);
+ static_key_deferred_flush(&apic_sw_disabled);
+}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
new file mode 100644
index 000000000..ff6ef9c3d
--- /dev/null
+++ b/arch/x86/kvm/lapic.h
@@ -0,0 +1,237 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_LAPIC_H
+#define __KVM_X86_LAPIC_H
+
+#include <kvm/iodev.h>
+
+#include <linux/kvm_host.h>
+
+#define KVM_APIC_INIT 0
+#define KVM_APIC_SIPI 1
+#define KVM_APIC_LVT_NUM 6
+
+#define KVM_APIC_SHORT_MASK 0xc0000
+#define KVM_APIC_DEST_MASK 0x800
+
+#define APIC_BUS_CYCLE_NS 1
+#define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS)
+
+enum lapic_mode {
+ LAPIC_MODE_DISABLED = 0,
+ LAPIC_MODE_INVALID = X2APIC_ENABLE,
+ LAPIC_MODE_XAPIC = MSR_IA32_APICBASE_ENABLE,
+ LAPIC_MODE_X2APIC = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE,
+};
+
+struct kvm_timer {
+ struct hrtimer timer;
+ s64 period; /* unit: ns */
+ ktime_t target_expiration;
+ u32 timer_mode;
+ u32 timer_mode_mask;
+ u64 tscdeadline;
+ u64 expired_tscdeadline;
+ atomic_t pending; /* accumulated triggered timers */
+ bool hv_timer_in_use;
+};
+
+struct kvm_lapic {
+ unsigned long base_address;
+ struct kvm_io_device dev;
+ struct kvm_timer lapic_timer;
+ u32 divide_count;
+ struct kvm_vcpu *vcpu;
+ bool sw_enabled;
+ bool irr_pending;
+ bool lvt0_in_nmi_mode;
+ /* Number of bits set in ISR. */
+ s16 isr_count;
+ /* The highest vector set in ISR; if -1 - invalid, must scan ISR. */
+ int highest_isr_cache;
+ /**
+ * APIC register page. The layout matches the register layout seen by
+ * the guest 1:1, because it is accessed by the vmx microcode.
+ * Note: Only one register, the TPR, is used by the microcode.
+ */
+ void *regs;
+ gpa_t vapic_addr;
+ struct gfn_to_hva_cache vapic_cache;
+ unsigned long pending_events;
+ unsigned int sipi_vector;
+};
+
+struct dest_map;
+
+int kvm_create_lapic(struct kvm_vcpu *vcpu);
+void kvm_free_lapic(struct kvm_vcpu *vcpu);
+
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
+void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
+u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
+void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
+void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
+u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
+void kvm_apic_set_version(struct kvm_vcpu *vcpu);
+int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val);
+int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
+ void *data);
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+ int short_hand, unsigned int dest, int dest_mode);
+
+bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr);
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr);
+void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
+ struct dest_map *dest_map);
+int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
+
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map);
+
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
+int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu);
+int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
+
+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
+
+void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset);
+void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector);
+
+int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
+void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
+void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
+
+int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+
+int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+
+static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
+}
+
+int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len);
+void kvm_lapic_init(void);
+void kvm_lapic_exit(void);
+
+#define VEC_POS(v) ((v) & (32 - 1))
+#define REG_POS(v) (((v) >> 5) << 4)
+
+static inline void kvm_lapic_set_vector(int vec, void *bitmap)
+{
+ set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline void kvm_lapic_set_irr(int vec, struct kvm_lapic *apic)
+{
+ kvm_lapic_set_vector(vec, apic->regs + APIC_IRR);
+ /*
+ * irr_pending must be true if any interrupt is pending; set it after
+ * APIC_IRR to avoid race with apic_clear_irr
+ */
+ apic->irr_pending = true;
+}
+
+static inline u32 kvm_lapic_get_reg(struct kvm_lapic *apic, int reg_off)
+{
+ return *((u32 *) (apic->regs + reg_off));
+}
+
+static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val)
+{
+ *((u32 *) (apic->regs + reg_off)) = val;
+}
+
+extern struct static_key kvm_no_apic_vcpu;
+
+static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu)
+{
+ if (static_key_false(&kvm_no_apic_vcpu))
+ return vcpu->arch.apic;
+ return true;
+}
+
+extern struct static_key_deferred apic_hw_disabled;
+
+static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic)
+{
+ if (static_key_false(&apic_hw_disabled.key))
+ return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
+ return MSR_IA32_APICBASE_ENABLE;
+}
+
+extern struct static_key_deferred apic_sw_disabled;
+
+static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic)
+{
+ if (static_key_false(&apic_sw_disabled.key))
+ return apic->sw_enabled;
+ return true;
+}
+
+static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
+}
+
+static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+{
+ return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic);
+}
+
+static inline int apic_x2apic_mode(struct kvm_lapic *apic)
+{
+ return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
+}
+
+static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.apic && vcpu->arch.apicv_active;
+}
+
+static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && vcpu->arch.apic->pending_events;
+}
+
+static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
+{
+ return (irq->delivery_mode == APIC_DM_LOWEST ||
+ irq->msi_redir_hint);
+}
+
+static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
+{
+ return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+}
+
+bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
+
+void wait_lapic_expire(struct kvm_vcpu *vcpu);
+
+bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
+ struct kvm_vcpu **dest_vcpu);
+int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
+ const unsigned long *bitmap, u32 bitmap_size);
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
+void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu);
+
+static inline enum lapic_mode kvm_apic_mode(u64 apic_base)
+{
+ return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
+}
+
+#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
new file mode 100644
index 000000000..0cb82172c
--- /dev/null
+++ b/arch/x86/kvm/mmu.c
@@ -0,0 +1,6292 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "irq.h"
+#include "mmu.h"
+#include "x86.h"
+#include "kvm_cache_regs.h"
+#include "cpuid.h"
+
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/moduleparam.h>
+#include <linux/export.h>
+#include <linux/swap.h>
+#include <linux/hugetlb.h>
+#include <linux/compiler.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/sched/signal.h>
+#include <linux/uaccess.h>
+#include <linux/hash.h>
+#include <linux/kern_levels.h>
+#include <linux/kthread.h>
+
+#include <asm/page.h>
+#include <asm/pat.h>
+#include <asm/cmpxchg.h>
+#include <asm/io.h>
+#include <asm/vmx.h>
+#include <asm/kvm_page_track.h>
+#include "trace.h"
+
+extern bool itlb_multihit_kvm_mitigation;
+
+static int __read_mostly nx_huge_pages = -1;
+static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
+
+static struct kernel_param_ops nx_huge_pages_ops = {
+ .set = set_nx_huge_pages,
+ .get = param_get_bool,
+};
+
+static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
+ .set = set_nx_huge_pages_recovery_ratio,
+ .get = param_get_uint,
+};
+
+module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages, "bool");
+module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
+ &nx_huge_pages_recovery_ratio, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
+
+/*
+ * When setting this variable to true it enables Two-Dimensional-Paging
+ * where the hardware walks 2 page tables:
+ * 1. the guest-virtual to guest-physical
+ * 2. while doing 1. it walks guest-physical to host-physical
+ * If the hardware supports that we don't need to do shadow paging.
+ */
+bool tdp_enabled = false;
+
+enum {
+ AUDIT_PRE_PAGE_FAULT,
+ AUDIT_POST_PAGE_FAULT,
+ AUDIT_PRE_PTE_WRITE,
+ AUDIT_POST_PTE_WRITE,
+ AUDIT_PRE_SYNC,
+ AUDIT_POST_SYNC
+};
+
+#undef MMU_DEBUG
+
+#ifdef MMU_DEBUG
+static bool dbg = 0;
+module_param(dbg, bool, 0644);
+
+#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
+#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
+#define MMU_WARN_ON(x) WARN_ON(x)
+#else
+#define pgprintk(x...) do { } while (0)
+#define rmap_printk(x...) do { } while (0)
+#define MMU_WARN_ON(x) do { } while (0)
+#endif
+
+#define PTE_PREFETCH_NUM 8
+
+#define PT_FIRST_AVAIL_BITS_SHIFT 10
+#define PT64_SECOND_AVAIL_BITS_SHIFT 52
+
+#define PT64_LEVEL_BITS 9
+
+#define PT64_LEVEL_SHIFT(level) \
+ (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
+
+#define PT64_INDEX(address, level)\
+ (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
+
+
+#define PT32_LEVEL_BITS 10
+
+#define PT32_LEVEL_SHIFT(level) \
+ (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
+
+#define PT32_LVL_OFFSET_MASK(level) \
+ (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT32_LEVEL_BITS))) - 1))
+
+#define PT32_INDEX(address, level)\
+ (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
+
+
+#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
+#define PT64_DIR_BASE_ADDR_MASK \
+ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
+#define PT64_LVL_ADDR_MASK(level) \
+ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT64_LEVEL_BITS))) - 1))
+#define PT64_LVL_OFFSET_MASK(level) \
+ (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT64_LEVEL_BITS))) - 1))
+
+#define PT32_BASE_ADDR_MASK PAGE_MASK
+#define PT32_DIR_BASE_ADDR_MASK \
+ (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
+#define PT32_LVL_ADDR_MASK(level) \
+ (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
+ * PT32_LEVEL_BITS))) - 1))
+
+#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
+ | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
+
+#define ACC_EXEC_MASK 1
+#define ACC_WRITE_MASK PT_WRITABLE_MASK
+#define ACC_USER_MASK PT_USER_MASK
+#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+
+/* The mask for the R/X bits in EPT PTEs */
+#define PT64_EPT_READABLE_MASK 0x1ull
+#define PT64_EPT_EXECUTABLE_MASK 0x4ull
+
+#include <trace/events/kvm.h>
+
+#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
+
+#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+
+/* make pte_list_desc fit well in cache line */
+#define PTE_LIST_EXT 3
+
+/*
+ * Return values of handle_mmio_page_fault and mmu.page_fault:
+ * RET_PF_RETRY: let CPU fault again on the address.
+ * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
+ *
+ * For handle_mmio_page_fault only:
+ * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ */
+enum {
+ RET_PF_RETRY = 0,
+ RET_PF_EMULATE = 1,
+ RET_PF_INVALID = 2,
+};
+
+struct pte_list_desc {
+ u64 *sptes[PTE_LIST_EXT];
+ struct pte_list_desc *more;
+};
+
+struct kvm_shadow_walk_iterator {
+ u64 addr;
+ hpa_t shadow_addr;
+ u64 *sptep;
+ int level;
+ unsigned index;
+};
+
+static const union kvm_mmu_page_role mmu_base_role_mask = {
+ .cr0_wp = 1,
+ .cr4_pae = 1,
+ .nxe = 1,
+ .smep_andnot_wp = 1,
+ .smap_andnot_wp = 1,
+ .smm = 1,
+ .guest_mode = 1,
+ .ad_disabled = 1,
+};
+
+#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \
+ for (shadow_walk_init_using_root(&(_walker), (_vcpu), \
+ (_root), (_addr)); \
+ shadow_walk_okay(&(_walker)); \
+ shadow_walk_next(&(_walker)))
+
+#define for_each_shadow_entry(_vcpu, _addr, _walker) \
+ for (shadow_walk_init(&(_walker), _vcpu, _addr); \
+ shadow_walk_okay(&(_walker)); \
+ shadow_walk_next(&(_walker)))
+
+#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
+ for (shadow_walk_init(&(_walker), _vcpu, _addr); \
+ shadow_walk_okay(&(_walker)) && \
+ ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
+ __shadow_walk_next(&(_walker), spte))
+
+static struct kmem_cache *pte_list_desc_cache;
+static struct kmem_cache *mmu_page_header_cache;
+static struct percpu_counter kvm_total_used_mmu_pages;
+
+static u64 __read_mostly shadow_nx_mask;
+static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
+static u64 __read_mostly shadow_user_mask;
+static u64 __read_mostly shadow_accessed_mask;
+static u64 __read_mostly shadow_dirty_mask;
+static u64 __read_mostly shadow_mmio_mask;
+static u64 __read_mostly shadow_mmio_value;
+static u64 __read_mostly shadow_present_mask;
+static u64 __read_mostly shadow_me_mask;
+
+/*
+ * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
+ * Non-present SPTEs with shadow_acc_track_value set are in place for access
+ * tracking.
+ */
+static u64 __read_mostly shadow_acc_track_mask;
+static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
+
+/*
+ * The mask/shift to use for saving the original R/X bits when marking the PTE
+ * as not-present for access tracking purposes. We do not save the W bit as the
+ * PTEs being access tracked also need to be dirty tracked, so the W bit will be
+ * restored only when a write is attempted to the page.
+ */
+static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
+ PT64_EPT_EXECUTABLE_MASK;
+static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
+
+/*
+ * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
+ * to guard against L1TF attacks.
+ */
+static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
+
+/*
+ * The number of high-order 1 bits to use in the mask above.
+ */
+static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
+
+/*
+ * In some cases, we need to preserve the GFN of a non-present or reserved
+ * SPTE when we usurp the upper five bits of the physical address space to
+ * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll
+ * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
+ * left into the reserved bits, i.e. the GFN in the SPTE will be split into
+ * high and low parts. This mask covers the lower bits of the GFN.
+ */
+static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
+
+/*
+ * The number of non-reserved physical address bits irrespective of features
+ * that repurpose legal bits, e.g. MKTME.
+ */
+static u8 __read_mostly shadow_phys_bits;
+
+static void mmu_spte_set(u64 *sptep, u64 spte);
+static bool is_executable_pte(u64 spte);
+static union kvm_mmu_page_role
+kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
+
+#define CREATE_TRACE_POINTS
+#include "mmutrace.h"
+
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
+{
+ BUG_ON((mmio_mask & mmio_value) != mmio_value);
+ WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
+ WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
+ shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK;
+ shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+
+static bool is_mmio_spte(u64 spte)
+{
+ return (spte & shadow_mmio_mask) == shadow_mmio_value;
+}
+
+static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
+{
+ return sp->role.ad_disabled;
+}
+
+static inline bool spte_ad_enabled(u64 spte)
+{
+ MMU_WARN_ON(is_mmio_spte(spte));
+ return !(spte & shadow_acc_track_value);
+}
+
+static bool is_nx_huge_page_enabled(void)
+{
+ return READ_ONCE(nx_huge_pages);
+}
+
+static inline u64 spte_shadow_accessed_mask(u64 spte)
+{
+ MMU_WARN_ON(is_mmio_spte(spte));
+ return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
+}
+
+static inline u64 spte_shadow_dirty_mask(u64 spte)
+{
+ MMU_WARN_ON(is_mmio_spte(spte));
+ return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
+}
+
+static inline bool is_access_track_spte(u64 spte)
+{
+ return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
+}
+
+/*
+ * the low bit of the generation number is always presumed to be zero.
+ * This disables mmio caching during memslot updates. The concept is
+ * similar to a seqcount but instead of retrying the access we just punt
+ * and ignore the cache.
+ *
+ * spte bits 3-11 are used as bits 1-9 of the generation number,
+ * the bits 52-61 are used as bits 10-19 of the generation number.
+ */
+#define MMIO_SPTE_GEN_LOW_SHIFT 2
+#define MMIO_SPTE_GEN_HIGH_SHIFT 52
+
+#define MMIO_GEN_SHIFT 20
+#define MMIO_GEN_LOW_SHIFT 10
+#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2)
+#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1)
+
+static u64 generation_mmio_spte_mask(unsigned int gen)
+{
+ u64 mask;
+
+ WARN_ON(gen & ~MMIO_GEN_MASK);
+
+ mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
+ mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
+ return mask;
+}
+
+static unsigned int get_mmio_spte_generation(u64 spte)
+{
+ unsigned int gen;
+
+ spte &= ~shadow_mmio_mask;
+
+ gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
+ gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
+ return gen;
+}
+
+static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK;
+}
+
+static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
+ unsigned access)
+{
+ unsigned int gen = kvm_current_mmio_generation(vcpu);
+ u64 mask = generation_mmio_spte_mask(gen);
+ u64 gpa = gfn << PAGE_SHIFT;
+
+ access &= ACC_WRITE_MASK | ACC_USER_MASK;
+ mask |= shadow_mmio_value | access;
+ mask |= gpa | shadow_nonpresent_or_rsvd_mask;
+ mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
+ << shadow_nonpresent_or_rsvd_mask_len;
+
+ trace_mark_mmio_spte(sptep, gfn, access, gen);
+ mmu_spte_set(sptep, mask);
+}
+
+static gfn_t get_mmio_spte_gfn(u64 spte)
+{
+ u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
+
+ gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
+ & shadow_nonpresent_or_rsvd_mask;
+
+ return gpa >> PAGE_SHIFT;
+}
+
+static unsigned get_mmio_spte_access(u64 spte)
+{
+ u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask;
+ return (spte & ~mask) & ~PAGE_MASK;
+}
+
+static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
+ kvm_pfn_t pfn, unsigned access)
+{
+ if (unlikely(is_noslot_pfn(pfn))) {
+ mark_mmio_spte(vcpu, sptep, gfn, access);
+ return true;
+ }
+
+ return false;
+}
+
+static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
+{
+ unsigned int kvm_gen, spte_gen;
+
+ kvm_gen = kvm_current_mmio_generation(vcpu);
+ spte_gen = get_mmio_spte_generation(spte);
+
+ trace_check_mmio_spte(spte, kvm_gen, spte_gen);
+ return likely(kvm_gen == spte_gen);
+}
+
+/*
+ * Sets the shadow PTE masks used by the MMU.
+ *
+ * Assumptions:
+ * - Setting either @accessed_mask or @dirty_mask requires setting both
+ * - At least one of @accessed_mask or @acc_track_mask must be set
+ */
+void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
+ u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
+ u64 acc_track_mask, u64 me_mask)
+{
+ BUG_ON(!dirty_mask != !accessed_mask);
+ BUG_ON(!accessed_mask && !acc_track_mask);
+ BUG_ON(acc_track_mask & shadow_acc_track_value);
+
+ shadow_user_mask = user_mask;
+ shadow_accessed_mask = accessed_mask;
+ shadow_dirty_mask = dirty_mask;
+ shadow_nx_mask = nx_mask;
+ shadow_x_mask = x_mask;
+ shadow_present_mask = p_mask;
+ shadow_acc_track_mask = acc_track_mask;
+ shadow_me_mask = me_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
+
+static u8 kvm_get_shadow_phys_bits(void)
+{
+ /*
+ * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected
+ * in CPU detection code, but MKTME treats those reduced bits as
+ * 'keyID' thus they are not reserved bits. Therefore for MKTME
+ * we should still return physical address bits reported by CPUID.
+ */
+ if (!boot_cpu_has(X86_FEATURE_TME) ||
+ WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008))
+ return boot_cpu_data.x86_phys_bits;
+
+ return cpuid_eax(0x80000008) & 0xff;
+}
+
+static void kvm_mmu_reset_all_pte_masks(void)
+{
+ u8 low_phys_bits;
+
+ shadow_user_mask = 0;
+ shadow_accessed_mask = 0;
+ shadow_dirty_mask = 0;
+ shadow_nx_mask = 0;
+ shadow_x_mask = 0;
+ shadow_mmio_mask = 0;
+ shadow_present_mask = 0;
+ shadow_acc_track_mask = 0;
+
+ shadow_phys_bits = kvm_get_shadow_phys_bits();
+
+ /*
+ * If the CPU has 46 or less physical address bits, then set an
+ * appropriate mask to guard against L1TF attacks. Otherwise, it is
+ * assumed that the CPU is not vulnerable to L1TF.
+ *
+ * Some Intel CPUs address the L1 cache using more PA bits than are
+ * reported by CPUID. Use the PA width of the L1 cache when possible
+ * to achieve more effective mitigation, e.g. if system RAM overlaps
+ * the most significant bits of legal physical address space.
+ */
+ shadow_nonpresent_or_rsvd_mask = 0;
+ low_phys_bits = boot_cpu_data.x86_phys_bits;
+ if (boot_cpu_has_bug(X86_BUG_L1TF) &&
+ !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
+ 52 - shadow_nonpresent_or_rsvd_mask_len)) {
+ low_phys_bits = boot_cpu_data.x86_cache_bits
+ - shadow_nonpresent_or_rsvd_mask_len;
+ shadow_nonpresent_or_rsvd_mask =
+ rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
+ }
+
+ shadow_nonpresent_or_rsvd_lower_gfn_mask =
+ GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
+}
+
+static int is_cpuid_PSE36(void)
+{
+ return 1;
+}
+
+static int is_nx(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.efer & EFER_NX;
+}
+
+static int is_shadow_present_pte(u64 pte)
+{
+ return (pte != 0) && !is_mmio_spte(pte);
+}
+
+static int is_large_pte(u64 pte)
+{
+ return pte & PT_PAGE_SIZE_MASK;
+}
+
+static int is_last_spte(u64 pte, int level)
+{
+ if (level == PT_PAGE_TABLE_LEVEL)
+ return 1;
+ if (is_large_pte(pte))
+ return 1;
+ return 0;
+}
+
+static bool is_executable_pte(u64 spte)
+{
+ return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
+}
+
+static kvm_pfn_t spte_to_pfn(u64 pte)
+{
+ return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+}
+
+static gfn_t pse36_gfn_delta(u32 gpte)
+{
+ int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
+
+ return (gpte & PT32_DIR_PSE36_MASK) << shift;
+}
+
+#ifdef CONFIG_X86_64
+static void __set_spte(u64 *sptep, u64 spte)
+{
+ WRITE_ONCE(*sptep, spte);
+}
+
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
+{
+ WRITE_ONCE(*sptep, spte);
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+ return xchg(sptep, spte);
+}
+
+static u64 __get_spte_lockless(u64 *sptep)
+{
+ return READ_ONCE(*sptep);
+}
+#else
+union split_spte {
+ struct {
+ u32 spte_low;
+ u32 spte_high;
+ };
+ u64 spte;
+};
+
+static void count_spte_clear(u64 *sptep, u64 spte)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+ if (is_shadow_present_pte(spte))
+ return;
+
+ /* Ensure the spte is completely set before we increase the count */
+ smp_wmb();
+ sp->clear_spte_count++;
+}
+
+static void __set_spte(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ ssptep->spte_high = sspte.spte_high;
+
+ /*
+ * If we map the spte from nonpresent to present, We should store
+ * the high bits firstly, then set present bit, so cpu can not
+ * fetch this spte while we are setting the spte.
+ */
+ smp_wmb();
+
+ WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
+}
+
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
+
+ /*
+ * If we map the spte from present to nonpresent, we should clear
+ * present bit firstly to avoid vcpu fetch the old high bits.
+ */
+ smp_wmb();
+
+ ssptep->spte_high = sspte.spte_high;
+ count_spte_clear(sptep, spte);
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte, orig;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ /* xchg acts as a barrier before the setting of the high bits */
+ orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
+ orig.spte_high = ssptep->spte_high;
+ ssptep->spte_high = sspte.spte_high;
+ count_spte_clear(sptep, spte);
+
+ return orig.spte;
+}
+
+/*
+ * The idea using the light way get the spte on x86_32 guest is from
+ * gup_get_pte(arch/x86/mm/gup.c).
+ *
+ * An spte tlb flush may be pending, because kvm_set_pte_rmapp
+ * coalesces them and we are running out of the MMU lock. Therefore
+ * we need to protect against in-progress updates of the spte.
+ *
+ * Reading the spte while an update is in progress may get the old value
+ * for the high part of the spte. The race is fine for a present->non-present
+ * change (because the high part of the spte is ignored for non-present spte),
+ * but for a present->present change we must reread the spte.
+ *
+ * All such changes are done in two steps (present->non-present and
+ * non-present->present), hence it is enough to count the number of
+ * present->non-present updates: if it changed while reading the spte,
+ * we might have hit the race. This is done using clear_spte_count.
+ */
+static u64 __get_spte_lockless(u64 *sptep)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ union split_spte spte, *orig = (union split_spte *)sptep;
+ int count;
+
+retry:
+ count = sp->clear_spte_count;
+ smp_rmb();
+
+ spte.spte_low = orig->spte_low;
+ smp_rmb();
+
+ spte.spte_high = orig->spte_high;
+ smp_rmb();
+
+ if (unlikely(spte.spte_low != orig->spte_low ||
+ count != sp->clear_spte_count))
+ goto retry;
+
+ return spte.spte;
+}
+#endif
+
+static bool spte_can_locklessly_be_made_writable(u64 spte)
+{
+ return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
+ (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
+}
+
+static bool spte_has_volatile_bits(u64 spte)
+{
+ if (!is_shadow_present_pte(spte))
+ return false;
+
+ /*
+ * Always atomically update spte if it can be updated
+ * out of mmu-lock, it can ensure dirty bit is not lost,
+ * also, it can help us to get a stable is_writable_pte()
+ * to ensure tlb flush is not missed.
+ */
+ if (spte_can_locklessly_be_made_writable(spte) ||
+ is_access_track_spte(spte))
+ return true;
+
+ if (spte_ad_enabled(spte)) {
+ if ((spte & shadow_accessed_mask) == 0 ||
+ (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
+ return true;
+ }
+
+ return false;
+}
+
+static bool is_accessed_spte(u64 spte)
+{
+ u64 accessed_mask = spte_shadow_accessed_mask(spte);
+
+ return accessed_mask ? spte & accessed_mask
+ : !is_access_track_spte(spte);
+}
+
+static bool is_dirty_spte(u64 spte)
+{
+ u64 dirty_mask = spte_shadow_dirty_mask(spte);
+
+ return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
+}
+
+/* Rules for using mmu_spte_set:
+ * Set the sptep from nonpresent to present.
+ * Note: the sptep being assigned *must* be either not present
+ * or in a state where the hardware will not attempt to update
+ * the spte.
+ */
+static void mmu_spte_set(u64 *sptep, u64 new_spte)
+{
+ WARN_ON(is_shadow_present_pte(*sptep));
+ __set_spte(sptep, new_spte);
+}
+
+/*
+ * Update the SPTE (excluding the PFN), but do not track changes in its
+ * accessed/dirty status.
+ */
+static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
+{
+ u64 old_spte = *sptep;
+
+ WARN_ON(!is_shadow_present_pte(new_spte));
+
+ if (!is_shadow_present_pte(old_spte)) {
+ mmu_spte_set(sptep, new_spte);
+ return old_spte;
+ }
+
+ if (!spte_has_volatile_bits(old_spte))
+ __update_clear_spte_fast(sptep, new_spte);
+ else
+ old_spte = __update_clear_spte_slow(sptep, new_spte);
+
+ WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
+
+ return old_spte;
+}
+
+/* Rules for using mmu_spte_update:
+ * Update the state bits, it means the mapped pfn is not changed.
+ *
+ * Whenever we overwrite a writable spte with a read-only one we
+ * should flush remote TLBs. Otherwise rmap_write_protect
+ * will find a read-only spte, even though the writable spte
+ * might be cached on a CPU's TLB, the return value indicates this
+ * case.
+ *
+ * Returns true if the TLB needs to be flushed
+ */
+static bool mmu_spte_update(u64 *sptep, u64 new_spte)
+{
+ bool flush = false;
+ u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
+
+ if (!is_shadow_present_pte(old_spte))
+ return false;
+
+ /*
+ * For the spte updated out of mmu-lock is safe, since
+ * we always atomically update it, see the comments in
+ * spte_has_volatile_bits().
+ */
+ if (spte_can_locklessly_be_made_writable(old_spte) &&
+ !is_writable_pte(new_spte))
+ flush = true;
+
+ /*
+ * Flush TLB when accessed/dirty states are changed in the page tables,
+ * to guarantee consistency between TLB and page tables.
+ */
+
+ if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
+ flush = true;
+ kvm_set_pfn_accessed(spte_to_pfn(old_spte));
+ }
+
+ if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
+ flush = true;
+ kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+ }
+
+ return flush;
+}
+
+/*
+ * Rules for using mmu_spte_clear_track_bits:
+ * It sets the sptep from present to nonpresent, and track the
+ * state bits, it is used to clear the last level sptep.
+ * Returns non-zero if the PTE was previously valid.
+ */
+static int mmu_spte_clear_track_bits(u64 *sptep)
+{
+ kvm_pfn_t pfn;
+ u64 old_spte = *sptep;
+
+ if (!spte_has_volatile_bits(old_spte))
+ __update_clear_spte_fast(sptep, 0ull);
+ else
+ old_spte = __update_clear_spte_slow(sptep, 0ull);
+
+ if (!is_shadow_present_pte(old_spte))
+ return 0;
+
+ pfn = spte_to_pfn(old_spte);
+
+ /*
+ * KVM does not hold the refcount of the page used by
+ * kvm mmu, before reclaiming the page, we should
+ * unmap it from mmu first.
+ */
+ WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
+
+ if (is_accessed_spte(old_spte))
+ kvm_set_pfn_accessed(pfn);
+
+ if (is_dirty_spte(old_spte))
+ kvm_set_pfn_dirty(pfn);
+
+ return 1;
+}
+
+/*
+ * Rules for using mmu_spte_clear_no_track:
+ * Directly clear spte without caring the state bits of sptep,
+ * it is used to set the upper level spte.
+ */
+static void mmu_spte_clear_no_track(u64 *sptep)
+{
+ __update_clear_spte_fast(sptep, 0ull);
+}
+
+static u64 mmu_spte_get_lockless(u64 *sptep)
+{
+ return __get_spte_lockless(sptep);
+}
+
+static u64 mark_spte_for_access_track(u64 spte)
+{
+ if (spte_ad_enabled(spte))
+ return spte & ~shadow_accessed_mask;
+
+ if (is_access_track_spte(spte))
+ return spte;
+
+ /*
+ * Making an Access Tracking PTE will result in removal of write access
+ * from the PTE. So, verify that we will be able to restore the write
+ * access in the fast page fault path later on.
+ */
+ WARN_ONCE((spte & PT_WRITABLE_MASK) &&
+ !spte_can_locklessly_be_made_writable(spte),
+ "kvm: Writable SPTE is not locklessly dirty-trackable\n");
+
+ WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
+ shadow_acc_track_saved_bits_shift),
+ "kvm: Access Tracking saved bit locations are not zero\n");
+
+ spte |= (spte & shadow_acc_track_saved_bits_mask) <<
+ shadow_acc_track_saved_bits_shift;
+ spte &= ~shadow_acc_track_mask;
+
+ return spte;
+}
+
+/* Restore an acc-track PTE back to a regular PTE */
+static u64 restore_acc_track_spte(u64 spte)
+{
+ u64 new_spte = spte;
+ u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
+ & shadow_acc_track_saved_bits_mask;
+
+ WARN_ON_ONCE(spte_ad_enabled(spte));
+ WARN_ON_ONCE(!is_access_track_spte(spte));
+
+ new_spte &= ~shadow_acc_track_mask;
+ new_spte &= ~(shadow_acc_track_saved_bits_mask <<
+ shadow_acc_track_saved_bits_shift);
+ new_spte |= saved_bits;
+
+ return new_spte;
+}
+
+/* Returns the Accessed status of the PTE and resets it at the same time. */
+static bool mmu_spte_age(u64 *sptep)
+{
+ u64 spte = mmu_spte_get_lockless(sptep);
+
+ if (!is_accessed_spte(spte))
+ return false;
+
+ if (spte_ad_enabled(spte)) {
+ clear_bit((ffs(shadow_accessed_mask) - 1),
+ (unsigned long *)sptep);
+ } else {
+ /*
+ * Capture the dirty status of the page, so that it doesn't get
+ * lost when the SPTE is marked for access tracking.
+ */
+ if (is_writable_pte(spte))
+ kvm_set_pfn_dirty(spte_to_pfn(spte));
+
+ spte = mark_spte_for_access_track(spte);
+ mmu_spte_update_no_track(sptep, spte);
+ }
+
+ return true;
+}
+
+static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Prevent page table teardown by making any free-er wait during
+ * kvm_flush_remote_tlbs() IPI to all active vcpus.
+ */
+ local_irq_disable();
+
+ /*
+ * Make sure a following spte read is not reordered ahead of the write
+ * to vcpu->mode.
+ */
+ smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
+}
+
+static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Make sure the write to vcpu->mode is not reordered in front of
+ * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us
+ * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
+ */
+ smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
+ local_irq_enable();
+}
+
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+ struct kmem_cache *base_cache, int min)
+{
+ void *obj;
+
+ if (cache->nobjs >= min)
+ return 0;
+ while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+ obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+ cache->objects[cache->nobjs++] = obj;
+ }
+ return 0;
+}
+
+static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
+{
+ return cache->nobjs;
+}
+
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
+ struct kmem_cache *cache)
+{
+ while (mc->nobjs)
+ kmem_cache_free(cache, mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
+ int min)
+{
+ void *page;
+
+ if (cache->nobjs >= min)
+ return 0;
+ while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+ page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ if (!page)
+ return -ENOMEM;
+ cache->objects[cache->nobjs++] = page;
+ }
+ return 0;
+}
+
+static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
+{
+ while (mc->nobjs)
+ free_page((unsigned long)mc->objects[--mc->nobjs]);
+}
+
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+ pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
+ if (r)
+ goto out;
+ r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
+ if (r)
+ goto out;
+ r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
+ mmu_page_header_cache, 4);
+out:
+ return r;
+}
+
+static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{
+ mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+ pte_list_desc_cache);
+ mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
+ mmu_page_header_cache);
+}
+
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
+{
+ void *p;
+
+ BUG_ON(!mc->nobjs);
+ p = mc->objects[--mc->nobjs];
+ return p;
+}
+
+static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
+{
+ return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
+}
+
+static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
+{
+ kmem_cache_free(pte_list_desc_cache, pte_list_desc);
+}
+
+static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
+{
+ if (!sp->role.direct)
+ return sp->gfns[index];
+
+ return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
+}
+
+static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
+{
+ if (!sp->role.direct) {
+ sp->gfns[index] = gfn;
+ return;
+ }
+
+ if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
+ pr_err_ratelimited("gfn mismatch under direct page %llx "
+ "(expected %llx, got %llx)\n",
+ sp->gfn,
+ kvm_mmu_page_get_gfn(sp, index), gfn);
+}
+
+/*
+ * Return the pointer to the large page information for a given gfn,
+ * handling slots that are not large page aligned.
+ */
+static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
+ struct kvm_memory_slot *slot,
+ int level)
+{
+ unsigned long idx;
+
+ idx = gfn_to_index(gfn, slot->base_gfn, level);
+ return &slot->arch.lpage_info[level - 2][idx];
+}
+
+static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
+ gfn_t gfn, int count)
+{
+ struct kvm_lpage_info *linfo;
+ int i;
+
+ for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+ linfo = lpage_info_slot(gfn, slot, i);
+ linfo->disallow_lpage += count;
+ WARN_ON(linfo->disallow_lpage < 0);
+ }
+}
+
+void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ update_gfn_disallow_lpage_count(slot, gfn, 1);
+}
+
+void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ update_gfn_disallow_lpage_count(slot, gfn, -1);
+}
+
+static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ gfn_t gfn;
+
+ kvm->arch.indirect_shadow_pages++;
+ gfn = sp->gfn;
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+
+ /* the non-leaf shadow pages are keeping readonly. */
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return kvm_slot_page_track_add_page(kvm, slot, gfn,
+ KVM_PAGE_TRACK_WRITE);
+
+ kvm_mmu_gfn_disallow_lpage(slot, gfn);
+}
+
+static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ if (sp->lpage_disallowed)
+ return;
+
+ ++kvm->stat.nx_lpage_splits;
+ list_add_tail(&sp->lpage_disallowed_link,
+ &kvm->arch.lpage_disallowed_mmu_pages);
+ sp->lpage_disallowed = true;
+}
+
+static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ gfn_t gfn;
+
+ kvm->arch.indirect_shadow_pages--;
+ gfn = sp->gfn;
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return kvm_slot_page_track_remove_page(kvm, slot, gfn,
+ KVM_PAGE_TRACK_WRITE);
+
+ kvm_mmu_gfn_allow_lpage(slot, gfn);
+}
+
+static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ --kvm->stat.nx_lpage_splits;
+ sp->lpage_disallowed = false;
+ list_del(&sp->lpage_disallowed_link);
+}
+
+static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
+ struct kvm_memory_slot *slot)
+{
+ struct kvm_lpage_info *linfo;
+
+ if (slot) {
+ linfo = lpage_info_slot(gfn, slot, level);
+ return !!linfo->disallow_lpage;
+ }
+
+ return true;
+}
+
+static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
+ int level)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
+}
+
+static int host_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ unsigned long page_size;
+ int i, ret = 0;
+
+ page_size = kvm_host_page_size(vcpu, gfn);
+
+ for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+ if (page_size >= KVM_HPAGE_SIZE(i))
+ ret = i;
+ else
+ break;
+ }
+
+ return ret;
+}
+
+static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
+ bool no_dirty_log)
+{
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
+ return false;
+ if (no_dirty_log && slot->dirty_bitmap)
+ return false;
+
+ return true;
+}
+
+static struct kvm_memory_slot *
+gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
+ bool no_dirty_log)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ if (!memslot_valid_for_gpte(slot, no_dirty_log))
+ slot = NULL;
+
+ return slot;
+}
+
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
+ bool *force_pt_level)
+{
+ int host_level, level, max_level;
+ struct kvm_memory_slot *slot;
+
+ if (unlikely(*force_pt_level))
+ return PT_PAGE_TABLE_LEVEL;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
+ *force_pt_level = !memslot_valid_for_gpte(slot, true);
+ if (unlikely(*force_pt_level))
+ return PT_PAGE_TABLE_LEVEL;
+
+ host_level = host_mapping_level(vcpu, large_gfn);
+
+ if (host_level == PT_PAGE_TABLE_LEVEL)
+ return host_level;
+
+ max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
+
+ for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
+ if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
+ break;
+
+ return level - 1;
+}
+
+/*
+ * About rmap_head encoding:
+ *
+ * If the bit zero of rmap_head->val is clear, then it points to the only spte
+ * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
+ * pte_list_desc containing more mappings.
+ */
+
+/*
+ * Returns the number of pointers in the rmap chain, not counting the new one.
+ */
+static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
+ struct kvm_rmap_head *rmap_head)
+{
+ struct pte_list_desc *desc;
+ int i, count = 0;
+
+ if (!rmap_head->val) {
+ rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
+ rmap_head->val = (unsigned long)spte;
+ } else if (!(rmap_head->val & 1)) {
+ rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
+ desc = mmu_alloc_pte_list_desc(vcpu);
+ desc->sptes[0] = (u64 *)rmap_head->val;
+ desc->sptes[1] = spte;
+ rmap_head->val = (unsigned long)desc | 1;
+ ++count;
+ } else {
+ rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
+ desc = desc->more;
+ count += PTE_LIST_EXT;
+ }
+ if (desc->sptes[PTE_LIST_EXT-1]) {
+ desc->more = mmu_alloc_pte_list_desc(vcpu);
+ desc = desc->more;
+ }
+ for (i = 0; desc->sptes[i]; ++i)
+ ++count;
+ desc->sptes[i] = spte;
+ }
+ return count;
+}
+
+static void
+pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
+ struct pte_list_desc *desc, int i,
+ struct pte_list_desc *prev_desc)
+{
+ int j;
+
+ for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
+ ;
+ desc->sptes[i] = desc->sptes[j];
+ desc->sptes[j] = NULL;
+ if (j != 0)
+ return;
+ if (!prev_desc && !desc->more)
+ rmap_head->val = (unsigned long)desc->sptes[0];
+ else
+ if (prev_desc)
+ prev_desc->more = desc->more;
+ else
+ rmap_head->val = (unsigned long)desc->more | 1;
+ mmu_free_pte_list_desc(desc);
+}
+
+static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
+{
+ struct pte_list_desc *desc;
+ struct pte_list_desc *prev_desc;
+ int i;
+
+ if (!rmap_head->val) {
+ printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
+ BUG();
+ } else if (!(rmap_head->val & 1)) {
+ rmap_printk("pte_list_remove: %p 1->0\n", spte);
+ if ((u64 *)rmap_head->val != spte) {
+ printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
+ BUG();
+ }
+ rmap_head->val = 0;
+ } else {
+ rmap_printk("pte_list_remove: %p many->many\n", spte);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ prev_desc = NULL;
+ while (desc) {
+ for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
+ if (desc->sptes[i] == spte) {
+ pte_list_desc_remove_entry(rmap_head,
+ desc, i, prev_desc);
+ return;
+ }
+ }
+ prev_desc = desc;
+ desc = desc->more;
+ }
+ pr_err("pte_list_remove: %p many->many\n", spte);
+ BUG();
+ }
+}
+
+static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
+ struct kvm_memory_slot *slot)
+{
+ unsigned long idx;
+
+ idx = gfn_to_index(gfn, slot->base_gfn, level);
+ return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
+}
+
+static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
+ struct kvm_mmu_page *sp)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+ return __gfn_to_rmap(gfn, sp->role.level, slot);
+}
+
+static bool rmap_can_add(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_memory_cache *cache;
+
+ cache = &vcpu->arch.mmu_pte_list_desc_cache;
+ return mmu_memory_cache_free_objects(cache);
+}
+
+static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+{
+ struct kvm_mmu_page *sp;
+ struct kvm_rmap_head *rmap_head;
+
+ sp = page_header(__pa(spte));
+ kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
+ rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
+ return pte_list_add(vcpu, spte, rmap_head);
+}
+
+static void rmap_remove(struct kvm *kvm, u64 *spte)
+{
+ struct kvm_mmu_page *sp;
+ gfn_t gfn;
+ struct kvm_rmap_head *rmap_head;
+
+ sp = page_header(__pa(spte));
+ gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
+ rmap_head = gfn_to_rmap(kvm, gfn, sp);
+ pte_list_remove(spte, rmap_head);
+}
+
+/*
+ * Used by the following functions to iterate through the sptes linked by a
+ * rmap. All fields are private and not assumed to be used outside.
+ */
+struct rmap_iterator {
+ /* private fields */
+ struct pte_list_desc *desc; /* holds the sptep if not NULL */
+ int pos; /* index of the sptep */
+};
+
+/*
+ * Iteration must be started by this function. This should also be used after
+ * removing/dropping sptes from the rmap link because in such cases the
+ * information in the itererator may not be valid.
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
+ struct rmap_iterator *iter)
+{
+ u64 *sptep;
+
+ if (!rmap_head->val)
+ return NULL;
+
+ if (!(rmap_head->val & 1)) {
+ iter->desc = NULL;
+ sptep = (u64 *)rmap_head->val;
+ goto out;
+ }
+
+ iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ iter->pos = 0;
+ sptep = iter->desc->sptes[iter->pos];
+out:
+ BUG_ON(!is_shadow_present_pte(*sptep));
+ return sptep;
+}
+
+/*
+ * Must be used with a valid iterator: e.g. after rmap_get_first().
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *rmap_get_next(struct rmap_iterator *iter)
+{
+ u64 *sptep;
+
+ if (iter->desc) {
+ if (iter->pos < PTE_LIST_EXT - 1) {
+ ++iter->pos;
+ sptep = iter->desc->sptes[iter->pos];
+ if (sptep)
+ goto out;
+ }
+
+ iter->desc = iter->desc->more;
+
+ if (iter->desc) {
+ iter->pos = 0;
+ /* desc->sptes[0] cannot be NULL */
+ sptep = iter->desc->sptes[iter->pos];
+ goto out;
+ }
+ }
+
+ return NULL;
+out:
+ BUG_ON(!is_shadow_present_pte(*sptep));
+ return sptep;
+}
+
+#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
+ for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
+ _spte_; _spte_ = rmap_get_next(_iter_))
+
+static void drop_spte(struct kvm *kvm, u64 *sptep)
+{
+ if (mmu_spte_clear_track_bits(sptep))
+ rmap_remove(kvm, sptep);
+}
+
+
+static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
+{
+ if (is_large_pte(*sptep)) {
+ WARN_ON(page_header(__pa(sptep))->role.level ==
+ PT_PAGE_TABLE_LEVEL);
+ drop_spte(kvm, sptep);
+ --kvm->stat.lpages;
+ return true;
+ }
+
+ return false;
+}
+
+static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ if (__drop_large_spte(vcpu->kvm, sptep))
+ kvm_flush_remote_tlbs(vcpu->kvm);
+}
+
+/*
+ * Write-protect on the specified @sptep, @pt_protect indicates whether
+ * spte write-protection is caused by protecting shadow page table.
+ *
+ * Note: write protection is difference between dirty logging and spte
+ * protection:
+ * - for dirty logging, the spte can be set to writable at anytime if
+ * its dirty bitmap is properly set.
+ * - for spte protection, the spte can be writable only after unsync-ing
+ * shadow page.
+ *
+ * Return true if tlb need be flushed.
+ */
+static bool spte_write_protect(u64 *sptep, bool pt_protect)
+{
+ u64 spte = *sptep;
+
+ if (!is_writable_pte(spte) &&
+ !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
+ return false;
+
+ rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
+
+ if (pt_protect)
+ spte &= ~SPTE_MMU_WRITEABLE;
+ spte = spte & ~PT_WRITABLE_MASK;
+
+ return mmu_spte_update(sptep, spte);
+}
+
+static bool __rmap_write_protect(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ bool pt_protect)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool flush = false;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ flush |= spte_write_protect(sptep, pt_protect);
+
+ return flush;
+}
+
+static bool spte_clear_dirty(u64 *sptep)
+{
+ u64 spte = *sptep;
+
+ rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
+
+ spte &= ~shadow_dirty_mask;
+
+ return mmu_spte_update(sptep, spte);
+}
+
+static bool wrprot_ad_disabled_spte(u64 *sptep)
+{
+ bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
+ (unsigned long *)sptep);
+ if (was_writable)
+ kvm_set_pfn_dirty(spte_to_pfn(*sptep));
+
+ return was_writable;
+}
+
+/*
+ * Gets the GFN ready for another round of dirty logging by clearing the
+ * - D bit on ad-enabled SPTEs, and
+ * - W bit on ad-disabled SPTEs.
+ * Returns true iff any D or W bits were cleared.
+ */
+static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool flush = false;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ if (spte_ad_enabled(*sptep))
+ flush |= spte_clear_dirty(sptep);
+ else
+ flush |= wrprot_ad_disabled_spte(sptep);
+
+ return flush;
+}
+
+static bool spte_set_dirty(u64 *sptep)
+{
+ u64 spte = *sptep;
+
+ rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
+
+ spte |= shadow_dirty_mask;
+
+ return mmu_spte_update(sptep, spte);
+}
+
+static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool flush = false;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ if (spte_ad_enabled(*sptep))
+ flush |= spte_set_dirty(sptep);
+
+ return flush;
+}
+
+/**
+ * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
+ * @kvm: kvm instance
+ * @slot: slot to protect
+ * @gfn_offset: start of the BITS_PER_LONG pages we care about
+ * @mask: indicates which pages we should protect
+ *
+ * Used when we do not need to care about huge page mappings: e.g. during dirty
+ * logging we do not have any such mappings.
+ */
+static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ struct kvm_rmap_head *rmap_head;
+
+ while (mask) {
+ rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PT_PAGE_TABLE_LEVEL, slot);
+ __rmap_write_protect(kvm, rmap_head, false);
+
+ /* clear the first set bit */
+ mask &= mask - 1;
+ }
+}
+
+/**
+ * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
+ * protect the page if the D-bit isn't supported.
+ * @kvm: kvm instance
+ * @slot: slot to clear D-bit
+ * @gfn_offset: start of the BITS_PER_LONG pages we care about
+ * @mask: indicates which pages we should clear D-bit
+ *
+ * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
+ */
+void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ struct kvm_rmap_head *rmap_head;
+
+ while (mask) {
+ rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PT_PAGE_TABLE_LEVEL, slot);
+ __rmap_clear_dirty(kvm, rmap_head);
+
+ /* clear the first set bit */
+ mask &= mask - 1;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
+
+/**
+ * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
+ * PT level pages.
+ *
+ * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
+ * enable dirty logging for them.
+ *
+ * Used when we do not need to care about huge page mappings: e.g. during dirty
+ * logging we do not have any such mappings.
+ */
+void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ if (kvm_x86_ops->enable_log_dirty_pt_masked)
+ kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
+ mask);
+ else
+ kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+}
+
+/**
+ * kvm_arch_write_log_dirty - emulate dirty page logging
+ * @vcpu: Guest mode vcpu
+ *
+ * Emulate arch specific page modification logging for the
+ * nested hypervisor
+ */
+int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa)
+{
+ if (kvm_x86_ops->write_log_dirty)
+ return kvm_x86_ops->write_log_dirty(vcpu, l2_gpa);
+
+ return 0;
+}
+
+bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+ struct kvm_memory_slot *slot, u64 gfn)
+{
+ struct kvm_rmap_head *rmap_head;
+ int i;
+ bool write_protected = false;
+
+ for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+ rmap_head = __gfn_to_rmap(gfn, i, slot);
+ write_protected |= __rmap_write_protect(kvm, rmap_head, true);
+ }
+
+ return write_protected;
+}
+
+static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
+}
+
+static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool flush = false;
+
+ while ((sptep = rmap_get_first(rmap_head, &iter))) {
+ rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
+
+ drop_spte(kvm, sptep);
+ flush = true;
+ }
+
+ return flush;
+}
+
+static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ unsigned long data)
+{
+ return kvm_zap_rmapp(kvm, rmap_head);
+}
+
+static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ unsigned long data)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ int need_flush = 0;
+ u64 new_spte;
+ pte_t *ptep = (pte_t *)data;
+ kvm_pfn_t new_pfn;
+
+ WARN_ON(pte_huge(*ptep));
+ new_pfn = pte_pfn(*ptep);
+
+restart:
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
+ rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
+ sptep, *sptep, gfn, level);
+
+ need_flush = 1;
+
+ if (pte_write(*ptep)) {
+ drop_spte(kvm, sptep);
+ goto restart;
+ } else {
+ new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
+ new_spte |= (u64)new_pfn << PAGE_SHIFT;
+
+ new_spte &= ~PT_WRITABLE_MASK;
+ new_spte &= ~SPTE_HOST_WRITEABLE;
+
+ new_spte = mark_spte_for_access_track(new_spte);
+
+ mmu_spte_clear_track_bits(sptep);
+ mmu_spte_set(sptep, new_spte);
+ }
+ }
+
+ if (need_flush)
+ kvm_flush_remote_tlbs(kvm);
+
+ return 0;
+}
+
+struct slot_rmap_walk_iterator {
+ /* input fields. */
+ struct kvm_memory_slot *slot;
+ gfn_t start_gfn;
+ gfn_t end_gfn;
+ int start_level;
+ int end_level;
+
+ /* output fields. */
+ gfn_t gfn;
+ struct kvm_rmap_head *rmap;
+ int level;
+
+ /* private field. */
+ struct kvm_rmap_head *end_rmap;
+};
+
+static void
+rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
+{
+ iterator->level = level;
+ iterator->gfn = iterator->start_gfn;
+ iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
+ iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
+ iterator->slot);
+}
+
+static void
+slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
+ struct kvm_memory_slot *slot, int start_level,
+ int end_level, gfn_t start_gfn, gfn_t end_gfn)
+{
+ iterator->slot = slot;
+ iterator->start_level = start_level;
+ iterator->end_level = end_level;
+ iterator->start_gfn = start_gfn;
+ iterator->end_gfn = end_gfn;
+
+ rmap_walk_init_level(iterator, iterator->start_level);
+}
+
+static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
+{
+ return !!iterator->rmap;
+}
+
+static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
+{
+ if (++iterator->rmap <= iterator->end_rmap) {
+ iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
+ return;
+ }
+
+ if (++iterator->level > iterator->end_level) {
+ iterator->rmap = NULL;
+ return;
+ }
+
+ rmap_walk_init_level(iterator, iterator->level);
+}
+
+#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \
+ _start_gfn, _end_gfn, _iter_) \
+ for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \
+ _end_level_, _start_gfn, _end_gfn); \
+ slot_rmap_walk_okay(_iter_); \
+ slot_rmap_walk_next(_iter_))
+
+static int kvm_handle_hva_range(struct kvm *kvm,
+ unsigned long start,
+ unsigned long end,
+ unsigned long data,
+ int (*handler)(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn,
+ int level,
+ unsigned long data))
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ struct slot_rmap_walk_iterator iterator;
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(memslot, slots) {
+ unsigned long hva_start, hva_end;
+ gfn_t gfn_start, gfn_end;
+
+ hva_start = max(start, memslot->userspace_addr);
+ hva_end = min(end, memslot->userspace_addr +
+ (memslot->npages << PAGE_SHIFT));
+ if (hva_start >= hva_end)
+ continue;
+ /*
+ * {gfn(page) | page intersects with [hva_start, hva_end)} =
+ * {gfn_start, gfn_start+1, ..., gfn_end-1}.
+ */
+ gfn_start = hva_to_gfn_memslot(hva_start, memslot);
+ gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
+
+ for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
+ PT_MAX_HUGEPAGE_LEVEL,
+ gfn_start, gfn_end - 1,
+ &iterator)
+ ret |= handler(kvm, iterator.rmap, memslot,
+ iterator.gfn, iterator.level, data);
+ }
+ }
+
+ return ret;
+}
+
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+ unsigned long data,
+ int (*handler)(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn, int level,
+ unsigned long data))
+{
+ return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
+}
+
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
+ bool blockable)
+{
+ return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+ kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ unsigned long data)
+{
+ u64 *sptep;
+ struct rmap_iterator uninitialized_var(iter);
+ int young = 0;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ young |= mmu_spte_age(sptep);
+
+ trace_kvm_age_page(gfn, level, slot, young);
+ return young;
+}
+
+static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int level, unsigned long data)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ if (is_accessed_spte(*sptep))
+ return 1;
+ return 0;
+}
+
+#define RMAP_RECYCLE_THRESHOLD 1000
+
+static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+{
+ struct kvm_rmap_head *rmap_head;
+ struct kvm_mmu_page *sp;
+
+ sp = page_header(__pa(spte));
+
+ rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
+
+ kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+ return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
+}
+
+#ifdef MMU_DEBUG
+static int is_empty_shadow_page(u64 *spt)
+{
+ u64 *pos;
+ u64 *end;
+
+ for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
+ if (is_shadow_present_pte(*pos)) {
+ printk(KERN_ERR "%s: %p %llx\n", __func__,
+ pos, *pos);
+ return 0;
+ }
+ return 1;
+}
+#endif
+
+/*
+ * This value is the sum of all of the kvm instances's
+ * kvm->arch.n_used_mmu_pages values. We need a global,
+ * aggregate version in order to make the slab shrinker
+ * faster
+ */
+static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
+{
+ kvm->arch.n_used_mmu_pages += nr;
+ percpu_counter_add(&kvm_total_used_mmu_pages, nr);
+}
+
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
+{
+ MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
+ hlist_del(&sp->hash_link);
+ list_del(&sp->link);
+ free_page((unsigned long)sp->spt);
+ if (!sp->role.direct)
+ free_page((unsigned long)sp->gfns);
+ kmem_cache_free(mmu_page_header_cache, sp);
+}
+
+static unsigned kvm_page_table_hashfn(gfn_t gfn)
+{
+ return hash_64(gfn, KVM_MMU_HASH_SHIFT);
+}
+
+static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *parent_pte)
+{
+ if (!parent_pte)
+ return;
+
+ pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
+}
+
+static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
+ u64 *parent_pte)
+{
+ pte_list_remove(parent_pte, &sp->parent_ptes);
+}
+
+static void drop_parent_pte(struct kvm_mmu_page *sp,
+ u64 *parent_pte)
+{
+ mmu_page_remove_parent_pte(sp, parent_pte);
+ mmu_spte_clear_no_track(parent_pte);
+}
+
+static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
+ sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
+ if (!direct)
+ sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
+ set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+ /*
+ * The active_mmu_pages list is the FIFO list, do not move the
+ * page until it is zapped. kvm_zap_obsolete_pages depends on
+ * this feature. See the comments in kvm_zap_obsolete_pages().
+ */
+ list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+ kvm_mod_used_mmu_pages(vcpu->kvm, +1);
+ return sp;
+}
+
+static void mark_unsync(u64 *spte);
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+
+ for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
+ mark_unsync(sptep);
+ }
+}
+
+static void mark_unsync(u64 *spte)
+{
+ struct kvm_mmu_page *sp;
+ unsigned int index;
+
+ sp = page_header(__pa(spte));
+ index = spte - sp->spt;
+ if (__test_and_set_bit(index, sp->unsync_child_bitmap))
+ return;
+ if (sp->unsync_children++)
+ return;
+ kvm_mmu_mark_parents_unsync(sp);
+}
+
+static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ return 0;
+}
+
+static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
+{
+}
+
+static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ const void *pte)
+{
+ WARN_ON(1);
+}
+
+#define KVM_PAGE_ARRAY_NR 16
+
+struct kvm_mmu_pages {
+ struct mmu_page_and_offset {
+ struct kvm_mmu_page *sp;
+ unsigned int idx;
+ } page[KVM_PAGE_ARRAY_NR];
+ unsigned int nr;
+};
+
+static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
+ int idx)
+{
+ int i;
+
+ if (sp->unsync)
+ for (i=0; i < pvec->nr; i++)
+ if (pvec->page[i].sp == sp)
+ return 0;
+
+ pvec->page[pvec->nr].sp = sp;
+ pvec->page[pvec->nr].idx = idx;
+ pvec->nr++;
+ return (pvec->nr == KVM_PAGE_ARRAY_NR);
+}
+
+static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
+{
+ --sp->unsync_children;
+ WARN_ON((int)sp->unsync_children < 0);
+ __clear_bit(idx, sp->unsync_child_bitmap);
+}
+
+static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
+ struct kvm_mmu_pages *pvec)
+{
+ int i, ret, nr_unsync_leaf = 0;
+
+ for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
+ struct kvm_mmu_page *child;
+ u64 ent = sp->spt[i];
+
+ if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
+ clear_unsync_child_bit(sp, i);
+ continue;
+ }
+
+ child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+ if (child->unsync_children) {
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+
+ ret = __mmu_unsync_walk(child, pvec);
+ if (!ret) {
+ clear_unsync_child_bit(sp, i);
+ continue;
+ } else if (ret > 0) {
+ nr_unsync_leaf += ret;
+ } else
+ return ret;
+ } else if (child->unsync) {
+ nr_unsync_leaf++;
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+ } else
+ clear_unsync_child_bit(sp, i);
+ }
+
+ return nr_unsync_leaf;
+}
+
+#define INVALID_INDEX (-1)
+
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+ struct kvm_mmu_pages *pvec)
+{
+ pvec->nr = 0;
+ if (!sp->unsync_children)
+ return 0;
+
+ mmu_pages_add(pvec, sp, INVALID_INDEX);
+ return __mmu_unsync_walk(sp, pvec);
+}
+
+static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ WARN_ON(!sp->unsync);
+ trace_kvm_mmu_sync_page(sp);
+ sp->unsync = 0;
+ --kvm->stat.mmu_unsync;
+}
+
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list);
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+ struct list_head *invalid_list);
+
+/*
+ * NOTE: we should pay more attention on the zapped-obsolete page
+ * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk
+ * since it has been deleted from active_mmu_pages but still can be found
+ * at hast list.
+ *
+ * for_each_valid_sp() has skipped that kind of pages.
+ */
+#define for_each_valid_sp(_kvm, _sp, _gfn) \
+ hlist_for_each_entry(_sp, \
+ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
+ if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \
+ } else
+
+#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
+ for_each_valid_sp(_kvm, _sp, _gfn) \
+ if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
+
+/* @sp->gfn should be write-protected at the call site */
+static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ if (sp->role.cr4_pae != !!is_pae(vcpu)
+ || vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
+ return false;
+ }
+
+ return true;
+}
+
+static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
+ struct list_head *invalid_list,
+ bool remote_flush, bool local_flush)
+{
+ if (!list_empty(invalid_list)) {
+ kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list);
+ return;
+ }
+
+ if (remote_flush)
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ else if (local_flush)
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+}
+
+#ifdef CONFIG_KVM_MMU_AUDIT
+#include "mmu_audit.c"
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
+static void mmu_audit_disable(void) { }
+#endif
+
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
+static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
+ return __kvm_sync_page(vcpu, sp, invalid_list);
+}
+
+/* @gfn should be write-protected at the call site */
+static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *s;
+ bool ret = false;
+
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
+ if (!s->unsync)
+ continue;
+
+ WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+ ret |= kvm_sync_page(vcpu, s, invalid_list);
+ }
+
+ return ret;
+}
+
+struct mmu_page_path {
+ struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
+ unsigned int idx[PT64_ROOT_MAX_LEVEL];
+};
+
+#define for_each_sp(pvec, sp, parents, i) \
+ for (i = mmu_pages_first(&pvec, &parents); \
+ i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
+ i = mmu_pages_next(&pvec, &parents, i))
+
+static int mmu_pages_next(struct kvm_mmu_pages *pvec,
+ struct mmu_page_path *parents,
+ int i)
+{
+ int n;
+
+ for (n = i+1; n < pvec->nr; n++) {
+ struct kvm_mmu_page *sp = pvec->page[n].sp;
+ unsigned idx = pvec->page[n].idx;
+ int level = sp->role.level;
+
+ parents->idx[level-1] = idx;
+ if (level == PT_PAGE_TABLE_LEVEL)
+ break;
+
+ parents->parent[level-2] = sp;
+ }
+
+ return n;
+}
+
+static int mmu_pages_first(struct kvm_mmu_pages *pvec,
+ struct mmu_page_path *parents)
+{
+ struct kvm_mmu_page *sp;
+ int level;
+
+ if (pvec->nr == 0)
+ return 0;
+
+ WARN_ON(pvec->page[0].idx != INVALID_INDEX);
+
+ sp = pvec->page[0].sp;
+ level = sp->role.level;
+ WARN_ON(level == PT_PAGE_TABLE_LEVEL);
+
+ parents->parent[level-2] = sp;
+
+ /* Also set up a sentinel. Further entries in pvec are all
+ * children of sp, so this element is never overwritten.
+ */
+ parents->parent[level-1] = NULL;
+ return mmu_pages_next(pvec, parents, 0);
+}
+
+static void mmu_pages_clear_parents(struct mmu_page_path *parents)
+{
+ struct kvm_mmu_page *sp;
+ unsigned int level = 0;
+
+ do {
+ unsigned int idx = parents->idx[level];
+ sp = parents->parent[level];
+ if (!sp)
+ return;
+
+ WARN_ON(idx == INVALID_INDEX);
+ clear_unsync_child_bit(sp, idx);
+ level++;
+ } while (!sp->unsync_children);
+}
+
+static void mmu_sync_children(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *parent)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+ struct mmu_page_path parents;
+ struct kvm_mmu_pages pages;
+ LIST_HEAD(invalid_list);
+ bool flush = false;
+
+ while (mmu_unsync_walk(parent, &pages)) {
+ bool protected = false;
+
+ for_each_sp(pages, sp, parents, i)
+ protected |= rmap_write_protect(vcpu, sp->gfn);
+
+ if (protected) {
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ flush = false;
+ }
+
+ for_each_sp(pages, sp, parents, i) {
+ flush |= kvm_sync_page(vcpu, sp, &invalid_list);
+ mmu_pages_clear_parents(&parents);
+ }
+ if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
+ kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+ cond_resched_lock(&vcpu->kvm->mmu_lock);
+ flush = false;
+ }
+ }
+
+ kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+}
+
+static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
+{
+ atomic_set(&sp->write_flooding_count, 0);
+}
+
+static void clear_sp_write_flooding_count(u64 *spte)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(spte));
+
+ __clear_sp_write_flooding_count(sp);
+}
+
+static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
+ gfn_t gfn,
+ gva_t gaddr,
+ unsigned level,
+ int direct,
+ unsigned access)
+{
+ union kvm_mmu_page_role role;
+ unsigned quadrant;
+ struct kvm_mmu_page *sp;
+ bool need_sync = false;
+ bool flush = false;
+ int collisions = 0;
+ LIST_HEAD(invalid_list);
+
+ role = vcpu->arch.mmu.base_role;
+ role.level = level;
+ role.direct = direct;
+ if (role.direct)
+ role.cr4_pae = 0;
+ role.access = access;
+ if (!vcpu->arch.mmu.direct_map
+ && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+ quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
+ quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
+ role.quadrant = quadrant;
+ }
+ for_each_valid_sp(vcpu->kvm, sp, gfn) {
+ if (sp->gfn != gfn) {
+ collisions++;
+ continue;
+ }
+
+ if (!need_sync && sp->unsync)
+ need_sync = true;
+
+ if (sp->role.word != role.word)
+ continue;
+
+ if (sp->unsync) {
+ /* The page is good, but __kvm_sync_page might still end
+ * up zapping it. If so, break in order to rebuild it.
+ */
+ if (!__kvm_sync_page(vcpu, sp, &invalid_list))
+ break;
+
+ WARN_ON(!list_empty(&invalid_list));
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ }
+
+ if (sp->unsync_children)
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+
+ __clear_sp_write_flooding_count(sp);
+ trace_kvm_mmu_get_page(sp, false);
+ goto out;
+ }
+
+ ++vcpu->kvm->stat.mmu_cache_miss;
+
+ sp = kvm_mmu_alloc_page(vcpu, direct);
+
+ sp->gfn = gfn;
+ sp->role = role;
+ hlist_add_head(&sp->hash_link,
+ &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
+ if (!direct) {
+ /*
+ * we should do write protection before syncing pages
+ * otherwise the content of the synced shadow page may
+ * be inconsistent with guest page table.
+ */
+ account_shadowed(vcpu->kvm, sp);
+ if (level == PT_PAGE_TABLE_LEVEL &&
+ rmap_write_protect(vcpu, gfn))
+ kvm_flush_remote_tlbs(vcpu->kvm);
+
+ if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+ flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
+ }
+ sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
+ clear_page(sp->spt);
+ trace_kvm_mmu_get_page(sp, true);
+
+ kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+out:
+ if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
+ vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
+ return sp;
+}
+
+static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
+ struct kvm_vcpu *vcpu, hpa_t root,
+ u64 addr)
+{
+ iterator->addr = addr;
+ iterator->shadow_addr = root;
+ iterator->level = vcpu->arch.mmu.shadow_root_level;
+
+ if (iterator->level == PT64_ROOT_4LEVEL &&
+ vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL &&
+ !vcpu->arch.mmu.direct_map)
+ --iterator->level;
+
+ if (iterator->level == PT32E_ROOT_LEVEL) {
+ /*
+ * prev_root is currently only used for 64-bit hosts. So only
+ * the active root_hpa is valid here.
+ */
+ BUG_ON(root != vcpu->arch.mmu.root_hpa);
+
+ iterator->shadow_addr
+ = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+ iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
+ --iterator->level;
+ if (!iterator->shadow_addr)
+ iterator->level = 0;
+ }
+}
+
+static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
+ struct kvm_vcpu *vcpu, u64 addr)
+{
+ shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu.root_hpa,
+ addr);
+}
+
+static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
+{
+ if (iterator->level < PT_PAGE_TABLE_LEVEL)
+ return false;
+
+ iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
+ iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
+ return true;
+}
+
+static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
+ u64 spte)
+{
+ if (is_last_spte(spte, iterator->level)) {
+ iterator->level = 0;
+ return;
+ }
+
+ iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
+ --iterator->level;
+}
+
+static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+{
+ __shadow_walk_next(iterator, *iterator->sptep);
+}
+
+static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
+ struct kvm_mmu_page *sp)
+{
+ u64 spte;
+
+ BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
+
+ spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
+ shadow_user_mask | shadow_x_mask | shadow_me_mask;
+
+ if (sp_ad_disabled(sp))
+ spte |= shadow_acc_track_value;
+ else
+ spte |= shadow_accessed_mask;
+
+ mmu_spte_set(sptep, spte);
+
+ mmu_page_add_parent_pte(vcpu, sp, sptep);
+
+ if (sp->unsync_children || sp->unsync)
+ mark_unsync(sptep);
+}
+
+static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+ unsigned direct_access)
+{
+ if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
+ struct kvm_mmu_page *child;
+
+ /*
+ * For the direct sp, if the guest pte's dirty bit
+ * changed form clean to dirty, it will corrupt the
+ * sp's access: allow writable in the read-only sp,
+ * so we should update the spte at this point to get
+ * a new sp with the correct access.
+ */
+ child = page_header(*sptep & PT64_BASE_ADDR_MASK);
+ if (child->role.access == direct_access)
+ return;
+
+ drop_parent_pte(child, sptep);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ }
+}
+
+static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ u64 *spte)
+{
+ u64 pte;
+ struct kvm_mmu_page *child;
+
+ pte = *spte;
+ if (is_shadow_present_pte(pte)) {
+ if (is_last_spte(pte, sp->role.level)) {
+ drop_spte(kvm, spte);
+ if (is_large_pte(pte))
+ --kvm->stat.lpages;
+ } else {
+ child = page_header(pte & PT64_BASE_ADDR_MASK);
+ drop_parent_pte(child, spte);
+ }
+ return true;
+ }
+
+ if (is_mmio_spte(pte))
+ mmu_spte_clear_no_track(spte);
+
+ return false;
+}
+
+static void kvm_mmu_page_unlink_children(struct kvm *kvm,
+ struct kvm_mmu_page *sp)
+{
+ unsigned i;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+ mmu_page_zap_pte(kvm, sp, sp->spt + i);
+}
+
+static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+
+ while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
+ drop_parent_pte(sp, sptep);
+}
+
+static int mmu_zap_unsync_children(struct kvm *kvm,
+ struct kvm_mmu_page *parent,
+ struct list_head *invalid_list)
+{
+ int i, zapped = 0;
+ struct mmu_page_path parents;
+ struct kvm_mmu_pages pages;
+
+ if (parent->role.level == PT_PAGE_TABLE_LEVEL)
+ return 0;
+
+ while (mmu_unsync_walk(parent, &pages)) {
+ struct kvm_mmu_page *sp;
+
+ for_each_sp(pages, sp, parents, i) {
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ mmu_pages_clear_parents(&parents);
+ zapped++;
+ }
+ }
+
+ return zapped;
+}
+
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ int ret;
+
+ trace_kvm_mmu_prepare_zap_page(sp);
+ ++kvm->stat.mmu_shadow_zapped;
+ ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
+ kvm_mmu_page_unlink_children(kvm, sp);
+ kvm_mmu_unlink_parents(kvm, sp);
+
+ if (!sp->role.invalid && !sp->role.direct)
+ unaccount_shadowed(kvm, sp);
+
+ if (sp->unsync)
+ kvm_unlink_unsync_page(kvm, sp);
+ if (!sp->root_count) {
+ /* Count self */
+ ret++;
+ list_move(&sp->link, invalid_list);
+ kvm_mod_used_mmu_pages(kvm, -1);
+ } else {
+ list_move(&sp->link, &kvm->arch.active_mmu_pages);
+
+ /*
+ * The obsolete pages can not be used on any vcpus.
+ * See the comments in kvm_mmu_invalidate_zap_all_pages().
+ */
+ if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
+ kvm_reload_remote_mmus(kvm);
+ }
+
+ if (sp->lpage_disallowed)
+ unaccount_huge_nx_page(kvm, sp);
+
+ sp->role.invalid = 1;
+ return ret;
+}
+
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp, *nsp;
+
+ if (list_empty(invalid_list))
+ return;
+
+ /*
+ * We need to make sure everyone sees our modifications to
+ * the page tables and see changes to vcpu->mode here. The barrier
+ * in the kvm_flush_remote_tlbs() achieves this. This pairs
+ * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
+ *
+ * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
+ * guest mode and/or lockless shadow page table walks.
+ */
+ kvm_flush_remote_tlbs(kvm);
+
+ list_for_each_entry_safe(sp, nsp, invalid_list, link) {
+ WARN_ON(!sp->role.invalid || sp->root_count);
+ kvm_mmu_free_page(sp);
+ }
+}
+
+static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp;
+
+ if (list_empty(&kvm->arch.active_mmu_pages))
+ return false;
+
+ sp = list_last_entry(&kvm->arch.active_mmu_pages,
+ struct kvm_mmu_page, link);
+ return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+}
+
+/*
+ * Changing the number of mmu pages allocated to the vm
+ * Note: if goal_nr_mmu_pages is too small, you will get dead lock
+ */
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
+{
+ LIST_HEAD(invalid_list);
+
+ spin_lock(&kvm->mmu_lock);
+
+ if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
+ /* Need to free some mmu pages to achieve the goal. */
+ while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
+ if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
+ break;
+
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
+ }
+
+ kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
+
+ spin_unlock(&kvm->mmu_lock);
+}
+
+int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_mmu_page *sp;
+ LIST_HEAD(invalid_list);
+ int r;
+
+ pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
+ r = 0;
+ spin_lock(&kvm->mmu_lock);
+ for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
+ pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
+ sp->role.word);
+ r = 1;
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ }
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ spin_unlock(&kvm->mmu_lock);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
+
+static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ trace_kvm_mmu_unsync_page(sp);
+ ++vcpu->kvm->stat.mmu_unsync;
+ sp->unsync = 1;
+
+ kvm_mmu_mark_parents_unsync(sp);
+}
+
+static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+ bool can_unsync)
+{
+ struct kvm_mmu_page *sp;
+
+ if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+ return true;
+
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
+ if (!can_unsync)
+ return true;
+
+ if (sp->unsync)
+ continue;
+
+ WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
+ kvm_unsync_page(vcpu, sp);
+ }
+
+ /*
+ * We need to ensure that the marking of unsync pages is visible
+ * before the SPTE is updated to allow writes because
+ * kvm_mmu_sync_roots() checks the unsync flags without holding
+ * the MMU lock and so can race with this. If the SPTE was updated
+ * before the page had been marked as unsync-ed, something like the
+ * following could happen:
+ *
+ * CPU 1 CPU 2
+ * ---------------------------------------------------------------------
+ * 1.2 Host updates SPTE
+ * to be writable
+ * 2.1 Guest writes a GPTE for GVA X.
+ * (GPTE being in the guest page table shadowed
+ * by the SP from CPU 1.)
+ * This reads SPTE during the page table walk.
+ * Since SPTE.W is read as 1, there is no
+ * fault.
+ *
+ * 2.2 Guest issues TLB flush.
+ * That causes a VM Exit.
+ *
+ * 2.3 kvm_mmu_sync_pages() reads sp->unsync.
+ * Since it is false, so it just returns.
+ *
+ * 2.4 Guest accesses GVA X.
+ * Since the mapping in the SP was not updated,
+ * so the old mapping for GVA X incorrectly
+ * gets used.
+ * 1.1 Host marks SP
+ * as unsync
+ * (sp->unsync = true)
+ *
+ * The write barrier below ensures that 1.1 happens before 1.2 and thus
+ * the situation in 2.4 does not arise. The implicit barrier in 2.2
+ * pairs with this write barrier.
+ */
+ smp_wmb();
+
+ return false;
+}
+
+static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
+{
+ if (pfn_valid(pfn))
+ return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
+ /*
+ * Some reserved pages, such as those from NVDIMM
+ * DAX devices, are not for MMIO, and can be mapped
+ * with cached memory type for better performance.
+ * However, the above check misconceives those pages
+ * as MMIO, and results in KVM mapping them with UC
+ * memory type, which would hurt the performance.
+ * Therefore, we check the host memory type in addition
+ * and only treat UC/UC-/WC pages as MMIO.
+ */
+ (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
+
+ return true;
+}
+
+/* Bits which may be returned by set_spte() */
+#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
+#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
+
+static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+ unsigned pte_access, int level,
+ gfn_t gfn, kvm_pfn_t pfn, bool speculative,
+ bool can_unsync, bool host_writable)
+{
+ u64 spte = 0;
+ int ret = 0;
+ struct kvm_mmu_page *sp;
+
+ if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
+ return 0;
+
+ sp = page_header(__pa(sptep));
+ if (sp_ad_disabled(sp))
+ spte |= shadow_acc_track_value;
+
+ /*
+ * For the EPT case, shadow_present_mask is 0 if hardware
+ * supports exec-only page table entries. In that case,
+ * ACC_USER_MASK and shadow_user_mask are used to represent
+ * read access. See FNAME(gpte_access) in paging_tmpl.h.
+ */
+ spte |= shadow_present_mask;
+ if (!speculative)
+ spte |= spte_shadow_accessed_mask(spte);
+
+ if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
+ is_nx_huge_page_enabled()) {
+ pte_access &= ~ACC_EXEC_MASK;
+ }
+
+ if (pte_access & ACC_EXEC_MASK)
+ spte |= shadow_x_mask;
+ else
+ spte |= shadow_nx_mask;
+
+ if (pte_access & ACC_USER_MASK)
+ spte |= shadow_user_mask;
+
+ if (level > PT_PAGE_TABLE_LEVEL)
+ spte |= PT_PAGE_SIZE_MASK;
+ if (tdp_enabled)
+ spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
+ kvm_is_mmio_pfn(pfn));
+
+ if (host_writable)
+ spte |= SPTE_HOST_WRITEABLE;
+ else
+ pte_access &= ~ACC_WRITE_MASK;
+
+ if (!kvm_is_mmio_pfn(pfn))
+ spte |= shadow_me_mask;
+
+ spte |= (u64)pfn << PAGE_SHIFT;
+
+ if (pte_access & ACC_WRITE_MASK) {
+
+ /*
+ * Other vcpu creates new sp in the window between
+ * mapping_level() and acquiring mmu-lock. We can
+ * allow guest to retry the access, the mapping can
+ * be fixed if guest refault.
+ */
+ if (level > PT_PAGE_TABLE_LEVEL &&
+ mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
+ goto done;
+
+ spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
+
+ /*
+ * Optimization: for pte sync, if spte was writable the hash
+ * lookup is unnecessary (and expensive). Write protection
+ * is responsibility of mmu_get_page / kvm_sync_page.
+ * Same reasoning can be applied to dirty page accounting.
+ */
+ if (!can_unsync && is_writable_pte(*sptep))
+ goto set_pte;
+
+ if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
+ pgprintk("%s: found shadow page for %llx, marking ro\n",
+ __func__, gfn);
+ ret |= SET_SPTE_WRITE_PROTECTED_PT;
+ pte_access &= ~ACC_WRITE_MASK;
+ spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
+ }
+ }
+
+ if (pte_access & ACC_WRITE_MASK) {
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ spte |= spte_shadow_dirty_mask(spte);
+ }
+
+ if (speculative)
+ spte = mark_spte_for_access_track(spte);
+
+set_pte:
+ if (mmu_spte_update(sptep, spte))
+ ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
+done:
+ return ret;
+}
+
+static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
+ int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
+ bool speculative, bool host_writable)
+{
+ int was_rmapped = 0;
+ int rmap_count;
+ int set_spte_ret;
+ int ret = RET_PF_RETRY;
+ bool flush = false;
+
+ pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
+ *sptep, write_fault, gfn);
+
+ if (is_shadow_present_pte(*sptep)) {
+ /*
+ * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+ * the parent of the now unreachable PTE.
+ */
+ if (level > PT_PAGE_TABLE_LEVEL &&
+ !is_large_pte(*sptep)) {
+ struct kvm_mmu_page *child;
+ u64 pte = *sptep;
+
+ child = page_header(pte & PT64_BASE_ADDR_MASK);
+ drop_parent_pte(child, sptep);
+ flush = true;
+ } else if (pfn != spte_to_pfn(*sptep)) {
+ pgprintk("hfn old %llx new %llx\n",
+ spte_to_pfn(*sptep), pfn);
+ drop_spte(vcpu->kvm, sptep);
+ flush = true;
+ } else
+ was_rmapped = 1;
+ }
+
+ set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
+ speculative, true, host_writable);
+ if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
+ if (write_fault)
+ ret = RET_PF_EMULATE;
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ }
+ if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
+ kvm_flush_remote_tlbs(vcpu->kvm);
+
+ if (unlikely(is_mmio_spte(*sptep)))
+ ret = RET_PF_EMULATE;
+
+ pgprintk("%s: setting spte %llx\n", __func__, *sptep);
+ trace_kvm_mmu_set_spte(level, gfn, sptep);
+ if (!was_rmapped && is_large_pte(*sptep))
+ ++vcpu->kvm->stat.lpages;
+
+ if (is_shadow_present_pte(*sptep)) {
+ if (!was_rmapped) {
+ rmap_count = rmap_add(vcpu, sptep, gfn);
+ if (rmap_count > RMAP_RECYCLE_THRESHOLD)
+ rmap_recycle(vcpu, sptep, gfn);
+ }
+ }
+
+ return ret;
+}
+
+static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
+ bool no_dirty_log)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
+ if (!slot)
+ return KVM_PFN_ERR_FAULT;
+
+ return gfn_to_pfn_memslot_atomic(slot, gfn);
+}
+
+static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp,
+ u64 *start, u64 *end)
+{
+ struct page *pages[PTE_PREFETCH_NUM];
+ struct kvm_memory_slot *slot;
+ unsigned access = sp->role.access;
+ int i, ret;
+ gfn_t gfn;
+
+ gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
+ slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
+ if (!slot)
+ return -1;
+
+ ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
+ if (ret <= 0)
+ return -1;
+
+ for (i = 0; i < ret; i++, gfn++, start++) {
+ mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
+ page_to_pfn(pages[i]), true, true);
+ put_page(pages[i]);
+ }
+
+ return 0;
+}
+
+static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *sptep)
+{
+ u64 *spte, *start = NULL;
+ int i;
+
+ WARN_ON(!sp->role.direct);
+
+ i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+ spte = sp->spt + i;
+
+ for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+ if (is_shadow_present_pte(*spte) || spte == sptep) {
+ if (!start)
+ continue;
+ if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
+ break;
+ start = NULL;
+ } else if (!start)
+ start = spte;
+ }
+}
+
+static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = page_header(__pa(sptep));
+
+ /*
+ * Without accessed bits, there's no way to distinguish between
+ * actually accessed translations and prefetched, so disable pte
+ * prefetch if accessed bits aren't available.
+ */
+ if (sp_ad_disabled(sp))
+ return;
+
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return;
+
+ __direct_pte_prefetch(vcpu, sp, sptep);
+}
+
+static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
+ gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
+{
+ int level = *levelp;
+ u64 spte = *it.sptep;
+
+ if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
+ is_nx_huge_page_enabled() &&
+ is_shadow_present_pte(spte) &&
+ !is_large_pte(spte)) {
+ /*
+ * A small SPTE exists for this pfn, but FNAME(fetch)
+ * and __direct_map would like to create a large PTE
+ * instead: just force them to go down another level,
+ * patching back for them into pfn the next 9 bits of
+ * the address.
+ */
+ u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
+ *pfnp |= gfn & page_mask;
+ (*levelp)--;
+ }
+}
+
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
+ int map_writable, int level, kvm_pfn_t pfn,
+ bool prefault, bool lpage_disallowed)
+{
+ struct kvm_shadow_walk_iterator it;
+ struct kvm_mmu_page *sp;
+ int ret;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ gfn_t base_gfn = gfn;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return RET_PF_RETRY;
+
+ trace_kvm_mmu_spte_requested(gpa, level, pfn);
+ for_each_shadow_entry(vcpu, gpa, it) {
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ disallowed_hugepage_adjust(it, gfn, &pfn, &level);
+
+ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ if (it.level == level)
+ break;
+
+ drop_large_spte(vcpu, it.sptep);
+ if (!is_shadow_present_pte(*it.sptep)) {
+ sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
+ it.level - 1, true, ACC_ALL);
+
+ link_shadow_page(vcpu, it.sptep, sp);
+ if (lpage_disallowed)
+ account_huge_nx_page(vcpu->kvm, sp);
+ }
+ }
+
+ ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
+ write, level, base_gfn, pfn, prefault,
+ map_writable);
+ direct_pte_prefetch(vcpu, it.sptep);
+ ++vcpu->stat.pf_fixed;
+ return ret;
+}
+
+static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
+{
+ siginfo_t info;
+
+ clear_siginfo(&info);
+ info.si_signo = SIGBUS;
+ info.si_errno = 0;
+ info.si_code = BUS_MCEERR_AR;
+ info.si_addr = (void __user *)address;
+ info.si_addr_lsb = PAGE_SHIFT;
+
+ send_sig_info(SIGBUS, &info, tsk);
+}
+
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+{
+ /*
+ * Do not cache the mmio info caused by writing the readonly gfn
+ * into the spte otherwise read access on readonly gfn also can
+ * caused mmio page fault and treat it as mmio access.
+ */
+ if (pfn == KVM_PFN_ERR_RO_FAULT)
+ return RET_PF_EMULATE;
+
+ if (pfn == KVM_PFN_ERR_HWPOISON) {
+ kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
+ return RET_PF_RETRY;
+ }
+
+ return -EFAULT;
+}
+
+static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
+ gfn_t gfn, kvm_pfn_t *pfnp,
+ int *levelp)
+{
+ kvm_pfn_t pfn = *pfnp;
+ int level = *levelp;
+
+ /*
+ * Check if it's a transparent hugepage. If this would be an
+ * hugetlbfs page, level wouldn't be set to
+ * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
+ * here.
+ */
+ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
+ !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
+ PageTransCompoundMap(pfn_to_page(pfn)) &&
+ !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
+ unsigned long mask;
+ /*
+ * mmu_notifier_retry was successful and we hold the
+ * mmu_lock here, so the pmd can't become splitting
+ * from under us, and in turn
+ * __split_huge_page_refcount() can't run from under
+ * us and we can safely transfer the refcount from
+ * PG_tail to PG_head as we switch the pfn to tail to
+ * head.
+ */
+ *levelp = level = PT_DIRECTORY_LEVEL;
+ mask = KVM_PAGES_PER_HPAGE(level) - 1;
+ VM_BUG_ON((gfn & mask) != (pfn & mask));
+ if (pfn & mask) {
+ kvm_release_pfn_clean(pfn);
+ pfn &= ~mask;
+ kvm_get_pfn(pfn);
+ *pfnp = pfn;
+ }
+ }
+}
+
+static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+ kvm_pfn_t pfn, unsigned access, int *ret_val)
+{
+ /* The pfn is invalid, report the error! */
+ if (unlikely(is_error_pfn(pfn))) {
+ *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
+ return true;
+ }
+
+ if (unlikely(is_noslot_pfn(pfn)))
+ vcpu_cache_mmio_info(vcpu, gva, gfn, access);
+
+ return false;
+}
+
+static bool page_fault_can_be_fast(u32 error_code)
+{
+ /*
+ * Do not fix the mmio spte with invalid generation number which
+ * need to be updated by slow page fault path.
+ */
+ if (unlikely(error_code & PFERR_RSVD_MASK))
+ return false;
+
+ /* See if the page fault is due to an NX violation */
+ if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
+ == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
+ return false;
+
+ /*
+ * #PF can be fast if:
+ * 1. The shadow page table entry is not present, which could mean that
+ * the fault is potentially caused by access tracking (if enabled).
+ * 2. The shadow page table entry is present and the fault
+ * is caused by write-protect, that means we just need change the W
+ * bit of the spte which can be done out of mmu-lock.
+ *
+ * However, if access tracking is disabled we know that a non-present
+ * page must be a genuine page fault where we have to create a new SPTE.
+ * So, if access tracking is disabled, we return true only for write
+ * accesses to a present page.
+ */
+
+ return shadow_acc_track_mask != 0 ||
+ ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
+ == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
+}
+
+/*
+ * Returns true if the SPTE was fixed successfully. Otherwise,
+ * someone else modified the SPTE from its original value.
+ */
+static bool
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ u64 *sptep, u64 old_spte, u64 new_spte)
+{
+ gfn_t gfn;
+
+ WARN_ON(!sp->role.direct);
+
+ /*
+ * Theoretically we could also set dirty bit (and flush TLB) here in
+ * order to eliminate unnecessary PML logging. See comments in
+ * set_spte. But fast_page_fault is very unlikely to happen with PML
+ * enabled, so we do not do this. This might result in the same GPA
+ * to be logged in PML buffer again when the write really happens, and
+ * eventually to be called by mark_page_dirty twice. But it's also no
+ * harm. This also avoids the TLB flush needed after setting dirty bit
+ * so non-PML cases won't be impacted.
+ *
+ * Compare with set_spte where instead shadow_dirty_mask is set.
+ */
+ if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
+ return false;
+
+ if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
+ /*
+ * The gfn of direct spte is stable since it is
+ * calculated by sp->gfn.
+ */
+ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ }
+
+ return true;
+}
+
+static bool is_access_allowed(u32 fault_err_code, u64 spte)
+{
+ if (fault_err_code & PFERR_FETCH_MASK)
+ return is_executable_pte(spte);
+
+ if (fault_err_code & PFERR_WRITE_MASK)
+ return is_writable_pte(spte);
+
+ /* Fault was on Read access */
+ return spte & PT_PRESENT_MASK;
+}
+
+/*
+ * Return value:
+ * - true: let the vcpu to access on the same address again.
+ * - false: let the real page fault path to fix it.
+ */
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level,
+ u32 error_code)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ struct kvm_mmu_page *sp;
+ bool fault_handled = false;
+ u64 spte = 0ull;
+ uint retry_count = 0;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return false;
+
+ if (!page_fault_can_be_fast(error_code))
+ return false;
+
+ walk_shadow_page_lockless_begin(vcpu);
+
+ do {
+ u64 new_spte;
+
+ for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
+ if (!is_shadow_present_pte(spte) ||
+ iterator.level < level)
+ break;
+
+ sp = page_header(__pa(iterator.sptep));
+ if (!is_last_spte(spte, sp->role.level))
+ break;
+
+ /*
+ * Check whether the memory access that caused the fault would
+ * still cause it if it were to be performed right now. If not,
+ * then this is a spurious fault caused by TLB lazily flushed,
+ * or some other CPU has already fixed the PTE after the
+ * current CPU took the fault.
+ *
+ * Need not check the access of upper level table entries since
+ * they are always ACC_ALL.
+ */
+ if (is_access_allowed(error_code, spte)) {
+ fault_handled = true;
+ break;
+ }
+
+ new_spte = spte;
+
+ if (is_access_track_spte(spte))
+ new_spte = restore_acc_track_spte(new_spte);
+
+ /*
+ * Currently, to simplify the code, write-protection can
+ * be removed in the fast path only if the SPTE was
+ * write-protected for dirty-logging or access tracking.
+ */
+ if ((error_code & PFERR_WRITE_MASK) &&
+ spte_can_locklessly_be_made_writable(spte))
+ {
+ new_spte |= PT_WRITABLE_MASK;
+
+ /*
+ * Do not fix write-permission on the large spte. Since
+ * we only dirty the first page into the dirty-bitmap in
+ * fast_pf_fix_direct_spte(), other pages are missed
+ * if its slot has dirty logging enabled.
+ *
+ * Instead, we let the slow page fault path create a
+ * normal spte to fix the access.
+ *
+ * See the comments in kvm_arch_commit_memory_region().
+ */
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ break;
+ }
+
+ /* Verify that the fault can be handled in the fast path */
+ if (new_spte == spte ||
+ !is_access_allowed(error_code, new_spte))
+ break;
+
+ /*
+ * Currently, fast page fault only works for direct mapping
+ * since the gfn is not stable for indirect shadow page. See
+ * Documentation/virtual/kvm/locking.txt to get more detail.
+ */
+ fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
+ iterator.sptep, spte,
+ new_spte);
+ if (fault_handled)
+ break;
+
+ if (++retry_count > 4) {
+ printk_once(KERN_WARNING
+ "kvm: Fast #PF retrying more than 4 times.\n");
+ break;
+ }
+
+ } while (true);
+
+ trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
+ spte, fault_handled);
+ walk_shadow_page_lockless_end(vcpu);
+
+ return fault_handled;
+}
+
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
+ gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
+ bool *writable);
+static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
+
+static int nonpaging_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
+ gfn_t gfn, bool prefault)
+{
+ int r;
+ int level;
+ bool force_pt_level;
+ kvm_pfn_t pfn;
+ unsigned long mmu_seq;
+ bool map_writable, write = error_code & PFERR_WRITE_MASK;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
+
+ force_pt_level = lpage_disallowed;
+ level = mapping_level(vcpu, gfn, &force_pt_level);
+ if (likely(!force_pt_level)) {
+ /*
+ * This path builds a PAE pagetable - so we can map
+ * 2mb pages at maximum. Therefore check if the level
+ * is larger than that.
+ */
+ if (level > PT_DIRECTORY_LEVEL)
+ level = PT_DIRECTORY_LEVEL;
+
+ gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ }
+
+ if (fast_page_fault(vcpu, gpa, level, error_code))
+ return RET_PF_RETRY;
+
+ mmu_seq = vcpu->kvm->mmu_notifier_seq;
+ smp_rmb();
+
+ if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+ return RET_PF_RETRY;
+
+ if (handle_abnormal_pfn(vcpu, gpa, gfn, pfn, ACC_ALL, &r))
+ return r;
+
+ r = RET_PF_RETRY;
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+ goto out_unlock;
+ if (make_mmu_pages_available(vcpu) < 0)
+ goto out_unlock;
+ if (likely(!force_pt_level))
+ transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+ prefault, false);
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return r;
+}
+
+static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(*root_hpa))
+ return;
+
+ sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
+ --sp->root_count;
+ if (!sp->root_count && sp->role.invalid)
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+
+ *root_hpa = INVALID_PAGE;
+}
+
+/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
+void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free)
+{
+ int i;
+ LIST_HEAD(invalid_list);
+ struct kvm_mmu *mmu = &vcpu->arch.mmu;
+ bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
+
+ BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
+
+ /* Before acquiring the MMU lock, see if we need to do any real work. */
+ if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
+ VALID_PAGE(mmu->prev_roots[i].hpa))
+ break;
+
+ if (i == KVM_MMU_NUM_PREV_ROOTS)
+ return;
+ }
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
+ mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
+ &invalid_list);
+
+ if (free_active_root) {
+ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
+ (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
+ mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
+ &invalid_list);
+ } else {
+ for (i = 0; i < 4; ++i)
+ if (mmu->pae_root[i] != 0)
+ mmu_free_root_page(vcpu->kvm,
+ &mmu->pae_root[i],
+ &invalid_list);
+ mmu->root_hpa = INVALID_PAGE;
+ }
+ }
+
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
+
+static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
+{
+ int ret = 0;
+
+ if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *sp;
+ unsigned i;
+
+ if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) {
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if(make_mmu_pages_available(vcpu) < 0) {
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return -ENOSPC;
+ }
+ sp = kvm_mmu_get_page(vcpu, 0, 0,
+ vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
+ ++sp->root_count;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.root_hpa = __pa(sp->spt);
+ } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ MMU_WARN_ON(VALID_PAGE(root));
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if (make_mmu_pages_available(vcpu) < 0) {
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return -ENOSPC;
+ }
+ sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
+ i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
+ root = __pa(sp->spt);
+ ++sp->root_count;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
+ }
+ vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+ } else
+ BUG();
+
+ return 0;
+}
+
+static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *sp;
+ u64 pdptr, pm_mask;
+ gfn_t root_gfn;
+ int i;
+
+ root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
+
+ if (mmu_check_root(vcpu, root_gfn))
+ return 1;
+
+ /*
+ * Do we shadow a long mode page table? If so we need to
+ * write-protect the guests page table root.
+ */
+ if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+
+ MMU_WARN_ON(VALID_PAGE(root));
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if (make_mmu_pages_available(vcpu) < 0) {
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return -ENOSPC;
+ }
+ sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
+ vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL);
+ root = __pa(sp->spt);
+ ++sp->root_count;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.root_hpa = root;
+ return 0;
+ }
+
+ /*
+ * We shadow a 32 bit page table. This may be a legacy 2-level
+ * or a PAE 3-level page table. In either case we need to be aware that
+ * the shadow page table may be a PAE or a long mode page table.
+ */
+ pm_mask = PT_PRESENT_MASK;
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL)
+ pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
+
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ MMU_WARN_ON(VALID_PAGE(root));
+ if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
+ pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
+ if (!(pdptr & PT_PRESENT_MASK)) {
+ vcpu->arch.mmu.pae_root[i] = 0;
+ continue;
+ }
+ root_gfn = pdptr >> PAGE_SHIFT;
+ if (mmu_check_root(vcpu, root_gfn))
+ return 1;
+ }
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if (make_mmu_pages_available(vcpu) < 0) {
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return -ENOSPC;
+ }
+ sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
+ 0, ACC_ALL);
+ root = __pa(sp->spt);
+ ++sp->root_count;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ vcpu->arch.mmu.pae_root[i] = root | pm_mask;
+ }
+ vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+
+ /*
+ * If we shadow a 32 bit page table with a long mode page
+ * table we enter this path.
+ */
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
+ if (vcpu->arch.mmu.lm_root == NULL) {
+ /*
+ * The additional page necessary for this is only
+ * allocated on demand.
+ */
+
+ u64 *lm_root;
+
+ lm_root = (void*)get_zeroed_page(GFP_KERNEL);
+ if (lm_root == NULL)
+ return 1;
+
+ lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
+
+ vcpu->arch.mmu.lm_root = lm_root;
+ }
+
+ vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
+ }
+
+ return 0;
+}
+
+static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.mmu.direct_map)
+ return mmu_alloc_direct_roots(vcpu);
+ else
+ return mmu_alloc_shadow_roots(vcpu);
+}
+
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (vcpu->arch.mmu.direct_map)
+ return;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+
+ if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+
+ sp = page_header(root);
+
+ /*
+ * Even if another CPU was marking the SP as unsync-ed
+ * simultaneously, any guest page table changes are not
+ * guaranteed to be visible anyway until this VCPU issues a TLB
+ * flush strictly after those changes are made. We only need to
+ * ensure that the other CPU sets these flags before any actual
+ * changes to the page tables are made. The comments in
+ * mmu_need_write_protect() describe what could go wrong if this
+ * requirement isn't satisfied.
+ */
+ if (!smp_load_acquire(&sp->unsync) &&
+ !smp_load_acquire(&sp->unsync_children))
+ return;
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+
+ mmu_sync_children(vcpu, sp);
+
+ kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return;
+ }
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root && VALID_PAGE(root)) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ mmu_sync_children(vcpu, sp);
+ }
+ }
+
+ kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
+
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
+ u32 access, struct x86_exception *exception)
+{
+ if (exception)
+ exception->error_code = 0;
+ return vaddr;
+}
+
+static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
+ u32 access,
+ struct x86_exception *exception)
+{
+ if (exception)
+ exception->error_code = 0;
+ return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
+}
+
+static bool
+__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
+{
+ int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
+
+ return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
+ ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
+}
+
+static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+ return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
+}
+
+static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
+{
+ return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
+}
+
+static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+ /*
+ * A nested guest cannot use the MMIO cache if it is using nested
+ * page tables, because cr2 is a nGPA while the cache stores GPAs.
+ */
+ if (mmu_is_nested(vcpu))
+ return false;
+
+ if (direct)
+ return vcpu_match_mmio_gpa(vcpu, addr);
+
+ return vcpu_match_mmio_gva(vcpu, addr);
+}
+
+/* return true if reserved bit is detected on spte. */
+static bool
+walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
+ int root, leaf;
+ bool reserved = false;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ goto exit;
+
+ walk_shadow_page_lockless_begin(vcpu);
+
+ for (shadow_walk_init(&iterator, vcpu, addr),
+ leaf = root = iterator.level;
+ shadow_walk_okay(&iterator);
+ __shadow_walk_next(&iterator, spte)) {
+ spte = mmu_spte_get_lockless(iterator.sptep);
+
+ sptes[leaf - 1] = spte;
+ leaf--;
+
+ if (!is_shadow_present_pte(spte))
+ break;
+
+ reserved |= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte,
+ iterator.level);
+ }
+
+ walk_shadow_page_lockless_end(vcpu);
+
+ if (reserved) {
+ pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
+ __func__, addr);
+ while (root > leaf) {
+ pr_err("------ spte 0x%llx level %d.\n",
+ sptes[root - 1], root);
+ root--;
+ }
+ }
+exit:
+ *sptep = spte;
+ return reserved;
+}
+
+static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+ u64 spte;
+ bool reserved;
+
+ if (mmio_info_in_cache(vcpu, addr, direct))
+ return RET_PF_EMULATE;
+
+ reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
+ if (WARN_ON(reserved))
+ return -EINVAL;
+
+ if (is_mmio_spte(spte)) {
+ gfn_t gfn = get_mmio_spte_gfn(spte);
+ unsigned access = get_mmio_spte_access(spte);
+
+ if (!check_mmio_spte(vcpu, spte))
+ return RET_PF_INVALID;
+
+ if (direct)
+ addr = 0;
+
+ trace_handle_mmio_page_fault(addr, gfn, access);
+ vcpu_cache_mmio_info(vcpu, addr, gfn, access);
+ return RET_PF_EMULATE;
+ }
+
+ /*
+ * If the page table is zapped by other cpus, let CPU fault again on
+ * the address.
+ */
+ return RET_PF_RETRY;
+}
+
+static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
+ u32 error_code, gfn_t gfn)
+{
+ if (unlikely(error_code & PFERR_RSVD_MASK))
+ return false;
+
+ if (!(error_code & PFERR_PRESENT_MASK) ||
+ !(error_code & PFERR_WRITE_MASK))
+ return false;
+
+ /*
+ * guest is writing the page which is write tracked which can
+ * not be fixed by page fault handler.
+ */
+ if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+ return true;
+
+ return false;
+}
+
+static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ u64 spte;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+
+ walk_shadow_page_lockless_begin(vcpu);
+ for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+ clear_sp_write_flooding_count(iterator.sptep);
+ if (!is_shadow_present_pte(spte))
+ break;
+ }
+ walk_shadow_page_lockless_end(vcpu);
+}
+
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
+ u32 error_code, bool prefault)
+{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ int r;
+
+ /* Note, paging is disabled, ergo gva == gpa. */
+ pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
+
+ if (page_fault_handle_page_track(vcpu, error_code, gfn))
+ return RET_PF_EMULATE;
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+
+ return nonpaging_map(vcpu, gpa & PAGE_MASK,
+ error_code, gfn, prefault);
+}
+
+static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ gfn_t gfn)
+{
+ struct kvm_arch_async_pf arch;
+
+ arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
+ arch.gfn = gfn;
+ arch.direct_map = vcpu->arch.mmu.direct_map;
+ arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
+
+ return kvm_setup_async_pf(vcpu, cr2_or_gpa,
+ kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
+}
+
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(!lapic_in_kernel(vcpu) ||
+ kvm_event_needs_reinjection(vcpu) ||
+ vcpu->arch.exception.pending))
+ return false;
+
+ if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
+ return false;
+
+ return kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
+ gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
+ bool *writable)
+{
+ struct kvm_memory_slot *slot;
+ bool async;
+
+ /*
+ * Don't expose private memslots to L2.
+ */
+ if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
+ *pfn = KVM_PFN_NOSLOT;
+ return false;
+ }
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ async = false;
+ *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
+ if (!async)
+ return false; /* *pfn has correct page already */
+
+ if (!prefault && kvm_can_do_async_pf(vcpu)) {
+ trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
+ if (kvm_find_async_pf_gfn(vcpu, gfn)) {
+ trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+ return true;
+ } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
+ return true;
+ }
+
+ *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
+ return false;
+}
+
+int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
+ u64 fault_address, char *insn, int insn_len)
+{
+ int r = 1;
+
+#ifndef CONFIG_X86_64
+ /* A 64-bit CR2 should be impossible on 32-bit KVM. */
+ if (WARN_ON_ONCE(fault_address >> 32))
+ return -EFAULT;
+#endif
+
+ vcpu->arch.l1tf_flush_l1d = true;
+ switch (vcpu->arch.apf.host_apf_reason) {
+ default:
+ trace_kvm_page_fault(fault_address, error_code);
+
+ if (kvm_event_needs_reinjection(vcpu))
+ kvm_mmu_unprotect_page_virt(vcpu, fault_address);
+ r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
+ insn_len);
+ break;
+ case KVM_PV_REASON_PAGE_NOT_PRESENT:
+ vcpu->arch.apf.host_apf_reason = 0;
+ local_irq_disable();
+ kvm_async_pf_task_wait(fault_address, 0);
+ local_irq_enable();
+ break;
+ case KVM_PV_REASON_PAGE_READY:
+ vcpu->arch.apf.host_apf_reason = 0;
+ local_irq_disable();
+ kvm_async_pf_task_wake(fault_address);
+ local_irq_enable();
+ break;
+ }
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
+
+static bool
+check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
+{
+ int page_num = KVM_PAGES_PER_HPAGE(level);
+
+ gfn &= ~(page_num - 1);
+
+ return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
+}
+
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
+ bool prefault)
+{
+ kvm_pfn_t pfn;
+ int r;
+ int level;
+ bool force_pt_level;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ unsigned long mmu_seq;
+ int write = error_code & PFERR_WRITE_MASK;
+ bool map_writable;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
+
+ MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ if (page_fault_handle_page_track(vcpu, error_code, gfn))
+ return RET_PF_EMULATE;
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ force_pt_level =
+ lpage_disallowed ||
+ !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
+ level = mapping_level(vcpu, gfn, &force_pt_level);
+ if (likely(!force_pt_level)) {
+ if (level > PT_DIRECTORY_LEVEL &&
+ !check_hugepage_cache_consistency(vcpu, gfn, level))
+ level = PT_DIRECTORY_LEVEL;
+ gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ }
+
+ if (fast_page_fault(vcpu, gpa, level, error_code))
+ return RET_PF_RETRY;
+
+ mmu_seq = vcpu->kvm->mmu_notifier_seq;
+ smp_rmb();
+
+ if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+ return RET_PF_RETRY;
+
+ if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
+ return r;
+
+ r = RET_PF_RETRY;
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+ goto out_unlock;
+ if (make_mmu_pages_available(vcpu) < 0)
+ goto out_unlock;
+ if (likely(!force_pt_level))
+ transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+ prefault, lpage_disallowed);
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return r;
+}
+
+static void nonpaging_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ context->page_fault = nonpaging_page_fault;
+ context->gva_to_gpa = nonpaging_gva_to_gpa;
+ context->sync_page = nonpaging_sync_page;
+ context->invlpg = nonpaging_invlpg;
+ context->update_pte = nonpaging_update_pte;
+ context->root_level = 0;
+ context->shadow_root_level = PT32E_ROOT_LEVEL;
+ context->direct_map = true;
+ context->nx = false;
+}
+
+/*
+ * Find out if a previously cached root matching the new CR3/role is available.
+ * The current root is also inserted into the cache.
+ * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
+ * returned.
+ * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
+ * false is returned. This root should now be freed by the caller.
+ */
+static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+ union kvm_mmu_page_role new_role)
+{
+ uint i;
+ struct kvm_mmu_root_info root;
+ struct kvm_mmu *mmu = &vcpu->arch.mmu;
+
+ root.cr3 = mmu->get_cr3(vcpu);
+ root.hpa = mmu->root_hpa;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ swap(root, mmu->prev_roots[i]);
+
+ if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) &&
+ page_header(root.hpa) != NULL &&
+ new_role.word == page_header(root.hpa)->role.word)
+ break;
+ }
+
+ mmu->root_hpa = root.hpa;
+
+ return i < KVM_MMU_NUM_PREV_ROOTS;
+}
+
+static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+ union kvm_mmu_page_role new_role,
+ bool skip_tlb_flush)
+{
+ struct kvm_mmu *mmu = &vcpu->arch.mmu;
+
+ /*
+ * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
+ * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
+ * later if necessary.
+ */
+ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
+ mmu->root_level >= PT64_ROOT_4LEVEL) {
+ if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
+ return false;
+
+ if (cached_root_available(vcpu, new_cr3, new_role)) {
+ /*
+ * It is possible that the cached previous root page is
+ * obsolete because of a change in the MMU
+ * generation number. However, that is accompanied by
+ * KVM_REQ_MMU_RELOAD, which will free the root that we
+ * have set here and allocate a new one.
+ */
+
+ kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
+ if (!skip_tlb_flush) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_x86_ops->tlb_flush(vcpu, true);
+ }
+
+ /*
+ * The last MMIO access's GVA and GPA are cached in the
+ * VCPU. When switching to a new CR3, that GVA->GPA
+ * mapping may no longer be valid. So clear any cached
+ * MMIO info even when we don't need to sync the shadow
+ * page tables.
+ */
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+
+ __clear_sp_write_flooding_count(
+ page_header(mmu->root_hpa));
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+ union kvm_mmu_page_role new_role,
+ bool skip_tlb_flush)
+{
+ if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
+ kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT);
+}
+
+void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
+{
+ __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
+ skip_tlb_flush);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
+
+static unsigned long get_cr3(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr3(vcpu);
+}
+
+static void inject_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+}
+
+static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
+ unsigned access, int *nr_present)
+{
+ if (unlikely(is_mmio_spte(*sptep))) {
+ if (gfn != get_mmio_spte_gfn(*sptep)) {
+ mmu_spte_clear_no_track(sptep);
+ return true;
+ }
+
+ (*nr_present)++;
+ mark_mmio_spte(vcpu, sptep, gfn, access);
+ return true;
+ }
+
+ return false;
+}
+
+static inline bool is_last_gpte(struct kvm_mmu *mmu,
+ unsigned level, unsigned gpte)
+{
+ /*
+ * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
+ * If it is clear, there are no large pages at this level, so clear
+ * PT_PAGE_SIZE_MASK in gpte if that is the case.
+ */
+ gpte &= level - mmu->last_nonleaf_level;
+
+ /*
+ * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set
+ * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
+ * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
+ */
+ gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
+
+ return gpte & PT_PAGE_SIZE_MASK;
+}
+
+#define PTTYPE_EPT 18 /* arbitrary */
+#define PTTYPE PTTYPE_EPT
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+#define PTTYPE 64
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+#define PTTYPE 32
+#include "paging_tmpl.h"
+#undef PTTYPE
+
+static void
+__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
+ struct rsvd_bits_validate *rsvd_check,
+ int maxphyaddr, int level, bool nx, bool gbpages,
+ bool pse, bool amd)
+{
+ u64 exb_bit_rsvd = 0;
+ u64 gbpages_bit_rsvd = 0;
+ u64 nonleaf_bit8_rsvd = 0;
+
+ rsvd_check->bad_mt_xwr = 0;
+
+ if (!nx)
+ exb_bit_rsvd = rsvd_bits(63, 63);
+ if (!gbpages)
+ gbpages_bit_rsvd = rsvd_bits(7, 7);
+
+ /*
+ * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
+ * leaf entries) on AMD CPUs only.
+ */
+ if (amd)
+ nonleaf_bit8_rsvd = rsvd_bits(8, 8);
+
+ switch (level) {
+ case PT32_ROOT_LEVEL:
+ /* no rsvd bits for 2 level 4K page table entries */
+ rsvd_check->rsvd_bits_mask[0][1] = 0;
+ rsvd_check->rsvd_bits_mask[0][0] = 0;
+ rsvd_check->rsvd_bits_mask[1][0] =
+ rsvd_check->rsvd_bits_mask[0][0];
+
+ if (!pse) {
+ rsvd_check->rsvd_bits_mask[1][1] = 0;
+ break;
+ }
+
+ if (is_cpuid_PSE36())
+ /* 36bits PSE 4MB page */
+ rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
+ else
+ /* 32 bits PSE 4MB page */
+ rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
+ break;
+ case PT32E_ROOT_LEVEL:
+ rsvd_check->rsvd_bits_mask[0][2] =
+ rsvd_bits(maxphyaddr, 63) |
+ rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */
+ rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 62); /* PDE */
+ rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 62); /* PTE */
+ rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 62) |
+ rsvd_bits(13, 20); /* large page */
+ rsvd_check->rsvd_bits_mask[1][0] =
+ rsvd_check->rsvd_bits_mask[0][0];
+ break;
+ case PT64_ROOT_5LEVEL:
+ rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
+ nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
+ rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[1][4] =
+ rsvd_check->rsvd_bits_mask[0][4];
+ case PT64_ROOT_4LEVEL:
+ rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
+ nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
+ rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
+ gbpages_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[1][3] =
+ rsvd_check->rsvd_bits_mask[0][3];
+ rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
+ gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
+ rsvd_bits(13, 29);
+ rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51) |
+ rsvd_bits(13, 20); /* large page */
+ rsvd_check->rsvd_bits_mask[1][0] =
+ rsvd_check->rsvd_bits_mask[0][0];
+ break;
+ }
+}
+
+static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
+ cpuid_maxphyaddr(vcpu), context->root_level,
+ context->nx,
+ guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
+ is_pse(vcpu), guest_cpuid_is_amd(vcpu));
+}
+
+static void
+__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
+ int maxphyaddr, bool execonly)
+{
+ u64 bad_mt_xwr;
+
+ rsvd_check->rsvd_bits_mask[0][4] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
+ rsvd_check->rsvd_bits_mask[0][3] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
+ rsvd_check->rsvd_bits_mask[0][2] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+ rsvd_check->rsvd_bits_mask[0][1] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
+ rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
+
+ /* large page */
+ rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
+ rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
+ rsvd_check->rsvd_bits_mask[1][2] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
+ rsvd_check->rsvd_bits_mask[1][1] =
+ rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
+ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
+
+ bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
+ bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
+ bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */
+ if (!execonly) {
+ /* bits 0..2 must not be 100 unless VMX capabilities allow it */
+ bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
+ }
+ rsvd_check->bad_mt_xwr = bad_mt_xwr;
+}
+
+static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context, bool execonly)
+{
+ __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
+ cpuid_maxphyaddr(vcpu), execonly);
+}
+
+/*
+ * the page table on host is the shadow page table for the page
+ * table in guest or amd nested guest, its mmu features completely
+ * follow the features in guest.
+ */
+void
+reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+{
+ /*
+ * KVM uses NX when TDP is disabled to handle a variety of scenarios,
+ * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
+ * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
+ * The iTLB multi-hit workaround can be toggled at any time, so assume
+ * NX can be used by any non-nested shadow MMU to avoid having to reset
+ * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled.
+ */
+ bool uses_nx = context->nx || !tdp_enabled ||
+ context->base_role.smep_andnot_wp;
+ struct rsvd_bits_validate *shadow_zero_check;
+ int i;
+
+ /*
+ * Passing "true" to the last argument is okay; it adds a check
+ * on bit 8 of the SPTEs which KVM doesn't use anyway.
+ */
+ shadow_zero_check = &context->shadow_zero_check;
+ __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
+ shadow_phys_bits,
+ context->shadow_root_level, uses_nx,
+ guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
+ is_pse(vcpu), true);
+
+ if (!shadow_me_mask)
+ return;
+
+ for (i = context->shadow_root_level; --i >= 0;) {
+ shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
+ shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
+ }
+
+}
+EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
+
+static inline bool boot_cpu_is_amd(void)
+{
+ WARN_ON_ONCE(!tdp_enabled);
+ return shadow_x_mask == 0;
+}
+
+/*
+ * the direct page table on host, use as much mmu features as
+ * possible, however, kvm currently does not do execution-protection.
+ */
+static void
+reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ struct rsvd_bits_validate *shadow_zero_check;
+ int i;
+
+ shadow_zero_check = &context->shadow_zero_check;
+
+ if (boot_cpu_is_amd())
+ __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
+ shadow_phys_bits,
+ context->shadow_root_level, false,
+ boot_cpu_has(X86_FEATURE_GBPAGES),
+ true, true);
+ else
+ __reset_rsvds_bits_mask_ept(shadow_zero_check,
+ shadow_phys_bits,
+ false);
+
+ if (!shadow_me_mask)
+ return;
+
+ for (i = context->shadow_root_level; --i >= 0;) {
+ shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
+ shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
+ }
+}
+
+/*
+ * as the comments in reset_shadow_zero_bits_mask() except it
+ * is the shadow page table for intel nested guest.
+ */
+static void
+reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context, bool execonly)
+{
+ __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
+ shadow_phys_bits, execonly);
+}
+
+#define BYTE_MASK(access) \
+ ((1 & (access) ? 2 : 0) | \
+ (2 & (access) ? 4 : 0) | \
+ (3 & (access) ? 8 : 0) | \
+ (4 & (access) ? 16 : 0) | \
+ (5 & (access) ? 32 : 0) | \
+ (6 & (access) ? 64 : 0) | \
+ (7 & (access) ? 128 : 0))
+
+
+static void update_permission_bitmask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu, bool ept)
+{
+ unsigned byte;
+
+ const u8 x = BYTE_MASK(ACC_EXEC_MASK);
+ const u8 w = BYTE_MASK(ACC_WRITE_MASK);
+ const u8 u = BYTE_MASK(ACC_USER_MASK);
+
+ bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
+ bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
+ bool cr0_wp = is_write_protection(vcpu);
+
+ for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
+ unsigned pfec = byte << 1;
+
+ /*
+ * Each "*f" variable has a 1 bit for each UWX value
+ * that causes a fault with the given PFEC.
+ */
+
+ /* Faults from writes to non-writable pages */
+ u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
+ /* Faults from user mode accesses to supervisor pages */
+ u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
+ /* Faults from fetches of non-executable pages*/
+ u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
+ /* Faults from kernel mode fetches of user pages */
+ u8 smepf = 0;
+ /* Faults from kernel mode accesses of user pages */
+ u8 smapf = 0;
+
+ if (!ept) {
+ /* Faults from kernel mode accesses to user pages */
+ u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
+
+ /* Not really needed: !nx will cause pte.nx to fault */
+ if (!mmu->nx)
+ ff = 0;
+
+ /* Allow supervisor writes if !cr0.wp */
+ if (!cr0_wp)
+ wf = (pfec & PFERR_USER_MASK) ? wf : 0;
+
+ /* Disallow supervisor fetches of user code if cr4.smep */
+ if (cr4_smep)
+ smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
+
+ /*
+ * SMAP:kernel-mode data accesses from user-mode
+ * mappings should fault. A fault is considered
+ * as a SMAP violation if all of the following
+ * conditions are ture:
+ * - X86_CR4_SMAP is set in CR4
+ * - A user page is accessed
+ * - The access is not a fetch
+ * - Page fault in kernel mode
+ * - if CPL = 3 or X86_EFLAGS_AC is clear
+ *
+ * Here, we cover the first three conditions.
+ * The fourth is computed dynamically in permission_fault();
+ * PFERR_RSVD_MASK bit will be set in PFEC if the access is
+ * *not* subject to SMAP restrictions.
+ */
+ if (cr4_smap)
+ smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
+ }
+
+ mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
+ }
+}
+
+/*
+* PKU is an additional mechanism by which the paging controls access to
+* user-mode addresses based on the value in the PKRU register. Protection
+* key violations are reported through a bit in the page fault error code.
+* Unlike other bits of the error code, the PK bit is not known at the
+* call site of e.g. gva_to_gpa; it must be computed directly in
+* permission_fault based on two bits of PKRU, on some machine state (CR4,
+* CR0, EFER, CPL), and on other bits of the error code and the page tables.
+*
+* In particular the following conditions come from the error code, the
+* page tables and the machine state:
+* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
+* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
+* - PK is always zero if U=0 in the page tables
+* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
+*
+* The PKRU bitmask caches the result of these four conditions. The error
+* code (minus the P bit) and the page table's U bit form an index into the
+* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed
+* with the two bits of the PKRU register corresponding to the protection key.
+* For the first three conditions above the bits will be 00, thus masking
+* away both AD and WD. For all reads or if the last condition holds, WD
+* only will be masked away.
+*/
+static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ bool ept)
+{
+ unsigned bit;
+ bool wp;
+
+ if (ept) {
+ mmu->pkru_mask = 0;
+ return;
+ }
+
+ /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
+ if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
+ mmu->pkru_mask = 0;
+ return;
+ }
+
+ wp = is_write_protection(vcpu);
+
+ for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
+ unsigned pfec, pkey_bits;
+ bool check_pkey, check_write, ff, uf, wf, pte_user;
+
+ pfec = bit << 1;
+ ff = pfec & PFERR_FETCH_MASK;
+ uf = pfec & PFERR_USER_MASK;
+ wf = pfec & PFERR_WRITE_MASK;
+
+ /* PFEC.RSVD is replaced by ACC_USER_MASK. */
+ pte_user = pfec & PFERR_RSVD_MASK;
+
+ /*
+ * Only need to check the access which is not an
+ * instruction fetch and is to a user page.
+ */
+ check_pkey = (!ff && pte_user);
+ /*
+ * write access is controlled by PKRU if it is a
+ * user access or CR0.WP = 1.
+ */
+ check_write = check_pkey && wf && (uf || wp);
+
+ /* PKRU.AD stops both read and write access. */
+ pkey_bits = !!check_pkey;
+ /* PKRU.WD stops write access. */
+ pkey_bits |= (!!check_write) << 1;
+
+ mmu->pkru_mask |= (pkey_bits & 3) << pfec;
+ }
+}
+
+static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+{
+ unsigned root_level = mmu->root_level;
+
+ mmu->last_nonleaf_level = root_level;
+ if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
+ mmu->last_nonleaf_level++;
+}
+
+static void paging64_init_context_common(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context,
+ int level)
+{
+ context->nx = is_nx(vcpu);
+ context->root_level = level;
+
+ reset_rsvds_bits_mask(vcpu, context);
+ update_permission_bitmask(vcpu, context, false);
+ update_pkru_bitmask(vcpu, context, false);
+ update_last_nonleaf_level(vcpu, context);
+
+ MMU_WARN_ON(!is_pae(vcpu));
+ context->page_fault = paging64_page_fault;
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ context->sync_page = paging64_sync_page;
+ context->invlpg = paging64_invlpg;
+ context->update_pte = paging64_update_pte;
+ context->shadow_root_level = level;
+ context->direct_map = false;
+}
+
+static void paging64_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ int root_level = is_la57_mode(vcpu) ?
+ PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
+
+ paging64_init_context_common(vcpu, context, root_level);
+}
+
+static void paging32_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ context->nx = false;
+ context->root_level = PT32_ROOT_LEVEL;
+
+ reset_rsvds_bits_mask(vcpu, context);
+ update_permission_bitmask(vcpu, context, false);
+ update_pkru_bitmask(vcpu, context, false);
+ update_last_nonleaf_level(vcpu, context);
+
+ context->page_fault = paging32_page_fault;
+ context->gva_to_gpa = paging32_gva_to_gpa;
+ context->sync_page = paging32_sync_page;
+ context->invlpg = paging32_invlpg;
+ context->update_pte = paging32_update_pte;
+ context->shadow_root_level = PT32E_ROOT_LEVEL;
+ context->direct_map = false;
+}
+
+static void paging32E_init_context(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
+}
+
+static union kvm_mmu_page_role
+kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu)
+{
+ union kvm_mmu_page_role role = {0};
+
+ role.guest_mode = is_guest_mode(vcpu);
+ role.smm = is_smm(vcpu);
+ role.ad_disabled = (shadow_accessed_mask == 0);
+ role.level = kvm_x86_ops->get_tdp_level(vcpu);
+ role.direct = true;
+ role.access = ACC_ALL;
+
+ return role;
+}
+
+static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ context->base_role.word = mmu_base_role_mask.word &
+ kvm_calc_tdp_mmu_root_page_role(vcpu).word;
+ context->page_fault = tdp_page_fault;
+ context->sync_page = nonpaging_sync_page;
+ context->invlpg = nonpaging_invlpg;
+ context->update_pte = nonpaging_update_pte;
+ context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
+ context->direct_map = true;
+ context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
+ context->get_cr3 = get_cr3;
+ context->get_pdptr = kvm_pdptr_read;
+ context->inject_page_fault = kvm_inject_page_fault;
+
+ if (!is_paging(vcpu)) {
+ context->nx = false;
+ context->gva_to_gpa = nonpaging_gva_to_gpa;
+ context->root_level = 0;
+ } else if (is_long_mode(vcpu)) {
+ context->nx = is_nx(vcpu);
+ context->root_level = is_la57_mode(vcpu) ?
+ PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
+ reset_rsvds_bits_mask(vcpu, context);
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ } else if (is_pae(vcpu)) {
+ context->nx = is_nx(vcpu);
+ context->root_level = PT32E_ROOT_LEVEL;
+ reset_rsvds_bits_mask(vcpu, context);
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ } else {
+ context->nx = false;
+ context->root_level = PT32_ROOT_LEVEL;
+ reset_rsvds_bits_mask(vcpu, context);
+ context->gva_to_gpa = paging32_gva_to_gpa;
+ }
+
+ update_permission_bitmask(vcpu, context, false);
+ update_pkru_bitmask(vcpu, context, false);
+ update_last_nonleaf_level(vcpu, context);
+ reset_tdp_shadow_zero_bits_mask(vcpu, context);
+}
+
+static union kvm_mmu_page_role
+kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu)
+{
+ union kvm_mmu_page_role role = {0};
+ bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
+ bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
+
+ role.nxe = is_nx(vcpu);
+ role.cr4_pae = !!is_pae(vcpu);
+ role.cr0_wp = is_write_protection(vcpu);
+ role.smep_andnot_wp = smep && !is_write_protection(vcpu);
+ role.smap_andnot_wp = smap && !is_write_protection(vcpu);
+ role.guest_mode = is_guest_mode(vcpu);
+ role.smm = is_smm(vcpu);
+ role.direct = !is_paging(vcpu);
+ role.access = ACC_ALL;
+
+ if (!is_long_mode(vcpu))
+ role.level = PT32E_ROOT_LEVEL;
+ else if (is_la57_mode(vcpu))
+ role.level = PT64_ROOT_5LEVEL;
+ else
+ role.level = PT64_ROOT_4LEVEL;
+
+ return role;
+}
+
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ if (!is_paging(vcpu))
+ nonpaging_init_context(vcpu, context);
+ else if (is_long_mode(vcpu))
+ paging64_init_context(vcpu, context);
+ else if (is_pae(vcpu))
+ paging32E_init_context(vcpu, context);
+ else
+ paging32_init_context(vcpu, context);
+
+ context->base_role.word = mmu_base_role_mask.word &
+ kvm_calc_shadow_mmu_root_page_role(vcpu).word;
+ reset_shadow_zero_bits_mask(vcpu, context);
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
+
+static union kvm_mmu_page_role
+kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty)
+{
+ union kvm_mmu_page_role role = vcpu->arch.mmu.base_role;
+
+ role.level = PT64_ROOT_4LEVEL;
+ role.direct = false;
+ role.ad_disabled = !accessed_dirty;
+ role.guest_mode = true;
+ role.access = ACC_ALL;
+
+ return role;
+}
+
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
+ bool accessed_dirty, gpa_t new_eptp)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+ union kvm_mmu_page_role root_page_role =
+ kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty);
+
+ __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role, false);
+ context->shadow_root_level = PT64_ROOT_4LEVEL;
+
+ context->nx = true;
+ context->ept_ad = accessed_dirty;
+ context->page_fault = ept_page_fault;
+ context->gva_to_gpa = ept_gva_to_gpa;
+ context->sync_page = ept_sync_page;
+ context->invlpg = ept_invlpg;
+ context->update_pte = ept_update_pte;
+ context->root_level = PT64_ROOT_4LEVEL;
+ context->direct_map = false;
+ context->base_role.word = root_page_role.word & mmu_base_role_mask.word;
+ update_permission_bitmask(vcpu, context, true);
+ update_pkru_bitmask(vcpu, context, true);
+ update_last_nonleaf_level(vcpu, context);
+ reset_rsvds_bits_mask_ept(vcpu, context, execonly);
+ reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
+}
+EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
+
+static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ kvm_init_shadow_mmu(vcpu);
+ context->set_cr3 = kvm_x86_ops->set_cr3;
+ context->get_cr3 = get_cr3;
+ context->get_pdptr = kvm_pdptr_read;
+ context->inject_page_fault = kvm_inject_page_fault;
+}
+
+static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
+
+ g_context->get_cr3 = get_cr3;
+ g_context->get_pdptr = kvm_pdptr_read;
+ g_context->inject_page_fault = kvm_inject_page_fault;
+
+ /*
+ * Note that arch.mmu.gva_to_gpa translates l2_gpa to l1_gpa using
+ * L1's nested page tables (e.g. EPT12). The nested translation
+ * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
+ * L2's page tables as the first level of translation and L1's
+ * nested page tables as the second level of translation. Basically
+ * the gva_to_gpa functions between mmu and nested_mmu are swapped.
+ */
+ if (!is_paging(vcpu)) {
+ g_context->nx = false;
+ g_context->root_level = 0;
+ g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
+ } else if (is_long_mode(vcpu)) {
+ g_context->nx = is_nx(vcpu);
+ g_context->root_level = is_la57_mode(vcpu) ?
+ PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
+ reset_rsvds_bits_mask(vcpu, g_context);
+ g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+ } else if (is_pae(vcpu)) {
+ g_context->nx = is_nx(vcpu);
+ g_context->root_level = PT32E_ROOT_LEVEL;
+ reset_rsvds_bits_mask(vcpu, g_context);
+ g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+ } else {
+ g_context->nx = false;
+ g_context->root_level = PT32_ROOT_LEVEL;
+ reset_rsvds_bits_mask(vcpu, g_context);
+ g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
+ }
+
+ update_permission_bitmask(vcpu, g_context, false);
+ update_pkru_bitmask(vcpu, g_context, false);
+ update_last_nonleaf_level(vcpu, g_context);
+}
+
+void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
+{
+ if (reset_roots) {
+ uint i;
+
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+ }
+
+ if (mmu_is_nested(vcpu))
+ init_kvm_nested_mmu(vcpu);
+ else if (tdp_enabled)
+ init_kvm_tdp_mmu(vcpu);
+ else
+ init_kvm_softmmu(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_init_mmu);
+
+static union kvm_mmu_page_role
+kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
+{
+ if (tdp_enabled)
+ return kvm_calc_tdp_mmu_root_page_role(vcpu);
+ else
+ return kvm_calc_shadow_mmu_root_page_role(vcpu);
+}
+
+void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
+{
+ kvm_mmu_unload(vcpu);
+ kvm_init_mmu(vcpu, true);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
+
+int kvm_mmu_load(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ goto out;
+ r = mmu_alloc_roots(vcpu);
+ kvm_mmu_sync_roots(vcpu);
+ if (r)
+ goto out;
+ kvm_mmu_load_cr3(vcpu);
+ kvm_x86_ops->tlb_flush(vcpu, true);
+out:
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_load);
+
+void kvm_mmu_unload(struct kvm_vcpu *vcpu)
+{
+ kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL);
+ WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_unload);
+
+static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ const void *new)
+{
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+ ++vcpu->kvm->stat.mmu_pde_zapped;
+ return;
+ }
+
+ ++vcpu->kvm->stat.mmu_pte_updated;
+ vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
+}
+
+static bool need_remote_flush(u64 old, u64 new)
+{
+ if (!is_shadow_present_pte(old))
+ return false;
+ if (!is_shadow_present_pte(new))
+ return true;
+ if ((old ^ new) & PT64_BASE_ADDR_MASK)
+ return true;
+ old ^= shadow_nx_mask;
+ new ^= shadow_nx_mask;
+ return (old & ~new & PT64_PERM_MASK) != 0;
+}
+
+static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
+ int *bytes)
+{
+ u64 gentry = 0;
+ int r;
+
+ /*
+ * Assume that the pte write on a page table of the same type
+ * as the current vcpu paging mode since we update the sptes only
+ * when they have the same mode.
+ */
+ if (is_pae(vcpu) && *bytes == 4) {
+ /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
+ *gpa &= ~(gpa_t)7;
+ *bytes = 8;
+ }
+
+ if (*bytes == 4 || *bytes == 8) {
+ r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
+ if (r)
+ gentry = 0;
+ }
+
+ return gentry;
+}
+
+/*
+ * If we're seeing too many writes to a page, it may no longer be a page table,
+ * or we may be forking, in which case it is better to unmap the page.
+ */
+static bool detect_write_flooding(struct kvm_mmu_page *sp)
+{
+ /*
+ * Skip write-flooding detected for the sp whose level is 1, because
+ * it can become unsync, then the guest page is not write-protected.
+ */
+ if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+ return false;
+
+ atomic_inc(&sp->write_flooding_count);
+ return atomic_read(&sp->write_flooding_count) >= 3;
+}
+
+/*
+ * Misaligned accesses are too much trouble to fix up; also, they usually
+ * indicate a page is not used as a page table.
+ */
+static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
+ int bytes)
+{
+ unsigned offset, pte_size, misaligned;
+
+ pgprintk("misaligned: gpa %llx bytes %d role %x\n",
+ gpa, bytes, sp->role.word);
+
+ offset = offset_in_page(gpa);
+ pte_size = sp->role.cr4_pae ? 8 : 4;
+
+ /*
+ * Sometimes, the OS only writes the last one bytes to update status
+ * bits, for example, in linux, andb instruction is used in clear_bit().
+ */
+ if (!(offset & (pte_size - 1)) && bytes == 1)
+ return false;
+
+ misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
+ misaligned |= bytes < 4;
+
+ return misaligned;
+}
+
+static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
+{
+ unsigned page_offset, quadrant;
+ u64 *spte;
+ int level;
+
+ page_offset = offset_in_page(gpa);
+ level = sp->role.level;
+ *nspte = 1;
+ if (!sp->role.cr4_pae) {
+ page_offset <<= 1; /* 32->64 */
+ /*
+ * A 32-bit pde maps 4MB while the shadow pdes map
+ * only 2MB. So we need to double the offset again
+ * and zap two pdes instead of one.
+ */
+ if (level == PT32_ROOT_LEVEL) {
+ page_offset &= ~7; /* kill rounding error */
+ page_offset <<= 1;
+ *nspte = 2;
+ }
+ quadrant = page_offset >> PAGE_SHIFT;
+ page_offset &= ~PAGE_MASK;
+ if (quadrant != sp->role.quadrant)
+ return NULL;
+ }
+
+ spte = &sp->spt[page_offset / sizeof(*spte)];
+ return spte;
+}
+
+static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const u8 *new, int bytes,
+ struct kvm_page_track_notifier_node *node)
+{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ struct kvm_mmu_page *sp;
+ LIST_HEAD(invalid_list);
+ u64 entry, gentry, *spte;
+ int npte;
+ bool remote_flush, local_flush;
+
+ /*
+ * If we don't have indirect shadow pages, it means no page is
+ * write-protected, so we can exit simply.
+ */
+ if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
+ return;
+
+ remote_flush = local_flush = false;
+
+ pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
+
+ /*
+ * No need to care whether allocation memory is successful
+ * or not since pte prefetch is skiped if it does not have
+ * enough objects in the cache.
+ */
+ mmu_topup_memory_caches(vcpu);
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+
+ gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
+
+ ++vcpu->kvm->stat.mmu_pte_write;
+ kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
+
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
+ if (detect_write_misaligned(sp, gpa, bytes) ||
+ detect_write_flooding(sp)) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
+ ++vcpu->kvm->stat.mmu_flooded;
+ continue;
+ }
+
+ spte = get_written_sptes(sp, gpa, &npte);
+ if (!spte)
+ continue;
+
+ local_flush = true;
+ while (npte--) {
+ entry = *spte;
+ mmu_page_zap_pte(vcpu->kvm, sp, spte);
+ if (gentry &&
+ !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
+ & mmu_base_role_mask.word) && rmap_can_add(vcpu))
+ mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
+ if (need_remote_flush(entry, *spte))
+ remote_flush = true;
+ ++spte;
+ }
+ }
+ kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
+ kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+}
+
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ gpa_t gpa;
+ int r;
+
+ if (vcpu->arch.mmu.direct_map)
+ return 0;
+
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
+
+ r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
+
+static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
+{
+ LIST_HEAD(invalid_list);
+
+ if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
+ return 0;
+
+ while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
+ if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
+ break;
+
+ ++vcpu->kvm->stat.mmu_recycled;
+ }
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+
+ if (!kvm_mmu_available_pages(vcpu->kvm))
+ return -ENOSPC;
+ return 0;
+}
+
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
+ void *insn, int insn_len)
+{
+ int r, emulation_type = 0;
+ enum emulation_result er;
+ bool direct = vcpu->arch.mmu.direct_map;
+
+ /* With shadow page tables, fault_address contains a GVA or nGPA. */
+ if (vcpu->arch.mmu.direct_map) {
+ vcpu->arch.gpa_available = true;
+ vcpu->arch.gpa_val = cr2_or_gpa;
+ }
+
+ r = RET_PF_INVALID;
+ if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
+ if (r == RET_PF_EMULATE)
+ goto emulate;
+ }
+
+ if (r == RET_PF_INVALID) {
+ r = vcpu->arch.mmu.page_fault(vcpu, cr2_or_gpa,
+ lower_32_bits(error_code),
+ false);
+ WARN_ON(r == RET_PF_INVALID);
+ }
+
+ if (r == RET_PF_RETRY)
+ return 1;
+ if (r < 0)
+ return r;
+
+ /*
+ * Before emulating the instruction, check if the error code
+ * was due to a RO violation while translating the guest page.
+ * This can occur when using nested virtualization with nested
+ * paging in both guests. If true, we simply unprotect the page
+ * and resume the guest.
+ */
+ if (vcpu->arch.mmu.direct_map &&
+ (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
+ return 1;
+ }
+
+ /*
+ * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
+ * optimistically try to just unprotect the page and let the processor
+ * re-execute the instruction that caused the page fault. Do not allow
+ * retrying MMIO emulation, as it's not only pointless but could also
+ * cause us to enter an infinite loop because the processor will keep
+ * faulting on the non-existent MMIO address. Retrying an instruction
+ * from a nested guest is also pointless and dangerous as we are only
+ * explicitly shadowing L1's page tables, i.e. unprotecting something
+ * for L1 isn't going to magically fix whatever issue cause L2 to fail.
+ */
+ if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
+ emulation_type = EMULTYPE_ALLOW_RETRY;
+emulate:
+ /*
+ * On AMD platforms, under certain conditions insn_len may be zero on #NPF.
+ * This can happen if a guest gets a page-fault on data access but the HW
+ * table walker is not able to read the instruction page (e.g instruction
+ * page is not present in memory). In those cases we simply restart the
+ * guest.
+ */
+ if (unlikely(insn && !insn_len))
+ return 1;
+
+ er = x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len);
+
+ switch (er) {
+ case EMULATE_DONE:
+ return 1;
+ case EMULATE_USER_EXIT:
+ ++vcpu->stat.mmio_exits;
+ /* fall through */
+ case EMULATE_FAIL:
+ return 0;
+ default:
+ BUG();
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
+
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ struct kvm_mmu *mmu = &vcpu->arch.mmu;
+ int i;
+
+ /* INVLPG on a * non-canonical address is a NOP according to the SDM. */
+ if (is_noncanonical_address(gva, vcpu))
+ return;
+
+ mmu->invlpg(vcpu, gva, mmu->root_hpa);
+
+ /*
+ * INVLPG is required to invalidate any global mappings for the VA,
+ * irrespective of PCID. Since it would take us roughly similar amount
+ * of work to determine whether any of the prev_root mappings of the VA
+ * is marked global, or to just sync it blindly, so we might as well
+ * just always sync it.
+ *
+ * Mappings not reachable via the current cr3 or the prev_roots will be
+ * synced when switching to that cr3, so nothing needs to be done here
+ * for them.
+ */
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (VALID_PAGE(mmu->prev_roots[i].hpa))
+ mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
+
+ kvm_x86_ops->tlb_flush_gva(vcpu, gva);
+ ++vcpu->stat.invlpg;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
+
+void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
+{
+ struct kvm_mmu *mmu = &vcpu->arch.mmu;
+ bool tlb_flush = false;
+ uint i;
+
+ if (pcid == kvm_get_active_pcid(vcpu)) {
+ mmu->invlpg(vcpu, gva, mmu->root_hpa);
+ tlb_flush = true;
+ }
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
+ pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
+ mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
+ tlb_flush = true;
+ }
+ }
+
+ if (tlb_flush)
+ kvm_x86_ops->tlb_flush_gva(vcpu, gva);
+
+ ++vcpu->stat.invlpg;
+
+ /*
+ * Mappings not reachable via the current cr3 or the prev_roots will be
+ * synced when switching to that cr3, so nothing needs to be done here
+ * for them.
+ */
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
+
+void kvm_enable_tdp(void)
+{
+ tdp_enabled = true;
+}
+EXPORT_SYMBOL_GPL(kvm_enable_tdp);
+
+void kvm_disable_tdp(void)
+{
+ tdp_enabled = false;
+}
+EXPORT_SYMBOL_GPL(kvm_disable_tdp);
+
+static void free_mmu_pages(struct kvm_vcpu *vcpu)
+{
+ free_page((unsigned long)vcpu->arch.mmu.pae_root);
+ free_page((unsigned long)vcpu->arch.mmu.lm_root);
+}
+
+static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
+{
+ struct page *page;
+ int i;
+
+ /*
+ * When using PAE paging, the four PDPTEs are treated as 'root' pages,
+ * while the PDP table is a per-vCPU construct that's allocated at MMU
+ * creation. When emulating 32-bit mode, cr3 is only 32 bits even on
+ * x86_64. Therefore we need to allocate the PDP table in the first
+ * 4GB of memory, which happens to fit the DMA32 zone. Except for
+ * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
+ * skip allocating the PDP table.
+ */
+ if (tdp_enabled && kvm_x86_ops->get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
+ return 0;
+
+ /*
+ * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
+ * Therefore we need to allocate shadow page tables in the first
+ * 4GB of memory, which happens to fit the DMA32 zone.
+ */
+ page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+ if (!page)
+ return -ENOMEM;
+
+ vcpu->arch.mmu.pae_root = page_address(page);
+ for (i = 0; i < 4; ++i)
+ vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
+
+ return 0;
+}
+
+int kvm_mmu_create(struct kvm_vcpu *vcpu)
+{
+ uint i;
+
+ vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ vcpu->arch.mmu.translate_gpa = translate_gpa;
+ vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+
+ return alloc_mmu_pages(vcpu);
+}
+
+void kvm_mmu_setup(struct kvm_vcpu *vcpu)
+{
+ MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ /*
+ * kvm_mmu_setup() is called only on vCPU initialization.
+ * Therefore, no need to reset mmu roots as they are not yet
+ * initialized.
+ */
+ kvm_init_mmu(vcpu, false);
+}
+
+static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ struct kvm_page_track_notifier_node *node)
+{
+ kvm_mmu_invalidate_zap_all_pages(kvm);
+}
+
+void kvm_mmu_init_vm(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+
+ node->track_write = kvm_mmu_pte_write;
+ node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
+ kvm_page_track_register_notifier(kvm, node);
+}
+
+void kvm_mmu_uninit_vm(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+
+ kvm_page_track_unregister_notifier(kvm, node);
+}
+
+/* The return value indicates if tlb flush on all vcpus is needed. */
+typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
+
+/* The caller should hold mmu-lock before calling this function. */
+static __always_inline bool
+slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
+{
+ struct slot_rmap_walk_iterator iterator;
+ bool flush = false;
+
+ for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
+ end_gfn, &iterator) {
+ if (iterator.rmap)
+ flush |= fn(kvm, iterator.rmap);
+
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ if (flush && lock_flush_tlb) {
+ kvm_flush_remote_tlbs(kvm);
+ flush = false;
+ }
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+ }
+
+ if (flush && lock_flush_tlb) {
+ kvm_flush_remote_tlbs(kvm);
+ flush = false;
+ }
+
+ return flush;
+}
+
+static __always_inline bool
+slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, int start_level, int end_level,
+ bool lock_flush_tlb)
+{
+ return slot_handle_level_range(kvm, memslot, fn, start_level,
+ end_level, memslot->base_gfn,
+ memslot->base_gfn + memslot->npages - 1,
+ lock_flush_tlb);
+}
+
+static __always_inline bool
+slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+ PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+}
+
+static __always_inline bool
+slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
+ PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+}
+
+static __always_inline bool
+slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+ PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
+}
+
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ int i;
+
+ spin_lock(&kvm->mmu_lock);
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(memslot, slots) {
+ gfn_t start, end;
+
+ start = max(gfn_start, memslot->base_gfn);
+ end = min(gfn_end, memslot->base_gfn + memslot->npages);
+ if (start >= end)
+ continue;
+
+ slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+ PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, true);
+ }
+ }
+
+ spin_unlock(&kvm->mmu_lock);
+}
+
+static bool slot_rmap_write_protect(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head)
+{
+ return __rmap_write_protect(kvm, rmap_head, false);
+}
+
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ bool flush;
+
+ spin_lock(&kvm->mmu_lock);
+ flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
+ false);
+ spin_unlock(&kvm->mmu_lock);
+
+ /*
+ * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
+ * which do tlb flush out of mmu-lock should be serialized by
+ * kvm->slots_lock otherwise tlb flush would be missed.
+ */
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * We can flush all the TLBs out of the mmu lock without TLB
+ * corruption since we just change the spte from writable to
+ * readonly so that we only need to care the case of changing
+ * spte from present to present (changing the spte from present
+ * to nonpresent will flush all the TLBs immediately), in other
+ * words, the only case we care is mmu_spte_update() where we
+ * haved checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
+ * instead of PT_WRITABLE_MASK, that means it does not depend
+ * on PT_WRITABLE_MASK anymore.
+ */
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+
+static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ int need_tlb_flush = 0;
+ kvm_pfn_t pfn;
+ struct kvm_mmu_page *sp;
+
+restart:
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
+ sp = page_header(__pa(sptep));
+ pfn = spte_to_pfn(*sptep);
+
+ /*
+ * We cannot do huge page mapping for indirect shadow pages,
+ * which are found on the last rmap (level = 1) when not using
+ * tdp; such shadow pages are synced with the page table in
+ * the guest, and the guest page table is using 4K page size
+ * mapping if the indirect sp has level = 1.
+ */
+ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
+ !kvm_is_zone_device_pfn(pfn) &&
+ PageTransCompoundMap(pfn_to_page(pfn))) {
+ drop_spte(kvm, sptep);
+ need_tlb_flush = 1;
+ goto restart;
+ }
+ }
+
+ return need_tlb_flush;
+}
+
+void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot)
+{
+ /* FIXME: const-ify all uses of struct kvm_memory_slot. */
+ spin_lock(&kvm->mmu_lock);
+ slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
+ kvm_mmu_zap_collapsible_spte, true);
+ spin_unlock(&kvm->mmu_lock);
+}
+
+void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ bool flush;
+
+ spin_lock(&kvm->mmu_lock);
+ flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
+ spin_unlock(&kvm->mmu_lock);
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * It's also safe to flush TLBs out of mmu lock here as currently this
+ * function is only used for dirty logging, in which case flushing TLB
+ * out of mmu lock also guarantees no dirty pages will be lost in
+ * dirty_bitmap.
+ */
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
+
+void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ bool flush;
+
+ spin_lock(&kvm->mmu_lock);
+ flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
+ false);
+ spin_unlock(&kvm->mmu_lock);
+
+ /* see kvm_mmu_slot_remove_write_access */
+ lockdep_assert_held(&kvm->slots_lock);
+
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
+
+void kvm_mmu_slot_set_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ bool flush;
+
+ spin_lock(&kvm->mmu_lock);
+ flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
+ spin_unlock(&kvm->mmu_lock);
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /* see kvm_mmu_slot_leaf_clear_dirty */
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
+
+#define BATCH_ZAP_PAGES 10
+static void kvm_zap_obsolete_pages(struct kvm *kvm)
+{
+ struct kvm_mmu_page *sp, *node;
+ int batch = 0;
+
+restart:
+ list_for_each_entry_safe_reverse(sp, node,
+ &kvm->arch.active_mmu_pages, link) {
+ int ret;
+
+ /*
+ * No obsolete page exists before new created page since
+ * active_mmu_pages is the FIFO list.
+ */
+ if (!is_obsolete_sp(kvm, sp))
+ break;
+
+ /*
+ * Since we are reversely walking the list and the invalid
+ * list will be moved to the head, skip the invalid page
+ * can help us to avoid the infinity list walking.
+ */
+ if (sp->role.invalid)
+ continue;
+
+ /*
+ * Need not flush tlb since we only zap the sp with invalid
+ * generation number.
+ */
+ if (batch >= BATCH_ZAP_PAGES &&
+ cond_resched_lock(&kvm->mmu_lock)) {
+ batch = 0;
+ goto restart;
+ }
+
+ ret = kvm_mmu_prepare_zap_page(kvm, sp,
+ &kvm->arch.zapped_obsolete_pages);
+ batch += ret;
+
+ if (ret)
+ goto restart;
+ }
+
+ /*
+ * Should flush tlb before free page tables since lockless-walking
+ * may use the pages.
+ */
+ kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
+}
+
+/*
+ * Fast invalidate all shadow pages and use lock-break technique
+ * to zap obsolete pages.
+ *
+ * It's required when memslot is being deleted or VM is being
+ * destroyed, in these cases, we should ensure that KVM MMU does
+ * not use any resource of the being-deleted slot or all slots
+ * after calling the function.
+ */
+void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
+{
+ spin_lock(&kvm->mmu_lock);
+ trace_kvm_mmu_invalidate_zap_all_pages(kvm);
+ kvm->arch.mmu_valid_gen++;
+
+ /*
+ * Notify all vcpus to reload its shadow page table
+ * and flush TLB. Then all vcpus will switch to new
+ * shadow page table with the new mmu_valid_gen.
+ *
+ * Note: we should do this under the protection of
+ * mmu-lock, otherwise, vcpu would purge shadow page
+ * but miss tlb flush.
+ */
+ kvm_reload_remote_mmus(kvm);
+
+ kvm_zap_obsolete_pages(kvm);
+ spin_unlock(&kvm->mmu_lock);
+}
+
+static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
+{
+ return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
+}
+
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
+{
+ gen &= MMIO_GEN_MASK;
+
+ /*
+ * Shift to eliminate the "update in-progress" flag, which isn't
+ * included in the spte's generation number.
+ */
+ gen >>= 1;
+
+ /*
+ * Generation numbers are incremented in multiples of the number of
+ * address spaces in order to provide unique generations across all
+ * address spaces. Strip what is effectively the address space
+ * modifier prior to checking for a wrap of the MMIO generation so
+ * that a wrap in any address space is detected.
+ */
+ gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
+
+ /*
+ * The very rare case: if the MMIO generation number has wrapped,
+ * zap all shadow pages.
+ */
+ if (unlikely(gen == 0)) {
+ kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
+ kvm_mmu_invalidate_zap_all_pages(kvm);
+ }
+}
+
+static unsigned long
+mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ struct kvm *kvm;
+ int nr_to_scan = sc->nr_to_scan;
+ unsigned long freed = 0;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ int idx;
+ LIST_HEAD(invalid_list);
+
+ /*
+ * Never scan more than sc->nr_to_scan VM instances.
+ * Will not hit this condition practically since we do not try
+ * to shrink more than one VM and it is very unlikely to see
+ * !n_used_mmu_pages so many times.
+ */
+ if (!nr_to_scan--)
+ break;
+ /*
+ * n_used_mmu_pages is accessed without holding kvm->mmu_lock
+ * here. We may skip a VM instance errorneosly, but we do not
+ * want to shrink a VM that only started to populate its MMU
+ * anyway.
+ */
+ if (!kvm->arch.n_used_mmu_pages &&
+ !kvm_has_zapped_obsolete_pages(kvm))
+ continue;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+
+ if (kvm_has_zapped_obsolete_pages(kvm)) {
+ kvm_mmu_commit_zap_page(kvm,
+ &kvm->arch.zapped_obsolete_pages);
+ goto unlock;
+ }
+
+ if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
+ freed++;
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
+unlock:
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ /*
+ * unfair on small ones
+ * per-vm shrinkers cry out
+ * sadness comes quickly
+ */
+ list_move_tail(&kvm->vm_list, &vm_list);
+ break;
+ }
+
+ mutex_unlock(&kvm_lock);
+ return freed;
+}
+
+static unsigned long
+mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
+}
+
+static struct shrinker mmu_shrinker = {
+ .count_objects = mmu_shrink_count,
+ .scan_objects = mmu_shrink_scan,
+ .seeks = DEFAULT_SEEKS * 10,
+};
+
+static void mmu_destroy_caches(void)
+{
+ kmem_cache_destroy(pte_list_desc_cache);
+ kmem_cache_destroy(mmu_page_header_cache);
+}
+
+static bool get_nx_auto_mode(void)
+{
+ /* Return true when CPU has the bug, and mitigations are ON */
+ return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
+}
+
+static void __set_nx_huge_pages(bool val)
+{
+ nx_huge_pages = itlb_multihit_kvm_mitigation = val;
+}
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
+{
+ bool old_val = nx_huge_pages;
+ bool new_val;
+
+ /* In "auto" mode deploy workaround only if CPU has the bug. */
+ if (sysfs_streq(val, "off"))
+ new_val = 0;
+ else if (sysfs_streq(val, "force"))
+ new_val = 1;
+ else if (sysfs_streq(val, "auto"))
+ new_val = get_nx_auto_mode();
+ else if (strtobool(val, &new_val) < 0)
+ return -EINVAL;
+
+ __set_nx_huge_pages(new_val);
+
+ if (new_val != old_val) {
+ struct kvm *kvm;
+ int idx;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ idx = srcu_read_lock(&kvm->srcu);
+ kvm_mmu_invalidate_zap_all_pages(kvm);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+ }
+ mutex_unlock(&kvm_lock);
+ }
+
+ return 0;
+}
+
+static void kvm_set_mmio_spte_mask(void)
+{
+ u64 mask;
+
+ /*
+ * Set a reserved PA bit in MMIO SPTEs to generate page faults with
+ * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT
+ * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
+ * 52-bit physical addresses then there are no reserved PA bits in the
+ * PTEs and so the reserved PA approach must be disabled.
+ */
+ if (shadow_phys_bits < 52)
+ mask = BIT_ULL(51) | PT_PRESENT_MASK;
+ else
+ mask = 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask);
+}
+
+int kvm_mmu_module_init(void)
+{
+ int ret = -ENOMEM;
+
+ if (nx_huge_pages == -1)
+ __set_nx_huge_pages(get_nx_auto_mode());
+
+ kvm_mmu_reset_all_pte_masks();
+
+ kvm_set_mmio_spte_mask();
+
+ pte_list_desc_cache = kmem_cache_create("pte_list_desc",
+ sizeof(struct pte_list_desc),
+ 0, SLAB_ACCOUNT, NULL);
+ if (!pte_list_desc_cache)
+ goto out;
+
+ mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
+ sizeof(struct kvm_mmu_page),
+ 0, SLAB_ACCOUNT, NULL);
+ if (!mmu_page_header_cache)
+ goto out;
+
+ if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
+ goto out;
+
+ ret = register_shrinker(&mmu_shrinker);
+ if (ret)
+ goto out;
+
+ return 0;
+
+out:
+ mmu_destroy_caches();
+ return ret;
+}
+
+/*
+ * Caculate mmu pages needed for kvm.
+ */
+unsigned long kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
+{
+ unsigned long nr_mmu_pages;
+ unsigned long nr_pages = 0;
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ int i;
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+
+ kvm_for_each_memslot(memslot, slots)
+ nr_pages += memslot->npages;
+ }
+
+ nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
+ nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
+
+ return nr_mmu_pages;
+}
+
+void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+ kvm_mmu_unload(vcpu);
+ free_mmu_pages(vcpu);
+ mmu_free_memory_caches(vcpu);
+}
+
+void kvm_mmu_module_exit(void)
+{
+ mmu_destroy_caches();
+ percpu_counter_destroy(&kvm_total_used_mmu_pages);
+ unregister_shrinker(&mmu_shrinker);
+ mmu_audit_disable();
+}
+
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
+{
+ unsigned int old_val;
+ int err;
+
+ old_val = nx_huge_pages_recovery_ratio;
+ err = param_set_uint(val, kp);
+ if (err)
+ return err;
+
+ if (READ_ONCE(nx_huge_pages) &&
+ !old_val && nx_huge_pages_recovery_ratio) {
+ struct kvm *kvm;
+
+ mutex_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+
+ mutex_unlock(&kvm_lock);
+ }
+
+ return err;
+}
+
+static void kvm_recover_nx_lpages(struct kvm *kvm)
+{
+ int rcu_idx;
+ struct kvm_mmu_page *sp;
+ unsigned int ratio;
+ LIST_HEAD(invalid_list);
+ ulong to_zap;
+
+ rcu_idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+
+ ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+ to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
+ while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
+ /*
+ * We use a separate list instead of just using active_mmu_pages
+ * because the number of lpage_disallowed pages is expected to
+ * be relatively small compared to the total.
+ */
+ sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
+ struct kvm_mmu_page,
+ lpage_disallowed_link);
+ WARN_ON_ONCE(!sp->lpage_disallowed);
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ WARN_ON_ONCE(sp->lpage_disallowed);
+
+ if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ if (to_zap)
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+ }
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, rcu_idx);
+}
+
+static long get_nx_lpage_recovery_timeout(u64 start_time)
+{
+ return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
+ ? start_time + 60 * HZ - get_jiffies_64()
+ : MAX_SCHEDULE_TIMEOUT;
+}
+
+static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
+{
+ u64 start_time;
+ long remaining_time;
+
+ while (true) {
+ start_time = get_jiffies_64();
+ remaining_time = get_nx_lpage_recovery_timeout(start_time);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ while (!kthread_should_stop() && remaining_time > 0) {
+ schedule_timeout(remaining_time);
+ remaining_time = get_nx_lpage_recovery_timeout(start_time);
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+
+ set_current_state(TASK_RUNNING);
+
+ if (kthread_should_stop())
+ return 0;
+
+ kvm_recover_nx_lpages(kvm);
+ }
+}
+
+int kvm_mmu_post_init_vm(struct kvm *kvm)
+{
+ int err;
+
+ err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
+ "kvm-nx-lpage-recovery",
+ &kvm->arch.nx_lpage_recovery_thread);
+ if (!err)
+ kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
+
+ return err;
+}
+
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
+{
+ if (kvm->arch.nx_lpage_recovery_thread)
+ kthread_stop(kvm->arch.nx_lpage_recovery_thread);
+}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
new file mode 100644
index 000000000..05a02b8ac
--- /dev/null
+++ b/arch/x86/kvm/mmu.h
@@ -0,0 +1,223 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_MMU_H
+#define __KVM_X86_MMU_H
+
+#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
+
+#define PT64_PT_BITS 9
+#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
+#define PT32_PT_BITS 10
+#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
+
+#define PT_WRITABLE_SHIFT 1
+#define PT_USER_SHIFT 2
+
+#define PT_PRESENT_MASK (1ULL << 0)
+#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
+#define PT_USER_MASK (1ULL << PT_USER_SHIFT)
+#define PT_PWT_MASK (1ULL << 3)
+#define PT_PCD_MASK (1ULL << 4)
+#define PT_ACCESSED_SHIFT 5
+#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
+#define PT_DIRTY_SHIFT 6
+#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT)
+#define PT_PAGE_SIZE_SHIFT 7
+#define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT)
+#define PT_PAT_MASK (1ULL << 7)
+#define PT_GLOBAL_MASK (1ULL << 8)
+#define PT64_NX_SHIFT 63
+#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
+
+#define PT_PAT_SHIFT 7
+#define PT_DIR_PAT_SHIFT 12
+#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
+
+#define PT32_DIR_PSE36_SIZE 4
+#define PT32_DIR_PSE36_SHIFT 13
+#define PT32_DIR_PSE36_MASK \
+ (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
+
+#define PT64_ROOT_5LEVEL 5
+#define PT64_ROOT_4LEVEL 4
+#define PT32_ROOT_LEVEL 2
+#define PT32E_ROOT_LEVEL 3
+
+#define PT_PDPE_LEVEL 3
+#define PT_DIRECTORY_LEVEL 2
+#define PT_PAGE_TABLE_LEVEL 1
+#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1)
+
+static inline u64 rsvd_bits(int s, int e)
+{
+ if (e < s)
+ return 0;
+
+ return ((2ULL << (e - s)) - 1) << s;
+}
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value);
+
+void
+reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+
+void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots);
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
+ bool accessed_dirty, gpa_t new_eptp);
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
+int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
+ u64 fault_address, char *insn, int insn_len);
+
+static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
+{
+ if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
+ return kvm->arch.n_max_mmu_pages -
+ kvm->arch.n_used_mmu_pages;
+
+ return 0;
+}
+
+static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
+{
+ if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
+ return 0;
+
+ return kvm_mmu_load(vcpu);
+}
+
+static inline unsigned long kvm_get_pcid(struct kvm_vcpu *vcpu, gpa_t cr3)
+{
+ BUILD_BUG_ON((X86_CR3_PCID_MASK & PAGE_MASK) != 0);
+
+ return kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)
+ ? cr3 & X86_CR3_PCID_MASK
+ : 0;
+}
+
+static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu)
+{
+ return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu));
+}
+
+static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu)
+{
+ if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa |
+ kvm_get_active_pcid(vcpu));
+}
+
+/*
+ * Currently, we have two sorts of write-protection, a) the first one
+ * write-protects guest page to sync the guest modification, b) another one is
+ * used to sync dirty bitmap when we do KVM_GET_DIRTY_LOG. The differences
+ * between these two sorts are:
+ * 1) the first case clears SPTE_MMU_WRITEABLE bit.
+ * 2) the first case requires flushing tlb immediately avoiding corrupting
+ * shadow page table between all vcpus so it should be in the protection of
+ * mmu-lock. And the another case does not need to flush tlb until returning
+ * the dirty bitmap to userspace since it only write-protects the page
+ * logged in the bitmap, that means the page in the dirty bitmap is not
+ * missed, so it can flush tlb out of mmu-lock.
+ *
+ * So, there is the problem: the first case can meet the corrupted tlb caused
+ * by another case which write-protects pages but without flush tlb
+ * immediately. In order to making the first case be aware this problem we let
+ * it flush tlb if we try to write-protect a spte whose SPTE_MMU_WRITEABLE bit
+ * is set, it works since another case never touches SPTE_MMU_WRITEABLE bit.
+ *
+ * Anyway, whenever a spte is updated (only permission and status bits are
+ * changed) we need to check whether the spte with SPTE_MMU_WRITEABLE becomes
+ * readonly, if that happens, we need to flush tlb. Fortunately,
+ * mmu_spte_update() has already handled it perfectly.
+ *
+ * The rules to use SPTE_MMU_WRITEABLE and PT_WRITABLE_MASK:
+ * - if we want to see if it has writable tlb entry or if the spte can be
+ * writable on the mmu mapping, check SPTE_MMU_WRITEABLE, this is the most
+ * case, otherwise
+ * - if we fix page fault on the spte or do write-protection by dirty logging,
+ * check PT_WRITABLE_MASK.
+ *
+ * TODO: introduce APIs to split these two cases.
+ */
+static inline int is_writable_pte(unsigned long pte)
+{
+ return pte & PT_WRITABLE_MASK;
+}
+
+static inline bool is_write_protection(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
+}
+
+/*
+ * Check if a given access (described through the I/D, W/R and U/S bits of a
+ * page fault error code pfec) causes a permission fault with the given PTE
+ * access rights (in ACC_* format).
+ *
+ * Return zero if the access does not fault; return the page fault error code
+ * if the access faults.
+ */
+static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ unsigned pte_access, unsigned pte_pkey,
+ unsigned pfec)
+{
+ int cpl = kvm_x86_ops->get_cpl(vcpu);
+ unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
+
+ /*
+ * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1.
+ *
+ * If CPL = 3, SMAP applies to all supervisor-mode data accesses
+ * (these are implicit supervisor accesses) regardless of the value
+ * of EFLAGS.AC.
+ *
+ * This computes (cpl < 3) && (rflags & X86_EFLAGS_AC), leaving
+ * the result in X86_EFLAGS_AC. We then insert it in place of
+ * the PFERR_RSVD_MASK bit; this bit will always be zero in pfec,
+ * but it will be one in index if SMAP checks are being overridden.
+ * It is important to keep this branchless.
+ */
+ unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC);
+ int index = (pfec >> 1) +
+ (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1));
+ bool fault = (mmu->permissions[index] >> pte_access) & 1;
+ u32 errcode = PFERR_PRESENT_MASK;
+
+ WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK));
+ if (unlikely(mmu->pkru_mask)) {
+ u32 pkru_bits, offset;
+
+ /*
+ * PKRU defines 32 bits, there are 16 domains and 2
+ * attribute bits per domain in pkru. pte_pkey is the
+ * index of the protection domain, so pte_pkey * 2 is
+ * is the index of the first bit for the domain.
+ */
+ pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;
+
+ /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
+ offset = (pfec & ~1) +
+ ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));
+
+ pkru_bits &= mmu->pkru_mask >> offset;
+ errcode |= -pkru_bits & PFERR_PK_MASK;
+ fault |= (pkru_bits != 0);
+ }
+
+ return -(u32)fault & errcode;
+}
+
+void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
+
+void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
+bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+ struct kvm_memory_slot *slot, u64 gfn);
+int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
+
+int kvm_mmu_post_init_vm(struct kvm *kvm);
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
+
+#endif
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
new file mode 100644
index 000000000..1272861e7
--- /dev/null
+++ b/arch/x86/kvm/mmu_audit.c
@@ -0,0 +1,306 @@
+/*
+ * mmu_audit.c:
+ *
+ * Audit code for KVM MMU
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ * Marcelo Tosatti <mtosatti@redhat.com>
+ * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/ratelimit.h>
+
+static char const *audit_point_name[] = {
+ "pre page fault",
+ "post page fault",
+ "pre pte write",
+ "post pte write",
+ "pre sync",
+ "post sync"
+};
+
+#define audit_printk(kvm, fmt, args...) \
+ printk(KERN_ERR "audit: (%s) error: " \
+ fmt, audit_point_name[kvm->arch.audit_point], ##args)
+
+typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
+
+static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ inspect_spte_fn fn, int level)
+{
+ int i;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ u64 *ent = sp->spt;
+
+ fn(vcpu, ent + i, level);
+
+ if (is_shadow_present_pte(ent[i]) &&
+ !is_last_spte(ent[i], level)) {
+ struct kvm_mmu_page *child;
+
+ child = page_header(ent[i] & PT64_BASE_ADDR_MASK);
+ __mmu_spte_walk(vcpu, child, fn, level - 1);
+ }
+ }
+}
+
+static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+
+ if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+
+ sp = page_header(root);
+ __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu.root_level);
+ return;
+ }
+
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root && VALID_PAGE(root)) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ __mmu_spte_walk(vcpu, sp, fn, 2);
+ }
+ }
+
+ return;
+}
+
+typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
+{
+ struct kvm_mmu_page *sp;
+
+ list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
+ fn(kvm, sp);
+}
+
+static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+ struct kvm_mmu_page *sp;
+ gfn_t gfn;
+ kvm_pfn_t pfn;
+ hpa_t hpa;
+
+ sp = page_header(__pa(sptep));
+
+ if (sp->unsync) {
+ if (level != PT_PAGE_TABLE_LEVEL) {
+ audit_printk(vcpu->kvm, "unsync sp: %p "
+ "level = %d\n", sp, level);
+ return;
+ }
+ }
+
+ if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
+ return;
+
+ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+ pfn = kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn);
+
+ if (is_error_pfn(pfn))
+ return;
+
+ hpa = pfn << PAGE_SHIFT;
+ if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
+ audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
+ "ent %llxn", vcpu->arch.mmu.root_level, pfn,
+ hpa, *sptep);
+}
+
+static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
+{
+ static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
+ struct kvm_rmap_head *rmap_head;
+ struct kvm_mmu_page *rev_sp;
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ gfn_t gfn;
+
+ rev_sp = page_header(__pa(sptep));
+ gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
+
+ slots = kvm_memslots_for_spte_role(kvm, rev_sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+ if (!slot) {
+ if (!__ratelimit(&ratelimit_state))
+ return;
+ audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
+ audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
+ (long int)(sptep - rev_sp->spt), rev_sp->gfn);
+ dump_stack();
+ return;
+ }
+
+ rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot);
+ if (!rmap_head->val) {
+ if (!__ratelimit(&ratelimit_state))
+ return;
+ audit_printk(kvm, "no rmap for writable spte %llx\n",
+ *sptep);
+ dump_stack();
+ }
+}
+
+static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+ if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
+ inspect_spte_has_rmap(vcpu->kvm, sptep);
+}
+
+static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+ if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
+ audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
+ "root.\n", sp);
+}
+
+static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ int i;
+
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+ return;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ if (!is_shadow_present_pte(sp->spt[i]))
+ continue;
+
+ inspect_spte_has_rmap(kvm, sp->spt + i);
+ }
+}
+
+static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ struct kvm_rmap_head *rmap_head;
+ u64 *sptep;
+ struct rmap_iterator iter;
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+
+ if (sp->role.direct || sp->unsync || sp->role.invalid)
+ return;
+
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, sp->gfn);
+ rmap_head = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot);
+
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
+ if (is_writable_pte(*sptep))
+ audit_printk(kvm, "shadow page has writable "
+ "mappings: gfn %llx role %x\n",
+ sp->gfn, sp->role.word);
+ }
+}
+
+static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ check_mappings_rmap(kvm, sp);
+ audit_write_protection(kvm, sp);
+}
+
+static void audit_all_active_sps(struct kvm *kvm)
+{
+ walk_all_active_sps(kvm, audit_sp);
+}
+
+static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+ audit_sptes_have_rmaps(vcpu, sptep, level);
+ audit_mappings(vcpu, sptep, level);
+ audit_spte_after_sync(vcpu, sptep, level);
+}
+
+static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
+{
+ mmu_spte_walk(vcpu, audit_spte);
+}
+
+static bool mmu_audit;
+static struct static_key mmu_audit_key;
+
+static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
+{
+ static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
+
+ if (!__ratelimit(&ratelimit_state))
+ return;
+
+ vcpu->kvm->arch.audit_point = point;
+ audit_all_active_sps(vcpu->kvm);
+ audit_vcpu_spte(vcpu);
+}
+
+static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
+{
+ if (static_key_false((&mmu_audit_key)))
+ __kvm_mmu_audit(vcpu, point);
+}
+
+static void mmu_audit_enable(void)
+{
+ if (mmu_audit)
+ return;
+
+ static_key_slow_inc(&mmu_audit_key);
+ mmu_audit = true;
+}
+
+static void mmu_audit_disable(void)
+{
+ if (!mmu_audit)
+ return;
+
+ static_key_slow_dec(&mmu_audit_key);
+ mmu_audit = false;
+}
+
+static int mmu_audit_set(const char *val, const struct kernel_param *kp)
+{
+ int ret;
+ unsigned long enable;
+
+ ret = kstrtoul(val, 10, &enable);
+ if (ret < 0)
+ return -EINVAL;
+
+ switch (enable) {
+ case 0:
+ mmu_audit_disable();
+ break;
+ case 1:
+ mmu_audit_enable();
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct kernel_param_ops audit_param_ops = {
+ .set = mmu_audit_set,
+ .get = param_get_bool,
+};
+
+arch_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
new file mode 100644
index 000000000..7e0dc8c7d
--- /dev/null
+++ b/arch/x86/kvm/mmutrace.h
@@ -0,0 +1,395 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVMMMU_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_events.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvmmmu
+
+#define KVM_MMU_PAGE_FIELDS \
+ __field(unsigned long, mmu_valid_gen) \
+ __field(__u64, gfn) \
+ __field(__u32, role) \
+ __field(__u32, root_count) \
+ __field(bool, unsync)
+
+#define KVM_MMU_PAGE_ASSIGN(sp) \
+ __entry->mmu_valid_gen = sp->mmu_valid_gen; \
+ __entry->gfn = sp->gfn; \
+ __entry->role = sp->role.word; \
+ __entry->root_count = sp->root_count; \
+ __entry->unsync = sp->unsync;
+
+#define KVM_MMU_PAGE_PRINTK() ({ \
+ const char *saved_ptr = trace_seq_buffer_ptr(p); \
+ static const char *access_str[] = { \
+ "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \
+ }; \
+ union kvm_mmu_page_role role; \
+ \
+ role.word = __entry->role; \
+ \
+ trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s" \
+ " %snxe %sad root %u %s%c", \
+ __entry->mmu_valid_gen, \
+ __entry->gfn, role.level, \
+ role.cr4_pae ? " pae" : "", \
+ role.quadrant, \
+ role.direct ? " direct" : "", \
+ access_str[role.access], \
+ role.invalid ? " invalid" : "", \
+ role.nxe ? "" : "!", \
+ role.ad_disabled ? "!" : "", \
+ __entry->root_count, \
+ __entry->unsync ? "unsync" : "sync", 0); \
+ saved_ptr; \
+ })
+
+#define kvm_mmu_trace_pferr_flags \
+ { PFERR_PRESENT_MASK, "P" }, \
+ { PFERR_WRITE_MASK, "W" }, \
+ { PFERR_USER_MASK, "U" }, \
+ { PFERR_RSVD_MASK, "RSVD" }, \
+ { PFERR_FETCH_MASK, "F" }
+
+/*
+ * A pagetable walk has started
+ */
+TRACE_EVENT(
+ kvm_mmu_pagetable_walk,
+ TP_PROTO(u64 addr, u32 pferr),
+ TP_ARGS(addr, pferr),
+
+ TP_STRUCT__entry(
+ __field(__u64, addr)
+ __field(__u32, pferr)
+ ),
+
+ TP_fast_assign(
+ __entry->addr = addr;
+ __entry->pferr = pferr;
+ ),
+
+ TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
+ __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
+);
+
+
+/* We just walked a paging element */
+TRACE_EVENT(
+ kvm_mmu_paging_element,
+ TP_PROTO(u64 pte, int level),
+ TP_ARGS(pte, level),
+
+ TP_STRUCT__entry(
+ __field(__u64, pte)
+ __field(__u32, level)
+ ),
+
+ TP_fast_assign(
+ __entry->pte = pte;
+ __entry->level = level;
+ ),
+
+ TP_printk("pte %llx level %u", __entry->pte, __entry->level)
+);
+
+DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class,
+
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+ TP_ARGS(table_gfn, index, size),
+
+ TP_STRUCT__entry(
+ __field(__u64, gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
+ + index * size;
+ ),
+
+ TP_printk("gpa %llx", __entry->gpa)
+);
+
+/* We set a pte accessed bit */
+DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit,
+
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+ TP_ARGS(table_gfn, index, size)
+);
+
+/* We set a pte dirty bit */
+DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit,
+
+ TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+ TP_ARGS(table_gfn, index, size)
+);
+
+TRACE_EVENT(
+ kvm_mmu_walker_error,
+ TP_PROTO(u32 pferr),
+ TP_ARGS(pferr),
+
+ TP_STRUCT__entry(
+ __field(__u32, pferr)
+ ),
+
+ TP_fast_assign(
+ __entry->pferr = pferr;
+ ),
+
+ TP_printk("pferr %x %s", __entry->pferr,
+ __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags))
+);
+
+TRACE_EVENT(
+ kvm_mmu_get_page,
+ TP_PROTO(struct kvm_mmu_page *sp, bool created),
+ TP_ARGS(sp, created),
+
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ __field(bool, created)
+ ),
+
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ __entry->created = created;
+ ),
+
+ TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(),
+ __entry->created ? "new" : "existing")
+);
+
+DECLARE_EVENT_CLASS(kvm_mmu_page_class,
+
+ TP_PROTO(struct kvm_mmu_page *sp),
+ TP_ARGS(sp),
+
+ TP_STRUCT__entry(
+ KVM_MMU_PAGE_FIELDS
+ ),
+
+ TP_fast_assign(
+ KVM_MMU_PAGE_ASSIGN(sp)
+ ),
+
+ TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+);
+
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+TRACE_EVENT(
+ mark_mmio_spte,
+ TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access, unsigned int gen),
+ TP_ARGS(sptep, gfn, access, gen),
+
+ TP_STRUCT__entry(
+ __field(void *, sptep)
+ __field(gfn_t, gfn)
+ __field(unsigned, access)
+ __field(unsigned int, gen)
+ ),
+
+ TP_fast_assign(
+ __entry->sptep = sptep;
+ __entry->gfn = gfn;
+ __entry->access = access;
+ __entry->gen = gen;
+ ),
+
+ TP_printk("sptep:%p gfn %llx access %x gen %x", __entry->sptep,
+ __entry->gfn, __entry->access, __entry->gen)
+);
+
+TRACE_EVENT(
+ handle_mmio_page_fault,
+ TP_PROTO(u64 addr, gfn_t gfn, unsigned access),
+ TP_ARGS(addr, gfn, access),
+
+ TP_STRUCT__entry(
+ __field(u64, addr)
+ __field(gfn_t, gfn)
+ __field(unsigned, access)
+ ),
+
+ TP_fast_assign(
+ __entry->addr = addr;
+ __entry->gfn = gfn;
+ __entry->access = access;
+ ),
+
+ TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
+ __entry->access)
+);
+
+#define __spte_satisfied(__spte) \
+ (__entry->retry && is_writable_pte(__entry->__spte))
+
+TRACE_EVENT(
+ fast_page_fault,
+ TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code,
+ u64 *sptep, u64 old_spte, bool retry),
+ TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(gpa_t, cr2_or_gpa)
+ __field(u32, error_code)
+ __field(u64 *, sptep)
+ __field(u64, old_spte)
+ __field(u64, new_spte)
+ __field(bool, retry)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->cr2_or_gpa = cr2_or_gpa;
+ __entry->error_code = error_code;
+ __entry->sptep = sptep;
+ __entry->old_spte = old_spte;
+ __entry->new_spte = *sptep;
+ __entry->retry = retry;
+ ),
+
+ TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx"
+ " new %llx spurious %d fixed %d", __entry->vcpu_id,
+ __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|",
+ kvm_mmu_trace_pferr_flags), __entry->sptep,
+ __entry->old_spte, __entry->new_spte,
+ __spte_satisfied(old_spte), __spte_satisfied(new_spte)
+ )
+);
+
+TRACE_EVENT(
+ kvm_mmu_invalidate_zap_all_pages,
+ TP_PROTO(struct kvm *kvm),
+ TP_ARGS(kvm),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, mmu_valid_gen)
+ __field(unsigned int, mmu_used_pages)
+ ),
+
+ TP_fast_assign(
+ __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
+ __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
+ ),
+
+ TP_printk("kvm-mmu-valid-gen %lx used_pages %x",
+ __entry->mmu_valid_gen, __entry->mmu_used_pages
+ )
+);
+
+
+TRACE_EVENT(
+ check_mmio_spte,
+ TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),
+ TP_ARGS(spte, kvm_gen, spte_gen),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, kvm_gen)
+ __field(unsigned int, spte_gen)
+ __field(u64, spte)
+ ),
+
+ TP_fast_assign(
+ __entry->kvm_gen = kvm_gen;
+ __entry->spte_gen = spte_gen;
+ __entry->spte = spte;
+ ),
+
+ TP_printk("spte %llx kvm_gen %x spte-gen %x valid %d", __entry->spte,
+ __entry->kvm_gen, __entry->spte_gen,
+ __entry->kvm_gen == __entry->spte_gen
+ )
+);
+
+TRACE_EVENT(
+ kvm_mmu_set_spte,
+ TP_PROTO(int level, gfn_t gfn, u64 *sptep),
+ TP_ARGS(level, gfn, sptep),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, spte)
+ __field(u64, sptep)
+ __field(u8, level)
+ /* These depend on page entry type, so compute them now. */
+ __field(bool, r)
+ __field(bool, x)
+ __field(signed char, u)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = gfn;
+ __entry->spte = *sptep;
+ __entry->sptep = virt_to_phys(sptep);
+ __entry->level = level;
+ __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
+ __entry->x = is_executable_pte(__entry->spte);
+ __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
+ ),
+
+ TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
+ __entry->gfn, __entry->spte,
+ __entry->r ? "r" : "-",
+ __entry->spte & PT_WRITABLE_MASK ? "w" : "-",
+ __entry->x ? "x" : "-",
+ __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
+ __entry->level, __entry->sptep
+ )
+);
+
+TRACE_EVENT(
+ kvm_mmu_spte_requested,
+ TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn),
+ TP_ARGS(addr, level, pfn),
+
+ TP_STRUCT__entry(
+ __field(u64, gfn)
+ __field(u64, pfn)
+ __field(u8, level)
+ ),
+
+ TP_fast_assign(
+ __entry->gfn = addr >> PAGE_SHIFT;
+ __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
+ __entry->level = level;
+ ),
+
+ TP_printk("gfn %llx pfn %llx level %d",
+ __entry->gfn, __entry->pfn, __entry->level
+ )
+);
+
+#endif /* _TRACE_KVMMMU_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE mmutrace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
new file mode 100644
index 000000000..fabce8769
--- /dev/null
+++ b/arch/x86/kvm/mtrr.c
@@ -0,0 +1,727 @@
+/*
+ * vMTRR implementation
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ * Marcelo Tosatti <mtosatti@redhat.com>
+ * Paolo Bonzini <pbonzini@redhat.com>
+ * Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/mtrr.h>
+
+#include "cpuid.h"
+#include "mmu.h"
+
+#define IA32_MTRR_DEF_TYPE_E (1ULL << 11)
+#define IA32_MTRR_DEF_TYPE_FE (1ULL << 10)
+#define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff)
+
+static bool msr_mtrr_valid(unsigned msr)
+{
+ switch (msr) {
+ case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
+ case MSR_MTRRfix64K_00000:
+ case MSR_MTRRfix16K_80000:
+ case MSR_MTRRfix16K_A0000:
+ case MSR_MTRRfix4K_C0000:
+ case MSR_MTRRfix4K_C8000:
+ case MSR_MTRRfix4K_D0000:
+ case MSR_MTRRfix4K_D8000:
+ case MSR_MTRRfix4K_E0000:
+ case MSR_MTRRfix4K_E8000:
+ case MSR_MTRRfix4K_F0000:
+ case MSR_MTRRfix4K_F8000:
+ case MSR_MTRRdefType:
+ case MSR_IA32_CR_PAT:
+ return true;
+ }
+ return false;
+}
+
+static bool valid_mtrr_type(unsigned t)
+{
+ return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
+}
+
+bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ int i;
+ u64 mask;
+
+ if (!msr_mtrr_valid(msr))
+ return false;
+
+ if (msr == MSR_IA32_CR_PAT) {
+ return kvm_pat_valid(data);
+ } else if (msr == MSR_MTRRdefType) {
+ if (data & ~0xcff)
+ return false;
+ return valid_mtrr_type(data & 0xff);
+ } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
+ for (i = 0; i < 8 ; i++)
+ if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
+ return false;
+ return true;
+ }
+
+ /* variable MTRRs */
+ WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR));
+
+ mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
+ if ((msr & 1) == 0) {
+ /* MTRR base */
+ if (!valid_mtrr_type(data & 0xff))
+ return false;
+ mask |= 0xf00;
+ } else
+ /* MTRR mask */
+ mask |= 0x7ff;
+ if (data & mask) {
+ kvm_inject_gp(vcpu, 0);
+ return false;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
+
+static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
+{
+ return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_E);
+}
+
+static bool fixed_mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
+{
+ return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_FE);
+}
+
+static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state)
+{
+ return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK;
+}
+
+static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Intel SDM 11.11.2.2: all MTRRs are disabled when
+ * IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC
+ * memory type is applied to all of physical memory.
+ *
+ * However, virtual machines can be run with CPUID such that
+ * there are no MTRRs. In that case, the firmware will never
+ * enable MTRRs and it is obviously undesirable to run the
+ * guest entirely with UC memory and we use WB.
+ */
+ if (guest_cpuid_has(vcpu, X86_FEATURE_MTRR))
+ return MTRR_TYPE_UNCACHABLE;
+ else
+ return MTRR_TYPE_WRBACK;
+}
+
+/*
+* Three terms are used in the following code:
+* - segment, it indicates the address segments covered by fixed MTRRs.
+* - unit, it corresponds to the MSR entry in the segment.
+* - range, a range is covered in one memory cache type.
+*/
+struct fixed_mtrr_segment {
+ u64 start;
+ u64 end;
+
+ int range_shift;
+
+ /* the start position in kvm_mtrr.fixed_ranges[]. */
+ int range_start;
+};
+
+static struct fixed_mtrr_segment fixed_seg_table[] = {
+ /* MSR_MTRRfix64K_00000, 1 unit. 64K fixed mtrr. */
+ {
+ .start = 0x0,
+ .end = 0x80000,
+ .range_shift = 16, /* 64K */
+ .range_start = 0,
+ },
+
+ /*
+ * MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000, 2 units,
+ * 16K fixed mtrr.
+ */
+ {
+ .start = 0x80000,
+ .end = 0xc0000,
+ .range_shift = 14, /* 16K */
+ .range_start = 8,
+ },
+
+ /*
+ * MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000, 8 units,
+ * 4K fixed mtrr.
+ */
+ {
+ .start = 0xc0000,
+ .end = 0x100000,
+ .range_shift = 12, /* 12K */
+ .range_start = 24,
+ }
+};
+
+/*
+ * The size of unit is covered in one MSR, one MSR entry contains
+ * 8 ranges so that unit size is always 8 * 2^range_shift.
+ */
+static u64 fixed_mtrr_seg_unit_size(int seg)
+{
+ return 8 << fixed_seg_table[seg].range_shift;
+}
+
+static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit)
+{
+ switch (msr) {
+ case MSR_MTRRfix64K_00000:
+ *seg = 0;
+ *unit = 0;
+ break;
+ case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
+ *seg = 1;
+ *unit = array_index_nospec(
+ msr - MSR_MTRRfix16K_80000,
+ MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1);
+ break;
+ case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
+ *seg = 2;
+ *unit = array_index_nospec(
+ msr - MSR_MTRRfix4K_C0000,
+ MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1);
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static void fixed_mtrr_seg_unit_range(int seg, int unit, u64 *start, u64 *end)
+{
+ struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+ u64 unit_size = fixed_mtrr_seg_unit_size(seg);
+
+ *start = mtrr_seg->start + unit * unit_size;
+ *end = *start + unit_size;
+ WARN_ON(*end > mtrr_seg->end);
+}
+
+static int fixed_mtrr_seg_unit_range_index(int seg, int unit)
+{
+ struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+
+ WARN_ON(mtrr_seg->start + unit * fixed_mtrr_seg_unit_size(seg)
+ > mtrr_seg->end);
+
+ /* each unit has 8 ranges. */
+ return mtrr_seg->range_start + 8 * unit;
+}
+
+static int fixed_mtrr_seg_end_range_index(int seg)
+{
+ struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+ int n;
+
+ n = (mtrr_seg->end - mtrr_seg->start) >> mtrr_seg->range_shift;
+ return mtrr_seg->range_start + n - 1;
+}
+
+static bool fixed_msr_to_range(u32 msr, u64 *start, u64 *end)
+{
+ int seg, unit;
+
+ if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
+ return false;
+
+ fixed_mtrr_seg_unit_range(seg, unit, start, end);
+ return true;
+}
+
+static int fixed_msr_to_range_index(u32 msr)
+{
+ int seg, unit;
+
+ if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
+ return -1;
+
+ return fixed_mtrr_seg_unit_range_index(seg, unit);
+}
+
+static int fixed_mtrr_addr_to_seg(u64 addr)
+{
+ struct fixed_mtrr_segment *mtrr_seg;
+ int seg, seg_num = ARRAY_SIZE(fixed_seg_table);
+
+ for (seg = 0; seg < seg_num; seg++) {
+ mtrr_seg = &fixed_seg_table[seg];
+ if (mtrr_seg->start <= addr && addr < mtrr_seg->end)
+ return seg;
+ }
+
+ return -1;
+}
+
+static int fixed_mtrr_addr_seg_to_range_index(u64 addr, int seg)
+{
+ struct fixed_mtrr_segment *mtrr_seg;
+ int index;
+
+ mtrr_seg = &fixed_seg_table[seg];
+ index = mtrr_seg->range_start;
+ index += (addr - mtrr_seg->start) >> mtrr_seg->range_shift;
+ return index;
+}
+
+static u64 fixed_mtrr_range_end_addr(int seg, int index)
+{
+ struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+ int pos = index - mtrr_seg->range_start;
+
+ return mtrr_seg->start + ((pos + 1) << mtrr_seg->range_shift);
+}
+
+static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end)
+{
+ u64 mask;
+
+ *start = range->base & PAGE_MASK;
+
+ mask = range->mask & PAGE_MASK;
+
+ /* This cannot overflow because writing to the reserved bits of
+ * variable MTRRs causes a #GP.
+ */
+ *end = (*start | ~mask) + 1;
+}
+
+static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ gfn_t start, end;
+ int index;
+
+ if (msr == MSR_IA32_CR_PAT || !tdp_enabled ||
+ !kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ return;
+
+ if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType)
+ return;
+
+ /* fixed MTRRs. */
+ if (fixed_msr_to_range(msr, &start, &end)) {
+ if (!fixed_mtrr_is_enabled(mtrr_state))
+ return;
+ } else if (msr == MSR_MTRRdefType) {
+ start = 0x0;
+ end = ~0ULL;
+ } else {
+ /* variable range MTRRs. */
+ index = (msr - 0x200) / 2;
+ var_mtrr_range(&mtrr_state->var_ranges[index], &start, &end);
+ }
+
+ kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
+}
+
+static bool var_mtrr_range_is_valid(struct kvm_mtrr_range *range)
+{
+ return (range->mask & (1 << 11)) != 0;
+}
+
+static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ struct kvm_mtrr_range *tmp, *cur;
+ int index, is_mtrr_mask;
+
+ index = (msr - 0x200) / 2;
+ is_mtrr_mask = msr - 0x200 - 2 * index;
+ cur = &mtrr_state->var_ranges[index];
+
+ /* remove the entry if it's in the list. */
+ if (var_mtrr_range_is_valid(cur))
+ list_del(&mtrr_state->var_ranges[index].node);
+
+ /* Extend the mask with all 1 bits to the left, since those
+ * bits must implicitly be 0. The bits are then cleared
+ * when reading them.
+ */
+ if (!is_mtrr_mask)
+ cur->base = data;
+ else
+ cur->mask = data | (-1LL << cpuid_maxphyaddr(vcpu));
+
+ /* add it to the list if it's enabled. */
+ if (var_mtrr_range_is_valid(cur)) {
+ list_for_each_entry(tmp, &mtrr_state->head, node)
+ if (cur->base >= tmp->base)
+ break;
+ list_add_tail(&cur->node, &tmp->node);
+ }
+}
+
+int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ int index;
+
+ if (!kvm_mtrr_valid(vcpu, msr, data))
+ return 1;
+
+ index = fixed_msr_to_range_index(msr);
+ if (index >= 0)
+ *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data;
+ else if (msr == MSR_MTRRdefType)
+ vcpu->arch.mtrr_state.deftype = data;
+ else if (msr == MSR_IA32_CR_PAT)
+ vcpu->arch.pat = data;
+ else
+ set_var_mtrr_msr(vcpu, msr, data);
+
+ update_mtrr(vcpu, msr);
+ return 0;
+}
+
+int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ int index;
+
+ /* MSR_MTRRcap is a readonly MSR. */
+ if (msr == MSR_MTRRcap) {
+ /*
+ * SMRR = 0
+ * WC = 1
+ * FIX = 1
+ * VCNT = KVM_NR_VAR_MTRR
+ */
+ *pdata = 0x500 | KVM_NR_VAR_MTRR;
+ return 0;
+ }
+
+ if (!msr_mtrr_valid(msr))
+ return 1;
+
+ index = fixed_msr_to_range_index(msr);
+ if (index >= 0)
+ *pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index];
+ else if (msr == MSR_MTRRdefType)
+ *pdata = vcpu->arch.mtrr_state.deftype;
+ else if (msr == MSR_IA32_CR_PAT)
+ *pdata = vcpu->arch.pat;
+ else { /* Variable MTRRs */
+ int is_mtrr_mask;
+
+ index = (msr - 0x200) / 2;
+ is_mtrr_mask = msr - 0x200 - 2 * index;
+ if (!is_mtrr_mask)
+ *pdata = vcpu->arch.mtrr_state.var_ranges[index].base;
+ else
+ *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask;
+
+ *pdata &= (1ULL << cpuid_maxphyaddr(vcpu)) - 1;
+ }
+
+ return 0;
+}
+
+void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu)
+{
+ INIT_LIST_HEAD(&vcpu->arch.mtrr_state.head);
+}
+
+struct mtrr_iter {
+ /* input fields. */
+ struct kvm_mtrr *mtrr_state;
+ u64 start;
+ u64 end;
+
+ /* output fields. */
+ int mem_type;
+ /* mtrr is completely disabled? */
+ bool mtrr_disabled;
+ /* [start, end) is not fully covered in MTRRs? */
+ bool partial_map;
+
+ /* private fields. */
+ union {
+ /* used for fixed MTRRs. */
+ struct {
+ int index;
+ int seg;
+ };
+
+ /* used for var MTRRs. */
+ struct {
+ struct kvm_mtrr_range *range;
+ /* max address has been covered in var MTRRs. */
+ u64 start_max;
+ };
+ };
+
+ bool fixed;
+};
+
+static bool mtrr_lookup_fixed_start(struct mtrr_iter *iter)
+{
+ int seg, index;
+
+ if (!fixed_mtrr_is_enabled(iter->mtrr_state))
+ return false;
+
+ seg = fixed_mtrr_addr_to_seg(iter->start);
+ if (seg < 0)
+ return false;
+
+ iter->fixed = true;
+ index = fixed_mtrr_addr_seg_to_range_index(iter->start, seg);
+ iter->index = index;
+ iter->seg = seg;
+ return true;
+}
+
+static bool match_var_range(struct mtrr_iter *iter,
+ struct kvm_mtrr_range *range)
+{
+ u64 start, end;
+
+ var_mtrr_range(range, &start, &end);
+ if (!(start >= iter->end || end <= iter->start)) {
+ iter->range = range;
+
+ /*
+ * the function is called when we do kvm_mtrr.head walking.
+ * Range has the minimum base address which interleaves
+ * [looker->start_max, looker->end).
+ */
+ iter->partial_map |= iter->start_max < start;
+
+ /* update the max address has been covered. */
+ iter->start_max = max(iter->start_max, end);
+ return true;
+ }
+
+ return false;
+}
+
+static void __mtrr_lookup_var_next(struct mtrr_iter *iter)
+{
+ struct kvm_mtrr *mtrr_state = iter->mtrr_state;
+
+ list_for_each_entry_continue(iter->range, &mtrr_state->head, node)
+ if (match_var_range(iter, iter->range))
+ return;
+
+ iter->range = NULL;
+ iter->partial_map |= iter->start_max < iter->end;
+}
+
+static void mtrr_lookup_var_start(struct mtrr_iter *iter)
+{
+ struct kvm_mtrr *mtrr_state = iter->mtrr_state;
+
+ iter->fixed = false;
+ iter->start_max = iter->start;
+ iter->range = NULL;
+ iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node);
+
+ __mtrr_lookup_var_next(iter);
+}
+
+static void mtrr_lookup_fixed_next(struct mtrr_iter *iter)
+{
+ /* terminate the lookup. */
+ if (fixed_mtrr_range_end_addr(iter->seg, iter->index) >= iter->end) {
+ iter->fixed = false;
+ iter->range = NULL;
+ return;
+ }
+
+ iter->index++;
+
+ /* have looked up for all fixed MTRRs. */
+ if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges))
+ return mtrr_lookup_var_start(iter);
+
+ /* switch to next segment. */
+ if (iter->index > fixed_mtrr_seg_end_range_index(iter->seg))
+ iter->seg++;
+}
+
+static void mtrr_lookup_var_next(struct mtrr_iter *iter)
+{
+ __mtrr_lookup_var_next(iter);
+}
+
+static void mtrr_lookup_start(struct mtrr_iter *iter)
+{
+ if (!mtrr_is_enabled(iter->mtrr_state)) {
+ iter->mtrr_disabled = true;
+ return;
+ }
+
+ if (!mtrr_lookup_fixed_start(iter))
+ mtrr_lookup_var_start(iter);
+}
+
+static void mtrr_lookup_init(struct mtrr_iter *iter,
+ struct kvm_mtrr *mtrr_state, u64 start, u64 end)
+{
+ iter->mtrr_state = mtrr_state;
+ iter->start = start;
+ iter->end = end;
+ iter->mtrr_disabled = false;
+ iter->partial_map = false;
+ iter->fixed = false;
+ iter->range = NULL;
+
+ mtrr_lookup_start(iter);
+}
+
+static bool mtrr_lookup_okay(struct mtrr_iter *iter)
+{
+ if (iter->fixed) {
+ iter->mem_type = iter->mtrr_state->fixed_ranges[iter->index];
+ return true;
+ }
+
+ if (iter->range) {
+ iter->mem_type = iter->range->base & 0xff;
+ return true;
+ }
+
+ return false;
+}
+
+static void mtrr_lookup_next(struct mtrr_iter *iter)
+{
+ if (iter->fixed)
+ mtrr_lookup_fixed_next(iter);
+ else
+ mtrr_lookup_var_next(iter);
+}
+
+#define mtrr_for_each_mem_type(_iter_, _mtrr_, _gpa_start_, _gpa_end_) \
+ for (mtrr_lookup_init(_iter_, _mtrr_, _gpa_start_, _gpa_end_); \
+ mtrr_lookup_okay(_iter_); mtrr_lookup_next(_iter_))
+
+u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ struct mtrr_iter iter;
+ u64 start, end;
+ int type = -1;
+ const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK)
+ | (1 << MTRR_TYPE_WRTHROUGH);
+
+ start = gfn_to_gpa(gfn);
+ end = start + PAGE_SIZE;
+
+ mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
+ int curr_type = iter.mem_type;
+
+ /*
+ * Please refer to Intel SDM Volume 3: 11.11.4.1 MTRR
+ * Precedences.
+ */
+
+ if (type == -1) {
+ type = curr_type;
+ continue;
+ }
+
+ /*
+ * If two or more variable memory ranges match and the
+ * memory types are identical, then that memory type is
+ * used.
+ */
+ if (type == curr_type)
+ continue;
+
+ /*
+ * If two or more variable memory ranges match and one of
+ * the memory types is UC, the UC memory type used.
+ */
+ if (curr_type == MTRR_TYPE_UNCACHABLE)
+ return MTRR_TYPE_UNCACHABLE;
+
+ /*
+ * If two or more variable memory ranges match and the
+ * memory types are WT and WB, the WT memory type is used.
+ */
+ if (((1 << type) & wt_wb_mask) &&
+ ((1 << curr_type) & wt_wb_mask)) {
+ type = MTRR_TYPE_WRTHROUGH;
+ continue;
+ }
+
+ /*
+ * For overlaps not defined by the above rules, processor
+ * behavior is undefined.
+ */
+
+ /* We use WB for this undefined behavior. :( */
+ return MTRR_TYPE_WRBACK;
+ }
+
+ if (iter.mtrr_disabled)
+ return mtrr_disabled_type(vcpu);
+
+ /* not contained in any MTRRs. */
+ if (type == -1)
+ return mtrr_default_type(mtrr_state);
+
+ /*
+ * We just check one page, partially covered by MTRRs is
+ * impossible.
+ */
+ WARN_ON(iter.partial_map);
+
+ return type;
+}
+EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type);
+
+bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
+ int page_num)
+{
+ struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ struct mtrr_iter iter;
+ u64 start, end;
+ int type = -1;
+
+ start = gfn_to_gpa(gfn);
+ end = gfn_to_gpa(gfn + page_num);
+ mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
+ if (type == -1) {
+ type = iter.mem_type;
+ continue;
+ }
+
+ if (type != iter.mem_type)
+ return false;
+ }
+
+ if (iter.mtrr_disabled)
+ return true;
+
+ if (!iter.partial_map)
+ return true;
+
+ if (type == -1)
+ return true;
+
+ return type == mtrr_default_type(mtrr_state);
+}
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
new file mode 100644
index 000000000..3052a59a3
--- /dev/null
+++ b/arch/x86/kvm/page_track.c
@@ -0,0 +1,267 @@
+/*
+ * Support KVM gust page tracking
+ *
+ * This feature allows us to track page access in guest. Currently, only
+ * write access is tracked.
+ *
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Author:
+ * Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/rculist.h>
+
+#include <asm/kvm_host.h>
+#include <asm/kvm_page_track.h>
+
+#include "mmu.h"
+
+void kvm_page_track_free_memslot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont)
+{
+ int i;
+
+ for (i = 0; i < KVM_PAGE_TRACK_MAX; i++)
+ if (!dont || free->arch.gfn_track[i] !=
+ dont->arch.gfn_track[i]) {
+ kvfree(free->arch.gfn_track[i]);
+ free->arch.gfn_track[i] = NULL;
+ }
+}
+
+int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
+ unsigned long npages)
+{
+ int i;
+
+ for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
+ slot->arch.gfn_track[i] =
+ kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]),
+ GFP_KERNEL);
+ if (!slot->arch.gfn_track[i])
+ goto track_free;
+ }
+
+ return 0;
+
+track_free:
+ kvm_page_track_free_memslot(slot, NULL);
+ return -ENOMEM;
+}
+
+static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode)
+{
+ if (mode < 0 || mode >= KVM_PAGE_TRACK_MAX)
+ return false;
+
+ return true;
+}
+
+static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn,
+ enum kvm_page_track_mode mode, short count)
+{
+ int index, val;
+
+ index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+
+ val = slot->arch.gfn_track[mode][index];
+
+ if (WARN_ON(val + count < 0 || val + count > USHRT_MAX))
+ return;
+
+ slot->arch.gfn_track[mode][index] += count;
+}
+
+/*
+ * add guest page to the tracking pool so that corresponding access on that
+ * page will be intercepted.
+ *
+ * It should be called under the protection both of mmu-lock and kvm->srcu
+ * or kvm->slots_lock.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @slot: the @gfn belongs to.
+ * @gfn: the guest page.
+ * @mode: tracking mode, currently only write track is supported.
+ */
+void kvm_slot_page_track_add_page(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ enum kvm_page_track_mode mode)
+{
+
+ if (WARN_ON(!page_track_mode_is_valid(mode)))
+ return;
+
+ update_gfn_track(slot, gfn, mode, 1);
+
+ /*
+ * new track stops large page mapping for the
+ * tracked page.
+ */
+ kvm_mmu_gfn_disallow_lpage(slot, gfn);
+
+ if (mode == KVM_PAGE_TRACK_WRITE)
+ if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn))
+ kvm_flush_remote_tlbs(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page);
+
+/*
+ * remove the guest page from the tracking pool which stops the interception
+ * of corresponding access on that page. It is the opposed operation of
+ * kvm_slot_page_track_add_page().
+ *
+ * It should be called under the protection both of mmu-lock and kvm->srcu
+ * or kvm->slots_lock.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @slot: the @gfn belongs to.
+ * @gfn: the guest page.
+ * @mode: tracking mode, currently only write track is supported.
+ */
+void kvm_slot_page_track_remove_page(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ enum kvm_page_track_mode mode)
+{
+ if (WARN_ON(!page_track_mode_is_valid(mode)))
+ return;
+
+ update_gfn_track(slot, gfn, mode, -1);
+
+ /*
+ * allow large page mapping for the tracked page
+ * after the tracker is gone.
+ */
+ kvm_mmu_gfn_allow_lpage(slot, gfn);
+}
+EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page);
+
+/*
+ * check if the corresponding access on the specified guest page is tracked.
+ */
+bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
+ enum kvm_page_track_mode mode)
+{
+ struct kvm_memory_slot *slot;
+ int index;
+
+ if (WARN_ON(!page_track_mode_is_valid(mode)))
+ return false;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ if (!slot)
+ return false;
+
+ index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+ return !!READ_ONCE(slot->arch.gfn_track[mode][index]);
+}
+
+void kvm_page_track_cleanup(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+ cleanup_srcu_struct(&head->track_srcu);
+}
+
+void kvm_page_track_init(struct kvm *kvm)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+ init_srcu_struct(&head->track_srcu);
+ INIT_HLIST_HEAD(&head->track_notifier_list);
+}
+
+/*
+ * register the notifier so that event interception for the tracked guest
+ * pages can be received.
+ */
+void
+kvm_page_track_register_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+
+ spin_lock(&kvm->mmu_lock);
+ hlist_add_head_rcu(&n->node, &head->track_notifier_list);
+ spin_unlock(&kvm->mmu_lock);
+}
+EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier);
+
+/*
+ * stop receiving the event interception. It is the opposed operation of
+ * kvm_page_track_register_notifier().
+ */
+void
+kvm_page_track_unregister_notifier(struct kvm *kvm,
+ struct kvm_page_track_notifier_node *n)
+{
+ struct kvm_page_track_notifier_head *head;
+
+ head = &kvm->arch.track_notifier_head;
+
+ spin_lock(&kvm->mmu_lock);
+ hlist_del_rcu(&n->node);
+ spin_unlock(&kvm->mmu_lock);
+ synchronize_srcu(&head->track_srcu);
+}
+EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
+
+/*
+ * Notify the node that write access is intercepted and write emulation is
+ * finished at this time.
+ *
+ * The node should figure out if the written page is the one that node is
+ * interested in by itself.
+ */
+void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+ int bytes)
+{
+ struct kvm_page_track_notifier_head *head;
+ struct kvm_page_track_notifier_node *n;
+ int idx;
+
+ head = &vcpu->kvm->arch.track_notifier_head;
+
+ if (hlist_empty(&head->track_notifier_list))
+ return;
+
+ idx = srcu_read_lock(&head->track_srcu);
+ hlist_for_each_entry_rcu(n, &head->track_notifier_list, node)
+ if (n->track_write)
+ n->track_write(vcpu, gpa, new, bytes, n);
+ srcu_read_unlock(&head->track_srcu, idx);
+}
+
+/*
+ * Notify the node that memory slot is being removed or moved so that it can
+ * drop write-protection for the pages in the memory slot.
+ *
+ * The node should figure out it has any write-protected pages in this slot
+ * by itself.
+ */
+void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ struct kvm_page_track_notifier_head *head;
+ struct kvm_page_track_notifier_node *n;
+ int idx;
+
+ head = &kvm->arch.track_notifier_head;
+
+ if (hlist_empty(&head->track_notifier_list))
+ return;
+
+ idx = srcu_read_lock(&head->track_srcu);
+ hlist_for_each_entry_rcu(n, &head->track_notifier_list, node)
+ if (n->track_flush_slot)
+ n->track_flush_slot(kvm, slot, n);
+ srcu_read_unlock(&head->track_srcu, idx);
+}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
new file mode 100644
index 000000000..9e15818de
--- /dev/null
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -0,0 +1,1083 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * MMU support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+/*
+ * We need the mmu code to access both 32-bit and 64-bit guest ptes,
+ * so the code in this file is compiled twice, once per pte size.
+ */
+
+#if PTTYPE == 64
+ #define pt_element_t u64
+ #define guest_walker guest_walker64
+ #define FNAME(name) paging##64_##name
+ #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+ #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+ #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+ #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+ #define PT_LEVEL_BITS PT64_LEVEL_BITS
+ #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+ #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
+ #define PT_HAVE_ACCESSED_DIRTY(mmu) true
+ #ifdef CONFIG_X86_64
+ #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
+ #define CMPXCHG cmpxchg
+ #else
+ #define CMPXCHG cmpxchg64
+ #define PT_MAX_FULL_LEVELS 2
+ #endif
+#elif PTTYPE == 32
+ #define pt_element_t u32
+ #define guest_walker guest_walker32
+ #define FNAME(name) paging##32_##name
+ #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
+ #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
+ #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
+ #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
+ #define PT_LEVEL_BITS PT32_LEVEL_BITS
+ #define PT_MAX_FULL_LEVELS 2
+ #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
+ #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
+ #define PT_HAVE_ACCESSED_DIRTY(mmu) true
+ #define CMPXCHG cmpxchg
+#elif PTTYPE == PTTYPE_EPT
+ #define pt_element_t u64
+ #define guest_walker guest_walkerEPT
+ #define FNAME(name) ept_##name
+ #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+ #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
+ #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
+ #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
+ #define PT_LEVEL_BITS PT64_LEVEL_BITS
+ #define PT_GUEST_DIRTY_SHIFT 9
+ #define PT_GUEST_ACCESSED_SHIFT 8
+ #define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad)
+ #define CMPXCHG cmpxchg64
+ #define PT_MAX_FULL_LEVELS 4
+#else
+ #error Invalid PTTYPE value
+#endif
+
+#define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT)
+#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)
+
+#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
+#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)
+
+/*
+ * The guest_walker structure emulates the behavior of the hardware page
+ * table walker.
+ */
+struct guest_walker {
+ int level;
+ unsigned max_level;
+ gfn_t table_gfn[PT_MAX_FULL_LEVELS];
+ pt_element_t ptes[PT_MAX_FULL_LEVELS];
+ pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
+ gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
+ pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
+ bool pte_writable[PT_MAX_FULL_LEVELS];
+ unsigned int pt_access[PT_MAX_FULL_LEVELS];
+ unsigned int pte_access;
+ gfn_t gfn;
+ struct x86_exception fault;
+};
+
+static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
+{
+ return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
+}
+
+static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access,
+ unsigned gpte)
+{
+ unsigned mask;
+
+ /* dirty bit is not supported, so no need to track it */
+ if (!PT_HAVE_ACCESSED_DIRTY(mmu))
+ return;
+
+ BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
+
+ mask = (unsigned)~ACC_WRITE_MASK;
+ /* Allow write access to dirty gptes */
+ mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
+ PT_WRITABLE_MASK;
+ *access &= mask;
+}
+
+static inline int FNAME(is_present_gpte)(unsigned long pte)
+{
+#if PTTYPE != PTTYPE_EPT
+ return pte & PT_PRESENT_MASK;
+#else
+ return pte & 7;
+#endif
+}
+
+static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ pt_element_t __user *ptep_user, unsigned index,
+ pt_element_t orig_pte, pt_element_t new_pte)
+{
+ int npages;
+ pt_element_t ret;
+ pt_element_t *table;
+ struct page *page;
+
+ npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
+ /* Check if the user is doing something meaningless. */
+ if (unlikely(npages != 1))
+ return -EFAULT;
+
+ table = kmap_atomic(page);
+ ret = CMPXCHG(&table[index], orig_pte, new_pte);
+ kunmap_atomic(table);
+
+ kvm_release_page_dirty(page);
+
+ return (ret != orig_pte);
+}
+
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ u64 gpte)
+{
+ if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+ goto no_present;
+
+ if (!FNAME(is_present_gpte)(gpte))
+ goto no_present;
+
+ /* if accessed bit is not supported prefetch non accessed gpte */
+ if (PT_HAVE_ACCESSED_DIRTY(&vcpu->arch.mmu) && !(gpte & PT_GUEST_ACCESSED_MASK))
+ goto no_present;
+
+ return false;
+
+no_present:
+ drop_spte(vcpu->kvm, spte);
+ return true;
+}
+
+/*
+ * For PTTYPE_EPT, a page table can be executable but not readable
+ * on supported processors. Therefore, set_spte does not automatically
+ * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
+ * to signify readability since it isn't used in the EPT case
+ */
+static inline unsigned FNAME(gpte_access)(u64 gpte)
+{
+ unsigned access;
+#if PTTYPE == PTTYPE_EPT
+ access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
+ ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
+ ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
+#else
+ BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
+ BUILD_BUG_ON(ACC_EXEC_MASK != 1);
+ access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK);
+ /* Combine NX with P (which is set here) to get ACC_EXEC_MASK. */
+ access ^= (gpte >> PT64_NX_SHIFT);
+#endif
+
+ return access;
+}
+
+static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu,
+ struct guest_walker *walker,
+ gpa_t addr, int write_fault)
+{
+ unsigned level, index;
+ pt_element_t pte, orig_pte;
+ pt_element_t __user *ptep_user;
+ gfn_t table_gfn;
+ int ret;
+
+ /* dirty/accessed bits are not supported, so no need to update them */
+ if (!PT_HAVE_ACCESSED_DIRTY(mmu))
+ return 0;
+
+ for (level = walker->max_level; level >= walker->level; --level) {
+ pte = orig_pte = walker->ptes[level - 1];
+ table_gfn = walker->table_gfn[level - 1];
+ ptep_user = walker->ptep_user[level - 1];
+ index = offset_in_page(ptep_user) / sizeof(pt_element_t);
+ if (!(pte & PT_GUEST_ACCESSED_MASK)) {
+ trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
+ pte |= PT_GUEST_ACCESSED_MASK;
+ }
+ if (level == walker->level && write_fault &&
+ !(pte & PT_GUEST_DIRTY_MASK)) {
+ trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
+#if PTTYPE == PTTYPE_EPT
+ if (kvm_arch_write_log_dirty(vcpu, addr))
+ return -EINVAL;
+#endif
+ pte |= PT_GUEST_DIRTY_MASK;
+ }
+ if (pte == orig_pte)
+ continue;
+
+ /*
+ * If the slot is read-only, simply do not process the accessed
+ * and dirty bits. This is the correct thing to do if the slot
+ * is ROM, and page tables in read-as-ROM/write-as-MMIO slots
+ * are only supported if the accessed and dirty bits are already
+ * set in the ROM (so that MMIO writes are never needed).
+ *
+ * Note that NPT does not allow this at all and faults, since
+ * it always wants nested page table entries for the guest
+ * page tables to be writable. And EPT works but will simply
+ * overwrite the read-only memory to set the accessed and dirty
+ * bits.
+ */
+ if (unlikely(!walker->pte_writable[level - 1]))
+ continue;
+
+ ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
+ if (ret)
+ return ret;
+
+ kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
+ walker->ptes[level - 1] = pte;
+ }
+ return 0;
+}
+
+static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+ unsigned pkeys = 0;
+#if PTTYPE == 64
+ pte_t pte = {.pte = gpte};
+
+ pkeys = pte_flags_pkey(pte_flags(pte));
+#endif
+ return pkeys;
+}
+
+/*
+ * Fetch a guest pte for a guest virtual address, or for an L2's GPA.
+ */
+static int FNAME(walk_addr_generic)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t addr, u32 access)
+{
+ int ret;
+ pt_element_t pte;
+ pt_element_t __user *uninitialized_var(ptep_user);
+ gfn_t table_gfn;
+ u64 pt_access, pte_access;
+ unsigned index, accessed_dirty, pte_pkey;
+ unsigned nested_access;
+ gpa_t pte_gpa;
+ bool have_ad;
+ int offset;
+ u64 walk_nx_mask = 0;
+ const int write_fault = access & PFERR_WRITE_MASK;
+ const int user_fault = access & PFERR_USER_MASK;
+ const int fetch_fault = access & PFERR_FETCH_MASK;
+ u16 errcode = 0;
+ gpa_t real_gpa;
+ gfn_t gfn;
+
+ trace_kvm_mmu_pagetable_walk(addr, access);
+retry_walk:
+ walker->level = mmu->root_level;
+ pte = mmu->get_cr3(vcpu);
+ have_ad = PT_HAVE_ACCESSED_DIRTY(mmu);
+
+#if PTTYPE == 64
+ walk_nx_mask = 1ULL << PT64_NX_SHIFT;
+ if (walker->level == PT32E_ROOT_LEVEL) {
+ pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
+ trace_kvm_mmu_paging_element(pte, walker->level);
+ if (!FNAME(is_present_gpte)(pte))
+ goto error;
+ --walker->level;
+ }
+#endif
+ walker->max_level = walker->level;
+ ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu)));
+
+ /*
+ * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
+ * by the MOV to CR instruction are treated as reads and do not cause the
+ * processor to set the dirty flag in any EPT paging-structure entry.
+ */
+ nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;
+
+ pte_access = ~0;
+ ++walker->level;
+
+ do {
+ gfn_t real_gfn;
+ unsigned long host_addr;
+
+ pt_access = pte_access;
+ --walker->level;
+
+ index = PT_INDEX(addr, walker->level);
+ table_gfn = gpte_to_gfn(pte);
+ offset = index * sizeof(pt_element_t);
+ pte_gpa = gfn_to_gpa(table_gfn) + offset;
+
+ BUG_ON(walker->level < 1);
+ walker->table_gfn[walker->level - 1] = table_gfn;
+ walker->pte_gpa[walker->level - 1] = pte_gpa;
+
+ real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
+ nested_access,
+ &walker->fault);
+
+ /*
+ * FIXME: This can happen if emulation (for of an INS/OUTS
+ * instruction) triggers a nested page fault. The exit
+ * qualification / exit info field will incorrectly have
+ * "guest page access" as the nested page fault's cause,
+ * instead of "guest page structure access". To fix this,
+ * the x86_exception struct should be augmented with enough
+ * information to fix the exit_qualification or exit_info_1
+ * fields.
+ */
+ if (unlikely(real_gfn == UNMAPPED_GVA))
+ return 0;
+
+ real_gfn = gpa_to_gfn(real_gfn);
+
+ host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, real_gfn,
+ &walker->pte_writable[walker->level - 1]);
+ if (unlikely(kvm_is_error_hva(host_addr)))
+ goto error;
+
+ ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
+ if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
+ goto error;
+ walker->ptep_user[walker->level - 1] = ptep_user;
+
+ trace_kvm_mmu_paging_element(pte, walker->level);
+
+ /*
+ * Inverting the NX it lets us AND it like other
+ * permission bits.
+ */
+ pte_access = pt_access & (pte ^ walk_nx_mask);
+
+ if (unlikely(!FNAME(is_present_gpte)(pte)))
+ goto error;
+
+ if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) {
+ errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
+ goto error;
+ }
+
+ walker->ptes[walker->level - 1] = pte;
+
+ /* Convert to ACC_*_MASK flags for struct guest_walker. */
+ walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
+ } while (!is_last_gpte(mmu, walker->level, pte));
+
+ pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
+ accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
+
+ /* Convert to ACC_*_MASK flags for struct guest_walker. */
+ walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
+ errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
+ if (unlikely(errcode))
+ goto error;
+
+ gfn = gpte_to_gfn_lvl(pte, walker->level);
+ gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
+
+ if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
+ gfn += pse36_gfn_delta(pte);
+
+ real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault);
+ if (real_gpa == UNMAPPED_GVA)
+ return 0;
+
+ walker->gfn = real_gpa >> PAGE_SHIFT;
+
+ if (!write_fault)
+ FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte);
+ else
+ /*
+ * On a write fault, fold the dirty bit into accessed_dirty.
+ * For modes without A/D bits support accessed_dirty will be
+ * always clear.
+ */
+ accessed_dirty &= pte >>
+ (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
+
+ if (unlikely(!accessed_dirty)) {
+ ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker,
+ addr, write_fault);
+ if (unlikely(ret < 0))
+ goto error;
+ else if (ret)
+ goto retry_walk;
+ }
+
+ pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
+ __func__, (u64)pte, walker->pte_access,
+ walker->pt_access[walker->level - 1]);
+ return 1;
+
+error:
+ errcode |= write_fault | user_fault;
+ if (fetch_fault && (mmu->nx ||
+ kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
+ errcode |= PFERR_FETCH_MASK;
+
+ walker->fault.vector = PF_VECTOR;
+ walker->fault.error_code_valid = true;
+ walker->fault.error_code = errcode;
+
+#if PTTYPE == PTTYPE_EPT
+ /*
+ * Use PFERR_RSVD_MASK in error_code to to tell if EPT
+ * misconfiguration requires to be injected. The detection is
+ * done by is_rsvd_bits_set() above.
+ *
+ * We set up the value of exit_qualification to inject:
+ * [2:0] - Derive from the access bits. The exit_qualification might be
+ * out of date if it is serving an EPT misconfiguration.
+ * [5:3] - Calculated by the page walk of the guest EPT page tables
+ * [7:8] - Derived from [7:8] of real exit_qualification
+ *
+ * The other bits are set to 0.
+ */
+ if (!(errcode & PFERR_RSVD_MASK)) {
+ vcpu->arch.exit_qualification &= 0x180;
+ if (write_fault)
+ vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
+ if (user_fault)
+ vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
+ if (fetch_fault)
+ vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
+ vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3;
+ }
+#endif
+ walker->fault.address = addr;
+ walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
+
+ trace_kvm_mmu_walker_error(walker->fault.error_code);
+ return 0;
+}
+
+static int FNAME(walk_addr)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, gpa_t addr, u32 access)
+{
+ return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
+ access);
+}
+
+#if PTTYPE != PTTYPE_EPT
+static int FNAME(walk_addr_nested)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, gva_t addr,
+ u32 access)
+{
+ return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
+ addr, access);
+}
+#endif
+
+static bool
+FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ u64 *spte, pt_element_t gpte, bool no_dirty_log)
+{
+ unsigned pte_access;
+ gfn_t gfn;
+ kvm_pfn_t pfn;
+
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
+ return false;
+
+ pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
+
+ gfn = gpte_to_gfn(gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(gpte);
+ FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
+ pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
+ no_dirty_log && (pte_access & ACC_WRITE_MASK));
+ if (is_error_pfn(pfn))
+ return false;
+
+ /*
+ * we call mmu_set_spte() with host_writable = true because
+ * pte_prefetch_gfn_to_pfn always gets a writable pfn.
+ */
+ mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn,
+ true, true);
+
+ kvm_release_pfn_clean(pfn);
+ return true;
+}
+
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ u64 *spte, const void *pte)
+{
+ pt_element_t gpte = *(const pt_element_t *)pte;
+
+ FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false);
+}
+
+static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
+ struct guest_walker *gw, int level)
+{
+ pt_element_t curr_pte;
+ gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
+ u64 mask;
+ int r, index;
+
+ if (level == PT_PAGE_TABLE_LEVEL) {
+ mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
+ base_gpa = pte_gpa & ~mask;
+ index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
+
+ r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa,
+ gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
+ curr_pte = gw->prefetch_ptes[index];
+ } else
+ r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa,
+ &curr_pte, sizeof(curr_pte));
+
+ return r || curr_pte != gw->ptes[level - 1];
+}
+
+static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
+ u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+ pt_element_t *gptep = gw->prefetch_ptes;
+ u64 *spte;
+ int i;
+
+ sp = page_header(__pa(sptep));
+
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return;
+
+ if (sp->role.direct)
+ return __direct_pte_prefetch(vcpu, sp, sptep);
+
+ i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+ spte = sp->spt + i;
+
+ for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+ if (spte == sptep)
+ continue;
+
+ if (is_shadow_present_pte(*spte))
+ continue;
+
+ if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
+ break;
+ }
+}
+
+/*
+ * Fetch a shadow pte for a specific level in the paging hierarchy.
+ * If the guest tries to write a write-protected page, we need to
+ * emulate this operation, return 1 to indicate this case.
+ */
+static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
+ struct guest_walker *gw,
+ int write_fault, int hlevel,
+ kvm_pfn_t pfn, bool map_writable, bool prefault,
+ bool lpage_disallowed)
+{
+ struct kvm_mmu_page *sp = NULL;
+ struct kvm_shadow_walk_iterator it;
+ unsigned int direct_access, access;
+ int top_level, ret;
+ gfn_t gfn, base_gfn;
+
+ direct_access = gw->pte_access;
+
+ top_level = vcpu->arch.mmu.root_level;
+ if (top_level == PT32E_ROOT_LEVEL)
+ top_level = PT32_ROOT_LEVEL;
+ /*
+ * Verify that the top-level gpte is still there. Since the page
+ * is a root page, it is either write protected (and cannot be
+ * changed from now on) or it is invalid (in which case, we don't
+ * really care if it changes underneath us after this point).
+ */
+ if (FNAME(gpte_changed)(vcpu, gw, top_level))
+ goto out_gpte_changed;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ goto out_gpte_changed;
+
+ for (shadow_walk_init(&it, vcpu, addr);
+ shadow_walk_okay(&it) && it.level > gw->level;
+ shadow_walk_next(&it)) {
+ gfn_t table_gfn;
+
+ clear_sp_write_flooding_count(it.sptep);
+ drop_large_spte(vcpu, it.sptep);
+
+ sp = NULL;
+ if (!is_shadow_present_pte(*it.sptep)) {
+ table_gfn = gw->table_gfn[it.level - 2];
+ access = gw->pt_access[it.level - 2];
+ sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
+ false, access);
+ }
+
+ /*
+ * Verify that the gpte in the page we've just write
+ * protected is still there.
+ */
+ if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
+ goto out_gpte_changed;
+
+ if (sp)
+ link_shadow_page(vcpu, it.sptep, sp);
+ }
+
+ /*
+ * FNAME(page_fault) might have clobbered the bottom bits of
+ * gw->gfn, restore them from the virtual address.
+ */
+ gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT);
+ base_gfn = gfn;
+
+ trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
+
+ for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
+ clear_sp_write_flooding_count(it.sptep);
+
+ /*
+ * We cannot overwrite existing page tables with an NX
+ * large page, as the leaf could be executable.
+ */
+ disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel);
+
+ base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ if (it.level == hlevel)
+ break;
+
+ validate_direct_spte(vcpu, it.sptep, direct_access);
+
+ drop_large_spte(vcpu, it.sptep);
+
+ if (!is_shadow_present_pte(*it.sptep)) {
+ sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
+ it.level - 1, true, direct_access);
+ link_shadow_page(vcpu, it.sptep, sp);
+ if (lpage_disallowed)
+ account_huge_nx_page(vcpu->kvm, sp);
+ }
+ }
+
+ ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
+ it.level, base_gfn, pfn, prefault, map_writable);
+ FNAME(pte_prefetch)(vcpu, gw, it.sptep);
+ ++vcpu->stat.pf_fixed;
+ return ret;
+
+out_gpte_changed:
+ return RET_PF_RETRY;
+}
+
+ /*
+ * To see whether the mapped gfn can write its page table in the current
+ * mapping.
+ *
+ * It is the helper function of FNAME(page_fault). When guest uses large page
+ * size to map the writable gfn which is used as current page table, we should
+ * force kvm to use small page size to map it because new shadow page will be
+ * created when kvm establishes shadow page table that stop kvm using large
+ * page size. Do it early can avoid unnecessary #PF and emulation.
+ *
+ * @write_fault_to_shadow_pgtable will return true if the fault gfn is
+ * currently used as its page table.
+ *
+ * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
+ * since the PDPT is always shadowed, that means, we can not use large page
+ * size to map the gfn which is used as PDPT.
+ */
+static bool
+FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
+ struct guest_walker *walker, int user_fault,
+ bool *write_fault_to_shadow_pgtable)
+{
+ int level;
+ gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
+ bool self_changed = false;
+
+ if (!(walker->pte_access & ACC_WRITE_MASK ||
+ (!is_write_protection(vcpu) && !user_fault)))
+ return false;
+
+ for (level = walker->level; level <= walker->max_level; level++) {
+ gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
+
+ self_changed |= !(gfn & mask);
+ *write_fault_to_shadow_pgtable |= !gfn;
+ }
+
+ return self_changed;
+}
+
+/*
+ * Page fault handler. There are several causes for a page fault:
+ * - there is no shadow pte for the guest pte
+ * - write access through a shadow pte marked read only so that we can set
+ * the dirty bit
+ * - write access to a shadow pte marked read only so we can update the page
+ * dirty bitmap, when userspace requests it
+ * - mmio access; in this case we will never install a present shadow pte
+ * - normal guest page fault due to the guest pte marked not present, not
+ * writable, or not executable
+ *
+ * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
+ * a negative value on error.
+ */
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
+ bool prefault)
+{
+ int write_fault = error_code & PFERR_WRITE_MASK;
+ int user_fault = error_code & PFERR_USER_MASK;
+ struct guest_walker walker;
+ int r;
+ kvm_pfn_t pfn;
+ int level = PT_PAGE_TABLE_LEVEL;
+ unsigned long mmu_seq;
+ bool map_writable, is_self_change_mapping;
+ bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+ is_nx_huge_page_enabled();
+ bool force_pt_level = lpage_disallowed;
+
+ pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ /*
+ * If PFEC.RSVD is set, this is a shadow page fault.
+ * The bit needs to be cleared before walking guest page tables.
+ */
+ error_code &= ~PFERR_RSVD_MASK;
+
+ /*
+ * Look up the guest pte for the faulting address.
+ */
+ r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
+
+ /*
+ * The page is not mapped by the guest. Let the guest handle it.
+ */
+ if (!r) {
+ pgprintk("%s: guest page fault\n", __func__);
+ if (!prefault)
+ inject_page_fault(vcpu, &walker.fault);
+
+ return RET_PF_RETRY;
+ }
+
+ if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
+ shadow_page_table_clear_flood(vcpu, addr);
+ return RET_PF_EMULATE;
+ }
+
+ vcpu->arch.write_fault_to_shadow_pgtable = false;
+
+ is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
+ &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
+
+ if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) {
+ level = mapping_level(vcpu, walker.gfn, &force_pt_level);
+ if (likely(!force_pt_level)) {
+ level = min(walker.level, level);
+ walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ }
+ } else
+ force_pt_level = true;
+
+ mmu_seq = vcpu->kvm->mmu_notifier_seq;
+ smp_rmb();
+
+ if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
+ &map_writable))
+ return RET_PF_RETRY;
+
+ if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
+ return r;
+
+ /*
+ * Do not change pte_access if the pfn is a mmio page, otherwise
+ * we will cache the incorrect access into mmio spte.
+ */
+ if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
+ !is_write_protection(vcpu) && !user_fault &&
+ !is_noslot_pfn(pfn)) {
+ walker.pte_access |= ACC_WRITE_MASK;
+ walker.pte_access &= ~ACC_USER_MASK;
+
+ /*
+ * If we converted a user page to a kernel page,
+ * so that the kernel can write to it when cr0.wp=0,
+ * then we should prevent the kernel from executing it
+ * if SMEP is enabled.
+ */
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+ walker.pte_access &= ~ACC_EXEC_MASK;
+ }
+
+ r = RET_PF_RETRY;
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+ goto out_unlock;
+
+ kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
+ if (make_mmu_pages_available(vcpu) < 0)
+ goto out_unlock;
+ if (!force_pt_level)
+ transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
+ r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
+ level, pfn, map_writable, prefault, lpage_disallowed);
+ kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
+
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return r;
+}
+
+static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
+{
+ int offset = 0;
+
+ WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
+
+ if (PTTYPE == 32)
+ offset = sp->role.quadrant << PT64_LEVEL_BITS;
+
+ return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
+}
+
+static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ struct kvm_mmu_page *sp;
+ int level;
+ u64 *sptep;
+
+ vcpu_clear_mmio_info(vcpu, gva);
+
+ /*
+ * No need to check return value here, rmap_can_add() can
+ * help us to skip pte prefetch later.
+ */
+ mmu_topup_memory_caches(vcpu);
+
+ if (!VALID_PAGE(root_hpa)) {
+ WARN_ON(1);
+ return;
+ }
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
+ level = iterator.level;
+ sptep = iterator.sptep;
+
+ sp = page_header(__pa(sptep));
+ if (is_last_spte(*sptep, level)) {
+ pt_element_t gpte;
+ gpa_t pte_gpa;
+
+ if (!sp->unsync)
+ break;
+
+ pte_gpa = FNAME(get_level1_sp_gpa)(sp);
+ pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
+
+ if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
+ kvm_flush_remote_tlbs(vcpu->kvm);
+
+ if (!rmap_can_add(vcpu))
+ break;
+
+ if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
+ sizeof(pt_element_t)))
+ break;
+
+ FNAME(update_pte)(vcpu, sp, sptep, &gpte);
+ }
+
+ if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
+ break;
+ }
+ spin_unlock(&vcpu->kvm->mmu_lock);
+}
+
+/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access,
+ struct x86_exception *exception)
+{
+ struct guest_walker walker;
+ gpa_t gpa = UNMAPPED_GVA;
+ int r;
+
+ r = FNAME(walk_addr)(&walker, vcpu, addr, access);
+
+ if (r) {
+ gpa = gfn_to_gpa(walker.gfn);
+ gpa |= addr & ~PAGE_MASK;
+ } else if (exception)
+ *exception = walker.fault;
+
+ return gpa;
+}
+
+#if PTTYPE != PTTYPE_EPT
+/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. */
+static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr,
+ u32 access,
+ struct x86_exception *exception)
+{
+ struct guest_walker walker;
+ gpa_t gpa = UNMAPPED_GVA;
+ int r;
+
+#ifndef CONFIG_X86_64
+ /* A 64-bit GVA should be impossible on 32-bit KVM. */
+ WARN_ON_ONCE(vaddr >> 32);
+#endif
+
+ r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
+
+ if (r) {
+ gpa = gfn_to_gpa(walker.gfn);
+ gpa |= vaddr & ~PAGE_MASK;
+ } else if (exception)
+ *exception = walker.fault;
+
+ return gpa;
+}
+#endif
+
+/*
+ * Using the cached information from sp->gfns is safe because:
+ * - The spte has a reference to the struct page, so the pfn for a given gfn
+ * can't change unless all sptes pointing to it are nuked first.
+ *
+ * Note:
+ * We should flush all tlbs if spte is dropped even though guest is
+ * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page
+ * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
+ * used by guest then tlbs are not flushed, so guest is allowed to access the
+ * freed pages.
+ * And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
+ */
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ int i, nr_present = 0;
+ bool host_writable;
+ gpa_t first_pte_gpa;
+ int set_spte_ret = 0;
+
+ /* direct kvm_mmu_page can not be unsync. */
+ BUG_ON(sp->role.direct);
+
+ first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+ unsigned pte_access;
+ pt_element_t gpte;
+ gpa_t pte_gpa;
+ gfn_t gfn;
+
+ if (!sp->spt[i])
+ continue;
+
+ pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
+
+ if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
+ sizeof(pt_element_t)))
+ return 0;
+
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
+ /*
+ * Update spte before increasing tlbs_dirty to make
+ * sure no tlb flush is lost after spte is zapped; see
+ * the comments in kvm_flush_remote_tlbs().
+ */
+ smp_wmb();
+ vcpu->kvm->tlbs_dirty++;
+ continue;
+ }
+
+ gfn = gpte_to_gfn(gpte);
+ pte_access = sp->role.access;
+ pte_access &= FNAME(gpte_access)(gpte);
+ FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
+
+ if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
+ &nr_present))
+ continue;
+
+ if (gfn != sp->gfns[i]) {
+ drop_spte(vcpu->kvm, &sp->spt[i]);
+ /*
+ * The same as above where we are doing
+ * prefetch_invalid_gpte().
+ */
+ smp_wmb();
+ vcpu->kvm->tlbs_dirty++;
+ continue;
+ }
+
+ nr_present++;
+
+ host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
+
+ set_spte_ret |= set_spte(vcpu, &sp->spt[i],
+ pte_access, PT_PAGE_TABLE_LEVEL,
+ gfn, spte_to_pfn(sp->spt[i]),
+ true, false, host_writable);
+ }
+
+ if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH)
+ kvm_flush_remote_tlbs(vcpu->kvm);
+
+ return nr_present;
+}
+
+#undef pt_element_t
+#undef guest_walker
+#undef FNAME
+#undef PT_BASE_ADDR_MASK
+#undef PT_INDEX
+#undef PT_LVL_ADDR_MASK
+#undef PT_LVL_OFFSET_MASK
+#undef PT_LEVEL_BITS
+#undef PT_MAX_FULL_LEVELS
+#undef gpte_to_gfn
+#undef gpte_to_gfn_lvl
+#undef CMPXCHG
+#undef PT_GUEST_ACCESSED_MASK
+#undef PT_GUEST_DIRTY_MASK
+#undef PT_GUEST_DIRTY_SHIFT
+#undef PT_GUEST_ACCESSED_SHIFT
+#undef PT_HAVE_ACCESSED_DIRTY
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
new file mode 100644
index 000000000..ad3d39c00
--- /dev/null
+++ b/arch/x86/kvm/pmu.c
@@ -0,0 +1,349 @@
+/*
+ * Kernel-based Virtual Machine -- Performance Monitoring Unit support
+ *
+ * Copyright 2015 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@redhat.com>
+ * Gleb Natapov <gleb@redhat.com>
+ * Wei Huang <wei@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include <asm/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "pmu.h"
+
+/* NOTE:
+ * - Each perf counter is defined as "struct kvm_pmc";
+ * - There are two types of perf counters: general purpose (gp) and fixed.
+ * gp counters are stored in gp_counters[] and fixed counters are stored
+ * in fixed_counters[] respectively. Both of them are part of "struct
+ * kvm_pmu";
+ * - pmu.c understands the difference between gp counters and fixed counters.
+ * However AMD doesn't support fixed-counters;
+ * - There are three types of index to access perf counters (PMC):
+ * 1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
+ * has MSR_K7_PERFCTRn.
+ * 2. MSR Index (named idx): This normally is used by RDPMC instruction.
+ * For instance AMD RDPMC instruction uses 0000_0003h in ECX to access
+ * C001_0007h (MSR_K7_PERCTR3). Intel has a similar mechanism, except
+ * that it also supports fixed counters. idx can be used to as index to
+ * gp and fixed counters.
+ * 3. Global PMC Index (named pmc): pmc is an index specific to PMU
+ * code. Each pmc, stored in kvm_pmc.idx field, is unique across
+ * all perf counters (both gp and fixed). The mapping relationship
+ * between pmc and perf counters is as the following:
+ * * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters
+ * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
+ * * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
+ */
+
+static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
+{
+ struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
+ struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
+
+ kvm_pmu_deliver_pmi(vcpu);
+}
+
+static void kvm_perf_overflow(struct perf_event *perf_event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ if (!test_and_set_bit(pmc->idx,
+ (unsigned long *)&pmu->reprogram_pmi)) {
+ __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+ }
+}
+
+static void kvm_perf_overflow_intr(struct perf_event *perf_event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ if (!test_and_set_bit(pmc->idx,
+ (unsigned long *)&pmu->reprogram_pmi)) {
+ __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+
+ /*
+ * Inject PMI. If vcpu was in a guest mode during NMI PMI
+ * can be ejected on a guest mode re-entry. Otherwise we can't
+ * be sure that vcpu wasn't executing hlt instruction at the
+ * time of vmexit and is not going to re-enter guest mode until
+ * woken up. So we should wake it, but this is impossible from
+ * NMI context. Do it from irq work instead.
+ */
+ if (!kvm_is_in_guest())
+ irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
+ else
+ kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
+ }
+}
+
+static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
+ unsigned config, bool exclude_user,
+ bool exclude_kernel, bool intr,
+ bool in_tx, bool in_tx_cp)
+{
+ struct perf_event *event;
+ struct perf_event_attr attr = {
+ .type = type,
+ .size = sizeof(attr),
+ .pinned = true,
+ .exclude_idle = true,
+ .exclude_host = 1,
+ .exclude_user = exclude_user,
+ .exclude_kernel = exclude_kernel,
+ .config = config,
+ };
+
+ attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
+
+ if (in_tx)
+ attr.config |= HSW_IN_TX;
+ if (in_tx_cp) {
+ /*
+ * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
+ * period. Just clear the sample period so at least
+ * allocating the counter doesn't fail.
+ */
+ attr.sample_period = 0;
+ attr.config |= HSW_IN_TX_CHECKPOINTED;
+ }
+
+ event = perf_event_create_kernel_counter(&attr, -1, current,
+ intr ? kvm_perf_overflow_intr :
+ kvm_perf_overflow, pmc);
+ if (IS_ERR(event)) {
+ pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
+ PTR_ERR(event), pmc->idx);
+ return;
+ }
+
+ pmc->perf_event = event;
+ clear_bit(pmc->idx, (unsigned long*)&pmc_to_pmu(pmc)->reprogram_pmi);
+}
+
+void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
+{
+ unsigned config, type = PERF_TYPE_RAW;
+ u8 event_select, unit_mask;
+
+ if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
+ printk_once("kvm pmu: pin control bit is ignored\n");
+
+ pmc->eventsel = eventsel;
+
+ pmc_stop_counter(pmc);
+
+ if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
+ return;
+
+ event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
+ unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
+
+ if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
+ ARCH_PERFMON_EVENTSEL_INV |
+ ARCH_PERFMON_EVENTSEL_CMASK |
+ HSW_IN_TX |
+ HSW_IN_TX_CHECKPOINTED))) {
+ config = kvm_x86_ops->pmu_ops->find_arch_event(pmc_to_pmu(pmc),
+ event_select,
+ unit_mask);
+ if (config != PERF_COUNT_HW_MAX)
+ type = PERF_TYPE_HARDWARE;
+ }
+
+ if (type == PERF_TYPE_RAW)
+ config = eventsel & AMD64_RAW_EVENT_MASK;
+
+ pmc_reprogram_counter(pmc, type, config,
+ !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
+ eventsel & ARCH_PERFMON_EVENTSEL_INT,
+ (eventsel & HSW_IN_TX),
+ (eventsel & HSW_IN_TX_CHECKPOINTED));
+}
+EXPORT_SYMBOL_GPL(reprogram_gp_counter);
+
+void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
+{
+ unsigned en_field = ctrl & 0x3;
+ bool pmi = ctrl & 0x8;
+
+ pmc_stop_counter(pmc);
+
+ if (!en_field || !pmc_is_enabled(pmc))
+ return;
+
+ pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
+ kvm_x86_ops->pmu_ops->find_fixed_event(idx),
+ !(en_field & 0x2), /* exclude user */
+ !(en_field & 0x1), /* exclude kernel */
+ pmi, false, false);
+}
+EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
+
+void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
+{
+ struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx);
+
+ if (!pmc)
+ return;
+
+ if (pmc_is_gp(pmc))
+ reprogram_gp_counter(pmc, pmc->eventsel);
+ else {
+ int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
+ u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);
+
+ reprogram_fixed_counter(pmc, ctrl, idx);
+ }
+}
+EXPORT_SYMBOL_GPL(reprogram_counter);
+
+void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ u64 bitmask;
+ int bit;
+
+ bitmask = pmu->reprogram_pmi;
+
+ for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
+ struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit);
+
+ if (unlikely(!pmc || !pmc->perf_event)) {
+ clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
+ continue;
+ }
+
+ reprogram_counter(pmu, bit);
+ }
+}
+
+/* check if idx is a valid index to access PMU */
+int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
+{
+ return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx);
+}
+
+bool is_vmware_backdoor_pmc(u32 pmc_idx)
+{
+ switch (pmc_idx) {
+ case VMWARE_BACKDOOR_PMC_HOST_TSC:
+ case VMWARE_BACKDOOR_PMC_REAL_TIME:
+ case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
+ return true;
+ }
+ return false;
+}
+
+static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
+{
+ u64 ctr_val;
+
+ switch (idx) {
+ case VMWARE_BACKDOOR_PMC_HOST_TSC:
+ ctr_val = rdtsc();
+ break;
+ case VMWARE_BACKDOOR_PMC_REAL_TIME:
+ ctr_val = ktime_get_boot_ns();
+ break;
+ case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
+ ctr_val = ktime_get_boot_ns() +
+ vcpu->kvm->arch.kvmclock_offset;
+ break;
+ default:
+ return 1;
+ }
+
+ *data = ctr_val;
+ return 0;
+}
+
+int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
+{
+ bool fast_mode = idx & (1u << 31);
+ struct kvm_pmc *pmc;
+ u64 mask = fast_mode ? ~0u : ~0ull;
+
+ if (is_vmware_backdoor_pmc(idx))
+ return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
+
+ pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx, &mask);
+ if (!pmc)
+ return 1;
+
+ *data = pmc_read_counter(pmc) & mask;
+ return 0;
+}
+
+void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
+{
+ if (lapic_in_kernel(vcpu))
+ kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
+}
+
+bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ return kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr);
+}
+
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ return kvm_x86_ops->pmu_ops->get_msr(vcpu, msr, data);
+}
+
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info);
+}
+
+/* refresh PMU settings. This function generally is called when underlying
+ * settings are changed (such as changes of PMU CPUID by guest VMs), which
+ * should rarely happen.
+ */
+void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->pmu_ops->refresh(vcpu);
+}
+
+void kvm_pmu_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ irq_work_sync(&pmu->irq_work);
+ kvm_x86_ops->pmu_ops->reset(vcpu);
+}
+
+void kvm_pmu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ memset(pmu, 0, sizeof(*pmu));
+ kvm_x86_ops->pmu_ops->init(vcpu);
+ init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
+ kvm_pmu_refresh(vcpu);
+}
+
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
+{
+ kvm_pmu_reset(vcpu);
+}
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
new file mode 100644
index 000000000..7b4828e50
--- /dev/null
+++ b/arch/x86/kvm/pmu.h
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_PMU_H
+#define __KVM_X86_PMU_H
+
+#include <linux/nospec.h>
+
+#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu)
+#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu))
+#define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu)
+
+/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
+#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
+
+#define VMWARE_BACKDOOR_PMC_HOST_TSC 0x10000
+#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001
+#define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002
+
+struct kvm_event_hw_type_mapping {
+ u8 eventsel;
+ u8 unit_mask;
+ unsigned event_type;
+};
+
+struct kvm_pmu_ops {
+ unsigned (*find_arch_event)(struct kvm_pmu *pmu, u8 event_select,
+ u8 unit_mask);
+ unsigned (*find_fixed_event)(int idx);
+ bool (*pmc_is_enabled)(struct kvm_pmc *pmc);
+ struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx);
+ struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, unsigned idx,
+ u64 *mask);
+ int (*is_valid_msr_idx)(struct kvm_vcpu *vcpu, unsigned idx);
+ bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr);
+ int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+ int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+ void (*refresh)(struct kvm_vcpu *vcpu);
+ void (*init)(struct kvm_vcpu *vcpu);
+ void (*reset)(struct kvm_vcpu *vcpu);
+};
+
+static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ return pmu->counter_bitmask[pmc->type];
+}
+
+static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
+{
+ u64 counter, enabled, running;
+
+ counter = pmc->counter;
+ if (pmc->perf_event)
+ counter += perf_event_read_value(pmc->perf_event,
+ &enabled, &running);
+ /* FIXME: Scaling needed? */
+ return counter & pmc_bitmask(pmc);
+}
+
+static inline void pmc_stop_counter(struct kvm_pmc *pmc)
+{
+ if (pmc->perf_event) {
+ pmc->counter = pmc_read_counter(pmc);
+ perf_event_release_kernel(pmc->perf_event);
+ pmc->perf_event = NULL;
+ }
+}
+
+static inline bool pmc_is_gp(struct kvm_pmc *pmc)
+{
+ return pmc->type == KVM_PMC_GP;
+}
+
+static inline bool pmc_is_fixed(struct kvm_pmc *pmc)
+{
+ return pmc->type == KVM_PMC_FIXED;
+}
+
+static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
+{
+ return kvm_x86_ops->pmu_ops->pmc_is_enabled(pmc);
+}
+
+/* returns general purpose PMC with the specified MSR. Note that it can be
+ * used for both PERFCTRn and EVNTSELn; that is why it accepts base as a
+ * paramenter to tell them apart.
+ */
+static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
+ u32 base)
+{
+ if (msr >= base && msr < base + pmu->nr_arch_gp_counters) {
+ u32 index = array_index_nospec(msr - base,
+ pmu->nr_arch_gp_counters);
+
+ return &pmu->gp_counters[index];
+ }
+
+ return NULL;
+}
+
+/* returns fixed PMC with the specified MSR */
+static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
+{
+ int base = MSR_CORE_PERF_FIXED_CTR0;
+
+ if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) {
+ u32 index = array_index_nospec(msr - base,
+ pmu->nr_arch_fixed_counters);
+
+ return &pmu->fixed_counters[index];
+ }
+
+ return NULL;
+}
+
+void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel);
+void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx);
+void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
+
+void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
+void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
+int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
+int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx);
+bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+void kvm_pmu_refresh(struct kvm_vcpu *vcpu);
+void kvm_pmu_reset(struct kvm_vcpu *vcpu);
+void kvm_pmu_init(struct kvm_vcpu *vcpu);
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
+
+bool is_vmware_backdoor_pmc(u32 pmc_idx);
+
+extern struct kvm_pmu_ops intel_pmu_ops;
+extern struct kvm_pmu_ops amd_pmu_ops;
+#endif /* __KVM_X86_PMU_H */
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
new file mode 100644
index 000000000..93a135f21
--- /dev/null
+++ b/arch/x86/kvm/pmu_amd.c
@@ -0,0 +1,317 @@
+/*
+ * KVM PMU support for AMD
+ *
+ * Copyright 2015, Red Hat, Inc. and/or its affiliates.
+ *
+ * Author:
+ * Wei Huang <wei@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Implementation is based on pmu_intel.c file
+ */
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "pmu.h"
+
+enum pmu_type {
+ PMU_TYPE_COUNTER = 0,
+ PMU_TYPE_EVNTSEL,
+};
+
+enum index {
+ INDEX_ZERO = 0,
+ INDEX_ONE,
+ INDEX_TWO,
+ INDEX_THREE,
+ INDEX_FOUR,
+ INDEX_FIVE,
+ INDEX_ERROR,
+};
+
+/* duplicated from amd_perfmon_event_map, K7 and above should work. */
+static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
+ [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
+ [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
+ [2] = { 0x7d, 0x07, PERF_COUNT_HW_CACHE_REFERENCES },
+ [3] = { 0x7e, 0x07, PERF_COUNT_HW_CACHE_MISSES },
+ [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+ [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+ [6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
+ [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
+};
+
+static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type)
+{
+ struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
+ if (type == PMU_TYPE_COUNTER)
+ return MSR_F15H_PERF_CTR;
+ else
+ return MSR_F15H_PERF_CTL;
+ } else {
+ if (type == PMU_TYPE_COUNTER)
+ return MSR_K7_PERFCTR0;
+ else
+ return MSR_K7_EVNTSEL0;
+ }
+}
+
+static enum index msr_to_index(u32 msr)
+{
+ switch (msr) {
+ case MSR_F15H_PERF_CTL0:
+ case MSR_F15H_PERF_CTR0:
+ case MSR_K7_EVNTSEL0:
+ case MSR_K7_PERFCTR0:
+ return INDEX_ZERO;
+ case MSR_F15H_PERF_CTL1:
+ case MSR_F15H_PERF_CTR1:
+ case MSR_K7_EVNTSEL1:
+ case MSR_K7_PERFCTR1:
+ return INDEX_ONE;
+ case MSR_F15H_PERF_CTL2:
+ case MSR_F15H_PERF_CTR2:
+ case MSR_K7_EVNTSEL2:
+ case MSR_K7_PERFCTR2:
+ return INDEX_TWO;
+ case MSR_F15H_PERF_CTL3:
+ case MSR_F15H_PERF_CTR3:
+ case MSR_K7_EVNTSEL3:
+ case MSR_K7_PERFCTR3:
+ return INDEX_THREE;
+ case MSR_F15H_PERF_CTL4:
+ case MSR_F15H_PERF_CTR4:
+ return INDEX_FOUR;
+ case MSR_F15H_PERF_CTL5:
+ case MSR_F15H_PERF_CTR5:
+ return INDEX_FIVE;
+ default:
+ return INDEX_ERROR;
+ }
+}
+
+static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
+ enum pmu_type type)
+{
+ switch (msr) {
+ case MSR_F15H_PERF_CTL0:
+ case MSR_F15H_PERF_CTL1:
+ case MSR_F15H_PERF_CTL2:
+ case MSR_F15H_PERF_CTL3:
+ case MSR_F15H_PERF_CTL4:
+ case MSR_F15H_PERF_CTL5:
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ if (type != PMU_TYPE_EVNTSEL)
+ return NULL;
+ break;
+ case MSR_F15H_PERF_CTR0:
+ case MSR_F15H_PERF_CTR1:
+ case MSR_F15H_PERF_CTR2:
+ case MSR_F15H_PERF_CTR3:
+ case MSR_F15H_PERF_CTR4:
+ case MSR_F15H_PERF_CTR5:
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ if (type != PMU_TYPE_COUNTER)
+ return NULL;
+ break;
+ default:
+ return NULL;
+ }
+
+ return &pmu->gp_counters[msr_to_index(msr)];
+}
+
+static unsigned amd_find_arch_event(struct kvm_pmu *pmu,
+ u8 event_select,
+ u8 unit_mask)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
+ if (amd_event_mapping[i].eventsel == event_select
+ && amd_event_mapping[i].unit_mask == unit_mask)
+ break;
+
+ if (i == ARRAY_SIZE(amd_event_mapping))
+ return PERF_COUNT_HW_MAX;
+
+ return amd_event_mapping[i].event_type;
+}
+
+/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */
+static unsigned amd_find_fixed_event(int idx)
+{
+ return PERF_COUNT_HW_MAX;
+}
+
+/* check if a PMC is enabled by comparing it against global_ctrl bits. Because
+ * AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE).
+ */
+static bool amd_pmc_is_enabled(struct kvm_pmc *pmc)
+{
+ return true;
+}
+
+static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
+{
+ unsigned int base = get_msr_base(pmu, PMU_TYPE_COUNTER);
+ struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
+ /*
+ * The idx is contiguous. The MSRs are not. The counter MSRs
+ * are interleaved with the event select MSRs.
+ */
+ pmc_idx *= 2;
+ }
+
+ return get_gp_pmc_amd(pmu, base + pmc_idx, PMU_TYPE_COUNTER);
+}
+
+/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
+static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ idx &= ~(3u << 30);
+
+ return (idx >= pmu->nr_arch_gp_counters);
+}
+
+/* idx is the ECX register of RDPMC instruction */
+static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *mask)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *counters;
+
+ idx &= ~(3u << 30);
+ if (idx >= pmu->nr_arch_gp_counters)
+ return NULL;
+ counters = pmu->gp_counters;
+
+ return &counters[idx];
+}
+
+static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int ret = false;
+
+ ret = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER) ||
+ get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+
+ return ret;
+}
+
+static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+
+ /* MSR_PERFCTRn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+ if (pmc) {
+ *data = pmc_read_counter(pmc);
+ return 0;
+ }
+ /* MSR_EVNTSELn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+ if (pmc) {
+ *data = pmc->eventsel;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+
+ /* MSR_PERFCTRn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
+ if (pmc) {
+ pmc->counter += data - pmc_read_counter(pmc);
+ return 0;
+ }
+ /* MSR_EVNTSELn */
+ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
+ if (pmc) {
+ data &= ~pmu->reserved_bits;
+ if (data != pmc->eventsel)
+ reprogram_gp_counter(pmc, data);
+ return 0;
+ }
+
+ return 1;
+}
+
+static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
+ pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
+ else
+ pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
+
+ pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
+ pmu->reserved_bits = 0xffffffff00200000ull;
+ /* not applicable to AMD; but clean them to prevent any fall out */
+ pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+ pmu->nr_arch_fixed_counters = 0;
+ pmu->version = 0;
+ pmu->global_status = 0;
+}
+
+static void amd_pmu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int i;
+
+ BUILD_BUG_ON(AMD64_NUM_COUNTERS_CORE > INTEL_PMC_MAX_GENERIC);
+
+ for (i = 0; i < AMD64_NUM_COUNTERS_CORE ; i++) {
+ pmu->gp_counters[i].type = KVM_PMC_GP;
+ pmu->gp_counters[i].vcpu = vcpu;
+ pmu->gp_counters[i].idx = i;
+ }
+}
+
+static void amd_pmu_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int i;
+
+ for (i = 0; i < AMD64_NUM_COUNTERS_CORE; i++) {
+ struct kvm_pmc *pmc = &pmu->gp_counters[i];
+
+ pmc_stop_counter(pmc);
+ pmc->counter = pmc->eventsel = 0;
+ }
+}
+
+struct kvm_pmu_ops amd_pmu_ops = {
+ .find_arch_event = amd_find_arch_event,
+ .find_fixed_event = amd_find_fixed_event,
+ .pmc_is_enabled = amd_pmc_is_enabled,
+ .pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
+ .msr_idx_to_pmc = amd_msr_idx_to_pmc,
+ .is_valid_msr_idx = amd_is_valid_msr_idx,
+ .is_valid_msr = amd_is_valid_msr,
+ .get_msr = amd_pmu_get_msr,
+ .set_msr = amd_pmu_set_msr,
+ .refresh = amd_pmu_refresh,
+ .init = amd_pmu_init,
+ .reset = amd_pmu_reset,
+};
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c
new file mode 100644
index 000000000..611f9e60f
--- /dev/null
+++ b/arch/x86/kvm/pmu_intel.c
@@ -0,0 +1,374 @@
+/*
+ * KVM PMU support for Intel CPUs
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@redhat.com>
+ * Gleb Natapov <gleb@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include <asm/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "pmu.h"
+
+static struct kvm_event_hw_type_mapping intel_arch_events[] = {
+ /* Index must match CPUID 0x0A.EBX bit vector */
+ [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
+ [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
+ [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
+ [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
+ [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
+ [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+ [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+ [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES },
+};
+
+/* mapping between fixed pmc index and intel_arch_events array */
+static int fixed_pmc_events[] = {1, 0, 7};
+
+static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
+{
+ int i;
+
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+ u8 new_ctrl = fixed_ctrl_field(data, i);
+ u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i);
+ struct kvm_pmc *pmc;
+
+ pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
+
+ if (old_ctrl == new_ctrl)
+ continue;
+
+ reprogram_fixed_counter(pmc, new_ctrl, i);
+ }
+
+ pmu->fixed_ctr_ctrl = data;
+}
+
+/* function is called when global control register has been updated. */
+static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
+{
+ int bit;
+ u64 diff = pmu->global_ctrl ^ data;
+
+ pmu->global_ctrl = data;
+
+ for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
+ reprogram_counter(pmu, bit);
+}
+
+static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
+ u8 event_select,
+ u8 unit_mask)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++)
+ if (intel_arch_events[i].eventsel == event_select
+ && intel_arch_events[i].unit_mask == unit_mask
+ && (pmu->available_event_types & (1 << i)))
+ break;
+
+ if (i == ARRAY_SIZE(intel_arch_events))
+ return PERF_COUNT_HW_MAX;
+
+ return intel_arch_events[i].event_type;
+}
+
+static unsigned intel_find_fixed_event(int idx)
+{
+ u32 event;
+ size_t size = ARRAY_SIZE(fixed_pmc_events);
+
+ if (idx >= size)
+ return PERF_COUNT_HW_MAX;
+
+ event = fixed_pmc_events[array_index_nospec(idx, size)];
+ return intel_arch_events[event].event_type;
+}
+
+/* check if a PMC is enabled by comparing it with globl_ctrl bits. */
+static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
+}
+
+static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
+{
+ if (pmc_idx < INTEL_PMC_IDX_FIXED)
+ return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx,
+ MSR_P6_EVNTSEL0);
+ else {
+ u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED;
+
+ return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0);
+ }
+}
+
+/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
+static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ bool fixed = idx & (1u << 30);
+
+ idx &= ~(3u << 30);
+
+ return (!fixed && idx >= pmu->nr_arch_gp_counters) ||
+ (fixed && idx >= pmu->nr_arch_fixed_counters);
+}
+
+static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu,
+ unsigned idx, u64 *mask)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ bool fixed = idx & (1u << 30);
+ struct kvm_pmc *counters;
+ unsigned int num_counters;
+
+ idx &= ~(3u << 30);
+ if (fixed) {
+ counters = pmu->fixed_counters;
+ num_counters = pmu->nr_arch_fixed_counters;
+ } else {
+ counters = pmu->gp_counters;
+ num_counters = pmu->nr_arch_gp_counters;
+ }
+ if (idx >= num_counters)
+ return NULL;
+ *mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP];
+ return &counters[array_index_nospec(idx, num_counters)];
+}
+
+static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int ret;
+
+ switch (msr) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ ret = pmu->version > 1;
+ break;
+ default:
+ ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
+ get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
+ get_fixed_pmc(pmu, msr);
+ break;
+ }
+
+ return ret;
+}
+
+static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+
+ switch (msr) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ *data = pmu->fixed_ctr_ctrl;
+ return 0;
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ *data = pmu->global_status;
+ return 0;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ *data = pmu->global_ctrl;
+ return 0;
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ *data = pmu->global_ovf_ctrl;
+ return 0;
+ default:
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
+ u64 val = pmc_read_counter(pmc);
+ *data = val & pmu->counter_bitmask[KVM_PMC_GP];
+ return 0;
+ } else if ((pmc = get_fixed_pmc(pmu, msr))) {
+ u64 val = pmc_read_counter(pmc);
+ *data = val & pmu->counter_bitmask[KVM_PMC_FIXED];
+ return 0;
+ } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
+ *data = pmc->eventsel;
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+
+ switch (msr) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ if (pmu->fixed_ctr_ctrl == data)
+ return 0;
+ if (!(data & 0xfffffffffffff444ull)) {
+ reprogram_fixed_counters(pmu, data);
+ return 0;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ if (msr_info->host_initiated) {
+ pmu->global_status = data;
+ return 0;
+ }
+ break; /* RO MSR */
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (pmu->global_ctrl == data)
+ return 0;
+ if (!(data & pmu->global_ctrl_mask)) {
+ global_ctrl_changed(pmu, data);
+ return 0;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
+ if (!msr_info->host_initiated)
+ pmu->global_status &= ~data;
+ pmu->global_ovf_ctrl = data;
+ return 0;
+ }
+ break;
+ default:
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
+ if (msr_info->host_initiated)
+ pmc->counter = data;
+ else
+ pmc->counter = (s32)data;
+ return 0;
+ } else if ((pmc = get_fixed_pmc(pmu, msr))) {
+ pmc->counter = data;
+ return 0;
+ } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
+ if (data == pmc->eventsel)
+ return 0;
+ if (!(data & pmu->reserved_bits)) {
+ reprogram_gp_counter(pmc, data);
+ return 0;
+ }
+ }
+ }
+
+ return 1;
+}
+
+static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_cpuid_entry2 *entry;
+ union cpuid10_eax eax;
+ union cpuid10_edx edx;
+
+ pmu->nr_arch_gp_counters = 0;
+ pmu->nr_arch_fixed_counters = 0;
+ pmu->counter_bitmask[KVM_PMC_GP] = 0;
+ pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+ pmu->version = 0;
+ pmu->reserved_bits = 0xffffffff00200000ull;
+
+ entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
+ if (!entry)
+ return;
+ eax.full = entry->eax;
+ edx.full = entry->edx;
+
+ pmu->version = eax.split.version_id;
+ if (!pmu->version)
+ return;
+
+ pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
+ INTEL_PMC_MAX_GENERIC);
+ pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
+ pmu->available_event_types = ~entry->ebx &
+ ((1ull << eax.split.mask_length) - 1);
+
+ if (pmu->version == 1) {
+ pmu->nr_arch_fixed_counters = 0;
+ } else {
+ pmu->nr_arch_fixed_counters =
+ min_t(int, edx.split.num_counters_fixed,
+ INTEL_PMC_MAX_FIXED);
+ pmu->counter_bitmask[KVM_PMC_FIXED] =
+ ((u64)1 << edx.split.bit_width_fixed) - 1;
+ }
+
+ pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
+ (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
+ pmu->global_ctrl_mask = ~pmu->global_ctrl;
+
+ entry = kvm_find_cpuid_entry(vcpu, 7, 0);
+ if (entry &&
+ (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
+ (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
+ pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
+}
+
+static void intel_pmu_init(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
+ pmu->gp_counters[i].type = KVM_PMC_GP;
+ pmu->gp_counters[i].vcpu = vcpu;
+ pmu->gp_counters[i].idx = i;
+ }
+
+ for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
+ pmu->fixed_counters[i].type = KVM_PMC_FIXED;
+ pmu->fixed_counters[i].vcpu = vcpu;
+ pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
+ }
+}
+
+static void intel_pmu_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int i;
+
+ for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
+ struct kvm_pmc *pmc = &pmu->gp_counters[i];
+
+ pmc_stop_counter(pmc);
+ pmc->counter = pmc->eventsel = 0;
+ }
+
+ for (i = 0; i < INTEL_PMC_MAX_FIXED; i++)
+ pmc_stop_counter(&pmu->fixed_counters[i]);
+
+ pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
+ pmu->global_ovf_ctrl = 0;
+}
+
+struct kvm_pmu_ops intel_pmu_ops = {
+ .find_arch_event = intel_find_arch_event,
+ .find_fixed_event = intel_find_fixed_event,
+ .pmc_is_enabled = intel_pmc_is_enabled,
+ .pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
+ .msr_idx_to_pmc = intel_msr_idx_to_pmc,
+ .is_valid_msr_idx = intel_is_valid_msr_idx,
+ .is_valid_msr = intel_is_valid_msr,
+ .get_msr = intel_pmu_get_msr,
+ .set_msr = intel_pmu_set_msr,
+ .refresh = intel_pmu_refresh,
+ .init = intel_pmu_init,
+ .reset = intel_pmu_reset,
+};
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
new file mode 100644
index 000000000..851814574
--- /dev/null
+++ b/arch/x86/kvm/svm.c
@@ -0,0 +1,7323 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * AMD SVM support
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#define pr_fmt(fmt) "SVM: " fmt
+
+#include <linux/kvm_host.h>
+
+#include "irq.h"
+#include "mmu.h"
+#include "kvm_cache_regs.h"
+#include "x86.h"
+#include "cpuid.h"
+#include "pmu.h"
+
+#include <linux/module.h>
+#include <linux/mod_devicetable.h>
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+#include <linux/trace_events.h>
+#include <linux/slab.h>
+#include <linux/amd-iommu.h>
+#include <linux/hashtable.h>
+#include <linux/frame.h>
+#include <linux/psp-sev.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+
+#include <asm/apic.h>
+#include <asm/perf_event.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+#include <asm/debugreg.h>
+#include <asm/kvm_para.h>
+#include <asm/irq_remapping.h>
+#include <asm/spec-ctrl.h>
+
+#include <asm/virtext.h>
+#include "trace.h"
+
+#define __ex(x) __kvm_handle_fault_on_reboot(x)
+
+MODULE_AUTHOR("Qumranet");
+MODULE_LICENSE("GPL");
+
+static const struct x86_cpu_id svm_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_SVM),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
+
+#define IOPM_ALLOC_ORDER 2
+#define MSRPM_ALLOC_ORDER 1
+
+#define SEG_TYPE_LDT 2
+#define SEG_TYPE_BUSY_TSS16 3
+
+#define SVM_FEATURE_NPT (1 << 0)
+#define SVM_FEATURE_LBRV (1 << 1)
+#define SVM_FEATURE_SVML (1 << 2)
+#define SVM_FEATURE_NRIP (1 << 3)
+#define SVM_FEATURE_TSC_RATE (1 << 4)
+#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
+#define SVM_FEATURE_FLUSH_ASID (1 << 6)
+#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
+#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
+
+#define SVM_AVIC_DOORBELL 0xc001011b
+
+#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
+#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
+#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */
+
+#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+
+#define TSC_RATIO_RSVD 0xffffff0000000000ULL
+#define TSC_RATIO_MIN 0x0000000000000001ULL
+#define TSC_RATIO_MAX 0x000000ffffffffffULL
+
+#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
+
+/*
+ * 0xff is broadcast, so the max index allowed for physical APIC ID
+ * table is 0xfe. APIC IDs above 0xff are reserved.
+ */
+#define AVIC_MAX_PHYSICAL_ID_COUNT 255
+
+#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1
+#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
+#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
+
+/* AVIC GATAG is encoded using VM and VCPU IDs */
+#define AVIC_VCPU_ID_BITS 8
+#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1)
+
+#define AVIC_VM_ID_BITS 24
+#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS)
+#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1)
+
+#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
+ (y & AVIC_VCPU_ID_MASK))
+#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
+#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)
+
+static bool erratum_383_found __read_mostly;
+
+static const u32 host_save_user_msrs[] = {
+#ifdef CONFIG_X86_64
+ MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
+ MSR_FS_BASE,
+#endif
+ MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+ MSR_TSC_AUX,
+};
+
+#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
+
+struct kvm_sev_info {
+ bool active; /* SEV enabled guest */
+ unsigned int asid; /* ASID used for this guest */
+ unsigned int handle; /* SEV firmware handle */
+ int fd; /* SEV device fd */
+ unsigned long pages_locked; /* Number of pages locked */
+ struct list_head regions_list; /* List of registered regions */
+};
+
+struct kvm_svm {
+ struct kvm kvm;
+
+ /* Struct members for AVIC */
+ u32 avic_vm_id;
+ u32 ldr_mode;
+ struct page *avic_logical_id_table_page;
+ struct page *avic_physical_id_table_page;
+ struct hlist_node hnode;
+
+ struct kvm_sev_info sev_info;
+};
+
+struct kvm_vcpu;
+
+struct nested_state {
+ struct vmcb *hsave;
+ u64 hsave_msr;
+ u64 vm_cr_msr;
+ u64 vmcb;
+
+ /* These are the merged vectors */
+ u32 *msrpm;
+
+ /* gpa pointers to the real vectors */
+ u64 vmcb_msrpm;
+ u64 vmcb_iopm;
+
+ /* A VMEXIT is required but not yet emulated */
+ bool exit_required;
+
+ /* cache for intercepts of the guest */
+ u32 intercept_cr;
+ u32 intercept_dr;
+ u32 intercept_exceptions;
+ u64 intercept;
+
+ /* Nested Paging related state */
+ u64 nested_cr3;
+};
+
+#define MSRPM_OFFSETS 16
+static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
+
+/*
+ * Set osvw_len to higher value when updated Revision Guides
+ * are published and we know what the new status bits are
+ */
+static uint64_t osvw_len = 4, osvw_status;
+
+struct vcpu_svm {
+ struct kvm_vcpu vcpu;
+ struct vmcb *vmcb;
+ unsigned long vmcb_pa;
+ struct svm_cpu_data *svm_data;
+ uint64_t asid_generation;
+ uint64_t sysenter_esp;
+ uint64_t sysenter_eip;
+ uint64_t tsc_aux;
+
+ u64 msr_decfg;
+
+ u64 next_rip;
+
+ u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
+ struct {
+ u16 fs;
+ u16 gs;
+ u16 ldt;
+ u64 gs_base;
+ } host;
+
+ u64 spec_ctrl;
+ /*
+ * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
+ * translated into the appropriate L2_CFG bits on the host to
+ * perform speculative control.
+ */
+ u64 virt_spec_ctrl;
+
+ u32 *msrpm;
+
+ ulong nmi_iret_rip;
+
+ struct nested_state nested;
+
+ bool nmi_singlestep;
+ u64 nmi_singlestep_guest_rflags;
+
+ unsigned int3_injected;
+ unsigned long int3_rip;
+
+ /* cached guest cpuid flags for faster access */
+ bool nrips_enabled : 1;
+
+ u32 ldr_reg;
+ struct page *avic_backing_page;
+ u64 *avic_physical_id_cache;
+ bool avic_is_running;
+
+ /*
+ * Per-vcpu list of struct amd_svm_iommu_ir:
+ * This is used mainly to store interrupt remapping information used
+ * when update the vcpu affinity. This avoids the need to scan for
+ * IRTE and try to match ga_tag in the IOMMU driver.
+ */
+ struct list_head ir_list;
+ spinlock_t ir_list_lock;
+
+ /* which host CPU was used for running this vcpu */
+ unsigned int last_cpu;
+};
+
+/*
+ * This is a wrapper of struct amd_iommu_ir_data.
+ */
+struct amd_svm_iommu_ir {
+ struct list_head node; /* Used by SVM for per-vcpu ir_list */
+ void *data; /* Storing pointer to struct amd_ir_data */
+};
+
+#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
+#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
+
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
+#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
+#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
+#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
+
+static DEFINE_PER_CPU(u64, current_tsc_ratio);
+#define TSC_RATIO_DEFAULT 0x0100000000ULL
+
+#define MSR_INVALID 0xffffffffU
+
+static const struct svm_direct_access_msrs {
+ u32 index; /* Index of the MSR */
+ bool always; /* True if intercept is always on */
+} direct_access_msrs[] = {
+ { .index = MSR_STAR, .always = true },
+ { .index = MSR_IA32_SYSENTER_CS, .always = true },
+#ifdef CONFIG_X86_64
+ { .index = MSR_GS_BASE, .always = true },
+ { .index = MSR_FS_BASE, .always = true },
+ { .index = MSR_KERNEL_GS_BASE, .always = true },
+ { .index = MSR_LSTAR, .always = true },
+ { .index = MSR_CSTAR, .always = true },
+ { .index = MSR_SYSCALL_MASK, .always = true },
+#endif
+ { .index = MSR_IA32_SPEC_CTRL, .always = false },
+ { .index = MSR_IA32_PRED_CMD, .always = false },
+ { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
+ { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
+ { .index = MSR_IA32_LASTINTFROMIP, .always = false },
+ { .index = MSR_IA32_LASTINTTOIP, .always = false },
+ { .index = MSR_INVALID, .always = false },
+};
+
+/* enable NPT for AMD64 and X86 with PAE */
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+static bool npt_enabled = true;
+#else
+static bool npt_enabled;
+#endif
+
+/*
+ * These 2 parameters are used to config the controls for Pause-Loop Exiting:
+ * pause_filter_count: On processors that support Pause filtering(indicated
+ * by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
+ * count value. On VMRUN this value is loaded into an internal counter.
+ * Each time a pause instruction is executed, this counter is decremented
+ * until it reaches zero at which time a #VMEXIT is generated if pause
+ * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
+ * Intercept Filtering for more details.
+ * This also indicate if ple logic enabled.
+ *
+ * pause_filter_thresh: In addition, some processor families support advanced
+ * pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on
+ * the amount of time a guest is allowed to execute in a pause loop.
+ * In this mode, a 16-bit pause filter threshold field is added in the
+ * VMCB. The threshold value is a cycle count that is used to reset the
+ * pause counter. As with simple pause filtering, VMRUN loads the pause
+ * count value from VMCB into an internal counter. Then, on each pause
+ * instruction the hardware checks the elapsed number of cycles since
+ * the most recent pause instruction against the pause filter threshold.
+ * If the elapsed cycle count is greater than the pause filter threshold,
+ * then the internal pause count is reloaded from the VMCB and execution
+ * continues. If the elapsed cycle count is less than the pause filter
+ * threshold, then the internal pause count is decremented. If the count
+ * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
+ * triggered. If advanced pause filtering is supported and pause filter
+ * threshold field is set to zero, the filter will operate in the simpler,
+ * count only mode.
+ */
+
+static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
+module_param(pause_filter_thresh, ushort, 0444);
+
+static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
+module_param(pause_filter_count, ushort, 0444);
+
+/* Default doubles per-vcpu window every exit. */
+static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
+module_param(pause_filter_count_grow, ushort, 0444);
+
+/* Default resets per-vcpu window every exit to pause_filter_count. */
+static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
+module_param(pause_filter_count_shrink, ushort, 0444);
+
+/* Default is to compute the maximum so we can never overflow. */
+static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
+module_param(pause_filter_count_max, ushort, 0444);
+
+/* allow nested paging (virtualized MMU) for all guests */
+static int npt = true;
+module_param(npt, int, S_IRUGO);
+
+/* allow nested virtualization in KVM/SVM */
+static int nested = true;
+module_param(nested, int, S_IRUGO);
+
+/* enable / disable AVIC */
+static int avic;
+#ifdef CONFIG_X86_LOCAL_APIC
+module_param(avic, int, S_IRUGO);
+#endif
+
+/* enable/disable Virtual VMLOAD VMSAVE */
+static int vls = true;
+module_param(vls, int, 0444);
+
+/* enable/disable Virtual GIF */
+static int vgif = true;
+module_param(vgif, int, 0444);
+
+/* enable/disable SEV support */
+static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
+module_param(sev, int, 0444);
+
+static u8 rsm_ins_bytes[] = "\x0f\xaa";
+
+static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
+static void svm_complete_interrupts(struct vcpu_svm *svm);
+
+static int nested_svm_exit_handled(struct vcpu_svm *svm);
+static int nested_svm_intercept(struct vcpu_svm *svm);
+static int nested_svm_vmexit(struct vcpu_svm *svm);
+static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
+ bool has_error_code, u32 error_code);
+
+enum {
+ VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
+ pause filter count */
+ VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
+ VMCB_ASID, /* ASID */
+ VMCB_INTR, /* int_ctl, int_vector */
+ VMCB_NPT, /* npt_en, nCR3, gPAT */
+ VMCB_CR, /* CR0, CR3, CR4, EFER */
+ VMCB_DR, /* DR6, DR7 */
+ VMCB_DT, /* GDT, IDT */
+ VMCB_SEG, /* CS, DS, SS, ES, CPL */
+ VMCB_CR2, /* CR2 only */
+ VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
+ VMCB_AVIC, /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
+ * AVIC PHYSICAL_TABLE pointer,
+ * AVIC LOGICAL_TABLE pointer
+ */
+ VMCB_DIRTY_MAX,
+};
+
+/* TPR and CR2 are always written before VMRUN */
+#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
+
+#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
+
+static unsigned int max_sev_asid;
+static unsigned int min_sev_asid;
+static unsigned long *sev_asid_bitmap;
+#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
+
+struct enc_region {
+ struct list_head list;
+ unsigned long npages;
+ struct page **pages;
+ unsigned long uaddr;
+ unsigned long size;
+};
+
+
+static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
+{
+ return container_of(kvm, struct kvm_svm, kvm);
+}
+
+static inline bool svm_sev_enabled(void)
+{
+ return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? max_sev_asid : 0;
+}
+
+static inline bool sev_guest(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_AMD_SEV
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return sev->active;
+#else
+ return false;
+#endif
+}
+
+static inline int sev_get_asid(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return sev->asid;
+}
+
+static inline void mark_all_dirty(struct vmcb *vmcb)
+{
+ vmcb->control.clean = 0;
+}
+
+static inline void mark_all_clean(struct vmcb *vmcb)
+{
+ vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
+ & ~VMCB_ALWAYS_DIRTY_MASK;
+}
+
+static inline void mark_dirty(struct vmcb *vmcb, int bit)
+{
+ vmcb->control.clean &= ~(1 << bit);
+}
+
+static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
+{
+ return container_of(vcpu, struct vcpu_svm, vcpu);
+}
+
+static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
+{
+ svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
+ mark_dirty(svm->vmcb, VMCB_AVIC);
+}
+
+static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 *entry = svm->avic_physical_id_cache;
+
+ if (!entry)
+ return false;
+
+ return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+}
+
+static void recalc_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *c, *h;
+ struct nested_state *g;
+
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ if (!is_guest_mode(&svm->vcpu))
+ return;
+
+ c = &svm->vmcb->control;
+ h = &svm->nested.hsave->control;
+ g = &svm->nested;
+
+ c->intercept_cr = h->intercept_cr | g->intercept_cr;
+ c->intercept_dr = h->intercept_dr | g->intercept_dr;
+ c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
+ c->intercept = h->intercept | g->intercept;
+
+ c->intercept |= (1ULL << INTERCEPT_VMLOAD);
+ c->intercept |= (1ULL << INTERCEPT_VMSAVE);
+}
+
+static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
+{
+ if (is_guest_mode(&svm->vcpu))
+ return svm->nested.hsave;
+ else
+ return svm->vmcb;
+}
+
+static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_cr |= (1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_cr &= ~(1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ return vmcb->control.intercept_cr & (1U << bit);
+}
+
+static inline void set_dr_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
+ | (1 << INTERCEPT_DR1_READ)
+ | (1 << INTERCEPT_DR2_READ)
+ | (1 << INTERCEPT_DR3_READ)
+ | (1 << INTERCEPT_DR4_READ)
+ | (1 << INTERCEPT_DR5_READ)
+ | (1 << INTERCEPT_DR6_READ)
+ | (1 << INTERCEPT_DR7_READ)
+ | (1 << INTERCEPT_DR0_WRITE)
+ | (1 << INTERCEPT_DR1_WRITE)
+ | (1 << INTERCEPT_DR2_WRITE)
+ | (1 << INTERCEPT_DR3_WRITE)
+ | (1 << INTERCEPT_DR4_WRITE)
+ | (1 << INTERCEPT_DR5_WRITE)
+ | (1 << INTERCEPT_DR6_WRITE)
+ | (1 << INTERCEPT_DR7_WRITE);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_dr_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_dr = 0;
+
+ recalc_intercepts(svm);
+}
+
+static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_exceptions |= (1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_exceptions &= ~(1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void set_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept |= (1ULL << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept &= ~(1ULL << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline bool vgif_enabled(struct vcpu_svm *svm)
+{
+ return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
+}
+
+static inline void enable_gif(struct vcpu_svm *svm)
+{
+ if (vgif_enabled(svm))
+ svm->vmcb->control.int_ctl |= V_GIF_MASK;
+ else
+ svm->vcpu.arch.hflags |= HF_GIF_MASK;
+}
+
+static inline void disable_gif(struct vcpu_svm *svm)
+{
+ if (vgif_enabled(svm))
+ svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
+ else
+ svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+}
+
+static inline bool gif_set(struct vcpu_svm *svm)
+{
+ if (vgif_enabled(svm))
+ return !!(svm->vmcb->control.int_ctl & V_GIF_MASK);
+ else
+ return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
+}
+
+static unsigned long iopm_base;
+
+struct kvm_ldttss_desc {
+ u16 limit0;
+ u16 base0;
+ unsigned base1:8, type:5, dpl:2, p:1;
+ unsigned limit1:4, zero0:3, g:1, base2:8;
+ u32 base3;
+ u32 zero1;
+} __attribute__((packed));
+
+struct svm_cpu_data {
+ int cpu;
+
+ u64 asid_generation;
+ u32 max_asid;
+ u32 next_asid;
+ u32 min_asid;
+ struct kvm_ldttss_desc *tss_desc;
+
+ struct page *save_area;
+ struct vmcb *current_vmcb;
+
+ /* index = sev_asid, value = vmcb pointer */
+ struct vmcb **sev_vmcbs;
+};
+
+static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
+
+struct svm_init_data {
+ int cpu;
+ int r;
+};
+
+static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
+
+#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
+#define MSRS_RANGE_SIZE 2048
+#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
+
+static u32 svm_msrpm_offset(u32 msr)
+{
+ u32 offset;
+ int i;
+
+ for (i = 0; i < NUM_MSR_MAPS; i++) {
+ if (msr < msrpm_ranges[i] ||
+ msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
+ continue;
+
+ offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
+ offset += (i * MSRS_RANGE_SIZE); /* add range offset */
+
+ /* Now we have the u8 offset - but need the u32 offset */
+ return offset / 4;
+ }
+
+ /* MSR not in any range */
+ return MSR_INVALID;
+}
+
+#define MAX_INST_SIZE 15
+
+static inline void clgi(void)
+{
+ asm volatile (__ex(SVM_CLGI));
+}
+
+static inline void stgi(void)
+{
+ asm volatile (__ex(SVM_STGI));
+}
+
+static inline void invlpga(unsigned long addr, u32 asid)
+{
+ asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
+}
+
+static int get_npt_level(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return PT64_ROOT_4LEVEL;
+#else
+ return PT32E_ROOT_LEVEL;
+#endif
+}
+
+static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ vcpu->arch.efer = efer;
+
+ if (!npt_enabled) {
+ /* Shadow paging assumes NX to be available. */
+ efer |= EFER_NX;
+
+ if (!(efer & EFER_LMA))
+ efer &= ~EFER_LME;
+ }
+
+ to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
+ mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
+}
+
+static int is_external_interrupt(u32 info)
+{
+ info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
+ return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
+}
+
+static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 ret = 0;
+
+ if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
+ ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
+ return ret;
+}
+
+static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (mask == 0)
+ svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
+ else
+ svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
+
+}
+
+static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (svm->vmcb->control.next_rip != 0) {
+ WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
+ svm->next_rip = svm->vmcb->control.next_rip;
+ }
+
+ if (!svm->next_rip) {
+ if (kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) !=
+ EMULATE_DONE)
+ printk(KERN_DEBUG "%s: NOP\n", __func__);
+ return;
+ }
+ if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
+ printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
+ __func__, kvm_rip_read(vcpu), svm->next_rip);
+
+ kvm_rip_write(vcpu, svm->next_rip);
+ svm_set_interrupt_shadow(vcpu, 0);
+}
+
+static void svm_queue_exception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned nr = vcpu->arch.exception.nr;
+ bool has_error_code = vcpu->arch.exception.has_error_code;
+ bool reinject = vcpu->arch.exception.injected;
+ u32 error_code = vcpu->arch.exception.error_code;
+
+ /*
+ * If we are within a nested VM we'd better #VMEXIT and let the guest
+ * handle the exception
+ */
+ if (!reinject &&
+ nested_svm_check_exception(svm, nr, has_error_code, error_code))
+ return;
+
+ if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
+ unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
+
+ /*
+ * For guest debugging where we have to reinject #BP if some
+ * INT3 is guest-owned:
+ * Emulate nRIP by moving RIP forward. Will fail if injection
+ * raises a fault that is not intercepted. Still better than
+ * failing in all cases.
+ */
+ skip_emulated_instruction(&svm->vcpu);
+ rip = kvm_rip_read(&svm->vcpu);
+ svm->int3_rip = rip + svm->vmcb->save.cs.base;
+ svm->int3_injected = rip - old_rip;
+ }
+
+ svm->vmcb->control.event_inj = nr
+ | SVM_EVTINJ_VALID
+ | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+ | SVM_EVTINJ_TYPE_EXEPT;
+ svm->vmcb->control.event_inj_err = error_code;
+}
+
+static void svm_init_erratum_383(void)
+{
+ u32 low, high;
+ int err;
+ u64 val;
+
+ if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
+ return;
+
+ /* Use _safe variants to not break nested virtualization */
+ val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
+ if (err)
+ return;
+
+ val |= (1ULL << 47);
+
+ low = lower_32_bits(val);
+ high = upper_32_bits(val);
+
+ native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
+
+ erratum_383_found = true;
+}
+
+static void svm_init_osvw(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Guests should see errata 400 and 415 as fixed (assuming that
+ * HLT and IO instructions are intercepted).
+ */
+ vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
+ vcpu->arch.osvw.status = osvw_status & ~(6ULL);
+
+ /*
+ * By increasing VCPU's osvw.length to 3 we are telling the guest that
+ * all osvw.status bits inside that length, including bit 0 (which is
+ * reserved for erratum 298), are valid. However, if host processor's
+ * osvw_len is 0 then osvw_status[0] carries no information. We need to
+ * be conservative here and therefore we tell the guest that erratum 298
+ * is present (because we really don't know).
+ */
+ if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
+ vcpu->arch.osvw.status |= 1;
+}
+
+static int has_svm(void)
+{
+ const char *msg;
+
+ if (!cpu_has_svm(&msg)) {
+ printk(KERN_INFO "has_svm: %s\n", msg);
+ return 0;
+ }
+
+ if (sev_active()) {
+ pr_info("KVM is unsupported when running as an SEV guest\n");
+ return 0;
+ }
+
+ return 1;
+}
+
+static void svm_hardware_disable(void)
+{
+ /* Make sure we clean up behind us */
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
+ wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
+
+ cpu_svm_disable();
+
+ amd_pmu_disable_virt();
+}
+
+static int svm_hardware_enable(void)
+{
+
+ struct svm_cpu_data *sd;
+ uint64_t efer;
+ struct desc_struct *gdt;
+ int me = raw_smp_processor_id();
+
+ rdmsrl(MSR_EFER, efer);
+ if (efer & EFER_SVME)
+ return -EBUSY;
+
+ if (!has_svm()) {
+ pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
+ return -EINVAL;
+ }
+ sd = per_cpu(svm_data, me);
+ if (!sd) {
+ pr_err("%s: svm_data is NULL on %d\n", __func__, me);
+ return -EINVAL;
+ }
+
+ sd->asid_generation = 1;
+ sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
+ sd->next_asid = sd->max_asid + 1;
+ sd->min_asid = max_sev_asid + 1;
+
+ gdt = get_current_gdt_rw();
+ sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
+
+ wrmsrl(MSR_EFER, efer | EFER_SVME);
+
+ wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
+
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
+ __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
+ }
+
+
+ /*
+ * Get OSVW bits.
+ *
+ * Note that it is possible to have a system with mixed processor
+ * revisions and therefore different OSVW bits. If bits are not the same
+ * on different processors then choose the worst case (i.e. if erratum
+ * is present on one processor and not on another then assume that the
+ * erratum is present everywhere).
+ */
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
+ uint64_t len, status = 0;
+ int err;
+
+ len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
+ if (!err)
+ status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
+ &err);
+
+ if (err)
+ osvw_status = osvw_len = 0;
+ else {
+ if (len < osvw_len)
+ osvw_len = len;
+ osvw_status |= status;
+ osvw_status &= (1ULL << osvw_len) - 1;
+ }
+ } else
+ osvw_status = osvw_len = 0;
+
+ svm_init_erratum_383();
+
+ amd_pmu_enable_virt();
+
+ return 0;
+}
+
+static void svm_cpu_uninit(int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
+
+ if (!sd)
+ return;
+
+ per_cpu(svm_data, raw_smp_processor_id()) = NULL;
+ kfree(sd->sev_vmcbs);
+ __free_page(sd->save_area);
+ kfree(sd);
+}
+
+static int svm_cpu_init(int cpu)
+{
+ struct svm_cpu_data *sd;
+
+ sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
+ if (!sd)
+ return -ENOMEM;
+ sd->cpu = cpu;
+ sd->save_area = alloc_page(GFP_KERNEL);
+ if (!sd->save_area)
+ goto free_cpu_data;
+
+ if (svm_sev_enabled()) {
+ sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1,
+ sizeof(void *),
+ GFP_KERNEL);
+ if (!sd->sev_vmcbs)
+ goto free_save_area;
+ }
+
+ per_cpu(svm_data, cpu) = sd;
+
+ return 0;
+
+free_save_area:
+ __free_page(sd->save_area);
+free_cpu_data:
+ kfree(sd);
+ return -ENOMEM;
+
+}
+
+static bool valid_msr_intercept(u32 index)
+{
+ int i;
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
+ if (direct_access_msrs[i].index == index)
+ return true;
+
+ return false;
+}
+
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
+{
+ u8 bit_write;
+ unsigned long tmp;
+ u32 offset;
+ u32 *msrpm;
+
+ msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
+ to_svm(vcpu)->msrpm;
+
+ offset = svm_msrpm_offset(msr);
+ bit_write = 2 * (msr & 0x0f) + 1;
+ tmp = msrpm[offset];
+
+ BUG_ON(offset == MSR_INVALID);
+
+ return !!test_bit(bit_write, &tmp);
+}
+
+static void set_msr_interception(u32 *msrpm, unsigned msr,
+ int read, int write)
+{
+ u8 bit_read, bit_write;
+ unsigned long tmp;
+ u32 offset;
+
+ /*
+ * If this warning triggers extend the direct_access_msrs list at the
+ * beginning of the file
+ */
+ WARN_ON(!valid_msr_intercept(msr));
+
+ offset = svm_msrpm_offset(msr);
+ bit_read = 2 * (msr & 0x0f);
+ bit_write = 2 * (msr & 0x0f) + 1;
+ tmp = msrpm[offset];
+
+ BUG_ON(offset == MSR_INVALID);
+
+ read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
+ write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
+
+ msrpm[offset] = tmp;
+}
+
+static void svm_vcpu_init_msrpm(u32 *msrpm)
+{
+ int i;
+
+ memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+ if (!direct_access_msrs[i].always)
+ continue;
+
+ set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
+ }
+}
+
+static void add_msr_offset(u32 offset)
+{
+ int i;
+
+ for (i = 0; i < MSRPM_OFFSETS; ++i) {
+
+ /* Offset already in list? */
+ if (msrpm_offsets[i] == offset)
+ return;
+
+ /* Slot used by another offset? */
+ if (msrpm_offsets[i] != MSR_INVALID)
+ continue;
+
+ /* Add offset to list */
+ msrpm_offsets[i] = offset;
+
+ return;
+ }
+
+ /*
+ * If this BUG triggers the msrpm_offsets table has an overflow. Just
+ * increase MSRPM_OFFSETS in this case.
+ */
+ BUG();
+}
+
+static void init_msrpm_offsets(void)
+{
+ int i;
+
+ memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
+
+ for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+ u32 offset;
+
+ offset = svm_msrpm_offset(direct_access_msrs[i].index);
+ BUG_ON(offset == MSR_INVALID);
+
+ add_msr_offset(offset);
+ }
+}
+
+static void svm_enable_lbrv(struct vcpu_svm *svm)
+{
+ u32 *msrpm = svm->msrpm;
+
+ svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
+ set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+}
+
+static void svm_disable_lbrv(struct vcpu_svm *svm)
+{
+ u32 *msrpm = svm->msrpm;
+
+ svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
+ set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
+ set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
+ set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
+ set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+}
+
+static void disable_nmi_singlestep(struct vcpu_svm *svm)
+{
+ svm->nmi_singlestep = false;
+
+ if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
+ /* Clear our flags if they were not set by the guest */
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+ svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+ svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
+ }
+}
+
+/* Note:
+ * This hash table is used to map VM_ID to a struct kvm_svm,
+ * when handling AMD IOMMU GALOG notification to schedule in
+ * a particular vCPU.
+ */
+#define SVM_VM_DATA_HASH_BITS 8
+static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+static u32 next_vm_id = 0;
+static bool next_vm_id_wrapped = 0;
+static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
+
+/* Note:
+ * This function is called from IOMMU driver to notify
+ * SVM to schedule in a particular vCPU of a particular VM.
+ */
+static int avic_ga_log_notifier(u32 ga_tag)
+{
+ unsigned long flags;
+ struct kvm_svm *kvm_svm;
+ struct kvm_vcpu *vcpu = NULL;
+ u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
+ u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
+
+ pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
+ if (kvm_svm->avic_vm_id != vm_id)
+ continue;
+ vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
+ break;
+ }
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+ /* Note:
+ * At this point, the IOMMU should have already set the pending
+ * bit in the vAPIC backing page. So, we just need to schedule
+ * in the vcpu.
+ */
+ if (vcpu)
+ kvm_vcpu_wake_up(vcpu);
+
+ return 0;
+}
+
+static __init int sev_hardware_setup(void)
+{
+ struct sev_user_data_status *status;
+ int rc;
+
+ /* Maximum number of encrypted guests supported simultaneously */
+ max_sev_asid = cpuid_ecx(0x8000001F);
+
+ if (!max_sev_asid)
+ return 1;
+
+ /* Minimum ASID value that should be used for SEV guest */
+ min_sev_asid = cpuid_edx(0x8000001F);
+
+ /* Initialize SEV ASID bitmap */
+ sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
+ if (!sev_asid_bitmap)
+ return 1;
+
+ status = kmalloc(sizeof(*status), GFP_KERNEL);
+ if (!status)
+ return 1;
+
+ /*
+ * Check SEV platform status.
+ *
+ * PLATFORM_STATUS can be called in any state, if we failed to query
+ * the PLATFORM status then either PSP firmware does not support SEV
+ * feature or SEV firmware is dead.
+ */
+ rc = sev_platform_status(status, NULL);
+ if (rc)
+ goto err;
+
+ pr_info("SEV supported\n");
+
+err:
+ kfree(status);
+ return rc;
+}
+
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ control->pause_filter_count = __grow_ple_window(old,
+ pause_filter_count,
+ pause_filter_count_grow,
+ pause_filter_count_max);
+
+ if (control->pause_filter_count != old)
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ trace_kvm_ple_window_grow(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ control->pause_filter_count =
+ __shrink_ple_window(old,
+ pause_filter_count,
+ pause_filter_count_shrink,
+ pause_filter_count);
+ if (control->pause_filter_count != old)
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ trace_kvm_ple_window_shrink(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+}
+
+/*
+ * The default MMIO mask is a single bit (excluding the present bit),
+ * which could conflict with the memory encryption bit. Check for
+ * memory encryption support and override the default MMIO mask if
+ * memory encryption is enabled.
+ */
+static __init void svm_adjust_mmio_mask(void)
+{
+ unsigned int enc_bit, mask_bit;
+ u64 msr, mask;
+
+ /* If there is no memory encryption support, use existing mask */
+ if (cpuid_eax(0x80000000) < 0x8000001f)
+ return;
+
+ /* If memory encryption is not enabled, use existing mask */
+ rdmsrl(MSR_K8_SYSCFG, msr);
+ if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ return;
+
+ enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
+ mask_bit = boot_cpu_data.x86_phys_bits;
+
+ /* Increment the mask bit if it is the same as the encryption bit */
+ if (enc_bit == mask_bit)
+ mask_bit++;
+
+ /*
+ * If the mask bit location is below 52, then some bits above the
+ * physical addressing limit will always be reserved, so use the
+ * rsvd_bits() function to generate the mask. This mask, along with
+ * the present bit, will be used to generate a page fault with
+ * PFER.RSV = 1.
+ *
+ * If the mask bit location is 52 (or above), then clear the mask.
+ */
+ mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask);
+}
+
+static __init int svm_hardware_setup(void)
+{
+ int cpu;
+ struct page *iopm_pages;
+ void *iopm_va;
+ int r;
+
+ iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
+
+ if (!iopm_pages)
+ return -ENOMEM;
+
+ iopm_va = page_address(iopm_pages);
+ memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
+ iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+
+ init_msrpm_offsets();
+
+ if (boot_cpu_has(X86_FEATURE_NX))
+ kvm_enable_efer_bits(EFER_NX);
+
+ if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
+ kvm_enable_efer_bits(EFER_FFXSR);
+
+ if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ kvm_has_tsc_control = true;
+ kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
+ kvm_tsc_scaling_ratio_frac_bits = 32;
+ }
+
+ /* Check for pause filtering support */
+ if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
+ pause_filter_count = 0;
+ pause_filter_thresh = 0;
+ } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
+ pause_filter_thresh = 0;
+ }
+
+ if (nested) {
+ printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
+ kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
+ }
+
+ if (sev) {
+ if (boot_cpu_has(X86_FEATURE_SEV) &&
+ IS_ENABLED(CONFIG_KVM_AMD_SEV)) {
+ r = sev_hardware_setup();
+ if (r)
+ sev = false;
+ } else {
+ sev = false;
+ }
+ }
+
+ svm_adjust_mmio_mask();
+
+ for_each_possible_cpu(cpu) {
+ r = svm_cpu_init(cpu);
+ if (r)
+ goto err;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_NPT))
+ npt_enabled = false;
+
+ if (npt_enabled && !npt) {
+ printk(KERN_INFO "kvm: Nested Paging disabled\n");
+ npt_enabled = false;
+ }
+
+ if (npt_enabled) {
+ printk(KERN_INFO "kvm: Nested Paging enabled\n");
+ kvm_enable_tdp();
+ } else
+ kvm_disable_tdp();
+
+ if (avic) {
+ if (!npt_enabled ||
+ !boot_cpu_has(X86_FEATURE_AVIC) ||
+ !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
+ avic = false;
+ } else {
+ pr_info("AVIC enabled\n");
+
+ amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+ }
+ }
+
+ if (vls) {
+ if (!npt_enabled ||
+ !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
+ !IS_ENABLED(CONFIG_X86_64)) {
+ vls = false;
+ } else {
+ pr_info("Virtual VMLOAD VMSAVE supported\n");
+ }
+ }
+
+ vgif = false; /* Disabled for CVE-2021-3653 */
+
+ return 0;
+
+err:
+ __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
+ iopm_base = 0;
+ return r;
+}
+
+static __exit void svm_hardware_unsetup(void)
+{
+ int cpu;
+
+ if (svm_sev_enabled())
+ bitmap_free(sev_asid_bitmap);
+
+ for_each_possible_cpu(cpu)
+ svm_cpu_uninit(cpu);
+
+ __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
+ iopm_base = 0;
+}
+
+static void init_seg(struct vmcb_seg *seg)
+{
+ seg->selector = 0;
+ seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
+ SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
+ seg->limit = 0xffff;
+ seg->base = 0;
+}
+
+static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
+{
+ seg->selector = 0;
+ seg->attrib = SVM_SELECTOR_P_MASK | type;
+ seg->limit = 0xffff;
+ seg->base = 0;
+}
+
+static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (is_guest_mode(vcpu))
+ return svm->nested.hsave->control.tsc_offset;
+
+ return vcpu->arch.tsc_offset;
+}
+
+static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 g_tsc_offset = 0;
+
+ if (is_guest_mode(vcpu)) {
+ /* Write L1's TSC offset. */
+ g_tsc_offset = svm->vmcb->control.tsc_offset -
+ svm->nested.hsave->control.tsc_offset;
+ svm->nested.hsave->control.tsc_offset = offset;
+ } else
+ trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+ svm->vmcb->control.tsc_offset,
+ offset);
+
+ svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
+
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ return svm->vmcb->control.tsc_offset;
+}
+
+static void avic_init_vmcb(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = svm->vmcb;
+ struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
+ phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
+ phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
+ phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
+
+ vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
+ vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
+ vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
+ vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
+ vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+}
+
+static void init_vmcb(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct vmcb_save_area *save = &svm->vmcb->save;
+
+ svm->vcpu.arch.hflags = 0;
+
+ set_cr_intercept(svm, INTERCEPT_CR0_READ);
+ set_cr_intercept(svm, INTERCEPT_CR3_READ);
+ set_cr_intercept(svm, INTERCEPT_CR4_READ);
+ set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+ set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
+ set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
+ if (!kvm_vcpu_apicv_active(&svm->vcpu))
+ set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ set_dr_intercepts(svm);
+
+ set_exception_intercept(svm, PF_VECTOR);
+ set_exception_intercept(svm, UD_VECTOR);
+ set_exception_intercept(svm, MC_VECTOR);
+ set_exception_intercept(svm, AC_VECTOR);
+ set_exception_intercept(svm, DB_VECTOR);
+ /*
+ * Guest access to VMware backdoor ports could legitimately
+ * trigger #GP because of TSS I/O permission bitmap.
+ * We intercept those #GP and allow access to them anyway
+ * as VMware does.
+ */
+ if (enable_vmware_backdoor)
+ set_exception_intercept(svm, GP_VECTOR);
+
+ set_intercept(svm, INTERCEPT_INTR);
+ set_intercept(svm, INTERCEPT_NMI);
+ set_intercept(svm, INTERCEPT_SMI);
+ set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+ set_intercept(svm, INTERCEPT_RDPMC);
+ set_intercept(svm, INTERCEPT_CPUID);
+ set_intercept(svm, INTERCEPT_INVD);
+ set_intercept(svm, INTERCEPT_INVLPG);
+ set_intercept(svm, INTERCEPT_INVLPGA);
+ set_intercept(svm, INTERCEPT_IOIO_PROT);
+ set_intercept(svm, INTERCEPT_MSR_PROT);
+ set_intercept(svm, INTERCEPT_TASK_SWITCH);
+ set_intercept(svm, INTERCEPT_SHUTDOWN);
+ set_intercept(svm, INTERCEPT_VMRUN);
+ set_intercept(svm, INTERCEPT_VMMCALL);
+ set_intercept(svm, INTERCEPT_VMLOAD);
+ set_intercept(svm, INTERCEPT_VMSAVE);
+ set_intercept(svm, INTERCEPT_STGI);
+ set_intercept(svm, INTERCEPT_CLGI);
+ set_intercept(svm, INTERCEPT_SKINIT);
+ set_intercept(svm, INTERCEPT_WBINVD);
+ set_intercept(svm, INTERCEPT_XSETBV);
+ set_intercept(svm, INTERCEPT_RSM);
+
+ if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
+ set_intercept(svm, INTERCEPT_MONITOR);
+ set_intercept(svm, INTERCEPT_MWAIT);
+ }
+
+ if (!kvm_hlt_in_guest(svm->vcpu.kvm))
+ set_intercept(svm, INTERCEPT_HLT);
+
+ control->iopm_base_pa = __sme_set(iopm_base);
+ control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
+ control->int_ctl = V_INTR_MASKING_MASK;
+
+ init_seg(&save->es);
+ init_seg(&save->ss);
+ init_seg(&save->ds);
+ init_seg(&save->fs);
+ init_seg(&save->gs);
+
+ save->cs.selector = 0xf000;
+ save->cs.base = 0xffff0000;
+ /* Executable/Readable Code Segment */
+ save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
+ SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
+ save->cs.limit = 0xffff;
+
+ save->gdtr.limit = 0xffff;
+ save->idtr.limit = 0xffff;
+
+ init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
+ init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
+
+ svm_set_efer(&svm->vcpu, 0);
+ save->dr6 = 0xffff0ff0;
+ kvm_set_rflags(&svm->vcpu, 2);
+ save->rip = 0x0000fff0;
+ svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
+
+ /*
+ * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
+ * It also updates the guest-visible cr0 value.
+ */
+ svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
+ kvm_mmu_reset_context(&svm->vcpu);
+
+ save->cr4 = X86_CR4_PAE;
+ /* rdx = ?? */
+
+ if (npt_enabled) {
+ /* Setup VMCB for Nested Paging */
+ control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
+ clr_intercept(svm, INTERCEPT_INVLPG);
+ clr_exception_intercept(svm, PF_VECTOR);
+ clr_cr_intercept(svm, INTERCEPT_CR3_READ);
+ clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
+ save->g_pat = svm->vcpu.arch.pat;
+ save->cr3 = 0;
+ save->cr4 = 0;
+ }
+ svm->asid_generation = 0;
+
+ svm->nested.vmcb = 0;
+ svm->vcpu.arch.hflags = 0;
+
+ if (pause_filter_count) {
+ control->pause_filter_count = pause_filter_count;
+ if (pause_filter_thresh)
+ control->pause_filter_thresh = pause_filter_thresh;
+ set_intercept(svm, INTERCEPT_PAUSE);
+ } else {
+ clr_intercept(svm, INTERCEPT_PAUSE);
+ }
+
+ if (kvm_vcpu_apicv_active(&svm->vcpu))
+ avic_init_vmcb(svm);
+
+ /*
+ * If hardware supports Virtual VMLOAD VMSAVE then enable it
+ * in VMCB and clear intercepts to avoid #VMEXIT.
+ */
+ if (vls) {
+ clr_intercept(svm, INTERCEPT_VMLOAD);
+ clr_intercept(svm, INTERCEPT_VMSAVE);
+ svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ }
+
+ if (vgif) {
+ clr_intercept(svm, INTERCEPT_STGI);
+ clr_intercept(svm, INTERCEPT_CLGI);
+ svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
+ }
+
+ if (sev_guest(svm->vcpu.kvm)) {
+ svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
+ clr_exception_intercept(svm, UD_VECTOR);
+ }
+
+ mark_all_dirty(svm->vmcb);
+
+ enable_gif(svm);
+
+}
+
+static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
+ unsigned int index)
+{
+ u64 *avic_physical_id_table;
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+
+ if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
+ return NULL;
+
+ avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
+
+ return &avic_physical_id_table[index];
+}
+
+/**
+ * Note:
+ * AVIC hardware walks the nested page table to check permissions,
+ * but does not use the SPA address specified in the leaf page
+ * table entry since it uses address in the AVIC_BACKING_PAGE pointer
+ * field of the VMCB. Therefore, we set up the
+ * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
+ */
+static int avic_init_access_page(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ int ret = 0;
+
+ mutex_lock(&kvm->slots_lock);
+ if (kvm->arch.apic_access_page_done)
+ goto out;
+
+ ret = __x86_set_memory_region(kvm,
+ APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ APIC_DEFAULT_PHYS_BASE,
+ PAGE_SIZE);
+ if (ret)
+ goto out;
+
+ kvm->arch.apic_access_page_done = true;
+out:
+ mutex_unlock(&kvm->slots_lock);
+ return ret;
+}
+
+static int avic_init_backing_page(struct kvm_vcpu *vcpu)
+{
+ int ret;
+ u64 *entry, new_entry;
+ int id = vcpu->vcpu_id;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ ret = avic_init_access_page(vcpu);
+ if (ret)
+ return ret;
+
+ if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
+ return -EINVAL;
+
+ if (!svm->vcpu.arch.apic->regs)
+ return -EINVAL;
+
+ svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
+
+ /* Setting AVIC backing page address in the phy APIC ID table */
+ entry = avic_get_physical_id_entry(vcpu, id);
+ if (!entry)
+ return -EINVAL;
+
+ new_entry = READ_ONCE(*entry);
+ new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
+ AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
+ AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
+ WRITE_ONCE(*entry, new_entry);
+
+ svm->avic_physical_id_cache = entry;
+
+ return 0;
+}
+
+static void __sev_asid_free(int asid)
+{
+ struct svm_cpu_data *sd;
+ int cpu, pos;
+
+ pos = asid - 1;
+ clear_bit(pos, sev_asid_bitmap);
+
+ for_each_possible_cpu(cpu) {
+ sd = per_cpu(svm_data, cpu);
+ sd->sev_vmcbs[asid] = NULL;
+ }
+}
+
+static void sev_asid_free(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ __sev_asid_free(sev->asid);
+}
+
+static void sev_decommission(unsigned int handle)
+{
+ struct sev_data_decommission *decommission;
+
+ if (!handle)
+ return;
+
+ decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
+ if (!decommission)
+ return;
+
+ decommission->handle = handle;
+ sev_guest_decommission(decommission, NULL);
+
+ kfree(decommission);
+}
+
+static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
+{
+ struct sev_data_deactivate *data;
+
+ if (!handle)
+ return;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return;
+
+ /* deactivate handle */
+ data->handle = handle;
+ sev_guest_deactivate(data, NULL);
+
+ wbinvd_on_all_cpus();
+ sev_guest_df_flush(NULL);
+ kfree(data);
+
+ sev_decommission(handle);
+}
+
+static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
+ unsigned long ulen, unsigned long *n,
+ int write)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ unsigned long npages, npinned, size;
+ unsigned long locked, lock_limit;
+ struct page **pages;
+ unsigned long first, last;
+
+ lockdep_assert_held(&kvm->lock);
+
+ if (ulen == 0 || uaddr + ulen < uaddr)
+ return NULL;
+
+ /* Calculate number of pages. */
+ first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
+ last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
+ npages = (last - first + 1);
+
+ locked = sev->pages_locked + npages;
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+ pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
+ return NULL;
+ }
+
+ /* Avoid using vmalloc for smaller buffers. */
+ size = npages * sizeof(struct page *);
+ if (size > PAGE_SIZE)
+ pages = vmalloc(size);
+ else
+ pages = kmalloc(size, GFP_KERNEL);
+
+ if (!pages)
+ return NULL;
+
+ /* Pin the user virtual address. */
+ npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
+ if (npinned != npages) {
+ pr_err("SEV: Failure locking %lu pages.\n", npages);
+ goto err;
+ }
+
+ *n = npages;
+ sev->pages_locked = locked;
+
+ return pages;
+
+err:
+ if (npinned > 0)
+ release_pages(pages, npinned);
+
+ kvfree(pages);
+ return NULL;
+}
+
+static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
+ unsigned long npages)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ release_pages(pages, npages);
+ kvfree(pages);
+ sev->pages_locked -= npages;
+}
+
+static void sev_clflush_pages(struct page *pages[], unsigned long npages)
+{
+ uint8_t *page_virtual;
+ unsigned long i;
+
+ if (npages == 0 || pages == NULL)
+ return;
+
+ for (i = 0; i < npages; i++) {
+ page_virtual = kmap_atomic(pages[i]);
+ clflush_cache_range(page_virtual, PAGE_SIZE);
+ kunmap_atomic(page_virtual);
+ }
+}
+
+static void __unregister_enc_region_locked(struct kvm *kvm,
+ struct enc_region *region)
+{
+ /*
+ * The guest may change the memory encryption attribute from C=0 -> C=1
+ * or vice versa for this memory range. Lets make sure caches are
+ * flushed to ensure that guest data gets written into memory with
+ * correct C-bit.
+ */
+ sev_clflush_pages(region->pages, region->npages);
+
+ sev_unpin_memory(kvm, region->pages, region->npages);
+ list_del(&region->list);
+ kfree(region);
+}
+
+static struct kvm *svm_vm_alloc(void)
+{
+ struct kvm_svm *kvm_svm = vzalloc(sizeof(struct kvm_svm));
+
+ if (!kvm_svm)
+ return NULL;
+
+ return &kvm_svm->kvm;
+}
+
+static void svm_vm_free(struct kvm *kvm)
+{
+ vfree(to_kvm_svm(kvm));
+}
+
+static void sev_vm_destroy(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct list_head *head = &sev->regions_list;
+ struct list_head *pos, *q;
+
+ if (!sev_guest(kvm))
+ return;
+
+ mutex_lock(&kvm->lock);
+
+ /*
+ * if userspace was terminated before unregistering the memory regions
+ * then lets unpin all the registered memory.
+ */
+ if (!list_empty(head)) {
+ list_for_each_safe(pos, q, head) {
+ __unregister_enc_region_locked(kvm,
+ list_entry(pos, struct enc_region, list));
+ cond_resched();
+ }
+ }
+
+ mutex_unlock(&kvm->lock);
+
+ sev_unbind_asid(kvm, sev->handle);
+ sev_asid_free(kvm);
+}
+
+static void avic_vm_destroy(struct kvm *kvm)
+{
+ unsigned long flags;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+
+ if (!avic)
+ return;
+
+ if (kvm_svm->avic_logical_id_table_page)
+ __free_page(kvm_svm->avic_logical_id_table_page);
+ if (kvm_svm->avic_physical_id_table_page)
+ __free_page(kvm_svm->avic_physical_id_table_page);
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_del(&kvm_svm->hnode);
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+}
+
+static void svm_vm_destroy(struct kvm *kvm)
+{
+ avic_vm_destroy(kvm);
+ sev_vm_destroy(kvm);
+}
+
+static int avic_vm_init(struct kvm *kvm)
+{
+ unsigned long flags;
+ int err = -ENOMEM;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+ struct kvm_svm *k2;
+ struct page *p_page;
+ struct page *l_page;
+ u32 vm_id;
+
+ if (!avic)
+ return 0;
+
+ /* Allocating physical APIC ID table (4KB) */
+ p_page = alloc_page(GFP_KERNEL);
+ if (!p_page)
+ goto free_avic;
+
+ kvm_svm->avic_physical_id_table_page = p_page;
+ clear_page(page_address(p_page));
+
+ /* Allocating logical APIC ID table (4KB) */
+ l_page = alloc_page(GFP_KERNEL);
+ if (!l_page)
+ goto free_avic;
+
+ kvm_svm->avic_logical_id_table_page = l_page;
+ clear_page(page_address(l_page));
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ again:
+ vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
+ if (vm_id == 0) { /* id is 1-based, zero is not okay */
+ next_vm_id_wrapped = 1;
+ goto again;
+ }
+ /* Is it still in use? Only possible if wrapped at least once */
+ if (next_vm_id_wrapped) {
+ hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
+ if (k2->avic_vm_id == vm_id)
+ goto again;
+ }
+ }
+ kvm_svm->avic_vm_id = vm_id;
+ hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+ return 0;
+
+free_avic:
+ avic_vm_destroy(kvm);
+ return err;
+}
+
+static inline int
+avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct amd_svm_iommu_ir *ir;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm))
+ return 0;
+
+ /*
+ * Here, we go through the per-vcpu ir_list to update all existing
+ * interrupt remapping table entry targeting this vcpu.
+ */
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+ if (list_empty(&svm->ir_list))
+ goto out;
+
+ list_for_each_entry(ir, &svm->ir_list, node) {
+ ret = amd_iommu_update_ga(cpu, r, ir->data);
+ if (ret)
+ break;
+ }
+out:
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ return ret;
+}
+
+static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ u64 entry;
+ /* ID = 0xff (broadcast), ID > 0xff (reserved) */
+ int h_physical_id = kvm_cpu_get_apicid(cpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ /*
+ * Since the host physical APIC id is 8 bits,
+ * we can support host APIC ID upto 255.
+ */
+ if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
+ return;
+
+ entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
+
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+ entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
+
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ if (svm->avic_is_running)
+ entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+
+ WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
+ svm->avic_is_running);
+}
+
+static void avic_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ u64 entry;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+ avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+
+ entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+}
+
+/**
+ * This function is called during VCPU halt/unhalt.
+ */
+static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->avic_is_running = is_run;
+ if (is_run)
+ avic_vcpu_load(vcpu, vcpu->cpu);
+ else
+ avic_vcpu_put(vcpu);
+}
+
+static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 dummy;
+ u32 eax = 1;
+
+ vcpu->arch.microcode_version = 0x01000065;
+ svm->spec_ctrl = 0;
+ svm->virt_spec_ctrl = 0;
+
+ if (!init_event) {
+ svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
+ svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ }
+ init_vmcb(svm);
+
+ kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
+
+ if (kvm_vcpu_apicv_active(vcpu) && !init_event)
+ avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
+}
+
+static int avic_init_vcpu(struct vcpu_svm *svm)
+{
+ int ret;
+
+ if (!kvm_vcpu_apicv_active(&svm->vcpu))
+ return 0;
+
+ ret = avic_init_backing_page(&svm->vcpu);
+ if (ret)
+ return ret;
+
+ INIT_LIST_HEAD(&svm->ir_list);
+ spin_lock_init(&svm->ir_list_lock);
+
+ return ret;
+}
+
+static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
+{
+ struct vcpu_svm *svm;
+ struct page *page;
+ struct page *msrpm_pages;
+ struct page *hsave_page;
+ struct page *nested_msrpm_pages;
+ int err;
+
+ svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+ if (!svm) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = kvm_vcpu_init(&svm->vcpu, kvm, id);
+ if (err)
+ goto free_svm;
+
+ err = -ENOMEM;
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ goto uninit;
+
+ msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+ if (!msrpm_pages)
+ goto free_page1;
+
+ nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+ if (!nested_msrpm_pages)
+ goto free_page2;
+
+ hsave_page = alloc_page(GFP_KERNEL);
+ if (!hsave_page)
+ goto free_page3;
+
+ err = avic_init_vcpu(svm);
+ if (err)
+ goto free_page4;
+
+ /* We initialize this flag to true to make sure that the is_running
+ * bit would be set the first time the vcpu is loaded.
+ */
+ svm->avic_is_running = true;
+
+ svm->nested.hsave = page_address(hsave_page);
+
+ svm->msrpm = page_address(msrpm_pages);
+ svm_vcpu_init_msrpm(svm->msrpm);
+
+ svm->nested.msrpm = page_address(nested_msrpm_pages);
+ svm_vcpu_init_msrpm(svm->nested.msrpm);
+
+ svm->vmcb = page_address(page);
+ clear_page(svm->vmcb);
+ svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
+ svm->asid_generation = 0;
+ init_vmcb(svm);
+
+ svm_init_osvw(&svm->vcpu);
+
+ return &svm->vcpu;
+
+free_page4:
+ __free_page(hsave_page);
+free_page3:
+ __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
+free_page2:
+ __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
+free_page1:
+ __free_page(page);
+uninit:
+ kvm_vcpu_uninit(&svm->vcpu);
+free_svm:
+ kmem_cache_free(kvm_vcpu_cache, svm);
+out:
+ return ERR_PTR(err);
+}
+
+static void svm_clear_current_vmcb(struct vmcb *vmcb)
+{
+ int i;
+
+ for_each_online_cpu(i)
+ cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
+}
+
+static void svm_free_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * The vmcb page can be recycled, causing a false negative in
+ * svm_vcpu_load(). So, ensure that no logical CPU has this
+ * vmcb page recorded as its current vmcb.
+ */
+ svm_clear_current_vmcb(svm->vmcb);
+
+ __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
+ __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
+ __free_page(virt_to_page(svm->nested.hsave));
+ __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
+ kvm_vcpu_uninit(vcpu);
+ kmem_cache_free(kvm_vcpu_cache, svm);
+}
+
+static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ int i;
+
+ if (unlikely(cpu != vcpu->cpu)) {
+ svm->asid_generation = 0;
+ mark_all_dirty(svm->vmcb);
+ }
+
+#ifdef CONFIG_X86_64
+ rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
+#endif
+ savesegment(fs, svm->host.fs);
+ savesegment(gs, svm->host.gs);
+ svm->host.ldt = kvm_read_ldt();
+
+ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+ rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+
+ if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
+ if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
+ __this_cpu_write(current_tsc_ratio, tsc_ratio);
+ wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
+ }
+ }
+ /* This assumes that the kernel never uses MSR_TSC_AUX */
+ if (static_cpu_has(X86_FEATURE_RDTSCP))
+ wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+
+ if (sd->current_vmcb != svm->vmcb) {
+ sd->current_vmcb = svm->vmcb;
+ indirect_branch_prediction_barrier();
+ }
+ avic_vcpu_load(vcpu, cpu);
+}
+
+static void svm_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int i;
+
+ avic_vcpu_put(vcpu);
+
+ ++vcpu->stat.host_state_reload;
+ kvm_load_ldt(svm->host.ldt);
+#ifdef CONFIG_X86_64
+ loadsegment(fs, svm->host.fs);
+ wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
+ load_gs_index(svm->host.gs);
+#else
+#ifdef CONFIG_X86_32_LAZY_GS
+ loadsegment(gs, svm->host.gs);
+#endif
+#endif
+ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+ wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+}
+
+static void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
+{
+ avic_set_running(vcpu, false);
+}
+
+static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
+{
+ avic_set_running(vcpu, true);
+}
+
+static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned long rflags = svm->vmcb->save.rflags;
+
+ if (svm->nmi_singlestep) {
+ /* Hide our flags if they were not set by the guest */
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
+ rflags &= ~X86_EFLAGS_TF;
+ if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
+ rflags &= ~X86_EFLAGS_RF;
+ }
+ return rflags;
+}
+
+static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ if (to_svm(vcpu)->nmi_singlestep)
+ rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+
+ /*
+ * Any change of EFLAGS.VM is accompanied by a reload of SS
+ * (caused by either a task switch or an inter-privilege IRET),
+ * so we do not need to update the CPL here.
+ */
+ to_svm(vcpu)->vmcb->save.rflags = rflags;
+}
+
+static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+ switch (reg) {
+ case VCPU_EXREG_PDPTR:
+ BUG_ON(!npt_enabled);
+ load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+ break;
+ default:
+ BUG();
+ }
+}
+
+static void svm_set_vintr(struct vcpu_svm *svm)
+{
+ set_intercept(svm, INTERCEPT_VINTR);
+}
+
+static void svm_clear_vintr(struct vcpu_svm *svm)
+{
+ clr_intercept(svm, INTERCEPT_VINTR);
+}
+
+static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
+{
+ struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+
+ switch (seg) {
+ case VCPU_SREG_CS: return &save->cs;
+ case VCPU_SREG_DS: return &save->ds;
+ case VCPU_SREG_ES: return &save->es;
+ case VCPU_SREG_FS: return &save->fs;
+ case VCPU_SREG_GS: return &save->gs;
+ case VCPU_SREG_SS: return &save->ss;
+ case VCPU_SREG_TR: return &save->tr;
+ case VCPU_SREG_LDTR: return &save->ldtr;
+ }
+ BUG();
+ return NULL;
+}
+
+static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ return s->base;
+}
+
+static void svm_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ var->base = s->base;
+ var->limit = s->limit;
+ var->selector = s->selector;
+ var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
+ var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
+ var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
+ var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
+ var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
+ var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
+ var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
+
+ /*
+ * AMD CPUs circa 2014 track the G bit for all segments except CS.
+ * However, the SVM spec states that the G bit is not observed by the
+ * CPU, and some VMware virtual CPUs drop the G bit for all segments.
+ * So let's synthesize a legal G bit for all segments, this helps
+ * running KVM nested. It also helps cross-vendor migration, because
+ * Intel's vmentry has a check on the 'G' bit.
+ */
+ var->g = s->limit > 0xfffff;
+
+ /*
+ * AMD's VMCB does not have an explicit unusable field, so emulate it
+ * for cross vendor migration purposes by "not present"
+ */
+ var->unusable = !var->present;
+
+ switch (seg) {
+ case VCPU_SREG_TR:
+ /*
+ * Work around a bug where the busy flag in the tr selector
+ * isn't exposed
+ */
+ var->type |= 0x2;
+ break;
+ case VCPU_SREG_DS:
+ case VCPU_SREG_ES:
+ case VCPU_SREG_FS:
+ case VCPU_SREG_GS:
+ /*
+ * The accessed bit must always be set in the segment
+ * descriptor cache, although it can be cleared in the
+ * descriptor, the cached bit always remains at 1. Since
+ * Intel has a check on this, set it here to support
+ * cross-vendor migration.
+ */
+ if (!var->unusable)
+ var->type |= 0x1;
+ break;
+ case VCPU_SREG_SS:
+ /*
+ * On AMD CPUs sometimes the DB bit in the segment
+ * descriptor is left as 1, although the whole segment has
+ * been made unusable. Clear it here to pass an Intel VMX
+ * entry check when cross vendor migrating.
+ */
+ if (var->unusable)
+ var->db = 0;
+ /* This is symmetric with svm_set_segment() */
+ var->dpl = to_svm(vcpu)->vmcb->save.cpl;
+ break;
+ }
+}
+
+static int svm_get_cpl(struct kvm_vcpu *vcpu)
+{
+ struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+
+ return save->cpl;
+}
+
+static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ dt->size = svm->vmcb->save.idtr.limit;
+ dt->address = svm->vmcb->save.idtr.base;
+}
+
+static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.idtr.limit = dt->size;
+ svm->vmcb->save.idtr.base = dt->address ;
+ mark_dirty(svm->vmcb, VMCB_DT);
+}
+
+static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ dt->size = svm->vmcb->save.gdtr.limit;
+ dt->address = svm->vmcb->save.gdtr.base;
+}
+
+static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.gdtr.limit = dt->size;
+ svm->vmcb->save.gdtr.base = dt->address ;
+ mark_dirty(svm->vmcb, VMCB_DT);
+}
+
+static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
+{
+}
+
+static void svm_decache_cr3(struct kvm_vcpu *vcpu)
+{
+}
+
+static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
+{
+}
+
+static void update_cr0_intercept(struct vcpu_svm *svm)
+{
+ ulong gcr0 = svm->vcpu.arch.cr0;
+ u64 *hcr0 = &svm->vmcb->save.cr0;
+
+ *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
+ | (gcr0 & SVM_CR0_SELECTIVE_MASK);
+
+ mark_dirty(svm->vmcb, VMCB_CR);
+
+ if (gcr0 == *hcr0) {
+ clr_cr_intercept(svm, INTERCEPT_CR0_READ);
+ clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+ } else {
+ set_cr_intercept(svm, INTERCEPT_CR0_READ);
+ set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+ }
+}
+
+static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+#ifdef CONFIG_X86_64
+ if (vcpu->arch.efer & EFER_LME) {
+ if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
+ vcpu->arch.efer |= EFER_LMA;
+ svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
+ }
+
+ if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
+ vcpu->arch.efer &= ~EFER_LMA;
+ svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
+ }
+ }
+#endif
+ vcpu->arch.cr0 = cr0;
+
+ if (!npt_enabled)
+ cr0 |= X86_CR0_PG | X86_CR0_WP;
+
+ /*
+ * re-enable caching here because the QEMU bios
+ * does not do it - this results in some delay at
+ * reboot
+ */
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
+ cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+ svm->vmcb->save.cr0 = cr0;
+ mark_dirty(svm->vmcb, VMCB_CR);
+ update_cr0_intercept(svm);
+}
+
+static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
+ unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
+
+ if (cr4 & X86_CR4_VMXE)
+ return 1;
+
+ if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
+ svm_flush_tlb(vcpu, true);
+
+ vcpu->arch.cr4 = cr4;
+ if (!npt_enabled)
+ cr4 |= X86_CR4_PAE;
+ cr4 |= host_cr4_mce;
+ to_svm(vcpu)->vmcb->save.cr4 = cr4;
+ mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
+ return 0;
+}
+
+static void svm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_seg *s = svm_seg(vcpu, seg);
+
+ s->base = var->base;
+ s->limit = var->limit;
+ s->selector = var->selector;
+ s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
+ s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
+ s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
+ s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
+ s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
+ s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
+ s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
+ s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
+
+ /*
+ * This is always accurate, except if SYSRET returned to a segment
+ * with SS.DPL != 3. Intel does not have this quirk, and always
+ * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
+ * would entail passing the CPL to userspace and back.
+ */
+ if (seg == VCPU_SREG_SS)
+ /* This is symmetric with svm_get_segment() */
+ svm->vmcb->save.cpl = (var->dpl & 3);
+
+ mark_dirty(svm->vmcb, VMCB_SEG);
+}
+
+static void update_bp_intercept(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ clr_exception_intercept(svm, BP_VECTOR);
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ set_exception_intercept(svm, BP_VECTOR);
+ } else
+ vcpu->guest_debug = 0;
+}
+
+static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
+{
+ if (sd->next_asid > sd->max_asid) {
+ ++sd->asid_generation;
+ sd->next_asid = sd->min_asid;
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
+ }
+
+ svm->asid_generation = sd->asid_generation;
+ svm->vmcb->control.asid = sd->next_asid++;
+
+ mark_dirty(svm->vmcb, VMCB_ASID);
+}
+
+static u64 svm_get_dr6(struct kvm_vcpu *vcpu)
+{
+ return to_svm(vcpu)->vmcb->save.dr6;
+}
+
+static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.dr6 = value;
+ mark_dirty(svm->vmcb, VMCB_DR);
+}
+
+static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ get_debugreg(vcpu->arch.db[0], 0);
+ get_debugreg(vcpu->arch.db[1], 1);
+ get_debugreg(vcpu->arch.db[2], 2);
+ get_debugreg(vcpu->arch.db[3], 3);
+ vcpu->arch.dr6 = svm_get_dr6(vcpu);
+ vcpu->arch.dr7 = svm->vmcb->save.dr7;
+
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
+ set_dr_intercepts(svm);
+}
+
+static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.dr7 = value;
+ mark_dirty(svm->vmcb, VMCB_DR);
+}
+
+static int pf_interception(struct vcpu_svm *svm)
+{
+ u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
+ u64 error_code = svm->vmcb->control.exit_info_1;
+
+ return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
+ static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+ svm->vmcb->control.insn_bytes : NULL,
+ svm->vmcb->control.insn_len);
+}
+
+static int npf_interception(struct vcpu_svm *svm)
+{
+ u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
+ u64 error_code = svm->vmcb->control.exit_info_1;
+
+ trace_kvm_page_fault(fault_address, error_code);
+ return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+ static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+ svm->vmcb->control.insn_bytes : NULL,
+ svm->vmcb->control.insn_len);
+}
+
+static int db_interception(struct vcpu_svm *svm)
+{
+ struct kvm_run *kvm_run = svm->vcpu.run;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ if (!(svm->vcpu.guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
+ !svm->nmi_singlestep) {
+ kvm_queue_exception(&svm->vcpu, DB_VECTOR);
+ return 1;
+ }
+
+ if (svm->nmi_singlestep) {
+ disable_nmi_singlestep(svm);
+ /* Make sure we check for pending NMIs upon entry */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+
+ if (svm->vcpu.guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.pc =
+ svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int bp_interception(struct vcpu_svm *svm)
+{
+ struct kvm_run *kvm_run = svm->vcpu.run;
+
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+ kvm_run->debug.arch.exception = BP_VECTOR;
+ return 0;
+}
+
+static int ud_interception(struct vcpu_svm *svm)
+{
+ return handle_ud(&svm->vcpu);
+}
+
+static int ac_interception(struct vcpu_svm *svm)
+{
+ kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
+ return 1;
+}
+
+static int gp_interception(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u32 error_code = svm->vmcb->control.exit_info_1;
+ int er;
+
+ WARN_ON_ONCE(!enable_vmware_backdoor);
+
+ er = kvm_emulate_instruction(vcpu,
+ EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
+ if (er == EMULATE_USER_EXIT)
+ return 0;
+ else if (er != EMULATE_DONE)
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+ return 1;
+}
+
+static bool is_erratum_383(void)
+{
+ int err, i;
+ u64 value;
+
+ if (!erratum_383_found)
+ return false;
+
+ value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
+ if (err)
+ return false;
+
+ /* Bit 62 may or may not be set for this mce */
+ value &= ~(1ULL << 62);
+
+ if (value != 0xb600000000010015ULL)
+ return false;
+
+ /* Clear MCi_STATUS registers */
+ for (i = 0; i < 6; ++i)
+ native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
+
+ value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
+ if (!err) {
+ u32 low, high;
+
+ value &= ~(1ULL << 2);
+ low = lower_32_bits(value);
+ high = upper_32_bits(value);
+
+ native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
+ }
+
+ /* Flush tlb to evict multi-match entries */
+ __flush_tlb_all();
+
+ return true;
+}
+
+static void svm_handle_mce(struct vcpu_svm *svm)
+{
+ if (is_erratum_383()) {
+ /*
+ * Erratum 383 triggered. Guest state is corrupt so kill the
+ * guest.
+ */
+ pr_err("KVM: Guest triggered AMD Erratum 383\n");
+
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
+
+ return;
+ }
+
+ /*
+ * On an #MC intercept the MCE handler is not called automatically in
+ * the host. So do it by hand here.
+ */
+ asm volatile (
+ "int $0x12\n");
+ /* not sure if we ever come back to this point */
+
+ return;
+}
+
+static int mc_interception(struct vcpu_svm *svm)
+{
+ return 1;
+}
+
+static int shutdown_interception(struct vcpu_svm *svm)
+{
+ struct kvm_run *kvm_run = svm->vcpu.run;
+
+ /*
+ * VMCB is undefined after a SHUTDOWN intercept
+ * so reinitialize it.
+ */
+ clear_page(svm->vmcb);
+ init_vmcb(svm);
+
+ kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+ return 0;
+}
+
+static int io_interception(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
+ int size, in, string;
+ unsigned port;
+
+ ++svm->vcpu.stat.io_exits;
+ string = (io_info & SVM_IOIO_STR_MASK) != 0;
+ in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
+ if (string)
+ return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+
+ port = io_info >> 16;
+ size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
+ svm->next_rip = svm->vmcb->control.exit_info_2;
+
+ return kvm_fast_pio(&svm->vcpu, size, port, in);
+}
+
+static int nmi_interception(struct vcpu_svm *svm)
+{
+ return 1;
+}
+
+static int intr_interception(struct vcpu_svm *svm)
+{
+ ++svm->vcpu.stat.irq_exits;
+ return 1;
+}
+
+static int nop_on_interception(struct vcpu_svm *svm)
+{
+ return 1;
+}
+
+static int halt_interception(struct vcpu_svm *svm)
+{
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
+ return kvm_emulate_halt(&svm->vcpu);
+}
+
+static int vmmcall_interception(struct vcpu_svm *svm)
+{
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ return kvm_emulate_hypercall(&svm->vcpu);
+}
+
+static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->nested.nested_cr3;
+}
+
+static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 cr3 = svm->nested.nested_cr3;
+ u64 pdpte;
+ int ret;
+
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
+ offset_in_page(cr3) + index * 8, 8);
+ if (ret)
+ return 0;
+ return pdpte;
+}
+
+static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
+ unsigned long root)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.nested_cr3 = __sme_set(root);
+ mark_dirty(svm->vmcb, VMCB_NPT);
+}
+
+static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
+ /*
+ * TODO: track the cause of the nested page fault, and
+ * correctly fill in the high bits of exit_info_1.
+ */
+ svm->vmcb->control.exit_code = SVM_EXIT_NPF;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = (1ULL << 32);
+ svm->vmcb->control.exit_info_2 = fault->address;
+ }
+
+ svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
+ svm->vmcb->control.exit_info_1 |= fault->error_code;
+
+ /*
+ * The present bit is always zero for page structure faults on real
+ * hardware.
+ */
+ if (svm->vmcb->control.exit_info_1 & (2ULL << 32))
+ svm->vmcb->control.exit_info_1 &= ~1;
+
+ nested_svm_vmexit(svm);
+}
+
+static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+ WARN_ON(mmu_is_nested(vcpu));
+ kvm_init_shadow_mmu(vcpu);
+ vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
+ vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
+ vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
+ vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
+ vcpu->arch.mmu.shadow_root_level = get_npt_level(vcpu);
+ reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu);
+ vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
+}
+
+static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+}
+
+static int nested_svm_check_permissions(struct vcpu_svm *svm)
+{
+ if (!(svm->vcpu.arch.efer & EFER_SVME) ||
+ !is_paging(&svm->vcpu)) {
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (svm->vmcb->save.cpl) {
+ kvm_inject_gp(&svm->vcpu, 0);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
+ bool has_error_code, u32 error_code)
+{
+ int vmexit;
+
+ if (!is_guest_mode(&svm->vcpu))
+ return 0;
+
+ vmexit = nested_svm_intercept(svm);
+ if (vmexit != NESTED_EXIT_DONE)
+ return 0;
+
+ svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = error_code;
+
+ /*
+ * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception.
+ * The fix is to add the ancillary datum (CR2 or DR6) to structs
+ * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be
+ * written only when inject_pending_event runs (DR6 would written here
+ * too). This should be conditional on a new capability---if the
+ * capability is disabled, kvm_multiple_exception would write the
+ * ancillary information to CR2 or DR6, for backwards ABI-compatibility.
+ */
+ if (svm->vcpu.arch.exception.nested_apf)
+ svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
+ else
+ svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
+
+ svm->nested.exit_required = true;
+ return vmexit;
+}
+
+/* This function returns true if it is save to enable the irq window */
+static inline bool nested_svm_intr(struct vcpu_svm *svm)
+{
+ if (!is_guest_mode(&svm->vcpu))
+ return true;
+
+ if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
+ return true;
+
+ if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
+ return false;
+
+ /*
+ * if vmexit was already requested (by intercepted exception
+ * for instance) do not overwrite it with "external interrupt"
+ * vmexit.
+ */
+ if (svm->nested.exit_required)
+ return false;
+
+ svm->vmcb->control.exit_code = SVM_EXIT_INTR;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+
+ if (svm->nested.intercept & 1ULL) {
+ /*
+ * The #vmexit can't be emulated here directly because this
+ * code path runs with irqs and preemption disabled. A
+ * #vmexit emulation might sleep. Only signal request for
+ * the #vmexit here.
+ */
+ svm->nested.exit_required = true;
+ trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
+ return false;
+ }
+
+ return true;
+}
+
+/* This function returns true if it is save to enable the nmi window */
+static inline bool nested_svm_nmi(struct vcpu_svm *svm)
+{
+ if (!is_guest_mode(&svm->vcpu))
+ return true;
+
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
+ return true;
+
+ svm->vmcb->control.exit_code = SVM_EXIT_NMI;
+ svm->nested.exit_required = true;
+
+ return false;
+}
+
+static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
+{
+ struct page *page;
+
+ might_sleep();
+
+ page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
+ if (is_error_page(page))
+ goto error;
+
+ *_page = page;
+
+ return kmap(page);
+
+error:
+ kvm_inject_gp(&svm->vcpu, 0);
+
+ return NULL;
+}
+
+static void nested_svm_unmap(struct page *page)
+{
+ kunmap(page);
+ kvm_release_page_dirty(page);
+}
+
+static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
+{
+ unsigned port, size, iopm_len;
+ u16 val, mask;
+ u8 start_bit;
+ u64 gpa;
+
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
+ return NESTED_EXIT_HOST;
+
+ port = svm->vmcb->control.exit_info_1 >> 16;
+ size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
+ SVM_IOIO_SIZE_SHIFT;
+ gpa = svm->nested.vmcb_iopm + (port / 8);
+ start_bit = port % 8;
+ iopm_len = (start_bit + size > 8) ? 2 : 1;
+ mask = (0xf >> (4 - size)) << start_bit;
+ val = 0;
+
+ if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
+ return NESTED_EXIT_DONE;
+
+ return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
+static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
+{
+ u32 offset, msr, value;
+ int write, mask;
+
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+ return NESTED_EXIT_HOST;
+
+ msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+ offset = svm_msrpm_offset(msr);
+ write = svm->vmcb->control.exit_info_1 & 1;
+ mask = 1 << ((2 * (msr & 0xf)) + write);
+
+ if (offset == MSR_INVALID)
+ return NESTED_EXIT_DONE;
+
+ /* Offset is in 32 bit units but need in 8 bit units */
+ offset *= 4;
+
+ if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
+ return NESTED_EXIT_DONE;
+
+ return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
+/* DB exceptions for our internal use must not cause vmexit */
+static int nested_svm_intercept_db(struct vcpu_svm *svm)
+{
+ unsigned long dr6;
+
+ /* if we're not singlestepping, it's not ours */
+ if (!svm->nmi_singlestep)
+ return NESTED_EXIT_DONE;
+
+ /* if it's not a singlestep exception, it's not ours */
+ if (kvm_get_dr(&svm->vcpu, 6, &dr6))
+ return NESTED_EXIT_DONE;
+ if (!(dr6 & DR6_BS))
+ return NESTED_EXIT_DONE;
+
+ /* if the guest is singlestepping, it should get the vmexit */
+ if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) {
+ disable_nmi_singlestep(svm);
+ return NESTED_EXIT_DONE;
+ }
+
+ /* it's ours, the nested hypervisor must not see this one */
+ return NESTED_EXIT_HOST;
+}
+
+static int nested_svm_exit_special(struct vcpu_svm *svm)
+{
+ u32 exit_code = svm->vmcb->control.exit_code;
+
+ switch (exit_code) {
+ case SVM_EXIT_INTR:
+ case SVM_EXIT_NMI:
+ case SVM_EXIT_EXCP_BASE + MC_VECTOR:
+ return NESTED_EXIT_HOST;
+ case SVM_EXIT_NPF:
+ /* For now we are always handling NPFs when using them */
+ if (npt_enabled)
+ return NESTED_EXIT_HOST;
+ break;
+ case SVM_EXIT_EXCP_BASE + PF_VECTOR:
+ /* Trap async PF even if not shadowing */
+ if (!npt_enabled || svm->vcpu.arch.apf.host_apf_reason)
+ return NESTED_EXIT_HOST;
+ break;
+ default:
+ break;
+ }
+
+ return NESTED_EXIT_CONTINUE;
+}
+
+/*
+ * If this function returns true, this #vmexit was already handled
+ */
+static int nested_svm_intercept(struct vcpu_svm *svm)
+{
+ u32 exit_code = svm->vmcb->control.exit_code;
+ int vmexit = NESTED_EXIT_HOST;
+
+ switch (exit_code) {
+ case SVM_EXIT_MSR:
+ vmexit = nested_svm_exit_handled_msr(svm);
+ break;
+ case SVM_EXIT_IOIO:
+ vmexit = nested_svm_intercept_ioio(svm);
+ break;
+ case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
+ u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
+ if (svm->nested.intercept_cr & bit)
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
+ u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
+ if (svm->nested.intercept_dr & bit)
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
+ u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
+ if (svm->nested.intercept_exceptions & excp_bits) {
+ if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR)
+ vmexit = nested_svm_intercept_db(svm);
+ else
+ vmexit = NESTED_EXIT_DONE;
+ }
+ /* async page fault always cause vmexit */
+ else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
+ svm->vcpu.arch.exception.nested_apf != 0)
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ case SVM_EXIT_ERR: {
+ vmexit = NESTED_EXIT_DONE;
+ break;
+ }
+ default: {
+ u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
+ if (svm->nested.intercept & exit_bits)
+ vmexit = NESTED_EXIT_DONE;
+ }
+ }
+
+ return vmexit;
+}
+
+static int nested_svm_exit_handled(struct vcpu_svm *svm)
+{
+ int vmexit;
+
+ vmexit = nested_svm_intercept(svm);
+
+ if (vmexit == NESTED_EXIT_DONE)
+ nested_svm_vmexit(svm);
+
+ return vmexit;
+}
+
+static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
+{
+ struct vmcb_control_area *dst = &dst_vmcb->control;
+ struct vmcb_control_area *from = &from_vmcb->control;
+
+ dst->intercept_cr = from->intercept_cr;
+ dst->intercept_dr = from->intercept_dr;
+ dst->intercept_exceptions = from->intercept_exceptions;
+ dst->intercept = from->intercept;
+ dst->iopm_base_pa = from->iopm_base_pa;
+ dst->msrpm_base_pa = from->msrpm_base_pa;
+ dst->tsc_offset = from->tsc_offset;
+ /* asid not copied, it is handled manually for svm->vmcb. */
+ dst->tlb_ctl = from->tlb_ctl;
+ dst->int_ctl = from->int_ctl;
+ dst->int_vector = from->int_vector;
+ dst->int_state = from->int_state;
+ dst->exit_code = from->exit_code;
+ dst->exit_code_hi = from->exit_code_hi;
+ dst->exit_info_1 = from->exit_info_1;
+ dst->exit_info_2 = from->exit_info_2;
+ dst->exit_int_info = from->exit_int_info;
+ dst->exit_int_info_err = from->exit_int_info_err;
+ dst->nested_ctl = from->nested_ctl;
+ dst->event_inj = from->event_inj;
+ dst->event_inj_err = from->event_inj_err;
+ dst->nested_cr3 = from->nested_cr3;
+ dst->virt_ext = from->virt_ext;
+}
+
+static int nested_svm_vmexit(struct vcpu_svm *svm)
+{
+ struct vmcb *nested_vmcb;
+ struct vmcb *hsave = svm->nested.hsave;
+ struct vmcb *vmcb = svm->vmcb;
+ struct page *page;
+
+ trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
+ vmcb->control.exit_info_1,
+ vmcb->control.exit_info_2,
+ vmcb->control.exit_int_info,
+ vmcb->control.exit_int_info_err,
+ KVM_ISA_SVM);
+
+ nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
+ if (!nested_vmcb)
+ return 1;
+
+ /* Exit Guest-Mode */
+ leave_guest_mode(&svm->vcpu);
+ svm->nested.vmcb = 0;
+
+ /* Give the current vmcb to the guest */
+ disable_gif(svm);
+
+ nested_vmcb->save.es = vmcb->save.es;
+ nested_vmcb->save.cs = vmcb->save.cs;
+ nested_vmcb->save.ss = vmcb->save.ss;
+ nested_vmcb->save.ds = vmcb->save.ds;
+ nested_vmcb->save.gdtr = vmcb->save.gdtr;
+ nested_vmcb->save.idtr = vmcb->save.idtr;
+ nested_vmcb->save.efer = svm->vcpu.arch.efer;
+ nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
+ nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
+ nested_vmcb->save.cr2 = vmcb->save.cr2;
+ nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
+ nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
+ nested_vmcb->save.rip = vmcb->save.rip;
+ nested_vmcb->save.rsp = vmcb->save.rsp;
+ nested_vmcb->save.rax = vmcb->save.rax;
+ nested_vmcb->save.dr7 = vmcb->save.dr7;
+ nested_vmcb->save.dr6 = vmcb->save.dr6;
+ nested_vmcb->save.cpl = vmcb->save.cpl;
+
+ nested_vmcb->control.int_ctl = vmcb->control.int_ctl;
+ nested_vmcb->control.int_vector = vmcb->control.int_vector;
+ nested_vmcb->control.int_state = vmcb->control.int_state;
+ nested_vmcb->control.exit_code = vmcb->control.exit_code;
+ nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi;
+ nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1;
+ nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
+ nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
+ nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
+
+ if (svm->nrips_enabled)
+ nested_vmcb->control.next_rip = vmcb->control.next_rip;
+
+ /*
+ * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
+ * to make sure that we do not lose injected events. So check event_inj
+ * here and copy it to exit_int_info if it is valid.
+ * Exit_int_info and event_inj can't be both valid because the case
+ * below only happens on a VMRUN instruction intercept which has
+ * no valid exit_int_info set.
+ */
+ if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
+ struct vmcb_control_area *nc = &nested_vmcb->control;
+
+ nc->exit_int_info = vmcb->control.event_inj;
+ nc->exit_int_info_err = vmcb->control.event_inj_err;
+ }
+
+ nested_vmcb->control.tlb_ctl = 0;
+ nested_vmcb->control.event_inj = 0;
+ nested_vmcb->control.event_inj_err = 0;
+
+ /* We always set V_INTR_MASKING and remember the old value in hflags */
+ if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
+ nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
+
+ /* Restore the original control entries */
+ copy_vmcb_control_area(vmcb, hsave);
+
+ svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset;
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
+ svm->nested.nested_cr3 = 0;
+
+ /* Restore selected save entries */
+ svm->vmcb->save.es = hsave->save.es;
+ svm->vmcb->save.cs = hsave->save.cs;
+ svm->vmcb->save.ss = hsave->save.ss;
+ svm->vmcb->save.ds = hsave->save.ds;
+ svm->vmcb->save.gdtr = hsave->save.gdtr;
+ svm->vmcb->save.idtr = hsave->save.idtr;
+ kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
+ svm_set_efer(&svm->vcpu, hsave->save.efer);
+ svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
+ svm_set_cr4(&svm->vcpu, hsave->save.cr4);
+ if (npt_enabled) {
+ svm->vmcb->save.cr3 = hsave->save.cr3;
+ svm->vcpu.arch.cr3 = hsave->save.cr3;
+ } else {
+ (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
+ }
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
+ svm->vmcb->save.dr7 = 0;
+ svm->vmcb->save.cpl = 0;
+ svm->vmcb->control.exit_int_info = 0;
+
+ mark_all_dirty(svm->vmcb);
+
+ nested_svm_unmap(page);
+
+ nested_svm_uninit_mmu_context(&svm->vcpu);
+ kvm_mmu_reset_context(&svm->vcpu);
+ kvm_mmu_load(&svm->vcpu);
+
+ /*
+ * Drop what we picked up for L2 via svm_complete_interrupts() so it
+ * doesn't end up in L1.
+ */
+ svm->vcpu.arch.nmi_injected = false;
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
+ return 0;
+}
+
+static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
+{
+ /*
+ * This function merges the msr permission bitmaps of kvm and the
+ * nested vmcb. It is optimized in that it only merges the parts where
+ * the kvm msr permission bitmap may contain zero bits
+ */
+ int i;
+
+ if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+ return true;
+
+ for (i = 0; i < MSRPM_OFFSETS; i++) {
+ u32 value, p;
+ u64 offset;
+
+ if (msrpm_offsets[i] == 0xffffffff)
+ break;
+
+ p = msrpm_offsets[i];
+ offset = svm->nested.vmcb_msrpm + (p * 4);
+
+ if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
+ return false;
+
+ svm->nested.msrpm[p] = svm->msrpm[p] | value;
+ }
+
+ svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
+
+ return true;
+}
+
+static bool nested_vmcb_checks(struct vmcb *vmcb)
+{
+ if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
+ return false;
+
+ if (vmcb->control.asid == 0)
+ return false;
+
+ if ((vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
+ !npt_enabled)
+ return false;
+
+ return true;
+}
+
+static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
+ struct vmcb *nested_vmcb, struct page *page)
+{
+ if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
+ svm->vcpu.arch.hflags |= HF_HIF_MASK;
+ else
+ svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
+
+ if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) {
+ kvm_mmu_unload(&svm->vcpu);
+ svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
+ nested_svm_init_mmu_context(&svm->vcpu);
+ }
+
+ /* Load the nested guest state */
+ svm->vmcb->save.es = nested_vmcb->save.es;
+ svm->vmcb->save.cs = nested_vmcb->save.cs;
+ svm->vmcb->save.ss = nested_vmcb->save.ss;
+ svm->vmcb->save.ds = nested_vmcb->save.ds;
+ svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
+ svm->vmcb->save.idtr = nested_vmcb->save.idtr;
+ kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
+ svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
+ svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
+ svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
+ if (npt_enabled) {
+ svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
+ svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
+ } else
+ (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
+
+ /* Guest paging mode is active - reset mmu */
+ kvm_mmu_reset_context(&svm->vcpu);
+
+ svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
+
+ /* In case we don't even reach vcpu_run, the fields are not updated */
+ svm->vmcb->save.rax = nested_vmcb->save.rax;
+ svm->vmcb->save.rsp = nested_vmcb->save.rsp;
+ svm->vmcb->save.rip = nested_vmcb->save.rip;
+ svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
+ svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
+ svm->vmcb->save.cpl = nested_vmcb->save.cpl;
+
+ svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
+ svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
+
+ /* cache intercepts */
+ svm->nested.intercept_cr = nested_vmcb->control.intercept_cr;
+ svm->nested.intercept_dr = nested_vmcb->control.intercept_dr;
+ svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
+ svm->nested.intercept = nested_vmcb->control.intercept;
+
+ svm_flush_tlb(&svm->vcpu, true);
+
+ svm->vmcb->control.int_ctl &=
+ V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;
+
+ svm->vmcb->control.int_ctl |= nested_vmcb->control.int_ctl &
+ (V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK);
+
+ if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
+ svm->vcpu.arch.hflags |= HF_VINTR_MASK;
+ else
+ svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
+
+ if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
+ /* We only want the cr8 intercept bits of the guest */
+ clr_cr_intercept(svm, INTERCEPT_CR8_READ);
+ clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+ }
+
+ /* We don't want to see VMMCALLs from a nested guest */
+ clr_intercept(svm, INTERCEPT_VMMCALL);
+
+ svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset;
+ svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+
+ svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext;
+ svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
+ svm->vmcb->control.int_state = nested_vmcb->control.int_state;
+ svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
+ svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
+
+ nested_svm_unmap(page);
+
+ /* Enter Guest-Mode */
+ enter_guest_mode(&svm->vcpu);
+
+ /*
+ * Merge guest and host intercepts - must be called with vcpu in
+ * guest-mode to take affect here
+ */
+ recalc_intercepts(svm);
+
+ svm->nested.vmcb = vmcb_gpa;
+
+ enable_gif(svm);
+
+ mark_all_dirty(svm->vmcb);
+}
+
+static bool nested_svm_vmrun(struct vcpu_svm *svm)
+{
+ struct vmcb *nested_vmcb;
+ struct vmcb *hsave = svm->nested.hsave;
+ struct vmcb *vmcb = svm->vmcb;
+ struct page *page;
+ u64 vmcb_gpa;
+
+ vmcb_gpa = svm->vmcb->save.rax;
+
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
+ if (!nested_vmcb)
+ return false;
+
+ if (!nested_vmcb_checks(nested_vmcb)) {
+ nested_vmcb->control.exit_code = SVM_EXIT_ERR;
+ nested_vmcb->control.exit_code_hi = 0;
+ nested_vmcb->control.exit_info_1 = 0;
+ nested_vmcb->control.exit_info_2 = 0;
+
+ nested_svm_unmap(page);
+
+ return false;
+ }
+
+ trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
+ nested_vmcb->save.rip,
+ nested_vmcb->control.int_ctl,
+ nested_vmcb->control.event_inj,
+ nested_vmcb->control.nested_ctl);
+
+ trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
+ nested_vmcb->control.intercept_cr >> 16,
+ nested_vmcb->control.intercept_exceptions,
+ nested_vmcb->control.intercept);
+
+ /* Clear internal status */
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
+ /*
+ * Save the old vmcb, so we don't need to pick what we save, but can
+ * restore everything when a VMEXIT occurs
+ */
+ hsave->save.es = vmcb->save.es;
+ hsave->save.cs = vmcb->save.cs;
+ hsave->save.ss = vmcb->save.ss;
+ hsave->save.ds = vmcb->save.ds;
+ hsave->save.gdtr = vmcb->save.gdtr;
+ hsave->save.idtr = vmcb->save.idtr;
+ hsave->save.efer = svm->vcpu.arch.efer;
+ hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
+ hsave->save.cr4 = svm->vcpu.arch.cr4;
+ hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
+ hsave->save.rip = kvm_rip_read(&svm->vcpu);
+ hsave->save.rsp = vmcb->save.rsp;
+ hsave->save.rax = vmcb->save.rax;
+ if (npt_enabled)
+ hsave->save.cr3 = vmcb->save.cr3;
+ else
+ hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
+
+ copy_vmcb_control_area(hsave, vmcb);
+
+ enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page);
+
+ return true;
+}
+
+static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+{
+ to_vmcb->save.fs = from_vmcb->save.fs;
+ to_vmcb->save.gs = from_vmcb->save.gs;
+ to_vmcb->save.tr = from_vmcb->save.tr;
+ to_vmcb->save.ldtr = from_vmcb->save.ldtr;
+ to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
+ to_vmcb->save.star = from_vmcb->save.star;
+ to_vmcb->save.lstar = from_vmcb->save.lstar;
+ to_vmcb->save.cstar = from_vmcb->save.cstar;
+ to_vmcb->save.sfmask = from_vmcb->save.sfmask;
+ to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
+ to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
+ to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
+}
+
+static int vmload_interception(struct vcpu_svm *svm)
+{
+ struct vmcb *nested_vmcb;
+ struct page *page;
+ int ret;
+
+ if (nested_svm_check_permissions(svm))
+ return 1;
+
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
+ if (!nested_vmcb)
+ return 1;
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ ret = kvm_skip_emulated_instruction(&svm->vcpu);
+
+ nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
+ nested_svm_unmap(page);
+
+ return ret;
+}
+
+static int vmsave_interception(struct vcpu_svm *svm)
+{
+ struct vmcb *nested_vmcb;
+ struct page *page;
+ int ret;
+
+ if (nested_svm_check_permissions(svm))
+ return 1;
+
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
+ if (!nested_vmcb)
+ return 1;
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ ret = kvm_skip_emulated_instruction(&svm->vcpu);
+
+ nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
+ nested_svm_unmap(page);
+
+ return ret;
+}
+
+static int vmrun_interception(struct vcpu_svm *svm)
+{
+ if (nested_svm_check_permissions(svm))
+ return 1;
+
+ /* Save rip after vmrun instruction */
+ kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
+
+ if (!nested_svm_vmrun(svm))
+ return 1;
+
+ if (!nested_svm_vmrun_msrpm(svm))
+ goto failed;
+
+ return 1;
+
+failed:
+
+ svm->vmcb->control.exit_code = SVM_EXIT_ERR;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+
+ nested_svm_vmexit(svm);
+
+ return 1;
+}
+
+static int stgi_interception(struct vcpu_svm *svm)
+{
+ int ret;
+
+ if (nested_svm_check_permissions(svm))
+ return 1;
+
+ /*
+ * If VGIF is enabled, the STGI intercept is only added to
+ * detect the opening of the SMI/NMI window; remove it now.
+ */
+ if (vgif_enabled(svm))
+ clr_intercept(svm, INTERCEPT_STGI);
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ ret = kvm_skip_emulated_instruction(&svm->vcpu);
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+
+ enable_gif(svm);
+
+ return ret;
+}
+
+static int clgi_interception(struct vcpu_svm *svm)
+{
+ int ret;
+
+ if (nested_svm_check_permissions(svm))
+ return 1;
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ ret = kvm_skip_emulated_instruction(&svm->vcpu);
+
+ disable_gif(svm);
+
+ /* After a CLGI no interrupts should come */
+ if (!kvm_vcpu_apicv_active(&svm->vcpu)) {
+ svm_clear_vintr(svm);
+ svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+ mark_dirty(svm->vmcb, VMCB_INTR);
+ }
+
+ return ret;
+}
+
+static int invlpga_interception(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+
+ trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX),
+ kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
+
+ /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
+ kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ return kvm_skip_emulated_instruction(&svm->vcpu);
+}
+
+static int skinit_interception(struct vcpu_svm *svm)
+{
+ trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
+
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+}
+
+static int wbinvd_interception(struct vcpu_svm *svm)
+{
+ return kvm_emulate_wbinvd(&svm->vcpu);
+}
+
+static int xsetbv_interception(struct vcpu_svm *svm)
+{
+ u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
+ u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
+
+ if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ return kvm_skip_emulated_instruction(&svm->vcpu);
+ }
+
+ return 1;
+}
+
+static int task_switch_interception(struct vcpu_svm *svm)
+{
+ u16 tss_selector;
+ int reason;
+ int int_type = svm->vmcb->control.exit_int_info &
+ SVM_EXITINTINFO_TYPE_MASK;
+ int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
+ uint32_t type =
+ svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
+ uint32_t idt_v =
+ svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
+ bool has_error_code = false;
+ u32 error_code = 0;
+
+ tss_selector = (u16)svm->vmcb->control.exit_info_1;
+
+ if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
+ reason = TASK_SWITCH_IRET;
+ else if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
+ reason = TASK_SWITCH_JMP;
+ else if (idt_v)
+ reason = TASK_SWITCH_GATE;
+ else
+ reason = TASK_SWITCH_CALL;
+
+ if (reason == TASK_SWITCH_GATE) {
+ switch (type) {
+ case SVM_EXITINTINFO_TYPE_NMI:
+ svm->vcpu.arch.nmi_injected = false;
+ break;
+ case SVM_EXITINTINFO_TYPE_EXEPT:
+ if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
+ has_error_code = true;
+ error_code =
+ (u32)svm->vmcb->control.exit_info_2;
+ }
+ kvm_clear_exception_queue(&svm->vcpu);
+ break;
+ case SVM_EXITINTINFO_TYPE_INTR:
+ kvm_clear_interrupt_queue(&svm->vcpu);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (reason != TASK_SWITCH_GATE ||
+ int_type == SVM_EXITINTINFO_TYPE_SOFT ||
+ (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
+ (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
+ skip_emulated_instruction(&svm->vcpu);
+
+ if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
+ int_vec = -1;
+
+ if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
+ has_error_code, error_code) == EMULATE_FAIL) {
+ svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ svm->vcpu.run->internal.ndata = 0;
+ return 0;
+ }
+ return 1;
+}
+
+static int cpuid_interception(struct vcpu_svm *svm)
+{
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
+ return kvm_emulate_cpuid(&svm->vcpu);
+}
+
+static int iret_interception(struct vcpu_svm *svm)
+{
+ ++svm->vcpu.stat.nmi_window_exits;
+ clr_intercept(svm, INTERCEPT_IRET);
+ svm->vcpu.arch.hflags |= HF_IRET_MASK;
+ svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ return 1;
+}
+
+static int invd_interception(struct vcpu_svm *svm)
+{
+ /* Treat an INVD instruction as a NOP and just skip it. */
+ return kvm_skip_emulated_instruction(&svm->vcpu);
+}
+
+static int invlpg_interception(struct vcpu_svm *svm)
+{
+ if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+
+ kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
+ return kvm_skip_emulated_instruction(&svm->vcpu);
+}
+
+static int emulate_on_interception(struct vcpu_svm *svm)
+{
+ return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+}
+
+static int rsm_interception(struct vcpu_svm *svm)
+{
+ return kvm_emulate_instruction_from_buffer(&svm->vcpu,
+ rsm_ins_bytes, 2) == EMULATE_DONE;
+}
+
+static int rdpmc_interception(struct vcpu_svm *svm)
+{
+ int err;
+
+ if (!static_cpu_has(X86_FEATURE_NRIPS))
+ return emulate_on_interception(svm);
+
+ err = kvm_rdpmc(&svm->vcpu);
+ return kvm_complete_insn_gp(&svm->vcpu, err);
+}
+
+static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
+ unsigned long val)
+{
+ unsigned long cr0 = svm->vcpu.arch.cr0;
+ bool ret = false;
+ u64 intercept;
+
+ intercept = svm->nested.intercept;
+
+ if (!is_guest_mode(&svm->vcpu) ||
+ (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
+ return false;
+
+ cr0 &= ~SVM_CR0_SELECTIVE_MASK;
+ val &= ~SVM_CR0_SELECTIVE_MASK;
+
+ if (cr0 ^ val) {
+ svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+ ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
+ }
+
+ return ret;
+}
+
+#define CR_VALID (1ULL << 63)
+
+static int cr_interception(struct vcpu_svm *svm)
+{
+ int reg, cr;
+ unsigned long val;
+ int err;
+
+ if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_on_interception(svm);
+
+ if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
+ return emulate_on_interception(svm);
+
+ reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+ if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
+ cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
+ else
+ cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
+
+ err = 0;
+ if (cr >= 16) { /* mov to cr */
+ cr -= 16;
+ val = kvm_register_readl(&svm->vcpu, reg);
+ switch (cr) {
+ case 0:
+ if (!check_selective_cr0_intercepted(svm, val))
+ err = kvm_set_cr0(&svm->vcpu, val);
+ else
+ return 1;
+
+ break;
+ case 3:
+ err = kvm_set_cr3(&svm->vcpu, val);
+ break;
+ case 4:
+ err = kvm_set_cr4(&svm->vcpu, val);
+ break;
+ case 8:
+ err = kvm_set_cr8(&svm->vcpu, val);
+ break;
+ default:
+ WARN(1, "unhandled write to CR%d", cr);
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+ }
+ } else { /* mov from cr */
+ switch (cr) {
+ case 0:
+ val = kvm_read_cr0(&svm->vcpu);
+ break;
+ case 2:
+ val = svm->vcpu.arch.cr2;
+ break;
+ case 3:
+ val = kvm_read_cr3(&svm->vcpu);
+ break;
+ case 4:
+ val = kvm_read_cr4(&svm->vcpu);
+ break;
+ case 8:
+ val = kvm_get_cr8(&svm->vcpu);
+ break;
+ default:
+ WARN(1, "unhandled read from CR%d", cr);
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+ }
+ kvm_register_writel(&svm->vcpu, reg, val);
+ }
+ return kvm_complete_insn_gp(&svm->vcpu, err);
+}
+
+static int dr_interception(struct vcpu_svm *svm)
+{
+ int reg, dr;
+ unsigned long val;
+
+ if (svm->vcpu.guest_debug == 0) {
+ /*
+ * No more DR vmexits; force a reload of the debug registers
+ * and reenter on this instruction. The next vmexit will
+ * retrieve the full state of the debug registers.
+ */
+ clr_dr_intercepts(svm);
+ svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+ return 1;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_on_interception(svm);
+
+ reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+ dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
+
+ if (dr >= 16) { /* mov to DRn */
+ if (!kvm_require_dr(&svm->vcpu, dr - 16))
+ return 1;
+ val = kvm_register_readl(&svm->vcpu, reg);
+ kvm_set_dr(&svm->vcpu, dr - 16, val);
+ } else {
+ if (!kvm_require_dr(&svm->vcpu, dr))
+ return 1;
+ kvm_get_dr(&svm->vcpu, dr, &val);
+ kvm_register_writel(&svm->vcpu, reg, val);
+ }
+
+ return kvm_skip_emulated_instruction(&svm->vcpu);
+}
+
+static int cr8_write_interception(struct vcpu_svm *svm)
+{
+ struct kvm_run *kvm_run = svm->vcpu.run;
+ int r;
+
+ u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
+ /* instruction emulation calls kvm_set_cr8() */
+ r = cr_interception(svm);
+ if (lapic_in_kernel(&svm->vcpu))
+ return r;
+ if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
+ return r;
+ kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+ return 0;
+}
+
+static int svm_get_msr_feature(struct kvm_msr_entry *msr)
+{
+ msr->data = 0;
+
+ switch (msr->index) {
+ case MSR_F10H_DECFG:
+ if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
+ msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
+ break;
+ default:
+ return 1;
+ }
+
+ return 0;
+}
+
+static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ switch (msr_info->index) {
+ case MSR_STAR:
+ msr_info->data = svm->vmcb->save.star;
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_LSTAR:
+ msr_info->data = svm->vmcb->save.lstar;
+ break;
+ case MSR_CSTAR:
+ msr_info->data = svm->vmcb->save.cstar;
+ break;
+ case MSR_KERNEL_GS_BASE:
+ msr_info->data = svm->vmcb->save.kernel_gs_base;
+ break;
+ case MSR_SYSCALL_MASK:
+ msr_info->data = svm->vmcb->save.sfmask;
+ break;
+#endif
+ case MSR_IA32_SYSENTER_CS:
+ msr_info->data = svm->vmcb->save.sysenter_cs;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ msr_info->data = svm->sysenter_eip;
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ msr_info->data = svm->sysenter_esp;
+ break;
+ case MSR_TSC_AUX:
+ if (!boot_cpu_has(X86_FEATURE_RDTSCP))
+ return 1;
+ msr_info->data = svm->tsc_aux;
+ break;
+ /*
+ * Nobody will change the following 5 values in the VMCB so we can
+ * safely return them on rdmsr. They will always be 0 until LBRV is
+ * implemented.
+ */
+ case MSR_IA32_DEBUGCTLMSR:
+ msr_info->data = svm->vmcb->save.dbgctl;
+ break;
+ case MSR_IA32_LASTBRANCHFROMIP:
+ msr_info->data = svm->vmcb->save.br_from;
+ break;
+ case MSR_IA32_LASTBRANCHTOIP:
+ msr_info->data = svm->vmcb->save.br_to;
+ break;
+ case MSR_IA32_LASTINTFROMIP:
+ msr_info->data = svm->vmcb->save.last_excp_from;
+ break;
+ case MSR_IA32_LASTINTTOIP:
+ msr_info->data = svm->vmcb->save.last_excp_to;
+ break;
+ case MSR_VM_HSAVE_PA:
+ msr_info->data = svm->nested.hsave_msr;
+ break;
+ case MSR_VM_CR:
+ msr_info->data = svm->nested.vm_cr_msr;
+ break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_has_spec_ctrl_msr(vcpu))
+ return 1;
+
+ msr_info->data = svm->spec_ctrl;
+ break;
+ case MSR_AMD64_VIRT_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
+ return 1;
+
+ msr_info->data = svm->virt_spec_ctrl;
+ break;
+ case MSR_F15H_IC_CFG: {
+
+ int family, model;
+
+ family = guest_cpuid_family(vcpu);
+ model = guest_cpuid_model(vcpu);
+
+ if (family < 0 || model < 0)
+ return kvm_get_msr_common(vcpu, msr_info);
+
+ msr_info->data = 0;
+
+ if (family == 0x15 &&
+ (model >= 0x2 && model < 0x20))
+ msr_info->data = 0x1E;
+ }
+ break;
+ case MSR_F10H_DECFG:
+ msr_info->data = svm->msr_decfg;
+ break;
+ default:
+ return kvm_get_msr_common(vcpu, msr_info);
+ }
+ return 0;
+}
+
+static int rdmsr_interception(struct vcpu_svm *svm)
+{
+ u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
+ struct msr_data msr_info;
+
+ msr_info.index = ecx;
+ msr_info.host_initiated = false;
+ if (svm_get_msr(&svm->vcpu, &msr_info)) {
+ trace_kvm_msr_read_ex(ecx);
+ kvm_inject_gp(&svm->vcpu, 0);
+ return 1;
+ } else {
+ trace_kvm_msr_read(ecx, msr_info.data);
+
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RAX,
+ msr_info.data & 0xffffffff);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RDX,
+ msr_info.data >> 32);
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
+ return kvm_skip_emulated_instruction(&svm->vcpu);
+ }
+}
+
+static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int svm_dis, chg_mask;
+
+ if (data & ~SVM_VM_CR_VALID_MASK)
+ return 1;
+
+ chg_mask = SVM_VM_CR_VALID_MASK;
+
+ if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
+ chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
+
+ svm->nested.vm_cr_msr &= ~chg_mask;
+ svm->nested.vm_cr_msr |= (data & chg_mask);
+
+ svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
+
+ /* check for svm_disable while efer.svme is set */
+ if (svm_dis && (vcpu->arch.efer & EFER_SVME))
+ return 1;
+
+ return 0;
+}
+
+static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ u32 ecx = msr->index;
+ u64 data = msr->data;
+ switch (ecx) {
+ case MSR_IA32_CR_PAT:
+ if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
+ return 1;
+ vcpu->arch.pat = data;
+ svm->vmcb->save.g_pat = data;
+ mark_dirty(svm->vmcb, VMCB_NPT);
+ break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr->host_initiated &&
+ !guest_has_spec_ctrl_msr(vcpu))
+ return 1;
+
+ /* The STIBP bit doesn't fault even if it's not advertised */
+ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
+ return 1;
+
+ svm->spec_ctrl = data;
+
+ if (!data)
+ break;
+
+ /*
+ * For non-nested:
+ * When it's written (to non-zero) for the first time, pass
+ * it through.
+ *
+ * For nested:
+ * The handling of the MSR bitmap for L2 guests is done in
+ * nested_svm_vmrun_msrpm.
+ * We update the L1 MSR bit as well since it will end up
+ * touching the MSR anyway now.
+ */
+ set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+ break;
+ case MSR_IA32_PRED_CMD:
+ if (!msr->host_initiated &&
+ !guest_has_pred_cmd_msr(vcpu))
+ return 1;
+
+ if (data & ~PRED_CMD_IBPB)
+ return 1;
+ if (!data)
+ break;
+
+ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+ if (is_guest_mode(vcpu))
+ break;
+ set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
+ break;
+ case MSR_AMD64_VIRT_SPEC_CTRL:
+ if (!msr->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
+ return 1;
+
+ if (data & ~SPEC_CTRL_SSBD)
+ return 1;
+
+ svm->virt_spec_ctrl = data;
+ break;
+ case MSR_STAR:
+ svm->vmcb->save.star = data;
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_LSTAR:
+ svm->vmcb->save.lstar = data;
+ break;
+ case MSR_CSTAR:
+ svm->vmcb->save.cstar = data;
+ break;
+ case MSR_KERNEL_GS_BASE:
+ svm->vmcb->save.kernel_gs_base = data;
+ break;
+ case MSR_SYSCALL_MASK:
+ svm->vmcb->save.sfmask = data;
+ break;
+#endif
+ case MSR_IA32_SYSENTER_CS:
+ svm->vmcb->save.sysenter_cs = data;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ svm->sysenter_eip = data;
+ svm->vmcb->save.sysenter_eip = data;
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ svm->sysenter_esp = data;
+ svm->vmcb->save.sysenter_esp = data;
+ break;
+ case MSR_TSC_AUX:
+ if (!boot_cpu_has(X86_FEATURE_RDTSCP))
+ return 1;
+
+ /*
+ * This is rare, so we update the MSR here instead of using
+ * direct_access_msrs. Doing that would require a rdmsr in
+ * svm_vcpu_put.
+ */
+ svm->tsc_aux = data;
+ wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!boot_cpu_has(X86_FEATURE_LBRV)) {
+ vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
+ __func__, data);
+ break;
+ }
+ if (data & DEBUGCTL_RESERVED_BITS)
+ return 1;
+
+ svm->vmcb->save.dbgctl = data;
+ mark_dirty(svm->vmcb, VMCB_LBR);
+ if (data & (1ULL<<0))
+ svm_enable_lbrv(svm);
+ else
+ svm_disable_lbrv(svm);
+ break;
+ case MSR_VM_HSAVE_PA:
+ svm->nested.hsave_msr = data;
+ break;
+ case MSR_VM_CR:
+ return svm_set_vm_cr(vcpu, data);
+ case MSR_VM_IGNNE:
+ vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
+ break;
+ case MSR_F10H_DECFG: {
+ struct kvm_msr_entry msr_entry;
+
+ msr_entry.index = msr->index;
+ if (svm_get_msr_feature(&msr_entry))
+ return 1;
+
+ /* Check the supported bits */
+ if (data & ~msr_entry.data)
+ return 1;
+
+ /* Don't allow the guest to change a bit, #GP */
+ if (!msr->host_initiated && (data ^ msr_entry.data))
+ return 1;
+
+ svm->msr_decfg = data;
+ break;
+ }
+ case MSR_IA32_APICBASE:
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_update_vapic_bar(to_svm(vcpu), data);
+ /* Follow through */
+ default:
+ return kvm_set_msr_common(vcpu, msr);
+ }
+ return 0;
+}
+
+static int wrmsr_interception(struct vcpu_svm *svm)
+{
+ struct msr_data msr;
+ u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
+ u64 data = kvm_read_edx_eax(&svm->vcpu);
+
+ msr.data = data;
+ msr.index = ecx;
+ msr.host_initiated = false;
+
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
+ if (kvm_set_msr(&svm->vcpu, &msr)) {
+ trace_kvm_msr_write_ex(ecx, data);
+ kvm_inject_gp(&svm->vcpu, 0);
+ return 1;
+ } else {
+ trace_kvm_msr_write(ecx, data);
+ return kvm_skip_emulated_instruction(&svm->vcpu);
+ }
+}
+
+static int msr_interception(struct vcpu_svm *svm)
+{
+ if (svm->vmcb->control.exit_info_1)
+ return wrmsr_interception(svm);
+ else
+ return rdmsr_interception(svm);
+}
+
+static int interrupt_window_interception(struct vcpu_svm *svm)
+{
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ svm_clear_vintr(svm);
+ svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+ mark_dirty(svm->vmcb, VMCB_INTR);
+ ++svm->vcpu.stat.irq_window_exits;
+ return 1;
+}
+
+static int pause_interception(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ bool in_kernel = (svm_get_cpl(vcpu) == 0);
+
+ if (pause_filter_thresh)
+ grow_ple_window(vcpu);
+
+ kvm_vcpu_on_spin(vcpu, in_kernel);
+ return 1;
+}
+
+static int nop_interception(struct vcpu_svm *svm)
+{
+ return kvm_skip_emulated_instruction(&(svm->vcpu));
+}
+
+static int monitor_interception(struct vcpu_svm *svm)
+{
+ printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
+ return nop_interception(svm);
+}
+
+static int mwait_interception(struct vcpu_svm *svm)
+{
+ printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
+ return nop_interception(svm);
+}
+
+enum avic_ipi_failure_cause {
+ AVIC_IPI_FAILURE_INVALID_INT_TYPE,
+ AVIC_IPI_FAILURE_TARGET_NOT_RUNNING,
+ AVIC_IPI_FAILURE_INVALID_TARGET,
+ AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
+};
+
+static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
+{
+ u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
+ u32 icrl = svm->vmcb->control.exit_info_1;
+ u32 id = svm->vmcb->control.exit_info_2 >> 32;
+ u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
+ struct kvm_lapic *apic = svm->vcpu.arch.apic;
+
+ trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
+
+ switch (id) {
+ case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
+ /*
+ * AVIC hardware handles the generation of
+ * IPIs when the specified Message Type is Fixed
+ * (also known as fixed delivery mode) and
+ * the Trigger Mode is edge-triggered. The hardware
+ * also supports self and broadcast delivery modes
+ * specified via the Destination Shorthand(DSH)
+ * field of the ICRL. Logical and physical APIC ID
+ * formats are supported. All other IPI types cause
+ * a #VMEXIT, which needs to emulated.
+ */
+ kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
+ kvm_lapic_reg_write(apic, APIC_ICR, icrl);
+ break;
+ case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
+ int i;
+ struct kvm_vcpu *vcpu;
+ struct kvm *kvm = svm->vcpu.kvm;
+ struct kvm_lapic *apic = svm->vcpu.arch.apic;
+
+ /*
+ * At this point, we expect that the AVIC HW has already
+ * set the appropriate IRR bits on the valid target
+ * vcpus. So, we just need to kick the appropriate vcpu.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ bool m = kvm_apic_match_dest(vcpu, apic,
+ icrl & KVM_APIC_SHORT_MASK,
+ GET_APIC_DEST_FIELD(icrh),
+ icrl & KVM_APIC_DEST_MASK);
+
+ if (m && !avic_vcpu_is_running(vcpu))
+ kvm_vcpu_wake_up(vcpu);
+ }
+ break;
+ }
+ case AVIC_IPI_FAILURE_INVALID_TARGET:
+ break;
+ case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
+ WARN_ONCE(1, "Invalid backing page\n");
+ break;
+ default:
+ pr_err("Unknown IPI interception\n");
+ }
+
+ return 1;
+}
+
+static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
+{
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+ int index;
+ u32 *logical_apic_id_table;
+ int dlid = GET_APIC_LOGICAL_ID(ldr);
+
+ if (!dlid)
+ return NULL;
+
+ if (flat) { /* flat */
+ index = ffs(dlid) - 1;
+ if (index > 7)
+ return NULL;
+ } else { /* cluster */
+ int cluster = (dlid & 0xf0) >> 4;
+ int apic = ffs(dlid & 0x0f) - 1;
+
+ if ((apic < 0) || (apic > 7) ||
+ (cluster >= 0xf))
+ return NULL;
+ index = (cluster << 2) + apic;
+ }
+
+ logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
+
+ return &logical_apic_id_table[index];
+}
+
+static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr,
+ bool valid)
+{
+ bool flat;
+ u32 *entry, new_entry;
+
+ flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
+ entry = avic_get_logical_id_entry(vcpu, ldr, flat);
+ if (!entry)
+ return -EINVAL;
+
+ new_entry = READ_ONCE(*entry);
+ new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
+ if (valid)
+ new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
+ else
+ new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
+ WRITE_ONCE(*entry, new_entry);
+
+ return 0;
+}
+
+static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
+{
+ int ret;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
+
+ if (!ldr)
+ return 1;
+
+ ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true);
+ if (ret && svm->ldr_reg) {
+ avic_ldr_write(vcpu, 0, svm->ldr_reg, false);
+ svm->ldr_reg = 0;
+ } else {
+ svm->ldr_reg = ldr;
+ }
+ return ret;
+}
+
+static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
+{
+ u64 *old, *new;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID);
+ u32 id = (apic_id_reg >> 24) & 0xff;
+
+ if (vcpu->vcpu_id == id)
+ return 0;
+
+ old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
+ new = avic_get_physical_id_entry(vcpu, id);
+ if (!new || !old)
+ return 1;
+
+ /* We need to move physical_id_entry to new offset */
+ *new = *old;
+ *old = 0ULL;
+ to_svm(vcpu)->avic_physical_id_cache = new;
+
+ /*
+ * Also update the guest physical APIC ID in the logical
+ * APIC ID table entry if already setup the LDR.
+ */
+ if (svm->ldr_reg)
+ avic_handle_ldr_update(vcpu);
+
+ return 0;
+}
+
+static int avic_handle_dfr_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
+ u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
+ u32 mod = (dfr >> 28) & 0xf;
+
+ /*
+ * We assume that all local APICs are using the same type.
+ * If this changes, we need to flush the AVIC logical
+ * APID id table.
+ */
+ if (kvm_svm->ldr_mode == mod)
+ return 0;
+
+ clear_page(page_address(kvm_svm->avic_logical_id_table_page));
+ kvm_svm->ldr_mode = mod;
+
+ if (svm->ldr_reg)
+ avic_handle_ldr_update(vcpu);
+ return 0;
+}
+
+static int avic_unaccel_trap_write(struct vcpu_svm *svm)
+{
+ struct kvm_lapic *apic = svm->vcpu.arch.apic;
+ u32 offset = svm->vmcb->control.exit_info_1 &
+ AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+
+ switch (offset) {
+ case APIC_ID:
+ if (avic_handle_apic_id_update(&svm->vcpu))
+ return 0;
+ break;
+ case APIC_LDR:
+ if (avic_handle_ldr_update(&svm->vcpu))
+ return 0;
+ break;
+ case APIC_DFR:
+ avic_handle_dfr_update(&svm->vcpu);
+ break;
+ default:
+ break;
+ }
+
+ kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
+
+ return 1;
+}
+
+static bool is_avic_unaccelerated_access_trap(u32 offset)
+{
+ bool ret = false;
+
+ switch (offset) {
+ case APIC_ID:
+ case APIC_EOI:
+ case APIC_RRR:
+ case APIC_LDR:
+ case APIC_DFR:
+ case APIC_SPIV:
+ case APIC_ESR:
+ case APIC_ICR:
+ case APIC_LVTT:
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVT0:
+ case APIC_LVT1:
+ case APIC_LVTERR:
+ case APIC_TMICT:
+ case APIC_TDCR:
+ ret = true;
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+static int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
+{
+ int ret = 0;
+ u32 offset = svm->vmcb->control.exit_info_1 &
+ AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+ u32 vector = svm->vmcb->control.exit_info_2 &
+ AVIC_UNACCEL_ACCESS_VECTOR_MASK;
+ bool write = (svm->vmcb->control.exit_info_1 >> 32) &
+ AVIC_UNACCEL_ACCESS_WRITE_MASK;
+ bool trap = is_avic_unaccelerated_access_trap(offset);
+
+ trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
+ trap, write, vector);
+ if (trap) {
+ /* Handling Trap */
+ WARN_ONCE(!write, "svm: Handling trap read.\n");
+ ret = avic_unaccel_trap_write(svm);
+ } else {
+ /* Handling Fault */
+ ret = (kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE);
+ }
+
+ return ret;
+}
+
+static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
+ [SVM_EXIT_READ_CR0] = cr_interception,
+ [SVM_EXIT_READ_CR3] = cr_interception,
+ [SVM_EXIT_READ_CR4] = cr_interception,
+ [SVM_EXIT_READ_CR8] = cr_interception,
+ [SVM_EXIT_CR0_SEL_WRITE] = cr_interception,
+ [SVM_EXIT_WRITE_CR0] = cr_interception,
+ [SVM_EXIT_WRITE_CR3] = cr_interception,
+ [SVM_EXIT_WRITE_CR4] = cr_interception,
+ [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
+ [SVM_EXIT_READ_DR0] = dr_interception,
+ [SVM_EXIT_READ_DR1] = dr_interception,
+ [SVM_EXIT_READ_DR2] = dr_interception,
+ [SVM_EXIT_READ_DR3] = dr_interception,
+ [SVM_EXIT_READ_DR4] = dr_interception,
+ [SVM_EXIT_READ_DR5] = dr_interception,
+ [SVM_EXIT_READ_DR6] = dr_interception,
+ [SVM_EXIT_READ_DR7] = dr_interception,
+ [SVM_EXIT_WRITE_DR0] = dr_interception,
+ [SVM_EXIT_WRITE_DR1] = dr_interception,
+ [SVM_EXIT_WRITE_DR2] = dr_interception,
+ [SVM_EXIT_WRITE_DR3] = dr_interception,
+ [SVM_EXIT_WRITE_DR4] = dr_interception,
+ [SVM_EXIT_WRITE_DR5] = dr_interception,
+ [SVM_EXIT_WRITE_DR6] = dr_interception,
+ [SVM_EXIT_WRITE_DR7] = dr_interception,
+ [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
+ [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
+ [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
+ [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
+ [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
+ [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
+ [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception,
+ [SVM_EXIT_INTR] = intr_interception,
+ [SVM_EXIT_NMI] = nmi_interception,
+ [SVM_EXIT_SMI] = nop_on_interception,
+ [SVM_EXIT_INIT] = nop_on_interception,
+ [SVM_EXIT_VINTR] = interrupt_window_interception,
+ [SVM_EXIT_RDPMC] = rdpmc_interception,
+ [SVM_EXIT_CPUID] = cpuid_interception,
+ [SVM_EXIT_IRET] = iret_interception,
+ [SVM_EXIT_INVD] = invd_interception,
+ [SVM_EXIT_PAUSE] = pause_interception,
+ [SVM_EXIT_HLT] = halt_interception,
+ [SVM_EXIT_INVLPG] = invlpg_interception,
+ [SVM_EXIT_INVLPGA] = invlpga_interception,
+ [SVM_EXIT_IOIO] = io_interception,
+ [SVM_EXIT_MSR] = msr_interception,
+ [SVM_EXIT_TASK_SWITCH] = task_switch_interception,
+ [SVM_EXIT_SHUTDOWN] = shutdown_interception,
+ [SVM_EXIT_VMRUN] = vmrun_interception,
+ [SVM_EXIT_VMMCALL] = vmmcall_interception,
+ [SVM_EXIT_VMLOAD] = vmload_interception,
+ [SVM_EXIT_VMSAVE] = vmsave_interception,
+ [SVM_EXIT_STGI] = stgi_interception,
+ [SVM_EXIT_CLGI] = clgi_interception,
+ [SVM_EXIT_SKINIT] = skinit_interception,
+ [SVM_EXIT_WBINVD] = wbinvd_interception,
+ [SVM_EXIT_MONITOR] = monitor_interception,
+ [SVM_EXIT_MWAIT] = mwait_interception,
+ [SVM_EXIT_XSETBV] = xsetbv_interception,
+ [SVM_EXIT_NPF] = npf_interception,
+ [SVM_EXIT_RSM] = rsm_interception,
+ [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
+ [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
+};
+
+static void dump_vmcb(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct vmcb_save_area *save = &svm->vmcb->save;
+
+ pr_err("VMCB Control Area:\n");
+ pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
+ pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
+ pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
+ pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
+ pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
+ pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
+ pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
+ pr_err("%-20s%d\n", "pause filter threshold:",
+ control->pause_filter_thresh);
+ pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
+ pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
+ pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
+ pr_err("%-20s%d\n", "asid:", control->asid);
+ pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
+ pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
+ pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
+ pr_err("%-20s%08x\n", "int_state:", control->int_state);
+ pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
+ pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
+ pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
+ pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
+ pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
+ pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
+ pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
+ pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
+ pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
+ pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
+ pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
+ pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
+ pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
+ pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
+ pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
+ pr_err("VMCB State Save Area:\n");
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "es:",
+ save->es.selector, save->es.attrib,
+ save->es.limit, save->es.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "cs:",
+ save->cs.selector, save->cs.attrib,
+ save->cs.limit, save->cs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ss:",
+ save->ss.selector, save->ss.attrib,
+ save->ss.limit, save->ss.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ds:",
+ save->ds.selector, save->ds.attrib,
+ save->ds.limit, save->ds.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "fs:",
+ save->fs.selector, save->fs.attrib,
+ save->fs.limit, save->fs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "gs:",
+ save->gs.selector, save->gs.attrib,
+ save->gs.limit, save->gs.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "gdtr:",
+ save->gdtr.selector, save->gdtr.attrib,
+ save->gdtr.limit, save->gdtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "ldtr:",
+ save->ldtr.selector, save->ldtr.attrib,
+ save->ldtr.limit, save->ldtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "idtr:",
+ save->idtr.selector, save->idtr.attrib,
+ save->idtr.limit, save->idtr.base);
+ pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
+ "tr:",
+ save->tr.selector, save->tr.attrib,
+ save->tr.limit, save->tr.base);
+ pr_err("cpl: %d efer: %016llx\n",
+ save->cpl, save->efer);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cr0:", save->cr0, "cr2:", save->cr2);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cr3:", save->cr3, "cr4:", save->cr4);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "dr6:", save->dr6, "dr7:", save->dr7);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rip:", save->rip, "rflags:", save->rflags);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rsp:", save->rsp, "rax:", save->rax);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "star:", save->star, "lstar:", save->lstar);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "cstar:", save->cstar, "sfmask:", save->sfmask);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "kernel_gs_base:", save->kernel_gs_base,
+ "sysenter_cs:", save->sysenter_cs);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "sysenter_esp:", save->sysenter_esp,
+ "sysenter_eip:", save->sysenter_eip);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "br_from:", save->br_from, "br_to:", save->br_to);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "excp_from:", save->last_excp_from,
+ "excp_to:", save->last_excp_to);
+}
+
+static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+{
+ struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
+ *info1 = control->exit_info_1;
+ *info2 = control->exit_info_2;
+}
+
+static int handle_exit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
+ u32 exit_code = svm->vmcb->control.exit_code;
+
+ trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
+
+ if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
+ vcpu->arch.cr0 = svm->vmcb->save.cr0;
+ if (npt_enabled)
+ vcpu->arch.cr3 = svm->vmcb->save.cr3;
+
+ if (unlikely(svm->nested.exit_required)) {
+ nested_svm_vmexit(svm);
+ svm->nested.exit_required = false;
+
+ return 1;
+ }
+
+ if (is_guest_mode(vcpu)) {
+ int vmexit;
+
+ trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
+ svm->vmcb->control.exit_info_1,
+ svm->vmcb->control.exit_info_2,
+ svm->vmcb->control.exit_int_info,
+ svm->vmcb->control.exit_int_info_err,
+ KVM_ISA_SVM);
+
+ vmexit = nested_svm_exit_special(svm);
+
+ if (vmexit == NESTED_EXIT_CONTINUE)
+ vmexit = nested_svm_exit_handled(svm);
+
+ if (vmexit == NESTED_EXIT_DONE)
+ return 1;
+ }
+
+ svm_complete_interrupts(svm);
+
+ if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
+ kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ kvm_run->fail_entry.hardware_entry_failure_reason
+ = svm->vmcb->control.exit_code;
+ pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
+ dump_vmcb(vcpu);
+ return 0;
+ }
+
+ if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
+ exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
+ exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
+ exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
+ printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
+ "exit_code 0x%x\n",
+ __func__, svm->vmcb->control.exit_int_info,
+ exit_code);
+
+ if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
+ || !svm_exit_handlers[exit_code]) {
+ WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ return svm_exit_handlers[exit_code](svm);
+}
+
+static void reload_tss(struct kvm_vcpu *vcpu)
+{
+ int cpu = raw_smp_processor_id();
+
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ sd->tss_desc->type = 9; /* available 32/64-bit TSS */
+ load_TR_desc();
+}
+
+static void pre_sev_run(struct vcpu_svm *svm, int cpu)
+{
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+ int asid = sev_get_asid(svm->vcpu.kvm);
+
+ /* Assign the asid allocated with this SEV guest */
+ svm->vmcb->control.asid = asid;
+
+ /*
+ * Flush guest TLB:
+ *
+ * 1) when different VMCB for the same ASID is to be run on the same host CPU.
+ * 2) or this VMCB was executed on different host CPU in previous VMRUNs.
+ */
+ if (sd->sev_vmcbs[asid] == svm->vmcb &&
+ svm->last_cpu == cpu)
+ return;
+
+ svm->last_cpu = cpu;
+ sd->sev_vmcbs[asid] = svm->vmcb;
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+ mark_dirty(svm->vmcb, VMCB_ASID);
+}
+
+static void pre_svm_run(struct vcpu_svm *svm)
+{
+ int cpu = raw_smp_processor_id();
+
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+
+ if (sev_guest(svm->vcpu.kvm))
+ return pre_sev_run(svm, cpu);
+
+ /* FIXME: handle wraparound of asid_generation */
+ if (svm->asid_generation != sd->asid_generation)
+ new_asid(svm, sd);
+}
+
+static void svm_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
+ vcpu->arch.hflags |= HF_NMI_MASK;
+ set_intercept(svm, INTERCEPT_IRET);
+ ++vcpu->stat.nmi_injections;
+}
+
+static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
+{
+ struct vmcb_control_area *control;
+
+ /* The following fields are ignored when AVIC is enabled */
+ control = &svm->vmcb->control;
+ control->int_vector = irq;
+ control->int_ctl &= ~V_INTR_PRIO_MASK;
+ control->int_ctl |= V_IRQ_MASK |
+ ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
+ mark_dirty(svm->vmcb, VMCB_INTR);
+}
+
+static void svm_set_irq(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ BUG_ON(!(gif_set(svm)));
+
+ trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
+ ++vcpu->stat.irq_injections;
+
+ svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
+ SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
+}
+
+static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
+{
+ return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
+}
+
+static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (svm_nested_virtualize_tpr(vcpu) ||
+ kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ if (irr == -1)
+ return;
+
+ if (tpr >= irr)
+ set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+}
+
+static void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
+{
+ return;
+}
+
+static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu)
+{
+ return avic && irqchip_split(vcpu->kvm);
+}
+
+static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+{
+}
+
+static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+{
+}
+
+/* Note: Currently only used by Hyper-V. */
+static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (!kvm_vcpu_apicv_active(&svm->vcpu))
+ return;
+
+ vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
+ mark_dirty(vmcb, VMCB_INTR);
+}
+
+static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+ return;
+}
+
+static int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
+{
+ if (!vcpu->arch.apicv_active)
+ return -1;
+
+ kvm_lapic_set_irr(vec, vcpu->arch.apic);
+ smp_mb__after_atomic();
+
+ if (avic_vcpu_is_running(vcpu))
+ wrmsrl(SVM_AVIC_DOORBELL,
+ kvm_cpu_get_apicid(vcpu->cpu));
+ else
+ kvm_vcpu_wake_up(vcpu);
+
+ return 0;
+}
+
+static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+
+static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+ unsigned long flags;
+ struct amd_svm_iommu_ir *cur;
+
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+ list_for_each_entry(cur, &svm->ir_list, node) {
+ if (cur->data != pi->ir_data)
+ continue;
+ list_del(&cur->node);
+ kfree(cur);
+ break;
+ }
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct amd_svm_iommu_ir *ir;
+
+ /**
+ * In some cases, the existing irte is updaed and re-set,
+ * so we need to check here if it's already been * added
+ * to the ir_list.
+ */
+ if (pi->ir_data && (pi->prev_ga_tag != 0)) {
+ struct kvm *kvm = svm->vcpu.kvm;
+ u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
+ struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+ struct vcpu_svm *prev_svm;
+
+ if (!prev_vcpu) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ prev_svm = to_svm(prev_vcpu);
+ svm_ir_list_del(prev_svm, pi);
+ }
+
+ /**
+ * Allocating new amd_iommu_pi_data, which will get
+ * add to the per-vcpu ir_list.
+ */
+ ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL);
+ if (!ir) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ir->data = pi->ir_data;
+
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+ list_add(&ir->node, &svm->ir_list);
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+out:
+ return ret;
+}
+
+/**
+ * Note:
+ * The HW cannot support posting multicast/broadcast
+ * interrupts to a vCPU. So, we still use legacy interrupt
+ * remapping for these kind of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with single CPU as the destination, e.g. user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ */
+static int
+get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
+ struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
+{
+ struct kvm_lapic_irq irq;
+ struct kvm_vcpu *vcpu = NULL;
+
+ kvm_set_msi_irq(kvm, e, &irq);
+
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+ pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
+ __func__, irq.vector);
+ return -1;
+ }
+
+ pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
+ irq.vector);
+ *svm = to_svm(vcpu);
+ vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
+ vcpu_info->vector = irq.vector;
+
+ return 0;
+}
+
+/*
+ * svm_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct kvm_irq_routing_table *irq_rt;
+ int idx, ret = -EINVAL;
+
+ if (!kvm_arch_has_assigned_device(kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return 0;
+
+ pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
+ __func__, host_irq, guest_irq, set);
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+ struct vcpu_data vcpu_info;
+ struct vcpu_svm *svm = NULL;
+
+ if (e->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+
+ /**
+ * Here, we setup with legacy mode in the following cases:
+ * 1. When cannot target interrupt to a specific vcpu.
+ * 2. Unsetting posted interrupt.
+ * 3. APIC virtialization is disabled for the vcpu.
+ */
+ if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
+ kvm_vcpu_apicv_active(&svm->vcpu)) {
+ struct amd_iommu_pi_data pi;
+
+ /* Try to enable guest_mode in IRTE */
+ pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
+ AVIC_HPA_MASK);
+ pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
+ svm->vcpu.vcpu_id);
+ pi.is_guest_mode = true;
+ pi.vcpu_data = &vcpu_info;
+ ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+ /**
+ * Here, we successfully setting up vcpu affinity in
+ * IOMMU guest mode. Now, we need to store the posted
+ * interrupt information in a per-vcpu ir_list so that
+ * we can reference to them directly when we update vcpu
+ * scheduling information in IOMMU irte.
+ */
+ if (!ret && pi.is_guest_mode)
+ svm_ir_list_add(svm, &pi);
+ } else {
+ /* Use legacy mode in IRTE */
+ struct amd_iommu_pi_data pi;
+
+ /**
+ * Here, pi is used to:
+ * - Tell IOMMU to use legacy mode for this interrupt.
+ * - Retrieve ga_tag of prior interrupt remapping data.
+ */
+ pi.prev_ga_tag = 0;
+ pi.is_guest_mode = false;
+ ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+ /**
+ * Check if the posted interrupt was previously
+ * setup with the guest_mode by checking if the ga_tag
+ * was cached. If so, we need to clean up the per-vcpu
+ * ir_list.
+ */
+ if (!ret && pi.prev_ga_tag) {
+ int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
+ struct kvm_vcpu *vcpu;
+
+ vcpu = kvm_get_vcpu_by_id(kvm, id);
+ if (vcpu)
+ svm_ir_list_del(to_svm(vcpu), &pi);
+ }
+ }
+
+ if (!ret && svm) {
+ trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
+ e->gsi, vcpu_info.vector,
+ vcpu_info.pi_desc_addr, set);
+ }
+
+ if (ret < 0) {
+ pr_err("%s: failed to update PI IRTE\n", __func__);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
+
+static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+ int ret;
+ ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+ !(svm->vcpu.arch.hflags & HF_NMI_MASK);
+ ret = ret && gif_set(svm) && nested_svm_nmi(svm);
+
+ return ret;
+}
+
+static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
+}
+
+static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (masked) {
+ svm->vcpu.arch.hflags |= HF_NMI_MASK;
+ set_intercept(svm, INTERCEPT_IRET);
+ } else {
+ svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
+ clr_intercept(svm, INTERCEPT_IRET);
+ }
+}
+
+static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+ int ret;
+
+ if (!gif_set(svm) ||
+ (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
+ return 0;
+
+ ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
+
+ if (is_guest_mode(vcpu))
+ return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
+
+ return ret;
+}
+
+static void enable_irq_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ /*
+ * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
+ * 1, because that's a separate STGI/VMRUN intercept. The next time we
+ * get that intercept, this function will be called again though and
+ * we'll get the vintr intercept. However, if the vGIF feature is
+ * enabled, the STGI interception will not occur. Enable the irq
+ * window under the assumption that the hardware will set the GIF.
+ */
+ if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) {
+ svm_set_vintr(svm);
+ svm_inject_irq(svm, 0x0);
+ }
+}
+
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
+ == HF_NMI_MASK)
+ return; /* IRET will cause a vm exit */
+
+ if (!gif_set(svm)) {
+ if (vgif_enabled(svm))
+ set_intercept(svm, INTERCEPT_STGI);
+ return; /* STGI will cause a vm exit */
+ }
+
+ if (svm->nested.exit_required)
+ return; /* we're not going to run the guest yet */
+
+ /*
+ * Something prevents NMI from been injected. Single step over possible
+ * problem (IRET or exception injection or interrupt shadow)
+ */
+ svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
+ svm->nmi_singlestep = true;
+ svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+}
+
+static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+ return 0;
+}
+
+static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+{
+ return 0;
+}
+
+static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+ else
+ svm->asid_generation--;
+}
+
+static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ invlpga(gva, svm->vmcb->control.asid);
+}
+
+static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (svm_nested_virtualize_tpr(vcpu))
+ return;
+
+ if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
+ int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
+ kvm_set_cr8(vcpu, cr8);
+ }
+}
+
+static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 cr8;
+
+ if (svm_nested_virtualize_tpr(vcpu) ||
+ kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ cr8 = kvm_get_cr8(vcpu);
+ svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
+ svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
+}
+
+static void svm_complete_interrupts(struct vcpu_svm *svm)
+{
+ u8 vector;
+ int type;
+ u32 exitintinfo = svm->vmcb->control.exit_int_info;
+ unsigned int3_injected = svm->int3_injected;
+
+ svm->int3_injected = 0;
+
+ /*
+ * If we've made progress since setting HF_IRET_MASK, we've
+ * executed an IRET and can allow NMI injection.
+ */
+ if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
+ && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
+ svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ }
+
+ svm->vcpu.arch.nmi_injected = false;
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
+ if (!(exitintinfo & SVM_EXITINTINFO_VALID))
+ return;
+
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+
+ vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
+ type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
+
+ switch (type) {
+ case SVM_EXITINTINFO_TYPE_NMI:
+ svm->vcpu.arch.nmi_injected = true;
+ break;
+ case SVM_EXITINTINFO_TYPE_EXEPT:
+ /*
+ * In case of software exceptions, do not reinject the vector,
+ * but re-execute the instruction instead. Rewind RIP first
+ * if we emulated INT3 before.
+ */
+ if (kvm_exception_is_soft(vector)) {
+ if (vector == BP_VECTOR && int3_injected &&
+ kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
+ kvm_rip_write(&svm->vcpu,
+ kvm_rip_read(&svm->vcpu) -
+ int3_injected);
+ break;
+ }
+ if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
+ u32 err = svm->vmcb->control.exit_int_info_err;
+ kvm_requeue_exception_e(&svm->vcpu, vector, err);
+
+ } else
+ kvm_requeue_exception(&svm->vcpu, vector);
+ break;
+ case SVM_EXITINTINFO_TYPE_INTR:
+ kvm_queue_interrupt(&svm->vcpu, vector, false);
+ break;
+ default:
+ break;
+ }
+}
+
+static void svm_cancel_injection(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+
+ control->exit_int_info = control->event_inj;
+ control->exit_int_info_err = control->event_inj_err;
+ control->event_inj = 0;
+ svm_complete_interrupts(svm);
+}
+
+static void svm_vcpu_run(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
+ /*
+ * A vmexit emulation is required before the vcpu can be executed
+ * again.
+ */
+ if (unlikely(svm->nested.exit_required))
+ return;
+
+ /*
+ * Disable singlestep if we're injecting an interrupt/exception.
+ * We don't want our modified rflags to be pushed on the stack where
+ * we might not be able to easily reset them if we disabled NMI
+ * singlestep later.
+ */
+ if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
+ /*
+ * Event injection happens before external interrupts cause a
+ * vmexit and interrupts are disabled here, so smp_send_reschedule
+ * is enough to force an immediate vmexit.
+ */
+ disable_nmi_singlestep(svm);
+ smp_send_reschedule(vcpu->cpu);
+ }
+
+ pre_svm_run(svm);
+
+ sync_lapic_to_cr8(vcpu);
+
+ svm->vmcb->save.cr2 = vcpu->arch.cr2;
+
+ clgi();
+ kvm_load_guest_xcr0(vcpu);
+
+ /*
+ * If this vCPU has touched SPEC_CTRL, restore the guest's value if
+ * it's non-zero. Since vmentry is serialising on affected CPUs, there
+ * is no need to worry about the conditional branch over the wrmsr
+ * being speculatively taken.
+ */
+ x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
+
+ local_irq_enable();
+
+ asm volatile (
+ "push %%" _ASM_BP "; \n\t"
+ "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
+ "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
+ "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
+ "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
+ "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
+ "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
+#ifdef CONFIG_X86_64
+ "mov %c[r8](%[svm]), %%r8 \n\t"
+ "mov %c[r9](%[svm]), %%r9 \n\t"
+ "mov %c[r10](%[svm]), %%r10 \n\t"
+ "mov %c[r11](%[svm]), %%r11 \n\t"
+ "mov %c[r12](%[svm]), %%r12 \n\t"
+ "mov %c[r13](%[svm]), %%r13 \n\t"
+ "mov %c[r14](%[svm]), %%r14 \n\t"
+ "mov %c[r15](%[svm]), %%r15 \n\t"
+#endif
+
+ /* Enter guest mode */
+ "push %%" _ASM_AX " \n\t"
+ "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
+ __ex(SVM_VMLOAD) "\n\t"
+ __ex(SVM_VMRUN) "\n\t"
+ __ex(SVM_VMSAVE) "\n\t"
+ "pop %%" _ASM_AX " \n\t"
+
+ /* Save guest registers, load host registers */
+ "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
+ "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
+ "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
+ "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
+ "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
+ "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
+#ifdef CONFIG_X86_64
+ "mov %%r8, %c[r8](%[svm]) \n\t"
+ "mov %%r9, %c[r9](%[svm]) \n\t"
+ "mov %%r10, %c[r10](%[svm]) \n\t"
+ "mov %%r11, %c[r11](%[svm]) \n\t"
+ "mov %%r12, %c[r12](%[svm]) \n\t"
+ "mov %%r13, %c[r13](%[svm]) \n\t"
+ "mov %%r14, %c[r14](%[svm]) \n\t"
+ "mov %%r15, %c[r15](%[svm]) \n\t"
+#endif
+ /*
+ * Clear host registers marked as clobbered to prevent
+ * speculative use.
+ */
+ "xor %%" _ASM_BX ", %%" _ASM_BX " \n\t"
+ "xor %%" _ASM_CX ", %%" _ASM_CX " \n\t"
+ "xor %%" _ASM_DX ", %%" _ASM_DX " \n\t"
+ "xor %%" _ASM_SI ", %%" _ASM_SI " \n\t"
+ "xor %%" _ASM_DI ", %%" _ASM_DI " \n\t"
+#ifdef CONFIG_X86_64
+ "xor %%r8, %%r8 \n\t"
+ "xor %%r9, %%r9 \n\t"
+ "xor %%r10, %%r10 \n\t"
+ "xor %%r11, %%r11 \n\t"
+ "xor %%r12, %%r12 \n\t"
+ "xor %%r13, %%r13 \n\t"
+ "xor %%r14, %%r14 \n\t"
+ "xor %%r15, %%r15 \n\t"
+#endif
+ "pop %%" _ASM_BP
+ :
+ : [svm]"a"(svm),
+ [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
+ [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
+ [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
+ [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
+ [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
+ [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
+ [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
+#ifdef CONFIG_X86_64
+ , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
+ [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
+ [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
+ [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
+ [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
+ [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
+ [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
+ [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
+#endif
+ : "cc", "memory"
+#ifdef CONFIG_X86_64
+ , "rbx", "rcx", "rdx", "rsi", "rdi"
+ , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
+#else
+ , "ebx", "ecx", "edx", "esi", "edi"
+#endif
+ );
+
+ /* Eliminate branch target predictions from guest mode */
+ vmexit_fill_RSB();
+
+#ifdef CONFIG_X86_64
+ wrmsrl(MSR_GS_BASE, svm->host.gs_base);
+#else
+ loadsegment(fs, svm->host.fs);
+#ifndef CONFIG_X86_32_LAZY_GS
+ loadsegment(gs, svm->host.gs);
+#endif
+#endif
+
+ /*
+ * We do not use IBRS in the kernel. If this vCPU has used the
+ * SPEC_CTRL MSR it may have left it on; save the value and
+ * turn it off. This is much more efficient than blindly adding
+ * it to the atomic save/restore list. Especially as the former
+ * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
+ *
+ * For non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+ * For nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
+ if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+ svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
+
+ reload_tss(vcpu);
+
+ local_irq_disable();
+
+ x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
+
+ vcpu->arch.cr2 = svm->vmcb->save.cr2;
+ vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+ vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+ vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
+
+ if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
+ kvm_before_interrupt(&svm->vcpu);
+
+ kvm_put_guest_xcr0(vcpu);
+ stgi();
+
+ /* Any pending NMI will happen here */
+
+ if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
+ kvm_after_interrupt(&svm->vcpu);
+
+ sync_cr8_to_lapic(vcpu);
+
+ svm->next_rip = 0;
+
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+ /* if exit due to PF check for async PF */
+ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
+ svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
+
+ if (npt_enabled) {
+ vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
+ vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
+ }
+
+ /*
+ * We need to handle MC intercepts here before the vcpu has a chance to
+ * change the physical cpu
+ */
+ if (unlikely(svm->vmcb->control.exit_code ==
+ SVM_EXIT_EXCP_BASE + MC_VECTOR))
+ svm_handle_mce(svm);
+
+ mark_all_clean(svm->vmcb);
+}
+STACK_FRAME_NON_STANDARD(svm_vcpu_run);
+
+static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->save.cr3 = __sme_set(root);
+ mark_dirty(svm->vmcb, VMCB_CR);
+}
+
+static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.nested_cr3 = __sme_set(root);
+ mark_dirty(svm->vmcb, VMCB_NPT);
+
+ /* Also sync guest cr3 here in case we live migrate */
+ svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
+ mark_dirty(svm->vmcb, VMCB_CR);
+}
+
+static int is_disabled(void)
+{
+ u64 vm_cr;
+
+ rdmsrl(MSR_VM_CR, vm_cr);
+ if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
+ return 1;
+
+ return 0;
+}
+
+static void
+svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+{
+ /*
+ * Patch in the VMMCALL instruction:
+ */
+ hypercall[0] = 0x0f;
+ hypercall[1] = 0x01;
+ hypercall[2] = 0xd9;
+}
+
+static void svm_check_processor_compat(void *rtn)
+{
+ *(int *)rtn = 0;
+}
+
+static bool svm_cpu_has_accelerated_tpr(void)
+{
+ return false;
+}
+
+static bool svm_has_emulated_msr(int index)
+{
+ switch (index) {
+ case MSR_IA32_MCG_EXT_CTL:
+ return false;
+ default:
+ break;
+ }
+
+ return true;
+}
+
+static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+{
+ return 0;
+}
+
+static void svm_cpuid_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /* Update nrips enabled cache */
+ svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
+
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC);
+}
+
+static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+{
+ switch (func) {
+ case 0x1:
+ if (avic)
+ entry->ecx &= ~bit(X86_FEATURE_X2APIC);
+ break;
+ case 0x80000001:
+ if (nested)
+ entry->ecx |= (1 << 2); /* Set SVM bit */
+ break;
+ case 0x8000000A:
+ entry->eax = 1; /* SVM revision 1 */
+ entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
+ ASID emulation to nested SVM */
+ entry->ecx = 0; /* Reserved */
+ entry->edx = 0; /* Per default do not support any
+ additional features */
+
+ /* Support next_rip if host supports it */
+ if (boot_cpu_has(X86_FEATURE_NRIPS))
+ entry->edx |= SVM_FEATURE_NRIP;
+
+ /* Support NPT for the guest if enabled */
+ if (npt_enabled)
+ entry->edx |= SVM_FEATURE_NPT;
+
+ break;
+ case 0x8000001F:
+ /* Support memory encryption cpuid if host supports it */
+ if (boot_cpu_has(X86_FEATURE_SEV))
+ cpuid(0x8000001f, &entry->eax, &entry->ebx,
+ &entry->ecx, &entry->edx);
+
+ }
+}
+
+static int svm_get_lpage_level(void)
+{
+ return PT_PDPE_LEVEL;
+}
+
+static bool svm_rdtscp_supported(void)
+{
+ return boot_cpu_has(X86_FEATURE_RDTSCP);
+}
+
+static bool svm_invpcid_supported(void)
+{
+ return false;
+}
+
+static bool svm_mpx_supported(void)
+{
+ return false;
+}
+
+static bool svm_xsaves_supported(void)
+{
+ return false;
+}
+
+static bool svm_umip_emulated(void)
+{
+ return false;
+}
+
+static bool svm_has_wbinvd_exit(void)
+{
+ return true;
+}
+
+#define PRE_EX(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_PRE_EXCEPT, }
+#define POST_EX(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_POST_EXCEPT, }
+#define POST_MEM(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_POST_MEMACCESS, }
+
+static const struct __x86_intercept {
+ u32 exit_code;
+ enum x86_intercept_stage stage;
+} x86_intercept_map[] = {
+ [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0),
+ [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0),
+ [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0),
+ [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0),
+ [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0),
+ [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ),
+ [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ),
+ [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE),
+ [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE),
+ [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ),
+ [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ),
+ [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE),
+ [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE),
+ [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN),
+ [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL),
+ [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD),
+ [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE),
+ [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI),
+ [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI),
+ [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT),
+ [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA),
+ [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP),
+ [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR),
+ [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT),
+ [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG),
+ [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD),
+ [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD),
+ [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR),
+ [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC),
+ [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR),
+ [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC),
+ [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID),
+ [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM),
+ [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE),
+ [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF),
+ [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF),
+ [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT),
+ [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET),
+ [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP),
+ [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT),
+ [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
+};
+
+#undef PRE_EX
+#undef POST_EX
+#undef POST_MEM
+
+static int svm_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int vmexit, ret = X86EMUL_CONTINUE;
+ struct __x86_intercept icpt_info;
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
+ goto out;
+
+ icpt_info = x86_intercept_map[info->intercept];
+
+ if (stage != icpt_info.stage)
+ goto out;
+
+ switch (icpt_info.exit_code) {
+ case SVM_EXIT_READ_CR0:
+ if (info->intercept == x86_intercept_cr_read)
+ icpt_info.exit_code += info->modrm_reg;
+ break;
+ case SVM_EXIT_WRITE_CR0: {
+ unsigned long cr0, val;
+ u64 intercept;
+
+ if (info->intercept == x86_intercept_cr_write)
+ icpt_info.exit_code += info->modrm_reg;
+
+ if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
+ info->intercept == x86_intercept_clts)
+ break;
+
+ intercept = svm->nested.intercept;
+
+ if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
+ break;
+
+ cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
+ val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
+
+ if (info->intercept == x86_intercept_lmsw) {
+ cr0 &= 0xfUL;
+ val &= 0xfUL;
+ /* lmsw can't clear PE - catch this here */
+ if (cr0 & X86_CR0_PE)
+ val |= X86_CR0_PE;
+ }
+
+ if (cr0 ^ val)
+ icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+
+ break;
+ }
+ case SVM_EXIT_READ_DR0:
+ case SVM_EXIT_WRITE_DR0:
+ icpt_info.exit_code += info->modrm_reg;
+ break;
+ case SVM_EXIT_MSR:
+ if (info->intercept == x86_intercept_wrmsr)
+ vmcb->control.exit_info_1 = 1;
+ else
+ vmcb->control.exit_info_1 = 0;
+ break;
+ case SVM_EXIT_PAUSE:
+ /*
+ * We get this for NOP only, but pause
+ * is rep not, check this here
+ */
+ if (info->rep_prefix != REPE_PREFIX)
+ goto out;
+ break;
+ case SVM_EXIT_IOIO: {
+ u64 exit_info;
+ u32 bytes;
+
+ if (info->intercept == x86_intercept_in ||
+ info->intercept == x86_intercept_ins) {
+ exit_info = ((info->src_val & 0xffff) << 16) |
+ SVM_IOIO_TYPE_MASK;
+ bytes = info->dst_bytes;
+ } else {
+ exit_info = (info->dst_val & 0xffff) << 16;
+ bytes = info->src_bytes;
+ }
+
+ if (info->intercept == x86_intercept_outs ||
+ info->intercept == x86_intercept_ins)
+ exit_info |= SVM_IOIO_STR_MASK;
+
+ if (info->rep_prefix)
+ exit_info |= SVM_IOIO_REP_MASK;
+
+ bytes = min(bytes, 4u);
+
+ exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
+
+ exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
+
+ vmcb->control.exit_info_1 = exit_info;
+ vmcb->control.exit_info_2 = info->next_rip;
+
+ break;
+ }
+ default:
+ break;
+ }
+
+ /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
+ if (static_cpu_has(X86_FEATURE_NRIPS))
+ vmcb->control.next_rip = info->next_rip;
+ vmcb->control.exit_code = icpt_info.exit_code;
+ vmexit = nested_svm_exit_handled(svm);
+
+ ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
+ : X86EMUL_CONTINUE;
+
+out:
+ return ret;
+}
+
+static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
+{
+ local_irq_enable();
+ /*
+ * We must have an instruction with interrupts enabled, so
+ * the timer interrupt isn't delayed by the interrupt shadow.
+ */
+ asm("nop");
+ local_irq_disable();
+}
+
+static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+ if (pause_filter_thresh)
+ shrink_ple_window(vcpu);
+}
+
+static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
+{
+ if (avic_handle_apic_id_update(vcpu) != 0)
+ return;
+ if (avic_handle_dfr_update(vcpu) != 0)
+ return;
+ avic_handle_ldr_update(vcpu);
+}
+
+static void svm_setup_mce(struct kvm_vcpu *vcpu)
+{
+ /* [63:9] are reserved. */
+ vcpu->arch.mcg_cap &= 0x1ff;
+}
+
+static int svm_smi_allowed(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /* Per APM Vol.2 15.22.2 "Response to SMI" */
+ if (!gif_set(svm))
+ return 0;
+
+ if (is_guest_mode(&svm->vcpu) &&
+ svm->nested.intercept & (1ULL << INTERCEPT_SMI)) {
+ /* TODO: Might need to set exit_info_1 and exit_info_2 here */
+ svm->vmcb->control.exit_code = SVM_EXIT_SMI;
+ svm->nested.exit_required = true;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ if (is_guest_mode(vcpu)) {
+ /* FED8h - SVM Guest */
+ put_smstate(u64, smstate, 0x7ed8, 1);
+ /* FEE0h - SVM Guest VMCB Physical Address */
+ put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb);
+
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
+ ret = nested_svm_vmexit(svm);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *nested_vmcb;
+ struct page *page;
+ struct {
+ u64 guest;
+ u64 vmcb;
+ } svm_state_save;
+ int ret;
+
+ ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfed8, &svm_state_save,
+ sizeof(svm_state_save));
+ if (ret)
+ return ret;
+
+ if (svm_state_save.guest) {
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
+ nested_vmcb = nested_svm_map(svm, svm_state_save.vmcb, &page);
+ if (nested_vmcb)
+ enter_svm_guest_mode(svm, svm_state_save.vmcb, nested_vmcb, page);
+ else
+ ret = 1;
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ }
+ return ret;
+}
+
+static int enable_smi_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!gif_set(svm)) {
+ if (vgif_enabled(svm))
+ set_intercept(svm, INTERCEPT_STGI);
+ /* STGI will cause a vm exit */
+ return 1;
+ }
+ return 0;
+}
+
+static int sev_asid_new(void)
+{
+ int pos;
+
+ /*
+ * SEV-enabled guest must use asid from min_sev_asid to max_sev_asid.
+ */
+ pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1);
+ if (pos >= max_sev_asid)
+ return -EBUSY;
+
+ set_bit(pos, sev_asid_bitmap);
+ return pos + 1;
+}
+
+static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ int asid, ret;
+
+ ret = -EBUSY;
+ if (unlikely(sev->active))
+ return ret;
+
+ asid = sev_asid_new();
+ if (asid < 0)
+ return ret;
+
+ ret = sev_platform_init(&argp->error);
+ if (ret)
+ goto e_free;
+
+ sev->active = true;
+ sev->asid = asid;
+ INIT_LIST_HEAD(&sev->regions_list);
+
+ return 0;
+
+e_free:
+ __sev_asid_free(asid);
+ return ret;
+}
+
+static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
+{
+ struct sev_data_activate *data;
+ int asid = sev_get_asid(kvm);
+ int ret;
+
+ wbinvd_on_all_cpus();
+
+ ret = sev_guest_df_flush(error);
+ if (ret)
+ return ret;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ /* activate ASID on the given handle */
+ data->handle = handle;
+ data->asid = asid;
+ ret = sev_guest_activate(data, error);
+ kfree(data);
+
+ return ret;
+}
+
+static int __sev_issue_cmd(int fd, int id, void *data, int *error)
+{
+ struct fd f;
+ int ret;
+
+ f = fdget(fd);
+ if (!f.file)
+ return -EBADF;
+
+ ret = sev_issue_cmd_external_user(f.file, id, data, error);
+
+ fdput(f);
+ return ret;
+}
+
+static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return __sev_issue_cmd(sev->fd, id, data, error);
+}
+
+static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_start *start;
+ struct kvm_sev_launch_start params;
+ void *dh_blob, *session_blob;
+ int *error = &argp->error;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ start = kzalloc(sizeof(*start), GFP_KERNEL);
+ if (!start)
+ return -ENOMEM;
+
+ dh_blob = NULL;
+ if (params.dh_uaddr) {
+ dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
+ if (IS_ERR(dh_blob)) {
+ ret = PTR_ERR(dh_blob);
+ goto e_free;
+ }
+
+ start->dh_cert_address = __sme_set(__pa(dh_blob));
+ start->dh_cert_len = params.dh_len;
+ }
+
+ session_blob = NULL;
+ if (params.session_uaddr) {
+ session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len);
+ if (IS_ERR(session_blob)) {
+ ret = PTR_ERR(session_blob);
+ goto e_free_dh;
+ }
+
+ start->session_address = __sme_set(__pa(session_blob));
+ start->session_len = params.session_len;
+ }
+
+ start->handle = params.handle;
+ start->policy = params.policy;
+
+ /* create memory encryption context */
+ ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
+ if (ret)
+ goto e_free_session;
+
+ /* Bind ASID to this guest */
+ ret = sev_bind_asid(kvm, start->handle, error);
+ if (ret) {
+ sev_decommission(start->handle);
+ goto e_free_session;
+ }
+
+ /* return handle to userspace */
+ params.handle = start->handle;
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
+ sev_unbind_asid(kvm, start->handle);
+ ret = -EFAULT;
+ goto e_free_session;
+ }
+
+ sev->handle = start->handle;
+ sev->fd = argp->sev_fd;
+
+e_free_session:
+ kfree(session_blob);
+e_free_dh:
+ kfree(dh_blob);
+e_free:
+ kfree(start);
+ return ret;
+}
+
+static unsigned long get_num_contig_pages(unsigned long idx,
+ struct page **inpages, unsigned long npages)
+{
+ unsigned long paddr, next_paddr;
+ unsigned long i = idx + 1, pages = 1;
+
+ /* find the number of contiguous pages starting from idx */
+ paddr = __sme_page_pa(inpages[idx]);
+ while (i < npages) {
+ next_paddr = __sme_page_pa(inpages[i++]);
+ if ((paddr + PAGE_SIZE) == next_paddr) {
+ pages++;
+ paddr = next_paddr;
+ continue;
+ }
+ break;
+ }
+
+ return pages;
+}
+
+static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_launch_update_data params;
+ struct sev_data_launch_update_data *data;
+ struct page **inpages;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ vaddr = params.uaddr;
+ size = params.len;
+ vaddr_end = vaddr + size;
+
+ /* Lock the user memory. */
+ inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
+ if (!inpages) {
+ ret = -ENOMEM;
+ goto e_free;
+ }
+
+ /*
+ * The LAUNCH_UPDATE command will perform in-place encryption of the
+ * memory content (i.e it will write the same memory region with C=1).
+ * It's possible that the cache may contain the data with C=0, i.e.,
+ * unencrypted so invalidate it first.
+ */
+ sev_clflush_pages(inpages, npages);
+
+ for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
+ int offset, len;
+
+ /*
+ * If the user buffer is not page-aligned, calculate the offset
+ * within the page.
+ */
+ offset = vaddr & (PAGE_SIZE - 1);
+
+ /* Calculate the number of pages that can be encrypted in one go. */
+ pages = get_num_contig_pages(i, inpages, npages);
+
+ len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
+
+ data->handle = sev->handle;
+ data->len = len;
+ data->address = __sme_page_pa(inpages[i]) + offset;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error);
+ if (ret)
+ goto e_unpin;
+
+ size -= len;
+ next_vaddr = vaddr + len;
+ }
+
+e_unpin:
+ /* content of memory is updated, mark pages dirty */
+ for (i = 0; i < npages; i++) {
+ set_page_dirty_lock(inpages[i]);
+ mark_page_accessed(inpages[i]);
+ }
+ /* unlock the user pages */
+ sev_unpin_memory(kvm, inpages, npages);
+e_free:
+ kfree(data);
+ return ret;
+}
+
+static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ void __user *measure = (void __user *)(uintptr_t)argp->data;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_measure *data;
+ struct kvm_sev_launch_measure params;
+ void __user *p = NULL;
+ void *blob = NULL;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, measure, sizeof(params)))
+ return -EFAULT;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ /* User wants to query the blob length */
+ if (!params.len)
+ goto cmd;
+
+ p = (void __user *)(uintptr_t)params.uaddr;
+ if (p) {
+ if (params.len > SEV_FW_BLOB_MAX_SIZE) {
+ ret = -EINVAL;
+ goto e_free;
+ }
+
+ ret = -ENOMEM;
+ blob = kmalloc(params.len, GFP_KERNEL);
+ if (!blob)
+ goto e_free;
+
+ data->address = __psp_pa(blob);
+ data->len = params.len;
+ }
+
+cmd:
+ data->handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error);
+
+ /*
+ * If we query the session length, FW responded with expected data.
+ */
+ if (!params.len)
+ goto done;
+
+ if (ret)
+ goto e_free_blob;
+
+ if (blob) {
+ if (copy_to_user(p, blob, params.len))
+ ret = -EFAULT;
+ }
+
+done:
+ params.len = data->len;
+ if (copy_to_user(measure, &params, sizeof(params)))
+ ret = -EFAULT;
+e_free_blob:
+ kfree(blob);
+e_free:
+ kfree(data);
+ return ret;
+}
+
+static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_finish *data;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error);
+
+ kfree(data);
+ return ret;
+}
+
+static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_guest_status params;
+ struct sev_data_guest_status *data;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error);
+ if (ret)
+ goto e_free;
+
+ params.policy = data->policy;
+ params.state = data->state;
+ params.handle = data->handle;
+
+ if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
+ ret = -EFAULT;
+e_free:
+ kfree(data);
+ return ret;
+}
+
+static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
+ unsigned long dst, int size,
+ int *error, bool enc)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_dbg *data;
+ int ret;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->handle = sev->handle;
+ data->dst_addr = dst;
+ data->src_addr = src;
+ data->len = size;
+
+ ret = sev_issue_cmd(kvm,
+ enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
+ data, error);
+ kfree(data);
+ return ret;
+}
+
+static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
+ unsigned long dst_paddr, int sz, int *err)
+{
+ int offset;
+
+ /*
+ * Its safe to read more than we are asked, caller should ensure that
+ * destination has enough space.
+ */
+ src_paddr = round_down(src_paddr, 16);
+ offset = src_paddr & 15;
+ sz = round_up(sz + offset, 16);
+
+ return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
+}
+
+static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
+ unsigned long __user dst_uaddr,
+ unsigned long dst_paddr,
+ int size, int *err)
+{
+ struct page *tpage = NULL;
+ int ret, offset;
+
+ /* if inputs are not 16-byte then use intermediate buffer */
+ if (!IS_ALIGNED(dst_paddr, 16) ||
+ !IS_ALIGNED(paddr, 16) ||
+ !IS_ALIGNED(size, 16)) {
+ tpage = (void *)alloc_page(GFP_KERNEL);
+ if (!tpage)
+ return -ENOMEM;
+
+ dst_paddr = __sme_page_pa(tpage);
+ }
+
+ ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
+ if (ret)
+ goto e_free;
+
+ if (tpage) {
+ offset = paddr & 15;
+ if (copy_to_user((void __user *)(uintptr_t)dst_uaddr,
+ page_address(tpage) + offset, size))
+ ret = -EFAULT;
+ }
+
+e_free:
+ if (tpage)
+ __free_page(tpage);
+
+ return ret;
+}
+
+static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
+ unsigned long __user vaddr,
+ unsigned long dst_paddr,
+ unsigned long __user dst_vaddr,
+ int size, int *error)
+{
+ struct page *src_tpage = NULL;
+ struct page *dst_tpage = NULL;
+ int ret, len = size;
+
+ /* If source buffer is not aligned then use an intermediate buffer */
+ if (!IS_ALIGNED(vaddr, 16)) {
+ src_tpage = alloc_page(GFP_KERNEL);
+ if (!src_tpage)
+ return -ENOMEM;
+
+ if (copy_from_user(page_address(src_tpage),
+ (void __user *)(uintptr_t)vaddr, size)) {
+ __free_page(src_tpage);
+ return -EFAULT;
+ }
+
+ paddr = __sme_page_pa(src_tpage);
+ }
+
+ /*
+ * If destination buffer or length is not aligned then do read-modify-write:
+ * - decrypt destination in an intermediate buffer
+ * - copy the source buffer in an intermediate buffer
+ * - use the intermediate buffer as source buffer
+ */
+ if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
+ int dst_offset;
+
+ dst_tpage = alloc_page(GFP_KERNEL);
+ if (!dst_tpage) {
+ ret = -ENOMEM;
+ goto e_free;
+ }
+
+ ret = __sev_dbg_decrypt(kvm, dst_paddr,
+ __sme_page_pa(dst_tpage), size, error);
+ if (ret)
+ goto e_free;
+
+ /*
+ * If source is kernel buffer then use memcpy() otherwise
+ * copy_from_user().
+ */
+ dst_offset = dst_paddr & 15;
+
+ if (src_tpage)
+ memcpy(page_address(dst_tpage) + dst_offset,
+ page_address(src_tpage), size);
+ else {
+ if (copy_from_user(page_address(dst_tpage) + dst_offset,
+ (void __user *)(uintptr_t)vaddr, size)) {
+ ret = -EFAULT;
+ goto e_free;
+ }
+ }
+
+ paddr = __sme_page_pa(dst_tpage);
+ dst_paddr = round_down(dst_paddr, 16);
+ len = round_up(size, 16);
+ }
+
+ ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
+
+e_free:
+ if (src_tpage)
+ __free_page(src_tpage);
+ if (dst_tpage)
+ __free_page(dst_tpage);
+ return ret;
+}
+
+static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
+{
+ unsigned long vaddr, vaddr_end, next_vaddr;
+ unsigned long dst_vaddr;
+ struct page **src_p, **dst_p;
+ struct kvm_sev_dbg debug;
+ unsigned long n;
+ unsigned int size;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
+ return -EFAULT;
+
+ if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
+ return -EINVAL;
+ if (!debug.dst_uaddr)
+ return -EINVAL;
+
+ vaddr = debug.src_uaddr;
+ size = debug.len;
+ vaddr_end = vaddr + size;
+ dst_vaddr = debug.dst_uaddr;
+
+ for (; vaddr < vaddr_end; vaddr = next_vaddr) {
+ int len, s_off, d_off;
+
+ /* lock userspace source and destination page */
+ src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
+ if (!src_p)
+ return -EFAULT;
+
+ dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1);
+ if (!dst_p) {
+ sev_unpin_memory(kvm, src_p, n);
+ return -EFAULT;
+ }
+
+ /*
+ * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the
+ * memory content (i.e it will write the same memory region with C=1).
+ * It's possible that the cache may contain the data with C=0, i.e.,
+ * unencrypted so invalidate it first.
+ */
+ sev_clflush_pages(src_p, 1);
+ sev_clflush_pages(dst_p, 1);
+
+ /*
+ * Since user buffer may not be page aligned, calculate the
+ * offset within the page.
+ */
+ s_off = vaddr & ~PAGE_MASK;
+ d_off = dst_vaddr & ~PAGE_MASK;
+ len = min_t(size_t, (PAGE_SIZE - s_off), size);
+
+ if (dec)
+ ret = __sev_dbg_decrypt_user(kvm,
+ __sme_page_pa(src_p[0]) + s_off,
+ dst_vaddr,
+ __sme_page_pa(dst_p[0]) + d_off,
+ len, &argp->error);
+ else
+ ret = __sev_dbg_encrypt_user(kvm,
+ __sme_page_pa(src_p[0]) + s_off,
+ vaddr,
+ __sme_page_pa(dst_p[0]) + d_off,
+ dst_vaddr,
+ len, &argp->error);
+
+ sev_unpin_memory(kvm, src_p, n);
+ sev_unpin_memory(kvm, dst_p, n);
+
+ if (ret)
+ goto err;
+
+ next_vaddr = vaddr + len;
+ dst_vaddr = dst_vaddr + len;
+ size -= len;
+ }
+err:
+ return ret;
+}
+
+static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_launch_secret *data;
+ struct kvm_sev_launch_secret params;
+ struct page **pages;
+ void *blob, *hdr;
+ unsigned long n;
+ int ret, offset;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
+ if (!pages)
+ return -ENOMEM;
+
+ /*
+ * The secret must be copied into contiguous memory region, lets verify
+ * that userspace memory pages are contiguous before we issue command.
+ */
+ if (get_num_contig_pages(0, pages, n) != n) {
+ ret = -EINVAL;
+ goto e_unpin_memory;
+ }
+
+ ret = -ENOMEM;
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ goto e_unpin_memory;
+
+ offset = params.guest_uaddr & (PAGE_SIZE - 1);
+ data->guest_address = __sme_page_pa(pages[0]) + offset;
+ data->guest_len = params.guest_len;
+
+ blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+ if (IS_ERR(blob)) {
+ ret = PTR_ERR(blob);
+ goto e_free;
+ }
+
+ data->trans_address = __psp_pa(blob);
+ data->trans_len = params.trans_len;
+
+ hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+ if (IS_ERR(hdr)) {
+ ret = PTR_ERR(hdr);
+ goto e_free_blob;
+ }
+ data->hdr_address = __psp_pa(hdr);
+ data->hdr_len = params.hdr_len;
+
+ data->handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
+
+ kfree(hdr);
+
+e_free_blob:
+ kfree(blob);
+e_free:
+ kfree(data);
+e_unpin_memory:
+ sev_unpin_memory(kvm, pages, n);
+ return ret;
+}
+
+static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_sev_cmd sev_cmd;
+ int r;
+
+ if (!svm_sev_enabled())
+ return -ENOTTY;
+
+ if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
+ return -EFAULT;
+
+ mutex_lock(&kvm->lock);
+
+ switch (sev_cmd.id) {
+ case KVM_SEV_INIT:
+ r = sev_guest_init(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_START:
+ r = sev_launch_start(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_UPDATE_DATA:
+ r = sev_launch_update_data(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_MEASURE:
+ r = sev_launch_measure(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_LAUNCH_FINISH:
+ r = sev_launch_finish(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_GUEST_STATUS:
+ r = sev_guest_status(kvm, &sev_cmd);
+ break;
+ case KVM_SEV_DBG_DECRYPT:
+ r = sev_dbg_crypt(kvm, &sev_cmd, true);
+ break;
+ case KVM_SEV_DBG_ENCRYPT:
+ r = sev_dbg_crypt(kvm, &sev_cmd, false);
+ break;
+ case KVM_SEV_LAUNCH_SECRET:
+ r = sev_launch_secret(kvm, &sev_cmd);
+ break;
+ default:
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
+ r = -EFAULT;
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+static int svm_register_enc_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct enc_region *region;
+ int ret = 0;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
+ return -EINVAL;
+
+ region = kzalloc(sizeof(*region), GFP_KERNEL);
+ if (!region)
+ return -ENOMEM;
+
+ mutex_lock(&kvm->lock);
+ region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
+ if (!region->pages) {
+ ret = -ENOMEM;
+ mutex_unlock(&kvm->lock);
+ goto e_free;
+ }
+
+ region->uaddr = range->addr;
+ region->size = range->size;
+
+ list_add_tail(&region->list, &sev->regions_list);
+ mutex_unlock(&kvm->lock);
+
+ /*
+ * The guest may change the memory encryption attribute from C=0 -> C=1
+ * or vice versa for this memory range. Lets make sure caches are
+ * flushed to ensure that guest data gets written into memory with
+ * correct C-bit.
+ */
+ sev_clflush_pages(region->pages, region->npages);
+
+ return ret;
+
+e_free:
+ kfree(region);
+ return ret;
+}
+
+static struct enc_region *
+find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct list_head *head = &sev->regions_list;
+ struct enc_region *i;
+
+ list_for_each_entry(i, head, list) {
+ if (i->uaddr == range->addr &&
+ i->size == range->size)
+ return i;
+ }
+
+ return NULL;
+}
+
+
+static int svm_unregister_enc_region(struct kvm *kvm,
+ struct kvm_enc_region *range)
+{
+ struct enc_region *region;
+ int ret;
+
+ mutex_lock(&kvm->lock);
+
+ if (!sev_guest(kvm)) {
+ ret = -ENOTTY;
+ goto failed;
+ }
+
+ region = find_enc_region(kvm, range);
+ if (!region) {
+ ret = -EINVAL;
+ goto failed;
+ }
+
+ __unregister_enc_region_locked(kvm, region);
+
+ mutex_unlock(&kvm->lock);
+ return 0;
+
+failed:
+ mutex_unlock(&kvm->lock);
+ return ret;
+}
+
+static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
+ .cpu_has_kvm_support = has_svm,
+ .disabled_by_bios = is_disabled,
+ .hardware_setup = svm_hardware_setup,
+ .hardware_unsetup = svm_hardware_unsetup,
+ .check_processor_compatibility = svm_check_processor_compat,
+ .hardware_enable = svm_hardware_enable,
+ .hardware_disable = svm_hardware_disable,
+ .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
+ .has_emulated_msr = svm_has_emulated_msr,
+
+ .vcpu_create = svm_create_vcpu,
+ .vcpu_free = svm_free_vcpu,
+ .vcpu_reset = svm_vcpu_reset,
+
+ .vm_alloc = svm_vm_alloc,
+ .vm_free = svm_vm_free,
+ .vm_init = avic_vm_init,
+ .vm_destroy = svm_vm_destroy,
+
+ .prepare_guest_switch = svm_prepare_guest_switch,
+ .vcpu_load = svm_vcpu_load,
+ .vcpu_put = svm_vcpu_put,
+ .vcpu_blocking = svm_vcpu_blocking,
+ .vcpu_unblocking = svm_vcpu_unblocking,
+
+ .update_bp_intercept = update_bp_intercept,
+ .get_msr_feature = svm_get_msr_feature,
+ .get_msr = svm_get_msr,
+ .set_msr = svm_set_msr,
+ .get_segment_base = svm_get_segment_base,
+ .get_segment = svm_get_segment,
+ .set_segment = svm_set_segment,
+ .get_cpl = svm_get_cpl,
+ .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
+ .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
+ .decache_cr3 = svm_decache_cr3,
+ .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
+ .set_cr0 = svm_set_cr0,
+ .set_cr3 = svm_set_cr3,
+ .set_cr4 = svm_set_cr4,
+ .set_efer = svm_set_efer,
+ .get_idt = svm_get_idt,
+ .set_idt = svm_set_idt,
+ .get_gdt = svm_get_gdt,
+ .set_gdt = svm_set_gdt,
+ .get_dr6 = svm_get_dr6,
+ .set_dr6 = svm_set_dr6,
+ .set_dr7 = svm_set_dr7,
+ .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
+ .cache_reg = svm_cache_reg,
+ .get_rflags = svm_get_rflags,
+ .set_rflags = svm_set_rflags,
+
+ .tlb_flush = svm_flush_tlb,
+ .tlb_flush_gva = svm_flush_tlb_gva,
+
+ .run = svm_vcpu_run,
+ .handle_exit = handle_exit,
+ .skip_emulated_instruction = skip_emulated_instruction,
+ .set_interrupt_shadow = svm_set_interrupt_shadow,
+ .get_interrupt_shadow = svm_get_interrupt_shadow,
+ .patch_hypercall = svm_patch_hypercall,
+ .set_irq = svm_set_irq,
+ .set_nmi = svm_inject_nmi,
+ .queue_exception = svm_queue_exception,
+ .cancel_injection = svm_cancel_injection,
+ .interrupt_allowed = svm_interrupt_allowed,
+ .nmi_allowed = svm_nmi_allowed,
+ .get_nmi_mask = svm_get_nmi_mask,
+ .set_nmi_mask = svm_set_nmi_mask,
+ .enable_nmi_window = enable_nmi_window,
+ .enable_irq_window = enable_irq_window,
+ .update_cr8_intercept = update_cr8_intercept,
+ .set_virtual_apic_mode = svm_set_virtual_apic_mode,
+ .get_enable_apicv = svm_get_enable_apicv,
+ .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
+ .load_eoi_exitmap = svm_load_eoi_exitmap,
+ .hwapic_irr_update = svm_hwapic_irr_update,
+ .hwapic_isr_update = svm_hwapic_isr_update,
+ .sync_pir_to_irr = kvm_lapic_find_highest_irr,
+ .apicv_post_state_restore = avic_post_state_restore,
+
+ .set_tss_addr = svm_set_tss_addr,
+ .set_identity_map_addr = svm_set_identity_map_addr,
+ .get_tdp_level = get_npt_level,
+ .get_mt_mask = svm_get_mt_mask,
+
+ .get_exit_info = svm_get_exit_info,
+
+ .get_lpage_level = svm_get_lpage_level,
+
+ .cpuid_update = svm_cpuid_update,
+
+ .rdtscp_supported = svm_rdtscp_supported,
+ .invpcid_supported = svm_invpcid_supported,
+ .mpx_supported = svm_mpx_supported,
+ .xsaves_supported = svm_xsaves_supported,
+ .umip_emulated = svm_umip_emulated,
+
+ .set_supported_cpuid = svm_set_supported_cpuid,
+
+ .has_wbinvd_exit = svm_has_wbinvd_exit,
+
+ .read_l1_tsc_offset = svm_read_l1_tsc_offset,
+ .write_l1_tsc_offset = svm_write_l1_tsc_offset,
+
+ .set_tdp_cr3 = set_tdp_cr3,
+
+ .check_intercept = svm_check_intercept,
+ .handle_external_intr = svm_handle_external_intr,
+
+ .request_immediate_exit = __kvm_request_immediate_exit,
+
+ .sched_in = svm_sched_in,
+
+ .pmu_ops = &amd_pmu_ops,
+ .deliver_posted_interrupt = svm_deliver_avic_intr,
+ .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
+ .update_pi_irte = svm_update_pi_irte,
+ .setup_mce = svm_setup_mce,
+
+ .smi_allowed = svm_smi_allowed,
+ .pre_enter_smm = svm_pre_enter_smm,
+ .pre_leave_smm = svm_pre_leave_smm,
+ .enable_smi_window = enable_smi_window,
+
+ .mem_enc_op = svm_mem_enc_op,
+ .mem_enc_reg_region = svm_register_enc_region,
+ .mem_enc_unreg_region = svm_unregister_enc_region,
+};
+
+static int __init svm_init(void)
+{
+ return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
+ __alignof__(struct vcpu_svm), THIS_MODULE);
+}
+
+static void __exit svm_exit(void)
+{
+ kvm_exit();
+}
+
+module_init(svm_init)
+module_exit(svm_exit)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
new file mode 100644
index 000000000..b3f219b7c
--- /dev/null
+++ b/arch/x86/kvm/trace.h
@@ -0,0 +1,1429 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_H
+
+#include <linux/tracepoint.h>
+#include <asm/vmx.h>
+#include <asm/svm.h>
+#include <asm/clocksource.h>
+#include <asm/pvclock-abi.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+
+/*
+ * Tracepoint for guest mode entry.
+ */
+TRACE_EVENT(kvm_entry,
+ TP_PROTO(unsigned int vcpu_id),
+ TP_ARGS(vcpu_id),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ ),
+
+ TP_printk("vcpu %u", __entry->vcpu_id)
+);
+
+/*
+ * Tracepoint for hypercall.
+ */
+TRACE_EVENT(kvm_hypercall,
+ TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3),
+ TP_ARGS(nr, a0, a1, a2, a3),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, nr )
+ __field( unsigned long, a0 )
+ __field( unsigned long, a1 )
+ __field( unsigned long, a2 )
+ __field( unsigned long, a3 )
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->a0 = a0;
+ __entry->a1 = a1;
+ __entry->a2 = a2;
+ __entry->a3 = a3;
+ ),
+
+ TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx",
+ __entry->nr, __entry->a0, __entry->a1, __entry->a2,
+ __entry->a3)
+);
+
+/*
+ * Tracepoint for hypercall.
+ */
+TRACE_EVENT(kvm_hv_hypercall,
+ TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx,
+ __u64 ingpa, __u64 outgpa),
+ TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
+
+ TP_STRUCT__entry(
+ __field( __u16, rep_cnt )
+ __field( __u16, rep_idx )
+ __field( __u64, ingpa )
+ __field( __u64, outgpa )
+ __field( __u16, code )
+ __field( bool, fast )
+ ),
+
+ TP_fast_assign(
+ __entry->rep_cnt = rep_cnt;
+ __entry->rep_idx = rep_idx;
+ __entry->ingpa = ingpa;
+ __entry->outgpa = outgpa;
+ __entry->code = code;
+ __entry->fast = fast;
+ ),
+
+ TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
+ __entry->code, __entry->fast ? "fast" : "slow",
+ __entry->rep_cnt, __entry->rep_idx, __entry->ingpa,
+ __entry->outgpa)
+);
+
+/*
+ * Tracepoint for PIO.
+ */
+
+#define KVM_PIO_IN 0
+#define KVM_PIO_OUT 1
+
+TRACE_EVENT(kvm_pio,
+ TP_PROTO(unsigned int rw, unsigned int port, unsigned int size,
+ unsigned int count, void *data),
+ TP_ARGS(rw, port, size, count, data),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, port )
+ __field( unsigned int, size )
+ __field( unsigned int, count )
+ __field( unsigned int, val )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->port = port;
+ __entry->size = size;
+ __entry->count = count;
+ if (size == 1)
+ __entry->val = *(unsigned char *)data;
+ else if (size == 2)
+ __entry->val = *(unsigned short *)data;
+ else
+ __entry->val = *(unsigned int *)data;
+ ),
+
+ TP_printk("pio_%s at 0x%x size %d count %d val 0x%x %s",
+ __entry->rw ? "write" : "read",
+ __entry->port, __entry->size, __entry->count, __entry->val,
+ __entry->count > 1 ? "(...)" : "")
+);
+
+/*
+ * Tracepoint for fast mmio.
+ */
+TRACE_EVENT(kvm_fast_mmio,
+ TP_PROTO(u64 gpa),
+ TP_ARGS(gpa),
+
+ TP_STRUCT__entry(
+ __field(u64, gpa)
+ ),
+
+ TP_fast_assign(
+ __entry->gpa = gpa;
+ ),
+
+ TP_printk("fast mmio at gpa 0x%llx", __entry->gpa)
+);
+
+/*
+ * Tracepoint for cpuid.
+ */
+TRACE_EVENT(kvm_cpuid,
+ TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx,
+ unsigned long rcx, unsigned long rdx, bool found),
+ TP_ARGS(function, rax, rbx, rcx, rdx, found),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, function )
+ __field( unsigned long, rax )
+ __field( unsigned long, rbx )
+ __field( unsigned long, rcx )
+ __field( unsigned long, rdx )
+ __field( bool, found )
+ ),
+
+ TP_fast_assign(
+ __entry->function = function;
+ __entry->rax = rax;
+ __entry->rbx = rbx;
+ __entry->rcx = rcx;
+ __entry->rdx = rdx;
+ __entry->found = found;
+ ),
+
+ TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx, cpuid entry %s",
+ __entry->function, __entry->rax,
+ __entry->rbx, __entry->rcx, __entry->rdx,
+ __entry->found ? "found" : "not found")
+);
+
+#define AREG(x) { APIC_##x, "APIC_" #x }
+
+#define kvm_trace_symbol_apic \
+ AREG(ID), AREG(LVR), AREG(TASKPRI), AREG(ARBPRI), AREG(PROCPRI), \
+ AREG(EOI), AREG(RRR), AREG(LDR), AREG(DFR), AREG(SPIV), AREG(ISR), \
+ AREG(TMR), AREG(IRR), AREG(ESR), AREG(ICR), AREG(ICR2), AREG(LVTT), \
+ AREG(LVTTHMR), AREG(LVTPC), AREG(LVT0), AREG(LVT1), AREG(LVTERR), \
+ AREG(TMICT), AREG(TMCCT), AREG(TDCR), AREG(SELF_IPI), AREG(EFEAT), \
+ AREG(ECTRL)
+/*
+ * Tracepoint for apic access.
+ */
+TRACE_EVENT(kvm_apic,
+ TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val),
+ TP_ARGS(rw, reg, val),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, reg )
+ __field( unsigned int, val )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->reg = reg;
+ __entry->val = val;
+ ),
+
+ TP_printk("apic_%s %s = 0x%x",
+ __entry->rw ? "write" : "read",
+ __print_symbolic(__entry->reg, kvm_trace_symbol_apic),
+ __entry->val)
+);
+
+#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
+#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
+
+#define KVM_ISA_VMX 1
+#define KVM_ISA_SVM 2
+
+/*
+ * Tracepoint for kvm guest exit:
+ */
+TRACE_EVENT(kvm_exit,
+ TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
+ TP_ARGS(exit_reason, vcpu, isa),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, exit_reason )
+ __field( unsigned long, guest_rip )
+ __field( u32, isa )
+ __field( u64, info1 )
+ __field( u64, info2 )
+ ),
+
+ TP_fast_assign(
+ __entry->exit_reason = exit_reason;
+ __entry->guest_rip = kvm_rip_read(vcpu);
+ __entry->isa = isa;
+ kvm_x86_ops->get_exit_info(vcpu, &__entry->info1,
+ &__entry->info2);
+ ),
+
+ TP_printk("reason %s rip 0x%lx info %llx %llx",
+ (__entry->isa == KVM_ISA_VMX) ?
+ __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) :
+ __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS),
+ __entry->guest_rip, __entry->info1, __entry->info2)
+);
+
+/*
+ * Tracepoint for kvm interrupt injection:
+ */
+TRACE_EVENT(kvm_inj_virq,
+ TP_PROTO(unsigned int irq),
+ TP_ARGS(irq),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, irq )
+ ),
+
+ TP_fast_assign(
+ __entry->irq = irq;
+ ),
+
+ TP_printk("irq %u", __entry->irq)
+);
+
+#define EXS(x) { x##_VECTOR, "#" #x }
+
+#define kvm_trace_sym_exc \
+ EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \
+ EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \
+ EXS(MF), EXS(AC), EXS(MC)
+
+/*
+ * Tracepoint for kvm interrupt injection:
+ */
+TRACE_EVENT(kvm_inj_exception,
+ TP_PROTO(unsigned exception, bool has_error, unsigned error_code),
+ TP_ARGS(exception, has_error, error_code),
+
+ TP_STRUCT__entry(
+ __field( u8, exception )
+ __field( u8, has_error )
+ __field( u32, error_code )
+ ),
+
+ TP_fast_assign(
+ __entry->exception = exception;
+ __entry->has_error = has_error;
+ __entry->error_code = error_code;
+ ),
+
+ TP_printk("%s (0x%x)",
+ __print_symbolic(__entry->exception, kvm_trace_sym_exc),
+ /* FIXME: don't print error_code if not present */
+ __entry->has_error ? __entry->error_code : 0)
+);
+
+/*
+ * Tracepoint for page fault.
+ */
+TRACE_EVENT(kvm_page_fault,
+ TP_PROTO(unsigned long fault_address, unsigned int error_code),
+ TP_ARGS(fault_address, error_code),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, fault_address )
+ __field( unsigned int, error_code )
+ ),
+
+ TP_fast_assign(
+ __entry->fault_address = fault_address;
+ __entry->error_code = error_code;
+ ),
+
+ TP_printk("address %lx error_code %x",
+ __entry->fault_address, __entry->error_code)
+);
+
+/*
+ * Tracepoint for guest MSR access.
+ */
+TRACE_EVENT(kvm_msr,
+ TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception),
+ TP_ARGS(write, ecx, data, exception),
+
+ TP_STRUCT__entry(
+ __field( unsigned, write )
+ __field( u32, ecx )
+ __field( u64, data )
+ __field( u8, exception )
+ ),
+
+ TP_fast_assign(
+ __entry->write = write;
+ __entry->ecx = ecx;
+ __entry->data = data;
+ __entry->exception = exception;
+ ),
+
+ TP_printk("msr_%s %x = 0x%llx%s",
+ __entry->write ? "write" : "read",
+ __entry->ecx, __entry->data,
+ __entry->exception ? " (#GP)" : "")
+);
+
+#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false)
+#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false)
+#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true)
+#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true)
+
+/*
+ * Tracepoint for guest CR access.
+ */
+TRACE_EVENT(kvm_cr,
+ TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val),
+ TP_ARGS(rw, cr, val),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, rw )
+ __field( unsigned int, cr )
+ __field( unsigned long, val )
+ ),
+
+ TP_fast_assign(
+ __entry->rw = rw;
+ __entry->cr = cr;
+ __entry->val = val;
+ ),
+
+ TP_printk("cr_%s %x = 0x%lx",
+ __entry->rw ? "write" : "read",
+ __entry->cr, __entry->val)
+);
+
+#define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val)
+#define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val)
+
+TRACE_EVENT(kvm_pic_set_irq,
+ TP_PROTO(__u8 chip, __u8 pin, __u8 elcr, __u8 imr, bool coalesced),
+ TP_ARGS(chip, pin, elcr, imr, coalesced),
+
+ TP_STRUCT__entry(
+ __field( __u8, chip )
+ __field( __u8, pin )
+ __field( __u8, elcr )
+ __field( __u8, imr )
+ __field( bool, coalesced )
+ ),
+
+ TP_fast_assign(
+ __entry->chip = chip;
+ __entry->pin = pin;
+ __entry->elcr = elcr;
+ __entry->imr = imr;
+ __entry->coalesced = coalesced;
+ ),
+
+ TP_printk("chip %u pin %u (%s%s)%s",
+ __entry->chip, __entry->pin,
+ (__entry->elcr & (1 << __entry->pin)) ? "level":"edge",
+ (__entry->imr & (1 << __entry->pin)) ? "|masked":"",
+ __entry->coalesced ? " (coalesced)" : "")
+);
+
+#define kvm_apic_dst_shorthand \
+ {0x0, "dst"}, \
+ {0x1, "self"}, \
+ {0x2, "all"}, \
+ {0x3, "all-but-self"}
+
+TRACE_EVENT(kvm_apic_ipi,
+ TP_PROTO(__u32 icr_low, __u32 dest_id),
+ TP_ARGS(icr_low, dest_id),
+
+ TP_STRUCT__entry(
+ __field( __u32, icr_low )
+ __field( __u32, dest_id )
+ ),
+
+ TP_fast_assign(
+ __entry->icr_low = icr_low;
+ __entry->dest_id = dest_id;
+ ),
+
+ TP_printk("dst %x vec %u (%s|%s|%s|%s|%s)",
+ __entry->dest_id, (u8)__entry->icr_low,
+ __print_symbolic((__entry->icr_low >> 8 & 0x7),
+ kvm_deliver_mode),
+ (__entry->icr_low & (1<<11)) ? "logical" : "physical",
+ (__entry->icr_low & (1<<14)) ? "assert" : "de-assert",
+ (__entry->icr_low & (1<<15)) ? "level" : "edge",
+ __print_symbolic((__entry->icr_low >> 18 & 0x3),
+ kvm_apic_dst_shorthand))
+);
+
+TRACE_EVENT(kvm_apic_accept_irq,
+ TP_PROTO(__u32 apicid, __u16 dm, __u16 tm, __u8 vec),
+ TP_ARGS(apicid, dm, tm, vec),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( __u16, dm )
+ __field( __u16, tm )
+ __field( __u8, vec )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apicid;
+ __entry->dm = dm;
+ __entry->tm = tm;
+ __entry->vec = vec;
+ ),
+
+ TP_printk("apicid %x vec %u (%s|%s)",
+ __entry->apicid, __entry->vec,
+ __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode),
+ __entry->tm ? "level" : "edge")
+);
+
+TRACE_EVENT(kvm_eoi,
+ TP_PROTO(struct kvm_lapic *apic, int vector),
+ TP_ARGS(apic, vector),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( int, vector )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apic->vcpu->vcpu_id;
+ __entry->vector = vector;
+ ),
+
+ TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
+);
+
+TRACE_EVENT(kvm_pv_eoi,
+ TP_PROTO(struct kvm_lapic *apic, int vector),
+ TP_ARGS(apic, vector),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( int, vector )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apic->vcpu->vcpu_id;
+ __entry->vector = vector;
+ ),
+
+ TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
+);
+
+/*
+ * Tracepoint for nested VMRUN
+ */
+TRACE_EVENT(kvm_nested_vmrun,
+ TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl,
+ __u32 event_inj, bool npt),
+ TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, npt),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u64, vmcb )
+ __field( __u64, nested_rip )
+ __field( __u32, int_ctl )
+ __field( __u32, event_inj )
+ __field( bool, npt )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->vmcb = vmcb;
+ __entry->nested_rip = nested_rip;
+ __entry->int_ctl = int_ctl;
+ __entry->event_inj = event_inj;
+ __entry->npt = npt;
+ ),
+
+ TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
+ "event_inj: 0x%08x npt: %s",
+ __entry->rip, __entry->vmcb, __entry->nested_rip,
+ __entry->int_ctl, __entry->event_inj,
+ __entry->npt ? "on" : "off")
+);
+
+TRACE_EVENT(kvm_nested_intercepts,
+ TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept),
+ TP_ARGS(cr_read, cr_write, exceptions, intercept),
+
+ TP_STRUCT__entry(
+ __field( __u16, cr_read )
+ __field( __u16, cr_write )
+ __field( __u32, exceptions )
+ __field( __u64, intercept )
+ ),
+
+ TP_fast_assign(
+ __entry->cr_read = cr_read;
+ __entry->cr_write = cr_write;
+ __entry->exceptions = exceptions;
+ __entry->intercept = intercept;
+ ),
+
+ TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx",
+ __entry->cr_read, __entry->cr_write, __entry->exceptions,
+ __entry->intercept)
+);
+/*
+ * Tracepoint for #VMEXIT while nested
+ */
+TRACE_EVENT(kvm_nested_vmexit,
+ TP_PROTO(__u64 rip, __u32 exit_code,
+ __u64 exit_info1, __u64 exit_info2,
+ __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
+ TP_ARGS(rip, exit_code, exit_info1, exit_info2,
+ exit_int_info, exit_int_info_err, isa),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u32, exit_code )
+ __field( __u64, exit_info1 )
+ __field( __u64, exit_info2 )
+ __field( __u32, exit_int_info )
+ __field( __u32, exit_int_info_err )
+ __field( __u32, isa )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->exit_code = exit_code;
+ __entry->exit_info1 = exit_info1;
+ __entry->exit_info2 = exit_info2;
+ __entry->exit_int_info = exit_int_info;
+ __entry->exit_int_info_err = exit_int_info_err;
+ __entry->isa = isa;
+ ),
+ TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
+ "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
+ __entry->rip,
+ (__entry->isa == KVM_ISA_VMX) ?
+ __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) :
+ __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS),
+ __entry->exit_info1, __entry->exit_info2,
+ __entry->exit_int_info, __entry->exit_int_info_err)
+);
+
+/*
+ * Tracepoint for #VMEXIT reinjected to the guest
+ */
+TRACE_EVENT(kvm_nested_vmexit_inject,
+ TP_PROTO(__u32 exit_code,
+ __u64 exit_info1, __u64 exit_info2,
+ __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
+ TP_ARGS(exit_code, exit_info1, exit_info2,
+ exit_int_info, exit_int_info_err, isa),
+
+ TP_STRUCT__entry(
+ __field( __u32, exit_code )
+ __field( __u64, exit_info1 )
+ __field( __u64, exit_info2 )
+ __field( __u32, exit_int_info )
+ __field( __u32, exit_int_info_err )
+ __field( __u32, isa )
+ ),
+
+ TP_fast_assign(
+ __entry->exit_code = exit_code;
+ __entry->exit_info1 = exit_info1;
+ __entry->exit_info2 = exit_info2;
+ __entry->exit_int_info = exit_int_info;
+ __entry->exit_int_info_err = exit_int_info_err;
+ __entry->isa = isa;
+ ),
+
+ TP_printk("reason: %s ext_inf1: 0x%016llx "
+ "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
+ (__entry->isa == KVM_ISA_VMX) ?
+ __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) :
+ __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS),
+ __entry->exit_info1, __entry->exit_info2,
+ __entry->exit_int_info, __entry->exit_int_info_err)
+);
+
+/*
+ * Tracepoint for nested #vmexit because of interrupt pending
+ */
+TRACE_EVENT(kvm_nested_intr_vmexit,
+ TP_PROTO(__u64 rip),
+ TP_ARGS(rip),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip
+ ),
+
+ TP_printk("rip: 0x%016llx", __entry->rip)
+);
+
+/*
+ * Tracepoint for nested #vmexit because of interrupt pending
+ */
+TRACE_EVENT(kvm_invlpga,
+ TP_PROTO(__u64 rip, int asid, u64 address),
+ TP_ARGS(rip, asid, address),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( int, asid )
+ __field( __u64, address )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->asid = asid;
+ __entry->address = address;
+ ),
+
+ TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx",
+ __entry->rip, __entry->asid, __entry->address)
+);
+
+/*
+ * Tracepoint for nested #vmexit because of interrupt pending
+ */
+TRACE_EVENT(kvm_skinit,
+ TP_PROTO(__u64 rip, __u32 slb),
+ TP_ARGS(rip, slb),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u32, slb )
+ ),
+
+ TP_fast_assign(
+ __entry->rip = rip;
+ __entry->slb = slb;
+ ),
+
+ TP_printk("rip: 0x%016llx slb: 0x%08x",
+ __entry->rip, __entry->slb)
+);
+
+#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
+#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
+#define KVM_EMUL_INSN_F_CS_D (1 << 2)
+#define KVM_EMUL_INSN_F_CS_L (1 << 3)
+
+#define kvm_trace_symbol_emul_flags \
+ { 0, "real" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \
+ { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_D, "prot32" }, \
+ { KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_L, "prot64" }
+
+#define kei_decode_mode(mode) ({ \
+ u8 flags = 0xff; \
+ switch (mode) { \
+ case X86EMUL_MODE_REAL: \
+ flags = 0; \
+ break; \
+ case X86EMUL_MODE_VM86: \
+ flags = KVM_EMUL_INSN_F_EFL_VM; \
+ break; \
+ case X86EMUL_MODE_PROT16: \
+ flags = KVM_EMUL_INSN_F_CR0_PE; \
+ break; \
+ case X86EMUL_MODE_PROT32: \
+ flags = KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_D; \
+ break; \
+ case X86EMUL_MODE_PROT64: \
+ flags = KVM_EMUL_INSN_F_CR0_PE \
+ | KVM_EMUL_INSN_F_CS_L; \
+ break; \
+ } \
+ flags; \
+ })
+
+TRACE_EVENT(kvm_emulate_insn,
+ TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed),
+ TP_ARGS(vcpu, failed),
+
+ TP_STRUCT__entry(
+ __field( __u64, rip )
+ __field( __u32, csbase )
+ __field( __u8, len )
+ __array( __u8, insn, 15 )
+ __field( __u8, flags )
+ __field( __u8, failed )
+ ),
+
+ TP_fast_assign(
+ __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
+ __entry->len = vcpu->arch.emulate_ctxt.fetch.ptr
+ - vcpu->arch.emulate_ctxt.fetch.data;
+ __entry->rip = vcpu->arch.emulate_ctxt._eip - __entry->len;
+ memcpy(__entry->insn,
+ vcpu->arch.emulate_ctxt.fetch.data,
+ 15);
+ __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
+ __entry->failed = failed;
+ ),
+
+ TP_printk("%x:%llx:%s (%s)%s",
+ __entry->csbase, __entry->rip,
+ __print_hex(__entry->insn, __entry->len),
+ __print_symbolic(__entry->flags,
+ kvm_trace_symbol_emul_flags),
+ __entry->failed ? " failed" : ""
+ )
+ );
+
+#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
+#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
+
+TRACE_EVENT(
+ vcpu_match_mmio,
+ TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match),
+ TP_ARGS(gva, gpa, write, gpa_match),
+
+ TP_STRUCT__entry(
+ __field(gva_t, gva)
+ __field(gpa_t, gpa)
+ __field(bool, write)
+ __field(bool, gpa_match)
+ ),
+
+ TP_fast_assign(
+ __entry->gva = gva;
+ __entry->gpa = gpa;
+ __entry->write = write;
+ __entry->gpa_match = gpa_match
+ ),
+
+ TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa,
+ __entry->write ? "Write" : "Read",
+ __entry->gpa_match ? "GPA" : "GVA")
+);
+
+TRACE_EVENT(kvm_write_tsc_offset,
+ TP_PROTO(unsigned int vcpu_id, __u64 previous_tsc_offset,
+ __u64 next_tsc_offset),
+ TP_ARGS(vcpu_id, previous_tsc_offset, next_tsc_offset),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( __u64, previous_tsc_offset )
+ __field( __u64, next_tsc_offset )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->previous_tsc_offset = previous_tsc_offset;
+ __entry->next_tsc_offset = next_tsc_offset;
+ ),
+
+ TP_printk("vcpu=%u prev=%llu next=%llu", __entry->vcpu_id,
+ __entry->previous_tsc_offset, __entry->next_tsc_offset)
+);
+
+#ifdef CONFIG_X86_64
+
+#define host_clocks \
+ {VCLOCK_NONE, "none"}, \
+ {VCLOCK_TSC, "tsc"} \
+
+TRACE_EVENT(kvm_update_master_clock,
+ TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched),
+ TP_ARGS(use_master_clock, host_clock, offset_matched),
+
+ TP_STRUCT__entry(
+ __field( bool, use_master_clock )
+ __field( unsigned int, host_clock )
+ __field( bool, offset_matched )
+ ),
+
+ TP_fast_assign(
+ __entry->use_master_clock = use_master_clock;
+ __entry->host_clock = host_clock;
+ __entry->offset_matched = offset_matched;
+ ),
+
+ TP_printk("masterclock %d hostclock %s offsetmatched %u",
+ __entry->use_master_clock,
+ __print_symbolic(__entry->host_clock, host_clocks),
+ __entry->offset_matched)
+);
+
+TRACE_EVENT(kvm_track_tsc,
+ TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched,
+ unsigned int online_vcpus, bool use_master_clock,
+ unsigned int host_clock),
+ TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock,
+ host_clock),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( unsigned int, nr_vcpus_matched_tsc )
+ __field( unsigned int, online_vcpus )
+ __field( bool, use_master_clock )
+ __field( unsigned int, host_clock )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->nr_vcpus_matched_tsc = nr_matched;
+ __entry->online_vcpus = online_vcpus;
+ __entry->use_master_clock = use_master_clock;
+ __entry->host_clock = host_clock;
+ ),
+
+ TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u"
+ " hostclock %s",
+ __entry->vcpu_id, __entry->use_master_clock,
+ __entry->nr_vcpus_matched_tsc, __entry->online_vcpus,
+ __print_symbolic(__entry->host_clock, host_clocks))
+);
+
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Tracepoint for PML full VMEXIT.
+ */
+TRACE_EVENT(kvm_pml_full,
+ TP_PROTO(unsigned int vcpu_id),
+ TP_ARGS(vcpu_id),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ ),
+
+ TP_printk("vcpu %d: PML full", __entry->vcpu_id)
+);
+
+TRACE_EVENT(kvm_ple_window,
+ TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old),
+ TP_ARGS(grow, vcpu_id, new, old),
+
+ TP_STRUCT__entry(
+ __field( bool, grow )
+ __field( unsigned int, vcpu_id )
+ __field( int, new )
+ __field( int, old )
+ ),
+
+ TP_fast_assign(
+ __entry->grow = grow;
+ __entry->vcpu_id = vcpu_id;
+ __entry->new = new;
+ __entry->old = old;
+ ),
+
+ TP_printk("vcpu %u: ple_window %d (%s %d)",
+ __entry->vcpu_id,
+ __entry->new,
+ __entry->grow ? "grow" : "shrink",
+ __entry->old)
+);
+
+#define trace_kvm_ple_window_grow(vcpu_id, new, old) \
+ trace_kvm_ple_window(true, vcpu_id, new, old)
+#define trace_kvm_ple_window_shrink(vcpu_id, new, old) \
+ trace_kvm_ple_window(false, vcpu_id, new, old)
+
+TRACE_EVENT(kvm_pvclock_update,
+ TP_PROTO(unsigned int vcpu_id, struct pvclock_vcpu_time_info *pvclock),
+ TP_ARGS(vcpu_id, pvclock),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( __u32, version )
+ __field( __u64, tsc_timestamp )
+ __field( __u64, system_time )
+ __field( __u32, tsc_to_system_mul )
+ __field( __s8, tsc_shift )
+ __field( __u8, flags )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->version = pvclock->version;
+ __entry->tsc_timestamp = pvclock->tsc_timestamp;
+ __entry->system_time = pvclock->system_time;
+ __entry->tsc_to_system_mul = pvclock->tsc_to_system_mul;
+ __entry->tsc_shift = pvclock->tsc_shift;
+ __entry->flags = pvclock->flags;
+ ),
+
+ TP_printk("vcpu_id %u, pvclock { version %u, tsc_timestamp 0x%llx, "
+ "system_time 0x%llx, tsc_to_system_mul 0x%x, tsc_shift %d, "
+ "flags 0x%x }",
+ __entry->vcpu_id,
+ __entry->version,
+ __entry->tsc_timestamp,
+ __entry->system_time,
+ __entry->tsc_to_system_mul,
+ __entry->tsc_shift,
+ __entry->flags)
+);
+
+TRACE_EVENT(kvm_wait_lapic_expire,
+ TP_PROTO(unsigned int vcpu_id, s64 delta),
+ TP_ARGS(vcpu_id, delta),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( s64, delta )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->delta = delta;
+ ),
+
+ TP_printk("vcpu %u: delta %lld (%s)",
+ __entry->vcpu_id,
+ __entry->delta,
+ __entry->delta < 0 ? "early" : "late")
+);
+
+TRACE_EVENT(kvm_enter_smm,
+ TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering),
+ TP_ARGS(vcpu_id, smbase, entering),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( u64, smbase )
+ __field( bool, entering )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->smbase = smbase;
+ __entry->entering = entering;
+ ),
+
+ TP_printk("vcpu %u: %s SMM, smbase 0x%llx",
+ __entry->vcpu_id,
+ __entry->entering ? "entering" : "leaving",
+ __entry->smbase)
+);
+
+/*
+ * Tracepoint for VT-d posted-interrupts.
+ */
+TRACE_EVENT(kvm_pi_irte_update,
+ TP_PROTO(unsigned int host_irq, unsigned int vcpu_id,
+ unsigned int gsi, unsigned int gvec,
+ u64 pi_desc_addr, bool set),
+ TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, host_irq )
+ __field( unsigned int, vcpu_id )
+ __field( unsigned int, gsi )
+ __field( unsigned int, gvec )
+ __field( u64, pi_desc_addr )
+ __field( bool, set )
+ ),
+
+ TP_fast_assign(
+ __entry->host_irq = host_irq;
+ __entry->vcpu_id = vcpu_id;
+ __entry->gsi = gsi;
+ __entry->gvec = gvec;
+ __entry->pi_desc_addr = pi_desc_addr;
+ __entry->set = set;
+ ),
+
+ TP_printk("VT-d PI is %s for irq %u, vcpu %u, gsi: 0x%x, "
+ "gvec: 0x%x, pi_desc_addr: 0x%llx",
+ __entry->set ? "enabled and being updated" : "disabled",
+ __entry->host_irq,
+ __entry->vcpu_id,
+ __entry->gsi,
+ __entry->gvec,
+ __entry->pi_desc_addr)
+);
+
+/*
+ * Tracepoint for kvm_hv_notify_acked_sint.
+ */
+TRACE_EVENT(kvm_hv_notify_acked_sint,
+ TP_PROTO(int vcpu_id, u32 sint),
+ TP_ARGS(vcpu_id, sint),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, sint)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->sint = sint;
+ ),
+
+ TP_printk("vcpu_id %d sint %u", __entry->vcpu_id, __entry->sint)
+);
+
+/*
+ * Tracepoint for synic_set_irq.
+ */
+TRACE_EVENT(kvm_hv_synic_set_irq,
+ TP_PROTO(int vcpu_id, u32 sint, int vector, int ret),
+ TP_ARGS(vcpu_id, sint, vector, ret),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, sint)
+ __field(int, vector)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->sint = sint;
+ __entry->vector = vector;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("vcpu_id %d sint %u vector %d ret %d",
+ __entry->vcpu_id, __entry->sint, __entry->vector,
+ __entry->ret)
+);
+
+/*
+ * Tracepoint for kvm_hv_synic_send_eoi.
+ */
+TRACE_EVENT(kvm_hv_synic_send_eoi,
+ TP_PROTO(int vcpu_id, int vector),
+ TP_ARGS(vcpu_id, vector),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, sint)
+ __field(int, vector)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->vector = vector;
+ ),
+
+ TP_printk("vcpu_id %d vector %d", __entry->vcpu_id, __entry->vector)
+);
+
+/*
+ * Tracepoint for synic_set_msr.
+ */
+TRACE_EVENT(kvm_hv_synic_set_msr,
+ TP_PROTO(int vcpu_id, u32 msr, u64 data, bool host),
+ TP_ARGS(vcpu_id, msr, data, host),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, msr)
+ __field(u64, data)
+ __field(bool, host)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->msr = msr;
+ __entry->data = data;
+ __entry->host = host
+ ),
+
+ TP_printk("vcpu_id %d msr 0x%x data 0x%llx host %d",
+ __entry->vcpu_id, __entry->msr, __entry->data, __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_set_config.
+ */
+TRACE_EVENT(kvm_hv_stimer_set_config,
+ TP_PROTO(int vcpu_id, int timer_index, u64 config, bool host),
+ TP_ARGS(vcpu_id, timer_index, config, host),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, config)
+ __field(bool, host)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->config = config;
+ __entry->host = host;
+ ),
+
+ TP_printk("vcpu_id %d timer %d config 0x%llx host %d",
+ __entry->vcpu_id, __entry->timer_index, __entry->config,
+ __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_set_count.
+ */
+TRACE_EVENT(kvm_hv_stimer_set_count,
+ TP_PROTO(int vcpu_id, int timer_index, u64 count, bool host),
+ TP_ARGS(vcpu_id, timer_index, count, host),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, count)
+ __field(bool, host)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->count = count;
+ __entry->host = host;
+ ),
+
+ TP_printk("vcpu_id %d timer %d count %llu host %d",
+ __entry->vcpu_id, __entry->timer_index, __entry->count,
+ __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_start(periodic timer case).
+ */
+TRACE_EVENT(kvm_hv_stimer_start_periodic,
+ TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 exp_time),
+ TP_ARGS(vcpu_id, timer_index, time_now, exp_time),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, time_now)
+ __field(u64, exp_time)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->time_now = time_now;
+ __entry->exp_time = exp_time;
+ ),
+
+ TP_printk("vcpu_id %d timer %d time_now %llu exp_time %llu",
+ __entry->vcpu_id, __entry->timer_index, __entry->time_now,
+ __entry->exp_time)
+);
+
+/*
+ * Tracepoint for stimer_start(one-shot timer case).
+ */
+TRACE_EVENT(kvm_hv_stimer_start_one_shot,
+ TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 count),
+ TP_ARGS(vcpu_id, timer_index, time_now, count),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, time_now)
+ __field(u64, count)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->time_now = time_now;
+ __entry->count = count;
+ ),
+
+ TP_printk("vcpu_id %d timer %d time_now %llu count %llu",
+ __entry->vcpu_id, __entry->timer_index, __entry->time_now,
+ __entry->count)
+);
+
+/*
+ * Tracepoint for stimer_timer_callback.
+ */
+TRACE_EVENT(kvm_hv_stimer_callback,
+ TP_PROTO(int vcpu_id, int timer_index),
+ TP_ARGS(vcpu_id, timer_index),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ ),
+
+ TP_printk("vcpu_id %d timer %d",
+ __entry->vcpu_id, __entry->timer_index)
+);
+
+/*
+ * Tracepoint for stimer_expiration.
+ */
+TRACE_EVENT(kvm_hv_stimer_expiration,
+ TP_PROTO(int vcpu_id, int timer_index, int msg_send_result),
+ TP_ARGS(vcpu_id, timer_index, msg_send_result),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(int, msg_send_result)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->msg_send_result = msg_send_result;
+ ),
+
+ TP_printk("vcpu_id %d timer %d msg send result %d",
+ __entry->vcpu_id, __entry->timer_index,
+ __entry->msg_send_result)
+);
+
+/*
+ * Tracepoint for stimer_cleanup.
+ */
+TRACE_EVENT(kvm_hv_stimer_cleanup,
+ TP_PROTO(int vcpu_id, int timer_index),
+ TP_ARGS(vcpu_id, timer_index),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ ),
+
+ TP_printk("vcpu_id %d timer %d",
+ __entry->vcpu_id, __entry->timer_index)
+);
+
+/*
+ * Tracepoint for AMD AVIC
+ */
+TRACE_EVENT(kvm_avic_incomplete_ipi,
+ TP_PROTO(u32 vcpu, u32 icrh, u32 icrl, u32 id, u32 index),
+ TP_ARGS(vcpu, icrh, icrl, id, index),
+
+ TP_STRUCT__entry(
+ __field(u32, vcpu)
+ __field(u32, icrh)
+ __field(u32, icrl)
+ __field(u32, id)
+ __field(u32, index)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->icrh = icrh;
+ __entry->icrl = icrl;
+ __entry->id = id;
+ __entry->index = index;
+ ),
+
+ TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u\n",
+ __entry->vcpu, __entry->icrh, __entry->icrl,
+ __entry->id, __entry->index)
+);
+
+TRACE_EVENT(kvm_avic_unaccelerated_access,
+ TP_PROTO(u32 vcpu, u32 offset, bool ft, bool rw, u32 vec),
+ TP_ARGS(vcpu, offset, ft, rw, vec),
+
+ TP_STRUCT__entry(
+ __field(u32, vcpu)
+ __field(u32, offset)
+ __field(bool, ft)
+ __field(bool, rw)
+ __field(u32, vec)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->offset = offset;
+ __entry->ft = ft;
+ __entry->rw = rw;
+ __entry->vec = vec;
+ ),
+
+ TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x\n",
+ __entry->vcpu,
+ __entry->offset,
+ __print_symbolic(__entry->offset, kvm_trace_symbol_apic),
+ __entry->ft ? "trap" : "fault",
+ __entry->rw ? "write" : "read",
+ __entry->vec)
+);
+
+TRACE_EVENT(kvm_hv_timer_state,
+ TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use),
+ TP_ARGS(vcpu_id, hv_timer_in_use),
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(unsigned int, hv_timer_in_use)
+ ),
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->hv_timer_in_use = hv_timer_in_use;
+ ),
+ TP_printk("vcpu_id %x hv_timer %x\n",
+ __entry->vcpu_id,
+ __entry->hv_timer_in_use)
+);
+
+/*
+ * Tracepoint for kvm_hv_flush_tlb.
+ */
+TRACE_EVENT(kvm_hv_flush_tlb,
+ TP_PROTO(u64 processor_mask, u64 address_space, u64 flags),
+ TP_ARGS(processor_mask, address_space, flags),
+
+ TP_STRUCT__entry(
+ __field(u64, processor_mask)
+ __field(u64, address_space)
+ __field(u64, flags)
+ ),
+
+ TP_fast_assign(
+ __entry->processor_mask = processor_mask;
+ __entry->address_space = address_space;
+ __entry->flags = flags;
+ ),
+
+ TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx",
+ __entry->processor_mask, __entry->address_space,
+ __entry->flags)
+);
+
+/*
+ * Tracepoint for kvm_hv_flush_tlb_ex.
+ */
+TRACE_EVENT(kvm_hv_flush_tlb_ex,
+ TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags),
+ TP_ARGS(valid_bank_mask, format, address_space, flags),
+
+ TP_STRUCT__entry(
+ __field(u64, valid_bank_mask)
+ __field(u64, format)
+ __field(u64, address_space)
+ __field(u64, flags)
+ ),
+
+ TP_fast_assign(
+ __entry->valid_bank_mask = valid_bank_mask;
+ __entry->format = format;
+ __entry->address_space = address_space;
+ __entry->flags = flags;
+ ),
+
+ TP_printk("valid_bank_mask 0x%llx format 0x%llx "
+ "address_space 0x%llx flags 0x%llx",
+ __entry->valid_bank_mask, __entry->format,
+ __entry->address_space, __entry->flags)
+);
+#endif /* _TRACE_KVM_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH arch/x86/kvm
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/kvm/tss.h b/arch/x86/kvm/tss.h
new file mode 100644
index 000000000..3f9150125
--- /dev/null
+++ b/arch/x86/kvm/tss.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TSS_SEGMENT_H
+#define __TSS_SEGMENT_H
+
+struct tss_segment_32 {
+ u32 prev_task_link;
+ u32 esp0;
+ u32 ss0;
+ u32 esp1;
+ u32 ss1;
+ u32 esp2;
+ u32 ss2;
+ u32 cr3;
+ u32 eip;
+ u32 eflags;
+ u32 eax;
+ u32 ecx;
+ u32 edx;
+ u32 ebx;
+ u32 esp;
+ u32 ebp;
+ u32 esi;
+ u32 edi;
+ u32 es;
+ u32 cs;
+ u32 ss;
+ u32 ds;
+ u32 fs;
+ u32 gs;
+ u32 ldt_selector;
+ u16 t;
+ u16 io_map;
+};
+
+struct tss_segment_16 {
+ u16 prev_task_link;
+ u16 sp0;
+ u16 ss0;
+ u16 sp1;
+ u16 ss1;
+ u16 sp2;
+ u16 ss2;
+ u16 ip;
+ u16 flag;
+ u16 ax;
+ u16 cx;
+ u16 dx;
+ u16 bx;
+ u16 sp;
+ u16 bp;
+ u16 si;
+ u16 di;
+ u16 es;
+ u16 cs;
+ u16 ss;
+ u16 ds;
+ u16 ldt;
+};
+
+#endif
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
new file mode 100644
index 000000000..44cce3e8e
--- /dev/null
+++ b/arch/x86/kvm/vmx.c
@@ -0,0 +1,14715 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "irq.h"
+#include "mmu.h"
+#include "cpuid.h"
+#include "lapic.h"
+
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/sched.h>
+#include <linux/sched/smt.h>
+#include <linux/moduleparam.h>
+#include <linux/mod_devicetable.h>
+#include <linux/trace_events.h>
+#include <linux/slab.h>
+#include <linux/tboot.h>
+#include <linux/hrtimer.h>
+#include <linux/frame.h>
+#include <linux/nospec.h>
+#include "kvm_cache_regs.h"
+#include "x86.h"
+
+#include <asm/asm.h>
+#include <asm/cpu.h>
+#include <asm/io.h>
+#include <asm/desc.h>
+#include <asm/vmx.h>
+#include <asm/virtext.h>
+#include <asm/mce.h>
+#include <asm/fpu/internal.h>
+#include <asm/perf_event.h>
+#include <asm/debugreg.h>
+#include <asm/kexec.h>
+#include <asm/apic.h>
+#include <asm/irq_remapping.h>
+#include <asm/mmu_context.h>
+#include <asm/spec-ctrl.h>
+#include <asm/mshyperv.h>
+
+#include "trace.h"
+#include "pmu.h"
+#include "vmx_evmcs.h"
+
+#define __ex(x) __kvm_handle_fault_on_reboot(x)
+#define __ex_clear(x, reg) \
+ ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
+
+MODULE_AUTHOR("Qumranet");
+MODULE_LICENSE("GPL");
+
+static const struct x86_cpu_id vmx_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_VMX),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
+
+static bool __read_mostly enable_vpid = 1;
+module_param_named(vpid, enable_vpid, bool, 0444);
+
+static bool __read_mostly enable_vnmi = 1;
+module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
+
+static bool __read_mostly flexpriority_enabled = 1;
+module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
+
+static bool __read_mostly enable_ept = 1;
+module_param_named(ept, enable_ept, bool, S_IRUGO);
+
+static bool __read_mostly enable_unrestricted_guest = 1;
+module_param_named(unrestricted_guest,
+ enable_unrestricted_guest, bool, S_IRUGO);
+
+static bool __read_mostly enable_ept_ad_bits = 1;
+module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
+
+static bool __read_mostly emulate_invalid_guest_state = true;
+module_param(emulate_invalid_guest_state, bool, S_IRUGO);
+
+static bool __read_mostly fasteoi = 1;
+module_param(fasteoi, bool, S_IRUGO);
+
+static bool __read_mostly enable_apicv = 1;
+module_param(enable_apicv, bool, S_IRUGO);
+
+static bool __read_mostly enable_shadow_vmcs = 1;
+module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
+/*
+ * If nested=1, nested virtualization is supported, i.e., guests may use
+ * VMX and be a hypervisor for its own guests. If nested=0, guests may not
+ * use VMX instructions.
+ */
+static bool __read_mostly nested = 0;
+module_param(nested, bool, S_IRUGO);
+
+static u64 __read_mostly host_xss;
+
+static bool __read_mostly enable_pml = 1;
+module_param_named(pml, enable_pml, bool, S_IRUGO);
+
+#define MSR_TYPE_R 1
+#define MSR_TYPE_W 2
+#define MSR_TYPE_RW 3
+
+#define MSR_BITMAP_MODE_X2APIC 1
+#define MSR_BITMAP_MODE_X2APIC_APICV 2
+
+#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
+
+/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
+static int __read_mostly cpu_preemption_timer_multi;
+static bool __read_mostly enable_preemption_timer = 1;
+#ifdef CONFIG_X86_64
+module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
+#endif
+
+#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
+#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
+#define KVM_VM_CR0_ALWAYS_ON \
+ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
+ X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
+#define KVM_CR4_GUEST_OWNED_BITS \
+ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
+
+#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
+#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
+#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
+
+#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
+
+#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
+
+/*
+ * Hyper-V requires all of these, so mark them as supported even though
+ * they are just treated the same as all-context.
+ */
+#define VMX_VPID_EXTENT_SUPPORTED_MASK \
+ (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \
+ VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \
+ VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \
+ VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
+
+/*
+ * These 2 parameters are used to config the controls for Pause-Loop Exiting:
+ * ple_gap: upper bound on the amount of time between two successive
+ * executions of PAUSE in a loop. Also indicate if ple enabled.
+ * According to test, this time is usually smaller than 128 cycles.
+ * ple_window: upper bound on the amount of time a guest is allowed to execute
+ * in a PAUSE loop. Tests indicate that most spinlocks are held for
+ * less than 2^12 cycles
+ * Time is measured based on a counter that runs at the same rate as the TSC,
+ * refer SDM volume 3b section 21.6.13 & 22.1.3.
+ */
+static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
+module_param(ple_gap, uint, 0444);
+
+static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
+module_param(ple_window, uint, 0444);
+
+/* Default doubles per-vcpu window every exit. */
+static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
+module_param(ple_window_grow, uint, 0444);
+
+/* Default resets per-vcpu window every exit to ple_window. */
+static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
+module_param(ple_window_shrink, uint, 0444);
+
+/* Default is to compute the maximum so we can never overflow. */
+static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
+module_param(ple_window_max, uint, 0444);
+
+extern const ulong vmx_return;
+
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
+static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
+static DEFINE_MUTEX(vmx_l1d_flush_mutex);
+
+/* Storage for pre module init parameter parsing */
+static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
+
+static const struct {
+ const char *option;
+ bool for_parse;
+} vmentry_l1d_param[] = {
+ [VMENTER_L1D_FLUSH_AUTO] = {"auto", true},
+ [VMENTER_L1D_FLUSH_NEVER] = {"never", true},
+ [VMENTER_L1D_FLUSH_COND] = {"cond", true},
+ [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true},
+ [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
+ [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
+};
+
+#define L1D_CACHE_ORDER 4
+static void *vmx_l1d_flush_pages;
+
+/* Control for disabling CPU Fill buffer clear */
+static bool __read_mostly vmx_fb_clear_ctrl_available;
+
+static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
+{
+ struct page *page;
+ unsigned int i;
+
+ if (!enable_ept) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
+ return 0;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
+ u64 msr;
+
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
+ if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+ return 0;
+ }
+ }
+
+ /* If set to auto use the default l1tf mitigation method */
+ if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
+ switch (l1tf_mitigation) {
+ case L1TF_MITIGATION_OFF:
+ l1tf = VMENTER_L1D_FLUSH_NEVER;
+ break;
+ case L1TF_MITIGATION_FLUSH_NOWARN:
+ case L1TF_MITIGATION_FLUSH:
+ case L1TF_MITIGATION_FLUSH_NOSMT:
+ l1tf = VMENTER_L1D_FLUSH_COND;
+ break;
+ case L1TF_MITIGATION_FULL:
+ case L1TF_MITIGATION_FULL_FORCE:
+ l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+ break;
+ }
+ } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
+ l1tf = VMENTER_L1D_FLUSH_ALWAYS;
+ }
+
+ if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
+ !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
+ if (!page)
+ return -ENOMEM;
+ vmx_l1d_flush_pages = page_address(page);
+
+ /*
+ * Initialize each page with a different pattern in
+ * order to protect against KSM in the nested
+ * virtualization case.
+ */
+ for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
+ memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
+ PAGE_SIZE);
+ }
+ }
+
+ l1tf_vmx_mitigation = l1tf;
+
+ if (l1tf != VMENTER_L1D_FLUSH_NEVER)
+ static_branch_enable(&vmx_l1d_should_flush);
+ else
+ static_branch_disable(&vmx_l1d_should_flush);
+
+ if (l1tf == VMENTER_L1D_FLUSH_COND)
+ static_branch_enable(&vmx_l1d_flush_cond);
+ else
+ static_branch_disable(&vmx_l1d_flush_cond);
+ return 0;
+}
+
+static int vmentry_l1d_flush_parse(const char *s)
+{
+ unsigned int i;
+
+ if (s) {
+ for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
+ if (vmentry_l1d_param[i].for_parse &&
+ sysfs_streq(s, vmentry_l1d_param[i].option))
+ return i;
+ }
+ }
+ return -EINVAL;
+}
+
+static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
+{
+ int l1tf, ret;
+
+ l1tf = vmentry_l1d_flush_parse(s);
+ if (l1tf < 0)
+ return l1tf;
+
+ if (!boot_cpu_has(X86_BUG_L1TF))
+ return 0;
+
+ /*
+ * Has vmx_init() run already? If not then this is the pre init
+ * parameter parsing. In that case just store the value and let
+ * vmx_init() do the proper setup after enable_ept has been
+ * established.
+ */
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
+ vmentry_l1d_flush_param = l1tf;
+ return 0;
+ }
+
+ mutex_lock(&vmx_l1d_flush_mutex);
+ ret = vmx_setup_l1d_flush(l1tf);
+ mutex_unlock(&vmx_l1d_flush_mutex);
+ return ret;
+}
+
+static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
+{
+ if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
+ return sprintf(s, "???\n");
+
+ return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
+}
+
+static const struct kernel_param_ops vmentry_l1d_flush_ops = {
+ .set = vmentry_l1d_flush_set,
+ .get = vmentry_l1d_flush_get,
+};
+module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
+
+enum ept_pointers_status {
+ EPT_POINTERS_CHECK = 0,
+ EPT_POINTERS_MATCH = 1,
+ EPT_POINTERS_MISMATCH = 2
+};
+
+struct kvm_vmx {
+ struct kvm kvm;
+
+ unsigned int tss_addr;
+ bool ept_identity_pagetable_done;
+ gpa_t ept_identity_map_addr;
+
+ enum ept_pointers_status ept_pointers_match;
+ spinlock_t ept_pointer_lock;
+};
+
+#define NR_AUTOLOAD_MSRS 8
+
+struct vmcs_hdr {
+ u32 revision_id:31;
+ u32 shadow_vmcs:1;
+};
+
+struct vmcs {
+ struct vmcs_hdr hdr;
+ u32 abort;
+ char data[0];
+};
+
+/*
+ * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT
+ * and whose values change infrequently, but are not constant. I.e. this is
+ * used as a write-through cache of the corresponding VMCS fields.
+ */
+struct vmcs_host_state {
+ unsigned long cr3; /* May not match real cr3 */
+ unsigned long cr4; /* May not match real cr4 */
+ unsigned long gs_base;
+ unsigned long fs_base;
+
+ u16 fs_sel, gs_sel, ldt_sel;
+#ifdef CONFIG_X86_64
+ u16 ds_sel, es_sel;
+#endif
+};
+
+/*
+ * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
+ * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
+ * loaded on this CPU (so we can clear them if the CPU goes down).
+ */
+struct loaded_vmcs {
+ struct vmcs *vmcs;
+ struct vmcs *shadow_vmcs;
+ int cpu;
+ bool launched;
+ bool nmi_known_unmasked;
+ bool hv_timer_armed;
+ /* Support for vnmi-less CPUs */
+ int soft_vnmi_blocked;
+ ktime_t entry_time;
+ s64 vnmi_blocked_time;
+ unsigned long *msr_bitmap;
+ struct list_head loaded_vmcss_on_cpu_link;
+ struct vmcs_host_state host_state;
+};
+
+struct shared_msr_entry {
+ unsigned index;
+ u64 data;
+ u64 mask;
+};
+
+/*
+ * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
+ * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
+ * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
+ * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
+ * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
+ * More than one of these structures may exist, if L1 runs multiple L2 guests.
+ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
+ * underlying hardware which will be used to run L2.
+ * This structure is packed to ensure that its layout is identical across
+ * machines (necessary for live migration).
+ *
+ * IMPORTANT: Changing the layout of existing fields in this structure
+ * will break save/restore compatibility with older kvm releases. When
+ * adding new fields, either use space in the reserved padding* arrays
+ * or add the new fields to the end of the structure.
+ */
+typedef u64 natural_width;
+struct __packed vmcs12 {
+ /* According to the Intel spec, a VMCS region must start with the
+ * following two fields. Then follow implementation-specific data.
+ */
+ struct vmcs_hdr hdr;
+ u32 abort;
+
+ u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
+ u32 padding[7]; /* room for future expansion */
+
+ u64 io_bitmap_a;
+ u64 io_bitmap_b;
+ u64 msr_bitmap;
+ u64 vm_exit_msr_store_addr;
+ u64 vm_exit_msr_load_addr;
+ u64 vm_entry_msr_load_addr;
+ u64 tsc_offset;
+ u64 virtual_apic_page_addr;
+ u64 apic_access_addr;
+ u64 posted_intr_desc_addr;
+ u64 ept_pointer;
+ u64 eoi_exit_bitmap0;
+ u64 eoi_exit_bitmap1;
+ u64 eoi_exit_bitmap2;
+ u64 eoi_exit_bitmap3;
+ u64 xss_exit_bitmap;
+ u64 guest_physical_address;
+ u64 vmcs_link_pointer;
+ u64 guest_ia32_debugctl;
+ u64 guest_ia32_pat;
+ u64 guest_ia32_efer;
+ u64 guest_ia32_perf_global_ctrl;
+ u64 guest_pdptr0;
+ u64 guest_pdptr1;
+ u64 guest_pdptr2;
+ u64 guest_pdptr3;
+ u64 guest_bndcfgs;
+ u64 host_ia32_pat;
+ u64 host_ia32_efer;
+ u64 host_ia32_perf_global_ctrl;
+ u64 vmread_bitmap;
+ u64 vmwrite_bitmap;
+ u64 vm_function_control;
+ u64 eptp_list_address;
+ u64 pml_address;
+ u64 padding64[3]; /* room for future expansion */
+ /*
+ * To allow migration of L1 (complete with its L2 guests) between
+ * machines of different natural widths (32 or 64 bit), we cannot have
+ * unsigned long fields with no explict size. We use u64 (aliased
+ * natural_width) instead. Luckily, x86 is little-endian.
+ */
+ natural_width cr0_guest_host_mask;
+ natural_width cr4_guest_host_mask;
+ natural_width cr0_read_shadow;
+ natural_width cr4_read_shadow;
+ natural_width cr3_target_value0;
+ natural_width cr3_target_value1;
+ natural_width cr3_target_value2;
+ natural_width cr3_target_value3;
+ natural_width exit_qualification;
+ natural_width guest_linear_address;
+ natural_width guest_cr0;
+ natural_width guest_cr3;
+ natural_width guest_cr4;
+ natural_width guest_es_base;
+ natural_width guest_cs_base;
+ natural_width guest_ss_base;
+ natural_width guest_ds_base;
+ natural_width guest_fs_base;
+ natural_width guest_gs_base;
+ natural_width guest_ldtr_base;
+ natural_width guest_tr_base;
+ natural_width guest_gdtr_base;
+ natural_width guest_idtr_base;
+ natural_width guest_dr7;
+ natural_width guest_rsp;
+ natural_width guest_rip;
+ natural_width guest_rflags;
+ natural_width guest_pending_dbg_exceptions;
+ natural_width guest_sysenter_esp;
+ natural_width guest_sysenter_eip;
+ natural_width host_cr0;
+ natural_width host_cr3;
+ natural_width host_cr4;
+ natural_width host_fs_base;
+ natural_width host_gs_base;
+ natural_width host_tr_base;
+ natural_width host_gdtr_base;
+ natural_width host_idtr_base;
+ natural_width host_ia32_sysenter_esp;
+ natural_width host_ia32_sysenter_eip;
+ natural_width host_rsp;
+ natural_width host_rip;
+ natural_width paddingl[8]; /* room for future expansion */
+ u32 pin_based_vm_exec_control;
+ u32 cpu_based_vm_exec_control;
+ u32 exception_bitmap;
+ u32 page_fault_error_code_mask;
+ u32 page_fault_error_code_match;
+ u32 cr3_target_count;
+ u32 vm_exit_controls;
+ u32 vm_exit_msr_store_count;
+ u32 vm_exit_msr_load_count;
+ u32 vm_entry_controls;
+ u32 vm_entry_msr_load_count;
+ u32 vm_entry_intr_info_field;
+ u32 vm_entry_exception_error_code;
+ u32 vm_entry_instruction_len;
+ u32 tpr_threshold;
+ u32 secondary_vm_exec_control;
+ u32 vm_instruction_error;
+ u32 vm_exit_reason;
+ u32 vm_exit_intr_info;
+ u32 vm_exit_intr_error_code;
+ u32 idt_vectoring_info_field;
+ u32 idt_vectoring_error_code;
+ u32 vm_exit_instruction_len;
+ u32 vmx_instruction_info;
+ u32 guest_es_limit;
+ u32 guest_cs_limit;
+ u32 guest_ss_limit;
+ u32 guest_ds_limit;
+ u32 guest_fs_limit;
+ u32 guest_gs_limit;
+ u32 guest_ldtr_limit;
+ u32 guest_tr_limit;
+ u32 guest_gdtr_limit;
+ u32 guest_idtr_limit;
+ u32 guest_es_ar_bytes;
+ u32 guest_cs_ar_bytes;
+ u32 guest_ss_ar_bytes;
+ u32 guest_ds_ar_bytes;
+ u32 guest_fs_ar_bytes;
+ u32 guest_gs_ar_bytes;
+ u32 guest_ldtr_ar_bytes;
+ u32 guest_tr_ar_bytes;
+ u32 guest_interruptibility_info;
+ u32 guest_activity_state;
+ u32 guest_sysenter_cs;
+ u32 host_ia32_sysenter_cs;
+ u32 vmx_preemption_timer_value;
+ u32 padding32[7]; /* room for future expansion */
+ u16 virtual_processor_id;
+ u16 posted_intr_nv;
+ u16 guest_es_selector;
+ u16 guest_cs_selector;
+ u16 guest_ss_selector;
+ u16 guest_ds_selector;
+ u16 guest_fs_selector;
+ u16 guest_gs_selector;
+ u16 guest_ldtr_selector;
+ u16 guest_tr_selector;
+ u16 guest_intr_status;
+ u16 host_es_selector;
+ u16 host_cs_selector;
+ u16 host_ss_selector;
+ u16 host_ds_selector;
+ u16 host_fs_selector;
+ u16 host_gs_selector;
+ u16 host_tr_selector;
+ u16 guest_pml_index;
+};
+
+/*
+ * For save/restore compatibility, the vmcs12 field offsets must not change.
+ */
+#define CHECK_OFFSET(field, loc) \
+ BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \
+ "Offset of " #field " in struct vmcs12 has changed.")
+
+static inline void vmx_check_vmcs12_offsets(void) {
+ CHECK_OFFSET(hdr, 0);
+ CHECK_OFFSET(abort, 4);
+ CHECK_OFFSET(launch_state, 8);
+ CHECK_OFFSET(io_bitmap_a, 40);
+ CHECK_OFFSET(io_bitmap_b, 48);
+ CHECK_OFFSET(msr_bitmap, 56);
+ CHECK_OFFSET(vm_exit_msr_store_addr, 64);
+ CHECK_OFFSET(vm_exit_msr_load_addr, 72);
+ CHECK_OFFSET(vm_entry_msr_load_addr, 80);
+ CHECK_OFFSET(tsc_offset, 88);
+ CHECK_OFFSET(virtual_apic_page_addr, 96);
+ CHECK_OFFSET(apic_access_addr, 104);
+ CHECK_OFFSET(posted_intr_desc_addr, 112);
+ CHECK_OFFSET(ept_pointer, 120);
+ CHECK_OFFSET(eoi_exit_bitmap0, 128);
+ CHECK_OFFSET(eoi_exit_bitmap1, 136);
+ CHECK_OFFSET(eoi_exit_bitmap2, 144);
+ CHECK_OFFSET(eoi_exit_bitmap3, 152);
+ CHECK_OFFSET(xss_exit_bitmap, 160);
+ CHECK_OFFSET(guest_physical_address, 168);
+ CHECK_OFFSET(vmcs_link_pointer, 176);
+ CHECK_OFFSET(guest_ia32_debugctl, 184);
+ CHECK_OFFSET(guest_ia32_pat, 192);
+ CHECK_OFFSET(guest_ia32_efer, 200);
+ CHECK_OFFSET(guest_ia32_perf_global_ctrl, 208);
+ CHECK_OFFSET(guest_pdptr0, 216);
+ CHECK_OFFSET(guest_pdptr1, 224);
+ CHECK_OFFSET(guest_pdptr2, 232);
+ CHECK_OFFSET(guest_pdptr3, 240);
+ CHECK_OFFSET(guest_bndcfgs, 248);
+ CHECK_OFFSET(host_ia32_pat, 256);
+ CHECK_OFFSET(host_ia32_efer, 264);
+ CHECK_OFFSET(host_ia32_perf_global_ctrl, 272);
+ CHECK_OFFSET(vmread_bitmap, 280);
+ CHECK_OFFSET(vmwrite_bitmap, 288);
+ CHECK_OFFSET(vm_function_control, 296);
+ CHECK_OFFSET(eptp_list_address, 304);
+ CHECK_OFFSET(pml_address, 312);
+ CHECK_OFFSET(cr0_guest_host_mask, 344);
+ CHECK_OFFSET(cr4_guest_host_mask, 352);
+ CHECK_OFFSET(cr0_read_shadow, 360);
+ CHECK_OFFSET(cr4_read_shadow, 368);
+ CHECK_OFFSET(cr3_target_value0, 376);
+ CHECK_OFFSET(cr3_target_value1, 384);
+ CHECK_OFFSET(cr3_target_value2, 392);
+ CHECK_OFFSET(cr3_target_value3, 400);
+ CHECK_OFFSET(exit_qualification, 408);
+ CHECK_OFFSET(guest_linear_address, 416);
+ CHECK_OFFSET(guest_cr0, 424);
+ CHECK_OFFSET(guest_cr3, 432);
+ CHECK_OFFSET(guest_cr4, 440);
+ CHECK_OFFSET(guest_es_base, 448);
+ CHECK_OFFSET(guest_cs_base, 456);
+ CHECK_OFFSET(guest_ss_base, 464);
+ CHECK_OFFSET(guest_ds_base, 472);
+ CHECK_OFFSET(guest_fs_base, 480);
+ CHECK_OFFSET(guest_gs_base, 488);
+ CHECK_OFFSET(guest_ldtr_base, 496);
+ CHECK_OFFSET(guest_tr_base, 504);
+ CHECK_OFFSET(guest_gdtr_base, 512);
+ CHECK_OFFSET(guest_idtr_base, 520);
+ CHECK_OFFSET(guest_dr7, 528);
+ CHECK_OFFSET(guest_rsp, 536);
+ CHECK_OFFSET(guest_rip, 544);
+ CHECK_OFFSET(guest_rflags, 552);
+ CHECK_OFFSET(guest_pending_dbg_exceptions, 560);
+ CHECK_OFFSET(guest_sysenter_esp, 568);
+ CHECK_OFFSET(guest_sysenter_eip, 576);
+ CHECK_OFFSET(host_cr0, 584);
+ CHECK_OFFSET(host_cr3, 592);
+ CHECK_OFFSET(host_cr4, 600);
+ CHECK_OFFSET(host_fs_base, 608);
+ CHECK_OFFSET(host_gs_base, 616);
+ CHECK_OFFSET(host_tr_base, 624);
+ CHECK_OFFSET(host_gdtr_base, 632);
+ CHECK_OFFSET(host_idtr_base, 640);
+ CHECK_OFFSET(host_ia32_sysenter_esp, 648);
+ CHECK_OFFSET(host_ia32_sysenter_eip, 656);
+ CHECK_OFFSET(host_rsp, 664);
+ CHECK_OFFSET(host_rip, 672);
+ CHECK_OFFSET(pin_based_vm_exec_control, 744);
+ CHECK_OFFSET(cpu_based_vm_exec_control, 748);
+ CHECK_OFFSET(exception_bitmap, 752);
+ CHECK_OFFSET(page_fault_error_code_mask, 756);
+ CHECK_OFFSET(page_fault_error_code_match, 760);
+ CHECK_OFFSET(cr3_target_count, 764);
+ CHECK_OFFSET(vm_exit_controls, 768);
+ CHECK_OFFSET(vm_exit_msr_store_count, 772);
+ CHECK_OFFSET(vm_exit_msr_load_count, 776);
+ CHECK_OFFSET(vm_entry_controls, 780);
+ CHECK_OFFSET(vm_entry_msr_load_count, 784);
+ CHECK_OFFSET(vm_entry_intr_info_field, 788);
+ CHECK_OFFSET(vm_entry_exception_error_code, 792);
+ CHECK_OFFSET(vm_entry_instruction_len, 796);
+ CHECK_OFFSET(tpr_threshold, 800);
+ CHECK_OFFSET(secondary_vm_exec_control, 804);
+ CHECK_OFFSET(vm_instruction_error, 808);
+ CHECK_OFFSET(vm_exit_reason, 812);
+ CHECK_OFFSET(vm_exit_intr_info, 816);
+ CHECK_OFFSET(vm_exit_intr_error_code, 820);
+ CHECK_OFFSET(idt_vectoring_info_field, 824);
+ CHECK_OFFSET(idt_vectoring_error_code, 828);
+ CHECK_OFFSET(vm_exit_instruction_len, 832);
+ CHECK_OFFSET(vmx_instruction_info, 836);
+ CHECK_OFFSET(guest_es_limit, 840);
+ CHECK_OFFSET(guest_cs_limit, 844);
+ CHECK_OFFSET(guest_ss_limit, 848);
+ CHECK_OFFSET(guest_ds_limit, 852);
+ CHECK_OFFSET(guest_fs_limit, 856);
+ CHECK_OFFSET(guest_gs_limit, 860);
+ CHECK_OFFSET(guest_ldtr_limit, 864);
+ CHECK_OFFSET(guest_tr_limit, 868);
+ CHECK_OFFSET(guest_gdtr_limit, 872);
+ CHECK_OFFSET(guest_idtr_limit, 876);
+ CHECK_OFFSET(guest_es_ar_bytes, 880);
+ CHECK_OFFSET(guest_cs_ar_bytes, 884);
+ CHECK_OFFSET(guest_ss_ar_bytes, 888);
+ CHECK_OFFSET(guest_ds_ar_bytes, 892);
+ CHECK_OFFSET(guest_fs_ar_bytes, 896);
+ CHECK_OFFSET(guest_gs_ar_bytes, 900);
+ CHECK_OFFSET(guest_ldtr_ar_bytes, 904);
+ CHECK_OFFSET(guest_tr_ar_bytes, 908);
+ CHECK_OFFSET(guest_interruptibility_info, 912);
+ CHECK_OFFSET(guest_activity_state, 916);
+ CHECK_OFFSET(guest_sysenter_cs, 920);
+ CHECK_OFFSET(host_ia32_sysenter_cs, 924);
+ CHECK_OFFSET(vmx_preemption_timer_value, 928);
+ CHECK_OFFSET(virtual_processor_id, 960);
+ CHECK_OFFSET(posted_intr_nv, 962);
+ CHECK_OFFSET(guest_es_selector, 964);
+ CHECK_OFFSET(guest_cs_selector, 966);
+ CHECK_OFFSET(guest_ss_selector, 968);
+ CHECK_OFFSET(guest_ds_selector, 970);
+ CHECK_OFFSET(guest_fs_selector, 972);
+ CHECK_OFFSET(guest_gs_selector, 974);
+ CHECK_OFFSET(guest_ldtr_selector, 976);
+ CHECK_OFFSET(guest_tr_selector, 978);
+ CHECK_OFFSET(guest_intr_status, 980);
+ CHECK_OFFSET(host_es_selector, 982);
+ CHECK_OFFSET(host_cs_selector, 984);
+ CHECK_OFFSET(host_ss_selector, 986);
+ CHECK_OFFSET(host_ds_selector, 988);
+ CHECK_OFFSET(host_fs_selector, 990);
+ CHECK_OFFSET(host_gs_selector, 992);
+ CHECK_OFFSET(host_tr_selector, 994);
+ CHECK_OFFSET(guest_pml_index, 996);
+}
+
+/*
+ * VMCS12_REVISION is an arbitrary id that should be changed if the content or
+ * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
+ * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
+ *
+ * IMPORTANT: Changing this value will break save/restore compatibility with
+ * older kvm releases.
+ */
+#define VMCS12_REVISION 0x11e57ed0
+
+/*
+ * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
+ * and any VMCS region. Although only sizeof(struct vmcs12) are used by the
+ * current implementation, 4K are reserved to avoid future complications.
+ */
+#define VMCS12_SIZE 0x1000
+
+/*
+ * VMCS12_MAX_FIELD_INDEX is the highest index value used in any
+ * supported VMCS12 field encoding.
+ */
+#define VMCS12_MAX_FIELD_INDEX 0x17
+
+struct nested_vmx_msrs {
+ /*
+ * We only store the "true" versions of the VMX capability MSRs. We
+ * generate the "non-true" versions by setting the must-be-1 bits
+ * according to the SDM.
+ */
+ u32 procbased_ctls_low;
+ u32 procbased_ctls_high;
+ u32 secondary_ctls_low;
+ u32 secondary_ctls_high;
+ u32 pinbased_ctls_low;
+ u32 pinbased_ctls_high;
+ u32 exit_ctls_low;
+ u32 exit_ctls_high;
+ u32 entry_ctls_low;
+ u32 entry_ctls_high;
+ u32 misc_low;
+ u32 misc_high;
+ u32 ept_caps;
+ u32 vpid_caps;
+ u64 basic;
+ u64 cr0_fixed0;
+ u64 cr0_fixed1;
+ u64 cr4_fixed0;
+ u64 cr4_fixed1;
+ u64 vmcs_enum;
+ u64 vmfunc_controls;
+};
+
+/*
+ * The nested_vmx structure is part of vcpu_vmx, and holds information we need
+ * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
+ */
+struct nested_vmx {
+ /* Has the level1 guest done vmxon? */
+ bool vmxon;
+ gpa_t vmxon_ptr;
+ bool pml_full;
+
+ /* The guest-physical address of the current VMCS L1 keeps for L2 */
+ gpa_t current_vmptr;
+ /*
+ * Cache of the guest's VMCS, existing outside of guest memory.
+ * Loaded from guest memory during VMPTRLD. Flushed to guest
+ * memory during VMCLEAR and VMPTRLD.
+ */
+ struct vmcs12 *cached_vmcs12;
+ /*
+ * Cache of the guest's shadow VMCS, existing outside of guest
+ * memory. Loaded from guest memory during VM entry. Flushed
+ * to guest memory during VM exit.
+ */
+ struct vmcs12 *cached_shadow_vmcs12;
+ /*
+ * Indicates if the shadow vmcs must be updated with the
+ * data hold by vmcs12
+ */
+ bool sync_shadow_vmcs;
+ bool dirty_vmcs12;
+
+ bool change_vmcs01_virtual_apic_mode;
+
+ /* L2 must run next, and mustn't decide to exit to L1. */
+ bool nested_run_pending;
+
+ struct loaded_vmcs vmcs02;
+
+ /*
+ * Guest pages referred to in the vmcs02 with host-physical
+ * pointers, so we must keep them pinned while L2 runs.
+ */
+ struct page *apic_access_page;
+ struct page *virtual_apic_page;
+ struct page *pi_desc_page;
+ struct pi_desc *pi_desc;
+ bool pi_pending;
+ u16 posted_intr_nv;
+
+ struct hrtimer preemption_timer;
+ bool preemption_timer_expired;
+
+ /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
+ u64 vmcs01_debugctl;
+ u64 vmcs01_guest_bndcfgs;
+
+ u16 vpid02;
+ u16 last_vpid;
+
+ struct nested_vmx_msrs msrs;
+
+ /* SMM related state */
+ struct {
+ /* in VMX operation on SMM entry? */
+ bool vmxon;
+ /* in guest mode on SMM entry? */
+ bool guest_mode;
+ } smm;
+};
+
+#define POSTED_INTR_ON 0
+#define POSTED_INTR_SN 1
+
+/* Posted-Interrupt Descriptor */
+struct pi_desc {
+ u32 pir[8]; /* Posted interrupt requested */
+ union {
+ struct {
+ /* bit 256 - Outstanding Notification */
+ u16 on : 1,
+ /* bit 257 - Suppress Notification */
+ sn : 1,
+ /* bit 271:258 - Reserved */
+ rsvd_1 : 14;
+ /* bit 279:272 - Notification Vector */
+ u8 nv;
+ /* bit 287:280 - Reserved */
+ u8 rsvd_2;
+ /* bit 319:288 - Notification Destination */
+ u32 ndst;
+ };
+ u64 control;
+ };
+ u32 rsvd[6];
+} __aligned(64);
+
+static bool pi_test_and_set_on(struct pi_desc *pi_desc)
+{
+ return test_and_set_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
+{
+ return test_and_clear_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
+{
+ return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
+static inline void pi_clear_sn(struct pi_desc *pi_desc)
+{
+ return clear_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_set_sn(struct pi_desc *pi_desc)
+{
+ return set_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_clear_on(struct pi_desc *pi_desc)
+{
+ clear_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_on(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_sn(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+struct vmx_msrs {
+ unsigned int nr;
+ struct vmx_msr_entry val[NR_AUTOLOAD_MSRS];
+};
+
+struct vcpu_vmx {
+ struct kvm_vcpu vcpu;
+ unsigned long host_rsp;
+ u8 fail;
+ u8 msr_bitmap_mode;
+ u32 exit_intr_info;
+ u32 idt_vectoring_info;
+ ulong rflags;
+ struct shared_msr_entry *guest_msrs;
+ int nmsrs;
+ int save_nmsrs;
+ bool guest_msrs_dirty;
+ unsigned long host_idt_base;
+#ifdef CONFIG_X86_64
+ u64 msr_host_kernel_gs_base;
+ u64 msr_guest_kernel_gs_base;
+#endif
+
+ u64 spec_ctrl;
+
+ u32 vm_entry_controls_shadow;
+ u32 vm_exit_controls_shadow;
+ u32 secondary_exec_control;
+
+ /*
+ * loaded_vmcs points to the VMCS currently used in this vcpu. For a
+ * non-nested (L1) guest, it always points to vmcs01. For a nested
+ * guest (L2), it points to a different VMCS. loaded_cpu_state points
+ * to the VMCS whose state is loaded into the CPU registers that only
+ * need to be switched when transitioning to/from the kernel; a NULL
+ * value indicates that host state is loaded.
+ */
+ struct loaded_vmcs vmcs01;
+ struct loaded_vmcs *loaded_vmcs;
+ struct loaded_vmcs *loaded_cpu_state;
+ bool __launched; /* temporary, used in vmx_vcpu_run */
+ struct msr_autoload {
+ struct vmx_msrs guest;
+ struct vmx_msrs host;
+ } msr_autoload;
+
+ struct {
+ int vm86_active;
+ ulong save_rflags;
+ struct kvm_segment segs[8];
+ } rmode;
+ struct {
+ u32 bitmask; /* 4 bits per segment (1 bit per field) */
+ struct kvm_save_segment {
+ u16 selector;
+ unsigned long base;
+ u32 limit;
+ u32 ar;
+ } seg[8];
+ } segment_cache;
+ int vpid;
+ bool emulation_required;
+
+ u32 exit_reason;
+
+ /* Posted interrupt descriptor */
+ struct pi_desc pi_desc;
+
+ /* Support for a guest hypervisor (nested VMX) */
+ struct nested_vmx nested;
+
+ /* Dynamic PLE window. */
+ int ple_window;
+ bool ple_window_dirty;
+
+ bool req_immediate_exit;
+
+ /* Support for PML */
+#define PML_ENTITY_NUM 512
+ struct page *pml_pg;
+
+ /* apic deadline value in host tsc */
+ u64 hv_deadline_tsc;
+
+ u64 current_tsc_ratio;
+
+ u32 host_pkru;
+
+ unsigned long host_debugctlmsr;
+
+ /*
+ * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
+ * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
+ * in msr_ia32_feature_control_valid_bits.
+ */
+ u64 msr_ia32_feature_control;
+ u64 msr_ia32_feature_control_valid_bits;
+ u64 ept_pointer;
+ u64 msr_ia32_mcu_opt_ctrl;
+ bool disable_fb_clear;
+};
+
+enum segment_cache_field {
+ SEG_FIELD_SEL = 0,
+ SEG_FIELD_BASE = 1,
+ SEG_FIELD_LIMIT = 2,
+ SEG_FIELD_AR = 3,
+
+ SEG_FIELD_NR = 4
+};
+
+static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
+{
+ return container_of(kvm, struct kvm_vmx, kvm);
+}
+
+static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
+{
+ return container_of(vcpu, struct vcpu_vmx, vcpu);
+}
+
+static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+{
+ return &(to_vmx(vcpu)->pi_desc);
+}
+
+#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
+#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
+#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name)
+#define FIELD64(number, name) \
+ FIELD(number, name), \
+ [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
+
+
+static u16 shadow_read_only_fields[] = {
+#define SHADOW_FIELD_RO(x) x,
+#include "vmx_shadow_fields.h"
+};
+static int max_shadow_read_only_fields =
+ ARRAY_SIZE(shadow_read_only_fields);
+
+static u16 shadow_read_write_fields[] = {
+#define SHADOW_FIELD_RW(x) x,
+#include "vmx_shadow_fields.h"
+};
+static int max_shadow_read_write_fields =
+ ARRAY_SIZE(shadow_read_write_fields);
+
+static const unsigned short vmcs_field_to_offset_table[] = {
+ FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
+ FIELD(POSTED_INTR_NV, posted_intr_nv),
+ FIELD(GUEST_ES_SELECTOR, guest_es_selector),
+ FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
+ FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
+ FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
+ FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
+ FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
+ FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
+ FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
+ FIELD(GUEST_INTR_STATUS, guest_intr_status),
+ FIELD(GUEST_PML_INDEX, guest_pml_index),
+ FIELD(HOST_ES_SELECTOR, host_es_selector),
+ FIELD(HOST_CS_SELECTOR, host_cs_selector),
+ FIELD(HOST_SS_SELECTOR, host_ss_selector),
+ FIELD(HOST_DS_SELECTOR, host_ds_selector),
+ FIELD(HOST_FS_SELECTOR, host_fs_selector),
+ FIELD(HOST_GS_SELECTOR, host_gs_selector),
+ FIELD(HOST_TR_SELECTOR, host_tr_selector),
+ FIELD64(IO_BITMAP_A, io_bitmap_a),
+ FIELD64(IO_BITMAP_B, io_bitmap_b),
+ FIELD64(MSR_BITMAP, msr_bitmap),
+ FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
+ FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
+ FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
+ FIELD64(PML_ADDRESS, pml_address),
+ FIELD64(TSC_OFFSET, tsc_offset),
+ FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
+ FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
+ FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
+ FIELD64(VM_FUNCTION_CONTROL, vm_function_control),
+ FIELD64(EPT_POINTER, ept_pointer),
+ FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
+ FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
+ FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
+ FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
+ FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
+ FIELD64(VMREAD_BITMAP, vmread_bitmap),
+ FIELD64(VMWRITE_BITMAP, vmwrite_bitmap),
+ FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
+ FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
+ FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
+ FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
+ FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
+ FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
+ FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
+ FIELD64(GUEST_PDPTR0, guest_pdptr0),
+ FIELD64(GUEST_PDPTR1, guest_pdptr1),
+ FIELD64(GUEST_PDPTR2, guest_pdptr2),
+ FIELD64(GUEST_PDPTR3, guest_pdptr3),
+ FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
+ FIELD64(HOST_IA32_PAT, host_ia32_pat),
+ FIELD64(HOST_IA32_EFER, host_ia32_efer),
+ FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
+ FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
+ FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
+ FIELD(EXCEPTION_BITMAP, exception_bitmap),
+ FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
+ FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
+ FIELD(CR3_TARGET_COUNT, cr3_target_count),
+ FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
+ FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
+ FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
+ FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
+ FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
+ FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
+ FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
+ FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
+ FIELD(TPR_THRESHOLD, tpr_threshold),
+ FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
+ FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
+ FIELD(VM_EXIT_REASON, vm_exit_reason),
+ FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
+ FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
+ FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
+ FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
+ FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
+ FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
+ FIELD(GUEST_ES_LIMIT, guest_es_limit),
+ FIELD(GUEST_CS_LIMIT, guest_cs_limit),
+ FIELD(GUEST_SS_LIMIT, guest_ss_limit),
+ FIELD(GUEST_DS_LIMIT, guest_ds_limit),
+ FIELD(GUEST_FS_LIMIT, guest_fs_limit),
+ FIELD(GUEST_GS_LIMIT, guest_gs_limit),
+ FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
+ FIELD(GUEST_TR_LIMIT, guest_tr_limit),
+ FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
+ FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
+ FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
+ FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
+ FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
+ FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
+ FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
+ FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
+ FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
+ FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
+ FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
+ FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
+ FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
+ FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
+ FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
+ FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
+ FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
+ FIELD(CR0_READ_SHADOW, cr0_read_shadow),
+ FIELD(CR4_READ_SHADOW, cr4_read_shadow),
+ FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
+ FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
+ FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
+ FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
+ FIELD(EXIT_QUALIFICATION, exit_qualification),
+ FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
+ FIELD(GUEST_CR0, guest_cr0),
+ FIELD(GUEST_CR3, guest_cr3),
+ FIELD(GUEST_CR4, guest_cr4),
+ FIELD(GUEST_ES_BASE, guest_es_base),
+ FIELD(GUEST_CS_BASE, guest_cs_base),
+ FIELD(GUEST_SS_BASE, guest_ss_base),
+ FIELD(GUEST_DS_BASE, guest_ds_base),
+ FIELD(GUEST_FS_BASE, guest_fs_base),
+ FIELD(GUEST_GS_BASE, guest_gs_base),
+ FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
+ FIELD(GUEST_TR_BASE, guest_tr_base),
+ FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
+ FIELD(GUEST_IDTR_BASE, guest_idtr_base),
+ FIELD(GUEST_DR7, guest_dr7),
+ FIELD(GUEST_RSP, guest_rsp),
+ FIELD(GUEST_RIP, guest_rip),
+ FIELD(GUEST_RFLAGS, guest_rflags),
+ FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
+ FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
+ FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
+ FIELD(HOST_CR0, host_cr0),
+ FIELD(HOST_CR3, host_cr3),
+ FIELD(HOST_CR4, host_cr4),
+ FIELD(HOST_FS_BASE, host_fs_base),
+ FIELD(HOST_GS_BASE, host_gs_base),
+ FIELD(HOST_TR_BASE, host_tr_base),
+ FIELD(HOST_GDTR_BASE, host_gdtr_base),
+ FIELD(HOST_IDTR_BASE, host_idtr_base),
+ FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
+ FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
+ FIELD(HOST_RSP, host_rsp),
+ FIELD(HOST_RIP, host_rip),
+};
+
+static inline short vmcs_field_to_offset(unsigned long field)
+{
+ const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table);
+ unsigned short offset;
+ unsigned index;
+
+ if (field >> 15)
+ return -ENOENT;
+
+ index = ROL16(field, 6);
+ if (index >= size)
+ return -ENOENT;
+
+ index = array_index_nospec(index, size);
+ offset = vmcs_field_to_offset_table[index];
+ if (offset == 0)
+ return -ENOENT;
+ return offset;
+}
+
+static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.cached_vmcs12;
+}
+
+static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
+}
+
+static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
+static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
+static bool vmx_xsaves_supported(void);
+static void vmx_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
+static void vmx_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
+static bool guest_state_valid(struct kvm_vcpu *vcpu);
+static u32 vmx_segment_access_rights(struct kvm_segment *var);
+static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
+static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
+static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
+static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
+ u16 error_code);
+static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
+static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type);
+
+static DEFINE_PER_CPU(struct vmcs *, vmxarea);
+static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
+/*
+ * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
+ * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
+ */
+static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
+
+/*
+ * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
+ * can find which vCPU should be waken up.
+ */
+static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
+static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
+
+enum {
+ VMX_VMREAD_BITMAP,
+ VMX_VMWRITE_BITMAP,
+ VMX_BITMAP_NR
+};
+
+static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
+
+#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
+#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
+
+static bool cpu_has_load_ia32_efer;
+static bool cpu_has_load_perf_global_ctrl;
+
+static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
+static DEFINE_SPINLOCK(vmx_vpid_lock);
+
+static struct vmcs_config {
+ int size;
+ int order;
+ u32 basic_cap;
+ u32 revision_id;
+ u32 pin_based_exec_ctrl;
+ u32 cpu_based_exec_ctrl;
+ u32 cpu_based_2nd_exec_ctrl;
+ u32 vmexit_ctrl;
+ u32 vmentry_ctrl;
+ struct nested_vmx_msrs nested;
+} vmcs_config;
+
+static struct vmx_capability {
+ u32 ept;
+ u32 vpid;
+} vmx_capability;
+
+#define VMX_SEGMENT_FIELD(seg) \
+ [VCPU_SREG_##seg] = { \
+ .selector = GUEST_##seg##_SELECTOR, \
+ .base = GUEST_##seg##_BASE, \
+ .limit = GUEST_##seg##_LIMIT, \
+ .ar_bytes = GUEST_##seg##_AR_BYTES, \
+ }
+
+static const struct kvm_vmx_segment_field {
+ unsigned selector;
+ unsigned base;
+ unsigned limit;
+ unsigned ar_bytes;
+} kvm_vmx_segment_fields[] = {
+ VMX_SEGMENT_FIELD(CS),
+ VMX_SEGMENT_FIELD(DS),
+ VMX_SEGMENT_FIELD(ES),
+ VMX_SEGMENT_FIELD(FS),
+ VMX_SEGMENT_FIELD(GS),
+ VMX_SEGMENT_FIELD(SS),
+ VMX_SEGMENT_FIELD(TR),
+ VMX_SEGMENT_FIELD(LDTR),
+};
+
+static u64 host_efer;
+
+static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
+
+/*
+ * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
+ * away by decrementing the array size.
+ */
+static const u32 vmx_msr_index[] = {
+#ifdef CONFIG_X86_64
+ MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
+#endif
+ MSR_EFER, MSR_TSC_AUX, MSR_STAR,
+};
+
+DEFINE_STATIC_KEY_FALSE(enable_evmcs);
+
+#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
+
+#define KVM_EVMCS_VERSION 1
+
+#if IS_ENABLED(CONFIG_HYPERV)
+static bool __read_mostly enlightened_vmcs = true;
+module_param(enlightened_vmcs, bool, 0444);
+
+static inline void evmcs_write64(unsigned long field, u64 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u64 *)((char *)current_evmcs + offset) = value;
+
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static inline void evmcs_write32(unsigned long field, u32 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u32 *)((char *)current_evmcs + offset) = value;
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static inline void evmcs_write16(unsigned long field, u16 value)
+{
+ u16 clean_field;
+ int offset = get_evmcs_offset(field, &clean_field);
+
+ if (offset < 0)
+ return;
+
+ *(u16 *)((char *)current_evmcs + offset) = value;
+ current_evmcs->hv_clean_fields &= ~clean_field;
+}
+
+static inline u64 evmcs_read64(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u64 *)((char *)current_evmcs + offset);
+}
+
+static inline u32 evmcs_read32(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u32 *)((char *)current_evmcs + offset);
+}
+
+static inline u16 evmcs_read16(unsigned long field)
+{
+ int offset = get_evmcs_offset(field, NULL);
+
+ if (offset < 0)
+ return 0;
+
+ return *(u16 *)((char *)current_evmcs + offset);
+}
+
+static inline void evmcs_touch_msr_bitmap(void)
+{
+ if (unlikely(!current_evmcs))
+ return;
+
+ if (current_evmcs->hv_enlightenments_control.msr_bitmap)
+ current_evmcs->hv_clean_fields &=
+ ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
+}
+
+static void evmcs_load(u64 phys_addr)
+{
+ struct hv_vp_assist_page *vp_ap =
+ hv_get_vp_assist_page(smp_processor_id());
+
+ vp_ap->current_nested_vmcs = phys_addr;
+ vp_ap->enlighten_vmentry = 1;
+}
+
+static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
+{
+ /*
+ * Enlightened VMCSv1 doesn't support these:
+ *
+ * POSTED_INTR_NV = 0x00000002,
+ * GUEST_INTR_STATUS = 0x00000810,
+ * APIC_ACCESS_ADDR = 0x00002014,
+ * POSTED_INTR_DESC_ADDR = 0x00002016,
+ * EOI_EXIT_BITMAP0 = 0x0000201c,
+ * EOI_EXIT_BITMAP1 = 0x0000201e,
+ * EOI_EXIT_BITMAP2 = 0x00002020,
+ * EOI_EXIT_BITMAP3 = 0x00002022,
+ */
+ vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+ vmcs_conf->cpu_based_2nd_exec_ctrl &=
+ ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
+ vmcs_conf->cpu_based_2nd_exec_ctrl &=
+ ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ vmcs_conf->cpu_based_2nd_exec_ctrl &=
+ ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
+
+ /*
+ * GUEST_PML_INDEX = 0x00000812,
+ * PML_ADDRESS = 0x0000200e,
+ */
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML;
+
+ /* VM_FUNCTION_CONTROL = 0x00002018, */
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
+
+ /*
+ * EPTP_LIST_ADDRESS = 0x00002024,
+ * VMREAD_BITMAP = 0x00002026,
+ * VMWRITE_BITMAP = 0x00002028,
+ */
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS;
+
+ /*
+ * TSC_MULTIPLIER = 0x00002032,
+ */
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING;
+
+ /*
+ * PLE_GAP = 0x00004020,
+ * PLE_WINDOW = 0x00004022,
+ */
+ vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+
+ /*
+ * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
+ */
+ vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+
+ /*
+ * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
+ * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
+ */
+ vmcs_conf->vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ vmcs_conf->vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+
+ /*
+ * Currently unsupported in KVM:
+ * GUEST_IA32_RTIT_CTL = 0x00002814,
+ */
+}
+
+/* check_ept_pointer() should be under protection of ept_pointer_lock. */
+static void check_ept_pointer_match(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ u64 tmp_eptp = INVALID_PAGE;
+ int i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!VALID_PAGE(tmp_eptp)) {
+ tmp_eptp = to_vmx(vcpu)->ept_pointer;
+ } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
+ to_kvm_vmx(kvm)->ept_pointers_match
+ = EPT_POINTERS_MISMATCH;
+ return;
+ }
+ }
+
+ to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
+}
+
+static int vmx_hv_remote_flush_tlb(struct kvm *kvm)
+{
+ int ret;
+
+ spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+
+ if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
+ check_ept_pointer_match(kvm);
+
+ if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
+ ret = -ENOTSUPP;
+ goto out;
+ }
+
+ /*
+ * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address of the
+ * base of EPT PML4 table, strip off EPT configuration information.
+ */
+ ret = hyperv_flush_guest_mapping(
+ to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK);
+
+out:
+ spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+ return ret;
+}
+#else /* !IS_ENABLED(CONFIG_HYPERV) */
+static inline void evmcs_write64(unsigned long field, u64 value) {}
+static inline void evmcs_write32(unsigned long field, u32 value) {}
+static inline void evmcs_write16(unsigned long field, u16 value) {}
+static inline u64 evmcs_read64(unsigned long field) { return 0; }
+static inline u32 evmcs_read32(unsigned long field) { return 0; }
+static inline u16 evmcs_read16(unsigned long field) { return 0; }
+static inline void evmcs_load(u64 phys_addr) {}
+static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
+static inline void evmcs_touch_msr_bitmap(void) {}
+#endif /* IS_ENABLED(CONFIG_HYPERV) */
+
+static inline bool is_exception_n(u32 intr_info, u8 vector)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+ INTR_INFO_VALID_MASK)) ==
+ (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
+}
+
+static inline bool is_debug(u32 intr_info)
+{
+ return is_exception_n(intr_info, DB_VECTOR);
+}
+
+static inline bool is_breakpoint(u32 intr_info)
+{
+ return is_exception_n(intr_info, BP_VECTOR);
+}
+
+static inline bool is_page_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, PF_VECTOR);
+}
+
+static inline bool is_no_device(u32 intr_info)
+{
+ return is_exception_n(intr_info, NM_VECTOR);
+}
+
+static inline bool is_invalid_opcode(u32 intr_info)
+{
+ return is_exception_n(intr_info, UD_VECTOR);
+}
+
+static inline bool is_gp_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, GP_VECTOR);
+}
+
+static inline bool is_external_interrupt(u32 intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+ == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
+}
+
+static inline bool is_machine_check(u32 intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+ INTR_INFO_VALID_MASK)) ==
+ (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
+}
+
+/* Undocumented: icebp/int1 */
+static inline bool is_icebp(u32 intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+ == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK);
+}
+
+static inline bool cpu_has_vmx_msr_bitmap(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
+}
+
+static inline bool cpu_has_vmx_tpr_shadow(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
+}
+
+static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
+{
+ return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
+}
+
+static inline bool cpu_has_secondary_exec_ctrls(void)
+{
+ return vmcs_config.cpu_based_exec_ctrl &
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+}
+
+static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+}
+
+static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+}
+
+static inline bool cpu_has_vmx_apic_register_virt(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_APIC_REGISTER_VIRT;
+}
+
+static inline bool cpu_has_vmx_virtual_intr_delivery(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
+}
+
+static inline bool cpu_has_vmx_encls_vmexit(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENCLS_EXITING;
+}
+
+/*
+ * Comment's format: document - errata name - stepping - processor name.
+ * Refer from
+ * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
+ */
+static u32 vmx_preemption_cpu_tfms[] = {
+/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
+0x000206E6,
+/* 323056.pdf - AAX65 - C2 - Xeon L3406 */
+/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
+/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020652,
+/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020655,
+/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
+/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
+/*
+ * 320767.pdf - AAP86 - B1 -
+ * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
+ */
+0x000106E5,
+/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
+0x000106A0,
+/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
+0x000106A1,
+/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
+0x000106A4,
+ /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
+ /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
+ /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
+0x000106A5,
+};
+
+static inline bool cpu_has_broken_vmx_preemption_timer(void)
+{
+ u32 eax = cpuid_eax(0x00000001), i;
+
+ /* Clear the reserved bits */
+ eax &= ~(0x3U << 14 | 0xfU << 28);
+ for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
+ if (eax == vmx_preemption_cpu_tfms[i])
+ return true;
+
+ return false;
+}
+
+static inline bool cpu_has_vmx_preemption_timer(void)
+{
+ return vmcs_config.pin_based_exec_ctrl &
+ PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
+static inline bool cpu_has_vmx_posted_intr(void)
+{
+ return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
+ vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
+}
+
+static inline bool cpu_has_vmx_apicv(void)
+{
+ return cpu_has_vmx_apic_register_virt() &&
+ cpu_has_vmx_virtual_intr_delivery() &&
+ cpu_has_vmx_posted_intr();
+}
+
+static inline bool cpu_has_vmx_flexpriority(void)
+{
+ return cpu_has_vmx_tpr_shadow() &&
+ cpu_has_vmx_virtualize_apic_accesses();
+}
+
+static inline bool cpu_has_vmx_ept_execute_only(void)
+{
+ return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_2m_page(void)
+{
+ return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_1g_page(void)
+{
+ return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_4levels(void)
+{
+ return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_mt_wb(void)
+{
+ return vmx_capability.ept & VMX_EPTP_WB_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_5levels(void)
+{
+ return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT;
+}
+
+static inline bool cpu_has_vmx_ept_ad_bits(void)
+{
+ return vmx_capability.ept & VMX_EPT_AD_BIT;
+}
+
+static inline bool cpu_has_vmx_invept_context(void)
+{
+ return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_invept_global(void)
+{
+ return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid_individual_addr(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid_single(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid_global(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid(void)
+{
+ return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
+}
+
+static inline bool cpu_has_vmx_ept(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_EPT;
+}
+
+static inline bool cpu_has_vmx_unrestricted_guest(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST;
+}
+
+static inline bool cpu_has_vmx_ple(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+}
+
+static inline bool cpu_has_vmx_basic_inout(void)
+{
+ return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
+}
+
+static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
+{
+ return flexpriority_enabled && lapic_in_kernel(vcpu);
+}
+
+static inline bool cpu_has_vmx_vpid(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_VPID;
+}
+
+static inline bool cpu_has_vmx_rdtscp(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_RDTSCP;
+}
+
+static inline bool cpu_has_vmx_invpcid(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_INVPCID;
+}
+
+static inline bool cpu_has_virtual_nmis(void)
+{
+ return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
+}
+
+static inline bool cpu_has_vmx_wbinvd_exit(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_WBINVD_EXITING;
+}
+
+static inline bool cpu_has_vmx_shadow_vmcs(void)
+{
+ u64 vmx_msr;
+ rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+ /* check if the cpu supports writing r/o exit information fields */
+ if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
+ return false;
+
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_SHADOW_VMCS;
+}
+
+static inline bool cpu_has_vmx_pml(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
+}
+
+static inline bool cpu_has_vmx_tsc_scaling(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_TSC_SCALING;
+}
+
+static inline bool cpu_has_vmx_vmfunc(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_VMFUNC;
+}
+
+static bool vmx_umip_emulated(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_DESC;
+}
+
+static inline bool report_flexpriority(void)
+{
+ return flexpriority_enabled;
+}
+
+static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
+{
+ return vmx_misc_cr3_count(to_vmx(vcpu)->nested.msrs.misc_low);
+}
+
+/*
+ * Do the virtual VMX capability MSRs specify that L1 can use VMWRITE
+ * to modify any valid field of the VMCS, or are the VM-exit
+ * information fields read-only?
+ */
+static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.msrs.misc_low &
+ MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
+}
+
+static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS;
+}
+
+static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.msrs.procbased_ctls_high &
+ CPU_BASED_MONITOR_TRAP_FLAG;
+}
+
+static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
+ SECONDARY_EXEC_SHADOW_VMCS;
+}
+
+static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
+{
+ return vmcs12->cpu_based_vm_exec_control & bit;
+}
+
+static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
+{
+ return (vmcs12->cpu_based_vm_exec_control &
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
+ (vmcs12->secondary_vm_exec_control & bit);
+}
+
+static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control &
+ PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
+static inline bool nested_cpu_has_nmi_exiting(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control & PIN_BASED_NMI_EXITING;
+}
+
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
+}
+
+static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
+}
+
+static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
+}
+
+static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML);
+}
+
+static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
+}
+
+static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
+}
+
+static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
+}
+
+static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+}
+
+static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
+}
+
+static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
+}
+
+static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has_vmfunc(vmcs12) &&
+ (vmcs12->vm_function_control &
+ VMX_VMFUNC_EPTP_SWITCHING);
+}
+
+static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS);
+}
+
+static inline bool is_nmi(u32 intr_info)
+{
+ return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+ == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
+}
+
+static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
+ u32 exit_intr_info,
+ unsigned long exit_qualification);
+static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12,
+ u32 reason, unsigned long qualification);
+
+static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
+{
+ int i;
+
+ for (i = 0; i < vmx->nmsrs; ++i)
+ if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
+ return i;
+ return -1;
+}
+
+static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
+{
+ struct {
+ u64 vpid : 16;
+ u64 rsvd : 48;
+ u64 gva;
+ } operand = { vpid, 0, gva };
+ bool error;
+
+ asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na)
+ : CC_OUT(na) (error) : "a"(&operand), "c"(ext)
+ : "memory");
+ BUG_ON(error);
+}
+
+static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
+{
+ struct {
+ u64 eptp, gpa;
+ } operand = {eptp, gpa};
+ bool error;
+
+ asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na)
+ : CC_OUT(na) (error) : "a" (&operand), "c" (ext)
+ : "memory");
+ BUG_ON(error);
+}
+
+static void vmx_setup_fb_clear_ctrl(void)
+{
+ u64 msr;
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
+ !boot_cpu_has_bug(X86_BUG_MDS) &&
+ !boot_cpu_has_bug(X86_BUG_TAA)) {
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
+ if (msr & ARCH_CAP_FB_CLEAR_CTRL)
+ vmx_fb_clear_ctrl_available = true;
+ }
+}
+
+static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
+{
+ u64 msr;
+
+ if (!vmx->disable_fb_clear)
+ return;
+
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
+ msr |= FB_CLEAR_DIS;
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
+ /* Cache the MSR value to avoid reading it later */
+ vmx->msr_ia32_mcu_opt_ctrl = msr;
+}
+
+static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
+{
+ if (!vmx->disable_fb_clear)
+ return;
+
+ vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
+}
+
+static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
+{
+ vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;
+
+ /*
+ * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
+ * at VMEntry. Skip the MSR read/write when a guest has no use case to
+ * execute VERW.
+ */
+ if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
+ ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
+ vmx->disable_fb_clear = false;
+}
+
+static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
+{
+ int i;
+
+ i = __find_msr_index(vmx, msr);
+ if (i >= 0)
+ return &vmx->guest_msrs[i];
+ return NULL;
+}
+
+static void vmcs_clear(struct vmcs *vmcs)
+{
+ u64 phys_addr = __pa(vmcs);
+ bool error;
+
+ asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na)
+ : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
+ : "memory");
+ if (unlikely(error))
+ printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
+ vmcs, phys_addr);
+}
+
+static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
+{
+ vmcs_clear(loaded_vmcs->vmcs);
+ if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
+}
+
+static void vmcs_load(struct vmcs *vmcs)
+{
+ u64 phys_addr = __pa(vmcs);
+ bool error;
+
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_load(phys_addr);
+
+ asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na)
+ : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
+ : "memory");
+ if (unlikely(error))
+ printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
+ vmcs, phys_addr);
+}
+
+#ifdef CONFIG_KEXEC_CORE
+static void crash_vmclear_local_loaded_vmcss(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct loaded_vmcs *v;
+
+ list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ loaded_vmcss_on_cpu_link)
+ vmcs_clear(v->vmcs);
+}
+#endif /* CONFIG_KEXEC_CORE */
+
+static void __loaded_vmcs_clear(void *arg)
+{
+ struct loaded_vmcs *loaded_vmcs = arg;
+ int cpu = raw_smp_processor_id();
+
+ if (loaded_vmcs->cpu != cpu)
+ return; /* vcpu migration can race with cpu offline */
+ if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
+ per_cpu(current_vmcs, cpu) = NULL;
+
+ vmcs_clear(loaded_vmcs->vmcs);
+ if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
+
+ list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
+
+ /*
+ * Ensure all writes to loaded_vmcs, including deleting it from its
+ * current percpu list, complete before setting loaded_vmcs->vcpu to
+ * -1, otherwise a different cpu can see vcpu == -1 first and add
+ * loaded_vmcs to its percpu list before it's deleted from this cpu's
+ * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
+ */
+ smp_wmb();
+
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
+}
+
+static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
+{
+ int cpu = loaded_vmcs->cpu;
+
+ if (cpu != -1)
+ smp_call_function_single(cpu,
+ __loaded_vmcs_clear, loaded_vmcs, 1);
+}
+
+static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr)
+{
+ if (vpid == 0)
+ return true;
+
+ if (cpu_has_vmx_invvpid_individual_addr()) {
+ __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
+ return true;
+ }
+
+ return false;
+}
+
+static inline void vpid_sync_vcpu_single(int vpid)
+{
+ if (vpid == 0)
+ return;
+
+ if (cpu_has_vmx_invvpid_single())
+ __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
+}
+
+static inline void vpid_sync_vcpu_global(void)
+{
+ if (cpu_has_vmx_invvpid_global())
+ __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
+}
+
+static inline void vpid_sync_context(int vpid)
+{
+ if (cpu_has_vmx_invvpid_single())
+ vpid_sync_vcpu_single(vpid);
+ else
+ vpid_sync_vcpu_global();
+}
+
+static inline void ept_sync_global(void)
+{
+ __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
+}
+
+static inline void ept_sync_context(u64 eptp)
+{
+ if (cpu_has_vmx_invept_context())
+ __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
+ else
+ ept_sync_global();
+}
+
+static __always_inline void vmcs_check16(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+ "16-bit accessor invalid for 64-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "16-bit accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "16-bit accessor invalid for 32-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "16-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_check32(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "32-bit accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "32-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_check64(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "64-bit accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "64-bit accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "64-bit accessor invalid for 32-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "64-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_checkl(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "Natural width accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+ "Natural width accessor invalid for 64-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "Natural width accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "Natural width accessor invalid for 32-bit field");
+}
+
+static __always_inline unsigned long __vmcs_readl(unsigned long field)
+{
+ unsigned long value;
+
+ asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
+ : "=a"(value) : "d"(field) : "cc");
+ return value;
+}
+
+static __always_inline u16 vmcs_read16(unsigned long field)
+{
+ vmcs_check16(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read16(field);
+ return __vmcs_readl(field);
+}
+
+static __always_inline u32 vmcs_read32(unsigned long field)
+{
+ vmcs_check32(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read32(field);
+ return __vmcs_readl(field);
+}
+
+static __always_inline u64 vmcs_read64(unsigned long field)
+{
+ vmcs_check64(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read64(field);
+#ifdef CONFIG_X86_64
+ return __vmcs_readl(field);
+#else
+ return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
+#endif
+}
+
+static __always_inline unsigned long vmcs_readl(unsigned long field)
+{
+ vmcs_checkl(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_read64(field);
+ return __vmcs_readl(field);
+}
+
+static noinline void vmwrite_error(unsigned long field, unsigned long value)
+{
+ printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
+ field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
+ dump_stack();
+}
+
+static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
+{
+ bool error;
+
+ asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) CC_SET(na)
+ : CC_OUT(na) (error) : "a"(value), "d"(field));
+ if (unlikely(error))
+ vmwrite_error(field, value);
+}
+
+static __always_inline void vmcs_write16(unsigned long field, u16 value)
+{
+ vmcs_check16(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write16(field, value);
+
+ __vmcs_writel(field, value);
+}
+
+static __always_inline void vmcs_write32(unsigned long field, u32 value)
+{
+ vmcs_check32(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write32(field, value);
+
+ __vmcs_writel(field, value);
+}
+
+static __always_inline void vmcs_write64(unsigned long field, u64 value)
+{
+ vmcs_check64(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write64(field, value);
+
+ __vmcs_writel(field, value);
+#ifndef CONFIG_X86_64
+ asm volatile ("");
+ __vmcs_writel(field+1, value >> 32);
+#endif
+}
+
+static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
+{
+ vmcs_checkl(field);
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write64(field, value);
+
+ __vmcs_writel(field, value);
+}
+
+static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
+ "vmcs_clear_bits does not support 64-bit fields");
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write32(field, evmcs_read32(field) & ~mask);
+
+ __vmcs_writel(field, __vmcs_readl(field) & ~mask);
+}
+
+static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
+ "vmcs_set_bits does not support 64-bit fields");
+ if (static_branch_unlikely(&enable_evmcs))
+ return evmcs_write32(field, evmcs_read32(field) | mask);
+
+ __vmcs_writel(field, __vmcs_readl(field) | mask);
+}
+
+static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
+{
+ vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
+}
+
+static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
+{
+ vmcs_write32(VM_ENTRY_CONTROLS, val);
+ vmx->vm_entry_controls_shadow = val;
+}
+
+static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
+{
+ if (vmx->vm_entry_controls_shadow != val)
+ vm_entry_controls_init(vmx, val);
+}
+
+static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
+{
+ return vmx->vm_entry_controls_shadow;
+}
+
+
+static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
+{
+ vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
+}
+
+static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
+{
+ vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
+}
+
+static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
+{
+ vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
+}
+
+static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
+{
+ vmcs_write32(VM_EXIT_CONTROLS, val);
+ vmx->vm_exit_controls_shadow = val;
+}
+
+static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
+{
+ if (vmx->vm_exit_controls_shadow != val)
+ vm_exit_controls_init(vmx, val);
+}
+
+static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
+{
+ return vmx->vm_exit_controls_shadow;
+}
+
+
+static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
+{
+ vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
+}
+
+static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
+{
+ vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
+}
+
+static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+{
+ vmx->segment_cache.bitmask = 0;
+}
+
+static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
+ unsigned field)
+{
+ bool ret;
+ u32 mask = 1 << (seg * SEG_FIELD_NR + field);
+
+ if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
+ vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
+ vmx->segment_cache.bitmask = 0;
+ }
+ ret = vmx->segment_cache.bitmask & mask;
+ vmx->segment_cache.bitmask |= mask;
+ return ret;
+}
+
+static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
+{
+ u16 *p = &vmx->segment_cache.seg[seg].selector;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
+ *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
+ return *p;
+}
+
+static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
+{
+ ulong *p = &vmx->segment_cache.seg[seg].base;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
+ *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
+ return *p;
+}
+
+static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
+{
+ u32 *p = &vmx->segment_cache.seg[seg].limit;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
+ *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
+ return *p;
+}
+
+static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
+{
+ u32 *p = &vmx->segment_cache.seg[seg].ar;
+
+ if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
+ *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
+ return *p;
+}
+
+static void update_exception_bitmap(struct kvm_vcpu *vcpu)
+{
+ u32 eb;
+
+ eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
+ (1u << DB_VECTOR) | (1u << AC_VECTOR);
+ /*
+ * Guest access to VMware backdoor ports could legitimately
+ * trigger #GP because of TSS I/O permission bitmap.
+ * We intercept those #GP and allow access to them anyway
+ * as VMware does.
+ */
+ if (enable_vmware_backdoor)
+ eb |= (1u << GP_VECTOR);
+ if ((vcpu->guest_debug &
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
+ (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
+ eb |= 1u << BP_VECTOR;
+ if (to_vmx(vcpu)->rmode.vm86_active)
+ eb = ~0;
+ if (enable_ept)
+ eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
+
+ /* When we are running a nested L2 guest and L1 specified for it a
+ * certain exception bitmap, we must trap the same exceptions and pass
+ * them to L1. When running L2, we will only handle the exceptions
+ * specified above if L1 did not want them.
+ */
+ if (is_guest_mode(vcpu))
+ eb |= get_vmcs12(vcpu)->exception_bitmap;
+
+ vmcs_write32(EXCEPTION_BITMAP, eb);
+}
+
+/*
+ * Check if MSR is intercepted for currently loaded MSR bitmap.
+ */
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+{
+ unsigned long *msr_bitmap;
+ int f = sizeof(unsigned long);
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return true;
+
+ msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
+
+ if (msr <= 0x1fff) {
+ return !!test_bit(msr, msr_bitmap + 0x800 / f);
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ return !!test_bit(msr, msr_bitmap + 0xc00 / f);
+ }
+
+ return true;
+}
+
+/*
+ * Check if MSR is intercepted for L01 MSR bitmap.
+ */
+static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
+{
+ unsigned long *msr_bitmap;
+ int f = sizeof(unsigned long);
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return true;
+
+ msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
+
+ if (msr <= 0x1fff) {
+ return !!test_bit(msr, msr_bitmap + 0x800 / f);
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ return !!test_bit(msr, msr_bitmap + 0xc00 / f);
+ }
+
+ return true;
+}
+
+static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+ unsigned long entry, unsigned long exit)
+{
+ vm_entry_controls_clearbit(vmx, entry);
+ vm_exit_controls_clearbit(vmx, exit);
+}
+
+static int find_msr(struct vmx_msrs *m, unsigned int msr)
+{
+ unsigned int i;
+
+ for (i = 0; i < m->nr; ++i) {
+ if (m->val[i].index == msr)
+ return i;
+ }
+ return -ENOENT;
+}
+
+static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
+{
+ int i;
+ struct msr_autoload *m = &vmx->msr_autoload;
+
+ switch (msr) {
+ case MSR_EFER:
+ if (cpu_has_load_ia32_efer) {
+ clear_atomic_switch_msr_special(vmx,
+ VM_ENTRY_LOAD_IA32_EFER,
+ VM_EXIT_LOAD_IA32_EFER);
+ return;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (cpu_has_load_perf_global_ctrl) {
+ clear_atomic_switch_msr_special(vmx,
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
+ return;
+ }
+ break;
+ }
+ i = find_msr(&m->guest, msr);
+ if (i < 0)
+ goto skip_guest;
+ --m->guest.nr;
+ m->guest.val[i] = m->guest.val[m->guest.nr];
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
+
+skip_guest:
+ i = find_msr(&m->host, msr);
+ if (i < 0)
+ return;
+
+ --m->host.nr;
+ m->host.val[i] = m->host.val[m->host.nr];
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
+}
+
+static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+ unsigned long entry, unsigned long exit,
+ unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
+ u64 guest_val, u64 host_val)
+{
+ vmcs_write64(guest_val_vmcs, guest_val);
+ vmcs_write64(host_val_vmcs, host_val);
+ vm_entry_controls_setbit(vmx, entry);
+ vm_exit_controls_setbit(vmx, exit);
+}
+
+static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
+ u64 guest_val, u64 host_val, bool entry_only)
+{
+ int i, j = 0;
+ struct msr_autoload *m = &vmx->msr_autoload;
+
+ switch (msr) {
+ case MSR_EFER:
+ if (cpu_has_load_ia32_efer) {
+ add_atomic_switch_msr_special(vmx,
+ VM_ENTRY_LOAD_IA32_EFER,
+ VM_EXIT_LOAD_IA32_EFER,
+ GUEST_IA32_EFER,
+ HOST_IA32_EFER,
+ guest_val, host_val);
+ return;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (cpu_has_load_perf_global_ctrl) {
+ add_atomic_switch_msr_special(vmx,
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
+ GUEST_IA32_PERF_GLOBAL_CTRL,
+ HOST_IA32_PERF_GLOBAL_CTRL,
+ guest_val, host_val);
+ return;
+ }
+ break;
+ case MSR_IA32_PEBS_ENABLE:
+ /* PEBS needs a quiescent period after being disabled (to write
+ * a record). Disabling PEBS through VMX MSR swapping doesn't
+ * provide that period, so a CPU could write host's record into
+ * guest's memory.
+ */
+ wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+ }
+
+ i = find_msr(&m->guest, msr);
+ if (!entry_only)
+ j = find_msr(&m->host, msr);
+
+ if ((i < 0 && m->guest.nr == NR_AUTOLOAD_MSRS) ||
+ (j < 0 && m->host.nr == NR_AUTOLOAD_MSRS)) {
+ printk_once(KERN_WARNING "Not enough msr switch entries. "
+ "Can't add msr %x\n", msr);
+ return;
+ }
+ if (i < 0) {
+ i = m->guest.nr++;
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
+ }
+ m->guest.val[i].index = msr;
+ m->guest.val[i].value = guest_val;
+
+ if (entry_only)
+ return;
+
+ if (j < 0) {
+ j = m->host.nr++;
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
+ }
+ m->host.val[j].index = msr;
+ m->host.val[j].value = host_val;
+}
+
+static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
+{
+ u64 guest_efer = vmx->vcpu.arch.efer;
+ u64 ignore_bits = 0;
+
+ /* Shadow paging assumes NX to be available. */
+ if (!enable_ept)
+ guest_efer |= EFER_NX;
+
+ /*
+ * LMA and LME handled by hardware; SCE meaningless outside long mode.
+ */
+ ignore_bits |= EFER_SCE;
+#ifdef CONFIG_X86_64
+ ignore_bits |= EFER_LMA | EFER_LME;
+ /* SCE is meaningful only in long mode on Intel */
+ if (guest_efer & EFER_LMA)
+ ignore_bits &= ~(u64)EFER_SCE;
+#endif
+
+ clear_atomic_switch_msr(vmx, MSR_EFER);
+
+ /*
+ * On EPT, we can't emulate NX, so we must switch EFER atomically.
+ * On CPUs that support "load IA32_EFER", always switch EFER
+ * atomically, since it's faster than switching it manually.
+ */
+ if (cpu_has_load_ia32_efer ||
+ (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
+ if (!(guest_efer & EFER_LMA))
+ guest_efer &= ~EFER_LME;
+ if (guest_efer != host_efer)
+ add_atomic_switch_msr(vmx, MSR_EFER,
+ guest_efer, host_efer, false);
+ return false;
+ } else {
+ guest_efer &= ~ignore_bits;
+ guest_efer |= host_efer & ignore_bits;
+
+ vmx->guest_msrs[efer_offset].data = guest_efer;
+ vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+
+ return true;
+ }
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * On 32-bit kernels, VM exits still load the FS and GS bases from the
+ * VMCS rather than the segment table. KVM uses this helper to figure
+ * out the current bases to poke them into the VMCS before entry.
+ */
+static unsigned long segment_base(u16 selector)
+{
+ struct desc_struct *table;
+ unsigned long v;
+
+ if (!(selector & ~SEGMENT_RPL_MASK))
+ return 0;
+
+ table = get_current_gdt_ro();
+
+ if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+ u16 ldt_selector = kvm_read_ldt();
+
+ if (!(ldt_selector & ~SEGMENT_RPL_MASK))
+ return 0;
+
+ table = (struct desc_struct *)segment_base(ldt_selector);
+ }
+ v = get_desc_base(&table[selector >> 3]);
+ return v;
+}
+#endif
+
+static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs_host_state *host_state;
+#ifdef CONFIG_X86_64
+ int cpu = raw_smp_processor_id();
+#endif
+ unsigned long fs_base, gs_base;
+ u16 fs_sel, gs_sel;
+ int i;
+
+ vmx->req_immediate_exit = false;
+
+ /*
+ * Note that guest MSRs to be saved/restored can also be changed
+ * when guest state is loaded. This happens when guest transitions
+ * to/from long-mode by setting MSR_EFER.LMA.
+ */
+ if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) {
+ vmx->guest_msrs_dirty = false;
+ for (i = 0; i < vmx->save_nmsrs; ++i)
+ kvm_set_shared_msr(vmx->guest_msrs[i].index,
+ vmx->guest_msrs[i].data,
+ vmx->guest_msrs[i].mask);
+
+ }
+
+ if (vmx->loaded_cpu_state)
+ return;
+
+ vmx->loaded_cpu_state = vmx->loaded_vmcs;
+ host_state = &vmx->loaded_cpu_state->host_state;
+
+ /*
+ * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
+ * allow segment selectors with cpl > 0 or ti == 1.
+ */
+ host_state->ldt_sel = kvm_read_ldt();
+
+#ifdef CONFIG_X86_64
+ savesegment(ds, host_state->ds_sel);
+ savesegment(es, host_state->es_sel);
+
+ gs_base = cpu_kernelmode_gs_base(cpu);
+ if (likely(is_64bit_mm(current->mm))) {
+ save_fsgs_for_kvm();
+ fs_sel = current->thread.fsindex;
+ gs_sel = current->thread.gsindex;
+ fs_base = current->thread.fsbase;
+ vmx->msr_host_kernel_gs_base = current->thread.gsbase;
+ } else {
+ savesegment(fs, fs_sel);
+ savesegment(gs, gs_sel);
+ fs_base = read_msr(MSR_FS_BASE);
+ vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+ }
+
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+#else
+ savesegment(fs, fs_sel);
+ savesegment(gs, gs_sel);
+ fs_base = segment_base(fs_sel);
+ gs_base = segment_base(gs_sel);
+#endif
+
+ if (unlikely(fs_sel != host_state->fs_sel)) {
+ if (!(fs_sel & 7))
+ vmcs_write16(HOST_FS_SELECTOR, fs_sel);
+ else
+ vmcs_write16(HOST_FS_SELECTOR, 0);
+ host_state->fs_sel = fs_sel;
+ }
+ if (unlikely(gs_sel != host_state->gs_sel)) {
+ if (!(gs_sel & 7))
+ vmcs_write16(HOST_GS_SELECTOR, gs_sel);
+ else
+ vmcs_write16(HOST_GS_SELECTOR, 0);
+ host_state->gs_sel = gs_sel;
+ }
+ if (unlikely(fs_base != host_state->fs_base)) {
+ vmcs_writel(HOST_FS_BASE, fs_base);
+ host_state->fs_base = fs_base;
+ }
+ if (unlikely(gs_base != host_state->gs_base)) {
+ vmcs_writel(HOST_GS_BASE, gs_base);
+ host_state->gs_base = gs_base;
+ }
+}
+
+static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
+{
+ struct vmcs_host_state *host_state;
+
+ if (!vmx->loaded_cpu_state)
+ return;
+
+ WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs);
+ host_state = &vmx->loaded_cpu_state->host_state;
+
+ ++vmx->vcpu.stat.host_state_reload;
+ vmx->loaded_cpu_state = NULL;
+
+#ifdef CONFIG_X86_64
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+#endif
+ if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
+ kvm_load_ldt(host_state->ldt_sel);
+#ifdef CONFIG_X86_64
+ load_gs_index(host_state->gs_sel);
+#else
+ loadsegment(gs, host_state->gs_sel);
+#endif
+ }
+ if (host_state->fs_sel & 7)
+ loadsegment(fs, host_state->fs_sel);
+#ifdef CONFIG_X86_64
+ if (unlikely(host_state->ds_sel | host_state->es_sel)) {
+ loadsegment(ds, host_state->ds_sel);
+ loadsegment(es, host_state->es_sel);
+ }
+#endif
+ invalidate_tss_limit();
+#ifdef CONFIG_X86_64
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+#endif
+ load_fixmap_gdt(raw_smp_processor_id());
+}
+
+#ifdef CONFIG_X86_64
+static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
+{
+ preempt_disable();
+ if (vmx->loaded_cpu_state)
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ preempt_enable();
+ return vmx->msr_guest_kernel_gs_base;
+}
+
+static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
+{
+ preempt_disable();
+ if (vmx->loaded_cpu_state)
+ wrmsrl(MSR_KERNEL_GS_BASE, data);
+ preempt_enable();
+ vmx->msr_guest_kernel_gs_base = data;
+}
+#endif
+
+static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+
+ /*
+ * In case of hot-plug or hot-unplug, we may have to undo
+ * vmx_vcpu_pi_put even if there is no assigned device. And we
+ * always keep PI.NDST up to date for simplicity: it makes the
+ * code easier, and CPU migration is not a fast path.
+ */
+ if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
+ return;
+
+ /*
+ * First handle the simple case where no cmpxchg is necessary; just
+ * allow posting non-urgent interrupts.
+ *
+ * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
+ * PI.NDST: pi_post_block will do it for us and the wakeup_handler
+ * expects the VCPU to be on the blocked_vcpu_list that matches
+ * PI.NDST.
+ */
+ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
+ vcpu->cpu == cpu) {
+ pi_clear_sn(pi_desc);
+ return;
+ }
+
+ /* The full case. */
+ do {
+ old.control = new.control = pi_desc->control;
+
+ dest = cpu_physical_id(cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ new.sn = 0;
+ } while (cmpxchg64(&pi_desc->control, old.control,
+ new.control) != old.control);
+}
+
+static void decache_tsc_multiplier(struct vcpu_vmx *vmx)
+{
+ vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
+ vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
+}
+
+/*
+ * Switches to specified vcpu, until a matching vcpu_put(), but assumes
+ * vcpu mutex is already taken.
+ */
+static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
+
+ if (!already_loaded) {
+ loaded_vmcs_clear(vmx->loaded_vmcs);
+ local_irq_disable();
+
+ /*
+ * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
+ * this cpu's percpu list, otherwise it may not yet be deleted
+ * from its previous cpu's percpu list. Pairs with the
+ * smb_wmb() in __loaded_vmcs_clear().
+ */
+ smp_rmb();
+
+ list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
+ &per_cpu(loaded_vmcss_on_cpu, cpu));
+ local_irq_enable();
+ }
+
+ if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+ per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+ vmcs_load(vmx->loaded_vmcs->vmcs);
+ indirect_branch_prediction_barrier();
+ }
+
+ if (!already_loaded) {
+ void *gdt = get_current_gdt_ro();
+ unsigned long sysenter_esp;
+
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+
+ /*
+ * Linux uses per-cpu TSS and GDT, so set these when switching
+ * processors. See 22.2.4.
+ */
+ vmcs_writel(HOST_TR_BASE,
+ (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
+ vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
+
+ /*
+ * VM exits change the host TR limit to 0x67 after a VM
+ * exit. This is okay, since 0x67 covers everything except
+ * the IO bitmap and have have code to handle the IO bitmap
+ * being lost after a VM exit.
+ */
+ BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
+
+ rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
+ vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+
+ vmx->loaded_vmcs->cpu = cpu;
+ }
+
+ /* Setup TSC multiplier */
+ if (kvm_has_tsc_control &&
+ vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
+ decache_tsc_multiplier(vmx);
+
+ vmx_vcpu_pi_load(vcpu, cpu);
+ vmx->host_pkru = read_pkru();
+ vmx->host_debugctlmsr = get_debugctlmsr();
+}
+
+static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP) ||
+ !kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ /* Set SN when the vCPU is preempted */
+ if (vcpu->preempted)
+ pi_set_sn(pi_desc);
+}
+
+static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ vmx_vcpu_pi_put(vcpu);
+
+ vmx_prepare_switch_to_host(to_vmx(vcpu));
+}
+
+static bool emulation_required(struct kvm_vcpu *vcpu)
+{
+ return emulate_invalid_guest_state && !guest_state_valid(vcpu);
+}
+
+static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
+
+/*
+ * Return the cr0 value that a nested guest would read. This is a combination
+ * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
+ * its hypervisor (cr0_read_shadow).
+ */
+static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
+{
+ return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
+ (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
+}
+static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
+{
+ return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
+ (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
+}
+
+static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
+{
+ unsigned long rflags, save_rflags;
+
+ if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
+ __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
+ rflags = vmcs_readl(GUEST_RFLAGS);
+ if (to_vmx(vcpu)->rmode.vm86_active) {
+ rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
+ save_rflags = to_vmx(vcpu)->rmode.save_rflags;
+ rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
+ }
+ to_vmx(vcpu)->rflags = rflags;
+ }
+ return to_vmx(vcpu)->rflags;
+}
+
+static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ unsigned long old_rflags = vmx_get_rflags(vcpu);
+
+ __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
+ to_vmx(vcpu)->rflags = rflags;
+ if (to_vmx(vcpu)->rmode.vm86_active) {
+ to_vmx(vcpu)->rmode.save_rflags = rflags;
+ rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
+ }
+ vmcs_writel(GUEST_RFLAGS, rflags);
+
+ if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
+ to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
+}
+
+static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
+{
+ u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ int ret = 0;
+
+ if (interruptibility & GUEST_INTR_STATE_STI)
+ ret |= KVM_X86_SHADOW_INT_STI;
+ if (interruptibility & GUEST_INTR_STATE_MOV_SS)
+ ret |= KVM_X86_SHADOW_INT_MOV_SS;
+
+ return ret;
+}
+
+static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ u32 interruptibility = interruptibility_old;
+
+ interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
+
+ if (mask & KVM_X86_SHADOW_INT_MOV_SS)
+ interruptibility |= GUEST_INTR_STATE_MOV_SS;
+ else if (mask & KVM_X86_SHADOW_INT_STI)
+ interruptibility |= GUEST_INTR_STATE_STI;
+
+ if ((interruptibility != interruptibility_old))
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
+}
+
+static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ unsigned long rip;
+
+ rip = kvm_rip_read(vcpu);
+ rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ kvm_rip_write(vcpu, rip);
+
+ /* skipping an emulated instruction also counts */
+ vmx_set_interrupt_shadow(vcpu, 0);
+}
+
+static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
+ unsigned long exit_qual)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned int nr = vcpu->arch.exception.nr;
+ u32 intr_info = nr | INTR_INFO_VALID_MASK;
+
+ if (vcpu->arch.exception.has_error_code) {
+ vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
+ intr_info |= INTR_INFO_DELIVER_CODE_MASK;
+ }
+
+ if (kvm_exception_is_soft(nr))
+ intr_info |= INTR_TYPE_SOFT_EXCEPTION;
+ else
+ intr_info |= INTR_TYPE_HARD_EXCEPTION;
+
+ if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
+ vmx_get_nmi_mask(vcpu))
+ intr_info |= INTR_INFO_UNBLOCK_NMI;
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
+}
+
+/*
+ * KVM wants to inject page-faults which it got to the guest. This function
+ * checks whether in a nested guest, we need to inject them to L1 or L2.
+ */
+static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned int nr = vcpu->arch.exception.nr;
+
+ if (nr == PF_VECTOR) {
+ if (vcpu->arch.exception.nested_apf) {
+ *exit_qual = vcpu->arch.apf.nested_apf_token;
+ return 1;
+ }
+ /*
+ * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
+ * The fix is to add the ancillary datum (CR2 or DR6) to structs
+ * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
+ * can be written only when inject_pending_event runs. This should be
+ * conditional on a new capability---if the capability is disabled,
+ * kvm_multiple_exception would write the ancillary information to
+ * CR2 or DR6, for backwards ABI-compatibility.
+ */
+ if (nested_vmx_is_page_fault_vmexit(vmcs12,
+ vcpu->arch.exception.error_code)) {
+ *exit_qual = vcpu->arch.cr2;
+ return 1;
+ }
+ } else {
+ if (vmcs12->exception_bitmap & (1u << nr)) {
+ if (nr == DB_VECTOR) {
+ *exit_qual = vcpu->arch.dr6;
+ *exit_qual &= ~(DR6_FIXED_1 | DR6_BT);
+ *exit_qual ^= DR6_RTM;
+ } else {
+ *exit_qual = 0;
+ }
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Ensure that we clear the HLT state in the VMCS. We don't need to
+ * explicitly skip the instruction because if the HLT state is set,
+ * then the instruction is already executing and RIP has already been
+ * advanced.
+ */
+ if (kvm_hlt_in_guest(vcpu->kvm) &&
+ vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+}
+
+static void vmx_queue_exception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned nr = vcpu->arch.exception.nr;
+ bool has_error_code = vcpu->arch.exception.has_error_code;
+ u32 error_code = vcpu->arch.exception.error_code;
+ u32 intr_info = nr | INTR_INFO_VALID_MASK;
+
+ if (has_error_code) {
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+ intr_info |= INTR_INFO_DELIVER_CODE_MASK;
+ }
+
+ if (vmx->rmode.vm86_active) {
+ int inc_eip = 0;
+ if (kvm_exception_is_soft(nr))
+ inc_eip = vcpu->arch.event_exit_inst_len;
+ if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return;
+ }
+
+ WARN_ON_ONCE(vmx->emulation_required);
+
+ if (kvm_exception_is_soft(nr)) {
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmx->vcpu.arch.event_exit_inst_len);
+ intr_info |= INTR_TYPE_SOFT_EXCEPTION;
+ } else
+ intr_info |= INTR_TYPE_HARD_EXCEPTION;
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
+
+ vmx_clear_hlt(vcpu);
+}
+
+static bool vmx_rdtscp_supported(void)
+{
+ return cpu_has_vmx_rdtscp();
+}
+
+static bool vmx_invpcid_supported(void)
+{
+ return cpu_has_vmx_invpcid();
+}
+
+/*
+ * Swap MSR entry in host/guest MSR entry array.
+ */
+static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
+{
+ struct shared_msr_entry tmp;
+
+ tmp = vmx->guest_msrs[to];
+ vmx->guest_msrs[to] = vmx->guest_msrs[from];
+ vmx->guest_msrs[from] = tmp;
+}
+
+/*
+ * Set up the vmcs to automatically save and restore system
+ * msrs. Don't touch the 64-bit msrs if the guest is in legacy
+ * mode, as fiddling with msrs is very expensive.
+ */
+static void setup_msrs(struct vcpu_vmx *vmx)
+{
+ int save_nmsrs, index;
+
+ save_nmsrs = 0;
+#ifdef CONFIG_X86_64
+ if (is_long_mode(&vmx->vcpu)) {
+ index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
+ if (index >= 0)
+ move_msr_up(vmx, index, save_nmsrs++);
+ index = __find_msr_index(vmx, MSR_LSTAR);
+ if (index >= 0)
+ move_msr_up(vmx, index, save_nmsrs++);
+ index = __find_msr_index(vmx, MSR_CSTAR);
+ if (index >= 0)
+ move_msr_up(vmx, index, save_nmsrs++);
+ /*
+ * MSR_STAR is only needed on long mode guests, and only
+ * if efer.sce is enabled.
+ */
+ index = __find_msr_index(vmx, MSR_STAR);
+ if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
+ move_msr_up(vmx, index, save_nmsrs++);
+ }
+#endif
+ index = __find_msr_index(vmx, MSR_EFER);
+ if (index >= 0 && update_transition_efer(vmx, index))
+ move_msr_up(vmx, index, save_nmsrs++);
+ index = __find_msr_index(vmx, MSR_TSC_AUX);
+ if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
+ move_msr_up(vmx, index, save_nmsrs++);
+
+ vmx->save_nmsrs = save_nmsrs;
+ vmx->guest_msrs_dirty = true;
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmx_update_msr_bitmap(&vmx->vcpu);
+}
+
+static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ if (is_guest_mode(vcpu) &&
+ (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
+ return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
+
+ return vcpu->arch.tsc_offset;
+}
+
+static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+ u64 active_offset = offset;
+ if (is_guest_mode(vcpu)) {
+ /*
+ * We're here if L1 chose not to trap WRMSR to TSC. According
+ * to the spec, this should set L1's TSC; The offset that L1
+ * set for L2 remains unchanged, and still needs to be added
+ * to the newly set TSC to get L2's TSC.
+ */
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING))
+ active_offset += vmcs12->tsc_offset;
+ } else {
+ trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+ vmcs_read64(TSC_OFFSET), offset);
+ }
+
+ vmcs_write64(TSC_OFFSET, active_offset);
+ return active_offset;
+}
+
+/*
+ * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
+ * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
+ * all guests if the "nested" module option is off, and can also be disabled
+ * for a single guest by disabling its VMX cpuid bit.
+ */
+static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
+{
+ return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
+}
+
+/*
+ * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
+ * returned for the various VMX controls MSRs when nested VMX is enabled.
+ * The same values should also be used to verify that vmcs12 control fields are
+ * valid during nested entry from L1 to L2.
+ * Each of these control msrs has a low and high 32-bit half: A low bit is on
+ * if the corresponding bit in the (32-bit) control field *must* be on, and a
+ * bit in the high half is on if the corresponding bit in the control field
+ * may be on. See also vmx_control_verify().
+ */
+static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
+{
+ if (!nested) {
+ memset(msrs, 0, sizeof(*msrs));
+ return;
+ }
+
+ /*
+ * Note that as a general rule, the high half of the MSRs (bits in
+ * the control fields which may be 1) should be initialized by the
+ * intersection of the underlying hardware's MSR (i.e., features which
+ * can be supported) and the list of features we want to expose -
+ * because they are known to be properly supported in our code.
+ * Also, usually, the low half of the MSRs (bits which must be 1) can
+ * be set to 0, meaning that L1 may turn off any of these bits. The
+ * reason is that if one of these bits is necessary, it will appear
+ * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
+ * fields of vmcs01 and vmcs02, will turn these bits off - and
+ * nested_vmx_exit_reflected() will not pass related exits to L1.
+ * These rules have exceptions below.
+ */
+
+ /* pin-based controls */
+ rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
+ msrs->pinbased_ctls_low,
+ msrs->pinbased_ctls_high);
+ msrs->pinbased_ctls_low |=
+ PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+ msrs->pinbased_ctls_high &=
+ PIN_BASED_EXT_INTR_MASK |
+ PIN_BASED_NMI_EXITING |
+ PIN_BASED_VIRTUAL_NMIS |
+ (apicv ? PIN_BASED_POSTED_INTR : 0);
+ msrs->pinbased_ctls_high |=
+ PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+ PIN_BASED_VMX_PREEMPTION_TIMER;
+
+ /* exit controls */
+ rdmsr(MSR_IA32_VMX_EXIT_CTLS,
+ msrs->exit_ctls_low,
+ msrs->exit_ctls_high);
+ msrs->exit_ctls_low =
+ VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+
+ msrs->exit_ctls_high &=
+#ifdef CONFIG_X86_64
+ VM_EXIT_HOST_ADDR_SPACE_SIZE |
+#endif
+ VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+ msrs->exit_ctls_high |=
+ VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+ VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
+
+ /* We support free control of debug control saving. */
+ msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
+
+ /* entry controls */
+ rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
+ msrs->entry_ctls_low,
+ msrs->entry_ctls_high);
+ msrs->entry_ctls_low =
+ VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+ msrs->entry_ctls_high &=
+#ifdef CONFIG_X86_64
+ VM_ENTRY_IA32E_MODE |
+#endif
+ VM_ENTRY_LOAD_IA32_PAT;
+ msrs->entry_ctls_high |=
+ (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
+
+ /* We support free control of debug control loading. */
+ msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
+
+ /* cpu-based controls */
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
+ msrs->procbased_ctls_low,
+ msrs->procbased_ctls_high);
+ msrs->procbased_ctls_low =
+ CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+ msrs->procbased_ctls_high &=
+ CPU_BASED_VIRTUAL_INTR_PENDING |
+ CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
+ CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING |
+#ifdef CONFIG_X86_64
+ CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
+#endif
+ CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
+ CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
+ CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
+ CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
+ CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+ /*
+ * We can allow some features even when not supported by the
+ * hardware. For example, L1 can specify an MSR bitmap - and we
+ * can use it to avoid exits to L1 - even when L0 runs L2
+ * without MSR bitmaps.
+ */
+ msrs->procbased_ctls_high |=
+ CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+ CPU_BASED_USE_MSR_BITMAPS;
+
+ /* We support free control of CR3 access interception. */
+ msrs->procbased_ctls_low &=
+ ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
+
+ /*
+ * secondary cpu-based controls. Do not include those that
+ * depend on CPUID bits, they are added later by vmx_cpuid_update.
+ */
+ if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
+ msrs->secondary_ctls_low,
+ msrs->secondary_ctls_high);
+
+ msrs->secondary_ctls_low = 0;
+ msrs->secondary_ctls_high &=
+ SECONDARY_EXEC_DESC |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
+ SECONDARY_EXEC_WBINVD_EXITING;
+
+ /*
+ * We can emulate "VMCS shadowing," even if the hardware
+ * doesn't support it.
+ */
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_SHADOW_VMCS;
+
+ if (enable_ept) {
+ /* nested EPT: emulate EPT also to L1 */
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_EPT;
+ msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
+ VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
+ if (cpu_has_vmx_ept_execute_only())
+ msrs->ept_caps |=
+ VMX_EPT_EXECUTE_ONLY_BIT;
+ msrs->ept_caps &= vmx_capability.ept;
+ msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
+ VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
+ VMX_EPT_1GB_PAGE_BIT;
+ if (enable_ept_ad_bits) {
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_PML;
+ msrs->ept_caps |= VMX_EPT_AD_BIT;
+ }
+ }
+
+ if (cpu_has_vmx_vmfunc()) {
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_VMFUNC;
+ /*
+ * Advertise EPTP switching unconditionally
+ * since we emulate it
+ */
+ if (enable_ept)
+ msrs->vmfunc_controls =
+ VMX_VMFUNC_EPTP_SWITCHING;
+ }
+
+ /*
+ * Old versions of KVM use the single-context version without
+ * checking for support, so declare that it is supported even
+ * though it is treated as global context. The alternative is
+ * not failing the single-context invvpid, and it is worse.
+ */
+ if (enable_vpid) {
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_VPID;
+ msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
+ VMX_VPID_EXTENT_SUPPORTED_MASK;
+ }
+
+ if (enable_unrestricted_guest)
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_UNRESTRICTED_GUEST;
+
+ if (flexpriority_enabled)
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+
+ /* miscellaneous data */
+ rdmsr(MSR_IA32_VMX_MISC,
+ msrs->misc_low,
+ msrs->misc_high);
+ msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
+ msrs->misc_low |=
+ MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
+ VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
+ VMX_MISC_ACTIVITY_HLT;
+ msrs->misc_high = 0;
+
+ /*
+ * This MSR reports some information about VMX support. We
+ * should return information about the VMX we emulate for the
+ * guest, and the VMCS structure we give it - not about the
+ * VMX support of the underlying hardware.
+ */
+ msrs->basic =
+ VMCS12_REVISION |
+ VMX_BASIC_TRUE_CTLS |
+ ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
+ (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+
+ if (cpu_has_vmx_basic_inout())
+ msrs->basic |= VMX_BASIC_INOUT;
+
+ /*
+ * These MSRs specify bits which the guest must keep fixed on
+ * while L1 is in VMXON mode (in L1's root mode, or running an L2).
+ * We picked the standard core2 setting.
+ */
+#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
+#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
+ msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
+ msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
+
+ /* These MSRs specify bits which the guest must keep fixed off. */
+ rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
+ rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
+
+ /* highest index: VMX_PREEMPTION_TIMER_VALUE */
+ msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
+}
+
+/*
+ * if fixed0[i] == 1: val[i] must be 1
+ * if fixed1[i] == 0: val[i] must be 0
+ */
+static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
+{
+ return ((val & fixed1) | fixed0) == val;
+}
+
+static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
+{
+ return fixed_bits_valid(control, low, high);
+}
+
+static inline u64 vmx_control_msr(u32 low, u32 high)
+{
+ return low | ((u64)high << 32);
+}
+
+static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
+{
+ superset &= mask;
+ subset &= mask;
+
+ return (superset | subset) == superset;
+}
+
+static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
+{
+ const u64 feature_and_reserved =
+ /* feature (except bit 48; see below) */
+ BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
+ /* reserved */
+ BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
+ u64 vmx_basic = vmx->nested.msrs.basic;
+
+ if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
+ return -EINVAL;
+
+ /*
+ * KVM does not emulate a version of VMX that constrains physical
+ * addresses of VMX structures (e.g. VMCS) to 32-bits.
+ */
+ if (data & BIT_ULL(48))
+ return -EINVAL;
+
+ if (vmx_basic_vmcs_revision_id(vmx_basic) !=
+ vmx_basic_vmcs_revision_id(data))
+ return -EINVAL;
+
+ if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
+ return -EINVAL;
+
+ vmx->nested.msrs.basic = data;
+ return 0;
+}
+
+static int
+vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+ u64 supported;
+ u32 *lowp, *highp;
+
+ switch (msr_index) {
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ lowp = &vmx->nested.msrs.pinbased_ctls_low;
+ highp = &vmx->nested.msrs.pinbased_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ lowp = &vmx->nested.msrs.procbased_ctls_low;
+ highp = &vmx->nested.msrs.procbased_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ lowp = &vmx->nested.msrs.exit_ctls_low;
+ highp = &vmx->nested.msrs.exit_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ lowp = &vmx->nested.msrs.entry_ctls_low;
+ highp = &vmx->nested.msrs.entry_ctls_high;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ lowp = &vmx->nested.msrs.secondary_ctls_low;
+ highp = &vmx->nested.msrs.secondary_ctls_high;
+ break;
+ default:
+ BUG();
+ }
+
+ supported = vmx_control_msr(*lowp, *highp);
+
+ /* Check must-be-1 bits are still 1. */
+ if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
+ return -EINVAL;
+
+ /* Check must-be-0 bits are still 0. */
+ if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
+ return -EINVAL;
+
+ *lowp = data;
+ *highp = data >> 32;
+ return 0;
+}
+
+static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
+{
+ const u64 feature_and_reserved_bits =
+ /* feature */
+ BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
+ BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
+ /* reserved */
+ GENMASK_ULL(13, 9) | BIT_ULL(31);
+ u64 vmx_misc;
+
+ vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
+ vmx->nested.msrs.misc_high);
+
+ if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
+ return -EINVAL;
+
+ if ((vmx->nested.msrs.pinbased_ctls_high &
+ PIN_BASED_VMX_PREEMPTION_TIMER) &&
+ vmx_misc_preemption_timer_rate(data) !=
+ vmx_misc_preemption_timer_rate(vmx_misc))
+ return -EINVAL;
+
+ if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
+ return -EINVAL;
+
+ if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
+ return -EINVAL;
+
+ if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
+ return -EINVAL;
+
+ vmx->nested.msrs.misc_low = data;
+ vmx->nested.msrs.misc_high = data >> 32;
+
+ /*
+ * If L1 has read-only VM-exit information fields, use the
+ * less permissive vmx_vmwrite_bitmap to specify write
+ * permissions for the shadow VMCS.
+ */
+ if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
+ vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
+
+ return 0;
+}
+
+static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
+{
+ u64 vmx_ept_vpid_cap;
+
+ vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
+ vmx->nested.msrs.vpid_caps);
+
+ /* Every bit is either reserved or a feature bit. */
+ if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
+ return -EINVAL;
+
+ vmx->nested.msrs.ept_caps = data;
+ vmx->nested.msrs.vpid_caps = data >> 32;
+ return 0;
+}
+
+static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+ u64 *msr;
+
+ switch (msr_index) {
+ case MSR_IA32_VMX_CR0_FIXED0:
+ msr = &vmx->nested.msrs.cr0_fixed0;
+ break;
+ case MSR_IA32_VMX_CR4_FIXED0:
+ msr = &vmx->nested.msrs.cr4_fixed0;
+ break;
+ default:
+ BUG();
+ }
+
+ /*
+ * 1 bits (which indicates bits which "must-be-1" during VMX operation)
+ * must be 1 in the restored value.
+ */
+ if (!is_bitwise_subset(data, *msr, -1ULL))
+ return -EINVAL;
+
+ *msr = data;
+ return 0;
+}
+
+/*
+ * Called when userspace is restoring VMX MSRs.
+ *
+ * Returns 0 on success, non-0 otherwise.
+ */
+static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * Don't allow changes to the VMX capability MSRs while the vCPU
+ * is in VMX operation.
+ */
+ if (vmx->nested.vmxon)
+ return -EBUSY;
+
+ switch (msr_index) {
+ case MSR_IA32_VMX_BASIC:
+ return vmx_restore_vmx_basic(vmx, data);
+ case MSR_IA32_VMX_PINBASED_CTLS:
+ case MSR_IA32_VMX_PROCBASED_CTLS:
+ case MSR_IA32_VMX_EXIT_CTLS:
+ case MSR_IA32_VMX_ENTRY_CTLS:
+ /*
+ * The "non-true" VMX capability MSRs are generated from the
+ * "true" MSRs, so we do not support restoring them directly.
+ *
+ * If userspace wants to emulate VMX_BASIC[55]=0, userspace
+ * should restore the "true" MSRs with the must-be-1 bits
+ * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
+ * DEFAULT SETTINGS".
+ */
+ return -EINVAL;
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ return vmx_restore_control_msr(vmx, msr_index, data);
+ case MSR_IA32_VMX_MISC:
+ return vmx_restore_vmx_misc(vmx, data);
+ case MSR_IA32_VMX_CR0_FIXED0:
+ case MSR_IA32_VMX_CR4_FIXED0:
+ return vmx_restore_fixed0_msr(vmx, msr_index, data);
+ case MSR_IA32_VMX_CR0_FIXED1:
+ case MSR_IA32_VMX_CR4_FIXED1:
+ /*
+ * These MSRs are generated based on the vCPU's CPUID, so we
+ * do not support restoring them directly.
+ */
+ return -EINVAL;
+ case MSR_IA32_VMX_EPT_VPID_CAP:
+ return vmx_restore_vmx_ept_vpid_cap(vmx, data);
+ case MSR_IA32_VMX_VMCS_ENUM:
+ vmx->nested.msrs.vmcs_enum = data;
+ return 0;
+ default:
+ /*
+ * The rest of the VMX capability MSRs do not support restore.
+ */
+ return -EINVAL;
+ }
+}
+
+/* Returns 0 on success, non-0 otherwise. */
+static int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
+{
+ switch (msr_index) {
+ case MSR_IA32_VMX_BASIC:
+ *pdata = msrs->basic;
+ break;
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ case MSR_IA32_VMX_PINBASED_CTLS:
+ *pdata = vmx_control_msr(
+ msrs->pinbased_ctls_low,
+ msrs->pinbased_ctls_high);
+ if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
+ *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+ break;
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ case MSR_IA32_VMX_PROCBASED_CTLS:
+ *pdata = vmx_control_msr(
+ msrs->procbased_ctls_low,
+ msrs->procbased_ctls_high);
+ if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
+ *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+ break;
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ case MSR_IA32_VMX_EXIT_CTLS:
+ *pdata = vmx_control_msr(
+ msrs->exit_ctls_low,
+ msrs->exit_ctls_high);
+ if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
+ *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+ break;
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ case MSR_IA32_VMX_ENTRY_CTLS:
+ *pdata = vmx_control_msr(
+ msrs->entry_ctls_low,
+ msrs->entry_ctls_high);
+ if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
+ *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+ break;
+ case MSR_IA32_VMX_MISC:
+ *pdata = vmx_control_msr(
+ msrs->misc_low,
+ msrs->misc_high);
+ break;
+ case MSR_IA32_VMX_CR0_FIXED0:
+ *pdata = msrs->cr0_fixed0;
+ break;
+ case MSR_IA32_VMX_CR0_FIXED1:
+ *pdata = msrs->cr0_fixed1;
+ break;
+ case MSR_IA32_VMX_CR4_FIXED0:
+ *pdata = msrs->cr4_fixed0;
+ break;
+ case MSR_IA32_VMX_CR4_FIXED1:
+ *pdata = msrs->cr4_fixed1;
+ break;
+ case MSR_IA32_VMX_VMCS_ENUM:
+ *pdata = msrs->vmcs_enum;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ *pdata = vmx_control_msr(
+ msrs->secondary_ctls_low,
+ msrs->secondary_ctls_high);
+ break;
+ case MSR_IA32_VMX_EPT_VPID_CAP:
+ *pdata = msrs->ept_caps |
+ ((u64)msrs->vpid_caps << 32);
+ break;
+ case MSR_IA32_VMX_VMFUNC:
+ *pdata = msrs->vmfunc_controls;
+ break;
+ default:
+ return 1;
+ }
+
+ return 0;
+}
+
+static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
+ uint64_t val)
+{
+ uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
+
+ return !(val & ~valid_bits);
+}
+
+static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
+{
+ switch (msr->index) {
+ case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ if (!nested)
+ return 1;
+ return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
+ default:
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct shared_msr_entry *msr;
+
+ switch (msr_info->index) {
+#ifdef CONFIG_X86_64
+ case MSR_FS_BASE:
+ msr_info->data = vmcs_readl(GUEST_FS_BASE);
+ break;
+ case MSR_GS_BASE:
+ msr_info->data = vmcs_readl(GUEST_GS_BASE);
+ break;
+ case MSR_KERNEL_GS_BASE:
+ msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
+ break;
+#endif
+ case MSR_EFER:
+ return kvm_get_msr_common(vcpu, msr_info);
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_has_spec_ctrl_msr(vcpu))
+ return 1;
+
+ msr_info->data = to_vmx(vcpu)->spec_ctrl;
+ break;
+ case MSR_IA32_SYSENTER_CS:
+ msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
+ break;
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_mpx_supported() ||
+ (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
+ return 1;
+ msr_info->data = vmcs_read64(GUEST_BNDCFGS);
+ break;
+ case MSR_IA32_MCG_EXT_CTL:
+ if (!msr_info->host_initiated &&
+ !(vmx->msr_ia32_feature_control &
+ FEATURE_CONTROL_LMCE))
+ return 1;
+ msr_info->data = vcpu->arch.mcg_ext_ctl;
+ break;
+ case MSR_IA32_FEATURE_CONTROL:
+ msr_info->data = vmx->msr_ia32_feature_control;
+ break;
+ case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ if (!nested_vmx_allowed(vcpu))
+ return 1;
+ return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
+ &msr_info->data);
+ case MSR_IA32_XSS:
+ if (!vmx_xsaves_supported() ||
+ (!msr_info->host_initiated &&
+ !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))))
+ return 1;
+ msr_info->data = vcpu->arch.ia32_xss;
+ break;
+ case MSR_TSC_AUX:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
+ return 1;
+ /* Otherwise falls through */
+ default:
+ msr = find_msr_entry(vmx, msr_info->index);
+ if (msr) {
+ msr_info->data = msr->data;
+ break;
+ }
+ return kvm_get_msr_common(vcpu, msr_info);
+ }
+
+ return 0;
+}
+
+static void vmx_leave_nested(struct kvm_vcpu *vcpu);
+
+/*
+ * Writes msr value into into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct shared_msr_entry *msr;
+ int ret = 0;
+ u32 msr_index = msr_info->index;
+ u64 data = msr_info->data;
+
+ switch (msr_index) {
+ case MSR_EFER:
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ break;
+#ifdef CONFIG_X86_64
+ case MSR_FS_BASE:
+ vmx_segment_cache_clear(vmx);
+ vmcs_writel(GUEST_FS_BASE, data);
+ break;
+ case MSR_GS_BASE:
+ vmx_segment_cache_clear(vmx);
+ vmcs_writel(GUEST_GS_BASE, data);
+ break;
+ case MSR_KERNEL_GS_BASE:
+ vmx_write_guest_kernel_gs_base(vmx, data);
+ break;
+#endif
+ case MSR_IA32_SYSENTER_CS:
+ vmcs_write32(GUEST_SYSENTER_CS, data);
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ vmcs_writel(GUEST_SYSENTER_EIP, data);
+ break;
+ case MSR_IA32_SYSENTER_ESP:
+ vmcs_writel(GUEST_SYSENTER_ESP, data);
+ break;
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_mpx_supported() ||
+ (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
+ return 1;
+ if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
+ (data & MSR_IA32_BNDCFGS_RSVD))
+ return 1;
+ vmcs_write64(GUEST_BNDCFGS, data);
+ break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_has_spec_ctrl_msr(vcpu))
+ return 1;
+
+ /* The STIBP bit doesn't fault even if it's not advertised */
+ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
+ return 1;
+
+ vmx->spec_ctrl = data;
+
+ if (!data)
+ break;
+
+ /*
+ * For non-nested:
+ * When it's written (to non-zero) for the first time, pass
+ * it through.
+ *
+ * For nested:
+ * The handling of the MSR bitmap for L2 guests is done in
+ * nested_vmx_merge_msr_bitmap. We should not touch the
+ * vmcs02.msr_bitmap here since it gets completely overwritten
+ * in the merging. We update the vmcs01 here for L1 as well
+ * since it will end up touching the MSR anyway now.
+ */
+ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
+ MSR_IA32_SPEC_CTRL,
+ MSR_TYPE_RW);
+ break;
+ case MSR_IA32_PRED_CMD:
+ if (!msr_info->host_initiated &&
+ !guest_has_pred_cmd_msr(vcpu))
+ return 1;
+
+ if (data & ~PRED_CMD_IBPB)
+ return 1;
+
+ if (!data)
+ break;
+
+ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+
+ /*
+ * For non-nested:
+ * When it's written (to non-zero) for the first time, pass
+ * it through.
+ *
+ * For nested:
+ * The handling of the MSR bitmap for L2 guests is done in
+ * nested_vmx_merge_msr_bitmap. We should not touch the
+ * vmcs02.msr_bitmap here since it gets completely overwritten
+ * in the merging.
+ */
+ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
+ MSR_TYPE_W);
+ break;
+ case MSR_IA32_CR_PAT:
+ if (!kvm_pat_valid(data))
+ return 1;
+
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ vmcs_write64(GUEST_IA32_PAT, data);
+ vcpu->arch.pat = data;
+ break;
+ }
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ break;
+ case MSR_IA32_TSC_ADJUST:
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ break;
+ case MSR_IA32_MCG_EXT_CTL:
+ if ((!msr_info->host_initiated &&
+ !(to_vmx(vcpu)->msr_ia32_feature_control &
+ FEATURE_CONTROL_LMCE)) ||
+ (data & ~MCG_EXT_CTL_LMCE_EN))
+ return 1;
+ vcpu->arch.mcg_ext_ctl = data;
+ break;
+ case MSR_IA32_FEATURE_CONTROL:
+ if (!vmx_feature_control_msr_valid(vcpu, data) ||
+ (to_vmx(vcpu)->msr_ia32_feature_control &
+ FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
+ return 1;
+ vmx->msr_ia32_feature_control = data;
+ if (msr_info->host_initiated && data == 0)
+ vmx_leave_nested(vcpu);
+ break;
+ case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ if (!msr_info->host_initiated)
+ return 1; /* they are read-only */
+ if (!nested_vmx_allowed(vcpu))
+ return 1;
+ return vmx_set_vmx_msr(vcpu, msr_index, data);
+ case MSR_IA32_XSS:
+ if (!vmx_xsaves_supported() ||
+ (!msr_info->host_initiated &&
+ !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))))
+ return 1;
+ /*
+ * The only supported bit as of Skylake is bit 8, but
+ * it is not supported on KVM.
+ */
+ if (data != 0)
+ return 1;
+ vcpu->arch.ia32_xss = data;
+ if (vcpu->arch.ia32_xss != host_xss)
+ add_atomic_switch_msr(vmx, MSR_IA32_XSS,
+ vcpu->arch.ia32_xss, host_xss, false);
+ else
+ clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
+ break;
+ case MSR_TSC_AUX:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
+ return 1;
+ /* Check reserved bit, higher 32 bits should be zero */
+ if ((data >> 32) != 0)
+ return 1;
+ /* Otherwise falls through */
+ default:
+ msr = find_msr_entry(vmx, msr_index);
+ if (msr) {
+ u64 old_msr_data = msr->data;
+ msr->data = data;
+ if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
+ preempt_disable();
+ ret = kvm_set_shared_msr(msr->index, msr->data,
+ msr->mask);
+ preempt_enable();
+ if (ret)
+ msr->data = old_msr_data;
+ }
+ break;
+ }
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ }
+
+ /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
+ if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
+ vmx_update_fb_clear_dis(vcpu, vmx);
+
+ return ret;
+}
+
+static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+ __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ switch (reg) {
+ case VCPU_REGS_RSP:
+ vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
+ break;
+ case VCPU_REGS_RIP:
+ vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
+ break;
+ case VCPU_EXREG_PDPTR:
+ if (enable_ept)
+ ept_save_pdptrs(vcpu);
+ break;
+ default:
+ break;
+ }
+}
+
+static __init int cpu_has_kvm_support(void)
+{
+ return cpu_has_vmx();
+}
+
+static __init int vmx_disabled_by_bios(void)
+{
+ u64 msr;
+
+ rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
+ if (msr & FEATURE_CONTROL_LOCKED) {
+ /* launched w/ TXT and VMX disabled */
+ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
+ && tboot_enabled())
+ return 1;
+ /* launched w/o TXT and VMX only enabled w/ TXT */
+ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
+ && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
+ && !tboot_enabled()) {
+ printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
+ "activate TXT before enabling KVM\n");
+ return 1;
+ }
+ /* launched w/o TXT and VMX disabled */
+ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
+ && !tboot_enabled())
+ return 1;
+ }
+
+ return 0;
+}
+
+static void kvm_cpu_vmxon(u64 addr)
+{
+ cr4_set_bits(X86_CR4_VMXE);
+ intel_pt_handle_vmx(1);
+
+ asm volatile (ASM_VMX_VMXON_RAX
+ : : "a"(&addr), "m"(addr)
+ : "memory", "cc");
+}
+
+static int hardware_enable(void)
+{
+ int cpu = raw_smp_processor_id();
+ u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+ u64 old, test_bits;
+
+ if (cr4_read_shadow() & X86_CR4_VMXE)
+ return -EBUSY;
+
+ /*
+ * This can happen if we hot-added a CPU but failed to allocate
+ * VP assist page for it.
+ */
+ if (static_branch_unlikely(&enable_evmcs) &&
+ !hv_get_vp_assist_page(cpu))
+ return -EFAULT;
+
+ rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
+
+ test_bits = FEATURE_CONTROL_LOCKED;
+ test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ if (tboot_enabled())
+ test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
+
+ if ((old & test_bits) != test_bits) {
+ /* enable and lock */
+ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
+ }
+ kvm_cpu_vmxon(phys_addr);
+ if (enable_ept)
+ ept_sync_global();
+
+ return 0;
+}
+
+static void vmclear_local_loaded_vmcss(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct loaded_vmcs *v, *n;
+
+ list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ loaded_vmcss_on_cpu_link)
+ __loaded_vmcs_clear(v);
+}
+
+
+/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
+ * tricks.
+ */
+static void kvm_cpu_vmxoff(void)
+{
+ asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
+
+ intel_pt_handle_vmx(0);
+ cr4_clear_bits(X86_CR4_VMXE);
+}
+
+static void hardware_disable(void)
+{
+ vmclear_local_loaded_vmcss();
+ kvm_cpu_vmxoff();
+}
+
+static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
+ u32 msr, u32 *result)
+{
+ u32 vmx_msr_low, vmx_msr_high;
+ u32 ctl = ctl_min | ctl_opt;
+
+ rdmsr(msr, vmx_msr_low, vmx_msr_high);
+
+ ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
+ ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
+
+ /* Ensure minimum (required) set of control bits are supported. */
+ if (ctl_min & ~ctl)
+ return -EIO;
+
+ *result = ctl;
+ return 0;
+}
+
+static __init bool allow_1_setting(u32 msr, u32 ctl)
+{
+ u32 vmx_msr_low, vmx_msr_high;
+
+ rdmsr(msr, vmx_msr_low, vmx_msr_high);
+ return vmx_msr_high & ctl;
+}
+
+static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
+{
+ u32 vmx_msr_low, vmx_msr_high;
+ u32 min, opt, min2, opt2;
+ u32 _pin_based_exec_control = 0;
+ u32 _cpu_based_exec_control = 0;
+ u32 _cpu_based_2nd_exec_control = 0;
+ u32 _vmexit_control = 0;
+ u32 _vmentry_control = 0;
+
+ memset(vmcs_conf, 0, sizeof(*vmcs_conf));
+ min = CPU_BASED_HLT_EXITING |
+#ifdef CONFIG_X86_64
+ CPU_BASED_CR8_LOAD_EXITING |
+ CPU_BASED_CR8_STORE_EXITING |
+#endif
+ CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_UNCOND_IO_EXITING |
+ CPU_BASED_MOV_DR_EXITING |
+ CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_MWAIT_EXITING |
+ CPU_BASED_MONITOR_EXITING |
+ CPU_BASED_INVLPG_EXITING |
+ CPU_BASED_RDPMC_EXITING;
+
+ opt = CPU_BASED_TPR_SHADOW |
+ CPU_BASED_USE_MSR_BITMAPS |
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
+ &_cpu_based_exec_control) < 0)
+ return -EIO;
+#ifdef CONFIG_X86_64
+ if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
+ _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
+ ~CPU_BASED_CR8_STORE_EXITING;
+#endif
+ if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
+ min2 = 0;
+ opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_WBINVD_EXITING |
+ SECONDARY_EXEC_ENABLE_VPID |
+ SECONDARY_EXEC_ENABLE_EPT |
+ SECONDARY_EXEC_UNRESTRICTED_GUEST |
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING |
+ SECONDARY_EXEC_DESC |
+ SECONDARY_EXEC_RDTSCP |
+ SECONDARY_EXEC_ENABLE_INVPCID |
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
+ SECONDARY_EXEC_SHADOW_VMCS |
+ SECONDARY_EXEC_XSAVES |
+ SECONDARY_EXEC_RDSEED_EXITING |
+ SECONDARY_EXEC_RDRAND_EXITING |
+ SECONDARY_EXEC_ENABLE_PML |
+ SECONDARY_EXEC_TSC_SCALING |
+ SECONDARY_EXEC_ENABLE_VMFUNC |
+ SECONDARY_EXEC_ENCLS_EXITING;
+ if (adjust_vmx_controls(min2, opt2,
+ MSR_IA32_VMX_PROCBASED_CTLS2,
+ &_cpu_based_2nd_exec_control) < 0)
+ return -EIO;
+ }
+#ifndef CONFIG_X86_64
+ if (!(_cpu_based_2nd_exec_control &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
+#endif
+
+ if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
+ _cpu_based_2nd_exec_control &= ~(
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+
+ rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
+ &vmx_capability.ept, &vmx_capability.vpid);
+
+ if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
+ /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
+ enabled */
+ _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_INVLPG_EXITING);
+ } else if (vmx_capability.ept) {
+ vmx_capability.ept = 0;
+ pr_warn_once("EPT CAP should not exist if not support "
+ "1-setting enable EPT VM-execution control\n");
+ }
+ if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
+ vmx_capability.vpid) {
+ vmx_capability.vpid = 0;
+ pr_warn_once("VPID CAP should not exist if not support "
+ "1-setting enable VPID VM-execution control\n");
+ }
+
+ min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
+#ifdef CONFIG_X86_64
+ min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
+#endif
+ opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
+ VM_EXIT_CLEAR_BNDCFGS;
+ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
+ &_vmexit_control) < 0)
+ return -EIO;
+
+ min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
+ opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
+ PIN_BASED_VMX_PREEMPTION_TIMER;
+ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
+ &_pin_based_exec_control) < 0)
+ return -EIO;
+
+ if (cpu_has_broken_vmx_preemption_timer())
+ _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ if (!(_cpu_based_2nd_exec_control &
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
+ _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
+
+ min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
+ opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
+ if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
+ &_vmentry_control) < 0)
+ return -EIO;
+
+ rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
+
+ /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
+ if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
+ return -EIO;
+
+#ifdef CONFIG_X86_64
+ /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
+ if (vmx_msr_high & (1u<<16))
+ return -EIO;
+#endif
+
+ /* Require Write-Back (WB) memory type for VMCS accesses. */
+ if (((vmx_msr_high >> 18) & 15) != 6)
+ return -EIO;
+
+ vmcs_conf->size = vmx_msr_high & 0x1fff;
+ vmcs_conf->order = get_order(vmcs_conf->size);
+ vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
+
+ vmcs_conf->revision_id = vmx_msr_low;
+
+ vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
+ vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
+ vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
+ vmcs_conf->vmexit_ctrl = _vmexit_control;
+ vmcs_conf->vmentry_ctrl = _vmentry_control;
+
+ if (static_branch_unlikely(&enable_evmcs))
+ evmcs_sanitize_exec_ctrls(vmcs_conf);
+
+ cpu_has_load_ia32_efer =
+ allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
+ VM_ENTRY_LOAD_IA32_EFER)
+ && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
+ VM_EXIT_LOAD_IA32_EFER);
+
+ cpu_has_load_perf_global_ctrl =
+ allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+ && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
+
+ /*
+ * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
+ * but due to errata below it can't be used. Workaround is to use
+ * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL.
+ *
+ * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
+ *
+ * AAK155 (model 26)
+ * AAP115 (model 30)
+ * AAT100 (model 37)
+ * BC86,AAY89,BD102 (model 44)
+ * BA97 (model 46)
+ *
+ */
+ if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) {
+ switch (boot_cpu_data.x86_model) {
+ case 26:
+ case 30:
+ case 37:
+ case 44:
+ case 46:
+ cpu_has_load_perf_global_ctrl = false;
+ printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
+ "does not work properly. Using workaround\n");
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ rdmsrl(MSR_IA32_XSS, host_xss);
+
+ return 0;
+}
+
+static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu)
+{
+ int node = cpu_to_node(cpu);
+ struct page *pages;
+ struct vmcs *vmcs;
+
+ pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
+ if (!pages)
+ return NULL;
+ vmcs = page_address(pages);
+ memset(vmcs, 0, vmcs_config.size);
+
+ /* KVM supports Enlightened VMCS v1 only */
+ if (static_branch_unlikely(&enable_evmcs))
+ vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
+ else
+ vmcs->hdr.revision_id = vmcs_config.revision_id;
+
+ if (shadow)
+ vmcs->hdr.shadow_vmcs = 1;
+ return vmcs;
+}
+
+static void free_vmcs(struct vmcs *vmcs)
+{
+ free_pages((unsigned long)vmcs, vmcs_config.order);
+}
+
+/*
+ * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
+ */
+static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+ if (!loaded_vmcs->vmcs)
+ return;
+ loaded_vmcs_clear(loaded_vmcs);
+ free_vmcs(loaded_vmcs->vmcs);
+ loaded_vmcs->vmcs = NULL;
+ if (loaded_vmcs->msr_bitmap)
+ free_page((unsigned long)loaded_vmcs->msr_bitmap);
+ WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
+}
+
+static struct vmcs *alloc_vmcs(bool shadow)
+{
+ return alloc_vmcs_cpu(shadow, raw_smp_processor_id());
+}
+
+static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+ loaded_vmcs->vmcs = alloc_vmcs(false);
+ if (!loaded_vmcs->vmcs)
+ return -ENOMEM;
+
+ loaded_vmcs->shadow_vmcs = NULL;
+ loaded_vmcs_init(loaded_vmcs);
+
+ if (cpu_has_vmx_msr_bitmap()) {
+ loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!loaded_vmcs->msr_bitmap)
+ goto out_vmcs;
+ memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
+
+ if (IS_ENABLED(CONFIG_HYPERV) &&
+ static_branch_unlikely(&enable_evmcs) &&
+ (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
+ struct hv_enlightened_vmcs *evmcs =
+ (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
+
+ evmcs->hv_enlightenments_control.msr_bitmap = 1;
+ }
+ }
+
+ memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
+
+ return 0;
+
+out_vmcs:
+ free_loaded_vmcs(loaded_vmcs);
+ return -ENOMEM;
+}
+
+static void free_kvm_area(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ free_vmcs(per_cpu(vmxarea, cpu));
+ per_cpu(vmxarea, cpu) = NULL;
+ }
+}
+
+enum vmcs_field_width {
+ VMCS_FIELD_WIDTH_U16 = 0,
+ VMCS_FIELD_WIDTH_U64 = 1,
+ VMCS_FIELD_WIDTH_U32 = 2,
+ VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3
+};
+
+static inline int vmcs_field_width(unsigned long field)
+{
+ if (0x1 & field) /* the *_HIGH fields are all 32 bit */
+ return VMCS_FIELD_WIDTH_U32;
+ return (field >> 13) & 0x3 ;
+}
+
+static inline int vmcs_field_readonly(unsigned long field)
+{
+ return (((field >> 10) & 0x3) == 1);
+}
+
+static void init_vmcs_shadow_fields(void)
+{
+ int i, j;
+
+ for (i = j = 0; i < max_shadow_read_only_fields; i++) {
+ u16 field = shadow_read_only_fields[i];
+ if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
+ (i + 1 == max_shadow_read_only_fields ||
+ shadow_read_only_fields[i + 1] != field + 1))
+ pr_err("Missing field from shadow_read_only_field %x\n",
+ field + 1);
+
+ clear_bit(field, vmx_vmread_bitmap);
+#ifdef CONFIG_X86_64
+ if (field & 1)
+ continue;
+#endif
+ if (j < i)
+ shadow_read_only_fields[j] = field;
+ j++;
+ }
+ max_shadow_read_only_fields = j;
+
+ for (i = j = 0; i < max_shadow_read_write_fields; i++) {
+ u16 field = shadow_read_write_fields[i];
+ if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
+ (i + 1 == max_shadow_read_write_fields ||
+ shadow_read_write_fields[i + 1] != field + 1))
+ pr_err("Missing field from shadow_read_write_field %x\n",
+ field + 1);
+
+ /*
+ * PML and the preemption timer can be emulated, but the
+ * processor cannot vmwrite to fields that don't exist
+ * on bare metal.
+ */
+ switch (field) {
+ case GUEST_PML_INDEX:
+ if (!cpu_has_vmx_pml())
+ continue;
+ break;
+ case VMX_PREEMPTION_TIMER_VALUE:
+ if (!cpu_has_vmx_preemption_timer())
+ continue;
+ break;
+ case GUEST_INTR_STATUS:
+ if (!cpu_has_vmx_apicv())
+ continue;
+ break;
+ default:
+ break;
+ }
+
+ clear_bit(field, vmx_vmwrite_bitmap);
+ clear_bit(field, vmx_vmread_bitmap);
+#ifdef CONFIG_X86_64
+ if (field & 1)
+ continue;
+#endif
+ if (j < i)
+ shadow_read_write_fields[j] = field;
+ j++;
+ }
+ max_shadow_read_write_fields = j;
+}
+
+static __init int alloc_kvm_area(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct vmcs *vmcs;
+
+ vmcs = alloc_vmcs_cpu(false, cpu);
+ if (!vmcs) {
+ free_kvm_area();
+ return -ENOMEM;
+ }
+
+ /*
+ * When eVMCS is enabled, alloc_vmcs_cpu() sets
+ * vmcs->revision_id to KVM_EVMCS_VERSION instead of
+ * revision_id reported by MSR_IA32_VMX_BASIC.
+ *
+ * However, even though not explictly documented by
+ * TLFS, VMXArea passed as VMXON argument should
+ * still be marked with revision_id reported by
+ * physical CPU.
+ */
+ if (static_branch_unlikely(&enable_evmcs))
+ vmcs->hdr.revision_id = vmcs_config.revision_id;
+
+ per_cpu(vmxarea, cpu) = vmcs;
+ }
+ return 0;
+}
+
+static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
+ struct kvm_segment *save)
+{
+ if (!emulate_invalid_guest_state) {
+ /*
+ * CS and SS RPL should be equal during guest entry according
+ * to VMX spec, but in reality it is not always so. Since vcpu
+ * is in the middle of the transition from real mode to
+ * protected mode it is safe to assume that RPL 0 is a good
+ * default value.
+ */
+ if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
+ save->selector &= ~SEGMENT_RPL_MASK;
+ save->dpl = save->selector & SEGMENT_RPL_MASK;
+ save->s = 1;
+ }
+ vmx_set_segment(vcpu, save, seg);
+}
+
+static void enter_pmode(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * Update real mode segment cache. It may be not up-to-date if sement
+ * register was written while vcpu was in a guest mode.
+ */
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
+
+ vmx->rmode.vm86_active = 0;
+
+ vmx_segment_cache_clear(vmx);
+
+ vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
+
+ flags = vmcs_readl(GUEST_RFLAGS);
+ flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
+ flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
+ vmcs_writel(GUEST_RFLAGS, flags);
+
+ vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
+ (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
+
+ update_exception_bitmap(vcpu);
+
+ fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
+ fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
+ fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
+}
+
+static void fix_rmode_seg(int seg, struct kvm_segment *save)
+{
+ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ struct kvm_segment var = *save;
+
+ var.dpl = 0x3;
+ if (seg == VCPU_SREG_CS)
+ var.type = 0x3;
+
+ if (!emulate_invalid_guest_state) {
+ var.selector = var.base >> 4;
+ var.base = var.base & 0xffff0;
+ var.limit = 0xffff;
+ var.g = 0;
+ var.db = 0;
+ var.present = 1;
+ var.s = 1;
+ var.l = 0;
+ var.unusable = 0;
+ var.type = 0x3;
+ var.avl = 0;
+ if (save->base & 0xf)
+ printk_once(KERN_WARNING "kvm: segment base is not "
+ "paragraph aligned when entering "
+ "protected mode (seg=%d)", seg);
+ }
+
+ vmcs_write16(sf->selector, var.selector);
+ vmcs_writel(sf->base, var.base);
+ vmcs_write32(sf->limit, var.limit);
+ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
+}
+
+static void enter_rmode(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
+
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
+ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
+
+ vmx->rmode.vm86_active = 1;
+
+ /*
+ * Very old userspace does not call KVM_SET_TSS_ADDR before entering
+ * vcpu. Warn the user that an update is overdue.
+ */
+ if (!kvm_vmx->tss_addr)
+ printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
+ "called before entering vcpu\n");
+
+ vmx_segment_cache_clear(vmx);
+
+ vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
+ vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
+ vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
+
+ flags = vmcs_readl(GUEST_RFLAGS);
+ vmx->rmode.save_rflags = flags;
+
+ flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
+
+ vmcs_writel(GUEST_RFLAGS, flags);
+ vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
+ update_exception_bitmap(vcpu);
+
+ fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
+ fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
+ fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
+ fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
+ fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
+ fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
+
+ kvm_mmu_reset_context(vcpu);
+}
+
+static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+
+ if (!msr)
+ return;
+
+ vcpu->arch.efer = efer;
+ if (efer & EFER_LMA) {
+ vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
+ msr->data = efer;
+ } else {
+ vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
+
+ msr->data = efer & ~EFER_LME;
+ }
+ setup_msrs(vmx);
+}
+
+#ifdef CONFIG_X86_64
+
+static void enter_lmode(struct kvm_vcpu *vcpu)
+{
+ u32 guest_tr_ar;
+
+ vmx_segment_cache_clear(to_vmx(vcpu));
+
+ guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
+ if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
+ pr_debug_ratelimited("%s: tss fixup for long mode. \n",
+ __func__);
+ vmcs_write32(GUEST_TR_AR_BYTES,
+ (guest_tr_ar & ~VMX_AR_TYPE_MASK)
+ | VMX_AR_TYPE_BUSY_64_TSS);
+ }
+ vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
+}
+
+static void exit_lmode(struct kvm_vcpu *vcpu)
+{
+ vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
+ vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
+}
+
+#endif
+
+static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
+ bool invalidate_gpa)
+{
+ if (enable_ept && (invalidate_gpa || !enable_vpid)) {
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+ ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
+ } else {
+ vpid_sync_context(vpid);
+ }
+}
+
+static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
+{
+ __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
+}
+
+static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+{
+ int vpid = to_vmx(vcpu)->vpid;
+
+ if (!vpid_sync_vcpu_addr(vpid, addr))
+ vpid_sync_context(vpid);
+
+ /*
+ * If VPIDs are not supported or enabled, then the above is a no-op.
+ * But we don't really need a TLB flush in that case anyway, because
+ * each VM entry/exit includes an implicit flush when VPID is 0.
+ */
+}
+
+static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
+{
+ ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
+
+ vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
+ vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
+}
+
+static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
+{
+ if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+}
+
+static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
+{
+ ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
+
+ vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
+ vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
+}
+
+static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_dirty))
+ return;
+
+ if (is_pae_paging(vcpu)) {
+ vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
+ vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
+ vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
+ vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
+ }
+}
+
+static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ if (is_pae_paging(vcpu)) {
+ mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
+ mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
+ mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
+ mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
+ }
+
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail);
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
+static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ if (to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST &&
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
+ fixed0 &= ~(X86_CR0_PE | X86_CR0_PG);
+
+ return fixed_bits_valid(val, fixed0, fixed1);
+}
+
+static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
+
+ return fixed_bits_valid(val, fixed0, fixed1);
+}
+
+static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1;
+
+ return fixed_bits_valid(val, fixed0, fixed1);
+}
+
+/* No difference in the restrictions on guest and host CR4 in VMX operation. */
+#define nested_guest_cr4_valid nested_cr4_valid
+#define nested_host_cr4_valid nested_cr4_valid
+
+static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+
+static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
+ unsigned long cr0,
+ struct kvm_vcpu *vcpu)
+{
+ if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ vmx_decache_cr3(vcpu);
+ if (!(cr0 & X86_CR0_PG)) {
+ /* From paging/starting to nonpaging */
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
+ vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
+ (CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING));
+ vcpu->arch.cr0 = cr0;
+ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
+ } else if (!is_paging(vcpu)) {
+ /* From nonpaging to paging */
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
+ vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
+ ~(CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING));
+ vcpu->arch.cr0 = cr0;
+ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
+ }
+
+ if (!(cr0 & X86_CR0_WP))
+ *hw_cr0 &= ~X86_CR0_WP;
+}
+
+static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long hw_cr0;
+
+ hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
+ if (enable_unrestricted_guest)
+ hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
+ else {
+ hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
+
+ if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
+ enter_pmode(vcpu);
+
+ if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
+ enter_rmode(vcpu);
+ }
+
+#ifdef CONFIG_X86_64
+ if (vcpu->arch.efer & EFER_LME) {
+ if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
+ enter_lmode(vcpu);
+ if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
+ exit_lmode(vcpu);
+ }
+#endif
+
+ if (enable_ept && !enable_unrestricted_guest)
+ ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
+
+ vmcs_writel(CR0_READ_SHADOW, cr0);
+ vmcs_writel(GUEST_CR0, hw_cr0);
+ vcpu->arch.cr0 = cr0;
+
+ /* depends on vcpu->arch.cr0 to be set to a new value */
+ vmx->emulation_required = emulation_required(vcpu);
+}
+
+static int get_ept_level(struct kvm_vcpu *vcpu)
+{
+ /* Nested EPT currently only supports 4-level walks. */
+ if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
+ return 4;
+ if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
+ return 5;
+ return 4;
+}
+
+static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
+{
+ u64 eptp = VMX_EPTP_MT_WB;
+
+ eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
+
+ if (enable_ept_ad_bits &&
+ (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
+ eptp |= VMX_EPTP_AD_ENABLE_BIT;
+ eptp |= (root_hpa & PAGE_MASK);
+
+ return eptp;
+}
+
+static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ struct kvm *kvm = vcpu->kvm;
+ unsigned long guest_cr3;
+ u64 eptp;
+
+ guest_cr3 = cr3;
+ if (enable_ept) {
+ eptp = construct_eptp(vcpu, cr3);
+ vmcs_write64(EPT_POINTER, eptp);
+
+ if (kvm_x86_ops->tlb_remote_flush) {
+ spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+ to_vmx(vcpu)->ept_pointer = eptp;
+ to_kvm_vmx(kvm)->ept_pointers_match
+ = EPT_POINTERS_CHECK;
+ spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+ }
+
+ if (enable_unrestricted_guest || is_paging(vcpu) ||
+ is_guest_mode(vcpu))
+ guest_cr3 = kvm_read_cr3(vcpu);
+ else
+ guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
+ ept_load_pdptrs(vcpu);
+ }
+
+ vmcs_writel(GUEST_CR3, guest_cr3);
+}
+
+static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ /*
+ * Pass through host's Machine Check Enable value to hw_cr4, which
+ * is in force while we are in guest mode. Do not let guests control
+ * this bit, even if host CR4.MCE == 0.
+ */
+ unsigned long hw_cr4;
+
+ hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
+ if (enable_unrestricted_guest)
+ hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
+ else if (to_vmx(vcpu)->rmode.vm86_active)
+ hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
+ else
+ hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
+
+ if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
+ if (cr4 & X86_CR4_UMIP) {
+ vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_DESC);
+ hw_cr4 &= ~X86_CR4_UMIP;
+ } else if (!is_guest_mode(vcpu) ||
+ !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_DESC);
+ }
+
+ if (cr4 & X86_CR4_VMXE) {
+ /*
+ * To use VMXON (and later other VMX instructions), a guest
+ * must first be able to turn on cr4.VMXE (see handle_vmon()).
+ * So basically the check on whether to allow nested VMX
+ * is here. We operate under the default treatment of SMM,
+ * so VMX cannot be enabled under SMM.
+ */
+ if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
+ return 1;
+ }
+
+ if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
+ return 1;
+
+ vcpu->arch.cr4 = cr4;
+
+ if (!enable_unrestricted_guest) {
+ if (enable_ept) {
+ if (!is_paging(vcpu)) {
+ hw_cr4 &= ~X86_CR4_PAE;
+ hw_cr4 |= X86_CR4_PSE;
+ } else if (!(cr4 & X86_CR4_PAE)) {
+ hw_cr4 &= ~X86_CR4_PAE;
+ }
+ }
+
+ /*
+ * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
+ * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
+ * to be manually disabled when guest switches to non-paging
+ * mode.
+ *
+ * If !enable_unrestricted_guest, the CPU is always running
+ * with CR0.PG=1 and CR4 needs to be modified.
+ * If enable_unrestricted_guest, the CPU automatically
+ * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
+ */
+ if (!is_paging(vcpu))
+ hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
+ }
+
+ vmcs_writel(CR4_READ_SHADOW, cr4);
+ vmcs_writel(GUEST_CR4, hw_cr4);
+ return 0;
+}
+
+static void vmx_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 ar;
+
+ if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
+ *var = vmx->rmode.segs[seg];
+ if (seg == VCPU_SREG_TR
+ || var->selector == vmx_read_guest_seg_selector(vmx, seg))
+ return;
+ var->base = vmx_read_guest_seg_base(vmx, seg);
+ var->selector = vmx_read_guest_seg_selector(vmx, seg);
+ return;
+ }
+ var->base = vmx_read_guest_seg_base(vmx, seg);
+ var->limit = vmx_read_guest_seg_limit(vmx, seg);
+ var->selector = vmx_read_guest_seg_selector(vmx, seg);
+ ar = vmx_read_guest_seg_ar(vmx, seg);
+ var->unusable = (ar >> 16) & 1;
+ var->type = ar & 15;
+ var->s = (ar >> 4) & 1;
+ var->dpl = (ar >> 5) & 3;
+ /*
+ * Some userspaces do not preserve unusable property. Since usable
+ * segment has to be present according to VMX spec we can use present
+ * property to amend userspace bug by making unusable segment always
+ * nonpresent. vmx_segment_access_rights() already marks nonpresent
+ * segment as unusable.
+ */
+ var->present = !var->unusable;
+ var->avl = (ar >> 12) & 1;
+ var->l = (ar >> 13) & 1;
+ var->db = (ar >> 14) & 1;
+ var->g = (ar >> 15) & 1;
+}
+
+static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment s;
+
+ if (to_vmx(vcpu)->rmode.vm86_active) {
+ vmx_get_segment(vcpu, &s, seg);
+ return s.base;
+ }
+ return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
+}
+
+static int vmx_get_cpl(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (unlikely(vmx->rmode.vm86_active))
+ return 0;
+ else {
+ int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
+ return VMX_AR_DPL(ar);
+ }
+}
+
+static u32 vmx_segment_access_rights(struct kvm_segment *var)
+{
+ u32 ar;
+
+ if (var->unusable || !var->present)
+ ar = 1 << 16;
+ else {
+ ar = var->type & 15;
+ ar |= (var->s & 1) << 4;
+ ar |= (var->dpl & 3) << 5;
+ ar |= (var->present & 1) << 7;
+ ar |= (var->avl & 1) << 12;
+ ar |= (var->l & 1) << 13;
+ ar |= (var->db & 1) << 14;
+ ar |= (var->g & 1) << 15;
+ }
+
+ return ar;
+}
+
+static void vmx_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+
+ vmx_segment_cache_clear(vmx);
+
+ if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
+ vmx->rmode.segs[seg] = *var;
+ if (seg == VCPU_SREG_TR)
+ vmcs_write16(sf->selector, var->selector);
+ else if (var->s)
+ fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
+ goto out;
+ }
+
+ vmcs_writel(sf->base, var->base);
+ vmcs_write32(sf->limit, var->limit);
+ vmcs_write16(sf->selector, var->selector);
+
+ /*
+ * Fix the "Accessed" bit in AR field of segment registers for older
+ * qemu binaries.
+ * IA32 arch specifies that at the time of processor reset the
+ * "Accessed" bit in the AR field of segment registers is 1. And qemu
+ * is setting it to 0 in the userland code. This causes invalid guest
+ * state vmexit when "unrestricted guest" mode is turned on.
+ * Fix for this setup issue in cpu_reset is being pushed in the qemu
+ * tree. Newer qemu binaries with that qemu fix would not need this
+ * kvm hack.
+ */
+ if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
+ var->type |= 0x1; /* Accessed */
+
+ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
+
+out:
+ vmx->emulation_required = emulation_required(vcpu);
+}
+
+static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+ u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
+
+ *db = (ar >> 14) & 1;
+ *l = (ar >> 13) & 1;
+}
+
+static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
+ dt->address = vmcs_readl(GUEST_IDTR_BASE);
+}
+
+static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
+ vmcs_writel(GUEST_IDTR_BASE, dt->address);
+}
+
+static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
+ dt->address = vmcs_readl(GUEST_GDTR_BASE);
+}
+
+static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
+ vmcs_writel(GUEST_GDTR_BASE, dt->address);
+}
+
+static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment var;
+ u32 ar;
+
+ vmx_get_segment(vcpu, &var, seg);
+ var.dpl = 0x3;
+ if (seg == VCPU_SREG_CS)
+ var.type = 0x3;
+ ar = vmx_segment_access_rights(&var);
+
+ if (var.base != (var.selector << 4))
+ return false;
+ if (var.limit != 0xffff)
+ return false;
+ if (ar != 0xf3)
+ return false;
+
+ return true;
+}
+
+static bool code_segment_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs;
+ unsigned int cs_rpl;
+
+ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ cs_rpl = cs.selector & SEGMENT_RPL_MASK;
+
+ if (cs.unusable)
+ return false;
+ if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
+ return false;
+ if (!cs.s)
+ return false;
+ if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
+ if (cs.dpl > cs_rpl)
+ return false;
+ } else {
+ if (cs.dpl != cs_rpl)
+ return false;
+ }
+ if (!cs.present)
+ return false;
+
+ /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
+ return true;
+}
+
+static bool stack_segment_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment ss;
+ unsigned int ss_rpl;
+
+ vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
+ ss_rpl = ss.selector & SEGMENT_RPL_MASK;
+
+ if (ss.unusable)
+ return true;
+ if (ss.type != 3 && ss.type != 7)
+ return false;
+ if (!ss.s)
+ return false;
+ if (ss.dpl != ss_rpl) /* DPL != RPL */
+ return false;
+ if (!ss.present)
+ return false;
+
+ return true;
+}
+
+static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment var;
+ unsigned int rpl;
+
+ vmx_get_segment(vcpu, &var, seg);
+ rpl = var.selector & SEGMENT_RPL_MASK;
+
+ if (var.unusable)
+ return true;
+ if (!var.s)
+ return false;
+ if (!var.present)
+ return false;
+ if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
+ if (var.dpl < rpl) /* DPL < RPL */
+ return false;
+ }
+
+ /* TODO: Add other members to kvm_segment_field to allow checking for other access
+ * rights flags
+ */
+ return true;
+}
+
+static bool tr_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment tr;
+
+ vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
+
+ if (tr.unusable)
+ return false;
+ if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
+ return false;
+ if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
+ return false;
+ if (!tr.present)
+ return false;
+
+ return true;
+}
+
+static bool ldtr_valid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment ldtr;
+
+ vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
+
+ if (ldtr.unusable)
+ return true;
+ if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
+ return false;
+ if (ldtr.type != 2)
+ return false;
+ if (!ldtr.present)
+ return false;
+
+ return true;
+}
+
+static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs, ss;
+
+ vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
+
+ return ((cs.selector & SEGMENT_RPL_MASK) ==
+ (ss.selector & SEGMENT_RPL_MASK));
+}
+
+static bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu,
+ unsigned int port, int size);
+static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ unsigned long exit_qualification;
+ unsigned short port;
+ int size;
+
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ port = exit_qualification >> 16;
+ size = (exit_qualification & 7) + 1;
+
+ return nested_vmx_check_io_bitmaps(vcpu, port, size);
+}
+
+/*
+ * Check if guest state is valid. Returns true if valid, false if
+ * not.
+ * We assume that registers are always usable
+ */
+static bool guest_state_valid(struct kvm_vcpu *vcpu)
+{
+ if (enable_unrestricted_guest)
+ return true;
+
+ /* real mode guest state checks */
+ if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
+ return false;
+ if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
+ return false;
+ } else {
+ /* protected mode guest state checks */
+ if (!cs_ss_rpl_check(vcpu))
+ return false;
+ if (!code_segment_valid(vcpu))
+ return false;
+ if (!stack_segment_valid(vcpu))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_DS))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_ES))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_FS))
+ return false;
+ if (!data_segment_valid(vcpu, VCPU_SREG_GS))
+ return false;
+ if (!tr_valid(vcpu))
+ return false;
+ if (!ldtr_valid(vcpu))
+ return false;
+ }
+ /* TODO:
+ * - Add checks on RIP
+ * - Add checks on RFLAGS
+ */
+
+ return true;
+}
+
+static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
+}
+
+static int init_rmode_tss(struct kvm *kvm)
+{
+ gfn_t fn;
+ u16 data = 0;
+ int idx, r;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
+ r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
+ if (r < 0)
+ goto out;
+ data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
+ r = kvm_write_guest_page(kvm, fn++, &data,
+ TSS_IOPB_BASE_OFFSET, sizeof(u16));
+ if (r < 0)
+ goto out;
+ r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
+ if (r < 0)
+ goto out;
+ r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
+ if (r < 0)
+ goto out;
+ data = ~0;
+ r = kvm_write_guest_page(kvm, fn, &data,
+ RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
+ sizeof(u8));
+out:
+ srcu_read_unlock(&kvm->srcu, idx);
+ return r;
+}
+
+static int init_rmode_identity_map(struct kvm *kvm)
+{
+ struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
+ int i, idx, r = 0;
+ kvm_pfn_t identity_map_pfn;
+ u32 tmp;
+
+ /* Protect kvm_vmx->ept_identity_pagetable_done. */
+ mutex_lock(&kvm->slots_lock);
+
+ if (likely(kvm_vmx->ept_identity_pagetable_done))
+ goto out2;
+
+ if (!kvm_vmx->ept_identity_map_addr)
+ kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
+ identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
+
+ r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
+ kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
+ if (r < 0)
+ goto out2;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
+ if (r < 0)
+ goto out;
+ /* Set up identity-mapping pagetable for EPT in real mode */
+ for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
+ tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
+ _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
+ r = kvm_write_guest_page(kvm, identity_map_pfn,
+ &tmp, i * sizeof(tmp), sizeof(tmp));
+ if (r < 0)
+ goto out;
+ }
+ kvm_vmx->ept_identity_pagetable_done = true;
+
+out:
+ srcu_read_unlock(&kvm->srcu, idx);
+
+out2:
+ mutex_unlock(&kvm->slots_lock);
+ return r;
+}
+
+static void seg_setup(int seg)
+{
+ const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
+ unsigned int ar;
+
+ vmcs_write16(sf->selector, 0);
+ vmcs_writel(sf->base, 0);
+ vmcs_write32(sf->limit, 0xffff);
+ ar = 0x93;
+ if (seg == VCPU_SREG_CS)
+ ar |= 0x08; /* code segment */
+
+ vmcs_write32(sf->ar_bytes, ar);
+}
+
+static int alloc_apic_access_page(struct kvm *kvm)
+{
+ struct page *page;
+ int r = 0;
+
+ mutex_lock(&kvm->slots_lock);
+ if (kvm->arch.apic_access_page_done)
+ goto out;
+ r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
+ if (r)
+ goto out;
+
+ page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+ if (is_error_page(page)) {
+ r = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * Do not pin the page in memory, so that memory hot-unplug
+ * is able to migrate it.
+ */
+ put_page(page);
+ kvm->arch.apic_access_page_done = true;
+out:
+ mutex_unlock(&kvm->slots_lock);
+ return r;
+}
+
+static int allocate_vpid(void)
+{
+ int vpid;
+
+ if (!enable_vpid)
+ return 0;
+ spin_lock(&vmx_vpid_lock);
+ vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
+ if (vpid < VMX_NR_VPIDS)
+ __set_bit(vpid, vmx_vpid_bitmap);
+ else
+ vpid = 0;
+ spin_unlock(&vmx_vpid_lock);
+ return vpid;
+}
+
+static void free_vpid(int vpid)
+{
+ if (!enable_vpid || vpid == 0)
+ return;
+ spin_lock(&vmx_vpid_lock);
+ __clear_bit(vpid, vmx_vpid_bitmap);
+ spin_unlock(&vmx_vpid_lock);
+}
+
+static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type)
+{
+ int f = sizeof(unsigned long);
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
+
+ if (static_branch_unlikely(&enable_evmcs))
+ evmcs_touch_msr_bitmap();
+
+ /*
+ * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+ * have the write-low and read-high bitmap offsets the wrong way round.
+ * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+ */
+ if (msr <= 0x1fff) {
+ if (type & MSR_TYPE_R)
+ /* read-low */
+ __clear_bit(msr, msr_bitmap + 0x000 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-low */
+ __clear_bit(msr, msr_bitmap + 0x800 / f);
+
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ if (type & MSR_TYPE_R)
+ /* read-high */
+ __clear_bit(msr, msr_bitmap + 0x400 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-high */
+ __clear_bit(msr, msr_bitmap + 0xc00 / f);
+
+ }
+}
+
+static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type)
+{
+ int f = sizeof(unsigned long);
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
+
+ if (static_branch_unlikely(&enable_evmcs))
+ evmcs_touch_msr_bitmap();
+
+ /*
+ * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+ * have the write-low and read-high bitmap offsets the wrong way round.
+ * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+ */
+ if (msr <= 0x1fff) {
+ if (type & MSR_TYPE_R)
+ /* read-low */
+ __set_bit(msr, msr_bitmap + 0x000 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-low */
+ __set_bit(msr, msr_bitmap + 0x800 / f);
+
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ if (type & MSR_TYPE_R)
+ /* read-high */
+ __set_bit(msr, msr_bitmap + 0x400 / f);
+
+ if (type & MSR_TYPE_W)
+ /* write-high */
+ __set_bit(msr, msr_bitmap + 0xc00 / f);
+
+ }
+}
+
+static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
+ u32 msr, int type, bool value)
+{
+ if (value)
+ vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
+ else
+ vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
+}
+
+/*
+ * If a msr is allowed by L0, we should check whether it is allowed by L1.
+ * The corresponding bit will be cleared unless both of L0 and L1 allow it.
+ */
+static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
+ unsigned long *msr_bitmap_nested,
+ u32 msr, int type)
+{
+ int f = sizeof(unsigned long);
+
+ /*
+ * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+ * have the write-low and read-high bitmap offsets the wrong way round.
+ * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+ */
+ if (msr <= 0x1fff) {
+ if (type & MSR_TYPE_R &&
+ !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
+ /* read-low */
+ __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
+
+ if (type & MSR_TYPE_W &&
+ !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
+ /* write-low */
+ __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
+
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ if (type & MSR_TYPE_R &&
+ !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
+ /* read-high */
+ __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
+
+ if (type & MSR_TYPE_W &&
+ !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
+ /* write-high */
+ __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
+
+ }
+}
+
+static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
+{
+ u8 mode = 0;
+
+ if (cpu_has_secondary_exec_ctrls() &&
+ (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
+ mode |= MSR_BITMAP_MODE_X2APIC;
+ if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
+ mode |= MSR_BITMAP_MODE_X2APIC_APICV;
+ }
+
+ return mode;
+}
+
+#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
+
+static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
+ u8 mode)
+{
+ int msr;
+
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+ msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
+ msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
+ }
+
+ if (mode & MSR_BITMAP_MODE_X2APIC) {
+ /*
+ * TPR reads and writes can be virtualized even if virtual interrupt
+ * delivery is not in use.
+ */
+ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
+ if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
+ vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
+ }
+ }
+}
+
+static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+ u8 mode = vmx_msr_bitmap_mode(vcpu);
+ u8 changed = mode ^ vmx->msr_bitmap_mode;
+
+ if (!changed)
+ return;
+
+ if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
+ vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
+
+ vmx->msr_bitmap_mode = mode;
+}
+
+static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
+{
+ return enable_apicv;
+}
+
+static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ gfn_t gfn;
+
+ /*
+ * Don't need to mark the APIC access page dirty; it is never
+ * written to by the CPU during APIC virtualization.
+ */
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
+ gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ }
+
+ if (nested_cpu_has_posted_intr(vmcs12)) {
+ gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ }
+}
+
+
+static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int max_irr;
+ void *vapic_page;
+ u16 status;
+
+ if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
+ return;
+
+ vmx->nested.pi_pending = false;
+ if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+ return;
+
+ max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
+ if (max_irr != 256) {
+ vapic_page = kmap(vmx->nested.virtual_apic_page);
+ __kvm_apic_update_irr(vmx->nested.pi_desc->pir,
+ vapic_page, &max_irr);
+ kunmap(vmx->nested.virtual_apic_page);
+
+ status = vmcs_read16(GUEST_INTR_STATUS);
+ if ((u8)max_irr > ((u8)status & 0xff)) {
+ status &= ~0xff;
+ status |= (u8)max_irr;
+ vmcs_write16(GUEST_INTR_STATUS, status);
+ }
+ }
+
+ nested_mark_vmcs12_pages_dirty(vcpu);
+}
+
+static u8 vmx_get_rvi(void)
+{
+ return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
+}
+
+static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ void *vapic_page;
+ u32 vppr;
+ int rvi;
+
+ if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
+ !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
+ WARN_ON_ONCE(!vmx->nested.virtual_apic_page))
+ return false;
+
+ rvi = vmx_get_rvi();
+
+ vapic_page = kmap(vmx->nested.virtual_apic_page);
+ vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
+ kunmap(vmx->nested.virtual_apic_page);
+
+ return ((rvi & 0xf0) > (vppr & 0xf0));
+}
+
+static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
+ bool nested)
+{
+#ifdef CONFIG_SMP
+ int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
+
+ if (vcpu->mode == IN_GUEST_MODE) {
+ /*
+ * The vector of interrupt to be delivered to vcpu had
+ * been set in PIR before this function.
+ *
+ * Following cases will be reached in this block, and
+ * we always send a notification event in all cases as
+ * explained below.
+ *
+ * Case 1: vcpu keeps in non-root mode. Sending a
+ * notification event posts the interrupt to vcpu.
+ *
+ * Case 2: vcpu exits to root mode and is still
+ * runnable. PIR will be synced to vIRR before the
+ * next vcpu entry. Sending a notification event in
+ * this case has no effect, as vcpu is not in root
+ * mode.
+ *
+ * Case 3: vcpu exits to root mode and is blocked.
+ * vcpu_block() has already synced PIR to vIRR and
+ * never blocks vcpu if vIRR is not cleared. Therefore,
+ * a blocked vcpu here does not wait for any requested
+ * interrupts in PIR, and sending a notification event
+ * which has no effect is safe here.
+ */
+
+ apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
+ return true;
+ }
+#endif
+ return false;
+}
+
+static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
+ int vector)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (is_guest_mode(vcpu) &&
+ vector == vmx->nested.posted_intr_nv) {
+ /*
+ * If a posted intr is not recognized by hardware,
+ * we will accomplish it in the next vmentry.
+ */
+ vmx->nested.pi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ /* the PIR and ON have been set by L1. */
+ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
+ kvm_vcpu_kick(vcpu);
+ return 0;
+ }
+ return -1;
+}
+/*
+ * Send interrupt to vcpu via posted interrupt way.
+ * 1. If target vcpu is running(non-root mode), send posted interrupt
+ * notification to vcpu and hardware will sync PIR to vIRR atomically.
+ * 2. If target vcpu isn't running(root mode), kick it to pick up the
+ * interrupt from PIR in next vmentry.
+ */
+static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int r;
+
+ r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
+ if (!r)
+ return 0;
+
+ if (!vcpu->arch.apicv_active)
+ return -1;
+
+ if (pi_test_and_set_pir(vector, &vmx->pi_desc))
+ return 0;
+
+ /* If a previous notification has sent the IPI, nothing to do. */
+ if (pi_test_and_set_on(&vmx->pi_desc))
+ return 0;
+
+ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
+ kvm_vcpu_kick(vcpu);
+
+ return 0;
+}
+
+/*
+ * Set up the vmcs's constant host-state fields, i.e., host-state fields that
+ * will not change in the lifetime of the guest.
+ * Note that host-state that does change is set elsewhere. E.g., host-state
+ * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
+ */
+static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
+{
+ u32 low32, high32;
+ unsigned long tmpl;
+ struct desc_ptr dt;
+ unsigned long cr0, cr3, cr4;
+
+ cr0 = read_cr0();
+ WARN_ON(cr0 & X86_CR0_TS);
+ vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
+
+ /*
+ * Save the most likely value for this task's CR3 in the VMCS.
+ * We can't use __get_current_cr3_fast() because we're not atomic.
+ */
+ cr3 = __read_cr3();
+ vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
+ vmx->loaded_vmcs->host_state.cr3 = cr3;
+
+ /* Save the most likely value for this task's CR4 in the VMCS. */
+ cr4 = cr4_read_shadow();
+ vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
+ vmx->loaded_vmcs->host_state.cr4 = cr4;
+
+ vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
+#ifdef CONFIG_X86_64
+ /*
+ * Load null selectors, so we can avoid reloading them in
+ * vmx_prepare_switch_to_host(), in case userspace uses
+ * the null selectors too (the expected case).
+ */
+ vmcs_write16(HOST_DS_SELECTOR, 0);
+ vmcs_write16(HOST_ES_SELECTOR, 0);
+#else
+ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+#endif
+ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
+
+ store_idt(&dt);
+ vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
+ vmx->host_idt_base = dt.address;
+
+ vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
+
+ rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
+ vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
+ rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
+ vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
+
+ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
+ rdmsr(MSR_IA32_CR_PAT, low32, high32);
+ vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
+ }
+}
+
+static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
+{
+ BUILD_BUG_ON(KVM_CR4_GUEST_OWNED_BITS & ~KVM_POSSIBLE_CR4_GUEST_BITS);
+
+ vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
+ if (enable_ept)
+ vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
+ if (is_guest_mode(&vmx->vcpu))
+ vmx->vcpu.arch.cr4_guest_owned_bits &=
+ ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
+ vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
+}
+
+static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
+{
+ u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
+
+ if (!kvm_vcpu_apicv_active(&vmx->vcpu))
+ pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+
+ if (!enable_vnmi)
+ pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
+
+ /* Enable the preemption timer dynamically */
+ pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ return pin_based_exec_ctrl;
+}
+
+static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+ if (cpu_has_secondary_exec_ctrls()) {
+ if (kvm_vcpu_apicv_active(vcpu))
+ vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ else
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ }
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmx_update_msr_bitmap(vcpu);
+}
+
+static u32 vmx_exec_control(struct vcpu_vmx *vmx)
+{
+ u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
+
+ if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
+ exec_control &= ~CPU_BASED_MOV_DR_EXITING;
+
+ if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
+ exec_control &= ~CPU_BASED_TPR_SHADOW;
+#ifdef CONFIG_X86_64
+ exec_control |= CPU_BASED_CR8_STORE_EXITING |
+ CPU_BASED_CR8_LOAD_EXITING;
+#endif
+ }
+ if (!enable_ept)
+ exec_control |= CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_INVLPG_EXITING;
+ if (kvm_mwait_in_guest(vmx->vcpu.kvm))
+ exec_control &= ~(CPU_BASED_MWAIT_EXITING |
+ CPU_BASED_MONITOR_EXITING);
+ if (kvm_hlt_in_guest(vmx->vcpu.kvm))
+ exec_control &= ~CPU_BASED_HLT_EXITING;
+ return exec_control;
+}
+
+static bool vmx_rdrand_supported(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_RDRAND_EXITING;
+}
+
+static bool vmx_rdseed_supported(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_RDSEED_EXITING;
+}
+
+static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
+{
+ struct kvm_vcpu *vcpu = &vmx->vcpu;
+
+ u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
+
+ if (!cpu_need_virtualize_apic_accesses(vcpu))
+ exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ if (vmx->vpid == 0)
+ exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
+ if (!enable_ept) {
+ exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ enable_unrestricted_guest = 0;
+ }
+ if (!enable_unrestricted_guest)
+ exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+ if (kvm_pause_in_guest(vmx->vcpu.kvm))
+ exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+ if (!kvm_vcpu_apicv_active(vcpu))
+ exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+
+ /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
+ * in vmx_set_cr4. */
+ exec_control &= ~SECONDARY_EXEC_DESC;
+
+ /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
+ (handle_vmptrld).
+ We can NOT enable shadow_vmcs here because we don't have yet
+ a current VMCS12
+ */
+ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
+
+ if (!enable_pml)
+ exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
+ if (vmx_xsaves_supported()) {
+ /* Exposing XSAVES only when XSAVE is exposed */
+ bool xsaves_enabled =
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
+
+ if (!xsaves_enabled)
+ exec_control &= ~SECONDARY_EXEC_XSAVES;
+
+ if (nested) {
+ if (xsaves_enabled)
+ vmx->nested.msrs.secondary_ctls_high |=
+ SECONDARY_EXEC_XSAVES;
+ else
+ vmx->nested.msrs.secondary_ctls_high &=
+ ~SECONDARY_EXEC_XSAVES;
+ }
+ }
+
+ if (vmx_rdtscp_supported()) {
+ bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
+ if (!rdtscp_enabled)
+ exec_control &= ~SECONDARY_EXEC_RDTSCP;
+
+ if (nested) {
+ if (rdtscp_enabled)
+ vmx->nested.msrs.secondary_ctls_high |=
+ SECONDARY_EXEC_RDTSCP;
+ else
+ vmx->nested.msrs.secondary_ctls_high &=
+ ~SECONDARY_EXEC_RDTSCP;
+ }
+ }
+
+ if (vmx_invpcid_supported()) {
+ /* Exposing INVPCID only when PCID is exposed */
+ bool invpcid_enabled =
+ guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_PCID);
+
+ if (!invpcid_enabled) {
+ exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
+ guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
+ }
+
+ if (nested) {
+ if (invpcid_enabled)
+ vmx->nested.msrs.secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_INVPCID;
+ else
+ vmx->nested.msrs.secondary_ctls_high &=
+ ~SECONDARY_EXEC_ENABLE_INVPCID;
+ }
+ }
+
+ if (vmx_rdrand_supported()) {
+ bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
+ if (rdrand_enabled)
+ exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
+
+ if (nested) {
+ if (rdrand_enabled)
+ vmx->nested.msrs.secondary_ctls_high |=
+ SECONDARY_EXEC_RDRAND_EXITING;
+ else
+ vmx->nested.msrs.secondary_ctls_high &=
+ ~SECONDARY_EXEC_RDRAND_EXITING;
+ }
+ }
+
+ if (vmx_rdseed_supported()) {
+ bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
+ if (rdseed_enabled)
+ exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
+
+ if (nested) {
+ if (rdseed_enabled)
+ vmx->nested.msrs.secondary_ctls_high |=
+ SECONDARY_EXEC_RDSEED_EXITING;
+ else
+ vmx->nested.msrs.secondary_ctls_high &=
+ ~SECONDARY_EXEC_RDSEED_EXITING;
+ }
+ }
+
+ vmx->secondary_exec_control = exec_control;
+}
+
+static void ept_set_mmio_spte_mask(void)
+{
+ /*
+ * EPT Misconfigurations can be generated if the value of bits 2:0
+ * of an EPT paging-structure entry is 110b (write/execute).
+ */
+ kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
+ VMX_EPT_MISCONFIG_WX_VALUE);
+}
+
+#define VMX_XSS_EXIT_BITMAP 0
+/*
+ * Sets up the vmcs for emulated real mode.
+ */
+static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
+{
+ int i;
+
+ if (enable_shadow_vmcs) {
+ /*
+ * At vCPU creation, "VMWRITE to any supported field
+ * in the VMCS" is supported, so use the more
+ * permissive vmx_vmread_bitmap to specify both read
+ * and write permissions for the shadow VMCS.
+ */
+ vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
+ vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap));
+ }
+ if (cpu_has_vmx_msr_bitmap())
+ vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
+
+ vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
+
+ /* Control */
+ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+ vmx->hv_deadline_tsc = -1;
+
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
+
+ if (cpu_has_secondary_exec_ctrls()) {
+ vmx_compute_secondary_exec_control(vmx);
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ vmx->secondary_exec_control);
+ }
+
+ if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
+ vmcs_write64(EOI_EXIT_BITMAP0, 0);
+ vmcs_write64(EOI_EXIT_BITMAP1, 0);
+ vmcs_write64(EOI_EXIT_BITMAP2, 0);
+ vmcs_write64(EOI_EXIT_BITMAP3, 0);
+
+ vmcs_write16(GUEST_INTR_STATUS, 0);
+
+ vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+ vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
+ }
+
+ if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
+ vmcs_write32(PLE_GAP, ple_gap);
+ vmx->ple_window = ple_window;
+ vmx->ple_window_dirty = true;
+ }
+
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+ vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
+
+ vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
+ vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
+ vmx_set_constant_host_state(vmx);
+ vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
+ vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
+
+ if (cpu_has_vmx_vmfunc())
+ vmcs_write64(VM_FUNCTION_CONTROL, 0);
+
+ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
+
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+ vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
+
+ for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
+ u32 index = vmx_msr_index[i];
+ u32 data_low, data_high;
+ int j = vmx->nmsrs;
+
+ if (rdmsr_safe(index, &data_low, &data_high) < 0)
+ continue;
+ if (wrmsr_safe(index, data_low, data_high) < 0)
+ continue;
+ vmx->guest_msrs[j].index = i;
+ vmx->guest_msrs[j].data = 0;
+ vmx->guest_msrs[j].mask = -1ull;
+ ++vmx->nmsrs;
+ }
+
+ vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
+
+ /* 22.2.1, 20.8.1 */
+ vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
+
+ vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
+
+ set_cr4_guest_host_mask(vmx);
+
+ if (vmx_xsaves_supported())
+ vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
+
+ if (enable_pml) {
+ ASSERT(vmx->pml_pg);
+ vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
+ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ }
+
+ if (cpu_has_vmx_encls_vmexit())
+ vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+}
+
+static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct msr_data apic_base_msr;
+ u64 cr0;
+
+ vmx->rmode.vm86_active = 0;
+ vmx->spec_ctrl = 0;
+
+ vcpu->arch.microcode_version = 0x100000000ULL;
+ vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
+ kvm_set_cr8(vcpu, 0);
+
+ if (!init_event) {
+ apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_reset_bsp(vcpu))
+ apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
+ apic_base_msr.host_initiated = true;
+ kvm_set_apic_base(vcpu, &apic_base_msr);
+ }
+
+ vmx_segment_cache_clear(vmx);
+
+ seg_setup(VCPU_SREG_CS);
+ vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
+ vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
+
+ seg_setup(VCPU_SREG_DS);
+ seg_setup(VCPU_SREG_ES);
+ seg_setup(VCPU_SREG_FS);
+ seg_setup(VCPU_SREG_GS);
+ seg_setup(VCPU_SREG_SS);
+
+ vmcs_write16(GUEST_TR_SELECTOR, 0);
+ vmcs_writel(GUEST_TR_BASE, 0);
+ vmcs_write32(GUEST_TR_LIMIT, 0xffff);
+ vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
+
+ vmcs_write16(GUEST_LDTR_SELECTOR, 0);
+ vmcs_writel(GUEST_LDTR_BASE, 0);
+ vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
+ vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
+
+ if (!init_event) {
+ vmcs_write32(GUEST_SYSENTER_CS, 0);
+ vmcs_writel(GUEST_SYSENTER_ESP, 0);
+ vmcs_writel(GUEST_SYSENTER_EIP, 0);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+ }
+
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ kvm_rip_write(vcpu, 0xfff0);
+
+ vmcs_writel(GUEST_GDTR_BASE, 0);
+ vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
+
+ vmcs_writel(GUEST_IDTR_BASE, 0);
+ vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
+
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+ if (kvm_mpx_supported())
+ vmcs_write64(GUEST_BNDCFGS, 0);
+
+ setup_msrs(vmx);
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
+
+ if (cpu_has_vmx_tpr_shadow() && !init_event) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+ if (cpu_need_tpr_shadow(vcpu))
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+ __pa(vcpu->arch.apic->regs));
+ vmcs_write32(TPR_THRESHOLD, 0);
+ }
+
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+
+ if (vmx->vpid != 0)
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+
+ cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
+ vmx->vcpu.arch.cr0 = cr0;
+ vmx_set_cr0(vcpu, cr0); /* enter rmode */
+ vmx_set_cr4(vcpu, 0);
+ vmx_set_efer(vcpu, 0);
+
+ update_exception_bitmap(vcpu);
+
+ vpid_sync_context(vmx->vpid);
+ if (init_event)
+ vmx_clear_hlt(vcpu);
+
+ vmx_update_fb_clear_dis(vcpu, vmx);
+}
+
+/*
+ * In nested virtualization, check if L1 asked to exit on external interrupts.
+ * For most existing hypervisors, this will always return true.
+ */
+static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
+{
+ return get_vmcs12(vcpu)->pin_based_vm_exec_control &
+ PIN_BASED_EXT_INTR_MASK;
+}
+
+/*
+ * In nested virtualization, check if L1 has set
+ * VM_EXIT_ACK_INTR_ON_EXIT
+ */
+static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
+{
+ return get_vmcs12(vcpu)->vm_exit_controls &
+ VM_EXIT_ACK_INTR_ON_EXIT;
+}
+
+static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
+{
+ return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
+}
+
+static void enable_irq_window(struct kvm_vcpu *vcpu)
+{
+ vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_VIRTUAL_INTR_PENDING);
+}
+
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+ if (!enable_vnmi ||
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+ enable_irq_window(vcpu);
+ return;
+ }
+
+ vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_VIRTUAL_NMI_PENDING);
+}
+
+static void vmx_inject_irq(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ uint32_t intr;
+ int irq = vcpu->arch.interrupt.nr;
+
+ trace_kvm_inj_virq(irq);
+
+ ++vcpu->stat.irq_injections;
+ if (vmx->rmode.vm86_active) {
+ int inc_eip = 0;
+ if (vcpu->arch.interrupt.soft)
+ inc_eip = vcpu->arch.event_exit_inst_len;
+ if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return;
+ }
+ intr = irq | INTR_INFO_VALID_MASK;
+ if (vcpu->arch.interrupt.soft) {
+ intr |= INTR_TYPE_SOFT_INTR;
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmx->vcpu.arch.event_exit_inst_len);
+ } else
+ intr |= INTR_TYPE_EXT_INTR;
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
+
+ vmx_clear_hlt(vcpu);
+}
+
+static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!enable_vnmi) {
+ /*
+ * Tracking the NMI-blocked state in software is built upon
+ * finding the next open IRQ window. This, in turn, depends on
+ * well-behaving guests: They have to keep IRQs disabled at
+ * least as long as the NMI handler runs. Otherwise we may
+ * cause NMI nesting, maybe breaking the guest. But as this is
+ * highly unlikely, we can live with the residual risk.
+ */
+ vmx->loaded_vmcs->soft_vnmi_blocked = 1;
+ vmx->loaded_vmcs->vnmi_blocked_time = 0;
+ }
+
+ ++vcpu->stat.nmi_injections;
+ vmx->loaded_vmcs->nmi_known_unmasked = false;
+
+ if (vmx->rmode.vm86_active) {
+ if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return;
+ }
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
+
+ vmx_clear_hlt(vcpu);
+}
+
+static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool masked;
+
+ if (!enable_vnmi)
+ return vmx->loaded_vmcs->soft_vnmi_blocked;
+ if (vmx->loaded_vmcs->nmi_known_unmasked)
+ return false;
+ masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
+ vmx->loaded_vmcs->nmi_known_unmasked = !masked;
+ return masked;
+}
+
+static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!enable_vnmi) {
+ if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
+ vmx->loaded_vmcs->soft_vnmi_blocked = masked;
+ vmx->loaded_vmcs->vnmi_blocked_time = 0;
+ }
+ } else {
+ vmx->loaded_vmcs->nmi_known_unmasked = !masked;
+ if (masked)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ else
+ vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ }
+}
+
+static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
+{
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return 0;
+
+ if (!enable_vnmi &&
+ to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
+ return 0;
+
+ return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
+ | GUEST_INTR_STATE_NMI));
+}
+
+static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return false;
+
+ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ return true;
+
+ return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+ !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+}
+
+static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+ int ret;
+
+ if (enable_unrestricted_guest)
+ return 0;
+
+ ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
+ PAGE_SIZE * 3);
+ if (ret)
+ return ret;
+ to_kvm_vmx(kvm)->tss_addr = addr;
+ return init_rmode_tss(kvm);
+}
+
+static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+{
+ to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
+ return 0;
+}
+
+static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
+{
+ switch (vec) {
+ case BP_VECTOR:
+ /*
+ * Update instruction length as we may reinject the exception
+ * from user space while in guest debugging mode.
+ */
+ to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ return false;
+ /* fall through */
+ case DB_VECTOR:
+ if (vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+ return false;
+ /* fall through */
+ case DE_VECTOR:
+ case OF_VECTOR:
+ case BR_VECTOR:
+ case UD_VECTOR:
+ case DF_VECTOR:
+ case SS_VECTOR:
+ case GP_VECTOR:
+ case MF_VECTOR:
+ return true;
+ break;
+ }
+ return false;
+}
+
+static int handle_rmode_exception(struct kvm_vcpu *vcpu,
+ int vec, u32 err_code)
+{
+ /*
+ * Instruction with address size override prefix opcode 0x67
+ * Cause the #SS fault with 0 error code in VM86 mode.
+ */
+ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
+ if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) {
+ if (vcpu->arch.halt_request) {
+ vcpu->arch.halt_request = 0;
+ return kvm_vcpu_halt(vcpu);
+ }
+ return 1;
+ }
+ return 0;
+ }
+
+ /*
+ * Forward all other exceptions that are valid in real mode.
+ * FIXME: Breaks guest debugging in real mode, needs to be fixed with
+ * the required debugging infrastructure rework.
+ */
+ kvm_queue_exception(vcpu, vec);
+ return 1;
+}
+
+/*
+ * Trigger machine check on the host. We assume all the MSRs are already set up
+ * by the CPU and that we still run on the same CPU as the MCE occurred on.
+ * We pass a fake environment to the machine check handler because we want
+ * the guest to be always treated like user space, no matter what context
+ * it used internally.
+ */
+static void kvm_machine_check(void)
+{
+#if defined(CONFIG_X86_MCE)
+ struct pt_regs regs = {
+ .cs = 3, /* Fake ring 3 no matter what the guest ran on */
+ .flags = X86_EFLAGS_IF,
+ };
+
+ do_machine_check(&regs, 0);
+#endif
+}
+
+static int handle_machine_check(struct kvm_vcpu *vcpu)
+{
+ /* already handled by vcpu_run */
+ return 1;
+}
+
+static int handle_exception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_run *kvm_run = vcpu->run;
+ u32 intr_info, ex_no, error_code;
+ unsigned long cr2, rip, dr6;
+ u32 vect_info;
+ enum emulation_result er;
+
+ vect_info = vmx->idt_vectoring_info;
+ intr_info = vmx->exit_intr_info;
+
+ if (is_machine_check(intr_info))
+ return handle_machine_check(vcpu);
+
+ if (is_nmi(intr_info))
+ return 1; /* already handled by vmx_vcpu_run() */
+
+ if (is_invalid_opcode(intr_info))
+ return handle_ud(vcpu);
+
+ error_code = 0;
+ if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
+ error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+
+ if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
+ WARN_ON_ONCE(!enable_vmware_backdoor);
+ er = kvm_emulate_instruction(vcpu,
+ EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
+ if (er == EMULATE_USER_EXIT)
+ return 0;
+ else if (er != EMULATE_DONE)
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+ return 1;
+ }
+
+ /*
+ * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
+ * MMIO, it is better to report an internal error.
+ * See the comments in vmx_handle_exit.
+ */
+ if ((vect_info & VECTORING_INFO_VALID_MASK) &&
+ !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
+ vcpu->run->internal.ndata = 3;
+ vcpu->run->internal.data[0] = vect_info;
+ vcpu->run->internal.data[1] = intr_info;
+ vcpu->run->internal.data[2] = error_code;
+ return 0;
+ }
+
+ if (is_page_fault(intr_info)) {
+ cr2 = vmcs_readl(EXIT_QUALIFICATION);
+ /* EPT won't cause page fault directly */
+ WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
+ return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
+ }
+
+ ex_no = intr_info & INTR_INFO_VECTOR_MASK;
+
+ if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
+ return handle_rmode_exception(vcpu, ex_no, error_code);
+
+ switch (ex_no) {
+ case AC_VECTOR:
+ kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
+ return 1;
+ case DB_VECTOR:
+ dr6 = vmcs_readl(EXIT_QUALIFICATION);
+ if (!(vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+ vcpu->arch.dr6 &= ~15;
+ vcpu->arch.dr6 |= dr6 | DR6_RTM;
+ if (is_icebp(intr_info))
+ skip_emulated_instruction(vcpu);
+
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ return 1;
+ }
+ kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
+ kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
+ /* fall through */
+ case BP_VECTOR:
+ /*
+ * Update instruction length as we may reinject #BP from
+ * user space while in guest debugging mode. Reading it for
+ * #DB as well causes no harm, it is not used in that case.
+ */
+ vmx->vcpu.arch.event_exit_inst_len =
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ rip = kvm_rip_read(vcpu);
+ kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
+ kvm_run->debug.arch.exception = ex_no;
+ break;
+ default:
+ kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
+ kvm_run->ex.exception = ex_no;
+ kvm_run->ex.error_code = error_code;
+ break;
+ }
+ return 0;
+}
+
+static int handle_external_interrupt(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.irq_exits;
+ return 1;
+}
+
+static int handle_triple_fault(struct kvm_vcpu *vcpu)
+{
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ vcpu->mmio_needed = 0;
+ return 0;
+}
+
+static int handle_io(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+ int size, in, string;
+ unsigned port;
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ string = (exit_qualification & 16) != 0;
+
+ ++vcpu->stat.io_exits;
+
+ if (string)
+ return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+
+ port = exit_qualification >> 16;
+ size = (exit_qualification & 7) + 1;
+ in = (exit_qualification & 8) != 0;
+
+ return kvm_fast_pio(vcpu, size, port, in);
+}
+
+static void
+vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+{
+ /*
+ * Patch in the VMCALL instruction:
+ */
+ hypercall[0] = 0x0f;
+ hypercall[1] = 0x01;
+ hypercall[2] = 0xc1;
+}
+
+/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
+static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ if (is_guest_mode(vcpu)) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned long orig_val = val;
+
+ /*
+ * We get here when L2 changed cr0 in a way that did not change
+ * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
+ * but did change L0 shadowed bits. So we first calculate the
+ * effective cr0 value that L1 would like to write into the
+ * hardware. It consists of the L2-owned bits from the new
+ * value combined with the L1-owned bits from L1's guest_cr0.
+ */
+ val = (val & ~vmcs12->cr0_guest_host_mask) |
+ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
+
+ if (!nested_guest_cr0_valid(vcpu, val))
+ return 1;
+
+ if (kvm_set_cr0(vcpu, val))
+ return 1;
+ vmcs_writel(CR0_READ_SHADOW, orig_val);
+ return 0;
+ } else {
+ if (to_vmx(vcpu)->nested.vmxon &&
+ !nested_host_cr0_valid(vcpu, val))
+ return 1;
+
+ return kvm_set_cr0(vcpu, val);
+ }
+}
+
+static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ if (is_guest_mode(vcpu)) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned long orig_val = val;
+
+ /* analogously to handle_set_cr0 */
+ val = (val & ~vmcs12->cr4_guest_host_mask) |
+ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
+ if (kvm_set_cr4(vcpu, val))
+ return 1;
+ vmcs_writel(CR4_READ_SHADOW, orig_val);
+ return 0;
+ } else
+ return kvm_set_cr4(vcpu, val);
+}
+
+static int handle_desc(struct kvm_vcpu *vcpu)
+{
+ WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
+ return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+}
+
+static int handle_cr(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification, val;
+ int cr;
+ int reg;
+ int err;
+ int ret;
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ cr = exit_qualification & 15;
+ reg = (exit_qualification >> 8) & 15;
+ switch ((exit_qualification >> 4) & 3) {
+ case 0: /* mov to cr */
+ val = kvm_register_readl(vcpu, reg);
+ trace_kvm_cr_write(cr, val);
+ switch (cr) {
+ case 0:
+ err = handle_set_cr0(vcpu, val);
+ return kvm_complete_insn_gp(vcpu, err);
+ case 3:
+ WARN_ON_ONCE(enable_unrestricted_guest);
+ err = kvm_set_cr3(vcpu, val);
+ return kvm_complete_insn_gp(vcpu, err);
+ case 4:
+ err = handle_set_cr4(vcpu, val);
+ return kvm_complete_insn_gp(vcpu, err);
+ case 8: {
+ u8 cr8_prev = kvm_get_cr8(vcpu);
+ u8 cr8 = (u8)val;
+ err = kvm_set_cr8(vcpu, cr8);
+ ret = kvm_complete_insn_gp(vcpu, err);
+ if (lapic_in_kernel(vcpu))
+ return ret;
+ if (cr8_prev <= cr8)
+ return ret;
+ /*
+ * TODO: we might be squashing a
+ * KVM_GUESTDBG_SINGLESTEP-triggered
+ * KVM_EXIT_DEBUG here.
+ */
+ vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
+ return 0;
+ }
+ }
+ break;
+ case 2: /* clts */
+ WARN_ONCE(1, "Guest should always own CR0.TS");
+ vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+ trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
+ return kvm_skip_emulated_instruction(vcpu);
+ case 1: /*mov from cr*/
+ switch (cr) {
+ case 3:
+ WARN_ON_ONCE(enable_unrestricted_guest);
+ val = kvm_read_cr3(vcpu);
+ kvm_register_write(vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
+ return kvm_skip_emulated_instruction(vcpu);
+ case 8:
+ val = kvm_get_cr8(vcpu);
+ kvm_register_write(vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ break;
+ case 3: /* lmsw */
+ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
+ trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
+ kvm_lmsw(vcpu, val);
+
+ return kvm_skip_emulated_instruction(vcpu);
+ default:
+ break;
+ }
+ vcpu->run->exit_reason = 0;
+ vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
+ (int)(exit_qualification >> 4) & 3, cr);
+ return 0;
+}
+
+static int handle_dr(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+ int dr, dr7, reg;
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
+
+ /* First, if DR does not exist, trigger UD */
+ if (!kvm_require_dr(vcpu, dr))
+ return 1;
+
+ /* Do not handle if the CPL > 0, will trigger GP on re-entry */
+ if (!kvm_require_cpl(vcpu, 0))
+ return 1;
+ dr7 = vmcs_readl(GUEST_DR7);
+ if (dr7 & DR7_GD) {
+ /*
+ * As the vm-exit takes precedence over the debug trap, we
+ * need to emulate the latter, either for the host or the
+ * guest debugging itself.
+ */
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
+ vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
+ vcpu->run->debug.arch.dr7 = dr7;
+ vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
+ vcpu->run->debug.arch.exception = DB_VECTOR;
+ vcpu->run->exit_reason = KVM_EXIT_DEBUG;
+ return 0;
+ } else {
+ vcpu->arch.dr6 &= ~15;
+ vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ return 1;
+ }
+ }
+
+ if (vcpu->guest_debug == 0) {
+ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_MOV_DR_EXITING);
+
+ /*
+ * No more DR vmexits; force a reload of the debug registers
+ * and reenter on this instruction. The next vmexit will
+ * retrieve the full state of the debug registers.
+ */
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
+ return 1;
+ }
+
+ reg = DEBUG_REG_ACCESS_REG(exit_qualification);
+ if (exit_qualification & TYPE_MOV_FROM_DR) {
+ unsigned long val;
+
+ if (kvm_get_dr(vcpu, dr, &val))
+ return 1;
+ kvm_register_write(vcpu, reg, val);
+ } else
+ if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
+ return 1;
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.dr6;
+}
+
+static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
+{
+}
+
+static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+ get_debugreg(vcpu->arch.db[0], 0);
+ get_debugreg(vcpu->arch.db[1], 1);
+ get_debugreg(vcpu->arch.db[2], 2);
+ get_debugreg(vcpu->arch.db[3], 3);
+ get_debugreg(vcpu->arch.dr6, 6);
+ vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
+
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
+ vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
+}
+
+static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ vmcs_writel(GUEST_DR7, val);
+}
+
+static int handle_cpuid(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_cpuid(vcpu);
+}
+
+static int handle_rdmsr(struct kvm_vcpu *vcpu)
+{
+ u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
+ struct msr_data msr_info;
+
+ msr_info.index = ecx;
+ msr_info.host_initiated = false;
+ if (vmx_get_msr(vcpu, &msr_info)) {
+ trace_kvm_msr_read_ex(ecx);
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ trace_kvm_msr_read(ecx, msr_info.data);
+
+ /* FIXME: handling of bits 32:63 of rax, rdx */
+ vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u;
+ vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int handle_wrmsr(struct kvm_vcpu *vcpu)
+{
+ struct msr_data msr;
+ u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
+ u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
+ | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
+
+ msr.data = data;
+ msr.index = ecx;
+ msr.host_initiated = false;
+ if (kvm_set_msr(vcpu, &msr) != 0) {
+ trace_kvm_msr_write_ex(ecx, data);
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ trace_kvm_msr_write(ecx, data);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
+{
+ kvm_apic_update_ppr(vcpu);
+ return 1;
+}
+
+static int handle_interrupt_window(struct kvm_vcpu *vcpu)
+{
+ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_VIRTUAL_INTR_PENDING);
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ ++vcpu->stat.irq_window_exits;
+ return 1;
+}
+
+static int handle_halt(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_halt(vcpu);
+}
+
+static int handle_vmcall(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_hypercall(vcpu);
+}
+
+static int handle_invd(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+}
+
+static int handle_invlpg(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ kvm_mmu_invlpg(vcpu, exit_qualification);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int handle_rdpmc(struct kvm_vcpu *vcpu)
+{
+ int err;
+
+ err = kvm_rdpmc(vcpu);
+ return kvm_complete_insn_gp(vcpu, err);
+}
+
+static int handle_wbinvd(struct kvm_vcpu *vcpu)
+{
+ return kvm_emulate_wbinvd(vcpu);
+}
+
+static int handle_xsetbv(struct kvm_vcpu *vcpu)
+{
+ u64 new_bv = kvm_read_edx_eax(vcpu);
+ u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
+
+ if (kvm_set_xcr(vcpu, index, new_bv) == 0)
+ return kvm_skip_emulated_instruction(vcpu);
+ return 1;
+}
+
+static int handle_xsaves(struct kvm_vcpu *vcpu)
+{
+ kvm_skip_emulated_instruction(vcpu);
+ WARN(1, "this should never happen\n");
+ return 1;
+}
+
+static int handle_xrstors(struct kvm_vcpu *vcpu)
+{
+ kvm_skip_emulated_instruction(vcpu);
+ WARN(1, "this should never happen\n");
+ return 1;
+}
+
+static int handle_apic_access(struct kvm_vcpu *vcpu)
+{
+ if (likely(fasteoi)) {
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ int access_type, offset;
+
+ access_type = exit_qualification & APIC_ACCESS_TYPE;
+ offset = exit_qualification & APIC_ACCESS_OFFSET;
+ /*
+ * Sane guest uses MOV to write EOI, with written value
+ * not cared. So make a short-circuit here by avoiding
+ * heavy instruction emulation.
+ */
+ if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
+ (offset == APIC_EOI)) {
+ kvm_lapic_set_eoi(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ }
+ return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+}
+
+static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ int vector = exit_qualification & 0xff;
+
+ /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
+ kvm_apic_set_eoi_accelerated(vcpu, vector);
+ return 1;
+}
+
+static int handle_apic_write(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ u32 offset = exit_qualification & 0xfff;
+
+ /* APIC-write VM exit is trap-like and thus no need to adjust IP */
+ kvm_apic_write_nodecode(vcpu, offset);
+ return 1;
+}
+
+static int handle_task_switch(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long exit_qualification;
+ bool has_error_code = false;
+ u32 error_code = 0;
+ u16 tss_selector;
+ int reason, type, idt_v, idt_index;
+
+ idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+ idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
+ type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ reason = (u32)exit_qualification >> 30;
+ if (reason == TASK_SWITCH_GATE && idt_v) {
+ switch (type) {
+ case INTR_TYPE_NMI_INTR:
+ vcpu->arch.nmi_injected = false;
+ vmx_set_nmi_mask(vcpu, true);
+ break;
+ case INTR_TYPE_EXT_INTR:
+ case INTR_TYPE_SOFT_INTR:
+ kvm_clear_interrupt_queue(vcpu);
+ break;
+ case INTR_TYPE_HARD_EXCEPTION:
+ if (vmx->idt_vectoring_info &
+ VECTORING_INFO_DELIVER_CODE_MASK) {
+ has_error_code = true;
+ error_code =
+ vmcs_read32(IDT_VECTORING_ERROR_CODE);
+ }
+ /* fall through */
+ case INTR_TYPE_SOFT_EXCEPTION:
+ kvm_clear_exception_queue(vcpu);
+ break;
+ default:
+ break;
+ }
+ }
+ tss_selector = exit_qualification;
+
+ if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
+ type != INTR_TYPE_EXT_INTR &&
+ type != INTR_TYPE_NMI_INTR))
+ skip_emulated_instruction(vcpu);
+
+ if (kvm_task_switch(vcpu, tss_selector,
+ type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
+ has_error_code, error_code) == EMULATE_FAIL) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return 0;
+ }
+
+ /*
+ * TODO: What about debug traps on tss switch?
+ * Are we supposed to inject them and update dr6?
+ */
+
+ return 1;
+}
+
+static int handle_ept_violation(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+ gpa_t gpa;
+ u64 error_code;
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ /*
+ * EPT violation happened while executing iret from NMI,
+ * "blocked by NMI" bit has to be set before next VM entry.
+ * There are errata that may cause this bit to not be set:
+ * AAK134, BY25.
+ */
+ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ enable_vnmi &&
+ (exit_qualification & INTR_INFO_UNBLOCK_NMI))
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
+
+ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ trace_kvm_page_fault(gpa, exit_qualification);
+
+ /* Is it a read fault? */
+ error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
+ ? PFERR_USER_MASK : 0;
+ /* Is it a write fault? */
+ error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
+ ? PFERR_WRITE_MASK : 0;
+ /* Is it a fetch fault? */
+ error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
+ ? PFERR_FETCH_MASK : 0;
+ /* ept page table entry is present? */
+ error_code |= (exit_qualification &
+ (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
+ EPT_VIOLATION_EXECUTABLE))
+ ? PFERR_PRESENT_MASK : 0;
+
+ error_code |= (exit_qualification & 0x100) != 0 ?
+ PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
+
+ vcpu->arch.exit_qualification = exit_qualification;
+ return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
+}
+
+static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
+{
+ gpa_t gpa;
+
+ /*
+ * A nested guest cannot optimize MMIO vmexits, because we have an
+ * nGPA here instead of the required GPA.
+ */
+ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ if (!is_guest_mode(vcpu) &&
+ !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
+ trace_kvm_fast_mmio(gpa);
+ /*
+ * Doing kvm_skip_emulated_instruction() depends on undefined
+ * behavior: Intel's manual doesn't mandate
+ * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG
+ * occurs and while on real hardware it was observed to be set,
+ * other hypervisors (namely Hyper-V) don't set it, we end up
+ * advancing IP with some random value. Disable fast mmio when
+ * running nested and keep it for real hardware in hope that
+ * VM_EXIT_INSTRUCTION_LEN will always be set correctly.
+ */
+ if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
+ return kvm_skip_emulated_instruction(vcpu);
+ else
+ return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) ==
+ EMULATE_DONE;
+ }
+
+ return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
+}
+
+static int handle_nmi_window(struct kvm_vcpu *vcpu)
+{
+ WARN_ON_ONCE(!enable_vnmi);
+ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_VIRTUAL_NMI_PENDING);
+ ++vcpu->stat.nmi_window_exits;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ return 1;
+}
+
+static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ enum emulation_result err = EMULATE_DONE;
+ int ret = 1;
+ u32 cpu_exec_ctrl;
+ bool intr_window_requested;
+ unsigned count = 130;
+
+ /*
+ * We should never reach the point where we are emulating L2
+ * due to invalid guest state as that means we incorrectly
+ * allowed a nested VMEntry with an invalid vmcs12.
+ */
+ WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
+
+ cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
+
+ while (vmx->emulation_required && count-- != 0) {
+ if (intr_window_requested && vmx_interrupt_allowed(vcpu))
+ return handle_interrupt_window(&vmx->vcpu);
+
+ if (kvm_test_request(KVM_REQ_EVENT, vcpu))
+ return 1;
+
+ err = kvm_emulate_instruction(vcpu, 0);
+
+ if (err == EMULATE_USER_EXIT) {
+ ++vcpu->stat.mmio_exits;
+ ret = 0;
+ goto out;
+ }
+
+ if (err != EMULATE_DONE)
+ goto emulation_error;
+
+ if (vmx->emulation_required && !vmx->rmode.vm86_active &&
+ vcpu->arch.exception.pending)
+ goto emulation_error;
+
+ if (vcpu->arch.halt_request) {
+ vcpu->arch.halt_request = 0;
+ ret = kvm_vcpu_halt(vcpu);
+ goto out;
+ }
+
+ if (signal_pending(current))
+ goto out;
+ if (need_resched())
+ schedule();
+ }
+
+out:
+ return ret;
+
+emulation_error:
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return 0;
+}
+
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int old = vmx->ple_window;
+
+ vmx->ple_window = __grow_ple_window(old, ple_window,
+ ple_window_grow,
+ ple_window_max);
+
+ if (vmx->ple_window != old)
+ vmx->ple_window_dirty = true;
+
+ trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int old = vmx->ple_window;
+
+ vmx->ple_window = __shrink_ple_window(old, ple_window,
+ ple_window_shrink,
+ ple_window);
+
+ if (vmx->ple_window != old)
+ vmx->ple_window_dirty = true;
+
+ trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
+}
+
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+static void wakeup_handler(void)
+{
+ struct kvm_vcpu *vcpu;
+ int cpu = smp_processor_id();
+
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
+ blocked_vcpu_list) {
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (pi_test_on(pi_desc) == 1)
+ kvm_vcpu_kick(vcpu);
+ }
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+}
+
+static void vmx_enable_tdp(void)
+{
+ kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
+ enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
+ enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
+ 0ull, VMX_EPT_EXECUTABLE_MASK,
+ cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
+ VMX_EPT_RWX_MASK, 0ull);
+
+ ept_set_mmio_spte_mask();
+ kvm_enable_tdp();
+}
+
+static __init int hardware_setup(void)
+{
+ unsigned long host_bndcfgs;
+ int r = -ENOMEM, i;
+
+ rdmsrl_safe(MSR_EFER, &host_efer);
+
+ for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
+ kvm_define_shared_msr(i, vmx_msr_index[i]);
+
+ for (i = 0; i < VMX_BITMAP_NR; i++) {
+ vmx_bitmap[i] = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_bitmap[i])
+ goto out;
+ }
+
+ memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
+ memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
+
+ if (setup_vmcs_config(&vmcs_config) < 0) {
+ r = -EIO;
+ goto out;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_NX))
+ kvm_enable_efer_bits(EFER_NX);
+
+ if (boot_cpu_has(X86_FEATURE_MPX)) {
+ rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
+ WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
+ }
+
+ if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
+ !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
+ enable_vpid = 0;
+
+ if (!cpu_has_vmx_ept() ||
+ !cpu_has_vmx_ept_4levels() ||
+ !cpu_has_vmx_ept_mt_wb() ||
+ !cpu_has_vmx_invept_global())
+ enable_ept = 0;
+
+ if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
+ enable_ept_ad_bits = 0;
+
+ if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
+ enable_unrestricted_guest = 0;
+
+ if (!cpu_has_vmx_flexpriority())
+ flexpriority_enabled = 0;
+
+ if (!cpu_has_virtual_nmis())
+ enable_vnmi = 0;
+
+ /*
+ * set_apic_access_page_addr() is used to reload apic access
+ * page upon invalidation. No need to do anything if not
+ * using the APIC_ACCESS_ADDR VMCS field.
+ */
+ if (!flexpriority_enabled)
+ kvm_x86_ops->set_apic_access_page_addr = NULL;
+
+ if (!cpu_has_vmx_tpr_shadow())
+ kvm_x86_ops->update_cr8_intercept = NULL;
+
+ if (enable_ept && !cpu_has_vmx_ept_2m_page())
+ kvm_disable_largepages();
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
+ && enable_ept)
+ kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb;
+#endif
+
+ if (!cpu_has_vmx_ple()) {
+ ple_gap = 0;
+ ple_window = 0;
+ ple_window_grow = 0;
+ ple_window_max = 0;
+ ple_window_shrink = 0;
+ }
+
+ if (!cpu_has_vmx_apicv()) {
+ enable_apicv = 0;
+ kvm_x86_ops->sync_pir_to_irr = NULL;
+ }
+
+ if (cpu_has_vmx_tsc_scaling()) {
+ kvm_has_tsc_control = true;
+ kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
+ kvm_tsc_scaling_ratio_frac_bits = 48;
+ }
+
+ set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
+
+ if (enable_ept)
+ vmx_enable_tdp();
+ else
+ kvm_disable_tdp();
+
+ if (!nested) {
+ kvm_x86_ops->get_nested_state = NULL;
+ kvm_x86_ops->set_nested_state = NULL;
+ }
+
+ /*
+ * Only enable PML when hardware supports PML feature, and both EPT
+ * and EPT A/D bit features are enabled -- PML depends on them to work.
+ */
+ if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
+ enable_pml = 0;
+
+ if (!enable_pml) {
+ kvm_x86_ops->slot_enable_log_dirty = NULL;
+ kvm_x86_ops->slot_disable_log_dirty = NULL;
+ kvm_x86_ops->flush_log_dirty = NULL;
+ kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
+ }
+
+ if (!cpu_has_vmx_preemption_timer())
+ kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
+
+ if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
+ u64 vmx_msr;
+
+ rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+ cpu_preemption_timer_multi =
+ vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+ } else {
+ kvm_x86_ops->set_hv_timer = NULL;
+ kvm_x86_ops->cancel_hv_timer = NULL;
+ }
+
+ if (!cpu_has_vmx_shadow_vmcs())
+ enable_shadow_vmcs = 0;
+ if (enable_shadow_vmcs)
+ init_vmcs_shadow_fields();
+
+ kvm_set_posted_intr_wakeup_handler(wakeup_handler);
+ nested_vmx_setup_ctls_msrs(&vmcs_config.nested, enable_apicv);
+
+ kvm_mce_cap_supported |= MCG_LMCE_P;
+
+ r = alloc_kvm_area();
+ if (r)
+ goto out;
+ return 0;
+
+out:
+ for (i = 0; i < VMX_BITMAP_NR; i++)
+ free_page((unsigned long)vmx_bitmap[i]);
+
+ return r;
+}
+
+static __exit void hardware_unsetup(void)
+{
+ int i;
+
+ for (i = 0; i < VMX_BITMAP_NR; i++)
+ free_page((unsigned long)vmx_bitmap[i]);
+
+ free_kvm_area();
+}
+
+/*
+ * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
+ * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
+ */
+static int handle_pause(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_pause_in_guest(vcpu->kvm))
+ grow_ple_window(vcpu);
+
+ /*
+ * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting"
+ * VM-execution control is ignored if CPL > 0. OTOH, KVM
+ * never set PAUSE_EXITING and just set PLE if supported,
+ * so the vcpu must be CPL=0 if it gets a PAUSE exit.
+ */
+ kvm_vcpu_on_spin(vcpu, true);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int handle_nop(struct kvm_vcpu *vcpu)
+{
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int handle_mwait(struct kvm_vcpu *vcpu)
+{
+ printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
+ return handle_nop(vcpu);
+}
+
+static int handle_invalid_op(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
+static int handle_monitor_trap(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
+static int handle_monitor(struct kvm_vcpu *vcpu)
+{
+ printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
+ return handle_nop(vcpu);
+}
+
+/*
+ * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
+ * set the success or error code of an emulated VMX instruction, as specified
+ * by Vol 2B, VMX Instruction Reference, "Conventions".
+ */
+static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
+{
+ vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
+ & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
+}
+
+static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
+{
+ vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+ & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+ X86_EFLAGS_SF | X86_EFLAGS_OF))
+ | X86_EFLAGS_CF);
+}
+
+static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
+ u32 vm_instruction_error)
+{
+ if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
+ /*
+ * failValid writes the error number to the current VMCS, which
+ * can't be done there isn't a current VMCS.
+ */
+ nested_vmx_failInvalid(vcpu);
+ return;
+ }
+ vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+ & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+ X86_EFLAGS_SF | X86_EFLAGS_OF))
+ | X86_EFLAGS_ZF);
+ get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
+ /*
+ * We don't need to force a shadow sync because
+ * VM_INSTRUCTION_ERROR is not shadowed
+ */
+}
+
+static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
+{
+ /* TODO: not to reset guest simply here. */
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
+}
+
+static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
+{
+ struct vcpu_vmx *vmx =
+ container_of(timer, struct vcpu_vmx, nested.preemption_timer);
+
+ vmx->nested.preemption_timer_expired = true;
+ kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
+ kvm_vcpu_kick(&vmx->vcpu);
+
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * Decode the memory-address operand of a vmx instruction, as recorded on an
+ * exit caused by such an instruction (run by a guest hypervisor).
+ * On success, returns 0. When the operand is invalid, returns 1 and throws
+ * #UD or #GP.
+ */
+static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
+ unsigned long exit_qualification,
+ u32 vmx_instruction_info, bool wr, gva_t *ret)
+{
+ gva_t off;
+ bool exn;
+ struct kvm_segment s;
+
+ /*
+ * According to Vol. 3B, "Information for VM Exits Due to Instruction
+ * Execution", on an exit, vmx_instruction_info holds most of the
+ * addressing components of the operand. Only the displacement part
+ * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
+ * For how an actual address is calculated from all these components,
+ * refer to Vol. 1, "Operand Addressing".
+ */
+ int scaling = vmx_instruction_info & 3;
+ int addr_size = (vmx_instruction_info >> 7) & 7;
+ bool is_reg = vmx_instruction_info & (1u << 10);
+ int seg_reg = (vmx_instruction_info >> 15) & 7;
+ int index_reg = (vmx_instruction_info >> 18) & 0xf;
+ bool index_is_valid = !(vmx_instruction_info & (1u << 22));
+ int base_reg = (vmx_instruction_info >> 23) & 0xf;
+ bool base_is_valid = !(vmx_instruction_info & (1u << 27));
+
+ if (is_reg) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ /* Addr = segment_base + offset */
+ /* offset = base + [index * scale] + displacement */
+ off = exit_qualification; /* holds the displacement */
+ if (addr_size == 1)
+ off = (gva_t)sign_extend64(off, 31);
+ else if (addr_size == 0)
+ off = (gva_t)sign_extend64(off, 15);
+ if (base_is_valid)
+ off += kvm_register_read(vcpu, base_reg);
+ if (index_is_valid)
+ off += kvm_register_read(vcpu, index_reg)<<scaling;
+ vmx_get_segment(vcpu, &s, seg_reg);
+
+ /*
+ * The effective address, i.e. @off, of a memory operand is truncated
+ * based on the address size of the instruction. Note that this is
+ * the *effective address*, i.e. the address prior to accounting for
+ * the segment's base.
+ */
+ if (addr_size == 1) /* 32 bit */
+ off &= 0xffffffff;
+ else if (addr_size == 0) /* 16 bit */
+ off &= 0xffff;
+
+ /* Checks for #GP/#SS exceptions. */
+ exn = false;
+ if (is_long_mode(vcpu)) {
+ /*
+ * The virtual/linear address is never truncated in 64-bit
+ * mode, e.g. a 32-bit address size can yield a 64-bit virtual
+ * address when using FS/GS with a non-zero base.
+ */
+ *ret = s.base + off;
+
+ /* Long mode: #GP(0)/#SS(0) if the memory address is in a
+ * non-canonical form. This is the only check on the memory
+ * destination for long mode!
+ */
+ exn = is_noncanonical_address(*ret, vcpu);
+ } else if (is_protmode(vcpu)) {
+ /*
+ * When not in long mode, the virtual/linear address is
+ * unconditionally truncated to 32 bits regardless of the
+ * address size.
+ */
+ *ret = (s.base + off) & 0xffffffff;
+
+ /* Protected mode: apply checks for segment validity in the
+ * following order:
+ * - segment type check (#GP(0) may be thrown)
+ * - usability check (#GP(0)/#SS(0))
+ * - limit check (#GP(0)/#SS(0))
+ */
+ if (wr)
+ /* #GP(0) if the destination operand is located in a
+ * read-only data segment or any code segment.
+ */
+ exn = ((s.type & 0xa) == 0 || (s.type & 8));
+ else
+ /* #GP(0) if the source operand is located in an
+ * execute-only code segment
+ */
+ exn = ((s.type & 0xa) == 8);
+ if (exn) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
+ }
+ /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
+ */
+ exn = (s.unusable != 0);
+
+ /*
+ * Protected mode: #GP(0)/#SS(0) if the memory operand is
+ * outside the segment limit. All CPUs that support VMX ignore
+ * limit checks for flat segments, i.e. segments with base==0,
+ * limit==0xffffffff and of type expand-up data or code.
+ */
+ if (!(s.base == 0 && s.limit == 0xffffffff &&
+ ((s.type & 8) || !(s.type & 4))))
+ exn = exn || (off + sizeof(u64) > s.limit);
+ }
+ if (exn) {
+ kvm_queue_exception_e(vcpu,
+ seg_reg == VCPU_SREG_SS ?
+ SS_VECTOR : GP_VECTOR,
+ 0);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
+{
+ gva_t gva;
+ struct x86_exception e;
+
+ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
+ return 1;
+
+ if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Allocate a shadow VMCS and associate it with the currently loaded
+ * VMCS, unless such a shadow VMCS already exists. The newly allocated
+ * VMCS is also VMCLEARed, so that it is ready for use.
+ */
+static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
+
+ /*
+ * We should allocate a shadow vmcs for vmcs01 only when L1
+ * executes VMXON and free it when L1 executes VMXOFF.
+ * As it is invalid to execute VMXON twice, we shouldn't reach
+ * here when vmcs01 already have an allocated shadow vmcs.
+ */
+ WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
+
+ if (!loaded_vmcs->shadow_vmcs) {
+ loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
+ if (loaded_vmcs->shadow_vmcs)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
+ }
+ return loaded_vmcs->shadow_vmcs;
+}
+
+static int enter_vmx_operation(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int r;
+
+ r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
+ if (r < 0)
+ goto out_vmcs02;
+
+ vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL);
+ if (!vmx->nested.cached_vmcs12)
+ goto out_cached_vmcs12;
+
+ vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL);
+ if (!vmx->nested.cached_shadow_vmcs12)
+ goto out_cached_shadow_vmcs12;
+
+ if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
+ goto out_shadow_vmcs;
+
+ hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_PINNED);
+ vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
+
+ vmx->nested.vpid02 = allocate_vpid();
+
+ vmx->nested.vmxon = true;
+ return 0;
+
+out_shadow_vmcs:
+ kfree(vmx->nested.cached_shadow_vmcs12);
+
+out_cached_shadow_vmcs12:
+ kfree(vmx->nested.cached_vmcs12);
+
+out_cached_vmcs12:
+ free_loaded_vmcs(&vmx->nested.vmcs02);
+
+out_vmcs02:
+ return -ENOMEM;
+}
+
+/*
+ * Emulate the VMXON instruction.
+ * Currently, we just remember that VMX is active, and do not save or even
+ * inspect the argument to VMXON (the so-called "VMXON pointer") because we
+ * do not currently need to store anything in that guest-allocated memory
+ * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their
+ * argument is different from the VMXON pointer (which the spec says they do).
+ */
+static int handle_vmon(struct kvm_vcpu *vcpu)
+{
+ int ret;
+ gpa_t vmptr;
+ struct page *page;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
+ | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+
+ /*
+ * The Intel VMX Instruction Reference lists a bunch of bits that are
+ * prerequisite to running VMXON, most notably cr4.VMXE must be set to
+ * 1 (see vmx_set_cr4() for when we allow the guest to set this).
+ * Otherwise, we should fail with #UD. But most faulting conditions
+ * have already been checked by hardware, prior to the VM-exit for
+ * VMXON. We do test guest cr4.VMXE because processor CR4 always has
+ * that bit set to 1 in non-root mode.
+ */
+ if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ /* CPL=0 must be checked manually. */
+ if (vmx_get_cpl(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if (vmx->nested.vmxon) {
+ nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
+ != VMXON_NEEDED_FEATURES) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if (nested_vmx_get_vmptr(vcpu, &vmptr))
+ return 1;
+
+ /*
+ * SDM 3: 24.11.5
+ * The first 4 bytes of VMXON region contain the supported
+ * VMCS revision identifier
+ *
+ * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
+ * which replaces physical address width with 32
+ */
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
+ if (is_error_page(page)) {
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ if (*(u32 *)kmap(page) != VMCS12_REVISION) {
+ kunmap(page);
+ kvm_release_page_clean(page);
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ kunmap(page);
+ kvm_release_page_clean(page);
+
+ vmx->nested.vmxon_ptr = vmptr;
+ ret = enter_vmx_operation(vcpu);
+ if (ret)
+ return ret;
+
+ nested_vmx_succeed(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+/*
+ * Intel's VMX Instruction Reference specifies a common set of prerequisites
+ * for running VMX instructions (except VMXON, whose prerequisites are
+ * slightly different). It also specifies what exception to inject otherwise.
+ * Note that many of these exceptions have priority over VM exits, so they
+ * don't have to be checked again here.
+ */
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
+{
+ if (!to_vmx(vcpu)->nested.vmxon) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 0;
+ }
+
+ if (vmx_get_cpl(vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 0;
+ }
+
+ return 1;
+}
+
+static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
+{
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
+ vmcs_write64(VMCS_LINK_POINTER, -1ull);
+ vmx->nested.sync_shadow_vmcs = false;
+}
+
+static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
+{
+ if (vmx->nested.current_vmptr == -1ull)
+ return;
+
+ if (enable_shadow_vmcs) {
+ /* copy to memory all shadowed fields in case
+ they were modified */
+ copy_shadow_to_vmcs12(vmx);
+ vmx_disable_shadow_vmcs(vmx);
+ }
+ vmx->nested.posted_intr_nv = -1;
+
+ /* Flush VMCS12 to guest memory */
+ kvm_vcpu_write_guest_page(&vmx->vcpu,
+ vmx->nested.current_vmptr >> PAGE_SHIFT,
+ vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
+
+ vmx->nested.current_vmptr = -1ull;
+}
+
+/*
+ * Free whatever needs to be freed from vmx->nested when L1 goes down, or
+ * just stops using VMX.
+ */
+static void free_nested(struct vcpu_vmx *vmx)
+{
+ if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
+ return;
+
+ kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, &vmx->vcpu);
+
+ hrtimer_cancel(&vmx->nested.preemption_timer);
+ vmx->nested.vmxon = false;
+ vmx->nested.smm.vmxon = false;
+ free_vpid(vmx->nested.vpid02);
+ vmx->nested.posted_intr_nv = -1;
+ vmx->nested.current_vmptr = -1ull;
+ if (enable_shadow_vmcs) {
+ vmx_disable_shadow_vmcs(vmx);
+ vmcs_clear(vmx->vmcs01.shadow_vmcs);
+ free_vmcs(vmx->vmcs01.shadow_vmcs);
+ vmx->vmcs01.shadow_vmcs = NULL;
+ }
+ kfree(vmx->nested.cached_vmcs12);
+ kfree(vmx->nested.cached_shadow_vmcs12);
+ /* Unpin physical memory we referred to in the vmcs02 */
+ if (vmx->nested.apic_access_page) {
+ kvm_release_page_dirty(vmx->nested.apic_access_page);
+ vmx->nested.apic_access_page = NULL;
+ }
+ if (vmx->nested.virtual_apic_page) {
+ kvm_release_page_dirty(vmx->nested.virtual_apic_page);
+ vmx->nested.virtual_apic_page = NULL;
+ }
+ if (vmx->nested.pi_desc_page) {
+ kunmap(vmx->nested.pi_desc_page);
+ kvm_release_page_dirty(vmx->nested.pi_desc_page);
+ vmx->nested.pi_desc_page = NULL;
+ vmx->nested.pi_desc = NULL;
+ }
+
+ free_loaded_vmcs(&vmx->nested.vmcs02);
+}
+
+/* Emulate the VMXOFF instruction */
+static int handle_vmoff(struct kvm_vcpu *vcpu)
+{
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+ free_nested(to_vmx(vcpu));
+ nested_vmx_succeed(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+/* Emulate the VMCLEAR instruction */
+static int handle_vmclear(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 zero = 0;
+ gpa_t vmptr;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (nested_vmx_get_vmptr(vcpu, &vmptr))
+ return 1;
+
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+ nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ if (vmptr == vmx->nested.vmxon_ptr) {
+ nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ if (vmptr == vmx->nested.current_vmptr)
+ nested_release_vmcs12(vmx);
+
+ kvm_vcpu_write_guest(vcpu,
+ vmptr + offsetof(struct vmcs12, launch_state),
+ &zero, sizeof(zero));
+
+ nested_vmx_succeed(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
+
+/* Emulate the VMLAUNCH instruction */
+static int handle_vmlaunch(struct kvm_vcpu *vcpu)
+{
+ return nested_vmx_run(vcpu, true);
+}
+
+/* Emulate the VMRESUME instruction */
+static int handle_vmresume(struct kvm_vcpu *vcpu)
+{
+
+ return nested_vmx_run(vcpu, false);
+}
+
+/*
+ * Read a vmcs12 field. Since these can have varying lengths and we return
+ * one type, we chose the biggest type (u64) and zero-extend the return value
+ * to that size. Note that the caller, handle_vmread, might need to use only
+ * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
+ * 64-bit fields are to be returned).
+ */
+static inline int vmcs12_read_any(struct vmcs12 *vmcs12,
+ unsigned long field, u64 *ret)
+{
+ short offset = vmcs_field_to_offset(field);
+ char *p;
+
+ if (offset < 0)
+ return offset;
+
+ p = (char *)vmcs12 + offset;
+
+ switch (vmcs_field_width(field)) {
+ case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
+ *ret = *((natural_width *)p);
+ return 0;
+ case VMCS_FIELD_WIDTH_U16:
+ *ret = *((u16 *)p);
+ return 0;
+ case VMCS_FIELD_WIDTH_U32:
+ *ret = *((u32 *)p);
+ return 0;
+ case VMCS_FIELD_WIDTH_U64:
+ *ret = *((u64 *)p);
+ return 0;
+ default:
+ WARN_ON(1);
+ return -ENOENT;
+ }
+}
+
+
+static inline int vmcs12_write_any(struct vmcs12 *vmcs12,
+ unsigned long field, u64 field_value){
+ short offset = vmcs_field_to_offset(field);
+ char *p = (char *)vmcs12 + offset;
+ if (offset < 0)
+ return offset;
+
+ switch (vmcs_field_width(field)) {
+ case VMCS_FIELD_WIDTH_U16:
+ *(u16 *)p = field_value;
+ return 0;
+ case VMCS_FIELD_WIDTH_U32:
+ *(u32 *)p = field_value;
+ return 0;
+ case VMCS_FIELD_WIDTH_U64:
+ *(u64 *)p = field_value;
+ return 0;
+ case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
+ *(natural_width *)p = field_value;
+ return 0;
+ default:
+ WARN_ON(1);
+ return -ENOENT;
+ }
+
+}
+
+/*
+ * Copy the writable VMCS shadow fields back to the VMCS12, in case
+ * they have been modified by the L1 guest. Note that the "read-only"
+ * VM-exit information fields are actually writable if the vCPU is
+ * configured to support "VMWRITE to any supported field in the VMCS."
+ */
+static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
+{
+ const u16 *fields[] = {
+ shadow_read_write_fields,
+ shadow_read_only_fields
+ };
+ const int max_fields[] = {
+ max_shadow_read_write_fields,
+ max_shadow_read_only_fields
+ };
+ int i, q;
+ unsigned long field;
+ u64 field_value;
+ struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+
+ if (WARN_ON(!shadow_vmcs))
+ return;
+
+ preempt_disable();
+
+ vmcs_load(shadow_vmcs);
+
+ for (q = 0; q < ARRAY_SIZE(fields); q++) {
+ for (i = 0; i < max_fields[q]; i++) {
+ field = fields[q][i];
+ field_value = __vmcs_readl(field);
+ vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value);
+ }
+ /*
+ * Skip the VM-exit information fields if they are read-only.
+ */
+ if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
+ break;
+ }
+
+ vmcs_clear(shadow_vmcs);
+ vmcs_load(vmx->loaded_vmcs->vmcs);
+
+ preempt_enable();
+}
+
+static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
+{
+ const u16 *fields[] = {
+ shadow_read_write_fields,
+ shadow_read_only_fields
+ };
+ const int max_fields[] = {
+ max_shadow_read_write_fields,
+ max_shadow_read_only_fields
+ };
+ int i, q;
+ unsigned long field;
+ u64 field_value = 0;
+ struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
+
+ if (WARN_ON(!shadow_vmcs))
+ return;
+
+ vmcs_load(shadow_vmcs);
+
+ for (q = 0; q < ARRAY_SIZE(fields); q++) {
+ for (i = 0; i < max_fields[q]; i++) {
+ field = fields[q][i];
+ vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value);
+ __vmcs_writel(field, field_value);
+ }
+ }
+
+ vmcs_clear(shadow_vmcs);
+ vmcs_load(vmx->loaded_vmcs->vmcs);
+}
+
+/*
+ * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
+ * used before) all generate the same failure when it is missing.
+ */
+static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (vmx->nested.current_vmptr == -1ull) {
+ nested_vmx_failInvalid(vcpu);
+ return 0;
+ }
+ return 1;
+}
+
+static int handle_vmread(struct kvm_vcpu *vcpu)
+{
+ unsigned long field;
+ u64 field_value;
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ gva_t gva = 0;
+ struct vmcs12 *vmcs12;
+ struct x86_exception e;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (!nested_vmx_check_vmcs12(vcpu))
+ return kvm_skip_emulated_instruction(vcpu);
+
+ if (!is_guest_mode(vcpu))
+ vmcs12 = get_vmcs12(vcpu);
+ else {
+ /*
+ * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
+ * to shadowed-field sets the ALU flags for VMfailInvalid.
+ */
+ if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ vmcs12 = get_shadow_vmcs12(vcpu);
+ }
+
+ /* Decode instruction info and find the field to read */
+ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+ /* Read the field, zero-extended to a u64 field_value */
+ if (vmcs12_read_any(vmcs12, field, &field_value) < 0) {
+ nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ /*
+ * Now copy part of this value to register or memory, as requested.
+ * Note that the number of bits actually copied is 32 or 64 depending
+ * on the guest's mode (32 or 64 bit), not on the given field's length.
+ */
+ if (vmx_instruction_info & (1u << 10)) {
+ kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
+ field_value);
+ } else {
+ if (get_vmx_mem_address(vcpu, exit_qualification,
+ vmx_instruction_info, true, &gva))
+ return 1;
+ /* _system ok, nested_vmx_check_permission has verified cpl=0 */
+ if (kvm_write_guest_virt_system(vcpu, gva, &field_value,
+ (is_long_mode(vcpu) ? 8 : 4),
+ &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+ }
+
+ nested_vmx_succeed(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+
+static int handle_vmwrite(struct kvm_vcpu *vcpu)
+{
+ unsigned long field;
+ gva_t gva;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+
+ /* The value to write might be 32 or 64 bits, depending on L1's long
+ * mode, and eventually we need to write that into a field of several
+ * possible lengths. The code below first zero-extends the value to 64
+ * bit (field_value), and then copies only the appropriate number of
+ * bits into the vmcs12 field.
+ */
+ u64 field_value = 0;
+ struct x86_exception e;
+ struct vmcs12 *vmcs12;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (!nested_vmx_check_vmcs12(vcpu))
+ return kvm_skip_emulated_instruction(vcpu);
+
+ if (vmx_instruction_info & (1u << 10))
+ field_value = kvm_register_readl(vcpu,
+ (((vmx_instruction_info) >> 3) & 0xf));
+ else {
+ if (get_vmx_mem_address(vcpu, exit_qualification,
+ vmx_instruction_info, false, &gva))
+ return 1;
+ if (kvm_read_guest_virt(vcpu, gva, &field_value,
+ (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+ }
+
+
+ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+ /*
+ * If the vCPU supports "VMWRITE to any supported field in the
+ * VMCS," then the "read-only" fields are actually read/write.
+ */
+ if (vmcs_field_readonly(field) &&
+ !nested_cpu_has_vmwrite_any_field(vcpu)) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ if (!is_guest_mode(vcpu))
+ vmcs12 = get_vmcs12(vcpu);
+ else {
+ /*
+ * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
+ * to shadowed-field sets the ALU flags for VMfailInvalid.
+ */
+ if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ vmcs12 = get_shadow_vmcs12(vcpu);
+
+ }
+
+ if (vmcs12_write_any(vmcs12, field, field_value) < 0) {
+ nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ /*
+ * Do not track vmcs12 dirty-state if in guest-mode
+ * as we actually dirty shadow vmcs12 instead of vmcs12.
+ */
+ if (!is_guest_mode(vcpu)) {
+ switch (field) {
+#define SHADOW_FIELD_RW(x) case x:
+#include "vmx_shadow_fields.h"
+ /*
+ * The fields that can be updated by L1 without a vmexit are
+ * always updated in the vmcs02, the others go down the slow
+ * path of prepare_vmcs02.
+ */
+ break;
+ default:
+ vmx->nested.dirty_vmcs12 = true;
+ break;
+ }
+ }
+
+ nested_vmx_succeed(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
+{
+ vmx->nested.current_vmptr = vmptr;
+ if (enable_shadow_vmcs) {
+ vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_SHADOW_VMCS);
+ vmcs_write64(VMCS_LINK_POINTER,
+ __pa(vmx->vmcs01.shadow_vmcs));
+ vmx->nested.sync_shadow_vmcs = true;
+ }
+ vmx->nested.dirty_vmcs12 = true;
+}
+
+/* Emulate the VMPTRLD instruction */
+static int handle_vmptrld(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ gpa_t vmptr;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (nested_vmx_get_vmptr(vcpu, &vmptr))
+ return 1;
+
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
+ nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ if (vmptr == vmx->nested.vmxon_ptr) {
+ nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ if (vmx->nested.current_vmptr != vmptr) {
+ struct vmcs12 *new_vmcs12;
+ struct page *page;
+ page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
+ if (is_error_page(page)) {
+ nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ new_vmcs12 = kmap(page);
+ if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
+ (new_vmcs12->hdr.shadow_vmcs &&
+ !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
+ kunmap(page);
+ kvm_release_page_clean(page);
+ nested_vmx_failValid(vcpu,
+ VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ nested_release_vmcs12(vmx);
+ /*
+ * Load VMCS12 from guest memory since it is not already
+ * cached.
+ */
+ memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
+ kunmap(page);
+ kvm_release_page_clean(page);
+
+ set_current_vmptr(vmx, vmptr);
+ }
+
+ nested_vmx_succeed(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+/* Emulate the VMPTRST instruction */
+static int handle_vmptrst(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
+ u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
+ struct x86_exception e;
+ gva_t gva;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva))
+ return 1;
+ /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
+ if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
+ sizeof(gpa_t), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+ nested_vmx_succeed(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 vmx_instruction_info, types;
+ unsigned long type;
+ gva_t gva;
+ struct x86_exception e;
+ struct {
+ u64 eptp, gpa;
+ } operand;
+
+ if (!(vmx->nested.msrs.secondary_ctls_high &
+ SECONDARY_EXEC_ENABLE_EPT) ||
+ !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+ types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
+
+ if (type >= 32 || !(types & (1 << type))) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ /* According to the Intel VMX instruction reference, the memory
+ * operand is read even if it isn't needed (e.g., for type==global)
+ */
+ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ vmx_instruction_info, false, &gva))
+ return 1;
+ if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+
+ switch (type) {
+ case VMX_EPT_EXTENT_GLOBAL:
+ /*
+ * TODO: track mappings and invalidate
+ * single context requests appropriately
+ */
+ case VMX_EPT_EXTENT_CONTEXT:
+ kvm_mmu_sync_roots(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ nested_vmx_succeed(vcpu);
+ break;
+ default:
+ BUG_ON(1);
+ break;
+ }
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int handle_invvpid(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 vmx_instruction_info;
+ unsigned long type, types;
+ gva_t gva;
+ struct x86_exception e;
+ struct {
+ u64 vpid;
+ u64 gla;
+ } operand;
+
+ if (!(vmx->nested.msrs.secondary_ctls_high &
+ SECONDARY_EXEC_ENABLE_VPID) ||
+ !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+ types = (vmx->nested.msrs.vpid_caps &
+ VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
+
+ if (type >= 32 || !(types & (1 << type))) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ /* according to the intel vmx instruction reference, the memory
+ * operand is read even if it isn't needed (e.g., for type==global)
+ */
+ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ vmx_instruction_info, false, &gva))
+ return 1;
+ if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+ if (operand.vpid >> 16) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ switch (type) {
+ case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
+ if (!operand.vpid ||
+ is_noncanonical_address(operand.gla, vcpu)) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ if (cpu_has_vmx_invvpid_individual_addr() &&
+ vmx->nested.vpid02) {
+ __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
+ vmx->nested.vpid02, operand.gla);
+ } else
+ __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+ break;
+ case VMX_VPID_EXTENT_SINGLE_CONTEXT:
+ case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
+ if (!operand.vpid) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+ break;
+ case VMX_VPID_EXTENT_ALL_CONTEXT:
+ __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+
+ nested_vmx_succeed(vcpu);
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int handle_invpcid(struct kvm_vcpu *vcpu)
+{
+ u32 vmx_instruction_info;
+ unsigned long type;
+ bool pcid_enabled;
+ gva_t gva;
+ struct x86_exception e;
+ unsigned i;
+ unsigned long roots_to_free = 0;
+ struct {
+ u64 pcid;
+ u64 gla;
+ } operand;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+ if (type > 3) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ /* According to the Intel instruction reference, the memory operand
+ * is read even if it isn't needed (e.g., for type==all)
+ */
+ if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ vmx_instruction_info, false, &gva))
+ return 1;
+
+ if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
+ kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
+
+ if (operand.pcid >> 12 != 0) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
+
+ switch (type) {
+ case INVPCID_TYPE_INDIV_ADDR:
+ if ((!pcid_enabled && (operand.pcid != 0)) ||
+ is_noncanonical_address(operand.gla, vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ case INVPCID_TYPE_SINGLE_CTXT:
+ if (!pcid_enabled && (operand.pcid != 0)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if (kvm_get_active_pcid(vcpu) == operand.pcid) {
+ kvm_mmu_sync_roots(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ }
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3)
+ == operand.pcid)
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+ kvm_mmu_free_roots(vcpu, roots_to_free);
+ /*
+ * If neither the current cr3 nor any of the prev_roots use the
+ * given PCID, then nothing needs to be done here because a
+ * resync will happen anyway before switching to any other CR3.
+ */
+
+ return kvm_skip_emulated_instruction(vcpu);
+
+ case INVPCID_TYPE_ALL_NON_GLOBAL:
+ /*
+ * Currently, KVM doesn't mark global entries in the shadow
+ * page tables, so a non-global flush just degenerates to a
+ * global flush. If needed, we could optimize this later by
+ * keeping track of global entries in shadow page tables.
+ */
+
+ /* fall-through */
+ case INVPCID_TYPE_ALL_INCL_GLOBAL:
+ kvm_mmu_unload(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ default:
+ BUG(); /* We have already checked above that type <= 3 */
+ }
+}
+
+static int handle_pml_full(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+
+ trace_kvm_pml_full(vcpu->vcpu_id);
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ /*
+ * PML buffer FULL happened while executing iret from NMI,
+ * "blocked by NMI" bit has to be set before next VM entry.
+ */
+ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ enable_vnmi &&
+ (exit_qualification & INTR_INFO_UNBLOCK_NMI))
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+
+ /*
+ * PML buffer already flushed at beginning of VMEXIT. Nothing to do
+ * here.., and there's no userspace involvement needed for PML.
+ */
+ return 1;
+}
+
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+ if (!to_vmx(vcpu)->req_immediate_exit)
+ kvm_lapic_expired_hv_timer(vcpu);
+ return 1;
+}
+
+static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+ /* Check for memory type validity */
+ switch (address & VMX_EPTP_MT_MASK) {
+ case VMX_EPTP_MT_UC:
+ if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))
+ return false;
+ break;
+ case VMX_EPTP_MT_WB:
+ if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ /* only 4 levels page-walk length are valid */
+ if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)
+ return false;
+
+ /* Reserved bits should not be set */
+ if (address >> maxphyaddr || ((address >> 7) & 0x1f))
+ return false;
+
+ /* AD, if set, should be supported */
+ if (address & VMX_EPTP_AD_ENABLE_BIT) {
+ if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))
+ return false;
+ }
+
+ return true;
+}
+
+static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
+ u64 address;
+ bool accessed_dirty;
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ if (!nested_cpu_has_eptp_switching(vmcs12) ||
+ !nested_cpu_has_ept(vmcs12))
+ return 1;
+
+ if (index >= VMFUNC_EPTP_ENTRIES)
+ return 1;
+
+
+ if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
+ &address, index * 8, 8))
+ return 1;
+
+ accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
+
+ /*
+ * If the (L2) guest does a vmfunc to the currently
+ * active ept pointer, we don't have to do anything else
+ */
+ if (vmcs12->ept_pointer != address) {
+ if (!valid_ept_address(vcpu, address))
+ return 1;
+
+ kvm_mmu_unload(vcpu);
+ mmu->ept_ad = accessed_dirty;
+ mmu->base_role.ad_disabled = !accessed_dirty;
+ vmcs12->ept_pointer = address;
+ /*
+ * TODO: Check what's the correct approach in case
+ * mmu reload fails. Currently, we just let the next
+ * reload potentially fail
+ */
+ kvm_mmu_reload(vcpu);
+ }
+
+ return 0;
+}
+
+static int handle_vmfunc(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12;
+ u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
+
+ /*
+ * VMFUNC is only supported for nested guests, but we always enable the
+ * secondary control for simplicity; for non-nested mode, fake that we
+ * didn't by injecting #UD.
+ */
+ if (!is_guest_mode(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ vmcs12 = get_vmcs12(vcpu);
+ if ((vmcs12->vm_function_control & (1 << function)) == 0)
+ goto fail;
+
+ switch (function) {
+ case 0:
+ if (nested_vmx_eptp_switching(vcpu, vmcs12))
+ goto fail;
+ break;
+ default:
+ goto fail;
+ }
+ return kvm_skip_emulated_instruction(vcpu);
+
+fail:
+ nested_vmx_vmexit(vcpu, vmx->exit_reason,
+ vmcs_read32(VM_EXIT_INTR_INFO),
+ vmcs_readl(EXIT_QUALIFICATION));
+ return 1;
+}
+
+static int handle_encls(struct kvm_vcpu *vcpu)
+{
+ /*
+ * SGX virtualization is not yet supported. There is no software
+ * enable bit for SGX, so we have to trap ENCLS and inject a #UD
+ * to prevent the guest from executing ENCLS.
+ */
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
+/*
+ * The exit handlers return 1 if the exit was handled fully and guest execution
+ * may resume. Otherwise they set the kvm_run parameter to indicate what needs
+ * to be done to userspace and return 0.
+ */
+static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
+ [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
+ [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
+ [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
+ [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
+ [EXIT_REASON_IO_INSTRUCTION] = handle_io,
+ [EXIT_REASON_CR_ACCESS] = handle_cr,
+ [EXIT_REASON_DR_ACCESS] = handle_dr,
+ [EXIT_REASON_CPUID] = handle_cpuid,
+ [EXIT_REASON_MSR_READ] = handle_rdmsr,
+ [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
+ [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
+ [EXIT_REASON_HLT] = handle_halt,
+ [EXIT_REASON_INVD] = handle_invd,
+ [EXIT_REASON_INVLPG] = handle_invlpg,
+ [EXIT_REASON_RDPMC] = handle_rdpmc,
+ [EXIT_REASON_VMCALL] = handle_vmcall,
+ [EXIT_REASON_VMCLEAR] = handle_vmclear,
+ [EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
+ [EXIT_REASON_VMPTRLD] = handle_vmptrld,
+ [EXIT_REASON_VMPTRST] = handle_vmptrst,
+ [EXIT_REASON_VMREAD] = handle_vmread,
+ [EXIT_REASON_VMRESUME] = handle_vmresume,
+ [EXIT_REASON_VMWRITE] = handle_vmwrite,
+ [EXIT_REASON_VMOFF] = handle_vmoff,
+ [EXIT_REASON_VMON] = handle_vmon,
+ [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
+ [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
+ [EXIT_REASON_APIC_WRITE] = handle_apic_write,
+ [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
+ [EXIT_REASON_WBINVD] = handle_wbinvd,
+ [EXIT_REASON_XSETBV] = handle_xsetbv,
+ [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
+ [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
+ [EXIT_REASON_GDTR_IDTR] = handle_desc,
+ [EXIT_REASON_LDTR_TR] = handle_desc,
+ [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
+ [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
+ [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
+ [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait,
+ [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
+ [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
+ [EXIT_REASON_INVEPT] = handle_invept,
+ [EXIT_REASON_INVVPID] = handle_invvpid,
+ [EXIT_REASON_RDRAND] = handle_invalid_op,
+ [EXIT_REASON_RDSEED] = handle_invalid_op,
+ [EXIT_REASON_XSAVES] = handle_xsaves,
+ [EXIT_REASON_XRSTORS] = handle_xrstors,
+ [EXIT_REASON_PML_FULL] = handle_pml_full,
+ [EXIT_REASON_INVPCID] = handle_invpcid,
+ [EXIT_REASON_VMFUNC] = handle_vmfunc,
+ [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
+ [EXIT_REASON_ENCLS] = handle_encls,
+};
+
+static const int kvm_vmx_max_exit_handlers =
+ ARRAY_SIZE(kvm_vmx_exit_handlers);
+
+/*
+ * Return true if an IO instruction with the specified port and size should cause
+ * a VM-exit into L1.
+ */
+bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
+ int size)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ gpa_t bitmap, last_bitmap;
+ u8 b;
+
+ last_bitmap = (gpa_t)-1;
+ b = -1;
+
+ while (size > 0) {
+ if (port < 0x8000)
+ bitmap = vmcs12->io_bitmap_a;
+ else if (port < 0x10000)
+ bitmap = vmcs12->io_bitmap_b;
+ else
+ return true;
+ bitmap += (port & 0x7fff) / 8;
+
+ if (last_bitmap != bitmap)
+ if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
+ return true;
+ if (b & (1 << (port & 7)))
+ return true;
+
+ port++;
+ size--;
+ last_bitmap = bitmap;
+ }
+
+ return false;
+}
+
+/*
+ * Return 1 if we should exit from L2 to L1 to handle an MSR access access,
+ * rather than handle it ourselves in L0. I.e., check whether L1 expressed
+ * disinterest in the current event (read or write a specific MSR) by using an
+ * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
+ */
+static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12, u32 exit_reason)
+{
+ u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+ gpa_t bitmap;
+
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+ return true;
+
+ /*
+ * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
+ * for the four combinations of read/write and low/high MSR numbers.
+ * First we need to figure out which of the four to use:
+ */
+ bitmap = vmcs12->msr_bitmap;
+ if (exit_reason == EXIT_REASON_MSR_WRITE)
+ bitmap += 2048;
+ if (msr_index >= 0xc0000000) {
+ msr_index -= 0xc0000000;
+ bitmap += 1024;
+ }
+
+ /* Then read the msr_index'th bit from this bitmap: */
+ if (msr_index < 1024*8) {
+ unsigned char b;
+ if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
+ return true;
+ return 1 & (b >> (msr_index & 7));
+ } else
+ return true; /* let L1 handle the wrong parameter */
+}
+
+/*
+ * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
+ * rather than handle it ourselves in L0. I.e., check if L1 wanted to
+ * intercept (via guest_host_mask etc.) the current event.
+ */
+static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ int cr = exit_qualification & 15;
+ int reg;
+ unsigned long val;
+
+ switch ((exit_qualification >> 4) & 3) {
+ case 0: /* mov to cr */
+ reg = (exit_qualification >> 8) & 15;
+ val = kvm_register_readl(vcpu, reg);
+ switch (cr) {
+ case 0:
+ if (vmcs12->cr0_guest_host_mask &
+ (val ^ vmcs12->cr0_read_shadow))
+ return true;
+ break;
+ case 3:
+ if ((vmcs12->cr3_target_count >= 1 &&
+ vmcs12->cr3_target_value0 == val) ||
+ (vmcs12->cr3_target_count >= 2 &&
+ vmcs12->cr3_target_value1 == val) ||
+ (vmcs12->cr3_target_count >= 3 &&
+ vmcs12->cr3_target_value2 == val) ||
+ (vmcs12->cr3_target_count >= 4 &&
+ vmcs12->cr3_target_value3 == val))
+ return false;
+ if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
+ return true;
+ break;
+ case 4:
+ if (vmcs12->cr4_guest_host_mask &
+ (vmcs12->cr4_read_shadow ^ val))
+ return true;
+ break;
+ case 8:
+ if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
+ return true;
+ break;
+ }
+ break;
+ case 2: /* clts */
+ if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
+ (vmcs12->cr0_read_shadow & X86_CR0_TS))
+ return true;
+ break;
+ case 1: /* mov from cr */
+ switch (cr) {
+ case 3:
+ if (vmcs12->cpu_based_vm_exec_control &
+ CPU_BASED_CR3_STORE_EXITING)
+ return true;
+ break;
+ case 8:
+ if (vmcs12->cpu_based_vm_exec_control &
+ CPU_BASED_CR8_STORE_EXITING)
+ return true;
+ break;
+ }
+ break;
+ case 3: /* lmsw */
+ /*
+ * lmsw can change bits 1..3 of cr0, and only set bit 0 of
+ * cr0. Other attempted changes are ignored, with no exit.
+ */
+ val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
+ if (vmcs12->cr0_guest_host_mask & 0xe &
+ (val ^ vmcs12->cr0_read_shadow))
+ return true;
+ if ((vmcs12->cr0_guest_host_mask & 0x1) &&
+ !(vmcs12->cr0_read_shadow & 0x1) &&
+ (val & 0x1))
+ return true;
+ break;
+ }
+ return false;
+}
+
+static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12, gpa_t bitmap)
+{
+ u32 vmx_instruction_info;
+ unsigned long field;
+ u8 b;
+
+ if (!nested_cpu_has_shadow_vmcs(vmcs12))
+ return true;
+
+ /* Decode instruction info and find the field to access */
+ vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+
+ /* Out-of-range fields always cause a VM exit from L2 to L1 */
+ if (field >> 15)
+ return true;
+
+ if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
+ return true;
+
+ return 1 & (b >> (field & 7));
+}
+
+/*
+ * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
+ * should handle it ourselves in L0 (and then continue L2). Only call this
+ * when in is_guest_mode (L2).
+ */
+static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
+{
+ u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ if (vmx->nested.nested_run_pending)
+ return false;
+
+ if (unlikely(vmx->fail)) {
+ pr_info_ratelimited("%s failed vm entry %x\n", __func__,
+ vmcs_read32(VM_INSTRUCTION_ERROR));
+ return true;
+ }
+
+ /*
+ * The host physical addresses of some pages of guest memory
+ * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
+ * Page). The CPU may write to these pages via their host
+ * physical address while L2 is running, bypassing any
+ * address-translation-based dirty tracking (e.g. EPT write
+ * protection).
+ *
+ * Mark them dirty on every exit from L2 to prevent them from
+ * getting out of sync with dirty tracking.
+ */
+ nested_mark_vmcs12_pages_dirty(vcpu);
+
+ trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
+ vmcs_readl(EXIT_QUALIFICATION),
+ vmx->idt_vectoring_info,
+ intr_info,
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+ KVM_ISA_VMX);
+
+ switch ((u16)exit_reason) {
+ case EXIT_REASON_EXCEPTION_NMI:
+ if (is_nmi(intr_info))
+ return false;
+ else if (is_page_fault(intr_info))
+ return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
+ else if (is_no_device(intr_info) &&
+ !(vmcs12->guest_cr0 & X86_CR0_TS))
+ return false;
+ else if (is_debug(intr_info) &&
+ vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+ return false;
+ else if (is_breakpoint(intr_info) &&
+ vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ return false;
+ return vmcs12->exception_bitmap &
+ (1u << (intr_info & INTR_INFO_VECTOR_MASK));
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ return false;
+ case EXIT_REASON_TRIPLE_FAULT:
+ return true;
+ case EXIT_REASON_PENDING_INTERRUPT:
+ return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
+ case EXIT_REASON_NMI_WINDOW:
+ return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
+ case EXIT_REASON_TASK_SWITCH:
+ return true;
+ case EXIT_REASON_CPUID:
+ return true;
+ case EXIT_REASON_HLT:
+ return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
+ case EXIT_REASON_INVD:
+ return true;
+ case EXIT_REASON_INVLPG:
+ return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
+ case EXIT_REASON_RDPMC:
+ return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
+ case EXIT_REASON_RDRAND:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
+ case EXIT_REASON_RDSEED:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
+ case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
+ return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
+ case EXIT_REASON_VMREAD:
+ return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
+ vmcs12->vmread_bitmap);
+ case EXIT_REASON_VMWRITE:
+ return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
+ vmcs12->vmwrite_bitmap);
+ case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
+ case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
+ case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
+ case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+ case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
+ /*
+ * VMX instructions trap unconditionally. This allows L1 to
+ * emulate them for its L2 guest, i.e., allows 3-level nesting!
+ */
+ return true;
+ case EXIT_REASON_CR_ACCESS:
+ return nested_vmx_exit_handled_cr(vcpu, vmcs12);
+ case EXIT_REASON_DR_ACCESS:
+ return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
+ case EXIT_REASON_IO_INSTRUCTION:
+ return nested_vmx_exit_handled_io(vcpu, vmcs12);
+ case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
+ case EXIT_REASON_MSR_READ:
+ case EXIT_REASON_MSR_WRITE:
+ return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
+ case EXIT_REASON_INVALID_STATE:
+ return true;
+ case EXIT_REASON_MWAIT_INSTRUCTION:
+ return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
+ case EXIT_REASON_MONITOR_TRAP_FLAG:
+ return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
+ case EXIT_REASON_MONITOR_INSTRUCTION:
+ return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
+ case EXIT_REASON_PAUSE_INSTRUCTION:
+ return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
+ nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING);
+ case EXIT_REASON_MCE_DURING_VMENTRY:
+ return false;
+ case EXIT_REASON_TPR_BELOW_THRESHOLD:
+ return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
+ case EXIT_REASON_APIC_ACCESS:
+ case EXIT_REASON_APIC_WRITE:
+ case EXIT_REASON_EOI_INDUCED:
+ /*
+ * The controls for "virtualize APIC accesses," "APIC-
+ * register virtualization," and "virtual-interrupt
+ * delivery" only come from vmcs12.
+ */
+ return true;
+ case EXIT_REASON_EPT_VIOLATION:
+ /*
+ * L0 always deals with the EPT violation. If nested EPT is
+ * used, and the nested mmu code discovers that the address is
+ * missing in the guest EPT table (EPT12), the EPT violation
+ * will be injected with nested_ept_inject_page_fault()
+ */
+ return false;
+ case EXIT_REASON_EPT_MISCONFIG:
+ /*
+ * L2 never uses directly L1's EPT, but rather L0's own EPT
+ * table (shadow on EPT) or a merged EPT table that L0 built
+ * (EPT on EPT). So any problems with the structure of the
+ * table is L0's fault.
+ */
+ return false;
+ case EXIT_REASON_INVPCID:
+ return
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
+ nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
+ case EXIT_REASON_WBINVD:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
+ case EXIT_REASON_XSETBV:
+ return true;
+ case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
+ /*
+ * This should never happen, since it is not possible to
+ * set XSS to a non-zero value---neither in L1 nor in L2.
+ * If if it were, XSS would have to be checked against
+ * the XSS exit bitmap in vmcs12.
+ */
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
+ case EXIT_REASON_PREEMPTION_TIMER:
+ return false;
+ case EXIT_REASON_PML_FULL:
+ /* We emulate PML support to L1. */
+ return false;
+ case EXIT_REASON_VMFUNC:
+ /* VM functions are emulated through L2->L0 vmexits. */
+ return false;
+ case EXIT_REASON_ENCLS:
+ /* SGX is never exposed to L1 */
+ return false;
+ default:
+ return true;
+ }
+}
+
+static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
+{
+ u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ /*
+ * At this point, the exit interruption info in exit_intr_info
+ * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT
+ * we need to query the in-kernel LAPIC.
+ */
+ WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
+ if ((exit_intr_info &
+ (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
+ (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ vmcs12->vm_exit_intr_error_code =
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ }
+
+ nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
+ vmcs_readl(EXIT_QUALIFICATION));
+ return 1;
+}
+
+static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+{
+ *info1 = vmcs_readl(EXIT_QUALIFICATION);
+ *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
+}
+
+static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
+{
+ if (vmx->pml_pg) {
+ __free_page(vmx->pml_pg);
+ vmx->pml_pg = NULL;
+ }
+}
+
+static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 *pml_buf;
+ u16 pml_idx;
+
+ pml_idx = vmcs_read16(GUEST_PML_INDEX);
+
+ /* Do nothing if PML buffer is empty */
+ if (pml_idx == (PML_ENTITY_NUM - 1))
+ return;
+
+ /* PML index always points to next available PML buffer entity */
+ if (pml_idx >= PML_ENTITY_NUM)
+ pml_idx = 0;
+ else
+ pml_idx++;
+
+ pml_buf = page_address(vmx->pml_pg);
+ for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
+ u64 gpa;
+
+ gpa = pml_buf[pml_idx];
+ WARN_ON(gpa & (PAGE_SIZE - 1));
+ kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+ }
+
+ /* reset PML index */
+ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+}
+
+/*
+ * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
+ * Called before reporting dirty_bitmap to userspace.
+ */
+static void kvm_flush_pml_buffers(struct kvm *kvm)
+{
+ int i;
+ struct kvm_vcpu *vcpu;
+ /*
+ * We only need to kick vcpu out of guest mode here, as PML buffer
+ * is flushed at beginning of all VMEXITs, and it's obvious that only
+ * vcpus running in guest are possible to have unflushed GPAs in PML
+ * buffer.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_vcpu_kick(vcpu);
+}
+
+static void vmx_dump_sel(char *name, uint32_t sel)
+{
+ pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
+ name, vmcs_read16(sel),
+ vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
+ vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
+ vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
+}
+
+static void vmx_dump_dtsel(char *name, uint32_t limit)
+{
+ pr_err("%s limit=0x%08x, base=0x%016lx\n",
+ name, vmcs_read32(limit),
+ vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
+}
+
+static void dump_vmcs(void)
+{
+ u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
+ u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
+ u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
+ u32 secondary_exec_control = 0;
+ unsigned long cr4 = vmcs_readl(GUEST_CR4);
+ u64 efer = vmcs_read64(GUEST_IA32_EFER);
+ int i, n;
+
+ if (cpu_has_secondary_exec_ctrls())
+ secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+ pr_err("*** Guest State ***\n");
+ pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+ vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
+ vmcs_readl(CR0_GUEST_HOST_MASK));
+ pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+ cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
+ pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
+ if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
+ (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
+ {
+ pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
+ vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
+ pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
+ vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
+ }
+ pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
+ vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
+ pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
+ vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
+ pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+ vmcs_readl(GUEST_SYSENTER_ESP),
+ vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
+ vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
+ vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
+ vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
+ vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
+ vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
+ vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
+ vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
+ vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
+ vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
+ vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
+ if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
+ (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
+ pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
+ efer, vmcs_read64(GUEST_IA32_PAT));
+ pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
+ vmcs_read64(GUEST_IA32_DEBUGCTL),
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
+ if (cpu_has_load_perf_global_ctrl &&
+ vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+ pr_err("PerfGlobCtl = 0x%016llx\n",
+ vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
+ if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
+ pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
+ pr_err("Interruptibility = %08x ActivityState = %08x\n",
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
+ vmcs_read32(GUEST_ACTIVITY_STATE));
+ if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
+ pr_err("InterruptStatus = %04x\n",
+ vmcs_read16(GUEST_INTR_STATUS));
+
+ pr_err("*** Host State ***\n");
+ pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
+ vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
+ pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
+ vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
+ vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
+ vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
+ vmcs_read16(HOST_TR_SELECTOR));
+ pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
+ vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
+ vmcs_readl(HOST_TR_BASE));
+ pr_err("GDTBase=%016lx IDTBase=%016lx\n",
+ vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
+ pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
+ vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
+ vmcs_readl(HOST_CR4));
+ pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+ vmcs_readl(HOST_IA32_SYSENTER_ESP),
+ vmcs_read32(HOST_IA32_SYSENTER_CS),
+ vmcs_readl(HOST_IA32_SYSENTER_EIP));
+ if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
+ pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
+ vmcs_read64(HOST_IA32_EFER),
+ vmcs_read64(HOST_IA32_PAT));
+ if (cpu_has_load_perf_global_ctrl &&
+ vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+ pr_err("PerfGlobCtl = 0x%016llx\n",
+ vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
+
+ pr_err("*** Control State ***\n");
+ pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
+ pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
+ pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
+ pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
+ vmcs_read32(EXCEPTION_BITMAP),
+ vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
+ vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
+ pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
+ vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
+ vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
+ vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
+ pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
+ vmcs_read32(VM_EXIT_INTR_INFO),
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
+ pr_err(" reason=%08x qualification=%016lx\n",
+ vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
+ pr_err("IDTVectoring: info=%08x errcode=%08x\n",
+ vmcs_read32(IDT_VECTORING_INFO_FIELD),
+ vmcs_read32(IDT_VECTORING_ERROR_CODE));
+ pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
+ if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
+ pr_err("TSC Multiplier = 0x%016llx\n",
+ vmcs_read64(TSC_MULTIPLIER));
+ if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
+ pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
+ if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
+ pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
+ if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
+ pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
+ n = vmcs_read32(CR3_TARGET_COUNT);
+ for (i = 0; i + 1 < n; i += 4)
+ pr_err("CR3 target%u=%016lx target%u=%016lx\n",
+ i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
+ i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
+ if (i < n)
+ pr_err("CR3 target%u=%016lx\n",
+ i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
+ if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
+ pr_err("PLE Gap=%08x Window=%08x\n",
+ vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
+ if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
+ pr_err("Virtual processor ID = 0x%04x\n",
+ vmcs_read16(VIRTUAL_PROCESSOR_ID));
+}
+
+/*
+ * The guest has exited. See if we can fix it or if we need userspace
+ * assistance.
+ */
+static int vmx_handle_exit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 exit_reason = vmx->exit_reason;
+ u32 vectoring_info = vmx->idt_vectoring_info;
+
+ trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
+
+ /*
+ * Flush logged GPAs PML buffer, this will make dirty_bitmap more
+ * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
+ * querying dirty_bitmap, we only need to kick all vcpus out of guest
+ * mode as if vcpus is in root mode, the PML buffer must has been
+ * flushed already.
+ */
+ if (enable_pml)
+ vmx_flush_pml_buffer(vcpu);
+
+ /* If guest state is invalid, start emulating */
+ if (vmx->emulation_required)
+ return handle_invalid_guest_state(vcpu);
+
+ if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
+ return nested_vmx_reflect_vmexit(vcpu, exit_reason);
+
+ if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+ dump_vmcs();
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason
+ = exit_reason;
+ return 0;
+ }
+
+ if (unlikely(vmx->fail)) {
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason
+ = vmcs_read32(VM_INSTRUCTION_ERROR);
+ return 0;
+ }
+
+ /*
+ * Note:
+ * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by
+ * delivery event since it indicates guest is accessing MMIO.
+ * The vm-exit can be triggered again after return to guest that
+ * will cause infinite loop.
+ */
+ if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
+ exit_reason != EXIT_REASON_EPT_VIOLATION &&
+ exit_reason != EXIT_REASON_PML_FULL &&
+ exit_reason != EXIT_REASON_APIC_ACCESS &&
+ exit_reason != EXIT_REASON_TASK_SWITCH)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
+ vcpu->run->internal.ndata = 3;
+ vcpu->run->internal.data[0] = vectoring_info;
+ vcpu->run->internal.data[1] = exit_reason;
+ vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
+ if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
+ vcpu->run->internal.ndata++;
+ vcpu->run->internal.data[3] =
+ vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ }
+ return 0;
+ }
+
+ if (unlikely(!enable_vnmi &&
+ vmx->loaded_vmcs->soft_vnmi_blocked)) {
+ if (vmx_interrupt_allowed(vcpu)) {
+ vmx->loaded_vmcs->soft_vnmi_blocked = 0;
+ } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
+ vcpu->arch.nmi_pending) {
+ /*
+ * This CPU don't support us in finding the end of an
+ * NMI-blocked window if the guest runs with IRQs
+ * disabled. So we pull the trigger after 1 s of
+ * futile waiting, but inform the user about this.
+ */
+ printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
+ "state on VCPU %d after 1 s timeout\n",
+ __func__, vcpu->vcpu_id);
+ vmx->loaded_vmcs->soft_vnmi_blocked = 0;
+ }
+ }
+
+ if (exit_reason < kvm_vmx_max_exit_handlers
+ && kvm_vmx_exit_handlers[exit_reason])
+ return kvm_vmx_exit_handlers[exit_reason](vcpu);
+ else {
+ vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
+ exit_reason);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+}
+
+/*
+ * Software based L1D cache flush which is used when microcode providing
+ * the cache control MSR is not loaded.
+ *
+ * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
+ * flush it is required to read in 64 KiB because the replacement algorithm
+ * is not exactly LRU. This could be sized at runtime via topology
+ * information but as all relevant affected CPUs have 32KiB L1D cache size
+ * there is no point in doing so.
+ */
+static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+{
+ int size = PAGE_SIZE << L1D_CACHE_ORDER;
+
+ /*
+ * This code is only executed when the the flush mode is 'cond' or
+ * 'always'
+ */
+ if (static_branch_likely(&vmx_l1d_flush_cond)) {
+ bool flush_l1d;
+
+ /*
+ * Clear the per-vcpu flush bit, it gets set again
+ * either from vcpu_run() or from one of the unsafe
+ * VMEXIT handlers.
+ */
+ flush_l1d = vcpu->arch.l1tf_flush_l1d;
+ vcpu->arch.l1tf_flush_l1d = false;
+
+ /*
+ * Clear the per-cpu flush bit, it gets set again from
+ * the interrupt handlers.
+ */
+ flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
+ kvm_clear_cpu_l1tf_flush_l1d();
+
+ if (!flush_l1d)
+ return;
+ }
+
+ vcpu->stat.l1d_flush++;
+
+ if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ return;
+ }
+
+ asm volatile(
+ /* First ensure the pages are in the TLB */
+ "xorl %%eax, %%eax\n"
+ ".Lpopulate_tlb:\n\t"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $4096, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lpopulate_tlb\n\t"
+ "xorl %%eax, %%eax\n\t"
+ "cpuid\n\t"
+ /* Now fill the cache */
+ "xorl %%eax, %%eax\n"
+ ".Lfill_cache:\n"
+ "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
+ "addl $64, %%eax\n\t"
+ "cmpl %%eax, %[size]\n\t"
+ "jne .Lfill_cache\n\t"
+ "lfence\n"
+ :: [flush_pages] "r" (vmx_l1d_flush_pages),
+ [size] "r" (size)
+ : "eax", "ebx", "ecx", "edx");
+}
+
+static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ if (is_guest_mode(vcpu) &&
+ nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+ return;
+
+ if (irr == -1 || tpr < irr) {
+ vmcs_write32(TPR_THRESHOLD, 0);
+ return;
+ }
+
+ vmcs_write32(TPR_THRESHOLD, irr);
+}
+
+static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
+{
+ u32 sec_exec_control;
+
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ if (!flexpriority_enabled &&
+ !cpu_has_vmx_virtualize_x2apic_mode())
+ return;
+
+ /* Postpone execution until vmcs01 is the current VMCS. */
+ if (is_guest_mode(vcpu)) {
+ to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true;
+ return;
+ }
+
+ sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
+
+ switch (kvm_get_apic_mode(vcpu)) {
+ case LAPIC_MODE_INVALID:
+ WARN_ONCE(true, "Invalid local APIC state");
+ case LAPIC_MODE_DISABLED:
+ break;
+ case LAPIC_MODE_XAPIC:
+ if (flexpriority_enabled) {
+ sec_exec_control |=
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ vmx_flush_tlb(vcpu, true);
+ }
+ break;
+ case LAPIC_MODE_X2APIC:
+ if (cpu_has_vmx_virtualize_x2apic_mode())
+ sec_exec_control |=
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+ break;
+ }
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
+
+ vmx_update_msr_bitmap(vcpu);
+}
+
+static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
+{
+ if (!is_guest_mode(vcpu)) {
+ vmcs_write64(APIC_ACCESS_ADDR, hpa);
+ vmx_flush_tlb(vcpu, true);
+ }
+}
+
+static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+{
+ u16 status;
+ u8 old;
+
+ if (max_isr == -1)
+ max_isr = 0;
+
+ status = vmcs_read16(GUEST_INTR_STATUS);
+ old = status >> 8;
+ if (max_isr != old) {
+ status &= 0xff;
+ status |= max_isr << 8;
+ vmcs_write16(GUEST_INTR_STATUS, status);
+ }
+}
+
+static void vmx_set_rvi(int vector)
+{
+ u16 status;
+ u8 old;
+
+ if (vector == -1)
+ vector = 0;
+
+ status = vmcs_read16(GUEST_INTR_STATUS);
+ old = (u8)status & 0xff;
+ if ((u8)vector != old) {
+ status &= ~0xff;
+ status |= (u8)vector;
+ vmcs_write16(GUEST_INTR_STATUS, status);
+ }
+}
+
+static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+{
+ /*
+ * When running L2, updating RVI is only relevant when
+ * vmcs12 virtual-interrupt-delivery enabled.
+ * However, it can be enabled only when L1 also
+ * intercepts external-interrupts and in that case
+ * we should not update vmcs02 RVI but instead intercept
+ * interrupt. Therefore, do nothing when running L2.
+ */
+ if (!is_guest_mode(vcpu))
+ vmx_set_rvi(max_irr);
+}
+
+static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int max_irr;
+ bool max_irr_updated;
+
+ WARN_ON(!vcpu->arch.apicv_active);
+ if (pi_test_on(&vmx->pi_desc)) {
+ pi_clear_on(&vmx->pi_desc);
+ /*
+ * IOMMU can write to PIR.ON, so the barrier matters even on UP.
+ * But on x86 this is just a compiler barrier anyway.
+ */
+ smp_mb__after_atomic();
+ max_irr_updated =
+ kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
+
+ /*
+ * If we are running L2 and L1 has a new pending interrupt
+ * which can be injected, we should re-evaluate
+ * what should be done with this new L1 interrupt.
+ * If L1 intercepts external-interrupts, we should
+ * exit from L2 to L1. Otherwise, interrupt should be
+ * delivered directly to L2.
+ */
+ if (is_guest_mode(vcpu) && max_irr_updated) {
+ if (nested_exit_on_intr(vcpu))
+ kvm_vcpu_exiting_guest_mode(vcpu);
+ else
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+ } else {
+ max_irr = kvm_lapic_find_highest_irr(vcpu);
+ }
+ vmx_hwapic_irr_update(vcpu, max_irr);
+ return max_irr;
+}
+
+static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
+{
+ u8 rvi = vmx_get_rvi();
+ u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
+
+ return ((rvi & 0xf0) > (vppr & 0xf0));
+}
+
+static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ return pi_test_on(vcpu_to_pi_desc(vcpu));
+}
+
+static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
+ vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
+ vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
+ vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
+}
+
+static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ pi_clear_on(&vmx->pi_desc);
+ memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
+}
+
+static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
+{
+ if (vmx->exit_reason != EXIT_REASON_EXCEPTION_NMI)
+ return;
+
+ vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ /* if exit due to PF check for async PF */
+ if (is_page_fault(vmx->exit_intr_info))
+ vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
+
+ /* Handle machine checks before interrupts are enabled */
+ if (is_machine_check(vmx->exit_intr_info))
+ kvm_machine_check();
+
+ /* We need to handle NMIs before interrupts are enabled */
+ if (is_nmi(vmx->exit_intr_info)) {
+ kvm_before_interrupt(&vmx->vcpu);
+ asm("int $2");
+ kvm_after_interrupt(&vmx->vcpu);
+ }
+}
+
+static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
+{
+ u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
+ == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
+ unsigned int vector;
+ unsigned long entry;
+ gate_desc *desc;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+#ifdef CONFIG_X86_64
+ unsigned long tmp;
+#endif
+
+ vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+ desc = (gate_desc *)vmx->host_idt_base + vector;
+ entry = gate_offset(desc);
+ asm volatile(
+#ifdef CONFIG_X86_64
+ "mov %%" _ASM_SP ", %[sp]\n\t"
+ "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
+ "push $%c[ss]\n\t"
+ "push %[sp]\n\t"
+#endif
+ "pushf\n\t"
+ __ASM_SIZE(push) " $%c[cs]\n\t"
+ CALL_NOSPEC
+ :
+#ifdef CONFIG_X86_64
+ [sp]"=&r"(tmp),
+#endif
+ ASM_CALL_CONSTRAINT
+ :
+ THUNK_TARGET(entry),
+ [ss]"i"(__KERNEL_DS),
+ [cs]"i"(__KERNEL_CS)
+ );
+ }
+}
+STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
+
+static bool vmx_has_emulated_msr(int index)
+{
+ switch (index) {
+ case MSR_IA32_SMBASE:
+ /*
+ * We cannot do SMM unless we can run the guest in big
+ * real mode.
+ */
+ return enable_unrestricted_guest || emulate_invalid_guest_state;
+ case MSR_AMD64_VIRT_SPEC_CTRL:
+ /* This is AMD only. */
+ return false;
+ default:
+ return true;
+ }
+}
+
+static bool vmx_mpx_supported(void)
+{
+ return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
+ (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
+}
+
+static bool vmx_xsaves_supported(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_XSAVES;
+}
+
+static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
+{
+ u32 exit_intr_info;
+ bool unblock_nmi;
+ u8 vector;
+ bool idtv_info_valid;
+
+ idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+
+ if (enable_vnmi) {
+ if (vmx->loaded_vmcs->nmi_known_unmasked)
+ return;
+ /*
+ * Can't use vmx->exit_intr_info since we're not sure what
+ * the exit reason is.
+ */
+ exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
+ vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+ /*
+ * SDM 3: 27.7.1.2 (September 2008)
+ * Re-set bit "block by NMI" before VM entry if vmexit caused by
+ * a guest IRET fault.
+ * SDM 3: 23.2.2 (September 2008)
+ * Bit 12 is undefined in any of the following cases:
+ * If the VM exit sets the valid bit in the IDT-vectoring
+ * information field.
+ * If the VM exit is due to a double fault.
+ */
+ if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
+ vector != DF_VECTOR && !idtv_info_valid)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ else
+ vmx->loaded_vmcs->nmi_known_unmasked =
+ !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
+ & GUEST_INTR_STATE_NMI);
+ } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
+ vmx->loaded_vmcs->vnmi_blocked_time +=
+ ktime_to_ns(ktime_sub(ktime_get(),
+ vmx->loaded_vmcs->entry_time));
+}
+
+static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
+ u32 idt_vectoring_info,
+ int instr_len_field,
+ int error_code_field)
+{
+ u8 vector;
+ int type;
+ bool idtv_info_valid;
+
+ idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
+ if (!idtv_info_valid)
+ return;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
+ type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
+
+ switch (type) {
+ case INTR_TYPE_NMI_INTR:
+ vcpu->arch.nmi_injected = true;
+ /*
+ * SDM 3: 27.7.1.2 (September 2008)
+ * Clear bit "block by NMI" before VM entry if a NMI
+ * delivery faulted.
+ */
+ vmx_set_nmi_mask(vcpu, false);
+ break;
+ case INTR_TYPE_SOFT_EXCEPTION:
+ vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
+ /* fall through */
+ case INTR_TYPE_HARD_EXCEPTION:
+ if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
+ u32 err = vmcs_read32(error_code_field);
+ kvm_requeue_exception_e(vcpu, vector, err);
+ } else
+ kvm_requeue_exception(vcpu, vector);
+ break;
+ case INTR_TYPE_SOFT_INTR:
+ vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
+ /* fall through */
+ case INTR_TYPE_EXT_INTR:
+ kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
+ break;
+ default:
+ break;
+ }
+}
+
+static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
+{
+ __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
+ VM_EXIT_INSTRUCTION_LEN,
+ IDT_VECTORING_ERROR_CODE);
+}
+
+static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
+{
+ __vmx_complete_interrupts(vcpu,
+ vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
+ VM_ENTRY_INSTRUCTION_LEN,
+ VM_ENTRY_EXCEPTION_ERROR_CODE);
+
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+}
+
+static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
+{
+ int i, nr_msrs;
+ struct perf_guest_switch_msr *msrs;
+
+ msrs = perf_guest_get_msrs(&nr_msrs);
+
+ if (!msrs)
+ return;
+
+ for (i = 0; i < nr_msrs; i++)
+ if (msrs[i].host == msrs[i].guest)
+ clear_atomic_switch_msr(vmx, msrs[i].msr);
+ else
+ add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
+ msrs[i].host, false);
+}
+
+static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val)
+{
+ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val);
+ if (!vmx->loaded_vmcs->hv_timer_armed)
+ vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+ vmx->loaded_vmcs->hv_timer_armed = true;
+}
+
+static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 tscl;
+ u32 delta_tsc;
+
+ if (vmx->req_immediate_exit) {
+ vmx_arm_hv_timer(vmx, 0);
+ return;
+ }
+
+ if (vmx->hv_deadline_tsc != -1) {
+ tscl = rdtsc();
+ if (vmx->hv_deadline_tsc > tscl)
+ /* set_hv_timer ensures the delta fits in 32-bits */
+ delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+ cpu_preemption_timer_multi);
+ else
+ delta_tsc = 0;
+
+ vmx_arm_hv_timer(vmx, delta_tsc);
+ return;
+ }
+
+ if (vmx->loaded_vmcs->hv_timer_armed)
+ vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+ vmx->loaded_vmcs->hv_timer_armed = false;
+}
+
+static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long cr3, cr4, evmcs_rsp;
+
+ /* Record the guest's net vcpu time for enforced NMI injections. */
+ if (unlikely(!enable_vnmi &&
+ vmx->loaded_vmcs->soft_vnmi_blocked))
+ vmx->loaded_vmcs->entry_time = ktime_get();
+
+ /* Don't enter VMX if guest state is invalid, let the exit handler
+ start emulation until we arrive back to a valid state */
+ if (vmx->emulation_required)
+ return;
+
+ if (vmx->ple_window_dirty) {
+ vmx->ple_window_dirty = false;
+ vmcs_write32(PLE_WINDOW, vmx->ple_window);
+ }
+
+ if (vmx->nested.sync_shadow_vmcs) {
+ copy_vmcs12_to_shadow(vmx);
+ vmx->nested.sync_shadow_vmcs = false;
+ }
+
+ if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+ vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+ if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+ vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+
+ cr3 = __get_current_cr3_fast();
+ if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
+ vmcs_writel(HOST_CR3, cr3);
+ vmx->loaded_vmcs->host_state.cr3 = cr3;
+ }
+
+ cr4 = cr4_read_shadow();
+ if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
+ vmcs_writel(HOST_CR4, cr4);
+ vmx->loaded_vmcs->host_state.cr4 = cr4;
+ }
+
+ /* When single-stepping over STI and MOV SS, we must clear the
+ * corresponding interruptibility bits in the guest state. Otherwise
+ * vmentry fails as it then expects bit 14 (BS) in pending debug
+ * exceptions being set, but that's not correct for the guest debugging
+ * case. */
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vmx_set_interrupt_shadow(vcpu, 0);
+
+ kvm_load_guest_xcr0(vcpu);
+
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
+ vcpu->arch.pkru != vmx->host_pkru)
+ __write_pkru(vcpu->arch.pkru);
+
+ atomic_switch_perf_msrs(vmx);
+
+ vmx_update_hv_timer(vcpu);
+
+ /*
+ * If this vCPU has touched SPEC_CTRL, restore the guest's value if
+ * it's non-zero. Since vmentry is serialising on affected CPUs, there
+ * is no need to worry about the conditional branch over the wrmsr
+ * being speculatively taken.
+ */
+ x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
+
+ vmx->__launched = vmx->loaded_vmcs->launched;
+
+ evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
+ (unsigned long)&current_evmcs->host_rsp : 0;
+
+ /* L1D Flush includes CPU buffer clear to mitigate MDS */
+ if (static_branch_unlikely(&vmx_l1d_should_flush))
+ vmx_l1d_flush(vcpu);
+ else if (static_branch_unlikely(&mds_user_clear))
+ mds_clear_cpu_buffers();
+ else if (static_branch_unlikely(&mmio_stale_data_clear) &&
+ kvm_arch_has_assigned_device(vcpu->kvm))
+ mds_clear_cpu_buffers();
+
+ vmx_disable_fb_clear(vmx);
+
+ asm volatile (
+ /* Store host registers */
+ "push %%" _ASM_DX "; push %%" _ASM_BP ";"
+ "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
+ "push %%" _ASM_CX " \n\t"
+ "cmp %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t"
+ "je 1f \n\t"
+ "mov %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t"
+ /* Avoid VMWRITE when Enlightened VMCS is in use */
+ "test %%" _ASM_SI ", %%" _ASM_SI " \n\t"
+ "jz 2f \n\t"
+ "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
+ "jmp 1f \n\t"
+ "2: \n\t"
+ __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
+ "1: \n\t"
+ /* Reload cr2 if changed */
+ "mov %c[cr2](%%" _ASM_CX "), %%" _ASM_AX " \n\t"
+ "mov %%cr2, %%" _ASM_DX " \n\t"
+ "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
+ "je 3f \n\t"
+ "mov %%" _ASM_AX", %%cr2 \n\t"
+ "3: \n\t"
+ /* Check if vmlaunch of vmresume is needed */
+ "cmpb $0, %c[launched](%%" _ASM_CX ") \n\t"
+ /* Load guest registers. Don't clobber flags. */
+ "mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t"
+ "mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t"
+ "mov %c[rdx](%%" _ASM_CX "), %%" _ASM_DX " \n\t"
+ "mov %c[rsi](%%" _ASM_CX "), %%" _ASM_SI " \n\t"
+ "mov %c[rdi](%%" _ASM_CX "), %%" _ASM_DI " \n\t"
+ "mov %c[rbp](%%" _ASM_CX "), %%" _ASM_BP " \n\t"
+#ifdef CONFIG_X86_64
+ "mov %c[r8](%%" _ASM_CX "), %%r8 \n\t"
+ "mov %c[r9](%%" _ASM_CX "), %%r9 \n\t"
+ "mov %c[r10](%%" _ASM_CX "), %%r10 \n\t"
+ "mov %c[r11](%%" _ASM_CX "), %%r11 \n\t"
+ "mov %c[r12](%%" _ASM_CX "), %%r12 \n\t"
+ "mov %c[r13](%%" _ASM_CX "), %%r13 \n\t"
+ "mov %c[r14](%%" _ASM_CX "), %%r14 \n\t"
+ "mov %c[r15](%%" _ASM_CX "), %%r15 \n\t"
+#endif
+ /* Load guest RCX. This kills the vmx_vcpu pointer! */
+ "mov %c[rcx](%%" _ASM_CX "), %%" _ASM_CX " \n\t"
+
+ /* Enter guest mode */
+ "jne 1f \n\t"
+ __ex(ASM_VMX_VMLAUNCH) "\n\t"
+ "jmp 2f \n\t"
+ "1: " __ex(ASM_VMX_VMRESUME) "\n\t"
+ "2: "
+
+ /* Save guest's RCX to the stack placeholder (see above) */
+ "mov %%" _ASM_CX ", %c[wordsize](%%" _ASM_SP ") \n\t"
+
+ /* Load host's RCX, i.e. the vmx_vcpu pointer */
+ "pop %%" _ASM_CX " \n\t"
+
+ /* Set vmx->fail based on EFLAGS.{CF,ZF} */
+ "setbe %c[fail](%%" _ASM_CX ")\n\t"
+
+ /* Save all guest registers, including RCX from the stack */
+ "mov %%" _ASM_AX ", %c[rax](%%" _ASM_CX ") \n\t"
+ "mov %%" _ASM_BX ", %c[rbx](%%" _ASM_CX ") \n\t"
+ __ASM_SIZE(pop) " %c[rcx](%%" _ASM_CX ") \n\t"
+ "mov %%" _ASM_DX ", %c[rdx](%%" _ASM_CX ") \n\t"
+ "mov %%" _ASM_SI ", %c[rsi](%%" _ASM_CX ") \n\t"
+ "mov %%" _ASM_DI ", %c[rdi](%%" _ASM_CX ") \n\t"
+ "mov %%" _ASM_BP ", %c[rbp](%%" _ASM_CX ") \n\t"
+#ifdef CONFIG_X86_64
+ "mov %%r8, %c[r8](%%" _ASM_CX ") \n\t"
+ "mov %%r9, %c[r9](%%" _ASM_CX ") \n\t"
+ "mov %%r10, %c[r10](%%" _ASM_CX ") \n\t"
+ "mov %%r11, %c[r11](%%" _ASM_CX ") \n\t"
+ "mov %%r12, %c[r12](%%" _ASM_CX ") \n\t"
+ "mov %%r13, %c[r13](%%" _ASM_CX ") \n\t"
+ "mov %%r14, %c[r14](%%" _ASM_CX ") \n\t"
+ "mov %%r15, %c[r15](%%" _ASM_CX ") \n\t"
+
+ /*
+ * Clear all general purpose registers (except RSP, which is loaded by
+ * the CPU during VM-Exit) to prevent speculative use of the guest's
+ * values, even those that are saved/loaded via the stack. In theory,
+ * an L1 cache miss when restoring registers could lead to speculative
+ * execution with the guest's values. Zeroing XORs are dirt cheap,
+ * i.e. the extra paranoia is essentially free.
+ */
+ "xor %%r8d, %%r8d \n\t"
+ "xor %%r9d, %%r9d \n\t"
+ "xor %%r10d, %%r10d \n\t"
+ "xor %%r11d, %%r11d \n\t"
+ "xor %%r12d, %%r12d \n\t"
+ "xor %%r13d, %%r13d \n\t"
+ "xor %%r14d, %%r14d \n\t"
+ "xor %%r15d, %%r15d \n\t"
+#endif
+ "mov %%cr2, %%" _ASM_AX " \n\t"
+ "mov %%" _ASM_AX ", %c[cr2](%%" _ASM_CX ") \n\t"
+
+ "xor %%eax, %%eax \n\t"
+ "xor %%ebx, %%ebx \n\t"
+ "xor %%ecx, %%ecx \n\t"
+ "xor %%edx, %%edx \n\t"
+ "xor %%esi, %%esi \n\t"
+ "xor %%edi, %%edi \n\t"
+ "xor %%ebp, %%ebp \n\t"
+ "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
+ ".pushsection .rodata \n\t"
+ ".global vmx_return \n\t"
+ "vmx_return: " _ASM_PTR " 2b \n\t"
+ ".popsection"
+ : "=c"((int){0}), "=d"((int){0}), "=S"((int){0})
+ : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp),
+ [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
+ [fail]"i"(offsetof(struct vcpu_vmx, fail)),
+ [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
+ [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
+ [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
+ [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
+ [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
+ [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
+ [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
+ [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
+#ifdef CONFIG_X86_64
+ [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
+ [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
+ [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
+ [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
+ [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
+ [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
+ [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
+ [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
+#endif
+ [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
+ [wordsize]"i"(sizeof(ulong))
+ : "cc", "memory"
+#ifdef CONFIG_X86_64
+ , "rax", "rbx", "rdi"
+ , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
+#else
+ , "eax", "ebx", "edi"
+#endif
+ );
+
+ vmx_enable_fb_clear(vmx);
+
+ /*
+ * We do not use IBRS in the kernel. If this vCPU has used the
+ * SPEC_CTRL MSR it may have left it on; save the value and
+ * turn it off. This is much more efficient than blindly adding
+ * it to the atomic save/restore list. Especially as the former
+ * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
+ *
+ * For non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+ * For nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
+ if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+ vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
+
+ x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
+
+ /* Eliminate branch target predictions from guest mode */
+ vmexit_fill_RSB();
+
+ /* All fields are clean at this point */
+ if (static_branch_unlikely(&enable_evmcs))
+ current_evmcs->hv_clean_fields |=
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
+ /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
+ if (vmx->host_debugctlmsr)
+ update_debugctlmsr(vmx->host_debugctlmsr);
+
+#ifndef CONFIG_X86_64
+ /*
+ * The sysexit path does not restore ds/es, so we must set them to
+ * a reasonable value ourselves.
+ *
+ * We can't defer this to vmx_prepare_switch_to_host() since that
+ * function may be executed in interrupt context, which saves and
+ * restore segments around it, nullifying its effect.
+ */
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+#endif
+
+ vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
+ | (1 << VCPU_EXREG_RFLAGS)
+ | (1 << VCPU_EXREG_PDPTR)
+ | (1 << VCPU_EXREG_SEGMENTS)
+ | (1 << VCPU_EXREG_CR3));
+ vcpu->arch.regs_dirty = 0;
+
+ /*
+ * eager fpu is enabled if PKEY is supported and CR4 is switched
+ * back on host, so it is safe to read guest PKRU from current
+ * XSAVE.
+ */
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
+ vcpu->arch.pkru = __read_pkru();
+ if (vcpu->arch.pkru != vmx->host_pkru)
+ __write_pkru(vmx->host_pkru);
+ }
+
+ kvm_put_guest_xcr0(vcpu);
+
+ vmx->nested.nested_run_pending = 0;
+ vmx->idt_vectoring_info = 0;
+
+ vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
+ if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
+ kvm_machine_check();
+
+ if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+ return;
+
+ vmx->loaded_vmcs->launched = 1;
+ vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+
+ vmx_complete_atomic_exit(vmx);
+ vmx_recover_nmi_blocking(vmx);
+ vmx_complete_interrupts(vmx);
+}
+STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
+
+static struct kvm *vmx_vm_alloc(void)
+{
+ struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx));
+
+ if (!kvm_vmx)
+ return NULL;
+
+ return &kvm_vmx->kvm;
+}
+
+static void vmx_vm_free(struct kvm *kvm)
+{
+ vfree(to_kvm_vmx(kvm));
+}
+
+static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int cpu;
+
+ if (vmx->loaded_vmcs == vmcs)
+ return;
+
+ cpu = get_cpu();
+ vmx_vcpu_put(vcpu);
+ vmx->loaded_vmcs = vmcs;
+ vmx_vcpu_load(vcpu, cpu);
+ put_cpu();
+}
+
+/*
+ * Ensure that the current vmcs of the logical processor is the
+ * vmcs01 of the vcpu before calling free_nested().
+ */
+static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vcpu_load(vcpu);
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+ free_nested(vmx);
+ vcpu_put(vcpu);
+}
+
+static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (enable_pml)
+ vmx_destroy_pml_buffer(vmx);
+ free_vpid(vmx->vpid);
+ leave_guest_mode(vcpu);
+ vmx_free_vcpu_nested(vcpu);
+ free_loaded_vmcs(vmx->loaded_vmcs);
+ kfree(vmx->guest_msrs);
+ kvm_vcpu_uninit(vcpu);
+ kmem_cache_free(kvm_vcpu_cache, vmx);
+}
+
+static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
+{
+ int err;
+ struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+ unsigned long *msr_bitmap;
+ int cpu;
+
+ if (!vmx)
+ return ERR_PTR(-ENOMEM);
+
+ vmx->vpid = allocate_vpid();
+
+ err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
+ if (err)
+ goto free_vcpu;
+
+ err = -ENOMEM;
+
+ /*
+ * If PML is turned on, failure on enabling PML just results in failure
+ * of creating the vcpu, therefore we can simplify PML logic (by
+ * avoiding dealing with cases, such as enabling PML partially on vcpus
+ * for the guest, etc.
+ */
+ if (enable_pml) {
+ vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!vmx->pml_pg)
+ goto uninit_vcpu;
+ }
+
+ vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
+ > PAGE_SIZE);
+
+ if (!vmx->guest_msrs)
+ goto free_pml;
+
+ err = alloc_loaded_vmcs(&vmx->vmcs01);
+ if (err < 0)
+ goto free_msrs;
+
+ msr_bitmap = vmx->vmcs01.msr_bitmap;
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+ vmx->msr_bitmap_mode = 0;
+
+ vmx->loaded_vmcs = &vmx->vmcs01;
+ cpu = get_cpu();
+ vmx_vcpu_load(&vmx->vcpu, cpu);
+ vmx->vcpu.cpu = cpu;
+ vmx_vcpu_setup(vmx);
+ vmx_vcpu_put(&vmx->vcpu);
+ put_cpu();
+ if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
+ err = alloc_apic_access_page(kvm);
+ if (err)
+ goto free_vmcs;
+ }
+
+ if (enable_ept && !enable_unrestricted_guest) {
+ err = init_rmode_identity_map(kvm);
+ if (err)
+ goto free_vmcs;
+ }
+
+ if (nested)
+ nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
+ kvm_vcpu_apicv_active(&vmx->vcpu));
+
+ vmx->nested.posted_intr_nv = -1;
+ vmx->nested.current_vmptr = -1ull;
+
+ vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
+
+ /*
+ * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
+ * or POSTED_INTR_WAKEUP_VECTOR.
+ */
+ vmx->pi_desc.nv = POSTED_INTR_VECTOR;
+ vmx->pi_desc.sn = 1;
+
+ return &vmx->vcpu;
+
+free_vmcs:
+ free_loaded_vmcs(vmx->loaded_vmcs);
+free_msrs:
+ kfree(vmx->guest_msrs);
+free_pml:
+ vmx_destroy_pml_buffer(vmx);
+uninit_vcpu:
+ kvm_vcpu_uninit(&vmx->vcpu);
+free_vcpu:
+ free_vpid(vmx->vpid);
+ kmem_cache_free(kvm_vcpu_cache, vmx);
+ return ERR_PTR(err);
+}
+
+#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
+#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
+
+static int vmx_vm_init(struct kvm *kvm)
+{
+ spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
+
+ if (!ple_gap)
+ kvm->arch.pause_in_guest = true;
+
+ if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
+ switch (l1tf_mitigation) {
+ case L1TF_MITIGATION_OFF:
+ case L1TF_MITIGATION_FLUSH_NOWARN:
+ /* 'I explicitly don't care' is set */
+ break;
+ case L1TF_MITIGATION_FLUSH:
+ case L1TF_MITIGATION_FLUSH_NOSMT:
+ case L1TF_MITIGATION_FULL:
+ /*
+ * Warn upon starting the first VM in a potentially
+ * insecure environment.
+ */
+ if (sched_smt_active())
+ pr_warn_once(L1TF_MSG_SMT);
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
+ pr_warn_once(L1TF_MSG_L1D);
+ break;
+ case L1TF_MITIGATION_FULL_FORCE:
+ /* Flush is enforced */
+ break;
+ }
+ }
+ return 0;
+}
+
+static void __init vmx_check_processor_compat(void *rtn)
+{
+ struct vmcs_config vmcs_conf;
+
+ *(int *)rtn = 0;
+ if (setup_vmcs_config(&vmcs_conf) < 0)
+ *(int *)rtn = -EIO;
+ nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, enable_apicv);
+ if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
+ printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
+ smp_processor_id());
+ *(int *)rtn = -EIO;
+ }
+}
+
+static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+{
+ u8 cache;
+ u64 ipat = 0;
+
+ /* For VT-d and EPT combination
+ * 1. MMIO: always map as UC
+ * 2. EPT with VT-d:
+ * a. VT-d without snooping control feature: can't guarantee the
+ * result, try to trust guest.
+ * b. VT-d with snooping control feature: snooping control feature of
+ * VT-d engine can guarantee the cache correctness. Just set it
+ * to WB to keep consistent with host. So the same as item 3.
+ * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
+ * consistent with host MTRR
+ */
+ if (is_mmio) {
+ cache = MTRR_TYPE_UNCACHABLE;
+ goto exit;
+ }
+
+ if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
+ ipat = VMX_EPT_IPAT_BIT;
+ cache = MTRR_TYPE_WRBACK;
+ goto exit;
+ }
+
+ if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
+ ipat = VMX_EPT_IPAT_BIT;
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
+ cache = MTRR_TYPE_WRBACK;
+ else
+ cache = MTRR_TYPE_UNCACHABLE;
+ goto exit;
+ }
+
+ cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
+
+exit:
+ return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
+}
+
+static int vmx_get_lpage_level(void)
+{
+ if (enable_ept && !cpu_has_vmx_ept_1g_page())
+ return PT_DIRECTORY_LEVEL;
+ else
+ /* For shadow and EPT supported 1GB page */
+ return PT_PDPE_LEVEL;
+}
+
+static void vmcs_set_secondary_exec_control(u32 new_ctl)
+{
+ /*
+ * These bits in the secondary execution controls field
+ * are dynamic, the others are mostly based on the hypervisor
+ * architecture and the guest's CPUID. Do not touch the
+ * dynamic bits.
+ */
+ u32 mask =
+ SECONDARY_EXEC_SHADOW_VMCS |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_DESC;
+
+ u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ (new_ctl & ~mask) | (cur_ctl & mask));
+}
+
+/*
+ * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
+ * (indicating "allowed-1") if they are supported in the guest's CPUID.
+ */
+static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_cpuid_entry2 *entry;
+
+ vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
+ vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
+
+#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
+ if (entry && (entry->_reg & (_cpuid_mask))) \
+ vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
+} while (0)
+
+ entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME));
+ cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME));
+ cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC));
+ cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE));
+ cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE));
+ cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE));
+ cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE));
+ cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE));
+ cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR));
+ cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
+ cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX));
+ cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX));
+ cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID));
+ cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE));
+
+ entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
+ cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE));
+ cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP));
+ cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP));
+ cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU));
+ cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP));
+
+#undef cr4_fixed1_update
+}
+
+static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (kvm_mpx_supported()) {
+ bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
+
+ if (mpx_enabled) {
+ vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
+ vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+ } else {
+ vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
+ vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
+ }
+ }
+}
+
+static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (cpu_has_secondary_exec_ctrls()) {
+ vmx_compute_secondary_exec_control(vmx);
+ vmcs_set_secondary_exec_control(vmx->secondary_exec_control);
+ }
+
+ if (nested_vmx_allowed(vcpu))
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+ FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ else
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+ ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+
+ if (nested_vmx_allowed(vcpu)) {
+ nested_vmx_cr_fixed1_bits_update(vcpu);
+ nested_vmx_entry_exit_ctls_update(vcpu);
+ }
+}
+
+static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+{
+ if (func == 1 && nested)
+ entry->ecx |= bit(X86_FEATURE_VMX);
+}
+
+static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 exit_reason;
+ unsigned long exit_qualification = vcpu->arch.exit_qualification;
+
+ if (vmx->nested.pml_full) {
+ exit_reason = EXIT_REASON_PML_FULL;
+ vmx->nested.pml_full = false;
+ exit_qualification &= INTR_INFO_UNBLOCK_NMI;
+ } else if (fault->error_code & PFERR_RSVD_MASK)
+ exit_reason = EXIT_REASON_EPT_MISCONFIG;
+ else
+ exit_reason = EXIT_REASON_EPT_VIOLATION;
+
+ nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
+ vmcs12->guest_physical_address = fault->address;
+}
+
+static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
+{
+ return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
+}
+
+/* Callbacks for nested_ept_init_mmu_context: */
+
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
+{
+ /* return the page table to be shadowed - in our case, EPT12 */
+ return get_vmcs12(vcpu)->ept_pointer;
+}
+
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+ WARN_ON(mmu_is_nested(vcpu));
+ if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu)))
+ return 1;
+
+ kvm_init_shadow_ept_mmu(vcpu,
+ to_vmx(vcpu)->nested.msrs.ept_caps &
+ VMX_EPT_EXECUTE_ONLY_BIT,
+ nested_ept_ad_enabled(vcpu),
+ nested_ept_get_cr3(vcpu));
+ vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
+ vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
+ vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
+
+ vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
+ return 0;
+}
+
+static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+}
+
+static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
+ u16 error_code)
+{
+ bool inequality, bit;
+
+ bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
+ inequality =
+ (error_code & vmcs12->page_fault_error_code_mask) !=
+ vmcs12->page_fault_error_code_match;
+ return inequality ^ bit;
+}
+
+static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ WARN_ON(!is_guest_mode(vcpu));
+
+ if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
+ !to_vmx(vcpu)->nested.nested_run_pending) {
+ vmcs12->vm_exit_intr_error_code = fault->error_code;
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+ PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
+ INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
+ fault->address);
+ } else {
+ kvm_inject_page_fault(vcpu, fault);
+ }
+}
+
+static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12);
+
+static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct page *page;
+ u64 hpa;
+
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+ /*
+ * Translate L1 physical address to host physical
+ * address for vmcs02. Keep the page pinned, so this
+ * physical address remains valid. We keep a reference
+ * to it so we can release it later.
+ */
+ if (vmx->nested.apic_access_page) { /* shouldn't happen */
+ kvm_release_page_dirty(vmx->nested.apic_access_page);
+ vmx->nested.apic_access_page = NULL;
+ }
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
+ /*
+ * If translation failed, no matter: This feature asks
+ * to exit when accessing the given address, and if it
+ * can never be accessed, this feature won't do
+ * anything anyway.
+ */
+ if (!is_error_page(page)) {
+ vmx->nested.apic_access_page = page;
+ hpa = page_to_phys(vmx->nested.apic_access_page);
+ vmcs_write64(APIC_ACCESS_ADDR, hpa);
+ } else {
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+ }
+ }
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
+ if (vmx->nested.virtual_apic_page) { /* shouldn't happen */
+ kvm_release_page_dirty(vmx->nested.virtual_apic_page);
+ vmx->nested.virtual_apic_page = NULL;
+ }
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr);
+
+ /*
+ * If translation failed, VM entry will fail because
+ * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
+ * Failing the vm entry is _not_ what the processor
+ * does but it's basically the only possibility we
+ * have. We could still enter the guest if CR8 load
+ * exits are enabled, CR8 store exits are enabled, and
+ * virtualize APIC access is disabled; in this case
+ * the processor would never use the TPR shadow and we
+ * could simply clear the bit from the execution
+ * control. But such a configuration is useless, so
+ * let's keep the code simple.
+ */
+ if (!is_error_page(page)) {
+ vmx->nested.virtual_apic_page = page;
+ hpa = page_to_phys(vmx->nested.virtual_apic_page);
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
+ }
+ }
+
+ if (nested_cpu_has_posted_intr(vmcs12)) {
+ if (vmx->nested.pi_desc_page) { /* shouldn't happen */
+ kunmap(vmx->nested.pi_desc_page);
+ kvm_release_page_dirty(vmx->nested.pi_desc_page);
+ vmx->nested.pi_desc_page = NULL;
+ vmx->nested.pi_desc = NULL;
+ vmcs_write64(POSTED_INTR_DESC_ADDR, -1ull);
+ }
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr);
+ if (is_error_page(page))
+ return;
+ vmx->nested.pi_desc_page = page;
+ vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page);
+ vmx->nested.pi_desc =
+ (struct pi_desc *)((void *)vmx->nested.pi_desc +
+ (unsigned long)(vmcs12->posted_intr_desc_addr &
+ (PAGE_SIZE - 1)));
+ vmcs_write64(POSTED_INTR_DESC_ADDR,
+ page_to_phys(vmx->nested.pi_desc_page) +
+ (unsigned long)(vmcs12->posted_intr_desc_addr &
+ (PAGE_SIZE - 1)));
+ }
+ if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
+ vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_USE_MSR_BITMAPS);
+ else
+ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_USE_MSR_BITMAPS);
+}
+
+static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
+{
+ u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * A timer value of zero is architecturally guaranteed to cause
+ * a VMExit prior to executing any instructions in the guest.
+ */
+ if (preemption_timeout == 0) {
+ vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
+ return;
+ }
+
+ if (vcpu->arch.virtual_tsc_khz == 0)
+ return;
+
+ preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+ preemption_timeout *= 1000000;
+ do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
+ hrtimer_start(&vmx->nested.preemption_timer,
+ ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
+}
+
+static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ return 0;
+
+ if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) ||
+ !page_address_valid(vcpu, vmcs12->io_bitmap_b))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+ return 0;
+
+ if (!page_address_valid(vcpu, vmcs12->msr_bitmap))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+ return 0;
+
+ if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))
+ return -EINVAL;
+
+ return 0;
+}
+
+static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) {
+ int msr;
+
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+
+ msr_bitmap[word] = ~0;
+ msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
+ }
+}
+
+/*
+ * Merge L0's and L1's MSR bitmap, return false to indicate that
+ * we do not use the hardware.
+ */
+static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ int msr;
+ struct page *page;
+ unsigned long *msr_bitmap_l1;
+ unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
+ /*
+ * pred_cmd & spec_ctrl are trying to verify two things:
+ *
+ * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
+ * ensures that we do not accidentally generate an L02 MSR bitmap
+ * from the L12 MSR bitmap that is too permissive.
+ * 2. That L1 or L2s have actually used the MSR. This avoids
+ * unnecessarily merging of the bitmap if the MSR is unused. This
+ * works properly because we only update the L01 MSR bitmap lazily.
+ * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
+ * updated to reflect this when L1 (or its L2s) actually write to
+ * the MSR.
+ */
+ bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
+ bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
+
+ /* Nothing to do if the MSR bitmap is not in use. */
+ if (!cpu_has_vmx_msr_bitmap() ||
+ !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+ return false;
+
+ if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+ !pred_cmd && !spec_ctrl)
+ return false;
+
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
+ if (is_error_page(page))
+ return false;
+
+ msr_bitmap_l1 = (unsigned long *)kmap(page);
+
+ /*
+ * To keep the control flow simple, pay eight 8-byte writes (sixteen
+ * 4-byte writes on 32-bit systems) up front to enable intercepts for
+ * the x2APIC MSR range and selectively disable them below.
+ */
+ enable_x2apic_msr_intercepts(msr_bitmap_l0);
+
+ if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
+ if (nested_cpu_has_apic_reg_virt(vmcs12)) {
+ /*
+ * L0 need not intercept reads for MSRs between 0x800
+ * and 0x8ff, it just lets the processor take the value
+ * from the virtual-APIC page; take those 256 bits
+ * directly from the L1 bitmap.
+ */
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned word = msr / BITS_PER_LONG;
+
+ msr_bitmap_l0[word] = msr_bitmap_l1[word];
+ }
+ }
+
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_TASKPRI),
+ MSR_TYPE_R | MSR_TYPE_W);
+
+ if (nested_cpu_has_vid(vmcs12)) {
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_EOI),
+ MSR_TYPE_W);
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ X2APIC_MSR(APIC_SELF_IPI),
+ MSR_TYPE_W);
+ }
+ }
+
+ if (spec_ctrl)
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_SPEC_CTRL,
+ MSR_TYPE_R | MSR_TYPE_W);
+
+ if (pred_cmd)
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_PRED_CMD,
+ MSR_TYPE_W);
+
+ kunmap(page);
+ kvm_release_page_clean(page);
+
+ return true;
+}
+
+static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vmcs12 *shadow;
+ struct page *page;
+
+ if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
+ vmcs12->vmcs_link_pointer == -1ull)
+ return;
+
+ shadow = get_shadow_vmcs12(vcpu);
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
+
+ memcpy(shadow, kmap(page), VMCS12_SIZE);
+
+ kunmap(page);
+ kvm_release_page_clean(page);
+}
+
+static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
+ vmcs12->vmcs_link_pointer == -1ull)
+ return;
+
+ kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
+ get_shadow_vmcs12(vcpu), VMCS12_SIZE);
+}
+
+static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+ !page_address_valid(vcpu, vmcs12->apic_access_addr))
+ return -EINVAL;
+ else
+ return 0;
+}
+
+static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+ !nested_cpu_has_apic_reg_virt(vmcs12) &&
+ !nested_cpu_has_vid(vmcs12) &&
+ !nested_cpu_has_posted_intr(vmcs12))
+ return 0;
+
+ /*
+ * If virtualize x2apic mode is enabled,
+ * virtualize apic access must be disabled.
+ */
+ if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ return -EINVAL;
+
+ /*
+ * If virtual interrupt delivery is enabled,
+ * we must exit on external interrupts.
+ */
+ if (nested_cpu_has_vid(vmcs12) &&
+ !nested_exit_on_intr(vcpu))
+ return -EINVAL;
+
+ /*
+ * bits 15:8 should be zero in posted_intr_nv,
+ * the descriptor address has been already checked
+ * in nested_get_vmcs12_pages.
+ *
+ * bits 5:0 of posted_intr_desc_addr should be zero.
+ */
+ if (nested_cpu_has_posted_intr(vmcs12) &&
+ (!nested_cpu_has_vid(vmcs12) ||
+ !nested_exit_intr_ack_set(vcpu) ||
+ (vmcs12->posted_intr_nv & 0xff00) ||
+ (vmcs12->posted_intr_desc_addr & 0x3f) ||
+ (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))
+ return -EINVAL;
+
+ /* tpr shadow is needed by all apicv features. */
+ if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
+ unsigned long count_field,
+ unsigned long addr_field)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ int maxphyaddr;
+ u64 count, addr;
+
+ if (vmcs12_read_any(vmcs12, count_field, &count) ||
+ vmcs12_read_any(vmcs12, addr_field, &addr)) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+ if (count == 0)
+ return 0;
+ maxphyaddr = cpuid_maxphyaddr(vcpu);
+ if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
+ (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
+ pr_debug_ratelimited(
+ "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
+ addr_field, maxphyaddr, count, addr);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (vmcs12->vm_exit_msr_load_count == 0 &&
+ vmcs12->vm_exit_msr_store_count == 0 &&
+ vmcs12->vm_entry_msr_load_count == 0)
+ return 0; /* Fast path */
+ if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
+ VM_EXIT_MSR_LOAD_ADDR) ||
+ nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
+ VM_EXIT_MSR_STORE_ADDR) ||
+ nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
+ VM_ENTRY_MSR_LOAD_ADDR))
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ u64 address = vmcs12->pml_address;
+ int maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) {
+ if (!nested_cpu_has_ept(vmcs12) ||
+ !IS_ALIGNED(address, 4096) ||
+ address >> maxphyaddr)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has_shadow_vmcs(vmcs12))
+ return 0;
+
+ if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) ||
+ !page_address_valid(vcpu, vmcs12->vmwrite_bitmap))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ /* x2APIC MSR accesses are not allowed */
+ if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
+ return -EINVAL;
+ if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
+ e->index == MSR_IA32_UCODE_REV)
+ return -EINVAL;
+ if (e->reserved != 0)
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ if (e->index == MSR_FS_BASE ||
+ e->index == MSR_GS_BASE ||
+ e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
+ nested_vmx_msr_check_common(vcpu, e))
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
+ nested_vmx_msr_check_common(vcpu, e))
+ return -EINVAL;
+ return 0;
+}
+
+/*
+ * Load guest's/host's msr at nested entry/exit.
+ * return 0 for success, entry index for failure.
+ */
+static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+ u32 i;
+ struct vmx_msr_entry e;
+ struct msr_data msr;
+
+ msr.host_initiated = false;
+ for (i = 0; i < count; i++) {
+ if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
+ &e, sizeof(e))) {
+ pr_debug_ratelimited(
+ "%s cannot read MSR entry (%u, 0x%08llx)\n",
+ __func__, i, gpa + i * sizeof(e));
+ goto fail;
+ }
+ if (nested_vmx_load_msr_check(vcpu, &e)) {
+ pr_debug_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, i, e.index, e.reserved);
+ goto fail;
+ }
+ msr.index = e.index;
+ msr.data = e.value;
+ if (kvm_set_msr(vcpu, &msr)) {
+ pr_debug_ratelimited(
+ "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
+ __func__, i, e.index, e.value);
+ goto fail;
+ }
+ }
+ return 0;
+fail:
+ return i + 1;
+}
+
+static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+ u32 i;
+ struct vmx_msr_entry e;
+
+ for (i = 0; i < count; i++) {
+ struct msr_data msr_info;
+ if (kvm_vcpu_read_guest(vcpu,
+ gpa + i * sizeof(e),
+ &e, 2 * sizeof(u32))) {
+ pr_debug_ratelimited(
+ "%s cannot read MSR entry (%u, 0x%08llx)\n",
+ __func__, i, gpa + i * sizeof(e));
+ return -EINVAL;
+ }
+ if (nested_vmx_store_msr_check(vcpu, &e)) {
+ pr_debug_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, i, e.index, e.reserved);
+ return -EINVAL;
+ }
+ msr_info.host_initiated = false;
+ msr_info.index = e.index;
+ if (kvm_get_msr(vcpu, &msr_info)) {
+ pr_debug_ratelimited(
+ "%s cannot read MSR (%u, 0x%x)\n",
+ __func__, i, e.index);
+ return -EINVAL;
+ }
+ if (kvm_vcpu_write_guest(vcpu,
+ gpa + i * sizeof(e) +
+ offsetof(struct vmx_msr_entry, value),
+ &msr_info.data, sizeof(msr_info.data))) {
+ pr_debug_ratelimited(
+ "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
+ __func__, i, e.index, msr_info.data);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ unsigned long invalid_mask;
+
+ invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
+ return (val & invalid_mask) == 0;
+}
+
+/*
+ * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
+ * emulating VM entry into a guest with EPT enabled.
+ * Returns 0 on success, 1 on failure. Invalid state exit qualification code
+ * is assigned to entry_failure_code on failure.
+ */
+static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
+ u32 *entry_failure_code)
+{
+ if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
+ if (!nested_cr3_valid(vcpu, cr3)) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+ return 1;
+ }
+
+ /*
+ * If PAE paging and EPT are both on, CR3 is not used by the CPU and
+ * must not be dereferenced.
+ */
+ if (is_pae_paging(vcpu) && !nested_ept) {
+ if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
+ *entry_failure_code = ENTRY_FAIL_PDPTE;
+ return 1;
+ }
+ }
+ }
+
+ if (!nested_ept)
+ kvm_mmu_new_cr3(vcpu, cr3, false);
+
+ vcpu->arch.cr3 = cr3;
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+
+ kvm_init_mmu(vcpu, false);
+
+ return 0;
+}
+
+static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
+ vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
+ vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
+ vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
+ vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
+ vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
+ vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
+ vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
+ vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
+ vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
+ vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
+ vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
+ vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
+ vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
+ vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
+ vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
+ vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
+ vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
+ vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
+ vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
+ vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
+ vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
+ vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
+ vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
+ vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
+ vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
+ vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
+ vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
+ vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
+ vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
+ vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+
+ vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ vmcs12->guest_pending_dbg_exceptions);
+ vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
+ vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+
+ if (nested_cpu_has_xsaves(vmcs12))
+ vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
+ vmcs_write64(VMCS_LINK_POINTER, -1ull);
+
+ if (cpu_has_vmx_posted_intr())
+ vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
+
+ /*
+ * Whether page-faults are trapped is determined by a combination of
+ * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
+ * If enable_ept, L0 doesn't care about page faults and we should
+ * set all of these to L1's desires. However, if !enable_ept, L0 does
+ * care about (at least some) page faults, and because it is not easy
+ * (if at all possible?) to merge L0 and L1's desires, we simply ask
+ * to exit on each and every L2 page fault. This is done by setting
+ * MASK=MATCH=0 and (see below) EB.PF=1.
+ * Note that below we don't need special code to set EB.PF beyond the
+ * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
+ * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
+ * !enable_ept, EB.PF is 1, so the "or" will always be 1.
+ */
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+ enable_ept ? vmcs12->page_fault_error_code_mask : 0);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+ enable_ept ? vmcs12->page_fault_error_code_match : 0);
+
+ /* All VMFUNCs are currently emulated through L0 vmexits. */
+ if (cpu_has_vmx_vmfunc())
+ vmcs_write64(VM_FUNCTION_CONTROL, 0);
+
+ if (cpu_has_vmx_apicv()) {
+ vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
+ vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
+ vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
+ vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
+ }
+
+ /*
+ * Set host-state according to L0's settings (vmcs12 is irrelevant here)
+ * Some constant fields are set here by vmx_set_constant_host_state().
+ * Other fields are different per CPU, and will be set later when
+ * vmx_vcpu_load() is called, and when vmx_prepare_switch_to_guest()
+ * is called.
+ */
+ vmx_set_constant_host_state(vmx);
+
+ /*
+ * Set the MSR load/store lists to match L0's settings.
+ */
+ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
+
+ set_cr4_guest_host_mask(vmx);
+
+ if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+ vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+
+ if (enable_vpid) {
+ if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
+ else
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+ }
+
+ /*
+ * L1 may access the L2's PDPTR, so save them to construct vmcs12
+ */
+ if (enable_ept) {
+ vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+ vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+ vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+ vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+ }
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
+}
+
+/*
+ * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
+ * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
+ * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
+ * guest in a way that will both be appropriate to L1's requests, and our
+ * needs. In addition to modifying the active vmcs (which is vmcs02), this
+ * function also has additional necessary side-effects, like setting various
+ * vcpu->arch fields.
+ * Returns 0 on success, 1 on failure. Invalid state exit qualification code
+ * is assigned to entry_failure_code on failure.
+ */
+static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ u32 *entry_failure_code)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 exec_control, vmcs12_exec_ctrl;
+
+ if (vmx->nested.dirty_vmcs12) {
+ prepare_vmcs02_full(vcpu, vmcs12);
+ vmx->nested.dirty_vmcs12 = false;
+ }
+
+ /*
+ * First, the fields that are shadowed. This must be kept in sync
+ * with vmx_shadow_fields.h.
+ */
+
+ vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
+ vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
+ vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
+ vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
+ vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
+
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
+ kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+ } else {
+ kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
+ }
+ if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
+ vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
+ if (vmx->nested.nested_run_pending) {
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ vmcs12->vm_entry_intr_info_field);
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vmcs12->vm_entry_exception_error_code);
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmcs12->vm_entry_instruction_len);
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+ vmcs12->guest_interruptibility_info);
+ vmx->loaded_vmcs->nmi_known_unmasked =
+ !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
+ } else {
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+ }
+ vmx_set_rflags(vcpu, vmcs12->guest_rflags);
+
+ exec_control = vmcs12->pin_based_vm_exec_control;
+
+ /* Preemption timer setting is computed directly in vmx_vcpu_run. */
+ exec_control |= vmcs_config.pin_based_exec_ctrl;
+ exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ vmx->loaded_vmcs->hv_timer_armed = false;
+
+ /* Posted interrupts setting is only taken from vmcs12. */
+ if (nested_cpu_has_posted_intr(vmcs12)) {
+ vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
+ vmx->nested.pi_pending = false;
+ } else {
+ exec_control &= ~PIN_BASED_POSTED_INTR;
+ }
+
+ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
+
+ vmx->nested.preemption_timer_expired = false;
+ if (nested_cpu_has_preemption_timer(vmcs12))
+ vmx_start_preemption_timer(vcpu);
+
+ if (cpu_has_secondary_exec_ctrls()) {
+ exec_control = vmx->secondary_exec_control;
+
+ /* Take the following fields only from vmcs12 */
+ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_ENABLE_INVPCID |
+ SECONDARY_EXEC_RDTSCP |
+ SECONDARY_EXEC_XSAVES |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_ENABLE_VMFUNC);
+ if (nested_cpu_has(vmcs12,
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
+ vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
+ ~SECONDARY_EXEC_ENABLE_PML;
+ exec_control |= vmcs12_exec_ctrl;
+ }
+
+ /* VMCS shadowing for L2 is emulated for now */
+ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
+
+ if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
+ vmcs_write16(GUEST_INTR_STATUS,
+ vmcs12->guest_intr_status);
+
+ /*
+ * Write an illegal value to APIC_ACCESS_ADDR. Later,
+ * nested_get_vmcs12_pages will either fix it up or
+ * remove the VM execution control.
+ */
+ if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
+ vmcs_write64(APIC_ACCESS_ADDR, -1ull);
+
+ if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
+ vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ }
+
+ /*
+ * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
+ * entry, but only if the current (host) sp changed from the value
+ * we wrote last (vmx->host_rsp). This cache is no longer relevant
+ * if we switch vmcs, and rather than hold a separate cache per vmcs,
+ * here we just force the write to happen on entry.
+ */
+ vmx->host_rsp = 0;
+
+ exec_control = vmx_exec_control(vmx); /* L0's desires */
+ exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+ exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+ exec_control &= ~CPU_BASED_TPR_SHADOW;
+ exec_control |= vmcs12->cpu_based_vm_exec_control;
+
+ /*
+ * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
+ * nested_get_vmcs12_pages can't fix it up, the illegal value
+ * will result in a VM entry failure.
+ */
+ if (exec_control & CPU_BASED_TPR_SHADOW) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
+ vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+ } else {
+#ifdef CONFIG_X86_64
+ exec_control |= CPU_BASED_CR8_LOAD_EXITING |
+ CPU_BASED_CR8_STORE_EXITING;
+#endif
+ }
+
+ /*
+ * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
+ * for I/O port accesses.
+ */
+ exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
+ exec_control |= CPU_BASED_UNCOND_IO_EXITING;
+
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+ /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
+ * bitwise-or of what L1 wants to trap for L2, and what we want to
+ * trap. Note that CR0.TS also needs updating - we do this later.
+ */
+ update_exception_bitmap(vcpu);
+ vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+
+ /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
+ * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
+ * bits are further modified by vmx_set_efer() below.
+ */
+ vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+ /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
+ * emulated by vmx_set_efer(), below.
+ */
+ vm_entry_controls_init(vmx,
+ (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
+ ~VM_ENTRY_IA32E_MODE) |
+ (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
+
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
+ vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
+ vcpu->arch.pat = vmcs12->guest_ia32_pat;
+ } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
+ }
+
+ vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+
+ if (kvm_has_tsc_control)
+ decache_tsc_multiplier(vmx);
+
+ if (enable_vpid) {
+ /*
+ * There is no direct mapping between vpid02 and vpid12, the
+ * vpid02 is per-vCPU for L0 and reused while the value of
+ * vpid12 is changed w/ one invvpid during nested vmentry.
+ * The vpid12 is allocated by L1 for L2, so it will not
+ * influence global bitmap(for vpid01 and vpid02 allocation)
+ * even if spawn a lot of nested vCPUs.
+ */
+ if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
+ if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+ vmx->nested.last_vpid = vmcs12->virtual_processor_id;
+ __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+ }
+ } else {
+ vmx_flush_tlb(vcpu, true);
+ }
+ }
+
+ if (enable_pml) {
+ /*
+ * Conceptually we want to copy the PML address and index from
+ * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
+ * since we always flush the log on each vmexit, this happens
+ * to be equivalent to simply resetting the fields in vmcs02.
+ */
+ ASSERT(vmx->pml_pg);
+ vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
+ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ }
+
+ if (nested_cpu_has_ept(vmcs12)) {
+ if (nested_ept_init_mmu_context(vcpu)) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+ return 1;
+ }
+ } else if (nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+ vmx_flush_tlb(vcpu, true);
+ }
+
+ /*
+ * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
+ * bits which we consider mandatory enabled.
+ * The CR0_READ_SHADOW is what L2 should have expected to read given
+ * the specifications by L1; It's not enough to take
+ * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
+ * have more bits than L1 expected.
+ */
+ vmx_set_cr0(vcpu, vmcs12->guest_cr0);
+ vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
+
+ vmx_set_cr4(vcpu, vmcs12->guest_cr4);
+ vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
+
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
+ vcpu->arch.efer = vmcs12->guest_ia32_efer;
+ else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
+ vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+ else
+ vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+ /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
+ vmx_set_efer(vcpu, vcpu->arch.efer);
+
+ /*
+ * Guest state is invalid and unrestricted guest is disabled,
+ * which means L1 attempted VMEntry to L2 with invalid state.
+ * Fail the VMEntry.
+ */
+ if (vmx->emulation_required) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+ return 1;
+ }
+
+ /* Shadow page tables on either EPT or shadow page tables. */
+ if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
+ entry_failure_code))
+ return 1;
+
+ if (!enable_ept)
+ vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
+
+ kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
+ kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
+ return 0;
+}
+
+static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has_nmi_exiting(vmcs12) &&
+ nested_cpu_has_virtual_nmis(vmcs12))
+ return -EINVAL;
+
+ if (!nested_cpu_has_virtual_nmis(vmcs12) &&
+ nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
+ vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_vmx_check_apic_access_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_vmx_check_apicv_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_vmx_check_pml_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
+ vmx->nested.msrs.procbased_ctls_low,
+ vmx->nested.msrs.procbased_ctls_high) ||
+ (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
+ !vmx_control_verify(vmcs12->secondary_vm_exec_control,
+ vmx->nested.msrs.secondary_ctls_low,
+ vmx->nested.msrs.secondary_ctls_high)) ||
+ !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
+ vmx->nested.msrs.pinbased_ctls_low,
+ vmx->nested.msrs.pinbased_ctls_high) ||
+ !vmx_control_verify(vmcs12->vm_exit_controls,
+ vmx->nested.msrs.exit_ctls_low,
+ vmx->nested.msrs.exit_ctls_high) ||
+ !vmx_control_verify(vmcs12->vm_entry_controls,
+ vmx->nested.msrs.entry_ctls_low,
+ vmx->nested.msrs.entry_ctls_high))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_vmx_check_nmi_controls(vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_cpu_has_vmfunc(vmcs12)) {
+ if (vmcs12->vm_function_control &
+ ~vmx->nested.msrs.vmfunc_controls)
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (nested_cpu_has_eptp_switching(vmcs12)) {
+ if (!nested_cpu_has_ept(vmcs12) ||
+ !page_address_valid(vcpu, vmcs12->eptp_list_address))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+ }
+ }
+
+ if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
+ !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
+ !nested_cr3_valid(vcpu, vmcs12->host_cr3))
+ return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
+
+ /*
+ * From the Intel SDM, volume 3:
+ * Fields relevant to VM-entry event injection must be set properly.
+ * These fields are the VM-entry interruption-information field, the
+ * VM-entry exception error code, and the VM-entry instruction length.
+ */
+ if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
+ u32 intr_info = vmcs12->vm_entry_intr_info_field;
+ u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
+ u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
+ bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
+ bool should_have_error_code;
+ bool urg = nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_UNRESTRICTED_GUEST);
+ bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
+
+ /* VM-entry interruption-info field: interruption type */
+ if (intr_type == INTR_TYPE_RESERVED ||
+ (intr_type == INTR_TYPE_OTHER_EVENT &&
+ !nested_cpu_supports_monitor_trap_flag(vcpu)))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ /* VM-entry interruption-info field: vector */
+ if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
+ (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
+ (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ /* VM-entry interruption-info field: deliver error code */
+ should_have_error_code =
+ intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
+ x86_exception_has_error_code(vector);
+ if (has_error_code != should_have_error_code)
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ /* VM-entry exception error code */
+ if (has_error_code &&
+ vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ /* VM-entry interruption-info field: reserved bits */
+ if (intr_info & INTR_INFO_RESVD_BITS_MASK)
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
+ /* VM-entry instruction length */
+ switch (intr_type) {
+ case INTR_TYPE_SOFT_EXCEPTION:
+ case INTR_TYPE_SOFT_INTR:
+ case INTR_TYPE_PRIV_SW_EXCEPTION:
+ if ((vmcs12->vm_entry_instruction_len > 15) ||
+ (vmcs12->vm_entry_instruction_len == 0 &&
+ !nested_cpu_has_zero_length_injection(vcpu)))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+ }
+ }
+
+ return 0;
+}
+
+static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ int r;
+ struct page *page;
+ struct vmcs12 *shadow;
+
+ if (vmcs12->vmcs_link_pointer == -1ull)
+ return 0;
+
+ if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
+ return -EINVAL;
+
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
+ if (is_error_page(page))
+ return -EINVAL;
+
+ r = 0;
+ shadow = kmap(page);
+ if (shadow->hdr.revision_id != VMCS12_REVISION ||
+ shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
+ r = -EINVAL;
+ kunmap(page);
+ kvm_release_page_clean(page);
+ return r;
+}
+
+static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ u32 *exit_qual)
+{
+ bool ia32e;
+
+ *exit_qual = ENTRY_FAIL_DEFAULT;
+
+ if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
+ !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
+ return 1;
+
+ if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
+ *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
+ return 1;
+ }
+
+ /*
+ * If the load IA32_EFER VM-entry control is 1, the following checks
+ * are performed on the field for the IA32_EFER MSR:
+ * - Bits reserved in the IA32_EFER MSR must be 0.
+ * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
+ * the IA-32e mode guest VM-exit control. It must also be identical
+ * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
+ * CR0.PG) is 1.
+ */
+ if (to_vmx(vcpu)->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
+ ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
+ if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
+ ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
+ ((vmcs12->guest_cr0 & X86_CR0_PG) &&
+ ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
+ return 1;
+ }
+
+ /*
+ * If the load IA32_EFER VM-exit control is 1, bits reserved in the
+ * IA32_EFER MSR must be 0 in the field for that register. In addition,
+ * the values of the LMA and LME bits in the field must each be that of
+ * the host address-space size VM-exit control.
+ */
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
+ ia32e = (vmcs12->vm_exit_controls &
+ VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
+ if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
+ ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
+ ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
+ return 1;
+ }
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
+ (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
+ (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * If exit_qual is NULL, this is being called from state restore (either RSM
+ * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
+ */
+static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ bool from_vmentry = !!exit_qual;
+ u32 dummy_exit_qual;
+ bool evaluate_pending_interrupts;
+ int r = 0;
+
+ evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
+ (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
+ if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
+ evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
+
+ enter_guest_mode(vcpu);
+
+ if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+ vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ if (kvm_mpx_supported() &&
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+ vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+
+ vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
+ vmx_segment_cache_clear(vmx);
+
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+ vcpu->arch.tsc_offset += vmcs12->tsc_offset;
+
+ r = EXIT_REASON_INVALID_STATE;
+ if (prepare_vmcs02(vcpu, vmcs12, from_vmentry ? exit_qual : &dummy_exit_qual))
+ goto fail;
+
+ if (from_vmentry) {
+ nested_get_vmcs12_pages(vcpu);
+
+ r = EXIT_REASON_MSR_LOAD_FAIL;
+ *exit_qual = nested_vmx_load_msr(vcpu,
+ vmcs12->vm_entry_msr_load_addr,
+ vmcs12->vm_entry_msr_load_count);
+ if (*exit_qual)
+ goto fail;
+ } else {
+ /*
+ * The MMU is not initialized to point at the right entities yet and
+ * "get pages" would need to read data from the guest (i.e. we will
+ * need to perform gpa to hpa translation). Request a call
+ * to nested_get_vmcs12_pages before the next VM-entry. The MSRs
+ * have already been set at vmentry time and should not be reset.
+ */
+ kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
+ }
+
+ /*
+ * If L1 had a pending IRQ/NMI until it executed
+ * VMLAUNCH/VMRESUME which wasn't delivered because it was
+ * disallowed (e.g. interrupts disabled), L0 needs to
+ * evaluate if this pending event should cause an exit from L2
+ * to L1 or delivered directly to L2 (e.g. In case L1 don't
+ * intercept EXTERNAL_INTERRUPT).
+ *
+ * Usually this would be handled by the processor noticing an
+ * IRQ/NMI window request, or checking RVI during evaluation of
+ * pending virtual interrupts. However, this setting was done
+ * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
+ * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
+ */
+ if (unlikely(evaluate_pending_interrupts))
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
+ * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
+ * returned as far as L1 is concerned. It will only return (and set
+ * the success flag) when L2 exits (see nested_vmx_vmexit()).
+ */
+ return 0;
+
+fail:
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+ vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+ leave_guest_mode(vcpu);
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+ return r;
+}
+
+/*
+ * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
+ * for running an L2 nested guest.
+ */
+static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
+{
+ struct vmcs12 *vmcs12;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
+ u32 exit_qual;
+ int ret;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (!nested_vmx_check_vmcs12(vcpu))
+ goto out;
+
+ vmcs12 = get_vmcs12(vcpu);
+
+ /*
+ * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
+ * that there *is* a valid VMCS pointer, RFLAGS.CF is set
+ * rather than RFLAGS.ZF, and no error number is stored to the
+ * VM-instruction error field.
+ */
+ if (vmcs12->hdr.shadow_vmcs) {
+ nested_vmx_failInvalid(vcpu);
+ goto out;
+ }
+
+ if (enable_shadow_vmcs)
+ copy_shadow_to_vmcs12(vmx);
+
+ /*
+ * The nested entry process starts with enforcing various prerequisites
+ * on vmcs12 as required by the Intel SDM, and act appropriately when
+ * they fail: As the SDM explains, some conditions should cause the
+ * instruction to fail, while others will cause the instruction to seem
+ * to succeed, but return an EXIT_REASON_INVALID_STATE.
+ * To speed up the normal (success) code path, we should avoid checking
+ * for misconfigurations which will anyway be caught by the processor
+ * when using the merged vmcs02.
+ */
+ if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
+ goto out;
+ }
+
+ if (vmcs12->launch_state == launch) {
+ nested_vmx_failValid(vcpu,
+ launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
+ : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
+ goto out;
+ }
+
+ ret = check_vmentry_prereqs(vcpu, vmcs12);
+ if (ret) {
+ nested_vmx_failValid(vcpu, ret);
+ goto out;
+ }
+
+ /*
+ * After this point, the trap flag no longer triggers a singlestep trap
+ * on the vm entry instructions; don't call kvm_skip_emulated_instruction.
+ * This is not 100% correct; for performance reasons, we delegate most
+ * of the checks on host state to the processor. If those fail,
+ * the singlestep trap is missed.
+ */
+ skip_emulated_instruction(vcpu);
+
+ ret = check_vmentry_postreqs(vcpu, vmcs12, &exit_qual);
+ if (ret) {
+ nested_vmx_entry_failure(vcpu, vmcs12,
+ EXIT_REASON_INVALID_STATE, exit_qual);
+ return 1;
+ }
+
+ /*
+ * We're finally done with prerequisite checking, and can start with
+ * the nested entry.
+ */
+
+ vmx->nested.nested_run_pending = 1;
+ ret = enter_vmx_non_root_mode(vcpu, &exit_qual);
+ if (ret) {
+ nested_vmx_entry_failure(vcpu, vmcs12, ret, exit_qual);
+ vmx->nested.nested_run_pending = 0;
+ return 1;
+ }
+
+ /* Hide L1D cache contents from the nested guest. */
+ vmx->vcpu.arch.l1tf_flush_l1d = true;
+
+ /*
+ * Must happen outside of enter_vmx_non_root_mode() as it will
+ * also be used as part of restoring nVMX state for
+ * snapshot restore (migration).
+ *
+ * In this flow, it is assumed that vmcs12 cache was
+ * trasferred as part of captured nVMX state and should
+ * therefore not be read from guest memory (which may not
+ * exist on destination host yet).
+ */
+ nested_cache_shadow_vmcs12(vcpu, vmcs12);
+
+ /*
+ * If we're entering a halted L2 vcpu and the L2 vcpu won't be
+ * awakened by event injection or by an NMI-window VM-exit or
+ * by an interrupt-window VM-exit, halt the vcpu.
+ */
+ if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
+ !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
+ !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) &&
+ !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) &&
+ (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
+ vmx->nested.nested_run_pending = 0;
+ return kvm_vcpu_halt(vcpu);
+ }
+ return 1;
+
+out:
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+/*
+ * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
+ * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK).
+ * This function returns the new value we should put in vmcs12.guest_cr0.
+ * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
+ * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
+ * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
+ * didn't trap the bit, because if L1 did, so would L0).
+ * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
+ * been modified by L2, and L1 knows it. So just leave the old value of
+ * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
+ * isn't relevant, because if L0 traps this bit it can set it to anything.
+ * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
+ * changed these bits, and therefore they need to be updated, but L0
+ * didn't necessarily allow them to be changed in GUEST_CR0 - and rather
+ * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
+ */
+static inline unsigned long
+vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ return
+ /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
+ /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
+ /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
+ vcpu->arch.cr0_guest_owned_bits));
+}
+
+static inline unsigned long
+vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ return
+ /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
+ /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
+ /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
+ vcpu->arch.cr4_guest_owned_bits));
+}
+
+static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ u32 idt_vectoring;
+ unsigned int nr;
+
+ if (vcpu->arch.exception.injected) {
+ nr = vcpu->arch.exception.nr;
+ idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
+
+ if (kvm_exception_is_soft(nr)) {
+ vmcs12->vm_exit_instruction_len =
+ vcpu->arch.event_exit_inst_len;
+ idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
+ } else
+ idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
+
+ if (vcpu->arch.exception.has_error_code) {
+ idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
+ vmcs12->idt_vectoring_error_code =
+ vcpu->arch.exception.error_code;
+ }
+
+ vmcs12->idt_vectoring_info_field = idt_vectoring;
+ } else if (vcpu->arch.nmi_injected) {
+ vmcs12->idt_vectoring_info_field =
+ INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
+ } else if (vcpu->arch.interrupt.injected) {
+ nr = vcpu->arch.interrupt.nr;
+ idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
+
+ if (vcpu->arch.interrupt.soft) {
+ idt_vectoring |= INTR_TYPE_SOFT_INTR;
+ vmcs12->vm_entry_instruction_len =
+ vcpu->arch.event_exit_inst_len;
+ } else
+ idt_vectoring |= INTR_TYPE_EXT_INTR;
+
+ vmcs12->idt_vectoring_info_field = idt_vectoring;
+ }
+}
+
+static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long exit_qual;
+ bool block_nested_events =
+ vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
+
+ if (vcpu->arch.exception.pending &&
+ nested_vmx_check_exception(vcpu, &exit_qual)) {
+ if (block_nested_events)
+ return -EBUSY;
+ nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+ return 0;
+ }
+
+ if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
+ vmx->nested.preemption_timer_expired) {
+ if (block_nested_events)
+ return -EBUSY;
+ nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
+ return 0;
+ }
+
+ if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+ NMI_VECTOR | INTR_TYPE_NMI_INTR |
+ INTR_INFO_VALID_MASK, 0);
+ /*
+ * The NMI-triggered VM exit counts as injection:
+ * clear this one and block further NMIs.
+ */
+ vcpu->arch.nmi_pending = 0;
+ vmx_set_nmi_mask(vcpu, true);
+ return 0;
+ }
+
+ if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+ return 0;
+ }
+
+ vmx_complete_nested_posted_interrupt(vcpu);
+ return 0;
+}
+
+static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
+{
+ to_vmx(vcpu)->req_immediate_exit = true;
+}
+
+static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
+{
+ ktime_t remaining =
+ hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
+ u64 value;
+
+ if (ktime_to_ns(remaining) <= 0)
+ return 0;
+
+ value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
+ do_div(value, 1000000);
+ return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+}
+
+/*
+ * Update the guest state fields of vmcs12 to reflect changes that
+ * occurred while L2 was running. (The "IA-32e mode guest" bit of the
+ * VM-entry controls is also updated, since this is really a guest
+ * state bit.)
+ */
+static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+ vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
+ vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
+
+ vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
+ vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+
+ vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+ vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+ vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+ vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+ vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+ vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+ vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+ vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+ vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+ vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+ vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+ vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+ vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+ vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+ vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+ vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+ vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+ vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+ vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+ vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+ vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+ vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+ vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+ vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+ vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+ vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+ vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+ vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+ vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+ vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+ vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+ vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+ vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+ vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+ vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+ vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+
+ vmcs12->guest_interruptibility_info =
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+ vmcs12->guest_pending_dbg_exceptions =
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+ vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
+ else
+ vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
+
+ if (nested_cpu_has_preemption_timer(vmcs12)) {
+ if (vmcs12->vm_exit_controls &
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
+ vmcs12->vmx_preemption_timer_value =
+ vmx_get_preemption_timer_value(vcpu);
+ hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
+ }
+
+ /*
+ * In some cases (usually, nested EPT), L2 is allowed to change its
+ * own CR3 without exiting. If it has changed it, we must keep it.
+ * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
+ * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
+ *
+ * Additionally, restore L2's PDPTR to vmcs12.
+ */
+ if (enable_ept) {
+ vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
+ vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+ vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+ vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+ vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+ }
+
+ vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+
+ if (nested_cpu_has_vid(vmcs12))
+ vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
+
+ vmcs12->vm_entry_controls =
+ (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
+ (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
+
+ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
+ kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
+ vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ }
+
+ /* TODO: These cannot have changed unless we have MSR bitmaps and
+ * the relevant bit asks not to trap the change */
+ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
+ vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
+ vmcs12->guest_ia32_efer = vcpu->arch.efer;
+ vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+ vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+ vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+ if (kvm_mpx_supported())
+ vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+}
+
+/*
+ * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
+ * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
+ * and this function updates it to reflect the changes to the guest state while
+ * L2 was running (and perhaps made some exits which were handled directly by L0
+ * without going back to L1), and to reflect the exit reason.
+ * Note that we do not have to copy here all VMCS fields, just those that
+ * could have changed by the L2 guest or the exit - i.e., the guest-state and
+ * exit-information fields only. Other fields are modified by L1 with VMWRITE,
+ * which already writes to vmcs12 directly.
+ */
+static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ u32 exit_reason, u32 exit_intr_info,
+ unsigned long exit_qualification)
+{
+ /* update guest state fields: */
+ sync_vmcs12(vcpu, vmcs12);
+
+ /* update exit information fields: */
+
+ vmcs12->vm_exit_reason = exit_reason;
+ vmcs12->exit_qualification = exit_qualification;
+ vmcs12->vm_exit_intr_info = exit_intr_info;
+
+ vmcs12->idt_vectoring_info_field = 0;
+ vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+
+ if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
+ vmcs12->launch_state = 1;
+
+ /* vm_entry_intr_info_field is cleared on exit. Emulate this
+ * instead of reading the real value. */
+ vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
+
+ /*
+ * Transfer the event that L0 or L1 may wanted to inject into
+ * L2 to IDT_VECTORING_INFO_FIELD.
+ */
+ vmcs12_save_pending_event(vcpu, vmcs12);
+ }
+
+ /*
+ * Drop what we picked up for L2 via vmx_complete_interrupts. It is
+ * preserved above and would only end up incorrectly in L1.
+ */
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+}
+
+/*
+ * A part of what we need to when the nested L2 guest exits and we want to
+ * run its L1 parent, is to reset L1's guest state to the host state specified
+ * in vmcs12.
+ * This function is to be called not only on normal nested exit, but also on
+ * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
+ * Failures During or After Loading Guest State").
+ * This function should be called when the active VMCS is L1's (vmcs01).
+ */
+static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct kvm_segment seg;
+ u32 entry_failure_code;
+
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
+ vcpu->arch.efer = vmcs12->host_ia32_efer;
+ else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
+ vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+ else
+ vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+ vmx_set_efer(vcpu, vcpu->arch.efer);
+
+ kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
+ kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
+ vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ /*
+ * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
+ * actually changed, because vmx_set_cr0 refers to efer set above.
+ *
+ * CR0_GUEST_HOST_MASK is already set in the original vmcs01
+ * (KVM doesn't change it);
+ */
+ vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+ vmx_set_cr0(vcpu, vmcs12->host_cr0);
+
+ /* Same as above - no reason to call set_cr4_guest_host_mask(). */
+ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
+ vmx_set_cr4(vcpu, vmcs12->host_cr4);
+
+ nested_ept_uninit_mmu_context(vcpu);
+
+ /*
+ * Only PDPTE load can fail as the value of cr3 was checked on entry and
+ * couldn't have changed.
+ */
+ if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
+
+ if (!enable_ept)
+ vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
+
+ /*
+ * If vmcs01 don't use VPID, CPU flushes TLB on every
+ * VMEntry/VMExit. Thus, no need to flush TLB.
+ *
+ * If vmcs12 uses VPID, TLB entries populated by L2 are
+ * tagged with vmx->nested.vpid02 while L1 entries are tagged
+ * with vmx->vpid. Thus, no need to flush TLB.
+ *
+ * Therefore, flush TLB only in case vmcs01 uses VPID and
+ * vmcs12 don't use VPID as in this case L1 & L2 TLB entries
+ * are both tagged with vmx->vpid.
+ */
+ if (enable_vpid &&
+ !(nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02)) {
+ vmx_flush_tlb(vcpu, true);
+ }
+
+ vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
+ vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
+ vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
+ vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
+ vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
+ vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
+ vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
+
+ /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
+ if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
+ vmcs_write64(GUEST_BNDCFGS, 0);
+
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
+ vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
+ vcpu->arch.pat = vmcs12->host_ia32_pat;
+ }
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+ vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
+ vmcs12->host_ia32_perf_global_ctrl);
+
+ /* Set L1 segment info according to Intel SDM
+ 27.5.2 Loading Host Segment and Descriptor-Table Registers */
+ seg = (struct kvm_segment) {
+ .base = 0,
+ .limit = 0xFFFFFFFF,
+ .selector = vmcs12->host_cs_selector,
+ .type = 11,
+ .present = 1,
+ .s = 1,
+ .g = 1
+ };
+ if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
+ seg.l = 1;
+ else
+ seg.db = 1;
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
+ seg = (struct kvm_segment) {
+ .base = 0,
+ .limit = 0xFFFFFFFF,
+ .type = 3,
+ .present = 1,
+ .s = 1,
+ .db = 1,
+ .g = 1
+ };
+ seg.selector = vmcs12->host_ds_selector;
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
+ seg.selector = vmcs12->host_es_selector;
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
+ seg.selector = vmcs12->host_ss_selector;
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
+ seg.selector = vmcs12->host_fs_selector;
+ seg.base = vmcs12->host_fs_base;
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
+ seg.selector = vmcs12->host_gs_selector;
+ seg.base = vmcs12->host_gs_base;
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
+ seg = (struct kvm_segment) {
+ .base = vmcs12->host_tr_base,
+ .limit = 0x67,
+ .selector = vmcs12->host_tr_selector,
+ .type = 11,
+ .present = 1
+ };
+ vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
+
+ kvm_set_dr(vcpu, 7, 0x400);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmx_update_msr_bitmap(vcpu);
+
+ if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
+ vmcs12->vm_exit_msr_load_count))
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
+}
+
+static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
+{
+ struct shared_msr_entry *efer_msr;
+ unsigned int i;
+
+ if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
+ return vmcs_read64(GUEST_IA32_EFER);
+
+ if (cpu_has_load_ia32_efer)
+ return host_efer;
+
+ for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
+ if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
+ return vmx->msr_autoload.guest.val[i].value;
+ }
+
+ efer_msr = find_msr_entry(vmx, MSR_EFER);
+ if (efer_msr)
+ return efer_msr->data;
+
+ return host_efer;
+}
+
+static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmx_msr_entry g, h;
+ struct msr_data msr;
+ gpa_t gpa;
+ u32 i, j;
+
+ vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
+
+ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
+ /*
+ * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
+ * as vmcs01.GUEST_DR7 contains a userspace defined value
+ * and vcpu->arch.dr7 is not squirreled away before the
+ * nested VMENTER (not worth adding a variable in nested_vmx).
+ */
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ kvm_set_dr(vcpu, 7, DR7_FIXED_1);
+ else
+ WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
+ }
+
+ /*
+ * Note that calling vmx_set_{efer,cr0,cr4} is important as they
+ * handle a variety of side effects to KVM's software model.
+ */
+ vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
+
+ vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+ vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
+
+ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
+ vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
+
+ nested_ept_uninit_mmu_context(vcpu);
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+
+ /*
+ * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
+ * from vmcs01 (if necessary). The PDPTRs are not loaded on
+ * VMFail, like everything else we just need to ensure our
+ * software model is up-to-date.
+ */
+ ept_save_pdptrs(vcpu);
+
+ kvm_mmu_reset_context(vcpu);
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmx_update_msr_bitmap(vcpu);
+
+ /*
+ * This nasty bit of open coding is a compromise between blindly
+ * loading L1's MSRs using the exit load lists (incorrect emulation
+ * of VMFail), leaving the nested VM's MSRs in the software model
+ * (incorrect behavior) and snapshotting the modified MSRs (too
+ * expensive since the lists are unbound by hardware). For each
+ * MSR that was (prematurely) loaded from the nested VMEntry load
+ * list, reload it from the exit load list if it exists and differs
+ * from the guest value. The intent is to stuff host state as
+ * silently as possible, not to fully process the exit load list.
+ */
+ msr.host_initiated = false;
+ for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
+ gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
+ if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
+ pr_debug_ratelimited(
+ "%s read MSR index failed (%u, 0x%08llx)\n",
+ __func__, i, gpa);
+ goto vmabort;
+ }
+
+ for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
+ gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
+ if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
+ pr_debug_ratelimited(
+ "%s read MSR failed (%u, 0x%08llx)\n",
+ __func__, j, gpa);
+ goto vmabort;
+ }
+ if (h.index != g.index)
+ continue;
+ if (h.value == g.value)
+ break;
+
+ if (nested_vmx_load_msr_check(vcpu, &h)) {
+ pr_debug_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, j, h.index, h.reserved);
+ goto vmabort;
+ }
+
+ msr.index = h.index;
+ msr.data = h.value;
+ if (kvm_set_msr(vcpu, &msr)) {
+ pr_debug_ratelimited(
+ "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
+ __func__, j, h.index, h.value);
+ goto vmabort;
+ }
+ }
+ }
+
+ return;
+
+vmabort:
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
+}
+
+/*
+ * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
+ * and modify vmcs12 to make it see what it would expect to see there if
+ * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
+ */
+static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
+ u32 exit_intr_info,
+ unsigned long exit_qualification)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ /* trying to cancel vmlaunch/vmresume is a bug */
+ WARN_ON_ONCE(vmx->nested.nested_run_pending);
+
+ /*
+ * The only expected VM-instruction error is "VM entry with
+ * invalid control field(s)." Anything else indicates a
+ * problem with L0.
+ */
+ WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) !=
+ VMXERR_ENTRY_INVALID_CONTROL_FIELD));
+
+ leave_guest_mode(vcpu);
+
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+ vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+
+ if (likely(!vmx->fail)) {
+ if (exit_reason == -1)
+ sync_vmcs12(vcpu, vmcs12);
+ else
+ prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
+ exit_qualification);
+
+ /*
+ * Must happen outside of sync_vmcs12() as it will
+ * also be used to capture vmcs12 cache as part of
+ * capturing nVMX state for snapshot (migration).
+ *
+ * Otherwise, this flush will dirty guest memory at a
+ * point it is already assumed by user-space to be
+ * immutable.
+ */
+ nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
+
+ if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
+ vmcs12->vm_exit_msr_store_count))
+ nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+ }
+
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+ vm_entry_controls_reset_shadow(vmx);
+ vm_exit_controls_reset_shadow(vmx);
+ vmx_segment_cache_clear(vmx);
+
+ /* Update any VMCS fields that might have changed while L2 ran */
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+ vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+
+ if (kvm_has_tsc_control)
+ decache_tsc_multiplier(vmx);
+
+ if (vmx->nested.change_vmcs01_virtual_apic_mode) {
+ vmx->nested.change_vmcs01_virtual_apic_mode = false;
+ vmx_set_virtual_apic_mode(vcpu);
+ } else if (!nested_cpu_has_ept(vmcs12) &&
+ nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+ vmx_flush_tlb(vcpu, true);
+ }
+
+ /* This is needed for same reason as it was needed in prepare_vmcs02 */
+ vmx->host_rsp = 0;
+
+ /* Unpin physical memory we referred to in vmcs02 */
+ if (vmx->nested.apic_access_page) {
+ kvm_release_page_dirty(vmx->nested.apic_access_page);
+ vmx->nested.apic_access_page = NULL;
+ }
+ if (vmx->nested.virtual_apic_page) {
+ kvm_release_page_dirty(vmx->nested.virtual_apic_page);
+ vmx->nested.virtual_apic_page = NULL;
+ }
+ if (vmx->nested.pi_desc_page) {
+ kunmap(vmx->nested.pi_desc_page);
+ kvm_release_page_dirty(vmx->nested.pi_desc_page);
+ vmx->nested.pi_desc_page = NULL;
+ vmx->nested.pi_desc = NULL;
+ }
+
+ /*
+ * We are now running in L2, mmu_notifier will force to reload the
+ * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1.
+ */
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+
+ if (enable_shadow_vmcs && exit_reason != -1)
+ vmx->nested.sync_shadow_vmcs = true;
+
+ /* in case we halted in L2 */
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ if (likely(!vmx->fail)) {
+ if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+ nested_exit_intr_ack_set(vcpu)) {
+ int irq = kvm_cpu_get_interrupt(vcpu);
+ WARN_ON(irq < 0);
+ vmcs12->vm_exit_intr_info = irq |
+ INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
+ }
+
+ if (exit_reason != -1)
+ trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
+ vmcs12->exit_qualification,
+ vmcs12->idt_vectoring_info_field,
+ vmcs12->vm_exit_intr_info,
+ vmcs12->vm_exit_intr_error_code,
+ KVM_ISA_VMX);
+
+ load_vmcs12_host_state(vcpu, vmcs12);
+
+ return;
+ }
+
+ /*
+ * After an early L2 VM-entry failure, we're now back
+ * in L1 which thinks it just finished a VMLAUNCH or
+ * VMRESUME instruction, so we need to set the failure
+ * flag and the VM-instruction error field of the VMCS
+ * accordingly.
+ */
+ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+
+ /*
+ * Restore L1's host state to KVM's software model. We're here
+ * because a consistency check was caught by hardware, which
+ * means some amount of guest state has been propagated to KVM's
+ * model and needs to be unwound to the host's state.
+ */
+ nested_vmx_restore_host_state(vcpu);
+
+ /*
+ * The emulated instruction was already skipped in
+ * nested_vmx_run, but the updated RIP was never
+ * written back to the vmcs01.
+ */
+ skip_emulated_instruction(vcpu);
+ vmx->fail = 0;
+}
+
+/*
+ * Forcibly leave nested mode in order to be able to reset the VCPU later on.
+ */
+static void vmx_leave_nested(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu)) {
+ to_vmx(vcpu)->nested.nested_run_pending = 0;
+ nested_vmx_vmexit(vcpu, -1, 0, 0);
+ }
+ free_nested(to_vmx(vcpu));
+}
+
+/*
+ * L1's failure to enter L2 is a subset of a normal exit, as explained in
+ * 23.7 "VM-entry failures during or after loading guest state" (this also
+ * lists the acceptable exit-reason and exit-qualification parameters).
+ * It should only be called before L2 actually succeeded to run, and when
+ * vmcs01 is current (it doesn't leave_guest_mode() or switch vmcss).
+ */
+static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12,
+ u32 reason, unsigned long qualification)
+{
+ load_vmcs12_host_state(vcpu, vmcs12);
+ vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
+ vmcs12->exit_qualification = qualification;
+ nested_vmx_succeed(vcpu);
+ if (enable_shadow_vmcs)
+ to_vmx(vcpu)->nested.sync_shadow_vmcs = true;
+}
+
+static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned short port;
+ bool intercept;
+ int size;
+
+ if (info->intercept == x86_intercept_in ||
+ info->intercept == x86_intercept_ins) {
+ port = info->src_val;
+ size = info->dst_bytes;
+ } else {
+ port = info->dst_val;
+ size = info->src_bytes;
+ }
+
+ /*
+ * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
+ * VM-exits depend on the 'unconditional IO exiting' VM-execution
+ * control.
+ *
+ * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
+ */
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ intercept = nested_cpu_has(vmcs12,
+ CPU_BASED_UNCOND_IO_EXITING);
+ else
+ intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
+
+ /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
+ return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+}
+
+static int vmx_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+
+ switch (info->intercept) {
+ /*
+ * RDPID causes #UD if disabled through secondary execution controls.
+ * Because it is marked as EmulateOnUD, we need to intercept it here.
+ */
+ case x86_intercept_rdtscp:
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
+ ctxt->exception.vector = UD_VECTOR;
+ ctxt->exception.error_code_valid = false;
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ break;
+
+ case x86_intercept_in:
+ case x86_intercept_ins:
+ case x86_intercept_out:
+ case x86_intercept_outs:
+ return vmx_check_intercept_io(vcpu, info);
+
+ case x86_intercept_lgdt:
+ case x86_intercept_lidt:
+ case x86_intercept_lldt:
+ case x86_intercept_ltr:
+ case x86_intercept_sgdt:
+ case x86_intercept_sidt:
+ case x86_intercept_sldt:
+ case x86_intercept_str:
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
+ return X86EMUL_CONTINUE;
+
+ /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
+ break;
+
+ /* TODO: check more intercepts... */
+ default:
+ break;
+ }
+
+ return X86EMUL_UNHANDLEABLE;
+}
+
+#ifdef CONFIG_X86_64
+/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
+static inline int u64_shl_div_u64(u64 a, unsigned int shift,
+ u64 divisor, u64 *result)
+{
+ u64 low = a << shift, high = a >> (64 - shift);
+
+ /* To avoid the overflow on divq */
+ if (high >= divisor)
+ return 1;
+
+ /* Low hold the result, high hold rem which is discarded */
+ asm("divq %2\n\t" : "=a" (low), "=d" (high) :
+ "rm" (divisor), "0" (low), "1" (high));
+ *result = low;
+
+ return 0;
+}
+
+static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
+{
+ struct vcpu_vmx *vmx;
+ u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
+
+ if (kvm_mwait_in_guest(vcpu->kvm))
+ return -EOPNOTSUPP;
+
+ vmx = to_vmx(vcpu);
+ tscl = rdtsc();
+ guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+ delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
+ lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns);
+
+ if (delta_tsc > lapic_timer_advance_cycles)
+ delta_tsc -= lapic_timer_advance_cycles;
+ else
+ delta_tsc = 0;
+
+ /* Convert to host delta tsc if tsc scaling is enabled */
+ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
+ u64_shl_div_u64(delta_tsc,
+ kvm_tsc_scaling_ratio_frac_bits,
+ vcpu->arch.tsc_scaling_ratio,
+ &delta_tsc))
+ return -ERANGE;
+
+ /*
+ * If the delta tsc can't fit in the 32 bit after the multi shift,
+ * we can't use the preemption timer.
+ * It's possible that it fits on later vmentries, but checking
+ * on every vmentry is costly so we just use an hrtimer.
+ */
+ if (delta_tsc >> (cpu_preemption_timer_multi + 32))
+ return -ERANGE;
+
+ vmx->hv_deadline_tsc = tscl + delta_tsc;
+ return delta_tsc == 0;
+}
+
+static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+{
+ to_vmx(vcpu)->hv_deadline_tsc = -1;
+}
+#endif
+
+static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+ if (!kvm_pause_in_guest(vcpu->kvm))
+ shrink_ple_window(vcpu);
+}
+
+static void vmx_slot_enable_log_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
+ kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
+}
+
+static void vmx_slot_disable_log_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ kvm_mmu_slot_set_dirty(kvm, slot);
+}
+
+static void vmx_flush_log_dirty(struct kvm *kvm)
+{
+ kvm_flush_pml_buffers(kvm);
+}
+
+static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ struct vmcs12 *vmcs12;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct page *page = NULL;
+ u64 *pml_address;
+
+ if (is_guest_mode(vcpu)) {
+ WARN_ON_ONCE(vmx->nested.pml_full);
+
+ /*
+ * Check if PML is enabled for the nested guest.
+ * Whether eptp bit 6 is set is already checked
+ * as part of A/D emulation.
+ */
+ vmcs12 = get_vmcs12(vcpu);
+ if (!nested_cpu_has_pml(vmcs12))
+ return 0;
+
+ if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
+ vmx->nested.pml_full = true;
+ return 1;
+ }
+
+ gpa &= ~0xFFFull;
+
+ page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address);
+ if (is_error_page(page))
+ return 0;
+
+ pml_address = kmap(page);
+ pml_address[vmcs12->guest_pml_index--] = gpa;
+ kunmap(page);
+ kvm_release_page_clean(page);
+ }
+
+ return 0;
+}
+
+static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ gfn_t offset, unsigned long mask)
+{
+ kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
+}
+
+static void __pi_post_block(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+
+ do {
+ old.control = new.control = pi_desc->control;
+ WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
+ "Wakeup handler not enabled while the VCPU is blocked\n");
+
+ dest = cpu_physical_id(vcpu->cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* set 'NV' to 'notification vector' */
+ new.nv = POSTED_INTR_VECTOR;
+ } while (cmpxchg64(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+ if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ list_del(&vcpu->blocked_vcpu_list);
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ vcpu->pre_pcpu = -1;
+ }
+}
+
+/*
+ * This routine does the following things for vCPU which is going
+ * to be blocked if VT-d PI is enabled.
+ * - Store the vCPU to the wakeup list, so when interrupts happen
+ * we can find the right vCPU to wake up.
+ * - Change the Posted-interrupt descriptor as below:
+ * 'NDST' <-- vcpu->pre_pcpu
+ * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
+ * - If 'ON' is set during this process, which means at least one
+ * interrupt is posted for this vCPU, we cannot block it, in
+ * this case, return 1, otherwise, return 0.
+ *
+ */
+static int pi_pre_block(struct kvm_vcpu *vcpu)
+{
+ unsigned int dest;
+ struct pi_desc old, new;
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP) ||
+ !kvm_vcpu_apicv_active(vcpu))
+ return 0;
+
+ WARN_ON(irqs_disabled());
+ local_irq_disable();
+ if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
+ vcpu->pre_pcpu = vcpu->cpu;
+ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ list_add_tail(&vcpu->blocked_vcpu_list,
+ &per_cpu(blocked_vcpu_on_cpu,
+ vcpu->pre_pcpu));
+ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ }
+
+ do {
+ old.control = new.control = pi_desc->control;
+
+ WARN((pi_desc->sn == 1),
+ "Warning: SN field of posted-interrupts "
+ "is set before blocking\n");
+
+ /*
+ * Since vCPU can be preempted during this process,
+ * vcpu->cpu could be different with pre_pcpu, we
+ * need to set pre_pcpu as the destination of wakeup
+ * notification event, then we can find the right vCPU
+ * to wakeup in wakeup handler if interrupts happen
+ * when the vCPU is in blocked state.
+ */
+ dest = cpu_physical_id(vcpu->pre_pcpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* set 'NV' to 'wakeup vector' */
+ new.nv = POSTED_INTR_WAKEUP_VECTOR;
+ } while (cmpxchg64(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+ /* We should not block the vCPU if an interrupt is posted for it. */
+ if (pi_test_on(pi_desc) == 1)
+ __pi_post_block(vcpu);
+
+ local_irq_enable();
+ return (vcpu->pre_pcpu == -1);
+}
+
+static int vmx_pre_block(struct kvm_vcpu *vcpu)
+{
+ if (pi_pre_block(vcpu))
+ return 1;
+
+ if (kvm_lapic_hv_timer_in_use(vcpu))
+ kvm_lapic_switch_to_sw_timer(vcpu);
+
+ return 0;
+}
+
+static void pi_post_block(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->pre_pcpu == -1)
+ return;
+
+ WARN_ON(irqs_disabled());
+ local_irq_disable();
+ __pi_post_block(vcpu);
+ local_irq_enable();
+}
+
+static void vmx_post_block(struct kvm_vcpu *vcpu)
+{
+ if (kvm_x86_ops->set_hv_timer)
+ kvm_lapic_switch_to_hv_timer(vcpu);
+
+ pi_post_block(vcpu);
+}
+
+/*
+ * vmx_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct kvm_irq_routing_table *irq_rt;
+ struct kvm_lapic_irq irq;
+ struct kvm_vcpu *vcpu;
+ struct vcpu_data vcpu_info;
+ int idx, ret = 0;
+
+ if (!kvm_arch_has_assigned_device(kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP) ||
+ !kvm_vcpu_apicv_active(kvm->vcpus[0]))
+ return 0;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ if (guest_irq >= irq_rt->nr_rt_entries ||
+ hlist_empty(&irq_rt->map[guest_irq])) {
+ pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
+ guest_irq, irq_rt->nr_rt_entries);
+ goto out;
+ }
+
+ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+ if (e->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+ /*
+ * VT-d PI cannot support posting multicast/broadcast
+ * interrupts to a vCPU, we still use interrupt remapping
+ * for these kind of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with single CPU as the destination, e.g. user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ *
+ * We will support full lowest-priority interrupt later.
+ */
+
+ kvm_set_msi_irq(kvm, e, &irq);
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+ /*
+ * Make sure the IRTE is in remapped mode if
+ * we don't handle it in posted mode.
+ */
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+ if (ret < 0) {
+ printk(KERN_INFO
+ "failed to back to remapped mode, irq: %u\n",
+ host_irq);
+ goto out;
+ }
+
+ continue;
+ }
+
+ vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
+ vcpu_info.vector = irq.vector;
+
+ trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
+ vcpu_info.vector, vcpu_info.pi_desc_addr, set);
+
+ if (set)
+ ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
+ else
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+
+ if (ret < 0) {
+ printk(KERN_INFO "%s: failed to update PI IRTE\n",
+ __func__);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
+
+static void vmx_setup_mce(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.mcg_cap & MCG_LMCE_P)
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+ FEATURE_CONTROL_LMCE;
+ else
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+ ~FEATURE_CONTROL_LMCE;
+}
+
+static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
+{
+ /* we need a nested vmexit to enter SMM, postpone if run is pending */
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return 0;
+ return 1;
+}
+
+static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
+ if (vmx->nested.smm.guest_mode)
+ nested_vmx_vmexit(vcpu, -1, 0, 0);
+
+ vmx->nested.smm.vmxon = vmx->nested.vmxon;
+ vmx->nested.vmxon = false;
+ vmx_clear_hlt(vcpu);
+ return 0;
+}
+
+static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int ret;
+
+ if (vmx->nested.smm.vmxon) {
+ vmx->nested.vmxon = true;
+ vmx->nested.smm.vmxon = false;
+ }
+
+ if (vmx->nested.smm.guest_mode) {
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
+ ret = enter_vmx_non_root_mode(vcpu, NULL);
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ if (ret)
+ return ret;
+
+ vmx->nested.smm.guest_mode = false;
+ }
+ return 0;
+}
+
+static int enable_smi_window(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ u32 user_data_size)
+{
+ struct vcpu_vmx *vmx;
+ struct vmcs12 *vmcs12;
+ struct kvm_nested_state kvm_state = {
+ .flags = 0,
+ .format = 0,
+ .size = sizeof(kvm_state),
+ .vmx.vmxon_pa = -1ull,
+ .vmx.vmcs_pa = -1ull,
+ };
+
+ if (!vcpu)
+ return kvm_state.size + 2 * VMCS12_SIZE;
+
+ vmx = to_vmx(vcpu);
+ vmcs12 = get_vmcs12(vcpu);
+ if (nested_vmx_allowed(vcpu) &&
+ (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
+ kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
+ kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
+
+ if (vmx->nested.current_vmptr != -1ull) {
+ kvm_state.size += VMCS12_SIZE;
+
+ if (is_guest_mode(vcpu) &&
+ nested_cpu_has_shadow_vmcs(vmcs12) &&
+ vmcs12->vmcs_link_pointer != -1ull)
+ kvm_state.size += VMCS12_SIZE;
+ }
+
+ if (vmx->nested.smm.vmxon)
+ kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
+
+ if (vmx->nested.smm.guest_mode)
+ kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
+
+ if (is_guest_mode(vcpu)) {
+ kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
+
+ if (vmx->nested.nested_run_pending)
+ kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
+ }
+ }
+
+ if (user_data_size < kvm_state.size)
+ goto out;
+
+ if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
+ return -EFAULT;
+
+ if (vmx->nested.current_vmptr == -1ull)
+ goto out;
+
+ /*
+ * When running L2, the authoritative vmcs12 state is in the
+ * vmcs02. When running L1, the authoritative vmcs12 state is
+ * in the shadow vmcs linked to vmcs01, unless
+ * sync_shadow_vmcs is set, in which case, the authoritative
+ * vmcs12 state is in the vmcs12 already.
+ */
+ if (is_guest_mode(vcpu))
+ sync_vmcs12(vcpu, vmcs12);
+ else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs)
+ copy_shadow_to_vmcs12(vmx);
+
+ /*
+ * Copy over the full allocated size of vmcs12 rather than just the size
+ * of the struct.
+ */
+ if (copy_to_user(user_kvm_nested_state->data, vmcs12, VMCS12_SIZE))
+ return -EFAULT;
+
+ if (nested_cpu_has_shadow_vmcs(vmcs12) &&
+ vmcs12->vmcs_link_pointer != -1ull) {
+ if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE,
+ get_shadow_vmcs12(vcpu), VMCS12_SIZE))
+ return -EFAULT;
+ }
+
+out:
+ return kvm_state.size;
+}
+
+static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
+ struct kvm_nested_state __user *user_kvm_nested_state,
+ struct kvm_nested_state *kvm_state)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12;
+ u32 exit_qual;
+ int ret;
+
+ if (kvm_state->format != 0)
+ return -EINVAL;
+
+ if (!nested_vmx_allowed(vcpu))
+ return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
+
+ if (kvm_state->vmx.vmxon_pa == -1ull) {
+ if (kvm_state->vmx.smm.flags)
+ return -EINVAL;
+
+ if (kvm_state->vmx.vmcs_pa != -1ull)
+ return -EINVAL;
+
+ vmx_leave_nested(vcpu);
+ return 0;
+ }
+
+ if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa))
+ return -EINVAL;
+
+ if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
+ (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+ return -EINVAL;
+
+ if (kvm_state->vmx.smm.flags &
+ ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
+ return -EINVAL;
+
+ /*
+ * SMM temporarily disables VMX, so we cannot be in guest mode,
+ * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
+ * must be zero.
+ */
+ if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags)
+ return -EINVAL;
+
+ if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
+ !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
+ return -EINVAL;
+
+ vmx_leave_nested(vcpu);
+ if (kvm_state->vmx.vmxon_pa == -1ull)
+ return 0;
+
+ vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa;
+ ret = enter_vmx_operation(vcpu);
+ if (ret)
+ return ret;
+
+ /* Empty 'VMXON' state is permitted */
+ if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
+ return 0;
+
+ if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
+ !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
+ return -EINVAL;
+
+ set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
+
+ if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
+ vmx->nested.smm.vmxon = true;
+ vmx->nested.vmxon = false;
+
+ if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
+ vmx->nested.smm.guest_mode = true;
+ }
+
+ vmcs12 = get_vmcs12(vcpu);
+ if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12)))
+ return -EFAULT;
+
+ if (vmcs12->hdr.revision_id != VMCS12_REVISION)
+ return -EINVAL;
+
+ if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+ return 0;
+
+ vmx->nested.nested_run_pending =
+ !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+
+ if (nested_cpu_has_shadow_vmcs(vmcs12) &&
+ vmcs12->vmcs_link_pointer != -1ull) {
+ struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
+ if (kvm_state->size < sizeof(*kvm_state) + 2 * sizeof(*vmcs12))
+ return -EINVAL;
+
+ if (copy_from_user(shadow_vmcs12,
+ user_kvm_nested_state->data + VMCS12_SIZE,
+ sizeof(*vmcs12)))
+ return -EFAULT;
+
+ if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
+ !shadow_vmcs12->hdr.shadow_vmcs)
+ return -EINVAL;
+ }
+
+ if (check_vmentry_prereqs(vcpu, vmcs12) ||
+ check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
+ return -EINVAL;
+
+ vmx->nested.dirty_vmcs12 = true;
+ ret = enter_vmx_non_root_mode(vcpu, NULL);
+ if (ret)
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
+ .cpu_has_kvm_support = cpu_has_kvm_support,
+ .disabled_by_bios = vmx_disabled_by_bios,
+ .hardware_setup = hardware_setup,
+ .hardware_unsetup = hardware_unsetup,
+ .check_processor_compatibility = vmx_check_processor_compat,
+ .hardware_enable = hardware_enable,
+ .hardware_disable = hardware_disable,
+ .cpu_has_accelerated_tpr = report_flexpriority,
+ .has_emulated_msr = vmx_has_emulated_msr,
+
+ .vm_init = vmx_vm_init,
+ .vm_alloc = vmx_vm_alloc,
+ .vm_free = vmx_vm_free,
+
+ .vcpu_create = vmx_create_vcpu,
+ .vcpu_free = vmx_free_vcpu,
+ .vcpu_reset = vmx_vcpu_reset,
+
+ .prepare_guest_switch = vmx_prepare_switch_to_guest,
+ .vcpu_load = vmx_vcpu_load,
+ .vcpu_put = vmx_vcpu_put,
+
+ .update_bp_intercept = update_exception_bitmap,
+ .get_msr_feature = vmx_get_msr_feature,
+ .get_msr = vmx_get_msr,
+ .set_msr = vmx_set_msr,
+ .get_segment_base = vmx_get_segment_base,
+ .get_segment = vmx_get_segment,
+ .set_segment = vmx_set_segment,
+ .get_cpl = vmx_get_cpl,
+ .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
+ .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
+ .decache_cr3 = vmx_decache_cr3,
+ .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
+ .set_cr0 = vmx_set_cr0,
+ .set_cr3 = vmx_set_cr3,
+ .set_cr4 = vmx_set_cr4,
+ .set_efer = vmx_set_efer,
+ .get_idt = vmx_get_idt,
+ .set_idt = vmx_set_idt,
+ .get_gdt = vmx_get_gdt,
+ .set_gdt = vmx_set_gdt,
+ .get_dr6 = vmx_get_dr6,
+ .set_dr6 = vmx_set_dr6,
+ .set_dr7 = vmx_set_dr7,
+ .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
+ .cache_reg = vmx_cache_reg,
+ .get_rflags = vmx_get_rflags,
+ .set_rflags = vmx_set_rflags,
+
+ .tlb_flush = vmx_flush_tlb,
+ .tlb_flush_gva = vmx_flush_tlb_gva,
+
+ .run = vmx_vcpu_run,
+ .handle_exit = vmx_handle_exit,
+ .skip_emulated_instruction = skip_emulated_instruction,
+ .set_interrupt_shadow = vmx_set_interrupt_shadow,
+ .get_interrupt_shadow = vmx_get_interrupt_shadow,
+ .patch_hypercall = vmx_patch_hypercall,
+ .set_irq = vmx_inject_irq,
+ .set_nmi = vmx_inject_nmi,
+ .queue_exception = vmx_queue_exception,
+ .cancel_injection = vmx_cancel_injection,
+ .interrupt_allowed = vmx_interrupt_allowed,
+ .nmi_allowed = vmx_nmi_allowed,
+ .get_nmi_mask = vmx_get_nmi_mask,
+ .set_nmi_mask = vmx_set_nmi_mask,
+ .enable_nmi_window = enable_nmi_window,
+ .enable_irq_window = enable_irq_window,
+ .update_cr8_intercept = update_cr8_intercept,
+ .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
+ .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
+ .get_enable_apicv = vmx_get_enable_apicv,
+ .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
+ .load_eoi_exitmap = vmx_load_eoi_exitmap,
+ .apicv_post_state_restore = vmx_apicv_post_state_restore,
+ .hwapic_irr_update = vmx_hwapic_irr_update,
+ .hwapic_isr_update = vmx_hwapic_isr_update,
+ .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
+ .sync_pir_to_irr = vmx_sync_pir_to_irr,
+ .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
+ .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt,
+
+ .set_tss_addr = vmx_set_tss_addr,
+ .set_identity_map_addr = vmx_set_identity_map_addr,
+ .get_tdp_level = get_ept_level,
+ .get_mt_mask = vmx_get_mt_mask,
+
+ .get_exit_info = vmx_get_exit_info,
+
+ .get_lpage_level = vmx_get_lpage_level,
+
+ .cpuid_update = vmx_cpuid_update,
+
+ .rdtscp_supported = vmx_rdtscp_supported,
+ .invpcid_supported = vmx_invpcid_supported,
+
+ .set_supported_cpuid = vmx_set_supported_cpuid,
+
+ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
+
+ .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
+ .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
+
+ .set_tdp_cr3 = vmx_set_cr3,
+
+ .check_intercept = vmx_check_intercept,
+ .handle_external_intr = vmx_handle_external_intr,
+ .mpx_supported = vmx_mpx_supported,
+ .xsaves_supported = vmx_xsaves_supported,
+ .umip_emulated = vmx_umip_emulated,
+
+ .check_nested_events = vmx_check_nested_events,
+ .request_immediate_exit = vmx_request_immediate_exit,
+
+ .sched_in = vmx_sched_in,
+
+ .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
+ .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
+ .flush_log_dirty = vmx_flush_log_dirty,
+ .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
+ .write_log_dirty = vmx_write_pml_buffer,
+
+ .pre_block = vmx_pre_block,
+ .post_block = vmx_post_block,
+
+ .pmu_ops = &intel_pmu_ops,
+
+ .update_pi_irte = vmx_update_pi_irte,
+
+#ifdef CONFIG_X86_64
+ .set_hv_timer = vmx_set_hv_timer,
+ .cancel_hv_timer = vmx_cancel_hv_timer,
+#endif
+
+ .setup_mce = vmx_setup_mce,
+
+ .get_nested_state = vmx_get_nested_state,
+ .set_nested_state = vmx_set_nested_state,
+ .get_vmcs12_pages = nested_get_vmcs12_pages,
+
+ .smi_allowed = vmx_smi_allowed,
+ .pre_enter_smm = vmx_pre_enter_smm,
+ .pre_leave_smm = vmx_pre_leave_smm,
+ .enable_smi_window = enable_smi_window,
+};
+
+static void vmx_cleanup_l1d_flush(void)
+{
+ if (vmx_l1d_flush_pages) {
+ free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
+ vmx_l1d_flush_pages = NULL;
+ }
+ /* Restore state so sysfs ignores VMX */
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+}
+
+static void vmx_exit(void)
+{
+#ifdef CONFIG_KEXEC_CORE
+ RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
+ synchronize_rcu();
+#endif
+
+ kvm_exit();
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (static_branch_unlikely(&enable_evmcs)) {
+ int cpu;
+ struct hv_vp_assist_page *vp_ap;
+ /*
+ * Reset everything to support using non-enlightened VMCS
+ * access later (e.g. when we reload the module with
+ * enlightened_vmcs=0)
+ */
+ for_each_online_cpu(cpu) {
+ vp_ap = hv_get_vp_assist_page(cpu);
+
+ if (!vp_ap)
+ continue;
+
+ vp_ap->current_nested_vmcs = 0;
+ vp_ap->enlighten_vmentry = 0;
+ }
+
+ static_branch_disable(&enable_evmcs);
+ }
+#endif
+ vmx_cleanup_l1d_flush();
+}
+module_exit(vmx_exit);
+
+static int __init vmx_init(void)
+{
+ int r, cpu;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ /*
+ * Enlightened VMCS usage should be recommended and the host needs
+ * to support eVMCS v1 or above. We can also disable eVMCS support
+ * with module parameter.
+ */
+ if (enlightened_vmcs &&
+ ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
+ (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
+ KVM_EVMCS_VERSION) {
+ int cpu;
+
+ /* Check that we have assist pages on all online CPUs */
+ for_each_online_cpu(cpu) {
+ if (!hv_get_vp_assist_page(cpu)) {
+ enlightened_vmcs = false;
+ break;
+ }
+ }
+
+ if (enlightened_vmcs) {
+ pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
+ static_branch_enable(&enable_evmcs);
+ }
+ } else {
+ enlightened_vmcs = false;
+ }
+#endif
+
+ r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+ __alignof__(struct vcpu_vmx), THIS_MODULE);
+ if (r)
+ return r;
+
+ /*
+ * Must be called after kvm_init() so enable_ept is properly set
+ * up. Hand the parameter mitigation value in which was stored in
+ * the pre module init parser. If no parameter was given, it will
+ * contain 'auto' which will be turned into the default 'cond'
+ * mitigation mode.
+ */
+ if (boot_cpu_has(X86_BUG_L1TF)) {
+ r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+ if (r) {
+ vmx_exit();
+ return r;
+ }
+ }
+
+ vmx_setup_fb_clear_ctrl();
+
+ for_each_possible_cpu(cpu) {
+ INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+
+ INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+ spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ }
+
+#ifdef CONFIG_KEXEC_CORE
+ rcu_assign_pointer(crash_vmclear_loaded_vmcss,
+ crash_vmclear_local_loaded_vmcss);
+#endif
+ vmx_check_vmcs12_offsets();
+
+ return 0;
+}
+module_init(vmx_init);
diff --git a/arch/x86/kvm/vmx_evmcs.h b/arch/x86/kvm/vmx_evmcs.h
new file mode 100644
index 000000000..210a88409
--- /dev/null
+++ b/arch/x86/kvm/vmx_evmcs.h
@@ -0,0 +1,324 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_EVMCS_H
+#define __KVM_X86_VMX_EVMCS_H
+
+#include <asm/hyperv-tlfs.h>
+
+#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
+#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
+#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
+ {EVMCS1_OFFSET(name), clean_field}
+
+struct evmcs_field {
+ u16 offset;
+ u16 clean_field;
+};
+
+static const struct evmcs_field vmcs_field_to_evmcs_1[] = {
+ /* 64 bit rw */
+ EVMCS1_FIELD(GUEST_RIP, guest_rip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(GUEST_RSP, guest_rsp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(GUEST_RFLAGS, guest_rflags,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(HOST_IA32_PAT, host_ia32_pat,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR0, host_cr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR3, host_cr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CR4, host_cr4,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_RIP, host_rip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(IO_BITMAP_A, io_bitmap_a,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP),
+ EVMCS1_FIELD(IO_BITMAP_B, io_bitmap_b,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP),
+ EVMCS1_FIELD(MSR_BITMAP, msr_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP),
+ EVMCS1_FIELD(GUEST_ES_BASE, guest_es_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_BASE, guest_cs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_BASE, guest_ss_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_BASE, guest_ds_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_BASE, guest_fs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_BASE, guest_gs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_BASE, guest_ldtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_BASE, guest_tr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GDTR_BASE, guest_gdtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_IDTR_BASE, guest_idtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(TSC_OFFSET, tsc_offset,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(VMCS_LINK_POINTER, vmcs_link_pointer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_PAT, guest_ia32_pat,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR2, guest_pdptr2,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PDPTR3, guest_pdptr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR0_READ_SHADOW, cr0_read_shadow,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(CR4_READ_SHADOW, cr4_read_shadow,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR0, guest_cr0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR3, guest_cr3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_CR4, guest_cr4,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(GUEST_DR7, guest_dr7,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR),
+ EVMCS1_FIELD(HOST_FS_BASE, host_fs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_GS_BASE, host_gs_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_TR_BASE, host_tr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_GDTR_BASE, host_gdtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_IDTR_BASE, host_idtr_base,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(HOST_RSP, host_rsp,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER),
+ EVMCS1_FIELD(EPT_POINTER, ept_pointer,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
+ EVMCS1_FIELD(GUEST_BNDCFGS, guest_bndcfgs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+
+ /* 64 bit read only */
+ EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(EXIT_QUALIFICATION, exit_qualification,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ /*
+ * Not defined in KVM:
+ *
+ * EVMCS1_FIELD(0x00006402, exit_io_instruction_ecx,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006404, exit_io_instruction_esi,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006406, exit_io_instruction_esi,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ * EVMCS1_FIELD(0x00006408, exit_io_instruction_eip,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE);
+ */
+ EVMCS1_FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+
+ /*
+ * No mask defined in the spec as Hyper-V doesn't currently support
+ * these. Future proof by resetting the whole clean field mask on
+ * access.
+ */
+ EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+
+ /* 32 bit rw */
+ EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ EVMCS1_FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC),
+ EVMCS1_FIELD(EXCEPTION_BITMAP, exception_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN),
+ EVMCS1_FIELD(VM_ENTRY_CONTROLS, vm_entry_controls,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY),
+ EVMCS1_FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vm_entry_exception_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT),
+ EVMCS1_FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(VM_EXIT_CONTROLS, vm_exit_controls,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1),
+ EVMCS1_FIELD(GUEST_ES_LIMIT, guest_es_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_LIMIT, guest_cs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_LIMIT, guest_ss_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_LIMIT, guest_ds_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_LIMIT, guest_fs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_LIMIT, guest_gs_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_LIMIT, guest_tr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_ACTIVITY_STATE, guest_activity_state,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+
+ /* 32 bit read only */
+ EVMCS1_FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_REASON, vm_exit_reason,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+ EVMCS1_FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
+
+ /* No mask defined in the spec (not used) */
+ EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(CR3_TARGET_COUNT, cr3_target_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+ EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
+
+ /* 16 bit rw */
+ EVMCS1_FIELD(HOST_ES_SELECTOR, host_es_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_CS_SELECTOR, host_cs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_SS_SELECTOR, host_ss_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_DS_SELECTOR, host_ds_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_FS_SELECTOR, host_fs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_GS_SELECTOR, host_gs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_TR_SELECTOR, host_tr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(GUEST_ES_SELECTOR, guest_es_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_CS_SELECTOR, guest_cs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_SS_SELECTOR, guest_ss_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_DS_SELECTOR, guest_ds_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_FS_SELECTOR, guest_fs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_GS_SELECTOR, guest_gs_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(GUEST_TR_SELECTOR, guest_tr_selector,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2),
+ EVMCS1_FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT),
+};
+
+static __always_inline int get_evmcs_offset(unsigned long field,
+ u16 *clean_field)
+{
+ unsigned int index = ROL16(field, 6);
+ const struct evmcs_field *evmcs_field;
+
+ if (unlikely(index >= ARRAY_SIZE(vmcs_field_to_evmcs_1))) {
+ WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n",
+ field);
+ return -ENOENT;
+ }
+
+ evmcs_field = &vmcs_field_to_evmcs_1[index];
+
+ if (clean_field)
+ *clean_field = evmcs_field->clean_field;
+
+ return evmcs_field->offset;
+}
+
+#undef ROL16
+
+#endif /* __KVM_X86_VMX_EVMCS_H */
diff --git a/arch/x86/kvm/vmx_shadow_fields.h b/arch/x86/kvm/vmx_shadow_fields.h
new file mode 100644
index 000000000..cd0c75f6d
--- /dev/null
+++ b/arch/x86/kvm/vmx_shadow_fields.h
@@ -0,0 +1,77 @@
+#ifndef SHADOW_FIELD_RO
+#define SHADOW_FIELD_RO(x)
+#endif
+#ifndef SHADOW_FIELD_RW
+#define SHADOW_FIELD_RW(x)
+#endif
+
+/*
+ * We do NOT shadow fields that are modified when L0
+ * traps and emulates any vmx instruction (e.g. VMPTRLD,
+ * VMXON...) executed by L1.
+ * For example, VM_INSTRUCTION_ERROR is read
+ * by L1 if a vmx instruction fails (part of the error path).
+ * Note the code assumes this logic. If for some reason
+ * we start shadowing these fields then we need to
+ * force a shadow sync when L0 emulates vmx instructions
+ * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
+ * by nested_vmx_failValid)
+ *
+ * When adding or removing fields here, note that shadowed
+ * fields must always be synced by prepare_vmcs02, not just
+ * prepare_vmcs02_full.
+ */
+
+/*
+ * Keeping the fields ordered by size is an attempt at improving
+ * branch prediction in vmcs_read_any and vmcs_write_any.
+ */
+
+/* 16-bits */
+SHADOW_FIELD_RW(GUEST_CS_SELECTOR)
+SHADOW_FIELD_RW(GUEST_INTR_STATUS)
+SHADOW_FIELD_RW(GUEST_PML_INDEX)
+SHADOW_FIELD_RW(HOST_FS_SELECTOR)
+SHADOW_FIELD_RW(HOST_GS_SELECTOR)
+
+/* 32-bits */
+SHADOW_FIELD_RO(VM_EXIT_REASON)
+SHADOW_FIELD_RO(VM_EXIT_INTR_INFO)
+SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN)
+SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD)
+SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE)
+SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE)
+SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL)
+SHADOW_FIELD_RW(EXCEPTION_BITMAP)
+SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE)
+SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD)
+SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN)
+SHADOW_FIELD_RW(TPR_THRESHOLD)
+SHADOW_FIELD_RW(GUEST_CS_LIMIT)
+SHADOW_FIELD_RW(GUEST_CS_AR_BYTES)
+SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO)
+SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE)
+
+/* Natural width */
+SHADOW_FIELD_RO(EXIT_QUALIFICATION)
+SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS)
+SHADOW_FIELD_RW(GUEST_RIP)
+SHADOW_FIELD_RW(GUEST_RSP)
+SHADOW_FIELD_RW(GUEST_CR0)
+SHADOW_FIELD_RW(GUEST_CR3)
+SHADOW_FIELD_RW(GUEST_CR4)
+SHADOW_FIELD_RW(GUEST_RFLAGS)
+SHADOW_FIELD_RW(GUEST_CS_BASE)
+SHADOW_FIELD_RW(GUEST_ES_BASE)
+SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK)
+SHADOW_FIELD_RW(CR0_READ_SHADOW)
+SHADOW_FIELD_RW(CR4_READ_SHADOW)
+SHADOW_FIELD_RW(HOST_FS_BASE)
+SHADOW_FIELD_RW(HOST_GS_BASE)
+
+/* 64-bit */
+SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS)
+SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH)
+
+#undef SHADOW_FIELD_RO
+#undef SHADOW_FIELD_RW
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
new file mode 100644
index 000000000..be4697d91
--- /dev/null
+++ b/arch/x86/kvm/x86.c
@@ -0,0 +1,9835 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * derived from drivers/kvm/kvm_main.c
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Amit Shah <amit.shah@qumranet.com>
+ * Ben-Ami Yassour <benami@il.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/kvm_host.h>
+#include "irq.h"
+#include "mmu.h"
+#include "i8254.h"
+#include "tss.h"
+#include "kvm_cache_regs.h"
+#include "x86.h"
+#include "cpuid.h"
+#include "pmu.h"
+#include "hyperv.h"
+
+#include <linux/clocksource.h>
+#include <linux/interrupt.h>
+#include <linux/kvm.h>
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+#include <linux/export.h>
+#include <linux/moduleparam.h>
+#include <linux/mman.h>
+#include <linux/highmem.h>
+#include <linux/iommu.h>
+#include <linux/intel-iommu.h>
+#include <linux/cpufreq.h>
+#include <linux/user-return-notifier.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <linux/uaccess.h>
+#include <linux/hash.h>
+#include <linux/pci.h>
+#include <linux/timekeeper_internal.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
+#include <linux/sched/stat.h>
+#include <linux/mem_encrypt.h>
+
+#include <trace/events/kvm.h>
+
+#include <asm/debugreg.h>
+#include <asm/msr.h>
+#include <asm/desc.h>
+#include <asm/mce.h>
+#include <linux/kernel_stat.h>
+#include <asm/fpu/internal.h> /* Ugh! */
+#include <asm/pvclock.h>
+#include <asm/div64.h>
+#include <asm/irq_remapping.h>
+#include <asm/mshyperv.h>
+#include <asm/hypervisor.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+#define MAX_IO_MSRS 256
+#define KVM_MAX_MCE_BANKS 32
+u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
+EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
+
+#define emul_to_vcpu(ctxt) \
+ container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
+
+/* EFER defaults:
+ * - enable syscall per default because its emulated by KVM
+ * - enable LME and LMA per default on 64 bit KVM
+ */
+#ifdef CONFIG_X86_64
+static
+u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
+#else
+static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
+#endif
+
+static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
+
+#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
+#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
+
+#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
+ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+
+static void update_cr8_intercept(struct kvm_vcpu *vcpu);
+static void process_nmi(struct kvm_vcpu *vcpu);
+static void process_smi(struct kvm_vcpu *vcpu);
+static void enter_smm(struct kvm_vcpu *vcpu);
+static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+static void store_regs(struct kvm_vcpu *vcpu);
+static int sync_regs(struct kvm_vcpu *vcpu);
+
+struct kvm_x86_ops *kvm_x86_ops __read_mostly;
+EXPORT_SYMBOL_GPL(kvm_x86_ops);
+
+static bool __read_mostly ignore_msrs = 0;
+module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
+
+static bool __read_mostly report_ignored_msrs = true;
+module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
+
+unsigned int min_timer_period_us = 200;
+module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+
+static bool __read_mostly kvmclock_periodic_sync = true;
+module_param(kvmclock_periodic_sync, bool, S_IRUGO);
+
+bool __read_mostly kvm_has_tsc_control;
+EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
+u32 __read_mostly kvm_max_guest_tsc_khz;
+EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
+u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits;
+EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
+u64 __read_mostly kvm_max_tsc_scaling_ratio;
+EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
+u64 __read_mostly kvm_default_tsc_scaling_ratio;
+EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
+
+/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
+static u32 __read_mostly tsc_tolerance_ppm = 250;
+module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+
+/* lapic timer advance (tscdeadline mode only) in nanoseconds */
+unsigned int __read_mostly lapic_timer_advance_ns = 0;
+module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
+EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
+
+static bool __read_mostly vector_hashing = true;
+module_param(vector_hashing, bool, S_IRUGO);
+
+bool __read_mostly enable_vmware_backdoor = false;
+module_param(enable_vmware_backdoor, bool, S_IRUGO);
+EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
+
+static bool __read_mostly force_emulation_prefix = false;
+module_param(force_emulation_prefix, bool, S_IRUGO);
+
+#define KVM_NR_SHARED_MSRS 16
+
+struct kvm_shared_msrs_global {
+ int nr;
+ u32 msrs[KVM_NR_SHARED_MSRS];
+};
+
+struct kvm_shared_msrs {
+ struct user_return_notifier urn;
+ bool registered;
+ struct kvm_shared_msr_values {
+ u64 host;
+ u64 curr;
+ } values[KVM_NR_SHARED_MSRS];
+};
+
+static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
+static struct kvm_shared_msrs __percpu *shared_msrs;
+
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+ { "pf_fixed", VCPU_STAT(pf_fixed) },
+ { "pf_guest", VCPU_STAT(pf_guest) },
+ { "tlb_flush", VCPU_STAT(tlb_flush) },
+ { "invlpg", VCPU_STAT(invlpg) },
+ { "exits", VCPU_STAT(exits) },
+ { "io_exits", VCPU_STAT(io_exits) },
+ { "mmio_exits", VCPU_STAT(mmio_exits) },
+ { "signal_exits", VCPU_STAT(signal_exits) },
+ { "irq_window", VCPU_STAT(irq_window_exits) },
+ { "nmi_window", VCPU_STAT(nmi_window_exits) },
+ { "halt_exits", VCPU_STAT(halt_exits) },
+ { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
+ { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
+ { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
+ { "halt_wakeup", VCPU_STAT(halt_wakeup) },
+ { "hypercalls", VCPU_STAT(hypercalls) },
+ { "request_irq", VCPU_STAT(request_irq_exits) },
+ { "irq_exits", VCPU_STAT(irq_exits) },
+ { "host_state_reload", VCPU_STAT(host_state_reload) },
+ { "fpu_reload", VCPU_STAT(fpu_reload) },
+ { "insn_emulation", VCPU_STAT(insn_emulation) },
+ { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+ { "irq_injections", VCPU_STAT(irq_injections) },
+ { "nmi_injections", VCPU_STAT(nmi_injections) },
+ { "req_event", VCPU_STAT(req_event) },
+ { "l1d_flush", VCPU_STAT(l1d_flush) },
+ { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
+ { "mmu_pte_write", VM_STAT(mmu_pte_write) },
+ { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
+ { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
+ { "mmu_flooded", VM_STAT(mmu_flooded) },
+ { "mmu_recycled", VM_STAT(mmu_recycled) },
+ { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
+ { "mmu_unsync", VM_STAT(mmu_unsync) },
+ { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
+ { "largepages", VM_STAT(lpages, .mode = 0444) },
+ { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
+ { "max_mmu_page_hash_collisions",
+ VM_STAT(max_mmu_page_hash_collisions) },
+ { NULL }
+};
+
+u64 __read_mostly host_xcr0;
+
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
+
+static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
+{
+ int i;
+ for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
+ vcpu->arch.apf.gfns[i] = ~0;
+}
+
+static void kvm_on_user_return(struct user_return_notifier *urn)
+{
+ unsigned slot;
+ struct kvm_shared_msrs *locals
+ = container_of(urn, struct kvm_shared_msrs, urn);
+ struct kvm_shared_msr_values *values;
+ unsigned long flags;
+
+ /*
+ * Disabling irqs at this point since the following code could be
+ * interrupted and executed through kvm_arch_hardware_disable()
+ */
+ local_irq_save(flags);
+ if (locals->registered) {
+ locals->registered = false;
+ user_return_notifier_unregister(urn);
+ }
+ local_irq_restore(flags);
+ for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
+ values = &locals->values[slot];
+ if (values->host != values->curr) {
+ wrmsrl(shared_msrs_global.msrs[slot], values->host);
+ values->curr = values->host;
+ }
+ }
+}
+
+static void shared_msr_update(unsigned slot, u32 msr)
+{
+ u64 value;
+ unsigned int cpu = smp_processor_id();
+ struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+
+ /* only read, and nobody should modify it at this time,
+ * so don't need lock */
+ if (slot >= shared_msrs_global.nr) {
+ printk(KERN_ERR "kvm: invalid MSR slot!");
+ return;
+ }
+ rdmsrl_safe(msr, &value);
+ smsr->values[slot].host = value;
+ smsr->values[slot].curr = value;
+}
+
+void kvm_define_shared_msr(unsigned slot, u32 msr)
+{
+ BUG_ON(slot >= KVM_NR_SHARED_MSRS);
+ shared_msrs_global.msrs[slot] = msr;
+ if (slot >= shared_msrs_global.nr)
+ shared_msrs_global.nr = slot + 1;
+}
+EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
+
+static void kvm_shared_msr_cpu_online(void)
+{
+ unsigned i;
+
+ for (i = 0; i < shared_msrs_global.nr; ++i)
+ shared_msr_update(i, shared_msrs_global.msrs[i]);
+}
+
+int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
+{
+ unsigned int cpu = smp_processor_id();
+ struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+ int err;
+
+ value = (value & mask) | (smsr->values[slot].host & ~mask);
+ if (value == smsr->values[slot].curr)
+ return 0;
+ err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
+ if (err)
+ return 1;
+
+ smsr->values[slot].curr = value;
+ if (!smsr->registered) {
+ smsr->urn.on_user_return = kvm_on_user_return;
+ user_return_notifier_register(&smsr->urn);
+ smsr->registered = true;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
+
+static void drop_user_return_notifiers(void)
+{
+ unsigned int cpu = smp_processor_id();
+ struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+
+ if (smsr->registered)
+ kvm_on_user_return(&smsr->urn);
+}
+
+u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.apic_base;
+}
+EXPORT_SYMBOL_GPL(kvm_get_apic_base);
+
+enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
+{
+ return kvm_apic_mode(kvm_get_apic_base(vcpu));
+}
+EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
+
+int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
+ enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
+ u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
+ (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
+
+ if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
+ return 1;
+ if (!msr_info->host_initiated) {
+ if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
+ return 1;
+ if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
+ return 1;
+ }
+
+ kvm_lapic_set_base(vcpu, msr_info->data);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_apic_base);
+
+asmlinkage __visible void kvm_spurious_fault(void)
+{
+ /* Fault while not rebooting. We want the trace. */
+ BUG();
+}
+EXPORT_SYMBOL_GPL(kvm_spurious_fault);
+
+#define EXCPT_BENIGN 0
+#define EXCPT_CONTRIBUTORY 1
+#define EXCPT_PF 2
+
+static int exception_class(int vector)
+{
+ switch (vector) {
+ case PF_VECTOR:
+ return EXCPT_PF;
+ case DE_VECTOR:
+ case TS_VECTOR:
+ case NP_VECTOR:
+ case SS_VECTOR:
+ case GP_VECTOR:
+ return EXCPT_CONTRIBUTORY;
+ default:
+ break;
+ }
+ return EXCPT_BENIGN;
+}
+
+#define EXCPT_FAULT 0
+#define EXCPT_TRAP 1
+#define EXCPT_ABORT 2
+#define EXCPT_INTERRUPT 3
+
+static int exception_type(int vector)
+{
+ unsigned int mask;
+
+ if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
+ return EXCPT_INTERRUPT;
+
+ mask = 1 << vector;
+
+ /* #DB is trap, as instruction watchpoints are handled elsewhere */
+ if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
+ return EXCPT_TRAP;
+
+ if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
+ return EXCPT_ABORT;
+
+ /* Reserved exceptions will result in fault */
+ return EXCPT_FAULT;
+}
+
+static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
+ unsigned nr, bool has_error, u32 error_code,
+ bool reinject)
+{
+ u32 prev_nr;
+ int class1, class2;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
+ queue:
+ if (reinject) {
+ /*
+ * On vmentry, vcpu->arch.exception.pending is only
+ * true if an event injection was blocked by
+ * nested_run_pending. In that case, however,
+ * vcpu_enter_guest requests an immediate exit,
+ * and the guest shouldn't proceed far enough to
+ * need reinjection.
+ */
+ WARN_ON_ONCE(vcpu->arch.exception.pending);
+ vcpu->arch.exception.injected = true;
+ } else {
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.injected = false;
+ }
+ vcpu->arch.exception.has_error_code = has_error;
+ vcpu->arch.exception.nr = nr;
+ vcpu->arch.exception.error_code = error_code;
+ return;
+ }
+
+ /* to check exception */
+ prev_nr = vcpu->arch.exception.nr;
+ if (prev_nr == DF_VECTOR) {
+ /* triple fault -> shutdown */
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return;
+ }
+ class1 = exception_class(prev_nr);
+ class2 = exception_class(nr);
+ if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
+ || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
+ /*
+ * Generate double fault per SDM Table 5-5. Set
+ * exception.pending = true so that the double fault
+ * can trigger a nested vmexit.
+ */
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.injected = false;
+ vcpu->arch.exception.has_error_code = true;
+ vcpu->arch.exception.nr = DF_VECTOR;
+ vcpu->arch.exception.error_code = 0;
+ } else
+ /* replace previous exception with a new one in a hope
+ that instruction re-execution will regenerate lost
+ exception */
+ goto queue;
+}
+
+void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
+{
+ kvm_multiple_exception(vcpu, nr, false, 0, false);
+}
+EXPORT_SYMBOL_GPL(kvm_queue_exception);
+
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
+{
+ kvm_multiple_exception(vcpu, nr, false, 0, true);
+}
+EXPORT_SYMBOL_GPL(kvm_requeue_exception);
+
+int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
+{
+ if (err)
+ kvm_inject_gp(vcpu, 0);
+ else
+ return kvm_skip_emulated_instruction(vcpu);
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
+
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+{
+ ++vcpu->stat.pf_guest;
+ vcpu->arch.exception.nested_apf =
+ is_guest_mode(vcpu) && fault->async_page_fault;
+ if (vcpu->arch.exception.nested_apf)
+ vcpu->arch.apf.nested_apf_token = fault->address;
+ else
+ vcpu->arch.cr2 = fault->address;
+ kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
+}
+EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
+
+static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+{
+ if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
+ vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
+ else
+ vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+
+ return fault->nested_page_fault;
+}
+
+void kvm_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ atomic_inc(&vcpu->arch.nmi_queued);
+ kvm_make_request(KVM_REQ_NMI, vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_inject_nmi);
+
+void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
+{
+ kvm_multiple_exception(vcpu, nr, true, error_code, false);
+}
+EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
+
+void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
+{
+ kvm_multiple_exception(vcpu, nr, true, error_code, true);
+}
+EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
+
+/*
+ * Checks if cpl <= required_cpl; if true, return true. Otherwise queue
+ * a #GP and return false.
+ */
+bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
+{
+ if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
+ return true;
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return false;
+}
+EXPORT_SYMBOL_GPL(kvm_require_cpl);
+
+bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
+{
+ if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return true;
+
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return false;
+}
+EXPORT_SYMBOL_GPL(kvm_require_dr);
+
+/*
+ * This function will be used to read from the physical memory of the currently
+ * running guest. The difference to kvm_vcpu_read_guest_page is that this function
+ * can read from guest physical or from the guest's guest physical memory.
+ */
+int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gfn_t ngfn, void *data, int offset, int len,
+ u32 access)
+{
+ struct x86_exception exception;
+ gfn_t real_gfn;
+ gpa_t ngpa;
+
+ ngpa = gfn_to_gpa(ngfn);
+ real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
+ if (real_gfn == UNMAPPED_GVA)
+ return -EFAULT;
+
+ real_gfn = gpa_to_gfn(real_gfn);
+
+ return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
+
+static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+ void *data, int offset, int len, u32 access)
+{
+ return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
+ data, offset, len, access);
+}
+
+static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
+{
+ return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) |
+ rsvd_bits(1, 2);
+}
+
+/*
+ * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise.
+ */
+int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
+{
+ gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
+ unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
+ int i;
+ int ret;
+ u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
+
+ ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
+ offset * sizeof(u64), sizeof(pdpte),
+ PFERR_USER_MASK|PFERR_WRITE_MASK);
+ if (ret < 0) {
+ ret = 0;
+ goto out;
+ }
+ for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
+ if ((pdpte[i] & PT_PRESENT_MASK) &&
+ (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
+ ret = 0;
+ goto out;
+ }
+ }
+ ret = 1;
+
+ memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail);
+ __set_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_dirty);
+out:
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(load_pdptrs);
+
+bool pdptrs_changed(struct kvm_vcpu *vcpu)
+{
+ u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
+ bool changed = true;
+ int offset;
+ gfn_t gfn;
+ int r;
+
+ if (!is_pae_paging(vcpu))
+ return false;
+
+ if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)&vcpu->arch.regs_avail))
+ return true;
+
+ gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
+ offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
+ r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
+ PFERR_USER_MASK | PFERR_WRITE_MASK);
+ if (r < 0)
+ goto out;
+ changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
+out:
+
+ return changed;
+}
+EXPORT_SYMBOL_GPL(pdptrs_changed);
+
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ unsigned long old_cr0 = kvm_read_cr0(vcpu);
+ unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
+
+ cr0 |= X86_CR0_ET;
+
+#ifdef CONFIG_X86_64
+ if (cr0 & 0xffffffff00000000UL)
+ return 1;
+#endif
+
+ cr0 &= ~CR0_RESERVED_BITS;
+
+ if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
+ return 1;
+
+ if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
+ return 1;
+
+ if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
+#ifdef CONFIG_X86_64
+ if ((vcpu->arch.efer & EFER_LME)) {
+ int cs_db, cs_l;
+
+ if (!is_pae(vcpu))
+ return 1;
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ if (cs_l)
+ return 1;
+ } else
+#endif
+ if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
+ kvm_read_cr3(vcpu)))
+ return 1;
+ }
+
+ if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+ return 1;
+
+ kvm_x86_ops->set_cr0(vcpu, cr0);
+
+ if ((cr0 ^ old_cr0) & X86_CR0_PG) {
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ }
+
+ if ((cr0 ^ old_cr0) & update_bits)
+ kvm_mmu_reset_context(vcpu);
+
+ if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
+ kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
+ !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
+ kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_cr0);
+
+void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
+{
+ (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
+}
+EXPORT_SYMBOL_GPL(kvm_lmsw);
+
+void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
+ !vcpu->guest_xcr0_loaded) {
+ /* kvm_set_xcr() also depends on this */
+ if (vcpu->arch.xcr0 != host_xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
+ vcpu->guest_xcr0_loaded = 1;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0);
+
+void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->guest_xcr0_loaded) {
+ if (vcpu->arch.xcr0 != host_xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
+ vcpu->guest_xcr0_loaded = 0;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0);
+
+static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+{
+ u64 xcr0 = xcr;
+ u64 old_xcr0 = vcpu->arch.xcr0;
+ u64 valid_bits;
+
+ /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
+ if (index != XCR_XFEATURE_ENABLED_MASK)
+ return 1;
+ if (!(xcr0 & XFEATURE_MASK_FP))
+ return 1;
+ if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
+ return 1;
+
+ /*
+ * Do not allow the guest to set bits that we do not support
+ * saving. However, xcr0 bit 0 is always set, even if the
+ * emulated CPU does not support XSAVE (see fx_init).
+ */
+ valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
+ if (xcr0 & ~valid_bits)
+ return 1;
+
+ if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
+ (!(xcr0 & XFEATURE_MASK_BNDCSR)))
+ return 1;
+
+ if (xcr0 & XFEATURE_MASK_AVX512) {
+ if (!(xcr0 & XFEATURE_MASK_YMM))
+ return 1;
+ if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
+ return 1;
+ }
+ vcpu->arch.xcr0 = xcr0;
+
+ if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
+ kvm_update_cpuid(vcpu);
+ return 0;
+}
+
+int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+{
+ if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
+ __kvm_set_xcr(vcpu, index, xcr)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_xcr);
+
+static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c)
+{
+ u64 reserved_bits = CR4_RESERVED_BITS;
+
+ if (!cpu_has(c, X86_FEATURE_XSAVE))
+ reserved_bits |= X86_CR4_OSXSAVE;
+
+ if (!cpu_has(c, X86_FEATURE_SMEP))
+ reserved_bits |= X86_CR4_SMEP;
+
+ if (!cpu_has(c, X86_FEATURE_SMAP))
+ reserved_bits |= X86_CR4_SMAP;
+
+ if (!cpu_has(c, X86_FEATURE_FSGSBASE))
+ reserved_bits |= X86_CR4_FSGSBASE;
+
+ if (!cpu_has(c, X86_FEATURE_PKU))
+ reserved_bits |= X86_CR4_PKE;
+
+ if (!cpu_has(c, X86_FEATURE_LA57) &&
+ !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57)))
+ reserved_bits |= X86_CR4_LA57;
+
+ if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated())
+ reserved_bits |= X86_CR4_UMIP;
+
+ return reserved_bits;
+}
+
+static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ if (cr4 & cr4_reserved_bits)
+ return -EINVAL;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
+ return -EINVAL;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
+ return -EINVAL;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
+ return -EINVAL;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
+ return -EINVAL;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
+ return -EINVAL;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
+ return -EINVAL;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
+ return -EINVAL;
+
+ return 0;
+}
+
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
+ unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
+ X86_CR4_SMEP;
+ unsigned long mmu_role_bits = pdptr_bits | X86_CR4_SMAP | X86_CR4_PKE;
+
+ if (kvm_valid_cr4(vcpu, cr4))
+ return 1;
+
+ if (is_long_mode(vcpu)) {
+ if (!(cr4 & X86_CR4_PAE))
+ return 1;
+ if ((cr4 ^ old_cr4) & X86_CR4_LA57)
+ return 1;
+ } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
+ && ((cr4 ^ old_cr4) & pdptr_bits)
+ && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
+ kvm_read_cr3(vcpu)))
+ return 1;
+
+ if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
+ return 1;
+
+ /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
+ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
+ return 1;
+ }
+
+ if (kvm_x86_ops->set_cr4(vcpu, cr4))
+ return 1;
+
+ if (((cr4 ^ old_cr4) & mmu_role_bits) ||
+ (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
+ kvm_mmu_reset_context(vcpu);
+
+ if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
+ kvm_update_cpuid(vcpu);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_cr4);
+
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ bool skip_tlb_flush = false;
+#ifdef CONFIG_X86_64
+ bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
+
+ if (pcid_enabled) {
+ skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
+ cr3 &= ~X86_CR3_PCID_NOFLUSH;
+ }
+#endif
+
+ if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
+ if (!skip_tlb_flush) {
+ kvm_mmu_sync_roots(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ }
+ return 0;
+ }
+
+ if (is_long_mode(vcpu) &&
+ (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
+ return 1;
+ else if (is_pae_paging(vcpu) &&
+ !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+ return 1;
+
+ kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
+ vcpu->arch.cr3 = cr3;
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_cr3);
+
+int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+ if (cr8 & CR8_RESERVED_BITS)
+ return 1;
+ if (lapic_in_kernel(vcpu))
+ kvm_lapic_set_tpr(vcpu, cr8);
+ else
+ vcpu->arch.cr8 = cr8;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_cr8);
+
+unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
+{
+ if (lapic_in_kernel(vcpu))
+ return kvm_lapic_get_cr8(vcpu);
+ else
+ return vcpu->arch.cr8;
+}
+EXPORT_SYMBOL_GPL(kvm_get_cr8);
+
+static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
+{
+ int i;
+
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+ for (i = 0; i < KVM_NR_DB_REGS; i++)
+ vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
+ }
+}
+
+static void kvm_update_dr6(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+ kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
+}
+
+static void kvm_update_dr7(struct kvm_vcpu *vcpu)
+{
+ unsigned long dr7;
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ dr7 = vcpu->arch.guest_debug_dr7;
+ else
+ dr7 = vcpu->arch.dr7;
+ kvm_x86_ops->set_dr7(vcpu, dr7);
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
+ if (dr7 & DR7_BP_EN_MASK)
+ vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
+}
+
+static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
+{
+ u64 fixed = DR6_FIXED_1;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
+ fixed |= DR6_RTM;
+ return fixed;
+}
+
+static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
+ switch (dr) {
+ case 0 ... 3:
+ vcpu->arch.db[array_index_nospec(dr, size)] = val;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+ vcpu->arch.eff_db[dr] = val;
+ break;
+ case 4:
+ /* fall through */
+ case 6:
+ if (val & 0xffffffff00000000ULL)
+ return -1; /* #GP */
+ vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
+ kvm_update_dr6(vcpu);
+ break;
+ case 5:
+ /* fall through */
+ default: /* 7 */
+ if (val & 0xffffffff00000000ULL)
+ return -1; /* #GP */
+ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
+ kvm_update_dr7(vcpu);
+ break;
+ }
+
+ return 0;
+}
+
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+ if (__kvm_set_dr(vcpu, dr, val)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_dr);
+
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+{
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
+ switch (dr) {
+ case 0 ... 3:
+ *val = vcpu->arch.db[array_index_nospec(dr, size)];
+ break;
+ case 4:
+ /* fall through */
+ case 6:
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ *val = vcpu->arch.dr6;
+ else
+ *val = kvm_x86_ops->get_dr6(vcpu);
+ break;
+ case 5:
+ /* fall through */
+ default: /* 7 */
+ *val = vcpu->arch.dr7;
+ break;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_get_dr);
+
+bool kvm_rdpmc(struct kvm_vcpu *vcpu)
+{
+ u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ u64 data;
+ int err;
+
+ err = kvm_pmu_rdpmc(vcpu, ecx, &data);
+ if (err)
+ return err;
+ kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
+ return err;
+}
+EXPORT_SYMBOL_GPL(kvm_rdpmc);
+
+/*
+ * List of msr numbers which we expose to userspace through KVM_GET_MSRS
+ * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
+ *
+ * This list is modified at module load time to reflect the
+ * capabilities of the host cpu. This capabilities test skips MSRs that are
+ * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
+ * may depend on host virtualization features rather than host cpu features.
+ */
+
+static u32 msrs_to_save[] = {
+ MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+ MSR_STAR,
+#ifdef CONFIG_X86_64
+ MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+#endif
+ MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+ MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
+ MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
+};
+
+static unsigned num_msrs_to_save;
+
+static u32 emulated_msrs[] = {
+ MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
+ HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+ HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
+ HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
+ HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
+ HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
+ HV_X64_MSR_RESET,
+ HV_X64_MSR_VP_INDEX,
+ HV_X64_MSR_VP_RUNTIME,
+ HV_X64_MSR_SCONTROL,
+ HV_X64_MSR_STIMER0_CONFIG,
+ HV_X64_MSR_VP_ASSIST_PAGE,
+ HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
+ HV_X64_MSR_TSC_EMULATION_STATUS,
+
+ MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+ MSR_KVM_PV_EOI_EN,
+
+ MSR_IA32_TSC_ADJUST,
+ MSR_IA32_TSCDEADLINE,
+ MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MCG_STATUS,
+ MSR_IA32_MCG_CTL,
+ MSR_IA32_MCG_EXT_CTL,
+ MSR_IA32_SMBASE,
+ MSR_SMI_COUNT,
+ MSR_PLATFORM_INFO,
+ MSR_MISC_FEATURES_ENABLES,
+ MSR_AMD64_VIRT_SPEC_CTRL,
+};
+
+static unsigned num_emulated_msrs;
+
+/*
+ * List of msr numbers which are used to expose MSR-based features that
+ * can be used by a hypervisor to validate requested CPU features.
+ */
+static u32 msr_based_features[] = {
+ MSR_IA32_VMX_BASIC,
+ MSR_IA32_VMX_TRUE_PINBASED_CTLS,
+ MSR_IA32_VMX_PINBASED_CTLS,
+ MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
+ MSR_IA32_VMX_PROCBASED_CTLS,
+ MSR_IA32_VMX_TRUE_EXIT_CTLS,
+ MSR_IA32_VMX_EXIT_CTLS,
+ MSR_IA32_VMX_TRUE_ENTRY_CTLS,
+ MSR_IA32_VMX_ENTRY_CTLS,
+ MSR_IA32_VMX_MISC,
+ MSR_IA32_VMX_CR0_FIXED0,
+ MSR_IA32_VMX_CR0_FIXED1,
+ MSR_IA32_VMX_CR4_FIXED0,
+ MSR_IA32_VMX_CR4_FIXED1,
+ MSR_IA32_VMX_VMCS_ENUM,
+ MSR_IA32_VMX_PROCBASED_CTLS2,
+ MSR_IA32_VMX_EPT_VPID_CAP,
+ MSR_IA32_VMX_VMFUNC,
+
+ MSR_F10H_DECFG,
+ MSR_IA32_UCODE_REV,
+ MSR_IA32_ARCH_CAPABILITIES,
+};
+
+static unsigned int num_msr_based_features;
+
+u64 kvm_get_arch_capabilities(void)
+{
+ u64 data;
+
+ rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
+
+ /*
+ * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
+ * the nested hypervisor runs with NX huge pages. If it is not,
+ * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other
+ * L1 guests, so it need not worry about its own (L2) guests.
+ */
+ data |= ARCH_CAP_PSCHANGE_MC_NO;
+
+ /*
+ * If we're doing cache flushes (either "always" or "cond")
+ * we will do one whenever the guest does a vmlaunch/vmresume.
+ * If an outer hypervisor is doing the cache flush for us
+ * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
+ * capability to the guest too, and if EPT is disabled we're not
+ * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will
+ * require a nested hypervisor to do a flush of its own.
+ */
+ if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
+ data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
+
+ if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+ data |= ARCH_CAP_RDCL_NO;
+ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ data |= ARCH_CAP_SSB_NO;
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
+ data |= ARCH_CAP_MDS_NO;
+
+ /*
+ * On TAA affected systems, export MDS_NO=0 when:
+ * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
+ * - Updated microcode is present. This is detected by
+ * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
+ * that VERW clears CPU buffers.
+ *
+ * When MDS_NO=0 is exported, guests deploy clear CPU buffer
+ * mitigation and don't complain:
+ *
+ * "Vulnerable: Clear CPU buffers attempted, no microcode"
+ *
+ * If TSX is disabled on the system, guests are also mitigated against
+ * TAA and clear CPU buffer mitigation is not required for guests.
+ */
+ if (!boot_cpu_has(X86_FEATURE_RTM))
+ data &= ~ARCH_CAP_TAA_NO;
+ else if (!boot_cpu_has_bug(X86_BUG_TAA))
+ data |= ARCH_CAP_TAA_NO;
+ else if (data & ARCH_CAP_TSX_CTRL_MSR)
+ data &= ~ARCH_CAP_MDS_NO;
+
+ /* KVM does not emulate MSR_IA32_TSX_CTRL. */
+ data &= ~ARCH_CAP_TSX_CTRL_MSR;
+
+ /* Guests don't need to know "Fill buffer clear control" exists */
+ data &= ~ARCH_CAP_FB_CLEAR_CTRL;
+
+ return data;
+}
+
+EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
+
+static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
+{
+ switch (msr->index) {
+ case MSR_IA32_ARCH_CAPABILITIES:
+ msr->data = kvm_get_arch_capabilities();
+ break;
+ case MSR_IA32_UCODE_REV:
+ rdmsrl_safe(msr->index, &msr->data);
+ break;
+ default:
+ if (kvm_x86_ops->get_msr_feature(msr))
+ return 1;
+ }
+ return 0;
+}
+
+static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+ struct kvm_msr_entry msr;
+ int r;
+
+ msr.index = index;
+ r = kvm_get_msr_feature(&msr);
+ if (r)
+ return r;
+
+ *data = msr.data;
+
+ return 0;
+}
+
+static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
+ return false;
+
+ if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+ return false;
+
+ return true;
+
+}
+bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ if (efer & efer_reserved_bits)
+ return false;
+
+ return __kvm_valid_efer(vcpu, efer);
+}
+EXPORT_SYMBOL_GPL(kvm_valid_efer);
+
+static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ u64 old_efer = vcpu->arch.efer;
+ u64 efer = msr_info->data;
+
+ if (efer & efer_reserved_bits)
+ return 1;
+
+ if (!msr_info->host_initiated) {
+ if (!__kvm_valid_efer(vcpu, efer))
+ return 1;
+
+ if (is_paging(vcpu) &&
+ (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
+ return 1;
+ }
+
+ efer &= ~EFER_LMA;
+ efer |= vcpu->arch.efer & EFER_LMA;
+
+ kvm_x86_ops->set_efer(vcpu, efer);
+
+ /* Update reserved bits */
+ if ((efer ^ old_efer) & EFER_NX)
+ kvm_mmu_reset_context(vcpu);
+
+ return 0;
+}
+
+void kvm_enable_efer_bits(u64 mask)
+{
+ efer_reserved_bits &= ~mask;
+}
+EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
+
+/*
+ * Writes msr value into into the appropriate "register".
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+ switch (msr->index) {
+ case MSR_FS_BASE:
+ case MSR_GS_BASE:
+ case MSR_KERNEL_GS_BASE:
+ case MSR_CSTAR:
+ case MSR_LSTAR:
+ if (is_noncanonical_address(msr->data, vcpu))
+ return 1;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ case MSR_IA32_SYSENTER_ESP:
+ /*
+ * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
+ * non-canonical address is written on Intel but not on
+ * AMD (which ignores the top 32-bits, because it does
+ * not implement 64-bit SYSENTER).
+ *
+ * 64-bit code should hence be able to write a non-canonical
+ * value on AMD. Making the address canonical ensures that
+ * vmentry does not fail on Intel after writing a non-canonical
+ * value, and that something deterministic happens if the guest
+ * invokes 64-bit SYSENTER.
+ */
+ msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
+ }
+ return kvm_x86_ops->set_msr(vcpu, msr);
+}
+EXPORT_SYMBOL_GPL(kvm_set_msr);
+
+/*
+ * Adapt set_msr() to msr_io()'s calling convention
+ */
+static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+ struct msr_data msr;
+ int r;
+
+ msr.index = index;
+ msr.host_initiated = true;
+ r = kvm_get_msr(vcpu, &msr);
+ if (r)
+ return r;
+
+ *data = msr.data;
+ return 0;
+}
+
+static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+ struct msr_data msr;
+
+ msr.data = *data;
+ msr.index = index;
+ msr.host_initiated = true;
+ return kvm_set_msr(vcpu, &msr);
+}
+
+#ifdef CONFIG_X86_64
+struct pvclock_gtod_data {
+ seqcount_t seq;
+
+ struct { /* extract of a clocksource struct */
+ int vclock_mode;
+ u64 cycle_last;
+ u64 mask;
+ u32 mult;
+ u32 shift;
+ } clock;
+
+ u64 boot_ns;
+ u64 nsec_base;
+ u64 wall_time_sec;
+};
+
+static struct pvclock_gtod_data pvclock_gtod_data;
+
+static void update_pvclock_gtod(struct timekeeper *tk)
+{
+ struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
+ u64 boot_ns;
+
+ boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
+
+ write_seqcount_begin(&vdata->seq);
+
+ /* copy pvclock gtod data */
+ vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
+ vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
+ vdata->clock.mask = tk->tkr_mono.mask;
+ vdata->clock.mult = tk->tkr_mono.mult;
+ vdata->clock.shift = tk->tkr_mono.shift;
+
+ vdata->boot_ns = boot_ns;
+ vdata->nsec_base = tk->tkr_mono.xtime_nsec;
+
+ vdata->wall_time_sec = tk->xtime_sec;
+
+ write_seqcount_end(&vdata->seq);
+}
+#endif
+
+void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
+ * vcpu_enter_guest. This function is only called from
+ * the physical CPU that is running vcpu.
+ */
+ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+}
+
+static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
+{
+ int version;
+ int r;
+ struct pvclock_wall_clock wc;
+ struct timespec64 boot;
+
+ if (!wall_clock)
+ return;
+
+ r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
+ if (r)
+ return;
+
+ if (version & 1)
+ ++version; /* first time write, random junk */
+
+ ++version;
+
+ if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
+ return;
+
+ /*
+ * The guest calculates current wall clock time by adding
+ * system time (updated by kvm_guest_time_update below) to the
+ * wall clock specified here. guest system time equals host
+ * system time for us, thus we must fill in host boot time here.
+ */
+ getboottime64(&boot);
+
+ if (kvm->arch.kvmclock_offset) {
+ struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
+ boot = timespec64_sub(boot, ts);
+ }
+ wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
+ wc.nsec = boot.tv_nsec;
+ wc.version = version;
+
+ kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
+
+ version++;
+ kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+}
+
+static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
+{
+ do_shl32_div32(dividend, divisor);
+ return dividend;
+}
+
+static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
+ s8 *pshift, u32 *pmultiplier)
+{
+ uint64_t scaled64;
+ int32_t shift = 0;
+ uint64_t tps64;
+ uint32_t tps32;
+
+ tps64 = base_hz;
+ scaled64 = scaled_hz;
+ while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
+ tps64 >>= 1;
+ shift--;
+ }
+
+ tps32 = (uint32_t)tps64;
+ while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
+ if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
+ scaled64 >>= 1;
+ else
+ tps32 <<= 1;
+ shift++;
+ }
+
+ *pshift = shift;
+ *pmultiplier = div_frac(scaled64, tps32);
+
+ pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
+ __func__, base_hz, scaled_hz, shift, *pmultiplier);
+}
+
+#ifdef CONFIG_X86_64
+static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
+#endif
+
+static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
+static unsigned long max_tsc_khz;
+
+static u32 adjust_tsc_khz(u32 khz, s32 ppm)
+{
+ u64 v = (u64)khz * (1000000 + ppm);
+ do_div(v, 1000000);
+ return v;
+}
+
+static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
+{
+ u64 ratio;
+
+ /* Guest TSC same frequency as host TSC? */
+ if (!scale) {
+ vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+ return 0;
+ }
+
+ /* TSC scaling supported? */
+ if (!kvm_has_tsc_control) {
+ if (user_tsc_khz > tsc_khz) {
+ vcpu->arch.tsc_catchup = 1;
+ vcpu->arch.tsc_always_catchup = 1;
+ return 0;
+ } else {
+ pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
+ return -1;
+ }
+ }
+
+ /* TSC scaling required - calculate ratio */
+ ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
+ user_tsc_khz, tsc_khz);
+
+ if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
+ pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
+ user_tsc_khz);
+ return -1;
+ }
+
+ vcpu->arch.tsc_scaling_ratio = ratio;
+ return 0;
+}
+
+static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
+{
+ u32 thresh_lo, thresh_hi;
+ int use_scaling = 0;
+
+ /* tsc_khz can be zero if TSC calibration fails */
+ if (user_tsc_khz == 0) {
+ /* set tsc_scaling_ratio to a safe value */
+ vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+ return -1;
+ }
+
+ /* Compute a scale to convert nanoseconds in TSC cycles */
+ kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
+ &vcpu->arch.virtual_tsc_shift,
+ &vcpu->arch.virtual_tsc_mult);
+ vcpu->arch.virtual_tsc_khz = user_tsc_khz;
+
+ /*
+ * Compute the variation in TSC rate which is acceptable
+ * within the range of tolerance and decide if the
+ * rate being applied is within that bounds of the hardware
+ * rate. If so, no scaling or compensation need be done.
+ */
+ thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
+ thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
+ if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
+ pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
+ use_scaling = 1;
+ }
+ return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
+}
+
+static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
+{
+ u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
+ vcpu->arch.virtual_tsc_mult,
+ vcpu->arch.virtual_tsc_shift);
+ tsc += vcpu->arch.this_tsc_write;
+ return tsc;
+}
+
+static inline int gtod_is_based_on_tsc(int mode)
+{
+ return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK;
+}
+
+static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ bool vcpus_matched;
+ struct kvm_arch *ka = &vcpu->kvm->arch;
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+ vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
+ atomic_read(&vcpu->kvm->online_vcpus));
+
+ /*
+ * Once the masterclock is enabled, always perform request in
+ * order to update it.
+ *
+ * In order to enable masterclock, the host clocksource must be TSC
+ * and the vcpus need to have matched TSCs. When that happens,
+ * perform request to enable masterclock.
+ */
+ if (ka->use_master_clock ||
+ (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+
+ trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
+ atomic_read(&vcpu->kvm->online_vcpus),
+ ka->use_master_clock, gtod->clock.vclock_mode);
+#endif
+}
+
+static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
+{
+ u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
+ vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
+}
+
+/*
+ * Multiply tsc by a fixed point number represented by ratio.
+ *
+ * The most significant 64-N bits (mult) of ratio represent the
+ * integral part of the fixed point number; the remaining N bits
+ * (frac) represent the fractional part, ie. ratio represents a fixed
+ * point number (mult + frac * 2^(-N)).
+ *
+ * N equals to kvm_tsc_scaling_ratio_frac_bits.
+ */
+static inline u64 __scale_tsc(u64 ratio, u64 tsc)
+{
+ return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
+}
+
+u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
+{
+ u64 _tsc = tsc;
+ u64 ratio = vcpu->arch.tsc_scaling_ratio;
+
+ if (ratio != kvm_default_tsc_scaling_ratio)
+ _tsc = __scale_tsc(ratio, tsc);
+
+ return _tsc;
+}
+EXPORT_SYMBOL_GPL(kvm_scale_tsc);
+
+static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
+{
+ u64 tsc;
+
+ tsc = kvm_scale_tsc(vcpu, rdtsc());
+
+ return target_tsc - tsc;
+}
+
+u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
+{
+ u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
+
+ return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
+}
+EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
+
+static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+ vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset);
+}
+
+static inline bool kvm_check_tsc_unstable(void)
+{
+#ifdef CONFIG_X86_64
+ /*
+ * TSC is marked unstable when we're running on Hyper-V,
+ * 'TSC page' clocksource is good.
+ */
+ if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK)
+ return false;
+#endif
+ return check_tsc_unstable();
+}
+
+void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u64 offset, ns, elapsed;
+ unsigned long flags;
+ bool matched;
+ bool already_matched;
+ u64 data = msr->data;
+ bool synchronizing = false;
+
+ raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+ offset = kvm_compute_tsc_offset(vcpu, data);
+ ns = ktime_get_boot_ns();
+ elapsed = ns - kvm->arch.last_tsc_nsec;
+
+ if (vcpu->arch.virtual_tsc_khz) {
+ if (data == 0 && msr->host_initiated) {
+ /*
+ * detection of vcpu initialization -- need to sync
+ * with other vCPUs. This particularly helps to keep
+ * kvm_clock stable after CPU hotplug
+ */
+ synchronizing = true;
+ } else {
+ u64 tsc_exp = kvm->arch.last_tsc_write +
+ nsec_to_cycles(vcpu, elapsed);
+ u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
+ /*
+ * Special case: TSC write with a small delta (1 second)
+ * of virtual cycle time against real time is
+ * interpreted as an attempt to synchronize the CPU.
+ */
+ synchronizing = data < tsc_exp + tsc_hz &&
+ data + tsc_hz > tsc_exp;
+ }
+ }
+
+ /*
+ * For a reliable TSC, we can match TSC offsets, and for an unstable
+ * TSC, we add elapsed time in this computation. We could let the
+ * compensation code attempt to catch up if we fall behind, but
+ * it's better to try to match offsets from the beginning.
+ */
+ if (synchronizing &&
+ vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
+ if (!kvm_check_tsc_unstable()) {
+ offset = kvm->arch.cur_tsc_offset;
+ pr_debug("kvm: matched tsc offset for %llu\n", data);
+ } else {
+ u64 delta = nsec_to_cycles(vcpu, elapsed);
+ data += delta;
+ offset = kvm_compute_tsc_offset(vcpu, data);
+ pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
+ }
+ matched = true;
+ already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
+ } else {
+ /*
+ * We split periods of matched TSC writes into generations.
+ * For each generation, we track the original measured
+ * nanosecond time, offset, and write, so if TSCs are in
+ * sync, we can match exact offset, and if not, we can match
+ * exact software computation in compute_guest_tsc()
+ *
+ * These values are tracked in kvm->arch.cur_xxx variables.
+ */
+ kvm->arch.cur_tsc_generation++;
+ kvm->arch.cur_tsc_nsec = ns;
+ kvm->arch.cur_tsc_write = data;
+ kvm->arch.cur_tsc_offset = offset;
+ matched = false;
+ pr_debug("kvm: new tsc generation %llu, clock %llu\n",
+ kvm->arch.cur_tsc_generation, data);
+ }
+
+ /*
+ * We also track th most recent recorded KHZ, write and time to
+ * allow the matching interval to be extended at each write.
+ */
+ kvm->arch.last_tsc_nsec = ns;
+ kvm->arch.last_tsc_write = data;
+ kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
+
+ vcpu->arch.last_guest_tsc = data;
+
+ /* Keep track of which generation this VCPU has synchronized to */
+ vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
+ vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
+ vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+
+ if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
+ update_ia32_tsc_adjust_msr(vcpu, offset);
+
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
+ raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+
+ spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
+ if (!matched) {
+ kvm->arch.nr_vcpus_matched_tsc = 0;
+ } else if (!already_matched) {
+ kvm->arch.nr_vcpus_matched_tsc++;
+ }
+
+ kvm_track_tsc_matching(vcpu);
+ spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
+}
+
+EXPORT_SYMBOL_GPL(kvm_write_tsc);
+
+static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
+ s64 adjustment)
+{
+ u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
+ kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
+}
+
+static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
+ WARN_ON(adjustment < 0);
+ adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
+ adjust_tsc_offset_guest(vcpu, adjustment);
+}
+
+#ifdef CONFIG_X86_64
+
+static u64 read_tsc(void)
+{
+ u64 ret = (u64)rdtsc_ordered();
+ u64 last = pvclock_gtod_data.clock.cycle_last;
+
+ if (likely(ret >= last))
+ return ret;
+
+ /*
+ * GCC likes to generate cmov here, but this branch is extremely
+ * predictable (it's just a function of time and the likely is
+ * very likely) and there's a data dependence, so force GCC
+ * to generate a branch instead. I don't barrier() because
+ * we don't actually need a barrier, and if this function
+ * ever gets inlined it will generate worse code.
+ */
+ asm volatile ("");
+ return last;
+}
+
+static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
+{
+ long v;
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ u64 tsc_pg_val;
+
+ switch (gtod->clock.vclock_mode) {
+ case VCLOCK_HVCLOCK:
+ tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
+ tsc_timestamp);
+ if (tsc_pg_val != U64_MAX) {
+ /* TSC page valid */
+ *mode = VCLOCK_HVCLOCK;
+ v = (tsc_pg_val - gtod->clock.cycle_last) &
+ gtod->clock.mask;
+ } else {
+ /* TSC page invalid */
+ *mode = VCLOCK_NONE;
+ }
+ break;
+ case VCLOCK_TSC:
+ *mode = VCLOCK_TSC;
+ *tsc_timestamp = read_tsc();
+ v = (*tsc_timestamp - gtod->clock.cycle_last) &
+ gtod->clock.mask;
+ break;
+ default:
+ *mode = VCLOCK_NONE;
+ }
+
+ if (*mode == VCLOCK_NONE)
+ *tsc_timestamp = v = 0;
+
+ return v * gtod->clock.mult;
+}
+
+static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
+{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ unsigned long seq;
+ int mode;
+ u64 ns;
+
+ do {
+ seq = read_seqcount_begin(&gtod->seq);
+ ns = gtod->nsec_base;
+ ns += vgettsc(tsc_timestamp, &mode);
+ ns >>= gtod->clock.shift;
+ ns += gtod->boot_ns;
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+ *t = ns;
+
+ return mode;
+}
+
+static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
+{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ unsigned long seq;
+ int mode;
+ u64 ns;
+
+ do {
+ seq = read_seqcount_begin(&gtod->seq);
+ ts->tv_sec = gtod->wall_time_sec;
+ ns = gtod->nsec_base;
+ ns += vgettsc(tsc_timestamp, &mode);
+ ns >>= gtod->clock.shift;
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+
+ ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+ ts->tv_nsec = ns;
+
+ return mode;
+}
+
+/* returns true if host is using TSC based clocksource */
+static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
+{
+ /* checked again under seqlock below */
+ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
+ return false;
+
+ return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
+ tsc_timestamp));
+}
+
+/* returns true if host is using TSC based clocksource */
+static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
+ u64 *tsc_timestamp)
+{
+ /* checked again under seqlock below */
+ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
+ return false;
+
+ return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
+}
+#endif
+
+/*
+ *
+ * Assuming a stable TSC across physical CPUS, and a stable TSC
+ * across virtual CPUs, the following condition is possible.
+ * Each numbered line represents an event visible to both
+ * CPUs at the next numbered event.
+ *
+ * "timespecX" represents host monotonic time. "tscX" represents
+ * RDTSC value.
+ *
+ * VCPU0 on CPU0 | VCPU1 on CPU1
+ *
+ * 1. read timespec0,tsc0
+ * 2. | timespec1 = timespec0 + N
+ * | tsc1 = tsc0 + M
+ * 3. transition to guest | transition to guest
+ * 4. ret0 = timespec0 + (rdtsc - tsc0) |
+ * 5. | ret1 = timespec1 + (rdtsc - tsc1)
+ * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
+ *
+ * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
+ *
+ * - ret0 < ret1
+ * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
+ * ...
+ * - 0 < N - M => M < N
+ *
+ * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
+ * always the case (the difference between two distinct xtime instances
+ * might be smaller then the difference between corresponding TSC reads,
+ * when updating guest vcpus pvclock areas).
+ *
+ * To avoid that problem, do not allow visibility of distinct
+ * system_timestamp/tsc_timestamp values simultaneously: use a master
+ * copy of host monotonic time values. Update that master copy
+ * in lockstep.
+ *
+ * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
+ *
+ */
+
+static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+ struct kvm_arch *ka = &kvm->arch;
+ int vclock_mode;
+ bool host_tsc_clocksource, vcpus_matched;
+
+ vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
+ atomic_read(&kvm->online_vcpus));
+
+ /*
+ * If the host uses TSC clock, then passthrough TSC as stable
+ * to the guest.
+ */
+ host_tsc_clocksource = kvm_get_time_and_clockread(
+ &ka->master_kernel_ns,
+ &ka->master_cycle_now);
+
+ ka->use_master_clock = host_tsc_clocksource && vcpus_matched
+ && !ka->backwards_tsc_observed
+ && !ka->boot_vcpu_runs_old_kvmclock;
+
+ if (ka->use_master_clock)
+ atomic_set(&kvm_guest_has_master_clock, 1);
+
+ vclock_mode = pvclock_gtod_data.clock.vclock_mode;
+ trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
+ vcpus_matched);
+#endif
+}
+
+void kvm_make_mclock_inprogress_request(struct kvm *kvm)
+{
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
+}
+
+static void kvm_gen_update_masterclock(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+ int i;
+ struct kvm_vcpu *vcpu;
+ struct kvm_arch *ka = &kvm->arch;
+
+ spin_lock(&ka->pvclock_gtod_sync_lock);
+ kvm_make_mclock_inprogress_request(kvm);
+ /* no guest entries from this point */
+ pvclock_update_vm_gtod_copy(kvm);
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+
+ /* guest entries allowed */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
+
+ spin_unlock(&ka->pvclock_gtod_sync_lock);
+#endif
+}
+
+u64 get_kvmclock_ns(struct kvm *kvm)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ struct pvclock_vcpu_time_info hv_clock;
+ u64 ret;
+
+ spin_lock(&ka->pvclock_gtod_sync_lock);
+ if (!ka->use_master_clock) {
+ spin_unlock(&ka->pvclock_gtod_sync_lock);
+ return ktime_get_boot_ns() + ka->kvmclock_offset;
+ }
+
+ hv_clock.tsc_timestamp = ka->master_cycle_now;
+ hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
+ spin_unlock(&ka->pvclock_gtod_sync_lock);
+
+ /* both __this_cpu_read() and rdtsc() should be on the same cpu */
+ get_cpu();
+
+ if (__this_cpu_read(cpu_tsc_khz)) {
+ kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
+ &hv_clock.tsc_shift,
+ &hv_clock.tsc_to_system_mul);
+ ret = __pvclock_read_cycles(&hv_clock, rdtsc());
+ } else
+ ret = ktime_get_boot_ns() + ka->kvmclock_offset;
+
+ put_cpu();
+
+ return ret;
+}
+
+static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
+{
+ struct kvm_vcpu_arch *vcpu = &v->arch;
+ struct pvclock_vcpu_time_info guest_hv_clock;
+
+ if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+ &guest_hv_clock, sizeof(guest_hv_clock))))
+ return;
+
+ /* This VCPU is paused, but it's legal for a guest to read another
+ * VCPU's kvmclock, so we really have to follow the specification where
+ * it says that version is odd if data is being modified, and even after
+ * it is consistent.
+ *
+ * Version field updates must be kept separate. This is because
+ * kvm_write_guest_cached might use a "rep movs" instruction, and
+ * writes within a string instruction are weakly ordered. So there
+ * are three writes overall.
+ *
+ * As a small optimization, only write the version field in the first
+ * and third write. The vcpu->pv_time cache is still valid, because the
+ * version field is the first in the struct.
+ */
+ BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+ if (guest_hv_clock.version & 1)
+ ++guest_hv_clock.version; /* first time write, random junk */
+
+ vcpu->hv_clock.version = guest_hv_clock.version + 1;
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
+
+ smp_wmb();
+
+ /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+ vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+
+ if (vcpu->pvclock_set_guest_stopped_request) {
+ vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
+ vcpu->pvclock_set_guest_stopped_request = false;
+ }
+
+ trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock));
+
+ smp_wmb();
+
+ vcpu->hv_clock.version++;
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
+}
+
+static int kvm_guest_time_update(struct kvm_vcpu *v)
+{
+ unsigned long flags, tgt_tsc_khz;
+ struct kvm_vcpu_arch *vcpu = &v->arch;
+ struct kvm_arch *ka = &v->kvm->arch;
+ s64 kernel_ns;
+ u64 tsc_timestamp, host_tsc;
+ u8 pvclock_flags;
+ bool use_master_clock;
+
+ kernel_ns = 0;
+ host_tsc = 0;
+
+ /*
+ * If the host uses TSC clock, then passthrough TSC as stable
+ * to the guest.
+ */
+ spin_lock(&ka->pvclock_gtod_sync_lock);
+ use_master_clock = ka->use_master_clock;
+ if (use_master_clock) {
+ host_tsc = ka->master_cycle_now;
+ kernel_ns = ka->master_kernel_ns;
+ }
+ spin_unlock(&ka->pvclock_gtod_sync_lock);
+
+ /* Keep irq disabled to prevent changes to the clock */
+ local_irq_save(flags);
+ tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
+ if (unlikely(tgt_tsc_khz == 0)) {
+ local_irq_restore(flags);
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
+ return 1;
+ }
+ if (!use_master_clock) {
+ host_tsc = rdtsc();
+ kernel_ns = ktime_get_boot_ns();
+ }
+
+ tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
+
+ /*
+ * We may have to catch up the TSC to match elapsed wall clock
+ * time for two reasons, even if kvmclock is used.
+ * 1) CPU could have been running below the maximum TSC rate
+ * 2) Broken TSC compensation resets the base at each VCPU
+ * entry to avoid unknown leaps of TSC even when running
+ * again on the same CPU. This may cause apparent elapsed
+ * time to disappear, and the guest to stand still or run
+ * very slowly.
+ */
+ if (vcpu->tsc_catchup) {
+ u64 tsc = compute_guest_tsc(v, kernel_ns);
+ if (tsc > tsc_timestamp) {
+ adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
+ tsc_timestamp = tsc;
+ }
+ }
+
+ local_irq_restore(flags);
+
+ /* With all the info we got, fill in the values */
+
+ if (kvm_has_tsc_control)
+ tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
+
+ if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
+ kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
+ &vcpu->hv_clock.tsc_shift,
+ &vcpu->hv_clock.tsc_to_system_mul);
+ vcpu->hw_tsc_khz = tgt_tsc_khz;
+ }
+
+ vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
+ vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
+ vcpu->last_guest_tsc = tsc_timestamp;
+
+ /* If the host uses TSC clocksource, then it is stable */
+ pvclock_flags = 0;
+ if (use_master_clock)
+ pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
+
+ vcpu->hv_clock.flags = pvclock_flags;
+
+ if (vcpu->pv_time_enabled)
+ kvm_setup_pvclock_page(v);
+ if (v == kvm_get_vcpu(v->kvm, 0))
+ kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
+ return 0;
+}
+
+/*
+ * kvmclock updates which are isolated to a given vcpu, such as
+ * vcpu->cpu migration, should not allow system_timestamp from
+ * the rest of the vcpus to remain static. Otherwise ntp frequency
+ * correction applies to one vcpu's system_timestamp but not
+ * the others.
+ *
+ * So in those cases, request a kvmclock update for all vcpus.
+ * We need to rate-limit these requests though, as they can
+ * considerably slow guests that have a large number of vcpus.
+ * The time for a remote vcpu to update its kvmclock is bound
+ * by the delay we use to rate-limit the updates.
+ */
+
+#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
+
+static void kvmclock_update_fn(struct work_struct *work)
+{
+ int i;
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
+ kvmclock_update_work);
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
+ struct kvm_vcpu *vcpu;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+}
+
+static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
+{
+ struct kvm *kvm = v->kvm;
+
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
+ schedule_delayed_work(&kvm->arch.kvmclock_update_work,
+ KVMCLOCK_UPDATE_DELAY);
+}
+
+#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
+
+static void kvmclock_sync_fn(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
+ kvmclock_sync_work);
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
+
+ if (!kvmclock_periodic_sync)
+ return;
+
+ schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
+ schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+ KVMCLOCK_SYNC_PERIOD);
+}
+
+static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+
+ switch (msr) {
+ case MSR_IA32_MCG_STATUS:
+ vcpu->arch.mcg_status = data;
+ break;
+ case MSR_IA32_MCG_CTL:
+ if (!(mcg_cap & MCG_CTL_P) &&
+ (data || !msr_info->host_initiated))
+ return 1;
+ if (data != 0 && data != ~(u64)0)
+ return 1;
+ vcpu->arch.mcg_ctl = data;
+ break;
+ default:
+ if (msr >= MSR_IA32_MC0_CTL &&
+ msr < MSR_IA32_MCx_CTL(bank_num)) {
+ u32 offset = array_index_nospec(
+ msr - MSR_IA32_MC0_CTL,
+ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
+ /* only 0 or all 1s can be written to IA32_MCi_CTL
+ * some Linux kernels though clear bit 10 in bank 4 to
+ * workaround a BIOS/GART TBL issue on AMD K8s, ignore
+ * this to avoid an uncatched #GP in the guest
+ */
+ if ((offset & 0x3) == 0 &&
+ data != 0 && (data | (1 << 10)) != ~(u64)0)
+ return -1;
+ if (!msr_info->host_initiated &&
+ (offset & 0x3) == 1 && data != 0)
+ return -1;
+ vcpu->arch.mce_banks[offset] = data;
+ break;
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct kvm *kvm = vcpu->kvm;
+ int lm = is_long_mode(vcpu);
+ u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
+ : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
+ u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
+ : kvm->arch.xen_hvm_config.blob_size_32;
+ u32 page_num = data & ~PAGE_MASK;
+ u64 page_addr = data & PAGE_MASK;
+ u8 *page;
+ int r;
+
+ r = -E2BIG;
+ if (page_num >= blob_size)
+ goto out;
+ r = -ENOMEM;
+ page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
+ if (IS_ERR(page)) {
+ r = PTR_ERR(page);
+ goto out;
+ }
+ if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
+ goto out_free;
+ r = 0;
+out_free:
+ kfree(page);
+out:
+ return r;
+}
+
+static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
+{
+ gpa_t gpa = data & ~0x3f;
+
+ /* Bits 3:5 are reserved, Should be zero */
+ if (data & 0x38)
+ return 1;
+
+ vcpu->arch.apf.msr_val = data;
+
+ if (!(data & KVM_ASYNC_PF_ENABLED)) {
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ return 0;
+ }
+
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
+ sizeof(u32)))
+ return 1;
+
+ vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
+ vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
+ kvm_async_pf_wakeup_all(vcpu);
+ return 0;
+}
+
+static void kvmclock_reset(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pv_time_enabled = false;
+}
+
+static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
+{
+ ++vcpu->stat.tlb_flush;
+ kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
+}
+
+static void record_steal_time(struct kvm_vcpu *vcpu)
+{
+ struct kvm_host_map map;
+ struct kvm_steal_time *st;
+
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+ /* -EAGAIN is returned in atomic context so we can just return. */
+ if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
+ &map, &vcpu->arch.st.cache, false))
+ return;
+
+ st = map.hva +
+ offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+
+ /*
+ * Doing a TLB flush here, on the guest's behalf, can avoid
+ * expensive IPIs.
+ */
+ if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
+ kvm_vcpu_flush_tlb(vcpu, false);
+
+ vcpu->arch.st.preempted = 0;
+
+ if (st->version & 1)
+ st->version += 1; /* first time write, random junk */
+
+ st->version += 1;
+
+ smp_wmb();
+
+ st->steal += current->sched_info.run_delay -
+ vcpu->arch.st.last_steal;
+ vcpu->arch.st.last_steal = current->sched_info.run_delay;
+
+ smp_wmb();
+
+ st->version += 1;
+
+ kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
+}
+
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ bool pr = false;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+
+ switch (msr) {
+ case MSR_AMD64_NB_CFG:
+ case MSR_IA32_UCODE_WRITE:
+ case MSR_VM_HSAVE_PA:
+ case MSR_AMD64_PATCH_LOADER:
+ case MSR_AMD64_BU_CFG2:
+ case MSR_AMD64_DC_CFG:
+ case MSR_F15H_EX_CFG:
+ break;
+
+ case MSR_IA32_UCODE_REV:
+ if (msr_info->host_initiated)
+ vcpu->arch.microcode_version = data;
+ break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.arch_capabilities = data;
+ break;
+ case MSR_EFER:
+ return set_efer(vcpu, msr_info);
+ case MSR_K7_HWCR:
+ data &= ~(u64)0x40; /* ignore flush filter disable */
+ data &= ~(u64)0x100; /* ignore ignne emulation enable */
+ data &= ~(u64)0x8; /* ignore TLB cache disable */
+ data &= ~(u64)0x40000; /* ignore Mc status write enable */
+ if (data != 0) {
+ vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
+ data);
+ return 1;
+ }
+ break;
+ case MSR_FAM10H_MMIO_CONF_BASE:
+ if (data != 0) {
+ vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
+ "0x%llx\n", data);
+ return 1;
+ }
+ break;
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!data) {
+ /* We support the non-activated case already */
+ break;
+ } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
+ /* Values other than LBR and BTF are vendor-specific,
+ thus reserved and should throw a #GP */
+ return 1;
+ }
+ vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+ __func__, data);
+ break;
+ case 0x200 ... 0x2ff:
+ return kvm_mtrr_set_msr(vcpu, msr, data);
+ case MSR_IA32_APICBASE:
+ return kvm_set_apic_base(vcpu, msr_info);
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
+ return kvm_x2apic_msr_write(vcpu, msr, data);
+ case MSR_IA32_TSCDEADLINE:
+ kvm_set_lapic_tscdeadline_msr(vcpu, data);
+ break;
+ case MSR_IA32_TSC_ADJUST:
+ if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
+ if (!msr_info->host_initiated) {
+ s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
+ adjust_tsc_offset_guest(vcpu, adj);
+ /* Before back to guest, tsc_timestamp must be adjusted
+ * as well, otherwise guest's percpu pvclock time could jump.
+ */
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+ vcpu->arch.ia32_tsc_adjust_msr = data;
+ }
+ break;
+ case MSR_IA32_MISC_ENABLE:
+ vcpu->arch.ia32_misc_enable_msr = data;
+ break;
+ case MSR_IA32_SMBASE:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.smbase = data;
+ break;
+ case MSR_IA32_TSC:
+ kvm_write_tsc(vcpu, msr_info);
+ break;
+ case MSR_SMI_COUNT:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.smi_count = data;
+ break;
+ case MSR_KVM_WALL_CLOCK_NEW:
+ case MSR_KVM_WALL_CLOCK:
+ vcpu->kvm->arch.wall_clock = data;
+ kvm_write_wall_clock(vcpu->kvm, data);
+ break;
+ case MSR_KVM_SYSTEM_TIME_NEW:
+ case MSR_KVM_SYSTEM_TIME: {
+ struct kvm_arch *ka = &vcpu->kvm->arch;
+
+ kvmclock_reset(vcpu);
+
+ if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
+ bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
+
+ if (ka->boot_vcpu_runs_old_kvmclock != tmp)
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+
+ ka->boot_vcpu_runs_old_kvmclock = tmp;
+ }
+
+ vcpu->arch.time = data;
+ kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
+
+ /* we verify if the enable bit is set... */
+ if (!(data & 1))
+ break;
+
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->arch.pv_time, data & ~1ULL,
+ sizeof(struct pvclock_vcpu_time_info)))
+ vcpu->arch.pv_time_enabled = false;
+ else
+ vcpu->arch.pv_time_enabled = true;
+
+ break;
+ }
+ case MSR_KVM_ASYNC_PF_EN:
+ if (kvm_pv_enable_async_pf(vcpu, data))
+ return 1;
+ break;
+ case MSR_KVM_STEAL_TIME:
+
+ if (unlikely(!sched_info_on()))
+ return 1;
+
+ if (data & KVM_STEAL_RESERVED_MASK)
+ return 1;
+
+ vcpu->arch.st.msr_val = data;
+
+ if (!(data & KVM_MSR_ENABLED))
+ break;
+
+ kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
+
+ break;
+ case MSR_KVM_PV_EOI_EN:
+ if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
+ return 1;
+ break;
+
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ return set_msr_mce(vcpu, msr_info);
+
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
+ pr = true; /* fall through */
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
+ if (kvm_pmu_is_valid_msr(vcpu, msr))
+ return kvm_pmu_set_msr(vcpu, msr_info);
+
+ if (pr || data != 0)
+ vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
+ "0x%x data 0x%llx\n", msr, data);
+ break;
+ case MSR_K7_CLK_CTL:
+ /*
+ * Ignore all writes to this no longer documented MSR.
+ * Writes are only relevant for old K7 processors,
+ * all pre-dating SVM, but a recommended workaround from
+ * AMD for these chips. It is possible to specify the
+ * affected processor models on the command line, hence
+ * the need to ignore the workaround.
+ */
+ break;
+ case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_CRASH_CTL:
+ case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ return kvm_hv_set_msr_common(vcpu, msr, data,
+ msr_info->host_initiated);
+ case MSR_IA32_BBL_CR_CTL3:
+ /* Drop writes to this legacy MSR -- see rdmsr
+ * counterpart for further detail.
+ */
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
+ msr, data);
+ break;
+ case MSR_AMD64_OSVW_ID_LENGTH:
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+ return 1;
+ vcpu->arch.osvw.length = data;
+ break;
+ case MSR_AMD64_OSVW_STATUS:
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+ return 1;
+ vcpu->arch.osvw.status = data;
+ break;
+ case MSR_PLATFORM_INFO:
+ if (!msr_info->host_initiated ||
+ (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
+ cpuid_fault_enabled(vcpu)))
+ return 1;
+ vcpu->arch.msr_platform_info = data;
+ break;
+ case MSR_MISC_FEATURES_ENABLES:
+ if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
+ (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
+ !supports_cpuid_fault(vcpu)))
+ return 1;
+ vcpu->arch.msr_misc_features_enables = data;
+ break;
+ default:
+ if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
+ return xen_hvm_config(vcpu, data);
+ if (kvm_pmu_is_valid_msr(vcpu, msr))
+ return kvm_pmu_set_msr(vcpu, msr_info);
+ if (!ignore_msrs) {
+ vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
+ msr, data);
+ return 1;
+ } else {
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu,
+ "ignored wrmsr: 0x%x data 0x%llx\n",
+ msr, data);
+ break;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_msr_common);
+
+
+/*
+ * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+ return kvm_x86_ops->get_msr(vcpu, msr);
+}
+EXPORT_SYMBOL_GPL(kvm_get_msr);
+
+static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
+{
+ u64 data;
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+
+ switch (msr) {
+ case MSR_IA32_P5_MC_ADDR:
+ case MSR_IA32_P5_MC_TYPE:
+ data = 0;
+ break;
+ case MSR_IA32_MCG_CAP:
+ data = vcpu->arch.mcg_cap;
+ break;
+ case MSR_IA32_MCG_CTL:
+ if (!(mcg_cap & MCG_CTL_P) && !host)
+ return 1;
+ data = vcpu->arch.mcg_ctl;
+ break;
+ case MSR_IA32_MCG_STATUS:
+ data = vcpu->arch.mcg_status;
+ break;
+ default:
+ if (msr >= MSR_IA32_MC0_CTL &&
+ msr < MSR_IA32_MCx_CTL(bank_num)) {
+ u32 offset = array_index_nospec(
+ msr - MSR_IA32_MC0_CTL,
+ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
+ data = vcpu->arch.mce_banks[offset];
+ break;
+ }
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ switch (msr_info->index) {
+ case MSR_IA32_PLATFORM_ID:
+ case MSR_IA32_EBL_CR_POWERON:
+ case MSR_IA32_DEBUGCTLMSR:
+ case MSR_IA32_LASTBRANCHFROMIP:
+ case MSR_IA32_LASTBRANCHTOIP:
+ case MSR_IA32_LASTINTFROMIP:
+ case MSR_IA32_LASTINTTOIP:
+ case MSR_K8_SYSCFG:
+ case MSR_K8_TSEG_ADDR:
+ case MSR_K8_TSEG_MASK:
+ case MSR_K7_HWCR:
+ case MSR_VM_HSAVE_PA:
+ case MSR_K8_INT_PENDING_MSG:
+ case MSR_AMD64_NB_CFG:
+ case MSR_FAM10H_MMIO_CONF_BASE:
+ case MSR_AMD64_BU_CFG2:
+ case MSR_IA32_PERF_CTL:
+ case MSR_AMD64_DC_CFG:
+ case MSR_F15H_EX_CFG:
+ msr_info->data = 0;
+ break;
+ case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
+ case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
+ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+ return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
+ msr_info->data = 0;
+ break;
+ case MSR_IA32_UCODE_REV:
+ msr_info->data = vcpu->arch.microcode_version;
+ break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+ return 1;
+ msr_info->data = vcpu->arch.arch_capabilities;
+ break;
+ case MSR_IA32_TSC:
+ msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
+ break;
+ case MSR_MTRRcap:
+ case 0x200 ... 0x2ff:
+ return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
+ case 0xcd: /* fsb frequency */
+ msr_info->data = 3;
+ break;
+ /*
+ * MSR_EBC_FREQUENCY_ID
+ * Conservative value valid for even the basic CPU models.
+ * Models 0,1: 000 in bits 23:21 indicating a bus speed of
+ * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
+ * and 266MHz for model 3, or 4. Set Core Clock
+ * Frequency to System Bus Frequency Ratio to 1 (bits
+ * 31:24) even though these are only valid for CPU
+ * models > 2, however guests may end up dividing or
+ * multiplying by zero otherwise.
+ */
+ case MSR_EBC_FREQUENCY_ID:
+ msr_info->data = 1 << 24;
+ break;
+ case MSR_IA32_APICBASE:
+ msr_info->data = kvm_get_apic_base(vcpu);
+ break;
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
+ return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
+ break;
+ case MSR_IA32_TSCDEADLINE:
+ msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
+ break;
+ case MSR_IA32_TSC_ADJUST:
+ msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
+ break;
+ case MSR_IA32_MISC_ENABLE:
+ msr_info->data = vcpu->arch.ia32_misc_enable_msr;
+ break;
+ case MSR_IA32_SMBASE:
+ if (!msr_info->host_initiated)
+ return 1;
+ msr_info->data = vcpu->arch.smbase;
+ break;
+ case MSR_SMI_COUNT:
+ msr_info->data = vcpu->arch.smi_count;
+ break;
+ case MSR_IA32_PERF_STATUS:
+ /* TSC increment by tick */
+ msr_info->data = 1000ULL;
+ /* CPU multiplier */
+ msr_info->data |= (((uint64_t)4ULL) << 40);
+ break;
+ case MSR_EFER:
+ msr_info->data = vcpu->arch.efer;
+ break;
+ case MSR_KVM_WALL_CLOCK:
+ case MSR_KVM_WALL_CLOCK_NEW:
+ msr_info->data = vcpu->kvm->arch.wall_clock;
+ break;
+ case MSR_KVM_SYSTEM_TIME:
+ case MSR_KVM_SYSTEM_TIME_NEW:
+ msr_info->data = vcpu->arch.time;
+ break;
+ case MSR_KVM_ASYNC_PF_EN:
+ msr_info->data = vcpu->arch.apf.msr_val;
+ break;
+ case MSR_KVM_STEAL_TIME:
+ msr_info->data = vcpu->arch.st.msr_val;
+ break;
+ case MSR_KVM_PV_EOI_EN:
+ msr_info->data = vcpu->arch.pv_eoi.msr_val;
+ break;
+ case MSR_IA32_P5_MC_ADDR:
+ case MSR_IA32_P5_MC_TYPE:
+ case MSR_IA32_MCG_CAP:
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
+ msr_info->host_initiated);
+ case MSR_K7_CLK_CTL:
+ /*
+ * Provide expected ramp-up count for K7. All other
+ * are set to zero, indicating minimum divisors for
+ * every field.
+ *
+ * This prevents guest kernels on AMD host with CPU
+ * type 6, model 8 and higher from exploding due to
+ * the rdmsr failing.
+ */
+ msr_info->data = 0x20000000;
+ break;
+ case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_CRASH_CTL:
+ case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ return kvm_hv_get_msr_common(vcpu,
+ msr_info->index, &msr_info->data,
+ msr_info->host_initiated);
+ break;
+ case MSR_IA32_BBL_CR_CTL3:
+ /* This legacy MSR exists but isn't fully documented in current
+ * silicon. It is however accessed by winxp in very narrow
+ * scenarios where it sets bit #19, itself documented as
+ * a "reserved" bit. Best effort attempt to source coherent
+ * read data here should the balance of the register be
+ * interpreted by the guest:
+ *
+ * L2 cache control register 3: 64GB range, 256KB size,
+ * enabled, latency 0x1, configured
+ */
+ msr_info->data = 0xbe702111;
+ break;
+ case MSR_AMD64_OSVW_ID_LENGTH:
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+ return 1;
+ msr_info->data = vcpu->arch.osvw.length;
+ break;
+ case MSR_AMD64_OSVW_STATUS:
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+ return 1;
+ msr_info->data = vcpu->arch.osvw.status;
+ break;
+ case MSR_PLATFORM_INFO:
+ if (!msr_info->host_initiated &&
+ !vcpu->kvm->arch.guest_can_read_msr_platform_info)
+ return 1;
+ msr_info->data = vcpu->arch.msr_platform_info;
+ break;
+ case MSR_MISC_FEATURES_ENABLES:
+ msr_info->data = vcpu->arch.msr_misc_features_enables;
+ break;
+ default:
+ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+ return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
+ if (!ignore_msrs) {
+ vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
+ msr_info->index);
+ return 1;
+ } else {
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
+ msr_info->index);
+ msr_info->data = 0;
+ }
+ break;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_get_msr_common);
+
+/*
+ * Read or write a bunch of msrs. All parameters are kernel addresses.
+ *
+ * @return number of msrs set successfully.
+ */
+static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
+ struct kvm_msr_entry *entries,
+ int (*do_msr)(struct kvm_vcpu *vcpu,
+ unsigned index, u64 *data))
+{
+ int i;
+
+ for (i = 0; i < msrs->nmsrs; ++i)
+ if (do_msr(vcpu, entries[i].index, &entries[i].data))
+ break;
+
+ return i;
+}
+
+/*
+ * Read or write a bunch of msrs. Parameters are user addresses.
+ *
+ * @return number of msrs set successfully.
+ */
+static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
+ int (*do_msr)(struct kvm_vcpu *vcpu,
+ unsigned index, u64 *data),
+ int writeback)
+{
+ struct kvm_msrs msrs;
+ struct kvm_msr_entry *entries;
+ int r, n;
+ unsigned size;
+
+ r = -EFAULT;
+ if (copy_from_user(&msrs, user_msrs, sizeof msrs))
+ goto out;
+
+ r = -E2BIG;
+ if (msrs.nmsrs >= MAX_IO_MSRS)
+ goto out;
+
+ size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
+ entries = memdup_user(user_msrs->entries, size);
+ if (IS_ERR(entries)) {
+ r = PTR_ERR(entries);
+ goto out;
+ }
+
+ r = n = __msr_io(vcpu, &msrs, entries, do_msr);
+ if (r < 0)
+ goto out_free;
+
+ r = -EFAULT;
+ if (writeback && copy_to_user(user_msrs->entries, entries, size))
+ goto out_free;
+
+ r = n;
+
+out_free:
+ kfree(entries);
+out:
+ return r;
+}
+
+static inline bool kvm_can_mwait_in_guest(void)
+{
+ return boot_cpu_has(X86_FEATURE_MWAIT) &&
+ !boot_cpu_has_bug(X86_BUG_MONITOR) &&
+ boot_cpu_has(X86_FEATURE_ARAT);
+}
+
+int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
+{
+ int r = 0;
+
+ switch (ext) {
+ case KVM_CAP_IRQCHIP:
+ case KVM_CAP_HLT:
+ case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
+ case KVM_CAP_SET_TSS_ADDR:
+ case KVM_CAP_EXT_CPUID:
+ case KVM_CAP_EXT_EMUL_CPUID:
+ case KVM_CAP_CLOCKSOURCE:
+ case KVM_CAP_PIT:
+ case KVM_CAP_NOP_IO_DELAY:
+ case KVM_CAP_MP_STATE:
+ case KVM_CAP_SYNC_MMU:
+ case KVM_CAP_USER_NMI:
+ case KVM_CAP_REINJECT_CONTROL:
+ case KVM_CAP_IRQ_INJECT_STATUS:
+ case KVM_CAP_IOEVENTFD:
+ case KVM_CAP_IOEVENTFD_NO_LENGTH:
+ case KVM_CAP_PIT2:
+ case KVM_CAP_PIT_STATE2:
+ case KVM_CAP_SET_IDENTITY_MAP_ADDR:
+ case KVM_CAP_XEN_HVM:
+ case KVM_CAP_VCPU_EVENTS:
+ case KVM_CAP_HYPERV:
+ case KVM_CAP_HYPERV_VAPIC:
+ case KVM_CAP_HYPERV_SPIN:
+ case KVM_CAP_HYPERV_SYNIC:
+ case KVM_CAP_HYPERV_SYNIC2:
+ case KVM_CAP_HYPERV_VP_INDEX:
+ case KVM_CAP_HYPERV_EVENTFD:
+ case KVM_CAP_HYPERV_TLBFLUSH:
+ case KVM_CAP_PCI_SEGMENT:
+ case KVM_CAP_DEBUGREGS:
+ case KVM_CAP_X86_ROBUST_SINGLESTEP:
+ case KVM_CAP_XSAVE:
+ case KVM_CAP_ASYNC_PF:
+ case KVM_CAP_GET_TSC_KHZ:
+ case KVM_CAP_KVMCLOCK_CTRL:
+ case KVM_CAP_READONLY_MEM:
+ case KVM_CAP_HYPERV_TIME:
+ case KVM_CAP_IOAPIC_POLARITY_IGNORED:
+ case KVM_CAP_TSC_DEADLINE_TIMER:
+ case KVM_CAP_ENABLE_CAP_VM:
+ case KVM_CAP_DISABLE_QUIRKS:
+ case KVM_CAP_SET_BOOT_CPU_ID:
+ case KVM_CAP_SPLIT_IRQCHIP:
+ case KVM_CAP_IMMEDIATE_EXIT:
+ case KVM_CAP_GET_MSR_FEATURES:
+ case KVM_CAP_MSR_PLATFORM_INFO:
+ r = 1;
+ break;
+ case KVM_CAP_SYNC_REGS:
+ r = KVM_SYNC_X86_VALID_FIELDS;
+ break;
+ case KVM_CAP_ADJUST_CLOCK:
+ r = KVM_CLOCK_TSC_STABLE;
+ break;
+ case KVM_CAP_X86_DISABLE_EXITS:
+ r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE;
+ if(kvm_can_mwait_in_guest())
+ r |= KVM_X86_DISABLE_EXITS_MWAIT;
+ break;
+ case KVM_CAP_X86_SMM:
+ /* SMBASE is usually relocated above 1M on modern chipsets,
+ * and SMM handlers might indeed rely on 4G segment limits,
+ * so do not report SMM to be available if real mode is
+ * emulated via vm86 mode. Still, do not go to great lengths
+ * to avoid userspace's usage of the feature, because it is a
+ * fringe case that is not enabled except via specific settings
+ * of the module parameters.
+ */
+ r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE);
+ break;
+ case KVM_CAP_VAPIC:
+ r = !kvm_x86_ops->cpu_has_accelerated_tpr();
+ break;
+ case KVM_CAP_NR_VCPUS:
+ r = KVM_SOFT_MAX_VCPUS;
+ break;
+ case KVM_CAP_MAX_VCPUS:
+ r = KVM_MAX_VCPUS;
+ break;
+ case KVM_CAP_MAX_VCPU_ID:
+ r = KVM_MAX_VCPU_ID;
+ break;
+ case KVM_CAP_NR_MEMSLOTS:
+ r = KVM_USER_MEM_SLOTS;
+ break;
+ case KVM_CAP_PV_MMU: /* obsolete */
+ r = 0;
+ break;
+ case KVM_CAP_MCE:
+ r = KVM_MAX_MCE_BANKS;
+ break;
+ case KVM_CAP_XCRS:
+ r = boot_cpu_has(X86_FEATURE_XSAVE);
+ break;
+ case KVM_CAP_TSC_CONTROL:
+ r = kvm_has_tsc_control;
+ break;
+ case KVM_CAP_X2APIC_API:
+ r = KVM_X2APIC_API_VALID_FLAGS;
+ break;
+ case KVM_CAP_NESTED_STATE:
+ r = kvm_x86_ops->get_nested_state ?
+ kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0;
+ break;
+ default:
+ break;
+ }
+ return r;
+
+}
+
+long kvm_arch_dev_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ void __user *argp = (void __user *)arg;
+ long r;
+
+ switch (ioctl) {
+ case KVM_GET_MSR_INDEX_LIST: {
+ struct kvm_msr_list __user *user_msr_list = argp;
+ struct kvm_msr_list msr_list;
+ unsigned n;
+
+ r = -EFAULT;
+ if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
+ goto out;
+ n = msr_list.nmsrs;
+ msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
+ if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
+ goto out;
+ r = -E2BIG;
+ if (n < msr_list.nmsrs)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(user_msr_list->indices, &msrs_to_save,
+ num_msrs_to_save * sizeof(u32)))
+ goto out;
+ if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
+ &emulated_msrs,
+ num_emulated_msrs * sizeof(u32)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_GET_SUPPORTED_CPUID:
+ case KVM_GET_EMULATED_CPUID: {
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ goto out;
+
+ r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
+ ioctl);
+ if (r)
+ goto out;
+
+ r = -EFAULT;
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_X86_GET_MCE_CAP_SUPPORTED: {
+ r = -EFAULT;
+ if (copy_to_user(argp, &kvm_mce_cap_supported,
+ sizeof(kvm_mce_cap_supported)))
+ goto out;
+ r = 0;
+ break;
+ case KVM_GET_MSR_FEATURE_INDEX_LIST: {
+ struct kvm_msr_list __user *user_msr_list = argp;
+ struct kvm_msr_list msr_list;
+ unsigned int n;
+
+ r = -EFAULT;
+ if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
+ goto out;
+ n = msr_list.nmsrs;
+ msr_list.nmsrs = num_msr_based_features;
+ if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
+ goto out;
+ r = -E2BIG;
+ if (n < msr_list.nmsrs)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(user_msr_list->indices, &msr_based_features,
+ num_msr_based_features * sizeof(u32)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_GET_MSRS:
+ r = msr_io(NULL, argp, do_get_msr_feature, 1);
+ break;
+ }
+ default:
+ r = -EINVAL;
+ }
+out:
+ return r;
+}
+
+static void wbinvd_ipi(void *garbage)
+{
+ wbinvd();
+}
+
+static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+ return kvm_arch_has_noncoherent_dma(vcpu->kvm);
+}
+
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ /* Address WBINVD may be executed by guest */
+ if (need_emulate_wbinvd(vcpu)) {
+ if (kvm_x86_ops->has_wbinvd_exit())
+ cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
+ else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
+ smp_call_function_single(vcpu->cpu,
+ wbinvd_ipi, NULL, 1);
+ }
+
+ kvm_x86_ops->vcpu_load(vcpu, cpu);
+
+ /* Apply any externally detected TSC adjustments (due to suspend) */
+ if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
+ adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
+ vcpu->arch.tsc_offset_adjustment = 0;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+
+ if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
+ s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
+ rdtsc() - vcpu->arch.last_host_tsc;
+ if (tsc_delta < 0)
+ mark_tsc_unstable("KVM discovered backwards TSC");
+
+ if (kvm_check_tsc_unstable()) {
+ u64 offset = kvm_compute_tsc_offset(vcpu,
+ vcpu->arch.last_guest_tsc);
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
+ vcpu->arch.tsc_catchup = 1;
+ }
+
+ if (kvm_lapic_hv_timer_in_use(vcpu))
+ kvm_lapic_restart_hv_timer(vcpu);
+
+ /*
+ * On a host with synchronized TSC, there is no need to update
+ * kvmclock on vcpu->cpu migration
+ */
+ if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
+ kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
+ if (vcpu->cpu != cpu)
+ kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
+ vcpu->cpu = cpu;
+ }
+
+ kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
+}
+
+static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
+{
+ struct kvm_host_map map;
+ struct kvm_steal_time *st;
+
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+ if (vcpu->arch.st.preempted)
+ return;
+
+ if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
+ &vcpu->arch.st.cache, true))
+ return;
+
+ st = map.hva +
+ offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+
+ st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+
+ kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+}
+
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ int idx;
+
+ if (vcpu->preempted)
+ vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
+
+ /*
+ * Disable page faults because we're in atomic context here.
+ * kvm_write_guest_offset_cached() would call might_fault()
+ * that relies on pagefault_disable() to tell if there's a
+ * bug. NOTE: the write to guest memory may not go through if
+ * during postcopy live migration or if there's heavy guest
+ * paging.
+ */
+ pagefault_disable();
+ /*
+ * kvm_memslots() will be called by
+ * kvm_write_guest_offset_cached() so take the srcu lock.
+ */
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_steal_time_set_preempted(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ pagefault_enable();
+ kvm_x86_ops->vcpu_put(vcpu);
+ vcpu->arch.last_host_tsc = rdtsc();
+ /*
+ * If userspace has set any breakpoints or watchpoints, dr6 is restored
+ * on every vmexit, but if not, we might have a stale dr6 from the
+ * guest. do_debug expects dr6 to be cleared after it runs, do the same.
+ */
+ set_debugreg(0, 6);
+}
+
+static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s)
+{
+ if (vcpu->arch.apicv_active)
+ kvm_x86_ops->sync_pir_to_irr(vcpu);
+
+ return kvm_apic_get_state(vcpu, s);
+}
+
+static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s)
+{
+ int r;
+
+ r = kvm_apic_set_state(vcpu, s);
+ if (r)
+ return r;
+ update_cr8_intercept(vcpu);
+
+ return 0;
+}
+
+static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
+{
+ /*
+ * We can accept userspace's request for interrupt injection
+ * as long as we have a place to store the interrupt number.
+ * The actual injection will happen when the CPU is able to
+ * deliver the interrupt.
+ */
+ if (kvm_cpu_has_extint(vcpu))
+ return false;
+
+ /* Acknowledging ExtINT does not happen if LINT0 is masked. */
+ return (!lapic_in_kernel(vcpu) ||
+ kvm_apic_accept_pic_intr(vcpu));
+}
+
+static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Do not cause an interrupt window exit if an exception
+ * is pending or an event needs reinjection; userspace
+ * might want to inject the interrupt manually using KVM_SET_REGS
+ * or KVM_SET_SREGS. For that to work, we must be at an
+ * instruction boundary and with no events half-injected.
+ */
+ return (kvm_arch_interrupt_allowed(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu) &&
+ !kvm_event_needs_reinjection(vcpu) &&
+ !vcpu->arch.exception.pending);
+}
+
+static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
+ struct kvm_interrupt *irq)
+{
+ if (irq->irq >= KVM_NR_INTERRUPTS)
+ return -EINVAL;
+
+ if (!irqchip_in_kernel(vcpu->kvm)) {
+ kvm_queue_interrupt(vcpu, irq->irq, false);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 0;
+ }
+
+ /*
+ * With in-kernel LAPIC, we only use this to inject EXTINT, so
+ * fail for in-kernel 8259.
+ */
+ if (pic_in_kernel(vcpu->kvm))
+ return -ENXIO;
+
+ if (vcpu->arch.pending_external_vector != -1)
+ return -EEXIST;
+
+ vcpu->arch.pending_external_vector = irq->irq;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
+{
+ kvm_inject_nmi(vcpu);
+
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
+{
+ kvm_make_request(KVM_REQ_SMI, vcpu);
+
+ return 0;
+}
+
+static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
+ struct kvm_tpr_access_ctl *tac)
+{
+ if (tac->flags)
+ return -EINVAL;
+ vcpu->arch.tpr_access_reporting = !!tac->enabled;
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
+ u64 mcg_cap)
+{
+ int r;
+ unsigned bank_num = mcg_cap & 0xff, bank;
+
+ r = -EINVAL;
+ if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
+ goto out;
+ if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
+ goto out;
+ r = 0;
+ vcpu->arch.mcg_cap = mcg_cap;
+ /* Init IA32_MCG_CTL to all 1s */
+ if (mcg_cap & MCG_CTL_P)
+ vcpu->arch.mcg_ctl = ~(u64)0;
+ /* Init IA32_MCi_CTL to all 1s */
+ for (bank = 0; bank < bank_num; bank++)
+ vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+
+ if (kvm_x86_ops->setup_mce)
+ kvm_x86_ops->setup_mce(vcpu);
+out:
+ return r;
+}
+
+static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
+ struct kvm_x86_mce *mce)
+{
+ u64 mcg_cap = vcpu->arch.mcg_cap;
+ unsigned bank_num = mcg_cap & 0xff;
+ u64 *banks = vcpu->arch.mce_banks;
+
+ if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
+ return -EINVAL;
+ /*
+ * if IA32_MCG_CTL is not all 1s, the uncorrected error
+ * reporting is disabled
+ */
+ if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
+ vcpu->arch.mcg_ctl != ~(u64)0)
+ return 0;
+ banks += 4 * mce->bank;
+ /*
+ * if IA32_MCi_CTL is not all 1s, the uncorrected error
+ * reporting is disabled for the bank
+ */
+ if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
+ return 0;
+ if (mce->status & MCI_STATUS_UC) {
+ if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
+ !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ return 0;
+ }
+ if (banks[1] & MCI_STATUS_VAL)
+ mce->status |= MCI_STATUS_OVER;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ vcpu->arch.mcg_status = mce->mcg_status;
+ banks[1] = mce->status;
+ kvm_queue_exception(vcpu, MC_VECTOR);
+ } else if (!(banks[1] & MCI_STATUS_VAL)
+ || !(banks[1] & MCI_STATUS_UC)) {
+ if (banks[1] & MCI_STATUS_VAL)
+ mce->status |= MCI_STATUS_OVER;
+ banks[2] = mce->addr;
+ banks[3] = mce->misc;
+ banks[1] = mce->status;
+ } else
+ banks[1] |= MCI_STATUS_OVER;
+ return 0;
+}
+
+static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_events *events)
+{
+ process_nmi(vcpu);
+
+ if (kvm_check_request(KVM_REQ_SMI, vcpu))
+ process_smi(vcpu);
+
+ /*
+ * FIXME: pass injected and pending separately. This is only
+ * needed for nested virtualization, whose state cannot be
+ * migrated yet. For now we can combine them.
+ */
+ events->exception.injected =
+ (vcpu->arch.exception.pending ||
+ vcpu->arch.exception.injected) &&
+ !kvm_exception_is_soft(vcpu->arch.exception.nr);
+ events->exception.nr = vcpu->arch.exception.nr;
+ events->exception.has_error_code = vcpu->arch.exception.has_error_code;
+ events->exception.pad = 0;
+ events->exception.error_code = vcpu->arch.exception.error_code;
+
+ events->interrupt.injected =
+ vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
+ events->interrupt.nr = vcpu->arch.interrupt.nr;
+ events->interrupt.soft = 0;
+ events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
+
+ events->nmi.injected = vcpu->arch.nmi_injected;
+ events->nmi.pending = vcpu->arch.nmi_pending != 0;
+ events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
+ events->nmi.pad = 0;
+
+ events->sipi_vector = 0; /* never valid when reporting to user space */
+
+ events->smi.smm = is_smm(vcpu);
+ events->smi.pending = vcpu->arch.smi_pending;
+ events->smi.smm_inside_nmi =
+ !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
+ events->smi.latched_init = kvm_lapic_latched_init(vcpu);
+
+ events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
+ | KVM_VCPUEVENT_VALID_SHADOW
+ | KVM_VCPUEVENT_VALID_SMM);
+ memset(&events->reserved, 0, sizeof(events->reserved));
+}
+
+static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags);
+
+static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_events *events)
+{
+ if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
+ | KVM_VCPUEVENT_VALID_SIPI_VECTOR
+ | KVM_VCPUEVENT_VALID_SHADOW
+ | KVM_VCPUEVENT_VALID_SMM))
+ return -EINVAL;
+
+ if (events->exception.injected &&
+ (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
+ is_guest_mode(vcpu)))
+ return -EINVAL;
+
+ /* INITs are latched while in SMM */
+ if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
+ (events->smi.smm || events->smi.pending) &&
+ vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
+ return -EINVAL;
+
+ process_nmi(vcpu);
+ vcpu->arch.exception.injected = false;
+ vcpu->arch.exception.pending = events->exception.injected;
+ vcpu->arch.exception.nr = events->exception.nr;
+ vcpu->arch.exception.has_error_code = events->exception.has_error_code;
+ vcpu->arch.exception.error_code = events->exception.error_code;
+
+ vcpu->arch.interrupt.injected = events->interrupt.injected;
+ vcpu->arch.interrupt.nr = events->interrupt.nr;
+ vcpu->arch.interrupt.soft = events->interrupt.soft;
+ if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
+ kvm_x86_ops->set_interrupt_shadow(vcpu,
+ events->interrupt.shadow);
+
+ vcpu->arch.nmi_injected = events->nmi.injected;
+ if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
+ vcpu->arch.nmi_pending = events->nmi.pending;
+ kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
+
+ if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
+ lapic_in_kernel(vcpu))
+ vcpu->arch.apic->sipi_vector = events->sipi_vector;
+
+ if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
+ u32 hflags = vcpu->arch.hflags;
+ if (events->smi.smm)
+ hflags |= HF_SMM_MASK;
+ else
+ hflags &= ~HF_SMM_MASK;
+ kvm_set_hflags(vcpu, hflags);
+
+ vcpu->arch.smi_pending = events->smi.pending;
+
+ if (events->smi.smm) {
+ if (events->smi.smm_inside_nmi)
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
+ else
+ vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
+ if (lapic_in_kernel(vcpu)) {
+ if (events->smi.latched_init)
+ set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ else
+ clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ }
+ }
+ }
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ return 0;
+}
+
+static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
+ struct kvm_debugregs *dbgregs)
+{
+ unsigned long val;
+
+ memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
+ kvm_get_dr(vcpu, 6, &val);
+ dbgregs->dr6 = val;
+ dbgregs->dr7 = vcpu->arch.dr7;
+ dbgregs->flags = 0;
+ memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
+}
+
+static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
+ struct kvm_debugregs *dbgregs)
+{
+ if (dbgregs->flags)
+ return -EINVAL;
+
+ if (dbgregs->dr6 & ~0xffffffffull)
+ return -EINVAL;
+ if (dbgregs->dr7 & ~0xffffffffull)
+ return -EINVAL;
+
+ memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+ kvm_update_dr0123(vcpu);
+ vcpu->arch.dr6 = dbgregs->dr6;
+ kvm_update_dr6(vcpu);
+ vcpu->arch.dr7 = dbgregs->dr7;
+ kvm_update_dr7(vcpu);
+
+ return 0;
+}
+
+#define XSTATE_COMPACTION_ENABLED (1ULL << 63)
+
+static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
+{
+ struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
+ u64 xstate_bv = xsave->header.xfeatures;
+ u64 valid;
+
+ /*
+ * Copy legacy XSAVE area, to avoid complications with CPUID
+ * leaves 0 and 1 in the loop below.
+ */
+ memcpy(dest, xsave, XSAVE_HDR_OFFSET);
+
+ /* Set XSTATE_BV */
+ xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE;
+ *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
+
+ /*
+ * Copy each region from the possibly compacted offset to the
+ * non-compacted offset.
+ */
+ valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
+ while (valid) {
+ u64 feature = valid & -valid;
+ int index = fls64(feature) - 1;
+ void *src = get_xsave_addr(xsave, feature);
+
+ if (src) {
+ u32 size, offset, ecx, edx;
+ cpuid_count(XSTATE_CPUID, index,
+ &size, &offset, &ecx, &edx);
+ if (feature == XFEATURE_MASK_PKRU)
+ memcpy(dest + offset, &vcpu->arch.pkru,
+ sizeof(vcpu->arch.pkru));
+ else
+ memcpy(dest + offset, src, size);
+
+ }
+
+ valid -= feature;
+ }
+}
+
+static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
+{
+ struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
+ u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
+ u64 valid;
+
+ /*
+ * Copy legacy XSAVE area, to avoid complications with CPUID
+ * leaves 0 and 1 in the loop below.
+ */
+ memcpy(xsave, src, XSAVE_HDR_OFFSET);
+
+ /* Set XSTATE_BV and possibly XCOMP_BV. */
+ xsave->header.xfeatures = xstate_bv;
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
+
+ /*
+ * Copy each region from the non-compacted offset to the
+ * possibly compacted offset.
+ */
+ valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
+ while (valid) {
+ u64 feature = valid & -valid;
+ int index = fls64(feature) - 1;
+ void *dest = get_xsave_addr(xsave, feature);
+
+ if (dest) {
+ u32 size, offset, ecx, edx;
+ cpuid_count(XSTATE_CPUID, index,
+ &size, &offset, &ecx, &edx);
+ if (feature == XFEATURE_MASK_PKRU)
+ memcpy(&vcpu->arch.pkru, src + offset,
+ sizeof(vcpu->arch.pkru));
+ else
+ memcpy(dest, src + offset, size);
+ }
+
+ valid -= feature;
+ }
+}
+
+static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
+{
+ if (boot_cpu_has(X86_FEATURE_XSAVE)) {
+ memset(guest_xsave, 0, sizeof(struct kvm_xsave));
+ fill_xsave((u8 *) guest_xsave->region, vcpu);
+ } else {
+ memcpy(guest_xsave->region,
+ &vcpu->arch.guest_fpu.state.fxsave,
+ sizeof(struct fxregs_state));
+ *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
+ XFEATURE_MASK_FPSSE;
+ }
+}
+
+#define XSAVE_MXCSR_OFFSET 24
+
+static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
+{
+ u64 xstate_bv =
+ *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
+ u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
+
+ if (boot_cpu_has(X86_FEATURE_XSAVE)) {
+ /*
+ * Here we allow setting states that are not present in
+ * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
+ * with old userspace.
+ */
+ if (xstate_bv & ~kvm_supported_xcr0() ||
+ mxcsr & ~mxcsr_feature_mask)
+ return -EINVAL;
+ load_xsave(vcpu, (u8 *)guest_xsave->region);
+ } else {
+ if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
+ mxcsr & ~mxcsr_feature_mask)
+ return -EINVAL;
+ memcpy(&vcpu->arch.guest_fpu.state.fxsave,
+ guest_xsave->region, sizeof(struct fxregs_state));
+ }
+ return 0;
+}
+
+static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
+ struct kvm_xcrs *guest_xcrs)
+{
+ if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
+ guest_xcrs->nr_xcrs = 0;
+ return;
+ }
+
+ guest_xcrs->nr_xcrs = 1;
+ guest_xcrs->flags = 0;
+ guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
+ guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
+}
+
+static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
+ struct kvm_xcrs *guest_xcrs)
+{
+ int i, r = 0;
+
+ if (!boot_cpu_has(X86_FEATURE_XSAVE))
+ return -EINVAL;
+
+ if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
+ return -EINVAL;
+
+ for (i = 0; i < guest_xcrs->nr_xcrs; i++)
+ /* Only support XCR0 currently */
+ if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
+ r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
+ guest_xcrs->xcrs[i].value);
+ break;
+ }
+ if (r)
+ r = -EINVAL;
+ return r;
+}
+
+/*
+ * kvm_set_guest_paused() indicates to the guest kernel that it has been
+ * stopped by the hypervisor. This function will be called from the host only.
+ * EINVAL is returned when the host attempts to set the flag for a guest that
+ * does not support pv clocks.
+ */
+static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu->arch.pv_time_enabled)
+ return -EINVAL;
+ vcpu->arch.pvclock_set_guest_stopped_request = true;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ return 0;
+}
+
+static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
+ struct kvm_enable_cap *cap)
+{
+ if (cap->flags)
+ return -EINVAL;
+
+ switch (cap->cap) {
+ case KVM_CAP_HYPERV_SYNIC2:
+ if (cap->args[0])
+ return -EINVAL;
+ case KVM_CAP_HYPERV_SYNIC:
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return -EINVAL;
+ return kvm_hv_activate_synic(vcpu, cap->cap ==
+ KVM_CAP_HYPERV_SYNIC2);
+ default:
+ return -EINVAL;
+ }
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ int r;
+ union {
+ struct kvm_lapic_state *lapic;
+ struct kvm_xsave *xsave;
+ struct kvm_xcrs *xcrs;
+ void *buffer;
+ } u;
+
+ vcpu_load(vcpu);
+
+ u.buffer = NULL;
+ switch (ioctl) {
+ case KVM_GET_LAPIC: {
+ r = -EINVAL;
+ if (!lapic_in_kernel(vcpu))
+ goto out;
+ u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+
+ r = -ENOMEM;
+ if (!u.lapic)
+ goto out;
+ r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_LAPIC: {
+ r = -EINVAL;
+ if (!lapic_in_kernel(vcpu))
+ goto out;
+ u.lapic = memdup_user(argp, sizeof(*u.lapic));
+ if (IS_ERR(u.lapic)) {
+ r = PTR_ERR(u.lapic);
+ goto out_nofree;
+ }
+
+ r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
+ break;
+ }
+ case KVM_INTERRUPT: {
+ struct kvm_interrupt irq;
+
+ r = -EFAULT;
+ if (copy_from_user(&irq, argp, sizeof irq))
+ goto out;
+ r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
+ break;
+ }
+ case KVM_NMI: {
+ r = kvm_vcpu_ioctl_nmi(vcpu);
+ break;
+ }
+ case KVM_SMI: {
+ r = kvm_vcpu_ioctl_smi(vcpu);
+ break;
+ }
+ case KVM_SET_CPUID: {
+ struct kvm_cpuid __user *cpuid_arg = argp;
+ struct kvm_cpuid cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ goto out;
+ r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
+ break;
+ }
+ case KVM_SET_CPUID2: {
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ goto out;
+ r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
+ cpuid_arg->entries);
+ break;
+ }
+ case KVM_GET_CPUID2: {
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ goto out;
+ r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
+ cpuid_arg->entries);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_GET_MSRS: {
+ int idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = msr_io(vcpu, argp, do_get_msr, 1);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ break;
+ }
+ case KVM_SET_MSRS: {
+ int idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = msr_io(vcpu, argp, do_set_msr, 0);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ break;
+ }
+ case KVM_TPR_ACCESS_REPORTING: {
+ struct kvm_tpr_access_ctl tac;
+
+ r = -EFAULT;
+ if (copy_from_user(&tac, argp, sizeof tac))
+ goto out;
+ r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &tac, sizeof tac))
+ goto out;
+ r = 0;
+ break;
+ };
+ case KVM_SET_VAPIC_ADDR: {
+ struct kvm_vapic_addr va;
+ int idx;
+
+ r = -EINVAL;
+ if (!lapic_in_kernel(vcpu))
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(&va, argp, sizeof va))
+ goto out;
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ break;
+ }
+ case KVM_X86_SETUP_MCE: {
+ u64 mcg_cap;
+
+ r = -EFAULT;
+ if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
+ goto out;
+ r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
+ break;
+ }
+ case KVM_X86_SET_MCE: {
+ struct kvm_x86_mce mce;
+
+ r = -EFAULT;
+ if (copy_from_user(&mce, argp, sizeof mce))
+ goto out;
+ r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+ break;
+ }
+ case KVM_GET_VCPU_EVENTS: {
+ struct kvm_vcpu_events events;
+
+ kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_VCPU_EVENTS: {
+ struct kvm_vcpu_events events;
+
+ r = -EFAULT;
+ if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
+ break;
+
+ r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
+ break;
+ }
+ case KVM_GET_DEBUGREGS: {
+ struct kvm_debugregs dbgregs;
+
+ kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &dbgregs,
+ sizeof(struct kvm_debugregs)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_DEBUGREGS: {
+ struct kvm_debugregs dbgregs;
+
+ r = -EFAULT;
+ if (copy_from_user(&dbgregs, argp,
+ sizeof(struct kvm_debugregs)))
+ break;
+
+ r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
+ break;
+ }
+ case KVM_GET_XSAVE: {
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xsave)
+ break;
+
+ kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_XSAVE: {
+ u.xsave = memdup_user(argp, sizeof(*u.xsave));
+ if (IS_ERR(u.xsave)) {
+ r = PTR_ERR(u.xsave);
+ goto out_nofree;
+ }
+
+ r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
+ break;
+ }
+ case KVM_GET_XCRS: {
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xcrs)
+ break;
+
+ kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xcrs,
+ sizeof(struct kvm_xcrs)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_XCRS: {
+ u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
+ if (IS_ERR(u.xcrs)) {
+ r = PTR_ERR(u.xcrs);
+ goto out_nofree;
+ }
+
+ r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
+ break;
+ }
+ case KVM_SET_TSC_KHZ: {
+ u32 user_tsc_khz;
+
+ r = -EINVAL;
+ user_tsc_khz = (u32)arg;
+
+ if (user_tsc_khz >= kvm_max_guest_tsc_khz)
+ goto out;
+
+ if (user_tsc_khz == 0)
+ user_tsc_khz = tsc_khz;
+
+ if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
+ r = 0;
+
+ goto out;
+ }
+ case KVM_GET_TSC_KHZ: {
+ r = vcpu->arch.virtual_tsc_khz;
+ goto out;
+ }
+ case KVM_KVMCLOCK_CTRL: {
+ r = kvm_set_guest_paused(vcpu);
+ goto out;
+ }
+ case KVM_ENABLE_CAP: {
+ struct kvm_enable_cap cap;
+
+ r = -EFAULT;
+ if (copy_from_user(&cap, argp, sizeof(cap)))
+ goto out;
+ r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
+ break;
+ }
+ case KVM_GET_NESTED_STATE: {
+ struct kvm_nested_state __user *user_kvm_nested_state = argp;
+ u32 user_data_size;
+
+ r = -EINVAL;
+ if (!kvm_x86_ops->get_nested_state)
+ break;
+
+ BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
+ r = -EFAULT;
+ if (get_user(user_data_size, &user_kvm_nested_state->size))
+ break;
+
+ r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state,
+ user_data_size);
+ if (r < 0)
+ break;
+
+ if (r > user_data_size) {
+ if (put_user(r, &user_kvm_nested_state->size))
+ r = -EFAULT;
+ else
+ r = -E2BIG;
+ break;
+ }
+
+ r = 0;
+ break;
+ }
+ case KVM_SET_NESTED_STATE: {
+ struct kvm_nested_state __user *user_kvm_nested_state = argp;
+ struct kvm_nested_state kvm_state;
+ int idx;
+
+ r = -EINVAL;
+ if (!kvm_x86_ops->set_nested_state)
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
+ break;
+
+ r = -EINVAL;
+ if (kvm_state.size < sizeof(kvm_state))
+ break;
+
+ if (kvm_state.flags &
+ ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE))
+ break;
+
+ /* nested_run_pending implies guest_mode. */
+ if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING)
+ break;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ break;
+ }
+ default:
+ r = -EINVAL;
+ }
+out:
+ kfree(u.buffer);
+out_nofree:
+ vcpu_put(vcpu);
+ return r;
+}
+
+vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+ return VM_FAULT_SIGBUS;
+}
+
+static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
+{
+ int ret;
+
+ if (addr > (unsigned int)(-3 * PAGE_SIZE))
+ return -EINVAL;
+ ret = kvm_x86_ops->set_tss_addr(kvm, addr);
+ return ret;
+}
+
+static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
+ u64 ident_addr)
+{
+ return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr);
+}
+
+static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
+ unsigned long kvm_nr_mmu_pages)
+{
+ if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
+ return -EINVAL;
+
+ mutex_lock(&kvm->slots_lock);
+
+ kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
+ kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
+
+ mutex_unlock(&kvm->slots_lock);
+ return 0;
+}
+
+static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
+{
+ return kvm->arch.n_max_mmu_pages;
+}
+
+static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ memcpy(&chip->chip.pic, &pic->pics[0],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ memcpy(&chip->chip.pic, &pic->pics[1],
+ sizeof(struct kvm_pic_state));
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ kvm_get_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
+static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
+{
+ struct kvm_pic *pic = kvm->arch.vpic;
+ int r;
+
+ r = 0;
+ switch (chip->chip_id) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[0], &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic->lock);
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ spin_lock(&pic->lock);
+ memcpy(&pic->pics[1], &chip->chip.pic,
+ sizeof(struct kvm_pic_state));
+ spin_unlock(&pic->lock);
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ kvm_set_ioapic(kvm, &chip->chip.ioapic);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ kvm_pic_update_irq(pic);
+ return r;
+}
+
+static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
+
+ BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
+
+ mutex_lock(&kps->lock);
+ memcpy(ps, &kps->channels, sizeof(*ps));
+ mutex_unlock(&kps->lock);
+ return 0;
+}
+
+static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ int i;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
+ mutex_unlock(&pit->pit_state.lock);
+ return 0;
+}
+
+static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
+ sizeof(ps->channels));
+ ps->flags = kvm->arch.vpit->pit_state.flags;
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ memset(&ps->reserved, 0, sizeof(ps->reserved));
+ return 0;
+}
+
+static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
+{
+ int start = 0;
+ int i;
+ u32 prev_legacy, cur_legacy;
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ mutex_lock(&pit->pit_state.lock);
+ prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ if (!prev_legacy && cur_legacy)
+ start = 1;
+ memcpy(&pit->pit_state.channels, &ps->channels,
+ sizeof(pit->pit_state.channels));
+ pit->pit_state.flags = ps->flags;
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
+ start && i == 0);
+ mutex_unlock(&pit->pit_state.lock);
+ return 0;
+}
+
+static int kvm_vm_ioctl_reinject(struct kvm *kvm,
+ struct kvm_reinject_control *control)
+{
+ struct kvm_pit *pit = kvm->arch.vpit;
+
+ if (!pit)
+ return -ENXIO;
+
+ /* pit->pit_state.lock was overloaded to prevent userspace from getting
+ * an inconsistent state after running multiple KVM_REINJECT_CONTROL
+ * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
+ */
+ mutex_lock(&pit->pit_state.lock);
+ kvm_pit_set_reinject(pit, control->pit_reinject);
+ mutex_unlock(&pit->pit_state.lock);
+
+ return 0;
+}
+
+/**
+ * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
+ * @kvm: kvm instance
+ * @log: slot id and address to which we copy the log
+ *
+ * Steps 1-4 below provide general overview of dirty page logging. See
+ * kvm_get_dirty_log_protect() function description for additional details.
+ *
+ * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
+ * always flush the TLB (step 4) even if previous step failed and the dirty
+ * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
+ * does not preclude user space subsequent dirty log read. Flushing TLB ensures
+ * writes will be marked dirty for next log read.
+ *
+ * 1. Take a snapshot of the bit and clear it if needed.
+ * 2. Write protect the corresponding page.
+ * 3. Copy the snapshot to the userspace.
+ * 4. Flush TLB's if needed.
+ */
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+{
+ bool is_dirty = false;
+ int r;
+
+ mutex_lock(&kvm->slots_lock);
+
+ /*
+ * Flush potentially hardware-cached dirty pages to dirty_bitmap.
+ */
+ if (kvm_x86_ops->flush_log_dirty)
+ kvm_x86_ops->flush_log_dirty(kvm);
+
+ r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
+
+ /*
+ * All the TLBs can be flushed out of mmu lock, see the comments in
+ * kvm_mmu_slot_remove_write_access().
+ */
+ lockdep_assert_held(&kvm->slots_lock);
+ if (is_dirty)
+ kvm_flush_remote_tlbs(kvm);
+
+ mutex_unlock(&kvm->slots_lock);
+ return r;
+}
+
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+ bool line_status)
+{
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event->irq, irq_event->level,
+ line_status);
+ return 0;
+}
+
+static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+ struct kvm_enable_cap *cap)
+{
+ int r;
+
+ if (cap->flags)
+ return -EINVAL;
+
+ switch (cap->cap) {
+ case KVM_CAP_DISABLE_QUIRKS:
+ kvm->arch.disabled_quirks = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_SPLIT_IRQCHIP: {
+ mutex_lock(&kvm->lock);
+ r = -EINVAL;
+ if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
+ goto split_irqchip_unlock;
+ r = -EEXIST;
+ if (irqchip_in_kernel(kvm))
+ goto split_irqchip_unlock;
+ if (kvm->created_vcpus)
+ goto split_irqchip_unlock;
+ r = kvm_setup_empty_irq_routing(kvm);
+ if (r)
+ goto split_irqchip_unlock;
+ /* Pairs with irqchip_in_kernel. */
+ smp_wmb();
+ kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
+ kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
+ r = 0;
+split_irqchip_unlock:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
+ case KVM_CAP_X2APIC_API:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
+ break;
+
+ if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
+ kvm->arch.x2apic_format = true;
+ if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+ kvm->arch.x2apic_broadcast_quirk_disabled = true;
+
+ r = 0;
+ break;
+ case KVM_CAP_X86_DISABLE_EXITS:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
+ break;
+
+ if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
+ kvm_can_mwait_in_guest())
+ kvm->arch.mwait_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
+ kvm->arch.hlt_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
+ kvm->arch.pause_in_guest = true;
+ r = 0;
+ break;
+ case KVM_CAP_MSR_PLATFORM_INFO:
+ kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
+ r = 0;
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
+long kvm_arch_vm_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm *kvm = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ int r = -ENOTTY;
+ /*
+ * This union makes it completely explicit to gcc-3.x
+ * that these two variables' stack usage should be
+ * combined, not added together.
+ */
+ union {
+ struct kvm_pit_state ps;
+ struct kvm_pit_state2 ps2;
+ struct kvm_pit_config pit_config;
+ } u;
+
+ switch (ioctl) {
+ case KVM_SET_TSS_ADDR:
+ r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
+ break;
+ case KVM_SET_IDENTITY_MAP_ADDR: {
+ u64 ident_addr;
+
+ mutex_lock(&kvm->lock);
+ r = -EINVAL;
+ if (kvm->created_vcpus)
+ goto set_identity_unlock;
+ r = -EFAULT;
+ if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
+ goto set_identity_unlock;
+ r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
+set_identity_unlock:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
+ case KVM_SET_NR_MMU_PAGES:
+ r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
+ break;
+ case KVM_GET_NR_MMU_PAGES:
+ r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
+ break;
+ case KVM_CREATE_IRQCHIP: {
+ mutex_lock(&kvm->lock);
+
+ r = -EEXIST;
+ if (irqchip_in_kernel(kvm))
+ goto create_irqchip_unlock;
+
+ r = -EINVAL;
+ if (kvm->created_vcpus)
+ goto create_irqchip_unlock;
+
+ r = kvm_pic_init(kvm);
+ if (r)
+ goto create_irqchip_unlock;
+
+ r = kvm_ioapic_init(kvm);
+ if (r) {
+ kvm_pic_destroy(kvm);
+ goto create_irqchip_unlock;
+ }
+
+ r = kvm_setup_default_irq_routing(kvm);
+ if (r) {
+ kvm_ioapic_destroy(kvm);
+ kvm_pic_destroy(kvm);
+ goto create_irqchip_unlock;
+ }
+ /* Write kvm->irq_routing before enabling irqchip_in_kernel. */
+ smp_wmb();
+ kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
+ create_irqchip_unlock:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
+ case KVM_CREATE_PIT:
+ u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
+ goto create_pit;
+ case KVM_CREATE_PIT2:
+ r = -EFAULT;
+ if (copy_from_user(&u.pit_config, argp,
+ sizeof(struct kvm_pit_config)))
+ goto out;
+ create_pit:
+ mutex_lock(&kvm->lock);
+ r = -EEXIST;
+ if (kvm->arch.vpit)
+ goto create_pit_unlock;
+ r = -ENOMEM;
+ kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
+ if (kvm->arch.vpit)
+ r = 0;
+ create_pit_unlock:
+ mutex_unlock(&kvm->lock);
+ break;
+ case KVM_GET_IRQCHIP: {
+ /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
+ struct kvm_irqchip *chip;
+
+ chip = memdup_user(argp, sizeof(*chip));
+ if (IS_ERR(chip)) {
+ r = PTR_ERR(chip);
+ goto out;
+ }
+
+ r = -ENXIO;
+ if (!irqchip_kernel(kvm))
+ goto get_irqchip_out;
+ r = kvm_vm_ioctl_get_irqchip(kvm, chip);
+ if (r)
+ goto get_irqchip_out;
+ r = -EFAULT;
+ if (copy_to_user(argp, chip, sizeof *chip))
+ goto get_irqchip_out;
+ r = 0;
+ get_irqchip_out:
+ kfree(chip);
+ break;
+ }
+ case KVM_SET_IRQCHIP: {
+ /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
+ struct kvm_irqchip *chip;
+
+ chip = memdup_user(argp, sizeof(*chip));
+ if (IS_ERR(chip)) {
+ r = PTR_ERR(chip);
+ goto out;
+ }
+
+ r = -ENXIO;
+ if (!irqchip_kernel(kvm))
+ goto set_irqchip_out;
+ r = kvm_vm_ioctl_set_irqchip(kvm, chip);
+ if (r)
+ goto set_irqchip_out;
+ r = 0;
+ set_irqchip_out:
+ kfree(chip);
+ break;
+ }
+ case KVM_GET_PIT: {
+ r = -EFAULT;
+ if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
+ goto out;
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
+ r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_PIT: {
+ r = -EFAULT;
+ if (copy_from_user(&u.ps, argp, sizeof u.ps))
+ goto out;
+ mutex_lock(&kvm->lock);
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto set_pit_out;
+ r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
+set_pit_out:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
+ case KVM_GET_PIT2: {
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
+ r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_PIT2: {
+ r = -EFAULT;
+ if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
+ goto out;
+ mutex_lock(&kvm->lock);
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto set_pit2_out;
+ r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
+set_pit2_out:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
+ case KVM_REINJECT_CONTROL: {
+ struct kvm_reinject_control control;
+ r = -EFAULT;
+ if (copy_from_user(&control, argp, sizeof(control)))
+ goto out;
+ r = kvm_vm_ioctl_reinject(kvm, &control);
+ break;
+ }
+ case KVM_SET_BOOT_CPU_ID:
+ r = 0;
+ mutex_lock(&kvm->lock);
+ if (kvm->created_vcpus)
+ r = -EBUSY;
+ else
+ kvm->arch.bsp_vcpu_id = arg;
+ mutex_unlock(&kvm->lock);
+ break;
+ case KVM_XEN_HVM_CONFIG: {
+ struct kvm_xen_hvm_config xhc;
+ r = -EFAULT;
+ if (copy_from_user(&xhc, argp, sizeof(xhc)))
+ goto out;
+ r = -EINVAL;
+ if (xhc.flags)
+ goto out;
+ memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc));
+ r = 0;
+ break;
+ }
+ case KVM_SET_CLOCK: {
+ struct kvm_clock_data user_ns;
+ u64 now_ns;
+
+ r = -EFAULT;
+ if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
+ goto out;
+
+ r = -EINVAL;
+ if (user_ns.flags)
+ goto out;
+
+ r = 0;
+ /*
+ * TODO: userspace has to take care of races with VCPU_RUN, so
+ * kvm_gen_update_masterclock() can be cut down to locked
+ * pvclock_update_vm_gtod_copy().
+ */
+ kvm_gen_update_masterclock(kvm);
+ now_ns = get_kvmclock_ns(kvm);
+ kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
+ kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
+ break;
+ }
+ case KVM_GET_CLOCK: {
+ struct kvm_clock_data user_ns;
+ u64 now_ns;
+
+ now_ns = get_kvmclock_ns(kvm);
+ user_ns.clock = now_ns;
+ user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0;
+ memset(&user_ns.pad, 0, sizeof(user_ns.pad));
+
+ r = -EFAULT;
+ if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_ENABLE_CAP: {
+ struct kvm_enable_cap cap;
+
+ r = -EFAULT;
+ if (copy_from_user(&cap, argp, sizeof(cap)))
+ goto out;
+ r = kvm_vm_ioctl_enable_cap(kvm, &cap);
+ break;
+ }
+ case KVM_MEMORY_ENCRYPT_OP: {
+ r = -ENOTTY;
+ if (kvm_x86_ops->mem_enc_op)
+ r = kvm_x86_ops->mem_enc_op(kvm, argp);
+ break;
+ }
+ case KVM_MEMORY_ENCRYPT_REG_REGION: {
+ struct kvm_enc_region region;
+
+ r = -EFAULT;
+ if (copy_from_user(&region, argp, sizeof(region)))
+ goto out;
+
+ r = -ENOTTY;
+ if (kvm_x86_ops->mem_enc_reg_region)
+ r = kvm_x86_ops->mem_enc_reg_region(kvm, &region);
+ break;
+ }
+ case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
+ struct kvm_enc_region region;
+
+ r = -EFAULT;
+ if (copy_from_user(&region, argp, sizeof(region)))
+ goto out;
+
+ r = -ENOTTY;
+ if (kvm_x86_ops->mem_enc_unreg_region)
+ r = kvm_x86_ops->mem_enc_unreg_region(kvm, &region);
+ break;
+ }
+ case KVM_HYPERV_EVENTFD: {
+ struct kvm_hyperv_eventfd hvevfd;
+
+ r = -EFAULT;
+ if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
+ goto out;
+ r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
+ break;
+ }
+ default:
+ r = -ENOTTY;
+ }
+out:
+ return r;
+}
+
+static void kvm_init_msr_list(void)
+{
+ u32 dummy[2];
+ unsigned i, j;
+
+ for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
+ if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
+ continue;
+
+ /*
+ * Even MSRs that are valid in the host may not be exposed
+ * to the guests in some cases.
+ */
+ switch (msrs_to_save[i]) {
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_mpx_supported())
+ continue;
+ break;
+ case MSR_TSC_AUX:
+ if (!kvm_x86_ops->rdtscp_supported())
+ continue;
+ break;
+ default:
+ break;
+ }
+
+ if (j < i)
+ msrs_to_save[j] = msrs_to_save[i];
+ j++;
+ }
+ num_msrs_to_save = j;
+
+ for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
+ if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
+ continue;
+
+ if (j < i)
+ emulated_msrs[j] = emulated_msrs[i];
+ j++;
+ }
+ num_emulated_msrs = j;
+
+ for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) {
+ struct kvm_msr_entry msr;
+
+ msr.index = msr_based_features[i];
+ if (kvm_get_msr_feature(&msr))
+ continue;
+
+ if (j < i)
+ msr_based_features[j] = msr_based_features[i];
+ j++;
+ }
+ num_msr_based_features = j;
+}
+
+static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
+ const void *v)
+{
+ int handled = 0;
+ int n;
+
+ do {
+ n = min(len, 8);
+ if (!(lapic_in_kernel(vcpu) &&
+ !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
+ && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
+ break;
+ handled += n;
+ addr += n;
+ len -= n;
+ v += n;
+ } while (len);
+
+ return handled;
+}
+
+static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
+{
+ int handled = 0;
+ int n;
+
+ do {
+ n = min(len, 8);
+ if (!(lapic_in_kernel(vcpu) &&
+ !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
+ addr, n, v))
+ && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
+ break;
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
+ handled += n;
+ addr += n;
+ len -= n;
+ v += n;
+ } while (len);
+
+ return handled;
+}
+
+static void kvm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ kvm_x86_ops->set_segment(vcpu, var, seg);
+}
+
+void kvm_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
+{
+ kvm_x86_ops->get_segment(vcpu, var, seg);
+}
+
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+ struct x86_exception *exception)
+{
+ gpa_t t_gpa;
+
+ BUG_ON(!mmu_is_nested(vcpu));
+
+ /* NPT walks are always user-walks */
+ access |= PFERR_USER_MASK;
+ t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, exception);
+
+ return t_gpa;
+}
+
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+}
+
+ gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ access |= PFERR_FETCH_MASK;
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+}
+
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ access |= PFERR_WRITE_MASK;
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+}
+
+/* uses this to access any guest's mapped memory without checking CPL */
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
+{
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
+}
+
+static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u32 access,
+ struct x86_exception *exception)
+{
+ void *data = val;
+ int r = X86EMUL_CONTINUE;
+
+ while (bytes) {
+ gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
+ exception);
+ unsigned offset = addr & (PAGE_SIZE-1);
+ unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
+ int ret;
+
+ if (gpa == UNMAPPED_GVA)
+ return X86EMUL_PROPAGATE_FAULT;
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
+ offset, toread);
+ if (ret < 0) {
+ r = X86EMUL_IO_NEEDED;
+ goto out;
+ }
+
+ bytes -= toread;
+ data += toread;
+ addr += toread;
+ }
+out:
+ return r;
+}
+
+/* used for instruction fetching */
+static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ unsigned offset;
+ int ret;
+
+ /* Inline kvm_read_guest_virt_helper for speed. */
+ gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK,
+ exception);
+ if (unlikely(gpa == UNMAPPED_GVA))
+ return X86EMUL_PROPAGATE_FAULT;
+
+ offset = addr & (PAGE_SIZE-1);
+ if (WARN_ON(offset + bytes > PAGE_SIZE))
+ bytes = (unsigned)PAGE_SIZE - offset;
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
+ offset, bytes);
+ if (unlikely(ret < 0))
+ return X86EMUL_IO_NEEDED;
+
+ return X86EMUL_CONTINUE;
+}
+
+int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+
+ /*
+ * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
+ * is returned, but our callers are not ready for that and they blindly
+ * call kvm_inject_page_fault. Ensure that they at least do not leak
+ * uninitialized kernel stack memory into cr2 and error code.
+ */
+ memset(exception, 0, sizeof(*exception));
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
+ exception);
+}
+EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
+
+static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception, bool system)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ u32 access = 0;
+
+ if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
+ access |= PFERR_USER_MASK;
+
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
+}
+
+static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr, void *val, unsigned int bytes)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
+
+ return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
+}
+
+static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
+ struct kvm_vcpu *vcpu, u32 access,
+ struct x86_exception *exception)
+{
+ void *data = val;
+ int r = X86EMUL_CONTINUE;
+
+ while (bytes) {
+ gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
+ access,
+ exception);
+ unsigned offset = addr & (PAGE_SIZE-1);
+ unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
+ int ret;
+
+ if (gpa == UNMAPPED_GVA)
+ return X86EMUL_PROPAGATE_FAULT;
+ ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
+ if (ret < 0) {
+ r = X86EMUL_IO_NEEDED;
+ goto out;
+ }
+
+ bytes -= towrite;
+ data += towrite;
+ addr += towrite;
+ }
+out:
+ return r;
+}
+
+static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val,
+ unsigned int bytes, struct x86_exception *exception,
+ bool system)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ u32 access = PFERR_WRITE_MASK;
+
+ if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
+ access |= PFERR_USER_MASK;
+
+ return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
+ access, exception);
+}
+
+int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
+ unsigned int bytes, struct x86_exception *exception)
+{
+ /* kvm_write_guest_virt_system can pull in tons of pages. */
+ vcpu->arch.l1tf_flush_l1d = true;
+
+ /*
+ * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
+ * is returned, but our callers are not ready for that and they blindly
+ * call kvm_inject_page_fault. Ensure that they at least do not leak
+ * uninitialized kernel stack memory into cr2 and error code.
+ */
+ memset(exception, 0, sizeof(*exception));
+ return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
+ PFERR_WRITE_MASK, exception);
+}
+EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
+
+int handle_ud(struct kvm_vcpu *vcpu)
+{
+ int emul_type = EMULTYPE_TRAP_UD;
+ enum emulation_result er;
+ char sig[5]; /* ud2; .ascii "kvm" */
+ struct x86_exception e;
+
+ if (force_emulation_prefix &&
+ kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
+ sig, sizeof(sig), &e) == 0 &&
+ memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
+ kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
+ emul_type = 0;
+ }
+
+ er = kvm_emulate_instruction(vcpu, emul_type);
+ if (er == EMULATE_USER_EXIT)
+ return 0;
+ if (er != EMULATE_DONE)
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(handle_ud);
+
+static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+ gpa_t gpa, bool write)
+{
+ /* For APIC access vmexit */
+ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ return 1;
+
+ if (vcpu_match_mmio_gpa(vcpu, gpa)) {
+ trace_vcpu_match_mmio(gva, gpa, write, true);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+ gpa_t *gpa, struct x86_exception *exception,
+ bool write)
+{
+ u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
+ | (write ? PFERR_WRITE_MASK : 0);
+
+ /*
+ * currently PKRU is only applied to ept enabled guest so
+ * there is no pkey in EPT page table for L1 guest or EPT
+ * shadow page table for L2 guest.
+ */
+ if (vcpu_match_mmio_gva(vcpu, gva)
+ && !permission_fault(vcpu, vcpu->arch.walk_mmu,
+ vcpu->arch.access, 0, access)) {
+ *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
+ (gva & (PAGE_SIZE - 1));
+ trace_vcpu_match_mmio(gva, *gpa, write, false);
+ return 1;
+ }
+
+ *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+
+ if (*gpa == UNMAPPED_GVA)
+ return -1;
+
+ return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
+}
+
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const void *val, int bytes)
+{
+ int ret;
+
+ ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
+ if (ret < 0)
+ return 0;
+ kvm_page_track_write(vcpu, gpa, val, bytes);
+ return 1;
+}
+
+struct read_write_emulator_ops {
+ int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
+ int bytes);
+ int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes);
+ int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
+ int bytes, void *val);
+ int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes);
+ bool write;
+};
+
+static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
+{
+ if (vcpu->mmio_read_completed) {
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
+ vcpu->mmio_fragments[0].gpa, val);
+ vcpu->mmio_read_completed = 0;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes)
+{
+ return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
+}
+
+static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes)
+{
+ return emulator_write_phys(vcpu, gpa, val, bytes);
+}
+
+static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
+{
+ trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
+ return vcpu_mmio_write(vcpu, gpa, bytes, val);
+}
+
+static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes)
+{
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
+ return X86EMUL_IO_NEEDED;
+}
+
+static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes)
+{
+ struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
+
+ memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
+ return X86EMUL_CONTINUE;
+}
+
+static const struct read_write_emulator_ops read_emultor = {
+ .read_write_prepare = read_prepare,
+ .read_write_emulate = read_emulate,
+ .read_write_mmio = vcpu_mmio_read,
+ .read_write_exit_mmio = read_exit_mmio,
+};
+
+static const struct read_write_emulator_ops write_emultor = {
+ .read_write_emulate = write_emulate,
+ .read_write_mmio = write_mmio,
+ .read_write_exit_mmio = write_exit_mmio,
+ .write = true,
+};
+
+static int emulator_read_write_onepage(unsigned long addr, void *val,
+ unsigned int bytes,
+ struct x86_exception *exception,
+ struct kvm_vcpu *vcpu,
+ const struct read_write_emulator_ops *ops)
+{
+ gpa_t gpa;
+ int handled, ret;
+ bool write = ops->write;
+ struct kvm_mmio_fragment *frag;
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+
+ /*
+ * If the exit was due to a NPF we may already have a GPA.
+ * If the GPA is present, use it to avoid the GVA to GPA table walk.
+ * Note, this cannot be used on string operations since string
+ * operation using rep will only have the initial GPA from the NPF
+ * occurred.
+ */
+ if (vcpu->arch.gpa_available &&
+ emulator_can_use_gpa(ctxt) &&
+ (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
+ gpa = vcpu->arch.gpa_val;
+ ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
+ } else {
+ ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
+ if (ret < 0)
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
+ if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
+ return X86EMUL_CONTINUE;
+
+ /*
+ * Is this MMIO handled locally?
+ */
+ handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
+ if (handled == bytes)
+ return X86EMUL_CONTINUE;
+
+ gpa += handled;
+ bytes -= handled;
+ val += handled;
+
+ WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
+ frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
+ frag->gpa = gpa;
+ frag->data = val;
+ frag->len = bytes;
+ return X86EMUL_CONTINUE;
+}
+
+static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ void *val, unsigned int bytes,
+ struct x86_exception *exception,
+ const struct read_write_emulator_ops *ops)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ gpa_t gpa;
+ int rc;
+
+ if (ops->read_write_prepare &&
+ ops->read_write_prepare(vcpu, val, bytes))
+ return X86EMUL_CONTINUE;
+
+ vcpu->mmio_nr_fragments = 0;
+
+ /* Crossing a page boundary? */
+ if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
+ int now;
+
+ now = -addr & ~PAGE_MASK;
+ rc = emulator_read_write_onepage(addr, val, now, exception,
+ vcpu, ops);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ addr += now;
+ if (ctxt->mode != X86EMUL_MODE_PROT64)
+ addr = (u32)addr;
+ val += now;
+ bytes -= now;
+ }
+
+ rc = emulator_read_write_onepage(addr, val, bytes, exception,
+ vcpu, ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (!vcpu->mmio_nr_fragments)
+ return rc;
+
+ gpa = vcpu->mmio_fragments[0].gpa;
+
+ vcpu->mmio_needed = 1;
+ vcpu->mmio_cur_fragment = 0;
+
+ vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
+ vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+ vcpu->run->mmio.phys_addr = gpa;
+
+ return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
+}
+
+static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ void *val,
+ unsigned int bytes,
+ struct x86_exception *exception)
+{
+ return emulator_read_write(ctxt, addr, val, bytes,
+ exception, &read_emultor);
+}
+
+static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct x86_exception *exception)
+{
+ return emulator_read_write(ctxt, addr, (void *)val, bytes,
+ exception, &write_emultor);
+}
+
+#define CMPXCHG_TYPE(t, ptr, old, new) \
+ (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
+
+#ifdef CONFIG_X86_64
+# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
+#else
+# define CMPXCHG64(ptr, old, new) \
+ (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
+#endif
+
+static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ const void *old,
+ const void *new,
+ unsigned int bytes,
+ struct x86_exception *exception)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ gpa_t gpa;
+ struct page *page;
+ char *kaddr;
+ bool exchanged;
+
+ /* guests cmpxchg8b have to be emulated atomically */
+ if (bytes > 8 || (bytes & (bytes - 1)))
+ goto emul_write;
+
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
+
+ if (gpa == UNMAPPED_GVA ||
+ (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ goto emul_write;
+
+ if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+ goto emul_write;
+
+ page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
+ if (is_error_page(page))
+ goto emul_write;
+
+ kaddr = kmap_atomic(page);
+ kaddr += offset_in_page(gpa);
+ switch (bytes) {
+ case 1:
+ exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
+ break;
+ case 2:
+ exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
+ break;
+ case 4:
+ exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
+ break;
+ case 8:
+ exchanged = CMPXCHG64(kaddr, old, new);
+ break;
+ default:
+ BUG();
+ }
+ kunmap_atomic(kaddr);
+ kvm_release_page_dirty(page);
+
+ if (!exchanged)
+ return X86EMUL_CMPXCHG_FAILED;
+
+ kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+ kvm_page_track_write(vcpu, gpa, new, bytes);
+
+ return X86EMUL_CONTINUE;
+
+emul_write:
+ printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
+
+ return emulator_write_emulated(ctxt, addr, new, bytes, exception);
+}
+
+static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
+{
+ int r = 0, i;
+
+ for (i = 0; i < vcpu->arch.pio.count; i++) {
+ if (vcpu->arch.pio.in)
+ r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
+ vcpu->arch.pio.size, pd);
+ else
+ r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
+ vcpu->arch.pio.port, vcpu->arch.pio.size,
+ pd);
+ if (r)
+ break;
+ pd += vcpu->arch.pio.size;
+ }
+ return r;
+}
+
+static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, void *val,
+ unsigned int count, bool in)
+{
+ vcpu->arch.pio.port = port;
+ vcpu->arch.pio.in = in;
+ vcpu->arch.pio.count = count;
+ vcpu->arch.pio.size = size;
+
+ if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+ vcpu->arch.pio.count = 0;
+ return 1;
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_IO;
+ vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
+ vcpu->run->io.size = size;
+ vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+ vcpu->run->io.count = count;
+ vcpu->run->io.port = port;
+
+ return 0;
+}
+
+static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, void *val,
+ unsigned int count)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int ret;
+
+ if (vcpu->arch.pio.count)
+ goto data_avail;
+
+ memset(vcpu->arch.pio_data, 0, size * count);
+
+ ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
+ if (ret) {
+data_avail:
+ memcpy(val, vcpu->arch.pio_data, size * count);
+ trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
+ vcpu->arch.pio.count = 0;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port,
+ const void *val, unsigned int count)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+ memcpy(vcpu->arch.pio_data, val, size * count);
+ trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
+ return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
+}
+
+static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ return kvm_x86_ops->get_segment_base(vcpu, seg);
+}
+
+static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
+{
+ kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
+}
+
+static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
+{
+ if (!need_emulate_wbinvd(vcpu))
+ return X86EMUL_CONTINUE;
+
+ if (kvm_x86_ops->has_wbinvd_exit()) {
+ int cpu = get_cpu();
+
+ cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
+ smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
+ wbinvd_ipi, NULL, 1);
+ put_cpu();
+ cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
+ } else
+ wbinvd();
+ return X86EMUL_CONTINUE;
+}
+
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+ kvm_emulate_wbinvd_noskip(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
+
+
+
+static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
+{
+ kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
+}
+
+static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
+ unsigned long *dest)
+{
+ return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
+}
+
+static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
+ unsigned long value)
+{
+
+ return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
+}
+
+static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+{
+ return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+}
+
+static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ unsigned long value;
+
+ switch (cr) {
+ case 0:
+ value = kvm_read_cr0(vcpu);
+ break;
+ case 2:
+ value = vcpu->arch.cr2;
+ break;
+ case 3:
+ value = kvm_read_cr3(vcpu);
+ break;
+ case 4:
+ value = kvm_read_cr4(vcpu);
+ break;
+ case 8:
+ value = kvm_get_cr8(vcpu);
+ break;
+ default:
+ kvm_err("%s: unexpected cr %u\n", __func__, cr);
+ return 0;
+ }
+
+ return value;
+}
+
+static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int res = 0;
+
+ switch (cr) {
+ case 0:
+ res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+ break;
+ case 2:
+ vcpu->arch.cr2 = val;
+ break;
+ case 3:
+ res = kvm_set_cr3(vcpu, val);
+ break;
+ case 4:
+ res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+ break;
+ case 8:
+ res = kvm_set_cr8(vcpu, val);
+ break;
+ default:
+ kvm_err("%s: unexpected cr %u\n", __func__, cr);
+ res = -1;
+ }
+
+ return res;
+}
+
+static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
+{
+ return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
+}
+
+static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
+{
+ kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
+}
+
+static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
+{
+ kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
+}
+
+static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
+{
+ kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
+}
+
+static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
+{
+ kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
+}
+
+static unsigned long emulator_get_cached_segment_base(
+ struct x86_emulate_ctxt *ctxt, int seg)
+{
+ return get_segment_base(emul_to_vcpu(ctxt), seg);
+}
+
+static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
+ struct desc_struct *desc, u32 *base3,
+ int seg)
+{
+ struct kvm_segment var;
+
+ kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
+ *selector = var.selector;
+
+ if (var.unusable) {
+ memset(desc, 0, sizeof(*desc));
+ if (base3)
+ *base3 = 0;
+ return false;
+ }
+
+ if (var.g)
+ var.limit >>= 12;
+ set_desc_limit(desc, var.limit);
+ set_desc_base(desc, (unsigned long)var.base);
+#ifdef CONFIG_X86_64
+ if (base3)
+ *base3 = var.base >> 32;
+#endif
+ desc->type = var.type;
+ desc->s = var.s;
+ desc->dpl = var.dpl;
+ desc->p = var.present;
+ desc->avl = var.avl;
+ desc->l = var.l;
+ desc->d = var.db;
+ desc->g = var.g;
+
+ return true;
+}
+
+static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
+ struct desc_struct *desc, u32 base3,
+ int seg)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ struct kvm_segment var;
+
+ var.selector = selector;
+ var.base = get_desc_base(desc);
+#ifdef CONFIG_X86_64
+ var.base |= ((u64)base3) << 32;
+#endif
+ var.limit = get_desc_limit(desc);
+ if (desc->g)
+ var.limit = (var.limit << 12) | 0xfff;
+ var.type = desc->type;
+ var.dpl = desc->dpl;
+ var.db = desc->d;
+ var.s = desc->s;
+ var.l = desc->l;
+ var.g = desc->g;
+ var.avl = desc->avl;
+ var.present = desc->p;
+ var.unusable = !var.present;
+ var.padding = 0;
+
+ kvm_set_segment(vcpu, &var, seg);
+ return;
+}
+
+static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 *pdata)
+{
+ struct msr_data msr;
+ int r;
+
+ msr.index = msr_index;
+ msr.host_initiated = false;
+ r = kvm_get_msr(emul_to_vcpu(ctxt), &msr);
+ if (r)
+ return r;
+
+ *pdata = msr.data;
+ return 0;
+}
+
+static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
+ u32 msr_index, u64 data)
+{
+ struct msr_data msr;
+
+ msr.data = data;
+ msr.index = msr_index;
+ msr.host_initiated = false;
+ return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
+}
+
+static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+ return vcpu->arch.smbase;
+}
+
+static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+ vcpu->arch.smbase = smbase;
+}
+
+static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
+ u32 pmc)
+{
+ return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
+}
+
+static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
+ u32 pmc, u64 *pdata)
+{
+ return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
+}
+
+static void emulator_halt(struct x86_emulate_ctxt *ctxt)
+{
+ emul_to_vcpu(ctxt)->arch.halt_request = 1;
+}
+
+static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage)
+{
+ return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
+}
+
+static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
+ u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit)
+{
+ return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
+}
+
+static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
+{
+ return kvm_register_read(emul_to_vcpu(ctxt), reg);
+}
+
+static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
+{
+ kvm_register_write(emul_to_vcpu(ctxt), reg, val);
+}
+
+static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
+{
+ kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
+}
+
+static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
+{
+ return emul_to_vcpu(ctxt)->arch.hflags;
+}
+
+static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
+{
+ kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags);
+}
+
+static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase);
+}
+
+static const struct x86_emulate_ops emulate_ops = {
+ .read_gpr = emulator_read_gpr,
+ .write_gpr = emulator_write_gpr,
+ .read_std = emulator_read_std,
+ .write_std = emulator_write_std,
+ .read_phys = kvm_read_guest_phys_system,
+ .fetch = kvm_fetch_guest_virt,
+ .read_emulated = emulator_read_emulated,
+ .write_emulated = emulator_write_emulated,
+ .cmpxchg_emulated = emulator_cmpxchg_emulated,
+ .invlpg = emulator_invlpg,
+ .pio_in_emulated = emulator_pio_in_emulated,
+ .pio_out_emulated = emulator_pio_out_emulated,
+ .get_segment = emulator_get_segment,
+ .set_segment = emulator_set_segment,
+ .get_cached_segment_base = emulator_get_cached_segment_base,
+ .get_gdt = emulator_get_gdt,
+ .get_idt = emulator_get_idt,
+ .set_gdt = emulator_set_gdt,
+ .set_idt = emulator_set_idt,
+ .get_cr = emulator_get_cr,
+ .set_cr = emulator_set_cr,
+ .cpl = emulator_get_cpl,
+ .get_dr = emulator_get_dr,
+ .set_dr = emulator_set_dr,
+ .get_smbase = emulator_get_smbase,
+ .set_smbase = emulator_set_smbase,
+ .set_msr = emulator_set_msr,
+ .get_msr = emulator_get_msr,
+ .check_pmc = emulator_check_pmc,
+ .read_pmc = emulator_read_pmc,
+ .halt = emulator_halt,
+ .wbinvd = emulator_wbinvd,
+ .fix_hypercall = emulator_fix_hypercall,
+ .intercept = emulator_intercept,
+ .get_cpuid = emulator_get_cpuid,
+ .set_nmi_mask = emulator_set_nmi_mask,
+ .get_hflags = emulator_get_hflags,
+ .set_hflags = emulator_set_hflags,
+ .pre_leave_smm = emulator_pre_leave_smm,
+};
+
+static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
+{
+ u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
+ /*
+ * an sti; sti; sequence only disable interrupts for the first
+ * instruction. So, if the last instruction, be it emulated or
+ * not, left the system with the INT_STI flag enabled, it
+ * means that the last instruction is an sti. We should not
+ * leave the flag on in this case. The same goes for mov ss
+ */
+ if (int_shadow & mask)
+ mask = 0;
+ if (unlikely(int_shadow || mask)) {
+ kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
+ if (!mask)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+}
+
+static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ if (ctxt->exception.vector == PF_VECTOR)
+ return kvm_propagate_fault(vcpu, &ctxt->exception);
+
+ if (ctxt->exception.error_code_valid)
+ kvm_queue_exception_e(vcpu, ctxt->exception.vector,
+ ctxt->exception.error_code);
+ else
+ kvm_queue_exception(vcpu, ctxt->exception.vector);
+ return false;
+}
+
+static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ int cs_db, cs_l;
+
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+
+ ctxt->eflags = kvm_get_rflags(vcpu);
+ ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
+
+ ctxt->eip = kvm_rip_read(vcpu);
+ ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
+ (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
+ (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
+ cs_db ? X86EMUL_MODE_PROT32 :
+ X86EMUL_MODE_PROT16;
+ BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
+ BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
+ BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
+
+ init_decode_cache(ctxt);
+ vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
+}
+
+int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
+{
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ int ret;
+
+ init_emulate_ctxt(vcpu);
+
+ ctxt->op_bytes = 2;
+ ctxt->ad_bytes = 2;
+ ctxt->_eip = ctxt->eip + inc_eip;
+ ret = emulate_int_real(ctxt, irq);
+
+ if (ret != X86EMUL_CONTINUE)
+ return EMULATE_FAIL;
+
+ ctxt->eip = ctxt->_eip;
+ kvm_rip_write(vcpu, ctxt->eip);
+ kvm_set_rflags(vcpu, ctxt->eflags);
+
+ return EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
+
+static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
+{
+ int r = EMULATE_DONE;
+
+ ++vcpu->stat.insn_emulation_fail;
+ trace_kvm_emulate_insn_failed(vcpu);
+
+ if (emulation_type & EMULTYPE_NO_UD_ON_FAIL)
+ return EMULATE_FAIL;
+
+ if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ r = EMULATE_USER_EXIT;
+ }
+
+ kvm_queue_exception(vcpu, UD_VECTOR);
+
+ return r;
+}
+
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ bool write_fault_to_shadow_pgtable,
+ int emulation_type)
+{
+ gpa_t gpa = cr2_or_gpa;
+ kvm_pfn_t pfn;
+
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
+ return false;
+
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+ return false;
+
+ if (!vcpu->arch.mmu.direct_map) {
+ /*
+ * Write permission should be allowed since only
+ * write access need to be emulated.
+ */
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
+
+ /*
+ * If the mapping is invalid in guest, let cpu retry
+ * it to generate fault.
+ */
+ if (gpa == UNMAPPED_GVA)
+ return true;
+ }
+
+ /*
+ * Do not retry the unhandleable instruction if it faults on the
+ * readonly host memory, otherwise it will goto a infinite loop:
+ * retry instruction -> write #PF -> emulation fail -> retry
+ * instruction -> ...
+ */
+ pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
+
+ /*
+ * If the instruction failed on the error pfn, it can not be fixed,
+ * report the error to userspace.
+ */
+ if (is_error_noslot_pfn(pfn))
+ return false;
+
+ kvm_release_pfn_clean(pfn);
+
+ /* The instructions are well-emulated on direct mmu. */
+ if (vcpu->arch.mmu.direct_map) {
+ unsigned int indirect_shadow_pages;
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+ indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ if (indirect_shadow_pages)
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
+ return true;
+ }
+
+ /*
+ * if emulation was due to access to shadowed page table
+ * and it failed try to unshadow page and re-enter the
+ * guest to let CPU execute the instruction.
+ */
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
+ /*
+ * If the access faults on its page table, it can not
+ * be fixed by unprotecting shadow page and it should
+ * be reported to userspace.
+ */
+ return !write_fault_to_shadow_pgtable;
+}
+
+static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
+ gpa_t cr2_or_gpa, int emulation_type)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
+
+ last_retry_eip = vcpu->arch.last_retry_eip;
+ last_retry_addr = vcpu->arch.last_retry_addr;
+
+ /*
+ * If the emulation is caused by #PF and it is non-page_table
+ * writing instruction, it means the VM-EXIT is caused by shadow
+ * page protected, we can zap the shadow page and retry this
+ * instruction directly.
+ *
+ * Note: if the guest uses a non-page-table modifying instruction
+ * on the PDE that points to the instruction, then we will unmap
+ * the instruction and go to an infinite loop. So, we cache the
+ * last retried eip and the last fault address, if we meet the eip
+ * and the address again, we can break out of the potential infinite
+ * loop.
+ */
+ vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
+
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
+ return false;
+
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+ return false;
+
+ if (x86_page_table_writing_insn(ctxt))
+ return false;
+
+ if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
+ return false;
+
+ vcpu->arch.last_retry_eip = ctxt->eip;
+ vcpu->arch.last_retry_addr = cr2_or_gpa;
+
+ if (!vcpu->arch.mmu.direct_map)
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
+
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
+ return true;
+}
+
+static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
+static int complete_emulated_pio(struct kvm_vcpu *vcpu);
+
+static void kvm_smm_changed(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
+ /* This is a good place to trace that we are exiting SMM. */
+ trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
+
+ /* Process a latched INIT or SMI, if any. */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+
+ kvm_mmu_reset_context(vcpu);
+}
+
+static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
+{
+ unsigned changed = vcpu->arch.hflags ^ emul_flags;
+
+ vcpu->arch.hflags = emul_flags;
+
+ if (changed & HF_SMM_MASK)
+ kvm_smm_changed(vcpu);
+}
+
+static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
+ unsigned long *db)
+{
+ u32 dr6 = 0;
+ int i;
+ u32 enable, rwlen;
+
+ enable = dr7;
+ rwlen = dr7 >> 16;
+ for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
+ if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
+ dr6 |= (1 << i);
+ return dr6;
+}
+
+static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+ kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
+ kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ *r = EMULATE_USER_EXIT;
+ } else {
+ /*
+ * "Certain debug exceptions may clear bit 0-3. The
+ * remaining contents of the DR6 register are never
+ * cleared by the processor".
+ */
+ vcpu->arch.dr6 &= ~15;
+ vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ }
+}
+
+int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
+ int r = EMULATE_DONE;
+
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+
+ /*
+ * rflags is the old, "raw" value of the flags. The new value has
+ * not been saved yet.
+ *
+ * This is correct even for TF set by the guest, because "the
+ * processor will not generate this exception after the instruction
+ * that sets the TF flag".
+ */
+ if (unlikely(rflags & X86_EFLAGS_TF))
+ kvm_vcpu_do_singlestep(vcpu, &r);
+ return r == EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
+
+static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
+{
+ if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
+ (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
+ struct kvm_run *kvm_run = vcpu->run;
+ unsigned long eip = kvm_get_linear_rip(vcpu);
+ u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+ vcpu->arch.guest_debug_dr7,
+ vcpu->arch.eff_db);
+
+ if (dr6 != 0) {
+ kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
+ kvm_run->debug.arch.pc = eip;
+ kvm_run->debug.arch.exception = DB_VECTOR;
+ kvm_run->exit_reason = KVM_EXIT_DEBUG;
+ *r = EMULATE_USER_EXIT;
+ return true;
+ }
+ }
+
+ if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
+ !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
+ unsigned long eip = kvm_get_linear_rip(vcpu);
+ u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+ vcpu->arch.dr7,
+ vcpu->arch.db);
+
+ if (dr6 != 0) {
+ vcpu->arch.dr6 &= ~15;
+ vcpu->arch.dr6 |= dr6 | DR6_RTM;
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ *r = EMULATE_DONE;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
+{
+ switch (ctxt->opcode_len) {
+ case 1:
+ switch (ctxt->b) {
+ case 0xe4: /* IN */
+ case 0xe5:
+ case 0xec:
+ case 0xed:
+ case 0xe6: /* OUT */
+ case 0xe7:
+ case 0xee:
+ case 0xef:
+ case 0x6c: /* INS */
+ case 0x6d:
+ case 0x6e: /* OUTS */
+ case 0x6f:
+ return true;
+ }
+ break;
+ case 2:
+ switch (ctxt->b) {
+ case 0x33: /* RDPMC */
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
+
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ int emulation_type, void *insn, int insn_len)
+{
+ int r;
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ bool writeback = true;
+ bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
+
+ vcpu->arch.l1tf_flush_l1d = true;
+
+ /*
+ * Clear write_fault_to_shadow_pgtable here to ensure it is
+ * never reused.
+ */
+ vcpu->arch.write_fault_to_shadow_pgtable = false;
+ kvm_clear_exception_queue(vcpu);
+
+ if (!(emulation_type & EMULTYPE_NO_DECODE)) {
+ init_emulate_ctxt(vcpu);
+
+ /*
+ * We will reenter on the same instruction since
+ * we do not set complete_userspace_io. This does not
+ * handle watchpoints yet, those would be handled in
+ * the emulate_ops.
+ */
+ if (!(emulation_type & EMULTYPE_SKIP) &&
+ kvm_vcpu_check_breakpoint(vcpu, &r))
+ return r;
+
+ ctxt->interruptibility = 0;
+ ctxt->have_exception = false;
+ ctxt->exception.vector = -1;
+ ctxt->perm_ok = false;
+
+ ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
+
+ r = x86_decode_insn(ctxt, insn, insn_len);
+
+ trace_kvm_emulate_insn_start(vcpu);
+ ++vcpu->stat.insn_emulation;
+ if (r != EMULATION_OK) {
+ if (emulation_type & EMULTYPE_TRAP_UD)
+ return EMULATE_FAIL;
+ if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
+ emulation_type))
+ return EMULATE_DONE;
+ if (ctxt->have_exception) {
+ /*
+ * #UD should result in just EMULATION_FAILED, and trap-like
+ * exception should not be encountered during decode.
+ */
+ WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
+ exception_type(ctxt->exception.vector) == EXCPT_TRAP);
+ inject_emulated_exception(vcpu);
+ return EMULATE_DONE;
+ }
+ if (emulation_type & EMULTYPE_SKIP)
+ return EMULATE_FAIL;
+ return handle_emulation_failure(vcpu, emulation_type);
+ }
+ }
+
+ if ((emulation_type & EMULTYPE_VMWARE) &&
+ !is_vmware_backdoor_opcode(ctxt))
+ return EMULATE_FAIL;
+
+ if (emulation_type & EMULTYPE_SKIP) {
+ kvm_rip_write(vcpu, ctxt->_eip);
+ if (ctxt->eflags & X86_EFLAGS_RF)
+ kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
+ return EMULATE_DONE;
+ }
+
+ if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
+ return EMULATE_DONE;
+
+ /* this is needed for vmware backdoor interface to work since it
+ changes registers values during IO operation */
+ if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
+ vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
+ emulator_invalidate_register_cache(ctxt);
+ }
+
+restart:
+ /* Save the faulting GPA (cr2) in the address field */
+ ctxt->exception.address = cr2_or_gpa;
+
+ r = x86_emulate_insn(ctxt);
+
+ if (r == EMULATION_INTERCEPTED)
+ return EMULATE_DONE;
+
+ if (r == EMULATION_FAILED) {
+ if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
+ emulation_type))
+ return EMULATE_DONE;
+
+ return handle_emulation_failure(vcpu, emulation_type);
+ }
+
+ if (ctxt->have_exception) {
+ r = EMULATE_DONE;
+ if (inject_emulated_exception(vcpu))
+ return r;
+ } else if (vcpu->arch.pio.count) {
+ if (!vcpu->arch.pio.in) {
+ /* FIXME: return into emulator if single-stepping. */
+ vcpu->arch.pio.count = 0;
+ } else {
+ writeback = false;
+ vcpu->arch.complete_userspace_io = complete_emulated_pio;
+ }
+ r = EMULATE_USER_EXIT;
+ } else if (vcpu->mmio_needed) {
+ if (!vcpu->mmio_is_write)
+ writeback = false;
+ r = EMULATE_USER_EXIT;
+ vcpu->arch.complete_userspace_io = complete_emulated_mmio;
+ } else if (r == EMULATION_RESTART)
+ goto restart;
+ else
+ r = EMULATE_DONE;
+
+ if (writeback) {
+ unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
+ toggle_interruptibility(vcpu, ctxt->interruptibility);
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+ if (!ctxt->have_exception ||
+ exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
+ kvm_rip_write(vcpu, ctxt->eip);
+ if (r == EMULATE_DONE && ctxt->tf)
+ kvm_vcpu_do_singlestep(vcpu, &r);
+ __kvm_set_rflags(vcpu, ctxt->eflags);
+ }
+
+ /*
+ * For STI, interrupts are shadowed; so KVM_REQ_EVENT will
+ * do nothing, and it will be requested again as soon as
+ * the shadow expires. But we still need to check here,
+ * because POPF has no interrupt shadow.
+ */
+ if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ } else
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
+
+ return r;
+}
+
+int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type)
+{
+ return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_instruction);
+
+int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
+ void *insn, int insn_len)
+{
+ return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
+
+static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pio.count = 0;
+ return 1;
+}
+
+static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pio.count = 0;
+
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
+ return 1;
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
+ unsigned short port)
+{
+ unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
+ size, port, &val, 1);
+ if (ret)
+ return ret;
+
+ /*
+ * Workaround userspace that relies on old KVM behavior of %rip being
+ * incremented prior to exiting to userspace to handle "OUT 0x7e".
+ */
+ if (port == 0x7e &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
+ vcpu->arch.complete_userspace_io =
+ complete_fast_pio_out_port_0x7e;
+ kvm_skip_emulated_instruction(vcpu);
+ } else {
+ vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io = complete_fast_pio_out;
+ }
+ return 0;
+}
+
+static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
+{
+ unsigned long val;
+
+ /* We should only ever be called with arch.pio.count equal to 1 */
+ BUG_ON(vcpu->arch.pio.count != 1);
+
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
+ vcpu->arch.pio.count = 0;
+ return 1;
+ }
+
+ /* For size less than 4 we merge, else we zero extend */
+ val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX)
+ : 0;
+
+ /*
+ * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
+ * the copy and tracing
+ */
+ emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
+ vcpu->arch.pio.port, &val, 1);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, val);
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
+ unsigned short port)
+{
+ unsigned long val;
+ int ret;
+
+ /* For size less than 4 we merge, else we zero extend */
+ val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0;
+
+ ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
+ &val, 1);
+ if (ret) {
+ kvm_register_write(vcpu, VCPU_REGS_RAX, val);
+ return ret;
+ }
+
+ vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io = complete_fast_pio_in;
+
+ return 0;
+}
+
+int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
+{
+ int ret;
+
+ if (in)
+ ret = kvm_fast_pio_in(vcpu, size, port);
+ else
+ ret = kvm_fast_pio_out(vcpu, size, port);
+ return ret && kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_fast_pio);
+
+static int kvmclock_cpu_down_prep(unsigned int cpu)
+{
+ __this_cpu_write(cpu_tsc_khz, 0);
+ return 0;
+}
+
+static void tsc_khz_changed(void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ unsigned long khz = 0;
+
+ if (data)
+ khz = freq->new;
+ else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ khz = cpufreq_quick_get(raw_smp_processor_id());
+ if (!khz)
+ khz = tsc_khz;
+ __this_cpu_write(cpu_tsc_khz, khz);
+}
+
+#ifdef CONFIG_X86_64
+static void kvm_hyperv_tsc_notifier(void)
+{
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ int cpu;
+
+ mutex_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ kvm_make_mclock_inprogress_request(kvm);
+
+ hyperv_stop_tsc_emulation();
+
+ /* TSC frequency always matches when on Hyper-V */
+ for_each_present_cpu(cpu)
+ per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+ kvm_max_guest_tsc_khz = tsc_khz;
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ struct kvm_arch *ka = &kvm->arch;
+
+ spin_lock(&ka->pvclock_gtod_sync_lock);
+
+ pvclock_update_vm_gtod_copy(kvm);
+
+ kvm_for_each_vcpu(cpu, vcpu, kvm)
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+
+ kvm_for_each_vcpu(cpu, vcpu, kvm)
+ kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
+
+ spin_unlock(&ka->pvclock_gtod_sync_lock);
+ }
+ mutex_unlock(&kvm_lock);
+}
+#endif
+
+static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ int i, send_ipi = 0;
+
+ /*
+ * We allow guests to temporarily run on slowing clocks,
+ * provided we notify them after, or to run on accelerating
+ * clocks, provided we notify them before. Thus time never
+ * goes backwards.
+ *
+ * However, we have a problem. We can't atomically update
+ * the frequency of a given CPU from this function; it is
+ * merely a notifier, which can be called from any CPU.
+ * Changing the TSC frequency at arbitrary points in time
+ * requires a recomputation of local variables related to
+ * the TSC for each VCPU. We must flag these local variables
+ * to be updated and be sure the update takes place with the
+ * new frequency before any guests proceed.
+ *
+ * Unfortunately, the combination of hotplug CPU and frequency
+ * change creates an intractable locking scenario; the order
+ * of when these callouts happen is undefined with respect to
+ * CPU hotplug, and they can race with each other. As such,
+ * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
+ * undefined; you can actually have a CPU frequency change take
+ * place in between the computation of X and the setting of the
+ * variable. To protect against this problem, all updates of
+ * the per_cpu tsc_khz variable are done in an interrupt
+ * protected IPI, and all callers wishing to update the value
+ * must wait for a synchronous IPI to complete (which is trivial
+ * if the caller is on the CPU already). This establishes the
+ * necessary total order on variable updates.
+ *
+ * Note that because a guest time update may take place
+ * anytime after the setting of the VCPU's request bit, the
+ * correct TSC value must be set before the request. However,
+ * to ensure the update actually makes it to any guest which
+ * starts running in hardware virtualization between the set
+ * and the acquisition of the spinlock, we must also ping the
+ * CPU after setting the request bit.
+ *
+ */
+
+ if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
+ return 0;
+ if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
+ return 0;
+
+ smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
+
+ mutex_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->cpu != freq->cpu)
+ continue;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ if (vcpu->cpu != raw_smp_processor_id())
+ send_ipi = 1;
+ }
+ }
+ mutex_unlock(&kvm_lock);
+
+ if (freq->old < freq->new && send_ipi) {
+ /*
+ * We upscale the frequency. Must make the guest
+ * doesn't see old kvmclock values while running with
+ * the new frequency, otherwise we risk the guest sees
+ * time go backwards.
+ *
+ * In case we update the frequency for another cpu
+ * (which might be in guest context) send an interrupt
+ * to kick the cpu out of guest context. Next time
+ * guest context is entered kvmclock will be updated,
+ * so the guest will not see stale values.
+ */
+ smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
+ }
+ return 0;
+}
+
+static struct notifier_block kvmclock_cpufreq_notifier_block = {
+ .notifier_call = kvmclock_cpufreq_notifier
+};
+
+static int kvmclock_cpu_online(unsigned int cpu)
+{
+ tsc_khz_changed(NULL);
+ return 0;
+}
+
+static void kvm_timer_init(void)
+{
+ max_tsc_khz = tsc_khz;
+
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+#ifdef CONFIG_CPU_FREQ
+ struct cpufreq_policy policy;
+ int cpu;
+
+ memset(&policy, 0, sizeof(policy));
+ cpu = get_cpu();
+ cpufreq_get_policy(&policy, cpu);
+ if (policy.cpuinfo.max_freq)
+ max_tsc_khz = policy.cpuinfo.max_freq;
+ put_cpu();
+#endif
+ cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ }
+ pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
+
+ cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
+ kvmclock_cpu_online, kvmclock_cpu_down_prep);
+}
+
+DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu);
+
+int kvm_is_in_guest(void)
+{
+ return __this_cpu_read(current_vcpu) != NULL;
+}
+
+static int kvm_is_user_mode(void)
+{
+ int user_mode = 3;
+
+ if (__this_cpu_read(current_vcpu))
+ user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
+
+ return user_mode != 0;
+}
+
+static unsigned long kvm_get_guest_ip(void)
+{
+ unsigned long ip = 0;
+
+ if (__this_cpu_read(current_vcpu))
+ ip = kvm_rip_read(__this_cpu_read(current_vcpu));
+
+ return ip;
+}
+
+static struct perf_guest_info_callbacks kvm_guest_cbs = {
+ .is_in_guest = kvm_is_in_guest,
+ .is_user_mode = kvm_is_user_mode,
+ .get_guest_ip = kvm_get_guest_ip,
+};
+
+#ifdef CONFIG_X86_64
+static void pvclock_gtod_update_fn(struct work_struct *work)
+{
+ struct kvm *kvm;
+
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ mutex_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ atomic_set(&kvm_guest_has_master_clock, 0);
+ mutex_unlock(&kvm_lock);
+}
+
+static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
+
+/*
+ * Notification about pvclock gtod data update.
+ */
+static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
+ void *priv)
+{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ struct timekeeper *tk = priv;
+
+ update_pvclock_gtod(tk);
+
+ /* disable master clock if host does not trust, or does not
+ * use, TSC based clocksource.
+ */
+ if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
+ atomic_read(&kvm_guest_has_master_clock) != 0)
+ queue_work(system_long_wq, &pvclock_gtod_work);
+
+ return 0;
+}
+
+static struct notifier_block pvclock_gtod_notifier = {
+ .notifier_call = pvclock_gtod_notify,
+};
+#endif
+
+int kvm_arch_init(void *opaque)
+{
+ int r;
+ struct kvm_x86_ops *ops = opaque;
+
+ if (kvm_x86_ops) {
+ printk(KERN_ERR "kvm: already loaded the other module\n");
+ r = -EEXIST;
+ goto out;
+ }
+
+ if (!ops->cpu_has_kvm_support()) {
+ printk(KERN_ERR "kvm: no hardware support\n");
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+ if (ops->disabled_by_bios()) {
+ printk(KERN_ERR "kvm: disabled by bios\n");
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+
+ r = -ENOMEM;
+ shared_msrs = alloc_percpu(struct kvm_shared_msrs);
+ if (!shared_msrs) {
+ printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
+ goto out;
+ }
+
+ r = kvm_mmu_module_init();
+ if (r)
+ goto out_free_percpu;
+
+ kvm_x86_ops = ops;
+
+ kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
+ PT_DIRTY_MASK, PT64_NX_MASK, 0,
+ PT_PRESENT_MASK, 0, sme_me_mask);
+ kvm_timer_init();
+
+ perf_register_guest_info_callbacks(&kvm_guest_cbs);
+
+ if (boot_cpu_has(X86_FEATURE_XSAVE))
+ host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+
+ kvm_lapic_init();
+#ifdef CONFIG_X86_64
+ pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
+
+ if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
+ set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
+#endif
+
+ return 0;
+
+out_free_percpu:
+ free_percpu(shared_msrs);
+out:
+ return r;
+}
+
+void kvm_arch_exit(void)
+{
+#ifdef CONFIG_X86_64
+ if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
+ clear_hv_tscchange_cb();
+#endif
+ kvm_lapic_exit();
+ perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
+
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
+#ifdef CONFIG_X86_64
+ pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+ cancel_work_sync(&pvclock_gtod_work);
+#endif
+ kvm_x86_ops = NULL;
+ kvm_mmu_module_exit();
+ free_percpu(shared_msrs);
+}
+
+int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.halt_exits;
+ if (lapic_in_kernel(vcpu)) {
+ vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+ return 1;
+ } else {
+ vcpu->run->exit_reason = KVM_EXIT_HLT;
+ return 0;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
+
+int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+{
+ int ret = kvm_skip_emulated_instruction(vcpu);
+ /*
+ * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
+ * KVM_EXIT_DEBUG here.
+ */
+ return kvm_vcpu_halt(vcpu) && ret;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+
+#ifdef CONFIG_X86_64
+static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
+ unsigned long clock_type)
+{
+ struct kvm_clock_pairing clock_pairing;
+ struct timespec64 ts;
+ u64 cycle;
+ int ret;
+
+ if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
+ return -KVM_EOPNOTSUPP;
+
+ if (kvm_get_walltime_and_clockread(&ts, &cycle) == false)
+ return -KVM_EOPNOTSUPP;
+
+ clock_pairing.sec = ts.tv_sec;
+ clock_pairing.nsec = ts.tv_nsec;
+ clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
+ clock_pairing.flags = 0;
+ memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad));
+
+ ret = 0;
+ if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
+ sizeof(struct kvm_clock_pairing)))
+ ret = -KVM_EFAULT;
+
+ return ret;
+}
+#endif
+
+/*
+ * kvm_pv_kick_cpu_op: Kick a vcpu.
+ *
+ * @apicid - apicid of vcpu to be kicked.
+ */
+static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
+{
+ struct kvm_lapic_irq lapic_irq;
+
+ lapic_irq.shorthand = 0;
+ lapic_irq.dest_mode = 0;
+ lapic_irq.level = 0;
+ lapic_irq.dest_id = apicid;
+ lapic_irq.msi_redir_hint = false;
+
+ lapic_irq.delivery_mode = APIC_DM_REMRD;
+ kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
+}
+
+void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.apicv_active = false;
+ kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
+}
+
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+{
+ unsigned long nr, a0, a1, a2, a3, ret;
+ int op_64_bit;
+
+ if (kvm_hv_hypercall_enabled(vcpu->kvm))
+ return kvm_hv_hypercall(vcpu);
+
+ nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
+
+ trace_kvm_hypercall(nr, a0, a1, a2, a3);
+
+ op_64_bit = is_64_bit_mode(vcpu);
+ if (!op_64_bit) {
+ nr &= 0xFFFFFFFF;
+ a0 &= 0xFFFFFFFF;
+ a1 &= 0xFFFFFFFF;
+ a2 &= 0xFFFFFFFF;
+ a3 &= 0xFFFFFFFF;
+ }
+
+ if (kvm_x86_ops->get_cpl(vcpu) != 0) {
+ ret = -KVM_EPERM;
+ goto out;
+ }
+
+ switch (nr) {
+ case KVM_HC_VAPIC_POLL_IRQ:
+ ret = 0;
+ break;
+ case KVM_HC_KICK_CPU:
+ kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
+ ret = 0;
+ break;
+#ifdef CONFIG_X86_64
+ case KVM_HC_CLOCK_PAIRING:
+ ret = kvm_pv_clock_pairing(vcpu, a0, a1);
+ break;
+#endif
+ case KVM_HC_SEND_IPI:
+ ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
+ break;
+ default:
+ ret = -KVM_ENOSYS;
+ break;
+ }
+out:
+ if (!op_64_bit)
+ ret = (u32)ret;
+ kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
+
+ ++vcpu->stat.hypercalls;
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
+
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ char instruction[3];
+ unsigned long rip = kvm_rip_read(vcpu);
+
+ kvm_x86_ops->patch_hypercall(vcpu, instruction);
+
+ return emulator_write_emulated(ctxt, rip, instruction, 3,
+ &ctxt->exception);
+}
+
+static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
+{
+ return vcpu->run->request_interrupt_window &&
+ likely(!pic_in_kernel(vcpu->kvm));
+}
+
+static void post_kvm_run_save(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *kvm_run = vcpu->run;
+
+ kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+ kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
+ kvm_run->cr8 = kvm_get_cr8(vcpu);
+ kvm_run->apic_base = kvm_get_apic_base(vcpu);
+ kvm_run->ready_for_interrupt_injection =
+ pic_in_kernel(vcpu->kvm) ||
+ kvm_vcpu_ready_for_interrupt_injection(vcpu);
+}
+
+static void update_cr8_intercept(struct kvm_vcpu *vcpu)
+{
+ int max_irr, tpr;
+
+ if (!kvm_x86_ops->update_cr8_intercept)
+ return;
+
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ if (vcpu->arch.apicv_active)
+ return;
+
+ if (!vcpu->arch.apic->vapic_addr)
+ max_irr = kvm_lapic_find_highest_irr(vcpu);
+ else
+ max_irr = -1;
+
+ if (max_irr != -1)
+ max_irr >>= 4;
+
+ tpr = kvm_lapic_get_cr8(vcpu);
+
+ kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
+}
+
+static void kvm_inject_exception(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
+ vcpu->arch.exception.error_code = false;
+ kvm_x86_ops->queue_exception(vcpu);
+}
+
+static int inject_pending_event(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ /* try to reinject previous events if any */
+
+ if (vcpu->arch.exception.injected)
+ kvm_inject_exception(vcpu);
+ /*
+ * Do not inject an NMI or interrupt if there is a pending
+ * exception. Exceptions and interrupts are recognized at
+ * instruction boundaries, i.e. the start of an instruction.
+ * Trap-like exceptions, e.g. #DB, have higher priority than
+ * NMIs and interrupts, i.e. traps are recognized before an
+ * NMI/interrupt that's pending on the same instruction.
+ * Fault-like exceptions, e.g. #GP and #PF, are the lowest
+ * priority, but are only generated (pended) during instruction
+ * execution, i.e. a pending fault-like exception means the
+ * fault occurred on the *previous* instruction and must be
+ * serviced prior to recognizing any new events in order to
+ * fully complete the previous instruction.
+ */
+ else if (!vcpu->arch.exception.pending) {
+ if (vcpu->arch.nmi_injected)
+ kvm_x86_ops->set_nmi(vcpu);
+ else if (vcpu->arch.interrupt.injected)
+ kvm_x86_ops->set_irq(vcpu);
+ }
+
+ /*
+ * Call check_nested_events() even if we reinjected a previous event
+ * in order for caller to determine if it should require immediate-exit
+ * from L2 to L1 due to pending L1 events which require exit
+ * from L2 to L1.
+ */
+ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
+ r = kvm_x86_ops->check_nested_events(vcpu);
+ if (r != 0)
+ return r;
+ }
+
+ /* try to inject new event if pending */
+ if (vcpu->arch.exception.pending) {
+ trace_kvm_inj_exception(vcpu->arch.exception.nr,
+ vcpu->arch.exception.has_error_code,
+ vcpu->arch.exception.error_code);
+
+ WARN_ON_ONCE(vcpu->arch.exception.injected);
+ vcpu->arch.exception.pending = false;
+ vcpu->arch.exception.injected = true;
+
+ if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
+ __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
+ X86_EFLAGS_RF);
+
+ if (vcpu->arch.exception.nr == DB_VECTOR &&
+ (vcpu->arch.dr7 & DR7_GD)) {
+ vcpu->arch.dr7 &= ~DR7_GD;
+ kvm_update_dr7(vcpu);
+ }
+
+ kvm_inject_exception(vcpu);
+ }
+
+ /* Don't consider new event if we re-injected an event */
+ if (kvm_event_needs_reinjection(vcpu))
+ return 0;
+
+ if (vcpu->arch.smi_pending && !is_smm(vcpu) &&
+ kvm_x86_ops->smi_allowed(vcpu)) {
+ vcpu->arch.smi_pending = false;
+ ++vcpu->arch.smi_count;
+ enter_smm(vcpu);
+ } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
+ --vcpu->arch.nmi_pending;
+ vcpu->arch.nmi_injected = true;
+ kvm_x86_ops->set_nmi(vcpu);
+ } else if (kvm_cpu_has_injectable_intr(vcpu)) {
+ /*
+ * Because interrupts can be injected asynchronously, we are
+ * calling check_nested_events again here to avoid a race condition.
+ * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
+ * proposal and current concerns. Perhaps we should be setting
+ * KVM_REQ_EVENT only on certain events and not unconditionally?
+ */
+ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
+ r = kvm_x86_ops->check_nested_events(vcpu);
+ if (r != 0)
+ return r;
+ }
+ if (kvm_x86_ops->interrupt_allowed(vcpu)) {
+ kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
+ false);
+ kvm_x86_ops->set_irq(vcpu);
+ }
+ }
+
+ return 0;
+}
+
+static void process_nmi(struct kvm_vcpu *vcpu)
+{
+ unsigned limit = 2;
+
+ /*
+ * x86 is limited to one NMI running, and one NMI pending after it.
+ * If an NMI is already in progress, limit further NMIs to just one.
+ * Otherwise, allow two (and we'll inject the first one immediately).
+ */
+ if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
+ limit = 1;
+
+ vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
+ vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
+static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
+{
+ u32 flags = 0;
+ flags |= seg->g << 23;
+ flags |= seg->db << 22;
+ flags |= seg->l << 21;
+ flags |= seg->avl << 20;
+ flags |= seg->present << 15;
+ flags |= seg->dpl << 13;
+ flags |= seg->s << 12;
+ flags |= seg->type << 8;
+ return flags;
+}
+
+static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
+{
+ struct kvm_segment seg;
+ int offset;
+
+ kvm_get_segment(vcpu, &seg, n);
+ put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector);
+
+ if (n < 3)
+ offset = 0x7f84 + n * 12;
+ else
+ offset = 0x7f2c + (n - 3) * 12;
+
+ put_smstate(u32, buf, offset + 8, seg.base);
+ put_smstate(u32, buf, offset + 4, seg.limit);
+ put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
+}
+
+#ifdef CONFIG_X86_64
+static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
+{
+ struct kvm_segment seg;
+ int offset;
+ u16 flags;
+
+ kvm_get_segment(vcpu, &seg, n);
+ offset = 0x7e00 + n * 16;
+
+ flags = enter_smm_get_segment_flags(&seg) >> 8;
+ put_smstate(u16, buf, offset, seg.selector);
+ put_smstate(u16, buf, offset + 2, flags);
+ put_smstate(u32, buf, offset + 4, seg.limit);
+ put_smstate(u64, buf, offset + 8, seg.base);
+}
+#endif
+
+static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
+{
+ struct desc_ptr dt;
+ struct kvm_segment seg;
+ unsigned long val;
+ int i;
+
+ put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu));
+ put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu));
+ put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu));
+ put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
+
+ for (i = 0; i < 8; i++)
+ put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i));
+
+ kvm_get_dr(vcpu, 6, &val);
+ put_smstate(u32, buf, 0x7fcc, (u32)val);
+ kvm_get_dr(vcpu, 7, &val);
+ put_smstate(u32, buf, 0x7fc8, (u32)val);
+
+ kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
+ put_smstate(u32, buf, 0x7fc4, seg.selector);
+ put_smstate(u32, buf, 0x7f64, seg.base);
+ put_smstate(u32, buf, 0x7f60, seg.limit);
+ put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
+
+ kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
+ put_smstate(u32, buf, 0x7fc0, seg.selector);
+ put_smstate(u32, buf, 0x7f80, seg.base);
+ put_smstate(u32, buf, 0x7f7c, seg.limit);
+ put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
+
+ kvm_x86_ops->get_gdt(vcpu, &dt);
+ put_smstate(u32, buf, 0x7f74, dt.address);
+ put_smstate(u32, buf, 0x7f70, dt.size);
+
+ kvm_x86_ops->get_idt(vcpu, &dt);
+ put_smstate(u32, buf, 0x7f58, dt.address);
+ put_smstate(u32, buf, 0x7f54, dt.size);
+
+ for (i = 0; i < 6; i++)
+ enter_smm_save_seg_32(vcpu, buf, i);
+
+ put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
+
+ /* revision id */
+ put_smstate(u32, buf, 0x7efc, 0x00020000);
+ put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
+}
+
+#ifdef CONFIG_X86_64
+static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
+{
+ struct desc_ptr dt;
+ struct kvm_segment seg;
+ unsigned long val;
+ int i;
+
+ for (i = 0; i < 16; i++)
+ put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i));
+
+ put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
+ put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
+
+ kvm_get_dr(vcpu, 6, &val);
+ put_smstate(u64, buf, 0x7f68, val);
+ kvm_get_dr(vcpu, 7, &val);
+ put_smstate(u64, buf, 0x7f60, val);
+
+ put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu));
+ put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
+ put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));
+
+ put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);
+
+ /* revision id */
+ put_smstate(u32, buf, 0x7efc, 0x00020064);
+
+ put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer);
+
+ kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
+ put_smstate(u16, buf, 0x7e90, seg.selector);
+ put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
+ put_smstate(u32, buf, 0x7e94, seg.limit);
+ put_smstate(u64, buf, 0x7e98, seg.base);
+
+ kvm_x86_ops->get_idt(vcpu, &dt);
+ put_smstate(u32, buf, 0x7e84, dt.size);
+ put_smstate(u64, buf, 0x7e88, dt.address);
+
+ kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
+ put_smstate(u16, buf, 0x7e70, seg.selector);
+ put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
+ put_smstate(u32, buf, 0x7e74, seg.limit);
+ put_smstate(u64, buf, 0x7e78, seg.base);
+
+ kvm_x86_ops->get_gdt(vcpu, &dt);
+ put_smstate(u32, buf, 0x7e64, dt.size);
+ put_smstate(u64, buf, 0x7e68, dt.address);
+
+ for (i = 0; i < 6; i++)
+ enter_smm_save_seg_64(vcpu, buf, i);
+}
+#endif
+
+static void enter_smm(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs, ds;
+ struct desc_ptr dt;
+ char buf[512];
+ u32 cr0;
+
+ trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
+ memset(buf, 0, 512);
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ enter_smm_save_state_64(vcpu, buf);
+ else
+#endif
+ enter_smm_save_state_32(vcpu, buf);
+
+ /*
+ * Give pre_enter_smm() a chance to make ISA-specific changes to the
+ * vCPU state (e.g. leave guest mode) after we've saved the state into
+ * the SMM state-save area.
+ */
+ kvm_x86_ops->pre_enter_smm(vcpu, buf);
+
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
+
+ if (kvm_x86_ops->get_nmi_mask(vcpu))
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
+ else
+ kvm_x86_ops->set_nmi_mask(vcpu, true);
+
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ kvm_rip_write(vcpu, 0x8000);
+
+ cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
+ kvm_x86_ops->set_cr0(vcpu, cr0);
+ vcpu->arch.cr0 = cr0;
+
+ kvm_x86_ops->set_cr4(vcpu, 0);
+
+ /* Undocumented: IDT limit is set to zero on entry to SMM. */
+ dt.address = dt.size = 0;
+ kvm_x86_ops->set_idt(vcpu, &dt);
+
+ __kvm_set_dr(vcpu, 7, DR7_FIXED_1);
+
+ cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
+ cs.base = vcpu->arch.smbase;
+
+ ds.selector = 0;
+ ds.base = 0;
+
+ cs.limit = ds.limit = 0xffffffff;
+ cs.type = ds.type = 0x3;
+ cs.dpl = ds.dpl = 0;
+ cs.db = ds.db = 0;
+ cs.s = ds.s = 1;
+ cs.l = ds.l = 0;
+ cs.g = ds.g = 1;
+ cs.avl = ds.avl = 0;
+ cs.present = ds.present = 1;
+ cs.unusable = ds.unusable = 0;
+ cs.padding = ds.padding = 0;
+
+ kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
+
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ kvm_x86_ops->set_efer(vcpu, 0);
+#endif
+
+ kvm_update_cpuid(vcpu);
+ kvm_mmu_reset_context(vcpu);
+}
+
+static void process_smi(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.smi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
+void kvm_make_scan_ioapic_request(struct kvm *kvm)
+{
+ kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
+}
+
+static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_apic_present(vcpu))
+ return;
+
+ bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
+
+ if (irqchip_split(vcpu->kvm))
+ kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
+ else {
+ if (vcpu->arch.apicv_active)
+ kvm_x86_ops->sync_pir_to_irr(vcpu);
+ if (ioapic_in_kernel(vcpu->kvm))
+ kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
+ }
+
+ if (is_guest_mode(vcpu))
+ vcpu->arch.load_eoi_exitmap_pending = true;
+ else
+ kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
+}
+
+static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
+{
+ u64 eoi_exit_bitmap[4];
+
+ if (!kvm_apic_hw_enabled(vcpu->arch.apic))
+ return;
+
+ bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
+ vcpu_to_synic(vcpu)->vec_bitmap, 256);
+ kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+}
+
+void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+ unsigned long start, unsigned long end)
+{
+ unsigned long apic_address;
+
+ /*
+ * The physical address of apic access page is stored in the VMCS.
+ * Update it when it becomes invalid.
+ */
+ apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+ if (start <= apic_address && apic_address < end)
+ kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+}
+
+void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
+{
+ struct page *page = NULL;
+
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ if (!kvm_x86_ops->set_apic_access_page_addr)
+ return;
+
+ page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+ if (is_error_page(page))
+ return;
+ kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
+
+ /*
+ * Do not pin apic access page in memory, the MMU notifier
+ * will call us again if it is migrated or swapped out.
+ */
+ put_page(page);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
+
+void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
+{
+ smp_send_reschedule(vcpu->cpu);
+}
+EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
+
+/*
+ * Returns 1 to let vcpu_run() continue the guest execution loop without
+ * exiting to the userspace. Otherwise, the value will be returned to the
+ * userspace.
+ */
+static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
+{
+ int r;
+ bool req_int_win =
+ dm_request_for_irq_injection(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu);
+
+ bool req_immediate_exit = false;
+
+ if (kvm_request_pending(vcpu)) {
+ if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
+ kvm_x86_ops->get_vmcs12_pages(vcpu);
+ if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
+ kvm_mmu_unload(vcpu);
+ if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
+ __kvm_migrate_timers(vcpu);
+ if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
+ kvm_gen_update_masterclock(vcpu->kvm);
+ if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
+ kvm_gen_kvmclock_update(vcpu);
+ if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
+ r = kvm_guest_time_update(vcpu);
+ if (unlikely(r))
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
+ kvm_mmu_sync_roots(vcpu);
+ if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu))
+ kvm_mmu_load_cr3(vcpu);
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+ kvm_vcpu_flush_tlb(vcpu, true);
+ if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
+ r = 0;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ vcpu->mmio_needed = 0;
+ r = 0;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
+ /* Page is swapped out. Do synthetic halt */
+ vcpu->arch.apf.halted = true;
+ r = 1;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
+ record_steal_time(vcpu);
+ if (kvm_check_request(KVM_REQ_SMI, vcpu))
+ process_smi(vcpu);
+ if (kvm_check_request(KVM_REQ_NMI, vcpu))
+ process_nmi(vcpu);
+ if (kvm_check_request(KVM_REQ_PMU, vcpu))
+ kvm_pmu_handle_event(vcpu);
+ if (kvm_check_request(KVM_REQ_PMI, vcpu))
+ kvm_pmu_deliver_pmi(vcpu);
+ if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
+ BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
+ if (test_bit(vcpu->arch.pending_ioapic_eoi,
+ vcpu->arch.ioapic_handled_vectors)) {
+ vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
+ vcpu->run->eoi.vector =
+ vcpu->arch.pending_ioapic_eoi;
+ r = 0;
+ goto out;
+ }
+ }
+ if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
+ vcpu_scan_ioapic(vcpu);
+ if (kvm_check_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu))
+ vcpu_load_eoi_exitmap(vcpu);
+ if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
+ kvm_vcpu_reload_apic_access_page(vcpu);
+ if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
+ r = 0;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
+ r = 0;
+ goto out;
+ }
+ if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+ vcpu->run->hyperv = vcpu->arch.hyperv.exit;
+ r = 0;
+ goto out;
+ }
+
+ /*
+ * KVM_REQ_HV_STIMER has to be processed after
+ * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
+ * depend on the guest clock being up-to-date
+ */
+ if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
+ kvm_hv_process_stimers(vcpu);
+ }
+
+ if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+ ++vcpu->stat.req_event;
+ kvm_apic_accept_events(vcpu);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+ r = 1;
+ goto out;
+ }
+
+ if (inject_pending_event(vcpu) != 0)
+ req_immediate_exit = true;
+ else {
+ /* Enable SMI/NMI/IRQ window open exits if needed.
+ *
+ * SMIs have three cases:
+ * 1) They can be nested, and then there is nothing to
+ * do here because RSM will cause a vmexit anyway.
+ * 2) There is an ISA-specific reason why SMI cannot be
+ * injected, and the moment when this changes can be
+ * intercepted.
+ * 3) Or the SMI can be pending because
+ * inject_pending_event has completed the injection
+ * of an IRQ or NMI from the previous vmexit, and
+ * then we request an immediate exit to inject the
+ * SMI.
+ */
+ if (vcpu->arch.smi_pending && !is_smm(vcpu))
+ if (!kvm_x86_ops->enable_smi_window(vcpu))
+ req_immediate_exit = true;
+ if (vcpu->arch.nmi_pending)
+ kvm_x86_ops->enable_nmi_window(vcpu);
+ if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
+ kvm_x86_ops->enable_irq_window(vcpu);
+ WARN_ON(vcpu->arch.exception.pending);
+ }
+
+ if (kvm_lapic_enabled(vcpu)) {
+ update_cr8_intercept(vcpu);
+ kvm_lapic_sync_to_vapic(vcpu);
+ }
+ }
+
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r)) {
+ goto cancel_injection;
+ }
+
+ preempt_disable();
+
+ kvm_x86_ops->prepare_guest_switch(vcpu);
+
+ /*
+ * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
+ * IPI are then delayed after guest entry, which ensures that they
+ * result in virtual interrupt delivery.
+ */
+ local_irq_disable();
+ vcpu->mode = IN_GUEST_MODE;
+
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+
+ /*
+ * 1) We should set ->mode before checking ->requests. Please see
+ * the comment in kvm_vcpu_exiting_guest_mode().
+ *
+ * 2) For APICv, we should set ->mode before checking PIR.ON. This
+ * pairs with the memory barrier implicit in pi_test_and_set_on
+ * (see vmx_deliver_posted_interrupt).
+ *
+ * 3) This also orders the write to mode from any reads to the page
+ * tables done while the VCPU is running. Please see the comment
+ * in kvm_flush_remote_tlbs.
+ */
+ smp_mb__after_srcu_read_unlock();
+
+ /*
+ * This handles the case where a posted interrupt was
+ * notified with kvm_vcpu_kick.
+ */
+ if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
+ kvm_x86_ops->sync_pir_to_irr(vcpu);
+
+ if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
+ || need_resched() || signal_pending(current)) {
+ vcpu->mode = OUTSIDE_GUEST_MODE;
+ smp_wmb();
+ local_irq_enable();
+ preempt_enable();
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = 1;
+ goto cancel_injection;
+ }
+
+ if (req_immediate_exit) {
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_x86_ops->request_immediate_exit(vcpu);
+ }
+
+ trace_kvm_entry(vcpu->vcpu_id);
+ if (lapic_timer_advance_ns)
+ wait_lapic_expire(vcpu);
+ guest_enter_irqoff();
+
+ if (unlikely(vcpu->arch.switch_db_regs)) {
+ set_debugreg(0, 7);
+ set_debugreg(vcpu->arch.eff_db[0], 0);
+ set_debugreg(vcpu->arch.eff_db[1], 1);
+ set_debugreg(vcpu->arch.eff_db[2], 2);
+ set_debugreg(vcpu->arch.eff_db[3], 3);
+ set_debugreg(vcpu->arch.dr6, 6);
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
+ } else if (unlikely(hw_breakpoint_active())) {
+ set_debugreg(0, 7);
+ }
+
+ kvm_x86_ops->run(vcpu);
+
+ /*
+ * Do this here before restoring debug registers on the host. And
+ * since we do this before handling the vmexit, a DR access vmexit
+ * can (a) read the correct value of the debug registers, (b) set
+ * KVM_DEBUGREG_WONT_EXIT again.
+ */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
+ WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
+ kvm_x86_ops->sync_dirty_debug_regs(vcpu);
+ kvm_update_dr0123(vcpu);
+ kvm_update_dr6(vcpu);
+ kvm_update_dr7(vcpu);
+ vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
+ }
+
+ /*
+ * If the guest has used debug registers, at least dr7
+ * will be disabled while returning to the host.
+ * If we don't have active breakpoints in the host, we don't
+ * care about the messed up debug address registers. But if
+ * we have some of them active, restore the old state.
+ */
+ if (hw_breakpoint_active())
+ hw_breakpoint_restore();
+
+ vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+
+ vcpu->mode = OUTSIDE_GUEST_MODE;
+ smp_wmb();
+
+ kvm_before_interrupt(vcpu);
+ kvm_x86_ops->handle_external_intr(vcpu);
+ kvm_after_interrupt(vcpu);
+
+ ++vcpu->stat.exits;
+
+ guest_exit_irqoff();
+
+ local_irq_enable();
+ preempt_enable();
+
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ /*
+ * Profile KVM exit RIPs:
+ */
+ if (unlikely(prof_on == KVM_PROFILING)) {
+ unsigned long rip = kvm_rip_read(vcpu);
+ profile_hit(KVM_PROFILING, (void *)rip);
+ }
+
+ if (unlikely(vcpu->arch.tsc_always_catchup))
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+
+ if (vcpu->arch.apic_attention)
+ kvm_lapic_sync_from_vapic(vcpu);
+
+ vcpu->arch.gpa_available = false;
+ r = kvm_x86_ops->handle_exit(vcpu);
+ return r;
+
+cancel_injection:
+ kvm_x86_ops->cancel_injection(vcpu);
+ if (unlikely(vcpu->arch.apic_attention))
+ kvm_lapic_sync_from_vapic(vcpu);
+out:
+ return r;
+}
+
+static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
+{
+ if (!kvm_arch_vcpu_runnable(vcpu) &&
+ (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_block(vcpu);
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+
+ if (kvm_x86_ops->post_block)
+ kvm_x86_ops->post_block(vcpu);
+
+ if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
+ return 1;
+ }
+
+ kvm_apic_accept_events(vcpu);
+ switch(vcpu->arch.mp_state) {
+ case KVM_MP_STATE_HALTED:
+ vcpu->arch.pv.pv_unhalted = false;
+ vcpu->arch.mp_state =
+ KVM_MP_STATE_RUNNABLE;
+ case KVM_MP_STATE_RUNNABLE:
+ vcpu->arch.apf.halted = false;
+ break;
+ case KVM_MP_STATE_INIT_RECEIVED:
+ break;
+ default:
+ return -EINTR;
+ break;
+ }
+ return 1;
+}
+
+static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
+ kvm_x86_ops->check_nested_events(vcpu);
+
+ return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+ !vcpu->arch.apf.halted);
+}
+
+static int vcpu_run(struct kvm_vcpu *vcpu)
+{
+ int r;
+ struct kvm *kvm = vcpu->kvm;
+
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ vcpu->arch.l1tf_flush_l1d = true;
+
+ for (;;) {
+ if (kvm_vcpu_running(vcpu)) {
+ r = vcpu_enter_guest(vcpu);
+ } else {
+ r = vcpu_block(kvm, vcpu);
+ }
+
+ if (r <= 0)
+ break;
+
+ kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
+ if (kvm_cpu_has_pending_timer(vcpu))
+ kvm_inject_pending_timer_irqs(vcpu);
+
+ if (dm_request_for_irq_injection(vcpu) &&
+ kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
+ r = 0;
+ vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
+ ++vcpu->stat.request_irq_exits;
+ break;
+ }
+
+ kvm_check_async_pf_completion(vcpu);
+
+ if (signal_pending(current)) {
+ r = -EINTR;
+ vcpu->run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.signal_exits;
+ break;
+ }
+ if (need_resched()) {
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ cond_resched();
+ vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ }
+ }
+
+ srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+
+ return r;
+}
+
+static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
+{
+ int r;
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ if (r != EMULATE_DONE)
+ return 0;
+ return 1;
+}
+
+static int complete_emulated_pio(struct kvm_vcpu *vcpu)
+{
+ BUG_ON(!vcpu->arch.pio.count);
+
+ return complete_emulated_io(vcpu);
+}
+
+/*
+ * Implements the following, as a state machine:
+ *
+ * read:
+ * for each fragment
+ * for each mmio piece in the fragment
+ * write gpa, len
+ * exit
+ * copy data
+ * execute insn
+ *
+ * write:
+ * for each fragment
+ * for each mmio piece in the fragment
+ * write gpa, len
+ * copy data
+ * exit
+ */
+static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+ struct kvm_mmio_fragment *frag;
+ unsigned len;
+
+ BUG_ON(!vcpu->mmio_needed);
+
+ /* Complete previous fragment */
+ frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
+ len = min(8u, frag->len);
+ if (!vcpu->mmio_is_write)
+ memcpy(frag->data, run->mmio.data, len);
+
+ if (frag->len <= 8) {
+ /* Switch to the next fragment. */
+ frag++;
+ vcpu->mmio_cur_fragment++;
+ } else {
+ /* Go forward to the next mmio piece. */
+ frag->data += len;
+ frag->gpa += len;
+ frag->len -= len;
+ }
+
+ if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
+ vcpu->mmio_needed = 0;
+
+ /* FIXME: return into emulator if single-stepping. */
+ if (vcpu->mmio_is_write)
+ return 1;
+ vcpu->mmio_read_completed = 1;
+ return complete_emulated_io(vcpu);
+ }
+
+ run->exit_reason = KVM_EXIT_MMIO;
+ run->mmio.phys_addr = frag->gpa;
+ if (vcpu->mmio_is_write)
+ memcpy(run->mmio.data, frag->data, min(8u, frag->len));
+ run->mmio.len = min(8u, frag->len);
+ run->mmio.is_write = vcpu->mmio_is_write;
+ vcpu->arch.complete_userspace_io = complete_emulated_mmio;
+ return 0;
+}
+
+/* Swap (qemu) user FPU context for the guest FPU context. */
+static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
+{
+ preempt_disable();
+ copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
+ /* PKRU is separately restored in kvm_x86_ops->run. */
+ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
+ ~XFEATURE_MASK_PKRU);
+ preempt_enable();
+ trace_kvm_fpu(1);
+}
+
+/* When vcpu_run ends, restore user space FPU context. */
+static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
+{
+ preempt_disable();
+ copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
+ copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
+ preempt_enable();
+ ++vcpu->stat.fpu_reload;
+ trace_kvm_fpu(0);
+}
+
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ int r;
+
+ vcpu_load(vcpu);
+ kvm_sigset_activate(vcpu);
+ kvm_load_guest_fpu(vcpu);
+
+ if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
+ if (kvm_run->immediate_exit) {
+ r = -EINTR;
+ goto out;
+ }
+ kvm_vcpu_block(vcpu);
+ kvm_apic_accept_events(vcpu);
+ kvm_clear_request(KVM_REQ_UNHALT, vcpu);
+ r = -EAGAIN;
+ if (signal_pending(current)) {
+ r = -EINTR;
+ vcpu->run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.signal_exits;
+ }
+ goto out;
+ }
+
+ if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (vcpu->run->kvm_dirty_regs) {
+ r = sync_regs(vcpu);
+ if (r != 0)
+ goto out;
+ }
+
+ /* re-sync apic's tpr */
+ if (!lapic_in_kernel(vcpu)) {
+ if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
+ r = -EINVAL;
+ goto out;
+ }
+ }
+
+ if (unlikely(vcpu->arch.complete_userspace_io)) {
+ int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
+ vcpu->arch.complete_userspace_io = NULL;
+ r = cui(vcpu);
+ if (r <= 0)
+ goto out;
+ } else
+ WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
+
+ if (kvm_run->immediate_exit)
+ r = -EINTR;
+ else
+ r = vcpu_run(vcpu);
+
+out:
+ kvm_put_guest_fpu(vcpu);
+ if (vcpu->run->kvm_valid_regs)
+ store_regs(vcpu);
+ post_kvm_run_save(vcpu);
+ kvm_sigset_deactivate(vcpu);
+
+ vcpu_put(vcpu);
+ return r;
+}
+
+static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
+ /*
+ * We are here if userspace calls get_regs() in the middle of
+ * instruction emulation. Registers state needs to be copied
+ * back from emulation context to vcpu. Userspace shouldn't do
+ * that usually, but some bad designed PV devices (vmware
+ * backdoor interface) need this to work
+ */
+ emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+ }
+ regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+ regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
+ regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+ regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+#ifdef CONFIG_X86_64
+ regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
+ regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
+ regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
+ regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
+ regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
+ regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
+ regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
+ regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
+#endif
+
+ regs->rip = kvm_rip_read(vcpu);
+ regs->rflags = kvm_get_rflags(vcpu);
+}
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu_load(vcpu);
+ __get_regs(vcpu, regs);
+ vcpu_put(vcpu);
+ return 0;
+}
+
+static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
+ vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+
+ kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
+ kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
+ kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
+ kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
+ kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
+#ifdef CONFIG_X86_64
+ kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
+ kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
+ kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
+ kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
+ kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
+ kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
+ kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
+ kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
+#endif
+
+ kvm_rip_write(vcpu, regs->rip);
+ kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
+
+ vcpu->arch.exception.pending = false;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+ vcpu_load(vcpu);
+ __set_regs(vcpu, regs);
+ vcpu_put(vcpu);
+ return 0;
+}
+
+void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+ struct kvm_segment cs;
+
+ kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ *db = cs.db;
+ *l = cs.l;
+}
+EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
+
+static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+ struct desc_ptr dt;
+
+ kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+ kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+ kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+ kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+ kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+ kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+ kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+ kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+ kvm_x86_ops->get_idt(vcpu, &dt);
+ sregs->idt.limit = dt.size;
+ sregs->idt.base = dt.address;
+ kvm_x86_ops->get_gdt(vcpu, &dt);
+ sregs->gdt.limit = dt.size;
+ sregs->gdt.base = dt.address;
+
+ sregs->cr0 = kvm_read_cr0(vcpu);
+ sregs->cr2 = vcpu->arch.cr2;
+ sregs->cr3 = kvm_read_cr3(vcpu);
+ sregs->cr4 = kvm_read_cr4(vcpu);
+ sregs->cr8 = kvm_get_cr8(vcpu);
+ sregs->efer = vcpu->arch.efer;
+ sregs->apic_base = kvm_get_apic_base(vcpu);
+
+ memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
+
+ if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
+ set_bit(vcpu->arch.interrupt.nr,
+ (unsigned long *)sregs->interrupt_bitmap);
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ vcpu_load(vcpu);
+ __get_sregs(vcpu, sregs);
+ vcpu_put(vcpu);
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ vcpu_load(vcpu);
+ if (kvm_mpx_supported())
+ kvm_load_guest_fpu(vcpu);
+
+ kvm_apic_accept_events(vcpu);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
+ vcpu->arch.pv.pv_unhalted)
+ mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
+ else
+ mp_state->mp_state = vcpu->arch.mp_state;
+
+ if (kvm_mpx_supported())
+ kvm_put_guest_fpu(vcpu);
+ vcpu_put(vcpu);
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ int ret = -EINVAL;
+
+ vcpu_load(vcpu);
+
+ if (!lapic_in_kernel(vcpu) &&
+ mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
+ goto out;
+
+ /* INITs are latched while in SMM */
+ if ((is_smm(vcpu) || vcpu->arch.smi_pending) &&
+ (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
+ mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
+ goto out;
+
+ if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
+ vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+ set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
+ } else
+ vcpu->arch.mp_state = mp_state->mp_state;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ ret = 0;
+out:
+ vcpu_put(vcpu);
+ return ret;
+}
+
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
+ int reason, bool has_error_code, u32 error_code)
+{
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ int ret;
+
+ init_emulate_ctxt(vcpu);
+
+ ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
+ has_error_code, error_code);
+
+ if (ret)
+ return EMULATE_FAIL;
+
+ kvm_rip_write(vcpu, ctxt->eip);
+ kvm_set_rflags(vcpu, ctxt->eflags);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(kvm_task_switch);
+
+static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+ if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
+ /*
+ * When EFER.LME and CR0.PG are set, the processor is in
+ * 64-bit mode (though maybe in a 32-bit code segment).
+ * CR4.PAE and EFER.LMA must be set.
+ */
+ if (!(sregs->cr4 & X86_CR4_PAE)
+ || !(sregs->efer & EFER_LMA))
+ return -EINVAL;
+ } else {
+ /*
+ * Not in 64-bit mode: EFER.LMA is clear and the code
+ * segment cannot be 64-bit.
+ */
+ if (sregs->efer & EFER_LMA || sregs->cs.l)
+ return -EINVAL;
+ }
+
+ return kvm_valid_cr4(vcpu, sregs->cr4);
+}
+
+static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+ struct msr_data apic_base_msr;
+ int mmu_reset_needed = 0;
+ int cpuid_update_needed = 0;
+ int pending_vec, max_bits, idx;
+ struct desc_ptr dt;
+ int ret = -EINVAL;
+
+ if (kvm_valid_sregs(vcpu, sregs))
+ goto out;
+
+ apic_base_msr.data = sregs->apic_base;
+ apic_base_msr.host_initiated = true;
+ if (kvm_set_apic_base(vcpu, &apic_base_msr))
+ goto out;
+
+ dt.size = sregs->idt.limit;
+ dt.address = sregs->idt.base;
+ kvm_x86_ops->set_idt(vcpu, &dt);
+ dt.size = sregs->gdt.limit;
+ dt.address = sregs->gdt.base;
+ kvm_x86_ops->set_gdt(vcpu, &dt);
+
+ vcpu->arch.cr2 = sregs->cr2;
+ mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
+ vcpu->arch.cr3 = sregs->cr3;
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+
+ kvm_set_cr8(vcpu, sregs->cr8);
+
+ mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
+ kvm_x86_ops->set_efer(vcpu, sregs->efer);
+
+ mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
+ kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
+ vcpu->arch.cr0 = sregs->cr0;
+
+ mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
+ cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
+ (X86_CR4_OSXSAVE | X86_CR4_PKE));
+ kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
+ if (cpuid_update_needed)
+ kvm_update_cpuid(vcpu);
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ if (is_pae_paging(vcpu)) {
+ load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+ mmu_reset_needed = 1;
+ }
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ if (mmu_reset_needed)
+ kvm_mmu_reset_context(vcpu);
+
+ max_bits = KVM_NR_INTERRUPTS;
+ pending_vec = find_first_bit(
+ (const unsigned long *)sregs->interrupt_bitmap, max_bits);
+ if (pending_vec < max_bits) {
+ kvm_queue_interrupt(vcpu, pending_vec, false);
+ pr_debug("Set back pending irq %d\n", pending_vec);
+ }
+
+ kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+ kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+ kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+ kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+ kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+ kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+ kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+ kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+ update_cr8_intercept(vcpu);
+
+ /* Older userspace won't unhalt the vcpu on reset. */
+ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
+ sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
+ !is_protmode(vcpu))
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
+{
+ int ret;
+
+ vcpu_load(vcpu);
+ ret = __set_sregs(vcpu, sregs);
+ vcpu_put(vcpu);
+ return ret;
+}
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+ struct kvm_guest_debug *dbg)
+{
+ unsigned long rflags;
+ int i, r;
+
+ vcpu_load(vcpu);
+
+ if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
+ r = -EBUSY;
+ if (vcpu->arch.exception.pending)
+ goto out;
+ if (dbg->control & KVM_GUESTDBG_INJECT_DB)
+ kvm_queue_exception(vcpu, DB_VECTOR);
+ else
+ kvm_queue_exception(vcpu, BP_VECTOR);
+ }
+
+ /*
+ * Read rflags as long as potentially injected trace flags are still
+ * filtered out.
+ */
+ rflags = kvm_get_rflags(vcpu);
+
+ vcpu->guest_debug = dbg->control;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
+ vcpu->guest_debug = 0;
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
+ for (i = 0; i < KVM_NR_DB_REGS; ++i)
+ vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
+ vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
+ } else {
+ for (i = 0; i < KVM_NR_DB_REGS; i++)
+ vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+ }
+ kvm_update_dr7(vcpu);
+
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
+ get_segment_base(vcpu, VCPU_SREG_CS);
+
+ /*
+ * Trigger an rflags update that will inject or remove the trace
+ * flags.
+ */
+ kvm_set_rflags(vcpu, rflags);
+
+ kvm_x86_ops->update_bp_intercept(vcpu);
+
+ r = 0;
+
+out:
+ vcpu_put(vcpu);
+ return r;
+}
+
+/*
+ * Translate a guest virtual address to a guest physical address.
+ */
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+ struct kvm_translation *tr)
+{
+ unsigned long vaddr = tr->linear_address;
+ gpa_t gpa;
+ int idx;
+
+ vcpu_load(vcpu);
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ tr->physical_address = gpa;
+ tr->valid = gpa != UNMAPPED_GVA;
+ tr->writeable = 1;
+ tr->usermode = 0;
+
+ vcpu_put(vcpu);
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ struct fxregs_state *fxsave;
+
+ vcpu_load(vcpu);
+
+ fxsave = &vcpu->arch.guest_fpu.state.fxsave;
+ memcpy(fpu->fpr, fxsave->st_space, 128);
+ fpu->fcw = fxsave->cwd;
+ fpu->fsw = fxsave->swd;
+ fpu->ftwx = fxsave->twd;
+ fpu->last_opcode = fxsave->fop;
+ fpu->last_ip = fxsave->rip;
+ fpu->last_dp = fxsave->rdp;
+ memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
+
+ vcpu_put(vcpu);
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+ struct fxregs_state *fxsave;
+
+ vcpu_load(vcpu);
+
+ fxsave = &vcpu->arch.guest_fpu.state.fxsave;
+
+ memcpy(fxsave->st_space, fpu->fpr, 128);
+ fxsave->cwd = fpu->fcw;
+ fxsave->swd = fpu->fsw;
+ fxsave->twd = fpu->ftwx;
+ fxsave->fop = fpu->last_opcode;
+ fxsave->rip = fpu->last_ip;
+ fxsave->rdp = fpu->last_dp;
+ memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
+
+ vcpu_put(vcpu);
+ return 0;
+}
+
+static void store_regs(struct kvm_vcpu *vcpu)
+{
+ BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES);
+
+ if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
+ __get_regs(vcpu, &vcpu->run->s.regs.regs);
+
+ if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
+ __get_sregs(vcpu, &vcpu->run->s.regs.sregs);
+
+ if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
+ kvm_vcpu_ioctl_x86_get_vcpu_events(
+ vcpu, &vcpu->run->s.regs.events);
+}
+
+static int sync_regs(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)
+ return -EINVAL;
+
+ if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
+ __set_regs(vcpu, &vcpu->run->s.regs.regs);
+ vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
+ }
+ if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
+ if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs))
+ return -EINVAL;
+ vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
+ }
+ if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
+ if (kvm_vcpu_ioctl_x86_set_vcpu_events(
+ vcpu, &vcpu->run->s.regs.events))
+ return -EINVAL;
+ vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
+ }
+
+ return 0;
+}
+
+static void fx_init(struct kvm_vcpu *vcpu)
+{
+ fpstate_init(&vcpu->arch.guest_fpu.state);
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv =
+ host_xcr0 | XSTATE_COMPACTION_ENABLED;
+
+ /*
+ * Ensure guest xcr0 is valid for loading
+ */
+ vcpu->arch.xcr0 = XFEATURE_MASK_FP;
+
+ vcpu->arch.cr0 |= X86_CR0_ET;
+}
+
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
+ struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
+
+ kvm_release_pfn(cache->pfn, cache->dirty, cache);
+
+ kvmclock_reset(vcpu);
+
+ kvm_x86_ops->vcpu_free(vcpu);
+ free_cpumask_var(wbinvd_dirty_mask);
+}
+
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
+ unsigned int id)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
+ printk_once(KERN_WARNING
+ "kvm: SMP vm created on host with unstable TSC; "
+ "guest TSC will not be reliable\n");
+
+ vcpu = kvm_x86_ops->vcpu_create(kvm, id);
+
+ return vcpu;
+}
+
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
+ kvm_vcpu_mtrr_init(vcpu);
+ vcpu_load(vcpu);
+ kvm_vcpu_reset(vcpu, false);
+ kvm_mmu_setup(vcpu);
+ vcpu_put(vcpu);
+ return 0;
+}
+
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+ struct msr_data msr;
+ struct kvm *kvm = vcpu->kvm;
+
+ kvm_hv_vcpu_postcreate(vcpu);
+
+ if (mutex_lock_killable(&vcpu->mutex))
+ return;
+ vcpu_load(vcpu);
+ msr.data = 0x0;
+ msr.index = MSR_IA32_TSC;
+ msr.host_initiated = true;
+ kvm_write_tsc(vcpu, &msr);
+ vcpu_put(vcpu);
+ mutex_unlock(&vcpu->mutex);
+
+ if (!kvmclock_periodic_sync)
+ return;
+
+ schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+ KVMCLOCK_SYNC_PERIOD);
+}
+
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+ kvm_arch_vcpu_free(vcpu);
+}
+
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ kvm_lapic_reset(vcpu, init_event);
+
+ vcpu->arch.hflags = 0;
+
+ vcpu->arch.smi_pending = 0;
+ vcpu->arch.smi_count = 0;
+ atomic_set(&vcpu->arch.nmi_queued, 0);
+ vcpu->arch.nmi_pending = 0;
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_interrupt_queue(vcpu);
+ kvm_clear_exception_queue(vcpu);
+ vcpu->arch.exception.pending = false;
+
+ memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
+ kvm_update_dr0123(vcpu);
+ vcpu->arch.dr6 = DR6_INIT;
+ kvm_update_dr6(vcpu);
+ vcpu->arch.dr7 = DR7_FIXED_1;
+ kvm_update_dr7(vcpu);
+
+ vcpu->arch.cr2 = 0;
+
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ vcpu->arch.apf.msr_val = 0;
+ vcpu->arch.st.msr_val = 0;
+
+ kvmclock_reset(vcpu);
+
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ vcpu->arch.apf.halted = false;
+
+ if (kvm_mpx_supported()) {
+ void *mpx_state_buffer;
+
+ /*
+ * To avoid have the INIT path from kvm_apic_has_events() that be
+ * called with loaded FPU and does not let userspace fix the state.
+ */
+ if (init_event)
+ kvm_put_guest_fpu(vcpu);
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
+ XFEATURE_MASK_BNDREGS);
+ if (mpx_state_buffer)
+ memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
+ XFEATURE_MASK_BNDCSR);
+ if (mpx_state_buffer)
+ memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
+ if (init_event)
+ kvm_load_guest_fpu(vcpu);
+ }
+
+ if (!init_event) {
+ kvm_pmu_reset(vcpu);
+ vcpu->arch.smbase = 0x30000;
+
+ vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
+ vcpu->arch.msr_misc_features_enables = 0;
+
+ vcpu->arch.xcr0 = XFEATURE_MASK_FP;
+ }
+
+ memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
+ vcpu->arch.regs_avail = ~0;
+ vcpu->arch.regs_dirty = ~0;
+
+ vcpu->arch.ia32_xss = 0;
+
+ kvm_x86_ops->vcpu_reset(vcpu, init_event);
+}
+
+void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
+{
+ struct kvm_segment cs;
+
+ kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+ cs.selector = vector << 8;
+ cs.base = vector << 12;
+ kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+ kvm_rip_write(vcpu, 0);
+}
+
+int kvm_arch_hardware_enable(void)
+{
+ struct kvm *kvm;
+ struct kvm_vcpu *vcpu;
+ int i;
+ int ret;
+ u64 local_tsc;
+ u64 max_tsc = 0;
+ bool stable, backwards_tsc = false;
+
+ kvm_shared_msr_cpu_online();
+ ret = kvm_x86_ops->hardware_enable();
+ if (ret != 0)
+ return ret;
+
+ local_tsc = rdtsc();
+ stable = !kvm_check_tsc_unstable();
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!stable && vcpu->cpu == smp_processor_id())
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ if (stable && vcpu->arch.last_host_tsc > local_tsc) {
+ backwards_tsc = true;
+ if (vcpu->arch.last_host_tsc > max_tsc)
+ max_tsc = vcpu->arch.last_host_tsc;
+ }
+ }
+ }
+
+ /*
+ * Sometimes, even reliable TSCs go backwards. This happens on
+ * platforms that reset TSC during suspend or hibernate actions, but
+ * maintain synchronization. We must compensate. Fortunately, we can
+ * detect that condition here, which happens early in CPU bringup,
+ * before any KVM threads can be running. Unfortunately, we can't
+ * bring the TSCs fully up to date with real time, as we aren't yet far
+ * enough into CPU bringup that we know how much real time has actually
+ * elapsed; our helper function, ktime_get_boot_ns() will be using boot
+ * variables that haven't been updated yet.
+ *
+ * So we simply find the maximum observed TSC above, then record the
+ * adjustment to TSC in each VCPU. When the VCPU later gets loaded,
+ * the adjustment will be applied. Note that we accumulate
+ * adjustments, in case multiple suspend cycles happen before some VCPU
+ * gets a chance to run again. In the event that no KVM threads get a
+ * chance to run, we will miss the entire elapsed period, as we'll have
+ * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
+ * loose cycle time. This isn't too big a deal, since the loss will be
+ * uniform across all VCPUs (not to mention the scenario is extremely
+ * unlikely). It is possible that a second hibernate recovery happens
+ * much faster than a first, causing the observed TSC here to be
+ * smaller; this would require additional padding adjustment, which is
+ * why we set last_host_tsc to the local tsc observed here.
+ *
+ * N.B. - this code below runs only on platforms with reliable TSC,
+ * as that is the only way backwards_tsc is set above. Also note
+ * that this runs for ALL vcpus, which is not a bug; all VCPUs should
+ * have the same delta_cyc adjustment applied if backwards_tsc
+ * is detected. Note further, this adjustment is only done once,
+ * as we reset last_host_tsc on all VCPUs to stop this from being
+ * called multiple times (one for each physical CPU bringup).
+ *
+ * Platforms with unreliable TSCs don't have to deal with this, they
+ * will be compensated by the logic in vcpu_load, which sets the TSC to
+ * catchup mode. This will catchup all VCPUs to real time, but cannot
+ * guarantee that they stay in perfect synchronization.
+ */
+ if (backwards_tsc) {
+ u64 delta_cyc = max_tsc - local_tsc;
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ kvm->arch.backwards_tsc_observed = true;
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ vcpu->arch.tsc_offset_adjustment += delta_cyc;
+ vcpu->arch.last_host_tsc = local_tsc;
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ }
+
+ /*
+ * We have to disable TSC offset matching.. if you were
+ * booting a VM while issuing an S4 host suspend....
+ * you may have some problem. Solving this issue is
+ * left as an exercise to the reader.
+ */
+ kvm->arch.last_tsc_nsec = 0;
+ kvm->arch.last_tsc_write = 0;
+ }
+
+ }
+ return 0;
+}
+
+void kvm_arch_hardware_disable(void)
+{
+ kvm_x86_ops->hardware_disable();
+ drop_user_return_notifiers();
+}
+
+int kvm_arch_hardware_setup(void)
+{
+ int r;
+
+ r = kvm_x86_ops->hardware_setup();
+ if (r != 0)
+ return r;
+
+ cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
+
+ if (kvm_has_tsc_control) {
+ /*
+ * Make sure the user can only configure tsc_khz values that
+ * fit into a signed integer.
+ * A min value is not calculated because it will always
+ * be 1 on all machines.
+ */
+ u64 max = min(0x7fffffffULL,
+ __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
+ kvm_max_guest_tsc_khz = max;
+
+ kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
+ }
+
+ kvm_init_msr_list();
+ return 0;
+}
+
+void kvm_arch_hardware_unsetup(void)
+{
+ kvm_x86_ops->hardware_unsetup();
+}
+
+void kvm_arch_check_processor_compat(void *rtn)
+{
+ kvm_x86_ops->check_processor_compatibility(rtn);
+}
+
+bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
+{
+ return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp);
+
+bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
+{
+ return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
+}
+
+struct static_key kvm_no_apic_vcpu __read_mostly;
+EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
+
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ struct page *page;
+ int r;
+
+ vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
+ vcpu->arch.emulate_ctxt.ops = &emulate_ops;
+ if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ else
+ vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page) {
+ r = -ENOMEM;
+ goto fail;
+ }
+ vcpu->arch.pio_data = page_address(page);
+
+ kvm_set_tsc_khz(vcpu, max_tsc_khz);
+
+ r = kvm_mmu_create(vcpu);
+ if (r < 0)
+ goto fail_free_pio_data;
+
+ if (irqchip_in_kernel(vcpu->kvm)) {
+ r = kvm_create_lapic(vcpu);
+ if (r < 0)
+ goto fail_mmu_destroy;
+ } else
+ static_key_slow_inc(&kvm_no_apic_vcpu);
+
+ vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+ GFP_KERNEL);
+ if (!vcpu->arch.mce_banks) {
+ r = -ENOMEM;
+ goto fail_free_lapic;
+ }
+ vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
+ r = -ENOMEM;
+ goto fail_free_mce_banks;
+ }
+
+ fx_init(vcpu);
+
+ vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+
+ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+
+ vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
+ kvm_async_pf_hash_reset(vcpu);
+ kvm_pmu_init(vcpu);
+
+ vcpu->arch.pending_external_vector = -1;
+ vcpu->arch.preempted_in_kernel = false;
+
+ kvm_hv_vcpu_init(vcpu);
+
+ return 0;
+
+fail_free_mce_banks:
+ kfree(vcpu->arch.mce_banks);
+fail_free_lapic:
+ kvm_free_lapic(vcpu);
+fail_mmu_destroy:
+ kvm_mmu_destroy(vcpu);
+fail_free_pio_data:
+ free_page((unsigned long)vcpu->arch.pio_data);
+fail:
+ return r;
+}
+
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+ int idx;
+
+ kvm_hv_vcpu_uninit(vcpu);
+ kvm_pmu_destroy(vcpu);
+ kfree(vcpu->arch.mce_banks);
+ kvm_free_lapic(vcpu);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_mmu_destroy(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ free_page((unsigned long)vcpu->arch.pio_data);
+ if (!lapic_in_kernel(vcpu))
+ static_key_slow_dec(&kvm_no_apic_vcpu);
+}
+
+void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+ vcpu->arch.l1tf_flush_l1d = true;
+ kvm_x86_ops->sched_in(vcpu, cpu);
+}
+
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
+{
+ if (type)
+ return -EINVAL;
+
+ INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
+ INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
+ INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
+ atomic_set(&kvm->arch.noncoherent_dma_count, 0);
+
+ /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
+ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
+ /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
+ set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+ &kvm->arch.irq_sources_bitmap);
+
+ raw_spin_lock_init(&kvm->arch.tsc_write_lock);
+ mutex_init(&kvm->arch.apic_map_lock);
+ spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+
+ kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
+ pvclock_update_vm_gtod_copy(kvm);
+
+ kvm->arch.guest_can_read_msr_platform_info = true;
+
+ INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
+ INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
+
+ kvm_hv_init_vm(kvm);
+ kvm_page_track_init(kvm);
+ kvm_mmu_init_vm(kvm);
+
+ if (kvm_x86_ops->vm_init)
+ return kvm_x86_ops->vm_init(kvm);
+
+ return 0;
+}
+
+int kvm_arch_post_init_vm(struct kvm *kvm)
+{
+ return kvm_mmu_post_init_vm(kvm);
+}
+
+static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
+{
+ vcpu_load(vcpu);
+ kvm_mmu_unload(vcpu);
+ vcpu_put(vcpu);
+}
+
+static void kvm_free_vcpus(struct kvm *kvm)
+{
+ unsigned int i;
+ struct kvm_vcpu *vcpu;
+
+ /*
+ * Unpin any mmu pages first.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_unload_vcpu_mmu(vcpu);
+ }
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_arch_vcpu_free(vcpu);
+
+ mutex_lock(&kvm->lock);
+ for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+ kvm->vcpus[i] = NULL;
+
+ atomic_set(&kvm->online_vcpus, 0);
+ mutex_unlock(&kvm->lock);
+}
+
+void kvm_arch_sync_events(struct kvm *kvm)
+{
+ cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
+ cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
+ kvm_free_pit(kvm);
+}
+
+int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
+{
+ int i, r;
+ unsigned long hva;
+ struct kvm_memslots *slots = kvm_memslots(kvm);
+ struct kvm_memory_slot *slot, old;
+
+ /* Called with kvm->slots_lock held. */
+ if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
+ return -EINVAL;
+
+ slot = id_to_memslot(slots, id);
+ if (size) {
+ if (slot->npages)
+ return -EEXIST;
+
+ /*
+ * MAP_SHARED to prevent internal slot pages from being moved
+ * by fork()/COW.
+ */
+ hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS, 0);
+ if (IS_ERR((void *)hva))
+ return PTR_ERR((void *)hva);
+ } else {
+ if (!slot->npages)
+ return 0;
+
+ hva = 0;
+ }
+
+ old = *slot;
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ struct kvm_userspace_memory_region m;
+
+ m.slot = id | (i << 16);
+ m.flags = 0;
+ m.guest_phys_addr = gpa;
+ m.userspace_addr = hva;
+ m.memory_size = size;
+ r = __kvm_set_memory_region(kvm, &m);
+ if (r < 0)
+ return r;
+ }
+
+ if (!size)
+ vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__x86_set_memory_region);
+
+int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
+{
+ int r;
+
+ mutex_lock(&kvm->slots_lock);
+ r = __x86_set_memory_region(kvm, id, gpa, size);
+ mutex_unlock(&kvm->slots_lock);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(x86_set_memory_region);
+
+void kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+ kvm_mmu_pre_destroy_vm(kvm);
+}
+
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+ if (current->mm == kvm->mm) {
+ /*
+ * Free memory regions allocated on behalf of userspace,
+ * unless the the memory map has changed due to process exit
+ * or fd copying.
+ */
+ x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
+ x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0);
+ x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
+ }
+ if (kvm_x86_ops->vm_destroy)
+ kvm_x86_ops->vm_destroy(kvm);
+ kvm_pic_destroy(kvm);
+ kvm_ioapic_destroy(kvm);
+ kvm_free_vcpus(kvm);
+ kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+ kvm_mmu_uninit_vm(kvm);
+ kvm_page_track_cleanup(kvm);
+ kvm_hv_destroy_vm(kvm);
+}
+
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont)
+{
+ int i;
+
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
+ kvfree(free->arch.rmap[i]);
+ free->arch.rmap[i] = NULL;
+ }
+ if (i == 0)
+ continue;
+
+ if (!dont || free->arch.lpage_info[i - 1] !=
+ dont->arch.lpage_info[i - 1]) {
+ kvfree(free->arch.lpage_info[i - 1]);
+ free->arch.lpage_info[i - 1] = NULL;
+ }
+ }
+
+ kvm_page_track_free_memslot(free, dont);
+}
+
+int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
+ unsigned long npages)
+{
+ int i;
+
+ /*
+ * Clear out the previous array pointers for the KVM_MR_MOVE case. The
+ * old arrays will be freed by __kvm_set_memory_region() if installing
+ * the new memslot is successful.
+ */
+ memset(&slot->arch, 0, sizeof(slot->arch));
+
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ struct kvm_lpage_info *linfo;
+ unsigned long ugfn;
+ int lpages;
+ int level = i + 1;
+
+ lpages = gfn_to_index(slot->base_gfn + npages - 1,
+ slot->base_gfn, level) + 1;
+
+ slot->arch.rmap[i] =
+ kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
+ GFP_KERNEL);
+ if (!slot->arch.rmap[i])
+ goto out_free;
+ if (i == 0)
+ continue;
+
+ linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL);
+ if (!linfo)
+ goto out_free;
+
+ slot->arch.lpage_info[i - 1] = linfo;
+
+ if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
+ linfo[0].disallow_lpage = 1;
+ if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
+ linfo[lpages - 1].disallow_lpage = 1;
+ ugfn = slot->userspace_addr >> PAGE_SHIFT;
+ /*
+ * If the gfn and userspace address are not aligned wrt each
+ * other, or if explicitly asked to, disable large page
+ * support for this slot
+ */
+ if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
+ !kvm_largepages_enabled()) {
+ unsigned long j;
+
+ for (j = 0; j < lpages; ++j)
+ linfo[j].disallow_lpage = 1;
+ }
+ }
+
+ if (kvm_page_track_create_memslot(slot, npages))
+ goto out_free;
+
+ return 0;
+
+out_free:
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ kvfree(slot->arch.rmap[i]);
+ slot->arch.rmap[i] = NULL;
+ if (i == 0)
+ continue;
+
+ kvfree(slot->arch.lpage_info[i - 1]);
+ slot->arch.lpage_info[i - 1] = NULL;
+ }
+ return -ENOMEM;
+}
+
+void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ /*
+ * memslots->generation has been incremented.
+ * mmio generation may have reached its maximum value.
+ */
+ kvm_mmu_invalidate_mmio_sptes(kvm, gen);
+
+ /* Force re-initialization of steal_time cache */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_vcpu_kick(vcpu);
+}
+
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ const struct kvm_userspace_memory_region *mem,
+ enum kvm_mr_change change)
+{
+ if (change == KVM_MR_MOVE)
+ return kvm_arch_create_memslot(kvm, memslot,
+ mem->memory_size >> PAGE_SHIFT);
+
+ return 0;
+}
+
+static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
+ struct kvm_memory_slot *new)
+{
+ /* Still write protect RO slot */
+ if (new->flags & KVM_MEM_READONLY) {
+ kvm_mmu_slot_remove_write_access(kvm, new);
+ return;
+ }
+
+ /*
+ * Call kvm_x86_ops dirty logging hooks when they are valid.
+ *
+ * kvm_x86_ops->slot_disable_log_dirty is called when:
+ *
+ * - KVM_MR_CREATE with dirty logging is disabled
+ * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag
+ *
+ * The reason is, in case of PML, we need to set D-bit for any slots
+ * with dirty logging disabled in order to eliminate unnecessary GPA
+ * logging in PML buffer (and potential PML buffer full VMEXT). This
+ * guarantees leaving PML enabled during guest's lifetime won't have
+ * any additonal overhead from PML when guest is running with dirty
+ * logging disabled for memory slots.
+ *
+ * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot
+ * to dirty logging mode.
+ *
+ * If kvm_x86_ops dirty logging hooks are invalid, use write protect.
+ *
+ * In case of write protect:
+ *
+ * Write protect all pages for dirty logging.
+ *
+ * All the sptes including the large sptes which point to this
+ * slot are set to readonly. We can not create any new large
+ * spte on this slot until the end of the logging.
+ *
+ * See the comments in fast_page_fault().
+ */
+ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+ if (kvm_x86_ops->slot_enable_log_dirty)
+ kvm_x86_ops->slot_enable_log_dirty(kvm, new);
+ else
+ kvm_mmu_slot_remove_write_access(kvm, new);
+ } else {
+ if (kvm_x86_ops->slot_disable_log_dirty)
+ kvm_x86_ops->slot_disable_log_dirty(kvm, new);
+ }
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ const struct kvm_userspace_memory_region *mem,
+ const struct kvm_memory_slot *old,
+ const struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
+{
+ int nr_mmu_pages = 0;
+
+ if (!kvm->arch.n_requested_mmu_pages)
+ nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
+
+ if (nr_mmu_pages)
+ kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+
+ /*
+ * Dirty logging tracks sptes in 4k granularity, meaning that large
+ * sptes have to be split. If live migration is successful, the guest
+ * in the source machine will be destroyed and large sptes will be
+ * created in the destination. However, if the guest continues to run
+ * in the source machine (for example if live migration fails), small
+ * sptes will remain around and cause bad performance.
+ *
+ * Scan sptes if dirty logging has been stopped, dropping those
+ * which can be collapsed into a single large-page spte. Later
+ * page faults will create the large-page sptes.
+ */
+ if ((change != KVM_MR_DELETE) &&
+ (old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
+ !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
+ kvm_mmu_zap_collapsible_sptes(kvm, new);
+
+ /*
+ * Set up write protection and/or dirty logging for the new slot.
+ *
+ * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
+ * been zapped so no dirty logging staff is needed for old slot. For
+ * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
+ * new and it's also covered when dealing with the new slot.
+ *
+ * FIXME: const-ify all uses of struct kvm_memory_slot.
+ */
+ if (change != KVM_MR_DELETE)
+ kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
+}
+
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+ kvm_mmu_invalidate_zap_all_pages(kvm);
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ kvm_page_track_flush_slot(kvm, slot);
+}
+
+static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ return (is_guest_mode(vcpu) &&
+ kvm_x86_ops->guest_apic_has_interrupt &&
+ kvm_x86_ops->guest_apic_has_interrupt(vcpu));
+}
+
+static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
+{
+ if (!list_empty_careful(&vcpu->async_pf.done))
+ return true;
+
+ if (kvm_apic_has_events(vcpu))
+ return true;
+
+ if (vcpu->arch.pv.pv_unhalted)
+ return true;
+
+ if (vcpu->arch.exception.pending)
+ return true;
+
+ if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+ (vcpu->arch.nmi_pending &&
+ kvm_x86_ops->nmi_allowed(vcpu)))
+ return true;
+
+ if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
+ (vcpu->arch.smi_pending && !is_smm(vcpu)))
+ return true;
+
+ if (kvm_arch_interrupt_allowed(vcpu) &&
+ (kvm_cpu_has_interrupt(vcpu) ||
+ kvm_guest_apic_has_interrupt(vcpu)))
+ return true;
+
+ if (kvm_hv_has_stimer_pending(vcpu))
+ return true;
+
+ return false;
+}
+
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
+}
+
+bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
+{
+ if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
+ return true;
+
+ if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+ kvm_test_request(KVM_REQ_SMI, vcpu) ||
+ kvm_test_request(KVM_REQ_EVENT, vcpu))
+ return true;
+
+ if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu))
+ return true;
+
+ return false;
+}
+
+bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.preempted_in_kernel;
+}
+
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
+}
+
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+ return kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
+unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
+{
+ if (is_64_bit_mode(vcpu))
+ return kvm_rip_read(vcpu);
+ return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
+ kvm_rip_read(vcpu));
+}
+EXPORT_SYMBOL_GPL(kvm_get_linear_rip);
+
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
+{
+ return kvm_get_linear_rip(vcpu) == linear_rip;
+}
+EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
+
+unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
+{
+ unsigned long rflags;
+
+ rflags = kvm_x86_ops->get_rflags(vcpu);
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+ rflags &= ~X86_EFLAGS_TF;
+ return rflags;
+}
+EXPORT_SYMBOL_GPL(kvm_get_rflags);
+
+static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
+ kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
+ rflags |= X86_EFLAGS_TF;
+ kvm_x86_ops->set_rflags(vcpu, rflags);
+}
+
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ __kvm_set_rflags(vcpu, rflags);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_set_rflags);
+
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
+{
+ int r;
+
+ if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
+ work->wakeup_all)
+ return;
+
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r))
+ return;
+
+ if (!vcpu->arch.mmu.direct_map &&
+ work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
+ return;
+
+ vcpu->arch.mmu.page_fault(vcpu, work->cr2_or_gpa, 0, true);
+}
+
+static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
+{
+ return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
+}
+
+static inline u32 kvm_async_pf_next_probe(u32 key)
+{
+ return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
+}
+
+static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ u32 key = kvm_async_pf_hash_fn(gfn);
+
+ while (vcpu->arch.apf.gfns[key] != ~0)
+ key = kvm_async_pf_next_probe(key);
+
+ vcpu->arch.apf.gfns[key] = gfn;
+}
+
+static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ int i;
+ u32 key = kvm_async_pf_hash_fn(gfn);
+
+ for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
+ (vcpu->arch.apf.gfns[key] != gfn &&
+ vcpu->arch.apf.gfns[key] != ~0); i++)
+ key = kvm_async_pf_next_probe(key);
+
+ return key;
+}
+
+bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
+}
+
+static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ u32 i, j, k;
+
+ i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
+ while (true) {
+ vcpu->arch.apf.gfns[i] = ~0;
+ do {
+ j = kvm_async_pf_next_probe(j);
+ if (vcpu->arch.apf.gfns[j] == ~0)
+ return;
+ k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
+ /*
+ * k lies cyclically in ]i,j]
+ * | i.k.j |
+ * |....j i.k.| or |.k..j i...|
+ */
+ } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
+ vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
+ i = j;
+ }
+}
+
+static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
+{
+
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
+ sizeof(val));
+}
+
+static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val)
+{
+
+ return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val,
+ sizeof(u32));
+}
+
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+ struct x86_exception fault;
+
+ trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
+ kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
+
+ if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
+ (vcpu->arch.apf.send_user_only &&
+ kvm_x86_ops->get_cpl(vcpu) == 0))
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+ else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = 0;
+ fault.nested_page_fault = false;
+ fault.address = work->arch.token;
+ fault.async_page_fault = true;
+ kvm_inject_page_fault(vcpu, &fault);
+ }
+}
+
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+ struct x86_exception fault;
+ u32 val;
+
+ if (work->wakeup_all)
+ work->arch.token = ~0; /* broadcast wakeup */
+ else
+ kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
+ trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
+
+ if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
+ !apf_get_user(vcpu, &val)) {
+ if (val == KVM_PV_REASON_PAGE_NOT_PRESENT &&
+ vcpu->arch.exception.pending &&
+ vcpu->arch.exception.nr == PF_VECTOR &&
+ !apf_put_user(vcpu, 0)) {
+ vcpu->arch.exception.injected = false;
+ vcpu->arch.exception.pending = false;
+ vcpu->arch.exception.nr = 0;
+ vcpu->arch.exception.has_error_code = false;
+ vcpu->arch.exception.error_code = 0;
+ } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = 0;
+ fault.nested_page_fault = false;
+ fault.address = work->arch.token;
+ fault.async_page_fault = true;
+ kvm_inject_page_fault(vcpu, &fault);
+ }
+ }
+ vcpu->arch.apf.halted = false;
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+}
+
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
+ return true;
+ else
+ return kvm_can_do_async_pf(vcpu);
+}
+
+void kvm_arch_start_assignment(struct kvm *kvm)
+{
+ atomic_inc(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
+
+void kvm_arch_end_assignment(struct kvm *kvm)
+{
+ atomic_dec(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
+
+bool kvm_arch_has_assigned_device(struct kvm *kvm)
+{
+ return atomic_read(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
+
+void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
+{
+ atomic_inc(&kvm->arch.noncoherent_dma_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
+
+void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
+{
+ atomic_dec(&kvm->arch.noncoherent_dma_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
+
+bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
+{
+ return atomic_read(&kvm->arch.noncoherent_dma_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
+
+bool kvm_arch_has_irq_bypass(void)
+{
+ return kvm_x86_ops->update_pi_irte != NULL;
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ irqfd->producer = prod;
+
+ return kvm_x86_ops->update_pi_irte(irqfd->kvm,
+ prod->irq, irqfd->gsi, 1);
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ int ret;
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ WARN_ON(irqfd->producer != prod);
+ irqfd->producer = NULL;
+
+ /*
+ * When producer of consumer is unregistered, we change back to
+ * remapped mode, so we can re-use the current implementation
+ * when the irq is masked/disabled or the consumer side (KVM
+ * int this case doesn't want to receive the interrupts.
+ */
+ ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+ if (ret)
+ printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
+ " fails: %d\n", irqfd->consumer.token, ret);
+}
+
+int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ if (!kvm_x86_ops->update_pi_irte)
+ return -EINVAL;
+
+ return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
+}
+
+bool kvm_vector_hashing_enabled(void)
+{
+ return vector_hashing;
+}
+EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
new file mode 100644
index 000000000..422331b25
--- /dev/null
+++ b/arch/x86/kvm/x86.h
@@ -0,0 +1,365 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_KVM_X86_H
+#define ARCH_X86_KVM_X86_H
+
+#include <linux/kvm_host.h>
+#include <asm/pvclock.h>
+#include "kvm_cache_regs.h"
+
+#define KVM_DEFAULT_PLE_GAP 128
+#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
+#define KVM_DEFAULT_PLE_WINDOW_GROW 2
+#define KVM_DEFAULT_PLE_WINDOW_SHRINK 0
+#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX UINT_MAX
+#define KVM_SVM_DEFAULT_PLE_WINDOW_MAX USHRT_MAX
+#define KVM_SVM_DEFAULT_PLE_WINDOW 3000
+
+static inline unsigned int __grow_ple_window(unsigned int val,
+ unsigned int base, unsigned int modifier, unsigned int max)
+{
+ u64 ret = val;
+
+ if (modifier < 1)
+ return base;
+
+ if (modifier < base)
+ ret *= modifier;
+ else
+ ret += modifier;
+
+ return min(ret, (u64)max);
+}
+
+static inline unsigned int __shrink_ple_window(unsigned int val,
+ unsigned int base, unsigned int modifier, unsigned int min)
+{
+ if (modifier < 1)
+ return base;
+
+ if (modifier < base)
+ val /= modifier;
+ else
+ val -= modifier;
+
+ return max(val, min);
+}
+
+#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
+
+static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.exception.pending = false;
+ vcpu->arch.exception.injected = false;
+}
+
+static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
+ bool soft)
+{
+ vcpu->arch.interrupt.injected = true;
+ vcpu->arch.interrupt.soft = soft;
+ vcpu->arch.interrupt.nr = vector;
+}
+
+static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.interrupt.injected = false;
+}
+
+static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.exception.injected || vcpu->arch.interrupt.injected ||
+ vcpu->arch.nmi_injected;
+}
+
+static inline bool kvm_exception_is_soft(unsigned int nr)
+{
+ return (nr == BP_VECTOR) || (nr == OF_VECTOR);
+}
+
+static inline bool is_protmode(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
+}
+
+static inline int is_long_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return vcpu->arch.efer & EFER_LMA;
+#else
+ return 0;
+#endif
+}
+
+static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
+{
+ int cs_db, cs_l;
+
+ if (!is_long_mode(vcpu))
+ return false;
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ return cs_l;
+}
+
+static inline bool is_la57_mode(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return (vcpu->arch.efer & EFER_LMA) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
+#else
+ return 0;
+#endif
+}
+
+static inline bool x86_exception_has_error_code(unsigned int vector)
+{
+ static u32 exception_has_error_code = BIT(DF_VECTOR) | BIT(TS_VECTOR) |
+ BIT(NP_VECTOR) | BIT(SS_VECTOR) | BIT(GP_VECTOR) |
+ BIT(PF_VECTOR) | BIT(AC_VECTOR);
+
+ return (1U << vector) & exception_has_error_code;
+}
+
+static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
+}
+
+static inline int is_pae(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
+}
+
+static inline int is_pse(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, X86_CR4_PSE);
+}
+
+static inline int is_paging(struct kvm_vcpu *vcpu)
+{
+ return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG));
+}
+
+static inline bool is_pae_paging(struct kvm_vcpu *vcpu)
+{
+ return !is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu);
+}
+
+static inline u32 bit(int bitno)
+{
+ return 1 << (bitno & 31);
+}
+
+static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48;
+}
+
+static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt)
+{
+ return (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_LA57) ? 57 : 48;
+}
+
+static inline u64 get_canonical(u64 la, u8 vaddr_bits)
+{
+ return ((int64_t)la << (64 - vaddr_bits)) >> (64 - vaddr_bits);
+}
+
+static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ return get_canonical(la, vcpu_virt_addr_bits(vcpu)) != la;
+#else
+ return false;
+#endif
+}
+
+static inline bool emul_is_noncanonical_address(u64 la,
+ struct x86_emulate_ctxt *ctxt)
+{
+#ifdef CONFIG_X86_64
+ return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la;
+#else
+ return false;
+#endif
+}
+
+static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
+ gva_t gva, gfn_t gfn, unsigned access)
+{
+ u64 gen = kvm_memslots(vcpu->kvm)->generation;
+
+ if (unlikely(gen & 1))
+ return;
+
+ /*
+ * If this is a shadow nested page table, the "GVA" is
+ * actually a nGPA.
+ */
+ vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK;
+ vcpu->arch.access = access;
+ vcpu->arch.mmio_gfn = gfn;
+ vcpu->arch.mmio_gen = gen;
+}
+
+static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.mmio_gen == kvm_memslots(vcpu->kvm)->generation;
+}
+
+/*
+ * Clear the mmio cache info for the given gva. If gva is MMIO_GVA_ANY, we
+ * clear all mmio cache info.
+ */
+#define MMIO_GVA_ANY (~(gva_t)0)
+
+static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ if (gva != MMIO_GVA_ANY && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
+ return;
+
+ vcpu->arch.mmio_gva = 0;
+}
+
+static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
+{
+ if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gva &&
+ vcpu->arch.mmio_gva == (gva & PAGE_MASK))
+ return true;
+
+ return false;
+}
+
+static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gfn &&
+ vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
+ return true;
+
+ return false;
+}
+
+static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ unsigned long val = kvm_register_read(vcpu, reg);
+
+ return is_64_bit_mode(vcpu) ? val : (u32)val;
+}
+
+static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg,
+ unsigned long val)
+{
+ if (!is_64_bit_mode(vcpu))
+ val = (u32)val;
+ return kvm_register_write(vcpu, reg, val);
+}
+
+static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
+{
+ return !(kvm->arch.disabled_quirks & quirk);
+}
+
+void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
+int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
+
+void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
+u64 get_kvmclock_ns(struct kvm *kvm);
+
+int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception);
+
+int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu,
+ gva_t addr, void *val, unsigned int bytes,
+ struct x86_exception *exception);
+
+int handle_ud(struct kvm_vcpu *vcpu);
+
+void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
+u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
+bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
+ int page_num);
+bool kvm_vector_hashing_enabled(void);
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ int emulation_type, void *insn, int insn_len);
+
+#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
+ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
+ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
+ | XFEATURE_MASK_PKRU)
+extern u64 host_xcr0;
+
+extern u64 kvm_supported_xcr0(void);
+
+extern unsigned int min_timer_period_us;
+
+extern unsigned int lapic_timer_advance_ns;
+
+extern bool enable_vmware_backdoor;
+
+extern struct static_key kvm_no_apic_vcpu;
+
+static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
+{
+ return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
+ vcpu->arch.virtual_tsc_shift);
+}
+
+/* Same "calling convention" as do_div:
+ * - divide (n << 32) by base
+ * - put result in n
+ * - return remainder
+ */
+#define do_shl32_div32(n, base) \
+ ({ \
+ u32 __quot, __rem; \
+ asm("divl %2" : "=a" (__quot), "=d" (__rem) \
+ : "rm" (base), "0" (0), "1" ((u32) n)); \
+ n = __quot; \
+ __rem; \
+ })
+
+static inline bool kvm_mwait_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.mwait_in_guest;
+}
+
+static inline bool kvm_hlt_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.hlt_in_guest;
+}
+
+static inline bool kvm_pause_in_guest(struct kvm *kvm)
+{
+ return kvm->arch.pause_in_guest;
+}
+
+DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+
+static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu)
+{
+ __this_cpu_write(current_vcpu, vcpu);
+}
+
+static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
+{
+ __this_cpu_write(current_vcpu, NULL);
+}
+
+
+static inline bool kvm_pat_valid(u64 data)
+{
+ if (data & 0xF8F8F8F8F8F8F8F8ull)
+ return false;
+ /* 0, 1, 4, 5, 6, 7 are valid values. */
+ return (data | ((data & 0x0202020202020202ull) << 1)) == data;
+}
+
+void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu);
+void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu);
+
+#endif
diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore
new file mode 100644
index 000000000..8df89f0a3
--- /dev/null
+++ b/arch/x86/lib/.gitignore
@@ -0,0 +1 @@
+inat-tables.c
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
new file mode 100644
index 000000000..3c19d6031
--- /dev/null
+++ b/arch/x86/lib/Makefile
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for x86 specific library files.
+#
+
+# Produces uninteresting flaky coverage.
+KCOV_INSTRUMENT_delay.o := n
+
+# Early boot use of cmdline; don't instrument it
+ifdef CONFIG_AMD_MEM_ENCRYPT
+KCOV_INSTRUMENT_cmdline.o := n
+KASAN_SANITIZE_cmdline.o := n
+
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_cmdline.o = -pg
+endif
+
+CFLAGS_cmdline.o := $(call cc-option, -fno-stack-protector)
+endif
+
+inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
+inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
+quiet_cmd_inat_tables = GEN $@
+ cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ || rm -f $@
+
+$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps)
+ $(call cmd,inat_tables)
+
+$(obj)/inat.o: $(obj)/inat-tables.c
+
+clean-files := inat-tables.c
+
+obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
+
+lib-y := delay.o misc.o cmdline.o cpu.o
+lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
+lib-y += memcpy_$(BITS).o
+lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
+lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o
+lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
+lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
+lib-$(CONFIG_RETPOLINE) += retpoline.o
+
+obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
+
+ifeq ($(CONFIG_X86_32),y)
+ obj-y += atomic64_32.o
+ lib-y += atomic64_cx8_32.o
+ lib-y += checksum_32.o
+ lib-y += strstr_32.o
+ lib-y += string_32.o
+ifneq ($(CONFIG_X86_CMPXCHG64),y)
+ lib-y += cmpxchg8b_emu.o atomic64_386_32.o
+endif
+ lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
+else
+ obj-y += iomap_copy_64.o
+ lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
+ lib-y += clear_page_64.o copy_page_64.o
+ lib-y += memmove_64.o memset_64.o
+ lib-y += copy_user_64.o
+ lib-y += cmpxchg16b_emu.o
+endif
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
new file mode 100644
index 000000000..a0b4a350d
--- /dev/null
+++ b/arch/x86/lib/atomic64_32.c
@@ -0,0 +1,4 @@
+#define ATOMIC64_EXPORT EXPORT_SYMBOL
+
+#include <linux/export.h>
+#include <linux/atomic.h>
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
new file mode 100644
index 000000000..9b0ca8fe8
--- /dev/null
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -0,0 +1,191 @@
+/*
+ * atomic64_t for 386/486
+ *
+ * Copyright © 2010 Luca Barbieri
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+
+/* if you want SMP support, implement these with real spinlocks */
+.macro LOCK reg
+ pushfl
+ cli
+.endm
+
+.macro UNLOCK reg
+ popfl
+.endm
+
+#define BEGIN(op) \
+.macro endp; \
+ENDPROC(atomic64_##op##_386); \
+.purgem endp; \
+.endm; \
+ENTRY(atomic64_##op##_386); \
+ LOCK v;
+
+#define ENDP endp
+
+#define RET \
+ UNLOCK v; \
+ ret
+
+#define RET_ENDP \
+ RET; \
+ ENDP
+
+#define v %ecx
+BEGIN(read)
+ movl (v), %eax
+ movl 4(v), %edx
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(set)
+ movl %ebx, (v)
+ movl %ecx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(xchg)
+ movl (v), %eax
+ movl 4(v), %edx
+ movl %ebx, (v)
+ movl %ecx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %ecx
+BEGIN(add)
+ addl %eax, (v)
+ adcl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %ecx
+BEGIN(add_return)
+ addl (v), %eax
+ adcl 4(v), %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %ecx
+BEGIN(sub)
+ subl %eax, (v)
+ sbbl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %ecx
+BEGIN(sub_return)
+ negl %edx
+ negl %eax
+ sbbl $0, %edx
+ addl (v), %eax
+ adcl 4(v), %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(inc)
+ addl $1, (v)
+ adcl $0, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(inc_return)
+ movl (v), %eax
+ movl 4(v), %edx
+ addl $1, %eax
+ adcl $0, %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(dec)
+ subl $1, (v)
+ sbbl $0, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(dec_return)
+ movl (v), %eax
+ movl 4(v), %edx
+ subl $1, %eax
+ sbbl $0, %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+RET_ENDP
+#undef v
+
+#define v %esi
+BEGIN(add_unless)
+ addl %eax, %ecx
+ adcl %edx, %edi
+ addl (v), %eax
+ adcl 4(v), %edx
+ cmpl %eax, %ecx
+ je 3f
+1:
+ movl %eax, (v)
+ movl %edx, 4(v)
+ movl $1, %eax
+2:
+ RET
+3:
+ cmpl %edx, %edi
+ jne 1b
+ xorl %eax, %eax
+ jmp 2b
+ENDP
+#undef v
+
+#define v %esi
+BEGIN(inc_not_zero)
+ movl (v), %eax
+ movl 4(v), %edx
+ testl %eax, %eax
+ je 3f
+1:
+ addl $1, %eax
+ adcl $0, %edx
+ movl %eax, (v)
+ movl %edx, 4(v)
+ movl $1, %eax
+2:
+ RET
+3:
+ testl %edx, %edx
+ jne 1b
+ jmp 2b
+ENDP
+#undef v
+
+#define v %esi
+BEGIN(dec_if_positive)
+ movl (v), %eax
+ movl 4(v), %edx
+ subl $1, %eax
+ sbbl $0, %edx
+ js 1f
+ movl %eax, (v)
+ movl %edx, 4(v)
+1:
+RET_ENDP
+#undef v
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
new file mode 100644
index 000000000..db3ae8544
--- /dev/null
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -0,0 +1,184 @@
+/*
+ * atomic64_t for 586+
+ *
+ * Copyright © 2010 Luca Barbieri
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+
+.macro read64 reg
+ movl %ebx, %eax
+ movl %ecx, %edx
+/* we need LOCK_PREFIX since otherwise cmpxchg8b always does the write */
+ LOCK_PREFIX
+ cmpxchg8b (\reg)
+.endm
+
+ENTRY(atomic64_read_cx8)
+ read64 %ecx
+ ret
+ENDPROC(atomic64_read_cx8)
+
+ENTRY(atomic64_set_cx8)
+1:
+/* we don't need LOCK_PREFIX since aligned 64-bit writes
+ * are atomic on 586 and newer */
+ cmpxchg8b (%esi)
+ jne 1b
+
+ ret
+ENDPROC(atomic64_set_cx8)
+
+ENTRY(atomic64_xchg_cx8)
+1:
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+ ret
+ENDPROC(atomic64_xchg_cx8)
+
+.macro addsub_return func ins insc
+ENTRY(atomic64_\func\()_return_cx8)
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl %eax, %esi
+ movl %edx, %edi
+ movl %ecx, %ebp
+
+ read64 %ecx
+1:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ \ins\()l %esi, %ebx
+ \insc\()l %edi, %ecx
+ LOCK_PREFIX
+ cmpxchg8b (%ebp)
+ jne 1b
+
+10:
+ movl %ebx, %eax
+ movl %ecx, %edx
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+ENDPROC(atomic64_\func\()_return_cx8)
+.endm
+
+addsub_return add add adc
+addsub_return sub sub sbb
+
+.macro incdec_return func ins insc
+ENTRY(atomic64_\func\()_return_cx8)
+ pushl %ebx
+
+ read64 %esi
+1:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ \ins\()l $1, %ebx
+ \insc\()l $0, %ecx
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+10:
+ movl %ebx, %eax
+ movl %ecx, %edx
+ popl %ebx
+ ret
+ENDPROC(atomic64_\func\()_return_cx8)
+.endm
+
+incdec_return inc add adc
+incdec_return dec sub sbb
+
+ENTRY(atomic64_dec_if_positive_cx8)
+ pushl %ebx
+
+ read64 %esi
+1:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ subl $1, %ebx
+ sbb $0, %ecx
+ js 2f
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+2:
+ movl %ebx, %eax
+ movl %ecx, %edx
+ popl %ebx
+ ret
+ENDPROC(atomic64_dec_if_positive_cx8)
+
+ENTRY(atomic64_add_unless_cx8)
+ pushl %ebp
+ pushl %ebx
+/* these just push these two parameters on the stack */
+ pushl %edi
+ pushl %ecx
+
+ movl %eax, %ebp
+ movl %edx, %edi
+
+ read64 %esi
+1:
+ cmpl %eax, 0(%esp)
+ je 4f
+2:
+ movl %eax, %ebx
+ movl %edx, %ecx
+ addl %ebp, %ebx
+ adcl %edi, %ecx
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+ movl $1, %eax
+3:
+ addl $8, %esp
+ popl %ebx
+ popl %ebp
+ ret
+4:
+ cmpl %edx, 4(%esp)
+ jne 2b
+ xorl %eax, %eax
+ jmp 3b
+ENDPROC(atomic64_add_unless_cx8)
+
+ENTRY(atomic64_inc_not_zero_cx8)
+ pushl %ebx
+
+ read64 %esi
+1:
+ movl %eax, %ecx
+ orl %edx, %ecx
+ jz 3f
+ movl %eax, %ebx
+ xorl %ecx, %ecx
+ addl $1, %ebx
+ adcl %edx, %ecx
+ LOCK_PREFIX
+ cmpxchg8b (%esi)
+ jne 1b
+
+ movl $1, %eax
+3:
+ popl %ebx
+ ret
+ENDPROC(atomic64_inc_not_zero_cx8)
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
new file mode 100644
index 000000000..1811fa4a1
--- /dev/null
+++ b/arch/x86/lib/cache-smp.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/smp.h>
+#include <linux/export.h>
+
+static void __wbinvd(void *dummy)
+{
+ wbinvd();
+}
+
+void wbinvd_on_cpu(int cpu)
+{
+ smp_call_function_single(cpu, __wbinvd, NULL, 1);
+}
+EXPORT_SYMBOL(wbinvd_on_cpu);
+
+int wbinvd_on_all_cpus(void)
+{
+ return on_each_cpu(__wbinvd, NULL, 1);
+}
+EXPORT_SYMBOL(wbinvd_on_all_cpus);
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
new file mode 100644
index 000000000..46e71a74e
--- /dev/null
+++ b/arch/x86/lib/checksum_32.S
@@ -0,0 +1,496 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IP/TCP/UDP checksumming routines
+ *
+ * Authors: Jorge Cwik, <jorge@laser.satlink.net>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Tom May, <ftom@netcom.com>
+ * Pentium Pro/II routines:
+ * Alexander Kjeldaas <astor@guardian.no>
+ * Finn Arne Gangstad <finnag@guardian.no>
+ * Lots of code moved from tcp.c and ip.c; see those files
+ * for more names.
+ *
+ * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
+ * handling.
+ * Andi Kleen, add zeroing on error
+ * converted to pure assembler
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/errno.h>
+#include <asm/asm.h>
+#include <asm/export.h>
+#include <asm/nospec-branch.h>
+
+/*
+ * computes a partial checksum, e.g. for TCP/UDP fragments
+ */
+
+/*
+unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
+ */
+
+.text
+
+#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
+
+ /*
+ * Experiments with Ethernet and SLIP connections show that buff
+ * is aligned on either a 2-byte or 4-byte boundary. We get at
+ * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
+ * Fortunately, it is easy to convert 2-byte alignment to 4-byte
+ * alignment for the unrolled loop.
+ */
+ENTRY(csum_partial)
+ pushl %esi
+ pushl %ebx
+ movl 20(%esp),%eax # Function arg: unsigned int sum
+ movl 16(%esp),%ecx # Function arg: int len
+ movl 12(%esp),%esi # Function arg: unsigned char *buff
+ testl $3, %esi # Check alignment.
+ jz 2f # Jump if alignment is ok.
+ testl $1, %esi # Check alignment.
+ jz 10f # Jump if alignment is boundary of 2 bytes.
+
+ # buf is odd
+ dec %ecx
+ jl 8f
+ movzbl (%esi), %ebx
+ adcl %ebx, %eax
+ roll $8, %eax
+ inc %esi
+ testl $2, %esi
+ jz 2f
+10:
+ subl $2, %ecx # Alignment uses up two bytes.
+ jae 1f # Jump if we had at least two bytes.
+ addl $2, %ecx # ecx was < 2. Deal with it.
+ jmp 4f
+1: movw (%esi), %bx
+ addl $2, %esi
+ addw %bx, %ax
+ adcl $0, %eax
+2:
+ movl %ecx, %edx
+ shrl $5, %ecx
+ jz 2f
+ testl %esi, %esi
+1: movl (%esi), %ebx
+ adcl %ebx, %eax
+ movl 4(%esi), %ebx
+ adcl %ebx, %eax
+ movl 8(%esi), %ebx
+ adcl %ebx, %eax
+ movl 12(%esi), %ebx
+ adcl %ebx, %eax
+ movl 16(%esi), %ebx
+ adcl %ebx, %eax
+ movl 20(%esi), %ebx
+ adcl %ebx, %eax
+ movl 24(%esi), %ebx
+ adcl %ebx, %eax
+ movl 28(%esi), %ebx
+ adcl %ebx, %eax
+ lea 32(%esi), %esi
+ dec %ecx
+ jne 1b
+ adcl $0, %eax
+2: movl %edx, %ecx
+ andl $0x1c, %edx
+ je 4f
+ shrl $2, %edx # This clears CF
+3: adcl (%esi), %eax
+ lea 4(%esi), %esi
+ dec %edx
+ jne 3b
+ adcl $0, %eax
+4: andl $3, %ecx
+ jz 7f
+ cmpl $2, %ecx
+ jb 5f
+ movw (%esi),%cx
+ leal 2(%esi),%esi
+ je 6f
+ shll $16,%ecx
+5: movb (%esi),%cl
+6: addl %ecx,%eax
+ adcl $0, %eax
+7:
+ testb $1, 12(%esp)
+ jz 8f
+ roll $8, %eax
+8:
+ popl %ebx
+ popl %esi
+ ret
+ENDPROC(csum_partial)
+
+#else
+
+/* Version for PentiumII/PPro */
+
+ENTRY(csum_partial)
+ pushl %esi
+ pushl %ebx
+ movl 20(%esp),%eax # Function arg: unsigned int sum
+ movl 16(%esp),%ecx # Function arg: int len
+ movl 12(%esp),%esi # Function arg: const unsigned char *buf
+
+ testl $3, %esi
+ jnz 25f
+10:
+ movl %ecx, %edx
+ movl %ecx, %ebx
+ andl $0x7c, %ebx
+ shrl $7, %ecx
+ addl %ebx,%esi
+ shrl $2, %ebx
+ negl %ebx
+ lea 45f(%ebx,%ebx,2), %ebx
+ testl %esi, %esi
+ JMP_NOSPEC %ebx
+
+ # Handle 2-byte-aligned regions
+20: addw (%esi), %ax
+ lea 2(%esi), %esi
+ adcl $0, %eax
+ jmp 10b
+25:
+ testl $1, %esi
+ jz 30f
+ # buf is odd
+ dec %ecx
+ jl 90f
+ movzbl (%esi), %ebx
+ addl %ebx, %eax
+ adcl $0, %eax
+ roll $8, %eax
+ inc %esi
+ testl $2, %esi
+ jz 10b
+
+30: subl $2, %ecx
+ ja 20b
+ je 32f
+ addl $2, %ecx
+ jz 80f
+ movzbl (%esi),%ebx # csumming 1 byte, 2-aligned
+ addl %ebx, %eax
+ adcl $0, %eax
+ jmp 80f
+32:
+ addw (%esi), %ax # csumming 2 bytes, 2-aligned
+ adcl $0, %eax
+ jmp 80f
+
+40:
+ addl -128(%esi), %eax
+ adcl -124(%esi), %eax
+ adcl -120(%esi), %eax
+ adcl -116(%esi), %eax
+ adcl -112(%esi), %eax
+ adcl -108(%esi), %eax
+ adcl -104(%esi), %eax
+ adcl -100(%esi), %eax
+ adcl -96(%esi), %eax
+ adcl -92(%esi), %eax
+ adcl -88(%esi), %eax
+ adcl -84(%esi), %eax
+ adcl -80(%esi), %eax
+ adcl -76(%esi), %eax
+ adcl -72(%esi), %eax
+ adcl -68(%esi), %eax
+ adcl -64(%esi), %eax
+ adcl -60(%esi), %eax
+ adcl -56(%esi), %eax
+ adcl -52(%esi), %eax
+ adcl -48(%esi), %eax
+ adcl -44(%esi), %eax
+ adcl -40(%esi), %eax
+ adcl -36(%esi), %eax
+ adcl -32(%esi), %eax
+ adcl -28(%esi), %eax
+ adcl -24(%esi), %eax
+ adcl -20(%esi), %eax
+ adcl -16(%esi), %eax
+ adcl -12(%esi), %eax
+ adcl -8(%esi), %eax
+ adcl -4(%esi), %eax
+45:
+ lea 128(%esi), %esi
+ adcl $0, %eax
+ dec %ecx
+ jge 40b
+ movl %edx, %ecx
+50: andl $3, %ecx
+ jz 80f
+
+ # Handle the last 1-3 bytes without jumping
+ notl %ecx # 1->2, 2->1, 3->0, higher bits are masked
+ movl $0xffffff,%ebx # by the shll and shrl instructions
+ shll $3,%ecx
+ shrl %cl,%ebx
+ andl -128(%esi),%ebx # esi is 4-aligned so should be ok
+ addl %ebx,%eax
+ adcl $0,%eax
+80:
+ testb $1, 12(%esp)
+ jz 90f
+ roll $8, %eax
+90:
+ popl %ebx
+ popl %esi
+ ret
+ENDPROC(csum_partial)
+
+#endif
+EXPORT_SYMBOL(csum_partial)
+
+/*
+unsigned int csum_partial_copy_generic (const char *src, char *dst,
+ int len, int sum, int *src_err_ptr, int *dst_err_ptr)
+ */
+
+/*
+ * Copy from ds while checksumming, otherwise like csum_partial
+ *
+ * The macros SRC and DST specify the type of access for the instruction.
+ * thus we can call a custom exception handler for all access types.
+ *
+ * FIXME: could someone double-check whether I haven't mixed up some SRC and
+ * DST definitions? It's damn hard to trigger all cases. I hope I got
+ * them all but there's no guarantee.
+ */
+
+#define SRC(y...) \
+ 9999: y; \
+ _ASM_EXTABLE(9999b, 6001f)
+
+#define DST(y...) \
+ 9999: y; \
+ _ASM_EXTABLE(9999b, 6002f)
+
+#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
+
+#define ARGBASE 16
+#define FP 12
+
+ENTRY(csum_partial_copy_generic)
+ subl $4,%esp
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ movl ARGBASE+16(%esp),%eax # sum
+ movl ARGBASE+12(%esp),%ecx # len
+ movl ARGBASE+4(%esp),%esi # src
+ movl ARGBASE+8(%esp),%edi # dst
+
+ testl $2, %edi # Check alignment.
+ jz 2f # Jump if alignment is ok.
+ subl $2, %ecx # Alignment uses up two bytes.
+ jae 1f # Jump if we had at least two bytes.
+ addl $2, %ecx # ecx was < 2. Deal with it.
+ jmp 4f
+SRC(1: movw (%esi), %bx )
+ addl $2, %esi
+DST( movw %bx, (%edi) )
+ addl $2, %edi
+ addw %bx, %ax
+ adcl $0, %eax
+2:
+ movl %ecx, FP(%esp)
+ shrl $5, %ecx
+ jz 2f
+ testl %esi, %esi
+SRC(1: movl (%esi), %ebx )
+SRC( movl 4(%esi), %edx )
+ adcl %ebx, %eax
+DST( movl %ebx, (%edi) )
+ adcl %edx, %eax
+DST( movl %edx, 4(%edi) )
+
+SRC( movl 8(%esi), %ebx )
+SRC( movl 12(%esi), %edx )
+ adcl %ebx, %eax
+DST( movl %ebx, 8(%edi) )
+ adcl %edx, %eax
+DST( movl %edx, 12(%edi) )
+
+SRC( movl 16(%esi), %ebx )
+SRC( movl 20(%esi), %edx )
+ adcl %ebx, %eax
+DST( movl %ebx, 16(%edi) )
+ adcl %edx, %eax
+DST( movl %edx, 20(%edi) )
+
+SRC( movl 24(%esi), %ebx )
+SRC( movl 28(%esi), %edx )
+ adcl %ebx, %eax
+DST( movl %ebx, 24(%edi) )
+ adcl %edx, %eax
+DST( movl %edx, 28(%edi) )
+
+ lea 32(%esi), %esi
+ lea 32(%edi), %edi
+ dec %ecx
+ jne 1b
+ adcl $0, %eax
+2: movl FP(%esp), %edx
+ movl %edx, %ecx
+ andl $0x1c, %edx
+ je 4f
+ shrl $2, %edx # This clears CF
+SRC(3: movl (%esi), %ebx )
+ adcl %ebx, %eax
+DST( movl %ebx, (%edi) )
+ lea 4(%esi), %esi
+ lea 4(%edi), %edi
+ dec %edx
+ jne 3b
+ adcl $0, %eax
+4: andl $3, %ecx
+ jz 7f
+ cmpl $2, %ecx
+ jb 5f
+SRC( movw (%esi), %cx )
+ leal 2(%esi), %esi
+DST( movw %cx, (%edi) )
+ leal 2(%edi), %edi
+ je 6f
+ shll $16,%ecx
+SRC(5: movb (%esi), %cl )
+DST( movb %cl, (%edi) )
+6: addl %ecx, %eax
+ adcl $0, %eax
+7:
+5000:
+
+# Exception handler:
+.section .fixup, "ax"
+
+6001:
+ movl ARGBASE+20(%esp), %ebx # src_err_ptr
+ movl $-EFAULT, (%ebx)
+
+ # zero the complete destination - computing the rest
+ # is too much work
+ movl ARGBASE+8(%esp), %edi # dst
+ movl ARGBASE+12(%esp), %ecx # len
+ xorl %eax,%eax
+ rep ; stosb
+
+ jmp 5000b
+
+6002:
+ movl ARGBASE+24(%esp), %ebx # dst_err_ptr
+ movl $-EFAULT,(%ebx)
+ jmp 5000b
+
+.previous
+
+ popl %ebx
+ popl %esi
+ popl %edi
+ popl %ecx # equivalent to addl $4,%esp
+ ret
+ENDPROC(csum_partial_copy_generic)
+
+#else
+
+/* Version for PentiumII/PPro */
+
+#define ROUND1(x) \
+ SRC(movl x(%esi), %ebx ) ; \
+ addl %ebx, %eax ; \
+ DST(movl %ebx, x(%edi) ) ;
+
+#define ROUND(x) \
+ SRC(movl x(%esi), %ebx ) ; \
+ adcl %ebx, %eax ; \
+ DST(movl %ebx, x(%edi) ) ;
+
+#define ARGBASE 12
+
+ENTRY(csum_partial_copy_generic)
+ pushl %ebx
+ pushl %edi
+ pushl %esi
+ movl ARGBASE+4(%esp),%esi #src
+ movl ARGBASE+8(%esp),%edi #dst
+ movl ARGBASE+12(%esp),%ecx #len
+ movl ARGBASE+16(%esp),%eax #sum
+# movl %ecx, %edx
+ movl %ecx, %ebx
+ movl %esi, %edx
+ shrl $6, %ecx
+ andl $0x3c, %ebx
+ negl %ebx
+ subl %ebx, %esi
+ subl %ebx, %edi
+ lea -1(%esi),%edx
+ andl $-32,%edx
+ lea 3f(%ebx,%ebx), %ebx
+ testl %esi, %esi
+ JMP_NOSPEC %ebx
+1: addl $64,%esi
+ addl $64,%edi
+ SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl)
+ ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
+ ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
+ ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
+ ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4)
+3: adcl $0,%eax
+ addl $64, %edx
+ dec %ecx
+ jge 1b
+4: movl ARGBASE+12(%esp),%edx #len
+ andl $3, %edx
+ jz 7f
+ cmpl $2, %edx
+ jb 5f
+SRC( movw (%esi), %dx )
+ leal 2(%esi), %esi
+DST( movw %dx, (%edi) )
+ leal 2(%edi), %edi
+ je 6f
+ shll $16,%edx
+5:
+SRC( movb (%esi), %dl )
+DST( movb %dl, (%edi) )
+6: addl %edx, %eax
+ adcl $0, %eax
+7:
+.section .fixup, "ax"
+6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr
+ movl $-EFAULT, (%ebx)
+ # zero the complete destination (computing the rest is too much work)
+ movl ARGBASE+8(%esp),%edi # dst
+ movl ARGBASE+12(%esp),%ecx # len
+ xorl %eax,%eax
+ rep; stosb
+ jmp 7b
+6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr
+ movl $-EFAULT, (%ebx)
+ jmp 7b
+.previous
+
+ popl %esi
+ popl %edi
+ popl %ebx
+ ret
+ENDPROC(csum_partial_copy_generic)
+
+#undef ROUND
+#undef ROUND1
+
+#endif
+EXPORT_SYMBOL(csum_partial_copy_generic)
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
new file mode 100644
index 000000000..88acd3499
--- /dev/null
+++ b/arch/x86/lib/clear_page_64.S
@@ -0,0 +1,51 @@
+#include <linux/linkage.h>
+#include <asm/export.h>
+
+/*
+ * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
+ * recommended to use this when possible and we do use them by default.
+ * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
+ * Otherwise, use original.
+ */
+
+/*
+ * Zero a page.
+ * %rdi - page
+ */
+ENTRY(clear_page_rep)
+ movl $4096/8,%ecx
+ xorl %eax,%eax
+ rep stosq
+ ret
+ENDPROC(clear_page_rep)
+EXPORT_SYMBOL_GPL(clear_page_rep)
+
+ENTRY(clear_page_orig)
+ xorl %eax,%eax
+ movl $4096/64,%ecx
+ .p2align 4
+.Lloop:
+ decl %ecx
+#define PUT(x) movq %rax,x*8(%rdi)
+ movq %rax,(%rdi)
+ PUT(1)
+ PUT(2)
+ PUT(3)
+ PUT(4)
+ PUT(5)
+ PUT(6)
+ PUT(7)
+ leaq 64(%rdi),%rdi
+ jnz .Lloop
+ nop
+ ret
+ENDPROC(clear_page_orig)
+EXPORT_SYMBOL_GPL(clear_page_orig)
+
+ENTRY(clear_page_erms)
+ movl $4096,%ecx
+ xorl %eax,%eax
+ rep stosb
+ ret
+ENDPROC(clear_page_erms)
+EXPORT_SYMBOL_GPL(clear_page_erms)
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
new file mode 100644
index 000000000..3261abb21
--- /dev/null
+++ b/arch/x86/lib/cmdline.c
@@ -0,0 +1,215 @@
+/*
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * Misc librarized functions for cmdline poking.
+ */
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <asm/setup.h>
+
+static inline int myisspace(u8 c)
+{
+ return c <= ' '; /* Close enough approximation */
+}
+
+/**
+ * Find a boolean option (like quiet,noapic,nosmp....)
+ *
+ * @cmdline: the cmdline string
+ * @option: option string to look for
+ *
+ * Returns the position of that @option (starts counting with 1)
+ * or 0 on not found. @option will only be found if it is found
+ * as an entire word in @cmdline. For instance, if @option="car"
+ * then a cmdline which contains "cart" will not match.
+ */
+static int
+__cmdline_find_option_bool(const char *cmdline, int max_cmdline_size,
+ const char *option)
+{
+ char c;
+ int pos = 0, wstart = 0;
+ const char *opptr = NULL;
+ enum {
+ st_wordstart = 0, /* Start of word/after whitespace */
+ st_wordcmp, /* Comparing this word */
+ st_wordskip, /* Miscompare, skip */
+ } state = st_wordstart;
+
+ if (!cmdline)
+ return -1; /* No command line */
+
+ /*
+ * This 'pos' check ensures we do not overrun
+ * a non-NULL-terminated 'cmdline'
+ */
+ while (pos < max_cmdline_size) {
+ c = *(char *)cmdline++;
+ pos++;
+
+ switch (state) {
+ case st_wordstart:
+ if (!c)
+ return 0;
+ else if (myisspace(c))
+ break;
+
+ state = st_wordcmp;
+ opptr = option;
+ wstart = pos;
+ /* fall through */
+
+ case st_wordcmp:
+ if (!*opptr) {
+ /*
+ * We matched all the way to the end of the
+ * option we were looking for. If the
+ * command-line has a space _or_ ends, then
+ * we matched!
+ */
+ if (!c || myisspace(c))
+ return wstart;
+ /*
+ * We hit the end of the option, but _not_
+ * the end of a word on the cmdline. Not
+ * a match.
+ */
+ } else if (!c) {
+ /*
+ * Hit the NULL terminator on the end of
+ * cmdline.
+ */
+ return 0;
+ } else if (c == *opptr++) {
+ /*
+ * We are currently matching, so continue
+ * to the next character on the cmdline.
+ */
+ break;
+ }
+ state = st_wordskip;
+ /* fall through */
+
+ case st_wordskip:
+ if (!c)
+ return 0;
+ else if (myisspace(c))
+ state = st_wordstart;
+ break;
+ }
+ }
+
+ return 0; /* Buffer overrun */
+}
+
+/*
+ * Find a non-boolean option (i.e. option=argument). In accordance with
+ * standard Linux practice, if this option is repeated, this returns the
+ * last instance on the command line.
+ *
+ * @cmdline: the cmdline string
+ * @max_cmdline_size: the maximum size of cmdline
+ * @option: option string to look for
+ * @buffer: memory buffer to return the option argument
+ * @bufsize: size of the supplied memory buffer
+ *
+ * Returns the length of the argument (regardless of if it was
+ * truncated to fit in the buffer), or -1 on not found.
+ */
+static int
+__cmdline_find_option(const char *cmdline, int max_cmdline_size,
+ const char *option, char *buffer, int bufsize)
+{
+ char c;
+ int pos = 0, len = -1;
+ const char *opptr = NULL;
+ char *bufptr = buffer;
+ enum {
+ st_wordstart = 0, /* Start of word/after whitespace */
+ st_wordcmp, /* Comparing this word */
+ st_wordskip, /* Miscompare, skip */
+ st_bufcpy, /* Copying this to buffer */
+ } state = st_wordstart;
+
+ if (!cmdline)
+ return -1; /* No command line */
+
+ /*
+ * This 'pos' check ensures we do not overrun
+ * a non-NULL-terminated 'cmdline'
+ */
+ while (pos++ < max_cmdline_size) {
+ c = *(char *)cmdline++;
+ if (!c)
+ break;
+
+ switch (state) {
+ case st_wordstart:
+ if (myisspace(c))
+ break;
+
+ state = st_wordcmp;
+ opptr = option;
+ /* fall through */
+
+ case st_wordcmp:
+ if ((c == '=') && !*opptr) {
+ /*
+ * We matched all the way to the end of the
+ * option we were looking for, prepare to
+ * copy the argument.
+ */
+ len = 0;
+ bufptr = buffer;
+ state = st_bufcpy;
+ break;
+ } else if (c == *opptr++) {
+ /*
+ * We are currently matching, so continue
+ * to the next character on the cmdline.
+ */
+ break;
+ }
+ state = st_wordskip;
+ /* fall through */
+
+ case st_wordskip:
+ if (myisspace(c))
+ state = st_wordstart;
+ break;
+
+ case st_bufcpy:
+ if (myisspace(c)) {
+ state = st_wordstart;
+ } else {
+ /*
+ * Increment len, but don't overrun the
+ * supplied buffer and leave room for the
+ * NULL terminator.
+ */
+ if (++len < bufsize)
+ *bufptr++ = c;
+ }
+ break;
+ }
+ }
+
+ if (bufsize)
+ *bufptr = '\0';
+
+ return len;
+}
+
+int cmdline_find_option_bool(const char *cmdline, const char *option)
+{
+ return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
+}
+
+int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
+ int bufsize)
+{
+ return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
+ buffer, bufsize);
+}
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
new file mode 100644
index 000000000..9b330242e
--- /dev/null
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -0,0 +1,53 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ */
+#include <linux/linkage.h>
+#include <asm/percpu.h>
+
+.text
+
+/*
+ * Inputs:
+ * %rsi : memory location to compare
+ * %rax : low 64 bits of old value
+ * %rdx : high 64 bits of old value
+ * %rbx : low 64 bits of new value
+ * %rcx : high 64 bits of new value
+ * %al : Operation successful
+ */
+ENTRY(this_cpu_cmpxchg16b_emu)
+
+#
+# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not
+# via the ZF. Caller will access %al to get result.
+#
+# Note that this is only useful for a cpuops operation. Meaning that we
+# do *not* have a fully atomic operation but just an operation that is
+# *atomic* on a single cpu (as provided by the this_cpu_xx class of
+# macros).
+#
+ pushfq
+ cli
+
+ cmpq PER_CPU_VAR((%rsi)), %rax
+ jne .Lnot_same
+ cmpq PER_CPU_VAR(8(%rsi)), %rdx
+ jne .Lnot_same
+
+ movq %rbx, PER_CPU_VAR((%rsi))
+ movq %rcx, PER_CPU_VAR(8(%rsi))
+
+ popfq
+ mov $1, %al
+ ret
+
+.Lnot_same:
+ popfq
+ xor %al,%al
+ ret
+
+ENDPROC(this_cpu_cmpxchg16b_emu)
diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S
new file mode 100644
index 000000000..03a186fc0
--- /dev/null
+++ b/arch/x86/lib/cmpxchg8b_emu.S
@@ -0,0 +1,52 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/export.h>
+
+.text
+
+/*
+ * Inputs:
+ * %esi : memory location to compare
+ * %eax : low 32 bits of old value
+ * %edx : high 32 bits of old value
+ * %ebx : low 32 bits of new value
+ * %ecx : high 32 bits of new value
+ */
+ENTRY(cmpxchg8b_emu)
+
+#
+# Emulate 'cmpxchg8b (%esi)' on UP except we don't
+# set the whole ZF thing (caller will just compare
+# eax:edx with the expected value)
+#
+ pushfl
+ cli
+
+ cmpl (%esi), %eax
+ jne .Lnot_same
+ cmpl 4(%esi), %edx
+ jne .Lhalf_same
+
+ movl %ebx, (%esi)
+ movl %ecx, 4(%esi)
+
+ popfl
+ ret
+
+.Lnot_same:
+ movl (%esi), %eax
+.Lhalf_same:
+ movl 4(%esi), %edx
+
+ popfl
+ ret
+
+ENDPROC(cmpxchg8b_emu)
+EXPORT_SYMBOL(cmpxchg8b_emu)
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
new file mode 100644
index 000000000..fd2d09afa
--- /dev/null
+++ b/arch/x86/lib/copy_page_64.S
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
+
+#include <linux/linkage.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative-asm.h>
+#include <asm/export.h>
+
+/*
+ * Some CPUs run faster using the string copy instructions (sane microcode).
+ * It is also a lot simpler. Use this when possible. But, don't use streaming
+ * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the
+ * prefetch distance based on SMP/UP.
+ */
+ ALIGN
+ENTRY(copy_page)
+ ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
+ movl $4096/8, %ecx
+ rep movsq
+ ret
+ENDPROC(copy_page)
+EXPORT_SYMBOL(copy_page)
+
+ENTRY(copy_page_regs)
+ subq $2*8, %rsp
+ movq %rbx, (%rsp)
+ movq %r12, 1*8(%rsp)
+
+ movl $(4096/64)-5, %ecx
+ .p2align 4
+.Loop64:
+ dec %rcx
+ movq 0x8*0(%rsi), %rax
+ movq 0x8*1(%rsi), %rbx
+ movq 0x8*2(%rsi), %rdx
+ movq 0x8*3(%rsi), %r8
+ movq 0x8*4(%rsi), %r9
+ movq 0x8*5(%rsi), %r10
+ movq 0x8*6(%rsi), %r11
+ movq 0x8*7(%rsi), %r12
+
+ prefetcht0 5*64(%rsi)
+
+ movq %rax, 0x8*0(%rdi)
+ movq %rbx, 0x8*1(%rdi)
+ movq %rdx, 0x8*2(%rdi)
+ movq %r8, 0x8*3(%rdi)
+ movq %r9, 0x8*4(%rdi)
+ movq %r10, 0x8*5(%rdi)
+ movq %r11, 0x8*6(%rdi)
+ movq %r12, 0x8*7(%rdi)
+
+ leaq 64 (%rsi), %rsi
+ leaq 64 (%rdi), %rdi
+
+ jnz .Loop64
+
+ movl $5, %ecx
+ .p2align 4
+.Loop2:
+ decl %ecx
+
+ movq 0x8*0(%rsi), %rax
+ movq 0x8*1(%rsi), %rbx
+ movq 0x8*2(%rsi), %rdx
+ movq 0x8*3(%rsi), %r8
+ movq 0x8*4(%rsi), %r9
+ movq 0x8*5(%rsi), %r10
+ movq 0x8*6(%rsi), %r11
+ movq 0x8*7(%rsi), %r12
+
+ movq %rax, 0x8*0(%rdi)
+ movq %rbx, 0x8*1(%rdi)
+ movq %rdx, 0x8*2(%rdi)
+ movq %r8, 0x8*3(%rdi)
+ movq %r9, 0x8*4(%rdi)
+ movq %r10, 0x8*5(%rdi)
+ movq %r11, 0x8*6(%rdi)
+ movq %r12, 0x8*7(%rdi)
+
+ leaq 64(%rdi), %rdi
+ leaq 64(%rsi), %rsi
+ jnz .Loop2
+
+ movq (%rsp), %rbx
+ movq 1*8(%rsp), %r12
+ addq $2*8, %rsp
+ ret
+ENDPROC(copy_page_regs)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
new file mode 100644
index 000000000..020f75cc8
--- /dev/null
+++ b/arch/x86/lib/copy_user_64.S
@@ -0,0 +1,345 @@
+/*
+ * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ *
+ * Functions to copy from and to user space.
+ */
+
+#include <linux/linkage.h>
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative-asm.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+#include <asm/export.h>
+
+/*
+ * copy_user_generic_unrolled - memory copy with exception handling.
+ * This version is for CPUs like P4 that don't have efficient micro
+ * code for rep movsq
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(copy_user_generic_unrolled)
+ ASM_STAC
+ cmpl $8,%edx
+ jb 20f /* less then 8 bytes, go to byte copy loop */
+ ALIGN_DESTINATION
+ movl %edx,%ecx
+ andl $63,%edx
+ shrl $6,%ecx
+ jz .L_copy_short_string
+1: movq (%rsi),%r8
+2: movq 1*8(%rsi),%r9
+3: movq 2*8(%rsi),%r10
+4: movq 3*8(%rsi),%r11
+5: movq %r8,(%rdi)
+6: movq %r9,1*8(%rdi)
+7: movq %r10,2*8(%rdi)
+8: movq %r11,3*8(%rdi)
+9: movq 4*8(%rsi),%r8
+10: movq 5*8(%rsi),%r9
+11: movq 6*8(%rsi),%r10
+12: movq 7*8(%rsi),%r11
+13: movq %r8,4*8(%rdi)
+14: movq %r9,5*8(%rdi)
+15: movq %r10,6*8(%rdi)
+16: movq %r11,7*8(%rdi)
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+ decl %ecx
+ jnz 1b
+.L_copy_short_string:
+ movl %edx,%ecx
+ andl $7,%edx
+ shrl $3,%ecx
+ jz 20f
+18: movq (%rsi),%r8
+19: movq %r8,(%rdi)
+ leaq 8(%rsi),%rsi
+ leaq 8(%rdi),%rdi
+ decl %ecx
+ jnz 18b
+20: andl %edx,%edx
+ jz 23f
+ movl %edx,%ecx
+21: movb (%rsi),%al
+22: movb %al,(%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz 21b
+23: xor %eax,%eax
+ ASM_CLAC
+ ret
+
+ .section .fixup,"ax"
+30: shll $6,%ecx
+ addl %ecx,%edx
+ jmp 60f
+40: leal (%rdx,%rcx,8),%edx
+ jmp 60f
+50: movl %ecx,%edx
+60: jmp copy_user_handle_tail /* ecx is zerorest also */
+ .previous
+
+ _ASM_EXTABLE(1b,30b)
+ _ASM_EXTABLE(2b,30b)
+ _ASM_EXTABLE(3b,30b)
+ _ASM_EXTABLE(4b,30b)
+ _ASM_EXTABLE(5b,30b)
+ _ASM_EXTABLE(6b,30b)
+ _ASM_EXTABLE(7b,30b)
+ _ASM_EXTABLE(8b,30b)
+ _ASM_EXTABLE(9b,30b)
+ _ASM_EXTABLE(10b,30b)
+ _ASM_EXTABLE(11b,30b)
+ _ASM_EXTABLE(12b,30b)
+ _ASM_EXTABLE(13b,30b)
+ _ASM_EXTABLE(14b,30b)
+ _ASM_EXTABLE(15b,30b)
+ _ASM_EXTABLE(16b,30b)
+ _ASM_EXTABLE(18b,40b)
+ _ASM_EXTABLE(19b,40b)
+ _ASM_EXTABLE(21b,50b)
+ _ASM_EXTABLE(22b,50b)
+ENDPROC(copy_user_generic_unrolled)
+EXPORT_SYMBOL(copy_user_generic_unrolled)
+
+/* Some CPUs run faster using the string copy instructions.
+ * This is also a lot simpler. Use them when possible.
+ *
+ * Only 4GB of copy is supported. This shouldn't be a problem
+ * because the kernel normally only writes from/to page sized chunks
+ * even if user space passed a longer buffer.
+ * And more would be dangerous because both Intel and AMD have
+ * errata with rep movsq > 4GB. If someone feels the need to fix
+ * this please consider this.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(copy_user_generic_string)
+ ASM_STAC
+ cmpl $8,%edx
+ jb 2f /* less than 8 bytes, go to byte copy loop */
+ ALIGN_DESTINATION
+ movl %edx,%ecx
+ shrl $3,%ecx
+ andl $7,%edx
+1: rep
+ movsq
+2: movl %edx,%ecx
+3: rep
+ movsb
+ xorl %eax,%eax
+ ASM_CLAC
+ ret
+
+ .section .fixup,"ax"
+11: leal (%rdx,%rcx,8),%ecx
+12: movl %ecx,%edx /* ecx is zerorest also */
+ jmp copy_user_handle_tail
+ .previous
+
+ _ASM_EXTABLE(1b,11b)
+ _ASM_EXTABLE(3b,12b)
+ENDPROC(copy_user_generic_string)
+EXPORT_SYMBOL(copy_user_generic_string)
+
+/*
+ * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
+ * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(copy_user_enhanced_fast_string)
+ ASM_STAC
+ cmpl $64,%edx
+ jb .L_copy_short_string /* less then 64 bytes, avoid the costly 'rep' */
+ movl %edx,%ecx
+1: rep
+ movsb
+ xorl %eax,%eax
+ ASM_CLAC
+ ret
+
+ .section .fixup,"ax"
+12: movl %ecx,%edx /* ecx is zerorest also */
+ jmp copy_user_handle_tail
+ .previous
+
+ _ASM_EXTABLE(1b,12b)
+ENDPROC(copy_user_enhanced_fast_string)
+EXPORT_SYMBOL(copy_user_enhanced_fast_string)
+
+/*
+ * copy_user_nocache - Uncached memory copy with exception handling
+ * This will force destination out of cache for more performance.
+ *
+ * Note: Cached memory copy is used when destination or size is not
+ * naturally aligned. That is:
+ * - Require 8-byte alignment when size is 8 bytes or larger.
+ * - Require 4-byte alignment when size is 4 bytes.
+ */
+ENTRY(__copy_user_nocache)
+ ASM_STAC
+
+ /* If size is less than 8 bytes, go to 4-byte copy */
+ cmpl $8,%edx
+ jb .L_4b_nocache_copy_entry
+
+ /* If destination is not 8-byte aligned, "cache" copy to align it */
+ ALIGN_DESTINATION
+
+ /* Set 4x8-byte copy count and remainder */
+ movl %edx,%ecx
+ andl $63,%edx
+ shrl $6,%ecx
+ jz .L_8b_nocache_copy_entry /* jump if count is 0 */
+
+ /* Perform 4x8-byte nocache loop-copy */
+.L_4x8b_nocache_copy_loop:
+1: movq (%rsi),%r8
+2: movq 1*8(%rsi),%r9
+3: movq 2*8(%rsi),%r10
+4: movq 3*8(%rsi),%r11
+5: movnti %r8,(%rdi)
+6: movnti %r9,1*8(%rdi)
+7: movnti %r10,2*8(%rdi)
+8: movnti %r11,3*8(%rdi)
+9: movq 4*8(%rsi),%r8
+10: movq 5*8(%rsi),%r9
+11: movq 6*8(%rsi),%r10
+12: movq 7*8(%rsi),%r11
+13: movnti %r8,4*8(%rdi)
+14: movnti %r9,5*8(%rdi)
+15: movnti %r10,6*8(%rdi)
+16: movnti %r11,7*8(%rdi)
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+ decl %ecx
+ jnz .L_4x8b_nocache_copy_loop
+
+ /* Set 8-byte copy count and remainder */
+.L_8b_nocache_copy_entry:
+ movl %edx,%ecx
+ andl $7,%edx
+ shrl $3,%ecx
+ jz .L_4b_nocache_copy_entry /* jump if count is 0 */
+
+ /* Perform 8-byte nocache loop-copy */
+.L_8b_nocache_copy_loop:
+20: movq (%rsi),%r8
+21: movnti %r8,(%rdi)
+ leaq 8(%rsi),%rsi
+ leaq 8(%rdi),%rdi
+ decl %ecx
+ jnz .L_8b_nocache_copy_loop
+
+ /* If no byte left, we're done */
+.L_4b_nocache_copy_entry:
+ andl %edx,%edx
+ jz .L_finish_copy
+
+ /* If destination is not 4-byte aligned, go to byte copy: */
+ movl %edi,%ecx
+ andl $3,%ecx
+ jnz .L_1b_cache_copy_entry
+
+ /* Set 4-byte copy count (1 or 0) and remainder */
+ movl %edx,%ecx
+ andl $3,%edx
+ shrl $2,%ecx
+ jz .L_1b_cache_copy_entry /* jump if count is 0 */
+
+ /* Perform 4-byte nocache copy: */
+30: movl (%rsi),%r8d
+31: movnti %r8d,(%rdi)
+ leaq 4(%rsi),%rsi
+ leaq 4(%rdi),%rdi
+
+ /* If no bytes left, we're done: */
+ andl %edx,%edx
+ jz .L_finish_copy
+
+ /* Perform byte "cache" loop-copy for the remainder */
+.L_1b_cache_copy_entry:
+ movl %edx,%ecx
+.L_1b_cache_copy_loop:
+40: movb (%rsi),%al
+41: movb %al,(%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz .L_1b_cache_copy_loop
+
+ /* Finished copying; fence the prior stores */
+.L_finish_copy:
+ xorl %eax,%eax
+ ASM_CLAC
+ sfence
+ ret
+
+ .section .fixup,"ax"
+.L_fixup_4x8b_copy:
+ shll $6,%ecx
+ addl %ecx,%edx
+ jmp .L_fixup_handle_tail
+.L_fixup_8b_copy:
+ lea (%rdx,%rcx,8),%rdx
+ jmp .L_fixup_handle_tail
+.L_fixup_4b_copy:
+ lea (%rdx,%rcx,4),%rdx
+ jmp .L_fixup_handle_tail
+.L_fixup_1b_copy:
+ movl %ecx,%edx
+.L_fixup_handle_tail:
+ sfence
+ jmp copy_user_handle_tail
+ .previous
+
+ _ASM_EXTABLE(1b,.L_fixup_4x8b_copy)
+ _ASM_EXTABLE(2b,.L_fixup_4x8b_copy)
+ _ASM_EXTABLE(3b,.L_fixup_4x8b_copy)
+ _ASM_EXTABLE(4b,.L_fixup_4x8b_copy)
+ _ASM_EXTABLE(5b,.L_fixup_4x8b_copy)
+ _ASM_EXTABLE(6b,.L_fixup_4x8b_copy)
+ _ASM_EXTABLE(7b,.L_fixup_4x8b_copy)
+ _ASM_EXTABLE(8b,.L_fixup_4x8b_copy)
+ _ASM_EXTABLE(9b,.L_fixup_4x8b_copy)
+ _ASM_EXTABLE(10b,.L_fixup_4x8b_copy)
+ _ASM_EXTABLE(11b,.L_fixup_4x8b_copy)
+ _ASM_EXTABLE(12b,.L_fixup_4x8b_copy)
+ _ASM_EXTABLE(13b,.L_fixup_4x8b_copy)
+ _ASM_EXTABLE(14b,.L_fixup_4x8b_copy)
+ _ASM_EXTABLE(15b,.L_fixup_4x8b_copy)
+ _ASM_EXTABLE(16b,.L_fixup_4x8b_copy)
+ _ASM_EXTABLE(20b,.L_fixup_8b_copy)
+ _ASM_EXTABLE(21b,.L_fixup_8b_copy)
+ _ASM_EXTABLE(30b,.L_fixup_4b_copy)
+ _ASM_EXTABLE(31b,.L_fixup_4b_copy)
+ _ASM_EXTABLE(40b,.L_fixup_1b_copy)
+ _ASM_EXTABLE(41b,.L_fixup_1b_copy)
+ENDPROC(__copy_user_nocache)
+EXPORT_SYMBOL(__copy_user_nocache)
diff --git a/arch/x86/lib/cpu.c b/arch/x86/lib/cpu.c
new file mode 100644
index 000000000..19f707992
--- /dev/null
+++ b/arch/x86/lib/cpu.c
@@ -0,0 +1,37 @@
+#include <linux/types.h>
+#include <linux/export.h>
+#include <asm/cpu.h>
+
+unsigned int x86_family(unsigned int sig)
+{
+ unsigned int x86;
+
+ x86 = (sig >> 8) & 0xf;
+
+ if (x86 == 0xf)
+ x86 += (sig >> 20) & 0xff;
+
+ return x86;
+}
+EXPORT_SYMBOL_GPL(x86_family);
+
+unsigned int x86_model(unsigned int sig)
+{
+ unsigned int fam, model;
+
+ fam = x86_family(sig);
+
+ model = (sig >> 4) & 0xf;
+
+ if (fam >= 0x6)
+ model += ((sig >> 16) & 0xf) << 4;
+
+ return model;
+}
+EXPORT_SYMBOL_GPL(x86_model);
+
+unsigned int x86_stepping(unsigned int sig)
+{
+ return sig & 0xf;
+}
+EXPORT_SYMBOL_GPL(x86_stepping);
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
new file mode 100644
index 000000000..45a53dfe1
--- /dev/null
+++ b/arch/x86/lib/csum-copy_64.S
@@ -0,0 +1,224 @@
+/*
+ * Copyright 2002, 2003 Andi Kleen, SuSE Labs.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of this archive
+ * for more details. No warranty for anything given at all.
+ */
+#include <linux/linkage.h>
+#include <asm/errno.h>
+#include <asm/asm.h>
+
+/*
+ * Checksum copy with exception handling.
+ * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
+ * destination is zeroed.
+ *
+ * Input
+ * rdi source
+ * rsi destination
+ * edx len (32bit)
+ * ecx sum (32bit)
+ * r8 src_err_ptr (int)
+ * r9 dst_err_ptr (int)
+ *
+ * Output
+ * eax 64bit sum. undefined in case of exception.
+ *
+ * Wrappers need to take care of valid exception sum and zeroing.
+ * They also should align source or destination to 8 bytes.
+ */
+
+ .macro source
+10:
+ _ASM_EXTABLE(10b, .Lbad_source)
+ .endm
+
+ .macro dest
+20:
+ _ASM_EXTABLE(20b, .Lbad_dest)
+ .endm
+
+ .macro ignore L=.Lignore
+30:
+ _ASM_EXTABLE(30b, \L)
+ .endm
+
+
+ENTRY(csum_partial_copy_generic)
+ cmpl $3*64, %edx
+ jle .Lignore
+
+.Lignore:
+ subq $7*8, %rsp
+ movq %rbx, 2*8(%rsp)
+ movq %r12, 3*8(%rsp)
+ movq %r14, 4*8(%rsp)
+ movq %r13, 5*8(%rsp)
+ movq %r15, 6*8(%rsp)
+
+ movq %r8, (%rsp)
+ movq %r9, 1*8(%rsp)
+
+ movl %ecx, %eax
+ movl %edx, %ecx
+
+ xorl %r9d, %r9d
+ movq %rcx, %r12
+
+ shrq $6, %r12
+ jz .Lhandle_tail /* < 64 */
+
+ clc
+
+ /* main loop. clear in 64 byte blocks */
+ /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
+ /* r11: temp3, rdx: temp4, r12 loopcnt */
+ /* r10: temp5, r15: temp6, r14 temp7, r13 temp8 */
+ .p2align 4
+.Lloop:
+ source
+ movq (%rdi), %rbx
+ source
+ movq 8(%rdi), %r8
+ source
+ movq 16(%rdi), %r11
+ source
+ movq 24(%rdi), %rdx
+
+ source
+ movq 32(%rdi), %r10
+ source
+ movq 40(%rdi), %r15
+ source
+ movq 48(%rdi), %r14
+ source
+ movq 56(%rdi), %r13
+
+ ignore 2f
+ prefetcht0 5*64(%rdi)
+2:
+ adcq %rbx, %rax
+ adcq %r8, %rax
+ adcq %r11, %rax
+ adcq %rdx, %rax
+ adcq %r10, %rax
+ adcq %r15, %rax
+ adcq %r14, %rax
+ adcq %r13, %rax
+
+ decl %r12d
+
+ dest
+ movq %rbx, (%rsi)
+ dest
+ movq %r8, 8(%rsi)
+ dest
+ movq %r11, 16(%rsi)
+ dest
+ movq %rdx, 24(%rsi)
+
+ dest
+ movq %r10, 32(%rsi)
+ dest
+ movq %r15, 40(%rsi)
+ dest
+ movq %r14, 48(%rsi)
+ dest
+ movq %r13, 56(%rsi)
+
+3:
+
+ leaq 64(%rdi), %rdi
+ leaq 64(%rsi), %rsi
+
+ jnz .Lloop
+
+ adcq %r9, %rax
+
+ /* do last up to 56 bytes */
+.Lhandle_tail:
+ /* ecx: count */
+ movl %ecx, %r10d
+ andl $63, %ecx
+ shrl $3, %ecx
+ jz .Lfold
+ clc
+ .p2align 4
+.Lloop_8:
+ source
+ movq (%rdi), %rbx
+ adcq %rbx, %rax
+ decl %ecx
+ dest
+ movq %rbx, (%rsi)
+ leaq 8(%rsi), %rsi /* preserve carry */
+ leaq 8(%rdi), %rdi
+ jnz .Lloop_8
+ adcq %r9, %rax /* add in carry */
+
+.Lfold:
+ /* reduce checksum to 32bits */
+ movl %eax, %ebx
+ shrq $32, %rax
+ addl %ebx, %eax
+ adcl %r9d, %eax
+
+ /* do last up to 6 bytes */
+.Lhandle_7:
+ movl %r10d, %ecx
+ andl $7, %ecx
+ shrl $1, %ecx
+ jz .Lhandle_1
+ movl $2, %edx
+ xorl %ebx, %ebx
+ clc
+ .p2align 4
+.Lloop_1:
+ source
+ movw (%rdi), %bx
+ adcl %ebx, %eax
+ decl %ecx
+ dest
+ movw %bx, (%rsi)
+ leaq 2(%rdi), %rdi
+ leaq 2(%rsi), %rsi
+ jnz .Lloop_1
+ adcl %r9d, %eax /* add in carry */
+
+ /* handle last odd byte */
+.Lhandle_1:
+ testb $1, %r10b
+ jz .Lende
+ xorl %ebx, %ebx
+ source
+ movb (%rdi), %bl
+ dest
+ movb %bl, (%rsi)
+ addl %ebx, %eax
+ adcl %r9d, %eax /* carry */
+
+.Lende:
+ movq 2*8(%rsp), %rbx
+ movq 3*8(%rsp), %r12
+ movq 4*8(%rsp), %r14
+ movq 5*8(%rsp), %r13
+ movq 6*8(%rsp), %r15
+ addq $7*8, %rsp
+ ret
+
+ /* Exception handlers. Very simple, zeroing is done in the wrappers */
+.Lbad_source:
+ movq (%rsp), %rax
+ testq %rax, %rax
+ jz .Lende
+ movl $-EFAULT, (%rax)
+ jmp .Lende
+
+.Lbad_dest:
+ movq 8(%rsp), %rax
+ testq %rax, %rax
+ jz .Lende
+ movl $-EFAULT, (%rax)
+ jmp .Lende
+ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
new file mode 100644
index 000000000..9baca3e05
--- /dev/null
+++ b/arch/x86/lib/csum-partial_64.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * arch/x86_64/lib/csum-partial.c
+ *
+ * This file contains network checksum routines that are better done
+ * in an architecture-specific manner due to speed.
+ */
+
+#include <linux/compiler.h>
+#include <linux/export.h>
+#include <asm/checksum.h>
+
+static inline unsigned short from32to16(unsigned a)
+{
+ unsigned short b = a >> 16;
+ asm("addw %w2,%w0\n\t"
+ "adcw $0,%w0\n"
+ : "=r" (b)
+ : "0" (b), "r" (a));
+ return b;
+}
+
+/*
+ * Do a 64-bit checksum on an arbitrary memory area.
+ * Returns a 32bit checksum.
+ *
+ * This isn't as time critical as it used to be because many NICs
+ * do hardware checksumming these days.
+ *
+ * Things tried and found to not make it faster:
+ * Manual Prefetching
+ * Unrolling to an 128 bytes inner loop.
+ * Using interleaving with more registers to break the carry chains.
+ */
+static unsigned do_csum(const unsigned char *buff, unsigned len)
+{
+ unsigned odd, count;
+ unsigned long result = 0;
+
+ if (unlikely(len == 0))
+ return result;
+ odd = 1 & (unsigned long) buff;
+ if (unlikely(odd)) {
+ result = *buff << 8;
+ len--;
+ buff++;
+ }
+ count = len >> 1; /* nr of 16-bit words.. */
+ if (count) {
+ if (2 & (unsigned long) buff) {
+ result += *(unsigned short *)buff;
+ count--;
+ len -= 2;
+ buff += 2;
+ }
+ count >>= 1; /* nr of 32-bit words.. */
+ if (count) {
+ unsigned long zero;
+ unsigned count64;
+ if (4 & (unsigned long) buff) {
+ result += *(unsigned int *) buff;
+ count--;
+ len -= 4;
+ buff += 4;
+ }
+ count >>= 1; /* nr of 64-bit words.. */
+
+ /* main loop using 64byte blocks */
+ zero = 0;
+ count64 = count >> 3;
+ while (count64) {
+ asm("addq 0*8(%[src]),%[res]\n\t"
+ "adcq 1*8(%[src]),%[res]\n\t"
+ "adcq 2*8(%[src]),%[res]\n\t"
+ "adcq 3*8(%[src]),%[res]\n\t"
+ "adcq 4*8(%[src]),%[res]\n\t"
+ "adcq 5*8(%[src]),%[res]\n\t"
+ "adcq 6*8(%[src]),%[res]\n\t"
+ "adcq 7*8(%[src]),%[res]\n\t"
+ "adcq %[zero],%[res]"
+ : [res] "=r" (result)
+ : [src] "r" (buff), [zero] "r" (zero),
+ "[res]" (result));
+ buff += 64;
+ count64--;
+ }
+
+ /* last up to 7 8byte blocks */
+ count %= 8;
+ while (count) {
+ asm("addq %1,%0\n\t"
+ "adcq %2,%0\n"
+ : "=r" (result)
+ : "m" (*(unsigned long *)buff),
+ "r" (zero), "0" (result));
+ --count;
+ buff += 8;
+ }
+ result = add32_with_carry(result>>32,
+ result&0xffffffff);
+
+ if (len & 4) {
+ result += *(unsigned int *) buff;
+ buff += 4;
+ }
+ }
+ if (len & 2) {
+ result += *(unsigned short *) buff;
+ buff += 2;
+ }
+ }
+ if (len & 1)
+ result += *buff;
+ result = add32_with_carry(result>>32, result & 0xffffffff);
+ if (unlikely(odd)) {
+ result = from32to16(result);
+ result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+ }
+ return result;
+}
+
+/*
+ * computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * this function must be called with even lengths, except
+ * for the last fragment, which may be odd
+ *
+ * it's best to have buff aligned on a 64-bit boundary
+ */
+__wsum csum_partial(const void *buff, int len, __wsum sum)
+{
+ return (__force __wsum)add32_with_carry(do_csum(buff, len),
+ (__force u32)sum);
+}
+EXPORT_SYMBOL(csum_partial);
+
+/*
+ * this routine is used for miscellaneous IP-like checksums, mainly
+ * in icmp.c
+ */
+__sum16 ip_compute_csum(const void *buff, int len)
+{
+ return csum_fold(csum_partial(buff,len,0));
+}
+EXPORT_SYMBOL(ip_compute_csum);
+
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
new file mode 100644
index 000000000..8bd53589e
--- /dev/null
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright 2002, 2003 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v.2
+ *
+ * Wrappers of assembly checksum functions for x86-64.
+ */
+#include <asm/checksum.h>
+#include <linux/export.h>
+#include <linux/uaccess.h>
+#include <asm/smap.h>
+
+/**
+ * csum_partial_copy_from_user - Copy and checksum from user space.
+ * @src: source address (user space)
+ * @dst: destination address
+ * @len: number of bytes to be copied.
+ * @isum: initial sum that is added into the result (32bit unfolded)
+ * @errp: set to -EFAULT for an bad source address.
+ *
+ * Returns an 32bit unfolded checksum of the buffer.
+ * src and dst are best aligned to 64bits.
+ */
+__wsum
+csum_partial_copy_from_user(const void __user *src, void *dst,
+ int len, __wsum isum, int *errp)
+{
+ might_sleep();
+ *errp = 0;
+
+ if (!likely(access_ok(VERIFY_READ, src, len)))
+ goto out_err;
+
+ /*
+ * Why 6, not 7? To handle odd addresses aligned we
+ * would need to do considerable complications to fix the
+ * checksum which is defined as an 16bit accumulator. The
+ * fix alignment code is primarily for performance
+ * compatibility with 32bit and that will handle odd
+ * addresses slowly too.
+ */
+ if (unlikely((unsigned long)src & 6)) {
+ while (((unsigned long)src & 6) && len >= 2) {
+ __u16 val16;
+
+ if (__get_user(val16, (const __u16 __user *)src))
+ goto out_err;
+
+ *(__u16 *)dst = val16;
+ isum = (__force __wsum)add32_with_carry(
+ (__force unsigned)isum, val16);
+ src += 2;
+ dst += 2;
+ len -= 2;
+ }
+ }
+ stac();
+ isum = csum_partial_copy_generic((__force const void *)src,
+ dst, len, isum, errp, NULL);
+ clac();
+ if (unlikely(*errp))
+ goto out_err;
+
+ return isum;
+
+out_err:
+ *errp = -EFAULT;
+ memset(dst, 0, len);
+
+ return isum;
+}
+EXPORT_SYMBOL(csum_partial_copy_from_user);
+
+/**
+ * csum_partial_copy_to_user - Copy and checksum to user space.
+ * @src: source address
+ * @dst: destination address (user space)
+ * @len: number of bytes to be copied.
+ * @isum: initial sum that is added into the result (32bit unfolded)
+ * @errp: set to -EFAULT for an bad destination address.
+ *
+ * Returns an 32bit unfolded checksum of the buffer.
+ * src and dst are best aligned to 64bits.
+ */
+__wsum
+csum_partial_copy_to_user(const void *src, void __user *dst,
+ int len, __wsum isum, int *errp)
+{
+ __wsum ret;
+
+ might_sleep();
+
+ if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) {
+ *errp = -EFAULT;
+ return 0;
+ }
+
+ if (unlikely((unsigned long)dst & 6)) {
+ while (((unsigned long)dst & 6) && len >= 2) {
+ __u16 val16 = *(__u16 *)src;
+
+ isum = (__force __wsum)add32_with_carry(
+ (__force unsigned)isum, val16);
+ *errp = __put_user(val16, (__u16 __user *)dst);
+ if (*errp)
+ return isum;
+ src += 2;
+ dst += 2;
+ len -= 2;
+ }
+ }
+
+ *errp = 0;
+ stac();
+ ret = csum_partial_copy_generic(src, (void __force *)dst,
+ len, isum, NULL, errp);
+ clac();
+ return ret;
+}
+EXPORT_SYMBOL(csum_partial_copy_to_user);
+
+/**
+ * csum_partial_copy_nocheck - Copy and checksum.
+ * @src: source address
+ * @dst: destination address
+ * @len: number of bytes to be copied.
+ * @sum: initial sum that is added into the result (32bit unfolded)
+ *
+ * Returns an 32bit unfolded checksum of the buffer.
+ */
+__wsum
+csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum)
+{
+ return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL);
+}
+EXPORT_SYMBOL(csum_partial_copy_nocheck);
+
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __u32 len, __u8 proto, __wsum sum)
+{
+ __u64 rest, sum64;
+
+ rest = (__force __u64)htonl(len) + (__force __u64)htons(proto) +
+ (__force __u64)sum;
+
+ asm(" addq (%[saddr]),%[sum]\n"
+ " adcq 8(%[saddr]),%[sum]\n"
+ " adcq (%[daddr]),%[sum]\n"
+ " adcq 8(%[daddr]),%[sum]\n"
+ " adcq $0,%[sum]\n"
+
+ : [sum] "=r" (sum64)
+ : "[sum]" (rest), [saddr] "r" (saddr), [daddr] "r" (daddr));
+
+ return csum_fold(
+ (__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32));
+}
+EXPORT_SYMBOL(csum_ipv6_magic);
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
new file mode 100644
index 000000000..68ca883ab
--- /dev/null
+++ b/arch/x86/lib/delay.c
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Precise Delay Loops for i386
+ *
+ * Copyright (C) 1993 Linus Torvalds
+ * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ * Copyright (C) 2008 Jiri Hladky <hladky _dot_ jiri _at_ gmail _dot_ com>
+ *
+ * The __delay function must _NOT_ be inlined as its execution time
+ * depends wildly on alignment on many x86 processors. The additional
+ * jump magic is needed to get the timing stable on all the CPU's
+ * we have to worry about.
+ */
+
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/timex.h>
+#include <linux/preempt.h>
+#include <linux/delay.h>
+
+#include <asm/processor.h>
+#include <asm/delay.h>
+#include <asm/timer.h>
+#include <asm/mwait.h>
+
+#ifdef CONFIG_SMP
+# include <asm/smp.h>
+#endif
+
+/* simple loop based delay: */
+static void delay_loop(unsigned long loops)
+{
+ asm volatile(
+ " test %0,%0 \n"
+ " jz 3f \n"
+ " jmp 1f \n"
+
+ ".align 16 \n"
+ "1: jmp 2f \n"
+
+ ".align 16 \n"
+ "2: dec %0 \n"
+ " jnz 2b \n"
+ "3: dec %0 \n"
+
+ : "+a" (loops)
+ :
+ );
+}
+
+/* TSC based delay: */
+static void delay_tsc(unsigned long __loops)
+{
+ u64 bclock, now, loops = __loops;
+ int cpu;
+
+ preempt_disable();
+ cpu = smp_processor_id();
+ bclock = rdtsc_ordered();
+ for (;;) {
+ now = rdtsc_ordered();
+ if ((now - bclock) >= loops)
+ break;
+
+ /* Allow RT tasks to run */
+ preempt_enable();
+ rep_nop();
+ preempt_disable();
+
+ /*
+ * It is possible that we moved to another CPU, and
+ * since TSC's are per-cpu we need to calculate
+ * that. The delay must guarantee that we wait "at
+ * least" the amount of time. Being moved to another
+ * CPU could make the wait longer but we just need to
+ * make sure we waited long enough. Rebalance the
+ * counter for this CPU.
+ */
+ if (unlikely(cpu != smp_processor_id())) {
+ loops -= (now - bclock);
+ cpu = smp_processor_id();
+ bclock = rdtsc_ordered();
+ }
+ }
+ preempt_enable();
+}
+
+/*
+ * On some AMD platforms, MWAITX has a configurable 32-bit timer, that
+ * counts with TSC frequency. The input value is the loop of the
+ * counter, it will exit when the timer expires.
+ */
+static void delay_mwaitx(unsigned long __loops)
+{
+ u64 start, end, delay, loops = __loops;
+
+ /*
+ * Timer value of 0 causes MWAITX to wait indefinitely, unless there
+ * is a store on the memory monitored by MONITORX.
+ */
+ if (loops == 0)
+ return;
+
+ start = rdtsc_ordered();
+
+ for (;;) {
+ delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
+
+ /*
+ * Use cpu_tss_rw as a cacheline-aligned, seldomly
+ * accessed per-cpu variable as the monitor target.
+ */
+ __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
+
+ /*
+ * AMD, like Intel's MWAIT version, supports the EAX hint and
+ * EAX=0xf0 means, do not enter any deep C-state and we use it
+ * here in delay() to minimize wakeup latency.
+ */
+ __mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
+
+ end = rdtsc_ordered();
+
+ if (loops <= end - start)
+ break;
+
+ loops -= end - start;
+
+ start = end;
+ }
+}
+
+/*
+ * Since we calibrate only once at boot, this
+ * function should be set once at boot and not changed
+ */
+static void (*delay_fn)(unsigned long) = delay_loop;
+
+void use_tsc_delay(void)
+{
+ if (delay_fn == delay_loop)
+ delay_fn = delay_tsc;
+}
+
+void use_mwaitx_delay(void)
+{
+ delay_fn = delay_mwaitx;
+}
+
+int read_current_timer(unsigned long *timer_val)
+{
+ if (delay_fn == delay_tsc) {
+ *timer_val = rdtsc();
+ return 0;
+ }
+ return -1;
+}
+
+void __delay(unsigned long loops)
+{
+ delay_fn(loops);
+}
+EXPORT_SYMBOL(__delay);
+
+void __const_udelay(unsigned long xloops)
+{
+ unsigned long lpj = this_cpu_read(cpu_info.loops_per_jiffy) ? : loops_per_jiffy;
+ int d0;
+
+ xloops *= 4;
+ asm("mull %%edx"
+ :"=d" (xloops), "=&a" (d0)
+ :"1" (xloops), "0" (lpj * (HZ / 4)));
+
+ __delay(++xloops);
+}
+EXPORT_SYMBOL(__const_udelay);
+
+void __udelay(unsigned long usecs)
+{
+ __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
+}
+EXPORT_SYMBOL(__udelay);
+
+void __ndelay(unsigned long nsecs)
+{
+ __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
+}
+EXPORT_SYMBOL(__ndelay);
diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c
new file mode 100644
index 000000000..3cdf06128
--- /dev/null
+++ b/arch/x86/lib/error-inject.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/error-injection.h>
+#include <linux/kprobes.h>
+
+asmlinkage void just_return_func(void);
+
+asm(
+ ".type just_return_func, @function\n"
+ ".globl just_return_func\n"
+ "just_return_func:\n"
+ " ret\n"
+ ".size just_return_func, .-just_return_func\n"
+);
+
+void override_function_with_return(struct pt_regs *regs)
+{
+ regs->ip = (unsigned long)&just_return_func;
+}
+NOKPROBE_SYMBOL(override_function_with_return);
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
new file mode 100644
index 000000000..49b167f73
--- /dev/null
+++ b/arch/x86/lib/getuser.S
@@ -0,0 +1,143 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * __get_user functions.
+ *
+ * (C) Copyright 1998 Linus Torvalds
+ * (C) Copyright 2005 Andi Kleen
+ * (C) Copyright 2008 Glauber Costa
+ *
+ * These functions have a non-standard call interface
+ * to make them more efficient, especially as they
+ * return an error value in addition to the "real"
+ * return value.
+ */
+
+/*
+ * __get_user_X
+ *
+ * Inputs: %[r|e]ax contains the address.
+ *
+ * Outputs: %[r|e]ax is error code (0 or -EFAULT)
+ * %[r|e]dx contains zero-extended value
+ * %ecx contains the high half for 32-bit __get_user_8
+ *
+ *
+ * These functions should not modify any other registers,
+ * as they get called from within inline assembly.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+#include <asm/errno.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+#include <asm/export.h>
+
+ .text
+ENTRY(__get_user_1)
+ mov PER_CPU_VAR(current_task), %_ASM_DX
+ cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
+ jae bad_get_user
+ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
+ and %_ASM_DX, %_ASM_AX
+ ASM_STAC
+1: movzbl (%_ASM_AX),%edx
+ xor %eax,%eax
+ ASM_CLAC
+ ret
+ENDPROC(__get_user_1)
+EXPORT_SYMBOL(__get_user_1)
+
+ENTRY(__get_user_2)
+ add $1,%_ASM_AX
+ jc bad_get_user
+ mov PER_CPU_VAR(current_task), %_ASM_DX
+ cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
+ jae bad_get_user
+ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
+ and %_ASM_DX, %_ASM_AX
+ ASM_STAC
+2: movzwl -1(%_ASM_AX),%edx
+ xor %eax,%eax
+ ASM_CLAC
+ ret
+ENDPROC(__get_user_2)
+EXPORT_SYMBOL(__get_user_2)
+
+ENTRY(__get_user_4)
+ add $3,%_ASM_AX
+ jc bad_get_user
+ mov PER_CPU_VAR(current_task), %_ASM_DX
+ cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
+ jae bad_get_user
+ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
+ and %_ASM_DX, %_ASM_AX
+ ASM_STAC
+3: movl -3(%_ASM_AX),%edx
+ xor %eax,%eax
+ ASM_CLAC
+ ret
+ENDPROC(__get_user_4)
+EXPORT_SYMBOL(__get_user_4)
+
+ENTRY(__get_user_8)
+#ifdef CONFIG_X86_64
+ add $7,%_ASM_AX
+ jc bad_get_user
+ mov PER_CPU_VAR(current_task), %_ASM_DX
+ cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
+ jae bad_get_user
+ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
+ and %_ASM_DX, %_ASM_AX
+ ASM_STAC
+4: movq -7(%_ASM_AX),%rdx
+ xor %eax,%eax
+ ASM_CLAC
+ ret
+#else
+ add $7,%_ASM_AX
+ jc bad_get_user_8
+ mov PER_CPU_VAR(current_task), %_ASM_DX
+ cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
+ jae bad_get_user_8
+ sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
+ and %_ASM_DX, %_ASM_AX
+ ASM_STAC
+4: movl -7(%_ASM_AX),%edx
+5: movl -3(%_ASM_AX),%ecx
+ xor %eax,%eax
+ ASM_CLAC
+ ret
+#endif
+ENDPROC(__get_user_8)
+EXPORT_SYMBOL(__get_user_8)
+
+
+bad_get_user:
+ xor %edx,%edx
+ mov $(-EFAULT),%_ASM_AX
+ ASM_CLAC
+ ret
+END(bad_get_user)
+
+#ifdef CONFIG_X86_32
+bad_get_user_8:
+ xor %edx,%edx
+ xor %ecx,%ecx
+ mov $(-EFAULT),%_ASM_AX
+ ASM_CLAC
+ ret
+END(bad_get_user_8)
+#endif
+
+ _ASM_EXTABLE(1b,bad_get_user)
+ _ASM_EXTABLE(2b,bad_get_user)
+ _ASM_EXTABLE(3b,bad_get_user)
+#ifdef CONFIG_X86_64
+ _ASM_EXTABLE(4b,bad_get_user)
+#else
+ _ASM_EXTABLE(4b,bad_get_user_8)
+ _ASM_EXTABLE(5b,bad_get_user_8)
+#endif
diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S
new file mode 100644
index 000000000..a14f9939c
--- /dev/null
+++ b/arch/x86/lib/hweight.S
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/export.h>
+
+#include <asm/asm.h>
+
+/*
+ * unsigned int __sw_hweight32(unsigned int w)
+ * %rdi: w
+ */
+ENTRY(__sw_hweight32)
+
+#ifdef CONFIG_X86_64
+ movl %edi, %eax # w
+#endif
+ __ASM_SIZE(push,) %__ASM_REG(dx)
+ movl %eax, %edx # w -> t
+ shrl %edx # t >>= 1
+ andl $0x55555555, %edx # t &= 0x55555555
+ subl %edx, %eax # w -= t
+
+ movl %eax, %edx # w -> t
+ shrl $2, %eax # w_tmp >>= 2
+ andl $0x33333333, %edx # t &= 0x33333333
+ andl $0x33333333, %eax # w_tmp &= 0x33333333
+ addl %edx, %eax # w = w_tmp + t
+
+ movl %eax, %edx # w -> t
+ shrl $4, %edx # t >>= 4
+ addl %edx, %eax # w_tmp += t
+ andl $0x0f0f0f0f, %eax # w_tmp &= 0x0f0f0f0f
+ imull $0x01010101, %eax, %eax # w_tmp *= 0x01010101
+ shrl $24, %eax # w = w_tmp >> 24
+ __ASM_SIZE(pop,) %__ASM_REG(dx)
+ ret
+ENDPROC(__sw_hweight32)
+EXPORT_SYMBOL(__sw_hweight32)
+
+ENTRY(__sw_hweight64)
+#ifdef CONFIG_X86_64
+ pushq %rdi
+ pushq %rdx
+
+ movq %rdi, %rdx # w -> t
+ movabsq $0x5555555555555555, %rax
+ shrq %rdx # t >>= 1
+ andq %rdx, %rax # t &= 0x5555555555555555
+ movabsq $0x3333333333333333, %rdx
+ subq %rax, %rdi # w -= t
+
+ movq %rdi, %rax # w -> t
+ shrq $2, %rdi # w_tmp >>= 2
+ andq %rdx, %rax # t &= 0x3333333333333333
+ andq %rdi, %rdx # w_tmp &= 0x3333333333333333
+ addq %rdx, %rax # w = w_tmp + t
+
+ movq %rax, %rdx # w -> t
+ shrq $4, %rdx # t >>= 4
+ addq %rdx, %rax # w_tmp += t
+ movabsq $0x0f0f0f0f0f0f0f0f, %rdx
+ andq %rdx, %rax # w_tmp &= 0x0f0f0f0f0f0f0f0f
+ movabsq $0x0101010101010101, %rdx
+ imulq %rdx, %rax # w_tmp *= 0x0101010101010101
+ shrq $56, %rax # w = w_tmp >> 56
+
+ popq %rdx
+ popq %rdi
+ ret
+#else /* CONFIG_X86_32 */
+ /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */
+ pushl %ecx
+
+ call __sw_hweight32
+ movl %eax, %ecx # stash away result
+ movl %edx, %eax # second part of input
+ call __sw_hweight32
+ addl %ecx, %eax # result
+
+ popl %ecx
+ ret
+#endif
+ENDPROC(__sw_hweight64)
+EXPORT_SYMBOL(__sw_hweight64)
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
new file mode 100644
index 000000000..c1f01a8e9
--- /dev/null
+++ b/arch/x86/lib/inat.c
@@ -0,0 +1,97 @@
+/*
+ * x86 instruction attribute tables
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+#include <asm/insn.h>
+
+/* Attribute tables are generated from opcode map */
+#include "inat-tables.c"
+
+/* Attribute search APIs */
+insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode)
+{
+ return inat_primary_table[opcode];
+}
+
+int inat_get_last_prefix_id(insn_byte_t last_pfx)
+{
+ insn_attr_t lpfx_attr;
+
+ lpfx_attr = inat_get_opcode_attribute(last_pfx);
+ return inat_last_prefix_id(lpfx_attr);
+}
+
+insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, int lpfx_id,
+ insn_attr_t esc_attr)
+{
+ const insn_attr_t *table;
+ int n;
+
+ n = inat_escape_id(esc_attr);
+
+ table = inat_escape_tables[n][0];
+ if (!table)
+ return 0;
+ if (inat_has_variant(table[opcode]) && lpfx_id) {
+ table = inat_escape_tables[n][lpfx_id];
+ if (!table)
+ return 0;
+ }
+ return table[opcode];
+}
+
+insn_attr_t inat_get_group_attribute(insn_byte_t modrm, int lpfx_id,
+ insn_attr_t grp_attr)
+{
+ const insn_attr_t *table;
+ int n;
+
+ n = inat_group_id(grp_attr);
+
+ table = inat_group_tables[n][0];
+ if (!table)
+ return inat_group_common_attribute(grp_attr);
+ if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && lpfx_id) {
+ table = inat_group_tables[n][lpfx_id];
+ if (!table)
+ return inat_group_common_attribute(grp_attr);
+ }
+ return table[X86_MODRM_REG(modrm)] |
+ inat_group_common_attribute(grp_attr);
+}
+
+insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
+ insn_byte_t vex_p)
+{
+ const insn_attr_t *table;
+ if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX)
+ return 0;
+ /* At first, this checks the master table */
+ table = inat_avx_tables[vex_m][0];
+ if (!table)
+ return 0;
+ if (!inat_is_group(table[opcode]) && vex_p) {
+ /* If this is not a group, get attribute directly */
+ table = inat_avx_tables[vex_m][vex_p];
+ if (!table)
+ return 0;
+ }
+ return table[opcode];
+}
+
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
new file mode 100644
index 000000000..3172eaf4e
--- /dev/null
+++ b/arch/x86/lib/insn-eval.c
@@ -0,0 +1,1366 @@
+/*
+ * Utility functions for x86 operand and address decoding
+ *
+ * Copyright (C) Intel Corporation 2017
+ */
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/ratelimit.h>
+#include <linux/mmu_context.h>
+#include <asm/desc_defs.h>
+#include <asm/desc.h>
+#include <asm/inat.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <asm/ldt.h>
+#include <asm/vm86.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "insn: " fmt
+
+enum reg_type {
+ REG_TYPE_RM = 0,
+ REG_TYPE_INDEX,
+ REG_TYPE_BASE,
+};
+
+/**
+ * is_string_insn() - Determine if instruction is a string instruction
+ * @insn: Instruction containing the opcode to inspect
+ *
+ * Returns:
+ *
+ * true if the instruction, determined by the opcode, is any of the
+ * string instructions as defined in the Intel Software Development manual.
+ * False otherwise.
+ */
+static bool is_string_insn(struct insn *insn)
+{
+ insn_get_opcode(insn);
+
+ /* All string instructions have a 1-byte opcode. */
+ if (insn->opcode.nbytes != 1)
+ return false;
+
+ switch (insn->opcode.bytes[0]) {
+ case 0x6c ... 0x6f: /* INS, OUTS */
+ case 0xa4 ... 0xa7: /* MOVS, CMPS */
+ case 0xaa ... 0xaf: /* STOS, LODS, SCAS */
+ return true;
+ default:
+ return false;
+ }
+}
+
+/**
+ * get_seg_reg_override_idx() - obtain segment register override index
+ * @insn: Valid instruction with segment override prefixes
+ *
+ * Inspect the instruction prefixes in @insn and find segment overrides, if any.
+ *
+ * Returns:
+ *
+ * A constant identifying the segment register to use, among CS, SS, DS,
+ * ES, FS, or GS. INAT_SEG_REG_DEFAULT is returned if no segment override
+ * prefixes were found.
+ *
+ * -EINVAL in case of error.
+ */
+static int get_seg_reg_override_idx(struct insn *insn)
+{
+ int idx = INAT_SEG_REG_DEFAULT;
+ int num_overrides = 0, i;
+ insn_byte_t p;
+
+ insn_get_prefixes(insn);
+
+ /* Look for any segment override prefixes. */
+ for_each_insn_prefix(insn, i, p) {
+ insn_attr_t attr;
+
+ attr = inat_get_opcode_attribute(p);
+ switch (attr) {
+ case INAT_MAKE_PREFIX(INAT_PFX_CS):
+ idx = INAT_SEG_REG_CS;
+ num_overrides++;
+ break;
+ case INAT_MAKE_PREFIX(INAT_PFX_SS):
+ idx = INAT_SEG_REG_SS;
+ num_overrides++;
+ break;
+ case INAT_MAKE_PREFIX(INAT_PFX_DS):
+ idx = INAT_SEG_REG_DS;
+ num_overrides++;
+ break;
+ case INAT_MAKE_PREFIX(INAT_PFX_ES):
+ idx = INAT_SEG_REG_ES;
+ num_overrides++;
+ break;
+ case INAT_MAKE_PREFIX(INAT_PFX_FS):
+ idx = INAT_SEG_REG_FS;
+ num_overrides++;
+ break;
+ case INAT_MAKE_PREFIX(INAT_PFX_GS):
+ idx = INAT_SEG_REG_GS;
+ num_overrides++;
+ break;
+ /* No default action needed. */
+ }
+ }
+
+ /* More than one segment override prefix leads to undefined behavior. */
+ if (num_overrides > 1)
+ return -EINVAL;
+
+ return idx;
+}
+
+/**
+ * check_seg_overrides() - check if segment override prefixes are allowed
+ * @insn: Valid instruction with segment override prefixes
+ * @regoff: Operand offset, in pt_regs, for which the check is performed
+ *
+ * For a particular register used in register-indirect addressing, determine if
+ * segment override prefixes can be used. Specifically, no overrides are allowed
+ * for rDI if used with a string instruction.
+ *
+ * Returns:
+ *
+ * True if segment override prefixes can be used with the register indicated
+ * in @regoff. False if otherwise.
+ */
+static bool check_seg_overrides(struct insn *insn, int regoff)
+{
+ if (regoff == offsetof(struct pt_regs, di) && is_string_insn(insn))
+ return false;
+
+ return true;
+}
+
+/**
+ * resolve_default_seg() - resolve default segment register index for an operand
+ * @insn: Instruction with opcode and address size. Must be valid.
+ * @regs: Register values as seen when entering kernel mode
+ * @off: Operand offset, in pt_regs, for which resolution is needed
+ *
+ * Resolve the default segment register index associated with the instruction
+ * operand register indicated by @off. Such index is resolved based on defaults
+ * described in the Intel Software Development Manual.
+ *
+ * Returns:
+ *
+ * If in protected mode, a constant identifying the segment register to use,
+ * among CS, SS, ES or DS. If in long mode, INAT_SEG_REG_IGNORE.
+ *
+ * -EINVAL in case of error.
+ */
+static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off)
+{
+ if (user_64bit_mode(regs))
+ return INAT_SEG_REG_IGNORE;
+ /*
+ * Resolve the default segment register as described in Section 3.7.4
+ * of the Intel Software Development Manual Vol. 1:
+ *
+ * + DS for all references involving r[ABCD]X, and rSI.
+ * + If used in a string instruction, ES for rDI. Otherwise, DS.
+ * + AX, CX and DX are not valid register operands in 16-bit address
+ * encodings but are valid for 32-bit and 64-bit encodings.
+ * + -EDOM is reserved to identify for cases in which no register
+ * is used (i.e., displacement-only addressing). Use DS.
+ * + SS for rSP or rBP.
+ * + CS for rIP.
+ */
+
+ switch (off) {
+ case offsetof(struct pt_regs, ax):
+ case offsetof(struct pt_regs, cx):
+ case offsetof(struct pt_regs, dx):
+ /* Need insn to verify address size. */
+ if (insn->addr_bytes == 2)
+ return -EINVAL;
+
+ case -EDOM:
+ case offsetof(struct pt_regs, bx):
+ case offsetof(struct pt_regs, si):
+ return INAT_SEG_REG_DS;
+
+ case offsetof(struct pt_regs, di):
+ if (is_string_insn(insn))
+ return INAT_SEG_REG_ES;
+ return INAT_SEG_REG_DS;
+
+ case offsetof(struct pt_regs, bp):
+ case offsetof(struct pt_regs, sp):
+ return INAT_SEG_REG_SS;
+
+ case offsetof(struct pt_regs, ip):
+ return INAT_SEG_REG_CS;
+
+ default:
+ return -EINVAL;
+ }
+}
+
+/**
+ * resolve_seg_reg() - obtain segment register index
+ * @insn: Instruction with operands
+ * @regs: Register values as seen when entering kernel mode
+ * @regoff: Operand offset, in pt_regs, used to deterimine segment register
+ *
+ * Determine the segment register associated with the operands and, if
+ * applicable, prefixes and the instruction pointed by @insn.
+ *
+ * The segment register associated to an operand used in register-indirect
+ * addressing depends on:
+ *
+ * a) Whether running in long mode (in such a case segments are ignored, except
+ * if FS or GS are used).
+ *
+ * b) Whether segment override prefixes can be used. Certain instructions and
+ * registers do not allow override prefixes.
+ *
+ * c) Whether segment overrides prefixes are found in the instruction prefixes.
+ *
+ * d) If there are not segment override prefixes or they cannot be used, the
+ * default segment register associated with the operand register is used.
+ *
+ * The function checks first if segment override prefixes can be used with the
+ * operand indicated by @regoff. If allowed, obtain such overridden segment
+ * register index. Lastly, if not prefixes were found or cannot be used, resolve
+ * the segment register index to use based on the defaults described in the
+ * Intel documentation. In long mode, all segment register indexes will be
+ * ignored, except if overrides were found for FS or GS. All these operations
+ * are done using helper functions.
+ *
+ * The operand register, @regoff, is represented as the offset from the base of
+ * pt_regs.
+ *
+ * As stated, the main use of this function is to determine the segment register
+ * index based on the instruction, its operands and prefixes. Hence, @insn
+ * must be valid. However, if @regoff indicates rIP, we don't need to inspect
+ * @insn at all as in this case CS is used in all cases. This case is checked
+ * before proceeding further.
+ *
+ * Please note that this function does not return the value in the segment
+ * register (i.e., the segment selector) but our defined index. The segment
+ * selector needs to be obtained using get_segment_selector() and passing the
+ * segment register index resolved by this function.
+ *
+ * Returns:
+ *
+ * An index identifying the segment register to use, among CS, SS, DS,
+ * ES, FS, or GS. INAT_SEG_REG_IGNORE is returned if running in long mode.
+ *
+ * -EINVAL in case of error.
+ */
+static int resolve_seg_reg(struct insn *insn, struct pt_regs *regs, int regoff)
+{
+ int idx;
+
+ /*
+ * In the unlikely event of having to resolve the segment register
+ * index for rIP, do it first. Segment override prefixes should not
+ * be used. Hence, it is not necessary to inspect the instruction,
+ * which may be invalid at this point.
+ */
+ if (regoff == offsetof(struct pt_regs, ip)) {
+ if (user_64bit_mode(regs))
+ return INAT_SEG_REG_IGNORE;
+ else
+ return INAT_SEG_REG_CS;
+ }
+
+ if (!insn)
+ return -EINVAL;
+
+ if (!check_seg_overrides(insn, regoff))
+ return resolve_default_seg(insn, regs, regoff);
+
+ idx = get_seg_reg_override_idx(insn);
+ if (idx < 0)
+ return idx;
+
+ if (idx == INAT_SEG_REG_DEFAULT)
+ return resolve_default_seg(insn, regs, regoff);
+
+ /*
+ * In long mode, segment override prefixes are ignored, except for
+ * overrides for FS and GS.
+ */
+ if (user_64bit_mode(regs)) {
+ if (idx != INAT_SEG_REG_FS &&
+ idx != INAT_SEG_REG_GS)
+ idx = INAT_SEG_REG_IGNORE;
+ }
+
+ return idx;
+}
+
+/**
+ * get_segment_selector() - obtain segment selector
+ * @regs: Register values as seen when entering kernel mode
+ * @seg_reg_idx: Segment register index to use
+ *
+ * Obtain the segment selector from any of the CS, SS, DS, ES, FS, GS segment
+ * registers. In CONFIG_X86_32, the segment is obtained from either pt_regs or
+ * kernel_vm86_regs as applicable. In CONFIG_X86_64, CS and SS are obtained
+ * from pt_regs. DS, ES, FS and GS are obtained by reading the actual CPU
+ * registers. This done for only for completeness as in CONFIG_X86_64 segment
+ * registers are ignored.
+ *
+ * Returns:
+ *
+ * Value of the segment selector, including null when running in
+ * long mode.
+ *
+ * -EINVAL on error.
+ */
+static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx)
+{
+#ifdef CONFIG_X86_64
+ unsigned short sel;
+
+ switch (seg_reg_idx) {
+ case INAT_SEG_REG_IGNORE:
+ return 0;
+ case INAT_SEG_REG_CS:
+ return (unsigned short)(regs->cs & 0xffff);
+ case INAT_SEG_REG_SS:
+ return (unsigned short)(regs->ss & 0xffff);
+ case INAT_SEG_REG_DS:
+ savesegment(ds, sel);
+ return sel;
+ case INAT_SEG_REG_ES:
+ savesegment(es, sel);
+ return sel;
+ case INAT_SEG_REG_FS:
+ savesegment(fs, sel);
+ return sel;
+ case INAT_SEG_REG_GS:
+ savesegment(gs, sel);
+ return sel;
+ default:
+ return -EINVAL;
+ }
+#else /* CONFIG_X86_32 */
+ struct kernel_vm86_regs *vm86regs = (struct kernel_vm86_regs *)regs;
+
+ if (v8086_mode(regs)) {
+ switch (seg_reg_idx) {
+ case INAT_SEG_REG_CS:
+ return (unsigned short)(regs->cs & 0xffff);
+ case INAT_SEG_REG_SS:
+ return (unsigned short)(regs->ss & 0xffff);
+ case INAT_SEG_REG_DS:
+ return vm86regs->ds;
+ case INAT_SEG_REG_ES:
+ return vm86regs->es;
+ case INAT_SEG_REG_FS:
+ return vm86regs->fs;
+ case INAT_SEG_REG_GS:
+ return vm86regs->gs;
+ case INAT_SEG_REG_IGNORE:
+ /* fall through */
+ default:
+ return -EINVAL;
+ }
+ }
+
+ switch (seg_reg_idx) {
+ case INAT_SEG_REG_CS:
+ return (unsigned short)(regs->cs & 0xffff);
+ case INAT_SEG_REG_SS:
+ return (unsigned short)(regs->ss & 0xffff);
+ case INAT_SEG_REG_DS:
+ return (unsigned short)(regs->ds & 0xffff);
+ case INAT_SEG_REG_ES:
+ return (unsigned short)(regs->es & 0xffff);
+ case INAT_SEG_REG_FS:
+ return (unsigned short)(regs->fs & 0xffff);
+ case INAT_SEG_REG_GS:
+ /*
+ * GS may or may not be in regs as per CONFIG_X86_32_LAZY_GS.
+ * The macro below takes care of both cases.
+ */
+ return get_user_gs(regs);
+ case INAT_SEG_REG_IGNORE:
+ /* fall through */
+ default:
+ return -EINVAL;
+ }
+#endif /* CONFIG_X86_64 */
+}
+
+static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
+ enum reg_type type)
+{
+ int regno = 0;
+
+ static const int regoff[] = {
+ offsetof(struct pt_regs, ax),
+ offsetof(struct pt_regs, cx),
+ offsetof(struct pt_regs, dx),
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, sp),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+#ifdef CONFIG_X86_64
+ offsetof(struct pt_regs, r8),
+ offsetof(struct pt_regs, r9),
+ offsetof(struct pt_regs, r10),
+ offsetof(struct pt_regs, r11),
+ offsetof(struct pt_regs, r12),
+ offsetof(struct pt_regs, r13),
+ offsetof(struct pt_regs, r14),
+ offsetof(struct pt_regs, r15),
+#endif
+ };
+ int nr_registers = ARRAY_SIZE(regoff);
+ /*
+ * Don't possibly decode a 32-bit instructions as
+ * reading a 64-bit-only register.
+ */
+ if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64)
+ nr_registers -= 8;
+
+ switch (type) {
+ case REG_TYPE_RM:
+ regno = X86_MODRM_RM(insn->modrm.value);
+
+ /*
+ * ModRM.mod == 0 and ModRM.rm == 5 means a 32-bit displacement
+ * follows the ModRM byte.
+ */
+ if (!X86_MODRM_MOD(insn->modrm.value) && regno == 5)
+ return -EDOM;
+
+ if (X86_REX_B(insn->rex_prefix.value))
+ regno += 8;
+ break;
+
+ case REG_TYPE_INDEX:
+ regno = X86_SIB_INDEX(insn->sib.value);
+ if (X86_REX_X(insn->rex_prefix.value))
+ regno += 8;
+
+ /*
+ * If ModRM.mod != 3 and SIB.index = 4 the scale*index
+ * portion of the address computation is null. This is
+ * true only if REX.X is 0. In such a case, the SIB index
+ * is used in the address computation.
+ */
+ if (X86_MODRM_MOD(insn->modrm.value) != 3 && regno == 4)
+ return -EDOM;
+ break;
+
+ case REG_TYPE_BASE:
+ regno = X86_SIB_BASE(insn->sib.value);
+ /*
+ * If ModRM.mod is 0 and SIB.base == 5, the base of the
+ * register-indirect addressing is 0. In this case, a
+ * 32-bit displacement follows the SIB byte.
+ */
+ if (!X86_MODRM_MOD(insn->modrm.value) && regno == 5)
+ return -EDOM;
+
+ if (X86_REX_B(insn->rex_prefix.value))
+ regno += 8;
+ break;
+
+ default:
+ pr_err_ratelimited("invalid register type: %d\n", type);
+ return -EINVAL;
+ }
+
+ if (regno >= nr_registers) {
+ WARN_ONCE(1, "decoded an instruction with an invalid register");
+ return -EINVAL;
+ }
+ return regoff[regno];
+}
+
+/**
+ * get_reg_offset_16() - Obtain offset of register indicated by instruction
+ * @insn: Instruction containing ModRM byte
+ * @regs: Register values as seen when entering kernel mode
+ * @offs1: Offset of the first operand register
+ * @offs2: Offset of the second opeand register, if applicable
+ *
+ * Obtain the offset, in pt_regs, of the registers indicated by the ModRM byte
+ * in @insn. This function is to be used with 16-bit address encodings. The
+ * @offs1 and @offs2 will be written with the offset of the two registers
+ * indicated by the instruction. In cases where any of the registers is not
+ * referenced by the instruction, the value will be set to -EDOM.
+ *
+ * Returns:
+ *
+ * 0 on success, -EINVAL on error.
+ */
+static int get_reg_offset_16(struct insn *insn, struct pt_regs *regs,
+ int *offs1, int *offs2)
+{
+ /*
+ * 16-bit addressing can use one or two registers. Specifics of
+ * encodings are given in Table 2-1. "16-Bit Addressing Forms with the
+ * ModR/M Byte" of the Intel Software Development Manual.
+ */
+ static const int regoff1[] = {
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, bx),
+ };
+
+ static const int regoff2[] = {
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+ -EDOM,
+ -EDOM,
+ -EDOM,
+ -EDOM,
+ };
+
+ if (!offs1 || !offs2)
+ return -EINVAL;
+
+ /* Operand is a register, use the generic function. */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3) {
+ *offs1 = insn_get_modrm_rm_off(insn, regs);
+ *offs2 = -EDOM;
+ return 0;
+ }
+
+ *offs1 = regoff1[X86_MODRM_RM(insn->modrm.value)];
+ *offs2 = regoff2[X86_MODRM_RM(insn->modrm.value)];
+
+ /*
+ * If ModRM.mod is 0 and ModRM.rm is 110b, then we use displacement-
+ * only addressing. This means that no registers are involved in
+ * computing the effective address. Thus, ensure that the first
+ * register offset is invalild. The second register offset is already
+ * invalid under the aforementioned conditions.
+ */
+ if ((X86_MODRM_MOD(insn->modrm.value) == 0) &&
+ (X86_MODRM_RM(insn->modrm.value) == 6))
+ *offs1 = -EDOM;
+
+ return 0;
+}
+
+/**
+ * get_desc() - Obtain contents of a segment descriptor
+ * @out: Segment descriptor contents on success
+ * @sel: Segment selector
+ *
+ * Given a segment selector, obtain a pointer to the segment descriptor.
+ * Both global and local descriptor tables are supported.
+ *
+ * Returns:
+ *
+ * True on success, false on failure.
+ *
+ * NULL on error.
+ */
+static bool get_desc(struct desc_struct *out, unsigned short sel)
+{
+ struct desc_ptr gdt_desc = {0, 0};
+ unsigned long desc_base;
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+ bool success = false;
+ struct ldt_struct *ldt;
+
+ /* Bits [15:3] contain the index of the desired entry. */
+ sel >>= 3;
+
+ mutex_lock(&current->active_mm->context.lock);
+ ldt = current->active_mm->context.ldt;
+ if (ldt && sel < ldt->nr_entries) {
+ *out = ldt->entries[sel];
+ success = true;
+ }
+
+ mutex_unlock(&current->active_mm->context.lock);
+
+ return success;
+ }
+#endif
+ native_store_gdt(&gdt_desc);
+
+ /*
+ * Segment descriptors have a size of 8 bytes. Thus, the index is
+ * multiplied by 8 to obtain the memory offset of the desired descriptor
+ * from the base of the GDT. As bits [15:3] of the segment selector
+ * contain the index, it can be regarded as multiplied by 8 already.
+ * All that remains is to clear bits [2:0].
+ */
+ desc_base = sel & ~(SEGMENT_RPL_MASK | SEGMENT_TI_MASK);
+
+ if (desc_base > gdt_desc.size)
+ return false;
+
+ *out = *(struct desc_struct *)(gdt_desc.address + desc_base);
+ return true;
+}
+
+/**
+ * insn_get_seg_base() - Obtain base address of segment descriptor.
+ * @regs: Register values as seen when entering kernel mode
+ * @seg_reg_idx: Index of the segment register pointing to seg descriptor
+ *
+ * Obtain the base address of the segment as indicated by the segment descriptor
+ * pointed by the segment selector. The segment selector is obtained from the
+ * input segment register index @seg_reg_idx.
+ *
+ * Returns:
+ *
+ * In protected mode, base address of the segment. Zero in long mode,
+ * except when FS or GS are used. In virtual-8086 mode, the segment
+ * selector shifted 4 bits to the right.
+ *
+ * -1L in case of error.
+ */
+unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx)
+{
+ struct desc_struct desc;
+ short sel;
+
+ sel = get_segment_selector(regs, seg_reg_idx);
+ if (sel < 0)
+ return -1L;
+
+ if (v8086_mode(regs))
+ /*
+ * Base is simply the segment selector shifted 4
+ * bits to the right.
+ */
+ return (unsigned long)(sel << 4);
+
+ if (user_64bit_mode(regs)) {
+ /*
+ * Only FS or GS will have a base address, the rest of
+ * the segments' bases are forced to 0.
+ */
+ unsigned long base;
+
+ if (seg_reg_idx == INAT_SEG_REG_FS)
+ rdmsrl(MSR_FS_BASE, base);
+ else if (seg_reg_idx == INAT_SEG_REG_GS)
+ /*
+ * swapgs was called at the kernel entry point. Thus,
+ * MSR_KERNEL_GS_BASE will have the user-space GS base.
+ */
+ rdmsrl(MSR_KERNEL_GS_BASE, base);
+ else
+ base = 0;
+ return base;
+ }
+
+ /* In protected mode the segment selector cannot be null. */
+ if (!sel)
+ return -1L;
+
+ if (!get_desc(&desc, sel))
+ return -1L;
+
+ return get_desc_base(&desc);
+}
+
+/**
+ * get_seg_limit() - Obtain the limit of a segment descriptor
+ * @regs: Register values as seen when entering kernel mode
+ * @seg_reg_idx: Index of the segment register pointing to seg descriptor
+ *
+ * Obtain the limit of the segment as indicated by the segment descriptor
+ * pointed by the segment selector. The segment selector is obtained from the
+ * input segment register index @seg_reg_idx.
+ *
+ * Returns:
+ *
+ * In protected mode, the limit of the segment descriptor in bytes.
+ * In long mode and virtual-8086 mode, segment limits are not enforced. Thus,
+ * limit is returned as -1L to imply a limit-less segment.
+ *
+ * Zero is returned on error.
+ */
+static unsigned long get_seg_limit(struct pt_regs *regs, int seg_reg_idx)
+{
+ struct desc_struct desc;
+ unsigned long limit;
+ short sel;
+
+ sel = get_segment_selector(regs, seg_reg_idx);
+ if (sel < 0)
+ return 0;
+
+ if (user_64bit_mode(regs) || v8086_mode(regs))
+ return -1L;
+
+ if (!sel)
+ return 0;
+
+ if (!get_desc(&desc, sel))
+ return 0;
+
+ /*
+ * If the granularity bit is set, the limit is given in multiples
+ * of 4096. This also means that the 12 least significant bits are
+ * not tested when checking the segment limits. In practice,
+ * this means that the segment ends in (limit << 12) + 0xfff.
+ */
+ limit = get_desc_limit(&desc);
+ if (desc.g)
+ limit = (limit << 12) + 0xfff;
+
+ return limit;
+}
+
+/**
+ * insn_get_code_seg_params() - Obtain code segment parameters
+ * @regs: Structure with register values as seen when entering kernel mode
+ *
+ * Obtain address and operand sizes of the code segment. It is obtained from the
+ * selector contained in the CS register in regs. In protected mode, the default
+ * address is determined by inspecting the L and D bits of the segment
+ * descriptor. In virtual-8086 mode, the default is always two bytes for both
+ * address and operand sizes.
+ *
+ * Returns:
+ *
+ * An int containing ORed-in default parameters on success.
+ *
+ * -EINVAL on error.
+ */
+int insn_get_code_seg_params(struct pt_regs *regs)
+{
+ struct desc_struct desc;
+ short sel;
+
+ if (v8086_mode(regs))
+ /* Address and operand size are both 16-bit. */
+ return INSN_CODE_SEG_PARAMS(2, 2);
+
+ sel = get_segment_selector(regs, INAT_SEG_REG_CS);
+ if (sel < 0)
+ return sel;
+
+ if (!get_desc(&desc, sel))
+ return -EINVAL;
+
+ /*
+ * The most significant byte of the Type field of the segment descriptor
+ * determines whether a segment contains data or code. If this is a data
+ * segment, return error.
+ */
+ if (!(desc.type & BIT(3)))
+ return -EINVAL;
+
+ switch ((desc.l << 1) | desc.d) {
+ case 0: /*
+ * Legacy mode. CS.L=0, CS.D=0. Address and operand size are
+ * both 16-bit.
+ */
+ return INSN_CODE_SEG_PARAMS(2, 2);
+ case 1: /*
+ * Legacy mode. CS.L=0, CS.D=1. Address and operand size are
+ * both 32-bit.
+ */
+ return INSN_CODE_SEG_PARAMS(4, 4);
+ case 2: /*
+ * IA-32e 64-bit mode. CS.L=1, CS.D=0. Address size is 64-bit;
+ * operand size is 32-bit.
+ */
+ return INSN_CODE_SEG_PARAMS(4, 8);
+ case 3: /* Invalid setting. CS.L=1, CS.D=1 */
+ /* fall through */
+ default:
+ return -EINVAL;
+ }
+}
+
+/**
+ * insn_get_modrm_rm_off() - Obtain register in r/m part of the ModRM byte
+ * @insn: Instruction containing the ModRM byte
+ * @regs: Register values as seen when entering kernel mode
+ *
+ * Returns:
+ *
+ * The register indicated by the r/m part of the ModRM byte. The
+ * register is obtained as an offset from the base of pt_regs. In specific
+ * cases, the returned value can be -EDOM to indicate that the particular value
+ * of ModRM does not refer to a register and shall be ignored.
+ */
+int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs)
+{
+ return get_reg_offset(insn, regs, REG_TYPE_RM);
+}
+
+/**
+ * get_seg_base_limit() - obtain base address and limit of a segment
+ * @insn: Instruction. Must be valid.
+ * @regs: Register values as seen when entering kernel mode
+ * @regoff: Operand offset, in pt_regs, used to resolve segment descriptor
+ * @base: Obtained segment base
+ * @limit: Obtained segment limit
+ *
+ * Obtain the base address and limit of the segment associated with the operand
+ * @regoff and, if any or allowed, override prefixes in @insn. This function is
+ * different from insn_get_seg_base() as the latter does not resolve the segment
+ * associated with the instruction operand. If a limit is not needed (e.g.,
+ * when running in long mode), @limit can be NULL.
+ *
+ * Returns:
+ *
+ * 0 on success. @base and @limit will contain the base address and of the
+ * resolved segment, respectively.
+ *
+ * -EINVAL on error.
+ */
+static int get_seg_base_limit(struct insn *insn, struct pt_regs *regs,
+ int regoff, unsigned long *base,
+ unsigned long *limit)
+{
+ int seg_reg_idx;
+
+ if (!base)
+ return -EINVAL;
+
+ seg_reg_idx = resolve_seg_reg(insn, regs, regoff);
+ if (seg_reg_idx < 0)
+ return seg_reg_idx;
+
+ *base = insn_get_seg_base(regs, seg_reg_idx);
+ if (*base == -1L)
+ return -EINVAL;
+
+ if (!limit)
+ return 0;
+
+ *limit = get_seg_limit(regs, seg_reg_idx);
+ if (!(*limit))
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * get_eff_addr_reg() - Obtain effective address from register operand
+ * @insn: Instruction. Must be valid.
+ * @regs: Register values as seen when entering kernel mode
+ * @regoff: Obtained operand offset, in pt_regs, with the effective address
+ * @eff_addr: Obtained effective address
+ *
+ * Obtain the effective address stored in the register operand as indicated by
+ * the ModRM byte. This function is to be used only with register addressing
+ * (i.e., ModRM.mod is 3). The effective address is saved in @eff_addr. The
+ * register operand, as an offset from the base of pt_regs, is saved in @regoff;
+ * such offset can then be used to resolve the segment associated with the
+ * operand. This function can be used with any of the supported address sizes
+ * in x86.
+ *
+ * Returns:
+ *
+ * 0 on success. @eff_addr will have the effective address stored in the
+ * operand indicated by ModRM. @regoff will have such operand as an offset from
+ * the base of pt_regs.
+ *
+ * -EINVAL on error.
+ */
+static int get_eff_addr_reg(struct insn *insn, struct pt_regs *regs,
+ int *regoff, long *eff_addr)
+{
+ insn_get_modrm(insn);
+
+ if (!insn->modrm.nbytes)
+ return -EINVAL;
+
+ if (X86_MODRM_MOD(insn->modrm.value) != 3)
+ return -EINVAL;
+
+ *regoff = get_reg_offset(insn, regs, REG_TYPE_RM);
+ if (*regoff < 0)
+ return -EINVAL;
+
+ /* Ignore bytes that are outside the address size. */
+ if (insn->addr_bytes == 2)
+ *eff_addr = regs_get_register(regs, *regoff) & 0xffff;
+ else if (insn->addr_bytes == 4)
+ *eff_addr = regs_get_register(regs, *regoff) & 0xffffffff;
+ else /* 64-bit address */
+ *eff_addr = regs_get_register(regs, *regoff);
+
+ return 0;
+}
+
+/**
+ * get_eff_addr_modrm() - Obtain referenced effective address via ModRM
+ * @insn: Instruction. Must be valid.
+ * @regs: Register values as seen when entering kernel mode
+ * @regoff: Obtained operand offset, in pt_regs, associated with segment
+ * @eff_addr: Obtained effective address
+ *
+ * Obtain the effective address referenced by the ModRM byte of @insn. After
+ * identifying the registers involved in the register-indirect memory reference,
+ * its value is obtained from the operands in @regs. The computed address is
+ * stored @eff_addr. Also, the register operand that indicates the associated
+ * segment is stored in @regoff, this parameter can later be used to determine
+ * such segment.
+ *
+ * Returns:
+ *
+ * 0 on success. @eff_addr will have the referenced effective address. @regoff
+ * will have a register, as an offset from the base of pt_regs, that can be used
+ * to resolve the associated segment.
+ *
+ * -EINVAL on error.
+ */
+static int get_eff_addr_modrm(struct insn *insn, struct pt_regs *regs,
+ int *regoff, long *eff_addr)
+{
+ long tmp;
+
+ if (insn->addr_bytes != 8 && insn->addr_bytes != 4)
+ return -EINVAL;
+
+ insn_get_modrm(insn);
+
+ if (!insn->modrm.nbytes)
+ return -EINVAL;
+
+ if (X86_MODRM_MOD(insn->modrm.value) > 2)
+ return -EINVAL;
+
+ *regoff = get_reg_offset(insn, regs, REG_TYPE_RM);
+
+ /*
+ * -EDOM means that we must ignore the address_offset. In such a case,
+ * in 64-bit mode the effective address relative to the rIP of the
+ * following instruction.
+ */
+ if (*regoff == -EDOM) {
+ if (user_64bit_mode(regs))
+ tmp = regs->ip + insn->length;
+ else
+ tmp = 0;
+ } else if (*regoff < 0) {
+ return -EINVAL;
+ } else {
+ tmp = regs_get_register(regs, *regoff);
+ }
+
+ if (insn->addr_bytes == 4) {
+ int addr32 = (int)(tmp & 0xffffffff) + insn->displacement.value;
+
+ *eff_addr = addr32 & 0xffffffff;
+ } else {
+ *eff_addr = tmp + insn->displacement.value;
+ }
+
+ return 0;
+}
+
+/**
+ * get_eff_addr_modrm_16() - Obtain referenced effective address via ModRM
+ * @insn: Instruction. Must be valid.
+ * @regs: Register values as seen when entering kernel mode
+ * @regoff: Obtained operand offset, in pt_regs, associated with segment
+ * @eff_addr: Obtained effective address
+ *
+ * Obtain the 16-bit effective address referenced by the ModRM byte of @insn.
+ * After identifying the registers involved in the register-indirect memory
+ * reference, its value is obtained from the operands in @regs. The computed
+ * address is stored @eff_addr. Also, the register operand that indicates
+ * the associated segment is stored in @regoff, this parameter can later be used
+ * to determine such segment.
+ *
+ * Returns:
+ *
+ * 0 on success. @eff_addr will have the referenced effective address. @regoff
+ * will have a register, as an offset from the base of pt_regs, that can be used
+ * to resolve the associated segment.
+ *
+ * -EINVAL on error.
+ */
+static int get_eff_addr_modrm_16(struct insn *insn, struct pt_regs *regs,
+ int *regoff, short *eff_addr)
+{
+ int addr_offset1, addr_offset2, ret;
+ short addr1 = 0, addr2 = 0, displacement;
+
+ if (insn->addr_bytes != 2)
+ return -EINVAL;
+
+ insn_get_modrm(insn);
+
+ if (!insn->modrm.nbytes)
+ return -EINVAL;
+
+ if (X86_MODRM_MOD(insn->modrm.value) > 2)
+ return -EINVAL;
+
+ ret = get_reg_offset_16(insn, regs, &addr_offset1, &addr_offset2);
+ if (ret < 0)
+ return -EINVAL;
+
+ /*
+ * Don't fail on invalid offset values. They might be invalid because
+ * they cannot be used for this particular value of ModRM. Instead, use
+ * them in the computation only if they contain a valid value.
+ */
+ if (addr_offset1 != -EDOM)
+ addr1 = regs_get_register(regs, addr_offset1) & 0xffff;
+
+ if (addr_offset2 != -EDOM)
+ addr2 = regs_get_register(regs, addr_offset2) & 0xffff;
+
+ displacement = insn->displacement.value & 0xffff;
+ *eff_addr = addr1 + addr2 + displacement;
+
+ /*
+ * The first operand register could indicate to use of either SS or DS
+ * registers to obtain the segment selector. The second operand
+ * register can only indicate the use of DS. Thus, the first operand
+ * will be used to obtain the segment selector.
+ */
+ *regoff = addr_offset1;
+
+ return 0;
+}
+
+/**
+ * get_eff_addr_sib() - Obtain referenced effective address via SIB
+ * @insn: Instruction. Must be valid.
+ * @regs: Register values as seen when entering kernel mode
+ * @regoff: Obtained operand offset, in pt_regs, associated with segment
+ * @eff_addr: Obtained effective address
+ *
+ * Obtain the effective address referenced by the SIB byte of @insn. After
+ * identifying the registers involved in the indexed, register-indirect memory
+ * reference, its value is obtained from the operands in @regs. The computed
+ * address is stored @eff_addr. Also, the register operand that indicates the
+ * associated segment is stored in @regoff, this parameter can later be used to
+ * determine such segment.
+ *
+ * Returns:
+ *
+ * 0 on success. @eff_addr will have the referenced effective address.
+ * @base_offset will have a register, as an offset from the base of pt_regs,
+ * that can be used to resolve the associated segment.
+ *
+ * -EINVAL on error.
+ */
+static int get_eff_addr_sib(struct insn *insn, struct pt_regs *regs,
+ int *base_offset, long *eff_addr)
+{
+ long base, indx;
+ int indx_offset;
+
+ if (insn->addr_bytes != 8 && insn->addr_bytes != 4)
+ return -EINVAL;
+
+ insn_get_modrm(insn);
+
+ if (!insn->modrm.nbytes)
+ return -EINVAL;
+
+ if (X86_MODRM_MOD(insn->modrm.value) > 2)
+ return -EINVAL;
+
+ insn_get_sib(insn);
+
+ if (!insn->sib.nbytes)
+ return -EINVAL;
+
+ *base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE);
+ indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX);
+
+ /*
+ * Negative values in the base and index offset means an error when
+ * decoding the SIB byte. Except -EDOM, which means that the registers
+ * should not be used in the address computation.
+ */
+ if (*base_offset == -EDOM)
+ base = 0;
+ else if (*base_offset < 0)
+ return -EINVAL;
+ else
+ base = regs_get_register(regs, *base_offset);
+
+ if (indx_offset == -EDOM)
+ indx = 0;
+ else if (indx_offset < 0)
+ return -EINVAL;
+ else
+ indx = regs_get_register(regs, indx_offset);
+
+ if (insn->addr_bytes == 4) {
+ int addr32, base32, idx32;
+
+ base32 = base & 0xffffffff;
+ idx32 = indx & 0xffffffff;
+
+ addr32 = base32 + idx32 * (1 << X86_SIB_SCALE(insn->sib.value));
+ addr32 += insn->displacement.value;
+
+ *eff_addr = addr32 & 0xffffffff;
+ } else {
+ *eff_addr = base + indx * (1 << X86_SIB_SCALE(insn->sib.value));
+ *eff_addr += insn->displacement.value;
+ }
+
+ return 0;
+}
+
+/**
+ * get_addr_ref_16() - Obtain the 16-bit address referred by instruction
+ * @insn: Instruction containing ModRM byte and displacement
+ * @regs: Register values as seen when entering kernel mode
+ *
+ * This function is to be used with 16-bit address encodings. Obtain the memory
+ * address referred by the instruction's ModRM and displacement bytes. Also, the
+ * segment used as base is determined by either any segment override prefixes in
+ * @insn or the default segment of the registers involved in the address
+ * computation. In protected mode, segment limits are enforced.
+ *
+ * Returns:
+ *
+ * Linear address referenced by the instruction operands on success.
+ *
+ * -1L on error.
+ */
+static void __user *get_addr_ref_16(struct insn *insn, struct pt_regs *regs)
+{
+ unsigned long linear_addr = -1L, seg_base, seg_limit;
+ int ret, regoff;
+ short eff_addr;
+ long tmp;
+
+ insn_get_modrm(insn);
+ insn_get_displacement(insn);
+
+ if (insn->addr_bytes != 2)
+ goto out;
+
+ if (X86_MODRM_MOD(insn->modrm.value) == 3) {
+ ret = get_eff_addr_reg(insn, regs, &regoff, &tmp);
+ if (ret)
+ goto out;
+
+ eff_addr = tmp;
+ } else {
+ ret = get_eff_addr_modrm_16(insn, regs, &regoff, &eff_addr);
+ if (ret)
+ goto out;
+ }
+
+ ret = get_seg_base_limit(insn, regs, regoff, &seg_base, &seg_limit);
+ if (ret)
+ goto out;
+
+ /*
+ * Before computing the linear address, make sure the effective address
+ * is within the limits of the segment. In virtual-8086 mode, segment
+ * limits are not enforced. In such a case, the segment limit is -1L to
+ * reflect this fact.
+ */
+ if ((unsigned long)(eff_addr & 0xffff) > seg_limit)
+ goto out;
+
+ linear_addr = (unsigned long)(eff_addr & 0xffff) + seg_base;
+
+ /* Limit linear address to 20 bits */
+ if (v8086_mode(regs))
+ linear_addr &= 0xfffff;
+
+out:
+ return (void __user *)linear_addr;
+}
+
+/**
+ * get_addr_ref_32() - Obtain a 32-bit linear address
+ * @insn: Instruction with ModRM, SIB bytes and displacement
+ * @regs: Register values as seen when entering kernel mode
+ *
+ * This function is to be used with 32-bit address encodings to obtain the
+ * linear memory address referred by the instruction's ModRM, SIB,
+ * displacement bytes and segment base address, as applicable. If in protected
+ * mode, segment limits are enforced.
+ *
+ * Returns:
+ *
+ * Linear address referenced by instruction and registers on success.
+ *
+ * -1L on error.
+ */
+static void __user *get_addr_ref_32(struct insn *insn, struct pt_regs *regs)
+{
+ unsigned long linear_addr = -1L, seg_base, seg_limit;
+ int eff_addr, regoff;
+ long tmp;
+ int ret;
+
+ if (insn->addr_bytes != 4)
+ goto out;
+
+ if (X86_MODRM_MOD(insn->modrm.value) == 3) {
+ ret = get_eff_addr_reg(insn, regs, &regoff, &tmp);
+ if (ret)
+ goto out;
+
+ eff_addr = tmp;
+
+ } else {
+ if (insn->sib.nbytes) {
+ ret = get_eff_addr_sib(insn, regs, &regoff, &tmp);
+ if (ret)
+ goto out;
+
+ eff_addr = tmp;
+ } else {
+ ret = get_eff_addr_modrm(insn, regs, &regoff, &tmp);
+ if (ret)
+ goto out;
+
+ eff_addr = tmp;
+ }
+ }
+
+ ret = get_seg_base_limit(insn, regs, regoff, &seg_base, &seg_limit);
+ if (ret)
+ goto out;
+
+ /*
+ * In protected mode, before computing the linear address, make sure
+ * the effective address is within the limits of the segment.
+ * 32-bit addresses can be used in long and virtual-8086 modes if an
+ * address override prefix is used. In such cases, segment limits are
+ * not enforced. When in virtual-8086 mode, the segment limit is -1L
+ * to reflect this situation.
+ *
+ * After computed, the effective address is treated as an unsigned
+ * quantity.
+ */
+ if (!user_64bit_mode(regs) && ((unsigned int)eff_addr > seg_limit))
+ goto out;
+
+ /*
+ * Even though 32-bit address encodings are allowed in virtual-8086
+ * mode, the address range is still limited to [0x-0xffff].
+ */
+ if (v8086_mode(regs) && (eff_addr & ~0xffff))
+ goto out;
+
+ /*
+ * Data type long could be 64 bits in size. Ensure that our 32-bit
+ * effective address is not sign-extended when computing the linear
+ * address.
+ */
+ linear_addr = (unsigned long)(eff_addr & 0xffffffff) + seg_base;
+
+ /* Limit linear address to 20 bits */
+ if (v8086_mode(regs))
+ linear_addr &= 0xfffff;
+
+out:
+ return (void __user *)linear_addr;
+}
+
+/**
+ * get_addr_ref_64() - Obtain a 64-bit linear address
+ * @insn: Instruction struct with ModRM and SIB bytes and displacement
+ * @regs: Structure with register values as seen when entering kernel mode
+ *
+ * This function is to be used with 64-bit address encodings to obtain the
+ * linear memory address referred by the instruction's ModRM, SIB,
+ * displacement bytes and segment base address, as applicable.
+ *
+ * Returns:
+ *
+ * Linear address referenced by instruction and registers on success.
+ *
+ * -1L on error.
+ */
+#ifndef CONFIG_X86_64
+static void __user *get_addr_ref_64(struct insn *insn, struct pt_regs *regs)
+{
+ return (void __user *)-1L;
+}
+#else
+static void __user *get_addr_ref_64(struct insn *insn, struct pt_regs *regs)
+{
+ unsigned long linear_addr = -1L, seg_base;
+ int regoff, ret;
+ long eff_addr;
+
+ if (insn->addr_bytes != 8)
+ goto out;
+
+ if (X86_MODRM_MOD(insn->modrm.value) == 3) {
+ ret = get_eff_addr_reg(insn, regs, &regoff, &eff_addr);
+ if (ret)
+ goto out;
+
+ } else {
+ if (insn->sib.nbytes) {
+ ret = get_eff_addr_sib(insn, regs, &regoff, &eff_addr);
+ if (ret)
+ goto out;
+ } else {
+ ret = get_eff_addr_modrm(insn, regs, &regoff, &eff_addr);
+ if (ret)
+ goto out;
+ }
+
+ }
+
+ ret = get_seg_base_limit(insn, regs, regoff, &seg_base, NULL);
+ if (ret)
+ goto out;
+
+ linear_addr = (unsigned long)eff_addr + seg_base;
+
+out:
+ return (void __user *)linear_addr;
+}
+#endif /* CONFIG_X86_64 */
+
+/**
+ * insn_get_addr_ref() - Obtain the linear address referred by instruction
+ * @insn: Instruction structure containing ModRM byte and displacement
+ * @regs: Structure with register values as seen when entering kernel mode
+ *
+ * Obtain the linear address referred by the instruction's ModRM, SIB and
+ * displacement bytes, and segment base, as applicable. In protected mode,
+ * segment limits are enforced.
+ *
+ * Returns:
+ *
+ * Linear address referenced by instruction and registers on success.
+ *
+ * -1L on error.
+ */
+void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs)
+{
+ if (!insn || !regs)
+ return (void __user *)-1L;
+
+ switch (insn->addr_bytes) {
+ case 2:
+ return get_addr_ref_16(insn, regs);
+ case 4:
+ return get_addr_ref_32(insn, regs);
+ case 8:
+ return get_addr_ref_64(insn, regs);
+ default:
+ return (void __user *)-1L;
+ }
+}
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
new file mode 100644
index 000000000..1088eb8f3
--- /dev/null
+++ b/arch/x86/lib/insn.c
@@ -0,0 +1,606 @@
+/*
+ * x86 instruction analysis
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004, 2009
+ */
+
+#ifdef __KERNEL__
+#include <linux/string.h>
+#else
+#include <string.h>
+#endif
+#include <asm/inat.h>
+#include <asm/insn.h>
+
+/* Verify next sizeof(t) bytes can be on the same instruction */
+#define validate_next(t, insn, n) \
+ ((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr)
+
+#define __get_next(t, insn) \
+ ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
+
+#define __peek_nbyte_next(t, insn, n) \
+ ({ t r = *(t*)((insn)->next_byte + n); r; })
+
+#define get_next(t, insn) \
+ ({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); })
+
+#define peek_nbyte_next(t, insn, n) \
+ ({ if (unlikely(!validate_next(t, insn, n))) goto err_out; __peek_nbyte_next(t, insn, n); })
+
+#define peek_next(t, insn) peek_nbyte_next(t, insn, 0)
+
+/**
+ * insn_init() - initialize struct insn
+ * @insn: &struct insn to be initialized
+ * @kaddr: address (in kernel memory) of instruction (or copy thereof)
+ * @x86_64: !0 for 64-bit kernel or 64-bit app
+ */
+void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64)
+{
+ /*
+ * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid
+ * even if the input buffer is long enough to hold them.
+ */
+ if (buf_len > MAX_INSN_SIZE)
+ buf_len = MAX_INSN_SIZE;
+
+ memset(insn, 0, sizeof(*insn));
+ insn->kaddr = kaddr;
+ insn->end_kaddr = kaddr + buf_len;
+ insn->next_byte = kaddr;
+ insn->x86_64 = x86_64 ? 1 : 0;
+ insn->opnd_bytes = 4;
+ if (x86_64)
+ insn->addr_bytes = 8;
+ else
+ insn->addr_bytes = 4;
+}
+
+/**
+ * insn_get_prefixes - scan x86 instruction prefix bytes
+ * @insn: &struct insn containing instruction
+ *
+ * Populates the @insn->prefixes bitmap, and updates @insn->next_byte
+ * to point to the (first) opcode. No effect if @insn->prefixes.got
+ * is already set.
+ */
+void insn_get_prefixes(struct insn *insn)
+{
+ struct insn_field *prefixes = &insn->prefixes;
+ insn_attr_t attr;
+ insn_byte_t b, lb;
+ int i, nb;
+
+ if (prefixes->got)
+ return;
+
+ nb = 0;
+ lb = 0;
+ b = peek_next(insn_byte_t, insn);
+ attr = inat_get_opcode_attribute(b);
+ while (inat_is_legacy_prefix(attr)) {
+ /* Skip if same prefix */
+ for (i = 0; i < nb; i++)
+ if (prefixes->bytes[i] == b)
+ goto found;
+ if (nb == 4)
+ /* Invalid instruction */
+ break;
+ prefixes->bytes[nb++] = b;
+ if (inat_is_address_size_prefix(attr)) {
+ /* address size switches 2/4 or 4/8 */
+ if (insn->x86_64)
+ insn->addr_bytes ^= 12;
+ else
+ insn->addr_bytes ^= 6;
+ } else if (inat_is_operand_size_prefix(attr)) {
+ /* oprand size switches 2/4 */
+ insn->opnd_bytes ^= 6;
+ }
+found:
+ prefixes->nbytes++;
+ insn->next_byte++;
+ lb = b;
+ b = peek_next(insn_byte_t, insn);
+ attr = inat_get_opcode_attribute(b);
+ }
+ /* Set the last prefix */
+ if (lb && lb != insn->prefixes.bytes[3]) {
+ if (unlikely(insn->prefixes.bytes[3])) {
+ /* Swap the last prefix */
+ b = insn->prefixes.bytes[3];
+ for (i = 0; i < nb; i++)
+ if (prefixes->bytes[i] == lb)
+ prefixes->bytes[i] = b;
+ }
+ insn->prefixes.bytes[3] = lb;
+ }
+
+ /* Decode REX prefix */
+ if (insn->x86_64) {
+ b = peek_next(insn_byte_t, insn);
+ attr = inat_get_opcode_attribute(b);
+ if (inat_is_rex_prefix(attr)) {
+ insn->rex_prefix.value = b;
+ insn->rex_prefix.nbytes = 1;
+ insn->next_byte++;
+ if (X86_REX_W(b))
+ /* REX.W overrides opnd_size */
+ insn->opnd_bytes = 8;
+ }
+ }
+ insn->rex_prefix.got = 1;
+
+ /* Decode VEX prefix */
+ b = peek_next(insn_byte_t, insn);
+ attr = inat_get_opcode_attribute(b);
+ if (inat_is_vex_prefix(attr)) {
+ insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1);
+ if (!insn->x86_64) {
+ /*
+ * In 32-bits mode, if the [7:6] bits (mod bits of
+ * ModRM) on the second byte are not 11b, it is
+ * LDS or LES or BOUND.
+ */
+ if (X86_MODRM_MOD(b2) != 3)
+ goto vex_end;
+ }
+ insn->vex_prefix.bytes[0] = b;
+ insn->vex_prefix.bytes[1] = b2;
+ if (inat_is_evex_prefix(attr)) {
+ b2 = peek_nbyte_next(insn_byte_t, insn, 2);
+ insn->vex_prefix.bytes[2] = b2;
+ b2 = peek_nbyte_next(insn_byte_t, insn, 3);
+ insn->vex_prefix.bytes[3] = b2;
+ insn->vex_prefix.nbytes = 4;
+ insn->next_byte += 4;
+ if (insn->x86_64 && X86_VEX_W(b2))
+ /* VEX.W overrides opnd_size */
+ insn->opnd_bytes = 8;
+ } else if (inat_is_vex3_prefix(attr)) {
+ b2 = peek_nbyte_next(insn_byte_t, insn, 2);
+ insn->vex_prefix.bytes[2] = b2;
+ insn->vex_prefix.nbytes = 3;
+ insn->next_byte += 3;
+ if (insn->x86_64 && X86_VEX_W(b2))
+ /* VEX.W overrides opnd_size */
+ insn->opnd_bytes = 8;
+ } else {
+ /*
+ * For VEX2, fake VEX3-like byte#2.
+ * Makes it easier to decode vex.W, vex.vvvv,
+ * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0.
+ */
+ insn->vex_prefix.bytes[2] = b2 & 0x7f;
+ insn->vex_prefix.nbytes = 2;
+ insn->next_byte += 2;
+ }
+ }
+vex_end:
+ insn->vex_prefix.got = 1;
+
+ prefixes->got = 1;
+
+err_out:
+ return;
+}
+
+/**
+ * insn_get_opcode - collect opcode(s)
+ * @insn: &struct insn containing instruction
+ *
+ * Populates @insn->opcode, updates @insn->next_byte to point past the
+ * opcode byte(s), and set @insn->attr (except for groups).
+ * If necessary, first collects any preceding (prefix) bytes.
+ * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got
+ * is already 1.
+ */
+void insn_get_opcode(struct insn *insn)
+{
+ struct insn_field *opcode = &insn->opcode;
+ insn_byte_t op;
+ int pfx_id;
+ if (opcode->got)
+ return;
+ if (!insn->prefixes.got)
+ insn_get_prefixes(insn);
+
+ /* Get first opcode */
+ op = get_next(insn_byte_t, insn);
+ opcode->bytes[0] = op;
+ opcode->nbytes = 1;
+
+ /* Check if there is VEX prefix or not */
+ if (insn_is_avx(insn)) {
+ insn_byte_t m, p;
+ m = insn_vex_m_bits(insn);
+ p = insn_vex_p_bits(insn);
+ insn->attr = inat_get_avx_attribute(op, m, p);
+ if ((inat_must_evex(insn->attr) && !insn_is_evex(insn)) ||
+ (!inat_accept_vex(insn->attr) &&
+ !inat_is_group(insn->attr)))
+ insn->attr = 0; /* This instruction is bad */
+ goto end; /* VEX has only 1 byte for opcode */
+ }
+
+ insn->attr = inat_get_opcode_attribute(op);
+ while (inat_is_escape(insn->attr)) {
+ /* Get escaped opcode */
+ op = get_next(insn_byte_t, insn);
+ opcode->bytes[opcode->nbytes++] = op;
+ pfx_id = insn_last_prefix_id(insn);
+ insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr);
+ }
+ if (inat_must_vex(insn->attr))
+ insn->attr = 0; /* This instruction is bad */
+end:
+ opcode->got = 1;
+
+err_out:
+ return;
+}
+
+/**
+ * insn_get_modrm - collect ModRM byte, if any
+ * @insn: &struct insn containing instruction
+ *
+ * Populates @insn->modrm and updates @insn->next_byte to point past the
+ * ModRM byte, if any. If necessary, first collects the preceding bytes
+ * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1.
+ */
+void insn_get_modrm(struct insn *insn)
+{
+ struct insn_field *modrm = &insn->modrm;
+ insn_byte_t pfx_id, mod;
+ if (modrm->got)
+ return;
+ if (!insn->opcode.got)
+ insn_get_opcode(insn);
+
+ if (inat_has_modrm(insn->attr)) {
+ mod = get_next(insn_byte_t, insn);
+ modrm->value = mod;
+ modrm->nbytes = 1;
+ if (inat_is_group(insn->attr)) {
+ pfx_id = insn_last_prefix_id(insn);
+ insn->attr = inat_get_group_attribute(mod, pfx_id,
+ insn->attr);
+ if (insn_is_avx(insn) && !inat_accept_vex(insn->attr))
+ insn->attr = 0; /* This is bad */
+ }
+ }
+
+ if (insn->x86_64 && inat_is_force64(insn->attr))
+ insn->opnd_bytes = 8;
+ modrm->got = 1;
+
+err_out:
+ return;
+}
+
+
+/**
+ * insn_rip_relative() - Does instruction use RIP-relative addressing mode?
+ * @insn: &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * ModRM byte. No effect if @insn->x86_64 is 0.
+ */
+int insn_rip_relative(struct insn *insn)
+{
+ struct insn_field *modrm = &insn->modrm;
+
+ if (!insn->x86_64)
+ return 0;
+ if (!modrm->got)
+ insn_get_modrm(insn);
+ /*
+ * For rip-relative instructions, the mod field (top 2 bits)
+ * is zero and the r/m field (bottom 3 bits) is 0x5.
+ */
+ return (modrm->nbytes && (modrm->value & 0xc7) == 0x5);
+}
+
+/**
+ * insn_get_sib() - Get the SIB byte of instruction
+ * @insn: &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * ModRM byte.
+ */
+void insn_get_sib(struct insn *insn)
+{
+ insn_byte_t modrm;
+
+ if (insn->sib.got)
+ return;
+ if (!insn->modrm.got)
+ insn_get_modrm(insn);
+ if (insn->modrm.nbytes) {
+ modrm = (insn_byte_t)insn->modrm.value;
+ if (insn->addr_bytes != 2 &&
+ X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) {
+ insn->sib.value = get_next(insn_byte_t, insn);
+ insn->sib.nbytes = 1;
+ }
+ }
+ insn->sib.got = 1;
+
+err_out:
+ return;
+}
+
+
+/**
+ * insn_get_displacement() - Get the displacement of instruction
+ * @insn: &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * SIB byte.
+ * Displacement value is sign-expanded.
+ */
+void insn_get_displacement(struct insn *insn)
+{
+ insn_byte_t mod, rm, base;
+
+ if (insn->displacement.got)
+ return;
+ if (!insn->sib.got)
+ insn_get_sib(insn);
+ if (insn->modrm.nbytes) {
+ /*
+ * Interpreting the modrm byte:
+ * mod = 00 - no displacement fields (exceptions below)
+ * mod = 01 - 1-byte displacement field
+ * mod = 10 - displacement field is 4 bytes, or 2 bytes if
+ * address size = 2 (0x67 prefix in 32-bit mode)
+ * mod = 11 - no memory operand
+ *
+ * If address size = 2...
+ * mod = 00, r/m = 110 - displacement field is 2 bytes
+ *
+ * If address size != 2...
+ * mod != 11, r/m = 100 - SIB byte exists
+ * mod = 00, SIB base = 101 - displacement field is 4 bytes
+ * mod = 00, r/m = 101 - rip-relative addressing, displacement
+ * field is 4 bytes
+ */
+ mod = X86_MODRM_MOD(insn->modrm.value);
+ rm = X86_MODRM_RM(insn->modrm.value);
+ base = X86_SIB_BASE(insn->sib.value);
+ if (mod == 3)
+ goto out;
+ if (mod == 1) {
+ insn->displacement.value = get_next(signed char, insn);
+ insn->displacement.nbytes = 1;
+ } else if (insn->addr_bytes == 2) {
+ if ((mod == 0 && rm == 6) || mod == 2) {
+ insn->displacement.value =
+ get_next(short, insn);
+ insn->displacement.nbytes = 2;
+ }
+ } else {
+ if ((mod == 0 && rm == 5) || mod == 2 ||
+ (mod == 0 && base == 5)) {
+ insn->displacement.value = get_next(int, insn);
+ insn->displacement.nbytes = 4;
+ }
+ }
+ }
+out:
+ insn->displacement.got = 1;
+
+err_out:
+ return;
+}
+
+/* Decode moffset16/32/64. Return 0 if failed */
+static int __get_moffset(struct insn *insn)
+{
+ switch (insn->addr_bytes) {
+ case 2:
+ insn->moffset1.value = get_next(short, insn);
+ insn->moffset1.nbytes = 2;
+ break;
+ case 4:
+ insn->moffset1.value = get_next(int, insn);
+ insn->moffset1.nbytes = 4;
+ break;
+ case 8:
+ insn->moffset1.value = get_next(int, insn);
+ insn->moffset1.nbytes = 4;
+ insn->moffset2.value = get_next(int, insn);
+ insn->moffset2.nbytes = 4;
+ break;
+ default: /* opnd_bytes must be modified manually */
+ goto err_out;
+ }
+ insn->moffset1.got = insn->moffset2.got = 1;
+
+ return 1;
+
+err_out:
+ return 0;
+}
+
+/* Decode imm v32(Iz). Return 0 if failed */
+static int __get_immv32(struct insn *insn)
+{
+ switch (insn->opnd_bytes) {
+ case 2:
+ insn->immediate.value = get_next(short, insn);
+ insn->immediate.nbytes = 2;
+ break;
+ case 4:
+ case 8:
+ insn->immediate.value = get_next(int, insn);
+ insn->immediate.nbytes = 4;
+ break;
+ default: /* opnd_bytes must be modified manually */
+ goto err_out;
+ }
+
+ return 1;
+
+err_out:
+ return 0;
+}
+
+/* Decode imm v64(Iv/Ov), Return 0 if failed */
+static int __get_immv(struct insn *insn)
+{
+ switch (insn->opnd_bytes) {
+ case 2:
+ insn->immediate1.value = get_next(short, insn);
+ insn->immediate1.nbytes = 2;
+ break;
+ case 4:
+ insn->immediate1.value = get_next(int, insn);
+ insn->immediate1.nbytes = 4;
+ break;
+ case 8:
+ insn->immediate1.value = get_next(int, insn);
+ insn->immediate1.nbytes = 4;
+ insn->immediate2.value = get_next(int, insn);
+ insn->immediate2.nbytes = 4;
+ break;
+ default: /* opnd_bytes must be modified manually */
+ goto err_out;
+ }
+ insn->immediate1.got = insn->immediate2.got = 1;
+
+ return 1;
+err_out:
+ return 0;
+}
+
+/* Decode ptr16:16/32(Ap) */
+static int __get_immptr(struct insn *insn)
+{
+ switch (insn->opnd_bytes) {
+ case 2:
+ insn->immediate1.value = get_next(short, insn);
+ insn->immediate1.nbytes = 2;
+ break;
+ case 4:
+ insn->immediate1.value = get_next(int, insn);
+ insn->immediate1.nbytes = 4;
+ break;
+ case 8:
+ /* ptr16:64 is not exist (no segment) */
+ return 0;
+ default: /* opnd_bytes must be modified manually */
+ goto err_out;
+ }
+ insn->immediate2.value = get_next(unsigned short, insn);
+ insn->immediate2.nbytes = 2;
+ insn->immediate1.got = insn->immediate2.got = 1;
+
+ return 1;
+err_out:
+ return 0;
+}
+
+/**
+ * insn_get_immediate() - Get the immediates of instruction
+ * @insn: &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * displacement bytes.
+ * Basically, most of immediates are sign-expanded. Unsigned-value can be
+ * get by bit masking with ((1 << (nbytes * 8)) - 1)
+ */
+void insn_get_immediate(struct insn *insn)
+{
+ if (insn->immediate.got)
+ return;
+ if (!insn->displacement.got)
+ insn_get_displacement(insn);
+
+ if (inat_has_moffset(insn->attr)) {
+ if (!__get_moffset(insn))
+ goto err_out;
+ goto done;
+ }
+
+ if (!inat_has_immediate(insn->attr))
+ /* no immediates */
+ goto done;
+
+ switch (inat_immediate_size(insn->attr)) {
+ case INAT_IMM_BYTE:
+ insn->immediate.value = get_next(signed char, insn);
+ insn->immediate.nbytes = 1;
+ break;
+ case INAT_IMM_WORD:
+ insn->immediate.value = get_next(short, insn);
+ insn->immediate.nbytes = 2;
+ break;
+ case INAT_IMM_DWORD:
+ insn->immediate.value = get_next(int, insn);
+ insn->immediate.nbytes = 4;
+ break;
+ case INAT_IMM_QWORD:
+ insn->immediate1.value = get_next(int, insn);
+ insn->immediate1.nbytes = 4;
+ insn->immediate2.value = get_next(int, insn);
+ insn->immediate2.nbytes = 4;
+ break;
+ case INAT_IMM_PTR:
+ if (!__get_immptr(insn))
+ goto err_out;
+ break;
+ case INAT_IMM_VWORD32:
+ if (!__get_immv32(insn))
+ goto err_out;
+ break;
+ case INAT_IMM_VWORD:
+ if (!__get_immv(insn))
+ goto err_out;
+ break;
+ default:
+ /* Here, insn must have an immediate, but failed */
+ goto err_out;
+ }
+ if (inat_has_second_immediate(insn->attr)) {
+ insn->immediate2.value = get_next(signed char, insn);
+ insn->immediate2.nbytes = 1;
+ }
+done:
+ insn->immediate.got = 1;
+
+err_out:
+ return;
+}
+
+/**
+ * insn_get_length() - Get the length of instruction
+ * @insn: &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * immediates bytes.
+ */
+void insn_get_length(struct insn *insn)
+{
+ if (insn->length)
+ return;
+ if (!insn->immediate.got)
+ insn_get_immediate(insn);
+ insn->length = (unsigned char)((unsigned long)insn->next_byte
+ - (unsigned long)insn->kaddr);
+}
diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S
new file mode 100644
index 000000000..33147fef3
--- /dev/null
+++ b/arch/x86/lib/iomap_copy_64.S
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2006 PathScale, Inc. All Rights Reserved.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/linkage.h>
+
+/*
+ * override generic version in lib/iomap_copy.c
+ */
+ENTRY(__iowrite32_copy)
+ movl %edx,%ecx
+ rep movsd
+ ret
+ENDPROC(__iowrite32_copy)
diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c
new file mode 100644
index 000000000..a53665116
--- /dev/null
+++ b/arch/x86/lib/kaslr.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Entropy functions used on early boot for KASLR base and memory
+ * randomization. The base randomization is done in the compressed
+ * kernel and memory randomization is done early when the regular
+ * kernel starts. This file is included in the compressed kernel and
+ * normally linked in the regular.
+ */
+#include <asm/asm.h>
+#include <asm/kaslr.h>
+#include <asm/msr.h>
+#include <asm/archrandom.h>
+#include <asm/e820/api.h>
+#include <asm/io.h>
+
+/*
+ * When built for the regular kernel, several functions need to be stubbed out
+ * or changed to their regular kernel equivalent.
+ */
+#ifndef KASLR_COMPRESSED_BOOT
+#include <asm/cpufeature.h>
+#include <asm/setup.h>
+
+#define debug_putstr(v) early_printk("%s", v)
+#define has_cpuflag(f) boot_cpu_has(f)
+#define get_boot_seed() kaslr_offset()
+#endif
+
+#define I8254_PORT_CONTROL 0x43
+#define I8254_PORT_COUNTER0 0x40
+#define I8254_CMD_READBACK 0xC0
+#define I8254_SELECT_COUNTER0 0x02
+#define I8254_STATUS_NOTREADY 0x40
+static inline u16 i8254(void)
+{
+ u16 status, timer;
+
+ do {
+ outb(I8254_CMD_READBACK | I8254_SELECT_COUNTER0,
+ I8254_PORT_CONTROL);
+ status = inb(I8254_PORT_COUNTER0);
+ timer = inb(I8254_PORT_COUNTER0);
+ timer |= inb(I8254_PORT_COUNTER0) << 8;
+ } while (status & I8254_STATUS_NOTREADY);
+
+ return timer;
+}
+
+unsigned long kaslr_get_random_long(const char *purpose)
+{
+#ifdef CONFIG_X86_64
+ const unsigned long mix_const = 0x5d6008cbf3848dd3UL;
+#else
+ const unsigned long mix_const = 0x3f39e593UL;
+#endif
+ unsigned long raw, random = get_boot_seed();
+ bool use_i8254 = true;
+
+ debug_putstr(purpose);
+ debug_putstr(" KASLR using");
+
+ if (has_cpuflag(X86_FEATURE_RDRAND)) {
+ debug_putstr(" RDRAND");
+ if (rdrand_long(&raw)) {
+ random ^= raw;
+ use_i8254 = false;
+ }
+ }
+
+ if (has_cpuflag(X86_FEATURE_TSC)) {
+ debug_putstr(" RDTSC");
+ raw = rdtsc();
+
+ random ^= raw;
+ use_i8254 = false;
+ }
+
+ if (use_i8254) {
+ debug_putstr(" i8254");
+ random ^= i8254();
+ }
+
+ /* Circular multiply for better bit diffusion */
+ asm(_ASM_MUL "%3"
+ : "=a" (random), "=d" (raw)
+ : "a" (random), "rm" (mix_const));
+ random += raw;
+
+ debug_putstr("...\n");
+
+ return random;
+}
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
new file mode 100644
index 000000000..e565d1c90
--- /dev/null
+++ b/arch/x86/lib/memcpy_32.c
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/string.h>
+#include <linux/export.h>
+
+#undef memcpy
+#undef memset
+
+__visible void *memcpy(void *to, const void *from, size_t n)
+{
+#if defined(CONFIG_X86_USE_3DNOW) && !defined(CONFIG_FORTIFY_SOURCE)
+ return __memcpy3d(to, from, n);
+#else
+ return __memcpy(to, from, n);
+#endif
+}
+EXPORT_SYMBOL(memcpy);
+
+__visible void *memset(void *s, int c, size_t count)
+{
+ return __memset(s, c, count);
+}
+EXPORT_SYMBOL(memset);
+
+__visible void *memmove(void *dest, const void *src, size_t n)
+{
+ int d0,d1,d2,d3,d4,d5;
+ char *ret = dest;
+
+ __asm__ __volatile__(
+ /* Handle more 16 bytes in loop */
+ "cmp $0x10, %0\n\t"
+ "jb 1f\n\t"
+
+ /* Decide forward/backward copy mode */
+ "cmp %2, %1\n\t"
+ "jb 2f\n\t"
+
+ /*
+ * movs instruction have many startup latency
+ * so we handle small size by general register.
+ */
+ "cmp $680, %0\n\t"
+ "jb 3f\n\t"
+ /*
+ * movs instruction is only good for aligned case.
+ */
+ "mov %1, %3\n\t"
+ "xor %2, %3\n\t"
+ "and $0xff, %3\n\t"
+ "jz 4f\n\t"
+ "3:\n\t"
+ "sub $0x10, %0\n\t"
+
+ /*
+ * We gobble 16 bytes forward in each loop.
+ */
+ "3:\n\t"
+ "sub $0x10, %0\n\t"
+ "mov 0*4(%1), %3\n\t"
+ "mov 1*4(%1), %4\n\t"
+ "mov %3, 0*4(%2)\n\t"
+ "mov %4, 1*4(%2)\n\t"
+ "mov 2*4(%1), %3\n\t"
+ "mov 3*4(%1), %4\n\t"
+ "mov %3, 2*4(%2)\n\t"
+ "mov %4, 3*4(%2)\n\t"
+ "lea 0x10(%1), %1\n\t"
+ "lea 0x10(%2), %2\n\t"
+ "jae 3b\n\t"
+ "add $0x10, %0\n\t"
+ "jmp 1f\n\t"
+
+ /*
+ * Handle data forward by movs.
+ */
+ ".p2align 4\n\t"
+ "4:\n\t"
+ "mov -4(%1, %0), %3\n\t"
+ "lea -4(%2, %0), %4\n\t"
+ "shr $2, %0\n\t"
+ "rep movsl\n\t"
+ "mov %3, (%4)\n\t"
+ "jmp 11f\n\t"
+ /*
+ * Handle data backward by movs.
+ */
+ ".p2align 4\n\t"
+ "6:\n\t"
+ "mov (%1), %3\n\t"
+ "mov %2, %4\n\t"
+ "lea -4(%1, %0), %1\n\t"
+ "lea -4(%2, %0), %2\n\t"
+ "shr $2, %0\n\t"
+ "std\n\t"
+ "rep movsl\n\t"
+ "mov %3,(%4)\n\t"
+ "cld\n\t"
+ "jmp 11f\n\t"
+
+ /*
+ * Start to prepare for backward copy.
+ */
+ ".p2align 4\n\t"
+ "2:\n\t"
+ "cmp $680, %0\n\t"
+ "jb 5f\n\t"
+ "mov %1, %3\n\t"
+ "xor %2, %3\n\t"
+ "and $0xff, %3\n\t"
+ "jz 6b\n\t"
+
+ /*
+ * Calculate copy position to tail.
+ */
+ "5:\n\t"
+ "add %0, %1\n\t"
+ "add %0, %2\n\t"
+ "sub $0x10, %0\n\t"
+
+ /*
+ * We gobble 16 bytes backward in each loop.
+ */
+ "7:\n\t"
+ "sub $0x10, %0\n\t"
+
+ "mov -1*4(%1), %3\n\t"
+ "mov -2*4(%1), %4\n\t"
+ "mov %3, -1*4(%2)\n\t"
+ "mov %4, -2*4(%2)\n\t"
+ "mov -3*4(%1), %3\n\t"
+ "mov -4*4(%1), %4\n\t"
+ "mov %3, -3*4(%2)\n\t"
+ "mov %4, -4*4(%2)\n\t"
+ "lea -0x10(%1), %1\n\t"
+ "lea -0x10(%2), %2\n\t"
+ "jae 7b\n\t"
+ /*
+ * Calculate copy position to head.
+ */
+ "add $0x10, %0\n\t"
+ "sub %0, %1\n\t"
+ "sub %0, %2\n\t"
+
+ /*
+ * Move data from 8 bytes to 15 bytes.
+ */
+ ".p2align 4\n\t"
+ "1:\n\t"
+ "cmp $8, %0\n\t"
+ "jb 8f\n\t"
+ "mov 0*4(%1), %3\n\t"
+ "mov 1*4(%1), %4\n\t"
+ "mov -2*4(%1, %0), %5\n\t"
+ "mov -1*4(%1, %0), %1\n\t"
+
+ "mov %3, 0*4(%2)\n\t"
+ "mov %4, 1*4(%2)\n\t"
+ "mov %5, -2*4(%2, %0)\n\t"
+ "mov %1, -1*4(%2, %0)\n\t"
+ "jmp 11f\n\t"
+
+ /*
+ * Move data from 4 bytes to 7 bytes.
+ */
+ ".p2align 4\n\t"
+ "8:\n\t"
+ "cmp $4, %0\n\t"
+ "jb 9f\n\t"
+ "mov 0*4(%1), %3\n\t"
+ "mov -1*4(%1, %0), %4\n\t"
+ "mov %3, 0*4(%2)\n\t"
+ "mov %4, -1*4(%2, %0)\n\t"
+ "jmp 11f\n\t"
+
+ /*
+ * Move data from 2 bytes to 3 bytes.
+ */
+ ".p2align 4\n\t"
+ "9:\n\t"
+ "cmp $2, %0\n\t"
+ "jb 10f\n\t"
+ "movw 0*2(%1), %%dx\n\t"
+ "movw -1*2(%1, %0), %%bx\n\t"
+ "movw %%dx, 0*2(%2)\n\t"
+ "movw %%bx, -1*2(%2, %0)\n\t"
+ "jmp 11f\n\t"
+
+ /*
+ * Move data for 1 byte.
+ */
+ ".p2align 4\n\t"
+ "10:\n\t"
+ "cmp $1, %0\n\t"
+ "jb 11f\n\t"
+ "movb (%1), %%cl\n\t"
+ "movb %%cl, (%2)\n\t"
+ ".p2align 4\n\t"
+ "11:"
+ : "=&c" (d0), "=&S" (d1), "=&D" (d2),
+ "=r" (d3),"=r" (d4), "=r"(d5)
+ :"0" (n),
+ "1" (src),
+ "2" (dest)
+ :"memory");
+
+ return ret;
+
+}
+EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
new file mode 100644
index 000000000..84b007827
--- /dev/null
+++ b/arch/x86/lib/memcpy_64.S
@@ -0,0 +1,298 @@
+/* Copyright 2002 Andi Kleen */
+
+#include <linux/linkage.h>
+#include <asm/errno.h>
+#include <asm/cpufeatures.h>
+#include <asm/mcsafe_test.h>
+#include <asm/alternative-asm.h>
+#include <asm/export.h>
+
+/*
+ * We build a jump to memcpy_orig by default which gets NOPped out on
+ * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
+ * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
+ * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
+ */
+
+/*
+ * memcpy - Copy a memory block.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * rax original destination
+ */
+ENTRY(__memcpy)
+.weak memcpy
+.p2align 4, 0x90
+memcpy:
+ ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
+ "jmp memcpy_erms", X86_FEATURE_ERMS
+
+ movq %rdi, %rax
+ movq %rdx, %rcx
+ shrq $3, %rcx
+ andl $7, %edx
+ rep movsq
+ movl %edx, %ecx
+ rep movsb
+ ret
+ENDPROC(memcpy)
+ENDPROC(__memcpy)
+EXPORT_SYMBOL(memcpy)
+EXPORT_SYMBOL(__memcpy)
+
+/*
+ * memcpy_erms() - enhanced fast string memcpy. This is faster and
+ * simpler than memcpy. Use memcpy_erms when possible.
+ */
+ENTRY(memcpy_erms)
+ movq %rdi, %rax
+ movq %rdx, %rcx
+ rep movsb
+ ret
+ENDPROC(memcpy_erms)
+
+ENTRY(memcpy_orig)
+ movq %rdi, %rax
+
+ cmpq $0x20, %rdx
+ jb .Lhandle_tail
+
+ /*
+ * We check whether memory false dependence could occur,
+ * then jump to corresponding copy mode.
+ */
+ cmp %dil, %sil
+ jl .Lcopy_backward
+ subq $0x20, %rdx
+.Lcopy_forward_loop:
+ subq $0x20, %rdx
+
+ /*
+ * Move in blocks of 4x8 bytes:
+ */
+ movq 0*8(%rsi), %r8
+ movq 1*8(%rsi), %r9
+ movq 2*8(%rsi), %r10
+ movq 3*8(%rsi), %r11
+ leaq 4*8(%rsi), %rsi
+
+ movq %r8, 0*8(%rdi)
+ movq %r9, 1*8(%rdi)
+ movq %r10, 2*8(%rdi)
+ movq %r11, 3*8(%rdi)
+ leaq 4*8(%rdi), %rdi
+ jae .Lcopy_forward_loop
+ addl $0x20, %edx
+ jmp .Lhandle_tail
+
+.Lcopy_backward:
+ /*
+ * Calculate copy position to tail.
+ */
+ addq %rdx, %rsi
+ addq %rdx, %rdi
+ subq $0x20, %rdx
+ /*
+ * At most 3 ALU operations in one cycle,
+ * so append NOPS in the same 16 bytes trunk.
+ */
+ .p2align 4
+.Lcopy_backward_loop:
+ subq $0x20, %rdx
+ movq -1*8(%rsi), %r8
+ movq -2*8(%rsi), %r9
+ movq -3*8(%rsi), %r10
+ movq -4*8(%rsi), %r11
+ leaq -4*8(%rsi), %rsi
+ movq %r8, -1*8(%rdi)
+ movq %r9, -2*8(%rdi)
+ movq %r10, -3*8(%rdi)
+ movq %r11, -4*8(%rdi)
+ leaq -4*8(%rdi), %rdi
+ jae .Lcopy_backward_loop
+
+ /*
+ * Calculate copy position to head.
+ */
+ addl $0x20, %edx
+ subq %rdx, %rsi
+ subq %rdx, %rdi
+.Lhandle_tail:
+ cmpl $16, %edx
+ jb .Lless_16bytes
+
+ /*
+ * Move data from 16 bytes to 31 bytes.
+ */
+ movq 0*8(%rsi), %r8
+ movq 1*8(%rsi), %r9
+ movq -2*8(%rsi, %rdx), %r10
+ movq -1*8(%rsi, %rdx), %r11
+ movq %r8, 0*8(%rdi)
+ movq %r9, 1*8(%rdi)
+ movq %r10, -2*8(%rdi, %rdx)
+ movq %r11, -1*8(%rdi, %rdx)
+ retq
+ .p2align 4
+.Lless_16bytes:
+ cmpl $8, %edx
+ jb .Lless_8bytes
+ /*
+ * Move data from 8 bytes to 15 bytes.
+ */
+ movq 0*8(%rsi), %r8
+ movq -1*8(%rsi, %rdx), %r9
+ movq %r8, 0*8(%rdi)
+ movq %r9, -1*8(%rdi, %rdx)
+ retq
+ .p2align 4
+.Lless_8bytes:
+ cmpl $4, %edx
+ jb .Lless_3bytes
+
+ /*
+ * Move data from 4 bytes to 7 bytes.
+ */
+ movl (%rsi), %ecx
+ movl -4(%rsi, %rdx), %r8d
+ movl %ecx, (%rdi)
+ movl %r8d, -4(%rdi, %rdx)
+ retq
+ .p2align 4
+.Lless_3bytes:
+ subl $1, %edx
+ jb .Lend
+ /*
+ * Move data from 1 bytes to 3 bytes.
+ */
+ movzbl (%rsi), %ecx
+ jz .Lstore_1byte
+ movzbq 1(%rsi), %r8
+ movzbq (%rsi, %rdx), %r9
+ movb %r8b, 1(%rdi)
+ movb %r9b, (%rdi, %rdx)
+.Lstore_1byte:
+ movb %cl, (%rdi)
+
+.Lend:
+ retq
+ENDPROC(memcpy_orig)
+
+#ifndef CONFIG_UML
+
+MCSAFE_TEST_CTL
+
+/*
+ * __memcpy_mcsafe - memory copy with machine check exception handling
+ * Note that we only catch machine checks when reading the source addresses.
+ * Writes to target are posted and don't generate machine checks.
+ */
+ENTRY(__memcpy_mcsafe)
+ cmpl $8, %edx
+ /* Less than 8 bytes? Go to byte copy loop */
+ jb .L_no_whole_words
+
+ /* Check for bad alignment of source */
+ testl $7, %esi
+ /* Already aligned */
+ jz .L_8byte_aligned
+
+ /* Copy one byte at a time until source is 8-byte aligned */
+ movl %esi, %ecx
+ andl $7, %ecx
+ subl $8, %ecx
+ negl %ecx
+ subl %ecx, %edx
+.L_read_leading_bytes:
+ movb (%rsi), %al
+ MCSAFE_TEST_SRC %rsi 1 .E_leading_bytes
+ MCSAFE_TEST_DST %rdi 1 .E_leading_bytes
+.L_write_leading_bytes:
+ movb %al, (%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz .L_read_leading_bytes
+
+.L_8byte_aligned:
+ movl %edx, %ecx
+ andl $7, %edx
+ shrl $3, %ecx
+ jz .L_no_whole_words
+
+.L_read_words:
+ movq (%rsi), %r8
+ MCSAFE_TEST_SRC %rsi 8 .E_read_words
+ MCSAFE_TEST_DST %rdi 8 .E_write_words
+.L_write_words:
+ movq %r8, (%rdi)
+ addq $8, %rsi
+ addq $8, %rdi
+ decl %ecx
+ jnz .L_read_words
+
+ /* Any trailing bytes? */
+.L_no_whole_words:
+ andl %edx, %edx
+ jz .L_done_memcpy_trap
+
+ /* Copy trailing bytes */
+ movl %edx, %ecx
+.L_read_trailing_bytes:
+ movb (%rsi), %al
+ MCSAFE_TEST_SRC %rsi 1 .E_trailing_bytes
+ MCSAFE_TEST_DST %rdi 1 .E_trailing_bytes
+.L_write_trailing_bytes:
+ movb %al, (%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz .L_read_trailing_bytes
+
+ /* Copy successful. Return zero */
+.L_done_memcpy_trap:
+ xorl %eax, %eax
+.L_done:
+ ret
+ENDPROC(__memcpy_mcsafe)
+EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
+
+ .section .fixup, "ax"
+ /*
+ * Return number of bytes not copied for any failure. Note that
+ * there is no "tail" handling since the source buffer is 8-byte
+ * aligned and poison is cacheline aligned.
+ */
+.E_read_words:
+ shll $3, %ecx
+.E_leading_bytes:
+ addl %edx, %ecx
+.E_trailing_bytes:
+ mov %ecx, %eax
+ jmp .L_done
+
+ /*
+ * For write fault handling, given the destination is unaligned,
+ * we handle faults on multi-byte writes with a byte-by-byte
+ * copy up to the write-protected page.
+ */
+.E_write_words:
+ shll $3, %ecx
+ addl %edx, %ecx
+ movl %ecx, %edx
+ jmp mcsafe_handle_tail
+
+ .previous
+
+ _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes)
+ _ASM_EXTABLE_FAULT(.L_read_words, .E_read_words)
+ _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes)
+ _ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes)
+ _ASM_EXTABLE(.L_write_words, .E_write_words)
+ _ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
+#endif
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
new file mode 100644
index 000000000..e1cfc880f
--- /dev/null
+++ b/arch/x86/lib/memmove_64.S
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Normally compiler builtins are used, but sometimes the compiler calls out
+ * of line code. Based on asm-i386/string.h.
+ *
+ * This assembly file is re-written from memmove_64.c file.
+ * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
+ */
+#include <linux/linkage.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative-asm.h>
+#include <asm/export.h>
+
+#undef memmove
+
+/*
+ * Implement memmove(). This can handle overlap between src and dst.
+ *
+ * Input:
+ * rdi: dest
+ * rsi: src
+ * rdx: count
+ *
+ * Output:
+ * rax: dest
+ */
+.weak memmove
+.p2align 4, 0x90
+memmove:
+ENTRY(__memmove)
+
+ /* Handle more 32 bytes in loop */
+ mov %rdi, %rax
+ cmp $0x20, %rdx
+ jb 1f
+
+ /* Decide forward/backward copy mode */
+ cmp %rdi, %rsi
+ jge .Lmemmove_begin_forward
+ mov %rsi, %r8
+ add %rdx, %r8
+ cmp %rdi, %r8
+ jg 2f
+
+.Lmemmove_begin_forward:
+ ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
+
+ /*
+ * movsq instruction have many startup latency
+ * so we handle small size by general register.
+ */
+ cmp $680, %rdx
+ jb 3f
+ /*
+ * movsq instruction is only good for aligned case.
+ */
+
+ cmpb %dil, %sil
+ je 4f
+3:
+ sub $0x20, %rdx
+ /*
+ * We gobble 32 bytes forward in each loop.
+ */
+5:
+ sub $0x20, %rdx
+ movq 0*8(%rsi), %r11
+ movq 1*8(%rsi), %r10
+ movq 2*8(%rsi), %r9
+ movq 3*8(%rsi), %r8
+ leaq 4*8(%rsi), %rsi
+
+ movq %r11, 0*8(%rdi)
+ movq %r10, 1*8(%rdi)
+ movq %r9, 2*8(%rdi)
+ movq %r8, 3*8(%rdi)
+ leaq 4*8(%rdi), %rdi
+ jae 5b
+ addq $0x20, %rdx
+ jmp 1f
+ /*
+ * Handle data forward by movsq.
+ */
+ .p2align 4
+4:
+ movq %rdx, %rcx
+ movq -8(%rsi, %rdx), %r11
+ lea -8(%rdi, %rdx), %r10
+ shrq $3, %rcx
+ rep movsq
+ movq %r11, (%r10)
+ jmp 13f
+.Lmemmove_end_forward:
+
+ /*
+ * Handle data backward by movsq.
+ */
+ .p2align 4
+7:
+ movq %rdx, %rcx
+ movq (%rsi), %r11
+ movq %rdi, %r10
+ leaq -8(%rsi, %rdx), %rsi
+ leaq -8(%rdi, %rdx), %rdi
+ shrq $3, %rcx
+ std
+ rep movsq
+ cld
+ movq %r11, (%r10)
+ jmp 13f
+
+ /*
+ * Start to prepare for backward copy.
+ */
+ .p2align 4
+2:
+ cmp $680, %rdx
+ jb 6f
+ cmp %dil, %sil
+ je 7b
+6:
+ /*
+ * Calculate copy position to tail.
+ */
+ addq %rdx, %rsi
+ addq %rdx, %rdi
+ subq $0x20, %rdx
+ /*
+ * We gobble 32 bytes backward in each loop.
+ */
+8:
+ subq $0x20, %rdx
+ movq -1*8(%rsi), %r11
+ movq -2*8(%rsi), %r10
+ movq -3*8(%rsi), %r9
+ movq -4*8(%rsi), %r8
+ leaq -4*8(%rsi), %rsi
+
+ movq %r11, -1*8(%rdi)
+ movq %r10, -2*8(%rdi)
+ movq %r9, -3*8(%rdi)
+ movq %r8, -4*8(%rdi)
+ leaq -4*8(%rdi), %rdi
+ jae 8b
+ /*
+ * Calculate copy position to head.
+ */
+ addq $0x20, %rdx
+ subq %rdx, %rsi
+ subq %rdx, %rdi
+1:
+ cmpq $16, %rdx
+ jb 9f
+ /*
+ * Move data from 16 bytes to 31 bytes.
+ */
+ movq 0*8(%rsi), %r11
+ movq 1*8(%rsi), %r10
+ movq -2*8(%rsi, %rdx), %r9
+ movq -1*8(%rsi, %rdx), %r8
+ movq %r11, 0*8(%rdi)
+ movq %r10, 1*8(%rdi)
+ movq %r9, -2*8(%rdi, %rdx)
+ movq %r8, -1*8(%rdi, %rdx)
+ jmp 13f
+ .p2align 4
+9:
+ cmpq $8, %rdx
+ jb 10f
+ /*
+ * Move data from 8 bytes to 15 bytes.
+ */
+ movq 0*8(%rsi), %r11
+ movq -1*8(%rsi, %rdx), %r10
+ movq %r11, 0*8(%rdi)
+ movq %r10, -1*8(%rdi, %rdx)
+ jmp 13f
+10:
+ cmpq $4, %rdx
+ jb 11f
+ /*
+ * Move data from 4 bytes to 7 bytes.
+ */
+ movl (%rsi), %r11d
+ movl -4(%rsi, %rdx), %r10d
+ movl %r11d, (%rdi)
+ movl %r10d, -4(%rdi, %rdx)
+ jmp 13f
+11:
+ cmp $2, %rdx
+ jb 12f
+ /*
+ * Move data from 2 bytes to 3 bytes.
+ */
+ movw (%rsi), %r11w
+ movw -2(%rsi, %rdx), %r10w
+ movw %r11w, (%rdi)
+ movw %r10w, -2(%rdi, %rdx)
+ jmp 13f
+12:
+ cmp $1, %rdx
+ jb 13f
+ /*
+ * Move data for 1 byte.
+ */
+ movb (%rsi), %r11b
+ movb %r11b, (%rdi)
+13:
+ retq
+ENDPROC(__memmove)
+ENDPROC(memmove)
+EXPORT_SYMBOL(__memmove)
+EXPORT_SYMBOL(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
new file mode 100644
index 000000000..084189acd
--- /dev/null
+++ b/arch/x86/lib/memset_64.S
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright 2002 Andi Kleen, SuSE Labs */
+
+#include <linux/linkage.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative-asm.h>
+#include <asm/export.h>
+
+/*
+ * ISO C memset - set a memory block to a byte value. This function uses fast
+ * string to get better performance than the original function. The code is
+ * simpler and shorter than the original function as well.
+ *
+ * rdi destination
+ * rsi value (char)
+ * rdx count (bytes)
+ *
+ * rax original destination
+ */
+.weak memset
+.p2align 4, 0x90
+memset:
+ENTRY(__memset)
+ /*
+ * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
+ * to use it when possible. If not available, use fast string instructions.
+ *
+ * Otherwise, use original memset function.
+ */
+ ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
+ "jmp memset_erms", X86_FEATURE_ERMS
+
+ movq %rdi,%r9
+ movq %rdx,%rcx
+ andl $7,%edx
+ shrq $3,%rcx
+ /* expand byte value */
+ movzbl %sil,%esi
+ movabs $0x0101010101010101,%rax
+ imulq %rsi,%rax
+ rep stosq
+ movl %edx,%ecx
+ rep stosb
+ movq %r9,%rax
+ ret
+ENDPROC(memset)
+ENDPROC(__memset)
+EXPORT_SYMBOL(memset)
+EXPORT_SYMBOL(__memset)
+
+/*
+ * ISO C memset - set a memory block to a byte value. This function uses
+ * enhanced rep stosb to override the fast string function.
+ * The code is simpler and shorter than the fast string function as well.
+ *
+ * rdi destination
+ * rsi value (char)
+ * rdx count (bytes)
+ *
+ * rax original destination
+ */
+ENTRY(memset_erms)
+ movq %rdi,%r9
+ movb %sil,%al
+ movq %rdx,%rcx
+ rep stosb
+ movq %r9,%rax
+ ret
+ENDPROC(memset_erms)
+
+ENTRY(memset_orig)
+ movq %rdi,%r10
+
+ /* expand byte value */
+ movzbl %sil,%ecx
+ movabs $0x0101010101010101,%rax
+ imulq %rcx,%rax
+
+ /* align dst */
+ movl %edi,%r9d
+ andl $7,%r9d
+ jnz .Lbad_alignment
+.Lafter_bad_alignment:
+
+ movq %rdx,%rcx
+ shrq $6,%rcx
+ jz .Lhandle_tail
+
+ .p2align 4
+.Lloop_64:
+ decq %rcx
+ movq %rax,(%rdi)
+ movq %rax,8(%rdi)
+ movq %rax,16(%rdi)
+ movq %rax,24(%rdi)
+ movq %rax,32(%rdi)
+ movq %rax,40(%rdi)
+ movq %rax,48(%rdi)
+ movq %rax,56(%rdi)
+ leaq 64(%rdi),%rdi
+ jnz .Lloop_64
+
+ /* Handle tail in loops. The loops should be faster than hard
+ to predict jump tables. */
+ .p2align 4
+.Lhandle_tail:
+ movl %edx,%ecx
+ andl $63&(~7),%ecx
+ jz .Lhandle_7
+ shrl $3,%ecx
+ .p2align 4
+.Lloop_8:
+ decl %ecx
+ movq %rax,(%rdi)
+ leaq 8(%rdi),%rdi
+ jnz .Lloop_8
+
+.Lhandle_7:
+ andl $7,%edx
+ jz .Lende
+ .p2align 4
+.Lloop_1:
+ decl %edx
+ movb %al,(%rdi)
+ leaq 1(%rdi),%rdi
+ jnz .Lloop_1
+
+.Lende:
+ movq %r10,%rax
+ ret
+
+.Lbad_alignment:
+ cmpq $7,%rdx
+ jbe .Lhandle_7
+ movq %rax,(%rdi) /* unaligned store */
+ movq $8,%r8
+ subq %r9,%r8
+ addq %r8,%rdi
+ subq %r8,%rdx
+ jmp .Lafter_bad_alignment
+.Lfinal:
+ENDPROC(memset_orig)
diff --git a/arch/x86/lib/misc.c b/arch/x86/lib/misc.c
new file mode 100644
index 000000000..a018ec4fb
--- /dev/null
+++ b/arch/x86/lib/misc.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Count the digits of @val including a possible sign.
+ *
+ * (Typed on and submitted from hpa's mobile phone.)
+ */
+int num_digits(int val)
+{
+ int m = 10;
+ int d = 1;
+
+ if (val < 0) {
+ d++;
+ val = -val;
+ }
+
+ while (val >= m) {
+ m *= 10;
+ d++;
+ }
+ return d;
+}
diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c
new file mode 100644
index 000000000..4321fa02e
--- /dev/null
+++ b/arch/x86/lib/mmx_32.c
@@ -0,0 +1,378 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * MMX 3DNow! library helper functions
+ *
+ * To do:
+ * We can use MMX just for prefetch in IRQ's. This may be a win.
+ * (reported so on K6-III)
+ * We should use a better code neutral filler for the short jump
+ * leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
+ * We also want to clobber the filler register so we don't get any
+ * register forwarding stalls on the filler.
+ *
+ * Add *user handling. Checksums are not a win with MMX on any CPU
+ * tested so far for any MMX solution figured.
+ *
+ * 22/09/2000 - Arjan van de Ven
+ * Improved for non-egineering-sample Athlons
+ *
+ */
+#include <linux/hardirq.h>
+#include <linux/string.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+#include <asm/fpu/api.h>
+#include <asm/asm.h>
+
+void *_mmx_memcpy(void *to, const void *from, size_t len)
+{
+ void *p;
+ int i;
+
+ if (unlikely(in_interrupt()))
+ return __memcpy(to, from, len);
+
+ p = to;
+ i = len >> 6; /* len/64 */
+
+ kernel_fpu_begin();
+
+ __asm__ __volatile__ (
+ "1: prefetch (%0)\n" /* This set is 28 bytes */
+ " prefetch 64(%0)\n"
+ " prefetch 128(%0)\n"
+ " prefetch 192(%0)\n"
+ " prefetch 256(%0)\n"
+ "2: \n"
+ ".section .fixup, \"ax\"\n"
+ "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : : "r" (from));
+
+ for ( ; i > 5; i--) {
+ __asm__ __volatile__ (
+ "1: prefetch 320(%0)\n"
+ "2: movq (%0), %%mm0\n"
+ " movq 8(%0), %%mm1\n"
+ " movq 16(%0), %%mm2\n"
+ " movq 24(%0), %%mm3\n"
+ " movq %%mm0, (%1)\n"
+ " movq %%mm1, 8(%1)\n"
+ " movq %%mm2, 16(%1)\n"
+ " movq %%mm3, 24(%1)\n"
+ " movq 32(%0), %%mm0\n"
+ " movq 40(%0), %%mm1\n"
+ " movq 48(%0), %%mm2\n"
+ " movq 56(%0), %%mm3\n"
+ " movq %%mm0, 32(%1)\n"
+ " movq %%mm1, 40(%1)\n"
+ " movq %%mm2, 48(%1)\n"
+ " movq %%mm3, 56(%1)\n"
+ ".section .fixup, \"ax\"\n"
+ "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : : "r" (from), "r" (to) : "memory");
+
+ from += 64;
+ to += 64;
+ }
+
+ for ( ; i > 0; i--) {
+ __asm__ __volatile__ (
+ " movq (%0), %%mm0\n"
+ " movq 8(%0), %%mm1\n"
+ " movq 16(%0), %%mm2\n"
+ " movq 24(%0), %%mm3\n"
+ " movq %%mm0, (%1)\n"
+ " movq %%mm1, 8(%1)\n"
+ " movq %%mm2, 16(%1)\n"
+ " movq %%mm3, 24(%1)\n"
+ " movq 32(%0), %%mm0\n"
+ " movq 40(%0), %%mm1\n"
+ " movq 48(%0), %%mm2\n"
+ " movq 56(%0), %%mm3\n"
+ " movq %%mm0, 32(%1)\n"
+ " movq %%mm1, 40(%1)\n"
+ " movq %%mm2, 48(%1)\n"
+ " movq %%mm3, 56(%1)\n"
+ : : "r" (from), "r" (to) : "memory");
+
+ from += 64;
+ to += 64;
+ }
+ /*
+ * Now do the tail of the block:
+ */
+ __memcpy(to, from, len & 63);
+ kernel_fpu_end();
+
+ return p;
+}
+EXPORT_SYMBOL(_mmx_memcpy);
+
+#ifdef CONFIG_MK7
+
+/*
+ * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
+ * other MMX using processors do not.
+ */
+
+static void fast_clear_page(void *page)
+{
+ int i;
+
+ kernel_fpu_begin();
+
+ __asm__ __volatile__ (
+ " pxor %%mm0, %%mm0\n" : :
+ );
+
+ for (i = 0; i < 4096/64; i++) {
+ __asm__ __volatile__ (
+ " movntq %%mm0, (%0)\n"
+ " movntq %%mm0, 8(%0)\n"
+ " movntq %%mm0, 16(%0)\n"
+ " movntq %%mm0, 24(%0)\n"
+ " movntq %%mm0, 32(%0)\n"
+ " movntq %%mm0, 40(%0)\n"
+ " movntq %%mm0, 48(%0)\n"
+ " movntq %%mm0, 56(%0)\n"
+ : : "r" (page) : "memory");
+ page += 64;
+ }
+
+ /*
+ * Since movntq is weakly-ordered, a "sfence" is needed to become
+ * ordered again:
+ */
+ __asm__ __volatile__("sfence\n"::);
+
+ kernel_fpu_end();
+}
+
+static void fast_copy_page(void *to, void *from)
+{
+ int i;
+
+ kernel_fpu_begin();
+
+ /*
+ * maybe the prefetch stuff can go before the expensive fnsave...
+ * but that is for later. -AV
+ */
+ __asm__ __volatile__(
+ "1: prefetch (%0)\n"
+ " prefetch 64(%0)\n"
+ " prefetch 128(%0)\n"
+ " prefetch 192(%0)\n"
+ " prefetch 256(%0)\n"
+ "2: \n"
+ ".section .fixup, \"ax\"\n"
+ "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b) : : "r" (from));
+
+ for (i = 0; i < (4096-320)/64; i++) {
+ __asm__ __volatile__ (
+ "1: prefetch 320(%0)\n"
+ "2: movq (%0), %%mm0\n"
+ " movntq %%mm0, (%1)\n"
+ " movq 8(%0), %%mm1\n"
+ " movntq %%mm1, 8(%1)\n"
+ " movq 16(%0), %%mm2\n"
+ " movntq %%mm2, 16(%1)\n"
+ " movq 24(%0), %%mm3\n"
+ " movntq %%mm3, 24(%1)\n"
+ " movq 32(%0), %%mm4\n"
+ " movntq %%mm4, 32(%1)\n"
+ " movq 40(%0), %%mm5\n"
+ " movntq %%mm5, 40(%1)\n"
+ " movq 48(%0), %%mm6\n"
+ " movntq %%mm6, 48(%1)\n"
+ " movq 56(%0), %%mm7\n"
+ " movntq %%mm7, 56(%1)\n"
+ ".section .fixup, \"ax\"\n"
+ "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");
+
+ from += 64;
+ to += 64;
+ }
+
+ for (i = (4096-320)/64; i < 4096/64; i++) {
+ __asm__ __volatile__ (
+ "2: movq (%0), %%mm0\n"
+ " movntq %%mm0, (%1)\n"
+ " movq 8(%0), %%mm1\n"
+ " movntq %%mm1, 8(%1)\n"
+ " movq 16(%0), %%mm2\n"
+ " movntq %%mm2, 16(%1)\n"
+ " movq 24(%0), %%mm3\n"
+ " movntq %%mm3, 24(%1)\n"
+ " movq 32(%0), %%mm4\n"
+ " movntq %%mm4, 32(%1)\n"
+ " movq 40(%0), %%mm5\n"
+ " movntq %%mm5, 40(%1)\n"
+ " movq 48(%0), %%mm6\n"
+ " movntq %%mm6, 48(%1)\n"
+ " movq 56(%0), %%mm7\n"
+ " movntq %%mm7, 56(%1)\n"
+ : : "r" (from), "r" (to) : "memory");
+ from += 64;
+ to += 64;
+ }
+ /*
+ * Since movntq is weakly-ordered, a "sfence" is needed to become
+ * ordered again:
+ */
+ __asm__ __volatile__("sfence \n"::);
+ kernel_fpu_end();
+}
+
+#else /* CONFIG_MK7 */
+
+/*
+ * Generic MMX implementation without K7 specific streaming
+ */
+static void fast_clear_page(void *page)
+{
+ int i;
+
+ kernel_fpu_begin();
+
+ __asm__ __volatile__ (
+ " pxor %%mm0, %%mm0\n" : :
+ );
+
+ for (i = 0; i < 4096/128; i++) {
+ __asm__ __volatile__ (
+ " movq %%mm0, (%0)\n"
+ " movq %%mm0, 8(%0)\n"
+ " movq %%mm0, 16(%0)\n"
+ " movq %%mm0, 24(%0)\n"
+ " movq %%mm0, 32(%0)\n"
+ " movq %%mm0, 40(%0)\n"
+ " movq %%mm0, 48(%0)\n"
+ " movq %%mm0, 56(%0)\n"
+ " movq %%mm0, 64(%0)\n"
+ " movq %%mm0, 72(%0)\n"
+ " movq %%mm0, 80(%0)\n"
+ " movq %%mm0, 88(%0)\n"
+ " movq %%mm0, 96(%0)\n"
+ " movq %%mm0, 104(%0)\n"
+ " movq %%mm0, 112(%0)\n"
+ " movq %%mm0, 120(%0)\n"
+ : : "r" (page) : "memory");
+ page += 128;
+ }
+
+ kernel_fpu_end();
+}
+
+static void fast_copy_page(void *to, void *from)
+{
+ int i;
+
+ kernel_fpu_begin();
+
+ __asm__ __volatile__ (
+ "1: prefetch (%0)\n"
+ " prefetch 64(%0)\n"
+ " prefetch 128(%0)\n"
+ " prefetch 192(%0)\n"
+ " prefetch 256(%0)\n"
+ "2: \n"
+ ".section .fixup, \"ax\"\n"
+ "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b) : : "r" (from));
+
+ for (i = 0; i < 4096/64; i++) {
+ __asm__ __volatile__ (
+ "1: prefetch 320(%0)\n"
+ "2: movq (%0), %%mm0\n"
+ " movq 8(%0), %%mm1\n"
+ " movq 16(%0), %%mm2\n"
+ " movq 24(%0), %%mm3\n"
+ " movq %%mm0, (%1)\n"
+ " movq %%mm1, 8(%1)\n"
+ " movq %%mm2, 16(%1)\n"
+ " movq %%mm3, 24(%1)\n"
+ " movq 32(%0), %%mm0\n"
+ " movq 40(%0), %%mm1\n"
+ " movq 48(%0), %%mm2\n"
+ " movq 56(%0), %%mm3\n"
+ " movq %%mm0, 32(%1)\n"
+ " movq %%mm1, 40(%1)\n"
+ " movq %%mm2, 48(%1)\n"
+ " movq %%mm3, 56(%1)\n"
+ ".section .fixup, \"ax\"\n"
+ "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : : "r" (from), "r" (to) : "memory");
+
+ from += 64;
+ to += 64;
+ }
+ kernel_fpu_end();
+}
+
+#endif /* !CONFIG_MK7 */
+
+/*
+ * Favour MMX for page clear and copy:
+ */
+static void slow_zero_page(void *page)
+{
+ int d0, d1;
+
+ __asm__ __volatile__(
+ "cld\n\t"
+ "rep ; stosl"
+
+ : "=&c" (d0), "=&D" (d1)
+ :"a" (0), "1" (page), "0" (1024)
+ :"memory");
+}
+
+void mmx_clear_page(void *page)
+{
+ if (unlikely(in_interrupt()))
+ slow_zero_page(page);
+ else
+ fast_clear_page(page);
+}
+EXPORT_SYMBOL(mmx_clear_page);
+
+static void slow_copy_page(void *to, void *from)
+{
+ int d0, d1, d2;
+
+ __asm__ __volatile__(
+ "cld\n\t"
+ "rep ; movsl"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (1024), "1" ((long) to), "2" ((long) from)
+ : "memory");
+}
+
+void mmx_copy_page(void *to, void *from)
+{
+ if (unlikely(in_interrupt()))
+ slow_copy_page(to, from);
+ else
+ fast_copy_page(to, from);
+}
+EXPORT_SYMBOL(mmx_copy_page);
diff --git a/arch/x86/lib/msr-reg-export.c b/arch/x86/lib/msr-reg-export.c
new file mode 100644
index 000000000..876b4168a
--- /dev/null
+++ b/arch/x86/lib/msr-reg-export.c
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/export.h>
+#include <asm/msr.h>
+
+EXPORT_SYMBOL(rdmsr_safe_regs);
+EXPORT_SYMBOL(wrmsr_safe_regs);
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
new file mode 100644
index 000000000..ed33cbab3
--- /dev/null
+++ b/arch/x86/lib/msr-reg.S
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <linux/errno.h>
+#include <asm/asm.h>
+#include <asm/msr.h>
+
+#ifdef CONFIG_X86_64
+/*
+ * int {rdmsr,wrmsr}_safe_regs(u32 gprs[8]);
+ *
+ * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi]
+ *
+ */
+.macro op_safe_regs op
+ENTRY(\op\()_safe_regs)
+ pushq %rbx
+ pushq %r12
+ movq %rdi, %r10 /* Save pointer */
+ xorl %r11d, %r11d /* Return value */
+ movl (%rdi), %eax
+ movl 4(%rdi), %ecx
+ movl 8(%rdi), %edx
+ movl 12(%rdi), %ebx
+ movl 20(%rdi), %r12d
+ movl 24(%rdi), %esi
+ movl 28(%rdi), %edi
+1: \op
+2: movl %eax, (%r10)
+ movl %r11d, %eax /* Return value */
+ movl %ecx, 4(%r10)
+ movl %edx, 8(%r10)
+ movl %ebx, 12(%r10)
+ movl %r12d, 20(%r10)
+ movl %esi, 24(%r10)
+ movl %edi, 28(%r10)
+ popq %r12
+ popq %rbx
+ ret
+3:
+ movl $-EIO, %r11d
+ jmp 2b
+
+ _ASM_EXTABLE(1b, 3b)
+ENDPROC(\op\()_safe_regs)
+.endm
+
+#else /* X86_32 */
+
+.macro op_safe_regs op
+ENTRY(\op\()_safe_regs)
+ pushl %ebx
+ pushl %ebp
+ pushl %esi
+ pushl %edi
+ pushl $0 /* Return value */
+ pushl %eax
+ movl 4(%eax), %ecx
+ movl 8(%eax), %edx
+ movl 12(%eax), %ebx
+ movl 20(%eax), %ebp
+ movl 24(%eax), %esi
+ movl 28(%eax), %edi
+ movl (%eax), %eax
+1: \op
+2: pushl %eax
+ movl 4(%esp), %eax
+ popl (%eax)
+ addl $4, %esp
+ movl %ecx, 4(%eax)
+ movl %edx, 8(%eax)
+ movl %ebx, 12(%eax)
+ movl %ebp, 20(%eax)
+ movl %esi, 24(%eax)
+ movl %edi, 28(%eax)
+ popl %eax
+ popl %edi
+ popl %esi
+ popl %ebp
+ popl %ebx
+ ret
+3:
+ movl $-EIO, 4(%esp)
+ jmp 2b
+
+ _ASM_EXTABLE(1b, 3b)
+ENDPROC(\op\()_safe_regs)
+.endm
+
+#endif
+
+op_safe_regs rdmsr
+op_safe_regs wrmsr
+
diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c
new file mode 100644
index 000000000..9009393f4
--- /dev/null
+++ b/arch/x86/lib/msr-smp.c
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/export.h>
+#include <linux/preempt.h>
+#include <linux/smp.h>
+#include <linux/completion.h>
+#include <asm/msr.h>
+
+static void __rdmsr_on_cpu(void *info)
+{
+ struct msr_info *rv = info;
+ struct msr *reg;
+ int this_cpu = raw_smp_processor_id();
+
+ if (rv->msrs)
+ reg = per_cpu_ptr(rv->msrs, this_cpu);
+ else
+ reg = &rv->reg;
+
+ rdmsr(rv->msr_no, reg->l, reg->h);
+}
+
+static void __wrmsr_on_cpu(void *info)
+{
+ struct msr_info *rv = info;
+ struct msr *reg;
+ int this_cpu = raw_smp_processor_id();
+
+ if (rv->msrs)
+ reg = per_cpu_ptr(rv->msrs, this_cpu);
+ else
+ reg = &rv->reg;
+
+ wrmsr(rv->msr_no, reg->l, reg->h);
+}
+
+int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
+ *l = rv.reg.l;
+ *h = rv.reg.h;
+
+ return err;
+}
+EXPORT_SYMBOL(rdmsr_on_cpu);
+
+int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
+ *q = rv.reg.q;
+
+ return err;
+}
+EXPORT_SYMBOL(rdmsrl_on_cpu);
+
+int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ rv.reg.l = l;
+ rv.reg.h = h;
+ err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
+
+ return err;
+}
+EXPORT_SYMBOL(wrmsr_on_cpu);
+
+int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ rv.reg.q = q;
+
+ err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
+
+ return err;
+}
+EXPORT_SYMBOL(wrmsrl_on_cpu);
+
+static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no,
+ struct msr *msrs,
+ void (*msr_func) (void *info))
+{
+ struct msr_info rv;
+ int this_cpu;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msrs = msrs;
+ rv.msr_no = msr_no;
+
+ this_cpu = get_cpu();
+
+ if (cpumask_test_cpu(this_cpu, mask))
+ msr_func(&rv);
+
+ smp_call_function_many(mask, msr_func, &rv, 1);
+ put_cpu();
+}
+
+/* rdmsr on a bunch of CPUs
+ *
+ * @mask: which CPUs
+ * @msr_no: which MSR
+ * @msrs: array of MSR values
+ *
+ */
+void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
+{
+ __rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu);
+}
+EXPORT_SYMBOL(rdmsr_on_cpus);
+
+/*
+ * wrmsr on a bunch of CPUs
+ *
+ * @mask: which CPUs
+ * @msr_no: which MSR
+ * @msrs: array of MSR values
+ *
+ */
+void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
+{
+ __rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu);
+}
+EXPORT_SYMBOL(wrmsr_on_cpus);
+
+struct msr_info_completion {
+ struct msr_info msr;
+ struct completion done;
+};
+
+/* These "safe" variants are slower and should be used when the target MSR
+ may not actually exist. */
+static void __rdmsr_safe_on_cpu(void *info)
+{
+ struct msr_info_completion *rv = info;
+
+ rv->msr.err = rdmsr_safe(rv->msr.msr_no, &rv->msr.reg.l, &rv->msr.reg.h);
+ complete(&rv->done);
+}
+
+static void __wrmsr_safe_on_cpu(void *info)
+{
+ struct msr_info *rv = info;
+
+ rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h);
+}
+
+int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+ struct msr_info_completion rv;
+ call_single_data_t csd = {
+ .func = __rdmsr_safe_on_cpu,
+ .info = &rv,
+ };
+ int err;
+
+ memset(&rv, 0, sizeof(rv));
+ init_completion(&rv.done);
+ rv.msr.msr_no = msr_no;
+
+ err = smp_call_function_single_async(cpu, &csd);
+ if (!err) {
+ wait_for_completion(&rv.done);
+ err = rv.msr.err;
+ }
+ *l = rv.msr.reg.l;
+ *h = rv.msr.reg.h;
+
+ return err;
+}
+EXPORT_SYMBOL(rdmsr_safe_on_cpu);
+
+int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ rv.reg.l = l;
+ rv.reg.h = h;
+ err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
+
+ return err ? err : rv.err;
+}
+EXPORT_SYMBOL(wrmsr_safe_on_cpu);
+
+int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
+{
+ int err;
+ struct msr_info rv;
+
+ memset(&rv, 0, sizeof(rv));
+
+ rv.msr_no = msr_no;
+ rv.reg.q = q;
+
+ err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
+
+ return err ? err : rv.err;
+}
+EXPORT_SYMBOL(wrmsrl_safe_on_cpu);
+
+int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q)
+{
+ u32 low, high;
+ int err;
+
+ err = rdmsr_safe_on_cpu(cpu, msr_no, &low, &high);
+ *q = (u64)high << 32 | low;
+
+ return err;
+}
+EXPORT_SYMBOL(rdmsrl_safe_on_cpu);
+
+/*
+ * These variants are significantly slower, but allows control over
+ * the entire 32-bit GPR set.
+ */
+static void __rdmsr_safe_regs_on_cpu(void *info)
+{
+ struct msr_regs_info *rv = info;
+
+ rv->err = rdmsr_safe_regs(rv->regs);
+}
+
+static void __wrmsr_safe_regs_on_cpu(void *info)
+{
+ struct msr_regs_info *rv = info;
+
+ rv->err = wrmsr_safe_regs(rv->regs);
+}
+
+int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
+{
+ int err;
+ struct msr_regs_info rv;
+
+ rv.regs = regs;
+ rv.err = -EIO;
+ err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1);
+
+ return err ? err : rv.err;
+}
+EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu);
+
+int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
+{
+ int err;
+ struct msr_regs_info rv;
+
+ rv.regs = regs;
+ rv.err = -EIO;
+ err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1);
+
+ return err ? err : rv.err;
+}
+EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu);
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
new file mode 100644
index 000000000..3bd905e10
--- /dev/null
+++ b/arch/x86/lib/msr.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/export.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <asm/msr.h>
+#define CREATE_TRACE_POINTS
+#include <asm/msr-trace.h>
+
+struct msr *msrs_alloc(void)
+{
+ struct msr *msrs = NULL;
+
+ msrs = alloc_percpu(struct msr);
+ if (!msrs) {
+ pr_warn("%s: error allocating msrs\n", __func__);
+ return NULL;
+ }
+
+ return msrs;
+}
+EXPORT_SYMBOL(msrs_alloc);
+
+void msrs_free(struct msr *msrs)
+{
+ free_percpu(msrs);
+}
+EXPORT_SYMBOL(msrs_free);
+
+/**
+ * Read an MSR with error handling
+ *
+ * @msr: MSR to read
+ * @m: value to read into
+ *
+ * It returns read data only on success, otherwise it doesn't change the output
+ * argument @m.
+ *
+ */
+int msr_read(u32 msr, struct msr *m)
+{
+ int err;
+ u64 val;
+
+ err = rdmsrl_safe(msr, &val);
+ if (!err)
+ m->q = val;
+
+ return err;
+}
+
+/**
+ * Write an MSR with error handling
+ *
+ * @msr: MSR to write
+ * @m: value to write
+ */
+int msr_write(u32 msr, struct msr *m)
+{
+ return wrmsrl_safe(msr, m->q);
+}
+
+static inline int __flip_bit(u32 msr, u8 bit, bool set)
+{
+ struct msr m, m1;
+ int err = -EINVAL;
+
+ if (bit > 63)
+ return err;
+
+ err = msr_read(msr, &m);
+ if (err)
+ return err;
+
+ m1 = m;
+ if (set)
+ m1.q |= BIT_64(bit);
+ else
+ m1.q &= ~BIT_64(bit);
+
+ if (m1.q == m.q)
+ return 0;
+
+ err = msr_write(msr, &m1);
+ if (err)
+ return err;
+
+ return 1;
+}
+
+/**
+ * Set @bit in a MSR @msr.
+ *
+ * Retval:
+ * < 0: An error was encountered.
+ * = 0: Bit was already set.
+ * > 0: Hardware accepted the MSR write.
+ */
+int msr_set_bit(u32 msr, u8 bit)
+{
+ return __flip_bit(msr, bit, true);
+}
+
+/**
+ * Clear @bit in a MSR @msr.
+ *
+ * Retval:
+ * < 0: An error was encountered.
+ * = 0: Bit was already cleared.
+ * > 0: Hardware accepted the MSR write.
+ */
+int msr_clear_bit(u32 msr, u8 bit)
+{
+ return __flip_bit(msr, bit, false);
+}
+
+#ifdef CONFIG_TRACEPOINTS
+void do_trace_write_msr(unsigned int msr, u64 val, int failed)
+{
+ trace_write_msr(msr, val, failed);
+}
+EXPORT_SYMBOL(do_trace_write_msr);
+EXPORT_TRACEPOINT_SYMBOL(write_msr);
+
+void do_trace_read_msr(unsigned int msr, u64 val, int failed)
+{
+ trace_read_msr(msr, val, failed);
+}
+EXPORT_SYMBOL(do_trace_read_msr);
+EXPORT_TRACEPOINT_SYMBOL(read_msr);
+
+void do_trace_rdpmc(unsigned counter, u64 val, int failed)
+{
+ trace_rdpmc(counter, val, failed);
+}
+EXPORT_SYMBOL(do_trace_rdpmc);
+EXPORT_TRACEPOINT_SYMBOL(rdpmc);
+
+#endif
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
new file mode 100644
index 000000000..96dce5fe2
--- /dev/null
+++ b/arch/x86/lib/putuser.S
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * __put_user functions.
+ *
+ * (C) Copyright 2005 Linus Torvalds
+ * (C) Copyright 2005 Andi Kleen
+ * (C) Copyright 2008 Glauber Costa
+ *
+ * These functions have a non-standard call interface
+ * to make them more efficient, especially as they
+ * return an error value in addition to the "real"
+ * return value.
+ */
+#include <linux/linkage.h>
+#include <asm/thread_info.h>
+#include <asm/errno.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+#include <asm/export.h>
+
+
+/*
+ * __put_user_X
+ *
+ * Inputs: %eax[:%edx] contains the data
+ * %ecx contains the address
+ *
+ * Outputs: %eax is error code (0 or -EFAULT)
+ *
+ * These functions should not modify any other registers,
+ * as they get called from within inline assembly.
+ */
+
+#define ENTER mov PER_CPU_VAR(current_task), %_ASM_BX
+#define EXIT ASM_CLAC ; \
+ ret
+
+.text
+ENTRY(__put_user_1)
+ ENTER
+ cmp TASK_addr_limit(%_ASM_BX),%_ASM_CX
+ jae bad_put_user
+ ASM_STAC
+1: movb %al,(%_ASM_CX)
+ xor %eax,%eax
+ EXIT
+ENDPROC(__put_user_1)
+EXPORT_SYMBOL(__put_user_1)
+
+ENTRY(__put_user_2)
+ ENTER
+ mov TASK_addr_limit(%_ASM_BX),%_ASM_BX
+ sub $1,%_ASM_BX
+ cmp %_ASM_BX,%_ASM_CX
+ jae bad_put_user
+ ASM_STAC
+2: movw %ax,(%_ASM_CX)
+ xor %eax,%eax
+ EXIT
+ENDPROC(__put_user_2)
+EXPORT_SYMBOL(__put_user_2)
+
+ENTRY(__put_user_4)
+ ENTER
+ mov TASK_addr_limit(%_ASM_BX),%_ASM_BX
+ sub $3,%_ASM_BX
+ cmp %_ASM_BX,%_ASM_CX
+ jae bad_put_user
+ ASM_STAC
+3: movl %eax,(%_ASM_CX)
+ xor %eax,%eax
+ EXIT
+ENDPROC(__put_user_4)
+EXPORT_SYMBOL(__put_user_4)
+
+ENTRY(__put_user_8)
+ ENTER
+ mov TASK_addr_limit(%_ASM_BX),%_ASM_BX
+ sub $7,%_ASM_BX
+ cmp %_ASM_BX,%_ASM_CX
+ jae bad_put_user
+ ASM_STAC
+4: mov %_ASM_AX,(%_ASM_CX)
+#ifdef CONFIG_X86_32
+5: movl %edx,4(%_ASM_CX)
+#endif
+ xor %eax,%eax
+ EXIT
+ENDPROC(__put_user_8)
+EXPORT_SYMBOL(__put_user_8)
+
+bad_put_user:
+ movl $-EFAULT,%eax
+ EXIT
+END(bad_put_user)
+
+ _ASM_EXTABLE(1b,bad_put_user)
+ _ASM_EXTABLE(2b,bad_put_user)
+ _ASM_EXTABLE(3b,bad_put_user)
+ _ASM_EXTABLE(4b,bad_put_user)
+#ifdef CONFIG_X86_32
+ _ASM_EXTABLE(5b,bad_put_user)
+#endif
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
new file mode 100644
index 000000000..c909961e6
--- /dev/null
+++ b/arch/x86/lib/retpoline.S
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/stringify.h>
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative-asm.h>
+#include <asm/export.h>
+#include <asm/nospec-branch.h>
+
+.macro THUNK reg
+ .section .text.__x86.indirect_thunk
+
+ENTRY(__x86_indirect_thunk_\reg)
+ CFI_STARTPROC
+ JMP_NOSPEC %\reg
+ CFI_ENDPROC
+ENDPROC(__x86_indirect_thunk_\reg)
+.endm
+
+/*
+ * Despite being an assembler file we can't just use .irp here
+ * because __KSYM_DEPS__ only uses the C preprocessor and would
+ * only see one instance of "__x86_indirect_thunk_\reg" rather
+ * than one per register with the correct names. So we do it
+ * the simple and nasty way...
+ */
+#define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym)
+#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg)
+#define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg)
+
+GENERATE_THUNK(_ASM_AX)
+GENERATE_THUNK(_ASM_BX)
+GENERATE_THUNK(_ASM_CX)
+GENERATE_THUNK(_ASM_DX)
+GENERATE_THUNK(_ASM_SI)
+GENERATE_THUNK(_ASM_DI)
+GENERATE_THUNK(_ASM_BP)
+#ifdef CONFIG_64BIT
+GENERATE_THUNK(r8)
+GENERATE_THUNK(r9)
+GENERATE_THUNK(r10)
+GENERATE_THUNK(r11)
+GENERATE_THUNK(r12)
+GENERATE_THUNK(r13)
+GENERATE_THUNK(r14)
+GENERATE_THUNK(r15)
+#endif
diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S
new file mode 100644
index 000000000..dc2ab6ea6
--- /dev/null
+++ b/arch/x86/lib/rwsem.S
@@ -0,0 +1,156 @@
+/*
+ * x86 semaphore implementation.
+ *
+ * (C) Copyright 1999 Linus Torvalds
+ *
+ * Portions Copyright 1999 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+
+#define __ASM_HALF_REG(reg) __ASM_SEL(reg, e##reg)
+#define __ASM_HALF_SIZE(inst) __ASM_SEL(inst##w, inst##l)
+
+#ifdef CONFIG_X86_32
+
+/*
+ * The semaphore operations have a special calling sequence that
+ * allow us to do a simpler in-line version of them. These routines
+ * need to convert that sequence back into the C sequence when
+ * there is contention on the semaphore.
+ *
+ * %eax contains the semaphore pointer on entry. Save the C-clobbered
+ * registers (%eax, %edx and %ecx) except %eax which is either a return
+ * value or just gets clobbered. Same is true for %edx so make sure GCC
+ * reloads it after the slow path, by making it hold a temporary, for
+ * example see ____down_write().
+ */
+
+#define save_common_regs \
+ pushl %ecx
+
+#define restore_common_regs \
+ popl %ecx
+
+ /* Avoid uglifying the argument copying x86-64 needs to do. */
+ .macro movq src, dst
+ .endm
+
+#else
+
+/*
+ * x86-64 rwsem wrappers
+ *
+ * This interfaces the inline asm code to the slow-path
+ * C routines. We need to save the call-clobbered regs
+ * that the asm does not mark as clobbered, and move the
+ * argument from %rax to %rdi.
+ *
+ * NOTE! We don't need to save %rax, because the functions
+ * will always return the semaphore pointer in %rax (which
+ * is also the input argument to these helpers)
+ *
+ * The following can clobber %rdx because the asm clobbers it:
+ * call_rwsem_down_write_failed
+ * call_rwsem_wake
+ * but %rdi, %rsi, %rcx, %r8-r11 always need saving.
+ */
+
+#define save_common_regs \
+ pushq %rdi; \
+ pushq %rsi; \
+ pushq %rcx; \
+ pushq %r8; \
+ pushq %r9; \
+ pushq %r10; \
+ pushq %r11
+
+#define restore_common_regs \
+ popq %r11; \
+ popq %r10; \
+ popq %r9; \
+ popq %r8; \
+ popq %rcx; \
+ popq %rsi; \
+ popq %rdi
+
+#endif
+
+/* Fix up special calling conventions */
+ENTRY(call_rwsem_down_read_failed)
+ FRAME_BEGIN
+ save_common_regs
+ __ASM_SIZE(push,) %__ASM_REG(dx)
+ movq %rax,%rdi
+ call rwsem_down_read_failed
+ __ASM_SIZE(pop,) %__ASM_REG(dx)
+ restore_common_regs
+ FRAME_END
+ ret
+ENDPROC(call_rwsem_down_read_failed)
+
+ENTRY(call_rwsem_down_read_failed_killable)
+ FRAME_BEGIN
+ save_common_regs
+ __ASM_SIZE(push,) %__ASM_REG(dx)
+ movq %rax,%rdi
+ call rwsem_down_read_failed_killable
+ __ASM_SIZE(pop,) %__ASM_REG(dx)
+ restore_common_regs
+ FRAME_END
+ ret
+ENDPROC(call_rwsem_down_read_failed_killable)
+
+ENTRY(call_rwsem_down_write_failed)
+ FRAME_BEGIN
+ save_common_regs
+ movq %rax,%rdi
+ call rwsem_down_write_failed
+ restore_common_regs
+ FRAME_END
+ ret
+ENDPROC(call_rwsem_down_write_failed)
+
+ENTRY(call_rwsem_down_write_failed_killable)
+ FRAME_BEGIN
+ save_common_regs
+ movq %rax,%rdi
+ call rwsem_down_write_failed_killable
+ restore_common_regs
+ FRAME_END
+ ret
+ENDPROC(call_rwsem_down_write_failed_killable)
+
+ENTRY(call_rwsem_wake)
+ FRAME_BEGIN
+ /* do nothing if still outstanding active readers */
+ __ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx)
+ jnz 1f
+ save_common_regs
+ movq %rax,%rdi
+ call rwsem_wake
+ restore_common_regs
+1: FRAME_END
+ ret
+ENDPROC(call_rwsem_wake)
+
+ENTRY(call_rwsem_downgrade_wake)
+ FRAME_BEGIN
+ save_common_regs
+ __ASM_SIZE(push,) %__ASM_REG(dx)
+ movq %rax,%rdi
+ call rwsem_downgrade_wake
+ __ASM_SIZE(pop,) %__ASM_REG(dx)
+ restore_common_regs
+ FRAME_END
+ ret
+ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
new file mode 100644
index 000000000..d15fdae96
--- /dev/null
+++ b/arch/x86/lib/string_32.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Most of the string-functions are rather heavily hand-optimized,
+ * see especially strsep,strstr,str[c]spn. They should work, but are not
+ * very easy to understand. Everything is done entirely within the register
+ * set, making the functions fast and clean. String instructions have been
+ * used through-out, making for "slightly" unclear code :-)
+ *
+ * AK: On P4 and K7 using non string instruction implementations might be faster
+ * for large memory blocks. But most of them are unlikely to be used on large
+ * strings.
+ */
+
+#include <linux/string.h>
+#include <linux/export.h>
+
+#ifdef __HAVE_ARCH_STRCPY
+char *strcpy(char *dest, const char *src)
+{
+ int d0, d1, d2;
+ asm volatile("1:\tlodsb\n\t"
+ "stosb\n\t"
+ "testb %%al,%%al\n\t"
+ "jne 1b"
+ : "=&S" (d0), "=&D" (d1), "=&a" (d2)
+ : "0" (src), "1" (dest) : "memory");
+ return dest;
+}
+EXPORT_SYMBOL(strcpy);
+#endif
+
+#ifdef __HAVE_ARCH_STRNCPY
+char *strncpy(char *dest, const char *src, size_t count)
+{
+ int d0, d1, d2, d3;
+ asm volatile("1:\tdecl %2\n\t"
+ "js 2f\n\t"
+ "lodsb\n\t"
+ "stosb\n\t"
+ "testb %%al,%%al\n\t"
+ "jne 1b\n\t"
+ "rep\n\t"
+ "stosb\n"
+ "2:"
+ : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
+ : "0" (src), "1" (dest), "2" (count) : "memory");
+ return dest;
+}
+EXPORT_SYMBOL(strncpy);
+#endif
+
+#ifdef __HAVE_ARCH_STRCAT
+char *strcat(char *dest, const char *src)
+{
+ int d0, d1, d2, d3;
+ asm volatile("repne\n\t"
+ "scasb\n\t"
+ "decl %1\n"
+ "1:\tlodsb\n\t"
+ "stosb\n\t"
+ "testb %%al,%%al\n\t"
+ "jne 1b"
+ : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
+ : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu) : "memory");
+ return dest;
+}
+EXPORT_SYMBOL(strcat);
+#endif
+
+#ifdef __HAVE_ARCH_STRNCAT
+char *strncat(char *dest, const char *src, size_t count)
+{
+ int d0, d1, d2, d3;
+ asm volatile("repne\n\t"
+ "scasb\n\t"
+ "decl %1\n\t"
+ "movl %8,%3\n"
+ "1:\tdecl %3\n\t"
+ "js 2f\n\t"
+ "lodsb\n\t"
+ "stosb\n\t"
+ "testb %%al,%%al\n\t"
+ "jne 1b\n"
+ "2:\txorl %2,%2\n\t"
+ "stosb"
+ : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
+ : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu), "g" (count)
+ : "memory");
+ return dest;
+}
+EXPORT_SYMBOL(strncat);
+#endif
+
+#ifdef __HAVE_ARCH_STRCMP
+int strcmp(const char *cs, const char *ct)
+{
+ int d0, d1;
+ int res;
+ asm volatile("1:\tlodsb\n\t"
+ "scasb\n\t"
+ "jne 2f\n\t"
+ "testb %%al,%%al\n\t"
+ "jne 1b\n\t"
+ "xorl %%eax,%%eax\n\t"
+ "jmp 3f\n"
+ "2:\tsbbl %%eax,%%eax\n\t"
+ "orb $1,%%al\n"
+ "3:"
+ : "=a" (res), "=&S" (d0), "=&D" (d1)
+ : "1" (cs), "2" (ct)
+ : "memory");
+ return res;
+}
+EXPORT_SYMBOL(strcmp);
+#endif
+
+#ifdef __HAVE_ARCH_STRNCMP
+int strncmp(const char *cs, const char *ct, size_t count)
+{
+ int res;
+ int d0, d1, d2;
+ asm volatile("1:\tdecl %3\n\t"
+ "js 2f\n\t"
+ "lodsb\n\t"
+ "scasb\n\t"
+ "jne 3f\n\t"
+ "testb %%al,%%al\n\t"
+ "jne 1b\n"
+ "2:\txorl %%eax,%%eax\n\t"
+ "jmp 4f\n"
+ "3:\tsbbl %%eax,%%eax\n\t"
+ "orb $1,%%al\n"
+ "4:"
+ : "=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
+ : "1" (cs), "2" (ct), "3" (count)
+ : "memory");
+ return res;
+}
+EXPORT_SYMBOL(strncmp);
+#endif
+
+#ifdef __HAVE_ARCH_STRCHR
+char *strchr(const char *s, int c)
+{
+ int d0;
+ char *res;
+ asm volatile("movb %%al,%%ah\n"
+ "1:\tlodsb\n\t"
+ "cmpb %%ah,%%al\n\t"
+ "je 2f\n\t"
+ "testb %%al,%%al\n\t"
+ "jne 1b\n\t"
+ "movl $1,%1\n"
+ "2:\tmovl %1,%0\n\t"
+ "decl %0"
+ : "=a" (res), "=&S" (d0)
+ : "1" (s), "0" (c)
+ : "memory");
+ return res;
+}
+EXPORT_SYMBOL(strchr);
+#endif
+
+#ifdef __HAVE_ARCH_STRLEN
+size_t strlen(const char *s)
+{
+ int d0;
+ size_t res;
+ asm volatile("repne\n\t"
+ "scasb"
+ : "=c" (res), "=&D" (d0)
+ : "1" (s), "a" (0), "0" (0xffffffffu)
+ : "memory");
+ return ~res - 1;
+}
+EXPORT_SYMBOL(strlen);
+#endif
+
+#ifdef __HAVE_ARCH_MEMCHR
+void *memchr(const void *cs, int c, size_t count)
+{
+ int d0;
+ void *res;
+ if (!count)
+ return NULL;
+ asm volatile("repne\n\t"
+ "scasb\n\t"
+ "je 1f\n\t"
+ "movl $1,%0\n"
+ "1:\tdecl %0"
+ : "=D" (res), "=&c" (d0)
+ : "a" (c), "0" (cs), "1" (count)
+ : "memory");
+ return res;
+}
+EXPORT_SYMBOL(memchr);
+#endif
+
+#ifdef __HAVE_ARCH_MEMSCAN
+void *memscan(void *addr, int c, size_t size)
+{
+ if (!size)
+ return addr;
+ asm volatile("repnz; scasb\n\t"
+ "jnz 1f\n\t"
+ "dec %%edi\n"
+ "1:"
+ : "=D" (addr), "=c" (size)
+ : "0" (addr), "1" (size), "a" (c)
+ : "memory");
+ return addr;
+}
+EXPORT_SYMBOL(memscan);
+#endif
+
+#ifdef __HAVE_ARCH_STRNLEN
+size_t strnlen(const char *s, size_t count)
+{
+ int d0;
+ int res;
+ asm volatile("movl %2,%0\n\t"
+ "jmp 2f\n"
+ "1:\tcmpb $0,(%0)\n\t"
+ "je 3f\n\t"
+ "incl %0\n"
+ "2:\tdecl %1\n\t"
+ "cmpl $-1,%1\n\t"
+ "jne 1b\n"
+ "3:\tsubl %2,%0"
+ : "=a" (res), "=&d" (d0)
+ : "c" (s), "1" (count)
+ : "memory");
+ return res;
+}
+EXPORT_SYMBOL(strnlen);
+#endif
diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c
new file mode 100644
index 000000000..38f37df05
--- /dev/null
+++ b/arch/x86/lib/strstr_32.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/string.h>
+#include <linux/export.h>
+
+char *strstr(const char *cs, const char *ct)
+{
+int d0, d1;
+register char *__res;
+__asm__ __volatile__(
+ "movl %6,%%edi\n\t"
+ "repne\n\t"
+ "scasb\n\t"
+ "notl %%ecx\n\t"
+ "decl %%ecx\n\t" /* NOTE! This also sets Z if searchstring='' */
+ "movl %%ecx,%%edx\n"
+ "1:\tmovl %6,%%edi\n\t"
+ "movl %%esi,%%eax\n\t"
+ "movl %%edx,%%ecx\n\t"
+ "repe\n\t"
+ "cmpsb\n\t"
+ "je 2f\n\t" /* also works for empty string, see above */
+ "xchgl %%eax,%%esi\n\t"
+ "incl %%esi\n\t"
+ "cmpb $0,-1(%%eax)\n\t"
+ "jne 1b\n\t"
+ "xorl %%eax,%%eax\n\t"
+ "2:"
+ : "=a" (__res), "=&c" (d0), "=&S" (d1)
+ : "0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
+ : "dx", "di");
+return __res;
+}
+EXPORT_SYMBOL(strstr);
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
new file mode 100644
index 000000000..3f435d7fc
--- /dev/null
+++ b/arch/x86/lib/usercopy.c
@@ -0,0 +1,38 @@
+/*
+ * User address space access functions.
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/uaccess.h>
+#include <linux/export.h>
+
+#include <asm/tlbflush.h>
+
+/*
+ * We rely on the nested NMI work to allow atomic faults from the NMI path; the
+ * nested NMI paths are careful to preserve CR2.
+ */
+unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
+{
+ unsigned long ret;
+
+ if (__range_not_ok(from, n, TASK_SIZE))
+ return n;
+
+ if (!nmi_uaccess_okay())
+ return n;
+
+ /*
+ * Even though this function is typically called from NMI/IRQ context
+ * disable pagefaults so that its behaviour is consistent even when
+ * called form other contexts.
+ */
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(to, from, n);
+ pagefault_enable();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(copy_from_user_nmi);
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
new file mode 100644
index 000000000..7add8ba06
--- /dev/null
+++ b/arch/x86/lib/usercopy_32.c
@@ -0,0 +1,359 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * User address space access functions.
+ * The non inlined parts of asm-i386/uaccess.h are here.
+ *
+ * Copyright 1997 Andi Kleen <ak@muc.de>
+ * Copyright 1997 Linus Torvalds
+ */
+#include <linux/export.h>
+#include <linux/uaccess.h>
+#include <asm/mmx.h>
+#include <asm/asm.h>
+
+#ifdef CONFIG_X86_INTEL_USERCOPY
+/*
+ * Alignment at which movsl is preferred for bulk memory copies.
+ */
+struct movsl_mask movsl_mask __read_mostly;
+#endif
+
+static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n)
+{
+#ifdef CONFIG_X86_INTEL_USERCOPY
+ if (n >= 64 && ((a1 ^ a2) & movsl_mask.mask))
+ return 0;
+#endif
+ return 1;
+}
+#define movsl_is_ok(a1, a2, n) \
+ __movsl_is_ok((unsigned long)(a1), (unsigned long)(a2), (n))
+
+/*
+ * Zero Userspace
+ */
+
+#define __do_clear_user(addr,size) \
+do { \
+ int __d0; \
+ might_fault(); \
+ __asm__ __volatile__( \
+ ASM_STAC "\n" \
+ "0: rep; stosl\n" \
+ " movl %2,%0\n" \
+ "1: rep; stosb\n" \
+ "2: " ASM_CLAC "\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: lea 0(%2,%0,4),%0\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(0b,3b) \
+ _ASM_EXTABLE(1b,2b) \
+ : "=&c"(size), "=&D" (__d0) \
+ : "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \
+} while (0)
+
+/**
+ * clear_user: - Zero a block of memory in user space.
+ * @to: Destination address, in user space.
+ * @n: Number of bytes to zero.
+ *
+ * Zero a block of memory in user space.
+ *
+ * Returns number of bytes that could not be cleared.
+ * On success, this will be zero.
+ */
+unsigned long
+clear_user(void __user *to, unsigned long n)
+{
+ might_fault();
+ if (access_ok(VERIFY_WRITE, to, n))
+ __do_clear_user(to, n);
+ return n;
+}
+EXPORT_SYMBOL(clear_user);
+
+/**
+ * __clear_user: - Zero a block of memory in user space, with less checking.
+ * @to: Destination address, in user space.
+ * @n: Number of bytes to zero.
+ *
+ * Zero a block of memory in user space. Caller must check
+ * the specified block with access_ok() before calling this function.
+ *
+ * Returns number of bytes that could not be cleared.
+ * On success, this will be zero.
+ */
+unsigned long
+__clear_user(void __user *to, unsigned long n)
+{
+ __do_clear_user(to, n);
+ return n;
+}
+EXPORT_SYMBOL(__clear_user);
+
+#ifdef CONFIG_X86_INTEL_USERCOPY
+static unsigned long
+__copy_user_intel(void __user *to, const void *from, unsigned long size)
+{
+ int d0, d1;
+ __asm__ __volatile__(
+ " .align 2,0x90\n"
+ "1: movl 32(%4), %%eax\n"
+ " cmpl $67, %0\n"
+ " jbe 3f\n"
+ "2: movl 64(%4), %%eax\n"
+ " .align 2,0x90\n"
+ "3: movl 0(%4), %%eax\n"
+ "4: movl 4(%4), %%edx\n"
+ "5: movl %%eax, 0(%3)\n"
+ "6: movl %%edx, 4(%3)\n"
+ "7: movl 8(%4), %%eax\n"
+ "8: movl 12(%4),%%edx\n"
+ "9: movl %%eax, 8(%3)\n"
+ "10: movl %%edx, 12(%3)\n"
+ "11: movl 16(%4), %%eax\n"
+ "12: movl 20(%4), %%edx\n"
+ "13: movl %%eax, 16(%3)\n"
+ "14: movl %%edx, 20(%3)\n"
+ "15: movl 24(%4), %%eax\n"
+ "16: movl 28(%4), %%edx\n"
+ "17: movl %%eax, 24(%3)\n"
+ "18: movl %%edx, 28(%3)\n"
+ "19: movl 32(%4), %%eax\n"
+ "20: movl 36(%4), %%edx\n"
+ "21: movl %%eax, 32(%3)\n"
+ "22: movl %%edx, 36(%3)\n"
+ "23: movl 40(%4), %%eax\n"
+ "24: movl 44(%4), %%edx\n"
+ "25: movl %%eax, 40(%3)\n"
+ "26: movl %%edx, 44(%3)\n"
+ "27: movl 48(%4), %%eax\n"
+ "28: movl 52(%4), %%edx\n"
+ "29: movl %%eax, 48(%3)\n"
+ "30: movl %%edx, 52(%3)\n"
+ "31: movl 56(%4), %%eax\n"
+ "32: movl 60(%4), %%edx\n"
+ "33: movl %%eax, 56(%3)\n"
+ "34: movl %%edx, 60(%3)\n"
+ " addl $-64, %0\n"
+ " addl $64, %4\n"
+ " addl $64, %3\n"
+ " cmpl $63, %0\n"
+ " ja 1b\n"
+ "35: movl %0, %%eax\n"
+ " shrl $2, %0\n"
+ " andl $3, %%eax\n"
+ " cld\n"
+ "99: rep; movsl\n"
+ "36: movl %%eax, %0\n"
+ "37: rep; movsb\n"
+ "100:\n"
+ ".section .fixup,\"ax\"\n"
+ "101: lea 0(%%eax,%0,4),%0\n"
+ " jmp 100b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b,100b)
+ _ASM_EXTABLE(2b,100b)
+ _ASM_EXTABLE(3b,100b)
+ _ASM_EXTABLE(4b,100b)
+ _ASM_EXTABLE(5b,100b)
+ _ASM_EXTABLE(6b,100b)
+ _ASM_EXTABLE(7b,100b)
+ _ASM_EXTABLE(8b,100b)
+ _ASM_EXTABLE(9b,100b)
+ _ASM_EXTABLE(10b,100b)
+ _ASM_EXTABLE(11b,100b)
+ _ASM_EXTABLE(12b,100b)
+ _ASM_EXTABLE(13b,100b)
+ _ASM_EXTABLE(14b,100b)
+ _ASM_EXTABLE(15b,100b)
+ _ASM_EXTABLE(16b,100b)
+ _ASM_EXTABLE(17b,100b)
+ _ASM_EXTABLE(18b,100b)
+ _ASM_EXTABLE(19b,100b)
+ _ASM_EXTABLE(20b,100b)
+ _ASM_EXTABLE(21b,100b)
+ _ASM_EXTABLE(22b,100b)
+ _ASM_EXTABLE(23b,100b)
+ _ASM_EXTABLE(24b,100b)
+ _ASM_EXTABLE(25b,100b)
+ _ASM_EXTABLE(26b,100b)
+ _ASM_EXTABLE(27b,100b)
+ _ASM_EXTABLE(28b,100b)
+ _ASM_EXTABLE(29b,100b)
+ _ASM_EXTABLE(30b,100b)
+ _ASM_EXTABLE(31b,100b)
+ _ASM_EXTABLE(32b,100b)
+ _ASM_EXTABLE(33b,100b)
+ _ASM_EXTABLE(34b,100b)
+ _ASM_EXTABLE(35b,100b)
+ _ASM_EXTABLE(36b,100b)
+ _ASM_EXTABLE(37b,100b)
+ _ASM_EXTABLE(99b,101b)
+ : "=&c"(size), "=&D" (d0), "=&S" (d1)
+ : "1"(to), "2"(from), "0"(size)
+ : "eax", "edx", "memory");
+ return size;
+}
+
+static unsigned long __copy_user_intel_nocache(void *to,
+ const void __user *from, unsigned long size)
+{
+ int d0, d1;
+
+ __asm__ __volatile__(
+ " .align 2,0x90\n"
+ "0: movl 32(%4), %%eax\n"
+ " cmpl $67, %0\n"
+ " jbe 2f\n"
+ "1: movl 64(%4), %%eax\n"
+ " .align 2,0x90\n"
+ "2: movl 0(%4), %%eax\n"
+ "21: movl 4(%4), %%edx\n"
+ " movnti %%eax, 0(%3)\n"
+ " movnti %%edx, 4(%3)\n"
+ "3: movl 8(%4), %%eax\n"
+ "31: movl 12(%4),%%edx\n"
+ " movnti %%eax, 8(%3)\n"
+ " movnti %%edx, 12(%3)\n"
+ "4: movl 16(%4), %%eax\n"
+ "41: movl 20(%4), %%edx\n"
+ " movnti %%eax, 16(%3)\n"
+ " movnti %%edx, 20(%3)\n"
+ "10: movl 24(%4), %%eax\n"
+ "51: movl 28(%4), %%edx\n"
+ " movnti %%eax, 24(%3)\n"
+ " movnti %%edx, 28(%3)\n"
+ "11: movl 32(%4), %%eax\n"
+ "61: movl 36(%4), %%edx\n"
+ " movnti %%eax, 32(%3)\n"
+ " movnti %%edx, 36(%3)\n"
+ "12: movl 40(%4), %%eax\n"
+ "71: movl 44(%4), %%edx\n"
+ " movnti %%eax, 40(%3)\n"
+ " movnti %%edx, 44(%3)\n"
+ "13: movl 48(%4), %%eax\n"
+ "81: movl 52(%4), %%edx\n"
+ " movnti %%eax, 48(%3)\n"
+ " movnti %%edx, 52(%3)\n"
+ "14: movl 56(%4), %%eax\n"
+ "91: movl 60(%4), %%edx\n"
+ " movnti %%eax, 56(%3)\n"
+ " movnti %%edx, 60(%3)\n"
+ " addl $-64, %0\n"
+ " addl $64, %4\n"
+ " addl $64, %3\n"
+ " cmpl $63, %0\n"
+ " ja 0b\n"
+ " sfence \n"
+ "5: movl %0, %%eax\n"
+ " shrl $2, %0\n"
+ " andl $3, %%eax\n"
+ " cld\n"
+ "6: rep; movsl\n"
+ " movl %%eax,%0\n"
+ "7: rep; movsb\n"
+ "8:\n"
+ ".section .fixup,\"ax\"\n"
+ "9: lea 0(%%eax,%0,4),%0\n"
+ "16: jmp 8b\n"
+ ".previous\n"
+ _ASM_EXTABLE(0b,16b)
+ _ASM_EXTABLE(1b,16b)
+ _ASM_EXTABLE(2b,16b)
+ _ASM_EXTABLE(21b,16b)
+ _ASM_EXTABLE(3b,16b)
+ _ASM_EXTABLE(31b,16b)
+ _ASM_EXTABLE(4b,16b)
+ _ASM_EXTABLE(41b,16b)
+ _ASM_EXTABLE(10b,16b)
+ _ASM_EXTABLE(51b,16b)
+ _ASM_EXTABLE(11b,16b)
+ _ASM_EXTABLE(61b,16b)
+ _ASM_EXTABLE(12b,16b)
+ _ASM_EXTABLE(71b,16b)
+ _ASM_EXTABLE(13b,16b)
+ _ASM_EXTABLE(81b,16b)
+ _ASM_EXTABLE(14b,16b)
+ _ASM_EXTABLE(91b,16b)
+ _ASM_EXTABLE(6b,9b)
+ _ASM_EXTABLE(7b,16b)
+ : "=&c"(size), "=&D" (d0), "=&S" (d1)
+ : "1"(to), "2"(from), "0"(size)
+ : "eax", "edx", "memory");
+ return size;
+}
+
+#else
+
+/*
+ * Leave these declared but undefined. They should not be any references to
+ * them
+ */
+unsigned long __copy_user_intel(void __user *to, const void *from,
+ unsigned long size);
+#endif /* CONFIG_X86_INTEL_USERCOPY */
+
+/* Generic arbitrary sized copy. */
+#define __copy_user(to, from, size) \
+do { \
+ int __d0, __d1, __d2; \
+ __asm__ __volatile__( \
+ " cmp $7,%0\n" \
+ " jbe 1f\n" \
+ " movl %1,%0\n" \
+ " negl %0\n" \
+ " andl $7,%0\n" \
+ " subl %0,%3\n" \
+ "4: rep; movsb\n" \
+ " movl %3,%0\n" \
+ " shrl $2,%0\n" \
+ " andl $3,%3\n" \
+ " .align 2,0x90\n" \
+ "0: rep; movsl\n" \
+ " movl %3,%0\n" \
+ "1: rep; movsb\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "5: addl %3,%0\n" \
+ " jmp 2b\n" \
+ "3: lea 0(%3,%0,4),%0\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(4b,5b) \
+ _ASM_EXTABLE(0b,3b) \
+ _ASM_EXTABLE(1b,2b) \
+ : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
+ : "3"(size), "0"(size), "1"(to), "2"(from) \
+ : "memory"); \
+} while (0)
+
+unsigned long __copy_user_ll(void *to, const void *from, unsigned long n)
+{
+ __uaccess_begin_nospec();
+ if (movsl_is_ok(to, from, n))
+ __copy_user(to, from, n);
+ else
+ n = __copy_user_intel(to, from, n);
+ __uaccess_end();
+ return n;
+}
+EXPORT_SYMBOL(__copy_user_ll);
+
+unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
+ unsigned long n)
+{
+ __uaccess_begin_nospec();
+#ifdef CONFIG_X86_INTEL_USERCOPY
+ if (n > 64 && static_cpu_has(X86_FEATURE_XMM2))
+ n = __copy_user_intel_nocache(to, from, n);
+ else
+ __copy_user(to, from, n);
+#else
+ __copy_user(to, from, n);
+#endif
+ __uaccess_end();
+ return n;
+}
+EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
new file mode 100644
index 000000000..8c6d0fb72
--- /dev/null
+++ b/arch/x86/lib/usercopy_64.c
@@ -0,0 +1,230 @@
+/*
+ * User address space access functions.
+ *
+ * Copyright 1997 Andi Kleen <ak@muc.de>
+ * Copyright 1997 Linus Torvalds
+ * Copyright 2002 Andi Kleen <ak@suse.de>
+ */
+#include <linux/export.h>
+#include <linux/uaccess.h>
+#include <linux/highmem.h>
+
+/*
+ * Zero Userspace
+ */
+
+unsigned long __clear_user(void __user *addr, unsigned long size)
+{
+ long __d0;
+ might_fault();
+ /* no memory constraint because it doesn't change any memory gcc knows
+ about */
+ stac();
+ asm volatile(
+ " testq %[size8],%[size8]\n"
+ " jz 4f\n"
+ " .align 16\n"
+ "0: movq $0,(%[dst])\n"
+ " addq $8,%[dst]\n"
+ " decl %%ecx ; jnz 0b\n"
+ "4: movq %[size1],%%rcx\n"
+ " testl %%ecx,%%ecx\n"
+ " jz 2f\n"
+ "1: movb $0,(%[dst])\n"
+ " incq %[dst]\n"
+ " decl %%ecx ; jnz 1b\n"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: lea 0(%[size1],%[size8],8),%[size8]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(0b,3b)
+ _ASM_EXTABLE(1b,2b)
+ : [size8] "=&c"(size), [dst] "=&D" (__d0)
+ : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
+ clac();
+ return size;
+}
+EXPORT_SYMBOL(__clear_user);
+
+unsigned long clear_user(void __user *to, unsigned long n)
+{
+ if (access_ok(VERIFY_WRITE, to, n))
+ return __clear_user(to, n);
+ return n;
+}
+EXPORT_SYMBOL(clear_user);
+
+/*
+ * Try to copy last bytes and clear the rest if needed.
+ * Since protection fault in copy_from/to_user is not a normal situation,
+ * it is not necessary to optimize tail handling.
+ */
+__visible unsigned long
+copy_user_handle_tail(char *to, char *from, unsigned len)
+{
+ for (; len; --len, to++) {
+ char c;
+
+ if (__get_user_nocheck(c, from++, sizeof(char)))
+ break;
+ if (__put_user_nocheck(c, to, sizeof(char)))
+ break;
+ }
+ clac();
+ return len;
+}
+
+/*
+ * Similar to copy_user_handle_tail, probe for the write fault point,
+ * but reuse __memcpy_mcsafe in case a new read error is encountered.
+ * clac() is handled in _copy_to_iter_mcsafe().
+ */
+__visible unsigned long
+mcsafe_handle_tail(char *to, char *from, unsigned len)
+{
+ for (; len; --len, to++, from++) {
+ /*
+ * Call the assembly routine back directly since
+ * memcpy_mcsafe() may silently fallback to memcpy.
+ */
+ unsigned long rem = __memcpy_mcsafe(to, from, 1);
+
+ if (rem)
+ break;
+ }
+ return len;
+}
+
+#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
+/**
+ * clean_cache_range - write back a cache range with CLWB
+ * @vaddr: virtual start address
+ * @size: number of bytes to write back
+ *
+ * Write back a cache range using the CLWB (cache line write back)
+ * instruction. Note that @size is internally rounded up to be cache
+ * line size aligned.
+ */
+static void clean_cache_range(void *addr, size_t size)
+{
+ u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
+ unsigned long clflush_mask = x86_clflush_size - 1;
+ void *vend = addr + size;
+ void *p;
+
+ for (p = (void *)((unsigned long)addr & ~clflush_mask);
+ p < vend; p += x86_clflush_size)
+ clwb(p);
+}
+
+void arch_wb_cache_pmem(void *addr, size_t size)
+{
+ clean_cache_range(addr, size);
+}
+EXPORT_SYMBOL_GPL(arch_wb_cache_pmem);
+
+long __copy_user_flushcache(void *dst, const void __user *src, unsigned size)
+{
+ unsigned long flushed, dest = (unsigned long) dst;
+ long rc = __copy_user_nocache(dst, src, size, 0);
+
+ /*
+ * __copy_user_nocache() uses non-temporal stores for the bulk
+ * of the transfer, but we need to manually flush if the
+ * transfer is unaligned. A cached memory copy is used when
+ * destination or size is not naturally aligned. That is:
+ * - Require 8-byte alignment when size is 8 bytes or larger.
+ * - Require 4-byte alignment when size is 4 bytes.
+ */
+ if (size < 8) {
+ if (!IS_ALIGNED(dest, 4) || size != 4)
+ clean_cache_range(dst, size);
+ } else {
+ if (!IS_ALIGNED(dest, 8)) {
+ dest = ALIGN(dest, boot_cpu_data.x86_clflush_size);
+ clean_cache_range(dst, 1);
+ }
+
+ flushed = dest - (unsigned long) dst;
+ if (size > flushed && !IS_ALIGNED(size - flushed, 8))
+ clean_cache_range(dst + size - 1, 1);
+ }
+
+ return rc;
+}
+
+void memcpy_flushcache(void *_dst, const void *_src, size_t size)
+{
+ unsigned long dest = (unsigned long) _dst;
+ unsigned long source = (unsigned long) _src;
+
+ /* cache copy and flush to align dest */
+ if (!IS_ALIGNED(dest, 8)) {
+ size_t len = min_t(size_t, size, ALIGN(dest, 8) - dest);
+
+ memcpy((void *) dest, (void *) source, len);
+ clean_cache_range((void *) dest, len);
+ dest += len;
+ source += len;
+ size -= len;
+ if (!size)
+ return;
+ }
+
+ /* 4x8 movnti loop */
+ while (size >= 32) {
+ asm("movq (%0), %%r8\n"
+ "movq 8(%0), %%r9\n"
+ "movq 16(%0), %%r10\n"
+ "movq 24(%0), %%r11\n"
+ "movnti %%r8, (%1)\n"
+ "movnti %%r9, 8(%1)\n"
+ "movnti %%r10, 16(%1)\n"
+ "movnti %%r11, 24(%1)\n"
+ :: "r" (source), "r" (dest)
+ : "memory", "r8", "r9", "r10", "r11");
+ dest += 32;
+ source += 32;
+ size -= 32;
+ }
+
+ /* 1x8 movnti loop */
+ while (size >= 8) {
+ asm("movq (%0), %%r8\n"
+ "movnti %%r8, (%1)\n"
+ :: "r" (source), "r" (dest)
+ : "memory", "r8");
+ dest += 8;
+ source += 8;
+ size -= 8;
+ }
+
+ /* 1x4 movnti loop */
+ while (size >= 4) {
+ asm("movl (%0), %%r8d\n"
+ "movnti %%r8d, (%1)\n"
+ :: "r" (source), "r" (dest)
+ : "memory", "r8");
+ dest += 4;
+ source += 4;
+ size -= 4;
+ }
+
+ /* cache copy for remaining bytes */
+ if (size) {
+ memcpy((void *) dest, (void *) source, size);
+ clean_cache_range((void *) dest, size);
+ }
+}
+EXPORT_SYMBOL_GPL(memcpy_flushcache);
+
+void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
+ size_t len)
+{
+ char *from = kmap_atomic(page);
+
+ memcpy_flushcache(to, from + offset, len);
+ kunmap_atomic(from);
+}
+#endif
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
new file mode 100644
index 000000000..5cb9f009f
--- /dev/null
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -0,0 +1,1078 @@
+# x86 Opcode Maps
+#
+# This is (mostly) based on following documentations.
+# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2C
+# (#326018-047US, June 2013)
+#
+#<Opcode maps>
+# Table: table-name
+# Referrer: escaped-name
+# AVXcode: avx-code
+# opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...]
+# (or)
+# opcode: escape # escaped-name
+# EndTable
+#
+# mnemonics that begin with lowercase 'v' accept a VEX or EVEX prefix
+# mnemonics that begin with lowercase 'k' accept a VEX prefix
+#
+#<group maps>
+# GrpTable: GrpXXX
+# reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...]
+# EndTable
+#
+# AVX Superscripts
+# (ev): this opcode requires EVEX prefix.
+# (evo): this opcode is changed by EVEX prefix (EVEX opcode)
+# (v): this opcode requires VEX prefix.
+# (v1): this opcode only supports 128bit VEX.
+#
+# Last Prefix Superscripts
+# - (66): the last prefix is 0x66
+# - (F3): the last prefix is 0xF3
+# - (F2): the last prefix is 0xF2
+# - (!F3) : the last prefix is not 0xF3 (including non-last prefix case)
+# - (66&F2): Both 0x66 and 0xF2 prefixes are specified.
+
+Table: one byte opcode
+Referrer:
+AVXcode:
+# 0x00 - 0x0f
+00: ADD Eb,Gb
+01: ADD Ev,Gv
+02: ADD Gb,Eb
+03: ADD Gv,Ev
+04: ADD AL,Ib
+05: ADD rAX,Iz
+06: PUSH ES (i64)
+07: POP ES (i64)
+08: OR Eb,Gb
+09: OR Ev,Gv
+0a: OR Gb,Eb
+0b: OR Gv,Ev
+0c: OR AL,Ib
+0d: OR rAX,Iz
+0e: PUSH CS (i64)
+0f: escape # 2-byte escape
+# 0x10 - 0x1f
+10: ADC Eb,Gb
+11: ADC Ev,Gv
+12: ADC Gb,Eb
+13: ADC Gv,Ev
+14: ADC AL,Ib
+15: ADC rAX,Iz
+16: PUSH SS (i64)
+17: POP SS (i64)
+18: SBB Eb,Gb
+19: SBB Ev,Gv
+1a: SBB Gb,Eb
+1b: SBB Gv,Ev
+1c: SBB AL,Ib
+1d: SBB rAX,Iz
+1e: PUSH DS (i64)
+1f: POP DS (i64)
+# 0x20 - 0x2f
+20: AND Eb,Gb
+21: AND Ev,Gv
+22: AND Gb,Eb
+23: AND Gv,Ev
+24: AND AL,Ib
+25: AND rAx,Iz
+26: SEG=ES (Prefix)
+27: DAA (i64)
+28: SUB Eb,Gb
+29: SUB Ev,Gv
+2a: SUB Gb,Eb
+2b: SUB Gv,Ev
+2c: SUB AL,Ib
+2d: SUB rAX,Iz
+2e: SEG=CS (Prefix)
+2f: DAS (i64)
+# 0x30 - 0x3f
+30: XOR Eb,Gb
+31: XOR Ev,Gv
+32: XOR Gb,Eb
+33: XOR Gv,Ev
+34: XOR AL,Ib
+35: XOR rAX,Iz
+36: SEG=SS (Prefix)
+37: AAA (i64)
+38: CMP Eb,Gb
+39: CMP Ev,Gv
+3a: CMP Gb,Eb
+3b: CMP Gv,Ev
+3c: CMP AL,Ib
+3d: CMP rAX,Iz
+3e: SEG=DS (Prefix)
+3f: AAS (i64)
+# 0x40 - 0x4f
+40: INC eAX (i64) | REX (o64)
+41: INC eCX (i64) | REX.B (o64)
+42: INC eDX (i64) | REX.X (o64)
+43: INC eBX (i64) | REX.XB (o64)
+44: INC eSP (i64) | REX.R (o64)
+45: INC eBP (i64) | REX.RB (o64)
+46: INC eSI (i64) | REX.RX (o64)
+47: INC eDI (i64) | REX.RXB (o64)
+48: DEC eAX (i64) | REX.W (o64)
+49: DEC eCX (i64) | REX.WB (o64)
+4a: DEC eDX (i64) | REX.WX (o64)
+4b: DEC eBX (i64) | REX.WXB (o64)
+4c: DEC eSP (i64) | REX.WR (o64)
+4d: DEC eBP (i64) | REX.WRB (o64)
+4e: DEC eSI (i64) | REX.WRX (o64)
+4f: DEC eDI (i64) | REX.WRXB (o64)
+# 0x50 - 0x5f
+50: PUSH rAX/r8 (d64)
+51: PUSH rCX/r9 (d64)
+52: PUSH rDX/r10 (d64)
+53: PUSH rBX/r11 (d64)
+54: PUSH rSP/r12 (d64)
+55: PUSH rBP/r13 (d64)
+56: PUSH rSI/r14 (d64)
+57: PUSH rDI/r15 (d64)
+58: POP rAX/r8 (d64)
+59: POP rCX/r9 (d64)
+5a: POP rDX/r10 (d64)
+5b: POP rBX/r11 (d64)
+5c: POP rSP/r12 (d64)
+5d: POP rBP/r13 (d64)
+5e: POP rSI/r14 (d64)
+5f: POP rDI/r15 (d64)
+# 0x60 - 0x6f
+60: PUSHA/PUSHAD (i64)
+61: POPA/POPAD (i64)
+62: BOUND Gv,Ma (i64) | EVEX (Prefix)
+63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64)
+64: SEG=FS (Prefix)
+65: SEG=GS (Prefix)
+66: Operand-Size (Prefix)
+67: Address-Size (Prefix)
+68: PUSH Iz (d64)
+69: IMUL Gv,Ev,Iz
+6a: PUSH Ib (d64)
+6b: IMUL Gv,Ev,Ib
+6c: INS/INSB Yb,DX
+6d: INS/INSW/INSD Yz,DX
+6e: OUTS/OUTSB DX,Xb
+6f: OUTS/OUTSW/OUTSD DX,Xz
+# 0x70 - 0x7f
+70: JO Jb
+71: JNO Jb
+72: JB/JNAE/JC Jb
+73: JNB/JAE/JNC Jb
+74: JZ/JE Jb
+75: JNZ/JNE Jb
+76: JBE/JNA Jb
+77: JNBE/JA Jb
+78: JS Jb
+79: JNS Jb
+7a: JP/JPE Jb
+7b: JNP/JPO Jb
+7c: JL/JNGE Jb
+7d: JNL/JGE Jb
+7e: JLE/JNG Jb
+7f: JNLE/JG Jb
+# 0x80 - 0x8f
+80: Grp1 Eb,Ib (1A)
+81: Grp1 Ev,Iz (1A)
+82: Grp1 Eb,Ib (1A),(i64)
+83: Grp1 Ev,Ib (1A)
+84: TEST Eb,Gb
+85: TEST Ev,Gv
+86: XCHG Eb,Gb
+87: XCHG Ev,Gv
+88: MOV Eb,Gb
+89: MOV Ev,Gv
+8a: MOV Gb,Eb
+8b: MOV Gv,Ev
+8c: MOV Ev,Sw
+8d: LEA Gv,M
+8e: MOV Sw,Ew
+8f: Grp1A (1A) | POP Ev (d64)
+# 0x90 - 0x9f
+90: NOP | PAUSE (F3) | XCHG r8,rAX
+91: XCHG rCX/r9,rAX
+92: XCHG rDX/r10,rAX
+93: XCHG rBX/r11,rAX
+94: XCHG rSP/r12,rAX
+95: XCHG rBP/r13,rAX
+96: XCHG rSI/r14,rAX
+97: XCHG rDI/r15,rAX
+98: CBW/CWDE/CDQE
+99: CWD/CDQ/CQO
+9a: CALLF Ap (i64)
+9b: FWAIT/WAIT
+9c: PUSHF/D/Q Fv (d64)
+9d: POPF/D/Q Fv (d64)
+9e: SAHF
+9f: LAHF
+# 0xa0 - 0xaf
+a0: MOV AL,Ob
+a1: MOV rAX,Ov
+a2: MOV Ob,AL
+a3: MOV Ov,rAX
+a4: MOVS/B Yb,Xb
+a5: MOVS/W/D/Q Yv,Xv
+a6: CMPS/B Xb,Yb
+a7: CMPS/W/D Xv,Yv
+a8: TEST AL,Ib
+a9: TEST rAX,Iz
+aa: STOS/B Yb,AL
+ab: STOS/W/D/Q Yv,rAX
+ac: LODS/B AL,Xb
+ad: LODS/W/D/Q rAX,Xv
+ae: SCAS/B AL,Yb
+# Note: The May 2011 Intel manual shows Xv for the second parameter of the
+# next instruction but Yv is correct
+af: SCAS/W/D/Q rAX,Yv
+# 0xb0 - 0xbf
+b0: MOV AL/R8L,Ib
+b1: MOV CL/R9L,Ib
+b2: MOV DL/R10L,Ib
+b3: MOV BL/R11L,Ib
+b4: MOV AH/R12L,Ib
+b5: MOV CH/R13L,Ib
+b6: MOV DH/R14L,Ib
+b7: MOV BH/R15L,Ib
+b8: MOV rAX/r8,Iv
+b9: MOV rCX/r9,Iv
+ba: MOV rDX/r10,Iv
+bb: MOV rBX/r11,Iv
+bc: MOV rSP/r12,Iv
+bd: MOV rBP/r13,Iv
+be: MOV rSI/r14,Iv
+bf: MOV rDI/r15,Iv
+# 0xc0 - 0xcf
+c0: Grp2 Eb,Ib (1A)
+c1: Grp2 Ev,Ib (1A)
+c2: RETN Iw (f64)
+c3: RETN
+c4: LES Gz,Mp (i64) | VEX+2byte (Prefix)
+c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix)
+c6: Grp11A Eb,Ib (1A)
+c7: Grp11B Ev,Iz (1A)
+c8: ENTER Iw,Ib
+c9: LEAVE (d64)
+ca: RETF Iw
+cb: RETF
+cc: INT3
+cd: INT Ib
+ce: INTO (i64)
+cf: IRET/D/Q
+# 0xd0 - 0xdf
+d0: Grp2 Eb,1 (1A)
+d1: Grp2 Ev,1 (1A)
+d2: Grp2 Eb,CL (1A)
+d3: Grp2 Ev,CL (1A)
+d4: AAM Ib (i64)
+d5: AAD Ib (i64)
+d6:
+d7: XLAT/XLATB
+d8: ESC
+d9: ESC
+da: ESC
+db: ESC
+dc: ESC
+dd: ESC
+de: ESC
+df: ESC
+# 0xe0 - 0xef
+# Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix
+# in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation
+# to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD.
+e0: LOOPNE/LOOPNZ Jb (f64)
+e1: LOOPE/LOOPZ Jb (f64)
+e2: LOOP Jb (f64)
+e3: JrCXZ Jb (f64)
+e4: IN AL,Ib
+e5: IN eAX,Ib
+e6: OUT Ib,AL
+e7: OUT Ib,eAX
+# With 0x66 prefix in 64-bit mode, for AMD CPUs immediate offset
+# in "near" jumps and calls is 16-bit. For CALL,
+# push of return address is 16-bit wide, RSP is decremented by 2
+# but is not truncated to 16 bits, unlike RIP.
+e8: CALL Jz (f64)
+e9: JMP-near Jz (f64)
+ea: JMP-far Ap (i64)
+eb: JMP-short Jb (f64)
+ec: IN AL,DX
+ed: IN eAX,DX
+ee: OUT DX,AL
+ef: OUT DX,eAX
+# 0xf0 - 0xff
+f0: LOCK (Prefix)
+f1:
+f2: REPNE (Prefix) | XACQUIRE (Prefix)
+f3: REP/REPE (Prefix) | XRELEASE (Prefix)
+f4: HLT
+f5: CMC
+f6: Grp3_1 Eb (1A)
+f7: Grp3_2 Ev (1A)
+f8: CLC
+f9: STC
+fa: CLI
+fb: STI
+fc: CLD
+fd: STD
+fe: Grp4 (1A)
+ff: Grp5 (1A)
+EndTable
+
+Table: 2-byte opcode (0x0f)
+Referrer: 2-byte escape
+AVXcode: 1
+# 0x0f 0x00-0x0f
+00: Grp6 (1A)
+01: Grp7 (1A)
+02: LAR Gv,Ew
+03: LSL Gv,Ew
+04:
+05: SYSCALL (o64)
+06: CLTS
+07: SYSRET (o64)
+08: INVD
+09: WBINVD | WBNOINVD (F3)
+0a:
+0b: UD2 (1B)
+0c:
+# AMD's prefetch group. Intel supports prefetchw(/1) only.
+0d: GrpP
+0e: FEMMS
+# 3DNow! uses the last imm byte as opcode extension.
+0f: 3DNow! Pq,Qq,Ib
+# 0x0f 0x10-0x1f
+# NOTE: According to Intel SDM opcode map, vmovups and vmovupd has no operands
+# but it actually has operands. And also, vmovss and vmovsd only accept 128bit.
+# MOVSS/MOVSD has too many forms(3) on SDM. This map just shows a typical form.
+# Many AVX instructions lack v1 superscript, according to Intel AVX-Prgramming
+# Reference A.1
+10: vmovups Vps,Wps | vmovupd Vpd,Wpd (66) | vmovss Vx,Hx,Wss (F3),(v1) | vmovsd Vx,Hx,Wsd (F2),(v1)
+11: vmovups Wps,Vps | vmovupd Wpd,Vpd (66) | vmovss Wss,Hx,Vss (F3),(v1) | vmovsd Wsd,Hx,Vsd (F2),(v1)
+12: vmovlps Vq,Hq,Mq (v1) | vmovhlps Vq,Hq,Uq (v1) | vmovlpd Vq,Hq,Mq (66),(v1) | vmovsldup Vx,Wx (F3) | vmovddup Vx,Wx (F2)
+13: vmovlps Mq,Vq (v1) | vmovlpd Mq,Vq (66),(v1)
+14: vunpcklps Vx,Hx,Wx | vunpcklpd Vx,Hx,Wx (66)
+15: vunpckhps Vx,Hx,Wx | vunpckhpd Vx,Hx,Wx (66)
+16: vmovhps Vdq,Hq,Mq (v1) | vmovlhps Vdq,Hq,Uq (v1) | vmovhpd Vdq,Hq,Mq (66),(v1) | vmovshdup Vx,Wx (F3)
+17: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1)
+18: Grp16 (1A)
+19:
+# Intel SDM opcode map does not list MPX instructions. For now using Gv for
+# bnd registers and Ev for everything else is OK because the instruction
+# decoder does not use the information except as an indication that there is
+# a ModR/M byte.
+1a: BNDCL Gv,Ev (F3) | BNDCU Gv,Ev (F2) | BNDMOV Gv,Ev (66) | BNDLDX Gv,Ev
+1b: BNDCN Gv,Ev (F2) | BNDMOV Ev,Gv (66) | BNDMK Gv,Ev (F3) | BNDSTX Ev,Gv
+1c: Grp20 (1A),(1C)
+1d:
+1e:
+1f: NOP Ev
+# 0x0f 0x20-0x2f
+20: MOV Rd,Cd
+21: MOV Rd,Dd
+22: MOV Cd,Rd
+23: MOV Dd,Rd
+24:
+25:
+26:
+27:
+28: vmovaps Vps,Wps | vmovapd Vpd,Wpd (66)
+29: vmovaps Wps,Vps | vmovapd Wpd,Vpd (66)
+2a: cvtpi2ps Vps,Qpi | cvtpi2pd Vpd,Qpi (66) | vcvtsi2ss Vss,Hss,Ey (F3),(v1) | vcvtsi2sd Vsd,Hsd,Ey (F2),(v1)
+2b: vmovntps Mps,Vps | vmovntpd Mpd,Vpd (66)
+2c: cvttps2pi Ppi,Wps | cvttpd2pi Ppi,Wpd (66) | vcvttss2si Gy,Wss (F3),(v1) | vcvttsd2si Gy,Wsd (F2),(v1)
+2d: cvtps2pi Ppi,Wps | cvtpd2pi Qpi,Wpd (66) | vcvtss2si Gy,Wss (F3),(v1) | vcvtsd2si Gy,Wsd (F2),(v1)
+2e: vucomiss Vss,Wss (v1) | vucomisd Vsd,Wsd (66),(v1)
+2f: vcomiss Vss,Wss (v1) | vcomisd Vsd,Wsd (66),(v1)
+# 0x0f 0x30-0x3f
+30: WRMSR
+31: RDTSC
+32: RDMSR
+33: RDPMC
+34: SYSENTER
+35: SYSEXIT
+36:
+37: GETSEC
+38: escape # 3-byte escape 1
+39:
+3a: escape # 3-byte escape 2
+3b:
+3c:
+3d:
+3e:
+3f:
+# 0x0f 0x40-0x4f
+40: CMOVO Gv,Ev
+41: CMOVNO Gv,Ev | kandw/q Vk,Hk,Uk | kandb/d Vk,Hk,Uk (66)
+42: CMOVB/C/NAE Gv,Ev | kandnw/q Vk,Hk,Uk | kandnb/d Vk,Hk,Uk (66)
+43: CMOVAE/NB/NC Gv,Ev
+44: CMOVE/Z Gv,Ev | knotw/q Vk,Uk | knotb/d Vk,Uk (66)
+45: CMOVNE/NZ Gv,Ev | korw/q Vk,Hk,Uk | korb/d Vk,Hk,Uk (66)
+46: CMOVBE/NA Gv,Ev | kxnorw/q Vk,Hk,Uk | kxnorb/d Vk,Hk,Uk (66)
+47: CMOVA/NBE Gv,Ev | kxorw/q Vk,Hk,Uk | kxorb/d Vk,Hk,Uk (66)
+48: CMOVS Gv,Ev
+49: CMOVNS Gv,Ev
+4a: CMOVP/PE Gv,Ev | kaddw/q Vk,Hk,Uk | kaddb/d Vk,Hk,Uk (66)
+4b: CMOVNP/PO Gv,Ev | kunpckbw Vk,Hk,Uk (66) | kunpckwd/dq Vk,Hk,Uk
+4c: CMOVL/NGE Gv,Ev
+4d: CMOVNL/GE Gv,Ev
+4e: CMOVLE/NG Gv,Ev
+4f: CMOVNLE/G Gv,Ev
+# 0x0f 0x50-0x5f
+50: vmovmskps Gy,Ups | vmovmskpd Gy,Upd (66)
+51: vsqrtps Vps,Wps | vsqrtpd Vpd,Wpd (66) | vsqrtss Vss,Hss,Wss (F3),(v1) | vsqrtsd Vsd,Hsd,Wsd (F2),(v1)
+52: vrsqrtps Vps,Wps | vrsqrtss Vss,Hss,Wss (F3),(v1)
+53: vrcpps Vps,Wps | vrcpss Vss,Hss,Wss (F3),(v1)
+54: vandps Vps,Hps,Wps | vandpd Vpd,Hpd,Wpd (66)
+55: vandnps Vps,Hps,Wps | vandnpd Vpd,Hpd,Wpd (66)
+56: vorps Vps,Hps,Wps | vorpd Vpd,Hpd,Wpd (66)
+57: vxorps Vps,Hps,Wps | vxorpd Vpd,Hpd,Wpd (66)
+58: vaddps Vps,Hps,Wps | vaddpd Vpd,Hpd,Wpd (66) | vaddss Vss,Hss,Wss (F3),(v1) | vaddsd Vsd,Hsd,Wsd (F2),(v1)
+59: vmulps Vps,Hps,Wps | vmulpd Vpd,Hpd,Wpd (66) | vmulss Vss,Hss,Wss (F3),(v1) | vmulsd Vsd,Hsd,Wsd (F2),(v1)
+5a: vcvtps2pd Vpd,Wps | vcvtpd2ps Vps,Wpd (66) | vcvtss2sd Vsd,Hx,Wss (F3),(v1) | vcvtsd2ss Vss,Hx,Wsd (F2),(v1)
+5b: vcvtdq2ps Vps,Wdq | vcvtqq2ps Vps,Wqq (evo) | vcvtps2dq Vdq,Wps (66) | vcvttps2dq Vdq,Wps (F3)
+5c: vsubps Vps,Hps,Wps | vsubpd Vpd,Hpd,Wpd (66) | vsubss Vss,Hss,Wss (F3),(v1) | vsubsd Vsd,Hsd,Wsd (F2),(v1)
+5d: vminps Vps,Hps,Wps | vminpd Vpd,Hpd,Wpd (66) | vminss Vss,Hss,Wss (F3),(v1) | vminsd Vsd,Hsd,Wsd (F2),(v1)
+5e: vdivps Vps,Hps,Wps | vdivpd Vpd,Hpd,Wpd (66) | vdivss Vss,Hss,Wss (F3),(v1) | vdivsd Vsd,Hsd,Wsd (F2),(v1)
+5f: vmaxps Vps,Hps,Wps | vmaxpd Vpd,Hpd,Wpd (66) | vmaxss Vss,Hss,Wss (F3),(v1) | vmaxsd Vsd,Hsd,Wsd (F2),(v1)
+# 0x0f 0x60-0x6f
+60: punpcklbw Pq,Qd | vpunpcklbw Vx,Hx,Wx (66),(v1)
+61: punpcklwd Pq,Qd | vpunpcklwd Vx,Hx,Wx (66),(v1)
+62: punpckldq Pq,Qd | vpunpckldq Vx,Hx,Wx (66),(v1)
+63: packsswb Pq,Qq | vpacksswb Vx,Hx,Wx (66),(v1)
+64: pcmpgtb Pq,Qq | vpcmpgtb Vx,Hx,Wx (66),(v1)
+65: pcmpgtw Pq,Qq | vpcmpgtw Vx,Hx,Wx (66),(v1)
+66: pcmpgtd Pq,Qq | vpcmpgtd Vx,Hx,Wx (66),(v1)
+67: packuswb Pq,Qq | vpackuswb Vx,Hx,Wx (66),(v1)
+68: punpckhbw Pq,Qd | vpunpckhbw Vx,Hx,Wx (66),(v1)
+69: punpckhwd Pq,Qd | vpunpckhwd Vx,Hx,Wx (66),(v1)
+6a: punpckhdq Pq,Qd | vpunpckhdq Vx,Hx,Wx (66),(v1)
+6b: packssdw Pq,Qd | vpackssdw Vx,Hx,Wx (66),(v1)
+6c: vpunpcklqdq Vx,Hx,Wx (66),(v1)
+6d: vpunpckhqdq Vx,Hx,Wx (66),(v1)
+6e: movd/q Pd,Ey | vmovd/q Vy,Ey (66),(v1)
+6f: movq Pq,Qq | vmovdqa Vx,Wx (66) | vmovdqa32/64 Vx,Wx (66),(evo) | vmovdqu Vx,Wx (F3) | vmovdqu32/64 Vx,Wx (F3),(evo) | vmovdqu8/16 Vx,Wx (F2),(ev)
+# 0x0f 0x70-0x7f
+70: pshufw Pq,Qq,Ib | vpshufd Vx,Wx,Ib (66),(v1) | vpshufhw Vx,Wx,Ib (F3),(v1) | vpshuflw Vx,Wx,Ib (F2),(v1)
+71: Grp12 (1A)
+72: Grp13 (1A)
+73: Grp14 (1A)
+74: pcmpeqb Pq,Qq | vpcmpeqb Vx,Hx,Wx (66),(v1)
+75: pcmpeqw Pq,Qq | vpcmpeqw Vx,Hx,Wx (66),(v1)
+76: pcmpeqd Pq,Qq | vpcmpeqd Vx,Hx,Wx (66),(v1)
+# Note: Remove (v), because vzeroall and vzeroupper becomes emms without VEX.
+77: emms | vzeroupper | vzeroall
+78: VMREAD Ey,Gy | vcvttps2udq/pd2udq Vx,Wpd (evo) | vcvttsd2usi Gv,Wx (F2),(ev) | vcvttss2usi Gv,Wx (F3),(ev) | vcvttps2uqq/pd2uqq Vx,Wx (66),(ev)
+79: VMWRITE Gy,Ey | vcvtps2udq/pd2udq Vx,Wpd (evo) | vcvtsd2usi Gv,Wx (F2),(ev) | vcvtss2usi Gv,Wx (F3),(ev) | vcvtps2uqq/pd2uqq Vx,Wx (66),(ev)
+7a: vcvtudq2pd/uqq2pd Vpd,Wx (F3),(ev) | vcvtudq2ps/uqq2ps Vpd,Wx (F2),(ev) | vcvttps2qq/pd2qq Vx,Wx (66),(ev)
+7b: vcvtusi2sd Vpd,Hpd,Ev (F2),(ev) | vcvtusi2ss Vps,Hps,Ev (F3),(ev) | vcvtps2qq/pd2qq Vx,Wx (66),(ev)
+7c: vhaddpd Vpd,Hpd,Wpd (66) | vhaddps Vps,Hps,Wps (F2)
+7d: vhsubpd Vpd,Hpd,Wpd (66) | vhsubps Vps,Hps,Wps (F2)
+7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1)
+7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqa32/64 Wx,Vx (66),(evo) | vmovdqu Wx,Vx (F3) | vmovdqu32/64 Wx,Vx (F3),(evo) | vmovdqu8/16 Wx,Vx (F2),(ev)
+# 0x0f 0x80-0x8f
+# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
+80: JO Jz (f64)
+81: JNO Jz (f64)
+82: JB/JC/JNAE Jz (f64)
+83: JAE/JNB/JNC Jz (f64)
+84: JE/JZ Jz (f64)
+85: JNE/JNZ Jz (f64)
+86: JBE/JNA Jz (f64)
+87: JA/JNBE Jz (f64)
+88: JS Jz (f64)
+89: JNS Jz (f64)
+8a: JP/JPE Jz (f64)
+8b: JNP/JPO Jz (f64)
+8c: JL/JNGE Jz (f64)
+8d: JNL/JGE Jz (f64)
+8e: JLE/JNG Jz (f64)
+8f: JNLE/JG Jz (f64)
+# 0x0f 0x90-0x9f
+90: SETO Eb | kmovw/q Vk,Wk | kmovb/d Vk,Wk (66)
+91: SETNO Eb | kmovw/q Mv,Vk | kmovb/d Mv,Vk (66)
+92: SETB/C/NAE Eb | kmovw Vk,Rv | kmovb Vk,Rv (66) | kmovq/d Vk,Rv (F2)
+93: SETAE/NB/NC Eb | kmovw Gv,Uk | kmovb Gv,Uk (66) | kmovq/d Gv,Uk (F2)
+94: SETE/Z Eb
+95: SETNE/NZ Eb
+96: SETBE/NA Eb
+97: SETA/NBE Eb
+98: SETS Eb | kortestw/q Vk,Uk | kortestb/d Vk,Uk (66)
+99: SETNS Eb | ktestw/q Vk,Uk | ktestb/d Vk,Uk (66)
+9a: SETP/PE Eb
+9b: SETNP/PO Eb
+9c: SETL/NGE Eb
+9d: SETNL/GE Eb
+9e: SETLE/NG Eb
+9f: SETNLE/G Eb
+# 0x0f 0xa0-0xaf
+a0: PUSH FS (d64)
+a1: POP FS (d64)
+a2: CPUID
+a3: BT Ev,Gv
+a4: SHLD Ev,Gv,Ib
+a5: SHLD Ev,Gv,CL
+a6: GrpPDLK
+a7: GrpRNG
+a8: PUSH GS (d64)
+a9: POP GS (d64)
+aa: RSM
+ab: BTS Ev,Gv
+ac: SHRD Ev,Gv,Ib
+ad: SHRD Ev,Gv,CL
+ae: Grp15 (1A),(1C)
+af: IMUL Gv,Ev
+# 0x0f 0xb0-0xbf
+b0: CMPXCHG Eb,Gb
+b1: CMPXCHG Ev,Gv
+b2: LSS Gv,Mp
+b3: BTR Ev,Gv
+b4: LFS Gv,Mp
+b5: LGS Gv,Mp
+b6: MOVZX Gv,Eb
+b7: MOVZX Gv,Ew
+b8: JMPE (!F3) | POPCNT Gv,Ev (F3)
+b9: Grp10 (1A)
+ba: Grp8 Ev,Ib (1A)
+bb: BTC Ev,Gv
+bc: BSF Gv,Ev (!F3) | TZCNT Gv,Ev (F3)
+bd: BSR Gv,Ev (!F3) | LZCNT Gv,Ev (F3)
+be: MOVSX Gv,Eb
+bf: MOVSX Gv,Ew
+# 0x0f 0xc0-0xcf
+c0: XADD Eb,Gb
+c1: XADD Ev,Gv
+c2: vcmpps Vps,Hps,Wps,Ib | vcmppd Vpd,Hpd,Wpd,Ib (66) | vcmpss Vss,Hss,Wss,Ib (F3),(v1) | vcmpsd Vsd,Hsd,Wsd,Ib (F2),(v1)
+c3: movnti My,Gy
+c4: pinsrw Pq,Ry/Mw,Ib | vpinsrw Vdq,Hdq,Ry/Mw,Ib (66),(v1)
+c5: pextrw Gd,Nq,Ib | vpextrw Gd,Udq,Ib (66),(v1)
+c6: vshufps Vps,Hps,Wps,Ib | vshufpd Vpd,Hpd,Wpd,Ib (66)
+c7: Grp9 (1A)
+c8: BSWAP RAX/EAX/R8/R8D
+c9: BSWAP RCX/ECX/R9/R9D
+ca: BSWAP RDX/EDX/R10/R10D
+cb: BSWAP RBX/EBX/R11/R11D
+cc: BSWAP RSP/ESP/R12/R12D
+cd: BSWAP RBP/EBP/R13/R13D
+ce: BSWAP RSI/ESI/R14/R14D
+cf: BSWAP RDI/EDI/R15/R15D
+# 0x0f 0xd0-0xdf
+d0: vaddsubpd Vpd,Hpd,Wpd (66) | vaddsubps Vps,Hps,Wps (F2)
+d1: psrlw Pq,Qq | vpsrlw Vx,Hx,Wx (66),(v1)
+d2: psrld Pq,Qq | vpsrld Vx,Hx,Wx (66),(v1)
+d3: psrlq Pq,Qq | vpsrlq Vx,Hx,Wx (66),(v1)
+d4: paddq Pq,Qq | vpaddq Vx,Hx,Wx (66),(v1)
+d5: pmullw Pq,Qq | vpmullw Vx,Hx,Wx (66),(v1)
+d6: vmovq Wq,Vq (66),(v1) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2)
+d7: pmovmskb Gd,Nq | vpmovmskb Gd,Ux (66),(v1)
+d8: psubusb Pq,Qq | vpsubusb Vx,Hx,Wx (66),(v1)
+d9: psubusw Pq,Qq | vpsubusw Vx,Hx,Wx (66),(v1)
+da: pminub Pq,Qq | vpminub Vx,Hx,Wx (66),(v1)
+db: pand Pq,Qq | vpand Vx,Hx,Wx (66),(v1) | vpandd/q Vx,Hx,Wx (66),(evo)
+dc: paddusb Pq,Qq | vpaddusb Vx,Hx,Wx (66),(v1)
+dd: paddusw Pq,Qq | vpaddusw Vx,Hx,Wx (66),(v1)
+de: pmaxub Pq,Qq | vpmaxub Vx,Hx,Wx (66),(v1)
+df: pandn Pq,Qq | vpandn Vx,Hx,Wx (66),(v1) | vpandnd/q Vx,Hx,Wx (66),(evo)
+# 0x0f 0xe0-0xef
+e0: pavgb Pq,Qq | vpavgb Vx,Hx,Wx (66),(v1)
+e1: psraw Pq,Qq | vpsraw Vx,Hx,Wx (66),(v1)
+e2: psrad Pq,Qq | vpsrad Vx,Hx,Wx (66),(v1)
+e3: pavgw Pq,Qq | vpavgw Vx,Hx,Wx (66),(v1)
+e4: pmulhuw Pq,Qq | vpmulhuw Vx,Hx,Wx (66),(v1)
+e5: pmulhw Pq,Qq | vpmulhw Vx,Hx,Wx (66),(v1)
+e6: vcvttpd2dq Vx,Wpd (66) | vcvtdq2pd Vx,Wdq (F3) | vcvtdq2pd/qq2pd Vx,Wdq (F3),(evo) | vcvtpd2dq Vx,Wpd (F2)
+e7: movntq Mq,Pq | vmovntdq Mx,Vx (66)
+e8: psubsb Pq,Qq | vpsubsb Vx,Hx,Wx (66),(v1)
+e9: psubsw Pq,Qq | vpsubsw Vx,Hx,Wx (66),(v1)
+ea: pminsw Pq,Qq | vpminsw Vx,Hx,Wx (66),(v1)
+eb: por Pq,Qq | vpor Vx,Hx,Wx (66),(v1) | vpord/q Vx,Hx,Wx (66),(evo)
+ec: paddsb Pq,Qq | vpaddsb Vx,Hx,Wx (66),(v1)
+ed: paddsw Pq,Qq | vpaddsw Vx,Hx,Wx (66),(v1)
+ee: pmaxsw Pq,Qq | vpmaxsw Vx,Hx,Wx (66),(v1)
+ef: pxor Pq,Qq | vpxor Vx,Hx,Wx (66),(v1) | vpxord/q Vx,Hx,Wx (66),(evo)
+# 0x0f 0xf0-0xff
+f0: vlddqu Vx,Mx (F2)
+f1: psllw Pq,Qq | vpsllw Vx,Hx,Wx (66),(v1)
+f2: pslld Pq,Qq | vpslld Vx,Hx,Wx (66),(v1)
+f3: psllq Pq,Qq | vpsllq Vx,Hx,Wx (66),(v1)
+f4: pmuludq Pq,Qq | vpmuludq Vx,Hx,Wx (66),(v1)
+f5: pmaddwd Pq,Qq | vpmaddwd Vx,Hx,Wx (66),(v1)
+f6: psadbw Pq,Qq | vpsadbw Vx,Hx,Wx (66),(v1)
+f7: maskmovq Pq,Nq | vmaskmovdqu Vx,Ux (66),(v1)
+f8: psubb Pq,Qq | vpsubb Vx,Hx,Wx (66),(v1)
+f9: psubw Pq,Qq | vpsubw Vx,Hx,Wx (66),(v1)
+fa: psubd Pq,Qq | vpsubd Vx,Hx,Wx (66),(v1)
+fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1)
+fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1)
+fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1)
+fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1)
+ff: UD0
+EndTable
+
+Table: 3-byte opcode 1 (0x0f 0x38)
+Referrer: 3-byte escape 1
+AVXcode: 2
+# 0x0f 0x38 0x00-0x0f
+00: pshufb Pq,Qq | vpshufb Vx,Hx,Wx (66),(v1)
+01: phaddw Pq,Qq | vphaddw Vx,Hx,Wx (66),(v1)
+02: phaddd Pq,Qq | vphaddd Vx,Hx,Wx (66),(v1)
+03: phaddsw Pq,Qq | vphaddsw Vx,Hx,Wx (66),(v1)
+04: pmaddubsw Pq,Qq | vpmaddubsw Vx,Hx,Wx (66),(v1)
+05: phsubw Pq,Qq | vphsubw Vx,Hx,Wx (66),(v1)
+06: phsubd Pq,Qq | vphsubd Vx,Hx,Wx (66),(v1)
+07: phsubsw Pq,Qq | vphsubsw Vx,Hx,Wx (66),(v1)
+08: psignb Pq,Qq | vpsignb Vx,Hx,Wx (66),(v1)
+09: psignw Pq,Qq | vpsignw Vx,Hx,Wx (66),(v1)
+0a: psignd Pq,Qq | vpsignd Vx,Hx,Wx (66),(v1)
+0b: pmulhrsw Pq,Qq | vpmulhrsw Vx,Hx,Wx (66),(v1)
+0c: vpermilps Vx,Hx,Wx (66),(v)
+0d: vpermilpd Vx,Hx,Wx (66),(v)
+0e: vtestps Vx,Wx (66),(v)
+0f: vtestpd Vx,Wx (66),(v)
+# 0x0f 0x38 0x10-0x1f
+10: pblendvb Vdq,Wdq (66) | vpsrlvw Vx,Hx,Wx (66),(evo) | vpmovuswb Wx,Vx (F3),(ev)
+11: vpmovusdb Wx,Vd (F3),(ev) | vpsravw Vx,Hx,Wx (66),(ev)
+12: vpmovusqb Wx,Vq (F3),(ev) | vpsllvw Vx,Hx,Wx (66),(ev)
+13: vcvtph2ps Vx,Wx (66),(v) | vpmovusdw Wx,Vd (F3),(ev)
+14: blendvps Vdq,Wdq (66) | vpmovusqw Wx,Vq (F3),(ev) | vprorvd/q Vx,Hx,Wx (66),(evo)
+15: blendvpd Vdq,Wdq (66) | vpmovusqd Wx,Vq (F3),(ev) | vprolvd/q Vx,Hx,Wx (66),(evo)
+16: vpermps Vqq,Hqq,Wqq (66),(v) | vpermps/d Vqq,Hqq,Wqq (66),(evo)
+17: vptest Vx,Wx (66)
+18: vbroadcastss Vx,Wd (66),(v)
+19: vbroadcastsd Vqq,Wq (66),(v) | vbroadcastf32x2 Vqq,Wq (66),(evo)
+1a: vbroadcastf128 Vqq,Mdq (66),(v) | vbroadcastf32x4/64x2 Vqq,Wq (66),(evo)
+1b: vbroadcastf32x8/64x4 Vqq,Mdq (66),(ev)
+1c: pabsb Pq,Qq | vpabsb Vx,Wx (66),(v1)
+1d: pabsw Pq,Qq | vpabsw Vx,Wx (66),(v1)
+1e: pabsd Pq,Qq | vpabsd Vx,Wx (66),(v1)
+1f: vpabsq Vx,Wx (66),(ev)
+# 0x0f 0x38 0x20-0x2f
+20: vpmovsxbw Vx,Ux/Mq (66),(v1) | vpmovswb Wx,Vx (F3),(ev)
+21: vpmovsxbd Vx,Ux/Md (66),(v1) | vpmovsdb Wx,Vd (F3),(ev)
+22: vpmovsxbq Vx,Ux/Mw (66),(v1) | vpmovsqb Wx,Vq (F3),(ev)
+23: vpmovsxwd Vx,Ux/Mq (66),(v1) | vpmovsdw Wx,Vd (F3),(ev)
+24: vpmovsxwq Vx,Ux/Md (66),(v1) | vpmovsqw Wx,Vq (F3),(ev)
+25: vpmovsxdq Vx,Ux/Mq (66),(v1) | vpmovsqd Wx,Vq (F3),(ev)
+26: vptestmb/w Vk,Hx,Wx (66),(ev) | vptestnmb/w Vk,Hx,Wx (F3),(ev)
+27: vptestmd/q Vk,Hx,Wx (66),(ev) | vptestnmd/q Vk,Hx,Wx (F3),(ev)
+28: vpmuldq Vx,Hx,Wx (66),(v1) | vpmovm2b/w Vx,Uk (F3),(ev)
+29: vpcmpeqq Vx,Hx,Wx (66),(v1) | vpmovb2m/w2m Vk,Ux (F3),(ev)
+2a: vmovntdqa Vx,Mx (66),(v1) | vpbroadcastmb2q Vx,Uk (F3),(ev)
+2b: vpackusdw Vx,Hx,Wx (66),(v1)
+2c: vmaskmovps Vx,Hx,Mx (66),(v) | vscalefps/d Vx,Hx,Wx (66),(evo)
+2d: vmaskmovpd Vx,Hx,Mx (66),(v) | vscalefss/d Vx,Hx,Wx (66),(evo)
+2e: vmaskmovps Mx,Hx,Vx (66),(v)
+2f: vmaskmovpd Mx,Hx,Vx (66),(v)
+# 0x0f 0x38 0x30-0x3f
+30: vpmovzxbw Vx,Ux/Mq (66),(v1) | vpmovwb Wx,Vx (F3),(ev)
+31: vpmovzxbd Vx,Ux/Md (66),(v1) | vpmovdb Wx,Vd (F3),(ev)
+32: vpmovzxbq Vx,Ux/Mw (66),(v1) | vpmovqb Wx,Vq (F3),(ev)
+33: vpmovzxwd Vx,Ux/Mq (66),(v1) | vpmovdw Wx,Vd (F3),(ev)
+34: vpmovzxwq Vx,Ux/Md (66),(v1) | vpmovqw Wx,Vq (F3),(ev)
+35: vpmovzxdq Vx,Ux/Mq (66),(v1) | vpmovqd Wx,Vq (F3),(ev)
+36: vpermd Vqq,Hqq,Wqq (66),(v) | vpermd/q Vqq,Hqq,Wqq (66),(evo)
+37: vpcmpgtq Vx,Hx,Wx (66),(v1)
+38: vpminsb Vx,Hx,Wx (66),(v1) | vpmovm2d/q Vx,Uk (F3),(ev)
+39: vpminsd Vx,Hx,Wx (66),(v1) | vpminsd/q Vx,Hx,Wx (66),(evo) | vpmovd2m/q2m Vk,Ux (F3),(ev)
+3a: vpminuw Vx,Hx,Wx (66),(v1) | vpbroadcastmw2d Vx,Uk (F3),(ev)
+3b: vpminud Vx,Hx,Wx (66),(v1) | vpminud/q Vx,Hx,Wx (66),(evo)
+3c: vpmaxsb Vx,Hx,Wx (66),(v1)
+3d: vpmaxsd Vx,Hx,Wx (66),(v1) | vpmaxsd/q Vx,Hx,Wx (66),(evo)
+3e: vpmaxuw Vx,Hx,Wx (66),(v1)
+3f: vpmaxud Vx,Hx,Wx (66),(v1) | vpmaxud/q Vx,Hx,Wx (66),(evo)
+# 0x0f 0x38 0x40-0x8f
+40: vpmulld Vx,Hx,Wx (66),(v1) | vpmulld/q Vx,Hx,Wx (66),(evo)
+41: vphminposuw Vdq,Wdq (66),(v1)
+42: vgetexpps/d Vx,Wx (66),(ev)
+43: vgetexpss/d Vx,Hx,Wx (66),(ev)
+44: vplzcntd/q Vx,Wx (66),(ev)
+45: vpsrlvd/q Vx,Hx,Wx (66),(v)
+46: vpsravd Vx,Hx,Wx (66),(v) | vpsravd/q Vx,Hx,Wx (66),(evo)
+47: vpsllvd/q Vx,Hx,Wx (66),(v)
+# Skip 0x48-0x4b
+4c: vrcp14ps/d Vpd,Wpd (66),(ev)
+4d: vrcp14ss/d Vsd,Hpd,Wsd (66),(ev)
+4e: vrsqrt14ps/d Vpd,Wpd (66),(ev)
+4f: vrsqrt14ss/d Vsd,Hsd,Wsd (66),(ev)
+# Skip 0x50-0x57
+58: vpbroadcastd Vx,Wx (66),(v)
+59: vpbroadcastq Vx,Wx (66),(v) | vbroadcasti32x2 Vx,Wx (66),(evo)
+5a: vbroadcasti128 Vqq,Mdq (66),(v) | vbroadcasti32x4/64x2 Vx,Wx (66),(evo)
+5b: vbroadcasti32x8/64x4 Vqq,Mdq (66),(ev)
+# Skip 0x5c-0x63
+64: vpblendmd/q Vx,Hx,Wx (66),(ev)
+65: vblendmps/d Vx,Hx,Wx (66),(ev)
+66: vpblendmb/w Vx,Hx,Wx (66),(ev)
+# Skip 0x67-0x74
+75: vpermi2b/w Vx,Hx,Wx (66),(ev)
+76: vpermi2d/q Vx,Hx,Wx (66),(ev)
+77: vpermi2ps/d Vx,Hx,Wx (66),(ev)
+78: vpbroadcastb Vx,Wx (66),(v)
+79: vpbroadcastw Vx,Wx (66),(v)
+7a: vpbroadcastb Vx,Rv (66),(ev)
+7b: vpbroadcastw Vx,Rv (66),(ev)
+7c: vpbroadcastd/q Vx,Rv (66),(ev)
+7d: vpermt2b/w Vx,Hx,Wx (66),(ev)
+7e: vpermt2d/q Vx,Hx,Wx (66),(ev)
+7f: vpermt2ps/d Vx,Hx,Wx (66),(ev)
+80: INVEPT Gy,Mdq (66)
+81: INVVPID Gy,Mdq (66)
+82: INVPCID Gy,Mdq (66)
+83: vpmultishiftqb Vx,Hx,Wx (66),(ev)
+88: vexpandps/d Vpd,Wpd (66),(ev)
+89: vpexpandd/q Vx,Wx (66),(ev)
+8a: vcompressps/d Wx,Vx (66),(ev)
+8b: vpcompressd/q Wx,Vx (66),(ev)
+8c: vpmaskmovd/q Vx,Hx,Mx (66),(v)
+8d: vpermb/w Vx,Hx,Wx (66),(ev)
+8e: vpmaskmovd/q Mx,Vx,Hx (66),(v)
+# 0x0f 0x38 0x90-0xbf (FMA)
+90: vgatherdd/q Vx,Hx,Wx (66),(v) | vpgatherdd/q Vx,Wx (66),(evo)
+91: vgatherqd/q Vx,Hx,Wx (66),(v) | vpgatherqd/q Vx,Wx (66),(evo)
+92: vgatherdps/d Vx,Hx,Wx (66),(v)
+93: vgatherqps/d Vx,Hx,Wx (66),(v)
+94:
+95:
+96: vfmaddsub132ps/d Vx,Hx,Wx (66),(v)
+97: vfmsubadd132ps/d Vx,Hx,Wx (66),(v)
+98: vfmadd132ps/d Vx,Hx,Wx (66),(v)
+99: vfmadd132ss/d Vx,Hx,Wx (66),(v),(v1)
+9a: vfmsub132ps/d Vx,Hx,Wx (66),(v)
+9b: vfmsub132ss/d Vx,Hx,Wx (66),(v),(v1)
+9c: vfnmadd132ps/d Vx,Hx,Wx (66),(v)
+9d: vfnmadd132ss/d Vx,Hx,Wx (66),(v),(v1)
+9e: vfnmsub132ps/d Vx,Hx,Wx (66),(v)
+9f: vfnmsub132ss/d Vx,Hx,Wx (66),(v),(v1)
+a0: vpscatterdd/q Wx,Vx (66),(ev)
+a1: vpscatterqd/q Wx,Vx (66),(ev)
+a2: vscatterdps/d Wx,Vx (66),(ev)
+a3: vscatterqps/d Wx,Vx (66),(ev)
+a6: vfmaddsub213ps/d Vx,Hx,Wx (66),(v)
+a7: vfmsubadd213ps/d Vx,Hx,Wx (66),(v)
+a8: vfmadd213ps/d Vx,Hx,Wx (66),(v)
+a9: vfmadd213ss/d Vx,Hx,Wx (66),(v),(v1)
+aa: vfmsub213ps/d Vx,Hx,Wx (66),(v)
+ab: vfmsub213ss/d Vx,Hx,Wx (66),(v),(v1)
+ac: vfnmadd213ps/d Vx,Hx,Wx (66),(v)
+ad: vfnmadd213ss/d Vx,Hx,Wx (66),(v),(v1)
+ae: vfnmsub213ps/d Vx,Hx,Wx (66),(v)
+af: vfnmsub213ss/d Vx,Hx,Wx (66),(v),(v1)
+b4: vpmadd52luq Vx,Hx,Wx (66),(ev)
+b5: vpmadd52huq Vx,Hx,Wx (66),(ev)
+b6: vfmaddsub231ps/d Vx,Hx,Wx (66),(v)
+b7: vfmsubadd231ps/d Vx,Hx,Wx (66),(v)
+b8: vfmadd231ps/d Vx,Hx,Wx (66),(v)
+b9: vfmadd231ss/d Vx,Hx,Wx (66),(v),(v1)
+ba: vfmsub231ps/d Vx,Hx,Wx (66),(v)
+bb: vfmsub231ss/d Vx,Hx,Wx (66),(v),(v1)
+bc: vfnmadd231ps/d Vx,Hx,Wx (66),(v)
+bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1)
+be: vfnmsub231ps/d Vx,Hx,Wx (66),(v)
+bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1)
+# 0x0f 0x38 0xc0-0xff
+c4: vpconflictd/q Vx,Wx (66),(ev)
+c6: Grp18 (1A)
+c7: Grp19 (1A)
+c8: sha1nexte Vdq,Wdq | vexp2ps/d Vx,Wx (66),(ev)
+c9: sha1msg1 Vdq,Wdq
+ca: sha1msg2 Vdq,Wdq | vrcp28ps/d Vx,Wx (66),(ev)
+cb: sha256rnds2 Vdq,Wdq | vrcp28ss/d Vx,Hx,Wx (66),(ev)
+cc: sha256msg1 Vdq,Wdq | vrsqrt28ps/d Vx,Wx (66),(ev)
+cd: sha256msg2 Vdq,Wdq | vrsqrt28ss/d Vx,Hx,Wx (66),(ev)
+db: VAESIMC Vdq,Wdq (66),(v1)
+dc: VAESENC Vdq,Hdq,Wdq (66),(v1)
+dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1)
+de: VAESDEC Vdq,Hdq,Wdq (66),(v1)
+df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1)
+f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) | CRC32 Gd,Eb (66&F2)
+f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) | CRC32 Gd,Ew (66&F2)
+f2: ANDN Gy,By,Ey (v)
+f3: Grp17 (1A)
+f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v)
+f6: ADCX Gy,Ey (66) | ADOX Gy,Ey (F3) | MULX By,Gy,rDX,Ey (F2),(v)
+f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
+f8: MOVDIR64B Gv,Mdqq (66) | ENQCMD Gv,Mdqq (F2) | ENQCMDS Gv,Mdqq (F3)
+f9: MOVDIRI My,Gy
+EndTable
+
+Table: 3-byte opcode 2 (0x0f 0x3a)
+Referrer: 3-byte escape 2
+AVXcode: 3
+# 0x0f 0x3a 0x00-0xff
+00: vpermq Vqq,Wqq,Ib (66),(v)
+01: vpermpd Vqq,Wqq,Ib (66),(v)
+02: vpblendd Vx,Hx,Wx,Ib (66),(v)
+03: valignd/q Vx,Hx,Wx,Ib (66),(ev)
+04: vpermilps Vx,Wx,Ib (66),(v)
+05: vpermilpd Vx,Wx,Ib (66),(v)
+06: vperm2f128 Vqq,Hqq,Wqq,Ib (66),(v)
+07:
+08: vroundps Vx,Wx,Ib (66) | vrndscaleps Vx,Wx,Ib (66),(evo)
+09: vroundpd Vx,Wx,Ib (66) | vrndscalepd Vx,Wx,Ib (66),(evo)
+0a: vroundss Vss,Wss,Ib (66),(v1) | vrndscaless Vx,Hx,Wx,Ib (66),(evo)
+0b: vroundsd Vsd,Wsd,Ib (66),(v1) | vrndscalesd Vx,Hx,Wx,Ib (66),(evo)
+0c: vblendps Vx,Hx,Wx,Ib (66)
+0d: vblendpd Vx,Hx,Wx,Ib (66)
+0e: vpblendw Vx,Hx,Wx,Ib (66),(v1)
+0f: palignr Pq,Qq,Ib | vpalignr Vx,Hx,Wx,Ib (66),(v1)
+14: vpextrb Rd/Mb,Vdq,Ib (66),(v1)
+15: vpextrw Rd/Mw,Vdq,Ib (66),(v1)
+16: vpextrd/q Ey,Vdq,Ib (66),(v1)
+17: vextractps Ed,Vdq,Ib (66),(v1)
+18: vinsertf128 Vqq,Hqq,Wqq,Ib (66),(v) | vinsertf32x4/64x2 Vqq,Hqq,Wqq,Ib (66),(evo)
+19: vextractf128 Wdq,Vqq,Ib (66),(v) | vextractf32x4/64x2 Wdq,Vqq,Ib (66),(evo)
+1a: vinsertf32x8/64x4 Vqq,Hqq,Wqq,Ib (66),(ev)
+1b: vextractf32x8/64x4 Wdq,Vqq,Ib (66),(ev)
+1d: vcvtps2ph Wx,Vx,Ib (66),(v)
+1e: vpcmpud/q Vk,Hd,Wd,Ib (66),(ev)
+1f: vpcmpd/q Vk,Hd,Wd,Ib (66),(ev)
+20: vpinsrb Vdq,Hdq,Ry/Mb,Ib (66),(v1)
+21: vinsertps Vdq,Hdq,Udq/Md,Ib (66),(v1)
+22: vpinsrd/q Vdq,Hdq,Ey,Ib (66),(v1)
+23: vshuff32x4/64x2 Vx,Hx,Wx,Ib (66),(ev)
+25: vpternlogd/q Vx,Hx,Wx,Ib (66),(ev)
+26: vgetmantps/d Vx,Wx,Ib (66),(ev)
+27: vgetmantss/d Vx,Hx,Wx,Ib (66),(ev)
+30: kshiftrb/w Vk,Uk,Ib (66),(v)
+31: kshiftrd/q Vk,Uk,Ib (66),(v)
+32: kshiftlb/w Vk,Uk,Ib (66),(v)
+33: kshiftld/q Vk,Uk,Ib (66),(v)
+38: vinserti128 Vqq,Hqq,Wqq,Ib (66),(v) | vinserti32x4/64x2 Vqq,Hqq,Wqq,Ib (66),(evo)
+39: vextracti128 Wdq,Vqq,Ib (66),(v) | vextracti32x4/64x2 Wdq,Vqq,Ib (66),(evo)
+3a: vinserti32x8/64x4 Vqq,Hqq,Wqq,Ib (66),(ev)
+3b: vextracti32x8/64x4 Wdq,Vqq,Ib (66),(ev)
+3e: vpcmpub/w Vk,Hk,Wx,Ib (66),(ev)
+3f: vpcmpb/w Vk,Hk,Wx,Ib (66),(ev)
+40: vdpps Vx,Hx,Wx,Ib (66)
+41: vdppd Vdq,Hdq,Wdq,Ib (66),(v1)
+42: vmpsadbw Vx,Hx,Wx,Ib (66),(v1) | vdbpsadbw Vx,Hx,Wx,Ib (66),(evo)
+43: vshufi32x4/64x2 Vx,Hx,Wx,Ib (66),(ev)
+44: vpclmulqdq Vdq,Hdq,Wdq,Ib (66),(v1)
+46: vperm2i128 Vqq,Hqq,Wqq,Ib (66),(v)
+4a: vblendvps Vx,Hx,Wx,Lx (66),(v)
+4b: vblendvpd Vx,Hx,Wx,Lx (66),(v)
+4c: vpblendvb Vx,Hx,Wx,Lx (66),(v1)
+50: vrangeps/d Vx,Hx,Wx,Ib (66),(ev)
+51: vrangess/d Vx,Hx,Wx,Ib (66),(ev)
+54: vfixupimmps/d Vx,Hx,Wx,Ib (66),(ev)
+55: vfixupimmss/d Vx,Hx,Wx,Ib (66),(ev)
+56: vreduceps/d Vx,Wx,Ib (66),(ev)
+57: vreducess/d Vx,Hx,Wx,Ib (66),(ev)
+60: vpcmpestrm Vdq,Wdq,Ib (66),(v1)
+61: vpcmpestri Vdq,Wdq,Ib (66),(v1)
+62: vpcmpistrm Vdq,Wdq,Ib (66),(v1)
+63: vpcmpistri Vdq,Wdq,Ib (66),(v1)
+66: vfpclassps/d Vk,Wx,Ib (66),(ev)
+67: vfpclassss/d Vk,Wx,Ib (66),(ev)
+cc: sha1rnds4 Vdq,Wdq,Ib
+df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1)
+f0: RORX Gy,Ey,Ib (F2),(v)
+EndTable
+
+GrpTable: Grp1
+0: ADD
+1: OR
+2: ADC
+3: SBB
+4: AND
+5: SUB
+6: XOR
+7: CMP
+EndTable
+
+GrpTable: Grp1A
+0: POP
+EndTable
+
+GrpTable: Grp2
+0: ROL
+1: ROR
+2: RCL
+3: RCR
+4: SHL/SAL
+5: SHR
+6:
+7: SAR
+EndTable
+
+GrpTable: Grp3_1
+0: TEST Eb,Ib
+1: TEST Eb,Ib
+2: NOT Eb
+3: NEG Eb
+4: MUL AL,Eb
+5: IMUL AL,Eb
+6: DIV AL,Eb
+7: IDIV AL,Eb
+EndTable
+
+GrpTable: Grp3_2
+0: TEST Ev,Iz
+1: TEST Ev,Iz
+2: NOT Ev
+3: NEG Ev
+4: MUL rAX,Ev
+5: IMUL rAX,Ev
+6: DIV rAX,Ev
+7: IDIV rAX,Ev
+EndTable
+
+GrpTable: Grp4
+0: INC Eb
+1: DEC Eb
+EndTable
+
+GrpTable: Grp5
+0: INC Ev
+1: DEC Ev
+# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
+2: CALLN Ev (f64)
+3: CALLF Ep
+4: JMPN Ev (f64)
+5: JMPF Mp
+6: PUSH Ev (d64)
+7:
+EndTable
+
+GrpTable: Grp6
+0: SLDT Rv/Mw
+1: STR Rv/Mw
+2: LLDT Ew
+3: LTR Ew
+4: VERR Ew
+5: VERW Ew
+EndTable
+
+GrpTable: Grp7
+0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) | PCONFIG (101),(11B) | ENCLV (000),(11B)
+1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001),(11B) | CLAC (010),(11B) | STAC (011),(11B) | ENCLS (111),(11B)
+2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) | XEND (101)(11B) | XTEST (110)(11B) | ENCLU (111),(11B)
+3: LIDT Ms
+4: SMSW Mw/Rv
+5: rdpkru (110),(11B) | wrpkru (111),(11B)
+6: LMSW Ew
+7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B)
+EndTable
+
+GrpTable: Grp8
+4: BT
+5: BTS
+6: BTR
+7: BTC
+EndTable
+
+GrpTable: Grp9
+1: CMPXCHG8B/16B Mq/Mdq
+3: xrstors
+4: xsavec
+5: xsaves
+6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B)
+7: VMPTRST Mq | VMPTRST Mq (F3) | RDSEED Rv (11B)
+EndTable
+
+GrpTable: Grp10
+# all are UD1
+0: UD1
+1: UD1
+2: UD1
+3: UD1
+4: UD1
+5: UD1
+6: UD1
+7: UD1
+EndTable
+
+# Grp11A and Grp11B are expressed as Grp11 in Intel SDM
+GrpTable: Grp11A
+0: MOV Eb,Ib
+7: XABORT Ib (000),(11B)
+EndTable
+
+GrpTable: Grp11B
+0: MOV Eb,Iz
+7: XBEGIN Jz (000),(11B)
+EndTable
+
+GrpTable: Grp12
+2: psrlw Nq,Ib (11B) | vpsrlw Hx,Ux,Ib (66),(11B),(v1)
+4: psraw Nq,Ib (11B) | vpsraw Hx,Ux,Ib (66),(11B),(v1)
+6: psllw Nq,Ib (11B) | vpsllw Hx,Ux,Ib (66),(11B),(v1)
+EndTable
+
+GrpTable: Grp13
+0: vprord/q Hx,Wx,Ib (66),(ev)
+1: vprold/q Hx,Wx,Ib (66),(ev)
+2: psrld Nq,Ib (11B) | vpsrld Hx,Ux,Ib (66),(11B),(v1)
+4: psrad Nq,Ib (11B) | vpsrad Hx,Ux,Ib (66),(11B),(v1) | vpsrad/q Hx,Ux,Ib (66),(evo)
+6: pslld Nq,Ib (11B) | vpslld Hx,Ux,Ib (66),(11B),(v1)
+EndTable
+
+GrpTable: Grp14
+2: psrlq Nq,Ib (11B) | vpsrlq Hx,Ux,Ib (66),(11B),(v1)
+3: vpsrldq Hx,Ux,Ib (66),(11B),(v1)
+6: psllq Nq,Ib (11B) | vpsllq Hx,Ux,Ib (66),(11B),(v1)
+7: vpslldq Hx,Ux,Ib (66),(11B),(v1)
+EndTable
+
+GrpTable: Grp15
+0: fxsave | RDFSBASE Ry (F3),(11B)
+1: fxstor | RDGSBASE Ry (F3),(11B)
+2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B)
+3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B)
+4: XSAVE | ptwrite Ey (F3),(11B)
+5: XRSTOR | lfence (11B)
+6: XSAVEOPT | clwb (66) | mfence (11B) | TPAUSE Rd (66),(11B) | UMONITOR Rv (F3),(11B) | UMWAIT Rd (F2),(11B)
+7: clflush | clflushopt (66) | sfence (11B)
+EndTable
+
+GrpTable: Grp16
+0: prefetch NTA
+1: prefetch T0
+2: prefetch T1
+3: prefetch T2
+EndTable
+
+GrpTable: Grp17
+1: BLSR By,Ey (v)
+2: BLSMSK By,Ey (v)
+3: BLSI By,Ey (v)
+EndTable
+
+GrpTable: Grp18
+1: vgatherpf0dps/d Wx (66),(ev)
+2: vgatherpf1dps/d Wx (66),(ev)
+5: vscatterpf0dps/d Wx (66),(ev)
+6: vscatterpf1dps/d Wx (66),(ev)
+EndTable
+
+GrpTable: Grp19
+1: vgatherpf0qps/d Wx (66),(ev)
+2: vgatherpf1qps/d Wx (66),(ev)
+5: vscatterpf0qps/d Wx (66),(ev)
+6: vscatterpf1qps/d Wx (66),(ev)
+EndTable
+
+GrpTable: Grp20
+0: cldemote Mb
+EndTable
+
+# AMD's Prefetch Group
+GrpTable: GrpP
+0: PREFETCH
+1: PREFETCHW
+EndTable
+
+GrpTable: GrpPDLK
+0: MONTMUL
+1: XSHA1
+2: XSHA2
+EndTable
+
+GrpTable: GrpRNG
+0: xstore-rng
+1: xcrypt-ecb
+2: xcrypt-cbc
+4: xcrypt-cfb
+5: xcrypt-ofb
+EndTable
diff --git a/arch/x86/math-emu/Makefile b/arch/x86/math-emu/Makefile
new file mode 100644
index 000000000..02211fc6f
--- /dev/null
+++ b/arch/x86/math-emu/Makefile
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for wm-FPU-emu
+#
+
+#DEBUG = -DDEBUGGING
+DEBUG =
+PARANOID = -DPARANOID
+ccflags-y += $(PARANOID) $(DEBUG) -fno-builtin $(MATH_EMULATION)
+asflags-y += $(PARANOID)
+
+# From 'C' language sources:
+C_OBJS =fpu_entry.o errors.o \
+ fpu_arith.o fpu_aux.o fpu_etc.o fpu_tags.o fpu_trig.o \
+ load_store.o get_address.o \
+ poly_atan.o poly_l2.o poly_2xm1.o poly_sin.o poly_tan.o \
+ reg_add_sub.o reg_compare.o reg_constant.o reg_convert.o \
+ reg_ld_str.o reg_divide.o reg_mul.o
+
+# From 80x86 assembler sources:
+A_OBJS =reg_u_add.o reg_u_div.o reg_u_mul.o reg_u_sub.o \
+ div_small.o reg_norm.o reg_round.o \
+ wm_shrx.o wm_sqrt.o \
+ div_Xsig.o polynom_Xsig.o round_Xsig.o \
+ shr_Xsig.o mul_Xsig.o
+
+obj-y =$(C_OBJS) $(A_OBJS)
+
+proto:
+ cproto -e -DMAKING_PROTO *.c >fpu_proto.h
diff --git a/arch/x86/math-emu/README b/arch/x86/math-emu/README
new file mode 100644
index 000000000..e6235491d
--- /dev/null
+++ b/arch/x86/math-emu/README
@@ -0,0 +1,427 @@
+ +---------------------------------------------------------------------------+
+ | wm-FPU-emu an FPU emulator for 80386 and 80486SX microprocessors. |
+ | |
+ | Copyright (C) 1992,1993,1994,1995,1996,1997,1999 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@melbpc.org.au |
+ | |
+ | This program is free software; you can redistribute it and/or modify |
+ | it under the terms of the GNU General Public License version 2 as |
+ | published by the Free Software Foundation. |
+ | |
+ | This program is distributed in the hope that it will be useful, |
+ | but WITHOUT ANY WARRANTY; without even the implied warranty of |
+ | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
+ | GNU General Public License for more details. |
+ | |
+ | You should have received a copy of the GNU General Public License |
+ | along with this program; if not, write to the Free Software |
+ | Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
+ | |
+ +---------------------------------------------------------------------------+
+
+
+
+wm-FPU-emu is an FPU emulator for Linux. It is derived from wm-emu387
+which was my 80387 emulator for early versions of djgpp (gcc under
+msdos); wm-emu387 was in turn based upon emu387 which was written by
+DJ Delorie for djgpp. The interface to the Linux kernel is based upon
+the original Linux math emulator by Linus Torvalds.
+
+My target FPU for wm-FPU-emu is that described in the Intel486
+Programmer's Reference Manual (1992 edition). Unfortunately, numerous
+facets of the functioning of the FPU are not well covered in the
+Reference Manual. The information in the manual has been supplemented
+with measurements on real 80486's. Unfortunately, it is simply not
+possible to be sure that all of the peculiarities of the 80486 have
+been discovered, so there is always likely to be obscure differences
+in the detailed behaviour of the emulator and a real 80486.
+
+wm-FPU-emu does not implement all of the behaviour of the 80486 FPU,
+but is very close. See "Limitations" later in this file for a list of
+some differences.
+
+Please report bugs, etc to me at:
+ billm@melbpc.org.au
+or b.metzenthen@medoto.unimelb.edu.au
+
+For more information on the emulator and on floating point topics, see
+my web pages, currently at http://www.suburbia.net/~billm/
+
+
+--Bill Metzenthen
+ December 1999
+
+
+----------------------- Internals of wm-FPU-emu -----------------------
+
+Numeric algorithms:
+(1) Add, subtract, and multiply. Nothing remarkable in these.
+(2) Divide has been tuned to get reasonable performance. The algorithm
+ is not the obvious one which most people seem to use, but is designed
+ to take advantage of the characteristics of the 80386. I expect that
+ it has been invented many times before I discovered it, but I have not
+ seen it. It is based upon one of those ideas which one carries around
+ for years without ever bothering to check it out.
+(3) The sqrt function has been tuned to get good performance. It is based
+ upon Newton's classic method. Performance was improved by capitalizing
+ upon the properties of Newton's method, and the code is once again
+ structured taking account of the 80386 characteristics.
+(4) The trig, log, and exp functions are based in each case upon quasi-
+ "optimal" polynomial approximations. My definition of "optimal" was
+ based upon getting good accuracy with reasonable speed.
+(5) The argument reducing code for the trig function effectively uses
+ a value of pi which is accurate to more than 128 bits. As a consequence,
+ the reduced argument is accurate to more than 64 bits for arguments up
+ to a few pi, and accurate to more than 64 bits for most arguments,
+ even for arguments approaching 2^63. This is far superior to an
+ 80486, which uses a value of pi which is accurate to 66 bits.
+
+The code of the emulator is complicated slightly by the need to
+account for a limited form of re-entrancy. Normally, the emulator will
+emulate each FPU instruction to completion without interruption.
+However, it may happen that when the emulator is accessing the user
+memory space, swapping may be needed. In this case the emulator may be
+temporarily suspended while disk i/o takes place. During this time
+another process may use the emulator, thereby perhaps changing static
+variables. The code which accesses user memory is confined to five
+files:
+ fpu_entry.c
+ reg_ld_str.c
+ load_store.c
+ get_address.c
+ errors.c
+As from version 1.12 of the emulator, no static variables are used
+(apart from those in the kernel's per-process tables). The emulator is
+therefore now fully re-entrant, rather than having just the restricted
+form of re-entrancy which is required by the Linux kernel.
+
+----------------------- Limitations of wm-FPU-emu -----------------------
+
+There are a number of differences between the current wm-FPU-emu
+(version 2.01) and the 80486 FPU (apart from bugs). The differences
+are fewer than those which applied to the 1.xx series of the emulator.
+Some of the more important differences are listed below:
+
+The Roundup flag does not have much meaning for the transcendental
+functions and its 80486 value with these functions is likely to differ
+from its emulator value.
+
+In a few rare cases the Underflow flag obtained with the emulator will
+be different from that obtained with an 80486. This occurs when the
+following conditions apply simultaneously:
+(a) the operands have a higher precision than the current setting of the
+ precision control (PC) flags.
+(b) the underflow exception is masked.
+(c) the magnitude of the exact result (before rounding) is less than 2^-16382.
+(d) the magnitude of the final result (after rounding) is exactly 2^-16382.
+(e) the magnitude of the exact result would be exactly 2^-16382 if the
+ operands were rounded to the current precision before the arithmetic
+ operation was performed.
+If all of these apply, the emulator will set the Underflow flag but a real
+80486 will not.
+
+NOTE: Certain formats of Extended Real are UNSUPPORTED. They are
+unsupported by the 80486. They are the Pseudo-NaNs, Pseudoinfinities,
+and Unnormals. None of these will be generated by an 80486 or by the
+emulator. Do not use them. The emulator treats them differently in
+detail from the way an 80486 does.
+
+Self modifying code can cause the emulator to fail. An example of such
+code is:
+ movl %esp,[%ebx]
+ fld1
+The FPU instruction may be (usually will be) loaded into the pre-fetch
+queue of the CPU before the mov instruction is executed. If the
+destination of the 'movl' overlaps the FPU instruction then the bytes
+in the prefetch queue and memory will be inconsistent when the FPU
+instruction is executed. The emulator will be invoked but will not be
+able to find the instruction which caused the device-not-present
+exception. For this case, the emulator cannot emulate the behaviour of
+an 80486DX.
+
+Handling of the address size override prefix byte (0x67) has not been
+extensively tested yet. A major problem exists because using it in
+vm86 mode can cause a general protection fault. Address offsets
+greater than 0xffff appear to be illegal in vm86 mode but are quite
+acceptable (and work) in real mode. A small test program developed to
+check the addressing, and which runs successfully in real mode,
+crashes dosemu under Linux and also brings Windows down with a general
+protection fault message when run under the MS-DOS prompt of Windows
+3.1. (The program simply reads data from a valid address).
+
+The emulator supports 16-bit protected mode, with one difference from
+an 80486DX. A 80486DX will allow some floating point instructions to
+write a few bytes below the lowest address of the stack. The emulator
+will not allow this in 16-bit protected mode: no instructions are
+allowed to write outside the bounds set by the protection.
+
+----------------------- Performance of wm-FPU-emu -----------------------
+
+Speed.
+-----
+
+The speed of floating point computation with the emulator will depend
+upon instruction mix. Relative performance is best for the instructions
+which require most computation. The simple instructions are adversely
+affected by the FPU instruction trap overhead.
+
+
+Timing: Some simple timing tests have been made on the emulator functions.
+The times include load/store instructions. All times are in microseconds
+measured on a 33MHz 386 with 64k cache. The Turbo C tests were under
+ms-dos, the next two columns are for emulators running with the djgpp
+ms-dos extender. The final column is for wm-FPU-emu in Linux 0.97,
+using libm4.0 (hard).
+
+function Turbo C djgpp 1.06 WM-emu387 wm-FPU-emu
+
+ + 60.5 154.8 76.5 139.4
+ - 61.1-65.5 157.3-160.8 76.2-79.5 142.9-144.7
+ * 71.0 190.8 79.6 146.6
+ / 61.2-75.0 261.4-266.9 75.3-91.6 142.2-158.1
+
+ sin() 310.8 4692.0 319.0 398.5
+ cos() 284.4 4855.2 308.0 388.7
+ tan() 495.0 8807.1 394.9 504.7
+ atan() 328.9 4866.4 601.1 419.5-491.9
+
+ sqrt() 128.7 crashed 145.2 227.0
+ log() 413.1-419.1 5103.4-5354.21 254.7-282.2 409.4-437.1
+ exp() 479.1 6619.2 469.1 850.8
+
+
+The performance under Linux is improved by the use of look-ahead code.
+The following results show the improvement which is obtained under
+Linux due to the look-ahead code. Also given are the times for the
+original Linux emulator with the 4.1 'soft' lib.
+
+ [ Linus' note: I changed look-ahead to be the default under linux, as
+ there was no reason not to use it after I had edited it to be
+ disabled during tracing ]
+
+ wm-FPU-emu w original w
+ look-ahead 'soft' lib
+ + 106.4 190.2
+ - 108.6-111.6 192.4-216.2
+ * 113.4 193.1
+ / 108.8-124.4 700.1-706.2
+
+ sin() 390.5 2642.0
+ cos() 381.5 2767.4
+ tan() 496.5 3153.3
+ atan() 367.2-435.5 2439.4-3396.8
+
+ sqrt() 195.1 4732.5
+ log() 358.0-387.5 3359.2-3390.3
+ exp() 619.3 4046.4
+
+
+These figures are now somewhat out-of-date. The emulator has become
+progressively slower for most functions as more of the 80486 features
+have been implemented.
+
+
+----------------------- Accuracy of wm-FPU-emu -----------------------
+
+
+The accuracy of the emulator is in almost all cases equal to or better
+than that of an Intel 80486 FPU.
+
+The results of the basic arithmetic functions (+,-,*,/), and fsqrt
+match those of an 80486 FPU. They are the best possible; the error for
+these never exceeds 1/2 an lsb. The fprem and fprem1 instructions
+return exact results; they have no error.
+
+
+The following table compares the emulator accuracy for the sqrt(),
+trig and log functions against the Turbo C "emulator". For this table,
+each function was tested at about 400 points. Ideal worst-case results
+would be 64 bits. The reduced Turbo C accuracy of cos() and tan() for
+arguments greater than pi/4 can be thought of as being related to the
+precision of the argument x; e.g. an argument of pi/2-(1e-10) which is
+accurate to 64 bits can result in a relative accuracy in cos() of
+about 64 + log2(cos(x)) = 31 bits.
+
+
+Function Tested x range Worst result Turbo C
+ (relative bits)
+
+sqrt(x) 1 .. 2 64.1 63.2
+atan(x) 1e-10 .. 200 64.2 62.8
+cos(x) 0 .. pi/2-(1e-10) 64.4 (x <= pi/4) 62.4
+ 64.1 (x = pi/2-(1e-10)) 31.9
+sin(x) 1e-10 .. pi/2 64.0 62.8
+tan(x) 1e-10 .. pi/2-(1e-10) 64.0 (x <= pi/4) 62.1
+ 64.1 (x = pi/2-(1e-10)) 31.9
+exp(x) 0 .. 1 63.1 ** 62.9
+log(x) 1+1e-6 .. 2 63.8 ** 62.1
+
+** The accuracy for exp() and log() is low because the FPU (emulator)
+does not compute them directly; two operations are required.
+
+
+The emulator passes the "paranoia" tests (compiled with gcc 2.3.3 or
+later) for 'float' variables (24 bit precision numbers) when precision
+control is set to 24, 53 or 64 bits, and for 'double' variables (53
+bit precision numbers) when precision control is set to 53 bits (a
+properly performing FPU cannot pass the 'paranoia' tests for 'double'
+variables when precision control is set to 64 bits).
+
+The code for reducing the argument for the trig functions (fsin, fcos,
+fptan and fsincos) has been improved and now effectively uses a value
+for pi which is accurate to more than 128 bits precision. As a
+consequence, the accuracy of these functions for large arguments has
+been dramatically improved (and is now very much better than an 80486
+FPU). There is also now no degradation of accuracy for fcos and fptan
+for operands close to pi/2. Measured results are (note that the
+definition of accuracy has changed slightly from that used for the
+above table):
+
+Function Tested x range Worst result
+ (absolute bits)
+
+cos(x) 0 .. 9.22e+18 62.0
+sin(x) 1e-16 .. 9.22e+18 62.1
+tan(x) 1e-16 .. 9.22e+18 61.8
+
+It is possible with some effort to find very large arguments which
+give much degraded precision. For example, the integer number
+ 8227740058411162616.0
+is within about 10e-7 of a multiple of pi. To find the tan (for
+example) of this number to 64 bits precision it would be necessary to
+have a value of pi which had about 150 bits precision. The FPU
+emulator computes the result to about 42.6 bits precision (the correct
+result is about -9.739715e-8). On the other hand, an 80486 FPU returns
+0.01059, which in relative terms is hopelessly inaccurate.
+
+For arguments close to critical angles (which occur at multiples of
+pi/2) the emulator is more accurate than an 80486 FPU. For very large
+arguments, the emulator is far more accurate.
+
+
+Prior to version 1.20 of the emulator, the accuracy of the results for
+the transcendental functions (in their principal range) was not as
+good as the results from an 80486 FPU. From version 1.20, the accuracy
+has been considerably improved and these functions now give measured
+worst-case results which are better than the worst-case results given
+by an 80486 FPU.
+
+The following table gives the measured results for the emulator. The
+number of randomly selected arguments in each case is about half a
+million. The group of three columns gives the frequency of the given
+accuracy in number of times per million, thus the second of these
+columns shows that an accuracy of between 63.80 and 63.89 bits was
+found at a rate of 133 times per one million measurements for fsin.
+The results show that the fsin, fcos and fptan instructions return
+results which are in error (i.e. less accurate than the best possible
+result (which is 64 bits)) for about one per cent of all arguments
+between -pi/2 and +pi/2. The other instructions have a lower
+frequency of results which are in error. The last two columns give
+the worst accuracy which was found (in bits) and the approximate value
+of the argument which produced it.
+
+ frequency (per M)
+ ------------------- ---------------
+instr arg range # tests 63.7 63.8 63.9 worst at arg
+ bits bits bits bits
+----- ------------ ------- ---- ---- ----- ----- --------
+fsin (0,pi/2) 547756 0 133 10673 63.89 0.451317
+fcos (0,pi/2) 547563 0 126 10532 63.85 0.700801
+fptan (0,pi/2) 536274 11 267 10059 63.74 0.784876
+fpatan 4 quadrants 517087 0 8 1855 63.88 0.435121 (4q)
+fyl2x (0,20) 541861 0 0 1323 63.94 1.40923 (x)
+fyl2xp1 (-.293,.414) 520256 0 0 5678 63.93 0.408542 (x)
+f2xm1 (-1,1) 538847 4 481 6488 63.79 0.167709
+
+
+Tests performed on an 80486 FPU showed results of lower accuracy. The
+following table gives the results which were obtained with an AMD
+486DX2/66 (other tests indicate that an Intel 486DX produces
+identical results). The tests were basically the same as those used
+to measure the emulator (the values, being random, were in general not
+the same). The total number of tests for each instruction are given
+at the end of the table, in case each about 100k tests were performed.
+Another line of figures at the end of the table shows that most of the
+instructions return results which are in error for more than 10
+percent of the arguments tested.
+
+The numbers in the body of the table give the approx number of times a
+result of the given accuracy in bits (given in the left-most column)
+was obtained per one million arguments. For three of the instructions,
+two columns of results are given: * The second column for f2xm1 gives
+the number cases where the results of the first column were for a
+positive argument, this shows that this instruction gives better
+results for positive arguments than it does for negative. * In the
+cases of fcos and fptan, the first column gives the results when all
+cases where arguments greater than 1.5 were removed from the results
+given in the second column. Unlike the emulator, an 80486 FPU returns
+results of relatively poor accuracy for these instructions when the
+argument approaches pi/2. The table does not show those cases when the
+accuracy of the results were less than 62 bits, which occurs quite
+often for fsin and fptan when the argument approaches pi/2. This poor
+accuracy is discussed above in relation to the Turbo C "emulator", and
+the accuracy of the value of pi.
+
+
+bits f2xm1 f2xm1 fpatan fcos fcos fyl2x fyl2xp1 fsin fptan fptan
+62.0 0 0 0 0 437 0 0 0 0 925
+62.1 0 0 10 0 894 0 0 0 0 1023
+62.2 14 0 0 0 1033 0 0 0 0 945
+62.3 57 0 0 0 1202 0 0 0 0 1023
+62.4 385 0 0 10 1292 0 23 0 0 1178
+62.5 1140 0 0 119 1649 0 39 0 0 1149
+62.6 2037 0 0 189 1620 0 16 0 0 1169
+62.7 5086 14 0 646 2315 10 101 35 39 1402
+62.8 8818 86 0 984 3050 59 287 131 224 2036
+62.9 11340 1355 0 2126 4153 79 605 357 321 1948
+63.0 15557 4750 0 3319 5376 246 1281 862 808 2688
+63.1 20016 8288 0 4620 6628 511 2569 1723 1510 3302
+63.2 24945 11127 10 6588 8098 1120 4470 2968 2990 4724
+63.3 25686 12382 69 8774 10682 1906 6775 4482 5474 7236
+63.4 29219 14722 79 11109 12311 3094 9414 7259 8912 10587
+63.5 30458 14936 393 13802 15014 5874 12666 9609 13762 15262
+63.6 32439 16448 1277 17945 19028 10226 15537 14657 19158 20346
+63.7 35031 16805 4067 23003 23947 18910 20116 21333 25001 26209
+63.8 33251 15820 7673 24781 25675 24617 25354 24440 29433 30329
+63.9 33293 16833 18529 28318 29233 31267 31470 27748 29676 30601
+
+Per cent with error:
+ 30.9 3.2 18.5 9.8 13.1 11.6 17.4
+Total arguments tested:
+ 70194 70099 101784 100641 100641 101799 128853 114893 102675 102675
+
+
+------------------------- Contributors -------------------------------
+
+A number of people have contributed to the development of the
+emulator, often by just reporting bugs, sometimes with suggested
+fixes, and a few kind people have provided me with access in one way
+or another to an 80486 machine. Contributors include (to those people
+who I may have forgotten, please forgive me):
+
+Linus Torvalds
+Tommy.Thorn@daimi.aau.dk
+Andrew.Tridgell@anu.edu.au
+Nick Holloway, alfie@dcs.warwick.ac.uk
+Hermano Moura, moura@dcs.gla.ac.uk
+Jon Jagger, J.Jagger@scp.ac.uk
+Lennart Benschop
+Brian Gallew, geek+@CMU.EDU
+Thomas Staniszewski, ts3v+@andrew.cmu.edu
+Martin Howell, mph@plasma.apana.org.au
+M Saggaf, alsaggaf@athena.mit.edu
+Peter Barker, PETER@socpsy.sci.fau.edu
+tom@vlsivie.tuwien.ac.at
+Dan Russel, russed@rpi.edu
+Daniel Carosone, danielce@ee.mu.oz.au
+cae@jpmorgan.com
+Hamish Coleman, t933093@minyos.xx.rmit.oz.au
+Bruce Evans, bde@kralizec.zeta.org.au
+Timo Korvola, Timo.Korvola@hut.fi
+Rick Lyons, rick@razorback.brisnet.org.au
+Rick, jrs@world.std.com
+
+...and numerous others who responded to my request for help with
+a real 80486.
+
diff --git a/arch/x86/math-emu/control_w.h b/arch/x86/math-emu/control_w.h
new file mode 100644
index 000000000..60f4dcc5e
--- /dev/null
+++ b/arch/x86/math-emu/control_w.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*---------------------------------------------------------------------------+
+ | control_w.h |
+ | |
+ | Copyright (C) 1992,1993 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@vaxc.cc.monash.edu.au |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#ifndef _CONTROLW_H_
+#define _CONTROLW_H_
+
+#ifdef __ASSEMBLY__
+#define _Const_(x) $##x
+#else
+#define _Const_(x) x
+#endif
+
+#define CW_RC _Const_(0x0C00) /* rounding control */
+#define CW_PC _Const_(0x0300) /* precision control */
+
+#define CW_Precision Const_(0x0020) /* loss of precision mask */
+#define CW_Underflow Const_(0x0010) /* underflow mask */
+#define CW_Overflow Const_(0x0008) /* overflow mask */
+#define CW_ZeroDiv Const_(0x0004) /* divide by zero mask */
+#define CW_Denormal Const_(0x0002) /* denormalized operand mask */
+#define CW_Invalid Const_(0x0001) /* invalid operation mask */
+
+#define CW_Exceptions _Const_(0x003f) /* all masks */
+
+#define RC_RND _Const_(0x0000)
+#define RC_DOWN _Const_(0x0400)
+#define RC_UP _Const_(0x0800)
+#define RC_CHOP _Const_(0x0C00)
+
+/* p 15-5: Precision control bits affect only the following:
+ ADD, SUB(R), MUL, DIV(R), and SQRT */
+#define PR_24_BITS _Const_(0x000)
+#define PR_53_BITS _Const_(0x200)
+#define PR_64_BITS _Const_(0x300)
+#define PR_RESERVED_BITS _Const_(0x100)
+/* FULL_PRECISION simulates all exceptions masked */
+#define FULL_PRECISION (PR_64_BITS | RC_RND | 0x3f)
+
+#endif /* _CONTROLW_H_ */
diff --git a/arch/x86/math-emu/div_Xsig.S b/arch/x86/math-emu/div_Xsig.S
new file mode 100644
index 000000000..ee08449d2
--- /dev/null
+++ b/arch/x86/math-emu/div_Xsig.S
@@ -0,0 +1,367 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+ .file "div_Xsig.S"
+/*---------------------------------------------------------------------------+
+ | div_Xsig.S |
+ | |
+ | Division subroutine for 96 bit quantities |
+ | |
+ | Copyright (C) 1994,1995 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@jacobi.maths.monash.edu.au |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------+
+ | Divide the 96 bit quantity pointed to by a, by that pointed to by b, and |
+ | put the 96 bit result at the location d. |
+ | |
+ | The result may not be accurate to 96 bits. It is intended for use where |
+ | a result better than 64 bits is required. The result should usually be |
+ | good to at least 94 bits. |
+ | The returned result is actually divided by one half. This is done to |
+ | prevent overflow. |
+ | |
+ | .aaaaaaaaaaaaaa / .bbbbbbbbbbbbb -> .dddddddddddd |
+ | |
+ | void div_Xsig(Xsig *a, Xsig *b, Xsig *dest) |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#include "exception.h"
+#include "fpu_emu.h"
+
+
+#define XsigLL(x) (x)
+#define XsigL(x) 4(x)
+#define XsigH(x) 8(x)
+
+
+#ifndef NON_REENTRANT_FPU
+/*
+ Local storage on the stack:
+ Accumulator: FPU_accum_3:FPU_accum_2:FPU_accum_1:FPU_accum_0
+ */
+#define FPU_accum_3 -4(%ebp)
+#define FPU_accum_2 -8(%ebp)
+#define FPU_accum_1 -12(%ebp)
+#define FPU_accum_0 -16(%ebp)
+#define FPU_result_3 -20(%ebp)
+#define FPU_result_2 -24(%ebp)
+#define FPU_result_1 -28(%ebp)
+
+#else
+.data
+/*
+ Local storage in a static area:
+ Accumulator: FPU_accum_3:FPU_accum_2:FPU_accum_1:FPU_accum_0
+ */
+ .align 4,0
+FPU_accum_3:
+ .long 0
+FPU_accum_2:
+ .long 0
+FPU_accum_1:
+ .long 0
+FPU_accum_0:
+ .long 0
+FPU_result_3:
+ .long 0
+FPU_result_2:
+ .long 0
+FPU_result_1:
+ .long 0
+#endif /* NON_REENTRANT_FPU */
+
+
+.text
+ENTRY(div_Xsig)
+ pushl %ebp
+ movl %esp,%ebp
+#ifndef NON_REENTRANT_FPU
+ subl $28,%esp
+#endif /* NON_REENTRANT_FPU */
+
+ pushl %esi
+ pushl %edi
+ pushl %ebx
+
+ movl PARAM1,%esi /* pointer to num */
+ movl PARAM2,%ebx /* pointer to denom */
+
+#ifdef PARANOID
+ testl $0x80000000, XsigH(%ebx) /* Divisor */
+ je L_bugged
+#endif /* PARANOID */
+
+
+/*---------------------------------------------------------------------------+
+ | Divide: Return arg1/arg2 to arg3. |
+ | |
+ | The maximum returned value is (ignoring exponents) |
+ | .ffffffff ffffffff |
+ | ------------------ = 1.ffffffff fffffffe |
+ | .80000000 00000000 |
+ | and the minimum is |
+ | .80000000 00000000 |
+ | ------------------ = .80000000 00000001 (rounded) |
+ | .ffffffff ffffffff |
+ | |
+ +---------------------------------------------------------------------------*/
+
+ /* Save extended dividend in local register */
+
+ /* Divide by 2 to prevent overflow */
+ clc
+ movl XsigH(%esi),%eax
+ rcrl %eax
+ movl %eax,FPU_accum_3
+ movl XsigL(%esi),%eax
+ rcrl %eax
+ movl %eax,FPU_accum_2
+ movl XsigLL(%esi),%eax
+ rcrl %eax
+ movl %eax,FPU_accum_1
+ movl $0,%eax
+ rcrl %eax
+ movl %eax,FPU_accum_0
+
+ movl FPU_accum_2,%eax /* Get the current num */
+ movl FPU_accum_3,%edx
+
+/*----------------------------------------------------------------------*/
+/* Initialization done.
+ Do the first 32 bits. */
+
+ /* We will divide by a number which is too large */
+ movl XsigH(%ebx),%ecx
+ addl $1,%ecx
+ jnc LFirst_div_not_1
+
+ /* here we need to divide by 100000000h,
+ i.e., no division at all.. */
+ mov %edx,%eax
+ jmp LFirst_div_done
+
+LFirst_div_not_1:
+ divl %ecx /* Divide the numerator by the augmented
+ denom ms dw */
+
+LFirst_div_done:
+ movl %eax,FPU_result_3 /* Put the result in the answer */
+
+ mull XsigH(%ebx) /* mul by the ms dw of the denom */
+
+ subl %eax,FPU_accum_2 /* Subtract from the num local reg */
+ sbbl %edx,FPU_accum_3
+
+ movl FPU_result_3,%eax /* Get the result back */
+ mull XsigL(%ebx) /* now mul the ls dw of the denom */
+
+ subl %eax,FPU_accum_1 /* Subtract from the num local reg */
+ sbbl %edx,FPU_accum_2
+ sbbl $0,FPU_accum_3
+ je LDo_2nd_32_bits /* Must check for non-zero result here */
+
+#ifdef PARANOID
+ jb L_bugged_1
+#endif /* PARANOID */
+
+ /* need to subtract another once of the denom */
+ incl FPU_result_3 /* Correct the answer */
+
+ movl XsigL(%ebx),%eax
+ movl XsigH(%ebx),%edx
+ subl %eax,FPU_accum_1 /* Subtract from the num local reg */
+ sbbl %edx,FPU_accum_2
+
+#ifdef PARANOID
+ sbbl $0,FPU_accum_3
+ jne L_bugged_1 /* Must check for non-zero result here */
+#endif /* PARANOID */
+
+/*----------------------------------------------------------------------*/
+/* Half of the main problem is done, there is just a reduced numerator
+ to handle now.
+ Work with the second 32 bits, FPU_accum_0 not used from now on */
+LDo_2nd_32_bits:
+ movl FPU_accum_2,%edx /* get the reduced num */
+ movl FPU_accum_1,%eax
+
+ /* need to check for possible subsequent overflow */
+ cmpl XsigH(%ebx),%edx
+ jb LDo_2nd_div
+ ja LPrevent_2nd_overflow
+
+ cmpl XsigL(%ebx),%eax
+ jb LDo_2nd_div
+
+LPrevent_2nd_overflow:
+/* The numerator is greater or equal, would cause overflow */
+ /* prevent overflow */
+ subl XsigL(%ebx),%eax
+ sbbl XsigH(%ebx),%edx
+ movl %edx,FPU_accum_2
+ movl %eax,FPU_accum_1
+
+ incl FPU_result_3 /* Reflect the subtraction in the answer */
+
+#ifdef PARANOID
+ je L_bugged_2 /* Can't bump the result to 1.0 */
+#endif /* PARANOID */
+
+LDo_2nd_div:
+ cmpl $0,%ecx /* augmented denom msw */
+ jnz LSecond_div_not_1
+
+ /* %ecx == 0, we are dividing by 1.0 */
+ mov %edx,%eax
+ jmp LSecond_div_done
+
+LSecond_div_not_1:
+ divl %ecx /* Divide the numerator by the denom ms dw */
+
+LSecond_div_done:
+ movl %eax,FPU_result_2 /* Put the result in the answer */
+
+ mull XsigH(%ebx) /* mul by the ms dw of the denom */
+
+ subl %eax,FPU_accum_1 /* Subtract from the num local reg */
+ sbbl %edx,FPU_accum_2
+
+#ifdef PARANOID
+ jc L_bugged_2
+#endif /* PARANOID */
+
+ movl FPU_result_2,%eax /* Get the result back */
+ mull XsigL(%ebx) /* now mul the ls dw of the denom */
+
+ subl %eax,FPU_accum_0 /* Subtract from the num local reg */
+ sbbl %edx,FPU_accum_1 /* Subtract from the num local reg */
+ sbbl $0,FPU_accum_2
+
+#ifdef PARANOID
+ jc L_bugged_2
+#endif /* PARANOID */
+
+ jz LDo_3rd_32_bits
+
+#ifdef PARANOID
+ cmpl $1,FPU_accum_2
+ jne L_bugged_2
+#endif /* PARANOID */
+
+ /* need to subtract another once of the denom */
+ movl XsigL(%ebx),%eax
+ movl XsigH(%ebx),%edx
+ subl %eax,FPU_accum_0 /* Subtract from the num local reg */
+ sbbl %edx,FPU_accum_1
+ sbbl $0,FPU_accum_2
+
+#ifdef PARANOID
+ jc L_bugged_2
+ jne L_bugged_2
+#endif /* PARANOID */
+
+ addl $1,FPU_result_2 /* Correct the answer */
+ adcl $0,FPU_result_3
+
+#ifdef PARANOID
+ jc L_bugged_2 /* Must check for non-zero result here */
+#endif /* PARANOID */
+
+/*----------------------------------------------------------------------*/
+/* The division is essentially finished here, we just need to perform
+ tidying operations.
+ Deal with the 3rd 32 bits */
+LDo_3rd_32_bits:
+ /* We use an approximation for the third 32 bits.
+ To take account of the 3rd 32 bits of the divisor
+ (call them del), we subtract del * (a/b) */
+
+ movl FPU_result_3,%eax /* a/b */
+ mull XsigLL(%ebx) /* del */
+
+ subl %edx,FPU_accum_1
+
+ /* A borrow indicates that the result is negative */
+ jnb LTest_over
+
+ movl XsigH(%ebx),%edx
+ addl %edx,FPU_accum_1
+
+ subl $1,FPU_result_2 /* Adjust the answer */
+ sbbl $0,FPU_result_3
+
+ /* The above addition might not have been enough, check again. */
+ movl FPU_accum_1,%edx /* get the reduced num */
+ cmpl XsigH(%ebx),%edx /* denom */
+ jb LDo_3rd_div
+
+ movl XsigH(%ebx),%edx
+ addl %edx,FPU_accum_1
+
+ subl $1,FPU_result_2 /* Adjust the answer */
+ sbbl $0,FPU_result_3
+ jmp LDo_3rd_div
+
+LTest_over:
+ movl FPU_accum_1,%edx /* get the reduced num */
+
+ /* need to check for possible subsequent overflow */
+ cmpl XsigH(%ebx),%edx /* denom */
+ jb LDo_3rd_div
+
+ /* prevent overflow */
+ subl XsigH(%ebx),%edx
+ movl %edx,FPU_accum_1
+
+ addl $1,FPU_result_2 /* Reflect the subtraction in the answer */
+ adcl $0,FPU_result_3
+
+LDo_3rd_div:
+ movl FPU_accum_0,%eax
+ movl FPU_accum_1,%edx
+ divl XsigH(%ebx)
+
+ movl %eax,FPU_result_1 /* Rough estimate of third word */
+
+ movl PARAM3,%esi /* pointer to answer */
+
+ movl FPU_result_1,%eax
+ movl %eax,XsigLL(%esi)
+ movl FPU_result_2,%eax
+ movl %eax,XsigL(%esi)
+ movl FPU_result_3,%eax
+ movl %eax,XsigH(%esi)
+
+L_exit:
+ popl %ebx
+ popl %edi
+ popl %esi
+
+ leave
+ ret
+
+
+#ifdef PARANOID
+/* The logic is wrong if we got here */
+L_bugged:
+ pushl EX_INTERNAL|0x240
+ call EXCEPTION
+ pop %ebx
+ jmp L_exit
+
+L_bugged_1:
+ pushl EX_INTERNAL|0x241
+ call EXCEPTION
+ pop %ebx
+ jmp L_exit
+
+L_bugged_2:
+ pushl EX_INTERNAL|0x242
+ call EXCEPTION
+ pop %ebx
+ jmp L_exit
+#endif /* PARANOID */
+ENDPROC(div_Xsig)
diff --git a/arch/x86/math-emu/div_small.S b/arch/x86/math-emu/div_small.S
new file mode 100644
index 000000000..8f5025c80
--- /dev/null
+++ b/arch/x86/math-emu/div_small.S
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+ .file "div_small.S"
+/*---------------------------------------------------------------------------+
+ | div_small.S |
+ | |
+ | Divide a 64 bit integer by a 32 bit integer & return remainder. |
+ | |
+ | Copyright (C) 1992,1995 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@jacobi.maths.monash.edu.au |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------+
+ | unsigned long FPU_div_small(unsigned long long *x, unsigned long y) |
+ +---------------------------------------------------------------------------*/
+
+#include "fpu_emu.h"
+
+.text
+ENTRY(FPU_div_small)
+ pushl %ebp
+ movl %esp,%ebp
+
+ pushl %esi
+
+ movl PARAM1,%esi /* pointer to num */
+ movl PARAM2,%ecx /* The denominator */
+
+ movl 4(%esi),%eax /* Get the current num msw */
+ xorl %edx,%edx
+ divl %ecx
+
+ movl %eax,4(%esi)
+
+ movl (%esi),%eax /* Get the num lsw */
+ divl %ecx
+
+ movl %eax,(%esi)
+
+ movl %edx,%eax /* Return the remainder in eax */
+
+ popl %esi
+
+ leave
+ ret
+ENDPROC(FPU_div_small)
diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c
new file mode 100644
index 000000000..6b468517a
--- /dev/null
+++ b/arch/x86/math-emu/errors.c
@@ -0,0 +1,685 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | errors.c |
+ | |
+ | The error handling functions for wm-FPU-emu |
+ | |
+ | Copyright (C) 1992,1993,1994,1996 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@jacobi.maths.monash.edu.au |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------+
+ | Note: |
+ | The file contains code which accesses user memory. |
+ | Emulator static data may change when user memory is accessed, due to |
+ | other processes using the emulator while swapping is in progress. |
+ +---------------------------------------------------------------------------*/
+
+#include <linux/signal.h>
+
+#include <linux/uaccess.h>
+
+#include "fpu_emu.h"
+#include "fpu_system.h"
+#include "exception.h"
+#include "status_w.h"
+#include "control_w.h"
+#include "reg_constant.h"
+#include "version.h"
+
+/* */
+#undef PRINT_MESSAGES
+/* */
+
+#if 0
+void Un_impl(void)
+{
+ u_char byte1, FPU_modrm;
+ unsigned long address = FPU_ORIG_EIP;
+
+ RE_ENTRANT_CHECK_OFF;
+ /* No need to check access_ok(), we have previously fetched these bytes. */
+ printk("Unimplemented FPU Opcode at eip=%p : ", (void __user *)address);
+ if (FPU_CS == __USER_CS) {
+ while (1) {
+ FPU_get_user(byte1, (u_char __user *) address);
+ if ((byte1 & 0xf8) == 0xd8)
+ break;
+ printk("[%02x]", byte1);
+ address++;
+ }
+ printk("%02x ", byte1);
+ FPU_get_user(FPU_modrm, 1 + (u_char __user *) address);
+
+ if (FPU_modrm >= 0300)
+ printk("%02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8,
+ FPU_modrm & 7);
+ else
+ printk("/%d\n", (FPU_modrm >> 3) & 7);
+ } else {
+ printk("cs selector = %04x\n", FPU_CS);
+ }
+
+ RE_ENTRANT_CHECK_ON;
+
+ EXCEPTION(EX_Invalid);
+
+}
+#endif /* 0 */
+
+/*
+ Called for opcodes which are illegal and which are known to result in a
+ SIGILL with a real 80486.
+ */
+void FPU_illegal(void)
+{
+ math_abort(FPU_info, SIGILL);
+}
+
+void FPU_printall(void)
+{
+ int i;
+ static const char *tag_desc[] = { "Valid", "Zero", "ERROR", "Empty",
+ "DeNorm", "Inf", "NaN"
+ };
+ u_char byte1, FPU_modrm;
+ unsigned long address = FPU_ORIG_EIP;
+
+ RE_ENTRANT_CHECK_OFF;
+ /* No need to check access_ok(), we have previously fetched these bytes. */
+ printk("At %p:", (void *)address);
+ if (FPU_CS == __USER_CS) {
+#define MAX_PRINTED_BYTES 20
+ for (i = 0; i < MAX_PRINTED_BYTES; i++) {
+ FPU_get_user(byte1, (u_char __user *) address);
+ if ((byte1 & 0xf8) == 0xd8) {
+ printk(" %02x", byte1);
+ break;
+ }
+ printk(" [%02x]", byte1);
+ address++;
+ }
+ if (i == MAX_PRINTED_BYTES)
+ printk(" [more..]\n");
+ else {
+ FPU_get_user(FPU_modrm, 1 + (u_char __user *) address);
+
+ if (FPU_modrm >= 0300)
+ printk(" %02x (%02x+%d)\n", FPU_modrm,
+ FPU_modrm & 0xf8, FPU_modrm & 7);
+ else
+ printk(" /%d, mod=%d rm=%d\n",
+ (FPU_modrm >> 3) & 7,
+ (FPU_modrm >> 6) & 3, FPU_modrm & 7);
+ }
+ } else {
+ printk("%04x\n", FPU_CS);
+ }
+
+ partial_status = status_word();
+
+#ifdef DEBUGGING
+ if (partial_status & SW_Backward)
+ printk("SW: backward compatibility\n");
+ if (partial_status & SW_C3)
+ printk("SW: condition bit 3\n");
+ if (partial_status & SW_C2)
+ printk("SW: condition bit 2\n");
+ if (partial_status & SW_C1)
+ printk("SW: condition bit 1\n");
+ if (partial_status & SW_C0)
+ printk("SW: condition bit 0\n");
+ if (partial_status & SW_Summary)
+ printk("SW: exception summary\n");
+ if (partial_status & SW_Stack_Fault)
+ printk("SW: stack fault\n");
+ if (partial_status & SW_Precision)
+ printk("SW: loss of precision\n");
+ if (partial_status & SW_Underflow)
+ printk("SW: underflow\n");
+ if (partial_status & SW_Overflow)
+ printk("SW: overflow\n");
+ if (partial_status & SW_Zero_Div)
+ printk("SW: divide by zero\n");
+ if (partial_status & SW_Denorm_Op)
+ printk("SW: denormalized operand\n");
+ if (partial_status & SW_Invalid)
+ printk("SW: invalid operation\n");
+#endif /* DEBUGGING */
+
+ printk(" SW: b=%d st=%d es=%d sf=%d cc=%d%d%d%d ef=%d%d%d%d%d%d\n", partial_status & 0x8000 ? 1 : 0, /* busy */
+ (partial_status & 0x3800) >> 11, /* stack top pointer */
+ partial_status & 0x80 ? 1 : 0, /* Error summary status */
+ partial_status & 0x40 ? 1 : 0, /* Stack flag */
+ partial_status & SW_C3 ? 1 : 0, partial_status & SW_C2 ? 1 : 0, /* cc */
+ partial_status & SW_C1 ? 1 : 0, partial_status & SW_C0 ? 1 : 0, /* cc */
+ partial_status & SW_Precision ? 1 : 0,
+ partial_status & SW_Underflow ? 1 : 0,
+ partial_status & SW_Overflow ? 1 : 0,
+ partial_status & SW_Zero_Div ? 1 : 0,
+ partial_status & SW_Denorm_Op ? 1 : 0,
+ partial_status & SW_Invalid ? 1 : 0);
+
+ printk(" CW: ic=%d rc=%d%d pc=%d%d iem=%d ef=%d%d%d%d%d%d\n",
+ control_word & 0x1000 ? 1 : 0,
+ (control_word & 0x800) >> 11, (control_word & 0x400) >> 10,
+ (control_word & 0x200) >> 9, (control_word & 0x100) >> 8,
+ control_word & 0x80 ? 1 : 0,
+ control_word & SW_Precision ? 1 : 0,
+ control_word & SW_Underflow ? 1 : 0,
+ control_word & SW_Overflow ? 1 : 0,
+ control_word & SW_Zero_Div ? 1 : 0,
+ control_word & SW_Denorm_Op ? 1 : 0,
+ control_word & SW_Invalid ? 1 : 0);
+
+ for (i = 0; i < 8; i++) {
+ FPU_REG *r = &st(i);
+ u_char tagi = FPU_gettagi(i);
+ switch (tagi) {
+ case TAG_Empty:
+ continue;
+ break;
+ case TAG_Zero:
+ case TAG_Special:
+ tagi = FPU_Special(r);
+ case TAG_Valid:
+ printk("st(%d) %c .%04lx %04lx %04lx %04lx e%+-6d ", i,
+ getsign(r) ? '-' : '+',
+ (long)(r->sigh >> 16),
+ (long)(r->sigh & 0xFFFF),
+ (long)(r->sigl >> 16),
+ (long)(r->sigl & 0xFFFF),
+ exponent(r) - EXP_BIAS + 1);
+ break;
+ default:
+ printk("Whoops! Error in errors.c: tag%d is %d ", i,
+ tagi);
+ continue;
+ break;
+ }
+ printk("%s\n", tag_desc[(int)(unsigned)tagi]);
+ }
+
+ RE_ENTRANT_CHECK_ON;
+
+}
+
+static struct {
+ int type;
+ const char *name;
+} exception_names[] = {
+ {
+ EX_StackOver, "stack overflow"}, {
+ EX_StackUnder, "stack underflow"}, {
+ EX_Precision, "loss of precision"}, {
+ EX_Underflow, "underflow"}, {
+ EX_Overflow, "overflow"}, {
+ EX_ZeroDiv, "divide by zero"}, {
+ EX_Denormal, "denormalized operand"}, {
+ EX_Invalid, "invalid operation"}, {
+ EX_INTERNAL, "INTERNAL BUG in " FPU_VERSION}, {
+ 0, NULL}
+};
+
+/*
+ EX_INTERNAL is always given with a code which indicates where the
+ error was detected.
+
+ Internal error types:
+ 0x14 in fpu_etc.c
+ 0x1nn in a *.c file:
+ 0x101 in reg_add_sub.c
+ 0x102 in reg_mul.c
+ 0x104 in poly_atan.c
+ 0x105 in reg_mul.c
+ 0x107 in fpu_trig.c
+ 0x108 in reg_compare.c
+ 0x109 in reg_compare.c
+ 0x110 in reg_add_sub.c
+ 0x111 in fpe_entry.c
+ 0x112 in fpu_trig.c
+ 0x113 in errors.c
+ 0x115 in fpu_trig.c
+ 0x116 in fpu_trig.c
+ 0x117 in fpu_trig.c
+ 0x118 in fpu_trig.c
+ 0x119 in fpu_trig.c
+ 0x120 in poly_atan.c
+ 0x121 in reg_compare.c
+ 0x122 in reg_compare.c
+ 0x123 in reg_compare.c
+ 0x125 in fpu_trig.c
+ 0x126 in fpu_entry.c
+ 0x127 in poly_2xm1.c
+ 0x128 in fpu_entry.c
+ 0x129 in fpu_entry.c
+ 0x130 in get_address.c
+ 0x131 in get_address.c
+ 0x132 in get_address.c
+ 0x133 in get_address.c
+ 0x140 in load_store.c
+ 0x141 in load_store.c
+ 0x150 in poly_sin.c
+ 0x151 in poly_sin.c
+ 0x160 in reg_ld_str.c
+ 0x161 in reg_ld_str.c
+ 0x162 in reg_ld_str.c
+ 0x163 in reg_ld_str.c
+ 0x164 in reg_ld_str.c
+ 0x170 in fpu_tags.c
+ 0x171 in fpu_tags.c
+ 0x172 in fpu_tags.c
+ 0x180 in reg_convert.c
+ 0x2nn in an *.S file:
+ 0x201 in reg_u_add.S
+ 0x202 in reg_u_div.S
+ 0x203 in reg_u_div.S
+ 0x204 in reg_u_div.S
+ 0x205 in reg_u_mul.S
+ 0x206 in reg_u_sub.S
+ 0x207 in wm_sqrt.S
+ 0x208 in reg_div.S
+ 0x209 in reg_u_sub.S
+ 0x210 in reg_u_sub.S
+ 0x211 in reg_u_sub.S
+ 0x212 in reg_u_sub.S
+ 0x213 in wm_sqrt.S
+ 0x214 in wm_sqrt.S
+ 0x215 in wm_sqrt.S
+ 0x220 in reg_norm.S
+ 0x221 in reg_norm.S
+ 0x230 in reg_round.S
+ 0x231 in reg_round.S
+ 0x232 in reg_round.S
+ 0x233 in reg_round.S
+ 0x234 in reg_round.S
+ 0x235 in reg_round.S
+ 0x236 in reg_round.S
+ 0x240 in div_Xsig.S
+ 0x241 in div_Xsig.S
+ 0x242 in div_Xsig.S
+ */
+
+asmlinkage __visible void FPU_exception(int n)
+{
+ int i, int_type;
+
+ int_type = 0; /* Needed only to stop compiler warnings */
+ if (n & EX_INTERNAL) {
+ int_type = n - EX_INTERNAL;
+ n = EX_INTERNAL;
+ /* Set lots of exception bits! */
+ partial_status |= (SW_Exc_Mask | SW_Summary | SW_Backward);
+ } else {
+ /* Extract only the bits which we use to set the status word */
+ n &= (SW_Exc_Mask);
+ /* Set the corresponding exception bit */
+ partial_status |= n;
+ /* Set summary bits iff exception isn't masked */
+ if (partial_status & ~control_word & CW_Exceptions)
+ partial_status |= (SW_Summary | SW_Backward);
+ if (n & (SW_Stack_Fault | EX_Precision)) {
+ if (!(n & SW_C1))
+ /* This bit distinguishes over- from underflow for a stack fault,
+ and roundup from round-down for precision loss. */
+ partial_status &= ~SW_C1;
+ }
+ }
+
+ RE_ENTRANT_CHECK_OFF;
+ if ((~control_word & n & CW_Exceptions) || (n == EX_INTERNAL)) {
+ /* Get a name string for error reporting */
+ for (i = 0; exception_names[i].type; i++)
+ if ((exception_names[i].type & n) ==
+ exception_names[i].type)
+ break;
+
+ if (exception_names[i].type) {
+#ifdef PRINT_MESSAGES
+ printk("FP Exception: %s!\n", exception_names[i].name);
+#endif /* PRINT_MESSAGES */
+ } else
+ printk("FPU emulator: Unknown Exception: 0x%04x!\n", n);
+
+ if (n == EX_INTERNAL) {
+ printk("FPU emulator: Internal error type 0x%04x\n",
+ int_type);
+ FPU_printall();
+ }
+#ifdef PRINT_MESSAGES
+ else
+ FPU_printall();
+#endif /* PRINT_MESSAGES */
+
+ /*
+ * The 80486 generates an interrupt on the next non-control FPU
+ * instruction. So we need some means of flagging it.
+ * We use the ES (Error Summary) bit for this.
+ */
+ }
+ RE_ENTRANT_CHECK_ON;
+
+#ifdef __DEBUG__
+ math_abort(FPU_info, SIGFPE);
+#endif /* __DEBUG__ */
+
+}
+
+/* Real operation attempted on a NaN. */
+/* Returns < 0 if the exception is unmasked */
+int real_1op_NaN(FPU_REG *a)
+{
+ int signalling, isNaN;
+
+ isNaN = (exponent(a) == EXP_OVER) && (a->sigh & 0x80000000);
+
+ /* The default result for the case of two "equal" NaNs (signs may
+ differ) is chosen to reproduce 80486 behaviour */
+ signalling = isNaN && !(a->sigh & 0x40000000);
+
+ if (!signalling) {
+ if (!isNaN) { /* pseudo-NaN, or other unsupported? */
+ if (control_word & CW_Invalid) {
+ /* Masked response */
+ reg_copy(&CONST_QNaN, a);
+ }
+ EXCEPTION(EX_Invalid);
+ return (!(control_word & CW_Invalid) ? FPU_Exception :
+ 0) | TAG_Special;
+ }
+ return TAG_Special;
+ }
+
+ if (control_word & CW_Invalid) {
+ /* The masked response */
+ if (!(a->sigh & 0x80000000)) { /* pseudo-NaN ? */
+ reg_copy(&CONST_QNaN, a);
+ }
+ /* ensure a Quiet NaN */
+ a->sigh |= 0x40000000;
+ }
+
+ EXCEPTION(EX_Invalid);
+
+ return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
+}
+
+/* Real operation attempted on two operands, one a NaN. */
+/* Returns < 0 if the exception is unmasked */
+int real_2op_NaN(FPU_REG const *b, u_char tagb,
+ int deststnr, FPU_REG const *defaultNaN)
+{
+ FPU_REG *dest = &st(deststnr);
+ FPU_REG const *a = dest;
+ u_char taga = FPU_gettagi(deststnr);
+ FPU_REG const *x;
+ int signalling, unsupported;
+
+ if (taga == TAG_Special)
+ taga = FPU_Special(a);
+ if (tagb == TAG_Special)
+ tagb = FPU_Special(b);
+
+ /* TW_NaN is also used for unsupported data types. */
+ unsupported = ((taga == TW_NaN)
+ && !((exponent(a) == EXP_OVER)
+ && (a->sigh & 0x80000000)))
+ || ((tagb == TW_NaN)
+ && !((exponent(b) == EXP_OVER) && (b->sigh & 0x80000000)));
+ if (unsupported) {
+ if (control_word & CW_Invalid) {
+ /* Masked response */
+ FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr);
+ }
+ EXCEPTION(EX_Invalid);
+ return (!(control_word & CW_Invalid) ? FPU_Exception : 0) |
+ TAG_Special;
+ }
+
+ if (taga == TW_NaN) {
+ x = a;
+ if (tagb == TW_NaN) {
+ signalling = !(a->sigh & b->sigh & 0x40000000);
+ if (significand(b) > significand(a))
+ x = b;
+ else if (significand(b) == significand(a)) {
+ /* The default result for the case of two "equal" NaNs (signs may
+ differ) is chosen to reproduce 80486 behaviour */
+ x = defaultNaN;
+ }
+ } else {
+ /* return the quiet version of the NaN in a */
+ signalling = !(a->sigh & 0x40000000);
+ }
+ } else
+#ifdef PARANOID
+ if (tagb == TW_NaN)
+#endif /* PARANOID */
+ {
+ signalling = !(b->sigh & 0x40000000);
+ x = b;
+ }
+#ifdef PARANOID
+ else {
+ signalling = 0;
+ EXCEPTION(EX_INTERNAL | 0x113);
+ x = &CONST_QNaN;
+ }
+#endif /* PARANOID */
+
+ if ((!signalling) || (control_word & CW_Invalid)) {
+ if (!x)
+ x = b;
+
+ if (!(x->sigh & 0x80000000)) /* pseudo-NaN ? */
+ x = &CONST_QNaN;
+
+ FPU_copy_to_regi(x, TAG_Special, deststnr);
+
+ if (!signalling)
+ return TAG_Special;
+
+ /* ensure a Quiet NaN */
+ dest->sigh |= 0x40000000;
+ }
+
+ EXCEPTION(EX_Invalid);
+
+ return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special;
+}
+
+/* Invalid arith operation on Valid registers */
+/* Returns < 0 if the exception is unmasked */
+asmlinkage __visible int arith_invalid(int deststnr)
+{
+
+ EXCEPTION(EX_Invalid);
+
+ if (control_word & CW_Invalid) {
+ /* The masked response */
+ FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr);
+ }
+
+ return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Valid;
+
+}
+
+/* Divide a finite number by zero */
+asmlinkage __visible int FPU_divide_by_zero(int deststnr, u_char sign)
+{
+ FPU_REG *dest = &st(deststnr);
+ int tag = TAG_Valid;
+
+ if (control_word & CW_ZeroDiv) {
+ /* The masked response */
+ FPU_copy_to_regi(&CONST_INF, TAG_Special, deststnr);
+ setsign(dest, sign);
+ tag = TAG_Special;
+ }
+
+ EXCEPTION(EX_ZeroDiv);
+
+ return (!(control_word & CW_ZeroDiv) ? FPU_Exception : 0) | tag;
+
+}
+
+/* This may be called often, so keep it lean */
+int set_precision_flag(int flags)
+{
+ if (control_word & CW_Precision) {
+ partial_status &= ~(SW_C1 & flags);
+ partial_status |= flags; /* The masked response */
+ return 0;
+ } else {
+ EXCEPTION(flags);
+ return 1;
+ }
+}
+
+/* This may be called often, so keep it lean */
+asmlinkage __visible void set_precision_flag_up(void)
+{
+ if (control_word & CW_Precision)
+ partial_status |= (SW_Precision | SW_C1); /* The masked response */
+ else
+ EXCEPTION(EX_Precision | SW_C1);
+}
+
+/* This may be called often, so keep it lean */
+asmlinkage __visible void set_precision_flag_down(void)
+{
+ if (control_word & CW_Precision) { /* The masked response */
+ partial_status &= ~SW_C1;
+ partial_status |= SW_Precision;
+ } else
+ EXCEPTION(EX_Precision);
+}
+
+asmlinkage __visible int denormal_operand(void)
+{
+ if (control_word & CW_Denormal) { /* The masked response */
+ partial_status |= SW_Denorm_Op;
+ return TAG_Special;
+ } else {
+ EXCEPTION(EX_Denormal);
+ return TAG_Special | FPU_Exception;
+ }
+}
+
+asmlinkage __visible int arith_overflow(FPU_REG *dest)
+{
+ int tag = TAG_Valid;
+
+ if (control_word & CW_Overflow) {
+ /* The masked response */
+/* ###### The response here depends upon the rounding mode */
+ reg_copy(&CONST_INF, dest);
+ tag = TAG_Special;
+ } else {
+ /* Subtract the magic number from the exponent */
+ addexponent(dest, (-3 * (1 << 13)));
+ }
+
+ EXCEPTION(EX_Overflow);
+ if (control_word & CW_Overflow) {
+ /* The overflow exception is masked. */
+ /* By definition, precision is lost.
+ The roundup bit (C1) is also set because we have
+ "rounded" upwards to Infinity. */
+ EXCEPTION(EX_Precision | SW_C1);
+ return tag;
+ }
+
+ return tag;
+
+}
+
+asmlinkage __visible int arith_underflow(FPU_REG *dest)
+{
+ int tag = TAG_Valid;
+
+ if (control_word & CW_Underflow) {
+ /* The masked response */
+ if (exponent16(dest) <= EXP_UNDER - 63) {
+ reg_copy(&CONST_Z, dest);
+ partial_status &= ~SW_C1; /* Round down. */
+ tag = TAG_Zero;
+ } else {
+ stdexp(dest);
+ }
+ } else {
+ /* Add the magic number to the exponent. */
+ addexponent(dest, (3 * (1 << 13)) + EXTENDED_Ebias);
+ }
+
+ EXCEPTION(EX_Underflow);
+ if (control_word & CW_Underflow) {
+ /* The underflow exception is masked. */
+ EXCEPTION(EX_Precision);
+ return tag;
+ }
+
+ return tag;
+
+}
+
+void FPU_stack_overflow(void)
+{
+
+ if (control_word & CW_Invalid) {
+ /* The masked response */
+ top--;
+ FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
+ }
+
+ EXCEPTION(EX_StackOver);
+
+ return;
+
+}
+
+void FPU_stack_underflow(void)
+{
+
+ if (control_word & CW_Invalid) {
+ /* The masked response */
+ FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
+ }
+
+ EXCEPTION(EX_StackUnder);
+
+ return;
+
+}
+
+void FPU_stack_underflow_i(int i)
+{
+
+ if (control_word & CW_Invalid) {
+ /* The masked response */
+ FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i);
+ }
+
+ EXCEPTION(EX_StackUnder);
+
+ return;
+
+}
+
+void FPU_stack_underflow_pop(int i)
+{
+
+ if (control_word & CW_Invalid) {
+ /* The masked response */
+ FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i);
+ FPU_pop();
+ }
+
+ EXCEPTION(EX_StackUnder);
+
+ return;
+
+}
diff --git a/arch/x86/math-emu/exception.h b/arch/x86/math-emu/exception.h
new file mode 100644
index 000000000..75230b977
--- /dev/null
+++ b/arch/x86/math-emu/exception.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*---------------------------------------------------------------------------+
+ | exception.h |
+ | |
+ | Copyright (C) 1992 W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@vaxc.cc.monash.edu.au |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#ifndef _EXCEPTION_H_
+#define _EXCEPTION_H_
+
+#ifdef __ASSEMBLY__
+#define Const_(x) $##x
+#else
+#define Const_(x) x
+#endif
+
+#ifndef SW_C1
+#include "fpu_emu.h"
+#endif /* SW_C1 */
+
+#define FPU_BUSY Const_(0x8000) /* FPU busy bit (8087 compatibility) */
+#define EX_ErrorSummary Const_(0x0080) /* Error summary status */
+/* Special exceptions: */
+#define EX_INTERNAL Const_(0x8000) /* Internal error in wm-FPU-emu */
+#define EX_StackOver Const_(0x0041|SW_C1) /* stack overflow */
+#define EX_StackUnder Const_(0x0041) /* stack underflow */
+/* Exception flags: */
+#define EX_Precision Const_(0x0020) /* loss of precision */
+#define EX_Underflow Const_(0x0010) /* underflow */
+#define EX_Overflow Const_(0x0008) /* overflow */
+#define EX_ZeroDiv Const_(0x0004) /* divide by zero */
+#define EX_Denormal Const_(0x0002) /* denormalized operand */
+#define EX_Invalid Const_(0x0001) /* invalid operation */
+
+#define PRECISION_LOST_UP Const_((EX_Precision | SW_C1))
+#define PRECISION_LOST_DOWN Const_(EX_Precision)
+
+#ifndef __ASSEMBLY__
+
+#ifdef DEBUG
+#define EXCEPTION(x) { printk("exception in %s at line %d\n", \
+ __FILE__, __LINE__); FPU_exception(x); }
+#else
+#define EXCEPTION(x) FPU_exception(x)
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _EXCEPTION_H_ */
diff --git a/arch/x86/math-emu/fpu_arith.c b/arch/x86/math-emu/fpu_arith.c
new file mode 100644
index 000000000..09006dc47
--- /dev/null
+++ b/arch/x86/math-emu/fpu_arith.c
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | fpu_arith.c |
+ | |
+ | Code to implement the FPU register/register arithmetic instructions |
+ | |
+ | Copyright (C) 1992,1993,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@suburbia.net |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#include "fpu_system.h"
+#include "fpu_emu.h"
+#include "control_w.h"
+#include "status_w.h"
+
+void fadd__(void)
+{
+ /* fadd st,st(i) */
+ int i = FPU_rm;
+ clear_C1();
+ FPU_add(&st(i), FPU_gettagi(i), 0, control_word);
+}
+
+void fmul__(void)
+{
+ /* fmul st,st(i) */
+ int i = FPU_rm;
+ clear_C1();
+ FPU_mul(&st(i), FPU_gettagi(i), 0, control_word);
+}
+
+void fsub__(void)
+{
+ /* fsub st,st(i) */
+ clear_C1();
+ FPU_sub(0, FPU_rm, control_word);
+}
+
+void fsubr_(void)
+{
+ /* fsubr st,st(i) */
+ clear_C1();
+ FPU_sub(REV, FPU_rm, control_word);
+}
+
+void fdiv__(void)
+{
+ /* fdiv st,st(i) */
+ clear_C1();
+ FPU_div(0, FPU_rm, control_word);
+}
+
+void fdivr_(void)
+{
+ /* fdivr st,st(i) */
+ clear_C1();
+ FPU_div(REV, FPU_rm, control_word);
+}
+
+void fadd_i(void)
+{
+ /* fadd st(i),st */
+ int i = FPU_rm;
+ clear_C1();
+ FPU_add(&st(i), FPU_gettagi(i), i, control_word);
+}
+
+void fmul_i(void)
+{
+ /* fmul st(i),st */
+ clear_C1();
+ FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word);
+}
+
+void fsubri(void)
+{
+ /* fsubr st(i),st */
+ clear_C1();
+ FPU_sub(DEST_RM, FPU_rm, control_word);
+}
+
+void fsub_i(void)
+{
+ /* fsub st(i),st */
+ clear_C1();
+ FPU_sub(REV | DEST_RM, FPU_rm, control_word);
+}
+
+void fdivri(void)
+{
+ /* fdivr st(i),st */
+ clear_C1();
+ FPU_div(DEST_RM, FPU_rm, control_word);
+}
+
+void fdiv_i(void)
+{
+ /* fdiv st(i),st */
+ clear_C1();
+ FPU_div(REV | DEST_RM, FPU_rm, control_word);
+}
+
+void faddp_(void)
+{
+ /* faddp st(i),st */
+ int i = FPU_rm;
+ clear_C1();
+ if (FPU_add(&st(i), FPU_gettagi(i), i, control_word) >= 0)
+ FPU_pop();
+}
+
+void fmulp_(void)
+{
+ /* fmulp st(i),st */
+ clear_C1();
+ if (FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word) >= 0)
+ FPU_pop();
+}
+
+void fsubrp(void)
+{
+ /* fsubrp st(i),st */
+ clear_C1();
+ if (FPU_sub(DEST_RM, FPU_rm, control_word) >= 0)
+ FPU_pop();
+}
+
+void fsubp_(void)
+{
+ /* fsubp st(i),st */
+ clear_C1();
+ if (FPU_sub(REV | DEST_RM, FPU_rm, control_word) >= 0)
+ FPU_pop();
+}
+
+void fdivrp(void)
+{
+ /* fdivrp st(i),st */
+ clear_C1();
+ if (FPU_div(DEST_RM, FPU_rm, control_word) >= 0)
+ FPU_pop();
+}
+
+void fdivp_(void)
+{
+ /* fdivp st(i),st */
+ clear_C1();
+ if (FPU_div(REV | DEST_RM, FPU_rm, control_word) >= 0)
+ FPU_pop();
+}
diff --git a/arch/x86/math-emu/fpu_asm.h b/arch/x86/math-emu/fpu_asm.h
new file mode 100644
index 000000000..a83353d52
--- /dev/null
+++ b/arch/x86/math-emu/fpu_asm.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*---------------------------------------------------------------------------+
+ | fpu_asm.h |
+ | |
+ | Copyright (C) 1992,1995,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@suburbia.net |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#ifndef _FPU_ASM_H_
+#define _FPU_ASM_H_
+
+#include <linux/linkage.h>
+
+#define EXCEPTION FPU_exception
+
+#define PARAM1 8(%ebp)
+#define PARAM2 12(%ebp)
+#define PARAM3 16(%ebp)
+#define PARAM4 20(%ebp)
+#define PARAM5 24(%ebp)
+#define PARAM6 28(%ebp)
+#define PARAM7 32(%ebp)
+
+#define SIGL_OFFSET 0
+#define EXP(x) 8(x)
+#define SIG(x) SIGL_OFFSET##(x)
+#define SIGL(x) SIGL_OFFSET##(x)
+#define SIGH(x) 4(x)
+
+#endif /* _FPU_ASM_H_ */
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
new file mode 100644
index 000000000..034748459
--- /dev/null
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | fpu_aux.c |
+ | |
+ | Code to implement some of the FPU auxiliary instructions. |
+ | |
+ | Copyright (C) 1992,1993,1994,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@suburbia.net |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#include "fpu_system.h"
+#include "exception.h"
+#include "fpu_emu.h"
+#include "status_w.h"
+#include "control_w.h"
+
+static void fnop(void)
+{
+}
+
+static void fclex(void)
+{
+ partial_status &=
+ ~(SW_Backward | SW_Summary | SW_Stack_Fault | SW_Precision |
+ SW_Underflow | SW_Overflow | SW_Zero_Div | SW_Denorm_Op |
+ SW_Invalid);
+ no_ip_update = 1;
+}
+
+/* Needs to be externally visible */
+void fpstate_init_soft(struct swregs_state *soft)
+{
+ struct address *oaddr, *iaddr;
+ memset(soft, 0, sizeof(*soft));
+ soft->cwd = 0x037f;
+ soft->swd = 0;
+ soft->ftop = 0; /* We don't keep top in the status word internally. */
+ soft->twd = 0xffff;
+ /* The behaviour is different from that detailed in
+ Section 15.1.6 of the Intel manual */
+ oaddr = (struct address *)&soft->foo;
+ oaddr->offset = 0;
+ oaddr->selector = 0;
+ iaddr = (struct address *)&soft->fip;
+ iaddr->offset = 0;
+ iaddr->selector = 0;
+ iaddr->opcode = 0;
+ soft->no_update = 1;
+}
+
+void finit(void)
+{
+ fpstate_init_soft(&current->thread.fpu.state.soft);
+}
+
+/*
+ * These are nops on the i387..
+ */
+#define feni fnop
+#define fdisi fnop
+#define fsetpm fnop
+
+static FUNC const finit_table[] = {
+ feni, fdisi, fclex, finit,
+ fsetpm, FPU_illegal, FPU_illegal, FPU_illegal
+};
+
+void finit_(void)
+{
+ (finit_table[FPU_rm]) ();
+}
+
+static void fstsw_ax(void)
+{
+ *(short *)&FPU_EAX = status_word();
+ no_ip_update = 1;
+}
+
+static FUNC const fstsw_table[] = {
+ fstsw_ax, FPU_illegal, FPU_illegal, FPU_illegal,
+ FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal
+};
+
+void fstsw_(void)
+{
+ (fstsw_table[FPU_rm]) ();
+}
+
+static FUNC const fp_nop_table[] = {
+ fnop, FPU_illegal, FPU_illegal, FPU_illegal,
+ FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal
+};
+
+void fp_nop(void)
+{
+ (fp_nop_table[FPU_rm]) ();
+}
+
+void fld_i_(void)
+{
+ FPU_REG *st_new_ptr;
+ int i;
+ u_char tag;
+
+ if (STACK_OVERFLOW) {
+ FPU_stack_overflow();
+ return;
+ }
+
+ /* fld st(i) */
+ i = FPU_rm;
+ if (NOT_EMPTY(i)) {
+ reg_copy(&st(i), st_new_ptr);
+ tag = FPU_gettagi(i);
+ push();
+ FPU_settag0(tag);
+ } else {
+ if (control_word & CW_Invalid) {
+ /* The masked response */
+ FPU_stack_underflow();
+ } else
+ EXCEPTION(EX_StackUnder);
+ }
+
+}
+
+void fxch_i(void)
+{
+ /* fxch st(i) */
+ FPU_REG t;
+ int i = FPU_rm;
+ FPU_REG *st0_ptr = &st(0), *sti_ptr = &st(i);
+ long tag_word = fpu_tag_word;
+ int regnr = top & 7, regnri = ((regnr + i) & 7);
+ u_char st0_tag = (tag_word >> (regnr * 2)) & 3;
+ u_char sti_tag = (tag_word >> (regnri * 2)) & 3;
+
+ if (st0_tag == TAG_Empty) {
+ if (sti_tag == TAG_Empty) {
+ FPU_stack_underflow();
+ FPU_stack_underflow_i(i);
+ return;
+ }
+ if (control_word & CW_Invalid) {
+ /* Masked response */
+ FPU_copy_to_reg0(sti_ptr, sti_tag);
+ }
+ FPU_stack_underflow_i(i);
+ return;
+ }
+ if (sti_tag == TAG_Empty) {
+ if (control_word & CW_Invalid) {
+ /* Masked response */
+ FPU_copy_to_regi(st0_ptr, st0_tag, i);
+ }
+ FPU_stack_underflow();
+ return;
+ }
+ clear_C1();
+
+ reg_copy(st0_ptr, &t);
+ reg_copy(sti_ptr, st0_ptr);
+ reg_copy(&t, sti_ptr);
+
+ tag_word &= ~(3 << (regnr * 2)) & ~(3 << (regnri * 2));
+ tag_word |= (sti_tag << (regnr * 2)) | (st0_tag << (regnri * 2));
+ fpu_tag_word = tag_word;
+}
+
+static void fcmovCC(void)
+{
+ /* fcmovCC st(i) */
+ int i = FPU_rm;
+ FPU_REG *st0_ptr = &st(0);
+ FPU_REG *sti_ptr = &st(i);
+ long tag_word = fpu_tag_word;
+ int regnr = top & 7;
+ int regnri = (top + i) & 7;
+ u_char sti_tag = (tag_word >> (regnri * 2)) & 3;
+
+ if (sti_tag == TAG_Empty) {
+ FPU_stack_underflow();
+ clear_C1();
+ return;
+ }
+ reg_copy(sti_ptr, st0_ptr);
+ tag_word &= ~(3 << (regnr * 2));
+ tag_word |= (sti_tag << (regnr * 2));
+ fpu_tag_word = tag_word;
+}
+
+void fcmovb(void)
+{
+ if (FPU_EFLAGS & X86_EFLAGS_CF)
+ fcmovCC();
+}
+
+void fcmove(void)
+{
+ if (FPU_EFLAGS & X86_EFLAGS_ZF)
+ fcmovCC();
+}
+
+void fcmovbe(void)
+{
+ if (FPU_EFLAGS & (X86_EFLAGS_CF|X86_EFLAGS_ZF))
+ fcmovCC();
+}
+
+void fcmovu(void)
+{
+ if (FPU_EFLAGS & X86_EFLAGS_PF)
+ fcmovCC();
+}
+
+void fcmovnb(void)
+{
+ if (!(FPU_EFLAGS & X86_EFLAGS_CF))
+ fcmovCC();
+}
+
+void fcmovne(void)
+{
+ if (!(FPU_EFLAGS & X86_EFLAGS_ZF))
+ fcmovCC();
+}
+
+void fcmovnbe(void)
+{
+ if (!(FPU_EFLAGS & (X86_EFLAGS_CF|X86_EFLAGS_ZF)))
+ fcmovCC();
+}
+
+void fcmovnu(void)
+{
+ if (!(FPU_EFLAGS & X86_EFLAGS_PF))
+ fcmovCC();
+}
+
+void ffree_(void)
+{
+ /* ffree st(i) */
+ FPU_settagi(FPU_rm, TAG_Empty);
+}
+
+void ffreep(void)
+{
+ /* ffree st(i) + pop - unofficial code */
+ FPU_settagi(FPU_rm, TAG_Empty);
+ FPU_pop();
+}
+
+void fst_i_(void)
+{
+ /* fst st(i) */
+ FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm);
+}
+
+void fstp_i(void)
+{
+ /* fstp st(i) */
+ FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm);
+ FPU_pop();
+}
diff --git a/arch/x86/math-emu/fpu_emu.h b/arch/x86/math-emu/fpu_emu.h
new file mode 100644
index 000000000..0c122226c
--- /dev/null
+++ b/arch/x86/math-emu/fpu_emu.h
@@ -0,0 +1,218 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*---------------------------------------------------------------------------+
+ | fpu_emu.h |
+ | |
+ | Copyright (C) 1992,1993,1994,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@suburbia.net |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#ifndef _FPU_EMU_H_
+#define _FPU_EMU_H_
+
+/*
+ * Define PECULIAR_486 to get a closer approximation to 80486 behaviour,
+ * rather than behaviour which appears to be cleaner.
+ * This is a matter of opinion: for all I know, the 80486 may simply
+ * be complying with the IEEE spec. Maybe one day I'll get to see the
+ * spec...
+ */
+#define PECULIAR_486
+
+#ifdef __ASSEMBLY__
+#include "fpu_asm.h"
+#define Const(x) $##x
+#else
+#define Const(x) x
+#endif
+
+#define EXP_BIAS Const(0)
+#define EXP_OVER Const(0x4000) /* smallest invalid large exponent */
+#define EXP_UNDER Const(-0x3fff) /* largest invalid small exponent */
+#define EXP_WAY_UNDER Const(-0x6000) /* Below the smallest denormal, but
+ still a 16 bit nr. */
+#define EXP_Infinity EXP_OVER
+#define EXP_NaN EXP_OVER
+
+#define EXTENDED_Ebias Const(0x3fff)
+#define EXTENDED_Emin (-0x3ffe) /* smallest valid exponent */
+
+#define SIGN_POS Const(0)
+#define SIGN_NEG Const(0x80)
+
+#define SIGN_Positive Const(0)
+#define SIGN_Negative Const(0x8000)
+
+/* Keep the order TAG_Valid, TAG_Zero, TW_Denormal */
+/* The following fold to 2 (Special) in the Tag Word */
+#define TW_Denormal Const(4) /* De-normal */
+#define TW_Infinity Const(5) /* + or - infinity */
+#define TW_NaN Const(6) /* Not a Number */
+#define TW_Unsupported Const(7) /* Not supported by an 80486 */
+
+#define TAG_Valid Const(0) /* valid */
+#define TAG_Zero Const(1) /* zero */
+#define TAG_Special Const(2) /* De-normal, + or - infinity,
+ or Not a Number */
+#define TAG_Empty Const(3) /* empty */
+#define TAG_Error Const(0x80) /* probably need to abort */
+
+#define LOADED_DATA Const(10101) /* Special st() number to identify
+ loaded data (not on stack). */
+
+/* A few flags (must be >= 0x10). */
+#define REV 0x10
+#define DEST_RM 0x20
+#define LOADED 0x40
+
+#define FPU_Exception Const(0x80000000) /* Added to tag returns. */
+
+#ifndef __ASSEMBLY__
+
+#include "fpu_system.h"
+
+#include <uapi/asm/sigcontext.h> /* for struct _fpstate */
+#include <asm/math_emu.h>
+#include <linux/linkage.h>
+
+/*
+#define RE_ENTRANT_CHECKING
+ */
+
+#ifdef RE_ENTRANT_CHECKING
+extern u_char emulating;
+# define RE_ENTRANT_CHECK_OFF emulating = 0
+# define RE_ENTRANT_CHECK_ON emulating = 1
+#else
+# define RE_ENTRANT_CHECK_OFF
+# define RE_ENTRANT_CHECK_ON
+#endif /* RE_ENTRANT_CHECKING */
+
+#define FWAIT_OPCODE 0x9b
+#define OP_SIZE_PREFIX 0x66
+#define ADDR_SIZE_PREFIX 0x67
+#define PREFIX_CS 0x2e
+#define PREFIX_DS 0x3e
+#define PREFIX_ES 0x26
+#define PREFIX_SS 0x36
+#define PREFIX_FS 0x64
+#define PREFIX_GS 0x65
+#define PREFIX_REPE 0xf3
+#define PREFIX_REPNE 0xf2
+#define PREFIX_LOCK 0xf0
+#define PREFIX_CS_ 1
+#define PREFIX_DS_ 2
+#define PREFIX_ES_ 3
+#define PREFIX_FS_ 4
+#define PREFIX_GS_ 5
+#define PREFIX_SS_ 6
+#define PREFIX_DEFAULT 7
+
+struct address {
+ unsigned int offset;
+ unsigned int selector:16;
+ unsigned int opcode:11;
+ unsigned int empty:5;
+};
+struct fpu__reg {
+ unsigned sigl;
+ unsigned sigh;
+ short exp;
+};
+
+typedef void (*FUNC) (void);
+typedef struct fpu__reg FPU_REG;
+typedef void (*FUNC_ST0) (FPU_REG *st0_ptr, u_char st0_tag);
+typedef struct {
+ u_char address_size, operand_size, segment;
+} overrides;
+/* This structure is 32 bits: */
+typedef struct {
+ overrides override;
+ u_char default_mode;
+} fpu_addr_modes;
+/* PROTECTED has a restricted meaning in the emulator; it is used
+ to signal that the emulator needs to do special things to ensure
+ that protection is respected in a segmented model. */
+#define PROTECTED 4
+#define SIXTEEN 1 /* We rely upon this being 1 (true) */
+#define VM86 SIXTEEN
+#define PM16 (SIXTEEN | PROTECTED)
+#define SEG32 PROTECTED
+extern u_char const data_sizes_16[32];
+
+#define register_base ((u_char *) registers )
+#define fpu_register(x) ( * ((FPU_REG *)( register_base + 10 * (x & 7) )) )
+#define st(x) ( * ((FPU_REG *)( register_base + 10 * ((top+x) & 7) )) )
+
+#define STACK_OVERFLOW (FPU_stackoverflow(&st_new_ptr))
+#define NOT_EMPTY(i) (!FPU_empty_i(i))
+
+#define NOT_EMPTY_ST0 (st0_tag ^ TAG_Empty)
+
+#define poppop() { FPU_pop(); FPU_pop(); }
+
+/* push() does not affect the tags */
+#define push() { top--; }
+
+#define signbyte(a) (((u_char *)(a))[9])
+#define getsign(a) (signbyte(a) & 0x80)
+#define setsign(a,b) { if ((b) != 0) signbyte(a) |= 0x80; else signbyte(a) &= 0x7f; }
+#define copysign(a,b) { if (getsign(a)) signbyte(b) |= 0x80; \
+ else signbyte(b) &= 0x7f; }
+#define changesign(a) { signbyte(a) ^= 0x80; }
+#define setpositive(a) { signbyte(a) &= 0x7f; }
+#define setnegative(a) { signbyte(a) |= 0x80; }
+#define signpositive(a) ( (signbyte(a) & 0x80) == 0 )
+#define signnegative(a) (signbyte(a) & 0x80)
+
+static inline void reg_copy(FPU_REG const *x, FPU_REG *y)
+{
+ *(short *)&(y->exp) = *(const short *)&(x->exp);
+ *(long long *)&(y->sigl) = *(const long long *)&(x->sigl);
+}
+
+#define exponent(x) (((*(short *)&((x)->exp)) & 0x7fff) - EXTENDED_Ebias)
+#define setexponentpos(x,y) { (*(short *)&((x)->exp)) = \
+ ((y) + EXTENDED_Ebias) & 0x7fff; }
+#define exponent16(x) (*(short *)&((x)->exp))
+#define setexponent16(x,y) { (*(short *)&((x)->exp)) = (u16)(y); }
+#define addexponent(x,y) { (*(short *)&((x)->exp)) += (y); }
+#define stdexp(x) { (*(short *)&((x)->exp)) += EXTENDED_Ebias; }
+
+#define isdenormal(ptr) (exponent(ptr) == EXP_BIAS+EXP_UNDER)
+
+#define significand(x) ( ((unsigned long long *)&((x)->sigl))[0] )
+
+/*----- Prototypes for functions written in assembler -----*/
+/* extern void reg_move(FPU_REG *a, FPU_REG *b); */
+
+asmlinkage int FPU_normalize(FPU_REG *x);
+asmlinkage int FPU_normalize_nuo(FPU_REG *x);
+asmlinkage int FPU_u_sub(FPU_REG const *arg1, FPU_REG const *arg2,
+ FPU_REG * answ, unsigned int control_w, u_char sign,
+ int expa, int expb);
+asmlinkage int FPU_u_mul(FPU_REG const *arg1, FPU_REG const *arg2,
+ FPU_REG * answ, unsigned int control_w, u_char sign,
+ int expon);
+asmlinkage int FPU_u_div(FPU_REG const *arg1, FPU_REG const *arg2,
+ FPU_REG * answ, unsigned int control_w, u_char sign);
+asmlinkage int FPU_u_add(FPU_REG const *arg1, FPU_REG const *arg2,
+ FPU_REG * answ, unsigned int control_w, u_char sign,
+ int expa, int expb);
+asmlinkage int wm_sqrt(FPU_REG *n, int dummy1, int dummy2,
+ unsigned int control_w, u_char sign);
+asmlinkage unsigned FPU_shrx(void *l, unsigned x);
+asmlinkage unsigned FPU_shrxs(void *v, unsigned x);
+asmlinkage unsigned long FPU_div_small(unsigned long long *x, unsigned long y);
+asmlinkage int FPU_round(FPU_REG *arg, unsigned int extent, int dummy,
+ unsigned int control_w, u_char sign);
+
+#ifndef MAKING_PROTO
+#include "fpu_proto.h"
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _FPU_EMU_H_ */
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
new file mode 100644
index 000000000..9e2ba7e66
--- /dev/null
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -0,0 +1,729 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | fpu_entry.c |
+ | |
+ | The entry functions for wm-FPU-emu |
+ | |
+ | Copyright (C) 1992,1993,1994,1996,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@suburbia.net |
+ | |
+ | See the files "README" and "COPYING" for further copyright and warranty |
+ | information. |
+ | |
+ +---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------+
+ | Note: |
+ | The file contains code which accesses user memory. |
+ | Emulator static data may change when user memory is accessed, due to |
+ | other processes using the emulator while swapping is in progress. |
+ +---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------+
+ | math_emulate(), restore_i387_soft() and save_i387_soft() are the only |
+ | entry points for wm-FPU-emu. |
+ +---------------------------------------------------------------------------*/
+
+#include <linux/signal.h>
+#include <linux/regset.h>
+
+#include <linux/uaccess.h>
+#include <asm/traps.h>
+#include <asm/user.h>
+#include <asm/fpu/internal.h>
+
+#include "fpu_system.h"
+#include "fpu_emu.h"
+#include "exception.h"
+#include "control_w.h"
+#include "status_w.h"
+
+#define __BAD__ FPU_illegal /* Illegal on an 80486, causes SIGILL */
+
+/* fcmovCC and f(u)comi(p) are enabled if CPUID(1).EDX(15) "cmov" is set */
+
+/* WARNING: "u" entries are not documented by Intel in their 80486 manual
+ and may not work on FPU clones or later Intel FPUs.
+ Changes to support them provided by Linus Torvalds. */
+
+static FUNC const st_instr_table[64] = {
+/* Opcode: d8 d9 da db */
+/* dc dd de df */
+/* c0..7 */ fadd__, fld_i_, fcmovb, fcmovnb,
+/* c0..7 */ fadd_i, ffree_, faddp_, ffreep,/*u*/
+/* c8..f */ fmul__, fxch_i, fcmove, fcmovne,
+/* c8..f */ fmul_i, fxch_i,/*u*/ fmulp_, fxch_i,/*u*/
+/* d0..7 */ fcom_st, fp_nop, fcmovbe, fcmovnbe,
+/* d0..7 */ fcom_st,/*u*/ fst_i_, fcompst,/*u*/ fstp_i,/*u*/
+/* d8..f */ fcompst, fstp_i,/*u*/ fcmovu, fcmovnu,
+/* d8..f */ fcompst,/*u*/ fstp_i, fcompp, fstp_i,/*u*/
+/* e0..7 */ fsub__, FPU_etc, __BAD__, finit_,
+/* e0..7 */ fsubri, fucom_, fsubrp, fstsw_,
+/* e8..f */ fsubr_, fconst, fucompp, fucomi_,
+/* e8..f */ fsub_i, fucomp, fsubp_, fucomip,
+/* f0..7 */ fdiv__, FPU_triga, __BAD__, fcomi_,
+/* f0..7 */ fdivri, __BAD__, fdivrp, fcomip,
+/* f8..f */ fdivr_, FPU_trigb, __BAD__, __BAD__,
+/* f8..f */ fdiv_i, __BAD__, fdivp_, __BAD__,
+};
+
+#define _NONE_ 0 /* Take no special action */
+#define _REG0_ 1 /* Need to check for not empty st(0) */
+#define _REGI_ 2 /* Need to check for not empty st(0) and st(rm) */
+#define _REGi_ 0 /* Uses st(rm) */
+#define _PUSH_ 3 /* Need to check for space to push onto stack */
+#define _null_ 4 /* Function illegal or not implemented */
+#define _REGIi 5 /* Uses st(0) and st(rm), result to st(rm) */
+#define _REGIp 6 /* Uses st(0) and st(rm), result to st(rm) then pop */
+#define _REGIc 0 /* Compare st(0) and st(rm) */
+#define _REGIn 0 /* Uses st(0) and st(rm), but handle checks later */
+
+static u_char const type_table[64] = {
+/* Opcode: d8 d9 da db dc dd de df */
+/* c0..7 */ _REGI_, _NONE_, _REGIn, _REGIn, _REGIi, _REGi_, _REGIp, _REGi_,
+/* c8..f */ _REGI_, _REGIn, _REGIn, _REGIn, _REGIi, _REGI_, _REGIp, _REGI_,
+/* d0..7 */ _REGIc, _NONE_, _REGIn, _REGIn, _REGIc, _REG0_, _REGIc, _REG0_,
+/* d8..f */ _REGIc, _REG0_, _REGIn, _REGIn, _REGIc, _REG0_, _REGIc, _REG0_,
+/* e0..7 */ _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_,
+/* e8..f */ _REGI_, _NONE_, _REGIc, _REGIc, _REGIi, _REGIc, _REGIp, _REGIc,
+/* f0..7 */ _REGI_, _NONE_, _null_, _REGIc, _REGIi, _null_, _REGIp, _REGIc,
+/* f8..f */ _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_,
+};
+
+#ifdef RE_ENTRANT_CHECKING
+u_char emulating = 0;
+#endif /* RE_ENTRANT_CHECKING */
+
+static int valid_prefix(u_char *Byte, u_char __user ** fpu_eip,
+ overrides * override);
+
+void math_emulate(struct math_emu_info *info)
+{
+ u_char FPU_modrm, byte1;
+ unsigned short code;
+ fpu_addr_modes addr_modes;
+ int unmasked;
+ FPU_REG loaded_data;
+ FPU_REG *st0_ptr;
+ u_char loaded_tag, st0_tag;
+ void __user *data_address;
+ struct address data_sel_off;
+ struct address entry_sel_off;
+ unsigned long code_base = 0;
+ unsigned long code_limit = 0; /* Initialized to stop compiler warnings */
+ struct desc_struct code_descriptor;
+ struct fpu *fpu = &current->thread.fpu;
+
+ fpu__initialize(fpu);
+
+#ifdef RE_ENTRANT_CHECKING
+ if (emulating) {
+ printk("ERROR: wm-FPU-emu is not RE-ENTRANT!\n");
+ }
+ RE_ENTRANT_CHECK_ON;
+#endif /* RE_ENTRANT_CHECKING */
+
+ FPU_info = info;
+
+ FPU_ORIG_EIP = FPU_EIP;
+
+ if ((FPU_EFLAGS & 0x00020000) != 0) {
+ /* Virtual 8086 mode */
+ addr_modes.default_mode = VM86;
+ FPU_EIP += code_base = FPU_CS << 4;
+ code_limit = code_base + 0xffff; /* Assumes code_base <= 0xffff0000 */
+ } else if (FPU_CS == __USER_CS && FPU_DS == __USER_DS) {
+ addr_modes.default_mode = 0;
+ } else if (FPU_CS == __KERNEL_CS) {
+ printk("math_emulate: %04x:%08lx\n", FPU_CS, FPU_EIP);
+ panic("Math emulation needed in kernel");
+ } else {
+
+ if ((FPU_CS & 4) != 4) { /* Must be in the LDT */
+ /* Can only handle segmented addressing via the LDT
+ for now, and it must be 16 bit */
+ printk("FPU emulator: Unsupported addressing mode\n");
+ math_abort(FPU_info, SIGILL);
+ }
+
+ code_descriptor = FPU_get_ldt_descriptor(FPU_CS);
+ if (code_descriptor.d) {
+ /* The above test may be wrong, the book is not clear */
+ /* Segmented 32 bit protected mode */
+ addr_modes.default_mode = SEG32;
+ } else {
+ /* 16 bit protected mode */
+ addr_modes.default_mode = PM16;
+ }
+ FPU_EIP += code_base = seg_get_base(&code_descriptor);
+ code_limit = seg_get_limit(&code_descriptor) + 1;
+ code_limit *= seg_get_granularity(&code_descriptor);
+ code_limit += code_base - 1;
+ if (code_limit < code_base)
+ code_limit = 0xffffffff;
+ }
+
+ FPU_lookahead = !(FPU_EFLAGS & X86_EFLAGS_TF);
+
+ if (!valid_prefix(&byte1, (u_char __user **) & FPU_EIP,
+ &addr_modes.override)) {
+ RE_ENTRANT_CHECK_OFF;
+ printk
+ ("FPU emulator: Unknown prefix byte 0x%02x, probably due to\n"
+ "FPU emulator: self-modifying code! (emulation impossible)\n",
+ byte1);
+ RE_ENTRANT_CHECK_ON;
+ EXCEPTION(EX_INTERNAL | 0x126);
+ math_abort(FPU_info, SIGILL);
+ }
+
+ do_another_FPU_instruction:
+
+ no_ip_update = 0;
+
+ FPU_EIP++; /* We have fetched the prefix and first code bytes. */
+
+ if (addr_modes.default_mode) {
+ /* This checks for the minimum instruction bytes.
+ We also need to check any extra (address mode) code access. */
+ if (FPU_EIP > code_limit)
+ math_abort(FPU_info, SIGSEGV);
+ }
+
+ if ((byte1 & 0xf8) != 0xd8) {
+ if (byte1 == FWAIT_OPCODE) {
+ if (partial_status & SW_Summary)
+ goto do_the_FPU_interrupt;
+ else
+ goto FPU_fwait_done;
+ }
+#ifdef PARANOID
+ EXCEPTION(EX_INTERNAL | 0x128);
+ math_abort(FPU_info, SIGILL);
+#endif /* PARANOID */
+ }
+
+ RE_ENTRANT_CHECK_OFF;
+ FPU_code_access_ok(1);
+ FPU_get_user(FPU_modrm, (u_char __user *) FPU_EIP);
+ RE_ENTRANT_CHECK_ON;
+ FPU_EIP++;
+
+ if (partial_status & SW_Summary) {
+ /* Ignore the error for now if the current instruction is a no-wait
+ control instruction */
+ /* The 80486 manual contradicts itself on this topic,
+ but a real 80486 uses the following instructions:
+ fninit, fnstenv, fnsave, fnstsw, fnstenv, fnclex.
+ */
+ code = (FPU_modrm << 8) | byte1;
+ if (!((((code & 0xf803) == 0xe003) || /* fnclex, fninit, fnstsw */
+ (((code & 0x3003) == 0x3001) && /* fnsave, fnstcw, fnstenv,
+ fnstsw */
+ ((code & 0xc000) != 0xc000))))) {
+ /*
+ * We need to simulate the action of the kernel to FPU
+ * interrupts here.
+ */
+ do_the_FPU_interrupt:
+
+ FPU_EIP = FPU_ORIG_EIP; /* Point to current FPU instruction. */
+
+ RE_ENTRANT_CHECK_OFF;
+ current->thread.trap_nr = X86_TRAP_MF;
+ current->thread.error_code = 0;
+ send_sig(SIGFPE, current, 1);
+ return;
+ }
+ }
+
+ entry_sel_off.offset = FPU_ORIG_EIP;
+ entry_sel_off.selector = FPU_CS;
+ entry_sel_off.opcode = (byte1 << 8) | FPU_modrm;
+ entry_sel_off.empty = 0;
+
+ FPU_rm = FPU_modrm & 7;
+
+ if (FPU_modrm < 0300) {
+ /* All of these instructions use the mod/rm byte to get a data address */
+
+ if ((addr_modes.default_mode & SIXTEEN)
+ ^ (addr_modes.override.address_size == ADDR_SIZE_PREFIX))
+ data_address =
+ FPU_get_address_16(FPU_modrm, &FPU_EIP,
+ &data_sel_off, addr_modes);
+ else
+ data_address =
+ FPU_get_address(FPU_modrm, &FPU_EIP, &data_sel_off,
+ addr_modes);
+
+ if (addr_modes.default_mode) {
+ if (FPU_EIP - 1 > code_limit)
+ math_abort(FPU_info, SIGSEGV);
+ }
+
+ if (!(byte1 & 1)) {
+ unsigned short status1 = partial_status;
+
+ st0_ptr = &st(0);
+ st0_tag = FPU_gettag0();
+
+ /* Stack underflow has priority */
+ if (NOT_EMPTY_ST0) {
+ if (addr_modes.default_mode & PROTECTED) {
+ /* This table works for 16 and 32 bit protected mode */
+ if (access_limit <
+ data_sizes_16[(byte1 >> 1) & 3])
+ math_abort(FPU_info, SIGSEGV);
+ }
+
+ unmasked = 0; /* Do this here to stop compiler warnings. */
+ switch ((byte1 >> 1) & 3) {
+ case 0:
+ unmasked =
+ FPU_load_single((float __user *)
+ data_address,
+ &loaded_data);
+ loaded_tag = unmasked & 0xff;
+ unmasked &= ~0xff;
+ break;
+ case 1:
+ loaded_tag =
+ FPU_load_int32((long __user *)
+ data_address,
+ &loaded_data);
+ break;
+ case 2:
+ unmasked =
+ FPU_load_double((double __user *)
+ data_address,
+ &loaded_data);
+ loaded_tag = unmasked & 0xff;
+ unmasked &= ~0xff;
+ break;
+ case 3:
+ default: /* Used here to suppress gcc warnings. */
+ loaded_tag =
+ FPU_load_int16((short __user *)
+ data_address,
+ &loaded_data);
+ break;
+ }
+
+ /* No more access to user memory, it is safe
+ to use static data now */
+
+ /* NaN operands have the next priority. */
+ /* We have to delay looking at st(0) until after
+ loading the data, because that data might contain an SNaN */
+ if (((st0_tag == TAG_Special) && isNaN(st0_ptr))
+ || ((loaded_tag == TAG_Special)
+ && isNaN(&loaded_data))) {
+ /* Restore the status word; we might have loaded a
+ denormal. */
+ partial_status = status1;
+ if ((FPU_modrm & 0x30) == 0x10) {
+ /* fcom or fcomp */
+ EXCEPTION(EX_Invalid);
+ setcc(SW_C3 | SW_C2 | SW_C0);
+ if ((FPU_modrm & 0x08)
+ && (control_word &
+ CW_Invalid))
+ FPU_pop(); /* fcomp, masked, so we pop. */
+ } else {
+ if (loaded_tag == TAG_Special)
+ loaded_tag =
+ FPU_Special
+ (&loaded_data);
+#ifdef PECULIAR_486
+ /* This is not really needed, but gives behaviour
+ identical to an 80486 */
+ if ((FPU_modrm & 0x28) == 0x20)
+ /* fdiv or fsub */
+ real_2op_NaN
+ (&loaded_data,
+ loaded_tag, 0,
+ &loaded_data);
+ else
+#endif /* PECULIAR_486 */
+ /* fadd, fdivr, fmul, or fsubr */
+ real_2op_NaN
+ (&loaded_data,
+ loaded_tag, 0,
+ st0_ptr);
+ }
+ goto reg_mem_instr_done;
+ }
+
+ if (unmasked && !((FPU_modrm & 0x30) == 0x10)) {
+ /* Is not a comparison instruction. */
+ if ((FPU_modrm & 0x38) == 0x38) {
+ /* fdivr */
+ if ((st0_tag == TAG_Zero) &&
+ ((loaded_tag == TAG_Valid)
+ || (loaded_tag ==
+ TAG_Special
+ &&
+ isdenormal
+ (&loaded_data)))) {
+ if (FPU_divide_by_zero
+ (0,
+ getsign
+ (&loaded_data))
+ < 0) {
+ /* We use the fact here that the unmasked
+ exception in the loaded data was for a
+ denormal operand */
+ /* Restore the state of the denormal op bit */
+ partial_status
+ &=
+ ~SW_Denorm_Op;
+ partial_status
+ |=
+ status1 &
+ SW_Denorm_Op;
+ } else
+ setsign(st0_ptr,
+ getsign
+ (&loaded_data));
+ }
+ }
+ goto reg_mem_instr_done;
+ }
+
+ switch ((FPU_modrm >> 3) & 7) {
+ case 0: /* fadd */
+ clear_C1();
+ FPU_add(&loaded_data, loaded_tag, 0,
+ control_word);
+ break;
+ case 1: /* fmul */
+ clear_C1();
+ FPU_mul(&loaded_data, loaded_tag, 0,
+ control_word);
+ break;
+ case 2: /* fcom */
+ FPU_compare_st_data(&loaded_data,
+ loaded_tag);
+ break;
+ case 3: /* fcomp */
+ if (!FPU_compare_st_data
+ (&loaded_data, loaded_tag)
+ && !unmasked)
+ FPU_pop();
+ break;
+ case 4: /* fsub */
+ clear_C1();
+ FPU_sub(LOADED | loaded_tag,
+ (int)&loaded_data,
+ control_word);
+ break;
+ case 5: /* fsubr */
+ clear_C1();
+ FPU_sub(REV | LOADED | loaded_tag,
+ (int)&loaded_data,
+ control_word);
+ break;
+ case 6: /* fdiv */
+ clear_C1();
+ FPU_div(LOADED | loaded_tag,
+ (int)&loaded_data,
+ control_word);
+ break;
+ case 7: /* fdivr */
+ clear_C1();
+ if (st0_tag == TAG_Zero)
+ partial_status = status1; /* Undo any denorm tag,
+ zero-divide has priority. */
+ FPU_div(REV | LOADED | loaded_tag,
+ (int)&loaded_data,
+ control_word);
+ break;
+ }
+ } else {
+ if ((FPU_modrm & 0x30) == 0x10) {
+ /* The instruction is fcom or fcomp */
+ EXCEPTION(EX_StackUnder);
+ setcc(SW_C3 | SW_C2 | SW_C0);
+ if ((FPU_modrm & 0x08)
+ && (control_word & CW_Invalid))
+ FPU_pop(); /* fcomp */
+ } else
+ FPU_stack_underflow();
+ }
+ reg_mem_instr_done:
+ operand_address = data_sel_off;
+ } else {
+ if (!(no_ip_update =
+ FPU_load_store(((FPU_modrm & 0x38) | (byte1 & 6))
+ >> 1, addr_modes, data_address))) {
+ operand_address = data_sel_off;
+ }
+ }
+
+ } else {
+ /* None of these instructions access user memory */
+ u_char instr_index = (FPU_modrm & 0x38) | (byte1 & 7);
+
+#ifdef PECULIAR_486
+ /* This is supposed to be undefined, but a real 80486 seems
+ to do this: */
+ operand_address.offset = 0;
+ operand_address.selector = FPU_DS;
+#endif /* PECULIAR_486 */
+
+ st0_ptr = &st(0);
+ st0_tag = FPU_gettag0();
+ switch (type_table[(int)instr_index]) {
+ case _NONE_: /* also _REGIc: _REGIn */
+ break;
+ case _REG0_:
+ if (!NOT_EMPTY_ST0) {
+ FPU_stack_underflow();
+ goto FPU_instruction_done;
+ }
+ break;
+ case _REGIi:
+ if (!NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm)) {
+ FPU_stack_underflow_i(FPU_rm);
+ goto FPU_instruction_done;
+ }
+ break;
+ case _REGIp:
+ if (!NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm)) {
+ FPU_stack_underflow_pop(FPU_rm);
+ goto FPU_instruction_done;
+ }
+ break;
+ case _REGI_:
+ if (!NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm)) {
+ FPU_stack_underflow();
+ goto FPU_instruction_done;
+ }
+ break;
+ case _PUSH_: /* Only used by the fld st(i) instruction */
+ break;
+ case _null_:
+ FPU_illegal();
+ goto FPU_instruction_done;
+ default:
+ EXCEPTION(EX_INTERNAL | 0x111);
+ goto FPU_instruction_done;
+ }
+ (*st_instr_table[(int)instr_index]) ();
+
+ FPU_instruction_done:
+ ;
+ }
+
+ if (!no_ip_update)
+ instruction_address = entry_sel_off;
+
+ FPU_fwait_done:
+
+#ifdef DEBUG
+ RE_ENTRANT_CHECK_OFF;
+ FPU_printall();
+ RE_ENTRANT_CHECK_ON;
+#endif /* DEBUG */
+
+ if (FPU_lookahead && !need_resched()) {
+ FPU_ORIG_EIP = FPU_EIP - code_base;
+ if (valid_prefix(&byte1, (u_char __user **) & FPU_EIP,
+ &addr_modes.override))
+ goto do_another_FPU_instruction;
+ }
+
+ if (addr_modes.default_mode)
+ FPU_EIP -= code_base;
+
+ RE_ENTRANT_CHECK_OFF;
+}
+
+/* Support for prefix bytes is not yet complete. To properly handle
+ all prefix bytes, further changes are needed in the emulator code
+ which accesses user address space. Access to separate segments is
+ important for msdos emulation. */
+static int valid_prefix(u_char *Byte, u_char __user **fpu_eip,
+ overrides * override)
+{
+ u_char byte;
+ u_char __user *ip = *fpu_eip;
+
+ *override = (overrides) {
+ 0, 0, PREFIX_DEFAULT}; /* defaults */
+
+ RE_ENTRANT_CHECK_OFF;
+ FPU_code_access_ok(1);
+ FPU_get_user(byte, ip);
+ RE_ENTRANT_CHECK_ON;
+
+ while (1) {
+ switch (byte) {
+ case ADDR_SIZE_PREFIX:
+ override->address_size = ADDR_SIZE_PREFIX;
+ goto do_next_byte;
+
+ case OP_SIZE_PREFIX:
+ override->operand_size = OP_SIZE_PREFIX;
+ goto do_next_byte;
+
+ case PREFIX_CS:
+ override->segment = PREFIX_CS_;
+ goto do_next_byte;
+ case PREFIX_ES:
+ override->segment = PREFIX_ES_;
+ goto do_next_byte;
+ case PREFIX_SS:
+ override->segment = PREFIX_SS_;
+ goto do_next_byte;
+ case PREFIX_FS:
+ override->segment = PREFIX_FS_;
+ goto do_next_byte;
+ case PREFIX_GS:
+ override->segment = PREFIX_GS_;
+ goto do_next_byte;
+ case PREFIX_DS:
+ override->segment = PREFIX_DS_;
+ goto do_next_byte;
+
+/* lock is not a valid prefix for FPU instructions,
+ let the cpu handle it to generate a SIGILL. */
+/* case PREFIX_LOCK: */
+
+ /* rep.. prefixes have no meaning for FPU instructions */
+ case PREFIX_REPE:
+ case PREFIX_REPNE:
+
+ do_next_byte:
+ ip++;
+ RE_ENTRANT_CHECK_OFF;
+ FPU_code_access_ok(1);
+ FPU_get_user(byte, ip);
+ RE_ENTRANT_CHECK_ON;
+ break;
+ case FWAIT_OPCODE:
+ *Byte = byte;
+ return 1;
+ default:
+ if ((byte & 0xf8) == 0xd8) {
+ *Byte = byte;
+ *fpu_eip = ip;
+ return 1;
+ } else {
+ /* Not a valid sequence of prefix bytes followed by
+ an FPU instruction. */
+ *Byte = byte; /* Needed for error message. */
+ return 0;
+ }
+ }
+ }
+}
+
+void math_abort(struct math_emu_info *info, unsigned int signal)
+{
+ FPU_EIP = FPU_ORIG_EIP;
+ current->thread.trap_nr = X86_TRAP_MF;
+ current->thread.error_code = 0;
+ send_sig(signal, current, 1);
+ RE_ENTRANT_CHECK_OFF;
+ __asm__("movl %0,%%esp ; ret": :"g"(((long)info) - 4));
+#ifdef PARANOID
+ printk("ERROR: wm-FPU-emu math_abort failed!\n");
+#endif /* PARANOID */
+}
+
+#define S387 ((struct swregs_state *)s387)
+#define sstatus_word() \
+ ((S387->swd & ~SW_Top & 0xffff) | ((S387->ftop << SW_Top_Shift) & SW_Top))
+
+int fpregs_soft_set(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ struct swregs_state *s387 = &target->thread.fpu.state.soft;
+ void *space = s387->st_space;
+ int ret;
+ int offset, other, i, tags, regnr, tag, newtop;
+
+ RE_ENTRANT_CHECK_OFF;
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, s387, 0,
+ offsetof(struct swregs_state, st_space));
+ RE_ENTRANT_CHECK_ON;
+
+ if (ret)
+ return ret;
+
+ S387->ftop = (S387->swd >> SW_Top_Shift) & 7;
+ offset = (S387->ftop & 7) * 10;
+ other = 80 - offset;
+
+ RE_ENTRANT_CHECK_OFF;
+
+ /* Copy all registers in stack order. */
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+ space + offset, 0, other);
+ if (!ret && offset)
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+ space, 0, offset);
+
+ RE_ENTRANT_CHECK_ON;
+
+ /* The tags may need to be corrected now. */
+ tags = S387->twd;
+ newtop = S387->ftop;
+ for (i = 0; i < 8; i++) {
+ regnr = (i + newtop) & 7;
+ if (((tags >> ((regnr & 7) * 2)) & 3) != TAG_Empty) {
+ /* The loaded data over-rides all other cases. */
+ tag =
+ FPU_tagof((FPU_REG *) ((u_char *) S387->st_space +
+ 10 * regnr));
+ tags &= ~(3 << (regnr * 2));
+ tags |= (tag & 3) << (regnr * 2);
+ }
+ }
+ S387->twd = tags;
+
+ return ret;
+}
+
+int fpregs_soft_get(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ void *kbuf, void __user *ubuf)
+{
+ struct swregs_state *s387 = &target->thread.fpu.state.soft;
+ const void *space = s387->st_space;
+ int ret;
+ int offset = (S387->ftop & 7) * 10, other = 80 - offset;
+
+ RE_ENTRANT_CHECK_OFF;
+
+#ifdef PECULIAR_486
+ S387->cwd &= ~0xe080;
+ /* An 80486 sets nearly all of the reserved bits to 1. */
+ S387->cwd |= 0xffff0040;
+ S387->swd = sstatus_word() | 0xffff0000;
+ S387->twd |= 0xffff0000;
+ S387->fcs &= ~0xf8000000;
+ S387->fos |= 0xffff0000;
+#endif /* PECULIAR_486 */
+
+ ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, s387, 0,
+ offsetof(struct swregs_state, st_space));
+
+ /* Copy all registers in stack order. */
+ if (!ret)
+ ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+ space + offset, 0, other);
+ if (!ret)
+ ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+ space, 0, offset);
+
+ RE_ENTRANT_CHECK_ON;
+
+ return ret;
+}
diff --git a/arch/x86/math-emu/fpu_etc.c b/arch/x86/math-emu/fpu_etc.c
new file mode 100644
index 000000000..1b118fd93
--- /dev/null
+++ b/arch/x86/math-emu/fpu_etc.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | fpu_etc.c |
+ | |
+ | Implement a few FPU instructions. |
+ | |
+ | Copyright (C) 1992,1993,1994,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@suburbia.net |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#include "fpu_system.h"
+#include "exception.h"
+#include "fpu_emu.h"
+#include "status_w.h"
+#include "reg_constant.h"
+
+static void fchs(FPU_REG *st0_ptr, u_char st0tag)
+{
+ if (st0tag ^ TAG_Empty) {
+ signbyte(st0_ptr) ^= SIGN_NEG;
+ clear_C1();
+ } else
+ FPU_stack_underflow();
+}
+
+static void fabs(FPU_REG *st0_ptr, u_char st0tag)
+{
+ if (st0tag ^ TAG_Empty) {
+ setpositive(st0_ptr);
+ clear_C1();
+ } else
+ FPU_stack_underflow();
+}
+
+static void ftst_(FPU_REG *st0_ptr, u_char st0tag)
+{
+ switch (st0tag) {
+ case TAG_Zero:
+ setcc(SW_C3);
+ break;
+ case TAG_Valid:
+ if (getsign(st0_ptr) == SIGN_POS)
+ setcc(0);
+ else
+ setcc(SW_C0);
+ break;
+ case TAG_Special:
+ switch (FPU_Special(st0_ptr)) {
+ case TW_Denormal:
+ if (getsign(st0_ptr) == SIGN_POS)
+ setcc(0);
+ else
+ setcc(SW_C0);
+ if (denormal_operand() < 0) {
+#ifdef PECULIAR_486
+ /* This is weird! */
+ if (getsign(st0_ptr) == SIGN_POS)
+ setcc(SW_C3);
+#endif /* PECULIAR_486 */
+ return;
+ }
+ break;
+ case TW_NaN:
+ setcc(SW_C0 | SW_C2 | SW_C3); /* Operand is not comparable */
+ EXCEPTION(EX_Invalid);
+ break;
+ case TW_Infinity:
+ if (getsign(st0_ptr) == SIGN_POS)
+ setcc(0);
+ else
+ setcc(SW_C0);
+ break;
+ default:
+ setcc(SW_C0 | SW_C2 | SW_C3); /* Operand is not comparable */
+ EXCEPTION(EX_INTERNAL | 0x14);
+ break;
+ }
+ break;
+ case TAG_Empty:
+ setcc(SW_C0 | SW_C2 | SW_C3);
+ EXCEPTION(EX_StackUnder);
+ break;
+ }
+}
+
+static void fxam(FPU_REG *st0_ptr, u_char st0tag)
+{
+ int c = 0;
+ switch (st0tag) {
+ case TAG_Empty:
+ c = SW_C3 | SW_C0;
+ break;
+ case TAG_Zero:
+ c = SW_C3;
+ break;
+ case TAG_Valid:
+ c = SW_C2;
+ break;
+ case TAG_Special:
+ switch (FPU_Special(st0_ptr)) {
+ case TW_Denormal:
+ c = SW_C2 | SW_C3; /* Denormal */
+ break;
+ case TW_NaN:
+ /* We also use NaN for unsupported types. */
+ if ((st0_ptr->sigh & 0x80000000)
+ && (exponent(st0_ptr) == EXP_OVER))
+ c = SW_C0;
+ break;
+ case TW_Infinity:
+ c = SW_C2 | SW_C0;
+ break;
+ }
+ }
+ if (getsign(st0_ptr) == SIGN_NEG)
+ c |= SW_C1;
+ setcc(c);
+}
+
+static FUNC_ST0 const fp_etc_table[] = {
+ fchs, fabs, (FUNC_ST0) FPU_illegal, (FUNC_ST0) FPU_illegal,
+ ftst_, fxam, (FUNC_ST0) FPU_illegal, (FUNC_ST0) FPU_illegal
+};
+
+void FPU_etc(void)
+{
+ (fp_etc_table[FPU_rm]) (&st(0), FPU_gettag0());
+}
diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h
new file mode 100644
index 000000000..70d35c200
--- /dev/null
+++ b/arch/x86/math-emu/fpu_proto.h
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FPU_PROTO_H
+#define _FPU_PROTO_H
+
+/* errors.c */
+extern void FPU_illegal(void);
+extern void FPU_printall(void);
+asmlinkage void FPU_exception(int n);
+extern int real_1op_NaN(FPU_REG *a);
+extern int real_2op_NaN(FPU_REG const *b, u_char tagb, int deststnr,
+ FPU_REG const *defaultNaN);
+asmlinkage int arith_invalid(int deststnr);
+asmlinkage int FPU_divide_by_zero(int deststnr, u_char sign);
+extern int set_precision_flag(int flags);
+asmlinkage void set_precision_flag_up(void);
+asmlinkage void set_precision_flag_down(void);
+asmlinkage int denormal_operand(void);
+asmlinkage int arith_overflow(FPU_REG *dest);
+asmlinkage int arith_underflow(FPU_REG *dest);
+extern void FPU_stack_overflow(void);
+extern void FPU_stack_underflow(void);
+extern void FPU_stack_underflow_i(int i);
+extern void FPU_stack_underflow_pop(int i);
+/* fpu_arith.c */
+extern void fadd__(void);
+extern void fmul__(void);
+extern void fsub__(void);
+extern void fsubr_(void);
+extern void fdiv__(void);
+extern void fdivr_(void);
+extern void fadd_i(void);
+extern void fmul_i(void);
+extern void fsubri(void);
+extern void fsub_i(void);
+extern void fdivri(void);
+extern void fdiv_i(void);
+extern void faddp_(void);
+extern void fmulp_(void);
+extern void fsubrp(void);
+extern void fsubp_(void);
+extern void fdivrp(void);
+extern void fdivp_(void);
+/* fpu_aux.c */
+extern void finit(void);
+extern void finit_(void);
+extern void fstsw_(void);
+extern void fp_nop(void);
+extern void fld_i_(void);
+extern void fxch_i(void);
+extern void fcmovb(void);
+extern void fcmove(void);
+extern void fcmovbe(void);
+extern void fcmovu(void);
+extern void fcmovnb(void);
+extern void fcmovne(void);
+extern void fcmovnbe(void);
+extern void fcmovnu(void);
+extern void ffree_(void);
+extern void ffreep(void);
+extern void fst_i_(void);
+extern void fstp_i(void);
+/* fpu_entry.c */
+extern void math_emulate(struct math_emu_info *info);
+extern void math_abort(struct math_emu_info *info, unsigned int signal);
+/* fpu_etc.c */
+extern void FPU_etc(void);
+/* fpu_tags.c */
+extern int FPU_gettag0(void);
+extern int FPU_gettagi(int stnr);
+extern int FPU_gettag(int regnr);
+extern void FPU_settag0(int tag);
+extern void FPU_settagi(int stnr, int tag);
+extern void FPU_settag(int regnr, int tag);
+extern int FPU_Special(FPU_REG const *ptr);
+extern int isNaN(FPU_REG const *ptr);
+extern void FPU_pop(void);
+extern int FPU_empty_i(int stnr);
+extern int FPU_stackoverflow(FPU_REG ** st_new_ptr);
+extern void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr);
+extern void FPU_copy_to_reg1(FPU_REG const *r, u_char tag);
+extern void FPU_copy_to_reg0(FPU_REG const *r, u_char tag);
+/* fpu_trig.c */
+extern void FPU_triga(void);
+extern void FPU_trigb(void);
+/* get_address.c */
+extern void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip,
+ struct address *addr,
+ fpu_addr_modes addr_modes);
+extern void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip,
+ struct address *addr,
+ fpu_addr_modes addr_modes);
+/* load_store.c */
+extern int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
+ void __user * data_address);
+/* poly_2xm1.c */
+extern int poly_2xm1(u_char sign, FPU_REG * arg, FPU_REG *result);
+/* poly_atan.c */
+extern void poly_atan(FPU_REG * st0_ptr, u_char st0_tag, FPU_REG *st1_ptr,
+ u_char st1_tag);
+/* poly_l2.c */
+extern void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign);
+extern int poly_l2p1(u_char s0, u_char s1, FPU_REG *r0, FPU_REG *r1,
+ FPU_REG * d);
+/* poly_sin.c */
+extern void poly_sine(FPU_REG *st0_ptr);
+extern void poly_cos(FPU_REG *st0_ptr);
+/* poly_tan.c */
+extern void poly_tan(FPU_REG *st0_ptr);
+/* reg_add_sub.c */
+extern int FPU_add(FPU_REG const *b, u_char tagb, int destrnr, int control_w);
+extern int FPU_sub(int flags, int rm, int control_w);
+/* reg_compare.c */
+extern int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag);
+extern void fcom_st(void);
+extern void fcompst(void);
+extern void fcompp(void);
+extern void fucom_(void);
+extern void fucomp(void);
+extern void fucompp(void);
+extern void fcomi_(void);
+extern void fcomip(void);
+extern void fucomi_(void);
+extern void fucomip(void);
+/* reg_constant.c */
+extern void fconst(void);
+/* reg_ld_str.c */
+extern int FPU_load_extended(long double __user *s, int stnr);
+extern int FPU_load_double(double __user *dfloat, FPU_REG *loaded_data);
+extern int FPU_load_single(float __user *single, FPU_REG *loaded_data);
+extern int FPU_load_int64(long long __user *_s);
+extern int FPU_load_int32(long __user *_s, FPU_REG *loaded_data);
+extern int FPU_load_int16(short __user *_s, FPU_REG *loaded_data);
+extern int FPU_load_bcd(u_char __user *s);
+extern int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag,
+ long double __user * d);
+extern int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag,
+ double __user * dfloat);
+extern int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag,
+ float __user * single);
+extern int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag,
+ long long __user * d);
+extern int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d);
+extern int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d);
+extern int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d);
+extern int FPU_round_to_int(FPU_REG *r, u_char tag);
+extern u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s);
+extern void frstor(fpu_addr_modes addr_modes, u_char __user *data_address);
+extern u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d);
+extern void fsave(fpu_addr_modes addr_modes, u_char __user *data_address);
+extern int FPU_tagof(FPU_REG *ptr);
+/* reg_mul.c */
+extern int FPU_mul(FPU_REG const *b, u_char tagb, int deststnr, int control_w);
+
+extern int FPU_div(int flags, int regrm, int control_w);
+/* reg_convert.c */
+extern int FPU_to_exp16(FPU_REG const *a, FPU_REG *x);
+#endif /* _FPU_PROTO_H */
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
new file mode 100644
index 000000000..c8b1b31ed
--- /dev/null
+++ b/arch/x86/math-emu/fpu_system.h
@@ -0,0 +1,128 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*---------------------------------------------------------------------------+
+ | fpu_system.h |
+ | |
+ | Copyright (C) 1992,1994,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@suburbia.net |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#ifndef _FPU_SYSTEM_H
+#define _FPU_SYSTEM_H
+
+/* system dependent definitions */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+
+#include <asm/desc.h>
+#include <asm/mmu_context.h>
+
+static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg)
+{
+ static struct desc_struct zero_desc;
+ struct desc_struct ret = zero_desc;
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ seg >>= 3;
+ mutex_lock(&current->mm->context.lock);
+ if (current->mm->context.ldt && seg < current->mm->context.ldt->nr_entries)
+ ret = current->mm->context.ldt->entries[seg];
+ mutex_unlock(&current->mm->context.lock);
+#endif
+ return ret;
+}
+
+#define SEG_TYPE_WRITABLE (1U << 1)
+#define SEG_TYPE_EXPANDS_DOWN (1U << 2)
+#define SEG_TYPE_EXECUTE (1U << 3)
+#define SEG_TYPE_EXPAND_MASK (SEG_TYPE_EXPANDS_DOWN | SEG_TYPE_EXECUTE)
+#define SEG_TYPE_EXECUTE_MASK (SEG_TYPE_WRITABLE | SEG_TYPE_EXECUTE)
+
+static inline unsigned long seg_get_base(struct desc_struct *d)
+{
+ unsigned long base = (unsigned long)d->base2 << 24;
+
+ return base | ((unsigned long)d->base1 << 16) | d->base0;
+}
+
+static inline unsigned long seg_get_limit(struct desc_struct *d)
+{
+ return ((unsigned long)d->limit1 << 16) | d->limit0;
+}
+
+static inline unsigned long seg_get_granularity(struct desc_struct *d)
+{
+ return d->g ? 4096 : 1;
+}
+
+static inline bool seg_expands_down(struct desc_struct *d)
+{
+ return (d->type & SEG_TYPE_EXPAND_MASK) == SEG_TYPE_EXPANDS_DOWN;
+}
+
+static inline bool seg_execute_only(struct desc_struct *d)
+{
+ return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_EXECUTE;
+}
+
+static inline bool seg_writable(struct desc_struct *d)
+{
+ return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_WRITABLE;
+}
+
+#define I387 (&current->thread.fpu.state)
+#define FPU_info (I387->soft.info)
+
+#define FPU_CS (*(unsigned short *) &(FPU_info->regs->cs))
+#define FPU_SS (*(unsigned short *) &(FPU_info->regs->ss))
+#define FPU_DS (*(unsigned short *) &(FPU_info->regs->ds))
+#define FPU_EAX (FPU_info->regs->ax)
+#define FPU_EFLAGS (FPU_info->regs->flags)
+#define FPU_EIP (FPU_info->regs->ip)
+#define FPU_ORIG_EIP (FPU_info->___orig_eip)
+
+#define FPU_lookahead (I387->soft.lookahead)
+
+/* nz if ip_offset and cs_selector are not to be set for the current
+ instruction. */
+#define no_ip_update (*(u_char *)&(I387->soft.no_update))
+#define FPU_rm (*(u_char *)&(I387->soft.rm))
+
+/* Number of bytes of data which can be legally accessed by the current
+ instruction. This only needs to hold a number <= 108, so a byte will do. */
+#define access_limit (*(u_char *)&(I387->soft.alimit))
+
+#define partial_status (I387->soft.swd)
+#define control_word (I387->soft.cwd)
+#define fpu_tag_word (I387->soft.twd)
+#define registers (I387->soft.st_space)
+#define top (I387->soft.ftop)
+
+#define instruction_address (*(struct address *)&I387->soft.fip)
+#define operand_address (*(struct address *)&I387->soft.foo)
+
+#define FPU_access_ok(x,y,z) if ( !access_ok(x,y,z) ) \
+ math_abort(FPU_info,SIGSEGV)
+#define FPU_abort math_abort(FPU_info, SIGSEGV)
+
+#undef FPU_IGNORE_CODE_SEGV
+#ifdef FPU_IGNORE_CODE_SEGV
+/* access_ok() is very expensive, and causes the emulator to run
+ about 20% slower if applied to the code. Anyway, errors due to bad
+ code addresses should be much rarer than errors due to bad data
+ addresses. */
+#define FPU_code_access_ok(z)
+#else
+/* A simpler test than access_ok() can probably be done for
+ FPU_code_access_ok() because the only possible error is to step
+ past the upper boundary of a legal code area. */
+#define FPU_code_access_ok(z) FPU_access_ok(VERIFY_READ,(void __user *)FPU_EIP,z)
+#endif
+
+#define FPU_get_user(x,y) get_user((x),(y))
+#define FPU_put_user(x,y) put_user((x),(y))
+
+#endif
diff --git a/arch/x86/math-emu/fpu_tags.c b/arch/x86/math-emu/fpu_tags.c
new file mode 100644
index 000000000..bff95d4e7
--- /dev/null
+++ b/arch/x86/math-emu/fpu_tags.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | fpu_tags.c |
+ | |
+ | Set FPU register tags. |
+ | |
+ | Copyright (C) 1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@jacobi.maths.monash.edu.au |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#include "fpu_emu.h"
+#include "fpu_system.h"
+#include "exception.h"
+
+void FPU_pop(void)
+{
+ fpu_tag_word |= 3 << ((top & 7) * 2);
+ top++;
+}
+
+int FPU_gettag0(void)
+{
+ return (fpu_tag_word >> ((top & 7) * 2)) & 3;
+}
+
+int FPU_gettagi(int stnr)
+{
+ return (fpu_tag_word >> (((top + stnr) & 7) * 2)) & 3;
+}
+
+int FPU_gettag(int regnr)
+{
+ return (fpu_tag_word >> ((regnr & 7) * 2)) & 3;
+}
+
+void FPU_settag0(int tag)
+{
+ int regnr = top;
+ regnr &= 7;
+ fpu_tag_word &= ~(3 << (regnr * 2));
+ fpu_tag_word |= (tag & 3) << (regnr * 2);
+}
+
+void FPU_settagi(int stnr, int tag)
+{
+ int regnr = stnr + top;
+ regnr &= 7;
+ fpu_tag_word &= ~(3 << (regnr * 2));
+ fpu_tag_word |= (tag & 3) << (regnr * 2);
+}
+
+void FPU_settag(int regnr, int tag)
+{
+ regnr &= 7;
+ fpu_tag_word &= ~(3 << (regnr * 2));
+ fpu_tag_word |= (tag & 3) << (regnr * 2);
+}
+
+int FPU_Special(FPU_REG const *ptr)
+{
+ int exp = exponent(ptr);
+
+ if (exp == EXP_BIAS + EXP_UNDER)
+ return TW_Denormal;
+ else if (exp != EXP_BIAS + EXP_OVER)
+ return TW_NaN;
+ else if ((ptr->sigh == 0x80000000) && (ptr->sigl == 0))
+ return TW_Infinity;
+ return TW_NaN;
+}
+
+int isNaN(FPU_REG const *ptr)
+{
+ return ((exponent(ptr) == EXP_BIAS + EXP_OVER)
+ && !((ptr->sigh == 0x80000000) && (ptr->sigl == 0)));
+}
+
+int FPU_empty_i(int stnr)
+{
+ int regnr = (top + stnr) & 7;
+
+ return ((fpu_tag_word >> (regnr * 2)) & 3) == TAG_Empty;
+}
+
+int FPU_stackoverflow(FPU_REG ** st_new_ptr)
+{
+ *st_new_ptr = &st(-1);
+
+ return ((fpu_tag_word >> (((top - 1) & 7) * 2)) & 3) != TAG_Empty;
+}
+
+void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr)
+{
+ reg_copy(r, &st(stnr));
+ FPU_settagi(stnr, tag);
+}
+
+void FPU_copy_to_reg1(FPU_REG const *r, u_char tag)
+{
+ reg_copy(r, &st(1));
+ FPU_settagi(1, tag);
+}
+
+void FPU_copy_to_reg0(FPU_REG const *r, u_char tag)
+{
+ int regnr = top;
+ regnr &= 7;
+
+ reg_copy(r, &st(0));
+
+ fpu_tag_word &= ~(3 << (regnr * 2));
+ fpu_tag_word |= (tag & 3) << (regnr * 2);
+}
diff --git a/arch/x86/math-emu/fpu_trig.c b/arch/x86/math-emu/fpu_trig.c
new file mode 100644
index 000000000..783c509f9
--- /dev/null
+++ b/arch/x86/math-emu/fpu_trig.c
@@ -0,0 +1,1644 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | fpu_trig.c |
+ | |
+ | Implementation of the FPU "transcendental" functions. |
+ | |
+ | Copyright (C) 1992,1993,1994,1997,1999 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@melbpc.org.au |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#include "fpu_system.h"
+#include "exception.h"
+#include "fpu_emu.h"
+#include "status_w.h"
+#include "control_w.h"
+#include "reg_constant.h"
+
+static void rem_kernel(unsigned long long st0, unsigned long long *y,
+ unsigned long long st1, unsigned long long q, int n);
+
+#define BETTER_THAN_486
+
+#define FCOS 4
+
+/* Used only by fptan, fsin, fcos, and fsincos. */
+/* This routine produces very accurate results, similar to
+ using a value of pi with more than 128 bits precision. */
+/* Limited measurements show no results worse than 64 bit precision
+ except for the results for arguments close to 2^63, where the
+ precision of the result sometimes degrades to about 63.9 bits */
+static int trig_arg(FPU_REG *st0_ptr, int even)
+{
+ FPU_REG tmp;
+ u_char tmptag;
+ unsigned long long q;
+ int old_cw = control_word, saved_status = partial_status;
+ int tag, st0_tag = TAG_Valid;
+
+ if (exponent(st0_ptr) >= 63) {
+ partial_status |= SW_C2; /* Reduction incomplete. */
+ return -1;
+ }
+
+ control_word &= ~CW_RC;
+ control_word |= RC_CHOP;
+
+ setpositive(st0_ptr);
+ tag = FPU_u_div(st0_ptr, &CONST_PI2, &tmp, PR_64_BITS | RC_CHOP | 0x3f,
+ SIGN_POS);
+
+ FPU_round_to_int(&tmp, tag); /* Fortunately, this can't overflow
+ to 2^64 */
+ q = significand(&tmp);
+ if (q) {
+ rem_kernel(significand(st0_ptr),
+ &significand(&tmp),
+ significand(&CONST_PI2),
+ q, exponent(st0_ptr) - exponent(&CONST_PI2));
+ setexponent16(&tmp, exponent(&CONST_PI2));
+ st0_tag = FPU_normalize(&tmp);
+ FPU_copy_to_reg0(&tmp, st0_tag);
+ }
+
+ if ((even && !(q & 1)) || (!even && (q & 1))) {
+ st0_tag =
+ FPU_sub(REV | LOADED | TAG_Valid, (int)&CONST_PI2,
+ FULL_PRECISION);
+
+#ifdef BETTER_THAN_486
+ /* So far, the results are exact but based upon a 64 bit
+ precision approximation to pi/2. The technique used
+ now is equivalent to using an approximation to pi/2 which
+ is accurate to about 128 bits. */
+ if ((exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64)
+ || (q > 1)) {
+ /* This code gives the effect of having pi/2 to better than
+ 128 bits precision. */
+
+ significand(&tmp) = q + 1;
+ setexponent16(&tmp, 63);
+ FPU_normalize(&tmp);
+ tmptag =
+ FPU_u_mul(&CONST_PI2extra, &tmp, &tmp,
+ FULL_PRECISION, SIGN_POS,
+ exponent(&CONST_PI2extra) +
+ exponent(&tmp));
+ setsign(&tmp, getsign(&CONST_PI2extra));
+ st0_tag = FPU_add(&tmp, tmptag, 0, FULL_PRECISION);
+ if (signnegative(st0_ptr)) {
+ /* CONST_PI2extra is negative, so the result of the addition
+ can be negative. This means that the argument is actually
+ in a different quadrant. The correction is always < pi/2,
+ so it can't overflow into yet another quadrant. */
+ setpositive(st0_ptr);
+ q++;
+ }
+ }
+#endif /* BETTER_THAN_486 */
+ }
+#ifdef BETTER_THAN_486
+ else {
+ /* So far, the results are exact but based upon a 64 bit
+ precision approximation to pi/2. The technique used
+ now is equivalent to using an approximation to pi/2 which
+ is accurate to about 128 bits. */
+ if (((q > 0)
+ && (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64))
+ || (q > 1)) {
+ /* This code gives the effect of having p/2 to better than
+ 128 bits precision. */
+
+ significand(&tmp) = q;
+ setexponent16(&tmp, 63);
+ FPU_normalize(&tmp); /* This must return TAG_Valid */
+ tmptag =
+ FPU_u_mul(&CONST_PI2extra, &tmp, &tmp,
+ FULL_PRECISION, SIGN_POS,
+ exponent(&CONST_PI2extra) +
+ exponent(&tmp));
+ setsign(&tmp, getsign(&CONST_PI2extra));
+ st0_tag = FPU_sub(LOADED | (tmptag & 0x0f), (int)&tmp,
+ FULL_PRECISION);
+ if ((exponent(st0_ptr) == exponent(&CONST_PI2)) &&
+ ((st0_ptr->sigh > CONST_PI2.sigh)
+ || ((st0_ptr->sigh == CONST_PI2.sigh)
+ && (st0_ptr->sigl > CONST_PI2.sigl)))) {
+ /* CONST_PI2extra is negative, so the result of the
+ subtraction can be larger than pi/2. This means
+ that the argument is actually in a different quadrant.
+ The correction is always < pi/2, so it can't overflow
+ into yet another quadrant. */
+ st0_tag =
+ FPU_sub(REV | LOADED | TAG_Valid,
+ (int)&CONST_PI2, FULL_PRECISION);
+ q++;
+ }
+ }
+ }
+#endif /* BETTER_THAN_486 */
+
+ FPU_settag0(st0_tag);
+ control_word = old_cw;
+ partial_status = saved_status & ~SW_C2; /* Reduction complete. */
+
+ return (q & 3) | even;
+}
+
+/* Convert a long to register */
+static void convert_l2reg(long const *arg, int deststnr)
+{
+ int tag;
+ long num = *arg;
+ u_char sign;
+ FPU_REG *dest = &st(deststnr);
+
+ if (num == 0) {
+ FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
+ return;
+ }
+
+ if (num > 0) {
+ sign = SIGN_POS;
+ } else {
+ num = -num;
+ sign = SIGN_NEG;
+ }
+
+ dest->sigh = num;
+ dest->sigl = 0;
+ setexponent16(dest, 31);
+ tag = FPU_normalize(dest);
+ FPU_settagi(deststnr, tag);
+ setsign(dest, sign);
+ return;
+}
+
+static void single_arg_error(FPU_REG *st0_ptr, u_char st0_tag)
+{
+ if (st0_tag == TAG_Empty)
+ FPU_stack_underflow(); /* Puts a QNaN in st(0) */
+ else if (st0_tag == TW_NaN)
+ real_1op_NaN(st0_ptr); /* return with a NaN in st(0) */
+#ifdef PARANOID
+ else
+ EXCEPTION(EX_INTERNAL | 0x0112);
+#endif /* PARANOID */
+}
+
+static void single_arg_2_error(FPU_REG *st0_ptr, u_char st0_tag)
+{
+ int isNaN;
+
+ switch (st0_tag) {
+ case TW_NaN:
+ isNaN = (exponent(st0_ptr) == EXP_OVER)
+ && (st0_ptr->sigh & 0x80000000);
+ if (isNaN && !(st0_ptr->sigh & 0x40000000)) { /* Signaling ? */
+ EXCEPTION(EX_Invalid);
+ if (control_word & CW_Invalid) {
+ /* The masked response */
+ /* Convert to a QNaN */
+ st0_ptr->sigh |= 0x40000000;
+ push();
+ FPU_copy_to_reg0(st0_ptr, TAG_Special);
+ }
+ } else if (isNaN) {
+ /* A QNaN */
+ push();
+ FPU_copy_to_reg0(st0_ptr, TAG_Special);
+ } else {
+ /* pseudoNaN or other unsupported */
+ EXCEPTION(EX_Invalid);
+ if (control_word & CW_Invalid) {
+ /* The masked response */
+ FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
+ push();
+ FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
+ }
+ }
+ break; /* return with a NaN in st(0) */
+#ifdef PARANOID
+ default:
+ EXCEPTION(EX_INTERNAL | 0x0112);
+#endif /* PARANOID */
+ }
+}
+
+/*---------------------------------------------------------------------------*/
+
+static void f2xm1(FPU_REG *st0_ptr, u_char tag)
+{
+ FPU_REG a;
+
+ clear_C1();
+
+ if (tag == TAG_Valid) {
+ /* For an 80486 FPU, the result is undefined if the arg is >= 1.0 */
+ if (exponent(st0_ptr) < 0) {
+ denormal_arg:
+
+ FPU_to_exp16(st0_ptr, &a);
+
+ /* poly_2xm1(x) requires 0 < st(0) < 1. */
+ poly_2xm1(getsign(st0_ptr), &a, st0_ptr);
+ }
+ set_precision_flag_up(); /* 80486 appears to always do this */
+ return;
+ }
+
+ if (tag == TAG_Zero)
+ return;
+
+ if (tag == TAG_Special)
+ tag = FPU_Special(st0_ptr);
+
+ switch (tag) {
+ case TW_Denormal:
+ if (denormal_operand() < 0)
+ return;
+ goto denormal_arg;
+ case TW_Infinity:
+ if (signnegative(st0_ptr)) {
+ /* -infinity gives -1 (p16-10) */
+ FPU_copy_to_reg0(&CONST_1, TAG_Valid);
+ setnegative(st0_ptr);
+ }
+ return;
+ default:
+ single_arg_error(st0_ptr, tag);
+ }
+}
+
+static void fptan(FPU_REG *st0_ptr, u_char st0_tag)
+{
+ FPU_REG *st_new_ptr;
+ int q;
+ u_char arg_sign = getsign(st0_ptr);
+
+ /* Stack underflow has higher priority */
+ if (st0_tag == TAG_Empty) {
+ FPU_stack_underflow(); /* Puts a QNaN in st(0) */
+ if (control_word & CW_Invalid) {
+ st_new_ptr = &st(-1);
+ push();
+ FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */
+ }
+ return;
+ }
+
+ if (STACK_OVERFLOW) {
+ FPU_stack_overflow();
+ return;
+ }
+
+ if (st0_tag == TAG_Valid) {
+ if (exponent(st0_ptr) > -40) {
+ if ((q = trig_arg(st0_ptr, 0)) == -1) {
+ /* Operand is out of range */
+ return;
+ }
+
+ poly_tan(st0_ptr);
+ setsign(st0_ptr, (q & 1) ^ (arg_sign != 0));
+ set_precision_flag_up(); /* We do not really know if up or down */
+ } else {
+ /* For a small arg, the result == the argument */
+ /* Underflow may happen */
+
+ denormal_arg:
+
+ FPU_to_exp16(st0_ptr, st0_ptr);
+
+ st0_tag =
+ FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign);
+ FPU_settag0(st0_tag);
+ }
+ push();
+ FPU_copy_to_reg0(&CONST_1, TAG_Valid);
+ return;
+ }
+
+ if (st0_tag == TAG_Zero) {
+ push();
+ FPU_copy_to_reg0(&CONST_1, TAG_Valid);
+ setcc(0);
+ return;
+ }
+
+ if (st0_tag == TAG_Special)
+ st0_tag = FPU_Special(st0_ptr);
+
+ if (st0_tag == TW_Denormal) {
+ if (denormal_operand() < 0)
+ return;
+
+ goto denormal_arg;
+ }
+
+ if (st0_tag == TW_Infinity) {
+ /* The 80486 treats infinity as an invalid operand */
+ if (arith_invalid(0) >= 0) {
+ st_new_ptr = &st(-1);
+ push();
+ arith_invalid(0);
+ }
+ return;
+ }
+
+ single_arg_2_error(st0_ptr, st0_tag);
+}
+
+static void fxtract(FPU_REG *st0_ptr, u_char st0_tag)
+{
+ FPU_REG *st_new_ptr;
+ u_char sign;
+ register FPU_REG *st1_ptr = st0_ptr; /* anticipate */
+
+ if (STACK_OVERFLOW) {
+ FPU_stack_overflow();
+ return;
+ }
+
+ clear_C1();
+
+ if (st0_tag == TAG_Valid) {
+ long e;
+
+ push();
+ sign = getsign(st1_ptr);
+ reg_copy(st1_ptr, st_new_ptr);
+ setexponent16(st_new_ptr, exponent(st_new_ptr));
+
+ denormal_arg:
+
+ e = exponent16(st_new_ptr);
+ convert_l2reg(&e, 1);
+ setexponentpos(st_new_ptr, 0);
+ setsign(st_new_ptr, sign);
+ FPU_settag0(TAG_Valid); /* Needed if arg was a denormal */
+ return;
+ } else if (st0_tag == TAG_Zero) {
+ sign = getsign(st0_ptr);
+
+ if (FPU_divide_by_zero(0, SIGN_NEG) < 0)
+ return;
+
+ push();
+ FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
+ setsign(st_new_ptr, sign);
+ return;
+ }
+
+ if (st0_tag == TAG_Special)
+ st0_tag = FPU_Special(st0_ptr);
+
+ if (st0_tag == TW_Denormal) {
+ if (denormal_operand() < 0)
+ return;
+
+ push();
+ sign = getsign(st1_ptr);
+ FPU_to_exp16(st1_ptr, st_new_ptr);
+ goto denormal_arg;
+ } else if (st0_tag == TW_Infinity) {
+ sign = getsign(st0_ptr);
+ setpositive(st0_ptr);
+ push();
+ FPU_copy_to_reg0(&CONST_INF, TAG_Special);
+ setsign(st_new_ptr, sign);
+ return;
+ } else if (st0_tag == TW_NaN) {
+ if (real_1op_NaN(st0_ptr) < 0)
+ return;
+
+ push();
+ FPU_copy_to_reg0(st0_ptr, TAG_Special);
+ return;
+ } else if (st0_tag == TAG_Empty) {
+ /* Is this the correct behaviour? */
+ if (control_word & EX_Invalid) {
+ FPU_stack_underflow();
+ push();
+ FPU_stack_underflow();
+ } else
+ EXCEPTION(EX_StackUnder);
+ }
+#ifdef PARANOID
+ else
+ EXCEPTION(EX_INTERNAL | 0x119);
+#endif /* PARANOID */
+}
+
+static void fdecstp(void)
+{
+ clear_C1();
+ top--;
+}
+
+static void fincstp(void)
+{
+ clear_C1();
+ top++;
+}
+
+static void fsqrt_(FPU_REG *st0_ptr, u_char st0_tag)
+{
+ int expon;
+
+ clear_C1();
+
+ if (st0_tag == TAG_Valid) {
+ u_char tag;
+
+ if (signnegative(st0_ptr)) {
+ arith_invalid(0); /* sqrt(negative) is invalid */
+ return;
+ }
+
+ /* make st(0) in [1.0 .. 4.0) */
+ expon = exponent(st0_ptr);
+
+ denormal_arg:
+
+ setexponent16(st0_ptr, (expon & 1));
+
+ /* Do the computation, the sign of the result will be positive. */
+ tag = wm_sqrt(st0_ptr, 0, 0, control_word, SIGN_POS);
+ addexponent(st0_ptr, expon >> 1);
+ FPU_settag0(tag);
+ return;
+ }
+
+ if (st0_tag == TAG_Zero)
+ return;
+
+ if (st0_tag == TAG_Special)
+ st0_tag = FPU_Special(st0_ptr);
+
+ if (st0_tag == TW_Infinity) {
+ if (signnegative(st0_ptr))
+ arith_invalid(0); /* sqrt(-Infinity) is invalid */
+ return;
+ } else if (st0_tag == TW_Denormal) {
+ if (signnegative(st0_ptr)) {
+ arith_invalid(0); /* sqrt(negative) is invalid */
+ return;
+ }
+
+ if (denormal_operand() < 0)
+ return;
+
+ FPU_to_exp16(st0_ptr, st0_ptr);
+
+ expon = exponent16(st0_ptr);
+
+ goto denormal_arg;
+ }
+
+ single_arg_error(st0_ptr, st0_tag);
+
+}
+
+static void frndint_(FPU_REG *st0_ptr, u_char st0_tag)
+{
+ int flags, tag;
+
+ if (st0_tag == TAG_Valid) {
+ u_char sign;
+
+ denormal_arg:
+
+ sign = getsign(st0_ptr);
+
+ if (exponent(st0_ptr) > 63)
+ return;
+
+ if (st0_tag == TW_Denormal) {
+ if (denormal_operand() < 0)
+ return;
+ }
+
+ /* Fortunately, this can't overflow to 2^64 */
+ if ((flags = FPU_round_to_int(st0_ptr, st0_tag)))
+ set_precision_flag(flags);
+
+ setexponent16(st0_ptr, 63);
+ tag = FPU_normalize(st0_ptr);
+ setsign(st0_ptr, sign);
+ FPU_settag0(tag);
+ return;
+ }
+
+ if (st0_tag == TAG_Zero)
+ return;
+
+ if (st0_tag == TAG_Special)
+ st0_tag = FPU_Special(st0_ptr);
+
+ if (st0_tag == TW_Denormal)
+ goto denormal_arg;
+ else if (st0_tag == TW_Infinity)
+ return;
+ else
+ single_arg_error(st0_ptr, st0_tag);
+}
+
+static int fsin(FPU_REG *st0_ptr, u_char tag)
+{
+ u_char arg_sign = getsign(st0_ptr);
+
+ if (tag == TAG_Valid) {
+ int q;
+
+ if (exponent(st0_ptr) > -40) {
+ if ((q = trig_arg(st0_ptr, 0)) == -1) {
+ /* Operand is out of range */
+ return 1;
+ }
+
+ poly_sine(st0_ptr);
+
+ if (q & 2)
+ changesign(st0_ptr);
+
+ setsign(st0_ptr, getsign(st0_ptr) ^ arg_sign);
+
+ /* We do not really know if up or down */
+ set_precision_flag_up();
+ return 0;
+ } else {
+ /* For a small arg, the result == the argument */
+ set_precision_flag_up(); /* Must be up. */
+ return 0;
+ }
+ }
+
+ if (tag == TAG_Zero) {
+ setcc(0);
+ return 0;
+ }
+
+ if (tag == TAG_Special)
+ tag = FPU_Special(st0_ptr);
+
+ if (tag == TW_Denormal) {
+ if (denormal_operand() < 0)
+ return 1;
+
+ /* For a small arg, the result == the argument */
+ /* Underflow may happen */
+ FPU_to_exp16(st0_ptr, st0_ptr);
+
+ tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign);
+
+ FPU_settag0(tag);
+
+ return 0;
+ } else if (tag == TW_Infinity) {
+ /* The 80486 treats infinity as an invalid operand */
+ arith_invalid(0);
+ return 1;
+ } else {
+ single_arg_error(st0_ptr, tag);
+ return 1;
+ }
+}
+
+static int f_cos(FPU_REG *st0_ptr, u_char tag)
+{
+ u_char st0_sign;
+
+ st0_sign = getsign(st0_ptr);
+
+ if (tag == TAG_Valid) {
+ int q;
+
+ if (exponent(st0_ptr) > -40) {
+ if ((exponent(st0_ptr) < 0)
+ || ((exponent(st0_ptr) == 0)
+ && (significand(st0_ptr) <=
+ 0xc90fdaa22168c234LL))) {
+ poly_cos(st0_ptr);
+
+ /* We do not really know if up or down */
+ set_precision_flag_down();
+
+ return 0;
+ } else if ((q = trig_arg(st0_ptr, FCOS)) != -1) {
+ poly_sine(st0_ptr);
+
+ if ((q + 1) & 2)
+ changesign(st0_ptr);
+
+ /* We do not really know if up or down */
+ set_precision_flag_down();
+
+ return 0;
+ } else {
+ /* Operand is out of range */
+ return 1;
+ }
+ } else {
+ denormal_arg:
+
+ setcc(0);
+ FPU_copy_to_reg0(&CONST_1, TAG_Valid);
+#ifdef PECULIAR_486
+ set_precision_flag_down(); /* 80486 appears to do this. */
+#else
+ set_precision_flag_up(); /* Must be up. */
+#endif /* PECULIAR_486 */
+ return 0;
+ }
+ } else if (tag == TAG_Zero) {
+ FPU_copy_to_reg0(&CONST_1, TAG_Valid);
+ setcc(0);
+ return 0;
+ }
+
+ if (tag == TAG_Special)
+ tag = FPU_Special(st0_ptr);
+
+ if (tag == TW_Denormal) {
+ if (denormal_operand() < 0)
+ return 1;
+
+ goto denormal_arg;
+ } else if (tag == TW_Infinity) {
+ /* The 80486 treats infinity as an invalid operand */
+ arith_invalid(0);
+ return 1;
+ } else {
+ single_arg_error(st0_ptr, tag); /* requires st0_ptr == &st(0) */
+ return 1;
+ }
+}
+
+static void fcos(FPU_REG *st0_ptr, u_char st0_tag)
+{
+ f_cos(st0_ptr, st0_tag);
+}
+
+static void fsincos(FPU_REG *st0_ptr, u_char st0_tag)
+{
+ FPU_REG *st_new_ptr;
+ FPU_REG arg;
+ u_char tag;
+
+ /* Stack underflow has higher priority */
+ if (st0_tag == TAG_Empty) {
+ FPU_stack_underflow(); /* Puts a QNaN in st(0) */
+ if (control_word & CW_Invalid) {
+ st_new_ptr = &st(-1);
+ push();
+ FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */
+ }
+ return;
+ }
+
+ if (STACK_OVERFLOW) {
+ FPU_stack_overflow();
+ return;
+ }
+
+ if (st0_tag == TAG_Special)
+ tag = FPU_Special(st0_ptr);
+ else
+ tag = st0_tag;
+
+ if (tag == TW_NaN) {
+ single_arg_2_error(st0_ptr, TW_NaN);
+ return;
+ } else if (tag == TW_Infinity) {
+ /* The 80486 treats infinity as an invalid operand */
+ if (arith_invalid(0) >= 0) {
+ /* Masked response */
+ push();
+ arith_invalid(0);
+ }
+ return;
+ }
+
+ reg_copy(st0_ptr, &arg);
+ if (!fsin(st0_ptr, st0_tag)) {
+ push();
+ FPU_copy_to_reg0(&arg, st0_tag);
+ f_cos(&st(0), st0_tag);
+ } else {
+ /* An error, so restore st(0) */
+ FPU_copy_to_reg0(&arg, st0_tag);
+ }
+}
+
+/*---------------------------------------------------------------------------*/
+/* The following all require two arguments: st(0) and st(1) */
+
+/* A lean, mean kernel for the fprem instructions. This relies upon
+ the division and rounding to an integer in do_fprem giving an
+ exact result. Because of this, rem_kernel() needs to deal only with
+ the least significant 64 bits, the more significant bits of the
+ result must be zero.
+ */
+static void rem_kernel(unsigned long long st0, unsigned long long *y,
+ unsigned long long st1, unsigned long long q, int n)
+{
+ int dummy;
+ unsigned long long x;
+
+ x = st0 << n;
+
+ /* Do the required multiplication and subtraction in the one operation */
+
+ /* lsw x -= lsw st1 * lsw q */
+ asm volatile ("mull %4; subl %%eax,%0; sbbl %%edx,%1":"=m"
+ (((unsigned *)&x)[0]), "=m"(((unsigned *)&x)[1]),
+ "=a"(dummy)
+ :"2"(((unsigned *)&st1)[0]), "m"(((unsigned *)&q)[0])
+ :"%dx");
+ /* msw x -= msw st1 * lsw q */
+ asm volatile ("mull %3; subl %%eax,%0":"=m" (((unsigned *)&x)[1]),
+ "=a"(dummy)
+ :"1"(((unsigned *)&st1)[1]), "m"(((unsigned *)&q)[0])
+ :"%dx");
+ /* msw x -= lsw st1 * msw q */
+ asm volatile ("mull %3; subl %%eax,%0":"=m" (((unsigned *)&x)[1]),
+ "=a"(dummy)
+ :"1"(((unsigned *)&st1)[0]), "m"(((unsigned *)&q)[1])
+ :"%dx");
+
+ *y = x;
+}
+
+/* Remainder of st(0) / st(1) */
+/* This routine produces exact results, i.e. there is never any
+ rounding or truncation, etc of the result. */
+static void do_fprem(FPU_REG *st0_ptr, u_char st0_tag, int round)
+{
+ FPU_REG *st1_ptr = &st(1);
+ u_char st1_tag = FPU_gettagi(1);
+
+ if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) {
+ FPU_REG tmp, st0, st1;
+ u_char st0_sign, st1_sign;
+ u_char tmptag;
+ int tag;
+ int old_cw;
+ int expdif;
+ long long q;
+ unsigned short saved_status;
+ int cc;
+
+ fprem_valid:
+ /* Convert registers for internal use. */
+ st0_sign = FPU_to_exp16(st0_ptr, &st0);
+ st1_sign = FPU_to_exp16(st1_ptr, &st1);
+ expdif = exponent16(&st0) - exponent16(&st1);
+
+ old_cw = control_word;
+ cc = 0;
+
+ /* We want the status following the denorm tests, but don't want
+ the status changed by the arithmetic operations. */
+ saved_status = partial_status;
+ control_word &= ~CW_RC;
+ control_word |= RC_CHOP;
+
+ if (expdif < 64) {
+ /* This should be the most common case */
+
+ if (expdif > -2) {
+ u_char sign = st0_sign ^ st1_sign;
+ tag = FPU_u_div(&st0, &st1, &tmp,
+ PR_64_BITS | RC_CHOP | 0x3f,
+ sign);
+ setsign(&tmp, sign);
+
+ if (exponent(&tmp) >= 0) {
+ FPU_round_to_int(&tmp, tag); /* Fortunately, this can't
+ overflow to 2^64 */
+ q = significand(&tmp);
+
+ rem_kernel(significand(&st0),
+ &significand(&tmp),
+ significand(&st1),
+ q, expdif);
+
+ setexponent16(&tmp, exponent16(&st1));
+ } else {
+ reg_copy(&st0, &tmp);
+ q = 0;
+ }
+
+ if ((round == RC_RND)
+ && (tmp.sigh & 0xc0000000)) {
+ /* We may need to subtract st(1) once more,
+ to get a result <= 1/2 of st(1). */
+ unsigned long long x;
+ expdif =
+ exponent16(&st1) - exponent16(&tmp);
+ if (expdif <= 1) {
+ if (expdif == 0)
+ x = significand(&st1) -
+ significand(&tmp);
+ else /* expdif is 1 */
+ x = (significand(&st1)
+ << 1) -
+ significand(&tmp);
+ if ((x < significand(&tmp)) ||
+ /* or equi-distant (from 0 & st(1)) and q is odd */
+ ((x == significand(&tmp))
+ && (q & 1))) {
+ st0_sign = !st0_sign;
+ significand(&tmp) = x;
+ q++;
+ }
+ }
+ }
+
+ if (q & 4)
+ cc |= SW_C0;
+ if (q & 2)
+ cc |= SW_C3;
+ if (q & 1)
+ cc |= SW_C1;
+ } else {
+ control_word = old_cw;
+ setcc(0);
+ return;
+ }
+ } else {
+ /* There is a large exponent difference ( >= 64 ) */
+ /* To make much sense, the code in this section should
+ be done at high precision. */
+ int exp_1, N;
+ u_char sign;
+
+ /* prevent overflow here */
+ /* N is 'a number between 32 and 63' (p26-113) */
+ reg_copy(&st0, &tmp);
+ tmptag = st0_tag;
+ N = (expdif & 0x0000001f) + 32; /* This choice gives results
+ identical to an AMD 486 */
+ setexponent16(&tmp, N);
+ exp_1 = exponent16(&st1);
+ setexponent16(&st1, 0);
+ expdif -= N;
+
+ sign = getsign(&tmp) ^ st1_sign;
+ tag =
+ FPU_u_div(&tmp, &st1, &tmp,
+ PR_64_BITS | RC_CHOP | 0x3f, sign);
+ setsign(&tmp, sign);
+
+ FPU_round_to_int(&tmp, tag); /* Fortunately, this can't
+ overflow to 2^64 */
+
+ rem_kernel(significand(&st0),
+ &significand(&tmp),
+ significand(&st1),
+ significand(&tmp), exponent(&tmp)
+ );
+ setexponent16(&tmp, exp_1 + expdif);
+
+ /* It is possible for the operation to be complete here.
+ What does the IEEE standard say? The Intel 80486 manual
+ implies that the operation will never be completed at this
+ point, and the behaviour of a real 80486 confirms this.
+ */
+ if (!(tmp.sigh | tmp.sigl)) {
+ /* The result is zero */
+ control_word = old_cw;
+ partial_status = saved_status;
+ FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
+ setsign(&st0, st0_sign);
+#ifdef PECULIAR_486
+ setcc(SW_C2);
+#else
+ setcc(0);
+#endif /* PECULIAR_486 */
+ return;
+ }
+ cc = SW_C2;
+ }
+
+ control_word = old_cw;
+ partial_status = saved_status;
+ tag = FPU_normalize_nuo(&tmp);
+ reg_copy(&tmp, st0_ptr);
+
+ /* The only condition to be looked for is underflow,
+ and it can occur here only if underflow is unmasked. */
+ if ((exponent16(&tmp) <= EXP_UNDER) && (tag != TAG_Zero)
+ && !(control_word & CW_Underflow)) {
+ setcc(cc);
+ tag = arith_underflow(st0_ptr);
+ setsign(st0_ptr, st0_sign);
+ FPU_settag0(tag);
+ return;
+ } else if ((exponent16(&tmp) > EXP_UNDER) || (tag == TAG_Zero)) {
+ stdexp(st0_ptr);
+ setsign(st0_ptr, st0_sign);
+ } else {
+ tag =
+ FPU_round(st0_ptr, 0, 0, FULL_PRECISION, st0_sign);
+ }
+ FPU_settag0(tag);
+ setcc(cc);
+
+ return;
+ }
+
+ if (st0_tag == TAG_Special)
+ st0_tag = FPU_Special(st0_ptr);
+ if (st1_tag == TAG_Special)
+ st1_tag = FPU_Special(st1_ptr);
+
+ if (((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
+ || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid))
+ || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal))) {
+ if (denormal_operand() < 0)
+ return;
+ goto fprem_valid;
+ } else if ((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) {
+ FPU_stack_underflow();
+ return;
+ } else if (st0_tag == TAG_Zero) {
+ if (st1_tag == TAG_Valid) {
+ setcc(0);
+ return;
+ } else if (st1_tag == TW_Denormal) {
+ if (denormal_operand() < 0)
+ return;
+ setcc(0);
+ return;
+ } else if (st1_tag == TAG_Zero) {
+ arith_invalid(0);
+ return;
+ } /* fprem(?,0) always invalid */
+ else if (st1_tag == TW_Infinity) {
+ setcc(0);
+ return;
+ }
+ } else if ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) {
+ if (st1_tag == TAG_Zero) {
+ arith_invalid(0); /* fprem(Valid,Zero) is invalid */
+ return;
+ } else if (st1_tag != TW_NaN) {
+ if (((st0_tag == TW_Denormal)
+ || (st1_tag == TW_Denormal))
+ && (denormal_operand() < 0))
+ return;
+
+ if (st1_tag == TW_Infinity) {
+ /* fprem(Valid,Infinity) is o.k. */
+ setcc(0);
+ return;
+ }
+ }
+ } else if (st0_tag == TW_Infinity) {
+ if (st1_tag != TW_NaN) {
+ arith_invalid(0); /* fprem(Infinity,?) is invalid */
+ return;
+ }
+ }
+
+ /* One of the registers must contain a NaN if we got here. */
+
+#ifdef PARANOID
+ if ((st0_tag != TW_NaN) && (st1_tag != TW_NaN))
+ EXCEPTION(EX_INTERNAL | 0x118);
+#endif /* PARANOID */
+
+ real_2op_NaN(st1_ptr, st1_tag, 0, st1_ptr);
+
+}
+
+/* ST(1) <- ST(1) * log ST; pop ST */
+static void fyl2x(FPU_REG *st0_ptr, u_char st0_tag)
+{
+ FPU_REG *st1_ptr = &st(1), exponent;
+ u_char st1_tag = FPU_gettagi(1);
+ u_char sign;
+ int e, tag;
+
+ clear_C1();
+
+ if ((st0_tag == TAG_Valid) && (st1_tag == TAG_Valid)) {
+ both_valid:
+ /* Both regs are Valid or Denormal */
+ if (signpositive(st0_ptr)) {
+ if (st0_tag == TW_Denormal)
+ FPU_to_exp16(st0_ptr, st0_ptr);
+ else
+ /* Convert st(0) for internal use. */
+ setexponent16(st0_ptr, exponent(st0_ptr));
+
+ if ((st0_ptr->sigh == 0x80000000)
+ && (st0_ptr->sigl == 0)) {
+ /* Special case. The result can be precise. */
+ u_char esign;
+ e = exponent16(st0_ptr);
+ if (e >= 0) {
+ exponent.sigh = e;
+ esign = SIGN_POS;
+ } else {
+ exponent.sigh = -e;
+ esign = SIGN_NEG;
+ }
+ exponent.sigl = 0;
+ setexponent16(&exponent, 31);
+ tag = FPU_normalize_nuo(&exponent);
+ stdexp(&exponent);
+ setsign(&exponent, esign);
+ tag =
+ FPU_mul(&exponent, tag, 1, FULL_PRECISION);
+ if (tag >= 0)
+ FPU_settagi(1, tag);
+ } else {
+ /* The usual case */
+ sign = getsign(st1_ptr);
+ if (st1_tag == TW_Denormal)
+ FPU_to_exp16(st1_ptr, st1_ptr);
+ else
+ /* Convert st(1) for internal use. */
+ setexponent16(st1_ptr,
+ exponent(st1_ptr));
+ poly_l2(st0_ptr, st1_ptr, sign);
+ }
+ } else {
+ /* negative */
+ if (arith_invalid(1) < 0)
+ return;
+ }
+
+ FPU_pop();
+
+ return;
+ }
+
+ if (st0_tag == TAG_Special)
+ st0_tag = FPU_Special(st0_ptr);
+ if (st1_tag == TAG_Special)
+ st1_tag = FPU_Special(st1_ptr);
+
+ if ((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) {
+ FPU_stack_underflow_pop(1);
+ return;
+ } else if ((st0_tag <= TW_Denormal) && (st1_tag <= TW_Denormal)) {
+ if (st0_tag == TAG_Zero) {
+ if (st1_tag == TAG_Zero) {
+ /* Both args zero is invalid */
+ if (arith_invalid(1) < 0)
+ return;
+ } else {
+ u_char sign;
+ sign = getsign(st1_ptr) ^ SIGN_NEG;
+ if (FPU_divide_by_zero(1, sign) < 0)
+ return;
+
+ setsign(st1_ptr, sign);
+ }
+ } else if (st1_tag == TAG_Zero) {
+ /* st(1) contains zero, st(0) valid <> 0 */
+ /* Zero is the valid answer */
+ sign = getsign(st1_ptr);
+
+ if (signnegative(st0_ptr)) {
+ /* log(negative) */
+ if (arith_invalid(1) < 0)
+ return;
+ } else if ((st0_tag == TW_Denormal)
+ && (denormal_operand() < 0))
+ return;
+ else {
+ if (exponent(st0_ptr) < 0)
+ sign ^= SIGN_NEG;
+
+ FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
+ setsign(st1_ptr, sign);
+ }
+ } else {
+ /* One or both operands are denormals. */
+ if (denormal_operand() < 0)
+ return;
+ goto both_valid;
+ }
+ } else if ((st0_tag == TW_NaN) || (st1_tag == TW_NaN)) {
+ if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0)
+ return;
+ }
+ /* One or both arg must be an infinity */
+ else if (st0_tag == TW_Infinity) {
+ if ((signnegative(st0_ptr)) || (st1_tag == TAG_Zero)) {
+ /* log(-infinity) or 0*log(infinity) */
+ if (arith_invalid(1) < 0)
+ return;
+ } else {
+ u_char sign = getsign(st1_ptr);
+
+ if ((st1_tag == TW_Denormal)
+ && (denormal_operand() < 0))
+ return;
+
+ FPU_copy_to_reg1(&CONST_INF, TAG_Special);
+ setsign(st1_ptr, sign);
+ }
+ }
+ /* st(1) must be infinity here */
+ else if (((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal))
+ && (signpositive(st0_ptr))) {
+ if (exponent(st0_ptr) >= 0) {
+ if ((exponent(st0_ptr) == 0) &&
+ (st0_ptr->sigh == 0x80000000) &&
+ (st0_ptr->sigl == 0)) {
+ /* st(0) holds 1.0 */
+ /* infinity*log(1) */
+ if (arith_invalid(1) < 0)
+ return;
+ }
+ /* else st(0) is positive and > 1.0 */
+ } else {
+ /* st(0) is positive and < 1.0 */
+
+ if ((st0_tag == TW_Denormal)
+ && (denormal_operand() < 0))
+ return;
+
+ changesign(st1_ptr);
+ }
+ } else {
+ /* st(0) must be zero or negative */
+ if (st0_tag == TAG_Zero) {
+ /* This should be invalid, but a real 80486 is happy with it. */
+
+#ifndef PECULIAR_486
+ sign = getsign(st1_ptr);
+ if (FPU_divide_by_zero(1, sign) < 0)
+ return;
+#endif /* PECULIAR_486 */
+
+ changesign(st1_ptr);
+ } else if (arith_invalid(1) < 0) /* log(negative) */
+ return;
+ }
+
+ FPU_pop();
+}
+
+static void fpatan(FPU_REG *st0_ptr, u_char st0_tag)
+{
+ FPU_REG *st1_ptr = &st(1);
+ u_char st1_tag = FPU_gettagi(1);
+ int tag;
+
+ clear_C1();
+ if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) {
+ valid_atan:
+
+ poly_atan(st0_ptr, st0_tag, st1_ptr, st1_tag);
+
+ FPU_pop();
+
+ return;
+ }
+
+ if (st0_tag == TAG_Special)
+ st0_tag = FPU_Special(st0_ptr);
+ if (st1_tag == TAG_Special)
+ st1_tag = FPU_Special(st1_ptr);
+
+ if (((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
+ || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid))
+ || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal))) {
+ if (denormal_operand() < 0)
+ return;
+
+ goto valid_atan;
+ } else if ((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) {
+ FPU_stack_underflow_pop(1);
+ return;
+ } else if ((st0_tag == TW_NaN) || (st1_tag == TW_NaN)) {
+ if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) >= 0)
+ FPU_pop();
+ return;
+ } else if ((st0_tag == TW_Infinity) || (st1_tag == TW_Infinity)) {
+ u_char sign = getsign(st1_ptr);
+ if (st0_tag == TW_Infinity) {
+ if (st1_tag == TW_Infinity) {
+ if (signpositive(st0_ptr)) {
+ FPU_copy_to_reg1(&CONST_PI4, TAG_Valid);
+ } else {
+ setpositive(st1_ptr);
+ tag =
+ FPU_u_add(&CONST_PI4, &CONST_PI2,
+ st1_ptr, FULL_PRECISION,
+ SIGN_POS,
+ exponent(&CONST_PI4),
+ exponent(&CONST_PI2));
+ if (tag >= 0)
+ FPU_settagi(1, tag);
+ }
+ } else {
+ if ((st1_tag == TW_Denormal)
+ && (denormal_operand() < 0))
+ return;
+
+ if (signpositive(st0_ptr)) {
+ FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
+ setsign(st1_ptr, sign); /* An 80486 preserves the sign */
+ FPU_pop();
+ return;
+ } else {
+ FPU_copy_to_reg1(&CONST_PI, TAG_Valid);
+ }
+ }
+ } else {
+ /* st(1) is infinity, st(0) not infinity */
+ if ((st0_tag == TW_Denormal)
+ && (denormal_operand() < 0))
+ return;
+
+ FPU_copy_to_reg1(&CONST_PI2, TAG_Valid);
+ }
+ setsign(st1_ptr, sign);
+ } else if (st1_tag == TAG_Zero) {
+ /* st(0) must be valid or zero */
+ u_char sign = getsign(st1_ptr);
+
+ if ((st0_tag == TW_Denormal) && (denormal_operand() < 0))
+ return;
+
+ if (signpositive(st0_ptr)) {
+ /* An 80486 preserves the sign */
+ FPU_pop();
+ return;
+ }
+
+ FPU_copy_to_reg1(&CONST_PI, TAG_Valid);
+ setsign(st1_ptr, sign);
+ } else if (st0_tag == TAG_Zero) {
+ /* st(1) must be TAG_Valid here */
+ u_char sign = getsign(st1_ptr);
+
+ if ((st1_tag == TW_Denormal) && (denormal_operand() < 0))
+ return;
+
+ FPU_copy_to_reg1(&CONST_PI2, TAG_Valid);
+ setsign(st1_ptr, sign);
+ }
+#ifdef PARANOID
+ else
+ EXCEPTION(EX_INTERNAL | 0x125);
+#endif /* PARANOID */
+
+ FPU_pop();
+ set_precision_flag_up(); /* We do not really know if up or down */
+}
+
+static void fprem(FPU_REG *st0_ptr, u_char st0_tag)
+{
+ do_fprem(st0_ptr, st0_tag, RC_CHOP);
+}
+
+static void fprem1(FPU_REG *st0_ptr, u_char st0_tag)
+{
+ do_fprem(st0_ptr, st0_tag, RC_RND);
+}
+
+static void fyl2xp1(FPU_REG *st0_ptr, u_char st0_tag)
+{
+ u_char sign, sign1;
+ FPU_REG *st1_ptr = &st(1), a, b;
+ u_char st1_tag = FPU_gettagi(1);
+
+ clear_C1();
+ if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) {
+ valid_yl2xp1:
+
+ sign = getsign(st0_ptr);
+ sign1 = getsign(st1_ptr);
+
+ FPU_to_exp16(st0_ptr, &a);
+ FPU_to_exp16(st1_ptr, &b);
+
+ if (poly_l2p1(sign, sign1, &a, &b, st1_ptr))
+ return;
+
+ FPU_pop();
+ return;
+ }
+
+ if (st0_tag == TAG_Special)
+ st0_tag = FPU_Special(st0_ptr);
+ if (st1_tag == TAG_Special)
+ st1_tag = FPU_Special(st1_ptr);
+
+ if (((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal))
+ || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid))
+ || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal))) {
+ if (denormal_operand() < 0)
+ return;
+
+ goto valid_yl2xp1;
+ } else if ((st0_tag == TAG_Empty) | (st1_tag == TAG_Empty)) {
+ FPU_stack_underflow_pop(1);
+ return;
+ } else if (st0_tag == TAG_Zero) {
+ switch (st1_tag) {
+ case TW_Denormal:
+ if (denormal_operand() < 0)
+ return;
+
+ case TAG_Zero:
+ case TAG_Valid:
+ setsign(st0_ptr, getsign(st0_ptr) ^ getsign(st1_ptr));
+ FPU_copy_to_reg1(st0_ptr, st0_tag);
+ break;
+
+ case TW_Infinity:
+ /* Infinity*log(1) */
+ if (arith_invalid(1) < 0)
+ return;
+ break;
+
+ case TW_NaN:
+ if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0)
+ return;
+ break;
+
+ default:
+#ifdef PARANOID
+ EXCEPTION(EX_INTERNAL | 0x116);
+ return;
+#endif /* PARANOID */
+ break;
+ }
+ } else if ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) {
+ switch (st1_tag) {
+ case TAG_Zero:
+ if (signnegative(st0_ptr)) {
+ if (exponent(st0_ptr) >= 0) {
+ /* st(0) holds <= -1.0 */
+#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */
+ changesign(st1_ptr);
+#else
+ if (arith_invalid(1) < 0)
+ return;
+#endif /* PECULIAR_486 */
+ } else if ((st0_tag == TW_Denormal)
+ && (denormal_operand() < 0))
+ return;
+ else
+ changesign(st1_ptr);
+ } else if ((st0_tag == TW_Denormal)
+ && (denormal_operand() < 0))
+ return;
+ break;
+
+ case TW_Infinity:
+ if (signnegative(st0_ptr)) {
+ if ((exponent(st0_ptr) >= 0) &&
+ !((st0_ptr->sigh == 0x80000000) &&
+ (st0_ptr->sigl == 0))) {
+ /* st(0) holds < -1.0 */
+#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */
+ changesign(st1_ptr);
+#else
+ if (arith_invalid(1) < 0)
+ return;
+#endif /* PECULIAR_486 */
+ } else if ((st0_tag == TW_Denormal)
+ && (denormal_operand() < 0))
+ return;
+ else
+ changesign(st1_ptr);
+ } else if ((st0_tag == TW_Denormal)
+ && (denormal_operand() < 0))
+ return;
+ break;
+
+ case TW_NaN:
+ if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0)
+ return;
+ }
+
+ } else if (st0_tag == TW_NaN) {
+ if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0)
+ return;
+ } else if (st0_tag == TW_Infinity) {
+ if (st1_tag == TW_NaN) {
+ if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0)
+ return;
+ } else if (signnegative(st0_ptr)) {
+#ifndef PECULIAR_486
+ /* This should have higher priority than denormals, but... */
+ if (arith_invalid(1) < 0) /* log(-infinity) */
+ return;
+#endif /* PECULIAR_486 */
+ if ((st1_tag == TW_Denormal)
+ && (denormal_operand() < 0))
+ return;
+#ifdef PECULIAR_486
+ /* Denormal operands actually get higher priority */
+ if (arith_invalid(1) < 0) /* log(-infinity) */
+ return;
+#endif /* PECULIAR_486 */
+ } else if (st1_tag == TAG_Zero) {
+ /* log(infinity) */
+ if (arith_invalid(1) < 0)
+ return;
+ }
+
+ /* st(1) must be valid here. */
+
+ else if ((st1_tag == TW_Denormal) && (denormal_operand() < 0))
+ return;
+
+ /* The Manual says that log(Infinity) is invalid, but a real
+ 80486 sensibly says that it is o.k. */
+ else {
+ u_char sign = getsign(st1_ptr);
+ FPU_copy_to_reg1(&CONST_INF, TAG_Special);
+ setsign(st1_ptr, sign);
+ }
+ }
+#ifdef PARANOID
+ else {
+ EXCEPTION(EX_INTERNAL | 0x117);
+ return;
+ }
+#endif /* PARANOID */
+
+ FPU_pop();
+ return;
+
+}
+
+static void fscale(FPU_REG *st0_ptr, u_char st0_tag)
+{
+ FPU_REG *st1_ptr = &st(1);
+ u_char st1_tag = FPU_gettagi(1);
+ int old_cw = control_word;
+ u_char sign = getsign(st0_ptr);
+
+ clear_C1();
+ if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) {
+ long scale;
+ FPU_REG tmp;
+
+ /* Convert register for internal use. */
+ setexponent16(st0_ptr, exponent(st0_ptr));
+
+ valid_scale:
+
+ if (exponent(st1_ptr) > 30) {
+ /* 2^31 is far too large, would require 2^(2^30) or 2^(-2^30) */
+
+ if (signpositive(st1_ptr)) {
+ EXCEPTION(EX_Overflow);
+ FPU_copy_to_reg0(&CONST_INF, TAG_Special);
+ } else {
+ EXCEPTION(EX_Underflow);
+ FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
+ }
+ setsign(st0_ptr, sign);
+ return;
+ }
+
+ control_word &= ~CW_RC;
+ control_word |= RC_CHOP;
+ reg_copy(st1_ptr, &tmp);
+ FPU_round_to_int(&tmp, st1_tag); /* This can never overflow here */
+ control_word = old_cw;
+ scale = signnegative(st1_ptr) ? -tmp.sigl : tmp.sigl;
+ scale += exponent16(st0_ptr);
+
+ setexponent16(st0_ptr, scale);
+
+ /* Use FPU_round() to properly detect under/overflow etc */
+ FPU_round(st0_ptr, 0, 0, control_word, sign);
+
+ return;
+ }
+
+ if (st0_tag == TAG_Special)
+ st0_tag = FPU_Special(st0_ptr);
+ if (st1_tag == TAG_Special)
+ st1_tag = FPU_Special(st1_ptr);
+
+ if ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) {
+ switch (st1_tag) {
+ case TAG_Valid:
+ /* st(0) must be a denormal */
+ if ((st0_tag == TW_Denormal)
+ && (denormal_operand() < 0))
+ return;
+
+ FPU_to_exp16(st0_ptr, st0_ptr); /* Will not be left on stack */
+ goto valid_scale;
+
+ case TAG_Zero:
+ if (st0_tag == TW_Denormal)
+ denormal_operand();
+ return;
+
+ case TW_Denormal:
+ denormal_operand();
+ return;
+
+ case TW_Infinity:
+ if ((st0_tag == TW_Denormal)
+ && (denormal_operand() < 0))
+ return;
+
+ if (signpositive(st1_ptr))
+ FPU_copy_to_reg0(&CONST_INF, TAG_Special);
+ else
+ FPU_copy_to_reg0(&CONST_Z, TAG_Zero);
+ setsign(st0_ptr, sign);
+ return;
+
+ case TW_NaN:
+ real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
+ return;
+ }
+ } else if (st0_tag == TAG_Zero) {
+ switch (st1_tag) {
+ case TAG_Valid:
+ case TAG_Zero:
+ return;
+
+ case TW_Denormal:
+ denormal_operand();
+ return;
+
+ case TW_Infinity:
+ if (signpositive(st1_ptr))
+ arith_invalid(0); /* Zero scaled by +Infinity */
+ return;
+
+ case TW_NaN:
+ real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
+ return;
+ }
+ } else if (st0_tag == TW_Infinity) {
+ switch (st1_tag) {
+ case TAG_Valid:
+ case TAG_Zero:
+ return;
+
+ case TW_Denormal:
+ denormal_operand();
+ return;
+
+ case TW_Infinity:
+ if (signnegative(st1_ptr))
+ arith_invalid(0); /* Infinity scaled by -Infinity */
+ return;
+
+ case TW_NaN:
+ real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
+ return;
+ }
+ } else if (st0_tag == TW_NaN) {
+ if (st1_tag != TAG_Empty) {
+ real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr);
+ return;
+ }
+ }
+#ifdef PARANOID
+ if (!((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty))) {
+ EXCEPTION(EX_INTERNAL | 0x115);
+ return;
+ }
+#endif
+
+ /* At least one of st(0), st(1) must be empty */
+ FPU_stack_underflow();
+
+}
+
+/*---------------------------------------------------------------------------*/
+
+static FUNC_ST0 const trig_table_a[] = {
+ f2xm1, fyl2x, fptan, fpatan,
+ fxtract, fprem1, (FUNC_ST0) fdecstp, (FUNC_ST0) fincstp
+};
+
+void FPU_triga(void)
+{
+ (trig_table_a[FPU_rm]) (&st(0), FPU_gettag0());
+}
+
+static FUNC_ST0 const trig_table_b[] = {
+ fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, (FUNC_ST0) fsin, fcos
+};
+
+void FPU_trigb(void)
+{
+ (trig_table_b[FPU_rm]) (&st(0), FPU_gettag0());
+}
diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c
new file mode 100644
index 000000000..b82ca14ba
--- /dev/null
+++ b/arch/x86/math-emu/get_address.c
@@ -0,0 +1,401 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | get_address.c |
+ | |
+ | Get the effective address from an FPU instruction. |
+ | |
+ | Copyright (C) 1992,1993,1994,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@suburbia.net |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------+
+ | Note: |
+ | The file contains code which accesses user memory. |
+ | Emulator static data may change when user memory is accessed, due to |
+ | other processes using the emulator while swapping is in progress. |
+ +---------------------------------------------------------------------------*/
+
+#include <linux/stddef.h>
+
+#include <linux/uaccess.h>
+#include <asm/vm86.h>
+
+#include "fpu_system.h"
+#include "exception.h"
+#include "fpu_emu.h"
+
+#define FPU_WRITE_BIT 0x10
+
+static int reg_offset[] = {
+ offsetof(struct pt_regs, ax),
+ offsetof(struct pt_regs, cx),
+ offsetof(struct pt_regs, dx),
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, sp),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di)
+};
+
+#define REG_(x) (*(long *)(reg_offset[(x)] + (u_char *)FPU_info->regs))
+
+static int reg_offset_vm86[] = {
+ offsetof(struct pt_regs, cs),
+ offsetof(struct kernel_vm86_regs, ds),
+ offsetof(struct kernel_vm86_regs, es),
+ offsetof(struct kernel_vm86_regs, fs),
+ offsetof(struct kernel_vm86_regs, gs),
+ offsetof(struct pt_regs, ss),
+ offsetof(struct kernel_vm86_regs, ds)
+};
+
+#define VM86_REG_(x) (*(unsigned short *) \
+ (reg_offset_vm86[((unsigned)x)] + (u_char *)FPU_info->regs))
+
+static int reg_offset_pm[] = {
+ offsetof(struct pt_regs, cs),
+ offsetof(struct pt_regs, ds),
+ offsetof(struct pt_regs, es),
+ offsetof(struct pt_regs, fs),
+ offsetof(struct pt_regs, ds), /* dummy, not saved on stack */
+ offsetof(struct pt_regs, ss),
+ offsetof(struct pt_regs, ds)
+};
+
+#define PM_REG_(x) (*(unsigned short *) \
+ (reg_offset_pm[((unsigned)x)] + (u_char *)FPU_info->regs))
+
+/* Decode the SIB byte. This function assumes mod != 0 */
+static int sib(int mod, unsigned long *fpu_eip)
+{
+ u_char ss, index, base;
+ long offset;
+
+ RE_ENTRANT_CHECK_OFF;
+ FPU_code_access_ok(1);
+ FPU_get_user(base, (u_char __user *) (*fpu_eip)); /* The SIB byte */
+ RE_ENTRANT_CHECK_ON;
+ (*fpu_eip)++;
+ ss = base >> 6;
+ index = (base >> 3) & 7;
+ base &= 7;
+
+ if ((mod == 0) && (base == 5))
+ offset = 0; /* No base register */
+ else
+ offset = REG_(base);
+
+ if (index == 4) {
+ /* No index register */
+ /* A non-zero ss is illegal */
+ if (ss)
+ EXCEPTION(EX_Invalid);
+ } else {
+ offset += (REG_(index)) << ss;
+ }
+
+ if (mod == 1) {
+ /* 8 bit signed displacement */
+ long displacement;
+ RE_ENTRANT_CHECK_OFF;
+ FPU_code_access_ok(1);
+ FPU_get_user(displacement, (signed char __user *)(*fpu_eip));
+ offset += displacement;
+ RE_ENTRANT_CHECK_ON;
+ (*fpu_eip)++;
+ } else if (mod == 2 || base == 5) { /* The second condition also has mod==0 */
+ /* 32 bit displacement */
+ long displacement;
+ RE_ENTRANT_CHECK_OFF;
+ FPU_code_access_ok(4);
+ FPU_get_user(displacement, (long __user *)(*fpu_eip));
+ offset += displacement;
+ RE_ENTRANT_CHECK_ON;
+ (*fpu_eip) += 4;
+ }
+
+ return offset;
+}
+
+static unsigned long vm86_segment(u_char segment, struct address *addr)
+{
+ segment--;
+#ifdef PARANOID
+ if (segment > PREFIX_SS_) {
+ EXCEPTION(EX_INTERNAL | 0x130);
+ math_abort(FPU_info, SIGSEGV);
+ }
+#endif /* PARANOID */
+ addr->selector = VM86_REG_(segment);
+ return (unsigned long)VM86_REG_(segment) << 4;
+}
+
+/* This should work for 16 and 32 bit protected mode. */
+static long pm_address(u_char FPU_modrm, u_char segment,
+ struct address *addr, long offset)
+{
+ struct desc_struct descriptor;
+ unsigned long base_address, limit, address, seg_top;
+
+ segment--;
+
+#ifdef PARANOID
+ /* segment is unsigned, so this also detects if segment was 0: */
+ if (segment > PREFIX_SS_) {
+ EXCEPTION(EX_INTERNAL | 0x132);
+ math_abort(FPU_info, SIGSEGV);
+ }
+#endif /* PARANOID */
+
+ switch (segment) {
+ case PREFIX_GS_ - 1:
+ /* user gs handling can be lazy, use special accessors */
+ addr->selector = get_user_gs(FPU_info->regs);
+ break;
+ default:
+ addr->selector = PM_REG_(segment);
+ }
+
+ descriptor = FPU_get_ldt_descriptor(addr->selector);
+ base_address = seg_get_base(&descriptor);
+ address = base_address + offset;
+ limit = seg_get_limit(&descriptor) + 1;
+ limit *= seg_get_granularity(&descriptor);
+ limit += base_address - 1;
+ if (limit < base_address)
+ limit = 0xffffffff;
+
+ if (seg_expands_down(&descriptor)) {
+ if (descriptor.g) {
+ seg_top = 0xffffffff;
+ } else {
+ seg_top = base_address + (1 << 20);
+ if (seg_top < base_address)
+ seg_top = 0xffffffff;
+ }
+ access_limit =
+ (address <= limit) || (address >= seg_top) ? 0 :
+ ((seg_top - address) >= 255 ? 255 : seg_top - address);
+ } else {
+ access_limit =
+ (address > limit) || (address < base_address) ? 0 :
+ ((limit - address) >= 254 ? 255 : limit - address + 1);
+ }
+ if (seg_execute_only(&descriptor) ||
+ (!seg_writable(&descriptor) && (FPU_modrm & FPU_WRITE_BIT))) {
+ access_limit = 0;
+ }
+ return address;
+}
+
+/*
+ MOD R/M byte: MOD == 3 has a special use for the FPU
+ SIB byte used iff R/M = 100b
+
+ 7 6 5 4 3 2 1 0
+ ..... ......... .........
+ MOD OPCODE(2) R/M
+
+ SIB byte
+
+ 7 6 5 4 3 2 1 0
+ ..... ......... .........
+ SS INDEX BASE
+
+*/
+
+void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip,
+ struct address *addr, fpu_addr_modes addr_modes)
+{
+ u_char mod;
+ unsigned rm = FPU_modrm & 7;
+ long *cpu_reg_ptr;
+ int address = 0; /* Initialized just to stop compiler warnings. */
+
+ /* Memory accessed via the cs selector is write protected
+ in `non-segmented' 32 bit protected mode. */
+ if (!addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT)
+ && (addr_modes.override.segment == PREFIX_CS_)) {
+ math_abort(FPU_info, SIGSEGV);
+ }
+
+ addr->selector = FPU_DS; /* Default, for 32 bit non-segmented mode. */
+
+ mod = (FPU_modrm >> 6) & 3;
+
+ if (rm == 4 && mod != 3) {
+ address = sib(mod, fpu_eip);
+ } else {
+ cpu_reg_ptr = &REG_(rm);
+ switch (mod) {
+ case 0:
+ if (rm == 5) {
+ /* Special case: disp32 */
+ RE_ENTRANT_CHECK_OFF;
+ FPU_code_access_ok(4);
+ FPU_get_user(address,
+ (unsigned long __user
+ *)(*fpu_eip));
+ (*fpu_eip) += 4;
+ RE_ENTRANT_CHECK_ON;
+ addr->offset = address;
+ return (void __user *)address;
+ } else {
+ address = *cpu_reg_ptr; /* Just return the contents
+ of the cpu register */
+ addr->offset = address;
+ return (void __user *)address;
+ }
+ case 1:
+ /* 8 bit signed displacement */
+ RE_ENTRANT_CHECK_OFF;
+ FPU_code_access_ok(1);
+ FPU_get_user(address, (signed char __user *)(*fpu_eip));
+ RE_ENTRANT_CHECK_ON;
+ (*fpu_eip)++;
+ break;
+ case 2:
+ /* 32 bit displacement */
+ RE_ENTRANT_CHECK_OFF;
+ FPU_code_access_ok(4);
+ FPU_get_user(address, (long __user *)(*fpu_eip));
+ (*fpu_eip) += 4;
+ RE_ENTRANT_CHECK_ON;
+ break;
+ case 3:
+ /* Not legal for the FPU */
+ EXCEPTION(EX_Invalid);
+ }
+ address += *cpu_reg_ptr;
+ }
+
+ addr->offset = address;
+
+ switch (addr_modes.default_mode) {
+ case 0:
+ break;
+ case VM86:
+ address += vm86_segment(addr_modes.override.segment, addr);
+ break;
+ case PM16:
+ case SEG32:
+ address = pm_address(FPU_modrm, addr_modes.override.segment,
+ addr, address);
+ break;
+ default:
+ EXCEPTION(EX_INTERNAL | 0x133);
+ }
+
+ return (void __user *)address;
+}
+
+void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip,
+ struct address *addr, fpu_addr_modes addr_modes)
+{
+ u_char mod;
+ unsigned rm = FPU_modrm & 7;
+ int address = 0; /* Default used for mod == 0 */
+
+ /* Memory accessed via the cs selector is write protected
+ in `non-segmented' 32 bit protected mode. */
+ if (!addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT)
+ && (addr_modes.override.segment == PREFIX_CS_)) {
+ math_abort(FPU_info, SIGSEGV);
+ }
+
+ addr->selector = FPU_DS; /* Default, for 32 bit non-segmented mode. */
+
+ mod = (FPU_modrm >> 6) & 3;
+
+ switch (mod) {
+ case 0:
+ if (rm == 6) {
+ /* Special case: disp16 */
+ RE_ENTRANT_CHECK_OFF;
+ FPU_code_access_ok(2);
+ FPU_get_user(address,
+ (unsigned short __user *)(*fpu_eip));
+ (*fpu_eip) += 2;
+ RE_ENTRANT_CHECK_ON;
+ goto add_segment;
+ }
+ break;
+ case 1:
+ /* 8 bit signed displacement */
+ RE_ENTRANT_CHECK_OFF;
+ FPU_code_access_ok(1);
+ FPU_get_user(address, (signed char __user *)(*fpu_eip));
+ RE_ENTRANT_CHECK_ON;
+ (*fpu_eip)++;
+ break;
+ case 2:
+ /* 16 bit displacement */
+ RE_ENTRANT_CHECK_OFF;
+ FPU_code_access_ok(2);
+ FPU_get_user(address, (unsigned short __user *)(*fpu_eip));
+ (*fpu_eip) += 2;
+ RE_ENTRANT_CHECK_ON;
+ break;
+ case 3:
+ /* Not legal for the FPU */
+ EXCEPTION(EX_Invalid);
+ break;
+ }
+ switch (rm) {
+ case 0:
+ address += FPU_info->regs->bx + FPU_info->regs->si;
+ break;
+ case 1:
+ address += FPU_info->regs->bx + FPU_info->regs->di;
+ break;
+ case 2:
+ address += FPU_info->regs->bp + FPU_info->regs->si;
+ if (addr_modes.override.segment == PREFIX_DEFAULT)
+ addr_modes.override.segment = PREFIX_SS_;
+ break;
+ case 3:
+ address += FPU_info->regs->bp + FPU_info->regs->di;
+ if (addr_modes.override.segment == PREFIX_DEFAULT)
+ addr_modes.override.segment = PREFIX_SS_;
+ break;
+ case 4:
+ address += FPU_info->regs->si;
+ break;
+ case 5:
+ address += FPU_info->regs->di;
+ break;
+ case 6:
+ address += FPU_info->regs->bp;
+ if (addr_modes.override.segment == PREFIX_DEFAULT)
+ addr_modes.override.segment = PREFIX_SS_;
+ break;
+ case 7:
+ address += FPU_info->regs->bx;
+ break;
+ }
+
+ add_segment:
+ address &= 0xffff;
+
+ addr->offset = address;
+
+ switch (addr_modes.default_mode) {
+ case 0:
+ break;
+ case VM86:
+ address += vm86_segment(addr_modes.override.segment, addr);
+ break;
+ case PM16:
+ case SEG32:
+ address = pm_address(FPU_modrm, addr_modes.override.segment,
+ addr, address);
+ break;
+ default:
+ EXCEPTION(EX_INTERNAL | 0x131);
+ }
+
+ return (void __user *)address;
+}
diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c
new file mode 100644
index 000000000..f821a9cd7
--- /dev/null
+++ b/arch/x86/math-emu/load_store.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | load_store.c |
+ | |
+ | This file contains most of the code to interpret the FPU instructions |
+ | which load and store from user memory. |
+ | |
+ | Copyright (C) 1992,1993,1994,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@suburbia.net |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------+
+ | Note: |
+ | The file contains code which accesses user memory. |
+ | Emulator static data may change when user memory is accessed, due to |
+ | other processes using the emulator while swapping is in progress. |
+ +---------------------------------------------------------------------------*/
+
+#include <linux/uaccess.h>
+
+#include "fpu_system.h"
+#include "exception.h"
+#include "fpu_emu.h"
+#include "status_w.h"
+#include "control_w.h"
+
+#define _NONE_ 0 /* st0_ptr etc not needed */
+#define _REG0_ 1 /* Will be storing st(0) */
+#define _PUSH_ 3 /* Need to check for space to push onto stack */
+#define _null_ 4 /* Function illegal or not implemented */
+
+#define pop_0() { FPU_settag0(TAG_Empty); top++; }
+
+/* index is a 5-bit value: (3-bit FPU_modrm.reg field | opcode[2,1]) */
+static u_char const type_table[32] = {
+ _PUSH_, _PUSH_, _PUSH_, _PUSH_, /* /0: d9:fld f32, db:fild m32, dd:fld f64, df:fild m16 */
+ _null_, _REG0_, _REG0_, _REG0_, /* /1: d9:undef, db,dd,df:fisttp m32/64/16 */
+ _REG0_, _REG0_, _REG0_, _REG0_, /* /2: d9:fst f32, db:fist m32, dd:fst f64, df:fist m16 */
+ _REG0_, _REG0_, _REG0_, _REG0_, /* /3: d9:fstp f32, db:fistp m32, dd:fstp f64, df:fistp m16 */
+ _NONE_, _null_, _NONE_, _PUSH_,
+ _NONE_, _PUSH_, _null_, _PUSH_,
+ _NONE_, _null_, _NONE_, _REG0_,
+ _NONE_, _REG0_, _NONE_, _REG0_
+};
+
+u_char const data_sizes_16[32] = {
+ 4, 4, 8, 2,
+ 0, 4, 8, 2, /* /1: d9:undef, db,dd,df:fisttp */
+ 4, 4, 8, 2,
+ 4, 4, 8, 2,
+ 14, 0, 94, 10, 2, 10, 0, 8,
+ 14, 0, 94, 10, 2, 10, 2, 8
+};
+
+static u_char const data_sizes_32[32] = {
+ 4, 4, 8, 2,
+ 0, 4, 8, 2, /* /1: d9:undef, db,dd,df:fisttp */
+ 4, 4, 8, 2,
+ 4, 4, 8, 2,
+ 28, 0, 108, 10, 2, 10, 0, 8,
+ 28, 0, 108, 10, 2, 10, 2, 8
+};
+
+int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
+ void __user * data_address)
+{
+ FPU_REG loaded_data;
+ FPU_REG *st0_ptr;
+ u_char st0_tag = TAG_Empty; /* This is just to stop a gcc warning. */
+ u_char loaded_tag;
+ int sv_cw;
+
+ st0_ptr = NULL; /* Initialized just to stop compiler warnings. */
+
+ if (addr_modes.default_mode & PROTECTED) {
+ if (addr_modes.default_mode == SEG32) {
+ if (access_limit < data_sizes_32[type])
+ math_abort(FPU_info, SIGSEGV);
+ } else if (addr_modes.default_mode == PM16) {
+ if (access_limit < data_sizes_16[type])
+ math_abort(FPU_info, SIGSEGV);
+ }
+#ifdef PARANOID
+ else
+ EXCEPTION(EX_INTERNAL | 0x140);
+#endif /* PARANOID */
+ }
+
+ switch (type_table[type]) {
+ case _NONE_:
+ break;
+ case _REG0_:
+ st0_ptr = &st(0); /* Some of these instructions pop after
+ storing */
+ st0_tag = FPU_gettag0();
+ break;
+ case _PUSH_:
+ {
+ if (FPU_gettagi(-1) != TAG_Empty) {
+ FPU_stack_overflow();
+ return 0;
+ }
+ top--;
+ st0_ptr = &st(0);
+ }
+ break;
+ case _null_:
+ FPU_illegal();
+ return 0;
+#ifdef PARANOID
+ default:
+ EXCEPTION(EX_INTERNAL | 0x141);
+ return 0;
+#endif /* PARANOID */
+ }
+
+ switch (type) {
+ /* type is a 5-bit value: (3-bit FPU_modrm.reg field | opcode[2,1]) */
+ case 000: /* fld m32real (d9 /0) */
+ clear_C1();
+ loaded_tag =
+ FPU_load_single((float __user *)data_address, &loaded_data);
+ if ((loaded_tag == TAG_Special)
+ && isNaN(&loaded_data)
+ && (real_1op_NaN(&loaded_data) < 0)) {
+ top++;
+ break;
+ }
+ FPU_copy_to_reg0(&loaded_data, loaded_tag);
+ break;
+ case 001: /* fild m32int (db /0) */
+ clear_C1();
+ loaded_tag =
+ FPU_load_int32((long __user *)data_address, &loaded_data);
+ FPU_copy_to_reg0(&loaded_data, loaded_tag);
+ break;
+ case 002: /* fld m64real (dd /0) */
+ clear_C1();
+ loaded_tag =
+ FPU_load_double((double __user *)data_address,
+ &loaded_data);
+ if ((loaded_tag == TAG_Special)
+ && isNaN(&loaded_data)
+ && (real_1op_NaN(&loaded_data) < 0)) {
+ top++;
+ break;
+ }
+ FPU_copy_to_reg0(&loaded_data, loaded_tag);
+ break;
+ case 003: /* fild m16int (df /0) */
+ clear_C1();
+ loaded_tag =
+ FPU_load_int16((short __user *)data_address, &loaded_data);
+ FPU_copy_to_reg0(&loaded_data, loaded_tag);
+ break;
+ /* case 004: undefined (d9 /1) */
+ /* fisttp are enabled if CPUID(1).ECX(0) "sse3" is set */
+ case 005: /* fisttp m32int (db /1) */
+ clear_C1();
+ sv_cw = control_word;
+ control_word |= RC_CHOP;
+ if (FPU_store_int32
+ (st0_ptr, st0_tag, (long __user *)data_address))
+ pop_0(); /* pop only if the number was actually stored
+ (see the 80486 manual p16-28) */
+ control_word = sv_cw;
+ break;
+ case 006: /* fisttp m64int (dd /1) */
+ clear_C1();
+ sv_cw = control_word;
+ control_word |= RC_CHOP;
+ if (FPU_store_int64
+ (st0_ptr, st0_tag, (long long __user *)data_address))
+ pop_0(); /* pop only if the number was actually stored
+ (see the 80486 manual p16-28) */
+ control_word = sv_cw;
+ break;
+ case 007: /* fisttp m16int (df /1) */
+ clear_C1();
+ sv_cw = control_word;
+ control_word |= RC_CHOP;
+ if (FPU_store_int16
+ (st0_ptr, st0_tag, (short __user *)data_address))
+ pop_0(); /* pop only if the number was actually stored
+ (see the 80486 manual p16-28) */
+ control_word = sv_cw;
+ break;
+ case 010: /* fst m32real */
+ clear_C1();
+ FPU_store_single(st0_ptr, st0_tag,
+ (float __user *)data_address);
+ break;
+ case 011: /* fist m32int */
+ clear_C1();
+ FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address);
+ break;
+ case 012: /* fst m64real */
+ clear_C1();
+ FPU_store_double(st0_ptr, st0_tag,
+ (double __user *)data_address);
+ break;
+ case 013: /* fist m16int */
+ clear_C1();
+ FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address);
+ break;
+ case 014: /* fstp m32real */
+ clear_C1();
+ if (FPU_store_single
+ (st0_ptr, st0_tag, (float __user *)data_address))
+ pop_0(); /* pop only if the number was actually stored
+ (see the 80486 manual p16-28) */
+ break;
+ case 015: /* fistp m32int */
+ clear_C1();
+ if (FPU_store_int32
+ (st0_ptr, st0_tag, (long __user *)data_address))
+ pop_0(); /* pop only if the number was actually stored
+ (see the 80486 manual p16-28) */
+ break;
+ case 016: /* fstp m64real */
+ clear_C1();
+ if (FPU_store_double
+ (st0_ptr, st0_tag, (double __user *)data_address))
+ pop_0(); /* pop only if the number was actually stored
+ (see the 80486 manual p16-28) */
+ break;
+ case 017: /* fistp m16int */
+ clear_C1();
+ if (FPU_store_int16
+ (st0_ptr, st0_tag, (short __user *)data_address))
+ pop_0(); /* pop only if the number was actually stored
+ (see the 80486 manual p16-28) */
+ break;
+ case 020: /* fldenv m14/28byte */
+ fldenv(addr_modes, (u_char __user *) data_address);
+ /* Ensure that the values just loaded are not changed by
+ fix-up operations. */
+ return 1;
+ case 022: /* frstor m94/108byte */
+ frstor(addr_modes, (u_char __user *) data_address);
+ /* Ensure that the values just loaded are not changed by
+ fix-up operations. */
+ return 1;
+ case 023: /* fbld m80dec */
+ clear_C1();
+ loaded_tag = FPU_load_bcd((u_char __user *) data_address);
+ FPU_settag0(loaded_tag);
+ break;
+ case 024: /* fldcw */
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_READ, data_address, 2);
+ FPU_get_user(control_word,
+ (unsigned short __user *)data_address);
+ RE_ENTRANT_CHECK_ON;
+ if (partial_status & ~control_word & CW_Exceptions)
+ partial_status |= (SW_Summary | SW_Backward);
+ else
+ partial_status &= ~(SW_Summary | SW_Backward);
+#ifdef PECULIAR_486
+ control_word |= 0x40; /* An 80486 appears to always set this bit */
+#endif /* PECULIAR_486 */
+ return 1;
+ case 025: /* fld m80real */
+ clear_C1();
+ loaded_tag =
+ FPU_load_extended((long double __user *)data_address, 0);
+ FPU_settag0(loaded_tag);
+ break;
+ case 027: /* fild m64int */
+ clear_C1();
+ loaded_tag = FPU_load_int64((long long __user *)data_address);
+ if (loaded_tag == TAG_Error)
+ return 0;
+ FPU_settag0(loaded_tag);
+ break;
+ case 030: /* fstenv m14/28byte */
+ fstenv(addr_modes, (u_char __user *) data_address);
+ return 1;
+ case 032: /* fsave */
+ fsave(addr_modes, (u_char __user *) data_address);
+ return 1;
+ case 033: /* fbstp m80dec */
+ clear_C1();
+ if (FPU_store_bcd
+ (st0_ptr, st0_tag, (u_char __user *) data_address))
+ pop_0(); /* pop only if the number was actually stored
+ (see the 80486 manual p16-28) */
+ break;
+ case 034: /* fstcw m16int */
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_WRITE, data_address, 2);
+ FPU_put_user(control_word,
+ (unsigned short __user *)data_address);
+ RE_ENTRANT_CHECK_ON;
+ return 1;
+ case 035: /* fstp m80real */
+ clear_C1();
+ if (FPU_store_extended
+ (st0_ptr, st0_tag, (long double __user *)data_address))
+ pop_0(); /* pop only if the number was actually stored
+ (see the 80486 manual p16-28) */
+ break;
+ case 036: /* fstsw m2byte */
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_WRITE, data_address, 2);
+ FPU_put_user(status_word(),
+ (unsigned short __user *)data_address);
+ RE_ENTRANT_CHECK_ON;
+ return 1;
+ case 037: /* fistp m64int */
+ clear_C1();
+ if (FPU_store_int64
+ (st0_ptr, st0_tag, (long long __user *)data_address))
+ pop_0(); /* pop only if the number was actually stored
+ (see the 80486 manual p16-28) */
+ break;
+ }
+ return 0;
+}
diff --git a/arch/x86/math-emu/mul_Xsig.S b/arch/x86/math-emu/mul_Xsig.S
new file mode 100644
index 000000000..3e489122a
--- /dev/null
+++ b/arch/x86/math-emu/mul_Xsig.S
@@ -0,0 +1,179 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*---------------------------------------------------------------------------+
+ | mul_Xsig.S |
+ | |
+ | Multiply a 12 byte fixed point number by another fixed point number. |
+ | |
+ | Copyright (C) 1992,1994,1995 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@jacobi.maths.monash.edu.au |
+ | |
+ | Call from C as: |
+ | void mul32_Xsig(Xsig *x, unsigned b) |
+ | |
+ | void mul64_Xsig(Xsig *x, unsigned long long *b) |
+ | |
+ | void mul_Xsig_Xsig(Xsig *x, unsigned *b) |
+ | |
+ | The result is neither rounded nor normalized, and the ls bit or so may |
+ | be wrong. |
+ | |
+ +---------------------------------------------------------------------------*/
+ .file "mul_Xsig.S"
+
+
+#include "fpu_emu.h"
+
+.text
+ENTRY(mul32_Xsig)
+ pushl %ebp
+ movl %esp,%ebp
+ subl $16,%esp
+ pushl %esi
+
+ movl PARAM1,%esi
+ movl PARAM2,%ecx
+
+ xor %eax,%eax
+ movl %eax,-4(%ebp)
+ movl %eax,-8(%ebp)
+
+ movl (%esi),%eax /* lsl of Xsig */
+ mull %ecx /* msl of b */
+ movl %edx,-12(%ebp)
+
+ movl 4(%esi),%eax /* midl of Xsig */
+ mull %ecx /* msl of b */
+ addl %eax,-12(%ebp)
+ adcl %edx,-8(%ebp)
+ adcl $0,-4(%ebp)
+
+ movl 8(%esi),%eax /* msl of Xsig */
+ mull %ecx /* msl of b */
+ addl %eax,-8(%ebp)
+ adcl %edx,-4(%ebp)
+
+ movl -12(%ebp),%eax
+ movl %eax,(%esi)
+ movl -8(%ebp),%eax
+ movl %eax,4(%esi)
+ movl -4(%ebp),%eax
+ movl %eax,8(%esi)
+
+ popl %esi
+ leave
+ ret
+ENDPROC(mul32_Xsig)
+
+
+ENTRY(mul64_Xsig)
+ pushl %ebp
+ movl %esp,%ebp
+ subl $16,%esp
+ pushl %esi
+
+ movl PARAM1,%esi
+ movl PARAM2,%ecx
+
+ xor %eax,%eax
+ movl %eax,-4(%ebp)
+ movl %eax,-8(%ebp)
+
+ movl (%esi),%eax /* lsl of Xsig */
+ mull 4(%ecx) /* msl of b */
+ movl %edx,-12(%ebp)
+
+ movl 4(%esi),%eax /* midl of Xsig */
+ mull (%ecx) /* lsl of b */
+ addl %edx,-12(%ebp)
+ adcl $0,-8(%ebp)
+ adcl $0,-4(%ebp)
+
+ movl 4(%esi),%eax /* midl of Xsig */
+ mull 4(%ecx) /* msl of b */
+ addl %eax,-12(%ebp)
+ adcl %edx,-8(%ebp)
+ adcl $0,-4(%ebp)
+
+ movl 8(%esi),%eax /* msl of Xsig */
+ mull (%ecx) /* lsl of b */
+ addl %eax,-12(%ebp)
+ adcl %edx,-8(%ebp)
+ adcl $0,-4(%ebp)
+
+ movl 8(%esi),%eax /* msl of Xsig */
+ mull 4(%ecx) /* msl of b */
+ addl %eax,-8(%ebp)
+ adcl %edx,-4(%ebp)
+
+ movl -12(%ebp),%eax
+ movl %eax,(%esi)
+ movl -8(%ebp),%eax
+ movl %eax,4(%esi)
+ movl -4(%ebp),%eax
+ movl %eax,8(%esi)
+
+ popl %esi
+ leave
+ ret
+ENDPROC(mul64_Xsig)
+
+
+
+ENTRY(mul_Xsig_Xsig)
+ pushl %ebp
+ movl %esp,%ebp
+ subl $16,%esp
+ pushl %esi
+
+ movl PARAM1,%esi
+ movl PARAM2,%ecx
+
+ xor %eax,%eax
+ movl %eax,-4(%ebp)
+ movl %eax,-8(%ebp)
+
+ movl (%esi),%eax /* lsl of Xsig */
+ mull 8(%ecx) /* msl of b */
+ movl %edx,-12(%ebp)
+
+ movl 4(%esi),%eax /* midl of Xsig */
+ mull 4(%ecx) /* midl of b */
+ addl %edx,-12(%ebp)
+ adcl $0,-8(%ebp)
+ adcl $0,-4(%ebp)
+
+ movl 8(%esi),%eax /* msl of Xsig */
+ mull (%ecx) /* lsl of b */
+ addl %edx,-12(%ebp)
+ adcl $0,-8(%ebp)
+ adcl $0,-4(%ebp)
+
+ movl 4(%esi),%eax /* midl of Xsig */
+ mull 8(%ecx) /* msl of b */
+ addl %eax,-12(%ebp)
+ adcl %edx,-8(%ebp)
+ adcl $0,-4(%ebp)
+
+ movl 8(%esi),%eax /* msl of Xsig */
+ mull 4(%ecx) /* midl of b */
+ addl %eax,-12(%ebp)
+ adcl %edx,-8(%ebp)
+ adcl $0,-4(%ebp)
+
+ movl 8(%esi),%eax /* msl of Xsig */
+ mull 8(%ecx) /* msl of b */
+ addl %eax,-8(%ebp)
+ adcl %edx,-4(%ebp)
+
+ movl -12(%ebp),%edx
+ movl %edx,(%esi)
+ movl -8(%ebp),%edx
+ movl %edx,4(%esi)
+ movl -4(%ebp),%edx
+ movl %edx,8(%esi)
+
+ popl %esi
+ leave
+ ret
+ENDPROC(mul_Xsig_Xsig)
diff --git a/arch/x86/math-emu/poly.h b/arch/x86/math-emu/poly.h
new file mode 100644
index 000000000..fc1c887ca
--- /dev/null
+++ b/arch/x86/math-emu/poly.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*---------------------------------------------------------------------------+
+ | poly.h |
+ | |
+ | Header file for the FPU-emu poly*.c source files. |
+ | |
+ | Copyright (C) 1994,1999 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@melbpc.org.au |
+ | |
+ | Declarations and definitions for functions operating on Xsig (12-byte |
+ | extended-significand) quantities. |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#ifndef _POLY_H
+#define _POLY_H
+
+/* This 12-byte structure is used to improve the accuracy of computation
+ of transcendental functions.
+ Intended to be used to get results better than 8-byte computation
+ allows. 9-byte would probably be sufficient.
+ */
+typedef struct {
+ unsigned long lsw;
+ unsigned long midw;
+ unsigned long msw;
+} Xsig;
+
+asmlinkage void mul64(unsigned long long const *a, unsigned long long const *b,
+ unsigned long long *result);
+asmlinkage void polynomial_Xsig(Xsig *, const unsigned long long *x,
+ const unsigned long long terms[], const int n);
+
+asmlinkage void mul32_Xsig(Xsig *, const unsigned long mult);
+asmlinkage void mul64_Xsig(Xsig *, const unsigned long long *mult);
+asmlinkage void mul_Xsig_Xsig(Xsig *dest, const Xsig *mult);
+
+asmlinkage void shr_Xsig(Xsig *, const int n);
+asmlinkage int round_Xsig(Xsig *);
+asmlinkage int norm_Xsig(Xsig *);
+asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, const Xsig *dest);
+
+/* Macro to extract the most significant 32 bits from a long long */
+#define LL_MSW(x) (((unsigned long *)&x)[1])
+
+/* Macro to initialize an Xsig struct */
+#define MK_XSIG(a,b,c) { c, b, a }
+
+/* Macro to access the 8 ms bytes of an Xsig as a long long */
+#define XSIG_LL(x) (*(unsigned long long *)&x.midw)
+
+/*
+ Need to run gcc with optimizations on to get these to
+ actually be in-line.
+ */
+
+/* Multiply two fixed-point 32 bit numbers, producing a 32 bit result.
+ The answer is the ms word of the product. */
+/* Some versions of gcc make it difficult to stop eax from being clobbered.
+ Merely specifying that it is used doesn't work...
+ */
+static inline unsigned long mul_32_32(const unsigned long arg1,
+ const unsigned long arg2)
+{
+ int retval;
+ asm volatile ("mull %2; movl %%edx,%%eax":"=a" (retval)
+ :"0"(arg1), "g"(arg2)
+ :"dx");
+ return retval;
+}
+
+/* Add the 12 byte Xsig x2 to Xsig dest, with no checks for overflow. */
+static inline void add_Xsig_Xsig(Xsig *dest, const Xsig *x2)
+{
+ asm volatile ("movl %1,%%edi; movl %2,%%esi;\n"
+ "movl (%%esi),%%eax; addl %%eax,(%%edi);\n"
+ "movl 4(%%esi),%%eax; adcl %%eax,4(%%edi);\n"
+ "movl 8(%%esi),%%eax; adcl %%eax,8(%%edi);\n":"=g"
+ (*dest):"g"(dest), "g"(x2)
+ :"ax", "si", "di");
+}
+
+/* Add the 12 byte Xsig x2 to Xsig dest, adjust exp if overflow occurs. */
+/* Note: the constraints in the asm statement didn't always work properly
+ with gcc 2.5.8. Changing from using edi to using ecx got around the
+ problem, but keep fingers crossed! */
+static inline void add_two_Xsig(Xsig *dest, const Xsig *x2, long int *exp)
+{
+ asm volatile ("movl %2,%%ecx; movl %3,%%esi;\n"
+ "movl (%%esi),%%eax; addl %%eax,(%%ecx);\n"
+ "movl 4(%%esi),%%eax; adcl %%eax,4(%%ecx);\n"
+ "movl 8(%%esi),%%eax; adcl %%eax,8(%%ecx);\n"
+ "jnc 0f;\n"
+ "rcrl 8(%%ecx); rcrl 4(%%ecx); rcrl (%%ecx)\n"
+ "movl %4,%%ecx; incl (%%ecx)\n"
+ "movl $1,%%eax; jmp 1f;\n"
+ "0: xorl %%eax,%%eax;\n" "1:\n":"=g" (*exp), "=g"(*dest)
+ :"g"(dest), "g"(x2), "g"(exp)
+ :"cx", "si", "ax");
+}
+
+/* Negate (subtract from 1.0) the 12 byte Xsig */
+/* This is faster in a loop on my 386 than using the "neg" instruction. */
+static inline void negate_Xsig(Xsig *x)
+{
+ asm volatile ("movl %1,%%esi;\n"
+ "xorl %%ecx,%%ecx;\n"
+ "movl %%ecx,%%eax; subl (%%esi),%%eax; movl %%eax,(%%esi);\n"
+ "movl %%ecx,%%eax; sbbl 4(%%esi),%%eax; movl %%eax,4(%%esi);\n"
+ "movl %%ecx,%%eax; sbbl 8(%%esi),%%eax; movl %%eax,8(%%esi);\n":"=g"
+ (*x):"g"(x):"si", "ax", "cx");
+}
+
+#endif /* _POLY_H */
diff --git a/arch/x86/math-emu/poly_2xm1.c b/arch/x86/math-emu/poly_2xm1.c
new file mode 100644
index 000000000..aa33006ba
--- /dev/null
+++ b/arch/x86/math-emu/poly_2xm1.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | poly_2xm1.c |
+ | |
+ | Function to compute 2^x-1 by a polynomial approximation. |
+ | |
+ | Copyright (C) 1992,1993,1994,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@suburbia.net |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#include "exception.h"
+#include "reg_constant.h"
+#include "fpu_emu.h"
+#include "fpu_system.h"
+#include "control_w.h"
+#include "poly.h"
+
+#define HIPOWER 11
+static const unsigned long long lterms[HIPOWER] = {
+ 0x0000000000000000LL, /* This term done separately as 12 bytes */
+ 0xf5fdeffc162c7543LL,
+ 0x1c6b08d704a0bfa6LL,
+ 0x0276556df749cc21LL,
+ 0x002bb0ffcf14f6b8LL,
+ 0x0002861225ef751cLL,
+ 0x00001ffcbfcd5422LL,
+ 0x00000162c005d5f1LL,
+ 0x0000000da96ccb1bLL,
+ 0x0000000078d1b897LL,
+ 0x000000000422b029LL
+};
+
+static const Xsig hiterm = MK_XSIG(0xb17217f7, 0xd1cf79ab, 0xc8a39194);
+
+/* Four slices: 0.0 : 0.25 : 0.50 : 0.75 : 1.0,
+ These numbers are 2^(1/4), 2^(1/2), and 2^(3/4)
+ */
+static const Xsig shiftterm0 = MK_XSIG(0, 0, 0);
+static const Xsig shiftterm1 = MK_XSIG(0x9837f051, 0x8db8a96f, 0x46ad2318);
+static const Xsig shiftterm2 = MK_XSIG(0xb504f333, 0xf9de6484, 0x597d89b3);
+static const Xsig shiftterm3 = MK_XSIG(0xd744fcca, 0xd69d6af4, 0x39a68bb9);
+
+static const Xsig *shiftterm[] = { &shiftterm0, &shiftterm1,
+ &shiftterm2, &shiftterm3
+};
+
+/*--- poly_2xm1() -----------------------------------------------------------+
+ | Requires st(0) which is TAG_Valid and < 1. |
+ +---------------------------------------------------------------------------*/
+int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result)
+{
+ long int exponent, shift;
+ unsigned long long Xll;
+ Xsig accumulator, Denom, argSignif;
+ u_char tag;
+
+ exponent = exponent16(arg);
+
+#ifdef PARANOID
+ if (exponent >= 0) { /* Don't want a |number| >= 1.0 */
+ /* Number negative, too large, or not Valid. */
+ EXCEPTION(EX_INTERNAL | 0x127);
+ return 1;
+ }
+#endif /* PARANOID */
+
+ argSignif.lsw = 0;
+ XSIG_LL(argSignif) = Xll = significand(arg);
+
+ if (exponent == -1) {
+ shift = (argSignif.msw & 0x40000000) ? 3 : 2;
+ /* subtract 0.5 or 0.75 */
+ exponent -= 2;
+ XSIG_LL(argSignif) <<= 2;
+ Xll <<= 2;
+ } else if (exponent == -2) {
+ shift = 1;
+ /* subtract 0.25 */
+ exponent--;
+ XSIG_LL(argSignif) <<= 1;
+ Xll <<= 1;
+ } else
+ shift = 0;
+
+ if (exponent < -2) {
+ /* Shift the argument right by the required places. */
+ if (FPU_shrx(&Xll, -2 - exponent) >= 0x80000000U)
+ Xll++; /* round up */
+ }
+
+ accumulator.lsw = accumulator.midw = accumulator.msw = 0;
+ polynomial_Xsig(&accumulator, &Xll, lterms, HIPOWER - 1);
+ mul_Xsig_Xsig(&accumulator, &argSignif);
+ shr_Xsig(&accumulator, 3);
+
+ mul_Xsig_Xsig(&argSignif, &hiterm); /* The leading term */
+ add_two_Xsig(&accumulator, &argSignif, &exponent);
+
+ if (shift) {
+ /* The argument is large, use the identity:
+ f(x+a) = f(a) * (f(x) + 1) - 1;
+ */
+ shr_Xsig(&accumulator, -exponent);
+ accumulator.msw |= 0x80000000; /* add 1.0 */
+ mul_Xsig_Xsig(&accumulator, shiftterm[shift]);
+ accumulator.msw &= 0x3fffffff; /* subtract 1.0 */
+ exponent = 1;
+ }
+
+ if (sign != SIGN_POS) {
+ /* The argument is negative, use the identity:
+ f(-x) = -f(x) / (1 + f(x))
+ */
+ Denom.lsw = accumulator.lsw;
+ XSIG_LL(Denom) = XSIG_LL(accumulator);
+ if (exponent < 0)
+ shr_Xsig(&Denom, -exponent);
+ else if (exponent > 0) {
+ /* exponent must be 1 here */
+ XSIG_LL(Denom) <<= 1;
+ if (Denom.lsw & 0x80000000)
+ XSIG_LL(Denom) |= 1;
+ (Denom.lsw) <<= 1;
+ }
+ Denom.msw |= 0x80000000; /* add 1.0 */
+ div_Xsig(&accumulator, &Denom, &accumulator);
+ }
+
+ /* Convert to 64 bit signed-compatible */
+ exponent += round_Xsig(&accumulator);
+
+ result = &st(0);
+ significand(result) = XSIG_LL(accumulator);
+ setexponent16(result, exponent);
+
+ tag = FPU_round(result, 1, 0, FULL_PRECISION, sign);
+
+ setsign(result, sign);
+ FPU_settag0(tag);
+
+ return 0;
+
+}
diff --git a/arch/x86/math-emu/poly_atan.c b/arch/x86/math-emu/poly_atan.c
new file mode 100644
index 000000000..7e7412c5a
--- /dev/null
+++ b/arch/x86/math-emu/poly_atan.c
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | poly_atan.c |
+ | |
+ | Compute the arctan of a FPU_REG, using a polynomial approximation. |
+ | |
+ | Copyright (C) 1992,1993,1994,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@suburbia.net |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#include "exception.h"
+#include "reg_constant.h"
+#include "fpu_emu.h"
+#include "fpu_system.h"
+#include "status_w.h"
+#include "control_w.h"
+#include "poly.h"
+
+#define HIPOWERon 6 /* odd poly, negative terms */
+static const unsigned long long oddnegterms[HIPOWERon] = {
+ 0x0000000000000000LL, /* Dummy (not for - 1.0) */
+ 0x015328437f756467LL,
+ 0x0005dda27b73dec6LL,
+ 0x0000226bf2bfb91aLL,
+ 0x000000ccc439c5f7LL,
+ 0x0000000355438407LL
+};
+
+#define HIPOWERop 6 /* odd poly, positive terms */
+static const unsigned long long oddplterms[HIPOWERop] = {
+/* 0xaaaaaaaaaaaaaaabLL, transferred to fixedpterm[] */
+ 0x0db55a71875c9ac2LL,
+ 0x0029fce2d67880b0LL,
+ 0x0000dfd3908b4596LL,
+ 0x00000550fd61dab4LL,
+ 0x0000001c9422b3f9LL,
+ 0x000000003e3301e1LL
+};
+
+static const unsigned long long denomterm = 0xebd9b842c5c53a0eLL;
+
+static const Xsig fixedpterm = MK_XSIG(0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa);
+
+static const Xsig pi_signif = MK_XSIG(0xc90fdaa2, 0x2168c234, 0xc4c6628b);
+
+/*--- poly_atan() -----------------------------------------------------------+
+ | |
+ +---------------------------------------------------------------------------*/
+void poly_atan(FPU_REG *st0_ptr, u_char st0_tag,
+ FPU_REG *st1_ptr, u_char st1_tag)
+{
+ u_char transformed, inverted, sign1, sign2;
+ int exponent;
+ long int dummy_exp;
+ Xsig accumulator, Numer, Denom, accumulatore, argSignif, argSq, argSqSq;
+ u_char tag;
+
+ sign1 = getsign(st0_ptr);
+ sign2 = getsign(st1_ptr);
+ if (st0_tag == TAG_Valid) {
+ exponent = exponent(st0_ptr);
+ } else {
+ /* This gives non-compatible stack contents... */
+ FPU_to_exp16(st0_ptr, st0_ptr);
+ exponent = exponent16(st0_ptr);
+ }
+ if (st1_tag == TAG_Valid) {
+ exponent -= exponent(st1_ptr);
+ } else {
+ /* This gives non-compatible stack contents... */
+ FPU_to_exp16(st1_ptr, st1_ptr);
+ exponent -= exponent16(st1_ptr);
+ }
+
+ if ((exponent < 0) || ((exponent == 0) &&
+ ((st0_ptr->sigh < st1_ptr->sigh) ||
+ ((st0_ptr->sigh == st1_ptr->sigh) &&
+ (st0_ptr->sigl < st1_ptr->sigl))))) {
+ inverted = 1;
+ Numer.lsw = Denom.lsw = 0;
+ XSIG_LL(Numer) = significand(st0_ptr);
+ XSIG_LL(Denom) = significand(st1_ptr);
+ } else {
+ inverted = 0;
+ exponent = -exponent;
+ Numer.lsw = Denom.lsw = 0;
+ XSIG_LL(Numer) = significand(st1_ptr);
+ XSIG_LL(Denom) = significand(st0_ptr);
+ }
+ div_Xsig(&Numer, &Denom, &argSignif);
+ exponent += norm_Xsig(&argSignif);
+
+ if ((exponent >= -1)
+ || ((exponent == -2) && (argSignif.msw > 0xd413ccd0))) {
+ /* The argument is greater than sqrt(2)-1 (=0.414213562...) */
+ /* Convert the argument by an identity for atan */
+ transformed = 1;
+
+ if (exponent >= 0) {
+#ifdef PARANOID
+ if (!((exponent == 0) &&
+ (argSignif.lsw == 0) && (argSignif.midw == 0) &&
+ (argSignif.msw == 0x80000000))) {
+ EXCEPTION(EX_INTERNAL | 0x104); /* There must be a logic error */
+ return;
+ }
+#endif /* PARANOID */
+ argSignif.msw = 0; /* Make the transformed arg -> 0.0 */
+ } else {
+ Numer.lsw = Denom.lsw = argSignif.lsw;
+ XSIG_LL(Numer) = XSIG_LL(Denom) = XSIG_LL(argSignif);
+
+ if (exponent < -1)
+ shr_Xsig(&Numer, -1 - exponent);
+ negate_Xsig(&Numer);
+
+ shr_Xsig(&Denom, -exponent);
+ Denom.msw |= 0x80000000;
+
+ div_Xsig(&Numer, &Denom, &argSignif);
+
+ exponent = -1 + norm_Xsig(&argSignif);
+ }
+ } else {
+ transformed = 0;
+ }
+
+ argSq.lsw = argSignif.lsw;
+ argSq.midw = argSignif.midw;
+ argSq.msw = argSignif.msw;
+ mul_Xsig_Xsig(&argSq, &argSq);
+
+ argSqSq.lsw = argSq.lsw;
+ argSqSq.midw = argSq.midw;
+ argSqSq.msw = argSq.msw;
+ mul_Xsig_Xsig(&argSqSq, &argSqSq);
+
+ accumulatore.lsw = argSq.lsw;
+ XSIG_LL(accumulatore) = XSIG_LL(argSq);
+
+ shr_Xsig(&argSq, 2 * (-1 - exponent - 1));
+ shr_Xsig(&argSqSq, 4 * (-1 - exponent - 1));
+
+ /* Now have argSq etc with binary point at the left
+ .1xxxxxxxx */
+
+ /* Do the basic fixed point polynomial evaluation */
+ accumulator.msw = accumulator.midw = accumulator.lsw = 0;
+ polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq),
+ oddplterms, HIPOWERop - 1);
+ mul64_Xsig(&accumulator, &XSIG_LL(argSq));
+ negate_Xsig(&accumulator);
+ polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq), oddnegterms,
+ HIPOWERon - 1);
+ negate_Xsig(&accumulator);
+ add_two_Xsig(&accumulator, &fixedpterm, &dummy_exp);
+
+ mul64_Xsig(&accumulatore, &denomterm);
+ shr_Xsig(&accumulatore, 1 + 2 * (-1 - exponent));
+ accumulatore.msw |= 0x80000000;
+
+ div_Xsig(&accumulator, &accumulatore, &accumulator);
+
+ mul_Xsig_Xsig(&accumulator, &argSignif);
+ mul_Xsig_Xsig(&accumulator, &argSq);
+
+ shr_Xsig(&accumulator, 3);
+ negate_Xsig(&accumulator);
+ add_Xsig_Xsig(&accumulator, &argSignif);
+
+ if (transformed) {
+ /* compute pi/4 - accumulator */
+ shr_Xsig(&accumulator, -1 - exponent);
+ negate_Xsig(&accumulator);
+ add_Xsig_Xsig(&accumulator, &pi_signif);
+ exponent = -1;
+ }
+
+ if (inverted) {
+ /* compute pi/2 - accumulator */
+ shr_Xsig(&accumulator, -exponent);
+ negate_Xsig(&accumulator);
+ add_Xsig_Xsig(&accumulator, &pi_signif);
+ exponent = 0;
+ }
+
+ if (sign1) {
+ /* compute pi - accumulator */
+ shr_Xsig(&accumulator, 1 - exponent);
+ negate_Xsig(&accumulator);
+ add_Xsig_Xsig(&accumulator, &pi_signif);
+ exponent = 1;
+ }
+
+ exponent += round_Xsig(&accumulator);
+
+ significand(st1_ptr) = XSIG_LL(accumulator);
+ setexponent16(st1_ptr, exponent);
+
+ tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign2);
+ FPU_settagi(1, tag);
+
+ set_precision_flag_up(); /* We do not really know if up or down,
+ use this as the default. */
+
+}
diff --git a/arch/x86/math-emu/poly_l2.c b/arch/x86/math-emu/poly_l2.c
new file mode 100644
index 000000000..98b6949bb
--- /dev/null
+++ b/arch/x86/math-emu/poly_l2.c
@@ -0,0 +1,245 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | poly_l2.c |
+ | |
+ | Compute the base 2 log of a FPU_REG, using a polynomial approximation. |
+ | |
+ | Copyright (C) 1992,1993,1994,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@suburbia.net |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#include "exception.h"
+#include "reg_constant.h"
+#include "fpu_emu.h"
+#include "fpu_system.h"
+#include "control_w.h"
+#include "poly.h"
+
+static void log2_kernel(FPU_REG const *arg, u_char argsign,
+ Xsig * accum_result, long int *expon);
+
+/*--- poly_l2() -------------------------------------------------------------+
+ | Base 2 logarithm by a polynomial approximation. |
+ +---------------------------------------------------------------------------*/
+void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign)
+{
+ long int exponent, expon, expon_expon;
+ Xsig accumulator, expon_accum, yaccum;
+ u_char sign, argsign;
+ FPU_REG x;
+ int tag;
+
+ exponent = exponent16(st0_ptr);
+
+ /* From st0_ptr, make a number > sqrt(2)/2 and < sqrt(2) */
+ if (st0_ptr->sigh > (unsigned)0xb504f334) {
+ /* Treat as sqrt(2)/2 < st0_ptr < 1 */
+ significand(&x) = -significand(st0_ptr);
+ setexponent16(&x, -1);
+ exponent++;
+ argsign = SIGN_NEG;
+ } else {
+ /* Treat as 1 <= st0_ptr < sqrt(2) */
+ x.sigh = st0_ptr->sigh - 0x80000000;
+ x.sigl = st0_ptr->sigl;
+ setexponent16(&x, 0);
+ argsign = SIGN_POS;
+ }
+ tag = FPU_normalize_nuo(&x);
+
+ if (tag == TAG_Zero) {
+ expon = 0;
+ accumulator.msw = accumulator.midw = accumulator.lsw = 0;
+ } else {
+ log2_kernel(&x, argsign, &accumulator, &expon);
+ }
+
+ if (exponent < 0) {
+ sign = SIGN_NEG;
+ exponent = -exponent;
+ } else
+ sign = SIGN_POS;
+ expon_accum.msw = exponent;
+ expon_accum.midw = expon_accum.lsw = 0;
+ if (exponent) {
+ expon_expon = 31 + norm_Xsig(&expon_accum);
+ shr_Xsig(&accumulator, expon_expon - expon);
+
+ if (sign ^ argsign)
+ negate_Xsig(&accumulator);
+ add_Xsig_Xsig(&accumulator, &expon_accum);
+ } else {
+ expon_expon = expon;
+ sign = argsign;
+ }
+
+ yaccum.lsw = 0;
+ XSIG_LL(yaccum) = significand(st1_ptr);
+ mul_Xsig_Xsig(&accumulator, &yaccum);
+
+ expon_expon += round_Xsig(&accumulator);
+
+ if (accumulator.msw == 0) {
+ FPU_copy_to_reg1(&CONST_Z, TAG_Zero);
+ return;
+ }
+
+ significand(st1_ptr) = XSIG_LL(accumulator);
+ setexponent16(st1_ptr, expon_expon + exponent16(st1_ptr) + 1);
+
+ tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign ^ st1_sign);
+ FPU_settagi(1, tag);
+
+ set_precision_flag_up(); /* 80486 appears to always do this */
+
+ return;
+
+}
+
+/*--- poly_l2p1() -----------------------------------------------------------+
+ | Base 2 logarithm by a polynomial approximation. |
+ | log2(x+1) |
+ +---------------------------------------------------------------------------*/
+int poly_l2p1(u_char sign0, u_char sign1,
+ FPU_REG * st0_ptr, FPU_REG * st1_ptr, FPU_REG * dest)
+{
+ u_char tag;
+ long int exponent;
+ Xsig accumulator, yaccum;
+
+ if (exponent16(st0_ptr) < 0) {
+ log2_kernel(st0_ptr, sign0, &accumulator, &exponent);
+
+ yaccum.lsw = 0;
+ XSIG_LL(yaccum) = significand(st1_ptr);
+ mul_Xsig_Xsig(&accumulator, &yaccum);
+
+ exponent += round_Xsig(&accumulator);
+
+ exponent += exponent16(st1_ptr) + 1;
+ if (exponent < EXP_WAY_UNDER)
+ exponent = EXP_WAY_UNDER;
+
+ significand(dest) = XSIG_LL(accumulator);
+ setexponent16(dest, exponent);
+
+ tag = FPU_round(dest, 1, 0, FULL_PRECISION, sign0 ^ sign1);
+ FPU_settagi(1, tag);
+
+ if (tag == TAG_Valid)
+ set_precision_flag_up(); /* 80486 appears to always do this */
+ } else {
+ /* The magnitude of st0_ptr is far too large. */
+
+ if (sign0 != SIGN_POS) {
+ /* Trying to get the log of a negative number. */
+#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */
+ changesign(st1_ptr);
+#else
+ if (arith_invalid(1) < 0)
+ return 1;
+#endif /* PECULIAR_486 */
+ }
+
+ /* 80486 appears to do this */
+ if (sign0 == SIGN_NEG)
+ set_precision_flag_down();
+ else
+ set_precision_flag_up();
+ }
+
+ if (exponent(dest) <= EXP_UNDER)
+ EXCEPTION(EX_Underflow);
+
+ return 0;
+
+}
+
+#undef HIPOWER
+#define HIPOWER 10
+static const unsigned long long logterms[HIPOWER] = {
+ 0x2a8eca5705fc2ef0LL,
+ 0xf6384ee1d01febceLL,
+ 0x093bb62877cdf642LL,
+ 0x006985d8a9ec439bLL,
+ 0x0005212c4f55a9c8LL,
+ 0x00004326a16927f0LL,
+ 0x0000038d1d80a0e7LL,
+ 0x0000003141cc80c6LL,
+ 0x00000002b1668c9fLL,
+ 0x000000002c7a46aaLL
+};
+
+static const unsigned long leadterm = 0xb8000000;
+
+/*--- log2_kernel() ---------------------------------------------------------+
+ | Base 2 logarithm by a polynomial approximation. |
+ | log2(x+1) |
+ +---------------------------------------------------------------------------*/
+static void log2_kernel(FPU_REG const *arg, u_char argsign, Xsig *accum_result,
+ long int *expon)
+{
+ long int exponent, adj;
+ unsigned long long Xsq;
+ Xsig accumulator, Numer, Denom, argSignif, arg_signif;
+
+ exponent = exponent16(arg);
+ Numer.lsw = Denom.lsw = 0;
+ XSIG_LL(Numer) = XSIG_LL(Denom) = significand(arg);
+ if (argsign == SIGN_POS) {
+ shr_Xsig(&Denom, 2 - (1 + exponent));
+ Denom.msw |= 0x80000000;
+ div_Xsig(&Numer, &Denom, &argSignif);
+ } else {
+ shr_Xsig(&Denom, 1 - (1 + exponent));
+ negate_Xsig(&Denom);
+ if (Denom.msw & 0x80000000) {
+ div_Xsig(&Numer, &Denom, &argSignif);
+ exponent++;
+ } else {
+ /* Denom must be 1.0 */
+ argSignif.lsw = Numer.lsw;
+ argSignif.midw = Numer.midw;
+ argSignif.msw = Numer.msw;
+ }
+ }
+
+#ifndef PECULIAR_486
+ /* Should check here that |local_arg| is within the valid range */
+ if (exponent >= -2) {
+ if ((exponent > -2) || (argSignif.msw > (unsigned)0xafb0ccc0)) {
+ /* The argument is too large */
+ }
+ }
+#endif /* PECULIAR_486 */
+
+ arg_signif.lsw = argSignif.lsw;
+ XSIG_LL(arg_signif) = XSIG_LL(argSignif);
+ adj = norm_Xsig(&argSignif);
+ accumulator.lsw = argSignif.lsw;
+ XSIG_LL(accumulator) = XSIG_LL(argSignif);
+ mul_Xsig_Xsig(&accumulator, &accumulator);
+ shr_Xsig(&accumulator, 2 * (-1 - (1 + exponent + adj)));
+ Xsq = XSIG_LL(accumulator);
+ if (accumulator.lsw & 0x80000000)
+ Xsq++;
+
+ accumulator.msw = accumulator.midw = accumulator.lsw = 0;
+ /* Do the basic fixed point polynomial evaluation */
+ polynomial_Xsig(&accumulator, &Xsq, logterms, HIPOWER - 1);
+
+ mul_Xsig_Xsig(&accumulator, &argSignif);
+ shr_Xsig(&accumulator, 6 - adj);
+
+ mul32_Xsig(&arg_signif, leadterm);
+ add_two_Xsig(&accumulator, &arg_signif, &exponent);
+
+ *expon = exponent + 1;
+ accum_result->lsw = accumulator.lsw;
+ accum_result->midw = accumulator.midw;
+ accum_result->msw = accumulator.msw;
+
+}
diff --git a/arch/x86/math-emu/poly_sin.c b/arch/x86/math-emu/poly_sin.c
new file mode 100644
index 000000000..c192fba51
--- /dev/null
+++ b/arch/x86/math-emu/poly_sin.c
@@ -0,0 +1,379 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | poly_sin.c |
+ | |
+ | Computation of an approximation of the sin function and the cosine |
+ | function by a polynomial. |
+ | |
+ | Copyright (C) 1992,1993,1994,1997,1999 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@melbpc.org.au |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#include "exception.h"
+#include "reg_constant.h"
+#include "fpu_emu.h"
+#include "fpu_system.h"
+#include "control_w.h"
+#include "poly.h"
+
+#define N_COEFF_P 4
+#define N_COEFF_N 4
+
+static const unsigned long long pos_terms_l[N_COEFF_P] = {
+ 0xaaaaaaaaaaaaaaabLL,
+ 0x00d00d00d00cf906LL,
+ 0x000006b99159a8bbLL,
+ 0x000000000d7392e6LL
+};
+
+static const unsigned long long neg_terms_l[N_COEFF_N] = {
+ 0x2222222222222167LL,
+ 0x0002e3bc74aab624LL,
+ 0x0000000b09229062LL,
+ 0x00000000000c7973LL
+};
+
+#define N_COEFF_PH 4
+#define N_COEFF_NH 4
+static const unsigned long long pos_terms_h[N_COEFF_PH] = {
+ 0x0000000000000000LL,
+ 0x05b05b05b05b0406LL,
+ 0x000049f93edd91a9LL,
+ 0x00000000c9c9ed62LL
+};
+
+static const unsigned long long neg_terms_h[N_COEFF_NH] = {
+ 0xaaaaaaaaaaaaaa98LL,
+ 0x001a01a01a019064LL,
+ 0x0000008f76c68a77LL,
+ 0x0000000000d58f5eLL
+};
+
+/*--- poly_sine() -----------------------------------------------------------+
+ | |
+ +---------------------------------------------------------------------------*/
+void poly_sine(FPU_REG *st0_ptr)
+{
+ int exponent, echange;
+ Xsig accumulator, argSqrd, argTo4;
+ unsigned long fix_up, adj;
+ unsigned long long fixed_arg;
+ FPU_REG result;
+
+ exponent = exponent(st0_ptr);
+
+ accumulator.lsw = accumulator.midw = accumulator.msw = 0;
+
+ /* Split into two ranges, for arguments below and above 1.0 */
+ /* The boundary between upper and lower is approx 0.88309101259 */
+ if ((exponent < -1)
+ || ((exponent == -1) && (st0_ptr->sigh <= 0xe21240aa))) {
+ /* The argument is <= 0.88309101259 */
+
+ argSqrd.msw = st0_ptr->sigh;
+ argSqrd.midw = st0_ptr->sigl;
+ argSqrd.lsw = 0;
+ mul64_Xsig(&argSqrd, &significand(st0_ptr));
+ shr_Xsig(&argSqrd, 2 * (-1 - exponent));
+ argTo4.msw = argSqrd.msw;
+ argTo4.midw = argSqrd.midw;
+ argTo4.lsw = argSqrd.lsw;
+ mul_Xsig_Xsig(&argTo4, &argTo4);
+
+ polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l,
+ N_COEFF_N - 1);
+ mul_Xsig_Xsig(&accumulator, &argSqrd);
+ negate_Xsig(&accumulator);
+
+ polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l,
+ N_COEFF_P - 1);
+
+ shr_Xsig(&accumulator, 2); /* Divide by four */
+ accumulator.msw |= 0x80000000; /* Add 1.0 */
+
+ mul64_Xsig(&accumulator, &significand(st0_ptr));
+ mul64_Xsig(&accumulator, &significand(st0_ptr));
+ mul64_Xsig(&accumulator, &significand(st0_ptr));
+
+ /* Divide by four, FPU_REG compatible, etc */
+ exponent = 3 * exponent;
+
+ /* The minimum exponent difference is 3 */
+ shr_Xsig(&accumulator, exponent(st0_ptr) - exponent);
+
+ negate_Xsig(&accumulator);
+ XSIG_LL(accumulator) += significand(st0_ptr);
+
+ echange = round_Xsig(&accumulator);
+
+ setexponentpos(&result, exponent(st0_ptr) + echange);
+ } else {
+ /* The argument is > 0.88309101259 */
+ /* We use sin(st(0)) = cos(pi/2-st(0)) */
+
+ fixed_arg = significand(st0_ptr);
+
+ if (exponent == 0) {
+ /* The argument is >= 1.0 */
+
+ /* Put the binary point at the left. */
+ fixed_arg <<= 1;
+ }
+ /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
+ fixed_arg = 0x921fb54442d18469LL - fixed_arg;
+ /* There is a special case which arises due to rounding, to fix here. */
+ if (fixed_arg == 0xffffffffffffffffLL)
+ fixed_arg = 0;
+
+ XSIG_LL(argSqrd) = fixed_arg;
+ argSqrd.lsw = 0;
+ mul64_Xsig(&argSqrd, &fixed_arg);
+
+ XSIG_LL(argTo4) = XSIG_LL(argSqrd);
+ argTo4.lsw = argSqrd.lsw;
+ mul_Xsig_Xsig(&argTo4, &argTo4);
+
+ polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h,
+ N_COEFF_NH - 1);
+ mul_Xsig_Xsig(&accumulator, &argSqrd);
+ negate_Xsig(&accumulator);
+
+ polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h,
+ N_COEFF_PH - 1);
+ negate_Xsig(&accumulator);
+
+ mul64_Xsig(&accumulator, &fixed_arg);
+ mul64_Xsig(&accumulator, &fixed_arg);
+
+ shr_Xsig(&accumulator, 3);
+ negate_Xsig(&accumulator);
+
+ add_Xsig_Xsig(&accumulator, &argSqrd);
+
+ shr_Xsig(&accumulator, 1);
+
+ accumulator.lsw |= 1; /* A zero accumulator here would cause problems */
+ negate_Xsig(&accumulator);
+
+ /* The basic computation is complete. Now fix the answer to
+ compensate for the error due to the approximation used for
+ pi/2
+ */
+
+ /* This has an exponent of -65 */
+ fix_up = 0x898cc517;
+ /* The fix-up needs to be improved for larger args */
+ if (argSqrd.msw & 0xffc00000) {
+ /* Get about 32 bit precision in these: */
+ fix_up -= mul_32_32(0x898cc517, argSqrd.msw) / 6;
+ }
+ fix_up = mul_32_32(fix_up, LL_MSW(fixed_arg));
+
+ adj = accumulator.lsw; /* temp save */
+ accumulator.lsw -= fix_up;
+ if (accumulator.lsw > adj)
+ XSIG_LL(accumulator)--;
+
+ echange = round_Xsig(&accumulator);
+
+ setexponentpos(&result, echange - 1);
+ }
+
+ significand(&result) = XSIG_LL(accumulator);
+ setsign(&result, getsign(st0_ptr));
+ FPU_copy_to_reg0(&result, TAG_Valid);
+
+#ifdef PARANOID
+ if ((exponent(&result) >= 0)
+ && (significand(&result) > 0x8000000000000000LL)) {
+ EXCEPTION(EX_INTERNAL | 0x150);
+ }
+#endif /* PARANOID */
+
+}
+
+/*--- poly_cos() ------------------------------------------------------------+
+ | |
+ +---------------------------------------------------------------------------*/
+void poly_cos(FPU_REG *st0_ptr)
+{
+ FPU_REG result;
+ long int exponent, exp2, echange;
+ Xsig accumulator, argSqrd, fix_up, argTo4;
+ unsigned long long fixed_arg;
+
+#ifdef PARANOID
+ if ((exponent(st0_ptr) > 0)
+ || ((exponent(st0_ptr) == 0)
+ && (significand(st0_ptr) > 0xc90fdaa22168c234LL))) {
+ EXCEPTION(EX_Invalid);
+ FPU_copy_to_reg0(&CONST_QNaN, TAG_Special);
+ return;
+ }
+#endif /* PARANOID */
+
+ exponent = exponent(st0_ptr);
+
+ accumulator.lsw = accumulator.midw = accumulator.msw = 0;
+
+ if ((exponent < -1)
+ || ((exponent == -1) && (st0_ptr->sigh <= 0xb00d6f54))) {
+ /* arg is < 0.687705 */
+
+ argSqrd.msw = st0_ptr->sigh;
+ argSqrd.midw = st0_ptr->sigl;
+ argSqrd.lsw = 0;
+ mul64_Xsig(&argSqrd, &significand(st0_ptr));
+
+ if (exponent < -1) {
+ /* shift the argument right by the required places */
+ shr_Xsig(&argSqrd, 2 * (-1 - exponent));
+ }
+
+ argTo4.msw = argSqrd.msw;
+ argTo4.midw = argSqrd.midw;
+ argTo4.lsw = argSqrd.lsw;
+ mul_Xsig_Xsig(&argTo4, &argTo4);
+
+ polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h,
+ N_COEFF_NH - 1);
+ mul_Xsig_Xsig(&accumulator, &argSqrd);
+ negate_Xsig(&accumulator);
+
+ polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h,
+ N_COEFF_PH - 1);
+ negate_Xsig(&accumulator);
+
+ mul64_Xsig(&accumulator, &significand(st0_ptr));
+ mul64_Xsig(&accumulator, &significand(st0_ptr));
+ shr_Xsig(&accumulator, -2 * (1 + exponent));
+
+ shr_Xsig(&accumulator, 3);
+ negate_Xsig(&accumulator);
+
+ add_Xsig_Xsig(&accumulator, &argSqrd);
+
+ shr_Xsig(&accumulator, 1);
+
+ /* It doesn't matter if accumulator is all zero here, the
+ following code will work ok */
+ negate_Xsig(&accumulator);
+
+ if (accumulator.lsw & 0x80000000)
+ XSIG_LL(accumulator)++;
+ if (accumulator.msw == 0) {
+ /* The result is 1.0 */
+ FPU_copy_to_reg0(&CONST_1, TAG_Valid);
+ return;
+ } else {
+ significand(&result) = XSIG_LL(accumulator);
+
+ /* will be a valid positive nr with expon = -1 */
+ setexponentpos(&result, -1);
+ }
+ } else {
+ fixed_arg = significand(st0_ptr);
+
+ if (exponent == 0) {
+ /* The argument is >= 1.0 */
+
+ /* Put the binary point at the left. */
+ fixed_arg <<= 1;
+ }
+ /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
+ fixed_arg = 0x921fb54442d18469LL - fixed_arg;
+ /* There is a special case which arises due to rounding, to fix here. */
+ if (fixed_arg == 0xffffffffffffffffLL)
+ fixed_arg = 0;
+
+ exponent = -1;
+ exp2 = -1;
+
+ /* A shift is needed here only for a narrow range of arguments,
+ i.e. for fixed_arg approx 2^-32, but we pick up more... */
+ if (!(LL_MSW(fixed_arg) & 0xffff0000)) {
+ fixed_arg <<= 16;
+ exponent -= 16;
+ exp2 -= 16;
+ }
+
+ XSIG_LL(argSqrd) = fixed_arg;
+ argSqrd.lsw = 0;
+ mul64_Xsig(&argSqrd, &fixed_arg);
+
+ if (exponent < -1) {
+ /* shift the argument right by the required places */
+ shr_Xsig(&argSqrd, 2 * (-1 - exponent));
+ }
+
+ argTo4.msw = argSqrd.msw;
+ argTo4.midw = argSqrd.midw;
+ argTo4.lsw = argSqrd.lsw;
+ mul_Xsig_Xsig(&argTo4, &argTo4);
+
+ polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l,
+ N_COEFF_N - 1);
+ mul_Xsig_Xsig(&accumulator, &argSqrd);
+ negate_Xsig(&accumulator);
+
+ polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l,
+ N_COEFF_P - 1);
+
+ shr_Xsig(&accumulator, 2); /* Divide by four */
+ accumulator.msw |= 0x80000000; /* Add 1.0 */
+
+ mul64_Xsig(&accumulator, &fixed_arg);
+ mul64_Xsig(&accumulator, &fixed_arg);
+ mul64_Xsig(&accumulator, &fixed_arg);
+
+ /* Divide by four, FPU_REG compatible, etc */
+ exponent = 3 * exponent;
+
+ /* The minimum exponent difference is 3 */
+ shr_Xsig(&accumulator, exp2 - exponent);
+
+ negate_Xsig(&accumulator);
+ XSIG_LL(accumulator) += fixed_arg;
+
+ /* The basic computation is complete. Now fix the answer to
+ compensate for the error due to the approximation used for
+ pi/2
+ */
+
+ /* This has an exponent of -65 */
+ XSIG_LL(fix_up) = 0x898cc51701b839a2ll;
+ fix_up.lsw = 0;
+
+ /* The fix-up needs to be improved for larger args */
+ if (argSqrd.msw & 0xffc00000) {
+ /* Get about 32 bit precision in these: */
+ fix_up.msw -= mul_32_32(0x898cc517, argSqrd.msw) / 2;
+ fix_up.msw += mul_32_32(0x898cc517, argTo4.msw) / 24;
+ }
+
+ exp2 += norm_Xsig(&accumulator);
+ shr_Xsig(&accumulator, 1); /* Prevent overflow */
+ exp2++;
+ shr_Xsig(&fix_up, 65 + exp2);
+
+ add_Xsig_Xsig(&accumulator, &fix_up);
+
+ echange = round_Xsig(&accumulator);
+
+ setexponentpos(&result, exp2 + echange);
+ significand(&result) = XSIG_LL(accumulator);
+ }
+
+ FPU_copy_to_reg0(&result, TAG_Valid);
+
+#ifdef PARANOID
+ if ((exponent(&result) >= 0)
+ && (significand(&result) > 0x8000000000000000LL)) {
+ EXCEPTION(EX_INTERNAL | 0x151);
+ }
+#endif /* PARANOID */
+
+}
diff --git a/arch/x86/math-emu/poly_tan.c b/arch/x86/math-emu/poly_tan.c
new file mode 100644
index 000000000..1f5b1d712
--- /dev/null
+++ b/arch/x86/math-emu/poly_tan.c
@@ -0,0 +1,213 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | poly_tan.c |
+ | |
+ | Compute the tan of a FPU_REG, using a polynomial approximation. |
+ | |
+ | Copyright (C) 1992,1993,1994,1997,1999 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@melbpc.org.au |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#include "exception.h"
+#include "reg_constant.h"
+#include "fpu_emu.h"
+#include "fpu_system.h"
+#include "control_w.h"
+#include "poly.h"
+
+#define HiPOWERop 3 /* odd poly, positive terms */
+static const unsigned long long oddplterm[HiPOWERop] = {
+ 0x0000000000000000LL,
+ 0x0051a1cf08fca228LL,
+ 0x0000000071284ff7LL
+};
+
+#define HiPOWERon 2 /* odd poly, negative terms */
+static const unsigned long long oddnegterm[HiPOWERon] = {
+ 0x1291a9a184244e80LL,
+ 0x0000583245819c21LL
+};
+
+#define HiPOWERep 2 /* even poly, positive terms */
+static const unsigned long long evenplterm[HiPOWERep] = {
+ 0x0e848884b539e888LL,
+ 0x00003c7f18b887daLL
+};
+
+#define HiPOWERen 2 /* even poly, negative terms */
+static const unsigned long long evennegterm[HiPOWERen] = {
+ 0xf1f0200fd51569ccLL,
+ 0x003afb46105c4432LL
+};
+
+static const unsigned long long twothirds = 0xaaaaaaaaaaaaaaabLL;
+
+/*--- poly_tan() ------------------------------------------------------------+
+ | |
+ +---------------------------------------------------------------------------*/
+void poly_tan(FPU_REG *st0_ptr)
+{
+ long int exponent;
+ int invert;
+ Xsig argSq, argSqSq, accumulatoro, accumulatore, accum,
+ argSignif, fix_up;
+ unsigned long adj;
+
+ exponent = exponent(st0_ptr);
+
+#ifdef PARANOID
+ if (signnegative(st0_ptr)) { /* Can't hack a number < 0.0 */
+ arith_invalid(0);
+ return;
+ } /* Need a positive number */
+#endif /* PARANOID */
+
+ /* Split the problem into two domains, smaller and larger than pi/4 */
+ if ((exponent == 0)
+ || ((exponent == -1) && (st0_ptr->sigh > 0xc90fdaa2))) {
+ /* The argument is greater than (approx) pi/4 */
+ invert = 1;
+ accum.lsw = 0;
+ XSIG_LL(accum) = significand(st0_ptr);
+
+ if (exponent == 0) {
+ /* The argument is >= 1.0 */
+ /* Put the binary point at the left. */
+ XSIG_LL(accum) <<= 1;
+ }
+ /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */
+ XSIG_LL(accum) = 0x921fb54442d18469LL - XSIG_LL(accum);
+ /* This is a special case which arises due to rounding. */
+ if (XSIG_LL(accum) == 0xffffffffffffffffLL) {
+ FPU_settag0(TAG_Valid);
+ significand(st0_ptr) = 0x8a51e04daabda360LL;
+ setexponent16(st0_ptr,
+ (0x41 + EXTENDED_Ebias) | SIGN_Negative);
+ return;
+ }
+
+ argSignif.lsw = accum.lsw;
+ XSIG_LL(argSignif) = XSIG_LL(accum);
+ exponent = -1 + norm_Xsig(&argSignif);
+ } else {
+ invert = 0;
+ argSignif.lsw = 0;
+ XSIG_LL(accum) = XSIG_LL(argSignif) = significand(st0_ptr);
+
+ if (exponent < -1) {
+ /* shift the argument right by the required places */
+ if (FPU_shrx(&XSIG_LL(accum), -1 - exponent) >=
+ 0x80000000U)
+ XSIG_LL(accum)++; /* round up */
+ }
+ }
+
+ XSIG_LL(argSq) = XSIG_LL(accum);
+ argSq.lsw = accum.lsw;
+ mul_Xsig_Xsig(&argSq, &argSq);
+ XSIG_LL(argSqSq) = XSIG_LL(argSq);
+ argSqSq.lsw = argSq.lsw;
+ mul_Xsig_Xsig(&argSqSq, &argSqSq);
+
+ /* Compute the negative terms for the numerator polynomial */
+ accumulatoro.msw = accumulatoro.midw = accumulatoro.lsw = 0;
+ polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddnegterm,
+ HiPOWERon - 1);
+ mul_Xsig_Xsig(&accumulatoro, &argSq);
+ negate_Xsig(&accumulatoro);
+ /* Add the positive terms */
+ polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddplterm,
+ HiPOWERop - 1);
+
+ /* Compute the positive terms for the denominator polynomial */
+ accumulatore.msw = accumulatore.midw = accumulatore.lsw = 0;
+ polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evenplterm,
+ HiPOWERep - 1);
+ mul_Xsig_Xsig(&accumulatore, &argSq);
+ negate_Xsig(&accumulatore);
+ /* Add the negative terms */
+ polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evennegterm,
+ HiPOWERen - 1);
+ /* Multiply by arg^2 */
+ mul64_Xsig(&accumulatore, &XSIG_LL(argSignif));
+ mul64_Xsig(&accumulatore, &XSIG_LL(argSignif));
+ /* de-normalize and divide by 2 */
+ shr_Xsig(&accumulatore, -2 * (1 + exponent) + 1);
+ negate_Xsig(&accumulatore); /* This does 1 - accumulator */
+
+ /* Now find the ratio. */
+ if (accumulatore.msw == 0) {
+ /* accumulatoro must contain 1.0 here, (actually, 0) but it
+ really doesn't matter what value we use because it will
+ have negligible effect in later calculations
+ */
+ XSIG_LL(accum) = 0x8000000000000000LL;
+ accum.lsw = 0;
+ } else {
+ div_Xsig(&accumulatoro, &accumulatore, &accum);
+ }
+
+ /* Multiply by 1/3 * arg^3 */
+ mul64_Xsig(&accum, &XSIG_LL(argSignif));
+ mul64_Xsig(&accum, &XSIG_LL(argSignif));
+ mul64_Xsig(&accum, &XSIG_LL(argSignif));
+ mul64_Xsig(&accum, &twothirds);
+ shr_Xsig(&accum, -2 * (exponent + 1));
+
+ /* tan(arg) = arg + accum */
+ add_two_Xsig(&accum, &argSignif, &exponent);
+
+ if (invert) {
+ /* We now have the value of tan(pi_2 - arg) where pi_2 is an
+ approximation for pi/2
+ */
+ /* The next step is to fix the answer to compensate for the
+ error due to the approximation used for pi/2
+ */
+
+ /* This is (approx) delta, the error in our approx for pi/2
+ (see above). It has an exponent of -65
+ */
+ XSIG_LL(fix_up) = 0x898cc51701b839a2LL;
+ fix_up.lsw = 0;
+
+ if (exponent == 0)
+ adj = 0xffffffff; /* We want approx 1.0 here, but
+ this is close enough. */
+ else if (exponent > -30) {
+ adj = accum.msw >> -(exponent + 1); /* tan */
+ adj = mul_32_32(adj, adj); /* tan^2 */
+ } else
+ adj = 0;
+ adj = mul_32_32(0x898cc517, adj); /* delta * tan^2 */
+
+ fix_up.msw += adj;
+ if (!(fix_up.msw & 0x80000000)) { /* did fix_up overflow ? */
+ /* Yes, we need to add an msb */
+ shr_Xsig(&fix_up, 1);
+ fix_up.msw |= 0x80000000;
+ shr_Xsig(&fix_up, 64 + exponent);
+ } else
+ shr_Xsig(&fix_up, 65 + exponent);
+
+ add_two_Xsig(&accum, &fix_up, &exponent);
+
+ /* accum now contains tan(pi/2 - arg).
+ Use tan(arg) = 1.0 / tan(pi/2 - arg)
+ */
+ accumulatoro.lsw = accumulatoro.midw = 0;
+ accumulatoro.msw = 0x80000000;
+ div_Xsig(&accumulatoro, &accum, &accum);
+ exponent = -exponent - 1;
+ }
+
+ /* Transfer the result */
+ round_Xsig(&accum);
+ FPU_settag0(TAG_Valid);
+ significand(st0_ptr) = XSIG_LL(accum);
+ setexponent16(st0_ptr, exponent + EXTENDED_Ebias); /* Result is positive. */
+
+}
diff --git a/arch/x86/math-emu/polynom_Xsig.S b/arch/x86/math-emu/polynom_Xsig.S
new file mode 100644
index 000000000..604f0b2d1
--- /dev/null
+++ b/arch/x86/math-emu/polynom_Xsig.S
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*---------------------------------------------------------------------------+
+ | polynomial_Xsig.S |
+ | |
+ | Fixed point arithmetic polynomial evaluation. |
+ | |
+ | Copyright (C) 1992,1993,1994,1995 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@jacobi.maths.monash.edu.au |
+ | |
+ | Call from C as: |
+ | void polynomial_Xsig(Xsig *accum, unsigned long long x, |
+ | unsigned long long terms[], int n) |
+ | |
+ | Computes: |
+ | terms[0] + (terms[1] + (terms[2] + ... + (terms[n-1]*x)*x)*x)*x) ... )*x |
+ | and adds the result to the 12 byte Xsig. |
+ | The terms[] are each 8 bytes, but all computation is performed to 12 byte |
+ | precision. |
+ | |
+ | This function must be used carefully: most overflow of intermediate |
+ | results is controlled, but overflow of the result is not. |
+ | |
+ +---------------------------------------------------------------------------*/
+ .file "polynomial_Xsig.S"
+
+#include "fpu_emu.h"
+
+
+#define TERM_SIZE $8
+#define SUM_MS -20(%ebp) /* sum ms long */
+#define SUM_MIDDLE -24(%ebp) /* sum middle long */
+#define SUM_LS -28(%ebp) /* sum ls long */
+#define ACCUM_MS -4(%ebp) /* accum ms long */
+#define ACCUM_MIDDLE -8(%ebp) /* accum middle long */
+#define ACCUM_LS -12(%ebp) /* accum ls long */
+#define OVERFLOWED -16(%ebp) /* addition overflow flag */
+
+.text
+ENTRY(polynomial_Xsig)
+ pushl %ebp
+ movl %esp,%ebp
+ subl $32,%esp
+ pushl %esi
+ pushl %edi
+ pushl %ebx
+
+ movl PARAM2,%esi /* x */
+ movl PARAM3,%edi /* terms */
+
+ movl TERM_SIZE,%eax
+ mull PARAM4 /* n */
+ addl %eax,%edi
+
+ movl 4(%edi),%edx /* terms[n] */
+ movl %edx,SUM_MS
+ movl (%edi),%edx /* terms[n] */
+ movl %edx,SUM_MIDDLE
+ xor %eax,%eax
+ movl %eax,SUM_LS
+ movb %al,OVERFLOWED
+
+ subl TERM_SIZE,%edi
+ decl PARAM4
+ js L_accum_done
+
+L_accum_loop:
+ xor %eax,%eax
+ movl %eax,ACCUM_MS
+ movl %eax,ACCUM_MIDDLE
+
+ movl SUM_MIDDLE,%eax
+ mull (%esi) /* x ls long */
+ movl %edx,ACCUM_LS
+
+ movl SUM_MIDDLE,%eax
+ mull 4(%esi) /* x ms long */
+ addl %eax,ACCUM_LS
+ adcl %edx,ACCUM_MIDDLE
+ adcl $0,ACCUM_MS
+
+ movl SUM_MS,%eax
+ mull (%esi) /* x ls long */
+ addl %eax,ACCUM_LS
+ adcl %edx,ACCUM_MIDDLE
+ adcl $0,ACCUM_MS
+
+ movl SUM_MS,%eax
+ mull 4(%esi) /* x ms long */
+ addl %eax,ACCUM_MIDDLE
+ adcl %edx,ACCUM_MS
+
+ testb $0xff,OVERFLOWED
+ jz L_no_overflow
+
+ movl (%esi),%eax
+ addl %eax,ACCUM_MIDDLE
+ movl 4(%esi),%eax
+ adcl %eax,ACCUM_MS /* This could overflow too */
+
+L_no_overflow:
+
+/*
+ * Now put the sum of next term and the accumulator
+ * into the sum register
+ */
+ movl ACCUM_LS,%eax
+ addl (%edi),%eax /* term ls long */
+ movl %eax,SUM_LS
+ movl ACCUM_MIDDLE,%eax
+ adcl (%edi),%eax /* term ls long */
+ movl %eax,SUM_MIDDLE
+ movl ACCUM_MS,%eax
+ adcl 4(%edi),%eax /* term ms long */
+ movl %eax,SUM_MS
+ sbbb %al,%al
+ movb %al,OVERFLOWED /* Used in the next iteration */
+
+ subl TERM_SIZE,%edi
+ decl PARAM4
+ jns L_accum_loop
+
+L_accum_done:
+ movl PARAM1,%edi /* accum */
+ movl SUM_LS,%eax
+ addl %eax,(%edi)
+ movl SUM_MIDDLE,%eax
+ adcl %eax,4(%edi)
+ movl SUM_MS,%eax
+ adcl %eax,8(%edi)
+
+ popl %ebx
+ popl %edi
+ popl %esi
+ leave
+ ret
+ENDPROC(polynomial_Xsig)
diff --git a/arch/x86/math-emu/reg_add_sub.c b/arch/x86/math-emu/reg_add_sub.c
new file mode 100644
index 000000000..29451dd07
--- /dev/null
+++ b/arch/x86/math-emu/reg_add_sub.c
@@ -0,0 +1,334 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | reg_add_sub.c |
+ | |
+ | Functions to add or subtract two registers and put the result in a third. |
+ | |
+ | Copyright (C) 1992,1993,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@suburbia.net |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------+
+ | For each function, the destination may be any FPU_REG, including one of |
+ | the source FPU_REGs. |
+ | Each function returns 0 if the answer is o.k., otherwise a non-zero |
+ | value is returned, indicating either an exception condition or an |
+ | internal error. |
+ +---------------------------------------------------------------------------*/
+
+#include "exception.h"
+#include "reg_constant.h"
+#include "fpu_emu.h"
+#include "control_w.h"
+#include "fpu_system.h"
+
+static
+int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa,
+ FPU_REG const *b, u_char tagb, u_char signb,
+ FPU_REG * dest, int deststnr, int control_w);
+
+/*
+ Operates on st(0) and st(n), or on st(0) and temporary data.
+ The destination must be one of the source st(x).
+ */
+int FPU_add(FPU_REG const *b, u_char tagb, int deststnr, int control_w)
+{
+ FPU_REG *a = &st(0);
+ FPU_REG *dest = &st(deststnr);
+ u_char signb = getsign(b);
+ u_char taga = FPU_gettag0();
+ u_char signa = getsign(a);
+ u_char saved_sign = getsign(dest);
+ int diff, tag, expa, expb;
+
+ if (!(taga | tagb)) {
+ expa = exponent(a);
+ expb = exponent(b);
+
+ valid_add:
+ /* Both registers are valid */
+ if (!(signa ^ signb)) {
+ /* signs are the same */
+ tag =
+ FPU_u_add(a, b, dest, control_w, signa, expa, expb);
+ } else {
+ /* The signs are different, so do a subtraction */
+ diff = expa - expb;
+ if (!diff) {
+ diff = a->sigh - b->sigh; /* This works only if the ms bits
+ are identical. */
+ if (!diff) {
+ diff = a->sigl > b->sigl;
+ if (!diff)
+ diff = -(a->sigl < b->sigl);
+ }
+ }
+
+ if (diff > 0) {
+ tag =
+ FPU_u_sub(a, b, dest, control_w, signa,
+ expa, expb);
+ } else if (diff < 0) {
+ tag =
+ FPU_u_sub(b, a, dest, control_w, signb,
+ expb, expa);
+ } else {
+ FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
+ /* sign depends upon rounding mode */
+ setsign(dest, ((control_w & CW_RC) != RC_DOWN)
+ ? SIGN_POS : SIGN_NEG);
+ return TAG_Zero;
+ }
+ }
+
+ if (tag < 0) {
+ setsign(dest, saved_sign);
+ return tag;
+ }
+ FPU_settagi(deststnr, tag);
+ return tag;
+ }
+
+ if (taga == TAG_Special)
+ taga = FPU_Special(a);
+ if (tagb == TAG_Special)
+ tagb = FPU_Special(b);
+
+ if (((taga == TAG_Valid) && (tagb == TW_Denormal))
+ || ((taga == TW_Denormal) && (tagb == TAG_Valid))
+ || ((taga == TW_Denormal) && (tagb == TW_Denormal))) {
+ FPU_REG x, y;
+
+ if (denormal_operand() < 0)
+ return FPU_Exception;
+
+ FPU_to_exp16(a, &x);
+ FPU_to_exp16(b, &y);
+ a = &x;
+ b = &y;
+ expa = exponent16(a);
+ expb = exponent16(b);
+ goto valid_add;
+ }
+
+ if ((taga == TW_NaN) || (tagb == TW_NaN)) {
+ if (deststnr == 0)
+ return real_2op_NaN(b, tagb, deststnr, a);
+ else
+ return real_2op_NaN(a, taga, deststnr, a);
+ }
+
+ return add_sub_specials(a, taga, signa, b, tagb, signb,
+ dest, deststnr, control_w);
+}
+
+/* Subtract b from a. (a-b) -> dest */
+int FPU_sub(int flags, int rm, int control_w)
+{
+ FPU_REG const *a, *b;
+ FPU_REG *dest;
+ u_char taga, tagb, signa, signb, saved_sign, sign;
+ int diff, tag = 0, expa, expb, deststnr;
+
+ a = &st(0);
+ taga = FPU_gettag0();
+
+ deststnr = 0;
+ if (flags & LOADED) {
+ b = (FPU_REG *) rm;
+ tagb = flags & 0x0f;
+ } else {
+ b = &st(rm);
+ tagb = FPU_gettagi(rm);
+
+ if (flags & DEST_RM)
+ deststnr = rm;
+ }
+
+ signa = getsign(a);
+ signb = getsign(b);
+
+ if (flags & REV) {
+ signa ^= SIGN_NEG;
+ signb ^= SIGN_NEG;
+ }
+
+ dest = &st(deststnr);
+ saved_sign = getsign(dest);
+
+ if (!(taga | tagb)) {
+ expa = exponent(a);
+ expb = exponent(b);
+
+ valid_subtract:
+ /* Both registers are valid */
+
+ diff = expa - expb;
+
+ if (!diff) {
+ diff = a->sigh - b->sigh; /* Works only if ms bits are identical */
+ if (!diff) {
+ diff = a->sigl > b->sigl;
+ if (!diff)
+ diff = -(a->sigl < b->sigl);
+ }
+ }
+
+ switch ((((int)signa) * 2 + signb) / SIGN_NEG) {
+ case 0: /* P - P */
+ case 3: /* N - N */
+ if (diff > 0) {
+ /* |a| > |b| */
+ tag =
+ FPU_u_sub(a, b, dest, control_w, signa,
+ expa, expb);
+ } else if (diff == 0) {
+ FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
+
+ /* sign depends upon rounding mode */
+ setsign(dest, ((control_w & CW_RC) != RC_DOWN)
+ ? SIGN_POS : SIGN_NEG);
+ return TAG_Zero;
+ } else {
+ sign = signa ^ SIGN_NEG;
+ tag =
+ FPU_u_sub(b, a, dest, control_w, sign, expb,
+ expa);
+ }
+ break;
+ case 1: /* P - N */
+ tag =
+ FPU_u_add(a, b, dest, control_w, SIGN_POS, expa,
+ expb);
+ break;
+ case 2: /* N - P */
+ tag =
+ FPU_u_add(a, b, dest, control_w, SIGN_NEG, expa,
+ expb);
+ break;
+#ifdef PARANOID
+ default:
+ EXCEPTION(EX_INTERNAL | 0x111);
+ return -1;
+#endif
+ }
+ if (tag < 0) {
+ setsign(dest, saved_sign);
+ return tag;
+ }
+ FPU_settagi(deststnr, tag);
+ return tag;
+ }
+
+ if (taga == TAG_Special)
+ taga = FPU_Special(a);
+ if (tagb == TAG_Special)
+ tagb = FPU_Special(b);
+
+ if (((taga == TAG_Valid) && (tagb == TW_Denormal))
+ || ((taga == TW_Denormal) && (tagb == TAG_Valid))
+ || ((taga == TW_Denormal) && (tagb == TW_Denormal))) {
+ FPU_REG x, y;
+
+ if (denormal_operand() < 0)
+ return FPU_Exception;
+
+ FPU_to_exp16(a, &x);
+ FPU_to_exp16(b, &y);
+ a = &x;
+ b = &y;
+ expa = exponent16(a);
+ expb = exponent16(b);
+
+ goto valid_subtract;
+ }
+
+ if ((taga == TW_NaN) || (tagb == TW_NaN)) {
+ FPU_REG const *d1, *d2;
+ if (flags & REV) {
+ d1 = b;
+ d2 = a;
+ } else {
+ d1 = a;
+ d2 = b;
+ }
+ if (flags & LOADED)
+ return real_2op_NaN(b, tagb, deststnr, d1);
+ if (flags & DEST_RM)
+ return real_2op_NaN(a, taga, deststnr, d2);
+ else
+ return real_2op_NaN(b, tagb, deststnr, d2);
+ }
+
+ return add_sub_specials(a, taga, signa, b, tagb, signb ^ SIGN_NEG,
+ dest, deststnr, control_w);
+}
+
+static
+int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa,
+ FPU_REG const *b, u_char tagb, u_char signb,
+ FPU_REG * dest, int deststnr, int control_w)
+{
+ if (((taga == TW_Denormal) || (tagb == TW_Denormal))
+ && (denormal_operand() < 0))
+ return FPU_Exception;
+
+ if (taga == TAG_Zero) {
+ if (tagb == TAG_Zero) {
+ /* Both are zero, result will be zero. */
+ u_char different_signs = signa ^ signb;
+
+ FPU_copy_to_regi(a, TAG_Zero, deststnr);
+ if (different_signs) {
+ /* Signs are different. */
+ /* Sign of answer depends upon rounding mode. */
+ setsign(dest, ((control_w & CW_RC) != RC_DOWN)
+ ? SIGN_POS : SIGN_NEG);
+ } else
+ setsign(dest, signa); /* signa may differ from the sign of a. */
+ return TAG_Zero;
+ } else {
+ reg_copy(b, dest);
+ if ((tagb == TW_Denormal) && (b->sigh & 0x80000000)) {
+ /* A pseudoDenormal, convert it. */
+ addexponent(dest, 1);
+ tagb = TAG_Valid;
+ } else if (tagb > TAG_Empty)
+ tagb = TAG_Special;
+ setsign(dest, signb); /* signb may differ from the sign of b. */
+ FPU_settagi(deststnr, tagb);
+ return tagb;
+ }
+ } else if (tagb == TAG_Zero) {
+ reg_copy(a, dest);
+ if ((taga == TW_Denormal) && (a->sigh & 0x80000000)) {
+ /* A pseudoDenormal */
+ addexponent(dest, 1);
+ taga = TAG_Valid;
+ } else if (taga > TAG_Empty)
+ taga = TAG_Special;
+ setsign(dest, signa); /* signa may differ from the sign of a. */
+ FPU_settagi(deststnr, taga);
+ return taga;
+ } else if (taga == TW_Infinity) {
+ if ((tagb != TW_Infinity) || (signa == signb)) {
+ FPU_copy_to_regi(a, TAG_Special, deststnr);
+ setsign(dest, signa); /* signa may differ from the sign of a. */
+ return taga;
+ }
+ /* Infinity-Infinity is undefined. */
+ return arith_invalid(deststnr);
+ } else if (tagb == TW_Infinity) {
+ FPU_copy_to_regi(b, TAG_Special, deststnr);
+ setsign(dest, signb); /* signb may differ from the sign of b. */
+ return tagb;
+ }
+#ifdef PARANOID
+ EXCEPTION(EX_INTERNAL | 0x101);
+#endif
+
+ return FPU_Exception;
+}
diff --git a/arch/x86/math-emu/reg_compare.c b/arch/x86/math-emu/reg_compare.c
new file mode 100644
index 000000000..eacb5128f
--- /dev/null
+++ b/arch/x86/math-emu/reg_compare.c
@@ -0,0 +1,479 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | reg_compare.c |
+ | |
+ | Compare two floating point registers |
+ | |
+ | Copyright (C) 1992,1993,1994,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@suburbia.net |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------+
+ | compare() is the core FPU_REG comparison function |
+ +---------------------------------------------------------------------------*/
+
+#include "fpu_system.h"
+#include "exception.h"
+#include "fpu_emu.h"
+#include "control_w.h"
+#include "status_w.h"
+
+static int compare(FPU_REG const *b, int tagb)
+{
+ int diff, exp0, expb;
+ u_char st0_tag;
+ FPU_REG *st0_ptr;
+ FPU_REG x, y;
+ u_char st0_sign, signb = getsign(b);
+
+ st0_ptr = &st(0);
+ st0_tag = FPU_gettag0();
+ st0_sign = getsign(st0_ptr);
+
+ if (tagb == TAG_Special)
+ tagb = FPU_Special(b);
+ if (st0_tag == TAG_Special)
+ st0_tag = FPU_Special(st0_ptr);
+
+ if (((st0_tag != TAG_Valid) && (st0_tag != TW_Denormal))
+ || ((tagb != TAG_Valid) && (tagb != TW_Denormal))) {
+ if (st0_tag == TAG_Zero) {
+ if (tagb == TAG_Zero)
+ return COMP_A_eq_B;
+ if (tagb == TAG_Valid)
+ return ((signb ==
+ SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B);
+ if (tagb == TW_Denormal)
+ return ((signb ==
+ SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
+ | COMP_Denormal;
+ } else if (tagb == TAG_Zero) {
+ if (st0_tag == TAG_Valid)
+ return ((st0_sign ==
+ SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
+ if (st0_tag == TW_Denormal)
+ return ((st0_sign ==
+ SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
+ | COMP_Denormal;
+ }
+
+ if (st0_tag == TW_Infinity) {
+ if ((tagb == TAG_Valid) || (tagb == TAG_Zero))
+ return ((st0_sign ==
+ SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
+ else if (tagb == TW_Denormal)
+ return ((st0_sign ==
+ SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
+ | COMP_Denormal;
+ else if (tagb == TW_Infinity) {
+ /* The 80486 book says that infinities can be equal! */
+ return (st0_sign == signb) ? COMP_A_eq_B :
+ ((st0_sign ==
+ SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B);
+ }
+ /* Fall through to the NaN code */
+ } else if (tagb == TW_Infinity) {
+ if ((st0_tag == TAG_Valid) || (st0_tag == TAG_Zero))
+ return ((signb ==
+ SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B);
+ if (st0_tag == TW_Denormal)
+ return ((signb ==
+ SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
+ | COMP_Denormal;
+ /* Fall through to the NaN code */
+ }
+
+ /* The only possibility now should be that one of the arguments
+ is a NaN */
+ if ((st0_tag == TW_NaN) || (tagb == TW_NaN)) {
+ int signalling = 0, unsupported = 0;
+ if (st0_tag == TW_NaN) {
+ signalling =
+ (st0_ptr->sigh & 0xc0000000) == 0x80000000;
+ unsupported = !((exponent(st0_ptr) == EXP_OVER)
+ && (st0_ptr->
+ sigh & 0x80000000));
+ }
+ if (tagb == TW_NaN) {
+ signalling |=
+ (b->sigh & 0xc0000000) == 0x80000000;
+ unsupported |= !((exponent(b) == EXP_OVER)
+ && (b->sigh & 0x80000000));
+ }
+ if (signalling || unsupported)
+ return COMP_No_Comp | COMP_SNaN | COMP_NaN;
+ else
+ /* Neither is a signaling NaN */
+ return COMP_No_Comp | COMP_NaN;
+ }
+
+ EXCEPTION(EX_Invalid);
+ }
+
+ if (st0_sign != signb) {
+ return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
+ | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
+ COMP_Denormal : 0);
+ }
+
+ if ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) {
+ FPU_to_exp16(st0_ptr, &x);
+ FPU_to_exp16(b, &y);
+ st0_ptr = &x;
+ b = &y;
+ exp0 = exponent16(st0_ptr);
+ expb = exponent16(b);
+ } else {
+ exp0 = exponent(st0_ptr);
+ expb = exponent(b);
+ }
+
+#ifdef PARANOID
+ if (!(st0_ptr->sigh & 0x80000000))
+ EXCEPTION(EX_Invalid);
+ if (!(b->sigh & 0x80000000))
+ EXCEPTION(EX_Invalid);
+#endif /* PARANOID */
+
+ diff = exp0 - expb;
+ if (diff == 0) {
+ diff = st0_ptr->sigh - b->sigh; /* Works only if ms bits are
+ identical */
+ if (diff == 0) {
+ diff = st0_ptr->sigl > b->sigl;
+ if (diff == 0)
+ diff = -(st0_ptr->sigl < b->sigl);
+ }
+ }
+
+ if (diff > 0) {
+ return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B)
+ | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
+ COMP_Denormal : 0);
+ }
+ if (diff < 0) {
+ return ((st0_sign == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B)
+ | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
+ COMP_Denormal : 0);
+ }
+
+ return COMP_A_eq_B
+ | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ?
+ COMP_Denormal : 0);
+
+}
+
+/* This function requires that st(0) is not empty */
+int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag)
+{
+ int f, c;
+
+ c = compare(loaded_data, loaded_tag);
+
+ if (c & COMP_NaN) {
+ EXCEPTION(EX_Invalid);
+ f = SW_C3 | SW_C2 | SW_C0;
+ } else
+ switch (c & 7) {
+ case COMP_A_lt_B:
+ f = SW_C0;
+ break;
+ case COMP_A_eq_B:
+ f = SW_C3;
+ break;
+ case COMP_A_gt_B:
+ f = 0;
+ break;
+ case COMP_No_Comp:
+ f = SW_C3 | SW_C2 | SW_C0;
+ break;
+ default:
+#ifdef PARANOID
+ EXCEPTION(EX_INTERNAL | 0x121);
+#endif /* PARANOID */
+ f = SW_C3 | SW_C2 | SW_C0;
+ break;
+ }
+ setcc(f);
+ if (c & COMP_Denormal) {
+ return denormal_operand() < 0;
+ }
+ return 0;
+}
+
+static int compare_st_st(int nr)
+{
+ int f, c;
+ FPU_REG *st_ptr;
+
+ if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) {
+ setcc(SW_C3 | SW_C2 | SW_C0);
+ /* Stack fault */
+ EXCEPTION(EX_StackUnder);
+ return !(control_word & CW_Invalid);
+ }
+
+ st_ptr = &st(nr);
+ c = compare(st_ptr, FPU_gettagi(nr));
+ if (c & COMP_NaN) {
+ setcc(SW_C3 | SW_C2 | SW_C0);
+ EXCEPTION(EX_Invalid);
+ return !(control_word & CW_Invalid);
+ } else
+ switch (c & 7) {
+ case COMP_A_lt_B:
+ f = SW_C0;
+ break;
+ case COMP_A_eq_B:
+ f = SW_C3;
+ break;
+ case COMP_A_gt_B:
+ f = 0;
+ break;
+ case COMP_No_Comp:
+ f = SW_C3 | SW_C2 | SW_C0;
+ break;
+ default:
+#ifdef PARANOID
+ EXCEPTION(EX_INTERNAL | 0x122);
+#endif /* PARANOID */
+ f = SW_C3 | SW_C2 | SW_C0;
+ break;
+ }
+ setcc(f);
+ if (c & COMP_Denormal) {
+ return denormal_operand() < 0;
+ }
+ return 0;
+}
+
+static int compare_i_st_st(int nr)
+{
+ int f, c;
+ FPU_REG *st_ptr;
+
+ if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) {
+ FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF);
+ /* Stack fault */
+ EXCEPTION(EX_StackUnder);
+ return !(control_word & CW_Invalid);
+ }
+
+ partial_status &= ~SW_C0;
+ st_ptr = &st(nr);
+ c = compare(st_ptr, FPU_gettagi(nr));
+ if (c & COMP_NaN) {
+ FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF);
+ EXCEPTION(EX_Invalid);
+ return !(control_word & CW_Invalid);
+ }
+
+ switch (c & 7) {
+ case COMP_A_lt_B:
+ f = X86_EFLAGS_CF;
+ break;
+ case COMP_A_eq_B:
+ f = X86_EFLAGS_ZF;
+ break;
+ case COMP_A_gt_B:
+ f = 0;
+ break;
+ case COMP_No_Comp:
+ f = X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF;
+ break;
+ default:
+#ifdef PARANOID
+ EXCEPTION(EX_INTERNAL | 0x122);
+#endif /* PARANOID */
+ f = 0;
+ break;
+ }
+ FPU_EFLAGS = (FPU_EFLAGS & ~(X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF)) | f;
+ if (c & COMP_Denormal) {
+ return denormal_operand() < 0;
+ }
+ return 0;
+}
+
+static int compare_u_st_st(int nr)
+{
+ int f = 0, c;
+ FPU_REG *st_ptr;
+
+ if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) {
+ setcc(SW_C3 | SW_C2 | SW_C0);
+ /* Stack fault */
+ EXCEPTION(EX_StackUnder);
+ return !(control_word & CW_Invalid);
+ }
+
+ st_ptr = &st(nr);
+ c = compare(st_ptr, FPU_gettagi(nr));
+ if (c & COMP_NaN) {
+ setcc(SW_C3 | SW_C2 | SW_C0);
+ if (c & COMP_SNaN) { /* This is the only difference between
+ un-ordered and ordinary comparisons */
+ EXCEPTION(EX_Invalid);
+ return !(control_word & CW_Invalid);
+ }
+ return 0;
+ } else
+ switch (c & 7) {
+ case COMP_A_lt_B:
+ f = SW_C0;
+ break;
+ case COMP_A_eq_B:
+ f = SW_C3;
+ break;
+ case COMP_A_gt_B:
+ f = 0;
+ break;
+ case COMP_No_Comp:
+ f = SW_C3 | SW_C2 | SW_C0;
+ break;
+#ifdef PARANOID
+ default:
+ EXCEPTION(EX_INTERNAL | 0x123);
+ f = SW_C3 | SW_C2 | SW_C0;
+ break;
+#endif /* PARANOID */
+ }
+ setcc(f);
+ if (c & COMP_Denormal) {
+ return denormal_operand() < 0;
+ }
+ return 0;
+}
+
+static int compare_ui_st_st(int nr)
+{
+ int f = 0, c;
+ FPU_REG *st_ptr;
+
+ if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) {
+ FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF);
+ /* Stack fault */
+ EXCEPTION(EX_StackUnder);
+ return !(control_word & CW_Invalid);
+ }
+
+ partial_status &= ~SW_C0;
+ st_ptr = &st(nr);
+ c = compare(st_ptr, FPU_gettagi(nr));
+ if (c & COMP_NaN) {
+ FPU_EFLAGS |= (X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF);
+ if (c & COMP_SNaN) { /* This is the only difference between
+ un-ordered and ordinary comparisons */
+ EXCEPTION(EX_Invalid);
+ return !(control_word & CW_Invalid);
+ }
+ return 0;
+ }
+
+ switch (c & 7) {
+ case COMP_A_lt_B:
+ f = X86_EFLAGS_CF;
+ break;
+ case COMP_A_eq_B:
+ f = X86_EFLAGS_ZF;
+ break;
+ case COMP_A_gt_B:
+ f = 0;
+ break;
+ case COMP_No_Comp:
+ f = X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF;
+ break;
+#ifdef PARANOID
+ default:
+ EXCEPTION(EX_INTERNAL | 0x123);
+ f = 0;
+ break;
+#endif /* PARANOID */
+ }
+ FPU_EFLAGS = (FPU_EFLAGS & ~(X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF)) | f;
+ if (c & COMP_Denormal) {
+ return denormal_operand() < 0;
+ }
+ return 0;
+}
+
+/*---------------------------------------------------------------------------*/
+
+void fcom_st(void)
+{
+ /* fcom st(i) */
+ compare_st_st(FPU_rm);
+}
+
+void fcompst(void)
+{
+ /* fcomp st(i) */
+ if (!compare_st_st(FPU_rm))
+ FPU_pop();
+}
+
+void fcompp(void)
+{
+ /* fcompp */
+ if (FPU_rm != 1) {
+ FPU_illegal();
+ return;
+ }
+ if (!compare_st_st(1))
+ poppop();
+}
+
+void fucom_(void)
+{
+ /* fucom st(i) */
+ compare_u_st_st(FPU_rm);
+
+}
+
+void fucomp(void)
+{
+ /* fucomp st(i) */
+ if (!compare_u_st_st(FPU_rm))
+ FPU_pop();
+}
+
+void fucompp(void)
+{
+ /* fucompp */
+ if (FPU_rm == 1) {
+ if (!compare_u_st_st(1))
+ poppop();
+ } else
+ FPU_illegal();
+}
+
+/* P6+ compare-to-EFLAGS ops */
+
+void fcomi_(void)
+{
+ /* fcomi st(i) */
+ compare_i_st_st(FPU_rm);
+}
+
+void fcomip(void)
+{
+ /* fcomip st(i) */
+ if (!compare_i_st_st(FPU_rm))
+ FPU_pop();
+}
+
+void fucomi_(void)
+{
+ /* fucomi st(i) */
+ compare_ui_st_st(FPU_rm);
+}
+
+void fucomip(void)
+{
+ /* fucomip st(i) */
+ if (!compare_ui_st_st(FPU_rm))
+ FPU_pop();
+}
diff --git a/arch/x86/math-emu/reg_constant.c b/arch/x86/math-emu/reg_constant.c
new file mode 100644
index 000000000..742619e94
--- /dev/null
+++ b/arch/x86/math-emu/reg_constant.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | reg_constant.c |
+ | |
+ | All of the constant FPU_REGs |
+ | |
+ | Copyright (C) 1992,1993,1994,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@suburbia.net |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#include "fpu_system.h"
+#include "fpu_emu.h"
+#include "status_w.h"
+#include "reg_constant.h"
+#include "control_w.h"
+
+#define MAKE_REG(s, e, l, h) { l, h, \
+ (u16)((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) }
+
+FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000);
+#if 0
+FPU_REG const CONST_2 = MAKE_REG(POS, 1, 0x00000000, 0x80000000);
+FPU_REG const CONST_HALF = MAKE_REG(POS, -1, 0x00000000, 0x80000000);
+#endif /* 0 */
+static FPU_REG const CONST_L2T = MAKE_REG(POS, 1, 0xcd1b8afe, 0xd49a784b);
+static FPU_REG const CONST_L2E = MAKE_REG(POS, 0, 0x5c17f0bc, 0xb8aa3b29);
+FPU_REG const CONST_PI = MAKE_REG(POS, 1, 0x2168c235, 0xc90fdaa2);
+FPU_REG const CONST_PI2 = MAKE_REG(POS, 0, 0x2168c235, 0xc90fdaa2);
+FPU_REG const CONST_PI4 = MAKE_REG(POS, -1, 0x2168c235, 0xc90fdaa2);
+static FPU_REG const CONST_LG2 = MAKE_REG(POS, -2, 0xfbcff799, 0x9a209a84);
+static FPU_REG const CONST_LN2 = MAKE_REG(POS, -1, 0xd1cf79ac, 0xb17217f7);
+
+/* Extra bits to take pi/2 to more than 128 bits precision. */
+FPU_REG const CONST_PI2extra = MAKE_REG(NEG, -66,
+ 0xfc8f8cbb, 0xece675d1);
+
+/* Only the sign (and tag) is used in internal zeroes */
+FPU_REG const CONST_Z = MAKE_REG(POS, EXP_UNDER, 0x0, 0x0);
+
+/* Only the sign and significand (and tag) are used in internal NaNs */
+/* The 80486 never generates one of these
+FPU_REG const CONST_SNAN = MAKE_REG(POS, EXP_OVER, 0x00000001, 0x80000000);
+ */
+/* This is the real indefinite QNaN */
+FPU_REG const CONST_QNaN = MAKE_REG(NEG, EXP_OVER, 0x00000000, 0xC0000000);
+
+/* Only the sign (and tag) is used in internal infinities */
+FPU_REG const CONST_INF = MAKE_REG(POS, EXP_OVER, 0x00000000, 0x80000000);
+
+static void fld_const(FPU_REG const * c, int adj, u_char tag)
+{
+ FPU_REG *st_new_ptr;
+
+ if (STACK_OVERFLOW) {
+ FPU_stack_overflow();
+ return;
+ }
+ push();
+ reg_copy(c, st_new_ptr);
+ st_new_ptr->sigl += adj; /* For all our fldxxx constants, we don't need to
+ borrow or carry. */
+ FPU_settag0(tag);
+ clear_C1();
+}
+
+/* A fast way to find out whether x is one of RC_DOWN or RC_CHOP
+ (and not one of RC_RND or RC_UP).
+ */
+#define DOWN_OR_CHOP(x) (x & RC_DOWN)
+
+static void fld1(int rc)
+{
+ fld_const(&CONST_1, 0, TAG_Valid);
+}
+
+static void fldl2t(int rc)
+{
+ fld_const(&CONST_L2T, (rc == RC_UP) ? 1 : 0, TAG_Valid);
+}
+
+static void fldl2e(int rc)
+{
+ fld_const(&CONST_L2E, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
+}
+
+static void fldpi(int rc)
+{
+ fld_const(&CONST_PI, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
+}
+
+static void fldlg2(int rc)
+{
+ fld_const(&CONST_LG2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
+}
+
+static void fldln2(int rc)
+{
+ fld_const(&CONST_LN2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid);
+}
+
+static void fldz(int rc)
+{
+ fld_const(&CONST_Z, 0, TAG_Zero);
+}
+
+typedef void (*FUNC_RC) (int);
+
+static FUNC_RC constants_table[] = {
+ fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, (FUNC_RC) FPU_illegal
+};
+
+void fconst(void)
+{
+ (constants_table[FPU_rm]) (control_word & CW_RC);
+}
diff --git a/arch/x86/math-emu/reg_constant.h b/arch/x86/math-emu/reg_constant.h
new file mode 100644
index 000000000..f2fdd344d
--- /dev/null
+++ b/arch/x86/math-emu/reg_constant.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*---------------------------------------------------------------------------+
+ | reg_constant.h |
+ | |
+ | Copyright (C) 1992 W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@vaxc.cc.monash.edu.au |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#ifndef _REG_CONSTANT_H_
+#define _REG_CONSTANT_H_
+
+#include "fpu_emu.h"
+
+extern FPU_REG const CONST_1;
+extern FPU_REG const CONST_PI;
+extern FPU_REG const CONST_PI2;
+extern FPU_REG const CONST_PI2extra;
+extern FPU_REG const CONST_PI4;
+extern FPU_REG const CONST_Z;
+extern FPU_REG const CONST_PINF;
+extern FPU_REG const CONST_INF;
+extern FPU_REG const CONST_MINF;
+extern FPU_REG const CONST_QNaN;
+
+#endif /* _REG_CONSTANT_H_ */
diff --git a/arch/x86/math-emu/reg_convert.c b/arch/x86/math-emu/reg_convert.c
new file mode 100644
index 000000000..251180623
--- /dev/null
+++ b/arch/x86/math-emu/reg_convert.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | reg_convert.c |
+ | |
+ | Convert register representation. |
+ | |
+ | Copyright (C) 1992,1993,1994,1996,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@suburbia.net |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#include "exception.h"
+#include "fpu_emu.h"
+
+int FPU_to_exp16(FPU_REG const *a, FPU_REG *x)
+{
+ int sign = getsign(a);
+
+ *(long long *)&(x->sigl) = *(const long long *)&(a->sigl);
+
+ /* Set up the exponent as a 16 bit quantity. */
+ setexponent16(x, exponent(a));
+
+ if (exponent16(x) == EXP_UNDER) {
+ /* The number is a de-normal or pseudodenormal. */
+ /* We only deal with the significand and exponent. */
+
+ if (x->sigh & 0x80000000) {
+ /* Is a pseudodenormal. */
+ /* This is non-80486 behaviour because the number
+ loses its 'denormal' identity. */
+ addexponent(x, 1);
+ } else {
+ /* Is a denormal. */
+ addexponent(x, 1);
+ FPU_normalize_nuo(x);
+ }
+ }
+
+ if (!(x->sigh & 0x80000000)) {
+ EXCEPTION(EX_INTERNAL | 0x180);
+ }
+
+ return sign;
+}
diff --git a/arch/x86/math-emu/reg_divide.c b/arch/x86/math-emu/reg_divide.c
new file mode 100644
index 000000000..08c2f6de0
--- /dev/null
+++ b/arch/x86/math-emu/reg_divide.c
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | reg_divide.c |
+ | |
+ | Divide one FPU_REG by another and put the result in a destination FPU_REG.|
+ | |
+ | Copyright (C) 1996 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@jacobi.maths.monash.edu.au |
+ | |
+ | Return value is the tag of the answer, or-ed with FPU_Exception if |
+ | one was raised, or -1 on internal error. |
+ | |
+ +---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------+
+ | The destination may be any FPU_REG, including one of the source FPU_REGs. |
+ +---------------------------------------------------------------------------*/
+
+#include "exception.h"
+#include "reg_constant.h"
+#include "fpu_emu.h"
+#include "fpu_system.h"
+
+/*
+ Divide one register by another and put the result into a third register.
+ */
+int FPU_div(int flags, int rm, int control_w)
+{
+ FPU_REG x, y;
+ FPU_REG const *a, *b, *st0_ptr, *st_ptr;
+ FPU_REG *dest;
+ u_char taga, tagb, signa, signb, sign, saved_sign;
+ int tag, deststnr;
+
+ if (flags & DEST_RM)
+ deststnr = rm;
+ else
+ deststnr = 0;
+
+ if (flags & REV) {
+ b = &st(0);
+ st0_ptr = b;
+ tagb = FPU_gettag0();
+ if (flags & LOADED) {
+ a = (FPU_REG *) rm;
+ taga = flags & 0x0f;
+ } else {
+ a = &st(rm);
+ st_ptr = a;
+ taga = FPU_gettagi(rm);
+ }
+ } else {
+ a = &st(0);
+ st0_ptr = a;
+ taga = FPU_gettag0();
+ if (flags & LOADED) {
+ b = (FPU_REG *) rm;
+ tagb = flags & 0x0f;
+ } else {
+ b = &st(rm);
+ st_ptr = b;
+ tagb = FPU_gettagi(rm);
+ }
+ }
+
+ signa = getsign(a);
+ signb = getsign(b);
+
+ sign = signa ^ signb;
+
+ dest = &st(deststnr);
+ saved_sign = getsign(dest);
+
+ if (!(taga | tagb)) {
+ /* Both regs Valid, this should be the most common case. */
+ reg_copy(a, &x);
+ reg_copy(b, &y);
+ setpositive(&x);
+ setpositive(&y);
+ tag = FPU_u_div(&x, &y, dest, control_w, sign);
+
+ if (tag < 0)
+ return tag;
+
+ FPU_settagi(deststnr, tag);
+ return tag;
+ }
+
+ if (taga == TAG_Special)
+ taga = FPU_Special(a);
+ if (tagb == TAG_Special)
+ tagb = FPU_Special(b);
+
+ if (((taga == TAG_Valid) && (tagb == TW_Denormal))
+ || ((taga == TW_Denormal) && (tagb == TAG_Valid))
+ || ((taga == TW_Denormal) && (tagb == TW_Denormal))) {
+ if (denormal_operand() < 0)
+ return FPU_Exception;
+
+ FPU_to_exp16(a, &x);
+ FPU_to_exp16(b, &y);
+ tag = FPU_u_div(&x, &y, dest, control_w, sign);
+ if (tag < 0)
+ return tag;
+
+ FPU_settagi(deststnr, tag);
+ return tag;
+ } else if ((taga <= TW_Denormal) && (tagb <= TW_Denormal)) {
+ if (tagb != TAG_Zero) {
+ /* Want to find Zero/Valid */
+ if (tagb == TW_Denormal) {
+ if (denormal_operand() < 0)
+ return FPU_Exception;
+ }
+
+ /* The result is zero. */
+ FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
+ setsign(dest, sign);
+ return TAG_Zero;
+ }
+ /* We have an exception condition, either 0/0 or Valid/Zero. */
+ if (taga == TAG_Zero) {
+ /* 0/0 */
+ return arith_invalid(deststnr);
+ }
+ /* Valid/Zero */
+ return FPU_divide_by_zero(deststnr, sign);
+ }
+ /* Must have infinities, NaNs, etc */
+ else if ((taga == TW_NaN) || (tagb == TW_NaN)) {
+ if (flags & LOADED)
+ return real_2op_NaN((FPU_REG *) rm, flags & 0x0f, 0,
+ st0_ptr);
+
+ if (flags & DEST_RM) {
+ int tag;
+ tag = FPU_gettag0();
+ if (tag == TAG_Special)
+ tag = FPU_Special(st0_ptr);
+ return real_2op_NaN(st0_ptr, tag, rm,
+ (flags & REV) ? st0_ptr : &st(rm));
+ } else {
+ int tag;
+ tag = FPU_gettagi(rm);
+ if (tag == TAG_Special)
+ tag = FPU_Special(&st(rm));
+ return real_2op_NaN(&st(rm), tag, 0,
+ (flags & REV) ? st0_ptr : &st(rm));
+ }
+ } else if (taga == TW_Infinity) {
+ if (tagb == TW_Infinity) {
+ /* infinity/infinity */
+ return arith_invalid(deststnr);
+ } else {
+ /* tagb must be Valid or Zero */
+ if ((tagb == TW_Denormal) && (denormal_operand() < 0))
+ return FPU_Exception;
+
+ /* Infinity divided by Zero or Valid does
+ not raise and exception, but returns Infinity */
+ FPU_copy_to_regi(a, TAG_Special, deststnr);
+ setsign(dest, sign);
+ return taga;
+ }
+ } else if (tagb == TW_Infinity) {
+ if ((taga == TW_Denormal) && (denormal_operand() < 0))
+ return FPU_Exception;
+
+ /* The result is zero. */
+ FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
+ setsign(dest, sign);
+ return TAG_Zero;
+ }
+#ifdef PARANOID
+ else {
+ EXCEPTION(EX_INTERNAL | 0x102);
+ return FPU_Exception;
+ }
+#endif /* PARANOID */
+
+ return 0;
+}
diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c
new file mode 100644
index 000000000..d40ff4549
--- /dev/null
+++ b/arch/x86/math-emu/reg_ld_str.c
@@ -0,0 +1,1220 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | reg_ld_str.c |
+ | |
+ | All of the functions which transfer data between user memory and FPU_REGs.|
+ | |
+ | Copyright (C) 1992,1993,1994,1996,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@suburbia.net |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------+
+ | Note: |
+ | The file contains code which accesses user memory. |
+ | Emulator static data may change when user memory is accessed, due to |
+ | other processes using the emulator while swapping is in progress. |
+ +---------------------------------------------------------------------------*/
+
+#include "fpu_emu.h"
+
+#include <linux/uaccess.h>
+
+#include "fpu_system.h"
+#include "exception.h"
+#include "reg_constant.h"
+#include "control_w.h"
+#include "status_w.h"
+
+#define DOUBLE_Emax 1023 /* largest valid exponent */
+#define DOUBLE_Ebias 1023
+#define DOUBLE_Emin (-1022) /* smallest valid exponent */
+
+#define SINGLE_Emax 127 /* largest valid exponent */
+#define SINGLE_Ebias 127
+#define SINGLE_Emin (-126) /* smallest valid exponent */
+
+static u_char normalize_no_excep(FPU_REG *r, int exp, int sign)
+{
+ u_char tag;
+
+ setexponent16(r, exp);
+
+ tag = FPU_normalize_nuo(r);
+ stdexp(r);
+ if (sign)
+ setnegative(r);
+
+ return tag;
+}
+
+int FPU_tagof(FPU_REG *ptr)
+{
+ int exp;
+
+ exp = exponent16(ptr) & 0x7fff;
+ if (exp == 0) {
+ if (!(ptr->sigh | ptr->sigl)) {
+ return TAG_Zero;
+ }
+ /* The number is a de-normal or pseudodenormal. */
+ return TAG_Special;
+ }
+
+ if (exp == 0x7fff) {
+ /* Is an Infinity, a NaN, or an unsupported data type. */
+ return TAG_Special;
+ }
+
+ if (!(ptr->sigh & 0x80000000)) {
+ /* Unsupported data type. */
+ /* Valid numbers have the ms bit set to 1. */
+ /* Unnormal. */
+ return TAG_Special;
+ }
+
+ return TAG_Valid;
+}
+
+/* Get a long double from user memory */
+int FPU_load_extended(long double __user *s, int stnr)
+{
+ FPU_REG *sti_ptr = &st(stnr);
+
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_READ, s, 10);
+ __copy_from_user(sti_ptr, s, 10);
+ RE_ENTRANT_CHECK_ON;
+
+ return FPU_tagof(sti_ptr);
+}
+
+/* Get a double from user memory */
+int FPU_load_double(double __user *dfloat, FPU_REG *loaded_data)
+{
+ int exp, tag, negative;
+ unsigned m64, l64;
+
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_READ, dfloat, 8);
+ FPU_get_user(m64, 1 + (unsigned long __user *)dfloat);
+ FPU_get_user(l64, (unsigned long __user *)dfloat);
+ RE_ENTRANT_CHECK_ON;
+
+ negative = (m64 & 0x80000000) ? SIGN_Negative : SIGN_Positive;
+ exp = ((m64 & 0x7ff00000) >> 20) - DOUBLE_Ebias + EXTENDED_Ebias;
+ m64 &= 0xfffff;
+ if (exp > DOUBLE_Emax + EXTENDED_Ebias) {
+ /* Infinity or NaN */
+ if ((m64 == 0) && (l64 == 0)) {
+ /* +- infinity */
+ loaded_data->sigh = 0x80000000;
+ loaded_data->sigl = 0x00000000;
+ exp = EXP_Infinity + EXTENDED_Ebias;
+ tag = TAG_Special;
+ } else {
+ /* Must be a signaling or quiet NaN */
+ exp = EXP_NaN + EXTENDED_Ebias;
+ loaded_data->sigh = (m64 << 11) | 0x80000000;
+ loaded_data->sigh |= l64 >> 21;
+ loaded_data->sigl = l64 << 11;
+ tag = TAG_Special; /* The calling function must look for NaNs */
+ }
+ } else if (exp < DOUBLE_Emin + EXTENDED_Ebias) {
+ /* Zero or de-normal */
+ if ((m64 == 0) && (l64 == 0)) {
+ /* Zero */
+ reg_copy(&CONST_Z, loaded_data);
+ exp = 0;
+ tag = TAG_Zero;
+ } else {
+ /* De-normal */
+ loaded_data->sigh = m64 << 11;
+ loaded_data->sigh |= l64 >> 21;
+ loaded_data->sigl = l64 << 11;
+
+ return normalize_no_excep(loaded_data, DOUBLE_Emin,
+ negative)
+ | (denormal_operand() < 0 ? FPU_Exception : 0);
+ }
+ } else {
+ loaded_data->sigh = (m64 << 11) | 0x80000000;
+ loaded_data->sigh |= l64 >> 21;
+ loaded_data->sigl = l64 << 11;
+
+ tag = TAG_Valid;
+ }
+
+ setexponent16(loaded_data, exp | negative);
+
+ return tag;
+}
+
+/* Get a float from user memory */
+int FPU_load_single(float __user *single, FPU_REG *loaded_data)
+{
+ unsigned m32;
+ int exp, tag, negative;
+
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_READ, single, 4);
+ FPU_get_user(m32, (unsigned long __user *)single);
+ RE_ENTRANT_CHECK_ON;
+
+ negative = (m32 & 0x80000000) ? SIGN_Negative : SIGN_Positive;
+
+ if (!(m32 & 0x7fffffff)) {
+ /* Zero */
+ reg_copy(&CONST_Z, loaded_data);
+ addexponent(loaded_data, negative);
+ return TAG_Zero;
+ }
+ exp = ((m32 & 0x7f800000) >> 23) - SINGLE_Ebias + EXTENDED_Ebias;
+ m32 = (m32 & 0x7fffff) << 8;
+ if (exp < SINGLE_Emin + EXTENDED_Ebias) {
+ /* De-normals */
+ loaded_data->sigh = m32;
+ loaded_data->sigl = 0;
+
+ return normalize_no_excep(loaded_data, SINGLE_Emin, negative)
+ | (denormal_operand() < 0 ? FPU_Exception : 0);
+ } else if (exp > SINGLE_Emax + EXTENDED_Ebias) {
+ /* Infinity or NaN */
+ if (m32 == 0) {
+ /* +- infinity */
+ loaded_data->sigh = 0x80000000;
+ loaded_data->sigl = 0x00000000;
+ exp = EXP_Infinity + EXTENDED_Ebias;
+ tag = TAG_Special;
+ } else {
+ /* Must be a signaling or quiet NaN */
+ exp = EXP_NaN + EXTENDED_Ebias;
+ loaded_data->sigh = m32 | 0x80000000;
+ loaded_data->sigl = 0;
+ tag = TAG_Special; /* The calling function must look for NaNs */
+ }
+ } else {
+ loaded_data->sigh = m32 | 0x80000000;
+ loaded_data->sigl = 0;
+ tag = TAG_Valid;
+ }
+
+ setexponent16(loaded_data, exp | negative); /* Set the sign. */
+
+ return tag;
+}
+
+/* Get a long long from user memory */
+int FPU_load_int64(long long __user *_s)
+{
+ long long s;
+ int sign;
+ FPU_REG *st0_ptr = &st(0);
+
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_READ, _s, 8);
+ if (copy_from_user(&s, _s, 8))
+ FPU_abort;
+ RE_ENTRANT_CHECK_ON;
+
+ if (s == 0) {
+ reg_copy(&CONST_Z, st0_ptr);
+ return TAG_Zero;
+ }
+
+ if (s > 0)
+ sign = SIGN_Positive;
+ else {
+ s = -s;
+ sign = SIGN_Negative;
+ }
+
+ significand(st0_ptr) = s;
+
+ return normalize_no_excep(st0_ptr, 63, sign);
+}
+
+/* Get a long from user memory */
+int FPU_load_int32(long __user *_s, FPU_REG *loaded_data)
+{
+ long s;
+ int negative;
+
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_READ, _s, 4);
+ FPU_get_user(s, _s);
+ RE_ENTRANT_CHECK_ON;
+
+ if (s == 0) {
+ reg_copy(&CONST_Z, loaded_data);
+ return TAG_Zero;
+ }
+
+ if (s > 0)
+ negative = SIGN_Positive;
+ else {
+ s = -s;
+ negative = SIGN_Negative;
+ }
+
+ loaded_data->sigh = s;
+ loaded_data->sigl = 0;
+
+ return normalize_no_excep(loaded_data, 31, negative);
+}
+
+/* Get a short from user memory */
+int FPU_load_int16(short __user *_s, FPU_REG *loaded_data)
+{
+ int s, negative;
+
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_READ, _s, 2);
+ /* Cast as short to get the sign extended. */
+ FPU_get_user(s, _s);
+ RE_ENTRANT_CHECK_ON;
+
+ if (s == 0) {
+ reg_copy(&CONST_Z, loaded_data);
+ return TAG_Zero;
+ }
+
+ if (s > 0)
+ negative = SIGN_Positive;
+ else {
+ s = -s;
+ negative = SIGN_Negative;
+ }
+
+ loaded_data->sigh = s << 16;
+ loaded_data->sigl = 0;
+
+ return normalize_no_excep(loaded_data, 15, negative);
+}
+
+/* Get a packed bcd array from user memory */
+int FPU_load_bcd(u_char __user *s)
+{
+ FPU_REG *st0_ptr = &st(0);
+ int pos;
+ u_char bcd;
+ long long l = 0;
+ int sign;
+
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_READ, s, 10);
+ RE_ENTRANT_CHECK_ON;
+ for (pos = 8; pos >= 0; pos--) {
+ l *= 10;
+ RE_ENTRANT_CHECK_OFF;
+ FPU_get_user(bcd, s + pos);
+ RE_ENTRANT_CHECK_ON;
+ l += bcd >> 4;
+ l *= 10;
+ l += bcd & 0x0f;
+ }
+
+ RE_ENTRANT_CHECK_OFF;
+ FPU_get_user(sign, s + 9);
+ sign = sign & 0x80 ? SIGN_Negative : SIGN_Positive;
+ RE_ENTRANT_CHECK_ON;
+
+ if (l == 0) {
+ reg_copy(&CONST_Z, st0_ptr);
+ addexponent(st0_ptr, sign); /* Set the sign. */
+ return TAG_Zero;
+ } else {
+ significand(st0_ptr) = l;
+ return normalize_no_excep(st0_ptr, 63, sign);
+ }
+}
+
+/*===========================================================================*/
+
+/* Put a long double into user memory */
+int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag,
+ long double __user * d)
+{
+ /*
+ The only exception raised by an attempt to store to an
+ extended format is the Invalid Stack exception, i.e.
+ attempting to store from an empty register.
+ */
+
+ if (st0_tag != TAG_Empty) {
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_WRITE, d, 10);
+
+ FPU_put_user(st0_ptr->sigl, (unsigned long __user *)d);
+ FPU_put_user(st0_ptr->sigh,
+ (unsigned long __user *)((u_char __user *) d + 4));
+ FPU_put_user(exponent16(st0_ptr),
+ (unsigned short __user *)((u_char __user *) d +
+ 8));
+ RE_ENTRANT_CHECK_ON;
+
+ return 1;
+ }
+
+ /* Empty register (stack underflow) */
+ EXCEPTION(EX_StackUnder);
+ if (control_word & CW_Invalid) {
+ /* The masked response */
+ /* Put out the QNaN indefinite */
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_WRITE, d, 10);
+ FPU_put_user(0, (unsigned long __user *)d);
+ FPU_put_user(0xc0000000, 1 + (unsigned long __user *)d);
+ FPU_put_user(0xffff, 4 + (short __user *)d);
+ RE_ENTRANT_CHECK_ON;
+ return 1;
+ } else
+ return 0;
+
+}
+
+/* Put a double into user memory */
+int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat)
+{
+ unsigned long l[2];
+ unsigned long increment = 0; /* avoid gcc warnings */
+ int precision_loss;
+ int exp;
+ FPU_REG tmp;
+
+ l[0] = 0;
+ l[1] = 0;
+ if (st0_tag == TAG_Valid) {
+ reg_copy(st0_ptr, &tmp);
+ exp = exponent(&tmp);
+
+ if (exp < DOUBLE_Emin) { /* It may be a denormal */
+ addexponent(&tmp, -DOUBLE_Emin + 52); /* largest exp to be 51 */
+denormal_arg:
+ if ((precision_loss = FPU_round_to_int(&tmp, st0_tag))) {
+#ifdef PECULIAR_486
+ /* Did it round to a non-denormal ? */
+ /* This behaviour might be regarded as peculiar, it appears
+ that the 80486 rounds to the dest precision, then
+ converts to decide underflow. */
+ if (!
+ ((tmp.sigh == 0x00100000) && (tmp.sigl == 0)
+ && (st0_ptr->sigl & 0x000007ff)))
+#endif /* PECULIAR_486 */
+ {
+ EXCEPTION(EX_Underflow);
+ /* This is a special case: see sec 16.2.5.1 of
+ the 80486 book */
+ if (!(control_word & CW_Underflow))
+ return 0;
+ }
+ EXCEPTION(precision_loss);
+ if (!(control_word & CW_Precision))
+ return 0;
+ }
+ l[0] = tmp.sigl;
+ l[1] = tmp.sigh;
+ } else {
+ if (tmp.sigl & 0x000007ff) {
+ precision_loss = 1;
+ switch (control_word & CW_RC) {
+ case RC_RND:
+ /* Rounding can get a little messy.. */
+ increment = ((tmp.sigl & 0x7ff) > 0x400) | /* nearest */
+ ((tmp.sigl & 0xc00) == 0xc00); /* odd -> even */
+ break;
+ case RC_DOWN: /* towards -infinity */
+ increment =
+ signpositive(&tmp) ? 0 : tmp.
+ sigl & 0x7ff;
+ break;
+ case RC_UP: /* towards +infinity */
+ increment =
+ signpositive(&tmp) ? tmp.
+ sigl & 0x7ff : 0;
+ break;
+ case RC_CHOP:
+ increment = 0;
+ break;
+ }
+
+ /* Truncate the mantissa */
+ tmp.sigl &= 0xfffff800;
+
+ if (increment) {
+ if (tmp.sigl >= 0xfffff800) {
+ /* the sigl part overflows */
+ if (tmp.sigh == 0xffffffff) {
+ /* The sigh part overflows */
+ tmp.sigh = 0x80000000;
+ exp++;
+ if (exp >= EXP_OVER)
+ goto overflow;
+ } else {
+ tmp.sigh++;
+ }
+ tmp.sigl = 0x00000000;
+ } else {
+ /* We only need to increment sigl */
+ tmp.sigl += 0x00000800;
+ }
+ }
+ } else
+ precision_loss = 0;
+
+ l[0] = (tmp.sigl >> 11) | (tmp.sigh << 21);
+ l[1] = ((tmp.sigh >> 11) & 0xfffff);
+
+ if (exp > DOUBLE_Emax) {
+ overflow:
+ EXCEPTION(EX_Overflow);
+ if (!(control_word & CW_Overflow))
+ return 0;
+ set_precision_flag_up();
+ if (!(control_word & CW_Precision))
+ return 0;
+
+ /* This is a special case: see sec 16.2.5.1 of the 80486 book */
+ /* Overflow to infinity */
+ l[1] = 0x7ff00000; /* Set to + INF */
+ } else {
+ if (precision_loss) {
+ if (increment)
+ set_precision_flag_up();
+ else
+ set_precision_flag_down();
+ }
+ /* Add the exponent */
+ l[1] |= (((exp + DOUBLE_Ebias) & 0x7ff) << 20);
+ }
+ }
+ } else if (st0_tag == TAG_Zero) {
+ /* Number is zero */
+ } else if (st0_tag == TAG_Special) {
+ st0_tag = FPU_Special(st0_ptr);
+ if (st0_tag == TW_Denormal) {
+ /* A denormal will always underflow. */
+#ifndef PECULIAR_486
+ /* An 80486 is supposed to be able to generate
+ a denormal exception here, but... */
+ /* Underflow has priority. */
+ if (control_word & CW_Underflow)
+ denormal_operand();
+#endif /* PECULIAR_486 */
+ reg_copy(st0_ptr, &tmp);
+ goto denormal_arg;
+ } else if (st0_tag == TW_Infinity) {
+ l[1] = 0x7ff00000;
+ } else if (st0_tag == TW_NaN) {
+ /* Is it really a NaN ? */
+ if ((exponent(st0_ptr) == EXP_OVER)
+ && (st0_ptr->sigh & 0x80000000)) {
+ /* See if we can get a valid NaN from the FPU_REG */
+ l[0] =
+ (st0_ptr->sigl >> 11) | (st0_ptr->
+ sigh << 21);
+ l[1] = ((st0_ptr->sigh >> 11) & 0xfffff);
+ if (!(st0_ptr->sigh & 0x40000000)) {
+ /* It is a signalling NaN */
+ EXCEPTION(EX_Invalid);
+ if (!(control_word & CW_Invalid))
+ return 0;
+ l[1] |= (0x40000000 >> 11);
+ }
+ l[1] |= 0x7ff00000;
+ } else {
+ /* It is an unsupported data type */
+ EXCEPTION(EX_Invalid);
+ if (!(control_word & CW_Invalid))
+ return 0;
+ l[1] = 0xfff80000;
+ }
+ }
+ } else if (st0_tag == TAG_Empty) {
+ /* Empty register (stack underflow) */
+ EXCEPTION(EX_StackUnder);
+ if (control_word & CW_Invalid) {
+ /* The masked response */
+ /* Put out the QNaN indefinite */
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_WRITE, dfloat, 8);
+ FPU_put_user(0, (unsigned long __user *)dfloat);
+ FPU_put_user(0xfff80000,
+ 1 + (unsigned long __user *)dfloat);
+ RE_ENTRANT_CHECK_ON;
+ return 1;
+ } else
+ return 0;
+ }
+ if (getsign(st0_ptr))
+ l[1] |= 0x80000000;
+
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_WRITE, dfloat, 8);
+ FPU_put_user(l[0], (unsigned long __user *)dfloat);
+ FPU_put_user(l[1], 1 + (unsigned long __user *)dfloat);
+ RE_ENTRANT_CHECK_ON;
+
+ return 1;
+}
+
+/* Put a float into user memory */
+int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single)
+{
+ long templ = 0;
+ unsigned long increment = 0; /* avoid gcc warnings */
+ int precision_loss;
+ int exp;
+ FPU_REG tmp;
+
+ if (st0_tag == TAG_Valid) {
+
+ reg_copy(st0_ptr, &tmp);
+ exp = exponent(&tmp);
+
+ if (exp < SINGLE_Emin) {
+ addexponent(&tmp, -SINGLE_Emin + 23); /* largest exp to be 22 */
+
+ denormal_arg:
+
+ if ((precision_loss = FPU_round_to_int(&tmp, st0_tag))) {
+#ifdef PECULIAR_486
+ /* Did it round to a non-denormal ? */
+ /* This behaviour might be regarded as peculiar, it appears
+ that the 80486 rounds to the dest precision, then
+ converts to decide underflow. */
+ if (!((tmp.sigl == 0x00800000) &&
+ ((st0_ptr->sigh & 0x000000ff)
+ || st0_ptr->sigl)))
+#endif /* PECULIAR_486 */
+ {
+ EXCEPTION(EX_Underflow);
+ /* This is a special case: see sec 16.2.5.1 of
+ the 80486 book */
+ if (!(control_word & CW_Underflow))
+ return 0;
+ }
+ EXCEPTION(precision_loss);
+ if (!(control_word & CW_Precision))
+ return 0;
+ }
+ templ = tmp.sigl;
+ } else {
+ if (tmp.sigl | (tmp.sigh & 0x000000ff)) {
+ unsigned long sigh = tmp.sigh;
+ unsigned long sigl = tmp.sigl;
+
+ precision_loss = 1;
+ switch (control_word & CW_RC) {
+ case RC_RND:
+ increment = ((sigh & 0xff) > 0x80) /* more than half */
+ ||(((sigh & 0xff) == 0x80) && sigl) /* more than half */
+ ||((sigh & 0x180) == 0x180); /* round to even */
+ break;
+ case RC_DOWN: /* towards -infinity */
+ increment = signpositive(&tmp)
+ ? 0 : (sigl | (sigh & 0xff));
+ break;
+ case RC_UP: /* towards +infinity */
+ increment = signpositive(&tmp)
+ ? (sigl | (sigh & 0xff)) : 0;
+ break;
+ case RC_CHOP:
+ increment = 0;
+ break;
+ }
+
+ /* Truncate part of the mantissa */
+ tmp.sigl = 0;
+
+ if (increment) {
+ if (sigh >= 0xffffff00) {
+ /* The sigh part overflows */
+ tmp.sigh = 0x80000000;
+ exp++;
+ if (exp >= EXP_OVER)
+ goto overflow;
+ } else {
+ tmp.sigh &= 0xffffff00;
+ tmp.sigh += 0x100;
+ }
+ } else {
+ tmp.sigh &= 0xffffff00; /* Finish the truncation */
+ }
+ } else
+ precision_loss = 0;
+
+ templ = (tmp.sigh >> 8) & 0x007fffff;
+
+ if (exp > SINGLE_Emax) {
+ overflow:
+ EXCEPTION(EX_Overflow);
+ if (!(control_word & CW_Overflow))
+ return 0;
+ set_precision_flag_up();
+ if (!(control_word & CW_Precision))
+ return 0;
+
+ /* This is a special case: see sec 16.2.5.1 of the 80486 book. */
+ /* Masked response is overflow to infinity. */
+ templ = 0x7f800000;
+ } else {
+ if (precision_loss) {
+ if (increment)
+ set_precision_flag_up();
+ else
+ set_precision_flag_down();
+ }
+ /* Add the exponent */
+ templ |= ((exp + SINGLE_Ebias) & 0xff) << 23;
+ }
+ }
+ } else if (st0_tag == TAG_Zero) {
+ templ = 0;
+ } else if (st0_tag == TAG_Special) {
+ st0_tag = FPU_Special(st0_ptr);
+ if (st0_tag == TW_Denormal) {
+ reg_copy(st0_ptr, &tmp);
+
+ /* A denormal will always underflow. */
+#ifndef PECULIAR_486
+ /* An 80486 is supposed to be able to generate
+ a denormal exception here, but... */
+ /* Underflow has priority. */
+ if (control_word & CW_Underflow)
+ denormal_operand();
+#endif /* PECULIAR_486 */
+ goto denormal_arg;
+ } else if (st0_tag == TW_Infinity) {
+ templ = 0x7f800000;
+ } else if (st0_tag == TW_NaN) {
+ /* Is it really a NaN ? */
+ if ((exponent(st0_ptr) == EXP_OVER)
+ && (st0_ptr->sigh & 0x80000000)) {
+ /* See if we can get a valid NaN from the FPU_REG */
+ templ = st0_ptr->sigh >> 8;
+ if (!(st0_ptr->sigh & 0x40000000)) {
+ /* It is a signalling NaN */
+ EXCEPTION(EX_Invalid);
+ if (!(control_word & CW_Invalid))
+ return 0;
+ templ |= (0x40000000 >> 8);
+ }
+ templ |= 0x7f800000;
+ } else {
+ /* It is an unsupported data type */
+ EXCEPTION(EX_Invalid);
+ if (!(control_word & CW_Invalid))
+ return 0;
+ templ = 0xffc00000;
+ }
+ }
+#ifdef PARANOID
+ else {
+ EXCEPTION(EX_INTERNAL | 0x164);
+ return 0;
+ }
+#endif
+ } else if (st0_tag == TAG_Empty) {
+ /* Empty register (stack underflow) */
+ EXCEPTION(EX_StackUnder);
+ if (control_word & EX_Invalid) {
+ /* The masked response */
+ /* Put out the QNaN indefinite */
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_WRITE, single, 4);
+ FPU_put_user(0xffc00000,
+ (unsigned long __user *)single);
+ RE_ENTRANT_CHECK_ON;
+ return 1;
+ } else
+ return 0;
+ }
+#ifdef PARANOID
+ else {
+ EXCEPTION(EX_INTERNAL | 0x163);
+ return 0;
+ }
+#endif
+ if (getsign(st0_ptr))
+ templ |= 0x80000000;
+
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_WRITE, single, 4);
+ FPU_put_user(templ, (unsigned long __user *)single);
+ RE_ENTRANT_CHECK_ON;
+
+ return 1;
+}
+
+/* Put a long long into user memory */
+int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d)
+{
+ FPU_REG t;
+ long long tll;
+ int precision_loss;
+
+ if (st0_tag == TAG_Empty) {
+ /* Empty register (stack underflow) */
+ EXCEPTION(EX_StackUnder);
+ goto invalid_operand;
+ } else if (st0_tag == TAG_Special) {
+ st0_tag = FPU_Special(st0_ptr);
+ if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) {
+ EXCEPTION(EX_Invalid);
+ goto invalid_operand;
+ }
+ }
+
+ reg_copy(st0_ptr, &t);
+ precision_loss = FPU_round_to_int(&t, st0_tag);
+ ((long *)&tll)[0] = t.sigl;
+ ((long *)&tll)[1] = t.sigh;
+ if ((precision_loss == 1) ||
+ ((t.sigh & 0x80000000) &&
+ !((t.sigh == 0x80000000) && (t.sigl == 0) && signnegative(&t)))) {
+ EXCEPTION(EX_Invalid);
+ /* This is a special case: see sec 16.2.5.1 of the 80486 book */
+ invalid_operand:
+ if (control_word & EX_Invalid) {
+ /* Produce something like QNaN "indefinite" */
+ tll = 0x8000000000000000LL;
+ } else
+ return 0;
+ } else {
+ if (precision_loss)
+ set_precision_flag(precision_loss);
+ if (signnegative(&t))
+ tll = -tll;
+ }
+
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_WRITE, d, 8);
+ if (copy_to_user(d, &tll, 8))
+ FPU_abort;
+ RE_ENTRANT_CHECK_ON;
+
+ return 1;
+}
+
+/* Put a long into user memory */
+int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d)
+{
+ FPU_REG t;
+ int precision_loss;
+
+ if (st0_tag == TAG_Empty) {
+ /* Empty register (stack underflow) */
+ EXCEPTION(EX_StackUnder);
+ goto invalid_operand;
+ } else if (st0_tag == TAG_Special) {
+ st0_tag = FPU_Special(st0_ptr);
+ if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) {
+ EXCEPTION(EX_Invalid);
+ goto invalid_operand;
+ }
+ }
+
+ reg_copy(st0_ptr, &t);
+ precision_loss = FPU_round_to_int(&t, st0_tag);
+ if (t.sigh ||
+ ((t.sigl & 0x80000000) &&
+ !((t.sigl == 0x80000000) && signnegative(&t)))) {
+ EXCEPTION(EX_Invalid);
+ /* This is a special case: see sec 16.2.5.1 of the 80486 book */
+ invalid_operand:
+ if (control_word & EX_Invalid) {
+ /* Produce something like QNaN "indefinite" */
+ t.sigl = 0x80000000;
+ } else
+ return 0;
+ } else {
+ if (precision_loss)
+ set_precision_flag(precision_loss);
+ if (signnegative(&t))
+ t.sigl = -(long)t.sigl;
+ }
+
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_WRITE, d, 4);
+ FPU_put_user(t.sigl, (unsigned long __user *)d);
+ RE_ENTRANT_CHECK_ON;
+
+ return 1;
+}
+
+/* Put a short into user memory */
+int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d)
+{
+ FPU_REG t;
+ int precision_loss;
+
+ if (st0_tag == TAG_Empty) {
+ /* Empty register (stack underflow) */
+ EXCEPTION(EX_StackUnder);
+ goto invalid_operand;
+ } else if (st0_tag == TAG_Special) {
+ st0_tag = FPU_Special(st0_ptr);
+ if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) {
+ EXCEPTION(EX_Invalid);
+ goto invalid_operand;
+ }
+ }
+
+ reg_copy(st0_ptr, &t);
+ precision_loss = FPU_round_to_int(&t, st0_tag);
+ if (t.sigh ||
+ ((t.sigl & 0xffff8000) &&
+ !((t.sigl == 0x8000) && signnegative(&t)))) {
+ EXCEPTION(EX_Invalid);
+ /* This is a special case: see sec 16.2.5.1 of the 80486 book */
+ invalid_operand:
+ if (control_word & EX_Invalid) {
+ /* Produce something like QNaN "indefinite" */
+ t.sigl = 0x8000;
+ } else
+ return 0;
+ } else {
+ if (precision_loss)
+ set_precision_flag(precision_loss);
+ if (signnegative(&t))
+ t.sigl = -t.sigl;
+ }
+
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_WRITE, d, 2);
+ FPU_put_user((short)t.sigl, d);
+ RE_ENTRANT_CHECK_ON;
+
+ return 1;
+}
+
+/* Put a packed bcd array into user memory */
+int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d)
+{
+ FPU_REG t;
+ unsigned long long ll;
+ u_char b;
+ int i, precision_loss;
+ u_char sign = (getsign(st0_ptr) == SIGN_NEG) ? 0x80 : 0;
+
+ if (st0_tag == TAG_Empty) {
+ /* Empty register (stack underflow) */
+ EXCEPTION(EX_StackUnder);
+ goto invalid_operand;
+ } else if (st0_tag == TAG_Special) {
+ st0_tag = FPU_Special(st0_ptr);
+ if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) {
+ EXCEPTION(EX_Invalid);
+ goto invalid_operand;
+ }
+ }
+
+ reg_copy(st0_ptr, &t);
+ precision_loss = FPU_round_to_int(&t, st0_tag);
+ ll = significand(&t);
+
+ /* Check for overflow, by comparing with 999999999999999999 decimal. */
+ if ((t.sigh > 0x0de0b6b3) ||
+ ((t.sigh == 0x0de0b6b3) && (t.sigl > 0xa763ffff))) {
+ EXCEPTION(EX_Invalid);
+ /* This is a special case: see sec 16.2.5.1 of the 80486 book */
+ invalid_operand:
+ if (control_word & CW_Invalid) {
+ /* Produce the QNaN "indefinite" */
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_WRITE, d, 10);
+ for (i = 0; i < 7; i++)
+ FPU_put_user(0, d + i); /* These bytes "undefined" */
+ FPU_put_user(0xc0, d + 7); /* This byte "undefined" */
+ FPU_put_user(0xff, d + 8);
+ FPU_put_user(0xff, d + 9);
+ RE_ENTRANT_CHECK_ON;
+ return 1;
+ } else
+ return 0;
+ } else if (precision_loss) {
+ /* Precision loss doesn't stop the data transfer */
+ set_precision_flag(precision_loss);
+ }
+
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_WRITE, d, 10);
+ RE_ENTRANT_CHECK_ON;
+ for (i = 0; i < 9; i++) {
+ b = FPU_div_small(&ll, 10);
+ b |= (FPU_div_small(&ll, 10)) << 4;
+ RE_ENTRANT_CHECK_OFF;
+ FPU_put_user(b, d + i);
+ RE_ENTRANT_CHECK_ON;
+ }
+ RE_ENTRANT_CHECK_OFF;
+ FPU_put_user(sign, d + 9);
+ RE_ENTRANT_CHECK_ON;
+
+ return 1;
+}
+
+/*===========================================================================*/
+
+/* r gets mangled such that sig is int, sign:
+ it is NOT normalized */
+/* The return value (in eax) is zero if the result is exact,
+ if bits are changed due to rounding, truncation, etc, then
+ a non-zero value is returned */
+/* Overflow is signalled by a non-zero return value (in eax).
+ In the case of overflow, the returned significand always has the
+ largest possible value */
+int FPU_round_to_int(FPU_REG *r, u_char tag)
+{
+ u_char very_big;
+ unsigned eax;
+
+ if (tag == TAG_Zero) {
+ /* Make sure that zero is returned */
+ significand(r) = 0;
+ return 0; /* o.k. */
+ }
+
+ if (exponent(r) > 63) {
+ r->sigl = r->sigh = ~0; /* The largest representable number */
+ return 1; /* overflow */
+ }
+
+ eax = FPU_shrxs(&r->sigl, 63 - exponent(r));
+ very_big = !(~(r->sigh) | ~(r->sigl)); /* test for 0xfff...fff */
+#define half_or_more (eax & 0x80000000)
+#define frac_part (eax)
+#define more_than_half ((eax & 0x80000001) == 0x80000001)
+ switch (control_word & CW_RC) {
+ case RC_RND:
+ if (more_than_half /* nearest */
+ || (half_or_more && (r->sigl & 1))) { /* odd -> even */
+ if (very_big)
+ return 1; /* overflow */
+ significand(r)++;
+ return PRECISION_LOST_UP;
+ }
+ break;
+ case RC_DOWN:
+ if (frac_part && getsign(r)) {
+ if (very_big)
+ return 1; /* overflow */
+ significand(r)++;
+ return PRECISION_LOST_UP;
+ }
+ break;
+ case RC_UP:
+ if (frac_part && !getsign(r)) {
+ if (very_big)
+ return 1; /* overflow */
+ significand(r)++;
+ return PRECISION_LOST_UP;
+ }
+ break;
+ case RC_CHOP:
+ break;
+ }
+
+ return eax ? PRECISION_LOST_DOWN : 0;
+
+}
+
+/*===========================================================================*/
+
+u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s)
+{
+ unsigned short tag_word = 0;
+ u_char tag;
+ int i;
+
+ if ((addr_modes.default_mode == VM86) ||
+ ((addr_modes.default_mode == PM16)
+ ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX))) {
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_READ, s, 0x0e);
+ FPU_get_user(control_word, (unsigned short __user *)s);
+ FPU_get_user(partial_status, (unsigned short __user *)(s + 2));
+ FPU_get_user(tag_word, (unsigned short __user *)(s + 4));
+ FPU_get_user(instruction_address.offset,
+ (unsigned short __user *)(s + 6));
+ FPU_get_user(instruction_address.selector,
+ (unsigned short __user *)(s + 8));
+ FPU_get_user(operand_address.offset,
+ (unsigned short __user *)(s + 0x0a));
+ FPU_get_user(operand_address.selector,
+ (unsigned short __user *)(s + 0x0c));
+ RE_ENTRANT_CHECK_ON;
+ s += 0x0e;
+ if (addr_modes.default_mode == VM86) {
+ instruction_address.offset
+ += (instruction_address.selector & 0xf000) << 4;
+ operand_address.offset +=
+ (operand_address.selector & 0xf000) << 4;
+ }
+ } else {
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_READ, s, 0x1c);
+ FPU_get_user(control_word, (unsigned short __user *)s);
+ FPU_get_user(partial_status, (unsigned short __user *)(s + 4));
+ FPU_get_user(tag_word, (unsigned short __user *)(s + 8));
+ FPU_get_user(instruction_address.offset,
+ (unsigned long __user *)(s + 0x0c));
+ FPU_get_user(instruction_address.selector,
+ (unsigned short __user *)(s + 0x10));
+ FPU_get_user(instruction_address.opcode,
+ (unsigned short __user *)(s + 0x12));
+ FPU_get_user(operand_address.offset,
+ (unsigned long __user *)(s + 0x14));
+ FPU_get_user(operand_address.selector,
+ (unsigned long __user *)(s + 0x18));
+ RE_ENTRANT_CHECK_ON;
+ s += 0x1c;
+ }
+
+#ifdef PECULIAR_486
+ control_word &= ~0xe080;
+#endif /* PECULIAR_486 */
+
+ top = (partial_status >> SW_Top_Shift) & 7;
+
+ if (partial_status & ~control_word & CW_Exceptions)
+ partial_status |= (SW_Summary | SW_Backward);
+ else
+ partial_status &= ~(SW_Summary | SW_Backward);
+
+ for (i = 0; i < 8; i++) {
+ tag = tag_word & 3;
+ tag_word >>= 2;
+
+ if (tag == TAG_Empty)
+ /* New tag is empty. Accept it */
+ FPU_settag(i, TAG_Empty);
+ else if (FPU_gettag(i) == TAG_Empty) {
+ /* Old tag is empty and new tag is not empty. New tag is determined
+ by old reg contents */
+ if (exponent(&fpu_register(i)) == -EXTENDED_Ebias) {
+ if (!
+ (fpu_register(i).sigl | fpu_register(i).
+ sigh))
+ FPU_settag(i, TAG_Zero);
+ else
+ FPU_settag(i, TAG_Special);
+ } else if (exponent(&fpu_register(i)) ==
+ 0x7fff - EXTENDED_Ebias) {
+ FPU_settag(i, TAG_Special);
+ } else if (fpu_register(i).sigh & 0x80000000)
+ FPU_settag(i, TAG_Valid);
+ else
+ FPU_settag(i, TAG_Special); /* An Un-normal */
+ }
+ /* Else old tag is not empty and new tag is not empty. Old tag
+ remains correct */
+ }
+
+ return s;
+}
+
+void frstor(fpu_addr_modes addr_modes, u_char __user *data_address)
+{
+ int i, regnr;
+ u_char __user *s = fldenv(addr_modes, data_address);
+ int offset = (top & 7) * 10, other = 80 - offset;
+
+ /* Copy all registers in stack order. */
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_READ, s, 80);
+ __copy_from_user(register_base + offset, s, other);
+ if (offset)
+ __copy_from_user(register_base, s + other, offset);
+ RE_ENTRANT_CHECK_ON;
+
+ for (i = 0; i < 8; i++) {
+ regnr = (i + top) & 7;
+ if (FPU_gettag(regnr) != TAG_Empty)
+ /* The loaded data over-rides all other cases. */
+ FPU_settag(regnr, FPU_tagof(&st(i)));
+ }
+
+}
+
+u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d)
+{
+ if ((addr_modes.default_mode == VM86) ||
+ ((addr_modes.default_mode == PM16)
+ ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX))) {
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_WRITE, d, 14);
+#ifdef PECULIAR_486
+ FPU_put_user(control_word & ~0xe080, (unsigned long __user *)d);
+#else
+ FPU_put_user(control_word, (unsigned short __user *)d);
+#endif /* PECULIAR_486 */
+ FPU_put_user(status_word(), (unsigned short __user *)(d + 2));
+ FPU_put_user(fpu_tag_word, (unsigned short __user *)(d + 4));
+ FPU_put_user(instruction_address.offset,
+ (unsigned short __user *)(d + 6));
+ FPU_put_user(operand_address.offset,
+ (unsigned short __user *)(d + 0x0a));
+ if (addr_modes.default_mode == VM86) {
+ FPU_put_user((instruction_address.
+ offset & 0xf0000) >> 4,
+ (unsigned short __user *)(d + 8));
+ FPU_put_user((operand_address.offset & 0xf0000) >> 4,
+ (unsigned short __user *)(d + 0x0c));
+ } else {
+ FPU_put_user(instruction_address.selector,
+ (unsigned short __user *)(d + 8));
+ FPU_put_user(operand_address.selector,
+ (unsigned short __user *)(d + 0x0c));
+ }
+ RE_ENTRANT_CHECK_ON;
+ d += 0x0e;
+ } else {
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_WRITE, d, 7 * 4);
+#ifdef PECULIAR_486
+ control_word &= ~0xe080;
+ /* An 80486 sets nearly all of the reserved bits to 1. */
+ control_word |= 0xffff0040;
+ partial_status = status_word() | 0xffff0000;
+ fpu_tag_word |= 0xffff0000;
+ I387->soft.fcs &= ~0xf8000000;
+ I387->soft.fos |= 0xffff0000;
+#endif /* PECULIAR_486 */
+ if (__copy_to_user(d, &control_word, 7 * 4))
+ FPU_abort;
+ RE_ENTRANT_CHECK_ON;
+ d += 0x1c;
+ }
+
+ control_word |= CW_Exceptions;
+ partial_status &= ~(SW_Summary | SW_Backward);
+
+ return d;
+}
+
+void fsave(fpu_addr_modes addr_modes, u_char __user *data_address)
+{
+ u_char __user *d;
+ int offset = (top & 7) * 10, other = 80 - offset;
+
+ d = fstenv(addr_modes, data_address);
+
+ RE_ENTRANT_CHECK_OFF;
+ FPU_access_ok(VERIFY_WRITE, d, 80);
+
+ /* Copy all registers in stack order. */
+ if (__copy_to_user(d, register_base + offset, other))
+ FPU_abort;
+ if (offset)
+ if (__copy_to_user(d + other, register_base, offset))
+ FPU_abort;
+ RE_ENTRANT_CHECK_ON;
+
+ finit();
+}
+
+/*===========================================================================*/
diff --git a/arch/x86/math-emu/reg_mul.c b/arch/x86/math-emu/reg_mul.c
new file mode 100644
index 000000000..d69618572
--- /dev/null
+++ b/arch/x86/math-emu/reg_mul.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0
+/*---------------------------------------------------------------------------+
+ | reg_mul.c |
+ | |
+ | Multiply one FPU_REG by another, put the result in a destination FPU_REG. |
+ | |
+ | Copyright (C) 1992,1993,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@suburbia.net |
+ | |
+ | Returns the tag of the result if no exceptions or errors occurred. |
+ | |
+ +---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------+
+ | The destination may be any FPU_REG, including one of the source FPU_REGs. |
+ +---------------------------------------------------------------------------*/
+
+#include "fpu_emu.h"
+#include "exception.h"
+#include "reg_constant.h"
+#include "fpu_system.h"
+
+/*
+ Multiply two registers to give a register result.
+ The sources are st(deststnr) and (b,tagb,signb).
+ The destination is st(deststnr).
+ */
+/* This routine must be called with non-empty source registers */
+int FPU_mul(FPU_REG const *b, u_char tagb, int deststnr, int control_w)
+{
+ FPU_REG *a = &st(deststnr);
+ FPU_REG *dest = a;
+ u_char taga = FPU_gettagi(deststnr);
+ u_char saved_sign = getsign(dest);
+ u_char sign = (getsign(a) ^ getsign(b));
+ int tag;
+
+ if (!(taga | tagb)) {
+ /* Both regs Valid, this should be the most common case. */
+
+ tag =
+ FPU_u_mul(a, b, dest, control_w, sign,
+ exponent(a) + exponent(b));
+ if (tag < 0) {
+ setsign(dest, saved_sign);
+ return tag;
+ }
+ FPU_settagi(deststnr, tag);
+ return tag;
+ }
+
+ if (taga == TAG_Special)
+ taga = FPU_Special(a);
+ if (tagb == TAG_Special)
+ tagb = FPU_Special(b);
+
+ if (((taga == TAG_Valid) && (tagb == TW_Denormal))
+ || ((taga == TW_Denormal) && (tagb == TAG_Valid))
+ || ((taga == TW_Denormal) && (tagb == TW_Denormal))) {
+ FPU_REG x, y;
+ if (denormal_operand() < 0)
+ return FPU_Exception;
+
+ FPU_to_exp16(a, &x);
+ FPU_to_exp16(b, &y);
+ tag = FPU_u_mul(&x, &y, dest, control_w, sign,
+ exponent16(&x) + exponent16(&y));
+ if (tag < 0) {
+ setsign(dest, saved_sign);
+ return tag;
+ }
+ FPU_settagi(deststnr, tag);
+ return tag;
+ } else if ((taga <= TW_Denormal) && (tagb <= TW_Denormal)) {
+ if (((tagb == TW_Denormal) || (taga == TW_Denormal))
+ && (denormal_operand() < 0))
+ return FPU_Exception;
+
+ /* Must have either both arguments == zero, or
+ one valid and the other zero.
+ The result is therefore zero. */
+ FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr);
+ /* The 80486 book says that the answer is +0, but a real
+ 80486 behaves this way.
+ IEEE-754 apparently says it should be this way. */
+ setsign(dest, sign);
+ return TAG_Zero;
+ }
+ /* Must have infinities, NaNs, etc */
+ else if ((taga == TW_NaN) || (tagb == TW_NaN)) {
+ return real_2op_NaN(b, tagb, deststnr, &st(0));
+ } else if (((taga == TW_Infinity) && (tagb == TAG_Zero))
+ || ((tagb == TW_Infinity) && (taga == TAG_Zero))) {
+ return arith_invalid(deststnr); /* Zero*Infinity is invalid */
+ } else if (((taga == TW_Denormal) || (tagb == TW_Denormal))
+ && (denormal_operand() < 0)) {
+ return FPU_Exception;
+ } else if (taga == TW_Infinity) {
+ FPU_copy_to_regi(a, TAG_Special, deststnr);
+ setsign(dest, sign);
+ return TAG_Special;
+ } else if (tagb == TW_Infinity) {
+ FPU_copy_to_regi(b, TAG_Special, deststnr);
+ setsign(dest, sign);
+ return TAG_Special;
+ }
+#ifdef PARANOID
+ else {
+ EXCEPTION(EX_INTERNAL | 0x102);
+ return FPU_Exception;
+ }
+#endif /* PARANOID */
+
+ return 0;
+}
diff --git a/arch/x86/math-emu/reg_norm.S b/arch/x86/math-emu/reg_norm.S
new file mode 100644
index 000000000..7f6b4392a
--- /dev/null
+++ b/arch/x86/math-emu/reg_norm.S
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*---------------------------------------------------------------------------+
+ | reg_norm.S |
+ | |
+ | Copyright (C) 1992,1993,1994,1995,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@suburbia.net |
+ | |
+ | Normalize the value in a FPU_REG. |
+ | |
+ | Call from C as: |
+ | int FPU_normalize(FPU_REG *n) |
+ | |
+ | int FPU_normalize_nuo(FPU_REG *n) |
+ | |
+ | Return value is the tag of the answer, or-ed with FPU_Exception if |
+ | one was raised, or -1 on internal error. |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#include "fpu_emu.h"
+
+
+.text
+ENTRY(FPU_normalize)
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %ebx
+
+ movl PARAM1,%ebx
+
+ movl SIGH(%ebx),%edx
+ movl SIGL(%ebx),%eax
+
+ orl %edx,%edx /* ms bits */
+ js L_done /* Already normalized */
+ jnz L_shift_1 /* Shift left 1 - 31 bits */
+
+ orl %eax,%eax
+ jz L_zero /* The contents are zero */
+
+ movl %eax,%edx
+ xorl %eax,%eax
+ subw $32,EXP(%ebx) /* This can cause an underflow */
+
+/* We need to shift left by 1 - 31 bits */
+L_shift_1:
+ bsrl %edx,%ecx /* get the required shift in %ecx */
+ subl $31,%ecx
+ negl %ecx
+ shld %cl,%eax,%edx
+ shl %cl,%eax
+ subw %cx,EXP(%ebx) /* This can cause an underflow */
+
+ movl %edx,SIGH(%ebx)
+ movl %eax,SIGL(%ebx)
+
+L_done:
+ cmpw EXP_OVER,EXP(%ebx)
+ jge L_overflow
+
+ cmpw EXP_UNDER,EXP(%ebx)
+ jle L_underflow
+
+L_exit_valid:
+ movl TAG_Valid,%eax
+
+ /* Convert the exponent to 80x87 form. */
+ addw EXTENDED_Ebias,EXP(%ebx)
+ andw $0x7fff,EXP(%ebx)
+
+L_exit:
+ popl %ebx
+ leave
+ ret
+
+
+L_zero:
+ movw $0,EXP(%ebx)
+ movl TAG_Zero,%eax
+ jmp L_exit
+
+L_underflow:
+ /* Convert the exponent to 80x87 form. */
+ addw EXTENDED_Ebias,EXP(%ebx)
+ push %ebx
+ call arith_underflow
+ pop %ebx
+ jmp L_exit
+
+L_overflow:
+ /* Convert the exponent to 80x87 form. */
+ addw EXTENDED_Ebias,EXP(%ebx)
+ push %ebx
+ call arith_overflow
+ pop %ebx
+ jmp L_exit
+ENDPROC(FPU_normalize)
+
+
+
+/* Normalise without reporting underflow or overflow */
+ENTRY(FPU_normalize_nuo)
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %ebx
+
+ movl PARAM1,%ebx
+
+ movl SIGH(%ebx),%edx
+ movl SIGL(%ebx),%eax
+
+ orl %edx,%edx /* ms bits */
+ js L_exit_nuo_valid /* Already normalized */
+ jnz L_nuo_shift_1 /* Shift left 1 - 31 bits */
+
+ orl %eax,%eax
+ jz L_exit_nuo_zero /* The contents are zero */
+
+ movl %eax,%edx
+ xorl %eax,%eax
+ subw $32,EXP(%ebx) /* This can cause an underflow */
+
+/* We need to shift left by 1 - 31 bits */
+L_nuo_shift_1:
+ bsrl %edx,%ecx /* get the required shift in %ecx */
+ subl $31,%ecx
+ negl %ecx
+ shld %cl,%eax,%edx
+ shl %cl,%eax
+ subw %cx,EXP(%ebx) /* This can cause an underflow */
+
+ movl %edx,SIGH(%ebx)
+ movl %eax,SIGL(%ebx)
+
+L_exit_nuo_valid:
+ movl TAG_Valid,%eax
+
+ popl %ebx
+ leave
+ ret
+
+L_exit_nuo_zero:
+ movl TAG_Zero,%eax
+ movw EXP_UNDER,EXP(%ebx)
+
+ popl %ebx
+ leave
+ ret
+ENDPROC(FPU_normalize_nuo)
diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S
new file mode 100644
index 000000000..04563421e
--- /dev/null
+++ b/arch/x86/math-emu/reg_round.S
@@ -0,0 +1,711 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+ .file "reg_round.S"
+/*---------------------------------------------------------------------------+
+ | reg_round.S |
+ | |
+ | Rounding/truncation/etc for FPU basic arithmetic functions. |
+ | |
+ | Copyright (C) 1993,1995,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@suburbia.net |
+ | |
+ | This code has four possible entry points. |
+ | The following must be entered by a jmp instruction: |
+ | fpu_reg_round, fpu_reg_round_sqrt, and fpu_Arith_exit. |
+ | |
+ | The FPU_round entry point is intended to be used by C code. |
+ | From C, call as: |
+ | int FPU_round(FPU_REG *arg, unsigned int extent, unsigned int control_w) |
+ | |
+ | Return value is the tag of the answer, or-ed with FPU_Exception if |
+ | one was raised, or -1 on internal error. |
+ | |
+ | For correct "up" and "down" rounding, the argument must have the correct |
+ | sign. |
+ | |
+ +---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------+
+ | Four entry points. |
+ | |
+ | Needed by both the fpu_reg_round and fpu_reg_round_sqrt entry points: |
+ | %eax:%ebx 64 bit significand |
+ | %edx 32 bit extension of the significand |
+ | %edi pointer to an FPU_REG for the result to be stored |
+ | stack calling function must have set up a C stack frame and |
+ | pushed %esi, %edi, and %ebx |
+ | |
+ | Needed just for the fpu_reg_round_sqrt entry point: |
+ | %cx A control word in the same format as the FPU control word. |
+ | Otherwise, PARAM4 must give such a value. |
+ | |
+ | |
+ | The significand and its extension are assumed to be exact in the |
+ | following sense: |
+ | If the significand by itself is the exact result then the significand |
+ | extension (%edx) must contain 0, otherwise the significand extension |
+ | must be non-zero. |
+ | If the significand extension is non-zero then the significand is |
+ | smaller than the magnitude of the correct exact result by an amount |
+ | greater than zero and less than one ls bit of the significand. |
+ | The significand extension is only required to have three possible |
+ | non-zero values: |
+ | less than 0x80000000 <=> the significand is less than 1/2 an ls |
+ | bit smaller than the magnitude of the |
+ | true exact result. |
+ | exactly 0x80000000 <=> the significand is exactly 1/2 an ls bit |
+ | smaller than the magnitude of the true |
+ | exact result. |
+ | greater than 0x80000000 <=> the significand is more than 1/2 an ls |
+ | bit smaller than the magnitude of the |
+ | true exact result. |
+ | |
+ +---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------+
+ | The code in this module has become quite complex, but it should handle |
+ | all of the FPU flags which are set at this stage of the basic arithmetic |
+ | computations. |
+ | There are a few rare cases where the results are not set identically to |
+ | a real FPU. These require a bit more thought because at this stage the |
+ | results of the code here appear to be more consistent... |
+ | This may be changed in a future version. |
+ +---------------------------------------------------------------------------*/
+
+
+#include "fpu_emu.h"
+#include "exception.h"
+#include "control_w.h"
+
+/* Flags for FPU_bits_lost */
+#define LOST_DOWN $1
+#define LOST_UP $2
+
+/* Flags for FPU_denormal */
+#define DENORMAL $1
+#define UNMASKED_UNDERFLOW $2
+
+
+#ifndef NON_REENTRANT_FPU
+/* Make the code re-entrant by putting
+ local storage on the stack: */
+#define FPU_bits_lost (%esp)
+#define FPU_denormal 1(%esp)
+
+#else
+/* Not re-entrant, so we can gain speed by putting
+ local storage in a static area: */
+.data
+ .align 4,0
+FPU_bits_lost:
+ .byte 0
+FPU_denormal:
+ .byte 0
+#endif /* NON_REENTRANT_FPU */
+
+
+.text
+.globl fpu_reg_round
+.globl fpu_Arith_exit
+
+/* Entry point when called from C */
+ENTRY(FPU_round)
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %edi
+ pushl %ebx
+
+ movl PARAM1,%edi
+ movl SIGH(%edi),%eax
+ movl SIGL(%edi),%ebx
+ movl PARAM2,%edx
+
+fpu_reg_round: /* Normal entry point */
+ movl PARAM4,%ecx
+
+#ifndef NON_REENTRANT_FPU
+ pushl %ebx /* adjust the stack pointer */
+#endif /* NON_REENTRANT_FPU */
+
+#ifdef PARANOID
+/* Cannot use this here yet */
+/* orl %eax,%eax */
+/* jns L_entry_bugged */
+#endif /* PARANOID */
+
+ cmpw EXP_UNDER,EXP(%edi)
+ jle L_Make_denorm /* The number is a de-normal */
+
+ movb $0,FPU_denormal /* 0 -> not a de-normal */
+
+Denorm_done:
+ movb $0,FPU_bits_lost /* No bits yet lost in rounding */
+
+ movl %ecx,%esi
+ andl CW_PC,%ecx
+ cmpl PR_64_BITS,%ecx
+ je LRound_To_64
+
+ cmpl PR_53_BITS,%ecx
+ je LRound_To_53
+
+ cmpl PR_24_BITS,%ecx
+ je LRound_To_24
+
+#ifdef PECULIAR_486
+/* With the precision control bits set to 01 "(reserved)", a real 80486
+ behaves as if the precision control bits were set to 11 "64 bits" */
+ cmpl PR_RESERVED_BITS,%ecx
+ je LRound_To_64
+#ifdef PARANOID
+ jmp L_bugged_denorm_486
+#endif /* PARANOID */
+#else
+#ifdef PARANOID
+ jmp L_bugged_denorm /* There is no bug, just a bad control word */
+#endif /* PARANOID */
+#endif /* PECULIAR_486 */
+
+
+/* Round etc to 24 bit precision */
+LRound_To_24:
+ movl %esi,%ecx
+ andl CW_RC,%ecx
+ cmpl RC_RND,%ecx
+ je LRound_nearest_24
+
+ cmpl RC_CHOP,%ecx
+ je LCheck_truncate_24
+
+ cmpl RC_UP,%ecx /* Towards +infinity */
+ je LUp_24
+
+ cmpl RC_DOWN,%ecx /* Towards -infinity */
+ je LDown_24
+
+#ifdef PARANOID
+ jmp L_bugged_round24
+#endif /* PARANOID */
+
+LUp_24:
+ cmpb SIGN_POS,PARAM5
+ jne LCheck_truncate_24 /* If negative then up==truncate */
+
+ jmp LCheck_24_round_up
+
+LDown_24:
+ cmpb SIGN_POS,PARAM5
+ je LCheck_truncate_24 /* If positive then down==truncate */
+
+LCheck_24_round_up:
+ movl %eax,%ecx
+ andl $0x000000ff,%ecx
+ orl %ebx,%ecx
+ orl %edx,%ecx
+ jnz LDo_24_round_up
+ jmp L_Re_normalise
+
+LRound_nearest_24:
+ /* Do rounding of the 24th bit if needed (nearest or even) */
+ movl %eax,%ecx
+ andl $0x000000ff,%ecx
+ cmpl $0x00000080,%ecx
+ jc LCheck_truncate_24 /* less than half, no increment needed */
+
+ jne LGreater_Half_24 /* greater than half, increment needed */
+
+ /* Possibly half, we need to check the ls bits */
+ orl %ebx,%ebx
+ jnz LGreater_Half_24 /* greater than half, increment needed */
+
+ orl %edx,%edx
+ jnz LGreater_Half_24 /* greater than half, increment needed */
+
+ /* Exactly half, increment only if 24th bit is 1 (round to even) */
+ testl $0x00000100,%eax
+ jz LDo_truncate_24
+
+LGreater_Half_24: /* Rounding: increment at the 24th bit */
+LDo_24_round_up:
+ andl $0xffffff00,%eax /* Truncate to 24 bits */
+ xorl %ebx,%ebx
+ movb LOST_UP,FPU_bits_lost
+ addl $0x00000100,%eax
+ jmp LCheck_Round_Overflow
+
+LCheck_truncate_24:
+ movl %eax,%ecx
+ andl $0x000000ff,%ecx
+ orl %ebx,%ecx
+ orl %edx,%ecx
+ jz L_Re_normalise /* No truncation needed */
+
+LDo_truncate_24:
+ andl $0xffffff00,%eax /* Truncate to 24 bits */
+ xorl %ebx,%ebx
+ movb LOST_DOWN,FPU_bits_lost
+ jmp L_Re_normalise
+
+
+/* Round etc to 53 bit precision */
+LRound_To_53:
+ movl %esi,%ecx
+ andl CW_RC,%ecx
+ cmpl RC_RND,%ecx
+ je LRound_nearest_53
+
+ cmpl RC_CHOP,%ecx
+ je LCheck_truncate_53
+
+ cmpl RC_UP,%ecx /* Towards +infinity */
+ je LUp_53
+
+ cmpl RC_DOWN,%ecx /* Towards -infinity */
+ je LDown_53
+
+#ifdef PARANOID
+ jmp L_bugged_round53
+#endif /* PARANOID */
+
+LUp_53:
+ cmpb SIGN_POS,PARAM5
+ jne LCheck_truncate_53 /* If negative then up==truncate */
+
+ jmp LCheck_53_round_up
+
+LDown_53:
+ cmpb SIGN_POS,PARAM5
+ je LCheck_truncate_53 /* If positive then down==truncate */
+
+LCheck_53_round_up:
+ movl %ebx,%ecx
+ andl $0x000007ff,%ecx
+ orl %edx,%ecx
+ jnz LDo_53_round_up
+ jmp L_Re_normalise
+
+LRound_nearest_53:
+ /* Do rounding of the 53rd bit if needed (nearest or even) */
+ movl %ebx,%ecx
+ andl $0x000007ff,%ecx
+ cmpl $0x00000400,%ecx
+ jc LCheck_truncate_53 /* less than half, no increment needed */
+
+ jnz LGreater_Half_53 /* greater than half, increment needed */
+
+ /* Possibly half, we need to check the ls bits */
+ orl %edx,%edx
+ jnz LGreater_Half_53 /* greater than half, increment needed */
+
+ /* Exactly half, increment only if 53rd bit is 1 (round to even) */
+ testl $0x00000800,%ebx
+ jz LTruncate_53
+
+LGreater_Half_53: /* Rounding: increment at the 53rd bit */
+LDo_53_round_up:
+ movb LOST_UP,FPU_bits_lost
+ andl $0xfffff800,%ebx /* Truncate to 53 bits */
+ addl $0x00000800,%ebx
+ adcl $0,%eax
+ jmp LCheck_Round_Overflow
+
+LCheck_truncate_53:
+ movl %ebx,%ecx
+ andl $0x000007ff,%ecx
+ orl %edx,%ecx
+ jz L_Re_normalise
+
+LTruncate_53:
+ movb LOST_DOWN,FPU_bits_lost
+ andl $0xfffff800,%ebx /* Truncate to 53 bits */
+ jmp L_Re_normalise
+
+
+/* Round etc to 64 bit precision */
+LRound_To_64:
+ movl %esi,%ecx
+ andl CW_RC,%ecx
+ cmpl RC_RND,%ecx
+ je LRound_nearest_64
+
+ cmpl RC_CHOP,%ecx
+ je LCheck_truncate_64
+
+ cmpl RC_UP,%ecx /* Towards +infinity */
+ je LUp_64
+
+ cmpl RC_DOWN,%ecx /* Towards -infinity */
+ je LDown_64
+
+#ifdef PARANOID
+ jmp L_bugged_round64
+#endif /* PARANOID */
+
+LUp_64:
+ cmpb SIGN_POS,PARAM5
+ jne LCheck_truncate_64 /* If negative then up==truncate */
+
+ orl %edx,%edx
+ jnz LDo_64_round_up
+ jmp L_Re_normalise
+
+LDown_64:
+ cmpb SIGN_POS,PARAM5
+ je LCheck_truncate_64 /* If positive then down==truncate */
+
+ orl %edx,%edx
+ jnz LDo_64_round_up
+ jmp L_Re_normalise
+
+LRound_nearest_64:
+ cmpl $0x80000000,%edx
+ jc LCheck_truncate_64
+
+ jne LDo_64_round_up
+
+ /* Now test for round-to-even */
+ testb $1,%bl
+ jz LCheck_truncate_64
+
+LDo_64_round_up:
+ movb LOST_UP,FPU_bits_lost
+ addl $1,%ebx
+ adcl $0,%eax
+
+LCheck_Round_Overflow:
+ jnc L_Re_normalise
+
+ /* Overflow, adjust the result (significand to 1.0) */
+ rcrl $1,%eax
+ rcrl $1,%ebx
+ incw EXP(%edi)
+ jmp L_Re_normalise
+
+LCheck_truncate_64:
+ orl %edx,%edx
+ jz L_Re_normalise
+
+LTruncate_64:
+ movb LOST_DOWN,FPU_bits_lost
+
+L_Re_normalise:
+ testb $0xff,FPU_denormal
+ jnz Normalise_result
+
+L_Normalised:
+ movl TAG_Valid,%edx
+
+L_deNormalised:
+ cmpb LOST_UP,FPU_bits_lost
+ je L_precision_lost_up
+
+ cmpb LOST_DOWN,FPU_bits_lost
+ je L_precision_lost_down
+
+L_no_precision_loss:
+ /* store the result */
+
+L_Store_significand:
+ movl %eax,SIGH(%edi)
+ movl %ebx,SIGL(%edi)
+
+ cmpw EXP_OVER,EXP(%edi)
+ jge L_overflow
+
+ movl %edx,%eax
+
+ /* Convert the exponent to 80x87 form. */
+ addw EXTENDED_Ebias,EXP(%edi)
+ andw $0x7fff,EXP(%edi)
+
+fpu_reg_round_signed_special_exit:
+
+ cmpb SIGN_POS,PARAM5
+ je fpu_reg_round_special_exit
+
+ orw $0x8000,EXP(%edi) /* Negative sign for the result. */
+
+fpu_reg_round_special_exit:
+
+#ifndef NON_REENTRANT_FPU
+ popl %ebx /* adjust the stack pointer */
+#endif /* NON_REENTRANT_FPU */
+
+fpu_Arith_exit:
+ popl %ebx
+ popl %edi
+ popl %esi
+ leave
+ ret
+
+
+/*
+ * Set the FPU status flags to represent precision loss due to
+ * round-up.
+ */
+L_precision_lost_up:
+ push %edx
+ push %eax
+ call set_precision_flag_up
+ popl %eax
+ popl %edx
+ jmp L_no_precision_loss
+
+/*
+ * Set the FPU status flags to represent precision loss due to
+ * truncation.
+ */
+L_precision_lost_down:
+ push %edx
+ push %eax
+ call set_precision_flag_down
+ popl %eax
+ popl %edx
+ jmp L_no_precision_loss
+
+
+/*
+ * The number is a denormal (which might get rounded up to a normal)
+ * Shift the number right the required number of bits, which will
+ * have to be undone later...
+ */
+L_Make_denorm:
+ /* The action to be taken depends upon whether the underflow
+ exception is masked */
+ testb CW_Underflow,%cl /* Underflow mask. */
+ jz Unmasked_underflow /* Do not make a denormal. */
+
+ movb DENORMAL,FPU_denormal
+
+ pushl %ecx /* Save */
+ movw EXP_UNDER+1,%cx
+ subw EXP(%edi),%cx
+
+ cmpw $64,%cx /* shrd only works for 0..31 bits */
+ jnc Denorm_shift_more_than_63
+
+ cmpw $32,%cx /* shrd only works for 0..31 bits */
+ jnc Denorm_shift_more_than_32
+
+/*
+ * We got here without jumps by assuming that the most common requirement
+ * is for a small de-normalising shift.
+ * Shift by [1..31] bits
+ */
+ addw %cx,EXP(%edi)
+ orl %edx,%edx /* extension */
+ setne %ch /* Save whether %edx is non-zero */
+ xorl %edx,%edx
+ shrd %cl,%ebx,%edx
+ shrd %cl,%eax,%ebx
+ shr %cl,%eax
+ orb %ch,%dl
+ popl %ecx
+ jmp Denorm_done
+
+/* Shift by [32..63] bits */
+Denorm_shift_more_than_32:
+ addw %cx,EXP(%edi)
+ subb $32,%cl
+ orl %edx,%edx
+ setne %ch
+ orb %ch,%bl
+ xorl %edx,%edx
+ shrd %cl,%ebx,%edx
+ shrd %cl,%eax,%ebx
+ shr %cl,%eax
+ orl %edx,%edx /* test these 32 bits */
+ setne %cl
+ orb %ch,%bl
+ orb %cl,%bl
+ movl %ebx,%edx
+ movl %eax,%ebx
+ xorl %eax,%eax
+ popl %ecx
+ jmp Denorm_done
+
+/* Shift by [64..) bits */
+Denorm_shift_more_than_63:
+ cmpw $64,%cx
+ jne Denorm_shift_more_than_64
+
+/* Exactly 64 bit shift */
+ addw %cx,EXP(%edi)
+ xorl %ecx,%ecx
+ orl %edx,%edx
+ setne %cl
+ orl %ebx,%ebx
+ setne %ch
+ orb %ch,%cl
+ orb %cl,%al
+ movl %eax,%edx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ popl %ecx
+ jmp Denorm_done
+
+Denorm_shift_more_than_64:
+ movw EXP_UNDER+1,EXP(%edi)
+/* This is easy, %eax must be non-zero, so.. */
+ movl $1,%edx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+ popl %ecx
+ jmp Denorm_done
+
+
+Unmasked_underflow:
+ movb UNMASKED_UNDERFLOW,FPU_denormal
+ jmp Denorm_done
+
+
+/* Undo the de-normalisation. */
+Normalise_result:
+ cmpb UNMASKED_UNDERFLOW,FPU_denormal
+ je Signal_underflow
+
+/* The number must be a denormal if we got here. */
+#ifdef PARANOID
+ /* But check it... just in case. */
+ cmpw EXP_UNDER+1,EXP(%edi)
+ jne L_norm_bugged
+#endif /* PARANOID */
+
+#ifdef PECULIAR_486
+ /*
+ * This implements a special feature of 80486 behaviour.
+ * Underflow will be signalled even if the number is
+ * not a denormal after rounding.
+ * This difference occurs only for masked underflow, and not
+ * in the unmasked case.
+ * Actual 80486 behaviour differs from this in some circumstances.
+ */
+ orl %eax,%eax /* ms bits */
+ js LPseudoDenormal /* Will be masked underflow */
+#else
+ orl %eax,%eax /* ms bits */
+ js L_Normalised /* No longer a denormal */
+#endif /* PECULIAR_486 */
+
+ jnz LDenormal_adj_exponent
+
+ orl %ebx,%ebx
+ jz L_underflow_to_zero /* The contents are zero */
+
+LDenormal_adj_exponent:
+ decw EXP(%edi)
+
+LPseudoDenormal:
+ testb $0xff,FPU_bits_lost /* bits lost == underflow */
+ movl TAG_Special,%edx
+ jz L_deNormalised
+
+ /* There must be a masked underflow */
+ push %eax
+ pushl EX_Underflow
+ call EXCEPTION
+ popl %eax
+ popl %eax
+ movl TAG_Special,%edx
+ jmp L_deNormalised
+
+
+/*
+ * The operations resulted in a number too small to represent.
+ * Masked response.
+ */
+L_underflow_to_zero:
+ push %eax
+ call set_precision_flag_down
+ popl %eax
+
+ push %eax
+ pushl EX_Underflow
+ call EXCEPTION
+ popl %eax
+ popl %eax
+
+/* Reduce the exponent to EXP_UNDER */
+ movw EXP_UNDER,EXP(%edi)
+ movl TAG_Zero,%edx
+ jmp L_Store_significand
+
+
+/* The operations resulted in a number too large to represent. */
+L_overflow:
+ addw EXTENDED_Ebias,EXP(%edi) /* Set for unmasked response. */
+ push %edi
+ call arith_overflow
+ pop %edi
+ jmp fpu_reg_round_signed_special_exit
+
+
+Signal_underflow:
+ /* The number may have been changed to a non-denormal */
+ /* by the rounding operations. */
+ cmpw EXP_UNDER,EXP(%edi)
+ jle Do_unmasked_underflow
+
+ jmp L_Normalised
+
+Do_unmasked_underflow:
+ /* Increase the exponent by the magic number */
+ addw $(3*(1<<13)),EXP(%edi)
+ push %eax
+ pushl EX_Underflow
+ call EXCEPTION
+ popl %eax
+ popl %eax
+ jmp L_Normalised
+
+
+#ifdef PARANOID
+#ifdef PECULIAR_486
+L_bugged_denorm_486:
+ pushl EX_INTERNAL|0x236
+ call EXCEPTION
+ popl %ebx
+ jmp L_exception_exit
+#else
+L_bugged_denorm:
+ pushl EX_INTERNAL|0x230
+ call EXCEPTION
+ popl %ebx
+ jmp L_exception_exit
+#endif /* PECULIAR_486 */
+
+L_bugged_round24:
+ pushl EX_INTERNAL|0x231
+ call EXCEPTION
+ popl %ebx
+ jmp L_exception_exit
+
+L_bugged_round53:
+ pushl EX_INTERNAL|0x232
+ call EXCEPTION
+ popl %ebx
+ jmp L_exception_exit
+
+L_bugged_round64:
+ pushl EX_INTERNAL|0x233
+ call EXCEPTION
+ popl %ebx
+ jmp L_exception_exit
+
+L_norm_bugged:
+ pushl EX_INTERNAL|0x234
+ call EXCEPTION
+ popl %ebx
+ jmp L_exception_exit
+
+L_entry_bugged:
+ pushl EX_INTERNAL|0x235
+ call EXCEPTION
+ popl %ebx
+L_exception_exit:
+ mov $-1,%eax
+ jmp fpu_reg_round_special_exit
+#endif /* PARANOID */
+
+ENDPROC(FPU_round)
diff --git a/arch/x86/math-emu/reg_u_add.S b/arch/x86/math-emu/reg_u_add.S
new file mode 100644
index 000000000..50fe9f8c8
--- /dev/null
+++ b/arch/x86/math-emu/reg_u_add.S
@@ -0,0 +1,169 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+ .file "reg_u_add.S"
+/*---------------------------------------------------------------------------+
+ | reg_u_add.S |
+ | |
+ | Add two valid (TAG_Valid) FPU_REG numbers, of the same sign, and put the |
+ | result in a destination FPU_REG. |
+ | |
+ | Copyright (C) 1992,1993,1995,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@suburbia.net |
+ | |
+ | Call from C as: |
+ | int FPU_u_add(FPU_REG *arg1, FPU_REG *arg2, FPU_REG *answ, |
+ | int control_w) |
+ | Return value is the tag of the answer, or-ed with FPU_Exception if |
+ | one was raised, or -1 on internal error. |
+ | |
+ +---------------------------------------------------------------------------*/
+
+/*
+ | Kernel addition routine FPU_u_add(reg *arg1, reg *arg2, reg *answ).
+ | Takes two valid reg f.p. numbers (TAG_Valid), which are
+ | treated as unsigned numbers,
+ | and returns their sum as a TAG_Valid or TAG_Special f.p. number.
+ | The returned number is normalized.
+ | Basic checks are performed if PARANOID is defined.
+ */
+
+#include "exception.h"
+#include "fpu_emu.h"
+#include "control_w.h"
+
+.text
+ENTRY(FPU_u_add)
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %edi
+ pushl %ebx
+
+ movl PARAM1,%esi /* source 1 */
+ movl PARAM2,%edi /* source 2 */
+
+ movl PARAM6,%ecx
+ movl %ecx,%edx
+ subl PARAM7,%ecx /* exp1 - exp2 */
+ jge L_arg1_larger
+
+ /* num1 is smaller */
+ movl SIGL(%esi),%ebx
+ movl SIGH(%esi),%eax
+
+ movl %edi,%esi
+ movl PARAM7,%edx
+ negw %cx
+ jmp L_accum_loaded
+
+L_arg1_larger:
+ /* num1 has larger or equal exponent */
+ movl SIGL(%edi),%ebx
+ movl SIGH(%edi),%eax
+
+L_accum_loaded:
+ movl PARAM3,%edi /* destination */
+ movw %dx,EXP(%edi) /* Copy exponent to destination */
+
+ xorl %edx,%edx /* clear the extension */
+
+#ifdef PARANOID
+ testl $0x80000000,%eax
+ je L_bugged
+
+ testl $0x80000000,SIGH(%esi)
+ je L_bugged
+#endif /* PARANOID */
+
+/* The number to be shifted is in %eax:%ebx:%edx */
+ cmpw $32,%cx /* shrd only works for 0..31 bits */
+ jnc L_more_than_31
+
+/* less than 32 bits */
+ shrd %cl,%ebx,%edx
+ shrd %cl,%eax,%ebx
+ shr %cl,%eax
+ jmp L_shift_done
+
+L_more_than_31:
+ cmpw $64,%cx
+ jnc L_more_than_63
+
+ subb $32,%cl
+ jz L_exactly_32
+
+ shrd %cl,%eax,%edx
+ shr %cl,%eax
+ orl %ebx,%ebx
+ jz L_more_31_no_low /* none of the lowest bits is set */
+
+ orl $1,%edx /* record the fact in the extension */
+
+L_more_31_no_low:
+ movl %eax,%ebx
+ xorl %eax,%eax
+ jmp L_shift_done
+
+L_exactly_32:
+ movl %ebx,%edx
+ movl %eax,%ebx
+ xorl %eax,%eax
+ jmp L_shift_done
+
+L_more_than_63:
+ cmpw $65,%cx
+ jnc L_more_than_64
+
+ movl %eax,%edx
+ orl %ebx,%ebx
+ jz L_more_63_no_low
+
+ orl $1,%edx
+ jmp L_more_63_no_low
+
+L_more_than_64:
+ movl $1,%edx /* The shifted nr always at least one '1' */
+
+L_more_63_no_low:
+ xorl %ebx,%ebx
+ xorl %eax,%eax
+
+L_shift_done:
+ /* Now do the addition */
+ addl SIGL(%esi),%ebx
+ adcl SIGH(%esi),%eax
+ jnc L_round_the_result
+
+ /* Overflow, adjust the result */
+ rcrl $1,%eax
+ rcrl $1,%ebx
+ rcrl $1,%edx
+ jnc L_no_bit_lost
+
+ orl $1,%edx
+
+L_no_bit_lost:
+ incw EXP(%edi)
+
+L_round_the_result:
+ jmp fpu_reg_round /* Round the result */
+
+
+
+#ifdef PARANOID
+/* If we ever get here then we have problems! */
+L_bugged:
+ pushl EX_INTERNAL|0x201
+ call EXCEPTION
+ pop %ebx
+ movl $-1,%eax
+ jmp L_exit
+
+L_exit:
+ popl %ebx
+ popl %edi
+ popl %esi
+ leave
+ ret
+#endif /* PARANOID */
+ENDPROC(FPU_u_add)
diff --git a/arch/x86/math-emu/reg_u_div.S b/arch/x86/math-emu/reg_u_div.S
new file mode 100644
index 000000000..94d545e11
--- /dev/null
+++ b/arch/x86/math-emu/reg_u_div.S
@@ -0,0 +1,474 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+ .file "reg_u_div.S"
+/*---------------------------------------------------------------------------+
+ | reg_u_div.S |
+ | |
+ | Divide one FPU_REG by another and put the result in a destination FPU_REG.|
+ | |
+ | Copyright (C) 1992,1993,1995,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@suburbia.net |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------+
+ | Call from C as: |
+ | int FPU_u_div(FPU_REG *a, FPU_REG *b, FPU_REG *dest, |
+ | unsigned int control_word, char *sign) |
+ | |
+ | Does not compute the destination exponent, but does adjust it. |
+ | |
+ | Return value is the tag of the answer, or-ed with FPU_Exception if |
+ | one was raised, or -1 on internal error. |
+ +---------------------------------------------------------------------------*/
+
+#include "exception.h"
+#include "fpu_emu.h"
+#include "control_w.h"
+
+
+/* #define dSIGL(x) (x) */
+/* #define dSIGH(x) 4(x) */
+
+
+#ifndef NON_REENTRANT_FPU
+/*
+ Local storage on the stack:
+ Result: FPU_accum_3:FPU_accum_2:FPU_accum_1:FPU_accum_0
+ Overflow flag: ovfl_flag
+ */
+#define FPU_accum_3 -4(%ebp)
+#define FPU_accum_2 -8(%ebp)
+#define FPU_accum_1 -12(%ebp)
+#define FPU_accum_0 -16(%ebp)
+#define FPU_result_1 -20(%ebp)
+#define FPU_result_2 -24(%ebp)
+#define FPU_ovfl_flag -28(%ebp)
+
+#else
+.data
+/*
+ Local storage in a static area:
+ Result: FPU_accum_3:FPU_accum_2:FPU_accum_1:FPU_accum_0
+ Overflow flag: ovfl_flag
+ */
+ .align 4,0
+FPU_accum_3:
+ .long 0
+FPU_accum_2:
+ .long 0
+FPU_accum_1:
+ .long 0
+FPU_accum_0:
+ .long 0
+FPU_result_1:
+ .long 0
+FPU_result_2:
+ .long 0
+FPU_ovfl_flag:
+ .byte 0
+#endif /* NON_REENTRANT_FPU */
+
+#define REGA PARAM1
+#define REGB PARAM2
+#define DEST PARAM3
+
+.text
+ENTRY(FPU_u_div)
+ pushl %ebp
+ movl %esp,%ebp
+#ifndef NON_REENTRANT_FPU
+ subl $28,%esp
+#endif /* NON_REENTRANT_FPU */
+
+ pushl %esi
+ pushl %edi
+ pushl %ebx
+
+ movl REGA,%esi
+ movl REGB,%ebx
+ movl DEST,%edi
+
+ movswl EXP(%esi),%edx
+ movswl EXP(%ebx),%eax
+ subl %eax,%edx
+ addl EXP_BIAS,%edx
+
+ /* A denormal and a large number can cause an exponent underflow */
+ cmpl EXP_WAY_UNDER,%edx
+ jg xExp_not_underflow
+
+ /* Set to a really low value allow correct handling */
+ movl EXP_WAY_UNDER,%edx
+
+xExp_not_underflow:
+
+ movw %dx,EXP(%edi)
+
+#ifdef PARANOID
+/* testl $0x80000000, SIGH(%esi) // Dividend */
+/* je L_bugged */
+ testl $0x80000000, SIGH(%ebx) /* Divisor */
+ je L_bugged
+#endif /* PARANOID */
+
+/* Check if the divisor can be treated as having just 32 bits */
+ cmpl $0,SIGL(%ebx)
+ jnz L_Full_Division /* Can't do a quick divide */
+
+/* We should be able to zip through the division here */
+ movl SIGH(%ebx),%ecx /* The divisor */
+ movl SIGH(%esi),%edx /* Dividend */
+ movl SIGL(%esi),%eax /* Dividend */
+
+ cmpl %ecx,%edx
+ setaeb FPU_ovfl_flag /* Keep a record */
+ jb L_no_adjust
+
+ subl %ecx,%edx /* Prevent the overflow */
+
+L_no_adjust:
+ /* Divide the 64 bit number by the 32 bit denominator */
+ divl %ecx
+ movl %eax,FPU_result_2
+
+ /* Work on the remainder of the first division */
+ xorl %eax,%eax
+ divl %ecx
+ movl %eax,FPU_result_1
+
+ /* Work on the remainder of the 64 bit division */
+ xorl %eax,%eax
+ divl %ecx
+
+ testb $255,FPU_ovfl_flag /* was the num > denom ? */
+ je L_no_overflow
+
+ /* Do the shifting here */
+ /* increase the exponent */
+ incw EXP(%edi)
+
+ /* shift the mantissa right one bit */
+ stc /* To set the ms bit */
+ rcrl FPU_result_2
+ rcrl FPU_result_1
+ rcrl %eax
+
+L_no_overflow:
+ jmp LRound_precision /* Do the rounding as required */
+
+
+/*---------------------------------------------------------------------------+
+ | Divide: Return arg1/arg2 to arg3. |
+ | |
+ | This routine does not use the exponents of arg1 and arg2, but does |
+ | adjust the exponent of arg3. |
+ | |
+ | The maximum returned value is (ignoring exponents) |
+ | .ffffffff ffffffff |
+ | ------------------ = 1.ffffffff fffffffe |
+ | .80000000 00000000 |
+ | and the minimum is |
+ | .80000000 00000000 |
+ | ------------------ = .80000000 00000001 (rounded) |
+ | .ffffffff ffffffff |
+ | |
+ +---------------------------------------------------------------------------*/
+
+
+L_Full_Division:
+ /* Save extended dividend in local register */
+ movl SIGL(%esi),%eax
+ movl %eax,FPU_accum_2
+ movl SIGH(%esi),%eax
+ movl %eax,FPU_accum_3
+ xorl %eax,%eax
+ movl %eax,FPU_accum_1 /* zero the extension */
+ movl %eax,FPU_accum_0 /* zero the extension */
+
+ movl SIGL(%esi),%eax /* Get the current num */
+ movl SIGH(%esi),%edx
+
+/*----------------------------------------------------------------------*/
+/* Initialization done.
+ Do the first 32 bits. */
+
+ movb $0,FPU_ovfl_flag
+ cmpl SIGH(%ebx),%edx /* Test for imminent overflow */
+ jb LLess_than_1
+ ja LGreater_than_1
+
+ cmpl SIGL(%ebx),%eax
+ jb LLess_than_1
+
+LGreater_than_1:
+/* The dividend is greater or equal, would cause overflow */
+ setaeb FPU_ovfl_flag /* Keep a record */
+
+ subl SIGL(%ebx),%eax
+ sbbl SIGH(%ebx),%edx /* Prevent the overflow */
+ movl %eax,FPU_accum_2
+ movl %edx,FPU_accum_3
+
+LLess_than_1:
+/* At this point, we have a dividend < divisor, with a record of
+ adjustment in FPU_ovfl_flag */
+
+ /* We will divide by a number which is too large */
+ movl SIGH(%ebx),%ecx
+ addl $1,%ecx
+ jnc LFirst_div_not_1
+
+ /* here we need to divide by 100000000h,
+ i.e., no division at all.. */
+ mov %edx,%eax
+ jmp LFirst_div_done
+
+LFirst_div_not_1:
+ divl %ecx /* Divide the numerator by the augmented
+ denom ms dw */
+
+LFirst_div_done:
+ movl %eax,FPU_result_2 /* Put the result in the answer */
+
+ mull SIGH(%ebx) /* mul by the ms dw of the denom */
+
+ subl %eax,FPU_accum_2 /* Subtract from the num local reg */
+ sbbl %edx,FPU_accum_3
+
+ movl FPU_result_2,%eax /* Get the result back */
+ mull SIGL(%ebx) /* now mul the ls dw of the denom */
+
+ subl %eax,FPU_accum_1 /* Subtract from the num local reg */
+ sbbl %edx,FPU_accum_2
+ sbbl $0,FPU_accum_3
+ je LDo_2nd_32_bits /* Must check for non-zero result here */
+
+#ifdef PARANOID
+ jb L_bugged_1
+#endif /* PARANOID */
+
+ /* need to subtract another once of the denom */
+ incl FPU_result_2 /* Correct the answer */
+
+ movl SIGL(%ebx),%eax
+ movl SIGH(%ebx),%edx
+ subl %eax,FPU_accum_1 /* Subtract from the num local reg */
+ sbbl %edx,FPU_accum_2
+
+#ifdef PARANOID
+ sbbl $0,FPU_accum_3
+ jne L_bugged_1 /* Must check for non-zero result here */
+#endif /* PARANOID */
+
+/*----------------------------------------------------------------------*/
+/* Half of the main problem is done, there is just a reduced numerator
+ to handle now.
+ Work with the second 32 bits, FPU_accum_0 not used from now on */
+LDo_2nd_32_bits:
+ movl FPU_accum_2,%edx /* get the reduced num */
+ movl FPU_accum_1,%eax
+
+ /* need to check for possible subsequent overflow */
+ cmpl SIGH(%ebx),%edx
+ jb LDo_2nd_div
+ ja LPrevent_2nd_overflow
+
+ cmpl SIGL(%ebx),%eax
+ jb LDo_2nd_div
+
+LPrevent_2nd_overflow:
+/* The numerator is greater or equal, would cause overflow */
+ /* prevent overflow */
+ subl SIGL(%ebx),%eax
+ sbbl SIGH(%ebx),%edx
+ movl %edx,FPU_accum_2
+ movl %eax,FPU_accum_1
+
+ incl FPU_result_2 /* Reflect the subtraction in the answer */
+
+#ifdef PARANOID
+ je L_bugged_2 /* Can't bump the result to 1.0 */
+#endif /* PARANOID */
+
+LDo_2nd_div:
+ cmpl $0,%ecx /* augmented denom msw */
+ jnz LSecond_div_not_1
+
+ /* %ecx == 0, we are dividing by 1.0 */
+ mov %edx,%eax
+ jmp LSecond_div_done
+
+LSecond_div_not_1:
+ divl %ecx /* Divide the numerator by the denom ms dw */
+
+LSecond_div_done:
+ movl %eax,FPU_result_1 /* Put the result in the answer */
+
+ mull SIGH(%ebx) /* mul by the ms dw of the denom */
+
+ subl %eax,FPU_accum_1 /* Subtract from the num local reg */
+ sbbl %edx,FPU_accum_2
+
+#ifdef PARANOID
+ jc L_bugged_2
+#endif /* PARANOID */
+
+ movl FPU_result_1,%eax /* Get the result back */
+ mull SIGL(%ebx) /* now mul the ls dw of the denom */
+
+ subl %eax,FPU_accum_0 /* Subtract from the num local reg */
+ sbbl %edx,FPU_accum_1 /* Subtract from the num local reg */
+ sbbl $0,FPU_accum_2
+
+#ifdef PARANOID
+ jc L_bugged_2
+#endif /* PARANOID */
+
+ jz LDo_3rd_32_bits
+
+#ifdef PARANOID
+ cmpl $1,FPU_accum_2
+ jne L_bugged_2
+#endif /* PARANOID */
+
+ /* need to subtract another once of the denom */
+ movl SIGL(%ebx),%eax
+ movl SIGH(%ebx),%edx
+ subl %eax,FPU_accum_0 /* Subtract from the num local reg */
+ sbbl %edx,FPU_accum_1
+ sbbl $0,FPU_accum_2
+
+#ifdef PARANOID
+ jc L_bugged_2
+ jne L_bugged_2
+#endif /* PARANOID */
+
+ addl $1,FPU_result_1 /* Correct the answer */
+ adcl $0,FPU_result_2
+
+#ifdef PARANOID
+ jc L_bugged_2 /* Must check for non-zero result here */
+#endif /* PARANOID */
+
+/*----------------------------------------------------------------------*/
+/* The division is essentially finished here, we just need to perform
+ tidying operations.
+ Deal with the 3rd 32 bits */
+LDo_3rd_32_bits:
+ movl FPU_accum_1,%edx /* get the reduced num */
+ movl FPU_accum_0,%eax
+
+ /* need to check for possible subsequent overflow */
+ cmpl SIGH(%ebx),%edx /* denom */
+ jb LRound_prep
+ ja LPrevent_3rd_overflow
+
+ cmpl SIGL(%ebx),%eax /* denom */
+ jb LRound_prep
+
+LPrevent_3rd_overflow:
+ /* prevent overflow */
+ subl SIGL(%ebx),%eax
+ sbbl SIGH(%ebx),%edx
+ movl %edx,FPU_accum_1
+ movl %eax,FPU_accum_0
+
+ addl $1,FPU_result_1 /* Reflect the subtraction in the answer */
+ adcl $0,FPU_result_2
+ jne LRound_prep
+ jnc LRound_prep
+
+ /* This is a tricky spot, there is an overflow of the answer */
+ movb $255,FPU_ovfl_flag /* Overflow -> 1.000 */
+
+LRound_prep:
+/*
+ * Prepare for rounding.
+ * To test for rounding, we just need to compare 2*accum with the
+ * denom.
+ */
+ movl FPU_accum_0,%ecx
+ movl FPU_accum_1,%edx
+ movl %ecx,%eax
+ orl %edx,%eax
+ jz LRound_ovfl /* The accumulator contains zero. */
+
+ /* Multiply by 2 */
+ clc
+ rcll $1,%ecx
+ rcll $1,%edx
+ jc LRound_large /* No need to compare, denom smaller */
+
+ subl SIGL(%ebx),%ecx
+ sbbl SIGH(%ebx),%edx
+ jnc LRound_not_small
+
+ movl $0x70000000,%eax /* Denom was larger */
+ jmp LRound_ovfl
+
+LRound_not_small:
+ jnz LRound_large
+
+ movl $0x80000000,%eax /* Remainder was exactly 1/2 denom */
+ jmp LRound_ovfl
+
+LRound_large:
+ movl $0xff000000,%eax /* Denom was smaller */
+
+LRound_ovfl:
+/* We are now ready to deal with rounding, but first we must get
+ the bits properly aligned */
+ testb $255,FPU_ovfl_flag /* was the num > denom ? */
+ je LRound_precision
+
+ incw EXP(%edi)
+
+ /* shift the mantissa right one bit */
+ stc /* Will set the ms bit */
+ rcrl FPU_result_2
+ rcrl FPU_result_1
+ rcrl %eax
+
+/* Round the result as required */
+LRound_precision:
+ decw EXP(%edi) /* binary point between 1st & 2nd bits */
+
+ movl %eax,%edx
+ movl FPU_result_1,%ebx
+ movl FPU_result_2,%eax
+ jmp fpu_reg_round
+
+
+#ifdef PARANOID
+/* The logic is wrong if we got here */
+L_bugged:
+ pushl EX_INTERNAL|0x202
+ call EXCEPTION
+ pop %ebx
+ jmp L_exit
+
+L_bugged_1:
+ pushl EX_INTERNAL|0x203
+ call EXCEPTION
+ pop %ebx
+ jmp L_exit
+
+L_bugged_2:
+ pushl EX_INTERNAL|0x204
+ call EXCEPTION
+ pop %ebx
+ jmp L_exit
+
+L_exit:
+ movl $-1,%eax
+ popl %ebx
+ popl %edi
+ popl %esi
+
+ leave
+ ret
+#endif /* PARANOID */
+
+ENDPROC(FPU_u_div)
diff --git a/arch/x86/math-emu/reg_u_mul.S b/arch/x86/math-emu/reg_u_mul.S
new file mode 100644
index 000000000..21cde47fb
--- /dev/null
+++ b/arch/x86/math-emu/reg_u_mul.S
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+ .file "reg_u_mul.S"
+/*---------------------------------------------------------------------------+
+ | reg_u_mul.S |
+ | |
+ | Core multiplication routine |
+ | |
+ | Copyright (C) 1992,1993,1995,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@suburbia.net |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------+
+ | Basic multiplication routine. |
+ | Does not check the resulting exponent for overflow/underflow |
+ | |
+ | FPU_u_mul(FPU_REG *a, FPU_REG *b, FPU_REG *c, unsigned int cw); |
+ | |
+ | Internal working is at approx 128 bits. |
+ | Result is rounded to nearest 53 or 64 bits, using "nearest or even". |
+ +---------------------------------------------------------------------------*/
+
+#include "exception.h"
+#include "fpu_emu.h"
+#include "control_w.h"
+
+
+
+#ifndef NON_REENTRANT_FPU
+/* Local storage on the stack: */
+#define FPU_accum_0 -4(%ebp) /* ms word */
+#define FPU_accum_1 -8(%ebp)
+
+#else
+/* Local storage in a static area: */
+.data
+ .align 4,0
+FPU_accum_0:
+ .long 0
+FPU_accum_1:
+ .long 0
+#endif /* NON_REENTRANT_FPU */
+
+
+.text
+ENTRY(FPU_u_mul)
+ pushl %ebp
+ movl %esp,%ebp
+#ifndef NON_REENTRANT_FPU
+ subl $8,%esp
+#endif /* NON_REENTRANT_FPU */
+
+ pushl %esi
+ pushl %edi
+ pushl %ebx
+
+ movl PARAM1,%esi
+ movl PARAM2,%edi
+
+#ifdef PARANOID
+ testl $0x80000000,SIGH(%esi)
+ jz L_bugged
+ testl $0x80000000,SIGH(%edi)
+ jz L_bugged
+#endif /* PARANOID */
+
+ xorl %ecx,%ecx
+ xorl %ebx,%ebx
+
+ movl SIGL(%esi),%eax
+ mull SIGL(%edi)
+ movl %eax,FPU_accum_0
+ movl %edx,FPU_accum_1
+
+ movl SIGL(%esi),%eax
+ mull SIGH(%edi)
+ addl %eax,FPU_accum_1
+ adcl %edx,%ebx
+/* adcl $0,%ecx // overflow here is not possible */
+
+ movl SIGH(%esi),%eax
+ mull SIGL(%edi)
+ addl %eax,FPU_accum_1
+ adcl %edx,%ebx
+ adcl $0,%ecx
+
+ movl SIGH(%esi),%eax
+ mull SIGH(%edi)
+ addl %eax,%ebx
+ adcl %edx,%ecx
+
+ /* Get the sum of the exponents. */
+ movl PARAM6,%eax
+ subl EXP_BIAS-1,%eax
+
+ /* Two denormals can cause an exponent underflow */
+ cmpl EXP_WAY_UNDER,%eax
+ jg Exp_not_underflow
+
+ /* Set to a really low value allow correct handling */
+ movl EXP_WAY_UNDER,%eax
+
+Exp_not_underflow:
+
+/* Have now finished with the sources */
+ movl PARAM3,%edi /* Point to the destination */
+ movw %ax,EXP(%edi)
+
+/* Now make sure that the result is normalized */
+ testl $0x80000000,%ecx
+ jnz LResult_Normalised
+
+ /* Normalize by shifting left one bit */
+ shll $1,FPU_accum_0
+ rcll $1,FPU_accum_1
+ rcll $1,%ebx
+ rcll $1,%ecx
+ decw EXP(%edi)
+
+LResult_Normalised:
+ movl FPU_accum_0,%eax
+ movl FPU_accum_1,%edx
+ orl %eax,%eax
+ jz L_extent_zero
+
+ orl $1,%edx
+
+L_extent_zero:
+ movl %ecx,%eax
+ jmp fpu_reg_round
+
+
+#ifdef PARANOID
+L_bugged:
+ pushl EX_INTERNAL|0x205
+ call EXCEPTION
+ pop %ebx
+ jmp L_exit
+
+L_exit:
+ popl %ebx
+ popl %edi
+ popl %esi
+ leave
+ ret
+#endif /* PARANOID */
+
+ENDPROC(FPU_u_mul)
diff --git a/arch/x86/math-emu/reg_u_sub.S b/arch/x86/math-emu/reg_u_sub.S
new file mode 100644
index 000000000..f05dea7de
--- /dev/null
+++ b/arch/x86/math-emu/reg_u_sub.S
@@ -0,0 +1,274 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+ .file "reg_u_sub.S"
+/*---------------------------------------------------------------------------+
+ | reg_u_sub.S |
+ | |
+ | Core floating point subtraction routine. |
+ | |
+ | Copyright (C) 1992,1993,1995,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@suburbia.net |
+ | |
+ | Call from C as: |
+ | int FPU_u_sub(FPU_REG *arg1, FPU_REG *arg2, FPU_REG *answ, |
+ | int control_w) |
+ | Return value is the tag of the answer, or-ed with FPU_Exception if |
+ | one was raised, or -1 on internal error. |
+ | |
+ +---------------------------------------------------------------------------*/
+
+/*
+ | Kernel subtraction routine FPU_u_sub(reg *arg1, reg *arg2, reg *answ).
+ | Takes two valid reg f.p. numbers (TAG_Valid), which are
+ | treated as unsigned numbers,
+ | and returns their difference as a TAG_Valid or TAG_Zero f.p.
+ | number.
+ | The first number (arg1) must be the larger.
+ | The returned number is normalized.
+ | Basic checks are performed if PARANOID is defined.
+ */
+
+#include "exception.h"
+#include "fpu_emu.h"
+#include "control_w.h"
+
+.text
+ENTRY(FPU_u_sub)
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %edi
+ pushl %ebx
+
+ movl PARAM1,%esi /* source 1 */
+ movl PARAM2,%edi /* source 2 */
+
+ movl PARAM6,%ecx
+ subl PARAM7,%ecx /* exp1 - exp2 */
+
+#ifdef PARANOID
+ /* source 2 is always smaller than source 1 */
+ js L_bugged_1
+
+ testl $0x80000000,SIGH(%edi) /* The args are assumed to be be normalized */
+ je L_bugged_2
+
+ testl $0x80000000,SIGH(%esi)
+ je L_bugged_2
+#endif /* PARANOID */
+
+/*--------------------------------------+
+ | Form a register holding the |
+ | smaller number |
+ +--------------------------------------*/
+ movl SIGH(%edi),%eax /* register ms word */
+ movl SIGL(%edi),%ebx /* register ls word */
+
+ movl PARAM3,%edi /* destination */
+ movl PARAM6,%edx
+ movw %dx,EXP(%edi) /* Copy exponent to destination */
+
+ xorl %edx,%edx /* register extension */
+
+/*--------------------------------------+
+ | Shift the temporary register |
+ | right the required number of |
+ | places. |
+ +--------------------------------------*/
+
+ cmpw $32,%cx /* shrd only works for 0..31 bits */
+ jnc L_more_than_31
+
+/* less than 32 bits */
+ shrd %cl,%ebx,%edx
+ shrd %cl,%eax,%ebx
+ shr %cl,%eax
+ jmp L_shift_done
+
+L_more_than_31:
+ cmpw $64,%cx
+ jnc L_more_than_63
+
+ subb $32,%cl
+ jz L_exactly_32
+
+ shrd %cl,%eax,%edx
+ shr %cl,%eax
+ orl %ebx,%ebx
+ jz L_more_31_no_low /* none of the lowest bits is set */
+
+ orl $1,%edx /* record the fact in the extension */
+
+L_more_31_no_low:
+ movl %eax,%ebx
+ xorl %eax,%eax
+ jmp L_shift_done
+
+L_exactly_32:
+ movl %ebx,%edx
+ movl %eax,%ebx
+ xorl %eax,%eax
+ jmp L_shift_done
+
+L_more_than_63:
+ cmpw $65,%cx
+ jnc L_more_than_64
+
+ /* Shift right by 64 bits */
+ movl %eax,%edx
+ orl %ebx,%ebx
+ jz L_more_63_no_low
+
+ orl $1,%edx
+ jmp L_more_63_no_low
+
+L_more_than_64:
+ jne L_more_than_65
+
+ /* Shift right by 65 bits */
+ /* Carry is clear if we get here */
+ movl %eax,%edx
+ rcrl %edx
+ jnc L_shift_65_nc
+
+ orl $1,%edx
+ jmp L_more_63_no_low
+
+L_shift_65_nc:
+ orl %ebx,%ebx
+ jz L_more_63_no_low
+
+ orl $1,%edx
+ jmp L_more_63_no_low
+
+L_more_than_65:
+ movl $1,%edx /* The shifted nr always at least one '1' */
+
+L_more_63_no_low:
+ xorl %ebx,%ebx
+ xorl %eax,%eax
+
+L_shift_done:
+L_subtr:
+/*------------------------------+
+ | Do the subtraction |
+ +------------------------------*/
+ xorl %ecx,%ecx
+ subl %edx,%ecx
+ movl %ecx,%edx
+ movl SIGL(%esi),%ecx
+ sbbl %ebx,%ecx
+ movl %ecx,%ebx
+ movl SIGH(%esi),%ecx
+ sbbl %eax,%ecx
+ movl %ecx,%eax
+
+#ifdef PARANOID
+ /* We can never get a borrow */
+ jc L_bugged
+#endif /* PARANOID */
+
+/*--------------------------------------+
+ | Normalize the result |
+ +--------------------------------------*/
+ testl $0x80000000,%eax
+ jnz L_round /* no shifting needed */
+
+ orl %eax,%eax
+ jnz L_shift_1 /* shift left 1 - 31 bits */
+
+ orl %ebx,%ebx
+ jnz L_shift_32 /* shift left 32 - 63 bits */
+
+/*
+ * A rare case, the only one which is non-zero if we got here
+ * is: 1000000 .... 0000
+ * -0111111 .... 1111 1
+ * --------------------
+ * 0000000 .... 0000 1
+ */
+
+ cmpl $0x80000000,%edx
+ jnz L_must_be_zero
+
+ /* Shift left 64 bits */
+ subw $64,EXP(%edi)
+ xchg %edx,%eax
+ jmp fpu_reg_round
+
+L_must_be_zero:
+#ifdef PARANOID
+ orl %edx,%edx
+ jnz L_bugged_3
+#endif /* PARANOID */
+
+ /* The result is zero */
+ movw $0,EXP(%edi) /* exponent */
+ movl $0,SIGL(%edi)
+ movl $0,SIGH(%edi)
+ movl TAG_Zero,%eax
+ jmp L_exit
+
+L_shift_32:
+ movl %ebx,%eax
+ movl %edx,%ebx
+ movl $0,%edx
+ subw $32,EXP(%edi) /* Can get underflow here */
+
+/* We need to shift left by 1 - 31 bits */
+L_shift_1:
+ bsrl %eax,%ecx /* get the required shift in %ecx */
+ subl $31,%ecx
+ negl %ecx
+ shld %cl,%ebx,%eax
+ shld %cl,%edx,%ebx
+ shl %cl,%edx
+ subw %cx,EXP(%edi) /* Can get underflow here */
+
+L_round:
+ jmp fpu_reg_round /* Round the result */
+
+
+#ifdef PARANOID
+L_bugged_1:
+ pushl EX_INTERNAL|0x206
+ call EXCEPTION
+ pop %ebx
+ jmp L_error_exit
+
+L_bugged_2:
+ pushl EX_INTERNAL|0x209
+ call EXCEPTION
+ pop %ebx
+ jmp L_error_exit
+
+L_bugged_3:
+ pushl EX_INTERNAL|0x210
+ call EXCEPTION
+ pop %ebx
+ jmp L_error_exit
+
+L_bugged_4:
+ pushl EX_INTERNAL|0x211
+ call EXCEPTION
+ pop %ebx
+ jmp L_error_exit
+
+L_bugged:
+ pushl EX_INTERNAL|0x212
+ call EXCEPTION
+ pop %ebx
+ jmp L_error_exit
+
+L_error_exit:
+ movl $-1,%eax
+
+#endif /* PARANOID */
+
+L_exit:
+ popl %ebx
+ popl %edi
+ popl %esi
+ leave
+ ret
+ENDPROC(FPU_u_sub)
diff --git a/arch/x86/math-emu/round_Xsig.S b/arch/x86/math-emu/round_Xsig.S
new file mode 100644
index 000000000..226a51e99
--- /dev/null
+++ b/arch/x86/math-emu/round_Xsig.S
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*---------------------------------------------------------------------------+
+ | round_Xsig.S |
+ | |
+ | Copyright (C) 1992,1993,1994,1995 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@jacobi.maths.monash.edu.au |
+ | |
+ | Normalize and round a 12 byte quantity. |
+ | Call from C as: |
+ | int round_Xsig(Xsig *n) |
+ | |
+ | Normalize a 12 byte quantity. |
+ | Call from C as: |
+ | int norm_Xsig(Xsig *n) |
+ | |
+ | Each function returns the size of the shift (nr of bits). |
+ | |
+ +---------------------------------------------------------------------------*/
+ .file "round_Xsig.S"
+
+#include "fpu_emu.h"
+
+
+.text
+ENTRY(round_Xsig)
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %ebx /* Reserve some space */
+ pushl %ebx
+ pushl %esi
+
+ movl PARAM1,%esi
+
+ movl 8(%esi),%edx
+ movl 4(%esi),%ebx
+ movl (%esi),%eax
+
+ movl $0,-4(%ebp)
+
+ orl %edx,%edx /* ms bits */
+ js L_round /* Already normalized */
+ jnz L_shift_1 /* Shift left 1 - 31 bits */
+
+ movl %ebx,%edx
+ movl %eax,%ebx
+ xorl %eax,%eax
+ movl $-32,-4(%ebp)
+
+/* We need to shift left by 1 - 31 bits */
+L_shift_1:
+ bsrl %edx,%ecx /* get the required shift in %ecx */
+ subl $31,%ecx
+ negl %ecx
+ subl %ecx,-4(%ebp)
+ shld %cl,%ebx,%edx
+ shld %cl,%eax,%ebx
+ shl %cl,%eax
+
+L_round:
+ testl $0x80000000,%eax
+ jz L_exit
+
+ addl $1,%ebx
+ adcl $0,%edx
+ jnz L_exit
+
+ movl $0x80000000,%edx
+ incl -4(%ebp)
+
+L_exit:
+ movl %edx,8(%esi)
+ movl %ebx,4(%esi)
+ movl %eax,(%esi)
+
+ movl -4(%ebp),%eax
+
+ popl %esi
+ popl %ebx
+ leave
+ ret
+ENDPROC(round_Xsig)
+
+
+
+ENTRY(norm_Xsig)
+ pushl %ebp
+ movl %esp,%ebp
+ pushl %ebx /* Reserve some space */
+ pushl %ebx
+ pushl %esi
+
+ movl PARAM1,%esi
+
+ movl 8(%esi),%edx
+ movl 4(%esi),%ebx
+ movl (%esi),%eax
+
+ movl $0,-4(%ebp)
+
+ orl %edx,%edx /* ms bits */
+ js L_n_exit /* Already normalized */
+ jnz L_n_shift_1 /* Shift left 1 - 31 bits */
+
+ movl %ebx,%edx
+ movl %eax,%ebx
+ xorl %eax,%eax
+ movl $-32,-4(%ebp)
+
+ orl %edx,%edx /* ms bits */
+ js L_n_exit /* Normalized now */
+ jnz L_n_shift_1 /* Shift left 1 - 31 bits */
+
+ movl %ebx,%edx
+ movl %eax,%ebx
+ xorl %eax,%eax
+ addl $-32,-4(%ebp)
+ jmp L_n_exit /* Might not be normalized,
+ but shift no more. */
+
+/* We need to shift left by 1 - 31 bits */
+L_n_shift_1:
+ bsrl %edx,%ecx /* get the required shift in %ecx */
+ subl $31,%ecx
+ negl %ecx
+ subl %ecx,-4(%ebp)
+ shld %cl,%ebx,%edx
+ shld %cl,%eax,%ebx
+ shl %cl,%eax
+
+L_n_exit:
+ movl %edx,8(%esi)
+ movl %ebx,4(%esi)
+ movl %eax,(%esi)
+
+ movl -4(%ebp),%eax
+
+ popl %esi
+ popl %ebx
+ leave
+ ret
+ENDPROC(norm_Xsig)
diff --git a/arch/x86/math-emu/shr_Xsig.S b/arch/x86/math-emu/shr_Xsig.S
new file mode 100644
index 000000000..96f4779aa
--- /dev/null
+++ b/arch/x86/math-emu/shr_Xsig.S
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+ .file "shr_Xsig.S"
+/*---------------------------------------------------------------------------+
+ | shr_Xsig.S |
+ | |
+ | 12 byte right shift function |
+ | |
+ | Copyright (C) 1992,1994,1995 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@jacobi.maths.monash.edu.au |
+ | |
+ | Call from C as: |
+ | void shr_Xsig(Xsig *arg, unsigned nr) |
+ | |
+ | Extended shift right function. |
+ | Fastest for small shifts. |
+ | Shifts the 12 byte quantity pointed to by the first arg (arg) |
+ | right by the number of bits specified by the second arg (nr). |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#include "fpu_emu.h"
+
+.text
+ENTRY(shr_Xsig)
+ push %ebp
+ movl %esp,%ebp
+ pushl %esi
+ movl PARAM2,%ecx
+ movl PARAM1,%esi
+ cmpl $32,%ecx /* shrd only works for 0..31 bits */
+ jnc L_more_than_31
+
+/* less than 32 bits */
+ pushl %ebx
+ movl (%esi),%eax /* lsl */
+ movl 4(%esi),%ebx /* midl */
+ movl 8(%esi),%edx /* msl */
+ shrd %cl,%ebx,%eax
+ shrd %cl,%edx,%ebx
+ shr %cl,%edx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edx,8(%esi)
+ popl %ebx
+ popl %esi
+ leave
+ ret
+
+L_more_than_31:
+ cmpl $64,%ecx
+ jnc L_more_than_63
+
+ subb $32,%cl
+ movl 4(%esi),%eax /* midl */
+ movl 8(%esi),%edx /* msl */
+ shrd %cl,%edx,%eax
+ shr %cl,%edx
+ movl %eax,(%esi)
+ movl %edx,4(%esi)
+ movl $0,8(%esi)
+ popl %esi
+ leave
+ ret
+
+L_more_than_63:
+ cmpl $96,%ecx
+ jnc L_more_than_95
+
+ subb $64,%cl
+ movl 8(%esi),%eax /* msl */
+ shr %cl,%eax
+ xorl %edx,%edx
+ movl %eax,(%esi)
+ movl %edx,4(%esi)
+ movl %edx,8(%esi)
+ popl %esi
+ leave
+ ret
+
+L_more_than_95:
+ xorl %eax,%eax
+ movl %eax,(%esi)
+ movl %eax,4(%esi)
+ movl %eax,8(%esi)
+ popl %esi
+ leave
+ ret
+ENDPROC(shr_Xsig)
diff --git a/arch/x86/math-emu/status_w.h b/arch/x86/math-emu/status_w.h
new file mode 100644
index 000000000..b77bafec9
--- /dev/null
+++ b/arch/x86/math-emu/status_w.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*---------------------------------------------------------------------------+
+ | status_w.h |
+ | |
+ | Copyright (C) 1992,1993 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@vaxc.cc.monash.edu.au |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#ifndef _STATUS_H_
+#define _STATUS_H_
+
+#include "fpu_emu.h" /* for definition of PECULIAR_486 */
+
+#ifdef __ASSEMBLY__
+#define Const__(x) $##x
+#else
+#define Const__(x) x
+#endif
+
+#define SW_Backward Const__(0x8000) /* backward compatibility */
+#define SW_C3 Const__(0x4000) /* condition bit 3 */
+#define SW_Top Const__(0x3800) /* top of stack */
+#define SW_Top_Shift Const__(11) /* shift for top of stack bits */
+#define SW_C2 Const__(0x0400) /* condition bit 2 */
+#define SW_C1 Const__(0x0200) /* condition bit 1 */
+#define SW_C0 Const__(0x0100) /* condition bit 0 */
+#define SW_Summary Const__(0x0080) /* exception summary */
+#define SW_Stack_Fault Const__(0x0040) /* stack fault */
+#define SW_Precision Const__(0x0020) /* loss of precision */
+#define SW_Underflow Const__(0x0010) /* underflow */
+#define SW_Overflow Const__(0x0008) /* overflow */
+#define SW_Zero_Div Const__(0x0004) /* divide by zero */
+#define SW_Denorm_Op Const__(0x0002) /* denormalized operand */
+#define SW_Invalid Const__(0x0001) /* invalid operation */
+
+#define SW_Exc_Mask Const__(0x27f) /* Status word exception bit mask */
+
+#ifndef __ASSEMBLY__
+
+#define COMP_A_gt_B 1
+#define COMP_A_eq_B 2
+#define COMP_A_lt_B 3
+#define COMP_No_Comp 4
+#define COMP_Denormal 0x20
+#define COMP_NaN 0x40
+#define COMP_SNaN 0x80
+
+#define status_word() \
+ ((partial_status & ~SW_Top & 0xffff) | ((top << SW_Top_Shift) & SW_Top))
+static inline void setcc(int cc)
+{
+ partial_status &= ~(SW_C0 | SW_C1 | SW_C2 | SW_C3);
+ partial_status |= (cc) & (SW_C0 | SW_C1 | SW_C2 | SW_C3);
+}
+
+#ifdef PECULIAR_486
+ /* Default, this conveys no information, but an 80486 does it. */
+ /* Clear the SW_C1 bit, "other bits undefined". */
+# define clear_C1() { partial_status &= ~SW_C1; }
+# else
+# define clear_C1()
+#endif /* PECULIAR_486 */
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _STATUS_H_ */
diff --git a/arch/x86/math-emu/version.h b/arch/x86/math-emu/version.h
new file mode 100644
index 000000000..a0d73a1d2
--- /dev/null
+++ b/arch/x86/math-emu/version.h
@@ -0,0 +1,12 @@
+/*---------------------------------------------------------------------------+
+ | version.h |
+ | |
+ | |
+ | Copyright (C) 1992,1993,1994,1996,1997,1999 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, Australia |
+ | E-mail billm@melbpc.org.au |
+ | |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#define FPU_VERSION "wm-FPU-emu version 2.01"
diff --git a/arch/x86/math-emu/wm_shrx.S b/arch/x86/math-emu/wm_shrx.S
new file mode 100644
index 000000000..d588874eb
--- /dev/null
+++ b/arch/x86/math-emu/wm_shrx.S
@@ -0,0 +1,207 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+ .file "wm_shrx.S"
+/*---------------------------------------------------------------------------+
+ | wm_shrx.S |
+ | |
+ | 64 bit right shift functions |
+ | |
+ | Copyright (C) 1992,1995 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@jacobi.maths.monash.edu.au |
+ | |
+ | Call from C as: |
+ | unsigned FPU_shrx(void *arg1, unsigned arg2) |
+ | and |
+ | unsigned FPU_shrxs(void *arg1, unsigned arg2) |
+ | |
+ +---------------------------------------------------------------------------*/
+
+#include "fpu_emu.h"
+
+.text
+/*---------------------------------------------------------------------------+
+ | unsigned FPU_shrx(void *arg1, unsigned arg2) |
+ | |
+ | Extended shift right function. |
+ | Fastest for small shifts. |
+ | Shifts the 64 bit quantity pointed to by the first arg (arg1) |
+ | right by the number of bits specified by the second arg (arg2). |
+ | Forms a 96 bit quantity from the 64 bit arg and eax: |
+ | [ 64 bit arg ][ eax ] |
+ | shift right ---------> |
+ | The eax register is initialized to 0 before the shifting. |
+ | Results returned in the 64 bit arg and eax. |
+ +---------------------------------------------------------------------------*/
+
+ENTRY(FPU_shrx)
+ push %ebp
+ movl %esp,%ebp
+ pushl %esi
+ movl PARAM2,%ecx
+ movl PARAM1,%esi
+ cmpl $32,%ecx /* shrd only works for 0..31 bits */
+ jnc L_more_than_31
+
+/* less than 32 bits */
+ pushl %ebx
+ movl (%esi),%ebx /* lsl */
+ movl 4(%esi),%edx /* msl */
+ xorl %eax,%eax /* extension */
+ shrd %cl,%ebx,%eax
+ shrd %cl,%edx,%ebx
+ shr %cl,%edx
+ movl %ebx,(%esi)
+ movl %edx,4(%esi)
+ popl %ebx
+ popl %esi
+ leave
+ ret
+
+L_more_than_31:
+ cmpl $64,%ecx
+ jnc L_more_than_63
+
+ subb $32,%cl
+ movl (%esi),%eax /* lsl */
+ movl 4(%esi),%edx /* msl */
+ shrd %cl,%edx,%eax
+ shr %cl,%edx
+ movl %edx,(%esi)
+ movl $0,4(%esi)
+ popl %esi
+ leave
+ ret
+
+L_more_than_63:
+ cmpl $96,%ecx
+ jnc L_more_than_95
+
+ subb $64,%cl
+ movl 4(%esi),%eax /* msl */
+ shr %cl,%eax
+ xorl %edx,%edx
+ movl %edx,(%esi)
+ movl %edx,4(%esi)
+ popl %esi
+ leave
+ ret
+
+L_more_than_95:
+ xorl %eax,%eax
+ movl %eax,(%esi)
+ movl %eax,4(%esi)
+ popl %esi
+ leave
+ ret
+ENDPROC(FPU_shrx)
+
+
+/*---------------------------------------------------------------------------+
+ | unsigned FPU_shrxs(void *arg1, unsigned arg2) |
+ | |
+ | Extended shift right function (optimized for small floating point |
+ | integers). |
+ | Shifts the 64 bit quantity pointed to by the first arg (arg1) |
+ | right by the number of bits specified by the second arg (arg2). |
+ | Forms a 96 bit quantity from the 64 bit arg and eax: |
+ | [ 64 bit arg ][ eax ] |
+ | shift right ---------> |
+ | The eax register is initialized to 0 before the shifting. |
+ | The lower 8 bits of eax are lost and replaced by a flag which is |
+ | set (to 0x01) if any bit, apart from the first one, is set in the |
+ | part which has been shifted out of the arg. |
+ | Results returned in the 64 bit arg and eax. |
+ +---------------------------------------------------------------------------*/
+ENTRY(FPU_shrxs)
+ push %ebp
+ movl %esp,%ebp
+ pushl %esi
+ pushl %ebx
+ movl PARAM2,%ecx
+ movl PARAM1,%esi
+ cmpl $64,%ecx /* shrd only works for 0..31 bits */
+ jnc Ls_more_than_63
+
+ cmpl $32,%ecx /* shrd only works for 0..31 bits */
+ jc Ls_less_than_32
+
+/* We got here without jumps by assuming that the most common requirement
+ is for small integers */
+/* Shift by [32..63] bits */
+ subb $32,%cl
+ movl (%esi),%eax /* lsl */
+ movl 4(%esi),%edx /* msl */
+ xorl %ebx,%ebx
+ shrd %cl,%eax,%ebx
+ shrd %cl,%edx,%eax
+ shr %cl,%edx
+ orl %ebx,%ebx /* test these 32 bits */
+ setne %bl
+ test $0x7fffffff,%eax /* and 31 bits here */
+ setne %bh
+ orw %bx,%bx /* Any of the 63 bit set ? */
+ setne %al
+ movl %edx,(%esi)
+ movl $0,4(%esi)
+ popl %ebx
+ popl %esi
+ leave
+ ret
+
+/* Shift by [0..31] bits */
+Ls_less_than_32:
+ movl (%esi),%ebx /* lsl */
+ movl 4(%esi),%edx /* msl */
+ xorl %eax,%eax /* extension */
+ shrd %cl,%ebx,%eax
+ shrd %cl,%edx,%ebx
+ shr %cl,%edx
+ test $0x7fffffff,%eax /* only need to look at eax here */
+ setne %al
+ movl %ebx,(%esi)
+ movl %edx,4(%esi)
+ popl %ebx
+ popl %esi
+ leave
+ ret
+
+/* Shift by [64..95] bits */
+Ls_more_than_63:
+ cmpl $96,%ecx
+ jnc Ls_more_than_95
+
+ subb $64,%cl
+ movl (%esi),%ebx /* lsl */
+ movl 4(%esi),%eax /* msl */
+ xorl %edx,%edx /* extension */
+ shrd %cl,%ebx,%edx
+ shrd %cl,%eax,%ebx
+ shr %cl,%eax
+ orl %ebx,%edx
+ setne %bl
+ test $0x7fffffff,%eax /* only need to look at eax here */
+ setne %bh
+ orw %bx,%bx
+ setne %al
+ xorl %edx,%edx
+ movl %edx,(%esi) /* set to zero */
+ movl %edx,4(%esi) /* set to zero */
+ popl %ebx
+ popl %esi
+ leave
+ ret
+
+Ls_more_than_95:
+/* Shift by [96..inf) bits */
+ xorl %eax,%eax
+ movl (%esi),%ebx
+ orl 4(%esi),%ebx
+ setne %al
+ xorl %ebx,%ebx
+ movl %ebx,(%esi)
+ movl %ebx,4(%esi)
+ popl %ebx
+ popl %esi
+ leave
+ ret
+ENDPROC(FPU_shrxs)
diff --git a/arch/x86/math-emu/wm_sqrt.S b/arch/x86/math-emu/wm_sqrt.S
new file mode 100644
index 000000000..515cdee90
--- /dev/null
+++ b/arch/x86/math-emu/wm_sqrt.S
@@ -0,0 +1,472 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+ .file "wm_sqrt.S"
+/*---------------------------------------------------------------------------+
+ | wm_sqrt.S |
+ | |
+ | Fixed point arithmetic square root evaluation. |
+ | |
+ | Copyright (C) 1992,1993,1995,1997 |
+ | W. Metzenthen, 22 Parker St, Ormond, Vic 3163, |
+ | Australia. E-mail billm@suburbia.net |
+ | |
+ | Call from C as: |
+ | int wm_sqrt(FPU_REG *n, unsigned int control_word) |
+ | |
+ +---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------+
+ | wm_sqrt(FPU_REG *n, unsigned int control_word) |
+ | returns the square root of n in n. |
+ | |
+ | Use Newton's method to compute the square root of a number, which must |
+ | be in the range [1.0 .. 4.0), to 64 bits accuracy. |
+ | Does not check the sign or tag of the argument. |
+ | Sets the exponent, but not the sign or tag of the result. |
+ | |
+ | The guess is kept in %esi:%edi |
+ +---------------------------------------------------------------------------*/
+
+#include "exception.h"
+#include "fpu_emu.h"
+
+
+#ifndef NON_REENTRANT_FPU
+/* Local storage on the stack: */
+#define FPU_accum_3 -4(%ebp) /* ms word */
+#define FPU_accum_2 -8(%ebp)
+#define FPU_accum_1 -12(%ebp)
+#define FPU_accum_0 -16(%ebp)
+
+/*
+ * The de-normalised argument:
+ * sq_2 sq_1 sq_0
+ * b b b b b b b ... b b b b b b .... b b b b 0 0 0 ... 0
+ * ^ binary point here
+ */
+#define FPU_fsqrt_arg_2 -20(%ebp) /* ms word */
+#define FPU_fsqrt_arg_1 -24(%ebp)
+#define FPU_fsqrt_arg_0 -28(%ebp) /* ls word, at most the ms bit is set */
+
+#else
+/* Local storage in a static area: */
+.data
+ .align 4,0
+FPU_accum_3:
+ .long 0 /* ms word */
+FPU_accum_2:
+ .long 0
+FPU_accum_1:
+ .long 0
+FPU_accum_0:
+ .long 0
+
+/* The de-normalised argument:
+ sq_2 sq_1 sq_0
+ b b b b b b b ... b b b b b b .... b b b b 0 0 0 ... 0
+ ^ binary point here
+ */
+FPU_fsqrt_arg_2:
+ .long 0 /* ms word */
+FPU_fsqrt_arg_1:
+ .long 0
+FPU_fsqrt_arg_0:
+ .long 0 /* ls word, at most the ms bit is set */
+#endif /* NON_REENTRANT_FPU */
+
+
+.text
+ENTRY(wm_sqrt)
+ pushl %ebp
+ movl %esp,%ebp
+#ifndef NON_REENTRANT_FPU
+ subl $28,%esp
+#endif /* NON_REENTRANT_FPU */
+ pushl %esi
+ pushl %edi
+ pushl %ebx
+
+ movl PARAM1,%esi
+
+ movl SIGH(%esi),%eax
+ movl SIGL(%esi),%ecx
+ xorl %edx,%edx
+
+/* We use a rough linear estimate for the first guess.. */
+
+ cmpw EXP_BIAS,EXP(%esi)
+ jnz sqrt_arg_ge_2
+
+ shrl $1,%eax /* arg is in the range [1.0 .. 2.0) */
+ rcrl $1,%ecx
+ rcrl $1,%edx
+
+sqrt_arg_ge_2:
+/* From here on, n is never accessed directly again until it is
+ replaced by the answer. */
+
+ movl %eax,FPU_fsqrt_arg_2 /* ms word of n */
+ movl %ecx,FPU_fsqrt_arg_1
+ movl %edx,FPU_fsqrt_arg_0
+
+/* Make a linear first estimate */
+ shrl $1,%eax
+ addl $0x40000000,%eax
+ movl $0xaaaaaaaa,%ecx
+ mull %ecx
+ shll %edx /* max result was 7fff... */
+ testl $0x80000000,%edx /* but min was 3fff... */
+ jnz sqrt_prelim_no_adjust
+
+ movl $0x80000000,%edx /* round up */
+
+sqrt_prelim_no_adjust:
+ movl %edx,%esi /* Our first guess */
+
+/* We have now computed (approx) (2 + x) / 3, which forms the basis
+ for a few iterations of Newton's method */
+
+ movl FPU_fsqrt_arg_2,%ecx /* ms word */
+
+/*
+ * From our initial estimate, three iterations are enough to get us
+ * to 30 bits or so. This will then allow two iterations at better
+ * precision to complete the process.
+ */
+
+/* Compute (g + n/g)/2 at each iteration (g is the guess). */
+ shrl %ecx /* Doing this first will prevent a divide */
+ /* overflow later. */
+
+ movl %ecx,%edx /* msw of the arg / 2 */
+ divl %esi /* current estimate */
+ shrl %esi /* divide by 2 */
+ addl %eax,%esi /* the new estimate */
+
+ movl %ecx,%edx
+ divl %esi
+ shrl %esi
+ addl %eax,%esi
+
+ movl %ecx,%edx
+ divl %esi
+ shrl %esi
+ addl %eax,%esi
+
+/*
+ * Now that an estimate accurate to about 30 bits has been obtained (in %esi),
+ * we improve it to 60 bits or so.
+ *
+ * The strategy from now on is to compute new estimates from
+ * guess := guess + (n - guess^2) / (2 * guess)
+ */
+
+/* First, find the square of the guess */
+ movl %esi,%eax
+ mull %esi
+/* guess^2 now in %edx:%eax */
+
+ movl FPU_fsqrt_arg_1,%ecx
+ subl %ecx,%eax
+ movl FPU_fsqrt_arg_2,%ecx /* ms word of normalized n */
+ sbbl %ecx,%edx
+ jnc sqrt_stage_2_positive
+
+/* Subtraction gives a negative result,
+ negate the result before division. */
+ notl %edx
+ notl %eax
+ addl $1,%eax
+ adcl $0,%edx
+
+ divl %esi
+ movl %eax,%ecx
+
+ movl %edx,%eax
+ divl %esi
+ jmp sqrt_stage_2_finish
+
+sqrt_stage_2_positive:
+ divl %esi
+ movl %eax,%ecx
+
+ movl %edx,%eax
+ divl %esi
+
+ notl %ecx
+ notl %eax
+ addl $1,%eax
+ adcl $0,%ecx
+
+sqrt_stage_2_finish:
+ sarl $1,%ecx /* divide by 2 */
+ rcrl $1,%eax
+
+ /* Form the new estimate in %esi:%edi */
+ movl %eax,%edi
+ addl %ecx,%esi
+
+ jnz sqrt_stage_2_done /* result should be [1..2) */
+
+#ifdef PARANOID
+/* It should be possible to get here only if the arg is ffff....ffff */
+ cmpl $0xffffffff,FPU_fsqrt_arg_1
+ jnz sqrt_stage_2_error
+#endif /* PARANOID */
+
+/* The best rounded result. */
+ xorl %eax,%eax
+ decl %eax
+ movl %eax,%edi
+ movl %eax,%esi
+ movl $0x7fffffff,%eax
+ jmp sqrt_round_result
+
+#ifdef PARANOID
+sqrt_stage_2_error:
+ pushl EX_INTERNAL|0x213
+ call EXCEPTION
+#endif /* PARANOID */
+
+sqrt_stage_2_done:
+
+/* Now the square root has been computed to better than 60 bits. */
+
+/* Find the square of the guess. */
+ movl %edi,%eax /* ls word of guess */
+ mull %edi
+ movl %edx,FPU_accum_1
+
+ movl %esi,%eax
+ mull %esi
+ movl %edx,FPU_accum_3
+ movl %eax,FPU_accum_2
+
+ movl %edi,%eax
+ mull %esi
+ addl %eax,FPU_accum_1
+ adcl %edx,FPU_accum_2
+ adcl $0,FPU_accum_3
+
+/* movl %esi,%eax */
+/* mull %edi */
+ addl %eax,FPU_accum_1
+ adcl %edx,FPU_accum_2
+ adcl $0,FPU_accum_3
+
+/* guess^2 now in FPU_accum_3:FPU_accum_2:FPU_accum_1 */
+
+ movl FPU_fsqrt_arg_0,%eax /* get normalized n */
+ subl %eax,FPU_accum_1
+ movl FPU_fsqrt_arg_1,%eax
+ sbbl %eax,FPU_accum_2
+ movl FPU_fsqrt_arg_2,%eax /* ms word of normalized n */
+ sbbl %eax,FPU_accum_3
+ jnc sqrt_stage_3_positive
+
+/* Subtraction gives a negative result,
+ negate the result before division */
+ notl FPU_accum_1
+ notl FPU_accum_2
+ notl FPU_accum_3
+ addl $1,FPU_accum_1
+ adcl $0,FPU_accum_2
+
+#ifdef PARANOID
+ adcl $0,FPU_accum_3 /* This must be zero */
+ jz sqrt_stage_3_no_error
+
+sqrt_stage_3_error:
+ pushl EX_INTERNAL|0x207
+ call EXCEPTION
+
+sqrt_stage_3_no_error:
+#endif /* PARANOID */
+
+ movl FPU_accum_2,%edx
+ movl FPU_accum_1,%eax
+ divl %esi
+ movl %eax,%ecx
+
+ movl %edx,%eax
+ divl %esi
+
+ sarl $1,%ecx /* divide by 2 */
+ rcrl $1,%eax
+
+ /* prepare to round the result */
+
+ addl %ecx,%edi
+ adcl $0,%esi
+
+ jmp sqrt_stage_3_finished
+
+sqrt_stage_3_positive:
+ movl FPU_accum_2,%edx
+ movl FPU_accum_1,%eax
+ divl %esi
+ movl %eax,%ecx
+
+ movl %edx,%eax
+ divl %esi
+
+ sarl $1,%ecx /* divide by 2 */
+ rcrl $1,%eax
+
+ /* prepare to round the result */
+
+ notl %eax /* Negate the correction term */
+ notl %ecx
+ addl $1,%eax
+ adcl $0,%ecx /* carry here ==> correction == 0 */
+ adcl $0xffffffff,%esi
+
+ addl %ecx,%edi
+ adcl $0,%esi
+
+sqrt_stage_3_finished:
+
+/*
+ * The result in %esi:%edi:%esi should be good to about 90 bits here,
+ * and the rounding information here does not have sufficient accuracy
+ * in a few rare cases.
+ */
+ cmpl $0xffffffe0,%eax
+ ja sqrt_near_exact_x
+
+ cmpl $0x00000020,%eax
+ jb sqrt_near_exact
+
+ cmpl $0x7fffffe0,%eax
+ jb sqrt_round_result
+
+ cmpl $0x80000020,%eax
+ jb sqrt_get_more_precision
+
+sqrt_round_result:
+/* Set up for rounding operations */
+ movl %eax,%edx
+ movl %esi,%eax
+ movl %edi,%ebx
+ movl PARAM1,%edi
+ movw EXP_BIAS,EXP(%edi) /* Result is in [1.0 .. 2.0) */
+ jmp fpu_reg_round
+
+
+sqrt_near_exact_x:
+/* First, the estimate must be rounded up. */
+ addl $1,%edi
+ adcl $0,%esi
+
+sqrt_near_exact:
+/*
+ * This is an easy case because x^1/2 is monotonic.
+ * We need just find the square of our estimate, compare it
+ * with the argument, and deduce whether our estimate is
+ * above, below, or exact. We use the fact that the estimate
+ * is known to be accurate to about 90 bits.
+ */
+ movl %edi,%eax /* ls word of guess */
+ mull %edi
+ movl %edx,%ebx /* 2nd ls word of square */
+ movl %eax,%ecx /* ls word of square */
+
+ movl %edi,%eax
+ mull %esi
+ addl %eax,%ebx
+ addl %eax,%ebx
+
+#ifdef PARANOID
+ cmp $0xffffffb0,%ebx
+ jb sqrt_near_exact_ok
+
+ cmp $0x00000050,%ebx
+ ja sqrt_near_exact_ok
+
+ pushl EX_INTERNAL|0x214
+ call EXCEPTION
+
+sqrt_near_exact_ok:
+#endif /* PARANOID */
+
+ or %ebx,%ebx
+ js sqrt_near_exact_small
+
+ jnz sqrt_near_exact_large
+
+ or %ebx,%edx
+ jnz sqrt_near_exact_large
+
+/* Our estimate is exactly the right answer */
+ xorl %eax,%eax
+ jmp sqrt_round_result
+
+sqrt_near_exact_small:
+/* Our estimate is too small */
+ movl $0x000000ff,%eax
+ jmp sqrt_round_result
+
+sqrt_near_exact_large:
+/* Our estimate is too large, we need to decrement it */
+ subl $1,%edi
+ sbbl $0,%esi
+ movl $0xffffff00,%eax
+ jmp sqrt_round_result
+
+
+sqrt_get_more_precision:
+/* This case is almost the same as the above, except we start
+ with an extra bit of precision in the estimate. */
+ stc /* The extra bit. */
+ rcll $1,%edi /* Shift the estimate left one bit */
+ rcll $1,%esi
+
+ movl %edi,%eax /* ls word of guess */
+ mull %edi
+ movl %edx,%ebx /* 2nd ls word of square */
+ movl %eax,%ecx /* ls word of square */
+
+ movl %edi,%eax
+ mull %esi
+ addl %eax,%ebx
+ addl %eax,%ebx
+
+/* Put our estimate back to its original value */
+ stc /* The ms bit. */
+ rcrl $1,%esi /* Shift the estimate left one bit */
+ rcrl $1,%edi
+
+#ifdef PARANOID
+ cmp $0xffffff60,%ebx
+ jb sqrt_more_prec_ok
+
+ cmp $0x000000a0,%ebx
+ ja sqrt_more_prec_ok
+
+ pushl EX_INTERNAL|0x215
+ call EXCEPTION
+
+sqrt_more_prec_ok:
+#endif /* PARANOID */
+
+ or %ebx,%ebx
+ js sqrt_more_prec_small
+
+ jnz sqrt_more_prec_large
+
+ or %ebx,%ecx
+ jnz sqrt_more_prec_large
+
+/* Our estimate is exactly the right answer */
+ movl $0x80000000,%eax
+ jmp sqrt_round_result
+
+sqrt_more_prec_small:
+/* Our estimate is too small */
+ movl $0x800000ff,%eax
+ jmp sqrt_round_result
+
+sqrt_more_prec_large:
+/* Our estimate is too large */
+ movl $0x7fffff00,%eax
+ jmp sqrt_round_result
+ENDPROC(wm_sqrt)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
new file mode 100644
index 000000000..4b101dd6e
--- /dev/null
+++ b/arch/x86/mm/Makefile
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: GPL-2.0
+# Kernel does not boot with instrumentation of tlb.c and mem_encrypt*.c
+KCOV_INSTRUMENT_tlb.o := n
+KCOV_INSTRUMENT_mem_encrypt.o := n
+KCOV_INSTRUMENT_mem_encrypt_identity.o := n
+
+KASAN_SANITIZE_mem_encrypt.o := n
+KASAN_SANITIZE_mem_encrypt_identity.o := n
+
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_mem_encrypt.o = -pg
+CFLAGS_REMOVE_mem_encrypt_identity.o = -pg
+endif
+
+obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
+ pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o
+
+# Make sure __phys_addr has no stackprotector
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_physaddr.o := $(nostackp)
+CFLAGS_setup_nx.o := $(nostackp)
+CFLAGS_mem_encrypt_identity.o := $(nostackp)
+
+CFLAGS_fault.o := -I$(src)/../include/asm/trace
+
+obj-$(CONFIG_X86_PAT) += pat_rbtree.o
+
+obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
+
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_X86_PTDUMP_CORE) += dump_pagetables.o
+obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o
+
+obj-$(CONFIG_HIGHMEM) += highmem_32.o
+
+KASAN_SANITIZE_kasan_init_$(BITS).o := n
+obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o
+
+obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
+mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
+obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
+
+obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
+obj-$(CONFIG_AMD_NUMA) += amdtopology.o
+obj-$(CONFIG_ACPI_NUMA) += srat.o
+obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
+
+obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
+obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
+obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
+
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
new file mode 100644
index 000000000..048c761d9
--- /dev/null
+++ b/arch/x86/mm/amdtopology.c
@@ -0,0 +1,184 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AMD NUMA support.
+ * Discover the memory map and associated nodes.
+ *
+ * This version reads it directly from the AMD northbridge.
+ *
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/nodemask.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
+
+#include <asm/io.h>
+#include <linux/pci_ids.h>
+#include <linux/acpi.h>
+#include <asm/types.h>
+#include <asm/mmzone.h>
+#include <asm/proto.h>
+#include <asm/e820/api.h>
+#include <asm/pci-direct.h>
+#include <asm/numa.h>
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#include <asm/amd_nb.h>
+
+static unsigned char __initdata nodeids[8];
+
+static __init int find_northbridge(void)
+{
+ int num;
+
+ for (num = 0; num < 32; num++) {
+ u32 header;
+
+ header = read_pci_config(0, num, 0, 0x00);
+ if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) &&
+ header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) &&
+ header != (PCI_VENDOR_ID_AMD | (0x1300<<16)))
+ continue;
+
+ header = read_pci_config(0, num, 1, 0x00);
+ if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) &&
+ header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) &&
+ header != (PCI_VENDOR_ID_AMD | (0x1301<<16)))
+ continue;
+ return num;
+ }
+
+ return -ENOENT;
+}
+
+int __init amd_numa_init(void)
+{
+ u64 start = PFN_PHYS(0);
+ u64 end = PFN_PHYS(max_pfn);
+ unsigned numnodes;
+ u64 prevbase;
+ int i, j, nb;
+ u32 nodeid, reg;
+ unsigned int bits, cores, apicid_base;
+
+ if (!early_pci_allowed())
+ return -EINVAL;
+
+ nb = find_northbridge();
+ if (nb < 0)
+ return nb;
+
+ pr_info("Scanning NUMA topology in Northbridge %d\n", nb);
+
+ reg = read_pci_config(0, nb, 0, 0x60);
+ numnodes = ((reg >> 4) & 0xF) + 1;
+ if (numnodes <= 1)
+ return -ENOENT;
+
+ pr_info("Number of physical nodes %d\n", numnodes);
+
+ prevbase = 0;
+ for (i = 0; i < 8; i++) {
+ u64 base, limit;
+
+ base = read_pci_config(0, nb, 1, 0x40 + i*8);
+ limit = read_pci_config(0, nb, 1, 0x44 + i*8);
+
+ nodeids[i] = nodeid = limit & 7;
+ if ((base & 3) == 0) {
+ if (i < numnodes)
+ pr_info("Skipping disabled node %d\n", i);
+ continue;
+ }
+ if (nodeid >= numnodes) {
+ pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid,
+ base, limit);
+ continue;
+ }
+
+ if (!limit) {
+ pr_info("Skipping node entry %d (base %Lx)\n",
+ i, base);
+ continue;
+ }
+ if ((base >> 8) & 3 || (limit >> 8) & 3) {
+ pr_err("Node %d using interleaving mode %Lx/%Lx\n",
+ nodeid, (base >> 8) & 3, (limit >> 8) & 3);
+ return -EINVAL;
+ }
+ if (node_isset(nodeid, numa_nodes_parsed)) {
+ pr_info("Node %d already present, skipping\n",
+ nodeid);
+ continue;
+ }
+
+ limit >>= 16;
+ limit++;
+ limit <<= 24;
+
+ if (limit > end)
+ limit = end;
+ if (limit <= base)
+ continue;
+
+ base >>= 16;
+ base <<= 24;
+
+ if (base < start)
+ base = start;
+ if (limit > end)
+ limit = end;
+ if (limit == base) {
+ pr_err("Empty node %d\n", nodeid);
+ continue;
+ }
+ if (limit < base) {
+ pr_err("Node %d bogus settings %Lx-%Lx.\n",
+ nodeid, base, limit);
+ continue;
+ }
+
+ /* Could sort here, but pun for now. Should not happen anyroads. */
+ if (prevbase > base) {
+ pr_err("Node map not sorted %Lx,%Lx\n",
+ prevbase, base);
+ return -EINVAL;
+ }
+
+ pr_info("Node %d MemBase %016Lx Limit %016Lx\n",
+ nodeid, base, limit);
+
+ prevbase = base;
+ numa_add_memblk(nodeid, base, limit);
+ node_set(nodeid, numa_nodes_parsed);
+ }
+
+ if (!nodes_weight(numa_nodes_parsed))
+ return -ENOENT;
+
+ /*
+ * We seem to have valid NUMA configuration. Map apicids to nodes
+ * using the coreid bits from early_identify_cpu.
+ */
+ bits = boot_cpu_data.x86_coreid_bits;
+ cores = 1 << bits;
+ apicid_base = 0;
+
+ /*
+ * get boot-time SMP configuration:
+ */
+ early_get_smp_config();
+
+ if (boot_cpu_physical_apicid > 0) {
+ pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
+ apicid_base = boot_cpu_physical_apicid;
+ }
+
+ for_each_node_mask(i, numa_nodes_parsed)
+ for (j = apicid_base; j < cores + apicid_base; j++)
+ set_apicid_to_node((i << bits) + j, i);
+
+ return 0;
+}
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
new file mode 100644
index 000000000..076ebdce9
--- /dev/null
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -0,0 +1,217 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/kallsyms.h>
+#include <linux/kcore.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/pgtable.h>
+#include <asm/fixmap.h>
+#include <asm/desc.h>
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);
+
+#ifdef CONFIG_X86_64
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+ [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+static DEFINE_PER_CPU(struct kcore_list, kcore_entry_trampoline);
+#endif
+
+struct cpu_entry_area *get_cpu_entry_area(int cpu)
+{
+ unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
+ BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
+
+ return (struct cpu_entry_area *) va;
+}
+EXPORT_SYMBOL(get_cpu_entry_area);
+
+void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags)
+{
+ unsigned long va = (unsigned long) cea_vaddr;
+ pte_t pte = pfn_pte(pa >> PAGE_SHIFT, flags);
+
+ /*
+ * The cpu_entry_area is shared between the user and kernel
+ * page tables. All of its ptes can safely be global.
+ * _PAGE_GLOBAL gets reused to help indicate PROT_NONE for
+ * non-present PTEs, so be careful not to set it in that
+ * case to avoid confusion.
+ */
+ if (boot_cpu_has(X86_FEATURE_PGE) &&
+ (pgprot_val(flags) & _PAGE_PRESENT))
+ pte = pte_set_flags(pte, _PAGE_GLOBAL);
+
+ set_pte_vaddr(va, pte);
+}
+
+static void __init
+cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
+{
+ for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE)
+ cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
+}
+
+static void percpu_setup_debug_store(int cpu)
+{
+#ifdef CONFIG_CPU_SUP_INTEL
+ int npages;
+ void *cea;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return;
+
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
+ npages = sizeof(struct debug_store) / PAGE_SIZE;
+ BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
+ cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
+ PAGE_KERNEL);
+
+ cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
+ /*
+ * Force the population of PMDs for not yet allocated per cpu
+ * memory like debug store buffers.
+ */
+ npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
+ for (; npages; npages--, cea += PAGE_SIZE)
+ cea_set_pte(cea, 0, PAGE_NONE);
+#endif
+}
+
+/* Setup the fixmap mappings only once per-processor */
+static void __init setup_cpu_entry_area(int cpu)
+{
+#ifdef CONFIG_X86_64
+ extern char _entry_trampoline[];
+
+ /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
+ pgprot_t gdt_prot = PAGE_KERNEL_RO;
+ pgprot_t tss_prot = PAGE_KERNEL_RO;
+#else
+ /*
+ * On native 32-bit systems, the GDT cannot be read-only because
+ * our double fault handler uses a task gate, and entering through
+ * a task gate needs to change an available TSS to busy. If the
+ * GDT is read-only, that will triple fault. The TSS cannot be
+ * read-only because the CPU writes to it on task switches.
+ *
+ * On Xen PV, the GDT must be read-only because the hypervisor
+ * requires it.
+ */
+ pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+ PAGE_KERNEL_RO : PAGE_KERNEL;
+ pgprot_t tss_prot = PAGE_KERNEL;
+#endif
+
+ cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu),
+ gdt_prot);
+
+ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page,
+ per_cpu_ptr(&entry_stack_storage, cpu), 1,
+ PAGE_KERNEL);
+
+ /*
+ * The Intel SDM says (Volume 3, 7.2.1):
+ *
+ * Avoid placing a page boundary in the part of the TSS that the
+ * processor reads during a task switch (the first 104 bytes). The
+ * processor may not correctly perform address translations if a
+ * boundary occurs in this area. During a task switch, the processor
+ * reads and writes into the first 104 bytes of each TSS (using
+ * contiguous physical addresses beginning with the physical address
+ * of the first byte of the TSS). So, after TSS access begins, if
+ * part of the 104 bytes is not physically contiguous, the processor
+ * will access incorrect information without generating a page-fault
+ * exception.
+ *
+ * There are also a lot of errata involving the TSS spanning a page
+ * boundary. Assert that we're not doing that.
+ */
+ BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
+ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
+ BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
+ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss,
+ &per_cpu(cpu_tss_rw, cpu),
+ sizeof(struct tss_struct) / PAGE_SIZE, tss_prot);
+
+#ifdef CONFIG_X86_32
+ per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
+#endif
+
+#ifdef CONFIG_X86_64
+ BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
+ BUILD_BUG_ON(sizeof(exception_stacks) !=
+ sizeof(((struct cpu_entry_area *)0)->exception_stacks));
+ cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
+ &per_cpu(exception_stacks, cpu),
+ sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
+
+ cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
+ __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
+ /*
+ * The cpu_entry_area alias addresses are not in the kernel binary
+ * so they do not show up in /proc/kcore normally. This adds entries
+ * for them manually.
+ */
+ kclist_add_remap(&per_cpu(kcore_entry_trampoline, cpu),
+ _entry_trampoline,
+ &get_cpu_entry_area(cpu)->entry_trampoline, PAGE_SIZE);
+#endif
+ percpu_setup_debug_store(cpu);
+}
+
+#ifdef CONFIG_X86_64
+int arch_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *name)
+{
+ unsigned int cpu, ncpu = 0;
+
+ if (symnum >= num_possible_cpus())
+ return -EINVAL;
+
+ for_each_possible_cpu(cpu) {
+ if (ncpu++ >= symnum)
+ break;
+ }
+
+ *value = (unsigned long)&get_cpu_entry_area(cpu)->entry_trampoline;
+ *type = 't';
+ strlcpy(name, "__entry_SYSCALL_64_trampoline", KSYM_NAME_LEN);
+
+ return 0;
+}
+#endif
+
+static __init void setup_cpu_entry_area_ptes(void)
+{
+#ifdef CONFIG_X86_32
+ unsigned long start, end;
+
+ BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE);
+ BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK);
+
+ start = CPU_ENTRY_AREA_BASE;
+ end = start + CPU_ENTRY_AREA_MAP_SIZE;
+
+ /* Careful here: start + PMD_SIZE might wrap around */
+ for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE)
+ populate_extra_pte(start);
+#endif
+}
+
+void __init setup_cpu_entry_areas(void)
+{
+ unsigned int cpu;
+
+ setup_cpu_entry_area_ptes();
+
+ for_each_possible_cpu(cpu)
+ setup_cpu_entry_area(cpu);
+
+ /*
+ * This is the last essential update to swapper_pgdir which needs
+ * to be synchronized to initial_page_table on 32bit.
+ */
+ sync_initial_page_table();
+}
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
new file mode 100644
index 000000000..225fe2f0b
--- /dev/null
+++ b/arch/x86/mm/debug_pagetables.c
@@ -0,0 +1,146 @@
+#include <linux/debugfs.h>
+#include <linux/efi.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <asm/pgtable.h>
+
+static int ptdump_show(struct seq_file *m, void *v)
+{
+ ptdump_walk_pgd_level_debugfs(m, NULL, false);
+ return 0;
+}
+
+static int ptdump_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, ptdump_show, NULL);
+}
+
+static const struct file_operations ptdump_fops = {
+ .owner = THIS_MODULE,
+ .open = ptdump_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int ptdump_show_curknl(struct seq_file *m, void *v)
+{
+ if (current->mm->pgd) {
+ down_read(&current->mm->mmap_sem);
+ ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
+ up_read(&current->mm->mmap_sem);
+ }
+ return 0;
+}
+
+static int ptdump_open_curknl(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, ptdump_show_curknl, NULL);
+}
+
+static const struct file_operations ptdump_curknl_fops = {
+ .owner = THIS_MODULE,
+ .open = ptdump_open_curknl,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+static struct dentry *pe_curusr;
+
+static int ptdump_show_curusr(struct seq_file *m, void *v)
+{
+ if (current->mm->pgd) {
+ down_read(&current->mm->mmap_sem);
+ ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
+ up_read(&current->mm->mmap_sem);
+ }
+ return 0;
+}
+
+static int ptdump_open_curusr(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, ptdump_show_curusr, NULL);
+}
+
+static const struct file_operations ptdump_curusr_fops = {
+ .owner = THIS_MODULE,
+ .open = ptdump_open_curusr,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif
+
+#if defined(CONFIG_EFI) && defined(CONFIG_X86_64)
+static struct dentry *pe_efi;
+
+static int ptdump_show_efi(struct seq_file *m, void *v)
+{
+ if (efi_mm.pgd)
+ ptdump_walk_pgd_level_debugfs(m, efi_mm.pgd, false);
+ return 0;
+}
+
+static int ptdump_open_efi(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, ptdump_show_efi, NULL);
+}
+
+static const struct file_operations ptdump_efi_fops = {
+ .owner = THIS_MODULE,
+ .open = ptdump_open_efi,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif
+
+static struct dentry *dir, *pe_knl, *pe_curknl;
+
+static int __init pt_dump_debug_init(void)
+{
+ dir = debugfs_create_dir("page_tables", NULL);
+ if (!dir)
+ return -ENOMEM;
+
+ pe_knl = debugfs_create_file("kernel", 0400, dir, NULL,
+ &ptdump_fops);
+ if (!pe_knl)
+ goto err;
+
+ pe_curknl = debugfs_create_file("current_kernel", 0400,
+ dir, NULL, &ptdump_curknl_fops);
+ if (!pe_curknl)
+ goto err;
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ pe_curusr = debugfs_create_file("current_user", 0400,
+ dir, NULL, &ptdump_curusr_fops);
+ if (!pe_curusr)
+ goto err;
+#endif
+
+#if defined(CONFIG_EFI) && defined(CONFIG_X86_64)
+ pe_efi = debugfs_create_file("efi", 0400, dir, NULL, &ptdump_efi_fops);
+ if (!pe_efi)
+ goto err;
+#endif
+
+ return 0;
+err:
+ debugfs_remove_recursive(dir);
+ return -ENOMEM;
+}
+
+static void __exit pt_dump_debug_exit(void)
+{
+ debugfs_remove_recursive(dir);
+}
+
+module_init(pt_dump_debug_init);
+module_exit(pt_dump_debug_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
+MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
new file mode 100644
index 000000000..abcb8d00b
--- /dev/null
+++ b/arch/x86/mm/dump_pagetables.c
@@ -0,0 +1,640 @@
+/*
+ * Debug helper to dump the current kernel pagetables of the system
+ * so that we can see what the various memory ranges are set to.
+ *
+ * (C) Copyright 2008 Intel Corporation
+ *
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/kasan.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/highmem.h>
+#include <linux/pci.h>
+
+#include <asm/e820/types.h>
+#include <asm/pgtable.h>
+
+/*
+ * The dumper groups pagetable entries of the same type into one, and for
+ * that it needs to keep some state when walking, and flush this state
+ * when a "break" in the continuity is found.
+ */
+struct pg_state {
+ int level;
+ pgprot_t current_prot;
+ pgprotval_t effective_prot;
+ unsigned long start_address;
+ unsigned long current_address;
+ const struct addr_marker *marker;
+ unsigned long lines;
+ bool to_dmesg;
+ bool check_wx;
+ unsigned long wx_pages;
+};
+
+struct addr_marker {
+ unsigned long start_address;
+ const char *name;
+ unsigned long max_lines;
+};
+
+/* Address space markers hints */
+
+#ifdef CONFIG_X86_64
+
+enum address_markers_idx {
+ USER_SPACE_NR = 0,
+ KERNEL_SPACE_NR,
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ LDT_NR,
+#endif
+ LOW_KERNEL_NR,
+ VMALLOC_START_NR,
+ VMEMMAP_START_NR,
+#ifdef CONFIG_KASAN
+ KASAN_SHADOW_START_NR,
+ KASAN_SHADOW_END_NR,
+#endif
+ CPU_ENTRY_AREA_NR,
+#ifdef CONFIG_X86_ESPFIX64
+ ESPFIX_START_NR,
+#endif
+#ifdef CONFIG_EFI
+ EFI_END_NR,
+#endif
+ HIGH_KERNEL_NR,
+ MODULES_VADDR_NR,
+ MODULES_END_NR,
+ FIXADDR_START_NR,
+ END_OF_SPACE_NR,
+};
+
+static struct addr_marker address_markers[] = {
+ [USER_SPACE_NR] = { 0, "User Space" },
+ [KERNEL_SPACE_NR] = { (1UL << 63), "Kernel Space" },
+ [LOW_KERNEL_NR] = { 0UL, "Low Kernel Mapping" },
+ [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
+ [VMEMMAP_START_NR] = { 0UL, "Vmemmap" },
+#ifdef CONFIG_KASAN
+ /*
+ * These fields get initialized with the (dynamic)
+ * KASAN_SHADOW_{START,END} values in pt_dump_init().
+ */
+ [KASAN_SHADOW_START_NR] = { 0UL, "KASAN shadow" },
+ [KASAN_SHADOW_END_NR] = { 0UL, "KASAN shadow end" },
+#endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ [LDT_NR] = { 0UL, "LDT remap" },
+#endif
+ [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
+#ifdef CONFIG_X86_ESPFIX64
+ [ESPFIX_START_NR] = { ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
+#endif
+#ifdef CONFIG_EFI
+ [EFI_END_NR] = { EFI_VA_END, "EFI Runtime Services" },
+#endif
+ [HIGH_KERNEL_NR] = { __START_KERNEL_map, "High Kernel Mapping" },
+ [MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" },
+ [MODULES_END_NR] = { MODULES_END, "End Modules" },
+ [FIXADDR_START_NR] = { FIXADDR_START, "Fixmap Area" },
+ [END_OF_SPACE_NR] = { -1, NULL }
+};
+
+#define INIT_PGD ((pgd_t *) &init_top_pgt)
+
+#else /* CONFIG_X86_64 */
+
+enum address_markers_idx {
+ USER_SPACE_NR = 0,
+ KERNEL_SPACE_NR,
+ VMALLOC_START_NR,
+ VMALLOC_END_NR,
+#ifdef CONFIG_HIGHMEM
+ PKMAP_BASE_NR,
+#endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ LDT_NR,
+#endif
+ CPU_ENTRY_AREA_NR,
+ FIXADDR_START_NR,
+ END_OF_SPACE_NR,
+};
+
+static struct addr_marker address_markers[] = {
+ [USER_SPACE_NR] = { 0, "User Space" },
+ [KERNEL_SPACE_NR] = { PAGE_OFFSET, "Kernel Mapping" },
+ [VMALLOC_START_NR] = { 0UL, "vmalloc() Area" },
+ [VMALLOC_END_NR] = { 0UL, "vmalloc() End" },
+#ifdef CONFIG_HIGHMEM
+ [PKMAP_BASE_NR] = { 0UL, "Persistent kmap() Area" },
+#endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ [LDT_NR] = { 0UL, "LDT remap" },
+#endif
+ [CPU_ENTRY_AREA_NR] = { 0UL, "CPU entry area" },
+ [FIXADDR_START_NR] = { 0UL, "Fixmap area" },
+ [END_OF_SPACE_NR] = { -1, NULL }
+};
+
+#define INIT_PGD (swapper_pg_dir)
+
+#endif /* !CONFIG_X86_64 */
+
+/* Multipliers for offsets within the PTEs */
+#define PTE_LEVEL_MULT (PAGE_SIZE)
+#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
+#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
+#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+#define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT)
+
+#define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \
+({ \
+ if (to_dmesg) \
+ printk(KERN_INFO fmt, ##args); \
+ else \
+ if (m) \
+ seq_printf(m, fmt, ##args); \
+})
+
+#define pt_dump_cont_printf(m, to_dmesg, fmt, args...) \
+({ \
+ if (to_dmesg) \
+ printk(KERN_CONT fmt, ##args); \
+ else \
+ if (m) \
+ seq_printf(m, fmt, ##args); \
+})
+
+/*
+ * Print a readable form of a pgprot_t to the seq_file
+ */
+static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
+{
+ pgprotval_t pr = pgprot_val(prot);
+ static const char * const level_name[] =
+ { "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
+
+ if (!(pr & _PAGE_PRESENT)) {
+ /* Not present */
+ pt_dump_cont_printf(m, dmsg, " ");
+ } else {
+ if (pr & _PAGE_USER)
+ pt_dump_cont_printf(m, dmsg, "USR ");
+ else
+ pt_dump_cont_printf(m, dmsg, " ");
+ if (pr & _PAGE_RW)
+ pt_dump_cont_printf(m, dmsg, "RW ");
+ else
+ pt_dump_cont_printf(m, dmsg, "ro ");
+ if (pr & _PAGE_PWT)
+ pt_dump_cont_printf(m, dmsg, "PWT ");
+ else
+ pt_dump_cont_printf(m, dmsg, " ");
+ if (pr & _PAGE_PCD)
+ pt_dump_cont_printf(m, dmsg, "PCD ");
+ else
+ pt_dump_cont_printf(m, dmsg, " ");
+
+ /* Bit 7 has a different meaning on level 3 vs 4 */
+ if (level <= 4 && pr & _PAGE_PSE)
+ pt_dump_cont_printf(m, dmsg, "PSE ");
+ else
+ pt_dump_cont_printf(m, dmsg, " ");
+ if ((level == 5 && pr & _PAGE_PAT) ||
+ ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE))
+ pt_dump_cont_printf(m, dmsg, "PAT ");
+ else
+ pt_dump_cont_printf(m, dmsg, " ");
+ if (pr & _PAGE_GLOBAL)
+ pt_dump_cont_printf(m, dmsg, "GLB ");
+ else
+ pt_dump_cont_printf(m, dmsg, " ");
+ if (pr & _PAGE_NX)
+ pt_dump_cont_printf(m, dmsg, "NX ");
+ else
+ pt_dump_cont_printf(m, dmsg, "x ");
+ }
+ pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
+}
+
+/*
+ * On 64 bits, sign-extend the 48 bit address to 64 bit
+ */
+static unsigned long normalize_addr(unsigned long u)
+{
+ int shift;
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return u;
+
+ shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
+ return (signed long)(u << shift) >> shift;
+}
+
+static void note_wx(struct pg_state *st)
+{
+ unsigned long npages;
+
+ npages = (st->current_address - st->start_address) / PAGE_SIZE;
+
+#ifdef CONFIG_PCI_BIOS
+ /*
+ * If PCI BIOS is enabled, the PCI BIOS area is forced to WX.
+ * Inform about it, but avoid the warning.
+ */
+ if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN &&
+ st->current_address <= PAGE_OFFSET + BIOS_END) {
+ pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages);
+ return;
+ }
+#endif
+ /* Account the WX pages */
+ st->wx_pages += npages;
+ WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %pS\n",
+ (void *)st->start_address);
+}
+
+/*
+ * This function gets called on a break in a continuous series
+ * of PTE entries; the next one is different so we need to
+ * print what we collected so far.
+ */
+static void note_page(struct seq_file *m, struct pg_state *st,
+ pgprot_t new_prot, pgprotval_t new_eff, int level)
+{
+ pgprotval_t prot, cur, eff;
+ static const char units[] = "BKMGTPE";
+
+ /*
+ * If we have a "break" in the series, we need to flush the state that
+ * we have now. "break" is either changing perms, levels or
+ * address space marker.
+ */
+ prot = pgprot_val(new_prot);
+ cur = pgprot_val(st->current_prot);
+ eff = st->effective_prot;
+
+ if (!st->level) {
+ /* First entry */
+ st->current_prot = new_prot;
+ st->effective_prot = new_eff;
+ st->level = level;
+ st->marker = address_markers;
+ st->lines = 0;
+ pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
+ st->marker->name);
+ } else if (prot != cur || new_eff != eff || level != st->level ||
+ st->current_address >= st->marker[1].start_address) {
+ const char *unit = units;
+ unsigned long delta;
+ int width = sizeof(unsigned long) * 2;
+
+ if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
+ note_wx(st);
+
+ /*
+ * Now print the actual finished series
+ */
+ if (!st->marker->max_lines ||
+ st->lines < st->marker->max_lines) {
+ pt_dump_seq_printf(m, st->to_dmesg,
+ "0x%0*lx-0x%0*lx ",
+ width, st->start_address,
+ width, st->current_address);
+
+ delta = st->current_address - st->start_address;
+ while (!(delta & 1023) && unit[1]) {
+ delta >>= 10;
+ unit++;
+ }
+ pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
+ delta, *unit);
+ printk_prot(m, st->current_prot, st->level,
+ st->to_dmesg);
+ }
+ st->lines++;
+
+ /*
+ * We print markers for special areas of address space,
+ * such as the start of vmalloc space etc.
+ * This helps in the interpretation.
+ */
+ if (st->current_address >= st->marker[1].start_address) {
+ if (st->marker->max_lines &&
+ st->lines > st->marker->max_lines) {
+ unsigned long nskip =
+ st->lines - st->marker->max_lines;
+ pt_dump_seq_printf(m, st->to_dmesg,
+ "... %lu entr%s skipped ... \n",
+ nskip,
+ nskip == 1 ? "y" : "ies");
+ }
+ st->marker++;
+ st->lines = 0;
+ pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
+ st->marker->name);
+ }
+
+ st->start_address = st->current_address;
+ st->current_prot = new_prot;
+ st->effective_prot = new_eff;
+ st->level = level;
+ }
+}
+
+static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2)
+{
+ return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) |
+ ((prot1 | prot2) & _PAGE_NX);
+}
+
+static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
+ pgprotval_t eff_in, unsigned long P)
+{
+ int i;
+ pte_t *pte;
+ pgprotval_t prot, eff;
+
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
+ pte = pte_offset_map(&addr, st->current_address);
+ prot = pte_flags(*pte);
+ eff = effective_prot(eff_in, prot);
+ note_page(m, st, __pgprot(prot), eff, 5);
+ pte_unmap(pte);
+ }
+}
+#ifdef CONFIG_KASAN
+
+/*
+ * This is an optimization for KASAN=y case. Since all kasan page tables
+ * eventually point to the kasan_zero_page we could call note_page()
+ * right away without walking through lower level page tables. This saves
+ * us dozens of seconds (minutes for 5-level config) while checking for
+ * W+X mapping or reading kernel_page_tables debugfs file.
+ */
+static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
+ void *pt)
+{
+ if (__pa(pt) == __pa(kasan_zero_pmd) ||
+ (pgtable_l5_enabled() && __pa(pt) == __pa(kasan_zero_p4d)) ||
+ __pa(pt) == __pa(kasan_zero_pud)) {
+ pgprotval_t prot = pte_flags(kasan_zero_pte[0]);
+ note_page(m, st, __pgprot(prot), 0, 5);
+ return true;
+ }
+ return false;
+}
+#else
+static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
+ void *pt)
+{
+ return false;
+}
+#endif
+
+#if PTRS_PER_PMD > 1
+
+static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
+ pgprotval_t eff_in, unsigned long P)
+{
+ int i;
+ pmd_t *start, *pmd_start;
+ pgprotval_t prot, eff;
+
+ pmd_start = start = (pmd_t *)pud_page_vaddr(addr);
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
+ if (!pmd_none(*start)) {
+ prot = pmd_flags(*start);
+ eff = effective_prot(eff_in, prot);
+ if (pmd_large(*start) || !pmd_present(*start)) {
+ note_page(m, st, __pgprot(prot), eff, 4);
+ } else if (!kasan_page_table(m, st, pmd_start)) {
+ walk_pte_level(m, st, *start, eff,
+ P + i * PMD_LEVEL_MULT);
+ }
+ } else
+ note_page(m, st, __pgprot(0), 0, 4);
+ start++;
+ }
+}
+
+#else
+#define walk_pmd_level(m,s,a,e,p) walk_pte_level(m,s,__pmd(pud_val(a)),e,p)
+#define pud_large(a) pmd_large(__pmd(pud_val(a)))
+#define pud_none(a) pmd_none(__pmd(pud_val(a)))
+#endif
+
+#if PTRS_PER_PUD > 1
+
+static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
+ pgprotval_t eff_in, unsigned long P)
+{
+ int i;
+ pud_t *start, *pud_start;
+ pgprotval_t prot, eff;
+ pud_t *prev_pud = NULL;
+
+ pud_start = start = (pud_t *)p4d_page_vaddr(addr);
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
+ if (!pud_none(*start)) {
+ prot = pud_flags(*start);
+ eff = effective_prot(eff_in, prot);
+ if (pud_large(*start) || !pud_present(*start)) {
+ note_page(m, st, __pgprot(prot), eff, 3);
+ } else if (!kasan_page_table(m, st, pud_start)) {
+ walk_pmd_level(m, st, *start, eff,
+ P + i * PUD_LEVEL_MULT);
+ }
+ } else
+ note_page(m, st, __pgprot(0), 0, 3);
+
+ prev_pud = start;
+ start++;
+ }
+}
+
+#else
+#define walk_pud_level(m,s,a,e,p) walk_pmd_level(m,s,__pud(p4d_val(a)),e,p)
+#define p4d_large(a) pud_large(__pud(p4d_val(a)))
+#define p4d_none(a) pud_none(__pud(p4d_val(a)))
+#endif
+
+static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
+ pgprotval_t eff_in, unsigned long P)
+{
+ int i;
+ p4d_t *start, *p4d_start;
+ pgprotval_t prot, eff;
+
+ if (PTRS_PER_P4D == 1)
+ return walk_pud_level(m, st, __p4d(pgd_val(addr)), eff_in, P);
+
+ p4d_start = start = (p4d_t *)pgd_page_vaddr(addr);
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
+ if (!p4d_none(*start)) {
+ prot = p4d_flags(*start);
+ eff = effective_prot(eff_in, prot);
+ if (p4d_large(*start) || !p4d_present(*start)) {
+ note_page(m, st, __pgprot(prot), eff, 2);
+ } else if (!kasan_page_table(m, st, p4d_start)) {
+ walk_pud_level(m, st, *start, eff,
+ P + i * P4D_LEVEL_MULT);
+ }
+ } else
+ note_page(m, st, __pgprot(0), 0, 2);
+
+ start++;
+ }
+}
+
+#define pgd_large(a) (pgtable_l5_enabled() ? pgd_large(a) : p4d_large(__p4d(pgd_val(a))))
+#define pgd_none(a) (pgtable_l5_enabled() ? pgd_none(a) : p4d_none(__p4d(pgd_val(a))))
+
+static inline bool is_hypervisor_range(int idx)
+{
+#ifdef CONFIG_X86_64
+ /*
+ * A hole in the beginning of kernel address space reserved
+ * for a hypervisor.
+ */
+ return (idx >= pgd_index(GUARD_HOLE_BASE_ADDR)) &&
+ (idx < pgd_index(GUARD_HOLE_END_ADDR));
+#else
+ return false;
+#endif
+}
+
+static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
+ bool checkwx, bool dmesg)
+{
+ pgd_t *start = INIT_PGD;
+ pgprotval_t prot, eff;
+ int i;
+ struct pg_state st = {};
+
+ if (pgd) {
+ start = pgd;
+ st.to_dmesg = dmesg;
+ }
+
+ st.check_wx = checkwx;
+ if (checkwx)
+ st.wx_pages = 0;
+
+ for (i = 0; i < PTRS_PER_PGD; i++) {
+ st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
+ if (!pgd_none(*start) && !is_hypervisor_range(i)) {
+ prot = pgd_flags(*start);
+#ifdef CONFIG_X86_PAE
+ eff = _PAGE_USER | _PAGE_RW;
+#else
+ eff = prot;
+#endif
+ if (pgd_large(*start) || !pgd_present(*start)) {
+ note_page(m, &st, __pgprot(prot), eff, 1);
+ } else {
+ walk_p4d_level(m, &st, *start, eff,
+ i * PGD_LEVEL_MULT);
+ }
+ } else
+ note_page(m, &st, __pgprot(0), 0, 1);
+
+ cond_resched();
+ start++;
+ }
+
+ /* Flush out the last page */
+ st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
+ note_page(m, &st, __pgprot(0), 0, 0);
+ if (!checkwx)
+ return;
+ if (st.wx_pages)
+ pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
+ st.wx_pages);
+ else
+ pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
+}
+
+void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
+{
+ ptdump_walk_pgd_level_core(m, pgd, false, true);
+}
+
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ if (user && static_cpu_has(X86_FEATURE_PTI))
+ pgd = kernel_to_user_pgdp(pgd);
+#endif
+ ptdump_walk_pgd_level_core(m, pgd, false, false);
+}
+EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
+
+void ptdump_walk_user_pgd_level_checkwx(void)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ pgd_t *pgd = INIT_PGD;
+
+ if (!(__supported_pte_mask & _PAGE_NX) ||
+ !static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ pr_info("x86/mm: Checking user space page tables\n");
+ pgd = kernel_to_user_pgdp(pgd);
+ ptdump_walk_pgd_level_core(NULL, pgd, true, false);
+#endif
+}
+
+void ptdump_walk_pgd_level_checkwx(void)
+{
+ ptdump_walk_pgd_level_core(NULL, NULL, true, false);
+}
+
+static int __init pt_dump_init(void)
+{
+ /*
+ * Various markers are not compile-time constants, so assign them
+ * here.
+ */
+#ifdef CONFIG_X86_64
+ address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
+ address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
+ address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
+#endif
+#ifdef CONFIG_KASAN
+ address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START;
+ address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END;
+#endif
+#endif
+#ifdef CONFIG_X86_32
+ address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
+ address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
+# ifdef CONFIG_HIGHMEM
+ address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
+# endif
+ address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
+ address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
+# ifdef CONFIG_MODIFY_LDT_SYSCALL
+ address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
+# endif
+#endif
+ return 0;
+}
+__initcall(pt_dump_init);
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
new file mode 100644
index 000000000..45f5d6cf6
--- /dev/null
+++ b/arch/x86/mm/extable.c
@@ -0,0 +1,251 @@
+#include <linux/extable.h>
+#include <linux/uaccess.h>
+#include <linux/sched/debug.h>
+#include <xen/xen.h>
+
+#include <asm/fpu/internal.h>
+#include <asm/traps.h>
+#include <asm/kdebug.h>
+
+typedef bool (*ex_handler_t)(const struct exception_table_entry *,
+ struct pt_regs *, int);
+
+static inline unsigned long
+ex_fixup_addr(const struct exception_table_entry *x)
+{
+ return (unsigned long)&x->fixup + x->fixup;
+}
+static inline ex_handler_t
+ex_fixup_handler(const struct exception_table_entry *x)
+{
+ return (ex_handler_t)((unsigned long)&x->handler + x->handler);
+}
+
+__visible bool ex_handler_default(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
+{
+ regs->ip = ex_fixup_addr(fixup);
+ return true;
+}
+EXPORT_SYMBOL(ex_handler_default);
+
+__visible bool ex_handler_fault(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
+{
+ regs->ip = ex_fixup_addr(fixup);
+ regs->ax = trapnr;
+ return true;
+}
+EXPORT_SYMBOL_GPL(ex_handler_fault);
+
+/*
+ * Handler for UD0 exception following a failed test against the
+ * result of a refcount inc/dec/add/sub.
+ */
+__visible bool ex_handler_refcount(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
+{
+ /* First unconditionally saturate the refcount. */
+ *(int *)regs->cx = INT_MIN / 2;
+
+ /*
+ * Strictly speaking, this reports the fixup destination, not
+ * the fault location, and not the actually overflowing
+ * instruction, which is the instruction before the "js", but
+ * since that instruction could be a variety of lengths, just
+ * report the location after the overflow, which should be close
+ * enough for finding the overflow, as it's at least back in
+ * the function, having returned from .text.unlikely.
+ */
+ regs->ip = ex_fixup_addr(fixup);
+
+ /*
+ * This function has been called because either a negative refcount
+ * value was seen by any of the refcount functions, or a zero
+ * refcount value was seen by refcount_dec().
+ *
+ * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result
+ * wrapped around) will be set. Additionally, seeing the refcount
+ * reach 0 will set ZF (Zero Flag: result was zero). In each of
+ * these cases we want a report, since it's a boundary condition.
+ * The SF case is not reported since it indicates post-boundary
+ * manipulations below zero or above INT_MAX. And if none of the
+ * flags are set, something has gone very wrong, so report it.
+ */
+ if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) {
+ bool zero = regs->flags & X86_EFLAGS_ZF;
+
+ refcount_error_report(regs, zero ? "hit zero" : "overflow");
+ } else if ((regs->flags & X86_EFLAGS_SF) == 0) {
+ /* Report if none of OF, ZF, nor SF are set. */
+ refcount_error_report(regs, "unexpected saturation");
+ }
+
+ return true;
+}
+EXPORT_SYMBOL(ex_handler_refcount);
+
+/*
+ * Handler for when we fail to restore a task's FPU state. We should never get
+ * here because the FPU state of a task using the FPU (task->thread.fpu.state)
+ * should always be valid. However, past bugs have allowed userspace to set
+ * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn().
+ * These caused XRSTOR to fail when switching to the task, leaking the FPU
+ * registers of the task previously executing on the CPU. Mitigate this class
+ * of vulnerability by restoring from the initial state (essentially, zeroing
+ * out all the FPU registers) if we can't restore from the task's FPU state.
+ */
+__visible bool ex_handler_fprestore(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
+{
+ regs->ip = ex_fixup_addr(fixup);
+
+ WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
+ (void *)instruction_pointer(regs));
+
+ __copy_kernel_to_fpregs(&init_fpstate, -1);
+ return true;
+}
+EXPORT_SYMBOL_GPL(ex_handler_fprestore);
+
+__visible bool ex_handler_ext(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
+{
+ /* Special hack for uaccess_err */
+ current->thread.uaccess_err = 1;
+ regs->ip = ex_fixup_addr(fixup);
+ return true;
+}
+EXPORT_SYMBOL(ex_handler_ext);
+
+__visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
+{
+ if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pF)\n",
+ (unsigned int)regs->cx, regs->ip, (void *)regs->ip))
+ show_stack_regs(regs);
+
+ /* Pretend that the read succeeded and returned 0. */
+ regs->ip = ex_fixup_addr(fixup);
+ regs->ax = 0;
+ regs->dx = 0;
+ return true;
+}
+EXPORT_SYMBOL(ex_handler_rdmsr_unsafe);
+
+__visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
+{
+ if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pF)\n",
+ (unsigned int)regs->cx, (unsigned int)regs->dx,
+ (unsigned int)regs->ax, regs->ip, (void *)regs->ip))
+ show_stack_regs(regs);
+
+ /* Pretend that the write succeeded. */
+ regs->ip = ex_fixup_addr(fixup);
+ return true;
+}
+EXPORT_SYMBOL(ex_handler_wrmsr_unsafe);
+
+__visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
+{
+ if (static_cpu_has(X86_BUG_NULL_SEG))
+ asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS));
+ asm volatile ("mov %0, %%fs" : : "rm" (0));
+ return ex_handler_default(fixup, regs, trapnr);
+}
+EXPORT_SYMBOL(ex_handler_clear_fs);
+
+__visible bool ex_has_fault_handler(unsigned long ip)
+{
+ const struct exception_table_entry *e;
+ ex_handler_t handler;
+
+ e = search_exception_tables(ip);
+ if (!e)
+ return false;
+ handler = ex_fixup_handler(e);
+
+ return handler == ex_handler_fault;
+}
+
+int fixup_exception(struct pt_regs *regs, int trapnr)
+{
+ const struct exception_table_entry *e;
+ ex_handler_t handler;
+
+#ifdef CONFIG_PNPBIOS
+ if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
+ extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
+ extern u32 pnp_bios_is_utter_crap;
+ pnp_bios_is_utter_crap = 1;
+ printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
+ __asm__ volatile(
+ "movl %0, %%esp\n\t"
+ "jmp *%1\n\t"
+ : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
+ panic("do_trap: can't hit this");
+ }
+#endif
+
+ e = search_exception_tables(regs->ip);
+ if (!e)
+ return 0;
+
+ handler = ex_fixup_handler(e);
+ return handler(e, regs, trapnr);
+}
+
+extern unsigned int early_recursion_flag;
+
+/* Restricted version used during very early boot */
+void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
+{
+ /* Ignore early NMIs. */
+ if (trapnr == X86_TRAP_NMI)
+ return;
+
+ if (early_recursion_flag > 2)
+ goto halt_loop;
+
+ /*
+ * Old CPUs leave the high bits of CS on the stack
+ * undefined. I'm not sure which CPUs do this, but at least
+ * the 486 DX works this way.
+ * Xen pv domains are not using the default __KERNEL_CS.
+ */
+ if (!xen_pv_domain() && regs->cs != __KERNEL_CS)
+ goto fail;
+
+ /*
+ * The full exception fixup machinery is available as soon as
+ * the early IDT is loaded. This means that it is the
+ * responsibility of extable users to either function correctly
+ * when handlers are invoked early or to simply avoid causing
+ * exceptions before they're ready to handle them.
+ *
+ * This is better than filtering which handlers can be used,
+ * because refusing to call a handler here is guaranteed to
+ * result in a hard-to-debug panic.
+ *
+ * Keep in mind that not all vectors actually get here. Early
+ * fage faults, for example, are special.
+ */
+ if (fixup_exception(regs, trapnr))
+ return;
+
+ if (fixup_bug(regs, trapnr))
+ return;
+
+fail:
+ early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n",
+ (unsigned)trapnr, (unsigned long)regs->cs, regs->ip,
+ regs->orig_ax, read_cr2());
+
+ show_regs(regs);
+
+halt_loop:
+ while (true)
+ halt();
+}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
new file mode 100644
index 000000000..c61acf635
--- /dev/null
+++ b/arch/x86/mm/fault.c
@@ -0,0 +1,1490 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1995 Linus Torvalds
+ * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
+ * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
+ */
+#include <linux/sched.h> /* test_thread_flag(), ... */
+#include <linux/sched/task_stack.h> /* task_stack_*(), ... */
+#include <linux/kdebug.h> /* oops_begin/end, ... */
+#include <linux/extable.h> /* search_exception_tables */
+#include <linux/bootmem.h> /* max_low_pfn */
+#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
+#include <linux/mmiotrace.h> /* kmmio_handler, ... */
+#include <linux/perf_event.h> /* perf_sw_event */
+#include <linux/hugetlb.h> /* hstate_index_to_shift */
+#include <linux/prefetch.h> /* prefetchw */
+#include <linux/context_tracking.h> /* exception_enter(), ... */
+#include <linux/uaccess.h> /* faulthandler_disabled() */
+#include <linux/mm_types.h>
+
+#include <asm/cpufeature.h> /* boot_cpu_has, ... */
+#include <asm/traps.h> /* dotraplinkage, ... */
+#include <asm/pgalloc.h> /* pgd_*(), ... */
+#include <asm/fixmap.h> /* VSYSCALL_ADDR */
+#include <asm/vsyscall.h> /* emulate_vsyscall */
+#include <asm/vm86.h> /* struct vm86 */
+#include <asm/mmu_context.h> /* vma_pkey() */
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/exceptions.h>
+
+/*
+ * Returns 0 if mmiotrace is disabled, or if the fault is not
+ * handled by mmiotrace:
+ */
+static nokprobe_inline int
+kmmio_fault(struct pt_regs *regs, unsigned long addr)
+{
+ if (unlikely(is_kmmio_active()))
+ if (kmmio_handler(regs, addr) == 1)
+ return -1;
+ return 0;
+}
+
+static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
+{
+ int ret = 0;
+
+ /* kprobe_running() needs smp_processor_id() */
+ if (kprobes_built_in() && !user_mode(regs)) {
+ preempt_disable();
+ if (kprobe_running() && kprobe_fault_handler(regs, 14))
+ ret = 1;
+ preempt_enable();
+ }
+
+ return ret;
+}
+
+/*
+ * Prefetch quirks:
+ *
+ * 32-bit mode:
+ *
+ * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
+ * Check that here and ignore it.
+ *
+ * 64-bit mode:
+ *
+ * Sometimes the CPU reports invalid exceptions on prefetch.
+ * Check that here and ignore it.
+ *
+ * Opcode checker based on code by Richard Brunner.
+ */
+static inline int
+check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
+ unsigned char opcode, int *prefetch)
+{
+ unsigned char instr_hi = opcode & 0xf0;
+ unsigned char instr_lo = opcode & 0x0f;
+
+ switch (instr_hi) {
+ case 0x20:
+ case 0x30:
+ /*
+ * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
+ * In X86_64 long mode, the CPU will signal invalid
+ * opcode if some of these prefixes are present so
+ * X86_64 will never get here anyway
+ */
+ return ((instr_lo & 7) == 0x6);
+#ifdef CONFIG_X86_64
+ case 0x40:
+ /*
+ * In AMD64 long mode 0x40..0x4F are valid REX prefixes
+ * Need to figure out under what instruction mode the
+ * instruction was issued. Could check the LDT for lm,
+ * but for now it's good enough to assume that long
+ * mode only uses well known segments or kernel.
+ */
+ return (!user_mode(regs) || user_64bit_mode(regs));
+#endif
+ case 0x60:
+ /* 0x64 thru 0x67 are valid prefixes in all modes. */
+ return (instr_lo & 0xC) == 0x4;
+ case 0xF0:
+ /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
+ return !instr_lo || (instr_lo>>1) == 1;
+ case 0x00:
+ /* Prefetch instruction is 0x0F0D or 0x0F18 */
+ if (probe_kernel_address(instr, opcode))
+ return 0;
+
+ *prefetch = (instr_lo == 0xF) &&
+ (opcode == 0x0D || opcode == 0x18);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+static int
+is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
+{
+ unsigned char *max_instr;
+ unsigned char *instr;
+ int prefetch = 0;
+
+ /*
+ * If it was a exec (instruction fetch) fault on NX page, then
+ * do not ignore the fault:
+ */
+ if (error_code & X86_PF_INSTR)
+ return 0;
+
+ instr = (void *)convert_ip_to_linear(current, regs);
+ max_instr = instr + 15;
+
+ if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
+ return 0;
+
+ while (instr < max_instr) {
+ unsigned char opcode;
+
+ if (probe_kernel_address(instr, opcode))
+ break;
+
+ instr++;
+
+ if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
+ break;
+ }
+ return prefetch;
+}
+
+/*
+ * A protection key fault means that the PKRU value did not allow
+ * access to some PTE. Userspace can figure out what PKRU was
+ * from the XSAVE state, and this function fills out a field in
+ * siginfo so userspace can discover which protection key was set
+ * on the PTE.
+ *
+ * If we get here, we know that the hardware signaled a X86_PF_PK
+ * fault and that there was a VMA once we got in the fault
+ * handler. It does *not* guarantee that the VMA we find here
+ * was the one that we faulted on.
+ *
+ * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
+ * 2. T1 : set PKRU to deny access to pkey=4, touches page
+ * 3. T1 : faults...
+ * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
+ * 5. T1 : enters fault handler, takes mmap_sem, etc...
+ * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
+ * faulted on a pte with its pkey=4.
+ */
+static void fill_sig_info_pkey(int si_signo, int si_code, siginfo_t *info,
+ u32 *pkey)
+{
+ /* This is effectively an #ifdef */
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return;
+
+ /* Fault not from Protection Keys: nothing to do */
+ if ((si_code != SEGV_PKUERR) || (si_signo != SIGSEGV))
+ return;
+ /*
+ * force_sig_info_fault() is called from a number of
+ * contexts, some of which have a VMA and some of which
+ * do not. The X86_PF_PK handing happens after we have a
+ * valid VMA, so we should never reach this without a
+ * valid VMA.
+ */
+ if (!pkey) {
+ WARN_ONCE(1, "PKU fault with no VMA passed in");
+ info->si_pkey = 0;
+ return;
+ }
+ /*
+ * si_pkey should be thought of as a strong hint, but not
+ * absolutely guranteed to be 100% accurate because of
+ * the race explained above.
+ */
+ info->si_pkey = *pkey;
+}
+
+static void
+force_sig_info_fault(int si_signo, int si_code, unsigned long address,
+ struct task_struct *tsk, u32 *pkey, int fault)
+{
+ unsigned lsb = 0;
+ siginfo_t info;
+
+ clear_siginfo(&info);
+ info.si_signo = si_signo;
+ info.si_errno = 0;
+ info.si_code = si_code;
+ info.si_addr = (void __user *)address;
+ if (fault & VM_FAULT_HWPOISON_LARGE)
+ lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+ if (fault & VM_FAULT_HWPOISON)
+ lsb = PAGE_SHIFT;
+ info.si_addr_lsb = lsb;
+
+ fill_sig_info_pkey(si_signo, si_code, &info, pkey);
+
+ force_sig_info(si_signo, &info, tsk);
+}
+
+DEFINE_SPINLOCK(pgd_lock);
+LIST_HEAD(pgd_list);
+
+#ifdef CONFIG_X86_32
+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
+{
+ unsigned index = pgd_index(address);
+ pgd_t *pgd_k;
+ p4d_t *p4d, *p4d_k;
+ pud_t *pud, *pud_k;
+ pmd_t *pmd, *pmd_k;
+
+ pgd += index;
+ pgd_k = init_mm.pgd + index;
+
+ if (!pgd_present(*pgd_k))
+ return NULL;
+
+ /*
+ * set_pgd(pgd, *pgd_k); here would be useless on PAE
+ * and redundant with the set_pmd() on non-PAE. As would
+ * set_p4d/set_pud.
+ */
+ p4d = p4d_offset(pgd, address);
+ p4d_k = p4d_offset(pgd_k, address);
+ if (!p4d_present(*p4d_k))
+ return NULL;
+
+ pud = pud_offset(p4d, address);
+ pud_k = pud_offset(p4d_k, address);
+ if (!pud_present(*pud_k))
+ return NULL;
+
+ pmd = pmd_offset(pud, address);
+ pmd_k = pmd_offset(pud_k, address);
+
+ if (pmd_present(*pmd) != pmd_present(*pmd_k))
+ set_pmd(pmd, *pmd_k);
+
+ if (!pmd_present(*pmd_k))
+ return NULL;
+ else
+ BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k));
+
+ return pmd_k;
+}
+
+static void vmalloc_sync(void)
+{
+ unsigned long address;
+
+ if (SHARED_KERNEL_PMD)
+ return;
+
+ for (address = VMALLOC_START & PMD_MASK;
+ address >= TASK_SIZE_MAX && address < VMALLOC_END;
+ address += PMD_SIZE) {
+ struct page *page;
+
+ spin_lock(&pgd_lock);
+ list_for_each_entry(page, &pgd_list, lru) {
+ spinlock_t *pgt_lock;
+
+ /* the pgt_lock only for Xen */
+ pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+
+ spin_lock(pgt_lock);
+ vmalloc_sync_one(page_address(page), address);
+ spin_unlock(pgt_lock);
+ }
+ spin_unlock(&pgd_lock);
+ }
+}
+
+void vmalloc_sync_mappings(void)
+{
+ vmalloc_sync();
+}
+
+void vmalloc_sync_unmappings(void)
+{
+ vmalloc_sync();
+}
+
+/*
+ * 32-bit:
+ *
+ * Handle a fault on the vmalloc or module mapping area
+ */
+static noinline int vmalloc_fault(unsigned long address)
+{
+ unsigned long pgd_paddr;
+ pmd_t *pmd_k;
+ pte_t *pte_k;
+
+ /* Make sure we are in vmalloc area: */
+ if (!(address >= VMALLOC_START && address < VMALLOC_END))
+ return -1;
+
+ /*
+ * Synchronize this task's top level page-table
+ * with the 'reference' page table.
+ *
+ * Do _not_ use "current" here. We might be inside
+ * an interrupt in the middle of a task switch..
+ */
+ pgd_paddr = read_cr3_pa();
+ pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+ if (!pmd_k)
+ return -1;
+
+ if (pmd_large(*pmd_k))
+ return 0;
+
+ pte_k = pte_offset_kernel(pmd_k, address);
+ if (!pte_present(*pte_k))
+ return -1;
+
+ return 0;
+}
+NOKPROBE_SYMBOL(vmalloc_fault);
+
+/*
+ * Did it hit the DOS screen memory VA from vm86 mode?
+ */
+static inline void
+check_v8086_mode(struct pt_regs *regs, unsigned long address,
+ struct task_struct *tsk)
+{
+#ifdef CONFIG_VM86
+ unsigned long bit;
+
+ if (!v8086_mode(regs) || !tsk->thread.vm86)
+ return;
+
+ bit = (address - 0xA0000) >> PAGE_SHIFT;
+ if (bit < 32)
+ tsk->thread.vm86->screen_bitmap |= 1 << bit;
+#endif
+}
+
+static bool low_pfn(unsigned long pfn)
+{
+ return pfn < max_low_pfn;
+}
+
+static void dump_pagetable(unsigned long address)
+{
+ pgd_t *base = __va(read_cr3_pa());
+ pgd_t *pgd = &base[pgd_index(address)];
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+#ifdef CONFIG_X86_PAE
+ pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
+ if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
+ goto out;
+#define pr_pde pr_cont
+#else
+#define pr_pde pr_info
+#endif
+ p4d = p4d_offset(pgd, address);
+ pud = pud_offset(p4d, address);
+ pmd = pmd_offset(pud, address);
+ pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
+#undef pr_pde
+
+ /*
+ * We must not directly access the pte in the highpte
+ * case if the page table is located in highmem.
+ * And let's rather not kmap-atomic the pte, just in case
+ * it's allocated already:
+ */
+ if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
+ goto out;
+
+ pte = pte_offset_kernel(pmd, address);
+ pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
+out:
+ pr_cont("\n");
+}
+
+#else /* CONFIG_X86_64: */
+
+void vmalloc_sync_mappings(void)
+{
+ /*
+ * 64-bit mappings might allocate new p4d/pud pages
+ * that need to be propagated to all tasks' PGDs.
+ */
+ sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
+}
+
+void vmalloc_sync_unmappings(void)
+{
+ /*
+ * Unmappings never allocate or free p4d/pud pages.
+ * No work is required here.
+ */
+}
+
+/*
+ * 64-bit:
+ *
+ * Handle a fault on the vmalloc area
+ */
+static noinline int vmalloc_fault(unsigned long address)
+{
+ pgd_t *pgd, *pgd_k;
+ p4d_t *p4d, *p4d_k;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ /* Make sure we are in vmalloc area: */
+ if (!(address >= VMALLOC_START && address < VMALLOC_END))
+ return -1;
+
+ /*
+ * Copy kernel mappings over when needed. This can also
+ * happen within a race in page table update. In the later
+ * case just flush:
+ */
+ pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
+ pgd_k = pgd_offset_k(address);
+ if (pgd_none(*pgd_k))
+ return -1;
+
+ if (pgtable_l5_enabled()) {
+ if (pgd_none(*pgd)) {
+ set_pgd(pgd, *pgd_k);
+ arch_flush_lazy_mmu_mode();
+ } else {
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k));
+ }
+ }
+
+ /* With 4-level paging, copying happens on the p4d level. */
+ p4d = p4d_offset(pgd, address);
+ p4d_k = p4d_offset(pgd_k, address);
+ if (p4d_none(*p4d_k))
+ return -1;
+
+ if (p4d_none(*p4d) && !pgtable_l5_enabled()) {
+ set_p4d(p4d, *p4d_k);
+ arch_flush_lazy_mmu_mode();
+ } else {
+ BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k));
+ }
+
+ BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);
+
+ pud = pud_offset(p4d, address);
+ if (pud_none(*pud))
+ return -1;
+
+ if (pud_large(*pud))
+ return 0;
+
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd))
+ return -1;
+
+ if (pmd_large(*pmd))
+ return 0;
+
+ pte = pte_offset_kernel(pmd, address);
+ if (!pte_present(*pte))
+ return -1;
+
+ return 0;
+}
+NOKPROBE_SYMBOL(vmalloc_fault);
+
+#ifdef CONFIG_CPU_SUP_AMD
+static const char errata93_warning[] =
+KERN_ERR
+"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+"******* Working around it, but it may cause SEGVs or burn power.\n"
+"******* Please consider a BIOS update.\n"
+"******* Disabling USB legacy in the BIOS may also help.\n";
+#endif
+
+/*
+ * No vm86 mode in 64-bit mode:
+ */
+static inline void
+check_v8086_mode(struct pt_regs *regs, unsigned long address,
+ struct task_struct *tsk)
+{
+}
+
+static int bad_address(void *p)
+{
+ unsigned long dummy;
+
+ return probe_kernel_address((unsigned long *)p, dummy);
+}
+
+static void dump_pagetable(unsigned long address)
+{
+ pgd_t *base = __va(read_cr3_pa());
+ pgd_t *pgd = base + pgd_index(address);
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ if (bad_address(pgd))
+ goto bad;
+
+ pr_info("PGD %lx ", pgd_val(*pgd));
+
+ if (!pgd_present(*pgd))
+ goto out;
+
+ p4d = p4d_offset(pgd, address);
+ if (bad_address(p4d))
+ goto bad;
+
+ pr_cont("P4D %lx ", p4d_val(*p4d));
+ if (!p4d_present(*p4d) || p4d_large(*p4d))
+ goto out;
+
+ pud = pud_offset(p4d, address);
+ if (bad_address(pud))
+ goto bad;
+
+ pr_cont("PUD %lx ", pud_val(*pud));
+ if (!pud_present(*pud) || pud_large(*pud))
+ goto out;
+
+ pmd = pmd_offset(pud, address);
+ if (bad_address(pmd))
+ goto bad;
+
+ pr_cont("PMD %lx ", pmd_val(*pmd));
+ if (!pmd_present(*pmd) || pmd_large(*pmd))
+ goto out;
+
+ pte = pte_offset_kernel(pmd, address);
+ if (bad_address(pte))
+ goto bad;
+
+ pr_cont("PTE %lx", pte_val(*pte));
+out:
+ pr_cont("\n");
+ return;
+bad:
+ pr_info("BAD\n");
+}
+
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Workaround for K8 erratum #93 & buggy BIOS.
+ *
+ * BIOS SMM functions are required to use a specific workaround
+ * to avoid corruption of the 64bit RIP register on C stepping K8.
+ *
+ * A lot of BIOS that didn't get tested properly miss this.
+ *
+ * The OS sees this as a page fault with the upper 32bits of RIP cleared.
+ * Try to work around it here.
+ *
+ * Note we only handle faults in kernel here.
+ * Does nothing on 32-bit.
+ */
+static int is_errata93(struct pt_regs *regs, unsigned long address)
+{
+#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
+ || boot_cpu_data.x86 != 0xf)
+ return 0;
+
+ if (address != regs->ip)
+ return 0;
+
+ if ((address >> 32) != 0)
+ return 0;
+
+ address |= 0xffffffffUL << 32;
+ if ((address >= (u64)_stext && address <= (u64)_etext) ||
+ (address >= MODULES_VADDR && address <= MODULES_END)) {
+ printk_once(errata93_warning);
+ regs->ip = address;
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+/*
+ * Work around K8 erratum #100 K8 in compat mode occasionally jumps
+ * to illegal addresses >4GB.
+ *
+ * We catch this in the page fault handler because these addresses
+ * are not reachable. Just detect this case and return. Any code
+ * segment in LDT is compatibility mode.
+ */
+static int is_errata100(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_64
+ if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
+ return 1;
+#endif
+ return 0;
+}
+
+static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_F00F_BUG
+ unsigned long nr;
+
+ /*
+ * Pentium F0 0F C7 C8 bug workaround:
+ */
+ if (boot_cpu_has_bug(X86_BUG_F00F)) {
+ nr = (address - idt_descr.address) >> 3;
+
+ if (nr == 6) {
+ do_invalid_op(regs, 0);
+ return 1;
+ }
+ }
+#endif
+ return 0;
+}
+
+static void
+show_fault_oops(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
+{
+ if (!oops_may_print())
+ return;
+
+ if (error_code & X86_PF_INSTR) {
+ unsigned int level;
+ pgd_t *pgd;
+ pte_t *pte;
+
+ pgd = __va(read_cr3_pa());
+ pgd += pgd_index(address);
+
+ pte = lookup_address_in_pgd(pgd, address, &level);
+
+ if (pte && pte_present(*pte) && !pte_exec(*pte))
+ pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
+ from_kuid(&init_user_ns, current_uid()));
+ if (pte && pte_present(*pte) && pte_exec(*pte) &&
+ (pgd_flags(*pgd) & _PAGE_USER) &&
+ (__read_cr4() & X86_CR4_SMEP))
+ pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
+ from_kuid(&init_user_ns, current_uid()));
+ }
+
+ pr_alert("BUG: unable to handle kernel %s at %px\n",
+ address < PAGE_SIZE ? "NULL pointer dereference" : "paging request",
+ (void *)address);
+
+ dump_pagetable(address);
+}
+
+static noinline void
+pgtable_bad(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
+{
+ struct task_struct *tsk;
+ unsigned long flags;
+ int sig;
+
+ flags = oops_begin();
+ tsk = current;
+ sig = SIGKILL;
+
+ printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
+ tsk->comm, address);
+ dump_pagetable(address);
+
+ tsk->thread.cr2 = address;
+ tsk->thread.trap_nr = X86_TRAP_PF;
+ tsk->thread.error_code = error_code;
+
+ if (__die("Bad pagetable", regs, error_code))
+ sig = 0;
+
+ oops_end(flags, regs, sig);
+}
+
+static noinline void
+no_context(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, int signal, int si_code)
+{
+ struct task_struct *tsk = current;
+ unsigned long flags;
+ int sig;
+
+ /* Are we prepared to handle this kernel fault? */
+ if (fixup_exception(regs, X86_TRAP_PF)) {
+ /*
+ * Any interrupt that takes a fault gets the fixup. This makes
+ * the below recursive fault logic only apply to a faults from
+ * task context.
+ */
+ if (in_interrupt())
+ return;
+
+ /*
+ * Per the above we're !in_interrupt(), aka. task context.
+ *
+ * In this case we need to make sure we're not recursively
+ * faulting through the emulate_vsyscall() logic.
+ */
+ if (current->thread.sig_on_uaccess_err && signal) {
+ tsk->thread.trap_nr = X86_TRAP_PF;
+ tsk->thread.error_code = error_code | X86_PF_USER;
+ tsk->thread.cr2 = address;
+
+ /* XXX: hwpoison faults will set the wrong code. */
+ force_sig_info_fault(signal, si_code, address,
+ tsk, NULL, 0);
+ }
+
+ /*
+ * Barring that, we can do the fixup and be happy.
+ */
+ return;
+ }
+
+#ifdef CONFIG_VMAP_STACK
+ /*
+ * Stack overflow? During boot, we can fault near the initial
+ * stack in the direct map, but that's not an overflow -- check
+ * that we're in vmalloc space to avoid this.
+ */
+ if (is_vmalloc_addr((void *)address) &&
+ (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
+ address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
+ unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *);
+ /*
+ * We're likely to be running with very little stack space
+ * left. It's plausible that we'd hit this condition but
+ * double-fault even before we get this far, in which case
+ * we're fine: the double-fault handler will deal with it.
+ *
+ * We don't want to make it all the way into the oops code
+ * and then double-fault, though, because we're likely to
+ * break the console driver and lose most of the stack dump.
+ */
+ asm volatile ("movq %[stack], %%rsp\n\t"
+ "call handle_stack_overflow\n\t"
+ "1: jmp 1b"
+ : ASM_CALL_CONSTRAINT
+ : "D" ("kernel stack overflow (page fault)"),
+ "S" (regs), "d" (address),
+ [stack] "rm" (stack));
+ unreachable();
+ }
+#endif
+
+ /*
+ * 32-bit:
+ *
+ * Valid to do another page fault here, because if this fault
+ * had been triggered by is_prefetch fixup_exception would have
+ * handled it.
+ *
+ * 64-bit:
+ *
+ * Hall of shame of CPU/BIOS bugs.
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+ if (is_errata93(regs, address))
+ return;
+
+ /*
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice:
+ */
+ flags = oops_begin();
+
+ show_fault_oops(regs, error_code, address);
+
+ if (task_stack_end_corrupted(tsk))
+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
+ tsk->thread.cr2 = address;
+ tsk->thread.trap_nr = X86_TRAP_PF;
+ tsk->thread.error_code = error_code;
+
+ sig = SIGKILL;
+ if (__die("Oops", regs, error_code))
+ sig = 0;
+
+ /* Executive summary in case the body of the oops scrolled away */
+ printk(KERN_DEFAULT "CR2: %016lx\n", address);
+
+ oops_end(flags, regs, sig);
+}
+
+/*
+ * Print out info about fatal segfaults, if the show_unhandled_signals
+ * sysctl is set:
+ */
+static inline void
+show_signal_msg(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, struct task_struct *tsk)
+{
+ const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
+
+ if (!unhandled_signal(tsk, SIGSEGV))
+ return;
+
+ if (!printk_ratelimit())
+ return;
+
+ printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
+ loglvl, tsk->comm, task_pid_nr(tsk), address,
+ (void *)regs->ip, (void *)regs->sp, error_code);
+
+ print_vma_addr(KERN_CONT " in ", regs->ip);
+
+ printk(KERN_CONT "\n");
+
+ show_opcodes(regs, loglvl);
+}
+
+static void
+__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, u32 *pkey, int si_code)
+{
+ struct task_struct *tsk = current;
+
+ /* User mode accesses just cause a SIGSEGV */
+ if (error_code & X86_PF_USER) {
+ /*
+ * It's possible to have interrupts off here:
+ */
+ local_irq_enable();
+
+ /*
+ * Valid to do another page fault here because this one came
+ * from user space:
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+ if (is_errata100(regs, address))
+ return;
+
+#ifdef CONFIG_X86_64
+ /*
+ * Instruction fetch faults in the vsyscall page might need
+ * emulation.
+ */
+ if (unlikely((error_code & X86_PF_INSTR) &&
+ ((address & ~0xfff) == VSYSCALL_ADDR))) {
+ if (emulate_vsyscall(regs, address))
+ return;
+ }
+#endif
+
+ /*
+ * To avoid leaking information about the kernel page table
+ * layout, pretend that user-mode accesses to kernel addresses
+ * are always protection faults.
+ */
+ if (address >= TASK_SIZE_MAX)
+ error_code |= X86_PF_PROT;
+
+ if (likely(show_unhandled_signals))
+ show_signal_msg(regs, error_code, address, tsk);
+
+ tsk->thread.cr2 = address;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_PF;
+
+ force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0);
+
+ return;
+ }
+
+ if (is_f00f_bug(regs, address))
+ return;
+
+ no_context(regs, error_code, address, SIGSEGV, si_code);
+}
+
+static noinline void
+bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, u32 *pkey)
+{
+ __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR);
+}
+
+static void
+__bad_area(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, struct vm_area_struct *vma, int si_code)
+{
+ struct mm_struct *mm = current->mm;
+ u32 pkey;
+
+ if (vma)
+ pkey = vma_pkey(vma);
+
+ /*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+ up_read(&mm->mmap_sem);
+
+ __bad_area_nosemaphore(regs, error_code, address,
+ (vma) ? &pkey : NULL, si_code);
+}
+
+static noinline void
+bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+{
+ __bad_area(regs, error_code, address, NULL, SEGV_MAPERR);
+}
+
+static inline bool bad_area_access_from_pkeys(unsigned long error_code,
+ struct vm_area_struct *vma)
+{
+ /* This code is always called on the current mm */
+ bool foreign = false;
+
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return false;
+ if (error_code & X86_PF_PK)
+ return true;
+ /* this checks permission keys on the VMA: */
+ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
+ (error_code & X86_PF_INSTR), foreign))
+ return true;
+ return false;
+}
+
+static noinline void
+bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, struct vm_area_struct *vma)
+{
+ /*
+ * This OSPKE check is not strictly necessary at runtime.
+ * But, doing it this way allows compiler optimizations
+ * if pkeys are compiled out.
+ */
+ if (bad_area_access_from_pkeys(error_code, vma))
+ __bad_area(regs, error_code, address, vma, SEGV_PKUERR);
+ else
+ __bad_area(regs, error_code, address, vma, SEGV_ACCERR);
+}
+
+static void
+do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
+ u32 *pkey, unsigned int fault)
+{
+ struct task_struct *tsk = current;
+ int code = BUS_ADRERR;
+
+ /* Kernel mode? Handle exceptions or die: */
+ if (!(error_code & X86_PF_USER)) {
+ no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+ return;
+ }
+
+ /* User-space => ok to do another page fault: */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+ tsk->thread.cr2 = address;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_PF;
+
+#ifdef CONFIG_MEMORY_FAILURE
+ if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
+ printk(KERN_ERR
+ "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
+ tsk->comm, tsk->pid, address);
+ code = BUS_MCEERR_AR;
+ }
+#endif
+ force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault);
+}
+
+static noinline void
+mm_fault_error(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, u32 *pkey, vm_fault_t fault)
+{
+ if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
+ no_context(regs, error_code, address, 0, 0);
+ return;
+ }
+
+ if (fault & VM_FAULT_OOM) {
+ /* Kernel mode? Handle exceptions or die: */
+ if (!(error_code & X86_PF_USER)) {
+ no_context(regs, error_code, address,
+ SIGSEGV, SEGV_MAPERR);
+ return;
+ }
+
+ /*
+ * We ran out of memory, call the OOM killer, and return the
+ * userspace (which will retry the fault, or kill us if we got
+ * oom-killed):
+ */
+ pagefault_out_of_memory();
+ } else {
+ if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+ VM_FAULT_HWPOISON_LARGE))
+ do_sigbus(regs, error_code, address, pkey, fault);
+ else if (fault & VM_FAULT_SIGSEGV)
+ bad_area_nosemaphore(regs, error_code, address, pkey);
+ else
+ BUG();
+ }
+}
+
+static int spurious_fault_check(unsigned long error_code, pte_t *pte)
+{
+ if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
+ return 0;
+
+ if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
+ return 0;
+ /*
+ * Note: We do not do lazy flushing on protection key
+ * changes, so no spurious fault will ever set X86_PF_PK.
+ */
+ if ((error_code & X86_PF_PK))
+ return 1;
+
+ return 1;
+}
+
+/*
+ * Handle a spurious fault caused by a stale TLB entry.
+ *
+ * This allows us to lazily refresh the TLB when increasing the
+ * permissions of a kernel page (RO -> RW or NX -> X). Doing it
+ * eagerly is very expensive since that implies doing a full
+ * cross-processor TLB flush, even if no stale TLB entries exist
+ * on other processors.
+ *
+ * Spurious faults may only occur if the TLB contains an entry with
+ * fewer permission than the page table entry. Non-present (P = 0)
+ * and reserved bit (R = 1) faults are never spurious.
+ *
+ * There are no security implications to leaving a stale TLB when
+ * increasing the permissions on a page.
+ *
+ * Returns non-zero if a spurious fault was handled, zero otherwise.
+ *
+ * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
+ * (Optional Invalidation).
+ */
+static noinline int
+spurious_fault(unsigned long error_code, unsigned long address)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ int ret;
+
+ /*
+ * Only writes to RO or instruction fetches from NX may cause
+ * spurious faults.
+ *
+ * These could be from user or supervisor accesses but the TLB
+ * is only lazily flushed after a kernel mapping protection
+ * change, so user accesses are not expected to cause spurious
+ * faults.
+ */
+ if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
+ error_code != (X86_PF_INSTR | X86_PF_PROT))
+ return 0;
+
+ pgd = init_mm.pgd + pgd_index(address);
+ if (!pgd_present(*pgd))
+ return 0;
+
+ p4d = p4d_offset(pgd, address);
+ if (!p4d_present(*p4d))
+ return 0;
+
+ if (p4d_large(*p4d))
+ return spurious_fault_check(error_code, (pte_t *) p4d);
+
+ pud = pud_offset(p4d, address);
+ if (!pud_present(*pud))
+ return 0;
+
+ if (pud_large(*pud))
+ return spurious_fault_check(error_code, (pte_t *) pud);
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return 0;
+
+ if (pmd_large(*pmd))
+ return spurious_fault_check(error_code, (pte_t *) pmd);
+
+ pte = pte_offset_kernel(pmd, address);
+ if (!pte_present(*pte))
+ return 0;
+
+ ret = spurious_fault_check(error_code, pte);
+ if (!ret)
+ return 0;
+
+ /*
+ * Make sure we have permissions in PMD.
+ * If not, then there's a bug in the page tables:
+ */
+ ret = spurious_fault_check(error_code, (pte_t *) pmd);
+ WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
+
+ return ret;
+}
+NOKPROBE_SYMBOL(spurious_fault);
+
+int show_unhandled_signals = 1;
+
+static inline int
+access_error(unsigned long error_code, struct vm_area_struct *vma)
+{
+ /* This is only called for the current mm, so: */
+ bool foreign = false;
+
+ /*
+ * Read or write was blocked by protection keys. This is
+ * always an unconditional error and can never result in
+ * a follow-up action to resolve the fault, like a COW.
+ */
+ if (error_code & X86_PF_PK)
+ return 1;
+
+ /*
+ * Make sure to check the VMA so that we do not perform
+ * faults just to hit a X86_PF_PK as soon as we fill in a
+ * page.
+ */
+ if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
+ (error_code & X86_PF_INSTR), foreign))
+ return 1;
+
+ if (error_code & X86_PF_WRITE) {
+ /* write, present and write, not present: */
+ if (unlikely(!(vma->vm_flags & VM_WRITE)))
+ return 1;
+ return 0;
+ }
+
+ /* read, present: */
+ if (unlikely(error_code & X86_PF_PROT))
+ return 1;
+
+ /* read, not present: */
+ if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+ return 1;
+
+ return 0;
+}
+
+static int fault_in_kernel_space(unsigned long address)
+{
+ return address >= TASK_SIZE_MAX;
+}
+
+static inline bool smap_violation(int error_code, struct pt_regs *regs)
+{
+ if (!IS_ENABLED(CONFIG_X86_SMAP))
+ return false;
+
+ if (!static_cpu_has(X86_FEATURE_SMAP))
+ return false;
+
+ if (error_code & X86_PF_USER)
+ return false;
+
+ if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
+ return false;
+
+ return true;
+}
+
+/*
+ * This routine handles page faults. It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ */
+static noinline void
+__do_page_fault(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
+{
+ struct vm_area_struct *vma;
+ struct task_struct *tsk;
+ struct mm_struct *mm;
+ vm_fault_t fault, major = 0;
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ u32 pkey;
+
+ tsk = current;
+ mm = tsk->mm;
+
+ prefetchw(&mm->mmap_sem);
+
+ if (unlikely(kmmio_fault(regs, address)))
+ return;
+
+ /*
+ * We fault-in kernel-space virtual memory on-demand. The
+ * 'reference' page table is init_mm.pgd.
+ *
+ * NOTE! We MUST NOT take any locks for this case. We may
+ * be in an interrupt or a critical region, and should
+ * only copy the information from the master page table,
+ * nothing more.
+ *
+ * This verifies that the fault happens in kernel space
+ * (error_code & 4) == 0, and that the fault was not a
+ * protection error (error_code & 9) == 0.
+ */
+ if (unlikely(fault_in_kernel_space(address))) {
+ if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
+ if (vmalloc_fault(address) >= 0)
+ return;
+ }
+
+ /* Can handle a stale RO->RW TLB: */
+ if (spurious_fault(error_code, address))
+ return;
+
+ /* kprobes don't want to hook the spurious faults: */
+ if (kprobes_fault(regs))
+ return;
+ /*
+ * Don't take the mm semaphore here. If we fixup a prefetch
+ * fault we could otherwise deadlock:
+ */
+ bad_area_nosemaphore(regs, error_code, address, NULL);
+
+ return;
+ }
+
+ /* kprobes don't want to hook the spurious faults: */
+ if (unlikely(kprobes_fault(regs)))
+ return;
+
+ if (unlikely(error_code & X86_PF_RSVD))
+ pgtable_bad(regs, error_code, address);
+
+ if (unlikely(smap_violation(error_code, regs))) {
+ bad_area_nosemaphore(regs, error_code, address, NULL);
+ return;
+ }
+
+ /*
+ * If we're in an interrupt, have no user context or are running
+ * in a region with pagefaults disabled then we must not take the fault
+ */
+ if (unlikely(faulthandler_disabled() || !mm)) {
+ bad_area_nosemaphore(regs, error_code, address, NULL);
+ return;
+ }
+
+ /*
+ * It's safe to allow irq's after cr2 has been saved and the
+ * vmalloc fault has been handled.
+ *
+ * User-mode registers count as a user access even for any
+ * potential system fault or CPU buglet:
+ */
+ if (user_mode(regs)) {
+ local_irq_enable();
+ error_code |= X86_PF_USER;
+ flags |= FAULT_FLAG_USER;
+ } else {
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_enable();
+ }
+
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
+ if (error_code & X86_PF_WRITE)
+ flags |= FAULT_FLAG_WRITE;
+ if (error_code & X86_PF_INSTR)
+ flags |= FAULT_FLAG_INSTRUCTION;
+
+ /*
+ * When running in the kernel we expect faults to occur only to
+ * addresses in user space. All other faults represent errors in
+ * the kernel and should generate an OOPS. Unfortunately, in the
+ * case of an erroneous fault occurring in a code path which already
+ * holds mmap_sem we will deadlock attempting to validate the fault
+ * against the address space. Luckily the kernel only validly
+ * references user space from well defined areas of code, which are
+ * listed in the exceptions table.
+ *
+ * As the vast majority of faults will be valid we will only perform
+ * the source reference check when there is a possibility of a
+ * deadlock. Attempt to lock the address space, if we cannot we then
+ * validate the source. If this is invalid we can skip the address
+ * space check, thus avoiding the deadlock:
+ */
+ if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+ if (!(error_code & X86_PF_USER) &&
+ !search_exception_tables(regs->ip)) {
+ bad_area_nosemaphore(regs, error_code, address, NULL);
+ return;
+ }
+retry:
+ down_read(&mm->mmap_sem);
+ } else {
+ /*
+ * The above down_read_trylock() might have succeeded in
+ * which case we'll have missed the might_sleep() from
+ * down_read():
+ */
+ might_sleep();
+ }
+
+ vma = find_vma(mm, address);
+ if (unlikely(!vma)) {
+ bad_area(regs, error_code, address);
+ return;
+ }
+ if (likely(vma->vm_start <= address))
+ goto good_area;
+ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+ bad_area(regs, error_code, address);
+ return;
+ }
+ if (error_code & X86_PF_USER) {
+ /*
+ * Accessing the stack below %sp is always a bug.
+ * The large cushion allows instructions like enter
+ * and pusha to work. ("enter $65535, $31" pushes
+ * 32 pointers and then decrements %sp by 65535.)
+ */
+ if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
+ bad_area(regs, error_code, address);
+ return;
+ }
+ }
+ if (unlikely(expand_stack(vma, address))) {
+ bad_area(regs, error_code, address);
+ return;
+ }
+
+ /*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+good_area:
+ if (unlikely(access_error(error_code, vma))) {
+ bad_area_access_error(regs, error_code, address, vma);
+ return;
+ }
+
+ /*
+ * If for any reason at all we couldn't handle the fault,
+ * make sure we exit gracefully rather than endlessly redo
+ * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
+ * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
+ *
+ * Note that handle_userfault() may also release and reacquire mmap_sem
+ * (and not return with VM_FAULT_RETRY), when returning to userland to
+ * repeat the page fault later with a VM_FAULT_NOPAGE retval
+ * (potentially after handling any pending signal during the return to
+ * userland). The return to userland is identified whenever
+ * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
+ * Thus we have to be careful about not touching vma after handling the
+ * fault, so we read the pkey beforehand.
+ */
+ pkey = vma_pkey(vma);
+ fault = handle_mm_fault(vma, address, flags);
+ major |= fault & VM_FAULT_MAJOR;
+
+ /*
+ * If we need to retry the mmap_sem has already been released,
+ * and if there is a fatal signal pending there is no guarantee
+ * that we made any progress. Handle this case first.
+ */
+ if (unlikely(fault & VM_FAULT_RETRY)) {
+ /* Retry at most once */
+ if (flags & FAULT_FLAG_ALLOW_RETRY) {
+ flags &= ~FAULT_FLAG_ALLOW_RETRY;
+ flags |= FAULT_FLAG_TRIED;
+ if (!fatal_signal_pending(tsk))
+ goto retry;
+ }
+
+ /* User mode? Just return to handle the fatal exception */
+ if (flags & FAULT_FLAG_USER)
+ return;
+
+ /* Not returning to user mode? Handle exceptions or die: */
+ no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+ return;
+ }
+
+ up_read(&mm->mmap_sem);
+ if (unlikely(fault & VM_FAULT_ERROR)) {
+ mm_fault_error(regs, error_code, address, &pkey, fault);
+ return;
+ }
+
+ /*
+ * Major/minor page fault accounting. If any of the events
+ * returned VM_FAULT_MAJOR, we account it as a major fault.
+ */
+ if (major) {
+ tsk->maj_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+ } else {
+ tsk->min_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
+ }
+
+ check_v8086_mode(regs, address, tsk);
+}
+NOKPROBE_SYMBOL(__do_page_fault);
+
+static nokprobe_inline void
+trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
+ unsigned long error_code)
+{
+ if (user_mode(regs))
+ trace_page_fault_user(address, regs, error_code);
+ else
+ trace_page_fault_kernel(address, regs, error_code);
+}
+
+/*
+ * We must have this function blacklisted from kprobes, tagged with notrace
+ * and call read_cr2() before calling anything else. To avoid calling any
+ * kind of tracing machinery before we've observed the CR2 value.
+ *
+ * exception_{enter,exit}() contains all sorts of tracepoints.
+ */
+dotraplinkage void notrace
+do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ unsigned long address = read_cr2(); /* Get the faulting address */
+ enum ctx_state prev_state;
+
+ prev_state = exception_enter();
+ if (trace_pagefault_enabled())
+ trace_page_fault_entries(address, regs, error_code);
+
+ __do_page_fault(regs, error_code, address);
+ exception_exit(prev_state);
+}
+NOKPROBE_SYMBOL(do_page_fault);
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
new file mode 100644
index 000000000..6d18b70ed
--- /dev/null
+++ b/arch/x86/mm/highmem_32.c
@@ -0,0 +1,133 @@
+#include <linux/highmem.h>
+#include <linux/export.h>
+#include <linux/swap.h> /* for totalram_pages */
+#include <linux/bootmem.h>
+
+void *kmap(struct page *page)
+{
+ might_sleep();
+ if (!PageHighMem(page))
+ return page_address(page);
+ return kmap_high(page);
+}
+EXPORT_SYMBOL(kmap);
+
+void kunmap(struct page *page)
+{
+ if (in_interrupt())
+ BUG();
+ if (!PageHighMem(page))
+ return;
+ kunmap_high(page);
+}
+EXPORT_SYMBOL(kunmap);
+
+/*
+ * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
+ * no global lock is needed and because the kmap code must perform a global TLB
+ * invalidation when the kmap pool wraps.
+ *
+ * However when holding an atomic kmap it is not legal to sleep, so atomic
+ * kmaps are appropriate for short, tight code paths only.
+ */
+void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+{
+ unsigned long vaddr;
+ int idx, type;
+
+ preempt_disable();
+ pagefault_disable();
+
+ if (!PageHighMem(page))
+ return page_address(page);
+
+ type = kmap_atomic_idx_push();
+ idx = type + KM_TYPE_NR*smp_processor_id();
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ BUG_ON(!pte_none(*(kmap_pte-idx)));
+ set_pte(kmap_pte-idx, mk_pte(page, prot));
+ arch_flush_lazy_mmu_mode();
+
+ return (void *)vaddr;
+}
+EXPORT_SYMBOL(kmap_atomic_prot);
+
+void *kmap_atomic(struct page *page)
+{
+ return kmap_atomic_prot(page, kmap_prot);
+}
+EXPORT_SYMBOL(kmap_atomic);
+
+/*
+ * This is the same as kmap_atomic() but can map memory that doesn't
+ * have a struct page associated with it.
+ */
+void *kmap_atomic_pfn(unsigned long pfn)
+{
+ return kmap_atomic_prot_pfn(pfn, kmap_prot);
+}
+EXPORT_SYMBOL_GPL(kmap_atomic_pfn);
+
+void __kunmap_atomic(void *kvaddr)
+{
+ unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
+
+ if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
+ vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
+ int idx, type;
+
+ type = kmap_atomic_idx();
+ idx = type + KM_TYPE_NR * smp_processor_id();
+
+#ifdef CONFIG_DEBUG_HIGHMEM
+ WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
+#endif
+ /*
+ * Force other mappings to Oops if they'll try to access this
+ * pte without first remap it. Keeping stale mappings around
+ * is a bad idea also, in case the page changes cacheability
+ * attributes or becomes a protected page in a hypervisor.
+ */
+ kpte_clear_flush(kmap_pte-idx, vaddr);
+ kmap_atomic_idx_pop();
+ arch_flush_lazy_mmu_mode();
+ }
+#ifdef CONFIG_DEBUG_HIGHMEM
+ else {
+ BUG_ON(vaddr < PAGE_OFFSET);
+ BUG_ON(vaddr >= (unsigned long)high_memory);
+ }
+#endif
+
+ pagefault_enable();
+ preempt_enable();
+}
+EXPORT_SYMBOL(__kunmap_atomic);
+
+void __init set_highmem_pages_init(void)
+{
+ struct zone *zone;
+ int nid;
+
+ /*
+ * Explicitly reset zone->managed_pages because set_highmem_pages_init()
+ * is invoked before free_all_bootmem()
+ */
+ reset_all_zones_managed_pages();
+ for_each_zone(zone) {
+ unsigned long zone_start_pfn, zone_end_pfn;
+
+ if (!is_highmem(zone))
+ continue;
+
+ zone_start_pfn = zone->zone_start_pfn;
+ zone_end_pfn = zone_start_pfn + zone->spanned_pages;
+
+ nid = zone_to_nid(zone);
+ printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
+ zone->name, nid, zone_start_pfn, zone_end_pfn);
+
+ add_highpages_with_active_regions(nid, zone_start_pfn,
+ zone_end_pfn);
+ }
+}
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
new file mode 100644
index 000000000..00b296617
--- /dev/null
+++ b/arch/x86/mm/hugetlbpage.c
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * IA-32 Huge TLB Page Support for Kernel.
+ *
+ * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/sched/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/err.h>
+#include <linux/sysctl.h>
+#include <linux/compat.h>
+#include <asm/mman.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
+#include <asm/elf.h>
+#include <asm/mpx.h>
+
+#if 0 /* This is just for testing */
+struct page *
+follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
+{
+ unsigned long start = address;
+ int length = 1;
+ int nr;
+ struct page *page;
+ struct vm_area_struct *vma;
+
+ vma = find_vma(mm, addr);
+ if (!vma || !is_vm_hugetlb_page(vma))
+ return ERR_PTR(-EINVAL);
+
+ pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
+
+ /* hugetlb should be locked, and hence, prefaulted */
+ WARN_ON(!pte || pte_none(*pte));
+
+ page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+
+ WARN_ON(!PageHead(page));
+
+ return page;
+}
+
+int pmd_huge(pmd_t pmd)
+{
+ return 0;
+}
+
+int pud_huge(pud_t pud)
+{
+ return 0;
+}
+
+#else
+
+/*
+ * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal
+ * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry.
+ * Otherwise, returns 0.
+ */
+int pmd_huge(pmd_t pmd)
+{
+ return !pmd_none(pmd) &&
+ (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
+}
+
+int pud_huge(pud_t pud)
+{
+ return !!(pud_val(pud) & _PAGE_PSE);
+}
+#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
+static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ struct hstate *h = hstate_file(file);
+ struct vm_unmapped_area_info info;
+
+ info.flags = 0;
+ info.length = len;
+ info.low_limit = get_mmap_base(1);
+
+ /*
+ * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+ * in the full address space.
+ */
+ info.high_limit = in_compat_syscall() ?
+ task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW);
+
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ return vm_unmapped_area(&info);
+}
+
+static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ struct hstate *h = hstate_file(file);
+ struct vm_unmapped_area_info info;
+
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = PAGE_SIZE;
+ info.high_limit = get_mmap_base(0);
+
+ /*
+ * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+ * in the full address space.
+ */
+ if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
+ info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
+
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ addr = vm_unmapped_area(&info);
+
+ /*
+ * A failed mmap() very likely causes application failure,
+ * so fall back to the bottom-up function here. This scenario
+ * can happen with large stack limits and large mmap()
+ * allocations.
+ */
+ if (addr & ~PAGE_MASK) {
+ VM_BUG_ON(addr != -ENOMEM);
+ info.flags = 0;
+ info.low_limit = TASK_UNMAPPED_BASE;
+ info.high_limit = TASK_SIZE_LOW;
+ addr = vm_unmapped_area(&info);
+ }
+
+ return addr;
+}
+
+unsigned long
+hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ struct hstate *h = hstate_file(file);
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+
+ if (len & ~huge_page_mask(h))
+ return -EINVAL;
+
+ addr = mpx_unmapped_area_check(addr, len, flags);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
+ if (len > TASK_SIZE)
+ return -ENOMEM;
+
+ /* No address checking. See comment at mmap_address_hint_valid() */
+ if (flags & MAP_FIXED) {
+ if (prepare_hugepage_range(file, addr, len))
+ return -EINVAL;
+ return addr;
+ }
+
+ if (addr) {
+ addr &= huge_page_mask(h);
+ if (!mmap_address_hint_valid(addr, len))
+ goto get_unmapped_area;
+
+ vma = find_vma(mm, addr);
+ if (!vma || addr + len <= vm_start_gap(vma))
+ return addr;
+ }
+
+get_unmapped_area:
+ if (mm->get_unmapped_area == arch_get_unmapped_area)
+ return hugetlb_get_unmapped_area_bottomup(file, addr, len,
+ pgoff, flags);
+ else
+ return hugetlb_get_unmapped_area_topdown(file, addr, len,
+ pgoff, flags);
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+#ifdef CONFIG_X86_64
+static __init int setup_hugepagesz(char *opt)
+{
+ unsigned long ps = memparse(opt, &opt);
+ if (ps == PMD_SIZE) {
+ hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
+ } else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) {
+ hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+ } else {
+ hugetlb_bad_size();
+ printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
+ ps >> 20);
+ return 0;
+ }
+ return 1;
+}
+__setup("hugepagesz=", setup_hugepagesz);
+
+#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
+static __init int gigantic_pages_init(void)
+{
+ /* With compaction or CMA we can allocate gigantic pages at runtime */
+ if (boot_cpu_has(X86_FEATURE_GBPAGES) && !size_to_hstate(1UL << PUD_SHIFT))
+ hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+ return 0;
+}
+arch_initcall(gigantic_pages_init);
+#endif
+#endif
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
new file mode 100644
index 000000000..968d7005f
--- /dev/null
+++ b/arch/x86/mm/ident_map.c
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Helper routines for building identity mapping page tables. This is
+ * included by both the compressed kernel and the regular kernel.
+ */
+
+static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page,
+ unsigned long addr, unsigned long end)
+{
+ addr &= PMD_MASK;
+ for (; addr < end; addr += PMD_SIZE) {
+ pmd_t *pmd = pmd_page + pmd_index(addr);
+
+ if (pmd_present(*pmd))
+ continue;
+
+ set_pmd(pmd, __pmd((addr - info->offset) | info->page_flag));
+ }
+}
+
+static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next) {
+ pud_t *pud = pud_page + pud_index(addr);
+ pmd_t *pmd;
+
+ next = (addr & PUD_MASK) + PUD_SIZE;
+ if (next > end)
+ next = end;
+
+ if (info->direct_gbpages) {
+ pud_t pudval;
+
+ if (pud_present(*pud))
+ continue;
+
+ addr &= PUD_MASK;
+ pudval = __pud((addr - info->offset) | info->page_flag);
+ set_pud(pud, pudval);
+ continue;
+ }
+
+ if (pud_present(*pud)) {
+ pmd = pmd_offset(pud, 0);
+ ident_pmd_init(info, pmd, addr, next);
+ continue;
+ }
+ pmd = (pmd_t *)info->alloc_pgt_page(info->context);
+ if (!pmd)
+ return -ENOMEM;
+ ident_pmd_init(info, pmd, addr, next);
+ set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag));
+ }
+
+ return 0;
+}
+
+static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+ int result;
+
+ for (; addr < end; addr = next) {
+ p4d_t *p4d = p4d_page + p4d_index(addr);
+ pud_t *pud;
+
+ next = (addr & P4D_MASK) + P4D_SIZE;
+ if (next > end)
+ next = end;
+
+ if (p4d_present(*p4d)) {
+ pud = pud_offset(p4d, 0);
+ result = ident_pud_init(info, pud, addr, next);
+ if (result)
+ return result;
+
+ continue;
+ }
+ pud = (pud_t *)info->alloc_pgt_page(info->context);
+ if (!pud)
+ return -ENOMEM;
+
+ result = ident_pud_init(info, pud, addr, next);
+ if (result)
+ return result;
+
+ set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag));
+ }
+
+ return 0;
+}
+
+int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
+ unsigned long pstart, unsigned long pend)
+{
+ unsigned long addr = pstart + info->offset;
+ unsigned long end = pend + info->offset;
+ unsigned long next;
+ int result;
+
+ /* Set the default pagetable flags if not supplied */
+ if (!info->kernpg_flag)
+ info->kernpg_flag = _KERNPG_TABLE;
+
+ /* Filter out unsupported __PAGE_KERNEL_* bits: */
+ info->kernpg_flag &= __default_kernel_pte_mask;
+
+ for (; addr < end; addr = next) {
+ pgd_t *pgd = pgd_page + pgd_index(addr);
+ p4d_t *p4d;
+
+ next = (addr & PGDIR_MASK) + PGDIR_SIZE;
+ if (next > end)
+ next = end;
+
+ if (pgd_present(*pgd)) {
+ p4d = p4d_offset(pgd, 0);
+ result = ident_p4d_init(info, p4d, addr, next);
+ if (result)
+ return result;
+ continue;
+ }
+
+ p4d = (p4d_t *)info->alloc_pgt_page(info->context);
+ if (!p4d)
+ return -ENOMEM;
+ result = ident_p4d_init(info, p4d, addr, next);
+ if (result)
+ return result;
+ if (pgtable_l5_enabled()) {
+ set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag));
+ } else {
+ /*
+ * With p4d folded, pgd is equal to p4d.
+ * The pgd entry has to point to the pud page table in this case.
+ */
+ pud_t *pud = pud_offset(p4d, 0);
+ set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag));
+ }
+ }
+
+ return 0;
+}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
new file mode 100644
index 000000000..b1dba0987
--- /dev/null
+++ b/arch/x86/mm/init.c
@@ -0,0 +1,953 @@
+#include <linux/gfp.h>
+#include <linux/initrd.h>
+#include <linux/ioport.h>
+#include <linux/swap.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h> /* for max_low_pfn */
+#include <linux/swapfile.h>
+#include <linux/swapops.h>
+#include <linux/kmemleak.h>
+
+#include <asm/set_memory.h>
+#include <asm/e820/api.h>
+#include <asm/init.h>
+#include <asm/page.h>
+#include <asm/page_types.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/proto.h>
+#include <asm/dma.h> /* for MAX_DMA_PFN */
+#include <asm/microcode.h>
+#include <asm/kaslr.h>
+#include <asm/hypervisor.h>
+#include <asm/cpufeature.h>
+#include <asm/pti.h>
+
+/*
+ * We need to define the tracepoints somewhere, and tlb.c
+ * is only compied when SMP=y.
+ */
+#define CREATE_TRACE_POINTS
+#include <trace/events/tlb.h>
+
+#include "mm_internal.h"
+
+/*
+ * Tables translating between page_cache_type_t and pte encoding.
+ *
+ * The default values are defined statically as minimal supported mode;
+ * WC and WT fall back to UC-. pat_init() updates these values to support
+ * more cache modes, WC and WT, when it is safe to do so. See pat_init()
+ * for the details. Note, __early_ioremap() used during early boot-time
+ * takes pgprot_t (pte encoding) and does not use these tables.
+ *
+ * Index into __cachemode2pte_tbl[] is the cachemode.
+ *
+ * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
+ * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
+ */
+uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
+ [_PAGE_CACHE_MODE_WB ] = 0 | 0 ,
+ [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD,
+ [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD,
+ [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD,
+ [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD,
+ [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD,
+};
+EXPORT_SYMBOL(__cachemode2pte_tbl);
+
+uint8_t __pte2cachemode_tbl[8] = {
+ [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB,
+ [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS,
+ [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS,
+ [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC,
+ [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
+ [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
+ [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
+ [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
+};
+EXPORT_SYMBOL(__pte2cachemode_tbl);
+
+static unsigned long __initdata pgt_buf_start;
+static unsigned long __initdata pgt_buf_end;
+static unsigned long __initdata pgt_buf_top;
+
+static unsigned long min_pfn_mapped;
+
+static bool __initdata can_use_brk_pgt = true;
+
+/*
+ * Pages returned are already directly mapped.
+ *
+ * Changing that is likely to break Xen, see commit:
+ *
+ * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
+ *
+ * for detailed information.
+ */
+__ref void *alloc_low_pages(unsigned int num)
+{
+ unsigned long pfn;
+ int i;
+
+ if (after_bootmem) {
+ unsigned int order;
+
+ order = get_order((unsigned long)num << PAGE_SHIFT);
+ return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
+ }
+
+ if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
+ unsigned long ret = 0;
+
+ if (min_pfn_mapped < max_pfn_mapped) {
+ ret = memblock_find_in_range(
+ min_pfn_mapped << PAGE_SHIFT,
+ max_pfn_mapped << PAGE_SHIFT,
+ PAGE_SIZE * num , PAGE_SIZE);
+ }
+ if (ret)
+ memblock_reserve(ret, PAGE_SIZE * num);
+ else if (can_use_brk_pgt)
+ ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE));
+
+ if (!ret)
+ panic("alloc_low_pages: can not alloc memory");
+
+ pfn = ret >> PAGE_SHIFT;
+ } else {
+ pfn = pgt_buf_end;
+ pgt_buf_end += num;
+ }
+
+ for (i = 0; i < num; i++) {
+ void *adr;
+
+ adr = __va((pfn + i) << PAGE_SHIFT);
+ clear_page(adr);
+ }
+
+ return __va(pfn << PAGE_SHIFT);
+}
+
+/*
+ * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS.
+ * With KASLR memory randomization, depending on the machine e820 memory
+ * and the PUD alignment. We may need twice more pages when KASLR memory
+ * randomization is enabled.
+ */
+#ifndef CONFIG_RANDOMIZE_MEMORY
+#define INIT_PGD_PAGE_COUNT 6
+#else
+#define INIT_PGD_PAGE_COUNT 12
+#endif
+#define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE)
+RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
+void __init early_alloc_pgt_buf(void)
+{
+ unsigned long tables = INIT_PGT_BUF_SIZE;
+ phys_addr_t base;
+
+ base = __pa(extend_brk(tables, PAGE_SIZE));
+
+ pgt_buf_start = base >> PAGE_SHIFT;
+ pgt_buf_end = pgt_buf_start;
+ pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
+}
+
+int after_bootmem;
+
+early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES);
+
+struct map_range {
+ unsigned long start;
+ unsigned long end;
+ unsigned page_size_mask;
+};
+
+static int page_size_mask;
+
+static void __init probe_page_size_mask(void)
+{
+ /*
+ * For pagealloc debugging, identity mapping will use small pages.
+ * This will simplify cpa(), which otherwise needs to support splitting
+ * large pages into small in interrupt context, etc.
+ */
+ if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled())
+ page_size_mask |= 1 << PG_LEVEL_2M;
+ else
+ direct_gbpages = 0;
+
+ /* Enable PSE if available */
+ if (boot_cpu_has(X86_FEATURE_PSE))
+ cr4_set_bits_and_update_boot(X86_CR4_PSE);
+
+ /* Enable PGE if available */
+ __supported_pte_mask &= ~_PAGE_GLOBAL;
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ cr4_set_bits_and_update_boot(X86_CR4_PGE);
+ __supported_pte_mask |= _PAGE_GLOBAL;
+ }
+
+ /* By the default is everything supported: */
+ __default_kernel_pte_mask = __supported_pte_mask;
+ /* Except when with PTI where the kernel is mostly non-Global: */
+ if (cpu_feature_enabled(X86_FEATURE_PTI))
+ __default_kernel_pte_mask &= ~_PAGE_GLOBAL;
+
+ /* Enable 1 GB linear kernel mappings if available: */
+ if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
+ printk(KERN_INFO "Using GB pages for direct mapping\n");
+ page_size_mask |= 1 << PG_LEVEL_1G;
+ } else {
+ direct_gbpages = 0;
+ }
+}
+
+static void setup_pcid(void)
+{
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ if (!boot_cpu_has(X86_FEATURE_PCID))
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ /*
+ * This can't be cr4_set_bits_and_update_boot() -- the
+ * trampoline code can't handle CR4.PCIDE and it wouldn't
+ * do any good anyway. Despite the name,
+ * cr4_set_bits_and_update_boot() doesn't actually cause
+ * the bits in question to remain set all the way through
+ * the secondary boot asm.
+ *
+ * Instead, we brute-force it and set CR4.PCIDE manually in
+ * start_secondary().
+ */
+ cr4_set_bits(X86_CR4_PCIDE);
+
+ /*
+ * INVPCID's single-context modes (2/3) only work if we set
+ * X86_CR4_PCIDE, *and* we INVPCID support. It's unusable
+ * on systems that have X86_CR4_PCIDE clear, or that have
+ * no INVPCID support at all.
+ */
+ if (boot_cpu_has(X86_FEATURE_INVPCID))
+ setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
+ } else {
+ /*
+ * flush_tlb_all(), as currently implemented, won't work if
+ * PCID is on but PGE is not. Since that combination
+ * doesn't exist on real hardware, there's no reason to try
+ * to fully support it, but it's polite to avoid corrupting
+ * data if we're on an improperly configured VM.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+ }
+}
+
+#ifdef CONFIG_X86_32
+#define NR_RANGE_MR 3
+#else /* CONFIG_X86_64 */
+#define NR_RANGE_MR 5
+#endif
+
+static int __meminit save_mr(struct map_range *mr, int nr_range,
+ unsigned long start_pfn, unsigned long end_pfn,
+ unsigned long page_size_mask)
+{
+ if (start_pfn < end_pfn) {
+ if (nr_range >= NR_RANGE_MR)
+ panic("run out of range for init_memory_mapping\n");
+ mr[nr_range].start = start_pfn<<PAGE_SHIFT;
+ mr[nr_range].end = end_pfn<<PAGE_SHIFT;
+ mr[nr_range].page_size_mask = page_size_mask;
+ nr_range++;
+ }
+
+ return nr_range;
+}
+
+/*
+ * adjust the page_size_mask for small range to go with
+ * big page size instead small one if nearby are ram too.
+ */
+static void __ref adjust_range_page_size_mask(struct map_range *mr,
+ int nr_range)
+{
+ int i;
+
+ for (i = 0; i < nr_range; i++) {
+ if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
+ !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
+ unsigned long start = round_down(mr[i].start, PMD_SIZE);
+ unsigned long end = round_up(mr[i].end, PMD_SIZE);
+
+#ifdef CONFIG_X86_32
+ if ((end >> PAGE_SHIFT) > max_low_pfn)
+ continue;
+#endif
+
+ if (memblock_is_region_memory(start, end - start))
+ mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
+ }
+ if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
+ !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
+ unsigned long start = round_down(mr[i].start, PUD_SIZE);
+ unsigned long end = round_up(mr[i].end, PUD_SIZE);
+
+ if (memblock_is_region_memory(start, end - start))
+ mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
+ }
+ }
+}
+
+static const char *page_size_string(struct map_range *mr)
+{
+ static const char str_1g[] = "1G";
+ static const char str_2m[] = "2M";
+ static const char str_4m[] = "4M";
+ static const char str_4k[] = "4k";
+
+ if (mr->page_size_mask & (1<<PG_LEVEL_1G))
+ return str_1g;
+ /*
+ * 32-bit without PAE has a 4M large page size.
+ * PG_LEVEL_2M is misnamed, but we can at least
+ * print out the right size in the string.
+ */
+ if (IS_ENABLED(CONFIG_X86_32) &&
+ !IS_ENABLED(CONFIG_X86_PAE) &&
+ mr->page_size_mask & (1<<PG_LEVEL_2M))
+ return str_4m;
+
+ if (mr->page_size_mask & (1<<PG_LEVEL_2M))
+ return str_2m;
+
+ return str_4k;
+}
+
+static int __meminit split_mem_range(struct map_range *mr, int nr_range,
+ unsigned long start,
+ unsigned long end)
+{
+ unsigned long start_pfn, end_pfn, limit_pfn;
+ unsigned long pfn;
+ int i;
+
+ limit_pfn = PFN_DOWN(end);
+
+ /* head if not big page alignment ? */
+ pfn = start_pfn = PFN_DOWN(start);
+#ifdef CONFIG_X86_32
+ /*
+ * Don't use a large page for the first 2/4MB of memory
+ * because there are often fixed size MTRRs in there
+ * and overlapping MTRRs into large pages can cause
+ * slowdowns.
+ */
+ if (pfn == 0)
+ end_pfn = PFN_DOWN(PMD_SIZE);
+ else
+ end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
+#else /* CONFIG_X86_64 */
+ end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
+#endif
+ if (end_pfn > limit_pfn)
+ end_pfn = limit_pfn;
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+ pfn = end_pfn;
+ }
+
+ /* big page (2M) range */
+ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
+#ifdef CONFIG_X86_32
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
+#else /* CONFIG_X86_64 */
+ end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
+ if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
+#endif
+
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+ pfn = end_pfn;
+ }
+
+#ifdef CONFIG_X86_64
+ /* big page (1G) range */
+ start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask &
+ ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+ pfn = end_pfn;
+ }
+
+ /* tail is not big page (1G) alignment */
+ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
+ end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
+ if (start_pfn < end_pfn) {
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+ page_size_mask & (1<<PG_LEVEL_2M));
+ pfn = end_pfn;
+ }
+#endif
+
+ /* tail is not big page (2M) alignment */
+ start_pfn = pfn;
+ end_pfn = limit_pfn;
+ nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+ if (!after_bootmem)
+ adjust_range_page_size_mask(mr, nr_range);
+
+ /* try to merge same page size and continuous */
+ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+ unsigned long old_start;
+ if (mr[i].end != mr[i+1].start ||
+ mr[i].page_size_mask != mr[i+1].page_size_mask)
+ continue;
+ /* move it */
+ old_start = mr[i].start;
+ memmove(&mr[i], &mr[i+1],
+ (nr_range - 1 - i) * sizeof(struct map_range));
+ mr[i--].start = old_start;
+ nr_range--;
+ }
+
+ for (i = 0; i < nr_range; i++)
+ pr_debug(" [mem %#010lx-%#010lx] page %s\n",
+ mr[i].start, mr[i].end - 1,
+ page_size_string(&mr[i]));
+
+ return nr_range;
+}
+
+struct range pfn_mapped[E820_MAX_ENTRIES];
+int nr_pfn_mapped;
+
+static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+ nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_MAX_ENTRIES,
+ nr_pfn_mapped, start_pfn, end_pfn);
+ nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_MAX_ENTRIES);
+
+ max_pfn_mapped = max(max_pfn_mapped, end_pfn);
+
+ if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
+ max_low_pfn_mapped = max(max_low_pfn_mapped,
+ min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
+}
+
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+ int i;
+
+ for (i = 0; i < nr_pfn_mapped; i++)
+ if ((start_pfn >= pfn_mapped[i].start) &&
+ (end_pfn <= pfn_mapped[i].end))
+ return true;
+
+ return false;
+}
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __ref init_memory_mapping(unsigned long start,
+ unsigned long end)
+{
+ struct map_range mr[NR_RANGE_MR];
+ unsigned long ret = 0;
+ int nr_range, i;
+
+ pr_debug("init_memory_mapping: [mem %#010lx-%#010lx]\n",
+ start, end - 1);
+
+ memset(mr, 0, sizeof(mr));
+ nr_range = split_mem_range(mr, 0, start, end);
+
+ for (i = 0; i < nr_range; i++)
+ ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
+ mr[i].page_size_mask);
+
+ add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
+
+ return ret >> PAGE_SHIFT;
+}
+
+/*
+ * We need to iterate through the E820 memory map and create direct mappings
+ * for only E820_TYPE_RAM and E820_KERN_RESERVED regions. We cannot simply
+ * create direct mappings for all pfns from [0 to max_low_pfn) and
+ * [4GB to max_pfn) because of possible memory holes in high addresses
+ * that cannot be marked as UC by fixed/variable range MTRRs.
+ * Depending on the alignment of E820 ranges, this may possibly result
+ * in using smaller size (i.e. 4K instead of 2M or 1G) page tables.
+ *
+ * init_mem_mapping() calls init_range_memory_mapping() with big range.
+ * That range would have hole in the middle or ends, and only ram parts
+ * will be mapped in init_range_memory_mapping().
+ */
+static unsigned long __init init_range_memory_mapping(
+ unsigned long r_start,
+ unsigned long r_end)
+{
+ unsigned long start_pfn, end_pfn;
+ unsigned long mapped_ram_size = 0;
+ int i;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+ u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
+ u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
+ if (start >= end)
+ continue;
+
+ /*
+ * if it is overlapping with brk pgt, we need to
+ * alloc pgt buf from memblock instead.
+ */
+ can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
+ min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
+ init_memory_mapping(start, end);
+ mapped_ram_size += end - start;
+ can_use_brk_pgt = true;
+ }
+
+ return mapped_ram_size;
+}
+
+static unsigned long __init get_new_step_size(unsigned long step_size)
+{
+ /*
+ * Initial mapped size is PMD_SIZE (2M).
+ * We can not set step_size to be PUD_SIZE (1G) yet.
+ * In worse case, when we cross the 1G boundary, and
+ * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k)
+ * to map 1G range with PTE. Hence we use one less than the
+ * difference of page table level shifts.
+ *
+ * Don't need to worry about overflow in the top-down case, on 32bit,
+ * when step_size is 0, round_down() returns 0 for start, and that
+ * turns it into 0x100000000ULL.
+ * In the bottom-up case, round_up(x, 0) returns 0 though too, which
+ * needs to be taken into consideration by the code below.
+ */
+ return step_size << (PMD_SHIFT - PAGE_SHIFT - 1);
+}
+
+/**
+ * memory_map_top_down - Map [map_start, map_end) top down
+ * @map_start: start address of the target memory range
+ * @map_end: end address of the target memory range
+ *
+ * This function will setup direct mapping for memory range
+ * [map_start, map_end) in top-down. That said, the page tables
+ * will be allocated at the end of the memory, and we map the
+ * memory in top-down.
+ */
+static void __init memory_map_top_down(unsigned long map_start,
+ unsigned long map_end)
+{
+ unsigned long real_end, start, last_start;
+ unsigned long step_size;
+ unsigned long addr;
+ unsigned long mapped_ram_size = 0;
+
+ /* xen has big range in reserved near end of ram, skip it at first.*/
+ addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
+ real_end = addr + PMD_SIZE;
+
+ /* step_size need to be small so pgt_buf from BRK could cover it */
+ step_size = PMD_SIZE;
+ max_pfn_mapped = 0; /* will get exact value next */
+ min_pfn_mapped = real_end >> PAGE_SHIFT;
+ last_start = start = real_end;
+
+ /*
+ * We start from the top (end of memory) and go to the bottom.
+ * The memblock_find_in_range() gets us a block of RAM from the
+ * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
+ * for page table.
+ */
+ while (last_start > map_start) {
+ if (last_start > step_size) {
+ start = round_down(last_start - 1, step_size);
+ if (start < map_start)
+ start = map_start;
+ } else
+ start = map_start;
+ mapped_ram_size += init_range_memory_mapping(start,
+ last_start);
+ last_start = start;
+ min_pfn_mapped = last_start >> PAGE_SHIFT;
+ if (mapped_ram_size >= step_size)
+ step_size = get_new_step_size(step_size);
+ }
+
+ if (real_end < map_end)
+ init_range_memory_mapping(real_end, map_end);
+}
+
+/**
+ * memory_map_bottom_up - Map [map_start, map_end) bottom up
+ * @map_start: start address of the target memory range
+ * @map_end: end address of the target memory range
+ *
+ * This function will setup direct mapping for memory range
+ * [map_start, map_end) in bottom-up. Since we have limited the
+ * bottom-up allocation above the kernel, the page tables will
+ * be allocated just above the kernel and we map the memory
+ * in [map_start, map_end) in bottom-up.
+ */
+static void __init memory_map_bottom_up(unsigned long map_start,
+ unsigned long map_end)
+{
+ unsigned long next, start;
+ unsigned long mapped_ram_size = 0;
+ /* step_size need to be small so pgt_buf from BRK could cover it */
+ unsigned long step_size = PMD_SIZE;
+
+ start = map_start;
+ min_pfn_mapped = start >> PAGE_SHIFT;
+
+ /*
+ * We start from the bottom (@map_start) and go to the top (@map_end).
+ * The memblock_find_in_range() gets us a block of RAM from the
+ * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
+ * for page table.
+ */
+ while (start < map_end) {
+ if (step_size && map_end - start > step_size) {
+ next = round_up(start + 1, step_size);
+ if (next > map_end)
+ next = map_end;
+ } else {
+ next = map_end;
+ }
+
+ mapped_ram_size += init_range_memory_mapping(start, next);
+ start = next;
+
+ if (mapped_ram_size >= step_size)
+ step_size = get_new_step_size(step_size);
+ }
+}
+
+void __init init_mem_mapping(void)
+{
+ unsigned long end;
+
+ pti_check_boottime_disable();
+ probe_page_size_mask();
+ setup_pcid();
+
+#ifdef CONFIG_X86_64
+ end = max_pfn << PAGE_SHIFT;
+#else
+ end = max_low_pfn << PAGE_SHIFT;
+#endif
+
+ /* the ISA range is always mapped regardless of memory holes */
+ init_memory_mapping(0, ISA_END_ADDRESS);
+
+ /* Init the trampoline, possibly with KASLR memory offset */
+ init_trampoline();
+
+ /*
+ * If the allocation is in bottom-up direction, we setup direct mapping
+ * in bottom-up, otherwise we setup direct mapping in top-down.
+ */
+ if (memblock_bottom_up()) {
+ unsigned long kernel_end = __pa_symbol(_end);
+
+ /*
+ * we need two separate calls here. This is because we want to
+ * allocate page tables above the kernel. So we first map
+ * [kernel_end, end) to make memory above the kernel be mapped
+ * as soon as possible. And then use page tables allocated above
+ * the kernel to map [ISA_END_ADDRESS, kernel_end).
+ */
+ memory_map_bottom_up(kernel_end, end);
+ memory_map_bottom_up(ISA_END_ADDRESS, kernel_end);
+ } else {
+ memory_map_top_down(ISA_END_ADDRESS, end);
+ }
+
+#ifdef CONFIG_X86_64
+ if (max_pfn > max_low_pfn) {
+ /* can we preseve max_low_pfn ?*/
+ max_low_pfn = max_pfn;
+ }
+#else
+ early_ioremap_page_table_range_init();
+#endif
+
+ load_cr3(swapper_pg_dir);
+ __flush_tlb_all();
+
+ x86_init.hyper.init_mem_mapping();
+
+ early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
+}
+
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ * On x86, access has to be given to the first megabyte of RAM because that
+ * area traditionally contains BIOS code and data regions used by X, dosemu,
+ * and similar apps. Since they map the entire memory range, the whole range
+ * must be allowed (for mapping), but any areas that would otherwise be
+ * disallowed are flagged as being "zero filled" instead of rejected.
+ * Access has to be given to non-kernel-ram areas as well, these contain the
+ * PCI mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+ if (region_intersects(PFN_PHYS(pagenr), PAGE_SIZE,
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)
+ != REGION_DISJOINT) {
+ /*
+ * For disallowed memory regions in the low 1MB range,
+ * request that the page be shown as all zeros.
+ */
+ if (pagenr < 256)
+ return 2;
+
+ return 0;
+ }
+
+ /*
+ * This must follow RAM test, since System RAM is considered a
+ * restricted resource under CONFIG_STRICT_IOMEM.
+ */
+ if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) {
+ /* Low 1MB bypasses iomem restrictions. */
+ if (pagenr < 256)
+ return 1;
+
+ return 0;
+ }
+
+ return 1;
+}
+
+void free_init_pages(char *what, unsigned long begin, unsigned long end)
+{
+ unsigned long begin_aligned, end_aligned;
+
+ /* Make sure boundaries are page aligned */
+ begin_aligned = PAGE_ALIGN(begin);
+ end_aligned = end & PAGE_MASK;
+
+ if (WARN_ON(begin_aligned != begin || end_aligned != end)) {
+ begin = begin_aligned;
+ end = end_aligned;
+ }
+
+ if (begin >= end)
+ return;
+
+ /*
+ * If debugging page accesses then do not free this memory but
+ * mark them not present - any buggy init-section access will
+ * create a kernel page fault:
+ */
+ if (debug_pagealloc_enabled()) {
+ pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n",
+ begin, end - 1);
+ /*
+ * Inform kmemleak about the hole in the memory since the
+ * corresponding pages will be unmapped.
+ */
+ kmemleak_free_part((void *)begin, end - begin);
+ set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+ } else {
+ /*
+ * We just marked the kernel text read only above, now that
+ * we are going to free part of that, we need to make that
+ * writeable and non-executable first.
+ */
+ set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
+ set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
+
+ free_reserved_area((void *)begin, (void *)end,
+ POISON_FREE_INITMEM, what);
+ }
+}
+
+/*
+ * begin/end can be in the direct map or the "high kernel mapping"
+ * used for the kernel image only. free_init_pages() will do the
+ * right thing for either kind of address.
+ */
+void free_kernel_image_pages(void *begin, void *end)
+{
+ unsigned long begin_ul = (unsigned long)begin;
+ unsigned long end_ul = (unsigned long)end;
+ unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT;
+
+
+ free_init_pages("unused kernel image", begin_ul, end_ul);
+
+ /*
+ * PTI maps some of the kernel into userspace. For performance,
+ * this includes some kernel areas that do not contain secrets.
+ * Those areas might be adjacent to the parts of the kernel image
+ * being freed, which may contain secrets. Remove the "high kernel
+ * image mapping" for these freed areas, ensuring they are not even
+ * potentially vulnerable to Meltdown regardless of the specific
+ * optimizations PTI is currently using.
+ *
+ * The "noalias" prevents unmapping the direct map alias which is
+ * needed to access the freed pages.
+ *
+ * This is only valid for 64bit kernels. 32bit has only one mapping
+ * which can't be treated in this way for obvious reasons.
+ */
+ if (IS_ENABLED(CONFIG_X86_64) && cpu_feature_enabled(X86_FEATURE_PTI))
+ set_memory_np_noalias(begin_ul, len_pages);
+}
+
+void __weak mem_encrypt_free_decrypted_mem(void) { }
+
+void __ref free_initmem(void)
+{
+ e820__reallocate_tables();
+
+ mem_encrypt_free_decrypted_mem();
+
+ free_kernel_image_pages(&__init_begin, &__init_end);
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void __init free_initrd_mem(unsigned long start, unsigned long end)
+{
+ /*
+ * end could be not aligned, and We can not align that,
+ * decompresser could be confused by aligned initrd_end
+ * We already reserve the end partial page before in
+ * - i386_start_kernel()
+ * - x86_64_start_kernel()
+ * - relocate_initrd()
+ * So here We can do PAGE_ALIGN() safely to get partial page to be freed
+ */
+ free_init_pages("initrd", start, PAGE_ALIGN(end));
+}
+#endif
+
+/*
+ * Calculate the precise size of the DMA zone (first 16 MB of RAM),
+ * and pass it to the MM layer - to help it set zone watermarks more
+ * accurately.
+ *
+ * Done on 64-bit systems only for the time being, although 32-bit systems
+ * might benefit from this as well.
+ */
+void __init memblock_find_dma_reserve(void)
+{
+#ifdef CONFIG_X86_64
+ u64 nr_pages = 0, nr_free_pages = 0;
+ unsigned long start_pfn, end_pfn;
+ phys_addr_t start_addr, end_addr;
+ int i;
+ u64 u;
+
+ /*
+ * Iterate over all memory ranges (free and reserved ones alike),
+ * to calculate the total number of pages in the first 16 MB of RAM:
+ */
+ nr_pages = 0;
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+ start_pfn = min(start_pfn, MAX_DMA_PFN);
+ end_pfn = min(end_pfn, MAX_DMA_PFN);
+
+ nr_pages += end_pfn - start_pfn;
+ }
+
+ /*
+ * Iterate over free memory ranges to calculate the number of free
+ * pages in the DMA zone, while not counting potential partial
+ * pages at the beginning or the end of the range:
+ */
+ nr_free_pages = 0;
+ for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {
+ start_pfn = min_t(unsigned long, PFN_UP(start_addr), MAX_DMA_PFN);
+ end_pfn = min_t(unsigned long, PFN_DOWN(end_addr), MAX_DMA_PFN);
+
+ if (start_pfn < end_pfn)
+ nr_free_pages += end_pfn - start_pfn;
+ }
+
+ set_dma_reserve(nr_pages - nr_free_pages);
+#endif
+}
+
+void __init zone_sizes_init(void)
+{
+ unsigned long max_zone_pfns[MAX_NR_ZONES];
+
+ memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+
+#ifdef CONFIG_ZONE_DMA
+ max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn);
+#endif
+#ifdef CONFIG_ZONE_DMA32
+ max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn);
+#endif
+ max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+ max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
+#endif
+
+ free_area_init_nodes(max_zone_pfns);
+}
+
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+ .loaded_mm = &init_mm,
+ .next_asid = 1,
+ .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
+};
+EXPORT_PER_CPU_SYMBOL(cpu_tlbstate);
+
+void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
+{
+ /* entry 0 MUST be WB (hardwired to speed up translations) */
+ BUG_ON(!entry && cache != _PAGE_CACHE_MODE_WB);
+
+ __cachemode2pte_tbl[cache] = __cm_idx2pte(entry);
+ __pte2cachemode_tbl[entry] = cache;
+}
+
+#ifdef CONFIG_SWAP
+unsigned long max_swapfile_size(void)
+{
+ unsigned long pages;
+
+ pages = generic_max_swapfile_size();
+
+ if (boot_cpu_has_bug(X86_BUG_L1TF) && l1tf_mitigation != L1TF_MITIGATION_OFF) {
+ /* Limit the swap file size to MAX_PA/2 for L1TF workaround */
+ unsigned long long l1tf_limit = l1tf_pfn_limit();
+ /*
+ * We encode swap offsets also with 3 bits below those for pfn
+ * which makes the usable limit higher.
+ */
+#if CONFIG_PGTABLE_LEVELS > 2
+ l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT;
+#endif
+ pages = min_t(unsigned long long, l1tf_limit, pages);
+ }
+ return pages;
+}
+#endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
new file mode 100644
index 000000000..79b95910f
--- /dev/null
+++ b/arch/x86/mm/init_32.c
@@ -0,0 +1,956 @@
+/*
+ *
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/pci.h>
+#include <linux/pfn.h>
+#include <linux/poison.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/proc_fs.h>
+#include <linux/memory_hotplug.h>
+#include <linux/initrd.h>
+#include <linux/cpumask.h>
+#include <linux/gfp.h>
+
+#include <asm/asm.h>
+#include <asm/bios_ebda.h>
+#include <asm/processor.h>
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/dma.h>
+#include <asm/fixmap.h>
+#include <asm/e820/api.h>
+#include <asm/apic.h>
+#include <asm/bugs.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/olpc_ofw.h>
+#include <asm/pgalloc.h>
+#include <asm/sections.h>
+#include <asm/paravirt.h>
+#include <asm/setup.h>
+#include <asm/set_memory.h>
+#include <asm/page_types.h>
+#include <asm/cpu_entry_area.h>
+#include <asm/init.h>
+
+#include "mm_internal.h"
+
+unsigned long highstart_pfn, highend_pfn;
+
+bool __read_mostly __vmalloc_start_set = false;
+
+/*
+ * Creates a middle page table and puts a pointer to it in the
+ * given global directory entry. This only returns the gd entry
+ * in non-PAE compilation mode, since the middle layer is folded.
+ */
+static pmd_t * __init one_md_table_init(pgd_t *pgd)
+{
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd_table;
+
+#ifdef CONFIG_X86_PAE
+ if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
+ pmd_table = (pmd_t *)alloc_low_page();
+ paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
+ set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
+ p4d = p4d_offset(pgd, 0);
+ pud = pud_offset(p4d, 0);
+ BUG_ON(pmd_table != pmd_offset(pud, 0));
+
+ return pmd_table;
+ }
+#endif
+ p4d = p4d_offset(pgd, 0);
+ pud = pud_offset(p4d, 0);
+ pmd_table = pmd_offset(pud, 0);
+
+ return pmd_table;
+}
+
+/*
+ * Create a page table and place a pointer to it in a middle page
+ * directory entry:
+ */
+static pte_t * __init one_page_table_init(pmd_t *pmd)
+{
+ if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
+ pte_t *page_table = (pte_t *)alloc_low_page();
+
+ paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
+ set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
+ BUG_ON(page_table != pte_offset_kernel(pmd, 0));
+ }
+
+ return pte_offset_kernel(pmd, 0);
+}
+
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+ int pgd_idx = pgd_index(vaddr);
+ int pmd_idx = pmd_index(vaddr);
+
+ return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+ int pte_idx = pte_index(vaddr);
+ pmd_t *pmd;
+
+ pmd = populate_extra_pmd(vaddr);
+ return one_page_table_init(pmd) + pte_idx;
+}
+
+static unsigned long __init
+page_table_range_init_count(unsigned long start, unsigned long end)
+{
+ unsigned long count = 0;
+#ifdef CONFIG_HIGHMEM
+ int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+ int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+ int pgd_idx, pmd_idx;
+ unsigned long vaddr;
+
+ if (pmd_idx_kmap_begin == pmd_idx_kmap_end)
+ return 0;
+
+ vaddr = start;
+ pgd_idx = pgd_index(vaddr);
+ pmd_idx = pmd_index(vaddr);
+
+ for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
+ for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+ pmd_idx++) {
+ if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin &&
+ (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end)
+ count++;
+ vaddr += PMD_SIZE;
+ }
+ pmd_idx = 0;
+ }
+#endif
+ return count;
+}
+
+static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
+ unsigned long vaddr, pte_t *lastpte,
+ void **adr)
+{
+#ifdef CONFIG_HIGHMEM
+ /*
+ * Something (early fixmap) may already have put a pte
+ * page here, which causes the page table allocation
+ * to become nonlinear. Attempt to fix it, and if it
+ * is still nonlinear then we have to bug.
+ */
+ int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+ int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+
+ if (pmd_idx_kmap_begin != pmd_idx_kmap_end
+ && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
+ && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) {
+ pte_t *newpte;
+ int i;
+
+ BUG_ON(after_bootmem);
+ newpte = *adr;
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ set_pte(newpte + i, pte[i]);
+ *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE);
+
+ paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
+ set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
+ BUG_ON(newpte != pte_offset_kernel(pmd, 0));
+ __flush_tlb_all();
+
+ paravirt_release_pte(__pa(pte) >> PAGE_SHIFT);
+ pte = newpte;
+ }
+ BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1)
+ && vaddr > fix_to_virt(FIX_KMAP_END)
+ && lastpte && lastpte + PTRS_PER_PTE != pte);
+#endif
+ return pte;
+}
+
+/*
+ * This function initializes a certain range of kernel virtual memory
+ * with new bootmem page tables, everywhere page tables are missing in
+ * the given range.
+ *
+ * NOTE: The pagetables are allocated contiguous on the physical space
+ * so we can cache the place of the first one and move around without
+ * checking the pgd every time.
+ */
+static void __init
+page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
+{
+ int pgd_idx, pmd_idx;
+ unsigned long vaddr;
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *pte = NULL;
+ unsigned long count = page_table_range_init_count(start, end);
+ void *adr = NULL;
+
+ if (count)
+ adr = alloc_low_pages(count);
+
+ vaddr = start;
+ pgd_idx = pgd_index(vaddr);
+ pmd_idx = pmd_index(vaddr);
+ pgd = pgd_base + pgd_idx;
+
+ for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
+ pmd = one_md_table_init(pgd);
+ pmd = pmd + pmd_index(vaddr);
+ for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+ pmd++, pmd_idx++) {
+ pte = page_table_kmap_check(one_page_table_init(pmd),
+ pmd, vaddr, pte, &adr);
+
+ vaddr += PMD_SIZE;
+ }
+ pmd_idx = 0;
+ }
+}
+
+static inline int is_kernel_text(unsigned long addr)
+{
+ if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
+ return 1;
+ return 0;
+}
+
+/*
+ * This maps the physical memory to kernel virtual address space, a total
+ * of max_low_pfn pages, by creating page tables starting from address
+ * PAGE_OFFSET:
+ */
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask)
+{
+ int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
+ unsigned long last_map_addr = end;
+ unsigned long start_pfn, end_pfn;
+ pgd_t *pgd_base = swapper_pg_dir;
+ int pgd_idx, pmd_idx, pte_ofs;
+ unsigned long pfn;
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned pages_2m, pages_4k;
+ int mapping_iter;
+
+ start_pfn = start >> PAGE_SHIFT;
+ end_pfn = end >> PAGE_SHIFT;
+
+ /*
+ * First iteration will setup identity mapping using large/small pages
+ * based on use_pse, with other attributes same as set by
+ * the early code in head_32.S
+ *
+ * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
+ * as desired for the kernel identity mapping.
+ *
+ * This two pass mechanism conforms to the TLB app note which says:
+ *
+ * "Software should not write to a paging-structure entry in a way
+ * that would change, for any linear address, both the page size
+ * and either the page frame or attributes."
+ */
+ mapping_iter = 1;
+
+ if (!boot_cpu_has(X86_FEATURE_PSE))
+ use_pse = 0;
+
+repeat:
+ pages_2m = pages_4k = 0;
+ pfn = start_pfn;
+ pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+ pgd = pgd_base + pgd_idx;
+ for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
+ pmd = one_md_table_init(pgd);
+
+ if (pfn >= end_pfn)
+ continue;
+#ifdef CONFIG_X86_PAE
+ pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+ pmd += pmd_idx;
+#else
+ pmd_idx = 0;
+#endif
+ for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
+ pmd++, pmd_idx++) {
+ unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
+
+ /*
+ * Map with big pages if possible, otherwise
+ * create normal page tables:
+ */
+ if (use_pse) {
+ unsigned int addr2;
+ pgprot_t prot = PAGE_KERNEL_LARGE;
+ /*
+ * first pass will use the same initial
+ * identity mapping attribute + _PAGE_PSE.
+ */
+ pgprot_t init_prot =
+ __pgprot(PTE_IDENT_ATTR |
+ _PAGE_PSE);
+
+ pfn &= PMD_MASK >> PAGE_SHIFT;
+ addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
+ PAGE_OFFSET + PAGE_SIZE-1;
+
+ if (is_kernel_text(addr) ||
+ is_kernel_text(addr2))
+ prot = PAGE_KERNEL_LARGE_EXEC;
+
+ pages_2m++;
+ if (mapping_iter == 1)
+ set_pmd(pmd, pfn_pmd(pfn, init_prot));
+ else
+ set_pmd(pmd, pfn_pmd(pfn, prot));
+
+ pfn += PTRS_PER_PTE;
+ continue;
+ }
+ pte = one_page_table_init(pmd);
+
+ pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+ pte += pte_ofs;
+ for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
+ pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
+ pgprot_t prot = PAGE_KERNEL;
+ /*
+ * first pass will use the same initial
+ * identity mapping attribute.
+ */
+ pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
+
+ if (is_kernel_text(addr))
+ prot = PAGE_KERNEL_EXEC;
+
+ pages_4k++;
+ if (mapping_iter == 1) {
+ set_pte(pte, pfn_pte(pfn, init_prot));
+ last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE;
+ } else
+ set_pte(pte, pfn_pte(pfn, prot));
+ }
+ }
+ }
+ if (mapping_iter == 1) {
+ /*
+ * update direct mapping page count only in the first
+ * iteration.
+ */
+ update_page_count(PG_LEVEL_2M, pages_2m);
+ update_page_count(PG_LEVEL_4K, pages_4k);
+
+ /*
+ * local global flush tlb, which will flush the previous
+ * mappings present in both small and large page TLB's.
+ */
+ __flush_tlb_all();
+
+ /*
+ * Second iteration will set the actual desired PTE attributes.
+ */
+ mapping_iter = 2;
+ goto repeat;
+ }
+ return last_map_addr;
+}
+
+pte_t *kmap_pte;
+
+static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
+{
+ pgd_t *pgd = pgd_offset_k(vaddr);
+ p4d_t *p4d = p4d_offset(pgd, vaddr);
+ pud_t *pud = pud_offset(p4d, vaddr);
+ pmd_t *pmd = pmd_offset(pud, vaddr);
+ return pte_offset_kernel(pmd, vaddr);
+}
+
+static void __init kmap_init(void)
+{
+ unsigned long kmap_vstart;
+
+ /*
+ * Cache the first kmap pte:
+ */
+ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
+ kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
+}
+
+#ifdef CONFIG_HIGHMEM
+static void __init permanent_kmaps_init(pgd_t *pgd_base)
+{
+ unsigned long vaddr;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ vaddr = PKMAP_BASE;
+ page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
+
+ pgd = swapper_pg_dir + pgd_index(vaddr);
+ p4d = p4d_offset(pgd, vaddr);
+ pud = pud_offset(p4d, vaddr);
+ pmd = pmd_offset(pud, vaddr);
+ pte = pte_offset_kernel(pmd, vaddr);
+ pkmap_page_table = pte;
+}
+
+void __init add_highpages_with_active_regions(int nid,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ phys_addr_t start, end;
+ u64 i;
+
+ for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) {
+ unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
+ start_pfn, end_pfn);
+ unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
+ start_pfn, end_pfn);
+ for ( ; pfn < e_pfn; pfn++)
+ if (pfn_valid(pfn))
+ free_highmem_page(pfn_to_page(pfn));
+ }
+}
+#else
+static inline void permanent_kmaps_init(pgd_t *pgd_base)
+{
+}
+#endif /* CONFIG_HIGHMEM */
+
+void __init sync_initial_page_table(void)
+{
+ clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+
+ /*
+ * sync back low identity map too. It is used for example
+ * in the 32-bit EFI stub.
+ */
+ clone_pgd_range(initial_page_table,
+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
+}
+
+void __init native_pagetable_init(void)
+{
+ unsigned long pfn, va;
+ pgd_t *pgd, *base = swapper_pg_dir;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ /*
+ * Remove any mappings which extend past the end of physical
+ * memory from the boot time page table.
+ * In virtual address space, we should have at least two pages
+ * from VMALLOC_END to pkmap or fixmap according to VMALLOC_END
+ * definition. And max_low_pfn is set to VMALLOC_END physical
+ * address. If initial memory mapping is doing right job, we
+ * should have pte used near max_low_pfn or one pmd is not present.
+ */
+ for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
+ va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
+ pgd = base + pgd_index(va);
+ if (!pgd_present(*pgd))
+ break;
+
+ p4d = p4d_offset(pgd, va);
+ pud = pud_offset(p4d, va);
+ pmd = pmd_offset(pud, va);
+ if (!pmd_present(*pmd))
+ break;
+
+ /* should not be large page here */
+ if (pmd_large(*pmd)) {
+ pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n",
+ pfn, pmd, __pa(pmd));
+ BUG_ON(1);
+ }
+
+ pte = pte_offset_kernel(pmd, va);
+ if (!pte_present(*pte))
+ break;
+
+ printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n",
+ pfn, pmd, __pa(pmd), pte, __pa(pte));
+ pte_clear(NULL, va, pte);
+ }
+ paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
+ paging_init();
+}
+
+/*
+ * Build a proper pagetable for the kernel mappings. Up until this
+ * point, we've been running on some set of pagetables constructed by
+ * the boot process.
+ *
+ * If we're booting on native hardware, this will be a pagetable
+ * constructed in arch/x86/kernel/head_32.S. The root of the
+ * pagetable will be swapper_pg_dir.
+ *
+ * If we're booting paravirtualized under a hypervisor, then there are
+ * more options: we may already be running PAE, and the pagetable may
+ * or may not be based in swapper_pg_dir. In any case,
+ * paravirt_pagetable_init() will set up swapper_pg_dir
+ * appropriately for the rest of the initialization to work.
+ *
+ * In general, pagetable_init() assumes that the pagetable may already
+ * be partially populated, and so it avoids stomping on any existing
+ * mappings.
+ */
+void __init early_ioremap_page_table_range_init(void)
+{
+ pgd_t *pgd_base = swapper_pg_dir;
+ unsigned long vaddr, end;
+
+ /*
+ * Fixed mappings, only the page table structure has to be
+ * created - mappings will be set by set_fixmap():
+ */
+ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
+ end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
+ page_table_range_init(vaddr, end, pgd_base);
+ early_ioremap_reset();
+}
+
+static void __init pagetable_init(void)
+{
+ pgd_t *pgd_base = swapper_pg_dir;
+
+ permanent_kmaps_init(pgd_base);
+}
+
+#define DEFAULT_PTE_MASK ~(_PAGE_NX | _PAGE_GLOBAL)
+/* Bits supported by the hardware: */
+pteval_t __supported_pte_mask __read_mostly = DEFAULT_PTE_MASK;
+/* Bits allowed in normal kernel mappings: */
+pteval_t __default_kernel_pte_mask __read_mostly = DEFAULT_PTE_MASK;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+/* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */
+EXPORT_SYMBOL(__default_kernel_pte_mask);
+
+/* user-defined highmem size */
+static unsigned int highmem_pages = -1;
+
+/*
+ * highmem=size forces highmem to be exactly 'size' bytes.
+ * This works even on boxes that have no highmem otherwise.
+ * This also works to reduce highmem size on bigger boxes.
+ */
+static int __init parse_highmem(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
+ return 0;
+}
+early_param("highmem", parse_highmem);
+
+#define MSG_HIGHMEM_TOO_BIG \
+ "highmem size (%luMB) is bigger than pages available (%luMB)!\n"
+
+#define MSG_LOWMEM_TOO_SMALL \
+ "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n"
+/*
+ * All of RAM fits into lowmem - but if user wants highmem
+ * artificially via the highmem=x boot parameter then create
+ * it:
+ */
+static void __init lowmem_pfn_init(void)
+{
+ /* max_low_pfn is 0, we already have early_res support */
+ max_low_pfn = max_pfn;
+
+ if (highmem_pages == -1)
+ highmem_pages = 0;
+#ifdef CONFIG_HIGHMEM
+ if (highmem_pages >= max_pfn) {
+ printk(KERN_ERR MSG_HIGHMEM_TOO_BIG,
+ pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
+ highmem_pages = 0;
+ }
+ if (highmem_pages) {
+ if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {
+ printk(KERN_ERR MSG_LOWMEM_TOO_SMALL,
+ pages_to_mb(highmem_pages));
+ highmem_pages = 0;
+ }
+ max_low_pfn -= highmem_pages;
+ }
+#else
+ if (highmem_pages)
+ printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
+#endif
+}
+
+#define MSG_HIGHMEM_TOO_SMALL \
+ "only %luMB highmem pages available, ignoring highmem size of %luMB!\n"
+
+#define MSG_HIGHMEM_TRIMMED \
+ "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n"
+/*
+ * We have more RAM than fits into lowmem - we try to put it into
+ * highmem, also taking the highmem=x boot parameter into account:
+ */
+static void __init highmem_pfn_init(void)
+{
+ max_low_pfn = MAXMEM_PFN;
+
+ if (highmem_pages == -1)
+ highmem_pages = max_pfn - MAXMEM_PFN;
+
+ if (highmem_pages + MAXMEM_PFN < max_pfn)
+ max_pfn = MAXMEM_PFN + highmem_pages;
+
+ if (highmem_pages + MAXMEM_PFN > max_pfn) {
+ printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
+ pages_to_mb(max_pfn - MAXMEM_PFN),
+ pages_to_mb(highmem_pages));
+ highmem_pages = 0;
+ }
+#ifndef CONFIG_HIGHMEM
+ /* Maximum memory usable is what is directly addressable */
+ printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
+ if (max_pfn > MAX_NONPAE_PFN)
+ printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
+ else
+ printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+ max_pfn = MAXMEM_PFN;
+#else /* !CONFIG_HIGHMEM */
+#ifndef CONFIG_HIGHMEM64G
+ if (max_pfn > MAX_NONPAE_PFN) {
+ max_pfn = MAX_NONPAE_PFN;
+ printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
+ }
+#endif /* !CONFIG_HIGHMEM64G */
+#endif /* !CONFIG_HIGHMEM */
+}
+
+/*
+ * Determine low and high memory ranges:
+ */
+void __init find_low_pfn_range(void)
+{
+ /* it could update max_pfn */
+
+ if (max_pfn <= MAXMEM_PFN)
+ lowmem_pfn_init();
+ else
+ highmem_pfn_init();
+}
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+void __init initmem_init(void)
+{
+#ifdef CONFIG_HIGHMEM
+ highstart_pfn = highend_pfn = max_pfn;
+ if (max_pfn > max_low_pfn)
+ highstart_pfn = max_low_pfn;
+ printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+ pages_to_mb(highend_pfn - highstart_pfn));
+ high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+
+ memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
+ sparse_memory_present_with_active_regions(0);
+
+#ifdef CONFIG_FLATMEM
+ max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? highend_pfn : max_low_pfn;
+#endif
+ __vmalloc_start_set = true;
+
+ printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
+ pages_to_mb(max_low_pfn));
+
+ setup_bootmem_allocator();
+}
+#endif /* !CONFIG_NEED_MULTIPLE_NODES */
+
+void __init setup_bootmem_allocator(void)
+{
+ printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
+ max_pfn_mapped<<PAGE_SHIFT);
+ printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
+}
+
+/*
+ * paging_init() sets up the page tables - note that the first 8MB are
+ * already mapped by head.S.
+ *
+ * This routines also unmaps the page at virtual kernel address 0, so
+ * that we can trap those pesky NULL-reference errors in the kernel.
+ */
+void __init paging_init(void)
+{
+ pagetable_init();
+
+ __flush_tlb_all();
+
+ kmap_init();
+
+ /*
+ * NOTE: at this point the bootmem allocator is fully available.
+ */
+ olpc_dt_build_devicetree();
+ sparse_memory_present_with_active_regions(MAX_NUMNODES);
+ sparse_init();
+ zone_sizes_init();
+}
+
+/*
+ * Test if the WP bit works in supervisor mode. It isn't supported on 386's
+ * and also on some strange 486's. All 586+'s are OK. This used to involve
+ * black magic jumps to work around some nasty CPU bugs, but fortunately the
+ * switch to using exceptions got rid of all that.
+ */
+static void __init test_wp_bit(void)
+{
+ char z = 0;
+
+ printk(KERN_INFO "Checking if this processor honours the WP bit even in supervisor mode...");
+
+ __set_fixmap(FIX_WP_TEST, __pa_symbol(empty_zero_page), PAGE_KERNEL_RO);
+
+ if (probe_kernel_write((char *)fix_to_virt(FIX_WP_TEST), &z, 1)) {
+ clear_fixmap(FIX_WP_TEST);
+ printk(KERN_CONT "Ok.\n");
+ return;
+ }
+
+ printk(KERN_CONT "No.\n");
+ panic("Linux doesn't support CPUs with broken WP.");
+}
+
+void __init mem_init(void)
+{
+ pci_iommu_alloc();
+
+#ifdef CONFIG_FLATMEM
+ BUG_ON(!mem_map);
+#endif
+ /*
+ * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to
+ * be done before free_all_bootmem(). Memblock use free low memory for
+ * temporary data (see find_range_array()) and for this purpose can use
+ * pages that was already passed to the buddy allocator, hence marked as
+ * not accessible in the page tables when compiled with
+ * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not
+ * important here.
+ */
+ set_highmem_pages_init();
+
+ /* this will put all low memory onto the freelists */
+ free_all_bootmem();
+
+ after_bootmem = 1;
+ x86_init.hyper.init_after_bootmem();
+
+ mem_init_print_info(NULL);
+ printk(KERN_INFO "virtual kernel memory layout:\n"
+ " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
+ " cpu_entry : 0x%08lx - 0x%08lx (%4ld kB)\n"
+#ifdef CONFIG_HIGHMEM
+ " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
+#endif
+ " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
+ " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
+ " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
+ " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
+ " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
+ FIXADDR_START, FIXADDR_TOP,
+ (FIXADDR_TOP - FIXADDR_START) >> 10,
+
+ CPU_ENTRY_AREA_BASE,
+ CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE,
+ CPU_ENTRY_AREA_MAP_SIZE >> 10,
+
+#ifdef CONFIG_HIGHMEM
+ PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
+ (LAST_PKMAP*PAGE_SIZE) >> 10,
+#endif
+
+ VMALLOC_START, VMALLOC_END,
+ (VMALLOC_END - VMALLOC_START) >> 20,
+
+ (unsigned long)__va(0), (unsigned long)high_memory,
+ ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
+
+ (unsigned long)&__init_begin, (unsigned long)&__init_end,
+ ((unsigned long)&__init_end -
+ (unsigned long)&__init_begin) >> 10,
+
+ (unsigned long)&_etext, (unsigned long)&_edata,
+ ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
+
+ (unsigned long)&_text, (unsigned long)&_etext,
+ ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+
+ /*
+ * Check boundaries twice: Some fundamental inconsistencies can
+ * be detected at build time already.
+ */
+#define __FIXADDR_TOP (-PAGE_SIZE)
+#ifdef CONFIG_HIGHMEM
+ BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
+ BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE);
+#endif
+#define high_memory (-128UL << 20)
+ BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
+#undef high_memory
+#undef __FIXADDR_TOP
+
+#ifdef CONFIG_HIGHMEM
+ BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
+ BUG_ON(VMALLOC_END > PKMAP_BASE);
+#endif
+ BUG_ON(VMALLOC_START >= VMALLOC_END);
+ BUG_ON((unsigned long)high_memory > VMALLOC_START);
+
+ test_wp_bit();
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
+ bool want_memblock)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+
+ return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+}
+
+void arch_remove_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+
+ __remove_pages(start_pfn, nr_pages, altmap);
+}
+#endif
+
+int kernel_set_to_readonly __read_mostly;
+
+void set_kernel_text_rw(void)
+{
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long size = PFN_ALIGN(_etext) - start;
+
+ if (!kernel_set_to_readonly)
+ return;
+
+ pr_debug("Set kernel text: %lx - %lx for read write\n",
+ start, start+size);
+
+ set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
+void set_kernel_text_ro(void)
+{
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long size = PFN_ALIGN(_etext) - start;
+
+ if (!kernel_set_to_readonly)
+ return;
+
+ pr_debug("Set kernel text: %lx - %lx for read only\n",
+ start, start+size);
+
+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
+static void mark_nxdata_nx(void)
+{
+ /*
+ * When this called, init has already been executed and released,
+ * so everything past _etext should be NX.
+ */
+ unsigned long start = PFN_ALIGN(_etext);
+ /*
+ * This comes from is_kernel_text upper limit. Also HPAGE where used:
+ */
+ unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
+
+ if (__supported_pte_mask & _PAGE_NX)
+ printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
+ set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
+void mark_rodata_ro(void)
+{
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long size = PFN_ALIGN(_etext) - start;
+
+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+ printk(KERN_INFO "Write protecting the kernel text: %luk\n",
+ size >> 10);
+
+ kernel_set_to_readonly = 1;
+
+#ifdef CONFIG_CPA_DEBUG
+ printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
+ start, start+size);
+ set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
+
+ printk(KERN_INFO "Testing CPA: write protecting again\n");
+ set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
+#endif
+
+ start += size;
+ size = (unsigned long)__end_rodata - start;
+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+ printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
+ size >> 10);
+
+#ifdef CONFIG_CPA_DEBUG
+ printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
+ set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
+
+ printk(KERN_INFO "Testing CPA: write protecting again\n");
+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+#endif
+ mark_nxdata_nx();
+ if (__supported_pte_mask & _PAGE_NX)
+ debug_checkwx();
+}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
new file mode 100644
index 000000000..4b25a1ad1
--- /dev/null
+++ b/arch/x86/mm/init_64.c
@@ -0,0 +1,1544 @@
+/*
+ * linux/arch/x86_64/mm/init.c
+ *
+ * Copyright (C) 1995 Linus Torvalds
+ * Copyright (C) 2000 Pavel Machek <pavel@ucw.cz>
+ * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/initrd.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/proc_fs.h>
+#include <linux/pci.h>
+#include <linux/pfn.h>
+#include <linux/poison.h>
+#include <linux/dma-mapping.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+#include <linux/memremap.h>
+#include <linux/nmi.h>
+#include <linux/gfp.h>
+#include <linux/kcore.h>
+
+#include <asm/processor.h>
+#include <asm/bios_ebda.h>
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/dma.h>
+#include <asm/fixmap.h>
+#include <asm/e820/api.h>
+#include <asm/apic.h>
+#include <asm/tlb.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+#include <asm/smp.h>
+#include <asm/sections.h>
+#include <asm/kdebug.h>
+#include <asm/numa.h>
+#include <asm/set_memory.h>
+#include <asm/init.h>
+#include <asm/uv/uv.h>
+#include <asm/setup.h>
+
+#include "mm_internal.h"
+
+#include "ident_map.c"
+
+/*
+ * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
+ * physical space so we can cache the place of the first one and move
+ * around without checking the pgd every time.
+ */
+
+/* Bits supported by the hardware: */
+pteval_t __supported_pte_mask __read_mostly = ~0;
+/* Bits allowed in normal kernel mappings: */
+pteval_t __default_kernel_pte_mask __read_mostly = ~0;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+/* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */
+EXPORT_SYMBOL(__default_kernel_pte_mask);
+
+int force_personality32;
+
+/*
+ * noexec32=on|off
+ * Control non executable heap for 32bit processes.
+ * To control the stack too use noexec=off
+ *
+ * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
+ * off PROT_READ implies PROT_EXEC
+ */
+static int __init nonx32_setup(char *str)
+{
+ if (!strcmp(str, "on"))
+ force_personality32 &= ~READ_IMPLIES_EXEC;
+ else if (!strcmp(str, "off"))
+ force_personality32 |= READ_IMPLIES_EXEC;
+ return 1;
+}
+__setup("noexec32=", nonx32_setup);
+
+static void sync_global_pgds_l5(unsigned long start, unsigned long end)
+{
+ unsigned long addr;
+
+ for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
+ const pgd_t *pgd_ref = pgd_offset_k(addr);
+ struct page *page;
+
+ /* Check for overflow */
+ if (addr < start)
+ break;
+
+ if (pgd_none(*pgd_ref))
+ continue;
+
+ spin_lock(&pgd_lock);
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd;
+ spinlock_t *pgt_lock;
+
+ pgd = (pgd_t *)page_address(page) + pgd_index(addr);
+ /* the pgt_lock only for Xen */
+ pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+ spin_lock(pgt_lock);
+
+ if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+
+ spin_unlock(pgt_lock);
+ }
+ spin_unlock(&pgd_lock);
+ }
+}
+
+static void sync_global_pgds_l4(unsigned long start, unsigned long end)
+{
+ unsigned long addr;
+
+ for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
+ pgd_t *pgd_ref = pgd_offset_k(addr);
+ const p4d_t *p4d_ref;
+ struct page *page;
+
+ /*
+ * With folded p4d, pgd_none() is always false, we need to
+ * handle synchonization on p4d level.
+ */
+ MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref));
+ p4d_ref = p4d_offset(pgd_ref, addr);
+
+ if (p4d_none(*p4d_ref))
+ continue;
+
+ spin_lock(&pgd_lock);
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd;
+ p4d_t *p4d;
+ spinlock_t *pgt_lock;
+
+ pgd = (pgd_t *)page_address(page) + pgd_index(addr);
+ p4d = p4d_offset(pgd, addr);
+ /* the pgt_lock only for Xen */
+ pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+ spin_lock(pgt_lock);
+
+ if (!p4d_none(*p4d_ref) && !p4d_none(*p4d))
+ BUG_ON(p4d_page_vaddr(*p4d)
+ != p4d_page_vaddr(*p4d_ref));
+
+ if (p4d_none(*p4d))
+ set_p4d(p4d, *p4d_ref);
+
+ spin_unlock(pgt_lock);
+ }
+ spin_unlock(&pgd_lock);
+ }
+}
+
+/*
+ * When memory was added make sure all the processes MM have
+ * suitable PGD entries in the local PGD level page.
+ */
+void sync_global_pgds(unsigned long start, unsigned long end)
+{
+ if (pgtable_l5_enabled())
+ sync_global_pgds_l5(start, end);
+ else
+ sync_global_pgds_l4(start, end);
+}
+
+/*
+ * NOTE: This function is marked __ref because it calls __init function
+ * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
+ */
+static __ref void *spp_getpage(void)
+{
+ void *ptr;
+
+ if (after_bootmem)
+ ptr = (void *) get_zeroed_page(GFP_ATOMIC);
+ else
+ ptr = alloc_bootmem_pages(PAGE_SIZE);
+
+ if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
+ panic("set_pte_phys: cannot allocate page data %s\n",
+ after_bootmem ? "after bootmem" : "");
+ }
+
+ pr_debug("spp_getpage %p\n", ptr);
+
+ return ptr;
+}
+
+static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr)
+{
+ if (pgd_none(*pgd)) {
+ p4d_t *p4d = (p4d_t *)spp_getpage();
+ pgd_populate(&init_mm, pgd, p4d);
+ if (p4d != p4d_offset(pgd, 0))
+ printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
+ p4d, p4d_offset(pgd, 0));
+ }
+ return p4d_offset(pgd, vaddr);
+}
+
+static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr)
+{
+ if (p4d_none(*p4d)) {
+ pud_t *pud = (pud_t *)spp_getpage();
+ p4d_populate(&init_mm, p4d, pud);
+ if (pud != pud_offset(p4d, 0))
+ printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
+ pud, pud_offset(p4d, 0));
+ }
+ return pud_offset(p4d, vaddr);
+}
+
+static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
+{
+ if (pud_none(*pud)) {
+ pmd_t *pmd = (pmd_t *) spp_getpage();
+ pud_populate(&init_mm, pud, pmd);
+ if (pmd != pmd_offset(pud, 0))
+ printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n",
+ pmd, pmd_offset(pud, 0));
+ }
+ return pmd_offset(pud, vaddr);
+}
+
+static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
+{
+ if (pmd_none(*pmd)) {
+ pte_t *pte = (pte_t *) spp_getpage();
+ pmd_populate_kernel(&init_mm, pmd, pte);
+ if (pte != pte_offset_kernel(pmd, 0))
+ printk(KERN_ERR "PAGETABLE BUG #03!\n");
+ }
+ return pte_offset_kernel(pmd, vaddr);
+}
+
+static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
+{
+ pmd_t *pmd = fill_pmd(pud, vaddr);
+ pte_t *pte = fill_pte(pmd, vaddr);
+
+ set_pte(pte, new_pte);
+
+ /*
+ * It's enough to flush this one mapping.
+ * (PGE mappings get flushed as well)
+ */
+ __flush_tlb_one_kernel(vaddr);
+}
+
+void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
+{
+ p4d_t *p4d = p4d_page + p4d_index(vaddr);
+ pud_t *pud = fill_pud(p4d, vaddr);
+
+ __set_pte_vaddr(pud, vaddr, new_pte);
+}
+
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+{
+ pud_t *pud = pud_page + pud_index(vaddr);
+
+ __set_pte_vaddr(pud, vaddr, new_pte);
+}
+
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+{
+ pgd_t *pgd;
+ p4d_t *p4d_page;
+
+ pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
+
+ pgd = pgd_offset_k(vaddr);
+ if (pgd_none(*pgd)) {
+ printk(KERN_ERR
+ "PGD FIXMAP MISSING, it should be setup in head.S!\n");
+ return;
+ }
+
+ p4d_page = p4d_offset(pgd, 0);
+ set_pte_vaddr_p4d(p4d_page, vaddr, pteval);
+}
+
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+
+ pgd = pgd_offset_k(vaddr);
+ p4d = fill_p4d(pgd, vaddr);
+ pud = fill_pud(p4d, vaddr);
+ return fill_pmd(pud, vaddr);
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+ pmd_t *pmd;
+
+ pmd = populate_extra_pmd(vaddr);
+ return fill_pte(pmd, vaddr);
+}
+
+/*
+ * Create large page table mappings for a range of physical addresses.
+ */
+static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
+ enum page_cache_mode cache)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pgprot_t prot;
+
+ pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) |
+ pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache)));
+ BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
+ for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
+ pgd = pgd_offset_k((unsigned long)__va(phys));
+ if (pgd_none(*pgd)) {
+ p4d = (p4d_t *) spp_getpage();
+ set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE |
+ _PAGE_USER));
+ }
+ p4d = p4d_offset(pgd, (unsigned long)__va(phys));
+ if (p4d_none(*p4d)) {
+ pud = (pud_t *) spp_getpage();
+ set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE |
+ _PAGE_USER));
+ }
+ pud = pud_offset(p4d, (unsigned long)__va(phys));
+ if (pud_none(*pud)) {
+ pmd = (pmd_t *) spp_getpage();
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
+ _PAGE_USER));
+ }
+ pmd = pmd_offset(pud, phys);
+ BUG_ON(!pmd_none(*pmd));
+ set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
+ }
+}
+
+void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
+{
+ __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB);
+}
+
+void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
+{
+ __init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC);
+}
+
+/*
+ * The head.S code sets up the kernel high mapping:
+ *
+ * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
+ *
+ * phys_base holds the negative offset to the kernel, which is added
+ * to the compile time generated pmds. This results in invalid pmds up
+ * to the point where we hit the physaddr 0 mapping.
+ *
+ * We limit the mappings to the region from _text to _brk_end. _brk_end
+ * is rounded up to the 2MB boundary. This catches the invalid pmds as
+ * well, as they are located before _text:
+ */
+void __init cleanup_highmap(void)
+{
+ unsigned long vaddr = __START_KERNEL_map;
+ unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
+ unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
+ pmd_t *pmd = level2_kernel_pgt;
+
+ /*
+ * Native path, max_pfn_mapped is not set yet.
+ * Xen has valid max_pfn_mapped set in
+ * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
+ */
+ if (max_pfn_mapped)
+ vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
+
+ for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
+ if (pmd_none(*pmd))
+ continue;
+ if (vaddr < (unsigned long) _text || vaddr > end)
+ set_pmd(pmd, __pmd(0));
+ }
+}
+
+/*
+ * Create PTE level page table mapping for physical addresses.
+ * It returns the last physical address mapped.
+ */
+static unsigned long __meminit
+phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
+ pgprot_t prot)
+{
+ unsigned long pages = 0, paddr_next;
+ unsigned long paddr_last = paddr_end;
+ pte_t *pte;
+ int i;
+
+ pte = pte_page + pte_index(paddr);
+ i = pte_index(paddr);
+
+ for (; i < PTRS_PER_PTE; i++, paddr = paddr_next, pte++) {
+ paddr_next = (paddr & PAGE_MASK) + PAGE_SIZE;
+ if (paddr >= paddr_end) {
+ if (!after_bootmem &&
+ !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
+ E820_TYPE_RAM) &&
+ !e820__mapped_any(paddr & PAGE_MASK, paddr_next,
+ E820_TYPE_RESERVED_KERN))
+ set_pte(pte, __pte(0));
+ continue;
+ }
+
+ /*
+ * We will re-use the existing mapping.
+ * Xen for example has some special requirements, like mapping
+ * pagetable pages as RO. So assume someone who pre-setup
+ * these mappings are more intelligent.
+ */
+ if (!pte_none(*pte)) {
+ if (!after_bootmem)
+ pages++;
+ continue;
+ }
+
+ if (0)
+ pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr,
+ pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte);
+ pages++;
+ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
+ paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE;
+ }
+
+ update_page_count(PG_LEVEL_4K, pages);
+
+ return paddr_last;
+}
+
+/*
+ * Create PMD level page table mapping for physical addresses. The virtual
+ * and physical address have to be aligned at this level.
+ * It returns the last physical address mapped.
+ */
+static unsigned long __meminit
+phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
+ unsigned long page_size_mask, pgprot_t prot)
+{
+ unsigned long pages = 0, paddr_next;
+ unsigned long paddr_last = paddr_end;
+
+ int i = pmd_index(paddr);
+
+ for (; i < PTRS_PER_PMD; i++, paddr = paddr_next) {
+ pmd_t *pmd = pmd_page + pmd_index(paddr);
+ pte_t *pte;
+ pgprot_t new_prot = prot;
+
+ paddr_next = (paddr & PMD_MASK) + PMD_SIZE;
+ if (paddr >= paddr_end) {
+ if (!after_bootmem &&
+ !e820__mapped_any(paddr & PMD_MASK, paddr_next,
+ E820_TYPE_RAM) &&
+ !e820__mapped_any(paddr & PMD_MASK, paddr_next,
+ E820_TYPE_RESERVED_KERN))
+ set_pmd(pmd, __pmd(0));
+ continue;
+ }
+
+ if (!pmd_none(*pmd)) {
+ if (!pmd_large(*pmd)) {
+ spin_lock(&init_mm.page_table_lock);
+ pte = (pte_t *)pmd_page_vaddr(*pmd);
+ paddr_last = phys_pte_init(pte, paddr,
+ paddr_end, prot);
+ spin_unlock(&init_mm.page_table_lock);
+ continue;
+ }
+ /*
+ * If we are ok with PG_LEVEL_2M mapping, then we will
+ * use the existing mapping,
+ *
+ * Otherwise, we will split the large page mapping but
+ * use the same existing protection bits except for
+ * large page, so that we don't violate Intel's TLB
+ * Application note (317080) which says, while changing
+ * the page sizes, new and old translations should
+ * not differ with respect to page frame and
+ * attributes.
+ */
+ if (page_size_mask & (1 << PG_LEVEL_2M)) {
+ if (!after_bootmem)
+ pages++;
+ paddr_last = paddr_next;
+ continue;
+ }
+ new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
+ }
+
+ if (page_size_mask & (1<<PG_LEVEL_2M)) {
+ pages++;
+ spin_lock(&init_mm.page_table_lock);
+ set_pte((pte_t *)pmd,
+ pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
+ __pgprot(pgprot_val(prot) | _PAGE_PSE)));
+ spin_unlock(&init_mm.page_table_lock);
+ paddr_last = paddr_next;
+ continue;
+ }
+
+ pte = alloc_low_page();
+ paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot);
+
+ spin_lock(&init_mm.page_table_lock);
+ pmd_populate_kernel(&init_mm, pmd, pte);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ update_page_count(PG_LEVEL_2M, pages);
+ return paddr_last;
+}
+
+/*
+ * Create PUD level page table mapping for physical addresses. The virtual
+ * and physical address do not have to be aligned at this level. KASLR can
+ * randomize virtual addresses up to this level.
+ * It returns the last physical address mapped.
+ */
+static unsigned long __meminit
+phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
+ unsigned long page_size_mask)
+{
+ unsigned long pages = 0, paddr_next;
+ unsigned long paddr_last = paddr_end;
+ unsigned long vaddr = (unsigned long)__va(paddr);
+ int i = pud_index(vaddr);
+
+ for (; i < PTRS_PER_PUD; i++, paddr = paddr_next) {
+ pud_t *pud;
+ pmd_t *pmd;
+ pgprot_t prot = PAGE_KERNEL;
+
+ vaddr = (unsigned long)__va(paddr);
+ pud = pud_page + pud_index(vaddr);
+ paddr_next = (paddr & PUD_MASK) + PUD_SIZE;
+
+ if (paddr >= paddr_end) {
+ if (!after_bootmem &&
+ !e820__mapped_any(paddr & PUD_MASK, paddr_next,
+ E820_TYPE_RAM) &&
+ !e820__mapped_any(paddr & PUD_MASK, paddr_next,
+ E820_TYPE_RESERVED_KERN))
+ set_pud(pud, __pud(0));
+ continue;
+ }
+
+ if (!pud_none(*pud)) {
+ if (!pud_large(*pud)) {
+ pmd = pmd_offset(pud, 0);
+ paddr_last = phys_pmd_init(pmd, paddr,
+ paddr_end,
+ page_size_mask,
+ prot);
+ continue;
+ }
+ /*
+ * If we are ok with PG_LEVEL_1G mapping, then we will
+ * use the existing mapping.
+ *
+ * Otherwise, we will split the gbpage mapping but use
+ * the same existing protection bits except for large
+ * page, so that we don't violate Intel's TLB
+ * Application note (317080) which says, while changing
+ * the page sizes, new and old translations should
+ * not differ with respect to page frame and
+ * attributes.
+ */
+ if (page_size_mask & (1 << PG_LEVEL_1G)) {
+ if (!after_bootmem)
+ pages++;
+ paddr_last = paddr_next;
+ continue;
+ }
+ prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
+ }
+
+ if (page_size_mask & (1<<PG_LEVEL_1G)) {
+ pages++;
+ spin_lock(&init_mm.page_table_lock);
+ set_pte((pte_t *)pud,
+ pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
+ PAGE_KERNEL_LARGE));
+ spin_unlock(&init_mm.page_table_lock);
+ paddr_last = paddr_next;
+ continue;
+ }
+
+ pmd = alloc_low_page();
+ paddr_last = phys_pmd_init(pmd, paddr, paddr_end,
+ page_size_mask, prot);
+
+ spin_lock(&init_mm.page_table_lock);
+ pud_populate(&init_mm, pud, pmd);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+
+ update_page_count(PG_LEVEL_1G, pages);
+
+ return paddr_last;
+}
+
+static unsigned long __meminit
+phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
+ unsigned long page_size_mask)
+{
+ unsigned long paddr_next, paddr_last = paddr_end;
+ unsigned long vaddr = (unsigned long)__va(paddr);
+ int i = p4d_index(vaddr);
+
+ if (!pgtable_l5_enabled())
+ return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
+
+ for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
+ p4d_t *p4d;
+ pud_t *pud;
+
+ vaddr = (unsigned long)__va(paddr);
+ p4d = p4d_page + p4d_index(vaddr);
+ paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
+
+ if (paddr >= paddr_end) {
+ if (!after_bootmem &&
+ !e820__mapped_any(paddr & P4D_MASK, paddr_next,
+ E820_TYPE_RAM) &&
+ !e820__mapped_any(paddr & P4D_MASK, paddr_next,
+ E820_TYPE_RESERVED_KERN))
+ set_p4d(p4d, __p4d(0));
+ continue;
+ }
+
+ if (!p4d_none(*p4d)) {
+ pud = pud_offset(p4d, 0);
+ paddr_last = phys_pud_init(pud, paddr,
+ paddr_end,
+ page_size_mask);
+ continue;
+ }
+
+ pud = alloc_low_page();
+ paddr_last = phys_pud_init(pud, paddr, paddr_end,
+ page_size_mask);
+
+ spin_lock(&init_mm.page_table_lock);
+ p4d_populate(&init_mm, p4d, pud);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+
+ return paddr_last;
+}
+
+/*
+ * Create page table mapping for the physical memory for specific physical
+ * addresses. The virtual and physical addresses have to be aligned on PMD level
+ * down. It returns the last physical address mapped.
+ */
+unsigned long __meminit
+kernel_physical_mapping_init(unsigned long paddr_start,
+ unsigned long paddr_end,
+ unsigned long page_size_mask)
+{
+ bool pgd_changed = false;
+ unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;
+
+ paddr_last = paddr_end;
+ vaddr = (unsigned long)__va(paddr_start);
+ vaddr_end = (unsigned long)__va(paddr_end);
+ vaddr_start = vaddr;
+
+ for (; vaddr < vaddr_end; vaddr = vaddr_next) {
+ pgd_t *pgd = pgd_offset_k(vaddr);
+ p4d_t *p4d;
+
+ vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
+
+ if (pgd_val(*pgd)) {
+ p4d = (p4d_t *)pgd_page_vaddr(*pgd);
+ paddr_last = phys_p4d_init(p4d, __pa(vaddr),
+ __pa(vaddr_end),
+ page_size_mask);
+ continue;
+ }
+
+ p4d = alloc_low_page();
+ paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
+ page_size_mask);
+
+ spin_lock(&init_mm.page_table_lock);
+ if (pgtable_l5_enabled())
+ pgd_populate(&init_mm, pgd, p4d);
+ else
+ p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
+ spin_unlock(&init_mm.page_table_lock);
+ pgd_changed = true;
+ }
+
+ if (pgd_changed)
+ sync_global_pgds(vaddr_start, vaddr_end - 1);
+
+ return paddr_last;
+}
+
+#ifndef CONFIG_NUMA
+void __init initmem_init(void)
+{
+ memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
+}
+#endif
+
+void __init paging_init(void)
+{
+ sparse_memory_present_with_active_regions(MAX_NUMNODES);
+ sparse_init();
+
+ /*
+ * clear the default setting with node 0
+ * note: don't use nodes_clear here, that is really clearing when
+ * numa support is not compiled in, and later node_set_state
+ * will not set it back.
+ */
+ node_clear_state(0, N_MEMORY);
+ if (N_MEMORY != N_NORMAL_MEMORY)
+ node_clear_state(0, N_NORMAL_MEMORY);
+
+ zone_sizes_init();
+}
+
+/*
+ * Memory hotplug specific functions
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
+ * updating.
+ */
+static void update_end_of_memory_vars(u64 start, u64 size)
+{
+ unsigned long end_pfn = PFN_UP(start + size);
+
+ if (end_pfn > max_pfn) {
+ max_pfn = end_pfn;
+ max_low_pfn = end_pfn;
+ high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+ }
+}
+
+int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
+ struct vmem_altmap *altmap, bool want_memblock)
+{
+ int ret;
+
+ ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ WARN_ON_ONCE(ret);
+
+ /* update max_pfn, max_low_pfn and high_memory */
+ update_end_of_memory_vars(start_pfn << PAGE_SHIFT,
+ nr_pages << PAGE_SHIFT);
+
+ return ret;
+}
+
+int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
+ bool want_memblock)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+
+ init_memory_mapping(start, start + size);
+
+ return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+}
+
+#define PAGE_INUSE 0xFD
+
+static void __meminit free_pagetable(struct page *page, int order)
+{
+ unsigned long magic;
+ unsigned int nr_pages = 1 << order;
+
+ /* bootmem page has reserved flag */
+ if (PageReserved(page)) {
+ __ClearPageReserved(page);
+
+ magic = (unsigned long)page->freelist;
+ if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+ while (nr_pages--)
+ put_page_bootmem(page++);
+ } else
+ while (nr_pages--)
+ free_reserved_page(page++);
+ } else
+ free_pages((unsigned long)page_address(page), order);
+}
+
+static void __meminit free_hugepage_table(struct page *page,
+ struct vmem_altmap *altmap)
+{
+ if (altmap)
+ vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE);
+ else
+ free_pagetable(page, get_order(PMD_SIZE));
+}
+
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+ pte_t *pte;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte = pte_start + i;
+ if (!pte_none(*pte))
+ return;
+ }
+
+ /* free a pte talbe */
+ free_pagetable(pmd_page(*pmd), 0);
+ spin_lock(&init_mm.page_table_lock);
+ pmd_clear(pmd);
+ spin_unlock(&init_mm.page_table_lock);
+}
+
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+ pmd_t *pmd;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd = pmd_start + i;
+ if (!pmd_none(*pmd))
+ return;
+ }
+
+ /* free a pmd talbe */
+ free_pagetable(pud_page(*pud), 0);
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+}
+
+static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
+{
+ pud_t *pud;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ pud = pud_start + i;
+ if (!pud_none(*pud))
+ return;
+ }
+
+ /* free a pud talbe */
+ free_pagetable(p4d_page(*p4d), 0);
+ spin_lock(&init_mm.page_table_lock);
+ p4d_clear(p4d);
+ spin_unlock(&init_mm.page_table_lock);
+}
+
+static void __meminit
+remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
+ bool direct)
+{
+ unsigned long next, pages = 0;
+ pte_t *pte;
+ void *page_addr;
+ phys_addr_t phys_addr;
+
+ pte = pte_start + pte_index(addr);
+ for (; addr < end; addr = next, pte++) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ if (next > end)
+ next = end;
+
+ if (!pte_present(*pte))
+ continue;
+
+ /*
+ * We mapped [0,1G) memory as identity mapping when
+ * initializing, in arch/x86/kernel/head_64.S. These
+ * pagetables cannot be removed.
+ */
+ phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
+ if (phys_addr < (phys_addr_t)0x40000000)
+ return;
+
+ if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
+ /*
+ * Do not free direct mapping pages since they were
+ * freed when offlining, or simplely not in use.
+ */
+ if (!direct)
+ free_pagetable(pte_page(*pte), 0);
+
+ spin_lock(&init_mm.page_table_lock);
+ pte_clear(&init_mm, addr, pte);
+ spin_unlock(&init_mm.page_table_lock);
+
+ /* For non-direct mapping, pages means nothing. */
+ pages++;
+ } else {
+ /*
+ * If we are here, we are freeing vmemmap pages since
+ * direct mapped memory ranges to be freed are aligned.
+ *
+ * If we are not removing the whole page, it means
+ * other page structs in this page are being used and
+ * we canot remove them. So fill the unused page_structs
+ * with 0xFD, and remove the page when it is wholly
+ * filled with 0xFD.
+ */
+ memset((void *)addr, PAGE_INUSE, next - addr);
+
+ page_addr = page_address(pte_page(*pte));
+ if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
+ free_pagetable(pte_page(*pte), 0);
+
+ spin_lock(&init_mm.page_table_lock);
+ pte_clear(&init_mm, addr, pte);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ }
+ }
+
+ /* Call free_pte_table() in remove_pmd_table(). */
+ flush_tlb_all();
+ if (direct)
+ update_page_count(PG_LEVEL_4K, -pages);
+}
+
+static void __meminit
+remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
+ bool direct, struct vmem_altmap *altmap)
+{
+ unsigned long next, pages = 0;
+ pte_t *pte_base;
+ pmd_t *pmd;
+ void *page_addr;
+
+ pmd = pmd_start + pmd_index(addr);
+ for (; addr < end; addr = next, pmd++) {
+ next = pmd_addr_end(addr, end);
+
+ if (!pmd_present(*pmd))
+ continue;
+
+ if (pmd_large(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE)) {
+ if (!direct)
+ free_hugepage_table(pmd_page(*pmd),
+ altmap);
+
+ spin_lock(&init_mm.page_table_lock);
+ pmd_clear(pmd);
+ spin_unlock(&init_mm.page_table_lock);
+ pages++;
+ } else {
+ /* If here, we are freeing vmemmap pages. */
+ memset((void *)addr, PAGE_INUSE, next - addr);
+
+ page_addr = page_address(pmd_page(*pmd));
+ if (!memchr_inv(page_addr, PAGE_INUSE,
+ PMD_SIZE)) {
+ free_hugepage_table(pmd_page(*pmd),
+ altmap);
+
+ spin_lock(&init_mm.page_table_lock);
+ pmd_clear(pmd);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ }
+
+ continue;
+ }
+
+ pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+ remove_pte_table(pte_base, addr, next, direct);
+ free_pte_table(pte_base, pmd);
+ }
+
+ /* Call free_pmd_table() in remove_pud_table(). */
+ if (direct)
+ update_page_count(PG_LEVEL_2M, -pages);
+}
+
+static void __meminit
+remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
+ struct vmem_altmap *altmap, bool direct)
+{
+ unsigned long next, pages = 0;
+ pmd_t *pmd_base;
+ pud_t *pud;
+ void *page_addr;
+
+ pud = pud_start + pud_index(addr);
+ for (; addr < end; addr = next, pud++) {
+ next = pud_addr_end(addr, end);
+
+ if (!pud_present(*pud))
+ continue;
+
+ if (pud_large(*pud)) {
+ if (IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE)) {
+ if (!direct)
+ free_pagetable(pud_page(*pud),
+ get_order(PUD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+ pages++;
+ } else {
+ /* If here, we are freeing vmemmap pages. */
+ memset((void *)addr, PAGE_INUSE, next - addr);
+
+ page_addr = page_address(pud_page(*pud));
+ if (!memchr_inv(page_addr, PAGE_INUSE,
+ PUD_SIZE)) {
+ free_pagetable(pud_page(*pud),
+ get_order(PUD_SIZE));
+
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+ }
+ }
+
+ continue;
+ }
+
+ pmd_base = pmd_offset(pud, 0);
+ remove_pmd_table(pmd_base, addr, next, direct, altmap);
+ free_pmd_table(pmd_base, pud);
+ }
+
+ if (direct)
+ update_page_count(PG_LEVEL_1G, -pages);
+}
+
+static void __meminit
+remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
+ struct vmem_altmap *altmap, bool direct)
+{
+ unsigned long next, pages = 0;
+ pud_t *pud_base;
+ p4d_t *p4d;
+
+ p4d = p4d_start + p4d_index(addr);
+ for (; addr < end; addr = next, p4d++) {
+ next = p4d_addr_end(addr, end);
+
+ if (!p4d_present(*p4d))
+ continue;
+
+ BUILD_BUG_ON(p4d_large(*p4d));
+
+ pud_base = pud_offset(p4d, 0);
+ remove_pud_table(pud_base, addr, next, altmap, direct);
+ /*
+ * For 4-level page tables we do not want to free PUDs, but in the
+ * 5-level case we should free them. This code will have to change
+ * to adapt for boot-time switching between 4 and 5 level page tables.
+ */
+ if (pgtable_l5_enabled())
+ free_pud_table(pud_base, p4d);
+ }
+
+ if (direct)
+ update_page_count(PG_LEVEL_512G, -pages);
+}
+
+/* start and end are both virtual address. */
+static void __meminit
+remove_pagetable(unsigned long start, unsigned long end, bool direct,
+ struct vmem_altmap *altmap)
+{
+ unsigned long next;
+ unsigned long addr;
+ pgd_t *pgd;
+ p4d_t *p4d;
+
+ for (addr = start; addr < end; addr = next) {
+ next = pgd_addr_end(addr, end);
+
+ pgd = pgd_offset_k(addr);
+ if (!pgd_present(*pgd))
+ continue;
+
+ p4d = p4d_offset(pgd, 0);
+ remove_p4d_table(p4d, addr, next, altmap, direct);
+ }
+
+ flush_tlb_all();
+}
+
+void __ref vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
+{
+ remove_pagetable(start, end, false, altmap);
+}
+
+static void __meminit
+kernel_physical_mapping_remove(unsigned long start, unsigned long end)
+{
+ start = (unsigned long)__va(start);
+ end = (unsigned long)__va(end);
+
+ remove_pagetable(start, end, true, NULL);
+}
+
+void __ref arch_remove_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+
+ __remove_pages(start_pfn, nr_pages, altmap);
+ kernel_physical_mapping_remove(start, start + size);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+static struct kcore_list kcore_vsyscall;
+
+static void __init register_page_bootmem_info(void)
+{
+#ifdef CONFIG_NUMA
+ int i;
+
+ for_each_online_node(i)
+ register_page_bootmem_info_node(NODE_DATA(i));
+#endif
+}
+
+void __init mem_init(void)
+{
+ pci_iommu_alloc();
+
+ /* clear_bss() already clear the empty_zero_page */
+
+ /* this will put all memory onto the freelists */
+ free_all_bootmem();
+ after_bootmem = 1;
+ x86_init.hyper.init_after_bootmem();
+
+ /*
+ * Must be done after boot memory is put on freelist, because here we
+ * might set fields in deferred struct pages that have not yet been
+ * initialized, and free_all_bootmem() initializes all the reserved
+ * deferred pages for us.
+ */
+ register_page_bootmem_info();
+
+ /* Register memory areas for /proc/kcore */
+ if (get_gate_vma(&init_mm))
+ kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);
+
+ mem_init_print_info(NULL);
+}
+
+int kernel_set_to_readonly;
+
+void set_kernel_text_rw(void)
+{
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long end = PFN_ALIGN(__stop___ex_table);
+
+ if (!kernel_set_to_readonly)
+ return;
+
+ pr_debug("Set kernel text: %lx - %lx for read write\n",
+ start, end);
+
+ /*
+ * Make the kernel identity mapping for text RW. Kernel text
+ * mapping will always be RO. Refer to the comment in
+ * static_protections() in pageattr.c
+ */
+ set_memory_rw(start, (end - start) >> PAGE_SHIFT);
+}
+
+void set_kernel_text_ro(void)
+{
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long end = PFN_ALIGN(__stop___ex_table);
+
+ if (!kernel_set_to_readonly)
+ return;
+
+ pr_debug("Set kernel text: %lx - %lx for read only\n",
+ start, end);
+
+ /*
+ * Set the kernel identity mapping for text RO.
+ */
+ set_memory_ro(start, (end - start) >> PAGE_SHIFT);
+}
+
+void mark_rodata_ro(void)
+{
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long rodata_start = PFN_ALIGN(__start_rodata);
+ unsigned long end = (unsigned long) &__end_rodata_hpage_align;
+ unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
+ unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
+ unsigned long all_end;
+
+ printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
+ (end - start) >> 10);
+ set_memory_ro(start, (end - start) >> PAGE_SHIFT);
+
+ kernel_set_to_readonly = 1;
+
+ /*
+ * The rodata/data/bss/brk section (but not the kernel text!)
+ * should also be not-executable.
+ *
+ * We align all_end to PMD_SIZE because the existing mapping
+ * is a full PMD. If we would align _brk_end to PAGE_SIZE we
+ * split the PMD and the reminder between _brk_end and the end
+ * of the PMD will remain mapped executable.
+ *
+ * Any PMD which was setup after the one which covers _brk_end
+ * has been zapped already via cleanup_highmem().
+ */
+ all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
+ set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT);
+
+#ifdef CONFIG_CPA_DEBUG
+ printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
+ set_memory_rw(start, (end-start) >> PAGE_SHIFT);
+
+ printk(KERN_INFO "Testing CPA: again\n");
+ set_memory_ro(start, (end-start) >> PAGE_SHIFT);
+#endif
+
+ free_kernel_image_pages((void *)text_end, (void *)rodata_start);
+ free_kernel_image_pages((void *)rodata_end, (void *)_sdata);
+
+ debug_checkwx();
+}
+
+int kern_addr_valid(unsigned long addr)
+{
+ unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ if (above != 0 && above != -1UL)
+ return 0;
+
+ pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd))
+ return 0;
+
+ p4d = p4d_offset(pgd, addr);
+ if (!p4d_present(*p4d))
+ return 0;
+
+ pud = pud_offset(p4d, addr);
+ if (!pud_present(*pud))
+ return 0;
+
+ if (pud_large(*pud))
+ return pfn_valid(pud_pfn(*pud));
+
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd))
+ return 0;
+
+ if (pmd_large(*pmd))
+ return pfn_valid(pmd_pfn(*pmd));
+
+ pte = pte_offset_kernel(pmd, addr);
+ if (pte_none(*pte))
+ return 0;
+
+ return pfn_valid(pte_pfn(*pte));
+}
+
+/*
+ * Block size is the minimum amount of memory which can be hotplugged or
+ * hotremoved. It must be power of two and must be equal or larger than
+ * MIN_MEMORY_BLOCK_SIZE.
+ */
+#define MAX_BLOCK_SIZE (2UL << 30)
+
+/* Amount of ram needed to start using large blocks */
+#define MEM_SIZE_FOR_LARGE_BLOCK (64UL << 30)
+
+/* Adjustable memory block size */
+static unsigned long set_memory_block_size;
+int __init set_memory_block_size_order(unsigned int order)
+{
+ unsigned long size = 1UL << order;
+
+ if (size > MEM_SIZE_FOR_LARGE_BLOCK || size < MIN_MEMORY_BLOCK_SIZE)
+ return -EINVAL;
+
+ set_memory_block_size = size;
+ return 0;
+}
+
+static unsigned long probe_memory_block_size(void)
+{
+ unsigned long boot_mem_end = max_pfn << PAGE_SHIFT;
+ unsigned long bz;
+
+ /* If memory block size has been set, then use it */
+ bz = set_memory_block_size;
+ if (bz)
+ goto done;
+
+ /* Use regular block if RAM is smaller than MEM_SIZE_FOR_LARGE_BLOCK */
+ if (boot_mem_end < MEM_SIZE_FOR_LARGE_BLOCK) {
+ bz = MIN_MEMORY_BLOCK_SIZE;
+ goto done;
+ }
+
+ /* Find the largest allowed block size that aligns to memory end */
+ for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) {
+ if (IS_ALIGNED(boot_mem_end, bz))
+ break;
+ }
+done:
+ pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20);
+
+ return bz;
+}
+
+static unsigned long memory_block_size_probed;
+unsigned long memory_block_size_bytes(void)
+{
+ if (!memory_block_size_probed)
+ memory_block_size_probed = probe_memory_block_size();
+
+ return memory_block_size_probed;
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/*
+ * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
+ */
+static long __meminitdata addr_start, addr_end;
+static void __meminitdata *p_start, *p_end;
+static int __meminitdata node_start;
+
+static int __meminit vmemmap_populate_hugepages(unsigned long start,
+ unsigned long end, int node, struct vmem_altmap *altmap)
+{
+ unsigned long addr;
+ unsigned long next;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ for (addr = start; addr < end; addr = next) {
+ next = pmd_addr_end(addr, end);
+
+ pgd = vmemmap_pgd_populate(addr, node);
+ if (!pgd)
+ return -ENOMEM;
+
+ p4d = vmemmap_p4d_populate(pgd, addr, node);
+ if (!p4d)
+ return -ENOMEM;
+
+ pud = vmemmap_pud_populate(p4d, addr, node);
+ if (!pud)
+ return -ENOMEM;
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd)) {
+ void *p;
+
+ if (altmap)
+ p = altmap_alloc_block_buf(PMD_SIZE, altmap);
+ else
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node);
+ if (p) {
+ pte_t entry;
+
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
+ PAGE_KERNEL_LARGE);
+ set_pmd(pmd, __pmd(pte_val(entry)));
+
+ /* check to see if we have contiguous blocks */
+ if (p_end != p || node_start != node) {
+ if (p_start)
+ pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ addr_start, addr_end-1, p_start, p_end-1, node_start);
+ addr_start = addr;
+ node_start = node;
+ p_start = p;
+ }
+
+ addr_end = addr + PMD_SIZE;
+ p_end = p + PMD_SIZE;
+ continue;
+ } else if (altmap)
+ return -ENOMEM; /* no fallback */
+ } else if (pmd_large(*pmd)) {
+ vmemmap_verify((pte_t *)pmd, node, addr, next);
+ continue;
+ }
+ if (vmemmap_populate_basepages(addr, next, node))
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
+ struct vmem_altmap *altmap)
+{
+ int err;
+
+ if (boot_cpu_has(X86_FEATURE_PSE))
+ err = vmemmap_populate_hugepages(start, end, node, altmap);
+ else if (altmap) {
+ pr_err_once("%s: no cpu support for altmap allocations\n",
+ __func__);
+ err = -ENOMEM;
+ } else
+ err = vmemmap_populate_basepages(start, end, node);
+ if (!err)
+ sync_global_pgds(start, end - 1);
+ return err;
+}
+
+#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
+void register_page_bootmem_memmap(unsigned long section_nr,
+ struct page *start_page, unsigned long nr_pages)
+{
+ unsigned long addr = (unsigned long)start_page;
+ unsigned long end = (unsigned long)(start_page + nr_pages);
+ unsigned long next;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ unsigned int nr_pmd_pages;
+ struct page *page;
+
+ for (; addr < end; addr = next) {
+ pte_t *pte = NULL;
+
+ pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd)) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ continue;
+ }
+ get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
+
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d)) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ continue;
+ }
+ get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO);
+
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud)) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ continue;
+ }
+ get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
+
+ if (!boot_cpu_has(X86_FEATURE_PSE)) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ continue;
+ get_page_bootmem(section_nr, pmd_page(*pmd),
+ MIX_SECTION_INFO);
+
+ pte = pte_offset_kernel(pmd, addr);
+ if (pte_none(*pte))
+ continue;
+ get_page_bootmem(section_nr, pte_page(*pte),
+ SECTION_INFO);
+ } else {
+ next = pmd_addr_end(addr, end);
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ continue;
+
+ nr_pmd_pages = 1 << get_order(PMD_SIZE);
+ page = pmd_page(*pmd);
+ while (nr_pmd_pages--)
+ get_page_bootmem(section_nr, page++,
+ SECTION_INFO);
+ }
+ }
+}
+#endif
+
+void __meminit vmemmap_populate_print_last(void)
+{
+ if (p_start) {
+ pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ addr_start, addr_end-1, p_start, p_end-1, node_start);
+ p_start = NULL;
+ p_end = NULL;
+ node_start = 0;
+ }
+}
+#endif
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
new file mode 100644
index 000000000..b3294d367
--- /dev/null
+++ b/arch/x86/mm/iomap_32.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright © 2008 Ingo Molnar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <asm/iomap.h>
+#include <asm/pat.h>
+#include <linux/export.h>
+#include <linux/highmem.h>
+
+static int is_io_mapping_possible(resource_size_t base, unsigned long size)
+{
+#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT)
+ /* There is no way to map greater than 1 << 32 address without PAE */
+ if (base + size > 0x100000000ULL)
+ return 0;
+#endif
+ return 1;
+}
+
+int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
+{
+ enum page_cache_mode pcm = _PAGE_CACHE_MODE_WC;
+ int ret;
+
+ if (!is_io_mapping_possible(base, size))
+ return -EINVAL;
+
+ ret = io_reserve_memtype(base, base + size, &pcm);
+ if (ret)
+ return ret;
+
+ *prot = __pgprot(__PAGE_KERNEL | cachemode2protval(pcm));
+ /* Filter out unsupported __PAGE_KERNEL* bits: */
+ pgprot_val(*prot) &= __default_kernel_pte_mask;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_create_wc);
+
+void iomap_free(resource_size_t base, unsigned long size)
+{
+ io_free_memtype(base, base + size);
+}
+EXPORT_SYMBOL_GPL(iomap_free);
+
+void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
+{
+ unsigned long vaddr;
+ int idx, type;
+
+ preempt_disable();
+ pagefault_disable();
+
+ type = kmap_atomic_idx_push();
+ idx = type + KM_TYPE_NR * smp_processor_id();
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
+ arch_flush_lazy_mmu_mode();
+
+ return (void *)vaddr;
+}
+
+/*
+ * Map 'pfn' using protections 'prot'
+ */
+void __iomem *
+iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
+{
+ /*
+ * For non-PAT systems, translate non-WB request to UC- just in
+ * case the caller set the PWT bit to prot directly without using
+ * pgprot_writecombine(). UC- translates to uncached if the MTRR
+ * is UC or WC. UC- gets the real intention, of the user, which is
+ * "WC if the MTRR is WC, UC if you can't do that."
+ */
+ if (!pat_enabled() && pgprot2cachemode(prot) != _PAGE_CACHE_MODE_WB)
+ prot = __pgprot(__PAGE_KERNEL |
+ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS));
+
+ /* Filter out unsupported __PAGE_KERNEL* bits: */
+ pgprot_val(prot) &= __default_kernel_pte_mask;
+
+ return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot);
+}
+EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
+
+void
+iounmap_atomic(void __iomem *kvaddr)
+{
+ unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
+
+ if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
+ vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
+ int idx, type;
+
+ type = kmap_atomic_idx();
+ idx = type + KM_TYPE_NR * smp_processor_id();
+
+#ifdef CONFIG_DEBUG_HIGHMEM
+ WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
+#endif
+ /*
+ * Force other mappings to Oops if they'll try to access this
+ * pte without first remap it. Keeping stale mappings around
+ * is a bad idea also, in case the page changes cacheability
+ * attributes or becomes a protected page in a hypervisor.
+ */
+ kpte_clear_flush(kmap_pte-idx, vaddr);
+ kmap_atomic_idx_pop();
+ }
+
+ pagefault_enable();
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
new file mode 100644
index 000000000..adc77904f
--- /dev/null
+++ b/arch/x86/mm/ioremap.c
@@ -0,0 +1,827 @@
+/*
+ * Re-map IO memory to kernel address space so that we can access it.
+ * This is needed for high PCI addresses that aren't mapped in the
+ * 640k-1MB IO memory area on PC's
+ *
+ * (C) Copyright 1995 1996 Linus Torvalds
+ */
+
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mmiotrace.h>
+#include <linux/mem_encrypt.h>
+#include <linux/efi.h>
+
+#include <asm/set_memory.h>
+#include <asm/e820/api.h>
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
+#include <asm/pat.h>
+#include <asm/setup.h>
+
+#include "physaddr.h"
+
+struct ioremap_mem_flags {
+ bool system_ram;
+ bool desc_other;
+};
+
+/*
+ * Fix up the linear direct mapping of the kernel to avoid cache attribute
+ * conflicts.
+ */
+int ioremap_change_attr(unsigned long vaddr, unsigned long size,
+ enum page_cache_mode pcm)
+{
+ unsigned long nrpages = size >> PAGE_SHIFT;
+ int err;
+
+ switch (pcm) {
+ case _PAGE_CACHE_MODE_UC:
+ default:
+ err = _set_memory_uc(vaddr, nrpages);
+ break;
+ case _PAGE_CACHE_MODE_WC:
+ err = _set_memory_wc(vaddr, nrpages);
+ break;
+ case _PAGE_CACHE_MODE_WT:
+ err = _set_memory_wt(vaddr, nrpages);
+ break;
+ case _PAGE_CACHE_MODE_WB:
+ err = _set_memory_wb(vaddr, nrpages);
+ break;
+ }
+
+ return err;
+}
+
+static bool __ioremap_check_ram(struct resource *res)
+{
+ unsigned long start_pfn, stop_pfn;
+ unsigned long i;
+
+ if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM)
+ return false;
+
+ start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ stop_pfn = (res->end + 1) >> PAGE_SHIFT;
+ if (stop_pfn > start_pfn) {
+ for (i = 0; i < (stop_pfn - start_pfn); ++i)
+ if (pfn_valid(start_pfn + i) &&
+ !PageReserved(pfn_to_page(start_pfn + i)))
+ return true;
+ }
+
+ return false;
+}
+
+static int __ioremap_check_desc_other(struct resource *res)
+{
+ return (res->desc != IORES_DESC_NONE);
+}
+
+static int __ioremap_res_check(struct resource *res, void *arg)
+{
+ struct ioremap_mem_flags *flags = arg;
+
+ if (!flags->system_ram)
+ flags->system_ram = __ioremap_check_ram(res);
+
+ if (!flags->desc_other)
+ flags->desc_other = __ioremap_check_desc_other(res);
+
+ return flags->system_ram && flags->desc_other;
+}
+
+/*
+ * To avoid multiple resource walks, this function walks resources marked as
+ * IORESOURCE_MEM and IORESOURCE_BUSY and looking for system RAM and/or a
+ * resource described not as IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES).
+ */
+static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
+ struct ioremap_mem_flags *flags)
+{
+ u64 start, end;
+
+ start = (u64)addr;
+ end = start + size - 1;
+ memset(flags, 0, sizeof(*flags));
+
+ walk_mem_res(start, end, flags, __ioremap_res_check);
+}
+
+/*
+ * Remap an arbitrary physical address space into the kernel virtual
+ * address space. It transparently creates kernel huge I/O mapping when
+ * the physical address is aligned by a huge page size (1GB or 2MB) and
+ * the requested size is at least the huge page size.
+ *
+ * NOTE: MTRRs can override PAT memory types with a 4KB granularity.
+ * Therefore, the mapping code falls back to use a smaller page toward 4KB
+ * when a mapping range is covered by non-WB type of MTRRs.
+ *
+ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
+ * have to convert them into an offset in a page-aligned mapping, but the
+ * caller shouldn't need to know that small detail.
+ */
+static void __iomem *__ioremap_caller(resource_size_t phys_addr,
+ unsigned long size, enum page_cache_mode pcm, void *caller)
+{
+ unsigned long offset, vaddr;
+ resource_size_t last_addr;
+ const resource_size_t unaligned_phys_addr = phys_addr;
+ const unsigned long unaligned_size = size;
+ struct ioremap_mem_flags mem_flags;
+ struct vm_struct *area;
+ enum page_cache_mode new_pcm;
+ pgprot_t prot;
+ int retval;
+ void __iomem *ret_addr;
+
+ /* Don't allow wraparound or zero size */
+ last_addr = phys_addr + size - 1;
+ if (!size || last_addr < phys_addr)
+ return NULL;
+
+ if (!phys_addr_valid(phys_addr)) {
+ printk(KERN_WARNING "ioremap: invalid physical address %llx\n",
+ (unsigned long long)phys_addr);
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+
+ __ioremap_check_mem(phys_addr, size, &mem_flags);
+
+ /*
+ * Don't allow anybody to remap normal RAM that we're using..
+ */
+ if (mem_flags.system_ram) {
+ WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n",
+ &phys_addr, &last_addr);
+ return NULL;
+ }
+
+ /*
+ * Mappings have to be page-aligned
+ */
+ offset = phys_addr & ~PAGE_MASK;
+ phys_addr &= PHYSICAL_PAGE_MASK;
+ size = PAGE_ALIGN(last_addr+1) - phys_addr;
+
+ retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
+ pcm, &new_pcm);
+ if (retval) {
+ printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval);
+ return NULL;
+ }
+
+ if (pcm != new_pcm) {
+ if (!is_new_memtype_allowed(phys_addr, size, pcm, new_pcm)) {
+ printk(KERN_ERR
+ "ioremap error for 0x%llx-0x%llx, requested 0x%x, got 0x%x\n",
+ (unsigned long long)phys_addr,
+ (unsigned long long)(phys_addr + size),
+ pcm, new_pcm);
+ goto err_free_memtype;
+ }
+ pcm = new_pcm;
+ }
+
+ /*
+ * If the page being mapped is in memory and SEV is active then
+ * make sure the memory encryption attribute is enabled in the
+ * resulting mapping.
+ */
+ prot = PAGE_KERNEL_IO;
+ if (sev_active() && mem_flags.desc_other)
+ prot = pgprot_encrypted(prot);
+
+ switch (pcm) {
+ case _PAGE_CACHE_MODE_UC:
+ default:
+ prot = __pgprot(pgprot_val(prot) |
+ cachemode2protval(_PAGE_CACHE_MODE_UC));
+ break;
+ case _PAGE_CACHE_MODE_UC_MINUS:
+ prot = __pgprot(pgprot_val(prot) |
+ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS));
+ break;
+ case _PAGE_CACHE_MODE_WC:
+ prot = __pgprot(pgprot_val(prot) |
+ cachemode2protval(_PAGE_CACHE_MODE_WC));
+ break;
+ case _PAGE_CACHE_MODE_WT:
+ prot = __pgprot(pgprot_val(prot) |
+ cachemode2protval(_PAGE_CACHE_MODE_WT));
+ break;
+ case _PAGE_CACHE_MODE_WB:
+ break;
+ }
+
+ /*
+ * Ok, go for it..
+ */
+ area = get_vm_area_caller(size, VM_IOREMAP, caller);
+ if (!area)
+ goto err_free_memtype;
+ area->phys_addr = phys_addr;
+ vaddr = (unsigned long) area->addr;
+
+ if (kernel_map_sync_memtype(phys_addr, size, pcm))
+ goto err_free_area;
+
+ if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot))
+ goto err_free_area;
+
+ ret_addr = (void __iomem *) (vaddr + offset);
+ mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
+
+ /*
+ * Check if the request spans more than any BAR in the iomem resource
+ * tree.
+ */
+ if (iomem_map_sanity_check(unaligned_phys_addr, unaligned_size))
+ pr_warn("caller %pS mapping multiple BARs\n", caller);
+
+ return ret_addr;
+err_free_area:
+ free_vm_area(area);
+err_free_memtype:
+ free_memtype(phys_addr, phys_addr + size);
+ return NULL;
+}
+
+/**
+ * ioremap_nocache - map bus memory into CPU space
+ * @phys_addr: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * ioremap_nocache performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * This version of ioremap ensures that the memory is marked uncachable
+ * on the CPU as well as honouring existing caching rules from things like
+ * the PCI bus. Note that there are other caches and buffers on many
+ * busses. In particular driver authors should read up on PCI writes
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
+{
+ /*
+ * Ideally, this should be:
+ * pat_enabled() ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS;
+ *
+ * Till we fix all X drivers to use ioremap_wc(), we will use
+ * UC MINUS. Drivers that are certain they need or can already
+ * be converted over to strong UC can use ioremap_uc().
+ */
+ enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
+
+ return __ioremap_caller(phys_addr, size, pcm,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_nocache);
+
+/**
+ * ioremap_uc - map bus memory into CPU space as strongly uncachable
+ * @phys_addr: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * ioremap_uc performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * This version of ioremap ensures that the memory is marked with a strong
+ * preference as completely uncachable on the CPU when possible. For non-PAT
+ * systems this ends up setting page-attribute flags PCD=1, PWT=1. For PAT
+ * systems this will set the PAT entry for the pages as strong UC. This call
+ * will honor existing caching rules from things like the PCI bus. Note that
+ * there are other caches and buffers on many busses. In particular driver
+ * authors should read up on PCI writes.
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size)
+{
+ enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
+
+ return __ioremap_caller(phys_addr, size, pcm,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL_GPL(ioremap_uc);
+
+/**
+ * ioremap_wc - map memory into CPU space write combined
+ * @phys_addr: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * This version of ioremap ensures that the memory is marked write combining.
+ * Write combining allows faster writes to some hardware devices.
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
+{
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_wc);
+
+/**
+ * ioremap_wt - map memory into CPU space write through
+ * @phys_addr: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * This version of ioremap ensures that the memory is marked write through.
+ * Write through stores data into memory while keeping the cache up-to-date.
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
+{
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_wt);
+
+void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
+{
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_cache);
+
+void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
+ unsigned long prot_val)
+{
+ return __ioremap_caller(phys_addr, size,
+ pgprot2cachemode(__pgprot(prot_val)),
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_prot);
+
+/**
+ * iounmap - Free a IO remapping
+ * @addr: virtual address from ioremap_*
+ *
+ * Caller must ensure there is only one unmapping for the same pointer.
+ */
+void iounmap(volatile void __iomem *addr)
+{
+ struct vm_struct *p, *o;
+
+ if ((void __force *)addr <= high_memory)
+ return;
+
+ /*
+ * The PCI/ISA range special-casing was removed from __ioremap()
+ * so this check, in theory, can be removed. However, there are
+ * cases where iounmap() is called for addresses not obtained via
+ * ioremap() (vga16fb for example). Add a warning so that these
+ * cases can be caught and fixed.
+ */
+ if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
+ (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) {
+ WARN(1, "iounmap() called for ISA range not obtained using ioremap()\n");
+ return;
+ }
+
+ mmiotrace_iounmap(addr);
+
+ addr = (volatile void __iomem *)
+ (PAGE_MASK & (unsigned long __force)addr);
+
+ /* Use the vm area unlocked, assuming the caller
+ ensures there isn't another iounmap for the same address
+ in parallel. Reuse of the virtual address is prevented by
+ leaving it in the global lists until we're done with it.
+ cpa takes care of the direct mappings. */
+ p = find_vm_area((void __force *)addr);
+
+ if (!p) {
+ printk(KERN_ERR "iounmap: bad address %p\n", addr);
+ dump_stack();
+ return;
+ }
+
+ free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
+
+ /* Finally remove it */
+ o = remove_vm_area((void __force *)addr);
+ BUG_ON(p != o || o == NULL);
+ kfree(p);
+}
+EXPORT_SYMBOL(iounmap);
+
+int __init arch_ioremap_pud_supported(void)
+{
+#ifdef CONFIG_X86_64
+ return boot_cpu_has(X86_FEATURE_GBPAGES);
+#else
+ return 0;
+#endif
+}
+
+int __init arch_ioremap_pmd_supported(void)
+{
+ return boot_cpu_has(X86_FEATURE_PSE);
+}
+
+/*
+ * Convert a physical pointer to a virtual kernel pointer for /dev/mem
+ * access
+ */
+void *xlate_dev_mem_ptr(phys_addr_t phys)
+{
+ unsigned long start = phys & PAGE_MASK;
+ unsigned long offset = phys & ~PAGE_MASK;
+ void *vaddr;
+
+ /* memremap() maps if RAM, otherwise falls back to ioremap() */
+ vaddr = memremap(start, PAGE_SIZE, MEMREMAP_WB);
+
+ /* Only add the offset on success and return NULL if memremap() failed */
+ if (vaddr)
+ vaddr += offset;
+
+ return vaddr;
+}
+
+void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
+{
+ memunmap((void *)((unsigned long)addr & PAGE_MASK));
+}
+
+/*
+ * Examine the physical address to determine if it is an area of memory
+ * that should be mapped decrypted. If the memory is not part of the
+ * kernel usable area it was accessed and created decrypted, so these
+ * areas should be mapped decrypted. And since the encryption key can
+ * change across reboots, persistent memory should also be mapped
+ * decrypted.
+ *
+ * If SEV is active, that implies that BIOS/UEFI also ran encrypted so
+ * only persistent memory should be mapped decrypted.
+ */
+static bool memremap_should_map_decrypted(resource_size_t phys_addr,
+ unsigned long size)
+{
+ int is_pmem;
+
+ /*
+ * Check if the address is part of a persistent memory region.
+ * This check covers areas added by E820, EFI and ACPI.
+ */
+ is_pmem = region_intersects(phys_addr, size, IORESOURCE_MEM,
+ IORES_DESC_PERSISTENT_MEMORY);
+ if (is_pmem != REGION_DISJOINT)
+ return true;
+
+ /*
+ * Check if the non-volatile attribute is set for an EFI
+ * reserved area.
+ */
+ if (efi_enabled(EFI_BOOT)) {
+ switch (efi_mem_type(phys_addr)) {
+ case EFI_RESERVED_TYPE:
+ if (efi_mem_attributes(phys_addr) & EFI_MEMORY_NV)
+ return true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ /* Check if the address is outside kernel usable area */
+ switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) {
+ case E820_TYPE_RESERVED:
+ case E820_TYPE_ACPI:
+ case E820_TYPE_NVS:
+ case E820_TYPE_UNUSABLE:
+ /* For SEV, these areas are encrypted */
+ if (sev_active())
+ break;
+ /* Fallthrough */
+
+ case E820_TYPE_PRAM:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+/*
+ * Examine the physical address to determine if it is EFI data. Check
+ * it against the boot params structure and EFI tables and memory types.
+ */
+static bool memremap_is_efi_data(resource_size_t phys_addr,
+ unsigned long size)
+{
+ u64 paddr;
+
+ /* Check if the address is part of EFI boot/runtime data */
+ if (!efi_enabled(EFI_BOOT))
+ return false;
+
+ paddr = boot_params.efi_info.efi_memmap_hi;
+ paddr <<= 32;
+ paddr |= boot_params.efi_info.efi_memmap;
+ if (phys_addr == paddr)
+ return true;
+
+ paddr = boot_params.efi_info.efi_systab_hi;
+ paddr <<= 32;
+ paddr |= boot_params.efi_info.efi_systab;
+ if (phys_addr == paddr)
+ return true;
+
+ if (efi_is_table_address(phys_addr))
+ return true;
+
+ switch (efi_mem_type(phys_addr)) {
+ case EFI_BOOT_SERVICES_DATA:
+ case EFI_RUNTIME_SERVICES_DATA:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+/*
+ * Examine the physical address to determine if it is boot data by checking
+ * it against the boot params setup_data chain.
+ */
+static bool memremap_is_setup_data(resource_size_t phys_addr,
+ unsigned long size)
+{
+ struct setup_data *data;
+ u64 paddr, paddr_next;
+
+ paddr = boot_params.hdr.setup_data;
+ while (paddr) {
+ unsigned int len;
+
+ if (phys_addr == paddr)
+ return true;
+
+ data = memremap(paddr, sizeof(*data),
+ MEMREMAP_WB | MEMREMAP_DEC);
+
+ paddr_next = data->next;
+ len = data->len;
+
+ memunmap(data);
+
+ if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
+ return true;
+
+ paddr = paddr_next;
+ }
+
+ return false;
+}
+
+/*
+ * Examine the physical address to determine if it is boot data by checking
+ * it against the boot params setup_data chain (early boot version).
+ */
+static bool __init early_memremap_is_setup_data(resource_size_t phys_addr,
+ unsigned long size)
+{
+ struct setup_data *data;
+ u64 paddr, paddr_next;
+
+ paddr = boot_params.hdr.setup_data;
+ while (paddr) {
+ unsigned int len;
+
+ if (phys_addr == paddr)
+ return true;
+
+ data = early_memremap_decrypted(paddr, sizeof(*data));
+
+ paddr_next = data->next;
+ len = data->len;
+
+ early_memunmap(data, sizeof(*data));
+
+ if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
+ return true;
+
+ paddr = paddr_next;
+ }
+
+ return false;
+}
+
+/*
+ * Architecture function to determine if RAM remap is allowed. By default, a
+ * RAM remap will map the data as encrypted. Determine if a RAM remap should
+ * not be done so that the data will be mapped decrypted.
+ */
+bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
+ unsigned long flags)
+{
+ if (!mem_encrypt_active())
+ return true;
+
+ if (flags & MEMREMAP_ENC)
+ return true;
+
+ if (flags & MEMREMAP_DEC)
+ return false;
+
+ if (sme_active()) {
+ if (memremap_is_setup_data(phys_addr, size) ||
+ memremap_is_efi_data(phys_addr, size))
+ return false;
+ }
+
+ return !memremap_should_map_decrypted(phys_addr, size);
+}
+
+/*
+ * Architecture override of __weak function to adjust the protection attributes
+ * used when remapping memory. By default, early_memremap() will map the data
+ * as encrypted. Determine if an encrypted mapping should not be done and set
+ * the appropriate protection attributes.
+ */
+pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
+ unsigned long size,
+ pgprot_t prot)
+{
+ bool encrypted_prot;
+
+ if (!mem_encrypt_active())
+ return prot;
+
+ encrypted_prot = true;
+
+ if (sme_active()) {
+ if (early_memremap_is_setup_data(phys_addr, size) ||
+ memremap_is_efi_data(phys_addr, size))
+ encrypted_prot = false;
+ }
+
+ if (encrypted_prot && memremap_should_map_decrypted(phys_addr, size))
+ encrypted_prot = false;
+
+ return encrypted_prot ? pgprot_encrypted(prot)
+ : pgprot_decrypted(prot);
+}
+
+bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size)
+{
+ return arch_memremap_can_ram_remap(phys_addr, size, 0);
+}
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+/* Remap memory with encryption */
+void __init *early_memremap_encrypted(resource_size_t phys_addr,
+ unsigned long size)
+{
+ return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC);
+}
+
+/*
+ * Remap memory with encryption and write-protected - cannot be called
+ * before pat_init() is called
+ */
+void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
+ unsigned long size)
+{
+ /* Be sure the write-protect PAT entry is set for write-protect */
+ if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
+ return NULL;
+
+ return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP);
+}
+
+/* Remap memory without encryption */
+void __init *early_memremap_decrypted(resource_size_t phys_addr,
+ unsigned long size)
+{
+ return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC);
+}
+
+/*
+ * Remap memory without encryption and write-protected - cannot be called
+ * before pat_init() is called
+ */
+void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
+ unsigned long size)
+{
+ /* Be sure the write-protect PAT entry is set for write-protect */
+ if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
+ return NULL;
+
+ return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP);
+}
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
+static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
+
+static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
+{
+ /* Don't assume we're using swapper_pg_dir at this point */
+ pgd_t *base = __va(read_cr3_pa());
+ pgd_t *pgd = &base[pgd_index(addr)];
+ p4d_t *p4d = p4d_offset(pgd, addr);
+ pud_t *pud = pud_offset(p4d, addr);
+ pmd_t *pmd = pmd_offset(pud, addr);
+
+ return pmd;
+}
+
+static inline pte_t * __init early_ioremap_pte(unsigned long addr)
+{
+ return &bm_pte[pte_index(addr)];
+}
+
+bool __init is_early_ioremap_ptep(pte_t *ptep)
+{
+ return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)];
+}
+
+void __init early_ioremap_init(void)
+{
+ pmd_t *pmd;
+
+#ifdef CONFIG_X86_64
+ BUILD_BUG_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1));
+#else
+ WARN_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1));
+#endif
+
+ early_ioremap_setup();
+
+ pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
+ memset(bm_pte, 0, sizeof(bm_pte));
+ pmd_populate_kernel(&init_mm, pmd, bm_pte);
+
+ /*
+ * The boot-ioremap range spans multiple pmds, for which
+ * we are not prepared:
+ */
+#define __FIXADDR_TOP (-PAGE_SIZE)
+ BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
+ != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
+#undef __FIXADDR_TOP
+ if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
+ WARN_ON(1);
+ printk(KERN_WARNING "pmd %p != %p\n",
+ pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
+ printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
+ fix_to_virt(FIX_BTMAP_BEGIN));
+ printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n",
+ fix_to_virt(FIX_BTMAP_END));
+
+ printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
+ printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n",
+ FIX_BTMAP_BEGIN);
+ }
+}
+
+void __init __early_set_fixmap(enum fixed_addresses idx,
+ phys_addr_t phys, pgprot_t flags)
+{
+ unsigned long addr = __fix_to_virt(idx);
+ pte_t *pte;
+
+ if (idx >= __end_of_fixed_addresses) {
+ BUG();
+ return;
+ }
+ pte = early_ioremap_pte(addr);
+
+ /* Sanitize 'prot' against any unsupported bits: */
+ pgprot_val(flags) &= __default_kernel_pte_mask;
+
+ if (pgprot_val(flags))
+ set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
+ else
+ pte_clear(&init_mm, addr, pte);
+ __flush_tlb_one_kernel(addr);
+}
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
new file mode 100644
index 000000000..4bfd14d5d
--- /dev/null
+++ b/arch/x86/mm/kasan_init_64.c
@@ -0,0 +1,392 @@
+// SPDX-License-Identifier: GPL-2.0
+#define DISABLE_BRANCH_PROFILING
+#define pr_fmt(fmt) "kasan: " fmt
+
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
+
+#include <linux/bootmem.h>
+#include <linux/kasan.h>
+#include <linux/kdebug.h>
+#include <linux/memblock.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/vmalloc.h>
+
+#include <asm/e820/types.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+#include <asm/pgtable.h>
+#include <asm/cpu_entry_area.h>
+
+extern struct range pfn_mapped[E820_MAX_ENTRIES];
+
+static p4d_t tmp_p4d_table[MAX_PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
+
+static __init void *early_alloc(size_t size, int nid, bool panic)
+{
+ if (panic)
+ return memblock_virt_alloc_try_nid(size, size,
+ __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid);
+ else
+ return memblock_virt_alloc_try_nid_nopanic(size, size,
+ __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid);
+}
+
+static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
+ unsigned long end, int nid)
+{
+ pte_t *pte;
+
+ if (pmd_none(*pmd)) {
+ void *p;
+
+ if (boot_cpu_has(X86_FEATURE_PSE) &&
+ ((end - addr) == PMD_SIZE) &&
+ IS_ALIGNED(addr, PMD_SIZE)) {
+ p = early_alloc(PMD_SIZE, nid, false);
+ if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL))
+ return;
+ else if (p)
+ memblock_free(__pa(p), PMD_SIZE);
+ }
+
+ p = early_alloc(PAGE_SIZE, nid, true);
+ pmd_populate_kernel(&init_mm, pmd, p);
+ }
+
+ pte = pte_offset_kernel(pmd, addr);
+ do {
+ pte_t entry;
+ void *p;
+
+ if (!pte_none(*pte))
+ continue;
+
+ p = early_alloc(PAGE_SIZE, nid, true);
+ entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL);
+ set_pte_at(&init_mm, addr, pte, entry);
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+}
+
+static void __init kasan_populate_pud(pud_t *pud, unsigned long addr,
+ unsigned long end, int nid)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ if (pud_none(*pud)) {
+ void *p;
+
+ if (boot_cpu_has(X86_FEATURE_GBPAGES) &&
+ ((end - addr) == PUD_SIZE) &&
+ IS_ALIGNED(addr, PUD_SIZE)) {
+ p = early_alloc(PUD_SIZE, nid, false);
+ if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL))
+ return;
+ else if (p)
+ memblock_free(__pa(p), PUD_SIZE);
+ }
+
+ p = early_alloc(PAGE_SIZE, nid, true);
+ pud_populate(&init_mm, pud, p);
+ }
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (!pmd_large(*pmd))
+ kasan_populate_pmd(pmd, addr, next, nid);
+ } while (pmd++, addr = next, addr != end);
+}
+
+static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr,
+ unsigned long end, int nid)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ if (p4d_none(*p4d)) {
+ void *p = early_alloc(PAGE_SIZE, nid, true);
+
+ p4d_populate(&init_mm, p4d, p);
+ }
+
+ pud = pud_offset(p4d, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (!pud_large(*pud))
+ kasan_populate_pud(pud, addr, next, nid);
+ } while (pud++, addr = next, addr != end);
+}
+
+static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr,
+ unsigned long end, int nid)
+{
+ void *p;
+ p4d_t *p4d;
+ unsigned long next;
+
+ if (pgd_none(*pgd)) {
+ p = early_alloc(PAGE_SIZE, nid, true);
+ pgd_populate(&init_mm, pgd, p);
+ }
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+ kasan_populate_p4d(p4d, addr, next, nid);
+ } while (p4d++, addr = next, addr != end);
+}
+
+static void __init kasan_populate_shadow(unsigned long addr, unsigned long end,
+ int nid)
+{
+ pgd_t *pgd;
+ unsigned long next;
+
+ addr = addr & PAGE_MASK;
+ end = round_up(end, PAGE_SIZE);
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ kasan_populate_pgd(pgd, addr, next, nid);
+ } while (pgd++, addr = next, addr != end);
+}
+
+static void __init map_range(struct range *range)
+{
+ unsigned long start;
+ unsigned long end;
+
+ start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start));
+ end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end));
+
+ kasan_populate_shadow(start, end, early_pfn_to_nid(range->start));
+}
+
+static void __init clear_pgds(unsigned long start,
+ unsigned long end)
+{
+ pgd_t *pgd;
+ /* See comment in kasan_init() */
+ unsigned long pgd_end = end & PGDIR_MASK;
+
+ for (; start < pgd_end; start += PGDIR_SIZE) {
+ pgd = pgd_offset_k(start);
+ /*
+ * With folded p4d, pgd_clear() is nop, use p4d_clear()
+ * instead.
+ */
+ if (pgtable_l5_enabled())
+ pgd_clear(pgd);
+ else
+ p4d_clear(p4d_offset(pgd, start));
+ }
+
+ pgd = pgd_offset_k(start);
+ for (; start < end; start += P4D_SIZE)
+ p4d_clear(p4d_offset(pgd, start));
+}
+
+static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr)
+{
+ unsigned long p4d;
+
+ if (!pgtable_l5_enabled())
+ return (p4d_t *)pgd;
+
+ p4d = pgd_val(*pgd) & PTE_PFN_MASK;
+ p4d += __START_KERNEL_map - phys_base;
+ return (p4d_t *)p4d + p4d_index(addr);
+}
+
+static void __init kasan_early_p4d_populate(pgd_t *pgd,
+ unsigned long addr,
+ unsigned long end)
+{
+ pgd_t pgd_entry;
+ p4d_t *p4d, p4d_entry;
+ unsigned long next;
+
+ if (pgd_none(*pgd)) {
+ pgd_entry = __pgd(_KERNPG_TABLE | __pa_nodebug(kasan_zero_p4d));
+ set_pgd(pgd, pgd_entry);
+ }
+
+ p4d = early_p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+
+ if (!p4d_none(*p4d))
+ continue;
+
+ p4d_entry = __p4d(_KERNPG_TABLE | __pa_nodebug(kasan_zero_pud));
+ set_p4d(p4d, p4d_entry);
+ } while (p4d++, addr = next, addr != end && p4d_none(*p4d));
+}
+
+static void __init kasan_map_early_shadow(pgd_t *pgd)
+{
+ /* See comment in kasan_init() */
+ unsigned long addr = KASAN_SHADOW_START & PGDIR_MASK;
+ unsigned long end = KASAN_SHADOW_END;
+ unsigned long next;
+
+ pgd += pgd_index(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ kasan_early_p4d_populate(pgd, addr, next);
+ } while (pgd++, addr = next, addr != end);
+}
+
+#ifdef CONFIG_KASAN_INLINE
+static int kasan_die_handler(struct notifier_block *self,
+ unsigned long val,
+ void *data)
+{
+ if (val == DIE_GPF) {
+ pr_emerg("CONFIG_KASAN_INLINE enabled\n");
+ pr_emerg("GPF could be caused by NULL-ptr deref or user memory access\n");
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block kasan_die_notifier = {
+ .notifier_call = kasan_die_handler,
+};
+#endif
+
+void __init kasan_early_init(void)
+{
+ int i;
+ pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC;
+ pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
+ pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
+ p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE;
+
+ /* Mask out unsupported __PAGE_KERNEL bits: */
+ pte_val &= __default_kernel_pte_mask;
+ pmd_val &= __default_kernel_pte_mask;
+ pud_val &= __default_kernel_pte_mask;
+ p4d_val &= __default_kernel_pte_mask;
+
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ kasan_zero_pte[i] = __pte(pte_val);
+
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ kasan_zero_pmd[i] = __pmd(pmd_val);
+
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ kasan_zero_pud[i] = __pud(pud_val);
+
+ for (i = 0; pgtable_l5_enabled() && i < PTRS_PER_P4D; i++)
+ kasan_zero_p4d[i] = __p4d(p4d_val);
+
+ kasan_map_early_shadow(early_top_pgt);
+ kasan_map_early_shadow(init_top_pgt);
+}
+
+void __init kasan_init(void)
+{
+ int i;
+ void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
+
+#ifdef CONFIG_KASAN_INLINE
+ register_die_notifier(&kasan_die_notifier);
+#endif
+
+ memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
+
+ /*
+ * We use the same shadow offset for 4- and 5-level paging to
+ * facilitate boot-time switching between paging modes.
+ * As result in 5-level paging mode KASAN_SHADOW_START and
+ * KASAN_SHADOW_END are not aligned to PGD boundary.
+ *
+ * KASAN_SHADOW_START doesn't share PGD with anything else.
+ * We claim whole PGD entry to make things easier.
+ *
+ * KASAN_SHADOW_END lands in the last PGD entry and it collides with
+ * bunch of things like kernel code, modules, EFI mapping, etc.
+ * We need to take extra steps to not overwrite them.
+ */
+ if (pgtable_l5_enabled()) {
+ void *ptr;
+
+ ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END));
+ memcpy(tmp_p4d_table, (void *)ptr, sizeof(tmp_p4d_table));
+ set_pgd(&early_top_pgt[pgd_index(KASAN_SHADOW_END)],
+ __pgd(__pa(tmp_p4d_table) | _KERNPG_TABLE));
+ }
+
+ load_cr3(early_top_pgt);
+ __flush_tlb_all();
+
+ clear_pgds(KASAN_SHADOW_START & PGDIR_MASK, KASAN_SHADOW_END);
+
+ kasan_populate_zero_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK),
+ kasan_mem_to_shadow((void *)PAGE_OFFSET));
+
+ for (i = 0; i < E820_MAX_ENTRIES; i++) {
+ if (pfn_mapped[i].end == 0)
+ break;
+
+ map_range(&pfn_mapped[i]);
+ }
+
+ shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE;
+ shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
+ shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
+ PAGE_SIZE);
+
+ shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE +
+ CPU_ENTRY_AREA_MAP_SIZE);
+ shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
+ shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
+ PAGE_SIZE);
+
+ kasan_populate_zero_shadow(
+ kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
+ shadow_cpu_entry_begin);
+
+ kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
+ (unsigned long)shadow_cpu_entry_end, 0);
+
+ kasan_populate_zero_shadow(shadow_cpu_entry_end,
+ kasan_mem_to_shadow((void *)__START_KERNEL_map));
+
+ kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
+ (unsigned long)kasan_mem_to_shadow(_end),
+ early_pfn_to_nid(__pa(_stext)));
+
+ kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
+ (void *)KASAN_SHADOW_END);
+
+ load_cr3(init_top_pgt);
+ __flush_tlb_all();
+
+ /*
+ * kasan_zero_page has been used as early shadow memory, thus it may
+ * contain some garbage. Now we can clear and write protect it, since
+ * after the TLB flush no one should write to it.
+ */
+ memset(kasan_zero_page, 0, PAGE_SIZE);
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte_t pte;
+ pgprot_t prot;
+
+ prot = __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC);
+ pgprot_val(prot) &= __default_kernel_pte_mask;
+
+ pte = __pte(__pa(kasan_zero_page) | pgprot_val(prot));
+ set_pte(&kasan_zero_pte[i], pte);
+ }
+ /* Flush TLBs again to be sure that write protection applied. */
+ __flush_tlb_all();
+
+ init_task.kasan_depth = 0;
+ pr_info("KernelAddressSanitizer initialized\n");
+}
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
new file mode 100644
index 000000000..bfe769209
--- /dev/null
+++ b/arch/x86/mm/kaslr.c
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file implements KASLR memory randomization for x86_64. It randomizes
+ * the virtual address space of kernel memory regions (physical memory
+ * mapping, vmalloc & vmemmap) for x86_64. This security feature mitigates
+ * exploits relying on predictable kernel addresses.
+ *
+ * Entropy is generated using the KASLR early boot functions now shared in
+ * the lib directory (originally written by Kees Cook). Randomization is
+ * done on PGD & P4D/PUD page table levels to increase possible addresses.
+ * The physical memory mapping code was adapted to support P4D/PUD level
+ * virtual addresses. This implementation on the best configuration provides
+ * 30,000 possible virtual addresses in average for each memory region.
+ * An additional low memory page is used to ensure each CPU can start with
+ * a PGD aligned virtual address (for realmode).
+ *
+ * The order of each memory region is not changed. The feature looks at
+ * the available space for the regions based on different configuration
+ * options and randomizes the base and space between each. The size of the
+ * physical memory mapping is the available physical memory.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/random.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/setup.h>
+#include <asm/kaslr.h>
+
+#include "mm_internal.h"
+
+#define TB_SHIFT 40
+
+/*
+ * The end address could depend on more configuration options to make the
+ * highest amount of space for randomization available, but that's too hard
+ * to keep straight and caused issues already.
+ */
+static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
+
+/*
+ * Memory regions randomized by KASLR (except modules that use a separate logic
+ * earlier during boot). The list is ordered based on virtual addresses. This
+ * order is kept after randomization.
+ */
+static __initdata struct kaslr_memory_region {
+ unsigned long *base;
+ unsigned long size_tb;
+} kaslr_regions[] = {
+ { &page_offset_base, 0 },
+ { &vmalloc_base, 0 },
+ { &vmemmap_base, 0 },
+};
+
+/* Get size in bytes used by the memory region */
+static inline unsigned long get_padding(struct kaslr_memory_region *region)
+{
+ return (region->size_tb << TB_SHIFT);
+}
+
+/*
+ * Apply no randomization if KASLR was disabled at boot or if KASAN
+ * is enabled. KASAN shadow mappings rely on regions being PGD aligned.
+ */
+static inline bool kaslr_memory_enabled(void)
+{
+ return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN);
+}
+
+/* Initialize base and padding for each memory region randomized with KASLR */
+void __init kernel_randomize_memory(void)
+{
+ size_t i;
+ unsigned long vaddr_start, vaddr;
+ unsigned long rand, memory_tb;
+ struct rnd_state rand_state;
+ unsigned long remain_entropy;
+ unsigned long vmemmap_size;
+
+ vaddr_start = pgtable_l5_enabled() ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4;
+ vaddr = vaddr_start;
+
+ /*
+ * These BUILD_BUG_ON checks ensure the memory layout is consistent
+ * with the vaddr_start/vaddr_end variables. These checks are very
+ * limited....
+ */
+ BUILD_BUG_ON(vaddr_start >= vaddr_end);
+ BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);
+ BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
+
+ if (!kaslr_memory_enabled())
+ return;
+
+ kaslr_regions[0].size_tb = 1 << (MAX_PHYSMEM_BITS - TB_SHIFT);
+ kaslr_regions[1].size_tb = VMALLOC_SIZE_TB;
+
+ /*
+ * Update Physical memory mapping to available and
+ * add padding if needed (especially for memory hotplug support).
+ */
+ BUG_ON(kaslr_regions[0].base != &page_offset_base);
+ memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
+ CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
+
+ /* Adapt phyiscal memory region size based on available memory */
+ if (memory_tb < kaslr_regions[0].size_tb)
+ kaslr_regions[0].size_tb = memory_tb;
+
+ /*
+ * Calculate the vmemmap region size in TBs, aligned to a TB
+ * boundary.
+ */
+ vmemmap_size = (kaslr_regions[0].size_tb << (TB_SHIFT - PAGE_SHIFT)) *
+ sizeof(struct page);
+ kaslr_regions[2].size_tb = DIV_ROUND_UP(vmemmap_size, 1UL << TB_SHIFT);
+
+ /* Calculate entropy available between regions */
+ remain_entropy = vaddr_end - vaddr_start;
+ for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++)
+ remain_entropy -= get_padding(&kaslr_regions[i]);
+
+ prandom_seed_state(&rand_state, kaslr_get_random_long("Memory"));
+
+ for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++) {
+ unsigned long entropy;
+
+ /*
+ * Select a random virtual address using the extra entropy
+ * available.
+ */
+ entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
+ prandom_bytes_state(&rand_state, &rand, sizeof(rand));
+ if (pgtable_l5_enabled())
+ entropy = (rand % (entropy + 1)) & P4D_MASK;
+ else
+ entropy = (rand % (entropy + 1)) & PUD_MASK;
+ vaddr += entropy;
+ *kaslr_regions[i].base = vaddr;
+
+ /*
+ * Jump the region and add a minimum padding based on
+ * randomization alignment.
+ */
+ vaddr += get_padding(&kaslr_regions[i]);
+ if (pgtable_l5_enabled())
+ vaddr = round_up(vaddr + 1, P4D_SIZE);
+ else
+ vaddr = round_up(vaddr + 1, PUD_SIZE);
+ remain_entropy -= entropy;
+ }
+}
+
+static void __meminit init_trampoline_pud(void)
+{
+ unsigned long paddr, paddr_next;
+ pgd_t *pgd;
+ pud_t *pud_page, *pud_page_tramp;
+ int i;
+
+ pud_page_tramp = alloc_low_page();
+
+ paddr = 0;
+ pgd = pgd_offset_k((unsigned long)__va(paddr));
+ pud_page = (pud_t *) pgd_page_vaddr(*pgd);
+
+ for (i = pud_index(paddr); i < PTRS_PER_PUD; i++, paddr = paddr_next) {
+ pud_t *pud, *pud_tramp;
+ unsigned long vaddr = (unsigned long)__va(paddr);
+
+ pud_tramp = pud_page_tramp + pud_index(paddr);
+ pud = pud_page + pud_index(vaddr);
+ paddr_next = (paddr & PUD_MASK) + PUD_SIZE;
+
+ *pud_tramp = *pud;
+ }
+
+ set_pgd(&trampoline_pgd_entry,
+ __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
+}
+
+static void __meminit init_trampoline_p4d(void)
+{
+ unsigned long paddr, paddr_next;
+ pgd_t *pgd;
+ p4d_t *p4d_page, *p4d_page_tramp;
+ int i;
+
+ p4d_page_tramp = alloc_low_page();
+
+ paddr = 0;
+ pgd = pgd_offset_k((unsigned long)__va(paddr));
+ p4d_page = (p4d_t *) pgd_page_vaddr(*pgd);
+
+ for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) {
+ p4d_t *p4d, *p4d_tramp;
+ unsigned long vaddr = (unsigned long)__va(paddr);
+
+ p4d_tramp = p4d_page_tramp + p4d_index(paddr);
+ p4d = p4d_page + p4d_index(vaddr);
+ paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
+
+ *p4d_tramp = *p4d;
+ }
+
+ set_pgd(&trampoline_pgd_entry,
+ __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)));
+}
+
+/*
+ * Create PGD aligned trampoline table to allow real mode initialization
+ * of additional CPUs. Consume only 1 low memory page.
+ */
+void __meminit init_trampoline(void)
+{
+
+ if (!kaslr_memory_enabled()) {
+ init_trampoline_default();
+ return;
+ }
+
+ if (pgtable_l5_enabled())
+ init_trampoline_p4d();
+ else
+ init_trampoline_pud();
+}
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
new file mode 100644
index 000000000..79eb55ce6
--- /dev/null
+++ b/arch/x86/mm/kmmio.c
@@ -0,0 +1,627 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Support for MMIO probes.
+ * Benfit many code from kprobes
+ * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
+ * 2007 Alexander Eichner
+ * 2008 Pekka Paalanen <pq@iki.fi>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include <linux/hash.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/ptrace.h>
+#include <linux/preempt.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/mutex.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <linux/errno.h>
+#include <asm/debugreg.h>
+#include <linux/mmiotrace.h>
+
+#define KMMIO_PAGE_HASH_BITS 4
+#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
+
+struct kmmio_fault_page {
+ struct list_head list;
+ struct kmmio_fault_page *release_next;
+ unsigned long addr; /* the requested address */
+ pteval_t old_presence; /* page presence prior to arming */
+ bool armed;
+
+ /*
+ * Number of times this page has been registered as a part
+ * of a probe. If zero, page is disarmed and this may be freed.
+ * Used only by writers (RCU) and post_kmmio_handler().
+ * Protected by kmmio_lock, when linked into kmmio_page_table.
+ */
+ int count;
+
+ bool scheduled_for_release;
+};
+
+struct kmmio_delayed_release {
+ struct rcu_head rcu;
+ struct kmmio_fault_page *release_list;
+};
+
+struct kmmio_context {
+ struct kmmio_fault_page *fpage;
+ struct kmmio_probe *probe;
+ unsigned long saved_flags;
+ unsigned long addr;
+ int active;
+};
+
+static DEFINE_SPINLOCK(kmmio_lock);
+
+/* Protected by kmmio_lock */
+unsigned int kmmio_count;
+
+/* Read-protected by RCU, write-protected by kmmio_lock. */
+static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
+static LIST_HEAD(kmmio_probes);
+
+static struct list_head *kmmio_page_list(unsigned long addr)
+{
+ unsigned int l;
+ pte_t *pte = lookup_address(addr, &l);
+
+ if (!pte)
+ return NULL;
+ addr &= page_level_mask(l);
+
+ return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)];
+}
+
+/* Accessed per-cpu */
+static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
+
+/*
+ * this is basically a dynamic stabbing problem:
+ * Could use the existing prio tree code or
+ * Possible better implementations:
+ * The Interval Skip List: A Data Structure for Finding All Intervals That
+ * Overlap a Point (might be simple)
+ * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
+ */
+/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
+static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
+{
+ struct kmmio_probe *p;
+ list_for_each_entry_rcu(p, &kmmio_probes, list) {
+ if (addr >= p->addr && addr < (p->addr + p->len))
+ return p;
+ }
+ return NULL;
+}
+
+/* You must be holding RCU read lock. */
+static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
+{
+ struct list_head *head;
+ struct kmmio_fault_page *f;
+ unsigned int l;
+ pte_t *pte = lookup_address(addr, &l);
+
+ if (!pte)
+ return NULL;
+ addr &= page_level_mask(l);
+ head = kmmio_page_list(addr);
+ list_for_each_entry_rcu(f, head, list) {
+ if (f->addr == addr)
+ return f;
+ }
+ return NULL;
+}
+
+static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
+{
+ pmd_t new_pmd;
+ pmdval_t v = pmd_val(*pmd);
+ if (clear) {
+ *old = v;
+ new_pmd = pmd_mknotpresent(*pmd);
+ } else {
+ /* Presume this has been called with clear==true previously */
+ new_pmd = __pmd(*old);
+ }
+ set_pmd(pmd, new_pmd);
+}
+
+static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
+{
+ pteval_t v = pte_val(*pte);
+ if (clear) {
+ *old = v;
+ /* Nothing should care about address */
+ pte_clear(&init_mm, 0, pte);
+ } else {
+ /* Presume this has been called with clear==true previously */
+ set_pte_atomic(pte, __pte(*old));
+ }
+}
+
+static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
+{
+ unsigned int level;
+ pte_t *pte = lookup_address(f->addr, &level);
+
+ if (!pte) {
+ pr_err("no pte for addr 0x%08lx\n", f->addr);
+ return -1;
+ }
+
+ switch (level) {
+ case PG_LEVEL_2M:
+ clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
+ break;
+ case PG_LEVEL_4K:
+ clear_pte_presence(pte, clear, &f->old_presence);
+ break;
+ default:
+ pr_err("unexpected page level 0x%x.\n", level);
+ return -1;
+ }
+
+ __flush_tlb_one_kernel(f->addr);
+ return 0;
+}
+
+/*
+ * Mark the given page as not present. Access to it will trigger a fault.
+ *
+ * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
+ * protection is ignored here. RCU read lock is assumed held, so the struct
+ * will not disappear unexpectedly. Furthermore, the caller must guarantee,
+ * that double arming the same virtual address (page) cannot occur.
+ *
+ * Double disarming on the other hand is allowed, and may occur when a fault
+ * and mmiotrace shutdown happen simultaneously.
+ */
+static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
+{
+ int ret;
+ WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
+ if (f->armed) {
+ pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n",
+ f->addr, f->count, !!f->old_presence);
+ }
+ ret = clear_page_presence(f, true);
+ WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"),
+ f->addr);
+ f->armed = true;
+ return ret;
+}
+
+/** Restore the given page to saved presence state. */
+static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
+{
+ int ret = clear_page_presence(f, false);
+ WARN_ONCE(ret < 0,
+ KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr);
+ f->armed = false;
+}
+
+/*
+ * This is being called from do_page_fault().
+ *
+ * We may be in an interrupt or a critical section. Also prefecthing may
+ * trigger a page fault. We may be in the middle of process switch.
+ * We cannot take any locks, because we could be executing especially
+ * within a kmmio critical section.
+ *
+ * Local interrupts are disabled, so preemption cannot happen.
+ * Do not enable interrupts, do not sleep, and watch out for other CPUs.
+ */
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate
+ * and they remain disabled throughout this function.
+ */
+int kmmio_handler(struct pt_regs *regs, unsigned long addr)
+{
+ struct kmmio_context *ctx;
+ struct kmmio_fault_page *faultpage;
+ int ret = 0; /* default to fault not handled */
+ unsigned long page_base = addr;
+ unsigned int l;
+ pte_t *pte = lookup_address(addr, &l);
+ if (!pte)
+ return -EINVAL;
+ page_base &= page_level_mask(l);
+
+ /*
+ * Preemption is now disabled to prevent process switch during
+ * single stepping. We can only handle one active kmmio trace
+ * per cpu, so ensure that we finish it before something else
+ * gets to run. We also hold the RCU read lock over single
+ * stepping to avoid looking up the probe and kmmio_fault_page
+ * again.
+ */
+ preempt_disable();
+ rcu_read_lock();
+
+ faultpage = get_kmmio_fault_page(page_base);
+ if (!faultpage) {
+ /*
+ * Either this page fault is not caused by kmmio, or
+ * another CPU just pulled the kmmio probe from under
+ * our feet. The latter case should not be possible.
+ */
+ goto no_kmmio;
+ }
+
+ ctx = &get_cpu_var(kmmio_ctx);
+ if (ctx->active) {
+ if (page_base == ctx->addr) {
+ /*
+ * A second fault on the same page means some other
+ * condition needs handling by do_page_fault(), the
+ * page really not being present is the most common.
+ */
+ pr_debug("secondary hit for 0x%08lx CPU %d.\n",
+ addr, smp_processor_id());
+
+ if (!faultpage->old_presence)
+ pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
+ addr, smp_processor_id());
+ } else {
+ /*
+ * Prevent overwriting already in-flight context.
+ * This should not happen, let's hope disarming at
+ * least prevents a panic.
+ */
+ pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
+ smp_processor_id(), addr);
+ pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
+ disarm_kmmio_fault_page(faultpage);
+ }
+ goto no_kmmio_ctx;
+ }
+ ctx->active++;
+
+ ctx->fpage = faultpage;
+ ctx->probe = get_kmmio_probe(page_base);
+ ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
+ ctx->addr = page_base;
+
+ if (ctx->probe && ctx->probe->pre_handler)
+ ctx->probe->pre_handler(ctx->probe, regs, addr);
+
+ /*
+ * Enable single-stepping and disable interrupts for the faulting
+ * context. Local interrupts must not get enabled during stepping.
+ */
+ regs->flags |= X86_EFLAGS_TF;
+ regs->flags &= ~X86_EFLAGS_IF;
+
+ /* Now we set present bit in PTE and single step. */
+ disarm_kmmio_fault_page(ctx->fpage);
+
+ /*
+ * If another cpu accesses the same page while we are stepping,
+ * the access will not be caught. It will simply succeed and the
+ * only downside is we lose the event. If this becomes a problem,
+ * the user should drop to single cpu before tracing.
+ */
+
+ put_cpu_var(kmmio_ctx);
+ return 1; /* fault handled */
+
+no_kmmio_ctx:
+ put_cpu_var(kmmio_ctx);
+no_kmmio:
+ rcu_read_unlock();
+ preempt_enable_no_resched();
+ return ret;
+}
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate
+ * and they remain disabled throughout this function.
+ * This must always get called as the pair to kmmio_handler().
+ */
+static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
+{
+ int ret = 0;
+ struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
+
+ if (!ctx->active) {
+ /*
+ * debug traps without an active context are due to either
+ * something external causing them (f.e. using a debugger while
+ * mmio tracing enabled), or erroneous behaviour
+ */
+ pr_warning("unexpected debug trap on CPU %d.\n",
+ smp_processor_id());
+ goto out;
+ }
+
+ if (ctx->probe && ctx->probe->post_handler)
+ ctx->probe->post_handler(ctx->probe, condition, regs);
+
+ /* Prevent racing against release_kmmio_fault_page(). */
+ spin_lock(&kmmio_lock);
+ if (ctx->fpage->count)
+ arm_kmmio_fault_page(ctx->fpage);
+ spin_unlock(&kmmio_lock);
+
+ regs->flags &= ~X86_EFLAGS_TF;
+ regs->flags |= ctx->saved_flags;
+
+ /* These were acquired in kmmio_handler(). */
+ ctx->active--;
+ BUG_ON(ctx->active);
+ rcu_read_unlock();
+ preempt_enable_no_resched();
+
+ /*
+ * if somebody else is singlestepping across a probe point, flags
+ * will have TF set, in which case, continue the remaining processing
+ * of do_debug, as if this is not a probe hit.
+ */
+ if (!(regs->flags & X86_EFLAGS_TF))
+ ret = 1;
+out:
+ put_cpu_var(kmmio_ctx);
+ return ret;
+}
+
+/* You must be holding kmmio_lock. */
+static int add_kmmio_fault_page(unsigned long addr)
+{
+ struct kmmio_fault_page *f;
+
+ f = get_kmmio_fault_page(addr);
+ if (f) {
+ if (!f->count)
+ arm_kmmio_fault_page(f);
+ f->count++;
+ return 0;
+ }
+
+ f = kzalloc(sizeof(*f), GFP_ATOMIC);
+ if (!f)
+ return -1;
+
+ f->count = 1;
+ f->addr = addr;
+
+ if (arm_kmmio_fault_page(f)) {
+ kfree(f);
+ return -1;
+ }
+
+ list_add_rcu(&f->list, kmmio_page_list(f->addr));
+
+ return 0;
+}
+
+/* You must be holding kmmio_lock. */
+static void release_kmmio_fault_page(unsigned long addr,
+ struct kmmio_fault_page **release_list)
+{
+ struct kmmio_fault_page *f;
+
+ f = get_kmmio_fault_page(addr);
+ if (!f)
+ return;
+
+ f->count--;
+ BUG_ON(f->count < 0);
+ if (!f->count) {
+ disarm_kmmio_fault_page(f);
+ if (!f->scheduled_for_release) {
+ f->release_next = *release_list;
+ *release_list = f;
+ f->scheduled_for_release = true;
+ }
+ }
+}
+
+/*
+ * With page-unaligned ioremaps, one or two armed pages may contain
+ * addresses from outside the intended mapping. Events for these addresses
+ * are currently silently dropped. The events may result only from programming
+ * mistakes by accessing addresses before the beginning or past the end of a
+ * mapping.
+ */
+int register_kmmio_probe(struct kmmio_probe *p)
+{
+ unsigned long flags;
+ int ret = 0;
+ unsigned long size = 0;
+ unsigned long addr = p->addr & PAGE_MASK;
+ const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
+ unsigned int l;
+ pte_t *pte;
+
+ spin_lock_irqsave(&kmmio_lock, flags);
+ if (get_kmmio_probe(addr)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ pte = lookup_address(addr, &l);
+ if (!pte) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ kmmio_count++;
+ list_add_rcu(&p->list, &kmmio_probes);
+ while (size < size_lim) {
+ if (add_kmmio_fault_page(addr + size))
+ pr_err("Unable to set page fault.\n");
+ size += page_level_size(l);
+ }
+out:
+ spin_unlock_irqrestore(&kmmio_lock, flags);
+ /*
+ * XXX: What should I do here?
+ * Here was a call to global_flush_tlb(), but it does not exist
+ * anymore. It seems it's not needed after all.
+ */
+ return ret;
+}
+EXPORT_SYMBOL(register_kmmio_probe);
+
+static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
+{
+ struct kmmio_delayed_release *dr = container_of(
+ head,
+ struct kmmio_delayed_release,
+ rcu);
+ struct kmmio_fault_page *f = dr->release_list;
+ while (f) {
+ struct kmmio_fault_page *next = f->release_next;
+ BUG_ON(f->count);
+ kfree(f);
+ f = next;
+ }
+ kfree(dr);
+}
+
+static void remove_kmmio_fault_pages(struct rcu_head *head)
+{
+ struct kmmio_delayed_release *dr =
+ container_of(head, struct kmmio_delayed_release, rcu);
+ struct kmmio_fault_page *f = dr->release_list;
+ struct kmmio_fault_page **prevp = &dr->release_list;
+ unsigned long flags;
+
+ spin_lock_irqsave(&kmmio_lock, flags);
+ while (f) {
+ if (!f->count) {
+ list_del_rcu(&f->list);
+ prevp = &f->release_next;
+ } else {
+ *prevp = f->release_next;
+ f->release_next = NULL;
+ f->scheduled_for_release = false;
+ }
+ f = *prevp;
+ }
+ spin_unlock_irqrestore(&kmmio_lock, flags);
+
+ /* This is the real RCU destroy call. */
+ call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
+}
+
+/*
+ * Remove a kmmio probe. You have to synchronize_rcu() before you can be
+ * sure that the callbacks will not be called anymore. Only after that
+ * you may actually release your struct kmmio_probe.
+ *
+ * Unregistering a kmmio fault page has three steps:
+ * 1. release_kmmio_fault_page()
+ * Disarm the page, wait a grace period to let all faults finish.
+ * 2. remove_kmmio_fault_pages()
+ * Remove the pages from kmmio_page_table.
+ * 3. rcu_free_kmmio_fault_pages()
+ * Actually free the kmmio_fault_page structs as with RCU.
+ */
+void unregister_kmmio_probe(struct kmmio_probe *p)
+{
+ unsigned long flags;
+ unsigned long size = 0;
+ unsigned long addr = p->addr & PAGE_MASK;
+ const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
+ struct kmmio_fault_page *release_list = NULL;
+ struct kmmio_delayed_release *drelease;
+ unsigned int l;
+ pte_t *pte;
+
+ pte = lookup_address(addr, &l);
+ if (!pte)
+ return;
+
+ spin_lock_irqsave(&kmmio_lock, flags);
+ while (size < size_lim) {
+ release_kmmio_fault_page(addr + size, &release_list);
+ size += page_level_size(l);
+ }
+ list_del_rcu(&p->list);
+ kmmio_count--;
+ spin_unlock_irqrestore(&kmmio_lock, flags);
+
+ if (!release_list)
+ return;
+
+ drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
+ if (!drelease) {
+ pr_crit("leaking kmmio_fault_page objects.\n");
+ return;
+ }
+ drelease->release_list = release_list;
+
+ /*
+ * This is not really RCU here. We have just disarmed a set of
+ * pages so that they cannot trigger page faults anymore. However,
+ * we cannot remove the pages from kmmio_page_table,
+ * because a probe hit might be in flight on another CPU. The
+ * pages are collected into a list, and they will be removed from
+ * kmmio_page_table when it is certain that no probe hit related to
+ * these pages can be in flight. RCU grace period sounds like a
+ * good choice.
+ *
+ * If we removed the pages too early, kmmio page fault handler might
+ * not find the respective kmmio_fault_page and determine it's not
+ * a kmmio fault, when it actually is. This would lead to madness.
+ */
+ call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
+}
+EXPORT_SYMBOL(unregister_kmmio_probe);
+
+static int
+kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
+{
+ struct die_args *arg = args;
+ unsigned long* dr6_p = (unsigned long *)ERR_PTR(arg->err);
+
+ if (val == DIE_DEBUG && (*dr6_p & DR_STEP))
+ if (post_kmmio_handler(*dr6_p, arg->regs) == 1) {
+ /*
+ * Reset the BS bit in dr6 (pointed by args->err) to
+ * denote completion of processing
+ */
+ *dr6_p &= ~DR_STEP;
+ return NOTIFY_STOP;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nb_die = {
+ .notifier_call = kmmio_die_notifier
+};
+
+int kmmio_init(void)
+{
+ int i;
+
+ for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
+ INIT_LIST_HEAD(&kmmio_page_table[i]);
+
+ return register_die_notifier(&nb_die);
+}
+
+void kmmio_cleanup(void)
+{
+ int i;
+
+ unregister_die_notifier(&nb_die);
+ for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
+ WARN_ONCE(!list_empty(&kmmio_page_table[i]),
+ KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n");
+ }
+}
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
new file mode 100644
index 000000000..c0499c389
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt.c
@@ -0,0 +1,400 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/dma-direct.h>
+#include <linux/swiotlb.h>
+#include <linux/mem_encrypt.h>
+
+#include <asm/tlbflush.h>
+#include <asm/fixmap.h>
+#include <asm/setup.h>
+#include <asm/bootparam.h>
+#include <asm/set_memory.h>
+#include <asm/cacheflush.h>
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <asm/cmdline.h>
+
+#include "mm_internal.h"
+
+/*
+ * Since SME related variables are set early in the boot process they must
+ * reside in the .data section so as not to be zeroed out when the .bss
+ * section is later cleared.
+ */
+u64 sme_me_mask __section(.data) = 0;
+EXPORT_SYMBOL(sme_me_mask);
+DEFINE_STATIC_KEY_FALSE(sev_enable_key);
+EXPORT_SYMBOL_GPL(sev_enable_key);
+
+bool sev_enabled __section(.data);
+
+/* Buffer used for early in-place encryption by BSP, no locking needed */
+static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE);
+
+/*
+ * This routine does not change the underlying encryption setting of the
+ * page(s) that map this memory. It assumes that eventually the memory is
+ * meant to be accessed as either encrypted or decrypted but the contents
+ * are currently not in the desired state.
+ *
+ * This routine follows the steps outlined in the AMD64 Architecture
+ * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place.
+ */
+static void __init __sme_early_enc_dec(resource_size_t paddr,
+ unsigned long size, bool enc)
+{
+ void *src, *dst;
+ size_t len;
+
+ if (!sme_me_mask)
+ return;
+
+ wbinvd();
+
+ /*
+ * There are limited number of early mapping slots, so map (at most)
+ * one page at time.
+ */
+ while (size) {
+ len = min_t(size_t, sizeof(sme_early_buffer), size);
+
+ /*
+ * Create mappings for the current and desired format of
+ * the memory. Use a write-protected mapping for the source.
+ */
+ src = enc ? early_memremap_decrypted_wp(paddr, len) :
+ early_memremap_encrypted_wp(paddr, len);
+
+ dst = enc ? early_memremap_encrypted(paddr, len) :
+ early_memremap_decrypted(paddr, len);
+
+ /*
+ * If a mapping can't be obtained to perform the operation,
+ * then eventual access of that area in the desired mode
+ * will cause a crash.
+ */
+ BUG_ON(!src || !dst);
+
+ /*
+ * Use a temporary buffer, of cache-line multiple size, to
+ * avoid data corruption as documented in the APM.
+ */
+ memcpy(sme_early_buffer, src, len);
+ memcpy(dst, sme_early_buffer, len);
+
+ early_memunmap(dst, len);
+ early_memunmap(src, len);
+
+ paddr += len;
+ size -= len;
+ }
+}
+
+void __init sme_early_encrypt(resource_size_t paddr, unsigned long size)
+{
+ __sme_early_enc_dec(paddr, size, true);
+}
+
+void __init sme_early_decrypt(resource_size_t paddr, unsigned long size)
+{
+ __sme_early_enc_dec(paddr, size, false);
+}
+
+static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
+ bool map)
+{
+ unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET;
+ pmdval_t pmd_flags, pmd;
+
+ /* Use early_pmd_flags but remove the encryption mask */
+ pmd_flags = __sme_clr(early_pmd_flags);
+
+ do {
+ pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0;
+ __early_make_pgtable((unsigned long)vaddr, pmd);
+
+ vaddr += PMD_SIZE;
+ paddr += PMD_SIZE;
+ size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
+ } while (size);
+
+ __native_flush_tlb();
+}
+
+void __init sme_unmap_bootdata(char *real_mode_data)
+{
+ struct boot_params *boot_data;
+ unsigned long cmdline_paddr;
+
+ if (!sme_active())
+ return;
+
+ /* Get the command line address before unmapping the real_mode_data */
+ boot_data = (struct boot_params *)real_mode_data;
+ cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+ __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false);
+
+ if (!cmdline_paddr)
+ return;
+
+ __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false);
+}
+
+void __init sme_map_bootdata(char *real_mode_data)
+{
+ struct boot_params *boot_data;
+ unsigned long cmdline_paddr;
+
+ if (!sme_active())
+ return;
+
+ __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true);
+
+ /* Get the command line address after mapping the real_mode_data */
+ boot_data = (struct boot_params *)real_mode_data;
+ cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+ if (!cmdline_paddr)
+ return;
+
+ __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
+}
+
+void __init sme_early_init(void)
+{
+ unsigned int i;
+
+ if (!sme_me_mask)
+ return;
+
+ early_pmd_flags = __sme_set(early_pmd_flags);
+
+ __supported_pte_mask = __sme_set(__supported_pte_mask);
+
+ /* Update the protection map with memory encryption mask */
+ for (i = 0; i < ARRAY_SIZE(protection_map); i++)
+ protection_map[i] = pgprot_encrypted(protection_map[i]);
+
+ if (sev_active())
+ swiotlb_force = SWIOTLB_FORCE;
+}
+
+static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
+{
+ pgprot_t old_prot, new_prot;
+ unsigned long pfn, pa, size;
+ pte_t new_pte;
+
+ switch (level) {
+ case PG_LEVEL_4K:
+ pfn = pte_pfn(*kpte);
+ old_prot = pte_pgprot(*kpte);
+ break;
+ case PG_LEVEL_2M:
+ pfn = pmd_pfn(*(pmd_t *)kpte);
+ old_prot = pmd_pgprot(*(pmd_t *)kpte);
+ break;
+ case PG_LEVEL_1G:
+ pfn = pud_pfn(*(pud_t *)kpte);
+ old_prot = pud_pgprot(*(pud_t *)kpte);
+ break;
+ default:
+ return;
+ }
+
+ new_prot = old_prot;
+ if (enc)
+ pgprot_val(new_prot) |= _PAGE_ENC;
+ else
+ pgprot_val(new_prot) &= ~_PAGE_ENC;
+
+ /* If prot is same then do nothing. */
+ if (pgprot_val(old_prot) == pgprot_val(new_prot))
+ return;
+
+ pa = pfn << PAGE_SHIFT;
+ size = page_level_size(level);
+
+ /*
+ * We are going to perform in-place en-/decryption and change the
+ * physical page attribute from C=1 to C=0 or vice versa. Flush the
+ * caches to ensure that data gets accessed with the correct C-bit.
+ */
+ clflush_cache_range(__va(pa), size);
+
+ /* Encrypt/decrypt the contents in-place */
+ if (enc)
+ sme_early_encrypt(pa, size);
+ else
+ sme_early_decrypt(pa, size);
+
+ /* Change the page encryption mask. */
+ new_pte = pfn_pte(pfn, new_prot);
+ set_pte_atomic(kpte, new_pte);
+}
+
+static int __init early_set_memory_enc_dec(unsigned long vaddr,
+ unsigned long size, bool enc)
+{
+ unsigned long vaddr_end, vaddr_next;
+ unsigned long psize, pmask;
+ int split_page_size_mask;
+ int level, ret;
+ pte_t *kpte;
+
+ vaddr_next = vaddr;
+ vaddr_end = vaddr + size;
+
+ for (; vaddr < vaddr_end; vaddr = vaddr_next) {
+ kpte = lookup_address(vaddr, &level);
+ if (!kpte || pte_none(*kpte)) {
+ ret = 1;
+ goto out;
+ }
+
+ if (level == PG_LEVEL_4K) {
+ __set_clr_pte_enc(kpte, level, enc);
+ vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE;
+ continue;
+ }
+
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+
+ /*
+ * Check whether we can change the large page in one go.
+ * We request a split when the address is not aligned and
+ * the number of pages to set/clear encryption bit is smaller
+ * than the number of pages in the large page.
+ */
+ if (vaddr == (vaddr & pmask) &&
+ ((vaddr_end - vaddr) >= psize)) {
+ __set_clr_pte_enc(kpte, level, enc);
+ vaddr_next = (vaddr & pmask) + psize;
+ continue;
+ }
+
+ /*
+ * The virtual address is part of a larger page, create the next
+ * level page table mapping (4K or 2M). If it is part of a 2M
+ * page then we request a split of the large page into 4K
+ * chunks. A 1GB large page is split into 2M pages, resp.
+ */
+ if (level == PG_LEVEL_2M)
+ split_page_size_mask = 0;
+ else
+ split_page_size_mask = 1 << PG_LEVEL_2M;
+
+ kernel_physical_mapping_init(__pa(vaddr & pmask),
+ __pa((vaddr_end & pmask) + psize),
+ split_page_size_mask);
+ }
+
+ ret = 0;
+
+out:
+ __flush_tlb_all();
+ return ret;
+}
+
+int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size)
+{
+ return early_set_memory_enc_dec(vaddr, size, false);
+}
+
+int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size)
+{
+ return early_set_memory_enc_dec(vaddr, size, true);
+}
+
+/*
+ * SME and SEV are very similar but they are not the same, so there are
+ * times that the kernel will need to distinguish between SME and SEV. The
+ * sme_active() and sev_active() functions are used for this. When a
+ * distinction isn't needed, the mem_encrypt_active() function can be used.
+ *
+ * The trampoline code is a good example for this requirement. Before
+ * paging is activated, SME will access all memory as decrypted, but SEV
+ * will access all memory as encrypted. So, when APs are being brought
+ * up under SME the trampoline area cannot be encrypted, whereas under SEV
+ * the trampoline area must be encrypted.
+ */
+bool sme_active(void)
+{
+ return sme_me_mask && !sev_enabled;
+}
+EXPORT_SYMBOL(sme_active);
+
+bool sev_active(void)
+{
+ return sme_me_mask && sev_enabled;
+}
+EXPORT_SYMBOL(sev_active);
+
+/* Architecture __weak replacement functions */
+void __init mem_encrypt_free_decrypted_mem(void)
+{
+ unsigned long vaddr, vaddr_end, npages;
+ int r;
+
+ vaddr = (unsigned long)__start_bss_decrypted_unused;
+ vaddr_end = (unsigned long)__end_bss_decrypted;
+ npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
+
+ /*
+ * The unused memory range was mapped decrypted, change the encryption
+ * attribute from decrypted to encrypted before freeing it.
+ */
+ if (mem_encrypt_active()) {
+ r = set_memory_encrypted(vaddr, npages);
+ if (r) {
+ pr_warn("failed to free unused decrypted pages\n");
+ return;
+ }
+ }
+
+ free_init_pages("unused decrypted", vaddr, vaddr_end);
+}
+
+void __init mem_encrypt_init(void)
+{
+ if (!sme_me_mask)
+ return;
+
+ /* Call into SWIOTLB to update the SWIOTLB DMA buffers */
+ swiotlb_update_mem_attributes();
+
+ /*
+ * With SEV, DMA operations cannot use encryption, we need to use
+ * SWIOTLB to bounce buffer DMA operation.
+ */
+ if (sev_active())
+ dma_ops = &swiotlb_dma_ops;
+
+ /*
+ * With SEV, we need to unroll the rep string I/O instructions.
+ */
+ if (sev_active())
+ static_branch_enable(&sev_enable_key);
+
+ pr_info("AMD %s active\n",
+ sev_active() ? "Secure Encrypted Virtualization (SEV)"
+ : "Secure Memory Encryption (SME)");
+}
+
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
new file mode 100644
index 000000000..40a608506
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt_boot.S
@@ -0,0 +1,159 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+#include <asm/nospec-branch.h>
+
+ .text
+ .code64
+ENTRY(sme_encrypt_execute)
+
+ /*
+ * Entry parameters:
+ * RDI - virtual address for the encrypted mapping
+ * RSI - virtual address for the decrypted mapping
+ * RDX - length to encrypt
+ * RCX - virtual address of the encryption workarea, including:
+ * - stack page (PAGE_SIZE)
+ * - encryption routine page (PAGE_SIZE)
+ * - intermediate copy buffer (PMD_PAGE_SIZE)
+ * R8 - physcial address of the pagetables to use for encryption
+ */
+
+ push %rbp
+ movq %rsp, %rbp /* RBP now has original stack pointer */
+
+ /* Set up a one page stack in the non-encrypted memory area */
+ movq %rcx, %rax /* Workarea stack page */
+ leaq PAGE_SIZE(%rax), %rsp /* Set new stack pointer */
+ addq $PAGE_SIZE, %rax /* Workarea encryption routine */
+
+ push %r12
+ movq %rdi, %r10 /* Encrypted area */
+ movq %rsi, %r11 /* Decrypted area */
+ movq %rdx, %r12 /* Area length */
+
+ /* Copy encryption routine into the workarea */
+ movq %rax, %rdi /* Workarea encryption routine */
+ leaq __enc_copy(%rip), %rsi /* Encryption routine */
+ movq $(.L__enc_copy_end - __enc_copy), %rcx /* Encryption routine length */
+ rep movsb
+
+ /* Setup registers for call */
+ movq %r10, %rdi /* Encrypted area */
+ movq %r11, %rsi /* Decrypted area */
+ movq %r8, %rdx /* Pagetables used for encryption */
+ movq %r12, %rcx /* Area length */
+ movq %rax, %r8 /* Workarea encryption routine */
+ addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */
+
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rax /* Call the encryption routine */
+
+ pop %r12
+
+ movq %rbp, %rsp /* Restore original stack pointer */
+ pop %rbp
+
+ ret
+ENDPROC(sme_encrypt_execute)
+
+ENTRY(__enc_copy)
+/*
+ * Routine used to encrypt memory in place.
+ * This routine must be run outside of the kernel proper since
+ * the kernel will be encrypted during the process. So this
+ * routine is defined here and then copied to an area outside
+ * of the kernel where it will remain and run decrypted
+ * during execution.
+ *
+ * On entry the registers must be:
+ * RDI - virtual address for the encrypted mapping
+ * RSI - virtual address for the decrypted mapping
+ * RDX - address of the pagetables to use for encryption
+ * RCX - length of area
+ * R8 - intermediate copy buffer
+ *
+ * RAX - points to this routine
+ *
+ * The area will be encrypted by copying from the non-encrypted
+ * memory space to an intermediate buffer and then copying from the
+ * intermediate buffer back to the encrypted memory space. The physical
+ * addresses of the two mappings are the same which results in the area
+ * being encrypted "in place".
+ */
+ /* Enable the new page tables */
+ mov %rdx, %cr3
+
+ /* Flush any global TLBs */
+ mov %cr4, %rdx
+ andq $~X86_CR4_PGE, %rdx
+ mov %rdx, %cr4
+ orq $X86_CR4_PGE, %rdx
+ mov %rdx, %cr4
+
+ push %r15
+ push %r12
+
+ movq %rcx, %r9 /* Save area length */
+ movq %rdi, %r10 /* Save encrypted area address */
+ movq %rsi, %r11 /* Save decrypted area address */
+
+ /* Set the PAT register PA5 entry to write-protect */
+ movl $MSR_IA32_CR_PAT, %ecx
+ rdmsr
+ mov %rdx, %r15 /* Save original PAT value */
+ andl $0xffff00ff, %edx /* Clear PA5 */
+ orl $0x00000500, %edx /* Set PA5 to WP */
+ wrmsr
+
+ wbinvd /* Invalidate any cache entries */
+
+ /* Copy/encrypt up to 2MB at a time */
+ movq $PMD_PAGE_SIZE, %r12
+1:
+ cmpq %r12, %r9
+ jnb 2f
+ movq %r9, %r12
+
+2:
+ movq %r11, %rsi /* Source - decrypted area */
+ movq %r8, %rdi /* Dest - intermediate copy buffer */
+ movq %r12, %rcx
+ rep movsb
+
+ movq %r8, %rsi /* Source - intermediate copy buffer */
+ movq %r10, %rdi /* Dest - encrypted area */
+ movq %r12, %rcx
+ rep movsb
+
+ addq %r12, %r11
+ addq %r12, %r10
+ subq %r12, %r9 /* Kernel length decrement */
+ jnz 1b /* Kernel length not zero? */
+
+ /* Restore PAT register */
+ movl $MSR_IA32_CR_PAT, %ecx
+ rdmsr
+ mov %r15, %rdx /* Restore original PAT value */
+ wrmsr
+
+ pop %r12
+ pop %r15
+
+ ret
+.L__enc_copy_end:
+ENDPROC(__enc_copy)
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
new file mode 100644
index 000000000..650d5a6ca
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -0,0 +1,576 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define DISABLE_BRANCH_PROFILING
+
+/*
+ * Since we're dealing with identity mappings, physical and virtual
+ * addresses are the same, so override these defines which are ultimately
+ * used by the headers in misc.h.
+ */
+#define __pa(x) ((unsigned long)(x))
+#define __va(x) ((void *)((unsigned long)(x)))
+
+/*
+ * Special hack: we have to be careful, because no indirections are
+ * allowed here, and paravirt_ops is a kind of one. As it will only run in
+ * baremetal anyway, we just keep it from happening. (This list needs to
+ * be extended when new paravirt and debugging variants are added.)
+ */
+#undef CONFIG_PARAVIRT
+#undef CONFIG_PARAVIRT_SPINLOCKS
+
+/*
+ * This code runs before CPU feature bits are set. By default, the
+ * pgtable_l5_enabled() function uses bit X86_FEATURE_LA57 to determine if
+ * 5-level paging is active, so that won't work here. USE_EARLY_PGTABLE_L5
+ * is provided to handle this situation and, instead, use a variable that
+ * has been set by the early boot code.
+ */
+#define USE_EARLY_PGTABLE_L5
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mem_encrypt.h>
+
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/cmdline.h>
+
+#include "mm_internal.h"
+
+#define PGD_FLAGS _KERNPG_TABLE_NOENC
+#define P4D_FLAGS _KERNPG_TABLE_NOENC
+#define PUD_FLAGS _KERNPG_TABLE_NOENC
+#define PMD_FLAGS _KERNPG_TABLE_NOENC
+
+#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
+
+#define PMD_FLAGS_DEC PMD_FLAGS_LARGE
+#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_LARGE_CACHE_MASK) | \
+ (_PAGE_PAT_LARGE | _PAGE_PWT))
+
+#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC)
+
+#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL)
+
+#define PTE_FLAGS_DEC PTE_FLAGS
+#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
+ (_PAGE_PAT | _PAGE_PWT))
+
+#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC)
+
+struct sme_populate_pgd_data {
+ void *pgtable_area;
+ pgd_t *pgd;
+
+ pmdval_t pmd_flags;
+ pteval_t pte_flags;
+ unsigned long paddr;
+
+ unsigned long vaddr;
+ unsigned long vaddr_end;
+};
+
+static char sme_cmdline_arg[] __initdata = "mem_encrypt";
+static char sme_cmdline_on[] __initdata = "on";
+static char sme_cmdline_off[] __initdata = "off";
+
+static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
+{
+ unsigned long pgd_start, pgd_end, pgd_size;
+ pgd_t *pgd_p;
+
+ pgd_start = ppd->vaddr & PGDIR_MASK;
+ pgd_end = ppd->vaddr_end & PGDIR_MASK;
+
+ pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t);
+
+ pgd_p = ppd->pgd + pgd_index(ppd->vaddr);
+
+ memset(pgd_p, 0, pgd_size);
+}
+
+static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = ppd->pgd + pgd_index(ppd->vaddr);
+ if (pgd_none(*pgd)) {
+ p4d = ppd->pgtable_area;
+ memset(p4d, 0, sizeof(*p4d) * PTRS_PER_P4D);
+ ppd->pgtable_area += sizeof(*p4d) * PTRS_PER_P4D;
+ set_pgd(pgd, __pgd(PGD_FLAGS | __pa(p4d)));
+ }
+
+ p4d = p4d_offset(pgd, ppd->vaddr);
+ if (p4d_none(*p4d)) {
+ pud = ppd->pgtable_area;
+ memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD);
+ ppd->pgtable_area += sizeof(*pud) * PTRS_PER_PUD;
+ set_p4d(p4d, __p4d(P4D_FLAGS | __pa(pud)));
+ }
+
+ pud = pud_offset(p4d, ppd->vaddr);
+ if (pud_none(*pud)) {
+ pmd = ppd->pgtable_area;
+ memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD);
+ ppd->pgtable_area += sizeof(*pmd) * PTRS_PER_PMD;
+ set_pud(pud, __pud(PUD_FLAGS | __pa(pmd)));
+ }
+
+ if (pud_large(*pud))
+ return NULL;
+
+ return pud;
+}
+
+static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pud = sme_prepare_pgd(ppd);
+ if (!pud)
+ return;
+
+ pmd = pmd_offset(pud, ppd->vaddr);
+ if (pmd_large(*pmd))
+ return;
+
+ set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags));
+}
+
+static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pud = sme_prepare_pgd(ppd);
+ if (!pud)
+ return;
+
+ pmd = pmd_offset(pud, ppd->vaddr);
+ if (pmd_none(*pmd)) {
+ pte = ppd->pgtable_area;
+ memset(pte, 0, sizeof(*pte) * PTRS_PER_PTE);
+ ppd->pgtable_area += sizeof(*pte) * PTRS_PER_PTE;
+ set_pmd(pmd, __pmd(PMD_FLAGS | __pa(pte)));
+ }
+
+ if (pmd_large(*pmd))
+ return;
+
+ pte = pte_offset_map(pmd, ppd->vaddr);
+ if (pte_none(*pte))
+ set_pte(pte, __pte(ppd->paddr | ppd->pte_flags));
+}
+
+static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
+{
+ while (ppd->vaddr < ppd->vaddr_end) {
+ sme_populate_pgd_large(ppd);
+
+ ppd->vaddr += PMD_PAGE_SIZE;
+ ppd->paddr += PMD_PAGE_SIZE;
+ }
+}
+
+static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
+{
+ while (ppd->vaddr < ppd->vaddr_end) {
+ sme_populate_pgd(ppd);
+
+ ppd->vaddr += PAGE_SIZE;
+ ppd->paddr += PAGE_SIZE;
+ }
+}
+
+static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
+ pmdval_t pmd_flags, pteval_t pte_flags)
+{
+ unsigned long vaddr_end;
+
+ ppd->pmd_flags = pmd_flags;
+ ppd->pte_flags = pte_flags;
+
+ /* Save original end value since we modify the struct value */
+ vaddr_end = ppd->vaddr_end;
+
+ /* If start is not 2MB aligned, create PTE entries */
+ ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE);
+ __sme_map_range_pte(ppd);
+
+ /* Create PMD entries */
+ ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK;
+ __sme_map_range_pmd(ppd);
+
+ /* If end is not 2MB aligned, create PTE entries */
+ ppd->vaddr_end = vaddr_end;
+ __sme_map_range_pte(ppd);
+}
+
+static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
+{
+ __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
+}
+
+static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
+{
+ __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
+}
+
+static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
+{
+ __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP);
+}
+
+static unsigned long __init sme_pgtable_calc(unsigned long len)
+{
+ unsigned long entries = 0, tables = 0;
+
+ /*
+ * Perform a relatively simplistic calculation of the pagetable
+ * entries that are needed. Those mappings will be covered mostly
+ * by 2MB PMD entries so we can conservatively calculate the required
+ * number of P4D, PUD and PMD structures needed to perform the
+ * mappings. For mappings that are not 2MB aligned, PTE mappings
+ * would be needed for the start and end portion of the address range
+ * that fall outside of the 2MB alignment. This results in, at most,
+ * two extra pages to hold PTE entries for each range that is mapped.
+ * Incrementing the count for each covers the case where the addresses
+ * cross entries.
+ */
+
+ /* PGDIR_SIZE is equal to P4D_SIZE on 4-level machine. */
+ if (PTRS_PER_P4D > 1)
+ entries += (DIV_ROUND_UP(len, PGDIR_SIZE) + 1) * sizeof(p4d_t) * PTRS_PER_P4D;
+ entries += (DIV_ROUND_UP(len, P4D_SIZE) + 1) * sizeof(pud_t) * PTRS_PER_PUD;
+ entries += (DIV_ROUND_UP(len, PUD_SIZE) + 1) * sizeof(pmd_t) * PTRS_PER_PMD;
+ entries += 2 * sizeof(pte_t) * PTRS_PER_PTE;
+
+ /*
+ * Now calculate the added pagetable structures needed to populate
+ * the new pagetables.
+ */
+
+ if (PTRS_PER_P4D > 1)
+ tables += DIV_ROUND_UP(entries, PGDIR_SIZE) * sizeof(p4d_t) * PTRS_PER_P4D;
+ tables += DIV_ROUND_UP(entries, P4D_SIZE) * sizeof(pud_t) * PTRS_PER_PUD;
+ tables += DIV_ROUND_UP(entries, PUD_SIZE) * sizeof(pmd_t) * PTRS_PER_PMD;
+
+ return entries + tables;
+}
+
+void __init sme_encrypt_kernel(struct boot_params *bp)
+{
+ unsigned long workarea_start, workarea_end, workarea_len;
+ unsigned long execute_start, execute_end, execute_len;
+ unsigned long kernel_start, kernel_end, kernel_len;
+ unsigned long initrd_start, initrd_end, initrd_len;
+ struct sme_populate_pgd_data ppd;
+ unsigned long pgtable_area_len;
+ unsigned long decrypted_base;
+
+ if (!sme_active())
+ return;
+
+ /*
+ * Prepare for encrypting the kernel and initrd by building new
+ * pagetables with the necessary attributes needed to encrypt the
+ * kernel in place.
+ *
+ * One range of virtual addresses will map the memory occupied
+ * by the kernel and initrd as encrypted.
+ *
+ * Another range of virtual addresses will map the memory occupied
+ * by the kernel and initrd as decrypted and write-protected.
+ *
+ * The use of write-protect attribute will prevent any of the
+ * memory from being cached.
+ */
+
+ /* Physical addresses gives us the identity mapped virtual addresses */
+ kernel_start = __pa_symbol(_text);
+ kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
+ kernel_len = kernel_end - kernel_start;
+
+ initrd_start = 0;
+ initrd_end = 0;
+ initrd_len = 0;
+#ifdef CONFIG_BLK_DEV_INITRD
+ initrd_len = (unsigned long)bp->hdr.ramdisk_size |
+ ((unsigned long)bp->ext_ramdisk_size << 32);
+ if (initrd_len) {
+ initrd_start = (unsigned long)bp->hdr.ramdisk_image |
+ ((unsigned long)bp->ext_ramdisk_image << 32);
+ initrd_end = PAGE_ALIGN(initrd_start + initrd_len);
+ initrd_len = initrd_end - initrd_start;
+ }
+#endif
+
+ /* Set the encryption workarea to be immediately after the kernel */
+ workarea_start = kernel_end;
+
+ /*
+ * Calculate required number of workarea bytes needed:
+ * executable encryption area size:
+ * stack page (PAGE_SIZE)
+ * encryption routine page (PAGE_SIZE)
+ * intermediate copy buffer (PMD_PAGE_SIZE)
+ * pagetable structures for the encryption of the kernel
+ * pagetable structures for workarea (in case not currently mapped)
+ */
+ execute_start = workarea_start;
+ execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
+ execute_len = execute_end - execute_start;
+
+ /*
+ * One PGD for both encrypted and decrypted mappings and a set of
+ * PUDs and PMDs for each of the encrypted and decrypted mappings.
+ */
+ pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
+ pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
+ if (initrd_len)
+ pgtable_area_len += sme_pgtable_calc(initrd_len) * 2;
+
+ /* PUDs and PMDs needed in the current pagetables for the workarea */
+ pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);
+
+ /*
+ * The total workarea includes the executable encryption area and
+ * the pagetable area. The start of the workarea is already 2MB
+ * aligned, align the end of the workarea on a 2MB boundary so that
+ * we don't try to create/allocate PTE entries from the workarea
+ * before it is mapped.
+ */
+ workarea_len = execute_len + pgtable_area_len;
+ workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE);
+
+ /*
+ * Set the address to the start of where newly created pagetable
+ * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable
+ * structures are created when the workarea is added to the current
+ * pagetables and when the new encrypted and decrypted kernel
+ * mappings are populated.
+ */
+ ppd.pgtable_area = (void *)execute_end;
+
+ /*
+ * Make sure the current pagetable structure has entries for
+ * addressing the workarea.
+ */
+ ppd.pgd = (pgd_t *)native_read_cr3_pa();
+ ppd.paddr = workarea_start;
+ ppd.vaddr = workarea_start;
+ ppd.vaddr_end = workarea_end;
+ sme_map_range_decrypted(&ppd);
+
+ /* Flush the TLB - no globals so cr3 is enough */
+ native_write_cr3(__native_read_cr3());
+
+ /*
+ * A new pagetable structure is being built to allow for the kernel
+ * and initrd to be encrypted. It starts with an empty PGD that will
+ * then be populated with new PUDs and PMDs as the encrypted and
+ * decrypted kernel mappings are created.
+ */
+ ppd.pgd = ppd.pgtable_area;
+ memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD);
+ ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;
+
+ /*
+ * A different PGD index/entry must be used to get different
+ * pagetable entries for the decrypted mapping. Choose the next
+ * PGD index and convert it to a virtual address to be used as
+ * the base of the mapping.
+ */
+ decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
+ if (initrd_len) {
+ unsigned long check_base;
+
+ check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1);
+ decrypted_base = max(decrypted_base, check_base);
+ }
+ decrypted_base <<= PGDIR_SHIFT;
+
+ /* Add encrypted kernel (identity) mappings */
+ ppd.paddr = kernel_start;
+ ppd.vaddr = kernel_start;
+ ppd.vaddr_end = kernel_end;
+ sme_map_range_encrypted(&ppd);
+
+ /* Add decrypted, write-protected kernel (non-identity) mappings */
+ ppd.paddr = kernel_start;
+ ppd.vaddr = kernel_start + decrypted_base;
+ ppd.vaddr_end = kernel_end + decrypted_base;
+ sme_map_range_decrypted_wp(&ppd);
+
+ if (initrd_len) {
+ /* Add encrypted initrd (identity) mappings */
+ ppd.paddr = initrd_start;
+ ppd.vaddr = initrd_start;
+ ppd.vaddr_end = initrd_end;
+ sme_map_range_encrypted(&ppd);
+ /*
+ * Add decrypted, write-protected initrd (non-identity) mappings
+ */
+ ppd.paddr = initrd_start;
+ ppd.vaddr = initrd_start + decrypted_base;
+ ppd.vaddr_end = initrd_end + decrypted_base;
+ sme_map_range_decrypted_wp(&ppd);
+ }
+
+ /* Add decrypted workarea mappings to both kernel mappings */
+ ppd.paddr = workarea_start;
+ ppd.vaddr = workarea_start;
+ ppd.vaddr_end = workarea_end;
+ sme_map_range_decrypted(&ppd);
+
+ ppd.paddr = workarea_start;
+ ppd.vaddr = workarea_start + decrypted_base;
+ ppd.vaddr_end = workarea_end + decrypted_base;
+ sme_map_range_decrypted(&ppd);
+
+ /* Perform the encryption */
+ sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
+ kernel_len, workarea_start, (unsigned long)ppd.pgd);
+
+ if (initrd_len)
+ sme_encrypt_execute(initrd_start, initrd_start + decrypted_base,
+ initrd_len, workarea_start,
+ (unsigned long)ppd.pgd);
+
+ /*
+ * At this point we are running encrypted. Remove the mappings for
+ * the decrypted areas - all that is needed for this is to remove
+ * the PGD entry/entries.
+ */
+ ppd.vaddr = kernel_start + decrypted_base;
+ ppd.vaddr_end = kernel_end + decrypted_base;
+ sme_clear_pgd(&ppd);
+
+ if (initrd_len) {
+ ppd.vaddr = initrd_start + decrypted_base;
+ ppd.vaddr_end = initrd_end + decrypted_base;
+ sme_clear_pgd(&ppd);
+ }
+
+ ppd.vaddr = workarea_start + decrypted_base;
+ ppd.vaddr_end = workarea_end + decrypted_base;
+ sme_clear_pgd(&ppd);
+
+ /* Flush the TLB - no globals so cr3 is enough */
+ native_write_cr3(__native_read_cr3());
+}
+
+void __init sme_enable(struct boot_params *bp)
+{
+ const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
+ unsigned int eax, ebx, ecx, edx;
+ unsigned long feature_mask;
+ bool active_by_default;
+ unsigned long me_mask;
+ char buffer[16];
+ u64 msr;
+
+ /* Check for the SME/SEV support leaf */
+ eax = 0x80000000;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (eax < 0x8000001f)
+ return;
+
+#define AMD_SME_BIT BIT(0)
+#define AMD_SEV_BIT BIT(1)
+ /*
+ * Set the feature mask (SME or SEV) based on whether we are
+ * running under a hypervisor.
+ */
+ eax = 1;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT;
+
+ /*
+ * Check for the SME/SEV feature:
+ * CPUID Fn8000_001F[EAX]
+ * - Bit 0 - Secure Memory Encryption support
+ * - Bit 1 - Secure Encrypted Virtualization support
+ * CPUID Fn8000_001F[EBX]
+ * - Bits 5:0 - Pagetable bit position used to indicate encryption
+ */
+ eax = 0x8000001f;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (!(eax & feature_mask))
+ return;
+
+ me_mask = 1UL << (ebx & 0x3f);
+
+ /* Check if memory encryption is enabled */
+ if (feature_mask == AMD_SME_BIT) {
+ /* For SME, check the SYSCFG MSR */
+ msr = __rdmsr(MSR_K8_SYSCFG);
+ if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ return;
+ } else {
+ /* For SEV, check the SEV MSR */
+ msr = __rdmsr(MSR_AMD64_SEV);
+ if (!(msr & MSR_AMD64_SEV_ENABLED))
+ return;
+
+ /* SEV state cannot be controlled by a command line option */
+ sme_me_mask = me_mask;
+ sev_enabled = true;
+ physical_mask &= ~sme_me_mask;
+ return;
+ }
+
+ /*
+ * Fixups have not been applied to phys_base yet and we're running
+ * identity mapped, so we must obtain the address to the SME command
+ * line argument data using rip-relative addressing.
+ */
+ asm ("lea sme_cmdline_arg(%%rip), %0"
+ : "=r" (cmdline_arg)
+ : "p" (sme_cmdline_arg));
+ asm ("lea sme_cmdline_on(%%rip), %0"
+ : "=r" (cmdline_on)
+ : "p" (sme_cmdline_on));
+ asm ("lea sme_cmdline_off(%%rip), %0"
+ : "=r" (cmdline_off)
+ : "p" (sme_cmdline_off));
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
+ active_by_default = true;
+ else
+ active_by_default = false;
+
+ cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
+ ((u64)bp->ext_cmd_line_ptr << 32));
+
+ cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer));
+
+ if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
+ sme_me_mask = me_mask;
+ else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
+ sme_me_mask = 0;
+ else
+ sme_me_mask = active_by_default ? me_mask : 0;
+
+ physical_mask &= ~sme_me_mask;
+}
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
new file mode 100644
index 000000000..4e1f6e1b8
--- /dev/null
+++ b/arch/x86/mm/mm_internal.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_MM_INTERNAL_H
+#define __X86_MM_INTERNAL_H
+
+void *alloc_low_pages(unsigned int num);
+static inline void *alloc_low_page(void)
+{
+ return alloc_low_pages(1);
+}
+
+void early_ioremap_page_table_range_init(void);
+
+unsigned long kernel_physical_mapping_init(unsigned long start,
+ unsigned long end,
+ unsigned long page_size_mask);
+void zone_sizes_init(void);
+
+extern int after_bootmem;
+
+void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache);
+
+#endif /* __X86_MM_INTERNAL_H */
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
new file mode 100644
index 000000000..b69f7d428
--- /dev/null
+++ b/arch/x86/mm/mmap.c
@@ -0,0 +1,263 @@
+/*
+ * Flexible mmap layout support
+ *
+ * Based on code by Ingo Molnar and Andi Kleen, copyrighted
+ * as follows:
+ *
+ * Copyright 2003-2009 Red Hat Inc.
+ * All Rights Reserved.
+ * Copyright 2005 Andi Kleen, SUSE Labs.
+ * Copyright 2007 Jiri Kosina, SUSE Labs.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/personality.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/limits.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/mm.h>
+#include <linux/compat.h>
+#include <asm/elf.h>
+
+#include "physaddr.h"
+
+struct va_alignment __read_mostly va_align = {
+ .flags = -1,
+};
+
+unsigned long task_size_32bit(void)
+{
+ return IA32_PAGE_OFFSET;
+}
+
+unsigned long task_size_64bit(int full_addr_space)
+{
+ return full_addr_space ? TASK_SIZE_MAX : DEFAULT_MAP_WINDOW;
+}
+
+static unsigned long stack_maxrandom_size(unsigned long task_size)
+{
+ unsigned long max = 0;
+ if (current->flags & PF_RANDOMIZE) {
+ max = (-1UL) & __STACK_RND_MASK(task_size == task_size_32bit());
+ max <<= PAGE_SHIFT;
+ }
+
+ return max;
+}
+
+#ifdef CONFIG_COMPAT
+# define mmap32_rnd_bits mmap_rnd_compat_bits
+# define mmap64_rnd_bits mmap_rnd_bits
+#else
+# define mmap32_rnd_bits mmap_rnd_bits
+# define mmap64_rnd_bits mmap_rnd_bits
+#endif
+
+#define SIZE_128M (128 * 1024 * 1024UL)
+
+static int mmap_is_legacy(void)
+{
+ if (current->personality & ADDR_COMPAT_LAYOUT)
+ return 1;
+
+ return sysctl_legacy_va_layout;
+}
+
+static unsigned long arch_rnd(unsigned int rndbits)
+{
+ if (!(current->flags & PF_RANDOMIZE))
+ return 0;
+ return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT;
+}
+
+unsigned long arch_mmap_rnd(void)
+{
+ return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits);
+}
+
+static unsigned long mmap_base(unsigned long rnd, unsigned long task_size,
+ struct rlimit *rlim_stack)
+{
+ unsigned long gap = rlim_stack->rlim_cur;
+ unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap;
+ unsigned long gap_min, gap_max;
+
+ /* Values close to RLIM_INFINITY can overflow. */
+ if (gap + pad > gap)
+ gap += pad;
+
+ /*
+ * Top of mmap area (just below the process stack).
+ * Leave an at least ~128 MB hole with possible stack randomization.
+ */
+ gap_min = SIZE_128M;
+ gap_max = (task_size / 6) * 5;
+
+ if (gap < gap_min)
+ gap = gap_min;
+ else if (gap > gap_max)
+ gap = gap_max;
+
+ return PAGE_ALIGN(task_size - gap - rnd);
+}
+
+static unsigned long mmap_legacy_base(unsigned long rnd,
+ unsigned long task_size)
+{
+ return __TASK_UNMAPPED_BASE(task_size) + rnd;
+}
+
+/*
+ * This function, called very early during the creation of a new
+ * process VM image, sets up which VM layout function to use:
+ */
+static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base,
+ unsigned long random_factor, unsigned long task_size,
+ struct rlimit *rlim_stack)
+{
+ *legacy_base = mmap_legacy_base(random_factor, task_size);
+ if (mmap_is_legacy())
+ *base = *legacy_base;
+ else
+ *base = mmap_base(random_factor, task_size, rlim_stack);
+}
+
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
+{
+ if (mmap_is_legacy())
+ mm->get_unmapped_area = arch_get_unmapped_area;
+ else
+ mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+
+ arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
+ arch_rnd(mmap64_rnd_bits), task_size_64bit(0),
+ rlim_stack);
+
+#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
+ /*
+ * The mmap syscall mapping base decision depends solely on the
+ * syscall type (64-bit or compat). This applies for 64bit
+ * applications and 32bit applications. The 64bit syscall uses
+ * mmap_base, the compat syscall uses mmap_compat_base.
+ */
+ arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base,
+ arch_rnd(mmap32_rnd_bits), task_size_32bit(),
+ rlim_stack);
+#endif
+}
+
+unsigned long get_mmap_base(int is_legacy)
+{
+ struct mm_struct *mm = current->mm;
+
+#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
+ if (in_compat_syscall()) {
+ return is_legacy ? mm->mmap_compat_legacy_base
+ : mm->mmap_compat_base;
+ }
+#endif
+ return is_legacy ? mm->mmap_legacy_base : mm->mmap_base;
+}
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_MPX)
+ return "[mpx]";
+ return NULL;
+}
+
+/**
+ * mmap_address_hint_valid - Validate the address hint of mmap
+ * @addr: Address hint
+ * @len: Mapping length
+ *
+ * Check whether @addr and @addr + @len result in a valid mapping.
+ *
+ * On 32bit this only checks whether @addr + @len is <= TASK_SIZE.
+ *
+ * On 64bit with 5-level page tables another sanity check is required
+ * because mappings requested by mmap(@addr, 0) which cross the 47-bit
+ * virtual address boundary can cause the following theoretical issue:
+ *
+ * An application calls mmap(addr, 0), i.e. without MAP_FIXED, where @addr
+ * is below the border of the 47-bit address space and @addr + @len is
+ * above the border.
+ *
+ * With 4-level paging this request succeeds, but the resulting mapping
+ * address will always be within the 47-bit virtual address space, because
+ * the hint address does not result in a valid mapping and is
+ * ignored. Hence applications which are not prepared to handle virtual
+ * addresses above 47-bit work correctly.
+ *
+ * With 5-level paging this request would be granted and result in a
+ * mapping which crosses the border of the 47-bit virtual address
+ * space. If the application cannot handle addresses above 47-bit this
+ * will lead to misbehaviour and hard to diagnose failures.
+ *
+ * Therefore ignore address hints which would result in a mapping crossing
+ * the 47-bit virtual address boundary.
+ *
+ * Note, that in the same scenario with MAP_FIXED the behaviour is
+ * different. The request with @addr < 47-bit and @addr + @len > 47-bit
+ * fails on a 4-level paging machine but succeeds on a 5-level paging
+ * machine. It is reasonable to expect that an application does not rely on
+ * the failure of such a fixed mapping request, so the restriction is not
+ * applied.
+ */
+bool mmap_address_hint_valid(unsigned long addr, unsigned long len)
+{
+ if (TASK_SIZE - len < addr)
+ return false;
+
+ return (addr > DEFAULT_MAP_WINDOW) == (addr + len > DEFAULT_MAP_WINDOW);
+}
+
+/* Can we access it for direct reading/writing? Must be RAM: */
+int valid_phys_addr_range(phys_addr_t addr, size_t count)
+{
+ return addr + count - 1 <= __pa(high_memory - 1);
+}
+
+/* Can we access it through mmap? Must be a valid physical address: */
+int valid_mmap_phys_addr_range(unsigned long pfn, size_t count)
+{
+ phys_addr_t addr = (phys_addr_t)pfn << PAGE_SHIFT;
+
+ return phys_addr_valid(addr + count - 1);
+}
+
+/*
+ * Only allow root to set high MMIO mappings to PROT_NONE.
+ * This prevents an unpriv. user to set them to PROT_NONE and invert
+ * them, then pointing to valid memory for L1TF speculation.
+ *
+ * Note: for locked down kernels may want to disable the root override.
+ */
+bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
+{
+ if (!boot_cpu_has_bug(X86_BUG_L1TF))
+ return true;
+ if (!__pte_needs_invert(pgprot_val(prot)))
+ return true;
+ /* If it's real memory always allow */
+ if (pfn_valid(pfn))
+ return true;
+ if (pfn >= l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN))
+ return false;
+ return true;
+}
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
new file mode 100644
index 000000000..e32b003e0
--- /dev/null
+++ b/arch/x86/mm/mmio-mod.c
@@ -0,0 +1,477 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2005
+ * Jeff Muizelaar, 2006, 2007
+ * Pekka Paalanen, 2008 <pq@iki.fi>
+ *
+ * Derived from the read-mod example from relay-examples by Tom Zanussi.
+ */
+
+#define pr_fmt(fmt) "mmiotrace: " fmt
+
+#define DEBUG 1
+
+#include <linux/moduleparam.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <asm/pgtable.h>
+#include <linux/mmiotrace.h>
+#include <asm/e820/api.h> /* for ISA_START_ADDRESS */
+#include <linux/atomic.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+
+#include "pf_in.h"
+
+struct trap_reason {
+ unsigned long addr;
+ unsigned long ip;
+ enum reason_type type;
+ int active_traces;
+};
+
+struct remap_trace {
+ struct list_head list;
+ struct kmmio_probe probe;
+ resource_size_t phys;
+ unsigned long id;
+};
+
+/* Accessed per-cpu. */
+static DEFINE_PER_CPU(struct trap_reason, pf_reason);
+static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
+
+static DEFINE_MUTEX(mmiotrace_mutex);
+static DEFINE_SPINLOCK(trace_lock);
+static atomic_t mmiotrace_enabled;
+static LIST_HEAD(trace_list); /* struct remap_trace */
+
+/*
+ * Locking in this file:
+ * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections.
+ * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex
+ * and trace_lock.
+ * - Routines depending on is_enabled() must take trace_lock.
+ * - trace_list users must hold trace_lock.
+ * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed.
+ * - pre/post callbacks assume the effect of is_enabled() being true.
+ */
+
+/* module parameters */
+static unsigned long filter_offset;
+static bool nommiotrace;
+static bool trace_pc;
+
+module_param(filter_offset, ulong, 0);
+module_param(nommiotrace, bool, 0);
+module_param(trace_pc, bool, 0);
+
+MODULE_PARM_DESC(filter_offset, "Start address of traced mappings.");
+MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing.");
+MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions.");
+
+static bool is_enabled(void)
+{
+ return atomic_read(&mmiotrace_enabled);
+}
+
+static void print_pte(unsigned long address)
+{
+ unsigned int level;
+ pte_t *pte = lookup_address(address, &level);
+
+ if (!pte) {
+ pr_err("Error in %s: no pte for page 0x%08lx\n",
+ __func__, address);
+ return;
+ }
+
+ if (level == PG_LEVEL_2M) {
+ pr_emerg("4MB pages are not currently supported: 0x%08lx\n",
+ address);
+ BUG();
+ }
+ pr_info("pte for 0x%lx: 0x%llx 0x%llx\n",
+ address,
+ (unsigned long long)pte_val(*pte),
+ (unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
+}
+
+/*
+ * For some reason the pre/post pairs have been called in an
+ * unmatched order. Report and die.
+ */
+static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
+{
+ const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
+ pr_emerg("unexpected fault for address: 0x%08lx, last fault for address: 0x%08lx\n",
+ addr, my_reason->addr);
+ print_pte(addr);
+ pr_emerg("faulting IP is at %pS\n", (void *)regs->ip);
+ pr_emerg("last faulting IP was at %pS\n", (void *)my_reason->ip);
+#ifdef __i386__
+ pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
+ regs->ax, regs->bx, regs->cx, regs->dx);
+ pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
+ regs->si, regs->di, regs->bp, regs->sp);
+#else
+ pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n",
+ regs->ax, regs->cx, regs->dx);
+ pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n",
+ regs->si, regs->di, regs->bp, regs->sp);
+#endif
+ put_cpu_var(pf_reason);
+ BUG();
+}
+
+static void pre(struct kmmio_probe *p, struct pt_regs *regs,
+ unsigned long addr)
+{
+ struct trap_reason *my_reason = &get_cpu_var(pf_reason);
+ struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
+ const unsigned long instptr = instruction_pointer(regs);
+ const enum reason_type type = get_ins_type(instptr);
+ struct remap_trace *trace = p->private;
+
+ /* it doesn't make sense to have more than one active trace per cpu */
+ if (my_reason->active_traces)
+ die_kmmio_nesting_error(regs, addr);
+ else
+ my_reason->active_traces++;
+
+ my_reason->type = type;
+ my_reason->addr = addr;
+ my_reason->ip = instptr;
+
+ my_trace->phys = addr - trace->probe.addr + trace->phys;
+ my_trace->map_id = trace->id;
+
+ /*
+ * Only record the program counter when requested.
+ * It may taint clean-room reverse engineering.
+ */
+ if (trace_pc)
+ my_trace->pc = instptr;
+ else
+ my_trace->pc = 0;
+
+ /*
+ * XXX: the timestamp recorded will be *after* the tracing has been
+ * done, not at the time we hit the instruction. SMP implications
+ * on event ordering?
+ */
+
+ switch (type) {
+ case REG_READ:
+ my_trace->opcode = MMIO_READ;
+ my_trace->width = get_ins_mem_width(instptr);
+ break;
+ case REG_WRITE:
+ my_trace->opcode = MMIO_WRITE;
+ my_trace->width = get_ins_mem_width(instptr);
+ my_trace->value = get_ins_reg_val(instptr, regs);
+ break;
+ case IMM_WRITE:
+ my_trace->opcode = MMIO_WRITE;
+ my_trace->width = get_ins_mem_width(instptr);
+ my_trace->value = get_ins_imm_val(instptr);
+ break;
+ default:
+ {
+ unsigned char *ip = (unsigned char *)instptr;
+ my_trace->opcode = MMIO_UNKNOWN_OP;
+ my_trace->width = 0;
+ my_trace->value = (*ip) << 16 | *(ip + 1) << 8 |
+ *(ip + 2);
+ }
+ }
+ put_cpu_var(cpu_trace);
+ put_cpu_var(pf_reason);
+}
+
+static void post(struct kmmio_probe *p, unsigned long condition,
+ struct pt_regs *regs)
+{
+ struct trap_reason *my_reason = &get_cpu_var(pf_reason);
+ struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
+
+ /* this should always return the active_trace count to 0 */
+ my_reason->active_traces--;
+ if (my_reason->active_traces) {
+ pr_emerg("unexpected post handler");
+ BUG();
+ }
+
+ switch (my_reason->type) {
+ case REG_READ:
+ my_trace->value = get_ins_reg_val(my_reason->ip, regs);
+ break;
+ default:
+ break;
+ }
+
+ mmio_trace_rw(my_trace);
+ put_cpu_var(cpu_trace);
+ put_cpu_var(pf_reason);
+}
+
+static void ioremap_trace_core(resource_size_t offset, unsigned long size,
+ void __iomem *addr)
+{
+ static atomic_t next_id;
+ struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL);
+ /* These are page-unaligned. */
+ struct mmiotrace_map map = {
+ .phys = offset,
+ .virt = (unsigned long)addr,
+ .len = size,
+ .opcode = MMIO_PROBE
+ };
+
+ if (!trace) {
+ pr_err("kmalloc failed in ioremap\n");
+ return;
+ }
+
+ *trace = (struct remap_trace) {
+ .probe = {
+ .addr = (unsigned long)addr,
+ .len = size,
+ .pre_handler = pre,
+ .post_handler = post,
+ .private = trace
+ },
+ .phys = offset,
+ .id = atomic_inc_return(&next_id)
+ };
+ map.map_id = trace->id;
+
+ spin_lock_irq(&trace_lock);
+ if (!is_enabled()) {
+ kfree(trace);
+ goto not_enabled;
+ }
+
+ mmio_trace_mapping(&map);
+ list_add_tail(&trace->list, &trace_list);
+ if (!nommiotrace)
+ register_kmmio_probe(&trace->probe);
+
+not_enabled:
+ spin_unlock_irq(&trace_lock);
+}
+
+void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
+ void __iomem *addr)
+{
+ if (!is_enabled()) /* recheck and proper locking in *_core() */
+ return;
+
+ pr_debug("ioremap_*(0x%llx, 0x%lx) = %p\n",
+ (unsigned long long)offset, size, addr);
+ if ((filter_offset) && (offset != filter_offset))
+ return;
+ ioremap_trace_core(offset, size, addr);
+}
+
+static void iounmap_trace_core(volatile void __iomem *addr)
+{
+ struct mmiotrace_map map = {
+ .phys = 0,
+ .virt = (unsigned long)addr,
+ .len = 0,
+ .opcode = MMIO_UNPROBE
+ };
+ struct remap_trace *trace;
+ struct remap_trace *tmp;
+ struct remap_trace *found_trace = NULL;
+
+ pr_debug("Unmapping %p.\n", addr);
+
+ spin_lock_irq(&trace_lock);
+ if (!is_enabled())
+ goto not_enabled;
+
+ list_for_each_entry_safe(trace, tmp, &trace_list, list) {
+ if ((unsigned long)addr == trace->probe.addr) {
+ if (!nommiotrace)
+ unregister_kmmio_probe(&trace->probe);
+ list_del(&trace->list);
+ found_trace = trace;
+ break;
+ }
+ }
+ map.map_id = (found_trace) ? found_trace->id : -1;
+ mmio_trace_mapping(&map);
+
+not_enabled:
+ spin_unlock_irq(&trace_lock);
+ if (found_trace) {
+ synchronize_rcu(); /* unregister_kmmio_probe() requirement */
+ kfree(found_trace);
+ }
+}
+
+void mmiotrace_iounmap(volatile void __iomem *addr)
+{
+ might_sleep();
+ if (is_enabled()) /* recheck and proper locking in *_core() */
+ iounmap_trace_core(addr);
+}
+
+int mmiotrace_printk(const char *fmt, ...)
+{
+ int ret = 0;
+ va_list args;
+ unsigned long flags;
+ va_start(args, fmt);
+
+ spin_lock_irqsave(&trace_lock, flags);
+ if (is_enabled())
+ ret = mmio_trace_printk(fmt, args);
+ spin_unlock_irqrestore(&trace_lock, flags);
+
+ va_end(args);
+ return ret;
+}
+EXPORT_SYMBOL(mmiotrace_printk);
+
+static void clear_trace_list(void)
+{
+ struct remap_trace *trace;
+ struct remap_trace *tmp;
+
+ /*
+ * No locking required, because the caller ensures we are in a
+ * critical section via mutex, and is_enabled() is false,
+ * i.e. nothing can traverse or modify this list.
+ * Caller also ensures is_enabled() cannot change.
+ */
+ list_for_each_entry(trace, &trace_list, list) {
+ pr_notice("purging non-iounmapped trace @0x%08lx, size 0x%lx.\n",
+ trace->probe.addr, trace->probe.len);
+ if (!nommiotrace)
+ unregister_kmmio_probe(&trace->probe);
+ }
+ synchronize_rcu(); /* unregister_kmmio_probe() requirement */
+
+ list_for_each_entry_safe(trace, tmp, &trace_list, list) {
+ list_del(&trace->list);
+ kfree(trace);
+ }
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static cpumask_var_t downed_cpus;
+
+static void enter_uniprocessor(void)
+{
+ int cpu;
+ int err;
+
+ if (!cpumask_available(downed_cpus) &&
+ !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) {
+ pr_notice("Failed to allocate mask\n");
+ goto out;
+ }
+
+ get_online_cpus();
+ cpumask_copy(downed_cpus, cpu_online_mask);
+ cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus);
+ if (num_online_cpus() > 1)
+ pr_notice("Disabling non-boot CPUs...\n");
+ put_online_cpus();
+
+ for_each_cpu(cpu, downed_cpus) {
+ err = cpu_down(cpu);
+ if (!err)
+ pr_info("CPU%d is down.\n", cpu);
+ else
+ pr_err("Error taking CPU%d down: %d\n", cpu, err);
+ }
+out:
+ if (num_online_cpus() > 1)
+ pr_warning("multiple CPUs still online, may miss events.\n");
+}
+
+static void leave_uniprocessor(void)
+{
+ int cpu;
+ int err;
+
+ if (!cpumask_available(downed_cpus) || cpumask_weight(downed_cpus) == 0)
+ return;
+ pr_notice("Re-enabling CPUs...\n");
+ for_each_cpu(cpu, downed_cpus) {
+ err = cpu_up(cpu);
+ if (!err)
+ pr_info("enabled CPU%d.\n", cpu);
+ else
+ pr_err("cannot re-enable CPU%d: %d\n", cpu, err);
+ }
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+static void enter_uniprocessor(void)
+{
+ if (num_online_cpus() > 1)
+ pr_warning("multiple CPUs are online, may miss events. "
+ "Suggest booting with maxcpus=1 kernel argument.\n");
+}
+
+static void leave_uniprocessor(void)
+{
+}
+#endif
+
+void enable_mmiotrace(void)
+{
+ mutex_lock(&mmiotrace_mutex);
+ if (is_enabled())
+ goto out;
+
+ if (nommiotrace)
+ pr_info("MMIO tracing disabled.\n");
+ kmmio_init();
+ enter_uniprocessor();
+ spin_lock_irq(&trace_lock);
+ atomic_inc(&mmiotrace_enabled);
+ spin_unlock_irq(&trace_lock);
+ pr_info("enabled.\n");
+out:
+ mutex_unlock(&mmiotrace_mutex);
+}
+
+void disable_mmiotrace(void)
+{
+ mutex_lock(&mmiotrace_mutex);
+ if (!is_enabled())
+ goto out;
+
+ spin_lock_irq(&trace_lock);
+ atomic_dec(&mmiotrace_enabled);
+ BUG_ON(is_enabled());
+ spin_unlock_irq(&trace_lock);
+
+ clear_trace_list(); /* guarantees: no more kmmio callbacks */
+ leave_uniprocessor();
+ kmmio_cleanup();
+ pr_info("disabled.\n");
+out:
+ mutex_unlock(&mmiotrace_mutex);
+}
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
new file mode 100644
index 000000000..e500949ba
--- /dev/null
+++ b/arch/x86/mm/mpx.c
@@ -0,0 +1,948 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * mpx.c - Memory Protection eXtensions
+ *
+ * Copyright (c) 2014, Intel Corporation.
+ * Qiaowei Ren <qiaowei.ren@intel.com>
+ * Dave Hansen <dave.hansen@intel.com>
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm_types.h>
+#include <linux/syscalls.h>
+#include <linux/sched/sysctl.h>
+
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <asm/mman.h>
+#include <asm/mmu_context.h>
+#include <asm/mpx.h>
+#include <asm/processor.h>
+#include <asm/fpu/internal.h>
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/mpx.h>
+
+static inline unsigned long mpx_bd_size_bytes(struct mm_struct *mm)
+{
+ if (is_64bit_mm(mm))
+ return MPX_BD_SIZE_BYTES_64;
+ else
+ return MPX_BD_SIZE_BYTES_32;
+}
+
+static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm)
+{
+ if (is_64bit_mm(mm))
+ return MPX_BT_SIZE_BYTES_64;
+ else
+ return MPX_BT_SIZE_BYTES_32;
+}
+
+/*
+ * This is really a simplified "vm_mmap". it only handles MPX
+ * bounds tables (the bounds directory is user-allocated).
+ */
+static unsigned long mpx_mmap(unsigned long len)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr, populate;
+
+ /* Only bounds table can be allocated here */
+ if (len != mpx_bt_size_bytes(mm))
+ return -EINVAL;
+
+ down_write(&mm->mmap_sem);
+ addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate, NULL);
+ up_write(&mm->mmap_sem);
+ if (populate)
+ mm_populate(addr, populate);
+
+ return addr;
+}
+
+static int mpx_insn_decode(struct insn *insn,
+ struct pt_regs *regs)
+{
+ unsigned char buf[MAX_INSN_SIZE];
+ int x86_64 = !test_thread_flag(TIF_IA32);
+ int not_copied;
+ int nr_copied;
+
+ not_copied = copy_from_user(buf, (void __user *)regs->ip, sizeof(buf));
+ nr_copied = sizeof(buf) - not_copied;
+ /*
+ * The decoder _should_ fail nicely if we pass it a short buffer.
+ * But, let's not depend on that implementation detail. If we
+ * did not get anything, just error out now.
+ */
+ if (!nr_copied)
+ return -EFAULT;
+ insn_init(insn, buf, nr_copied, x86_64);
+ insn_get_length(insn);
+ /*
+ * copy_from_user() tries to get as many bytes as we could see in
+ * the largest possible instruction. If the instruction we are
+ * after is shorter than that _and_ we attempt to copy from
+ * something unreadable, we might get a short read. This is OK
+ * as long as the read did not stop in the middle of the
+ * instruction. Check to see if we got a partial instruction.
+ */
+ if (nr_copied < insn->length)
+ return -EFAULT;
+
+ insn_get_opcode(insn);
+ /*
+ * We only _really_ need to decode bndcl/bndcn/bndcu
+ * Error out on anything else.
+ */
+ if (insn->opcode.bytes[0] != 0x0f)
+ goto bad_opcode;
+ if ((insn->opcode.bytes[1] != 0x1a) &&
+ (insn->opcode.bytes[1] != 0x1b))
+ goto bad_opcode;
+
+ return 0;
+bad_opcode:
+ return -EINVAL;
+}
+
+/*
+ * If a bounds overflow occurs then a #BR is generated. This
+ * function decodes MPX instructions to get violation address
+ * and set this address into extended struct siginfo.
+ *
+ * Note that this is not a super precise way of doing this.
+ * Userspace could have, by the time we get here, written
+ * anything it wants in to the instructions. We can not
+ * trust anything about it. They might not be valid
+ * instructions or might encode invalid registers, etc...
+ *
+ * The caller is expected to kfree() the returned siginfo_t.
+ */
+siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
+{
+ const struct mpx_bndreg_state *bndregs;
+ const struct mpx_bndreg *bndreg;
+ siginfo_t *info = NULL;
+ struct insn insn;
+ uint8_t bndregno;
+ int err;
+
+ err = mpx_insn_decode(&insn, regs);
+ if (err)
+ goto err_out;
+
+ /*
+ * We know at this point that we are only dealing with
+ * MPX instructions.
+ */
+ insn_get_modrm(&insn);
+ bndregno = X86_MODRM_REG(insn.modrm.value);
+ if (bndregno > 3) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ /* get bndregs field from current task's xsave area */
+ bndregs = get_xsave_field_ptr(XFEATURE_MASK_BNDREGS);
+ if (!bndregs) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ /* now go select the individual register in the set of 4 */
+ bndreg = &bndregs->bndreg[bndregno];
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ /*
+ * The registers are always 64-bit, but the upper 32
+ * bits are ignored in 32-bit mode. Also, note that the
+ * upper bounds are architecturally represented in 1's
+ * complement form.
+ *
+ * The 'unsigned long' cast is because the compiler
+ * complains when casting from integers to different-size
+ * pointers.
+ */
+ info->si_lower = (void __user *)(unsigned long)bndreg->lower_bound;
+ info->si_upper = (void __user *)(unsigned long)~bndreg->upper_bound;
+ info->si_addr_lsb = 0;
+ info->si_signo = SIGSEGV;
+ info->si_errno = 0;
+ info->si_code = SEGV_BNDERR;
+ info->si_addr = insn_get_addr_ref(&insn, regs);
+ /*
+ * We were not able to extract an address from the instruction,
+ * probably because there was something invalid in it.
+ */
+ if (info->si_addr == (void __user *)-1) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ trace_mpx_bounds_register_exception(info->si_addr, bndreg);
+ return info;
+err_out:
+ /* info might be NULL, but kfree() handles that */
+ kfree(info);
+ return ERR_PTR(err);
+}
+
+static __user void *mpx_get_bounds_dir(void)
+{
+ const struct mpx_bndcsr *bndcsr;
+
+ if (!cpu_feature_enabled(X86_FEATURE_MPX))
+ return MPX_INVALID_BOUNDS_DIR;
+
+ /*
+ * The bounds directory pointer is stored in a register
+ * only accessible if we first do an xsave.
+ */
+ bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
+ if (!bndcsr)
+ return MPX_INVALID_BOUNDS_DIR;
+
+ /*
+ * Make sure the register looks valid by checking the
+ * enable bit.
+ */
+ if (!(bndcsr->bndcfgu & MPX_BNDCFG_ENABLE_FLAG))
+ return MPX_INVALID_BOUNDS_DIR;
+
+ /*
+ * Lastly, mask off the low bits used for configuration
+ * flags, and return the address of the bounds table.
+ */
+ return (void __user *)(unsigned long)
+ (bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK);
+}
+
+int mpx_enable_management(void)
+{
+ void __user *bd_base = MPX_INVALID_BOUNDS_DIR;
+ struct mm_struct *mm = current->mm;
+ int ret = 0;
+
+ /*
+ * runtime in the userspace will be responsible for allocation of
+ * the bounds directory. Then, it will save the base of the bounds
+ * directory into XSAVE/XRSTOR Save Area and enable MPX through
+ * XRSTOR instruction.
+ *
+ * The copy_xregs_to_kernel() beneath get_xsave_field_ptr() is
+ * expected to be relatively expensive. Storing the bounds
+ * directory here means that we do not have to do xsave in the
+ * unmap path; we can just use mm->context.bd_addr instead.
+ */
+ bd_base = mpx_get_bounds_dir();
+ down_write(&mm->mmap_sem);
+
+ /* MPX doesn't support addresses above 47 bits yet. */
+ if (find_vma(mm, DEFAULT_MAP_WINDOW)) {
+ pr_warn_once("%s (%d): MPX cannot handle addresses "
+ "above 47-bits. Disabling.",
+ current->comm, current->pid);
+ ret = -ENXIO;
+ goto out;
+ }
+ mm->context.bd_addr = bd_base;
+ if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR)
+ ret = -ENXIO;
+out:
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+
+int mpx_disable_management(void)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (!cpu_feature_enabled(X86_FEATURE_MPX))
+ return -ENXIO;
+
+ down_write(&mm->mmap_sem);
+ mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR;
+ up_write(&mm->mmap_sem);
+ return 0;
+}
+
+static int mpx_cmpxchg_bd_entry(struct mm_struct *mm,
+ unsigned long *curval,
+ unsigned long __user *addr,
+ unsigned long old_val, unsigned long new_val)
+{
+ int ret;
+ /*
+ * user_atomic_cmpxchg_inatomic() actually uses sizeof()
+ * the pointer that we pass to it to figure out how much
+ * data to cmpxchg. We have to be careful here not to
+ * pass a pointer to a 64-bit data type when we only want
+ * a 32-bit copy.
+ */
+ if (is_64bit_mm(mm)) {
+ ret = user_atomic_cmpxchg_inatomic(curval,
+ addr, old_val, new_val);
+ } else {
+ u32 uninitialized_var(curval_32);
+ u32 old_val_32 = old_val;
+ u32 new_val_32 = new_val;
+ u32 __user *addr_32 = (u32 __user *)addr;
+
+ ret = user_atomic_cmpxchg_inatomic(&curval_32,
+ addr_32, old_val_32, new_val_32);
+ *curval = curval_32;
+ }
+ return ret;
+}
+
+/*
+ * With 32-bit mode, a bounds directory is 4MB, and the size of each
+ * bounds table is 16KB. With 64-bit mode, a bounds directory is 2GB,
+ * and the size of each bounds table is 4MB.
+ */
+static int allocate_bt(struct mm_struct *mm, long __user *bd_entry)
+{
+ unsigned long expected_old_val = 0;
+ unsigned long actual_old_val = 0;
+ unsigned long bt_addr;
+ unsigned long bd_new_entry;
+ int ret = 0;
+
+ /*
+ * Carve the virtual space out of userspace for the new
+ * bounds table:
+ */
+ bt_addr = mpx_mmap(mpx_bt_size_bytes(mm));
+ if (IS_ERR((void *)bt_addr))
+ return PTR_ERR((void *)bt_addr);
+ /*
+ * Set the valid flag (kinda like _PAGE_PRESENT in a pte)
+ */
+ bd_new_entry = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
+
+ /*
+ * Go poke the address of the new bounds table in to the
+ * bounds directory entry out in userspace memory. Note:
+ * we may race with another CPU instantiating the same table.
+ * In that case the cmpxchg will see an unexpected
+ * 'actual_old_val'.
+ *
+ * This can fault, but that's OK because we do not hold
+ * mmap_sem at this point, unlike some of the other part
+ * of the MPX code that have to pagefault_disable().
+ */
+ ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val, bd_entry,
+ expected_old_val, bd_new_entry);
+ if (ret)
+ goto out_unmap;
+
+ /*
+ * The user_atomic_cmpxchg_inatomic() will only return nonzero
+ * for faults, *not* if the cmpxchg itself fails. Now we must
+ * verify that the cmpxchg itself completed successfully.
+ */
+ /*
+ * We expected an empty 'expected_old_val', but instead found
+ * an apparently valid entry. Assume we raced with another
+ * thread to instantiate this table and desclare succecss.
+ */
+ if (actual_old_val & MPX_BD_ENTRY_VALID_FLAG) {
+ ret = 0;
+ goto out_unmap;
+ }
+ /*
+ * We found a non-empty bd_entry but it did not have the
+ * VALID_FLAG set. Return an error which will result in
+ * a SEGV since this probably means that somebody scribbled
+ * some invalid data in to a bounds table.
+ */
+ if (expected_old_val != actual_old_val) {
+ ret = -EINVAL;
+ goto out_unmap;
+ }
+ trace_mpx_new_bounds_table(bt_addr);
+ return 0;
+out_unmap:
+ vm_munmap(bt_addr, mpx_bt_size_bytes(mm));
+ return ret;
+}
+
+/*
+ * When a BNDSTX instruction attempts to save bounds to a bounds
+ * table, it will first attempt to look up the table in the
+ * first-level bounds directory. If it does not find a table in
+ * the directory, a #BR is generated and we get here in order to
+ * allocate a new table.
+ *
+ * With 32-bit mode, the size of BD is 4MB, and the size of each
+ * bound table is 16KB. With 64-bit mode, the size of BD is 2GB,
+ * and the size of each bound table is 4MB.
+ */
+static int do_mpx_bt_fault(void)
+{
+ unsigned long bd_entry, bd_base;
+ const struct mpx_bndcsr *bndcsr;
+ struct mm_struct *mm = current->mm;
+
+ bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
+ if (!bndcsr)
+ return -EINVAL;
+ /*
+ * Mask off the preserve and enable bits
+ */
+ bd_base = bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK;
+ /*
+ * The hardware provides the address of the missing or invalid
+ * entry via BNDSTATUS, so we don't have to go look it up.
+ */
+ bd_entry = bndcsr->bndstatus & MPX_BNDSTA_ADDR_MASK;
+ /*
+ * Make sure the directory entry is within where we think
+ * the directory is.
+ */
+ if ((bd_entry < bd_base) ||
+ (bd_entry >= bd_base + mpx_bd_size_bytes(mm)))
+ return -EINVAL;
+
+ return allocate_bt(mm, (long __user *)bd_entry);
+}
+
+int mpx_handle_bd_fault(void)
+{
+ /*
+ * Userspace never asked us to manage the bounds tables,
+ * so refuse to help.
+ */
+ if (!kernel_managing_mpx_tables(current->mm))
+ return -EINVAL;
+
+ return do_mpx_bt_fault();
+}
+
+/*
+ * A thin wrapper around get_user_pages(). Returns 0 if the
+ * fault was resolved or -errno if not.
+ */
+static int mpx_resolve_fault(long __user *addr, int write)
+{
+ long gup_ret;
+ int nr_pages = 1;
+
+ gup_ret = get_user_pages((unsigned long)addr, nr_pages,
+ write ? FOLL_WRITE : 0, NULL, NULL);
+ /*
+ * get_user_pages() returns number of pages gotten.
+ * 0 means we failed to fault in and get anything,
+ * probably because 'addr' is bad.
+ */
+ if (!gup_ret)
+ return -EFAULT;
+ /* Other error, return it */
+ if (gup_ret < 0)
+ return gup_ret;
+ /* must have gup'd a page and gup_ret>0, success */
+ return 0;
+}
+
+static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm,
+ unsigned long bd_entry)
+{
+ unsigned long bt_addr = bd_entry;
+ int align_to_bytes;
+ /*
+ * Bit 0 in a bt_entry is always the valid bit.
+ */
+ bt_addr &= ~MPX_BD_ENTRY_VALID_FLAG;
+ /*
+ * Tables are naturally aligned at 8-byte boundaries
+ * on 64-bit and 4-byte boundaries on 32-bit. The
+ * documentation makes it appear that the low bits
+ * are ignored by the hardware, so we do the same.
+ */
+ if (is_64bit_mm(mm))
+ align_to_bytes = 8;
+ else
+ align_to_bytes = 4;
+ bt_addr &= ~(align_to_bytes-1);
+ return bt_addr;
+}
+
+/*
+ * We only want to do a 4-byte get_user() on 32-bit. Otherwise,
+ * we might run off the end of the bounds table if we are on
+ * a 64-bit kernel and try to get 8 bytes.
+ */
+static int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret,
+ long __user *bd_entry_ptr)
+{
+ u32 bd_entry_32;
+ int ret;
+
+ if (is_64bit_mm(mm))
+ return get_user(*bd_entry_ret, bd_entry_ptr);
+
+ /*
+ * Note that get_user() uses the type of the *pointer* to
+ * establish the size of the get, not the destination.
+ */
+ ret = get_user(bd_entry_32, (u32 __user *)bd_entry_ptr);
+ *bd_entry_ret = bd_entry_32;
+ return ret;
+}
+
+/*
+ * Get the base of bounds tables pointed by specific bounds
+ * directory entry.
+ */
+static int get_bt_addr(struct mm_struct *mm,
+ long __user *bd_entry_ptr,
+ unsigned long *bt_addr_result)
+{
+ int ret;
+ int valid_bit;
+ unsigned long bd_entry;
+ unsigned long bt_addr;
+
+ if (!access_ok(VERIFY_READ, (bd_entry_ptr), sizeof(*bd_entry_ptr)))
+ return -EFAULT;
+
+ while (1) {
+ int need_write = 0;
+
+ pagefault_disable();
+ ret = get_user_bd_entry(mm, &bd_entry, bd_entry_ptr);
+ pagefault_enable();
+ if (!ret)
+ break;
+ if (ret == -EFAULT)
+ ret = mpx_resolve_fault(bd_entry_ptr, need_write);
+ /*
+ * If we could not resolve the fault, consider it
+ * userspace's fault and error out.
+ */
+ if (ret)
+ return ret;
+ }
+
+ valid_bit = bd_entry & MPX_BD_ENTRY_VALID_FLAG;
+ bt_addr = mpx_bd_entry_to_bt_addr(mm, bd_entry);
+
+ /*
+ * When the kernel is managing bounds tables, a bounds directory
+ * entry will either have a valid address (plus the valid bit)
+ * *OR* be completely empty. If we see a !valid entry *and* some
+ * data in the address field, we know something is wrong. This
+ * -EINVAL return will cause a SIGSEGV.
+ */
+ if (!valid_bit && bt_addr)
+ return -EINVAL;
+ /*
+ * Do we have an completely zeroed bt entry? That is OK. It
+ * just means there was no bounds table for this memory. Make
+ * sure to distinguish this from -EINVAL, which will cause
+ * a SEGV.
+ */
+ if (!valid_bit)
+ return -ENOENT;
+
+ *bt_addr_result = bt_addr;
+ return 0;
+}
+
+static inline int bt_entry_size_bytes(struct mm_struct *mm)
+{
+ if (is_64bit_mm(mm))
+ return MPX_BT_ENTRY_BYTES_64;
+ else
+ return MPX_BT_ENTRY_BYTES_32;
+}
+
+/*
+ * Take a virtual address and turns it in to the offset in bytes
+ * inside of the bounds table where the bounds table entry
+ * controlling 'addr' can be found.
+ */
+static unsigned long mpx_get_bt_entry_offset_bytes(struct mm_struct *mm,
+ unsigned long addr)
+{
+ unsigned long bt_table_nr_entries;
+ unsigned long offset = addr;
+
+ if (is_64bit_mm(mm)) {
+ /* Bottom 3 bits are ignored on 64-bit */
+ offset >>= 3;
+ bt_table_nr_entries = MPX_BT_NR_ENTRIES_64;
+ } else {
+ /* Bottom 2 bits are ignored on 32-bit */
+ offset >>= 2;
+ bt_table_nr_entries = MPX_BT_NR_ENTRIES_32;
+ }
+ /*
+ * We know the size of the table in to which we are
+ * indexing, and we have eliminated all the low bits
+ * which are ignored for indexing.
+ *
+ * Mask out all the high bits which we do not need
+ * to index in to the table. Note that the tables
+ * are always powers of two so this gives us a proper
+ * mask.
+ */
+ offset &= (bt_table_nr_entries-1);
+ /*
+ * We now have an entry offset in terms of *entries* in
+ * the table. We need to scale it back up to bytes.
+ */
+ offset *= bt_entry_size_bytes(mm);
+ return offset;
+}
+
+/*
+ * How much virtual address space does a single bounds
+ * directory entry cover?
+ *
+ * Note, we need a long long because 4GB doesn't fit in
+ * to a long on 32-bit.
+ */
+static inline unsigned long bd_entry_virt_space(struct mm_struct *mm)
+{
+ unsigned long long virt_space;
+ unsigned long long GB = (1ULL << 30);
+
+ /*
+ * This covers 32-bit emulation as well as 32-bit kernels
+ * running on 64-bit hardware.
+ */
+ if (!is_64bit_mm(mm))
+ return (4ULL * GB) / MPX_BD_NR_ENTRIES_32;
+
+ /*
+ * 'x86_virt_bits' returns what the hardware is capable
+ * of, and returns the full >32-bit address space when
+ * running 32-bit kernels on 64-bit hardware.
+ */
+ virt_space = (1ULL << boot_cpu_data.x86_virt_bits);
+ return virt_space / MPX_BD_NR_ENTRIES_64;
+}
+
+/*
+ * Free the backing physical pages of bounds table 'bt_addr'.
+ * Assume start...end is within that bounds table.
+ */
+static noinline int zap_bt_entries_mapping(struct mm_struct *mm,
+ unsigned long bt_addr,
+ unsigned long start_mapping, unsigned long end_mapping)
+{
+ struct vm_area_struct *vma;
+ unsigned long addr, len;
+ unsigned long start;
+ unsigned long end;
+
+ /*
+ * if we 'end' on a boundary, the offset will be 0 which
+ * is not what we want. Back it up a byte to get the
+ * last bt entry. Then once we have the entry itself,
+ * move 'end' back up by the table entry size.
+ */
+ start = bt_addr + mpx_get_bt_entry_offset_bytes(mm, start_mapping);
+ end = bt_addr + mpx_get_bt_entry_offset_bytes(mm, end_mapping - 1);
+ /*
+ * Move end back up by one entry. Among other things
+ * this ensures that it remains page-aligned and does
+ * not screw up zap_page_range()
+ */
+ end += bt_entry_size_bytes(mm);
+
+ /*
+ * Find the first overlapping vma. If vma->vm_start > start, there
+ * will be a hole in the bounds table. This -EINVAL return will
+ * cause a SIGSEGV.
+ */
+ vma = find_vma(mm, start);
+ if (!vma || vma->vm_start > start)
+ return -EINVAL;
+
+ /*
+ * A NUMA policy on a VM_MPX VMA could cause this bounds table to
+ * be split. So we need to look across the entire 'start -> end'
+ * range of this bounds table, find all of the VM_MPX VMAs, and
+ * zap only those.
+ */
+ addr = start;
+ while (vma && vma->vm_start < end) {
+ /*
+ * We followed a bounds directory entry down
+ * here. If we find a non-MPX VMA, that's bad,
+ * so stop immediately and return an error. This
+ * probably results in a SIGSEGV.
+ */
+ if (!(vma->vm_flags & VM_MPX))
+ return -EINVAL;
+
+ len = min(vma->vm_end, end) - addr;
+ zap_page_range(vma, addr, len);
+ trace_mpx_unmap_zap(addr, addr+len);
+
+ vma = vma->vm_next;
+ addr = vma->vm_start;
+ }
+ return 0;
+}
+
+static unsigned long mpx_get_bd_entry_offset(struct mm_struct *mm,
+ unsigned long addr)
+{
+ /*
+ * There are several ways to derive the bd offsets. We
+ * use the following approach here:
+ * 1. We know the size of the virtual address space
+ * 2. We know the number of entries in a bounds table
+ * 3. We know that each entry covers a fixed amount of
+ * virtual address space.
+ * So, we can just divide the virtual address by the
+ * virtual space used by one entry to determine which
+ * entry "controls" the given virtual address.
+ */
+ if (is_64bit_mm(mm)) {
+ int bd_entry_size = 8; /* 64-bit pointer */
+ /*
+ * Take the 64-bit addressing hole in to account.
+ */
+ addr &= ((1UL << boot_cpu_data.x86_virt_bits) - 1);
+ return (addr / bd_entry_virt_space(mm)) * bd_entry_size;
+ } else {
+ int bd_entry_size = 4; /* 32-bit pointer */
+ /*
+ * 32-bit has no hole so this case needs no mask
+ */
+ return (addr / bd_entry_virt_space(mm)) * bd_entry_size;
+ }
+ /*
+ * The two return calls above are exact copies. If we
+ * pull out a single copy and put it in here, gcc won't
+ * realize that we're doing a power-of-2 divide and use
+ * shifts. It uses a real divide. If we put them up
+ * there, it manages to figure it out (gcc 4.8.3).
+ */
+}
+
+static int unmap_entire_bt(struct mm_struct *mm,
+ long __user *bd_entry, unsigned long bt_addr)
+{
+ unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
+ unsigned long uninitialized_var(actual_old_val);
+ int ret;
+
+ while (1) {
+ int need_write = 1;
+ unsigned long cleared_bd_entry = 0;
+
+ pagefault_disable();
+ ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val,
+ bd_entry, expected_old_val, cleared_bd_entry);
+ pagefault_enable();
+ if (!ret)
+ break;
+ if (ret == -EFAULT)
+ ret = mpx_resolve_fault(bd_entry, need_write);
+ /*
+ * If we could not resolve the fault, consider it
+ * userspace's fault and error out.
+ */
+ if (ret)
+ return ret;
+ }
+ /*
+ * The cmpxchg was performed, check the results.
+ */
+ if (actual_old_val != expected_old_val) {
+ /*
+ * Someone else raced with us to unmap the table.
+ * That is OK, since we were both trying to do
+ * the same thing. Declare success.
+ */
+ if (!actual_old_val)
+ return 0;
+ /*
+ * Something messed with the bounds directory
+ * entry. We hold mmap_sem for read or write
+ * here, so it could not be a _new_ bounds table
+ * that someone just allocated. Something is
+ * wrong, so pass up the error and SIGSEGV.
+ */
+ return -EINVAL;
+ }
+ /*
+ * Note, we are likely being called under do_munmap() already. To
+ * avoid recursion, do_munmap() will check whether it comes
+ * from one bounds table through VM_MPX flag.
+ */
+ return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm), NULL);
+}
+
+static int try_unmap_single_bt(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct vm_area_struct *next;
+ struct vm_area_struct *prev;
+ /*
+ * "bta" == Bounds Table Area: the area controlled by the
+ * bounds table that we are unmapping.
+ */
+ unsigned long bta_start_vaddr = start & ~(bd_entry_virt_space(mm)-1);
+ unsigned long bta_end_vaddr = bta_start_vaddr + bd_entry_virt_space(mm);
+ unsigned long uninitialized_var(bt_addr);
+ void __user *bde_vaddr;
+ int ret;
+ /*
+ * We already unlinked the VMAs from the mm's rbtree so 'start'
+ * is guaranteed to be in a hole. This gets us the first VMA
+ * before the hole in to 'prev' and the next VMA after the hole
+ * in to 'next'.
+ */
+ next = find_vma_prev(mm, start, &prev);
+ /*
+ * Do not count other MPX bounds table VMAs as neighbors.
+ * Although theoretically possible, we do not allow bounds
+ * tables for bounds tables so our heads do not explode.
+ * If we count them as neighbors here, we may end up with
+ * lots of tables even though we have no actual table
+ * entries in use.
+ */
+ while (next && (next->vm_flags & VM_MPX))
+ next = next->vm_next;
+ while (prev && (prev->vm_flags & VM_MPX))
+ prev = prev->vm_prev;
+ /*
+ * We know 'start' and 'end' lie within an area controlled
+ * by a single bounds table. See if there are any other
+ * VMAs controlled by that bounds table. If there are not
+ * then we can "expand" the are we are unmapping to possibly
+ * cover the entire table.
+ */
+ next = find_vma_prev(mm, start, &prev);
+ if ((!prev || prev->vm_end <= bta_start_vaddr) &&
+ (!next || next->vm_start >= bta_end_vaddr)) {
+ /*
+ * No neighbor VMAs controlled by same bounds
+ * table. Try to unmap the whole thing
+ */
+ start = bta_start_vaddr;
+ end = bta_end_vaddr;
+ }
+
+ bde_vaddr = mm->context.bd_addr + mpx_get_bd_entry_offset(mm, start);
+ ret = get_bt_addr(mm, bde_vaddr, &bt_addr);
+ /*
+ * No bounds table there, so nothing to unmap.
+ */
+ if (ret == -ENOENT) {
+ ret = 0;
+ return 0;
+ }
+ if (ret)
+ return ret;
+ /*
+ * We are unmapping an entire table. Either because the
+ * unmap that started this whole process was large enough
+ * to cover an entire table, or that the unmap was small
+ * but was the area covered by a bounds table.
+ */
+ if ((start == bta_start_vaddr) &&
+ (end == bta_end_vaddr))
+ return unmap_entire_bt(mm, bde_vaddr, bt_addr);
+ return zap_bt_entries_mapping(mm, bt_addr, start, end);
+}
+
+static int mpx_unmap_tables(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ unsigned long one_unmap_start;
+ trace_mpx_unmap_search(start, end);
+
+ one_unmap_start = start;
+ while (one_unmap_start < end) {
+ int ret;
+ unsigned long next_unmap_start = ALIGN(one_unmap_start+1,
+ bd_entry_virt_space(mm));
+ unsigned long one_unmap_end = end;
+ /*
+ * if the end is beyond the current bounds table,
+ * move it back so we only deal with a single one
+ * at a time
+ */
+ if (one_unmap_end > next_unmap_start)
+ one_unmap_end = next_unmap_start;
+ ret = try_unmap_single_bt(mm, one_unmap_start, one_unmap_end);
+ if (ret)
+ return ret;
+
+ one_unmap_start = next_unmap_start;
+ }
+ return 0;
+}
+
+/*
+ * Free unused bounds tables covered in a virtual address region being
+ * munmap()ed. Assume end > start.
+ *
+ * This function will be called by do_munmap(), and the VMAs covering
+ * the virtual address region start...end have already been split if
+ * necessary, and the 'vma' is the first vma in this range (start -> end).
+ */
+void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ int ret;
+
+ /*
+ * Refuse to do anything unless userspace has asked
+ * the kernel to help manage the bounds tables,
+ */
+ if (!kernel_managing_mpx_tables(current->mm))
+ return;
+ /*
+ * This will look across the entire 'start -> end' range,
+ * and find all of the non-VM_MPX VMAs.
+ *
+ * To avoid recursion, if a VM_MPX vma is found in the range
+ * (start->end), we will not continue follow-up work. This
+ * recursion represents having bounds tables for bounds tables,
+ * which should not occur normally. Being strict about it here
+ * helps ensure that we do not have an exploitable stack overflow.
+ */
+ do {
+ if (vma->vm_flags & VM_MPX)
+ return;
+ vma = vma->vm_next;
+ } while (vma && vma->vm_start < end);
+
+ ret = mpx_unmap_tables(mm, start, end);
+ if (ret)
+ force_sig(SIGSEGV, current);
+}
+
+/* MPX cannot handle addresses above 47 bits yet. */
+unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
+ unsigned long flags)
+{
+ if (!kernel_managing_mpx_tables(current->mm))
+ return addr;
+ if (addr + len <= DEFAULT_MAP_WINDOW)
+ return addr;
+ if (flags & MAP_FIXED)
+ return -ENOMEM;
+
+ /*
+ * Requested len is larger than the whole area we're allowed to map in.
+ * Resetting hinting address wouldn't do much good -- fail early.
+ */
+ if (len > DEFAULT_MAP_WINDOW)
+ return -ENOMEM;
+
+ /* Look for unmap area within DEFAULT_MAP_WINDOW */
+ return 0;
+}
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
new file mode 100644
index 000000000..fa1508556
--- /dev/null
+++ b/arch/x86/mm/numa.c
@@ -0,0 +1,901 @@
+/* Common code for 32 and 64-bit NUMA */
+#include <linux/acpi.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/mmzone.h>
+#include <linux/ctype.h>
+#include <linux/nodemask.h>
+#include <linux/sched.h>
+#include <linux/topology.h>
+
+#include <asm/e820/api.h>
+#include <asm/proto.h>
+#include <asm/dma.h>
+#include <asm/amd_nb.h>
+
+#include "numa_internal.h"
+
+int numa_off;
+nodemask_t numa_nodes_parsed __initdata;
+
+struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
+EXPORT_SYMBOL(node_data);
+
+static struct numa_meminfo numa_meminfo
+#ifndef CONFIG_MEMORY_HOTPLUG
+__initdata
+#endif
+;
+
+static int numa_distance_cnt;
+static u8 *numa_distance;
+
+static __init int numa_setup(char *opt)
+{
+ if (!opt)
+ return -EINVAL;
+ if (!strncmp(opt, "off", 3))
+ numa_off = 1;
+#ifdef CONFIG_NUMA_EMU
+ if (!strncmp(opt, "fake=", 5))
+ numa_emu_cmdline(opt + 5);
+#endif
+#ifdef CONFIG_ACPI_NUMA
+ if (!strncmp(opt, "noacpi", 6))
+ acpi_numa = -1;
+#endif
+ return 0;
+}
+early_param("numa", numa_setup);
+
+/*
+ * apicid, cpu, node mappings
+ */
+s16 __apicid_to_node[MAX_LOCAL_APIC] = {
+ [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+
+int numa_cpu_node(int cpu)
+{
+ int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+
+ if (apicid != BAD_APICID)
+ return __apicid_to_node[apicid];
+ return NUMA_NO_NODE;
+}
+
+cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
+EXPORT_SYMBOL(node_to_cpumask_map);
+
+/*
+ * Map cpu index to node index
+ */
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+
+void numa_set_node(int cpu, int node)
+{
+ int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+
+ /* early setting, no percpu area yet */
+ if (cpu_to_node_map) {
+ cpu_to_node_map[cpu] = node;
+ return;
+ }
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+ if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
+ printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
+ dump_stack();
+ return;
+ }
+#endif
+ per_cpu(x86_cpu_to_node_map, cpu) = node;
+
+ set_cpu_numa_node(cpu, node);
+}
+
+void numa_clear_node(int cpu)
+{
+ numa_set_node(cpu, NUMA_NO_NODE);
+}
+
+/*
+ * Allocate node_to_cpumask_map based on number of available nodes
+ * Requires node_possible_map to be valid.
+ *
+ * Note: cpumask_of_node() is not valid until after this is done.
+ * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
+ */
+void __init setup_node_to_cpumask_map(void)
+{
+ unsigned int node;
+
+ /* setup nr_node_ids if not done yet */
+ if (nr_node_ids == MAX_NUMNODES)
+ setup_nr_node_ids();
+
+ /* allocate the map */
+ for (node = 0; node < nr_node_ids; node++)
+ alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
+
+ /* cpumask_of_node() will now work */
+ pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
+}
+
+static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
+ struct numa_meminfo *mi)
+{
+ /* ignore zero length blks */
+ if (start == end)
+ return 0;
+
+ /* whine about and ignore invalid blks */
+ if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
+ pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
+ nid, start, end - 1);
+ return 0;
+ }
+
+ if (mi->nr_blks >= NR_NODE_MEMBLKS) {
+ pr_err("too many memblk ranges\n");
+ return -EINVAL;
+ }
+
+ mi->blk[mi->nr_blks].start = start;
+ mi->blk[mi->nr_blks].end = end;
+ mi->blk[mi->nr_blks].nid = nid;
+ mi->nr_blks++;
+ return 0;
+}
+
+/**
+ * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
+ * @idx: Index of memblk to remove
+ * @mi: numa_meminfo to remove memblk from
+ *
+ * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
+ * decrementing @mi->nr_blks.
+ */
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
+{
+ mi->nr_blks--;
+ memmove(&mi->blk[idx], &mi->blk[idx + 1],
+ (mi->nr_blks - idx) * sizeof(mi->blk[0]));
+}
+
+/**
+ * numa_add_memblk - Add one numa_memblk to numa_meminfo
+ * @nid: NUMA node ID of the new memblk
+ * @start: Start address of the new memblk
+ * @end: End address of the new memblk
+ *
+ * Add a new memblk to the default numa_meminfo.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_add_memblk(int nid, u64 start, u64 end)
+{
+ return numa_add_memblk_to(nid, start, end, &numa_meminfo);
+}
+
+/* Allocate NODE_DATA for a node on the local memory */
+static void __init alloc_node_data(int nid)
+{
+ const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
+ u64 nd_pa;
+ void *nd;
+ int tnid;
+
+ /*
+ * Allocate node data. Try node-local memory and then any node.
+ * Never allocate in DMA zone.
+ */
+ nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+ if (!nd_pa) {
+ nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES,
+ MEMBLOCK_ALLOC_ACCESSIBLE);
+ if (!nd_pa) {
+ pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
+ nd_size, nid);
+ return;
+ }
+ }
+ nd = __va(nd_pa);
+
+ /* report and initialize */
+ printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
+ nd_pa, nd_pa + nd_size - 1);
+ tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
+ if (tnid != nid)
+ printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
+
+ node_data[nid] = nd;
+ memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
+
+ node_set_online(nid);
+}
+
+/**
+ * numa_cleanup_meminfo - Cleanup a numa_meminfo
+ * @mi: numa_meminfo to clean up
+ *
+ * Sanitize @mi by merging and removing unnecessary memblks. Also check for
+ * conflicts and clear unused memblks.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
+{
+ const u64 low = 0;
+ const u64 high = PFN_PHYS(max_pfn);
+ int i, j, k;
+
+ /* first, trim all entries */
+ for (i = 0; i < mi->nr_blks; i++) {
+ struct numa_memblk *bi = &mi->blk[i];
+
+ /* make sure all blocks are inside the limits */
+ bi->start = max(bi->start, low);
+ bi->end = min(bi->end, high);
+
+ /* and there's no empty or non-exist block */
+ if (bi->start >= bi->end ||
+ !memblock_overlaps_region(&memblock.memory,
+ bi->start, bi->end - bi->start))
+ numa_remove_memblk_from(i--, mi);
+ }
+
+ /* merge neighboring / overlapping entries */
+ for (i = 0; i < mi->nr_blks; i++) {
+ struct numa_memblk *bi = &mi->blk[i];
+
+ for (j = i + 1; j < mi->nr_blks; j++) {
+ struct numa_memblk *bj = &mi->blk[j];
+ u64 start, end;
+
+ /*
+ * See whether there are overlapping blocks. Whine
+ * about but allow overlaps of the same nid. They
+ * will be merged below.
+ */
+ if (bi->end > bj->start && bi->start < bj->end) {
+ if (bi->nid != bj->nid) {
+ pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
+ bi->nid, bi->start, bi->end - 1,
+ bj->nid, bj->start, bj->end - 1);
+ return -EINVAL;
+ }
+ pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
+ bi->nid, bi->start, bi->end - 1,
+ bj->start, bj->end - 1);
+ }
+
+ /*
+ * Join together blocks on the same node, holes
+ * between which don't overlap with memory on other
+ * nodes.
+ */
+ if (bi->nid != bj->nid)
+ continue;
+ start = min(bi->start, bj->start);
+ end = max(bi->end, bj->end);
+ for (k = 0; k < mi->nr_blks; k++) {
+ struct numa_memblk *bk = &mi->blk[k];
+
+ if (bi->nid == bk->nid)
+ continue;
+ if (start < bk->end && end > bk->start)
+ break;
+ }
+ if (k < mi->nr_blks)
+ continue;
+ printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
+ bi->nid, bi->start, bi->end - 1, bj->start,
+ bj->end - 1, start, end - 1);
+ bi->start = start;
+ bi->end = end;
+ numa_remove_memblk_from(j--, mi);
+ }
+ }
+
+ /* clear unused ones */
+ for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
+ mi->blk[i].start = mi->blk[i].end = 0;
+ mi->blk[i].nid = NUMA_NO_NODE;
+ }
+
+ return 0;
+}
+
+/*
+ * Set nodes, which have memory in @mi, in *@nodemask.
+ */
+static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
+ const struct numa_meminfo *mi)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
+ if (mi->blk[i].start != mi->blk[i].end &&
+ mi->blk[i].nid != NUMA_NO_NODE)
+ node_set(mi->blk[i].nid, *nodemask);
+}
+
+/**
+ * numa_reset_distance - Reset NUMA distance table
+ *
+ * The current table is freed. The next numa_set_distance() call will
+ * create a new one.
+ */
+void __init numa_reset_distance(void)
+{
+ size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
+
+ /* numa_distance could be 1LU marking allocation failure, test cnt */
+ if (numa_distance_cnt)
+ memblock_free(__pa(numa_distance), size);
+ numa_distance_cnt = 0;
+ numa_distance = NULL; /* enable table creation */
+}
+
+static int __init numa_alloc_distance(void)
+{
+ nodemask_t nodes_parsed;
+ size_t size;
+ int i, j, cnt = 0;
+ u64 phys;
+
+ /* size the new table and allocate it */
+ nodes_parsed = numa_nodes_parsed;
+ numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
+
+ for_each_node_mask(i, nodes_parsed)
+ cnt = i;
+ cnt++;
+ size = cnt * cnt * sizeof(numa_distance[0]);
+
+ phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
+ size, PAGE_SIZE);
+ if (!phys) {
+ pr_warn("Warning: can't allocate distance table!\n");
+ /* don't retry until explicitly reset */
+ numa_distance = (void *)1LU;
+ return -ENOMEM;
+ }
+ memblock_reserve(phys, size);
+
+ numa_distance = __va(phys);
+ numa_distance_cnt = cnt;
+
+ /* fill with the default distances */
+ for (i = 0; i < cnt; i++)
+ for (j = 0; j < cnt; j++)
+ numa_distance[i * cnt + j] = i == j ?
+ LOCAL_DISTANCE : REMOTE_DISTANCE;
+ printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
+
+ return 0;
+}
+
+/**
+ * numa_set_distance - Set NUMA distance from one NUMA to another
+ * @from: the 'from' node to set distance
+ * @to: the 'to' node to set distance
+ * @distance: NUMA distance
+ *
+ * Set the distance from node @from to @to to @distance. If distance table
+ * doesn't exist, one which is large enough to accommodate all the currently
+ * known nodes will be created.
+ *
+ * If such table cannot be allocated, a warning is printed and further
+ * calls are ignored until the distance table is reset with
+ * numa_reset_distance().
+ *
+ * If @from or @to is higher than the highest known node or lower than zero
+ * at the time of table creation or @distance doesn't make sense, the call
+ * is ignored.
+ * This is to allow simplification of specific NUMA config implementations.
+ */
+void __init numa_set_distance(int from, int to, int distance)
+{
+ if (!numa_distance && numa_alloc_distance() < 0)
+ return;
+
+ if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
+ from < 0 || to < 0) {
+ pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
+ from, to, distance);
+ return;
+ }
+
+ if ((u8)distance != distance ||
+ (from == to && distance != LOCAL_DISTANCE)) {
+ pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
+ from, to, distance);
+ return;
+ }
+
+ numa_distance[from * numa_distance_cnt + to] = distance;
+}
+
+int __node_distance(int from, int to)
+{
+ if (from >= numa_distance_cnt || to >= numa_distance_cnt)
+ return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
+ return numa_distance[from * numa_distance_cnt + to];
+}
+EXPORT_SYMBOL(__node_distance);
+
+/*
+ * Sanity check to catch more bad NUMA configurations (they are amazingly
+ * common). Make sure the nodes cover all memory.
+ */
+static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
+{
+ u64 numaram, e820ram;
+ int i;
+
+ numaram = 0;
+ for (i = 0; i < mi->nr_blks; i++) {
+ u64 s = mi->blk[i].start >> PAGE_SHIFT;
+ u64 e = mi->blk[i].end >> PAGE_SHIFT;
+ numaram += e - s;
+ numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
+ if ((s64)numaram < 0)
+ numaram = 0;
+ }
+
+ e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
+
+ /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
+ if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
+ printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
+ (numaram << PAGE_SHIFT) >> 20,
+ (e820ram << PAGE_SHIFT) >> 20);
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Mark all currently memblock-reserved physical memory (which covers the
+ * kernel's own memory ranges) as hot-unswappable.
+ */
+static void __init numa_clear_kernel_node_hotplug(void)
+{
+ nodemask_t reserved_nodemask = NODE_MASK_NONE;
+ struct memblock_region *mb_region;
+ int i;
+
+ /*
+ * We have to do some preprocessing of memblock regions, to
+ * make them suitable for reservation.
+ *
+ * At this time, all memory regions reserved by memblock are
+ * used by the kernel, but those regions are not split up
+ * along node boundaries yet, and don't necessarily have their
+ * node ID set yet either.
+ *
+ * So iterate over all memory known to the x86 architecture,
+ * and use those ranges to set the nid in memblock.reserved.
+ * This will split up the memblock regions along node
+ * boundaries and will set the node IDs as well.
+ */
+ for (i = 0; i < numa_meminfo.nr_blks; i++) {
+ struct numa_memblk *mb = numa_meminfo.blk + i;
+ int ret;
+
+ ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
+ WARN_ON_ONCE(ret);
+ }
+
+ /*
+ * Now go over all reserved memblock regions, to construct a
+ * node mask of all kernel reserved memory areas.
+ *
+ * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
+ * numa_meminfo might not include all memblock.reserved
+ * memory ranges, because quirks such as trim_snb_memory()
+ * reserve specific pages for Sandy Bridge graphics. ]
+ */
+ for_each_memblock(reserved, mb_region) {
+ if (mb_region->nid != MAX_NUMNODES)
+ node_set(mb_region->nid, reserved_nodemask);
+ }
+
+ /*
+ * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
+ * belonging to the reserved node mask.
+ *
+ * Note that this will include memory regions that reside
+ * on nodes that contain kernel memory - entire nodes
+ * become hot-unpluggable:
+ */
+ for (i = 0; i < numa_meminfo.nr_blks; i++) {
+ struct numa_memblk *mb = numa_meminfo.blk + i;
+
+ if (!node_isset(mb->nid, reserved_nodemask))
+ continue;
+
+ memblock_clear_hotplug(mb->start, mb->end - mb->start);
+ }
+}
+
+static int __init numa_register_memblks(struct numa_meminfo *mi)
+{
+ unsigned long uninitialized_var(pfn_align);
+ int i, nid;
+
+ /* Account for nodes with cpus and no memory */
+ node_possible_map = numa_nodes_parsed;
+ numa_nodemask_from_meminfo(&node_possible_map, mi);
+ if (WARN_ON(nodes_empty(node_possible_map)))
+ return -EINVAL;
+
+ for (i = 0; i < mi->nr_blks; i++) {
+ struct numa_memblk *mb = &mi->blk[i];
+ memblock_set_node(mb->start, mb->end - mb->start,
+ &memblock.memory, mb->nid);
+ }
+
+ /*
+ * At very early time, the kernel have to use some memory such as
+ * loading the kernel image. We cannot prevent this anyway. So any
+ * node the kernel resides in should be un-hotpluggable.
+ *
+ * And when we come here, alloc node data won't fail.
+ */
+ numa_clear_kernel_node_hotplug();
+
+ /*
+ * If sections array is gonna be used for pfn -> nid mapping, check
+ * whether its granularity is fine enough.
+ */
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+ pfn_align = node_map_pfn_alignment();
+ if (pfn_align && pfn_align < PAGES_PER_SECTION) {
+ printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
+ PFN_PHYS(pfn_align) >> 20,
+ PFN_PHYS(PAGES_PER_SECTION) >> 20);
+ return -EINVAL;
+ }
+#endif
+ if (!numa_meminfo_cover_memory(mi))
+ return -EINVAL;
+
+ /* Finally register nodes. */
+ for_each_node_mask(nid, node_possible_map) {
+ u64 start = PFN_PHYS(max_pfn);
+ u64 end = 0;
+
+ for (i = 0; i < mi->nr_blks; i++) {
+ if (nid != mi->blk[i].nid)
+ continue;
+ start = min(mi->blk[i].start, start);
+ end = max(mi->blk[i].end, end);
+ }
+
+ if (start >= end)
+ continue;
+
+ /*
+ * Don't confuse VM with a node that doesn't have the
+ * minimum amount of memory:
+ */
+ if (end && (end - start) < NODE_MIN_SIZE)
+ continue;
+
+ alloc_node_data(nid);
+ }
+
+ /* Dump memblock with node info and return. */
+ memblock_dump_all();
+ return 0;
+}
+
+/*
+ * There are unfortunately some poorly designed mainboards around that
+ * only connect memory to a single CPU. This breaks the 1:1 cpu->node
+ * mapping. To avoid this fill in the mapping for all possible CPUs,
+ * as the number of CPUs is not known yet. We round robin the existing
+ * nodes.
+ */
+static void __init numa_init_array(void)
+{
+ int rr, i;
+
+ rr = first_node(node_online_map);
+ for (i = 0; i < nr_cpu_ids; i++) {
+ if (early_cpu_to_node(i) != NUMA_NO_NODE)
+ continue;
+ numa_set_node(i, rr);
+ rr = next_node_in(rr, node_online_map);
+ }
+}
+
+static int __init numa_init(int (*init_func)(void))
+{
+ int i;
+ int ret;
+
+ for (i = 0; i < MAX_LOCAL_APIC; i++)
+ set_apicid_to_node(i, NUMA_NO_NODE);
+
+ nodes_clear(numa_nodes_parsed);
+ nodes_clear(node_possible_map);
+ nodes_clear(node_online_map);
+ memset(&numa_meminfo, 0, sizeof(numa_meminfo));
+ WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
+ MAX_NUMNODES));
+ WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
+ MAX_NUMNODES));
+ /* In case that parsing SRAT failed. */
+ WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
+ numa_reset_distance();
+
+ ret = init_func();
+ if (ret < 0)
+ return ret;
+
+ /*
+ * We reset memblock back to the top-down direction
+ * here because if we configured ACPI_NUMA, we have
+ * parsed SRAT in init_func(). It is ok to have the
+ * reset here even if we did't configure ACPI_NUMA
+ * or acpi numa init fails and fallbacks to dummy
+ * numa init.
+ */
+ memblock_set_bottom_up(false);
+
+ ret = numa_cleanup_meminfo(&numa_meminfo);
+ if (ret < 0)
+ return ret;
+
+ numa_emulation(&numa_meminfo, numa_distance_cnt);
+
+ ret = numa_register_memblks(&numa_meminfo);
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < nr_cpu_ids; i++) {
+ int nid = early_cpu_to_node(i);
+
+ if (nid == NUMA_NO_NODE)
+ continue;
+ if (!node_online(nid))
+ numa_clear_node(i);
+ }
+ numa_init_array();
+
+ return 0;
+}
+
+/**
+ * dummy_numa_init - Fallback dummy NUMA init
+ *
+ * Used if there's no underlying NUMA architecture, NUMA initialization
+ * fails, or NUMA is disabled on the command line.
+ *
+ * Must online at least one node and add memory blocks that cover all
+ * allowed memory. This function must not fail.
+ */
+static int __init dummy_numa_init(void)
+{
+ printk(KERN_INFO "%s\n",
+ numa_off ? "NUMA turned off" : "No NUMA configuration found");
+ printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
+ 0LLU, PFN_PHYS(max_pfn) - 1);
+
+ node_set(0, numa_nodes_parsed);
+ numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
+
+ return 0;
+}
+
+/**
+ * x86_numa_init - Initialize NUMA
+ *
+ * Try each configured NUMA initialization method until one succeeds. The
+ * last fallback is dummy single node config encomapssing whole memory and
+ * never fails.
+ */
+void __init x86_numa_init(void)
+{
+ if (!numa_off) {
+#ifdef CONFIG_ACPI_NUMA
+ if (!numa_init(x86_acpi_numa_init))
+ return;
+#endif
+#ifdef CONFIG_AMD_NUMA
+ if (!numa_init(amd_numa_init))
+ return;
+#endif
+ }
+
+ numa_init(dummy_numa_init);
+}
+
+static void __init init_memory_less_node(int nid)
+{
+ unsigned long zones_size[MAX_NR_ZONES] = {0};
+ unsigned long zholes_size[MAX_NR_ZONES] = {0};
+
+ /* Allocate and initialize node data. Memory-less node is now online.*/
+ alloc_node_data(nid);
+ free_area_init_node(nid, zones_size, 0, zholes_size);
+
+ /*
+ * All zonelists will be built later in start_kernel() after per cpu
+ * areas are initialized.
+ */
+}
+
+/*
+ * Setup early cpu_to_node.
+ *
+ * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
+ * and apicid_to_node[] tables have valid entries for a CPU.
+ * This means we skip cpu_to_node[] initialisation for NUMA
+ * emulation and faking node case (when running a kernel compiled
+ * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
+ * is already initialized in a round robin manner at numa_init_array,
+ * prior to this call, and this initialization is good enough
+ * for the fake NUMA cases.
+ *
+ * Called before the per_cpu areas are setup.
+ */
+void __init init_cpu_to_node(void)
+{
+ int cpu;
+ u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
+
+ BUG_ON(cpu_to_apicid == NULL);
+
+ for_each_possible_cpu(cpu) {
+ int node = numa_cpu_node(cpu);
+
+ if (node == NUMA_NO_NODE)
+ continue;
+
+ if (!node_online(node))
+ init_memory_less_node(node);
+
+ numa_set_node(cpu, node);
+ }
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+
+# ifndef CONFIG_NUMA_EMU
+void numa_add_cpu(int cpu)
+{
+ cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+void numa_remove_cpu(int cpu)
+{
+ cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+# endif /* !CONFIG_NUMA_EMU */
+
+#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+int __cpu_to_node(int cpu)
+{
+ if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
+ printk(KERN_WARNING
+ "cpu_to_node(%d): usage too early!\n", cpu);
+ dump_stack();
+ return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+ }
+ return per_cpu(x86_cpu_to_node_map, cpu);
+}
+EXPORT_SYMBOL(__cpu_to_node);
+
+/*
+ * Same function as cpu_to_node() but used if called before the
+ * per_cpu areas are setup.
+ */
+int early_cpu_to_node(int cpu)
+{
+ if (early_per_cpu_ptr(x86_cpu_to_node_map))
+ return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+
+ if (!cpu_possible(cpu)) {
+ printk(KERN_WARNING
+ "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
+ dump_stack();
+ return NUMA_NO_NODE;
+ }
+ return per_cpu(x86_cpu_to_node_map, cpu);
+}
+
+void debug_cpumask_set_cpu(int cpu, int node, bool enable)
+{
+ struct cpumask *mask;
+
+ if (node == NUMA_NO_NODE) {
+ /* early_cpu_to_node() already emits a warning and trace */
+ return;
+ }
+ mask = node_to_cpumask_map[node];
+ if (!mask) {
+ pr_err("node_to_cpumask_map[%i] NULL\n", node);
+ dump_stack();
+ return;
+ }
+
+ if (enable)
+ cpumask_set_cpu(cpu, mask);
+ else
+ cpumask_clear_cpu(cpu, mask);
+
+ printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
+ enable ? "numa_add_cpu" : "numa_remove_cpu",
+ cpu, node, cpumask_pr_args(mask));
+ return;
+}
+
+# ifndef CONFIG_NUMA_EMU
+static void numa_set_cpumask(int cpu, bool enable)
+{
+ debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
+}
+
+void numa_add_cpu(int cpu)
+{
+ numa_set_cpumask(cpu, true);
+}
+
+void numa_remove_cpu(int cpu)
+{
+ numa_set_cpumask(cpu, false);
+}
+# endif /* !CONFIG_NUMA_EMU */
+
+/*
+ * Returns a pointer to the bitmask of CPUs on Node 'node'.
+ */
+const struct cpumask *cpumask_of_node(int node)
+{
+ if (node >= nr_node_ids) {
+ printk(KERN_WARNING
+ "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
+ node, nr_node_ids);
+ dump_stack();
+ return cpu_none_mask;
+ }
+ if (node_to_cpumask_map[node] == NULL) {
+ printk(KERN_WARNING
+ "cpumask_of_node(%d): no node_to_cpumask_map!\n",
+ node);
+ dump_stack();
+ return cpu_online_mask;
+ }
+ return node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(cpumask_of_node);
+
+#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int memory_add_physaddr_to_nid(u64 start)
+{
+ struct numa_meminfo *mi = &numa_meminfo;
+ int nid = mi->blk[0].nid;
+ int i;
+
+ for (i = 0; i < mi->nr_blks; i++)
+ if (mi->blk[i].start <= start && mi->blk[i].end > start)
+ nid = mi->blk[i].nid;
+ return nid;
+}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
new file mode 100644
index 000000000..e8a4a09e2
--- /dev/null
+++ b/arch/x86/mm/numa_32.c
@@ -0,0 +1,94 @@
+/*
+ * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation
+ * August 2002: added remote node KVA remap - Martin J. Bligh
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/init.h>
+
+#include "numa_internal.h"
+
+#ifdef CONFIG_DISCONTIGMEM
+/*
+ * 4) physnode_map - the mapping between a pfn and owning node
+ * physnode_map keeps track of the physical memory layout of a generic
+ * numa node on a 64Mb break (each element of the array will
+ * represent 64Mb of memory and will be marked by the node id. so,
+ * if the first gig is on node 0, and the second gig is on node 1
+ * physnode_map will contain:
+ *
+ * physnode_map[0-15] = 0;
+ * physnode_map[16-31] = 1;
+ * physnode_map[32- ] = -1;
+ */
+s8 physnode_map[MAX_SECTIONS] __read_mostly = { [0 ... (MAX_SECTIONS - 1)] = -1};
+EXPORT_SYMBOL(physnode_map);
+
+void memory_present(int nid, unsigned long start, unsigned long end)
+{
+ unsigned long pfn;
+
+ printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
+ nid, start, end);
+ printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
+ printk(KERN_DEBUG " ");
+ start = round_down(start, PAGES_PER_SECTION);
+ end = round_up(end, PAGES_PER_SECTION);
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
+ physnode_map[pfn / PAGES_PER_SECTION] = nid;
+ printk(KERN_CONT "%lx ", pfn);
+ }
+ printk(KERN_CONT "\n");
+}
+#endif
+
+extern unsigned long highend_pfn, highstart_pfn;
+
+void __init initmem_init(void)
+{
+ x86_numa_init();
+
+#ifdef CONFIG_HIGHMEM
+ highstart_pfn = highend_pfn = max_pfn;
+ if (max_pfn > max_low_pfn)
+ highstart_pfn = max_low_pfn;
+ printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+ pages_to_mb(highend_pfn - highstart_pfn));
+ high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+ printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
+ pages_to_mb(max_low_pfn));
+ printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n",
+ max_low_pfn, highstart_pfn);
+
+ printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
+ (ulong) pfn_to_kaddr(max_low_pfn));
+
+ printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
+ (ulong) pfn_to_kaddr(highstart_pfn));
+
+ __vmalloc_start_set = true;
+ setup_bootmem_allocator();
+}
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
new file mode 100644
index 000000000..066f3511d
--- /dev/null
+++ b/arch/x86/mm/numa_64.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generic VM initialization for x86-64 NUMA setups.
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ */
+#include <linux/bootmem.h>
+
+#include "numa_internal.h"
+
+void __init initmem_init(void)
+{
+ x86_numa_init();
+}
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
new file mode 100644
index 000000000..4686757a7
--- /dev/null
+++ b/arch/x86/mm/numa_emulation.c
@@ -0,0 +1,587 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NUMA emulation
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/topology.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
+#include <asm/dma.h>
+
+#include "numa_internal.h"
+
+static int emu_nid_to_phys[MAX_NUMNODES];
+static char *emu_cmdline __initdata;
+
+void __init numa_emu_cmdline(char *str)
+{
+ emu_cmdline = str;
+}
+
+static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
+{
+ int i;
+
+ for (i = 0; i < mi->nr_blks; i++)
+ if (mi->blk[i].nid == nid)
+ return i;
+ return -ENOENT;
+}
+
+static u64 __init mem_hole_size(u64 start, u64 end)
+{
+ unsigned long start_pfn = PFN_UP(start);
+ unsigned long end_pfn = PFN_DOWN(end);
+
+ if (start_pfn < end_pfn)
+ return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
+ return 0;
+}
+
+/*
+ * Sets up nid to range from @start to @end. The return value is -errno if
+ * something went wrong, 0 otherwise.
+ */
+static int __init emu_setup_memblk(struct numa_meminfo *ei,
+ struct numa_meminfo *pi,
+ int nid, int phys_blk, u64 size)
+{
+ struct numa_memblk *eb = &ei->blk[ei->nr_blks];
+ struct numa_memblk *pb = &pi->blk[phys_blk];
+
+ if (ei->nr_blks >= NR_NODE_MEMBLKS) {
+ pr_err("NUMA: Too many emulated memblks, failing emulation\n");
+ return -EINVAL;
+ }
+
+ ei->nr_blks++;
+ eb->start = pb->start;
+ eb->end = pb->start + size;
+ eb->nid = nid;
+
+ if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
+ emu_nid_to_phys[nid] = pb->nid;
+
+ pb->start += size;
+ if (pb->start >= pb->end) {
+ WARN_ON_ONCE(pb->start > pb->end);
+ numa_remove_memblk_from(phys_blk, pi);
+ }
+
+ printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
+ nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
+ return 0;
+}
+
+/*
+ * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
+ * to max_addr.
+ *
+ * Returns zero on success or negative on error.
+ */
+static int __init split_nodes_interleave(struct numa_meminfo *ei,
+ struct numa_meminfo *pi,
+ u64 addr, u64 max_addr, int nr_nodes)
+{
+ nodemask_t physnode_mask = numa_nodes_parsed;
+ u64 size;
+ int big;
+ int nid = 0;
+ int i, ret;
+
+ if (nr_nodes <= 0)
+ return -1;
+ if (nr_nodes > MAX_NUMNODES) {
+ pr_info("numa=fake=%d too large, reducing to %d\n",
+ nr_nodes, MAX_NUMNODES);
+ nr_nodes = MAX_NUMNODES;
+ }
+
+ /*
+ * Calculate target node size. x86_32 freaks on __udivdi3() so do
+ * the division in ulong number of pages and convert back.
+ */
+ size = max_addr - addr - mem_hole_size(addr, max_addr);
+ size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
+
+ /*
+ * Calculate the number of big nodes that can be allocated as a result
+ * of consolidating the remainder.
+ */
+ big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
+ FAKE_NODE_MIN_SIZE;
+
+ size &= FAKE_NODE_MIN_HASH_MASK;
+ if (!size) {
+ pr_err("Not enough memory for each node. "
+ "NUMA emulation disabled.\n");
+ return -1;
+ }
+
+ /*
+ * Continue to fill physical nodes with fake nodes until there is no
+ * memory left on any of them.
+ */
+ while (nodes_weight(physnode_mask)) {
+ for_each_node_mask(i, physnode_mask) {
+ u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+ u64 start, limit, end;
+ int phys_blk;
+
+ phys_blk = emu_find_memblk_by_nid(i, pi);
+ if (phys_blk < 0) {
+ node_clear(i, physnode_mask);
+ continue;
+ }
+ start = pi->blk[phys_blk].start;
+ limit = pi->blk[phys_blk].end;
+ end = start + size;
+
+ if (nid < big)
+ end += FAKE_NODE_MIN_SIZE;
+
+ /*
+ * Continue to add memory to this fake node if its
+ * non-reserved memory is less than the per-node size.
+ */
+ while (end - start - mem_hole_size(start, end) < size) {
+ end += FAKE_NODE_MIN_SIZE;
+ if (end > limit) {
+ end = limit;
+ break;
+ }
+ }
+
+ /*
+ * If there won't be at least FAKE_NODE_MIN_SIZE of
+ * non-reserved memory in ZONE_DMA32 for the next node,
+ * this one must extend to the boundary.
+ */
+ if (end < dma32_end && dma32_end - end -
+ mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+ end = dma32_end;
+
+ /*
+ * If there won't be enough non-reserved memory for the
+ * next node, this one must extend to the end of the
+ * physical node.
+ */
+ if (limit - end - mem_hole_size(end, limit) < size)
+ end = limit;
+
+ ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
+ phys_blk,
+ min(end, limit) - start);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Returns the end address of a node so that there is at least `size' amount of
+ * non-reserved memory or `max_addr' is reached.
+ */
+static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
+{
+ u64 end = start + size;
+
+ while (end - start - mem_hole_size(start, end) < size) {
+ end += FAKE_NODE_MIN_SIZE;
+ if (end > max_addr) {
+ end = max_addr;
+ break;
+ }
+ }
+ return end;
+}
+
+static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes)
+{
+ unsigned long max_pfn = PHYS_PFN(max_addr);
+ unsigned long base_pfn = PHYS_PFN(base);
+ unsigned long hole_pfns = PHYS_PFN(hole);
+
+ return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes);
+}
+
+/*
+ * Sets up fake nodes of `size' interleaved over physical nodes ranging from
+ * `addr' to `max_addr'.
+ *
+ * Returns zero on success or negative on error.
+ */
+static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
+ struct numa_meminfo *pi,
+ u64 addr, u64 max_addr, u64 size,
+ int nr_nodes, struct numa_memblk *pblk,
+ int nid)
+{
+ nodemask_t physnode_mask = numa_nodes_parsed;
+ int i, ret, uniform = 0;
+ u64 min_size;
+
+ if ((!size && !nr_nodes) || (nr_nodes && !pblk))
+ return -1;
+
+ /*
+ * In the 'uniform' case split the passed in physical node by
+ * nr_nodes, in the non-uniform case, ignore the passed in
+ * physical block and try to create nodes of at least size
+ * @size.
+ *
+ * In the uniform case, split the nodes strictly by physical
+ * capacity, i.e. ignore holes. In the non-uniform case account
+ * for holes and treat @size as a minimum floor.
+ */
+ if (!nr_nodes)
+ nr_nodes = MAX_NUMNODES;
+ else {
+ nodes_clear(physnode_mask);
+ node_set(pblk->nid, physnode_mask);
+ uniform = 1;
+ }
+
+ if (uniform) {
+ min_size = uniform_size(max_addr, addr, 0, nr_nodes);
+ size = min_size;
+ } else {
+ /*
+ * The limit on emulated nodes is MAX_NUMNODES, so the
+ * size per node is increased accordingly if the
+ * requested size is too small. This creates a uniform
+ * distribution of node sizes across the entire machine
+ * (but not necessarily over physical nodes).
+ */
+ min_size = uniform_size(max_addr, addr,
+ mem_hole_size(addr, max_addr), nr_nodes);
+ }
+ min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
+ if (size < min_size) {
+ pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
+ size >> 20, min_size >> 20);
+ size = min_size;
+ }
+ size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);
+
+ /*
+ * Fill physical nodes with fake nodes of size until there is no memory
+ * left on any of them.
+ */
+ while (nodes_weight(physnode_mask)) {
+ for_each_node_mask(i, physnode_mask) {
+ u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+ u64 start, limit, end;
+ int phys_blk;
+
+ phys_blk = emu_find_memblk_by_nid(i, pi);
+ if (phys_blk < 0) {
+ node_clear(i, physnode_mask);
+ continue;
+ }
+
+ start = pi->blk[phys_blk].start;
+ limit = pi->blk[phys_blk].end;
+
+ if (uniform)
+ end = start + size;
+ else
+ end = find_end_of_node(start, limit, size);
+ /*
+ * If there won't be at least FAKE_NODE_MIN_SIZE of
+ * non-reserved memory in ZONE_DMA32 for the next node,
+ * this one must extend to the boundary.
+ */
+ if (end < dma32_end && dma32_end - end -
+ mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+ end = dma32_end;
+
+ /*
+ * If there won't be enough non-reserved memory for the
+ * next node, this one must extend to the end of the
+ * physical node.
+ */
+ if ((limit - end - mem_hole_size(end, limit) < size)
+ && !uniform)
+ end = limit;
+
+ ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
+ phys_blk,
+ min(end, limit) - start);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return nid;
+}
+
+static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
+ struct numa_meminfo *pi,
+ u64 addr, u64 max_addr, u64 size)
+{
+ return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
+ 0, NULL, 0);
+}
+
+int __init setup_emu2phys_nid(int *dfl_phys_nid)
+{
+ int i, max_emu_nid = 0;
+
+ *dfl_phys_nid = NUMA_NO_NODE;
+ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
+ if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
+ max_emu_nid = i;
+ if (*dfl_phys_nid == NUMA_NO_NODE)
+ *dfl_phys_nid = emu_nid_to_phys[i];
+ }
+ }
+
+ return max_emu_nid;
+}
+
+/**
+ * numa_emulation - Emulate NUMA nodes
+ * @numa_meminfo: NUMA configuration to massage
+ * @numa_dist_cnt: The size of the physical NUMA distance table
+ *
+ * Emulate NUMA nodes according to the numa=fake kernel parameter.
+ * @numa_meminfo contains the physical memory configuration and is modified
+ * to reflect the emulated configuration on success. @numa_dist_cnt is
+ * used to determine the size of the physical distance table.
+ *
+ * On success, the following modifications are made.
+ *
+ * - @numa_meminfo is updated to reflect the emulated nodes.
+ *
+ * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
+ * emulated nodes.
+ *
+ * - NUMA distance table is rebuilt to represent distances between emulated
+ * nodes. The distances are determined considering how emulated nodes
+ * are mapped to physical nodes and match the actual distances.
+ *
+ * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
+ * nodes. This is used by numa_add_cpu() and numa_remove_cpu().
+ *
+ * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
+ * identity mapping and no other modification is made.
+ */
+void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
+{
+ static struct numa_meminfo ei __initdata;
+ static struct numa_meminfo pi __initdata;
+ const u64 max_addr = PFN_PHYS(max_pfn);
+ u8 *phys_dist = NULL;
+ size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
+ int max_emu_nid, dfl_phys_nid;
+ int i, j, ret;
+
+ if (!emu_cmdline)
+ goto no_emu;
+
+ memset(&ei, 0, sizeof(ei));
+ pi = *numa_meminfo;
+
+ for (i = 0; i < MAX_NUMNODES; i++)
+ emu_nid_to_phys[i] = NUMA_NO_NODE;
+
+ /*
+ * If the numa=fake command-line contains a 'M' or 'G', it represents
+ * the fixed node size. Otherwise, if it is just a single number N,
+ * split the system RAM into N fake nodes.
+ */
+ if (strchr(emu_cmdline, 'U')) {
+ nodemask_t physnode_mask = numa_nodes_parsed;
+ unsigned long n;
+ int nid = 0;
+
+ n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
+ ret = -1;
+ for_each_node_mask(i, physnode_mask) {
+ /*
+ * The reason we pass in blk[0] is due to
+ * numa_remove_memblk_from() called by
+ * emu_setup_memblk() will delete entry 0
+ * and then move everything else up in the pi.blk
+ * array. Therefore we should always be looking
+ * at blk[0].
+ */
+ ret = split_nodes_size_interleave_uniform(&ei, &pi,
+ pi.blk[0].start, pi.blk[0].end, 0,
+ n, &pi.blk[0], nid);
+ if (ret < 0)
+ break;
+ if (ret < n) {
+ pr_info("%s: phys: %d only got %d of %ld nodes, failing\n",
+ __func__, i, ret, n);
+ ret = -1;
+ break;
+ }
+ nid = ret;
+ }
+ } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
+ u64 size;
+
+ size = memparse(emu_cmdline, &emu_cmdline);
+ ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
+ } else {
+ unsigned long n;
+
+ n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
+ ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
+ }
+ if (*emu_cmdline == ':')
+ emu_cmdline++;
+
+ if (ret < 0)
+ goto no_emu;
+
+ if (numa_cleanup_meminfo(&ei) < 0) {
+ pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
+ goto no_emu;
+ }
+
+ /* copy the physical distance table */
+ if (numa_dist_cnt) {
+ u64 phys;
+
+ phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
+ phys_size, PAGE_SIZE);
+ if (!phys) {
+ pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
+ goto no_emu;
+ }
+ memblock_reserve(phys, phys_size);
+ phys_dist = __va(phys);
+
+ for (i = 0; i < numa_dist_cnt; i++)
+ for (j = 0; j < numa_dist_cnt; j++)
+ phys_dist[i * numa_dist_cnt + j] =
+ node_distance(i, j);
+ }
+
+ /*
+ * Determine the max emulated nid and the default phys nid to use
+ * for unmapped nodes.
+ */
+ max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);
+
+ /* commit */
+ *numa_meminfo = ei;
+
+ /* Make sure numa_nodes_parsed only contains emulated nodes */
+ nodes_clear(numa_nodes_parsed);
+ for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
+ if (ei.blk[i].start != ei.blk[i].end &&
+ ei.blk[i].nid != NUMA_NO_NODE)
+ node_set(ei.blk[i].nid, numa_nodes_parsed);
+
+ /*
+ * Transform __apicid_to_node table to use emulated nids by
+ * reverse-mapping phys_nid. The maps should always exist but fall
+ * back to zero just in case.
+ */
+ for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
+ if (__apicid_to_node[i] == NUMA_NO_NODE)
+ continue;
+ for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
+ if (__apicid_to_node[i] == emu_nid_to_phys[j])
+ break;
+ __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
+ }
+
+ /* make sure all emulated nodes are mapped to a physical node */
+ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+ if (emu_nid_to_phys[i] == NUMA_NO_NODE)
+ emu_nid_to_phys[i] = dfl_phys_nid;
+
+ /* transform distance table */
+ numa_reset_distance();
+ for (i = 0; i < max_emu_nid + 1; i++) {
+ for (j = 0; j < max_emu_nid + 1; j++) {
+ int physi = emu_nid_to_phys[i];
+ int physj = emu_nid_to_phys[j];
+ int dist;
+
+ if (get_option(&emu_cmdline, &dist) == 2)
+ ;
+ else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
+ dist = physi == physj ?
+ LOCAL_DISTANCE : REMOTE_DISTANCE;
+ else
+ dist = phys_dist[physi * numa_dist_cnt + physj];
+
+ numa_set_distance(i, j, dist);
+ }
+ }
+
+ /* free the copied physical distance table */
+ if (phys_dist)
+ memblock_free(__pa(phys_dist), phys_size);
+ return;
+
+no_emu:
+ /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
+ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+ emu_nid_to_phys[i] = i;
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+void numa_add_cpu(int cpu)
+{
+ int physnid, nid;
+
+ nid = early_cpu_to_node(cpu);
+ BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
+
+ physnid = emu_nid_to_phys[nid];
+
+ /*
+ * Map the cpu to each emulated node that is allocated on the physical
+ * node of the cpu's apic id.
+ */
+ for_each_online_node(nid)
+ if (emu_nid_to_phys[nid] == physnid)
+ cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
+}
+
+void numa_remove_cpu(int cpu)
+{
+ int i;
+
+ for_each_online_node(i)
+ cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
+}
+#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
+static void numa_set_cpumask(int cpu, bool enable)
+{
+ int nid, physnid;
+
+ nid = early_cpu_to_node(cpu);
+ if (nid == NUMA_NO_NODE) {
+ /* early_cpu_to_node() already emits a warning and trace */
+ return;
+ }
+
+ physnid = emu_nid_to_phys[nid];
+
+ for_each_online_node(nid) {
+ if (emu_nid_to_phys[nid] != physnid)
+ continue;
+
+ debug_cpumask_set_cpu(cpu, nid, enable);
+ }
+}
+
+void numa_add_cpu(int cpu)
+{
+ numa_set_cpumask(cpu, true);
+}
+
+void numa_remove_cpu(int cpu)
+{
+ numa_set_cpumask(cpu, false);
+}
+#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
new file mode 100644
index 000000000..86860f279
--- /dev/null
+++ b/arch/x86/mm/numa_internal.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_MM_NUMA_INTERNAL_H
+#define __X86_MM_NUMA_INTERNAL_H
+
+#include <linux/types.h>
+#include <asm/numa.h>
+
+struct numa_memblk {
+ u64 start;
+ u64 end;
+ int nid;
+};
+
+struct numa_meminfo {
+ int nr_blks;
+ struct numa_memblk blk[NR_NODE_MEMBLKS];
+};
+
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
+void __init numa_reset_distance(void);
+
+void __init x86_numa_init(void);
+
+#ifdef CONFIG_NUMA_EMU
+void __init numa_emulation(struct numa_meminfo *numa_meminfo,
+ int numa_dist_cnt);
+#else
+static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
+ int numa_dist_cnt)
+{ }
+#endif
+
+#endif /* __X86_MM_NUMA_INTERNAL_H */
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
new file mode 100644
index 000000000..a25588ad7
--- /dev/null
+++ b/arch/x86/mm/pageattr-test.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * self test for change_page_attr.
+ *
+ * Clears the a test pte bit on random pages in the direct mapping,
+ * then reverts and compares page tables forwards and afterwards.
+ */
+#include <linux/bootmem.h>
+#include <linux/kthread.h>
+#include <linux/random.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+
+#include <asm/cacheflush.h>
+#include <asm/pgtable.h>
+#include <asm/kdebug.h>
+
+/*
+ * Only print the results of the first pass:
+ */
+static __read_mostly int print = 1;
+
+enum {
+ NTEST = 400,
+#ifdef CONFIG_X86_64
+ LPS = (1 << PMD_SHIFT),
+#elif defined(CONFIG_X86_PAE)
+ LPS = (1 << PMD_SHIFT),
+#else
+ LPS = (1 << 22),
+#endif
+ GPS = (1<<30)
+};
+
+#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST)
+
+static int pte_testbit(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_SOFTW1;
+}
+
+struct split_state {
+ long lpg, gpg, spg, exec;
+ long min_exec, max_exec;
+};
+
+static int print_split(struct split_state *s)
+{
+ long i, expected, missed = 0;
+ int err = 0;
+
+ s->lpg = s->gpg = s->spg = s->exec = 0;
+ s->min_exec = ~0UL;
+ s->max_exec = 0;
+ for (i = 0; i < max_pfn_mapped; ) {
+ unsigned long addr = (unsigned long)__va(i << PAGE_SHIFT);
+ unsigned int level;
+ pte_t *pte;
+
+ pte = lookup_address(addr, &level);
+ if (!pte) {
+ missed++;
+ i++;
+ continue;
+ }
+
+ if (level == PG_LEVEL_1G && sizeof(long) == 8) {
+ s->gpg++;
+ i += GPS/PAGE_SIZE;
+ } else if (level == PG_LEVEL_2M) {
+ if ((pte_val(*pte) & _PAGE_PRESENT) && !(pte_val(*pte) & _PAGE_PSE)) {
+ printk(KERN_ERR
+ "%lx level %d but not PSE %Lx\n",
+ addr, level, (u64)pte_val(*pte));
+ err = 1;
+ }
+ s->lpg++;
+ i += LPS/PAGE_SIZE;
+ } else {
+ s->spg++;
+ i++;
+ }
+ if (!(pte_val(*pte) & _PAGE_NX)) {
+ s->exec++;
+ if (addr < s->min_exec)
+ s->min_exec = addr;
+ if (addr > s->max_exec)
+ s->max_exec = addr;
+ }
+ }
+ if (print) {
+ printk(KERN_INFO
+ " 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n",
+ s->spg, s->lpg, s->gpg, s->exec,
+ s->min_exec != ~0UL ? s->min_exec : 0,
+ s->max_exec, missed);
+ }
+
+ expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed;
+ if (expected != i) {
+ printk(KERN_ERR "CPA max_pfn_mapped %lu but expected %lu\n",
+ max_pfn_mapped, expected);
+ return 1;
+ }
+ return err;
+}
+
+static unsigned long addr[NTEST];
+static unsigned int len[NTEST];
+
+/* Change the global bit on random pages in the direct mapping */
+static int pageattr_test(void)
+{
+ struct split_state sa, sb, sc;
+ unsigned long *bm;
+ pte_t *pte, pte0;
+ int failed = 0;
+ unsigned int level;
+ int i, k;
+ int err;
+ unsigned long test_addr;
+
+ if (print)
+ printk(KERN_INFO "CPA self-test:\n");
+
+ bm = vzalloc((max_pfn_mapped + 7) / 8);
+ if (!bm) {
+ printk(KERN_ERR "CPA Cannot vmalloc bitmap\n");
+ return -ENOMEM;
+ }
+
+ failed += print_split(&sa);
+
+ for (i = 0; i < NTEST; i++) {
+ unsigned long pfn = prandom_u32() % max_pfn_mapped;
+
+ addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT);
+ len[i] = prandom_u32() % 100;
+ len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1);
+
+ if (len[i] == 0)
+ len[i] = 1;
+
+ pte = NULL;
+ pte0 = pfn_pte(0, __pgprot(0)); /* shut gcc up */
+
+ for (k = 0; k < len[i]; k++) {
+ pte = lookup_address(addr[i] + k*PAGE_SIZE, &level);
+ if (!pte || pgprot_val(pte_pgprot(*pte)) == 0 ||
+ !(pte_val(*pte) & _PAGE_PRESENT)) {
+ addr[i] = 0;
+ break;
+ }
+ if (k == 0) {
+ pte0 = *pte;
+ } else {
+ if (pgprot_val(pte_pgprot(*pte)) !=
+ pgprot_val(pte_pgprot(pte0))) {
+ len[i] = k;
+ break;
+ }
+ }
+ if (test_bit(pfn + k, bm)) {
+ len[i] = k;
+ break;
+ }
+ __set_bit(pfn + k, bm);
+ }
+ if (!addr[i] || !pte || !k) {
+ addr[i] = 0;
+ continue;
+ }
+
+ test_addr = addr[i];
+ err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0);
+ if (err < 0) {
+ printk(KERN_ERR "CPA %d failed %d\n", i, err);
+ failed++;
+ }
+
+ pte = lookup_address(addr[i], &level);
+ if (!pte || !pte_testbit(*pte) || pte_huge(*pte)) {
+ printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i],
+ pte ? (u64)pte_val(*pte) : 0ULL);
+ failed++;
+ }
+ if (level != PG_LEVEL_4K) {
+ printk(KERN_ERR "CPA %lx: unexpected level %d\n",
+ addr[i], level);
+ failed++;
+ }
+
+ }
+ vfree(bm);
+
+ failed += print_split(&sb);
+
+ for (i = 0; i < NTEST; i++) {
+ if (!addr[i])
+ continue;
+ pte = lookup_address(addr[i], &level);
+ if (!pte) {
+ printk(KERN_ERR "CPA lookup of %lx failed\n", addr[i]);
+ failed++;
+ continue;
+ }
+ test_addr = addr[i];
+ err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0);
+ if (err < 0) {
+ printk(KERN_ERR "CPA reverting failed: %d\n", err);
+ failed++;
+ }
+ pte = lookup_address(addr[i], &level);
+ if (!pte || pte_testbit(*pte)) {
+ printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n",
+ addr[i], pte ? (u64)pte_val(*pte) : 0ULL);
+ failed++;
+ }
+
+ }
+
+ failed += print_split(&sc);
+
+ if (failed) {
+ WARN(1, KERN_ERR "NOT PASSED. Please report.\n");
+ return -EINVAL;
+ } else {
+ if (print)
+ printk(KERN_INFO "ok.\n");
+ }
+
+ return 0;
+}
+
+static int do_pageattr_test(void *__unused)
+{
+ while (!kthread_should_stop()) {
+ schedule_timeout_interruptible(HZ*30);
+ if (pageattr_test() < 0)
+ break;
+ if (print)
+ print--;
+ }
+ return 0;
+}
+
+static int start_pageattr_test(void)
+{
+ struct task_struct *p;
+
+ p = kthread_create(do_pageattr_test, NULL, "pageattr-test");
+ if (!IS_ERR(p))
+ wake_up_process(p);
+ else
+ WARN_ON(1);
+
+ return 0;
+}
+device_initcall(start_pageattr_test);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
new file mode 100644
index 000000000..101f3ad0d
--- /dev/null
+++ b/arch/x86/mm/pageattr.c
@@ -0,0 +1,2154 @@
+/*
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * Thanks to Ben LaHaise for precious feedback.
+ */
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/pfn.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include <asm/e820/api.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <linux/uaccess.h>
+#include <asm/pgalloc.h>
+#include <asm/proto.h>
+#include <asm/pat.h>
+#include <asm/set_memory.h>
+
+/*
+ * The current flushing context - we pass it instead of 5 arguments:
+ */
+struct cpa_data {
+ unsigned long *vaddr;
+ pgd_t *pgd;
+ pgprot_t mask_set;
+ pgprot_t mask_clr;
+ unsigned long numpages;
+ int flags;
+ unsigned long pfn;
+ unsigned force_split : 1;
+ int curpage;
+ struct page **pages;
+};
+
+/*
+ * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
+ * using cpa_lock. So that we don't allow any other cpu, with stale large tlb
+ * entries change the page attribute in parallel to some other cpu
+ * splitting a large page entry along with changing the attribute.
+ */
+static DEFINE_SPINLOCK(cpa_lock);
+
+#define CPA_FLUSHTLB 1
+#define CPA_ARRAY 2
+#define CPA_PAGES_ARRAY 4
+#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */
+
+#ifdef CONFIG_PROC_FS
+static unsigned long direct_pages_count[PG_LEVEL_NUM];
+
+void update_page_count(int level, unsigned long pages)
+{
+ /* Protect against CPA */
+ spin_lock(&pgd_lock);
+ direct_pages_count[level] += pages;
+ spin_unlock(&pgd_lock);
+}
+
+static void split_page_count(int level)
+{
+ if (direct_pages_count[level] == 0)
+ return;
+
+ direct_pages_count[level]--;
+ direct_pages_count[level - 1] += PTRS_PER_PTE;
+}
+
+void arch_report_meminfo(struct seq_file *m)
+{
+ seq_printf(m, "DirectMap4k: %8lu kB\n",
+ direct_pages_count[PG_LEVEL_4K] << 2);
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+ seq_printf(m, "DirectMap2M: %8lu kB\n",
+ direct_pages_count[PG_LEVEL_2M] << 11);
+#else
+ seq_printf(m, "DirectMap4M: %8lu kB\n",
+ direct_pages_count[PG_LEVEL_2M] << 12);
+#endif
+ if (direct_gbpages)
+ seq_printf(m, "DirectMap1G: %8lu kB\n",
+ direct_pages_count[PG_LEVEL_1G] << 20);
+}
+#else
+static inline void split_page_count(int level) { }
+#endif
+
+static inline int
+within(unsigned long addr, unsigned long start, unsigned long end)
+{
+ return addr >= start && addr < end;
+}
+
+static inline int
+within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
+{
+ return addr >= start && addr <= end;
+}
+
+#ifdef CONFIG_X86_64
+
+static inline unsigned long highmap_start_pfn(void)
+{
+ return __pa_symbol(_text) >> PAGE_SHIFT;
+}
+
+static inline unsigned long highmap_end_pfn(void)
+{
+ /* Do not reference physical address outside the kernel. */
+ return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT;
+}
+
+static bool __cpa_pfn_in_highmap(unsigned long pfn)
+{
+ /*
+ * Kernel text has an alias mapping at a high address, known
+ * here as "highmap".
+ */
+ return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn());
+}
+
+#else
+
+static bool __cpa_pfn_in_highmap(unsigned long pfn)
+{
+ /* There is no highmap on 32-bit */
+ return false;
+}
+
+#endif
+
+/*
+ * Flushing functions
+ */
+
+/**
+ * clflush_cache_range - flush a cache range with clflush
+ * @vaddr: virtual start address
+ * @size: number of bytes to flush
+ *
+ * clflushopt is an unordered instruction which needs fencing with mfence or
+ * sfence to avoid ordering issues.
+ */
+void clflush_cache_range(void *vaddr, unsigned int size)
+{
+ const unsigned long clflush_size = boot_cpu_data.x86_clflush_size;
+ void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1));
+ void *vend = vaddr + size;
+
+ if (p >= vend)
+ return;
+
+ mb();
+
+ for (; p < vend; p += clflush_size)
+ clflushopt(p);
+
+ mb();
+}
+EXPORT_SYMBOL_GPL(clflush_cache_range);
+
+void arch_invalidate_pmem(void *addr, size_t size)
+{
+ clflush_cache_range(addr, size);
+}
+EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
+
+static void __cpa_flush_all(void *arg)
+{
+ unsigned long cache = (unsigned long)arg;
+
+ /*
+ * Flush all to work around Errata in early athlons regarding
+ * large page flushing.
+ */
+ __flush_tlb_all();
+
+ if (cache && boot_cpu_data.x86 >= 4)
+ wbinvd();
+}
+
+static void cpa_flush_all(unsigned long cache)
+{
+ BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
+
+ on_each_cpu(__cpa_flush_all, (void *) cache, 1);
+}
+
+static void __cpa_flush_range(void *arg)
+{
+ /*
+ * We could optimize that further and do individual per page
+ * tlb invalidates for a low number of pages. Caveat: we must
+ * flush the high aliases on 64bit as well.
+ */
+ __flush_tlb_all();
+}
+
+static void cpa_flush_range(unsigned long start, int numpages, int cache)
+{
+ unsigned int i, level;
+ unsigned long addr;
+
+ BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
+ WARN_ON(PAGE_ALIGN(start) != start);
+
+ on_each_cpu(__cpa_flush_range, NULL, 1);
+
+ if (!cache)
+ return;
+
+ /*
+ * We only need to flush on one CPU,
+ * clflush is a MESI-coherent instruction that
+ * will cause all other CPUs to flush the same
+ * cachelines:
+ */
+ for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
+ pte_t *pte = lookup_address(addr, &level);
+
+ /*
+ * Only flush present addresses:
+ */
+ if (pte && (pte_val(*pte) & _PAGE_PRESENT))
+ clflush_cache_range((void *) addr, PAGE_SIZE);
+ }
+}
+
+static void cpa_flush_array(unsigned long *start, int numpages, int cache,
+ int in_flags, struct page **pages)
+{
+ unsigned int i, level;
+#ifdef CONFIG_PREEMPT
+ /*
+ * Avoid wbinvd() because it causes latencies on all CPUs,
+ * regardless of any CPU isolation that may be in effect.
+ *
+ * This should be extended for CAT enabled systems independent of
+ * PREEMPT because wbinvd() does not respect the CAT partitions and
+ * this is exposed to unpriviledged users through the graphics
+ * subsystem.
+ */
+ unsigned long do_wbinvd = 0;
+#else
+ unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
+#endif
+
+ BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
+
+ on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
+
+ if (!cache || do_wbinvd)
+ return;
+
+ /*
+ * We only need to flush on one CPU,
+ * clflush is a MESI-coherent instruction that
+ * will cause all other CPUs to flush the same
+ * cachelines:
+ */
+ for (i = 0; i < numpages; i++) {
+ unsigned long addr;
+ pte_t *pte;
+
+ if (in_flags & CPA_PAGES_ARRAY)
+ addr = (unsigned long)page_address(pages[i]);
+ else
+ addr = start[i];
+
+ pte = lookup_address(addr, &level);
+
+ /*
+ * Only flush present addresses:
+ */
+ if (pte && (pte_val(*pte) & _PAGE_PRESENT))
+ clflush_cache_range((void *)addr, PAGE_SIZE);
+ }
+}
+
+/*
+ * Certain areas of memory on x86 require very specific protection flags,
+ * for example the BIOS area or kernel text. Callers don't always get this
+ * right (again, ioremap() on BIOS memory is not uncommon) so this function
+ * checks and fixes these known static required protection bits.
+ */
+static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
+ unsigned long pfn)
+{
+ pgprot_t forbidden = __pgprot(0);
+
+ /*
+ * The BIOS area between 640k and 1Mb needs to be executable for
+ * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
+ */
+#ifdef CONFIG_PCI_BIOS
+ if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
+ pgprot_val(forbidden) |= _PAGE_NX;
+#endif
+
+ /*
+ * The kernel text needs to be executable for obvious reasons
+ * Does not cover __inittext since that is gone later on. On
+ * 64bit we do not enforce !NX on the low mapping
+ */
+ if (within(address, (unsigned long)_text, (unsigned long)_etext))
+ pgprot_val(forbidden) |= _PAGE_NX;
+
+ /*
+ * The .rodata section needs to be read-only. Using the pfn
+ * catches all aliases. This also includes __ro_after_init,
+ * so do not enforce until kernel_set_to_readonly is true.
+ */
+ if (kernel_set_to_readonly &&
+ within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
+ __pa_symbol(__end_rodata) >> PAGE_SHIFT))
+ pgprot_val(forbidden) |= _PAGE_RW;
+
+#if defined(CONFIG_X86_64)
+ /*
+ * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
+ * kernel text mappings for the large page aligned text, rodata sections
+ * will be always read-only. For the kernel identity mappings covering
+ * the holes caused by this alignment can be anything that user asks.
+ *
+ * This will preserve the large page mappings for kernel text/data
+ * at no extra cost.
+ */
+ if (kernel_set_to_readonly &&
+ within(address, (unsigned long)_text,
+ (unsigned long)__end_rodata_hpage_align)) {
+ unsigned int level;
+
+ /*
+ * Don't enforce the !RW mapping for the kernel text mapping,
+ * if the current mapping is already using small page mapping.
+ * No need to work hard to preserve large page mappings in this
+ * case.
+ *
+ * This also fixes the Linux Xen paravirt guest boot failure
+ * (because of unexpected read-only mappings for kernel identity
+ * mappings). In this paravirt guest case, the kernel text
+ * mapping and the kernel identity mapping share the same
+ * page-table pages. Thus we can't really use different
+ * protections for the kernel text and identity mappings. Also,
+ * these shared mappings are made of small page mappings.
+ * Thus this don't enforce !RW mapping for small page kernel
+ * text mapping logic will help Linux Xen parvirt guest boot
+ * as well.
+ */
+ if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
+ pgprot_val(forbidden) |= _PAGE_RW;
+ }
+#endif
+
+ prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
+
+ return prot;
+}
+
+/*
+ * Lookup the page table entry for a virtual address in a specific pgd.
+ * Return a pointer to the entry and the level of the mapping.
+ */
+pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
+ unsigned int *level)
+{
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ *level = PG_LEVEL_NONE;
+
+ if (pgd_none(*pgd))
+ return NULL;
+
+ p4d = p4d_offset(pgd, address);
+ if (p4d_none(*p4d))
+ return NULL;
+
+ *level = PG_LEVEL_512G;
+ if (p4d_large(*p4d) || !p4d_present(*p4d))
+ return (pte_t *)p4d;
+
+ pud = pud_offset(p4d, address);
+ if (pud_none(*pud))
+ return NULL;
+
+ *level = PG_LEVEL_1G;
+ if (pud_large(*pud) || !pud_present(*pud))
+ return (pte_t *)pud;
+
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd))
+ return NULL;
+
+ *level = PG_LEVEL_2M;
+ if (pmd_large(*pmd) || !pmd_present(*pmd))
+ return (pte_t *)pmd;
+
+ *level = PG_LEVEL_4K;
+
+ return pte_offset_kernel(pmd, address);
+}
+
+/*
+ * Lookup the page table entry for a virtual address. Return a pointer
+ * to the entry and the level of the mapping.
+ *
+ * Note: We return pud and pmd either when the entry is marked large
+ * or when the present bit is not set. Otherwise we would return a
+ * pointer to a nonexisting mapping.
+ */
+pte_t *lookup_address(unsigned long address, unsigned int *level)
+{
+ return lookup_address_in_pgd(pgd_offset_k(address), address, level);
+}
+EXPORT_SYMBOL_GPL(lookup_address);
+
+static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
+ unsigned int *level)
+{
+ if (cpa->pgd)
+ return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
+ address, level);
+
+ return lookup_address(address, level);
+}
+
+/*
+ * Lookup the PMD entry for a virtual address. Return a pointer to the entry
+ * or NULL if not present.
+ */
+pmd_t *lookup_pmd_address(unsigned long address)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+
+ pgd = pgd_offset_k(address);
+ if (pgd_none(*pgd))
+ return NULL;
+
+ p4d = p4d_offset(pgd, address);
+ if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d))
+ return NULL;
+
+ pud = pud_offset(p4d, address);
+ if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
+ return NULL;
+
+ return pmd_offset(pud, address);
+}
+
+/*
+ * This is necessary because __pa() does not work on some
+ * kinds of memory, like vmalloc() or the alloc_remap()
+ * areas on 32-bit NUMA systems. The percpu areas can
+ * end up in this kind of memory, for instance.
+ *
+ * This could be optimized, but it is only intended to be
+ * used at inititalization time, and keeping it
+ * unoptimized should increase the testing coverage for
+ * the more obscure platforms.
+ */
+phys_addr_t slow_virt_to_phys(void *__virt_addr)
+{
+ unsigned long virt_addr = (unsigned long)__virt_addr;
+ phys_addr_t phys_addr;
+ unsigned long offset;
+ enum pg_level level;
+ pte_t *pte;
+
+ pte = lookup_address(virt_addr, &level);
+ BUG_ON(!pte);
+
+ /*
+ * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t
+ * before being left-shifted PAGE_SHIFT bits -- this trick is to
+ * make 32-PAE kernel work correctly.
+ */
+ switch (level) {
+ case PG_LEVEL_1G:
+ phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
+ offset = virt_addr & ~PUD_PAGE_MASK;
+ break;
+ case PG_LEVEL_2M:
+ phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
+ offset = virt_addr & ~PMD_PAGE_MASK;
+ break;
+ default:
+ phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
+ offset = virt_addr & ~PAGE_MASK;
+ }
+
+ return (phys_addr_t)(phys_addr | offset);
+}
+EXPORT_SYMBOL_GPL(slow_virt_to_phys);
+
+/*
+ * Set the new pmd in all the pgds we know about:
+ */
+static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
+{
+ /* change init_mm */
+ set_pte_atomic(kpte, pte);
+#ifdef CONFIG_X86_32
+ if (!SHARED_KERNEL_PMD) {
+ struct page *page;
+
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
+ p4d = p4d_offset(pgd, address);
+ pud = pud_offset(p4d, address);
+ pmd = pmd_offset(pud, address);
+ set_pte_atomic((pte_t *)pmd, pte);
+ }
+ }
+#endif
+}
+
+static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot)
+{
+ /*
+ * _PAGE_GLOBAL means "global page" for present PTEs.
+ * But, it is also used to indicate _PAGE_PROTNONE
+ * for non-present PTEs.
+ *
+ * This ensures that a _PAGE_GLOBAL PTE going from
+ * present to non-present is not confused as
+ * _PAGE_PROTNONE.
+ */
+ if (!(pgprot_val(prot) & _PAGE_PRESENT))
+ pgprot_val(prot) &= ~_PAGE_GLOBAL;
+
+ return prot;
+}
+
+static int
+try_preserve_large_page(pte_t *kpte, unsigned long address,
+ struct cpa_data *cpa)
+{
+ unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn;
+ pte_t new_pte, old_pte, *tmp;
+ pgprot_t old_prot, new_prot, req_prot;
+ int i, do_split = 1;
+ enum pg_level level;
+
+ if (cpa->force_split)
+ return 1;
+
+ spin_lock(&pgd_lock);
+ /*
+ * Check for races, another CPU might have split this page
+ * up already:
+ */
+ tmp = _lookup_address_cpa(cpa, address, &level);
+ if (tmp != kpte)
+ goto out_unlock;
+
+ switch (level) {
+ case PG_LEVEL_2M:
+ old_prot = pmd_pgprot(*(pmd_t *)kpte);
+ old_pfn = pmd_pfn(*(pmd_t *)kpte);
+ break;
+ case PG_LEVEL_1G:
+ old_prot = pud_pgprot(*(pud_t *)kpte);
+ old_pfn = pud_pfn(*(pud_t *)kpte);
+ break;
+ default:
+ do_split = -EINVAL;
+ goto out_unlock;
+ }
+
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+
+ /*
+ * Calculate the number of pages, which fit into this large
+ * page starting at address:
+ */
+ nextpage_addr = (address + psize) & pmask;
+ numpages = (nextpage_addr - address) >> PAGE_SHIFT;
+ if (numpages < cpa->numpages)
+ cpa->numpages = numpages;
+
+ /*
+ * We are safe now. Check whether the new pgprot is the same:
+ * Convert protection attributes to 4k-format, as cpa->mask* are set
+ * up accordingly.
+ */
+ old_pte = *kpte;
+ /* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */
+ req_prot = pgprot_large_2_4k(old_prot);
+
+ pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
+ pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
+
+ /*
+ * req_prot is in format of 4k pages. It must be converted to large
+ * page format: the caching mode includes the PAT bit located at
+ * different bit positions in the two formats.
+ */
+ req_prot = pgprot_4k_2_large(req_prot);
+ req_prot = pgprot_clear_protnone_bits(req_prot);
+ if (pgprot_val(req_prot) & _PAGE_PRESENT)
+ pgprot_val(req_prot) |= _PAGE_PSE;
+
+ /*
+ * old_pfn points to the large page base pfn. So we need
+ * to add the offset of the virtual address:
+ */
+ pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
+ cpa->pfn = pfn;
+
+ new_prot = static_protections(req_prot, address, pfn);
+
+ /*
+ * We need to check the full range, whether
+ * static_protection() requires a different pgprot for one of
+ * the pages in the range we try to preserve:
+ */
+ addr = address & pmask;
+ pfn = old_pfn;
+ for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
+ pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
+
+ if (pgprot_val(chk_prot) != pgprot_val(new_prot))
+ goto out_unlock;
+ }
+
+ /*
+ * If there are no changes, return. maxpages has been updated
+ * above:
+ */
+ if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
+ do_split = 0;
+ goto out_unlock;
+ }
+
+ /*
+ * We need to change the attributes. Check, whether we can
+ * change the large page in one go. We request a split, when
+ * the address is not aligned and the number of pages is
+ * smaller than the number of pages in the large page. Note
+ * that we limited the number of possible pages already to
+ * the number of pages in the large page.
+ */
+ if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
+ /*
+ * The address is aligned and the number of pages
+ * covers the full page.
+ */
+ new_pte = pfn_pte(old_pfn, new_prot);
+ __set_pmd_pte(kpte, address, new_pte);
+ cpa->flags |= CPA_FLUSHTLB;
+ do_split = 0;
+ }
+
+out_unlock:
+ spin_unlock(&pgd_lock);
+
+ return do_split;
+}
+
+static int
+__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
+ struct page *base)
+{
+ pte_t *pbase = (pte_t *)page_address(base);
+ unsigned long ref_pfn, pfn, pfninc = 1;
+ unsigned int i, level;
+ pte_t *tmp;
+ pgprot_t ref_prot;
+
+ spin_lock(&pgd_lock);
+ /*
+ * Check for races, another CPU might have split this page
+ * up for us already:
+ */
+ tmp = _lookup_address_cpa(cpa, address, &level);
+ if (tmp != kpte) {
+ spin_unlock(&pgd_lock);
+ return 1;
+ }
+
+ paravirt_alloc_pte(&init_mm, page_to_pfn(base));
+
+ switch (level) {
+ case PG_LEVEL_2M:
+ ref_prot = pmd_pgprot(*(pmd_t *)kpte);
+ /*
+ * Clear PSE (aka _PAGE_PAT) and move
+ * PAT bit to correct position.
+ */
+ ref_prot = pgprot_large_2_4k(ref_prot);
+
+ ref_pfn = pmd_pfn(*(pmd_t *)kpte);
+ break;
+
+ case PG_LEVEL_1G:
+ ref_prot = pud_pgprot(*(pud_t *)kpte);
+ ref_pfn = pud_pfn(*(pud_t *)kpte);
+ pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
+
+ /*
+ * Clear the PSE flags if the PRESENT flag is not set
+ * otherwise pmd_present/pmd_huge will return true
+ * even on a non present pmd.
+ */
+ if (!(pgprot_val(ref_prot) & _PAGE_PRESENT))
+ pgprot_val(ref_prot) &= ~_PAGE_PSE;
+ break;
+
+ default:
+ spin_unlock(&pgd_lock);
+ return 1;
+ }
+
+ ref_prot = pgprot_clear_protnone_bits(ref_prot);
+
+ /*
+ * Get the target pfn from the original entry:
+ */
+ pfn = ref_pfn;
+ for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
+ set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
+
+ if (virt_addr_valid(address)) {
+ unsigned long pfn = PFN_DOWN(__pa(address));
+
+ if (pfn_range_is_mapped(pfn, pfn + 1))
+ split_page_count(level);
+ }
+
+ /*
+ * Install the new, split up pagetable.
+ *
+ * We use the standard kernel pagetable protections for the new
+ * pagetable protections, the actual ptes set above control the
+ * primary protection behavior:
+ */
+ __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
+
+ /*
+ * Intel Atom errata AAH41 workaround.
+ *
+ * The real fix should be in hw or in a microcode update, but
+ * we also probabilistically try to reduce the window of having
+ * a large TLB mixed with 4K TLBs while instruction fetches are
+ * going on.
+ */
+ __flush_tlb_all();
+ spin_unlock(&pgd_lock);
+
+ return 0;
+}
+
+static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
+ unsigned long address)
+{
+ struct page *base;
+
+ if (!debug_pagealloc_enabled())
+ spin_unlock(&cpa_lock);
+ base = alloc_pages(GFP_KERNEL, 0);
+ if (!debug_pagealloc_enabled())
+ spin_lock(&cpa_lock);
+ if (!base)
+ return -ENOMEM;
+
+ if (__split_large_page(cpa, kpte, address, base))
+ __free_page(base);
+
+ return 0;
+}
+
+static bool try_to_free_pte_page(pte_t *pte)
+{
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ if (!pte_none(pte[i]))
+ return false;
+
+ free_page((unsigned long)pte);
+ return true;
+}
+
+static bool try_to_free_pmd_page(pmd_t *pmd)
+{
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ if (!pmd_none(pmd[i]))
+ return false;
+
+ free_page((unsigned long)pmd);
+ return true;
+}
+
+static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
+{
+ pte_t *pte = pte_offset_kernel(pmd, start);
+
+ while (start < end) {
+ set_pte(pte, __pte(0));
+
+ start += PAGE_SIZE;
+ pte++;
+ }
+
+ if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
+ pmd_clear(pmd);
+ return true;
+ }
+ return false;
+}
+
+static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
+ unsigned long start, unsigned long end)
+{
+ if (unmap_pte_range(pmd, start, end))
+ if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ pud_clear(pud);
+}
+
+static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+{
+ pmd_t *pmd = pmd_offset(pud, start);
+
+ /*
+ * Not on a 2MB page boundary?
+ */
+ if (start & (PMD_SIZE - 1)) {
+ unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
+ unsigned long pre_end = min_t(unsigned long, end, next_page);
+
+ __unmap_pmd_range(pud, pmd, start, pre_end);
+
+ start = pre_end;
+ pmd++;
+ }
+
+ /*
+ * Try to unmap in 2M chunks.
+ */
+ while (end - start >= PMD_SIZE) {
+ if (pmd_large(*pmd))
+ pmd_clear(pmd);
+ else
+ __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
+
+ start += PMD_SIZE;
+ pmd++;
+ }
+
+ /*
+ * 4K leftovers?
+ */
+ if (start < end)
+ return __unmap_pmd_range(pud, pmd, start, end);
+
+ /*
+ * Try again to free the PMD page if haven't succeeded above.
+ */
+ if (!pud_none(*pud))
+ if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ pud_clear(pud);
+}
+
+static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
+{
+ pud_t *pud = pud_offset(p4d, start);
+
+ /*
+ * Not on a GB page boundary?
+ */
+ if (start & (PUD_SIZE - 1)) {
+ unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
+ unsigned long pre_end = min_t(unsigned long, end, next_page);
+
+ unmap_pmd_range(pud, start, pre_end);
+
+ start = pre_end;
+ pud++;
+ }
+
+ /*
+ * Try to unmap in 1G chunks?
+ */
+ while (end - start >= PUD_SIZE) {
+
+ if (pud_large(*pud))
+ pud_clear(pud);
+ else
+ unmap_pmd_range(pud, start, start + PUD_SIZE);
+
+ start += PUD_SIZE;
+ pud++;
+ }
+
+ /*
+ * 2M leftovers?
+ */
+ if (start < end)
+ unmap_pmd_range(pud, start, end);
+
+ /*
+ * No need to try to free the PUD page because we'll free it in
+ * populate_pgd's error path
+ */
+}
+
+static int alloc_pte_page(pmd_t *pmd)
+{
+ pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
+ if (!pte)
+ return -1;
+
+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
+ return 0;
+}
+
+static int alloc_pmd_page(pud_t *pud)
+{
+ pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+ if (!pmd)
+ return -1;
+
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ return 0;
+}
+
+static void populate_pte(struct cpa_data *cpa,
+ unsigned long start, unsigned long end,
+ unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
+{
+ pte_t *pte;
+
+ pte = pte_offset_kernel(pmd, start);
+
+ pgprot = pgprot_clear_protnone_bits(pgprot);
+
+ while (num_pages-- && start < end) {
+ set_pte(pte, pfn_pte(cpa->pfn, pgprot));
+
+ start += PAGE_SIZE;
+ cpa->pfn++;
+ pte++;
+ }
+}
+
+static long populate_pmd(struct cpa_data *cpa,
+ unsigned long start, unsigned long end,
+ unsigned num_pages, pud_t *pud, pgprot_t pgprot)
+{
+ long cur_pages = 0;
+ pmd_t *pmd;
+ pgprot_t pmd_pgprot;
+
+ /*
+ * Not on a 2M boundary?
+ */
+ if (start & (PMD_SIZE - 1)) {
+ unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
+ unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
+
+ pre_end = min_t(unsigned long, pre_end, next_page);
+ cur_pages = (pre_end - start) >> PAGE_SHIFT;
+ cur_pages = min_t(unsigned int, num_pages, cur_pages);
+
+ /*
+ * Need a PTE page?
+ */
+ pmd = pmd_offset(pud, start);
+ if (pmd_none(*pmd))
+ if (alloc_pte_page(pmd))
+ return -1;
+
+ populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
+
+ start = pre_end;
+ }
+
+ /*
+ * We mapped them all?
+ */
+ if (num_pages == cur_pages)
+ return cur_pages;
+
+ pmd_pgprot = pgprot_4k_2_large(pgprot);
+
+ while (end - start >= PMD_SIZE) {
+
+ /*
+ * We cannot use a 1G page so allocate a PMD page if needed.
+ */
+ if (pud_none(*pud))
+ if (alloc_pmd_page(pud))
+ return -1;
+
+ pmd = pmd_offset(pud, start);
+
+ set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn,
+ canon_pgprot(pmd_pgprot))));
+
+ start += PMD_SIZE;
+ cpa->pfn += PMD_SIZE >> PAGE_SHIFT;
+ cur_pages += PMD_SIZE >> PAGE_SHIFT;
+ }
+
+ /*
+ * Map trailing 4K pages.
+ */
+ if (start < end) {
+ pmd = pmd_offset(pud, start);
+ if (pmd_none(*pmd))
+ if (alloc_pte_page(pmd))
+ return -1;
+
+ populate_pte(cpa, start, end, num_pages - cur_pages,
+ pmd, pgprot);
+ }
+ return num_pages;
+}
+
+static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
+ pgprot_t pgprot)
+{
+ pud_t *pud;
+ unsigned long end;
+ long cur_pages = 0;
+ pgprot_t pud_pgprot;
+
+ end = start + (cpa->numpages << PAGE_SHIFT);
+
+ /*
+ * Not on a Gb page boundary? => map everything up to it with
+ * smaller pages.
+ */
+ if (start & (PUD_SIZE - 1)) {
+ unsigned long pre_end;
+ unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
+
+ pre_end = min_t(unsigned long, end, next_page);
+ cur_pages = (pre_end - start) >> PAGE_SHIFT;
+ cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
+
+ pud = pud_offset(p4d, start);
+
+ /*
+ * Need a PMD page?
+ */
+ if (pud_none(*pud))
+ if (alloc_pmd_page(pud))
+ return -1;
+
+ cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
+ pud, pgprot);
+ if (cur_pages < 0)
+ return cur_pages;
+
+ start = pre_end;
+ }
+
+ /* We mapped them all? */
+ if (cpa->numpages == cur_pages)
+ return cur_pages;
+
+ pud = pud_offset(p4d, start);
+ pud_pgprot = pgprot_4k_2_large(pgprot);
+
+ /*
+ * Map everything starting from the Gb boundary, possibly with 1G pages
+ */
+ while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) {
+ set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn,
+ canon_pgprot(pud_pgprot))));
+
+ start += PUD_SIZE;
+ cpa->pfn += PUD_SIZE >> PAGE_SHIFT;
+ cur_pages += PUD_SIZE >> PAGE_SHIFT;
+ pud++;
+ }
+
+ /* Map trailing leftover */
+ if (start < end) {
+ long tmp;
+
+ pud = pud_offset(p4d, start);
+ if (pud_none(*pud))
+ if (alloc_pmd_page(pud))
+ return -1;
+
+ tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
+ pud, pgprot);
+ if (tmp < 0)
+ return cur_pages;
+
+ cur_pages += tmp;
+ }
+ return cur_pages;
+}
+
+/*
+ * Restrictions for kernel page table do not necessarily apply when mapping in
+ * an alternate PGD.
+ */
+static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
+{
+ pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
+ pud_t *pud = NULL; /* shut up gcc */
+ p4d_t *p4d;
+ pgd_t *pgd_entry;
+ long ret;
+
+ pgd_entry = cpa->pgd + pgd_index(addr);
+
+ if (pgd_none(*pgd_entry)) {
+ p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
+ if (!p4d)
+ return -1;
+
+ set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE));
+ }
+
+ /*
+ * Allocate a PUD page and hand it down for mapping.
+ */
+ p4d = p4d_offset(pgd_entry, addr);
+ if (p4d_none(*p4d)) {
+ pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
+ if (!pud)
+ return -1;
+
+ set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+ }
+
+ pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
+ pgprot_val(pgprot) |= pgprot_val(cpa->mask_set);
+
+ ret = populate_pud(cpa, addr, p4d, pgprot);
+ if (ret < 0) {
+ /*
+ * Leave the PUD page in place in case some other CPU or thread
+ * already found it, but remove any useless entries we just
+ * added to it.
+ */
+ unmap_pud_range(p4d, addr,
+ addr + (cpa->numpages << PAGE_SHIFT));
+ return ret;
+ }
+
+ cpa->numpages = ret;
+ return 0;
+}
+
+static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
+ int primary)
+{
+ if (cpa->pgd) {
+ /*
+ * Right now, we only execute this code path when mapping
+ * the EFI virtual memory map regions, no other users
+ * provide a ->pgd value. This may change in the future.
+ */
+ return populate_pgd(cpa, vaddr);
+ }
+
+ /*
+ * Ignore all non primary paths.
+ */
+ if (!primary) {
+ cpa->numpages = 1;
+ return 0;
+ }
+
+ /*
+ * Ignore the NULL PTE for kernel identity mapping, as it is expected
+ * to have holes.
+ * Also set numpages to '1' indicating that we processed cpa req for
+ * one virtual address page and its pfn. TBD: numpages can be set based
+ * on the initial value and the level returned by lookup_address().
+ */
+ if (within(vaddr, PAGE_OFFSET,
+ PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
+ cpa->numpages = 1;
+ cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
+ return 0;
+
+ } else if (__cpa_pfn_in_highmap(cpa->pfn)) {
+ /* Faults in the highmap are OK, so do not warn: */
+ return -EFAULT;
+ } else {
+ WARN(1, KERN_WARNING "CPA: called for zero pte. "
+ "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
+ *cpa->vaddr);
+
+ return -EFAULT;
+ }
+}
+
+static int __change_page_attr(struct cpa_data *cpa, int primary)
+{
+ unsigned long address;
+ int do_split, err;
+ unsigned int level;
+ pte_t *kpte, old_pte;
+
+ if (cpa->flags & CPA_PAGES_ARRAY) {
+ struct page *page = cpa->pages[cpa->curpage];
+ if (unlikely(PageHighMem(page)))
+ return 0;
+ address = (unsigned long)page_address(page);
+ } else if (cpa->flags & CPA_ARRAY)
+ address = cpa->vaddr[cpa->curpage];
+ else
+ address = *cpa->vaddr;
+repeat:
+ kpte = _lookup_address_cpa(cpa, address, &level);
+ if (!kpte)
+ return __cpa_process_fault(cpa, address, primary);
+
+ old_pte = *kpte;
+ if (pte_none(old_pte))
+ return __cpa_process_fault(cpa, address, primary);
+
+ if (level == PG_LEVEL_4K) {
+ pte_t new_pte;
+ pgprot_t new_prot = pte_pgprot(old_pte);
+ unsigned long pfn = pte_pfn(old_pte);
+
+ pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
+ pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
+
+ new_prot = static_protections(new_prot, address, pfn);
+
+ new_prot = pgprot_clear_protnone_bits(new_prot);
+
+ /*
+ * We need to keep the pfn from the existing PTE,
+ * after all we're only going to change it's attributes
+ * not the memory it points to
+ */
+ new_pte = pfn_pte(pfn, new_prot);
+ cpa->pfn = pfn;
+ /*
+ * Do we really change anything ?
+ */
+ if (pte_val(old_pte) != pte_val(new_pte)) {
+ set_pte_atomic(kpte, new_pte);
+ cpa->flags |= CPA_FLUSHTLB;
+ }
+ cpa->numpages = 1;
+ return 0;
+ }
+
+ /*
+ * Check, whether we can keep the large page intact
+ * and just change the pte:
+ */
+ do_split = try_preserve_large_page(kpte, address, cpa);
+ /*
+ * When the range fits into the existing large page,
+ * return. cp->numpages and cpa->tlbflush have been updated in
+ * try_large_page:
+ */
+ if (do_split <= 0)
+ return do_split;
+
+ /*
+ * We have to split the large page:
+ */
+ err = split_large_page(cpa, kpte, address);
+ if (!err) {
+ /*
+ * Do a global flush tlb after splitting the large page
+ * and before we do the actual change page attribute in the PTE.
+ *
+ * With out this, we violate the TLB application note, that says
+ * "The TLBs may contain both ordinary and large-page
+ * translations for a 4-KByte range of linear addresses. This
+ * may occur if software modifies the paging structures so that
+ * the page size used for the address range changes. If the two
+ * translations differ with respect to page frame or attributes
+ * (e.g., permissions), processor behavior is undefined and may
+ * be implementation-specific."
+ *
+ * We do this global tlb flush inside the cpa_lock, so that we
+ * don't allow any other cpu, with stale tlb entries change the
+ * page attribute in parallel, that also falls into the
+ * just split large page entry.
+ */
+ flush_tlb_all();
+ goto repeat;
+ }
+
+ return err;
+}
+
+static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
+
+static int cpa_process_alias(struct cpa_data *cpa)
+{
+ struct cpa_data alias_cpa;
+ unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
+ unsigned long vaddr;
+ int ret;
+
+ if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
+ return 0;
+
+ /*
+ * No need to redo, when the primary call touched the direct
+ * mapping already:
+ */
+ if (cpa->flags & CPA_PAGES_ARRAY) {
+ struct page *page = cpa->pages[cpa->curpage];
+ if (unlikely(PageHighMem(page)))
+ return 0;
+ vaddr = (unsigned long)page_address(page);
+ } else if (cpa->flags & CPA_ARRAY)
+ vaddr = cpa->vaddr[cpa->curpage];
+ else
+ vaddr = *cpa->vaddr;
+
+ if (!(within(vaddr, PAGE_OFFSET,
+ PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
+
+ alias_cpa = *cpa;
+ alias_cpa.vaddr = &laddr;
+ alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
+
+ ret = __change_page_attr_set_clr(&alias_cpa, 0);
+ if (ret)
+ return ret;
+ }
+
+#ifdef CONFIG_X86_64
+ /*
+ * If the primary call didn't touch the high mapping already
+ * and the physical address is inside the kernel map, we need
+ * to touch the high mapped kernel as well:
+ */
+ if (!within(vaddr, (unsigned long)_text, _brk_end) &&
+ __cpa_pfn_in_highmap(cpa->pfn)) {
+ unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
+ __START_KERNEL_map - phys_base;
+ alias_cpa = *cpa;
+ alias_cpa.vaddr = &temp_cpa_vaddr;
+ alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
+
+ /*
+ * The high mapping range is imprecise, so ignore the
+ * return value.
+ */
+ __change_page_attr_set_clr(&alias_cpa, 0);
+ }
+#endif
+
+ return 0;
+}
+
+static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
+{
+ unsigned long numpages = cpa->numpages;
+ int ret;
+
+ while (numpages) {
+ /*
+ * Store the remaining nr of pages for the large page
+ * preservation check.
+ */
+ cpa->numpages = numpages;
+ /* for array changes, we can't use large page */
+ if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
+ cpa->numpages = 1;
+
+ if (!debug_pagealloc_enabled())
+ spin_lock(&cpa_lock);
+ ret = __change_page_attr(cpa, checkalias);
+ if (!debug_pagealloc_enabled())
+ spin_unlock(&cpa_lock);
+ if (ret)
+ return ret;
+
+ if (checkalias) {
+ ret = cpa_process_alias(cpa);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Adjust the number of pages with the result of the
+ * CPA operation. Either a large page has been
+ * preserved or a single page update happened.
+ */
+ BUG_ON(cpa->numpages > numpages || !cpa->numpages);
+ numpages -= cpa->numpages;
+ if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
+ cpa->curpage++;
+ else
+ *cpa->vaddr += cpa->numpages * PAGE_SIZE;
+
+ }
+ return 0;
+}
+
+/*
+ * Machine check recovery code needs to change cache mode of poisoned
+ * pages to UC to avoid speculative access logging another error. But
+ * passing the address of the 1:1 mapping to set_memory_uc() is a fine
+ * way to encourage a speculative access. So we cheat and flip the top
+ * bit of the address. This works fine for the code that updates the
+ * page tables. But at the end of the process we need to flush the cache
+ * and the non-canonical address causes a #GP fault when used by the
+ * CLFLUSH instruction.
+ *
+ * But in the common case we already have a canonical address. This code
+ * will fix the top bit if needed and is a no-op otherwise.
+ */
+static inline unsigned long make_addr_canonical_again(unsigned long addr)
+{
+#ifdef CONFIG_X86_64
+ return (long)(addr << 1) >> 1;
+#else
+ return addr;
+#endif
+}
+
+
+static int change_page_attr_set_clr(unsigned long *addr, int numpages,
+ pgprot_t mask_set, pgprot_t mask_clr,
+ int force_split, int in_flag,
+ struct page **pages)
+{
+ struct cpa_data cpa;
+ int ret, cache, checkalias;
+ unsigned long baddr = 0;
+
+ memset(&cpa, 0, sizeof(cpa));
+
+ /*
+ * Check, if we are requested to set a not supported
+ * feature. Clearing non-supported features is OK.
+ */
+ mask_set = canon_pgprot(mask_set);
+
+ if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
+ return 0;
+
+ /* Ensure we are PAGE_SIZE aligned */
+ if (in_flag & CPA_ARRAY) {
+ int i;
+ for (i = 0; i < numpages; i++) {
+ if (addr[i] & ~PAGE_MASK) {
+ addr[i] &= PAGE_MASK;
+ WARN_ON_ONCE(1);
+ }
+ }
+ } else if (!(in_flag & CPA_PAGES_ARRAY)) {
+ /*
+ * in_flag of CPA_PAGES_ARRAY implies it is aligned.
+ * No need to cehck in that case
+ */
+ if (*addr & ~PAGE_MASK) {
+ *addr &= PAGE_MASK;
+ /*
+ * People should not be passing in unaligned addresses:
+ */
+ WARN_ON_ONCE(1);
+ }
+ /*
+ * Save address for cache flush. *addr is modified in the call
+ * to __change_page_attr_set_clr() below.
+ */
+ baddr = make_addr_canonical_again(*addr);
+ }
+
+ /* Must avoid aliasing mappings in the highmem code */
+ kmap_flush_unused();
+
+ vm_unmap_aliases();
+
+ cpa.vaddr = addr;
+ cpa.pages = pages;
+ cpa.numpages = numpages;
+ cpa.mask_set = mask_set;
+ cpa.mask_clr = mask_clr;
+ cpa.flags = 0;
+ cpa.curpage = 0;
+ cpa.force_split = force_split;
+
+ if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
+ cpa.flags |= in_flag;
+
+ /* No alias checking for _NX bit modifications */
+ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
+ /* Has caller explicitly disabled alias checking? */
+ if (in_flag & CPA_NO_CHECK_ALIAS)
+ checkalias = 0;
+
+ ret = __change_page_attr_set_clr(&cpa, checkalias);
+
+ /*
+ * Check whether we really changed something:
+ */
+ if (!(cpa.flags & CPA_FLUSHTLB))
+ goto out;
+
+ /*
+ * No need to flush, when we did not set any of the caching
+ * attributes:
+ */
+ cache = !!pgprot2cachemode(mask_set);
+
+ /*
+ * On success we use CLFLUSH, when the CPU supports it to
+ * avoid the WBINVD. If the CPU does not support it and in the
+ * error case we fall back to cpa_flush_all (which uses
+ * WBINVD):
+ */
+ if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) {
+ if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
+ cpa_flush_array(addr, numpages, cache,
+ cpa.flags, pages);
+ } else
+ cpa_flush_range(baddr, numpages, cache);
+ } else
+ cpa_flush_all(cache);
+
+out:
+ return ret;
+}
+
+static inline int change_page_attr_set(unsigned long *addr, int numpages,
+ pgprot_t mask, int array)
+{
+ return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
+ (array ? CPA_ARRAY : 0), NULL);
+}
+
+static inline int change_page_attr_clear(unsigned long *addr, int numpages,
+ pgprot_t mask, int array)
+{
+ return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
+ (array ? CPA_ARRAY : 0), NULL);
+}
+
+static inline int cpa_set_pages_array(struct page **pages, int numpages,
+ pgprot_t mask)
+{
+ return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
+ CPA_PAGES_ARRAY, pages);
+}
+
+static inline int cpa_clear_pages_array(struct page **pages, int numpages,
+ pgprot_t mask)
+{
+ return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
+ CPA_PAGES_ARRAY, pages);
+}
+
+int _set_memory_uc(unsigned long addr, int numpages)
+{
+ /*
+ * for now UC MINUS. see comments in ioremap_nocache()
+ * If you really need strong UC use ioremap_uc(), but note
+ * that you cannot override IO areas with set_memory_*() as
+ * these helpers cannot work with IO memory.
+ */
+ return change_page_attr_set(&addr, numpages,
+ cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
+ 0);
+}
+
+int set_memory_uc(unsigned long addr, int numpages)
+{
+ int ret;
+
+ /*
+ * for now UC MINUS. see comments in ioremap_nocache()
+ */
+ ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+ _PAGE_CACHE_MODE_UC_MINUS, NULL);
+ if (ret)
+ goto out_err;
+
+ ret = _set_memory_uc(addr, numpages);
+ if (ret)
+ goto out_free;
+
+ return 0;
+
+out_free:
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+out_err:
+ return ret;
+}
+EXPORT_SYMBOL(set_memory_uc);
+
+static int _set_memory_array(unsigned long *addr, int addrinarray,
+ enum page_cache_mode new_type)
+{
+ enum page_cache_mode set_type;
+ int i, j;
+ int ret;
+
+ for (i = 0; i < addrinarray; i++) {
+ ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
+ new_type, NULL);
+ if (ret)
+ goto out_free;
+ }
+
+ /* If WC, set to UC- first and then WC */
+ set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
+ _PAGE_CACHE_MODE_UC_MINUS : new_type;
+
+ ret = change_page_attr_set(addr, addrinarray,
+ cachemode2pgprot(set_type), 1);
+
+ if (!ret && new_type == _PAGE_CACHE_MODE_WC)
+ ret = change_page_attr_set_clr(addr, addrinarray,
+ cachemode2pgprot(
+ _PAGE_CACHE_MODE_WC),
+ __pgprot(_PAGE_CACHE_MASK),
+ 0, CPA_ARRAY, NULL);
+ if (ret)
+ goto out_free;
+
+ return 0;
+
+out_free:
+ for (j = 0; j < i; j++)
+ free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE);
+
+ return ret;
+}
+
+int set_memory_array_uc(unsigned long *addr, int addrinarray)
+{
+ return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
+}
+EXPORT_SYMBOL(set_memory_array_uc);
+
+int set_memory_array_wc(unsigned long *addr, int addrinarray)
+{
+ return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC);
+}
+EXPORT_SYMBOL(set_memory_array_wc);
+
+int set_memory_array_wt(unsigned long *addr, int addrinarray)
+{
+ return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT);
+}
+EXPORT_SYMBOL_GPL(set_memory_array_wt);
+
+int _set_memory_wc(unsigned long addr, int numpages)
+{
+ int ret;
+ unsigned long addr_copy = addr;
+
+ ret = change_page_attr_set(&addr, numpages,
+ cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
+ 0);
+ if (!ret) {
+ ret = change_page_attr_set_clr(&addr_copy, numpages,
+ cachemode2pgprot(
+ _PAGE_CACHE_MODE_WC),
+ __pgprot(_PAGE_CACHE_MASK),
+ 0, 0, NULL);
+ }
+ return ret;
+}
+
+int set_memory_wc(unsigned long addr, int numpages)
+{
+ int ret;
+
+ ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+ _PAGE_CACHE_MODE_WC, NULL);
+ if (ret)
+ return ret;
+
+ ret = _set_memory_wc(addr, numpages);
+ if (ret)
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+
+ return ret;
+}
+EXPORT_SYMBOL(set_memory_wc);
+
+int _set_memory_wt(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(&addr, numpages,
+ cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0);
+}
+
+int set_memory_wt(unsigned long addr, int numpages)
+{
+ int ret;
+
+ ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+ _PAGE_CACHE_MODE_WT, NULL);
+ if (ret)
+ return ret;
+
+ ret = _set_memory_wt(addr, numpages);
+ if (ret)
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(set_memory_wt);
+
+int _set_memory_wb(unsigned long addr, int numpages)
+{
+ /* WB cache mode is hard wired to all cache attribute bits being 0 */
+ return change_page_attr_clear(&addr, numpages,
+ __pgprot(_PAGE_CACHE_MASK), 0);
+}
+
+int set_memory_wb(unsigned long addr, int numpages)
+{
+ int ret;
+
+ ret = _set_memory_wb(addr, numpages);
+ if (ret)
+ return ret;
+
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+ return 0;
+}
+EXPORT_SYMBOL(set_memory_wb);
+
+int set_memory_array_wb(unsigned long *addr, int addrinarray)
+{
+ int i;
+ int ret;
+
+ /* WB cache mode is hard wired to all cache attribute bits being 0 */
+ ret = change_page_attr_clear(addr, addrinarray,
+ __pgprot(_PAGE_CACHE_MASK), 1);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < addrinarray; i++)
+ free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);
+
+ return 0;
+}
+EXPORT_SYMBOL(set_memory_array_wb);
+
+int set_memory_x(unsigned long addr, int numpages)
+{
+ if (!(__supported_pte_mask & _PAGE_NX))
+ return 0;
+
+ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
+}
+EXPORT_SYMBOL(set_memory_x);
+
+int set_memory_nx(unsigned long addr, int numpages)
+{
+ if (!(__supported_pte_mask & _PAGE_NX))
+ return 0;
+
+ return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
+}
+EXPORT_SYMBOL(set_memory_nx);
+
+int set_memory_ro(unsigned long addr, int numpages)
+{
+ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
+}
+
+int set_memory_rw(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
+}
+
+int set_memory_np(unsigned long addr, int numpages)
+{
+ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
+}
+
+int set_memory_np_noalias(unsigned long addr, int numpages)
+{
+ int cpa_flags = CPA_NO_CHECK_ALIAS;
+
+ return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
+ __pgprot(_PAGE_PRESENT), 0,
+ cpa_flags, NULL);
+}
+
+int set_memory_4k(unsigned long addr, int numpages)
+{
+ return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
+ __pgprot(0), 1, 0, NULL);
+}
+
+int set_memory_nonglobal(unsigned long addr, int numpages)
+{
+ return change_page_attr_clear(&addr, numpages,
+ __pgprot(_PAGE_GLOBAL), 0);
+}
+
+int set_memory_global(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(&addr, numpages,
+ __pgprot(_PAGE_GLOBAL), 0);
+}
+
+static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
+{
+ struct cpa_data cpa;
+ unsigned long start;
+ int ret;
+
+ /* Nothing to do if memory encryption is not active */
+ if (!mem_encrypt_active())
+ return 0;
+
+ /* Should not be working on unaligned addresses */
+ if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
+ addr &= PAGE_MASK;
+
+ start = addr;
+
+ memset(&cpa, 0, sizeof(cpa));
+ cpa.vaddr = &addr;
+ cpa.numpages = numpages;
+ cpa.mask_set = enc ? __pgprot(_PAGE_ENC) : __pgprot(0);
+ cpa.mask_clr = enc ? __pgprot(0) : __pgprot(_PAGE_ENC);
+ cpa.pgd = init_mm.pgd;
+
+ /* Must avoid aliasing mappings in the highmem code */
+ kmap_flush_unused();
+ vm_unmap_aliases();
+
+ /*
+ * Before changing the encryption attribute, we need to flush caches.
+ */
+ if (static_cpu_has(X86_FEATURE_CLFLUSH))
+ cpa_flush_range(start, numpages, 1);
+ else
+ cpa_flush_all(1);
+
+ ret = __change_page_attr_set_clr(&cpa, 1);
+
+ /*
+ * After changing the encryption attribute, we need to flush TLBs
+ * again in case any speculative TLB caching occurred (but no need
+ * to flush caches again). We could just use cpa_flush_all(), but
+ * in case TLB flushing gets optimized in the cpa_flush_range()
+ * path use the same logic as above.
+ */
+ if (static_cpu_has(X86_FEATURE_CLFLUSH))
+ cpa_flush_range(start, numpages, 0);
+ else
+ cpa_flush_all(0);
+
+ return ret;
+}
+
+int set_memory_encrypted(unsigned long addr, int numpages)
+{
+ return __set_memory_enc_dec(addr, numpages, true);
+}
+EXPORT_SYMBOL_GPL(set_memory_encrypted);
+
+int set_memory_decrypted(unsigned long addr, int numpages)
+{
+ return __set_memory_enc_dec(addr, numpages, false);
+}
+EXPORT_SYMBOL_GPL(set_memory_decrypted);
+
+int set_pages_uc(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_uc(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_uc);
+
+static int _set_pages_array(struct page **pages, int addrinarray,
+ enum page_cache_mode new_type)
+{
+ unsigned long start;
+ unsigned long end;
+ enum page_cache_mode set_type;
+ int i;
+ int free_idx;
+ int ret;
+
+ for (i = 0; i < addrinarray; i++) {
+ if (PageHighMem(pages[i]))
+ continue;
+ start = page_to_pfn(pages[i]) << PAGE_SHIFT;
+ end = start + PAGE_SIZE;
+ if (reserve_memtype(start, end, new_type, NULL))
+ goto err_out;
+ }
+
+ /* If WC, set to UC- first and then WC */
+ set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
+ _PAGE_CACHE_MODE_UC_MINUS : new_type;
+
+ ret = cpa_set_pages_array(pages, addrinarray,
+ cachemode2pgprot(set_type));
+ if (!ret && new_type == _PAGE_CACHE_MODE_WC)
+ ret = change_page_attr_set_clr(NULL, addrinarray,
+ cachemode2pgprot(
+ _PAGE_CACHE_MODE_WC),
+ __pgprot(_PAGE_CACHE_MASK),
+ 0, CPA_PAGES_ARRAY, pages);
+ if (ret)
+ goto err_out;
+ return 0; /* Success */
+err_out:
+ free_idx = i;
+ for (i = 0; i < free_idx; i++) {
+ if (PageHighMem(pages[i]))
+ continue;
+ start = page_to_pfn(pages[i]) << PAGE_SHIFT;
+ end = start + PAGE_SIZE;
+ free_memtype(start, end);
+ }
+ return -EINVAL;
+}
+
+int set_pages_array_uc(struct page **pages, int addrinarray)
+{
+ return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
+}
+EXPORT_SYMBOL(set_pages_array_uc);
+
+int set_pages_array_wc(struct page **pages, int addrinarray)
+{
+ return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC);
+}
+EXPORT_SYMBOL(set_pages_array_wc);
+
+int set_pages_array_wt(struct page **pages, int addrinarray)
+{
+ return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WT);
+}
+EXPORT_SYMBOL_GPL(set_pages_array_wt);
+
+int set_pages_wb(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_wb(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_wb);
+
+int set_pages_array_wb(struct page **pages, int addrinarray)
+{
+ int retval;
+ unsigned long start;
+ unsigned long end;
+ int i;
+
+ /* WB cache mode is hard wired to all cache attribute bits being 0 */
+ retval = cpa_clear_pages_array(pages, addrinarray,
+ __pgprot(_PAGE_CACHE_MASK));
+ if (retval)
+ return retval;
+
+ for (i = 0; i < addrinarray; i++) {
+ if (PageHighMem(pages[i]))
+ continue;
+ start = page_to_pfn(pages[i]) << PAGE_SHIFT;
+ end = start + PAGE_SIZE;
+ free_memtype(start, end);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(set_pages_array_wb);
+
+int set_pages_x(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_x(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_x);
+
+int set_pages_nx(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_nx(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_nx);
+
+int set_pages_ro(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_ro(addr, numpages);
+}
+
+int set_pages_rw(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_rw(addr, numpages);
+}
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+
+static int __set_pages_p(struct page *page, int numpages)
+{
+ unsigned long tempaddr = (unsigned long) page_address(page);
+ struct cpa_data cpa = { .vaddr = &tempaddr,
+ .pgd = NULL,
+ .numpages = numpages,
+ .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+ .mask_clr = __pgprot(0),
+ .flags = 0};
+
+ /*
+ * No alias checking needed for setting present flag. otherwise,
+ * we may need to break large pages for 64-bit kernel text
+ * mappings (this adds to complexity if we want to do this from
+ * atomic context especially). Let's keep it simple!
+ */
+ return __change_page_attr_set_clr(&cpa, 0);
+}
+
+static int __set_pages_np(struct page *page, int numpages)
+{
+ unsigned long tempaddr = (unsigned long) page_address(page);
+ struct cpa_data cpa = { .vaddr = &tempaddr,
+ .pgd = NULL,
+ .numpages = numpages,
+ .mask_set = __pgprot(0),
+ .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+ .flags = 0};
+
+ /*
+ * No alias checking needed for setting not present flag. otherwise,
+ * we may need to break large pages for 64-bit kernel text
+ * mappings (this adds to complexity if we want to do this from
+ * atomic context especially). Let's keep it simple!
+ */
+ return __change_page_attr_set_clr(&cpa, 0);
+}
+
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ if (PageHighMem(page))
+ return;
+ if (!enable) {
+ debug_check_no_locks_freed(page_address(page),
+ numpages * PAGE_SIZE);
+ }
+
+ /*
+ * The return value is ignored as the calls cannot fail.
+ * Large pages for identity mappings are not used at boot time
+ * and hence no memory allocations during large page split.
+ */
+ if (enable)
+ __set_pages_p(page, numpages);
+ else
+ __set_pages_np(page, numpages);
+
+ /*
+ * We should perform an IPI and flush all tlbs,
+ * but that can deadlock->flush only current cpu.
+ * Preemption needs to be disabled around __flush_tlb_all() due to
+ * CR3 reload in __native_flush_tlb().
+ */
+ preempt_disable();
+ __flush_tlb_all();
+ preempt_enable();
+
+ arch_flush_lazy_mmu_mode();
+}
+
+#ifdef CONFIG_HIBERNATION
+
+bool kernel_page_present(struct page *page)
+{
+ unsigned int level;
+ pte_t *pte;
+
+ if (PageHighMem(page))
+ return false;
+
+ pte = lookup_address((unsigned long)page_address(page), &level);
+ return (pte_val(*pte) & _PAGE_PRESENT);
+}
+
+#endif /* CONFIG_HIBERNATION */
+
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
+int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
+ unsigned numpages, unsigned long page_flags)
+{
+ int retval = -EINVAL;
+
+ struct cpa_data cpa = {
+ .vaddr = &address,
+ .pfn = pfn,
+ .pgd = pgd,
+ .numpages = numpages,
+ .mask_set = __pgprot(0),
+ .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)),
+ .flags = 0,
+ };
+
+ if (!(__supported_pte_mask & _PAGE_NX))
+ goto out;
+
+ if (!(page_flags & _PAGE_ENC))
+ cpa.mask_clr = pgprot_encrypted(cpa.mask_clr);
+
+ cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
+
+ retval = __change_page_attr_set_clr(&cpa, 0);
+ __flush_tlb_all();
+
+out:
+ return retval;
+}
+
+/*
+ * The testcases use internal knowledge of the implementation that shouldn't
+ * be exposed to the rest of the kernel. Include these directly here.
+ */
+#ifdef CONFIG_CPA_DEBUG
+#include "pageattr-test.c"
+#endif
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
new file mode 100644
index 000000000..b33304c00
--- /dev/null
+++ b/arch/x86/mm/pat.c
@@ -0,0 +1,1184 @@
+/*
+ * Handle caching attributes in page tables (PAT)
+ *
+ * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * Suresh B Siddha <suresh.b.siddha@intel.com>
+ *
+ * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
+ */
+
+#include <linux/seq_file.h>
+#include <linux/bootmem.h>
+#include <linux/debugfs.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/pfn_t.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/rbtree.h>
+
+#include <asm/cacheflush.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/x86_init.h>
+#include <asm/pgtable.h>
+#include <asm/fcntl.h>
+#include <asm/e820/api.h>
+#include <asm/mtrr.h>
+#include <asm/page.h>
+#include <asm/msr.h>
+#include <asm/pat.h>
+#include <asm/io.h>
+
+#include "pat_internal.h"
+#include "mm_internal.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) "" fmt
+
+static bool __read_mostly boot_cpu_done;
+static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT);
+static bool __read_mostly pat_initialized;
+static bool __read_mostly init_cm_done;
+
+void pat_disable(const char *reason)
+{
+ if (pat_disabled)
+ return;
+
+ if (boot_cpu_done) {
+ WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n");
+ return;
+ }
+
+ pat_disabled = true;
+ pr_info("x86/PAT: %s\n", reason);
+}
+
+static int __init nopat(char *str)
+{
+ pat_disable("PAT support disabled.");
+ return 0;
+}
+early_param("nopat", nopat);
+
+bool pat_enabled(void)
+{
+ return pat_initialized;
+}
+EXPORT_SYMBOL_GPL(pat_enabled);
+
+int pat_debug_enable;
+
+static int __init pat_debug_setup(char *str)
+{
+ pat_debug_enable = 1;
+ return 1;
+}
+__setup("debugpat", pat_debug_setup);
+
+#ifdef CONFIG_X86_PAT
+/*
+ * X86 PAT uses page flags arch_1 and uncached together to keep track of
+ * memory type of pages that have backing page struct.
+ *
+ * X86 PAT supports 4 different memory types:
+ * - _PAGE_CACHE_MODE_WB
+ * - _PAGE_CACHE_MODE_WC
+ * - _PAGE_CACHE_MODE_UC_MINUS
+ * - _PAGE_CACHE_MODE_WT
+ *
+ * _PAGE_CACHE_MODE_WB is the default type.
+ */
+
+#define _PGMT_WB 0
+#define _PGMT_WC (1UL << PG_arch_1)
+#define _PGMT_UC_MINUS (1UL << PG_uncached)
+#define _PGMT_WT (1UL << PG_uncached | 1UL << PG_arch_1)
+#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1)
+#define _PGMT_CLEAR_MASK (~_PGMT_MASK)
+
+static inline enum page_cache_mode get_page_memtype(struct page *pg)
+{
+ unsigned long pg_flags = pg->flags & _PGMT_MASK;
+
+ if (pg_flags == _PGMT_WB)
+ return _PAGE_CACHE_MODE_WB;
+ else if (pg_flags == _PGMT_WC)
+ return _PAGE_CACHE_MODE_WC;
+ else if (pg_flags == _PGMT_UC_MINUS)
+ return _PAGE_CACHE_MODE_UC_MINUS;
+ else
+ return _PAGE_CACHE_MODE_WT;
+}
+
+static inline void set_page_memtype(struct page *pg,
+ enum page_cache_mode memtype)
+{
+ unsigned long memtype_flags;
+ unsigned long old_flags;
+ unsigned long new_flags;
+
+ switch (memtype) {
+ case _PAGE_CACHE_MODE_WC:
+ memtype_flags = _PGMT_WC;
+ break;
+ case _PAGE_CACHE_MODE_UC_MINUS:
+ memtype_flags = _PGMT_UC_MINUS;
+ break;
+ case _PAGE_CACHE_MODE_WT:
+ memtype_flags = _PGMT_WT;
+ break;
+ case _PAGE_CACHE_MODE_WB:
+ default:
+ memtype_flags = _PGMT_WB;
+ break;
+ }
+
+ do {
+ old_flags = pg->flags;
+ new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
+ } while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
+}
+#else
+static inline enum page_cache_mode get_page_memtype(struct page *pg)
+{
+ return -1;
+}
+static inline void set_page_memtype(struct page *pg,
+ enum page_cache_mode memtype)
+{
+}
+#endif
+
+enum {
+ PAT_UC = 0, /* uncached */
+ PAT_WC = 1, /* Write combining */
+ PAT_WT = 4, /* Write Through */
+ PAT_WP = 5, /* Write Protected */
+ PAT_WB = 6, /* Write Back (default) */
+ PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */
+};
+
+#define CM(c) (_PAGE_CACHE_MODE_ ## c)
+
+static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
+{
+ enum page_cache_mode cache;
+ char *cache_mode;
+
+ switch (pat_val) {
+ case PAT_UC: cache = CM(UC); cache_mode = "UC "; break;
+ case PAT_WC: cache = CM(WC); cache_mode = "WC "; break;
+ case PAT_WT: cache = CM(WT); cache_mode = "WT "; break;
+ case PAT_WP: cache = CM(WP); cache_mode = "WP "; break;
+ case PAT_WB: cache = CM(WB); cache_mode = "WB "; break;
+ case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
+ default: cache = CM(WB); cache_mode = "WB "; break;
+ }
+
+ memcpy(msg, cache_mode, 4);
+
+ return cache;
+}
+
+#undef CM
+
+/*
+ * Update the cache mode to pgprot translation tables according to PAT
+ * configuration.
+ * Using lower indices is preferred, so we start with highest index.
+ */
+static void __init_cache_modes(u64 pat)
+{
+ enum page_cache_mode cache;
+ char pat_msg[33];
+ int i;
+
+ pat_msg[32] = 0;
+ for (i = 7; i >= 0; i--) {
+ cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
+ pat_msg + 4 * i);
+ update_cache_mode_entry(i, cache);
+ }
+ pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);
+
+ init_cm_done = true;
+}
+
+#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
+
+static void pat_bsp_init(u64 pat)
+{
+ u64 tmp_pat;
+
+ if (!boot_cpu_has(X86_FEATURE_PAT)) {
+ pat_disable("PAT not supported by CPU.");
+ return;
+ }
+
+ rdmsrl(MSR_IA32_CR_PAT, tmp_pat);
+ if (!tmp_pat) {
+ pat_disable("PAT MSR is 0, disabled.");
+ return;
+ }
+
+ wrmsrl(MSR_IA32_CR_PAT, pat);
+ pat_initialized = true;
+
+ __init_cache_modes(pat);
+}
+
+static void pat_ap_init(u64 pat)
+{
+ if (!boot_cpu_has(X86_FEATURE_PAT)) {
+ /*
+ * If this happens we are on a secondary CPU, but switched to
+ * PAT on the boot CPU. We have no way to undo PAT.
+ */
+ panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n");
+ }
+
+ wrmsrl(MSR_IA32_CR_PAT, pat);
+}
+
+void init_cache_modes(void)
+{
+ u64 pat = 0;
+
+ if (init_cm_done)
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_PAT)) {
+ /*
+ * CPU supports PAT. Set PAT table to be consistent with
+ * PAT MSR. This case supports "nopat" boot option, and
+ * virtual machine environments which support PAT without
+ * MTRRs. In specific, Xen has unique setup to PAT MSR.
+ *
+ * If PAT MSR returns 0, it is considered invalid and emulates
+ * as No PAT.
+ */
+ rdmsrl(MSR_IA32_CR_PAT, pat);
+ }
+
+ if (!pat) {
+ /*
+ * No PAT. Emulate the PAT table that corresponds to the two
+ * cache bits, PWT (Write Through) and PCD (Cache Disable).
+ * This setup is also the same as the BIOS default setup.
+ *
+ * PTE encoding:
+ *
+ * PCD
+ * |PWT PAT
+ * || slot
+ * 00 0 WB : _PAGE_CACHE_MODE_WB
+ * 01 1 WT : _PAGE_CACHE_MODE_WT
+ * 10 2 UC-: _PAGE_CACHE_MODE_UC_MINUS
+ * 11 3 UC : _PAGE_CACHE_MODE_UC
+ *
+ * NOTE: When WC or WP is used, it is redirected to UC- per
+ * the default setup in __cachemode2pte_tbl[].
+ */
+ pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |
+ PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC);
+ }
+
+ __init_cache_modes(pat);
+}
+
+/**
+ * pat_init - Initialize PAT MSR and PAT table
+ *
+ * This function initializes PAT MSR and PAT table with an OS-defined value
+ * to enable additional cache attributes, WC, WT and WP.
+ *
+ * This function must be called on all CPUs using the specific sequence of
+ * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this
+ * procedure for PAT.
+ */
+void pat_init(void)
+{
+ u64 pat;
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (pat_disabled)
+ return;
+
+ if ((c->x86_vendor == X86_VENDOR_INTEL) &&
+ (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
+ ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
+ /*
+ * PAT support with the lower four entries. Intel Pentium 2,
+ * 3, M, and 4 are affected by PAT errata, which makes the
+ * upper four entries unusable. To be on the safe side, we don't
+ * use those.
+ *
+ * PTE encoding:
+ * PAT
+ * |PCD
+ * ||PWT PAT
+ * ||| slot
+ * 000 0 WB : _PAGE_CACHE_MODE_WB
+ * 001 1 WC : _PAGE_CACHE_MODE_WC
+ * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS
+ * 011 3 UC : _PAGE_CACHE_MODE_UC
+ * PAT bit unused
+ *
+ * NOTE: When WT or WP is used, it is redirected to UC- per
+ * the default setup in __cachemode2pte_tbl[].
+ */
+ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
+ PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
+ } else {
+ /*
+ * Full PAT support. We put WT in slot 7 to improve
+ * robustness in the presence of errata that might cause
+ * the high PAT bit to be ignored. This way, a buggy slot 7
+ * access will hit slot 3, and slot 3 is UC, so at worst
+ * we lose performance without causing a correctness issue.
+ * Pentium 4 erratum N46 is an example for such an erratum,
+ * although we try not to use PAT at all on affected CPUs.
+ *
+ * PTE encoding:
+ * PAT
+ * |PCD
+ * ||PWT PAT
+ * ||| slot
+ * 000 0 WB : _PAGE_CACHE_MODE_WB
+ * 001 1 WC : _PAGE_CACHE_MODE_WC
+ * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS
+ * 011 3 UC : _PAGE_CACHE_MODE_UC
+ * 100 4 WB : Reserved
+ * 101 5 WP : _PAGE_CACHE_MODE_WP
+ * 110 6 UC-: Reserved
+ * 111 7 WT : _PAGE_CACHE_MODE_WT
+ *
+ * The reserved slots are unused, but mapped to their
+ * corresponding types in the presence of PAT errata.
+ */
+ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
+ PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);
+ }
+
+ if (!boot_cpu_done) {
+ pat_bsp_init(pat);
+ boot_cpu_done = true;
+ } else {
+ pat_ap_init(pat);
+ }
+}
+
+#undef PAT
+
+static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */
+
+/*
+ * Does intersection of PAT memory type and MTRR memory type and returns
+ * the resulting memory type as PAT understands it.
+ * (Type in pat and mtrr will not have same value)
+ * The intersection is based on "Effective Memory Type" tables in IA-32
+ * SDM vol 3a
+ */
+static unsigned long pat_x_mtrr_type(u64 start, u64 end,
+ enum page_cache_mode req_type)
+{
+ /*
+ * Look for MTRR hint to get the effective type in case where PAT
+ * request is for WB.
+ */
+ if (req_type == _PAGE_CACHE_MODE_WB) {
+ u8 mtrr_type, uniform;
+
+ mtrr_type = mtrr_type_lookup(start, end, &uniform);
+ if (mtrr_type != MTRR_TYPE_WRBACK)
+ return _PAGE_CACHE_MODE_UC_MINUS;
+
+ return _PAGE_CACHE_MODE_WB;
+ }
+
+ return req_type;
+}
+
+struct pagerange_state {
+ unsigned long cur_pfn;
+ int ram;
+ int not_ram;
+};
+
+static int
+pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
+{
+ struct pagerange_state *state = arg;
+
+ state->not_ram |= initial_pfn > state->cur_pfn;
+ state->ram |= total_nr_pages > 0;
+ state->cur_pfn = initial_pfn + total_nr_pages;
+
+ return state->ram && state->not_ram;
+}
+
+static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
+{
+ int ret = 0;
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ struct pagerange_state state = {start_pfn, 0, 0};
+
+ /*
+ * For legacy reasons, physical address range in the legacy ISA
+ * region is tracked as non-RAM. This will allow users of
+ * /dev/mem to map portions of legacy ISA region, even when
+ * some of those portions are listed(or not even listed) with
+ * different e820 types(RAM/reserved/..)
+ */
+ if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
+ start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;
+
+ if (start_pfn < end_pfn) {
+ ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
+ &state, pagerange_is_ram_callback);
+ }
+
+ return (ret > 0) ? -1 : (state.ram ? 1 : 0);
+}
+
+/*
+ * For RAM pages, we use page flags to mark the pages with appropriate type.
+ * The page flags are limited to four types, WB (default), WC, WT and UC-.
+ * WP request fails with -EINVAL, and UC gets redirected to UC-. Setting
+ * a new memory type is only allowed for a page mapped with the default WB
+ * type.
+ *
+ * Here we do two passes:
+ * - Find the memtype of all the pages in the range, look for any conflicts.
+ * - In case of no conflicts, set the new memtype for pages in the range.
+ */
+static int reserve_ram_pages_type(u64 start, u64 end,
+ enum page_cache_mode req_type,
+ enum page_cache_mode *new_type)
+{
+ struct page *page;
+ u64 pfn;
+
+ if (req_type == _PAGE_CACHE_MODE_WP) {
+ if (new_type)
+ *new_type = _PAGE_CACHE_MODE_UC_MINUS;
+ return -EINVAL;
+ }
+
+ if (req_type == _PAGE_CACHE_MODE_UC) {
+ /* We do not support strong UC */
+ WARN_ON_ONCE(1);
+ req_type = _PAGE_CACHE_MODE_UC_MINUS;
+ }
+
+ for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
+ enum page_cache_mode type;
+
+ page = pfn_to_page(pfn);
+ type = get_page_memtype(page);
+ if (type != _PAGE_CACHE_MODE_WB) {
+ pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
+ start, end - 1, type, req_type);
+ if (new_type)
+ *new_type = type;
+
+ return -EBUSY;
+ }
+ }
+
+ if (new_type)
+ *new_type = req_type;
+
+ for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
+ page = pfn_to_page(pfn);
+ set_page_memtype(page, req_type);
+ }
+ return 0;
+}
+
+static int free_ram_pages_type(u64 start, u64 end)
+{
+ struct page *page;
+ u64 pfn;
+
+ for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
+ page = pfn_to_page(pfn);
+ set_page_memtype(page, _PAGE_CACHE_MODE_WB);
+ }
+ return 0;
+}
+
+static u64 sanitize_phys(u64 address)
+{
+ /*
+ * When changing the memtype for pages containing poison allow
+ * for a "decoy" virtual address (bit 63 clear) passed to
+ * set_memory_X(). __pa() on a "decoy" address results in a
+ * physical address with bit 63 set.
+ *
+ * Decoy addresses are not present for 32-bit builds, see
+ * set_mce_nospec().
+ */
+ if (IS_ENABLED(CONFIG_X86_64))
+ return address & __PHYSICAL_MASK;
+ return address;
+}
+
+/*
+ * req_type typically has one of the:
+ * - _PAGE_CACHE_MODE_WB
+ * - _PAGE_CACHE_MODE_WC
+ * - _PAGE_CACHE_MODE_UC_MINUS
+ * - _PAGE_CACHE_MODE_UC
+ * - _PAGE_CACHE_MODE_WT
+ *
+ * If new_type is NULL, function will return an error if it cannot reserve the
+ * region with req_type. If new_type is non-NULL, function will return
+ * available type in new_type in case of no error. In case of any error
+ * it will return a negative return value.
+ */
+int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
+ enum page_cache_mode *new_type)
+{
+ struct memtype *new;
+ enum page_cache_mode actual_type;
+ int is_range_ram;
+ int err = 0;
+
+ start = sanitize_phys(start);
+ end = sanitize_phys(end);
+ if (start >= end) {
+ WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__,
+ start, end - 1, cattr_name(req_type));
+ return -EINVAL;
+ }
+
+ if (!pat_enabled()) {
+ /* This is identical to page table setting without PAT */
+ if (new_type)
+ *new_type = req_type;
+ return 0;
+ }
+
+ /* Low ISA region is always mapped WB in page table. No need to track */
+ if (x86_platform.is_untracked_pat_range(start, end)) {
+ if (new_type)
+ *new_type = _PAGE_CACHE_MODE_WB;
+ return 0;
+ }
+
+ /*
+ * Call mtrr_lookup to get the type hint. This is an
+ * optimization for /dev/mem mmap'ers into WB memory (BIOS
+ * tools and ACPI tools). Use WB request for WB memory and use
+ * UC_MINUS otherwise.
+ */
+ actual_type = pat_x_mtrr_type(start, end, req_type);
+
+ if (new_type)
+ *new_type = actual_type;
+
+ is_range_ram = pat_pagerange_is_ram(start, end);
+ if (is_range_ram == 1) {
+
+ err = reserve_ram_pages_type(start, end, req_type, new_type);
+
+ return err;
+ } else if (is_range_ram < 0) {
+ return -EINVAL;
+ }
+
+ new = kzalloc(sizeof(struct memtype), GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ new->start = start;
+ new->end = end;
+ new->type = actual_type;
+
+ spin_lock(&memtype_lock);
+
+ err = rbt_memtype_check_insert(new, new_type);
+ if (err) {
+ pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
+ start, end - 1,
+ cattr_name(new->type), cattr_name(req_type));
+ kfree(new);
+ spin_unlock(&memtype_lock);
+
+ return err;
+ }
+
+ spin_unlock(&memtype_lock);
+
+ dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
+ start, end - 1, cattr_name(new->type), cattr_name(req_type),
+ new_type ? cattr_name(*new_type) : "-");
+
+ return err;
+}
+
+int free_memtype(u64 start, u64 end)
+{
+ int err = -EINVAL;
+ int is_range_ram;
+ struct memtype *entry;
+
+ if (!pat_enabled())
+ return 0;
+
+ start = sanitize_phys(start);
+ end = sanitize_phys(end);
+
+ /* Low ISA region is always mapped WB. No need to track */
+ if (x86_platform.is_untracked_pat_range(start, end))
+ return 0;
+
+ is_range_ram = pat_pagerange_is_ram(start, end);
+ if (is_range_ram == 1) {
+
+ err = free_ram_pages_type(start, end);
+
+ return err;
+ } else if (is_range_ram < 0) {
+ return -EINVAL;
+ }
+
+ spin_lock(&memtype_lock);
+ entry = rbt_memtype_erase(start, end);
+ spin_unlock(&memtype_lock);
+
+ if (IS_ERR(entry)) {
+ pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
+ current->comm, current->pid, start, end - 1);
+ return -EINVAL;
+ }
+
+ kfree(entry);
+
+ dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1);
+
+ return 0;
+}
+
+
+/**
+ * lookup_memtype - Looksup the memory type for a physical address
+ * @paddr: physical address of which memory type needs to be looked up
+ *
+ * Only to be called when PAT is enabled
+ *
+ * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
+ * or _PAGE_CACHE_MODE_WT.
+ */
+static enum page_cache_mode lookup_memtype(u64 paddr)
+{
+ enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB;
+ struct memtype *entry;
+
+ if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
+ return rettype;
+
+ if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
+ struct page *page;
+
+ page = pfn_to_page(paddr >> PAGE_SHIFT);
+ return get_page_memtype(page);
+ }
+
+ spin_lock(&memtype_lock);
+
+ entry = rbt_memtype_lookup(paddr);
+ if (entry != NULL)
+ rettype = entry->type;
+ else
+ rettype = _PAGE_CACHE_MODE_UC_MINUS;
+
+ spin_unlock(&memtype_lock);
+ return rettype;
+}
+
+/**
+ * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type
+ * of @pfn cannot be overridden by UC MTRR memory type.
+ *
+ * Only to be called when PAT is enabled.
+ *
+ * Returns true, if the PAT memory type of @pfn is UC, UC-, or WC.
+ * Returns false in other cases.
+ */
+bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn)
+{
+ enum page_cache_mode cm = lookup_memtype(PFN_PHYS(pfn));
+
+ return cm == _PAGE_CACHE_MODE_UC ||
+ cm == _PAGE_CACHE_MODE_UC_MINUS ||
+ cm == _PAGE_CACHE_MODE_WC;
+}
+EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr);
+
+/**
+ * io_reserve_memtype - Request a memory type mapping for a region of memory
+ * @start: start (physical address) of the region
+ * @end: end (physical address) of the region
+ * @type: A pointer to memtype, with requested type. On success, requested
+ * or any other compatible type that was available for the region is returned
+ *
+ * On success, returns 0
+ * On failure, returns non-zero
+ */
+int io_reserve_memtype(resource_size_t start, resource_size_t end,
+ enum page_cache_mode *type)
+{
+ resource_size_t size = end - start;
+ enum page_cache_mode req_type = *type;
+ enum page_cache_mode new_type;
+ int ret;
+
+ WARN_ON_ONCE(iomem_map_sanity_check(start, size));
+
+ ret = reserve_memtype(start, end, req_type, &new_type);
+ if (ret)
+ goto out_err;
+
+ if (!is_new_memtype_allowed(start, size, req_type, new_type))
+ goto out_free;
+
+ if (kernel_map_sync_memtype(start, size, new_type) < 0)
+ goto out_free;
+
+ *type = new_type;
+ return 0;
+
+out_free:
+ free_memtype(start, end);
+ ret = -EBUSY;
+out_err:
+ return ret;
+}
+
+/**
+ * io_free_memtype - Release a memory type mapping for a region of memory
+ * @start: start (physical address) of the region
+ * @end: end (physical address) of the region
+ */
+void io_free_memtype(resource_size_t start, resource_size_t end)
+{
+ free_memtype(start, end);
+}
+
+int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size)
+{
+ enum page_cache_mode type = _PAGE_CACHE_MODE_WC;
+
+ return io_reserve_memtype(start, start + size, &type);
+}
+EXPORT_SYMBOL(arch_io_reserve_memtype_wc);
+
+void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
+{
+ io_free_memtype(start, start + size);
+}
+EXPORT_SYMBOL(arch_io_free_memtype_wc);
+
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t vma_prot)
+{
+ if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size))
+ vma_prot = pgprot_decrypted(vma_prot);
+
+ return vma_prot;
+}
+
+#ifdef CONFIG_STRICT_DEVMEM
+/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+ return 1;
+}
+#else
+/* This check is needed to avoid cache aliasing when PAT is enabled */
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+ u64 from = ((u64)pfn) << PAGE_SHIFT;
+ u64 to = from + size;
+ u64 cursor = from;
+
+ if (!pat_enabled())
+ return 1;
+
+ while (cursor < to) {
+ if (!devmem_is_allowed(pfn))
+ return 0;
+ cursor += PAGE_SIZE;
+ pfn++;
+ }
+ return 1;
+}
+#endif /* CONFIG_STRICT_DEVMEM */
+
+int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t *vma_prot)
+{
+ enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB;
+
+ if (!range_is_allowed(pfn, size))
+ return 0;
+
+ if (file->f_flags & O_DSYNC)
+ pcm = _PAGE_CACHE_MODE_UC_MINUS;
+
+ *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
+ cachemode2protval(pcm));
+ return 1;
+}
+
+/*
+ * Change the memory type for the physial address range in kernel identity
+ * mapping space if that range is a part of identity map.
+ */
+int kernel_map_sync_memtype(u64 base, unsigned long size,
+ enum page_cache_mode pcm)
+{
+ unsigned long id_sz;
+
+ if (base > __pa(high_memory-1))
+ return 0;
+
+ /*
+ * some areas in the middle of the kernel identity range
+ * are not mapped, like the PCI space.
+ */
+ if (!page_is_ram(base >> PAGE_SHIFT))
+ return 0;
+
+ id_sz = (__pa(high_memory-1) <= base + size) ?
+ __pa(high_memory) - base :
+ size;
+
+ if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
+ pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n",
+ current->comm, current->pid,
+ cattr_name(pcm),
+ base, (unsigned long long)(base + size-1));
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * Internal interface to reserve a range of physical memory with prot.
+ * Reserved non RAM regions only and after successful reserve_memtype,
+ * this func also keeps identity mapping (if any) in sync with this new prot.
+ */
+static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
+ int strict_prot)
+{
+ int is_ram = 0;
+ int ret;
+ enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot);
+ enum page_cache_mode pcm = want_pcm;
+
+ is_ram = pat_pagerange_is_ram(paddr, paddr + size);
+
+ /*
+ * reserve_pfn_range() for RAM pages. We do not refcount to keep
+ * track of number of mappings of RAM pages. We can assert that
+ * the type requested matches the type of first page in the range.
+ */
+ if (is_ram) {
+ if (!pat_enabled())
+ return 0;
+
+ pcm = lookup_memtype(paddr);
+ if (want_pcm != pcm) {
+ pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
+ current->comm, current->pid,
+ cattr_name(want_pcm),
+ (unsigned long long)paddr,
+ (unsigned long long)(paddr + size - 1),
+ cattr_name(pcm));
+ *vma_prot = __pgprot((pgprot_val(*vma_prot) &
+ (~_PAGE_CACHE_MASK)) |
+ cachemode2protval(pcm));
+ }
+ return 0;
+ }
+
+ ret = reserve_memtype(paddr, paddr + size, want_pcm, &pcm);
+ if (ret)
+ return ret;
+
+ if (pcm != want_pcm) {
+ if (strict_prot ||
+ !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
+ free_memtype(paddr, paddr + size);
+ pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
+ current->comm, current->pid,
+ cattr_name(want_pcm),
+ (unsigned long long)paddr,
+ (unsigned long long)(paddr + size - 1),
+ cattr_name(pcm));
+ return -EINVAL;
+ }
+ /*
+ * We allow returning different type than the one requested in
+ * non strict case.
+ */
+ *vma_prot = __pgprot((pgprot_val(*vma_prot) &
+ (~_PAGE_CACHE_MASK)) |
+ cachemode2protval(pcm));
+ }
+
+ if (kernel_map_sync_memtype(paddr, size, pcm) < 0) {
+ free_memtype(paddr, paddr + size);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * Internal interface to free a range of physical memory.
+ * Frees non RAM regions only.
+ */
+static void free_pfn_range(u64 paddr, unsigned long size)
+{
+ int is_ram;
+
+ is_ram = pat_pagerange_is_ram(paddr, paddr + size);
+ if (is_ram == 0)
+ free_memtype(paddr, paddr + size);
+}
+
+/*
+ * track_pfn_copy is called when vma that is covering the pfnmap gets
+ * copied through copy_page_range().
+ *
+ * If the vma has a linear pfn mapping for the entire range, we get the prot
+ * from pte and reserve the entire vma range with single reserve_pfn_range call.
+ */
+int track_pfn_copy(struct vm_area_struct *vma)
+{
+ resource_size_t paddr;
+ unsigned long prot;
+ unsigned long vma_size = vma->vm_end - vma->vm_start;
+ pgprot_t pgprot;
+
+ if (vma->vm_flags & VM_PAT) {
+ /*
+ * reserve the whole chunk covered by vma. We need the
+ * starting address and protection from pte.
+ */
+ if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+ pgprot = __pgprot(prot);
+ return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
+ }
+
+ return 0;
+}
+
+/*
+ * prot is passed in as a parameter for the new mapping. If the vma has
+ * a linear pfn mapping for the entire range, or no vma is provided,
+ * reserve the entire pfn + size range with single reserve_pfn_range
+ * call.
+ */
+int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
+ unsigned long pfn, unsigned long addr, unsigned long size)
+{
+ resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
+ enum page_cache_mode pcm;
+
+ /* reserve the whole chunk starting from paddr */
+ if (!vma || (addr == vma->vm_start
+ && size == (vma->vm_end - vma->vm_start))) {
+ int ret;
+
+ ret = reserve_pfn_range(paddr, size, prot, 0);
+ if (ret == 0 && vma)
+ vma->vm_flags |= VM_PAT;
+ return ret;
+ }
+
+ if (!pat_enabled())
+ return 0;
+
+ /*
+ * For anything smaller than the vma size we set prot based on the
+ * lookup.
+ */
+ pcm = lookup_memtype(paddr);
+
+ /* Check memtype for the remaining pages */
+ while (size > PAGE_SIZE) {
+ size -= PAGE_SIZE;
+ paddr += PAGE_SIZE;
+ if (pcm != lookup_memtype(paddr))
+ return -EINVAL;
+ }
+
+ *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
+ cachemode2protval(pcm));
+
+ return 0;
+}
+
+void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
+{
+ enum page_cache_mode pcm;
+
+ if (!pat_enabled())
+ return;
+
+ /* Set prot based on lookup */
+ pcm = lookup_memtype(pfn_t_to_phys(pfn));
+ *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
+ cachemode2protval(pcm));
+}
+
+/*
+ * untrack_pfn is called while unmapping a pfnmap for a region.
+ * untrack can be called for a specific region indicated by pfn and size or
+ * can be for the entire vma (in which case pfn, size are zero).
+ */
+void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
+ unsigned long size)
+{
+ resource_size_t paddr;
+ unsigned long prot;
+
+ if (vma && !(vma->vm_flags & VM_PAT))
+ return;
+
+ /* free the chunk starting from pfn or the whole chunk */
+ paddr = (resource_size_t)pfn << PAGE_SHIFT;
+ if (!paddr && !size) {
+ if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ size = vma->vm_end - vma->vm_start;
+ }
+ free_pfn_range(paddr, size);
+ if (vma)
+ vma->vm_flags &= ~VM_PAT;
+}
+
+/*
+ * untrack_pfn_moved is called, while mremapping a pfnmap for a new region,
+ * with the old vma after its pfnmap page table has been removed. The new
+ * vma has a new pfnmap to the same pfn & cache type with VM_PAT set.
+ */
+void untrack_pfn_moved(struct vm_area_struct *vma)
+{
+ vma->vm_flags &= ~VM_PAT;
+}
+
+pgprot_t pgprot_writecombine(pgprot_t prot)
+{
+ return __pgprot(pgprot_val(prot) |
+ cachemode2protval(_PAGE_CACHE_MODE_WC));
+}
+EXPORT_SYMBOL_GPL(pgprot_writecombine);
+
+pgprot_t pgprot_writethrough(pgprot_t prot)
+{
+ return __pgprot(pgprot_val(prot) |
+ cachemode2protval(_PAGE_CACHE_MODE_WT));
+}
+EXPORT_SYMBOL_GPL(pgprot_writethrough);
+
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
+
+static struct memtype *memtype_get_idx(loff_t pos)
+{
+ struct memtype *print_entry;
+ int ret;
+
+ print_entry = kzalloc(sizeof(struct memtype), GFP_KERNEL);
+ if (!print_entry)
+ return NULL;
+
+ spin_lock(&memtype_lock);
+ ret = rbt_memtype_copy_nth_element(print_entry, pos);
+ spin_unlock(&memtype_lock);
+
+ if (!ret) {
+ return print_entry;
+ } else {
+ kfree(print_entry);
+ return NULL;
+ }
+}
+
+static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos == 0) {
+ ++*pos;
+ seq_puts(seq, "PAT memtype list:\n");
+ }
+
+ return memtype_get_idx(*pos);
+}
+
+static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ kfree(v);
+ ++*pos;
+ return memtype_get_idx(*pos);
+}
+
+static void memtype_seq_stop(struct seq_file *seq, void *v)
+{
+ kfree(v);
+}
+
+static int memtype_seq_show(struct seq_file *seq, void *v)
+{
+ struct memtype *print_entry = (struct memtype *)v;
+
+ seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
+ print_entry->start, print_entry->end);
+
+ return 0;
+}
+
+static const struct seq_operations memtype_seq_ops = {
+ .start = memtype_seq_start,
+ .next = memtype_seq_next,
+ .stop = memtype_seq_stop,
+ .show = memtype_seq_show,
+};
+
+static int memtype_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &memtype_seq_ops);
+}
+
+static const struct file_operations memtype_fops = {
+ .open = memtype_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init pat_memtype_list_init(void)
+{
+ if (pat_enabled()) {
+ debugfs_create_file("pat_memtype_list", S_IRUSR,
+ arch_debugfs_dir, NULL, &memtype_fops);
+ }
+ return 0;
+}
+
+late_initcall(pat_memtype_list_init);
+
+#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
new file mode 100644
index 000000000..eeb5caeb0
--- /dev/null
+++ b/arch/x86/mm/pat_internal.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PAT_INTERNAL_H_
+#define __PAT_INTERNAL_H_
+
+extern int pat_debug_enable;
+
+#define dprintk(fmt, arg...) \
+ do { if (pat_debug_enable) pr_info("x86/PAT: " fmt, ##arg); } while (0)
+
+struct memtype {
+ u64 start;
+ u64 end;
+ u64 subtree_max_end;
+ enum page_cache_mode type;
+ struct rb_node rb;
+};
+
+static inline char *cattr_name(enum page_cache_mode pcm)
+{
+ switch (pcm) {
+ case _PAGE_CACHE_MODE_UC: return "uncached";
+ case _PAGE_CACHE_MODE_UC_MINUS: return "uncached-minus";
+ case _PAGE_CACHE_MODE_WB: return "write-back";
+ case _PAGE_CACHE_MODE_WC: return "write-combining";
+ case _PAGE_CACHE_MODE_WT: return "write-through";
+ case _PAGE_CACHE_MODE_WP: return "write-protected";
+ default: return "broken";
+ }
+}
+
+#ifdef CONFIG_X86_PAT
+extern int rbt_memtype_check_insert(struct memtype *new,
+ enum page_cache_mode *new_type);
+extern struct memtype *rbt_memtype_erase(u64 start, u64 end);
+extern struct memtype *rbt_memtype_lookup(u64 addr);
+extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos);
+#else
+static inline int rbt_memtype_check_insert(struct memtype *new,
+ enum page_cache_mode *new_type)
+{ return 0; }
+static inline struct memtype *rbt_memtype_erase(u64 start, u64 end)
+{ return NULL; }
+static inline struct memtype *rbt_memtype_lookup(u64 addr)
+{ return NULL; }
+static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
+{ return 0; }
+#endif
+
+#endif /* __PAT_INTERNAL_H_ */
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
new file mode 100644
index 000000000..fa16036fa
--- /dev/null
+++ b/arch/x86/mm/pat_rbtree.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Handle caching attributes in page tables (PAT)
+ *
+ * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * Suresh B Siddha <suresh.b.siddha@intel.com>
+ *
+ * Interval tree (augmented rbtree) used to store the PAT memory type
+ * reservations.
+ */
+
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/rbtree_augmented.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+
+#include <asm/pgtable.h>
+#include <asm/pat.h>
+
+#include "pat_internal.h"
+
+/*
+ * The memtype tree keeps track of memory type for specific
+ * physical memory areas. Without proper tracking, conflicting memory
+ * types in different mappings can cause CPU cache corruption.
+ *
+ * The tree is an interval tree (augmented rbtree) with tree ordered
+ * on starting address. Tree can contain multiple entries for
+ * different regions which overlap. All the aliases have the same
+ * cache attributes of course.
+ *
+ * memtype_lock protects the rbtree.
+ */
+
+static struct rb_root memtype_rbroot = RB_ROOT;
+
+static int is_node_overlap(struct memtype *node, u64 start, u64 end)
+{
+ if (node->start >= end || node->end <= start)
+ return 0;
+
+ return 1;
+}
+
+static u64 get_subtree_max_end(struct rb_node *node)
+{
+ u64 ret = 0;
+ if (node) {
+ struct memtype *data = rb_entry(node, struct memtype, rb);
+ ret = data->subtree_max_end;
+ }
+ return ret;
+}
+
+static u64 compute_subtree_max_end(struct memtype *data)
+{
+ u64 max_end = data->end, child_max_end;
+
+ child_max_end = get_subtree_max_end(data->rb.rb_right);
+ if (child_max_end > max_end)
+ max_end = child_max_end;
+
+ child_max_end = get_subtree_max_end(data->rb.rb_left);
+ if (child_max_end > max_end)
+ max_end = child_max_end;
+
+ return max_end;
+}
+
+RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb,
+ u64, subtree_max_end, compute_subtree_max_end)
+
+/* Find the first (lowest start addr) overlapping range from rb tree */
+static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
+ u64 start, u64 end)
+{
+ struct rb_node *node = root->rb_node;
+ struct memtype *last_lower = NULL;
+
+ while (node) {
+ struct memtype *data = rb_entry(node, struct memtype, rb);
+
+ if (get_subtree_max_end(node->rb_left) > start) {
+ /* Lowest overlap if any must be on left side */
+ node = node->rb_left;
+ } else if (is_node_overlap(data, start, end)) {
+ last_lower = data;
+ break;
+ } else if (start >= data->start) {
+ /* Lowest overlap if any must be on right side */
+ node = node->rb_right;
+ } else {
+ break;
+ }
+ }
+ return last_lower; /* Returns NULL if there is no overlap */
+}
+
+enum {
+ MEMTYPE_EXACT_MATCH = 0,
+ MEMTYPE_END_MATCH = 1
+};
+
+static struct memtype *memtype_rb_match(struct rb_root *root,
+ u64 start, u64 end, int match_type)
+{
+ struct memtype *match;
+
+ match = memtype_rb_lowest_match(root, start, end);
+ while (match != NULL && match->start < end) {
+ struct rb_node *node;
+
+ if ((match_type == MEMTYPE_EXACT_MATCH) &&
+ (match->start == start) && (match->end == end))
+ return match;
+
+ if ((match_type == MEMTYPE_END_MATCH) &&
+ (match->start < start) && (match->end == end))
+ return match;
+
+ node = rb_next(&match->rb);
+ if (node)
+ match = rb_entry(node, struct memtype, rb);
+ else
+ match = NULL;
+ }
+
+ return NULL; /* Returns NULL if there is no match */
+}
+
+static int memtype_rb_check_conflict(struct rb_root *root,
+ u64 start, u64 end,
+ enum page_cache_mode reqtype,
+ enum page_cache_mode *newtype)
+{
+ struct rb_node *node;
+ struct memtype *match;
+ enum page_cache_mode found_type = reqtype;
+
+ match = memtype_rb_lowest_match(&memtype_rbroot, start, end);
+ if (match == NULL)
+ goto success;
+
+ if (match->type != found_type && newtype == NULL)
+ goto failure;
+
+ dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end);
+ found_type = match->type;
+
+ node = rb_next(&match->rb);
+ while (node) {
+ match = rb_entry(node, struct memtype, rb);
+
+ if (match->start >= end) /* Checked all possible matches */
+ goto success;
+
+ if (is_node_overlap(match, start, end) &&
+ match->type != found_type) {
+ goto failure;
+ }
+
+ node = rb_next(&match->rb);
+ }
+success:
+ if (newtype)
+ *newtype = found_type;
+
+ return 0;
+
+failure:
+ pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid, start, end,
+ cattr_name(found_type), cattr_name(match->type));
+ return -EBUSY;
+}
+
+static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata)
+{
+ struct rb_node **node = &(root->rb_node);
+ struct rb_node *parent = NULL;
+
+ while (*node) {
+ struct memtype *data = rb_entry(*node, struct memtype, rb);
+
+ parent = *node;
+ if (data->subtree_max_end < newdata->end)
+ data->subtree_max_end = newdata->end;
+ if (newdata->start <= data->start)
+ node = &((*node)->rb_left);
+ else if (newdata->start > data->start)
+ node = &((*node)->rb_right);
+ }
+
+ newdata->subtree_max_end = newdata->end;
+ rb_link_node(&newdata->rb, parent, node);
+ rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb);
+}
+
+int rbt_memtype_check_insert(struct memtype *new,
+ enum page_cache_mode *ret_type)
+{
+ int err = 0;
+
+ err = memtype_rb_check_conflict(&memtype_rbroot, new->start, new->end,
+ new->type, ret_type);
+
+ if (!err) {
+ if (ret_type)
+ new->type = *ret_type;
+
+ new->subtree_max_end = new->end;
+ memtype_rb_insert(&memtype_rbroot, new);
+ }
+ return err;
+}
+
+struct memtype *rbt_memtype_erase(u64 start, u64 end)
+{
+ struct memtype *data;
+
+ /*
+ * Since the memtype_rbroot tree allows overlapping ranges,
+ * rbt_memtype_erase() checks with EXACT_MATCH first, i.e. free
+ * a whole node for the munmap case. If no such entry is found,
+ * it then checks with END_MATCH, i.e. shrink the size of a node
+ * from the end for the mremap case.
+ */
+ data = memtype_rb_match(&memtype_rbroot, start, end,
+ MEMTYPE_EXACT_MATCH);
+ if (!data) {
+ data = memtype_rb_match(&memtype_rbroot, start, end,
+ MEMTYPE_END_MATCH);
+ if (!data)
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (data->start == start) {
+ /* munmap: erase this node */
+ rb_erase_augmented(&data->rb, &memtype_rbroot,
+ &memtype_rb_augment_cb);
+ } else {
+ /* mremap: update the end value of this node */
+ rb_erase_augmented(&data->rb, &memtype_rbroot,
+ &memtype_rb_augment_cb);
+ data->end = start;
+ data->subtree_max_end = data->end;
+ memtype_rb_insert(&memtype_rbroot, data);
+ return NULL;
+ }
+
+ return data;
+}
+
+struct memtype *rbt_memtype_lookup(u64 addr)
+{
+ return memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE);
+}
+
+#if defined(CONFIG_DEBUG_FS)
+int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
+{
+ struct rb_node *node;
+ int i = 1;
+
+ node = rb_first(&memtype_rbroot);
+ while (node && pos != i) {
+ node = rb_next(node);
+ i++;
+ }
+
+ if (node) { /* pos == i */
+ struct memtype *this = rb_entry(node, struct memtype, rb);
+ *out = *this;
+ return 0;
+ } else {
+ return 1;
+ }
+}
+#endif
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
new file mode 100644
index 000000000..a23586953
--- /dev/null
+++ b/arch/x86/mm/pf_in.c
@@ -0,0 +1,531 @@
+/*
+ * Fault Injection Test harness (FI)
+ * Copyright (C) Intel Crop.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+ * USA.
+ *
+ */
+
+/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp
+ * Copyright by Intel Crop., 2002
+ * Louis Zhuang (louis.zhuang@intel.com)
+ *
+ * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007
+ */
+
+#include <linux/ptrace.h> /* struct pt_regs */
+#include "pf_in.h"
+
+#ifdef __i386__
+/* IA32 Manual 3, 2-1 */
+static unsigned char prefix_codes[] = {
+ 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64,
+ 0x65, 0x66, 0x67
+};
+/* IA32 Manual 3, 3-432*/
+static unsigned int reg_rop[] = {
+ 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
+};
+static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB };
+static unsigned int imm_wop[] = { 0xC6, 0xC7 };
+/* IA32 Manual 3, 3-432*/
+static unsigned int rw8[] = { 0x88, 0x8A, 0xC6, 0xAA };
+static unsigned int rw32[] = {
+ 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB
+};
+static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F, 0xAA };
+static unsigned int mw16[] = { 0xB70F, 0xBF0F };
+static unsigned int mw32[] = { 0x89, 0x8B, 0xC7, 0xAB };
+static unsigned int mw64[] = {};
+#else /* not __i386__ */
+static unsigned char prefix_codes[] = {
+ 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36,
+ 0xF0, 0xF3, 0xF2,
+ /* REX Prefixes */
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f
+};
+/* AMD64 Manual 3, Appendix A*/
+static unsigned int reg_rop[] = {
+ 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
+};
+static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB };
+static unsigned int imm_wop[] = { 0xC6, 0xC7 };
+static unsigned int rw8[] = { 0xC6, 0x88, 0x8A, 0xAA };
+static unsigned int rw32[] = {
+ 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB
+};
+/* 8 bit only */
+static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F, 0xAA };
+/* 16 bit only */
+static unsigned int mw16[] = { 0xB70F, 0xBF0F };
+/* 16 or 32 bit */
+static unsigned int mw32[] = { 0xC7 };
+/* 16, 32 or 64 bit */
+static unsigned int mw64[] = { 0x89, 0x8B, 0xAB };
+#endif /* not __i386__ */
+
+struct prefix_bits {
+ unsigned shorted:1;
+ unsigned enlarged:1;
+ unsigned rexr:1;
+ unsigned rex:1;
+};
+
+static int skip_prefix(unsigned char *addr, struct prefix_bits *prf)
+{
+ int i;
+ unsigned char *p = addr;
+ prf->shorted = 0;
+ prf->enlarged = 0;
+ prf->rexr = 0;
+ prf->rex = 0;
+
+restart:
+ for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) {
+ if (*p == prefix_codes[i]) {
+ if (*p == 0x66)
+ prf->shorted = 1;
+#ifdef __amd64__
+ if ((*p & 0xf8) == 0x48)
+ prf->enlarged = 1;
+ if ((*p & 0xf4) == 0x44)
+ prf->rexr = 1;
+ if ((*p & 0xf0) == 0x40)
+ prf->rex = 1;
+#endif
+ p++;
+ goto restart;
+ }
+ }
+
+ return (p - addr);
+}
+
+static int get_opcode(unsigned char *addr, unsigned int *opcode)
+{
+ int len;
+
+ if (*addr == 0x0F) {
+ /* 0x0F is extension instruction */
+ *opcode = *(unsigned short *)addr;
+ len = 2;
+ } else {
+ *opcode = *addr;
+ len = 1;
+ }
+
+ return len;
+}
+
+#define CHECK_OP_TYPE(opcode, array, type) \
+ for (i = 0; i < ARRAY_SIZE(array); i++) { \
+ if (array[i] == opcode) { \
+ rv = type; \
+ goto exit; \
+ } \
+ }
+
+enum reason_type get_ins_type(unsigned long ins_addr)
+{
+ unsigned int opcode;
+ unsigned char *p;
+ struct prefix_bits prf;
+ int i;
+ enum reason_type rv = OTHERS;
+
+ p = (unsigned char *)ins_addr;
+ p += skip_prefix(p, &prf);
+ p += get_opcode(p, &opcode);
+
+ CHECK_OP_TYPE(opcode, reg_rop, REG_READ);
+ CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE);
+ CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE);
+
+exit:
+ return rv;
+}
+#undef CHECK_OP_TYPE
+
+static unsigned int get_ins_reg_width(unsigned long ins_addr)
+{
+ unsigned int opcode;
+ unsigned char *p;
+ struct prefix_bits prf;
+ int i;
+
+ p = (unsigned char *)ins_addr;
+ p += skip_prefix(p, &prf);
+ p += get_opcode(p, &opcode);
+
+ for (i = 0; i < ARRAY_SIZE(rw8); i++)
+ if (rw8[i] == opcode)
+ return 1;
+
+ for (i = 0; i < ARRAY_SIZE(rw32); i++)
+ if (rw32[i] == opcode)
+ return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
+
+ printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
+ return 0;
+}
+
+unsigned int get_ins_mem_width(unsigned long ins_addr)
+{
+ unsigned int opcode;
+ unsigned char *p;
+ struct prefix_bits prf;
+ int i;
+
+ p = (unsigned char *)ins_addr;
+ p += skip_prefix(p, &prf);
+ p += get_opcode(p, &opcode);
+
+ for (i = 0; i < ARRAY_SIZE(mw8); i++)
+ if (mw8[i] == opcode)
+ return 1;
+
+ for (i = 0; i < ARRAY_SIZE(mw16); i++)
+ if (mw16[i] == opcode)
+ return 2;
+
+ for (i = 0; i < ARRAY_SIZE(mw32); i++)
+ if (mw32[i] == opcode)
+ return prf.shorted ? 2 : 4;
+
+ for (i = 0; i < ARRAY_SIZE(mw64); i++)
+ if (mw64[i] == opcode)
+ return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
+
+ printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
+ return 0;
+}
+
+/*
+ * Define register ident in mod/rm byte.
+ * Note: these are NOT the same as in ptrace-abi.h.
+ */
+enum {
+ arg_AL = 0,
+ arg_CL = 1,
+ arg_DL = 2,
+ arg_BL = 3,
+ arg_AH = 4,
+ arg_CH = 5,
+ arg_DH = 6,
+ arg_BH = 7,
+
+ arg_AX = 0,
+ arg_CX = 1,
+ arg_DX = 2,
+ arg_BX = 3,
+ arg_SP = 4,
+ arg_BP = 5,
+ arg_SI = 6,
+ arg_DI = 7,
+#ifdef __amd64__
+ arg_R8 = 8,
+ arg_R9 = 9,
+ arg_R10 = 10,
+ arg_R11 = 11,
+ arg_R12 = 12,
+ arg_R13 = 13,
+ arg_R14 = 14,
+ arg_R15 = 15
+#endif
+};
+
+static unsigned char *get_reg_w8(int no, int rex, struct pt_regs *regs)
+{
+ unsigned char *rv = NULL;
+
+ switch (no) {
+ case arg_AL:
+ rv = (unsigned char *)&regs->ax;
+ break;
+ case arg_BL:
+ rv = (unsigned char *)&regs->bx;
+ break;
+ case arg_CL:
+ rv = (unsigned char *)&regs->cx;
+ break;
+ case arg_DL:
+ rv = (unsigned char *)&regs->dx;
+ break;
+#ifdef __amd64__
+ case arg_R8:
+ rv = (unsigned char *)&regs->r8;
+ break;
+ case arg_R9:
+ rv = (unsigned char *)&regs->r9;
+ break;
+ case arg_R10:
+ rv = (unsigned char *)&regs->r10;
+ break;
+ case arg_R11:
+ rv = (unsigned char *)&regs->r11;
+ break;
+ case arg_R12:
+ rv = (unsigned char *)&regs->r12;
+ break;
+ case arg_R13:
+ rv = (unsigned char *)&regs->r13;
+ break;
+ case arg_R14:
+ rv = (unsigned char *)&regs->r14;
+ break;
+ case arg_R15:
+ rv = (unsigned char *)&regs->r15;
+ break;
+#endif
+ default:
+ break;
+ }
+
+ if (rv)
+ return rv;
+
+ if (rex) {
+ /*
+ * If REX prefix exists, access low bytes of SI etc.
+ * instead of AH etc.
+ */
+ switch (no) {
+ case arg_SI:
+ rv = (unsigned char *)&regs->si;
+ break;
+ case arg_DI:
+ rv = (unsigned char *)&regs->di;
+ break;
+ case arg_BP:
+ rv = (unsigned char *)&regs->bp;
+ break;
+ case arg_SP:
+ rv = (unsigned char *)&regs->sp;
+ break;
+ default:
+ break;
+ }
+ } else {
+ switch (no) {
+ case arg_AH:
+ rv = 1 + (unsigned char *)&regs->ax;
+ break;
+ case arg_BH:
+ rv = 1 + (unsigned char *)&regs->bx;
+ break;
+ case arg_CH:
+ rv = 1 + (unsigned char *)&regs->cx;
+ break;
+ case arg_DH:
+ rv = 1 + (unsigned char *)&regs->dx;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!rv)
+ printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
+
+ return rv;
+}
+
+static unsigned long *get_reg_w32(int no, struct pt_regs *regs)
+{
+ unsigned long *rv = NULL;
+
+ switch (no) {
+ case arg_AX:
+ rv = &regs->ax;
+ break;
+ case arg_BX:
+ rv = &regs->bx;
+ break;
+ case arg_CX:
+ rv = &regs->cx;
+ break;
+ case arg_DX:
+ rv = &regs->dx;
+ break;
+ case arg_SP:
+ rv = &regs->sp;
+ break;
+ case arg_BP:
+ rv = &regs->bp;
+ break;
+ case arg_SI:
+ rv = &regs->si;
+ break;
+ case arg_DI:
+ rv = &regs->di;
+ break;
+#ifdef __amd64__
+ case arg_R8:
+ rv = &regs->r8;
+ break;
+ case arg_R9:
+ rv = &regs->r9;
+ break;
+ case arg_R10:
+ rv = &regs->r10;
+ break;
+ case arg_R11:
+ rv = &regs->r11;
+ break;
+ case arg_R12:
+ rv = &regs->r12;
+ break;
+ case arg_R13:
+ rv = &regs->r13;
+ break;
+ case arg_R14:
+ rv = &regs->r14;
+ break;
+ case arg_R15:
+ rv = &regs->r15;
+ break;
+#endif
+ default:
+ printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
+ }
+
+ return rv;
+}
+
+unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
+{
+ unsigned int opcode;
+ int reg;
+ unsigned char *p;
+ struct prefix_bits prf;
+ int i;
+
+ p = (unsigned char *)ins_addr;
+ p += skip_prefix(p, &prf);
+ p += get_opcode(p, &opcode);
+ for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
+ if (reg_rop[i] == opcode)
+ goto do_work;
+
+ for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
+ if (reg_wop[i] == opcode)
+ goto do_work;
+
+ printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
+ "0x%02x\n", opcode);
+ goto err;
+
+do_work:
+ /* for STOS, source register is fixed */
+ if (opcode == 0xAA || opcode == 0xAB) {
+ reg = arg_AX;
+ } else {
+ unsigned char mod_rm = *p;
+ reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3);
+ }
+ switch (get_ins_reg_width(ins_addr)) {
+ case 1:
+ return *get_reg_w8(reg, prf.rex, regs);
+
+ case 2:
+ return *(unsigned short *)get_reg_w32(reg, regs);
+
+ case 4:
+ return *(unsigned int *)get_reg_w32(reg, regs);
+
+#ifdef __amd64__
+ case 8:
+ return *(unsigned long *)get_reg_w32(reg, regs);
+#endif
+
+ default:
+ printk(KERN_ERR "mmiotrace: Error width# %d\n", reg);
+ }
+
+err:
+ return 0;
+}
+
+unsigned long get_ins_imm_val(unsigned long ins_addr)
+{
+ unsigned int opcode;
+ unsigned char mod_rm;
+ unsigned char mod;
+ unsigned char *p;
+ struct prefix_bits prf;
+ int i;
+
+ p = (unsigned char *)ins_addr;
+ p += skip_prefix(p, &prf);
+ p += get_opcode(p, &opcode);
+ for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
+ if (imm_wop[i] == opcode)
+ goto do_work;
+
+ printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
+ "0x%02x\n", opcode);
+ goto err;
+
+do_work:
+ mod_rm = *p;
+ mod = mod_rm >> 6;
+ p++;
+ switch (mod) {
+ case 0:
+ /* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2) */
+ /* AMD64: XXX Check for address size prefix? */
+ if ((mod_rm & 0x7) == 0x5)
+ p += 4;
+ break;
+
+ case 1:
+ p += 1;
+ break;
+
+ case 2:
+ p += 4;
+ break;
+
+ case 3:
+ default:
+ printk(KERN_ERR "mmiotrace: not a memory access instruction "
+ "at 0x%lx, rm_mod=0x%02x\n",
+ ins_addr, mod_rm);
+ }
+
+ switch (get_ins_reg_width(ins_addr)) {
+ case 1:
+ return *(unsigned char *)p;
+
+ case 2:
+ return *(unsigned short *)p;
+
+ case 4:
+ return *(unsigned int *)p;
+
+#ifdef __amd64__
+ case 8:
+ return *(unsigned long *)p;
+#endif
+
+ default:
+ printk(KERN_ERR "mmiotrace: Error: width.\n");
+ }
+
+err:
+ return 0;
+}
diff --git a/arch/x86/mm/pf_in.h b/arch/x86/mm/pf_in.h
new file mode 100644
index 000000000..e05341a51
--- /dev/null
+++ b/arch/x86/mm/pf_in.h
@@ -0,0 +1,39 @@
+/*
+ * Fault Injection Test harness (FI)
+ * Copyright (C) Intel Crop.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+ * USA.
+ *
+ */
+
+#ifndef __PF_H_
+#define __PF_H_
+
+enum reason_type {
+ NOT_ME, /* page fault is not in regions */
+ NOTHING, /* access others point in regions */
+ REG_READ, /* read from addr to reg */
+ REG_WRITE, /* write from reg to addr */
+ IMM_WRITE, /* write from imm to addr */
+ OTHERS /* Other instructions can not intercept */
+};
+
+enum reason_type get_ins_type(unsigned long ins_addr);
+unsigned int get_ins_mem_width(unsigned long ins_addr);
+unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs);
+unsigned long get_ins_imm_val(unsigned long ins_addr);
+
+#endif /* __PF_H_ */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
new file mode 100644
index 000000000..c0e9c0040
--- /dev/null
+++ b/arch/x86/mm/pgtable.c
@@ -0,0 +1,891 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/hugetlb.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include <asm/fixmap.h>
+#include <asm/mtrr.h>
+
+#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
+phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
+EXPORT_SYMBOL(physical_mask);
+#endif
+
+#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
+
+#ifdef CONFIG_HIGHPTE
+#define PGALLOC_USER_GFP __GFP_HIGHMEM
+#else
+#define PGALLOC_USER_GFP 0
+#endif
+
+gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
+
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+{
+ return (pte_t *)__get_free_page(PGALLOC_GFP & ~__GFP_ACCOUNT);
+}
+
+pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+ struct page *pte;
+
+ pte = alloc_pages(__userpte_alloc_gfp, 0);
+ if (!pte)
+ return NULL;
+ if (!pgtable_page_ctor(pte)) {
+ __free_page(pte);
+ return NULL;
+ }
+ return pte;
+}
+
+static int __init setup_userpte(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ /*
+ * "userpte=nohigh" disables allocation of user pagetables in
+ * high memory.
+ */
+ if (strcmp(arg, "nohigh") == 0)
+ __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
+ else
+ return -EINVAL;
+ return 0;
+}
+early_param("userpte", setup_userpte);
+
+void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
+{
+ pgtable_page_dtor(pte);
+ paravirt_release_pte(page_to_pfn(pte));
+ paravirt_tlb_remove_table(tlb, pte);
+}
+
+#if CONFIG_PGTABLE_LEVELS > 2
+void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+{
+ struct page *page = virt_to_page(pmd);
+ paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
+ /*
+ * NOTE! For PAE, any changes to the top page-directory-pointer-table
+ * entries need a full cr3 reload to flush.
+ */
+#ifdef CONFIG_X86_PAE
+ tlb->need_flush_all = 1;
+#endif
+ pgtable_pmd_page_dtor(page);
+ paravirt_tlb_remove_table(tlb, page);
+}
+
+#if CONFIG_PGTABLE_LEVELS > 3
+void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
+{
+ paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
+ paravirt_tlb_remove_table(tlb, virt_to_page(pud));
+}
+
+#if CONFIG_PGTABLE_LEVELS > 4
+void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
+{
+ paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
+ paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
+}
+#endif /* CONFIG_PGTABLE_LEVELS > 4 */
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
+
+static inline void pgd_list_add(pgd_t *pgd)
+{
+ struct page *page = virt_to_page(pgd);
+
+ list_add(&page->lru, &pgd_list);
+}
+
+static inline void pgd_list_del(pgd_t *pgd)
+{
+ struct page *page = virt_to_page(pgd);
+
+ list_del(&page->lru);
+}
+
+#define UNSHARED_PTRS_PER_PGD \
+ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
+#define MAX_UNSHARED_PTRS_PER_PGD \
+ max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)
+
+
+static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
+{
+ virt_to_page(pgd)->pt_mm = mm;
+}
+
+struct mm_struct *pgd_page_get_mm(struct page *page)
+{
+ return page->pt_mm;
+}
+
+static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
+{
+ /* If the pgd points to a shared pagetable level (either the
+ ptes in non-PAE, or shared PMD in PAE), then just copy the
+ references from swapper_pg_dir. */
+ if (CONFIG_PGTABLE_LEVELS == 2 ||
+ (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
+ CONFIG_PGTABLE_LEVELS >= 4) {
+ clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+ }
+
+ /* list required to sync kernel mapping updates */
+ if (!SHARED_KERNEL_PMD) {
+ pgd_set_mm(pgd, mm);
+ pgd_list_add(pgd);
+ }
+}
+
+static void pgd_dtor(pgd_t *pgd)
+{
+ if (SHARED_KERNEL_PMD)
+ return;
+
+ spin_lock(&pgd_lock);
+ pgd_list_del(pgd);
+ spin_unlock(&pgd_lock);
+}
+
+/*
+ * List of all pgd's needed for non-PAE so it can invalidate entries
+ * in both cached and uncached pgd's; not needed for PAE since the
+ * kernel pmd is shared. If PAE were not to share the pmd a similar
+ * tactic would be needed. This is essentially codepath-based locking
+ * against pageattr.c; it is the unique case in which a valid change
+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
+ * vmalloc faults work because attached pagetables are never freed.
+ * -- nyc
+ */
+
+#ifdef CONFIG_X86_PAE
+/*
+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+ * updating the top-level pagetable entries to guarantee the
+ * processor notices the update. Since this is expensive, and
+ * all 4 top-level entries are used almost immediately in a
+ * new process's life, we just pre-populate them here.
+ *
+ * Also, if we're in a paravirt environment where the kernel pmd is
+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
+ * and initialize the kernel pmds here.
+ */
+#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
+#define MAX_PREALLOCATED_PMDS MAX_UNSHARED_PTRS_PER_PGD
+
+/*
+ * We allocate separate PMDs for the kernel part of the user page-table
+ * when PTI is enabled. We need them to map the per-process LDT into the
+ * user-space page-table.
+ */
+#define PREALLOCATED_USER_PMDS (static_cpu_has(X86_FEATURE_PTI) ? \
+ KERNEL_PGD_PTRS : 0)
+#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS
+
+void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+{
+ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+
+ /* Note: almost everything apart from _PAGE_PRESENT is
+ reserved at the pmd (PDPT) level. */
+ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+
+ /*
+ * According to Intel App note "TLBs, Paging-Structure Caches,
+ * and Their Invalidation", April 2007, document 317080-001,
+ * section 8.1: in PAE mode we explicitly have to flush the
+ * TLB via cr3 if the top-level pgd is changed...
+ */
+ flush_tlb_mm(mm);
+}
+#else /* !CONFIG_X86_PAE */
+
+/* No need to prepopulate any pagetable entries in non-PAE modes. */
+#define PREALLOCATED_PMDS 0
+#define MAX_PREALLOCATED_PMDS 0
+#define PREALLOCATED_USER_PMDS 0
+#define MAX_PREALLOCATED_USER_PMDS 0
+#endif /* CONFIG_X86_PAE */
+
+static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
+{
+ int i;
+
+ for (i = 0; i < count; i++)
+ if (pmds[i]) {
+ pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
+ free_page((unsigned long)pmds[i]);
+ mm_dec_nr_pmds(mm);
+ }
+}
+
+static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
+{
+ int i;
+ bool failed = false;
+ gfp_t gfp = PGALLOC_GFP;
+
+ if (mm == &init_mm)
+ gfp &= ~__GFP_ACCOUNT;
+
+ for (i = 0; i < count; i++) {
+ pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
+ if (!pmd)
+ failed = true;
+ if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
+ free_page((unsigned long)pmd);
+ pmd = NULL;
+ failed = true;
+ }
+ if (pmd)
+ mm_inc_nr_pmds(mm);
+ pmds[i] = pmd;
+ }
+
+ if (failed) {
+ free_pmds(mm, pmds, count);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
+ * Mop up any pmd pages which may still be attached to the pgd.
+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
+ * preallocate which never got a corresponding vma will need to be
+ * freed manually.
+ */
+static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
+{
+ pgd_t pgd = *pgdp;
+
+ if (pgd_val(pgd) != 0) {
+ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+
+ pgd_clear(pgdp);
+
+ paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
+ pmd_free(mm, pmd);
+ mm_dec_nr_pmds(mm);
+ }
+}
+
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
+{
+ int i;
+
+ for (i = 0; i < PREALLOCATED_PMDS; i++)
+ mop_up_one_pmd(mm, &pgdp[i]);
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ pgdp = kernel_to_user_pgdp(pgdp);
+
+ for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
+ mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
+#endif
+}
+
+static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
+{
+ p4d_t *p4d;
+ pud_t *pud;
+ int i;
+
+ if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
+ return;
+
+ p4d = p4d_offset(pgd, 0);
+ pud = pud_offset(p4d, 0);
+
+ for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
+ pmd_t *pmd = pmds[i];
+
+ if (i >= KERNEL_PGD_BOUNDARY)
+ memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
+ sizeof(pmd_t) * PTRS_PER_PMD);
+
+ pud_populate(mm, pud, pmd);
+ }
+}
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
+ pgd_t *k_pgd, pmd_t *pmds[])
+{
+ pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
+ pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
+ p4d_t *u_p4d;
+ pud_t *u_pud;
+ int i;
+
+ u_p4d = p4d_offset(u_pgd, 0);
+ u_pud = pud_offset(u_p4d, 0);
+
+ s_pgd += KERNEL_PGD_BOUNDARY;
+ u_pud += KERNEL_PGD_BOUNDARY;
+
+ for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
+ pmd_t *pmd = pmds[i];
+
+ memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
+ sizeof(pmd_t) * PTRS_PER_PMD);
+
+ pud_populate(mm, u_pud, pmd);
+ }
+
+}
+#else
+static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
+ pgd_t *k_pgd, pmd_t *pmds[])
+{
+}
+#endif
+/*
+ * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also
+ * assumes that pgd should be in one page.
+ *
+ * But kernel with PAE paging that is not running as a Xen domain
+ * only needs to allocate 32 bytes for pgd instead of one page.
+ */
+#ifdef CONFIG_X86_PAE
+
+#include <linux/slab.h>
+
+#define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t))
+#define PGD_ALIGN 32
+
+static struct kmem_cache *pgd_cache;
+
+static int __init pgd_cache_init(void)
+{
+ /*
+ * When PAE kernel is running as a Xen domain, it does not use
+ * shared kernel pmd. And this requires a whole page for pgd.
+ */
+ if (!SHARED_KERNEL_PMD)
+ return 0;
+
+ /*
+ * when PAE kernel is not running as a Xen domain, it uses
+ * shared kernel pmd. Shared kernel pmd does not require a whole
+ * page for pgd. We are able to just allocate a 32-byte for pgd.
+ * During boot time, we create a 32-byte slab for pgd table allocation.
+ */
+ pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
+ SLAB_PANIC, NULL);
+ return 0;
+}
+core_initcall(pgd_cache_init);
+
+static inline pgd_t *_pgd_alloc(void)
+{
+ /*
+ * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain.
+ * We allocate one page for pgd.
+ */
+ if (!SHARED_KERNEL_PMD)
+ return (pgd_t *)__get_free_pages(PGALLOC_GFP,
+ PGD_ALLOCATION_ORDER);
+
+ /*
+ * Now PAE kernel is not running as a Xen domain. We can allocate
+ * a 32-byte slab for pgd to save memory space.
+ */
+ return kmem_cache_alloc(pgd_cache, PGALLOC_GFP);
+}
+
+static inline void _pgd_free(pgd_t *pgd)
+{
+ if (!SHARED_KERNEL_PMD)
+ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
+ else
+ kmem_cache_free(pgd_cache, pgd);
+}
+#else
+
+static inline pgd_t *_pgd_alloc(void)
+{
+ return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
+}
+
+static inline void _pgd_free(pgd_t *pgd)
+{
+ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
+}
+#endif /* CONFIG_X86_PAE */
+
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+ pgd_t *pgd;
+ pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
+ pmd_t *pmds[MAX_PREALLOCATED_PMDS];
+
+ pgd = _pgd_alloc();
+
+ if (pgd == NULL)
+ goto out;
+
+ mm->pgd = pgd;
+
+ if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
+ goto out_free_pgd;
+
+ if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
+ goto out_free_pmds;
+
+ if (paravirt_pgd_alloc(mm) != 0)
+ goto out_free_user_pmds;
+
+ /*
+ * Make sure that pre-populating the pmds is atomic with
+ * respect to anything walking the pgd_list, so that they
+ * never see a partially populated pgd.
+ */
+ spin_lock(&pgd_lock);
+
+ pgd_ctor(mm, pgd);
+ pgd_prepopulate_pmd(mm, pgd, pmds);
+ pgd_prepopulate_user_pmd(mm, pgd, u_pmds);
+
+ spin_unlock(&pgd_lock);
+
+ return pgd;
+
+out_free_user_pmds:
+ free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
+out_free_pmds:
+ free_pmds(mm, pmds, PREALLOCATED_PMDS);
+out_free_pgd:
+ _pgd_free(pgd);
+out:
+ return NULL;
+}
+
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+ pgd_mop_up_pmds(mm, pgd);
+ pgd_dtor(pgd);
+ paravirt_pgd_free(mm, pgd);
+ _pgd_free(pgd);
+}
+
+/*
+ * Used to set accessed or dirty bits in the page table entries
+ * on other architectures. On x86, the accessed and dirty bits
+ * are tracked by hardware. However, do_wp_page calls this function
+ * to also make the pte writeable at the same time the dirty bit is
+ * set. In that case we do actually need to write the PTE.
+ */
+int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ int changed = !pte_same(*ptep, entry);
+
+ if (changed && dirty)
+ set_pte(ptep, entry);
+
+ return changed;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty)
+{
+ int changed = !pmd_same(*pmdp, entry);
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ if (changed && dirty) {
+ set_pmd(pmdp, entry);
+ /*
+ * We had a write-protection fault here and changed the pmd
+ * to to more permissive. No need to flush the TLB for that,
+ * #PF is architecturally guaranteed to do that and in the
+ * worst-case we'll generate a spurious fault.
+ */
+ }
+
+ return changed;
+}
+
+int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+ pud_t *pudp, pud_t entry, int dirty)
+{
+ int changed = !pud_same(*pudp, entry);
+
+ VM_BUG_ON(address & ~HPAGE_PUD_MASK);
+
+ if (changed && dirty) {
+ set_pud(pudp, entry);
+ /*
+ * We had a write-protection fault here and changed the pud
+ * to to more permissive. No need to flush the TLB for that,
+ * #PF is architecturally guaranteed to do that and in the
+ * worst-case we'll generate a spurious fault.
+ */
+ }
+
+ return changed;
+}
+#endif
+
+int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ int ret = 0;
+
+ if (pte_young(*ptep))
+ ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+ (unsigned long *) &ptep->pte);
+
+ return ret;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ int ret = 0;
+
+ if (pmd_young(*pmdp))
+ ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+ (unsigned long *)pmdp);
+
+ return ret;
+}
+int pudp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pud_t *pudp)
+{
+ int ret = 0;
+
+ if (pud_young(*pudp))
+ ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+ (unsigned long *)pudp);
+
+ return ret;
+}
+#endif
+
+int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ /*
+ * On x86 CPUs, clearing the accessed bit without a TLB flush
+ * doesn't cause data corruption. [ It could cause incorrect
+ * page aging and the (mistaken) reclaim of hot pages, but the
+ * chance of that should be relatively low. ]
+ *
+ * So as a performance optimization don't flush the TLB when
+ * clearing the accessed bit, it will eventually be flushed by
+ * a context switch or a VM operation anyway. [ In the rare
+ * event of it not getting flushed for a long time the delay
+ * shouldn't really matter because there's no real memory
+ * pressure for swapout to react to. ]
+ */
+ return ptep_test_and_clear_young(vma, address, ptep);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ int young;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ young = pmdp_test_and_clear_young(vma, address, pmdp);
+ if (young)
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+ return young;
+}
+#endif
+
+/**
+ * reserve_top_address - reserves a hole in the top of kernel address space
+ * @reserve - size of hole to reserve
+ *
+ * Can be used to relocate the fixmap area and poke a hole in the top
+ * of kernel address space to make room for a hypervisor.
+ */
+void __init reserve_top_address(unsigned long reserve)
+{
+#ifdef CONFIG_X86_32
+ BUG_ON(fixmaps_set > 0);
+ __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
+ printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
+ -reserve, __FIXADDR_TOP + PAGE_SIZE);
+#endif
+}
+
+int fixmaps_set;
+
+void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
+{
+ unsigned long address = __fix_to_virt(idx);
+
+#ifdef CONFIG_X86_64
+ /*
+ * Ensure that the static initial page tables are covering the
+ * fixmap completely.
+ */
+ BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
+ (FIXMAP_PMD_NUM * PTRS_PER_PTE));
+#endif
+
+ if (idx >= __end_of_fixed_addresses) {
+ BUG();
+ return;
+ }
+ set_pte_vaddr(address, pte);
+ fixmaps_set++;
+}
+
+void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
+ phys_addr_t phys, pgprot_t flags)
+{
+ /* Sanitize 'prot' against any unsupported bits: */
+ pgprot_val(flags) &= __default_kernel_pte_mask;
+
+ __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
+}
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+#ifdef CONFIG_X86_5LEVEL
+/**
+ * p4d_set_huge - setup kernel P4D mapping
+ *
+ * No 512GB pages yet -- always return 0
+ */
+int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
+{
+ return 0;
+}
+
+/**
+ * p4d_clear_huge - clear kernel P4D mapping when it is set
+ *
+ * No 512GB pages yet -- always return 0
+ */
+int p4d_clear_huge(p4d_t *p4d)
+{
+ return 0;
+}
+#endif
+
+/**
+ * pud_set_huge - setup kernel PUD mapping
+ *
+ * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
+ * function sets up a huge page only if any of the following conditions are met:
+ *
+ * - MTRRs are disabled, or
+ *
+ * - MTRRs are enabled and the range is completely covered by a single MTRR, or
+ *
+ * - MTRRs are enabled and the corresponding MTRR memory type is WB, which
+ * has no effect on the requested PAT memory type.
+ *
+ * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
+ * page mapping attempt fails.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
+int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
+{
+ u8 mtrr, uniform;
+
+ mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
+ if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
+ (mtrr != MTRR_TYPE_WRBACK))
+ return 0;
+
+ /* Bail out if we are we on a populated non-leaf entry: */
+ if (pud_present(*pud) && !pud_huge(*pud))
+ return 0;
+
+ prot = pgprot_4k_2_large(prot);
+
+ set_pte((pte_t *)pud, pfn_pte(
+ (u64)addr >> PAGE_SHIFT,
+ __pgprot(pgprot_val(prot) | _PAGE_PSE)));
+
+ return 1;
+}
+
+/**
+ * pmd_set_huge - setup kernel PMD mapping
+ *
+ * See text over pud_set_huge() above.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
+int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
+{
+ u8 mtrr, uniform;
+
+ mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
+ if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
+ (mtrr != MTRR_TYPE_WRBACK)) {
+ pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
+ __func__, addr, addr + PMD_SIZE);
+ return 0;
+ }
+
+ /* Bail out if we are we on a populated non-leaf entry: */
+ if (pmd_present(*pmd) && !pmd_huge(*pmd))
+ return 0;
+
+ prot = pgprot_4k_2_large(prot);
+
+ set_pte((pte_t *)pmd, pfn_pte(
+ (u64)addr >> PAGE_SHIFT,
+ __pgprot(pgprot_val(prot) | _PAGE_PSE)));
+
+ return 1;
+}
+
+/**
+ * pud_clear_huge - clear kernel PUD mapping when it is set
+ *
+ * Returns 1 on success and 0 on failure (no PUD map is found).
+ */
+int pud_clear_huge(pud_t *pud)
+{
+ if (pud_large(*pud)) {
+ pud_clear(pud);
+ return 1;
+ }
+
+ return 0;
+}
+
+/**
+ * pmd_clear_huge - clear kernel PMD mapping when it is set
+ *
+ * Returns 1 on success and 0 on failure (no PMD map is found).
+ */
+int pmd_clear_huge(pmd_t *pmd)
+{
+ if (pmd_large(*pmd)) {
+ pmd_clear(pmd);
+ return 1;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_X86_64
+/**
+ * pud_free_pmd_page - Clear pud entry and free pmd page.
+ * @pud: Pointer to a PUD.
+ * @addr: Virtual address associated with pud.
+ *
+ * Context: The pud range has been unmapped and TLB purged.
+ * Return: 1 if clearing the entry succeeded. 0 otherwise.
+ *
+ * NOTE: Callers must allow a single page allocation.
+ */
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
+{
+ pmd_t *pmd, *pmd_sv;
+ pte_t *pte;
+ int i;
+
+ if (pud_none(*pud))
+ return 1;
+
+ pmd = (pmd_t *)pud_page_vaddr(*pud);
+ pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
+ if (!pmd_sv)
+ return 0;
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd_sv[i] = pmd[i];
+ if (!pmd_none(pmd[i]))
+ pmd_clear(&pmd[i]);
+ }
+
+ pud_clear(pud);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (!pmd_none(pmd_sv[i])) {
+ pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
+ free_page((unsigned long)pte);
+ }
+ }
+
+ free_page((unsigned long)pmd_sv);
+
+ pgtable_pmd_page_dtor(virt_to_page(pmd));
+ free_page((unsigned long)pmd);
+
+ return 1;
+}
+
+/**
+ * pmd_free_pte_page - Clear pmd entry and free pte page.
+ * @pmd: Pointer to a PMD.
+ * @addr: Virtual address associated with pmd.
+ *
+ * Context: The pmd range has been unmapped and TLB purged.
+ * Return: 1 if clearing the entry succeeded. 0 otherwise.
+ */
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
+{
+ pte_t *pte;
+
+ if (pmd_none(*pmd))
+ return 1;
+
+ pte = (pte_t *)pmd_page_vaddr(*pmd);
+ pmd_clear(pmd);
+
+ /* INVLPG to clear all paging-structure caches */
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
+ free_page((unsigned long)pte);
+
+ return 1;
+}
+
+#else /* !CONFIG_X86_64 */
+
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
+{
+ return pud_none(*pud);
+}
+
+/*
+ * Disable free page handling on x86-PAE. This assures that ioremap()
+ * does not update sync'd pmd entries. See vmalloc_sync_one().
+ */
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
+{
+ return pmd_none(*pmd);
+}
+
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
new file mode 100644
index 000000000..9bb7f0ab9
--- /dev/null
+++ b/arch/x86/mm/pgtable_32.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/nmi.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/fixmap.h>
+#include <asm/e820/api.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+
+unsigned int __VMALLOC_RESERVE = 128 << 20;
+
+/*
+ * Associate a virtual page frame with a given physical page frame
+ * and protection flags for that frame.
+ */
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = swapper_pg_dir + pgd_index(vaddr);
+ if (pgd_none(*pgd)) {
+ BUG();
+ return;
+ }
+ p4d = p4d_offset(pgd, vaddr);
+ if (p4d_none(*p4d)) {
+ BUG();
+ return;
+ }
+ pud = pud_offset(p4d, vaddr);
+ if (pud_none(*pud)) {
+ BUG();
+ return;
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_none(*pmd)) {
+ BUG();
+ return;
+ }
+ pte = pte_offset_kernel(pmd, vaddr);
+ if (!pte_none(pteval))
+ set_pte_at(&init_mm, vaddr, pte, pteval);
+ else
+ pte_clear(&init_mm, vaddr, pte);
+
+ /*
+ * It's enough to flush this one mapping.
+ * (PGE mappings get flushed as well)
+ */
+ __flush_tlb_one_kernel(vaddr);
+}
+
+unsigned long __FIXADDR_TOP = 0xfffff000;
+EXPORT_SYMBOL(__FIXADDR_TOP);
+
+/*
+ * vmalloc=size forces the vmalloc area to be exactly 'size'
+ * bytes. This can be used to increase (or decrease) the
+ * vmalloc area - the default is 128m.
+ */
+static int __init parse_vmalloc(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/
+ __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET;
+ return 0;
+}
+early_param("vmalloc", parse_vmalloc);
+
+/*
+ * reservetop=size reserves a hole at the top of the kernel address space which
+ * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
+ * so relocating the fixmap can be done before paging initialization.
+ */
+static int __init parse_reservetop(char *arg)
+{
+ unsigned long address;
+
+ if (!arg)
+ return -EINVAL;
+
+ address = memparse(arg, &arg);
+ reserve_top_address(address);
+ early_ioremap_init();
+ return 0;
+}
+early_param("reservetop", parse_reservetop);
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
new file mode 100644
index 000000000..7f9acb683
--- /dev/null
+++ b/arch/x86/mm/physaddr.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bootmem.h>
+#include <linux/mmdebug.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+
+#include <asm/page.h>
+
+#include "physaddr.h"
+
+#ifdef CONFIG_X86_64
+
+#ifdef CONFIG_DEBUG_VIRTUAL
+unsigned long __phys_addr(unsigned long x)
+{
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* use the carry flag to determine if x was < __START_KERNEL_map */
+ if (unlikely(x > y)) {
+ x = y + phys_base;
+
+ VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
+ } else {
+ x = y + (__START_KERNEL_map - PAGE_OFFSET);
+
+ /* carry flag will be set if starting x was >= PAGE_OFFSET */
+ VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x));
+ }
+
+ return x;
+}
+EXPORT_SYMBOL(__phys_addr);
+
+unsigned long __phys_addr_symbol(unsigned long x)
+{
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* only check upper bounds since lower bounds will trigger carry */
+ VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
+
+ return y + phys_base;
+}
+EXPORT_SYMBOL(__phys_addr_symbol);
+#endif
+
+bool __virt_addr_valid(unsigned long x)
+{
+ unsigned long y = x - __START_KERNEL_map;
+
+ /* use the carry flag to determine if x was < __START_KERNEL_map */
+ if (unlikely(x > y)) {
+ x = y + phys_base;
+
+ if (y >= KERNEL_IMAGE_SIZE)
+ return false;
+ } else {
+ x = y + (__START_KERNEL_map - PAGE_OFFSET);
+
+ /* carry flag will be set if starting x was >= PAGE_OFFSET */
+ if ((x > y) || !phys_addr_valid(x))
+ return false;
+ }
+
+ return pfn_valid(x >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(__virt_addr_valid);
+
+#else
+
+#ifdef CONFIG_DEBUG_VIRTUAL
+unsigned long __phys_addr(unsigned long x)
+{
+ unsigned long phys_addr = x - PAGE_OFFSET;
+ /* VMALLOC_* aren't constants */
+ VIRTUAL_BUG_ON(x < PAGE_OFFSET);
+ VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
+ /* max_low_pfn is set early, but not _that_ early */
+ if (max_low_pfn) {
+ VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn);
+ BUG_ON(slow_virt_to_phys((void *)x) != phys_addr);
+ }
+ return phys_addr;
+}
+EXPORT_SYMBOL(__phys_addr);
+#endif
+
+bool __virt_addr_valid(unsigned long x)
+{
+ if (x < PAGE_OFFSET)
+ return false;
+ if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
+ return false;
+ if (x >= FIXADDR_START)
+ return false;
+ return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(__virt_addr_valid);
+
+#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/mm/physaddr.h b/arch/x86/mm/physaddr.h
new file mode 100644
index 000000000..9f6419caf
--- /dev/null
+++ b/arch/x86/mm/physaddr.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <asm/processor.h>
+
+static inline int phys_addr_valid(resource_size_t addr)
+{
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ return !(addr >> boot_cpu_data.x86_phys_bits);
+#else
+ return 1;
+#endif
+}
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
new file mode 100644
index 000000000..6e98e0a7c
--- /dev/null
+++ b/arch/x86/mm/pkeys.c
@@ -0,0 +1,226 @@
+/*
+ * Intel Memory Protection Keys management
+ * Copyright (c) 2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/debugfs.h> /* debugfs_create_u32() */
+#include <linux/mm_types.h> /* mm_struct, vma, etc... */
+#include <linux/pkeys.h> /* PKEY_* */
+#include <uapi/asm-generic/mman-common.h>
+
+#include <asm/cpufeature.h> /* boot_cpu_has, ... */
+#include <asm/mmu_context.h> /* vma_pkey() */
+
+int __execute_only_pkey(struct mm_struct *mm)
+{
+ bool need_to_set_mm_pkey = false;
+ int execute_only_pkey = mm->context.execute_only_pkey;
+ int ret;
+
+ /* Do we need to assign a pkey for mm's execute-only maps? */
+ if (execute_only_pkey == -1) {
+ /* Go allocate one to use, which might fail */
+ execute_only_pkey = mm_pkey_alloc(mm);
+ if (execute_only_pkey < 0)
+ return -1;
+ need_to_set_mm_pkey = true;
+ }
+
+ /*
+ * We do not want to go through the relatively costly
+ * dance to set PKRU if we do not need to. Check it
+ * first and assume that if the execute-only pkey is
+ * write-disabled that we do not have to set it
+ * ourselves. We need preempt off so that nobody
+ * can make fpregs inactive.
+ */
+ preempt_disable();
+ if (!need_to_set_mm_pkey &&
+ current->thread.fpu.initialized &&
+ !__pkru_allows_read(read_pkru(), execute_only_pkey)) {
+ preempt_enable();
+ return execute_only_pkey;
+ }
+ preempt_enable();
+
+ /*
+ * Set up PKRU so that it denies access for everything
+ * other than execution.
+ */
+ ret = arch_set_user_pkey_access(current, execute_only_pkey,
+ PKEY_DISABLE_ACCESS);
+ /*
+ * If the PKRU-set operation failed somehow, just return
+ * 0 and effectively disable execute-only support.
+ */
+ if (ret) {
+ mm_set_pkey_free(mm, execute_only_pkey);
+ return -1;
+ }
+
+ /* We got one, store it and use it from here on out */
+ if (need_to_set_mm_pkey)
+ mm->context.execute_only_pkey = execute_only_pkey;
+ return execute_only_pkey;
+}
+
+static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
+{
+ /* Do this check first since the vm_flags should be hot */
+ if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
+ return false;
+ if (vma_pkey(vma) != vma->vm_mm->context.execute_only_pkey)
+ return false;
+
+ return true;
+}
+
+/*
+ * This is only called for *plain* mprotect calls.
+ */
+int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey)
+{
+ /*
+ * Is this an mprotect_pkey() call? If so, never
+ * override the value that came from the user.
+ */
+ if (pkey != -1)
+ return pkey;
+
+ /*
+ * The mapping is execute-only. Go try to get the
+ * execute-only protection key. If we fail to do that,
+ * fall through as if we do not have execute-only
+ * support in this mm.
+ */
+ if (prot == PROT_EXEC) {
+ pkey = execute_only_pkey(vma->vm_mm);
+ if (pkey > 0)
+ return pkey;
+ } else if (vma_is_pkey_exec_only(vma)) {
+ /*
+ * Protections are *not* PROT_EXEC, but the mapping
+ * is using the exec-only pkey. This mapping was
+ * PROT_EXEC and will no longer be. Move back to
+ * the default pkey.
+ */
+ return ARCH_DEFAULT_PKEY;
+ }
+
+ /*
+ * This is a vanilla, non-pkey mprotect (or we failed to
+ * setup execute-only), inherit the pkey from the VMA we
+ * are working on.
+ */
+ return vma_pkey(vma);
+}
+
+#define PKRU_AD_KEY(pkey) (PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))
+
+/*
+ * Make the default PKRU value (at execve() time) as restrictive
+ * as possible. This ensures that any threads clone()'d early
+ * in the process's lifetime will not accidentally get access
+ * to data which is pkey-protected later on.
+ */
+u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) |
+ PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) |
+ PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) |
+ PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) |
+ PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15);
+
+/*
+ * Called from the FPU code when creating a fresh set of FPU
+ * registers. This is called from a very specific context where
+ * we know the FPU regstiers are safe for use and we can use PKRU
+ * directly.
+ */
+void copy_init_pkru_to_fpregs(void)
+{
+ u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value);
+ /*
+ * Any write to PKRU takes it out of the XSAVE 'init
+ * state' which increases context switch cost. Avoid
+ * writing 0 when PKRU was already 0.
+ */
+ if (!init_pkru_value_snapshot && !read_pkru())
+ return;
+ /*
+ * Override the PKRU state that came from 'init_fpstate'
+ * with the baseline from the process.
+ */
+ write_pkru(init_pkru_value_snapshot);
+}
+
+static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buf[32];
+ unsigned int len;
+
+ len = sprintf(buf, "0x%x\n", init_pkru_value);
+ return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t init_pkru_write_file(struct file *file,
+ const char __user *user_buf, size_t count, loff_t *ppos)
+{
+ char buf[32];
+ ssize_t len;
+ u32 new_init_pkru;
+
+ len = min(count, sizeof(buf) - 1);
+ if (copy_from_user(buf, user_buf, len))
+ return -EFAULT;
+
+ /* Make the buffer a valid string that we can not overrun */
+ buf[len] = '\0';
+ if (kstrtouint(buf, 0, &new_init_pkru))
+ return -EINVAL;
+
+ /*
+ * Don't allow insane settings that will blow the system
+ * up immediately if someone attempts to disable access
+ * or writes to pkey 0.
+ */
+ if (new_init_pkru & (PKRU_AD_BIT|PKRU_WD_BIT))
+ return -EINVAL;
+
+ WRITE_ONCE(init_pkru_value, new_init_pkru);
+ return count;
+}
+
+static const struct file_operations fops_init_pkru = {
+ .read = init_pkru_read_file,
+ .write = init_pkru_write_file,
+ .llseek = default_llseek,
+};
+
+static int __init create_init_pkru_value(void)
+{
+ debugfs_create_file("init_pkru", S_IRUSR | S_IWUSR,
+ arch_debugfs_dir, NULL, &fops_init_pkru);
+ return 0;
+}
+late_initcall(create_init_pkru_value);
+
+static __init int setup_init_pkru(char *opt)
+{
+ u32 new_init_pkru;
+
+ if (kstrtouint(opt, 0, &new_init_pkru))
+ return 1;
+
+ WRITE_ONCE(init_pkru_value, new_init_pkru);
+
+ return 1;
+}
+__setup("init_pkru=", setup_init_pkru);
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
new file mode 100644
index 000000000..622d5968c
--- /dev/null
+++ b/arch/x86/mm/pti.c
@@ -0,0 +1,658 @@
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * This code is based in part on work published here:
+ *
+ * https://github.com/IAIK/KAISER
+ *
+ * The original work was written by and and signed off by for the Linux
+ * kernel by:
+ *
+ * Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
+ * Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
+ * Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
+ * Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
+ *
+ * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
+ * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
+ * Andy Lutomirsky <luto@amacapital.net>
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/cpu.h>
+
+#include <asm/cpufeature.h>
+#include <asm/hypervisor.h>
+#include <asm/vsyscall.h>
+#include <asm/cmdline.h>
+#include <asm/pti.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+#include <asm/sections.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
+
+/* Backporting helper */
+#ifndef __GFP_NOTRACK
+#define __GFP_NOTRACK 0
+#endif
+
+/*
+ * Define the page-table levels we clone for user-space on 32
+ * and 64 bit.
+ */
+#ifdef CONFIG_X86_64
+#define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PMD
+#else
+#define PTI_LEVEL_KERNEL_IMAGE PTI_CLONE_PTE
+#endif
+
+static void __init pti_print_if_insecure(const char *reason)
+{
+ if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+ pr_info("%s\n", reason);
+}
+
+static void __init pti_print_if_secure(const char *reason)
+{
+ if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+ pr_info("%s\n", reason);
+}
+
+enum pti_mode {
+ PTI_AUTO = 0,
+ PTI_FORCE_OFF,
+ PTI_FORCE_ON
+} pti_mode;
+
+void __init pti_check_boottime_disable(void)
+{
+ char arg[5];
+ int ret;
+
+ /* Assume mode is auto unless overridden. */
+ pti_mode = PTI_AUTO;
+
+ if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
+ pti_mode = PTI_FORCE_OFF;
+ pti_print_if_insecure("disabled on XEN PV.");
+ return;
+ }
+
+ ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
+ if (ret > 0) {
+ if (ret == 3 && !strncmp(arg, "off", 3)) {
+ pti_mode = PTI_FORCE_OFF;
+ pti_print_if_insecure("disabled on command line.");
+ return;
+ }
+ if (ret == 2 && !strncmp(arg, "on", 2)) {
+ pti_mode = PTI_FORCE_ON;
+ pti_print_if_secure("force enabled on command line.");
+ goto enable;
+ }
+ if (ret == 4 && !strncmp(arg, "auto", 4)) {
+ pti_mode = PTI_AUTO;
+ goto autosel;
+ }
+ }
+
+ if (cmdline_find_option_bool(boot_command_line, "nopti") ||
+ cpu_mitigations_off()) {
+ pti_mode = PTI_FORCE_OFF;
+ pti_print_if_insecure("disabled on command line.");
+ return;
+ }
+
+autosel:
+ if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+ return;
+enable:
+ setup_force_cpu_cap(X86_FEATURE_PTI);
+}
+
+pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
+{
+ /*
+ * Changes to the high (kernel) portion of the kernelmode page
+ * tables are not automatically propagated to the usermode tables.
+ *
+ * Users should keep in mind that, unlike the kernelmode tables,
+ * there is no vmalloc_fault equivalent for the usermode tables.
+ * Top-level entries added to init_mm's usermode pgd after boot
+ * will not be automatically propagated to other mms.
+ */
+ if (!pgdp_maps_userspace(pgdp))
+ return pgd;
+
+ /*
+ * The user page tables get the full PGD, accessible from
+ * userspace:
+ */
+ kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;
+
+ /*
+ * If this is normal user memory, make it NX in the kernel
+ * pagetables so that, if we somehow screw up and return to
+ * usermode with the kernel CR3 loaded, we'll get a page fault
+ * instead of allowing user code to execute with the wrong CR3.
+ *
+ * As exceptions, we don't set NX if:
+ * - _PAGE_USER is not set. This could be an executable
+ * EFI runtime mapping or something similar, and the kernel
+ * may execute from it
+ * - we don't have NX support
+ * - we're clearing the PGD (i.e. the new pgd is not present).
+ */
+ if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
+ (__supported_pte_mask & _PAGE_NX))
+ pgd.pgd |= _PAGE_NX;
+
+ /* return the copy of the PGD we want the kernel to use: */
+ return pgd;
+}
+
+/*
+ * Walk the user copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.
+ *
+ * Returns a pointer to a P4D on success, or NULL on failure.
+ */
+static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
+{
+ pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+ if (address < PAGE_OFFSET) {
+ WARN_ONCE(1, "attempt to walk user address\n");
+ return NULL;
+ }
+
+ if (pgd_none(*pgd)) {
+ unsigned long new_p4d_page = __get_free_page(gfp);
+ if (WARN_ON_ONCE(!new_p4d_page))
+ return NULL;
+
+ set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
+ }
+ BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+ return p4d_offset(pgd, address);
+}
+
+/*
+ * Walk the user copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.
+ *
+ * Returns a pointer to a PMD on success, or NULL on failure.
+ */
+static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
+{
+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+ p4d_t *p4d;
+ pud_t *pud;
+
+ p4d = pti_user_pagetable_walk_p4d(address);
+ if (!p4d)
+ return NULL;
+
+ BUILD_BUG_ON(p4d_large(*p4d) != 0);
+ if (p4d_none(*p4d)) {
+ unsigned long new_pud_page = __get_free_page(gfp);
+ if (WARN_ON_ONCE(!new_pud_page))
+ return NULL;
+
+ set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
+ }
+
+ pud = pud_offset(p4d, address);
+ /* The user page tables do not use large mappings: */
+ if (pud_large(*pud)) {
+ WARN_ON(1);
+ return NULL;
+ }
+ if (pud_none(*pud)) {
+ unsigned long new_pmd_page = __get_free_page(gfp);
+ if (WARN_ON_ONCE(!new_pmd_page))
+ return NULL;
+
+ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+ }
+
+ return pmd_offset(pud, address);
+}
+
+/*
+ * Walk the shadow copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down. Does not support large pages.
+ *
+ * Note: this is only used when mapping *new* kernel data into the
+ * user/shadow page tables. It is never used for userspace data.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+{
+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pmd = pti_user_pagetable_walk_pmd(address);
+ if (!pmd)
+ return NULL;
+
+ /* We can't do anything sensible if we hit a large mapping. */
+ if (pmd_large(*pmd)) {
+ WARN_ON(1);
+ return NULL;
+ }
+
+ if (pmd_none(*pmd)) {
+ unsigned long new_pte_page = __get_free_page(gfp);
+ if (!new_pte_page)
+ return NULL;
+
+ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+ }
+
+ pte = pte_offset_kernel(pmd, address);
+ if (pte_flags(*pte) & _PAGE_USER) {
+ WARN_ONCE(1, "attempt to walk to user pte\n");
+ return NULL;
+ }
+ return pte;
+}
+
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+static void __init pti_setup_vsyscall(void)
+{
+ pte_t *pte, *target_pte;
+ unsigned int level;
+
+ pte = lookup_address(VSYSCALL_ADDR, &level);
+ if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
+ return;
+
+ target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
+ if (WARN_ON(!target_pte))
+ return;
+
+ *target_pte = *pte;
+ set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
+}
+#else
+static void __init pti_setup_vsyscall(void) { }
+#endif
+
+enum pti_clone_level {
+ PTI_CLONE_PMD,
+ PTI_CLONE_PTE,
+};
+
+static void
+pti_clone_pgtable(unsigned long start, unsigned long end,
+ enum pti_clone_level level)
+{
+ unsigned long addr;
+
+ /*
+ * Clone the populated PMDs which cover start to end. These PMD areas
+ * can have holes.
+ */
+ for (addr = start; addr < end;) {
+ pte_t *pte, *target_pte;
+ pmd_t *pmd, *target_pmd;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+
+ /* Overflow check */
+ if (addr < start)
+ break;
+
+ pgd = pgd_offset_k(addr);
+ if (WARN_ON(pgd_none(*pgd)))
+ return;
+ p4d = p4d_offset(pgd, addr);
+ if (WARN_ON(p4d_none(*p4d)))
+ return;
+
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud)) {
+ WARN_ON_ONCE(addr & ~PUD_MASK);
+ addr = round_up(addr + 1, PUD_SIZE);
+ continue;
+ }
+
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd)) {
+ WARN_ON_ONCE(addr & ~PMD_MASK);
+ addr = round_up(addr + 1, PMD_SIZE);
+ continue;
+ }
+
+ if (pmd_large(*pmd) || level == PTI_CLONE_PMD) {
+ target_pmd = pti_user_pagetable_walk_pmd(addr);
+ if (WARN_ON(!target_pmd))
+ return;
+
+ /*
+ * Only clone present PMDs. This ensures only setting
+ * _PAGE_GLOBAL on present PMDs. This should only be
+ * called on well-known addresses anyway, so a non-
+ * present PMD would be a surprise.
+ */
+ if (WARN_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)))
+ return;
+
+ /*
+ * Setting 'target_pmd' below creates a mapping in both
+ * the user and kernel page tables. It is effectively
+ * global, so set it as global in both copies. Note:
+ * the X86_FEATURE_PGE check is not _required_ because
+ * the CPU ignores _PAGE_GLOBAL when PGE is not
+ * supported. The check keeps consistentency with
+ * code that only set this bit when supported.
+ */
+ if (boot_cpu_has(X86_FEATURE_PGE))
+ *pmd = pmd_set_flags(*pmd, _PAGE_GLOBAL);
+
+ /*
+ * Copy the PMD. That is, the kernelmode and usermode
+ * tables will share the last-level page tables of this
+ * address range
+ */
+ *target_pmd = *pmd;
+
+ addr += PMD_SIZE;
+
+ } else if (level == PTI_CLONE_PTE) {
+
+ /* Walk the page-table down to the pte level */
+ pte = pte_offset_kernel(pmd, addr);
+ if (pte_none(*pte)) {
+ addr += PAGE_SIZE;
+ continue;
+ }
+
+ /* Only clone present PTEs */
+ if (WARN_ON(!(pte_flags(*pte) & _PAGE_PRESENT)))
+ return;
+
+ /* Allocate PTE in the user page-table */
+ target_pte = pti_user_pagetable_walk_pte(addr);
+ if (WARN_ON(!target_pte))
+ return;
+
+ /* Set GLOBAL bit in both PTEs */
+ if (boot_cpu_has(X86_FEATURE_PGE))
+ *pte = pte_set_flags(*pte, _PAGE_GLOBAL);
+
+ /* Clone the PTE */
+ *target_pte = *pte;
+
+ addr += PAGE_SIZE;
+
+ } else {
+ BUG();
+ }
+ }
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
+ * next-level entry on 5-level systems.
+ */
+static void __init pti_clone_p4d(unsigned long addr)
+{
+ p4d_t *kernel_p4d, *user_p4d;
+ pgd_t *kernel_pgd;
+
+ user_p4d = pti_user_pagetable_walk_p4d(addr);
+ if (!user_p4d)
+ return;
+
+ kernel_pgd = pgd_offset_k(addr);
+ kernel_p4d = p4d_offset(kernel_pgd, addr);
+ *user_p4d = *kernel_p4d;
+}
+
+/*
+ * Clone the CPU_ENTRY_AREA into the user space visible page table.
+ */
+static void __init pti_clone_user_shared(void)
+{
+ pti_clone_p4d(CPU_ENTRY_AREA_BASE);
+}
+
+#else /* CONFIG_X86_64 */
+
+/*
+ * On 32 bit PAE systems with 1GB of Kernel address space there is only
+ * one pgd/p4d for the whole kernel. Cloning that would map the whole
+ * address space into the user page-tables, making PTI useless. So clone
+ * the page-table on the PMD level to prevent that.
+ */
+static void __init pti_clone_user_shared(void)
+{
+ unsigned long start, end;
+
+ start = CPU_ENTRY_AREA_BASE;
+ end = start + (PAGE_SIZE * CPU_ENTRY_AREA_PAGES);
+
+ pti_clone_pgtable(start, end, PTI_CLONE_PMD);
+}
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Clone the ESPFIX P4D into the user space visible page table
+ */
+static void __init pti_setup_espfix64(void)
+{
+#ifdef CONFIG_X86_ESPFIX64
+ pti_clone_p4d(ESPFIX_BASE_ADDR);
+#endif
+}
+
+/*
+ * Clone the populated PMDs of the entry and irqentry text and force it RO.
+ */
+static void pti_clone_entry_text(void)
+{
+ pti_clone_pgtable((unsigned long) __entry_text_start,
+ (unsigned long) __irqentry_text_end,
+ PTI_CLONE_PMD);
+}
+
+/*
+ * Global pages and PCIDs are both ways to make kernel TLB entries
+ * live longer, reduce TLB misses and improve kernel performance.
+ * But, leaving all kernel text Global makes it potentially accessible
+ * to Meltdown-style attacks which make it trivial to find gadgets or
+ * defeat KASLR.
+ *
+ * Only use global pages when it is really worth it.
+ */
+static inline bool pti_kernel_image_global_ok(void)
+{
+ /*
+ * Systems with PCIDs get litlle benefit from global
+ * kernel text and are not worth the downsides.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_PCID))
+ return false;
+
+ /*
+ * Only do global kernel image for pti=auto. Do the most
+ * secure thing (not global) if pti=on specified.
+ */
+ if (pti_mode != PTI_AUTO)
+ return false;
+
+ /*
+ * K8 may not tolerate the cleared _PAGE_RW on the userspace
+ * global kernel image pages. Do the safe thing (disable
+ * global kernel image). This is unlikely to ever be
+ * noticed because PTI is disabled by default on AMD CPUs.
+ */
+ if (boot_cpu_has(X86_FEATURE_K8))
+ return false;
+
+ /*
+ * RANDSTRUCT derives its hardening benefits from the
+ * attacker's lack of knowledge about the layout of kernel
+ * data structures. Keep the kernel image non-global in
+ * cases where RANDSTRUCT is in use to help keep the layout a
+ * secret.
+ */
+ if (IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT))
+ return false;
+
+ return true;
+}
+
+/*
+ * This is the only user for these and it is not arch-generic
+ * like the other set_memory.h functions. Just extern them.
+ */
+extern int set_memory_nonglobal(unsigned long addr, int numpages);
+extern int set_memory_global(unsigned long addr, int numpages);
+
+/*
+ * For some configurations, map all of kernel text into the user page
+ * tables. This reduces TLB misses, especially on non-PCID systems.
+ */
+static void pti_clone_kernel_text(void)
+{
+ /*
+ * rodata is part of the kernel image and is normally
+ * readable on the filesystem or on the web. But, do not
+ * clone the areas past rodata, they might contain secrets.
+ */
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long end_clone = (unsigned long)__end_rodata_aligned;
+ unsigned long end_global = PFN_ALIGN((unsigned long)__stop___ex_table);
+
+ if (!pti_kernel_image_global_ok())
+ return;
+
+ pr_debug("mapping partial kernel image into user address space\n");
+
+ /*
+ * Note that this will undo _some_ of the work that
+ * pti_set_kernel_image_nonglobal() did to clear the
+ * global bit.
+ */
+ pti_clone_pgtable(start, end_clone, PTI_LEVEL_KERNEL_IMAGE);
+
+ /*
+ * pti_clone_pgtable() will set the global bit in any PMDs
+ * that it clones, but we also need to get any PTEs in
+ * the last level for areas that are not huge-page-aligned.
+ */
+
+ /* Set the global bit for normal non-__init kernel text: */
+ set_memory_global(start, (end_global - start) >> PAGE_SHIFT);
+}
+
+void pti_set_kernel_image_nonglobal(void)
+{
+ /*
+ * The identity map is created with PMDs, regardless of the
+ * actual length of the kernel. We need to clear
+ * _PAGE_GLOBAL up to a PMD boundary, not just to the end
+ * of the image.
+ */
+ unsigned long start = PFN_ALIGN(_text);
+ unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE);
+
+ /*
+ * This clears _PAGE_GLOBAL from the entire kernel image.
+ * pti_clone_kernel_text() map put _PAGE_GLOBAL back for
+ * areas that are mapped to userspace.
+ */
+ set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT);
+}
+
+/*
+ * Initialize kernel page table isolation
+ */
+void __init pti_init(void)
+{
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ pr_info("enabled\n");
+
+#ifdef CONFIG_X86_32
+ /*
+ * We check for X86_FEATURE_PCID here. But the init-code will
+ * clear the feature flag on 32 bit because the feature is not
+ * supported on 32 bit anyway. To print the warning we need to
+ * check with cpuid directly again.
+ */
+ if (cpuid_ecx(0x1) & BIT(17)) {
+ /* Use printk to work around pr_fmt() */
+ printk(KERN_WARNING "\n");
+ printk(KERN_WARNING "************************************************************\n");
+ printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n");
+ printk(KERN_WARNING "** **\n");
+ printk(KERN_WARNING "** You are using 32-bit PTI on a 64-bit PCID-capable CPU. **\n");
+ printk(KERN_WARNING "** Your performance will increase dramatically if you **\n");
+ printk(KERN_WARNING "** switch to a 64-bit kernel! **\n");
+ printk(KERN_WARNING "** **\n");
+ printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n");
+ printk(KERN_WARNING "************************************************************\n");
+ }
+#endif
+
+ pti_clone_user_shared();
+
+ /* Undo all global bits from the init pagetables in head_64.S: */
+ pti_set_kernel_image_nonglobal();
+ /* Replace some of the global bits just for shared entry text: */
+ pti_clone_entry_text();
+ pti_setup_espfix64();
+ pti_setup_vsyscall();
+}
+
+/*
+ * Finalize the kernel mappings in the userspace page-table. Some of the
+ * mappings for the kernel image might have changed since pti_init()
+ * cloned them. This is because parts of the kernel image have been
+ * mapped RO and/or NX. These changes need to be cloned again to the
+ * userspace page-table.
+ */
+void pti_finalize(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_PTI))
+ return;
+ /*
+ * We need to clone everything (again) that maps parts of the
+ * kernel image.
+ */
+ pti_clone_entry_text();
+ pti_clone_kernel_text();
+
+ debug_checkwx_user();
+}
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
new file mode 100644
index 000000000..adb3c5784
--- /dev/null
+++ b/arch/x86/mm/setup_nx.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+
+#include <asm/pgtable.h>
+#include <asm/proto.h>
+#include <asm/cpufeature.h>
+
+static int disable_nx;
+
+/*
+ * noexec = on|off
+ *
+ * Control non-executable mappings for processes.
+ *
+ * on Enable
+ * off Disable
+ */
+static int __init noexec_setup(char *str)
+{
+ if (!str)
+ return -EINVAL;
+ if (!strncmp(str, "on", 2)) {
+ disable_nx = 0;
+ } else if (!strncmp(str, "off", 3)) {
+ disable_nx = 1;
+ }
+ x86_configure_nx();
+ return 0;
+}
+early_param("noexec", noexec_setup);
+
+void x86_configure_nx(void)
+{
+ if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx)
+ __supported_pte_mask |= _PAGE_NX;
+ else
+ __supported_pte_mask &= ~_PAGE_NX;
+}
+
+void __init x86_report_nx(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_NX)) {
+ printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
+ "missing in CPU!\n");
+ } else {
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+ if (disable_nx) {
+ printk(KERN_INFO "NX (Execute Disable) protection: "
+ "disabled by kernel command line option\n");
+ } else {
+ printk(KERN_INFO "NX (Execute Disable) protection: "
+ "active\n");
+ }
+#else
+ /* 32bit non-PAE kernel, NX cannot be used */
+ printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
+ "cannot be enabled: non-PAE kernel!\n");
+#endif
+ }
+}
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
new file mode 100644
index 000000000..dac07e4f5
--- /dev/null
+++ b/arch/x86/mm/srat.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ACPI 3.0 based NUMA setup
+ * Copyright 2004 Andi Kleen, SuSE Labs.
+ *
+ * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
+ *
+ * Called from acpi_numa_init while reading the SRAT and SLIT tables.
+ * Assumes all memory regions belonging to a single proximity domain
+ * are in one chunk. Holes between them will be included in the node.
+ */
+
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/mmzone.h>
+#include <linux/bitmap.h>
+#include <linux/init.h>
+#include <linux/topology.h>
+#include <linux/mm.h>
+#include <asm/proto.h>
+#include <asm/numa.h>
+#include <asm/e820/api.h>
+#include <asm/apic.h>
+#include <asm/uv/uv.h>
+
+/* Callback for Proximity Domain -> x2APIC mapping */
+void __init
+acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
+{
+ int pxm, node;
+ int apic_id;
+
+ if (srat_disabled())
+ return;
+ if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
+ bad_srat();
+ return;
+ }
+ if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
+ return;
+ pxm = pa->proximity_domain;
+ apic_id = pa->apic_id;
+ if (!apic->apic_id_valid(apic_id)) {
+ printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n",
+ pxm, apic_id);
+ return;
+ }
+ node = acpi_map_pxm_to_node(pxm);
+ if (node < 0) {
+ printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
+ bad_srat();
+ return;
+ }
+
+ if (apic_id >= MAX_LOCAL_APIC) {
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+ return;
+ }
+ set_apicid_to_node(apic_id, node);
+ node_set(node, numa_nodes_parsed);
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
+ pxm, apic_id, node);
+}
+
+/* Callback for Proximity Domain -> LAPIC mapping */
+void __init
+acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
+{
+ int pxm, node;
+ int apic_id;
+
+ if (srat_disabled())
+ return;
+ if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
+ bad_srat();
+ return;
+ }
+ if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
+ return;
+ pxm = pa->proximity_domain_lo;
+ if (acpi_srat_revision >= 2)
+ pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8;
+ node = acpi_map_pxm_to_node(pxm);
+ if (node < 0) {
+ printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
+ bad_srat();
+ return;
+ }
+
+ if (get_uv_system_type() >= UV_X2APIC)
+ apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
+ else
+ apic_id = pa->apic_id;
+
+ if (apic_id >= MAX_LOCAL_APIC) {
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+ return;
+ }
+
+ set_apicid_to_node(apic_id, node);
+ node_set(node, numa_nodes_parsed);
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
+ pxm, apic_id, node);
+}
+
+int __init x86_acpi_numa_init(void)
+{
+ int ret;
+
+ ret = acpi_numa_init();
+ if (ret < 0)
+ return ret;
+ return srat_disabled() ? -EINVAL : 0;
+}
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
new file mode 100644
index 000000000..f6ae6830b
--- /dev/null
+++ b/arch/x86/mm/testmmiotrace.c
@@ -0,0 +1,140 @@
+/*
+ * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/io.h>
+#include <linux/mmiotrace.h>
+
+static unsigned long mmio_address;
+module_param_hw(mmio_address, ulong, iomem, 0);
+MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
+ "(or 8 MB if read_far is non-zero).");
+
+static unsigned long read_far = 0x400100;
+module_param(read_far, ulong, 0);
+MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB "
+ "(default: 0x400100).");
+
+static unsigned v16(unsigned i)
+{
+ return i * 12 + 7;
+}
+
+static unsigned v32(unsigned i)
+{
+ return i * 212371 + 13;
+}
+
+static void do_write_test(void __iomem *p)
+{
+ unsigned int i;
+ pr_info("write test.\n");
+ mmiotrace_printk("Write test.\n");
+
+ for (i = 0; i < 256; i++)
+ iowrite8(i, p + i);
+
+ for (i = 1024; i < (5 * 1024); i += 2)
+ iowrite16(v16(i), p + i);
+
+ for (i = (5 * 1024); i < (16 * 1024); i += 4)
+ iowrite32(v32(i), p + i);
+}
+
+static void do_read_test(void __iomem *p)
+{
+ unsigned int i;
+ unsigned errs[3] = { 0 };
+ pr_info("read test.\n");
+ mmiotrace_printk("Read test.\n");
+
+ for (i = 0; i < 256; i++)
+ if (ioread8(p + i) != i)
+ ++errs[0];
+
+ for (i = 1024; i < (5 * 1024); i += 2)
+ if (ioread16(p + i) != v16(i))
+ ++errs[1];
+
+ for (i = (5 * 1024); i < (16 * 1024); i += 4)
+ if (ioread32(p + i) != v32(i))
+ ++errs[2];
+
+ mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n",
+ errs[0], errs[1], errs[2]);
+}
+
+static void do_read_far_test(void __iomem *p)
+{
+ pr_info("read far test.\n");
+ mmiotrace_printk("Read far test.\n");
+
+ ioread32(p + read_far);
+}
+
+static void do_test(unsigned long size)
+{
+ void __iomem *p = ioremap_nocache(mmio_address, size);
+ if (!p) {
+ pr_err("could not ioremap, aborting.\n");
+ return;
+ }
+ mmiotrace_printk("ioremap returned %p.\n", p);
+ do_write_test(p);
+ do_read_test(p);
+ if (read_far && read_far < size - 4)
+ do_read_far_test(p);
+ iounmap(p);
+}
+
+/*
+ * Tests how mmiotrace behaves in face of multiple ioremap / iounmaps in
+ * a short time. We had a bug in deferred freeing procedure which tried
+ * to free this region multiple times (ioremap can reuse the same address
+ * for many mappings).
+ */
+static void do_test_bulk_ioremapping(void)
+{
+ void __iomem *p;
+ int i;
+
+ for (i = 0; i < 10; ++i) {
+ p = ioremap_nocache(mmio_address, PAGE_SIZE);
+ if (p)
+ iounmap(p);
+ }
+
+ /* Force freeing. If it will crash we will know why. */
+ synchronize_rcu();
+}
+
+static int __init init(void)
+{
+ unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
+
+ if (mmio_address == 0) {
+ pr_err("you have to use the module argument mmio_address.\n");
+ pr_err("DO NOT LOAD THIS MODULE UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!\n");
+ return -ENXIO;
+ }
+
+ pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, "
+ "and writing 16 kB of rubbish in there.\n",
+ size >> 10, mmio_address);
+ do_test(size);
+ do_test_bulk_ioremapping();
+ pr_info("All done.\n");
+ return 0;
+}
+
+static void __exit cleanup(void)
+{
+ pr_debug("unloaded.\n");
+}
+
+module_init(init);
+module_exit(cleanup);
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
new file mode 100644
index 000000000..2f41a34c8
--- /dev/null
+++ b/arch/x86/mm/tlb.c
@@ -0,0 +1,840 @@
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/export.h>
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
+
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/nospec-branch.h>
+#include <asm/cache.h>
+#include <asm/apic.h>
+#include <asm/uv/uv.h>
+
+/*
+ * TLB flushing, formerly SMP-only
+ * c/o Linus Torvalds.
+ *
+ * These mean you can really definitely utterly forget about
+ * writing to user space from interrupts. (Its not allowed anyway).
+ *
+ * Optimizations Manfred Spraul <manfred@colorfullife.com>
+ *
+ * More scalable flush, from Andi Kleen
+ *
+ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
+ */
+
+/*
+ * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
+ * stored in cpu_tlb_state.last_user_mm_ibpb.
+ */
+#define LAST_USER_MM_IBPB 0x1UL
+
+/*
+ * We get here when we do something requiring a TLB invalidation
+ * but could not go invalidate all of the contexts. We do the
+ * necessary invalidation by clearing out the 'ctx_id' which
+ * forces a TLB flush when the context is loaded.
+ */
+static void clear_asid_other(void)
+{
+ u16 asid;
+
+ /*
+ * This is only expected to be set if we have disabled
+ * kernel _PAGE_GLOBAL pages.
+ */
+ if (!static_cpu_has(X86_FEATURE_PTI)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+ /* Do not need to flush the current asid */
+ if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
+ continue;
+ /*
+ * Make sure the next time we go to switch to
+ * this asid, we do a flush:
+ */
+ this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
+ }
+ this_cpu_write(cpu_tlbstate.invalidate_other, false);
+}
+
+atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
+
+
+static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
+ u16 *new_asid, bool *need_flush)
+{
+ u16 asid;
+
+ if (!static_cpu_has(X86_FEATURE_PCID)) {
+ *new_asid = 0;
+ *need_flush = true;
+ return;
+ }
+
+ if (this_cpu_read(cpu_tlbstate.invalidate_other))
+ clear_asid_other();
+
+ for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+ if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
+ next->context.ctx_id)
+ continue;
+
+ *new_asid = asid;
+ *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
+ next_tlb_gen);
+ return;
+ }
+
+ /*
+ * We don't currently own an ASID slot on this CPU.
+ * Allocate a slot.
+ */
+ *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
+ if (*new_asid >= TLB_NR_DYN_ASIDS) {
+ *new_asid = 0;
+ this_cpu_write(cpu_tlbstate.next_asid, 1);
+ }
+ *need_flush = true;
+}
+
+static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
+{
+ unsigned long new_mm_cr3;
+
+ if (need_flush) {
+ invalidate_user_asid(new_asid);
+ new_mm_cr3 = build_cr3(pgdir, new_asid);
+ } else {
+ new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
+ }
+
+ /*
+ * Caution: many callers of this function expect
+ * that load_cr3() is serializing and orders TLB
+ * fills with respect to the mm_cpumask writes.
+ */
+ write_cr3(new_mm_cr3);
+}
+
+void leave_mm(int cpu)
+{
+ struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+
+ /*
+ * It's plausible that we're in lazy TLB mode while our mm is init_mm.
+ * If so, our callers still expect us to flush the TLB, but there
+ * aren't any user TLB entries in init_mm to worry about.
+ *
+ * This needs to happen before any other sanity checks due to
+ * intel_idle's shenanigans.
+ */
+ if (loaded_mm == &init_mm)
+ return;
+
+ /* Warn if we're not lazy. */
+ WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
+
+ switch_mm(NULL, &init_mm, NULL);
+}
+EXPORT_SYMBOL_GPL(leave_mm);
+
+void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ switch_mm_irqs_off(prev, next, tsk);
+ local_irq_restore(flags);
+}
+
+static void sync_current_stack_to_mm(struct mm_struct *mm)
+{
+ unsigned long sp = current_stack_pointer;
+ pgd_t *pgd = pgd_offset(mm, sp);
+
+ if (pgtable_l5_enabled()) {
+ if (unlikely(pgd_none(*pgd))) {
+ pgd_t *pgd_ref = pgd_offset_k(sp);
+
+ set_pgd(pgd, *pgd_ref);
+ }
+ } else {
+ /*
+ * "pgd" is faked. The top level entries are "p4d"s, so sync
+ * the p4d. This compiles to approximately the same code as
+ * the 5-level case.
+ */
+ p4d_t *p4d = p4d_offset(pgd, sp);
+
+ if (unlikely(p4d_none(*p4d))) {
+ pgd_t *pgd_ref = pgd_offset_k(sp);
+ p4d_t *p4d_ref = p4d_offset(pgd_ref, sp);
+
+ set_p4d(p4d, *p4d_ref);
+ }
+ }
+}
+
+static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
+{
+ unsigned long next_tif = task_thread_info(next)->flags;
+ unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
+
+ return (unsigned long)next->mm | ibpb;
+}
+
+static void cond_ibpb(struct task_struct *next)
+{
+ if (!next || !next->mm)
+ return;
+
+ /*
+ * Both, the conditional and the always IBPB mode use the mm
+ * pointer to avoid the IBPB when switching between tasks of the
+ * same process. Using the mm pointer instead of mm->context.ctx_id
+ * opens a hypothetical hole vs. mm_struct reuse, which is more or
+ * less impossible to control by an attacker. Aside of that it
+ * would only affect the first schedule so the theoretically
+ * exposed data is not really interesting.
+ */
+ if (static_branch_likely(&switch_mm_cond_ibpb)) {
+ unsigned long prev_mm, next_mm;
+
+ /*
+ * This is a bit more complex than the always mode because
+ * it has to handle two cases:
+ *
+ * 1) Switch from a user space task (potential attacker)
+ * which has TIF_SPEC_IB set to a user space task
+ * (potential victim) which has TIF_SPEC_IB not set.
+ *
+ * 2) Switch from a user space task (potential attacker)
+ * which has TIF_SPEC_IB not set to a user space task
+ * (potential victim) which has TIF_SPEC_IB set.
+ *
+ * This could be done by unconditionally issuing IBPB when
+ * a task which has TIF_SPEC_IB set is either scheduled in
+ * or out. Though that results in two flushes when:
+ *
+ * - the same user space task is scheduled out and later
+ * scheduled in again and only a kernel thread ran in
+ * between.
+ *
+ * - a user space task belonging to the same process is
+ * scheduled in after a kernel thread ran in between
+ *
+ * - a user space task belonging to the same process is
+ * scheduled in immediately.
+ *
+ * Optimize this with reasonably small overhead for the
+ * above cases. Mangle the TIF_SPEC_IB bit into the mm
+ * pointer of the incoming task which is stored in
+ * cpu_tlbstate.last_user_mm_ibpb for comparison.
+ */
+ next_mm = mm_mangle_tif_spec_ib(next);
+ prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
+
+ /*
+ * Issue IBPB only if the mm's are different and one or
+ * both have the IBPB bit set.
+ */
+ if (next_mm != prev_mm &&
+ (next_mm | prev_mm) & LAST_USER_MM_IBPB)
+ indirect_branch_prediction_barrier();
+
+ this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
+ }
+
+ if (static_branch_unlikely(&switch_mm_always_ibpb)) {
+ /*
+ * Only flush when switching to a user space task with a
+ * different context than the user space task which ran
+ * last on this CPU.
+ */
+ if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
+ indirect_branch_prediction_barrier();
+ this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
+ }
+ }
+}
+
+void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+ u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ unsigned cpu = smp_processor_id();
+ u64 next_tlb_gen;
+
+ /*
+ * NB: The scheduler will call us with prev == next when switching
+ * from lazy TLB mode to normal mode if active_mm isn't changing.
+ * When this happens, we don't assume that CR3 (and hence
+ * cpu_tlbstate.loaded_mm) matches next.
+ *
+ * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
+ */
+
+ /* We don't want flush_tlb_func_* to run concurrently with us. */
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+ WARN_ON_ONCE(!irqs_disabled());
+
+ /*
+ * Verify that CR3 is what we think it is. This will catch
+ * hypothetical buggy code that directly switches to swapper_pg_dir
+ * without going through leave_mm() / switch_mm_irqs_off() or that
+ * does something like write_cr3(read_cr3_pa()).
+ *
+ * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
+ * isn't free.
+ */
+#ifdef CONFIG_DEBUG_VM
+ if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
+ /*
+ * If we were to BUG here, we'd be very likely to kill
+ * the system so hard that we don't see the call trace.
+ * Try to recover instead by ignoring the error and doing
+ * a global flush to minimize the chance of corruption.
+ *
+ * (This is far from being a fully correct recovery.
+ * Architecturally, the CPU could prefetch something
+ * back into an incorrect ASID slot and leave it there
+ * to cause trouble down the road. It's better than
+ * nothing, though.)
+ */
+ __flush_tlb_all();
+ }
+#endif
+ this_cpu_write(cpu_tlbstate.is_lazy, false);
+
+ /*
+ * The membarrier system call requires a full memory barrier and
+ * core serialization before returning to user-space, after
+ * storing to rq->curr, when changing mm. This is because
+ * membarrier() sends IPIs to all CPUs that are in the target mm
+ * to make them issue memory barriers. However, if another CPU
+ * switches to/from the target mm concurrently with
+ * membarrier(), it can cause that CPU not to receive an IPI
+ * when it really should issue a memory barrier. Writing to CR3
+ * provides that full memory barrier and core serializing
+ * instruction.
+ */
+ if (real_prev == next) {
+ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
+ next->context.ctx_id);
+
+ /*
+ * We don't currently support having a real mm loaded without
+ * our cpu set in mm_cpumask(). We have all the bookkeeping
+ * in place to figure out whether we would need to flush
+ * if our cpu were cleared in mm_cpumask(), but we don't
+ * currently use it.
+ */
+ if (WARN_ON_ONCE(real_prev != &init_mm &&
+ !cpumask_test_cpu(cpu, mm_cpumask(next))))
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+
+ return;
+ } else {
+ u16 new_asid;
+ bool need_flush;
+
+ /*
+ * Avoid user/user BTB poisoning by flushing the branch
+ * predictor when switching between processes. This stops
+ * one process from doing Spectre-v2 attacks on another.
+ */
+ cond_ibpb(tsk);
+
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ /*
+ * If our current stack is in vmalloc space and isn't
+ * mapped in the new pgd, we'll double-fault. Forcibly
+ * map it.
+ */
+ sync_current_stack_to_mm(next);
+ }
+
+ /*
+ * Stop remote flushes for the previous mm.
+ * Skip kernel threads; we never send init_mm TLB flushing IPIs,
+ * but the bitmap manipulation can cause cache line contention.
+ */
+ if (real_prev != &init_mm) {
+ VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
+ mm_cpumask(real_prev)));
+ cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+ }
+
+ /*
+ * Start remote flushes and then read tlb_gen.
+ */
+ if (next != &init_mm)
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+ next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+
+ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+
+ /* Let nmi_uaccess_okay() know that we're changing CR3. */
+ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
+ barrier();
+
+ if (need_flush) {
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+ load_new_mm_cr3(next->pgd, new_asid, true);
+
+ /*
+ * NB: This gets called via leave_mm() in the idle path
+ * where RCU functions differently. Tracing normally
+ * uses RCU, so we need to use the _rcuidle variant.
+ *
+ * (There is no good reason for this. The idle code should
+ * be rearranged to call this before rcu_idle_enter().)
+ */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ } else {
+ /* The new ASID is already up to date. */
+ load_new_mm_cr3(next->pgd, new_asid, false);
+
+ /* See above wrt _rcuidle. */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
+ }
+
+ /* Make sure we write CR3 before loaded_mm. */
+ barrier();
+
+ this_cpu_write(cpu_tlbstate.loaded_mm, next);
+ this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+ }
+
+ load_mm_cr4(next);
+ switch_ldt(real_prev, next);
+}
+
+/*
+ * Please ignore the name of this function. It should be called
+ * switch_to_kernel_thread().
+ *
+ * enter_lazy_tlb() is a hint from the scheduler that we are entering a
+ * kernel thread or other context without an mm. Acceptable implementations
+ * include doing nothing whatsoever, switching to init_mm, or various clever
+ * lazy tricks to try to minimize TLB flushes.
+ *
+ * The scheduler reserves the right to call enter_lazy_tlb() several times
+ * in a row. It will notify us that we're going back to a real mm by
+ * calling switch_mm_irqs_off().
+ */
+void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
+ return;
+
+ if (tlb_defer_switch_to_init_mm()) {
+ /*
+ * There's a significant optimization that may be possible
+ * here. We have accurate enough TLB flush tracking that we
+ * don't need to maintain coherence of TLB per se when we're
+ * lazy. We do, however, need to maintain coherence of
+ * paging-structure caches. We could, in principle, leave our
+ * old mm loaded and only switch to init_mm when
+ * tlb_remove_page() happens.
+ */
+ this_cpu_write(cpu_tlbstate.is_lazy, true);
+ } else {
+ switch_mm(NULL, &init_mm, NULL);
+ }
+}
+
+/*
+ * Call this when reinitializing a CPU. It fixes the following potential
+ * problems:
+ *
+ * - The ASID changed from what cpu_tlbstate thinks it is (most likely
+ * because the CPU was taken down and came back up with CR3's PCID
+ * bits clear. CPU hotplug can do this.
+ *
+ * - The TLB contains junk in slots corresponding to inactive ASIDs.
+ *
+ * - The CPU went so far out to lunch that it may have missed a TLB
+ * flush.
+ */
+void initialize_tlbstate_and_flush(void)
+{
+ int i;
+ struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+ u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
+ unsigned long cr3 = __read_cr3();
+
+ /* Assert that CR3 already references the right mm. */
+ WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
+
+ /*
+ * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
+ * doesn't work like other CR4 bits because it can only be set from
+ * long mode.)
+ */
+ WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
+ !(cr4_read_shadow() & X86_CR4_PCIDE));
+
+ /* Force ASID 0 and force a TLB flush. */
+ write_cr3(build_cr3(mm->pgd, 0));
+
+ /* Reinitialize tlbstate. */
+ this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB);
+ this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
+ this_cpu_write(cpu_tlbstate.next_asid, 1);
+ this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
+
+ for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
+ this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
+}
+
+/*
+ * flush_tlb_func_common()'s memory ordering requirement is that any
+ * TLB fills that happen after we flush the TLB are ordered after we
+ * read active_mm's tlb_gen. We don't need any explicit barriers
+ * because all x86 flush operations are serializing and the
+ * atomic64_read operation won't be reordered by the compiler.
+ */
+static void flush_tlb_func_common(const struct flush_tlb_info *f,
+ bool local, enum tlb_flush_reason reason)
+{
+ /*
+ * We have three different tlb_gen values in here. They are:
+ *
+ * - mm_tlb_gen: the latest generation.
+ * - local_tlb_gen: the generation that this CPU has already caught
+ * up to.
+ * - f->new_tlb_gen: the generation that the requester of the flush
+ * wants us to catch up to.
+ */
+ struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+ u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
+ u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+
+ /* This code cannot presently handle being reentered. */
+ VM_WARN_ON(!irqs_disabled());
+
+ if (unlikely(loaded_mm == &init_mm))
+ return;
+
+ VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
+ loaded_mm->context.ctx_id);
+
+ if (this_cpu_read(cpu_tlbstate.is_lazy)) {
+ /*
+ * We're in lazy mode. We need to at least flush our
+ * paging-structure cache to avoid speculatively reading
+ * garbage into our TLB. Since switching to init_mm is barely
+ * slower than a minimal flush, just switch to init_mm.
+ */
+ switch_mm_irqs_off(NULL, &init_mm, NULL);
+ return;
+ }
+
+ if (unlikely(local_tlb_gen == mm_tlb_gen)) {
+ /*
+ * There's nothing to do: we're already up to date. This can
+ * happen if two concurrent flushes happen -- the first flush to
+ * be handled can catch us all the way up, leaving no work for
+ * the second flush.
+ */
+ trace_tlb_flush(reason, 0);
+ return;
+ }
+
+ WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
+ WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
+
+ /*
+ * If we get to this point, we know that our TLB is out of date.
+ * This does not strictly imply that we need to flush (it's
+ * possible that f->new_tlb_gen <= local_tlb_gen), but we're
+ * going to need to flush in the very near future, so we might
+ * as well get it over with.
+ *
+ * The only question is whether to do a full or partial flush.
+ *
+ * We do a partial flush if requested and two extra conditions
+ * are met:
+ *
+ * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
+ * we've always done all needed flushes to catch up to
+ * local_tlb_gen. If, for example, local_tlb_gen == 2 and
+ * f->new_tlb_gen == 3, then we know that the flush needed to bring
+ * us up to date for tlb_gen 3 is the partial flush we're
+ * processing.
+ *
+ * As an example of why this check is needed, suppose that there
+ * are two concurrent flushes. The first is a full flush that
+ * changes context.tlb_gen from 1 to 2. The second is a partial
+ * flush that changes context.tlb_gen from 2 to 3. If they get
+ * processed on this CPU in reverse order, we'll see
+ * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
+ * If we were to use __flush_tlb_one_user() and set local_tlb_gen to
+ * 3, we'd be break the invariant: we'd update local_tlb_gen above
+ * 1 without the full flush that's needed for tlb_gen 2.
+ *
+ * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation.
+ * Partial TLB flushes are not all that much cheaper than full TLB
+ * flushes, so it seems unlikely that it would be a performance win
+ * to do a partial flush if that won't bring our TLB fully up to
+ * date. By doing a full flush instead, we can increase
+ * local_tlb_gen all the way to mm_tlb_gen and we can probably
+ * avoid another flush in the very near future.
+ */
+ if (f->end != TLB_FLUSH_ALL &&
+ f->new_tlb_gen == local_tlb_gen + 1 &&
+ f->new_tlb_gen == mm_tlb_gen) {
+ /* Partial flush */
+ unsigned long addr;
+ unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+
+ addr = f->start;
+ while (addr < f->end) {
+ __flush_tlb_one_user(addr);
+ addr += PAGE_SIZE;
+ }
+ if (local)
+ count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
+ trace_tlb_flush(reason, nr_pages);
+ } else {
+ /* Full flush. */
+ local_flush_tlb();
+ if (local)
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ trace_tlb_flush(reason, TLB_FLUSH_ALL);
+ }
+
+ /* Both paths above update our state to mm_tlb_gen. */
+ this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
+}
+
+static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
+{
+ const struct flush_tlb_info *f = info;
+
+ flush_tlb_func_common(f, true, reason);
+}
+
+static void flush_tlb_func_remote(void *info)
+{
+ const struct flush_tlb_info *f = info;
+
+ inc_irq_stat(irq_tlb_count);
+
+ if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
+ return;
+
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
+}
+
+void native_flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
+{
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+ if (info->end == TLB_FLUSH_ALL)
+ trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
+ else
+ trace_tlb_flush(TLB_REMOTE_SEND_IPI,
+ (info->end - info->start) >> PAGE_SHIFT);
+
+ if (is_uv_system()) {
+ /*
+ * This whole special case is confused. UV has a "Broadcast
+ * Assist Unit", which seems to be a fancy way to send IPIs.
+ * Back when x86 used an explicit TLB flush IPI, UV was
+ * optimized to use its own mechanism. These days, x86 uses
+ * smp_call_function_many(), but UV still uses a manual IPI,
+ * and that IPI's action is out of date -- it does a manual
+ * flush instead of calling flush_tlb_func_remote(). This
+ * means that the percpu tlb_gen variables won't be updated
+ * and we'll do pointless flushes on future context switches.
+ *
+ * Rather than hooking native_flush_tlb_others() here, I think
+ * that UV should be updated so that smp_call_function_many(),
+ * etc, are optimal on UV.
+ */
+ cpumask = uv_flush_tlb_others(cpumask, info);
+ if (cpumask)
+ smp_call_function_many(cpumask, flush_tlb_func_remote,
+ (void *)info, 1);
+ return;
+ }
+ smp_call_function_many(cpumask, flush_tlb_func_remote,
+ (void *)info, 1);
+}
+
+/*
+ * See Documentation/x86/tlb.txt for details. We choose 33
+ * because it is large enough to cover the vast majority (at
+ * least 95%) of allocations, and is small enough that we are
+ * confident it will not cause too much overhead. Each single
+ * flush is about 100 ns, so this caps the maximum overhead at
+ * _about_ 3,000 ns.
+ *
+ * This is in units of pages.
+ */
+static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+
+void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, unsigned long vmflag)
+{
+ int cpu;
+
+ struct flush_tlb_info info = {
+ .mm = mm,
+ };
+
+ cpu = get_cpu();
+
+ /* This is also a barrier that synchronizes with switch_mm(). */
+ info.new_tlb_gen = inc_mm_tlb_gen(mm);
+
+ /* Should we flush just the requested range? */
+ if ((end != TLB_FLUSH_ALL) &&
+ !(vmflag & VM_HUGETLB) &&
+ ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
+ info.start = start;
+ info.end = end;
+ } else {
+ info.start = 0UL;
+ info.end = TLB_FLUSH_ALL;
+ }
+
+ if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
+ VM_WARN_ON(irqs_disabled());
+ local_irq_disable();
+ flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
+ local_irq_enable();
+ }
+
+ if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
+ flush_tlb_others(mm_cpumask(mm), &info);
+
+ put_cpu();
+}
+
+
+static void do_flush_tlb_all(void *info)
+{
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+ __flush_tlb_all();
+}
+
+void flush_tlb_all(void)
+{
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
+}
+
+static void do_kernel_range_flush(void *info)
+{
+ struct flush_tlb_info *f = info;
+ unsigned long addr;
+
+ /* flush range by one by one 'invlpg' */
+ for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
+ __flush_tlb_one_kernel(addr);
+}
+
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+
+ /* Balance as user space task's flush, a bit conservative */
+ if (end == TLB_FLUSH_ALL ||
+ (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
+ on_each_cpu(do_flush_tlb_all, NULL, 1);
+ } else {
+ struct flush_tlb_info info;
+ info.start = start;
+ info.end = end;
+ on_each_cpu(do_kernel_range_flush, &info, 1);
+ }
+}
+
+void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
+{
+ struct flush_tlb_info info = {
+ .mm = NULL,
+ .start = 0UL,
+ .end = TLB_FLUSH_ALL,
+ };
+
+ int cpu = get_cpu();
+
+ if (cpumask_test_cpu(cpu, &batch->cpumask)) {
+ VM_WARN_ON(irqs_disabled());
+ local_irq_disable();
+ flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
+ local_irq_enable();
+ }
+
+ if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
+ flush_tlb_others(&batch->cpumask, &info);
+
+ cpumask_clear(&batch->cpumask);
+
+ put_cpu();
+}
+
+static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buf[32];
+ unsigned int len;
+
+ len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
+ return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t tlbflush_write_file(struct file *file,
+ const char __user *user_buf, size_t count, loff_t *ppos)
+{
+ char buf[32];
+ ssize_t len;
+ int ceiling;
+
+ len = min(count, sizeof(buf) - 1);
+ if (copy_from_user(buf, user_buf, len))
+ return -EFAULT;
+
+ buf[len] = '\0';
+ if (kstrtoint(buf, 0, &ceiling))
+ return -EINVAL;
+
+ if (ceiling < 0)
+ return -EINVAL;
+
+ tlb_single_page_flush_ceiling = ceiling;
+ return count;
+}
+
+static const struct file_operations fops_tlbflush = {
+ .read = tlbflush_read_file,
+ .write = tlbflush_write_file,
+ .llseek = default_llseek,
+};
+
+static int __init create_tlb_single_page_flush_ceiling(void)
+{
+ debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
+ arch_debugfs_dir, NULL, &fops_tlbflush);
+ return 0;
+}
+late_initcall(create_tlb_single_page_flush_ceiling);
diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
new file mode 100644
index 000000000..59e123da5
--- /dev/null
+++ b/arch/x86/net/Makefile
@@ -0,0 +1,9 @@
+#
+# Arch-specific network modules
+#
+
+ifeq ($(CONFIG_X86_32),y)
+ obj-$(CONFIG_BPF_JIT) += bpf_jit_comp32.o
+else
+ obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o
+endif
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
new file mode 100644
index 000000000..81c3d4b4c
--- /dev/null
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -0,0 +1,1190 @@
+/*
+ * bpf_jit_comp.c: BPF JIT compiler
+ *
+ * Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com)
+ * Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/netdevice.h>
+#include <linux/filter.h>
+#include <linux/if_vlan.h>
+#include <linux/bpf.h>
+
+#include <asm/set_memory.h>
+#include <asm/nospec-branch.h>
+
+static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
+{
+ if (len == 1)
+ *ptr = bytes;
+ else if (len == 2)
+ *(u16 *)ptr = bytes;
+ else {
+ *(u32 *)ptr = bytes;
+ barrier();
+ }
+ return ptr + len;
+}
+
+#define EMIT(bytes, len) \
+ do { prog = emit_code(prog, bytes, len); cnt += len; } while (0)
+
+#define EMIT1(b1) EMIT(b1, 1)
+#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
+#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
+#define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
+
+#define EMIT1_off32(b1, off) \
+ do { EMIT1(b1); EMIT(off, 4); } while (0)
+#define EMIT2_off32(b1, b2, off) \
+ do { EMIT2(b1, b2); EMIT(off, 4); } while (0)
+#define EMIT3_off32(b1, b2, b3, off) \
+ do { EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
+#define EMIT4_off32(b1, b2, b3, b4, off) \
+ do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
+
+static bool is_imm8(int value)
+{
+ return value <= 127 && value >= -128;
+}
+
+static bool is_simm32(s64 value)
+{
+ return value == (s64)(s32)value;
+}
+
+static bool is_uimm32(u64 value)
+{
+ return value == (u64)(u32)value;
+}
+
+/* mov dst, src */
+#define EMIT_mov(DST, SRC) \
+ do { \
+ if (DST != SRC) \
+ EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC)); \
+ } while (0)
+
+static int bpf_size_to_x86_bytes(int bpf_size)
+{
+ if (bpf_size == BPF_W)
+ return 4;
+ else if (bpf_size == BPF_H)
+ return 2;
+ else if (bpf_size == BPF_B)
+ return 1;
+ else if (bpf_size == BPF_DW)
+ return 4; /* imm32 */
+ else
+ return 0;
+}
+
+/*
+ * List of x86 cond jumps opcodes (. + s8)
+ * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
+ */
+#define X86_JB 0x72
+#define X86_JAE 0x73
+#define X86_JE 0x74
+#define X86_JNE 0x75
+#define X86_JBE 0x76
+#define X86_JA 0x77
+#define X86_JL 0x7C
+#define X86_JGE 0x7D
+#define X86_JLE 0x7E
+#define X86_JG 0x7F
+
+/* Pick a register outside of BPF range for JIT internal work */
+#define AUX_REG (MAX_BPF_JIT_REG + 1)
+
+/*
+ * The following table maps BPF registers to x86-64 registers.
+ *
+ * x86-64 register R12 is unused, since if used as base address
+ * register in load/store instructions, it always needs an
+ * extra byte of encoding and is callee saved.
+ *
+ * Also x86-64 register R9 is unused. x86-64 register R10 is
+ * used for blinding (if enabled).
+ */
+static const int reg2hex[] = {
+ [BPF_REG_0] = 0, /* RAX */
+ [BPF_REG_1] = 7, /* RDI */
+ [BPF_REG_2] = 6, /* RSI */
+ [BPF_REG_3] = 2, /* RDX */
+ [BPF_REG_4] = 1, /* RCX */
+ [BPF_REG_5] = 0, /* R8 */
+ [BPF_REG_6] = 3, /* RBX callee saved */
+ [BPF_REG_7] = 5, /* R13 callee saved */
+ [BPF_REG_8] = 6, /* R14 callee saved */
+ [BPF_REG_9] = 7, /* R15 callee saved */
+ [BPF_REG_FP] = 5, /* RBP readonly */
+ [BPF_REG_AX] = 2, /* R10 temp register */
+ [AUX_REG] = 3, /* R11 temp register */
+};
+
+/*
+ * is_ereg() == true if BPF register 'reg' maps to x86-64 r8..r15
+ * which need extra byte of encoding.
+ * rax,rcx,...,rbp have simpler encoding
+ */
+static bool is_ereg(u32 reg)
+{
+ return (1 << reg) & (BIT(BPF_REG_5) |
+ BIT(AUX_REG) |
+ BIT(BPF_REG_7) |
+ BIT(BPF_REG_8) |
+ BIT(BPF_REG_9) |
+ BIT(BPF_REG_AX));
+}
+
+/*
+ * is_ereg_8l() == true if BPF register 'reg' is mapped to access x86-64
+ * lower 8-bit registers dil,sil,bpl,spl,r8b..r15b, which need extra byte
+ * of encoding. al,cl,dl,bl have simpler encoding.
+ */
+static bool is_ereg_8l(u32 reg)
+{
+ return is_ereg(reg) ||
+ (1 << reg) & (BIT(BPF_REG_1) |
+ BIT(BPF_REG_2) |
+ BIT(BPF_REG_FP));
+}
+
+static bool is_axreg(u32 reg)
+{
+ return reg == BPF_REG_0;
+}
+
+/* Add modifiers if 'reg' maps to x86-64 registers R8..R15 */
+static u8 add_1mod(u8 byte, u32 reg)
+{
+ if (is_ereg(reg))
+ byte |= 1;
+ return byte;
+}
+
+static u8 add_2mod(u8 byte, u32 r1, u32 r2)
+{
+ if (is_ereg(r1))
+ byte |= 1;
+ if (is_ereg(r2))
+ byte |= 4;
+ return byte;
+}
+
+/* Encode 'dst_reg' register into x86-64 opcode 'byte' */
+static u8 add_1reg(u8 byte, u32 dst_reg)
+{
+ return byte + reg2hex[dst_reg];
+}
+
+/* Encode 'dst_reg' and 'src_reg' registers into x86-64 opcode 'byte' */
+static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
+{
+ return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3);
+}
+
+static void jit_fill_hole(void *area, unsigned int size)
+{
+ /* Fill whole space with INT3 instructions */
+ memset(area, 0xcc, size);
+}
+
+struct jit_context {
+ int cleanup_addr; /* Epilogue code offset */
+};
+
+/* Maximum number of bytes emitted while JITing one eBPF insn */
+#define BPF_MAX_INSN_SIZE 128
+#define BPF_INSN_SAFETY 64
+
+#define PROLOGUE_SIZE 20
+
+/*
+ * Emit x86-64 prologue code for BPF program and check its size.
+ * bpf_tail_call helper will skip it while jumping into another program
+ */
+static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ EMIT1(0x55); /* push rbp */
+ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
+ /* sub rsp, rounded_stack_depth */
+ EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8));
+ EMIT1(0x53); /* push rbx */
+ EMIT2(0x41, 0x55); /* push r13 */
+ EMIT2(0x41, 0x56); /* push r14 */
+ EMIT2(0x41, 0x57); /* push r15 */
+ if (!ebpf_from_cbpf) {
+ /* zero init tail_call_cnt */
+ EMIT2(0x6a, 0x00);
+ BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
+ }
+ *pprog = prog;
+}
+
+/*
+ * Generate the following code:
+ *
+ * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
+ * if (index >= array->map.max_entries)
+ * goto out;
+ * if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * goto out;
+ * prog = array->ptrs[index];
+ * if (prog == NULL)
+ * goto out;
+ * goto *(prog->bpf_func + prologue_size);
+ * out:
+ */
+static void emit_bpf_tail_call(u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int label1, label2, label3;
+ int cnt = 0;
+
+ /*
+ * rdi - pointer to ctx
+ * rsi - pointer to bpf_array
+ * rdx - index in bpf_array
+ */
+
+ /*
+ * if (index >= array->map.max_entries)
+ * goto out;
+ */
+ EMIT2(0x89, 0xD2); /* mov edx, edx */
+ EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */
+ offsetof(struct bpf_array, map.max_entries));
+#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* Number of bytes to jump */
+ EMIT2(X86_JBE, OFFSET1); /* jbe out */
+ label1 = cnt;
+
+ /*
+ * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * goto out;
+ */
+ EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */
+ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
+#define OFFSET2 (30 + RETPOLINE_RAX_BPF_JIT_SIZE)
+ EMIT2(X86_JA, OFFSET2); /* ja out */
+ label2 = cnt;
+ EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
+ EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */
+
+ /* prog = array->ptrs[index]; */
+ EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */
+ offsetof(struct bpf_array, ptrs));
+
+ /*
+ * if (prog == NULL)
+ * goto out;
+ */
+ EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */
+#define OFFSET3 (8 + RETPOLINE_RAX_BPF_JIT_SIZE)
+ EMIT2(X86_JE, OFFSET3); /* je out */
+ label3 = cnt;
+
+ /* goto *(prog->bpf_func + prologue_size); */
+ EMIT4(0x48, 0x8B, 0x40, /* mov rax, qword ptr [rax + 32] */
+ offsetof(struct bpf_prog, bpf_func));
+ EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE); /* add rax, prologue_size */
+
+ /*
+ * Wow we're ready to jump into next BPF program
+ * rdi == ctx (1st arg)
+ * rax == prog->bpf_func + prologue_size
+ */
+ RETPOLINE_RAX_BPF_JIT();
+
+ /* out: */
+ BUILD_BUG_ON(cnt - label1 != OFFSET1);
+ BUILD_BUG_ON(cnt - label2 != OFFSET2);
+ BUILD_BUG_ON(cnt - label3 != OFFSET3);
+ *pprog = prog;
+}
+
+static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
+ u32 dst_reg, const u32 imm32)
+{
+ u8 *prog = *pprog;
+ u8 b1, b2, b3;
+ int cnt = 0;
+
+ /*
+ * Optimization: if imm32 is positive, use 'mov %eax, imm32'
+ * (which zero-extends imm32) to save 2 bytes.
+ */
+ if (sign_propagate && (s32)imm32 < 0) {
+ /* 'mov %rax, imm32' sign extends imm32 */
+ b1 = add_1mod(0x48, dst_reg);
+ b2 = 0xC7;
+ b3 = 0xC0;
+ EMIT3_off32(b1, b2, add_1reg(b3, dst_reg), imm32);
+ goto done;
+ }
+
+ /*
+ * Optimization: if imm32 is zero, use 'xor %eax, %eax'
+ * to save 3 bytes.
+ */
+ if (imm32 == 0) {
+ if (is_ereg(dst_reg))
+ EMIT1(add_2mod(0x40, dst_reg, dst_reg));
+ b2 = 0x31; /* xor */
+ b3 = 0xC0;
+ EMIT2(b2, add_2reg(b3, dst_reg, dst_reg));
+ goto done;
+ }
+
+ /* mov %eax, imm32 */
+ if (is_ereg(dst_reg))
+ EMIT1(add_1mod(0x40, dst_reg));
+ EMIT1_off32(add_1reg(0xB8, dst_reg), imm32);
+done:
+ *pprog = prog;
+}
+
+static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
+ const u32 imm32_hi, const u32 imm32_lo)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) {
+ /*
+ * For emitting plain u32, where sign bit must not be
+ * propagated LLVM tends to load imm64 over mov32
+ * directly, so save couple of bytes by just doing
+ * 'mov %eax, imm32' instead.
+ */
+ emit_mov_imm32(&prog, false, dst_reg, imm32_lo);
+ } else {
+ /* movabsq %rax, imm64 */
+ EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg));
+ EMIT(imm32_lo, 4);
+ EMIT(imm32_hi, 4);
+ }
+
+ *pprog = prog;
+}
+
+static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ if (is64) {
+ /* mov dst, src */
+ EMIT_mov(dst_reg, src_reg);
+ } else {
+ /* mov32 dst, src */
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ EMIT2(0x89, add_2reg(0xC0, dst_reg, src_reg));
+ }
+
+ *pprog = prog;
+}
+
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+ int oldproglen, struct jit_context *ctx)
+{
+ struct bpf_insn *insn = bpf_prog->insnsi;
+ int insn_cnt = bpf_prog->len;
+ bool seen_exit = false;
+ u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+ int i, cnt = 0;
+ int proglen = 0;
+ u8 *prog = temp;
+
+ emit_prologue(&prog, bpf_prog->aux->stack_depth,
+ bpf_prog_was_classic(bpf_prog));
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ const s32 imm32 = insn->imm;
+ u32 dst_reg = insn->dst_reg;
+ u32 src_reg = insn->src_reg;
+ u8 b2 = 0, b3 = 0;
+ s64 jmp_offset;
+ u8 jmp_cond;
+ int ilen;
+ u8 *func;
+
+ switch (insn->code) {
+ /* ALU */
+ case BPF_ALU | BPF_ADD | BPF_X:
+ case BPF_ALU | BPF_SUB | BPF_X:
+ case BPF_ALU | BPF_AND | BPF_X:
+ case BPF_ALU | BPF_OR | BPF_X:
+ case BPF_ALU | BPF_XOR | BPF_X:
+ case BPF_ALU64 | BPF_ADD | BPF_X:
+ case BPF_ALU64 | BPF_SUB | BPF_X:
+ case BPF_ALU64 | BPF_AND | BPF_X:
+ case BPF_ALU64 | BPF_OR | BPF_X:
+ case BPF_ALU64 | BPF_XOR | BPF_X:
+ switch (BPF_OP(insn->code)) {
+ case BPF_ADD: b2 = 0x01; break;
+ case BPF_SUB: b2 = 0x29; break;
+ case BPF_AND: b2 = 0x21; break;
+ case BPF_OR: b2 = 0x09; break;
+ case BPF_XOR: b2 = 0x31; break;
+ }
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ EMIT1(add_2mod(0x48, dst_reg, src_reg));
+ else if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg));
+ break;
+
+ case BPF_ALU64 | BPF_MOV | BPF_X:
+ case BPF_ALU | BPF_MOV | BPF_X:
+ emit_mov_reg(&prog,
+ BPF_CLASS(insn->code) == BPF_ALU64,
+ dst_reg, src_reg);
+ break;
+
+ /* neg dst */
+ case BPF_ALU | BPF_NEG:
+ case BPF_ALU64 | BPF_NEG:
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ EMIT1(add_1mod(0x48, dst_reg));
+ else if (is_ereg(dst_reg))
+ EMIT1(add_1mod(0x40, dst_reg));
+ EMIT2(0xF7, add_1reg(0xD8, dst_reg));
+ break;
+
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU64 | BPF_ADD | BPF_K:
+ case BPF_ALU64 | BPF_SUB | BPF_K:
+ case BPF_ALU64 | BPF_AND | BPF_K:
+ case BPF_ALU64 | BPF_OR | BPF_K:
+ case BPF_ALU64 | BPF_XOR | BPF_K:
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ EMIT1(add_1mod(0x48, dst_reg));
+ else if (is_ereg(dst_reg))
+ EMIT1(add_1mod(0x40, dst_reg));
+
+ /*
+ * b3 holds 'normal' opcode, b2 short form only valid
+ * in case dst is eax/rax.
+ */
+ switch (BPF_OP(insn->code)) {
+ case BPF_ADD:
+ b3 = 0xC0;
+ b2 = 0x05;
+ break;
+ case BPF_SUB:
+ b3 = 0xE8;
+ b2 = 0x2D;
+ break;
+ case BPF_AND:
+ b3 = 0xE0;
+ b2 = 0x25;
+ break;
+ case BPF_OR:
+ b3 = 0xC8;
+ b2 = 0x0D;
+ break;
+ case BPF_XOR:
+ b3 = 0xF0;
+ b2 = 0x35;
+ break;
+ }
+
+ if (is_imm8(imm32))
+ EMIT3(0x83, add_1reg(b3, dst_reg), imm32);
+ else if (is_axreg(dst_reg))
+ EMIT1_off32(b2, imm32);
+ else
+ EMIT2_off32(0x81, add_1reg(b3, dst_reg), imm32);
+ break;
+
+ case BPF_ALU64 | BPF_MOV | BPF_K:
+ case BPF_ALU | BPF_MOV | BPF_K:
+ emit_mov_imm32(&prog, BPF_CLASS(insn->code) == BPF_ALU64,
+ dst_reg, imm32);
+ break;
+
+ case BPF_LD | BPF_IMM | BPF_DW:
+ emit_mov_imm64(&prog, dst_reg, insn[1].imm, insn[0].imm);
+ insn++;
+ i++;
+ break;
+
+ /* dst %= src, dst /= src, dst %= imm32, dst /= imm32 */
+ case BPF_ALU | BPF_MOD | BPF_X:
+ case BPF_ALU | BPF_DIV | BPF_X:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU64 | BPF_MOD | BPF_X:
+ case BPF_ALU64 | BPF_DIV | BPF_X:
+ case BPF_ALU64 | BPF_MOD | BPF_K:
+ case BPF_ALU64 | BPF_DIV | BPF_K:
+ EMIT1(0x50); /* push rax */
+ EMIT1(0x52); /* push rdx */
+
+ if (BPF_SRC(insn->code) == BPF_X)
+ /* mov r11, src_reg */
+ EMIT_mov(AUX_REG, src_reg);
+ else
+ /* mov r11, imm32 */
+ EMIT3_off32(0x49, 0xC7, 0xC3, imm32);
+
+ /* mov rax, dst_reg */
+ EMIT_mov(BPF_REG_0, dst_reg);
+
+ /*
+ * xor edx, edx
+ * equivalent to 'xor rdx, rdx', but one byte less
+ */
+ EMIT2(0x31, 0xd2);
+
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ /* div r11 */
+ EMIT3(0x49, 0xF7, 0xF3);
+ else
+ /* div r11d */
+ EMIT3(0x41, 0xF7, 0xF3);
+
+ if (BPF_OP(insn->code) == BPF_MOD)
+ /* mov r11, rdx */
+ EMIT3(0x49, 0x89, 0xD3);
+ else
+ /* mov r11, rax */
+ EMIT3(0x49, 0x89, 0xC3);
+
+ EMIT1(0x5A); /* pop rdx */
+ EMIT1(0x58); /* pop rax */
+
+ /* mov dst_reg, r11 */
+ EMIT_mov(dst_reg, AUX_REG);
+ break;
+
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_X:
+ case BPF_ALU64 | BPF_MUL | BPF_K:
+ case BPF_ALU64 | BPF_MUL | BPF_X:
+ {
+ bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+
+ if (dst_reg != BPF_REG_0)
+ EMIT1(0x50); /* push rax */
+ if (dst_reg != BPF_REG_3)
+ EMIT1(0x52); /* push rdx */
+
+ /* mov r11, dst_reg */
+ EMIT_mov(AUX_REG, dst_reg);
+
+ if (BPF_SRC(insn->code) == BPF_X)
+ emit_mov_reg(&prog, is64, BPF_REG_0, src_reg);
+ else
+ emit_mov_imm32(&prog, is64, BPF_REG_0, imm32);
+
+ if (is64)
+ EMIT1(add_1mod(0x48, AUX_REG));
+ else if (is_ereg(AUX_REG))
+ EMIT1(add_1mod(0x40, AUX_REG));
+ /* mul(q) r11 */
+ EMIT2(0xF7, add_1reg(0xE0, AUX_REG));
+
+ if (dst_reg != BPF_REG_3)
+ EMIT1(0x5A); /* pop rdx */
+ if (dst_reg != BPF_REG_0) {
+ /* mov dst_reg, rax */
+ EMIT_mov(dst_reg, BPF_REG_0);
+ EMIT1(0x58); /* pop rax */
+ }
+ break;
+ }
+ /* Shifts */
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ case BPF_ALU | BPF_ARSH | BPF_K:
+ case BPF_ALU64 | BPF_LSH | BPF_K:
+ case BPF_ALU64 | BPF_RSH | BPF_K:
+ case BPF_ALU64 | BPF_ARSH | BPF_K:
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ EMIT1(add_1mod(0x48, dst_reg));
+ else if (is_ereg(dst_reg))
+ EMIT1(add_1mod(0x40, dst_reg));
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_LSH: b3 = 0xE0; break;
+ case BPF_RSH: b3 = 0xE8; break;
+ case BPF_ARSH: b3 = 0xF8; break;
+ }
+
+ if (imm32 == 1)
+ EMIT2(0xD1, add_1reg(b3, dst_reg));
+ else
+ EMIT3(0xC1, add_1reg(b3, dst_reg), imm32);
+ break;
+
+ case BPF_ALU | BPF_LSH | BPF_X:
+ case BPF_ALU | BPF_RSH | BPF_X:
+ case BPF_ALU | BPF_ARSH | BPF_X:
+ case BPF_ALU64 | BPF_LSH | BPF_X:
+ case BPF_ALU64 | BPF_RSH | BPF_X:
+ case BPF_ALU64 | BPF_ARSH | BPF_X:
+
+ /* Check for bad case when dst_reg == rcx */
+ if (dst_reg == BPF_REG_4) {
+ /* mov r11, dst_reg */
+ EMIT_mov(AUX_REG, dst_reg);
+ dst_reg = AUX_REG;
+ }
+
+ if (src_reg != BPF_REG_4) { /* common case */
+ EMIT1(0x51); /* push rcx */
+
+ /* mov rcx, src_reg */
+ EMIT_mov(BPF_REG_4, src_reg);
+ }
+
+ /* shl %rax, %cl | shr %rax, %cl | sar %rax, %cl */
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ EMIT1(add_1mod(0x48, dst_reg));
+ else if (is_ereg(dst_reg))
+ EMIT1(add_1mod(0x40, dst_reg));
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_LSH: b3 = 0xE0; break;
+ case BPF_RSH: b3 = 0xE8; break;
+ case BPF_ARSH: b3 = 0xF8; break;
+ }
+ EMIT2(0xD3, add_1reg(b3, dst_reg));
+
+ if (src_reg != BPF_REG_4)
+ EMIT1(0x59); /* pop rcx */
+
+ if (insn->dst_reg == BPF_REG_4)
+ /* mov dst_reg, r11 */
+ EMIT_mov(insn->dst_reg, AUX_REG);
+ break;
+
+ case BPF_ALU | BPF_END | BPF_FROM_BE:
+ switch (imm32) {
+ case 16:
+ /* Emit 'ror %ax, 8' to swap lower 2 bytes */
+ EMIT1(0x66);
+ if (is_ereg(dst_reg))
+ EMIT1(0x41);
+ EMIT3(0xC1, add_1reg(0xC8, dst_reg), 8);
+
+ /* Emit 'movzwl eax, ax' */
+ if (is_ereg(dst_reg))
+ EMIT3(0x45, 0x0F, 0xB7);
+ else
+ EMIT2(0x0F, 0xB7);
+ EMIT1(add_2reg(0xC0, dst_reg, dst_reg));
+ break;
+ case 32:
+ /* Emit 'bswap eax' to swap lower 4 bytes */
+ if (is_ereg(dst_reg))
+ EMIT2(0x41, 0x0F);
+ else
+ EMIT1(0x0F);
+ EMIT1(add_1reg(0xC8, dst_reg));
+ break;
+ case 64:
+ /* Emit 'bswap rax' to swap 8 bytes */
+ EMIT3(add_1mod(0x48, dst_reg), 0x0F,
+ add_1reg(0xC8, dst_reg));
+ break;
+ }
+ break;
+
+ case BPF_ALU | BPF_END | BPF_FROM_LE:
+ switch (imm32) {
+ case 16:
+ /*
+ * Emit 'movzwl eax, ax' to zero extend 16-bit
+ * into 64 bit
+ */
+ if (is_ereg(dst_reg))
+ EMIT3(0x45, 0x0F, 0xB7);
+ else
+ EMIT2(0x0F, 0xB7);
+ EMIT1(add_2reg(0xC0, dst_reg, dst_reg));
+ break;
+ case 32:
+ /* Emit 'mov eax, eax' to clear upper 32-bits */
+ if (is_ereg(dst_reg))
+ EMIT1(0x45);
+ EMIT2(0x89, add_2reg(0xC0, dst_reg, dst_reg));
+ break;
+ case 64:
+ /* nop */
+ break;
+ }
+ break;
+
+ /* speculation barrier */
+ case BPF_ST | BPF_NOSPEC:
+ if (boot_cpu_has(X86_FEATURE_XMM2))
+ /* Emit 'lfence' */
+ EMIT3(0x0F, 0xAE, 0xE8);
+ break;
+
+ /* ST: *(u8*)(dst_reg + off) = imm */
+ case BPF_ST | BPF_MEM | BPF_B:
+ if (is_ereg(dst_reg))
+ EMIT2(0x41, 0xC6);
+ else
+ EMIT1(0xC6);
+ goto st;
+ case BPF_ST | BPF_MEM | BPF_H:
+ if (is_ereg(dst_reg))
+ EMIT3(0x66, 0x41, 0xC7);
+ else
+ EMIT2(0x66, 0xC7);
+ goto st;
+ case BPF_ST | BPF_MEM | BPF_W:
+ if (is_ereg(dst_reg))
+ EMIT2(0x41, 0xC7);
+ else
+ EMIT1(0xC7);
+ goto st;
+ case BPF_ST | BPF_MEM | BPF_DW:
+ EMIT2(add_1mod(0x48, dst_reg), 0xC7);
+
+st: if (is_imm8(insn->off))
+ EMIT2(add_1reg(0x40, dst_reg), insn->off);
+ else
+ EMIT1_off32(add_1reg(0x80, dst_reg), insn->off);
+
+ EMIT(imm32, bpf_size_to_x86_bytes(BPF_SIZE(insn->code)));
+ break;
+
+ /* STX: *(u8*)(dst_reg + off) = src_reg */
+ case BPF_STX | BPF_MEM | BPF_B:
+ /* Emit 'mov byte ptr [rax + off], al' */
+ if (is_ereg(dst_reg) || is_ereg_8l(src_reg))
+ /* Add extra byte for eregs or SIL,DIL,BPL in src_reg */
+ EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88);
+ else
+ EMIT1(0x88);
+ goto stx;
+ case BPF_STX | BPF_MEM | BPF_H:
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT3(0x66, add_2mod(0x40, dst_reg, src_reg), 0x89);
+ else
+ EMIT2(0x66, 0x89);
+ goto stx;
+ case BPF_STX | BPF_MEM | BPF_W:
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x89);
+ else
+ EMIT1(0x89);
+ goto stx;
+ case BPF_STX | BPF_MEM | BPF_DW:
+ EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89);
+stx: if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, dst_reg, src_reg),
+ insn->off);
+ break;
+
+ /* LDX: dst_reg = *(u8*)(src_reg + off) */
+ case BPF_LDX | BPF_MEM | BPF_B:
+ /* Emit 'movzx rax, byte ptr [rax + off]' */
+ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6);
+ goto ldx;
+ case BPF_LDX | BPF_MEM | BPF_H:
+ /* Emit 'movzx rax, word ptr [rax + off]' */
+ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7);
+ goto ldx;
+ case BPF_LDX | BPF_MEM | BPF_W:
+ /* Emit 'mov eax, dword ptr [rax+0x14]' */
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B);
+ else
+ EMIT1(0x8B);
+ goto ldx;
+ case BPF_LDX | BPF_MEM | BPF_DW:
+ /* Emit 'mov rax, qword ptr [rax+0x14]' */
+ EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
+ldx: /*
+ * If insn->off == 0 we can save one extra byte, but
+ * special case of x86 R13 which always needs an offset
+ * is not worth the hassle
+ */
+ if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, src_reg, dst_reg), insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, src_reg, dst_reg),
+ insn->off);
+ break;
+
+ /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */
+ case BPF_STX | BPF_XADD | BPF_W:
+ /* Emit 'lock add dword ptr [rax + off], eax' */
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01);
+ else
+ EMIT2(0xF0, 0x01);
+ goto xadd;
+ case BPF_STX | BPF_XADD | BPF_DW:
+ EMIT3(0xF0, add_2mod(0x48, dst_reg, src_reg), 0x01);
+xadd: if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, dst_reg, src_reg),
+ insn->off);
+ break;
+
+ /* call */
+ case BPF_JMP | BPF_CALL:
+ func = (u8 *) __bpf_call_base + imm32;
+ jmp_offset = func - (image + addrs[i]);
+ if (!imm32 || !is_simm32(jmp_offset)) {
+ pr_err("unsupported BPF func %d addr %p image %p\n",
+ imm32, func, image);
+ return -EINVAL;
+ }
+ EMIT1_off32(0xE8, jmp_offset);
+ break;
+
+ case BPF_JMP | BPF_TAIL_CALL:
+ emit_bpf_tail_call(&prog);
+ break;
+
+ /* cond jump */
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JNE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JLT | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JLE | BPF_X:
+ case BPF_JMP | BPF_JSGT | BPF_X:
+ case BPF_JMP | BPF_JSLT | BPF_X:
+ case BPF_JMP | BPF_JSGE | BPF_X:
+ case BPF_JMP | BPF_JSLE | BPF_X:
+ /* cmp dst_reg, src_reg */
+ EMIT3(add_2mod(0x48, dst_reg, src_reg), 0x39,
+ add_2reg(0xC0, dst_reg, src_reg));
+ goto emit_cond_jmp;
+
+ case BPF_JMP | BPF_JSET | BPF_X:
+ /* test dst_reg, src_reg */
+ EMIT3(add_2mod(0x48, dst_reg, src_reg), 0x85,
+ add_2reg(0xC0, dst_reg, src_reg));
+ goto emit_cond_jmp;
+
+ case BPF_JMP | BPF_JSET | BPF_K:
+ /* test dst_reg, imm32 */
+ EMIT1(add_1mod(0x48, dst_reg));
+ EMIT2_off32(0xF7, add_1reg(0xC0, dst_reg), imm32);
+ goto emit_cond_jmp;
+
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JNE | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JLT | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JLE | BPF_K:
+ case BPF_JMP | BPF_JSGT | BPF_K:
+ case BPF_JMP | BPF_JSLT | BPF_K:
+ case BPF_JMP | BPF_JSGE | BPF_K:
+ case BPF_JMP | BPF_JSLE | BPF_K:
+ /* cmp dst_reg, imm8/32 */
+ EMIT1(add_1mod(0x48, dst_reg));
+
+ if (is_imm8(imm32))
+ EMIT3(0x83, add_1reg(0xF8, dst_reg), imm32);
+ else
+ EMIT2_off32(0x81, add_1reg(0xF8, dst_reg), imm32);
+
+emit_cond_jmp: /* Convert BPF opcode to x86 */
+ switch (BPF_OP(insn->code)) {
+ case BPF_JEQ:
+ jmp_cond = X86_JE;
+ break;
+ case BPF_JSET:
+ case BPF_JNE:
+ jmp_cond = X86_JNE;
+ break;
+ case BPF_JGT:
+ /* GT is unsigned '>', JA in x86 */
+ jmp_cond = X86_JA;
+ break;
+ case BPF_JLT:
+ /* LT is unsigned '<', JB in x86 */
+ jmp_cond = X86_JB;
+ break;
+ case BPF_JGE:
+ /* GE is unsigned '>=', JAE in x86 */
+ jmp_cond = X86_JAE;
+ break;
+ case BPF_JLE:
+ /* LE is unsigned '<=', JBE in x86 */
+ jmp_cond = X86_JBE;
+ break;
+ case BPF_JSGT:
+ /* Signed '>', GT in x86 */
+ jmp_cond = X86_JG;
+ break;
+ case BPF_JSLT:
+ /* Signed '<', LT in x86 */
+ jmp_cond = X86_JL;
+ break;
+ case BPF_JSGE:
+ /* Signed '>=', GE in x86 */
+ jmp_cond = X86_JGE;
+ break;
+ case BPF_JSLE:
+ /* Signed '<=', LE in x86 */
+ jmp_cond = X86_JLE;
+ break;
+ default: /* to silence GCC warning */
+ return -EFAULT;
+ }
+ jmp_offset = addrs[i + insn->off] - addrs[i];
+ if (is_imm8(jmp_offset)) {
+ EMIT2(jmp_cond, jmp_offset);
+ } else if (is_simm32(jmp_offset)) {
+ EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
+ } else {
+ pr_err("cond_jmp gen bug %llx\n", jmp_offset);
+ return -EFAULT;
+ }
+
+ break;
+
+ case BPF_JMP | BPF_JA:
+ if (insn->off == -1)
+ /* -1 jmp instructions will always jump
+ * backwards two bytes. Explicitly handling
+ * this case avoids wasting too many passes
+ * when there are long sequences of replaced
+ * dead code.
+ */
+ jmp_offset = -2;
+ else
+ jmp_offset = addrs[i + insn->off] - addrs[i];
+
+ if (!jmp_offset)
+ /* Optimize out nop jumps */
+ break;
+emit_jmp:
+ if (is_imm8(jmp_offset)) {
+ EMIT2(0xEB, jmp_offset);
+ } else if (is_simm32(jmp_offset)) {
+ EMIT1_off32(0xE9, jmp_offset);
+ } else {
+ pr_err("jmp gen bug %llx\n", jmp_offset);
+ return -EFAULT;
+ }
+ break;
+
+ case BPF_JMP | BPF_EXIT:
+ if (seen_exit) {
+ jmp_offset = ctx->cleanup_addr - addrs[i];
+ goto emit_jmp;
+ }
+ seen_exit = true;
+ /* Update cleanup_addr */
+ ctx->cleanup_addr = proglen;
+ if (!bpf_prog_was_classic(bpf_prog))
+ EMIT1(0x5B); /* get rid of tail_call_cnt */
+ EMIT2(0x41, 0x5F); /* pop r15 */
+ EMIT2(0x41, 0x5E); /* pop r14 */
+ EMIT2(0x41, 0x5D); /* pop r13 */
+ EMIT1(0x5B); /* pop rbx */
+ EMIT1(0xC9); /* leave */
+ EMIT1(0xC3); /* ret */
+ break;
+
+ default:
+ /*
+ * By design x86-64 JIT should support all BPF instructions.
+ * This error will be seen if new instruction was added
+ * to the interpreter, but not to the JIT, or if there is
+ * junk in bpf_prog.
+ */
+ pr_err("bpf_jit: unknown opcode %02x\n", insn->code);
+ return -EINVAL;
+ }
+
+ ilen = prog - temp;
+ if (ilen > BPF_MAX_INSN_SIZE) {
+ pr_err("bpf_jit: fatal insn size error\n");
+ return -EFAULT;
+ }
+
+ if (image) {
+ /*
+ * When populating the image, assert that:
+ *
+ * i) We do not write beyond the allocated space, and
+ * ii) addrs[i] did not change from the prior run, in order
+ * to validate assumptions made for computing branch
+ * displacements.
+ */
+ if (unlikely(proglen + ilen > oldproglen ||
+ proglen + ilen != addrs[i])) {
+ pr_err("bpf_jit: fatal error\n");
+ return -EFAULT;
+ }
+ memcpy(image + proglen, temp, ilen);
+ }
+ proglen += ilen;
+ addrs[i] = proglen;
+ prog = temp;
+ }
+ return proglen;
+}
+
+struct x64_jit_data {
+ struct bpf_binary_header *header;
+ int *addrs;
+ u8 *image;
+ int proglen;
+ struct jit_context ctx;
+};
+
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
+{
+ struct bpf_binary_header *header = NULL;
+ struct bpf_prog *tmp, *orig_prog = prog;
+ struct x64_jit_data *jit_data;
+ int proglen, oldproglen = 0;
+ struct jit_context ctx = {};
+ bool tmp_blinded = false;
+ bool extra_pass = false;
+ u8 *image = NULL;
+ int *addrs;
+ int pass;
+ int i;
+
+ if (!prog->jit_requested)
+ return orig_prog;
+
+ tmp = bpf_jit_blind_constants(prog);
+ /*
+ * If blinding was requested and we failed during blinding,
+ * we must fall back to the interpreter.
+ */
+ if (IS_ERR(tmp))
+ return orig_prog;
+ if (tmp != prog) {
+ tmp_blinded = true;
+ prog = tmp;
+ }
+
+ jit_data = prog->aux->jit_data;
+ if (!jit_data) {
+ jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
+ if (!jit_data) {
+ prog = orig_prog;
+ goto out;
+ }
+ prog->aux->jit_data = jit_data;
+ }
+ addrs = jit_data->addrs;
+ if (addrs) {
+ ctx = jit_data->ctx;
+ oldproglen = jit_data->proglen;
+ image = jit_data->image;
+ header = jit_data->header;
+ extra_pass = true;
+ goto skip_init_addrs;
+ }
+ addrs = kmalloc_array(prog->len, sizeof(*addrs), GFP_KERNEL);
+ if (!addrs) {
+ prog = orig_prog;
+ goto out_addrs;
+ }
+
+ /*
+ * Before first pass, make a rough estimation of addrs[]
+ * each BPF instruction is translated to less than 64 bytes
+ */
+ for (proglen = 0, i = 0; i < prog->len; i++) {
+ proglen += 64;
+ addrs[i] = proglen;
+ }
+ ctx.cleanup_addr = proglen;
+skip_init_addrs:
+
+ /*
+ * JITed image shrinks with every pass and the loop iterates
+ * until the image stops shrinking. Very large BPF programs
+ * may converge on the last pass. In such case do one more
+ * pass to emit the final image.
+ */
+ for (pass = 0; pass < 20 || image; pass++) {
+ proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
+ if (proglen <= 0) {
+out_image:
+ image = NULL;
+ if (header)
+ bpf_jit_binary_free(header);
+ prog = orig_prog;
+ goto out_addrs;
+ }
+ if (image) {
+ if (proglen != oldproglen) {
+ pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
+ proglen, oldproglen);
+ goto out_image;
+ }
+ break;
+ }
+ if (proglen == oldproglen) {
+ header = bpf_jit_binary_alloc(proglen, &image,
+ 1, jit_fill_hole);
+ if (!header) {
+ prog = orig_prog;
+ goto out_addrs;
+ }
+ }
+ oldproglen = proglen;
+ cond_resched();
+ }
+
+ if (bpf_jit_enable > 1)
+ bpf_jit_dump(prog->len, proglen, pass + 1, image);
+
+ if (image) {
+ if (!prog->is_func || extra_pass) {
+ bpf_jit_binary_lock_ro(header);
+ } else {
+ jit_data->addrs = addrs;
+ jit_data->ctx = ctx;
+ jit_data->proglen = proglen;
+ jit_data->image = image;
+ jit_data->header = header;
+ }
+ prog->bpf_func = (void *)image;
+ prog->jited = 1;
+ prog->jited_len = proglen;
+ } else {
+ prog = orig_prog;
+ }
+
+ if (!image || !prog->is_func || extra_pass) {
+out_addrs:
+ kfree(addrs);
+ kfree(jit_data);
+ prog->aux->jit_data = NULL;
+ }
+out:
+ if (tmp_blinded)
+ bpf_jit_prog_release_other(prog, prog == orig_prog ?
+ tmp : orig_prog);
+ return prog;
+}
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
new file mode 100644
index 000000000..f48300988
--- /dev/null
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -0,0 +1,2330 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Just-In-Time compiler for eBPF filters on IA32 (32bit x86)
+ *
+ * Author: Wang YanQing (udknight@gmail.com)
+ * The code based on code and ideas from:
+ * Eric Dumazet (eric.dumazet@gmail.com)
+ * and from:
+ * Shubham Bansal <illusionist.neo@gmail.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/filter.h>
+#include <linux/if_vlan.h>
+#include <asm/cacheflush.h>
+#include <asm/set_memory.h>
+#include <asm/nospec-branch.h>
+#include <linux/bpf.h>
+
+/*
+ * eBPF prog stack layout:
+ *
+ * high
+ * original ESP => +-----+
+ * | | callee saved registers
+ * +-----+
+ * | ... | eBPF JIT scratch space
+ * BPF_FP,IA32_EBP => +-----+
+ * | ... | eBPF prog stack
+ * +-----+
+ * |RSVD | JIT scratchpad
+ * current ESP => +-----+
+ * | |
+ * | ... | Function call stack
+ * | |
+ * +-----+
+ * low
+ *
+ * The callee saved registers:
+ *
+ * high
+ * original ESP => +------------------+ \
+ * | ebp | |
+ * current EBP => +------------------+ } callee saved registers
+ * | ebx,esi,edi | |
+ * +------------------+ /
+ * low
+ */
+
+static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
+{
+ if (len == 1)
+ *ptr = bytes;
+ else if (len == 2)
+ *(u16 *)ptr = bytes;
+ else {
+ *(u32 *)ptr = bytes;
+ barrier();
+ }
+ return ptr + len;
+}
+
+#define EMIT(bytes, len) \
+ do { prog = emit_code(prog, bytes, len); cnt += len; } while (0)
+
+#define EMIT1(b1) EMIT(b1, 1)
+#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
+#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
+#define EMIT4(b1, b2, b3, b4) \
+ EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
+
+#define EMIT1_off32(b1, off) \
+ do { EMIT1(b1); EMIT(off, 4); } while (0)
+#define EMIT2_off32(b1, b2, off) \
+ do { EMIT2(b1, b2); EMIT(off, 4); } while (0)
+#define EMIT3_off32(b1, b2, b3, off) \
+ do { EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
+#define EMIT4_off32(b1, b2, b3, b4, off) \
+ do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
+
+#define jmp_label(label, jmp_insn_len) (label - cnt - jmp_insn_len)
+
+static bool is_imm8(int value)
+{
+ return value <= 127 && value >= -128;
+}
+
+static bool is_simm32(s64 value)
+{
+ return value == (s64) (s32) value;
+}
+
+#define STACK_OFFSET(k) (k)
+#define TCALL_CNT (MAX_BPF_JIT_REG + 0) /* Tail Call Count */
+
+#define IA32_EAX (0x0)
+#define IA32_EBX (0x3)
+#define IA32_ECX (0x1)
+#define IA32_EDX (0x2)
+#define IA32_ESI (0x6)
+#define IA32_EDI (0x7)
+#define IA32_EBP (0x5)
+#define IA32_ESP (0x4)
+
+/*
+ * List of x86 cond jumps opcodes (. + s8)
+ * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
+ */
+#define IA32_JB 0x72
+#define IA32_JAE 0x73
+#define IA32_JE 0x74
+#define IA32_JNE 0x75
+#define IA32_JBE 0x76
+#define IA32_JA 0x77
+#define IA32_JL 0x7C
+#define IA32_JGE 0x7D
+#define IA32_JLE 0x7E
+#define IA32_JG 0x7F
+
+#define COND_JMP_OPCODE_INVALID (0xFF)
+
+/*
+ * Map eBPF registers to IA32 32bit registers or stack scratch space.
+ *
+ * 1. All the registers, R0-R10, are mapped to scratch space on stack.
+ * 2. We need two 64 bit temp registers to do complex operations on eBPF
+ * registers.
+ * 3. For performance reason, the BPF_REG_AX for blinding constant, is
+ * mapped to real hardware register pair, IA32_ESI and IA32_EDI.
+ *
+ * As the eBPF registers are all 64 bit registers and IA32 has only 32 bit
+ * registers, we have to map each eBPF registers with two IA32 32 bit regs
+ * or scratch memory space and we have to build eBPF 64 bit register from those.
+ *
+ * We use IA32_EAX, IA32_EDX, IA32_ECX, IA32_EBX as temporary registers.
+ */
+static const u8 bpf2ia32[][2] = {
+ /* Return value from in-kernel function, and exit value from eBPF */
+ [BPF_REG_0] = {STACK_OFFSET(0), STACK_OFFSET(4)},
+
+ /* The arguments from eBPF program to in-kernel function */
+ /* Stored on stack scratch space */
+ [BPF_REG_1] = {STACK_OFFSET(8), STACK_OFFSET(12)},
+ [BPF_REG_2] = {STACK_OFFSET(16), STACK_OFFSET(20)},
+ [BPF_REG_3] = {STACK_OFFSET(24), STACK_OFFSET(28)},
+ [BPF_REG_4] = {STACK_OFFSET(32), STACK_OFFSET(36)},
+ [BPF_REG_5] = {STACK_OFFSET(40), STACK_OFFSET(44)},
+
+ /* Callee saved registers that in-kernel function will preserve */
+ /* Stored on stack scratch space */
+ [BPF_REG_6] = {STACK_OFFSET(48), STACK_OFFSET(52)},
+ [BPF_REG_7] = {STACK_OFFSET(56), STACK_OFFSET(60)},
+ [BPF_REG_8] = {STACK_OFFSET(64), STACK_OFFSET(68)},
+ [BPF_REG_9] = {STACK_OFFSET(72), STACK_OFFSET(76)},
+
+ /* Read only Frame Pointer to access Stack */
+ [BPF_REG_FP] = {STACK_OFFSET(80), STACK_OFFSET(84)},
+
+ /* Temporary register for blinding constants. */
+ [BPF_REG_AX] = {IA32_ESI, IA32_EDI},
+
+ /* Tail call count. Stored on stack scratch space. */
+ [TCALL_CNT] = {STACK_OFFSET(88), STACK_OFFSET(92)},
+};
+
+#define dst_lo dst[0]
+#define dst_hi dst[1]
+#define src_lo src[0]
+#define src_hi src[1]
+
+#define STACK_ALIGNMENT 8
+/*
+ * Stack space for BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4,
+ * BPF_REG_5, BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9,
+ * BPF_REG_FP, BPF_REG_AX and Tail call counts.
+ */
+#define SCRATCH_SIZE 96
+
+/* Total stack size used in JITed code */
+#define _STACK_SIZE (stack_depth + SCRATCH_SIZE)
+
+#define STACK_SIZE ALIGN(_STACK_SIZE, STACK_ALIGNMENT)
+
+/* Get the offset of eBPF REGISTERs stored on scratch space. */
+#define STACK_VAR(off) (off)
+
+/* Encode 'dst_reg' register into IA32 opcode 'byte' */
+static u8 add_1reg(u8 byte, u32 dst_reg)
+{
+ return byte + dst_reg;
+}
+
+/* Encode 'dst_reg' and 'src_reg' registers into IA32 opcode 'byte' */
+static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
+{
+ return byte + dst_reg + (src_reg << 3);
+}
+
+static void jit_fill_hole(void *area, unsigned int size)
+{
+ /* Fill whole space with int3 instructions */
+ memset(area, 0xcc, size);
+}
+
+static inline void emit_ia32_mov_i(const u8 dst, const u32 val, bool dstk,
+ u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ if (dstk) {
+ if (val == 0) {
+ /* xor eax,eax */
+ EMIT2(0x33, add_2reg(0xC0, IA32_EAX, IA32_EAX));
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst));
+ } else {
+ EMIT3_off32(0xC7, add_1reg(0x40, IA32_EBP),
+ STACK_VAR(dst), val);
+ }
+ } else {
+ if (val == 0)
+ EMIT2(0x33, add_2reg(0xC0, dst, dst));
+ else
+ EMIT2_off32(0xC7, add_1reg(0xC0, dst),
+ val);
+ }
+ *pprog = prog;
+}
+
+/* dst = imm (4 bytes)*/
+static inline void emit_ia32_mov_r(const u8 dst, const u8 src, bool dstk,
+ bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 sreg = sstk ? IA32_EAX : src;
+
+ if (sstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(src));
+ if (dstk)
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, sreg), STACK_VAR(dst));
+ else
+ /* mov dst,sreg */
+ EMIT2(0x89, add_2reg(0xC0, dst, sreg));
+
+ *pprog = prog;
+}
+
+/* dst = src */
+static inline void emit_ia32_mov_r64(const bool is64, const u8 dst[],
+ const u8 src[], bool dstk,
+ bool sstk, u8 **pprog)
+{
+ emit_ia32_mov_r(dst_lo, src_lo, dstk, sstk, pprog);
+ if (is64)
+ /* complete 8 byte move */
+ emit_ia32_mov_r(dst_hi, src_hi, dstk, sstk, pprog);
+ else
+ /* zero out high 4 bytes */
+ emit_ia32_mov_i(dst_hi, 0, dstk, pprog);
+}
+
+/* Sign extended move */
+static inline void emit_ia32_mov_i64(const bool is64, const u8 dst[],
+ const u32 val, bool dstk, u8 **pprog)
+{
+ u32 hi = 0;
+
+ if (is64 && (val & (1<<31)))
+ hi = (u32)~0;
+ emit_ia32_mov_i(dst_lo, val, dstk, pprog);
+ emit_ia32_mov_i(dst_hi, hi, dstk, pprog);
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst * src
+ */
+static inline void emit_ia32_mul_r(const u8 dst, const u8 src, bool dstk,
+ bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 sreg = sstk ? IA32_ECX : src;
+
+ if (sstk)
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src));
+
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
+ else
+ /* mov eax,dst */
+ EMIT2(0x8B, add_2reg(0xC0, dst, IA32_EAX));
+
+
+ EMIT2(0xF7, add_1reg(0xE0, sreg));
+
+ if (dstk)
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst));
+ else
+ /* mov dst,eax */
+ EMIT2(0x89, add_2reg(0xC0, dst, IA32_EAX));
+
+ *pprog = prog;
+}
+
+static inline void emit_ia32_to_le_r64(const u8 dst[], s32 val,
+ bool dstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+ if (dstk && val != 64) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+ switch (val) {
+ case 16:
+ /*
+ * Emit 'movzwl eax,ax' to zero extend 16-bit
+ * into 64 bit
+ */
+ EMIT2(0x0F, 0xB7);
+ EMIT1(add_2reg(0xC0, dreg_lo, dreg_lo));
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+ break;
+ case 32:
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+ break;
+ case 64:
+ /* nop */
+ break;
+ }
+
+ if (dstk && val != 64) {
+ /* mov dword ptr [ebp+off],dreg_lo */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],dreg_hi */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+ STACK_VAR(dst_hi));
+ }
+ *pprog = prog;
+}
+
+static inline void emit_ia32_to_be_r64(const u8 dst[], s32 val,
+ bool dstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+ switch (val) {
+ case 16:
+ /* Emit 'ror %ax, 8' to swap lower 2 bytes */
+ EMIT1(0x66);
+ EMIT3(0xC1, add_1reg(0xC8, dreg_lo), 8);
+
+ EMIT2(0x0F, 0xB7);
+ EMIT1(add_2reg(0xC0, dreg_lo, dreg_lo));
+
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+ break;
+ case 32:
+ /* Emit 'bswap eax' to swap lower 4 bytes */
+ EMIT1(0x0F);
+ EMIT1(add_1reg(0xC8, dreg_lo));
+
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+ break;
+ case 64:
+ /* Emit 'bswap eax' to swap lower 4 bytes */
+ EMIT1(0x0F);
+ EMIT1(add_1reg(0xC8, dreg_lo));
+
+ /* Emit 'bswap edx' to swap lower 4 bytes */
+ EMIT1(0x0F);
+ EMIT1(add_1reg(0xC8, dreg_hi));
+
+ /* mov ecx,dreg_hi */
+ EMIT2(0x89, add_2reg(0xC0, IA32_ECX, dreg_hi));
+ /* mov dreg_hi,dreg_lo */
+ EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
+ /* mov dreg_lo,ecx */
+ EMIT2(0x89, add_2reg(0xC0, dreg_lo, IA32_ECX));
+
+ break;
+ }
+ if (dstk) {
+ /* mov dword ptr [ebp+off],dreg_lo */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],dreg_hi */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+ STACK_VAR(dst_hi));
+ }
+ *pprog = prog;
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst (div|mod) src
+ */
+static inline void emit_ia32_div_mod_r(const u8 op, const u8 dst, const u8 src,
+ bool dstk, bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ if (sstk)
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+ STACK_VAR(src));
+ else if (src != IA32_ECX)
+ /* mov ecx,src */
+ EMIT2(0x8B, add_2reg(0xC0, src, IA32_ECX));
+
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst));
+ else
+ /* mov eax,dst */
+ EMIT2(0x8B, add_2reg(0xC0, dst, IA32_EAX));
+
+ /* xor edx,edx */
+ EMIT2(0x31, add_2reg(0xC0, IA32_EDX, IA32_EDX));
+ /* div ecx */
+ EMIT2(0xF7, add_1reg(0xF0, IA32_ECX));
+
+ if (op == BPF_MOD) {
+ if (dstk)
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst));
+ else
+ EMIT2(0x89, add_2reg(0xC0, dst, IA32_EDX));
+ } else {
+ if (dstk)
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst));
+ else
+ EMIT2(0x89, add_2reg(0xC0, dst, IA32_EAX));
+ }
+ *pprog = prog;
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst (shift) src
+ */
+static inline void emit_ia32_shift_r(const u8 op, const u8 dst, const u8 src,
+ bool dstk, bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg = dstk ? IA32_EAX : dst;
+ u8 b2;
+
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
+
+ if (sstk)
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src));
+ else if (src != IA32_ECX)
+ /* mov ecx,src */
+ EMIT2(0x8B, add_2reg(0xC0, src, IA32_ECX));
+
+ switch (op) {
+ case BPF_LSH:
+ b2 = 0xE0; break;
+ case BPF_RSH:
+ b2 = 0xE8; break;
+ case BPF_ARSH:
+ b2 = 0xF8; break;
+ default:
+ return;
+ }
+ EMIT2(0xD3, add_1reg(b2, dreg));
+
+ if (dstk)
+ /* mov dword ptr [ebp+off],dreg */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg), STACK_VAR(dst));
+ *pprog = prog;
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst (op) src
+ */
+static inline void emit_ia32_alu_r(const bool is64, const bool hi, const u8 op,
+ const u8 dst, const u8 src, bool dstk,
+ bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 sreg = sstk ? IA32_EAX : src;
+ u8 dreg = dstk ? IA32_EDX : dst;
+
+ if (sstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(src));
+
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(dst));
+
+ switch (BPF_OP(op)) {
+ /* dst = dst + src */
+ case BPF_ADD:
+ if (hi && is64)
+ EMIT2(0x11, add_2reg(0xC0, dreg, sreg));
+ else
+ EMIT2(0x01, add_2reg(0xC0, dreg, sreg));
+ break;
+ /* dst = dst - src */
+ case BPF_SUB:
+ if (hi && is64)
+ EMIT2(0x19, add_2reg(0xC0, dreg, sreg));
+ else
+ EMIT2(0x29, add_2reg(0xC0, dreg, sreg));
+ break;
+ /* dst = dst | src */
+ case BPF_OR:
+ EMIT2(0x09, add_2reg(0xC0, dreg, sreg));
+ break;
+ /* dst = dst & src */
+ case BPF_AND:
+ EMIT2(0x21, add_2reg(0xC0, dreg, sreg));
+ break;
+ /* dst = dst ^ src */
+ case BPF_XOR:
+ EMIT2(0x31, add_2reg(0xC0, dreg, sreg));
+ break;
+ }
+
+ if (dstk)
+ /* mov dword ptr [ebp+off],dreg */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg),
+ STACK_VAR(dst));
+ *pprog = prog;
+}
+
+/* ALU operation (64 bit) */
+static inline void emit_ia32_alu_r64(const bool is64, const u8 op,
+ const u8 dst[], const u8 src[],
+ bool dstk, bool sstk,
+ u8 **pprog)
+{
+ u8 *prog = *pprog;
+
+ emit_ia32_alu_r(is64, false, op, dst_lo, src_lo, dstk, sstk, &prog);
+ if (is64)
+ emit_ia32_alu_r(is64, true, op, dst_hi, src_hi, dstk, sstk,
+ &prog);
+ else
+ emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+ *pprog = prog;
+}
+
+/*
+ * ALU operation (32 bit)
+ * dst = dst (op) val
+ */
+static inline void emit_ia32_alu_i(const bool is64, const bool hi, const u8 op,
+ const u8 dst, const s32 val, bool dstk,
+ u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg = dstk ? IA32_EAX : dst;
+ u8 sreg = IA32_EDX;
+
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
+
+ if (!is_imm8(val))
+ /* mov edx,imm32*/
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EDX), val);
+
+ switch (op) {
+ /* dst = dst + val */
+ case BPF_ADD:
+ if (hi && is64) {
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xD0, dreg), val);
+ else
+ EMIT2(0x11, add_2reg(0xC0, dreg, sreg));
+ } else {
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xC0, dreg), val);
+ else
+ EMIT2(0x01, add_2reg(0xC0, dreg, sreg));
+ }
+ break;
+ /* dst = dst - val */
+ case BPF_SUB:
+ if (hi && is64) {
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xD8, dreg), val);
+ else
+ EMIT2(0x19, add_2reg(0xC0, dreg, sreg));
+ } else {
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xE8, dreg), val);
+ else
+ EMIT2(0x29, add_2reg(0xC0, dreg, sreg));
+ }
+ break;
+ /* dst = dst | val */
+ case BPF_OR:
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xC8, dreg), val);
+ else
+ EMIT2(0x09, add_2reg(0xC0, dreg, sreg));
+ break;
+ /* dst = dst & val */
+ case BPF_AND:
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xE0, dreg), val);
+ else
+ EMIT2(0x21, add_2reg(0xC0, dreg, sreg));
+ break;
+ /* dst = dst ^ val */
+ case BPF_XOR:
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xF0, dreg), val);
+ else
+ EMIT2(0x31, add_2reg(0xC0, dreg, sreg));
+ break;
+ case BPF_NEG:
+ EMIT2(0xF7, add_1reg(0xD8, dreg));
+ break;
+ }
+
+ if (dstk)
+ /* mov dword ptr [ebp+off],dreg */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg),
+ STACK_VAR(dst));
+ *pprog = prog;
+}
+
+/* ALU operation (64 bit) */
+static inline void emit_ia32_alu_i64(const bool is64, const u8 op,
+ const u8 dst[], const u32 val,
+ bool dstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ u32 hi = 0;
+
+ if (is64 && (val & (1<<31)))
+ hi = (u32)~0;
+
+ emit_ia32_alu_i(is64, false, op, dst_lo, val, dstk, &prog);
+ if (is64)
+ emit_ia32_alu_i(is64, true, op, dst_hi, hi, dstk, &prog);
+ else
+ emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+
+ *pprog = prog;
+}
+
+/* dst = ~dst (64 bit) */
+static inline void emit_ia32_neg64(const u8 dst[], bool dstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+
+ /* neg dreg_lo */
+ EMIT2(0xF7, add_1reg(0xD8, dreg_lo));
+ /* adc dreg_hi,0x0 */
+ EMIT3(0x83, add_1reg(0xD0, dreg_hi), 0x00);
+ /* neg dreg_hi */
+ EMIT2(0xF7, add_1reg(0xD8, dreg_hi));
+
+ if (dstk) {
+ /* mov dword ptr [ebp+off],dreg_lo */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],dreg_hi */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+ STACK_VAR(dst_hi));
+ }
+ *pprog = prog;
+}
+
+/* dst = dst << src */
+static inline void emit_ia32_lsh_r64(const u8 dst[], const u8 src[],
+ bool dstk, bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+
+ if (sstk)
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+ STACK_VAR(src_lo));
+ else
+ /* mov ecx,src_lo */
+ EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
+
+ /* shld dreg_hi,dreg_lo,cl */
+ EMIT3(0x0F, 0xA5, add_2reg(0xC0, dreg_hi, dreg_lo));
+ /* shl dreg_lo,cl */
+ EMIT2(0xD3, add_1reg(0xE0, dreg_lo));
+
+ /* if ecx >= 32, mov dreg_lo into dreg_hi and clear dreg_lo */
+
+ /* cmp ecx,32 */
+ EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+ /* skip the next two instructions (4 bytes) when < 32 */
+ EMIT2(IA32_JB, 4);
+
+ /* mov dreg_hi,dreg_lo */
+ EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
+ /* xor dreg_lo,dreg_lo */
+ EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+
+ if (dstk) {
+ /* mov dword ptr [ebp+off],dreg_lo */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],dreg_hi */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+ STACK_VAR(dst_hi));
+ }
+ /* out: */
+ *pprog = prog;
+}
+
+/* dst = dst >> src (signed)*/
+static inline void emit_ia32_arsh_r64(const u8 dst[], const u8 src[],
+ bool dstk, bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+
+ if (sstk)
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+ STACK_VAR(src_lo));
+ else
+ /* mov ecx,src_lo */
+ EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
+
+ /* shrd dreg_lo,dreg_hi,cl */
+ EMIT3(0x0F, 0xAD, add_2reg(0xC0, dreg_lo, dreg_hi));
+ /* sar dreg_hi,cl */
+ EMIT2(0xD3, add_1reg(0xF8, dreg_hi));
+
+ /* if ecx >= 32, mov dreg_hi to dreg_lo and set/clear dreg_hi depending on sign */
+
+ /* cmp ecx,32 */
+ EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+ /* skip the next two instructions (5 bytes) when < 32 */
+ EMIT2(IA32_JB, 5);
+
+ /* mov dreg_lo,dreg_hi */
+ EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+ /* sar dreg_hi,31 */
+ EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+
+ if (dstk) {
+ /* mov dword ptr [ebp+off],dreg_lo */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],dreg_hi */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+ STACK_VAR(dst_hi));
+ }
+ /* out: */
+ *pprog = prog;
+}
+
+/* dst = dst >> src */
+static inline void emit_ia32_rsh_r64(const u8 dst[], const u8 src[], bool dstk,
+ bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+
+ if (sstk)
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+ STACK_VAR(src_lo));
+ else
+ /* mov ecx,src_lo */
+ EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
+
+ /* shrd dreg_lo,dreg_hi,cl */
+ EMIT3(0x0F, 0xAD, add_2reg(0xC0, dreg_lo, dreg_hi));
+ /* shr dreg_hi,cl */
+ EMIT2(0xD3, add_1reg(0xE8, dreg_hi));
+
+ /* if ecx >= 32, mov dreg_hi to dreg_lo and clear dreg_hi */
+
+ /* cmp ecx,32 */
+ EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+ /* skip the next two instructions (4 bytes) when < 32 */
+ EMIT2(IA32_JB, 4);
+
+ /* mov dreg_lo,dreg_hi */
+ EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+
+ if (dstk) {
+ /* mov dword ptr [ebp+off],dreg_lo */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],dreg_hi */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+ STACK_VAR(dst_hi));
+ }
+ /* out: */
+ *pprog = prog;
+}
+
+/* dst = dst << val */
+static inline void emit_ia32_lsh_i64(const u8 dst[], const u32 val,
+ bool dstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+ /* Do LSH operation */
+ if (val < 32) {
+ /* shld dreg_hi,dreg_lo,imm8 */
+ EMIT4(0x0F, 0xA4, add_2reg(0xC0, dreg_hi, dreg_lo), val);
+ /* shl dreg_lo,imm8 */
+ EMIT3(0xC1, add_1reg(0xE0, dreg_lo), val);
+ } else if (val >= 32 && val < 64) {
+ u32 value = val - 32;
+
+ /* shl dreg_lo,imm8 */
+ EMIT3(0xC1, add_1reg(0xE0, dreg_lo), value);
+ /* mov dreg_hi,dreg_lo */
+ EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
+ /* xor dreg_lo,dreg_lo */
+ EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+ } else {
+ /* xor dreg_lo,dreg_lo */
+ EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+ }
+
+ if (dstk) {
+ /* mov dword ptr [ebp+off],dreg_lo */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],dreg_hi */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+ STACK_VAR(dst_hi));
+ }
+ *pprog = prog;
+}
+
+/* dst = dst >> val */
+static inline void emit_ia32_rsh_i64(const u8 dst[], const u32 val,
+ bool dstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+
+ /* Do RSH operation */
+ if (val < 32) {
+ /* shrd dreg_lo,dreg_hi,imm8 */
+ EMIT4(0x0F, 0xAC, add_2reg(0xC0, dreg_lo, dreg_hi), val);
+ /* shr dreg_hi,imm8 */
+ EMIT3(0xC1, add_1reg(0xE8, dreg_hi), val);
+ } else if (val >= 32 && val < 64) {
+ u32 value = val - 32;
+
+ /* shr dreg_hi,imm8 */
+ EMIT3(0xC1, add_1reg(0xE8, dreg_hi), value);
+ /* mov dreg_lo,dreg_hi */
+ EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+ } else {
+ /* xor dreg_lo,dreg_lo */
+ EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+ }
+
+ if (dstk) {
+ /* mov dword ptr [ebp+off],dreg_lo */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],dreg_hi */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+ STACK_VAR(dst_hi));
+ }
+ *pprog = prog;
+}
+
+/* dst = dst >> val (signed) */
+static inline void emit_ia32_arsh_i64(const u8 dst[], const u32 val,
+ bool dstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+ /* Do RSH operation */
+ if (val < 32) {
+ /* shrd dreg_lo,dreg_hi,imm8 */
+ EMIT4(0x0F, 0xAC, add_2reg(0xC0, dreg_lo, dreg_hi), val);
+ /* ashr dreg_hi,imm8 */
+ EMIT3(0xC1, add_1reg(0xF8, dreg_hi), val);
+ } else if (val >= 32 && val < 64) {
+ u32 value = val - 32;
+
+ /* ashr dreg_hi,imm8 */
+ EMIT3(0xC1, add_1reg(0xF8, dreg_hi), value);
+ /* mov dreg_lo,dreg_hi */
+ EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+
+ /* ashr dreg_hi,imm8 */
+ EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+ } else {
+ /* ashr dreg_hi,imm8 */
+ EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+ /* mov dreg_lo,dreg_hi */
+ EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+ }
+
+ if (dstk) {
+ /* mov dword ptr [ebp+off],dreg_lo */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],dreg_hi */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+ STACK_VAR(dst_hi));
+ }
+ *pprog = prog;
+}
+
+static inline void emit_ia32_mul_r64(const u8 dst[], const u8 src[], bool dstk,
+ bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_hi));
+ else
+ /* mov eax,dst_hi */
+ EMIT2(0x8B, add_2reg(0xC0, dst_hi, IA32_EAX));
+
+ if (sstk)
+ /* mul dword ptr [ebp+off] */
+ EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_lo));
+ else
+ /* mul src_lo */
+ EMIT2(0xF7, add_1reg(0xE0, src_lo));
+
+ /* mov ecx,eax */
+ EMIT2(0x89, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ else
+ /* mov eax,dst_lo */
+ EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+ if (sstk)
+ /* mul dword ptr [ebp+off] */
+ EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_hi));
+ else
+ /* mul src_hi */
+ EMIT2(0xF7, add_1reg(0xE0, src_hi));
+
+ /* add eax,eax */
+ EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ else
+ /* mov eax,dst_lo */
+ EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+ if (sstk)
+ /* mul dword ptr [ebp+off] */
+ EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_lo));
+ else
+ /* mul src_lo */
+ EMIT2(0xF7, add_1reg(0xE0, src_lo));
+
+ /* add ecx,edx */
+ EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EDX));
+
+ if (dstk) {
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],ecx */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX),
+ STACK_VAR(dst_hi));
+ } else {
+ /* mov dst_lo,eax */
+ EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EAX));
+ /* mov dst_hi,ecx */
+ EMIT2(0x89, add_2reg(0xC0, dst_hi, IA32_ECX));
+ }
+
+ *pprog = prog;
+}
+
+static inline void emit_ia32_mul_i64(const u8 dst[], const u32 val,
+ bool dstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u32 hi;
+
+ hi = val & (1<<31) ? (u32)~0 : 0;
+ /* movl eax,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), val);
+ if (dstk)
+ /* mul dword ptr [ebp+off] */
+ EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_hi));
+ else
+ /* mul dst_hi */
+ EMIT2(0xF7, add_1reg(0xE0, dst_hi));
+
+ /* mov ecx,eax */
+ EMIT2(0x89, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+ /* movl eax,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), hi);
+ if (dstk)
+ /* mul dword ptr [ebp+off] */
+ EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_lo));
+ else
+ /* mul dst_lo */
+ EMIT2(0xF7, add_1reg(0xE0, dst_lo));
+ /* add ecx,eax */
+ EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+ /* movl eax,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), val);
+ if (dstk)
+ /* mul dword ptr [ebp+off] */
+ EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_lo));
+ else
+ /* mul dst_lo */
+ EMIT2(0xF7, add_1reg(0xE0, dst_lo));
+
+ /* add ecx,edx */
+ EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EDX));
+
+ if (dstk) {
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],ecx */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX),
+ STACK_VAR(dst_hi));
+ } else {
+ /* mov dword ptr [ebp+off],eax */
+ EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EAX));
+ /* mov dword ptr [ebp+off],ecx */
+ EMIT2(0x89, add_2reg(0xC0, dst_hi, IA32_ECX));
+ }
+
+ *pprog = prog;
+}
+
+static int bpf_size_to_x86_bytes(int bpf_size)
+{
+ if (bpf_size == BPF_W)
+ return 4;
+ else if (bpf_size == BPF_H)
+ return 2;
+ else if (bpf_size == BPF_B)
+ return 1;
+ else if (bpf_size == BPF_DW)
+ return 4; /* imm32 */
+ else
+ return 0;
+}
+
+struct jit_context {
+ int cleanup_addr; /* Epilogue code offset */
+};
+
+/* Maximum number of bytes emitted while JITing one eBPF insn */
+#define BPF_MAX_INSN_SIZE 128
+#define BPF_INSN_SAFETY 64
+
+#define PROLOGUE_SIZE 35
+
+/*
+ * Emit prologue code for BPF program and check it's size.
+ * bpf_tail_call helper will skip it while jumping into another program.
+ */
+static void emit_prologue(u8 **pprog, u32 stack_depth)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ const u8 *r1 = bpf2ia32[BPF_REG_1];
+ const u8 fplo = bpf2ia32[BPF_REG_FP][0];
+ const u8 fphi = bpf2ia32[BPF_REG_FP][1];
+ const u8 *tcc = bpf2ia32[TCALL_CNT];
+
+ /* push ebp */
+ EMIT1(0x55);
+ /* mov ebp,esp */
+ EMIT2(0x89, 0xE5);
+ /* push edi */
+ EMIT1(0x57);
+ /* push esi */
+ EMIT1(0x56);
+ /* push ebx */
+ EMIT1(0x53);
+
+ /* sub esp,STACK_SIZE */
+ EMIT2_off32(0x81, 0xEC, STACK_SIZE);
+ /* sub ebp,SCRATCH_SIZE+12*/
+ EMIT3(0x83, add_1reg(0xE8, IA32_EBP), SCRATCH_SIZE + 12);
+ /* xor ebx,ebx */
+ EMIT2(0x31, add_2reg(0xC0, IA32_EBX, IA32_EBX));
+
+ /* Set up BPF prog stack base register */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBP), STACK_VAR(fplo));
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(fphi));
+
+ /* Move BPF_CTX (EAX) to BPF_REG_R1 */
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r1[0]));
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(r1[1]));
+
+ /* Initialize Tail Count */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[0]));
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1]));
+
+ BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
+ *pprog = prog;
+}
+
+/* Emit epilogue code for BPF program */
+static void emit_epilogue(u8 **pprog, u32 stack_depth)
+{
+ u8 *prog = *pprog;
+ const u8 *r0 = bpf2ia32[BPF_REG_0];
+ int cnt = 0;
+
+ /* mov eax,dword ptr [ebp+off]*/
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r0[0]));
+ /* mov edx,dword ptr [ebp+off]*/
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r0[1]));
+
+ /* add ebp,SCRATCH_SIZE+12*/
+ EMIT3(0x83, add_1reg(0xC0, IA32_EBP), SCRATCH_SIZE + 12);
+
+ /* mov ebx,dword ptr [ebp-12]*/
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), -12);
+ /* mov esi,dword ptr [ebp-8]*/
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ESI), -8);
+ /* mov edi,dword ptr [ebp-4]*/
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDI), -4);
+
+ EMIT1(0xC9); /* leave */
+ EMIT1(0xC3); /* ret */
+ *pprog = prog;
+}
+
+/*
+ * Generate the following code:
+ * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
+ * if (index >= array->map.max_entries)
+ * goto out;
+ * if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * goto out;
+ * prog = array->ptrs[index];
+ * if (prog == NULL)
+ * goto out;
+ * goto *(prog->bpf_func + prologue_size);
+ * out:
+ */
+static void emit_bpf_tail_call(u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ const u8 *r1 = bpf2ia32[BPF_REG_1];
+ const u8 *r2 = bpf2ia32[BPF_REG_2];
+ const u8 *r3 = bpf2ia32[BPF_REG_3];
+ const u8 *tcc = bpf2ia32[TCALL_CNT];
+ u32 lo, hi;
+ static int jmp_label1 = -1;
+
+ /*
+ * if (index >= array->map.max_entries)
+ * goto out;
+ */
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r2[0]));
+ /* mov edx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r3[0]));
+
+ /* cmp dword ptr [eax+off],edx */
+ EMIT3(0x39, add_2reg(0x40, IA32_EAX, IA32_EDX),
+ offsetof(struct bpf_array, map.max_entries));
+ /* jbe out */
+ EMIT2(IA32_JBE, jmp_label(jmp_label1, 2));
+
+ /*
+ * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * goto out;
+ */
+ lo = (u32)MAX_TAIL_CALL_CNT;
+ hi = (u32)((u64)MAX_TAIL_CALL_CNT >> 32);
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(tcc[0]));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1]));
+
+ /* cmp edx,hi */
+ EMIT3(0x83, add_1reg(0xF8, IA32_EBX), hi);
+ EMIT2(IA32_JNE, 3);
+ /* cmp ecx,lo */
+ EMIT3(0x83, add_1reg(0xF8, IA32_ECX), lo);
+
+ /* ja out */
+ EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
+
+ /* add eax,0x1 */
+ EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 0x01);
+ /* adc ebx,0x0 */
+ EMIT3(0x83, add_1reg(0xD0, IA32_EBX), 0x00);
+
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(tcc[0]));
+ /* mov dword ptr [ebp+off],edx */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1]));
+
+ /* prog = array->ptrs[index]; */
+ /* mov edx, [eax + edx * 4 + offsetof(...)] */
+ EMIT3_off32(0x8B, 0x94, 0x90, offsetof(struct bpf_array, ptrs));
+
+ /*
+ * if (prog == NULL)
+ * goto out;
+ */
+ /* test edx,edx */
+ EMIT2(0x85, add_2reg(0xC0, IA32_EDX, IA32_EDX));
+ /* je out */
+ EMIT2(IA32_JE, jmp_label(jmp_label1, 2));
+
+ /* goto *(prog->bpf_func + prologue_size); */
+ /* mov edx, dword ptr [edx + 32] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EDX, IA32_EDX),
+ offsetof(struct bpf_prog, bpf_func));
+ /* add edx,prologue_size */
+ EMIT3(0x83, add_1reg(0xC0, IA32_EDX), PROLOGUE_SIZE);
+
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r1[0]));
+
+ /*
+ * Now we're ready to jump into next BPF program:
+ * eax == ctx (1st arg)
+ * edx == prog->bpf_func + prologue_size
+ */
+ RETPOLINE_EDX_BPF_JIT();
+
+ if (jmp_label1 == -1)
+ jmp_label1 = cnt;
+
+ /* out: */
+ *pprog = prog;
+}
+
+/* Push the scratch stack register on top of the stack. */
+static inline void emit_push_r64(const u8 src[], u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_hi));
+ /* push ecx */
+ EMIT1(0x51);
+
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_lo));
+ /* push ecx */
+ EMIT1(0x51);
+
+ *pprog = prog;
+}
+
+static u8 get_cond_jmp_opcode(const u8 op, bool is_cmp_lo)
+{
+ u8 jmp_cond;
+
+ /* Convert BPF opcode to x86 */
+ switch (op) {
+ case BPF_JEQ:
+ jmp_cond = IA32_JE;
+ break;
+ case BPF_JSET:
+ case BPF_JNE:
+ jmp_cond = IA32_JNE;
+ break;
+ case BPF_JGT:
+ /* GT is unsigned '>', JA in x86 */
+ jmp_cond = IA32_JA;
+ break;
+ case BPF_JLT:
+ /* LT is unsigned '<', JB in x86 */
+ jmp_cond = IA32_JB;
+ break;
+ case BPF_JGE:
+ /* GE is unsigned '>=', JAE in x86 */
+ jmp_cond = IA32_JAE;
+ break;
+ case BPF_JLE:
+ /* LE is unsigned '<=', JBE in x86 */
+ jmp_cond = IA32_JBE;
+ break;
+ case BPF_JSGT:
+ if (!is_cmp_lo)
+ /* Signed '>', GT in x86 */
+ jmp_cond = IA32_JG;
+ else
+ /* GT is unsigned '>', JA in x86 */
+ jmp_cond = IA32_JA;
+ break;
+ case BPF_JSLT:
+ if (!is_cmp_lo)
+ /* Signed '<', LT in x86 */
+ jmp_cond = IA32_JL;
+ else
+ /* LT is unsigned '<', JB in x86 */
+ jmp_cond = IA32_JB;
+ break;
+ case BPF_JSGE:
+ if (!is_cmp_lo)
+ /* Signed '>=', GE in x86 */
+ jmp_cond = IA32_JGE;
+ else
+ /* GE is unsigned '>=', JAE in x86 */
+ jmp_cond = IA32_JAE;
+ break;
+ case BPF_JSLE:
+ if (!is_cmp_lo)
+ /* Signed '<=', LE in x86 */
+ jmp_cond = IA32_JLE;
+ else
+ /* LE is unsigned '<=', JBE in x86 */
+ jmp_cond = IA32_JBE;
+ break;
+ default: /* to silence GCC warning */
+ jmp_cond = COND_JMP_OPCODE_INVALID;
+ break;
+ }
+
+ return jmp_cond;
+}
+
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+ int oldproglen, struct jit_context *ctx)
+{
+ struct bpf_insn *insn = bpf_prog->insnsi;
+ int insn_cnt = bpf_prog->len;
+ bool seen_exit = false;
+ u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+ int i, cnt = 0;
+ int proglen = 0;
+ u8 *prog = temp;
+
+ emit_prologue(&prog, bpf_prog->aux->stack_depth);
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ const s32 imm32 = insn->imm;
+ const bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+ const bool dstk = insn->dst_reg == BPF_REG_AX ? false : true;
+ const bool sstk = insn->src_reg == BPF_REG_AX ? false : true;
+ const u8 code = insn->code;
+ const u8 *dst = bpf2ia32[insn->dst_reg];
+ const u8 *src = bpf2ia32[insn->src_reg];
+ const u8 *r0 = bpf2ia32[BPF_REG_0];
+ s64 jmp_offset;
+ u8 jmp_cond;
+ int ilen;
+ u8 *func;
+
+ switch (code) {
+ /* ALU operations */
+ /* dst = src */
+ case BPF_ALU | BPF_MOV | BPF_K:
+ case BPF_ALU | BPF_MOV | BPF_X:
+ case BPF_ALU64 | BPF_MOV | BPF_K:
+ case BPF_ALU64 | BPF_MOV | BPF_X:
+ switch (BPF_SRC(code)) {
+ case BPF_X:
+ emit_ia32_mov_r64(is64, dst, src, dstk,
+ sstk, &prog);
+ break;
+ case BPF_K:
+ /* Sign-extend immediate value to dst reg */
+ emit_ia32_mov_i64(is64, dst, imm32,
+ dstk, &prog);
+ break;
+ }
+ break;
+ /* dst = dst + src/imm */
+ /* dst = dst - src/imm */
+ /* dst = dst | src/imm */
+ /* dst = dst & src/imm */
+ /* dst = dst ^ src/imm */
+ /* dst = dst * src/imm */
+ /* dst = dst << src */
+ /* dst = dst >> src */
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_ADD | BPF_X:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_X:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_X:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_X:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_X:
+ case BPF_ALU64 | BPF_ADD | BPF_K:
+ case BPF_ALU64 | BPF_ADD | BPF_X:
+ case BPF_ALU64 | BPF_SUB | BPF_K:
+ case BPF_ALU64 | BPF_SUB | BPF_X:
+ case BPF_ALU64 | BPF_OR | BPF_K:
+ case BPF_ALU64 | BPF_OR | BPF_X:
+ case BPF_ALU64 | BPF_AND | BPF_K:
+ case BPF_ALU64 | BPF_AND | BPF_X:
+ case BPF_ALU64 | BPF_XOR | BPF_K:
+ case BPF_ALU64 | BPF_XOR | BPF_X:
+ switch (BPF_SRC(code)) {
+ case BPF_X:
+ emit_ia32_alu_r64(is64, BPF_OP(code), dst,
+ src, dstk, sstk, &prog);
+ break;
+ case BPF_K:
+ emit_ia32_alu_i64(is64, BPF_OP(code), dst,
+ imm32, dstk, &prog);
+ break;
+ }
+ break;
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_X:
+ switch (BPF_SRC(code)) {
+ case BPF_X:
+ emit_ia32_mul_r(dst_lo, src_lo, dstk,
+ sstk, &prog);
+ break;
+ case BPF_K:
+ /* mov ecx,imm32*/
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
+ imm32);
+ emit_ia32_mul_r(dst_lo, IA32_ECX, dstk,
+ false, &prog);
+ break;
+ }
+ emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+ break;
+ case BPF_ALU | BPF_LSH | BPF_X:
+ case BPF_ALU | BPF_RSH | BPF_X:
+ case BPF_ALU | BPF_ARSH | BPF_K:
+ case BPF_ALU | BPF_ARSH | BPF_X:
+ switch (BPF_SRC(code)) {
+ case BPF_X:
+ emit_ia32_shift_r(BPF_OP(code), dst_lo, src_lo,
+ dstk, sstk, &prog);
+ break;
+ case BPF_K:
+ /* mov ecx,imm32*/
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
+ imm32);
+ emit_ia32_shift_r(BPF_OP(code), dst_lo,
+ IA32_ECX, dstk, false,
+ &prog);
+ break;
+ }
+ emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+ break;
+ /* dst = dst / src(imm) */
+ /* dst = dst % src(imm) */
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_X:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_X:
+ switch (BPF_SRC(code)) {
+ case BPF_X:
+ emit_ia32_div_mod_r(BPF_OP(code), dst_lo,
+ src_lo, dstk, sstk, &prog);
+ break;
+ case BPF_K:
+ /* mov ecx,imm32*/
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
+ imm32);
+ emit_ia32_div_mod_r(BPF_OP(code), dst_lo,
+ IA32_ECX, dstk, false,
+ &prog);
+ break;
+ }
+ emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+ break;
+ case BPF_ALU64 | BPF_DIV | BPF_K:
+ case BPF_ALU64 | BPF_DIV | BPF_X:
+ case BPF_ALU64 | BPF_MOD | BPF_K:
+ case BPF_ALU64 | BPF_MOD | BPF_X:
+ goto notyet;
+ /* dst = dst >> imm */
+ /* dst = dst << imm */
+ case BPF_ALU | BPF_RSH | BPF_K:
+ case BPF_ALU | BPF_LSH | BPF_K:
+ if (unlikely(imm32 > 31))
+ return -EINVAL;
+ /* mov ecx,imm32*/
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
+ emit_ia32_shift_r(BPF_OP(code), dst_lo, IA32_ECX, dstk,
+ false, &prog);
+ emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+ break;
+ /* dst = dst << imm */
+ case BPF_ALU64 | BPF_LSH | BPF_K:
+ if (unlikely(imm32 > 63))
+ return -EINVAL;
+ emit_ia32_lsh_i64(dst, imm32, dstk, &prog);
+ break;
+ /* dst = dst >> imm */
+ case BPF_ALU64 | BPF_RSH | BPF_K:
+ if (unlikely(imm32 > 63))
+ return -EINVAL;
+ emit_ia32_rsh_i64(dst, imm32, dstk, &prog);
+ break;
+ /* dst = dst << src */
+ case BPF_ALU64 | BPF_LSH | BPF_X:
+ emit_ia32_lsh_r64(dst, src, dstk, sstk, &prog);
+ break;
+ /* dst = dst >> src */
+ case BPF_ALU64 | BPF_RSH | BPF_X:
+ emit_ia32_rsh_r64(dst, src, dstk, sstk, &prog);
+ break;
+ /* dst = dst >> src (signed) */
+ case BPF_ALU64 | BPF_ARSH | BPF_X:
+ emit_ia32_arsh_r64(dst, src, dstk, sstk, &prog);
+ break;
+ /* dst = dst >> imm (signed) */
+ case BPF_ALU64 | BPF_ARSH | BPF_K:
+ if (unlikely(imm32 > 63))
+ return -EINVAL;
+ emit_ia32_arsh_i64(dst, imm32, dstk, &prog);
+ break;
+ /* dst = ~dst */
+ case BPF_ALU | BPF_NEG:
+ emit_ia32_alu_i(is64, false, BPF_OP(code),
+ dst_lo, 0, dstk, &prog);
+ emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+ break;
+ /* dst = ~dst (64 bit) */
+ case BPF_ALU64 | BPF_NEG:
+ emit_ia32_neg64(dst, dstk, &prog);
+ break;
+ /* dst = dst * src/imm */
+ case BPF_ALU64 | BPF_MUL | BPF_X:
+ case BPF_ALU64 | BPF_MUL | BPF_K:
+ switch (BPF_SRC(code)) {
+ case BPF_X:
+ emit_ia32_mul_r64(dst, src, dstk, sstk, &prog);
+ break;
+ case BPF_K:
+ emit_ia32_mul_i64(dst, imm32, dstk, &prog);
+ break;
+ }
+ break;
+ /* dst = htole(dst) */
+ case BPF_ALU | BPF_END | BPF_FROM_LE:
+ emit_ia32_to_le_r64(dst, imm32, dstk, &prog);
+ break;
+ /* dst = htobe(dst) */
+ case BPF_ALU | BPF_END | BPF_FROM_BE:
+ emit_ia32_to_be_r64(dst, imm32, dstk, &prog);
+ break;
+ /* dst = imm64 */
+ case BPF_LD | BPF_IMM | BPF_DW: {
+ s32 hi, lo = imm32;
+
+ hi = insn[1].imm;
+ emit_ia32_mov_i(dst_lo, lo, dstk, &prog);
+ emit_ia32_mov_i(dst_hi, hi, dstk, &prog);
+ insn++;
+ i++;
+ break;
+ }
+ /* speculation barrier */
+ case BPF_ST | BPF_NOSPEC:
+ if (boot_cpu_has(X86_FEATURE_XMM2))
+ /* Emit 'lfence' */
+ EMIT3(0x0F, 0xAE, 0xE8);
+ break;
+ /* ST: *(u8*)(dst_reg + off) = imm */
+ case BPF_ST | BPF_MEM | BPF_H:
+ case BPF_ST | BPF_MEM | BPF_B:
+ case BPF_ST | BPF_MEM | BPF_W:
+ case BPF_ST | BPF_MEM | BPF_DW:
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ else
+ /* mov eax,dst_lo */
+ EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+ switch (BPF_SIZE(code)) {
+ case BPF_B:
+ EMIT(0xC6, 1); break;
+ case BPF_H:
+ EMIT2(0x66, 0xC7); break;
+ case BPF_W:
+ case BPF_DW:
+ EMIT(0xC7, 1); break;
+ }
+
+ if (is_imm8(insn->off))
+ EMIT2(add_1reg(0x40, IA32_EAX), insn->off);
+ else
+ EMIT1_off32(add_1reg(0x80, IA32_EAX),
+ insn->off);
+ EMIT(imm32, bpf_size_to_x86_bytes(BPF_SIZE(code)));
+
+ if (BPF_SIZE(code) == BPF_DW) {
+ u32 hi;
+
+ hi = imm32 & (1<<31) ? (u32)~0 : 0;
+ EMIT2_off32(0xC7, add_1reg(0x80, IA32_EAX),
+ insn->off + 4);
+ EMIT(hi, 4);
+ }
+ break;
+
+ /* STX: *(u8*)(dst_reg + off) = src_reg */
+ case BPF_STX | BPF_MEM | BPF_B:
+ case BPF_STX | BPF_MEM | BPF_H:
+ case BPF_STX | BPF_MEM | BPF_W:
+ case BPF_STX | BPF_MEM | BPF_DW:
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ else
+ /* mov eax,dst_lo */
+ EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+ if (sstk)
+ /* mov edx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(src_lo));
+ else
+ /* mov edx,src_lo */
+ EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_EDX));
+
+ switch (BPF_SIZE(code)) {
+ case BPF_B:
+ EMIT(0x88, 1); break;
+ case BPF_H:
+ EMIT2(0x66, 0x89); break;
+ case BPF_W:
+ case BPF_DW:
+ EMIT(0x89, 1); break;
+ }
+
+ if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, IA32_EAX, IA32_EDX),
+ insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, IA32_EAX, IA32_EDX),
+ insn->off);
+
+ if (BPF_SIZE(code) == BPF_DW) {
+ if (sstk)
+ /* mov edi,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP,
+ IA32_EDX),
+ STACK_VAR(src_hi));
+ else
+ /* mov edi,src_hi */
+ EMIT2(0x8B, add_2reg(0xC0, src_hi,
+ IA32_EDX));
+ EMIT1(0x89);
+ if (is_imm8(insn->off + 4)) {
+ EMIT2(add_2reg(0x40, IA32_EAX,
+ IA32_EDX),
+ insn->off + 4);
+ } else {
+ EMIT1(add_2reg(0x80, IA32_EAX,
+ IA32_EDX));
+ EMIT(insn->off + 4, 4);
+ }
+ }
+ break;
+
+ /* LDX: dst_reg = *(u8*)(src_reg + off) */
+ case BPF_LDX | BPF_MEM | BPF_B:
+ case BPF_LDX | BPF_MEM | BPF_H:
+ case BPF_LDX | BPF_MEM | BPF_W:
+ case BPF_LDX | BPF_MEM | BPF_DW:
+ if (sstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(src_lo));
+ else
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_EAX));
+
+ switch (BPF_SIZE(code)) {
+ case BPF_B:
+ EMIT2(0x0F, 0xB6); break;
+ case BPF_H:
+ EMIT2(0x0F, 0xB7); break;
+ case BPF_W:
+ case BPF_DW:
+ EMIT(0x8B, 1); break;
+ }
+
+ if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, IA32_EAX, IA32_EDX),
+ insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, IA32_EAX, IA32_EDX),
+ insn->off);
+
+ if (dstk)
+ /* mov dword ptr [ebp+off],edx */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_lo));
+ else
+ /* mov dst_lo,edx */
+ EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EDX));
+ switch (BPF_SIZE(code)) {
+ case BPF_B:
+ case BPF_H:
+ case BPF_W:
+ if (dstk) {
+ EMIT3(0xC7, add_1reg(0x40, IA32_EBP),
+ STACK_VAR(dst_hi));
+ EMIT(0x0, 4);
+ } else {
+ /* xor dst_hi,dst_hi */
+ EMIT2(0x33,
+ add_2reg(0xC0, dst_hi, dst_hi));
+ }
+ break;
+ case BPF_DW:
+ EMIT2_off32(0x8B,
+ add_2reg(0x80, IA32_EAX, IA32_EDX),
+ insn->off + 4);
+ if (dstk)
+ EMIT3(0x89,
+ add_2reg(0x40, IA32_EBP,
+ IA32_EDX),
+ STACK_VAR(dst_hi));
+ else
+ EMIT2(0x89,
+ add_2reg(0xC0, dst_hi, IA32_EDX));
+ break;
+ default:
+ break;
+ }
+ break;
+ /* call */
+ case BPF_JMP | BPF_CALL:
+ {
+ const u8 *r1 = bpf2ia32[BPF_REG_1];
+ const u8 *r2 = bpf2ia32[BPF_REG_2];
+ const u8 *r3 = bpf2ia32[BPF_REG_3];
+ const u8 *r4 = bpf2ia32[BPF_REG_4];
+ const u8 *r5 = bpf2ia32[BPF_REG_5];
+
+ if (insn->src_reg == BPF_PSEUDO_CALL)
+ goto notyet;
+
+ func = (u8 *) __bpf_call_base + imm32;
+ jmp_offset = func - (image + addrs[i]);
+
+ if (!imm32 || !is_simm32(jmp_offset)) {
+ pr_err("unsupported BPF func %d addr %p image %p\n",
+ imm32, func, image);
+ return -EINVAL;
+ }
+
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(r1[0]));
+ /* mov edx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(r1[1]));
+
+ emit_push_r64(r5, &prog);
+ emit_push_r64(r4, &prog);
+ emit_push_r64(r3, &prog);
+ emit_push_r64(r2, &prog);
+
+ EMIT1_off32(0xE8, jmp_offset + 9);
+
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(r0[0]));
+ /* mov dword ptr [ebp+off],edx */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(r0[1]));
+
+ /* add esp,32 */
+ EMIT3(0x83, add_1reg(0xC0, IA32_ESP), 32);
+ break;
+ }
+ case BPF_JMP | BPF_TAIL_CALL:
+ emit_bpf_tail_call(&prog);
+ break;
+
+ /* cond jump */
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JNE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JLT | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JLE | BPF_X: {
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+ u8 sreg_lo = sstk ? IA32_ECX : src_lo;
+ u8 sreg_hi = sstk ? IA32_EBX : src_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+
+ if (sstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+ STACK_VAR(src_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX),
+ STACK_VAR(src_hi));
+ }
+
+ /* cmp dreg_hi,sreg_hi */
+ EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
+ EMIT2(IA32_JNE, 2);
+ /* cmp dreg_lo,sreg_lo */
+ EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo));
+ goto emit_cond_jmp;
+ }
+ case BPF_JMP | BPF_JSGT | BPF_X:
+ case BPF_JMP | BPF_JSLE | BPF_X:
+ case BPF_JMP | BPF_JSLT | BPF_X:
+ case BPF_JMP | BPF_JSGE | BPF_X: {
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+ u8 sreg_lo = sstk ? IA32_ECX : src_lo;
+ u8 sreg_hi = sstk ? IA32_EBX : src_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B,
+ add_2reg(0x40, IA32_EBP,
+ IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+
+ if (sstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+ STACK_VAR(src_lo));
+ EMIT3(0x8B,
+ add_2reg(0x40, IA32_EBP,
+ IA32_EBX),
+ STACK_VAR(src_hi));
+ }
+
+ /* cmp dreg_hi,sreg_hi */
+ EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
+ EMIT2(IA32_JNE, 10);
+ /* cmp dreg_lo,sreg_lo */
+ EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo));
+ goto emit_cond_jmp_signed;
+ }
+ case BPF_JMP | BPF_JSET | BPF_X: {
+ u8 dreg_lo = IA32_EAX;
+ u8 dreg_hi = IA32_EDX;
+ u8 sreg_lo = sstk ? IA32_ECX : src_lo;
+ u8 sreg_hi = sstk ? IA32_EBX : src_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ } else {
+ /* mov dreg_lo,dst_lo */
+ EMIT2(0x89, add_2reg(0xC0, dreg_lo, dst_lo));
+ /* mov dreg_hi,dst_hi */
+ EMIT2(0x89,
+ add_2reg(0xC0, dreg_hi, dst_hi));
+ }
+
+ if (sstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+ STACK_VAR(src_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX),
+ STACK_VAR(src_hi));
+ }
+ /* and dreg_lo,sreg_lo */
+ EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo));
+ /* and dreg_hi,sreg_hi */
+ EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi));
+ /* or dreg_lo,dreg_hi */
+ EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
+ goto emit_cond_jmp;
+ }
+ case BPF_JMP | BPF_JSET | BPF_K: {
+ u32 hi;
+ u8 dreg_lo = IA32_EAX;
+ u8 dreg_hi = IA32_EDX;
+ u8 sreg_lo = IA32_ECX;
+ u8 sreg_hi = IA32_EBX;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ } else {
+ /* mov dreg_lo,dst_lo */
+ EMIT2(0x89, add_2reg(0xC0, dreg_lo, dst_lo));
+ /* mov dreg_hi,dst_hi */
+ EMIT2(0x89,
+ add_2reg(0xC0, dreg_hi, dst_hi));
+ }
+ hi = imm32 & (1<<31) ? (u32)~0 : 0;
+
+ /* mov ecx,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
+ /* mov ebx,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi);
+
+ /* and dreg_lo,sreg_lo */
+ EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo));
+ /* and dreg_hi,sreg_hi */
+ EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi));
+ /* or dreg_lo,dreg_hi */
+ EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
+ goto emit_cond_jmp;
+ }
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JNE | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JLT | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JLE | BPF_K: {
+ u32 hi;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+ u8 sreg_lo = IA32_ECX;
+ u8 sreg_hi = IA32_EBX;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+
+ hi = imm32 & (1<<31) ? (u32)~0 : 0;
+ /* mov ecx,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
+ /* mov ebx,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi);
+
+ /* cmp dreg_hi,sreg_hi */
+ EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
+ EMIT2(IA32_JNE, 2);
+ /* cmp dreg_lo,sreg_lo */
+ EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo));
+
+emit_cond_jmp: jmp_cond = get_cond_jmp_opcode(BPF_OP(code), false);
+ if (jmp_cond == COND_JMP_OPCODE_INVALID)
+ return -EFAULT;
+ jmp_offset = addrs[i + insn->off] - addrs[i];
+ if (is_imm8(jmp_offset)) {
+ EMIT2(jmp_cond, jmp_offset);
+ } else if (is_simm32(jmp_offset)) {
+ EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
+ } else {
+ pr_err("cond_jmp gen bug %llx\n", jmp_offset);
+ return -EFAULT;
+ }
+ break;
+ }
+ case BPF_JMP | BPF_JSGT | BPF_K:
+ case BPF_JMP | BPF_JSLE | BPF_K:
+ case BPF_JMP | BPF_JSLT | BPF_K:
+ case BPF_JMP | BPF_JSGE | BPF_K: {
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+ u8 sreg_lo = IA32_ECX;
+ u8 sreg_hi = IA32_EBX;
+ u32 hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B,
+ add_2reg(0x40, IA32_EBP,
+ IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+
+ /* mov ecx,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
+ hi = imm32 & (1 << 31) ? (u32)~0 : 0;
+ /* mov ebx,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi);
+ /* cmp dreg_hi,sreg_hi */
+ EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
+ EMIT2(IA32_JNE, 10);
+ /* cmp dreg_lo,sreg_lo */
+ EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo));
+
+ /*
+ * For simplicity of branch offset computation,
+ * let's use fixed jump coding here.
+ */
+emit_cond_jmp_signed: /* Check the condition for low 32-bit comparison */
+ jmp_cond = get_cond_jmp_opcode(BPF_OP(code), true);
+ if (jmp_cond == COND_JMP_OPCODE_INVALID)
+ return -EFAULT;
+ jmp_offset = addrs[i + insn->off] - addrs[i] + 8;
+ if (is_simm32(jmp_offset)) {
+ EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
+ } else {
+ pr_err("cond_jmp gen bug %llx\n", jmp_offset);
+ return -EFAULT;
+ }
+ EMIT2(0xEB, 6);
+
+ /* Check the condition for high 32-bit comparison */
+ jmp_cond = get_cond_jmp_opcode(BPF_OP(code), false);
+ if (jmp_cond == COND_JMP_OPCODE_INVALID)
+ return -EFAULT;
+ jmp_offset = addrs[i + insn->off] - addrs[i];
+ if (is_simm32(jmp_offset)) {
+ EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
+ } else {
+ pr_err("cond_jmp gen bug %llx\n", jmp_offset);
+ return -EFAULT;
+ }
+ break;
+ }
+ case BPF_JMP | BPF_JA:
+ if (insn->off == -1)
+ /* -1 jmp instructions will always jump
+ * backwards two bytes. Explicitly handling
+ * this case avoids wasting too many passes
+ * when there are long sequences of replaced
+ * dead code.
+ */
+ jmp_offset = -2;
+ else
+ jmp_offset = addrs[i + insn->off] - addrs[i];
+
+ if (!jmp_offset)
+ /* Optimize out nop jumps */
+ break;
+emit_jmp:
+ if (is_imm8(jmp_offset)) {
+ EMIT2(0xEB, jmp_offset);
+ } else if (is_simm32(jmp_offset)) {
+ EMIT1_off32(0xE9, jmp_offset);
+ } else {
+ pr_err("jmp gen bug %llx\n", jmp_offset);
+ return -EFAULT;
+ }
+ break;
+ /* STX XADD: lock *(u32 *)(dst + off) += src */
+ case BPF_STX | BPF_XADD | BPF_W:
+ /* STX XADD: lock *(u64 *)(dst + off) += src */
+ case BPF_STX | BPF_XADD | BPF_DW:
+ goto notyet;
+ case BPF_JMP | BPF_EXIT:
+ if (seen_exit) {
+ jmp_offset = ctx->cleanup_addr - addrs[i];
+ goto emit_jmp;
+ }
+ seen_exit = true;
+ /* Update cleanup_addr */
+ ctx->cleanup_addr = proglen;
+ emit_epilogue(&prog, bpf_prog->aux->stack_depth);
+ break;
+notyet:
+ pr_info_once("*** NOT YET: opcode %02x ***\n", code);
+ return -EFAULT;
+ default:
+ /*
+ * This error will be seen if new instruction was added
+ * to interpreter, but not to JIT or if there is junk in
+ * bpf_prog
+ */
+ pr_err("bpf_jit: unknown opcode %02x\n", code);
+ return -EINVAL;
+ }
+
+ ilen = prog - temp;
+ if (ilen > BPF_MAX_INSN_SIZE) {
+ pr_err("bpf_jit: fatal insn size error\n");
+ return -EFAULT;
+ }
+
+ if (image) {
+ /*
+ * When populating the image, assert that:
+ *
+ * i) We do not write beyond the allocated space, and
+ * ii) addrs[i] did not change from the prior run, in order
+ * to validate assumptions made for computing branch
+ * displacements.
+ */
+ if (unlikely(proglen + ilen > oldproglen ||
+ proglen + ilen != addrs[i])) {
+ pr_err("bpf_jit: fatal error\n");
+ return -EFAULT;
+ }
+ memcpy(image + proglen, temp, ilen);
+ }
+ proglen += ilen;
+ addrs[i] = proglen;
+ prog = temp;
+ }
+ return proglen;
+}
+
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
+{
+ struct bpf_binary_header *header = NULL;
+ struct bpf_prog *tmp, *orig_prog = prog;
+ int proglen, oldproglen = 0;
+ struct jit_context ctx = {};
+ bool tmp_blinded = false;
+ u8 *image = NULL;
+ int *addrs;
+ int pass;
+ int i;
+
+ if (!prog->jit_requested)
+ return orig_prog;
+
+ tmp = bpf_jit_blind_constants(prog);
+ /*
+ * If blinding was requested and we failed during blinding,
+ * we must fall back to the interpreter.
+ */
+ if (IS_ERR(tmp))
+ return orig_prog;
+ if (tmp != prog) {
+ tmp_blinded = true;
+ prog = tmp;
+ }
+
+ addrs = kmalloc_array(prog->len, sizeof(*addrs), GFP_KERNEL);
+ if (!addrs) {
+ prog = orig_prog;
+ goto out;
+ }
+
+ /*
+ * Before first pass, make a rough estimation of addrs[]
+ * each BPF instruction is translated to less than 64 bytes
+ */
+ for (proglen = 0, i = 0; i < prog->len; i++) {
+ proglen += 64;
+ addrs[i] = proglen;
+ }
+ ctx.cleanup_addr = proglen;
+
+ /*
+ * JITed image shrinks with every pass and the loop iterates
+ * until the image stops shrinking. Very large BPF programs
+ * may converge on the last pass. In such case do one more
+ * pass to emit the final image.
+ */
+ for (pass = 0; pass < 20 || image; pass++) {
+ proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
+ if (proglen <= 0) {
+out_image:
+ image = NULL;
+ if (header)
+ bpf_jit_binary_free(header);
+ prog = orig_prog;
+ goto out_addrs;
+ }
+ if (image) {
+ if (proglen != oldproglen) {
+ pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
+ proglen, oldproglen);
+ goto out_image;
+ }
+ break;
+ }
+ if (proglen == oldproglen) {
+ header = bpf_jit_binary_alloc(proglen, &image,
+ 1, jit_fill_hole);
+ if (!header) {
+ prog = orig_prog;
+ goto out_addrs;
+ }
+ }
+ oldproglen = proglen;
+ cond_resched();
+ }
+
+ if (bpf_jit_enable > 1)
+ bpf_jit_dump(prog->len, proglen, pass + 1, image);
+
+ if (image) {
+ bpf_jit_binary_lock_ro(header);
+ prog->bpf_func = (void *)image;
+ prog->jited = 1;
+ prog->jited_len = proglen;
+ } else {
+ prog = orig_prog;
+ }
+
+out_addrs:
+ kfree(addrs);
+out:
+ if (tmp_blinded)
+ bpf_jit_prog_release_other(prog, prog == orig_prog ?
+ tmp : orig_prog);
+ return prog;
+}
diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile
new file mode 100644
index 000000000..4d49b5a27
--- /dev/null
+++ b/arch/x86/oprofile/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_OPROFILE) += oprofile.o
+
+DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
+ oprof.o cpu_buffer.o buffer_sync.o \
+ event_buffer.o oprofile_files.o \
+ oprofilefs.o oprofile_stats.o \
+ timer_int.o nmi_timer_int.o )
+
+oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
+oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \
+ op_model_ppro.o op_model_p4.o
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
new file mode 100644
index 000000000..a2488b6e2
--- /dev/null
+++ b/arch/x86/oprofile/backtrace.c
@@ -0,0 +1,127 @@
+/**
+ * @file backtrace.c
+ *
+ * @remark Copyright 2002 OProfile authors
+ * @remark Read the file COPYING
+ *
+ * @author John Levon
+ * @author David Smith
+ */
+
+#include <linux/oprofile.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/compat.h>
+#include <linux/uaccess.h>
+
+#include <asm/ptrace.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+#ifdef CONFIG_COMPAT
+static struct stack_frame_ia32 *
+dump_user_backtrace_32(struct stack_frame_ia32 *head)
+{
+ /* Also check accessibility of one struct frame_head beyond: */
+ struct stack_frame_ia32 bufhead[2];
+ struct stack_frame_ia32 *fp;
+ unsigned long bytes;
+
+ bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead));
+ if (bytes != 0)
+ return NULL;
+
+ fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame);
+
+ oprofile_add_trace(bufhead[0].return_address);
+
+ /* frame pointers should strictly progress back up the stack
+ * (towards higher addresses) */
+ if (head >= fp)
+ return NULL;
+
+ return fp;
+}
+
+static inline int
+x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
+{
+ struct stack_frame_ia32 *head;
+
+ /* User process is IA32 */
+ if (!current || !test_thread_flag(TIF_IA32))
+ return 0;
+
+ head = (struct stack_frame_ia32 *) regs->bp;
+ while (depth-- && head)
+ head = dump_user_backtrace_32(head);
+
+ return 1;
+}
+
+#else
+static inline int
+x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
+{
+ return 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct stack_frame *dump_user_backtrace(struct stack_frame *head)
+{
+ /* Also check accessibility of one struct frame_head beyond: */
+ struct stack_frame bufhead[2];
+ unsigned long bytes;
+
+ bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead));
+ if (bytes != 0)
+ return NULL;
+
+ oprofile_add_trace(bufhead[0].return_address);
+
+ /* frame pointers should strictly progress back up the stack
+ * (towards higher addresses) */
+ if (head >= bufhead[0].next_frame)
+ return NULL;
+
+ return bufhead[0].next_frame;
+}
+
+void
+x86_backtrace(struct pt_regs * const regs, unsigned int depth)
+{
+ struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
+
+ if (!user_mode(regs)) {
+ struct unwind_state state;
+ unsigned long addr;
+
+ if (!depth)
+ return;
+
+ oprofile_add_trace(regs->ip);
+
+ if (!--depth)
+ return;
+
+ for (unwind_start(&state, current, regs, NULL);
+ !unwind_done(&state); unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr)
+ break;
+
+ oprofile_add_trace(addr);
+
+ if (!--depth)
+ break;
+ }
+
+ return;
+ }
+
+ if (x86_backtrace_32(regs, depth))
+ return;
+
+ while (depth-- && head)
+ head = dump_user_backtrace(head);
+}
diff --git a/arch/x86/oprofile/init.c b/arch/x86/oprofile/init.c
new file mode 100644
index 000000000..9e138d00a
--- /dev/null
+++ b/arch/x86/oprofile/init.c
@@ -0,0 +1,38 @@
+/**
+ * @file init.c
+ *
+ * @remark Copyright 2002 OProfile authors
+ * @remark Read the file COPYING
+ *
+ * @author John Levon <levon@movementarian.org>
+ */
+
+#include <linux/oprofile.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+
+/*
+ * We support CPUs that have performance counters like the Pentium Pro
+ * with the NMI mode driver.
+ */
+
+#ifdef CONFIG_X86_LOCAL_APIC
+extern int op_nmi_init(struct oprofile_operations *ops);
+extern void op_nmi_exit(void);
+#else
+static int op_nmi_init(struct oprofile_operations *ops) { return -ENODEV; }
+static void op_nmi_exit(void) { }
+#endif
+
+extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth);
+
+int __init oprofile_arch_init(struct oprofile_operations *ops)
+{
+ ops->backtrace = x86_backtrace;
+ return op_nmi_init(ops);
+}
+
+void oprofile_arch_exit(void)
+{
+ op_nmi_exit();
+}
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
new file mode 100644
index 000000000..a7a767726
--- /dev/null
+++ b/arch/x86/oprofile/nmi_int.c
@@ -0,0 +1,780 @@
+/**
+ * @file nmi_int.c
+ *
+ * @remark Copyright 2002-2009 OProfile authors
+ * @remark Read the file COPYING
+ *
+ * @author John Levon <levon@movementarian.org>
+ * @author Robert Richter <robert.richter@amd.com>
+ * @author Barry Kasindorf <barry.kasindorf@amd.com>
+ * @author Jason Yeh <jason.yeh@amd.com>
+ * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
+ */
+
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/smp.h>
+#include <linux/oprofile.h>
+#include <linux/syscore_ops.h>
+#include <linux/slab.h>
+#include <linux/moduleparam.h>
+#include <linux/kdebug.h>
+#include <linux/cpu.h>
+#include <asm/nmi.h>
+#include <asm/msr.h>
+#include <asm/apic.h>
+
+#include "op_counter.h"
+#include "op_x86_model.h"
+
+static struct op_x86_model_spec *model;
+static DEFINE_PER_CPU(struct op_msrs, cpu_msrs);
+static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
+
+/* must be protected with get_online_cpus()/put_online_cpus(): */
+static int nmi_enabled;
+static int ctr_running;
+
+struct op_counter_config counter_config[OP_MAX_COUNTER];
+
+/* common functions */
+
+u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
+ struct op_counter_config *counter_config)
+{
+ u64 val = 0;
+ u16 event = (u16)counter_config->event;
+
+ val |= ARCH_PERFMON_EVENTSEL_INT;
+ val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0;
+ val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0;
+ val |= (counter_config->unit_mask & 0xFF) << 8;
+ counter_config->extra &= (ARCH_PERFMON_EVENTSEL_INV |
+ ARCH_PERFMON_EVENTSEL_EDGE |
+ ARCH_PERFMON_EVENTSEL_CMASK);
+ val |= counter_config->extra;
+ event &= model->event_mask ? model->event_mask : 0xFF;
+ val |= event & 0xFF;
+ val |= (u64)(event & 0x0F00) << 24;
+
+ return val;
+}
+
+
+static int profile_exceptions_notify(unsigned int val, struct pt_regs *regs)
+{
+ if (ctr_running)
+ model->check_ctrs(regs, this_cpu_ptr(&cpu_msrs));
+ else if (!nmi_enabled)
+ return NMI_DONE;
+ else
+ model->stop(this_cpu_ptr(&cpu_msrs));
+ return NMI_HANDLED;
+}
+
+static void nmi_cpu_save_registers(struct op_msrs *msrs)
+{
+ struct op_msr *counters = msrs->counters;
+ struct op_msr *controls = msrs->controls;
+ unsigned int i;
+
+ for (i = 0; i < model->num_counters; ++i) {
+ if (counters[i].addr)
+ rdmsrl(counters[i].addr, counters[i].saved);
+ }
+
+ for (i = 0; i < model->num_controls; ++i) {
+ if (controls[i].addr)
+ rdmsrl(controls[i].addr, controls[i].saved);
+ }
+}
+
+static void nmi_cpu_start(void *dummy)
+{
+ struct op_msrs const *msrs = this_cpu_ptr(&cpu_msrs);
+ if (!msrs->controls)
+ WARN_ON_ONCE(1);
+ else
+ model->start(msrs);
+}
+
+static int nmi_start(void)
+{
+ get_online_cpus();
+ ctr_running = 1;
+ /* make ctr_running visible to the nmi handler: */
+ smp_mb();
+ on_each_cpu(nmi_cpu_start, NULL, 1);
+ put_online_cpus();
+ return 0;
+}
+
+static void nmi_cpu_stop(void *dummy)
+{
+ struct op_msrs const *msrs = this_cpu_ptr(&cpu_msrs);
+ if (!msrs->controls)
+ WARN_ON_ONCE(1);
+ else
+ model->stop(msrs);
+}
+
+static void nmi_stop(void)
+{
+ get_online_cpus();
+ on_each_cpu(nmi_cpu_stop, NULL, 1);
+ ctr_running = 0;
+ put_online_cpus();
+}
+
+#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
+
+static DEFINE_PER_CPU(int, switch_index);
+
+static inline int has_mux(void)
+{
+ return !!model->switch_ctrl;
+}
+
+inline int op_x86_phys_to_virt(int phys)
+{
+ return __this_cpu_read(switch_index) + phys;
+}
+
+inline int op_x86_virt_to_phys(int virt)
+{
+ return virt % model->num_counters;
+}
+
+static void nmi_shutdown_mux(void)
+{
+ int i;
+
+ if (!has_mux())
+ return;
+
+ for_each_possible_cpu(i) {
+ kfree(per_cpu(cpu_msrs, i).multiplex);
+ per_cpu(cpu_msrs, i).multiplex = NULL;
+ per_cpu(switch_index, i) = 0;
+ }
+}
+
+static int nmi_setup_mux(void)
+{
+ size_t multiplex_size =
+ sizeof(struct op_msr) * model->num_virt_counters;
+ int i;
+
+ if (!has_mux())
+ return 1;
+
+ for_each_possible_cpu(i) {
+ per_cpu(cpu_msrs, i).multiplex =
+ kzalloc(multiplex_size, GFP_KERNEL);
+ if (!per_cpu(cpu_msrs, i).multiplex)
+ return 0;
+ }
+
+ return 1;
+}
+
+static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs)
+{
+ int i;
+ struct op_msr *multiplex = msrs->multiplex;
+
+ if (!has_mux())
+ return;
+
+ for (i = 0; i < model->num_virt_counters; ++i) {
+ if (counter_config[i].enabled) {
+ multiplex[i].saved = -(u64)counter_config[i].count;
+ } else {
+ multiplex[i].saved = 0;
+ }
+ }
+
+ per_cpu(switch_index, cpu) = 0;
+}
+
+static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs)
+{
+ struct op_msr *counters = msrs->counters;
+ struct op_msr *multiplex = msrs->multiplex;
+ int i;
+
+ for (i = 0; i < model->num_counters; ++i) {
+ int virt = op_x86_phys_to_virt(i);
+ if (counters[i].addr)
+ rdmsrl(counters[i].addr, multiplex[virt].saved);
+ }
+}
+
+static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs)
+{
+ struct op_msr *counters = msrs->counters;
+ struct op_msr *multiplex = msrs->multiplex;
+ int i;
+
+ for (i = 0; i < model->num_counters; ++i) {
+ int virt = op_x86_phys_to_virt(i);
+ if (counters[i].addr)
+ wrmsrl(counters[i].addr, multiplex[virt].saved);
+ }
+}
+
+static void nmi_cpu_switch(void *dummy)
+{
+ int cpu = smp_processor_id();
+ int si = per_cpu(switch_index, cpu);
+ struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
+
+ nmi_cpu_stop(NULL);
+ nmi_cpu_save_mpx_registers(msrs);
+
+ /* move to next set */
+ si += model->num_counters;
+ if ((si >= model->num_virt_counters) || (counter_config[si].count == 0))
+ per_cpu(switch_index, cpu) = 0;
+ else
+ per_cpu(switch_index, cpu) = si;
+
+ model->switch_ctrl(model, msrs);
+ nmi_cpu_restore_mpx_registers(msrs);
+
+ nmi_cpu_start(NULL);
+}
+
+
+/*
+ * Quick check to see if multiplexing is necessary.
+ * The check should be sufficient since counters are used
+ * in ordre.
+ */
+static int nmi_multiplex_on(void)
+{
+ return counter_config[model->num_counters].count ? 0 : -EINVAL;
+}
+
+static int nmi_switch_event(void)
+{
+ if (!has_mux())
+ return -ENOSYS; /* not implemented */
+ if (nmi_multiplex_on() < 0)
+ return -EINVAL; /* not necessary */
+
+ get_online_cpus();
+ if (ctr_running)
+ on_each_cpu(nmi_cpu_switch, NULL, 1);
+ put_online_cpus();
+
+ return 0;
+}
+
+static inline void mux_init(struct oprofile_operations *ops)
+{
+ if (has_mux())
+ ops->switch_events = nmi_switch_event;
+}
+
+static void mux_clone(int cpu)
+{
+ if (!has_mux())
+ return;
+
+ memcpy(per_cpu(cpu_msrs, cpu).multiplex,
+ per_cpu(cpu_msrs, 0).multiplex,
+ sizeof(struct op_msr) * model->num_virt_counters);
+}
+
+#else
+
+inline int op_x86_phys_to_virt(int phys) { return phys; }
+inline int op_x86_virt_to_phys(int virt) { return virt; }
+static inline void nmi_shutdown_mux(void) { }
+static inline int nmi_setup_mux(void) { return 1; }
+static inline void
+nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { }
+static inline void mux_init(struct oprofile_operations *ops) { }
+static void mux_clone(int cpu) { }
+
+#endif
+
+static void free_msrs(void)
+{
+ int i;
+ for_each_possible_cpu(i) {
+ kfree(per_cpu(cpu_msrs, i).counters);
+ per_cpu(cpu_msrs, i).counters = NULL;
+ kfree(per_cpu(cpu_msrs, i).controls);
+ per_cpu(cpu_msrs, i).controls = NULL;
+ }
+ nmi_shutdown_mux();
+}
+
+static int allocate_msrs(void)
+{
+ size_t controls_size = sizeof(struct op_msr) * model->num_controls;
+ size_t counters_size = sizeof(struct op_msr) * model->num_counters;
+
+ int i;
+ for_each_possible_cpu(i) {
+ per_cpu(cpu_msrs, i).counters = kzalloc(counters_size,
+ GFP_KERNEL);
+ if (!per_cpu(cpu_msrs, i).counters)
+ goto fail;
+ per_cpu(cpu_msrs, i).controls = kzalloc(controls_size,
+ GFP_KERNEL);
+ if (!per_cpu(cpu_msrs, i).controls)
+ goto fail;
+ }
+
+ if (!nmi_setup_mux())
+ goto fail;
+
+ return 1;
+
+fail:
+ free_msrs();
+ return 0;
+}
+
+static void nmi_cpu_setup(void)
+{
+ int cpu = smp_processor_id();
+ struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
+
+ nmi_cpu_save_registers(msrs);
+ raw_spin_lock(&oprofilefs_lock);
+ model->setup_ctrs(model, msrs);
+ nmi_cpu_setup_mux(cpu, msrs);
+ raw_spin_unlock(&oprofilefs_lock);
+ per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+}
+
+static void nmi_cpu_restore_registers(struct op_msrs *msrs)
+{
+ struct op_msr *counters = msrs->counters;
+ struct op_msr *controls = msrs->controls;
+ unsigned int i;
+
+ for (i = 0; i < model->num_controls; ++i) {
+ if (controls[i].addr)
+ wrmsrl(controls[i].addr, controls[i].saved);
+ }
+
+ for (i = 0; i < model->num_counters; ++i) {
+ if (counters[i].addr)
+ wrmsrl(counters[i].addr, counters[i].saved);
+ }
+}
+
+static void nmi_cpu_shutdown(void)
+{
+ unsigned int v;
+ int cpu = smp_processor_id();
+ struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
+
+ /* restoring APIC_LVTPC can trigger an apic error because the delivery
+ * mode and vector nr combination can be illegal. That's by design: on
+ * power on apic lvt contain a zero vector nr which are legal only for
+ * NMI delivery mode. So inhibit apic err before restoring lvtpc
+ */
+ v = apic_read(APIC_LVTERR);
+ apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
+ apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu));
+ apic_write(APIC_LVTERR, v);
+ nmi_cpu_restore_registers(msrs);
+}
+
+static int nmi_cpu_online(unsigned int cpu)
+{
+ local_irq_disable();
+ if (nmi_enabled)
+ nmi_cpu_setup();
+ if (ctr_running)
+ nmi_cpu_start(NULL);
+ local_irq_enable();
+ return 0;
+}
+
+static int nmi_cpu_down_prep(unsigned int cpu)
+{
+ local_irq_disable();
+ if (ctr_running)
+ nmi_cpu_stop(NULL);
+ if (nmi_enabled)
+ nmi_cpu_shutdown();
+ local_irq_enable();
+ return 0;
+}
+
+static int nmi_create_files(struct dentry *root)
+{
+ unsigned int i;
+
+ for (i = 0; i < model->num_virt_counters; ++i) {
+ struct dentry *dir;
+ char buf[4];
+
+ /* quick little hack to _not_ expose a counter if it is not
+ * available for use. This should protect userspace app.
+ * NOTE: assumes 1:1 mapping here (that counters are organized
+ * sequentially in their struct assignment).
+ */
+ if (!avail_to_resrv_perfctr_nmi_bit(op_x86_virt_to_phys(i)))
+ continue;
+
+ snprintf(buf, sizeof(buf), "%d", i);
+ dir = oprofilefs_mkdir(root, buf);
+ oprofilefs_create_ulong(dir, "enabled", &counter_config[i].enabled);
+ oprofilefs_create_ulong(dir, "event", &counter_config[i].event);
+ oprofilefs_create_ulong(dir, "count", &counter_config[i].count);
+ oprofilefs_create_ulong(dir, "unit_mask", &counter_config[i].unit_mask);
+ oprofilefs_create_ulong(dir, "kernel", &counter_config[i].kernel);
+ oprofilefs_create_ulong(dir, "user", &counter_config[i].user);
+ oprofilefs_create_ulong(dir, "extra", &counter_config[i].extra);
+ }
+
+ return 0;
+}
+
+static enum cpuhp_state cpuhp_nmi_online;
+
+static int nmi_setup(void)
+{
+ int err = 0;
+ int cpu;
+
+ if (!allocate_msrs())
+ return -ENOMEM;
+
+ /* We need to serialize save and setup for HT because the subset
+ * of msrs are distinct for save and setup operations
+ */
+
+ /* Assume saved/restored counters are the same on all CPUs */
+ err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
+ if (err)
+ goto fail;
+
+ for_each_possible_cpu(cpu) {
+ if (!IS_ENABLED(CONFIG_SMP) || !cpu)
+ continue;
+
+ memcpy(per_cpu(cpu_msrs, cpu).counters,
+ per_cpu(cpu_msrs, 0).counters,
+ sizeof(struct op_msr) * model->num_counters);
+
+ memcpy(per_cpu(cpu_msrs, cpu).controls,
+ per_cpu(cpu_msrs, 0).controls,
+ sizeof(struct op_msr) * model->num_controls);
+
+ mux_clone(cpu);
+ }
+
+ nmi_enabled = 0;
+ ctr_running = 0;
+ /* make variables visible to the nmi handler: */
+ smp_mb();
+ err = register_nmi_handler(NMI_LOCAL, profile_exceptions_notify,
+ 0, "oprofile");
+ if (err)
+ goto fail;
+
+ nmi_enabled = 1;
+ /* make nmi_enabled visible to the nmi handler: */
+ smp_mb();
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/oprofile:online",
+ nmi_cpu_online, nmi_cpu_down_prep);
+ if (err < 0)
+ goto fail_nmi;
+ cpuhp_nmi_online = err;
+ return 0;
+fail_nmi:
+ unregister_nmi_handler(NMI_LOCAL, "oprofile");
+fail:
+ free_msrs();
+ return err;
+}
+
+static void nmi_shutdown(void)
+{
+ struct op_msrs *msrs;
+
+ cpuhp_remove_state(cpuhp_nmi_online);
+ nmi_enabled = 0;
+ ctr_running = 0;
+
+ /* make variables visible to the nmi handler: */
+ smp_mb();
+ unregister_nmi_handler(NMI_LOCAL, "oprofile");
+ msrs = &get_cpu_var(cpu_msrs);
+ model->shutdown(msrs);
+ free_msrs();
+ put_cpu_var(cpu_msrs);
+}
+
+#ifdef CONFIG_PM
+
+static int nmi_suspend(void)
+{
+ /* Only one CPU left, just stop that one */
+ if (nmi_enabled == 1)
+ nmi_cpu_stop(NULL);
+ return 0;
+}
+
+static void nmi_resume(void)
+{
+ if (nmi_enabled == 1)
+ nmi_cpu_start(NULL);
+}
+
+static struct syscore_ops oprofile_syscore_ops = {
+ .resume = nmi_resume,
+ .suspend = nmi_suspend,
+};
+
+static void __init init_suspend_resume(void)
+{
+ register_syscore_ops(&oprofile_syscore_ops);
+}
+
+static void exit_suspend_resume(void)
+{
+ unregister_syscore_ops(&oprofile_syscore_ops);
+}
+
+#else
+
+static inline void init_suspend_resume(void) { }
+static inline void exit_suspend_resume(void) { }
+
+#endif /* CONFIG_PM */
+
+static int __init p4_init(char **cpu_type)
+{
+ __u8 cpu_model = boot_cpu_data.x86_model;
+
+ if (cpu_model > 6 || cpu_model == 5)
+ return 0;
+
+#ifndef CONFIG_SMP
+ *cpu_type = "i386/p4";
+ model = &op_p4_spec;
+ return 1;
+#else
+ switch (smp_num_siblings) {
+ case 1:
+ *cpu_type = "i386/p4";
+ model = &op_p4_spec;
+ return 1;
+
+ case 2:
+ *cpu_type = "i386/p4-ht";
+ model = &op_p4_ht2_spec;
+ return 1;
+ }
+#endif
+
+ printk(KERN_INFO "oprofile: P4 HyperThreading detected with > 2 threads\n");
+ printk(KERN_INFO "oprofile: Reverting to timer mode.\n");
+ return 0;
+}
+
+enum __force_cpu_type {
+ reserved = 0, /* do not force */
+ timer,
+ arch_perfmon,
+};
+
+static int force_cpu_type;
+
+static int set_cpu_type(const char *str, const struct kernel_param *kp)
+{
+ if (!strcmp(str, "timer")) {
+ force_cpu_type = timer;
+ printk(KERN_INFO "oprofile: forcing NMI timer mode\n");
+ } else if (!strcmp(str, "arch_perfmon")) {
+ force_cpu_type = arch_perfmon;
+ printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
+ } else {
+ force_cpu_type = 0;
+ }
+
+ return 0;
+}
+module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0);
+
+static int __init ppro_init(char **cpu_type)
+{
+ __u8 cpu_model = boot_cpu_data.x86_model;
+ struct op_x86_model_spec *spec = &op_ppro_spec; /* default */
+
+ if (force_cpu_type == arch_perfmon && boot_cpu_has(X86_FEATURE_ARCH_PERFMON))
+ return 0;
+
+ /*
+ * Documentation on identifying Intel processors by CPU family
+ * and model can be found in the Intel Software Developer's
+ * Manuals (SDM):
+ *
+ * http://www.intel.com/products/processor/manuals/
+ *
+ * As of May 2010 the documentation for this was in the:
+ * "Intel 64 and IA-32 Architectures Software Developer's
+ * Manual Volume 3B: System Programming Guide", "Table B-1
+ * CPUID Signature Values of DisplayFamily_DisplayModel".
+ */
+ switch (cpu_model) {
+ case 0 ... 2:
+ *cpu_type = "i386/ppro";
+ break;
+ case 3 ... 5:
+ *cpu_type = "i386/pii";
+ break;
+ case 6 ... 8:
+ case 10 ... 11:
+ *cpu_type = "i386/piii";
+ break;
+ case 9:
+ case 13:
+ *cpu_type = "i386/p6_mobile";
+ break;
+ case 14:
+ *cpu_type = "i386/core";
+ break;
+ case 0x0f:
+ case 0x16:
+ case 0x17:
+ case 0x1d:
+ *cpu_type = "i386/core_2";
+ break;
+ case 0x1a:
+ case 0x1e:
+ case 0x2e:
+ spec = &op_arch_perfmon_spec;
+ *cpu_type = "i386/core_i7";
+ break;
+ case 0x1c:
+ *cpu_type = "i386/atom";
+ break;
+ default:
+ /* Unknown */
+ return 0;
+ }
+
+ model = spec;
+ return 1;
+}
+
+int __init op_nmi_init(struct oprofile_operations *ops)
+{
+ __u8 vendor = boot_cpu_data.x86_vendor;
+ __u8 family = boot_cpu_data.x86;
+ char *cpu_type = NULL;
+ int ret = 0;
+
+ if (!boot_cpu_has(X86_FEATURE_APIC))
+ return -ENODEV;
+
+ if (force_cpu_type == timer)
+ return -ENODEV;
+
+ switch (vendor) {
+ case X86_VENDOR_AMD:
+ /* Needs to be at least an Athlon (or hammer in 32bit mode) */
+
+ switch (family) {
+ case 6:
+ cpu_type = "i386/athlon";
+ break;
+ case 0xf:
+ /*
+ * Actually it could be i386/hammer too, but
+ * give user space an consistent name.
+ */
+ cpu_type = "x86-64/hammer";
+ break;
+ case 0x10:
+ cpu_type = "x86-64/family10";
+ break;
+ case 0x11:
+ cpu_type = "x86-64/family11h";
+ break;
+ case 0x12:
+ cpu_type = "x86-64/family12h";
+ break;
+ case 0x14:
+ cpu_type = "x86-64/family14h";
+ break;
+ case 0x15:
+ cpu_type = "x86-64/family15h";
+ break;
+ default:
+ return -ENODEV;
+ }
+ model = &op_amd_spec;
+ break;
+
+ case X86_VENDOR_INTEL:
+ switch (family) {
+ /* Pentium IV */
+ case 0xf:
+ p4_init(&cpu_type);
+ break;
+
+ /* A P6-class processor */
+ case 6:
+ ppro_init(&cpu_type);
+ break;
+
+ default:
+ break;
+ }
+
+ if (cpu_type)
+ break;
+
+ if (!boot_cpu_has(X86_FEATURE_ARCH_PERFMON))
+ return -ENODEV;
+
+ /* use arch perfmon as fallback */
+ cpu_type = "i386/arch_perfmon";
+ model = &op_arch_perfmon_spec;
+ break;
+
+ default:
+ return -ENODEV;
+ }
+
+ /* default values, can be overwritten by model */
+ ops->create_files = nmi_create_files;
+ ops->setup = nmi_setup;
+ ops->shutdown = nmi_shutdown;
+ ops->start = nmi_start;
+ ops->stop = nmi_stop;
+ ops->cpu_type = cpu_type;
+
+ if (model->init)
+ ret = model->init(ops);
+ if (ret)
+ return ret;
+
+ if (!model->num_virt_counters)
+ model->num_virt_counters = model->num_counters;
+
+ mux_init(ops);
+
+ init_suspend_resume();
+
+ printk(KERN_INFO "oprofile: using NMI interrupt.\n");
+ return 0;
+}
+
+void op_nmi_exit(void)
+{
+ exit_suspend_resume();
+}
diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h
new file mode 100644
index 000000000..0b7b7b179
--- /dev/null
+++ b/arch/x86/oprofile/op_counter.h
@@ -0,0 +1,30 @@
+/**
+ * @file op_counter.h
+ *
+ * @remark Copyright 2002 OProfile authors
+ * @remark Read the file COPYING
+ *
+ * @author John Levon
+ */
+
+#ifndef OP_COUNTER_H
+#define OP_COUNTER_H
+
+#define OP_MAX_COUNTER 32
+
+/* Per-perfctr configuration as set via
+ * oprofilefs.
+ */
+struct op_counter_config {
+ unsigned long count;
+ unsigned long enabled;
+ unsigned long event;
+ unsigned long kernel;
+ unsigned long user;
+ unsigned long unit_mask;
+ unsigned long extra;
+};
+
+extern struct op_counter_config counter_config[];
+
+#endif /* OP_COUNTER_H */
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
new file mode 100644
index 000000000..660a83c82
--- /dev/null
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -0,0 +1,542 @@
+/*
+ * @file op_model_amd.c
+ * athlon / K7 / K8 / Family 10h model-specific MSR operations
+ *
+ * @remark Copyright 2002-2009 OProfile authors
+ * @remark Read the file COPYING
+ *
+ * @author John Levon
+ * @author Philippe Elie
+ * @author Graydon Hoare
+ * @author Robert Richter <robert.richter@amd.com>
+ * @author Barry Kasindorf <barry.kasindorf@amd.com>
+ * @author Jason Yeh <jason.yeh@amd.com>
+ * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
+ */
+
+#include <linux/oprofile.h>
+#include <linux/device.h>
+#include <linux/pci.h>
+#include <linux/percpu.h>
+
+#include <asm/ptrace.h>
+#include <asm/msr.h>
+#include <asm/nmi.h>
+#include <asm/apic.h>
+#include <asm/processor.h>
+
+#include "op_x86_model.h"
+#include "op_counter.h"
+
+#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
+#define NUM_VIRT_COUNTERS 32
+#else
+#define NUM_VIRT_COUNTERS 0
+#endif
+
+#define OP_EVENT_MASK 0x0FFF
+#define OP_CTR_OVERFLOW (1ULL<<31)
+
+#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21))
+
+static int num_counters;
+static unsigned long reset_value[OP_MAX_COUNTER];
+
+#define IBS_FETCH_SIZE 6
+#define IBS_OP_SIZE 12
+
+static u32 ibs_caps;
+
+struct ibs_config {
+ unsigned long op_enabled;
+ unsigned long fetch_enabled;
+ unsigned long max_cnt_fetch;
+ unsigned long max_cnt_op;
+ unsigned long rand_en;
+ unsigned long dispatched_ops;
+ unsigned long branch_target;
+};
+
+struct ibs_state {
+ u64 ibs_op_ctl;
+ int branch_target;
+ unsigned long sample_size;
+};
+
+static struct ibs_config ibs_config;
+static struct ibs_state ibs_state;
+
+/*
+ * IBS randomization macros
+ */
+#define IBS_RANDOM_BITS 12
+#define IBS_RANDOM_MASK ((1ULL << IBS_RANDOM_BITS) - 1)
+#define IBS_RANDOM_MAXCNT_OFFSET (1ULL << (IBS_RANDOM_BITS - 5))
+
+/*
+ * 16-bit Linear Feedback Shift Register (LFSR)
+ *
+ * 16 14 13 11
+ * Feedback polynomial = X + X + X + X + 1
+ */
+static unsigned int lfsr_random(void)
+{
+ static unsigned int lfsr_value = 0xF00D;
+ unsigned int bit;
+
+ /* Compute next bit to shift in */
+ bit = ((lfsr_value >> 0) ^
+ (lfsr_value >> 2) ^
+ (lfsr_value >> 3) ^
+ (lfsr_value >> 5)) & 0x0001;
+
+ /* Advance to next register value */
+ lfsr_value = (lfsr_value >> 1) | (bit << 15);
+
+ return lfsr_value;
+}
+
+/*
+ * IBS software randomization
+ *
+ * The IBS periodic op counter is randomized in software. The lower 12
+ * bits of the 20 bit counter are randomized. IbsOpCurCnt is
+ * initialized with a 12 bit random value.
+ */
+static inline u64 op_amd_randomize_ibs_op(u64 val)
+{
+ unsigned int random = lfsr_random();
+
+ if (!(ibs_caps & IBS_CAPS_RDWROPCNT))
+ /*
+ * Work around if the hw can not write to IbsOpCurCnt
+ *
+ * Randomize the lower 8 bits of the 16 bit
+ * IbsOpMaxCnt [15:0] value in the range of -128 to
+ * +127 by adding/subtracting an offset to the
+ * maximum count (IbsOpMaxCnt).
+ *
+ * To avoid over or underflows and protect upper bits
+ * starting at bit 16, the initial value for
+ * IbsOpMaxCnt must fit in the range from 0x0081 to
+ * 0xff80.
+ */
+ val += (s8)(random >> 4);
+ else
+ val |= (u64)(random & IBS_RANDOM_MASK) << 32;
+
+ return val;
+}
+
+static inline void
+op_amd_handle_ibs(struct pt_regs * const regs,
+ struct op_msrs const * const msrs)
+{
+ u64 val, ctl;
+ struct op_entry entry;
+
+ if (!ibs_caps)
+ return;
+
+ if (ibs_config.fetch_enabled) {
+ rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl);
+ if (ctl & IBS_FETCH_VAL) {
+ rdmsrl(MSR_AMD64_IBSFETCHLINAD, val);
+ oprofile_write_reserve(&entry, regs, val,
+ IBS_FETCH_CODE, IBS_FETCH_SIZE);
+ oprofile_add_data64(&entry, val);
+ oprofile_add_data64(&entry, ctl);
+ rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, val);
+ oprofile_add_data64(&entry, val);
+ oprofile_write_commit(&entry);
+
+ /* reenable the IRQ */
+ ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT);
+ ctl |= IBS_FETCH_ENABLE;
+ wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl);
+ }
+ }
+
+ if (ibs_config.op_enabled) {
+ rdmsrl(MSR_AMD64_IBSOPCTL, ctl);
+ if (ctl & IBS_OP_VAL) {
+ rdmsrl(MSR_AMD64_IBSOPRIP, val);
+ oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE,
+ ibs_state.sample_size);
+ oprofile_add_data64(&entry, val);
+ rdmsrl(MSR_AMD64_IBSOPDATA, val);
+ oprofile_add_data64(&entry, val);
+ rdmsrl(MSR_AMD64_IBSOPDATA2, val);
+ oprofile_add_data64(&entry, val);
+ rdmsrl(MSR_AMD64_IBSOPDATA3, val);
+ oprofile_add_data64(&entry, val);
+ rdmsrl(MSR_AMD64_IBSDCLINAD, val);
+ oprofile_add_data64(&entry, val);
+ rdmsrl(MSR_AMD64_IBSDCPHYSAD, val);
+ oprofile_add_data64(&entry, val);
+ if (ibs_state.branch_target) {
+ rdmsrl(MSR_AMD64_IBSBRTARGET, val);
+ oprofile_add_data(&entry, (unsigned long)val);
+ }
+ oprofile_write_commit(&entry);
+
+ /* reenable the IRQ */
+ ctl = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl);
+ wrmsrl(MSR_AMD64_IBSOPCTL, ctl);
+ }
+ }
+}
+
+static inline void op_amd_start_ibs(void)
+{
+ u64 val;
+
+ if (!ibs_caps)
+ return;
+
+ memset(&ibs_state, 0, sizeof(ibs_state));
+
+ /*
+ * Note: Since the max count settings may out of range we
+ * write back the actual used values so that userland can read
+ * it.
+ */
+
+ if (ibs_config.fetch_enabled) {
+ val = ibs_config.max_cnt_fetch >> 4;
+ val = min(val, IBS_FETCH_MAX_CNT);
+ ibs_config.max_cnt_fetch = val << 4;
+ val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0;
+ val |= IBS_FETCH_ENABLE;
+ wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
+ }
+
+ if (ibs_config.op_enabled) {
+ val = ibs_config.max_cnt_op >> 4;
+ if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) {
+ /*
+ * IbsOpCurCnt not supported. See
+ * op_amd_randomize_ibs_op() for details.
+ */
+ val = clamp(val, 0x0081ULL, 0xFF80ULL);
+ ibs_config.max_cnt_op = val << 4;
+ } else {
+ /*
+ * The start value is randomized with a
+ * positive offset, we need to compensate it
+ * with the half of the randomized range. Also
+ * avoid underflows.
+ */
+ val += IBS_RANDOM_MAXCNT_OFFSET;
+ if (ibs_caps & IBS_CAPS_OPCNTEXT)
+ val = min(val, IBS_OP_MAX_CNT_EXT);
+ else
+ val = min(val, IBS_OP_MAX_CNT);
+ ibs_config.max_cnt_op =
+ (val - IBS_RANDOM_MAXCNT_OFFSET) << 4;
+ }
+ val = ((val & ~IBS_OP_MAX_CNT) << 4) | (val & IBS_OP_MAX_CNT);
+ val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0;
+ val |= IBS_OP_ENABLE;
+ ibs_state.ibs_op_ctl = val;
+ ibs_state.sample_size = IBS_OP_SIZE;
+ if (ibs_config.branch_target) {
+ ibs_state.branch_target = 1;
+ ibs_state.sample_size++;
+ }
+ val = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl);
+ wrmsrl(MSR_AMD64_IBSOPCTL, val);
+ }
+}
+
+static void op_amd_stop_ibs(void)
+{
+ if (!ibs_caps)
+ return;
+
+ if (ibs_config.fetch_enabled)
+ /* clear max count and enable */
+ wrmsrl(MSR_AMD64_IBSFETCHCTL, 0);
+
+ if (ibs_config.op_enabled)
+ /* clear max count and enable */
+ wrmsrl(MSR_AMD64_IBSOPCTL, 0);
+}
+
+#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
+
+static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
+ struct op_msrs const * const msrs)
+{
+ u64 val;
+ int i;
+
+ /* enable active counters */
+ for (i = 0; i < num_counters; ++i) {
+ int virt = op_x86_phys_to_virt(i);
+ if (!reset_value[virt])
+ continue;
+ rdmsrl(msrs->controls[i].addr, val);
+ val &= model->reserved;
+ val |= op_x86_get_ctrl(model, &counter_config[virt]);
+ wrmsrl(msrs->controls[i].addr, val);
+ }
+}
+
+#endif
+
+/* functions for op_amd_spec */
+
+static void op_amd_shutdown(struct op_msrs const * const msrs)
+{
+ int i;
+
+ for (i = 0; i < num_counters; ++i) {
+ if (!msrs->counters[i].addr)
+ continue;
+ release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+ release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+ }
+}
+
+static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
+{
+ int i;
+
+ for (i = 0; i < num_counters; i++) {
+ if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
+ goto fail;
+ if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
+ release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+ goto fail;
+ }
+ /* both registers must be reserved */
+ if (num_counters == AMD64_NUM_COUNTERS_CORE) {
+ msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1);
+ msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1);
+ } else {
+ msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+ msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
+ }
+ continue;
+ fail:
+ if (!counter_config[i].enabled)
+ continue;
+ op_x86_warn_reserved(i);
+ op_amd_shutdown(msrs);
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
+ struct op_msrs const * const msrs)
+{
+ u64 val;
+ int i;
+
+ /* setup reset_value */
+ for (i = 0; i < OP_MAX_COUNTER; ++i) {
+ if (counter_config[i].enabled
+ && msrs->counters[op_x86_virt_to_phys(i)].addr)
+ reset_value[i] = counter_config[i].count;
+ else
+ reset_value[i] = 0;
+ }
+
+ /* clear all counters */
+ for (i = 0; i < num_counters; ++i) {
+ if (!msrs->controls[i].addr)
+ continue;
+ rdmsrl(msrs->controls[i].addr, val);
+ if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
+ op_x86_warn_in_use(i);
+ val &= model->reserved;
+ wrmsrl(msrs->controls[i].addr, val);
+ /*
+ * avoid a false detection of ctr overflows in NMI
+ * handler
+ */
+ wrmsrl(msrs->counters[i].addr, -1LL);
+ }
+
+ /* enable active counters */
+ for (i = 0; i < num_counters; ++i) {
+ int virt = op_x86_phys_to_virt(i);
+ if (!reset_value[virt])
+ continue;
+
+ /* setup counter registers */
+ wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
+
+ /* setup control registers */
+ rdmsrl(msrs->controls[i].addr, val);
+ val &= model->reserved;
+ val |= op_x86_get_ctrl(model, &counter_config[virt]);
+ wrmsrl(msrs->controls[i].addr, val);
+ }
+}
+
+static int op_amd_check_ctrs(struct pt_regs * const regs,
+ struct op_msrs const * const msrs)
+{
+ u64 val;
+ int i;
+
+ for (i = 0; i < num_counters; ++i) {
+ int virt = op_x86_phys_to_virt(i);
+ if (!reset_value[virt])
+ continue;
+ rdmsrl(msrs->counters[i].addr, val);
+ /* bit is clear if overflowed: */
+ if (val & OP_CTR_OVERFLOW)
+ continue;
+ oprofile_add_sample(regs, virt);
+ wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
+ }
+
+ op_amd_handle_ibs(regs, msrs);
+
+ /* See op_model_ppro.c */
+ return 1;
+}
+
+static void op_amd_start(struct op_msrs const * const msrs)
+{
+ u64 val;
+ int i;
+
+ for (i = 0; i < num_counters; ++i) {
+ if (!reset_value[op_x86_phys_to_virt(i)])
+ continue;
+ rdmsrl(msrs->controls[i].addr, val);
+ val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+ wrmsrl(msrs->controls[i].addr, val);
+ }
+
+ op_amd_start_ibs();
+}
+
+static void op_amd_stop(struct op_msrs const * const msrs)
+{
+ u64 val;
+ int i;
+
+ /*
+ * Subtle: stop on all counters to avoid race with setting our
+ * pm callback
+ */
+ for (i = 0; i < num_counters; ++i) {
+ if (!reset_value[op_x86_phys_to_virt(i)])
+ continue;
+ rdmsrl(msrs->controls[i].addr, val);
+ val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+ wrmsrl(msrs->controls[i].addr, val);
+ }
+
+ op_amd_stop_ibs();
+}
+
+/*
+ * check and reserve APIC extended interrupt LVT offset for IBS if
+ * available
+ */
+
+static void init_ibs(void)
+{
+ ibs_caps = get_ibs_caps();
+
+ if (!ibs_caps)
+ return;
+
+ printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
+}
+
+static int (*create_arch_files)(struct dentry *root);
+
+static int setup_ibs_files(struct dentry *root)
+{
+ struct dentry *dir;
+ int ret = 0;
+
+ /* architecture specific files */
+ if (create_arch_files)
+ ret = create_arch_files(root);
+
+ if (ret)
+ return ret;
+
+ if (!ibs_caps)
+ return ret;
+
+ /* model specific files */
+
+ /* setup some reasonable defaults */
+ memset(&ibs_config, 0, sizeof(ibs_config));
+ ibs_config.max_cnt_fetch = 250000;
+ ibs_config.max_cnt_op = 250000;
+
+ if (ibs_caps & IBS_CAPS_FETCHSAM) {
+ dir = oprofilefs_mkdir(root, "ibs_fetch");
+ oprofilefs_create_ulong(dir, "enable",
+ &ibs_config.fetch_enabled);
+ oprofilefs_create_ulong(dir, "max_count",
+ &ibs_config.max_cnt_fetch);
+ oprofilefs_create_ulong(dir, "rand_enable",
+ &ibs_config.rand_en);
+ }
+
+ if (ibs_caps & IBS_CAPS_OPSAM) {
+ dir = oprofilefs_mkdir(root, "ibs_op");
+ oprofilefs_create_ulong(dir, "enable",
+ &ibs_config.op_enabled);
+ oprofilefs_create_ulong(dir, "max_count",
+ &ibs_config.max_cnt_op);
+ if (ibs_caps & IBS_CAPS_OPCNT)
+ oprofilefs_create_ulong(dir, "dispatched_ops",
+ &ibs_config.dispatched_ops);
+ if (ibs_caps & IBS_CAPS_BRNTRGT)
+ oprofilefs_create_ulong(dir, "branch_target",
+ &ibs_config.branch_target);
+ }
+
+ return 0;
+}
+
+struct op_x86_model_spec op_amd_spec;
+
+static int op_amd_init(struct oprofile_operations *ops)
+{
+ init_ibs();
+ create_arch_files = ops->create_files;
+ ops->create_files = setup_ibs_files;
+
+ if (boot_cpu_data.x86 == 0x15) {
+ num_counters = AMD64_NUM_COUNTERS_CORE;
+ } else {
+ num_counters = AMD64_NUM_COUNTERS;
+ }
+
+ op_amd_spec.num_counters = num_counters;
+ op_amd_spec.num_controls = num_counters;
+ op_amd_spec.num_virt_counters = max(num_counters, NUM_VIRT_COUNTERS);
+
+ return 0;
+}
+
+struct op_x86_model_spec op_amd_spec = {
+ /* num_counters/num_controls filled in at runtime */
+ .reserved = MSR_AMD_EVENTSEL_RESERVED,
+ .event_mask = OP_EVENT_MASK,
+ .init = op_amd_init,
+ .fill_in_addresses = &op_amd_fill_in_addresses,
+ .setup_ctrs = &op_amd_setup_ctrs,
+ .check_ctrs = &op_amd_check_ctrs,
+ .start = &op_amd_start,
+ .stop = &op_amd_stop,
+ .shutdown = &op_amd_shutdown,
+#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
+ .switch_ctrl = &op_mux_switch_ctrl,
+#endif
+};
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
new file mode 100644
index 000000000..ad1d91f47
--- /dev/null
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -0,0 +1,723 @@
+/**
+ * @file op_model_p4.c
+ * P4 model-specific MSR operations
+ *
+ * @remark Copyright 2002 OProfile authors
+ * @remark Read the file COPYING
+ *
+ * @author Graydon Hoare
+ */
+
+#include <linux/oprofile.h>
+#include <linux/smp.h>
+#include <linux/ptrace.h>
+#include <asm/nmi.h>
+#include <asm/msr.h>
+#include <asm/fixmap.h>
+#include <asm/apic.h>
+
+
+#include "op_x86_model.h"
+#include "op_counter.h"
+
+#define NUM_EVENTS 39
+
+#define NUM_COUNTERS_NON_HT 8
+#define NUM_ESCRS_NON_HT 45
+#define NUM_CCCRS_NON_HT 18
+#define NUM_CONTROLS_NON_HT (NUM_ESCRS_NON_HT + NUM_CCCRS_NON_HT)
+
+#define NUM_COUNTERS_HT2 4
+#define NUM_ESCRS_HT2 23
+#define NUM_CCCRS_HT2 9
+#define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2)
+
+#define OP_CTR_OVERFLOW (1ULL<<31)
+
+static unsigned int num_counters = NUM_COUNTERS_NON_HT;
+static unsigned int num_controls = NUM_CONTROLS_NON_HT;
+
+/* this has to be checked dynamically since the
+ hyper-threadedness of a chip is discovered at
+ kernel boot-time. */
+static inline void setup_num_counters(void)
+{
+#ifdef CONFIG_SMP
+ if (smp_num_siblings == 2) {
+ num_counters = NUM_COUNTERS_HT2;
+ num_controls = NUM_CONTROLS_HT2;
+ }
+#endif
+}
+
+static inline int addr_increment(void)
+{
+#ifdef CONFIG_SMP
+ return smp_num_siblings == 2 ? 2 : 1;
+#else
+ return 1;
+#endif
+}
+
+
+/* tables to simulate simplified hardware view of p4 registers */
+struct p4_counter_binding {
+ int virt_counter;
+ int counter_address;
+ int cccr_address;
+};
+
+struct p4_event_binding {
+ int escr_select; /* value to put in CCCR */
+ int event_select; /* value to put in ESCR */
+ struct {
+ int virt_counter; /* for this counter... */
+ int escr_address; /* use this ESCR */
+ } bindings[2];
+};
+
+/* nb: these CTR_* defines are a duplicate of defines in
+ event/i386.p4*events. */
+
+
+#define CTR_BPU_0 (1 << 0)
+#define CTR_MS_0 (1 << 1)
+#define CTR_FLAME_0 (1 << 2)
+#define CTR_IQ_4 (1 << 3)
+#define CTR_BPU_2 (1 << 4)
+#define CTR_MS_2 (1 << 5)
+#define CTR_FLAME_2 (1 << 6)
+#define CTR_IQ_5 (1 << 7)
+
+static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = {
+ { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 },
+ { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 },
+ { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 },
+ { CTR_IQ_4, MSR_P4_IQ_PERFCTR4, MSR_P4_IQ_CCCR4 },
+ { CTR_BPU_2, MSR_P4_BPU_PERFCTR2, MSR_P4_BPU_CCCR2 },
+ { CTR_MS_2, MSR_P4_MS_PERFCTR2, MSR_P4_MS_CCCR2 },
+ { CTR_FLAME_2, MSR_P4_FLAME_PERFCTR2, MSR_P4_FLAME_CCCR2 },
+ { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 }
+};
+
+#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT)
+
+/* p4 event codes in libop/op_event.h are indices into this table. */
+
+static struct p4_event_binding p4_events[NUM_EVENTS] = {
+
+ { /* BRANCH_RETIRED */
+ 0x05, 0x06,
+ { {CTR_IQ_4, MSR_P4_CRU_ESCR2},
+ {CTR_IQ_5, MSR_P4_CRU_ESCR3} }
+ },
+
+ { /* MISPRED_BRANCH_RETIRED */
+ 0x04, 0x03,
+ { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
+ { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
+ },
+
+ { /* TC_DELIVER_MODE */
+ 0x01, 0x01,
+ { { CTR_MS_0, MSR_P4_TC_ESCR0},
+ { CTR_MS_2, MSR_P4_TC_ESCR1} }
+ },
+
+ { /* BPU_FETCH_REQUEST */
+ 0x00, 0x03,
+ { { CTR_BPU_0, MSR_P4_BPU_ESCR0},
+ { CTR_BPU_2, MSR_P4_BPU_ESCR1} }
+ },
+
+ { /* ITLB_REFERENCE */
+ 0x03, 0x18,
+ { { CTR_BPU_0, MSR_P4_ITLB_ESCR0},
+ { CTR_BPU_2, MSR_P4_ITLB_ESCR1} }
+ },
+
+ { /* MEMORY_CANCEL */
+ 0x05, 0x02,
+ { { CTR_FLAME_0, MSR_P4_DAC_ESCR0},
+ { CTR_FLAME_2, MSR_P4_DAC_ESCR1} }
+ },
+
+ { /* MEMORY_COMPLETE */
+ 0x02, 0x08,
+ { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
+ { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
+ },
+
+ { /* LOAD_PORT_REPLAY */
+ 0x02, 0x04,
+ { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
+ { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
+ },
+
+ { /* STORE_PORT_REPLAY */
+ 0x02, 0x05,
+ { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
+ { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
+ },
+
+ { /* MOB_LOAD_REPLAY */
+ 0x02, 0x03,
+ { { CTR_BPU_0, MSR_P4_MOB_ESCR0},
+ { CTR_BPU_2, MSR_P4_MOB_ESCR1} }
+ },
+
+ { /* PAGE_WALK_TYPE */
+ 0x04, 0x01,
+ { { CTR_BPU_0, MSR_P4_PMH_ESCR0},
+ { CTR_BPU_2, MSR_P4_PMH_ESCR1} }
+ },
+
+ { /* BSQ_CACHE_REFERENCE */
+ 0x07, 0x0c,
+ { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
+ { CTR_BPU_2, MSR_P4_BSU_ESCR1} }
+ },
+
+ { /* IOQ_ALLOCATION */
+ 0x06, 0x03,
+ { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
+ { 0, 0 } }
+ },
+
+ { /* IOQ_ACTIVE_ENTRIES */
+ 0x06, 0x1a,
+ { { CTR_BPU_2, MSR_P4_FSB_ESCR1},
+ { 0, 0 } }
+ },
+
+ { /* FSB_DATA_ACTIVITY */
+ 0x06, 0x17,
+ { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
+ { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
+ },
+
+ { /* BSQ_ALLOCATION */
+ 0x07, 0x05,
+ { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
+ { 0, 0 } }
+ },
+
+ { /* BSQ_ACTIVE_ENTRIES */
+ 0x07, 0x06,
+ { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */},
+ { 0, 0 } }
+ },
+
+ { /* X87_ASSIST */
+ 0x05, 0x03,
+ { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
+ { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
+ },
+
+ { /* SSE_INPUT_ASSIST */
+ 0x01, 0x34,
+ { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
+ { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
+ },
+
+ { /* PACKED_SP_UOP */
+ 0x01, 0x08,
+ { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
+ { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
+ },
+
+ { /* PACKED_DP_UOP */
+ 0x01, 0x0c,
+ { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
+ { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
+ },
+
+ { /* SCALAR_SP_UOP */
+ 0x01, 0x0a,
+ { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
+ { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
+ },
+
+ { /* SCALAR_DP_UOP */
+ 0x01, 0x0e,
+ { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
+ { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
+ },
+
+ { /* 64BIT_MMX_UOP */
+ 0x01, 0x02,
+ { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
+ { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
+ },
+
+ { /* 128BIT_MMX_UOP */
+ 0x01, 0x1a,
+ { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
+ { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
+ },
+
+ { /* X87_FP_UOP */
+ 0x01, 0x04,
+ { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
+ { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
+ },
+
+ { /* X87_SIMD_MOVES_UOP */
+ 0x01, 0x2e,
+ { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
+ { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
+ },
+
+ { /* MACHINE_CLEAR */
+ 0x05, 0x02,
+ { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
+ { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
+ },
+
+ { /* GLOBAL_POWER_EVENTS */
+ 0x06, 0x13 /* older manual says 0x05, newer 0x13 */,
+ { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
+ { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
+ },
+
+ { /* TC_MS_XFER */
+ 0x00, 0x05,
+ { { CTR_MS_0, MSR_P4_MS_ESCR0},
+ { CTR_MS_2, MSR_P4_MS_ESCR1} }
+ },
+
+ { /* UOP_QUEUE_WRITES */
+ 0x00, 0x09,
+ { { CTR_MS_0, MSR_P4_MS_ESCR0},
+ { CTR_MS_2, MSR_P4_MS_ESCR1} }
+ },
+
+ { /* FRONT_END_EVENT */
+ 0x05, 0x08,
+ { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
+ { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
+ },
+
+ { /* EXECUTION_EVENT */
+ 0x05, 0x0c,
+ { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
+ { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
+ },
+
+ { /* REPLAY_EVENT */
+ 0x05, 0x09,
+ { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
+ { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
+ },
+
+ { /* INSTR_RETIRED */
+ 0x04, 0x02,
+ { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
+ { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
+ },
+
+ { /* UOPS_RETIRED */
+ 0x04, 0x01,
+ { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
+ { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
+ },
+
+ { /* UOP_TYPE */
+ 0x02, 0x02,
+ { { CTR_IQ_4, MSR_P4_RAT_ESCR0},
+ { CTR_IQ_5, MSR_P4_RAT_ESCR1} }
+ },
+
+ { /* RETIRED_MISPRED_BRANCH_TYPE */
+ 0x02, 0x05,
+ { { CTR_MS_0, MSR_P4_TBPU_ESCR0},
+ { CTR_MS_2, MSR_P4_TBPU_ESCR1} }
+ },
+
+ { /* RETIRED_BRANCH_TYPE */
+ 0x02, 0x04,
+ { { CTR_MS_0, MSR_P4_TBPU_ESCR0},
+ { CTR_MS_2, MSR_P4_TBPU_ESCR1} }
+ }
+};
+
+
+#define MISC_PMC_ENABLED_P(x) ((x) & 1 << 7)
+
+#define ESCR_RESERVED_BITS 0x80000003
+#define ESCR_CLEAR(escr) ((escr) &= ESCR_RESERVED_BITS)
+#define ESCR_SET_USR_0(escr, usr) ((escr) |= (((usr) & 1) << 2))
+#define ESCR_SET_OS_0(escr, os) ((escr) |= (((os) & 1) << 3))
+#define ESCR_SET_USR_1(escr, usr) ((escr) |= (((usr) & 1)))
+#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1))
+#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25))
+#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9))
+
+#define CCCR_RESERVED_BITS 0x38030FFF
+#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS)
+#define CCCR_SET_REQUIRED_BITS(cccr) ((cccr) |= 0x00030000)
+#define CCCR_SET_ESCR_SELECT(cccr, sel) ((cccr) |= (((sel) & 0x07) << 13))
+#define CCCR_SET_PMI_OVF_0(cccr) ((cccr) |= (1<<26))
+#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27))
+#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12))
+#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12))
+#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31))
+#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31)))
+
+
+/* this assigns a "stagger" to the current CPU, which is used throughout
+ the code in this module as an extra array offset, to select the "even"
+ or "odd" part of all the divided resources. */
+static unsigned int get_stagger(void)
+{
+#ifdef CONFIG_SMP
+ int cpu = smp_processor_id();
+ return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map));
+#endif
+ return 0;
+}
+
+
+/* finally, mediate access to a real hardware counter
+ by passing a "virtual" counter numer to this macro,
+ along with your stagger setting. */
+#define VIRT_CTR(stagger, i) ((i) + ((num_counters) * (stagger)))
+
+static unsigned long reset_value[NUM_COUNTERS_NON_HT];
+
+static void p4_shutdown(struct op_msrs const * const msrs)
+{
+ int i;
+
+ for (i = 0; i < num_counters; ++i) {
+ if (msrs->counters[i].addr)
+ release_perfctr_nmi(msrs->counters[i].addr);
+ }
+ /*
+ * some of the control registers are specially reserved in
+ * conjunction with the counter registers (hence the starting offset).
+ * This saves a few bits.
+ */
+ for (i = num_counters; i < num_controls; ++i) {
+ if (msrs->controls[i].addr)
+ release_evntsel_nmi(msrs->controls[i].addr);
+ }
+}
+
+static int p4_fill_in_addresses(struct op_msrs * const msrs)
+{
+ unsigned int i;
+ unsigned int addr, cccraddr, stag;
+
+ setup_num_counters();
+ stag = get_stagger();
+
+ /* the counter & cccr registers we pay attention to */
+ for (i = 0; i < num_counters; ++i) {
+ addr = p4_counters[VIRT_CTR(stag, i)].counter_address;
+ cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address;
+ if (reserve_perfctr_nmi(addr)) {
+ msrs->counters[i].addr = addr;
+ msrs->controls[i].addr = cccraddr;
+ }
+ }
+
+ /* 43 ESCR registers in three or four discontiguous group */
+ for (addr = MSR_P4_BSU_ESCR0 + stag;
+ addr < MSR_P4_IQ_ESCR0; ++i, addr += addr_increment()) {
+ if (reserve_evntsel_nmi(addr))
+ msrs->controls[i].addr = addr;
+ }
+
+ /* no IQ_ESCR0/1 on some models, we save a seconde time BSU_ESCR0/1
+ * to avoid special case in nmi_{save|restore}_registers() */
+ if (boot_cpu_data.x86_model >= 0x3) {
+ for (addr = MSR_P4_BSU_ESCR0 + stag;
+ addr <= MSR_P4_BSU_ESCR1; ++i, addr += addr_increment()) {
+ if (reserve_evntsel_nmi(addr))
+ msrs->controls[i].addr = addr;
+ }
+ } else {
+ for (addr = MSR_P4_IQ_ESCR0 + stag;
+ addr <= MSR_P4_IQ_ESCR1; ++i, addr += addr_increment()) {
+ if (reserve_evntsel_nmi(addr))
+ msrs->controls[i].addr = addr;
+ }
+ }
+
+ for (addr = MSR_P4_RAT_ESCR0 + stag;
+ addr <= MSR_P4_SSU_ESCR0; ++i, addr += addr_increment()) {
+ if (reserve_evntsel_nmi(addr))
+ msrs->controls[i].addr = addr;
+ }
+
+ for (addr = MSR_P4_MS_ESCR0 + stag;
+ addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) {
+ if (reserve_evntsel_nmi(addr))
+ msrs->controls[i].addr = addr;
+ }
+
+ for (addr = MSR_P4_IX_ESCR0 + stag;
+ addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) {
+ if (reserve_evntsel_nmi(addr))
+ msrs->controls[i].addr = addr;
+ }
+
+ /* there are 2 remaining non-contiguously located ESCRs */
+
+ if (num_counters == NUM_COUNTERS_NON_HT) {
+ /* standard non-HT CPUs handle both remaining ESCRs*/
+ if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5))
+ msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
+ if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4))
+ msrs->controls[i++].addr = MSR_P4_CRU_ESCR4;
+
+ } else if (stag == 0) {
+ /* HT CPUs give the first remainder to the even thread, as
+ the 32nd control register */
+ if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4))
+ msrs->controls[i++].addr = MSR_P4_CRU_ESCR4;
+
+ } else {
+ /* and two copies of the second to the odd thread,
+ for the 22st and 23nd control registers */
+ if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) {
+ msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
+ msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
+ }
+ }
+
+ for (i = 0; i < num_counters; ++i) {
+ if (!counter_config[i].enabled)
+ continue;
+ if (msrs->controls[i].addr)
+ continue;
+ op_x86_warn_reserved(i);
+ p4_shutdown(msrs);
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+
+static void pmc_setup_one_p4_counter(unsigned int ctr)
+{
+ int i;
+ int const maxbind = 2;
+ unsigned int cccr = 0;
+ unsigned int escr = 0;
+ unsigned int high = 0;
+ unsigned int counter_bit;
+ struct p4_event_binding *ev = NULL;
+ unsigned int stag;
+
+ stag = get_stagger();
+
+ /* convert from counter *number* to counter *bit* */
+ counter_bit = 1 << VIRT_CTR(stag, ctr);
+
+ /* find our event binding structure. */
+ if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) {
+ printk(KERN_ERR
+ "oprofile: P4 event code 0x%lx out of range\n",
+ counter_config[ctr].event);
+ return;
+ }
+
+ ev = &(p4_events[counter_config[ctr].event - 1]);
+
+ for (i = 0; i < maxbind; i++) {
+ if (ev->bindings[i].virt_counter & counter_bit) {
+
+ /* modify ESCR */
+ rdmsr(ev->bindings[i].escr_address, escr, high);
+ ESCR_CLEAR(escr);
+ if (stag == 0) {
+ ESCR_SET_USR_0(escr, counter_config[ctr].user);
+ ESCR_SET_OS_0(escr, counter_config[ctr].kernel);
+ } else {
+ ESCR_SET_USR_1(escr, counter_config[ctr].user);
+ ESCR_SET_OS_1(escr, counter_config[ctr].kernel);
+ }
+ ESCR_SET_EVENT_SELECT(escr, ev->event_select);
+ ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
+ wrmsr(ev->bindings[i].escr_address, escr, high);
+
+ /* modify CCCR */
+ rdmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address,
+ cccr, high);
+ CCCR_CLEAR(cccr);
+ CCCR_SET_REQUIRED_BITS(cccr);
+ CCCR_SET_ESCR_SELECT(cccr, ev->escr_select);
+ if (stag == 0)
+ CCCR_SET_PMI_OVF_0(cccr);
+ else
+ CCCR_SET_PMI_OVF_1(cccr);
+ wrmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address,
+ cccr, high);
+ return;
+ }
+ }
+
+ printk(KERN_ERR
+ "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n",
+ counter_config[ctr].event, stag, ctr);
+}
+
+
+static void p4_setup_ctrs(struct op_x86_model_spec const *model,
+ struct op_msrs const * const msrs)
+{
+ unsigned int i;
+ unsigned int low, high;
+ unsigned int stag;
+
+ stag = get_stagger();
+
+ rdmsr(MSR_IA32_MISC_ENABLE, low, high);
+ if (!MISC_PMC_ENABLED_P(low)) {
+ printk(KERN_ERR "oprofile: P4 PMC not available\n");
+ return;
+ }
+
+ /* clear the cccrs we will use */
+ for (i = 0; i < num_counters; i++) {
+ if (unlikely(!msrs->controls[i].addr))
+ continue;
+ rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
+ CCCR_CLEAR(low);
+ CCCR_SET_REQUIRED_BITS(low);
+ wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
+ }
+
+ /* clear all escrs (including those outside our concern) */
+ for (i = num_counters; i < num_controls; i++) {
+ if (unlikely(!msrs->controls[i].addr))
+ continue;
+ wrmsr(msrs->controls[i].addr, 0, 0);
+ }
+
+ /* setup all counters */
+ for (i = 0; i < num_counters; ++i) {
+ if (counter_config[i].enabled && msrs->controls[i].addr) {
+ reset_value[i] = counter_config[i].count;
+ pmc_setup_one_p4_counter(i);
+ wrmsrl(p4_counters[VIRT_CTR(stag, i)].counter_address,
+ -(u64)counter_config[i].count);
+ } else {
+ reset_value[i] = 0;
+ }
+ }
+}
+
+
+static int p4_check_ctrs(struct pt_regs * const regs,
+ struct op_msrs const * const msrs)
+{
+ unsigned long ctr, low, high, stag, real;
+ int i;
+
+ stag = get_stagger();
+
+ for (i = 0; i < num_counters; ++i) {
+
+ if (!reset_value[i])
+ continue;
+
+ /*
+ * there is some eccentricity in the hardware which
+ * requires that we perform 2 extra corrections:
+ *
+ * - check both the CCCR:OVF flag for overflow and the
+ * counter high bit for un-flagged overflows.
+ *
+ * - write the counter back twice to ensure it gets
+ * updated properly.
+ *
+ * the former seems to be related to extra NMIs happening
+ * during the current NMI; the latter is reported as errata
+ * N15 in intel doc 249199-029, pentium 4 specification
+ * update, though their suggested work-around does not
+ * appear to solve the problem.
+ */
+
+ real = VIRT_CTR(stag, i);
+
+ rdmsr(p4_counters[real].cccr_address, low, high);
+ rdmsr(p4_counters[real].counter_address, ctr, high);
+ if (CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) {
+ oprofile_add_sample(regs, i);
+ wrmsrl(p4_counters[real].counter_address,
+ -(u64)reset_value[i]);
+ CCCR_CLEAR_OVF(low);
+ wrmsr(p4_counters[real].cccr_address, low, high);
+ wrmsrl(p4_counters[real].counter_address,
+ -(u64)reset_value[i]);
+ }
+ }
+
+ /* P4 quirk: you have to re-unmask the apic vector */
+ apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
+
+ /* See op_model_ppro.c */
+ return 1;
+}
+
+
+static void p4_start(struct op_msrs const * const msrs)
+{
+ unsigned int low, high, stag;
+ int i;
+
+ stag = get_stagger();
+
+ for (i = 0; i < num_counters; ++i) {
+ if (!reset_value[i])
+ continue;
+ rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
+ CCCR_SET_ENABLE(low);
+ wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
+ }
+}
+
+
+static void p4_stop(struct op_msrs const * const msrs)
+{
+ unsigned int low, high, stag;
+ int i;
+
+ stag = get_stagger();
+
+ for (i = 0; i < num_counters; ++i) {
+ if (!reset_value[i])
+ continue;
+ rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
+ CCCR_SET_DISABLE(low);
+ wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
+ }
+}
+
+#ifdef CONFIG_SMP
+struct op_x86_model_spec op_p4_ht2_spec = {
+ .num_counters = NUM_COUNTERS_HT2,
+ .num_controls = NUM_CONTROLS_HT2,
+ .fill_in_addresses = &p4_fill_in_addresses,
+ .setup_ctrs = &p4_setup_ctrs,
+ .check_ctrs = &p4_check_ctrs,
+ .start = &p4_start,
+ .stop = &p4_stop,
+ .shutdown = &p4_shutdown
+};
+#endif
+
+struct op_x86_model_spec op_p4_spec = {
+ .num_counters = NUM_COUNTERS_NON_HT,
+ .num_controls = NUM_CONTROLS_NON_HT,
+ .fill_in_addresses = &p4_fill_in_addresses,
+ .setup_ctrs = &p4_setup_ctrs,
+ .check_ctrs = &p4_check_ctrs,
+ .start = &p4_start,
+ .stop = &p4_stop,
+ .shutdown = &p4_shutdown
+};
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
new file mode 100644
index 000000000..7913b6921
--- /dev/null
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -0,0 +1,245 @@
+/*
+ * @file op_model_ppro.h
+ * Family 6 perfmon and architectural perfmon MSR operations
+ *
+ * @remark Copyright 2002 OProfile authors
+ * @remark Copyright 2008 Intel Corporation
+ * @remark Read the file COPYING
+ *
+ * @author John Levon
+ * @author Philippe Elie
+ * @author Graydon Hoare
+ * @author Andi Kleen
+ * @author Robert Richter <robert.richter@amd.com>
+ */
+
+#include <linux/oprofile.h>
+#include <linux/slab.h>
+#include <asm/ptrace.h>
+#include <asm/msr.h>
+#include <asm/apic.h>
+#include <asm/nmi.h>
+
+#include "op_x86_model.h"
+#include "op_counter.h"
+
+static int num_counters = 2;
+static int counter_width = 32;
+
+#define MSR_PPRO_EVENTSEL_RESERVED ((0xFFFFFFFFULL<<32)|(1ULL<<21))
+
+static u64 reset_value[OP_MAX_COUNTER];
+
+static void ppro_shutdown(struct op_msrs const * const msrs)
+{
+ int i;
+
+ for (i = 0; i < num_counters; ++i) {
+ if (!msrs->counters[i].addr)
+ continue;
+ release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
+ release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
+ }
+}
+
+static int ppro_fill_in_addresses(struct op_msrs * const msrs)
+{
+ int i;
+
+ for (i = 0; i < num_counters; i++) {
+ if (!reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
+ goto fail;
+ if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) {
+ release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
+ goto fail;
+ }
+ /* both registers must be reserved */
+ msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
+ msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
+ continue;
+ fail:
+ if (!counter_config[i].enabled)
+ continue;
+ op_x86_warn_reserved(i);
+ ppro_shutdown(msrs);
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+
+static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
+ struct op_msrs const * const msrs)
+{
+ u64 val;
+ int i;
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
+ union cpuid10_eax eax;
+ eax.full = cpuid_eax(0xa);
+
+ /*
+ * For Core2 (family 6, model 15), don't reset the
+ * counter width:
+ */
+ if (!(eax.split.version_id == 0 &&
+ __this_cpu_read(cpu_info.x86) == 6 &&
+ __this_cpu_read(cpu_info.x86_model) == 15)) {
+
+ if (counter_width < eax.split.bit_width)
+ counter_width = eax.split.bit_width;
+ }
+ }
+
+ /* clear all counters */
+ for (i = 0; i < num_counters; ++i) {
+ if (!msrs->controls[i].addr)
+ continue;
+ rdmsrl(msrs->controls[i].addr, val);
+ if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
+ op_x86_warn_in_use(i);
+ val &= model->reserved;
+ wrmsrl(msrs->controls[i].addr, val);
+ /*
+ * avoid a false detection of ctr overflows in NMI *
+ * handler
+ */
+ wrmsrl(msrs->counters[i].addr, -1LL);
+ }
+
+ /* enable active counters */
+ for (i = 0; i < num_counters; ++i) {
+ if (counter_config[i].enabled && msrs->counters[i].addr) {
+ reset_value[i] = counter_config[i].count;
+ wrmsrl(msrs->counters[i].addr, -reset_value[i]);
+ rdmsrl(msrs->controls[i].addr, val);
+ val &= model->reserved;
+ val |= op_x86_get_ctrl(model, &counter_config[i]);
+ wrmsrl(msrs->controls[i].addr, val);
+ } else {
+ reset_value[i] = 0;
+ }
+ }
+}
+
+
+static int ppro_check_ctrs(struct pt_regs * const regs,
+ struct op_msrs const * const msrs)
+{
+ u64 val;
+ int i;
+
+ for (i = 0; i < num_counters; ++i) {
+ if (!reset_value[i])
+ continue;
+ rdmsrl(msrs->counters[i].addr, val);
+ if (val & (1ULL << (counter_width - 1)))
+ continue;
+ oprofile_add_sample(regs, i);
+ wrmsrl(msrs->counters[i].addr, -reset_value[i]);
+ }
+
+ /* Only P6 based Pentium M need to re-unmask the apic vector but it
+ * doesn't hurt other P6 variant */
+ apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
+
+ /* We can't work out if we really handled an interrupt. We
+ * might have caught a *second* counter just after overflowing
+ * the interrupt for this counter then arrives
+ * and we don't find a counter that's overflowed, so we
+ * would return 0 and get dazed + confused. Instead we always
+ * assume we found an overflow. This sucks.
+ */
+ return 1;
+}
+
+
+static void ppro_start(struct op_msrs const * const msrs)
+{
+ u64 val;
+ int i;
+
+ for (i = 0; i < num_counters; ++i) {
+ if (reset_value[i]) {
+ rdmsrl(msrs->controls[i].addr, val);
+ val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+ wrmsrl(msrs->controls[i].addr, val);
+ }
+ }
+}
+
+
+static void ppro_stop(struct op_msrs const * const msrs)
+{
+ u64 val;
+ int i;
+
+ for (i = 0; i < num_counters; ++i) {
+ if (!reset_value[i])
+ continue;
+ rdmsrl(msrs->controls[i].addr, val);
+ val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+ wrmsrl(msrs->controls[i].addr, val);
+ }
+}
+
+struct op_x86_model_spec op_ppro_spec = {
+ .num_counters = 2,
+ .num_controls = 2,
+ .reserved = MSR_PPRO_EVENTSEL_RESERVED,
+ .fill_in_addresses = &ppro_fill_in_addresses,
+ .setup_ctrs = &ppro_setup_ctrs,
+ .check_ctrs = &ppro_check_ctrs,
+ .start = &ppro_start,
+ .stop = &ppro_stop,
+ .shutdown = &ppro_shutdown
+};
+
+/*
+ * Architectural performance monitoring.
+ *
+ * Newer Intel CPUs (Core1+) have support for architectural
+ * events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details.
+ * The advantage of this is that it can be done without knowing about
+ * the specific CPU.
+ */
+
+static void arch_perfmon_setup_counters(void)
+{
+ union cpuid10_eax eax;
+
+ eax.full = cpuid_eax(0xa);
+
+ /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */
+ if (eax.split.version_id == 0 && boot_cpu_data.x86 == 6 &&
+ boot_cpu_data.x86_model == 15) {
+ eax.split.version_id = 2;
+ eax.split.num_counters = 2;
+ eax.split.bit_width = 40;
+ }
+
+ num_counters = min((int)eax.split.num_counters, OP_MAX_COUNTER);
+
+ op_arch_perfmon_spec.num_counters = num_counters;
+ op_arch_perfmon_spec.num_controls = num_counters;
+}
+
+static int arch_perfmon_init(struct oprofile_operations *ignore)
+{
+ arch_perfmon_setup_counters();
+ return 0;
+}
+
+struct op_x86_model_spec op_arch_perfmon_spec = {
+ .reserved = MSR_PPRO_EVENTSEL_RESERVED,
+ .init = &arch_perfmon_init,
+ /* num_counters/num_controls filled in at runtime */
+ .fill_in_addresses = &ppro_fill_in_addresses,
+ /* user space does the cpuid check for available events */
+ .setup_ctrs = &ppro_setup_ctrs,
+ .check_ctrs = &ppro_check_ctrs,
+ .start = &ppro_start,
+ .stop = &ppro_stop,
+ .shutdown = &ppro_shutdown
+};
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
new file mode 100644
index 000000000..71e8a6733
--- /dev/null
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -0,0 +1,90 @@
+/**
+ * @file op_x86_model.h
+ * interface to x86 model-specific MSR operations
+ *
+ * @remark Copyright 2002 OProfile authors
+ * @remark Read the file COPYING
+ *
+ * @author Graydon Hoare
+ * @author Robert Richter <robert.richter@amd.com>
+ */
+
+#ifndef OP_X86_MODEL_H
+#define OP_X86_MODEL_H
+
+#include <asm/types.h>
+#include <asm/perf_event.h>
+
+struct op_msr {
+ unsigned long addr;
+ u64 saved;
+};
+
+struct op_msrs {
+ struct op_msr *counters;
+ struct op_msr *controls;
+ struct op_msr *multiplex;
+};
+
+struct pt_regs;
+
+struct oprofile_operations;
+
+/* The model vtable abstracts the differences between
+ * various x86 CPU models' perfctr support.
+ */
+struct op_x86_model_spec {
+ unsigned int num_counters;
+ unsigned int num_controls;
+ unsigned int num_virt_counters;
+ u64 reserved;
+ u16 event_mask;
+ int (*init)(struct oprofile_operations *ops);
+ int (*fill_in_addresses)(struct op_msrs * const msrs);
+ void (*setup_ctrs)(struct op_x86_model_spec const *model,
+ struct op_msrs const * const msrs);
+ int (*check_ctrs)(struct pt_regs * const regs,
+ struct op_msrs const * const msrs);
+ void (*start)(struct op_msrs const * const msrs);
+ void (*stop)(struct op_msrs const * const msrs);
+ void (*shutdown)(struct op_msrs const * const msrs);
+#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
+ void (*switch_ctrl)(struct op_x86_model_spec const *model,
+ struct op_msrs const * const msrs);
+#endif
+};
+
+struct op_counter_config;
+
+static inline void op_x86_warn_in_use(int counter)
+{
+ /*
+ * The warning indicates an already running counter. If
+ * oprofile doesn't collect data, then try using a different
+ * performance counter on your platform to monitor the desired
+ * event. Delete counter #%d from the desired event by editing
+ * the /usr/share/oprofile/%s/<cpu>/events file. If the event
+ * cannot be monitored by any other counter, contact your
+ * hardware or BIOS vendor.
+ */
+ pr_warning("oprofile: counter #%d on cpu #%d may already be used\n",
+ counter, smp_processor_id());
+}
+
+static inline void op_x86_warn_reserved(int counter)
+{
+ pr_warning("oprofile: counter #%d is already reserved\n", counter);
+}
+
+extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
+ struct op_counter_config *counter_config);
+extern int op_x86_phys_to_virt(int phys);
+extern int op_x86_virt_to_phys(int virt);
+
+extern struct op_x86_model_spec op_ppro_spec;
+extern struct op_x86_model_spec op_p4_spec;
+extern struct op_x86_model_spec op_p4_ht2_spec;
+extern struct op_x86_model_spec op_amd_spec;
+extern struct op_x86_model_spec op_arch_perfmon_spec;
+
+#endif /* OP_X86_MODEL_H */
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
new file mode 100644
index 000000000..c806b57d3
--- /dev/null
+++ b/arch/x86/pci/Makefile
@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y := i386.o init.o
+
+obj-$(CONFIG_PCI_BIOS) += pcbios.o
+obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o
+obj-$(CONFIG_PCI_DIRECT) += direct.o
+obj-$(CONFIG_PCI_OLPC) += olpc.o
+obj-$(CONFIG_PCI_XEN) += xen.o
+
+obj-y += fixup.o
+obj-$(CONFIG_X86_INTEL_CE) += ce4100.o
+obj-$(CONFIG_ACPI) += acpi.o
+obj-y += legacy.o irq.o
+
+obj-$(CONFIG_STA2X11) += sta2x11-fixup.o
+
+obj-$(CONFIG_X86_NUMACHIP) += numachip.o
+
+obj-$(CONFIG_X86_INTEL_MID) += intel_mid_pci.o
+
+obj-y += common.o early.o
+obj-y += bus_numa.o
+
+obj-$(CONFIG_AMD_NB) += amd_bus.o
+obj-$(CONFIG_PCI_CNB20LE_QUIRK) += broadcom_bus.o
+
+ifeq ($(CONFIG_PCI_DEBUG),y)
+EXTRA_CFLAGS += -DDEBUG
+endif
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
new file mode 100644
index 000000000..5559dcadd
--- /dev/null
+++ b/arch/x86/pci/acpi.c
@@ -0,0 +1,425 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/dmi.h>
+#include <linux/slab.h>
+#include <linux/pci-acpi.h>
+#include <asm/numa.h>
+#include <asm/pci_x86.h>
+
+struct pci_root_info {
+ struct acpi_pci_root_info common;
+ struct pci_sysdata sd;
+#ifdef CONFIG_PCI_MMCONFIG
+ bool mcfg_added;
+ u8 start_bus;
+ u8 end_bus;
+#endif
+};
+
+static bool pci_use_crs = true;
+static bool pci_ignore_seg = false;
+
+static int __init set_use_crs(const struct dmi_system_id *id)
+{
+ pci_use_crs = true;
+ return 0;
+}
+
+static int __init set_nouse_crs(const struct dmi_system_id *id)
+{
+ pci_use_crs = false;
+ return 0;
+}
+
+static int __init set_ignore_seg(const struct dmi_system_id *id)
+{
+ printk(KERN_INFO "PCI: %s detected: ignoring ACPI _SEG\n", id->ident);
+ pci_ignore_seg = true;
+ return 0;
+}
+
+static const struct dmi_system_id pci_crs_quirks[] __initconst = {
+ /* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */
+ {
+ .callback = set_use_crs,
+ .ident = "IBM System x3800",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "x3800"),
+ },
+ },
+ /* https://bugzilla.kernel.org/show_bug.cgi?id=16007 */
+ /* 2006 AMD HT/VIA system with two host bridges */
+ {
+ .callback = set_use_crs,
+ .ident = "ASRock ALiveSATA2-GLAN",
+ .matches = {
+ DMI_MATCH(DMI_PRODUCT_NAME, "ALiveSATA2-GLAN"),
+ },
+ },
+ /* https://bugzilla.kernel.org/show_bug.cgi?id=30552 */
+ /* 2006 AMD HT/VIA system with two host bridges */
+ {
+ .callback = set_use_crs,
+ .ident = "ASUS M2V-MX SE",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
+ DMI_MATCH(DMI_BOARD_NAME, "M2V-MX SE"),
+ DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
+ },
+ },
+ /* https://bugzilla.kernel.org/show_bug.cgi?id=42619 */
+ {
+ .callback = set_use_crs,
+ .ident = "MSI MS-7253",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "MICRO-STAR INTERNATIONAL CO., LTD"),
+ DMI_MATCH(DMI_BOARD_NAME, "MS-7253"),
+ DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
+ },
+ },
+ /* https://bugs.launchpad.net/ubuntu/+source/alsa-driver/+bug/931368 */
+ /* https://bugs.launchpad.net/ubuntu/+source/alsa-driver/+bug/1033299 */
+ {
+ .callback = set_use_crs,
+ .ident = "Foxconn K8M890-8237A",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Foxconn"),
+ DMI_MATCH(DMI_BOARD_NAME, "K8M890-8237A"),
+ DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
+ },
+ },
+
+ /* Now for the blacklist.. */
+
+ /* https://bugzilla.redhat.com/show_bug.cgi?id=769657 */
+ {
+ .callback = set_nouse_crs,
+ .ident = "Dell Studio 1557",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Studio 1557"),
+ DMI_MATCH(DMI_BIOS_VERSION, "A09"),
+ },
+ },
+ /* https://bugzilla.redhat.com/show_bug.cgi?id=769657 */
+ {
+ .callback = set_nouse_crs,
+ .ident = "Thinkpad SL510",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
+ DMI_MATCH(DMI_BOARD_NAME, "2847DFG"),
+ DMI_MATCH(DMI_BIOS_VERSION, "6JET85WW (1.43 )"),
+ },
+ },
+ /* https://bugzilla.kernel.org/show_bug.cgi?id=42606 */
+ {
+ .callback = set_nouse_crs,
+ .ident = "Supermicro X8DTH",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "X8DTH-i/6/iF/6F"),
+ DMI_MATCH(DMI_BIOS_VERSION, "2.0a"),
+ },
+ },
+
+ /* https://bugzilla.kernel.org/show_bug.cgi?id=15362 */
+ {
+ .callback = set_ignore_seg,
+ .ident = "HP xw9300",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP xw9300 Workstation"),
+ },
+ },
+ {}
+};
+
+void __init pci_acpi_crs_quirks(void)
+{
+ int year = dmi_get_bios_year();
+
+ if (year >= 0 && year < 2008 && iomem_resource.end <= 0xffffffff)
+ pci_use_crs = false;
+
+ dmi_check_system(pci_crs_quirks);
+
+ /*
+ * If the user specifies "pci=use_crs" or "pci=nocrs" explicitly, that
+ * takes precedence over anything we figured out above.
+ */
+ if (pci_probe & PCI_ROOT_NO_CRS)
+ pci_use_crs = false;
+ else if (pci_probe & PCI_USE__CRS)
+ pci_use_crs = true;
+
+ printk(KERN_INFO "PCI: %s host bridge windows from ACPI; "
+ "if necessary, use \"pci=%s\" and report a bug\n",
+ pci_use_crs ? "Using" : "Ignoring",
+ pci_use_crs ? "nocrs" : "use_crs");
+}
+
+#ifdef CONFIG_PCI_MMCONFIG
+static int check_segment(u16 seg, struct device *dev, char *estr)
+{
+ if (seg) {
+ dev_err(dev,
+ "%s can't access PCI configuration "
+ "space under this host bridge.\n",
+ estr);
+ return -EIO;
+ }
+
+ /*
+ * Failure in adding MMCFG information is not fatal,
+ * just can't access extended configuration space of
+ * devices under this host bridge.
+ */
+ dev_warn(dev,
+ "%s can't access extended PCI configuration "
+ "space under this bridge.\n",
+ estr);
+
+ return 0;
+}
+
+static int setup_mcfg_map(struct acpi_pci_root_info *ci)
+{
+ int result, seg;
+ struct pci_root_info *info;
+ struct acpi_pci_root *root = ci->root;
+ struct device *dev = &ci->bridge->dev;
+
+ info = container_of(ci, struct pci_root_info, common);
+ info->start_bus = (u8)root->secondary.start;
+ info->end_bus = (u8)root->secondary.end;
+ info->mcfg_added = false;
+ seg = info->sd.domain;
+
+ /* return success if MMCFG is not in use */
+ if (raw_pci_ext_ops && raw_pci_ext_ops != &pci_mmcfg)
+ return 0;
+
+ if (!(pci_probe & PCI_PROBE_MMCONF))
+ return check_segment(seg, dev, "MMCONFIG is disabled,");
+
+ result = pci_mmconfig_insert(dev, seg, info->start_bus, info->end_bus,
+ root->mcfg_addr);
+ if (result == 0) {
+ /* enable MMCFG if it hasn't been enabled yet */
+ if (raw_pci_ext_ops == NULL)
+ raw_pci_ext_ops = &pci_mmcfg;
+ info->mcfg_added = true;
+ } else if (result != -EEXIST)
+ return check_segment(seg, dev,
+ "fail to add MMCONFIG information,");
+
+ return 0;
+}
+
+static void teardown_mcfg_map(struct acpi_pci_root_info *ci)
+{
+ struct pci_root_info *info;
+
+ info = container_of(ci, struct pci_root_info, common);
+ if (info->mcfg_added) {
+ pci_mmconfig_delete(info->sd.domain,
+ info->start_bus, info->end_bus);
+ info->mcfg_added = false;
+ }
+}
+#else
+static int setup_mcfg_map(struct acpi_pci_root_info *ci)
+{
+ return 0;
+}
+
+static void teardown_mcfg_map(struct acpi_pci_root_info *ci)
+{
+}
+#endif
+
+static int pci_acpi_root_get_node(struct acpi_pci_root *root)
+{
+ int busnum = root->secondary.start;
+ struct acpi_device *device = root->device;
+ int node = acpi_get_node(device->handle);
+
+ if (node == NUMA_NO_NODE) {
+ node = x86_pci_root_bus_node(busnum);
+ if (node != 0 && node != NUMA_NO_NODE)
+ dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n",
+ node);
+ }
+ if (node != NUMA_NO_NODE && !node_online(node))
+ node = NUMA_NO_NODE;
+
+ return node;
+}
+
+static int pci_acpi_root_init_info(struct acpi_pci_root_info *ci)
+{
+ return setup_mcfg_map(ci);
+}
+
+static void pci_acpi_root_release_info(struct acpi_pci_root_info *ci)
+{
+ teardown_mcfg_map(ci);
+ kfree(container_of(ci, struct pci_root_info, common));
+}
+
+/*
+ * An IO port or MMIO resource assigned to a PCI host bridge may be
+ * consumed by the host bridge itself or available to its child
+ * bus/devices. The ACPI specification defines a bit (Producer/Consumer)
+ * to tell whether the resource is consumed by the host bridge itself,
+ * but firmware hasn't used that bit consistently, so we can't rely on it.
+ *
+ * On x86 and IA64 platforms, all IO port and MMIO resources are assumed
+ * to be available to child bus/devices except one special case:
+ * IO port [0xCF8-0xCFF] is consumed by the host bridge itself
+ * to access PCI configuration space.
+ *
+ * So explicitly filter out PCI CFG IO ports[0xCF8-0xCFF].
+ */
+static bool resource_is_pcicfg_ioport(struct resource *res)
+{
+ return (res->flags & IORESOURCE_IO) &&
+ res->start == 0xCF8 && res->end == 0xCFF;
+}
+
+static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci)
+{
+ struct acpi_device *device = ci->bridge;
+ int busnum = ci->root->secondary.start;
+ struct resource_entry *entry, *tmp;
+ int status;
+
+ status = acpi_pci_probe_root_resources(ci);
+ if (pci_use_crs) {
+ resource_list_for_each_entry_safe(entry, tmp, &ci->resources)
+ if (resource_is_pcicfg_ioport(entry->res))
+ resource_list_destroy_entry(entry);
+ return status;
+ }
+
+ resource_list_for_each_entry_safe(entry, tmp, &ci->resources) {
+ dev_printk(KERN_DEBUG, &device->dev,
+ "host bridge window %pR (ignored)\n", entry->res);
+ resource_list_destroy_entry(entry);
+ }
+ x86_pci_root_bus_resources(busnum, &ci->resources);
+
+ return 0;
+}
+
+static struct acpi_pci_root_ops acpi_pci_root_ops = {
+ .pci_ops = &pci_root_ops,
+ .init_info = pci_acpi_root_init_info,
+ .release_info = pci_acpi_root_release_info,
+ .prepare_resources = pci_acpi_root_prepare_resources,
+};
+
+struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
+{
+ int domain = root->segment;
+ int busnum = root->secondary.start;
+ int node = pci_acpi_root_get_node(root);
+ struct pci_bus *bus;
+
+ if (pci_ignore_seg)
+ root->segment = domain = 0;
+
+ if (domain && !pci_domains_supported) {
+ printk(KERN_WARNING "pci_bus %04x:%02x: "
+ "ignored (multiple domains not supported)\n",
+ domain, busnum);
+ return NULL;
+ }
+
+ bus = pci_find_bus(domain, busnum);
+ if (bus) {
+ /*
+ * If the desired bus has been scanned already, replace
+ * its bus->sysdata.
+ */
+ struct pci_sysdata sd = {
+ .domain = domain,
+ .node = node,
+ .companion = root->device
+ };
+
+ memcpy(bus->sysdata, &sd, sizeof(sd));
+ } else {
+ struct pci_root_info *info;
+
+ info = kzalloc_node(sizeof(*info), GFP_KERNEL, node);
+ if (!info)
+ dev_err(&root->device->dev,
+ "pci_bus %04x:%02x: ignored (out of memory)\n",
+ domain, busnum);
+ else {
+ info->sd.domain = domain;
+ info->sd.node = node;
+ info->sd.companion = root->device;
+ bus = acpi_pci_root_create(root, &acpi_pci_root_ops,
+ &info->common, &info->sd);
+ }
+ }
+
+ /* After the PCI-E bus has been walked and all devices discovered,
+ * configure any settings of the fabric that might be necessary.
+ */
+ if (bus) {
+ struct pci_bus *child;
+ list_for_each_entry(child, &bus->children, node)
+ pcie_bus_configure_settings(child);
+ }
+
+ return bus;
+}
+
+int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
+{
+ /*
+ * We pass NULL as parent to pci_create_root_bus(), so if it is not NULL
+ * here, pci_create_root_bus() has been called by someone else and
+ * sysdata is likely to be different from what we expect. Let it go in
+ * that case.
+ */
+ if (!bridge->dev.parent) {
+ struct pci_sysdata *sd = bridge->bus->sysdata;
+ ACPI_COMPANION_SET(&bridge->dev, sd->companion);
+ }
+ return 0;
+}
+
+int __init pci_acpi_init(void)
+{
+ struct pci_dev *dev = NULL;
+
+ if (acpi_noirq)
+ return -ENODEV;
+
+ printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n");
+ acpi_irq_penalty_init();
+ pcibios_enable_irq = acpi_pci_irq_enable;
+ pcibios_disable_irq = acpi_pci_irq_disable;
+ x86_init.pci.init_irq = x86_init_noop;
+
+ if (pci_routeirq) {
+ /*
+ * PCI IRQ routing is set up by pci_enable_device(), but we
+ * also do it here in case there are still broken drivers that
+ * don't use pci_enable_device().
+ */
+ printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n");
+ for_each_pci_dev(dev)
+ acpi_pci_irq_enable(dev);
+ }
+
+ return 0;
+}
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
new file mode 100644
index 000000000..649bdde63
--- /dev/null
+++ b/arch/x86/pci/amd_bus.c
@@ -0,0 +1,402 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/topology.h>
+#include <linux/cpu.h>
+#include <linux/range.h>
+
+#include <asm/amd_nb.h>
+#include <asm/pci_x86.h>
+
+#include <asm/pci-direct.h>
+
+#include "bus_numa.h"
+
+#define AMD_NB_F0_NODE_ID 0x60
+#define AMD_NB_F0_UNIT_ID 0x64
+#define AMD_NB_F1_CONFIG_MAP_REG 0xe0
+
+#define RANGE_NUM 16
+#define AMD_NB_F1_CONFIG_MAP_RANGES 4
+
+struct amd_hostbridge {
+ u32 bus;
+ u32 slot;
+ u32 device;
+};
+
+/*
+ * IMPORTANT NOTE:
+ * hb_probes[] and early_root_info_init() is in maintenance mode.
+ * It only supports K8, Fam10h, Fam11h, and Fam15h_00h-0fh .
+ * Future processor will rely on information in ACPI.
+ */
+static struct amd_hostbridge hb_probes[] __initdata = {
+ { 0, 0x18, 0x1100 }, /* K8 */
+ { 0, 0x18, 0x1200 }, /* Family10h */
+ { 0xff, 0, 0x1200 }, /* Family10h */
+ { 0, 0x18, 0x1300 }, /* Family11h */
+ { 0, 0x18, 0x1600 }, /* Family15h */
+};
+
+static struct pci_root_info __init *find_pci_root_info(int node, int link)
+{
+ struct pci_root_info *info;
+
+ /* find the position */
+ list_for_each_entry(info, &pci_root_infos, list)
+ if (info->node == node && info->link == link)
+ return info;
+
+ return NULL;
+}
+
+/**
+ * early_root_info_init()
+ * called before pcibios_scan_root and pci_scan_bus
+ * fills the mp_bus_to_cpumask array based according
+ * to the LDT Bus Number Registers found in the northbridge.
+ */
+static int __init early_root_info_init(void)
+{
+ int i;
+ unsigned bus;
+ unsigned slot;
+ int node;
+ int link;
+ int def_node;
+ int def_link;
+ struct pci_root_info *info;
+ u32 reg;
+ u64 start;
+ u64 end;
+ struct range range[RANGE_NUM];
+ u64 val;
+ u32 address;
+ bool found;
+ struct resource fam10h_mmconf_res, *fam10h_mmconf;
+ u64 fam10h_mmconf_start;
+ u64 fam10h_mmconf_end;
+
+ if (!early_pci_allowed())
+ return -1;
+
+ found = false;
+ for (i = 0; i < ARRAY_SIZE(hb_probes); i++) {
+ u32 id;
+ u16 device;
+ u16 vendor;
+
+ bus = hb_probes[i].bus;
+ slot = hb_probes[i].slot;
+ id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID);
+ vendor = id & 0xffff;
+ device = (id>>16) & 0xffff;
+
+ if (vendor != PCI_VENDOR_ID_AMD)
+ continue;
+
+ if (hb_probes[i].device == device) {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
+ return 0;
+
+ /*
+ * We should learn topology and routing information from _PXM and
+ * _CRS methods in the ACPI namespace. We extract node numbers
+ * here to work around BIOSes that don't supply _PXM.
+ */
+ for (i = 0; i < AMD_NB_F1_CONFIG_MAP_RANGES; i++) {
+ int min_bus;
+ int max_bus;
+ reg = read_pci_config(bus, slot, 1,
+ AMD_NB_F1_CONFIG_MAP_REG + (i << 2));
+
+ /* Check if that register is enabled for bus range */
+ if ((reg & 7) != 3)
+ continue;
+
+ min_bus = (reg >> 16) & 0xff;
+ max_bus = (reg >> 24) & 0xff;
+ node = (reg >> 4) & 0x07;
+ link = (reg >> 8) & 0x03;
+
+ info = alloc_pci_root_info(min_bus, max_bus, node, link);
+ }
+
+ /*
+ * The following code extracts routing information for use on old
+ * systems where Linux doesn't automatically use host bridge _CRS
+ * methods (or when the user specifies "pci=nocrs").
+ *
+ * We only do this through Fam11h, because _CRS should be enough on
+ * newer systems.
+ */
+ if (boot_cpu_data.x86 > 0x11)
+ return 0;
+
+ /* get the default node and link for left over res */
+ reg = read_pci_config(bus, slot, 0, AMD_NB_F0_NODE_ID);
+ def_node = (reg >> 8) & 0x07;
+ reg = read_pci_config(bus, slot, 0, AMD_NB_F0_UNIT_ID);
+ def_link = (reg >> 8) & 0x03;
+
+ memset(range, 0, sizeof(range));
+ add_range(range, RANGE_NUM, 0, 0, 0xffff + 1);
+ /* io port resource */
+ for (i = 0; i < 4; i++) {
+ reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3));
+ if (!(reg & 3))
+ continue;
+
+ start = reg & 0xfff000;
+ reg = read_pci_config(bus, slot, 1, 0xc4 + (i << 3));
+ node = reg & 0x07;
+ link = (reg >> 4) & 0x03;
+ end = (reg & 0xfff000) | 0xfff;
+
+ info = find_pci_root_info(node, link);
+ if (!info)
+ continue; /* not found */
+
+ printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n",
+ node, link, start, end);
+
+ /* kernel only handle 16 bit only */
+ if (end > 0xffff)
+ end = 0xffff;
+ update_res(info, start, end, IORESOURCE_IO, 1);
+ subtract_range(range, RANGE_NUM, start, end + 1);
+ }
+ /* add left over io port range to def node/link, [0, 0xffff] */
+ /* find the position */
+ info = find_pci_root_info(def_node, def_link);
+ if (info) {
+ for (i = 0; i < RANGE_NUM; i++) {
+ if (!range[i].end)
+ continue;
+
+ update_res(info, range[i].start, range[i].end - 1,
+ IORESOURCE_IO, 1);
+ }
+ }
+
+ memset(range, 0, sizeof(range));
+ /* 0xfd00000000-0xffffffffff for HT */
+ end = cap_resource((0xfdULL<<32) - 1);
+ end++;
+ add_range(range, RANGE_NUM, 0, 0, end);
+
+ /* need to take out [0, TOM) for RAM*/
+ address = MSR_K8_TOP_MEM1;
+ rdmsrl(address, val);
+ end = (val & 0xffffff800000ULL);
+ printk(KERN_INFO "TOM: %016llx aka %lldM\n", end, end>>20);
+ if (end < (1ULL<<32))
+ subtract_range(range, RANGE_NUM, 0, end);
+
+ /* get mmconfig */
+ fam10h_mmconf = amd_get_mmconfig_range(&fam10h_mmconf_res);
+ /* need to take out mmconf range */
+ if (fam10h_mmconf) {
+ printk(KERN_DEBUG "Fam 10h mmconf %pR\n", fam10h_mmconf);
+ fam10h_mmconf_start = fam10h_mmconf->start;
+ fam10h_mmconf_end = fam10h_mmconf->end;
+ subtract_range(range, RANGE_NUM, fam10h_mmconf_start,
+ fam10h_mmconf_end + 1);
+ } else {
+ fam10h_mmconf_start = 0;
+ fam10h_mmconf_end = 0;
+ }
+
+ /* mmio resource */
+ for (i = 0; i < 8; i++) {
+ reg = read_pci_config(bus, slot, 1, 0x80 + (i << 3));
+ if (!(reg & 3))
+ continue;
+
+ start = reg & 0xffffff00; /* 39:16 on 31:8*/
+ start <<= 8;
+ reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3));
+ node = reg & 0x07;
+ link = (reg >> 4) & 0x03;
+ end = (reg & 0xffffff00);
+ end <<= 8;
+ end |= 0xffff;
+
+ info = find_pci_root_info(node, link);
+
+ if (!info)
+ continue;
+
+ printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]",
+ node, link, start, end);
+ /*
+ * some sick allocation would have range overlap with fam10h
+ * mmconf range, so need to update start and end.
+ */
+ if (fam10h_mmconf_end) {
+ int changed = 0;
+ u64 endx = 0;
+ if (start >= fam10h_mmconf_start &&
+ start <= fam10h_mmconf_end) {
+ start = fam10h_mmconf_end + 1;
+ changed = 1;
+ }
+
+ if (end >= fam10h_mmconf_start &&
+ end <= fam10h_mmconf_end) {
+ end = fam10h_mmconf_start - 1;
+ changed = 1;
+ }
+
+ if (start < fam10h_mmconf_start &&
+ end > fam10h_mmconf_end) {
+ /* we got a hole */
+ endx = fam10h_mmconf_start - 1;
+ update_res(info, start, endx, IORESOURCE_MEM, 0);
+ subtract_range(range, RANGE_NUM, start,
+ endx + 1);
+ printk(KERN_CONT " ==> [%llx, %llx]", start, endx);
+ start = fam10h_mmconf_end + 1;
+ changed = 1;
+ }
+ if (changed) {
+ if (start <= end) {
+ printk(KERN_CONT " %s [%llx, %llx]", endx ? "and" : "==>", start, end);
+ } else {
+ printk(KERN_CONT "%s\n", endx?"":" ==> none");
+ continue;
+ }
+ }
+ }
+
+ update_res(info, cap_resource(start), cap_resource(end),
+ IORESOURCE_MEM, 1);
+ subtract_range(range, RANGE_NUM, start, end + 1);
+ printk(KERN_CONT "\n");
+ }
+
+ /* need to take out [4G, TOM2) for RAM*/
+ /* SYS_CFG */
+ address = MSR_K8_SYSCFG;
+ rdmsrl(address, val);
+ /* TOP_MEM2 is enabled? */
+ if (val & (1<<21)) {
+ /* TOP_MEM2 */
+ address = MSR_K8_TOP_MEM2;
+ rdmsrl(address, val);
+ end = (val & 0xffffff800000ULL);
+ printk(KERN_INFO "TOM2: %016llx aka %lldM\n", end, end>>20);
+ subtract_range(range, RANGE_NUM, 1ULL<<32, end);
+ }
+
+ /*
+ * add left over mmio range to def node/link ?
+ * that is tricky, just record range in from start_min to 4G
+ */
+ info = find_pci_root_info(def_node, def_link);
+ if (info) {
+ for (i = 0; i < RANGE_NUM; i++) {
+ if (!range[i].end)
+ continue;
+
+ update_res(info, cap_resource(range[i].start),
+ cap_resource(range[i].end - 1),
+ IORESOURCE_MEM, 1);
+ }
+ }
+
+ list_for_each_entry(info, &pci_root_infos, list) {
+ int busnum;
+ struct pci_root_res *root_res;
+
+ busnum = info->busn.start;
+ printk(KERN_DEBUG "bus: %pR on node %x link %x\n",
+ &info->busn, info->node, info->link);
+ list_for_each_entry(root_res, &info->resources, list)
+ printk(KERN_DEBUG "bus: %02x %pR\n",
+ busnum, &root_res->res);
+ }
+
+ return 0;
+}
+
+#define ENABLE_CF8_EXT_CFG (1ULL << 46)
+
+static int amd_bus_cpu_online(unsigned int cpu)
+{
+ u64 reg;
+
+ rdmsrl(MSR_AMD64_NB_CFG, reg);
+ if (!(reg & ENABLE_CF8_EXT_CFG)) {
+ reg |= ENABLE_CF8_EXT_CFG;
+ wrmsrl(MSR_AMD64_NB_CFG, reg);
+ }
+ return 0;
+}
+
+static void __init pci_enable_pci_io_ecs(void)
+{
+#ifdef CONFIG_AMD_NB
+ unsigned int i, n;
+
+ for (n = i = 0; !n && amd_nb_bus_dev_ranges[i].dev_limit; ++i) {
+ u8 bus = amd_nb_bus_dev_ranges[i].bus;
+ u8 slot = amd_nb_bus_dev_ranges[i].dev_base;
+ u8 limit = amd_nb_bus_dev_ranges[i].dev_limit;
+
+ for (; slot < limit; ++slot) {
+ u32 val = read_pci_config(bus, slot, 3, 0);
+
+ if (!early_is_amd_nb(val))
+ continue;
+
+ val = read_pci_config(bus, slot, 3, 0x8c);
+ if (!(val & (ENABLE_CF8_EXT_CFG >> 32))) {
+ val |= ENABLE_CF8_EXT_CFG >> 32;
+ write_pci_config(bus, slot, 3, 0x8c, val);
+ }
+ ++n;
+ }
+ }
+#endif
+}
+
+static int __init pci_io_ecs_init(void)
+{
+ int ret;
+
+ /* assume all cpus from fam10h have IO ECS */
+ if (boot_cpu_data.x86 < 0x10)
+ return 0;
+
+ /* Try the PCI method first. */
+ if (early_pci_allowed())
+ pci_enable_pci_io_ecs();
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "pci/amd_bus:online",
+ amd_bus_cpu_online, NULL);
+ WARN_ON(ret < 0);
+
+ pci_probe |= PCI_HAS_IO_ECS;
+
+ return 0;
+}
+
+static int __init amd_postcore_init(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return 0;
+
+ early_root_info_init();
+ pci_io_ecs_init();
+
+ return 0;
+}
+
+postcore_initcall(amd_postcore_init);
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
new file mode 100644
index 000000000..ca1e8e6dc
--- /dev/null
+++ b/arch/x86/pci/broadcom_bus.c
@@ -0,0 +1,116 @@
+/*
+ * Read address ranges from a Broadcom CNB20LE Host Bridge
+ *
+ * Copyright (c) 2010 Ira W. Snyder <iws@ovro.caltech.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/acpi.h>
+#include <linux/delay.h>
+#include <linux/dmi.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <asm/pci_x86.h>
+#include <asm/pci-direct.h>
+
+#include "bus_numa.h"
+
+static void __init cnb20le_res(u8 bus, u8 slot, u8 func)
+{
+ struct pci_root_info *info;
+ struct pci_root_res *root_res;
+ struct resource res;
+ u16 word1, word2;
+ u8 fbus, lbus;
+
+ /* read the PCI bus numbers */
+ fbus = read_pci_config_byte(bus, slot, func, 0x44);
+ lbus = read_pci_config_byte(bus, slot, func, 0x45);
+ info = alloc_pci_root_info(fbus, lbus, 0, 0);
+
+ /*
+ * Add the legacy IDE ports on bus 0
+ *
+ * These do not exist anywhere in the bridge registers, AFAICT. I do
+ * not have the datasheet, so this is the best I can do.
+ */
+ if (fbus == 0) {
+ update_res(info, 0x01f0, 0x01f7, IORESOURCE_IO, 0);
+ update_res(info, 0x03f6, 0x03f6, IORESOURCE_IO, 0);
+ update_res(info, 0x0170, 0x0177, IORESOURCE_IO, 0);
+ update_res(info, 0x0376, 0x0376, IORESOURCE_IO, 0);
+ update_res(info, 0xffa0, 0xffaf, IORESOURCE_IO, 0);
+ }
+
+ /* read the non-prefetchable memory window */
+ word1 = read_pci_config_16(bus, slot, func, 0xc0);
+ word2 = read_pci_config_16(bus, slot, func, 0xc2);
+ if (word1 != word2) {
+ res.start = ((resource_size_t) word1 << 16) | 0x0000;
+ res.end = ((resource_size_t) word2 << 16) | 0xffff;
+ res.flags = IORESOURCE_MEM;
+ update_res(info, res.start, res.end, res.flags, 0);
+ }
+
+ /* read the prefetchable memory window */
+ word1 = read_pci_config_16(bus, slot, func, 0xc4);
+ word2 = read_pci_config_16(bus, slot, func, 0xc6);
+ if (word1 != word2) {
+ res.start = ((resource_size_t) word1 << 16) | 0x0000;
+ res.end = ((resource_size_t) word2 << 16) | 0xffff;
+ res.flags = IORESOURCE_MEM | IORESOURCE_PREFETCH;
+ update_res(info, res.start, res.end, res.flags, 0);
+ }
+
+ /* read the IO port window */
+ word1 = read_pci_config_16(bus, slot, func, 0xd0);
+ word2 = read_pci_config_16(bus, slot, func, 0xd2);
+ if (word1 != word2) {
+ res.start = word1;
+ res.end = word2;
+ res.flags = IORESOURCE_IO;
+ update_res(info, res.start, res.end, res.flags, 0);
+ }
+
+ /* print information about this host bridge */
+ res.start = fbus;
+ res.end = lbus;
+ res.flags = IORESOURCE_BUS;
+ printk(KERN_INFO "CNB20LE PCI Host Bridge (domain 0000 %pR)\n", &res);
+
+ list_for_each_entry(root_res, &info->resources, list)
+ printk(KERN_INFO "host bridge window %pR\n", &root_res->res);
+}
+
+static int __init broadcom_postcore_init(void)
+{
+ u8 bus = 0, slot = 0;
+ u32 id;
+ u16 vendor, device;
+
+#ifdef CONFIG_ACPI
+ /*
+ * We should get host bridge information from ACPI unless the BIOS
+ * doesn't support it.
+ */
+ if (!acpi_disabled && acpi_os_get_root_pointer())
+ return 0;
+#endif
+
+ id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID);
+ vendor = id & 0xffff;
+ device = (id >> 16) & 0xffff;
+
+ if (vendor == PCI_VENDOR_ID_SERVERWORKS &&
+ device == PCI_DEVICE_ID_SERVERWORKS_LE) {
+ cnb20le_res(bus, slot, 0);
+ cnb20le_res(bus, slot, 1);
+ }
+ return 0;
+}
+
+postcore_initcall(broadcom_postcore_init);
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
new file mode 100644
index 000000000..2752c02e3
--- /dev/null
+++ b/arch/x86/pci/bus_numa.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/range.h>
+
+#include "bus_numa.h"
+
+LIST_HEAD(pci_root_infos);
+
+static struct pci_root_info *x86_find_pci_root_info(int bus)
+{
+ struct pci_root_info *info;
+
+ list_for_each_entry(info, &pci_root_infos, list)
+ if (info->busn.start == bus)
+ return info;
+
+ return NULL;
+}
+
+int x86_pci_root_bus_node(int bus)
+{
+ struct pci_root_info *info = x86_find_pci_root_info(bus);
+
+ if (!info)
+ return NUMA_NO_NODE;
+
+ return info->node;
+}
+
+void x86_pci_root_bus_resources(int bus, struct list_head *resources)
+{
+ struct pci_root_info *info = x86_find_pci_root_info(bus);
+ struct pci_root_res *root_res;
+ struct resource_entry *window;
+ bool found = false;
+
+ if (!info)
+ goto default_resources;
+
+ printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n",
+ bus);
+
+ /* already added by acpi ? */
+ resource_list_for_each_entry(window, resources)
+ if (window->res->flags & IORESOURCE_BUS) {
+ found = true;
+ break;
+ }
+
+ if (!found)
+ pci_add_resource(resources, &info->busn);
+
+ list_for_each_entry(root_res, &info->resources, list)
+ pci_add_resource(resources, &root_res->res);
+
+ return;
+
+default_resources:
+ /*
+ * We don't have any host bridge aperture information from the
+ * "native host bridge drivers," e.g., amd_bus or broadcom_bus,
+ * so fall back to the defaults historically used by pci_create_bus().
+ */
+ printk(KERN_DEBUG "PCI: root bus %02x: using default resources\n", bus);
+ pci_add_resource(resources, &ioport_resource);
+ pci_add_resource(resources, &iomem_resource);
+}
+
+struct pci_root_info __init *alloc_pci_root_info(int bus_min, int bus_max,
+ int node, int link)
+{
+ struct pci_root_info *info;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+ if (!info)
+ return info;
+
+ sprintf(info->name, "PCI Bus #%02x", bus_min);
+
+ INIT_LIST_HEAD(&info->resources);
+ info->busn.name = info->name;
+ info->busn.start = bus_min;
+ info->busn.end = bus_max;
+ info->busn.flags = IORESOURCE_BUS;
+ info->node = node;
+ info->link = link;
+
+ list_add_tail(&info->list, &pci_root_infos);
+
+ return info;
+}
+
+void update_res(struct pci_root_info *info, resource_size_t start,
+ resource_size_t end, unsigned long flags, int merge)
+{
+ struct resource *res;
+ struct pci_root_res *root_res;
+
+ if (start > end)
+ return;
+
+ if (start == MAX_RESOURCE)
+ return;
+
+ if (!merge)
+ goto addit;
+
+ /* try to merge it with old one */
+ list_for_each_entry(root_res, &info->resources, list) {
+ resource_size_t final_start, final_end;
+ resource_size_t common_start, common_end;
+
+ res = &root_res->res;
+ if (res->flags != flags)
+ continue;
+
+ common_start = max(res->start, start);
+ common_end = min(res->end, end);
+ if (common_start > common_end + 1)
+ continue;
+
+ final_start = min(res->start, start);
+ final_end = max(res->end, end);
+
+ res->start = final_start;
+ res->end = final_end;
+ return;
+ }
+
+addit:
+
+ /* need to add that */
+ root_res = kzalloc(sizeof(*root_res), GFP_KERNEL);
+ if (!root_res)
+ return;
+
+ res = &root_res->res;
+ res->name = info->name;
+ res->flags = flags;
+ res->start = start;
+ res->end = end;
+
+ list_add_tail(&root_res->list, &info->resources);
+}
diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h
new file mode 100644
index 000000000..697dd841b
--- /dev/null
+++ b/arch/x86/pci/bus_numa.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BUS_NUMA_H
+#define __BUS_NUMA_H
+/*
+ * sub bus (transparent) will use entres from 3 to store extra from
+ * root, so need to make sure we have enough slot there.
+ */
+struct pci_root_res {
+ struct list_head list;
+ struct resource res;
+};
+
+struct pci_root_info {
+ struct list_head list;
+ char name[12];
+ struct list_head resources;
+ struct resource busn;
+ int node;
+ int link;
+};
+
+extern struct list_head pci_root_infos;
+struct pci_root_info *alloc_pci_root_info(int bus_min, int bus_max,
+ int node, int link);
+extern void update_res(struct pci_root_info *info, resource_size_t start,
+ resource_size_t end, unsigned long flags, int merge);
+#endif
diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c
new file mode 100644
index 000000000..3353b76dc
--- /dev/null
+++ b/arch/x86/pci/ce4100.c
@@ -0,0 +1,340 @@
+/*
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2010 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ * The full GNU General Public License is included in this distribution
+ * in the file called LICENSE.GPL.
+ *
+ * Contact Information:
+ * Intel Corporation
+ * 2200 Mission College Blvd.
+ * Santa Clara, CA 97052
+ *
+ * This provides access methods for PCI registers that mis-behave on
+ * the CE4100. Each register can be assigned a private init, read and
+ * write routine. The exception to this is the bridge device. The
+ * bridge device is the only device on bus zero (0) that requires any
+ * fixup so it is a special case ATM
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+
+#include <asm/ce4100.h>
+#include <asm/pci_x86.h>
+
+struct sim_reg {
+ u32 value;
+ u32 mask;
+};
+
+struct sim_dev_reg {
+ int dev_func;
+ int reg;
+ void (*init)(struct sim_dev_reg *reg);
+ void (*read)(struct sim_dev_reg *reg, u32 *value);
+ void (*write)(struct sim_dev_reg *reg, u32 value);
+ struct sim_reg sim_reg;
+};
+
+struct sim_reg_op {
+ void (*init)(struct sim_dev_reg *reg);
+ void (*read)(struct sim_dev_reg *reg, u32 value);
+ void (*write)(struct sim_dev_reg *reg, u32 value);
+};
+
+#define MB (1024 * 1024)
+#define KB (1024)
+#define SIZE_TO_MASK(size) (~(size - 1))
+
+#define DEFINE_REG(device, func, offset, size, init_op, read_op, write_op)\
+{ PCI_DEVFN(device, func), offset, init_op, read_op, write_op,\
+ {0, SIZE_TO_MASK(size)} },
+
+/*
+ * All read/write functions are called with pci_config_lock held.
+ */
+static void reg_init(struct sim_dev_reg *reg)
+{
+ pci_direct_conf1.read(0, 1, reg->dev_func, reg->reg, 4,
+ &reg->sim_reg.value);
+}
+
+static void reg_read(struct sim_dev_reg *reg, u32 *value)
+{
+ *value = reg->sim_reg.value;
+}
+
+static void reg_write(struct sim_dev_reg *reg, u32 value)
+{
+ reg->sim_reg.value = (value & reg->sim_reg.mask) |
+ (reg->sim_reg.value & ~reg->sim_reg.mask);
+}
+
+static void sata_reg_init(struct sim_dev_reg *reg)
+{
+ pci_direct_conf1.read(0, 1, PCI_DEVFN(14, 0), 0x10, 4,
+ &reg->sim_reg.value);
+ reg->sim_reg.value += 0x400;
+}
+
+static void ehci_reg_read(struct sim_dev_reg *reg, u32 *value)
+{
+ reg_read(reg, value);
+ if (*value != reg->sim_reg.mask)
+ *value |= 0x100;
+}
+
+void sata_revid_init(struct sim_dev_reg *reg)
+{
+ reg->sim_reg.value = 0x01060100;
+ reg->sim_reg.mask = 0;
+}
+
+static void sata_revid_read(struct sim_dev_reg *reg, u32 *value)
+{
+ reg_read(reg, value);
+}
+
+static void reg_noirq_read(struct sim_dev_reg *reg, u32 *value)
+{
+ /* force interrupt pin value to 0 */
+ *value = reg->sim_reg.value & 0xfff00ff;
+}
+
+static struct sim_dev_reg bus1_fixups[] = {
+ DEFINE_REG(2, 0, 0x10, (16*MB), reg_init, reg_read, reg_write)
+ DEFINE_REG(2, 0, 0x14, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(2, 1, 0x10, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(3, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(4, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(4, 1, 0x10, (128*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(6, 0, 0x10, (512*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(6, 1, 0x10, (512*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(6, 2, 0x10, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(8, 0, 0x10, (1*MB), reg_init, reg_read, reg_write)
+ DEFINE_REG(8, 1, 0x10, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(8, 2, 0x10, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(9, 0, 0x10 , (1*MB), reg_init, reg_read, reg_write)
+ DEFINE_REG(9, 0, 0x14, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(10, 0, 0x10, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(10, 0, 0x14, (256*MB), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 0, 0x10, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 0, 0x14, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 1, 0x10, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 2, 0x10, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 2, 0x14, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 2, 0x18, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 3, 0x10, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 3, 0x14, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 4, 0x10, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 5, 0x10, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 6, 0x10, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 7, 0x10, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 7, 0x3c, 256, reg_init, reg_noirq_read, reg_write)
+ DEFINE_REG(12, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(12, 0, 0x14, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(12, 1, 0x10, (1024), reg_init, reg_read, reg_write)
+ DEFINE_REG(13, 0, 0x10, (32*KB), reg_init, ehci_reg_read, reg_write)
+ DEFINE_REG(13, 1, 0x10, (32*KB), reg_init, ehci_reg_read, reg_write)
+ DEFINE_REG(14, 0, 0x8, 0, sata_revid_init, sata_revid_read, 0)
+ DEFINE_REG(14, 0, 0x10, 0, reg_init, reg_read, reg_write)
+ DEFINE_REG(14, 0, 0x14, 0, reg_init, reg_read, reg_write)
+ DEFINE_REG(14, 0, 0x18, 0, reg_init, reg_read, reg_write)
+ DEFINE_REG(14, 0, 0x1C, 0, reg_init, reg_read, reg_write)
+ DEFINE_REG(14, 0, 0x20, 0, reg_init, reg_read, reg_write)
+ DEFINE_REG(14, 0, 0x24, (0x200), sata_reg_init, reg_read, reg_write)
+ DEFINE_REG(15, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(15, 0, 0x14, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(16, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(16, 0, 0x14, (64*MB), reg_init, reg_read, reg_write)
+ DEFINE_REG(16, 0, 0x18, (64*MB), reg_init, reg_read, reg_write)
+ DEFINE_REG(16, 0, 0x3c, 256, reg_init, reg_noirq_read, reg_write)
+ DEFINE_REG(17, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(18, 0, 0x10, (1*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(18, 0, 0x3c, 256, reg_init, reg_noirq_read, reg_write)
+};
+
+static void __init init_sim_regs(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
+ if (bus1_fixups[i].init)
+ bus1_fixups[i].init(&bus1_fixups[i]);
+ }
+}
+
+static inline void extract_bytes(u32 *value, int reg, int len)
+{
+ uint32_t mask;
+
+ *value >>= ((reg & 3) * 8);
+ mask = 0xFFFFFFFF >> ((4 - len) * 8);
+ *value &= mask;
+}
+
+int bridge_read(unsigned int devfn, int reg, int len, u32 *value)
+{
+ u32 av_bridge_base, av_bridge_limit;
+ int retval = 0;
+
+ switch (reg) {
+ /* Make BARs appear to not request any memory. */
+ case PCI_BASE_ADDRESS_0:
+ case PCI_BASE_ADDRESS_0 + 1:
+ case PCI_BASE_ADDRESS_0 + 2:
+ case PCI_BASE_ADDRESS_0 + 3:
+ *value = 0;
+ break;
+
+ /* Since subordinate bus number register is hardwired
+ * to zero and read only, so do the simulation.
+ */
+ case PCI_PRIMARY_BUS:
+ if (len == 4)
+ *value = 0x00010100;
+ break;
+
+ case PCI_SUBORDINATE_BUS:
+ *value = 1;
+ break;
+
+ case PCI_MEMORY_BASE:
+ case PCI_MEMORY_LIMIT:
+ /* Get the A/V bridge base address. */
+ pci_direct_conf1.read(0, 0, devfn,
+ PCI_BASE_ADDRESS_0, 4, &av_bridge_base);
+
+ av_bridge_limit = av_bridge_base + (512*MB - 1);
+ av_bridge_limit >>= 16;
+ av_bridge_limit &= 0xFFF0;
+
+ av_bridge_base >>= 16;
+ av_bridge_base &= 0xFFF0;
+
+ if (reg == PCI_MEMORY_LIMIT)
+ *value = av_bridge_limit;
+ else if (len == 2)
+ *value = av_bridge_base;
+ else
+ *value = (av_bridge_limit << 16) | av_bridge_base;
+ break;
+ /* Make prefetchable memory limit smaller than prefetchable
+ * memory base, so not claim prefetchable memory space.
+ */
+ case PCI_PREF_MEMORY_BASE:
+ *value = 0xFFF0;
+ break;
+ case PCI_PREF_MEMORY_LIMIT:
+ *value = 0x0;
+ break;
+ /* Make IO limit smaller than IO base, so not claim IO space. */
+ case PCI_IO_BASE:
+ *value = 0xF0;
+ break;
+ case PCI_IO_LIMIT:
+ *value = 0;
+ break;
+ default:
+ retval = 1;
+ }
+ return retval;
+}
+
+static int ce4100_bus1_read(unsigned int devfn, int reg, int len, u32 *value)
+{
+ unsigned long flags;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
+ if (bus1_fixups[i].dev_func == devfn &&
+ bus1_fixups[i].reg == (reg & ~3) &&
+ bus1_fixups[i].read) {
+
+ raw_spin_lock_irqsave(&pci_config_lock, flags);
+ bus1_fixups[i].read(&(bus1_fixups[i]), value);
+ raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+ extract_bytes(value, reg, len);
+ return 0;
+ }
+ }
+ return -1;
+}
+
+static int ce4100_conf_read(unsigned int seg, unsigned int bus,
+ unsigned int devfn, int reg, int len, u32 *value)
+{
+ WARN_ON(seg);
+
+ if (bus == 1 && !ce4100_bus1_read(devfn, reg, len, value))
+ return 0;
+
+ if (bus == 0 && (PCI_DEVFN(1, 0) == devfn) &&
+ !bridge_read(devfn, reg, len, value))
+ return 0;
+
+ return pci_direct_conf1.read(seg, bus, devfn, reg, len, value);
+}
+
+static int ce4100_bus1_write(unsigned int devfn, int reg, int len, u32 value)
+{
+ unsigned long flags;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
+ if (bus1_fixups[i].dev_func == devfn &&
+ bus1_fixups[i].reg == (reg & ~3) &&
+ bus1_fixups[i].write) {
+
+ raw_spin_lock_irqsave(&pci_config_lock, flags);
+ bus1_fixups[i].write(&(bus1_fixups[i]), value);
+ raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+ return 0;
+ }
+ }
+ return -1;
+}
+
+static int ce4100_conf_write(unsigned int seg, unsigned int bus,
+ unsigned int devfn, int reg, int len, u32 value)
+{
+ WARN_ON(seg);
+
+ if (bus == 1 && !ce4100_bus1_write(devfn, reg, len, value))
+ return 0;
+
+ /* Discard writes to A/V bridge BAR. */
+ if (bus == 0 && PCI_DEVFN(1, 0) == devfn &&
+ ((reg & ~3) == PCI_BASE_ADDRESS_0))
+ return 0;
+
+ return pci_direct_conf1.write(seg, bus, devfn, reg, len, value);
+}
+
+static const struct pci_raw_ops ce4100_pci_conf = {
+ .read = ce4100_conf_read,
+ .write = ce4100_conf_write,
+};
+
+int __init ce4100_pci_init(void)
+{
+ init_sim_regs();
+ raw_pci_ops = &ce4100_pci_conf;
+ /* Indicate caller that it should invoke pci_legacy_init() */
+ return 1;
+}
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
new file mode 100644
index 000000000..d4ec117c1
--- /dev/null
+++ b/arch/x86/pci/common.c
@@ -0,0 +1,737 @@
+/*
+ * Low-Level PCI Support for PC
+ *
+ * (c) 1999--2000 Martin Mares <mj@ucw.cz>
+ */
+
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/pci-acpi.h>
+#include <linux/ioport.h>
+#include <linux/init.h>
+#include <linux/dmi.h>
+#include <linux/slab.h>
+
+#include <asm/acpi.h>
+#include <asm/segment.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/pci_x86.h>
+#include <asm/setup.h>
+
+unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
+ PCI_PROBE_MMCONF;
+
+static int pci_bf_sort;
+int pci_routeirq;
+int noioapicquirk;
+#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS
+int noioapicreroute = 0;
+#else
+int noioapicreroute = 1;
+#endif
+int pcibios_last_bus = -1;
+unsigned long pirq_table_addr;
+const struct pci_raw_ops *__read_mostly raw_pci_ops;
+const struct pci_raw_ops *__read_mostly raw_pci_ext_ops;
+
+int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn,
+ int reg, int len, u32 *val)
+{
+ if (domain == 0 && reg < 256 && raw_pci_ops)
+ return raw_pci_ops->read(domain, bus, devfn, reg, len, val);
+ if (raw_pci_ext_ops)
+ return raw_pci_ext_ops->read(domain, bus, devfn, reg, len, val);
+ return -EINVAL;
+}
+
+int raw_pci_write(unsigned int domain, unsigned int bus, unsigned int devfn,
+ int reg, int len, u32 val)
+{
+ if (domain == 0 && reg < 256 && raw_pci_ops)
+ return raw_pci_ops->write(domain, bus, devfn, reg, len, val);
+ if (raw_pci_ext_ops)
+ return raw_pci_ext_ops->write(domain, bus, devfn, reg, len, val);
+ return -EINVAL;
+}
+
+static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *value)
+{
+ return raw_pci_read(pci_domain_nr(bus), bus->number,
+ devfn, where, size, value);
+}
+
+static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 value)
+{
+ return raw_pci_write(pci_domain_nr(bus), bus->number,
+ devfn, where, size, value);
+}
+
+struct pci_ops pci_root_ops = {
+ .read = pci_read,
+ .write = pci_write,
+};
+
+/*
+ * This interrupt-safe spinlock protects all accesses to PCI configuration
+ * space, except for the mmconfig (ECAM) based operations.
+ */
+DEFINE_RAW_SPINLOCK(pci_config_lock);
+
+static int __init can_skip_ioresource_align(const struct dmi_system_id *d)
+{
+ pci_probe |= PCI_CAN_SKIP_ISA_ALIGN;
+ printk(KERN_INFO "PCI: %s detected, can skip ISA alignment\n", d->ident);
+ return 0;
+}
+
+static const struct dmi_system_id can_skip_pciprobe_dmi_table[] __initconst = {
+/*
+ * Systems where PCI IO resource ISA alignment can be skipped
+ * when the ISA enable bit in the bridge control is not set
+ */
+ {
+ .callback = can_skip_ioresource_align,
+ .ident = "IBM System x3800",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "x3800"),
+ },
+ },
+ {
+ .callback = can_skip_ioresource_align,
+ .ident = "IBM System x3850",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "x3850"),
+ },
+ },
+ {
+ .callback = can_skip_ioresource_align,
+ .ident = "IBM System x3950",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "x3950"),
+ },
+ },
+ {}
+};
+
+void __init dmi_check_skip_isa_align(void)
+{
+ dmi_check_system(can_skip_pciprobe_dmi_table);
+}
+
+static void pcibios_fixup_device_resources(struct pci_dev *dev)
+{
+ struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE];
+ struct resource *bar_r;
+ int bar;
+
+ if (pci_probe & PCI_NOASSIGN_BARS) {
+ /*
+ * If the BIOS did not assign the BAR, zero out the
+ * resource so the kernel doesn't attempt to assign
+ * it later on in pci_assign_unassigned_resources
+ */
+ for (bar = 0; bar <= PCI_STD_RESOURCE_END; bar++) {
+ bar_r = &dev->resource[bar];
+ if (bar_r->start == 0 && bar_r->end != 0) {
+ bar_r->flags = 0;
+ bar_r->end = 0;
+ }
+ }
+ }
+
+ if (pci_probe & PCI_NOASSIGN_ROMS) {
+ if (rom_r->parent)
+ return;
+ if (rom_r->start) {
+ /* we deal with BIOS assigned ROM later */
+ return;
+ }
+ rom_r->start = rom_r->end = rom_r->flags = 0;
+ }
+}
+
+/*
+ * Called after each bus is probed, but before its children
+ * are examined.
+ */
+
+void pcibios_fixup_bus(struct pci_bus *b)
+{
+ struct pci_dev *dev;
+
+ pci_read_bridge_bases(b);
+ list_for_each_entry(dev, &b->devices, bus_list)
+ pcibios_fixup_device_resources(dev);
+}
+
+void pcibios_add_bus(struct pci_bus *bus)
+{
+ acpi_pci_add_bus(bus);
+}
+
+void pcibios_remove_bus(struct pci_bus *bus)
+{
+ acpi_pci_remove_bus(bus);
+}
+
+/*
+ * Only use DMI information to set this if nothing was passed
+ * on the kernel command line (which was parsed earlier).
+ */
+
+static int __init set_bf_sort(const struct dmi_system_id *d)
+{
+ if (pci_bf_sort == pci_bf_sort_default) {
+ pci_bf_sort = pci_dmi_bf;
+ printk(KERN_INFO "PCI: %s detected, enabling pci=bfsort.\n", d->ident);
+ }
+ return 0;
+}
+
+static void __init read_dmi_type_b1(const struct dmi_header *dm,
+ void *private_data)
+{
+ u8 *data = (u8 *)dm + 4;
+
+ if (dm->type != 0xB1)
+ return;
+ if ((((*(u32 *)data) >> 9) & 0x03) == 0x01)
+ set_bf_sort((const struct dmi_system_id *)private_data);
+}
+
+static int __init find_sort_method(const struct dmi_system_id *d)
+{
+ dmi_walk(read_dmi_type_b1, (void *)d);
+ return 0;
+}
+
+/*
+ * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus)
+ */
+#ifdef __i386__
+static int __init assign_all_busses(const struct dmi_system_id *d)
+{
+ pci_probe |= PCI_ASSIGN_ALL_BUSSES;
+ printk(KERN_INFO "%s detected: enabling PCI bus# renumbering"
+ " (pci=assign-busses)\n", d->ident);
+ return 0;
+}
+#endif
+
+static int __init set_scan_all(const struct dmi_system_id *d)
+{
+ printk(KERN_INFO "PCI: %s detected, enabling pci=pcie_scan_all\n",
+ d->ident);
+ pci_add_flags(PCI_SCAN_ALL_PCIE_DEVS);
+ return 0;
+}
+
+static const struct dmi_system_id pciprobe_dmi_table[] __initconst = {
+#ifdef __i386__
+/*
+ * Laptops which need pci=assign-busses to see Cardbus cards
+ */
+ {
+ .callback = assign_all_busses,
+ .ident = "Samsung X20 Laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Samsung Electronics"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "SX20S"),
+ },
+ },
+#endif /* __i386__ */
+ {
+ .callback = set_bf_sort,
+ .ident = "Dell PowerEdge 1950",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1950"),
+ },
+ },
+ {
+ .callback = set_bf_sort,
+ .ident = "Dell PowerEdge 1955",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1955"),
+ },
+ },
+ {
+ .callback = set_bf_sort,
+ .ident = "Dell PowerEdge 2900",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2900"),
+ },
+ },
+ {
+ .callback = set_bf_sort,
+ .ident = "Dell PowerEdge 2950",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2950"),
+ },
+ },
+ {
+ .callback = set_bf_sort,
+ .ident = "Dell PowerEdge R900",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R900"),
+ },
+ },
+ {
+ .callback = find_sort_method,
+ .ident = "Dell System",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"),
+ },
+ },
+ {
+ .callback = set_bf_sort,
+ .ident = "HP ProLiant BL20p G3",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL20p G3"),
+ },
+ },
+ {
+ .callback = set_bf_sort,
+ .ident = "HP ProLiant BL20p G4",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL20p G4"),
+ },
+ },
+ {
+ .callback = set_bf_sort,
+ .ident = "HP ProLiant BL30p G1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL30p G1"),
+ },
+ },
+ {
+ .callback = set_bf_sort,
+ .ident = "HP ProLiant BL25p G1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL25p G1"),
+ },
+ },
+ {
+ .callback = set_bf_sort,
+ .ident = "HP ProLiant BL35p G1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL35p G1"),
+ },
+ },
+ {
+ .callback = set_bf_sort,
+ .ident = "HP ProLiant BL45p G1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL45p G1"),
+ },
+ },
+ {
+ .callback = set_bf_sort,
+ .ident = "HP ProLiant BL45p G2",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL45p G2"),
+ },
+ },
+ {
+ .callback = set_bf_sort,
+ .ident = "HP ProLiant BL460c G1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL460c G1"),
+ },
+ },
+ {
+ .callback = set_bf_sort,
+ .ident = "HP ProLiant BL465c G1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL465c G1"),
+ },
+ },
+ {
+ .callback = set_bf_sort,
+ .ident = "HP ProLiant BL480c G1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL480c G1"),
+ },
+ },
+ {
+ .callback = set_bf_sort,
+ .ident = "HP ProLiant BL685c G1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL685c G1"),
+ },
+ },
+ {
+ .callback = set_bf_sort,
+ .ident = "HP ProLiant DL360",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL360"),
+ },
+ },
+ {
+ .callback = set_bf_sort,
+ .ident = "HP ProLiant DL380",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL380"),
+ },
+ },
+#ifdef __i386__
+ {
+ .callback = assign_all_busses,
+ .ident = "Compaq EVO N800c",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "EVO N800c"),
+ },
+ },
+#endif
+ {
+ .callback = set_bf_sort,
+ .ident = "HP ProLiant DL385 G2",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL385 G2"),
+ },
+ },
+ {
+ .callback = set_bf_sort,
+ .ident = "HP ProLiant DL585 G2",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "HP"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL585 G2"),
+ },
+ },
+ {
+ .callback = set_scan_all,
+ .ident = "Stratus/NEC ftServer",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Stratus"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ftServer"),
+ },
+ },
+ {
+ .callback = set_scan_all,
+ .ident = "Stratus/NEC ftServer",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "NEC"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Express5800/R32"),
+ },
+ },
+ {
+ .callback = set_scan_all,
+ .ident = "Stratus/NEC ftServer",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "NEC"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Express5800/R31"),
+ },
+ },
+ {}
+};
+
+void __init dmi_check_pciprobe(void)
+{
+ dmi_check_system(pciprobe_dmi_table);
+}
+
+void pcibios_scan_root(int busnum)
+{
+ struct pci_bus *bus;
+ struct pci_sysdata *sd;
+ LIST_HEAD(resources);
+
+ sd = kzalloc(sizeof(*sd), GFP_KERNEL);
+ if (!sd) {
+ printk(KERN_ERR "PCI: OOM, skipping PCI bus %02x\n", busnum);
+ return;
+ }
+ sd->node = x86_pci_root_bus_node(busnum);
+ x86_pci_root_bus_resources(busnum, &resources);
+ printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum);
+ bus = pci_scan_root_bus(NULL, busnum, &pci_root_ops, sd, &resources);
+ if (!bus) {
+ pci_free_resource_list(&resources);
+ kfree(sd);
+ return;
+ }
+ pci_bus_add_devices(bus);
+}
+
+void __init pcibios_set_cache_line_size(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ /*
+ * Set PCI cacheline size to that of the CPU if the CPU has reported it.
+ * (For older CPUs that don't support cpuid, we se it to 32 bytes
+ * It's also good for 386/486s (which actually have 16)
+ * as quite a few PCI devices do not support smaller values.
+ */
+ if (c->x86_clflush_size > 0) {
+ pci_dfl_cache_line_size = c->x86_clflush_size >> 2;
+ printk(KERN_DEBUG "PCI: pci_cache_line_size set to %d bytes\n",
+ pci_dfl_cache_line_size << 2);
+ } else {
+ pci_dfl_cache_line_size = 32 >> 2;
+ printk(KERN_DEBUG "PCI: Unknown cacheline size. Setting to 32 bytes\n");
+ }
+}
+
+int __init pcibios_init(void)
+{
+ if (!raw_pci_ops && !raw_pci_ext_ops) {
+ printk(KERN_WARNING "PCI: System does not support PCI\n");
+ return 0;
+ }
+
+ pcibios_set_cache_line_size();
+ pcibios_resource_survey();
+
+ if (pci_bf_sort >= pci_force_bf)
+ pci_sort_breadthfirst();
+ return 0;
+}
+
+char *__init pcibios_setup(char *str)
+{
+ if (!strcmp(str, "off")) {
+ pci_probe = 0;
+ return NULL;
+ } else if (!strcmp(str, "bfsort")) {
+ pci_bf_sort = pci_force_bf;
+ return NULL;
+ } else if (!strcmp(str, "nobfsort")) {
+ pci_bf_sort = pci_force_nobf;
+ return NULL;
+ }
+#ifdef CONFIG_PCI_BIOS
+ else if (!strcmp(str, "bios")) {
+ pci_probe = PCI_PROBE_BIOS;
+ return NULL;
+ } else if (!strcmp(str, "nobios")) {
+ pci_probe &= ~PCI_PROBE_BIOS;
+ return NULL;
+ } else if (!strcmp(str, "biosirq")) {
+ pci_probe |= PCI_BIOS_IRQ_SCAN;
+ return NULL;
+ } else if (!strncmp(str, "pirqaddr=", 9)) {
+ pirq_table_addr = simple_strtoul(str+9, NULL, 0);
+ return NULL;
+ }
+#endif
+#ifdef CONFIG_PCI_DIRECT
+ else if (!strcmp(str, "conf1")) {
+ pci_probe = PCI_PROBE_CONF1 | PCI_NO_CHECKS;
+ return NULL;
+ }
+ else if (!strcmp(str, "conf2")) {
+ pci_probe = PCI_PROBE_CONF2 | PCI_NO_CHECKS;
+ return NULL;
+ }
+#endif
+#ifdef CONFIG_PCI_MMCONFIG
+ else if (!strcmp(str, "nommconf")) {
+ pci_probe &= ~PCI_PROBE_MMCONF;
+ return NULL;
+ }
+ else if (!strcmp(str, "check_enable_amd_mmconf")) {
+ pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF;
+ return NULL;
+ }
+#endif
+ else if (!strcmp(str, "noacpi")) {
+ acpi_noirq_set();
+ return NULL;
+ }
+ else if (!strcmp(str, "noearly")) {
+ pci_probe |= PCI_PROBE_NOEARLY;
+ return NULL;
+ }
+ else if (!strcmp(str, "usepirqmask")) {
+ pci_probe |= PCI_USE_PIRQ_MASK;
+ return NULL;
+ } else if (!strncmp(str, "irqmask=", 8)) {
+ pcibios_irq_mask = simple_strtol(str+8, NULL, 0);
+ return NULL;
+ } else if (!strncmp(str, "lastbus=", 8)) {
+ pcibios_last_bus = simple_strtol(str+8, NULL, 0);
+ return NULL;
+ } else if (!strcmp(str, "rom")) {
+ pci_probe |= PCI_ASSIGN_ROMS;
+ return NULL;
+ } else if (!strcmp(str, "norom")) {
+ pci_probe |= PCI_NOASSIGN_ROMS;
+ return NULL;
+ } else if (!strcmp(str, "nobar")) {
+ pci_probe |= PCI_NOASSIGN_BARS;
+ return NULL;
+ } else if (!strcmp(str, "assign-busses")) {
+ pci_probe |= PCI_ASSIGN_ALL_BUSSES;
+ return NULL;
+ } else if (!strcmp(str, "use_crs")) {
+ pci_probe |= PCI_USE__CRS;
+ return NULL;
+ } else if (!strcmp(str, "nocrs")) {
+ pci_probe |= PCI_ROOT_NO_CRS;
+ return NULL;
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ } else if (!strcmp(str, "big_root_window")) {
+ pci_probe |= PCI_BIG_ROOT_WINDOW;
+ return NULL;
+#endif
+ } else if (!strcmp(str, "routeirq")) {
+ pci_routeirq = 1;
+ return NULL;
+ } else if (!strcmp(str, "skip_isa_align")) {
+ pci_probe |= PCI_CAN_SKIP_ISA_ALIGN;
+ return NULL;
+ } else if (!strcmp(str, "noioapicquirk")) {
+ noioapicquirk = 1;
+ return NULL;
+ } else if (!strcmp(str, "ioapicreroute")) {
+ if (noioapicreroute != -1)
+ noioapicreroute = 0;
+ return NULL;
+ } else if (!strcmp(str, "noioapicreroute")) {
+ if (noioapicreroute != -1)
+ noioapicreroute = 1;
+ return NULL;
+ }
+ return str;
+}
+
+unsigned int pcibios_assign_all_busses(void)
+{
+ return (pci_probe & PCI_ASSIGN_ALL_BUSSES) ? 1 : 0;
+}
+
+#if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS)
+static LIST_HEAD(dma_domain_list);
+static DEFINE_SPINLOCK(dma_domain_list_lock);
+
+void add_dma_domain(struct dma_domain *domain)
+{
+ spin_lock(&dma_domain_list_lock);
+ list_add(&domain->node, &dma_domain_list);
+ spin_unlock(&dma_domain_list_lock);
+}
+EXPORT_SYMBOL_GPL(add_dma_domain);
+
+void del_dma_domain(struct dma_domain *domain)
+{
+ spin_lock(&dma_domain_list_lock);
+ list_del(&domain->node);
+ spin_unlock(&dma_domain_list_lock);
+}
+EXPORT_SYMBOL_GPL(del_dma_domain);
+
+static void set_dma_domain_ops(struct pci_dev *pdev)
+{
+ struct dma_domain *domain;
+
+ spin_lock(&dma_domain_list_lock);
+ list_for_each_entry(domain, &dma_domain_list, node) {
+ if (pci_domain_nr(pdev->bus) == domain->domain_nr) {
+ pdev->dev.dma_ops = domain->dma_ops;
+ break;
+ }
+ }
+ spin_unlock(&dma_domain_list_lock);
+}
+#else
+static void set_dma_domain_ops(struct pci_dev *pdev) {}
+#endif
+
+static void set_dev_domain_options(struct pci_dev *pdev)
+{
+ if (is_vmd(pdev->bus))
+ pdev->hotplug_user_indicators = 1;
+}
+
+int pcibios_add_device(struct pci_dev *dev)
+{
+ struct setup_data *data;
+ struct pci_setup_rom *rom;
+ u64 pa_data;
+
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = memremap(pa_data, sizeof(*rom), MEMREMAP_WB);
+ if (!data)
+ return -ENOMEM;
+
+ if (data->type == SETUP_PCI) {
+ rom = (struct pci_setup_rom *)data;
+
+ if ((pci_domain_nr(dev->bus) == rom->segment) &&
+ (dev->bus->number == rom->bus) &&
+ (PCI_SLOT(dev->devfn) == rom->device) &&
+ (PCI_FUNC(dev->devfn) == rom->function) &&
+ (dev->vendor == rom->vendor) &&
+ (dev->device == rom->devid)) {
+ dev->rom = pa_data +
+ offsetof(struct pci_setup_rom, romdata);
+ dev->romlen = rom->pcilen;
+ }
+ }
+ pa_data = data->next;
+ memunmap(data);
+ }
+ set_dma_domain_ops(dev);
+ set_dev_domain_options(dev);
+ return 0;
+}
+
+int pcibios_enable_device(struct pci_dev *dev, int mask)
+{
+ int err;
+
+ if ((err = pci_enable_resources(dev, mask)) < 0)
+ return err;
+
+ if (!pci_dev_msi_enabled(dev))
+ return pcibios_enable_irq(dev);
+ return 0;
+}
+
+void pcibios_disable_device (struct pci_dev *dev)
+{
+ if (!pci_dev_msi_enabled(dev) && pcibios_disable_irq)
+ pcibios_disable_irq(dev);
+}
+
+#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
+void pcibios_release_device(struct pci_dev *dev)
+{
+ if (atomic_dec_return(&dev->enable_cnt) >= 0)
+ pcibios_disable_device(dev);
+
+}
+#endif
+
+int pci_ext_cfg_avail(void)
+{
+ if (raw_pci_ext_ops)
+ return 1;
+ else
+ return 0;
+}
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
new file mode 100644
index 000000000..a51074c55
--- /dev/null
+++ b/arch/x86/pci/direct.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * direct.c - Low-level direct PCI config space access
+ */
+
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/dmi.h>
+#include <asm/pci_x86.h>
+
+/*
+ * Functions for accessing PCI base (first 256 bytes) and extended
+ * (4096 bytes per PCI function) configuration space with type 1
+ * accesses.
+ */
+
+#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
+ (0x80000000 | ((reg & 0xF00) << 16) | (bus << 16) \
+ | (devfn << 8) | (reg & 0xFC))
+
+static int pci_conf1_read(unsigned int seg, unsigned int bus,
+ unsigned int devfn, int reg, int len, u32 *value)
+{
+ unsigned long flags;
+
+ if (seg || (bus > 255) || (devfn > 255) || (reg > 4095)) {
+ *value = -1;
+ return -EINVAL;
+ }
+
+ raw_spin_lock_irqsave(&pci_config_lock, flags);
+
+ outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8);
+
+ switch (len) {
+ case 1:
+ *value = inb(0xCFC + (reg & 3));
+ break;
+ case 2:
+ *value = inw(0xCFC + (reg & 2));
+ break;
+ case 4:
+ *value = inl(0xCFC);
+ break;
+ }
+
+ raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+
+ return 0;
+}
+
+static int pci_conf1_write(unsigned int seg, unsigned int bus,
+ unsigned int devfn, int reg, int len, u32 value)
+{
+ unsigned long flags;
+
+ if (seg || (bus > 255) || (devfn > 255) || (reg > 4095))
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&pci_config_lock, flags);
+
+ outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8);
+
+ switch (len) {
+ case 1:
+ outb((u8)value, 0xCFC + (reg & 3));
+ break;
+ case 2:
+ outw((u16)value, 0xCFC + (reg & 2));
+ break;
+ case 4:
+ outl((u32)value, 0xCFC);
+ break;
+ }
+
+ raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+
+ return 0;
+}
+
+#undef PCI_CONF1_ADDRESS
+
+const struct pci_raw_ops pci_direct_conf1 = {
+ .read = pci_conf1_read,
+ .write = pci_conf1_write,
+};
+
+
+/*
+ * Functions for accessing PCI configuration space with type 2 accesses
+ */
+
+#define PCI_CONF2_ADDRESS(dev, reg) (u16)(0xC000 | (dev << 8) | reg)
+
+static int pci_conf2_read(unsigned int seg, unsigned int bus,
+ unsigned int devfn, int reg, int len, u32 *value)
+{
+ unsigned long flags;
+ int dev, fn;
+
+ WARN_ON(seg);
+ if ((bus > 255) || (devfn > 255) || (reg > 255)) {
+ *value = -1;
+ return -EINVAL;
+ }
+
+ dev = PCI_SLOT(devfn);
+ fn = PCI_FUNC(devfn);
+
+ if (dev & 0x10)
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ raw_spin_lock_irqsave(&pci_config_lock, flags);
+
+ outb((u8)(0xF0 | (fn << 1)), 0xCF8);
+ outb((u8)bus, 0xCFA);
+
+ switch (len) {
+ case 1:
+ *value = inb(PCI_CONF2_ADDRESS(dev, reg));
+ break;
+ case 2:
+ *value = inw(PCI_CONF2_ADDRESS(dev, reg));
+ break;
+ case 4:
+ *value = inl(PCI_CONF2_ADDRESS(dev, reg));
+ break;
+ }
+
+ outb(0, 0xCF8);
+
+ raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+
+ return 0;
+}
+
+static int pci_conf2_write(unsigned int seg, unsigned int bus,
+ unsigned int devfn, int reg, int len, u32 value)
+{
+ unsigned long flags;
+ int dev, fn;
+
+ WARN_ON(seg);
+ if ((bus > 255) || (devfn > 255) || (reg > 255))
+ return -EINVAL;
+
+ dev = PCI_SLOT(devfn);
+ fn = PCI_FUNC(devfn);
+
+ if (dev & 0x10)
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ raw_spin_lock_irqsave(&pci_config_lock, flags);
+
+ outb((u8)(0xF0 | (fn << 1)), 0xCF8);
+ outb((u8)bus, 0xCFA);
+
+ switch (len) {
+ case 1:
+ outb((u8)value, PCI_CONF2_ADDRESS(dev, reg));
+ break;
+ case 2:
+ outw((u16)value, PCI_CONF2_ADDRESS(dev, reg));
+ break;
+ case 4:
+ outl((u32)value, PCI_CONF2_ADDRESS(dev, reg));
+ break;
+ }
+
+ outb(0, 0xCF8);
+
+ raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+
+ return 0;
+}
+
+#undef PCI_CONF2_ADDRESS
+
+static const struct pci_raw_ops pci_direct_conf2 = {
+ .read = pci_conf2_read,
+ .write = pci_conf2_write,
+};
+
+
+/*
+ * Before we decide to use direct hardware access mechanisms, we try to do some
+ * trivial checks to ensure it at least _seems_ to be working -- we just test
+ * whether bus 00 contains a host bridge (this is similar to checking
+ * techniques used in XFree86, but ours should be more reliable since we
+ * attempt to make use of direct access hints provided by the PCI BIOS).
+ *
+ * This should be close to trivial, but it isn't, because there are buggy
+ * chipsets (yes, you guessed it, by Intel and Compaq) that have no class ID.
+ */
+static int __init pci_sanity_check(const struct pci_raw_ops *o)
+{
+ u32 x = 0;
+ int devfn;
+
+ if (pci_probe & PCI_NO_CHECKS)
+ return 1;
+ /* Assume Type 1 works for newer systems.
+ This handles machines that don't have anything on PCI Bus 0. */
+ if (dmi_get_bios_year() >= 2001)
+ return 1;
+
+ for (devfn = 0; devfn < 0x100; devfn++) {
+ if (o->read(0, 0, devfn, PCI_CLASS_DEVICE, 2, &x))
+ continue;
+ if (x == PCI_CLASS_BRIDGE_HOST || x == PCI_CLASS_DISPLAY_VGA)
+ return 1;
+
+ if (o->read(0, 0, devfn, PCI_VENDOR_ID, 2, &x))
+ continue;
+ if (x == PCI_VENDOR_ID_INTEL || x == PCI_VENDOR_ID_COMPAQ)
+ return 1;
+ }
+
+ DBG(KERN_WARNING "PCI: Sanity check failed\n");
+ return 0;
+}
+
+static int __init pci_check_type1(void)
+{
+ unsigned long flags;
+ unsigned int tmp;
+ int works = 0;
+
+ local_irq_save(flags);
+
+ outb(0x01, 0xCFB);
+ tmp = inl(0xCF8);
+ outl(0x80000000, 0xCF8);
+ if (inl(0xCF8) == 0x80000000 && pci_sanity_check(&pci_direct_conf1)) {
+ works = 1;
+ }
+ outl(tmp, 0xCF8);
+ local_irq_restore(flags);
+
+ return works;
+}
+
+static int __init pci_check_type2(void)
+{
+ unsigned long flags;
+ int works = 0;
+
+ local_irq_save(flags);
+
+ outb(0x00, 0xCFB);
+ outb(0x00, 0xCF8);
+ outb(0x00, 0xCFA);
+ if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00 &&
+ pci_sanity_check(&pci_direct_conf2)) {
+ works = 1;
+ }
+
+ local_irq_restore(flags);
+
+ return works;
+}
+
+void __init pci_direct_init(int type)
+{
+ if (type == 0)
+ return;
+ printk(KERN_INFO "PCI: Using configuration type %d for base access\n",
+ type);
+ if (type == 1) {
+ raw_pci_ops = &pci_direct_conf1;
+ if (raw_pci_ext_ops)
+ return;
+ if (!(pci_probe & PCI_HAS_IO_ECS))
+ return;
+ printk(KERN_INFO "PCI: Using configuration type 1 "
+ "for extended access\n");
+ raw_pci_ext_ops = &pci_direct_conf1;
+ return;
+ }
+ raw_pci_ops = &pci_direct_conf2;
+}
+
+int __init pci_direct_probe(void)
+{
+ if ((pci_probe & PCI_PROBE_CONF1) == 0)
+ goto type2;
+ if (!request_region(0xCF8, 8, "PCI conf1"))
+ goto type2;
+
+ if (pci_check_type1()) {
+ raw_pci_ops = &pci_direct_conf1;
+ port_cf9_safe = true;
+ return 1;
+ }
+ release_region(0xCF8, 8);
+
+ type2:
+ if ((pci_probe & PCI_PROBE_CONF2) == 0)
+ return 0;
+ if (!request_region(0xCF8, 4, "PCI conf2"))
+ return 0;
+ if (!request_region(0xC000, 0x1000, "PCI conf2"))
+ goto fail2;
+
+ if (pci_check_type2()) {
+ raw_pci_ops = &pci_direct_conf2;
+ port_cf9_safe = true;
+ return 2;
+ }
+
+ release_region(0xC000, 0x1000);
+ fail2:
+ release_region(0xCF8, 4);
+ return 0;
+}
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
new file mode 100644
index 000000000..f5fc953e5
--- /dev/null
+++ b/arch/x86/pci/early.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <asm/pci-direct.h>
+#include <asm/io.h>
+#include <asm/pci_x86.h>
+
+/* Direct PCI access. This is used for PCI accesses in early boot before
+ the PCI subsystem works. */
+
+u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset)
+{
+ u32 v;
+ outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
+ v = inl(0xcfc);
+ return v;
+}
+
+u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset)
+{
+ u8 v;
+ outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
+ v = inb(0xcfc + (offset&3));
+ return v;
+}
+
+u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset)
+{
+ u16 v;
+ outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
+ v = inw(0xcfc + (offset&2));
+ return v;
+}
+
+void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
+ u32 val)
+{
+ outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
+ outl(val, 0xcfc);
+}
+
+void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val)
+{
+ outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
+ outb(val, 0xcfc + (offset&3));
+}
+
+void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val)
+{
+ outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
+ outw(val, 0xcfc + (offset&2));
+}
+
+int early_pci_allowed(void)
+{
+ return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) ==
+ PCI_PROBE_CONF1;
+}
+
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
new file mode 100644
index 000000000..76959a7d8
--- /dev/null
+++ b/arch/x86/pci/fixup.c
@@ -0,0 +1,826 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Exceptions for specific devices. Usually work-arounds for fatal design flaws.
+ */
+
+#include <linux/delay.h>
+#include <linux/dmi.h>
+#include <linux/pci.h>
+#include <linux/vgaarb.h>
+#include <asm/hpet.h>
+#include <asm/pci_x86.h>
+
+static void pci_fixup_i450nx(struct pci_dev *d)
+{
+ /*
+ * i450NX -- Find and scan all secondary buses on all PXB's.
+ */
+ int pxb, reg;
+ u8 busno, suba, subb;
+
+ dev_warn(&d->dev, "Searching for i450NX host bridges\n");
+ reg = 0xd0;
+ for(pxb = 0; pxb < 2; pxb++) {
+ pci_read_config_byte(d, reg++, &busno);
+ pci_read_config_byte(d, reg++, &suba);
+ pci_read_config_byte(d, reg++, &subb);
+ dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno,
+ suba, subb);
+ if (busno)
+ pcibios_scan_root(busno); /* Bus A */
+ if (suba < subb)
+ pcibios_scan_root(suba+1); /* Bus B */
+ }
+ pcibios_last_bus = -1;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx);
+
+static void pci_fixup_i450gx(struct pci_dev *d)
+{
+ /*
+ * i450GX and i450KX -- Find and scan all secondary buses.
+ * (called separately for each PCI bridge found)
+ */
+ u8 busno;
+ pci_read_config_byte(d, 0x4a, &busno);
+ dev_info(&d->dev, "i440KX/GX host bridge; secondary bus %02x\n", busno);
+ pcibios_scan_root(busno);
+ pcibios_last_bus = -1;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82454GX, pci_fixup_i450gx);
+
+static void pci_fixup_umc_ide(struct pci_dev *d)
+{
+ /*
+ * UM8886BF IDE controller sets region type bits incorrectly,
+ * therefore they look like memory despite of them being I/O.
+ */
+ int i;
+
+ dev_warn(&d->dev, "Fixing base address flags\n");
+ for(i = 0; i < 4; i++)
+ d->resource[i].flags |= PCI_BASE_ADDRESS_SPACE_IO;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_UMC, PCI_DEVICE_ID_UMC_UM8886BF, pci_fixup_umc_ide);
+
+static void pci_fixup_latency(struct pci_dev *d)
+{
+ /*
+ * SiS 5597 and 5598 chipsets require latency timer set to
+ * at most 32 to avoid lockups.
+ */
+ dev_dbg(&d->dev, "Setting max latency to 32\n");
+ pcibios_max_latency = 32;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_5597, pci_fixup_latency);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_5598, pci_fixup_latency);
+
+static void pci_fixup_piix4_acpi(struct pci_dev *d)
+{
+ /*
+ * PIIX4 ACPI device: hardwired IRQ9
+ */
+ d->irq = 9;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB_3, pci_fixup_piix4_acpi);
+
+/*
+ * Addresses issues with problems in the memory write queue timer in
+ * certain VIA Northbridges. This bugfix is per VIA's specifications,
+ * except for the KL133/KM133: clearing bit 5 on those Northbridges seems
+ * to trigger a bug in its integrated ProSavage video card, which
+ * causes screen corruption. We only clear bits 6 and 7 for that chipset,
+ * until VIA can provide us with definitive information on why screen
+ * corruption occurs, and what exactly those bits do.
+ *
+ * VIA 8363,8622,8361 Northbridges:
+ * - bits 5, 6, 7 at offset 0x55 need to be turned off
+ * VIA 8367 (KT266x) Northbridges:
+ * - bits 5, 6, 7 at offset 0x95 need to be turned off
+ * VIA 8363 rev 0x81/0x84 (KL133/KM133) Northbridges:
+ * - bits 6, 7 at offset 0x55 need to be turned off
+ */
+
+#define VIA_8363_KL133_REVISION_ID 0x81
+#define VIA_8363_KM133_REVISION_ID 0x84
+
+static void pci_fixup_via_northbridge_bug(struct pci_dev *d)
+{
+ u8 v;
+ int where = 0x55;
+ int mask = 0x1f; /* clear bits 5, 6, 7 by default */
+
+ if (d->device == PCI_DEVICE_ID_VIA_8367_0) {
+ /* fix pci bus latency issues resulted by NB bios error
+ it appears on bug free^Wreduced kt266x's bios forces
+ NB latency to zero */
+ pci_write_config_byte(d, PCI_LATENCY_TIMER, 0);
+
+ where = 0x95; /* the memory write queue timer register is
+ different for the KT266x's: 0x95 not 0x55 */
+ } else if (d->device == PCI_DEVICE_ID_VIA_8363_0 &&
+ (d->revision == VIA_8363_KL133_REVISION_ID ||
+ d->revision == VIA_8363_KM133_REVISION_ID)) {
+ mask = 0x3f; /* clear only bits 6 and 7; clearing bit 5
+ causes screen corruption on the KL133/KM133 */
+ }
+
+ pci_read_config_byte(d, where, &v);
+ if (v & ~mask) {
+ dev_warn(&d->dev, "Disabling VIA memory write queue (PCI ID %04x, rev %02x): [%02x] %02x & %02x -> %02x\n", \
+ d->device, d->revision, where, v, mask, v & mask);
+ v &= mask;
+ pci_write_config_byte(d, where, v);
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8363_0, pci_fixup_via_northbridge_bug);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8622, pci_fixup_via_northbridge_bug);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8361, pci_fixup_via_northbridge_bug);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8367_0, pci_fixup_via_northbridge_bug);
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8363_0, pci_fixup_via_northbridge_bug);
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8622, pci_fixup_via_northbridge_bug);
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8361, pci_fixup_via_northbridge_bug);
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8367_0, pci_fixup_via_northbridge_bug);
+
+/*
+ * For some reasons Intel decided that certain parts of their
+ * 815, 845 and some other chipsets must look like PCI-to-PCI bridges
+ * while they are obviously not. The 82801 family (AA, AB, BAM/CAM,
+ * BA/CA/DB and E) PCI bridges are actually HUB-to-PCI ones, according
+ * to Intel terminology. These devices do forward all addresses from
+ * system to PCI bus no matter what are their window settings, so they are
+ * "transparent" (or subtractive decoding) from programmers point of view.
+ */
+static void pci_fixup_transparent_bridge(struct pci_dev *dev)
+{
+ if ((dev->device & 0xff00) == 0x2400)
+ dev->transparent = 1;
+}
+DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID,
+ PCI_CLASS_BRIDGE_PCI, 8, pci_fixup_transparent_bridge);
+
+/*
+ * Fixup for C1 Halt Disconnect problem on nForce2 systems.
+ *
+ * From information provided by "Allen Martin" <AMartin@nvidia.com>:
+ *
+ * A hang is caused when the CPU generates a very fast CONNECT/HALT cycle
+ * sequence. Workaround is to set the SYSTEM_IDLE_TIMEOUT to 80 ns.
+ * This allows the state-machine and timer to return to a proper state within
+ * 80 ns of the CONNECT and probe appearing together. Since the CPU will not
+ * issue another HALT within 80 ns of the initial HALT, the failure condition
+ * is avoided.
+ */
+static void pci_fixup_nforce2(struct pci_dev *dev)
+{
+ u32 val;
+
+ /*
+ * Chip Old value New value
+ * C17 0x1F0FFF01 0x1F01FF01
+ * C18D 0x9F0FFF01 0x9F01FF01
+ *
+ * Northbridge chip version may be determined by
+ * reading the PCI revision ID (0xC1 or greater is C18D).
+ */
+ pci_read_config_dword(dev, 0x6c, &val);
+
+ /*
+ * Apply fixup if needed, but don't touch disconnect state
+ */
+ if ((val & 0x00FF0000) != 0x00010000) {
+ dev_warn(&dev->dev, "nForce2 C1 Halt Disconnect fixup\n");
+ pci_write_config_dword(dev, 0x6c, (val & 0xFF00FFFF) | 0x00010000);
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE2, pci_fixup_nforce2);
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE2, pci_fixup_nforce2);
+
+/* Max PCI Express root ports */
+#define MAX_PCIEROOT 6
+static int quirk_aspm_offset[MAX_PCIEROOT << 3];
+
+#define GET_INDEX(a, b) ((((a) - PCI_DEVICE_ID_INTEL_MCH_PA) << 3) + ((b) & 7))
+
+static int quirk_pcie_aspm_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *value)
+{
+ return raw_pci_read(pci_domain_nr(bus), bus->number,
+ devfn, where, size, value);
+}
+
+/*
+ * Replace the original pci bus ops for write with a new one that will filter
+ * the request to insure ASPM cannot be enabled.
+ */
+static int quirk_pcie_aspm_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 value)
+{
+ u8 offset;
+
+ offset = quirk_aspm_offset[GET_INDEX(bus->self->device, devfn)];
+
+ if ((offset) && (where == offset))
+ value = value & ~PCI_EXP_LNKCTL_ASPMC;
+
+ return raw_pci_write(pci_domain_nr(bus), bus->number,
+ devfn, where, size, value);
+}
+
+static struct pci_ops quirk_pcie_aspm_ops = {
+ .read = quirk_pcie_aspm_read,
+ .write = quirk_pcie_aspm_write,
+};
+
+/*
+ * Prevents PCI Express ASPM (Active State Power Management) being enabled.
+ *
+ * Save the register offset, where the ASPM control bits are located,
+ * for each PCI Express device that is in the device list of
+ * the root port in an array for fast indexing. Replace the bus ops
+ * with the modified one.
+ */
+static void pcie_rootport_aspm_quirk(struct pci_dev *pdev)
+{
+ int i;
+ struct pci_bus *pbus;
+ struct pci_dev *dev;
+
+ if ((pbus = pdev->subordinate) == NULL)
+ return;
+
+ /*
+ * Check if the DID of pdev matches one of the six root ports. This
+ * check is needed in the case this function is called directly by the
+ * hot-plug driver.
+ */
+ if ((pdev->device < PCI_DEVICE_ID_INTEL_MCH_PA) ||
+ (pdev->device > PCI_DEVICE_ID_INTEL_MCH_PC1))
+ return;
+
+ if (list_empty(&pbus->devices)) {
+ /*
+ * If no device is attached to the root port at power-up or
+ * after hot-remove, the pbus->devices is empty and this code
+ * will set the offsets to zero and the bus ops to parent's bus
+ * ops, which is unmodified.
+ */
+ for (i = GET_INDEX(pdev->device, 0); i <= GET_INDEX(pdev->device, 7); ++i)
+ quirk_aspm_offset[i] = 0;
+
+ pci_bus_set_ops(pbus, pbus->parent->ops);
+ } else {
+ /*
+ * If devices are attached to the root port at power-up or
+ * after hot-add, the code loops through the device list of
+ * each root port to save the register offsets and replace the
+ * bus ops.
+ */
+ list_for_each_entry(dev, &pbus->devices, bus_list)
+ /* There are 0 to 8 devices attached to this bus */
+ quirk_aspm_offset[GET_INDEX(pdev->device, dev->devfn)] =
+ dev->pcie_cap + PCI_EXP_LNKCTL;
+
+ pci_bus_set_ops(pbus, &quirk_pcie_aspm_ops);
+ dev_info(&pbus->dev, "writes to ASPM control bits will be ignored\n");
+ }
+
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA, pcie_rootport_aspm_quirk);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA1, pcie_rootport_aspm_quirk);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB, pcie_rootport_aspm_quirk);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB1, pcie_rootport_aspm_quirk);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC, pcie_rootport_aspm_quirk);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_rootport_aspm_quirk);
+
+/*
+ * Fixup to mark boot BIOS video selected by BIOS before it changes
+ *
+ * From information provided by "Jon Smirl" <jonsmirl@gmail.com>
+ *
+ * The standard boot ROM sequence for an x86 machine uses the BIOS
+ * to select an initial video card for boot display. This boot video
+ * card will have its BIOS copied to 0xC0000 in system RAM.
+ * IORESOURCE_ROM_SHADOW is used to associate the boot video
+ * card with this copy. On laptops this copy has to be used since
+ * the main ROM may be compressed or combined with another image.
+ * See pci_map_rom() for use of this flag. Before marking the device
+ * with IORESOURCE_ROM_SHADOW check if a vga_default_device is already set
+ * by either arch code or vga-arbitration; if so only apply the fixup to this
+ * already-determined primary video card.
+ */
+
+static void pci_fixup_video(struct pci_dev *pdev)
+{
+ struct pci_dev *bridge;
+ struct pci_bus *bus;
+ u16 config;
+ struct resource *res;
+
+ /* Is VGA routed to us? */
+ bus = pdev->bus;
+ while (bus) {
+ bridge = bus->self;
+
+ /*
+ * From information provided by
+ * "David Miller" <davem@davemloft.net>
+ * The bridge control register is valid for PCI header
+ * type BRIDGE, or CARDBUS. Host to PCI controllers use
+ * PCI header type NORMAL.
+ */
+ if (bridge && (pci_is_bridge(bridge))) {
+ pci_read_config_word(bridge, PCI_BRIDGE_CONTROL,
+ &config);
+ if (!(config & PCI_BRIDGE_CTL_VGA))
+ return;
+ }
+ bus = bus->parent;
+ }
+ if (!vga_default_device() || pdev == vga_default_device()) {
+ pci_read_config_word(pdev, PCI_COMMAND, &config);
+ if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) {
+ res = &pdev->resource[PCI_ROM_RESOURCE];
+
+ pci_disable_rom(pdev);
+ if (res->parent)
+ release_resource(res);
+
+ res->start = 0xC0000;
+ res->end = res->start + 0x20000 - 1;
+ res->flags = IORESOURCE_MEM | IORESOURCE_ROM_SHADOW |
+ IORESOURCE_PCI_FIXED;
+ dev_info(&pdev->dev, "Video device with shadowed ROM at %pR\n",
+ res);
+ }
+ }
+}
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID,
+ PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video);
+
+
+static const struct dmi_system_id msi_k8t_dmi_table[] = {
+ {
+ .ident = "MSI-K8T-Neo2Fir",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "MSI"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MS-6702E"),
+ },
+ },
+ {}
+};
+
+/*
+ * The AMD-Athlon64 board MSI "K8T Neo2-FIR" disables the onboard sound
+ * card if a PCI-soundcard is added.
+ *
+ * The BIOS only gives options "DISABLED" and "AUTO". This code sets
+ * the corresponding register-value to enable the soundcard.
+ *
+ * The soundcard is only enabled, if the mainborad is identified
+ * via DMI-tables and the soundcard is detected to be off.
+ */
+static void pci_fixup_msi_k8t_onboard_sound(struct pci_dev *dev)
+{
+ unsigned char val;
+ if (!dmi_check_system(msi_k8t_dmi_table))
+ return; /* only applies to MSI K8T Neo2-FIR */
+
+ pci_read_config_byte(dev, 0x50, &val);
+ if (val & 0x40) {
+ pci_write_config_byte(dev, 0x50, val & (~0x40));
+
+ /* verify the change for status output */
+ pci_read_config_byte(dev, 0x50, &val);
+ if (val & 0x40)
+ dev_info(&dev->dev, "Detected MSI K8T Neo2-FIR; "
+ "can't enable onboard soundcard!\n");
+ else
+ dev_info(&dev->dev, "Detected MSI K8T Neo2-FIR; "
+ "enabled onboard soundcard\n");
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
+ pci_fixup_msi_k8t_onboard_sound);
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
+ pci_fixup_msi_k8t_onboard_sound);
+
+/*
+ * Some Toshiba laptops need extra code to enable their TI TSB43AB22/A.
+ *
+ * We pretend to bring them out of full D3 state, and restore the proper
+ * IRQ, PCI cache line size, and BARs, otherwise the device won't function
+ * properly. In some cases, the device will generate an interrupt on
+ * the wrong IRQ line, causing any devices sharing the line it's
+ * *supposed* to use to be disabled by the kernel's IRQ debug code.
+ */
+static u16 toshiba_line_size;
+
+static const struct dmi_system_id toshiba_ohci1394_dmi_table[] = {
+ {
+ .ident = "Toshiba PS5 based laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
+ DMI_MATCH(DMI_PRODUCT_VERSION, "PS5"),
+ },
+ },
+ {
+ .ident = "Toshiba PSM4 based laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
+ DMI_MATCH(DMI_PRODUCT_VERSION, "PSM4"),
+ },
+ },
+ {
+ .ident = "Toshiba A40 based laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
+ DMI_MATCH(DMI_PRODUCT_VERSION, "PSA40U"),
+ },
+ },
+ { }
+};
+
+static void pci_pre_fixup_toshiba_ohci1394(struct pci_dev *dev)
+{
+ if (!dmi_check_system(toshiba_ohci1394_dmi_table))
+ return; /* only applies to certain Toshibas (so far) */
+
+ dev->current_state = PCI_D3cold;
+ pci_read_config_word(dev, PCI_CACHE_LINE_SIZE, &toshiba_line_size);
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_TI, 0x8032,
+ pci_pre_fixup_toshiba_ohci1394);
+
+static void pci_post_fixup_toshiba_ohci1394(struct pci_dev *dev)
+{
+ if (!dmi_check_system(toshiba_ohci1394_dmi_table))
+ return; /* only applies to certain Toshibas (so far) */
+
+ /* Restore config space on Toshiba laptops */
+ pci_write_config_word(dev, PCI_CACHE_LINE_SIZE, toshiba_line_size);
+ pci_read_config_byte(dev, PCI_INTERRUPT_LINE, (u8 *)&dev->irq);
+ pci_write_config_dword(dev, PCI_BASE_ADDRESS_0,
+ pci_resource_start(dev, 0));
+ pci_write_config_dword(dev, PCI_BASE_ADDRESS_1,
+ pci_resource_start(dev, 1));
+}
+DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_TI, 0x8032,
+ pci_post_fixup_toshiba_ohci1394);
+
+
+/*
+ * Prevent the BIOS trapping accesses to the Cyrix CS5530A video device
+ * configuration space.
+ */
+static void pci_early_fixup_cyrix_5530(struct pci_dev *dev)
+{
+ u8 r;
+ /* clear 'F4 Video Configuration Trap' bit */
+ pci_read_config_byte(dev, 0x42, &r);
+ r &= 0xfd;
+ pci_write_config_byte(dev, 0x42, r);
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY,
+ pci_early_fixup_cyrix_5530);
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY,
+ pci_early_fixup_cyrix_5530);
+
+/*
+ * Siemens Nixdorf AG FSC Multiprocessor Interrupt Controller:
+ * prevent update of the BAR0, which doesn't look like a normal BAR.
+ */
+static void pci_siemens_interrupt_controller(struct pci_dev *dev)
+{
+ dev->resource[0].flags |= IORESOURCE_PCI_FIXED;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SIEMENS, 0x0015,
+ pci_siemens_interrupt_controller);
+
+/*
+ * SB600: Disable BAR1 on device 14.0 to avoid HPET resources from
+ * confusing the PCI engine:
+ */
+static void sb600_disable_hpet_bar(struct pci_dev *dev)
+{
+ u8 val;
+
+ /*
+ * The SB600 and SB700 both share the same device
+ * ID, but the PM register 0x55 does something different
+ * for the SB700, so make sure we are dealing with the
+ * SB600 before touching the bit:
+ */
+
+ pci_read_config_byte(dev, 0x08, &val);
+
+ if (val < 0x2F) {
+ outb(0x55, 0xCD6);
+ val = inb(0xCD7);
+
+ /* Set bit 7 in PM register 0x55 */
+ outb(0x55, 0xCD6);
+ outb(val | 0x80, 0xCD7);
+ }
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar);
+
+#ifdef CONFIG_HPET_TIMER
+static void sb600_hpet_quirk(struct pci_dev *dev)
+{
+ struct resource *r = &dev->resource[1];
+
+ if (r->flags & IORESOURCE_MEM && r->start == hpet_address) {
+ r->flags |= IORESOURCE_PCI_FIXED;
+ dev_info(&dev->dev, "reg 0x14 contains HPET; making it immovable\n");
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, 0x4385, sb600_hpet_quirk);
+#endif
+
+/*
+ * Twinhead H12Y needs us to block out a region otherwise we map devices
+ * there and any access kills the box.
+ *
+ * See: https://bugzilla.kernel.org/show_bug.cgi?id=10231
+ *
+ * Match off the LPC and svid/sdid (older kernels lose the bridge subvendor)
+ */
+static void twinhead_reserve_killing_zone(struct pci_dev *dev)
+{
+ if (dev->subsystem_vendor == 0x14FF && dev->subsystem_device == 0xA003) {
+ pr_info("Reserving memory on Twinhead H12Y\n");
+ request_mem_region(0xFFB00000, 0x100000, "twinhead");
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone);
+
+/*
+ * Device [8086:2fc0]
+ * Erratum HSE43
+ * CONFIG_TDP_NOMINAL CSR Implemented at Incorrect Offset
+ * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-spec-update.html
+ *
+ * Devices [8086:6f60,6fa0,6fc0]
+ * Erratum BDF2
+ * PCI BARs in the Home Agent Will Return Non-Zero Values During Enumeration
+ * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html
+ */
+static void pci_invalid_bar(struct pci_dev *dev)
+{
+ dev->non_compliant_bars = 1;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, pci_invalid_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_invalid_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_invalid_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_invalid_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa1ec, pci_invalid_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa1ed, pci_invalid_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa26c, pci_invalid_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa26d, pci_invalid_bar);
+
+/*
+ * Device [1022:7808]
+ * 23. USB Wake on Connect/Disconnect with Low Speed Devices
+ * https://support.amd.com/TechDocs/46837.pdf
+ * Appendix A2
+ * https://support.amd.com/TechDocs/42413.pdf
+ */
+static void pci_fixup_amd_ehci_pme(struct pci_dev *dev)
+{
+ dev_info(&dev->dev, "PME# does not work under D3, disabling it\n");
+ dev->pme_support &= ~((PCI_PM_CAP_PME_D3 | PCI_PM_CAP_PME_D3cold)
+ >> PCI_PM_CAP_PME_SHIFT);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x7808, pci_fixup_amd_ehci_pme);
+
+/*
+ * Device [1022:7914]
+ * When in D0, PME# doesn't get asserted when plugging USB 2.0 device.
+ */
+static void pci_fixup_amd_fch_xhci_pme(struct pci_dev *dev)
+{
+ dev_info(&dev->dev, "PME# does not work under D0, disabling it\n");
+ dev->pme_support &= ~(PCI_PM_CAP_PME_D0 >> PCI_PM_CAP_PME_SHIFT);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x7914, pci_fixup_amd_fch_xhci_pme);
+
+/*
+ * Apple MacBook Pro: Avoid [mem 0x7fa00000-0x7fbfffff]
+ *
+ * Using the [mem 0x7fa00000-0x7fbfffff] region, e.g., by assigning it to
+ * the 00:1c.0 Root Port, causes a conflict with [io 0x1804], which is used
+ * for soft poweroff and suspend-to-RAM.
+ *
+ * As far as we know, this is related to the address space, not to the Root
+ * Port itself. Attaching the quirk to the Root Port is a convenience, but
+ * it could probably also be a standalone DMI quirk.
+ *
+ * https://bugzilla.kernel.org/show_bug.cgi?id=103211
+ */
+static void quirk_apple_mbp_poweroff(struct pci_dev *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct resource *res;
+
+ if ((!dmi_match(DMI_PRODUCT_NAME, "MacBookPro11,4") &&
+ !dmi_match(DMI_PRODUCT_NAME, "MacBookPro11,5")) ||
+ pdev->bus->number != 0 || pdev->devfn != PCI_DEVFN(0x1c, 0))
+ return;
+
+ res = request_mem_region(0x7fa00000, 0x200000,
+ "MacBook Pro poweroff workaround");
+ if (res)
+ dev_info(dev, "claimed %s %pR\n", res->name, res);
+ else
+ dev_info(dev, "can't work around MacBook Pro poweroff issue\n");
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, quirk_apple_mbp_poweroff);
+
+/*
+ * VMD-enabled root ports will change the source ID for all messages
+ * to the VMD device. Rather than doing device matching with the source
+ * ID, the AER driver should traverse the child device tree, reading
+ * AER registers to find the faulting device.
+ */
+static void quirk_no_aersid(struct pci_dev *pdev)
+{
+ /* VMD Domain */
+ if (is_vmd(pdev->bus) && pci_is_root_bus(pdev->bus))
+ pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID;
+}
+DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, PCI_ANY_ID,
+ PCI_CLASS_BRIDGE_PCI, 8, quirk_no_aersid);
+
+static void quirk_intel_th_dnv(struct pci_dev *dev)
+{
+ struct resource *r = &dev->resource[4];
+
+ /*
+ * Denverton reports 2k of RTIT_BAR (intel_th resource 4), which
+ * appears to be 4 MB in reality.
+ */
+ if (r->end == r->start + 0x7ff) {
+ r->start = 0;
+ r->end = 0x3fffff;
+ r->flags |= IORESOURCE_UNSET;
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x19e1, quirk_intel_th_dnv);
+
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+
+#define AMD_141b_MMIO_BASE(x) (0x80 + (x) * 0x8)
+#define AMD_141b_MMIO_BASE_RE_MASK BIT(0)
+#define AMD_141b_MMIO_BASE_WE_MASK BIT(1)
+#define AMD_141b_MMIO_BASE_MMIOBASE_MASK GENMASK(31,8)
+
+#define AMD_141b_MMIO_LIMIT(x) (0x84 + (x) * 0x8)
+#define AMD_141b_MMIO_LIMIT_MMIOLIMIT_MASK GENMASK(31,8)
+
+#define AMD_141b_MMIO_HIGH(x) (0x180 + (x) * 0x4)
+#define AMD_141b_MMIO_HIGH_MMIOBASE_MASK GENMASK(7,0)
+#define AMD_141b_MMIO_HIGH_MMIOLIMIT_SHIFT 16
+#define AMD_141b_MMIO_HIGH_MMIOLIMIT_MASK GENMASK(23,16)
+
+/*
+ * The PCI Firmware Spec, rev 3.2, notes that ACPI should optionally allow
+ * configuring host bridge windows using the _PRS and _SRS methods.
+ *
+ * But this is rarely implemented, so we manually enable a large 64bit BAR for
+ * PCIe device on AMD Family 15h (Models 00h-1fh, 30h-3fh, 60h-7fh) Processors
+ * here.
+ */
+static void pci_amd_enable_64bit_bar(struct pci_dev *dev)
+{
+ static const char *name = "PCI Bus 0000:00";
+ struct resource *res, *conflict;
+ u32 base, limit, high;
+ struct pci_dev *other;
+ unsigned i;
+
+ if (!(pci_probe & PCI_BIG_ROOT_WINDOW))
+ return;
+
+ /* Check that we are the only device of that type */
+ other = pci_get_device(dev->vendor, dev->device, NULL);
+ if (other != dev ||
+ (other = pci_get_device(dev->vendor, dev->device, other))) {
+ /* This is a multi-socket system, don't touch it for now */
+ pci_dev_put(other);
+ return;
+ }
+
+ for (i = 0; i < 8; i++) {
+ pci_read_config_dword(dev, AMD_141b_MMIO_BASE(i), &base);
+ pci_read_config_dword(dev, AMD_141b_MMIO_HIGH(i), &high);
+
+ /* Is this slot free? */
+ if (!(base & (AMD_141b_MMIO_BASE_RE_MASK |
+ AMD_141b_MMIO_BASE_WE_MASK)))
+ break;
+
+ base >>= 8;
+ base |= high << 24;
+
+ /* Abort if a slot already configures a 64bit BAR. */
+ if (base > 0x10000)
+ return;
+ }
+ if (i == 8)
+ return;
+
+ res = kzalloc(sizeof(*res), GFP_KERNEL);
+ if (!res)
+ return;
+
+ /*
+ * Allocate a 256GB window directly below the 0xfd00000000 hardware
+ * limit (see AMD Family 15h Models 30h-3Fh BKDG, sec 2.4.6).
+ */
+ res->name = name;
+ res->flags = IORESOURCE_PREFETCH | IORESOURCE_MEM |
+ IORESOURCE_MEM_64 | IORESOURCE_WINDOW;
+ res->start = 0xbd00000000ull;
+ res->end = 0xfd00000000ull - 1;
+
+ conflict = request_resource_conflict(&iomem_resource, res);
+ if (conflict) {
+ kfree(res);
+ if (conflict->name != name)
+ return;
+
+ /* We are resuming from suspend; just reenable the window */
+ res = conflict;
+ } else {
+ dev_info(&dev->dev, "adding root bus resource %pR (tainting kernel)\n",
+ res);
+ add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
+ pci_bus_add_resource(dev->bus, res, 0);
+ }
+
+ base = ((res->start >> 8) & AMD_141b_MMIO_BASE_MMIOBASE_MASK) |
+ AMD_141b_MMIO_BASE_RE_MASK | AMD_141b_MMIO_BASE_WE_MASK;
+ limit = ((res->end + 1) >> 8) & AMD_141b_MMIO_LIMIT_MMIOLIMIT_MASK;
+ high = ((res->start >> 40) & AMD_141b_MMIO_HIGH_MMIOBASE_MASK) |
+ ((((res->end + 1) >> 40) << AMD_141b_MMIO_HIGH_MMIOLIMIT_SHIFT)
+ & AMD_141b_MMIO_HIGH_MMIOLIMIT_MASK);
+
+ pci_write_config_dword(dev, AMD_141b_MMIO_HIGH(i), high);
+ pci_write_config_dword(dev, AMD_141b_MMIO_LIMIT(i), limit);
+ pci_write_config_dword(dev, AMD_141b_MMIO_BASE(i), base);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1401, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x141b, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1401, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x141b, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar);
+
+#define RS690_LOWER_TOP_OF_DRAM2 0x30
+#define RS690_LOWER_TOP_OF_DRAM2_VALID 0x1
+#define RS690_UPPER_TOP_OF_DRAM2 0x31
+#define RS690_HTIU_NB_INDEX 0xA8
+#define RS690_HTIU_NB_INDEX_WR_ENABLE 0x100
+#define RS690_HTIU_NB_DATA 0xAC
+
+/*
+ * Some BIOS implementations support RAM above 4GB, but do not configure the
+ * PCI host to respond to bus master accesses for these addresses. These
+ * implementations set the TOP_OF_DRAM_SLOT1 register correctly, so PCI DMA
+ * works as expected for addresses below 4GB.
+ *
+ * Reference: "AMD RS690 ASIC Family Register Reference Guide" (pg. 2-57)
+ * https://www.amd.com/system/files/TechDocs/43372_rs690_rrg_3.00o.pdf
+ */
+static void rs690_fix_64bit_dma(struct pci_dev *pdev)
+{
+ u32 val = 0;
+ phys_addr_t top_of_dram = __pa(high_memory - 1) + 1;
+
+ if (top_of_dram <= (1ULL << 32))
+ return;
+
+ pci_write_config_dword(pdev, RS690_HTIU_NB_INDEX,
+ RS690_LOWER_TOP_OF_DRAM2);
+ pci_read_config_dword(pdev, RS690_HTIU_NB_DATA, &val);
+
+ if (val)
+ return;
+
+ pci_info(pdev, "Adjusting top of DRAM to %pa for 64-bit DMA support\n", &top_of_dram);
+
+ pci_write_config_dword(pdev, RS690_HTIU_NB_INDEX,
+ RS690_UPPER_TOP_OF_DRAM2 | RS690_HTIU_NB_INDEX_WR_ENABLE);
+ pci_write_config_dword(pdev, RS690_HTIU_NB_DATA, top_of_dram >> 32);
+
+ pci_write_config_dword(pdev, RS690_HTIU_NB_INDEX,
+ RS690_LOWER_TOP_OF_DRAM2 | RS690_HTIU_NB_INDEX_WR_ENABLE);
+ pci_write_config_dword(pdev, RS690_HTIU_NB_DATA,
+ top_of_dram | RS690_LOWER_TOP_OF_DRAM2_VALID);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7910, rs690_fix_64bit_dma);
+
+#endif
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
new file mode 100644
index 000000000..ed4ac2153
--- /dev/null
+++ b/arch/x86/pci/i386.c
@@ -0,0 +1,409 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Low-Level PCI Access for i386 machines
+ *
+ * Copyright 1993, 1994 Drew Eckhardt
+ * Visionary Computing
+ * (Unix and Linux consulting and custom programming)
+ * Drew@Colorado.EDU
+ * +1 (303) 786-7975
+ *
+ * Drew's work was sponsored by:
+ * iX Multiuser Multitasking Magazine
+ * Hannover, Germany
+ * hm@ix.de
+ *
+ * Copyright 1997--2000 Martin Mares <mj@ucw.cz>
+ *
+ * For more information, please consult the following manuals (look at
+ * http://www.pcisig.com/ for how to get them):
+ *
+ * PCI BIOS Specification
+ * PCI Local Bus Specification
+ * PCI to PCI Bridge Specification
+ * PCI System Design Guide
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/errno.h>
+#include <linux/bootmem.h>
+
+#include <asm/pat.h>
+#include <asm/e820/api.h>
+#include <asm/pci_x86.h>
+#include <asm/io_apic.h>
+
+
+/*
+ * This list of dynamic mappings is for temporarily maintaining
+ * original BIOS BAR addresses for possible reinstatement.
+ */
+struct pcibios_fwaddrmap {
+ struct list_head list;
+ struct pci_dev *dev;
+ resource_size_t fw_addr[DEVICE_COUNT_RESOURCE];
+};
+
+static LIST_HEAD(pcibios_fwaddrmappings);
+static DEFINE_SPINLOCK(pcibios_fwaddrmap_lock);
+static bool pcibios_fw_addr_done;
+
+/* Must be called with 'pcibios_fwaddrmap_lock' lock held. */
+static struct pcibios_fwaddrmap *pcibios_fwaddrmap_lookup(struct pci_dev *dev)
+{
+ struct pcibios_fwaddrmap *map;
+
+ WARN_ON_SMP(!spin_is_locked(&pcibios_fwaddrmap_lock));
+
+ list_for_each_entry(map, &pcibios_fwaddrmappings, list)
+ if (map->dev == dev)
+ return map;
+
+ return NULL;
+}
+
+static void
+pcibios_save_fw_addr(struct pci_dev *dev, int idx, resource_size_t fw_addr)
+{
+ unsigned long flags;
+ struct pcibios_fwaddrmap *map;
+
+ if (pcibios_fw_addr_done)
+ return;
+
+ spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags);
+ map = pcibios_fwaddrmap_lookup(dev);
+ if (!map) {
+ spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags);
+ map = kzalloc(sizeof(*map), GFP_KERNEL);
+ if (!map)
+ return;
+
+ map->dev = pci_dev_get(dev);
+ map->fw_addr[idx] = fw_addr;
+ INIT_LIST_HEAD(&map->list);
+
+ spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags);
+ list_add_tail(&map->list, &pcibios_fwaddrmappings);
+ } else
+ map->fw_addr[idx] = fw_addr;
+ spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags);
+}
+
+resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx)
+{
+ unsigned long flags;
+ struct pcibios_fwaddrmap *map;
+ resource_size_t fw_addr = 0;
+
+ if (pcibios_fw_addr_done)
+ return 0;
+
+ spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags);
+ map = pcibios_fwaddrmap_lookup(dev);
+ if (map)
+ fw_addr = map->fw_addr[idx];
+ spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags);
+
+ return fw_addr;
+}
+
+static void __init pcibios_fw_addr_list_del(void)
+{
+ unsigned long flags;
+ struct pcibios_fwaddrmap *entry, *next;
+
+ spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags);
+ list_for_each_entry_safe(entry, next, &pcibios_fwaddrmappings, list) {
+ list_del(&entry->list);
+ pci_dev_put(entry->dev);
+ kfree(entry);
+ }
+ spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags);
+ pcibios_fw_addr_done = true;
+}
+
+static int
+skip_isa_ioresource_align(struct pci_dev *dev) {
+
+ if ((pci_probe & PCI_CAN_SKIP_ISA_ALIGN) &&
+ !(dev->bus->bridge_ctl & PCI_BRIDGE_CTL_ISA))
+ return 1;
+ return 0;
+}
+
+/*
+ * We need to avoid collisions with `mirrored' VGA ports
+ * and other strange ISA hardware, so we always want the
+ * addresses to be allocated in the 0x000-0x0ff region
+ * modulo 0x400.
+ *
+ * Why? Because some silly external IO cards only decode
+ * the low 10 bits of the IO address. The 0x00-0xff region
+ * is reserved for motherboard devices that decode all 16
+ * bits, so it's ok to allocate at, say, 0x2800-0x28ff,
+ * but we want to try to avoid allocating at 0x2900-0x2bff
+ * which might have be mirrored at 0x0100-0x03ff..
+ */
+resource_size_t
+pcibios_align_resource(void *data, const struct resource *res,
+ resource_size_t size, resource_size_t align)
+{
+ struct pci_dev *dev = data;
+ resource_size_t start = res->start;
+
+ if (res->flags & IORESOURCE_IO) {
+ if (skip_isa_ioresource_align(dev))
+ return start;
+ if (start & 0x300)
+ start = (start + 0x3ff) & ~0x3ff;
+ } else if (res->flags & IORESOURCE_MEM) {
+ /* The low 1MB range is reserved for ISA cards */
+ if (start < BIOS_END)
+ start = BIOS_END;
+ }
+ return start;
+}
+EXPORT_SYMBOL(pcibios_align_resource);
+
+/*
+ * Handle resources of PCI devices. If the world were perfect, we could
+ * just allocate all the resource regions and do nothing more. It isn't.
+ * On the other hand, we cannot just re-allocate all devices, as it would
+ * require us to know lots of host bridge internals. So we attempt to
+ * keep as much of the original configuration as possible, but tweak it
+ * when it's found to be wrong.
+ *
+ * Known BIOS problems we have to work around:
+ * - I/O or memory regions not configured
+ * - regions configured, but not enabled in the command register
+ * - bogus I/O addresses above 64K used
+ * - expansion ROMs left enabled (this may sound harmless, but given
+ * the fact the PCI specs explicitly allow address decoders to be
+ * shared between expansion ROMs and other resource regions, it's
+ * at least dangerous)
+ * - bad resource sizes or overlaps with other regions
+ *
+ * Our solution:
+ * (1) Allocate resources for all buses behind PCI-to-PCI bridges.
+ * This gives us fixed barriers on where we can allocate.
+ * (2) Allocate resources for all enabled devices. If there is
+ * a collision, just mark the resource as unallocated. Also
+ * disable expansion ROMs during this step.
+ * (3) Try to allocate resources for disabled devices. If the
+ * resources were assigned correctly, everything goes well,
+ * if they weren't, they won't disturb allocation of other
+ * resources.
+ * (4) Assign new addresses to resources which were either
+ * not configured at all or misconfigured. If explicitly
+ * requested by the user, configure expansion ROM address
+ * as well.
+ */
+
+static void pcibios_allocate_bridge_resources(struct pci_dev *dev)
+{
+ int idx;
+ struct resource *r;
+
+ for (idx = PCI_BRIDGE_RESOURCES; idx < PCI_NUM_RESOURCES; idx++) {
+ r = &dev->resource[idx];
+ if (!r->flags)
+ continue;
+ if (r->parent) /* Already allocated */
+ continue;
+ if (!r->start || pci_claim_bridge_resource(dev, idx) < 0) {
+ /*
+ * Something is wrong with the region.
+ * Invalidate the resource to prevent
+ * child resource allocations in this
+ * range.
+ */
+ r->start = r->end = 0;
+ r->flags = 0;
+ }
+ }
+}
+
+static void pcibios_allocate_bus_resources(struct pci_bus *bus)
+{
+ struct pci_bus *child;
+
+ /* Depth-First Search on bus tree */
+ if (bus->self)
+ pcibios_allocate_bridge_resources(bus->self);
+ list_for_each_entry(child, &bus->children, node)
+ pcibios_allocate_bus_resources(child);
+}
+
+struct pci_check_idx_range {
+ int start;
+ int end;
+};
+
+static void pcibios_allocate_dev_resources(struct pci_dev *dev, int pass)
+{
+ int idx, disabled, i;
+ u16 command;
+ struct resource *r;
+
+ struct pci_check_idx_range idx_range[] = {
+ { PCI_STD_RESOURCES, PCI_STD_RESOURCE_END },
+#ifdef CONFIG_PCI_IOV
+ { PCI_IOV_RESOURCES, PCI_IOV_RESOURCE_END },
+#endif
+ };
+
+ pci_read_config_word(dev, PCI_COMMAND, &command);
+ for (i = 0; i < ARRAY_SIZE(idx_range); i++)
+ for (idx = idx_range[i].start; idx <= idx_range[i].end; idx++) {
+ r = &dev->resource[idx];
+ if (r->parent) /* Already allocated */
+ continue;
+ if (!r->start) /* Address not assigned at all */
+ continue;
+ if (r->flags & IORESOURCE_IO)
+ disabled = !(command & PCI_COMMAND_IO);
+ else
+ disabled = !(command & PCI_COMMAND_MEMORY);
+ if (pass == disabled) {
+ dev_dbg(&dev->dev,
+ "BAR %d: reserving %pr (d=%d, p=%d)\n",
+ idx, r, disabled, pass);
+ if (pci_claim_resource(dev, idx) < 0) {
+ if (r->flags & IORESOURCE_PCI_FIXED) {
+ dev_info(&dev->dev, "BAR %d %pR is immovable\n",
+ idx, r);
+ } else {
+ /* We'll assign a new address later */
+ pcibios_save_fw_addr(dev,
+ idx, r->start);
+ r->end -= r->start;
+ r->start = 0;
+ }
+ }
+ }
+ }
+ if (!pass) {
+ r = &dev->resource[PCI_ROM_RESOURCE];
+ if (r->flags & IORESOURCE_ROM_ENABLE) {
+ /* Turn the ROM off, leave the resource region,
+ * but keep it unregistered. */
+ u32 reg;
+ dev_dbg(&dev->dev, "disabling ROM %pR\n", r);
+ r->flags &= ~IORESOURCE_ROM_ENABLE;
+ pci_read_config_dword(dev, dev->rom_base_reg, &reg);
+ pci_write_config_dword(dev, dev->rom_base_reg,
+ reg & ~PCI_ROM_ADDRESS_ENABLE);
+ }
+ }
+}
+
+static void pcibios_allocate_resources(struct pci_bus *bus, int pass)
+{
+ struct pci_dev *dev;
+ struct pci_bus *child;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ pcibios_allocate_dev_resources(dev, pass);
+
+ child = dev->subordinate;
+ if (child)
+ pcibios_allocate_resources(child, pass);
+ }
+}
+
+static void pcibios_allocate_dev_rom_resource(struct pci_dev *dev)
+{
+ struct resource *r;
+
+ /*
+ * Try to use BIOS settings for ROMs, otherwise let
+ * pci_assign_unassigned_resources() allocate the new
+ * addresses.
+ */
+ r = &dev->resource[PCI_ROM_RESOURCE];
+ if (!r->flags || !r->start)
+ return;
+ if (r->parent) /* Already allocated */
+ return;
+
+ if (pci_claim_resource(dev, PCI_ROM_RESOURCE) < 0) {
+ r->end -= r->start;
+ r->start = 0;
+ }
+}
+static void pcibios_allocate_rom_resources(struct pci_bus *bus)
+{
+ struct pci_dev *dev;
+ struct pci_bus *child;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ pcibios_allocate_dev_rom_resource(dev);
+
+ child = dev->subordinate;
+ if (child)
+ pcibios_allocate_rom_resources(child);
+ }
+}
+
+static int __init pcibios_assign_resources(void)
+{
+ struct pci_bus *bus;
+
+ if (!(pci_probe & PCI_ASSIGN_ROMS))
+ list_for_each_entry(bus, &pci_root_buses, node)
+ pcibios_allocate_rom_resources(bus);
+
+ pci_assign_unassigned_resources();
+ pcibios_fw_addr_list_del();
+
+ return 0;
+}
+
+/**
+ * called in fs_initcall (one below subsys_initcall),
+ * give a chance for motherboard reserve resources
+ */
+fs_initcall(pcibios_assign_resources);
+
+void pcibios_resource_survey_bus(struct pci_bus *bus)
+{
+ dev_printk(KERN_DEBUG, &bus->dev, "Allocating resources\n");
+
+ pcibios_allocate_bus_resources(bus);
+
+ pcibios_allocate_resources(bus, 0);
+ pcibios_allocate_resources(bus, 1);
+
+ if (!(pci_probe & PCI_ASSIGN_ROMS))
+ pcibios_allocate_rom_resources(bus);
+}
+
+void __init pcibios_resource_survey(void)
+{
+ struct pci_bus *bus;
+
+ DBG("PCI: Allocating resources\n");
+
+ list_for_each_entry(bus, &pci_root_buses, node)
+ pcibios_allocate_bus_resources(bus);
+
+ list_for_each_entry(bus, &pci_root_buses, node)
+ pcibios_allocate_resources(bus, 0);
+ list_for_each_entry(bus, &pci_root_buses, node)
+ pcibios_allocate_resources(bus, 1);
+
+ e820__reserve_resources_late();
+ /*
+ * Insert the IO APIC resources after PCI initialization has
+ * occurred to handle IO APICS that are mapped in on a BAR in
+ * PCI space, but before trying to assign unassigned pci res.
+ */
+ ioapic_insert_resources();
+}
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
new file mode 100644
index 000000000..5fc617edf
--- /dev/null
+++ b/arch/x86/pci/init.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <asm/pci_x86.h>
+#include <asm/x86_init.h>
+
+/* arch_initcall has too random ordering, so call the initializers
+ in the right sequence from here. */
+static __init int pci_arch_init(void)
+{
+#ifdef CONFIG_PCI_DIRECT
+ int type = 0;
+
+ type = pci_direct_probe();
+#endif
+
+ if (!(pci_probe & PCI_PROBE_NOEARLY))
+ pci_mmcfg_early_init();
+
+ if (x86_init.pci.arch_init && !x86_init.pci.arch_init())
+ return 0;
+
+#ifdef CONFIG_PCI_BIOS
+ pci_pcbios_init();
+#endif
+ /*
+ * don't check for raw_pci_ops here because we want pcbios as last
+ * fallback, yet it's needed to run first to set pcibios_last_bus
+ * in case legacy PCI probing is used. otherwise detecting peer busses
+ * fails.
+ */
+#ifdef CONFIG_PCI_DIRECT
+ pci_direct_init(type);
+#endif
+ if (!raw_pci_ops && !raw_pci_ext_ops)
+ printk(KERN_ERR
+ "PCI: Fatal: No config space access function found\n");
+
+ dmi_check_pciprobe();
+
+ dmi_check_skip_isa_align();
+
+ return 0;
+}
+arch_initcall(pci_arch_init);
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
new file mode 100644
index 000000000..eea5a0f3b
--- /dev/null
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -0,0 +1,392 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel MID PCI support
+ * Copyright (c) 2008 Intel Corporation
+ * Jesse Barnes <jesse.barnes@intel.com>
+ *
+ * Moorestown has an interesting PCI implementation:
+ * - configuration space is memory mapped (as defined by MCFG)
+ * - Lincroft devices also have a real, type 1 configuration space
+ * - Early Lincroft silicon has a type 1 access bug that will cause
+ * a hang if non-existent devices are accessed
+ * - some devices have the "fixed BAR" capability, which means
+ * they can't be relocated or modified; check for that during
+ * BAR sizing
+ *
+ * So, we use the MCFG space for all reads and writes, but also send
+ * Lincroft writes to type 1 space. But only read/write if the device
+ * actually exists, otherwise return all 1s for reads and bit bucket
+ * the writes.
+ */
+
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/ioport.h>
+#include <linux/init.h>
+#include <linux/dmi.h>
+#include <linux/acpi.h>
+#include <linux/io.h>
+#include <linux/smp.h>
+
+#include <asm/segment.h>
+#include <asm/pci_x86.h>
+#include <asm/hw_irq.h>
+#include <asm/io_apic.h>
+#include <asm/intel-mid.h>
+#include <asm/acpi.h>
+
+#define PCIE_CAP_OFFSET 0x100
+
+/* Quirks for the listed devices */
+#define PCI_DEVICE_ID_INTEL_MRFLD_MMC 0x1190
+#define PCI_DEVICE_ID_INTEL_MRFLD_HSU 0x1191
+
+/* Fixed BAR fields */
+#define PCIE_VNDR_CAP_ID_FIXED_BAR 0x00 /* Fixed BAR (TBD) */
+#define PCI_FIXED_BAR_0_SIZE 0x04
+#define PCI_FIXED_BAR_1_SIZE 0x08
+#define PCI_FIXED_BAR_2_SIZE 0x0c
+#define PCI_FIXED_BAR_3_SIZE 0x10
+#define PCI_FIXED_BAR_4_SIZE 0x14
+#define PCI_FIXED_BAR_5_SIZE 0x1c
+
+static int pci_soc_mode;
+
+/**
+ * fixed_bar_cap - return the offset of the fixed BAR cap if found
+ * @bus: PCI bus
+ * @devfn: device in question
+ *
+ * Look for the fixed BAR cap on @bus and @devfn, returning its offset
+ * if found or 0 otherwise.
+ */
+static int fixed_bar_cap(struct pci_bus *bus, unsigned int devfn)
+{
+ int pos;
+ u32 pcie_cap = 0, cap_data;
+
+ pos = PCIE_CAP_OFFSET;
+
+ if (!raw_pci_ext_ops)
+ return 0;
+
+ while (pos) {
+ if (raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
+ devfn, pos, 4, &pcie_cap))
+ return 0;
+
+ if (PCI_EXT_CAP_ID(pcie_cap) == 0x0000 ||
+ PCI_EXT_CAP_ID(pcie_cap) == 0xffff)
+ break;
+
+ if (PCI_EXT_CAP_ID(pcie_cap) == PCI_EXT_CAP_ID_VNDR) {
+ raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
+ devfn, pos + 4, 4, &cap_data);
+ if ((cap_data & 0xffff) == PCIE_VNDR_CAP_ID_FIXED_BAR)
+ return pos;
+ }
+
+ pos = PCI_EXT_CAP_NEXT(pcie_cap);
+ }
+
+ return 0;
+}
+
+static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn,
+ int reg, int len, u32 val, int offset)
+{
+ u32 size;
+ unsigned int domain, busnum;
+ int bar = (reg - PCI_BASE_ADDRESS_0) >> 2;
+
+ domain = pci_domain_nr(bus);
+ busnum = bus->number;
+
+ if (val == ~0 && len == 4) {
+ unsigned long decode;
+
+ raw_pci_ext_ops->read(domain, busnum, devfn,
+ offset + 8 + (bar * 4), 4, &size);
+
+ /* Turn the size into a decode pattern for the sizing code */
+ if (size) {
+ decode = size - 1;
+ decode |= decode >> 1;
+ decode |= decode >> 2;
+ decode |= decode >> 4;
+ decode |= decode >> 8;
+ decode |= decode >> 16;
+ decode++;
+ decode = ~(decode - 1);
+ } else {
+ decode = 0;
+ }
+
+ /*
+ * If val is all ones, the core code is trying to size the reg,
+ * so update the mmconfig space with the real size.
+ *
+ * Note: this assumes the fixed size we got is a power of two.
+ */
+ return raw_pci_ext_ops->write(domain, busnum, devfn, reg, 4,
+ decode);
+ }
+
+ /* This is some other kind of BAR write, so just do it. */
+ return raw_pci_ext_ops->write(domain, busnum, devfn, reg, len, val);
+}
+
+/**
+ * type1_access_ok - check whether to use type 1
+ * @bus: bus number
+ * @devfn: device & function in question
+ *
+ * If the bus is on a Lincroft chip and it exists, or is not on a Lincroft at
+ * all, the we can go ahead with any reads & writes. If it's on a Lincroft,
+ * but doesn't exist, avoid the access altogether to keep the chip from
+ * hanging.
+ */
+static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg)
+{
+ /*
+ * This is a workaround for A0 LNC bug where PCI status register does
+ * not have new CAP bit set. can not be written by SW either.
+ *
+ * PCI header type in real LNC indicates a single function device, this
+ * will prevent probing other devices under the same function in PCI
+ * shim. Therefore, use the header type in shim instead.
+ */
+ if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE)
+ return false;
+ if (bus == 0 && (devfn == PCI_DEVFN(2, 0)
+ || devfn == PCI_DEVFN(0, 0)
+ || devfn == PCI_DEVFN(3, 0)))
+ return true;
+ return false; /* Langwell on others */
+}
+
+static int pci_read(struct pci_bus *bus, unsigned int devfn, int where,
+ int size, u32 *value)
+{
+ if (type1_access_ok(bus->number, devfn, where))
+ return pci_direct_conf1.read(pci_domain_nr(bus), bus->number,
+ devfn, where, size, value);
+ return raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
+ devfn, where, size, value);
+}
+
+static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
+ int size, u32 value)
+{
+ int offset;
+
+ /*
+ * On MRST, there is no PCI ROM BAR, this will cause a subsequent read
+ * to ROM BAR return 0 then being ignored.
+ */
+ if (where == PCI_ROM_ADDRESS)
+ return 0;
+
+ /*
+ * Devices with fixed BARs need special handling:
+ * - BAR sizing code will save, write ~0, read size, restore
+ * - so writes to fixed BARs need special handling
+ * - other writes to fixed BAR devices should go through mmconfig
+ */
+ offset = fixed_bar_cap(bus, devfn);
+ if (offset &&
+ (where >= PCI_BASE_ADDRESS_0 && where <= PCI_BASE_ADDRESS_5)) {
+ return pci_device_update_fixed(bus, devfn, where, size, value,
+ offset);
+ }
+
+ /*
+ * On Moorestown update both real & mmconfig space
+ * Note: early Lincroft silicon can't handle type 1 accesses to
+ * non-existent devices, so just eat the write in that case.
+ */
+ if (type1_access_ok(bus->number, devfn, where))
+ return pci_direct_conf1.write(pci_domain_nr(bus), bus->number,
+ devfn, where, size, value);
+ return raw_pci_ext_ops->write(pci_domain_nr(bus), bus->number, devfn,
+ where, size, value);
+}
+
+static int intel_mid_pci_irq_enable(struct pci_dev *dev)
+{
+ struct irq_alloc_info info;
+ int polarity;
+ int ret;
+ u8 gsi;
+
+ if (dev->irq_managed && dev->irq > 0)
+ return 0;
+
+ ret = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
+ if (ret < 0) {
+ dev_warn(&dev->dev, "Failed to read interrupt line: %d\n", ret);
+ return ret;
+ }
+
+ switch (intel_mid_identify_cpu()) {
+ case INTEL_MID_CPU_CHIP_TANGIER:
+ polarity = IOAPIC_POL_HIGH;
+
+ /* Special treatment for IRQ0 */
+ if (gsi == 0) {
+ /*
+ * Skip HS UART common registers device since it has
+ * IRQ0 assigned and not used by the kernel.
+ */
+ if (dev->device == PCI_DEVICE_ID_INTEL_MRFLD_HSU)
+ return -EBUSY;
+ /*
+ * TNG has IRQ0 assigned to eMMC controller. But there
+ * are also other devices with bogus PCI configuration
+ * that have IRQ0 assigned. This check ensures that
+ * eMMC gets it. The rest of devices still could be
+ * enabled without interrupt line being allocated.
+ */
+ if (dev->device != PCI_DEVICE_ID_INTEL_MRFLD_MMC)
+ return 0;
+ }
+ break;
+ default:
+ polarity = IOAPIC_POL_LOW;
+ break;
+ }
+
+ ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity);
+
+ /*
+ * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to
+ * IOAPIC RTE entries, so we just enable RTE for the device.
+ */
+ ret = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info);
+ if (ret < 0)
+ return ret;
+
+ dev->irq = ret;
+ dev->irq_managed = 1;
+
+ return 0;
+}
+
+static void intel_mid_pci_irq_disable(struct pci_dev *dev)
+{
+ if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed &&
+ dev->irq > 0) {
+ mp_unmap_irq(dev->irq);
+ dev->irq_managed = 0;
+ }
+}
+
+static const struct pci_ops intel_mid_pci_ops __initconst = {
+ .read = pci_read,
+ .write = pci_write,
+};
+
+/**
+ * intel_mid_pci_init - installs intel_mid_pci_ops
+ *
+ * Moorestown has an interesting PCI implementation (see above).
+ * Called when the early platform detection installs it.
+ */
+int __init intel_mid_pci_init(void)
+{
+ pr_info("Intel MID platform detected, using MID PCI ops\n");
+ pci_mmcfg_late_init();
+ pcibios_enable_irq = intel_mid_pci_irq_enable;
+ pcibios_disable_irq = intel_mid_pci_irq_disable;
+ pci_root_ops = intel_mid_pci_ops;
+ pci_soc_mode = 1;
+ /* Continue with standard init */
+ acpi_noirq_set();
+ return 1;
+}
+
+/*
+ * Langwell devices are not true PCI devices; they are not subject to 10 ms
+ * d3 to d0 delay required by PCI spec.
+ */
+static void pci_d3delay_fixup(struct pci_dev *dev)
+{
+ /*
+ * PCI fixups are effectively decided compile time. If we have a dual
+ * SoC/non-SoC kernel we don't want to mangle d3 on non-SoC devices.
+ */
+ if (!pci_soc_mode)
+ return;
+ /*
+ * True PCI devices in Lincroft should allow type 1 access, the rest
+ * are Langwell fake PCI devices.
+ */
+ if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID))
+ return;
+ dev->d3_delay = 0;
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup);
+
+static void mid_power_off_one_device(struct pci_dev *dev)
+{
+ u16 pmcsr;
+
+ /*
+ * Update current state first, otherwise PCI core enforces PCI_D0 in
+ * pci_set_power_state() for devices which status was PCI_UNKNOWN.
+ */
+ pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
+ dev->current_state = (pci_power_t __force)(pmcsr & PCI_PM_CTRL_STATE_MASK);
+
+ pci_set_power_state(dev, PCI_D3hot);
+}
+
+static void mid_power_off_devices(struct pci_dev *dev)
+{
+ int id;
+
+ if (!pci_soc_mode)
+ return;
+
+ id = intel_mid_pwr_get_lss_id(dev);
+ if (id < 0)
+ return;
+
+ /*
+ * This sets only PMCSR bits. The actual power off will happen in
+ * arch/x86/platform/intel-mid/pwr.c.
+ */
+ mid_power_off_one_device(dev);
+}
+
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, mid_power_off_devices);
+
+/*
+ * Langwell devices reside at fixed offsets, don't try to move them.
+ */
+static void pci_fixed_bar_fixup(struct pci_dev *dev)
+{
+ unsigned long offset;
+ u32 size;
+ int i;
+
+ if (!pci_soc_mode)
+ return;
+
+ /* Must have extended configuration space */
+ if (dev->cfg_size < PCIE_CAP_OFFSET + 4)
+ return;
+
+ /* Fixup the BAR sizes for fixed BAR devices and make them unmoveable */
+ offset = fixed_bar_cap(dev->bus, dev->devfn);
+ if (!offset || PCI_DEVFN(2, 0) == dev->devfn ||
+ PCI_DEVFN(2, 2) == dev->devfn)
+ return;
+
+ for (i = 0; i < PCI_ROM_RESOURCE; i++) {
+ pci_read_config_dword(dev, offset + 8 + (i * 4), &size);
+ dev->resource[i].end = dev->resource[i].start + size - 1;
+ dev->resource[i].flags |= IORESOURCE_PCI_FIXED;
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixed_bar_fixup);
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
new file mode 100644
index 000000000..d3a73f933
--- /dev/null
+++ b/arch/x86/pci/irq.c
@@ -0,0 +1,1288 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Low-Level PCI Support for PC -- Routing of Interrupts
+ *
+ * (c) 1999--2000 Martin Mares <mj@ucw.cz>
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/dmi.h>
+#include <linux/io.h>
+#include <linux/smp.h>
+#include <asm/io_apic.h>
+#include <linux/irq.h>
+#include <linux/acpi.h>
+#include <asm/pci_x86.h>
+
+#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
+#define PIRQ_VERSION 0x0100
+
+static int broken_hp_bios_irq9;
+static int acer_tm360_irqrouting;
+
+static struct irq_routing_table *pirq_table;
+
+static int pirq_enable_irq(struct pci_dev *dev);
+static void pirq_disable_irq(struct pci_dev *dev);
+
+/*
+ * Never use: 0, 1, 2 (timer, keyboard, and cascade)
+ * Avoid using: 13, 14 and 15 (FP error and IDE).
+ * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse)
+ */
+unsigned int pcibios_irq_mask = 0xfff8;
+
+static int pirq_penalty[16] = {
+ 1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000,
+ 0, 0, 0, 0, 1000, 100000, 100000, 100000
+};
+
+struct irq_router {
+ char *name;
+ u16 vendor, device;
+ int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
+ int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq,
+ int new);
+};
+
+struct irq_router_handler {
+ u16 vendor;
+ int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device);
+};
+
+int (*pcibios_enable_irq)(struct pci_dev *dev) = pirq_enable_irq;
+void (*pcibios_disable_irq)(struct pci_dev *dev) = pirq_disable_irq;
+
+/*
+ * Check passed address for the PCI IRQ Routing Table signature
+ * and perform checksum verification.
+ */
+
+static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
+{
+ struct irq_routing_table *rt;
+ int i;
+ u8 sum;
+
+ rt = (struct irq_routing_table *) addr;
+ if (rt->signature != PIRQ_SIGNATURE ||
+ rt->version != PIRQ_VERSION ||
+ rt->size % 16 ||
+ rt->size < sizeof(struct irq_routing_table))
+ return NULL;
+ sum = 0;
+ for (i = 0; i < rt->size; i++)
+ sum += addr[i];
+ if (!sum) {
+ DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n",
+ rt);
+ return rt;
+ }
+ return NULL;
+}
+
+
+
+/*
+ * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table.
+ */
+
+static struct irq_routing_table * __init pirq_find_routing_table(void)
+{
+ u8 *addr;
+ struct irq_routing_table *rt;
+
+ if (pirq_table_addr) {
+ rt = pirq_check_routing_table((u8 *) __va(pirq_table_addr));
+ if (rt)
+ return rt;
+ printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
+ }
+ for (addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) {
+ rt = pirq_check_routing_table(addr);
+ if (rt)
+ return rt;
+ }
+ return NULL;
+}
+
+/*
+ * If we have a IRQ routing table, use it to search for peer host
+ * bridges. It's a gross hack, but since there are no other known
+ * ways how to get a list of buses, we have to go this way.
+ */
+
+static void __init pirq_peer_trick(void)
+{
+ struct irq_routing_table *rt = pirq_table;
+ u8 busmap[256];
+ int i;
+ struct irq_info *e;
+
+ memset(busmap, 0, sizeof(busmap));
+ for (i = 0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) {
+ e = &rt->slots[i];
+#ifdef DEBUG
+ {
+ int j;
+ DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
+ for (j = 0; j < 4; j++)
+ DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
+ DBG("\n");
+ }
+#endif
+ busmap[e->bus] = 1;
+ }
+ for (i = 1; i < 256; i++) {
+ if (!busmap[i] || pci_find_bus(0, i))
+ continue;
+ pcibios_scan_root(i);
+ }
+ pcibios_last_bus = -1;
+}
+
+/*
+ * Code for querying and setting of IRQ routes on various interrupt routers.
+ * PIC Edge/Level Control Registers (ELCR) 0x4d0 & 0x4d1.
+ */
+
+void elcr_set_level_irq(unsigned int irq)
+{
+ unsigned char mask = 1 << (irq & 7);
+ unsigned int port = 0x4d0 + (irq >> 3);
+ unsigned char val;
+ static u16 elcr_irq_mask;
+
+ if (irq >= 16 || (1 << irq) & elcr_irq_mask)
+ return;
+
+ elcr_irq_mask |= (1 << irq);
+ printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq);
+ val = inb(port);
+ if (!(val & mask)) {
+ DBG(KERN_DEBUG " -> edge");
+ outb(val | mask, port);
+ }
+}
+
+/*
+ * Common IRQ routing practice: nibbles in config space,
+ * offset by some magic constant.
+ */
+static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr)
+{
+ u8 x;
+ unsigned reg = offset + (nr >> 1);
+
+ pci_read_config_byte(router, reg, &x);
+ return (nr & 1) ? (x >> 4) : (x & 0xf);
+}
+
+static void write_config_nybble(struct pci_dev *router, unsigned offset,
+ unsigned nr, unsigned int val)
+{
+ u8 x;
+ unsigned reg = offset + (nr >> 1);
+
+ pci_read_config_byte(router, reg, &x);
+ x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val);
+ pci_write_config_byte(router, reg, x);
+}
+
+/*
+ * ALI pirq entries are damn ugly, and completely undocumented.
+ * This has been figured out from pirq tables, and it's not a pretty
+ * picture.
+ */
+static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
+
+ WARN_ON_ONCE(pirq > 16);
+ return irqmap[read_config_nybble(router, 0x48, pirq-1)];
+}
+
+static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
+ unsigned int val = irqmap[irq];
+
+ WARN_ON_ONCE(pirq > 16);
+ if (val) {
+ write_config_nybble(router, 0x48, pirq-1, val);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * The Intel PIIX4 pirq rules are fairly simple: "pirq" is
+ * just a pointer to the config space.
+ */
+static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ u8 x;
+
+ pci_read_config_byte(router, pirq, &x);
+ return (x < 16) ? x : 0;
+}
+
+static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ pci_write_config_byte(router, pirq, irq);
+ return 1;
+}
+
+/*
+ * The VIA pirq rules are nibble-based, like ALI,
+ * but without the ugly irq number munging.
+ * However, PIRQD is in the upper instead of lower 4 bits.
+ */
+static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq);
+}
+
+static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ write_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq, irq);
+ return 1;
+}
+
+/*
+ * The VIA pirq rules are nibble-based, like ALI,
+ * but without the ugly irq number munging.
+ * However, for 82C586, nibble map is different .
+ */
+static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
+
+ WARN_ON_ONCE(pirq > 5);
+ return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
+}
+
+static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
+
+ WARN_ON_ONCE(pirq > 5);
+ write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
+ return 1;
+}
+
+/*
+ * ITE 8330G pirq rules are nibble-based
+ * FIXME: pirqmap may be { 1, 0, 3, 2 },
+ * 2+3 are both mapped to irq 9 on my system
+ */
+static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
+
+ WARN_ON_ONCE(pirq > 4);
+ return read_config_nybble(router, 0x43, pirqmap[pirq-1]);
+}
+
+static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
+
+ WARN_ON_ONCE(pirq > 4);
+ write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
+ return 1;
+}
+
+/*
+ * OPTI: high four bits are nibble pointer..
+ * I wonder what the low bits do?
+ */
+static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ return read_config_nybble(router, 0xb8, pirq >> 4);
+}
+
+static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ write_config_nybble(router, 0xb8, pirq >> 4, irq);
+ return 1;
+}
+
+/*
+ * Cyrix: nibble offset 0x5C
+ * 0x5C bits 7:4 is INTB bits 3:0 is INTA
+ * 0x5D bits 7:4 is INTD bits 3:0 is INTC
+ */
+static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ return read_config_nybble(router, 0x5C, (pirq-1)^1);
+}
+
+static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ write_config_nybble(router, 0x5C, (pirq-1)^1, irq);
+ return 1;
+}
+
+/*
+ * PIRQ routing for SiS 85C503 router used in several SiS chipsets.
+ * We have to deal with the following issues here:
+ * - vendors have different ideas about the meaning of link values
+ * - some onboard devices (integrated in the chipset) have special
+ * links and are thus routed differently (i.e. not via PCI INTA-INTD)
+ * - different revision of the router have a different layout for
+ * the routing registers, particularly for the onchip devices
+ *
+ * For all routing registers the common thing is we have one byte
+ * per routeable link which is defined as:
+ * bit 7 IRQ mapping enabled (0) or disabled (1)
+ * bits [6:4] reserved (sometimes used for onchip devices)
+ * bits [3:0] IRQ to map to
+ * allowed: 3-7, 9-12, 14-15
+ * reserved: 0, 1, 2, 8, 13
+ *
+ * The config-space registers located at 0x41/0x42/0x43/0x44 are
+ * always used to route the normal PCI INT A/B/C/D respectively.
+ * Apparently there are systems implementing PCI routing table using
+ * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
+ * We try our best to handle both link mappings.
+ *
+ * Currently (2003-05-21) it appears most SiS chipsets follow the
+ * definition of routing registers from the SiS-5595 southbridge.
+ * According to the SiS 5595 datasheets the revision id's of the
+ * router (ISA-bridge) should be 0x01 or 0xb0.
+ *
+ * Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1.
+ * Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets.
+ * They seem to work with the current routing code. However there is
+ * some concern because of the two USB-OHCI HCs (original SiS 5595
+ * had only one). YMMV.
+ *
+ * Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1:
+ *
+ * 0x61: IDEIRQ:
+ * bits [6:5] must be written 01
+ * bit 4 channel-select primary (0), secondary (1)
+ *
+ * 0x62: USBIRQ:
+ * bit 6 OHCI function disabled (0), enabled (1)
+ *
+ * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved
+ *
+ * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved
+ *
+ * We support USBIRQ (in addition to INTA-INTD) and keep the
+ * IDE, ACPI and DAQ routing untouched as set by the BIOS.
+ *
+ * Currently the only reported exception is the new SiS 65x chipset
+ * which includes the SiS 69x southbridge. Here we have the 85C503
+ * router revision 0x04 and there are changes in the register layout
+ * mostly related to the different USB HCs with USB 2.0 support.
+ *
+ * Onchip routing for router rev-id 0x04 (try-and-error observation)
+ *
+ * 0x60/0x61/0x62/0x63: 1xEHCI and 3xOHCI (companion) USB-HCs
+ * bit 6-4 are probably unused, not like 5595
+ */
+
+#define PIRQ_SIS_IRQ_MASK 0x0f
+#define PIRQ_SIS_IRQ_DISABLE 0x80
+#define PIRQ_SIS_USB_ENABLE 0x40
+
+static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ u8 x;
+ int reg;
+
+ reg = pirq;
+ if (reg >= 0x01 && reg <= 0x04)
+ reg += 0x40;
+ pci_read_config_byte(router, reg, &x);
+ return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK);
+}
+
+static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ u8 x;
+ int reg;
+
+ reg = pirq;
+ if (reg >= 0x01 && reg <= 0x04)
+ reg += 0x40;
+ pci_read_config_byte(router, reg, &x);
+ x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE);
+ x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE;
+ pci_write_config_byte(router, reg, x);
+ return 1;
+}
+
+
+/*
+ * VLSI: nibble offset 0x74 - educated guess due to routing table and
+ * config space of VLSI 82C534 PCI-bridge/router (1004:0102)
+ * Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard
+ * devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6
+ * for the busbridge to the docking station.
+ */
+
+static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ WARN_ON_ONCE(pirq >= 9);
+ if (pirq > 8) {
+ dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
+ return 0;
+ }
+ return read_config_nybble(router, 0x74, pirq-1);
+}
+
+static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ WARN_ON_ONCE(pirq >= 9);
+ if (pirq > 8) {
+ dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
+ return 0;
+ }
+ write_config_nybble(router, 0x74, pirq-1, irq);
+ return 1;
+}
+
+/*
+ * ServerWorks: PCI interrupts mapped to system IRQ lines through Index
+ * and Redirect I/O registers (0x0c00 and 0x0c01). The Index register
+ * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a. The Redirect
+ * register is a straight binary coding of desired PIC IRQ (low nibble).
+ *
+ * The 'link' value in the PIRQ table is already in the correct format
+ * for the Index register. There are some special index values:
+ * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1,
+ * and 0x03 for SMBus.
+ */
+static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ outb(pirq, 0xc00);
+ return inb(0xc01) & 0xf;
+}
+
+static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev,
+ int pirq, int irq)
+{
+ outb(pirq, 0xc00);
+ outb(irq, 0xc01);
+ return 1;
+}
+
+/* Support for AMD756 PCI IRQ Routing
+ * Jhon H. Caicedo <jhcaiced@osso.org.co>
+ * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... (jhcaiced)
+ * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced)
+ * The AMD756 pirq rules are nibble-based
+ * offset 0x56 0-3 PIRQA 4-7 PIRQB
+ * offset 0x57 0-3 PIRQC 4-7 PIRQD
+ */
+static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ u8 irq;
+ irq = 0;
+ if (pirq <= 4)
+ irq = read_config_nybble(router, 0x56, pirq - 1);
+ dev_info(&dev->dev,
+ "AMD756: dev [%04x:%04x], router PIRQ %d get IRQ %d\n",
+ dev->vendor, dev->device, pirq, irq);
+ return irq;
+}
+
+static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ dev_info(&dev->dev,
+ "AMD756: dev [%04x:%04x], router PIRQ %d set IRQ %d\n",
+ dev->vendor, dev->device, pirq, irq);
+ if (pirq <= 4)
+ write_config_nybble(router, 0x56, pirq - 1, irq);
+ return 1;
+}
+
+/*
+ * PicoPower PT86C523
+ */
+static int pirq_pico_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+ outb(0x10 + ((pirq - 1) >> 1), 0x24);
+ return ((pirq - 1) & 1) ? (inb(0x26) >> 4) : (inb(0x26) & 0xf);
+}
+
+static int pirq_pico_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
+ int irq)
+{
+ unsigned int x;
+ outb(0x10 + ((pirq - 1) >> 1), 0x24);
+ x = inb(0x26);
+ x = ((pirq - 1) & 1) ? ((x & 0x0f) | (irq << 4)) : ((x & 0xf0) | (irq));
+ outb(x, 0x26);
+ return 1;
+}
+
+#ifdef CONFIG_PCI_BIOS
+
+static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+ struct pci_dev *bridge;
+ int pin = pci_get_interrupt_pin(dev, &bridge);
+ return pcibios_set_irq_routing(bridge, pin - 1, irq);
+}
+
+#endif
+
+static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+ static struct pci_device_id __initdata pirq_440gx[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) },
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) },
+ { },
+ };
+
+ /* 440GX has a proprietary PIRQ router -- don't use it */
+ if (pci_dev_present(pirq_440gx))
+ return 0;
+
+ switch (device) {
+ case PCI_DEVICE_ID_INTEL_82371FB_0:
+ case PCI_DEVICE_ID_INTEL_82371SB_0:
+ case PCI_DEVICE_ID_INTEL_82371AB_0:
+ case PCI_DEVICE_ID_INTEL_82371MX:
+ case PCI_DEVICE_ID_INTEL_82443MX_0:
+ case PCI_DEVICE_ID_INTEL_82801AA_0:
+ case PCI_DEVICE_ID_INTEL_82801AB_0:
+ case PCI_DEVICE_ID_INTEL_82801BA_0:
+ case PCI_DEVICE_ID_INTEL_82801BA_10:
+ case PCI_DEVICE_ID_INTEL_82801CA_0:
+ case PCI_DEVICE_ID_INTEL_82801CA_12:
+ case PCI_DEVICE_ID_INTEL_82801DB_0:
+ case PCI_DEVICE_ID_INTEL_82801E_0:
+ case PCI_DEVICE_ID_INTEL_82801EB_0:
+ case PCI_DEVICE_ID_INTEL_ESB_1:
+ case PCI_DEVICE_ID_INTEL_ICH6_0:
+ case PCI_DEVICE_ID_INTEL_ICH6_1:
+ case PCI_DEVICE_ID_INTEL_ICH7_0:
+ case PCI_DEVICE_ID_INTEL_ICH7_1:
+ case PCI_DEVICE_ID_INTEL_ICH7_30:
+ case PCI_DEVICE_ID_INTEL_ICH7_31:
+ case PCI_DEVICE_ID_INTEL_TGP_LPC:
+ case PCI_DEVICE_ID_INTEL_ESB2_0:
+ case PCI_DEVICE_ID_INTEL_ICH8_0:
+ case PCI_DEVICE_ID_INTEL_ICH8_1:
+ case PCI_DEVICE_ID_INTEL_ICH8_2:
+ case PCI_DEVICE_ID_INTEL_ICH8_3:
+ case PCI_DEVICE_ID_INTEL_ICH8_4:
+ case PCI_DEVICE_ID_INTEL_ICH9_0:
+ case PCI_DEVICE_ID_INTEL_ICH9_1:
+ case PCI_DEVICE_ID_INTEL_ICH9_2:
+ case PCI_DEVICE_ID_INTEL_ICH9_3:
+ case PCI_DEVICE_ID_INTEL_ICH9_4:
+ case PCI_DEVICE_ID_INTEL_ICH9_5:
+ case PCI_DEVICE_ID_INTEL_EP80579_0:
+ case PCI_DEVICE_ID_INTEL_ICH10_0:
+ case PCI_DEVICE_ID_INTEL_ICH10_1:
+ case PCI_DEVICE_ID_INTEL_ICH10_2:
+ case PCI_DEVICE_ID_INTEL_ICH10_3:
+ case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_0:
+ case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_1:
+ r->name = "PIIX/ICH";
+ r->get = pirq_piix_get;
+ r->set = pirq_piix_set;
+ return 1;
+ }
+
+ if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN &&
+ device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX)
+ || (device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN &&
+ device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)
+ || (device >= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MIN &&
+ device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX)
+ || (device >= PCI_DEVICE_ID_INTEL_PANTHERPOINT_LPC_MIN &&
+ device <= PCI_DEVICE_ID_INTEL_PANTHERPOINT_LPC_MAX)) {
+ r->name = "PIIX/ICH";
+ r->get = pirq_piix_get;
+ r->set = pirq_piix_set;
+ return 1;
+ }
+
+ return 0;
+}
+
+static __init int via_router_probe(struct irq_router *r,
+ struct pci_dev *router, u16 device)
+{
+ /* FIXME: We should move some of the quirk fixup stuff here */
+
+ /*
+ * workarounds for some buggy BIOSes
+ */
+ if (device == PCI_DEVICE_ID_VIA_82C586_0) {
+ switch (router->device) {
+ case PCI_DEVICE_ID_VIA_82C686:
+ /*
+ * Asus k7m bios wrongly reports 82C686A
+ * as 586-compatible
+ */
+ device = PCI_DEVICE_ID_VIA_82C686;
+ break;
+ case PCI_DEVICE_ID_VIA_8235:
+ /**
+ * Asus a7v-x bios wrongly reports 8235
+ * as 586-compatible
+ */
+ device = PCI_DEVICE_ID_VIA_8235;
+ break;
+ case PCI_DEVICE_ID_VIA_8237:
+ /**
+ * Asus a7v600 bios wrongly reports 8237
+ * as 586-compatible
+ */
+ device = PCI_DEVICE_ID_VIA_8237;
+ break;
+ }
+ }
+
+ switch (device) {
+ case PCI_DEVICE_ID_VIA_82C586_0:
+ r->name = "VIA";
+ r->get = pirq_via586_get;
+ r->set = pirq_via586_set;
+ return 1;
+ case PCI_DEVICE_ID_VIA_82C596:
+ case PCI_DEVICE_ID_VIA_82C686:
+ case PCI_DEVICE_ID_VIA_8231:
+ case PCI_DEVICE_ID_VIA_8233A:
+ case PCI_DEVICE_ID_VIA_8235:
+ case PCI_DEVICE_ID_VIA_8237:
+ /* FIXME: add new ones for 8233/5 */
+ r->name = "VIA";
+ r->get = pirq_via_get;
+ r->set = pirq_via_set;
+ return 1;
+ }
+ return 0;
+}
+
+static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+ switch (device) {
+ case PCI_DEVICE_ID_VLSI_82C534:
+ r->name = "VLSI 82C534";
+ r->get = pirq_vlsi_get;
+ r->set = pirq_vlsi_set;
+ return 1;
+ }
+ return 0;
+}
+
+
+static __init int serverworks_router_probe(struct irq_router *r,
+ struct pci_dev *router, u16 device)
+{
+ switch (device) {
+ case PCI_DEVICE_ID_SERVERWORKS_OSB4:
+ case PCI_DEVICE_ID_SERVERWORKS_CSB5:
+ r->name = "ServerWorks";
+ r->get = pirq_serverworks_get;
+ r->set = pirq_serverworks_set;
+ return 1;
+ }
+ return 0;
+}
+
+static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+ if (device != PCI_DEVICE_ID_SI_503)
+ return 0;
+
+ r->name = "SIS";
+ r->get = pirq_sis_get;
+ r->set = pirq_sis_set;
+ return 1;
+}
+
+static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+ switch (device) {
+ case PCI_DEVICE_ID_CYRIX_5520:
+ r->name = "NatSemi";
+ r->get = pirq_cyrix_get;
+ r->set = pirq_cyrix_set;
+ return 1;
+ }
+ return 0;
+}
+
+static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+ switch (device) {
+ case PCI_DEVICE_ID_OPTI_82C700:
+ r->name = "OPTI";
+ r->get = pirq_opti_get;
+ r->set = pirq_opti_set;
+ return 1;
+ }
+ return 0;
+}
+
+static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+ switch (device) {
+ case PCI_DEVICE_ID_ITE_IT8330G_0:
+ r->name = "ITE";
+ r->get = pirq_ite_get;
+ r->set = pirq_ite_set;
+ return 1;
+ }
+ return 0;
+}
+
+static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+ switch (device) {
+ case PCI_DEVICE_ID_AL_M1533:
+ case PCI_DEVICE_ID_AL_M1563:
+ r->name = "ALI";
+ r->get = pirq_ali_get;
+ r->set = pirq_ali_set;
+ return 1;
+ }
+ return 0;
+}
+
+static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+ switch (device) {
+ case PCI_DEVICE_ID_AMD_VIPER_740B:
+ r->name = "AMD756";
+ break;
+ case PCI_DEVICE_ID_AMD_VIPER_7413:
+ r->name = "AMD766";
+ break;
+ case PCI_DEVICE_ID_AMD_VIPER_7443:
+ r->name = "AMD768";
+ break;
+ default:
+ return 0;
+ }
+ r->get = pirq_amd756_get;
+ r->set = pirq_amd756_set;
+ return 1;
+}
+
+static __init int pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+ switch (device) {
+ case PCI_DEVICE_ID_PICOPOWER_PT86C523:
+ r->name = "PicoPower PT86C523";
+ r->get = pirq_pico_get;
+ r->set = pirq_pico_set;
+ return 1;
+
+ case PCI_DEVICE_ID_PICOPOWER_PT86C523BBP:
+ r->name = "PicoPower PT86C523 rev. BB+";
+ r->get = pirq_pico_get;
+ r->set = pirq_pico_set;
+ return 1;
+ }
+ return 0;
+}
+
+static __initdata struct irq_router_handler pirq_routers[] = {
+ { PCI_VENDOR_ID_INTEL, intel_router_probe },
+ { PCI_VENDOR_ID_AL, ali_router_probe },
+ { PCI_VENDOR_ID_ITE, ite_router_probe },
+ { PCI_VENDOR_ID_VIA, via_router_probe },
+ { PCI_VENDOR_ID_OPTI, opti_router_probe },
+ { PCI_VENDOR_ID_SI, sis_router_probe },
+ { PCI_VENDOR_ID_CYRIX, cyrix_router_probe },
+ { PCI_VENDOR_ID_VLSI, vlsi_router_probe },
+ { PCI_VENDOR_ID_SERVERWORKS, serverworks_router_probe },
+ { PCI_VENDOR_ID_AMD, amd_router_probe },
+ { PCI_VENDOR_ID_PICOPOWER, pico_router_probe },
+ /* Someone with docs needs to add the ATI Radeon IGP */
+ { 0, NULL }
+};
+static struct irq_router pirq_router;
+static struct pci_dev *pirq_router_dev;
+
+
+/*
+ * FIXME: should we have an option to say "generic for
+ * chipset" ?
+ */
+
+static void __init pirq_find_router(struct irq_router *r)
+{
+ struct irq_routing_table *rt = pirq_table;
+ struct irq_router_handler *h;
+
+#ifdef CONFIG_PCI_BIOS
+ if (!rt->signature) {
+ printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n");
+ r->set = pirq_bios_set;
+ r->name = "BIOS";
+ return;
+ }
+#endif
+
+ /* Default unless a driver reloads it */
+ r->name = "default";
+ r->get = NULL;
+ r->set = NULL;
+
+ DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for [%04x:%04x]\n",
+ rt->rtr_vendor, rt->rtr_device);
+
+ pirq_router_dev = pci_get_domain_bus_and_slot(0, rt->rtr_bus,
+ rt->rtr_devfn);
+ if (!pirq_router_dev) {
+ DBG(KERN_DEBUG "PCI: Interrupt router not found at "
+ "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
+ return;
+ }
+
+ for (h = pirq_routers; h->vendor; h++) {
+ /* First look for a router match */
+ if (rt->rtr_vendor == h->vendor &&
+ h->probe(r, pirq_router_dev, rt->rtr_device))
+ break;
+ /* Fall back to a device match */
+ if (pirq_router_dev->vendor == h->vendor &&
+ h->probe(r, pirq_router_dev, pirq_router_dev->device))
+ break;
+ }
+ dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n",
+ pirq_router.name,
+ pirq_router_dev->vendor, pirq_router_dev->device);
+
+ /* The device remains referenced for the kernel lifetime */
+}
+
+static struct irq_info *pirq_get_info(struct pci_dev *dev)
+{
+ struct irq_routing_table *rt = pirq_table;
+ int entries = (rt->size - sizeof(struct irq_routing_table)) /
+ sizeof(struct irq_info);
+ struct irq_info *info;
+
+ for (info = rt->slots; entries--; info++)
+ if (info->bus == dev->bus->number &&
+ PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
+ return info;
+ return NULL;
+}
+
+static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
+{
+ u8 pin;
+ struct irq_info *info;
+ int i, pirq, newirq;
+ int irq = 0;
+ u32 mask;
+ struct irq_router *r = &pirq_router;
+ struct pci_dev *dev2 = NULL;
+ char *msg = NULL;
+
+ /* Find IRQ pin */
+ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+ if (!pin) {
+ dev_dbg(&dev->dev, "no interrupt pin\n");
+ return 0;
+ }
+
+ if (io_apic_assign_pci_irqs)
+ return 0;
+
+ /* Find IRQ routing entry */
+
+ if (!pirq_table)
+ return 0;
+
+ info = pirq_get_info(dev);
+ if (!info) {
+ dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
+ 'A' + pin - 1);
+ return 0;
+ }
+ pirq = info->irq[pin - 1].link;
+ mask = info->irq[pin - 1].bitmap;
+ if (!pirq) {
+ dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin - 1);
+ return 0;
+ }
+ dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
+ 'A' + pin - 1, pirq, mask, pirq_table->exclusive_irqs);
+ mask &= pcibios_irq_mask;
+
+ /* Work around broken HP Pavilion Notebooks which assign USB to
+ IRQ 9 even though it is actually wired to IRQ 11 */
+
+ if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) {
+ dev->irq = 11;
+ pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11);
+ r->set(pirq_router_dev, dev, pirq, 11);
+ }
+
+ /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */
+ if (acer_tm360_irqrouting && dev->irq == 11 &&
+ dev->vendor == PCI_VENDOR_ID_O2) {
+ pirq = 0x68;
+ mask = 0x400;
+ dev->irq = r->get(pirq_router_dev, dev, pirq);
+ pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq);
+ }
+
+ /*
+ * Find the best IRQ to assign: use the one
+ * reported by the device if possible.
+ */
+ newirq = dev->irq;
+ if (newirq && !((1 << newirq) & mask)) {
+ if (pci_probe & PCI_USE_PIRQ_MASK)
+ newirq = 0;
+ else
+ dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask "
+ "%#x; try pci=usepirqmask\n", newirq, mask);
+ }
+ if (!newirq && assign) {
+ for (i = 0; i < 16; i++) {
+ if (!(mask & (1 << i)))
+ continue;
+ if (pirq_penalty[i] < pirq_penalty[newirq] &&
+ can_request_irq(i, IRQF_SHARED))
+ newirq = i;
+ }
+ }
+ dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin - 1, newirq);
+
+ /* Check if it is hardcoded */
+ if ((pirq & 0xf0) == 0xf0) {
+ irq = pirq & 0xf;
+ msg = "hardcoded";
+ } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
+ ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
+ msg = "found";
+ elcr_set_level_irq(irq);
+ } else if (newirq && r->set &&
+ (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
+ if (r->set(pirq_router_dev, dev, pirq, newirq)) {
+ elcr_set_level_irq(newirq);
+ msg = "assigned";
+ irq = newirq;
+ }
+ }
+
+ if (!irq) {
+ if (newirq && mask == (1 << newirq)) {
+ msg = "guessed";
+ irq = newirq;
+ } else {
+ dev_dbg(&dev->dev, "can't route interrupt\n");
+ return 0;
+ }
+ }
+ dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq);
+
+ /* Update IRQ for all devices with the same pirq value */
+ for_each_pci_dev(dev2) {
+ pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
+ if (!pin)
+ continue;
+
+ info = pirq_get_info(dev2);
+ if (!info)
+ continue;
+ if (info->irq[pin - 1].link == pirq) {
+ /*
+ * We refuse to override the dev->irq
+ * information. Give a warning!
+ */
+ if (dev2->irq && dev2->irq != irq && \
+ (!(pci_probe & PCI_USE_PIRQ_MASK) || \
+ ((1 << dev2->irq) & mask))) {
+#ifndef CONFIG_PCI_MSI
+ dev_info(&dev2->dev, "IRQ routing conflict: "
+ "have IRQ %d, want IRQ %d\n",
+ dev2->irq, irq);
+#endif
+ continue;
+ }
+ dev2->irq = irq;
+ pirq_penalty[irq]++;
+ if (dev != dev2)
+ dev_info(&dev->dev, "sharing IRQ %d with %s\n",
+ irq, pci_name(dev2));
+ }
+ }
+ return 1;
+}
+
+void __init pcibios_fixup_irqs(void)
+{
+ struct pci_dev *dev = NULL;
+ u8 pin;
+
+ DBG(KERN_DEBUG "PCI: IRQ fixup\n");
+ for_each_pci_dev(dev) {
+ /*
+ * If the BIOS has set an out of range IRQ number, just
+ * ignore it. Also keep track of which IRQ's are
+ * already in use.
+ */
+ if (dev->irq >= 16) {
+ dev_dbg(&dev->dev, "ignoring bogus IRQ %d\n", dev->irq);
+ dev->irq = 0;
+ }
+ /*
+ * If the IRQ is already assigned to a PCI device,
+ * ignore its ISA use penalty
+ */
+ if (pirq_penalty[dev->irq] >= 100 &&
+ pirq_penalty[dev->irq] < 100000)
+ pirq_penalty[dev->irq] = 0;
+ pirq_penalty[dev->irq]++;
+ }
+
+ if (io_apic_assign_pci_irqs)
+ return;
+
+ dev = NULL;
+ for_each_pci_dev(dev) {
+ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+ if (!pin)
+ continue;
+
+ /*
+ * Still no IRQ? Try to lookup one...
+ */
+ if (!dev->irq)
+ pcibios_lookup_irq(dev, 0);
+ }
+}
+
+/*
+ * Work around broken HP Pavilion Notebooks which assign USB to
+ * IRQ 9 even though it is actually wired to IRQ 11
+ */
+static int __init fix_broken_hp_bios_irq9(const struct dmi_system_id *d)
+{
+ if (!broken_hp_bios_irq9) {
+ broken_hp_bios_irq9 = 1;
+ printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
+ d->ident);
+ }
+ return 0;
+}
+
+/*
+ * Work around broken Acer TravelMate 360 Notebooks which assign
+ * Cardbus to IRQ 11 even though it is actually wired to IRQ 10
+ */
+static int __init fix_acer_tm360_irqrouting(const struct dmi_system_id *d)
+{
+ if (!acer_tm360_irqrouting) {
+ acer_tm360_irqrouting = 1;
+ printk(KERN_INFO "%s detected - fixing broken IRQ routing\n",
+ d->ident);
+ }
+ return 0;
+}
+
+static const struct dmi_system_id pciirq_dmi_table[] __initconst = {
+ {
+ .callback = fix_broken_hp_bios_irq9,
+ .ident = "HP Pavilion N5400 Series Laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
+ DMI_MATCH(DMI_PRODUCT_VERSION,
+ "HP Pavilion Notebook Model GE"),
+ DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
+ },
+ },
+ {
+ .callback = fix_acer_tm360_irqrouting,
+ .ident = "Acer TravelMate 36x Laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
+ },
+ },
+ { }
+};
+
+void __init pcibios_irq_init(void)
+{
+ struct irq_routing_table *rtable = NULL;
+
+ DBG(KERN_DEBUG "PCI: IRQ init\n");
+
+ if (raw_pci_ops == NULL)
+ return;
+
+ dmi_check_system(pciirq_dmi_table);
+
+ pirq_table = pirq_find_routing_table();
+
+#ifdef CONFIG_PCI_BIOS
+ if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN)) {
+ pirq_table = pcibios_get_irq_routing_table();
+ rtable = pirq_table;
+ }
+#endif
+ if (pirq_table) {
+ pirq_peer_trick();
+ pirq_find_router(&pirq_router);
+ if (pirq_table->exclusive_irqs) {
+ int i;
+ for (i = 0; i < 16; i++)
+ if (!(pirq_table->exclusive_irqs & (1 << i)))
+ pirq_penalty[i] += 100;
+ }
+ /*
+ * If we're using the I/O APIC, avoid using the PCI IRQ
+ * routing table
+ */
+ if (io_apic_assign_pci_irqs) {
+ kfree(rtable);
+ pirq_table = NULL;
+ }
+ }
+
+ x86_init.pci.fixup_irqs();
+
+ if (io_apic_assign_pci_irqs && pci_routeirq) {
+ struct pci_dev *dev = NULL;
+ /*
+ * PCI IRQ routing is set up by pci_enable_device(), but we
+ * also do it here in case there are still broken drivers that
+ * don't use pci_enable_device().
+ */
+ printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n");
+ for_each_pci_dev(dev)
+ pirq_enable_irq(dev);
+ }
+}
+
+static void pirq_penalize_isa_irq(int irq, int active)
+{
+ /*
+ * If any ISAPnP device reports an IRQ in its list of possible
+ * IRQ's, we try to avoid assigning it to PCI devices.
+ */
+ if (irq < 16) {
+ if (active)
+ pirq_penalty[irq] += 1000;
+ else
+ pirq_penalty[irq] += 100;
+ }
+}
+
+void pcibios_penalize_isa_irq(int irq, int active)
+{
+#ifdef CONFIG_ACPI
+ if (!acpi_noirq)
+ acpi_penalize_isa_irq(irq, active);
+ else
+#endif
+ pirq_penalize_isa_irq(irq, active);
+}
+
+static int pirq_enable_irq(struct pci_dev *dev)
+{
+ u8 pin = 0;
+
+ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+ if (pin && !pcibios_lookup_irq(dev, 1)) {
+ char *msg = "";
+
+ if (!io_apic_assign_pci_irqs && dev->irq)
+ return 0;
+
+ if (io_apic_assign_pci_irqs) {
+#ifdef CONFIG_X86_IO_APIC
+ struct pci_dev *temp_dev;
+ int irq;
+
+ if (dev->irq_managed && dev->irq > 0)
+ return 0;
+
+ irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
+ PCI_SLOT(dev->devfn), pin - 1);
+ /*
+ * Busses behind bridges are typically not listed in the MP-table.
+ * In this case we have to look up the IRQ based on the parent bus,
+ * parent slot, and pin number. The SMP code detects such bridged
+ * busses itself so we should get into this branch reliably.
+ */
+ temp_dev = dev;
+ while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
+ struct pci_dev *bridge = dev->bus->self;
+
+ pin = pci_swizzle_interrupt_pin(dev, pin);
+ irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
+ PCI_SLOT(bridge->devfn),
+ pin - 1);
+ if (irq >= 0)
+ dev_warn(&dev->dev, "using bridge %s "
+ "INT %c to get IRQ %d\n",
+ pci_name(bridge), 'A' + pin - 1,
+ irq);
+ dev = bridge;
+ }
+ dev = temp_dev;
+ if (irq >= 0) {
+ dev->irq_managed = 1;
+ dev->irq = irq;
+ dev_info(&dev->dev, "PCI->APIC IRQ transform: "
+ "INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
+ return 0;
+ } else
+ msg = "; probably buggy MP table";
+#endif
+ } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
+ msg = "";
+ else
+ msg = "; please try using pci=biosirq";
+
+ /*
+ * With IDE legacy devices the IRQ lookup failure is not
+ * a problem..
+ */
+ if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE &&
+ !(dev->class & 0x5))
+ return 0;
+
+ dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n",
+ 'A' + pin - 1, msg);
+ }
+ return 0;
+}
+
+bool mp_should_keep_irq(struct device *dev)
+{
+ if (dev->power.is_prepared)
+ return true;
+#ifdef CONFIG_PM
+ if (dev->power.runtime_status == RPM_SUSPENDING)
+ return true;
+#endif
+
+ return false;
+}
+
+static void pirq_disable_irq(struct pci_dev *dev)
+{
+ if (io_apic_assign_pci_irqs && !mp_should_keep_irq(&dev->dev) &&
+ dev->irq_managed && dev->irq) {
+ mp_unmap_irq(dev->irq);
+ dev->irq = 0;
+ dev->irq_managed = 0;
+ }
+}
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
new file mode 100644
index 000000000..dfbe6ac38
--- /dev/null
+++ b/arch/x86/pci/legacy.c
@@ -0,0 +1,76 @@
+/*
+ * legacy.c - traditional, old school PCI bus probing
+ */
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/pci.h>
+#include <asm/jailhouse_para.h>
+#include <asm/pci_x86.h>
+
+/*
+ * Discover remaining PCI buses in case there are peer host bridges.
+ * We use the number of last PCI bus provided by the PCI BIOS.
+ */
+static void pcibios_fixup_peer_bridges(void)
+{
+ int n;
+
+ if (pcibios_last_bus <= 0 || pcibios_last_bus > 0xff)
+ return;
+ DBG("PCI: Peer bridge fixup\n");
+
+ for (n=0; n <= pcibios_last_bus; n++)
+ pcibios_scan_specific_bus(n);
+}
+
+int __init pci_legacy_init(void)
+{
+ if (!raw_pci_ops)
+ return 1;
+
+ pr_info("PCI: Probing PCI hardware\n");
+ pcibios_scan_root(0);
+ return 0;
+}
+
+void pcibios_scan_specific_bus(int busn)
+{
+ int stride = jailhouse_paravirt() ? 1 : 8;
+ int devfn;
+ u32 l;
+
+ if (pci_find_bus(0, busn))
+ return;
+
+ for (devfn = 0; devfn < 256; devfn += stride) {
+ if (!raw_pci_read(0, busn, devfn, PCI_VENDOR_ID, 2, &l) &&
+ l != 0x0000 && l != 0xffff) {
+ DBG("Found device at %02x:%02x [%04x]\n", busn, devfn, l);
+ pr_info("PCI: Discovered peer bus %02x\n", busn);
+ pcibios_scan_root(busn);
+ return;
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(pcibios_scan_specific_bus);
+
+static int __init pci_subsys_init(void)
+{
+ /*
+ * The init function returns an non zero value when
+ * pci_legacy_init should be invoked.
+ */
+ if (x86_init.pci.init()) {
+ if (pci_legacy_init()) {
+ pr_info("PCI: System does not support PCI\n");
+ return -ENODEV;
+ }
+ }
+
+ pcibios_fixup_peer_bridges();
+ x86_init.pci.init_irq();
+ pcibios_init();
+
+ return 0;
+}
+subsys_initcall(pci_subsys_init);
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
new file mode 100644
index 000000000..7389db538
--- /dev/null
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -0,0 +1,814 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * mmconfig-shared.c - Low-level direct PCI config space access via
+ * MMCONFIG - common code between i386 and x86-64.
+ *
+ * This code does:
+ * - known chipset handling
+ * - ACPI decoding and validation
+ *
+ * Per-architecture code takes care of the mappings and accesses
+ * themselves.
+ */
+
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/sfi_acpi.h>
+#include <linux/bitmap.h>
+#include <linux/dmi.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <asm/e820/api.h>
+#include <asm/pci_x86.h>
+#include <asm/acpi.h>
+
+#define PREFIX "PCI: "
+
+/* Indicate if the mmcfg resources have been placed into the resource table. */
+static bool pci_mmcfg_running_state;
+static bool pci_mmcfg_arch_init_failed;
+static DEFINE_MUTEX(pci_mmcfg_lock);
+
+LIST_HEAD(pci_mmcfg_list);
+
+static void __init pci_mmconfig_remove(struct pci_mmcfg_region *cfg)
+{
+ if (cfg->res.parent)
+ release_resource(&cfg->res);
+ list_del(&cfg->list);
+ kfree(cfg);
+}
+
+static void __init free_all_mmcfg(void)
+{
+ struct pci_mmcfg_region *cfg, *tmp;
+
+ pci_mmcfg_arch_free();
+ list_for_each_entry_safe(cfg, tmp, &pci_mmcfg_list, list)
+ pci_mmconfig_remove(cfg);
+}
+
+static void list_add_sorted(struct pci_mmcfg_region *new)
+{
+ struct pci_mmcfg_region *cfg;
+
+ /* keep list sorted by segment and starting bus number */
+ list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) {
+ if (cfg->segment > new->segment ||
+ (cfg->segment == new->segment &&
+ cfg->start_bus >= new->start_bus)) {
+ list_add_tail_rcu(&new->list, &cfg->list);
+ return;
+ }
+ }
+ list_add_tail_rcu(&new->list, &pci_mmcfg_list);
+}
+
+static struct pci_mmcfg_region *pci_mmconfig_alloc(int segment, int start,
+ int end, u64 addr)
+{
+ struct pci_mmcfg_region *new;
+ struct resource *res;
+
+ if (addr == 0)
+ return NULL;
+
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ new->address = addr;
+ new->segment = segment;
+ new->start_bus = start;
+ new->end_bus = end;
+
+ res = &new->res;
+ res->start = addr + PCI_MMCFG_BUS_OFFSET(start);
+ res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1;
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN,
+ "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);
+ res->name = new->name;
+
+ return new;
+}
+
+struct pci_mmcfg_region *__init pci_mmconfig_add(int segment, int start,
+ int end, u64 addr)
+{
+ struct pci_mmcfg_region *new;
+
+ new = pci_mmconfig_alloc(segment, start, end, addr);
+ if (new) {
+ mutex_lock(&pci_mmcfg_lock);
+ list_add_sorted(new);
+ mutex_unlock(&pci_mmcfg_lock);
+
+ pr_info(PREFIX
+ "MMCONFIG for domain %04x [bus %02x-%02x] at %pR "
+ "(base %#lx)\n",
+ segment, start, end, &new->res, (unsigned long)addr);
+ }
+
+ return new;
+}
+
+struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus)
+{
+ struct pci_mmcfg_region *cfg;
+
+ list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
+ if (cfg->segment == segment &&
+ cfg->start_bus <= bus && bus <= cfg->end_bus)
+ return cfg;
+
+ return NULL;
+}
+
+static const char *__init pci_mmcfg_e7520(void)
+{
+ u32 win;
+ raw_pci_ops->read(0, 0, PCI_DEVFN(0, 0), 0xce, 2, &win);
+
+ win = win & 0xf000;
+ if (win == 0x0000 || win == 0xf000)
+ return NULL;
+
+ if (pci_mmconfig_add(0, 0, 255, win << 16) == NULL)
+ return NULL;
+
+ return "Intel Corporation E7520 Memory Controller Hub";
+}
+
+static const char *__init pci_mmcfg_intel_945(void)
+{
+ u32 pciexbar, mask = 0, len = 0;
+
+ raw_pci_ops->read(0, 0, PCI_DEVFN(0, 0), 0x48, 4, &pciexbar);
+
+ /* Enable bit */
+ if (!(pciexbar & 1))
+ return NULL;
+
+ /* Size bits */
+ switch ((pciexbar >> 1) & 3) {
+ case 0:
+ mask = 0xf0000000U;
+ len = 0x10000000U;
+ break;
+ case 1:
+ mask = 0xf8000000U;
+ len = 0x08000000U;
+ break;
+ case 2:
+ mask = 0xfc000000U;
+ len = 0x04000000U;
+ break;
+ default:
+ return NULL;
+ }
+
+ /* Errata #2, things break when not aligned on a 256Mb boundary */
+ /* Can only happen in 64M/128M mode */
+
+ if ((pciexbar & mask) & 0x0fffffffU)
+ return NULL;
+
+ /* Don't hit the APIC registers and their friends */
+ if ((pciexbar & mask) >= 0xf0000000U)
+ return NULL;
+
+ if (pci_mmconfig_add(0, 0, (len >> 20) - 1, pciexbar & mask) == NULL)
+ return NULL;
+
+ return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub";
+}
+
+static const char *__init pci_mmcfg_amd_fam10h(void)
+{
+ u32 low, high, address;
+ u64 base, msr;
+ int i;
+ unsigned segnbits = 0, busnbits, end_bus;
+
+ if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF))
+ return NULL;
+
+ address = MSR_FAM10H_MMIO_CONF_BASE;
+ if (rdmsr_safe(address, &low, &high))
+ return NULL;
+
+ msr = high;
+ msr <<= 32;
+ msr |= low;
+
+ /* mmconfig is not enable */
+ if (!(msr & FAM10H_MMIO_CONF_ENABLE))
+ return NULL;
+
+ base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
+
+ busnbits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
+ FAM10H_MMIO_CONF_BUSRANGE_MASK;
+
+ /*
+ * only handle bus 0 ?
+ * need to skip it
+ */
+ if (!busnbits)
+ return NULL;
+
+ if (busnbits > 8) {
+ segnbits = busnbits - 8;
+ busnbits = 8;
+ }
+
+ end_bus = (1 << busnbits) - 1;
+ for (i = 0; i < (1 << segnbits); i++)
+ if (pci_mmconfig_add(i, 0, end_bus,
+ base + (1<<28) * i) == NULL) {
+ free_all_mmcfg();
+ return NULL;
+ }
+
+ return "AMD Family 10h NB";
+}
+
+static bool __initdata mcp55_checked;
+static const char *__init pci_mmcfg_nvidia_mcp55(void)
+{
+ int bus;
+ int mcp55_mmconf_found = 0;
+
+ static const u32 extcfg_regnum __initconst = 0x90;
+ static const u32 extcfg_regsize __initconst = 4;
+ static const u32 extcfg_enable_mask __initconst = 1 << 31;
+ static const u32 extcfg_start_mask __initconst = 0xff << 16;
+ static const int extcfg_start_shift __initconst = 16;
+ static const u32 extcfg_size_mask __initconst = 0x3 << 28;
+ static const int extcfg_size_shift __initconst = 28;
+ static const int extcfg_sizebus[] __initconst = {
+ 0x100, 0x80, 0x40, 0x20
+ };
+ static const u32 extcfg_base_mask[] __initconst = {
+ 0x7ff8, 0x7ffc, 0x7ffe, 0x7fff
+ };
+ static const int extcfg_base_lshift __initconst = 25;
+
+ /*
+ * do check if amd fam10h already took over
+ */
+ if (!acpi_disabled || !list_empty(&pci_mmcfg_list) || mcp55_checked)
+ return NULL;
+
+ mcp55_checked = true;
+ for (bus = 0; bus < 256; bus++) {
+ u64 base;
+ u32 l, extcfg;
+ u16 vendor, device;
+ int start, size_index, end;
+
+ raw_pci_ops->read(0, bus, PCI_DEVFN(0, 0), 0, 4, &l);
+ vendor = l & 0xffff;
+ device = (l >> 16) & 0xffff;
+
+ if (PCI_VENDOR_ID_NVIDIA != vendor || 0x0369 != device)
+ continue;
+
+ raw_pci_ops->read(0, bus, PCI_DEVFN(0, 0), extcfg_regnum,
+ extcfg_regsize, &extcfg);
+
+ if (!(extcfg & extcfg_enable_mask))
+ continue;
+
+ size_index = (extcfg & extcfg_size_mask) >> extcfg_size_shift;
+ base = extcfg & extcfg_base_mask[size_index];
+ /* base could > 4G */
+ base <<= extcfg_base_lshift;
+ start = (extcfg & extcfg_start_mask) >> extcfg_start_shift;
+ end = start + extcfg_sizebus[size_index] - 1;
+ if (pci_mmconfig_add(0, start, end, base) == NULL)
+ continue;
+ mcp55_mmconf_found++;
+ }
+
+ if (!mcp55_mmconf_found)
+ return NULL;
+
+ return "nVidia MCP55";
+}
+
+struct pci_mmcfg_hostbridge_probe {
+ u32 bus;
+ u32 devfn;
+ u32 vendor;
+ u32 device;
+ const char *(*probe)(void);
+};
+
+static const struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initconst = {
+ { 0, PCI_DEVFN(0, 0), PCI_VENDOR_ID_INTEL,
+ PCI_DEVICE_ID_INTEL_E7520_MCH, pci_mmcfg_e7520 },
+ { 0, PCI_DEVFN(0, 0), PCI_VENDOR_ID_INTEL,
+ PCI_DEVICE_ID_INTEL_82945G_HB, pci_mmcfg_intel_945 },
+ { 0, PCI_DEVFN(0x18, 0), PCI_VENDOR_ID_AMD,
+ 0x1200, pci_mmcfg_amd_fam10h },
+ { 0xff, PCI_DEVFN(0, 0), PCI_VENDOR_ID_AMD,
+ 0x1200, pci_mmcfg_amd_fam10h },
+ { 0, PCI_DEVFN(0, 0), PCI_VENDOR_ID_NVIDIA,
+ 0x0369, pci_mmcfg_nvidia_mcp55 },
+};
+
+static void __init pci_mmcfg_check_end_bus_number(void)
+{
+ struct pci_mmcfg_region *cfg, *cfgx;
+
+ /* Fixup overlaps */
+ list_for_each_entry(cfg, &pci_mmcfg_list, list) {
+ if (cfg->end_bus < cfg->start_bus)
+ cfg->end_bus = 255;
+
+ /* Don't access the list head ! */
+ if (cfg->list.next == &pci_mmcfg_list)
+ break;
+
+ cfgx = list_entry(cfg->list.next, typeof(*cfg), list);
+ if (cfg->end_bus >= cfgx->start_bus)
+ cfg->end_bus = cfgx->start_bus - 1;
+ }
+}
+
+static int __init pci_mmcfg_check_hostbridge(void)
+{
+ u32 l;
+ u32 bus, devfn;
+ u16 vendor, device;
+ int i;
+ const char *name;
+
+ if (!raw_pci_ops)
+ return 0;
+
+ free_all_mmcfg();
+
+ for (i = 0; i < ARRAY_SIZE(pci_mmcfg_probes); i++) {
+ bus = pci_mmcfg_probes[i].bus;
+ devfn = pci_mmcfg_probes[i].devfn;
+ raw_pci_ops->read(0, bus, devfn, 0, 4, &l);
+ vendor = l & 0xffff;
+ device = (l >> 16) & 0xffff;
+
+ name = NULL;
+ if (pci_mmcfg_probes[i].vendor == vendor &&
+ pci_mmcfg_probes[i].device == device)
+ name = pci_mmcfg_probes[i].probe();
+
+ if (name)
+ pr_info(PREFIX "%s with MMCONFIG support\n", name);
+ }
+
+ /* some end_bus_number is crazy, fix it */
+ pci_mmcfg_check_end_bus_number();
+
+ return !list_empty(&pci_mmcfg_list);
+}
+
+static acpi_status check_mcfg_resource(struct acpi_resource *res, void *data)
+{
+ struct resource *mcfg_res = data;
+ struct acpi_resource_address64 address;
+ acpi_status status;
+
+ if (res->type == ACPI_RESOURCE_TYPE_FIXED_MEMORY32) {
+ struct acpi_resource_fixed_memory32 *fixmem32 =
+ &res->data.fixed_memory32;
+ if (!fixmem32)
+ return AE_OK;
+ if ((mcfg_res->start >= fixmem32->address) &&
+ (mcfg_res->end < (fixmem32->address +
+ fixmem32->address_length))) {
+ mcfg_res->flags = 1;
+ return AE_CTRL_TERMINATE;
+ }
+ }
+ if ((res->type != ACPI_RESOURCE_TYPE_ADDRESS32) &&
+ (res->type != ACPI_RESOURCE_TYPE_ADDRESS64))
+ return AE_OK;
+
+ status = acpi_resource_to_address64(res, &address);
+ if (ACPI_FAILURE(status) ||
+ (address.address.address_length <= 0) ||
+ (address.resource_type != ACPI_MEMORY_RANGE))
+ return AE_OK;
+
+ if ((mcfg_res->start >= address.address.minimum) &&
+ (mcfg_res->end < (address.address.minimum + address.address.address_length))) {
+ mcfg_res->flags = 1;
+ return AE_CTRL_TERMINATE;
+ }
+ return AE_OK;
+}
+
+static acpi_status find_mboard_resource(acpi_handle handle, u32 lvl,
+ void *context, void **rv)
+{
+ struct resource *mcfg_res = context;
+
+ acpi_walk_resources(handle, METHOD_NAME__CRS,
+ check_mcfg_resource, context);
+
+ if (mcfg_res->flags)
+ return AE_CTRL_TERMINATE;
+
+ return AE_OK;
+}
+
+static bool is_acpi_reserved(u64 start, u64 end, unsigned not_used)
+{
+ struct resource mcfg_res;
+
+ mcfg_res.start = start;
+ mcfg_res.end = end - 1;
+ mcfg_res.flags = 0;
+
+ acpi_get_devices("PNP0C01", find_mboard_resource, &mcfg_res, NULL);
+
+ if (!mcfg_res.flags)
+ acpi_get_devices("PNP0C02", find_mboard_resource, &mcfg_res,
+ NULL);
+
+ return mcfg_res.flags;
+}
+
+typedef bool (*check_reserved_t)(u64 start, u64 end, unsigned type);
+
+static bool __ref is_mmconf_reserved(check_reserved_t is_reserved,
+ struct pci_mmcfg_region *cfg,
+ struct device *dev, int with_e820)
+{
+ u64 addr = cfg->res.start;
+ u64 size = resource_size(&cfg->res);
+ u64 old_size = size;
+ int num_buses;
+ char *method = with_e820 ? "E820" : "ACPI motherboard resources";
+
+ while (!is_reserved(addr, addr + size, E820_TYPE_RESERVED)) {
+ size >>= 1;
+ if (size < (16UL<<20))
+ break;
+ }
+
+ if (size < (16UL<<20) && size != old_size)
+ return 0;
+
+ if (dev)
+ dev_info(dev, "MMCONFIG at %pR reserved in %s\n",
+ &cfg->res, method);
+ else
+ pr_info(PREFIX "MMCONFIG at %pR reserved in %s\n",
+ &cfg->res, method);
+
+ if (old_size != size) {
+ /* update end_bus */
+ cfg->end_bus = cfg->start_bus + ((size>>20) - 1);
+ num_buses = cfg->end_bus - cfg->start_bus + 1;
+ cfg->res.end = cfg->res.start +
+ PCI_MMCFG_BUS_OFFSET(num_buses) - 1;
+ snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN,
+ "PCI MMCONFIG %04x [bus %02x-%02x]",
+ cfg->segment, cfg->start_bus, cfg->end_bus);
+
+ if (dev)
+ dev_info(dev,
+ "MMCONFIG "
+ "at %pR (base %#lx) (size reduced!)\n",
+ &cfg->res, (unsigned long) cfg->address);
+ else
+ pr_info(PREFIX
+ "MMCONFIG for %04x [bus%02x-%02x] "
+ "at %pR (base %#lx) (size reduced!)\n",
+ cfg->segment, cfg->start_bus, cfg->end_bus,
+ &cfg->res, (unsigned long) cfg->address);
+ }
+
+ return 1;
+}
+
+static bool __ref
+pci_mmcfg_check_reserved(struct device *dev, struct pci_mmcfg_region *cfg, int early)
+{
+ if (!early && !acpi_disabled) {
+ if (is_mmconf_reserved(is_acpi_reserved, cfg, dev, 0))
+ return 1;
+
+ if (dev)
+ dev_info(dev, FW_INFO
+ "MMCONFIG at %pR not reserved in "
+ "ACPI motherboard resources\n",
+ &cfg->res);
+ else
+ pr_info(FW_INFO PREFIX
+ "MMCONFIG at %pR not reserved in "
+ "ACPI motherboard resources\n",
+ &cfg->res);
+ }
+
+ /*
+ * e820__mapped_all() is marked as __init.
+ * All entries from ACPI MCFG table have been checked at boot time.
+ * For MCFG information constructed from hotpluggable host bridge's
+ * _CBA method, just assume it's reserved.
+ */
+ if (pci_mmcfg_running_state)
+ return 1;
+
+ /* Don't try to do this check unless configuration
+ type 1 is available. how about type 2 ?*/
+ if (raw_pci_ops)
+ return is_mmconf_reserved(e820__mapped_all, cfg, dev, 1);
+
+ return 0;
+}
+
+static void __init pci_mmcfg_reject_broken(int early)
+{
+ struct pci_mmcfg_region *cfg;
+
+ list_for_each_entry(cfg, &pci_mmcfg_list, list) {
+ if (pci_mmcfg_check_reserved(NULL, cfg, early) == 0) {
+ pr_info(PREFIX "not using MMCONFIG\n");
+ free_all_mmcfg();
+ return;
+ }
+ }
+}
+
+static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
+ struct acpi_mcfg_allocation *cfg)
+{
+ if (cfg->address < 0xFFFFFFFF)
+ return 0;
+
+ if (!strncmp(mcfg->header.oem_id, "SGI", 3))
+ return 0;
+
+ if ((mcfg->header.revision >= 1) && (dmi_get_bios_year() >= 2010))
+ return 0;
+
+ pr_err(PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx "
+ "is above 4GB, ignored\n", cfg->pci_segment,
+ cfg->start_bus_number, cfg->end_bus_number, cfg->address);
+ return -EINVAL;
+}
+
+static int __init pci_parse_mcfg(struct acpi_table_header *header)
+{
+ struct acpi_table_mcfg *mcfg;
+ struct acpi_mcfg_allocation *cfg_table, *cfg;
+ unsigned long i;
+ int entries;
+
+ if (!header)
+ return -EINVAL;
+
+ mcfg = (struct acpi_table_mcfg *)header;
+
+ /* how many config structures do we have */
+ free_all_mmcfg();
+ entries = 0;
+ i = header->length - sizeof(struct acpi_table_mcfg);
+ while (i >= sizeof(struct acpi_mcfg_allocation)) {
+ entries++;
+ i -= sizeof(struct acpi_mcfg_allocation);
+ }
+ if (entries == 0) {
+ pr_err(PREFIX "MMCONFIG has no entries\n");
+ return -ENODEV;
+ }
+
+ cfg_table = (struct acpi_mcfg_allocation *) &mcfg[1];
+ for (i = 0; i < entries; i++) {
+ cfg = &cfg_table[i];
+ if (acpi_mcfg_check_entry(mcfg, cfg)) {
+ free_all_mmcfg();
+ return -ENODEV;
+ }
+
+ if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number,
+ cfg->end_bus_number, cfg->address) == NULL) {
+ pr_warn(PREFIX "no memory for MCFG entries\n");
+ free_all_mmcfg();
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_ACPI_APEI
+extern int (*arch_apei_filter_addr)(int (*func)(__u64 start, __u64 size,
+ void *data), void *data);
+
+static int pci_mmcfg_for_each_region(int (*func)(__u64 start, __u64 size,
+ void *data), void *data)
+{
+ struct pci_mmcfg_region *cfg;
+ int rc;
+
+ if (list_empty(&pci_mmcfg_list))
+ return 0;
+
+ list_for_each_entry(cfg, &pci_mmcfg_list, list) {
+ rc = func(cfg->res.start, resource_size(&cfg->res), data);
+ if (rc)
+ return rc;
+ }
+
+ return 0;
+}
+#define set_apei_filter() (arch_apei_filter_addr = pci_mmcfg_for_each_region)
+#else
+#define set_apei_filter()
+#endif
+
+static void __init __pci_mmcfg_init(int early)
+{
+ pci_mmcfg_reject_broken(early);
+ if (list_empty(&pci_mmcfg_list))
+ return;
+
+ if (pcibios_last_bus < 0) {
+ const struct pci_mmcfg_region *cfg;
+
+ list_for_each_entry(cfg, &pci_mmcfg_list, list) {
+ if (cfg->segment)
+ break;
+ pcibios_last_bus = cfg->end_bus;
+ }
+ }
+
+ if (pci_mmcfg_arch_init())
+ pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
+ else {
+ free_all_mmcfg();
+ pci_mmcfg_arch_init_failed = true;
+ }
+}
+
+static int __initdata known_bridge;
+
+void __init pci_mmcfg_early_init(void)
+{
+ if (pci_probe & PCI_PROBE_MMCONF) {
+ if (pci_mmcfg_check_hostbridge())
+ known_bridge = 1;
+ else
+ acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
+ __pci_mmcfg_init(1);
+
+ set_apei_filter();
+ }
+}
+
+void __init pci_mmcfg_late_init(void)
+{
+ /* MMCONFIG disabled */
+ if ((pci_probe & PCI_PROBE_MMCONF) == 0)
+ return;
+
+ if (known_bridge)
+ return;
+
+ /* MMCONFIG hasn't been enabled yet, try again */
+ if (pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF) {
+ acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
+ __pci_mmcfg_init(0);
+ }
+}
+
+static int __init pci_mmcfg_late_insert_resources(void)
+{
+ struct pci_mmcfg_region *cfg;
+
+ pci_mmcfg_running_state = true;
+
+ /* If we are not using MMCONFIG, don't insert the resources. */
+ if ((pci_probe & PCI_PROBE_MMCONF) == 0)
+ return 1;
+
+ /*
+ * Attempt to insert the mmcfg resources but not with the busy flag
+ * marked so it won't cause request errors when __request_region is
+ * called.
+ */
+ list_for_each_entry(cfg, &pci_mmcfg_list, list)
+ if (!cfg->res.parent)
+ insert_resource(&iomem_resource, &cfg->res);
+
+ return 0;
+}
+
+/*
+ * Perform MMCONFIG resource insertion after PCI initialization to allow for
+ * misprogrammed MCFG tables that state larger sizes but actually conflict
+ * with other system resources.
+ */
+late_initcall(pci_mmcfg_late_insert_resources);
+
+/* Add MMCFG information for host bridges */
+int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
+ phys_addr_t addr)
+{
+ int rc;
+ struct resource *tmp = NULL;
+ struct pci_mmcfg_region *cfg;
+
+ if (!(pci_probe & PCI_PROBE_MMCONF) || pci_mmcfg_arch_init_failed)
+ return -ENODEV;
+
+ if (start > end)
+ return -EINVAL;
+
+ mutex_lock(&pci_mmcfg_lock);
+ cfg = pci_mmconfig_lookup(seg, start);
+ if (cfg) {
+ if (cfg->end_bus < end)
+ dev_info(dev, FW_INFO
+ "MMCONFIG for "
+ "domain %04x [bus %02x-%02x] "
+ "only partially covers this bridge\n",
+ cfg->segment, cfg->start_bus, cfg->end_bus);
+ mutex_unlock(&pci_mmcfg_lock);
+ return -EEXIST;
+ }
+
+ if (!addr) {
+ mutex_unlock(&pci_mmcfg_lock);
+ return -EINVAL;
+ }
+
+ rc = -EBUSY;
+ cfg = pci_mmconfig_alloc(seg, start, end, addr);
+ if (cfg == NULL) {
+ dev_warn(dev, "fail to add MMCONFIG (out of memory)\n");
+ rc = -ENOMEM;
+ } else if (!pci_mmcfg_check_reserved(dev, cfg, 0)) {
+ dev_warn(dev, FW_BUG "MMCONFIG %pR isn't reserved\n",
+ &cfg->res);
+ } else {
+ /* Insert resource if it's not in boot stage */
+ if (pci_mmcfg_running_state)
+ tmp = insert_resource_conflict(&iomem_resource,
+ &cfg->res);
+
+ if (tmp) {
+ dev_warn(dev,
+ "MMCONFIG %pR conflicts with "
+ "%s %pR\n",
+ &cfg->res, tmp->name, tmp);
+ } else if (pci_mmcfg_arch_map(cfg)) {
+ dev_warn(dev, "fail to map MMCONFIG %pR.\n",
+ &cfg->res);
+ } else {
+ list_add_sorted(cfg);
+ dev_info(dev, "MMCONFIG at %pR (base %#lx)\n",
+ &cfg->res, (unsigned long)addr);
+ cfg = NULL;
+ rc = 0;
+ }
+ }
+
+ if (cfg) {
+ if (cfg->res.parent)
+ release_resource(&cfg->res);
+ kfree(cfg);
+ }
+
+ mutex_unlock(&pci_mmcfg_lock);
+
+ return rc;
+}
+
+/* Delete MMCFG information for host bridges */
+int pci_mmconfig_delete(u16 seg, u8 start, u8 end)
+{
+ struct pci_mmcfg_region *cfg;
+
+ mutex_lock(&pci_mmcfg_lock);
+ list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
+ if (cfg->segment == seg && cfg->start_bus == start &&
+ cfg->end_bus == end) {
+ list_del_rcu(&cfg->list);
+ synchronize_rcu();
+ pci_mmcfg_arch_unmap(cfg);
+ if (cfg->res.parent)
+ release_resource(&cfg->res);
+ mutex_unlock(&pci_mmcfg_lock);
+ kfree(cfg);
+ return 0;
+ }
+ mutex_unlock(&pci_mmcfg_lock);
+
+ return -ENOENT;
+}
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
new file mode 100644
index 000000000..3e9e166f6
--- /dev/null
+++ b/arch/x86/pci/mmconfig_32.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (C) 2004 Matthew Wilcox <matthew@wil.cx>
+ * Copyright (C) 2004 Intel Corp.
+ *
+ * This code is released under the GNU General Public License version 2.
+ */
+
+/*
+ * mmconfig.c - Low-level direct PCI config space access via MMCONFIG
+ */
+
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/rcupdate.h>
+#include <asm/e820/api.h>
+#include <asm/pci_x86.h>
+
+/* Assume systems with more busses have correct MCFG */
+#define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
+
+/* The base address of the last MMCONFIG device accessed */
+static u32 mmcfg_last_accessed_device;
+static int mmcfg_last_accessed_cpu;
+
+/*
+ * Functions for accessing PCI configuration space with MMCONFIG accesses
+ */
+static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn)
+{
+ struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
+
+ if (cfg)
+ return cfg->address;
+ return 0;
+}
+
+/*
+ * This is always called under pci_config_lock
+ */
+static void pci_exp_set_dev_base(unsigned int base, int bus, int devfn)
+{
+ u32 dev_base = base | PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12);
+ int cpu = smp_processor_id();
+ if (dev_base != mmcfg_last_accessed_device ||
+ cpu != mmcfg_last_accessed_cpu) {
+ mmcfg_last_accessed_device = dev_base;
+ mmcfg_last_accessed_cpu = cpu;
+ set_fixmap_nocache(FIX_PCIE_MCFG, dev_base);
+ }
+}
+
+static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
+ unsigned int devfn, int reg, int len, u32 *value)
+{
+ unsigned long flags;
+ u32 base;
+
+ if ((bus > 255) || (devfn > 255) || (reg > 4095)) {
+err: *value = -1;
+ return -EINVAL;
+ }
+
+ rcu_read_lock();
+ base = get_base_addr(seg, bus, devfn);
+ if (!base) {
+ rcu_read_unlock();
+ goto err;
+ }
+
+ raw_spin_lock_irqsave(&pci_config_lock, flags);
+
+ pci_exp_set_dev_base(base, bus, devfn);
+
+ switch (len) {
+ case 1:
+ *value = mmio_config_readb(mmcfg_virt_addr + reg);
+ break;
+ case 2:
+ *value = mmio_config_readw(mmcfg_virt_addr + reg);
+ break;
+ case 4:
+ *value = mmio_config_readl(mmcfg_virt_addr + reg);
+ break;
+ }
+ raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
+ unsigned int devfn, int reg, int len, u32 value)
+{
+ unsigned long flags;
+ u32 base;
+
+ if ((bus > 255) || (devfn > 255) || (reg > 4095))
+ return -EINVAL;
+
+ rcu_read_lock();
+ base = get_base_addr(seg, bus, devfn);
+ if (!base) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ raw_spin_lock_irqsave(&pci_config_lock, flags);
+
+ pci_exp_set_dev_base(base, bus, devfn);
+
+ switch (len) {
+ case 1:
+ mmio_config_writeb(mmcfg_virt_addr + reg, value);
+ break;
+ case 2:
+ mmio_config_writew(mmcfg_virt_addr + reg, value);
+ break;
+ case 4:
+ mmio_config_writel(mmcfg_virt_addr + reg, value);
+ break;
+ }
+ raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+ rcu_read_unlock();
+
+ return 0;
+}
+
+const struct pci_raw_ops pci_mmcfg = {
+ .read = pci_mmcfg_read,
+ .write = pci_mmcfg_write,
+};
+
+int __init pci_mmcfg_arch_init(void)
+{
+ printk(KERN_INFO "PCI: Using MMCONFIG for extended config space\n");
+ raw_pci_ext_ops = &pci_mmcfg;
+ return 1;
+}
+
+void __init pci_mmcfg_arch_free(void)
+{
+}
+
+int pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg)
+{
+ return 0;
+}
+
+void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg)
+{
+ unsigned long flags;
+
+ /* Invalidate the cached mmcfg map entry. */
+ raw_spin_lock_irqsave(&pci_config_lock, flags);
+ mmcfg_last_accessed_device = 0;
+ raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+}
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
new file mode 100644
index 000000000..887d181b7
--- /dev/null
+++ b/arch/x86/pci/mmconfig_64.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * mmconfig.c - Low-level direct PCI config space access via MMCONFIG
+ *
+ * This is an 64bit optimized version that always keeps the full mmconfig
+ * space mapped. This allows lockless config space operation.
+ */
+
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/acpi.h>
+#include <linux/bitmap.h>
+#include <linux/rcupdate.h>
+#include <asm/e820/api.h>
+#include <asm/pci_x86.h>
+
+#define PREFIX "PCI: "
+
+static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
+{
+ struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
+
+ if (cfg && cfg->virt)
+ return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12));
+ return NULL;
+}
+
+static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
+ unsigned int devfn, int reg, int len, u32 *value)
+{
+ char __iomem *addr;
+
+ /* Why do we have this when nobody checks it. How about a BUG()!? -AK */
+ if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) {
+err: *value = -1;
+ return -EINVAL;
+ }
+
+ rcu_read_lock();
+ addr = pci_dev_base(seg, bus, devfn);
+ if (!addr) {
+ rcu_read_unlock();
+ goto err;
+ }
+
+ switch (len) {
+ case 1:
+ *value = mmio_config_readb(addr + reg);
+ break;
+ case 2:
+ *value = mmio_config_readw(addr + reg);
+ break;
+ case 4:
+ *value = mmio_config_readl(addr + reg);
+ break;
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
+ unsigned int devfn, int reg, int len, u32 value)
+{
+ char __iomem *addr;
+
+ /* Why do we have this when nobody checks it. How about a BUG()!? -AK */
+ if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095)))
+ return -EINVAL;
+
+ rcu_read_lock();
+ addr = pci_dev_base(seg, bus, devfn);
+ if (!addr) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ switch (len) {
+ case 1:
+ mmio_config_writeb(addr + reg, value);
+ break;
+ case 2:
+ mmio_config_writew(addr + reg, value);
+ break;
+ case 4:
+ mmio_config_writel(addr + reg, value);
+ break;
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+const struct pci_raw_ops pci_mmcfg = {
+ .read = pci_mmcfg_read,
+ .write = pci_mmcfg_write,
+};
+
+static void __iomem *mcfg_ioremap(struct pci_mmcfg_region *cfg)
+{
+ void __iomem *addr;
+ u64 start, size;
+ int num_buses;
+
+ start = cfg->address + PCI_MMCFG_BUS_OFFSET(cfg->start_bus);
+ num_buses = cfg->end_bus - cfg->start_bus + 1;
+ size = PCI_MMCFG_BUS_OFFSET(num_buses);
+ addr = ioremap_nocache(start, size);
+ if (addr)
+ addr -= PCI_MMCFG_BUS_OFFSET(cfg->start_bus);
+ return addr;
+}
+
+int __init pci_mmcfg_arch_init(void)
+{
+ struct pci_mmcfg_region *cfg;
+
+ list_for_each_entry(cfg, &pci_mmcfg_list, list)
+ if (pci_mmcfg_arch_map(cfg)) {
+ pci_mmcfg_arch_free();
+ return 0;
+ }
+
+ raw_pci_ext_ops = &pci_mmcfg;
+
+ return 1;
+}
+
+void __init pci_mmcfg_arch_free(void)
+{
+ struct pci_mmcfg_region *cfg;
+
+ list_for_each_entry(cfg, &pci_mmcfg_list, list)
+ pci_mmcfg_arch_unmap(cfg);
+}
+
+int pci_mmcfg_arch_map(struct pci_mmcfg_region *cfg)
+{
+ cfg->virt = mcfg_ioremap(cfg);
+ if (!cfg->virt) {
+ pr_err(PREFIX "can't map MMCONFIG at %pR\n", &cfg->res);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+void pci_mmcfg_arch_unmap(struct pci_mmcfg_region *cfg)
+{
+ if (cfg && cfg->virt) {
+ iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus));
+ cfg->virt = NULL;
+ }
+}
diff --git a/arch/x86/pci/numachip.c b/arch/x86/pci/numachip.c
new file mode 100644
index 000000000..2e565e65c
--- /dev/null
+++ b/arch/x86/pci/numachip.c
@@ -0,0 +1,129 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Numascale NumaConnect-specific PCI code
+ *
+ * Copyright (C) 2012 Numascale AS. All rights reserved.
+ *
+ * Send feedback to <support@numascale.com>
+ *
+ * PCI accessor functions derived from mmconfig_64.c
+ *
+ */
+
+#include <linux/pci.h>
+#include <asm/pci_x86.h>
+
+static u8 limit __read_mostly;
+
+static inline char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
+{
+ struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
+
+ if (cfg && cfg->virt)
+ return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12));
+ return NULL;
+}
+
+static int pci_mmcfg_read_numachip(unsigned int seg, unsigned int bus,
+ unsigned int devfn, int reg, int len, u32 *value)
+{
+ char __iomem *addr;
+
+ /* Why do we have this when nobody checks it. How about a BUG()!? -AK */
+ if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) {
+err: *value = -1;
+ return -EINVAL;
+ }
+
+ /* Ensure AMD Northbridges don't decode reads to other devices */
+ if (unlikely(bus == 0 && devfn >= limit)) {
+ *value = -1;
+ return 0;
+ }
+
+ rcu_read_lock();
+ addr = pci_dev_base(seg, bus, devfn);
+ if (!addr) {
+ rcu_read_unlock();
+ goto err;
+ }
+
+ switch (len) {
+ case 1:
+ *value = mmio_config_readb(addr + reg);
+ break;
+ case 2:
+ *value = mmio_config_readw(addr + reg);
+ break;
+ case 4:
+ *value = mmio_config_readl(addr + reg);
+ break;
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static int pci_mmcfg_write_numachip(unsigned int seg, unsigned int bus,
+ unsigned int devfn, int reg, int len, u32 value)
+{
+ char __iomem *addr;
+
+ /* Why do we have this when nobody checks it. How about a BUG()!? -AK */
+ if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095)))
+ return -EINVAL;
+
+ /* Ensure AMD Northbridges don't decode writes to other devices */
+ if (unlikely(bus == 0 && devfn >= limit))
+ return 0;
+
+ rcu_read_lock();
+ addr = pci_dev_base(seg, bus, devfn);
+ if (!addr) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ switch (len) {
+ case 1:
+ mmio_config_writeb(addr + reg, value);
+ break;
+ case 2:
+ mmio_config_writew(addr + reg, value);
+ break;
+ case 4:
+ mmio_config_writel(addr + reg, value);
+ break;
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static const struct pci_raw_ops pci_mmcfg_numachip = {
+ .read = pci_mmcfg_read_numachip,
+ .write = pci_mmcfg_write_numachip,
+};
+
+int __init pci_numachip_init(void)
+{
+ int ret = 0;
+ u32 val;
+
+ /* For remote I/O, restrict bus 0 access to the actual number of AMD
+ Northbridges, which starts at device number 0x18 */
+ ret = raw_pci_read(0, 0, PCI_DEVFN(0x18, 0), 0x60, sizeof(val), &val);
+ if (ret)
+ goto out;
+
+ /* HyperTransport fabric size in bits 6:4 */
+ limit = PCI_DEVFN(0x18 + ((val >> 4) & 7) + 1, 0);
+
+ /* Use NumaChip PCI accessors for non-extended and extended access */
+ raw_pci_ops = raw_pci_ext_ops = &pci_mmcfg_numachip;
+out:
+ return ret;
+}
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
new file mode 100644
index 000000000..7043a4f0e
--- /dev/null
+++ b/arch/x86/pci/olpc.c
@@ -0,0 +1,315 @@
+/*
+ * Low-level PCI config space access for OLPC systems who lack the VSA
+ * PCI virtualization software.
+ *
+ * Copyright © 2006 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * The AMD Geode chipset (ie: GX2 processor, cs5536 I/O companion device)
+ * has some I/O functions (display, southbridge, sound, USB HCIs, etc)
+ * that more or less behave like PCI devices, but the hardware doesn't
+ * directly implement the PCI configuration space headers. AMD provides
+ * "VSA" (Virtual System Architecture) software that emulates PCI config
+ * space for these devices, by trapping I/O accesses to PCI config register
+ * (CF8/CFC) and running some code in System Management Mode interrupt state.
+ * On the OLPC platform, we don't want to use that VSA code because
+ * (a) it slows down suspend/resume, and (b) recompiling it requires special
+ * compilers that are hard to get. So instead of letting the complex VSA
+ * code simulate the PCI config registers for the on-chip devices, we
+ * just simulate them the easy way, by inserting the code into the
+ * pci_write_config and pci_read_config path. Most of the config registers
+ * are read-only anyway, so the bulk of the simulation is just table lookup.
+ */
+
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <asm/olpc.h>
+#include <asm/geode.h>
+#include <asm/pci_x86.h>
+
+/*
+ * In the tables below, the first two line (8 longwords) are the
+ * size masks that are used when the higher level PCI code determines
+ * the size of the region by writing ~0 to a base address register
+ * and reading back the result.
+ *
+ * The following lines are the values that are read during normal
+ * PCI config access cycles, i.e. not after just having written
+ * ~0 to a base address register.
+ */
+
+static const uint32_t lxnb_hdr[] = { /* dev 1 function 0 - devfn = 8 */
+ 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0,
+
+ 0x281022, 0x2200005, 0x6000021, 0x80f808, /* AMD Vendor ID */
+ 0x0, 0x0, 0x0, 0x0, /* No virtual registers, hence no BAR */
+ 0x0, 0x0, 0x0, 0x28100b,
+ 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0,
+};
+
+static const uint32_t gxnb_hdr[] = { /* dev 1 function 0 - devfn = 8 */
+ 0xfffffffd, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0,
+
+ 0x28100b, 0x2200005, 0x6000021, 0x80f808, /* NSC Vendor ID */
+ 0xac1d, 0x0, 0x0, 0x0, /* I/O BAR - base of virtual registers */
+ 0x0, 0x0, 0x0, 0x28100b,
+ 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0,
+};
+
+static const uint32_t lxfb_hdr[] = { /* dev 1 function 1 - devfn = 9 */
+ 0xff000008, 0xffffc000, 0xffffc000, 0xffffc000,
+ 0xffffc000, 0x0, 0x0, 0x0,
+
+ 0x20811022, 0x2200003, 0x3000000, 0x0, /* AMD Vendor ID */
+ 0xfd000000, 0xfe000000, 0xfe004000, 0xfe008000, /* FB, GP, VG, DF */
+ 0xfe00c000, 0x0, 0x0, 0x30100b, /* VIP */
+ 0x0, 0x0, 0x0, 0x10e, /* INTA, IRQ14 for graphics accel */
+ 0x0, 0x0, 0x0, 0x0,
+ 0x3d0, 0x3c0, 0xa0000, 0x0, /* VG IO, VG IO, EGA FB, MONO FB */
+ 0x0, 0x0, 0x0, 0x0,
+};
+
+static const uint32_t gxfb_hdr[] = { /* dev 1 function 1 - devfn = 9 */
+ 0xff800008, 0xffffc000, 0xffffc000, 0xffffc000,
+ 0x0, 0x0, 0x0, 0x0,
+
+ 0x30100b, 0x2200003, 0x3000000, 0x0, /* NSC Vendor ID */
+ 0xfd000000, 0xfe000000, 0xfe004000, 0xfe008000, /* FB, GP, VG, DF */
+ 0x0, 0x0, 0x0, 0x30100b,
+ 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0,
+ 0x3d0, 0x3c0, 0xa0000, 0x0, /* VG IO, VG IO, EGA FB, MONO FB */
+ 0x0, 0x0, 0x0, 0x0,
+};
+
+static const uint32_t aes_hdr[] = { /* dev 1 function 2 - devfn = 0xa */
+ 0xffffc000, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0,
+
+ 0x20821022, 0x2a00006, 0x10100000, 0x8, /* NSC Vendor ID */
+ 0xfe010000, 0x0, 0x0, 0x0, /* AES registers */
+ 0x0, 0x0, 0x0, 0x20821022,
+ 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0,
+};
+
+
+static const uint32_t isa_hdr[] = { /* dev f function 0 - devfn = 78 */
+ 0xfffffff9, 0xffffff01, 0xffffffc1, 0xffffffe1,
+ 0xffffff81, 0xffffffc1, 0x0, 0x0,
+
+ 0x20901022, 0x2a00049, 0x6010003, 0x802000,
+ 0x18b1, 0x1001, 0x1801, 0x1881, /* SMB-8 GPIO-256 MFGPT-64 IRQ-32 */
+ 0x1401, 0x1841, 0x0, 0x20901022, /* PMS-128 ACPI-64 */
+ 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0xaa5b, /* IRQ steering */
+ 0x0, 0x0, 0x0, 0x0,
+};
+
+static const uint32_t ac97_hdr[] = { /* dev f function 3 - devfn = 7b */
+ 0xffffff81, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0,
+
+ 0x20931022, 0x2a00041, 0x4010001, 0x0,
+ 0x1481, 0x0, 0x0, 0x0, /* I/O BAR-128 */
+ 0x0, 0x0, 0x0, 0x20931022,
+ 0x0, 0x0, 0x0, 0x205, /* IntB, IRQ5 */
+ 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0,
+};
+
+static const uint32_t ohci_hdr[] = { /* dev f function 4 - devfn = 7c */
+ 0xfffff000, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0,
+
+ 0x20941022, 0x2300006, 0xc031002, 0x0,
+ 0xfe01a000, 0x0, 0x0, 0x0, /* MEMBAR-1000 */
+ 0x0, 0x0, 0x0, 0x20941022,
+ 0x0, 0x40, 0x0, 0x40a, /* CapPtr INT-D, IRQA */
+ 0xc8020001, 0x0, 0x0, 0x0, /* Capabilities - 40 is R/O,
+ 44 is mask 8103 (power control) */
+ 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0,
+};
+
+static const uint32_t ehci_hdr[] = { /* dev f function 4 - devfn = 7d */
+ 0xfffff000, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0,
+
+ 0x20951022, 0x2300006, 0xc032002, 0x0,
+ 0xfe01b000, 0x0, 0x0, 0x0, /* MEMBAR-1000 */
+ 0x0, 0x0, 0x0, 0x20951022,
+ 0x0, 0x40, 0x0, 0x40a, /* CapPtr INT-D, IRQA */
+ 0xc8020001, 0x0, 0x0, 0x0, /* Capabilities - 40 is R/O, 44 is
+ mask 8103 (power control) */
+#if 0
+ 0x1, 0x40080000, 0x0, 0x0, /* EECP - see EHCI spec section 2.1.7 */
+#endif
+ 0x01000001, 0x0, 0x0, 0x0, /* EECP - see EHCI spec section 2.1.7 */
+ 0x2020, 0x0, 0x0, 0x0, /* (EHCI page 8) 60 SBRN (R/O),
+ 61 FLADJ (R/W), PORTWAKECAP */
+};
+
+static uint32_t ff_loc = ~0;
+static uint32_t zero_loc;
+static int bar_probing; /* Set after a write of ~0 to a BAR */
+static int is_lx;
+
+#define NB_SLOT 0x1 /* Northbridge - GX chip - Device 1 */
+#define SB_SLOT 0xf /* Southbridge - CS5536 chip - Device F */
+
+static int is_simulated(unsigned int bus, unsigned int devfn)
+{
+ return (!bus && ((PCI_SLOT(devfn) == NB_SLOT) ||
+ (PCI_SLOT(devfn) == SB_SLOT)));
+}
+
+static uint32_t *hdr_addr(const uint32_t *hdr, int reg)
+{
+ uint32_t addr;
+
+ /*
+ * This is a little bit tricky. The header maps consist of
+ * 0x20 bytes of size masks, followed by 0x70 bytes of header data.
+ * In the normal case, when not probing a BAR's size, we want
+ * to access the header data, so we add 0x20 to the reg offset,
+ * thus skipping the size mask area.
+ * In the BAR probing case, we want to access the size mask for
+ * the BAR, so we subtract 0x10 (the config header offset for
+ * BAR0), and don't skip the size mask area.
+ */
+
+ addr = (uint32_t)hdr + reg + (bar_probing ? -0x10 : 0x20);
+
+ bar_probing = 0;
+ return (uint32_t *)addr;
+}
+
+static int pci_olpc_read(unsigned int seg, unsigned int bus,
+ unsigned int devfn, int reg, int len, uint32_t *value)
+{
+ uint32_t *addr;
+
+ WARN_ON(seg);
+
+ /* Use the hardware mechanism for non-simulated devices */
+ if (!is_simulated(bus, devfn))
+ return pci_direct_conf1.read(seg, bus, devfn, reg, len, value);
+
+ /*
+ * No device has config registers past 0x70, so we save table space
+ * by not storing entries for the nonexistent registers
+ */
+ if (reg >= 0x70)
+ addr = &zero_loc;
+ else {
+ switch (devfn) {
+ case 0x8:
+ addr = hdr_addr(is_lx ? lxnb_hdr : gxnb_hdr, reg);
+ break;
+ case 0x9:
+ addr = hdr_addr(is_lx ? lxfb_hdr : gxfb_hdr, reg);
+ break;
+ case 0xa:
+ addr = is_lx ? hdr_addr(aes_hdr, reg) : &ff_loc;
+ break;
+ case 0x78:
+ addr = hdr_addr(isa_hdr, reg);
+ break;
+ case 0x7b:
+ addr = hdr_addr(ac97_hdr, reg);
+ break;
+ case 0x7c:
+ addr = hdr_addr(ohci_hdr, reg);
+ break;
+ case 0x7d:
+ addr = hdr_addr(ehci_hdr, reg);
+ break;
+ default:
+ addr = &ff_loc;
+ break;
+ }
+ }
+ switch (len) {
+ case 1:
+ *value = *(uint8_t *)addr;
+ break;
+ case 2:
+ *value = *(uint16_t *)addr;
+ break;
+ case 4:
+ *value = *addr;
+ break;
+ default:
+ BUG();
+ }
+
+ return 0;
+}
+
+static int pci_olpc_write(unsigned int seg, unsigned int bus,
+ unsigned int devfn, int reg, int len, uint32_t value)
+{
+ WARN_ON(seg);
+
+ /* Use the hardware mechanism for non-simulated devices */
+ if (!is_simulated(bus, devfn))
+ return pci_direct_conf1.write(seg, bus, devfn, reg, len, value);
+
+ /* XXX we may want to extend this to simulate EHCI power management */
+
+ /*
+ * Mostly we just discard writes, but if the write is a size probe
+ * (i.e. writing ~0 to a BAR), we remember it and arrange to return
+ * the appropriate size mask on the next read. This is cheating
+ * to some extent, because it depends on the fact that the next
+ * access after such a write will always be a read to the same BAR.
+ */
+
+ if ((reg >= 0x10) && (reg < 0x2c)) {
+ /* write is to a BAR */
+ if (value == ~0)
+ bar_probing = 1;
+ } else {
+ /*
+ * No warning on writes to ROM BAR, CMD, LATENCY_TIMER,
+ * CACHE_LINE_SIZE, or PM registers.
+ */
+ if ((reg != PCI_ROM_ADDRESS) && (reg != PCI_COMMAND_MASTER) &&
+ (reg != PCI_LATENCY_TIMER) &&
+ (reg != PCI_CACHE_LINE_SIZE) && (reg != 0x44))
+ printk(KERN_WARNING "OLPC PCI: Config write to devfn"
+ " %x reg %x value %x\n", devfn, reg, value);
+ }
+
+ return 0;
+}
+
+static const struct pci_raw_ops pci_olpc_conf = {
+ .read = pci_olpc_read,
+ .write = pci_olpc_write,
+};
+
+int __init pci_olpc_init(void)
+{
+ printk(KERN_INFO "PCI: Using configuration type OLPC XO-1\n");
+ raw_pci_ops = &pci_olpc_conf;
+ is_lx = is_geode_lx();
+ return 0;
+}
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
new file mode 100644
index 000000000..9c97d8141
--- /dev/null
+++ b/arch/x86/pci/pcbios.c
@@ -0,0 +1,429 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BIOS32 and PCI BIOS handling.
+ */
+
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+
+#include <asm/pci_x86.h>
+#include <asm/e820/types.h>
+#include <asm/pci-functions.h>
+#include <asm/set_memory.h>
+
+/* BIOS32 signature: "_32_" */
+#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
+
+/* PCI signature: "PCI " */
+#define PCI_SIGNATURE (('P' << 0) + ('C' << 8) + ('I' << 16) + (' ' << 24))
+
+/* PCI service signature: "$PCI" */
+#define PCI_SERVICE (('$' << 0) + ('P' << 8) + ('C' << 16) + ('I' << 24))
+
+/* PCI BIOS hardware mechanism flags */
+#define PCIBIOS_HW_TYPE1 0x01
+#define PCIBIOS_HW_TYPE2 0x02
+#define PCIBIOS_HW_TYPE1_SPEC 0x10
+#define PCIBIOS_HW_TYPE2_SPEC 0x20
+
+int pcibios_enabled;
+
+/* According to the BIOS specification at:
+ * http://members.datafast.net.au/dft0802/specs/bios21.pdf, we could
+ * restrict the x zone to some pages and make it ro. But this may be
+ * broken on some bios, complex to handle with static_protections.
+ * We could make the 0xe0000-0x100000 range rox, but this can break
+ * some ISA mapping.
+ *
+ * So we let's an rw and x hole when pcibios is used. This shouldn't
+ * happen for modern system with mmconfig, and if you don't want it
+ * you could disable pcibios...
+ */
+static inline void set_bios_x(void)
+{
+ pcibios_enabled = 1;
+ set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT);
+ if (__supported_pte_mask & _PAGE_NX)
+ printk(KERN_INFO "PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX.\n");
+}
+
+/*
+ * This is the standard structure used to identify the entry point
+ * to the BIOS32 Service Directory, as documented in
+ * Standard BIOS 32-bit Service Directory Proposal
+ * Revision 0.4 May 24, 1993
+ * Phoenix Technologies Ltd.
+ * Norwood, MA
+ * and the PCI BIOS specification.
+ */
+
+union bios32 {
+ struct {
+ unsigned long signature; /* _32_ */
+ unsigned long entry; /* 32 bit physical address */
+ unsigned char revision; /* Revision level, 0 */
+ unsigned char length; /* Length in paragraphs should be 01 */
+ unsigned char checksum; /* All bytes must add up to zero */
+ unsigned char reserved[5]; /* Must be zero */
+ } fields;
+ char chars[16];
+};
+
+/*
+ * Physical address of the service directory. I don't know if we're
+ * allowed to have more than one of these or not, so just in case
+ * we'll make pcibios_present() take a memory start parameter and store
+ * the array there.
+ */
+
+static struct {
+ unsigned long address;
+ unsigned short segment;
+} bios32_indirect __initdata = { 0, __KERNEL_CS };
+
+/*
+ * Returns the entry point for the given service, NULL on error
+ */
+
+static unsigned long __init bios32_service(unsigned long service)
+{
+ unsigned char return_code; /* %al */
+ unsigned long address; /* %ebx */
+ unsigned long length; /* %ecx */
+ unsigned long entry; /* %edx */
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __asm__("lcall *(%%edi); cld"
+ : "=a" (return_code),
+ "=b" (address),
+ "=c" (length),
+ "=d" (entry)
+ : "0" (service),
+ "1" (0),
+ "D" (&bios32_indirect));
+ local_irq_restore(flags);
+
+ switch (return_code) {
+ case 0:
+ return address + entry;
+ case 0x80: /* Not present */
+ printk(KERN_WARNING "bios32_service(0x%lx): not present\n", service);
+ return 0;
+ default: /* Shouldn't happen */
+ printk(KERN_WARNING "bios32_service(0x%lx): returned 0x%x -- BIOS bug!\n",
+ service, return_code);
+ return 0;
+ }
+}
+
+static struct {
+ unsigned long address;
+ unsigned short segment;
+} pci_indirect __ro_after_init = {
+ .address = 0,
+ .segment = __KERNEL_CS,
+};
+
+static int pci_bios_present __ro_after_init;
+
+static int __init check_pcibios(void)
+{
+ u32 signature, eax, ebx, ecx;
+ u8 status, major_ver, minor_ver, hw_mech;
+ unsigned long flags, pcibios_entry;
+
+ if ((pcibios_entry = bios32_service(PCI_SERVICE))) {
+ pci_indirect.address = pcibios_entry + PAGE_OFFSET;
+
+ local_irq_save(flags);
+ __asm__(
+ "lcall *(%%edi); cld\n\t"
+ "jc 1f\n\t"
+ "xor %%ah, %%ah\n"
+ "1:"
+ : "=d" (signature),
+ "=a" (eax),
+ "=b" (ebx),
+ "=c" (ecx)
+ : "1" (PCIBIOS_PCI_BIOS_PRESENT),
+ "D" (&pci_indirect)
+ : "memory");
+ local_irq_restore(flags);
+
+ status = (eax >> 8) & 0xff;
+ hw_mech = eax & 0xff;
+ major_ver = (ebx >> 8) & 0xff;
+ minor_ver = ebx & 0xff;
+ if (pcibios_last_bus < 0)
+ pcibios_last_bus = ecx & 0xff;
+ DBG("PCI: BIOS probe returned s=%02x hw=%02x ver=%02x.%02x l=%02x\n",
+ status, hw_mech, major_ver, minor_ver, pcibios_last_bus);
+ if (status || signature != PCI_SIGNATURE) {
+ printk (KERN_ERR "PCI: BIOS BUG #%x[%08x] found\n",
+ status, signature);
+ return 0;
+ }
+ printk(KERN_INFO "PCI: PCI BIOS revision %x.%02x entry at 0x%lx, last bus=%d\n",
+ major_ver, minor_ver, pcibios_entry, pcibios_last_bus);
+#ifdef CONFIG_PCI_DIRECT
+ if (!(hw_mech & PCIBIOS_HW_TYPE1))
+ pci_probe &= ~PCI_PROBE_CONF1;
+ if (!(hw_mech & PCIBIOS_HW_TYPE2))
+ pci_probe &= ~PCI_PROBE_CONF2;
+#endif
+ return 1;
+ }
+ return 0;
+}
+
+static int pci_bios_read(unsigned int seg, unsigned int bus,
+ unsigned int devfn, int reg, int len, u32 *value)
+{
+ unsigned long result = 0;
+ unsigned long flags;
+ unsigned long bx = (bus << 8) | devfn;
+ u16 number = 0, mask = 0;
+
+ WARN_ON(seg);
+ if (!value || (bus > 255) || (devfn > 255) || (reg > 255))
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&pci_config_lock, flags);
+
+ switch (len) {
+ case 1:
+ number = PCIBIOS_READ_CONFIG_BYTE;
+ mask = 0xff;
+ break;
+ case 2:
+ number = PCIBIOS_READ_CONFIG_WORD;
+ mask = 0xffff;
+ break;
+ case 4:
+ number = PCIBIOS_READ_CONFIG_DWORD;
+ break;
+ }
+
+ __asm__("lcall *(%%esi); cld\n\t"
+ "jc 1f\n\t"
+ "xor %%ah, %%ah\n"
+ "1:"
+ : "=c" (*value),
+ "=a" (result)
+ : "1" (number),
+ "b" (bx),
+ "D" ((long)reg),
+ "S" (&pci_indirect));
+ /*
+ * Zero-extend the result beyond 8 or 16 bits, do not trust the
+ * BIOS having done it:
+ */
+ if (mask)
+ *value &= mask;
+
+ raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+
+ return (int)((result & 0xff00) >> 8);
+}
+
+static int pci_bios_write(unsigned int seg, unsigned int bus,
+ unsigned int devfn, int reg, int len, u32 value)
+{
+ unsigned long result = 0;
+ unsigned long flags;
+ unsigned long bx = (bus << 8) | devfn;
+ u16 number = 0;
+
+ WARN_ON(seg);
+ if ((bus > 255) || (devfn > 255) || (reg > 255))
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&pci_config_lock, flags);
+
+ switch (len) {
+ case 1:
+ number = PCIBIOS_WRITE_CONFIG_BYTE;
+ break;
+ case 2:
+ number = PCIBIOS_WRITE_CONFIG_WORD;
+ break;
+ case 4:
+ number = PCIBIOS_WRITE_CONFIG_DWORD;
+ break;
+ }
+
+ __asm__("lcall *(%%esi); cld\n\t"
+ "jc 1f\n\t"
+ "xor %%ah, %%ah\n"
+ "1:"
+ : "=a" (result)
+ : "0" (number),
+ "c" (value),
+ "b" (bx),
+ "D" ((long)reg),
+ "S" (&pci_indirect));
+
+ raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+
+ return (int)((result & 0xff00) >> 8);
+}
+
+
+/*
+ * Function table for BIOS32 access
+ */
+
+static const struct pci_raw_ops pci_bios_access = {
+ .read = pci_bios_read,
+ .write = pci_bios_write
+};
+
+/*
+ * Try to find PCI BIOS.
+ */
+
+static const struct pci_raw_ops *__init pci_find_bios(void)
+{
+ union bios32 *check;
+ unsigned char sum;
+ int i, length;
+
+ /*
+ * Follow the standard procedure for locating the BIOS32 Service
+ * directory by scanning the permissible address range from
+ * 0xe0000 through 0xfffff for a valid BIOS32 structure.
+ */
+
+ for (check = (union bios32 *) __va(0xe0000);
+ check <= (union bios32 *) __va(0xffff0);
+ ++check) {
+ long sig;
+ if (probe_kernel_address(&check->fields.signature, sig))
+ continue;
+
+ if (check->fields.signature != BIOS32_SIGNATURE)
+ continue;
+ length = check->fields.length * 16;
+ if (!length)
+ continue;
+ sum = 0;
+ for (i = 0; i < length ; ++i)
+ sum += check->chars[i];
+ if (sum != 0)
+ continue;
+ if (check->fields.revision != 0) {
+ printk("PCI: unsupported BIOS32 revision %d at 0x%p\n",
+ check->fields.revision, check);
+ continue;
+ }
+ DBG("PCI: BIOS32 Service Directory structure at 0x%p\n", check);
+ if (check->fields.entry >= 0x100000) {
+ printk("PCI: BIOS32 entry (0x%p) in high memory, "
+ "cannot use.\n", check);
+ return NULL;
+ } else {
+ unsigned long bios32_entry = check->fields.entry;
+ DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n",
+ bios32_entry);
+ bios32_indirect.address = bios32_entry + PAGE_OFFSET;
+ set_bios_x();
+ if (check_pcibios())
+ return &pci_bios_access;
+ }
+ break; /* Hopefully more than one BIOS32 cannot happen... */
+ }
+
+ return NULL;
+}
+
+/*
+ * BIOS Functions for IRQ Routing
+ */
+
+struct irq_routing_options {
+ u16 size;
+ struct irq_info *table;
+ u16 segment;
+} __attribute__((packed));
+
+struct irq_routing_table * pcibios_get_irq_routing_table(void)
+{
+ struct irq_routing_options opt;
+ struct irq_routing_table *rt = NULL;
+ int ret, map;
+ unsigned long page;
+
+ if (!pci_bios_present)
+ return NULL;
+ page = __get_free_page(GFP_KERNEL);
+ if (!page)
+ return NULL;
+ opt.table = (struct irq_info *) page;
+ opt.size = PAGE_SIZE;
+ opt.segment = __KERNEL_DS;
+
+ DBG("PCI: Fetching IRQ routing table... ");
+ __asm__("push %%es\n\t"
+ "push %%ds\n\t"
+ "pop %%es\n\t"
+ "lcall *(%%esi); cld\n\t"
+ "pop %%es\n\t"
+ "jc 1f\n\t"
+ "xor %%ah, %%ah\n"
+ "1:"
+ : "=a" (ret),
+ "=b" (map),
+ "=m" (opt)
+ : "0" (PCIBIOS_GET_ROUTING_OPTIONS),
+ "1" (0),
+ "D" ((long) &opt),
+ "S" (&pci_indirect),
+ "m" (opt)
+ : "memory");
+ DBG("OK ret=%d, size=%d, map=%x\n", ret, opt.size, map);
+ if (ret & 0xff00)
+ printk(KERN_ERR "PCI: Error %02x when fetching IRQ routing table.\n", (ret >> 8) & 0xff);
+ else if (opt.size) {
+ rt = kmalloc(sizeof(struct irq_routing_table) + opt.size, GFP_KERNEL);
+ if (rt) {
+ memset(rt, 0, sizeof(struct irq_routing_table));
+ rt->size = opt.size + sizeof(struct irq_routing_table);
+ rt->exclusive_irqs = map;
+ memcpy(rt->slots, (void *) page, opt.size);
+ printk(KERN_INFO "PCI: Using BIOS Interrupt Routing Table\n");
+ }
+ }
+ free_page(page);
+ return rt;
+}
+EXPORT_SYMBOL(pcibios_get_irq_routing_table);
+
+int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq)
+{
+ int ret;
+
+ __asm__("lcall *(%%esi); cld\n\t"
+ "jc 1f\n\t"
+ "xor %%ah, %%ah\n"
+ "1:"
+ : "=a" (ret)
+ : "0" (PCIBIOS_SET_PCI_HW_INT),
+ "b" ((dev->bus->number << 8) | dev->devfn),
+ "c" ((irq << 8) | (pin + 10)),
+ "S" (&pci_indirect));
+ return !(ret & 0xff00);
+}
+EXPORT_SYMBOL(pcibios_set_irq_routing);
+
+void __init pci_pcbios_init(void)
+{
+ if ((pci_probe & PCI_PROBE_BIOS)
+ && ((raw_pci_ops = pci_find_bios()))) {
+ pci_bios_present = 1;
+ }
+}
+
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
new file mode 100644
index 000000000..7a5bafb76
--- /dev/null
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -0,0 +1,330 @@
+/*
+ * arch/x86/pci/sta2x11-fixup.c
+ * glue code for lib/swiotlb.c and DMA translation between STA2x11
+ * AMBA memory mapping and the X86 memory mapping
+ *
+ * ST Microelectronics ConneXt (STA2X11/STA2X10)
+ *
+ * Copyright (c) 2010-2011 Wind River Systems, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+#include <linux/export.h>
+#include <linux/list.h>
+#include <linux/dma-direct.h>
+#include <asm/iommu.h>
+
+#define STA2X11_SWIOTLB_SIZE (4*1024*1024)
+extern int swiotlb_late_init_with_default_size(size_t default_size);
+
+/*
+ * We build a list of bus numbers that are under the ConneXt. The
+ * main bridge hosts 4 busses, which are the 4 endpoints, in order.
+ */
+#define STA2X11_NR_EP 4 /* 0..3 included */
+#define STA2X11_NR_FUNCS 8 /* 0..7 included */
+#define STA2X11_AMBA_SIZE (512 << 20)
+
+struct sta2x11_ahb_regs { /* saved during suspend */
+ u32 base, pexlbase, pexhbase, crw;
+};
+
+struct sta2x11_mapping {
+ u32 amba_base;
+ int is_suspended;
+ struct sta2x11_ahb_regs regs[STA2X11_NR_FUNCS];
+};
+
+struct sta2x11_instance {
+ struct list_head list;
+ int bus0;
+ struct sta2x11_mapping map[STA2X11_NR_EP];
+};
+
+static LIST_HEAD(sta2x11_instance_list);
+
+/* At probe time, record new instances of this bridge (likely one only) */
+static void sta2x11_new_instance(struct pci_dev *pdev)
+{
+ struct sta2x11_instance *instance;
+
+ instance = kzalloc(sizeof(*instance), GFP_ATOMIC);
+ if (!instance)
+ return;
+ /* This has a subordinate bridge, with 4 more-subordinate ones */
+ instance->bus0 = pdev->subordinate->number + 1;
+
+ if (list_empty(&sta2x11_instance_list)) {
+ int size = STA2X11_SWIOTLB_SIZE;
+ /* First instance: register your own swiotlb area */
+ dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size);
+ if (swiotlb_late_init_with_default_size(size))
+ dev_emerg(&pdev->dev, "init swiotlb failed\n");
+ }
+ list_add(&instance->list, &sta2x11_instance_list);
+}
+DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, 0xcc17, sta2x11_new_instance);
+
+/*
+ * Utility functions used in this file from below
+ */
+static struct sta2x11_instance *sta2x11_pdev_to_instance(struct pci_dev *pdev)
+{
+ struct sta2x11_instance *instance;
+ int ep;
+
+ list_for_each_entry(instance, &sta2x11_instance_list, list) {
+ ep = pdev->bus->number - instance->bus0;
+ if (ep >= 0 && ep < STA2X11_NR_EP)
+ return instance;
+ }
+ return NULL;
+}
+
+static int sta2x11_pdev_to_ep(struct pci_dev *pdev)
+{
+ struct sta2x11_instance *instance;
+
+ instance = sta2x11_pdev_to_instance(pdev);
+ if (!instance)
+ return -1;
+
+ return pdev->bus->number - instance->bus0;
+}
+
+static struct sta2x11_mapping *sta2x11_pdev_to_mapping(struct pci_dev *pdev)
+{
+ struct sta2x11_instance *instance;
+ int ep;
+
+ instance = sta2x11_pdev_to_instance(pdev);
+ if (!instance)
+ return NULL;
+ ep = sta2x11_pdev_to_ep(pdev);
+ return instance->map + ep;
+}
+
+/* This is exported, as some devices need to access the MFD registers */
+struct sta2x11_instance *sta2x11_get_instance(struct pci_dev *pdev)
+{
+ return sta2x11_pdev_to_instance(pdev);
+}
+EXPORT_SYMBOL(sta2x11_get_instance);
+
+
+/**
+ * p2a - Translate physical address to STA2x11 AMBA address,
+ * used for DMA transfers to STA2x11
+ * @p: Physical address
+ * @pdev: PCI device (must be hosted within the connext)
+ */
+static dma_addr_t p2a(dma_addr_t p, struct pci_dev *pdev)
+{
+ struct sta2x11_mapping *map;
+ dma_addr_t a;
+
+ map = sta2x11_pdev_to_mapping(pdev);
+ a = p + map->amba_base;
+ return a;
+}
+
+/**
+ * a2p - Translate STA2x11 AMBA address to physical address
+ * used for DMA transfers from STA2x11
+ * @a: STA2x11 AMBA address
+ * @pdev: PCI device (must be hosted within the connext)
+ */
+static dma_addr_t a2p(dma_addr_t a, struct pci_dev *pdev)
+{
+ struct sta2x11_mapping *map;
+ dma_addr_t p;
+
+ map = sta2x11_pdev_to_mapping(pdev);
+ p = a - map->amba_base;
+ return p;
+}
+
+/* At setup time, we use our own ops if the device is a ConneXt one */
+static void sta2x11_setup_pdev(struct pci_dev *pdev)
+{
+ struct sta2x11_instance *instance = sta2x11_pdev_to_instance(pdev);
+
+ if (!instance) /* either a sta2x11 bridge or another ST device */
+ return;
+ pci_set_consistent_dma_mask(pdev, STA2X11_AMBA_SIZE - 1);
+ pci_set_dma_mask(pdev, STA2X11_AMBA_SIZE - 1);
+ pdev->dev.dma_ops = &swiotlb_dma_ops;
+ pdev->dev.archdata.is_sta2x11 = true;
+
+ /* We must enable all devices as master, for audio DMA to work */
+ pci_set_master(pdev);
+}
+DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, sta2x11_setup_pdev);
+
+/*
+ * The following three functions are exported (used in swiotlb: FIXME)
+ */
+/**
+ * dma_capable - Check if device can manage DMA transfers (FIXME: kill it)
+ * @dev: device for a PCI device
+ * @addr: DMA address
+ * @size: DMA size
+ */
+bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
+{
+ struct sta2x11_mapping *map;
+
+ if (!dev->archdata.is_sta2x11) {
+ if (!dev->dma_mask)
+ return false;
+ return addr + size - 1 <= *dev->dma_mask;
+ }
+
+ map = sta2x11_pdev_to_mapping(to_pci_dev(dev));
+
+ if (!map || (addr < map->amba_base))
+ return false;
+ if (addr + size >= map->amba_base + STA2X11_AMBA_SIZE) {
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * __phys_to_dma - Return the DMA AMBA address used for this STA2x11 device
+ * @dev: device for a PCI device
+ * @paddr: Physical address
+ */
+dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr)
+{
+ if (!dev->archdata.is_sta2x11)
+ return paddr;
+ return p2a(paddr, to_pci_dev(dev));
+}
+
+/**
+ * dma_to_phys - Return the physical address used for this STA2x11 DMA address
+ * @dev: device for a PCI device
+ * @daddr: STA2x11 AMBA DMA address
+ */
+phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t daddr)
+{
+ if (!dev->archdata.is_sta2x11)
+ return daddr;
+ return a2p(daddr, to_pci_dev(dev));
+}
+
+
+/*
+ * At boot we must set up the mappings for the pcie-to-amba bridge.
+ * It involves device access, and the same happens at suspend/resume time
+ */
+
+#define AHB_MAPB 0xCA4
+#define AHB_CRW(i) (AHB_MAPB + 0 + (i) * 0x10)
+#define AHB_CRW_SZMASK 0xfffffc00UL
+#define AHB_CRW_ENABLE (1 << 0)
+#define AHB_CRW_WTYPE_MEM (2 << 1)
+#define AHB_CRW_ROE (1UL << 3) /* Relax Order Ena */
+#define AHB_CRW_NSE (1UL << 4) /* No Snoop Enable */
+#define AHB_BASE(i) (AHB_MAPB + 4 + (i) * 0x10)
+#define AHB_PEXLBASE(i) (AHB_MAPB + 8 + (i) * 0x10)
+#define AHB_PEXHBASE(i) (AHB_MAPB + 12 + (i) * 0x10)
+
+/* At probe time, enable mapping for each endpoint, using the pdev */
+static void sta2x11_map_ep(struct pci_dev *pdev)
+{
+ struct sta2x11_mapping *map = sta2x11_pdev_to_mapping(pdev);
+ int i;
+
+ if (!map)
+ return;
+ pci_read_config_dword(pdev, AHB_BASE(0), &map->amba_base);
+
+ /* Configure AHB mapping */
+ pci_write_config_dword(pdev, AHB_PEXLBASE(0), 0);
+ pci_write_config_dword(pdev, AHB_PEXHBASE(0), 0);
+ pci_write_config_dword(pdev, AHB_CRW(0), STA2X11_AMBA_SIZE |
+ AHB_CRW_WTYPE_MEM | AHB_CRW_ENABLE);
+
+ /* Disable all the other windows */
+ for (i = 1; i < STA2X11_NR_FUNCS; i++)
+ pci_write_config_dword(pdev, AHB_CRW(i), 0);
+
+ dev_info(&pdev->dev,
+ "sta2x11: Map EP %i: AMBA address %#8x-%#8x\n",
+ sta2x11_pdev_to_ep(pdev), map->amba_base,
+ map->amba_base + STA2X11_AMBA_SIZE - 1);
+}
+DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, sta2x11_map_ep);
+
+#ifdef CONFIG_PM /* Some register values must be saved and restored */
+
+static void suspend_mapping(struct pci_dev *pdev)
+{
+ struct sta2x11_mapping *map = sta2x11_pdev_to_mapping(pdev);
+ int i;
+
+ if (!map)
+ return;
+
+ if (map->is_suspended)
+ return;
+ map->is_suspended = 1;
+
+ /* Save all window configs */
+ for (i = 0; i < STA2X11_NR_FUNCS; i++) {
+ struct sta2x11_ahb_regs *regs = map->regs + i;
+
+ pci_read_config_dword(pdev, AHB_BASE(i), &regs->base);
+ pci_read_config_dword(pdev, AHB_PEXLBASE(i), &regs->pexlbase);
+ pci_read_config_dword(pdev, AHB_PEXHBASE(i), &regs->pexhbase);
+ pci_read_config_dword(pdev, AHB_CRW(i), &regs->crw);
+ }
+}
+DECLARE_PCI_FIXUP_SUSPEND(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, suspend_mapping);
+
+static void resume_mapping(struct pci_dev *pdev)
+{
+ struct sta2x11_mapping *map = sta2x11_pdev_to_mapping(pdev);
+ int i;
+
+ if (!map)
+ return;
+
+
+ if (!map->is_suspended)
+ goto out;
+ map->is_suspended = 0;
+
+ /* Restore all window configs */
+ for (i = 0; i < STA2X11_NR_FUNCS; i++) {
+ struct sta2x11_ahb_regs *regs = map->regs + i;
+
+ pci_write_config_dword(pdev, AHB_BASE(i), regs->base);
+ pci_write_config_dword(pdev, AHB_PEXLBASE(i), regs->pexlbase);
+ pci_write_config_dword(pdev, AHB_PEXHBASE(i), regs->pexhbase);
+ pci_write_config_dword(pdev, AHB_CRW(i), regs->crw);
+ }
+out:
+ pci_set_master(pdev); /* Like at boot, enable master on all devices */
+}
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, resume_mapping);
+
+#endif /* CONFIG_PM */
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
new file mode 100644
index 000000000..bacf8d988
--- /dev/null
+++ b/arch/x86/pci/xen.c
@@ -0,0 +1,582 @@
+/*
+ * Xen PCI - handle PCI (INTx) and MSI infrastructure calls for PV, HVM and
+ * initial domain support. We also handle the DSDT _PRT callbacks for GSI's
+ * used in HVM and initial domain mode (PV does not parse ACPI, so it has no
+ * concept of GSIs). Under PV we hook under the pnbbios API for IRQs and
+ * 0xcf8 PCI configuration read/write.
+ *
+ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ * Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ * Stefano Stabellini <stefano.stabellini@eu.citrix.com>
+ */
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/acpi.h>
+
+#include <linux/io.h>
+#include <asm/io_apic.h>
+#include <asm/pci_x86.h>
+
+#include <asm/xen/hypervisor.h>
+
+#include <xen/features.h>
+#include <xen/events.h>
+#include <asm/xen/pci.h>
+#include <asm/xen/cpuid.h>
+#include <asm/apic.h>
+#include <asm/acpi.h>
+#include <asm/i8259.h>
+
+static int xen_pcifront_enable_irq(struct pci_dev *dev)
+{
+ int rc;
+ int share = 1;
+ int pirq;
+ u8 gsi;
+
+ rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
+ if (rc < 0) {
+ dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n",
+ rc);
+ return rc;
+ }
+ /* In PV DomU the Xen PCI backend puts the PIRQ in the interrupt line.*/
+ pirq = gsi;
+
+ if (gsi < nr_legacy_irqs())
+ share = 0;
+
+ rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront");
+ if (rc < 0) {
+ dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n",
+ gsi, pirq, rc);
+ return rc;
+ }
+
+ dev->irq = rc;
+ dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq);
+ return 0;
+}
+
+#ifdef CONFIG_ACPI
+static int xen_register_pirq(u32 gsi, int gsi_override, int triggering,
+ bool set_pirq)
+{
+ int rc, pirq = -1, irq = -1;
+ struct physdev_map_pirq map_irq;
+ int shareable = 0;
+ char *name;
+
+ irq = xen_irq_from_gsi(gsi);
+ if (irq > 0)
+ return irq;
+
+ if (set_pirq)
+ pirq = gsi;
+
+ map_irq.domid = DOMID_SELF;
+ map_irq.type = MAP_PIRQ_TYPE_GSI;
+ map_irq.index = gsi;
+ map_irq.pirq = pirq;
+
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
+ if (rc) {
+ printk(KERN_WARNING "xen map irq failed %d\n", rc);
+ return -1;
+ }
+
+ if (triggering == ACPI_EDGE_SENSITIVE) {
+ shareable = 0;
+ name = "ioapic-edge";
+ } else {
+ shareable = 1;
+ name = "ioapic-level";
+ }
+
+ if (gsi_override >= 0)
+ gsi = gsi_override;
+
+ irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name);
+ if (irq < 0)
+ goto out;
+
+ printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d (gsi=%d)\n", map_irq.pirq, irq, gsi);
+out:
+ return irq;
+}
+
+static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
+ int trigger, int polarity)
+{
+ if (!xen_hvm_domain())
+ return -1;
+
+ return xen_register_pirq(gsi, -1 /* no GSI override */, trigger,
+ false /* no mapping of GSI to PIRQ */);
+}
+
+#ifdef CONFIG_XEN_DOM0
+static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polarity)
+{
+ int rc, irq;
+ struct physdev_setup_gsi setup_gsi;
+
+ if (!xen_pv_domain())
+ return -1;
+
+ printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n",
+ gsi, triggering, polarity);
+
+ irq = xen_register_pirq(gsi, gsi_override, triggering, true);
+
+ setup_gsi.gsi = gsi;
+ setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1);
+ setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
+ if (rc == -EEXIST)
+ printk(KERN_INFO "Already setup the GSI :%d\n", gsi);
+ else if (rc) {
+ printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n",
+ gsi, rc);
+ }
+
+ return irq;
+}
+
+static int acpi_register_gsi_xen(struct device *dev, u32 gsi,
+ int trigger, int polarity)
+{
+ return xen_register_gsi(gsi, -1 /* no GSI override */, trigger, polarity);
+}
+#endif
+#endif
+
+#if defined(CONFIG_PCI_MSI)
+#include <linux/msi.h>
+#include <asm/msidef.h>
+
+struct xen_pci_frontend_ops *xen_pci_frontend;
+EXPORT_SYMBOL_GPL(xen_pci_frontend);
+
+static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ int irq, ret, i;
+ struct msi_desc *msidesc;
+ int *v;
+
+ if (type == PCI_CAP_ID_MSI && nvec > 1)
+ return 1;
+
+ v = kcalloc(max(1, nvec), sizeof(int), GFP_KERNEL);
+ if (!v)
+ return -ENOMEM;
+
+ if (type == PCI_CAP_ID_MSIX)
+ ret = xen_pci_frontend_enable_msix(dev, v, nvec);
+ else
+ ret = xen_pci_frontend_enable_msi(dev, v);
+ if (ret)
+ goto error;
+ i = 0;
+ for_each_pci_msi_entry(msidesc, dev) {
+ irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i],
+ (type == PCI_CAP_ID_MSI) ? nvec : 1,
+ (type == PCI_CAP_ID_MSIX) ?
+ "pcifront-msi-x" :
+ "pcifront-msi",
+ DOMID_SELF);
+ if (irq < 0) {
+ ret = irq;
+ goto free;
+ }
+ i++;
+ }
+ kfree(v);
+ return 0;
+
+error:
+ if (ret == -ENOSYS)
+ dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
+ else if (ret)
+ dev_err(&dev->dev, "Xen PCI frontend error: %d!\n", ret);
+free:
+ kfree(v);
+ return ret;
+}
+
+#define XEN_PIRQ_MSI_DATA (MSI_DATA_TRIGGER_EDGE | \
+ MSI_DATA_LEVEL_ASSERT | (3 << 8) | MSI_DATA_VECTOR(0))
+
+static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq,
+ struct msi_msg *msg)
+{
+ /* We set vector == 0 to tell the hypervisor we don't care about it,
+ * but we want a pirq setup instead.
+ * We use the dest_id field to pass the pirq that we want. */
+ msg->address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(pirq);
+ msg->address_lo =
+ MSI_ADDR_BASE_LO |
+ MSI_ADDR_DEST_MODE_PHYSICAL |
+ MSI_ADDR_REDIRECTION_CPU |
+ MSI_ADDR_DEST_ID(pirq);
+
+ msg->data = XEN_PIRQ_MSI_DATA;
+}
+
+static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ int irq, pirq;
+ struct msi_desc *msidesc;
+ struct msi_msg msg;
+
+ if (type == PCI_CAP_ID_MSI && nvec > 1)
+ return 1;
+
+ for_each_pci_msi_entry(msidesc, dev) {
+ pirq = xen_allocate_pirq_msi(dev, msidesc);
+ if (pirq < 0) {
+ irq = -ENODEV;
+ goto error;
+ }
+ xen_msi_compose_msg(dev, pirq, &msg);
+ __pci_write_msi_msg(msidesc, &msg);
+ dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq);
+ irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq,
+ (type == PCI_CAP_ID_MSI) ? nvec : 1,
+ (type == PCI_CAP_ID_MSIX) ?
+ "msi-x" : "msi",
+ DOMID_SELF);
+ if (irq < 0)
+ goto error;
+ dev_dbg(&dev->dev,
+ "xen: msi --> pirq=%d --> irq=%d\n", pirq, irq);
+ }
+ return 0;
+
+error:
+ dev_err(&dev->dev, "Failed to create MSI%s! ret=%d!\n",
+ type == PCI_CAP_ID_MSI ? "" : "-X", irq);
+ return irq;
+}
+
+#ifdef CONFIG_XEN_DOM0
+static bool __read_mostly pci_seg_supported = true;
+
+static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ int ret = 0;
+ struct msi_desc *msidesc;
+
+ for_each_pci_msi_entry(msidesc, dev) {
+ struct physdev_map_pirq map_irq;
+ domid_t domid;
+
+ domid = ret = xen_find_device_domain_owner(dev);
+ /* N.B. Casting int's -ENODEV to uint16_t results in 0xFFED,
+ * hence check ret value for < 0. */
+ if (ret < 0)
+ domid = DOMID_SELF;
+
+ memset(&map_irq, 0, sizeof(map_irq));
+ map_irq.domid = domid;
+ map_irq.type = MAP_PIRQ_TYPE_MSI_SEG;
+ map_irq.index = -1;
+ map_irq.pirq = -1;
+ map_irq.bus = dev->bus->number |
+ (pci_domain_nr(dev->bus) << 16);
+ map_irq.devfn = dev->devfn;
+
+ if (type == PCI_CAP_ID_MSI && nvec > 1) {
+ map_irq.type = MAP_PIRQ_TYPE_MULTI_MSI;
+ map_irq.entry_nr = nvec;
+ } else if (type == PCI_CAP_ID_MSIX) {
+ int pos;
+ unsigned long flags;
+ u32 table_offset, bir;
+
+ pos = dev->msix_cap;
+ pci_read_config_dword(dev, pos + PCI_MSIX_TABLE,
+ &table_offset);
+ bir = (u8)(table_offset & PCI_MSIX_TABLE_BIR);
+ flags = pci_resource_flags(dev, bir);
+ if (!flags || (flags & IORESOURCE_UNSET))
+ return -EINVAL;
+
+ map_irq.table_base = pci_resource_start(dev, bir);
+ map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
+ }
+
+ ret = -EINVAL;
+ if (pci_seg_supported)
+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq,
+ &map_irq);
+ if (type == PCI_CAP_ID_MSI && nvec > 1 && ret) {
+ /*
+ * If MAP_PIRQ_TYPE_MULTI_MSI is not available
+ * there's nothing else we can do in this case.
+ * Just set ret > 0 so driver can retry with
+ * single MSI.
+ */
+ ret = 1;
+ goto out;
+ }
+ if (ret == -EINVAL && !pci_domain_nr(dev->bus)) {
+ map_irq.type = MAP_PIRQ_TYPE_MSI;
+ map_irq.index = -1;
+ map_irq.pirq = -1;
+ map_irq.bus = dev->bus->number;
+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq,
+ &map_irq);
+ if (ret != -EINVAL)
+ pci_seg_supported = false;
+ }
+ if (ret) {
+ dev_warn(&dev->dev, "xen map irq failed %d for %d domain\n",
+ ret, domid);
+ goto out;
+ }
+
+ ret = xen_bind_pirq_msi_to_irq(dev, msidesc, map_irq.pirq,
+ (type == PCI_CAP_ID_MSI) ? nvec : 1,
+ (type == PCI_CAP_ID_MSIX) ? "msi-x" : "msi",
+ domid);
+ if (ret < 0)
+ goto out;
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+static void xen_initdom_restore_msi_irqs(struct pci_dev *dev)
+{
+ int ret = 0;
+
+ if (pci_seg_supported) {
+ struct physdev_pci_device restore_ext;
+
+ restore_ext.seg = pci_domain_nr(dev->bus);
+ restore_ext.bus = dev->bus->number;
+ restore_ext.devfn = dev->devfn;
+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi_ext,
+ &restore_ext);
+ if (ret == -ENOSYS)
+ pci_seg_supported = false;
+ WARN(ret && ret != -ENOSYS, "restore_msi_ext -> %d\n", ret);
+ }
+ if (!pci_seg_supported) {
+ struct physdev_restore_msi restore;
+
+ restore.bus = dev->bus->number;
+ restore.devfn = dev->devfn;
+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &restore);
+ WARN(ret && ret != -ENOSYS, "restore_msi -> %d\n", ret);
+ }
+}
+#endif
+
+static void xen_teardown_msi_irqs(struct pci_dev *dev)
+{
+ struct msi_desc *msidesc;
+
+ msidesc = first_pci_msi_entry(dev);
+ if (msidesc->msi_attrib.is_msix)
+ xen_pci_frontend_disable_msix(dev);
+ else
+ xen_pci_frontend_disable_msi(dev);
+
+ /* Free the IRQ's and the msidesc using the generic code. */
+ default_teardown_msi_irqs(dev);
+}
+
+static void xen_teardown_msi_irq(unsigned int irq)
+{
+ xen_destroy_irq(irq);
+}
+
+#endif
+
+int __init pci_xen_init(void)
+{
+ if (!xen_pv_domain() || xen_initial_domain())
+ return -ENODEV;
+
+ printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n");
+
+ pcibios_set_cache_line_size();
+
+ pcibios_enable_irq = xen_pcifront_enable_irq;
+ pcibios_disable_irq = NULL;
+
+ /* Keep ACPI out of the picture */
+ acpi_noirq_set();
+
+#ifdef CONFIG_PCI_MSI
+ x86_msi.setup_msi_irqs = xen_setup_msi_irqs;
+ x86_msi.teardown_msi_irq = xen_teardown_msi_irq;
+ x86_msi.teardown_msi_irqs = xen_teardown_msi_irqs;
+ pci_msi_ignore_mask = 1;
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_PCI_MSI
+void __init xen_msi_init(void)
+{
+ if (!disable_apic) {
+ /*
+ * If hardware supports (x2)APIC virtualization (as indicated
+ * by hypervisor's leaf 4) then we don't need to use pirqs/
+ * event channels for MSI handling and instead use regular
+ * APIC processing
+ */
+ uint32_t eax = cpuid_eax(xen_cpuid_base() + 4);
+
+ if (((eax & XEN_HVM_CPUID_X2APIC_VIRT) && x2apic_mode) ||
+ ((eax & XEN_HVM_CPUID_APIC_ACCESS_VIRT) && boot_cpu_has(X86_FEATURE_APIC)))
+ return;
+ }
+
+ x86_msi.setup_msi_irqs = xen_hvm_setup_msi_irqs;
+ x86_msi.teardown_msi_irq = xen_teardown_msi_irq;
+ /*
+ * With XEN PIRQ/Eventchannels in use PCI/MSI[-X] masking is solely
+ * controlled by the hypervisor.
+ */
+ pci_msi_ignore_mask = 1;
+}
+#endif
+
+int __init pci_xen_hvm_init(void)
+{
+ if (!xen_have_vector_callback || !xen_feature(XENFEAT_hvm_pirqs))
+ return 0;
+
+#ifdef CONFIG_ACPI
+ /*
+ * We don't want to change the actual ACPI delivery model,
+ * just how GSIs get registered.
+ */
+ __acpi_register_gsi = acpi_register_gsi_xen_hvm;
+ __acpi_unregister_gsi = NULL;
+#endif
+
+#ifdef CONFIG_PCI_MSI
+ /*
+ * We need to wait until after x2apic is initialized
+ * before we can set MSI IRQ ops.
+ */
+ x86_platform.apic_post_init = xen_msi_init;
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_XEN_DOM0
+int __init pci_xen_initial_domain(void)
+{
+ int irq;
+
+#ifdef CONFIG_PCI_MSI
+ x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs;
+ x86_msi.teardown_msi_irq = xen_teardown_msi_irq;
+ x86_msi.restore_msi_irqs = xen_initdom_restore_msi_irqs;
+ pci_msi_ignore_mask = 1;
+#endif
+ __acpi_register_gsi = acpi_register_gsi_xen;
+ __acpi_unregister_gsi = NULL;
+ /*
+ * Pre-allocate the legacy IRQs. Use NR_LEGACY_IRQS here
+ * because we don't have a PIC and thus nr_legacy_irqs() is zero.
+ */
+ for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
+ int trigger, polarity;
+
+ if (acpi_get_override_irq(irq, &trigger, &polarity) == -1)
+ continue;
+
+ xen_register_pirq(irq, -1 /* no GSI override */,
+ trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE,
+ true /* Map GSI to PIRQ */);
+ }
+ if (0 == nr_ioapics) {
+ for (irq = 0; irq < nr_legacy_irqs(); irq++)
+ xen_bind_pirq_gsi_to_irq(irq, irq, 0, "xt-pic");
+ }
+ return 0;
+}
+
+struct xen_device_domain_owner {
+ domid_t domain;
+ struct pci_dev *dev;
+ struct list_head list;
+};
+
+static DEFINE_SPINLOCK(dev_domain_list_spinlock);
+static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list);
+
+static struct xen_device_domain_owner *find_device(struct pci_dev *dev)
+{
+ struct xen_device_domain_owner *owner;
+
+ list_for_each_entry(owner, &dev_domain_list, list) {
+ if (owner->dev == dev)
+ return owner;
+ }
+ return NULL;
+}
+
+int xen_find_device_domain_owner(struct pci_dev *dev)
+{
+ struct xen_device_domain_owner *owner;
+ int domain = -ENODEV;
+
+ spin_lock(&dev_domain_list_spinlock);
+ owner = find_device(dev);
+ if (owner)
+ domain = owner->domain;
+ spin_unlock(&dev_domain_list_spinlock);
+ return domain;
+}
+EXPORT_SYMBOL_GPL(xen_find_device_domain_owner);
+
+int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain)
+{
+ struct xen_device_domain_owner *owner;
+
+ owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL);
+ if (!owner)
+ return -ENODEV;
+
+ spin_lock(&dev_domain_list_spinlock);
+ if (find_device(dev)) {
+ spin_unlock(&dev_domain_list_spinlock);
+ kfree(owner);
+ return -EEXIST;
+ }
+ owner->domain = domain;
+ owner->dev = dev;
+ list_add_tail(&owner->list, &dev_domain_list);
+ spin_unlock(&dev_domain_list_spinlock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xen_register_device_domain_owner);
+
+int xen_unregister_device_domain_owner(struct pci_dev *dev)
+{
+ struct xen_device_domain_owner *owner;
+
+ spin_lock(&dev_domain_list_spinlock);
+ owner = find_device(dev);
+ if (!owner) {
+ spin_unlock(&dev_domain_list_spinlock);
+ return -ENODEV;
+ }
+ list_del(&owner->list);
+ spin_unlock(&dev_domain_list_spinlock);
+ kfree(owner);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner);
+#endif
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
new file mode 100644
index 000000000..d0e835470
--- /dev/null
+++ b/arch/x86/platform/Makefile
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
+# Platform specific code goes here
+obj-y += atom/
+obj-y += ce4100/
+obj-y += efi/
+obj-y += geode/
+obj-y += goldfish/
+obj-y += iris/
+obj-y += intel/
+obj-y += intel-mid/
+obj-y += intel-quark/
+obj-y += olpc/
+obj-y += scx200/
+obj-y += sfi/
+obj-y += ts5500/
+obj-y += uv/
diff --git a/arch/x86/platform/atom/Makefile b/arch/x86/platform/atom/Makefile
new file mode 100644
index 000000000..57be88fa3
--- /dev/null
+++ b/arch/x86/platform/atom/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_PUNIT_ATOM_DEBUG) += punit_atom_debug.o
diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c
new file mode 100644
index 000000000..41dae0f0d
--- /dev/null
+++ b/arch/x86/platform/atom/punit_atom_debug.c
@@ -0,0 +1,181 @@
+/*
+ * Intel SOC Punit device state debug driver
+ * Punit controls power management for North Complex devices (Graphics
+ * blocks, Image Signal Processing, video processing, display, DSP etc.)
+ *
+ * Copyright (c) 2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/io.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include <asm/iosf_mbi.h>
+
+/* Subsystem config/status Video processor */
+#define VED_SS_PM0 0x32
+/* Subsystem config/status ISP (Image Signal Processor) */
+#define ISP_SS_PM0 0x39
+/* Subsystem config/status Input/output controller */
+#define MIO_SS_PM 0x3B
+/* Shift bits for getting status for video, isp and i/o */
+#define SSS_SHIFT 24
+
+/* Power gate status reg */
+#define PWRGT_STATUS 0x61
+/* Shift bits for getting status for graphics rendering */
+#define RENDER_POS 0
+/* Shift bits for getting status for media control */
+#define MEDIA_POS 2
+/* Shift bits for getting status for Valley View/Baytrail display */
+#define VLV_DISPLAY_POS 6
+
+/* Subsystem config/status display for Cherry Trail SOC */
+#define CHT_DSP_SSS 0x36
+/* Shift bits for getting status for display */
+#define CHT_DSP_SSS_POS 16
+
+struct punit_device {
+ char *name;
+ int reg;
+ int sss_pos;
+};
+
+static const struct punit_device punit_device_tng[] = {
+ { "DISPLAY", CHT_DSP_SSS, SSS_SHIFT },
+ { "VED", VED_SS_PM0, SSS_SHIFT },
+ { "ISP", ISP_SS_PM0, SSS_SHIFT },
+ { "MIO", MIO_SS_PM, SSS_SHIFT },
+ { NULL }
+};
+
+static const struct punit_device punit_device_byt[] = {
+ { "GFX RENDER", PWRGT_STATUS, RENDER_POS },
+ { "GFX MEDIA", PWRGT_STATUS, MEDIA_POS },
+ { "DISPLAY", PWRGT_STATUS, VLV_DISPLAY_POS },
+ { "VED", VED_SS_PM0, SSS_SHIFT },
+ { "ISP", ISP_SS_PM0, SSS_SHIFT },
+ { "MIO", MIO_SS_PM, SSS_SHIFT },
+ { NULL }
+};
+
+static const struct punit_device punit_device_cht[] = {
+ { "GFX RENDER", PWRGT_STATUS, RENDER_POS },
+ { "GFX MEDIA", PWRGT_STATUS, MEDIA_POS },
+ { "DISPLAY", CHT_DSP_SSS, CHT_DSP_SSS_POS },
+ { "VED", VED_SS_PM0, SSS_SHIFT },
+ { "ISP", ISP_SS_PM0, SSS_SHIFT },
+ { "MIO", MIO_SS_PM, SSS_SHIFT },
+ { NULL }
+};
+
+static const char * const dstates[] = {"D0", "D0i1", "D0i2", "D0i3"};
+
+static int punit_dev_state_show(struct seq_file *seq_file, void *unused)
+{
+ u32 punit_pwr_status;
+ struct punit_device *punit_devp = seq_file->private;
+ int index;
+ int status;
+
+ seq_puts(seq_file, "\n\nPUNIT NORTH COMPLEX DEVICES :\n");
+ while (punit_devp->name) {
+ status = iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_REG_READ,
+ punit_devp->reg, &punit_pwr_status);
+ if (status) {
+ seq_printf(seq_file, "%9s : Read Failed\n",
+ punit_devp->name);
+ } else {
+ index = (punit_pwr_status >> punit_devp->sss_pos) & 3;
+ seq_printf(seq_file, "%9s : %s\n", punit_devp->name,
+ dstates[index]);
+ }
+ punit_devp++;
+ }
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(punit_dev_state);
+
+static struct dentry *punit_dbg_file;
+
+static int punit_dbgfs_register(struct punit_device *punit_device)
+{
+ static struct dentry *dev_state;
+
+ punit_dbg_file = debugfs_create_dir("punit_atom", NULL);
+ if (!punit_dbg_file)
+ return -ENXIO;
+
+ dev_state = debugfs_create_file("dev_power_state", 0444,
+ punit_dbg_file, punit_device,
+ &punit_dev_state_fops);
+ if (!dev_state) {
+ pr_err("punit_dev_state register failed\n");
+ debugfs_remove(punit_dbg_file);
+ return -ENXIO;
+ }
+
+ return 0;
+}
+
+static void punit_dbgfs_unregister(void)
+{
+ debugfs_remove_recursive(punit_dbg_file);
+}
+
+#define ICPU(model, drv_data) \
+ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT,\
+ (kernel_ulong_t)&drv_data }
+
+static const struct x86_cpu_id intel_punit_cpu_ids[] = {
+ ICPU(INTEL_FAM6_ATOM_SILVERMONT, punit_device_byt),
+ ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, punit_device_tng),
+ ICPU(INTEL_FAM6_ATOM_AIRMONT, punit_device_cht),
+ {}
+};
+
+MODULE_DEVICE_TABLE(x86cpu, intel_punit_cpu_ids);
+
+static int __init punit_atom_debug_init(void)
+{
+ const struct x86_cpu_id *id;
+ int ret;
+
+ id = x86_match_cpu(intel_punit_cpu_ids);
+ if (!id)
+ return -ENODEV;
+
+ ret = punit_dbgfs_register((struct punit_device *)id->driver_data);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+static void __exit punit_atom_debug_exit(void)
+{
+ punit_dbgfs_unregister();
+}
+
+module_init(punit_atom_debug_init);
+module_exit(punit_atom_debug_exit);
+
+MODULE_AUTHOR("Kumar P, Mahesh <mahesh.kumar.p@intel.com>");
+MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
+MODULE_DESCRIPTION("Driver for Punit devices states debugging");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/platform/ce4100/Makefile b/arch/x86/platform/ce4100/Makefile
new file mode 100644
index 000000000..91fc92971
--- /dev/null
+++ b/arch/x86/platform/ce4100/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_X86_INTEL_CE) += ce4100.o
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
new file mode 100644
index 000000000..ce4b06733
--- /dev/null
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -0,0 +1,160 @@
+/*
+ * Intel CE4100 platform specific setup code
+ *
+ * (C) Copyright 2010 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/reboot.h>
+#include <linux/serial_reg.h>
+#include <linux/serial_8250.h>
+
+#include <asm/ce4100.h>
+#include <asm/prom.h>
+#include <asm/setup.h>
+#include <asm/i8259.h>
+#include <asm/io.h>
+#include <asm/io_apic.h>
+#include <asm/emergency-restart.h>
+
+/*
+ * The CE4100 platform has an internal 8051 Microcontroller which is
+ * responsible for signaling to the external Power Management Unit the
+ * intention to reset, reboot or power off the system. This 8051 device has
+ * its command register mapped at I/O port 0xcf9 and the value 0x4 is used
+ * to power off the system.
+ */
+static void ce4100_power_off(void)
+{
+ outb(0x4, 0xcf9);
+}
+
+#ifdef CONFIG_SERIAL_8250
+
+static unsigned int mem_serial_in(struct uart_port *p, int offset)
+{
+ offset = offset << p->regshift;
+ return readl(p->membase + offset);
+}
+
+/*
+ * The UART Tx interrupts are not set under some conditions and therefore serial
+ * transmission hangs. This is a silicon issue and has not been root caused. The
+ * workaround for this silicon issue checks UART_LSR_THRE bit and UART_LSR_TEMT
+ * bit of LSR register in interrupt handler to see whether at least one of these
+ * two bits is set, if so then process the transmit request. If this workaround
+ * is not applied, then the serial transmission may hang. This workaround is for
+ * errata number 9 in Errata - B step.
+*/
+
+static unsigned int ce4100_mem_serial_in(struct uart_port *p, int offset)
+{
+ unsigned int ret, ier, lsr;
+
+ if (offset == UART_IIR) {
+ offset = offset << p->regshift;
+ ret = readl(p->membase + offset);
+ if (ret & UART_IIR_NO_INT) {
+ /* see if the TX interrupt should have really set */
+ ier = mem_serial_in(p, UART_IER);
+ /* see if the UART's XMIT interrupt is enabled */
+ if (ier & UART_IER_THRI) {
+ lsr = mem_serial_in(p, UART_LSR);
+ /* now check to see if the UART should be
+ generating an interrupt (but isn't) */
+ if (lsr & (UART_LSR_THRE | UART_LSR_TEMT))
+ ret &= ~UART_IIR_NO_INT;
+ }
+ }
+ } else
+ ret = mem_serial_in(p, offset);
+ return ret;
+}
+
+static void ce4100_mem_serial_out(struct uart_port *p, int offset, int value)
+{
+ offset = offset << p->regshift;
+ writel(value, p->membase + offset);
+}
+
+static void ce4100_serial_fixup(int port, struct uart_port *up,
+ u32 *capabilites)
+{
+#ifdef CONFIG_EARLY_PRINTK
+ /*
+ * Over ride the legacy port configuration that comes from
+ * asm/serial.h. Using the ioport driver then switching to the
+ * PCI memmaped driver hangs the IOAPIC
+ */
+ if (up->iotype != UPIO_MEM32) {
+ up->uartclk = 14745600;
+ up->mapbase = 0xdffe0200;
+ set_fixmap_nocache(FIX_EARLYCON_MEM_BASE,
+ up->mapbase & PAGE_MASK);
+ up->membase =
+ (void __iomem *)__fix_to_virt(FIX_EARLYCON_MEM_BASE);
+ up->membase += up->mapbase & ~PAGE_MASK;
+ up->mapbase += port * 0x100;
+ up->membase += port * 0x100;
+ up->iotype = UPIO_MEM32;
+ up->regshift = 2;
+ up->irq = 4;
+ }
+#endif
+ up->iobase = 0;
+ up->serial_in = ce4100_mem_serial_in;
+ up->serial_out = ce4100_mem_serial_out;
+
+ *capabilites |= (1 << 12);
+}
+
+static __init void sdv_serial_fixup(void)
+{
+ serial8250_set_isa_configurator(ce4100_serial_fixup);
+}
+
+#else
+static inline void sdv_serial_fixup(void) {};
+#endif
+
+static void __init sdv_arch_setup(void)
+{
+ sdv_serial_fixup();
+}
+
+static void sdv_pci_init(void)
+{
+ x86_of_pci_init();
+}
+
+/*
+ * CE4100 specific x86_init function overrides and early setup
+ * calls.
+ */
+void __init x86_ce4100_early_setup(void)
+{
+ x86_init.oem.arch_setup = sdv_arch_setup;
+ x86_init.resources.probe_roms = x86_init_noop;
+ x86_init.mpparse.get_smp_config = x86_init_uint_noop;
+ x86_init.mpparse.find_smp_config = x86_init_noop;
+ x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck;
+ x86_init.pci.init = ce4100_pci_init;
+ x86_init.pci.init_irq = sdv_pci_init;
+
+ /*
+ * By default, the reboot method is ACPI which is supported by the
+ * CE4100 bootloader CEFDK using FADT.ResetReg Address and ResetValue
+ * the bootloader will however issue a system power off instead of
+ * reboot. By using BOOT_KBD we ensure proper system reboot as
+ * expected.
+ */
+ reboot_type = BOOT_KBD;
+
+ pm_power_off = ce4100_power_off;
+}
diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts
new file mode 100644
index 000000000..ce874f872
--- /dev/null
+++ b/arch/x86/platform/ce4100/falconfalls.dts
@@ -0,0 +1,433 @@
+/*
+ * CE4100 on Falcon Falls
+ *
+ * (c) Copyright 2010 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License.
+ */
+/dts-v1/;
+/ {
+ model = "intel,falconfalls";
+ compatible = "intel,falconfalls";
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ cpus {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ cpu@0 {
+ device_type = "cpu";
+ compatible = "intel,ce4100";
+ reg = <0>;
+ lapic = <&lapic0>;
+ };
+ };
+
+ soc@0 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ compatible = "intel,ce4100-cp";
+ ranges;
+
+ ioapic1: interrupt-controller@fec00000 {
+ #interrupt-cells = <2>;
+ compatible = "intel,ce4100-ioapic";
+ interrupt-controller;
+ reg = <0xfec00000 0x1000>;
+ };
+
+ timer@fed00000 {
+ compatible = "intel,ce4100-hpet";
+ reg = <0xfed00000 0x200>;
+ };
+
+ lapic0: interrupt-controller@fee00000 {
+ compatible = "intel,ce4100-lapic";
+ reg = <0xfee00000 0x1000>;
+ };
+
+ pci@3fc {
+ #address-cells = <3>;
+ #size-cells = <2>;
+ compatible = "intel,ce4100-pci", "pci";
+ device_type = "pci";
+ bus-range = <0 0>;
+ ranges = <0x2000000 0 0xbffff000 0xbffff000 0 0x1000
+ 0x2000000 0 0xdffe0000 0xdffe0000 0 0x1000
+ 0x0000000 0 0x0 0x0 0 0x100>;
+
+ /* Secondary IO-APIC */
+ ioapic2: interrupt-controller@0,1 {
+ #interrupt-cells = <2>;
+ compatible = "intel,ce4100-ioapic";
+ interrupt-controller;
+ reg = <0x100 0x0 0x0 0x0 0x0>;
+ assigned-addresses = <0x02000000 0x0 0xbffff000 0x0 0x1000>;
+ };
+
+ pci@1,0 {
+ #address-cells = <3>;
+ #size-cells = <2>;
+ compatible = "intel,ce4100-pci", "pci";
+ device_type = "pci";
+ bus-range = <1 1>;
+ reg = <0x0800 0x0 0x0 0x0 0x0>;
+ ranges = <0x2000000 0 0xdffe0000 0x2000000 0 0xdffe0000 0 0x1000>;
+
+ interrupt-parent = <&ioapic2>;
+
+ display@2,0 {
+ compatible = "pci8086,2e5b.2",
+ "pci8086,2e5b",
+ "pciclass038000",
+ "pciclass0380";
+
+ reg = <0x11000 0x0 0x0 0x0 0x0>;
+ interrupts = <0 1>;
+ };
+
+ multimedia@3,0 {
+ compatible = "pci8086,2e5c.2",
+ "pci8086,2e5c",
+ "pciclass048000",
+ "pciclass0480";
+
+ reg = <0x11800 0x0 0x0 0x0 0x0>;
+ interrupts = <2 1>;
+ };
+
+ multimedia@4,0 {
+ compatible = "pci8086,2e5d.2",
+ "pci8086,2e5d",
+ "pciclass048000",
+ "pciclass0480";
+
+ reg = <0x12000 0x0 0x0 0x0 0x0>;
+ interrupts = <4 1>;
+ };
+
+ multimedia@4,1 {
+ compatible = "pci8086,2e5e.2",
+ "pci8086,2e5e",
+ "pciclass048000",
+ "pciclass0480";
+
+ reg = <0x12100 0x0 0x0 0x0 0x0>;
+ interrupts = <5 1>;
+ };
+
+ sound@6,0 {
+ compatible = "pci8086,2e5f.2",
+ "pci8086,2e5f",
+ "pciclass040100",
+ "pciclass0401";
+
+ reg = <0x13000 0x0 0x0 0x0 0x0>;
+ interrupts = <6 1>;
+ };
+
+ sound@6,1 {
+ compatible = "pci8086,2e5f.2",
+ "pci8086,2e5f",
+ "pciclass040100",
+ "pciclass0401";
+
+ reg = <0x13100 0x0 0x0 0x0 0x0>;
+ interrupts = <7 1>;
+ };
+
+ sound@6,2 {
+ compatible = "pci8086,2e60.2",
+ "pci8086,2e60",
+ "pciclass040100",
+ "pciclass0401";
+
+ reg = <0x13200 0x0 0x0 0x0 0x0>;
+ interrupts = <8 1>;
+ };
+
+ display@8,0 {
+ compatible = "pci8086,2e61.2",
+ "pci8086,2e61",
+ "pciclass038000",
+ "pciclass0380";
+
+ reg = <0x14000 0x0 0x0 0x0 0x0>;
+ interrupts = <9 1>;
+ };
+
+ display@8,1 {
+ compatible = "pci8086,2e62.2",
+ "pci8086,2e62",
+ "pciclass038000",
+ "pciclass0380";
+
+ reg = <0x14100 0x0 0x0 0x0 0x0>;
+ interrupts = <10 1>;
+ };
+
+ multimedia@8,2 {
+ compatible = "pci8086,2e63.2",
+ "pci8086,2e63",
+ "pciclass048000",
+ "pciclass0480";
+
+ reg = <0x14200 0x0 0x0 0x0 0x0>;
+ interrupts = <11 1>;
+ };
+
+ entertainment-encryption@9,0 {
+ compatible = "pci8086,2e64.2",
+ "pci8086,2e64",
+ "pciclass101000",
+ "pciclass1010";
+
+ reg = <0x14800 0x0 0x0 0x0 0x0>;
+ interrupts = <12 1>;
+ };
+
+ localbus@a,0 {
+ compatible = "pci8086,2e65.2",
+ "pci8086,2e65",
+ "pciclassff0000",
+ "pciclassff00";
+
+ reg = <0x15000 0x0 0x0 0x0 0x0>;
+ };
+
+ serial@b,0 {
+ compatible = "pci8086,2e66.2",
+ "pci8086,2e66",
+ "pciclass070003",
+ "pciclass0700";
+
+ reg = <0x15800 0x0 0x0 0x0 0x0>;
+ interrupts = <14 1>;
+ };
+
+ pcigpio: gpio@b,1 {
+ #gpio-cells = <2>;
+ #interrupt-cells = <2>;
+ compatible = "pci8086,2e67.2",
+ "pci8086,2e67",
+ "pciclassff0000",
+ "pciclassff00";
+
+ reg = <0x15900 0x0 0x0 0x0 0x0>;
+ interrupts = <15 1>;
+ interrupt-controller;
+ gpio-controller;
+ intel,muxctl = <0>;
+ };
+
+ i2c-controller@b,2 {
+ #address-cells = <2>;
+ #size-cells = <1>;
+ compatible = "pci8086,2e68.2",
+ "pci8086,2e68",
+ "pciclass,ff0000",
+ "pciclass,ff00";
+
+ reg = <0x15a00 0x0 0x0 0x0 0x0>;
+ interrupts = <16 1>;
+ ranges = <0 0 0x02000000 0 0xdffe0500 0x100
+ 1 0 0x02000000 0 0xdffe0600 0x100
+ 2 0 0x02000000 0 0xdffe0700 0x100>;
+
+ i2c@0 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "intel,ce4100-i2c-controller";
+ reg = <0 0 0x100>;
+ };
+
+ i2c@1 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "intel,ce4100-i2c-controller";
+ reg = <1 0 0x100>;
+
+ gpio@26 {
+ #gpio-cells = <2>;
+ compatible = "ti,pcf8575";
+ reg = <0x26>;
+ gpio-controller;
+ };
+ };
+
+ i2c@2 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "intel,ce4100-i2c-controller";
+ reg = <2 0 0x100>;
+
+ gpio@26 {
+ #gpio-cells = <2>;
+ compatible = "ti,pcf8575";
+ reg = <0x26>;
+ gpio-controller;
+ };
+ };
+ };
+
+ smard-card@b,3 {
+ compatible = "pci8086,2e69.2",
+ "pci8086,2e69",
+ "pciclass070500",
+ "pciclass0705";
+
+ reg = <0x15b00 0x0 0x0 0x0 0x0>;
+ interrupts = <15 1>;
+ };
+
+ spi-controller@b,4 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible =
+ "pci8086,2e6a.2",
+ "pci8086,2e6a",
+ "pciclass,ff0000",
+ "pciclass,ff00";
+
+ reg = <0x15c00 0x0 0x0 0x0 0x0>;
+ interrupts = <15 1>;
+
+ dac@0 {
+ compatible = "ti,pcm1755";
+ reg = <0>;
+ spi-max-frequency = <115200>;
+ };
+
+ dac@1 {
+ compatible = "ti,pcm1609a";
+ reg = <1>;
+ spi-max-frequency = <115200>;
+ };
+
+ eeprom@2 {
+ compatible = "atmel,at93c46";
+ reg = <2>;
+ spi-max-frequency = <115200>;
+ };
+ };
+
+ multimedia@b,7 {
+ compatible = "pci8086,2e6d.2",
+ "pci8086,2e6d",
+ "pciclassff0000",
+ "pciclassff00";
+
+ reg = <0x15f00 0x0 0x0 0x0 0x0>;
+ };
+
+ ethernet@c,0 {
+ compatible = "pci8086,2e6e.2",
+ "pci8086,2e6e",
+ "pciclass020000",
+ "pciclass0200";
+
+ reg = <0x16000 0x0 0x0 0x0 0x0>;
+ interrupts = <21 1>;
+ };
+
+ clock@c,1 {
+ compatible = "pci8086,2e6f.2",
+ "pci8086,2e6f",
+ "pciclassff0000",
+ "pciclassff00";
+
+ reg = <0x16100 0x0 0x0 0x0 0x0>;
+ interrupts = <3 1>;
+ };
+
+ usb@d,0 {
+ compatible = "pci8086,2e70.2",
+ "pci8086,2e70",
+ "pciclass0c0320",
+ "pciclass0c03";
+
+ reg = <0x16800 0x0 0x0 0x0 0x0>;
+ interrupts = <22 1>;
+ };
+
+ usb@d,1 {
+ compatible = "pci8086,2e70.2",
+ "pci8086,2e70",
+ "pciclass0c0320",
+ "pciclass0c03";
+
+ reg = <0x16900 0x0 0x0 0x0 0x0>;
+ interrupts = <22 1>;
+ };
+
+ sata@e,0 {
+ compatible = "pci8086,2e71.0",
+ "pci8086,2e71",
+ "pciclass010601",
+ "pciclass0106";
+
+ reg = <0x17000 0x0 0x0 0x0 0x0>;
+ interrupts = <23 1>;
+ };
+
+ flash@f,0 {
+ compatible = "pci8086,701.1",
+ "pci8086,701",
+ "pciclass050100",
+ "pciclass0501";
+
+ reg = <0x17800 0x0 0x0 0x0 0x0>;
+ interrupts = <13 1>;
+ };
+
+ entertainment-encryption@10,0 {
+ compatible = "pci8086,702.1",
+ "pci8086,702",
+ "pciclass101000",
+ "pciclass1010";
+
+ reg = <0x18000 0x0 0x0 0x0 0x0>;
+ };
+
+ co-processor@11,0 {
+ compatible = "pci8086,703.1",
+ "pci8086,703",
+ "pciclass0b4000",
+ "pciclass0b40";
+
+ reg = <0x18800 0x0 0x0 0x0 0x0>;
+ interrupts = <1 1>;
+ };
+
+ multimedia@12,0 {
+ compatible = "pci8086,704.0",
+ "pci8086,704",
+ "pciclass048000",
+ "pciclass0480";
+
+ reg = <0x19000 0x0 0x0 0x0 0x0>;
+ };
+ };
+
+ isa@1f,0 {
+ #address-cells = <2>;
+ #size-cells = <1>;
+ compatible = "isa";
+ reg = <0xf800 0x0 0x0 0x0 0x0>;
+ ranges = <1 0 0 0 0 0x100>;
+
+ rtc@70 {
+ compatible = "intel,ce4100-rtc", "motorola,mc146818";
+ interrupts = <8 3>;
+ interrupt-parent = <&ioapic1>;
+ ctrl-reg = <2>;
+ freq-reg = <0x26>;
+ reg = <1 0x70 2>;
+ };
+ };
+ };
+ };
+};
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
new file mode 100644
index 000000000..e4dc3862d
--- /dev/null
+++ b/arch/x86/platform/efi/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+OBJECT_FILES_NON_STANDARD_efi_thunk_$(BITS).o := y
+OBJECT_FILES_NON_STANDARD_efi_stub_$(BITS).o := y
+
+obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o
+obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o
+obj-$(CONFIG_EFI_MIXED) += efi_thunk_$(BITS).o
diff --git a/arch/x86/platform/efi/early_printk.c b/arch/x86/platform/efi/early_printk.c
new file mode 100644
index 000000000..c3e6be110
--- /dev/null
+++ b/arch/x86/platform/efi/early_printk.c
@@ -0,0 +1,236 @@
+/*
+ * Copyright (C) 2013 Intel Corporation; author Matt Fleming
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ */
+
+#include <linux/console.h>
+#include <linux/efi.h>
+#include <linux/font.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <asm/setup.h>
+
+static const struct font_desc *font;
+static u32 efi_x, efi_y;
+static void *efi_fb;
+static bool early_efi_keep;
+
+/*
+ * efi earlyprintk need use early_ioremap to map the framebuffer.
+ * But early_ioremap is not usable for earlyprintk=efi,keep, ioremap should
+ * be used instead. ioremap will be available after paging_init() which is
+ * earlier than initcall callbacks. Thus adding this early initcall function
+ * early_efi_map_fb to map the whole efi framebuffer.
+ */
+static __init int early_efi_map_fb(void)
+{
+ unsigned long base, size;
+
+ if (!early_efi_keep)
+ return 0;
+
+ base = boot_params.screen_info.lfb_base;
+ size = boot_params.screen_info.lfb_size;
+ efi_fb = ioremap(base, size);
+
+ return efi_fb ? 0 : -ENOMEM;
+}
+early_initcall(early_efi_map_fb);
+
+/*
+ * early_efi_map maps efi framebuffer region [start, start + len -1]
+ * In case earlyprintk=efi,keep we have the whole framebuffer mapped already
+ * so just return the offset efi_fb + start.
+ */
+static __ref void *early_efi_map(unsigned long start, unsigned long len)
+{
+ unsigned long base;
+
+ base = boot_params.screen_info.lfb_base;
+
+ if (efi_fb)
+ return (efi_fb + start);
+ else
+ return early_ioremap(base + start, len);
+}
+
+static __ref void early_efi_unmap(void *addr, unsigned long len)
+{
+ if (!efi_fb)
+ early_iounmap(addr, len);
+}
+
+static void early_efi_clear_scanline(unsigned int y)
+{
+ unsigned long *dst;
+ u16 len;
+
+ len = boot_params.screen_info.lfb_linelength;
+ dst = early_efi_map(y*len, len);
+ if (!dst)
+ return;
+
+ memset(dst, 0, len);
+ early_efi_unmap(dst, len);
+}
+
+static void early_efi_scroll_up(void)
+{
+ unsigned long *dst, *src;
+ u16 len;
+ u32 i, height;
+
+ len = boot_params.screen_info.lfb_linelength;
+ height = boot_params.screen_info.lfb_height;
+
+ for (i = 0; i < height - font->height; i++) {
+ dst = early_efi_map(i*len, len);
+ if (!dst)
+ return;
+
+ src = early_efi_map((i + font->height) * len, len);
+ if (!src) {
+ early_efi_unmap(dst, len);
+ return;
+ }
+
+ memmove(dst, src, len);
+
+ early_efi_unmap(src, len);
+ early_efi_unmap(dst, len);
+ }
+}
+
+static void early_efi_write_char(u32 *dst, unsigned char c, unsigned int h)
+{
+ const u32 color_black = 0x00000000;
+ const u32 color_white = 0x00ffffff;
+ const u8 *src;
+ u8 s8;
+ int m;
+
+ src = font->data + c * font->height;
+ s8 = *(src + h);
+
+ for (m = 0; m < 8; m++) {
+ if ((s8 >> (7 - m)) & 1)
+ *dst = color_white;
+ else
+ *dst = color_black;
+ dst++;
+ }
+}
+
+static void
+early_efi_write(struct console *con, const char *str, unsigned int num)
+{
+ struct screen_info *si;
+ unsigned int len;
+ const char *s;
+ void *dst;
+
+ si = &boot_params.screen_info;
+ len = si->lfb_linelength;
+
+ while (num) {
+ unsigned int linemax;
+ unsigned int h, count = 0;
+
+ for (s = str; *s && *s != '\n'; s++) {
+ if (count == num)
+ break;
+ count++;
+ }
+
+ linemax = (si->lfb_width - efi_x) / font->width;
+ if (count > linemax)
+ count = linemax;
+
+ for (h = 0; h < font->height; h++) {
+ unsigned int n, x;
+
+ dst = early_efi_map((efi_y + h) * len, len);
+ if (!dst)
+ return;
+
+ s = str;
+ n = count;
+ x = efi_x;
+
+ while (n-- > 0) {
+ early_efi_write_char(dst + x*4, *s, h);
+ x += font->width;
+ s++;
+ }
+
+ early_efi_unmap(dst, len);
+ }
+
+ num -= count;
+ efi_x += count * font->width;
+ str += count;
+
+ if (num > 0 && *s == '\n') {
+ efi_x = 0;
+ efi_y += font->height;
+ str++;
+ num--;
+ }
+
+ if (efi_x + font->width > si->lfb_width) {
+ efi_x = 0;
+ efi_y += font->height;
+ }
+
+ if (efi_y + font->height > si->lfb_height) {
+ u32 i;
+
+ efi_y -= font->height;
+ early_efi_scroll_up();
+
+ for (i = 0; i < font->height; i++)
+ early_efi_clear_scanline(efi_y + i);
+ }
+ }
+}
+
+static __init int early_efi_setup(struct console *con, char *options)
+{
+ struct screen_info *si;
+ u16 xres, yres;
+ u32 i;
+
+ si = &boot_params.screen_info;
+ xres = si->lfb_width;
+ yres = si->lfb_height;
+
+ /*
+ * early_efi_write_char() implicitly assumes a framebuffer with
+ * 32-bits per pixel.
+ */
+ if (si->lfb_depth != 32)
+ return -ENODEV;
+
+ font = get_default_font(xres, yres, -1, -1);
+ if (!font)
+ return -ENODEV;
+
+ efi_y = rounddown(yres, font->height) - font->height;
+ for (i = 0; i < (yres - efi_y) / font->height; i++)
+ early_efi_scroll_up();
+
+ /* early_console_register will unset CON_BOOT in case ,keep */
+ if (!(con->flags & CON_BOOT))
+ early_efi_keep = true;
+ return 0;
+}
+
+struct console early_efi_console = {
+ .name = "earlyefi",
+ .write = early_efi_write,
+ .setup = early_efi_setup,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+};
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
new file mode 100644
index 000000000..e7f19dec1
--- /dev/null
+++ b/arch/x86/platform/efi/efi.c
@@ -0,0 +1,1046 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common EFI (Extensible Firmware Interface) support functions
+ * Based on Extensible Firmware Interface Specification version 1.0
+ *
+ * Copyright (C) 1999 VA Linux Systems
+ * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
+ * Copyright (C) 1999-2002 Hewlett-Packard Co.
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ * Stephane Eranian <eranian@hpl.hp.com>
+ * Copyright (C) 2005-2008 Intel Co.
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Bibo Mao <bibo.mao@intel.com>
+ * Chandramouli Narayanan <mouli@linux.intel.com>
+ * Huang Ying <ying.huang@intel.com>
+ * Copyright (C) 2013 SuSE Labs
+ * Borislav Petkov <bp@suse.de> - runtime services VA mapping
+ *
+ * Copied from efi_32.c to eliminate the duplicated code between EFI
+ * 32/64 support code. --ying 2007-10-26
+ *
+ * All EFI Runtime Services are not implemented yet as EFI only
+ * supports physical mode addressing on SoftSDV. This is to be fixed
+ * in a future version. --drummond 1999-07-20
+ *
+ * Implemented EFI runtime services and virtual mode calls. --davidm
+ *
+ * Goutham Rao: <goutham.rao@intel.com>
+ * Skip non-WB memory and ignore empty memory ranges.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/efi.h>
+#include <linux/efi-bgrt.h>
+#include <linux/export.h>
+#include <linux/bootmem.h>
+#include <linux/slab.h>
+#include <linux/memblock.h>
+#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+#include <linux/time.h>
+#include <linux/io.h>
+#include <linux/reboot.h>
+#include <linux/bcd.h>
+
+#include <asm/setup.h>
+#include <asm/efi.h>
+#include <asm/e820/api.h>
+#include <asm/time.h>
+#include <asm/set_memory.h>
+#include <asm/tlbflush.h>
+#include <asm/x86_init.h>
+#include <asm/uv/uv.h>
+
+static struct efi efi_phys __initdata;
+static efi_system_table_t efi_systab __initdata;
+
+static efi_config_table_type_t arch_tables[] __initdata = {
+#ifdef CONFIG_X86_UV
+ {UV_SYSTEM_TABLE_GUID, "UVsystab", &efi.uv_systab},
+#endif
+ {NULL_GUID, NULL, NULL},
+};
+
+u64 efi_setup; /* efi setup_data physical address */
+
+static int add_efi_memmap __initdata;
+static int __init setup_add_efi_memmap(char *arg)
+{
+ add_efi_memmap = 1;
+ return 0;
+}
+early_param("add_efi_memmap", setup_add_efi_memmap);
+
+static efi_status_t __init phys_efi_set_virtual_address_map(
+ unsigned long memory_map_size,
+ unsigned long descriptor_size,
+ u32 descriptor_version,
+ efi_memory_desc_t *virtual_map)
+{
+ efi_status_t status;
+ unsigned long flags;
+ pgd_t *save_pgd;
+
+ save_pgd = efi_call_phys_prolog();
+
+ /* Disable interrupts around EFI calls: */
+ local_irq_save(flags);
+ status = efi_call_phys(efi_phys.set_virtual_address_map,
+ memory_map_size, descriptor_size,
+ descriptor_version, virtual_map);
+ local_irq_restore(flags);
+
+ efi_call_phys_epilog(save_pgd);
+
+ return status;
+}
+
+void __init efi_find_mirror(void)
+{
+ efi_memory_desc_t *md;
+ u64 mirror_size = 0, total_size = 0;
+
+ for_each_efi_memory_desc(md) {
+ unsigned long long start = md->phys_addr;
+ unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+
+ total_size += size;
+ if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
+ memblock_mark_mirror(start, size);
+ mirror_size += size;
+ }
+ }
+ if (mirror_size)
+ pr_info("Memory: %lldM/%lldM mirrored memory\n",
+ mirror_size>>20, total_size>>20);
+}
+
+/*
+ * Tell the kernel about the EFI memory map. This might include
+ * more than the max 128 entries that can fit in the e820 legacy
+ * (zeropage) memory map.
+ */
+
+static void __init do_add_efi_memmap(void)
+{
+ efi_memory_desc_t *md;
+
+ for_each_efi_memory_desc(md) {
+ unsigned long long start = md->phys_addr;
+ unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+ int e820_type;
+
+ switch (md->type) {
+ case EFI_LOADER_CODE:
+ case EFI_LOADER_DATA:
+ case EFI_BOOT_SERVICES_CODE:
+ case EFI_BOOT_SERVICES_DATA:
+ case EFI_CONVENTIONAL_MEMORY:
+ if (md->attribute & EFI_MEMORY_WB)
+ e820_type = E820_TYPE_RAM;
+ else
+ e820_type = E820_TYPE_RESERVED;
+ break;
+ case EFI_ACPI_RECLAIM_MEMORY:
+ e820_type = E820_TYPE_ACPI;
+ break;
+ case EFI_ACPI_MEMORY_NVS:
+ e820_type = E820_TYPE_NVS;
+ break;
+ case EFI_UNUSABLE_MEMORY:
+ e820_type = E820_TYPE_UNUSABLE;
+ break;
+ case EFI_PERSISTENT_MEMORY:
+ e820_type = E820_TYPE_PMEM;
+ break;
+ default:
+ /*
+ * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE
+ * EFI_RUNTIME_SERVICES_DATA EFI_MEMORY_MAPPED_IO
+ * EFI_MEMORY_MAPPED_IO_PORT_SPACE EFI_PAL_CODE
+ */
+ e820_type = E820_TYPE_RESERVED;
+ break;
+ }
+ e820__range_add(start, size, e820_type);
+ }
+ e820__update_table(e820_table);
+}
+
+int __init efi_memblock_x86_reserve_range(void)
+{
+ struct efi_info *e = &boot_params.efi_info;
+ struct efi_memory_map_data data;
+ phys_addr_t pmap;
+ int rv;
+
+ if (efi_enabled(EFI_PARAVIRT))
+ return 0;
+
+#ifdef CONFIG_X86_32
+ /* Can't handle data above 4GB at this time */
+ if (e->efi_memmap_hi) {
+ pr_err("Memory map is above 4GB, disabling EFI.\n");
+ return -EINVAL;
+ }
+ pmap = e->efi_memmap;
+#else
+ pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32));
+#endif
+ data.phys_map = pmap;
+ data.size = e->efi_memmap_size;
+ data.desc_size = e->efi_memdesc_size;
+ data.desc_version = e->efi_memdesc_version;
+
+ rv = efi_memmap_init_early(&data);
+ if (rv)
+ return rv;
+
+ if (add_efi_memmap)
+ do_add_efi_memmap();
+
+ WARN(efi.memmap.desc_version != 1,
+ "Unexpected EFI_MEMORY_DESCRIPTOR version %ld",
+ efi.memmap.desc_version);
+
+ memblock_reserve(pmap, efi.memmap.nr_map * efi.memmap.desc_size);
+
+ return 0;
+}
+
+#define OVERFLOW_ADDR_SHIFT (64 - EFI_PAGE_SHIFT)
+#define OVERFLOW_ADDR_MASK (U64_MAX << OVERFLOW_ADDR_SHIFT)
+#define U64_HIGH_BIT (~(U64_MAX >> 1))
+
+static bool __init efi_memmap_entry_valid(const efi_memory_desc_t *md, int i)
+{
+ u64 end = (md->num_pages << EFI_PAGE_SHIFT) + md->phys_addr - 1;
+ u64 end_hi = 0;
+ char buf[64];
+
+ if (md->num_pages == 0) {
+ end = 0;
+ } else if (md->num_pages > EFI_PAGES_MAX ||
+ EFI_PAGES_MAX - md->num_pages <
+ (md->phys_addr >> EFI_PAGE_SHIFT)) {
+ end_hi = (md->num_pages & OVERFLOW_ADDR_MASK)
+ >> OVERFLOW_ADDR_SHIFT;
+
+ if ((md->phys_addr & U64_HIGH_BIT) && !(end & U64_HIGH_BIT))
+ end_hi += 1;
+ } else {
+ return true;
+ }
+
+ pr_warn_once(FW_BUG "Invalid EFI memory map entries:\n");
+
+ if (end_hi) {
+ pr_warn("mem%02u: %s range=[0x%016llx-0x%llx%016llx] (invalid)\n",
+ i, efi_md_typeattr_format(buf, sizeof(buf), md),
+ md->phys_addr, end_hi, end);
+ } else {
+ pr_warn("mem%02u: %s range=[0x%016llx-0x%016llx] (invalid)\n",
+ i, efi_md_typeattr_format(buf, sizeof(buf), md),
+ md->phys_addr, end);
+ }
+ return false;
+}
+
+static void __init efi_clean_memmap(void)
+{
+ efi_memory_desc_t *out = efi.memmap.map;
+ const efi_memory_desc_t *in = out;
+ const efi_memory_desc_t *end = efi.memmap.map_end;
+ int i, n_removal;
+
+ for (i = n_removal = 0; in < end; i++) {
+ if (efi_memmap_entry_valid(in, i)) {
+ if (out != in)
+ memcpy(out, in, efi.memmap.desc_size);
+ out = (void *)out + efi.memmap.desc_size;
+ } else {
+ n_removal++;
+ }
+ in = (void *)in + efi.memmap.desc_size;
+ }
+
+ if (n_removal > 0) {
+ u64 size = efi.memmap.nr_map - n_removal;
+
+ pr_warn("Removing %d invalid memory map entries.\n", n_removal);
+ efi_memmap_install(efi.memmap.phys_map, size);
+ }
+}
+
+void __init efi_print_memmap(void)
+{
+ efi_memory_desc_t *md;
+ int i = 0;
+
+ for_each_efi_memory_desc(md) {
+ char buf[64];
+
+ pr_info("mem%02u: %s range=[0x%016llx-0x%016llx] (%lluMB)\n",
+ i++, efi_md_typeattr_format(buf, sizeof(buf), md),
+ md->phys_addr,
+ md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1,
+ (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
+ }
+}
+
+static int __init efi_systab_init(void *phys)
+{
+ if (efi_enabled(EFI_64BIT)) {
+ efi_system_table_64_t *systab64;
+ struct efi_setup_data *data = NULL;
+ u64 tmp = 0;
+
+ if (efi_setup) {
+ data = early_memremap(efi_setup, sizeof(*data));
+ if (!data)
+ return -ENOMEM;
+ }
+ systab64 = early_memremap((unsigned long)phys,
+ sizeof(*systab64));
+ if (systab64 == NULL) {
+ pr_err("Couldn't map the system table!\n");
+ if (data)
+ early_memunmap(data, sizeof(*data));
+ return -ENOMEM;
+ }
+
+ efi_systab.hdr = systab64->hdr;
+ efi_systab.fw_vendor = data ? (unsigned long)data->fw_vendor :
+ systab64->fw_vendor;
+ tmp |= data ? data->fw_vendor : systab64->fw_vendor;
+ efi_systab.fw_revision = systab64->fw_revision;
+ efi_systab.con_in_handle = systab64->con_in_handle;
+ tmp |= systab64->con_in_handle;
+ efi_systab.con_in = systab64->con_in;
+ tmp |= systab64->con_in;
+ efi_systab.con_out_handle = systab64->con_out_handle;
+ tmp |= systab64->con_out_handle;
+ efi_systab.con_out = systab64->con_out;
+ tmp |= systab64->con_out;
+ efi_systab.stderr_handle = systab64->stderr_handle;
+ tmp |= systab64->stderr_handle;
+ efi_systab.stderr = systab64->stderr;
+ tmp |= systab64->stderr;
+ efi_systab.runtime = data ?
+ (void *)(unsigned long)data->runtime :
+ (void *)(unsigned long)systab64->runtime;
+ tmp |= data ? data->runtime : systab64->runtime;
+ efi_systab.boottime = (void *)(unsigned long)systab64->boottime;
+ tmp |= systab64->boottime;
+ efi_systab.nr_tables = systab64->nr_tables;
+ efi_systab.tables = data ? (unsigned long)data->tables :
+ systab64->tables;
+ tmp |= data ? data->tables : systab64->tables;
+
+ early_memunmap(systab64, sizeof(*systab64));
+ if (data)
+ early_memunmap(data, sizeof(*data));
+#ifdef CONFIG_X86_32
+ if (tmp >> 32) {
+ pr_err("EFI data located above 4GB, disabling EFI.\n");
+ return -EINVAL;
+ }
+#endif
+ } else {
+ efi_system_table_32_t *systab32;
+
+ systab32 = early_memremap((unsigned long)phys,
+ sizeof(*systab32));
+ if (systab32 == NULL) {
+ pr_err("Couldn't map the system table!\n");
+ return -ENOMEM;
+ }
+
+ efi_systab.hdr = systab32->hdr;
+ efi_systab.fw_vendor = systab32->fw_vendor;
+ efi_systab.fw_revision = systab32->fw_revision;
+ efi_systab.con_in_handle = systab32->con_in_handle;
+ efi_systab.con_in = systab32->con_in;
+ efi_systab.con_out_handle = systab32->con_out_handle;
+ efi_systab.con_out = systab32->con_out;
+ efi_systab.stderr_handle = systab32->stderr_handle;
+ efi_systab.stderr = systab32->stderr;
+ efi_systab.runtime = (void *)(unsigned long)systab32->runtime;
+ efi_systab.boottime = (void *)(unsigned long)systab32->boottime;
+ efi_systab.nr_tables = systab32->nr_tables;
+ efi_systab.tables = systab32->tables;
+
+ early_memunmap(systab32, sizeof(*systab32));
+ }
+
+ efi.systab = &efi_systab;
+
+ /*
+ * Verify the EFI Table
+ */
+ if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) {
+ pr_err("System table signature incorrect!\n");
+ return -EINVAL;
+ }
+ if ((efi.systab->hdr.revision >> 16) == 0)
+ pr_err("Warning: System table version %d.%02d, expected 1.00 or greater!\n",
+ efi.systab->hdr.revision >> 16,
+ efi.systab->hdr.revision & 0xffff);
+
+ return 0;
+}
+
+static int __init efi_runtime_init32(void)
+{
+ efi_runtime_services_32_t *runtime;
+
+ runtime = early_memremap((unsigned long)efi.systab->runtime,
+ sizeof(efi_runtime_services_32_t));
+ if (!runtime) {
+ pr_err("Could not map the runtime service table!\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * We will only need *early* access to the SetVirtualAddressMap
+ * EFI runtime service. All other runtime services will be called
+ * via the virtual mapping.
+ */
+ efi_phys.set_virtual_address_map =
+ (efi_set_virtual_address_map_t *)
+ (unsigned long)runtime->set_virtual_address_map;
+ early_memunmap(runtime, sizeof(efi_runtime_services_32_t));
+
+ return 0;
+}
+
+static int __init efi_runtime_init64(void)
+{
+ efi_runtime_services_64_t *runtime;
+
+ runtime = early_memremap((unsigned long)efi.systab->runtime,
+ sizeof(efi_runtime_services_64_t));
+ if (!runtime) {
+ pr_err("Could not map the runtime service table!\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * We will only need *early* access to the SetVirtualAddressMap
+ * EFI runtime service. All other runtime services will be called
+ * via the virtual mapping.
+ */
+ efi_phys.set_virtual_address_map =
+ (efi_set_virtual_address_map_t *)
+ (unsigned long)runtime->set_virtual_address_map;
+ early_memunmap(runtime, sizeof(efi_runtime_services_64_t));
+
+ return 0;
+}
+
+static int __init efi_runtime_init(void)
+{
+ int rv;
+
+ /*
+ * Check out the runtime services table. We need to map
+ * the runtime services table so that we can grab the physical
+ * address of several of the EFI runtime functions, needed to
+ * set the firmware into virtual mode.
+ *
+ * When EFI_PARAVIRT is in force then we could not map runtime
+ * service memory region because we do not have direct access to it.
+ * However, runtime services are available through proxy functions
+ * (e.g. in case of Xen dom0 EFI implementation they call special
+ * hypercall which executes relevant EFI functions) and that is why
+ * they are always enabled.
+ */
+
+ if (!efi_enabled(EFI_PARAVIRT)) {
+ if (efi_enabled(EFI_64BIT))
+ rv = efi_runtime_init64();
+ else
+ rv = efi_runtime_init32();
+
+ if (rv)
+ return rv;
+ }
+
+ set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+
+ return 0;
+}
+
+void __init efi_init(void)
+{
+ efi_char16_t *c16;
+ char vendor[100] = "unknown";
+ int i = 0;
+
+#ifdef CONFIG_X86_32
+ if (boot_params.efi_info.efi_systab_hi ||
+ boot_params.efi_info.efi_memmap_hi) {
+ pr_info("Table located above 4GB, disabling EFI.\n");
+ return;
+ }
+ efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
+#else
+ efi_phys.systab = (efi_system_table_t *)
+ (boot_params.efi_info.efi_systab |
+ ((__u64)boot_params.efi_info.efi_systab_hi<<32));
+#endif
+
+ if (efi_systab_init(efi_phys.systab))
+ return;
+
+ efi.config_table = (unsigned long)efi.systab->tables;
+ efi.fw_vendor = (unsigned long)efi.systab->fw_vendor;
+ efi.runtime = (unsigned long)efi.systab->runtime;
+
+ /*
+ * Show what we know for posterity
+ */
+ c16 = early_memremap_ro(efi.systab->fw_vendor,
+ sizeof(vendor) * sizeof(efi_char16_t));
+ if (c16) {
+ for (i = 0; i < sizeof(vendor) - 1 && c16[i]; ++i)
+ vendor[i] = c16[i];
+ vendor[i] = '\0';
+ early_memunmap(c16, sizeof(vendor) * sizeof(efi_char16_t));
+ } else {
+ pr_err("Could not map the firmware vendor!\n");
+ }
+
+ pr_info("EFI v%u.%.02u by %s\n",
+ efi.systab->hdr.revision >> 16,
+ efi.systab->hdr.revision & 0xffff, vendor);
+
+ if (efi_reuse_config(efi.systab->tables, efi.systab->nr_tables))
+ return;
+
+ if (efi_config_init(arch_tables))
+ return;
+
+ /*
+ * Note: We currently don't support runtime services on an EFI
+ * that doesn't match the kernel 32/64-bit mode.
+ */
+
+ if (!efi_runtime_supported())
+ pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n");
+ else {
+ if (efi_runtime_disabled() || efi_runtime_init()) {
+ efi_memmap_unmap();
+ return;
+ }
+ }
+
+ efi_clean_memmap();
+
+ if (efi_enabled(EFI_DBG))
+ efi_print_memmap();
+}
+
+void __init efi_set_executable(efi_memory_desc_t *md, bool executable)
+{
+ u64 addr, npages;
+
+ addr = md->virt_addr;
+ npages = md->num_pages;
+
+ memrange_efi_to_native(&addr, &npages);
+
+ if (executable)
+ set_memory_x(addr, npages);
+ else
+ set_memory_nx(addr, npages);
+}
+
+void __init runtime_code_page_mkexec(void)
+{
+ efi_memory_desc_t *md;
+
+ /* Make EFI runtime service code area executable */
+ for_each_efi_memory_desc(md) {
+ if (md->type != EFI_RUNTIME_SERVICES_CODE)
+ continue;
+
+ efi_set_executable(md, true);
+ }
+}
+
+void __init efi_memory_uc(u64 addr, unsigned long size)
+{
+ unsigned long page_shift = 1UL << EFI_PAGE_SHIFT;
+ u64 npages;
+
+ npages = round_up(size, page_shift) / page_shift;
+ memrange_efi_to_native(&addr, &npages);
+ set_memory_uc(addr, npages);
+}
+
+void __init old_map_region(efi_memory_desc_t *md)
+{
+ u64 start_pfn, end_pfn, end;
+ unsigned long size;
+ void *va;
+
+ start_pfn = PFN_DOWN(md->phys_addr);
+ size = md->num_pages << PAGE_SHIFT;
+ end = md->phys_addr + size;
+ end_pfn = PFN_UP(end);
+
+ if (pfn_range_is_mapped(start_pfn, end_pfn)) {
+ va = __va(md->phys_addr);
+
+ if (!(md->attribute & EFI_MEMORY_WB))
+ efi_memory_uc((u64)(unsigned long)va, size);
+ } else
+ va = efi_ioremap(md->phys_addr, size,
+ md->type, md->attribute);
+
+ md->virt_addr = (u64) (unsigned long) va;
+ if (!va)
+ pr_err("ioremap of 0x%llX failed!\n",
+ (unsigned long long)md->phys_addr);
+}
+
+/* Merge contiguous regions of the same type and attribute */
+static void __init efi_merge_regions(void)
+{
+ efi_memory_desc_t *md, *prev_md = NULL;
+
+ for_each_efi_memory_desc(md) {
+ u64 prev_size;
+
+ if (!prev_md) {
+ prev_md = md;
+ continue;
+ }
+
+ if (prev_md->type != md->type ||
+ prev_md->attribute != md->attribute) {
+ prev_md = md;
+ continue;
+ }
+
+ prev_size = prev_md->num_pages << EFI_PAGE_SHIFT;
+
+ if (md->phys_addr == (prev_md->phys_addr + prev_size)) {
+ prev_md->num_pages += md->num_pages;
+ md->type = EFI_RESERVED_TYPE;
+ md->attribute = 0;
+ continue;
+ }
+ prev_md = md;
+ }
+}
+
+static void __init get_systab_virt_addr(efi_memory_desc_t *md)
+{
+ unsigned long size;
+ u64 end, systab;
+
+ size = md->num_pages << EFI_PAGE_SHIFT;
+ end = md->phys_addr + size;
+ systab = (u64)(unsigned long)efi_phys.systab;
+ if (md->phys_addr <= systab && systab < end) {
+ systab += md->virt_addr - md->phys_addr;
+ efi.systab = (efi_system_table_t *)(unsigned long)systab;
+ }
+}
+
+static void *realloc_pages(void *old_memmap, int old_shift)
+{
+ void *ret;
+
+ ret = (void *)__get_free_pages(GFP_KERNEL, old_shift + 1);
+ if (!ret)
+ goto out;
+
+ /*
+ * A first-time allocation doesn't have anything to copy.
+ */
+ if (!old_memmap)
+ return ret;
+
+ memcpy(ret, old_memmap, PAGE_SIZE << old_shift);
+
+out:
+ free_pages((unsigned long)old_memmap, old_shift);
+ return ret;
+}
+
+/*
+ * Iterate the EFI memory map in reverse order because the regions
+ * will be mapped top-down. The end result is the same as if we had
+ * mapped things forward, but doesn't require us to change the
+ * existing implementation of efi_map_region().
+ */
+static inline void *efi_map_next_entry_reverse(void *entry)
+{
+ /* Initial call */
+ if (!entry)
+ return efi.memmap.map_end - efi.memmap.desc_size;
+
+ entry -= efi.memmap.desc_size;
+ if (entry < efi.memmap.map)
+ return NULL;
+
+ return entry;
+}
+
+/*
+ * efi_map_next_entry - Return the next EFI memory map descriptor
+ * @entry: Previous EFI memory map descriptor
+ *
+ * This is a helper function to iterate over the EFI memory map, which
+ * we do in different orders depending on the current configuration.
+ *
+ * To begin traversing the memory map @entry must be %NULL.
+ *
+ * Returns %NULL when we reach the end of the memory map.
+ */
+static void *efi_map_next_entry(void *entry)
+{
+ if (!efi_enabled(EFI_OLD_MEMMAP) && efi_enabled(EFI_64BIT)) {
+ /*
+ * Starting in UEFI v2.5 the EFI_PROPERTIES_TABLE
+ * config table feature requires us to map all entries
+ * in the same order as they appear in the EFI memory
+ * map. That is to say, entry N must have a lower
+ * virtual address than entry N+1. This is because the
+ * firmware toolchain leaves relative references in
+ * the code/data sections, which are split and become
+ * separate EFI memory regions. Mapping things
+ * out-of-order leads to the firmware accessing
+ * unmapped addresses.
+ *
+ * Since we need to map things this way whether or not
+ * the kernel actually makes use of
+ * EFI_PROPERTIES_TABLE, let's just switch to this
+ * scheme by default for 64-bit.
+ */
+ return efi_map_next_entry_reverse(entry);
+ }
+
+ /* Initial call */
+ if (!entry)
+ return efi.memmap.map;
+
+ entry += efi.memmap.desc_size;
+ if (entry >= efi.memmap.map_end)
+ return NULL;
+
+ return entry;
+}
+
+static bool should_map_region(efi_memory_desc_t *md)
+{
+ /*
+ * Runtime regions always require runtime mappings (obviously).
+ */
+ if (md->attribute & EFI_MEMORY_RUNTIME)
+ return true;
+
+ /*
+ * 32-bit EFI doesn't suffer from the bug that requires us to
+ * reserve boot services regions, and mixed mode support
+ * doesn't exist for 32-bit kernels.
+ */
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
+
+ /*
+ * Map all of RAM so that we can access arguments in the 1:1
+ * mapping when making EFI runtime calls.
+ */
+ if (IS_ENABLED(CONFIG_EFI_MIXED) && !efi_is_native()) {
+ if (md->type == EFI_CONVENTIONAL_MEMORY ||
+ md->type == EFI_LOADER_DATA ||
+ md->type == EFI_LOADER_CODE)
+ return true;
+ }
+
+ /*
+ * Map boot services regions as a workaround for buggy
+ * firmware that accesses them even when they shouldn't.
+ *
+ * See efi_{reserve,free}_boot_services().
+ */
+ if (md->type == EFI_BOOT_SERVICES_CODE ||
+ md->type == EFI_BOOT_SERVICES_DATA)
+ return true;
+
+ return false;
+}
+
+/*
+ * Map the efi memory ranges of the runtime services and update new_mmap with
+ * virtual addresses.
+ */
+static void * __init efi_map_regions(int *count, int *pg_shift)
+{
+ void *p, *new_memmap = NULL;
+ unsigned long left = 0;
+ unsigned long desc_size;
+ efi_memory_desc_t *md;
+
+ desc_size = efi.memmap.desc_size;
+
+ p = NULL;
+ while ((p = efi_map_next_entry(p))) {
+ md = p;
+
+ if (!should_map_region(md))
+ continue;
+
+ efi_map_region(md);
+ get_systab_virt_addr(md);
+
+ if (left < desc_size) {
+ new_memmap = realloc_pages(new_memmap, *pg_shift);
+ if (!new_memmap)
+ return NULL;
+
+ left += PAGE_SIZE << *pg_shift;
+ (*pg_shift)++;
+ }
+
+ memcpy(new_memmap + (*count * desc_size), md, desc_size);
+
+ left -= desc_size;
+ (*count)++;
+ }
+
+ return new_memmap;
+}
+
+static void __init kexec_enter_virtual_mode(void)
+{
+#ifdef CONFIG_KEXEC_CORE
+ efi_memory_desc_t *md;
+ unsigned int num_pages;
+
+ efi.systab = NULL;
+
+ /*
+ * We don't do virtual mode, since we don't do runtime services, on
+ * non-native EFI. With efi=old_map, we don't do runtime services in
+ * kexec kernel because in the initial boot something else might
+ * have been mapped at these virtual addresses.
+ */
+ if (!efi_is_native() || efi_enabled(EFI_OLD_MEMMAP)) {
+ efi_memmap_unmap();
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+ return;
+ }
+
+ if (efi_alloc_page_tables()) {
+ pr_err("Failed to allocate EFI page tables\n");
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+ return;
+ }
+
+ /*
+ * Map efi regions which were passed via setup_data. The virt_addr is a
+ * fixed addr which was used in first kernel of a kexec boot.
+ */
+ for_each_efi_memory_desc(md) {
+ efi_map_region_fixed(md); /* FIXME: add error handling */
+ get_systab_virt_addr(md);
+ }
+
+ /*
+ * Unregister the early EFI memmap from efi_init() and install
+ * the new EFI memory map.
+ */
+ efi_memmap_unmap();
+
+ if (efi_memmap_init_late(efi.memmap.phys_map,
+ efi.memmap.desc_size * efi.memmap.nr_map)) {
+ pr_err("Failed to remap late EFI memory map\n");
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+ return;
+ }
+
+ BUG_ON(!efi.systab);
+
+ num_pages = ALIGN(efi.memmap.nr_map * efi.memmap.desc_size, PAGE_SIZE);
+ num_pages >>= PAGE_SHIFT;
+
+ if (efi_setup_page_tables(efi.memmap.phys_map, num_pages)) {
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+ return;
+ }
+
+ efi_sync_low_kernel_mappings();
+
+ /*
+ * Now that EFI is in virtual mode, update the function
+ * pointers in the runtime service table to the new virtual addresses.
+ *
+ * Call EFI services through wrapper functions.
+ */
+ efi.runtime_version = efi_systab.hdr.revision;
+
+ efi_native_runtime_setup();
+
+ efi.set_virtual_address_map = NULL;
+
+ if (efi_enabled(EFI_OLD_MEMMAP) && (__supported_pte_mask & _PAGE_NX))
+ runtime_code_page_mkexec();
+#endif
+}
+
+/*
+ * This function will switch the EFI runtime services to virtual mode.
+ * Essentially, we look through the EFI memmap and map every region that
+ * has the runtime attribute bit set in its memory descriptor into the
+ * efi_pgd page table.
+ *
+ * The old method which used to update that memory descriptor with the
+ * virtual address obtained from ioremap() is still supported when the
+ * kernel is booted with efi=old_map on its command line. Same old
+ * method enabled the runtime services to be called without having to
+ * thunk back into physical mode for every invocation.
+ *
+ * The new method does a pagetable switch in a preemption-safe manner
+ * so that we're in a different address space when calling a runtime
+ * function. For function arguments passing we do copy the PUDs of the
+ * kernel page table into efi_pgd prior to each call.
+ *
+ * Specially for kexec boot, efi runtime maps in previous kernel should
+ * be passed in via setup_data. In that case runtime ranges will be mapped
+ * to the same virtual addresses as the first kernel, see
+ * kexec_enter_virtual_mode().
+ */
+static void __init __efi_enter_virtual_mode(void)
+{
+ int count = 0, pg_shift = 0;
+ void *new_memmap = NULL;
+ efi_status_t status;
+ unsigned long pa;
+
+ efi.systab = NULL;
+
+ if (efi_alloc_page_tables()) {
+ pr_err("Failed to allocate EFI page tables\n");
+ goto err;
+ }
+
+ efi_merge_regions();
+ new_memmap = efi_map_regions(&count, &pg_shift);
+ if (!new_memmap) {
+ pr_err("Error reallocating memory, EFI runtime non-functional!\n");
+ goto err;
+ }
+
+ pa = __pa(new_memmap);
+
+ /*
+ * Unregister the early EFI memmap from efi_init() and install
+ * the new EFI memory map that we are about to pass to the
+ * firmware via SetVirtualAddressMap().
+ */
+ efi_memmap_unmap();
+
+ if (efi_memmap_init_late(pa, efi.memmap.desc_size * count)) {
+ pr_err("Failed to remap late EFI memory map\n");
+ goto err;
+ }
+
+ if (efi_enabled(EFI_DBG)) {
+ pr_info("EFI runtime memory map:\n");
+ efi_print_memmap();
+ }
+
+ if (WARN_ON(!efi.systab))
+ goto err;
+
+ if (efi_setup_page_tables(pa, 1 << pg_shift))
+ goto err;
+
+ efi_sync_low_kernel_mappings();
+
+ if (efi_is_native()) {
+ status = phys_efi_set_virtual_address_map(
+ efi.memmap.desc_size * count,
+ efi.memmap.desc_size,
+ efi.memmap.desc_version,
+ (efi_memory_desc_t *)pa);
+ } else {
+ status = efi_thunk_set_virtual_address_map(
+ efi_phys.set_virtual_address_map,
+ efi.memmap.desc_size * count,
+ efi.memmap.desc_size,
+ efi.memmap.desc_version,
+ (efi_memory_desc_t *)pa);
+ }
+
+ if (status != EFI_SUCCESS) {
+ pr_err("Unable to switch EFI into virtual mode (status=%lx)!\n",
+ status);
+ goto err;
+ }
+
+ /*
+ * Now that EFI is in virtual mode, update the function
+ * pointers in the runtime service table to the new virtual addresses.
+ *
+ * Call EFI services through wrapper functions.
+ */
+ efi.runtime_version = efi_systab.hdr.revision;
+
+ if (efi_is_native())
+ efi_native_runtime_setup();
+ else
+ efi_thunk_runtime_setup();
+
+ efi.set_virtual_address_map = NULL;
+
+ /*
+ * Apply more restrictive page table mapping attributes now that
+ * SVAM() has been called and the firmware has performed all
+ * necessary relocation fixups for the new virtual addresses.
+ */
+ efi_runtime_update_mappings();
+
+ /* clean DUMMY object */
+ efi_delete_dummy_variable();
+ return;
+
+err:
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+}
+
+void __init efi_enter_virtual_mode(void)
+{
+ if (efi_enabled(EFI_PARAVIRT))
+ return;
+
+ if (efi_setup)
+ kexec_enter_virtual_mode();
+ else
+ __efi_enter_virtual_mode();
+
+ efi_dump_pagetable();
+}
+
+static int __init arch_parse_efi_cmdline(char *str)
+{
+ if (!str) {
+ pr_warn("need at least one option\n");
+ return -EINVAL;
+ }
+
+ if (parse_option_str(str, "old_map"))
+ set_bit(EFI_OLD_MEMMAP, &efi.flags);
+
+ return 0;
+}
+early_param("efi", arch_parse_efi_cmdline);
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
new file mode 100644
index 000000000..995965712
--- /dev/null
+++ b/arch/x86/platform/efi/efi_32.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Extensible Firmware Interface
+ *
+ * Based on Extensible Firmware Interface Specification version 1.0
+ *
+ * Copyright (C) 1999 VA Linux Systems
+ * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
+ * Copyright (C) 1999-2002 Hewlett-Packard Co.
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ * Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * All EFI Runtime Services are not implemented yet as EFI only
+ * supports physical mode addressing on SoftSDV. This is to be fixed
+ * in a future version. --drummond 1999-07-20
+ *
+ * Implemented EFI runtime services and virtual mode calls. --davidm
+ *
+ * Goutham Rao: <goutham.rao@intel.com>
+ * Skip non-WB memory and ignore empty memory ranges.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/ioport.h>
+#include <linux/efi.h>
+
+#include <asm/io.h>
+#include <asm/desc.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/efi.h>
+
+/*
+ * To make EFI call EFI runtime service in physical addressing mode we need
+ * prolog/epilog before/after the invocation to claim the EFI runtime service
+ * handler exclusively and to duplicate a memory mapping in low memory space,
+ * say 0 - 3G.
+ */
+
+int __init efi_alloc_page_tables(void)
+{
+ return 0;
+}
+
+void efi_sync_low_kernel_mappings(void) {}
+
+void __init efi_dump_pagetable(void)
+{
+#ifdef CONFIG_EFI_PGT_DUMP
+ ptdump_walk_pgd_level(NULL, swapper_pg_dir);
+#endif
+}
+
+int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
+{
+ return 0;
+}
+
+void __init efi_map_region(efi_memory_desc_t *md)
+{
+ old_map_region(md);
+}
+
+void __init efi_map_region_fixed(efi_memory_desc_t *md) {}
+void __init parse_efi_setup(u64 phys_addr, u32 data_len) {}
+
+pgd_t * __init efi_call_phys_prolog(void)
+{
+ struct desc_ptr gdt_descr;
+ pgd_t *save_pgd;
+
+ /* Current pgd is swapper_pg_dir, we'll restore it later: */
+ save_pgd = swapper_pg_dir;
+ load_cr3(initial_page_table);
+ __flush_tlb_all();
+
+ gdt_descr.address = get_cpu_gdt_paddr(0);
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
+
+ return save_pgd;
+}
+
+void __init efi_call_phys_epilog(pgd_t *save_pgd)
+{
+ load_fixmap_gdt(0);
+ load_cr3(save_pgd);
+ __flush_tlb_all();
+}
+
+void __init efi_runtime_update_mappings(void)
+{
+ if (__supported_pte_mask & _PAGE_NX)
+ runtime_code_page_mkexec();
+}
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
new file mode 100644
index 000000000..77d05b560
--- /dev/null
+++ b/arch/x86/platform/efi/efi_64.c
@@ -0,0 +1,1039 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * x86_64 specific EFI support functions
+ * Based on Extensible Firmware Interface Specification version 1.0
+ *
+ * Copyright (C) 2005-2008 Intel Co.
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Bibo Mao <bibo.mao@intel.com>
+ * Chandramouli Narayanan <mouli@linux.intel.com>
+ * Huang Ying <ying.huang@intel.com>
+ *
+ * Code to convert EFI to E820 map has been implemented in elilo bootloader
+ * based on a EFI patch by Edgar Hucek. Based on the E820 map, the page table
+ * is setup appropriately for EFI runtime code.
+ * - mouli 06/14/2007.
+ *
+ */
+
+#define pr_fmt(fmt) "efi: " fmt
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/mc146818rtc.h>
+#include <linux/efi.h>
+#include <linux/export.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/reboot.h>
+#include <linux/slab.h>
+#include <linux/ucs2_string.h>
+#include <linux/mem_encrypt.h>
+#include <linux/sched/task.h>
+
+#include <asm/setup.h>
+#include <asm/page.h>
+#include <asm/e820/api.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/proto.h>
+#include <asm/efi.h>
+#include <asm/cacheflush.h>
+#include <asm/fixmap.h>
+#include <asm/realmode.h>
+#include <asm/time.h>
+#include <asm/pgalloc.h>
+
+/*
+ * We allocate runtime services regions top-down, starting from -4G, i.e.
+ * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
+ */
+static u64 efi_va = EFI_VA_START;
+
+struct efi_scratch efi_scratch;
+
+static void __init early_code_mapping_set_exec(int executable)
+{
+ efi_memory_desc_t *md;
+
+ if (!(__supported_pte_mask & _PAGE_NX))
+ return;
+
+ /* Make EFI service code area executable */
+ for_each_efi_memory_desc(md) {
+ if (md->type == EFI_RUNTIME_SERVICES_CODE ||
+ md->type == EFI_BOOT_SERVICES_CODE)
+ efi_set_executable(md, executable);
+ }
+}
+
+pgd_t * __init efi_call_phys_prolog(void)
+{
+ unsigned long vaddr, addr_pgd, addr_p4d, addr_pud;
+ pgd_t *save_pgd, *pgd_k, *pgd_efi;
+ p4d_t *p4d, *p4d_k, *p4d_efi;
+ pud_t *pud;
+
+ int pgd;
+ int n_pgds, i, j;
+
+ if (!efi_enabled(EFI_OLD_MEMMAP)) {
+ efi_switch_mm(&efi_mm);
+ return NULL;
+ }
+
+ early_code_mapping_set_exec(1);
+
+ n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
+ save_pgd = kmalloc_array(n_pgds, sizeof(*save_pgd), GFP_KERNEL);
+
+ /*
+ * Build 1:1 identity mapping for efi=old_map usage. Note that
+ * PAGE_OFFSET is PGDIR_SIZE aligned when KASLR is disabled, while
+ * it is PUD_SIZE ALIGNED with KASLR enabled. So for a given physical
+ * address X, the pud_index(X) != pud_index(__va(X)), we can only copy
+ * PUD entry of __va(X) to fill in pud entry of X to build 1:1 mapping.
+ * This means here we can only reuse the PMD tables of the direct mapping.
+ */
+ for (pgd = 0; pgd < n_pgds; pgd++) {
+ addr_pgd = (unsigned long)(pgd * PGDIR_SIZE);
+ vaddr = (unsigned long)__va(pgd * PGDIR_SIZE);
+ pgd_efi = pgd_offset_k(addr_pgd);
+ save_pgd[pgd] = *pgd_efi;
+
+ p4d = p4d_alloc(&init_mm, pgd_efi, addr_pgd);
+ if (!p4d) {
+ pr_err("Failed to allocate p4d table!\n");
+ goto out;
+ }
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ addr_p4d = addr_pgd + i * P4D_SIZE;
+ p4d_efi = p4d + p4d_index(addr_p4d);
+
+ pud = pud_alloc(&init_mm, p4d_efi, addr_p4d);
+ if (!pud) {
+ pr_err("Failed to allocate pud table!\n");
+ goto out;
+ }
+
+ for (j = 0; j < PTRS_PER_PUD; j++) {
+ addr_pud = addr_p4d + j * PUD_SIZE;
+
+ if (addr_pud > (max_pfn << PAGE_SHIFT))
+ break;
+
+ vaddr = (unsigned long)__va(addr_pud);
+
+ pgd_k = pgd_offset_k(vaddr);
+ p4d_k = p4d_offset(pgd_k, vaddr);
+ pud[j] = *pud_offset(p4d_k, vaddr);
+ }
+ }
+ pgd_offset_k(pgd * PGDIR_SIZE)->pgd &= ~_PAGE_NX;
+ }
+
+out:
+ __flush_tlb_all();
+
+ return save_pgd;
+}
+
+void __init efi_call_phys_epilog(pgd_t *save_pgd)
+{
+ /*
+ * After the lock is released, the original page table is restored.
+ */
+ int pgd_idx, i;
+ int nr_pgds;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+
+ if (!efi_enabled(EFI_OLD_MEMMAP)) {
+ efi_switch_mm(efi_scratch.prev_mm);
+ return;
+ }
+
+ nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
+
+ for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++) {
+ pgd = pgd_offset_k(pgd_idx * PGDIR_SIZE);
+ set_pgd(pgd_offset_k(pgd_idx * PGDIR_SIZE), save_pgd[pgd_idx]);
+
+ if (!pgd_present(*pgd))
+ continue;
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ p4d = p4d_offset(pgd,
+ pgd_idx * PGDIR_SIZE + i * P4D_SIZE);
+
+ if (!p4d_present(*p4d))
+ continue;
+
+ pud = (pud_t *)p4d_page_vaddr(*p4d);
+ pud_free(&init_mm, pud);
+ }
+
+ p4d = (p4d_t *)pgd_page_vaddr(*pgd);
+ p4d_free(&init_mm, p4d);
+ }
+
+ kfree(save_pgd);
+
+ __flush_tlb_all();
+ early_code_mapping_set_exec(0);
+}
+
+EXPORT_SYMBOL_GPL(efi_mm);
+
+/*
+ * We need our own copy of the higher levels of the page tables
+ * because we want to avoid inserting EFI region mappings (EFI_VA_END
+ * to EFI_VA_START) into the standard kernel page tables. Everything
+ * else can be shared, see efi_sync_low_kernel_mappings().
+ *
+ * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the
+ * allocation.
+ */
+int __init efi_alloc_page_tables(void)
+{
+ pgd_t *pgd, *efi_pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ gfp_t gfp_mask;
+
+ if (efi_enabled(EFI_OLD_MEMMAP))
+ return 0;
+
+ gfp_mask = GFP_KERNEL | __GFP_ZERO;
+ efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);
+ if (!efi_pgd)
+ goto fail;
+
+ pgd = efi_pgd + pgd_index(EFI_VA_END);
+ p4d = p4d_alloc(&init_mm, pgd, EFI_VA_END);
+ if (!p4d)
+ goto free_pgd;
+
+ pud = pud_alloc(&init_mm, p4d, EFI_VA_END);
+ if (!pud)
+ goto free_p4d;
+
+ efi_mm.pgd = efi_pgd;
+ mm_init_cpumask(&efi_mm);
+ init_new_context(NULL, &efi_mm);
+
+ return 0;
+
+free_p4d:
+ if (pgtable_l5_enabled())
+ free_page((unsigned long)pgd_page_vaddr(*pgd));
+free_pgd:
+ free_pages((unsigned long)efi_pgd, PGD_ALLOCATION_ORDER);
+fail:
+ return -ENOMEM;
+}
+
+/*
+ * Add low kernel mappings for passing arguments to EFI functions.
+ */
+void efi_sync_low_kernel_mappings(void)
+{
+ unsigned num_entries;
+ pgd_t *pgd_k, *pgd_efi;
+ p4d_t *p4d_k, *p4d_efi;
+ pud_t *pud_k, *pud_efi;
+ pgd_t *efi_pgd = efi_mm.pgd;
+
+ if (efi_enabled(EFI_OLD_MEMMAP))
+ return;
+
+ /*
+ * We can share all PGD entries apart from the one entry that
+ * covers the EFI runtime mapping space.
+ *
+ * Make sure the EFI runtime region mappings are guaranteed to
+ * only span a single PGD entry and that the entry also maps
+ * other important kernel regions.
+ */
+ MAYBE_BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END));
+ MAYBE_BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) !=
+ (EFI_VA_END & PGDIR_MASK));
+
+ pgd_efi = efi_pgd + pgd_index(PAGE_OFFSET);
+ pgd_k = pgd_offset_k(PAGE_OFFSET);
+
+ num_entries = pgd_index(EFI_VA_END) - pgd_index(PAGE_OFFSET);
+ memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries);
+
+ /*
+ * As with PGDs, we share all P4D entries apart from the one entry
+ * that covers the EFI runtime mapping space.
+ */
+ BUILD_BUG_ON(p4d_index(EFI_VA_END) != p4d_index(MODULES_END));
+ BUILD_BUG_ON((EFI_VA_START & P4D_MASK) != (EFI_VA_END & P4D_MASK));
+
+ pgd_efi = efi_pgd + pgd_index(EFI_VA_END);
+ pgd_k = pgd_offset_k(EFI_VA_END);
+ p4d_efi = p4d_offset(pgd_efi, 0);
+ p4d_k = p4d_offset(pgd_k, 0);
+
+ num_entries = p4d_index(EFI_VA_END);
+ memcpy(p4d_efi, p4d_k, sizeof(p4d_t) * num_entries);
+
+ /*
+ * We share all the PUD entries apart from those that map the
+ * EFI regions. Copy around them.
+ */
+ BUILD_BUG_ON((EFI_VA_START & ~PUD_MASK) != 0);
+ BUILD_BUG_ON((EFI_VA_END & ~PUD_MASK) != 0);
+
+ p4d_efi = p4d_offset(pgd_efi, EFI_VA_END);
+ p4d_k = p4d_offset(pgd_k, EFI_VA_END);
+ pud_efi = pud_offset(p4d_efi, 0);
+ pud_k = pud_offset(p4d_k, 0);
+
+ num_entries = pud_index(EFI_VA_END);
+ memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries);
+
+ pud_efi = pud_offset(p4d_efi, EFI_VA_START);
+ pud_k = pud_offset(p4d_k, EFI_VA_START);
+
+ num_entries = PTRS_PER_PUD - pud_index(EFI_VA_START);
+ memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries);
+}
+
+/*
+ * Wrapper for slow_virt_to_phys() that handles NULL addresses.
+ */
+static inline phys_addr_t
+virt_to_phys_or_null_size(void *va, unsigned long size)
+{
+ phys_addr_t pa;
+
+ if (!va)
+ return 0;
+
+ if (virt_addr_valid(va))
+ return virt_to_phys(va);
+
+ pa = slow_virt_to_phys(va);
+
+ /* check if the object crosses a page boundary */
+ if (WARN_ON((pa ^ (pa + size - 1)) & PAGE_MASK))
+ return 0;
+
+ return pa;
+}
+
+#define virt_to_phys_or_null(addr) \
+ virt_to_phys_or_null_size((addr), sizeof(*(addr)))
+
+int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
+{
+ unsigned long pfn, text, pf;
+ struct page *page;
+ unsigned npages;
+ pgd_t *pgd = efi_mm.pgd;
+
+ if (efi_enabled(EFI_OLD_MEMMAP))
+ return 0;
+
+ /*
+ * It can happen that the physical address of new_memmap lands in memory
+ * which is not mapped in the EFI page table. Therefore we need to go
+ * and ident-map those pages containing the map before calling
+ * phys_efi_set_virtual_address_map().
+ */
+ pfn = pa_memmap >> PAGE_SHIFT;
+ pf = _PAGE_NX | _PAGE_RW | _PAGE_ENC;
+ if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, pf)) {
+ pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap);
+ return 1;
+ }
+
+ /*
+ * Certain firmware versions are way too sentimential and still believe
+ * they are exclusive and unquestionable owners of the first physical page,
+ * even though they explicitly mark it as EFI_CONVENTIONAL_MEMORY
+ * (but then write-access it later during SetVirtualAddressMap()).
+ *
+ * Create a 1:1 mapping for this page, to avoid triple faults during early
+ * boot with such firmware. We are free to hand this page to the BIOS,
+ * as trim_bios_range() will reserve the first page and isolate it away
+ * from memory allocators anyway.
+ */
+ pf = _PAGE_RW;
+ if (sev_active())
+ pf |= _PAGE_ENC;
+
+ if (kernel_map_pages_in_pgd(pgd, 0x0, 0x0, 1, pf)) {
+ pr_err("Failed to create 1:1 mapping for the first page!\n");
+ return 1;
+ }
+
+ /*
+ * When making calls to the firmware everything needs to be 1:1
+ * mapped and addressable with 32-bit pointers. Map the kernel
+ * text and allocate a new stack because we can't rely on the
+ * stack pointer being < 4GB.
+ */
+ if (!IS_ENABLED(CONFIG_EFI_MIXED) || efi_is_native())
+ return 0;
+
+ page = alloc_page(GFP_KERNEL|__GFP_DMA32);
+ if (!page) {
+ pr_err("Unable to allocate EFI runtime stack < 4GB\n");
+ return 1;
+ }
+
+ efi_scratch.phys_stack = page_to_phys(page + 1); /* stack grows down */
+
+ npages = (_etext - _text) >> PAGE_SHIFT;
+ text = __pa(_text);
+ pfn = text >> PAGE_SHIFT;
+
+ pf = _PAGE_RW | _PAGE_ENC;
+ if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, pf)) {
+ pr_err("Failed to map kernel text 1:1\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+static void __init __map_region(efi_memory_desc_t *md, u64 va)
+{
+ unsigned long flags = _PAGE_RW;
+ unsigned long pfn;
+ pgd_t *pgd = efi_mm.pgd;
+
+ if (!(md->attribute & EFI_MEMORY_WB))
+ flags |= _PAGE_PCD;
+
+ if (sev_active() && md->type != EFI_MEMORY_MAPPED_IO)
+ flags |= _PAGE_ENC;
+
+ pfn = md->phys_addr >> PAGE_SHIFT;
+ if (kernel_map_pages_in_pgd(pgd, pfn, va, md->num_pages, flags))
+ pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n",
+ md->phys_addr, va);
+}
+
+void __init efi_map_region(efi_memory_desc_t *md)
+{
+ unsigned long size = md->num_pages << PAGE_SHIFT;
+ u64 pa = md->phys_addr;
+
+ if (efi_enabled(EFI_OLD_MEMMAP))
+ return old_map_region(md);
+
+ /*
+ * Make sure the 1:1 mappings are present as a catch-all for b0rked
+ * firmware which doesn't update all internal pointers after switching
+ * to virtual mode and would otherwise crap on us.
+ */
+ __map_region(md, md->phys_addr);
+
+ /*
+ * Enforce the 1:1 mapping as the default virtual address when
+ * booting in EFI mixed mode, because even though we may be
+ * running a 64-bit kernel, the firmware may only be 32-bit.
+ */
+ if (!efi_is_native () && IS_ENABLED(CONFIG_EFI_MIXED)) {
+ md->virt_addr = md->phys_addr;
+ return;
+ }
+
+ efi_va -= size;
+
+ /* Is PA 2M-aligned? */
+ if (!(pa & (PMD_SIZE - 1))) {
+ efi_va &= PMD_MASK;
+ } else {
+ u64 pa_offset = pa & (PMD_SIZE - 1);
+ u64 prev_va = efi_va;
+
+ /* get us the same offset within this 2M page */
+ efi_va = (efi_va & PMD_MASK) + pa_offset;
+
+ if (efi_va > prev_va)
+ efi_va -= PMD_SIZE;
+ }
+
+ if (efi_va < EFI_VA_END) {
+ pr_warn(FW_WARN "VA address range overflow!\n");
+ return;
+ }
+
+ /* Do the VA map */
+ __map_region(md, efi_va);
+ md->virt_addr = efi_va;
+}
+
+/*
+ * kexec kernel will use efi_map_region_fixed to map efi runtime memory ranges.
+ * md->virt_addr is the original virtual address which had been mapped in kexec
+ * 1st kernel.
+ */
+void __init efi_map_region_fixed(efi_memory_desc_t *md)
+{
+ __map_region(md, md->phys_addr);
+ __map_region(md, md->virt_addr);
+}
+
+void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
+ u32 type, u64 attribute)
+{
+ unsigned long last_map_pfn;
+
+ if (type == EFI_MEMORY_MAPPED_IO)
+ return ioremap(phys_addr, size);
+
+ last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
+ if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) {
+ unsigned long top = last_map_pfn << PAGE_SHIFT;
+ efi_ioremap(top, size - (top - phys_addr), type, attribute);
+ }
+
+ if (!(attribute & EFI_MEMORY_WB))
+ efi_memory_uc((u64)(unsigned long)__va(phys_addr), size);
+
+ return (void __iomem *)__va(phys_addr);
+}
+
+void __init parse_efi_setup(u64 phys_addr, u32 data_len)
+{
+ efi_setup = phys_addr + sizeof(struct setup_data);
+}
+
+static int __init efi_update_mappings(efi_memory_desc_t *md, unsigned long pf)
+{
+ unsigned long pfn;
+ pgd_t *pgd = efi_mm.pgd;
+ int err1, err2;
+
+ /* Update the 1:1 mapping */
+ pfn = md->phys_addr >> PAGE_SHIFT;
+ err1 = kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, md->num_pages, pf);
+ if (err1) {
+ pr_err("Error while updating 1:1 mapping PA 0x%llx -> VA 0x%llx!\n",
+ md->phys_addr, md->virt_addr);
+ }
+
+ err2 = kernel_map_pages_in_pgd(pgd, pfn, md->virt_addr, md->num_pages, pf);
+ if (err2) {
+ pr_err("Error while updating VA mapping PA 0x%llx -> VA 0x%llx!\n",
+ md->phys_addr, md->virt_addr);
+ }
+
+ return err1 || err2;
+}
+
+static int __init efi_update_mem_attr(struct mm_struct *mm, efi_memory_desc_t *md)
+{
+ unsigned long pf = 0;
+
+ if (md->attribute & EFI_MEMORY_XP)
+ pf |= _PAGE_NX;
+
+ if (!(md->attribute & EFI_MEMORY_RO))
+ pf |= _PAGE_RW;
+
+ if (sev_active())
+ pf |= _PAGE_ENC;
+
+ return efi_update_mappings(md, pf);
+}
+
+void __init efi_runtime_update_mappings(void)
+{
+ efi_memory_desc_t *md;
+
+ if (efi_enabled(EFI_OLD_MEMMAP)) {
+ if (__supported_pte_mask & _PAGE_NX)
+ runtime_code_page_mkexec();
+ return;
+ }
+
+ /*
+ * Use the EFI Memory Attribute Table for mapping permissions if it
+ * exists, since it is intended to supersede EFI_PROPERTIES_TABLE.
+ */
+ if (efi_enabled(EFI_MEM_ATTR)) {
+ efi_memattr_apply_permissions(NULL, efi_update_mem_attr);
+ return;
+ }
+
+ /*
+ * EFI_MEMORY_ATTRIBUTES_TABLE is intended to replace
+ * EFI_PROPERTIES_TABLE. So, use EFI_PROPERTIES_TABLE to update
+ * permissions only if EFI_MEMORY_ATTRIBUTES_TABLE is not
+ * published by the firmware. Even if we find a buggy implementation of
+ * EFI_MEMORY_ATTRIBUTES_TABLE, don't fall back to
+ * EFI_PROPERTIES_TABLE, because of the same reason.
+ */
+
+ if (!efi_enabled(EFI_NX_PE_DATA))
+ return;
+
+ for_each_efi_memory_desc(md) {
+ unsigned long pf = 0;
+
+ if (!(md->attribute & EFI_MEMORY_RUNTIME))
+ continue;
+
+ if (!(md->attribute & EFI_MEMORY_WB))
+ pf |= _PAGE_PCD;
+
+ if ((md->attribute & EFI_MEMORY_XP) ||
+ (md->type == EFI_RUNTIME_SERVICES_DATA))
+ pf |= _PAGE_NX;
+
+ if (!(md->attribute & EFI_MEMORY_RO) &&
+ (md->type != EFI_RUNTIME_SERVICES_CODE))
+ pf |= _PAGE_RW;
+
+ if (sev_active())
+ pf |= _PAGE_ENC;
+
+ efi_update_mappings(md, pf);
+ }
+}
+
+void __init efi_dump_pagetable(void)
+{
+#ifdef CONFIG_EFI_PGT_DUMP
+ if (efi_enabled(EFI_OLD_MEMMAP))
+ ptdump_walk_pgd_level(NULL, swapper_pg_dir);
+ else
+ ptdump_walk_pgd_level(NULL, efi_mm.pgd);
+#endif
+}
+
+/*
+ * Makes the calling thread switch to/from efi_mm context. Can be used
+ * for SetVirtualAddressMap() i.e. current->active_mm == init_mm as well
+ * as during efi runtime calls i.e current->active_mm == current_mm.
+ * We are not mm_dropping()/mm_grabbing() any mm, because we are not
+ * losing/creating any references.
+ */
+void efi_switch_mm(struct mm_struct *mm)
+{
+ task_lock(current);
+ efi_scratch.prev_mm = current->active_mm;
+ current->active_mm = mm;
+ switch_mm(efi_scratch.prev_mm, mm, NULL);
+ task_unlock(current);
+}
+
+#ifdef CONFIG_EFI_MIXED
+extern efi_status_t efi64_thunk(u32, ...);
+
+static DEFINE_SPINLOCK(efi_runtime_lock);
+
+#define runtime_service32(func) \
+({ \
+ u32 table = (u32)(unsigned long)efi.systab; \
+ u32 *rt, *___f; \
+ \
+ rt = (u32 *)(table + offsetof(efi_system_table_32_t, runtime)); \
+ ___f = (u32 *)(*rt + offsetof(efi_runtime_services_32_t, func)); \
+ *___f; \
+})
+
+/*
+ * Switch to the EFI page tables early so that we can access the 1:1
+ * runtime services mappings which are not mapped in any other page
+ * tables. This function must be called before runtime_service32().
+ *
+ * Also, disable interrupts because the IDT points to 64-bit handlers,
+ * which aren't going to function correctly when we switch to 32-bit.
+ */
+#define efi_thunk(f, ...) \
+({ \
+ efi_status_t __s; \
+ u32 __func; \
+ \
+ arch_efi_call_virt_setup(); \
+ \
+ __func = runtime_service32(f); \
+ __s = efi64_thunk(__func, __VA_ARGS__); \
+ \
+ arch_efi_call_virt_teardown(); \
+ \
+ __s; \
+})
+
+efi_status_t efi_thunk_set_virtual_address_map(
+ void *phys_set_virtual_address_map,
+ unsigned long memory_map_size,
+ unsigned long descriptor_size,
+ u32 descriptor_version,
+ efi_memory_desc_t *virtual_map)
+{
+ efi_status_t status;
+ unsigned long flags;
+ u32 func;
+
+ efi_sync_low_kernel_mappings();
+ local_irq_save(flags);
+
+ efi_switch_mm(&efi_mm);
+
+ func = (u32)(unsigned long)phys_set_virtual_address_map;
+ status = efi64_thunk(func, memory_map_size, descriptor_size,
+ descriptor_version, virtual_map);
+
+ efi_switch_mm(efi_scratch.prev_mm);
+ local_irq_restore(flags);
+
+ return status;
+}
+
+static efi_status_t efi_thunk_get_time(efi_time_t *tm, efi_time_cap_t *tc)
+{
+ efi_status_t status;
+ u32 phys_tm, phys_tc;
+ unsigned long flags;
+
+ spin_lock(&rtc_lock);
+ spin_lock_irqsave(&efi_runtime_lock, flags);
+
+ phys_tm = virt_to_phys_or_null(tm);
+ phys_tc = virt_to_phys_or_null(tc);
+
+ status = efi_thunk(get_time, phys_tm, phys_tc);
+
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
+ spin_unlock(&rtc_lock);
+
+ return status;
+}
+
+static efi_status_t efi_thunk_set_time(efi_time_t *tm)
+{
+ efi_status_t status;
+ u32 phys_tm;
+ unsigned long flags;
+
+ spin_lock(&rtc_lock);
+ spin_lock_irqsave(&efi_runtime_lock, flags);
+
+ phys_tm = virt_to_phys_or_null(tm);
+
+ status = efi_thunk(set_time, phys_tm);
+
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
+ spin_unlock(&rtc_lock);
+
+ return status;
+}
+
+static efi_status_t
+efi_thunk_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending,
+ efi_time_t *tm)
+{
+ efi_status_t status;
+ u32 phys_enabled, phys_pending, phys_tm;
+ unsigned long flags;
+
+ spin_lock(&rtc_lock);
+ spin_lock_irqsave(&efi_runtime_lock, flags);
+
+ phys_enabled = virt_to_phys_or_null(enabled);
+ phys_pending = virt_to_phys_or_null(pending);
+ phys_tm = virt_to_phys_or_null(tm);
+
+ status = efi_thunk(get_wakeup_time, phys_enabled,
+ phys_pending, phys_tm);
+
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
+ spin_unlock(&rtc_lock);
+
+ return status;
+}
+
+static efi_status_t
+efi_thunk_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
+{
+ efi_status_t status;
+ u32 phys_tm;
+ unsigned long flags;
+
+ spin_lock(&rtc_lock);
+ spin_lock_irqsave(&efi_runtime_lock, flags);
+
+ phys_tm = virt_to_phys_or_null(tm);
+
+ status = efi_thunk(set_wakeup_time, enabled, phys_tm);
+
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
+ spin_unlock(&rtc_lock);
+
+ return status;
+}
+
+static unsigned long efi_name_size(efi_char16_t *name)
+{
+ return ucs2_strsize(name, EFI_VAR_NAME_LEN) + 1;
+}
+
+static efi_status_t
+efi_thunk_get_variable(efi_char16_t *name, efi_guid_t *vendor,
+ u32 *attr, unsigned long *data_size, void *data)
+{
+ u8 buf[24] __aligned(8);
+ efi_guid_t *vnd = PTR_ALIGN((efi_guid_t *)buf, sizeof(*vnd));
+ efi_status_t status;
+ u32 phys_name, phys_vendor, phys_attr;
+ u32 phys_data_size, phys_data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&efi_runtime_lock, flags);
+
+ *vnd = *vendor;
+
+ phys_data_size = virt_to_phys_or_null(data_size);
+ phys_vendor = virt_to_phys_or_null(vnd);
+ phys_name = virt_to_phys_or_null_size(name, efi_name_size(name));
+ phys_attr = virt_to_phys_or_null(attr);
+ phys_data = virt_to_phys_or_null_size(data, *data_size);
+
+ if (!phys_name || (data && !phys_data))
+ status = EFI_INVALID_PARAMETER;
+ else
+ status = efi_thunk(get_variable, phys_name, phys_vendor,
+ phys_attr, phys_data_size, phys_data);
+
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
+
+ return status;
+}
+
+static efi_status_t
+efi_thunk_set_variable(efi_char16_t *name, efi_guid_t *vendor,
+ u32 attr, unsigned long data_size, void *data)
+{
+ u8 buf[24] __aligned(8);
+ efi_guid_t *vnd = PTR_ALIGN((efi_guid_t *)buf, sizeof(*vnd));
+ u32 phys_name, phys_vendor, phys_data;
+ efi_status_t status;
+ unsigned long flags;
+
+ spin_lock_irqsave(&efi_runtime_lock, flags);
+
+ *vnd = *vendor;
+
+ phys_name = virt_to_phys_or_null_size(name, efi_name_size(name));
+ phys_vendor = virt_to_phys_or_null(vnd);
+ phys_data = virt_to_phys_or_null_size(data, data_size);
+
+ if (!phys_name || (data && !phys_data))
+ status = EFI_INVALID_PARAMETER;
+ else
+ status = efi_thunk(set_variable, phys_name, phys_vendor,
+ attr, data_size, phys_data);
+
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
+
+ return status;
+}
+
+static efi_status_t
+efi_thunk_set_variable_nonblocking(efi_char16_t *name, efi_guid_t *vendor,
+ u32 attr, unsigned long data_size,
+ void *data)
+{
+ u8 buf[24] __aligned(8);
+ efi_guid_t *vnd = PTR_ALIGN((efi_guid_t *)buf, sizeof(*vnd));
+ u32 phys_name, phys_vendor, phys_data;
+ efi_status_t status;
+ unsigned long flags;
+
+ if (!spin_trylock_irqsave(&efi_runtime_lock, flags))
+ return EFI_NOT_READY;
+
+ *vnd = *vendor;
+
+ phys_name = virt_to_phys_or_null_size(name, efi_name_size(name));
+ phys_vendor = virt_to_phys_or_null(vnd);
+ phys_data = virt_to_phys_or_null_size(data, data_size);
+
+ if (!phys_name || (data && !phys_data))
+ status = EFI_INVALID_PARAMETER;
+ else
+ status = efi_thunk(set_variable, phys_name, phys_vendor,
+ attr, data_size, phys_data);
+
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
+
+ return status;
+}
+
+static efi_status_t
+efi_thunk_get_next_variable(unsigned long *name_size,
+ efi_char16_t *name,
+ efi_guid_t *vendor)
+{
+ u8 buf[24] __aligned(8);
+ efi_guid_t *vnd = PTR_ALIGN((efi_guid_t *)buf, sizeof(*vnd));
+ efi_status_t status;
+ u32 phys_name_size, phys_name, phys_vendor;
+ unsigned long flags;
+
+ spin_lock_irqsave(&efi_runtime_lock, flags);
+
+ *vnd = *vendor;
+
+ phys_name_size = virt_to_phys_or_null(name_size);
+ phys_vendor = virt_to_phys_or_null(vnd);
+ phys_name = virt_to_phys_or_null_size(name, *name_size);
+
+ if (!phys_name)
+ status = EFI_INVALID_PARAMETER;
+ else
+ status = efi_thunk(get_next_variable, phys_name_size,
+ phys_name, phys_vendor);
+
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
+
+ *vendor = *vnd;
+ return status;
+}
+
+static efi_status_t
+efi_thunk_get_next_high_mono_count(u32 *count)
+{
+ efi_status_t status;
+ u32 phys_count;
+ unsigned long flags;
+
+ spin_lock_irqsave(&efi_runtime_lock, flags);
+
+ phys_count = virt_to_phys_or_null(count);
+ status = efi_thunk(get_next_high_mono_count, phys_count);
+
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
+
+ return status;
+}
+
+static void
+efi_thunk_reset_system(int reset_type, efi_status_t status,
+ unsigned long data_size, efi_char16_t *data)
+{
+ u32 phys_data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&efi_runtime_lock, flags);
+
+ phys_data = virt_to_phys_or_null_size(data, data_size);
+
+ efi_thunk(reset_system, reset_type, status, data_size, phys_data);
+
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
+}
+
+static efi_status_t
+efi_thunk_update_capsule(efi_capsule_header_t **capsules,
+ unsigned long count, unsigned long sg_list)
+{
+ /*
+ * To properly support this function we would need to repackage
+ * 'capsules' because the firmware doesn't understand 64-bit
+ * pointers.
+ */
+ return EFI_UNSUPPORTED;
+}
+
+static efi_status_t
+efi_thunk_query_variable_info(u32 attr, u64 *storage_space,
+ u64 *remaining_space,
+ u64 *max_variable_size)
+{
+ efi_status_t status;
+ u32 phys_storage, phys_remaining, phys_max;
+ unsigned long flags;
+
+ if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+ return EFI_UNSUPPORTED;
+
+ spin_lock_irqsave(&efi_runtime_lock, flags);
+
+ phys_storage = virt_to_phys_or_null(storage_space);
+ phys_remaining = virt_to_phys_or_null(remaining_space);
+ phys_max = virt_to_phys_or_null(max_variable_size);
+
+ status = efi_thunk(query_variable_info, attr, phys_storage,
+ phys_remaining, phys_max);
+
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
+
+ return status;
+}
+
+static efi_status_t
+efi_thunk_query_variable_info_nonblocking(u32 attr, u64 *storage_space,
+ u64 *remaining_space,
+ u64 *max_variable_size)
+{
+ efi_status_t status;
+ u32 phys_storage, phys_remaining, phys_max;
+ unsigned long flags;
+
+ if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+ return EFI_UNSUPPORTED;
+
+ if (!spin_trylock_irqsave(&efi_runtime_lock, flags))
+ return EFI_NOT_READY;
+
+ phys_storage = virt_to_phys_or_null(storage_space);
+ phys_remaining = virt_to_phys_or_null(remaining_space);
+ phys_max = virt_to_phys_or_null(max_variable_size);
+
+ status = efi_thunk(query_variable_info, attr, phys_storage,
+ phys_remaining, phys_max);
+
+ spin_unlock_irqrestore(&efi_runtime_lock, flags);
+
+ return status;
+}
+
+static efi_status_t
+efi_thunk_query_capsule_caps(efi_capsule_header_t **capsules,
+ unsigned long count, u64 *max_size,
+ int *reset_type)
+{
+ /*
+ * To properly support this function we would need to repackage
+ * 'capsules' because the firmware doesn't understand 64-bit
+ * pointers.
+ */
+ return EFI_UNSUPPORTED;
+}
+
+void efi_thunk_runtime_setup(void)
+{
+ efi.get_time = efi_thunk_get_time;
+ efi.set_time = efi_thunk_set_time;
+ efi.get_wakeup_time = efi_thunk_get_wakeup_time;
+ efi.set_wakeup_time = efi_thunk_set_wakeup_time;
+ efi.get_variable = efi_thunk_get_variable;
+ efi.get_next_variable = efi_thunk_get_next_variable;
+ efi.set_variable = efi_thunk_set_variable;
+ efi.set_variable_nonblocking = efi_thunk_set_variable_nonblocking;
+ efi.get_next_high_mono_count = efi_thunk_get_next_high_mono_count;
+ efi.reset_system = efi_thunk_reset_system;
+ efi.query_variable_info = efi_thunk_query_variable_info;
+ efi.query_variable_info_nonblocking = efi_thunk_query_variable_info_nonblocking;
+ efi.update_capsule = efi_thunk_update_capsule;
+ efi.query_capsule_caps = efi_thunk_query_capsule_caps;
+}
+#endif /* CONFIG_EFI_MIXED */
diff --git a/arch/x86/platform/efi/efi_stub_32.S b/arch/x86/platform/efi/efi_stub_32.S
new file mode 100644
index 000000000..ab2e91e76
--- /dev/null
+++ b/arch/x86/platform/efi/efi_stub_32.S
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * EFI call stub for IA32.
+ *
+ * This stub allows us to make EFI calls in physical mode with interrupts
+ * turned off.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+
+/*
+ * efi_call_phys(void *, ...) is a function with variable parameters.
+ * All the callers of this function assure that all the parameters are 4-bytes.
+ */
+
+/*
+ * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save.
+ * So we'd better save all of them at the beginning of this function and restore
+ * at the end no matter how many we use, because we can not assure EFI runtime
+ * service functions will comply with gcc calling convention, too.
+ */
+
+.text
+ENTRY(efi_call_phys)
+ /*
+ * 0. The function can only be called in Linux kernel. So CS has been
+ * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found
+ * the values of these registers are the same. And, the corresponding
+ * GDT entries are identical. So I will do nothing about segment reg
+ * and GDT, but change GDT base register in prolog and epilog.
+ */
+
+ /*
+ * 1. Now I am running with EIP = <physical address> + PAGE_OFFSET.
+ * But to make it smoothly switch from virtual mode to flat mode.
+ * The mapping of lower virtual memory has been created in prolog and
+ * epilog.
+ */
+ movl $1f, %edx
+ subl $__PAGE_OFFSET, %edx
+ jmp *%edx
+1:
+
+ /*
+ * 2. Now on the top of stack is the return
+ * address in the caller of efi_call_phys(), then parameter 1,
+ * parameter 2, ..., param n. To make things easy, we save the return
+ * address of efi_call_phys in a global variable.
+ */
+ popl %edx
+ movl %edx, saved_return_addr
+ /* get the function pointer into ECX*/
+ popl %ecx
+ movl %ecx, efi_rt_function_ptr
+ movl $2f, %edx
+ subl $__PAGE_OFFSET, %edx
+ pushl %edx
+
+ /*
+ * 3. Clear PG bit in %CR0.
+ */
+ movl %cr0, %edx
+ andl $0x7fffffff, %edx
+ movl %edx, %cr0
+ jmp 1f
+1:
+
+ /*
+ * 4. Adjust stack pointer.
+ */
+ subl $__PAGE_OFFSET, %esp
+
+ /*
+ * 5. Call the physical function.
+ */
+ jmp *%ecx
+
+2:
+ /*
+ * 6. After EFI runtime service returns, control will return to
+ * following instruction. We'd better readjust stack pointer first.
+ */
+ addl $__PAGE_OFFSET, %esp
+
+ /*
+ * 7. Restore PG bit
+ */
+ movl %cr0, %edx
+ orl $0x80000000, %edx
+ movl %edx, %cr0
+ jmp 1f
+1:
+ /*
+ * 8. Now restore the virtual mode from flat mode by
+ * adding EIP with PAGE_OFFSET.
+ */
+ movl $1f, %edx
+ jmp *%edx
+1:
+
+ /*
+ * 9. Balance the stack. And because EAX contain the return value,
+ * we'd better not clobber it.
+ */
+ leal efi_rt_function_ptr, %edx
+ movl (%edx), %ecx
+ pushl %ecx
+
+ /*
+ * 10. Push the saved return address onto the stack and return.
+ */
+ leal saved_return_addr, %edx
+ movl (%edx), %ecx
+ pushl %ecx
+ ret
+ENDPROC(efi_call_phys)
+.previous
+
+.data
+saved_return_addr:
+ .long 0
+efi_rt_function_ptr:
+ .long 0
diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
new file mode 100644
index 000000000..74628ec78
--- /dev/null
+++ b/arch/x86/platform/efi/efi_stub_64.S
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Function calling ABI conversion from Linux to EFI for x86_64
+ *
+ * Copyright (C) 2007 Intel Corp
+ * Bibo Mao <bibo.mao@intel.com>
+ * Huang Ying <ying.huang@intel.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/msr.h>
+#include <asm/processor-flags.h>
+#include <asm/page_types.h>
+
+#define SAVE_XMM \
+ mov %rsp, %rax; \
+ subq $0x70, %rsp; \
+ and $~0xf, %rsp; \
+ mov %rax, (%rsp); \
+ mov %cr0, %rax; \
+ clts; \
+ mov %rax, 0x8(%rsp); \
+ movaps %xmm0, 0x60(%rsp); \
+ movaps %xmm1, 0x50(%rsp); \
+ movaps %xmm2, 0x40(%rsp); \
+ movaps %xmm3, 0x30(%rsp); \
+ movaps %xmm4, 0x20(%rsp); \
+ movaps %xmm5, 0x10(%rsp)
+
+#define RESTORE_XMM \
+ movaps 0x60(%rsp), %xmm0; \
+ movaps 0x50(%rsp), %xmm1; \
+ movaps 0x40(%rsp), %xmm2; \
+ movaps 0x30(%rsp), %xmm3; \
+ movaps 0x20(%rsp), %xmm4; \
+ movaps 0x10(%rsp), %xmm5; \
+ mov 0x8(%rsp), %rsi; \
+ mov %rsi, %cr0; \
+ mov (%rsp), %rsp
+
+ENTRY(efi_call)
+ pushq %rbp
+ movq %rsp, %rbp
+ SAVE_XMM
+ mov 16(%rbp), %rax
+ subq $48, %rsp
+ mov %r9, 32(%rsp)
+ mov %rax, 40(%rsp)
+ mov %r8, %r9
+ mov %rcx, %r8
+ mov %rsi, %rcx
+ call *%rdi
+ addq $48, %rsp
+ RESTORE_XMM
+ popq %rbp
+ ret
+ENDPROC(efi_call)
diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S
new file mode 100644
index 000000000..46c58b087
--- /dev/null
+++ b/arch/x86/platform/efi/efi_thunk_64.S
@@ -0,0 +1,153 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2014 Intel Corporation; author Matt Fleming
+ *
+ * Support for invoking 32-bit EFI runtime services from a 64-bit
+ * kernel.
+ *
+ * The below thunking functions are only used after ExitBootServices()
+ * has been called. This simplifies things considerably as compared with
+ * the early EFI thunking because we can leave all the kernel state
+ * intact (GDT, IDT, etc) and simply invoke the the 32-bit EFI runtime
+ * services from __KERNEL32_CS. This means we can continue to service
+ * interrupts across an EFI mixed mode call.
+ *
+ * We do however, need to handle the fact that we're running in a full
+ * 64-bit virtual address space. Things like the stack and instruction
+ * addresses need to be accessible by the 32-bit firmware, so we rely on
+ * using the identity mappings in the EFI page table to access the stack
+ * and kernel text (see efi_setup_page_tables()).
+ */
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+#include <asm/segment.h>
+
+ .text
+ .code64
+ENTRY(efi64_thunk)
+ push %rbp
+ push %rbx
+
+ /*
+ * Switch to 1:1 mapped 32-bit stack pointer.
+ */
+ movq %rsp, efi_saved_sp(%rip)
+ movq efi_scratch(%rip), %rsp
+
+ /*
+ * Calculate the physical address of the kernel text.
+ */
+ movq $__START_KERNEL_map, %rax
+ subq phys_base(%rip), %rax
+
+ /*
+ * Push some physical addresses onto the stack. This is easier
+ * to do now in a code64 section while the assembler can address
+ * 64-bit values. Note that all the addresses on the stack are
+ * 32-bit.
+ */
+ subq $16, %rsp
+ leaq efi_exit32(%rip), %rbx
+ subq %rax, %rbx
+ movl %ebx, 8(%rsp)
+
+ leaq __efi64_thunk(%rip), %rbx
+ subq %rax, %rbx
+ call *%rbx
+
+ movq efi_saved_sp(%rip), %rsp
+ pop %rbx
+ pop %rbp
+ retq
+ENDPROC(efi64_thunk)
+
+/*
+ * We run this function from the 1:1 mapping.
+ *
+ * This function must be invoked with a 1:1 mapped stack.
+ */
+ENTRY(__efi64_thunk)
+ movl %ds, %eax
+ push %rax
+ movl %es, %eax
+ push %rax
+ movl %ss, %eax
+ push %rax
+
+ subq $32, %rsp
+ movl %esi, 0x0(%rsp)
+ movl %edx, 0x4(%rsp)
+ movl %ecx, 0x8(%rsp)
+ movq %r8, %rsi
+ movl %esi, 0xc(%rsp)
+ movq %r9, %rsi
+ movl %esi, 0x10(%rsp)
+
+ leaq 1f(%rip), %rbx
+ movq %rbx, func_rt_ptr(%rip)
+
+ /* Switch to 32-bit descriptor */
+ pushq $__KERNEL32_CS
+ leaq efi_enter32(%rip), %rax
+ pushq %rax
+ lretq
+
+1: addq $32, %rsp
+
+ pop %rbx
+ movl %ebx, %ss
+ pop %rbx
+ movl %ebx, %es
+ pop %rbx
+ movl %ebx, %ds
+
+ /*
+ * Convert 32-bit status code into 64-bit.
+ */
+ test %rax, %rax
+ jz 1f
+ movl %eax, %ecx
+ andl $0x0fffffff, %ecx
+ andl $0xf0000000, %eax
+ shl $32, %rax
+ or %rcx, %rax
+1:
+ ret
+ENDPROC(__efi64_thunk)
+
+ENTRY(efi_exit32)
+ movq func_rt_ptr(%rip), %rax
+ push %rax
+ mov %rdi, %rax
+ ret
+ENDPROC(efi_exit32)
+
+ .code32
+/*
+ * EFI service pointer must be in %edi.
+ *
+ * The stack should represent the 32-bit calling convention.
+ */
+ENTRY(efi_enter32)
+ movl $__KERNEL_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %ss
+
+ call *%edi
+
+ /* We must preserve return value */
+ movl %eax, %edi
+
+ movl 72(%esp), %eax
+ pushl $__KERNEL_CS
+ pushl %eax
+
+ lret
+ENDPROC(efi_enter32)
+
+ .data
+ .balign 8
+func_rt_ptr: .quad 0
+efi_saved_sp: .quad 0
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
new file mode 100644
index 000000000..006eb09e9
--- /dev/null
+++ b/arch/x86/platform/efi/quirks.c
@@ -0,0 +1,655 @@
+#define pr_fmt(fmt) "efi: " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/time.h>
+#include <linux/types.h>
+#include <linux/efi.h>
+#include <linux/slab.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
+#include <linux/acpi.h>
+#include <linux/dmi.h>
+
+#include <asm/e820/api.h>
+#include <asm/efi.h>
+#include <asm/uv/uv.h>
+#include <asm/cpu_device_id.h>
+
+#define EFI_MIN_RESERVE 5120
+
+#define EFI_DUMMY_GUID \
+ EFI_GUID(0x4424ac57, 0xbe4b, 0x47dd, 0x9e, 0x97, 0xed, 0x50, 0xf0, 0x9f, 0x92, 0xa9)
+
+#define QUARK_CSH_SIGNATURE 0x5f435348 /* _CSH */
+#define QUARK_SECURITY_HEADER_SIZE 0x400
+
+/*
+ * Header prepended to the standard EFI capsule on Quark systems the are based
+ * on Intel firmware BSP.
+ * @csh_signature: Unique identifier to sanity check signed module
+ * presence ("_CSH").
+ * @version: Current version of CSH used. Should be one for Quark A0.
+ * @modulesize: Size of the entire module including the module header
+ * and payload.
+ * @security_version_number_index: Index of SVN to use for validation of signed
+ * module.
+ * @security_version_number: Used to prevent against roll back of modules.
+ * @rsvd_module_id: Currently unused for Clanton (Quark).
+ * @rsvd_module_vendor: Vendor Identifier. For Intel products value is
+ * 0x00008086.
+ * @rsvd_date: BCD representation of build date as yyyymmdd, where
+ * yyyy=4 digit year, mm=1-12, dd=1-31.
+ * @headersize: Total length of the header including including any
+ * padding optionally added by the signing tool.
+ * @hash_algo: What Hash is used in the module signing.
+ * @cryp_algo: What Crypto is used in the module signing.
+ * @keysize: Total length of the key data including including any
+ * padding optionally added by the signing tool.
+ * @signaturesize: Total length of the signature including including any
+ * padding optionally added by the signing tool.
+ * @rsvd_next_header: 32-bit pointer to the next Secure Boot Module in the
+ * chain, if there is a next header.
+ * @rsvd: Reserved, padding structure to required size.
+ *
+ * See also QuartSecurityHeader_t in
+ * Quark_EDKII_v1.2.1.1/QuarkPlatformPkg/Include/QuarkBootRom.h
+ * from https://downloadcenter.intel.com/download/23197/Intel-Quark-SoC-X1000-Board-Support-Package-BSP
+ */
+struct quark_security_header {
+ u32 csh_signature;
+ u32 version;
+ u32 modulesize;
+ u32 security_version_number_index;
+ u32 security_version_number;
+ u32 rsvd_module_id;
+ u32 rsvd_module_vendor;
+ u32 rsvd_date;
+ u32 headersize;
+ u32 hash_algo;
+ u32 cryp_algo;
+ u32 keysize;
+ u32 signaturesize;
+ u32 rsvd_next_header;
+ u32 rsvd[2];
+};
+
+static const efi_char16_t efi_dummy_name[] = L"DUMMY";
+
+static bool efi_no_storage_paranoia;
+
+/*
+ * Some firmware implementations refuse to boot if there's insufficient
+ * space in the variable store. The implementation of garbage collection
+ * in some FW versions causes stale (deleted) variables to take up space
+ * longer than intended and space is only freed once the store becomes
+ * almost completely full.
+ *
+ * Enabling this option disables the space checks in
+ * efi_query_variable_store() and forces garbage collection.
+ *
+ * Only enable this option if deleting EFI variables does not free up
+ * space in your variable store, e.g. if despite deleting variables
+ * you're unable to create new ones.
+ */
+static int __init setup_storage_paranoia(char *arg)
+{
+ efi_no_storage_paranoia = true;
+ return 0;
+}
+early_param("efi_no_storage_paranoia", setup_storage_paranoia);
+
+/*
+ * Deleting the dummy variable which kicks off garbage collection
+*/
+void efi_delete_dummy_variable(void)
+{
+ efi.set_variable_nonblocking((efi_char16_t *)efi_dummy_name,
+ &EFI_DUMMY_GUID,
+ EFI_VARIABLE_NON_VOLATILE |
+ EFI_VARIABLE_BOOTSERVICE_ACCESS |
+ EFI_VARIABLE_RUNTIME_ACCESS, 0, NULL);
+}
+
+/*
+ * In the nonblocking case we do not attempt to perform garbage
+ * collection if we do not have enough free space. Rather, we do the
+ * bare minimum check and give up immediately if the available space
+ * is below EFI_MIN_RESERVE.
+ *
+ * This function is intended to be small and simple because it is
+ * invoked from crash handler paths.
+ */
+static efi_status_t
+query_variable_store_nonblocking(u32 attributes, unsigned long size)
+{
+ efi_status_t status;
+ u64 storage_size, remaining_size, max_size;
+
+ status = efi.query_variable_info_nonblocking(attributes, &storage_size,
+ &remaining_size,
+ &max_size);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ if (remaining_size - size < EFI_MIN_RESERVE)
+ return EFI_OUT_OF_RESOURCES;
+
+ return EFI_SUCCESS;
+}
+
+/*
+ * Some firmware implementations refuse to boot if there's insufficient space
+ * in the variable store. Ensure that we never use more than a safe limit.
+ *
+ * Return EFI_SUCCESS if it is safe to write 'size' bytes to the variable
+ * store.
+ */
+efi_status_t efi_query_variable_store(u32 attributes, unsigned long size,
+ bool nonblocking)
+{
+ efi_status_t status;
+ u64 storage_size, remaining_size, max_size;
+
+ if (!(attributes & EFI_VARIABLE_NON_VOLATILE))
+ return 0;
+
+ if (nonblocking)
+ return query_variable_store_nonblocking(attributes, size);
+
+ status = efi.query_variable_info(attributes, &storage_size,
+ &remaining_size, &max_size);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ /*
+ * We account for that by refusing the write if permitting it would
+ * reduce the available space to under 5KB. This figure was provided by
+ * Samsung, so should be safe.
+ */
+ if ((remaining_size - size < EFI_MIN_RESERVE) &&
+ !efi_no_storage_paranoia) {
+
+ /*
+ * Triggering garbage collection may require that the firmware
+ * generate a real EFI_OUT_OF_RESOURCES error. We can force
+ * that by attempting to use more space than is available.
+ */
+ unsigned long dummy_size = remaining_size + 1024;
+ void *dummy = kzalloc(dummy_size, GFP_KERNEL);
+
+ if (!dummy)
+ return EFI_OUT_OF_RESOURCES;
+
+ status = efi.set_variable((efi_char16_t *)efi_dummy_name,
+ &EFI_DUMMY_GUID,
+ EFI_VARIABLE_NON_VOLATILE |
+ EFI_VARIABLE_BOOTSERVICE_ACCESS |
+ EFI_VARIABLE_RUNTIME_ACCESS,
+ dummy_size, dummy);
+
+ if (status == EFI_SUCCESS) {
+ /*
+ * This should have failed, so if it didn't make sure
+ * that we delete it...
+ */
+ efi_delete_dummy_variable();
+ }
+
+ kfree(dummy);
+
+ /*
+ * The runtime code may now have triggered a garbage collection
+ * run, so check the variable info again
+ */
+ status = efi.query_variable_info(attributes, &storage_size,
+ &remaining_size, &max_size);
+
+ if (status != EFI_SUCCESS)
+ return status;
+
+ /*
+ * There still isn't enough room, so return an error
+ */
+ if (remaining_size - size < EFI_MIN_RESERVE)
+ return EFI_OUT_OF_RESOURCES;
+ }
+
+ return EFI_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(efi_query_variable_store);
+
+/*
+ * The UEFI specification makes it clear that the operating system is
+ * free to do whatever it wants with boot services code after
+ * ExitBootServices() has been called. Ignoring this recommendation a
+ * significant bunch of EFI implementations continue calling into boot
+ * services code (SetVirtualAddressMap). In order to work around such
+ * buggy implementations we reserve boot services region during EFI
+ * init and make sure it stays executable. Then, after
+ * SetVirtualAddressMap(), it is discarded.
+ *
+ * However, some boot services regions contain data that is required
+ * by drivers, so we need to track which memory ranges can never be
+ * freed. This is done by tagging those regions with the
+ * EFI_MEMORY_RUNTIME attribute.
+ *
+ * Any driver that wants to mark a region as reserved must use
+ * efi_mem_reserve() which will insert a new EFI memory descriptor
+ * into efi.memmap (splitting existing regions if necessary) and tag
+ * it with EFI_MEMORY_RUNTIME.
+ */
+void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size)
+{
+ phys_addr_t new_phys, new_size;
+ struct efi_mem_range mr;
+ efi_memory_desc_t md;
+ int num_entries;
+ void *new;
+
+ if (efi_mem_desc_lookup(addr, &md) ||
+ md.type != EFI_BOOT_SERVICES_DATA) {
+ pr_err("Failed to lookup EFI memory descriptor for %pa\n", &addr);
+ return;
+ }
+
+ if (addr + size > md.phys_addr + (md.num_pages << EFI_PAGE_SHIFT)) {
+ pr_err("Region spans EFI memory descriptors, %pa\n", &addr);
+ return;
+ }
+
+ size += addr % EFI_PAGE_SIZE;
+ size = round_up(size, EFI_PAGE_SIZE);
+ addr = round_down(addr, EFI_PAGE_SIZE);
+
+ mr.range.start = addr;
+ mr.range.end = addr + size - 1;
+ mr.attribute = md.attribute | EFI_MEMORY_RUNTIME;
+
+ num_entries = efi_memmap_split_count(&md, &mr.range);
+ num_entries += efi.memmap.nr_map;
+
+ new_size = efi.memmap.desc_size * num_entries;
+
+ new_phys = efi_memmap_alloc(num_entries);
+ if (!new_phys) {
+ pr_err("Could not allocate boot services memmap\n");
+ return;
+ }
+
+ new = early_memremap_prot(new_phys, new_size,
+ pgprot_val(pgprot_encrypted(FIXMAP_PAGE_NORMAL)));
+ if (!new) {
+ pr_err("Failed to map new boot services memmap\n");
+ return;
+ }
+
+ efi_memmap_insert(&efi.memmap, new, &mr);
+ early_memunmap(new, new_size);
+
+ efi_memmap_install(new_phys, num_entries);
+ e820__range_update(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
+ e820__update_table(e820_table);
+}
+
+/*
+ * Helper function for efi_reserve_boot_services() to figure out if we
+ * can free regions in efi_free_boot_services().
+ *
+ * Use this function to ensure we do not free regions owned by somebody
+ * else. We must only reserve (and then free) regions:
+ *
+ * - Not within any part of the kernel
+ * - Not the BIOS reserved area (E820_TYPE_RESERVED, E820_TYPE_NVS, etc)
+ */
+static bool can_free_region(u64 start, u64 size)
+{
+ if (start + size > __pa_symbol(_text) && start <= __pa_symbol(_end))
+ return false;
+
+ if (!e820__mapped_all(start, start+size, E820_TYPE_RAM))
+ return false;
+
+ return true;
+}
+
+void __init efi_reserve_boot_services(void)
+{
+ efi_memory_desc_t *md;
+
+ for_each_efi_memory_desc(md) {
+ u64 start = md->phys_addr;
+ u64 size = md->num_pages << EFI_PAGE_SHIFT;
+ bool already_reserved;
+
+ if (md->type != EFI_BOOT_SERVICES_CODE &&
+ md->type != EFI_BOOT_SERVICES_DATA)
+ continue;
+
+ already_reserved = memblock_is_region_reserved(start, size);
+
+ /*
+ * Because the following memblock_reserve() is paired
+ * with free_bootmem_late() for this region in
+ * efi_free_boot_services(), we must be extremely
+ * careful not to reserve, and subsequently free,
+ * critical regions of memory (like the kernel image) or
+ * those regions that somebody else has already
+ * reserved.
+ *
+ * A good example of a critical region that must not be
+ * freed is page zero (first 4Kb of memory), which may
+ * contain boot services code/data but is marked
+ * E820_TYPE_RESERVED by trim_bios_range().
+ */
+ if (!already_reserved) {
+ memblock_reserve(start, size);
+
+ /*
+ * If we are the first to reserve the region, no
+ * one else cares about it. We own it and can
+ * free it later.
+ */
+ if (can_free_region(start, size))
+ continue;
+ }
+
+ /*
+ * We don't own the region. We must not free it.
+ *
+ * Setting this bit for a boot services region really
+ * doesn't make sense as far as the firmware is
+ * concerned, but it does provide us with a way to tag
+ * those regions that must not be paired with
+ * free_bootmem_late().
+ */
+ md->attribute |= EFI_MEMORY_RUNTIME;
+ }
+}
+
+void __init efi_free_boot_services(void)
+{
+ phys_addr_t new_phys, new_size;
+ efi_memory_desc_t *md;
+ int num_entries = 0;
+ void *new, *new_md;
+
+ for_each_efi_memory_desc(md) {
+ unsigned long long start = md->phys_addr;
+ unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+ size_t rm_size;
+
+ if (md->type != EFI_BOOT_SERVICES_CODE &&
+ md->type != EFI_BOOT_SERVICES_DATA) {
+ num_entries++;
+ continue;
+ }
+
+ /* Do not free, someone else owns it: */
+ if (md->attribute & EFI_MEMORY_RUNTIME) {
+ num_entries++;
+ continue;
+ }
+
+ /*
+ * Nasty quirk: if all sub-1MB memory is used for boot
+ * services, we can get here without having allocated the
+ * real mode trampoline. It's too late to hand boot services
+ * memory back to the memblock allocator, so instead
+ * try to manually allocate the trampoline if needed.
+ *
+ * I've seen this on a Dell XPS 13 9350 with firmware
+ * 1.4.4 with SGX enabled booting Linux via Fedora 24's
+ * grub2-efi on a hard disk. (And no, I don't know why
+ * this happened, but Linux should still try to boot rather
+ * panicing early.)
+ */
+ rm_size = real_mode_size_needed();
+ if (rm_size && (start + rm_size) < (1<<20) && size >= rm_size) {
+ set_real_mode_mem(start, rm_size);
+ start += rm_size;
+ size -= rm_size;
+ }
+
+ free_bootmem_late(start, size);
+ }
+
+ if (!num_entries)
+ return;
+
+ new_size = efi.memmap.desc_size * num_entries;
+ new_phys = efi_memmap_alloc(num_entries);
+ if (!new_phys) {
+ pr_err("Failed to allocate new EFI memmap\n");
+ return;
+ }
+
+ new = memremap(new_phys, new_size, MEMREMAP_WB);
+ if (!new) {
+ pr_err("Failed to map new EFI memmap\n");
+ return;
+ }
+
+ /*
+ * Build a new EFI memmap that excludes any boot services
+ * regions that are not tagged EFI_MEMORY_RUNTIME, since those
+ * regions have now been freed.
+ */
+ new_md = new;
+ for_each_efi_memory_desc(md) {
+ if (!(md->attribute & EFI_MEMORY_RUNTIME) &&
+ (md->type == EFI_BOOT_SERVICES_CODE ||
+ md->type == EFI_BOOT_SERVICES_DATA))
+ continue;
+
+ memcpy(new_md, md, efi.memmap.desc_size);
+ new_md += efi.memmap.desc_size;
+ }
+
+ memunmap(new);
+
+ if (efi_memmap_install(new_phys, num_entries)) {
+ pr_err("Could not install new EFI memmap\n");
+ return;
+ }
+}
+
+/*
+ * A number of config table entries get remapped to virtual addresses
+ * after entering EFI virtual mode. However, the kexec kernel requires
+ * their physical addresses therefore we pass them via setup_data and
+ * correct those entries to their respective physical addresses here.
+ *
+ * Currently only handles smbios which is necessary for some firmware
+ * implementation.
+ */
+int __init efi_reuse_config(u64 tables, int nr_tables)
+{
+ int i, sz, ret = 0;
+ void *p, *tablep;
+ struct efi_setup_data *data;
+
+ if (!efi_setup)
+ return 0;
+
+ if (!efi_enabled(EFI_64BIT))
+ return 0;
+
+ data = early_memremap(efi_setup, sizeof(*data));
+ if (!data) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (!data->smbios)
+ goto out_memremap;
+
+ sz = sizeof(efi_config_table_64_t);
+
+ p = tablep = early_memremap(tables, nr_tables * sz);
+ if (!p) {
+ pr_err("Could not map Configuration table!\n");
+ ret = -ENOMEM;
+ goto out_memremap;
+ }
+
+ for (i = 0; i < efi.systab->nr_tables; i++) {
+ efi_guid_t guid;
+
+ guid = ((efi_config_table_64_t *)p)->guid;
+
+ if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID))
+ ((efi_config_table_64_t *)p)->table = data->smbios;
+ p += sz;
+ }
+ early_memunmap(tablep, nr_tables * sz);
+
+out_memremap:
+ early_memunmap(data, sizeof(*data));
+out:
+ return ret;
+}
+
+static const struct dmi_system_id sgi_uv1_dmi[] = {
+ { NULL, "SGI UV1",
+ { DMI_MATCH(DMI_PRODUCT_NAME, "Stoutland Platform"),
+ DMI_MATCH(DMI_PRODUCT_VERSION, "1.0"),
+ DMI_MATCH(DMI_BIOS_VENDOR, "SGI.COM"),
+ }
+ },
+ { } /* NULL entry stops DMI scanning */
+};
+
+void __init efi_apply_memmap_quirks(void)
+{
+ /*
+ * Once setup is done earlier, unmap the EFI memory map on mismatched
+ * firmware/kernel architectures since there is no support for runtime
+ * services.
+ */
+ if (!efi_runtime_supported()) {
+ pr_info("Setup done, disabling due to 32/64-bit mismatch\n");
+ efi_memmap_unmap();
+ }
+
+ /* UV2+ BIOS has a fix for this issue. UV1 still needs the quirk. */
+ if (dmi_check_system(sgi_uv1_dmi))
+ set_bit(EFI_OLD_MEMMAP, &efi.flags);
+}
+
+/*
+ * For most modern platforms the preferred method of powering off is via
+ * ACPI. However, there are some that are known to require the use of
+ * EFI runtime services and for which ACPI does not work at all.
+ *
+ * Using EFI is a last resort, to be used only if no other option
+ * exists.
+ */
+bool efi_reboot_required(void)
+{
+ if (!acpi_gbl_reduced_hardware)
+ return false;
+
+ efi_reboot_quirk_mode = EFI_RESET_WARM;
+ return true;
+}
+
+bool efi_poweroff_required(void)
+{
+ return acpi_gbl_reduced_hardware || acpi_no_s5;
+}
+
+#ifdef CONFIG_EFI_CAPSULE_QUIRK_QUARK_CSH
+
+static int qrk_capsule_setup_info(struct capsule_info *cap_info, void **pkbuff,
+ size_t hdr_bytes)
+{
+ struct quark_security_header *csh = *pkbuff;
+
+ /* Only process data block that is larger than the security header */
+ if (hdr_bytes < sizeof(struct quark_security_header))
+ return 0;
+
+ if (csh->csh_signature != QUARK_CSH_SIGNATURE ||
+ csh->headersize != QUARK_SECURITY_HEADER_SIZE)
+ return 1;
+
+ /* Only process data block if EFI header is included */
+ if (hdr_bytes < QUARK_SECURITY_HEADER_SIZE +
+ sizeof(efi_capsule_header_t))
+ return 0;
+
+ pr_debug("Quark security header detected\n");
+
+ if (csh->rsvd_next_header != 0) {
+ pr_err("multiple Quark security headers not supported\n");
+ return -EINVAL;
+ }
+
+ *pkbuff += csh->headersize;
+ cap_info->total_size = csh->headersize;
+
+ /*
+ * Update the first page pointer to skip over the CSH header.
+ */
+ cap_info->phys[0] += csh->headersize;
+
+ /*
+ * cap_info->capsule should point at a virtual mapping of the entire
+ * capsule, starting at the capsule header. Our image has the Quark
+ * security header prepended, so we cannot rely on the default vmap()
+ * mapping created by the generic capsule code.
+ * Given that the Quark firmware does not appear to care about the
+ * virtual mapping, let's just point cap_info->capsule at our copy
+ * of the capsule header.
+ */
+ cap_info->capsule = &cap_info->header;
+
+ return 1;
+}
+
+#define ICPU(family, model, quirk_handler) \
+ { X86_VENDOR_INTEL, family, model, X86_FEATURE_ANY, \
+ (unsigned long)&quirk_handler }
+
+static const struct x86_cpu_id efi_capsule_quirk_ids[] = {
+ ICPU(5, 9, qrk_capsule_setup_info), /* Intel Quark X1000 */
+ { }
+};
+
+int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff,
+ size_t hdr_bytes)
+{
+ int (*quirk_handler)(struct capsule_info *, void **, size_t);
+ const struct x86_cpu_id *id;
+ int ret;
+
+ if (hdr_bytes < sizeof(efi_capsule_header_t))
+ return 0;
+
+ cap_info->total_size = 0;
+
+ id = x86_match_cpu(efi_capsule_quirk_ids);
+ if (id) {
+ /*
+ * The quirk handler is supposed to return
+ * - a value > 0 if the setup should continue, after advancing
+ * kbuff as needed
+ * - 0 if not enough hdr_bytes are available yet
+ * - a negative error code otherwise
+ */
+ quirk_handler = (typeof(quirk_handler))id->driver_data;
+ ret = quirk_handler(cap_info, &kbuff, hdr_bytes);
+ if (ret <= 0)
+ return ret;
+ }
+
+ memcpy(&cap_info->header, kbuff, sizeof(cap_info->header));
+
+ cap_info->total_size += cap_info->header.imagesize;
+
+ return __efi_capsule_setup_info(cap_info);
+}
+
+#endif
diff --git a/arch/x86/platform/geode/Makefile b/arch/x86/platform/geode/Makefile
new file mode 100644
index 000000000..5b51194f4
--- /dev/null
+++ b/arch/x86/platform/geode/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_ALIX) += alix.o
+obj-$(CONFIG_NET5501) += net5501.o
+obj-$(CONFIG_GEOS) += geos.o
diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c
new file mode 100644
index 000000000..1865c196f
--- /dev/null
+++ b/arch/x86/platform/geode/alix.c
@@ -0,0 +1,200 @@
+/*
+ * System Specific setup for PCEngines ALIX.
+ * At the moment this means setup of GPIO control of LEDs
+ * on Alix.2/3/6 boards.
+ *
+ *
+ * Copyright (C) 2008 Constantin Baranov <const@mimas.ru>
+ * Copyright (C) 2011 Ed Wildgoose <kernel@wildgooses.com>
+ * and Philip Prindeville <philipp@redfish-solutions.com>
+ *
+ * TODO: There are large similarities with leds-net5501.c
+ * by Alessandro Zummo <a.zummo@towertech.it>
+ * In the future leds-net5501.c should be migrated over to platform
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/string.h>
+#include <linux/moduleparam.h>
+#include <linux/leds.h>
+#include <linux/platform_device.h>
+#include <linux/gpio.h>
+#include <linux/input.h>
+#include <linux/gpio_keys.h>
+#include <linux/dmi.h>
+
+#include <asm/geode.h>
+
+#define BIOS_SIGNATURE_TINYBIOS 0xf0000
+#define BIOS_SIGNATURE_COREBOOT 0x500
+#define BIOS_REGION_SIZE 0x10000
+
+/*
+ * This driver is not modular, but to keep back compatibility
+ * with existing use cases, continuing with module_param is
+ * the easiest way forward.
+ */
+static bool force = 0;
+module_param(force, bool, 0444);
+/* FIXME: Award bios is not automatically detected as Alix platform */
+MODULE_PARM_DESC(force, "Force detection as ALIX.2/ALIX.3 platform");
+
+static struct gpio_keys_button alix_gpio_buttons[] = {
+ {
+ .code = KEY_RESTART,
+ .gpio = 24,
+ .active_low = 1,
+ .desc = "Reset button",
+ .type = EV_KEY,
+ .wakeup = 0,
+ .debounce_interval = 100,
+ .can_disable = 0,
+ }
+};
+static struct gpio_keys_platform_data alix_buttons_data = {
+ .buttons = alix_gpio_buttons,
+ .nbuttons = ARRAY_SIZE(alix_gpio_buttons),
+ .poll_interval = 20,
+};
+
+static struct platform_device alix_buttons_dev = {
+ .name = "gpio-keys-polled",
+ .id = 1,
+ .dev = {
+ .platform_data = &alix_buttons_data,
+ }
+};
+
+static struct gpio_led alix_leds[] = {
+ {
+ .name = "alix:1",
+ .gpio = 6,
+ .default_trigger = "default-on",
+ .active_low = 1,
+ },
+ {
+ .name = "alix:2",
+ .gpio = 25,
+ .default_trigger = "default-off",
+ .active_low = 1,
+ },
+ {
+ .name = "alix:3",
+ .gpio = 27,
+ .default_trigger = "default-off",
+ .active_low = 1,
+ },
+};
+
+static struct gpio_led_platform_data alix_leds_data = {
+ .num_leds = ARRAY_SIZE(alix_leds),
+ .leds = alix_leds,
+};
+
+static struct platform_device alix_leds_dev = {
+ .name = "leds-gpio",
+ .id = -1,
+ .dev.platform_data = &alix_leds_data,
+};
+
+static struct platform_device *alix_devs[] __initdata = {
+ &alix_buttons_dev,
+ &alix_leds_dev,
+};
+
+static void __init register_alix(void)
+{
+ /* Setup LED control through leds-gpio driver */
+ platform_add_devices(alix_devs, ARRAY_SIZE(alix_devs));
+}
+
+static bool __init alix_present(unsigned long bios_phys,
+ const char *alix_sig,
+ size_t alix_sig_len)
+{
+ const size_t bios_len = BIOS_REGION_SIZE;
+ const char *bios_virt;
+ const char *scan_end;
+ const char *p;
+ char name[64];
+
+ if (force) {
+ printk(KERN_NOTICE "%s: forced to skip BIOS test, "
+ "assume system is ALIX.2/ALIX.3\n",
+ KBUILD_MODNAME);
+ return true;
+ }
+
+ bios_virt = phys_to_virt(bios_phys);
+ scan_end = bios_virt + bios_len - (alix_sig_len + 2);
+ for (p = bios_virt; p < scan_end; p++) {
+ const char *tail;
+ char *a;
+
+ if (memcmp(p, alix_sig, alix_sig_len) != 0)
+ continue;
+
+ memcpy(name, p, sizeof(name));
+
+ /* remove the first \0 character from string */
+ a = strchr(name, '\0');
+ if (a)
+ *a = ' ';
+
+ /* cut the string at a newline */
+ a = strchr(name, '\r');
+ if (a)
+ *a = '\0';
+
+ tail = p + alix_sig_len;
+ if ((tail[0] == '2' || tail[0] == '3' || tail[0] == '6')) {
+ printk(KERN_INFO
+ "%s: system is recognized as \"%s\"\n",
+ KBUILD_MODNAME, name);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool __init alix_present_dmi(void)
+{
+ const char *vendor, *product;
+
+ vendor = dmi_get_system_info(DMI_SYS_VENDOR);
+ if (!vendor || strcmp(vendor, "PC Engines"))
+ return false;
+
+ product = dmi_get_system_info(DMI_PRODUCT_NAME);
+ if (!product || (strcmp(product, "ALIX.2D") && strcmp(product, "ALIX.6")))
+ return false;
+
+ printk(KERN_INFO "%s: system is recognized as \"%s %s\"\n",
+ KBUILD_MODNAME, vendor, product);
+
+ return true;
+}
+
+static int __init alix_init(void)
+{
+ const char tinybios_sig[] = "PC Engines ALIX.";
+ const char coreboot_sig[] = "PC Engines\0ALIX.";
+
+ if (!is_geode())
+ return 0;
+
+ if (alix_present(BIOS_SIGNATURE_TINYBIOS, tinybios_sig, sizeof(tinybios_sig) - 1) ||
+ alix_present(BIOS_SIGNATURE_COREBOOT, coreboot_sig, sizeof(coreboot_sig) - 1) ||
+ alix_present_dmi())
+ register_alix();
+
+ return 0;
+}
+device_initcall(alix_init);
diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c
new file mode 100644
index 000000000..4fcdb9131
--- /dev/null
+++ b/arch/x86/platform/geode/geos.c
@@ -0,0 +1,122 @@
+/*
+ * System Specific setup for Traverse Technologies GEOS.
+ * At the moment this means setup of GPIO control of LEDs.
+ *
+ * Copyright (C) 2008 Constantin Baranov <const@mimas.ru>
+ * Copyright (C) 2011 Ed Wildgoose <kernel@wildgooses.com>
+ * and Philip Prindeville <philipp@redfish-solutions.com>
+ *
+ * TODO: There are large similarities with leds-net5501.c
+ * by Alessandro Zummo <a.zummo@towertech.it>
+ * In the future leds-net5501.c should be migrated over to platform
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/string.h>
+#include <linux/leds.h>
+#include <linux/platform_device.h>
+#include <linux/gpio.h>
+#include <linux/input.h>
+#include <linux/gpio_keys.h>
+#include <linux/dmi.h>
+
+#include <asm/geode.h>
+
+static struct gpio_keys_button geos_gpio_buttons[] = {
+ {
+ .code = KEY_RESTART,
+ .gpio = 3,
+ .active_low = 1,
+ .desc = "Reset button",
+ .type = EV_KEY,
+ .wakeup = 0,
+ .debounce_interval = 100,
+ .can_disable = 0,
+ }
+};
+static struct gpio_keys_platform_data geos_buttons_data = {
+ .buttons = geos_gpio_buttons,
+ .nbuttons = ARRAY_SIZE(geos_gpio_buttons),
+ .poll_interval = 20,
+};
+
+static struct platform_device geos_buttons_dev = {
+ .name = "gpio-keys-polled",
+ .id = 1,
+ .dev = {
+ .platform_data = &geos_buttons_data,
+ }
+};
+
+static struct gpio_led geos_leds[] = {
+ {
+ .name = "geos:1",
+ .gpio = 6,
+ .default_trigger = "default-on",
+ .active_low = 1,
+ },
+ {
+ .name = "geos:2",
+ .gpio = 25,
+ .default_trigger = "default-off",
+ .active_low = 1,
+ },
+ {
+ .name = "geos:3",
+ .gpio = 27,
+ .default_trigger = "default-off",
+ .active_low = 1,
+ },
+};
+
+static struct gpio_led_platform_data geos_leds_data = {
+ .num_leds = ARRAY_SIZE(geos_leds),
+ .leds = geos_leds,
+};
+
+static struct platform_device geos_leds_dev = {
+ .name = "leds-gpio",
+ .id = -1,
+ .dev.platform_data = &geos_leds_data,
+};
+
+static struct platform_device *geos_devs[] __initdata = {
+ &geos_buttons_dev,
+ &geos_leds_dev,
+};
+
+static void __init register_geos(void)
+{
+ /* Setup LED control through leds-gpio driver */
+ platform_add_devices(geos_devs, ARRAY_SIZE(geos_devs));
+}
+
+static int __init geos_init(void)
+{
+ const char *vendor, *product;
+
+ if (!is_geode())
+ return 0;
+
+ vendor = dmi_get_system_info(DMI_SYS_VENDOR);
+ if (!vendor || strcmp(vendor, "Traverse Technologies"))
+ return 0;
+
+ product = dmi_get_system_info(DMI_PRODUCT_NAME);
+ if (!product || strcmp(product, "Geos"))
+ return 0;
+
+ printk(KERN_INFO "%s: system is recognized as \"%s %s\"\n",
+ KBUILD_MODNAME, vendor, product);
+
+ register_geos();
+
+ return 0;
+}
+device_initcall(geos_init);
diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c
new file mode 100644
index 000000000..a2f6b982a
--- /dev/null
+++ b/arch/x86/platform/geode/net5501.c
@@ -0,0 +1,148 @@
+/*
+ * System Specific setup for Soekris net5501
+ * At the moment this means setup of GPIO control of LEDs and buttons
+ * on net5501 boards.
+ *
+ *
+ * Copyright (C) 2008-2009 Tower Technologies
+ * Written by Alessandro Zummo <a.zummo@towertech.it>
+ *
+ * Copyright (C) 2008 Constantin Baranov <const@mimas.ru>
+ * Copyright (C) 2011 Ed Wildgoose <kernel@wildgooses.com>
+ * and Philip Prindeville <philipp@redfish-solutions.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/string.h>
+#include <linux/leds.h>
+#include <linux/platform_device.h>
+#include <linux/gpio.h>
+#include <linux/input.h>
+#include <linux/gpio_keys.h>
+
+#include <asm/geode.h>
+
+#define BIOS_REGION_BASE 0xffff0000
+#define BIOS_REGION_SIZE 0x00010000
+
+static struct gpio_keys_button net5501_gpio_buttons[] = {
+ {
+ .code = KEY_RESTART,
+ .gpio = 24,
+ .active_low = 1,
+ .desc = "Reset button",
+ .type = EV_KEY,
+ .wakeup = 0,
+ .debounce_interval = 100,
+ .can_disable = 0,
+ }
+};
+static struct gpio_keys_platform_data net5501_buttons_data = {
+ .buttons = net5501_gpio_buttons,
+ .nbuttons = ARRAY_SIZE(net5501_gpio_buttons),
+ .poll_interval = 20,
+};
+
+static struct platform_device net5501_buttons_dev = {
+ .name = "gpio-keys-polled",
+ .id = 1,
+ .dev = {
+ .platform_data = &net5501_buttons_data,
+ }
+};
+
+static struct gpio_led net5501_leds[] = {
+ {
+ .name = "net5501:1",
+ .gpio = 6,
+ .default_trigger = "default-on",
+ .active_low = 0,
+ },
+};
+
+static struct gpio_led_platform_data net5501_leds_data = {
+ .num_leds = ARRAY_SIZE(net5501_leds),
+ .leds = net5501_leds,
+};
+
+static struct platform_device net5501_leds_dev = {
+ .name = "leds-gpio",
+ .id = -1,
+ .dev.platform_data = &net5501_leds_data,
+};
+
+static struct platform_device *net5501_devs[] __initdata = {
+ &net5501_buttons_dev,
+ &net5501_leds_dev,
+};
+
+static void __init register_net5501(void)
+{
+ /* Setup LED control through leds-gpio driver */
+ platform_add_devices(net5501_devs, ARRAY_SIZE(net5501_devs));
+}
+
+struct net5501_board {
+ u16 offset;
+ u16 len;
+ char *sig;
+};
+
+static struct net5501_board __initdata boards[] = {
+ { 0xb7b, 7, "net5501" }, /* net5501 v1.33/1.33c */
+ { 0xb1f, 7, "net5501" }, /* net5501 v1.32i */
+};
+
+static bool __init net5501_present(void)
+{
+ int i;
+ unsigned char *rombase, *bios;
+ bool found = false;
+
+ rombase = ioremap(BIOS_REGION_BASE, BIOS_REGION_SIZE - 1);
+ if (!rombase) {
+ printk(KERN_ERR "%s: failed to get rombase\n", KBUILD_MODNAME);
+ return found;
+ }
+
+ bios = rombase + 0x20; /* null terminated */
+
+ if (memcmp(bios, "comBIOS", 7))
+ goto unmap;
+
+ for (i = 0; i < ARRAY_SIZE(boards); i++) {
+ unsigned char *model = rombase + boards[i].offset;
+
+ if (!memcmp(model, boards[i].sig, boards[i].len)) {
+ printk(KERN_INFO "%s: system is recognized as \"%s\"\n",
+ KBUILD_MODNAME, model);
+
+ found = true;
+ break;
+ }
+ }
+
+unmap:
+ iounmap(rombase);
+ return found;
+}
+
+static int __init net5501_init(void)
+{
+ if (!is_geode())
+ return 0;
+
+ if (!net5501_present())
+ return 0;
+
+ register_net5501();
+
+ return 0;
+}
+device_initcall(net5501_init);
diff --git a/arch/x86/platform/goldfish/Makefile b/arch/x86/platform/goldfish/Makefile
new file mode 100644
index 000000000..f030b532f
--- /dev/null
+++ b/arch/x86/platform/goldfish/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_GOLDFISH) += goldfish.o
diff --git a/arch/x86/platform/goldfish/goldfish.c b/arch/x86/platform/goldfish/goldfish.c
new file mode 100644
index 000000000..0d17c0aaf
--- /dev/null
+++ b/arch/x86/platform/goldfish/goldfish.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (C) 2007 Google, Inc.
+ * Copyright (C) 2011 Intel, Inc.
+ * Copyright (C) 2013 Intel, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/platform_device.h>
+
+/*
+ * Where in virtual device memory the IO devices (timers, system controllers
+ * and so on)
+ */
+
+#define GOLDFISH_PDEV_BUS_BASE (0xff001000)
+#define GOLDFISH_PDEV_BUS_END (0xff7fffff)
+#define GOLDFISH_PDEV_BUS_IRQ (4)
+
+#define GOLDFISH_TTY_BASE (0x2000)
+
+static struct resource goldfish_pdev_bus_resources[] = {
+ {
+ .start = GOLDFISH_PDEV_BUS_BASE,
+ .end = GOLDFISH_PDEV_BUS_END,
+ .flags = IORESOURCE_MEM,
+ },
+ {
+ .start = GOLDFISH_PDEV_BUS_IRQ,
+ .end = GOLDFISH_PDEV_BUS_IRQ,
+ .flags = IORESOURCE_IRQ,
+ }
+};
+
+static bool goldfish_enable __initdata;
+
+static int __init goldfish_setup(char *str)
+{
+ goldfish_enable = true;
+ return 0;
+}
+__setup("goldfish", goldfish_setup);
+
+static int __init goldfish_init(void)
+{
+ if (!goldfish_enable)
+ return -ENODEV;
+
+ platform_device_register_simple("goldfish_pdev_bus", -1,
+ goldfish_pdev_bus_resources, 2);
+ return 0;
+}
+device_initcall(goldfish_init);
diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile
new file mode 100644
index 000000000..5cf886c86
--- /dev/null
+++ b/arch/x86/platform/intel-mid/Makefile
@@ -0,0 +1,6 @@
+obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o pwr.o
+
+# SFI specific code
+ifdef CONFIG_X86_INTEL_MID
+obj-$(CONFIG_SFI) += sfi.o device_libs/
+endif
diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile
new file mode 100644
index 000000000..480fed21c
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/Makefile
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: GPL-2.0
+# Family-Level Interface Shim (FLIS)
+obj-$(subst m,y,$(CONFIG_PINCTRL_MERRIFIELD)) += platform_mrfld_pinctrl.o
+# SDHCI Devices
+obj-$(subst m,y,$(CONFIG_MMC_SDHCI_PCI)) += platform_mrfld_sd.o
+# WiFi + BT
+obj-$(subst m,y,$(CONFIG_BRCMFMAC_SDIO)) += platform_bcm43xx.o
+obj-$(subst m,y,$(CONFIG_BT_HCIUART_BCM)) += platform_bt.o
+# IPC Devices
+obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic.o
+obj-$(subst m,y,$(CONFIG_SND_MFLD_MACHINE)) += platform_msic_audio.o
+obj-$(subst m,y,$(CONFIG_GPIO_MSIC)) += platform_msic_gpio.o
+obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic_ocd.o
+obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic_battery.o
+obj-$(subst m,y,$(CONFIG_INTEL_MID_POWER_BUTTON)) += platform_msic_power_btn.o
+obj-$(subst m,y,$(CONFIG_INTEL_MFLD_THERMAL)) += platform_msic_thermal.o
+# SPI Devices
+obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_mrfld_spidev.o
+# I2C Devices
+obj-$(subst m,y,$(CONFIG_SENSORS_EMC1403)) += platform_emc1403.o
+obj-$(subst m,y,$(CONFIG_SENSORS_LIS3LV02D)) += platform_lis331.o
+obj-$(subst m,y,$(CONFIG_MPU3050_I2C)) += platform_mpu3050.o
+obj-$(subst m,y,$(CONFIG_INPUT_BMA150)) += platform_bma023.o
+obj-$(subst m,y,$(CONFIG_DRM_MEDFIELD)) += platform_tc35876x.o
+# I2C GPIO Expanders
+obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_max7315.o
+obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_pcal9555a.o
+obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_tca6416.o
+# MISC Devices
+obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o
+obj-$(subst m,y,$(CONFIG_INTEL_MID_POWER_BUTTON)) += platform_mrfld_power_btn.o
+obj-$(subst m,y,$(CONFIG_RTC_DRV_CMOS)) += platform_mrfld_rtc.o
+obj-$(subst m,y,$(CONFIG_INTEL_MID_WATCHDOG)) += platform_mrfld_wdt.o
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
new file mode 100644
index 000000000..4392c15ed
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
@@ -0,0 +1,95 @@
+/*
+ * platform_bcm43xx.c: bcm43xx platform data initilization file
+ *
+ * (C) Copyright 2016 Intel Corporation
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/gpio.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/machine.h>
+#include <linux/regulator/fixed.h>
+#include <linux/sfi.h>
+
+#include <asm/intel-mid.h>
+
+#define WLAN_SFI_GPIO_IRQ_NAME "WLAN-interrupt"
+#define WLAN_SFI_GPIO_ENABLE_NAME "WLAN-enable"
+
+#define WLAN_DEV_NAME "0000:00:01.3"
+
+static struct regulator_consumer_supply bcm43xx_vmmc_supply = {
+ .dev_name = WLAN_DEV_NAME,
+ .supply = "vmmc",
+};
+
+static struct regulator_init_data bcm43xx_vmmc_data = {
+ .constraints = {
+ .valid_ops_mask = REGULATOR_CHANGE_STATUS,
+ },
+ .num_consumer_supplies = 1,
+ .consumer_supplies = &bcm43xx_vmmc_supply,
+};
+
+static struct fixed_voltage_config bcm43xx_vmmc = {
+ .supply_name = "bcm43xx-vmmc-regulator",
+ /*
+ * Announce 2.0V here to be compatible with SDIO specification. The
+ * real voltage and signaling are still 1.8V.
+ */
+ .microvolts = 2000000, /* 1.8V */
+ .gpio = -EINVAL,
+ .startup_delay = 250 * 1000, /* 250ms */
+ .enable_high = 1, /* active high */
+ .enabled_at_boot = 0, /* disabled at boot */
+ .init_data = &bcm43xx_vmmc_data,
+};
+
+static struct platform_device bcm43xx_vmmc_regulator = {
+ .name = "reg-fixed-voltage",
+ .id = PLATFORM_DEVID_AUTO,
+ .dev = {
+ .platform_data = &bcm43xx_vmmc,
+ },
+};
+
+static int __init bcm43xx_regulator_register(void)
+{
+ int ret;
+
+ bcm43xx_vmmc.gpio = get_gpio_by_name(WLAN_SFI_GPIO_ENABLE_NAME);
+ ret = platform_device_register(&bcm43xx_vmmc_regulator);
+ if (ret) {
+ pr_err("%s: vmmc regulator register failed\n", __func__);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void __init *bcm43xx_platform_data(void *info)
+{
+ int ret;
+
+ ret = bcm43xx_regulator_register();
+ if (ret)
+ return NULL;
+
+ pr_info("Using generic wifi platform data\n");
+
+ /* For now it's empty */
+ return NULL;
+}
+
+static const struct devs_id bcm43xx_clk_vmmc_dev_id __initconst = {
+ .name = "bcm43xx_clk_vmmc",
+ .type = SFI_DEV_TYPE_SD,
+ .get_platform_data = &bcm43xx_platform_data,
+};
+
+sfi_device(bcm43xx_clk_vmmc_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bma023.c b/arch/x86/platform/intel-mid/device_libs/platform_bma023.c
new file mode 100644
index 000000000..c26cf393d
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_bma023.c
@@ -0,0 +1,20 @@
+/*
+ * platform_bma023.c: bma023 platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <asm/intel-mid.h>
+
+static const struct devs_id bma023_dev_id __initconst = {
+ .name = "bma023",
+ .type = SFI_DEV_TYPE_I2C,
+ .delay = 1,
+};
+
+sfi_device(bma023_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c
new file mode 100644
index 000000000..31dce7813
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c
@@ -0,0 +1,108 @@
+/*
+ * Bluetooth platform data initialization file
+ *
+ * (C) Copyright 2017 Intel Corporation
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/gpio/machine.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include <asm/intel-mid.h>
+
+struct bt_sfi_data {
+ struct device *dev;
+ const char *name;
+ int (*setup)(struct bt_sfi_data *ddata);
+};
+
+static struct gpiod_lookup_table tng_bt_sfi_gpio_table = {
+ .dev_id = "hci_bcm",
+ .table = {
+ GPIO_LOOKUP("0000:00:0c.0", -1, "device-wakeup", GPIO_ACTIVE_HIGH),
+ GPIO_LOOKUP("0000:00:0c.0", -1, "shutdown", GPIO_ACTIVE_HIGH),
+ GPIO_LOOKUP("0000:00:0c.0", -1, "host-wakeup", GPIO_ACTIVE_HIGH),
+ { },
+ },
+};
+
+#define TNG_BT_SFI_GPIO_DEVICE_WAKEUP "bt_wakeup"
+#define TNG_BT_SFI_GPIO_SHUTDOWN "BT-reset"
+#define TNG_BT_SFI_GPIO_HOST_WAKEUP "bt_uart_enable"
+
+static int __init tng_bt_sfi_setup(struct bt_sfi_data *ddata)
+{
+ struct gpiod_lookup_table *table = &tng_bt_sfi_gpio_table;
+ struct gpiod_lookup *lookup = table->table;
+ struct pci_dev *pdev;
+
+ /* Connected to /dev/ttyS0 */
+ pdev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(4, 1));
+ if (!pdev)
+ return -ENODEV;
+
+ ddata->dev = &pdev->dev;
+ ddata->name = table->dev_id;
+
+ lookup[0].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_DEVICE_WAKEUP);
+ lookup[1].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_SHUTDOWN);
+ lookup[2].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_HOST_WAKEUP);
+
+ gpiod_add_lookup_table(table);
+ return 0;
+}
+
+static struct bt_sfi_data tng_bt_sfi_data __initdata = {
+ .setup = tng_bt_sfi_setup,
+};
+
+#define ICPU(model, ddata) \
+ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (kernel_ulong_t)&ddata }
+
+static const struct x86_cpu_id bt_sfi_cpu_ids[] = {
+ ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, tng_bt_sfi_data),
+ {}
+};
+
+static int __init bt_sfi_init(void)
+{
+ struct platform_device_info info;
+ struct platform_device *pdev;
+ const struct x86_cpu_id *id;
+ struct bt_sfi_data *ddata;
+ int ret;
+
+ id = x86_match_cpu(bt_sfi_cpu_ids);
+ if (!id)
+ return -ENODEV;
+
+ ddata = (struct bt_sfi_data *)id->driver_data;
+ if (!ddata)
+ return -ENODEV;
+
+ ret = ddata->setup(ddata);
+ if (ret)
+ return ret;
+
+ memset(&info, 0, sizeof(info));
+ info.fwnode = ddata->dev->fwnode;
+ info.parent = ddata->dev;
+ info.name = ddata->name,
+ info.id = PLATFORM_DEVID_NONE,
+
+ pdev = platform_device_register_full(&info);
+ if (IS_ERR(pdev))
+ return PTR_ERR(pdev);
+
+ dev_info(ddata->dev, "Registered Bluetooth device: %s\n", ddata->name);
+ return 0;
+}
+device_initcall(bt_sfi_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
new file mode 100644
index 000000000..c259fb6c8
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
@@ -0,0 +1,43 @@
+/*
+ * platform_emc1403.c: emc1403 platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+#include <linux/gpio.h>
+#include <linux/i2c.h>
+#include <asm/intel-mid.h>
+
+static void __init *emc1403_platform_data(void *info)
+{
+ static short intr2nd_pdata;
+ struct i2c_board_info *i2c_info = info;
+ int intr = get_gpio_by_name("thermal_int");
+ int intr2nd = get_gpio_by_name("thermal_alert");
+
+ if (intr < 0)
+ return NULL;
+ if (intr2nd < 0)
+ return NULL;
+
+ i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
+ intr2nd_pdata = intr2nd + INTEL_MID_IRQ_OFFSET;
+
+ return &intr2nd_pdata;
+}
+
+static const struct devs_id emc1403_dev_id __initconst = {
+ .name = "emc1403",
+ .type = SFI_DEV_TYPE_I2C,
+ .delay = 1,
+ .get_platform_data = &emc1403_platform_data,
+};
+
+sfi_device(emc1403_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
new file mode 100644
index 000000000..e639e3116
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
@@ -0,0 +1,85 @@
+/*
+ * platform_gpio_keys.c: gpio_keys platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/input.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <linux/gpio_keys.h>
+#include <linux/platform_device.h>
+#include <asm/intel-mid.h>
+
+#define DEVICE_NAME "gpio-keys"
+
+/*
+ * we will search these buttons in SFI GPIO table (by name)
+ * and register them dynamically. Please add all possible
+ * buttons here, we will shrink them if no GPIO found.
+ */
+static struct gpio_keys_button gpio_button[] = {
+ {KEY_POWER, -1, 1, "power_btn", EV_KEY, 0, 3000},
+ {KEY_PROG1, -1, 1, "prog_btn1", EV_KEY, 0, 20},
+ {KEY_PROG2, -1, 1, "prog_btn2", EV_KEY, 0, 20},
+ {SW_LID, -1, 1, "lid_switch", EV_SW, 0, 20},
+ {KEY_VOLUMEUP, -1, 1, "vol_up", EV_KEY, 0, 20},
+ {KEY_VOLUMEDOWN, -1, 1, "vol_down", EV_KEY, 0, 20},
+ {KEY_MUTE, -1, 1, "mute_enable", EV_KEY, 0, 20},
+ {KEY_VOLUMEUP, -1, 1, "volume_up", EV_KEY, 0, 20},
+ {KEY_VOLUMEDOWN, -1, 1, "volume_down", EV_KEY, 0, 20},
+ {KEY_CAMERA, -1, 1, "camera_full", EV_KEY, 0, 20},
+ {KEY_CAMERA_FOCUS, -1, 1, "camera_half", EV_KEY, 0, 20},
+ {SW_KEYPAD_SLIDE, -1, 1, "MagSw1", EV_SW, 0, 20},
+ {SW_KEYPAD_SLIDE, -1, 1, "MagSw2", EV_SW, 0, 20},
+};
+
+static struct gpio_keys_platform_data gpio_keys = {
+ .buttons = gpio_button,
+ .rep = 1,
+ .nbuttons = -1, /* will fill it after search */
+};
+
+static struct platform_device pb_device = {
+ .name = DEVICE_NAME,
+ .id = -1,
+ .dev = {
+ .platform_data = &gpio_keys,
+ },
+};
+
+/*
+ * Shrink the non-existent buttons, register the gpio button
+ * device if there is some
+ */
+static int __init pb_keys_init(void)
+{
+ struct gpio_keys_button *gb = gpio_button;
+ int i, good = 0;
+
+ for (i = 0; i < ARRAY_SIZE(gpio_button); i++) {
+ gb[i].gpio = get_gpio_by_name(gb[i].desc);
+ pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc,
+ gb[i].gpio);
+ if (gb[i].gpio < 0)
+ continue;
+
+ if (i != good)
+ gb[good] = gb[i];
+ good++;
+ }
+
+ if (good) {
+ gpio_keys.nbuttons = good;
+ return platform_device_register(&pb_device);
+ }
+ return 0;
+}
+late_initcall(pb_keys_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
new file mode 100644
index 000000000..a35cf912d
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
@@ -0,0 +1,41 @@
+/*
+ * platform_lis331.c: lis331 platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/i2c.h>
+#include <linux/gpio.h>
+#include <asm/intel-mid.h>
+
+static void __init *lis331dl_platform_data(void *info)
+{
+ static short intr2nd_pdata;
+ struct i2c_board_info *i2c_info = info;
+ int intr = get_gpio_by_name("accel_int");
+ int intr2nd = get_gpio_by_name("accel_2");
+
+ if (intr < 0)
+ return NULL;
+ if (intr2nd < 0)
+ return NULL;
+
+ i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
+ intr2nd_pdata = intr2nd + INTEL_MID_IRQ_OFFSET;
+
+ return &intr2nd_pdata;
+}
+
+static const struct devs_id lis331dl_dev_id __initconst = {
+ .name = "i2c_accel",
+ .type = SFI_DEV_TYPE_I2C,
+ .get_platform_data = &lis331dl_platform_data,
+};
+
+sfi_device(lis331dl_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
new file mode 100644
index 000000000..58337b2bc
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
@@ -0,0 +1,81 @@
+/*
+ * platform_max7315.c: max7315 platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+#include <linux/gpio.h>
+#include <linux/i2c.h>
+#include <linux/platform_data/pca953x.h>
+#include <asm/intel-mid.h>
+
+#define MAX7315_NUM 2
+
+static void __init *max7315_platform_data(void *info)
+{
+ static struct pca953x_platform_data max7315_pdata[MAX7315_NUM];
+ static int nr;
+ struct pca953x_platform_data *max7315 = &max7315_pdata[nr];
+ struct i2c_board_info *i2c_info = info;
+ int gpio_base, intr;
+ char base_pin_name[SFI_NAME_LEN + 1];
+ char intr_pin_name[SFI_NAME_LEN + 1];
+
+ if (nr == MAX7315_NUM) {
+ pr_err("too many max7315s, we only support %d\n",
+ MAX7315_NUM);
+ return NULL;
+ }
+ /* we have several max7315 on the board, we only need load several
+ * instances of the same pca953x driver to cover them
+ */
+ strcpy(i2c_info->type, "max7315");
+ if (nr++) {
+ snprintf(base_pin_name, sizeof(base_pin_name),
+ "max7315_%d_base", nr);
+ snprintf(intr_pin_name, sizeof(intr_pin_name),
+ "max7315_%d_int", nr);
+ } else {
+ strcpy(base_pin_name, "max7315_base");
+ strcpy(intr_pin_name, "max7315_int");
+ }
+
+ gpio_base = get_gpio_by_name(base_pin_name);
+ intr = get_gpio_by_name(intr_pin_name);
+
+ if (gpio_base < 0)
+ return NULL;
+ max7315->gpio_base = gpio_base;
+ if (intr != -1) {
+ i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
+ max7315->irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
+ } else {
+ i2c_info->irq = -1;
+ max7315->irq_base = -1;
+ }
+ return max7315;
+}
+
+static const struct devs_id max7315_dev_id __initconst = {
+ .name = "i2c_max7315",
+ .type = SFI_DEV_TYPE_I2C,
+ .delay = 1,
+ .get_platform_data = &max7315_platform_data,
+};
+
+static const struct devs_id max7315_2_dev_id __initconst = {
+ .name = "i2c_max7315_2",
+ .type = SFI_DEV_TYPE_I2C,
+ .delay = 1,
+ .get_platform_data = &max7315_platform_data,
+};
+
+sfi_device(max7315_dev_id);
+sfi_device(max7315_2_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
new file mode 100644
index 000000000..ee22864bb
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
@@ -0,0 +1,36 @@
+/*
+ * platform_mpu3050.c: mpu3050 platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/gpio.h>
+#include <linux/i2c.h>
+#include <asm/intel-mid.h>
+
+static void *mpu3050_platform_data(void *info)
+{
+ struct i2c_board_info *i2c_info = info;
+ int intr = get_gpio_by_name("mpu3050_int");
+
+ if (intr < 0)
+ return NULL;
+
+ i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
+ return NULL;
+}
+
+static const struct devs_id mpu3050_dev_id __initconst = {
+ .name = "mpu3050",
+ .type = SFI_DEV_TYPE_I2C,
+ .delay = 1,
+ .get_platform_data = &mpu3050_platform_data,
+};
+
+sfi_device(mpu3050_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c
new file mode 100644
index 000000000..4de8a664e
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c
@@ -0,0 +1,43 @@
+/*
+ * Intel Merrifield FLIS platform device initialization file
+ *
+ * Copyright (C) 2016, Intel Corporation
+ *
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/platform_device.h>
+
+#include <asm/intel-mid.h>
+
+#define FLIS_BASE_ADDR 0xff0c0000
+#define FLIS_LENGTH 0x8000
+
+static struct resource mrfld_pinctrl_mmio_resource = {
+ .start = FLIS_BASE_ADDR,
+ .end = FLIS_BASE_ADDR + FLIS_LENGTH - 1,
+ .flags = IORESOURCE_MEM,
+};
+
+static struct platform_device mrfld_pinctrl_device = {
+ .name = "pinctrl-merrifield",
+ .id = PLATFORM_DEVID_NONE,
+ .resource = &mrfld_pinctrl_mmio_resource,
+ .num_resources = 1,
+};
+
+static int __init mrfld_pinctrl_init(void)
+{
+ if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER)
+ return platform_device_register(&mrfld_pinctrl_device);
+
+ return -ENODEV;
+}
+arch_initcall(mrfld_pinctrl_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c
new file mode 100644
index 000000000..a6c3705a2
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c
@@ -0,0 +1,82 @@
+/*
+ * Intel Merrifield power button support
+ *
+ * (C) Copyright 2017 Intel Corporation
+ *
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/platform_device.h>
+#include <linux/sfi.h>
+
+#include <asm/intel-mid.h>
+#include <asm/intel_scu_ipc.h>
+
+static struct resource mrfld_power_btn_resources[] = {
+ {
+ .flags = IORESOURCE_IRQ,
+ },
+};
+
+static struct platform_device mrfld_power_btn_dev = {
+ .name = "msic_power_btn",
+ .id = PLATFORM_DEVID_NONE,
+ .num_resources = ARRAY_SIZE(mrfld_power_btn_resources),
+ .resource = mrfld_power_btn_resources,
+};
+
+static int mrfld_power_btn_scu_status_change(struct notifier_block *nb,
+ unsigned long code, void *data)
+{
+ if (code == SCU_DOWN) {
+ platform_device_unregister(&mrfld_power_btn_dev);
+ return 0;
+ }
+
+ return platform_device_register(&mrfld_power_btn_dev);
+}
+
+static struct notifier_block mrfld_power_btn_scu_notifier = {
+ .notifier_call = mrfld_power_btn_scu_status_change,
+};
+
+static int __init register_mrfld_power_btn(void)
+{
+ if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER)
+ return -ENODEV;
+
+ /*
+ * We need to be sure that the SCU IPC is ready before
+ * PMIC power button device can be registered:
+ */
+ intel_scu_notifier_add(&mrfld_power_btn_scu_notifier);
+
+ return 0;
+}
+arch_initcall(register_mrfld_power_btn);
+
+static void __init *mrfld_power_btn_platform_data(void *info)
+{
+ struct resource *res = mrfld_power_btn_resources;
+ struct sfi_device_table_entry *pentry = info;
+
+ res->start = res->end = pentry->irq;
+ return NULL;
+}
+
+static const struct devs_id mrfld_power_btn_dev_id __initconst = {
+ .name = "bcove_power_btn",
+ .type = SFI_DEV_TYPE_IPC,
+ .delay = 1,
+ .msic = 1,
+ .get_platform_data = &mrfld_power_btn_platform_data,
+};
+
+sfi_device(mrfld_power_btn_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c
new file mode 100644
index 000000000..3135416df
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c
@@ -0,0 +1,48 @@
+/*
+ * Intel Merrifield legacy RTC initialization file
+ *
+ * (C) Copyright 2017 Intel Corporation
+ *
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+
+#include <asm/hw_irq.h>
+#include <asm/intel-mid.h>
+#include <asm/io_apic.h>
+#include <asm/time.h>
+#include <asm/x86_init.h>
+
+static int __init mrfld_legacy_rtc_alloc_irq(void)
+{
+ struct irq_alloc_info info;
+ int ret;
+
+ if (!x86_platform.legacy.rtc)
+ return -ENODEV;
+
+ ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, 0);
+ ret = mp_map_gsi_to_irq(RTC_IRQ, IOAPIC_MAP_ALLOC, &info);
+ if (ret < 0) {
+ pr_info("Failed to allocate RTC interrupt. Disabling RTC\n");
+ x86_platform.legacy.rtc = 0;
+ return ret;
+ }
+
+ return 0;
+}
+
+static int __init mrfld_legacy_rtc_init(void)
+{
+ if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER)
+ return -ENODEV;
+
+ return mrfld_legacy_rtc_alloc_irq();
+}
+arch_initcall(mrfld_legacy_rtc_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c
new file mode 100644
index 000000000..00c4a034a
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c
@@ -0,0 +1,47 @@
+/*
+ * SDHCI platform data initilisation file
+ *
+ * (C) Copyright 2016 Intel Corporation
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+#include <linux/pci.h>
+
+#include <linux/mmc/sdhci-pci-data.h>
+
+#include <asm/intel-mid.h>
+
+#define INTEL_MRFLD_SD 2
+#define INTEL_MRFLD_SD_CD_GPIO 77
+
+static struct sdhci_pci_data mrfld_sdhci_pci_data = {
+ .rst_n_gpio = -EINVAL,
+ .cd_gpio = INTEL_MRFLD_SD_CD_GPIO,
+};
+
+static struct sdhci_pci_data *
+mrfld_sdhci_pci_get_data(struct pci_dev *pdev, int slotno)
+{
+ unsigned int func = PCI_FUNC(pdev->devfn);
+
+ if (func == INTEL_MRFLD_SD)
+ return &mrfld_sdhci_pci_data;
+
+ return NULL;
+}
+
+static int __init mrfld_sd_init(void)
+{
+ if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER)
+ return -ENODEV;
+
+ sdhci_pci_get_data = mrfld_sdhci_pci_get_data;
+ return 0;
+}
+arch_initcall(mrfld_sd_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c
new file mode 100644
index 000000000..27186ad65
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c
@@ -0,0 +1,54 @@
+/*
+ * spidev platform data initilization file
+ *
+ * (C) Copyright 2014, 2016 Intel Corporation
+ * Authors: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ * Dan O'Donovan <dan@emutex.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/sfi.h>
+#include <linux/spi/pxa2xx_spi.h>
+#include <linux/spi/spi.h>
+
+#include <asm/intel-mid.h>
+
+#define MRFLD_SPI_DEFAULT_DMA_BURST 8
+#define MRFLD_SPI_DEFAULT_TIMEOUT 500
+
+/* GPIO pin for spidev chipselect */
+#define MRFLD_SPIDEV_GPIO_CS 111
+
+static struct pxa2xx_spi_chip spidev_spi_chip = {
+ .dma_burst_size = MRFLD_SPI_DEFAULT_DMA_BURST,
+ .timeout = MRFLD_SPI_DEFAULT_TIMEOUT,
+ .gpio_cs = MRFLD_SPIDEV_GPIO_CS,
+};
+
+static void __init *spidev_platform_data(void *info)
+{
+ struct spi_board_info *spi_info = info;
+
+ if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER)
+ return ERR_PTR(-ENODEV);
+
+ spi_info->mode = SPI_MODE_0;
+ spi_info->controller_data = &spidev_spi_chip;
+
+ return NULL;
+}
+
+static const struct devs_id spidev_dev_id __initconst = {
+ .name = "spidev",
+ .type = SFI_DEV_TYPE_SPI,
+ .delay = 0,
+ .get_platform_data = &spidev_platform_data,
+};
+
+sfi_device(spidev_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c
new file mode 100644
index 000000000..2acd6be13
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c
@@ -0,0 +1,86 @@
+/*
+ * Intel Merrifield watchdog platform device library file
+ *
+ * (C) Copyright 2014 Intel Corporation
+ * Author: David Cohen <david.a.cohen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/platform_data/intel-mid_wdt.h>
+
+#include <asm/intel-mid.h>
+#include <asm/intel_scu_ipc.h>
+#include <asm/io_apic.h>
+#include <asm/hw_irq.h>
+
+#define TANGIER_EXT_TIMER0_MSI 12
+
+static struct platform_device wdt_dev = {
+ .name = "intel_mid_wdt",
+ .id = -1,
+};
+
+static int tangier_probe(struct platform_device *pdev)
+{
+ struct irq_alloc_info info;
+ struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data;
+ int gsi = TANGIER_EXT_TIMER0_MSI;
+ int irq;
+
+ if (!pdata)
+ return -EINVAL;
+
+ /* IOAPIC builds identity mapping between GSI and IRQ on MID */
+ ioapic_set_alloc_attr(&info, cpu_to_node(0), 1, 0);
+ irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info);
+ if (irq < 0) {
+ dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n", gsi);
+ return irq;
+ }
+
+ pdata->irq = irq;
+ return 0;
+}
+
+static struct intel_mid_wdt_pdata tangier_pdata = {
+ .probe = tangier_probe,
+};
+
+static int wdt_scu_status_change(struct notifier_block *nb,
+ unsigned long code, void *data)
+{
+ if (code == SCU_DOWN) {
+ platform_device_unregister(&wdt_dev);
+ return 0;
+ }
+
+ return platform_device_register(&wdt_dev);
+}
+
+static struct notifier_block wdt_scu_notifier = {
+ .notifier_call = wdt_scu_status_change,
+};
+
+static int __init register_mid_wdt(void)
+{
+ if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER)
+ return -ENODEV;
+
+ wdt_dev.dev.platform_data = &tangier_pdata;
+
+ /*
+ * We need to be sure that the SCU IPC is ready before watchdog device
+ * can be registered:
+ */
+ intel_scu_notifier_add(&wdt_scu_notifier);
+
+ return 0;
+}
+arch_initcall(register_mid_wdt);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic.c b/arch/x86/platform/intel-mid/device_libs/platform_msic.c
new file mode 100644
index 000000000..e421106c1
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic.c
@@ -0,0 +1,87 @@
+/*
+ * platform_msic.c: MSIC platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/scatterlist.h>
+#include <linux/init.h>
+#include <linux/sfi.h>
+#include <linux/mfd/intel_msic.h>
+#include <asm/intel_scu_ipc.h>
+#include <asm/intel-mid.h>
+#include "platform_msic.h"
+
+struct intel_msic_platform_data msic_pdata;
+
+static struct resource msic_resources[] = {
+ {
+ .start = INTEL_MSIC_IRQ_PHYS_BASE,
+ .end = INTEL_MSIC_IRQ_PHYS_BASE + 64 - 1,
+ .flags = IORESOURCE_MEM,
+ },
+};
+
+static struct platform_device msic_device = {
+ .name = "intel_msic",
+ .id = -1,
+ .dev = {
+ .platform_data = &msic_pdata,
+ },
+ .num_resources = ARRAY_SIZE(msic_resources),
+ .resource = msic_resources,
+};
+
+static int msic_scu_status_change(struct notifier_block *nb,
+ unsigned long code, void *data)
+{
+ if (code == SCU_DOWN) {
+ platform_device_unregister(&msic_device);
+ return 0;
+ }
+
+ return platform_device_register(&msic_device);
+}
+
+static int __init msic_init(void)
+{
+ static struct notifier_block msic_scu_notifier = {
+ .notifier_call = msic_scu_status_change,
+ };
+
+ /*
+ * We need to be sure that the SCU IPC is ready before MSIC device
+ * can be registered.
+ */
+ if (intel_mid_has_msic())
+ intel_scu_notifier_add(&msic_scu_notifier);
+
+ return 0;
+}
+arch_initcall(msic_init);
+
+/*
+ * msic_generic_platform_data - sets generic platform data for the block
+ * @info: pointer to the SFI device table entry for this block
+ * @block: MSIC block
+ *
+ * Function sets IRQ number from the SFI table entry for given device to
+ * the MSIC platform data.
+ */
+void *msic_generic_platform_data(void *info, enum intel_msic_block block)
+{
+ struct sfi_device_table_entry *entry = info;
+
+ BUG_ON(block < 0 || block >= INTEL_MSIC_BLOCK_LAST);
+ msic_pdata.irq[block] = entry->irq;
+
+ return NULL;
+}
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic.h b/arch/x86/platform/intel-mid/device_libs/platform_msic.h
new file mode 100644
index 000000000..b7be1d041
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic.h
@@ -0,0 +1,19 @@
+/*
+ * platform_msic.h: MSIC platform data header file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#ifndef _PLATFORM_MSIC_H_
+#define _PLATFORM_MSIC_H_
+
+extern struct intel_msic_platform_data msic_pdata;
+
+void *msic_generic_platform_data(void *info, enum intel_msic_block block);
+
+#endif
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c
new file mode 100644
index 000000000..d4dc744dd
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c
@@ -0,0 +1,46 @@
+/*
+ * platform_msic_audio.c: MSIC audio platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/scatterlist.h>
+#include <linux/init.h>
+#include <linux/sfi.h>
+#include <linux/platform_device.h>
+#include <linux/mfd/intel_msic.h>
+#include <asm/intel-mid.h>
+
+#include "platform_msic.h"
+
+static void *msic_audio_platform_data(void *info)
+{
+ struct platform_device *pdev;
+
+ pdev = platform_device_register_simple("sst-platform", -1, NULL, 0);
+
+ if (IS_ERR(pdev)) {
+ pr_err("failed to create audio platform device\n");
+ return NULL;
+ }
+
+ return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_AUDIO);
+}
+
+static const struct devs_id msic_audio_dev_id __initconst = {
+ .name = "msic_audio",
+ .type = SFI_DEV_TYPE_IPC,
+ .delay = 1,
+ .msic = 1,
+ .get_platform_data = &msic_audio_platform_data,
+};
+
+sfi_device(msic_audio_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c
new file mode 100644
index 000000000..5c3e99196
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c
@@ -0,0 +1,36 @@
+/*
+ * platform_msic_battery.c: MSIC battery platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/scatterlist.h>
+#include <linux/init.h>
+#include <linux/sfi.h>
+#include <linux/mfd/intel_msic.h>
+#include <asm/intel-mid.h>
+
+#include "platform_msic.h"
+
+static void __init *msic_battery_platform_data(void *info)
+{
+ return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_BATTERY);
+}
+
+static const struct devs_id msic_battery_dev_id __initconst = {
+ .name = "msic_battery",
+ .type = SFI_DEV_TYPE_IPC,
+ .delay = 1,
+ .msic = 1,
+ .get_platform_data = &msic_battery_platform_data,
+};
+
+sfi_device(msic_battery_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c
new file mode 100644
index 000000000..9fdb88d46
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c
@@ -0,0 +1,47 @@
+/*
+ * platform_msic_gpio.c: MSIC GPIO platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/scatterlist.h>
+#include <linux/sfi.h>
+#include <linux/init.h>
+#include <linux/gpio.h>
+#include <linux/mfd/intel_msic.h>
+#include <asm/intel-mid.h>
+
+#include "platform_msic.h"
+
+static void __init *msic_gpio_platform_data(void *info)
+{
+ static struct intel_msic_gpio_pdata msic_gpio_pdata;
+
+ int gpio = get_gpio_by_name("msic_gpio_base");
+
+ if (gpio < 0)
+ return NULL;
+
+ msic_gpio_pdata.gpio_base = gpio;
+ msic_pdata.gpio = &msic_gpio_pdata;
+
+ return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_GPIO);
+}
+
+static const struct devs_id msic_gpio_dev_id __initconst = {
+ .name = "msic_gpio",
+ .type = SFI_DEV_TYPE_IPC,
+ .delay = 1,
+ .msic = 1,
+ .get_platform_data = &msic_gpio_platform_data,
+};
+
+sfi_device(msic_gpio_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c
new file mode 100644
index 000000000..7ae37cdbf
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c
@@ -0,0 +1,48 @@
+/*
+ * platform_msic_ocd.c: MSIC OCD platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/scatterlist.h>
+#include <linux/sfi.h>
+#include <linux/init.h>
+#include <linux/gpio.h>
+#include <linux/mfd/intel_msic.h>
+#include <asm/intel-mid.h>
+
+#include "platform_msic.h"
+
+static void __init *msic_ocd_platform_data(void *info)
+{
+ static struct intel_msic_ocd_pdata msic_ocd_pdata;
+ int gpio;
+
+ gpio = get_gpio_by_name("ocd_gpio");
+
+ if (gpio < 0)
+ return NULL;
+
+ msic_ocd_pdata.gpio = gpio;
+ msic_pdata.ocd = &msic_ocd_pdata;
+
+ return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_OCD);
+}
+
+static const struct devs_id msic_ocd_dev_id __initconst = {
+ .name = "msic_ocd",
+ .type = SFI_DEV_TYPE_IPC,
+ .delay = 1,
+ .msic = 1,
+ .get_platform_data = &msic_ocd_platform_data,
+};
+
+sfi_device(msic_ocd_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c
new file mode 100644
index 000000000..96809b98c
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c
@@ -0,0 +1,35 @@
+/*
+ * platform_msic_power_btn.c: MSIC power btn platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/scatterlist.h>
+#include <linux/sfi.h>
+#include <linux/init.h>
+#include <linux/mfd/intel_msic.h>
+#include <asm/intel-mid.h>
+
+#include "platform_msic.h"
+
+static void __init *msic_power_btn_platform_data(void *info)
+{
+ return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_POWER_BTN);
+}
+
+static const struct devs_id msic_power_btn_dev_id __initconst = {
+ .name = "msic_power_btn",
+ .type = SFI_DEV_TYPE_IPC,
+ .delay = 1,
+ .msic = 1,
+ .get_platform_data = &msic_power_btn_platform_data,
+};
+
+sfi_device(msic_power_btn_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c
new file mode 100644
index 000000000..3e4167d24
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c
@@ -0,0 +1,36 @@
+/*
+ * platform_msic_thermal.c: msic_thermal platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/input.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <linux/platform_device.h>
+#include <linux/mfd/intel_msic.h>
+#include <asm/intel-mid.h>
+
+#include "platform_msic.h"
+
+static void __init *msic_thermal_platform_data(void *info)
+{
+ return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_THERMAL);
+}
+
+static const struct devs_id msic_thermal_dev_id __initconst = {
+ .name = "msic_thermal",
+ .type = SFI_DEV_TYPE_IPC,
+ .delay = 1,
+ .msic = 1,
+ .get_platform_data = &msic_thermal_platform_data,
+};
+
+sfi_device(msic_thermal_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c b/arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c
new file mode 100644
index 000000000..429a94192
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c
@@ -0,0 +1,99 @@
+/*
+ * PCAL9555a platform data initilization file
+ *
+ * Copyright (C) 2016, Intel Corporation
+ *
+ * Authors: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ * Dan O'Donovan <dan@emutex.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/gpio.h>
+#include <linux/init.h>
+#include <linux/i2c.h>
+#include <linux/platform_data/pca953x.h>
+#include <linux/sfi.h>
+
+#include <asm/intel-mid.h>
+
+#define PCAL9555A_NUM 4
+
+static struct pca953x_platform_data pcal9555a_pdata[PCAL9555A_NUM];
+static int nr;
+
+static void __init *pcal9555a_platform_data(void *info)
+{
+ struct i2c_board_info *i2c_info = info;
+ char *type = i2c_info->type;
+ struct pca953x_platform_data *pcal9555a;
+ char base_pin_name[SFI_NAME_LEN + 1];
+ char intr_pin_name[SFI_NAME_LEN + 1];
+ int gpio_base, intr;
+
+ snprintf(base_pin_name, sizeof(base_pin_name), "%s_base", type);
+ snprintf(intr_pin_name, sizeof(intr_pin_name), "%s_int", type);
+
+ gpio_base = get_gpio_by_name(base_pin_name);
+ intr = get_gpio_by_name(intr_pin_name);
+
+ /* Check if the SFI record valid */
+ if (gpio_base == -1)
+ return NULL;
+
+ if (nr >= PCAL9555A_NUM) {
+ pr_err("%s: Too many instances, only %d supported\n", __func__,
+ PCAL9555A_NUM);
+ return NULL;
+ }
+
+ pcal9555a = &pcal9555a_pdata[nr++];
+ pcal9555a->gpio_base = gpio_base;
+
+ if (intr >= 0) {
+ i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
+ pcal9555a->irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
+ } else {
+ i2c_info->irq = -1;
+ pcal9555a->irq_base = -1;
+ }
+
+ strcpy(type, "pcal9555a");
+ return pcal9555a;
+}
+
+static const struct devs_id pcal9555a_1_dev_id __initconst = {
+ .name = "pcal9555a-1",
+ .type = SFI_DEV_TYPE_I2C,
+ .delay = 1,
+ .get_platform_data = &pcal9555a_platform_data,
+};
+
+static const struct devs_id pcal9555a_2_dev_id __initconst = {
+ .name = "pcal9555a-2",
+ .type = SFI_DEV_TYPE_I2C,
+ .delay = 1,
+ .get_platform_data = &pcal9555a_platform_data,
+};
+
+static const struct devs_id pcal9555a_3_dev_id __initconst = {
+ .name = "pcal9555a-3",
+ .type = SFI_DEV_TYPE_I2C,
+ .delay = 1,
+ .get_platform_data = &pcal9555a_platform_data,
+};
+
+static const struct devs_id pcal9555a_4_dev_id __initconst = {
+ .name = "pcal9555a-4",
+ .type = SFI_DEV_TYPE_I2C,
+ .delay = 1,
+ .get_platform_data = &pcal9555a_platform_data,
+};
+
+sfi_device(pcal9555a_1_dev_id);
+sfi_device(pcal9555a_2_dev_id);
+sfi_device(pcal9555a_3_dev_id);
+sfi_device(pcal9555a_4_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
new file mode 100644
index 000000000..290537655
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
@@ -0,0 +1,36 @@
+/*
+ * platform_tc35876x.c: tc35876x platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/gpio.h>
+#include <linux/platform_data/tc35876x.h>
+#include <asm/intel-mid.h>
+
+/*tc35876x DSI_LVDS bridge chip and panel platform data*/
+static void *tc35876x_platform_data(void *data)
+{
+ static struct tc35876x_platform_data pdata;
+
+ /* gpio pins set to -1 will not be used by the driver */
+ pdata.gpio_bridge_reset = get_gpio_by_name("LCMB_RXEN");
+ pdata.gpio_panel_bl_en = get_gpio_by_name("6S6P_BL_EN");
+ pdata.gpio_panel_vadd = get_gpio_by_name("EN_VREG_LCD_V3P3");
+
+ return &pdata;
+}
+
+static const struct devs_id tc35876x_dev_id __initconst = {
+ .name = "i2c_disp_brig",
+ .type = SFI_DEV_TYPE_I2C,
+ .get_platform_data = &tc35876x_platform_data,
+};
+
+sfi_device(tc35876x_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
new file mode 100644
index 000000000..4f41372ce
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
@@ -0,0 +1,57 @@
+/*
+ * platform_tca6416.c: tca6416 platform data initialization file
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/platform_data/pca953x.h>
+#include <linux/i2c.h>
+#include <linux/gpio.h>
+#include <asm/intel-mid.h>
+
+#define TCA6416_NAME "tca6416"
+#define TCA6416_BASE "tca6416_base"
+#define TCA6416_INTR "tca6416_int"
+
+static void *tca6416_platform_data(void *info)
+{
+ static struct pca953x_platform_data tca6416;
+ struct i2c_board_info *i2c_info = info;
+ int gpio_base, intr;
+ char base_pin_name[SFI_NAME_LEN + 1];
+ char intr_pin_name[SFI_NAME_LEN + 1];
+
+ strcpy(i2c_info->type, TCA6416_NAME);
+ strcpy(base_pin_name, TCA6416_BASE);
+ strcpy(intr_pin_name, TCA6416_INTR);
+
+ gpio_base = get_gpio_by_name(base_pin_name);
+ intr = get_gpio_by_name(intr_pin_name);
+
+ if (gpio_base < 0)
+ return NULL;
+ tca6416.gpio_base = gpio_base;
+ if (intr >= 0) {
+ i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
+ tca6416.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
+ } else {
+ i2c_info->irq = -1;
+ tca6416.irq_base = -1;
+ }
+ return &tca6416;
+}
+
+static const struct devs_id tca6416_dev_id __initconst = {
+ .name = "tca6416",
+ .type = SFI_DEV_TYPE_I2C,
+ .delay = 1,
+ .get_platform_data = &tca6416_platform_data,
+};
+
+sfi_device(tca6416_dev_id);
diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
new file mode 100644
index 000000000..56f66eafb
--- /dev/null
+++ b/arch/x86/platform/intel-mid/intel-mid.c
@@ -0,0 +1,216 @@
+/*
+ * intel-mid.c: Intel MID platform setup code
+ *
+ * (C) Copyright 2008, 2012 Intel Corporation
+ * Author: Jacob Pan (jacob.jun.pan@intel.com)
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#define pr_fmt(fmt) "intel_mid: " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/regulator/machine.h>
+#include <linux/scatterlist.h>
+#include <linux/sfi.h>
+#include <linux/irq.h>
+#include <linux/export.h>
+#include <linux/notifier.h>
+
+#include <asm/setup.h>
+#include <asm/mpspec_def.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/intel-mid.h>
+#include <asm/intel_mid_vrtc.h>
+#include <asm/io.h>
+#include <asm/i8259.h>
+#include <asm/intel_scu_ipc.h>
+#include <asm/apb_timer.h>
+#include <asm/reboot.h>
+
+/*
+ * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
+ * cmdline option x86_intel_mid_timer can be used to override the configuration
+ * to prefer one or the other.
+ * at runtime, there are basically three timer configurations:
+ * 1. per cpu apbt clock only
+ * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only
+ * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast.
+ *
+ * by default (without cmdline option), platform code first detects cpu type
+ * to see if we are on lincroft or penwell, then set up both lapic or apbt
+ * clocks accordingly.
+ * i.e. by default, medfield uses configuration #2, moorestown uses #1.
+ * config #3 is supported but not recommended on medfield.
+ *
+ * rating and feature summary:
+ * lapic (with C3STOP) --------- 100
+ * apbt (always-on) ------------ 110
+ * lapic (always-on,ARAT) ------ 150
+ */
+
+enum intel_mid_timer_options intel_mid_timer_options;
+
+enum intel_mid_cpu_type __intel_mid_cpu_chip;
+EXPORT_SYMBOL_GPL(__intel_mid_cpu_chip);
+
+static void intel_mid_power_off(void)
+{
+ /* Shut down South Complex via PWRMU */
+ intel_mid_pwr_power_off();
+
+ /* Only for Tangier, the rest will ignore this command */
+ intel_scu_ipc_simple_command(IPCMSG_COLD_OFF, 1);
+};
+
+static void intel_mid_reboot(void)
+{
+ intel_scu_ipc_simple_command(IPCMSG_COLD_RESET, 0);
+}
+
+static void __init intel_mid_setup_bp_timer(void)
+{
+ apbt_time_init();
+ setup_boot_APIC_clock();
+}
+
+static void __init intel_mid_time_init(void)
+{
+ sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
+
+ switch (intel_mid_timer_options) {
+ case INTEL_MID_TIMER_APBT_ONLY:
+ break;
+ case INTEL_MID_TIMER_LAPIC_APBT:
+ /* Use apbt and local apic */
+ x86_init.timers.setup_percpu_clockev = intel_mid_setup_bp_timer;
+ x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
+ return;
+ default:
+ if (!boot_cpu_has(X86_FEATURE_ARAT))
+ break;
+ /* Lapic only, no apbt */
+ x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
+ x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
+ return;
+ }
+
+ x86_init.timers.setup_percpu_clockev = apbt_time_init;
+}
+
+static void intel_mid_arch_setup(void)
+{
+ if (boot_cpu_data.x86 != 6) {
+ pr_err("Unknown Intel MID CPU (%d:%d), default to Penwell\n",
+ boot_cpu_data.x86, boot_cpu_data.x86_model);
+ __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL;
+ goto out;
+ }
+
+ switch (boot_cpu_data.x86_model) {
+ case 0x35:
+ __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_CLOVERVIEW;
+ break;
+ case 0x3C:
+ case 0x4A:
+ __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_TANGIER;
+ x86_platform.legacy.rtc = 1;
+ break;
+ case 0x27:
+ default:
+ __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL;
+ break;
+ }
+
+out:
+ /*
+ * Intel MID platforms are using explicitly defined regulators.
+ *
+ * Let the regulator core know that we do not have any additional
+ * regulators left. This lets it substitute unprovided regulators with
+ * dummy ones:
+ */
+ regulator_has_full_constraints();
+}
+
+/*
+ * Moorestown does not have external NMI source nor port 0x61 to report
+ * NMI status. The possible NMI sources are from pmu as a result of NMI
+ * watchdog or lock debug. Reading io port 0x61 results in 0xff which
+ * misled NMI handler.
+ */
+static unsigned char intel_mid_get_nmi_reason(void)
+{
+ return 0;
+}
+
+/*
+ * Moorestown specific x86_init function overrides and early setup
+ * calls.
+ */
+void __init x86_intel_mid_early_setup(void)
+{
+ x86_init.resources.probe_roms = x86_init_noop;
+ x86_init.resources.reserve_resources = x86_init_noop;
+
+ x86_init.timers.timer_init = intel_mid_time_init;
+ x86_init.timers.setup_percpu_clockev = x86_init_noop;
+ x86_init.timers.wallclock_init = intel_mid_rtc_init;
+
+ x86_init.irqs.pre_vector_init = x86_init_noop;
+
+ x86_init.oem.arch_setup = intel_mid_arch_setup;
+
+ x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock;
+
+ x86_platform.get_nmi_reason = intel_mid_get_nmi_reason;
+
+ x86_init.pci.arch_init = intel_mid_pci_init;
+ x86_init.pci.fixup_irqs = x86_init_noop;
+
+ legacy_pic = &null_legacy_pic;
+
+ /*
+ * Do nothing for now as everything needed done in
+ * x86_intel_mid_early_setup() below.
+ */
+ x86_init.acpi.reduced_hw_early_init = x86_init_noop;
+
+ pm_power_off = intel_mid_power_off;
+ machine_ops.emergency_restart = intel_mid_reboot;
+
+ /* Avoid searching for BIOS MP tables */
+ x86_init.mpparse.find_smp_config = x86_init_noop;
+ x86_init.mpparse.get_smp_config = x86_init_uint_noop;
+ set_bit(MP_BUS_ISA, mp_bus_not_pci);
+}
+
+/*
+ * if user does not want to use per CPU apb timer, just give it a lower rating
+ * than local apic timer and skip the late per cpu timer init.
+ */
+static inline int __init setup_x86_intel_mid_timer(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (strcmp("apbt_only", arg) == 0)
+ intel_mid_timer_options = INTEL_MID_TIMER_APBT_ONLY;
+ else if (strcmp("lapic_and_apbt", arg) == 0)
+ intel_mid_timer_options = INTEL_MID_TIMER_LAPIC_APBT;
+ else {
+ pr_warn("X86 INTEL_MID timer option %s not recognised use x86_intel_mid_timer=apbt_only or lapic_and_apbt\n",
+ arg);
+ return -EINVAL;
+ }
+ return 0;
+}
+__setup("x86_intel_mid_timer=", setup_x86_intel_mid_timer);
diff --git a/arch/x86/platform/intel-mid/intel_mid_vrtc.c b/arch/x86/platform/intel-mid/intel_mid_vrtc.c
new file mode 100644
index 000000000..a52914aa3
--- /dev/null
+++ b/arch/x86/platform/intel-mid/intel_mid_vrtc.c
@@ -0,0 +1,177 @@
+/*
+ * intel_mid_vrtc.c: Driver for virtual RTC device on Intel MID platform
+ *
+ * (C) Copyright 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Note:
+ * VRTC is emulated by system controller firmware, the real HW
+ * RTC is located in the PMIC device. SCU FW shadows PMIC RTC
+ * in a memory mapped IO space that is visible to the host IA
+ * processor.
+ *
+ * This driver is based on RTC CMOS driver.
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/sfi.h>
+#include <linux/platform_device.h>
+#include <linux/mc146818rtc.h>
+
+#include <asm/intel-mid.h>
+#include <asm/intel_mid_vrtc.h>
+#include <asm/time.h>
+#include <asm/fixmap.h>
+
+static unsigned char __iomem *vrtc_virt_base;
+
+unsigned char vrtc_cmos_read(unsigned char reg)
+{
+ unsigned char retval;
+
+ /* vRTC's registers range from 0x0 to 0xD */
+ if (reg > 0xd || !vrtc_virt_base)
+ return 0xff;
+
+ lock_cmos_prefix(reg);
+ retval = __raw_readb(vrtc_virt_base + (reg << 2));
+ lock_cmos_suffix(reg);
+ return retval;
+}
+EXPORT_SYMBOL_GPL(vrtc_cmos_read);
+
+void vrtc_cmos_write(unsigned char val, unsigned char reg)
+{
+ if (reg > 0xd || !vrtc_virt_base)
+ return;
+
+ lock_cmos_prefix(reg);
+ __raw_writeb(val, vrtc_virt_base + (reg << 2));
+ lock_cmos_suffix(reg);
+}
+EXPORT_SYMBOL_GPL(vrtc_cmos_write);
+
+void vrtc_get_time(struct timespec64 *now)
+{
+ u8 sec, min, hour, mday, mon;
+ unsigned long flags;
+ u32 year;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+
+ while ((vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP))
+ cpu_relax();
+
+ sec = vrtc_cmos_read(RTC_SECONDS);
+ min = vrtc_cmos_read(RTC_MINUTES);
+ hour = vrtc_cmos_read(RTC_HOURS);
+ mday = vrtc_cmos_read(RTC_DAY_OF_MONTH);
+ mon = vrtc_cmos_read(RTC_MONTH);
+ year = vrtc_cmos_read(RTC_YEAR);
+
+ spin_unlock_irqrestore(&rtc_lock, flags);
+
+ /* vRTC YEAR reg contains the offset to 1972 */
+ year += 1972;
+
+ pr_info("vRTC: sec: %d min: %d hour: %d day: %d "
+ "mon: %d year: %d\n", sec, min, hour, mday, mon, year);
+
+ now->tv_sec = mktime64(year, mon, mday, hour, min, sec);
+ now->tv_nsec = 0;
+}
+
+int vrtc_set_mmss(const struct timespec64 *now)
+{
+ unsigned long flags;
+ struct rtc_time tm;
+ int year;
+ int retval = 0;
+
+ rtc_time64_to_tm(now->tv_sec, &tm);
+ if (!rtc_valid_tm(&tm) && tm.tm_year >= 72) {
+ /*
+ * tm.year is the number of years since 1900, and the
+ * vrtc need the years since 1972.
+ */
+ year = tm.tm_year - 72;
+ spin_lock_irqsave(&rtc_lock, flags);
+ vrtc_cmos_write(year, RTC_YEAR);
+ vrtc_cmos_write(tm.tm_mon, RTC_MONTH);
+ vrtc_cmos_write(tm.tm_mday, RTC_DAY_OF_MONTH);
+ vrtc_cmos_write(tm.tm_hour, RTC_HOURS);
+ vrtc_cmos_write(tm.tm_min, RTC_MINUTES);
+ vrtc_cmos_write(tm.tm_sec, RTC_SECONDS);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ } else {
+ pr_err("%s: Invalid vRTC value: write of %llx to vRTC failed\n",
+ __func__, (s64)now->tv_sec);
+ retval = -EINVAL;
+ }
+ return retval;
+}
+
+void __init intel_mid_rtc_init(void)
+{
+ unsigned long vrtc_paddr;
+
+ sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
+
+ vrtc_paddr = sfi_mrtc_array[0].phys_addr;
+ if (!sfi_mrtc_num || !vrtc_paddr)
+ return;
+
+ vrtc_virt_base = (void __iomem *)set_fixmap_offset_nocache(FIX_LNW_VRTC,
+ vrtc_paddr);
+ x86_platform.get_wallclock = vrtc_get_time;
+ x86_platform.set_wallclock = vrtc_set_mmss;
+}
+
+/*
+ * The Moorestown platform has a memory mapped virtual RTC device that emulates
+ * the programming interface of the RTC.
+ */
+
+static struct resource vrtc_resources[] = {
+ [0] = {
+ .flags = IORESOURCE_MEM,
+ },
+ [1] = {
+ .flags = IORESOURCE_IRQ,
+ }
+};
+
+static struct platform_device vrtc_device = {
+ .name = "rtc_mrst",
+ .id = -1,
+ .resource = vrtc_resources,
+ .num_resources = ARRAY_SIZE(vrtc_resources),
+};
+
+/* Register the RTC device if appropriate */
+static int __init intel_mid_device_create(void)
+{
+ /* No Moorestown, no device */
+ if (!intel_mid_identify_cpu())
+ return -ENODEV;
+ /* No timer, no device */
+ if (!sfi_mrtc_num)
+ return -ENODEV;
+
+ /* iomem resource */
+ vrtc_resources[0].start = sfi_mrtc_array[0].phys_addr;
+ vrtc_resources[0].end = sfi_mrtc_array[0].phys_addr +
+ MRST_VRTC_MAP_SZ;
+ /* irq resource */
+ vrtc_resources[1].start = sfi_mrtc_array[0].irq;
+ vrtc_resources[1].end = sfi_mrtc_array[0].irq;
+
+ return platform_device_register(&vrtc_device);
+}
+device_initcall(intel_mid_device_create);
diff --git a/arch/x86/platform/intel-mid/pwr.c b/arch/x86/platform/intel-mid/pwr.c
new file mode 100644
index 000000000..49ec5b94c
--- /dev/null
+++ b/arch/x86/platform/intel-mid/pwr.c
@@ -0,0 +1,488 @@
+/*
+ * Intel MID Power Management Unit (PWRMU) device driver
+ *
+ * Copyright (C) 2016, Intel Corporation
+ *
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * Intel MID Power Management Unit device driver handles the South Complex PCI
+ * devices such as GPDMA, SPI, I2C, PWM, and so on. By default PCI core
+ * modifies bits in PMCSR register in the PCI configuration space. This is not
+ * enough on some SoCs like Intel Tangier. In such case PCI core sets a new
+ * power state of the device in question through a PM hook registered in struct
+ * pci_platform_pm_ops (see drivers/pci/pci-mid.c).
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+
+#include <asm/intel-mid.h>
+
+/* Registers */
+#define PM_STS 0x00
+#define PM_CMD 0x04
+#define PM_ICS 0x08
+#define PM_WKC(x) (0x10 + (x) * 4)
+#define PM_WKS(x) (0x18 + (x) * 4)
+#define PM_SSC(x) (0x20 + (x) * 4)
+#define PM_SSS(x) (0x30 + (x) * 4)
+
+/* Bits in PM_STS */
+#define PM_STS_BUSY (1 << 8)
+
+/* Bits in PM_CMD */
+#define PM_CMD_CMD(x) ((x) << 0)
+#define PM_CMD_IOC (1 << 8)
+#define PM_CMD_CM_NOP (0 << 9)
+#define PM_CMD_CM_IMMEDIATE (1 << 9)
+#define PM_CMD_CM_DELAY (2 << 9)
+#define PM_CMD_CM_TRIGGER (3 << 9)
+
+/* System states */
+#define PM_CMD_SYS_STATE_S5 (5 << 16)
+
+/* Trigger variants */
+#define PM_CMD_CFG_TRIGGER_NC (3 << 19)
+
+/* Message to wait for TRIGGER_NC case */
+#define TRIGGER_NC_MSG_2 (2 << 22)
+
+/* List of commands */
+#define CMD_SET_CFG 0x01
+
+/* Bits in PM_ICS */
+#define PM_ICS_INT_STATUS(x) ((x) & 0xff)
+#define PM_ICS_IE (1 << 8)
+#define PM_ICS_IP (1 << 9)
+#define PM_ICS_SW_INT_STS (1 << 10)
+
+/* List of interrupts */
+#define INT_INVALID 0
+#define INT_CMD_COMPLETE 1
+#define INT_CMD_ERR 2
+#define INT_WAKE_EVENT 3
+#define INT_LSS_POWER_ERR 4
+#define INT_S0iX_MSG_ERR 5
+#define INT_NO_C6 6
+#define INT_TRIGGER_ERR 7
+#define INT_INACTIVITY 8
+
+/* South Complex devices */
+#define LSS_MAX_SHARED_DEVS 4
+#define LSS_MAX_DEVS 64
+
+#define LSS_WS_BITS 1 /* wake state width */
+#define LSS_PWS_BITS 2 /* power state width */
+
+/* Supported device IDs */
+#define PCI_DEVICE_ID_PENWELL 0x0828
+#define PCI_DEVICE_ID_TANGIER 0x11a1
+
+struct mid_pwr_dev {
+ struct pci_dev *pdev;
+ pci_power_t state;
+};
+
+struct mid_pwr {
+ struct device *dev;
+ void __iomem *regs;
+ int irq;
+ bool available;
+
+ struct mutex lock;
+ struct mid_pwr_dev lss[LSS_MAX_DEVS][LSS_MAX_SHARED_DEVS];
+};
+
+static struct mid_pwr *midpwr;
+
+static u32 mid_pwr_get_state(struct mid_pwr *pwr, int reg)
+{
+ return readl(pwr->regs + PM_SSS(reg));
+}
+
+static void mid_pwr_set_state(struct mid_pwr *pwr, int reg, u32 value)
+{
+ writel(value, pwr->regs + PM_SSC(reg));
+}
+
+static void mid_pwr_set_wake(struct mid_pwr *pwr, int reg, u32 value)
+{
+ writel(value, pwr->regs + PM_WKC(reg));
+}
+
+static void mid_pwr_interrupt_disable(struct mid_pwr *pwr)
+{
+ writel(~PM_ICS_IE, pwr->regs + PM_ICS);
+}
+
+static bool mid_pwr_is_busy(struct mid_pwr *pwr)
+{
+ return !!(readl(pwr->regs + PM_STS) & PM_STS_BUSY);
+}
+
+/* Wait 500ms that the latest PWRMU command finished */
+static int mid_pwr_wait(struct mid_pwr *pwr)
+{
+ unsigned int count = 500000;
+ bool busy;
+
+ do {
+ busy = mid_pwr_is_busy(pwr);
+ if (!busy)
+ return 0;
+ udelay(1);
+ } while (--count);
+
+ return -EBUSY;
+}
+
+static int mid_pwr_wait_for_cmd(struct mid_pwr *pwr, u8 cmd)
+{
+ writel(PM_CMD_CMD(cmd) | PM_CMD_CM_IMMEDIATE, pwr->regs + PM_CMD);
+ return mid_pwr_wait(pwr);
+}
+
+static int __update_power_state(struct mid_pwr *pwr, int reg, int bit, int new)
+{
+ int curstate;
+ u32 power;
+ int ret;
+
+ /* Check if the device is already in desired state */
+ power = mid_pwr_get_state(pwr, reg);
+ curstate = (power >> bit) & 3;
+ if (curstate == new)
+ return 0;
+
+ /* Update the power state */
+ mid_pwr_set_state(pwr, reg, (power & ~(3 << bit)) | (new << bit));
+
+ /* Send command to SCU */
+ ret = mid_pwr_wait_for_cmd(pwr, CMD_SET_CFG);
+ if (ret)
+ return ret;
+
+ /* Check if the device is already in desired state */
+ power = mid_pwr_get_state(pwr, reg);
+ curstate = (power >> bit) & 3;
+ if (curstate != new)
+ return -EAGAIN;
+
+ return 0;
+}
+
+static pci_power_t __find_weakest_power_state(struct mid_pwr_dev *lss,
+ struct pci_dev *pdev,
+ pci_power_t state)
+{
+ pci_power_t weakest = PCI_D3hot;
+ unsigned int j;
+
+ /* Find device in cache or first free cell */
+ for (j = 0; j < LSS_MAX_SHARED_DEVS; j++) {
+ if (lss[j].pdev == pdev || !lss[j].pdev)
+ break;
+ }
+
+ /* Store the desired state in cache */
+ if (j < LSS_MAX_SHARED_DEVS) {
+ lss[j].pdev = pdev;
+ lss[j].state = state;
+ } else {
+ dev_WARN(&pdev->dev, "No room for device in PWRMU LSS cache\n");
+ weakest = state;
+ }
+
+ /* Find the power state we may use */
+ for (j = 0; j < LSS_MAX_SHARED_DEVS; j++) {
+ if (lss[j].state < weakest)
+ weakest = lss[j].state;
+ }
+
+ return weakest;
+}
+
+static int __set_power_state(struct mid_pwr *pwr, struct pci_dev *pdev,
+ pci_power_t state, int id, int reg, int bit)
+{
+ const char *name;
+ int ret;
+
+ state = __find_weakest_power_state(pwr->lss[id], pdev, state);
+ name = pci_power_name(state);
+
+ ret = __update_power_state(pwr, reg, bit, (__force int)state);
+ if (ret) {
+ dev_warn(&pdev->dev, "Can't set power state %s: %d\n", name, ret);
+ return ret;
+ }
+
+ dev_vdbg(&pdev->dev, "Set power state %s\n", name);
+ return 0;
+}
+
+static int mid_pwr_set_power_state(struct mid_pwr *pwr, struct pci_dev *pdev,
+ pci_power_t state)
+{
+ int id, reg, bit;
+ int ret;
+
+ id = intel_mid_pwr_get_lss_id(pdev);
+ if (id < 0)
+ return id;
+
+ reg = (id * LSS_PWS_BITS) / 32;
+ bit = (id * LSS_PWS_BITS) % 32;
+
+ /* We support states between PCI_D0 and PCI_D3hot */
+ if (state < PCI_D0)
+ state = PCI_D0;
+ if (state > PCI_D3hot)
+ state = PCI_D3hot;
+
+ mutex_lock(&pwr->lock);
+ ret = __set_power_state(pwr, pdev, state, id, reg, bit);
+ mutex_unlock(&pwr->lock);
+ return ret;
+}
+
+int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state)
+{
+ struct mid_pwr *pwr = midpwr;
+ int ret = 0;
+
+ might_sleep();
+
+ if (pwr && pwr->available)
+ ret = mid_pwr_set_power_state(pwr, pdev, state);
+ dev_vdbg(&pdev->dev, "set_power_state() returns %d\n", ret);
+
+ return 0;
+}
+
+pci_power_t intel_mid_pci_get_power_state(struct pci_dev *pdev)
+{
+ struct mid_pwr *pwr = midpwr;
+ int id, reg, bit;
+ u32 power;
+
+ if (!pwr || !pwr->available)
+ return PCI_UNKNOWN;
+
+ id = intel_mid_pwr_get_lss_id(pdev);
+ if (id < 0)
+ return PCI_UNKNOWN;
+
+ reg = (id * LSS_PWS_BITS) / 32;
+ bit = (id * LSS_PWS_BITS) % 32;
+ power = mid_pwr_get_state(pwr, reg);
+ return (__force pci_power_t)((power >> bit) & 3);
+}
+
+void intel_mid_pwr_power_off(void)
+{
+ struct mid_pwr *pwr = midpwr;
+ u32 cmd = PM_CMD_SYS_STATE_S5 |
+ PM_CMD_CMD(CMD_SET_CFG) |
+ PM_CMD_CM_TRIGGER |
+ PM_CMD_CFG_TRIGGER_NC |
+ TRIGGER_NC_MSG_2;
+
+ /* Send command to SCU */
+ writel(cmd, pwr->regs + PM_CMD);
+ mid_pwr_wait(pwr);
+}
+
+int intel_mid_pwr_get_lss_id(struct pci_dev *pdev)
+{
+ int vndr;
+ u8 id;
+
+ /*
+ * Mapping to PWRMU index is kept in the Logical SubSystem ID byte of
+ * Vendor capability.
+ */
+ vndr = pci_find_capability(pdev, PCI_CAP_ID_VNDR);
+ if (!vndr)
+ return -EINVAL;
+
+ /* Read the Logical SubSystem ID byte */
+ pci_read_config_byte(pdev, vndr + INTEL_MID_PWR_LSS_OFFSET, &id);
+ if (!(id & INTEL_MID_PWR_LSS_TYPE))
+ return -ENODEV;
+
+ id &= ~INTEL_MID_PWR_LSS_TYPE;
+ if (id >= LSS_MAX_DEVS)
+ return -ERANGE;
+
+ return id;
+}
+
+static irqreturn_t mid_pwr_irq_handler(int irq, void *dev_id)
+{
+ struct mid_pwr *pwr = dev_id;
+ u32 ics;
+
+ ics = readl(pwr->regs + PM_ICS);
+ if (!(ics & PM_ICS_IP))
+ return IRQ_NONE;
+
+ writel(ics | PM_ICS_IP, pwr->regs + PM_ICS);
+
+ dev_warn(pwr->dev, "Unexpected IRQ: %#x\n", PM_ICS_INT_STATUS(ics));
+ return IRQ_HANDLED;
+}
+
+struct mid_pwr_device_info {
+ int (*set_initial_state)(struct mid_pwr *pwr);
+};
+
+static int mid_pwr_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ struct mid_pwr_device_info *info = (void *)id->driver_data;
+ struct device *dev = &pdev->dev;
+ struct mid_pwr *pwr;
+ int ret;
+
+ ret = pcim_enable_device(pdev);
+ if (ret < 0) {
+ dev_err(&pdev->dev, "error: could not enable device\n");
+ return ret;
+ }
+
+ ret = pcim_iomap_regions(pdev, 1 << 0, pci_name(pdev));
+ if (ret) {
+ dev_err(&pdev->dev, "I/O memory remapping failed\n");
+ return ret;
+ }
+
+ pwr = devm_kzalloc(dev, sizeof(*pwr), GFP_KERNEL);
+ if (!pwr)
+ return -ENOMEM;
+
+ pwr->dev = dev;
+ pwr->regs = pcim_iomap_table(pdev)[0];
+ pwr->irq = pdev->irq;
+
+ mutex_init(&pwr->lock);
+
+ /* Disable interrupts */
+ mid_pwr_interrupt_disable(pwr);
+
+ if (info && info->set_initial_state) {
+ ret = info->set_initial_state(pwr);
+ if (ret)
+ dev_warn(dev, "Can't set initial state: %d\n", ret);
+ }
+
+ ret = devm_request_irq(dev, pdev->irq, mid_pwr_irq_handler,
+ IRQF_NO_SUSPEND, pci_name(pdev), pwr);
+ if (ret)
+ return ret;
+
+ pwr->available = true;
+ midpwr = pwr;
+
+ pci_set_drvdata(pdev, pwr);
+ return 0;
+}
+
+static int mid_set_initial_state(struct mid_pwr *pwr, const u32 *states)
+{
+ unsigned int i, j;
+ int ret;
+
+ /*
+ * Enable wake events.
+ *
+ * PWRMU supports up to 32 sources for wake up the system. Ungate them
+ * all here.
+ */
+ mid_pwr_set_wake(pwr, 0, 0xffffffff);
+ mid_pwr_set_wake(pwr, 1, 0xffffffff);
+
+ /*
+ * Power off South Complex devices.
+ *
+ * There is a map (see a note below) of 64 devices with 2 bits per each
+ * on 32-bit HW registers. The following calls set all devices to one
+ * known initial state, i.e. PCI_D3hot. This is done in conjunction
+ * with PMCSR setting in arch/x86/pci/intel_mid_pci.c.
+ *
+ * NOTE: The actual device mapping is provided by a platform at run
+ * time using vendor capability of PCI configuration space.
+ */
+ mid_pwr_set_state(pwr, 0, states[0]);
+ mid_pwr_set_state(pwr, 1, states[1]);
+ mid_pwr_set_state(pwr, 2, states[2]);
+ mid_pwr_set_state(pwr, 3, states[3]);
+
+ /* Send command to SCU */
+ ret = mid_pwr_wait_for_cmd(pwr, CMD_SET_CFG);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < LSS_MAX_DEVS; i++) {
+ for (j = 0; j < LSS_MAX_SHARED_DEVS; j++)
+ pwr->lss[i][j].state = PCI_D3hot;
+ }
+
+ return 0;
+}
+
+static int pnw_set_initial_state(struct mid_pwr *pwr)
+{
+ /* On Penwell SRAM must stay powered on */
+ static const u32 states[] = {
+ 0xf00fffff, /* PM_SSC(0) */
+ 0xffffffff, /* PM_SSC(1) */
+ 0xffffffff, /* PM_SSC(2) */
+ 0xffffffff, /* PM_SSC(3) */
+ };
+ return mid_set_initial_state(pwr, states);
+}
+
+static int tng_set_initial_state(struct mid_pwr *pwr)
+{
+ static const u32 states[] = {
+ 0xffffffff, /* PM_SSC(0) */
+ 0xffffffff, /* PM_SSC(1) */
+ 0xffffffff, /* PM_SSC(2) */
+ 0xffffffff, /* PM_SSC(3) */
+ };
+ return mid_set_initial_state(pwr, states);
+}
+
+static const struct mid_pwr_device_info pnw_info = {
+ .set_initial_state = pnw_set_initial_state,
+};
+
+static const struct mid_pwr_device_info tng_info = {
+ .set_initial_state = tng_set_initial_state,
+};
+
+/* This table should be in sync with the one in drivers/pci/pci-mid.c */
+static const struct pci_device_id mid_pwr_pci_ids[] = {
+ { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_PENWELL), (kernel_ulong_t)&pnw_info },
+ { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_TANGIER), (kernel_ulong_t)&tng_info },
+ {}
+};
+
+static struct pci_driver mid_pwr_pci_driver = {
+ .name = "intel_mid_pwr",
+ .probe = mid_pwr_probe,
+ .id_table = mid_pwr_pci_ids,
+};
+
+builtin_pci_driver(mid_pwr_pci_driver);
diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c
new file mode 100644
index 000000000..7be1e1fe9
--- /dev/null
+++ b/arch/x86/platform/intel-mid/sfi.c
@@ -0,0 +1,547 @@
+/*
+ * intel_mid_sfi.c: Intel MID SFI initialization code
+ *
+ * (C) Copyright 2013 Intel Corporation
+ * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/scatterlist.h>
+#include <linux/sfi.h>
+#include <linux/spi/spi.h>
+#include <linux/i2c.h>
+#include <linux/skbuff.h>
+#include <linux/gpio.h>
+#include <linux/gpio_keys.h>
+#include <linux/input.h>
+#include <linux/platform_device.h>
+#include <linux/irq.h>
+#include <linux/export.h>
+#include <linux/notifier.h>
+#include <linux/mmc/core.h>
+#include <linux/mmc/card.h>
+#include <linux/blkdev.h>
+
+#include <asm/setup.h>
+#include <asm/mpspec_def.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/intel-mid.h>
+#include <asm/intel_mid_vrtc.h>
+#include <asm/io.h>
+#include <asm/i8259.h>
+#include <asm/intel_scu_ipc.h>
+#include <asm/apb_timer.h>
+#include <asm/reboot.h>
+
+#define SFI_SIG_OEM0 "OEM0"
+#define MAX_IPCDEVS 24
+#define MAX_SCU_SPI 24
+#define MAX_SCU_I2C 24
+
+static struct platform_device *ipc_devs[MAX_IPCDEVS];
+static struct spi_board_info *spi_devs[MAX_SCU_SPI];
+static struct i2c_board_info *i2c_devs[MAX_SCU_I2C];
+static struct sfi_gpio_table_entry *gpio_table;
+static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
+static int ipc_next_dev;
+static int spi_next_dev;
+static int i2c_next_dev;
+static int i2c_bus[MAX_SCU_I2C];
+static int gpio_num_entry;
+static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
+int sfi_mrtc_num;
+int sfi_mtimer_num;
+
+struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
+EXPORT_SYMBOL_GPL(sfi_mrtc_array);
+
+struct blocking_notifier_head intel_scu_notifier =
+ BLOCKING_NOTIFIER_INIT(intel_scu_notifier);
+EXPORT_SYMBOL_GPL(intel_scu_notifier);
+
+#define intel_mid_sfi_get_pdata(dev, priv) \
+ ((dev)->get_platform_data ? (dev)->get_platform_data(priv) : NULL)
+
+/* parse all the mtimer info to a static mtimer array */
+int __init sfi_parse_mtmr(struct sfi_table_header *table)
+{
+ struct sfi_table_simple *sb;
+ struct sfi_timer_table_entry *pentry;
+ struct mpc_intsrc mp_irq;
+ int totallen;
+
+ sb = (struct sfi_table_simple *)table;
+ if (!sfi_mtimer_num) {
+ sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb,
+ struct sfi_timer_table_entry);
+ pentry = (struct sfi_timer_table_entry *) sb->pentry;
+ totallen = sfi_mtimer_num * sizeof(*pentry);
+ memcpy(sfi_mtimer_array, pentry, totallen);
+ }
+
+ pr_debug("SFI MTIMER info (num = %d):\n", sfi_mtimer_num);
+ pentry = sfi_mtimer_array;
+ for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
+ pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz, irq = %d\n",
+ totallen, (u32)pentry->phys_addr,
+ pentry->freq_hz, pentry->irq);
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = MP_IRQTRIG_EDGE | MP_IRQPOL_ACTIVE_HIGH;
+ mp_irq.srcbus = MP_BUS_ISA;
+ mp_irq.srcbusirq = pentry->irq; /* IRQ */
+ mp_irq.dstapic = MP_APIC_ALL;
+ mp_irq.dstirq = pentry->irq;
+ mp_save_irq(&mp_irq);
+ mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL);
+ }
+
+ return 0;
+}
+
+struct sfi_timer_table_entry *sfi_get_mtmr(int hint)
+{
+ int i;
+ if (hint < sfi_mtimer_num) {
+ if (!sfi_mtimer_usage[hint]) {
+ pr_debug("hint taken for timer %d irq %d\n",
+ hint, sfi_mtimer_array[hint].irq);
+ sfi_mtimer_usage[hint] = 1;
+ return &sfi_mtimer_array[hint];
+ }
+ }
+ /* take the first timer available */
+ for (i = 0; i < sfi_mtimer_num;) {
+ if (!sfi_mtimer_usage[i]) {
+ sfi_mtimer_usage[i] = 1;
+ return &sfi_mtimer_array[i];
+ }
+ i++;
+ }
+ return NULL;
+}
+
+void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr)
+{
+ int i;
+ for (i = 0; i < sfi_mtimer_num;) {
+ if (mtmr->irq == sfi_mtimer_array[i].irq) {
+ sfi_mtimer_usage[i] = 0;
+ return;
+ }
+ i++;
+ }
+}
+
+/* parse all the mrtc info to a global mrtc array */
+int __init sfi_parse_mrtc(struct sfi_table_header *table)
+{
+ struct sfi_table_simple *sb;
+ struct sfi_rtc_table_entry *pentry;
+ struct mpc_intsrc mp_irq;
+
+ int totallen;
+
+ sb = (struct sfi_table_simple *)table;
+ if (!sfi_mrtc_num) {
+ sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb,
+ struct sfi_rtc_table_entry);
+ pentry = (struct sfi_rtc_table_entry *)sb->pentry;
+ totallen = sfi_mrtc_num * sizeof(*pentry);
+ memcpy(sfi_mrtc_array, pentry, totallen);
+ }
+
+ pr_debug("SFI RTC info (num = %d):\n", sfi_mrtc_num);
+ pentry = sfi_mrtc_array;
+ for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
+ pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n",
+ totallen, (u32)pentry->phys_addr, pentry->irq);
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = MP_IRQTRIG_LEVEL | MP_IRQPOL_ACTIVE_LOW;
+ mp_irq.srcbus = MP_BUS_ISA;
+ mp_irq.srcbusirq = pentry->irq; /* IRQ */
+ mp_irq.dstapic = MP_APIC_ALL;
+ mp_irq.dstirq = pentry->irq;
+ mp_save_irq(&mp_irq);
+ mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL);
+ }
+ return 0;
+}
+
+
+/*
+ * Parsing GPIO table first, since the DEVS table will need this table
+ * to map the pin name to the actual pin.
+ */
+static int __init sfi_parse_gpio(struct sfi_table_header *table)
+{
+ struct sfi_table_simple *sb;
+ struct sfi_gpio_table_entry *pentry;
+ int num, i;
+
+ if (gpio_table)
+ return 0;
+ sb = (struct sfi_table_simple *)table;
+ num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry);
+ pentry = (struct sfi_gpio_table_entry *)sb->pentry;
+
+ gpio_table = kmemdup(pentry, num * sizeof(*pentry), GFP_KERNEL);
+ if (!gpio_table)
+ return -1;
+ gpio_num_entry = num;
+
+ pr_debug("GPIO pin info:\n");
+ for (i = 0; i < num; i++, pentry++)
+ pr_debug("info[%2d]: controller = %16.16s, pin_name = %16.16s,"
+ " pin = %d\n", i,
+ pentry->controller_name,
+ pentry->pin_name,
+ pentry->pin_no);
+ return 0;
+}
+
+int get_gpio_by_name(const char *name)
+{
+ struct sfi_gpio_table_entry *pentry = gpio_table;
+ int i;
+
+ if (!pentry)
+ return -1;
+ for (i = 0; i < gpio_num_entry; i++, pentry++) {
+ if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN))
+ return pentry->pin_no;
+ }
+ return -EINVAL;
+}
+
+static void __init intel_scu_ipc_device_register(struct platform_device *pdev)
+{
+ if (ipc_next_dev == MAX_IPCDEVS)
+ pr_err("too many SCU IPC devices");
+ else
+ ipc_devs[ipc_next_dev++] = pdev;
+}
+
+static void __init intel_scu_spi_device_register(struct spi_board_info *sdev)
+{
+ struct spi_board_info *new_dev;
+
+ if (spi_next_dev == MAX_SCU_SPI) {
+ pr_err("too many SCU SPI devices");
+ return;
+ }
+
+ new_dev = kzalloc(sizeof(*sdev), GFP_KERNEL);
+ if (!new_dev) {
+ pr_err("failed to alloc mem for delayed spi dev %s\n",
+ sdev->modalias);
+ return;
+ }
+ *new_dev = *sdev;
+
+ spi_devs[spi_next_dev++] = new_dev;
+}
+
+static void __init intel_scu_i2c_device_register(int bus,
+ struct i2c_board_info *idev)
+{
+ struct i2c_board_info *new_dev;
+
+ if (i2c_next_dev == MAX_SCU_I2C) {
+ pr_err("too many SCU I2C devices");
+ return;
+ }
+
+ new_dev = kzalloc(sizeof(*idev), GFP_KERNEL);
+ if (!new_dev) {
+ pr_err("failed to alloc mem for delayed i2c dev %s\n",
+ idev->type);
+ return;
+ }
+ *new_dev = *idev;
+
+ i2c_bus[i2c_next_dev] = bus;
+ i2c_devs[i2c_next_dev++] = new_dev;
+}
+
+/* Called by IPC driver */
+void intel_scu_devices_create(void)
+{
+ int i;
+
+ for (i = 0; i < ipc_next_dev; i++)
+ platform_device_add(ipc_devs[i]);
+
+ for (i = 0; i < spi_next_dev; i++)
+ spi_register_board_info(spi_devs[i], 1);
+
+ for (i = 0; i < i2c_next_dev; i++) {
+ struct i2c_adapter *adapter;
+ struct i2c_client *client;
+
+ adapter = i2c_get_adapter(i2c_bus[i]);
+ if (adapter) {
+ client = i2c_new_device(adapter, i2c_devs[i]);
+ if (!client)
+ pr_err("can't create i2c device %s\n",
+ i2c_devs[i]->type);
+ } else
+ i2c_register_board_info(i2c_bus[i], i2c_devs[i], 1);
+ }
+ intel_scu_notifier_post(SCU_AVAILABLE, NULL);
+}
+EXPORT_SYMBOL_GPL(intel_scu_devices_create);
+
+/* Called by IPC driver */
+void intel_scu_devices_destroy(void)
+{
+ int i;
+
+ intel_scu_notifier_post(SCU_DOWN, NULL);
+
+ for (i = 0; i < ipc_next_dev; i++)
+ platform_device_del(ipc_devs[i]);
+}
+EXPORT_SYMBOL_GPL(intel_scu_devices_destroy);
+
+static void __init install_irq_resource(struct platform_device *pdev, int irq)
+{
+ /* Single threaded */
+ static struct resource res __initdata = {
+ .name = "IRQ",
+ .flags = IORESOURCE_IRQ,
+ };
+ res.start = irq;
+ platform_device_add_resources(pdev, &res, 1);
+}
+
+static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *pentry,
+ struct devs_id *dev)
+{
+ struct platform_device *pdev;
+ void *pdata = NULL;
+
+ pr_debug("IPC bus, name = %16.16s, irq = 0x%2x\n",
+ pentry->name, pentry->irq);
+
+ /*
+ * We need to call platform init of IPC devices to fill misc_pdata
+ * structure. It will be used in msic_init for initialization.
+ */
+ pdata = intel_mid_sfi_get_pdata(dev, pentry);
+ if (IS_ERR(pdata))
+ return;
+
+ /*
+ * On Medfield the platform device creation is handled by the MSIC
+ * MFD driver so we don't need to do it here.
+ */
+ if (dev->msic && intel_mid_has_msic())
+ return;
+
+ pdev = platform_device_alloc(pentry->name, 0);
+ if (pdev == NULL) {
+ pr_err("out of memory for SFI platform device '%s'.\n",
+ pentry->name);
+ return;
+ }
+ install_irq_resource(pdev, pentry->irq);
+
+ pdev->dev.platform_data = pdata;
+ if (dev->delay)
+ intel_scu_ipc_device_register(pdev);
+ else
+ platform_device_add(pdev);
+}
+
+static void __init sfi_handle_spi_dev(struct sfi_device_table_entry *pentry,
+ struct devs_id *dev)
+{
+ struct spi_board_info spi_info;
+ void *pdata = NULL;
+
+ memset(&spi_info, 0, sizeof(spi_info));
+ strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN);
+ spi_info.irq = ((pentry->irq == (u8)0xff) ? 0 : pentry->irq);
+ spi_info.bus_num = pentry->host_num;
+ spi_info.chip_select = pentry->addr;
+ spi_info.max_speed_hz = pentry->max_freq;
+ pr_debug("SPI bus=%d, name=%16.16s, irq=0x%2x, max_freq=%d, cs=%d\n",
+ spi_info.bus_num,
+ spi_info.modalias,
+ spi_info.irq,
+ spi_info.max_speed_hz,
+ spi_info.chip_select);
+
+ pdata = intel_mid_sfi_get_pdata(dev, &spi_info);
+ if (IS_ERR(pdata))
+ return;
+
+ spi_info.platform_data = pdata;
+ if (dev->delay)
+ intel_scu_spi_device_register(&spi_info);
+ else
+ spi_register_board_info(&spi_info, 1);
+}
+
+static void __init sfi_handle_i2c_dev(struct sfi_device_table_entry *pentry,
+ struct devs_id *dev)
+{
+ struct i2c_board_info i2c_info;
+ void *pdata = NULL;
+
+ memset(&i2c_info, 0, sizeof(i2c_info));
+ strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN);
+ i2c_info.irq = ((pentry->irq == (u8)0xff) ? 0 : pentry->irq);
+ i2c_info.addr = pentry->addr;
+ pr_debug("I2C bus = %d, name = %16.16s, irq = 0x%2x, addr = 0x%x\n",
+ pentry->host_num,
+ i2c_info.type,
+ i2c_info.irq,
+ i2c_info.addr);
+ pdata = intel_mid_sfi_get_pdata(dev, &i2c_info);
+ i2c_info.platform_data = pdata;
+ if (IS_ERR(pdata))
+ return;
+
+ if (dev->delay)
+ intel_scu_i2c_device_register(pentry->host_num, &i2c_info);
+ else
+ i2c_register_board_info(pentry->host_num, &i2c_info, 1);
+}
+
+static void __init sfi_handle_sd_dev(struct sfi_device_table_entry *pentry,
+ struct devs_id *dev)
+{
+ struct mid_sd_board_info sd_info;
+ void *pdata;
+
+ memset(&sd_info, 0, sizeof(sd_info));
+ strncpy(sd_info.name, pentry->name, SFI_NAME_LEN);
+ sd_info.bus_num = pentry->host_num;
+ sd_info.max_clk = pentry->max_freq;
+ sd_info.addr = pentry->addr;
+ pr_debug("SD bus = %d, name = %16.16s, max_clk = %d, addr = 0x%x\n",
+ sd_info.bus_num,
+ sd_info.name,
+ sd_info.max_clk,
+ sd_info.addr);
+ pdata = intel_mid_sfi_get_pdata(dev, &sd_info);
+ if (IS_ERR(pdata))
+ return;
+
+ /* Nothing we can do with this for now */
+ sd_info.platform_data = pdata;
+
+ pr_debug("Successfully registered %16.16s", sd_info.name);
+}
+
+extern struct devs_id *const __x86_intel_mid_dev_start[],
+ *const __x86_intel_mid_dev_end[];
+
+static struct devs_id __init *get_device_id(u8 type, char *name)
+{
+ struct devs_id *const *dev_table;
+
+ for (dev_table = __x86_intel_mid_dev_start;
+ dev_table < __x86_intel_mid_dev_end; dev_table++) {
+ struct devs_id *dev = *dev_table;
+ if (dev->type == type &&
+ !strncmp(dev->name, name, SFI_NAME_LEN)) {
+ return dev;
+ }
+ }
+
+ return NULL;
+}
+
+static int __init sfi_parse_devs(struct sfi_table_header *table)
+{
+ struct sfi_table_simple *sb;
+ struct sfi_device_table_entry *pentry;
+ struct devs_id *dev = NULL;
+ int num, i, ret;
+ int polarity;
+ struct irq_alloc_info info;
+
+ sb = (struct sfi_table_simple *)table;
+ num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry);
+ pentry = (struct sfi_device_table_entry *)sb->pentry;
+
+ for (i = 0; i < num; i++, pentry++) {
+ int irq = pentry->irq;
+
+ if (irq != (u8)0xff) { /* native RTE case */
+ /* these SPI2 devices are not exposed to system as PCI
+ * devices, but they have separate RTE entry in IOAPIC
+ * so we have to enable them one by one here
+ */
+ if (intel_mid_identify_cpu() ==
+ INTEL_MID_CPU_CHIP_TANGIER) {
+ if (!strncmp(pentry->name, "r69001-ts-i2c", 13))
+ /* active low */
+ polarity = 1;
+ else if (!strncmp(pentry->name,
+ "synaptics_3202", 14))
+ /* active low */
+ polarity = 1;
+ else if (irq == 41)
+ /* fast_int_1 */
+ polarity = 1;
+ else
+ /* active high */
+ polarity = 0;
+ } else {
+ /* PNW and CLV go with active low */
+ polarity = 1;
+ }
+
+ ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, polarity);
+ ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC, &info);
+ WARN_ON(ret < 0);
+ }
+
+ dev = get_device_id(pentry->type, pentry->name);
+
+ if (!dev)
+ continue;
+
+ switch (pentry->type) {
+ case SFI_DEV_TYPE_IPC:
+ sfi_handle_ipc_dev(pentry, dev);
+ break;
+ case SFI_DEV_TYPE_SPI:
+ sfi_handle_spi_dev(pentry, dev);
+ break;
+ case SFI_DEV_TYPE_I2C:
+ sfi_handle_i2c_dev(pentry, dev);
+ break;
+ case SFI_DEV_TYPE_SD:
+ sfi_handle_sd_dev(pentry, dev);
+ break;
+ case SFI_DEV_TYPE_UART:
+ case SFI_DEV_TYPE_HSI:
+ default:
+ break;
+ }
+ }
+ return 0;
+}
+
+static int __init intel_mid_platform_init(void)
+{
+ sfi_table_parse(SFI_SIG_GPIO, NULL, NULL, sfi_parse_gpio);
+ sfi_table_parse(SFI_SIG_DEVS, NULL, NULL, sfi_parse_devs);
+ return 0;
+}
+arch_initcall(intel_mid_platform_init);
diff --git a/arch/x86/platform/intel-quark/Makefile b/arch/x86/platform/intel-quark/Makefile
new file mode 100644
index 000000000..9cc57ed36
--- /dev/null
+++ b/arch/x86/platform/intel-quark/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_INTEL_IMR) += imr.o
+obj-$(CONFIG_DEBUG_IMR_SELFTEST) += imr_selftest.o
diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c
new file mode 100644
index 000000000..49828c270
--- /dev/null
+++ b/arch/x86/platform/intel-quark/imr.c
@@ -0,0 +1,600 @@
+/**
+ * imr.c -- Intel Isolated Memory Region driver
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie>
+ *
+ * IMR registers define an isolated region of memory that can
+ * be masked to prohibit certain system agents from accessing memory.
+ * When a device behind a masked port performs an access - snooped or
+ * not, an IMR may optionally prevent that transaction from changing
+ * the state of memory or from getting correct data in response to the
+ * operation.
+ *
+ * Write data will be dropped and reads will return 0xFFFFFFFF, the
+ * system will reset and system BIOS will print out an error message to
+ * inform the user that an IMR has been violated.
+ *
+ * This code is based on the Linux MTRR code and reference code from
+ * Intel's Quark BSP EFI, Linux and grub code.
+ *
+ * See quark-x1000-datasheet.pdf for register definitions.
+ * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/quark-x1000-datasheet.pdf
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <asm-generic/sections.h>
+#include <asm/cpu_device_id.h>
+#include <asm/imr.h>
+#include <asm/iosf_mbi.h>
+#include <linux/debugfs.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+
+struct imr_device {
+ struct dentry *file;
+ bool init;
+ struct mutex lock;
+ int max_imr;
+ int reg_base;
+};
+
+static struct imr_device imr_dev;
+
+/*
+ * IMR read/write mask control registers.
+ * See quark-x1000-datasheet.pdf sections 12.7.4.5 and 12.7.4.6 for
+ * bit definitions.
+ *
+ * addr_hi
+ * 31 Lock bit
+ * 30:24 Reserved
+ * 23:2 1 KiB aligned lo address
+ * 1:0 Reserved
+ *
+ * addr_hi
+ * 31:24 Reserved
+ * 23:2 1 KiB aligned hi address
+ * 1:0 Reserved
+ */
+#define IMR_LOCK BIT(31)
+
+struct imr_regs {
+ u32 addr_lo;
+ u32 addr_hi;
+ u32 rmask;
+ u32 wmask;
+};
+
+#define IMR_NUM_REGS (sizeof(struct imr_regs)/sizeof(u32))
+#define IMR_SHIFT 8
+#define imr_to_phys(x) ((x) << IMR_SHIFT)
+#define phys_to_imr(x) ((x) >> IMR_SHIFT)
+
+/**
+ * imr_is_enabled - true if an IMR is enabled false otherwise.
+ *
+ * Determines if an IMR is enabled based on address range and read/write
+ * mask. An IMR set with an address range set to zero and a read/write
+ * access mask set to all is considered to be disabled. An IMR in any
+ * other state - for example set to zero but without read/write access
+ * all is considered to be enabled. This definition of disabled is how
+ * firmware switches off an IMR and is maintained in kernel for
+ * consistency.
+ *
+ * @imr: pointer to IMR descriptor.
+ * @return: true if IMR enabled false if disabled.
+ */
+static inline int imr_is_enabled(struct imr_regs *imr)
+{
+ return !(imr->rmask == IMR_READ_ACCESS_ALL &&
+ imr->wmask == IMR_WRITE_ACCESS_ALL &&
+ imr_to_phys(imr->addr_lo) == 0 &&
+ imr_to_phys(imr->addr_hi) == 0);
+}
+
+/**
+ * imr_read - read an IMR at a given index.
+ *
+ * Requires caller to hold imr mutex.
+ *
+ * @idev: pointer to imr_device structure.
+ * @imr_id: IMR entry to read.
+ * @imr: IMR structure representing address and access masks.
+ * @return: 0 on success or error code passed from mbi_iosf on failure.
+ */
+static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr)
+{
+ u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base;
+ int ret;
+
+ ret = iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->addr_lo);
+ if (ret)
+ return ret;
+
+ ret = iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->addr_hi);
+ if (ret)
+ return ret;
+
+ ret = iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->rmask);
+ if (ret)
+ return ret;
+
+ return iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->wmask);
+}
+
+/**
+ * imr_write - write an IMR at a given index.
+ *
+ * Requires caller to hold imr mutex.
+ * Note lock bits need to be written independently of address bits.
+ *
+ * @idev: pointer to imr_device structure.
+ * @imr_id: IMR entry to write.
+ * @imr: IMR structure representing address and access masks.
+ * @return: 0 on success or error code passed from mbi_iosf on failure.
+ */
+static int imr_write(struct imr_device *idev, u32 imr_id, struct imr_regs *imr)
+{
+ unsigned long flags;
+ u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base;
+ int ret;
+
+ local_irq_save(flags);
+
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->addr_lo);
+ if (ret)
+ goto failed;
+
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->addr_hi);
+ if (ret)
+ goto failed;
+
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->rmask);
+ if (ret)
+ goto failed;
+
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->wmask);
+ if (ret)
+ goto failed;
+
+ local_irq_restore(flags);
+ return 0;
+failed:
+ /*
+ * If writing to the IOSF failed then we're in an unknown state,
+ * likely a very bad state. An IMR in an invalid state will almost
+ * certainly lead to a memory access violation.
+ */
+ local_irq_restore(flags);
+ WARN(ret, "IOSF-MBI write fail range 0x%08x-0x%08x unreliable\n",
+ imr_to_phys(imr->addr_lo), imr_to_phys(imr->addr_hi) + IMR_MASK);
+
+ return ret;
+}
+
+/**
+ * imr_dbgfs_state_show - print state of IMR registers.
+ *
+ * @s: pointer to seq_file for output.
+ * @unused: unused parameter.
+ * @return: 0 on success or error code passed from mbi_iosf on failure.
+ */
+static int imr_dbgfs_state_show(struct seq_file *s, void *unused)
+{
+ phys_addr_t base;
+ phys_addr_t end;
+ int i;
+ struct imr_device *idev = s->private;
+ struct imr_regs imr;
+ size_t size;
+ int ret = -ENODEV;
+
+ mutex_lock(&idev->lock);
+
+ for (i = 0; i < idev->max_imr; i++) {
+
+ ret = imr_read(idev, i, &imr);
+ if (ret)
+ break;
+
+ /*
+ * Remember to add IMR_ALIGN bytes to size to indicate the
+ * inherent IMR_ALIGN size bytes contained in the masked away
+ * lower ten bits.
+ */
+ if (imr_is_enabled(&imr)) {
+ base = imr_to_phys(imr.addr_lo);
+ end = imr_to_phys(imr.addr_hi) + IMR_MASK;
+ size = end - base + 1;
+ } else {
+ base = 0;
+ end = 0;
+ size = 0;
+ }
+ seq_printf(s, "imr%02i: base=%pa, end=%pa, size=0x%08zx "
+ "rmask=0x%08x, wmask=0x%08x, %s, %s\n", i,
+ &base, &end, size, imr.rmask, imr.wmask,
+ imr_is_enabled(&imr) ? "enabled " : "disabled",
+ imr.addr_lo & IMR_LOCK ? "locked" : "unlocked");
+ }
+
+ mutex_unlock(&idev->lock);
+ return ret;
+}
+DEFINE_SHOW_ATTRIBUTE(imr_dbgfs_state);
+
+/**
+ * imr_debugfs_register - register debugfs hooks.
+ *
+ * @idev: pointer to imr_device structure.
+ * @return: 0 on success - errno on failure.
+ */
+static int imr_debugfs_register(struct imr_device *idev)
+{
+ idev->file = debugfs_create_file("imr_state", 0444, NULL, idev,
+ &imr_dbgfs_state_fops);
+ return PTR_ERR_OR_ZERO(idev->file);
+}
+
+/**
+ * imr_check_params - check passed address range IMR alignment and non-zero size
+ *
+ * @base: base address of intended IMR.
+ * @size: size of intended IMR.
+ * @return: zero on valid range -EINVAL on unaligned base/size.
+ */
+static int imr_check_params(phys_addr_t base, size_t size)
+{
+ if ((base & IMR_MASK) || (size & IMR_MASK)) {
+ pr_err("base %pa size 0x%08zx must align to 1KiB\n",
+ &base, size);
+ return -EINVAL;
+ }
+ if (size == 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * imr_raw_size - account for the IMR_ALIGN bytes that addr_hi appends.
+ *
+ * IMR addr_hi has a built in offset of plus IMR_ALIGN (0x400) bytes from the
+ * value in the register. We need to subtract IMR_ALIGN bytes from input sizes
+ * as a result.
+ *
+ * @size: input size bytes.
+ * @return: reduced size.
+ */
+static inline size_t imr_raw_size(size_t size)
+{
+ return size - IMR_ALIGN;
+}
+
+/**
+ * imr_address_overlap - detects an address overlap.
+ *
+ * @addr: address to check against an existing IMR.
+ * @imr: imr being checked.
+ * @return: true for overlap false for no overlap.
+ */
+static inline int imr_address_overlap(phys_addr_t addr, struct imr_regs *imr)
+{
+ return addr >= imr_to_phys(imr->addr_lo) && addr <= imr_to_phys(imr->addr_hi);
+}
+
+/**
+ * imr_add_range - add an Isolated Memory Region.
+ *
+ * @base: physical base address of region aligned to 1KiB.
+ * @size: physical size of region in bytes must be aligned to 1KiB.
+ * @read_mask: read access mask.
+ * @write_mask: write access mask.
+ * @return: zero on success or negative value indicating error.
+ */
+int imr_add_range(phys_addr_t base, size_t size,
+ unsigned int rmask, unsigned int wmask)
+{
+ phys_addr_t end;
+ unsigned int i;
+ struct imr_device *idev = &imr_dev;
+ struct imr_regs imr;
+ size_t raw_size;
+ int reg;
+ int ret;
+
+ if (WARN_ONCE(idev->init == false, "driver not initialized"))
+ return -ENODEV;
+
+ ret = imr_check_params(base, size);
+ if (ret)
+ return ret;
+
+ /* Tweak the size value. */
+ raw_size = imr_raw_size(size);
+ end = base + raw_size;
+
+ /*
+ * Check for reserved IMR value common to firmware, kernel and grub
+ * indicating a disabled IMR.
+ */
+ imr.addr_lo = phys_to_imr(base);
+ imr.addr_hi = phys_to_imr(end);
+ imr.rmask = rmask;
+ imr.wmask = wmask;
+ if (!imr_is_enabled(&imr))
+ return -ENOTSUPP;
+
+ mutex_lock(&idev->lock);
+
+ /*
+ * Find a free IMR while checking for an existing overlapping range.
+ * Note there's no restriction in silicon to prevent IMR overlaps.
+ * For the sake of simplicity and ease in defining/debugging an IMR
+ * memory map we exclude IMR overlaps.
+ */
+ reg = -1;
+ for (i = 0; i < idev->max_imr; i++) {
+ ret = imr_read(idev, i, &imr);
+ if (ret)
+ goto failed;
+
+ /* Find overlap @ base or end of requested range. */
+ ret = -EINVAL;
+ if (imr_is_enabled(&imr)) {
+ if (imr_address_overlap(base, &imr))
+ goto failed;
+ if (imr_address_overlap(end, &imr))
+ goto failed;
+ } else {
+ reg = i;
+ }
+ }
+
+ /* Error out if we have no free IMR entries. */
+ if (reg == -1) {
+ ret = -ENOMEM;
+ goto failed;
+ }
+
+ pr_debug("add %d phys %pa-%pa size %zx mask 0x%08x wmask 0x%08x\n",
+ reg, &base, &end, raw_size, rmask, wmask);
+
+ /* Enable IMR at specified range and access mask. */
+ imr.addr_lo = phys_to_imr(base);
+ imr.addr_hi = phys_to_imr(end);
+ imr.rmask = rmask;
+ imr.wmask = wmask;
+
+ ret = imr_write(idev, reg, &imr);
+ if (ret < 0) {
+ /*
+ * In the highly unlikely event iosf_mbi_write failed
+ * attempt to rollback the IMR setup skipping the trapping
+ * of further IOSF write failures.
+ */
+ imr.addr_lo = 0;
+ imr.addr_hi = 0;
+ imr.rmask = IMR_READ_ACCESS_ALL;
+ imr.wmask = IMR_WRITE_ACCESS_ALL;
+ imr_write(idev, reg, &imr);
+ }
+failed:
+ mutex_unlock(&idev->lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(imr_add_range);
+
+/**
+ * __imr_remove_range - delete an Isolated Memory Region.
+ *
+ * This function allows you to delete an IMR by its index specified by reg or
+ * by address range specified by base and size respectively. If you specify an
+ * index on its own the base and size parameters are ignored.
+ * imr_remove_range(0, base, size); delete IMR at index 0 base/size ignored.
+ * imr_remove_range(-1, base, size); delete IMR from base to base+size.
+ *
+ * @reg: imr index to remove.
+ * @base: physical base address of region aligned to 1 KiB.
+ * @size: physical size of region in bytes aligned to 1 KiB.
+ * @return: -EINVAL on invalid range or out or range id
+ * -ENODEV if reg is valid but no IMR exists or is locked
+ * 0 on success.
+ */
+static int __imr_remove_range(int reg, phys_addr_t base, size_t size)
+{
+ phys_addr_t end;
+ bool found = false;
+ unsigned int i;
+ struct imr_device *idev = &imr_dev;
+ struct imr_regs imr;
+ size_t raw_size;
+ int ret = 0;
+
+ if (WARN_ONCE(idev->init == false, "driver not initialized"))
+ return -ENODEV;
+
+ /*
+ * Validate address range if deleting by address, else we are
+ * deleting by index where base and size will be ignored.
+ */
+ if (reg == -1) {
+ ret = imr_check_params(base, size);
+ if (ret)
+ return ret;
+ }
+
+ /* Tweak the size value. */
+ raw_size = imr_raw_size(size);
+ end = base + raw_size;
+
+ mutex_lock(&idev->lock);
+
+ if (reg >= 0) {
+ /* If a specific IMR is given try to use it. */
+ ret = imr_read(idev, reg, &imr);
+ if (ret)
+ goto failed;
+
+ if (!imr_is_enabled(&imr) || imr.addr_lo & IMR_LOCK) {
+ ret = -ENODEV;
+ goto failed;
+ }
+ found = true;
+ } else {
+ /* Search for match based on address range. */
+ for (i = 0; i < idev->max_imr; i++) {
+ ret = imr_read(idev, i, &imr);
+ if (ret)
+ goto failed;
+
+ if (!imr_is_enabled(&imr) || imr.addr_lo & IMR_LOCK)
+ continue;
+
+ if ((imr_to_phys(imr.addr_lo) == base) &&
+ (imr_to_phys(imr.addr_hi) == end)) {
+ found = true;
+ reg = i;
+ break;
+ }
+ }
+ }
+
+ if (!found) {
+ ret = -ENODEV;
+ goto failed;
+ }
+
+ pr_debug("remove %d phys %pa-%pa size %zx\n", reg, &base, &end, raw_size);
+
+ /* Tear down the IMR. */
+ imr.addr_lo = 0;
+ imr.addr_hi = 0;
+ imr.rmask = IMR_READ_ACCESS_ALL;
+ imr.wmask = IMR_WRITE_ACCESS_ALL;
+
+ ret = imr_write(idev, reg, &imr);
+
+failed:
+ mutex_unlock(&idev->lock);
+ return ret;
+}
+
+/**
+ * imr_remove_range - delete an Isolated Memory Region by address
+ *
+ * This function allows you to delete an IMR by an address range specified
+ * by base and size respectively.
+ * imr_remove_range(base, size); delete IMR from base to base+size.
+ *
+ * @base: physical base address of region aligned to 1 KiB.
+ * @size: physical size of region in bytes aligned to 1 KiB.
+ * @return: -EINVAL on invalid range or out or range id
+ * -ENODEV if reg is valid but no IMR exists or is locked
+ * 0 on success.
+ */
+int imr_remove_range(phys_addr_t base, size_t size)
+{
+ return __imr_remove_range(-1, base, size);
+}
+EXPORT_SYMBOL_GPL(imr_remove_range);
+
+/**
+ * imr_clear - delete an Isolated Memory Region by index
+ *
+ * This function allows you to delete an IMR by an address range specified
+ * by the index of the IMR. Useful for initial sanitization of the IMR
+ * address map.
+ * imr_ge(base, size); delete IMR from base to base+size.
+ *
+ * @reg: imr index to remove.
+ * @return: -EINVAL on invalid range or out or range id
+ * -ENODEV if reg is valid but no IMR exists or is locked
+ * 0 on success.
+ */
+static inline int imr_clear(int reg)
+{
+ return __imr_remove_range(reg, 0, 0);
+}
+
+/**
+ * imr_fixup_memmap - Tear down IMRs used during bootup.
+ *
+ * BIOS and Grub both setup IMRs around compressed kernel, initrd memory
+ * that need to be removed before the kernel hands out one of the IMR
+ * encased addresses to a downstream DMA agent such as the SD or Ethernet.
+ * IMRs on Galileo are setup to immediately reset the system on violation.
+ * As a result if you're running a root filesystem from SD - you'll need
+ * the boot-time IMRs torn down or you'll find seemingly random resets when
+ * using your filesystem.
+ *
+ * @idev: pointer to imr_device structure.
+ * @return:
+ */
+static void __init imr_fixup_memmap(struct imr_device *idev)
+{
+ phys_addr_t base = virt_to_phys(&_text);
+ size_t size = virt_to_phys(&__end_rodata) - base;
+ unsigned long start, end;
+ int i;
+ int ret;
+
+ /* Tear down all existing unlocked IMRs. */
+ for (i = 0; i < idev->max_imr; i++)
+ imr_clear(i);
+
+ start = (unsigned long)_text;
+ end = (unsigned long)__end_rodata - 1;
+
+ /*
+ * Setup an unlocked IMR around the physical extent of the kernel
+ * from the beginning of the .text secton to the end of the
+ * .rodata section as one physically contiguous block.
+ *
+ * We don't round up @size since it is already PAGE_SIZE aligned.
+ * See vmlinux.lds.S for details.
+ */
+ ret = imr_add_range(base, size, IMR_CPU, IMR_CPU);
+ if (ret < 0) {
+ pr_err("unable to setup IMR for kernel: %zu KiB (%lx - %lx)\n",
+ size / 1024, start, end);
+ } else {
+ pr_info("protecting kernel .text - .rodata: %zu KiB (%lx - %lx)\n",
+ size / 1024, start, end);
+ }
+
+}
+
+static const struct x86_cpu_id imr_ids[] __initconst = {
+ { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */
+ {}
+};
+
+/**
+ * imr_init - entry point for IMR driver.
+ *
+ * return: -ENODEV for no IMR support 0 if good to go.
+ */
+static int __init imr_init(void)
+{
+ struct imr_device *idev = &imr_dev;
+ int ret;
+
+ if (!x86_match_cpu(imr_ids) || !iosf_mbi_available())
+ return -ENODEV;
+
+ idev->max_imr = QUARK_X1000_IMR_MAX;
+ idev->reg_base = QUARK_X1000_IMR_REGBASE;
+ idev->init = true;
+
+ mutex_init(&idev->lock);
+ ret = imr_debugfs_register(idev);
+ if (ret != 0)
+ pr_warn("debugfs register failed!\n");
+ imr_fixup_memmap(idev);
+ return 0;
+}
+device_initcall(imr_init);
diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c
new file mode 100644
index 000000000..42f879b75
--- /dev/null
+++ b/arch/x86/platform/intel-quark/imr_selftest.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+/**
+ * imr_selftest.c -- Intel Isolated Memory Region self-test driver
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie>
+ *
+ * IMR self test. The purpose of this module is to run a set of tests on the
+ * IMR API to validate it's sanity. We check for overlapping, reserved
+ * addresses and setup/teardown sanity.
+ *
+ */
+
+#include <asm-generic/sections.h>
+#include <asm/cpu_device_id.h>
+#include <asm/imr.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+
+#define SELFTEST KBUILD_MODNAME ": "
+/**
+ * imr_self_test_result - Print result string for self test.
+ *
+ * @res: result code - true if test passed false otherwise.
+ * @fmt: format string.
+ * ... variadic argument list.
+ */
+static __printf(2, 3)
+void __init imr_self_test_result(int res, const char *fmt, ...)
+{
+ va_list vlist;
+
+ /* Print pass/fail. */
+ if (res)
+ pr_info(SELFTEST "pass ");
+ else
+ pr_info(SELFTEST "fail ");
+
+ /* Print variable string. */
+ va_start(vlist, fmt);
+ vprintk(fmt, vlist);
+ va_end(vlist);
+
+ /* Optional warning. */
+ WARN(res == 0, "test failed");
+}
+#undef SELFTEST
+
+/**
+ * imr_self_test
+ *
+ * Verify IMR self_test with some simple tests to verify overlap,
+ * zero sized allocations and 1 KiB sized areas.
+ *
+ */
+static void __init imr_self_test(void)
+{
+ phys_addr_t base = virt_to_phys(&_text);
+ size_t size = virt_to_phys(&__end_rodata) - base;
+ const char *fmt_over = "overlapped IMR @ (0x%08lx - 0x%08lx)\n";
+ int ret;
+
+ /* Test zero zero. */
+ ret = imr_add_range(0, 0, 0, 0);
+ imr_self_test_result(ret < 0, "zero sized IMR\n");
+
+ /* Test exact overlap. */
+ ret = imr_add_range(base, size, IMR_CPU, IMR_CPU);
+ imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));
+
+ /* Test overlap with base inside of existing. */
+ base += size - IMR_ALIGN;
+ ret = imr_add_range(base, size, IMR_CPU, IMR_CPU);
+ imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));
+
+ /* Test overlap with end inside of existing. */
+ base -= size + IMR_ALIGN * 2;
+ ret = imr_add_range(base, size, IMR_CPU, IMR_CPU);
+ imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));
+
+ /* Test that a 1 KiB IMR @ zero with read/write all will bomb out. */
+ ret = imr_add_range(0, IMR_ALIGN, IMR_READ_ACCESS_ALL,
+ IMR_WRITE_ACCESS_ALL);
+ imr_self_test_result(ret < 0, "1KiB IMR @ 0x00000000 - access-all\n");
+
+ /* Test that a 1 KiB IMR @ zero with CPU only will work. */
+ ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU);
+ imr_self_test_result(ret >= 0, "1KiB IMR @ 0x00000000 - cpu-access\n");
+ if (ret >= 0) {
+ ret = imr_remove_range(0, IMR_ALIGN);
+ imr_self_test_result(ret == 0, "teardown - cpu-access\n");
+ }
+
+ /* Test 2 KiB works. */
+ size = IMR_ALIGN * 2;
+ ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL, IMR_WRITE_ACCESS_ALL);
+ imr_self_test_result(ret >= 0, "2KiB IMR @ 0x00000000\n");
+ if (ret >= 0) {
+ ret = imr_remove_range(0, size);
+ imr_self_test_result(ret == 0, "teardown 2KiB\n");
+ }
+}
+
+static const struct x86_cpu_id imr_ids[] __initconst = {
+ { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */
+ {}
+};
+
+/**
+ * imr_self_test_init - entry point for IMR driver.
+ *
+ * return: -ENODEV for no IMR support 0 if good to go.
+ */
+static int __init imr_self_test_init(void)
+{
+ if (x86_match_cpu(imr_ids))
+ imr_self_test();
+ return 0;
+}
+
+/**
+ * imr_self_test_exit - exit point for IMR code.
+ *
+ * return:
+ */
+device_initcall(imr_self_test_init);
diff --git a/arch/x86/platform/intel/Makefile b/arch/x86/platform/intel/Makefile
new file mode 100644
index 000000000..b878032fb
--- /dev/null
+++ b/arch/x86/platform/intel/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o
diff --git a/arch/x86/platform/intel/iosf_mbi.c b/arch/x86/platform/intel/iosf_mbi.c
new file mode 100644
index 000000000..6f37a2137
--- /dev/null
+++ b/arch/x86/platform/intel/iosf_mbi.c
@@ -0,0 +1,391 @@
+/*
+ * IOSF-SB MailBox Interface Driver
+ * Copyright (c) 2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ *
+ * The IOSF-SB is a fabric bus available on Atom based SOC's that uses a
+ * mailbox interface (MBI) to communicate with mutiple devices. This
+ * driver implements access to this interface for those platforms that can
+ * enumerate the device using PCI.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/debugfs.h>
+#include <linux/capability.h>
+
+#include <asm/iosf_mbi.h>
+
+#define PCI_DEVICE_ID_BAYTRAIL 0x0F00
+#define PCI_DEVICE_ID_BRASWELL 0x2280
+#define PCI_DEVICE_ID_QUARK_X1000 0x0958
+#define PCI_DEVICE_ID_TANGIER 0x1170
+
+static struct pci_dev *mbi_pdev;
+static DEFINE_SPINLOCK(iosf_mbi_lock);
+static DEFINE_MUTEX(iosf_mbi_punit_mutex);
+static BLOCKING_NOTIFIER_HEAD(iosf_mbi_pmic_bus_access_notifier);
+
+static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset)
+{
+ return (op << 24) | (port << 16) | (offset << 8) | MBI_ENABLE;
+}
+
+static int iosf_mbi_pci_read_mdr(u32 mcrx, u32 mcr, u32 *mdr)
+{
+ int result;
+
+ if (!mbi_pdev)
+ return -ENODEV;
+
+ if (mcrx) {
+ result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
+ mcrx);
+ if (result < 0)
+ goto fail_read;
+ }
+
+ result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
+ if (result < 0)
+ goto fail_read;
+
+ result = pci_read_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
+ if (result < 0)
+ goto fail_read;
+
+ return 0;
+
+fail_read:
+ dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
+ return result;
+}
+
+static int iosf_mbi_pci_write_mdr(u32 mcrx, u32 mcr, u32 mdr)
+{
+ int result;
+
+ if (!mbi_pdev)
+ return -ENODEV;
+
+ result = pci_write_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
+ if (result < 0)
+ goto fail_write;
+
+ if (mcrx) {
+ result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
+ mcrx);
+ if (result < 0)
+ goto fail_write;
+ }
+
+ result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
+ if (result < 0)
+ goto fail_write;
+
+ return 0;
+
+fail_write:
+ dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
+ return result;
+}
+
+int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr)
+{
+ u32 mcr, mcrx;
+ unsigned long flags;
+ int ret;
+
+ /* Access to the GFX unit is handled by GPU code */
+ if (port == BT_MBI_UNIT_GFX) {
+ WARN_ON(1);
+ return -EPERM;
+ }
+
+ mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
+ mcrx = offset & MBI_MASK_HI;
+
+ spin_lock_irqsave(&iosf_mbi_lock, flags);
+ ret = iosf_mbi_pci_read_mdr(mcrx, mcr, mdr);
+ spin_unlock_irqrestore(&iosf_mbi_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(iosf_mbi_read);
+
+int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr)
+{
+ u32 mcr, mcrx;
+ unsigned long flags;
+ int ret;
+
+ /* Access to the GFX unit is handled by GPU code */
+ if (port == BT_MBI_UNIT_GFX) {
+ WARN_ON(1);
+ return -EPERM;
+ }
+
+ mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
+ mcrx = offset & MBI_MASK_HI;
+
+ spin_lock_irqsave(&iosf_mbi_lock, flags);
+ ret = iosf_mbi_pci_write_mdr(mcrx, mcr, mdr);
+ spin_unlock_irqrestore(&iosf_mbi_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(iosf_mbi_write);
+
+int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask)
+{
+ u32 mcr, mcrx;
+ u32 value;
+ unsigned long flags;
+ int ret;
+
+ /* Access to the GFX unit is handled by GPU code */
+ if (port == BT_MBI_UNIT_GFX) {
+ WARN_ON(1);
+ return -EPERM;
+ }
+
+ mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
+ mcrx = offset & MBI_MASK_HI;
+
+ spin_lock_irqsave(&iosf_mbi_lock, flags);
+
+ /* Read current mdr value */
+ ret = iosf_mbi_pci_read_mdr(mcrx, mcr & MBI_RD_MASK, &value);
+ if (ret < 0) {
+ spin_unlock_irqrestore(&iosf_mbi_lock, flags);
+ return ret;
+ }
+
+ /* Apply mask */
+ value &= ~mask;
+ mdr &= mask;
+ value |= mdr;
+
+ /* Write back */
+ ret = iosf_mbi_pci_write_mdr(mcrx, mcr | MBI_WR_MASK, value);
+
+ spin_unlock_irqrestore(&iosf_mbi_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(iosf_mbi_modify);
+
+bool iosf_mbi_available(void)
+{
+ /* Mbi isn't hot-pluggable. No remove routine is provided */
+ return mbi_pdev;
+}
+EXPORT_SYMBOL(iosf_mbi_available);
+
+void iosf_mbi_punit_acquire(void)
+{
+ mutex_lock(&iosf_mbi_punit_mutex);
+}
+EXPORT_SYMBOL(iosf_mbi_punit_acquire);
+
+void iosf_mbi_punit_release(void)
+{
+ mutex_unlock(&iosf_mbi_punit_mutex);
+}
+EXPORT_SYMBOL(iosf_mbi_punit_release);
+
+int iosf_mbi_register_pmic_bus_access_notifier(struct notifier_block *nb)
+{
+ int ret;
+
+ /* Wait for the bus to go inactive before registering */
+ mutex_lock(&iosf_mbi_punit_mutex);
+ ret = blocking_notifier_chain_register(
+ &iosf_mbi_pmic_bus_access_notifier, nb);
+ mutex_unlock(&iosf_mbi_punit_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL(iosf_mbi_register_pmic_bus_access_notifier);
+
+int iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(
+ struct notifier_block *nb)
+{
+ iosf_mbi_assert_punit_acquired();
+
+ return blocking_notifier_chain_unregister(
+ &iosf_mbi_pmic_bus_access_notifier, nb);
+}
+EXPORT_SYMBOL(iosf_mbi_unregister_pmic_bus_access_notifier_unlocked);
+
+int iosf_mbi_unregister_pmic_bus_access_notifier(struct notifier_block *nb)
+{
+ int ret;
+
+ /* Wait for the bus to go inactive before unregistering */
+ mutex_lock(&iosf_mbi_punit_mutex);
+ ret = iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(nb);
+ mutex_unlock(&iosf_mbi_punit_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL(iosf_mbi_unregister_pmic_bus_access_notifier);
+
+int iosf_mbi_call_pmic_bus_access_notifier_chain(unsigned long val, void *v)
+{
+ return blocking_notifier_call_chain(
+ &iosf_mbi_pmic_bus_access_notifier, val, v);
+}
+EXPORT_SYMBOL(iosf_mbi_call_pmic_bus_access_notifier_chain);
+
+void iosf_mbi_assert_punit_acquired(void)
+{
+ WARN_ON(!mutex_is_locked(&iosf_mbi_punit_mutex));
+}
+EXPORT_SYMBOL(iosf_mbi_assert_punit_acquired);
+
+#ifdef CONFIG_IOSF_MBI_DEBUG
+static u32 dbg_mdr;
+static u32 dbg_mcr;
+static u32 dbg_mcrx;
+
+static int mcr_get(void *data, u64 *val)
+{
+ *val = *(u32 *)data;
+ return 0;
+}
+
+static int mcr_set(void *data, u64 val)
+{
+ u8 command = ((u32)val & 0xFF000000) >> 24,
+ port = ((u32)val & 0x00FF0000) >> 16,
+ offset = ((u32)val & 0x0000FF00) >> 8;
+ int err;
+
+ *(u32 *)data = val;
+
+ if (!capable(CAP_SYS_RAWIO))
+ return -EACCES;
+
+ if (command & 1u)
+ err = iosf_mbi_write(port,
+ command,
+ dbg_mcrx | offset,
+ dbg_mdr);
+ else
+ err = iosf_mbi_read(port,
+ command,
+ dbg_mcrx | offset,
+ &dbg_mdr);
+
+ return err;
+}
+DEFINE_SIMPLE_ATTRIBUTE(iosf_mcr_fops, mcr_get, mcr_set , "%llx\n");
+
+static struct dentry *iosf_dbg;
+
+static void iosf_sideband_debug_init(void)
+{
+ struct dentry *d;
+
+ iosf_dbg = debugfs_create_dir("iosf_sb", NULL);
+ if (IS_ERR_OR_NULL(iosf_dbg))
+ return;
+
+ /* mdr */
+ d = debugfs_create_x32("mdr", 0660, iosf_dbg, &dbg_mdr);
+ if (!d)
+ goto cleanup;
+
+ /* mcrx */
+ d = debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx);
+ if (!d)
+ goto cleanup;
+
+ /* mcr - initiates mailbox tranaction */
+ d = debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops);
+ if (!d)
+ goto cleanup;
+
+ return;
+
+cleanup:
+ debugfs_remove_recursive(d);
+}
+
+static void iosf_debugfs_init(void)
+{
+ iosf_sideband_debug_init();
+}
+
+static void iosf_debugfs_remove(void)
+{
+ debugfs_remove_recursive(iosf_dbg);
+}
+#else
+static inline void iosf_debugfs_init(void) { }
+static inline void iosf_debugfs_remove(void) { }
+#endif /* CONFIG_IOSF_MBI_DEBUG */
+
+static int iosf_mbi_probe(struct pci_dev *pdev,
+ const struct pci_device_id *unused)
+{
+ int ret;
+
+ ret = pci_enable_device(pdev);
+ if (ret < 0) {
+ dev_err(&pdev->dev, "error: could not enable device\n");
+ return ret;
+ }
+
+ mbi_pdev = pci_dev_get(pdev);
+ return 0;
+}
+
+static const struct pci_device_id iosf_mbi_pci_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) },
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BRASWELL) },
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) },
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_TANGIER) },
+ { 0, },
+};
+MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids);
+
+static struct pci_driver iosf_mbi_pci_driver = {
+ .name = "iosf_mbi_pci",
+ .probe = iosf_mbi_probe,
+ .id_table = iosf_mbi_pci_ids,
+};
+
+static int __init iosf_mbi_init(void)
+{
+ iosf_debugfs_init();
+
+ return pci_register_driver(&iosf_mbi_pci_driver);
+}
+
+static void __exit iosf_mbi_exit(void)
+{
+ iosf_debugfs_remove();
+
+ pci_unregister_driver(&iosf_mbi_pci_driver);
+ pci_dev_put(mbi_pdev);
+ mbi_pdev = NULL;
+}
+
+module_init(iosf_mbi_init);
+module_exit(iosf_mbi_exit);
+
+MODULE_AUTHOR("David E. Box <david.e.box@linux.intel.com>");
+MODULE_DESCRIPTION("IOSF Mailbox Interface accessor");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/platform/iris/Makefile b/arch/x86/platform/iris/Makefile
new file mode 100644
index 000000000..db921983a
--- /dev/null
+++ b/arch/x86/platform/iris/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_X86_32_IRIS) += iris.o
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
new file mode 100644
index 000000000..735ba21ef
--- /dev/null
+++ b/arch/x86/platform/iris/iris.c
@@ -0,0 +1,136 @@
+/*
+ * Eurobraille/Iris power off support.
+ *
+ * Eurobraille's Iris machine is a PC with no APM or ACPI support.
+ * It is shutdown by a special I/O sequence which this module provides.
+ *
+ * Copyright (C) Shérab <Sebastien.Hinderer@ens-lyon.org>
+ *
+ * This program is free software ; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation ; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY ; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the program ; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/moduleparam.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/delay.h>
+#include <linux/pm.h>
+#include <asm/io.h>
+
+#define IRIS_GIO_BASE 0x340
+#define IRIS_GIO_INPUT IRIS_GIO_BASE
+#define IRIS_GIO_OUTPUT (IRIS_GIO_BASE + 1)
+#define IRIS_GIO_PULSE 0x80 /* First byte to send */
+#define IRIS_GIO_REST 0x00 /* Second byte to send */
+#define IRIS_GIO_NODEV 0xff /* Likely not an Iris */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>");
+MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille");
+MODULE_SUPPORTED_DEVICE("Eurobraille/Iris");
+
+static bool force;
+
+module_param(force, bool, 0);
+MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation.");
+
+static void (*old_pm_power_off)(void);
+
+static void iris_power_off(void)
+{
+ outb(IRIS_GIO_PULSE, IRIS_GIO_OUTPUT);
+ msleep(850);
+ outb(IRIS_GIO_REST, IRIS_GIO_OUTPUT);
+}
+
+/*
+ * Before installing the power_off handler, try to make sure the OS is
+ * running on an Iris. Since Iris does not support DMI, this is done
+ * by reading its input port and seeing whether the read value is
+ * meaningful.
+ */
+static int iris_probe(struct platform_device *pdev)
+{
+ unsigned char status = inb(IRIS_GIO_INPUT);
+ if (status == IRIS_GIO_NODEV) {
+ printk(KERN_ERR "This machine does not seem to be an Iris. "
+ "Power off handler not installed.\n");
+ return -ENODEV;
+ }
+ old_pm_power_off = pm_power_off;
+ pm_power_off = &iris_power_off;
+ printk(KERN_INFO "Iris power_off handler installed.\n");
+ return 0;
+}
+
+static int iris_remove(struct platform_device *pdev)
+{
+ pm_power_off = old_pm_power_off;
+ printk(KERN_INFO "Iris power_off handler uninstalled.\n");
+ return 0;
+}
+
+static struct platform_driver iris_driver = {
+ .driver = {
+ .name = "iris",
+ },
+ .probe = iris_probe,
+ .remove = iris_remove,
+};
+
+static struct resource iris_resources[] = {
+ {
+ .start = IRIS_GIO_BASE,
+ .end = IRIS_GIO_OUTPUT,
+ .flags = IORESOURCE_IO,
+ .name = "address"
+ }
+};
+
+static struct platform_device *iris_device;
+
+static int iris_init(void)
+{
+ int ret;
+ if (force != 1) {
+ printk(KERN_ERR "The force parameter has not been set to 1."
+ " The Iris poweroff handler will not be installed.\n");
+ return -ENODEV;
+ }
+ ret = platform_driver_register(&iris_driver);
+ if (ret < 0) {
+ printk(KERN_ERR "Failed to register iris platform driver: %d\n",
+ ret);
+ return ret;
+ }
+ iris_device = platform_device_register_simple("iris", (-1),
+ iris_resources, ARRAY_SIZE(iris_resources));
+ if (IS_ERR(iris_device)) {
+ printk(KERN_ERR "Failed to register iris platform device\n");
+ platform_driver_unregister(&iris_driver);
+ return PTR_ERR(iris_device);
+ }
+ return 0;
+}
+
+static void iris_exit(void)
+{
+ platform_device_unregister(iris_device);
+ platform_driver_unregister(&iris_driver);
+}
+
+module_init(iris_init);
+module_exit(iris_exit);
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
new file mode 100644
index 000000000..049f92a93
--- /dev/null
+++ b/arch/x86/platform/olpc/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_OLPC) += olpc.o olpc_ofw.o olpc_dt.o
+obj-$(CONFIG_OLPC_XO1_PM) += olpc-xo1-pm.o xo1-wakeup.o
+obj-$(CONFIG_OLPC_XO1_RTC) += olpc-xo1-rtc.o
+obj-$(CONFIG_OLPC_XO1_SCI) += olpc-xo1-sci.o
+obj-$(CONFIG_OLPC_XO15_SCI) += olpc-xo15-sci.o
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c
new file mode 100644
index 000000000..0668aaff8
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -0,0 +1,200 @@
+/*
+ * Support for power management features of the OLPC XO-1 laptop
+ *
+ * Copyright (C) 2010 Andres Salomon <dilinger@queued.net>
+ * Copyright (C) 2010 One Laptop per Child
+ * Copyright (C) 2006 Red Hat, Inc.
+ * Copyright (C) 2006 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/cs5535.h>
+#include <linux/platform_device.h>
+#include <linux/export.h>
+#include <linux/pm.h>
+#include <linux/mfd/core.h>
+#include <linux/suspend.h>
+#include <linux/olpc-ec.h>
+
+#include <asm/io.h>
+#include <asm/olpc.h>
+
+#define DRV_NAME "olpc-xo1-pm"
+
+static unsigned long acpi_base;
+static unsigned long pms_base;
+
+static u16 wakeup_mask = CS5536_PM_PWRBTN;
+
+static struct {
+ unsigned long address;
+ unsigned short segment;
+} ofw_bios_entry = { 0xF0000 + PAGE_OFFSET, __KERNEL_CS };
+
+/* Set bits in the wakeup mask */
+void olpc_xo1_pm_wakeup_set(u16 value)
+{
+ wakeup_mask |= value;
+}
+EXPORT_SYMBOL_GPL(olpc_xo1_pm_wakeup_set);
+
+/* Clear bits in the wakeup mask */
+void olpc_xo1_pm_wakeup_clear(u16 value)
+{
+ wakeup_mask &= ~value;
+}
+EXPORT_SYMBOL_GPL(olpc_xo1_pm_wakeup_clear);
+
+static int xo1_power_state_enter(suspend_state_t pm_state)
+{
+ unsigned long saved_sci_mask;
+
+ /* Only STR is supported */
+ if (pm_state != PM_SUSPEND_MEM)
+ return -EINVAL;
+
+ /*
+ * Save SCI mask (this gets lost since PM1_EN is used as a mask for
+ * wakeup events, which is not necessarily the same event set)
+ */
+ saved_sci_mask = inl(acpi_base + CS5536_PM1_STS);
+ saved_sci_mask &= 0xffff0000;
+
+ /* Save CPU state */
+ do_olpc_suspend_lowlevel();
+
+ /* Resume path starts here */
+
+ /* Restore SCI mask (using dword access to CS5536_PM1_EN) */
+ outl(saved_sci_mask, acpi_base + CS5536_PM1_STS);
+
+ return 0;
+}
+
+asmlinkage __visible int xo1_do_sleep(u8 sleep_state)
+{
+ void *pgd_addr = __va(read_cr3_pa());
+
+ /* Program wakeup mask (using dword access to CS5536_PM1_EN) */
+ outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);
+
+ __asm__("movl %0,%%eax" : : "r" (pgd_addr));
+ __asm__("call *(%%edi); cld"
+ : : "D" (&ofw_bios_entry));
+ __asm__("movb $0x34, %al\n\t"
+ "outb %al, $0x70\n\t"
+ "movb $0x30, %al\n\t"
+ "outb %al, $0x71\n\t");
+ return 0;
+}
+
+static void xo1_power_off(void)
+{
+ printk(KERN_INFO "OLPC XO-1 power off sequence...\n");
+
+ /* Enable all of these controls with 0 delay */
+ outl(0x40000000, pms_base + CS5536_PM_SCLK);
+ outl(0x40000000, pms_base + CS5536_PM_IN_SLPCTL);
+ outl(0x40000000, pms_base + CS5536_PM_WKXD);
+ outl(0x40000000, pms_base + CS5536_PM_WKD);
+
+ /* Clear status bits (possibly unnecessary) */
+ outl(0x0002ffff, pms_base + CS5536_PM_SSC);
+ outl(0xffffffff, acpi_base + CS5536_PM_GPE0_STS);
+
+ /* Write SLP_EN bit to start the machinery */
+ outl(0x00002000, acpi_base + CS5536_PM1_CNT);
+}
+
+static int xo1_power_state_valid(suspend_state_t pm_state)
+{
+ /* suspend-to-RAM only */
+ return pm_state == PM_SUSPEND_MEM;
+}
+
+static const struct platform_suspend_ops xo1_suspend_ops = {
+ .valid = xo1_power_state_valid,
+ .enter = xo1_power_state_enter,
+};
+
+static int xo1_pm_probe(struct platform_device *pdev)
+{
+ struct resource *res;
+ int err;
+
+ /* don't run on non-XOs */
+ if (!machine_is_olpc())
+ return -ENODEV;
+
+ err = mfd_cell_enable(pdev);
+ if (err)
+ return err;
+
+ res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+ if (!res) {
+ dev_err(&pdev->dev, "can't fetch device resource info\n");
+ return -EIO;
+ }
+ if (strcmp(pdev->name, "cs5535-pms") == 0)
+ pms_base = res->start;
+ else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
+ acpi_base = res->start;
+
+ /* If we have both addresses, we can override the poweroff hook */
+ if (pms_base && acpi_base) {
+ suspend_set_ops(&xo1_suspend_ops);
+ pm_power_off = xo1_power_off;
+ printk(KERN_INFO "OLPC XO-1 support registered\n");
+ }
+
+ return 0;
+}
+
+static int xo1_pm_remove(struct platform_device *pdev)
+{
+ mfd_cell_disable(pdev);
+
+ if (strcmp(pdev->name, "cs5535-pms") == 0)
+ pms_base = 0;
+ else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
+ acpi_base = 0;
+
+ pm_power_off = NULL;
+ return 0;
+}
+
+static struct platform_driver cs5535_pms_driver = {
+ .driver = {
+ .name = "cs5535-pms",
+ },
+ .probe = xo1_pm_probe,
+ .remove = xo1_pm_remove,
+};
+
+static struct platform_driver cs5535_acpi_driver = {
+ .driver = {
+ .name = "olpc-xo1-pm-acpi",
+ },
+ .probe = xo1_pm_probe,
+ .remove = xo1_pm_remove,
+};
+
+static int __init xo1_pm_init(void)
+{
+ int r;
+
+ r = platform_driver_register(&cs5535_pms_driver);
+ if (r)
+ return r;
+
+ r = platform_driver_register(&cs5535_acpi_driver);
+ if (r)
+ platform_driver_unregister(&cs5535_pms_driver);
+
+ return r;
+}
+arch_initcall(xo1_pm_init);
diff --git a/arch/x86/platform/olpc/olpc-xo1-rtc.c b/arch/x86/platform/olpc/olpc-xo1-rtc.c
new file mode 100644
index 000000000..8e7ddd7e3
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc-xo1-rtc.c
@@ -0,0 +1,84 @@
+/*
+ * Support for OLPC XO-1 Real Time Clock (RTC)
+ *
+ * Copyright (C) 2011 One Laptop per Child
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/mc146818rtc.h>
+#include <linux/platform_device.h>
+#include <linux/rtc.h>
+#include <linux/of.h>
+
+#include <asm/msr.h>
+#include <asm/olpc.h>
+#include <asm/x86_init.h>
+
+static void rtc_wake_on(struct device *dev)
+{
+ olpc_xo1_pm_wakeup_set(CS5536_PM_RTC);
+}
+
+static void rtc_wake_off(struct device *dev)
+{
+ olpc_xo1_pm_wakeup_clear(CS5536_PM_RTC);
+}
+
+static struct resource rtc_platform_resource[] = {
+ [0] = {
+ .start = RTC_PORT(0),
+ .end = RTC_PORT(1),
+ .flags = IORESOURCE_IO,
+ },
+ [1] = {
+ .start = RTC_IRQ,
+ .end = RTC_IRQ,
+ .flags = IORESOURCE_IRQ,
+ }
+};
+
+static struct cmos_rtc_board_info rtc_info = {
+ .rtc_day_alarm = 0,
+ .rtc_mon_alarm = 0,
+ .rtc_century = 0,
+ .wake_on = rtc_wake_on,
+ .wake_off = rtc_wake_off,
+};
+
+static struct platform_device xo1_rtc_device = {
+ .name = "rtc_cmos",
+ .id = -1,
+ .num_resources = ARRAY_SIZE(rtc_platform_resource),
+ .dev.platform_data = &rtc_info,
+ .resource = rtc_platform_resource,
+};
+
+static int __init xo1_rtc_init(void)
+{
+ int r;
+ struct device_node *node;
+
+ node = of_find_compatible_node(NULL, NULL, "olpc,xo1-rtc");
+ if (!node)
+ return 0;
+ of_node_put(node);
+
+ pr_info("olpc-xo1-rtc: Initializing OLPC XO-1 RTC\n");
+ rdmsrl(MSR_RTC_DOMA_OFFSET, rtc_info.rtc_day_alarm);
+ rdmsrl(MSR_RTC_MONA_OFFSET, rtc_info.rtc_mon_alarm);
+ rdmsrl(MSR_RTC_CEN_OFFSET, rtc_info.rtc_century);
+
+ r = platform_device_register(&xo1_rtc_device);
+ if (r)
+ return r;
+
+ x86_platform.legacy.rtc = 0;
+
+ device_init_wakeup(&xo1_rtc_device.dev, 1);
+ return 0;
+}
+arch_initcall(xo1_rtc_init);
diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c
new file mode 100644
index 000000000..7fa8b3b53
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc-xo1-sci.c
@@ -0,0 +1,642 @@
+/*
+ * Support for OLPC XO-1 System Control Interrupts (SCI)
+ *
+ * Copyright (C) 2010 One Laptop per Child
+ * Copyright (C) 2006 Red Hat, Inc.
+ * Copyright (C) 2006 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/cs5535.h>
+#include <linux/device.h>
+#include <linux/gpio.h>
+#include <linux/input.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/pm_wakeup.h>
+#include <linux/mfd/core.h>
+#include <linux/power_supply.h>
+#include <linux/suspend.h>
+#include <linux/workqueue.h>
+#include <linux/olpc-ec.h>
+
+#include <asm/io.h>
+#include <asm/msr.h>
+#include <asm/olpc.h>
+
+#define DRV_NAME "olpc-xo1-sci"
+#define PFX DRV_NAME ": "
+
+static unsigned long acpi_base;
+static struct input_dev *power_button_idev;
+static struct input_dev *ebook_switch_idev;
+static struct input_dev *lid_switch_idev;
+
+static int sci_irq;
+
+static bool lid_open;
+static bool lid_inverted;
+static int lid_wake_mode;
+
+enum lid_wake_modes {
+ LID_WAKE_ALWAYS,
+ LID_WAKE_OPEN,
+ LID_WAKE_CLOSE,
+};
+
+static const char * const lid_wake_mode_names[] = {
+ [LID_WAKE_ALWAYS] = "always",
+ [LID_WAKE_OPEN] = "open",
+ [LID_WAKE_CLOSE] = "close",
+};
+
+static void battery_status_changed(void)
+{
+ struct power_supply *psy = power_supply_get_by_name("olpc-battery");
+
+ if (psy) {
+ power_supply_changed(psy);
+ power_supply_put(psy);
+ }
+}
+
+static void ac_status_changed(void)
+{
+ struct power_supply *psy = power_supply_get_by_name("olpc-ac");
+
+ if (psy) {
+ power_supply_changed(psy);
+ power_supply_put(psy);
+ }
+}
+
+/* Report current ebook switch state through input layer */
+static void send_ebook_state(void)
+{
+ unsigned char state;
+
+ if (olpc_ec_cmd(EC_READ_EB_MODE, NULL, 0, &state, 1)) {
+ pr_err(PFX "failed to get ebook state\n");
+ return;
+ }
+
+ if (!!test_bit(SW_TABLET_MODE, ebook_switch_idev->sw) == state)
+ return; /* Nothing new to report. */
+
+ input_report_switch(ebook_switch_idev, SW_TABLET_MODE, state);
+ input_sync(ebook_switch_idev);
+ pm_wakeup_event(&ebook_switch_idev->dev, 0);
+}
+
+static void flip_lid_inverter(void)
+{
+ /* gpio is high; invert so we'll get l->h event interrupt */
+ if (lid_inverted)
+ cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_INPUT_INVERT);
+ else
+ cs5535_gpio_set(OLPC_GPIO_LID, GPIO_INPUT_INVERT);
+ lid_inverted = !lid_inverted;
+}
+
+static void detect_lid_state(void)
+{
+ /*
+ * the edge detector hookup on the gpio inputs on the geode is
+ * odd, to say the least. See http://dev.laptop.org/ticket/5703
+ * for details, but in a nutshell: we don't use the edge
+ * detectors. instead, we make use of an anomoly: with the both
+ * edge detectors turned off, we still get an edge event on a
+ * positive edge transition. to take advantage of this, we use the
+ * front-end inverter to ensure that that's the edge we're always
+ * going to see next.
+ */
+
+ int state;
+
+ state = cs5535_gpio_isset(OLPC_GPIO_LID, GPIO_READ_BACK);
+ lid_open = !state ^ !lid_inverted; /* x ^^ y */
+ if (!state)
+ return;
+
+ flip_lid_inverter();
+}
+
+/* Report current lid switch state through input layer */
+static void send_lid_state(void)
+{
+ if (!!test_bit(SW_LID, lid_switch_idev->sw) == !lid_open)
+ return; /* Nothing new to report. */
+
+ input_report_switch(lid_switch_idev, SW_LID, !lid_open);
+ input_sync(lid_switch_idev);
+ pm_wakeup_event(&lid_switch_idev->dev, 0);
+}
+
+static ssize_t lid_wake_mode_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ const char *mode = lid_wake_mode_names[lid_wake_mode];
+ return sprintf(buf, "%s\n", mode);
+}
+static ssize_t lid_wake_mode_set(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(lid_wake_mode_names); i++) {
+ const char *mode = lid_wake_mode_names[i];
+ if (strlen(mode) != count || strncasecmp(mode, buf, count))
+ continue;
+
+ lid_wake_mode = i;
+ return count;
+ }
+ return -EINVAL;
+}
+static DEVICE_ATTR(lid_wake_mode, S_IWUSR | S_IRUGO, lid_wake_mode_show,
+ lid_wake_mode_set);
+
+/*
+ * Process all items in the EC's SCI queue.
+ *
+ * This is handled in a workqueue because olpc_ec_cmd can be slow (and
+ * can even timeout).
+ *
+ * If propagate_events is false, the queue is drained without events being
+ * generated for the interrupts.
+ */
+static void process_sci_queue(bool propagate_events)
+{
+ int r;
+ u16 data;
+
+ do {
+ r = olpc_ec_sci_query(&data);
+ if (r || !data)
+ break;
+
+ pr_debug(PFX "SCI 0x%x received\n", data);
+
+ switch (data) {
+ case EC_SCI_SRC_BATERR:
+ case EC_SCI_SRC_BATSOC:
+ case EC_SCI_SRC_BATTERY:
+ case EC_SCI_SRC_BATCRIT:
+ battery_status_changed();
+ break;
+ case EC_SCI_SRC_ACPWR:
+ ac_status_changed();
+ break;
+ }
+
+ if (data == EC_SCI_SRC_EBOOK && propagate_events)
+ send_ebook_state();
+ } while (data);
+
+ if (r)
+ pr_err(PFX "Failed to clear SCI queue");
+}
+
+static void process_sci_queue_work(struct work_struct *work)
+{
+ process_sci_queue(true);
+}
+
+static DECLARE_WORK(sci_work, process_sci_queue_work);
+
+static irqreturn_t xo1_sci_intr(int irq, void *dev_id)
+{
+ struct platform_device *pdev = dev_id;
+ u32 sts;
+ u32 gpe;
+
+ sts = inl(acpi_base + CS5536_PM1_STS);
+ outl(sts | 0xffff, acpi_base + CS5536_PM1_STS);
+
+ gpe = inl(acpi_base + CS5536_PM_GPE0_STS);
+ outl(0xffffffff, acpi_base + CS5536_PM_GPE0_STS);
+
+ dev_dbg(&pdev->dev, "sts %x gpe %x\n", sts, gpe);
+
+ if (sts & CS5536_PWRBTN_FLAG) {
+ if (!(sts & CS5536_WAK_FLAG)) {
+ /* Only report power button input when it was pressed
+ * during regular operation (as opposed to when it
+ * was used to wake the system). */
+ input_report_key(power_button_idev, KEY_POWER, 1);
+ input_sync(power_button_idev);
+ input_report_key(power_button_idev, KEY_POWER, 0);
+ input_sync(power_button_idev);
+ }
+ /* Report the wakeup event in all cases. */
+ pm_wakeup_event(&power_button_idev->dev, 0);
+ }
+
+ if ((sts & (CS5536_RTC_FLAG | CS5536_WAK_FLAG)) ==
+ (CS5536_RTC_FLAG | CS5536_WAK_FLAG)) {
+ /* When the system is woken by the RTC alarm, report the
+ * event on the rtc device. */
+ struct device *rtc = bus_find_device_by_name(
+ &platform_bus_type, NULL, "rtc_cmos");
+ if (rtc) {
+ pm_wakeup_event(rtc, 0);
+ put_device(rtc);
+ }
+ }
+
+ if (gpe & CS5536_GPIOM7_PME_FLAG) { /* EC GPIO */
+ cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_NEGATIVE_EDGE_STS);
+ schedule_work(&sci_work);
+ }
+
+ cs5535_gpio_set(OLPC_GPIO_LID, GPIO_NEGATIVE_EDGE_STS);
+ cs5535_gpio_set(OLPC_GPIO_LID, GPIO_POSITIVE_EDGE_STS);
+ detect_lid_state();
+ send_lid_state();
+
+ return IRQ_HANDLED;
+}
+
+static int xo1_sci_suspend(struct platform_device *pdev, pm_message_t state)
+{
+ if (device_may_wakeup(&power_button_idev->dev))
+ olpc_xo1_pm_wakeup_set(CS5536_PM_PWRBTN);
+ else
+ olpc_xo1_pm_wakeup_clear(CS5536_PM_PWRBTN);
+
+ if (device_may_wakeup(&ebook_switch_idev->dev))
+ olpc_ec_wakeup_set(EC_SCI_SRC_EBOOK);
+ else
+ olpc_ec_wakeup_clear(EC_SCI_SRC_EBOOK);
+
+ if (!device_may_wakeup(&lid_switch_idev->dev)) {
+ cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE);
+ } else if ((lid_open && lid_wake_mode == LID_WAKE_OPEN) ||
+ (!lid_open && lid_wake_mode == LID_WAKE_CLOSE)) {
+ flip_lid_inverter();
+
+ /* we may have just caused an event */
+ cs5535_gpio_set(OLPC_GPIO_LID, GPIO_NEGATIVE_EDGE_STS);
+ cs5535_gpio_set(OLPC_GPIO_LID, GPIO_POSITIVE_EDGE_STS);
+
+ cs5535_gpio_set(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE);
+ }
+
+ return 0;
+}
+
+static int xo1_sci_resume(struct platform_device *pdev)
+{
+ /*
+ * We don't know what may have happened while we were asleep.
+ * Reestablish our lid setup so we're sure to catch all transitions.
+ */
+ detect_lid_state();
+ send_lid_state();
+ cs5535_gpio_set(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE);
+
+ /* Enable all EC events */
+ olpc_ec_mask_write(EC_SCI_SRC_ALL);
+
+ /* Power/battery status might have changed too */
+ battery_status_changed();
+ ac_status_changed();
+ return 0;
+}
+
+static int setup_sci_interrupt(struct platform_device *pdev)
+{
+ u32 lo, hi;
+ u32 sts;
+ int r;
+
+ rdmsr(0x51400020, lo, hi);
+ sci_irq = (lo >> 20) & 15;
+
+ if (sci_irq) {
+ dev_info(&pdev->dev, "SCI is mapped to IRQ %d\n", sci_irq);
+ } else {
+ /* Zero means masked */
+ dev_info(&pdev->dev, "SCI unmapped. Mapping to IRQ 3\n");
+ sci_irq = 3;
+ lo |= 0x00300000;
+ wrmsrl(0x51400020, lo);
+ }
+
+ /* Select level triggered in PIC */
+ if (sci_irq < 8) {
+ lo = inb(CS5536_PIC_INT_SEL1);
+ lo |= 1 << sci_irq;
+ outb(lo, CS5536_PIC_INT_SEL1);
+ } else {
+ lo = inb(CS5536_PIC_INT_SEL2);
+ lo |= 1 << (sci_irq - 8);
+ outb(lo, CS5536_PIC_INT_SEL2);
+ }
+
+ /* Enable interesting SCI events, and clear pending interrupts */
+ sts = inl(acpi_base + CS5536_PM1_STS);
+ outl(((CS5536_PM_PWRBTN | CS5536_PM_RTC) << 16) | 0xffff,
+ acpi_base + CS5536_PM1_STS);
+
+ r = request_irq(sci_irq, xo1_sci_intr, 0, DRV_NAME, pdev);
+ if (r)
+ dev_err(&pdev->dev, "can't request interrupt\n");
+
+ return r;
+}
+
+static int setup_ec_sci(void)
+{
+ int r;
+
+ r = gpio_request(OLPC_GPIO_ECSCI, "OLPC-ECSCI");
+ if (r)
+ return r;
+
+ gpio_direction_input(OLPC_GPIO_ECSCI);
+
+ /* Clear pending EC SCI events */
+ cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_NEGATIVE_EDGE_STS);
+ cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_POSITIVE_EDGE_STS);
+
+ /*
+ * Enable EC SCI events, and map them to both a PME and the SCI
+ * interrupt.
+ *
+ * Ordinarily, in addition to functioning as GPIOs, Geode GPIOs can
+ * be mapped to regular interrupts *or* Geode-specific Power
+ * Management Events (PMEs) - events that bring the system out of
+ * suspend. In this case, we want both of those things - the system
+ * wakeup, *and* the ability to get an interrupt when an event occurs.
+ *
+ * To achieve this, we map the GPIO to a PME, and then we use one
+ * of the many generic knobs on the CS5535 PIC to additionally map the
+ * PME to the regular SCI interrupt line.
+ */
+ cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_EVENTS_ENABLE);
+
+ /* Set the SCI to cause a PME event on group 7 */
+ cs5535_gpio_setup_event(OLPC_GPIO_ECSCI, 7, 1);
+
+ /* And have group 7 also fire the SCI interrupt */
+ cs5535_pic_unreqz_select_high(7, sci_irq);
+
+ return 0;
+}
+
+static void free_ec_sci(void)
+{
+ gpio_free(OLPC_GPIO_ECSCI);
+}
+
+static int setup_lid_events(void)
+{
+ int r;
+
+ r = gpio_request(OLPC_GPIO_LID, "OLPC-LID");
+ if (r)
+ return r;
+
+ gpio_direction_input(OLPC_GPIO_LID);
+
+ cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_INPUT_INVERT);
+ lid_inverted = 0;
+
+ /* Clear edge detection and event enable for now */
+ cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE);
+ cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_NEGATIVE_EDGE_EN);
+ cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_POSITIVE_EDGE_EN);
+ cs5535_gpio_set(OLPC_GPIO_LID, GPIO_NEGATIVE_EDGE_STS);
+ cs5535_gpio_set(OLPC_GPIO_LID, GPIO_POSITIVE_EDGE_STS);
+
+ /* Set the LID to cause an PME event on group 6 */
+ cs5535_gpio_setup_event(OLPC_GPIO_LID, 6, 1);
+
+ /* Set PME group 6 to fire the SCI interrupt */
+ cs5535_gpio_set_irq(6, sci_irq);
+
+ /* Enable the event */
+ cs5535_gpio_set(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE);
+
+ return 0;
+}
+
+static void free_lid_events(void)
+{
+ gpio_free(OLPC_GPIO_LID);
+}
+
+static int setup_power_button(struct platform_device *pdev)
+{
+ int r;
+
+ power_button_idev = input_allocate_device();
+ if (!power_button_idev)
+ return -ENOMEM;
+
+ power_button_idev->name = "Power Button";
+ power_button_idev->phys = DRV_NAME "/input0";
+ set_bit(EV_KEY, power_button_idev->evbit);
+ set_bit(KEY_POWER, power_button_idev->keybit);
+
+ power_button_idev->dev.parent = &pdev->dev;
+ device_init_wakeup(&power_button_idev->dev, 1);
+
+ r = input_register_device(power_button_idev);
+ if (r) {
+ dev_err(&pdev->dev, "failed to register power button: %d\n", r);
+ input_free_device(power_button_idev);
+ }
+
+ return r;
+}
+
+static void free_power_button(void)
+{
+ input_unregister_device(power_button_idev);
+}
+
+static int setup_ebook_switch(struct platform_device *pdev)
+{
+ int r;
+
+ ebook_switch_idev = input_allocate_device();
+ if (!ebook_switch_idev)
+ return -ENOMEM;
+
+ ebook_switch_idev->name = "EBook Switch";
+ ebook_switch_idev->phys = DRV_NAME "/input1";
+ set_bit(EV_SW, ebook_switch_idev->evbit);
+ set_bit(SW_TABLET_MODE, ebook_switch_idev->swbit);
+
+ ebook_switch_idev->dev.parent = &pdev->dev;
+ device_set_wakeup_capable(&ebook_switch_idev->dev, true);
+
+ r = input_register_device(ebook_switch_idev);
+ if (r) {
+ dev_err(&pdev->dev, "failed to register ebook switch: %d\n", r);
+ input_free_device(ebook_switch_idev);
+ }
+
+ return r;
+}
+
+static void free_ebook_switch(void)
+{
+ input_unregister_device(ebook_switch_idev);
+}
+
+static int setup_lid_switch(struct platform_device *pdev)
+{
+ int r;
+
+ lid_switch_idev = input_allocate_device();
+ if (!lid_switch_idev)
+ return -ENOMEM;
+
+ lid_switch_idev->name = "Lid Switch";
+ lid_switch_idev->phys = DRV_NAME "/input2";
+ set_bit(EV_SW, lid_switch_idev->evbit);
+ set_bit(SW_LID, lid_switch_idev->swbit);
+
+ lid_switch_idev->dev.parent = &pdev->dev;
+ device_set_wakeup_capable(&lid_switch_idev->dev, true);
+
+ r = input_register_device(lid_switch_idev);
+ if (r) {
+ dev_err(&pdev->dev, "failed to register lid switch: %d\n", r);
+ goto err_register;
+ }
+
+ r = device_create_file(&lid_switch_idev->dev, &dev_attr_lid_wake_mode);
+ if (r) {
+ dev_err(&pdev->dev, "failed to create wake mode attr: %d\n", r);
+ goto err_create_attr;
+ }
+
+ return 0;
+
+err_create_attr:
+ input_unregister_device(lid_switch_idev);
+ lid_switch_idev = NULL;
+err_register:
+ input_free_device(lid_switch_idev);
+ return r;
+}
+
+static void free_lid_switch(void)
+{
+ device_remove_file(&lid_switch_idev->dev, &dev_attr_lid_wake_mode);
+ input_unregister_device(lid_switch_idev);
+}
+
+static int xo1_sci_probe(struct platform_device *pdev)
+{
+ struct resource *res;
+ int r;
+
+ /* don't run on non-XOs */
+ if (!machine_is_olpc())
+ return -ENODEV;
+
+ r = mfd_cell_enable(pdev);
+ if (r)
+ return r;
+
+ res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+ if (!res) {
+ dev_err(&pdev->dev, "can't fetch device resource info\n");
+ return -EIO;
+ }
+ acpi_base = res->start;
+
+ r = setup_power_button(pdev);
+ if (r)
+ return r;
+
+ r = setup_ebook_switch(pdev);
+ if (r)
+ goto err_ebook;
+
+ r = setup_lid_switch(pdev);
+ if (r)
+ goto err_lid;
+
+ r = setup_lid_events();
+ if (r)
+ goto err_lidevt;
+
+ r = setup_ec_sci();
+ if (r)
+ goto err_ecsci;
+
+ /* Enable PME generation for EC-generated events */
+ outl(CS5536_GPIOM6_PME_EN | CS5536_GPIOM7_PME_EN,
+ acpi_base + CS5536_PM_GPE0_EN);
+
+ /* Clear pending events */
+ outl(0xffffffff, acpi_base + CS5536_PM_GPE0_STS);
+ process_sci_queue(false);
+
+ /* Initial sync */
+ send_ebook_state();
+ detect_lid_state();
+ send_lid_state();
+
+ r = setup_sci_interrupt(pdev);
+ if (r)
+ goto err_sci;
+
+ /* Enable all EC events */
+ olpc_ec_mask_write(EC_SCI_SRC_ALL);
+
+ return r;
+
+err_sci:
+ free_ec_sci();
+err_ecsci:
+ free_lid_events();
+err_lidevt:
+ free_lid_switch();
+err_lid:
+ free_ebook_switch();
+err_ebook:
+ free_power_button();
+ return r;
+}
+
+static int xo1_sci_remove(struct platform_device *pdev)
+{
+ mfd_cell_disable(pdev);
+ free_irq(sci_irq, pdev);
+ cancel_work_sync(&sci_work);
+ free_ec_sci();
+ free_lid_events();
+ free_lid_switch();
+ free_ebook_switch();
+ free_power_button();
+ acpi_base = 0;
+ return 0;
+}
+
+static struct platform_driver xo1_sci_driver = {
+ .driver = {
+ .name = "olpc-xo1-sci-acpi",
+ },
+ .probe = xo1_sci_probe,
+ .remove = xo1_sci_remove,
+ .suspend = xo1_sci_suspend,
+ .resume = xo1_sci_resume,
+};
+
+static int __init xo1_sci_init(void)
+{
+ return platform_driver_register(&xo1_sci_driver);
+}
+arch_initcall(xo1_sci_init);
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
new file mode 100644
index 000000000..c0533fbc3
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -0,0 +1,235 @@
+/*
+ * Support for OLPC XO-1.5 System Control Interrupts (SCI)
+ *
+ * Copyright (C) 2009-2010 One Laptop per Child
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/power_supply.h>
+#include <linux/olpc-ec.h>
+
+#include <linux/acpi.h>
+#include <asm/olpc.h>
+
+#define DRV_NAME "olpc-xo15-sci"
+#define PFX DRV_NAME ": "
+#define XO15_SCI_CLASS DRV_NAME
+#define XO15_SCI_DEVICE_NAME "OLPC XO-1.5 SCI"
+
+static unsigned long xo15_sci_gpe;
+static bool lid_wake_on_close;
+
+/*
+ * The normal ACPI LID wakeup behavior is wake-on-open, but not
+ * wake-on-close. This is implemented as standard by the XO-1.5 DSDT.
+ *
+ * We provide here a sysfs attribute that will additionally enable
+ * wake-on-close behavior. This is useful (e.g.) when we oportunistically
+ * suspend with the display running; if the lid is then closed, we want to
+ * wake up to turn the display off.
+ *
+ * This is controlled through a custom method in the XO-1.5 DSDT.
+ */
+static int set_lid_wake_behavior(bool wake_on_close)
+{
+ acpi_status status;
+
+ status = acpi_execute_simple_method(NULL, "\\_SB.PCI0.LID.LIDW", wake_on_close);
+ if (ACPI_FAILURE(status)) {
+ pr_warning(PFX "failed to set lid behavior\n");
+ return 1;
+ }
+
+ lid_wake_on_close = wake_on_close;
+
+ return 0;
+}
+
+static ssize_t
+lid_wake_on_close_show(struct kobject *s, struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", lid_wake_on_close);
+}
+
+static ssize_t lid_wake_on_close_store(struct kobject *s,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned int val;
+
+ if (sscanf(buf, "%u", &val) != 1)
+ return -EINVAL;
+
+ set_lid_wake_behavior(!!val);
+
+ return n;
+}
+
+static struct kobj_attribute lid_wake_on_close_attr =
+ __ATTR(lid_wake_on_close, 0644,
+ lid_wake_on_close_show,
+ lid_wake_on_close_store);
+
+static void battery_status_changed(void)
+{
+ struct power_supply *psy = power_supply_get_by_name("olpc-battery");
+
+ if (psy) {
+ power_supply_changed(psy);
+ power_supply_put(psy);
+ }
+}
+
+static void ac_status_changed(void)
+{
+ struct power_supply *psy = power_supply_get_by_name("olpc-ac");
+
+ if (psy) {
+ power_supply_changed(psy);
+ power_supply_put(psy);
+ }
+}
+
+static void process_sci_queue(void)
+{
+ u16 data;
+ int r;
+
+ do {
+ r = olpc_ec_sci_query(&data);
+ if (r || !data)
+ break;
+
+ pr_debug(PFX "SCI 0x%x received\n", data);
+
+ switch (data) {
+ case EC_SCI_SRC_BATERR:
+ case EC_SCI_SRC_BATSOC:
+ case EC_SCI_SRC_BATTERY:
+ case EC_SCI_SRC_BATCRIT:
+ battery_status_changed();
+ break;
+ case EC_SCI_SRC_ACPWR:
+ ac_status_changed();
+ break;
+ }
+ } while (data);
+
+ if (r)
+ pr_err(PFX "Failed to clear SCI queue");
+}
+
+static void process_sci_queue_work(struct work_struct *work)
+{
+ process_sci_queue();
+}
+
+static DECLARE_WORK(sci_work, process_sci_queue_work);
+
+static u32 xo15_sci_gpe_handler(acpi_handle gpe_device, u32 gpe, void *context)
+{
+ schedule_work(&sci_work);
+ return ACPI_INTERRUPT_HANDLED | ACPI_REENABLE_GPE;
+}
+
+static int xo15_sci_add(struct acpi_device *device)
+{
+ unsigned long long tmp;
+ acpi_status status;
+ int r;
+
+ if (!device)
+ return -EINVAL;
+
+ strcpy(acpi_device_name(device), XO15_SCI_DEVICE_NAME);
+ strcpy(acpi_device_class(device), XO15_SCI_CLASS);
+
+ /* Get GPE bit assignment (EC events). */
+ status = acpi_evaluate_integer(device->handle, "_GPE", NULL, &tmp);
+ if (ACPI_FAILURE(status))
+ return -EINVAL;
+
+ xo15_sci_gpe = tmp;
+ status = acpi_install_gpe_handler(NULL, xo15_sci_gpe,
+ ACPI_GPE_EDGE_TRIGGERED,
+ xo15_sci_gpe_handler, device);
+ if (ACPI_FAILURE(status))
+ return -ENODEV;
+
+ dev_info(&device->dev, "Initialized, GPE = 0x%lx\n", xo15_sci_gpe);
+
+ r = sysfs_create_file(&device->dev.kobj, &lid_wake_on_close_attr.attr);
+ if (r)
+ goto err_sysfs;
+
+ /* Flush queue, and enable all SCI events */
+ process_sci_queue();
+ olpc_ec_mask_write(EC_SCI_SRC_ALL);
+
+ acpi_enable_gpe(NULL, xo15_sci_gpe);
+
+ /* Enable wake-on-EC */
+ if (device->wakeup.flags.valid)
+ device_init_wakeup(&device->dev, true);
+
+ return 0;
+
+err_sysfs:
+ acpi_remove_gpe_handler(NULL, xo15_sci_gpe, xo15_sci_gpe_handler);
+ cancel_work_sync(&sci_work);
+ return r;
+}
+
+static int xo15_sci_remove(struct acpi_device *device)
+{
+ acpi_disable_gpe(NULL, xo15_sci_gpe);
+ acpi_remove_gpe_handler(NULL, xo15_sci_gpe, xo15_sci_gpe_handler);
+ cancel_work_sync(&sci_work);
+ sysfs_remove_file(&device->dev.kobj, &lid_wake_on_close_attr.attr);
+ return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int xo15_sci_resume(struct device *dev)
+{
+ /* Enable all EC events */
+ olpc_ec_mask_write(EC_SCI_SRC_ALL);
+
+ /* Power/battery status might have changed */
+ battery_status_changed();
+ ac_status_changed();
+
+ return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(xo15_sci_pm, NULL, xo15_sci_resume);
+
+static const struct acpi_device_id xo15_sci_device_ids[] = {
+ {"XO15EC", 0},
+ {"", 0},
+};
+
+static struct acpi_driver xo15_sci_drv = {
+ .name = DRV_NAME,
+ .class = XO15_SCI_CLASS,
+ .ids = xo15_sci_device_ids,
+ .ops = {
+ .add = xo15_sci_add,
+ .remove = xo15_sci_remove,
+ },
+ .drv.pm = &xo15_sci_pm,
+};
+
+static int __init xo15_sci_init(void)
+{
+ return acpi_bus_register_driver(&xo15_sci_drv);
+}
+device_initcall(xo15_sci_init);
diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c
new file mode 100644
index 000000000..f0e920fb9
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc.c
@@ -0,0 +1,408 @@
+/*
+ * Support for the OLPC DCON and OLPC EC access
+ *
+ * Copyright © 2006 Advanced Micro Devices, Inc.
+ * Copyright © 2007-2008 Andres Salomon <dilinger@debian.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/string.h>
+#include <linux/platform_device.h>
+#include <linux/of.h>
+#include <linux/syscore_ops.h>
+#include <linux/mutex.h>
+#include <linux/olpc-ec.h>
+
+#include <asm/geode.h>
+#include <asm/setup.h>
+#include <asm/olpc.h>
+#include <asm/olpc_ofw.h>
+
+struct olpc_platform_t olpc_platform_info;
+EXPORT_SYMBOL_GPL(olpc_platform_info);
+
+/* EC event mask to be applied during suspend (defining wakeup sources). */
+static u16 ec_wakeup_mask;
+
+/* what the timeout *should* be (in ms) */
+#define EC_BASE_TIMEOUT 20
+
+/* the timeout that bugs in the EC might force us to actually use */
+static int ec_timeout = EC_BASE_TIMEOUT;
+
+static int __init olpc_ec_timeout_set(char *str)
+{
+ if (get_option(&str, &ec_timeout) != 1) {
+ ec_timeout = EC_BASE_TIMEOUT;
+ printk(KERN_ERR "olpc-ec: invalid argument to "
+ "'olpc_ec_timeout=', ignoring!\n");
+ }
+ printk(KERN_DEBUG "olpc-ec: using %d ms delay for EC commands.\n",
+ ec_timeout);
+ return 1;
+}
+__setup("olpc_ec_timeout=", olpc_ec_timeout_set);
+
+/*
+ * These {i,o}bf_status functions return whether the buffers are full or not.
+ */
+
+static inline unsigned int ibf_status(unsigned int port)
+{
+ return !!(inb(port) & 0x02);
+}
+
+static inline unsigned int obf_status(unsigned int port)
+{
+ return inb(port) & 0x01;
+}
+
+#define wait_on_ibf(p, d) __wait_on_ibf(__LINE__, (p), (d))
+static int __wait_on_ibf(unsigned int line, unsigned int port, int desired)
+{
+ unsigned int timeo;
+ int state = ibf_status(port);
+
+ for (timeo = ec_timeout; state != desired && timeo; timeo--) {
+ mdelay(1);
+ state = ibf_status(port);
+ }
+
+ if ((state == desired) && (ec_timeout > EC_BASE_TIMEOUT) &&
+ timeo < (ec_timeout - EC_BASE_TIMEOUT)) {
+ printk(KERN_WARNING "olpc-ec: %d: waited %u ms for IBF!\n",
+ line, ec_timeout - timeo);
+ }
+
+ return !(state == desired);
+}
+
+#define wait_on_obf(p, d) __wait_on_obf(__LINE__, (p), (d))
+static int __wait_on_obf(unsigned int line, unsigned int port, int desired)
+{
+ unsigned int timeo;
+ int state = obf_status(port);
+
+ for (timeo = ec_timeout; state != desired && timeo; timeo--) {
+ mdelay(1);
+ state = obf_status(port);
+ }
+
+ if ((state == desired) && (ec_timeout > EC_BASE_TIMEOUT) &&
+ timeo < (ec_timeout - EC_BASE_TIMEOUT)) {
+ printk(KERN_WARNING "olpc-ec: %d: waited %u ms for OBF!\n",
+ line, ec_timeout - timeo);
+ }
+
+ return !(state == desired);
+}
+
+/*
+ * This allows the kernel to run Embedded Controller commands. The EC is
+ * documented at <http://wiki.laptop.org/go/Embedded_controller>, and the
+ * available EC commands are here:
+ * <http://wiki.laptop.org/go/Ec_specification>. Unfortunately, while
+ * OpenFirmware's source is available, the EC's is not.
+ */
+static int olpc_xo1_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *outbuf,
+ size_t outlen, void *arg)
+{
+ int ret = -EIO;
+ int i;
+ int restarts = 0;
+
+ /* Clear OBF */
+ for (i = 0; i < 10 && (obf_status(0x6c) == 1); i++)
+ inb(0x68);
+ if (i == 10) {
+ printk(KERN_ERR "olpc-ec: timeout while attempting to "
+ "clear OBF flag!\n");
+ goto err;
+ }
+
+ if (wait_on_ibf(0x6c, 0)) {
+ printk(KERN_ERR "olpc-ec: timeout waiting for EC to "
+ "quiesce!\n");
+ goto err;
+ }
+
+restart:
+ /*
+ * Note that if we time out during any IBF checks, that's a failure;
+ * we have to return. There's no way for the kernel to clear that.
+ *
+ * If we time out during an OBF check, we can restart the command;
+ * reissuing it will clear the OBF flag, and we should be alright.
+ * The OBF flag will sometimes misbehave due to what we believe
+ * is a hardware quirk..
+ */
+ pr_devel("olpc-ec: running cmd 0x%x\n", cmd);
+ outb(cmd, 0x6c);
+
+ if (wait_on_ibf(0x6c, 0)) {
+ printk(KERN_ERR "olpc-ec: timeout waiting for EC to read "
+ "command!\n");
+ goto err;
+ }
+
+ if (inbuf && inlen) {
+ /* write data to EC */
+ for (i = 0; i < inlen; i++) {
+ pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]);
+ outb(inbuf[i], 0x68);
+ if (wait_on_ibf(0x6c, 0)) {
+ printk(KERN_ERR "olpc-ec: timeout waiting for"
+ " EC accept data!\n");
+ goto err;
+ }
+ }
+ }
+ if (outbuf && outlen) {
+ /* read data from EC */
+ for (i = 0; i < outlen; i++) {
+ if (wait_on_obf(0x6c, 1)) {
+ printk(KERN_ERR "olpc-ec: timeout waiting for"
+ " EC to provide data!\n");
+ if (restarts++ < 10)
+ goto restart;
+ goto err;
+ }
+ outbuf[i] = inb(0x68);
+ pr_devel("olpc-ec: received 0x%x\n", outbuf[i]);
+ }
+ }
+
+ ret = 0;
+err:
+ return ret;
+}
+
+void olpc_ec_wakeup_set(u16 value)
+{
+ ec_wakeup_mask |= value;
+}
+EXPORT_SYMBOL_GPL(olpc_ec_wakeup_set);
+
+void olpc_ec_wakeup_clear(u16 value)
+{
+ ec_wakeup_mask &= ~value;
+}
+EXPORT_SYMBOL_GPL(olpc_ec_wakeup_clear);
+
+/*
+ * Returns true if the compile and runtime configurations allow for EC events
+ * to wake the system.
+ */
+bool olpc_ec_wakeup_available(void)
+{
+ if (!machine_is_olpc())
+ return false;
+
+ /*
+ * XO-1 EC wakeups are available when olpc-xo1-sci driver is
+ * compiled in
+ */
+#ifdef CONFIG_OLPC_XO1_SCI
+ if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) /* XO-1 */
+ return true;
+#endif
+
+ /*
+ * XO-1.5 EC wakeups are available when olpc-xo15-sci driver is
+ * compiled in
+ */
+#ifdef CONFIG_OLPC_XO15_SCI
+ if (olpc_platform_info.boardrev >= olpc_board_pre(0xd0)) /* XO-1.5 */
+ return true;
+#endif
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(olpc_ec_wakeup_available);
+
+int olpc_ec_mask_write(u16 bits)
+{
+ if (olpc_platform_info.flags & OLPC_F_EC_WIDE_SCI) {
+ __be16 ec_word = cpu_to_be16(bits);
+ return olpc_ec_cmd(EC_WRITE_EXT_SCI_MASK, (void *) &ec_word, 2,
+ NULL, 0);
+ } else {
+ unsigned char ec_byte = bits & 0xff;
+ return olpc_ec_cmd(EC_WRITE_SCI_MASK, &ec_byte, 1, NULL, 0);
+ }
+}
+EXPORT_SYMBOL_GPL(olpc_ec_mask_write);
+
+int olpc_ec_sci_query(u16 *sci_value)
+{
+ int ret;
+
+ if (olpc_platform_info.flags & OLPC_F_EC_WIDE_SCI) {
+ __be16 ec_word;
+ ret = olpc_ec_cmd(EC_EXT_SCI_QUERY,
+ NULL, 0, (void *) &ec_word, 2);
+ if (ret == 0)
+ *sci_value = be16_to_cpu(ec_word);
+ } else {
+ unsigned char ec_byte;
+ ret = olpc_ec_cmd(EC_SCI_QUERY, NULL, 0, &ec_byte, 1);
+ if (ret == 0)
+ *sci_value = ec_byte;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(olpc_ec_sci_query);
+
+static bool __init check_ofw_architecture(struct device_node *root)
+{
+ const char *olpc_arch;
+ int propsize;
+
+ olpc_arch = of_get_property(root, "architecture", &propsize);
+ return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0;
+}
+
+static u32 __init get_board_revision(struct device_node *root)
+{
+ int propsize;
+ const __be32 *rev;
+
+ rev = of_get_property(root, "board-revision-int", &propsize);
+ if (propsize != 4)
+ return 0;
+
+ return be32_to_cpu(*rev);
+}
+
+static bool __init platform_detect(void)
+{
+ struct device_node *root = of_find_node_by_path("/");
+ bool success;
+
+ if (!root)
+ return false;
+
+ success = check_ofw_architecture(root);
+ if (success) {
+ olpc_platform_info.boardrev = get_board_revision(root);
+ olpc_platform_info.flags |= OLPC_F_PRESENT;
+ }
+
+ of_node_put(root);
+ return success;
+}
+
+static int __init add_xo1_platform_devices(void)
+{
+ struct platform_device *pdev;
+
+ pdev = platform_device_register_simple("xo1-rfkill", -1, NULL, 0);
+ if (IS_ERR(pdev))
+ return PTR_ERR(pdev);
+
+ pdev = platform_device_register_simple("olpc-xo1", -1, NULL, 0);
+
+ return PTR_ERR_OR_ZERO(pdev);
+}
+
+static int olpc_xo1_ec_probe(struct platform_device *pdev)
+{
+ /* get the EC revision */
+ olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0,
+ (unsigned char *) &olpc_platform_info.ecver, 1);
+
+ /* EC version 0x5f adds support for wide SCI mask */
+ if (olpc_platform_info.ecver >= 0x5f)
+ olpc_platform_info.flags |= OLPC_F_EC_WIDE_SCI;
+
+ pr_info("OLPC board revision %s%X (EC=%x)\n",
+ ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "",
+ olpc_platform_info.boardrev >> 4,
+ olpc_platform_info.ecver);
+
+ return 0;
+}
+static int olpc_xo1_ec_suspend(struct platform_device *pdev)
+{
+ olpc_ec_mask_write(ec_wakeup_mask);
+
+ /*
+ * Squelch SCIs while suspended. This is a fix for
+ * <http://dev.laptop.org/ticket/1835>.
+ */
+ return olpc_ec_cmd(EC_SET_SCI_INHIBIT, NULL, 0, NULL, 0);
+}
+
+static int olpc_xo1_ec_resume(struct platform_device *pdev)
+{
+ /* Tell the EC to stop inhibiting SCIs */
+ olpc_ec_cmd(EC_SET_SCI_INHIBIT_RELEASE, NULL, 0, NULL, 0);
+
+ /*
+ * Tell the wireless module to restart USB communication.
+ * Must be done twice.
+ */
+ olpc_ec_cmd(EC_WAKE_UP_WLAN, NULL, 0, NULL, 0);
+ olpc_ec_cmd(EC_WAKE_UP_WLAN, NULL, 0, NULL, 0);
+
+ return 0;
+}
+
+static struct olpc_ec_driver ec_xo1_driver = {
+ .probe = olpc_xo1_ec_probe,
+ .suspend = olpc_xo1_ec_suspend,
+ .resume = olpc_xo1_ec_resume,
+ .ec_cmd = olpc_xo1_ec_cmd,
+};
+
+static struct olpc_ec_driver ec_xo1_5_driver = {
+ .probe = olpc_xo1_ec_probe,
+ .ec_cmd = olpc_xo1_ec_cmd,
+};
+
+static int __init olpc_init(void)
+{
+ int r = 0;
+
+ if (!olpc_ofw_present() || !platform_detect())
+ return 0;
+
+ /* register the XO-1 and 1.5-specific EC handler */
+ if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) /* XO-1 */
+ olpc_ec_driver_register(&ec_xo1_driver, NULL);
+ else
+ olpc_ec_driver_register(&ec_xo1_5_driver, NULL);
+ platform_device_register_simple("olpc-ec", -1, NULL, 0);
+
+ /* assume B1 and above models always have a DCON */
+ if (olpc_board_at_least(olpc_board(0xb1)))
+ olpc_platform_info.flags |= OLPC_F_DCON;
+
+#ifdef CONFIG_PCI_OLPC
+ /* If the VSA exists let it emulate PCI, if not emulate in kernel.
+ * XO-1 only. */
+ if (olpc_platform_info.boardrev < olpc_board_pre(0xd0) &&
+ !cs5535_has_vsa2())
+ x86_init.pci.arch_init = pci_olpc_init;
+#endif
+
+ if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) { /* XO-1 */
+ r = add_xo1_platform_devices();
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+postcore_initcall(olpc_init);
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
new file mode 100644
index 000000000..d6ee92986
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -0,0 +1,304 @@
+/*
+ * OLPC-specific OFW device tree support code.
+ *
+ * Paul Mackerras August 1996.
+ * Copyright (C) 1996-2005 Paul Mackerras.
+ *
+ * Adapted for 64bit PowerPC by Dave Engebretsen and Peter Bergner.
+ * {engebret|bergner}@us.ibm.com
+ *
+ * Adapted for sparc by David S. Miller davem@davemloft.net
+ * Adapted for x86/OLPC by Andres Salomon <dilinger@queued.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bootmem.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/of_pdt.h>
+#include <asm/olpc.h>
+#include <asm/olpc_ofw.h>
+
+static phandle __init olpc_dt_getsibling(phandle node)
+{
+ const void *args[] = { (void *)node };
+ void *res[] = { &node };
+
+ if ((s32)node == -1)
+ return 0;
+
+ if (olpc_ofw("peer", args, res) || (s32)node == -1)
+ return 0;
+
+ return node;
+}
+
+static phandle __init olpc_dt_getchild(phandle node)
+{
+ const void *args[] = { (void *)node };
+ void *res[] = { &node };
+
+ if ((s32)node == -1)
+ return 0;
+
+ if (olpc_ofw("child", args, res) || (s32)node == -1) {
+ pr_err("PROM: %s: fetching child failed!\n", __func__);
+ return 0;
+ }
+
+ return node;
+}
+
+static int __init olpc_dt_getproplen(phandle node, const char *prop)
+{
+ const void *args[] = { (void *)node, prop };
+ int len;
+ void *res[] = { &len };
+
+ if ((s32)node == -1)
+ return -1;
+
+ if (olpc_ofw("getproplen", args, res)) {
+ pr_err("PROM: %s: getproplen failed!\n", __func__);
+ return -1;
+ }
+
+ return len;
+}
+
+static int __init olpc_dt_getproperty(phandle node, const char *prop,
+ char *buf, int bufsize)
+{
+ int plen;
+
+ plen = olpc_dt_getproplen(node, prop);
+ if (plen > bufsize || plen < 1) {
+ return -1;
+ } else {
+ const void *args[] = { (void *)node, prop, buf, (void *)plen };
+ void *res[] = { &plen };
+
+ if (olpc_ofw("getprop", args, res)) {
+ pr_err("PROM: %s: getprop failed!\n", __func__);
+ return -1;
+ }
+ }
+
+ return plen;
+}
+
+static int __init olpc_dt_nextprop(phandle node, char *prev, char *buf)
+{
+ const void *args[] = { (void *)node, prev, buf };
+ int success;
+ void *res[] = { &success };
+
+ buf[0] = '\0';
+
+ if ((s32)node == -1)
+ return -1;
+
+ if (olpc_ofw("nextprop", args, res) || success != 1)
+ return -1;
+
+ return 0;
+}
+
+static int __init olpc_dt_pkg2path(phandle node, char *buf,
+ const int buflen, int *len)
+{
+ const void *args[] = { (void *)node, buf, (void *)buflen };
+ void *res[] = { len };
+
+ if ((s32)node == -1)
+ return -1;
+
+ if (olpc_ofw("package-to-path", args, res) || *len < 1)
+ return -1;
+
+ return 0;
+}
+
+static unsigned int prom_early_allocated __initdata;
+
+void * __init prom_early_alloc(unsigned long size)
+{
+ static u8 *mem;
+ static size_t free_mem;
+ void *res;
+
+ if (free_mem < size) {
+ const size_t chunk_size = max(PAGE_SIZE, size);
+
+ /*
+ * To mimimize the number of allocations, grab at least
+ * PAGE_SIZE of memory (that's an arbitrary choice that's
+ * fast enough on the platforms we care about while minimizing
+ * wasted bootmem) and hand off chunks of it to callers.
+ */
+ res = alloc_bootmem(chunk_size);
+ BUG_ON(!res);
+ prom_early_allocated += chunk_size;
+ memset(res, 0, chunk_size);
+ free_mem = chunk_size;
+ mem = res;
+ }
+
+ /* allocate from the local cache */
+ free_mem -= size;
+ res = mem;
+ mem += size;
+ return res;
+}
+
+static struct of_pdt_ops prom_olpc_ops __initdata = {
+ .nextprop = olpc_dt_nextprop,
+ .getproplen = olpc_dt_getproplen,
+ .getproperty = olpc_dt_getproperty,
+ .getchild = olpc_dt_getchild,
+ .getsibling = olpc_dt_getsibling,
+ .pkg2path = olpc_dt_pkg2path,
+};
+
+static phandle __init olpc_dt_finddevice(const char *path)
+{
+ phandle node;
+ const void *args[] = { path };
+ void *res[] = { &node };
+
+ if (olpc_ofw("finddevice", args, res)) {
+ pr_err("olpc_dt: finddevice failed!\n");
+ return 0;
+ }
+
+ if ((s32) node == -1)
+ return 0;
+
+ return node;
+}
+
+static int __init olpc_dt_interpret(const char *words)
+{
+ int result;
+ const void *args[] = { words };
+ void *res[] = { &result };
+
+ if (olpc_ofw("interpret", args, res)) {
+ pr_err("olpc_dt: interpret failed!\n");
+ return -1;
+ }
+
+ return result;
+}
+
+/*
+ * Extract board revision directly from OFW device tree.
+ * We can't use olpc_platform_info because that hasn't been set up yet.
+ */
+static u32 __init olpc_dt_get_board_revision(void)
+{
+ phandle node;
+ __be32 rev;
+ int r;
+
+ node = olpc_dt_finddevice("/");
+ if (!node)
+ return 0;
+
+ r = olpc_dt_getproperty(node, "board-revision-int",
+ (char *) &rev, sizeof(rev));
+ if (r < 0)
+ return 0;
+
+ return be32_to_cpu(rev);
+}
+
+void __init olpc_dt_fixup(void)
+{
+ int r;
+ char buf[64];
+ phandle node;
+ u32 board_rev;
+
+ node = olpc_dt_finddevice("/battery@0");
+ if (!node)
+ return;
+
+ /*
+ * If the battery node has a compatible property, we are running a new
+ * enough firmware and don't have fixups to make.
+ */
+ r = olpc_dt_getproperty(node, "compatible", buf, sizeof(buf));
+ if (r > 0)
+ return;
+
+ pr_info("PROM DT: Old firmware detected, applying fixes\n");
+
+ /* Add olpc,xo1-battery compatible marker to battery node */
+ olpc_dt_interpret("\" /battery@0\" find-device"
+ " \" olpc,xo1-battery\" +compatible"
+ " device-end");
+
+ board_rev = olpc_dt_get_board_revision();
+ if (!board_rev)
+ return;
+
+ if (board_rev >= olpc_board_pre(0xd0)) {
+ /* XO-1.5: add dcon device */
+ olpc_dt_interpret("\" /pci/display@1\" find-device"
+ " new-device"
+ " \" dcon\" device-name \" olpc,xo1-dcon\" +compatible"
+ " finish-device device-end");
+ } else {
+ /* XO-1: add dcon device, mark RTC as olpc,xo1-rtc */
+ olpc_dt_interpret("\" /pci/display@1,1\" find-device"
+ " new-device"
+ " \" dcon\" device-name \" olpc,xo1-dcon\" +compatible"
+ " finish-device device-end"
+ " \" /rtc\" find-device"
+ " \" olpc,xo1-rtc\" +compatible"
+ " device-end");
+ }
+}
+
+void __init olpc_dt_build_devicetree(void)
+{
+ phandle root;
+
+ if (!olpc_ofw_is_installed())
+ return;
+
+ olpc_dt_fixup();
+
+ root = olpc_dt_getsibling(0);
+ if (!root) {
+ pr_err("PROM: unable to get root node from OFW!\n");
+ return;
+ }
+ of_pdt_build_devicetree(root, &prom_olpc_ops);
+
+ pr_info("PROM DT: Built device tree with %u bytes of memory.\n",
+ prom_early_allocated);
+}
+
+/* A list of DT node/bus matches that we want to expose as platform devices */
+static struct of_device_id __initdata of_ids[] = {
+ { .compatible = "olpc,xo1-battery" },
+ { .compatible = "olpc,xo1-dcon" },
+ { .compatible = "olpc,xo1-rtc" },
+ {},
+};
+
+static int __init olpc_create_platform_devices(void)
+{
+ if (machine_is_olpc())
+ return of_platform_bus_probe(NULL, of_ids, NULL);
+ else
+ return 0;
+}
+device_initcall(olpc_create_platform_devices);
diff --git a/arch/x86/platform/olpc/olpc_ofw.c b/arch/x86/platform/olpc/olpc_ofw.c
new file mode 100644
index 000000000..f1aab8cdb
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc_ofw.c
@@ -0,0 +1,120 @@
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/spinlock_types.h>
+#include <linux/init.h>
+#include <asm/page.h>
+#include <asm/setup.h>
+#include <asm/io.h>
+#include <asm/cpufeature.h>
+#include <asm/special_insns.h>
+#include <asm/pgtable.h>
+#include <asm/olpc_ofw.h>
+
+/* address of OFW callback interface; will be NULL if OFW isn't found */
+static int (*olpc_ofw_cif)(int *);
+
+/* page dir entry containing OFW's pgdir table; filled in by head_32.S */
+u32 olpc_ofw_pgd __initdata;
+
+static DEFINE_SPINLOCK(ofw_lock);
+
+#define MAXARGS 10
+
+void __init setup_olpc_ofw_pgd(void)
+{
+ pgd_t *base, *ofw_pde;
+
+ if (!olpc_ofw_cif)
+ return;
+
+ /* fetch OFW's PDE */
+ base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
+ if (!base) {
+ printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n");
+ olpc_ofw_cif = NULL;
+ return;
+ }
+ ofw_pde = &base[OLPC_OFW_PDE_NR];
+
+ /* install OFW's PDE permanently into the kernel's pgtable */
+ set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde);
+ /* implicit optimization barrier here due to uninline function return */
+
+ early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
+}
+
+int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
+ void **res)
+{
+ int ofw_args[MAXARGS + 3];
+ unsigned long flags;
+ int ret, i, *p;
+
+ BUG_ON(nr_args + nr_res > MAXARGS);
+
+ if (!olpc_ofw_cif)
+ return -EIO;
+
+ ofw_args[0] = (int)name;
+ ofw_args[1] = nr_args;
+ ofw_args[2] = nr_res;
+
+ p = &ofw_args[3];
+ for (i = 0; i < nr_args; i++, p++)
+ *p = (int)args[i];
+
+ /* call into ofw */
+ spin_lock_irqsave(&ofw_lock, flags);
+ ret = olpc_ofw_cif(ofw_args);
+ spin_unlock_irqrestore(&ofw_lock, flags);
+
+ if (!ret) {
+ for (i = 0; i < nr_res; i++, p++)
+ *((int *)res[i]) = *p;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__olpc_ofw);
+
+bool olpc_ofw_present(void)
+{
+ return olpc_ofw_cif != NULL;
+}
+EXPORT_SYMBOL_GPL(olpc_ofw_present);
+
+/* OFW cif _should_ be above this address */
+#define OFW_MIN 0xff000000
+
+/* OFW starts on a 1MB boundary */
+#define OFW_BOUND (1<<20)
+
+void __init olpc_ofw_detect(void)
+{
+ struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header;
+ unsigned long start;
+
+ /* ensure OFW booted us by checking for "OFW " string */
+ if (hdr->ofw_magic != OLPC_OFW_SIG)
+ return;
+
+ olpc_ofw_cif = (int (*)(int *))hdr->cif_handler;
+
+ if ((unsigned long)olpc_ofw_cif < OFW_MIN) {
+ printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n",
+ (unsigned long)olpc_ofw_cif);
+ olpc_ofw_cif = NULL;
+ return;
+ }
+
+ /* determine where OFW starts in memory */
+ start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND);
+ printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n",
+ (unsigned long)olpc_ofw_cif, (-start) >> 20);
+ reserve_top_address(-start);
+}
+
+bool __init olpc_ofw_is_installed(void)
+{
+ return olpc_ofw_cif != NULL;
+}
diff --git a/arch/x86/platform/olpc/xo1-wakeup.S b/arch/x86/platform/olpc/xo1-wakeup.S
new file mode 100644
index 000000000..5fee3a2c2
--- /dev/null
+++ b/arch/x86/platform/olpc/xo1-wakeup.S
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+.text
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+#include <asm/pgtable_32.h>
+
+ .macro writepost,value
+ movb $0x34, %al
+ outb %al, $0x70
+ movb $\value, %al
+ outb %al, $0x71
+ .endm
+
+wakeup_start:
+ # OFW lands us here, running in protected mode, with a
+ # kernel-compatible GDT already setup.
+
+ # Clear any dangerous flags
+ pushl $0
+ popfl
+
+ writepost 0x31
+
+ # Set up %cr3
+ movl $initial_page_table - __PAGE_OFFSET, %eax
+ movl %eax, %cr3
+
+ movl saved_cr4, %eax
+ movl %eax, %cr4
+
+ movl saved_cr0, %eax
+ movl %eax, %cr0
+
+ # Control registers were modified, pipeline resync is needed
+ jmp 1f
+1:
+
+ movw $__KERNEL_DS, %ax
+ movw %ax, %ss
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %fs
+ movw %ax, %gs
+
+ lgdt saved_gdt
+ lidt saved_idt
+ lldt saved_ldt
+ ljmp $(__KERNEL_CS),$1f
+1:
+ movl %cr3, %eax
+ movl %eax, %cr3
+ wbinvd
+
+ # Go back to the return point
+ jmp ret_point
+
+save_registers:
+ sgdt saved_gdt
+ sidt saved_idt
+ sldt saved_ldt
+
+ pushl %edx
+ movl %cr4, %edx
+ movl %edx, saved_cr4
+
+ movl %cr0, %edx
+ movl %edx, saved_cr0
+
+ popl %edx
+
+ movl %ebx, saved_context_ebx
+ movl %ebp, saved_context_ebp
+ movl %esi, saved_context_esi
+ movl %edi, saved_context_edi
+
+ pushfl
+ popl saved_context_eflags
+
+ ret
+
+restore_registers:
+ movl saved_context_ebp, %ebp
+ movl saved_context_ebx, %ebx
+ movl saved_context_esi, %esi
+ movl saved_context_edi, %edi
+
+ pushl saved_context_eflags
+ popfl
+
+ ret
+
+ENTRY(do_olpc_suspend_lowlevel)
+ call save_processor_state
+ call save_registers
+
+ # This is the stack context we want to remember
+ movl %esp, saved_context_esp
+
+ pushl $3
+ call xo1_do_sleep
+
+ jmp wakeup_start
+ .p2align 4,,7
+ret_point:
+ movl saved_context_esp, %esp
+
+ writepost 0x32
+
+ call restore_registers
+ call restore_processor_state
+ ret
+
+.data
+saved_gdt: .long 0,0
+saved_idt: .long 0,0
+saved_ldt: .long 0
+saved_cr4: .long 0
+saved_cr0: .long 0
+saved_context_esp: .long 0
+saved_context_edi: .long 0
+saved_context_esi: .long 0
+saved_context_ebx: .long 0
+saved_context_ebp: .long 0
+saved_context_eflags: .long 0
diff --git a/arch/x86/platform/scx200/Makefile b/arch/x86/platform/scx200/Makefile
new file mode 100644
index 000000000..762b4c7f4
--- /dev/null
+++ b/arch/x86/platform/scx200/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_SCx200) += scx200.o
+scx200-y += scx200_32.o
diff --git a/arch/x86/platform/scx200/scx200_32.c b/arch/x86/platform/scx200/scx200_32.c
new file mode 100644
index 000000000..3dc9aee41
--- /dev/null
+++ b/arch/x86/platform/scx200/scx200_32.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com>
+ *
+ * National Semiconductor SCx200 support.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+
+#include <linux/scx200.h>
+#include <linux/scx200_gpio.h>
+
+/* Verify that the configuration block really is there */
+#define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base))
+
+MODULE_AUTHOR("Christer Weinigel <wingel@nano-system.com>");
+MODULE_DESCRIPTION("NatSemi SCx200 Driver");
+MODULE_LICENSE("GPL");
+
+unsigned scx200_gpio_base = 0;
+unsigned long scx200_gpio_shadow[2];
+
+unsigned scx200_cb_base = 0;
+
+static struct pci_device_id scx200_tbl[] = {
+ { PCI_VDEVICE(NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) },
+ { PCI_VDEVICE(NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) },
+ { PCI_VDEVICE(NS, PCI_DEVICE_ID_NS_SCx200_XBUS) },
+ { PCI_VDEVICE(NS, PCI_DEVICE_ID_NS_SC1100_XBUS) },
+ { },
+};
+MODULE_DEVICE_TABLE(pci,scx200_tbl);
+
+static int scx200_probe(struct pci_dev *, const struct pci_device_id *);
+
+static struct pci_driver scx200_pci_driver = {
+ .name = "scx200",
+ .id_table = scx200_tbl,
+ .probe = scx200_probe,
+};
+
+static DEFINE_MUTEX(scx200_gpio_config_lock);
+
+static void scx200_init_shadow(void)
+{
+ int bank;
+
+ /* read the current values driven on the GPIO signals */
+ for (bank = 0; bank < 2; ++bank)
+ scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank);
+}
+
+static int scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+ unsigned base;
+
+ if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE ||
+ pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE) {
+ base = pci_resource_start(pdev, 0);
+ pr_info("GPIO base 0x%x\n", base);
+
+ if (!request_region(base, SCx200_GPIO_SIZE,
+ "NatSemi SCx200 GPIO")) {
+ pr_err("can't allocate I/O for GPIOs\n");
+ return -EBUSY;
+ }
+
+ scx200_gpio_base = base;
+ scx200_init_shadow();
+
+ } else {
+ /* find the base of the Configuration Block */
+ if (scx200_cb_probe(SCx200_CB_BASE_FIXED)) {
+ scx200_cb_base = SCx200_CB_BASE_FIXED;
+ } else {
+ pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base);
+ if (scx200_cb_probe(base)) {
+ scx200_cb_base = base;
+ } else {
+ pr_warn("Configuration Block not found\n");
+ return -ENODEV;
+ }
+ }
+ pr_info("Configuration Block base 0x%x\n", scx200_cb_base);
+ }
+
+ return 0;
+}
+
+u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits)
+{
+ u32 config, new_config;
+
+ mutex_lock(&scx200_gpio_config_lock);
+
+ outl(index, scx200_gpio_base + 0x20);
+ config = inl(scx200_gpio_base + 0x24);
+
+ new_config = (config & mask) | bits;
+ outl(new_config, scx200_gpio_base + 0x24);
+
+ mutex_unlock(&scx200_gpio_config_lock);
+
+ return config;
+}
+
+static int __init scx200_init(void)
+{
+ pr_info("NatSemi SCx200 Driver\n");
+ return pci_register_driver(&scx200_pci_driver);
+}
+
+static void __exit scx200_cleanup(void)
+{
+ pci_unregister_driver(&scx200_pci_driver);
+ release_region(scx200_gpio_base, SCx200_GPIO_SIZE);
+}
+
+module_init(scx200_init);
+module_exit(scx200_cleanup);
+
+EXPORT_SYMBOL(scx200_gpio_base);
+EXPORT_SYMBOL(scx200_gpio_shadow);
+EXPORT_SYMBOL(scx200_gpio_configure);
+EXPORT_SYMBOL(scx200_cb_base);
diff --git a/arch/x86/platform/sfi/Makefile b/arch/x86/platform/sfi/Makefile
new file mode 100644
index 000000000..cc5db1168
--- /dev/null
+++ b/arch/x86/platform/sfi/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_SFI) += sfi.o
diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c
new file mode 100644
index 000000000..6c7111bbd
--- /dev/null
+++ b/arch/x86/platform/sfi/sfi.c
@@ -0,0 +1,114 @@
+/*
+ * sfi.c - x86 architecture SFI support.
+ *
+ * Copyright (c) 2009, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#define KMSG_COMPONENT "SFI"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/acpi.h>
+#include <linux/init.h>
+#include <linux/sfi.h>
+#include <linux/io.h>
+
+#include <asm/irqdomain.h>
+#include <asm/io_apic.h>
+#include <asm/mpspec.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
+
+/* All CPUs enumerated by SFI must be present and enabled */
+static void __init mp_sfi_register_lapic(u8 id)
+{
+ if (MAX_LOCAL_APIC - id <= 0) {
+ pr_warning("Processor #%d invalid (max %d)\n",
+ id, MAX_LOCAL_APIC);
+ return;
+ }
+
+ pr_info("registering lapic[%d]\n", id);
+
+ generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR)));
+}
+
+static int __init sfi_parse_cpus(struct sfi_table_header *table)
+{
+ struct sfi_table_simple *sb;
+ struct sfi_cpu_table_entry *pentry;
+ int i;
+ int cpu_num;
+
+ sb = (struct sfi_table_simple *)table;
+ cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry);
+ pentry = (struct sfi_cpu_table_entry *)sb->pentry;
+
+ for (i = 0; i < cpu_num; i++) {
+ mp_sfi_register_lapic(pentry->apic_id);
+ pentry++;
+ }
+
+ smp_found_config = 1;
+ return 0;
+}
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_X86_IO_APIC
+
+static int __init sfi_parse_ioapic(struct sfi_table_header *table)
+{
+ struct sfi_table_simple *sb;
+ struct sfi_apic_table_entry *pentry;
+ int i, num;
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_STRICT,
+ .ops = &mp_ioapic_irqdomain_ops,
+ };
+
+ sb = (struct sfi_table_simple *)table;
+ num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry);
+ pentry = (struct sfi_apic_table_entry *)sb->pentry;
+
+ for (i = 0; i < num; i++) {
+ mp_register_ioapic(i, pentry->phys_addr, gsi_top, &cfg);
+ pentry++;
+ }
+
+ WARN(pic_mode, KERN_WARNING
+ "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n");
+ pic_mode = 0;
+ return 0;
+}
+#endif /* CONFIG_X86_IO_APIC */
+
+/*
+ * sfi_platform_init(): register lapics & io-apics
+ */
+int __init sfi_platform_init(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ register_lapic_address(sfi_lapic_addr);
+ sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus);
+#endif
+#ifdef CONFIG_X86_IO_APIC
+ sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic);
+#endif
+ return 0;
+}
diff --git a/arch/x86/platform/ts5500/Makefile b/arch/x86/platform/ts5500/Makefile
new file mode 100644
index 000000000..c54e348c9
--- /dev/null
+++ b/arch/x86/platform/ts5500/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_TS5500) += ts5500.o
diff --git a/arch/x86/platform/ts5500/ts5500.c b/arch/x86/platform/ts5500/ts5500.c
new file mode 100644
index 000000000..fd39301f2
--- /dev/null
+++ b/arch/x86/platform/ts5500/ts5500.c
@@ -0,0 +1,347 @@
+/*
+ * Technologic Systems TS-5500 Single Board Computer support
+ *
+ * Copyright (C) 2013-2014 Savoir-faire Linux Inc.
+ * Vivien Didelot <vivien.didelot@savoirfairelinux.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation; either version 2 of the License, or (at your option) any later
+ * version.
+ *
+ *
+ * This driver registers the Technologic Systems TS-5500 Single Board Computer
+ * (SBC) and its devices, and exposes information to userspace such as jumpers'
+ * state or available options. For further information about sysfs entries, see
+ * Documentation/ABI/testing/sysfs-platform-ts5500.
+ *
+ * This code may be extended to support similar x86-based platforms.
+ * Actually, the TS-5500 and TS-5400 are supported.
+ */
+
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/leds.h>
+#include <linux/init.h>
+#include <linux/platform_data/gpio-ts5500.h>
+#include <linux/platform_data/max197.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+/* Product code register */
+#define TS5500_PRODUCT_CODE_ADDR 0x74
+#define TS5500_PRODUCT_CODE 0x60 /* TS-5500 product code */
+#define TS5400_PRODUCT_CODE 0x40 /* TS-5400 product code */
+
+/* SRAM/RS-485/ADC options, and RS-485 RTS/Automatic RS-485 flags register */
+#define TS5500_SRAM_RS485_ADC_ADDR 0x75
+#define TS5500_SRAM BIT(0) /* SRAM option */
+#define TS5500_RS485 BIT(1) /* RS-485 option */
+#define TS5500_ADC BIT(2) /* A/D converter option */
+#define TS5500_RS485_RTS BIT(6) /* RTS for RS-485 */
+#define TS5500_RS485_AUTO BIT(7) /* Automatic RS-485 */
+
+/* External Reset/Industrial Temperature Range options register */
+#define TS5500_ERESET_ITR_ADDR 0x76
+#define TS5500_ERESET BIT(0) /* External Reset option */
+#define TS5500_ITR BIT(1) /* Indust. Temp. Range option */
+
+/* LED/Jumpers register */
+#define TS5500_LED_JP_ADDR 0x77
+#define TS5500_LED BIT(0) /* LED flag */
+#define TS5500_JP1 BIT(1) /* Automatic CMOS */
+#define TS5500_JP2 BIT(2) /* Enable Serial Console */
+#define TS5500_JP3 BIT(3) /* Write Enable Drive A */
+#define TS5500_JP4 BIT(4) /* Fast Console (115K baud) */
+#define TS5500_JP5 BIT(5) /* User Jumper */
+#define TS5500_JP6 BIT(6) /* Console on COM1 (req. JP2) */
+#define TS5500_JP7 BIT(7) /* Undocumented (Unused) */
+
+/* A/D Converter registers */
+#define TS5500_ADC_CONV_BUSY_ADDR 0x195 /* Conversion state register */
+#define TS5500_ADC_CONV_BUSY BIT(0)
+#define TS5500_ADC_CONV_INIT_LSB_ADDR 0x196 /* Start conv. / LSB register */
+#define TS5500_ADC_CONV_MSB_ADDR 0x197 /* MSB register */
+#define TS5500_ADC_CONV_DELAY 12 /* usec */
+
+/**
+ * struct ts5500_sbc - TS-5500 board description
+ * @name: Board model name.
+ * @id: Board product ID.
+ * @sram: Flag for SRAM option.
+ * @rs485: Flag for RS-485 option.
+ * @adc: Flag for Analog/Digital converter option.
+ * @ereset: Flag for External Reset option.
+ * @itr: Flag for Industrial Temperature Range option.
+ * @jumpers: Bitfield for jumpers' state.
+ */
+struct ts5500_sbc {
+ const char *name;
+ int id;
+ bool sram;
+ bool rs485;
+ bool adc;
+ bool ereset;
+ bool itr;
+ u8 jumpers;
+};
+
+/* Board signatures in BIOS shadow RAM */
+static const struct {
+ const char * const string;
+ const ssize_t offset;
+} ts5500_signatures[] __initconst = {
+ { "TS-5x00 AMD Elan", 0xb14 },
+};
+
+static int __init ts5500_check_signature(void)
+{
+ void __iomem *bios;
+ int i, ret = -ENODEV;
+
+ bios = ioremap(0xf0000, 0x10000);
+ if (!bios)
+ return -ENOMEM;
+
+ for (i = 0; i < ARRAY_SIZE(ts5500_signatures); i++) {
+ if (check_signature(bios + ts5500_signatures[i].offset,
+ ts5500_signatures[i].string,
+ strlen(ts5500_signatures[i].string))) {
+ ret = 0;
+ break;
+ }
+ }
+
+ iounmap(bios);
+ return ret;
+}
+
+static int __init ts5500_detect_config(struct ts5500_sbc *sbc)
+{
+ u8 tmp;
+ int ret = 0;
+
+ if (!request_region(TS5500_PRODUCT_CODE_ADDR, 4, "ts5500"))
+ return -EBUSY;
+
+ sbc->id = inb(TS5500_PRODUCT_CODE_ADDR);
+ if (sbc->id == TS5500_PRODUCT_CODE) {
+ sbc->name = "TS-5500";
+ } else if (sbc->id == TS5400_PRODUCT_CODE) {
+ sbc->name = "TS-5400";
+ } else {
+ pr_err("ts5500: unknown product code 0x%x\n", sbc->id);
+ ret = -ENODEV;
+ goto cleanup;
+ }
+
+ tmp = inb(TS5500_SRAM_RS485_ADC_ADDR);
+ sbc->sram = tmp & TS5500_SRAM;
+ sbc->rs485 = tmp & TS5500_RS485;
+ sbc->adc = tmp & TS5500_ADC;
+
+ tmp = inb(TS5500_ERESET_ITR_ADDR);
+ sbc->ereset = tmp & TS5500_ERESET;
+ sbc->itr = tmp & TS5500_ITR;
+
+ tmp = inb(TS5500_LED_JP_ADDR);
+ sbc->jumpers = tmp & ~TS5500_LED;
+
+cleanup:
+ release_region(TS5500_PRODUCT_CODE_ADDR, 4);
+ return ret;
+}
+
+static ssize_t name_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct ts5500_sbc *sbc = dev_get_drvdata(dev);
+
+ return sprintf(buf, "%s\n", sbc->name);
+}
+static DEVICE_ATTR_RO(name);
+
+static ssize_t id_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct ts5500_sbc *sbc = dev_get_drvdata(dev);
+
+ return sprintf(buf, "0x%.2x\n", sbc->id);
+}
+static DEVICE_ATTR_RO(id);
+
+static ssize_t jumpers_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct ts5500_sbc *sbc = dev_get_drvdata(dev);
+
+ return sprintf(buf, "0x%.2x\n", sbc->jumpers >> 1);
+}
+static DEVICE_ATTR_RO(jumpers);
+
+#define TS5500_ATTR_BOOL(_field) \
+ static ssize_t _field##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+ { \
+ struct ts5500_sbc *sbc = dev_get_drvdata(dev); \
+ \
+ return sprintf(buf, "%d\n", sbc->_field); \
+ } \
+ static DEVICE_ATTR_RO(_field)
+
+TS5500_ATTR_BOOL(sram);
+TS5500_ATTR_BOOL(rs485);
+TS5500_ATTR_BOOL(adc);
+TS5500_ATTR_BOOL(ereset);
+TS5500_ATTR_BOOL(itr);
+
+static struct attribute *ts5500_attributes[] = {
+ &dev_attr_id.attr,
+ &dev_attr_name.attr,
+ &dev_attr_jumpers.attr,
+ &dev_attr_sram.attr,
+ &dev_attr_rs485.attr,
+ &dev_attr_adc.attr,
+ &dev_attr_ereset.attr,
+ &dev_attr_itr.attr,
+ NULL
+};
+
+static const struct attribute_group ts5500_attr_group = {
+ .attrs = ts5500_attributes,
+};
+
+static struct resource ts5500_dio1_resource[] = {
+ DEFINE_RES_IRQ_NAMED(7, "DIO1 interrupt"),
+};
+
+static struct platform_device ts5500_dio1_pdev = {
+ .name = "ts5500-dio1",
+ .id = -1,
+ .resource = ts5500_dio1_resource,
+ .num_resources = 1,
+};
+
+static struct resource ts5500_dio2_resource[] = {
+ DEFINE_RES_IRQ_NAMED(6, "DIO2 interrupt"),
+};
+
+static struct platform_device ts5500_dio2_pdev = {
+ .name = "ts5500-dio2",
+ .id = -1,
+ .resource = ts5500_dio2_resource,
+ .num_resources = 1,
+};
+
+static void ts5500_led_set(struct led_classdev *led_cdev,
+ enum led_brightness brightness)
+{
+ outb(!!brightness, TS5500_LED_JP_ADDR);
+}
+
+static enum led_brightness ts5500_led_get(struct led_classdev *led_cdev)
+{
+ return (inb(TS5500_LED_JP_ADDR) & TS5500_LED) ? LED_FULL : LED_OFF;
+}
+
+static struct led_classdev ts5500_led_cdev = {
+ .name = "ts5500:green:",
+ .brightness_set = ts5500_led_set,
+ .brightness_get = ts5500_led_get,
+};
+
+static int ts5500_adc_convert(u8 ctrl)
+{
+ u8 lsb, msb;
+
+ /* Start conversion (ensure the 3 MSB are set to 0) */
+ outb(ctrl & 0x1f, TS5500_ADC_CONV_INIT_LSB_ADDR);
+
+ /*
+ * The platform has CPLD logic driving the A/D converter.
+ * The conversion must complete within 11 microseconds,
+ * otherwise we have to re-initiate a conversion.
+ */
+ udelay(TS5500_ADC_CONV_DELAY);
+ if (inb(TS5500_ADC_CONV_BUSY_ADDR) & TS5500_ADC_CONV_BUSY)
+ return -EBUSY;
+
+ /* Read the raw data */
+ lsb = inb(TS5500_ADC_CONV_INIT_LSB_ADDR);
+ msb = inb(TS5500_ADC_CONV_MSB_ADDR);
+
+ return (msb << 8) | lsb;
+}
+
+static struct max197_platform_data ts5500_adc_pdata = {
+ .convert = ts5500_adc_convert,
+};
+
+static struct platform_device ts5500_adc_pdev = {
+ .name = "max197",
+ .id = -1,
+ .dev = {
+ .platform_data = &ts5500_adc_pdata,
+ },
+};
+
+static int __init ts5500_init(void)
+{
+ struct platform_device *pdev;
+ struct ts5500_sbc *sbc;
+ int err;
+
+ /*
+ * There is no DMI available or PCI bridge subvendor info,
+ * only the BIOS provides a 16-bit identification call.
+ * It is safer to find a signature in the BIOS shadow RAM.
+ */
+ err = ts5500_check_signature();
+ if (err)
+ return err;
+
+ pdev = platform_device_register_simple("ts5500", -1, NULL, 0);
+ if (IS_ERR(pdev))
+ return PTR_ERR(pdev);
+
+ sbc = devm_kzalloc(&pdev->dev, sizeof(struct ts5500_sbc), GFP_KERNEL);
+ if (!sbc) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ err = ts5500_detect_config(sbc);
+ if (err)
+ goto error;
+
+ platform_set_drvdata(pdev, sbc);
+
+ err = sysfs_create_group(&pdev->dev.kobj, &ts5500_attr_group);
+ if (err)
+ goto error;
+
+ if (sbc->id == TS5500_PRODUCT_CODE) {
+ ts5500_dio1_pdev.dev.parent = &pdev->dev;
+ if (platform_device_register(&ts5500_dio1_pdev))
+ dev_warn(&pdev->dev, "DIO1 block registration failed\n");
+ ts5500_dio2_pdev.dev.parent = &pdev->dev;
+ if (platform_device_register(&ts5500_dio2_pdev))
+ dev_warn(&pdev->dev, "DIO2 block registration failed\n");
+ }
+
+ if (led_classdev_register(&pdev->dev, &ts5500_led_cdev))
+ dev_warn(&pdev->dev, "LED registration failed\n");
+
+ if (sbc->adc) {
+ ts5500_adc_pdev.dev.parent = &pdev->dev;
+ if (platform_device_register(&ts5500_adc_pdev))
+ dev_warn(&pdev->dev, "ADC registration failed\n");
+ }
+
+ return 0;
+error:
+ platform_device_unregister(pdev);
+ return err;
+}
+device_initcall(ts5500_init);
diff --git a/arch/x86/platform/uv/Makefile b/arch/x86/platform/uv/Makefile
new file mode 100644
index 000000000..52079bebd
--- /dev/null
+++ b/arch/x86/platform/uv/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o uv_nmi.o
diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
new file mode 100644
index 000000000..eb33432f2
--- /dev/null
+++ b/arch/x86/platform/uv/bios_uv.c
@@ -0,0 +1,240 @@
+/*
+ * BIOS run time interface routines.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) Russ Anderson <rja@sgi.com>
+ */
+
+#include <linux/efi.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <asm/efi.h>
+#include <linux/io.h>
+#include <asm/uv/bios.h>
+#include <asm/uv/uv_hub.h>
+
+struct uv_systab *uv_systab;
+
+static s64 __uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
+ u64 a4, u64 a5)
+{
+ struct uv_systab *tab = uv_systab;
+ s64 ret;
+
+ if (!tab || !tab->function)
+ /*
+ * BIOS does not support UV systab
+ */
+ return BIOS_STATUS_UNIMPLEMENTED;
+
+ /*
+ * If EFI_OLD_MEMMAP is set, we need to fall back to using our old EFI
+ * callback method, which uses efi_call() directly, with the kernel page tables:
+ */
+ if (unlikely(test_bit(EFI_OLD_MEMMAP, &efi.flags)))
+ ret = efi_call((void *)__va(tab->function), (u64)which, a1, a2, a3, a4, a5);
+ else
+ ret = efi_call_virt_pointer(tab, function, (u64)which, a1, a2, a3, a4, a5);
+
+ return ret;
+}
+
+s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
+{
+ s64 ret;
+
+ if (down_interruptible(&__efi_uv_runtime_lock))
+ return BIOS_STATUS_ABORT;
+
+ ret = __uv_bios_call(which, a1, a2, a3, a4, a5);
+ up(&__efi_uv_runtime_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(uv_bios_call);
+
+s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
+ u64 a4, u64 a5)
+{
+ unsigned long bios_flags;
+ s64 ret;
+
+ if (down_interruptible(&__efi_uv_runtime_lock))
+ return BIOS_STATUS_ABORT;
+
+ local_irq_save(bios_flags);
+ ret = __uv_bios_call(which, a1, a2, a3, a4, a5);
+ local_irq_restore(bios_flags);
+
+ up(&__efi_uv_runtime_lock);
+
+ return ret;
+}
+
+s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
+ u64 a4, u64 a5)
+{
+ s64 ret;
+
+ preempt_disable();
+ ret = uv_bios_call(which, a1, a2, a3, a4, a5);
+ preempt_enable();
+
+ return ret;
+}
+
+
+long sn_partition_id;
+EXPORT_SYMBOL_GPL(sn_partition_id);
+long sn_coherency_id;
+EXPORT_SYMBOL_GPL(sn_coherency_id);
+long sn_region_size;
+EXPORT_SYMBOL_GPL(sn_region_size);
+long system_serial_number;
+EXPORT_SYMBOL_GPL(system_serial_number);
+int uv_type;
+EXPORT_SYMBOL_GPL(uv_type);
+
+
+s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
+ long *region, long *ssn)
+{
+ s64 ret;
+ u64 v0, v1;
+ union partition_info_u part;
+
+ ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc,
+ (u64)(&v0), (u64)(&v1), 0, 0);
+ if (ret != BIOS_STATUS_SUCCESS)
+ return ret;
+
+ part.val = v0;
+ if (uvtype)
+ *uvtype = part.hub_version;
+ if (partid)
+ *partid = part.partition_id;
+ if (coher)
+ *coher = part.coherence_id;
+ if (region)
+ *region = part.region_size;
+ if (ssn)
+ *ssn = v1;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(uv_bios_get_sn_info);
+
+int
+uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size,
+ unsigned long *intr_mmr_offset)
+{
+ u64 watchlist;
+ s64 ret;
+
+ /*
+ * bios returns watchlist number or negative error number.
+ */
+ ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr,
+ mq_size, (u64)intr_mmr_offset,
+ (u64)&watchlist, 0);
+ if (ret < BIOS_STATUS_SUCCESS)
+ return ret;
+
+ return watchlist;
+}
+EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_alloc);
+
+int
+uv_bios_mq_watchlist_free(int blade, int watchlist_num)
+{
+ return (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_FREE,
+ blade, watchlist_num, 0, 0, 0);
+}
+EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free);
+
+s64
+uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms)
+{
+ return uv_bios_call_irqsave(UV_BIOS_MEMPROTECT, paddr, len,
+ perms, 0, 0);
+}
+EXPORT_SYMBOL_GPL(uv_bios_change_memprotect);
+
+s64
+uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len)
+{
+ return uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie,
+ (u64)addr, buf, (u64)len, 0);
+}
+EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa);
+
+s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
+{
+ return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type,
+ (u64)ticks_per_second, 0, 0, 0);
+}
+EXPORT_SYMBOL_GPL(uv_bios_freq_base);
+
+/*
+ * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target
+ * @decode: true to enable target, false to disable target
+ * @domain: PCI domain number
+ * @bus: PCI bus number
+ *
+ * Returns:
+ * 0: Success
+ * -EINVAL: Invalid domain or bus number
+ * -ENOSYS: Capability not available
+ * -EBUSY: Legacy VGA I/O cannot be retargeted at this time
+ */
+int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus)
+{
+ return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET,
+ (u64)decode, (u64)domain, (u64)bus, 0, 0);
+}
+EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target);
+
+#ifdef CONFIG_EFI
+void uv_bios_init(void)
+{
+ uv_systab = NULL;
+ if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) ||
+ !efi.uv_systab || efi_runtime_disabled()) {
+ pr_crit("UV: UVsystab: missing\n");
+ return;
+ }
+
+ uv_systab = ioremap(efi.uv_systab, sizeof(struct uv_systab));
+ if (!uv_systab || strncmp(uv_systab->signature, UV_SYSTAB_SIG, 4)) {
+ pr_err("UV: UVsystab: bad signature!\n");
+ iounmap(uv_systab);
+ return;
+ }
+
+ /* Starting with UV4 the UV systab size is variable */
+ if (uv_systab->revision >= UV_SYSTAB_VERSION_UV4) {
+ int size = uv_systab->size;
+
+ iounmap(uv_systab);
+ uv_systab = ioremap(efi.uv_systab, size);
+ if (!uv_systab) {
+ pr_err("UV: UVsystab: ioremap(%d) failed!\n", size);
+ return;
+ }
+ }
+ pr_info("UV: UVsystab: Revision:%x\n", uv_systab->revision);
+}
+#endif
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
new file mode 100644
index 000000000..a4130b84d
--- /dev/null
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -0,0 +1,2284 @@
+/*
+ * SGI UltraViolet TLB flush routines.
+ *
+ * (c) 2008-2014 Cliff Wickman <cpw@sgi.com>, SGI.
+ *
+ * This code is released under the GNU General Public License version 2 or
+ * later.
+ */
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+
+#include <asm/mmu_context.h>
+#include <asm/uv/uv.h>
+#include <asm/uv/uv_mmrs.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/uv_bau.h>
+#include <asm/apic.h>
+#include <asm/tsc.h>
+#include <asm/irq_vectors.h>
+#include <asm/timer.h>
+
+static struct bau_operations ops __ro_after_init;
+
+/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
+static const int timeout_base_ns[] = {
+ 20,
+ 160,
+ 1280,
+ 10240,
+ 81920,
+ 655360,
+ 5242880,
+ 167772160
+};
+
+static int timeout_us;
+static bool nobau = true;
+static int nobau_perm;
+
+/* tunables: */
+static int max_concurr = MAX_BAU_CONCURRENT;
+static int max_concurr_const = MAX_BAU_CONCURRENT;
+static int plugged_delay = PLUGGED_DELAY;
+static int plugsb4reset = PLUGSB4RESET;
+static int giveup_limit = GIVEUP_LIMIT;
+static int timeoutsb4reset = TIMEOUTSB4RESET;
+static int ipi_reset_limit = IPI_RESET_LIMIT;
+static int complete_threshold = COMPLETE_THRESHOLD;
+static int congested_respns_us = CONGESTED_RESPONSE_US;
+static int congested_reps = CONGESTED_REPS;
+static int disabled_period = DISABLED_PERIOD;
+
+static struct tunables tunables[] = {
+ {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */
+ {&plugged_delay, PLUGGED_DELAY},
+ {&plugsb4reset, PLUGSB4RESET},
+ {&timeoutsb4reset, TIMEOUTSB4RESET},
+ {&ipi_reset_limit, IPI_RESET_LIMIT},
+ {&complete_threshold, COMPLETE_THRESHOLD},
+ {&congested_respns_us, CONGESTED_RESPONSE_US},
+ {&congested_reps, CONGESTED_REPS},
+ {&disabled_period, DISABLED_PERIOD},
+ {&giveup_limit, GIVEUP_LIMIT}
+};
+
+static struct dentry *tunables_dir;
+static struct dentry *tunables_file;
+
+/* these correspond to the statistics printed by ptc_seq_show() */
+static char *stat_description[] = {
+ "sent: number of shootdown messages sent",
+ "stime: time spent sending messages",
+ "numuvhubs: number of hubs targeted with shootdown",
+ "numuvhubs16: number times 16 or more hubs targeted",
+ "numuvhubs8: number times 8 or more hubs targeted",
+ "numuvhubs4: number times 4 or more hubs targeted",
+ "numuvhubs2: number times 2 or more hubs targeted",
+ "numuvhubs1: number times 1 hub targeted",
+ "numcpus: number of cpus targeted with shootdown",
+ "dto: number of destination timeouts",
+ "retries: destination timeout retries sent",
+ "rok: : destination timeouts successfully retried",
+ "resetp: ipi-style resource resets for plugs",
+ "resett: ipi-style resource resets for timeouts",
+ "giveup: fall-backs to ipi-style shootdowns",
+ "sto: number of source timeouts",
+ "bz: number of stay-busy's",
+ "throt: number times spun in throttle",
+ "swack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE",
+ "recv: shootdown messages received",
+ "rtime: time spent processing messages",
+ "all: shootdown all-tlb messages",
+ "one: shootdown one-tlb messages",
+ "mult: interrupts that found multiple messages",
+ "none: interrupts that found no messages",
+ "retry: number of retry messages processed",
+ "canc: number messages canceled by retries",
+ "nocan: number retries that found nothing to cancel",
+ "reset: number of ipi-style reset requests processed",
+ "rcan: number messages canceled by reset requests",
+ "disable: number times use of the BAU was disabled",
+ "enable: number times use of the BAU was re-enabled"
+};
+
+static int __init setup_bau(char *arg)
+{
+ int result;
+
+ if (!arg)
+ return -EINVAL;
+
+ result = strtobool(arg, &nobau);
+ if (result)
+ return result;
+
+ /* we need to flip the logic here, so that bau=y sets nobau to false */
+ nobau = !nobau;
+
+ if (!nobau)
+ pr_info("UV BAU Enabled\n");
+ else
+ pr_info("UV BAU Disabled\n");
+
+ return 0;
+}
+early_param("bau", setup_bau);
+
+/* base pnode in this partition */
+static int uv_base_pnode __read_mostly;
+
+static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
+static DEFINE_PER_CPU(struct bau_control, bau_control);
+static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
+
+static void
+set_bau_on(void)
+{
+ int cpu;
+ struct bau_control *bcp;
+
+ if (nobau_perm) {
+ pr_info("BAU not initialized; cannot be turned on\n");
+ return;
+ }
+ nobau = false;
+ for_each_present_cpu(cpu) {
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->nobau = false;
+ }
+ pr_info("BAU turned on\n");
+ return;
+}
+
+static void
+set_bau_off(void)
+{
+ int cpu;
+ struct bau_control *bcp;
+
+ nobau = true;
+ for_each_present_cpu(cpu) {
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->nobau = true;
+ }
+ pr_info("BAU turned off\n");
+ return;
+}
+
+/*
+ * Determine the first node on a uvhub. 'Nodes' are used for kernel
+ * memory allocation.
+ */
+static int __init uvhub_to_first_node(int uvhub)
+{
+ int node, b;
+
+ for_each_online_node(node) {
+ b = uv_node_to_blade_id(node);
+ if (uvhub == b)
+ return node;
+ }
+ return -1;
+}
+
+/*
+ * Determine the apicid of the first cpu on a uvhub.
+ */
+static int __init uvhub_to_first_apicid(int uvhub)
+{
+ int cpu;
+
+ for_each_present_cpu(cpu)
+ if (uvhub == uv_cpu_to_blade_id(cpu))
+ return per_cpu(x86_cpu_to_apicid, cpu);
+ return -1;
+}
+
+/*
+ * Free a software acknowledge hardware resource by clearing its Pending
+ * bit. This will return a reply to the sender.
+ * If the message has timed out, a reply has already been sent by the
+ * hardware but the resource has not been released. In that case our
+ * clear of the Timeout bit (as well) will free the resource. No reply will
+ * be sent (the hardware will only do one reply per message).
+ */
+static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp,
+ int do_acknowledge)
+{
+ unsigned long dw;
+ struct bau_pq_entry *msg;
+
+ msg = mdp->msg;
+ if (!msg->canceled && do_acknowledge) {
+ dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec;
+ ops.write_l_sw_ack(dw);
+ }
+ msg->replied_to = 1;
+ msg->swack_vec = 0;
+}
+
+/*
+ * Process the receipt of a RETRY message
+ */
+static void bau_process_retry_msg(struct msg_desc *mdp,
+ struct bau_control *bcp)
+{
+ int i;
+ int cancel_count = 0;
+ unsigned long msg_res;
+ unsigned long mmr = 0;
+ struct bau_pq_entry *msg = mdp->msg;
+ struct bau_pq_entry *msg2;
+ struct ptc_stats *stat = bcp->statp;
+
+ stat->d_retries++;
+ /*
+ * cancel any message from msg+1 to the retry itself
+ */
+ for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
+ if (msg2 > mdp->queue_last)
+ msg2 = mdp->queue_first;
+ if (msg2 == msg)
+ break;
+
+ /* same conditions for cancellation as do_reset */
+ if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
+ (msg2->swack_vec) && ((msg2->swack_vec &
+ msg->swack_vec) == 0) &&
+ (msg2->sending_cpu == msg->sending_cpu) &&
+ (msg2->msg_type != MSG_NOOP)) {
+ mmr = ops.read_l_sw_ack();
+ msg_res = msg2->swack_vec;
+ /*
+ * This is a message retry; clear the resources held
+ * by the previous message only if they timed out.
+ * If it has not timed out we have an unexpected
+ * situation to report.
+ */
+ if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
+ unsigned long mr;
+ /*
+ * Is the resource timed out?
+ * Make everyone ignore the cancelled message.
+ */
+ msg2->canceled = 1;
+ stat->d_canceled++;
+ cancel_count++;
+ mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
+ ops.write_l_sw_ack(mr);
+ }
+ }
+ }
+ if (!cancel_count)
+ stat->d_nocanceled++;
+}
+
+/*
+ * Do all the things a cpu should do for a TLB shootdown message.
+ * Other cpu's may come here at the same time for this message.
+ */
+static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
+ int do_acknowledge)
+{
+ short socket_ack_count = 0;
+ short *sp;
+ struct atomic_short *asp;
+ struct ptc_stats *stat = bcp->statp;
+ struct bau_pq_entry *msg = mdp->msg;
+ struct bau_control *smaster = bcp->socket_master;
+
+ /*
+ * This must be a normal message, or retry of a normal message
+ */
+ if (msg->address == TLB_FLUSH_ALL) {
+ local_flush_tlb();
+ stat->d_alltlb++;
+ } else {
+ __flush_tlb_one_user(msg->address);
+ stat->d_onetlb++;
+ }
+ stat->d_requestee++;
+
+ /*
+ * One cpu on each uvhub has the additional job on a RETRY
+ * of releasing the resource held by the message that is
+ * being retried. That message is identified by sending
+ * cpu number.
+ */
+ if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
+ bau_process_retry_msg(mdp, bcp);
+
+ /*
+ * This is a swack message, so we have to reply to it.
+ * Count each responding cpu on the socket. This avoids
+ * pinging the count's cache line back and forth between
+ * the sockets.
+ */
+ sp = &smaster->socket_acknowledge_count[mdp->msg_slot];
+ asp = (struct atomic_short *)sp;
+ socket_ack_count = atom_asr(1, asp);
+ if (socket_ack_count == bcp->cpus_in_socket) {
+ int msg_ack_count;
+ /*
+ * Both sockets dump their completed count total into
+ * the message's count.
+ */
+ *sp = 0;
+ asp = (struct atomic_short *)&msg->acknowledge_count;
+ msg_ack_count = atom_asr(socket_ack_count, asp);
+
+ if (msg_ack_count == bcp->cpus_in_uvhub) {
+ /*
+ * All cpus in uvhub saw it; reply
+ * (unless we are in the UV2 workaround)
+ */
+ reply_to_message(mdp, bcp, do_acknowledge);
+ }
+ }
+
+ return;
+}
+
+/*
+ * Determine the first cpu on a pnode.
+ */
+static int pnode_to_first_cpu(int pnode, struct bau_control *smaster)
+{
+ int cpu;
+ struct hub_and_pnode *hpp;
+
+ for_each_present_cpu(cpu) {
+ hpp = &smaster->thp[cpu];
+ if (pnode == hpp->pnode)
+ return cpu;
+ }
+ return -1;
+}
+
+/*
+ * Last resort when we get a large number of destination timeouts is
+ * to clear resources held by a given cpu.
+ * Do this with IPI so that all messages in the BAU message queue
+ * can be identified by their nonzero swack_vec field.
+ *
+ * This is entered for a single cpu on the uvhub.
+ * The sender want's this uvhub to free a specific message's
+ * swack resources.
+ */
+static void do_reset(void *ptr)
+{
+ int i;
+ struct bau_control *bcp = &per_cpu(bau_control, smp_processor_id());
+ struct reset_args *rap = (struct reset_args *)ptr;
+ struct bau_pq_entry *msg;
+ struct ptc_stats *stat = bcp->statp;
+
+ stat->d_resets++;
+ /*
+ * We're looking for the given sender, and
+ * will free its swack resource.
+ * If all cpu's finally responded after the timeout, its
+ * message 'replied_to' was set.
+ */
+ for (msg = bcp->queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
+ unsigned long msg_res;
+ /* do_reset: same conditions for cancellation as
+ bau_process_retry_msg() */
+ if ((msg->replied_to == 0) &&
+ (msg->canceled == 0) &&
+ (msg->sending_cpu == rap->sender) &&
+ (msg->swack_vec) &&
+ (msg->msg_type != MSG_NOOP)) {
+ unsigned long mmr;
+ unsigned long mr;
+ /*
+ * make everyone else ignore this message
+ */
+ msg->canceled = 1;
+ /*
+ * only reset the resource if it is still pending
+ */
+ mmr = ops.read_l_sw_ack();
+ msg_res = msg->swack_vec;
+ mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
+ if (mmr & msg_res) {
+ stat->d_rcanceled++;
+ ops.write_l_sw_ack(mr);
+ }
+ }
+ }
+ return;
+}
+
+/*
+ * Use IPI to get all target uvhubs to release resources held by
+ * a given sending cpu number.
+ */
+static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
+{
+ int pnode;
+ int apnode;
+ int maskbits;
+ int sender = bcp->cpu;
+ cpumask_t *mask = bcp->uvhub_master->cpumask;
+ struct bau_control *smaster = bcp->socket_master;
+ struct reset_args reset_args;
+
+ reset_args.sender = sender;
+ cpumask_clear(mask);
+ /* find a single cpu for each uvhub in this distribution mask */
+ maskbits = sizeof(struct pnmask) * BITSPERBYTE;
+ /* each bit is a pnode relative to the partition base pnode */
+ for (pnode = 0; pnode < maskbits; pnode++) {
+ int cpu;
+ if (!bau_uvhub_isset(pnode, distribution))
+ continue;
+ apnode = pnode + bcp->partition_base_pnode;
+ cpu = pnode_to_first_cpu(apnode, smaster);
+ cpumask_set_cpu(cpu, mask);
+ }
+
+ /* IPI all cpus; preemption is already disabled */
+ smp_call_function_many(mask, do_reset, (void *)&reset_args, 1);
+ return;
+}
+
+/*
+ * Not to be confused with cycles_2_ns() from tsc.c; this gives a relative
+ * number, not an absolute. It converts a duration in cycles to a duration in
+ * ns.
+ */
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+ struct cyc2ns_data data;
+ unsigned long long ns;
+
+ cyc2ns_read_begin(&data);
+ ns = mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
+ cyc2ns_read_end();
+
+ return ns;
+}
+
+/*
+ * The reverse of the above; converts a duration in ns to a duration in cycles.
+ */
+static inline unsigned long long ns_2_cycles(unsigned long long ns)
+{
+ struct cyc2ns_data data;
+ unsigned long long cyc;
+
+ cyc2ns_read_begin(&data);
+ cyc = (ns << data.cyc2ns_shift) / data.cyc2ns_mul;
+ cyc2ns_read_end();
+
+ return cyc;
+}
+
+static inline unsigned long cycles_2_us(unsigned long long cyc)
+{
+ return cycles_2_ns(cyc) / NSEC_PER_USEC;
+}
+
+static inline cycles_t sec_2_cycles(unsigned long sec)
+{
+ return ns_2_cycles(sec * NSEC_PER_SEC);
+}
+
+static inline unsigned long long usec_2_cycles(unsigned long usec)
+{
+ return ns_2_cycles(usec * NSEC_PER_USEC);
+}
+
+/*
+ * wait for all cpus on this hub to finish their sends and go quiet
+ * leaves uvhub_quiesce set so that no new broadcasts are started by
+ * bau_flush_send_and_wait()
+ */
+static inline void quiesce_local_uvhub(struct bau_control *hmaster)
+{
+ atom_asr(1, (struct atomic_short *)&hmaster->uvhub_quiesce);
+}
+
+/*
+ * mark this quiet-requestor as done
+ */
+static inline void end_uvhub_quiesce(struct bau_control *hmaster)
+{
+ atom_asr(-1, (struct atomic_short *)&hmaster->uvhub_quiesce);
+}
+
+static unsigned long uv1_read_status(unsigned long mmr_offset, int right_shift)
+{
+ unsigned long descriptor_status;
+
+ descriptor_status = uv_read_local_mmr(mmr_offset);
+ descriptor_status >>= right_shift;
+ descriptor_status &= UV_ACT_STATUS_MASK;
+ return descriptor_status;
+}
+
+/*
+ * Wait for completion of a broadcast software ack message
+ * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
+ */
+static int uv1_wait_completion(struct bau_desc *bau_desc,
+ struct bau_control *bcp, long try)
+{
+ unsigned long descriptor_status;
+ cycles_t ttm;
+ u64 mmr_offset = bcp->status_mmr;
+ int right_shift = bcp->status_index;
+ struct ptc_stats *stat = bcp->statp;
+
+ descriptor_status = uv1_read_status(mmr_offset, right_shift);
+ /* spin on the status MMR, waiting for it to go idle */
+ while ((descriptor_status != DS_IDLE)) {
+ /*
+ * Our software ack messages may be blocked because
+ * there are no swack resources available. As long
+ * as none of them has timed out hardware will NACK
+ * our message and its state will stay IDLE.
+ */
+ if (descriptor_status == DS_SOURCE_TIMEOUT) {
+ stat->s_stimeout++;
+ return FLUSH_GIVEUP;
+ } else if (descriptor_status == DS_DESTINATION_TIMEOUT) {
+ stat->s_dtimeout++;
+ ttm = get_cycles();
+
+ /*
+ * Our retries may be blocked by all destination
+ * swack resources being consumed, and a timeout
+ * pending. In that case hardware returns the
+ * ERROR that looks like a destination timeout.
+ */
+ if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
+ bcp->conseccompletes = 0;
+ return FLUSH_RETRY_PLUGGED;
+ }
+
+ bcp->conseccompletes = 0;
+ return FLUSH_RETRY_TIMEOUT;
+ } else {
+ /*
+ * descriptor_status is still BUSY
+ */
+ cpu_relax();
+ }
+ descriptor_status = uv1_read_status(mmr_offset, right_shift);
+ }
+ bcp->conseccompletes++;
+ return FLUSH_COMPLETE;
+}
+
+/*
+ * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register.
+ * But not currently used.
+ */
+static unsigned long uv2_3_read_status(unsigned long offset, int rshft, int desc)
+{
+ return ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1;
+}
+
+/*
+ * Entered when a bau descriptor has gone into a permanent busy wait because
+ * of a hardware bug.
+ * Workaround the bug.
+ */
+static int handle_uv2_busy(struct bau_control *bcp)
+{
+ struct ptc_stats *stat = bcp->statp;
+
+ stat->s_uv2_wars++;
+ bcp->busy = 1;
+ return FLUSH_GIVEUP;
+}
+
+static int uv2_3_wait_completion(struct bau_desc *bau_desc,
+ struct bau_control *bcp, long try)
+{
+ unsigned long descriptor_stat;
+ cycles_t ttm;
+ u64 mmr_offset = bcp->status_mmr;
+ int right_shift = bcp->status_index;
+ int desc = bcp->uvhub_cpu;
+ long busy_reps = 0;
+ struct ptc_stats *stat = bcp->statp;
+
+ descriptor_stat = uv2_3_read_status(mmr_offset, right_shift, desc);
+
+ /* spin on the status MMR, waiting for it to go idle */
+ while (descriptor_stat != UV2H_DESC_IDLE) {
+ if (descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) {
+ /*
+ * A h/w bug on the destination side may
+ * have prevented the message being marked
+ * pending, thus it doesn't get replied to
+ * and gets continually nacked until it times
+ * out with a SOURCE_TIMEOUT.
+ */
+ stat->s_stimeout++;
+ return FLUSH_GIVEUP;
+ } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
+ ttm = get_cycles();
+
+ /*
+ * Our retries may be blocked by all destination
+ * swack resources being consumed, and a timeout
+ * pending. In that case hardware returns the
+ * ERROR that looks like a destination timeout.
+ * Without using the extended status we have to
+ * deduce from the short time that this was a
+ * strong nack.
+ */
+ if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
+ bcp->conseccompletes = 0;
+ stat->s_plugged++;
+ /* FLUSH_RETRY_PLUGGED causes hang on boot */
+ return FLUSH_GIVEUP;
+ }
+ stat->s_dtimeout++;
+ bcp->conseccompletes = 0;
+ /* FLUSH_RETRY_TIMEOUT causes hang on boot */
+ return FLUSH_GIVEUP;
+ } else {
+ busy_reps++;
+ if (busy_reps > 1000000) {
+ /* not to hammer on the clock */
+ busy_reps = 0;
+ ttm = get_cycles();
+ if ((ttm - bcp->send_message) > bcp->timeout_interval)
+ return handle_uv2_busy(bcp);
+ }
+ /*
+ * descriptor_stat is still BUSY
+ */
+ cpu_relax();
+ }
+ descriptor_stat = uv2_3_read_status(mmr_offset, right_shift, desc);
+ }
+ bcp->conseccompletes++;
+ return FLUSH_COMPLETE;
+}
+
+/*
+ * Returns the status of current BAU message for cpu desc as a bit field
+ * [Error][Busy][Aux]
+ */
+static u64 read_status(u64 status_mmr, int index, int desc)
+{
+ u64 stat;
+
+ stat = ((read_lmmr(status_mmr) >> index) & UV_ACT_STATUS_MASK) << 1;
+ stat |= (read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_2) >> desc) & 0x1;
+
+ return stat;
+}
+
+static int uv4_wait_completion(struct bau_desc *bau_desc,
+ struct bau_control *bcp, long try)
+{
+ struct ptc_stats *stat = bcp->statp;
+ u64 descriptor_stat;
+ u64 mmr = bcp->status_mmr;
+ int index = bcp->status_index;
+ int desc = bcp->uvhub_cpu;
+
+ descriptor_stat = read_status(mmr, index, desc);
+
+ /* spin on the status MMR, waiting for it to go idle */
+ while (descriptor_stat != UV2H_DESC_IDLE) {
+ switch (descriptor_stat) {
+ case UV2H_DESC_SOURCE_TIMEOUT:
+ stat->s_stimeout++;
+ return FLUSH_GIVEUP;
+
+ case UV2H_DESC_DEST_TIMEOUT:
+ stat->s_dtimeout++;
+ bcp->conseccompletes = 0;
+ return FLUSH_RETRY_TIMEOUT;
+
+ case UV2H_DESC_DEST_STRONG_NACK:
+ stat->s_plugged++;
+ bcp->conseccompletes = 0;
+ return FLUSH_RETRY_PLUGGED;
+
+ case UV2H_DESC_DEST_PUT_ERR:
+ bcp->conseccompletes = 0;
+ return FLUSH_GIVEUP;
+
+ default:
+ /* descriptor_stat is still BUSY */
+ cpu_relax();
+ }
+ descriptor_stat = read_status(mmr, index, desc);
+ }
+ bcp->conseccompletes++;
+ return FLUSH_COMPLETE;
+}
+
+/*
+ * Our retries are blocked by all destination sw ack resources being
+ * in use, and a timeout is pending. In that case hardware immediately
+ * returns the ERROR that looks like a destination timeout.
+ */
+static void destination_plugged(struct bau_desc *bau_desc,
+ struct bau_control *bcp,
+ struct bau_control *hmaster, struct ptc_stats *stat)
+{
+ udelay(bcp->plugged_delay);
+ bcp->plugged_tries++;
+
+ if (bcp->plugged_tries >= bcp->plugsb4reset) {
+ bcp->plugged_tries = 0;
+
+ quiesce_local_uvhub(hmaster);
+
+ spin_lock(&hmaster->queue_lock);
+ reset_with_ipi(&bau_desc->distribution, bcp);
+ spin_unlock(&hmaster->queue_lock);
+
+ end_uvhub_quiesce(hmaster);
+
+ bcp->ipi_attempts++;
+ stat->s_resets_plug++;
+ }
+}
+
+static void destination_timeout(struct bau_desc *bau_desc,
+ struct bau_control *bcp, struct bau_control *hmaster,
+ struct ptc_stats *stat)
+{
+ hmaster->max_concurr = 1;
+ bcp->timeout_tries++;
+ if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
+ bcp->timeout_tries = 0;
+
+ quiesce_local_uvhub(hmaster);
+
+ spin_lock(&hmaster->queue_lock);
+ reset_with_ipi(&bau_desc->distribution, bcp);
+ spin_unlock(&hmaster->queue_lock);
+
+ end_uvhub_quiesce(hmaster);
+
+ bcp->ipi_attempts++;
+ stat->s_resets_timeout++;
+ }
+}
+
+/*
+ * Stop all cpus on a uvhub from using the BAU for a period of time.
+ * This is reversed by check_enable.
+ */
+static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
+{
+ int tcpu;
+ struct bau_control *tbcp;
+ struct bau_control *hmaster;
+ cycles_t tm1;
+
+ hmaster = bcp->uvhub_master;
+ spin_lock(&hmaster->disable_lock);
+ if (!bcp->baudisabled) {
+ stat->s_bau_disabled++;
+ tm1 = get_cycles();
+ for_each_present_cpu(tcpu) {
+ tbcp = &per_cpu(bau_control, tcpu);
+ if (tbcp->uvhub_master == hmaster) {
+ tbcp->baudisabled = 1;
+ tbcp->set_bau_on_time =
+ tm1 + bcp->disabled_period;
+ }
+ }
+ }
+ spin_unlock(&hmaster->disable_lock);
+}
+
+static void count_max_concurr(int stat, struct bau_control *bcp,
+ struct bau_control *hmaster)
+{
+ bcp->plugged_tries = 0;
+ bcp->timeout_tries = 0;
+ if (stat != FLUSH_COMPLETE)
+ return;
+ if (bcp->conseccompletes <= bcp->complete_threshold)
+ return;
+ if (hmaster->max_concurr >= hmaster->max_concurr_const)
+ return;
+ hmaster->max_concurr++;
+}
+
+static void record_send_stats(cycles_t time1, cycles_t time2,
+ struct bau_control *bcp, struct ptc_stats *stat,
+ int completion_status, int try)
+{
+ cycles_t elapsed;
+
+ if (time2 > time1) {
+ elapsed = time2 - time1;
+ stat->s_time += elapsed;
+
+ if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
+ bcp->period_requests++;
+ bcp->period_time += elapsed;
+ if ((elapsed > usec_2_cycles(bcp->cong_response_us)) &&
+ (bcp->period_requests > bcp->cong_reps) &&
+ ((bcp->period_time / bcp->period_requests) >
+ usec_2_cycles(bcp->cong_response_us))) {
+ stat->s_congested++;
+ disable_for_period(bcp, stat);
+ }
+ }
+ } else
+ stat->s_requestor--;
+
+ if (completion_status == FLUSH_COMPLETE && try > 1)
+ stat->s_retriesok++;
+ else if (completion_status == FLUSH_GIVEUP) {
+ stat->s_giveup++;
+ if (get_cycles() > bcp->period_end)
+ bcp->period_giveups = 0;
+ bcp->period_giveups++;
+ if (bcp->period_giveups == 1)
+ bcp->period_end = get_cycles() + bcp->disabled_period;
+ if (bcp->period_giveups > bcp->giveup_limit) {
+ disable_for_period(bcp, stat);
+ stat->s_giveuplimit++;
+ }
+ }
+}
+
+/*
+ * Because of a uv1 hardware bug only a limited number of concurrent
+ * requests can be made.
+ */
+static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
+{
+ spinlock_t *lock = &hmaster->uvhub_lock;
+ atomic_t *v;
+
+ v = &hmaster->active_descriptor_count;
+ if (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)) {
+ stat->s_throttles++;
+ do {
+ cpu_relax();
+ } while (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr));
+ }
+}
+
+/*
+ * Handle the completion status of a message send.
+ */
+static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,
+ struct bau_control *bcp, struct bau_control *hmaster,
+ struct ptc_stats *stat)
+{
+ if (completion_status == FLUSH_RETRY_PLUGGED)
+ destination_plugged(bau_desc, bcp, hmaster, stat);
+ else if (completion_status == FLUSH_RETRY_TIMEOUT)
+ destination_timeout(bau_desc, bcp, hmaster, stat);
+}
+
+/*
+ * Send a broadcast and wait for it to complete.
+ *
+ * The flush_mask contains the cpus the broadcast is to be sent to including
+ * cpus that are on the local uvhub.
+ *
+ * Returns 0 if all flushing represented in the mask was done.
+ * Returns 1 if it gives up entirely and the original cpu mask is to be
+ * returned to the kernel.
+ */
+static int uv_flush_send_and_wait(struct cpumask *flush_mask,
+ struct bau_control *bcp,
+ struct bau_desc *bau_desc)
+{
+ int seq_number = 0;
+ int completion_stat = 0;
+ int uv1 = 0;
+ long try = 0;
+ unsigned long index;
+ cycles_t time1;
+ cycles_t time2;
+ struct ptc_stats *stat = bcp->statp;
+ struct bau_control *hmaster = bcp->uvhub_master;
+ struct uv1_bau_msg_header *uv1_hdr = NULL;
+ struct uv2_3_bau_msg_header *uv2_3_hdr = NULL;
+
+ if (bcp->uvhub_version == UV_BAU_V1) {
+ uv1 = 1;
+ uv1_throttle(hmaster, stat);
+ }
+
+ while (hmaster->uvhub_quiesce)
+ cpu_relax();
+
+ time1 = get_cycles();
+ if (uv1)
+ uv1_hdr = &bau_desc->header.uv1_hdr;
+ else
+ /* uv2 and uv3 */
+ uv2_3_hdr = &bau_desc->header.uv2_3_hdr;
+
+ do {
+ if (try == 0) {
+ if (uv1)
+ uv1_hdr->msg_type = MSG_REGULAR;
+ else
+ uv2_3_hdr->msg_type = MSG_REGULAR;
+ seq_number = bcp->message_number++;
+ } else {
+ if (uv1)
+ uv1_hdr->msg_type = MSG_RETRY;
+ else
+ uv2_3_hdr->msg_type = MSG_RETRY;
+ stat->s_retry_messages++;
+ }
+
+ if (uv1)
+ uv1_hdr->sequence = seq_number;
+ else
+ uv2_3_hdr->sequence = seq_number;
+ index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
+ bcp->send_message = get_cycles();
+
+ write_mmr_activation(index);
+
+ try++;
+ completion_stat = ops.wait_completion(bau_desc, bcp, try);
+
+ handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
+
+ if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
+ bcp->ipi_attempts = 0;
+ stat->s_overipilimit++;
+ completion_stat = FLUSH_GIVEUP;
+ break;
+ }
+ cpu_relax();
+ } while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
+ (completion_stat == FLUSH_RETRY_TIMEOUT));
+
+ time2 = get_cycles();
+
+ count_max_concurr(completion_stat, bcp, hmaster);
+
+ while (hmaster->uvhub_quiesce)
+ cpu_relax();
+
+ atomic_dec(&hmaster->active_descriptor_count);
+
+ record_send_stats(time1, time2, bcp, stat, completion_stat, try);
+
+ if (completion_stat == FLUSH_GIVEUP)
+ /* FLUSH_GIVEUP will fall back to using IPI's for tlb flush */
+ return 1;
+ return 0;
+}
+
+/*
+ * The BAU is disabled for this uvhub. When the disabled time period has
+ * expired re-enable it.
+ * Return 0 if it is re-enabled for all cpus on this uvhub.
+ */
+static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
+{
+ int tcpu;
+ struct bau_control *tbcp;
+ struct bau_control *hmaster;
+
+ hmaster = bcp->uvhub_master;
+ spin_lock(&hmaster->disable_lock);
+ if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
+ stat->s_bau_reenabled++;
+ for_each_present_cpu(tcpu) {
+ tbcp = &per_cpu(bau_control, tcpu);
+ if (tbcp->uvhub_master == hmaster) {
+ tbcp->baudisabled = 0;
+ tbcp->period_requests = 0;
+ tbcp->period_time = 0;
+ tbcp->period_giveups = 0;
+ }
+ }
+ spin_unlock(&hmaster->disable_lock);
+ return 0;
+ }
+ spin_unlock(&hmaster->disable_lock);
+ return -1;
+}
+
+static void record_send_statistics(struct ptc_stats *stat, int locals, int hubs,
+ int remotes, struct bau_desc *bau_desc)
+{
+ stat->s_requestor++;
+ stat->s_ntargcpu += remotes + locals;
+ stat->s_ntargremotes += remotes;
+ stat->s_ntarglocals += locals;
+
+ /* uvhub statistics */
+ hubs = bau_uvhub_weight(&bau_desc->distribution);
+ if (locals) {
+ stat->s_ntarglocaluvhub++;
+ stat->s_ntargremoteuvhub += (hubs - 1);
+ } else
+ stat->s_ntargremoteuvhub += hubs;
+
+ stat->s_ntarguvhub += hubs;
+
+ if (hubs >= 16)
+ stat->s_ntarguvhub16++;
+ else if (hubs >= 8)
+ stat->s_ntarguvhub8++;
+ else if (hubs >= 4)
+ stat->s_ntarguvhub4++;
+ else if (hubs >= 2)
+ stat->s_ntarguvhub2++;
+ else
+ stat->s_ntarguvhub1++;
+}
+
+/*
+ * Translate a cpu mask to the uvhub distribution mask in the BAU
+ * activation descriptor.
+ */
+static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
+ struct bau_desc *bau_desc, int *localsp, int *remotesp)
+{
+ int cpu;
+ int pnode;
+ int cnt = 0;
+ struct hub_and_pnode *hpp;
+
+ for_each_cpu(cpu, flush_mask) {
+ /*
+ * The distribution vector is a bit map of pnodes, relative
+ * to the partition base pnode (and the partition base nasid
+ * in the header).
+ * Translate cpu to pnode and hub using a local memory array.
+ */
+ hpp = &bcp->socket_master->thp[cpu];
+ pnode = hpp->pnode - bcp->partition_base_pnode;
+ bau_uvhub_set(pnode, &bau_desc->distribution);
+ cnt++;
+ if (hpp->uvhub == bcp->uvhub)
+ (*localsp)++;
+ else
+ (*remotesp)++;
+ }
+ if (!cnt)
+ return 1;
+ return 0;
+}
+
+/*
+ * globally purge translation cache of a virtual address or all TLB's
+ * @cpumask: mask of all cpu's in which the address is to be removed
+ * @mm: mm_struct containing virtual address range
+ * @start: start virtual address to be removed from TLB
+ * @end: end virtual address to be remove from TLB
+ * @cpu: the current cpu
+ *
+ * This is the entry point for initiating any UV global TLB shootdown.
+ *
+ * Purges the translation caches of all specified processors of the given
+ * virtual address, or purges all TLB's on specified processors.
+ *
+ * The caller has derived the cpumask from the mm_struct. This function
+ * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
+ *
+ * The cpumask is converted into a uvhubmask of the uvhubs containing
+ * those cpus.
+ *
+ * Note that this function should be called with preemption disabled.
+ *
+ * Returns NULL if all remote flushing was done.
+ * Returns pointer to cpumask if some remote flushing remains to be
+ * done. The returned pointer is valid till preemption is re-enabled.
+ */
+const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
+{
+ unsigned int cpu = smp_processor_id();
+ int locals = 0, remotes = 0, hubs = 0;
+ struct bau_desc *bau_desc;
+ struct cpumask *flush_mask;
+ struct ptc_stats *stat;
+ struct bau_control *bcp;
+ unsigned long descriptor_status, status, address;
+
+ bcp = &per_cpu(bau_control, cpu);
+
+ if (bcp->nobau)
+ return cpumask;
+
+ stat = bcp->statp;
+ stat->s_enters++;
+
+ if (bcp->busy) {
+ descriptor_status =
+ read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_0);
+ status = ((descriptor_status >> (bcp->uvhub_cpu *
+ UV_ACT_STATUS_SIZE)) & UV_ACT_STATUS_MASK) << 1;
+ if (status == UV2H_DESC_BUSY)
+ return cpumask;
+ bcp->busy = 0;
+ }
+
+ /* bau was disabled due to slow response */
+ if (bcp->baudisabled) {
+ if (check_enable(bcp, stat)) {
+ stat->s_ipifordisabled++;
+ return cpumask;
+ }
+ }
+
+ /*
+ * Each sending cpu has a per-cpu mask which it fills from the caller's
+ * cpu mask. All cpus are converted to uvhubs and copied to the
+ * activation descriptor.
+ */
+ flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
+ /* don't actually do a shootdown of the local cpu */
+ cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+
+ if (cpumask_test_cpu(cpu, cpumask))
+ stat->s_ntargself++;
+
+ bau_desc = bcp->descriptor_base;
+ bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu);
+ bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
+ if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
+ return NULL;
+
+ record_send_statistics(stat, locals, hubs, remotes, bau_desc);
+
+ if (!info->end || (info->end - info->start) <= PAGE_SIZE)
+ address = info->start;
+ else
+ address = TLB_FLUSH_ALL;
+
+ switch (bcp->uvhub_version) {
+ case UV_BAU_V1:
+ case UV_BAU_V2:
+ case UV_BAU_V3:
+ bau_desc->payload.uv1_2_3.address = address;
+ bau_desc->payload.uv1_2_3.sending_cpu = cpu;
+ break;
+ case UV_BAU_V4:
+ bau_desc->payload.uv4.address = address;
+ bau_desc->payload.uv4.sending_cpu = cpu;
+ bau_desc->payload.uv4.qualifier = BAU_DESC_QUALIFIER;
+ break;
+ }
+
+ /*
+ * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
+ * or 1 if it gave up and the original cpumask should be returned.
+ */
+ if (!uv_flush_send_and_wait(flush_mask, bcp, bau_desc))
+ return NULL;
+ else
+ return cpumask;
+}
+
+/*
+ * Search the message queue for any 'other' unprocessed message with the
+ * same software acknowledge resource bit vector as the 'msg' message.
+ */
+static struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg,
+ struct bau_control *bcp)
+{
+ struct bau_pq_entry *msg_next = msg + 1;
+ unsigned char swack_vec = msg->swack_vec;
+
+ if (msg_next > bcp->queue_last)
+ msg_next = bcp->queue_first;
+ while (msg_next != msg) {
+ if ((msg_next->canceled == 0) && (msg_next->replied_to == 0) &&
+ (msg_next->swack_vec == swack_vec))
+ return msg_next;
+ msg_next++;
+ if (msg_next > bcp->queue_last)
+ msg_next = bcp->queue_first;
+ }
+ return NULL;
+}
+
+/*
+ * UV2 needs to work around a bug in which an arriving message has not
+ * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register.
+ * Such a message must be ignored.
+ */
+static void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
+{
+ unsigned long mmr_image;
+ unsigned char swack_vec;
+ struct bau_pq_entry *msg = mdp->msg;
+ struct bau_pq_entry *other_msg;
+
+ mmr_image = ops.read_l_sw_ack();
+ swack_vec = msg->swack_vec;
+
+ if ((swack_vec & mmr_image) == 0) {
+ /*
+ * This message was assigned a swack resource, but no
+ * reserved acknowlegment is pending.
+ * The bug has prevented this message from setting the MMR.
+ */
+ /*
+ * Some message has set the MMR 'pending' bit; it might have
+ * been another message. Look for that message.
+ */
+ other_msg = find_another_by_swack(msg, bcp);
+ if (other_msg) {
+ /*
+ * There is another. Process this one but do not
+ * ack it.
+ */
+ bau_process_message(mdp, bcp, 0);
+ /*
+ * Let the natural processing of that other message
+ * acknowledge it. Don't get the processing of sw_ack's
+ * out of order.
+ */
+ return;
+ }
+ }
+
+ /*
+ * Either the MMR shows this one pending a reply or there is no
+ * other message using this sw_ack, so it is safe to acknowledge it.
+ */
+ bau_process_message(mdp, bcp, 1);
+
+ return;
+}
+
+/*
+ * The BAU message interrupt comes here. (registered by set_intr_gate)
+ * See entry_64.S
+ *
+ * We received a broadcast assist message.
+ *
+ * Interrupts are disabled; this interrupt could represent
+ * the receipt of several messages.
+ *
+ * All cores/threads on this hub get this interrupt.
+ * The last one to see it does the software ack.
+ * (the resource will not be freed until noninterruptable cpus see this
+ * interrupt; hardware may timeout the s/w ack and reply ERROR)
+ */
+void uv_bau_message_interrupt(struct pt_regs *regs)
+{
+ int count = 0;
+ cycles_t time_start;
+ struct bau_pq_entry *msg;
+ struct bau_control *bcp;
+ struct ptc_stats *stat;
+ struct msg_desc msgdesc;
+
+ ack_APIC_irq();
+ kvm_set_cpu_l1tf_flush_l1d();
+ time_start = get_cycles();
+
+ bcp = &per_cpu(bau_control, smp_processor_id());
+ stat = bcp->statp;
+
+ msgdesc.queue_first = bcp->queue_first;
+ msgdesc.queue_last = bcp->queue_last;
+
+ msg = bcp->bau_msg_head;
+ while (msg->swack_vec) {
+ count++;
+
+ msgdesc.msg_slot = msg - msgdesc.queue_first;
+ msgdesc.msg = msg;
+ if (bcp->uvhub_version == UV_BAU_V2)
+ process_uv2_message(&msgdesc, bcp);
+ else
+ /* no error workaround for uv1 or uv3 */
+ bau_process_message(&msgdesc, bcp, 1);
+
+ msg++;
+ if (msg > msgdesc.queue_last)
+ msg = msgdesc.queue_first;
+ bcp->bau_msg_head = msg;
+ }
+ stat->d_time += (get_cycles() - time_start);
+ if (!count)
+ stat->d_nomsg++;
+ else if (count > 1)
+ stat->d_multmsg++;
+}
+
+/*
+ * Each target uvhub (i.e. a uvhub that has cpu's) needs to have
+ * shootdown message timeouts enabled. The timeout does not cause
+ * an interrupt, but causes an error message to be returned to
+ * the sender.
+ */
+static void __init enable_timeouts(void)
+{
+ int uvhub;
+ int nuvhubs;
+ int pnode;
+ unsigned long mmr_image;
+
+ nuvhubs = uv_num_possible_blades();
+
+ for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
+ if (!uv_blade_nr_possible_cpus(uvhub))
+ continue;
+
+ pnode = uv_blade_to_pnode(uvhub);
+ mmr_image = read_mmr_misc_control(pnode);
+ /*
+ * Set the timeout period and then lock it in, in three
+ * steps; captures and locks in the period.
+ *
+ * To program the period, the SOFT_ACK_MODE must be off.
+ */
+ mmr_image &= ~(1L << SOFTACK_MSHIFT);
+ write_mmr_misc_control(pnode, mmr_image);
+ /*
+ * Set the 4-bit period.
+ */
+ mmr_image &= ~((unsigned long)0xf << SOFTACK_PSHIFT);
+ mmr_image |= (SOFTACK_TIMEOUT_PERIOD << SOFTACK_PSHIFT);
+ write_mmr_misc_control(pnode, mmr_image);
+ /*
+ * UV1:
+ * Subsequent reversals of the timebase bit (3) cause an
+ * immediate timeout of one or all INTD resources as
+ * indicated in bits 2:0 (7 causes all of them to timeout).
+ */
+ mmr_image |= (1L << SOFTACK_MSHIFT);
+ if (is_uv2_hub()) {
+ /* do not touch the legacy mode bit */
+ /* hw bug workaround; do not use extended status */
+ mmr_image &= ~(1L << UV2_EXT_SHFT);
+ } else if (is_uv3_hub()) {
+ mmr_image &= ~(1L << PREFETCH_HINT_SHFT);
+ mmr_image |= (1L << SB_STATUS_SHFT);
+ }
+ write_mmr_misc_control(pnode, mmr_image);
+ }
+}
+
+static void *ptc_seq_start(struct seq_file *file, loff_t *offset)
+{
+ if (*offset < num_possible_cpus())
+ return offset;
+ return NULL;
+}
+
+static void *ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
+{
+ (*offset)++;
+ if (*offset < num_possible_cpus())
+ return offset;
+ return NULL;
+}
+
+static void ptc_seq_stop(struct seq_file *file, void *data)
+{
+}
+
+/*
+ * Display the statistics thru /proc/sgi_uv/ptc_statistics
+ * 'data' points to the cpu number
+ * Note: see the descriptions in stat_description[].
+ */
+static int ptc_seq_show(struct seq_file *file, void *data)
+{
+ struct ptc_stats *stat;
+ struct bau_control *bcp;
+ int cpu;
+
+ cpu = *(loff_t *)data;
+ if (!cpu) {
+ seq_puts(file,
+ "# cpu bauoff sent stime self locals remotes ncpus localhub ");
+ seq_puts(file, "remotehub numuvhubs numuvhubs16 numuvhubs8 ");
+ seq_puts(file,
+ "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries ");
+ seq_puts(file,
+ "rok resetp resett giveup sto bz throt disable ");
+ seq_puts(file,
+ "enable wars warshw warwaits enters ipidis plugged ");
+ seq_puts(file,
+ "ipiover glim cong swack recv rtime all one mult ");
+ seq_puts(file, "none retry canc nocan reset rcan\n");
+ }
+ if (cpu < num_possible_cpus() && cpu_online(cpu)) {
+ bcp = &per_cpu(bau_control, cpu);
+ if (bcp->nobau) {
+ seq_printf(file, "cpu %d bau disabled\n", cpu);
+ return 0;
+ }
+ stat = bcp->statp;
+ /* source side statistics */
+ seq_printf(file,
+ "cpu %d %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
+ cpu, bcp->nobau, stat->s_requestor,
+ cycles_2_us(stat->s_time),
+ stat->s_ntargself, stat->s_ntarglocals,
+ stat->s_ntargremotes, stat->s_ntargcpu,
+ stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
+ stat->s_ntarguvhub, stat->s_ntarguvhub16);
+ seq_printf(file, "%ld %ld %ld %ld %ld %ld ",
+ stat->s_ntarguvhub8, stat->s_ntarguvhub4,
+ stat->s_ntarguvhub2, stat->s_ntarguvhub1,
+ stat->s_dtimeout, stat->s_strongnacks);
+ seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
+ stat->s_retry_messages, stat->s_retriesok,
+ stat->s_resets_plug, stat->s_resets_timeout,
+ stat->s_giveup, stat->s_stimeout,
+ stat->s_busy, stat->s_throttles);
+ seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
+ stat->s_bau_disabled, stat->s_bau_reenabled,
+ stat->s_uv2_wars, stat->s_uv2_wars_hw,
+ stat->s_uv2_war_waits, stat->s_enters,
+ stat->s_ipifordisabled, stat->s_plugged,
+ stat->s_overipilimit, stat->s_giveuplimit,
+ stat->s_congested);
+
+ /* destination side statistics */
+ seq_printf(file,
+ "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
+ ops.read_g_sw_ack(uv_cpu_to_pnode(cpu)),
+ stat->d_requestee, cycles_2_us(stat->d_time),
+ stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
+ stat->d_nomsg, stat->d_retries, stat->d_canceled,
+ stat->d_nocanceled, stat->d_resets,
+ stat->d_rcanceled);
+ }
+ return 0;
+}
+
+/*
+ * Display the tunables thru debugfs
+ */
+static ssize_t tunables_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ char *buf;
+ int ret;
+
+ buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d %d\n",
+ "max_concur plugged_delay plugsb4reset timeoutsb4reset",
+ "ipi_reset_limit complete_threshold congested_response_us",
+ "congested_reps disabled_period giveup_limit",
+ max_concurr, plugged_delay, plugsb4reset,
+ timeoutsb4reset, ipi_reset_limit, complete_threshold,
+ congested_respns_us, congested_reps, disabled_period,
+ giveup_limit);
+
+ if (!buf)
+ return -ENOMEM;
+
+ ret = simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf));
+ kfree(buf);
+ return ret;
+}
+
+/*
+ * handle a write to /proc/sgi_uv/ptc_statistics
+ * -1: reset the statistics
+ * 0: display meaning of the statistics
+ */
+static ssize_t ptc_proc_write(struct file *file, const char __user *user,
+ size_t count, loff_t *data)
+{
+ int cpu;
+ int i;
+ int elements;
+ long input_arg;
+ char optstr[64];
+ struct ptc_stats *stat;
+
+ if (count == 0 || count > sizeof(optstr))
+ return -EINVAL;
+ if (copy_from_user(optstr, user, count))
+ return -EFAULT;
+ optstr[count - 1] = '\0';
+
+ if (!strcmp(optstr, "on")) {
+ set_bau_on();
+ return count;
+ } else if (!strcmp(optstr, "off")) {
+ set_bau_off();
+ return count;
+ }
+
+ if (kstrtol(optstr, 10, &input_arg) < 0) {
+ pr_debug("%s is invalid\n", optstr);
+ return -EINVAL;
+ }
+
+ if (input_arg == 0) {
+ elements = ARRAY_SIZE(stat_description);
+ pr_debug("# cpu: cpu number\n");
+ pr_debug("Sender statistics:\n");
+ for (i = 0; i < elements; i++)
+ pr_debug("%s\n", stat_description[i]);
+ } else if (input_arg == -1) {
+ for_each_present_cpu(cpu) {
+ stat = &per_cpu(ptcstats, cpu);
+ memset(stat, 0, sizeof(struct ptc_stats));
+ }
+ }
+
+ return count;
+}
+
+static int local_atoi(const char *name)
+{
+ int val = 0;
+
+ for (;; name++) {
+ switch (*name) {
+ case '0' ... '9':
+ val = 10*val+(*name-'0');
+ break;
+ default:
+ return val;
+ }
+ }
+}
+
+/*
+ * Parse the values written to /sys/kernel/debug/sgi_uv/bau_tunables.
+ * Zero values reset them to defaults.
+ */
+static int parse_tunables_write(struct bau_control *bcp, char *instr,
+ int count)
+{
+ char *p;
+ char *q;
+ int cnt = 0;
+ int val;
+ int e = ARRAY_SIZE(tunables);
+
+ p = instr + strspn(instr, WHITESPACE);
+ q = p;
+ for (; *p; p = q + strspn(q, WHITESPACE)) {
+ q = p + strcspn(p, WHITESPACE);
+ cnt++;
+ if (q == p)
+ break;
+ }
+ if (cnt != e) {
+ pr_info("bau tunable error: should be %d values\n", e);
+ return -EINVAL;
+ }
+
+ p = instr + strspn(instr, WHITESPACE);
+ q = p;
+ for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) {
+ q = p + strcspn(p, WHITESPACE);
+ val = local_atoi(p);
+ switch (cnt) {
+ case 0:
+ if (val == 0) {
+ max_concurr = MAX_BAU_CONCURRENT;
+ max_concurr_const = MAX_BAU_CONCURRENT;
+ continue;
+ }
+ if (val < 1 || val > bcp->cpus_in_uvhub) {
+ pr_debug(
+ "Error: BAU max concurrent %d is invalid\n",
+ val);
+ return -EINVAL;
+ }
+ max_concurr = val;
+ max_concurr_const = val;
+ continue;
+ default:
+ if (val == 0)
+ *tunables[cnt].tunp = tunables[cnt].deflt;
+ else
+ *tunables[cnt].tunp = val;
+ continue;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Handle a write to debugfs. (/sys/kernel/debug/sgi_uv/bau_tunables)
+ */
+static ssize_t tunables_write(struct file *file, const char __user *user,
+ size_t count, loff_t *data)
+{
+ int cpu;
+ int ret;
+ char instr[100];
+ struct bau_control *bcp;
+
+ if (count == 0 || count > sizeof(instr)-1)
+ return -EINVAL;
+ if (copy_from_user(instr, user, count))
+ return -EFAULT;
+
+ instr[count] = '\0';
+
+ cpu = get_cpu();
+ bcp = &per_cpu(bau_control, cpu);
+ ret = parse_tunables_write(bcp, instr, count);
+ put_cpu();
+ if (ret)
+ return ret;
+
+ for_each_present_cpu(cpu) {
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->max_concurr = max_concurr;
+ bcp->max_concurr_const = max_concurr;
+ bcp->plugged_delay = plugged_delay;
+ bcp->plugsb4reset = plugsb4reset;
+ bcp->timeoutsb4reset = timeoutsb4reset;
+ bcp->ipi_reset_limit = ipi_reset_limit;
+ bcp->complete_threshold = complete_threshold;
+ bcp->cong_response_us = congested_respns_us;
+ bcp->cong_reps = congested_reps;
+ bcp->disabled_period = sec_2_cycles(disabled_period);
+ bcp->giveup_limit = giveup_limit;
+ }
+ return count;
+}
+
+static const struct seq_operations uv_ptc_seq_ops = {
+ .start = ptc_seq_start,
+ .next = ptc_seq_next,
+ .stop = ptc_seq_stop,
+ .show = ptc_seq_show
+};
+
+static int ptc_proc_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &uv_ptc_seq_ops);
+}
+
+static int tunables_open(struct inode *inode, struct file *file)
+{
+ return 0;
+}
+
+static const struct file_operations proc_uv_ptc_operations = {
+ .open = ptc_proc_open,
+ .read = seq_read,
+ .write = ptc_proc_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations tunables_fops = {
+ .open = tunables_open,
+ .read = tunables_read,
+ .write = tunables_write,
+ .llseek = default_llseek,
+};
+
+static int __init uv_ptc_init(void)
+{
+ struct proc_dir_entry *proc_uv_ptc;
+
+ if (!is_uv_system())
+ return 0;
+
+ proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL,
+ &proc_uv_ptc_operations);
+ if (!proc_uv_ptc) {
+ pr_err("unable to create %s proc entry\n",
+ UV_PTC_BASENAME);
+ return -EINVAL;
+ }
+
+ tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
+ if (!tunables_dir) {
+ pr_err("unable to create debugfs directory %s\n",
+ UV_BAU_TUNABLES_DIR);
+ return -EINVAL;
+ }
+ tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
+ tunables_dir, NULL, &tunables_fops);
+ if (!tunables_file) {
+ pr_err("unable to create debugfs file %s\n",
+ UV_BAU_TUNABLES_FILE);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * Initialize the sending side's sending buffers.
+ */
+static void activation_descriptor_init(int node, int pnode, int base_pnode)
+{
+ int i;
+ int cpu;
+ int uv1 = 0;
+ unsigned long gpa;
+ unsigned long m;
+ unsigned long n;
+ size_t dsize;
+ struct bau_desc *bau_desc;
+ struct bau_desc *bd2;
+ struct uv1_bau_msg_header *uv1_hdr;
+ struct uv2_3_bau_msg_header *uv2_3_hdr;
+ struct bau_control *bcp;
+
+ /*
+ * each bau_desc is 64 bytes; there are 8 (ITEMS_PER_DESC)
+ * per cpu; and one per cpu on the uvhub (ADP_SZ)
+ */
+ dsize = sizeof(struct bau_desc) * ADP_SZ * ITEMS_PER_DESC;
+ bau_desc = kmalloc_node(dsize, GFP_KERNEL, node);
+ BUG_ON(!bau_desc);
+
+ gpa = uv_gpa(bau_desc);
+ n = uv_gpa_to_gnode(gpa);
+ m = ops.bau_gpa_to_offset(gpa);
+ if (is_uv1_hub())
+ uv1 = 1;
+
+ /* the 14-bit pnode */
+ write_mmr_descriptor_base(pnode,
+ (n << UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT | m));
+ /*
+ * Initializing all 8 (ITEMS_PER_DESC) descriptors for each
+ * cpu even though we only use the first one; one descriptor can
+ * describe a broadcast to 256 uv hubs.
+ */
+ for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) {
+ memset(bd2, 0, sizeof(struct bau_desc));
+ if (uv1) {
+ uv1_hdr = &bd2->header.uv1_hdr;
+ uv1_hdr->swack_flag = 1;
+ /*
+ * The base_dest_nasid set in the message header
+ * is the nasid of the first uvhub in the partition.
+ * The bit map will indicate destination pnode numbers
+ * relative to that base. They may not be consecutive
+ * if nasid striding is being used.
+ */
+ uv1_hdr->base_dest_nasid =
+ UV_PNODE_TO_NASID(base_pnode);
+ uv1_hdr->dest_subnodeid = UV_LB_SUBNODEID;
+ uv1_hdr->command = UV_NET_ENDPOINT_INTD;
+ uv1_hdr->int_both = 1;
+ /*
+ * all others need to be set to zero:
+ * fairness chaining multilevel count replied_to
+ */
+ } else {
+ /*
+ * BIOS uses legacy mode, but uv2 and uv3 hardware always
+ * uses native mode for selective broadcasts.
+ */
+ uv2_3_hdr = &bd2->header.uv2_3_hdr;
+ uv2_3_hdr->swack_flag = 1;
+ uv2_3_hdr->base_dest_nasid =
+ UV_PNODE_TO_NASID(base_pnode);
+ uv2_3_hdr->dest_subnodeid = UV_LB_SUBNODEID;
+ uv2_3_hdr->command = UV_NET_ENDPOINT_INTD;
+ }
+ }
+ for_each_present_cpu(cpu) {
+ if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
+ continue;
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->descriptor_base = bau_desc;
+ }
+}
+
+/*
+ * initialize the destination side's receiving buffers
+ * entered for each uvhub in the partition
+ * - node is first node (kernel memory notion) on the uvhub
+ * - pnode is the uvhub's physical identifier
+ */
+static void pq_init(int node, int pnode)
+{
+ int cpu;
+ size_t plsize;
+ char *cp;
+ void *vp;
+ unsigned long gnode, first, last, tail;
+ struct bau_pq_entry *pqp;
+ struct bau_control *bcp;
+
+ plsize = (DEST_Q_SIZE + 1) * sizeof(struct bau_pq_entry);
+ vp = kmalloc_node(plsize, GFP_KERNEL, node);
+ pqp = (struct bau_pq_entry *)vp;
+ BUG_ON(!pqp);
+
+ cp = (char *)pqp + 31;
+ pqp = (struct bau_pq_entry *)(((unsigned long)cp >> 5) << 5);
+
+ for_each_present_cpu(cpu) {
+ if (pnode != uv_cpu_to_pnode(cpu))
+ continue;
+ /* for every cpu on this pnode: */
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->queue_first = pqp;
+ bcp->bau_msg_head = pqp;
+ bcp->queue_last = pqp + (DEST_Q_SIZE - 1);
+ }
+
+ first = ops.bau_gpa_to_offset(uv_gpa(pqp));
+ last = ops.bau_gpa_to_offset(uv_gpa(pqp + (DEST_Q_SIZE - 1)));
+
+ /*
+ * Pre UV4, the gnode is required to locate the payload queue
+ * and the payload queue tail must be maintained by the kernel.
+ */
+ bcp = &per_cpu(bau_control, smp_processor_id());
+ if (bcp->uvhub_version <= UV_BAU_V3) {
+ tail = first;
+ gnode = uv_gpa_to_gnode(uv_gpa(pqp));
+ first = (gnode << UV_PAYLOADQ_GNODE_SHIFT) | tail;
+ write_mmr_payload_tail(pnode, tail);
+ }
+
+ ops.write_payload_first(pnode, first);
+ ops.write_payload_last(pnode, last);
+
+ /* in effect, all msg_type's are set to MSG_NOOP */
+ memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE);
+}
+
+/*
+ * Initialization of each UV hub's structures
+ */
+static void __init init_uvhub(int uvhub, int vector, int base_pnode)
+{
+ int node;
+ int pnode;
+ unsigned long apicid;
+
+ node = uvhub_to_first_node(uvhub);
+ pnode = uv_blade_to_pnode(uvhub);
+
+ activation_descriptor_init(node, pnode, base_pnode);
+
+ pq_init(node, pnode);
+ /*
+ * The below initialization can't be in firmware because the
+ * messaging IRQ will be determined by the OS.
+ */
+ apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits;
+ write_mmr_data_config(pnode, ((apicid << 32) | vector));
+}
+
+/*
+ * We will set BAU_MISC_CONTROL with a timeout period.
+ * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
+ * So the destination timeout period has to be calculated from them.
+ */
+static int calculate_destination_timeout(void)
+{
+ unsigned long mmr_image;
+ int mult1;
+ int mult2;
+ int index;
+ int base;
+ int ret;
+ unsigned long ts_ns;
+
+ if (is_uv1_hub()) {
+ mult1 = SOFTACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
+ mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
+ index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
+ mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
+ mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
+ ts_ns = timeout_base_ns[index];
+ ts_ns *= (mult1 * mult2);
+ ret = ts_ns / 1000;
+ } else {
+ /* same destination timeout for uv2 and uv3 */
+ /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */
+ mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL);
+ mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT;
+ if (mmr_image & (1L << UV2_ACK_UNITS_SHFT))
+ base = 80;
+ else
+ base = 10;
+ mult1 = mmr_image & UV2_ACK_MASK;
+ ret = mult1 * base;
+ }
+ return ret;
+}
+
+static void __init init_per_cpu_tunables(void)
+{
+ int cpu;
+ struct bau_control *bcp;
+
+ for_each_present_cpu(cpu) {
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->baudisabled = 0;
+ if (nobau)
+ bcp->nobau = true;
+ bcp->statp = &per_cpu(ptcstats, cpu);
+ /* time interval to catch a hardware stay-busy bug */
+ bcp->timeout_interval = usec_2_cycles(2*timeout_us);
+ bcp->max_concurr = max_concurr;
+ bcp->max_concurr_const = max_concurr;
+ bcp->plugged_delay = plugged_delay;
+ bcp->plugsb4reset = plugsb4reset;
+ bcp->timeoutsb4reset = timeoutsb4reset;
+ bcp->ipi_reset_limit = ipi_reset_limit;
+ bcp->complete_threshold = complete_threshold;
+ bcp->cong_response_us = congested_respns_us;
+ bcp->cong_reps = congested_reps;
+ bcp->disabled_period = sec_2_cycles(disabled_period);
+ bcp->giveup_limit = giveup_limit;
+ spin_lock_init(&bcp->queue_lock);
+ spin_lock_init(&bcp->uvhub_lock);
+ spin_lock_init(&bcp->disable_lock);
+ }
+}
+
+/*
+ * Scan all cpus to collect blade and socket summaries.
+ */
+static int __init get_cpu_topology(int base_pnode,
+ struct uvhub_desc *uvhub_descs,
+ unsigned char *uvhub_mask)
+{
+ int cpu;
+ int pnode;
+ int uvhub;
+ int socket;
+ struct bau_control *bcp;
+ struct uvhub_desc *bdp;
+ struct socket_desc *sdp;
+
+ for_each_present_cpu(cpu) {
+ bcp = &per_cpu(bau_control, cpu);
+
+ memset(bcp, 0, sizeof(struct bau_control));
+
+ pnode = uv_cpu_hub_info(cpu)->pnode;
+ if ((pnode - base_pnode) >= UV_DISTRIBUTION_SIZE) {
+ pr_emerg(
+ "cpu %d pnode %d-%d beyond %d; BAU disabled\n",
+ cpu, pnode, base_pnode, UV_DISTRIBUTION_SIZE);
+ return 1;
+ }
+
+ bcp->osnode = cpu_to_node(cpu);
+ bcp->partition_base_pnode = base_pnode;
+
+ uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
+ *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
+ bdp = &uvhub_descs[uvhub];
+
+ bdp->num_cpus++;
+ bdp->uvhub = uvhub;
+ bdp->pnode = pnode;
+
+ /* kludge: 'assuming' one node per socket, and assuming that
+ disabling a socket just leaves a gap in node numbers */
+ socket = bcp->osnode & 1;
+ bdp->socket_mask |= (1 << socket);
+ sdp = &bdp->socket[socket];
+ sdp->cpu_number[sdp->num_cpus] = cpu;
+ sdp->num_cpus++;
+ if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) {
+ pr_emerg("%d cpus per socket invalid\n",
+ sdp->num_cpus);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Each socket is to get a local array of pnodes/hubs.
+ */
+static void make_per_cpu_thp(struct bau_control *smaster)
+{
+ int cpu;
+ size_t hpsz = sizeof(struct hub_and_pnode) * num_possible_cpus();
+
+ smaster->thp = kmalloc_node(hpsz, GFP_KERNEL, smaster->osnode);
+ memset(smaster->thp, 0, hpsz);
+ for_each_present_cpu(cpu) {
+ smaster->thp[cpu].pnode = uv_cpu_hub_info(cpu)->pnode;
+ smaster->thp[cpu].uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
+ }
+}
+
+/*
+ * Each uvhub is to get a local cpumask.
+ */
+static void make_per_hub_cpumask(struct bau_control *hmaster)
+{
+ int sz = sizeof(cpumask_t);
+
+ hmaster->cpumask = kzalloc_node(sz, GFP_KERNEL, hmaster->osnode);
+}
+
+/*
+ * Initialize all the per_cpu information for the cpu's on a given socket,
+ * given what has been gathered into the socket_desc struct.
+ * And reports the chosen hub and socket masters back to the caller.
+ */
+static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
+ struct bau_control **smasterp,
+ struct bau_control **hmasterp)
+{
+ int i, cpu, uvhub_cpu;
+ struct bau_control *bcp;
+
+ for (i = 0; i < sdp->num_cpus; i++) {
+ cpu = sdp->cpu_number[i];
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->cpu = cpu;
+ if (i == 0) {
+ *smasterp = bcp;
+ if (!(*hmasterp))
+ *hmasterp = bcp;
+ }
+ bcp->cpus_in_uvhub = bdp->num_cpus;
+ bcp->cpus_in_socket = sdp->num_cpus;
+ bcp->socket_master = *smasterp;
+ bcp->uvhub = bdp->uvhub;
+ if (is_uv1_hub())
+ bcp->uvhub_version = UV_BAU_V1;
+ else if (is_uv2_hub())
+ bcp->uvhub_version = UV_BAU_V2;
+ else if (is_uv3_hub())
+ bcp->uvhub_version = UV_BAU_V3;
+ else if (is_uv4_hub())
+ bcp->uvhub_version = UV_BAU_V4;
+ else {
+ pr_emerg("uvhub version not 1, 2, 3, or 4\n");
+ return 1;
+ }
+ bcp->uvhub_master = *hmasterp;
+ uvhub_cpu = uv_cpu_blade_processor_id(cpu);
+ bcp->uvhub_cpu = uvhub_cpu;
+
+ /*
+ * The ERROR and BUSY status registers are located pairwise over
+ * the STATUS_0 and STATUS_1 mmrs; each an array[32] of 2 bits.
+ */
+ if (uvhub_cpu < UV_CPUS_PER_AS) {
+ bcp->status_mmr = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
+ bcp->status_index = uvhub_cpu * UV_ACT_STATUS_SIZE;
+ } else {
+ bcp->status_mmr = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
+ bcp->status_index = (uvhub_cpu - UV_CPUS_PER_AS)
+ * UV_ACT_STATUS_SIZE;
+ }
+
+ if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
+ pr_emerg("%d cpus per uvhub invalid\n",
+ bcp->uvhub_cpu);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Summarize the blade and socket topology into the per_cpu structures.
+ */
+static int __init summarize_uvhub_sockets(int nuvhubs,
+ struct uvhub_desc *uvhub_descs,
+ unsigned char *uvhub_mask)
+{
+ int socket;
+ int uvhub;
+ unsigned short socket_mask;
+
+ for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
+ struct uvhub_desc *bdp;
+ struct bau_control *smaster = NULL;
+ struct bau_control *hmaster = NULL;
+
+ if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
+ continue;
+
+ bdp = &uvhub_descs[uvhub];
+ socket_mask = bdp->socket_mask;
+ socket = 0;
+ while (socket_mask) {
+ struct socket_desc *sdp;
+ if ((socket_mask & 1)) {
+ sdp = &bdp->socket[socket];
+ if (scan_sock(sdp, bdp, &smaster, &hmaster))
+ return 1;
+ make_per_cpu_thp(smaster);
+ }
+ socket++;
+ socket_mask = (socket_mask >> 1);
+ }
+ make_per_hub_cpumask(hmaster);
+ }
+ return 0;
+}
+
+/*
+ * initialize the bau_control structure for each cpu
+ */
+static int __init init_per_cpu(int nuvhubs, int base_part_pnode)
+{
+ unsigned char *uvhub_mask;
+ void *vp;
+ struct uvhub_desc *uvhub_descs;
+
+ if (is_uv3_hub() || is_uv2_hub() || is_uv1_hub())
+ timeout_us = calculate_destination_timeout();
+
+ vp = kmalloc_array(nuvhubs, sizeof(struct uvhub_desc), GFP_KERNEL);
+ uvhub_descs = (struct uvhub_desc *)vp;
+ memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
+ uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
+
+ if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask))
+ goto fail;
+
+ if (summarize_uvhub_sockets(nuvhubs, uvhub_descs, uvhub_mask))
+ goto fail;
+
+ kfree(uvhub_descs);
+ kfree(uvhub_mask);
+ init_per_cpu_tunables();
+ return 0;
+
+fail:
+ kfree(uvhub_descs);
+ kfree(uvhub_mask);
+ return 1;
+}
+
+static const struct bau_operations uv1_bau_ops __initconst = {
+ .bau_gpa_to_offset = uv_gpa_to_offset,
+ .read_l_sw_ack = read_mmr_sw_ack,
+ .read_g_sw_ack = read_gmmr_sw_ack,
+ .write_l_sw_ack = write_mmr_sw_ack,
+ .write_g_sw_ack = write_gmmr_sw_ack,
+ .write_payload_first = write_mmr_payload_first,
+ .write_payload_last = write_mmr_payload_last,
+ .wait_completion = uv1_wait_completion,
+};
+
+static const struct bau_operations uv2_3_bau_ops __initconst = {
+ .bau_gpa_to_offset = uv_gpa_to_offset,
+ .read_l_sw_ack = read_mmr_sw_ack,
+ .read_g_sw_ack = read_gmmr_sw_ack,
+ .write_l_sw_ack = write_mmr_sw_ack,
+ .write_g_sw_ack = write_gmmr_sw_ack,
+ .write_payload_first = write_mmr_payload_first,
+ .write_payload_last = write_mmr_payload_last,
+ .wait_completion = uv2_3_wait_completion,
+};
+
+static const struct bau_operations uv4_bau_ops __initconst = {
+ .bau_gpa_to_offset = uv_gpa_to_soc_phys_ram,
+ .read_l_sw_ack = read_mmr_proc_sw_ack,
+ .read_g_sw_ack = read_gmmr_proc_sw_ack,
+ .write_l_sw_ack = write_mmr_proc_sw_ack,
+ .write_g_sw_ack = write_gmmr_proc_sw_ack,
+ .write_payload_first = write_mmr_proc_payload_first,
+ .write_payload_last = write_mmr_proc_payload_last,
+ .wait_completion = uv4_wait_completion,
+};
+
+/*
+ * Initialization of BAU-related structures
+ */
+static int __init uv_bau_init(void)
+{
+ int uvhub;
+ int pnode;
+ int nuvhubs;
+ int cur_cpu;
+ int cpus;
+ int vector;
+ cpumask_var_t *mask;
+
+ if (!is_uv_system())
+ return 0;
+
+ if (is_uv4_hub())
+ ops = uv4_bau_ops;
+ else if (is_uv3_hub())
+ ops = uv2_3_bau_ops;
+ else if (is_uv2_hub())
+ ops = uv2_3_bau_ops;
+ else if (is_uv1_hub())
+ ops = uv1_bau_ops;
+
+ nuvhubs = uv_num_possible_blades();
+ if (nuvhubs < 2) {
+ pr_crit("UV: BAU disabled - insufficient hub count\n");
+ goto err_bau_disable;
+ }
+
+ for_each_possible_cpu(cur_cpu) {
+ mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
+ zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));
+ }
+
+ uv_base_pnode = 0x7fffffff;
+ for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
+ cpus = uv_blade_nr_possible_cpus(uvhub);
+ if (cpus && (uv_blade_to_pnode(uvhub) < uv_base_pnode))
+ uv_base_pnode = uv_blade_to_pnode(uvhub);
+ }
+
+ /* software timeouts are not supported on UV4 */
+ if (is_uv3_hub() || is_uv2_hub() || is_uv1_hub())
+ enable_timeouts();
+
+ if (init_per_cpu(nuvhubs, uv_base_pnode)) {
+ pr_crit("UV: BAU disabled - per CPU init failed\n");
+ goto err_bau_disable;
+ }
+
+ vector = UV_BAU_MESSAGE;
+ for_each_possible_blade(uvhub) {
+ if (uv_blade_nr_possible_cpus(uvhub))
+ init_uvhub(uvhub, vector, uv_base_pnode);
+ }
+
+ for_each_possible_blade(uvhub) {
+ if (uv_blade_nr_possible_cpus(uvhub)) {
+ unsigned long val;
+ unsigned long mmr;
+ pnode = uv_blade_to_pnode(uvhub);
+ /* INIT the bau */
+ val = 1L << 63;
+ write_gmmr_activation(pnode, val);
+ mmr = 1; /* should be 1 to broadcast to both sockets */
+ if (!is_uv1_hub())
+ write_mmr_data_broadcast(pnode, mmr);
+ }
+ }
+
+ return 0;
+
+err_bau_disable:
+
+ for_each_possible_cpu(cur_cpu)
+ free_cpumask_var(per_cpu(uv_flush_tlb_mask, cur_cpu));
+
+ set_bau_off();
+ nobau_perm = 1;
+
+ return -EINVAL;
+}
+core_initcall(uv_bau_init);
+fs_initcall(uv_ptc_init);
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
new file mode 100644
index 000000000..abb607539
--- /dev/null
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -0,0 +1,217 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV IRQ functions
+ *
+ * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
+ */
+
+#include <linux/export.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/irq.h>
+
+#include <asm/irqdomain.h>
+#include <asm/apic.h>
+#include <asm/uv/uv_irq.h>
+#include <asm/uv/uv_hub.h>
+
+/* MMR offset and pnode of hub sourcing interrupts for a given irq */
+struct uv_irq_2_mmr_pnode {
+ unsigned long offset;
+ int pnode;
+};
+
+static void uv_program_mmr(struct irq_cfg *cfg, struct uv_irq_2_mmr_pnode *info)
+{
+ unsigned long mmr_value;
+ struct uv_IO_APIC_route_entry *entry;
+
+ BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
+ sizeof(unsigned long));
+
+ mmr_value = 0;
+ entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
+ entry->vector = cfg->vector;
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
+ entry->polarity = 0;
+ entry->trigger = 0;
+ entry->mask = 0;
+ entry->dest = cfg->dest_apicid;
+
+ uv_write_global_mmr64(info->pnode, info->offset, mmr_value);
+}
+
+static void uv_noop(struct irq_data *data) { }
+
+static int
+uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
+{
+ struct irq_data *parent = data->parent_data;
+ struct irq_cfg *cfg = irqd_cfg(data);
+ int ret;
+
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret >= 0) {
+ uv_program_mmr(cfg, data->chip_data);
+ send_cleanup_vector(cfg);
+ }
+
+ return ret;
+}
+
+static struct irq_chip uv_irq_chip = {
+ .name = "UV-CORE",
+ .irq_mask = uv_noop,
+ .irq_unmask = uv_noop,
+ .irq_eoi = apic_ack_irq,
+ .irq_set_affinity = uv_set_irq_affinity,
+};
+
+static int uv_domain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct uv_irq_2_mmr_pnode *chip_data;
+ struct irq_alloc_info *info = arg;
+ struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
+ int ret;
+
+ if (nr_irqs > 1 || !info || info->type != X86_IRQ_ALLOC_TYPE_UV)
+ return -EINVAL;
+
+ chip_data = kmalloc_node(sizeof(*chip_data), GFP_KERNEL,
+ irq_data_get_node(irq_data));
+ if (!chip_data)
+ return -ENOMEM;
+
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+ if (ret >= 0) {
+ if (info->uv_limit == UV_AFFINITY_CPU)
+ irq_set_status_flags(virq, IRQ_NO_BALANCING);
+ else
+ irq_set_status_flags(virq, IRQ_MOVE_PCNTXT);
+
+ chip_data->pnode = uv_blade_to_pnode(info->uv_blade);
+ chip_data->offset = info->uv_offset;
+ irq_domain_set_info(domain, virq, virq, &uv_irq_chip, chip_data,
+ handle_percpu_irq, NULL, info->uv_name);
+ } else {
+ kfree(chip_data);
+ }
+
+ return ret;
+}
+
+static void uv_domain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
+
+ BUG_ON(nr_irqs != 1);
+ kfree(irq_data->chip_data);
+ irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT);
+ irq_clear_status_flags(virq, IRQ_NO_BALANCING);
+ irq_domain_free_irqs_top(domain, virq, nr_irqs);
+}
+
+/*
+ * Re-target the irq to the specified CPU and enable the specified MMR located
+ * on the specified blade to allow the sending of MSIs to the specified CPU.
+ */
+static int uv_domain_activate(struct irq_domain *domain,
+ struct irq_data *irq_data, bool reserve)
+{
+ uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data);
+ return 0;
+}
+
+/*
+ * Disable the specified MMR located on the specified blade so that MSIs are
+ * longer allowed to be sent.
+ */
+static void uv_domain_deactivate(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ unsigned long mmr_value;
+ struct uv_IO_APIC_route_entry *entry;
+
+ mmr_value = 0;
+ entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
+ entry->mask = 1;
+ uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data);
+}
+
+static const struct irq_domain_ops uv_domain_ops = {
+ .alloc = uv_domain_alloc,
+ .free = uv_domain_free,
+ .activate = uv_domain_activate,
+ .deactivate = uv_domain_deactivate,
+};
+
+static struct irq_domain *uv_get_irq_domain(void)
+{
+ static struct irq_domain *uv_domain;
+ static DEFINE_MUTEX(uv_lock);
+ struct fwnode_handle *fn;
+
+ mutex_lock(&uv_lock);
+ if (uv_domain)
+ goto out;
+
+ fn = irq_domain_alloc_named_fwnode("UV-CORE");
+ if (!fn)
+ goto out;
+
+ uv_domain = irq_domain_create_tree(fn, &uv_domain_ops, NULL);
+ if (uv_domain)
+ uv_domain->parent = x86_vector_domain;
+ else
+ irq_domain_free_fwnode(fn);
+out:
+ mutex_unlock(&uv_lock);
+
+ return uv_domain;
+}
+
+/*
+ * Set up a mapping of an available irq and vector, and enable the specified
+ * MMR that defines the MSI that is to be sent to the specified CPU when an
+ * interrupt is raised.
+ */
+int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
+ unsigned long mmr_offset, int limit)
+{
+ struct irq_alloc_info info;
+ struct irq_domain *domain = uv_get_irq_domain();
+
+ if (!domain)
+ return -ENOMEM;
+
+ init_irq_alloc_info(&info, cpumask_of(cpu));
+ info.type = X86_IRQ_ALLOC_TYPE_UV;
+ info.uv_limit = limit;
+ info.uv_blade = mmr_blade;
+ info.uv_offset = mmr_offset;
+ info.uv_name = irq_name;
+
+ return irq_domain_alloc_irqs(domain, 1,
+ uv_blade_to_memory_nid(mmr_blade), &info);
+}
+EXPORT_SYMBOL_GPL(uv_setup_irq);
+
+/*
+ * Tear down a mapping of an irq and vector, and disable the specified MMR that
+ * defined the MSI that was to be sent to the specified CPU when an interrupt
+ * was raised.
+ *
+ * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
+ */
+void uv_teardown_irq(unsigned int irq)
+{
+ irq_domain_free_irqs(irq, 1);
+}
+EXPORT_SYMBOL_GPL(uv_teardown_irq);
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
new file mode 100644
index 000000000..5f64f3087
--- /dev/null
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -0,0 +1,1066 @@
+/*
+ * SGI NMI support routines
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2009-2013 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) Mike Travis
+ */
+
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/kdb.h>
+#include <linux/kexec.h>
+#include <linux/kgdb.h>
+#include <linux/moduleparam.h>
+#include <linux/nmi.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/slab.h>
+#include <linux/clocksource.h>
+
+#include <asm/apic.h>
+#include <asm/current.h>
+#include <asm/kdebug.h>
+#include <asm/local64.h>
+#include <asm/nmi.h>
+#include <asm/traps.h>
+#include <asm/uv/uv.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/uv_mmrs.h>
+
+/*
+ * UV handler for NMI
+ *
+ * Handle system-wide NMI events generated by the global 'power nmi' command.
+ *
+ * Basic operation is to field the NMI interrupt on each CPU and wait
+ * until all CPU's have arrived into the nmi handler. If some CPU's do not
+ * make it into the handler, try and force them in with the IPI(NMI) signal.
+ *
+ * We also have to lessen UV Hub MMR accesses as much as possible as this
+ * disrupts the UV Hub's primary mission of directing NumaLink traffic and
+ * can cause system problems to occur.
+ *
+ * To do this we register our primary NMI notifier on the NMI_UNKNOWN
+ * chain. This reduces the number of false NMI calls when the perf
+ * tools are running which generate an enormous number of NMIs per
+ * second (~4M/s for 1024 CPU threads). Our secondary NMI handler is
+ * very short as it only checks that if it has been "pinged" with the
+ * IPI(NMI) signal as mentioned above, and does not read the UV Hub's MMR.
+ *
+ */
+
+static struct uv_hub_nmi_s **uv_hub_nmi_list;
+
+DEFINE_PER_CPU(struct uv_cpu_nmi_s, uv_cpu_nmi);
+
+/* UV hubless values */
+#define NMI_CONTROL_PORT 0x70
+#define NMI_DUMMY_PORT 0x71
+#define PAD_OWN_GPP_D_0 0x2c
+#define GPI_NMI_STS_GPP_D_0 0x164
+#define GPI_NMI_ENA_GPP_D_0 0x174
+#define STS_GPP_D_0_MASK 0x1
+#define PAD_CFG_DW0_GPP_D_0 0x4c0
+#define GPIROUTNMI (1ul << 17)
+#define PCH_PCR_GPIO_1_BASE 0xfdae0000ul
+#define PCH_PCR_GPIO_ADDRESS(offset) (int *)((u64)(pch_base) | (u64)(offset))
+
+static u64 *pch_base;
+static unsigned long nmi_mmr;
+static unsigned long nmi_mmr_clear;
+static unsigned long nmi_mmr_pending;
+
+static atomic_t uv_in_nmi;
+static atomic_t uv_nmi_cpu = ATOMIC_INIT(-1);
+static atomic_t uv_nmi_cpus_in_nmi = ATOMIC_INIT(-1);
+static atomic_t uv_nmi_slave_continue;
+static cpumask_var_t uv_nmi_cpu_mask;
+
+/* Values for uv_nmi_slave_continue */
+#define SLAVE_CLEAR 0
+#define SLAVE_CONTINUE 1
+#define SLAVE_EXIT 2
+
+/*
+ * Default is all stack dumps go to the console and buffer.
+ * Lower level to send to log buffer only.
+ */
+static int uv_nmi_loglevel = CONSOLE_LOGLEVEL_DEFAULT;
+module_param_named(dump_loglevel, uv_nmi_loglevel, int, 0644);
+
+/*
+ * The following values show statistics on how perf events are affecting
+ * this system.
+ */
+static int param_get_local64(char *buffer, const struct kernel_param *kp)
+{
+ return sprintf(buffer, "%lu\n", local64_read((local64_t *)kp->arg));
+}
+
+static int param_set_local64(const char *val, const struct kernel_param *kp)
+{
+ /* Clear on any write */
+ local64_set((local64_t *)kp->arg, 0);
+ return 0;
+}
+
+static const struct kernel_param_ops param_ops_local64 = {
+ .get = param_get_local64,
+ .set = param_set_local64,
+};
+#define param_check_local64(name, p) __param_check(name, p, local64_t)
+
+static local64_t uv_nmi_count;
+module_param_named(nmi_count, uv_nmi_count, local64, 0644);
+
+static local64_t uv_nmi_misses;
+module_param_named(nmi_misses, uv_nmi_misses, local64, 0644);
+
+static local64_t uv_nmi_ping_count;
+module_param_named(ping_count, uv_nmi_ping_count, local64, 0644);
+
+static local64_t uv_nmi_ping_misses;
+module_param_named(ping_misses, uv_nmi_ping_misses, local64, 0644);
+
+/*
+ * Following values allow tuning for large systems under heavy loading
+ */
+static int uv_nmi_initial_delay = 100;
+module_param_named(initial_delay, uv_nmi_initial_delay, int, 0644);
+
+static int uv_nmi_slave_delay = 100;
+module_param_named(slave_delay, uv_nmi_slave_delay, int, 0644);
+
+static int uv_nmi_loop_delay = 100;
+module_param_named(loop_delay, uv_nmi_loop_delay, int, 0644);
+
+static int uv_nmi_trigger_delay = 10000;
+module_param_named(trigger_delay, uv_nmi_trigger_delay, int, 0644);
+
+static int uv_nmi_wait_count = 100;
+module_param_named(wait_count, uv_nmi_wait_count, int, 0644);
+
+static int uv_nmi_retry_count = 500;
+module_param_named(retry_count, uv_nmi_retry_count, int, 0644);
+
+static bool uv_pch_intr_enable = true;
+static bool uv_pch_intr_now_enabled;
+module_param_named(pch_intr_enable, uv_pch_intr_enable, bool, 0644);
+
+static bool uv_pch_init_enable = true;
+module_param_named(pch_init_enable, uv_pch_init_enable, bool, 0644);
+
+static int uv_nmi_debug;
+module_param_named(debug, uv_nmi_debug, int, 0644);
+
+#define nmi_debug(fmt, ...) \
+ do { \
+ if (uv_nmi_debug) \
+ pr_info(fmt, ##__VA_ARGS__); \
+ } while (0)
+
+/* Valid NMI Actions */
+#define ACTION_LEN 16
+static struct nmi_action {
+ char *action;
+ char *desc;
+} valid_acts[] = {
+ { "kdump", "do kernel crash dump" },
+ { "dump", "dump process stack for each cpu" },
+ { "ips", "dump Inst Ptr info for each cpu" },
+ { "kdb", "enter KDB (needs kgdboc= assignment)" },
+ { "kgdb", "enter KGDB (needs gdb target remote)" },
+ { "health", "check if CPUs respond to NMI" },
+};
+typedef char action_t[ACTION_LEN];
+static action_t uv_nmi_action = { "dump" };
+
+static int param_get_action(char *buffer, const struct kernel_param *kp)
+{
+ return sprintf(buffer, "%s\n", uv_nmi_action);
+}
+
+static int param_set_action(const char *val, const struct kernel_param *kp)
+{
+ int i;
+ int n = ARRAY_SIZE(valid_acts);
+ char arg[ACTION_LEN], *p;
+
+ /* (remove possible '\n') */
+ strncpy(arg, val, ACTION_LEN - 1);
+ arg[ACTION_LEN - 1] = '\0';
+ p = strchr(arg, '\n');
+ if (p)
+ *p = '\0';
+
+ for (i = 0; i < n; i++)
+ if (!strcmp(arg, valid_acts[i].action))
+ break;
+
+ if (i < n) {
+ strcpy(uv_nmi_action, arg);
+ pr_info("UV: New NMI action:%s\n", uv_nmi_action);
+ return 0;
+ }
+
+ pr_err("UV: Invalid NMI action:%s, valid actions are:\n", arg);
+ for (i = 0; i < n; i++)
+ pr_err("UV: %-8s - %s\n",
+ valid_acts[i].action, valid_acts[i].desc);
+ return -EINVAL;
+}
+
+static const struct kernel_param_ops param_ops_action = {
+ .get = param_get_action,
+ .set = param_set_action,
+};
+#define param_check_action(name, p) __param_check(name, p, action_t)
+
+module_param_named(action, uv_nmi_action, action, 0644);
+
+static inline bool uv_nmi_action_is(const char *action)
+{
+ return (strncmp(uv_nmi_action, action, strlen(action)) == 0);
+}
+
+/* Setup which NMI support is present in system */
+static void uv_nmi_setup_mmrs(void)
+{
+ if (uv_read_local_mmr(UVH_NMI_MMRX_SUPPORTED)) {
+ uv_write_local_mmr(UVH_NMI_MMRX_REQ,
+ 1UL << UVH_NMI_MMRX_REQ_SHIFT);
+ nmi_mmr = UVH_NMI_MMRX;
+ nmi_mmr_clear = UVH_NMI_MMRX_CLEAR;
+ nmi_mmr_pending = 1UL << UVH_NMI_MMRX_SHIFT;
+ pr_info("UV: SMI NMI support: %s\n", UVH_NMI_MMRX_TYPE);
+ } else {
+ nmi_mmr = UVH_NMI_MMR;
+ nmi_mmr_clear = UVH_NMI_MMR_CLEAR;
+ nmi_mmr_pending = 1UL << UVH_NMI_MMR_SHIFT;
+ pr_info("UV: SMI NMI support: %s\n", UVH_NMI_MMR_TYPE);
+ }
+}
+
+/* Read NMI MMR and check if NMI flag was set by BMC. */
+static inline int uv_nmi_test_mmr(struct uv_hub_nmi_s *hub_nmi)
+{
+ hub_nmi->nmi_value = uv_read_local_mmr(nmi_mmr);
+ atomic_inc(&hub_nmi->read_mmr_count);
+ return !!(hub_nmi->nmi_value & nmi_mmr_pending);
+}
+
+static inline void uv_local_mmr_clear_nmi(void)
+{
+ uv_write_local_mmr(nmi_mmr_clear, nmi_mmr_pending);
+}
+
+/*
+ * UV hubless NMI handler functions
+ */
+static inline void uv_reassert_nmi(void)
+{
+ /* (from arch/x86/include/asm/mach_traps.h) */
+ outb(0x8f, NMI_CONTROL_PORT);
+ inb(NMI_DUMMY_PORT); /* dummy read */
+ outb(0x0f, NMI_CONTROL_PORT);
+ inb(NMI_DUMMY_PORT); /* dummy read */
+}
+
+static void uv_init_hubless_pch_io(int offset, int mask, int data)
+{
+ int *addr = PCH_PCR_GPIO_ADDRESS(offset);
+ int readd = readl(addr);
+
+ if (mask) { /* OR in new data */
+ int writed = (readd & ~mask) | data;
+
+ nmi_debug("UV:PCH: %p = %x & %x | %x (%x)\n",
+ addr, readd, ~mask, data, writed);
+ writel(writed, addr);
+ } else if (readd & data) { /* clear status bit */
+ nmi_debug("UV:PCH: %p = %x\n", addr, data);
+ writel(data, addr);
+ }
+
+ (void)readl(addr); /* flush write data */
+}
+
+static void uv_nmi_setup_hubless_intr(void)
+{
+ uv_pch_intr_now_enabled = uv_pch_intr_enable;
+
+ uv_init_hubless_pch_io(
+ PAD_CFG_DW0_GPP_D_0, GPIROUTNMI,
+ uv_pch_intr_now_enabled ? GPIROUTNMI : 0);
+
+ nmi_debug("UV:NMI: GPP_D_0 interrupt %s\n",
+ uv_pch_intr_now_enabled ? "enabled" : "disabled");
+}
+
+static struct init_nmi {
+ unsigned int offset;
+ unsigned int mask;
+ unsigned int data;
+} init_nmi[] = {
+ { /* HOSTSW_OWN_GPP_D_0 */
+ .offset = 0x84,
+ .mask = 0x1,
+ .data = 0x0, /* ACPI Mode */
+ },
+
+/* Clear status: */
+ { /* GPI_INT_STS_GPP_D_0 */
+ .offset = 0x104,
+ .mask = 0x0,
+ .data = 0x1, /* Clear Status */
+ },
+ { /* GPI_GPE_STS_GPP_D_0 */
+ .offset = 0x124,
+ .mask = 0x0,
+ .data = 0x1, /* Clear Status */
+ },
+ { /* GPI_SMI_STS_GPP_D_0 */
+ .offset = 0x144,
+ .mask = 0x0,
+ .data = 0x1, /* Clear Status */
+ },
+ { /* GPI_NMI_STS_GPP_D_0 */
+ .offset = 0x164,
+ .mask = 0x0,
+ .data = 0x1, /* Clear Status */
+ },
+
+/* Disable interrupts: */
+ { /* GPI_INT_EN_GPP_D_0 */
+ .offset = 0x114,
+ .mask = 0x1,
+ .data = 0x0, /* Disable interrupt generation */
+ },
+ { /* GPI_GPE_EN_GPP_D_0 */
+ .offset = 0x134,
+ .mask = 0x1,
+ .data = 0x0, /* Disable interrupt generation */
+ },
+ { /* GPI_SMI_EN_GPP_D_0 */
+ .offset = 0x154,
+ .mask = 0x1,
+ .data = 0x0, /* Disable interrupt generation */
+ },
+ { /* GPI_NMI_EN_GPP_D_0 */
+ .offset = 0x174,
+ .mask = 0x1,
+ .data = 0x0, /* Disable interrupt generation */
+ },
+
+/* Setup GPP_D_0 Pad Config: */
+ { /* PAD_CFG_DW0_GPP_D_0 */
+ .offset = 0x4c0,
+ .mask = 0xffffffff,
+ .data = 0x82020100,
+/*
+ * 31:30 Pad Reset Config (PADRSTCFG): = 2h # PLTRST# (default)
+ *
+ * 29 RX Pad State Select (RXPADSTSEL): = 0 # Raw RX pad state directly
+ * from RX buffer (default)
+ *
+ * 28 RX Raw Override to '1' (RXRAW1): = 0 # No Override
+ *
+ * 26:25 RX Level/Edge Configuration (RXEVCFG):
+ * = 0h # Level
+ * = 1h # Edge
+ *
+ * 23 RX Invert (RXINV): = 0 # No Inversion (signal active high)
+ *
+ * 20 GPIO Input Route IOxAPIC (GPIROUTIOXAPIC):
+ * = 0 # Routing does not cause peripheral IRQ...
+ * # (we want an NMI not an IRQ)
+ *
+ * 19 GPIO Input Route SCI (GPIROUTSCI): = 0 # Routing does not cause SCI.
+ * 18 GPIO Input Route SMI (GPIROUTSMI): = 0 # Routing does not cause SMI.
+ * 17 GPIO Input Route NMI (GPIROUTNMI): = 1 # Routing can cause NMI.
+ *
+ * 11:10 Pad Mode (PMODE1/0): = 0h = GPIO control the Pad.
+ * 9 GPIO RX Disable (GPIORXDIS):
+ * = 0 # Enable the input buffer (active low enable)
+ *
+ * 8 GPIO TX Disable (GPIOTXDIS):
+ * = 1 # Disable the output buffer; i.e. Hi-Z
+ *
+ * 1 GPIO RX State (GPIORXSTATE): This is the current internal RX pad state..
+ * 0 GPIO TX State (GPIOTXSTATE):
+ * = 0 # (Leave at default)
+ */
+ },
+
+/* Pad Config DW1 */
+ { /* PAD_CFG_DW1_GPP_D_0 */
+ .offset = 0x4c4,
+ .mask = 0x3c00,
+ .data = 0, /* Termination = none (default) */
+ },
+};
+
+static void uv_init_hubless_pch_d0(void)
+{
+ int i, read;
+
+ read = *PCH_PCR_GPIO_ADDRESS(PAD_OWN_GPP_D_0);
+ if (read != 0) {
+ pr_info("UV: Hubless NMI already configured\n");
+ return;
+ }
+
+ nmi_debug("UV: Initializing UV Hubless NMI on PCH\n");
+ for (i = 0; i < ARRAY_SIZE(init_nmi); i++) {
+ uv_init_hubless_pch_io(init_nmi[i].offset,
+ init_nmi[i].mask,
+ init_nmi[i].data);
+ }
+}
+
+static int uv_nmi_test_hubless(struct uv_hub_nmi_s *hub_nmi)
+{
+ int *pstat = PCH_PCR_GPIO_ADDRESS(GPI_NMI_STS_GPP_D_0);
+ int status = *pstat;
+
+ hub_nmi->nmi_value = status;
+ atomic_inc(&hub_nmi->read_mmr_count);
+
+ if (!(status & STS_GPP_D_0_MASK)) /* Not a UV external NMI */
+ return 0;
+
+ *pstat = STS_GPP_D_0_MASK; /* Is a UV NMI: clear GPP_D_0 status */
+ (void)*pstat; /* Flush write */
+
+ return 1;
+}
+
+static int uv_test_nmi(struct uv_hub_nmi_s *hub_nmi)
+{
+ if (hub_nmi->hub_present)
+ return uv_nmi_test_mmr(hub_nmi);
+
+ if (hub_nmi->pch_owner) /* Only PCH owner can check status */
+ return uv_nmi_test_hubless(hub_nmi);
+
+ return -1;
+}
+
+/*
+ * If first CPU in on this hub, set hub_nmi "in_nmi" and "owner" values and
+ * return true. If first CPU in on the system, set global "in_nmi" flag.
+ */
+static int uv_set_in_nmi(int cpu, struct uv_hub_nmi_s *hub_nmi)
+{
+ int first = atomic_add_unless(&hub_nmi->in_nmi, 1, 1);
+
+ if (first) {
+ atomic_set(&hub_nmi->cpu_owner, cpu);
+ if (atomic_add_unless(&uv_in_nmi, 1, 1))
+ atomic_set(&uv_nmi_cpu, cpu);
+
+ atomic_inc(&hub_nmi->nmi_count);
+ }
+ return first;
+}
+
+/* Check if this is a system NMI event */
+static int uv_check_nmi(struct uv_hub_nmi_s *hub_nmi)
+{
+ int cpu = smp_processor_id();
+ int nmi = 0;
+ int nmi_detected = 0;
+
+ local64_inc(&uv_nmi_count);
+ this_cpu_inc(uv_cpu_nmi.queries);
+
+ do {
+ nmi = atomic_read(&hub_nmi->in_nmi);
+ if (nmi)
+ break;
+
+ if (raw_spin_trylock(&hub_nmi->nmi_lock)) {
+ nmi_detected = uv_test_nmi(hub_nmi);
+
+ /* Check flag for UV external NMI */
+ if (nmi_detected > 0) {
+ uv_set_in_nmi(cpu, hub_nmi);
+ nmi = 1;
+ break;
+ }
+
+ /* A non-PCH node in a hubless system waits for NMI */
+ else if (nmi_detected < 0)
+ goto slave_wait;
+
+ /* MMR/PCH NMI flag is clear */
+ raw_spin_unlock(&hub_nmi->nmi_lock);
+
+ } else {
+
+ /* Wait a moment for the HUB NMI locker to set flag */
+slave_wait: cpu_relax();
+ udelay(uv_nmi_slave_delay);
+
+ /* Re-check hub in_nmi flag */
+ nmi = atomic_read(&hub_nmi->in_nmi);
+ if (nmi)
+ break;
+ }
+
+ /*
+ * Check if this BMC missed setting the MMR NMI flag (or)
+ * UV hubless system where only PCH owner can check flag
+ */
+ if (!nmi) {
+ nmi = atomic_read(&uv_in_nmi);
+ if (nmi)
+ uv_set_in_nmi(cpu, hub_nmi);
+ }
+
+ /* If we're holding the hub lock, release it now */
+ if (nmi_detected < 0)
+ raw_spin_unlock(&hub_nmi->nmi_lock);
+
+ } while (0);
+
+ if (!nmi)
+ local64_inc(&uv_nmi_misses);
+
+ return nmi;
+}
+
+/* Need to reset the NMI MMR register, but only once per hub. */
+static inline void uv_clear_nmi(int cpu)
+{
+ struct uv_hub_nmi_s *hub_nmi = uv_hub_nmi;
+
+ if (cpu == atomic_read(&hub_nmi->cpu_owner)) {
+ atomic_set(&hub_nmi->cpu_owner, -1);
+ atomic_set(&hub_nmi->in_nmi, 0);
+ if (hub_nmi->hub_present)
+ uv_local_mmr_clear_nmi();
+ else
+ uv_reassert_nmi();
+ raw_spin_unlock(&hub_nmi->nmi_lock);
+ }
+}
+
+/* Ping non-responding CPU's attemping to force them into the NMI handler */
+static void uv_nmi_nr_cpus_ping(void)
+{
+ int cpu;
+
+ for_each_cpu(cpu, uv_nmi_cpu_mask)
+ uv_cpu_nmi_per(cpu).pinging = 1;
+
+ apic->send_IPI_mask(uv_nmi_cpu_mask, APIC_DM_NMI);
+}
+
+/* Clean up flags for CPU's that ignored both NMI and ping */
+static void uv_nmi_cleanup_mask(void)
+{
+ int cpu;
+
+ for_each_cpu(cpu, uv_nmi_cpu_mask) {
+ uv_cpu_nmi_per(cpu).pinging = 0;
+ uv_cpu_nmi_per(cpu).state = UV_NMI_STATE_OUT;
+ cpumask_clear_cpu(cpu, uv_nmi_cpu_mask);
+ }
+}
+
+/* Loop waiting as CPU's enter NMI handler */
+static int uv_nmi_wait_cpus(int first)
+{
+ int i, j, k, n = num_online_cpus();
+ int last_k = 0, waiting = 0;
+ int cpu = smp_processor_id();
+
+ if (first) {
+ cpumask_copy(uv_nmi_cpu_mask, cpu_online_mask);
+ k = 0;
+ } else {
+ k = n - cpumask_weight(uv_nmi_cpu_mask);
+ }
+
+ /* PCH NMI causes only one CPU to respond */
+ if (first && uv_pch_intr_now_enabled) {
+ cpumask_clear_cpu(cpu, uv_nmi_cpu_mask);
+ return n - k - 1;
+ }
+
+ udelay(uv_nmi_initial_delay);
+ for (i = 0; i < uv_nmi_retry_count; i++) {
+ int loop_delay = uv_nmi_loop_delay;
+
+ for_each_cpu(j, uv_nmi_cpu_mask) {
+ if (uv_cpu_nmi_per(j).state) {
+ cpumask_clear_cpu(j, uv_nmi_cpu_mask);
+ if (++k >= n)
+ break;
+ }
+ }
+ if (k >= n) { /* all in? */
+ k = n;
+ break;
+ }
+ if (last_k != k) { /* abort if no new CPU's coming in */
+ last_k = k;
+ waiting = 0;
+ } else if (++waiting > uv_nmi_wait_count)
+ break;
+
+ /* Extend delay if waiting only for CPU 0: */
+ if (waiting && (n - k) == 1 &&
+ cpumask_test_cpu(0, uv_nmi_cpu_mask))
+ loop_delay *= 100;
+
+ udelay(loop_delay);
+ }
+ atomic_set(&uv_nmi_cpus_in_nmi, k);
+ return n - k;
+}
+
+/* Wait until all slave CPU's have entered UV NMI handler */
+static void uv_nmi_wait(int master)
+{
+ /* Indicate this CPU is in: */
+ this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_IN);
+
+ /* If not the first CPU in (the master), then we are a slave CPU */
+ if (!master)
+ return;
+
+ do {
+ /* Wait for all other CPU's to gather here */
+ if (!uv_nmi_wait_cpus(1))
+ break;
+
+ /* If not all made it in, send IPI NMI to them */
+ pr_alert("UV: Sending NMI IPI to %d CPUs: %*pbl\n",
+ cpumask_weight(uv_nmi_cpu_mask),
+ cpumask_pr_args(uv_nmi_cpu_mask));
+
+ uv_nmi_nr_cpus_ping();
+
+ /* If all CPU's are in, then done */
+ if (!uv_nmi_wait_cpus(0))
+ break;
+
+ pr_alert("UV: %d CPUs not in NMI loop: %*pbl\n",
+ cpumask_weight(uv_nmi_cpu_mask),
+ cpumask_pr_args(uv_nmi_cpu_mask));
+ } while (0);
+
+ pr_alert("UV: %d of %d CPUs in NMI\n",
+ atomic_read(&uv_nmi_cpus_in_nmi), num_online_cpus());
+}
+
+/* Dump Instruction Pointer header */
+static void uv_nmi_dump_cpu_ip_hdr(void)
+{
+ pr_info("\nUV: %4s %6s %-32s %s (Note: PID 0 not listed)\n",
+ "CPU", "PID", "COMMAND", "IP");
+}
+
+/* Dump Instruction Pointer info */
+static void uv_nmi_dump_cpu_ip(int cpu, struct pt_regs *regs)
+{
+ pr_info("UV: %4d %6d %-32.32s %pS",
+ cpu, current->pid, current->comm, (void *)regs->ip);
+}
+
+/*
+ * Dump this CPU's state. If action was set to "kdump" and the crash_kexec
+ * failed, then we provide "dump" as an alternate action. Action "dump" now
+ * also includes the show "ips" (instruction pointers) action whereas the
+ * action "ips" only displays instruction pointers for the non-idle CPU's.
+ * This is an abbreviated form of the "ps" command.
+ */
+static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs)
+{
+ const char *dots = " ................................. ";
+
+ if (cpu == 0)
+ uv_nmi_dump_cpu_ip_hdr();
+
+ if (current->pid != 0 || !uv_nmi_action_is("ips"))
+ uv_nmi_dump_cpu_ip(cpu, regs);
+
+ if (uv_nmi_action_is("dump")) {
+ pr_info("UV:%sNMI process trace for CPU %d\n", dots, cpu);
+ show_regs(regs);
+ }
+
+ this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_DUMP_DONE);
+}
+
+/* Trigger a slave CPU to dump it's state */
+static void uv_nmi_trigger_dump(int cpu)
+{
+ int retry = uv_nmi_trigger_delay;
+
+ if (uv_cpu_nmi_per(cpu).state != UV_NMI_STATE_IN)
+ return;
+
+ uv_cpu_nmi_per(cpu).state = UV_NMI_STATE_DUMP;
+ do {
+ cpu_relax();
+ udelay(10);
+ if (uv_cpu_nmi_per(cpu).state
+ != UV_NMI_STATE_DUMP)
+ return;
+ } while (--retry > 0);
+
+ pr_crit("UV: CPU %d stuck in process dump function\n", cpu);
+ uv_cpu_nmi_per(cpu).state = UV_NMI_STATE_DUMP_DONE;
+}
+
+/* Wait until all CPU's ready to exit */
+static void uv_nmi_sync_exit(int master)
+{
+ atomic_dec(&uv_nmi_cpus_in_nmi);
+ if (master) {
+ while (atomic_read(&uv_nmi_cpus_in_nmi) > 0)
+ cpu_relax();
+ atomic_set(&uv_nmi_slave_continue, SLAVE_CLEAR);
+ } else {
+ while (atomic_read(&uv_nmi_slave_continue))
+ cpu_relax();
+ }
+}
+
+/* Current "health" check is to check which CPU's are responsive */
+static void uv_nmi_action_health(int cpu, struct pt_regs *regs, int master)
+{
+ if (master) {
+ int in = atomic_read(&uv_nmi_cpus_in_nmi);
+ int out = num_online_cpus() - in;
+
+ pr_alert("UV: NMI CPU health check (non-responding:%d)\n", out);
+ atomic_set(&uv_nmi_slave_continue, SLAVE_EXIT);
+ } else {
+ while (!atomic_read(&uv_nmi_slave_continue))
+ cpu_relax();
+ }
+ uv_nmi_sync_exit(master);
+}
+
+/* Walk through CPU list and dump state of each */
+static void uv_nmi_dump_state(int cpu, struct pt_regs *regs, int master)
+{
+ if (master) {
+ int tcpu;
+ int ignored = 0;
+ int saved_console_loglevel = console_loglevel;
+
+ pr_alert("UV: tracing %s for %d CPUs from CPU %d\n",
+ uv_nmi_action_is("ips") ? "IPs" : "processes",
+ atomic_read(&uv_nmi_cpus_in_nmi), cpu);
+
+ console_loglevel = uv_nmi_loglevel;
+ atomic_set(&uv_nmi_slave_continue, SLAVE_EXIT);
+ for_each_online_cpu(tcpu) {
+ if (cpumask_test_cpu(tcpu, uv_nmi_cpu_mask))
+ ignored++;
+ else if (tcpu == cpu)
+ uv_nmi_dump_state_cpu(tcpu, regs);
+ else
+ uv_nmi_trigger_dump(tcpu);
+ }
+ if (ignored)
+ pr_alert("UV: %d CPUs ignored NMI\n", ignored);
+
+ console_loglevel = saved_console_loglevel;
+ pr_alert("UV: process trace complete\n");
+ } else {
+ while (!atomic_read(&uv_nmi_slave_continue))
+ cpu_relax();
+ while (this_cpu_read(uv_cpu_nmi.state) != UV_NMI_STATE_DUMP)
+ cpu_relax();
+ uv_nmi_dump_state_cpu(cpu, regs);
+ }
+ uv_nmi_sync_exit(master);
+}
+
+static void uv_nmi_touch_watchdogs(void)
+{
+ touch_softlockup_watchdog_sync();
+ clocksource_touch_watchdog();
+ rcu_cpu_stall_reset();
+ touch_nmi_watchdog();
+}
+
+static atomic_t uv_nmi_kexec_failed;
+
+#if defined(CONFIG_KEXEC_CORE)
+static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
+{
+ /* Call crash to dump system state */
+ if (master) {
+ pr_emerg("UV: NMI executing crash_kexec on CPU%d\n", cpu);
+ crash_kexec(regs);
+
+ pr_emerg("UV: crash_kexec unexpectedly returned, ");
+ atomic_set(&uv_nmi_kexec_failed, 1);
+ if (!kexec_crash_image) {
+ pr_cont("crash kernel not loaded\n");
+ return;
+ }
+ pr_cont("kexec busy, stalling cpus while waiting\n");
+ }
+
+ /* If crash exec fails the slaves should return, otherwise stall */
+ while (atomic_read(&uv_nmi_kexec_failed) == 0)
+ mdelay(10);
+}
+
+#else /* !CONFIG_KEXEC_CORE */
+static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
+{
+ if (master)
+ pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n");
+ atomic_set(&uv_nmi_kexec_failed, 1);
+}
+#endif /* !CONFIG_KEXEC_CORE */
+
+#ifdef CONFIG_KGDB
+#ifdef CONFIG_KGDB_KDB
+static inline int uv_nmi_kdb_reason(void)
+{
+ return KDB_REASON_SYSTEM_NMI;
+}
+#else /* !CONFIG_KGDB_KDB */
+static inline int uv_nmi_kdb_reason(void)
+{
+ /* Ensure user is expecting to attach gdb remote */
+ if (uv_nmi_action_is("kgdb"))
+ return 0;
+
+ pr_err("UV: NMI error: KDB is not enabled in this kernel\n");
+ return -1;
+}
+#endif /* CONFIG_KGDB_KDB */
+
+/*
+ * Call KGDB/KDB from NMI handler
+ *
+ * Note that if both KGDB and KDB are configured, then the action of 'kgdb' or
+ * 'kdb' has no affect on which is used. See the KGDB documention for further
+ * information.
+ */
+static void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master)
+{
+ if (master) {
+ int reason = uv_nmi_kdb_reason();
+ int ret;
+
+ if (reason < 0)
+ return;
+
+ /* Call KGDB NMI handler as MASTER */
+ ret = kgdb_nmicallin(cpu, X86_TRAP_NMI, regs, reason,
+ &uv_nmi_slave_continue);
+ if (ret) {
+ pr_alert("KGDB returned error, is kgdboc set?\n");
+ atomic_set(&uv_nmi_slave_continue, SLAVE_EXIT);
+ }
+ } else {
+ /* Wait for KGDB signal that it's ready for slaves to enter */
+ int sig;
+
+ do {
+ cpu_relax();
+ sig = atomic_read(&uv_nmi_slave_continue);
+ } while (!sig);
+
+ /* Call KGDB as slave */
+ if (sig == SLAVE_CONTINUE)
+ kgdb_nmicallback(cpu, regs);
+ }
+ uv_nmi_sync_exit(master);
+}
+
+#else /* !CONFIG_KGDB */
+static inline void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master)
+{
+ pr_err("UV: NMI error: KGDB is not enabled in this kernel\n");
+}
+#endif /* !CONFIG_KGDB */
+
+/*
+ * UV NMI handler
+ */
+static int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
+{
+ struct uv_hub_nmi_s *hub_nmi = uv_hub_nmi;
+ int cpu = smp_processor_id();
+ int master = 0;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ /* If not a UV System NMI, ignore */
+ if (!this_cpu_read(uv_cpu_nmi.pinging) && !uv_check_nmi(hub_nmi)) {
+ local_irq_restore(flags);
+ return NMI_DONE;
+ }
+
+ /* Indicate we are the first CPU into the NMI handler */
+ master = (atomic_read(&uv_nmi_cpu) == cpu);
+
+ /* If NMI action is "kdump", then attempt to do it */
+ if (uv_nmi_action_is("kdump")) {
+ uv_nmi_kdump(cpu, master, regs);
+
+ /* Unexpected return, revert action to "dump" */
+ if (master)
+ strncpy(uv_nmi_action, "dump", strlen(uv_nmi_action));
+ }
+
+ /* Pause as all CPU's enter the NMI handler */
+ uv_nmi_wait(master);
+
+ /* Process actions other than "kdump": */
+ if (uv_nmi_action_is("health")) {
+ uv_nmi_action_health(cpu, regs, master);
+ } else if (uv_nmi_action_is("ips") || uv_nmi_action_is("dump")) {
+ uv_nmi_dump_state(cpu, regs, master);
+ } else if (uv_nmi_action_is("kdb") || uv_nmi_action_is("kgdb")) {
+ uv_call_kgdb_kdb(cpu, regs, master);
+ } else {
+ if (master)
+ pr_alert("UV: unknown NMI action: %s\n", uv_nmi_action);
+ uv_nmi_sync_exit(master);
+ }
+
+ /* Clear per_cpu "in_nmi" flag */
+ this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_OUT);
+
+ /* Clear MMR NMI flag on each hub */
+ uv_clear_nmi(cpu);
+
+ /* Clear global flags */
+ if (master) {
+ if (cpumask_weight(uv_nmi_cpu_mask))
+ uv_nmi_cleanup_mask();
+ atomic_set(&uv_nmi_cpus_in_nmi, -1);
+ atomic_set(&uv_nmi_cpu, -1);
+ atomic_set(&uv_in_nmi, 0);
+ atomic_set(&uv_nmi_kexec_failed, 0);
+ atomic_set(&uv_nmi_slave_continue, SLAVE_CLEAR);
+ }
+
+ uv_nmi_touch_watchdogs();
+ local_irq_restore(flags);
+
+ return NMI_HANDLED;
+}
+
+/*
+ * NMI handler for pulling in CPU's when perf events are grabbing our NMI
+ */
+static int uv_handle_nmi_ping(unsigned int reason, struct pt_regs *regs)
+{
+ int ret;
+
+ this_cpu_inc(uv_cpu_nmi.queries);
+ if (!this_cpu_read(uv_cpu_nmi.pinging)) {
+ local64_inc(&uv_nmi_ping_misses);
+ return NMI_DONE;
+ }
+
+ this_cpu_inc(uv_cpu_nmi.pings);
+ local64_inc(&uv_nmi_ping_count);
+ ret = uv_handle_nmi(reason, regs);
+ this_cpu_write(uv_cpu_nmi.pinging, 0);
+ return ret;
+}
+
+static void uv_register_nmi_notifier(void)
+{
+ if (register_nmi_handler(NMI_UNKNOWN, uv_handle_nmi, 0, "uv"))
+ pr_warn("UV: NMI handler failed to register\n");
+
+ if (register_nmi_handler(NMI_LOCAL, uv_handle_nmi_ping, 0, "uvping"))
+ pr_warn("UV: PING NMI handler failed to register\n");
+}
+
+void uv_nmi_init(void)
+{
+ unsigned int value;
+
+ /*
+ * Unmask NMI on all CPU's
+ */
+ value = apic_read(APIC_LVT1) | APIC_DM_NMI;
+ value &= ~APIC_LVT_MASKED;
+ apic_write(APIC_LVT1, value);
+}
+
+/* Setup HUB NMI info */
+static void __init uv_nmi_setup_common(bool hubbed)
+{
+ int size = sizeof(void *) * (1 << NODES_SHIFT);
+ int cpu;
+
+ uv_hub_nmi_list = kzalloc(size, GFP_KERNEL);
+ nmi_debug("UV: NMI hub list @ 0x%p (%d)\n", uv_hub_nmi_list, size);
+ BUG_ON(!uv_hub_nmi_list);
+ size = sizeof(struct uv_hub_nmi_s);
+ for_each_present_cpu(cpu) {
+ int nid = cpu_to_node(cpu);
+ if (uv_hub_nmi_list[nid] == NULL) {
+ uv_hub_nmi_list[nid] = kzalloc_node(size,
+ GFP_KERNEL, nid);
+ BUG_ON(!uv_hub_nmi_list[nid]);
+ raw_spin_lock_init(&(uv_hub_nmi_list[nid]->nmi_lock));
+ atomic_set(&uv_hub_nmi_list[nid]->cpu_owner, -1);
+ uv_hub_nmi_list[nid]->hub_present = hubbed;
+ uv_hub_nmi_list[nid]->pch_owner = (nid == 0);
+ }
+ uv_hub_nmi_per(cpu) = uv_hub_nmi_list[nid];
+ }
+ BUG_ON(!alloc_cpumask_var(&uv_nmi_cpu_mask, GFP_KERNEL));
+}
+
+/* Setup for UV Hub systems */
+void __init uv_nmi_setup(void)
+{
+ uv_nmi_setup_mmrs();
+ uv_nmi_setup_common(true);
+ uv_register_nmi_notifier();
+ pr_info("UV: Hub NMI enabled\n");
+}
+
+/* Setup for UV Hubless systems */
+void __init uv_nmi_setup_hubless(void)
+{
+ uv_nmi_setup_common(false);
+ pch_base = xlate_dev_mem_ptr(PCH_PCR_GPIO_1_BASE);
+ nmi_debug("UV: PCH base:%p from 0x%lx, GPP_D_0\n",
+ pch_base, PCH_PCR_GPIO_1_BASE);
+ if (uv_pch_init_enable)
+ uv_init_hubless_pch_d0();
+ uv_init_hubless_pch_io(GPI_NMI_ENA_GPP_D_0,
+ STS_GPP_D_0_MASK, STS_GPP_D_0_MASK);
+ uv_nmi_setup_hubless_intr();
+ /* Ensure NMI enabled in Processor Interface Reg: */
+ uv_reassert_nmi();
+ uv_register_nmi_notifier();
+ pr_info("UV: Hubless NMI enabled\n");
+}
diff --git a/arch/x86/platform/uv/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c
new file mode 100644
index 000000000..e9da9ebd9
--- /dev/null
+++ b/arch/x86/platform/uv/uv_sysfs.c
@@ -0,0 +1,76 @@
+/*
+ * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) Russ Anderson
+ */
+
+#include <linux/device.h>
+#include <asm/uv/bios.h>
+#include <asm/uv/uv.h>
+
+struct kobject *sgi_uv_kobj;
+
+static ssize_t partition_id_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id);
+}
+
+static ssize_t coherence_id_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%ld\n", uv_partition_coherence_id());
+}
+
+static struct kobj_attribute partition_id_attr =
+ __ATTR(partition_id, S_IRUGO, partition_id_show, NULL);
+
+static struct kobj_attribute coherence_id_attr =
+ __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL);
+
+
+static int __init sgi_uv_sysfs_init(void)
+{
+ unsigned long ret;
+
+ if (!is_uv_system())
+ return -ENODEV;
+
+ if (!sgi_uv_kobj)
+ sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
+ if (!sgi_uv_kobj) {
+ printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n");
+ return -EINVAL;
+ }
+
+ ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
+ if (ret) {
+ printk(KERN_WARNING "sysfs_create_file partition_id failed\n");
+ return ret;
+ }
+
+ ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
+ if (ret) {
+ printk(KERN_WARNING "sysfs_create_file coherence_id failed\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+device_initcall(sgi_uv_sysfs_init);
diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
new file mode 100644
index 000000000..a36b368ee
--- /dev/null
+++ b/arch/x86/platform/uv/uv_time.c
@@ -0,0 +1,416 @@
+/*
+ * SGI RTC clock/timer routines.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2009-2013 Silicon Graphics, Inc. All Rights Reserved.
+ * Copyright (c) Dimitri Sivanich
+ */
+#include <linux/clockchips.h>
+#include <linux/slab.h>
+
+#include <asm/uv/uv_mmrs.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/bios.h>
+#include <asm/uv/uv.h>
+#include <asm/apic.h>
+#include <asm/cpu.h>
+
+#define RTC_NAME "sgi_rtc"
+
+static u64 uv_read_rtc(struct clocksource *cs);
+static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
+static int uv_rtc_shutdown(struct clock_event_device *evt);
+
+static struct clocksource clocksource_uv = {
+ .name = RTC_NAME,
+ .rating = 299,
+ .read = uv_read_rtc,
+ .mask = (u64)UVH_RTC_REAL_TIME_CLOCK_MASK,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static struct clock_event_device clock_event_device_uv = {
+ .name = RTC_NAME,
+ .features = CLOCK_EVT_FEAT_ONESHOT,
+ .shift = 20,
+ .rating = 400,
+ .irq = -1,
+ .set_next_event = uv_rtc_next_event,
+ .set_state_shutdown = uv_rtc_shutdown,
+ .event_handler = NULL,
+};
+
+static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
+
+/* There is one of these allocated per node */
+struct uv_rtc_timer_head {
+ spinlock_t lock;
+ /* next cpu waiting for timer, local node relative: */
+ int next_cpu;
+ /* number of cpus on this node: */
+ int ncpus;
+ struct {
+ int lcpu; /* systemwide logical cpu number */
+ u64 expires; /* next timer expiration for this cpu */
+ } cpu[1];
+};
+
+/*
+ * Access to uv_rtc_timer_head via blade id.
+ */
+static struct uv_rtc_timer_head **blade_info __read_mostly;
+
+static int uv_rtc_evt_enable;
+
+/*
+ * Hardware interface routines
+ */
+
+/* Send IPIs to another node */
+static void uv_rtc_send_IPI(int cpu)
+{
+ unsigned long apicid, val;
+ int pnode;
+
+ apicid = cpu_physical_id(cpu);
+ pnode = uv_apicid_to_pnode(apicid);
+ apicid |= uv_apicid_hibits;
+ val = (1UL << UVH_IPI_INT_SEND_SHFT) |
+ (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+ (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
+
+ uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
+}
+
+/* Check for an RTC interrupt pending */
+static int uv_intr_pending(int pnode)
+{
+ if (is_uv1_hub())
+ return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
+ UV1H_EVENT_OCCURRED0_RTC1_MASK;
+ else if (is_uvx_hub())
+ return uv_read_global_mmr64(pnode, UVXH_EVENT_OCCURRED2) &
+ UVXH_EVENT_OCCURRED2_RTC_1_MASK;
+ return 0;
+}
+
+/* Setup interrupt and return non-zero if early expiration occurred. */
+static int uv_setup_intr(int cpu, u64 expires)
+{
+ u64 val;
+ unsigned long apicid = cpu_physical_id(cpu) | uv_apicid_hibits;
+ int pnode = uv_cpu_to_pnode(cpu);
+
+ uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
+ UVH_RTC1_INT_CONFIG_M_MASK);
+ uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
+
+ if (is_uv1_hub())
+ uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
+ UV1H_EVENT_OCCURRED0_RTC1_MASK);
+ else
+ uv_write_global_mmr64(pnode, UVXH_EVENT_OCCURRED2_ALIAS,
+ UVXH_EVENT_OCCURRED2_RTC_1_MASK);
+
+ val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
+ ((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
+
+ /* Set configuration */
+ uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
+ /* Initialize comparator value */
+ uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
+
+ if (uv_read_rtc(NULL) <= expires)
+ return 0;
+
+ return !uv_intr_pending(pnode);
+}
+
+/*
+ * Per-cpu timer tracking routines
+ */
+
+static __init void uv_rtc_deallocate_timers(void)
+{
+ int bid;
+
+ for_each_possible_blade(bid) {
+ kfree(blade_info[bid]);
+ }
+ kfree(blade_info);
+}
+
+/* Allocate per-node list of cpu timer expiration times. */
+static __init int uv_rtc_allocate_timers(void)
+{
+ int cpu;
+
+ blade_info = kcalloc(uv_possible_blades, sizeof(void *), GFP_KERNEL);
+ if (!blade_info)
+ return -ENOMEM;
+
+ for_each_present_cpu(cpu) {
+ int nid = cpu_to_node(cpu);
+ int bid = uv_cpu_to_blade_id(cpu);
+ int bcpu = uv_cpu_blade_processor_id(cpu);
+ struct uv_rtc_timer_head *head = blade_info[bid];
+
+ if (!head) {
+ head = kmalloc_node(sizeof(struct uv_rtc_timer_head) +
+ (uv_blade_nr_possible_cpus(bid) *
+ 2 * sizeof(u64)),
+ GFP_KERNEL, nid);
+ if (!head) {
+ uv_rtc_deallocate_timers();
+ return -ENOMEM;
+ }
+ spin_lock_init(&head->lock);
+ head->ncpus = uv_blade_nr_possible_cpus(bid);
+ head->next_cpu = -1;
+ blade_info[bid] = head;
+ }
+
+ head->cpu[bcpu].lcpu = cpu;
+ head->cpu[bcpu].expires = ULLONG_MAX;
+ }
+
+ return 0;
+}
+
+/* Find and set the next expiring timer. */
+static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode)
+{
+ u64 lowest = ULLONG_MAX;
+ int c, bcpu = -1;
+
+ head->next_cpu = -1;
+ for (c = 0; c < head->ncpus; c++) {
+ u64 exp = head->cpu[c].expires;
+ if (exp < lowest) {
+ bcpu = c;
+ lowest = exp;
+ }
+ }
+ if (bcpu >= 0) {
+ head->next_cpu = bcpu;
+ c = head->cpu[bcpu].lcpu;
+ if (uv_setup_intr(c, lowest))
+ /* If we didn't set it up in time, trigger */
+ uv_rtc_send_IPI(c);
+ } else {
+ uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
+ UVH_RTC1_INT_CONFIG_M_MASK);
+ }
+}
+
+/*
+ * Set expiration time for current cpu.
+ *
+ * Returns 1 if we missed the expiration time.
+ */
+static int uv_rtc_set_timer(int cpu, u64 expires)
+{
+ int pnode = uv_cpu_to_pnode(cpu);
+ int bid = uv_cpu_to_blade_id(cpu);
+ struct uv_rtc_timer_head *head = blade_info[bid];
+ int bcpu = uv_cpu_blade_processor_id(cpu);
+ u64 *t = &head->cpu[bcpu].expires;
+ unsigned long flags;
+ int next_cpu;
+
+ spin_lock_irqsave(&head->lock, flags);
+
+ next_cpu = head->next_cpu;
+ *t = expires;
+
+ /* Will this one be next to go off? */
+ if (next_cpu < 0 || bcpu == next_cpu ||
+ expires < head->cpu[next_cpu].expires) {
+ head->next_cpu = bcpu;
+ if (uv_setup_intr(cpu, expires)) {
+ *t = ULLONG_MAX;
+ uv_rtc_find_next_timer(head, pnode);
+ spin_unlock_irqrestore(&head->lock, flags);
+ return -ETIME;
+ }
+ }
+
+ spin_unlock_irqrestore(&head->lock, flags);
+ return 0;
+}
+
+/*
+ * Unset expiration time for current cpu.
+ *
+ * Returns 1 if this timer was pending.
+ */
+static int uv_rtc_unset_timer(int cpu, int force)
+{
+ int pnode = uv_cpu_to_pnode(cpu);
+ int bid = uv_cpu_to_blade_id(cpu);
+ struct uv_rtc_timer_head *head = blade_info[bid];
+ int bcpu = uv_cpu_blade_processor_id(cpu);
+ u64 *t = &head->cpu[bcpu].expires;
+ unsigned long flags;
+ int rc = 0;
+
+ spin_lock_irqsave(&head->lock, flags);
+
+ if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
+ rc = 1;
+
+ if (rc) {
+ *t = ULLONG_MAX;
+ /* Was the hardware setup for this timer? */
+ if (head->next_cpu == bcpu)
+ uv_rtc_find_next_timer(head, pnode);
+ }
+
+ spin_unlock_irqrestore(&head->lock, flags);
+
+ return rc;
+}
+
+
+/*
+ * Kernel interface routines.
+ */
+
+/*
+ * Read the RTC.
+ *
+ * Starting with HUB rev 2.0, the UV RTC register is replicated across all
+ * cachelines of it's own page. This allows faster simultaneous reads
+ * from a given socket.
+ */
+static u64 uv_read_rtc(struct clocksource *cs)
+{
+ unsigned long offset;
+
+ if (uv_get_min_hub_revision_id() == 1)
+ offset = 0;
+ else
+ offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
+
+ return (u64)uv_read_local_mmr(UVH_RTC | offset);
+}
+
+/*
+ * Program the next event, relative to now
+ */
+static int uv_rtc_next_event(unsigned long delta,
+ struct clock_event_device *ced)
+{
+ int ced_cpu = cpumask_first(ced->cpumask);
+
+ return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc(NULL));
+}
+
+/*
+ * Shutdown the RTC timer
+ */
+static int uv_rtc_shutdown(struct clock_event_device *evt)
+{
+ int ced_cpu = cpumask_first(evt->cpumask);
+
+ uv_rtc_unset_timer(ced_cpu, 1);
+ return 0;
+}
+
+static void uv_rtc_interrupt(void)
+{
+ int cpu = smp_processor_id();
+ struct clock_event_device *ced = &per_cpu(cpu_ced, cpu);
+
+ if (!ced || !ced->event_handler)
+ return;
+
+ if (uv_rtc_unset_timer(cpu, 0) != 1)
+ return;
+
+ ced->event_handler(ced);
+}
+
+static int __init uv_enable_evt_rtc(char *str)
+{
+ uv_rtc_evt_enable = 1;
+
+ return 1;
+}
+__setup("uvrtcevt", uv_enable_evt_rtc);
+
+static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
+{
+ struct clock_event_device *ced = this_cpu_ptr(&cpu_ced);
+
+ *ced = clock_event_device_uv;
+ ced->cpumask = cpumask_of(smp_processor_id());
+ clockevents_register_device(ced);
+}
+
+static __init int uv_rtc_setup_clock(void)
+{
+ int rc;
+
+ if (!is_uv_system())
+ return -ENODEV;
+
+ rc = clocksource_register_hz(&clocksource_uv, sn_rtc_cycles_per_second);
+ if (rc)
+ printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
+ else
+ printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n",
+ sn_rtc_cycles_per_second/(unsigned long)1E6);
+
+ if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback)
+ return rc;
+
+ /* Setup and register clockevents */
+ rc = uv_rtc_allocate_timers();
+ if (rc)
+ goto error;
+
+ x86_platform_ipi_callback = uv_rtc_interrupt;
+
+ clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
+ NSEC_PER_SEC, clock_event_device_uv.shift);
+
+ clock_event_device_uv.min_delta_ns = NSEC_PER_SEC /
+ sn_rtc_cycles_per_second;
+ clock_event_device_uv.min_delta_ticks = 1;
+
+ clock_event_device_uv.max_delta_ns = clocksource_uv.mask *
+ (NSEC_PER_SEC / sn_rtc_cycles_per_second);
+ clock_event_device_uv.max_delta_ticks = clocksource_uv.mask;
+
+ rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
+ if (rc) {
+ x86_platform_ipi_callback = NULL;
+ uv_rtc_deallocate_timers();
+ goto error;
+ }
+
+ printk(KERN_INFO "UV RTC clockevents registered\n");
+
+ return 0;
+
+error:
+ clocksource_unregister(&clocksource_uv);
+ printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc);
+
+ return rc;
+}
+arch_initcall(uv_rtc_setup_clock);
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
new file mode 100644
index 000000000..a47013895
--- /dev/null
+++ b/arch/x86/power/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+OBJECT_FILES_NON_STANDARD_hibernate_asm_$(BITS).o := y
+
+# __restore_processor_state() restores %gs after S3 resume and so should not
+# itself be stack-protected
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_cpu.o := $(nostackp)
+
+obj-$(CONFIG_PM_SLEEP) += cpu.o
+obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
new file mode 100644
index 000000000..794824948
--- /dev/null
+++ b/arch/x86/power/cpu.c
@@ -0,0 +1,550 @@
+/*
+ * Suspend support specific for i386/x86-64.
+ *
+ * Distribute under GPLv2
+ *
+ * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl>
+ * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz>
+ * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
+ */
+
+#include <linux/suspend.h>
+#include <linux/export.h>
+#include <linux/smp.h>
+#include <linux/perf_event.h>
+#include <linux/tboot.h>
+#include <linux/dmi.h>
+
+#include <asm/pgtable.h>
+#include <asm/proto.h>
+#include <asm/mtrr.h>
+#include <asm/page.h>
+#include <asm/mce.h>
+#include <asm/suspend.h>
+#include <asm/fpu/internal.h>
+#include <asm/debugreg.h>
+#include <asm/cpu.h>
+#include <asm/mmu_context.h>
+#include <asm/cpu_device_id.h>
+#include <asm/microcode.h>
+
+#ifdef CONFIG_X86_32
+__visible unsigned long saved_context_ebx;
+__visible unsigned long saved_context_esp, saved_context_ebp;
+__visible unsigned long saved_context_esi, saved_context_edi;
+__visible unsigned long saved_context_eflags;
+#endif
+struct saved_context saved_context;
+
+static void msr_save_context(struct saved_context *ctxt)
+{
+ struct saved_msr *msr = ctxt->saved_msrs.array;
+ struct saved_msr *end = msr + ctxt->saved_msrs.num;
+
+ while (msr < end) {
+ if (msr->valid)
+ rdmsrl(msr->info.msr_no, msr->info.reg.q);
+ msr++;
+ }
+}
+
+static void msr_restore_context(struct saved_context *ctxt)
+{
+ struct saved_msr *msr = ctxt->saved_msrs.array;
+ struct saved_msr *end = msr + ctxt->saved_msrs.num;
+
+ while (msr < end) {
+ if (msr->valid)
+ wrmsrl(msr->info.msr_no, msr->info.reg.q);
+ msr++;
+ }
+}
+
+/**
+ * __save_processor_state - save CPU registers before creating a
+ * hibernation image and before restoring the memory state from it
+ * @ctxt - structure to store the registers contents in
+ *
+ * NOTE: If there is a CPU register the modification of which by the
+ * boot kernel (ie. the kernel used for loading the hibernation image)
+ * might affect the operations of the restored target kernel (ie. the one
+ * saved in the hibernation image), then its contents must be saved by this
+ * function. In other words, if kernel A is hibernated and different
+ * kernel B is used for loading the hibernation image into memory, the
+ * kernel A's __save_processor_state() function must save all registers
+ * needed by kernel A, so that it can operate correctly after the resume
+ * regardless of what kernel B does in the meantime.
+ */
+static void __save_processor_state(struct saved_context *ctxt)
+{
+#ifdef CONFIG_X86_32
+ mtrr_save_fixed_ranges(NULL);
+#endif
+ kernel_fpu_begin();
+
+ /*
+ * descriptor tables
+ */
+ store_idt(&ctxt->idt);
+
+ /*
+ * We save it here, but restore it only in the hibernate case.
+ * For ACPI S3 resume, this is loaded via 'early_gdt_desc' in 64-bit
+ * mode in "secondary_startup_64". In 32-bit mode it is done via
+ * 'pmode_gdt' in wakeup_start.
+ */
+ ctxt->gdt_desc.size = GDT_SIZE - 1;
+ ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_rw(smp_processor_id());
+
+ store_tr(ctxt->tr);
+
+ /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
+ /*
+ * segment registers
+ */
+#ifdef CONFIG_X86_32_LAZY_GS
+ savesegment(gs, ctxt->gs);
+#endif
+#ifdef CONFIG_X86_64
+ savesegment(gs, ctxt->gs);
+ savesegment(fs, ctxt->fs);
+ savesegment(ds, ctxt->ds);
+ savesegment(es, ctxt->es);
+
+ rdmsrl(MSR_FS_BASE, ctxt->fs_base);
+ rdmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base);
+ rdmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base);
+ mtrr_save_fixed_ranges(NULL);
+
+ rdmsrl(MSR_EFER, ctxt->efer);
+#endif
+
+ /*
+ * control registers
+ */
+ ctxt->cr0 = read_cr0();
+ ctxt->cr2 = read_cr2();
+ ctxt->cr3 = __read_cr3();
+ ctxt->cr4 = __read_cr4();
+#ifdef CONFIG_X86_64
+ ctxt->cr8 = read_cr8();
+#endif
+ ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE,
+ &ctxt->misc_enable);
+ msr_save_context(ctxt);
+}
+
+/* Needed by apm.c */
+void save_processor_state(void)
+{
+ __save_processor_state(&saved_context);
+ x86_platform.save_sched_clock_state();
+}
+#ifdef CONFIG_X86_32
+EXPORT_SYMBOL(save_processor_state);
+#endif
+
+static void do_fpu_end(void)
+{
+ /*
+ * Restore FPU regs if necessary.
+ */
+ kernel_fpu_end();
+}
+
+static void fix_processor_context(void)
+{
+ int cpu = smp_processor_id();
+#ifdef CONFIG_X86_64
+ struct desc_struct *desc = get_cpu_gdt_rw(cpu);
+ tss_desc tss;
+#endif
+
+ /*
+ * We need to reload TR, which requires that we change the
+ * GDT entry to indicate "available" first.
+ *
+ * XXX: This could probably all be replaced by a call to
+ * force_reload_TR().
+ */
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
+
+#ifdef CONFIG_X86_64
+ memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));
+ tss.type = 0x9; /* The available 64-bit TSS (see AMD vol 2, pg 91 */
+ write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS);
+
+ syscall_init(); /* This sets MSR_*STAR and related */
+#else
+ if (boot_cpu_has(X86_FEATURE_SEP))
+ enable_sep_cpu();
+#endif
+ load_TR_desc(); /* This does ltr */
+ load_mm_ldt(current->active_mm); /* This does lldt */
+ initialize_tlbstate_and_flush();
+
+ fpu__resume_cpu();
+
+ /* The processor is back on the direct GDT, load back the fixmap */
+ load_fixmap_gdt(cpu);
+}
+
+/**
+ * __restore_processor_state - restore the contents of CPU registers saved
+ * by __save_processor_state()
+ * @ctxt - structure to load the registers contents from
+ *
+ * The asm code that gets us here will have restored a usable GDT, although
+ * it will be pointing to the wrong alias.
+ */
+static void notrace __restore_processor_state(struct saved_context *ctxt)
+{
+ if (ctxt->misc_enable_saved)
+ wrmsrl(MSR_IA32_MISC_ENABLE, ctxt->misc_enable);
+ /*
+ * control registers
+ */
+ /* cr4 was introduced in the Pentium CPU */
+#ifdef CONFIG_X86_32
+ if (ctxt->cr4)
+ __write_cr4(ctxt->cr4);
+#else
+/* CONFIG X86_64 */
+ wrmsrl(MSR_EFER, ctxt->efer);
+ write_cr8(ctxt->cr8);
+ __write_cr4(ctxt->cr4);
+#endif
+ write_cr3(ctxt->cr3);
+ write_cr2(ctxt->cr2);
+ write_cr0(ctxt->cr0);
+
+ /* Restore the IDT. */
+ load_idt(&ctxt->idt);
+
+ /*
+ * Just in case the asm code got us here with the SS, DS, or ES
+ * out of sync with the GDT, update them.
+ */
+ loadsegment(ss, __KERNEL_DS);
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+
+ /*
+ * Restore percpu access. Percpu access can happen in exception
+ * handlers or in complicated helpers like load_gs_index().
+ */
+#ifdef CONFIG_X86_64
+ wrmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base);
+#else
+ loadsegment(fs, __KERNEL_PERCPU);
+ loadsegment(gs, __KERNEL_STACK_CANARY);
+#endif
+
+ /* Restore the TSS, RO GDT, LDT, and usermode-relevant MSRs. */
+ fix_processor_context();
+
+ /*
+ * Now that we have descriptor tables fully restored and working
+ * exception handling, restore the usermode segments.
+ */
+#ifdef CONFIG_X86_64
+ loadsegment(ds, ctxt->es);
+ loadsegment(es, ctxt->es);
+ loadsegment(fs, ctxt->fs);
+ load_gs_index(ctxt->gs);
+
+ /*
+ * Restore FSBASE and GSBASE after restoring the selectors, since
+ * restoring the selectors clobbers the bases. Keep in mind
+ * that MSR_KERNEL_GS_BASE is horribly misnamed.
+ */
+ wrmsrl(MSR_FS_BASE, ctxt->fs_base);
+ wrmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base);
+#elif defined(CONFIG_X86_32_LAZY_GS)
+ loadsegment(gs, ctxt->gs);
+#endif
+
+ do_fpu_end();
+ tsc_verify_tsc_adjust(true);
+ x86_platform.restore_sched_clock_state();
+ mtrr_bp_restore();
+ perf_restore_debug_store();
+
+ microcode_bsp_resume();
+
+ /*
+ * This needs to happen after the microcode has been updated upon resume
+ * because some of the MSRs are "emulated" in microcode.
+ */
+ msr_restore_context(ctxt);
+}
+
+/* Needed by apm.c */
+void notrace restore_processor_state(void)
+{
+ __restore_processor_state(&saved_context);
+}
+#ifdef CONFIG_X86_32
+EXPORT_SYMBOL(restore_processor_state);
+#endif
+
+#if defined(CONFIG_HIBERNATION) && defined(CONFIG_HOTPLUG_CPU)
+static void resume_play_dead(void)
+{
+ play_dead_common();
+ tboot_shutdown(TB_SHUTDOWN_WFS);
+ hlt_play_dead();
+}
+
+int hibernate_resume_nonboot_cpu_disable(void)
+{
+ void (*play_dead)(void) = smp_ops.play_dead;
+ int ret;
+
+ /*
+ * Ensure that MONITOR/MWAIT will not be used in the "play dead" loop
+ * during hibernate image restoration, because it is likely that the
+ * monitored address will be actually written to at that time and then
+ * the "dead" CPU will attempt to execute instructions again, but the
+ * address in its instruction pointer may not be possible to resolve
+ * any more at that point (the page tables used by it previously may
+ * have been overwritten by hibernate image data).
+ *
+ * First, make sure that we wake up all the potentially disabled SMT
+ * threads which have been initially brought up and then put into
+ * mwait/cpuidle sleep.
+ * Those will be put to proper (not interfering with hibernation
+ * resume) sleep afterwards, and the resumed kernel will decide itself
+ * what to do with them.
+ */
+ ret = cpuhp_smt_enable();
+ if (ret)
+ return ret;
+ smp_ops.play_dead = resume_play_dead;
+ ret = disable_nonboot_cpus();
+ smp_ops.play_dead = play_dead;
+ return ret;
+}
+#endif
+
+/*
+ * When bsp_check() is called in hibernate and suspend, cpu hotplug
+ * is disabled already. So it's unnessary to handle race condition between
+ * cpumask query and cpu hotplug.
+ */
+static int bsp_check(void)
+{
+ if (cpumask_first(cpu_online_mask) != 0) {
+ pr_warn("CPU0 is offline.\n");
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+static int bsp_pm_callback(struct notifier_block *nb, unsigned long action,
+ void *ptr)
+{
+ int ret = 0;
+
+ switch (action) {
+ case PM_SUSPEND_PREPARE:
+ case PM_HIBERNATION_PREPARE:
+ ret = bsp_check();
+ break;
+#ifdef CONFIG_DEBUG_HOTPLUG_CPU0
+ case PM_RESTORE_PREPARE:
+ /*
+ * When system resumes from hibernation, online CPU0 because
+ * 1. it's required for resume and
+ * 2. the CPU was online before hibernation
+ */
+ if (!cpu_online(0))
+ _debug_hotplug_cpu(0, 1);
+ break;
+ case PM_POST_RESTORE:
+ /*
+ * When a resume really happens, this code won't be called.
+ *
+ * This code is called only when user space hibernation software
+ * prepares for snapshot device during boot time. So we just
+ * call _debug_hotplug_cpu() to restore to CPU0's state prior to
+ * preparing the snapshot device.
+ *
+ * This works for normal boot case in our CPU0 hotplug debug
+ * mode, i.e. CPU0 is offline and user mode hibernation
+ * software initializes during boot time.
+ *
+ * If CPU0 is online and user application accesses snapshot
+ * device after boot time, this will offline CPU0 and user may
+ * see different CPU0 state before and after accessing
+ * the snapshot device. But hopefully this is not a case when
+ * user debugging CPU0 hotplug. Even if users hit this case,
+ * they can easily online CPU0 back.
+ *
+ * To simplify this debug code, we only consider normal boot
+ * case. Otherwise we need to remember CPU0's state and restore
+ * to that state and resolve racy conditions etc.
+ */
+ _debug_hotplug_cpu(0, 0);
+ break;
+#endif
+ default:
+ break;
+ }
+ return notifier_from_errno(ret);
+}
+
+static int __init bsp_pm_check_init(void)
+{
+ /*
+ * Set this bsp_pm_callback as lower priority than
+ * cpu_hotplug_pm_callback. So cpu_hotplug_pm_callback will be called
+ * earlier to disable cpu hotplug before bsp online check.
+ */
+ pm_notifier(bsp_pm_callback, -INT_MAX);
+ return 0;
+}
+
+core_initcall(bsp_pm_check_init);
+
+static int msr_build_context(const u32 *msr_id, const int num)
+{
+ struct saved_msrs *saved_msrs = &saved_context.saved_msrs;
+ struct saved_msr *msr_array;
+ int total_num;
+ int i, j;
+
+ total_num = saved_msrs->num + num;
+
+ msr_array = kmalloc_array(total_num, sizeof(struct saved_msr), GFP_KERNEL);
+ if (!msr_array) {
+ pr_err("x86/pm: Can not allocate memory to save/restore MSRs during suspend.\n");
+ return -ENOMEM;
+ }
+
+ if (saved_msrs->array) {
+ /*
+ * Multiple callbacks can invoke this function, so copy any
+ * MSR save requests from previous invocations.
+ */
+ memcpy(msr_array, saved_msrs->array,
+ sizeof(struct saved_msr) * saved_msrs->num);
+
+ kfree(saved_msrs->array);
+ }
+
+ for (i = saved_msrs->num, j = 0; i < total_num; i++, j++) {
+ u64 dummy;
+
+ msr_array[i].info.msr_no = msr_id[j];
+ msr_array[i].valid = !rdmsrl_safe(msr_id[j], &dummy);
+ msr_array[i].info.reg.q = 0;
+ }
+ saved_msrs->num = total_num;
+ saved_msrs->array = msr_array;
+
+ return 0;
+}
+
+/*
+ * The following sections are a quirk framework for problematic BIOSen:
+ * Sometimes MSRs are modified by the BIOSen after suspended to
+ * RAM, this might cause unexpected behavior after wakeup.
+ * Thus we save/restore these specified MSRs across suspend/resume
+ * in order to work around it.
+ *
+ * For any further problematic BIOSen/platforms,
+ * please add your own function similar to msr_initialize_bdw.
+ */
+static int msr_initialize_bdw(const struct dmi_system_id *d)
+{
+ /* Add any extra MSR ids into this array. */
+ u32 bdw_msr_id[] = { MSR_IA32_THERM_CONTROL };
+
+ pr_info("x86/pm: %s detected, MSR saving is needed during suspending.\n", d->ident);
+ return msr_build_context(bdw_msr_id, ARRAY_SIZE(bdw_msr_id));
+}
+
+static const struct dmi_system_id msr_save_dmi_table[] = {
+ {
+ .callback = msr_initialize_bdw,
+ .ident = "BROADWELL BDX_EP",
+ .matches = {
+ DMI_MATCH(DMI_PRODUCT_NAME, "GRANTLEY"),
+ DMI_MATCH(DMI_PRODUCT_VERSION, "E63448-400"),
+ },
+ },
+ {}
+};
+
+static int msr_save_cpuid_features(const struct x86_cpu_id *c)
+{
+ u32 cpuid_msr_id[] = {
+ MSR_AMD64_CPUID_FN_1,
+ };
+
+ pr_info("x86/pm: family %#hx cpu detected, MSR saving is needed during suspending.\n",
+ c->family);
+
+ return msr_build_context(cpuid_msr_id, ARRAY_SIZE(cpuid_msr_id));
+}
+
+static const struct x86_cpu_id msr_save_cpu_table[] = {
+ {
+ .vendor = X86_VENDOR_AMD,
+ .family = 0x15,
+ .model = X86_MODEL_ANY,
+ .feature = X86_FEATURE_ANY,
+ .driver_data = (kernel_ulong_t)msr_save_cpuid_features,
+ },
+ {
+ .vendor = X86_VENDOR_AMD,
+ .family = 0x16,
+ .model = X86_MODEL_ANY,
+ .feature = X86_FEATURE_ANY,
+ .driver_data = (kernel_ulong_t)msr_save_cpuid_features,
+ },
+ {}
+};
+
+typedef int (*pm_cpu_match_t)(const struct x86_cpu_id *);
+static int pm_cpu_check(const struct x86_cpu_id *c)
+{
+ const struct x86_cpu_id *m;
+ int ret = 0;
+
+ m = x86_match_cpu(msr_save_cpu_table);
+ if (m) {
+ pm_cpu_match_t fn;
+
+ fn = (pm_cpu_match_t)m->driver_data;
+ ret = fn(m);
+ }
+
+ return ret;
+}
+
+static void pm_save_spec_msr(void)
+{
+ u32 spec_msr_id[] = {
+ MSR_IA32_SPEC_CTRL,
+ MSR_IA32_TSX_CTRL,
+ MSR_TSX_FORCE_ABORT,
+ MSR_IA32_MCU_OPT_CTRL,
+ MSR_AMD64_LS_CFG,
+ };
+
+ msr_build_context(spec_msr_id, ARRAY_SIZE(spec_msr_id));
+}
+
+static int pm_check_save_msr(void)
+{
+ dmi_check_system(msr_save_dmi_table);
+ pm_cpu_check(msr_save_cpu_table);
+ pm_save_spec_msr();
+
+ return 0;
+}
+
+device_initcall(pm_check_save_msr);
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c
new file mode 100644
index 000000000..afc4ed7b1
--- /dev/null
+++ b/arch/x86/power/hibernate_32.c
@@ -0,0 +1,175 @@
+/*
+ * Hibernation support specific for i386 - temporary page tables
+ *
+ * Distribute under GPLv2
+ *
+ * Copyright (c) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+ */
+
+#include <linux/gfp.h>
+#include <linux/suspend.h>
+#include <linux/bootmem.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/mmzone.h>
+#include <asm/sections.h>
+
+/* Defined in hibernate_asm_32.S */
+extern int restore_image(void);
+
+/* Pointer to the temporary resume page tables */
+pgd_t *resume_pg_dir;
+
+/* The following three functions are based on the analogous code in
+ * arch/x86/mm/init_32.c
+ */
+
+/*
+ * Create a middle page table on a resume-safe page and put a pointer to it in
+ * the given global directory entry. This only returns the gd entry
+ * in non-PAE compilation mode, since the middle layer is folded.
+ */
+static pmd_t *resume_one_md_table_init(pgd_t *pgd)
+{
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd_table;
+
+#ifdef CONFIG_X86_PAE
+ pmd_table = (pmd_t *)get_safe_page(GFP_ATOMIC);
+ if (!pmd_table)
+ return NULL;
+
+ set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
+ p4d = p4d_offset(pgd, 0);
+ pud = pud_offset(p4d, 0);
+
+ BUG_ON(pmd_table != pmd_offset(pud, 0));
+#else
+ p4d = p4d_offset(pgd, 0);
+ pud = pud_offset(p4d, 0);
+ pmd_table = pmd_offset(pud, 0);
+#endif
+
+ return pmd_table;
+}
+
+/*
+ * Create a page table on a resume-safe page and place a pointer to it in
+ * a middle page directory entry.
+ */
+static pte_t *resume_one_page_table_init(pmd_t *pmd)
+{
+ if (pmd_none(*pmd)) {
+ pte_t *page_table = (pte_t *)get_safe_page(GFP_ATOMIC);
+ if (!page_table)
+ return NULL;
+
+ set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
+
+ BUG_ON(page_table != pte_offset_kernel(pmd, 0));
+
+ return page_table;
+ }
+
+ return pte_offset_kernel(pmd, 0);
+}
+
+/*
+ * This maps the physical memory to kernel virtual address space, a total
+ * of max_low_pfn pages, by creating page tables starting from address
+ * PAGE_OFFSET. The page tables are allocated out of resume-safe pages.
+ */
+static int resume_physical_mapping_init(pgd_t *pgd_base)
+{
+ unsigned long pfn;
+ pgd_t *pgd;
+ pmd_t *pmd;
+ pte_t *pte;
+ int pgd_idx, pmd_idx;
+
+ pgd_idx = pgd_index(PAGE_OFFSET);
+ pgd = pgd_base + pgd_idx;
+ pfn = 0;
+
+ for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
+ pmd = resume_one_md_table_init(pgd);
+ if (!pmd)
+ return -ENOMEM;
+
+ if (pfn >= max_low_pfn)
+ continue;
+
+ for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD; pmd++, pmd_idx++) {
+ if (pfn >= max_low_pfn)
+ break;
+
+ /* Map with big pages if possible, otherwise create
+ * normal page tables.
+ * NOTE: We can mark everything as executable here
+ */
+ if (boot_cpu_has(X86_FEATURE_PSE)) {
+ set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
+ pfn += PTRS_PER_PTE;
+ } else {
+ pte_t *max_pte;
+
+ pte = resume_one_page_table_init(pmd);
+ if (!pte)
+ return -ENOMEM;
+
+ max_pte = pte + PTRS_PER_PTE;
+ for (; pte < max_pte; pte++, pfn++) {
+ if (pfn >= max_low_pfn)
+ break;
+
+ set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+static inline void resume_init_first_level_page_table(pgd_t *pg_dir)
+{
+#ifdef CONFIG_X86_PAE
+ int i;
+
+ /* Init entries of the first-level page table to the zero page */
+ for (i = 0; i < PTRS_PER_PGD; i++)
+ set_pgd(pg_dir + i,
+ __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
+#endif
+}
+
+asmlinkage int swsusp_arch_resume(void)
+{
+ int error;
+
+ resume_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC);
+ if (!resume_pg_dir)
+ return -ENOMEM;
+
+ resume_init_first_level_page_table(resume_pg_dir);
+ error = resume_physical_mapping_init(resume_pg_dir);
+ if (error)
+ return error;
+
+ /* We have got enough memory and from now on we cannot recover */
+ restore_image();
+ return 0;
+}
+
+/*
+ * pfn_is_nosave - check if given pfn is in the 'nosave' section
+ */
+
+int pfn_is_nosave(unsigned long pfn)
+{
+ unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT;
+ unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;
+ return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
+}
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
new file mode 100644
index 000000000..6c3ec193a
--- /dev/null
+++ b/arch/x86/power/hibernate_64.c
@@ -0,0 +1,397 @@
+/*
+ * Hibernation support for x86-64
+ *
+ * Distribute under GPLv2
+ *
+ * Copyright (c) 2007 Rafael J. Wysocki <rjw@sisk.pl>
+ * Copyright (c) 2002 Pavel Machek <pavel@ucw.cz>
+ * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
+ */
+
+#include <linux/gfp.h>
+#include <linux/smp.h>
+#include <linux/suspend.h>
+#include <linux/scatterlist.h>
+#include <linux/kdebug.h>
+#include <linux/cpu.h>
+
+#include <crypto/hash.h>
+
+#include <asm/e820/api.h>
+#include <asm/init.h>
+#include <asm/proto.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/mtrr.h>
+#include <asm/sections.h>
+#include <asm/suspend.h>
+#include <asm/tlbflush.h>
+
+/* Defined in hibernate_asm_64.S */
+extern asmlinkage __visible int restore_image(void);
+
+/*
+ * Address to jump to in the last phase of restore in order to get to the image
+ * kernel's text (this value is passed in the image header).
+ */
+unsigned long restore_jump_address __visible;
+unsigned long jump_address_phys;
+
+/*
+ * Value of the cr3 register from before the hibernation (this value is passed
+ * in the image header).
+ */
+unsigned long restore_cr3 __visible;
+
+unsigned long temp_level4_pgt __visible;
+
+unsigned long relocated_restore_code __visible;
+
+static int set_up_temporary_text_mapping(pgd_t *pgd)
+{
+ pmd_t *pmd;
+ pud_t *pud;
+ p4d_t *p4d = NULL;
+ pgprot_t pgtable_prot = __pgprot(_KERNPG_TABLE);
+ pgprot_t pmd_text_prot = __pgprot(__PAGE_KERNEL_LARGE_EXEC);
+
+ /* Filter out unsupported __PAGE_KERNEL* bits: */
+ pgprot_val(pmd_text_prot) &= __default_kernel_pte_mask;
+ pgprot_val(pgtable_prot) &= __default_kernel_pte_mask;
+
+ /*
+ * The new mapping only has to cover the page containing the image
+ * kernel's entry point (jump_address_phys), because the switch over to
+ * it is carried out by relocated code running from a page allocated
+ * specifically for this purpose and covered by the identity mapping, so
+ * the temporary kernel text mapping is only needed for the final jump.
+ * Moreover, in that mapping the virtual address of the image kernel's
+ * entry point must be the same as its virtual address in the image
+ * kernel (restore_jump_address), so the image kernel's
+ * restore_registers() code doesn't find itself in a different area of
+ * the virtual address space after switching over to the original page
+ * tables used by the image kernel.
+ */
+
+ if (pgtable_l5_enabled()) {
+ p4d = (p4d_t *)get_safe_page(GFP_ATOMIC);
+ if (!p4d)
+ return -ENOMEM;
+ }
+
+ pud = (pud_t *)get_safe_page(GFP_ATOMIC);
+ if (!pud)
+ return -ENOMEM;
+
+ pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
+ if (!pmd)
+ return -ENOMEM;
+
+ set_pmd(pmd + pmd_index(restore_jump_address),
+ __pmd((jump_address_phys & PMD_MASK) | pgprot_val(pmd_text_prot)));
+ set_pud(pud + pud_index(restore_jump_address),
+ __pud(__pa(pmd) | pgprot_val(pgtable_prot)));
+ if (p4d) {
+ p4d_t new_p4d = __p4d(__pa(pud) | pgprot_val(pgtable_prot));
+ pgd_t new_pgd = __pgd(__pa(p4d) | pgprot_val(pgtable_prot));
+
+ set_p4d(p4d + p4d_index(restore_jump_address), new_p4d);
+ set_pgd(pgd + pgd_index(restore_jump_address), new_pgd);
+ } else {
+ /* No p4d for 4-level paging: point the pgd to the pud page table */
+ pgd_t new_pgd = __pgd(__pa(pud) | pgprot_val(pgtable_prot));
+ set_pgd(pgd + pgd_index(restore_jump_address), new_pgd);
+ }
+
+ return 0;
+}
+
+static void *alloc_pgt_page(void *context)
+{
+ return (void *)get_safe_page(GFP_ATOMIC);
+}
+
+static int set_up_temporary_mappings(void)
+{
+ struct x86_mapping_info info = {
+ .alloc_pgt_page = alloc_pgt_page,
+ .page_flag = __PAGE_KERNEL_LARGE_EXEC,
+ .offset = __PAGE_OFFSET,
+ };
+ unsigned long mstart, mend;
+ pgd_t *pgd;
+ int result;
+ int i;
+
+ pgd = (pgd_t *)get_safe_page(GFP_ATOMIC);
+ if (!pgd)
+ return -ENOMEM;
+
+ /* Prepare a temporary mapping for the kernel text */
+ result = set_up_temporary_text_mapping(pgd);
+ if (result)
+ return result;
+
+ /* Set up the direct mapping from scratch */
+ for (i = 0; i < nr_pfn_mapped; i++) {
+ mstart = pfn_mapped[i].start << PAGE_SHIFT;
+ mend = pfn_mapped[i].end << PAGE_SHIFT;
+
+ result = kernel_ident_mapping_init(&info, pgd, mstart, mend);
+ if (result)
+ return result;
+ }
+
+ temp_level4_pgt = __pa(pgd);
+ return 0;
+}
+
+static int relocate_restore_code(void)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ relocated_restore_code = get_safe_page(GFP_ATOMIC);
+ if (!relocated_restore_code)
+ return -ENOMEM;
+
+ memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE);
+
+ /* Make the page containing the relocated code executable */
+ pgd = (pgd_t *)__va(read_cr3_pa()) +
+ pgd_index(relocated_restore_code);
+ p4d = p4d_offset(pgd, relocated_restore_code);
+ if (p4d_large(*p4d)) {
+ set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
+ goto out;
+ }
+ pud = pud_offset(p4d, relocated_restore_code);
+ if (pud_large(*pud)) {
+ set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX));
+ goto out;
+ }
+ pmd = pmd_offset(pud, relocated_restore_code);
+ if (pmd_large(*pmd)) {
+ set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX));
+ goto out;
+ }
+ pte = pte_offset_kernel(pmd, relocated_restore_code);
+ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX));
+out:
+ __flush_tlb_all();
+ return 0;
+}
+
+asmlinkage int swsusp_arch_resume(void)
+{
+ int error;
+
+ /* We have got enough memory and from now on we cannot recover */
+ error = set_up_temporary_mappings();
+ if (error)
+ return error;
+
+ error = relocate_restore_code();
+ if (error)
+ return error;
+
+ restore_image();
+ return 0;
+}
+
+/*
+ * pfn_is_nosave - check if given pfn is in the 'nosave' section
+ */
+
+int pfn_is_nosave(unsigned long pfn)
+{
+ unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT;
+ unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT;
+ return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
+}
+
+#define MD5_DIGEST_SIZE 16
+
+struct restore_data_record {
+ unsigned long jump_address;
+ unsigned long jump_address_phys;
+ unsigned long cr3;
+ unsigned long magic;
+ u8 e820_digest[MD5_DIGEST_SIZE];
+};
+
+#define RESTORE_MAGIC 0x23456789ABCDEF01UL
+
+#if IS_BUILTIN(CONFIG_CRYPTO_MD5)
+/**
+ * get_e820_md5 - calculate md5 according to given e820 table
+ *
+ * @table: the e820 table to be calculated
+ * @buf: the md5 result to be stored to
+ */
+static int get_e820_md5(struct e820_table *table, void *buf)
+{
+ struct crypto_shash *tfm;
+ struct shash_desc *desc;
+ int size;
+ int ret = 0;
+
+ tfm = crypto_alloc_shash("md5", 0, 0);
+ if (IS_ERR(tfm))
+ return -ENOMEM;
+
+ desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm),
+ GFP_KERNEL);
+ if (!desc) {
+ ret = -ENOMEM;
+ goto free_tfm;
+ }
+
+ desc->tfm = tfm;
+ desc->flags = 0;
+
+ size = offsetof(struct e820_table, entries) +
+ sizeof(struct e820_entry) * table->nr_entries;
+
+ if (crypto_shash_digest(desc, (u8 *)table, size, buf))
+ ret = -EINVAL;
+
+ kzfree(desc);
+
+free_tfm:
+ crypto_free_shash(tfm);
+ return ret;
+}
+
+static int hibernation_e820_save(void *buf)
+{
+ return get_e820_md5(e820_table_firmware, buf);
+}
+
+static bool hibernation_e820_mismatch(void *buf)
+{
+ int ret;
+ u8 result[MD5_DIGEST_SIZE];
+
+ memset(result, 0, MD5_DIGEST_SIZE);
+ /* If there is no digest in suspend kernel, let it go. */
+ if (!memcmp(result, buf, MD5_DIGEST_SIZE))
+ return false;
+
+ ret = get_e820_md5(e820_table_firmware, result);
+ if (ret)
+ return true;
+
+ return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false;
+}
+#else
+static int hibernation_e820_save(void *buf)
+{
+ return 0;
+}
+
+static bool hibernation_e820_mismatch(void *buf)
+{
+ /* If md5 is not builtin for restore kernel, let it go. */
+ return false;
+}
+#endif
+
+/**
+ * arch_hibernation_header_save - populate the architecture specific part
+ * of a hibernation image header
+ * @addr: address to save the data at
+ */
+int arch_hibernation_header_save(void *addr, unsigned int max_size)
+{
+ struct restore_data_record *rdr = addr;
+
+ if (max_size < sizeof(struct restore_data_record))
+ return -EOVERFLOW;
+ rdr->jump_address = (unsigned long)restore_registers;
+ rdr->jump_address_phys = __pa_symbol(restore_registers);
+
+ /*
+ * The restore code fixes up CR3 and CR4 in the following sequence:
+ *
+ * [in hibernation asm]
+ * 1. CR3 <= temporary page tables
+ * 2. CR4 <= mmu_cr4_features (from the kernel that restores us)
+ * 3. CR3 <= rdr->cr3
+ * 4. CR4 <= mmu_cr4_features (from us, i.e. the image kernel)
+ * [in restore_processor_state()]
+ * 5. CR4 <= saved CR4
+ * 6. CR3 <= saved CR3
+ *
+ * Our mmu_cr4_features has CR4.PCIDE=0, and toggling
+ * CR4.PCIDE while CR3's PCID bits are nonzero is illegal, so
+ * rdr->cr3 needs to point to valid page tables but must not
+ * have any of the PCID bits set.
+ */
+ rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK;
+
+ rdr->magic = RESTORE_MAGIC;
+
+ return hibernation_e820_save(rdr->e820_digest);
+}
+
+/**
+ * arch_hibernation_header_restore - read the architecture specific data
+ * from the hibernation image header
+ * @addr: address to read the data from
+ */
+int arch_hibernation_header_restore(void *addr)
+{
+ struct restore_data_record *rdr = addr;
+
+ restore_jump_address = rdr->jump_address;
+ jump_address_phys = rdr->jump_address_phys;
+ restore_cr3 = rdr->cr3;
+
+ if (rdr->magic != RESTORE_MAGIC) {
+ pr_crit("Unrecognized hibernate image header format!\n");
+ return -EINVAL;
+ }
+
+ if (hibernation_e820_mismatch(rdr->e820_digest)) {
+ pr_crit("Hibernate inconsistent memory map detected!\n");
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+int arch_resume_nosmt(void)
+{
+ int ret = 0;
+ /*
+ * We reached this while coming out of hibernation. This means
+ * that SMT siblings are sleeping in hlt, as mwait is not safe
+ * against control transition during resume (see comment in
+ * hibernate_resume_nonboot_cpu_disable()).
+ *
+ * If the resumed kernel has SMT disabled, we have to take all the
+ * SMT siblings out of hlt, and offline them again so that they
+ * end up in mwait proper.
+ *
+ * Called with hotplug disabled.
+ */
+ cpu_hotplug_enable();
+ if (cpu_smt_control == CPU_SMT_DISABLED ||
+ cpu_smt_control == CPU_SMT_FORCE_DISABLED) {
+ enum cpuhp_smt_control old = cpu_smt_control;
+
+ ret = cpuhp_smt_enable();
+ if (ret)
+ goto out;
+ ret = cpuhp_smt_disable(old);
+ if (ret)
+ goto out;
+ }
+out:
+ cpu_hotplug_disable();
+ return ret;
+}
diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S
new file mode 100644
index 000000000..6e56815e1
--- /dev/null
+++ b/arch/x86/power/hibernate_asm_32.S
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This may not use any stack, nor any variable that is not "NoSave":
+ *
+ * Its rewriting one kernel image with another. What is stack in "old"
+ * image could very well be data page in "new" image, and overwriting
+ * your own stack under you is bad idea.
+ */
+
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page_types.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
+
+.text
+
+ENTRY(swsusp_arch_suspend)
+ movl %esp, saved_context_esp
+ movl %ebx, saved_context_ebx
+ movl %ebp, saved_context_ebp
+ movl %esi, saved_context_esi
+ movl %edi, saved_context_edi
+ pushfl
+ popl saved_context_eflags
+
+ call swsusp_save
+ ret
+
+ENTRY(restore_image)
+ movl mmu_cr4_features, %ecx
+ movl resume_pg_dir, %eax
+ subl $__PAGE_OFFSET, %eax
+ movl %eax, %cr3
+
+ jecxz 1f # cr4 Pentium and higher, skip if zero
+ andl $~(X86_CR4_PGE), %ecx
+ movl %ecx, %cr4; # turn off PGE
+ movl %cr3, %eax; # flush TLB
+ movl %eax, %cr3
+1:
+ movl restore_pblist, %edx
+ .p2align 4,,7
+
+copy_loop:
+ testl %edx, %edx
+ jz done
+
+ movl pbe_address(%edx), %esi
+ movl pbe_orig_address(%edx), %edi
+
+ movl $1024, %ecx
+ rep
+ movsl
+
+ movl pbe_next(%edx), %edx
+ jmp copy_loop
+ .p2align 4,,7
+
+done:
+ /* go back to the original page tables */
+ movl $swapper_pg_dir, %eax
+ subl $__PAGE_OFFSET, %eax
+ movl %eax, %cr3
+ movl mmu_cr4_features, %ecx
+ jecxz 1f # cr4 Pentium and higher, skip if zero
+ movl %ecx, %cr4; # turn PGE back on
+1:
+
+ movl saved_context_esp, %esp
+ movl saved_context_ebp, %ebp
+ movl saved_context_ebx, %ebx
+ movl saved_context_esi, %esi
+ movl saved_context_edi, %edi
+
+ pushl saved_context_eflags
+ popfl
+
+ /* Saved in save_processor_state. */
+ movl $saved_context, %eax
+ lgdt saved_context_gdt_desc(%eax)
+
+ xorl %eax, %eax
+
+ ret
diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S
new file mode 100644
index 000000000..fd369a6e9
--- /dev/null
+++ b/arch/x86/power/hibernate_asm_64.S
@@ -0,0 +1,146 @@
+/*
+ * Hibernation support for x86-64
+ *
+ * Distribute under GPLv2.
+ *
+ * Copyright 2007 Rafael J. Wysocki <rjw@sisk.pl>
+ * Copyright 2005 Andi Kleen <ak@suse.de>
+ * Copyright 2004 Pavel Machek <pavel@suse.cz>
+ *
+ * swsusp_arch_resume must not use any stack or any nonlocal variables while
+ * copying pages:
+ *
+ * Its rewriting one kernel image with another. What is stack in "old"
+ * image could very well be data page in "new" image, and overwriting
+ * your own stack under you is bad idea.
+ */
+
+ .text
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page_types.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
+#include <asm/frame.h>
+
+ENTRY(swsusp_arch_suspend)
+ movq $saved_context, %rax
+ movq %rsp, pt_regs_sp(%rax)
+ movq %rbp, pt_regs_bp(%rax)
+ movq %rsi, pt_regs_si(%rax)
+ movq %rdi, pt_regs_di(%rax)
+ movq %rbx, pt_regs_bx(%rax)
+ movq %rcx, pt_regs_cx(%rax)
+ movq %rdx, pt_regs_dx(%rax)
+ movq %r8, pt_regs_r8(%rax)
+ movq %r9, pt_regs_r9(%rax)
+ movq %r10, pt_regs_r10(%rax)
+ movq %r11, pt_regs_r11(%rax)
+ movq %r12, pt_regs_r12(%rax)
+ movq %r13, pt_regs_r13(%rax)
+ movq %r14, pt_regs_r14(%rax)
+ movq %r15, pt_regs_r15(%rax)
+ pushfq
+ popq pt_regs_flags(%rax)
+
+ /* save cr3 */
+ movq %cr3, %rax
+ movq %rax, restore_cr3(%rip)
+
+ FRAME_BEGIN
+ call swsusp_save
+ FRAME_END
+ ret
+ENDPROC(swsusp_arch_suspend)
+
+ENTRY(restore_image)
+ /* prepare to jump to the image kernel */
+ movq restore_jump_address(%rip), %r8
+ movq restore_cr3(%rip), %r9
+
+ /* prepare to switch to temporary page tables */
+ movq temp_level4_pgt(%rip), %rax
+ movq mmu_cr4_features(%rip), %rbx
+
+ /* prepare to copy image data to their original locations */
+ movq restore_pblist(%rip), %rdx
+
+ /* jump to relocated restore code */
+ movq relocated_restore_code(%rip), %rcx
+ jmpq *%rcx
+
+ /* code below has been relocated to a safe page */
+ENTRY(core_restore_code)
+ /* switch to temporary page tables */
+ movq %rax, %cr3
+ /* flush TLB */
+ movq %rbx, %rcx
+ andq $~(X86_CR4_PGE), %rcx
+ movq %rcx, %cr4; # turn off PGE
+ movq %cr3, %rcx; # flush TLB
+ movq %rcx, %cr3;
+ movq %rbx, %cr4; # turn PGE back on
+.Lloop:
+ testq %rdx, %rdx
+ jz .Ldone
+
+ /* get addresses from the pbe and copy the page */
+ movq pbe_address(%rdx), %rsi
+ movq pbe_orig_address(%rdx), %rdi
+ movq $(PAGE_SIZE >> 3), %rcx
+ rep
+ movsq
+
+ /* progress to the next pbe */
+ movq pbe_next(%rdx), %rdx
+ jmp .Lloop
+
+.Ldone:
+ /* jump to the restore_registers address from the image header */
+ jmpq *%r8
+
+ /* code below belongs to the image kernel */
+ .align PAGE_SIZE
+ENTRY(restore_registers)
+ /* go back to the original page tables */
+ movq %r9, %cr3
+
+ /* Flush TLB, including "global" things (vmalloc) */
+ movq mmu_cr4_features(%rip), %rax
+ movq %rax, %rdx
+ andq $~(X86_CR4_PGE), %rdx
+ movq %rdx, %cr4; # turn off PGE
+ movq %cr3, %rcx; # flush TLB
+ movq %rcx, %cr3
+ movq %rax, %cr4; # turn PGE back on
+
+ /* We don't restore %rax, it must be 0 anyway */
+ movq $saved_context, %rax
+ movq pt_regs_sp(%rax), %rsp
+ movq pt_regs_bp(%rax), %rbp
+ movq pt_regs_si(%rax), %rsi
+ movq pt_regs_di(%rax), %rdi
+ movq pt_regs_bx(%rax), %rbx
+ movq pt_regs_cx(%rax), %rcx
+ movq pt_regs_dx(%rax), %rdx
+ movq pt_regs_r8(%rax), %r8
+ movq pt_regs_r9(%rax), %r9
+ movq pt_regs_r10(%rax), %r10
+ movq pt_regs_r11(%rax), %r11
+ movq pt_regs_r12(%rax), %r12
+ movq pt_regs_r13(%rax), %r13
+ movq pt_regs_r14(%rax), %r14
+ movq pt_regs_r15(%rax), %r15
+ pushq pt_regs_flags(%rax)
+ popfq
+
+ /* Saved in save_processor_state. */
+ lgdt saved_context_gdt_desc(%rax)
+
+ xorl %eax, %eax
+
+ /* tell the hibernation core that we've just restored the memory */
+ movq %rax, in_suspend(%rip)
+
+ ret
+ENDPROC(restore_registers)
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
new file mode 100644
index 000000000..2cfa0caef
--- /dev/null
+++ b/arch/x86/purgatory/Makefile
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: GPL-2.0
+OBJECT_FILES_NON_STANDARD := y
+
+purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string.o
+
+targets += $(purgatory-y)
+PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y))
+
+$(obj)/string.o: $(srctree)/arch/x86/boot/compressed/string.c FORCE
+ $(call if_changed_rule,cc_o_c)
+
+$(obj)/sha256.o: $(srctree)/lib/sha256.c FORCE
+ $(call if_changed_rule,cc_o_c)
+
+LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib
+targets += purgatory.ro
+
+# Sanitizer, etc. runtimes are unavailable and cannot be linked here.
+GCOV_PROFILE := n
+KASAN_SANITIZE := n
+UBSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+
+# These are adjustments to the compiler flags used for objects that
+# make up the standalone purgatory.ro
+
+PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel
+PURGATORY_CFLAGS := -mcmodel=large -ffreestanding -fno-zero-initialized-in-bss
+PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) -DDISABLE_BRANCH_PROFILING
+
+# Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That
+# in turn leaves some undefined symbols like __fentry__ in purgatory and not
+# sure how to relocate those.
+ifdef CONFIG_FUNCTION_TRACER
+PURGATORY_CFLAGS_REMOVE += $(CC_FLAGS_FTRACE)
+endif
+
+ifdef CONFIG_STACKPROTECTOR
+PURGATORY_CFLAGS_REMOVE += -fstack-protector
+endif
+
+ifdef CONFIG_STACKPROTECTOR_STRONG
+PURGATORY_CFLAGS_REMOVE += -fstack-protector-strong
+endif
+
+ifdef CONFIG_RETPOLINE
+PURGATORY_CFLAGS_REMOVE += $(RETPOLINE_CFLAGS)
+endif
+
+CFLAGS_REMOVE_purgatory.o += $(PURGATORY_CFLAGS_REMOVE)
+CFLAGS_purgatory.o += $(PURGATORY_CFLAGS)
+
+CFLAGS_REMOVE_sha256.o += $(PURGATORY_CFLAGS_REMOVE)
+CFLAGS_sha256.o += $(PURGATORY_CFLAGS)
+
+CFLAGS_REMOVE_string.o += $(PURGATORY_CFLAGS_REMOVE)
+CFLAGS_string.o += $(PURGATORY_CFLAGS)
+
+$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
+ $(call if_changed,ld)
+
+targets += kexec-purgatory.c
+
+quiet_cmd_bin2c = BIN2C $@
+ cmd_bin2c = $(objtree)/scripts/bin2c kexec_purgatory < $< > $@
+
+$(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE
+ $(call if_changed,bin2c)
+
+obj-$(CONFIG_KEXEC_FILE) += kexec-purgatory.o
diff --git a/arch/x86/purgatory/entry64.S b/arch/x86/purgatory/entry64.S
new file mode 100644
index 000000000..d1a4291d3
--- /dev/null
+++ b/arch/x86/purgatory/entry64.S
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2003,2004 Eric Biederman (ebiederm@xmission.com)
+ * Copyright (C) 2014 Red Hat Inc.
+
+ * Author(s): Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This code has been taken from kexec-tools.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+ .text
+ .balign 16
+ .code64
+ .globl entry64, entry64_regs
+
+
+entry64:
+ /* Setup a gdt that should be preserved */
+ lgdt gdt(%rip)
+
+ /* load the data segments */
+ movl $0x18, %eax /* data segment */
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %ss
+ movl %eax, %fs
+ movl %eax, %gs
+
+ /* Setup new stack */
+ leaq stack_init(%rip), %rsp
+ pushq $0x10 /* CS */
+ leaq new_cs_exit(%rip), %rax
+ pushq %rax
+ lretq
+new_cs_exit:
+
+ /* Load the registers */
+ movq rax(%rip), %rax
+ movq rbx(%rip), %rbx
+ movq rcx(%rip), %rcx
+ movq rdx(%rip), %rdx
+ movq rsi(%rip), %rsi
+ movq rdi(%rip), %rdi
+ movq rsp(%rip), %rsp
+ movq rbp(%rip), %rbp
+ movq r8(%rip), %r8
+ movq r9(%rip), %r9
+ movq r10(%rip), %r10
+ movq r11(%rip), %r11
+ movq r12(%rip), %r12
+ movq r13(%rip), %r13
+ movq r14(%rip), %r14
+ movq r15(%rip), %r15
+
+ /* Jump to the new code... */
+ jmpq *rip(%rip)
+
+ .section ".rodata"
+ .balign 4
+entry64_regs:
+rax: .quad 0x0
+rcx: .quad 0x0
+rdx: .quad 0x0
+rbx: .quad 0x0
+rsp: .quad 0x0
+rbp: .quad 0x0
+rsi: .quad 0x0
+rdi: .quad 0x0
+r8: .quad 0x0
+r9: .quad 0x0
+r10: .quad 0x0
+r11: .quad 0x0
+r12: .quad 0x0
+r13: .quad 0x0
+r14: .quad 0x0
+r15: .quad 0x0
+rip: .quad 0x0
+ .size entry64_regs, . - entry64_regs
+
+ /* GDT */
+ .section ".rodata"
+ .balign 16
+gdt:
+ /* 0x00 unusable segment
+ * 0x08 unused
+ * so use them as gdt ptr
+ */
+ .word gdt_end - gdt - 1
+ .quad gdt
+ .word 0, 0, 0
+
+ /* 0x10 4GB flat code segment */
+ .word 0xFFFF, 0x0000, 0x9A00, 0x00AF
+
+ /* 0x18 4GB flat data segment */
+ .word 0xFFFF, 0x0000, 0x9200, 0x00CF
+gdt_end:
+stack: .quad 0, 0
+stack_init:
diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c
new file mode 100644
index 000000000..7971f7a8a
--- /dev/null
+++ b/arch/x86/purgatory/purgatory.c
@@ -0,0 +1,78 @@
+/*
+ * purgatory: Runs between two kernels
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * Author:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/bug.h>
+#include <linux/sha256.h>
+#include <asm/purgatory.h>
+
+#include "../boot/string.h"
+
+unsigned long purgatory_backup_dest __section(.kexec-purgatory);
+unsigned long purgatory_backup_src __section(.kexec-purgatory);
+unsigned long purgatory_backup_sz __section(.kexec-purgatory);
+
+u8 purgatory_sha256_digest[SHA256_DIGEST_SIZE] __section(.kexec-purgatory);
+
+struct kexec_sha_region purgatory_sha_regions[KEXEC_SEGMENT_MAX] __section(.kexec-purgatory);
+
+/*
+ * On x86, second kernel requries first 640K of memory to boot. Copy
+ * first 640K to a backup region in reserved memory range so that second
+ * kernel can use first 640K.
+ */
+static int copy_backup_region(void)
+{
+ if (purgatory_backup_dest) {
+ memcpy((void *)purgatory_backup_dest,
+ (void *)purgatory_backup_src, purgatory_backup_sz);
+ }
+ return 0;
+}
+
+static int verify_sha256_digest(void)
+{
+ struct kexec_sha_region *ptr, *end;
+ u8 digest[SHA256_DIGEST_SIZE];
+ struct sha256_state sctx;
+
+ sha256_init(&sctx);
+ end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions);
+
+ for (ptr = purgatory_sha_regions; ptr < end; ptr++)
+ sha256_update(&sctx, (uint8_t *)(ptr->start), ptr->len);
+
+ sha256_final(&sctx, digest);
+
+ if (memcmp(digest, purgatory_sha256_digest, sizeof(digest)))
+ return 1;
+
+ return 0;
+}
+
+void purgatory(void)
+{
+ int ret;
+
+ ret = verify_sha256_digest();
+ if (ret) {
+ /* loop forever */
+ for (;;)
+ ;
+ }
+ copy_backup_region();
+}
+
+/*
+ * Defined in order to reuse memcpy() and memset() from
+ * arch/x86/boot/compressed/string.c
+ */
+void warn(const char *msg) {}
diff --git a/arch/x86/purgatory/setup-x86_64.S b/arch/x86/purgatory/setup-x86_64.S
new file mode 100644
index 000000000..dfae9b9e6
--- /dev/null
+++ b/arch/x86/purgatory/setup-x86_64.S
@@ -0,0 +1,59 @@
+/*
+ * purgatory: setup code
+ *
+ * Copyright (C) 2003,2004 Eric Biederman (ebiederm@xmission.com)
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * This code has been taken from kexec-tools.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+#include <asm/purgatory.h>
+
+ .text
+ .globl purgatory_start
+ .balign 16
+purgatory_start:
+ .code64
+
+ /* Load a gdt so I know what the segment registers are */
+ lgdt gdt(%rip)
+
+ /* load the data segments */
+ movl $0x18, %eax /* data segment */
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %ss
+ movl %eax, %fs
+ movl %eax, %gs
+
+ /* Setup a stack */
+ leaq lstack_end(%rip), %rsp
+
+ /* Call the C code */
+ call purgatory
+ jmp entry64
+
+ .section ".rodata"
+ .balign 16
+gdt: /* 0x00 unusable segment
+ * 0x08 unused
+ * so use them as the gdt ptr
+ */
+ .word gdt_end - gdt - 1
+ .quad gdt
+ .word 0, 0, 0
+
+ /* 0x10 4GB flat code segment */
+ .word 0xFFFF, 0x0000, 0x9A00, 0x00AF
+
+ /* 0x18 4GB flat data segment */
+ .word 0xFFFF, 0x0000, 0x9200, 0x00CF
+gdt_end:
+
+ .bss
+ .balign 4096
+lstack:
+ .skip 4096
+lstack_end:
diff --git a/arch/x86/purgatory/stack.S b/arch/x86/purgatory/stack.S
new file mode 100644
index 000000000..50a4147f9
--- /dev/null
+++ b/arch/x86/purgatory/stack.S
@@ -0,0 +1,19 @@
+/*
+ * purgatory: stack
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+ /* A stack for the loaded kernel.
+ * Separate and in the data section so it can be prepopulated.
+ */
+ .data
+ .balign 4096
+ .globl stack, stack_end
+
+stack:
+ .skip 4096
+stack_end:
diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig
new file mode 100644
index 000000000..a9c3db125
--- /dev/null
+++ b/arch/x86/ras/Kconfig
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+config RAS_CEC
+ bool "Correctable Errors Collector"
+ depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS
+ ---help---
+ This is a small cache which collects correctable memory errors per 4K
+ page PFN and counts their repeated occurrence. Once the counter for a
+ PFN overflows, we try to soft-offline that page as we take it to mean
+ that it has reached a relatively high error count and would probably
+ be best if we don't use it anymore.
+
+ Bear in mind that this is absolutely useless if your platform doesn't
+ have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS.
diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile
new file mode 100644
index 000000000..682c89575
--- /dev/null
+++ b/arch/x86/realmode/Makefile
@@ -0,0 +1,20 @@
+#
+# arch/x86/realmode/Makefile
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License. See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+#
+KASAN_SANITIZE := n
+OBJECT_FILES_NON_STANDARD := y
+
+subdir- := rm
+
+obj-y += init.o
+obj-y += rmpiggy.o
+
+$(obj)/rmpiggy.o: $(obj)/rm/realmode.bin
+
+$(obj)/rm/realmode.bin: FORCE
+ $(Q)$(MAKE) $(build)=$(obj)/rm $@
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
new file mode 100644
index 000000000..7bfb9af57
--- /dev/null
+++ b/arch/x86/realmode/init.c
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/memblock.h>
+#include <linux/mem_encrypt.h>
+
+#include <asm/set_memory.h>
+#include <asm/pgtable.h>
+#include <asm/realmode.h>
+#include <asm/tlbflush.h>
+
+struct real_mode_header *real_mode_header;
+u32 *trampoline_cr4_features;
+
+/* Hold the pgd entry used on booting additional CPUs */
+pgd_t trampoline_pgd_entry;
+
+void __init set_real_mode_mem(phys_addr_t mem, size_t size)
+{
+ void *base = __va(mem);
+
+ real_mode_header = (struct real_mode_header *) base;
+}
+
+void __init reserve_real_mode(void)
+{
+ phys_addr_t mem;
+ size_t size = real_mode_size_needed();
+
+ if (!size)
+ return;
+
+ WARN_ON(slab_is_available());
+
+ /* Has to be under 1M so we can execute real-mode AP code. */
+ mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
+ if (!mem) {
+ pr_info("No sub-1M memory is available for the trampoline\n");
+ return;
+ }
+
+ memblock_reserve(mem, size);
+ set_real_mode_mem(mem, size);
+}
+
+static void __init setup_real_mode(void)
+{
+ u16 real_mode_seg;
+ const u32 *rel;
+ u32 count;
+ unsigned char *base;
+ unsigned long phys_base;
+ struct trampoline_header *trampoline_header;
+ size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
+#ifdef CONFIG_X86_64
+ u64 *trampoline_pgd;
+ u64 efer;
+ int i;
+#endif
+
+ base = (unsigned char *)real_mode_header;
+
+ /*
+ * If SME is active, the trampoline area will need to be in
+ * decrypted memory in order to bring up other processors
+ * successfully. This is not needed for SEV.
+ */
+ if (sme_active())
+ set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT);
+
+ memcpy(base, real_mode_blob, size);
+
+ phys_base = __pa(base);
+ real_mode_seg = phys_base >> 4;
+
+ rel = (u32 *) real_mode_relocs;
+
+ /* 16-bit segment relocations. */
+ count = *rel++;
+ while (count--) {
+ u16 *seg = (u16 *) (base + *rel++);
+ *seg = real_mode_seg;
+ }
+
+ /* 32-bit linear relocations. */
+ count = *rel++;
+ while (count--) {
+ u32 *ptr = (u32 *) (base + *rel++);
+ *ptr += phys_base;
+ }
+
+ /* Must be perfomed *after* relocation. */
+ trampoline_header = (struct trampoline_header *)
+ __va(real_mode_header->trampoline_header);
+
+#ifdef CONFIG_X86_32
+ trampoline_header->start = __pa_symbol(startup_32_smp);
+ trampoline_header->gdt_limit = __BOOT_DS + 7;
+ trampoline_header->gdt_base = __pa_symbol(boot_gdt);
+#else
+ /*
+ * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR
+ * so we need to mask it out.
+ */
+ rdmsrl(MSR_EFER, efer);
+ trampoline_header->efer = efer & ~EFER_LMA;
+
+ trampoline_header->start = (u64) secondary_startup_64;
+ trampoline_cr4_features = &trampoline_header->cr4;
+ *trampoline_cr4_features = mmu_cr4_features;
+
+ trampoline_header->flags = 0;
+ if (sme_active())
+ trampoline_header->flags |= TH_FLAGS_SME_ACTIVE;
+
+ trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
+
+ /* Map the real mode stub as virtual == physical */
+ trampoline_pgd[0] = trampoline_pgd_entry.pgd;
+
+ /*
+ * Include the entirety of the kernel mapping into the trampoline
+ * PGD. This way, all mappings present in the normal kernel page
+ * tables are usable while running on trampoline_pgd.
+ */
+ for (i = pgd_index(__PAGE_OFFSET); i < PTRS_PER_PGD; i++)
+ trampoline_pgd[i] = init_top_pgt[i].pgd;
+#endif
+}
+
+/*
+ * reserve_real_mode() gets called very early, to guarantee the
+ * availability of low memory. This is before the proper kernel page
+ * tables are set up, so we cannot set page permissions in that
+ * function. Also trampoline code will be executed by APs so we
+ * need to mark it executable at do_pre_smp_initcalls() at least,
+ * thus run it as a early_initcall().
+ */
+static void __init set_real_mode_permissions(void)
+{
+ unsigned char *base = (unsigned char *) real_mode_header;
+ size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
+
+ size_t ro_size =
+ PAGE_ALIGN(real_mode_header->ro_end) -
+ __pa(base);
+
+ size_t text_size =
+ PAGE_ALIGN(real_mode_header->ro_end) -
+ real_mode_header->text_start;
+
+ unsigned long text_start =
+ (unsigned long) __va(real_mode_header->text_start);
+
+ set_memory_nx((unsigned long) base, size >> PAGE_SHIFT);
+ set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT);
+ set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT);
+}
+
+static int __init init_real_mode(void)
+{
+ if (!real_mode_header)
+ panic("Real mode trampoline was not allocated");
+
+ setup_real_mode();
+ set_real_mode_permissions();
+
+ return 0;
+}
+early_initcall(init_real_mode);
diff --git a/arch/x86/realmode/rm/.gitignore b/arch/x86/realmode/rm/.gitignore
new file mode 100644
index 000000000..b6ed3a255
--- /dev/null
+++ b/arch/x86/realmode/rm/.gitignore
@@ -0,0 +1,3 @@
+pasyms.h
+realmode.lds
+realmode.relocs
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
new file mode 100644
index 000000000..96cb20de0
--- /dev/null
+++ b/arch/x86/realmode/rm/Makefile
@@ -0,0 +1,76 @@
+#
+# arch/x86/realmode/Makefile
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License. See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+#
+KASAN_SANITIZE := n
+OBJECT_FILES_NON_STANDARD := y
+
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT := n
+
+always := realmode.bin realmode.relocs
+
+wakeup-objs := wakeup_asm.o wakemain.o video-mode.o
+wakeup-objs += copy.o bioscall.o regs.o
+# The link order of the video-*.o modules can matter. In particular,
+# video-vga.o *must* be listed first, followed by video-vesa.o.
+# Hardware-specific drivers should follow in the order they should be
+# probed, and video-bios.o should typically be last.
+wakeup-objs += video-vga.o
+wakeup-objs += video-vesa.o
+wakeup-objs += video-bios.o
+
+realmode-y += header.o
+realmode-y += trampoline_$(BITS).o
+realmode-y += stack.o
+realmode-y += reboot.o
+realmode-$(CONFIG_ACPI_SLEEP) += $(wakeup-objs)
+
+targets += $(realmode-y)
+
+REALMODE_OBJS = $(addprefix $(obj)/,$(realmode-y))
+
+sed-pasyms := -n -r -e 's/^([0-9a-fA-F]+) [ABCDGRSTVW] (.+)$$/pa_\2 = \2;/p'
+
+quiet_cmd_pasyms = PASYMS $@
+ cmd_pasyms = $(NM) $(filter-out FORCE,$^) | \
+ sed $(sed-pasyms) | sort | uniq > $@
+
+targets += pasyms.h
+$(obj)/pasyms.h: $(REALMODE_OBJS) FORCE
+ $(call if_changed,pasyms)
+
+targets += realmode.lds
+$(obj)/realmode.lds: $(obj)/pasyms.h
+
+LDFLAGS_realmode.elf := -m elf_i386 --emit-relocs -T
+CPPFLAGS_realmode.lds += -P -C -I$(objtree)/$(obj)
+
+targets += realmode.elf
+$(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE
+ $(call if_changed,ld)
+
+OBJCOPYFLAGS_realmode.bin := -O binary
+
+targets += realmode.bin
+$(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs FORCE
+ $(call if_changed,objcopy)
+
+quiet_cmd_relocs = RELOCS $@
+ cmd_relocs = arch/x86/tools/relocs --realmode $< > $@
+
+targets += realmode.relocs
+$(obj)/realmode.relocs: $(obj)/realmode.elf FORCE
+ $(call if_changed,relocs)
+
+# ---------------------------------------------------------------------------
+
+KBUILD_CFLAGS := $(REALMODE_CFLAGS) -D_SETUP -D_WAKEUP \
+ -I$(srctree)/arch/x86/boot
+KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
+GCOV_PROFILE := n
+UBSAN_SANITIZE := n
diff --git a/arch/x86/realmode/rm/bioscall.S b/arch/x86/realmode/rm/bioscall.S
new file mode 100644
index 000000000..16162d197
--- /dev/null
+++ b/arch/x86/realmode/rm/bioscall.S
@@ -0,0 +1 @@
+#include "../../boot/bioscall.S"
diff --git a/arch/x86/realmode/rm/copy.S b/arch/x86/realmode/rm/copy.S
new file mode 100644
index 000000000..b785e6f38
--- /dev/null
+++ b/arch/x86/realmode/rm/copy.S
@@ -0,0 +1 @@
+#include "../../boot/copy.S"
diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S
new file mode 100644
index 000000000..30b0d30d8
--- /dev/null
+++ b/arch/x86/realmode/rm/header.S
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Real-mode blob header; this should match realmode.h and be
+ * readonly; for mutable data instead add pointers into the .data
+ * or .bss sections as appropriate.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+#include <asm/segment.h>
+
+#include "realmode.h"
+
+ .section ".header", "a"
+
+ .balign 16
+GLOBAL(real_mode_header)
+ .long pa_text_start
+ .long pa_ro_end
+ /* SMP trampoline */
+ .long pa_trampoline_start
+ .long pa_trampoline_status
+ .long pa_trampoline_header
+#ifdef CONFIG_X86_64
+ .long pa_trampoline_pgd;
+#endif
+ /* ACPI S3 wakeup */
+#ifdef CONFIG_ACPI_SLEEP
+ .long pa_wakeup_start
+ .long pa_wakeup_header
+#endif
+ /* APM/BIOS reboot */
+ .long pa_machine_real_restart_asm
+#ifdef CONFIG_X86_64
+ .long __KERNEL32_CS
+#endif
+END(real_mode_header)
+
+ /* End signature, used to verify integrity */
+ .section ".signature","a"
+ .balign 4
+GLOBAL(end_signature)
+ .long REALMODE_END_SIGNATURE
+END(end_signature)
diff --git a/arch/x86/realmode/rm/realmode.h b/arch/x86/realmode/rm/realmode.h
new file mode 100644
index 000000000..c76041a35
--- /dev/null
+++ b/arch/x86/realmode/rm/realmode.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_REALMODE_RM_REALMODE_H
+#define ARCH_X86_REALMODE_RM_REALMODE_H
+
+#ifdef __ASSEMBLY__
+
+/*
+ * 16-bit ljmpw to the real_mode_seg
+ *
+ * This must be open-coded since gas will choke on using a
+ * relocatable symbol for the segment portion.
+ */
+#define LJMPW_RM(to) .byte 0xea ; .word (to), real_mode_seg
+
+#endif /* __ASSEMBLY__ */
+
+/*
+ * Signature at the end of the realmode region
+ */
+#define REALMODE_END_SIGNATURE 0x65a22c82
+
+#endif /* ARCH_X86_REALMODE_RM_REALMODE_H */
diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S
new file mode 100644
index 000000000..df8e11e26
--- /dev/null
+++ b/arch/x86/realmode/rm/realmode.lds.S
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * realmode.lds.S
+ *
+ * Linker script for the real-mode code
+ */
+
+#include <asm/page_types.h>
+
+#undef i386
+
+OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
+OUTPUT_ARCH(i386)
+
+SECTIONS
+{
+ real_mode_seg = 0;
+
+ . = 0;
+ .header : {
+ pa_real_mode_base = .;
+ *(.header)
+ }
+
+ . = ALIGN(4);
+ .rodata : {
+ *(.rodata)
+ *(.rodata.*)
+ . = ALIGN(16);
+ video_cards = .;
+ *(.videocards)
+ video_cards_end = .;
+ }
+
+ . = ALIGN(PAGE_SIZE);
+ pa_text_start = .;
+ .text : {
+ *(.text)
+ *(.text.*)
+ }
+
+ .text32 : {
+ *(.text32)
+ *(.text32.*)
+ }
+
+ .text64 : {
+ *(.text64)
+ *(.text64.*)
+ }
+ pa_ro_end = .;
+
+ . = ALIGN(PAGE_SIZE);
+ .data : {
+ *(.data)
+ *(.data.*)
+ }
+
+ . = ALIGN(128);
+ .bss : {
+ *(.bss*)
+ }
+
+ /* End signature for integrity checking */
+ . = ALIGN(4);
+ .signature : {
+ *(.signature)
+ }
+
+ /DISCARD/ : {
+ *(.note*)
+ *(.debug*)
+ *(.eh_frame*)
+ }
+
+#include "pasyms.h"
+}
diff --git a/arch/x86/realmode/rm/reboot.S b/arch/x86/realmode/rm/reboot.S
new file mode 100644
index 000000000..cd2f97b96
--- /dev/null
+++ b/arch/x86/realmode/rm/reboot.S
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page_types.h>
+#include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+#include "realmode.h"
+
+/*
+ * The following code and data reboots the machine by switching to real
+ * mode and jumping to the BIOS reset entry point, as if the CPU has
+ * really been reset. The previous version asked the keyboard
+ * controller to pulse the CPU reset line, which is more thorough, but
+ * doesn't work with at least one type of 486 motherboard. It is easy
+ * to stop this code working; hence the copious comments.
+ *
+ * This code is called with the restart type (0 = BIOS, 1 = APM) in
+ * the primary argument register (%eax for 32 bit, %edi for 64 bit).
+ */
+ .section ".text32", "ax"
+ .code32
+ENTRY(machine_real_restart_asm)
+
+#ifdef CONFIG_X86_64
+ /* Switch to trampoline GDT as it is guaranteed < 4 GiB */
+ movl $__KERNEL_DS, %eax
+ movl %eax, %ds
+ lgdtl pa_tr_gdt
+
+ /* Disable paging to drop us out of long mode */
+ movl %cr0, %eax
+ andl $~X86_CR0_PG, %eax
+ movl %eax, %cr0
+ ljmpl $__KERNEL32_CS, $pa_machine_real_restart_paging_off
+
+GLOBAL(machine_real_restart_paging_off)
+ xorl %eax, %eax
+ xorl %edx, %edx
+ movl $MSR_EFER, %ecx
+ wrmsr
+
+ movl %edi, %eax
+
+#endif /* CONFIG_X86_64 */
+
+ /* Set up the IDT for real mode. */
+ lidtl pa_machine_real_restart_idt
+
+ /*
+ * Set up a GDT from which we can load segment descriptors for real
+ * mode. The GDT is not used in real mode; it is just needed here to
+ * prepare the descriptors.
+ */
+ lgdtl pa_machine_real_restart_gdt
+
+ /*
+ * Load the data segment registers with 16-bit compatible values
+ */
+ movl $16, %ecx
+ movl %ecx, %ds
+ movl %ecx, %es
+ movl %ecx, %fs
+ movl %ecx, %gs
+ movl %ecx, %ss
+ ljmpw $8, $1f
+
+/*
+ * This is 16-bit protected mode code to disable paging and the cache,
+ * switch to real mode and jump to the BIOS reset code.
+ *
+ * The instruction that switches to real mode by writing to CR0 must be
+ * followed immediately by a far jump instruction, which set CS to a
+ * valid value for real mode, and flushes the prefetch queue to avoid
+ * running instructions that have already been decoded in protected
+ * mode.
+ *
+ * Clears all the flags except ET, especially PG (paging), PE
+ * (protected-mode enable) and TS (task switch for coprocessor state
+ * save). Flushes the TLB after paging has been disabled. Sets CD and
+ * NW, to disable the cache on a 486, and invalidates the cache. This
+ * is more like the state of a 486 after reset. I don't know if
+ * something else should be done for other chips.
+ *
+ * More could be done here to set up the registers as if a CPU reset had
+ * occurred; hopefully real BIOSs don't assume much. This is not the
+ * actual BIOS entry point, anyway (that is at 0xfffffff0).
+ *
+ * Most of this work is probably excessive, but it is what is tested.
+ */
+ .text
+ .code16
+
+ .balign 16
+machine_real_restart_asm16:
+1:
+ xorl %ecx, %ecx
+ movl %cr0, %edx
+ andl $0x00000011, %edx
+ orl $0x60000000, %edx
+ movl %edx, %cr0
+ movl %ecx, %cr3
+ movl %cr0, %edx
+ testl $0x60000000, %edx /* If no cache bits -> no wbinvd */
+ jz 2f
+ wbinvd
+2:
+ andb $0x10, %dl
+ movl %edx, %cr0
+ LJMPW_RM(3f)
+3:
+ andw %ax, %ax
+ jz bios
+
+apm:
+ movw $0x1000, %ax
+ movw %ax, %ss
+ movw $0xf000, %sp
+ movw $0x5307, %ax
+ movw $0x0001, %bx
+ movw $0x0003, %cx
+ int $0x15
+ /* This should never return... */
+
+bios:
+ ljmpw $0xf000, $0xfff0
+
+ .section ".rodata", "a"
+
+ .balign 16
+GLOBAL(machine_real_restart_idt)
+ .word 0xffff /* Length - real mode default value */
+ .long 0 /* Base - real mode default value */
+END(machine_real_restart_idt)
+
+ .balign 16
+GLOBAL(machine_real_restart_gdt)
+ /* Self-pointer */
+ .word 0xffff /* Length - real mode default value */
+ .long pa_machine_real_restart_gdt
+ .word 0
+
+ /*
+ * 16-bit code segment pointing to real_mode_seg
+ * Selector value 8
+ */
+ .word 0xffff /* Limit */
+ .long 0x9b000000 + pa_real_mode_base
+ .word 0
+
+ /*
+ * 16-bit data segment with the selector value 16 = 0x10 and
+ * base value 0x100; since this is consistent with real mode
+ * semantics we don't have to reload the segments once CR0.PE = 0.
+ */
+ .quad GDT_ENTRY(0x0093, 0x100, 0xffff)
+END(machine_real_restart_gdt)
diff --git a/arch/x86/realmode/rm/regs.c b/arch/x86/realmode/rm/regs.c
new file mode 100644
index 000000000..fbb15b9f9
--- /dev/null
+++ b/arch/x86/realmode/rm/regs.c
@@ -0,0 +1 @@
+#include "../../boot/regs.c"
diff --git a/arch/x86/realmode/rm/stack.S b/arch/x86/realmode/rm/stack.S
new file mode 100644
index 000000000..8d4cb6479
--- /dev/null
+++ b/arch/x86/realmode/rm/stack.S
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common heap and stack allocations
+ */
+
+#include <linux/linkage.h>
+
+ .data
+GLOBAL(HEAP)
+ .long rm_heap
+GLOBAL(heap_end)
+ .long rm_stack
+
+ .bss
+ .balign 16
+GLOBAL(rm_heap)
+ .space 2048
+GLOBAL(rm_stack)
+ .space 2048
+GLOBAL(rm_stack_end)
diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S
new file mode 100644
index 000000000..2dd866c9e
--- /dev/null
+++ b/arch/x86/realmode/rm/trampoline_32.S
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *
+ * Trampoline.S Derived from Setup.S by Linus Torvalds
+ *
+ * 4 Jan 1997 Michael Chastain: changed to gnu as.
+ *
+ * This is only used for booting secondary CPUs in SMP machine
+ *
+ * Entry: CS:IP point to the start of our code, we are
+ * in real mode with no stack, but the rest of the
+ * trampoline page to make our stack and everything else
+ * is a mystery.
+ *
+ * We jump into arch/x86/kernel/head_32.S.
+ *
+ * On entry to trampoline_start, the processor is in real mode
+ * with 16-bit addressing and 16-bit data. CS has some value
+ * and IP is zero. Thus, we load CS to the physical segment
+ * of the real mode code before doing anything further.
+ */
+
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page_types.h>
+#include "realmode.h"
+
+ .text
+ .code16
+
+ .balign PAGE_SIZE
+ENTRY(trampoline_start)
+ wbinvd # Needed for NUMA-Q should be harmless for others
+
+ LJMPW_RM(1f)
+1:
+ mov %cs, %ax # Code and data in the same place
+ mov %ax, %ds
+
+ cli # We should be safe anyway
+
+ movl tr_start, %eax # where we need to go
+
+ movl $0xA5A5A5A5, trampoline_status
+ # write marker for master knows we're running
+
+ /*
+ * GDT tables in non default location kernel can be beyond 16MB and
+ * lgdt will not be able to load the address as in real mode default
+ * operand size is 16bit. Use lgdtl instead to force operand size
+ * to 32 bit.
+ */
+ lidtl tr_idt # load idt with 0, 0
+ lgdtl tr_gdt # load gdt with whatever is appropriate
+
+ movw $1, %dx # protected mode (PE) bit
+ lmsw %dx # into protected mode
+
+ ljmpl $__BOOT_CS, $pa_startup_32
+
+ .section ".text32","ax"
+ .code32
+ENTRY(startup_32) # note: also used from wakeup_asm.S
+ jmp *%eax
+
+ .bss
+ .balign 8
+GLOBAL(trampoline_header)
+ tr_start: .space 4
+ tr_gdt_pad: .space 2
+ tr_gdt: .space 6
+END(trampoline_header)
+
+#include "trampoline_common.S"
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
new file mode 100644
index 000000000..24bb75987
--- /dev/null
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -0,0 +1,177 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *
+ * Trampoline.S Derived from Setup.S by Linus Torvalds
+ *
+ * 4 Jan 1997 Michael Chastain: changed to gnu as.
+ * 15 Sept 2005 Eric Biederman: 64bit PIC support
+ *
+ * Entry: CS:IP point to the start of our code, we are
+ * in real mode with no stack, but the rest of the
+ * trampoline page to make our stack and everything else
+ * is a mystery.
+ *
+ * On entry to trampoline_start, the processor is in real mode
+ * with 16-bit addressing and 16-bit data. CS has some value
+ * and IP is zero. Thus, data addresses need to be absolute
+ * (no relocation) and are taken with regard to r_base.
+ *
+ * With the addition of trampoline_level4_pgt this code can
+ * now enter a 64bit kernel that lives at arbitrary 64bit
+ * physical addresses.
+ *
+ * If you work on this file, check the object module with objdump
+ * --full-contents --reloc to make sure there are no relocation
+ * entries.
+ */
+
+#include <linux/linkage.h>
+#include <asm/pgtable_types.h>
+#include <asm/page_types.h>
+#include <asm/msr.h>
+#include <asm/segment.h>
+#include <asm/processor-flags.h>
+#include <asm/realmode.h>
+#include "realmode.h"
+
+ .text
+ .code16
+
+ .balign PAGE_SIZE
+ENTRY(trampoline_start)
+ cli # We should be safe anyway
+ wbinvd
+
+ LJMPW_RM(1f)
+1:
+ mov %cs, %ax # Code and data in the same place
+ mov %ax, %ds
+ mov %ax, %es
+ mov %ax, %ss
+
+ movl $0xA5A5A5A5, trampoline_status
+ # write marker for master knows we're running
+
+ # Setup stack
+ movl $rm_stack_end, %esp
+
+ call verify_cpu # Verify the cpu supports long mode
+ testl %eax, %eax # Check for return code
+ jnz no_longmode
+
+ /*
+ * GDT tables in non default location kernel can be beyond 16MB and
+ * lgdt will not be able to load the address as in real mode default
+ * operand size is 16bit. Use lgdtl instead to force operand size
+ * to 32 bit.
+ */
+
+ lidtl tr_idt # load idt with 0, 0
+ lgdtl tr_gdt # load gdt with whatever is appropriate
+
+ movw $__KERNEL_DS, %dx # Data segment descriptor
+
+ # Enable protected mode
+ movl $X86_CR0_PE, %eax # protected mode (PE) bit
+ movl %eax, %cr0 # into protected mode
+
+ # flush prefetch and jump to startup_32
+ ljmpl $__KERNEL32_CS, $pa_startup_32
+
+no_longmode:
+ hlt
+ jmp no_longmode
+#include "../kernel/verify_cpu.S"
+
+ .section ".text32","ax"
+ .code32
+ .balign 4
+ENTRY(startup_32)
+ movl %edx, %ss
+ addl $pa_real_mode_base, %esp
+ movl %edx, %ds
+ movl %edx, %es
+ movl %edx, %fs
+ movl %edx, %gs
+
+ /*
+ * Check for memory encryption support. This is a safety net in
+ * case BIOS hasn't done the necessary step of setting the bit in
+ * the MSR for this AP. If SME is active and we've gotten this far
+ * then it is safe for us to set the MSR bit and continue. If we
+ * don't we'll eventually crash trying to execute encrypted
+ * instructions.
+ */
+ btl $TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags
+ jnc .Ldone
+ movl $MSR_K8_SYSCFG, %ecx
+ rdmsr
+ bts $MSR_K8_SYSCFG_MEM_ENCRYPT_BIT, %eax
+ jc .Ldone
+
+ /*
+ * Memory encryption is enabled but the SME enable bit for this
+ * CPU has has not been set. It is safe to set it, so do so.
+ */
+ wrmsr
+.Ldone:
+
+ movl pa_tr_cr4, %eax
+ movl %eax, %cr4 # Enable PAE mode
+
+ # Setup trampoline 4 level pagetables
+ movl $pa_trampoline_pgd, %eax
+ movl %eax, %cr3
+
+ # Set up EFER
+ movl pa_tr_efer, %eax
+ movl pa_tr_efer + 4, %edx
+ movl $MSR_EFER, %ecx
+ wrmsr
+
+ # Enable paging and in turn activate Long Mode
+ movl $(X86_CR0_PG | X86_CR0_WP | X86_CR0_PE), %eax
+ movl %eax, %cr0
+
+ /*
+ * At this point we're in long mode but in 32bit compatibility mode
+ * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
+ * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
+ * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
+ */
+ ljmpl $__KERNEL_CS, $pa_startup_64
+
+ .section ".text64","ax"
+ .code64
+ .balign 4
+ENTRY(startup_64)
+ # Now jump into the kernel using virtual addresses
+ jmpq *tr_start(%rip)
+
+ .section ".rodata","a"
+ # Duplicate the global descriptor table
+ # so the kernel can live anywhere
+ .balign 16
+ .globl tr_gdt
+tr_gdt:
+ .short tr_gdt_end - tr_gdt - 1 # gdt limit
+ .long pa_tr_gdt
+ .short 0
+ .quad 0x00cf9b000000ffff # __KERNEL32_CS
+ .quad 0x00af9b000000ffff # __KERNEL_CS
+ .quad 0x00cf93000000ffff # __KERNEL_DS
+tr_gdt_end:
+
+ .bss
+ .balign PAGE_SIZE
+GLOBAL(trampoline_pgd) .space PAGE_SIZE
+
+ .balign 8
+GLOBAL(trampoline_header)
+ tr_start: .space 8
+ GLOBAL(tr_efer) .space 8
+ GLOBAL(tr_cr4) .space 4
+ GLOBAL(tr_flags) .space 4
+END(trampoline_header)
+
+#include "trampoline_common.S"
diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S
new file mode 100644
index 000000000..7c706772a
--- /dev/null
+++ b/arch/x86/realmode/rm/trampoline_common.S
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+ .section ".rodata","a"
+ .balign 16
+tr_idt: .fill 1, 6, 0
+
+ .bss
+ .balign 4
+GLOBAL(trampoline_status) .space 4
diff --git a/arch/x86/realmode/rm/video-bios.c b/arch/x86/realmode/rm/video-bios.c
new file mode 100644
index 000000000..848b25aaf
--- /dev/null
+++ b/arch/x86/realmode/rm/video-bios.c
@@ -0,0 +1 @@
+#include "../../boot/video-bios.c"
diff --git a/arch/x86/realmode/rm/video-mode.c b/arch/x86/realmode/rm/video-mode.c
new file mode 100644
index 000000000..2a98b7e23
--- /dev/null
+++ b/arch/x86/realmode/rm/video-mode.c
@@ -0,0 +1 @@
+#include "../../boot/video-mode.c"
diff --git a/arch/x86/realmode/rm/video-vesa.c b/arch/x86/realmode/rm/video-vesa.c
new file mode 100644
index 000000000..413edddb5
--- /dev/null
+++ b/arch/x86/realmode/rm/video-vesa.c
@@ -0,0 +1 @@
+#include "../../boot/video-vesa.c"
diff --git a/arch/x86/realmode/rm/video-vga.c b/arch/x86/realmode/rm/video-vga.c
new file mode 100644
index 000000000..3085f5c9d
--- /dev/null
+++ b/arch/x86/realmode/rm/video-vga.c
@@ -0,0 +1 @@
+#include "../../boot/video-vga.c"
diff --git a/arch/x86/realmode/rm/wakemain.c b/arch/x86/realmode/rm/wakemain.c
new file mode 100644
index 000000000..1d6437e6d
--- /dev/null
+++ b/arch/x86/realmode/rm/wakemain.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "wakeup.h"
+#include "boot.h"
+
+static void udelay(int loops)
+{
+ while (loops--)
+ io_delay(); /* Approximately 1 us */
+}
+
+static void beep(unsigned int hz)
+{
+ u8 enable;
+
+ if (!hz) {
+ enable = 0x00; /* Turn off speaker */
+ } else {
+ u16 div = 1193181/hz;
+
+ outb(0xb6, 0x43); /* Ctr 2, squarewave, load, binary */
+ io_delay();
+ outb(div, 0x42); /* LSB of counter */
+ io_delay();
+ outb(div >> 8, 0x42); /* MSB of counter */
+ io_delay();
+
+ enable = 0x03; /* Turn on speaker */
+ }
+ inb(0x61); /* Dummy read of System Control Port B */
+ io_delay();
+ outb(enable, 0x61); /* Enable timer 2 output to speaker */
+ io_delay();
+}
+
+#define DOT_HZ 880
+#define DASH_HZ 587
+#define US_PER_DOT 125000
+
+/* Okay, this is totally silly, but it's kind of fun. */
+static void send_morse(const char *pattern)
+{
+ char s;
+
+ while ((s = *pattern++)) {
+ switch (s) {
+ case '.':
+ beep(DOT_HZ);
+ udelay(US_PER_DOT);
+ beep(0);
+ udelay(US_PER_DOT);
+ break;
+ case '-':
+ beep(DASH_HZ);
+ udelay(US_PER_DOT * 3);
+ beep(0);
+ udelay(US_PER_DOT);
+ break;
+ default: /* Assume it's a space */
+ udelay(US_PER_DOT * 3);
+ break;
+ }
+ }
+}
+
+void main(void)
+{
+ /* Kill machine if structures are wrong */
+ if (wakeup_header.real_magic != 0x12345678)
+ while (1)
+ ;
+
+ if (wakeup_header.realmode_flags & 4)
+ send_morse("...-");
+
+ if (wakeup_header.realmode_flags & 1)
+ asm volatile("lcallw $0xc000,$3");
+
+ if (wakeup_header.realmode_flags & 2) {
+ /* Need to call BIOS */
+ probe_cards(0);
+ set_mode(wakeup_header.video_mode);
+ }
+}
diff --git a/arch/x86/realmode/rm/wakeup.h b/arch/x86/realmode/rm/wakeup.h
new file mode 100644
index 000000000..0e4fd08ae
--- /dev/null
+++ b/arch/x86/realmode/rm/wakeup.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Definitions for the wakeup data structure at the head of the
+ * wakeup code.
+ */
+
+#ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
+#define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+/* This must match data at wakeup.S */
+struct wakeup_header {
+ u16 video_mode; /* Video mode number */
+ u32 pmode_entry; /* Protected mode resume point, 32-bit only */
+ u16 pmode_cs;
+ u32 pmode_cr0; /* Protected mode cr0 */
+ u32 pmode_cr3; /* Protected mode cr3 */
+ u32 pmode_cr4; /* Protected mode cr4 */
+ u32 pmode_efer_low; /* Protected mode EFER */
+ u32 pmode_efer_high;
+ u64 pmode_gdt;
+ u32 pmode_misc_en_low; /* Protected mode MISC_ENABLE */
+ u32 pmode_misc_en_high;
+ u32 pmode_behavior; /* Wakeup routine behavior flags */
+ u32 realmode_flags;
+ u32 real_magic;
+ u32 signature; /* To check we have correct structure */
+} __attribute__((__packed__));
+
+extern struct wakeup_header wakeup_header;
+#endif
+
+#define WAKEUP_HEADER_OFFSET 8
+#define WAKEUP_HEADER_SIGNATURE 0x51ee1111
+
+/* Wakeup behavior bits */
+#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0
+#define WAKEUP_BEHAVIOR_RESTORE_CR4 1
+#define WAKEUP_BEHAVIOR_RESTORE_EFER 2
+
+#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
diff --git a/arch/x86/realmode/rm/wakeup_asm.S b/arch/x86/realmode/rm/wakeup_asm.S
new file mode 100644
index 000000000..05ac9c17c
--- /dev/null
+++ b/arch/x86/realmode/rm/wakeup_asm.S
@@ -0,0 +1,178 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ACPI wakeup real mode startup stub
+ */
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/msr-index.h>
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
+#include <asm/processor-flags.h>
+#include "realmode.h"
+#include "wakeup.h"
+
+ .code16
+
+/* This should match the structure in wakeup.h */
+ .section ".data", "aw"
+
+ .balign 16
+GLOBAL(wakeup_header)
+ video_mode: .short 0 /* Video mode number */
+ pmode_entry: .long 0
+ pmode_cs: .short __KERNEL_CS
+ pmode_cr0: .long 0 /* Saved %cr0 */
+ pmode_cr3: .long 0 /* Saved %cr3 */
+ pmode_cr4: .long 0 /* Saved %cr4 */
+ pmode_efer: .quad 0 /* Saved EFER */
+ pmode_gdt: .quad 0
+ pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */
+ pmode_behavior: .long 0 /* Wakeup behavior flags */
+ realmode_flags: .long 0
+ real_magic: .long 0
+ signature: .long WAKEUP_HEADER_SIGNATURE
+END(wakeup_header)
+
+ .text
+ .code16
+
+ .balign 16
+ENTRY(wakeup_start)
+ cli
+ cld
+
+ LJMPW_RM(3f)
+3:
+ /* Apparently some dimwit BIOS programmers don't know how to
+ program a PM to RM transition, and we might end up here with
+ junk in the data segment descriptor registers. The only way
+ to repair that is to go into PM and fix it ourselves... */
+ movw $16, %cx
+ lgdtl %cs:wakeup_gdt
+ movl %cr0, %eax
+ orb $X86_CR0_PE, %al
+ movl %eax, %cr0
+ ljmpw $8, $2f
+2:
+ movw %cx, %ds
+ movw %cx, %es
+ movw %cx, %ss
+ movw %cx, %fs
+ movw %cx, %gs
+
+ andb $~X86_CR0_PE, %al
+ movl %eax, %cr0
+ LJMPW_RM(3f)
+3:
+ /* Set up segments */
+ movw %cs, %ax
+ movw %ax, %ss
+ movl $rm_stack_end, %esp
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %fs
+ movw %ax, %gs
+
+ lidtl wakeup_idt
+
+ /* Clear the EFLAGS */
+ pushl $0
+ popfl
+
+ /* Check header signature... */
+ movl signature, %eax
+ cmpl $WAKEUP_HEADER_SIGNATURE, %eax
+ jne bogus_real_magic
+
+ /* Check we really have everything... */
+ movl end_signature, %eax
+ cmpl $REALMODE_END_SIGNATURE, %eax
+ jne bogus_real_magic
+
+ /* Call the C code */
+ calll main
+
+ /* Restore MISC_ENABLE before entering protected mode, in case
+ BIOS decided to clear XD_DISABLE during S3. */
+ movl pmode_behavior, %edi
+ btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %edi
+ jnc 1f
+
+ movl pmode_misc_en, %eax
+ movl pmode_misc_en + 4, %edx
+ movl $MSR_IA32_MISC_ENABLE, %ecx
+ wrmsr
+1:
+
+ /* Do any other stuff... */
+
+#ifndef CONFIG_64BIT
+ /* This could also be done in C code... */
+ movl pmode_cr3, %eax
+ movl %eax, %cr3
+
+ btl $WAKEUP_BEHAVIOR_RESTORE_CR4, %edi
+ jnc 1f
+ movl pmode_cr4, %eax
+ movl %eax, %cr4
+1:
+ btl $WAKEUP_BEHAVIOR_RESTORE_EFER, %edi
+ jnc 1f
+ movl pmode_efer, %eax
+ movl pmode_efer + 4, %edx
+ movl $MSR_EFER, %ecx
+ wrmsr
+1:
+
+ lgdtl pmode_gdt
+
+ /* This really couldn't... */
+ movl pmode_entry, %eax
+ movl pmode_cr0, %ecx
+ movl %ecx, %cr0
+ ljmpl $__KERNEL_CS, $pa_startup_32
+ /* -> jmp *%eax in trampoline_32.S */
+#else
+ jmp trampoline_start
+#endif
+
+bogus_real_magic:
+1:
+ hlt
+ jmp 1b
+
+ .section ".rodata","a"
+
+ /*
+ * Set up the wakeup GDT. We set these up as Big Real Mode,
+ * that is, with limits set to 4 GB. At least the Lenovo
+ * Thinkpad X61 is known to need this for the video BIOS
+ * initialization quirk to work; this is likely to also
+ * be the case for other laptops or integrated video devices.
+ */
+
+ .balign 16
+GLOBAL(wakeup_gdt)
+ .word 3*8-1 /* Self-descriptor */
+ .long pa_wakeup_gdt
+ .word 0
+
+ .word 0xffff /* 16-bit code segment @ real_mode_base */
+ .long 0x9b000000 + pa_real_mode_base
+ .word 0x008f /* big real mode */
+
+ .word 0xffff /* 16-bit data segment @ real_mode_base */
+ .long 0x93000000 + pa_real_mode_base
+ .word 0x008f /* big real mode */
+END(wakeup_gdt)
+
+ .section ".rodata","a"
+ .balign 8
+
+ /* This is the standard real-mode IDT */
+ .balign 16
+GLOBAL(wakeup_idt)
+ .word 0xffff /* limit */
+ .long 0 /* address */
+ .word 0
+END(wakeup_idt)
diff --git a/arch/x86/realmode/rmpiggy.S b/arch/x86/realmode/rmpiggy.S
new file mode 100644
index 000000000..c078dba40
--- /dev/null
+++ b/arch/x86/realmode/rmpiggy.S
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Wrapper script for the realmode binary as a transport object
+ * before copying to low memory.
+ */
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+
+ .section ".init.data","aw"
+
+ .balign PAGE_SIZE
+
+GLOBAL(real_mode_blob)
+ .incbin "arch/x86/realmode/rm/realmode.bin"
+END(real_mode_blob)
+
+GLOBAL(real_mode_blob_end);
+
+GLOBAL(real_mode_relocs)
+ .incbin "arch/x86/realmode/rm/realmode.relocs"
+END(real_mode_relocs)
diff --git a/arch/x86/tools/.gitignore b/arch/x86/tools/.gitignore
new file mode 100644
index 000000000..be0ed0652
--- /dev/null
+++ b/arch/x86/tools/.gitignore
@@ -0,0 +1 @@
+relocs
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
new file mode 100644
index 000000000..09af7ff53
--- /dev/null
+++ b/arch/x86/tools/Makefile
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: GPL-2.0
+PHONY += posttest
+
+ifeq ($(KBUILD_VERBOSE),1)
+ posttest_verbose = -v
+else
+ posttest_verbose =
+endif
+
+ifeq ($(CONFIG_64BIT),y)
+ posttest_64bit = -y
+else
+ posttest_64bit = -n
+endif
+
+reformatter = $(srctree)/arch/x86/tools/objdump_reformat.awk
+chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk
+
+quiet_cmd_posttest = TEST $@
+ cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(reformatter) | $(obj)/insn_decoder_test $(posttest_64bit) $(posttest_verbose)
+
+quiet_cmd_sanitytest = TEST $@
+ cmd_sanitytest = $(obj)/insn_sanity $(posttest_64bit) -m 1000000
+
+posttest: $(obj)/insn_decoder_test vmlinux $(obj)/insn_sanity
+ $(call cmd,posttest)
+ $(call cmd,sanitytest)
+
+hostprogs-y += insn_decoder_test insn_sanity
+
+# -I needed for generated C source and C source which in the kernel tree.
+HOSTCFLAGS_insn_decoder_test.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/uapi/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/uapi/
+
+HOSTCFLAGS_insn_sanity.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
+
+# Dependencies are also needed.
+$(obj)/insn_decoder_test.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
+
+$(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
+
+HOST_EXTRACFLAGS += -I$(srctree)/tools/include
+hostprogs-y += relocs
+relocs-objs := relocs_32.o relocs_64.o relocs_common.o
+PHONY += relocs
+relocs: $(obj)/relocs
+ @:
diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk
new file mode 100644
index 000000000..a4cf678cf
--- /dev/null
+++ b/arch/x86/tools/chkobjdump.awk
@@ -0,0 +1,34 @@
+# GNU objdump version checker
+#
+# Usage:
+# objdump -v | awk -f chkobjdump.awk
+BEGIN {
+ # objdump version 2.19 or later is OK for the test.
+ od_ver = 2;
+ od_sver = 19;
+}
+
+/^GNU objdump/ {
+ verstr = ""
+ gsub(/\(.*\)/, "");
+ for (i = 3; i <= NF; i++)
+ if (match($(i), "^[0-9]")) {
+ verstr = $(i);
+ break;
+ }
+ if (verstr == "") {
+ printf("Warning: Failed to find objdump version number.\n");
+ exit 0;
+ }
+ split(verstr, ver, ".");
+ if (ver[1] > od_ver ||
+ (ver[1] == od_ver && ver[2] >= od_sver)) {
+ exit 1;
+ } else {
+ printf("Warning: objdump version %s is older than %d.%d\n",
+ verstr, od_ver, od_sver);
+ print("Warning: Skipping posttest.");
+ # Logic is inverted, because we just skip test without error.
+ exit 0;
+ }
+}
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
new file mode 100644
index 000000000..a42015b30
--- /dev/null
+++ b/arch/x86/tools/gen-insn-attr-x86.awk
@@ -0,0 +1,393 @@
+#!/bin/awk -f
+# SPDX-License-Identifier: GPL-2.0
+# gen-insn-attr-x86.awk: Instruction attribute table generator
+# Written by Masami Hiramatsu <mhiramat@redhat.com>
+#
+# Usage: awk -f gen-insn-attr-x86.awk x86-opcode-map.txt > inat-tables.c
+
+# Awk implementation sanity check
+function check_awk_implement() {
+ if (sprintf("%x", 0) != "0")
+ return "Your awk has a printf-format problem."
+ return ""
+}
+
+# Clear working vars
+function clear_vars() {
+ delete table
+ delete lptable2
+ delete lptable1
+ delete lptable3
+ eid = -1 # escape id
+ gid = -1 # group id
+ aid = -1 # AVX id
+ tname = ""
+}
+
+BEGIN {
+ # Implementation error checking
+ awkchecked = check_awk_implement()
+ if (awkchecked != "") {
+ print "Error: " awkchecked > "/dev/stderr"
+ print "Please try to use gawk." > "/dev/stderr"
+ exit 1
+ }
+
+ # Setup generating tables
+ print "/* x86 opcode map generated from x86-opcode-map.txt */"
+ print "/* Do not change this code. */\n"
+ ggid = 1
+ geid = 1
+ gaid = 0
+ delete etable
+ delete gtable
+ delete atable
+
+ opnd_expr = "^[A-Za-z/]"
+ ext_expr = "^\\("
+ sep_expr = "^\\|$"
+ group_expr = "^Grp[0-9A-Za-z]+"
+
+ imm_expr = "^[IJAOL][a-z]"
+ imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
+ imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
+ imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)"
+ imm_flag["Id"] = "INAT_MAKE_IMM(INAT_IMM_DWORD)"
+ imm_flag["Iq"] = "INAT_MAKE_IMM(INAT_IMM_QWORD)"
+ imm_flag["Ap"] = "INAT_MAKE_IMM(INAT_IMM_PTR)"
+ imm_flag["Iz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)"
+ imm_flag["Jz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)"
+ imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)"
+ imm_flag["Ob"] = "INAT_MOFFSET"
+ imm_flag["Ov"] = "INAT_MOFFSET"
+ imm_flag["Lx"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
+
+ modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])"
+ force64_expr = "\\([df]64\\)"
+ rex_expr = "^REX(\\.[XRWB]+)*"
+ fpu_expr = "^ESC" # TODO
+
+ lprefix1_expr = "\\((66|!F3)\\)"
+ lprefix2_expr = "\\(F3\\)"
+ lprefix3_expr = "\\((F2|!F3|66&F2)\\)"
+ lprefix_expr = "\\((66|F2|F3)\\)"
+ max_lprefix = 4
+
+ # All opcodes starting with lower-case 'v', 'k' or with (v1) superscript
+ # accepts VEX prefix
+ vexok_opcode_expr = "^[vk].*"
+ vexok_expr = "\\(v1\\)"
+ # All opcodes with (v) superscript supports *only* VEX prefix
+ vexonly_expr = "\\(v\\)"
+ # All opcodes with (ev) superscript supports *only* EVEX prefix
+ evexonly_expr = "\\(ev\\)"
+
+ prefix_expr = "\\(Prefix\\)"
+ prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ"
+ prefix_num["REPNE"] = "INAT_PFX_REPNE"
+ prefix_num["REP/REPE"] = "INAT_PFX_REPE"
+ prefix_num["XACQUIRE"] = "INAT_PFX_REPNE"
+ prefix_num["XRELEASE"] = "INAT_PFX_REPE"
+ prefix_num["LOCK"] = "INAT_PFX_LOCK"
+ prefix_num["SEG=CS"] = "INAT_PFX_CS"
+ prefix_num["SEG=DS"] = "INAT_PFX_DS"
+ prefix_num["SEG=ES"] = "INAT_PFX_ES"
+ prefix_num["SEG=FS"] = "INAT_PFX_FS"
+ prefix_num["SEG=GS"] = "INAT_PFX_GS"
+ prefix_num["SEG=SS"] = "INAT_PFX_SS"
+ prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ"
+ prefix_num["VEX+1byte"] = "INAT_PFX_VEX2"
+ prefix_num["VEX+2byte"] = "INAT_PFX_VEX3"
+ prefix_num["EVEX"] = "INAT_PFX_EVEX"
+
+ clear_vars()
+}
+
+function semantic_error(msg) {
+ print "Semantic error at " NR ": " msg > "/dev/stderr"
+ exit 1
+}
+
+function debug(msg) {
+ print "DEBUG: " msg
+}
+
+function array_size(arr, i,c) {
+ c = 0
+ for (i in arr)
+ c++
+ return c
+}
+
+/^Table:/ {
+ print "/* " $0 " */"
+ if (tname != "")
+ semantic_error("Hit Table: before EndTable:.");
+}
+
+/^Referrer:/ {
+ if (NF != 1) {
+ # escape opcode table
+ ref = ""
+ for (i = 2; i <= NF; i++)
+ ref = ref $i
+ eid = escape[ref]
+ tname = sprintf("inat_escape_table_%d", eid)
+ }
+}
+
+/^AVXcode:/ {
+ if (NF != 1) {
+ # AVX/escape opcode table
+ aid = $2
+ if (gaid <= aid)
+ gaid = aid + 1
+ if (tname == "") # AVX only opcode table
+ tname = sprintf("inat_avx_table_%d", $2)
+ }
+ if (aid == -1 && eid == -1) # primary opcode table
+ tname = "inat_primary_table"
+}
+
+/^GrpTable:/ {
+ print "/* " $0 " */"
+ if (!($2 in group))
+ semantic_error("No group: " $2 )
+ gid = group[$2]
+ tname = "inat_group_table_" gid
+}
+
+function print_table(tbl,name,fmt,n)
+{
+ print "const insn_attr_t " name " = {"
+ for (i = 0; i < n; i++) {
+ id = sprintf(fmt, i)
+ if (tbl[id])
+ print " [" id "] = " tbl[id] ","
+ }
+ print "};"
+}
+
+/^EndTable/ {
+ if (gid != -1) {
+ # print group tables
+ if (array_size(table) != 0) {
+ print_table(table, tname "[INAT_GROUP_TABLE_SIZE]",
+ "0x%x", 8)
+ gtable[gid,0] = tname
+ }
+ if (array_size(lptable1) != 0) {
+ print_table(lptable1, tname "_1[INAT_GROUP_TABLE_SIZE]",
+ "0x%x", 8)
+ gtable[gid,1] = tname "_1"
+ }
+ if (array_size(lptable2) != 0) {
+ print_table(lptable2, tname "_2[INAT_GROUP_TABLE_SIZE]",
+ "0x%x", 8)
+ gtable[gid,2] = tname "_2"
+ }
+ if (array_size(lptable3) != 0) {
+ print_table(lptable3, tname "_3[INAT_GROUP_TABLE_SIZE]",
+ "0x%x", 8)
+ gtable[gid,3] = tname "_3"
+ }
+ } else {
+ # print primary/escaped tables
+ if (array_size(table) != 0) {
+ print_table(table, tname "[INAT_OPCODE_TABLE_SIZE]",
+ "0x%02x", 256)
+ etable[eid,0] = tname
+ if (aid >= 0)
+ atable[aid,0] = tname
+ }
+ if (array_size(lptable1) != 0) {
+ print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]",
+ "0x%02x", 256)
+ etable[eid,1] = tname "_1"
+ if (aid >= 0)
+ atable[aid,1] = tname "_1"
+ }
+ if (array_size(lptable2) != 0) {
+ print_table(lptable2,tname "_2[INAT_OPCODE_TABLE_SIZE]",
+ "0x%02x", 256)
+ etable[eid,2] = tname "_2"
+ if (aid >= 0)
+ atable[aid,2] = tname "_2"
+ }
+ if (array_size(lptable3) != 0) {
+ print_table(lptable3,tname "_3[INAT_OPCODE_TABLE_SIZE]",
+ "0x%02x", 256)
+ etable[eid,3] = tname "_3"
+ if (aid >= 0)
+ atable[aid,3] = tname "_3"
+ }
+ }
+ print ""
+ clear_vars()
+}
+
+function add_flags(old,new) {
+ if (old && new)
+ return old " | " new
+ else if (old)
+ return old
+ else
+ return new
+}
+
+# convert operands to flags.
+function convert_operands(count,opnd, i,j,imm,mod)
+{
+ imm = null
+ mod = null
+ for (j = 1; j <= count; j++) {
+ i = opnd[j]
+ if (match(i, imm_expr) == 1) {
+ if (!imm_flag[i])
+ semantic_error("Unknown imm opnd: " i)
+ if (imm) {
+ if (i != "Ib")
+ semantic_error("Second IMM error")
+ imm = add_flags(imm, "INAT_SCNDIMM")
+ } else
+ imm = imm_flag[i]
+ } else if (match(i, modrm_expr))
+ mod = "INAT_MODRM"
+ }
+ return add_flags(imm, mod)
+}
+
+/^[0-9a-f]+:/ {
+ if (NR == 1)
+ next
+ # get index
+ idx = "0x" substr($1, 1, index($1,":") - 1)
+ if (idx in table)
+ semantic_error("Redefine " idx " in " tname)
+
+ # check if escaped opcode
+ if ("escape" == $2) {
+ if ($3 != "#")
+ semantic_error("No escaped name")
+ ref = ""
+ for (i = 4; i <= NF; i++)
+ ref = ref $i
+ if (ref in escape)
+ semantic_error("Redefine escape (" ref ")")
+ escape[ref] = geid
+ geid++
+ table[idx] = "INAT_MAKE_ESCAPE(" escape[ref] ")"
+ next
+ }
+
+ variant = null
+ # converts
+ i = 2
+ while (i <= NF) {
+ opcode = $(i++)
+ delete opnds
+ ext = null
+ flags = null
+ opnd = null
+ # parse one opcode
+ if (match($i, opnd_expr)) {
+ opnd = $i
+ count = split($(i++), opnds, ",")
+ flags = convert_operands(count, opnds)
+ }
+ if (match($i, ext_expr))
+ ext = $(i++)
+ if (match($i, sep_expr))
+ i++
+ else if (i < NF)
+ semantic_error($i " is not a separator")
+
+ # check if group opcode
+ if (match(opcode, group_expr)) {
+ if (!(opcode in group)) {
+ group[opcode] = ggid
+ ggid++
+ }
+ flags = add_flags(flags, "INAT_MAKE_GROUP(" group[opcode] ")")
+ }
+ # check force(or default) 64bit
+ if (match(ext, force64_expr))
+ flags = add_flags(flags, "INAT_FORCE64")
+
+ # check REX prefix
+ if (match(opcode, rex_expr))
+ flags = add_flags(flags, "INAT_MAKE_PREFIX(INAT_PFX_REX)")
+
+ # check coprocessor escape : TODO
+ if (match(opcode, fpu_expr))
+ flags = add_flags(flags, "INAT_MODRM")
+
+ # check VEX codes
+ if (match(ext, evexonly_expr))
+ flags = add_flags(flags, "INAT_VEXOK | INAT_EVEXONLY")
+ else if (match(ext, vexonly_expr))
+ flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY")
+ else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr))
+ flags = add_flags(flags, "INAT_VEXOK")
+
+ # check prefixes
+ if (match(ext, prefix_expr)) {
+ if (!prefix_num[opcode])
+ semantic_error("Unknown prefix: " opcode)
+ flags = add_flags(flags, "INAT_MAKE_PREFIX(" prefix_num[opcode] ")")
+ }
+ if (length(flags) == 0)
+ continue
+ # check if last prefix
+ if (match(ext, lprefix1_expr)) {
+ lptable1[idx] = add_flags(lptable1[idx],flags)
+ variant = "INAT_VARIANT"
+ }
+ if (match(ext, lprefix2_expr)) {
+ lptable2[idx] = add_flags(lptable2[idx],flags)
+ variant = "INAT_VARIANT"
+ }
+ if (match(ext, lprefix3_expr)) {
+ lptable3[idx] = add_flags(lptable3[idx],flags)
+ variant = "INAT_VARIANT"
+ }
+ if (!match(ext, lprefix_expr)){
+ table[idx] = add_flags(table[idx],flags)
+ }
+ }
+ if (variant)
+ table[idx] = add_flags(table[idx],variant)
+}
+
+END {
+ if (awkchecked != "")
+ exit 1
+ # print escape opcode map's array
+ print "/* Escape opcode map array */"
+ print "const insn_attr_t * const inat_escape_tables[INAT_ESC_MAX + 1]" \
+ "[INAT_LSTPFX_MAX + 1] = {"
+ for (i = 0; i < geid; i++)
+ for (j = 0; j < max_lprefix; j++)
+ if (etable[i,j])
+ print " ["i"]["j"] = "etable[i,j]","
+ print "};\n"
+ # print group opcode map's array
+ print "/* Group opcode map array */"
+ print "const insn_attr_t * const inat_group_tables[INAT_GRP_MAX + 1]"\
+ "[INAT_LSTPFX_MAX + 1] = {"
+ for (i = 0; i < ggid; i++)
+ for (j = 0; j < max_lprefix; j++)
+ if (gtable[i,j])
+ print " ["i"]["j"] = "gtable[i,j]","
+ print "};\n"
+ # print AVX opcode map's array
+ print "/* AVX opcode map array */"
+ print "const insn_attr_t * const inat_avx_tables[X86_VEX_M_MAX + 1]"\
+ "[INAT_LSTPFX_MAX + 1] = {"
+ for (i = 0; i < gaid; i++)
+ for (j = 0; j < max_lprefix; j++)
+ if (atable[i,j])
+ print " ["i"]["j"] = "atable[i,j]","
+ print "};"
+}
+
diff --git a/arch/x86/tools/insn_decoder_test.c b/arch/x86/tools/insn_decoder_test.c
new file mode 100644
index 000000000..a3b4fd954
--- /dev/null
+++ b/arch/x86/tools/insn_decoder_test.c
@@ -0,0 +1,180 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include <stdarg.h>
+
+#define unlikely(cond) (cond)
+
+#include <asm/insn.h>
+#include <inat.c>
+#include <insn.c>
+
+/*
+ * Test of instruction analysis in general and insn_get_length() in
+ * particular. See if insn_get_length() and the disassembler agree
+ * on the length of each instruction in an elf disassembly.
+ *
+ * Usage: objdump -d a.out | awk -f objdump_reformat.awk | ./insn_decoder_test
+ */
+
+const char *prog;
+static int verbose;
+static int x86_64;
+
+static void usage(void)
+{
+ fprintf(stderr, "Usage: objdump -d a.out | awk -f objdump_reformat.awk"
+ " | %s [-y|-n] [-v]\n", prog);
+ fprintf(stderr, "\t-y 64bit mode\n");
+ fprintf(stderr, "\t-n 32bit mode\n");
+ fprintf(stderr, "\t-v verbose mode\n");
+ exit(1);
+}
+
+static void malformed_line(const char *line, int line_nr)
+{
+ fprintf(stderr, "%s: error: malformed line %d:\n%s",
+ prog, line_nr, line);
+ exit(3);
+}
+
+static void pr_warn(const char *fmt, ...)
+{
+ va_list ap;
+
+ fprintf(stderr, "%s: warning: ", prog);
+ va_start(ap, fmt);
+ vfprintf(stderr, fmt, ap);
+ va_end(ap);
+}
+
+static void dump_field(FILE *fp, const char *name, const char *indent,
+ struct insn_field *field)
+{
+ fprintf(fp, "%s.%s = {\n", indent, name);
+ fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n",
+ indent, field->value, field->bytes[0], field->bytes[1],
+ field->bytes[2], field->bytes[3]);
+ fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent,
+ field->got, field->nbytes);
+}
+
+static void dump_insn(FILE *fp, struct insn *insn)
+{
+ fprintf(fp, "Instruction = {\n");
+ dump_field(fp, "prefixes", "\t", &insn->prefixes);
+ dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix);
+ dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix);
+ dump_field(fp, "opcode", "\t", &insn->opcode);
+ dump_field(fp, "modrm", "\t", &insn->modrm);
+ dump_field(fp, "sib", "\t", &insn->sib);
+ dump_field(fp, "displacement", "\t", &insn->displacement);
+ dump_field(fp, "immediate1", "\t", &insn->immediate1);
+ dump_field(fp, "immediate2", "\t", &insn->immediate2);
+ fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n",
+ insn->attr, insn->opnd_bytes, insn->addr_bytes);
+ fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n",
+ insn->length, insn->x86_64, insn->kaddr);
+}
+
+static void parse_args(int argc, char **argv)
+{
+ int c;
+ prog = argv[0];
+ while ((c = getopt(argc, argv, "ynv")) != -1) {
+ switch (c) {
+ case 'y':
+ x86_64 = 1;
+ break;
+ case 'n':
+ x86_64 = 0;
+ break;
+ case 'v':
+ verbose = 1;
+ break;
+ default:
+ usage();
+ }
+ }
+}
+
+#define BUFSIZE 256
+
+int main(int argc, char **argv)
+{
+ char line[BUFSIZE], sym[BUFSIZE] = "<unknown>";
+ unsigned char insn_buf[16];
+ struct insn insn;
+ int insns = 0;
+ int warnings = 0;
+
+ parse_args(argc, argv);
+
+ while (fgets(line, BUFSIZE, stdin)) {
+ char copy[BUFSIZE], *s, *tab1, *tab2;
+ int nb = 0;
+ unsigned int b;
+
+ if (line[0] == '<') {
+ /* Symbol line */
+ strcpy(sym, line);
+ continue;
+ }
+
+ insns++;
+ memset(insn_buf, 0, 16);
+ strcpy(copy, line);
+ tab1 = strchr(copy, '\t');
+ if (!tab1)
+ malformed_line(line, insns);
+ s = tab1 + 1;
+ s += strspn(s, " ");
+ tab2 = strchr(s, '\t');
+ if (!tab2)
+ malformed_line(line, insns);
+ *tab2 = '\0'; /* Characters beyond tab2 aren't examined */
+ while (s < tab2) {
+ if (sscanf(s, "%x", &b) == 1) {
+ insn_buf[nb++] = (unsigned char) b;
+ s += 3;
+ } else
+ break;
+ }
+ /* Decode an instruction */
+ insn_init(&insn, insn_buf, sizeof(insn_buf), x86_64);
+ insn_get_length(&insn);
+ if (insn.length != nb) {
+ warnings++;
+ pr_warn("Found an x86 instruction decoder bug, "
+ "please report this.\n", sym);
+ pr_warn("%s", line);
+ pr_warn("objdump says %d bytes, but insn_get_length() "
+ "says %d\n", nb, insn.length);
+ if (verbose)
+ dump_insn(stderr, &insn);
+ }
+ }
+ if (warnings)
+ pr_warn("Decoded and checked %d instructions with %d "
+ "failures\n", insns, warnings);
+ else
+ fprintf(stdout, "%s: success: Decoded and checked %d"
+ " instructions\n", prog, insns);
+ return 0;
+}
diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c
new file mode 100644
index 000000000..1972565ab
--- /dev/null
+++ b/arch/x86/tools/insn_sanity.c
@@ -0,0 +1,282 @@
+/*
+ * x86 decoder sanity test - based on test_get_insn.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ * Copyright (C) Hitachi, Ltd., 2011
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#define unlikely(cond) (cond)
+#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0]))
+
+#include <asm/insn.h>
+#include <inat.c>
+#include <insn.c>
+
+/*
+ * Test of instruction analysis against tampering.
+ * Feed random binary to instruction decoder and ensure not to
+ * access out-of-instruction-buffer.
+ */
+
+#define DEFAULT_MAX_ITER 10000
+#define INSN_NOP 0x90
+
+static const char *prog; /* Program name */
+static int verbose; /* Verbosity */
+static int x86_64; /* x86-64 bit mode flag */
+static unsigned int seed; /* Random seed */
+static unsigned long iter_start; /* Start of iteration number */
+static unsigned long iter_end = DEFAULT_MAX_ITER; /* End of iteration number */
+static FILE *input_file; /* Input file name */
+
+static void usage(const char *err)
+{
+ if (err)
+ fprintf(stderr, "%s: Error: %s\n\n", prog, err);
+ fprintf(stderr, "Usage: %s [-y|-n|-v] [-s seed[,no]] [-m max] [-i input]\n", prog);
+ fprintf(stderr, "\t-y 64bit mode\n");
+ fprintf(stderr, "\t-n 32bit mode\n");
+ fprintf(stderr, "\t-v Verbosity(-vv dumps any decoded result)\n");
+ fprintf(stderr, "\t-s Give a random seed (and iteration number)\n");
+ fprintf(stderr, "\t-m Give a maximum iteration number\n");
+ fprintf(stderr, "\t-i Give an input file with decoded binary\n");
+ exit(1);
+}
+
+static void dump_field(FILE *fp, const char *name, const char *indent,
+ struct insn_field *field)
+{
+ fprintf(fp, "%s.%s = {\n", indent, name);
+ fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n",
+ indent, field->value, field->bytes[0], field->bytes[1],
+ field->bytes[2], field->bytes[3]);
+ fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent,
+ field->got, field->nbytes);
+}
+
+static void dump_insn(FILE *fp, struct insn *insn)
+{
+ fprintf(fp, "Instruction = {\n");
+ dump_field(fp, "prefixes", "\t", &insn->prefixes);
+ dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix);
+ dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix);
+ dump_field(fp, "opcode", "\t", &insn->opcode);
+ dump_field(fp, "modrm", "\t", &insn->modrm);
+ dump_field(fp, "sib", "\t", &insn->sib);
+ dump_field(fp, "displacement", "\t", &insn->displacement);
+ dump_field(fp, "immediate1", "\t", &insn->immediate1);
+ dump_field(fp, "immediate2", "\t", &insn->immediate2);
+ fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n",
+ insn->attr, insn->opnd_bytes, insn->addr_bytes);
+ fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n",
+ insn->length, insn->x86_64, insn->kaddr);
+}
+
+static void dump_stream(FILE *fp, const char *msg, unsigned long nr_iter,
+ unsigned char *insn_buf, struct insn *insn)
+{
+ int i;
+
+ fprintf(fp, "%s:\n", msg);
+
+ dump_insn(fp, insn);
+
+ fprintf(fp, "You can reproduce this with below command(s);\n");
+
+ /* Input a decoded instruction sequence directly */
+ fprintf(fp, " $ echo ");
+ for (i = 0; i < MAX_INSN_SIZE; i++)
+ fprintf(fp, " %02x", insn_buf[i]);
+ fprintf(fp, " | %s -i -\n", prog);
+
+ if (!input_file) {
+ fprintf(fp, "Or \n");
+ /* Give a seed and iteration number */
+ fprintf(fp, " $ %s -s 0x%x,%lu\n", prog, seed, nr_iter);
+ }
+}
+
+static void init_random_seed(void)
+{
+ int fd;
+
+ fd = open("/dev/urandom", O_RDONLY);
+ if (fd < 0)
+ goto fail;
+
+ if (read(fd, &seed, sizeof(seed)) != sizeof(seed))
+ goto fail;
+
+ close(fd);
+ return;
+fail:
+ usage("Failed to open /dev/urandom");
+}
+
+/* Read given instruction sequence from the input file */
+static int read_next_insn(unsigned char *insn_buf)
+{
+ char buf[256] = "", *tmp;
+ int i;
+
+ tmp = fgets(buf, ARRAY_SIZE(buf), input_file);
+ if (tmp == NULL || feof(input_file))
+ return 0;
+
+ for (i = 0; i < MAX_INSN_SIZE; i++) {
+ insn_buf[i] = (unsigned char)strtoul(tmp, &tmp, 16);
+ if (*tmp != ' ')
+ break;
+ }
+
+ return i;
+}
+
+static int generate_insn(unsigned char *insn_buf)
+{
+ int i;
+
+ if (input_file)
+ return read_next_insn(insn_buf);
+
+ /* Fills buffer with random binary up to MAX_INSN_SIZE */
+ for (i = 0; i < MAX_INSN_SIZE - 1; i += 2)
+ *(unsigned short *)(&insn_buf[i]) = random() & 0xffff;
+
+ while (i < MAX_INSN_SIZE)
+ insn_buf[i++] = random() & 0xff;
+
+ return i;
+}
+
+static void parse_args(int argc, char **argv)
+{
+ int c;
+ char *tmp = NULL;
+ int set_seed = 0;
+
+ prog = argv[0];
+ while ((c = getopt(argc, argv, "ynvs:m:i:")) != -1) {
+ switch (c) {
+ case 'y':
+ x86_64 = 1;
+ break;
+ case 'n':
+ x86_64 = 0;
+ break;
+ case 'v':
+ verbose++;
+ break;
+ case 'i':
+ if (strcmp("-", optarg) == 0)
+ input_file = stdin;
+ else
+ input_file = fopen(optarg, "r");
+ if (!input_file)
+ usage("Failed to open input file");
+ break;
+ case 's':
+ seed = (unsigned int)strtoul(optarg, &tmp, 0);
+ if (*tmp == ',') {
+ optarg = tmp + 1;
+ iter_start = strtoul(optarg, &tmp, 0);
+ }
+ if (*tmp != '\0' || tmp == optarg)
+ usage("Failed to parse seed");
+ set_seed = 1;
+ break;
+ case 'm':
+ iter_end = strtoul(optarg, &tmp, 0);
+ if (*tmp != '\0' || tmp == optarg)
+ usage("Failed to parse max_iter");
+ break;
+ default:
+ usage(NULL);
+ }
+ }
+
+ /* Check errors */
+ if (iter_end < iter_start)
+ usage("Max iteration number must be bigger than iter-num");
+
+ if (set_seed && input_file)
+ usage("Don't use input file (-i) with random seed (-s)");
+
+ /* Initialize random seed */
+ if (!input_file) {
+ if (!set_seed) /* No seed is given */
+ init_random_seed();
+ srand(seed);
+ }
+}
+
+int main(int argc, char **argv)
+{
+ struct insn insn;
+ int insns = 0;
+ int errors = 0;
+ unsigned long i;
+ unsigned char insn_buf[MAX_INSN_SIZE * 2];
+
+ parse_args(argc, argv);
+
+ /* Prepare stop bytes with NOPs */
+ memset(insn_buf + MAX_INSN_SIZE, INSN_NOP, MAX_INSN_SIZE);
+
+ for (i = 0; i < iter_end; i++) {
+ if (generate_insn(insn_buf) <= 0)
+ break;
+
+ if (i < iter_start) /* Skip to given iteration number */
+ continue;
+
+ /* Decode an instruction */
+ insn_init(&insn, insn_buf, sizeof(insn_buf), x86_64);
+ insn_get_length(&insn);
+
+ if (insn.next_byte <= insn.kaddr ||
+ insn.kaddr + MAX_INSN_SIZE < insn.next_byte) {
+ /* Access out-of-range memory */
+ dump_stream(stderr, "Error: Found an access violation", i, insn_buf, &insn);
+ errors++;
+ } else if (verbose && !insn_complete(&insn))
+ dump_stream(stdout, "Info: Found an undecodable input", i, insn_buf, &insn);
+ else if (verbose >= 2)
+ dump_insn(stdout, &insn);
+ insns++;
+ }
+
+ fprintf((errors) ? stderr : stdout,
+ "%s: %s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n",
+ prog,
+ (errors) ? "Failure" : "Success",
+ insns,
+ (input_file) ? "given" : "random",
+ errors,
+ seed);
+
+ return errors ? 1 : 0;
+}
diff --git a/arch/x86/tools/objdump_reformat.awk b/arch/x86/tools/objdump_reformat.awk
new file mode 100644
index 000000000..f418c91b7
--- /dev/null
+++ b/arch/x86/tools/objdump_reformat.awk
@@ -0,0 +1,48 @@
+#!/bin/awk -f
+# SPDX-License-Identifier: GPL-2.0
+# Usage: objdump -d a.out | awk -f objdump_reformat.awk | ./insn_decoder_test
+# Reformats the disassembly as follows:
+# - Removes all lines except the disassembled instructions.
+# - For instructions that exceed 1 line (7 bytes), crams all the hex bytes
+# into a single line.
+# - Remove bad(or prefix only) instructions
+
+BEGIN {
+ prev_addr = ""
+ prev_hex = ""
+ prev_mnemonic = ""
+ bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))"
+ fwait_expr = "^9b "
+ fwait_str="9b\tfwait"
+}
+
+/^ *[0-9a-f]+ <[^>]*>:/ {
+ # Symbol entry
+ printf("%s%s\n", $2, $1)
+}
+
+/^ *[0-9a-f]+:/ {
+ if (split($0, field, "\t") < 3) {
+ # This is a continuation of the same insn.
+ prev_hex = prev_hex field[2]
+ } else {
+ # Skip bad instructions
+ if (match(prev_mnemonic, bad_expr))
+ prev_addr = ""
+ # Split fwait from other f* instructions
+ if (match(prev_hex, fwait_expr) && prev_mnemonic != "fwait") {
+ printf "%s\t%s\n", prev_addr, fwait_str
+ sub(fwait_expr, "", prev_hex)
+ }
+ if (prev_addr != "")
+ printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
+ prev_addr = field[1]
+ prev_hex = field[2]
+ prev_mnemonic = field[3]
+ }
+}
+
+END {
+ if (prev_addr != "")
+ printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
+}
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
new file mode 100644
index 000000000..aa046d46f
--- /dev/null
+++ b/arch/x86/tools/relocs.c
@@ -0,0 +1,1110 @@
+// SPDX-License-Identifier: GPL-2.0
+/* This is included from relocs_32/64.c */
+
+#define ElfW(type) _ElfW(ELF_BITS, type)
+#define _ElfW(bits, type) __ElfW(bits, type)
+#define __ElfW(bits, type) Elf##bits##_##type
+
+#define Elf_Addr ElfW(Addr)
+#define Elf_Ehdr ElfW(Ehdr)
+#define Elf_Phdr ElfW(Phdr)
+#define Elf_Shdr ElfW(Shdr)
+#define Elf_Sym ElfW(Sym)
+
+static Elf_Ehdr ehdr;
+
+struct relocs {
+ uint32_t *offset;
+ unsigned long count;
+ unsigned long size;
+};
+
+static struct relocs relocs16;
+static struct relocs relocs32;
+#if ELF_BITS == 64
+static struct relocs relocs32neg;
+static struct relocs relocs64;
+#endif
+
+struct section {
+ Elf_Shdr shdr;
+ struct section *link;
+ Elf_Sym *symtab;
+ Elf_Rel *reltab;
+ char *strtab;
+};
+static struct section *secs;
+
+static const char * const sym_regex_kernel[S_NSYMTYPES] = {
+/*
+ * Following symbols have been audited. There values are constant and do
+ * not change if bzImage is loaded at a different physical address than
+ * the address for which it has been compiled. Don't warn user about
+ * absolute relocations present w.r.t these symbols.
+ */
+ [S_ABS] =
+ "^(xen_irq_disable_direct_reloc$|"
+ "xen_save_fl_direct_reloc$|"
+ "VDSO|"
+ "__crc_)",
+
+/*
+ * These symbols are known to be relative, even if the linker marks them
+ * as absolute (typically defined outside any section in the linker script.)
+ */
+ [S_REL] =
+ "^(__init_(begin|end)|"
+ "__x86_cpu_dev_(start|end)|"
+ "(__parainstructions|__alt_instructions)(|_end)|"
+ "(__iommu_table|__apicdrivers|__smp_locks)(|_end)|"
+ "__(start|end)_pci_.*|"
+ "__(start|end)_builtin_fw|"
+ "__(start|stop)___ksymtab(|_gpl|_unused|_unused_gpl|_gpl_future)|"
+ "__(start|stop)___kcrctab(|_gpl|_unused|_unused_gpl|_gpl_future)|"
+ "__(start|stop)___param|"
+ "__(start|stop)___modver|"
+ "__(start|stop)___bug_table|"
+ "__tracedata_(start|end)|"
+ "__(start|stop)_notes|"
+ "__end_rodata|"
+ "__end_rodata_aligned|"
+ "__initramfs_start|"
+ "(jiffies|jiffies_64)|"
+#if ELF_BITS == 64
+ "__per_cpu_load|"
+ "init_per_cpu__.*|"
+ "__end_rodata_hpage_align|"
+#endif
+ "__vvar_page|"
+ "_end)$"
+};
+
+
+static const char * const sym_regex_realmode[S_NSYMTYPES] = {
+/*
+ * These symbols are known to be relative, even if the linker marks them
+ * as absolute (typically defined outside any section in the linker script.)
+ */
+ [S_REL] =
+ "^pa_",
+
+/*
+ * These are 16-bit segment symbols when compiling 16-bit code.
+ */
+ [S_SEG] =
+ "^real_mode_seg$",
+
+/*
+ * These are offsets belonging to segments, as opposed to linear addresses,
+ * when compiling 16-bit code.
+ */
+ [S_LIN] =
+ "^pa_",
+};
+
+static const char * const *sym_regex;
+
+static regex_t sym_regex_c[S_NSYMTYPES];
+static int is_reloc(enum symtype type, const char *sym_name)
+{
+ return sym_regex[type] &&
+ !regexec(&sym_regex_c[type], sym_name, 0, NULL, 0);
+}
+
+static void regex_init(int use_real_mode)
+{
+ char errbuf[128];
+ int err;
+ int i;
+
+ if (use_real_mode)
+ sym_regex = sym_regex_realmode;
+ else
+ sym_regex = sym_regex_kernel;
+
+ for (i = 0; i < S_NSYMTYPES; i++) {
+ if (!sym_regex[i])
+ continue;
+
+ err = regcomp(&sym_regex_c[i], sym_regex[i],
+ REG_EXTENDED|REG_NOSUB);
+
+ if (err) {
+ regerror(err, &sym_regex_c[i], errbuf, sizeof errbuf);
+ die("%s", errbuf);
+ }
+ }
+}
+
+static const char *sym_type(unsigned type)
+{
+ static const char *type_name[] = {
+#define SYM_TYPE(X) [X] = #X
+ SYM_TYPE(STT_NOTYPE),
+ SYM_TYPE(STT_OBJECT),
+ SYM_TYPE(STT_FUNC),
+ SYM_TYPE(STT_SECTION),
+ SYM_TYPE(STT_FILE),
+ SYM_TYPE(STT_COMMON),
+ SYM_TYPE(STT_TLS),
+#undef SYM_TYPE
+ };
+ const char *name = "unknown sym type name";
+ if (type < ARRAY_SIZE(type_name)) {
+ name = type_name[type];
+ }
+ return name;
+}
+
+static const char *sym_bind(unsigned bind)
+{
+ static const char *bind_name[] = {
+#define SYM_BIND(X) [X] = #X
+ SYM_BIND(STB_LOCAL),
+ SYM_BIND(STB_GLOBAL),
+ SYM_BIND(STB_WEAK),
+#undef SYM_BIND
+ };
+ const char *name = "unknown sym bind name";
+ if (bind < ARRAY_SIZE(bind_name)) {
+ name = bind_name[bind];
+ }
+ return name;
+}
+
+static const char *sym_visibility(unsigned visibility)
+{
+ static const char *visibility_name[] = {
+#define SYM_VISIBILITY(X) [X] = #X
+ SYM_VISIBILITY(STV_DEFAULT),
+ SYM_VISIBILITY(STV_INTERNAL),
+ SYM_VISIBILITY(STV_HIDDEN),
+ SYM_VISIBILITY(STV_PROTECTED),
+#undef SYM_VISIBILITY
+ };
+ const char *name = "unknown sym visibility name";
+ if (visibility < ARRAY_SIZE(visibility_name)) {
+ name = visibility_name[visibility];
+ }
+ return name;
+}
+
+static const char *rel_type(unsigned type)
+{
+ static const char *type_name[] = {
+#define REL_TYPE(X) [X] = #X
+#if ELF_BITS == 64
+ REL_TYPE(R_X86_64_NONE),
+ REL_TYPE(R_X86_64_64),
+ REL_TYPE(R_X86_64_PC32),
+ REL_TYPE(R_X86_64_GOT32),
+ REL_TYPE(R_X86_64_PLT32),
+ REL_TYPE(R_X86_64_COPY),
+ REL_TYPE(R_X86_64_GLOB_DAT),
+ REL_TYPE(R_X86_64_JUMP_SLOT),
+ REL_TYPE(R_X86_64_RELATIVE),
+ REL_TYPE(R_X86_64_GOTPCREL),
+ REL_TYPE(R_X86_64_32),
+ REL_TYPE(R_X86_64_32S),
+ REL_TYPE(R_X86_64_16),
+ REL_TYPE(R_X86_64_PC16),
+ REL_TYPE(R_X86_64_8),
+ REL_TYPE(R_X86_64_PC8),
+#else
+ REL_TYPE(R_386_NONE),
+ REL_TYPE(R_386_32),
+ REL_TYPE(R_386_PC32),
+ REL_TYPE(R_386_GOT32),
+ REL_TYPE(R_386_PLT32),
+ REL_TYPE(R_386_COPY),
+ REL_TYPE(R_386_GLOB_DAT),
+ REL_TYPE(R_386_JMP_SLOT),
+ REL_TYPE(R_386_RELATIVE),
+ REL_TYPE(R_386_GOTOFF),
+ REL_TYPE(R_386_GOTPC),
+ REL_TYPE(R_386_8),
+ REL_TYPE(R_386_PC8),
+ REL_TYPE(R_386_16),
+ REL_TYPE(R_386_PC16),
+#endif
+#undef REL_TYPE
+ };
+ const char *name = "unknown type rel type name";
+ if (type < ARRAY_SIZE(type_name) && type_name[type]) {
+ name = type_name[type];
+ }
+ return name;
+}
+
+static const char *sec_name(unsigned shndx)
+{
+ const char *sec_strtab;
+ const char *name;
+ sec_strtab = secs[ehdr.e_shstrndx].strtab;
+ name = "<noname>";
+ if (shndx < ehdr.e_shnum) {
+ name = sec_strtab + secs[shndx].shdr.sh_name;
+ }
+ else if (shndx == SHN_ABS) {
+ name = "ABSOLUTE";
+ }
+ else if (shndx == SHN_COMMON) {
+ name = "COMMON";
+ }
+ return name;
+}
+
+static const char *sym_name(const char *sym_strtab, Elf_Sym *sym)
+{
+ const char *name;
+ name = "<noname>";
+ if (sym->st_name) {
+ name = sym_strtab + sym->st_name;
+ }
+ else {
+ name = sec_name(sym->st_shndx);
+ }
+ return name;
+}
+
+static Elf_Sym *sym_lookup(const char *symname)
+{
+ int i;
+ for (i = 0; i < ehdr.e_shnum; i++) {
+ struct section *sec = &secs[i];
+ long nsyms;
+ char *strtab;
+ Elf_Sym *symtab;
+ Elf_Sym *sym;
+
+ if (sec->shdr.sh_type != SHT_SYMTAB)
+ continue;
+
+ nsyms = sec->shdr.sh_size/sizeof(Elf_Sym);
+ symtab = sec->symtab;
+ strtab = sec->link->strtab;
+
+ for (sym = symtab; --nsyms >= 0; sym++) {
+ if (!sym->st_name)
+ continue;
+ if (strcmp(symname, strtab + sym->st_name) == 0)
+ return sym;
+ }
+ }
+ return 0;
+}
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define le16_to_cpu(val) (val)
+#define le32_to_cpu(val) (val)
+#define le64_to_cpu(val) (val)
+#endif
+#if BYTE_ORDER == BIG_ENDIAN
+#define le16_to_cpu(val) bswap_16(val)
+#define le32_to_cpu(val) bswap_32(val)
+#define le64_to_cpu(val) bswap_64(val)
+#endif
+
+static uint16_t elf16_to_cpu(uint16_t val)
+{
+ return le16_to_cpu(val);
+}
+
+static uint32_t elf32_to_cpu(uint32_t val)
+{
+ return le32_to_cpu(val);
+}
+
+#define elf_half_to_cpu(x) elf16_to_cpu(x)
+#define elf_word_to_cpu(x) elf32_to_cpu(x)
+
+#if ELF_BITS == 64
+static uint64_t elf64_to_cpu(uint64_t val)
+{
+ return le64_to_cpu(val);
+}
+#define elf_addr_to_cpu(x) elf64_to_cpu(x)
+#define elf_off_to_cpu(x) elf64_to_cpu(x)
+#define elf_xword_to_cpu(x) elf64_to_cpu(x)
+#else
+#define elf_addr_to_cpu(x) elf32_to_cpu(x)
+#define elf_off_to_cpu(x) elf32_to_cpu(x)
+#define elf_xword_to_cpu(x) elf32_to_cpu(x)
+#endif
+
+static void read_ehdr(FILE *fp)
+{
+ if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) {
+ die("Cannot read ELF header: %s\n",
+ strerror(errno));
+ }
+ if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) {
+ die("No ELF magic\n");
+ }
+ if (ehdr.e_ident[EI_CLASS] != ELF_CLASS) {
+ die("Not a %d bit executable\n", ELF_BITS);
+ }
+ if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) {
+ die("Not a LSB ELF executable\n");
+ }
+ if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) {
+ die("Unknown ELF version\n");
+ }
+ /* Convert the fields to native endian */
+ ehdr.e_type = elf_half_to_cpu(ehdr.e_type);
+ ehdr.e_machine = elf_half_to_cpu(ehdr.e_machine);
+ ehdr.e_version = elf_word_to_cpu(ehdr.e_version);
+ ehdr.e_entry = elf_addr_to_cpu(ehdr.e_entry);
+ ehdr.e_phoff = elf_off_to_cpu(ehdr.e_phoff);
+ ehdr.e_shoff = elf_off_to_cpu(ehdr.e_shoff);
+ ehdr.e_flags = elf_word_to_cpu(ehdr.e_flags);
+ ehdr.e_ehsize = elf_half_to_cpu(ehdr.e_ehsize);
+ ehdr.e_phentsize = elf_half_to_cpu(ehdr.e_phentsize);
+ ehdr.e_phnum = elf_half_to_cpu(ehdr.e_phnum);
+ ehdr.e_shentsize = elf_half_to_cpu(ehdr.e_shentsize);
+ ehdr.e_shnum = elf_half_to_cpu(ehdr.e_shnum);
+ ehdr.e_shstrndx = elf_half_to_cpu(ehdr.e_shstrndx);
+
+ if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) {
+ die("Unsupported ELF header type\n");
+ }
+ if (ehdr.e_machine != ELF_MACHINE) {
+ die("Not for %s\n", ELF_MACHINE_NAME);
+ }
+ if (ehdr.e_version != EV_CURRENT) {
+ die("Unknown ELF version\n");
+ }
+ if (ehdr.e_ehsize != sizeof(Elf_Ehdr)) {
+ die("Bad Elf header size\n");
+ }
+ if (ehdr.e_phentsize != sizeof(Elf_Phdr)) {
+ die("Bad program header entry\n");
+ }
+ if (ehdr.e_shentsize != sizeof(Elf_Shdr)) {
+ die("Bad section header entry\n");
+ }
+ if (ehdr.e_shstrndx >= ehdr.e_shnum) {
+ die("String table index out of bounds\n");
+ }
+}
+
+static void read_shdrs(FILE *fp)
+{
+ int i;
+ Elf_Shdr shdr;
+
+ secs = calloc(ehdr.e_shnum, sizeof(struct section));
+ if (!secs) {
+ die("Unable to allocate %d section headers\n",
+ ehdr.e_shnum);
+ }
+ if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) {
+ die("Seek to %d failed: %s\n",
+ ehdr.e_shoff, strerror(errno));
+ }
+ for (i = 0; i < ehdr.e_shnum; i++) {
+ struct section *sec = &secs[i];
+ if (fread(&shdr, sizeof shdr, 1, fp) != 1)
+ die("Cannot read ELF section headers %d/%d: %s\n",
+ i, ehdr.e_shnum, strerror(errno));
+ sec->shdr.sh_name = elf_word_to_cpu(shdr.sh_name);
+ sec->shdr.sh_type = elf_word_to_cpu(shdr.sh_type);
+ sec->shdr.sh_flags = elf_xword_to_cpu(shdr.sh_flags);
+ sec->shdr.sh_addr = elf_addr_to_cpu(shdr.sh_addr);
+ sec->shdr.sh_offset = elf_off_to_cpu(shdr.sh_offset);
+ sec->shdr.sh_size = elf_xword_to_cpu(shdr.sh_size);
+ sec->shdr.sh_link = elf_word_to_cpu(shdr.sh_link);
+ sec->shdr.sh_info = elf_word_to_cpu(shdr.sh_info);
+ sec->shdr.sh_addralign = elf_xword_to_cpu(shdr.sh_addralign);
+ sec->shdr.sh_entsize = elf_xword_to_cpu(shdr.sh_entsize);
+ if (sec->shdr.sh_link < ehdr.e_shnum)
+ sec->link = &secs[sec->shdr.sh_link];
+ }
+
+}
+
+static void read_strtabs(FILE *fp)
+{
+ int i;
+ for (i = 0; i < ehdr.e_shnum; i++) {
+ struct section *sec = &secs[i];
+ if (sec->shdr.sh_type != SHT_STRTAB) {
+ continue;
+ }
+ sec->strtab = malloc(sec->shdr.sh_size);
+ if (!sec->strtab) {
+ die("malloc of %d bytes for strtab failed\n",
+ sec->shdr.sh_size);
+ }
+ if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
+ die("Seek to %d failed: %s\n",
+ sec->shdr.sh_offset, strerror(errno));
+ }
+ if (fread(sec->strtab, 1, sec->shdr.sh_size, fp)
+ != sec->shdr.sh_size) {
+ die("Cannot read symbol table: %s\n",
+ strerror(errno));
+ }
+ }
+}
+
+static void read_symtabs(FILE *fp)
+{
+ int i,j;
+ for (i = 0; i < ehdr.e_shnum; i++) {
+ struct section *sec = &secs[i];
+ if (sec->shdr.sh_type != SHT_SYMTAB) {
+ continue;
+ }
+ sec->symtab = malloc(sec->shdr.sh_size);
+ if (!sec->symtab) {
+ die("malloc of %d bytes for symtab failed\n",
+ sec->shdr.sh_size);
+ }
+ if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
+ die("Seek to %d failed: %s\n",
+ sec->shdr.sh_offset, strerror(errno));
+ }
+ if (fread(sec->symtab, 1, sec->shdr.sh_size, fp)
+ != sec->shdr.sh_size) {
+ die("Cannot read symbol table: %s\n",
+ strerror(errno));
+ }
+ for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Sym); j++) {
+ Elf_Sym *sym = &sec->symtab[j];
+ sym->st_name = elf_word_to_cpu(sym->st_name);
+ sym->st_value = elf_addr_to_cpu(sym->st_value);
+ sym->st_size = elf_xword_to_cpu(sym->st_size);
+ sym->st_shndx = elf_half_to_cpu(sym->st_shndx);
+ }
+ }
+}
+
+
+static void read_relocs(FILE *fp)
+{
+ int i,j;
+ for (i = 0; i < ehdr.e_shnum; i++) {
+ struct section *sec = &secs[i];
+ if (sec->shdr.sh_type != SHT_REL_TYPE) {
+ continue;
+ }
+ sec->reltab = malloc(sec->shdr.sh_size);
+ if (!sec->reltab) {
+ die("malloc of %d bytes for relocs failed\n",
+ sec->shdr.sh_size);
+ }
+ if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
+ die("Seek to %d failed: %s\n",
+ sec->shdr.sh_offset, strerror(errno));
+ }
+ if (fread(sec->reltab, 1, sec->shdr.sh_size, fp)
+ != sec->shdr.sh_size) {
+ die("Cannot read symbol table: %s\n",
+ strerror(errno));
+ }
+ for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Rel); j++) {
+ Elf_Rel *rel = &sec->reltab[j];
+ rel->r_offset = elf_addr_to_cpu(rel->r_offset);
+ rel->r_info = elf_xword_to_cpu(rel->r_info);
+#if (SHT_REL_TYPE == SHT_RELA)
+ rel->r_addend = elf_xword_to_cpu(rel->r_addend);
+#endif
+ }
+ }
+}
+
+
+static void print_absolute_symbols(void)
+{
+ int i;
+ const char *format;
+
+ if (ELF_BITS == 64)
+ format = "%5d %016"PRIx64" %5"PRId64" %10s %10s %12s %s\n";
+ else
+ format = "%5d %08"PRIx32" %5"PRId32" %10s %10s %12s %s\n";
+
+ printf("Absolute symbols\n");
+ printf(" Num: Value Size Type Bind Visibility Name\n");
+ for (i = 0; i < ehdr.e_shnum; i++) {
+ struct section *sec = &secs[i];
+ char *sym_strtab;
+ int j;
+
+ if (sec->shdr.sh_type != SHT_SYMTAB) {
+ continue;
+ }
+ sym_strtab = sec->link->strtab;
+ for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Sym); j++) {
+ Elf_Sym *sym;
+ const char *name;
+ sym = &sec->symtab[j];
+ name = sym_name(sym_strtab, sym);
+ if (sym->st_shndx != SHN_ABS) {
+ continue;
+ }
+ printf(format,
+ j, sym->st_value, sym->st_size,
+ sym_type(ELF_ST_TYPE(sym->st_info)),
+ sym_bind(ELF_ST_BIND(sym->st_info)),
+ sym_visibility(ELF_ST_VISIBILITY(sym->st_other)),
+ name);
+ }
+ }
+ printf("\n");
+}
+
+static void print_absolute_relocs(void)
+{
+ int i, printed = 0;
+ const char *format;
+
+ if (ELF_BITS == 64)
+ format = "%016"PRIx64" %016"PRIx64" %10s %016"PRIx64" %s\n";
+ else
+ format = "%08"PRIx32" %08"PRIx32" %10s %08"PRIx32" %s\n";
+
+ for (i = 0; i < ehdr.e_shnum; i++) {
+ struct section *sec = &secs[i];
+ struct section *sec_applies, *sec_symtab;
+ char *sym_strtab;
+ Elf_Sym *sh_symtab;
+ int j;
+ if (sec->shdr.sh_type != SHT_REL_TYPE) {
+ continue;
+ }
+ sec_symtab = sec->link;
+ sec_applies = &secs[sec->shdr.sh_info];
+ if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
+ continue;
+ }
+ sh_symtab = sec_symtab->symtab;
+ sym_strtab = sec_symtab->link->strtab;
+ for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Rel); j++) {
+ Elf_Rel *rel;
+ Elf_Sym *sym;
+ const char *name;
+ rel = &sec->reltab[j];
+ sym = &sh_symtab[ELF_R_SYM(rel->r_info)];
+ name = sym_name(sym_strtab, sym);
+ if (sym->st_shndx != SHN_ABS) {
+ continue;
+ }
+
+ /* Absolute symbols are not relocated if bzImage is
+ * loaded at a non-compiled address. Display a warning
+ * to user at compile time about the absolute
+ * relocations present.
+ *
+ * User need to audit the code to make sure
+ * some symbols which should have been section
+ * relative have not become absolute because of some
+ * linker optimization or wrong programming usage.
+ *
+ * Before warning check if this absolute symbol
+ * relocation is harmless.
+ */
+ if (is_reloc(S_ABS, name) || is_reloc(S_REL, name))
+ continue;
+
+ if (!printed) {
+ printf("WARNING: Absolute relocations"
+ " present\n");
+ printf("Offset Info Type Sym.Value "
+ "Sym.Name\n");
+ printed = 1;
+ }
+
+ printf(format,
+ rel->r_offset,
+ rel->r_info,
+ rel_type(ELF_R_TYPE(rel->r_info)),
+ sym->st_value,
+ name);
+ }
+ }
+
+ if (printed)
+ printf("\n");
+}
+
+static void add_reloc(struct relocs *r, uint32_t offset)
+{
+ if (r->count == r->size) {
+ unsigned long newsize = r->size + 50000;
+ void *mem = realloc(r->offset, newsize * sizeof(r->offset[0]));
+
+ if (!mem)
+ die("realloc of %ld entries for relocs failed\n",
+ newsize);
+ r->offset = mem;
+ r->size = newsize;
+ }
+ r->offset[r->count++] = offset;
+}
+
+static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel,
+ Elf_Sym *sym, const char *symname))
+{
+ int i;
+ /* Walk through the relocations */
+ for (i = 0; i < ehdr.e_shnum; i++) {
+ char *sym_strtab;
+ Elf_Sym *sh_symtab;
+ struct section *sec_applies, *sec_symtab;
+ int j;
+ struct section *sec = &secs[i];
+
+ if (sec->shdr.sh_type != SHT_REL_TYPE) {
+ continue;
+ }
+ sec_symtab = sec->link;
+ sec_applies = &secs[sec->shdr.sh_info];
+ if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
+ continue;
+ }
+ sh_symtab = sec_symtab->symtab;
+ sym_strtab = sec_symtab->link->strtab;
+ for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Rel); j++) {
+ Elf_Rel *rel = &sec->reltab[j];
+ Elf_Sym *sym = &sh_symtab[ELF_R_SYM(rel->r_info)];
+ const char *symname = sym_name(sym_strtab, sym);
+
+ process(sec, rel, sym, symname);
+ }
+ }
+}
+
+/*
+ * The .data..percpu section is a special case for x86_64 SMP kernels.
+ * It is used to initialize the actual per_cpu areas and to provide
+ * definitions for the per_cpu variables that correspond to their offsets
+ * within the percpu area. Since the values of all of the symbols need
+ * to be offsets from the start of the per_cpu area the virtual address
+ * (sh_addr) of .data..percpu is 0 in SMP kernels.
+ *
+ * This means that:
+ *
+ * Relocations that reference symbols in the per_cpu area do not
+ * need further relocation (since the value is an offset relative
+ * to the start of the per_cpu area that does not change).
+ *
+ * Relocations that apply to the per_cpu area need to have their
+ * offset adjusted by by the value of __per_cpu_load to make them
+ * point to the correct place in the loaded image (because the
+ * virtual address of .data..percpu is 0).
+ *
+ * For non SMP kernels .data..percpu is linked as part of the normal
+ * kernel data and does not require special treatment.
+ *
+ */
+static int per_cpu_shndx = -1;
+static Elf_Addr per_cpu_load_addr;
+
+static void percpu_init(void)
+{
+ int i;
+ for (i = 0; i < ehdr.e_shnum; i++) {
+ ElfW(Sym) *sym;
+ if (strcmp(sec_name(i), ".data..percpu"))
+ continue;
+
+ if (secs[i].shdr.sh_addr != 0) /* non SMP kernel */
+ return;
+
+ sym = sym_lookup("__per_cpu_load");
+ if (!sym)
+ die("can't find __per_cpu_load\n");
+
+ per_cpu_shndx = i;
+ per_cpu_load_addr = sym->st_value;
+ return;
+ }
+}
+
+#if ELF_BITS == 64
+
+/*
+ * Check to see if a symbol lies in the .data..percpu section.
+ *
+ * The linker incorrectly associates some symbols with the
+ * .data..percpu section so we also need to check the symbol
+ * name to make sure that we classify the symbol correctly.
+ *
+ * The GNU linker incorrectly associates:
+ * __init_begin
+ * __per_cpu_load
+ *
+ * The "gold" linker incorrectly associates:
+ * init_per_cpu__irq_stack_union
+ * init_per_cpu__gdt_page
+ */
+static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
+{
+ return (sym->st_shndx == per_cpu_shndx) &&
+ strcmp(symname, "__init_begin") &&
+ strcmp(symname, "__per_cpu_load") &&
+ strncmp(symname, "init_per_cpu_", 13);
+}
+
+
+static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
+ const char *symname)
+{
+ unsigned r_type = ELF64_R_TYPE(rel->r_info);
+ ElfW(Addr) offset = rel->r_offset;
+ int shn_abs = (sym->st_shndx == SHN_ABS) && !is_reloc(S_REL, symname);
+
+ if (sym->st_shndx == SHN_UNDEF)
+ return 0;
+
+ /*
+ * Adjust the offset if this reloc applies to the percpu section.
+ */
+ if (sec->shdr.sh_info == per_cpu_shndx)
+ offset += per_cpu_load_addr;
+
+ switch (r_type) {
+ case R_X86_64_NONE:
+ /* NONE can be ignored. */
+ break;
+
+ case R_X86_64_PC32:
+ case R_X86_64_PLT32:
+ /*
+ * PC relative relocations don't need to be adjusted unless
+ * referencing a percpu symbol.
+ *
+ * NB: R_X86_64_PLT32 can be treated as R_X86_64_PC32.
+ */
+ if (is_percpu_sym(sym, symname))
+ add_reloc(&relocs32neg, offset);
+ break;
+
+ case R_X86_64_32:
+ case R_X86_64_32S:
+ case R_X86_64_64:
+ /*
+ * References to the percpu area don't need to be adjusted.
+ */
+ if (is_percpu_sym(sym, symname))
+ break;
+
+ if (shn_abs) {
+ /*
+ * Whitelisted absolute symbols do not require
+ * relocation.
+ */
+ if (is_reloc(S_ABS, symname))
+ break;
+
+ die("Invalid absolute %s relocation: %s\n",
+ rel_type(r_type), symname);
+ break;
+ }
+
+ /*
+ * Relocation offsets for 64 bit kernels are output
+ * as 32 bits and sign extended back to 64 bits when
+ * the relocations are processed.
+ * Make sure that the offset will fit.
+ */
+ if ((int32_t)offset != (int64_t)offset)
+ die("Relocation offset doesn't fit in 32 bits\n");
+
+ if (r_type == R_X86_64_64)
+ add_reloc(&relocs64, offset);
+ else
+ add_reloc(&relocs32, offset);
+ break;
+
+ default:
+ die("Unsupported relocation type: %s (%d)\n",
+ rel_type(r_type), r_type);
+ break;
+ }
+
+ return 0;
+}
+
+#else
+
+static int do_reloc32(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
+ const char *symname)
+{
+ unsigned r_type = ELF32_R_TYPE(rel->r_info);
+ int shn_abs = (sym->st_shndx == SHN_ABS) && !is_reloc(S_REL, symname);
+
+ switch (r_type) {
+ case R_386_NONE:
+ case R_386_PC32:
+ case R_386_PC16:
+ case R_386_PC8:
+ case R_386_PLT32:
+ /*
+ * NONE can be ignored and PC relative relocations don't need
+ * to be adjusted. Because sym must be defined, R_386_PLT32 can
+ * be treated the same way as R_386_PC32.
+ */
+ break;
+
+ case R_386_32:
+ if (shn_abs) {
+ /*
+ * Whitelisted absolute symbols do not require
+ * relocation.
+ */
+ if (is_reloc(S_ABS, symname))
+ break;
+
+ die("Invalid absolute %s relocation: %s\n",
+ rel_type(r_type), symname);
+ break;
+ }
+
+ add_reloc(&relocs32, rel->r_offset);
+ break;
+
+ default:
+ die("Unsupported relocation type: %s (%d)\n",
+ rel_type(r_type), r_type);
+ break;
+ }
+
+ return 0;
+}
+
+static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
+ const char *symname)
+{
+ unsigned r_type = ELF32_R_TYPE(rel->r_info);
+ int shn_abs = (sym->st_shndx == SHN_ABS) && !is_reloc(S_REL, symname);
+
+ switch (r_type) {
+ case R_386_NONE:
+ case R_386_PC32:
+ case R_386_PC16:
+ case R_386_PC8:
+ case R_386_PLT32:
+ /*
+ * NONE can be ignored and PC relative relocations don't need
+ * to be adjusted. Because sym must be defined, R_386_PLT32 can
+ * be treated the same way as R_386_PC32.
+ */
+ break;
+
+ case R_386_16:
+ if (shn_abs) {
+ /*
+ * Whitelisted absolute symbols do not require
+ * relocation.
+ */
+ if (is_reloc(S_ABS, symname))
+ break;
+
+ if (is_reloc(S_SEG, symname)) {
+ add_reloc(&relocs16, rel->r_offset);
+ break;
+ }
+ } else {
+ if (!is_reloc(S_LIN, symname))
+ break;
+ }
+ die("Invalid %s %s relocation: %s\n",
+ shn_abs ? "absolute" : "relative",
+ rel_type(r_type), symname);
+ break;
+
+ case R_386_32:
+ if (shn_abs) {
+ /*
+ * Whitelisted absolute symbols do not require
+ * relocation.
+ */
+ if (is_reloc(S_ABS, symname))
+ break;
+
+ if (is_reloc(S_REL, symname)) {
+ add_reloc(&relocs32, rel->r_offset);
+ break;
+ }
+ } else {
+ if (is_reloc(S_LIN, symname))
+ add_reloc(&relocs32, rel->r_offset);
+ break;
+ }
+ die("Invalid %s %s relocation: %s\n",
+ shn_abs ? "absolute" : "relative",
+ rel_type(r_type), symname);
+ break;
+
+ default:
+ die("Unsupported relocation type: %s (%d)\n",
+ rel_type(r_type), r_type);
+ break;
+ }
+
+ return 0;
+}
+
+#endif
+
+static int cmp_relocs(const void *va, const void *vb)
+{
+ const uint32_t *a, *b;
+ a = va; b = vb;
+ return (*a == *b)? 0 : (*a > *b)? 1 : -1;
+}
+
+static void sort_relocs(struct relocs *r)
+{
+ qsort(r->offset, r->count, sizeof(r->offset[0]), cmp_relocs);
+}
+
+static int write32(uint32_t v, FILE *f)
+{
+ unsigned char buf[4];
+
+ put_unaligned_le32(v, buf);
+ return fwrite(buf, 1, 4, f) == 4 ? 0 : -1;
+}
+
+static int write32_as_text(uint32_t v, FILE *f)
+{
+ return fprintf(f, "\t.long 0x%08"PRIx32"\n", v) > 0 ? 0 : -1;
+}
+
+static void emit_relocs(int as_text, int use_real_mode)
+{
+ int i;
+ int (*write_reloc)(uint32_t, FILE *) = write32;
+ int (*do_reloc)(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
+ const char *symname);
+
+#if ELF_BITS == 64
+ if (!use_real_mode)
+ do_reloc = do_reloc64;
+ else
+ die("--realmode not valid for a 64-bit ELF file");
+#else
+ if (!use_real_mode)
+ do_reloc = do_reloc32;
+ else
+ do_reloc = do_reloc_real;
+#endif
+
+ /* Collect up the relocations */
+ walk_relocs(do_reloc);
+
+ if (relocs16.count && !use_real_mode)
+ die("Segment relocations found but --realmode not specified\n");
+
+ /* Order the relocations for more efficient processing */
+ sort_relocs(&relocs32);
+#if ELF_BITS == 64
+ sort_relocs(&relocs32neg);
+ sort_relocs(&relocs64);
+#else
+ sort_relocs(&relocs16);
+#endif
+
+ /* Print the relocations */
+ if (as_text) {
+ /* Print the relocations in a form suitable that
+ * gas will like.
+ */
+ printf(".section \".data.reloc\",\"a\"\n");
+ printf(".balign 4\n");
+ write_reloc = write32_as_text;
+ }
+
+ if (use_real_mode) {
+ write_reloc(relocs16.count, stdout);
+ for (i = 0; i < relocs16.count; i++)
+ write_reloc(relocs16.offset[i], stdout);
+
+ write_reloc(relocs32.count, stdout);
+ for (i = 0; i < relocs32.count; i++)
+ write_reloc(relocs32.offset[i], stdout);
+ } else {
+#if ELF_BITS == 64
+ /* Print a stop */
+ write_reloc(0, stdout);
+
+ /* Now print each relocation */
+ for (i = 0; i < relocs64.count; i++)
+ write_reloc(relocs64.offset[i], stdout);
+
+ /* Print a stop */
+ write_reloc(0, stdout);
+
+ /* Now print each inverse 32-bit relocation */
+ for (i = 0; i < relocs32neg.count; i++)
+ write_reloc(relocs32neg.offset[i], stdout);
+#endif
+
+ /* Print a stop */
+ write_reloc(0, stdout);
+
+ /* Now print each relocation */
+ for (i = 0; i < relocs32.count; i++)
+ write_reloc(relocs32.offset[i], stdout);
+ }
+}
+
+/*
+ * As an aid to debugging problems with different linkers
+ * print summary information about the relocs.
+ * Since different linkers tend to emit the sections in
+ * different orders we use the section names in the output.
+ */
+static int do_reloc_info(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
+ const char *symname)
+{
+ printf("%s\t%s\t%s\t%s\n",
+ sec_name(sec->shdr.sh_info),
+ rel_type(ELF_R_TYPE(rel->r_info)),
+ symname,
+ sec_name(sym->st_shndx));
+ return 0;
+}
+
+static void print_reloc_info(void)
+{
+ printf("reloc section\treloc type\tsymbol\tsymbol section\n");
+ walk_relocs(do_reloc_info);
+}
+
+#if ELF_BITS == 64
+# define process process_64
+#else
+# define process process_32
+#endif
+
+void process(FILE *fp, int use_real_mode, int as_text,
+ int show_absolute_syms, int show_absolute_relocs,
+ int show_reloc_info)
+{
+ regex_init(use_real_mode);
+ read_ehdr(fp);
+ read_shdrs(fp);
+ read_strtabs(fp);
+ read_symtabs(fp);
+ read_relocs(fp);
+ if (ELF_BITS == 64)
+ percpu_init();
+ if (show_absolute_syms) {
+ print_absolute_symbols();
+ return;
+ }
+ if (show_absolute_relocs) {
+ print_absolute_relocs();
+ return;
+ }
+ if (show_reloc_info) {
+ print_reloc_info();
+ return;
+ }
+ emit_relocs(as_text, use_real_mode);
+}
diff --git a/arch/x86/tools/relocs.h b/arch/x86/tools/relocs.h
new file mode 100644
index 000000000..43c83c0fd
--- /dev/null
+++ b/arch/x86/tools/relocs.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef RELOCS_H
+#define RELOCS_H
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <elf.h>
+#include <byteswap.h>
+#define USE_BSD
+#include <endian.h>
+#include <regex.h>
+#include <tools/le_byteshift.h>
+
+void die(char *fmt, ...) __attribute__((noreturn));
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+enum symtype {
+ S_ABS,
+ S_REL,
+ S_SEG,
+ S_LIN,
+ S_NSYMTYPES
+};
+
+void process_32(FILE *fp, int use_real_mode, int as_text,
+ int show_absolute_syms, int show_absolute_relocs,
+ int show_reloc_info);
+void process_64(FILE *fp, int use_real_mode, int as_text,
+ int show_absolute_syms, int show_absolute_relocs,
+ int show_reloc_info);
+#endif /* RELOCS_H */
diff --git a/arch/x86/tools/relocs_32.c b/arch/x86/tools/relocs_32.c
new file mode 100644
index 000000000..9442ff78b
--- /dev/null
+++ b/arch/x86/tools/relocs_32.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "relocs.h"
+
+#define ELF_BITS 32
+
+#define ELF_MACHINE EM_386
+#define ELF_MACHINE_NAME "i386"
+#define SHT_REL_TYPE SHT_REL
+#define Elf_Rel ElfW(Rel)
+
+#define ELF_CLASS ELFCLASS32
+#define ELF_R_SYM(val) ELF32_R_SYM(val)
+#define ELF_R_TYPE(val) ELF32_R_TYPE(val)
+#define ELF_ST_TYPE(o) ELF32_ST_TYPE(o)
+#define ELF_ST_BIND(o) ELF32_ST_BIND(o)
+#define ELF_ST_VISIBILITY(o) ELF32_ST_VISIBILITY(o)
+
+#include "relocs.c"
diff --git a/arch/x86/tools/relocs_64.c b/arch/x86/tools/relocs_64.c
new file mode 100644
index 000000000..9029cb619
--- /dev/null
+++ b/arch/x86/tools/relocs_64.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "relocs.h"
+
+#define ELF_BITS 64
+
+#define ELF_MACHINE EM_X86_64
+#define ELF_MACHINE_NAME "x86_64"
+#define SHT_REL_TYPE SHT_RELA
+#define Elf_Rel Elf64_Rela
+
+#define ELF_CLASS ELFCLASS64
+#define ELF_R_SYM(val) ELF64_R_SYM(val)
+#define ELF_R_TYPE(val) ELF64_R_TYPE(val)
+#define ELF_ST_TYPE(o) ELF64_ST_TYPE(o)
+#define ELF_ST_BIND(o) ELF64_ST_BIND(o)
+#define ELF_ST_VISIBILITY(o) ELF64_ST_VISIBILITY(o)
+
+#include "relocs.c"
diff --git a/arch/x86/tools/relocs_common.c b/arch/x86/tools/relocs_common.c
new file mode 100644
index 000000000..6634352a2
--- /dev/null
+++ b/arch/x86/tools/relocs_common.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "relocs.h"
+
+void die(char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ vfprintf(stderr, fmt, ap);
+ va_end(ap);
+ exit(1);
+}
+
+static void usage(void)
+{
+ die("relocs [--abs-syms|--abs-relocs|--reloc-info|--text|--realmode]" \
+ " vmlinux\n");
+}
+
+int main(int argc, char **argv)
+{
+ int show_absolute_syms, show_absolute_relocs, show_reloc_info;
+ int as_text, use_real_mode;
+ const char *fname;
+ FILE *fp;
+ int i;
+ unsigned char e_ident[EI_NIDENT];
+
+ show_absolute_syms = 0;
+ show_absolute_relocs = 0;
+ show_reloc_info = 0;
+ as_text = 0;
+ use_real_mode = 0;
+ fname = NULL;
+ for (i = 1; i < argc; i++) {
+ char *arg = argv[i];
+ if (*arg == '-') {
+ if (strcmp(arg, "--abs-syms") == 0) {
+ show_absolute_syms = 1;
+ continue;
+ }
+ if (strcmp(arg, "--abs-relocs") == 0) {
+ show_absolute_relocs = 1;
+ continue;
+ }
+ if (strcmp(arg, "--reloc-info") == 0) {
+ show_reloc_info = 1;
+ continue;
+ }
+ if (strcmp(arg, "--text") == 0) {
+ as_text = 1;
+ continue;
+ }
+ if (strcmp(arg, "--realmode") == 0) {
+ use_real_mode = 1;
+ continue;
+ }
+ }
+ else if (!fname) {
+ fname = arg;
+ continue;
+ }
+ usage();
+ }
+ if (!fname) {
+ usage();
+ }
+ fp = fopen(fname, "r");
+ if (!fp) {
+ die("Cannot open %s: %s\n", fname, strerror(errno));
+ }
+ if (fread(&e_ident, 1, EI_NIDENT, fp) != EI_NIDENT) {
+ die("Cannot read %s: %s", fname, strerror(errno));
+ }
+ rewind(fp);
+ if (e_ident[EI_CLASS] == ELFCLASS64)
+ process_64(fp, use_real_mode, as_text,
+ show_absolute_syms, show_absolute_relocs,
+ show_reloc_info);
+ else
+ process_32(fp, use_real_mode, as_text,
+ show_absolute_syms, show_absolute_relocs,
+ show_reloc_info);
+ fclose(fp);
+ return 0;
+}
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
new file mode 100644
index 000000000..f518b4744
--- /dev/null
+++ b/arch/x86/um/Kconfig
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: GPL-2.0
+
+menu "Host processor type and features"
+
+source "arch/x86/Kconfig.cpu"
+
+endmenu
+
+config UML_X86
+ def_bool y
+ select GENERIC_FIND_FIRST_BIT
+
+config 64BIT
+ bool "64-bit kernel" if "$(SUBARCH)" = "x86"
+ default "$(SUBARCH)" != "i386"
+
+config X86_32
+ def_bool !64BIT
+ select HAVE_AOUT
+ select ARCH_WANT_IPC_PARSE_VERSION
+ select MODULES_USE_ELF_REL
+ select CLONE_BACKWARDS
+ select OLD_SIGSUSPEND3
+ select OLD_SIGACTION
+
+config X86_64
+ def_bool 64BIT
+ select MODULES_USE_ELF_RELA
+
+config ARCH_DEFCONFIG
+ string
+ default "arch/um/configs/i386_defconfig" if X86_32
+ default "arch/um/configs/x86_64_defconfig" if X86_64
+
+config RWSEM_XCHGADD_ALGORITHM
+ def_bool 64BIT
+
+config RWSEM_GENERIC_SPINLOCK
+ def_bool !RWSEM_XCHGADD_ALGORITHM
+
+config 3_LEVEL_PGTABLES
+ bool "Three-level pagetables" if !64BIT
+ default 64BIT
+ help
+ Three-level pagetables will let UML have more than 4G of physical
+ memory. All the memory that can't be mapped directly will be treated
+ as high memory.
+
+ However, this it experimental on 32-bit architectures, so if unsure say
+ N (on x86-64 it's automatically enabled, instead, as it's safe there).
+
+config ARCH_HAS_SC_SIGNALS
+ def_bool !64BIT
+
+config ARCH_REUSE_HOST_VSYSCALL_AREA
+ def_bool !64BIT
+
+config GENERIC_HWEIGHT
+ def_bool y
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
new file mode 100644
index 000000000..c2d3d7c51
--- /dev/null
+++ b/arch/x86/um/Makefile
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+#
+
+ifeq ($(CONFIG_X86_32),y)
+ BITS := 32
+else
+ BITS := 64
+endif
+
+obj-y = bugs_$(BITS).o delay.o fault.o ldt.o \
+ ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal.o \
+ stub_$(BITS).o stub_segv.o \
+ sys_call_table_$(BITS).o sysrq_$(BITS).o tls_$(BITS).o \
+ mem_$(BITS).o subarch.o os-$(OS)/
+
+ifeq ($(CONFIG_X86_32),y)
+
+obj-y += checksum_32.o syscalls_32.o
+obj-$(CONFIG_ELF_CORE) += elfcore.o
+
+subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
+subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o
+
+else
+
+obj-y += syscalls_64.o vdso/
+
+subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../entry/thunk_64.o \
+ ../lib/rwsem.o
+
+endif
+
+subarch-$(CONFIG_MODULES) += ../kernel/module.o
+
+USER_OBJS := bugs_$(BITS).o ptrace_user.o fault.o
+
+extra-y += user-offsets.s
+$(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) \
+ -Iarch/x86/include/generated
+
+UNPROFILE_OBJS := stub_segv.o
+CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING)
+
+include arch/um/scripts/Makefile.rules
diff --git a/arch/x86/um/asm/apic.h b/arch/x86/um/asm/apic.h
new file mode 100644
index 000000000..876dee84a
--- /dev/null
+++ b/arch/x86/um/asm/apic.h
@@ -0,0 +1,4 @@
+#ifndef __UM_APIC_H
+#define __UM_APIC_H
+
+#endif
diff --git a/arch/x86/um/asm/arch_hweight.h b/arch/x86/um/asm/arch_hweight.h
new file mode 100644
index 000000000..0d2d5fbb3
--- /dev/null
+++ b/arch/x86/um/asm/arch_hweight.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_UM_HWEIGHT_H
+#define _ASM_UM_HWEIGHT_H
+
+#include <asm-generic/bitops/arch_hweight.h>
+
+#endif
diff --git a/arch/x86/um/asm/archparam.h b/arch/x86/um/asm/archparam.h
new file mode 100644
index 000000000..c17cf68dd
--- /dev/null
+++ b/arch/x86/um/asm/archparam.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) 2000 - 2003 Jeff Dike (jdike@addtoit.com)
+ * Copyright 2003 PathScale, Inc.
+ * Licensed under the GPL
+ */
+
+#ifndef __UM_ARCHPARAM_H
+#define __UM_ARCHPARAM_H
+
+#ifdef CONFIG_X86_32
+
+#ifdef CONFIG_X86_PAE
+#define LAST_PKMAP 512
+#else
+#define LAST_PKMAP 1024
+#endif
+
+#endif
+
+#endif
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
new file mode 100644
index 000000000..f31e5d903
--- /dev/null
+++ b/arch/x86/um/asm/barrier.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_UM_BARRIER_H_
+#define _ASM_UM_BARRIER_H_
+
+#include <asm/asm.h>
+#include <asm/segment.h>
+#include <asm/cpufeatures.h>
+#include <asm/cmpxchg.h>
+#include <asm/nops.h>
+
+#include <linux/kernel.h>
+#include <linux/irqflags.h>
+
+/*
+ * Force strict CPU ordering.
+ * And yes, this is required on UP too when we're talking
+ * to devices.
+ */
+#ifdef CONFIG_X86_32
+
+#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
+#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
+#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
+
+#else /* CONFIG_X86_32 */
+
+#define mb() asm volatile("mfence" : : : "memory")
+#define rmb() asm volatile("lfence" : : : "memory")
+#define wmb() asm volatile("sfence" : : : "memory")
+
+#endif /* CONFIG_X86_32 */
+
+#define dma_rmb() barrier()
+#define dma_wmb() barrier()
+
+#include <asm-generic/barrier.h>
+
+#endif
diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h
new file mode 100644
index 000000000..2a56cac64
--- /dev/null
+++ b/arch/x86/um/asm/checksum.h
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __UM_CHECKSUM_H
+#define __UM_CHECKSUM_H
+
+#include <linux/string.h>
+#include <linux/in6.h>
+#include <linux/uaccess.h>
+
+/*
+ * computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * this function must be called with even lengths, except
+ * for the last fragment, which may be odd
+ *
+ * it's best to have buff aligned on a 32-bit boundary
+ */
+extern __wsum csum_partial(const void *buff, int len, __wsum sum);
+
+/*
+ * Note: when you get a NULL pointer exception here this means someone
+ * passed in an incorrect kernel address to one of these functions.
+ *
+ * If you use these functions directly please don't forget the
+ * access_ok().
+ */
+
+static __inline__
+__wsum csum_partial_copy_nocheck(const void *src, void *dst,
+ int len, __wsum sum)
+{
+ memcpy(dst, src, len);
+ return csum_partial(dst, len, sum);
+}
+
+/*
+ * the same as csum_partial, but copies from src while it
+ * checksums, and handles user-space pointer exceptions correctly, when needed.
+ *
+ * here even more important to align src and dst on a 32-bit (or even
+ * better 64-bit) boundary
+ */
+
+static __inline__
+__wsum csum_partial_copy_from_user(const void __user *src, void *dst,
+ int len, __wsum sum, int *err_ptr)
+{
+ if (copy_from_user(dst, src, len)) {
+ *err_ptr = -EFAULT;
+ return (__force __wsum)-1;
+ }
+
+ return csum_partial(dst, len, sum);
+}
+
+/**
+ * csum_fold - Fold and invert a 32bit checksum.
+ * sum: 32bit unfolded sum
+ *
+ * Fold a 32bit running checksum to 16bit and invert it. This is usually
+ * the last step before putting a checksum into a packet.
+ * Make sure not to mix with 64bit checksums.
+ */
+static inline __sum16 csum_fold(__wsum sum)
+{
+ __asm__(
+ " addl %1,%0\n"
+ " adcl $0xffff,%0"
+ : "=r" (sum)
+ : "r" ((__force u32)sum << 16),
+ "0" ((__force u32)sum & 0xffff0000)
+ );
+ return (__force __sum16)(~(__force u32)sum >> 16);
+}
+
+/**
+ * csum_tcpup_nofold - Compute an IPv4 pseudo header checksum.
+ * @saddr: source address
+ * @daddr: destination address
+ * @len: length of packet
+ * @proto: ip protocol of packet
+ * @sum: initial sum to be added in (32bit unfolded)
+ *
+ * Returns the pseudo header checksum the input data. Result is
+ * 32bit unfolded.
+ */
+static inline __wsum
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+ __u8 proto, __wsum sum)
+{
+ asm(" addl %1, %0\n"
+ " adcl %2, %0\n"
+ " adcl %3, %0\n"
+ " adcl $0, %0\n"
+ : "=r" (sum)
+ : "g" (daddr), "g" (saddr), "g" ((len + proto) << 8), "0" (sum));
+ return sum;
+}
+
+/*
+ * computes the checksum of the TCP/UDP pseudo-header
+ * returns a 16-bit checksum, already complemented
+ */
+static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
+ __u32 len, __u8 proto,
+ __wsum sum)
+{
+ return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum));
+}
+
+/**
+ * ip_fast_csum - Compute the IPv4 header checksum efficiently.
+ * iph: ipv4 header
+ * ihl: length of header / 4
+ */
+static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
+{
+ unsigned int sum;
+
+ asm( " movl (%1), %0\n"
+ " subl $4, %2\n"
+ " jbe 2f\n"
+ " addl 4(%1), %0\n"
+ " adcl 8(%1), %0\n"
+ " adcl 12(%1), %0\n"
+ "1: adcl 16(%1), %0\n"
+ " lea 4(%1), %1\n"
+ " decl %2\n"
+ " jne 1b\n"
+ " adcl $0, %0\n"
+ " movl %0, %2\n"
+ " shrl $16, %0\n"
+ " addw %w2, %w0\n"
+ " adcl $0, %0\n"
+ " notl %0\n"
+ "2:"
+ /* Since the input registers which are loaded with iph and ipl
+ are modified, we must also specify them as outputs, or gcc
+ will assume they contain their original values. */
+ : "=r" (sum), "=r" (iph), "=r" (ihl)
+ : "1" (iph), "2" (ihl)
+ : "memory");
+ return (__force __sum16)sum;
+}
+
+#ifdef CONFIG_X86_32
+# include "checksum_32.h"
+#else
+# include "checksum_64.h"
+#endif
+
+#endif
diff --git a/arch/x86/um/asm/checksum_32.h b/arch/x86/um/asm/checksum_32.h
new file mode 100644
index 000000000..83a75f8a1
--- /dev/null
+++ b/arch/x86/um/asm/checksum_32.h
@@ -0,0 +1,61 @@
+/*
+ * Licensed under the GPL
+ */
+
+#ifndef __UM_SYSDEP_CHECKSUM_H
+#define __UM_SYSDEP_CHECKSUM_H
+
+static inline __sum16 ip_compute_csum(const void *buff, int len)
+{
+ return csum_fold (csum_partial(buff, len, 0));
+}
+
+#define _HAVE_ARCH_IPV6_CSUM
+static __inline__ __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __u32 len, __u8 proto,
+ __wsum sum)
+{
+ __asm__(
+ "addl 0(%1), %0 ;\n"
+ "adcl 4(%1), %0 ;\n"
+ "adcl 8(%1), %0 ;\n"
+ "adcl 12(%1), %0 ;\n"
+ "adcl 0(%2), %0 ;\n"
+ "adcl 4(%2), %0 ;\n"
+ "adcl 8(%2), %0 ;\n"
+ "adcl 12(%2), %0 ;\n"
+ "adcl %3, %0 ;\n"
+ "adcl %4, %0 ;\n"
+ "adcl $0, %0 ;\n"
+ : "=&r" (sum)
+ : "r" (saddr), "r" (daddr),
+ "r"(htonl(len)), "r"(htonl(proto)), "0"(sum));
+
+ return csum_fold(sum);
+}
+
+/*
+ * Copy and checksum to user
+ */
+#define HAVE_CSUM_COPY_USER
+static __inline__ __wsum csum_and_copy_to_user(const void *src,
+ void __user *dst,
+ int len, __wsum sum, int *err_ptr)
+{
+ if (access_ok(VERIFY_WRITE, dst, len)) {
+ if (copy_to_user(dst, src, len)) {
+ *err_ptr = -EFAULT;
+ return (__force __wsum)-1;
+ }
+
+ return csum_partial(src, len, sum);
+ }
+
+ if (len)
+ *err_ptr = -EFAULT;
+
+ return (__force __wsum)-1; /* invalid checksum */
+}
+
+#endif
diff --git a/arch/x86/um/asm/checksum_64.h b/arch/x86/um/asm/checksum_64.h
new file mode 100644
index 000000000..7b6cd1921
--- /dev/null
+++ b/arch/x86/um/asm/checksum_64.h
@@ -0,0 +1,19 @@
+/*
+ * Licensed under the GPL
+ */
+
+#ifndef __UM_SYSDEP_CHECKSUM_H
+#define __UM_SYSDEP_CHECKSUM_H
+
+static inline unsigned add32_with_carry(unsigned a, unsigned b)
+{
+ asm("addl %2,%0\n\t"
+ "adcl $0,%0"
+ : "=r" (a)
+ : "0" (a), "r" (b));
+ return a;
+}
+
+extern __sum16 ip_compute_csum(const void *buff, int len);
+
+#endif
diff --git a/arch/x86/um/asm/desc.h b/arch/x86/um/asm/desc.h
new file mode 100644
index 000000000..34de4e93d
--- /dev/null
+++ b/arch/x86/um/asm/desc.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __UM_DESC_H
+#define __UM_DESC_H
+
+/* Taken from asm-i386/desc.h, it's the only thing we need. The rest wouldn't
+ * compile, and has never been used. */
+#define LDT_empty(info) (\
+ (info)->base_addr == 0 && \
+ (info)->limit == 0 && \
+ (info)->contents == 0 && \
+ (info)->read_exec_only == 1 && \
+ (info)->seg_32bit == 0 && \
+ (info)->limit_in_pages == 0 && \
+ (info)->seg_not_present == 1 && \
+ (info)->useable == 0 )
+
+#endif
diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
new file mode 100644
index 000000000..548197212
--- /dev/null
+++ b/arch/x86/um/asm/elf.h
@@ -0,0 +1,218 @@
+/*
+ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+#ifndef __UM_ELF_X86_H
+#define __UM_ELF_X86_H
+
+#include <asm/user.h>
+#include <skas.h>
+
+#ifdef CONFIG_X86_32
+
+#define R_386_NONE 0
+#define R_386_32 1
+#define R_386_PC32 2
+#define R_386_GOT32 3
+#define R_386_PLT32 4
+#define R_386_COPY 5
+#define R_386_GLOB_DAT 6
+#define R_386_JMP_SLOT 7
+#define R_386_RELATIVE 8
+#define R_386_GOTOFF 9
+#define R_386_GOTPC 10
+#define R_386_NUM 11
+
+/*
+ * This is used to ensure we don't load something for the wrong architecture.
+ */
+#define elf_check_arch(x) \
+ (((x)->e_machine == EM_386) || ((x)->e_machine == EM_486))
+
+#define ELF_CLASS ELFCLASS32
+#define ELF_DATA ELFDATA2LSB
+#define ELF_ARCH EM_386
+
+#define ELF_PLAT_INIT(regs, load_addr) do { \
+ PT_REGS_BX(regs) = 0; \
+ PT_REGS_CX(regs) = 0; \
+ PT_REGS_DX(regs) = 0; \
+ PT_REGS_SI(regs) = 0; \
+ PT_REGS_DI(regs) = 0; \
+ PT_REGS_BP(regs) = 0; \
+ PT_REGS_AX(regs) = 0; \
+} while (0)
+
+/* Shamelessly stolen from include/asm-i386/elf.h */
+
+#define ELF_CORE_COPY_REGS(pr_reg, regs) do { \
+ pr_reg[0] = PT_REGS_BX(regs); \
+ pr_reg[1] = PT_REGS_CX(regs); \
+ pr_reg[2] = PT_REGS_DX(regs); \
+ pr_reg[3] = PT_REGS_SI(regs); \
+ pr_reg[4] = PT_REGS_DI(regs); \
+ pr_reg[5] = PT_REGS_BP(regs); \
+ pr_reg[6] = PT_REGS_AX(regs); \
+ pr_reg[7] = PT_REGS_DS(regs); \
+ pr_reg[8] = PT_REGS_ES(regs); \
+ /* fake once used fs and gs selectors? */ \
+ pr_reg[9] = PT_REGS_DS(regs); \
+ pr_reg[10] = PT_REGS_DS(regs); \
+ pr_reg[11] = PT_REGS_SYSCALL_NR(regs); \
+ pr_reg[12] = PT_REGS_IP(regs); \
+ pr_reg[13] = PT_REGS_CS(regs); \
+ pr_reg[14] = PT_REGS_EFLAGS(regs); \
+ pr_reg[15] = PT_REGS_SP(regs); \
+ pr_reg[16] = PT_REGS_SS(regs); \
+} while (0);
+
+extern char * elf_aux_platform;
+#define ELF_PLATFORM (elf_aux_platform)
+
+extern unsigned long vsyscall_ehdr;
+extern unsigned long vsyscall_end;
+extern unsigned long __kernel_vsyscall;
+
+/*
+ * This is the range that is readable by user mode, and things
+ * acting like user mode such as get_user_pages.
+ */
+#define FIXADDR_USER_START vsyscall_ehdr
+#define FIXADDR_USER_END vsyscall_end
+
+
+/*
+ * Architecture-neutral AT_ values in 0-17, leave some room
+ * for more of them, start the x86-specific ones at 32.
+ */
+#define AT_SYSINFO 32
+#define AT_SYSINFO_EHDR 33
+
+#define ARCH_DLINFO \
+do { \
+ if ( vsyscall_ehdr ) { \
+ NEW_AUX_ENT(AT_SYSINFO, __kernel_vsyscall); \
+ NEW_AUX_ENT(AT_SYSINFO_EHDR, vsyscall_ehdr); \
+ } \
+} while (0)
+
+#else
+
+/* x86-64 relocation types, taken from asm-x86_64/elf.h */
+#define R_X86_64_NONE 0 /* No reloc */
+#define R_X86_64_64 1 /* Direct 64 bit */
+#define R_X86_64_PC32 2 /* PC relative 32 bit signed */
+#define R_X86_64_GOT32 3 /* 32 bit GOT entry */
+#define R_X86_64_PLT32 4 /* 32 bit PLT address */
+#define R_X86_64_COPY 5 /* Copy symbol at runtime */
+#define R_X86_64_GLOB_DAT 6 /* Create GOT entry */
+#define R_X86_64_JUMP_SLOT 7 /* Create PLT entry */
+#define R_X86_64_RELATIVE 8 /* Adjust by program base */
+#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative
+ offset to GOT */
+#define R_X86_64_32 10 /* Direct 32 bit zero extended */
+#define R_X86_64_32S 11 /* Direct 32 bit sign extended */
+#define R_X86_64_16 12 /* Direct 16 bit zero extended */
+#define R_X86_64_PC16 13 /* 16 bit sign extended pc relative */
+#define R_X86_64_8 14 /* Direct 8 bit sign extended */
+#define R_X86_64_PC8 15 /* 8 bit sign extended pc relative */
+
+#define R_X86_64_NUM 16
+
+/*
+ * This is used to ensure we don't load something for the wrong architecture.
+ */
+#define elf_check_arch(x) \
+ ((x)->e_machine == EM_X86_64)
+
+#define ELF_CLASS ELFCLASS64
+#define ELF_DATA ELFDATA2LSB
+#define ELF_ARCH EM_X86_64
+
+#define ELF_PLAT_INIT(regs, load_addr) do { \
+ PT_REGS_BX(regs) = 0; \
+ PT_REGS_CX(regs) = 0; \
+ PT_REGS_DX(regs) = 0; \
+ PT_REGS_SI(regs) = 0; \
+ PT_REGS_DI(regs) = 0; \
+ PT_REGS_BP(regs) = 0; \
+ PT_REGS_AX(regs) = 0; \
+ PT_REGS_R8(regs) = 0; \
+ PT_REGS_R9(regs) = 0; \
+ PT_REGS_R10(regs) = 0; \
+ PT_REGS_R11(regs) = 0; \
+ PT_REGS_R12(regs) = 0; \
+ PT_REGS_R13(regs) = 0; \
+ PT_REGS_R14(regs) = 0; \
+ PT_REGS_R15(regs) = 0; \
+} while (0)
+
+#define ELF_CORE_COPY_REGS(pr_reg, _regs) \
+ (pr_reg)[0] = (_regs)->regs.gp[0]; \
+ (pr_reg)[1] = (_regs)->regs.gp[1]; \
+ (pr_reg)[2] = (_regs)->regs.gp[2]; \
+ (pr_reg)[3] = (_regs)->regs.gp[3]; \
+ (pr_reg)[4] = (_regs)->regs.gp[4]; \
+ (pr_reg)[5] = (_regs)->regs.gp[5]; \
+ (pr_reg)[6] = (_regs)->regs.gp[6]; \
+ (pr_reg)[7] = (_regs)->regs.gp[7]; \
+ (pr_reg)[8] = (_regs)->regs.gp[8]; \
+ (pr_reg)[9] = (_regs)->regs.gp[9]; \
+ (pr_reg)[10] = (_regs)->regs.gp[10]; \
+ (pr_reg)[11] = (_regs)->regs.gp[11]; \
+ (pr_reg)[12] = (_regs)->regs.gp[12]; \
+ (pr_reg)[13] = (_regs)->regs.gp[13]; \
+ (pr_reg)[14] = (_regs)->regs.gp[14]; \
+ (pr_reg)[15] = (_regs)->regs.gp[15]; \
+ (pr_reg)[16] = (_regs)->regs.gp[16]; \
+ (pr_reg)[17] = (_regs)->regs.gp[17]; \
+ (pr_reg)[18] = (_regs)->regs.gp[18]; \
+ (pr_reg)[19] = (_regs)->regs.gp[19]; \
+ (pr_reg)[20] = (_regs)->regs.gp[20]; \
+ (pr_reg)[21] = current->thread.arch.fs; \
+ (pr_reg)[22] = 0; \
+ (pr_reg)[23] = 0; \
+ (pr_reg)[24] = 0; \
+ (pr_reg)[25] = 0; \
+ (pr_reg)[26] = 0;
+
+#define ELF_PLATFORM "x86_64"
+
+/* No user-accessible fixmap addresses, i.e. vsyscall */
+#define FIXADDR_USER_START 0
+#define FIXADDR_USER_END 0
+
+#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
+struct linux_binprm;
+extern int arch_setup_additional_pages(struct linux_binprm *bprm,
+ int uses_interp);
+
+extern unsigned long um_vdso_addr;
+#define AT_SYSINFO_EHDR 33
+#define ARCH_DLINFO NEW_AUX_ENT(AT_SYSINFO_EHDR, um_vdso_addr)
+
+#endif
+
+typedef unsigned long elf_greg_t;
+
+#define ELF_NGREG (sizeof (struct user_regs_struct) / sizeof(elf_greg_t))
+typedef elf_greg_t elf_gregset_t[ELF_NGREG];
+
+typedef struct user_i387_struct elf_fpregset_t;
+
+struct task_struct;
+
+extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu);
+
+#define ELF_CORE_COPY_FPREGS(t, fpu) elf_core_copy_fpregs(t, fpu)
+
+#define ELF_EXEC_PAGESIZE 4096
+
+#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2)
+
+extern long elf_aux_hwcap;
+#define ELF_HWCAP (elf_aux_hwcap)
+
+#define SET_PERSONALITY(ex) do ; while(0)
+
+#endif
diff --git a/arch/x86/um/asm/irq_vectors.h b/arch/x86/um/asm/irq_vectors.h
new file mode 100644
index 000000000..272a81e0c
--- /dev/null
+++ b/arch/x86/um/asm/irq_vectors.h
@@ -0,0 +1,10 @@
+/*
+ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+ * Licensed under the GPL
+ */
+
+#ifndef __UM_IRQ_VECTORS_H
+#define __UM_IRQ_VECTORS_H
+
+#endif
+
diff --git a/arch/x86/um/asm/mm_context.h b/arch/x86/um/asm/mm_context.h
new file mode 100644
index 000000000..4a73d63e4
--- /dev/null
+++ b/arch/x86/um/asm/mm_context.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2004 Fujitsu Siemens Computers GmbH
+ * Licensed under the GPL
+ *
+ * Author: Bodo Stroesser <bstroesser@fujitsu-siemens.com>
+ */
+
+#ifndef __ASM_LDT_H
+#define __ASM_LDT_H
+
+#include <linux/mutex.h>
+#include <asm/ldt.h>
+
+extern void ldt_host_info(void);
+
+#define LDT_PAGES_MAX \
+ ((LDT_ENTRIES * LDT_ENTRY_SIZE)/PAGE_SIZE)
+#define LDT_ENTRIES_PER_PAGE \
+ (PAGE_SIZE/LDT_ENTRY_SIZE)
+#define LDT_DIRECT_ENTRIES \
+ ((LDT_PAGES_MAX*sizeof(void *))/LDT_ENTRY_SIZE)
+
+struct ldt_entry {
+ __u32 a;
+ __u32 b;
+};
+
+typedef struct uml_ldt {
+ int entry_count;
+ struct mutex lock;
+ union {
+ struct ldt_entry * pages[LDT_PAGES_MAX];
+ struct ldt_entry entries[LDT_DIRECT_ENTRIES];
+ } u;
+} uml_ldt_t;
+
+#define LDT_entry_a(info) \
+ ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
+
+#define LDT_entry_b(info) \
+ (((info)->base_addr & 0xff000000) | \
+ (((info)->base_addr & 0x00ff0000) >> 16) | \
+ ((info)->limit & 0xf0000) | \
+ (((info)->read_exec_only ^ 1) << 9) | \
+ ((info)->contents << 10) | \
+ (((info)->seg_not_present ^ 1) << 15) | \
+ ((info)->seg_32bit << 22) | \
+ ((info)->limit_in_pages << 23) | \
+ ((info)->useable << 20) | \
+ 0x7000)
+
+#define _LDT_empty(info) (\
+ (info)->base_addr == 0 && \
+ (info)->limit == 0 && \
+ (info)->contents == 0 && \
+ (info)->read_exec_only == 1 && \
+ (info)->seg_32bit == 0 && \
+ (info)->limit_in_pages == 0 && \
+ (info)->seg_not_present == 1 && \
+ (info)->useable == 0 )
+
+#ifdef CONFIG_X86_64
+#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
+#else
+#define LDT_empty(info) (_LDT_empty(info))
+#endif
+
+struct uml_arch_mm_context {
+ uml_ldt_t ldt;
+};
+
+#endif
diff --git a/arch/x86/um/asm/module.h b/arch/x86/um/asm/module.h
new file mode 100644
index 000000000..a3b061d66
--- /dev/null
+++ b/arch/x86/um/asm/module.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __UM_MODULE_H
+#define __UM_MODULE_H
+
+/* UML is simple */
+struct mod_arch_specific
+{
+};
+
+#ifdef CONFIG_X86_32
+
+#define Elf_Shdr Elf32_Shdr
+#define Elf_Sym Elf32_Sym
+#define Elf_Ehdr Elf32_Ehdr
+
+#else
+
+#define Elf_Shdr Elf64_Shdr
+#define Elf_Sym Elf64_Sym
+#define Elf_Ehdr Elf64_Ehdr
+
+#endif
+
+#endif
diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h
new file mode 100644
index 000000000..593d5f390
--- /dev/null
+++ b/arch/x86/um/asm/processor.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __UM_PROCESSOR_H
+#define __UM_PROCESSOR_H
+
+/* include faultinfo structure */
+#include <sysdep/faultinfo.h>
+
+#ifdef CONFIG_X86_32
+# include "processor_32.h"
+#else
+# include "processor_64.h"
+#endif
+
+#define KSTK_EIP(tsk) KSTK_REG(tsk, HOST_IP)
+#define KSTK_ESP(tsk) KSTK_REG(tsk, HOST_SP)
+#define KSTK_EBP(tsk) KSTK_REG(tsk, HOST_BP)
+
+#define ARCH_IS_STACKGROW(address) \
+ (address + 65536 + 32 * sizeof(unsigned long) >= UPT_SP(&current->thread.regs.regs))
+
+#include <asm/user.h>
+
+/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
+static inline void rep_nop(void)
+{
+ __asm__ __volatile__("rep;nop": : :"memory");
+}
+
+#define cpu_relax() rep_nop()
+
+#define task_pt_regs(t) (&(t)->thread.regs)
+
+#include <asm/processor-generic.h>
+
+#endif
diff --git a/arch/x86/um/asm/processor_32.h b/arch/x86/um/asm/processor_32.h
new file mode 100644
index 000000000..c112de81c
--- /dev/null
+++ b/arch/x86/um/asm/processor_32.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+ * Licensed under the GPL
+ */
+
+#ifndef __UM_PROCESSOR_I386_H
+#define __UM_PROCESSOR_I386_H
+
+#include <linux/string.h>
+#include <asm/segment.h>
+#include <asm/ldt.h>
+
+extern int host_has_cmov;
+
+struct uml_tls_struct {
+ struct user_desc tls;
+ unsigned flushed:1;
+ unsigned present:1;
+};
+
+struct arch_thread {
+ struct uml_tls_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
+ unsigned long debugregs[8];
+ int debugregs_seq;
+ struct faultinfo faultinfo;
+};
+
+#define INIT_ARCH_THREAD { \
+ .tls_array = { [ 0 ... GDT_ENTRY_TLS_ENTRIES - 1 ] = \
+ { .present = 0, .flushed = 0 } }, \
+ .debugregs = { [ 0 ... 7 ] = 0 }, \
+ .debugregs_seq = 0, \
+ .faultinfo = { 0, 0, 0 } \
+}
+
+#define STACKSLOTS_PER_LINE 8
+
+static inline void arch_flush_thread(struct arch_thread *thread)
+{
+ /* Clear any TLS still hanging */
+ memset(&thread->tls_array, 0, sizeof(thread->tls_array));
+}
+
+static inline void arch_copy_thread(struct arch_thread *from,
+ struct arch_thread *to)
+{
+ memcpy(&to->tls_array, &from->tls_array, sizeof(from->tls_array));
+}
+
+/*
+ * Default implementation of macro that returns current
+ * instruction pointer ("program counter"). Stolen
+ * from asm-i386/processor.h
+ */
+#define current_text_addr() \
+ ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; })
+
+#define current_sp() ({ void *sp; __asm__("movl %%esp, %0" : "=r" (sp) : ); sp; })
+#define current_bp() ({ unsigned long bp; __asm__("movl %%ebp, %0" : "=r" (bp) : ); bp; })
+
+#endif
diff --git a/arch/x86/um/asm/processor_64.h b/arch/x86/um/asm/processor_64.h
new file mode 100644
index 000000000..c3be85205
--- /dev/null
+++ b/arch/x86/um/asm/processor_64.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2003 PathScale, Inc.
+ *
+ * Licensed under the GPL
+ */
+
+#ifndef __UM_PROCESSOR_X86_64_H
+#define __UM_PROCESSOR_X86_64_H
+
+struct arch_thread {
+ unsigned long debugregs[8];
+ int debugregs_seq;
+ unsigned long fs;
+ struct faultinfo faultinfo;
+};
+
+#define INIT_ARCH_THREAD { .debugregs = { [ 0 ... 7 ] = 0 }, \
+ .debugregs_seq = 0, \
+ .fs = 0, \
+ .faultinfo = { 0, 0, 0 } }
+
+#define STACKSLOTS_PER_LINE 4
+
+static inline void arch_flush_thread(struct arch_thread *thread)
+{
+}
+
+static inline void arch_copy_thread(struct arch_thread *from,
+ struct arch_thread *to)
+{
+ to->fs = from->fs;
+}
+
+#define current_text_addr() \
+ ({ void *pc; __asm__("movq $1f,%0\n1:":"=g" (pc)); pc; })
+
+#define current_sp() ({ void *sp; __asm__("movq %%rsp, %0" : "=r" (sp) : ); sp; })
+#define current_bp() ({ unsigned long bp; __asm__("movq %%rbp, %0" : "=r" (bp) : ); bp; })
+
+#endif
diff --git a/arch/x86/um/asm/ptrace.h b/arch/x86/um/asm/ptrace.h
new file mode 100644
index 000000000..83822fd42
--- /dev/null
+++ b/arch/x86/um/asm/ptrace.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __UM_X86_PTRACE_H
+#define __UM_X86_PTRACE_H
+
+#include <linux/compiler.h>
+#ifndef CONFIG_X86_32
+#define __FRAME_OFFSETS /* Needed to get the R* macros */
+#endif
+#include <asm/ptrace-generic.h>
+
+#define user_mode(r) UPT_IS_USER(&(r)->regs)
+
+#define PT_REGS_AX(r) UPT_AX(&(r)->regs)
+#define PT_REGS_BX(r) UPT_BX(&(r)->regs)
+#define PT_REGS_CX(r) UPT_CX(&(r)->regs)
+#define PT_REGS_DX(r) UPT_DX(&(r)->regs)
+
+#define PT_REGS_SI(r) UPT_SI(&(r)->regs)
+#define PT_REGS_DI(r) UPT_DI(&(r)->regs)
+#define PT_REGS_BP(r) UPT_BP(&(r)->regs)
+#define PT_REGS_EFLAGS(r) UPT_EFLAGS(&(r)->regs)
+
+#define PT_REGS_CS(r) UPT_CS(&(r)->regs)
+#define PT_REGS_SS(r) UPT_SS(&(r)->regs)
+#define PT_REGS_DS(r) UPT_DS(&(r)->regs)
+#define PT_REGS_ES(r) UPT_ES(&(r)->regs)
+
+#define PT_REGS_ORIG_SYSCALL(r) PT_REGS_AX(r)
+#define PT_REGS_SYSCALL_RET(r) PT_REGS_AX(r)
+
+#define PT_FIX_EXEC_STACK(sp) do ; while(0)
+
+#define profile_pc(regs) PT_REGS_IP(regs)
+
+#define UPT_RESTART_SYSCALL(r) (UPT_IP(r) -= 2)
+#define PT_REGS_SET_SYSCALL_RETURN(r, res) (PT_REGS_AX(r) = (res))
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+ return PT_REGS_AX(regs);
+}
+
+/*
+ * Forward declaration to avoid including sysdep/tls.h, which causes a
+ * circular include, and compilation failures.
+ */
+struct user_desc;
+
+#ifdef CONFIG_X86_32
+
+extern int ptrace_get_thread_area(struct task_struct *child, int idx,
+ struct user_desc __user *user_desc);
+
+extern int ptrace_set_thread_area(struct task_struct *child, int idx,
+ struct user_desc __user *user_desc);
+
+#else
+
+#define PT_REGS_R8(r) UPT_R8(&(r)->regs)
+#define PT_REGS_R9(r) UPT_R9(&(r)->regs)
+#define PT_REGS_R10(r) UPT_R10(&(r)->regs)
+#define PT_REGS_R11(r) UPT_R11(&(r)->regs)
+#define PT_REGS_R12(r) UPT_R12(&(r)->regs)
+#define PT_REGS_R13(r) UPT_R13(&(r)->regs)
+#define PT_REGS_R14(r) UPT_R14(&(r)->regs)
+#define PT_REGS_R15(r) UPT_R15(&(r)->regs)
+
+#include <asm/errno.h>
+
+static inline int ptrace_get_thread_area(struct task_struct *child, int idx,
+ struct user_desc __user *user_desc)
+{
+ return -ENOSYS;
+}
+
+static inline int ptrace_set_thread_area(struct task_struct *child, int idx,
+ struct user_desc __user *user_desc)
+{
+ return -ENOSYS;
+}
+
+extern long arch_prctl(struct task_struct *task, int option,
+ unsigned long __user *addr);
+
+#endif
+#define user_stack_pointer(regs) PT_REGS_SP(regs)
+#endif /* __UM_X86_PTRACE_H */
diff --git a/arch/x86/um/asm/required-features.h b/arch/x86/um/asm/required-features.h
new file mode 100644
index 000000000..dfb967b2d
--- /dev/null
+++ b/arch/x86/um/asm/required-features.h
@@ -0,0 +1,9 @@
+#ifndef __UM_REQUIRED_FEATURES_H
+#define __UM_REQUIRED_FEATURES_H
+
+/*
+ * Nothing to see, just need something for the i386 and x86_64 asm
+ * headers to include.
+ */
+
+#endif
diff --git a/arch/x86/um/asm/segment.h b/arch/x86/um/asm/segment.h
new file mode 100644
index 000000000..453db3771
--- /dev/null
+++ b/arch/x86/um/asm/segment.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __UM_SEGMENT_H
+#define __UM_SEGMENT_H
+
+extern int host_gdt_entry_tls_min;
+
+#define GDT_ENTRY_TLS_ENTRIES 3
+#define GDT_ENTRY_TLS_MIN host_gdt_entry_tls_min
+#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
+
+typedef struct {
+ unsigned long seg;
+} mm_segment_t;
+
+#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
+#define KERNEL_DS MAKE_MM_SEG(~0UL)
+#define USER_DS MAKE_MM_SEG(TASK_SIZE)
+
+#endif
diff --git a/arch/x86/um/asm/syscall.h b/arch/x86/um/asm/syscall.h
new file mode 100644
index 000000000..ef898af10
--- /dev/null
+++ b/arch/x86/um/asm/syscall.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __UM_ASM_SYSCALL_H
+#define __UM_ASM_SYSCALL_H
+
+#include <asm/syscall-generic.h>
+#include <uapi/linux/audit.h>
+
+typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long,
+ unsigned long, unsigned long,
+ unsigned long, unsigned long);
+
+static inline int syscall_get_arch(void)
+{
+#ifdef CONFIG_X86_32
+ return AUDIT_ARCH_I386;
+#else
+ return AUDIT_ARCH_X86_64;
+#endif
+}
+
+#endif /* __UM_ASM_SYSCALL_H */
diff --git a/arch/x86/um/asm/vm-flags.h b/arch/x86/um/asm/vm-flags.h
new file mode 100644
index 000000000..7c297e9e2
--- /dev/null
+++ b/arch/x86/um/asm/vm-flags.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 2004 Jeff Dike (jdike@addtoit.com)
+ * Copyright 2003 PathScale, Inc.
+ * Licensed under the GPL
+ */
+
+#ifndef __VM_FLAGS_X86_H
+#define __VM_FLAGS_X86_H
+
+#ifdef CONFIG_X86_32
+
+#define VM_DATA_DEFAULT_FLAGS \
+ (VM_READ | VM_WRITE | \
+ ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
+ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
+#else
+
+#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \
+ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_STACK_DEFAULT_FLAGS (VM_GROWSDOWN | VM_READ | VM_WRITE | \
+ VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
+#endif
+#endif
diff --git a/arch/x86/um/bugs_32.c b/arch/x86/um/bugs_32.c
new file mode 100644
index 000000000..33daff4da
--- /dev/null
+++ b/arch/x86/um/bugs_32.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#include <signal.h>
+#include <kern_util.h>
+#include <longjmp.h>
+#include <sysdep/ptrace.h>
+#include <generated/asm-offsets.h>
+
+/* Set during early boot */
+static int host_has_cmov = 1;
+static jmp_buf cmov_test_return;
+
+static void cmov_sigill_test_handler(int sig)
+{
+ host_has_cmov = 0;
+ longjmp(cmov_test_return, 1);
+}
+
+void arch_check_bugs(void)
+{
+ struct sigaction old, new;
+
+ printk(UM_KERN_INFO "Checking for host processor cmov support...");
+ new.sa_handler = cmov_sigill_test_handler;
+
+ /* Make sure that SIGILL is enabled after the handler longjmps back */
+ new.sa_flags = SA_NODEFER;
+ sigemptyset(&new.sa_mask);
+ sigaction(SIGILL, &new, &old);
+
+ if (setjmp(cmov_test_return) == 0) {
+ unsigned long foo = 0;
+ __asm__ __volatile__("cmovz %0, %1" : "=r" (foo) : "0" (foo));
+ printk(UM_KERN_CONT "Yes\n");
+ } else
+ printk(UM_KERN_CONT "No\n");
+
+ sigaction(SIGILL, &old, &new);
+}
+
+void arch_examine_signal(int sig, struct uml_pt_regs *regs)
+{
+ unsigned char tmp[2];
+
+ /*
+ * This is testing for a cmov (0x0f 0x4x) instruction causing a
+ * SIGILL in init.
+ */
+ if ((sig != SIGILL) || (get_current_pid() != 1))
+ return;
+
+ if (copy_from_user_proc(tmp, (void *) UPT_IP(regs), 2)) {
+ printk(UM_KERN_ERR "SIGILL in init, could not read "
+ "instructions!\n");
+ return;
+ }
+
+ if ((tmp[0] != 0x0f) || ((tmp[1] & 0xf0) != 0x40))
+ return;
+
+ if (host_has_cmov == 0)
+ printk(UM_KERN_ERR "SIGILL caused by cmov, which this "
+ "processor doesn't implement. Boot a filesystem "
+ "compiled for older processors");
+ else if (host_has_cmov == 1)
+ printk(UM_KERN_ERR "SIGILL caused by cmov, which this "
+ "processor claims to implement");
+ else
+ printk(UM_KERN_ERR "Bad value for host_has_cmov (%d)",
+ host_has_cmov);
+}
diff --git a/arch/x86/um/bugs_64.c b/arch/x86/um/bugs_64.c
new file mode 100644
index 000000000..8cc8256c6
--- /dev/null
+++ b/arch/x86/um/bugs_64.c
@@ -0,0 +1,15 @@
+/*
+ * Copyright 2003 PathScale, Inc.
+ *
+ * Licensed under the GPL
+ */
+
+#include <sysdep/ptrace.h>
+
+void arch_check_bugs(void)
+{
+}
+
+void arch_examine_signal(int sig, struct uml_pt_regs *regs)
+{
+}
diff --git a/arch/x86/um/checksum_32.S b/arch/x86/um/checksum_32.S
new file mode 100644
index 000000000..b9933eb92
--- /dev/null
+++ b/arch/x86/um/checksum_32.S
@@ -0,0 +1,218 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IP/TCP/UDP checksumming routines
+ *
+ * Authors: Jorge Cwik, <jorge@laser.satlink.net>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Tom May, <ftom@netcom.com>
+ * Pentium Pro/II routines:
+ * Alexander Kjeldaas <astor@guardian.no>
+ * Finn Arne Gangstad <finnag@guardian.no>
+ * Lots of code moved from tcp.c and ip.c; see those files
+ * for more names.
+ *
+ * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
+ * handling.
+ * Andi Kleen, add zeroing on error
+ * converted to pure assembler
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/errno.h>
+#include <asm/asm.h>
+#include <asm/export.h>
+
+/*
+ * computes a partial checksum, e.g. for TCP/UDP fragments
+ */
+
+/*
+unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
+ */
+
+.text
+.align 4
+.globl csum_partial
+
+#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
+
+ /*
+ * Experiments with Ethernet and SLIP connections show that buff
+ * is aligned on either a 2-byte or 4-byte boundary. We get at
+ * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
+ * Fortunately, it is easy to convert 2-byte alignment to 4-byte
+ * alignment for the unrolled loop.
+ */
+csum_partial:
+ pushl %esi
+ pushl %ebx
+ movl 20(%esp),%eax # Function arg: unsigned int sum
+ movl 16(%esp),%ecx # Function arg: int len
+ movl 12(%esp),%esi # Function arg: unsigned char *buff
+ testl $2, %esi # Check alignment.
+ jz 2f # Jump if alignment is ok.
+ subl $2, %ecx # Alignment uses up two bytes.
+ jae 1f # Jump if we had at least two bytes.
+ addl $2, %ecx # ecx was < 2. Deal with it.
+ jmp 4f
+1: movw (%esi), %bx
+ addl $2, %esi
+ addw %bx, %ax
+ adcl $0, %eax
+2:
+ movl %ecx, %edx
+ shrl $5, %ecx
+ jz 2f
+ testl %esi, %esi
+1: movl (%esi), %ebx
+ adcl %ebx, %eax
+ movl 4(%esi), %ebx
+ adcl %ebx, %eax
+ movl 8(%esi), %ebx
+ adcl %ebx, %eax
+ movl 12(%esi), %ebx
+ adcl %ebx, %eax
+ movl 16(%esi), %ebx
+ adcl %ebx, %eax
+ movl 20(%esi), %ebx
+ adcl %ebx, %eax
+ movl 24(%esi), %ebx
+ adcl %ebx, %eax
+ movl 28(%esi), %ebx
+ adcl %ebx, %eax
+ lea 32(%esi), %esi
+ dec %ecx
+ jne 1b
+ adcl $0, %eax
+2: movl %edx, %ecx
+ andl $0x1c, %edx
+ je 4f
+ shrl $2, %edx # This clears CF
+3: adcl (%esi), %eax
+ lea 4(%esi), %esi
+ dec %edx
+ jne 3b
+ adcl $0, %eax
+4: andl $3, %ecx
+ jz 7f
+ cmpl $2, %ecx
+ jb 5f
+ movw (%esi),%cx
+ leal 2(%esi),%esi
+ je 6f
+ shll $16,%ecx
+5: movb (%esi),%cl
+6: addl %ecx,%eax
+ adcl $0, %eax
+7:
+ popl %ebx
+ popl %esi
+ ret
+
+#else
+
+/* Version for PentiumII/PPro */
+
+csum_partial:
+ pushl %esi
+ pushl %ebx
+ movl 20(%esp),%eax # Function arg: unsigned int sum
+ movl 16(%esp),%ecx # Function arg: int len
+ movl 12(%esp),%esi # Function arg: const unsigned char *buf
+
+ testl $2, %esi
+ jnz 30f
+10:
+ movl %ecx, %edx
+ movl %ecx, %ebx
+ andl $0x7c, %ebx
+ shrl $7, %ecx
+ addl %ebx,%esi
+ shrl $2, %ebx
+ negl %ebx
+ lea 45f(%ebx,%ebx,2), %ebx
+ testl %esi, %esi
+ jmp *%ebx
+
+ # Handle 2-byte-aligned regions
+20: addw (%esi), %ax
+ lea 2(%esi), %esi
+ adcl $0, %eax
+ jmp 10b
+
+30: subl $2, %ecx
+ ja 20b
+ je 32f
+ movzbl (%esi),%ebx # csumming 1 byte, 2-aligned
+ addl %ebx, %eax
+ adcl $0, %eax
+ jmp 80f
+32:
+ addw (%esi), %ax # csumming 2 bytes, 2-aligned
+ adcl $0, %eax
+ jmp 80f
+
+40:
+ addl -128(%esi), %eax
+ adcl -124(%esi), %eax
+ adcl -120(%esi), %eax
+ adcl -116(%esi), %eax
+ adcl -112(%esi), %eax
+ adcl -108(%esi), %eax
+ adcl -104(%esi), %eax
+ adcl -100(%esi), %eax
+ adcl -96(%esi), %eax
+ adcl -92(%esi), %eax
+ adcl -88(%esi), %eax
+ adcl -84(%esi), %eax
+ adcl -80(%esi), %eax
+ adcl -76(%esi), %eax
+ adcl -72(%esi), %eax
+ adcl -68(%esi), %eax
+ adcl -64(%esi), %eax
+ adcl -60(%esi), %eax
+ adcl -56(%esi), %eax
+ adcl -52(%esi), %eax
+ adcl -48(%esi), %eax
+ adcl -44(%esi), %eax
+ adcl -40(%esi), %eax
+ adcl -36(%esi), %eax
+ adcl -32(%esi), %eax
+ adcl -28(%esi), %eax
+ adcl -24(%esi), %eax
+ adcl -20(%esi), %eax
+ adcl -16(%esi), %eax
+ adcl -12(%esi), %eax
+ adcl -8(%esi), %eax
+ adcl -4(%esi), %eax
+45:
+ lea 128(%esi), %esi
+ adcl $0, %eax
+ dec %ecx
+ jge 40b
+ movl %edx, %ecx
+50: andl $3, %ecx
+ jz 80f
+
+ # Handle the last 1-3 bytes without jumping
+ notl %ecx # 1->2, 2->1, 3->0, higher bits are masked
+ movl $0xffffff,%ebx # by the shll and shrl instructions
+ shll $3,%ecx
+ shrl %cl,%ebx
+ andl -128(%esi),%ebx # esi is 4-aligned so should be ok
+ addl %ebx,%eax
+ adcl $0,%eax
+80:
+ popl %ebx
+ popl %esi
+ ret
+
+#endif
+ EXPORT_SYMBOL(csum_partial)
diff --git a/arch/x86/um/delay.c b/arch/x86/um/delay.c
new file mode 100644
index 000000000..a8fb7ca48
--- /dev/null
+++ b/arch/x86/um/delay.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2011 Richard Weinberger <richrd@nod.at>
+ * Mostly copied from arch/x86/lib/delay.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <asm/param.h>
+
+void __delay(unsigned long loops)
+{
+ asm volatile(
+ "test %0,%0\n"
+ "jz 3f\n"
+ "jmp 1f\n"
+
+ ".align 16\n"
+ "1: jmp 2f\n"
+
+ ".align 16\n"
+ "2: dec %0\n"
+ " jnz 2b\n"
+ "3: dec %0\n"
+
+ : /* we don't need output */
+ : "a" (loops)
+ );
+}
+EXPORT_SYMBOL(__delay);
+
+inline void __const_udelay(unsigned long xloops)
+{
+ int d0;
+
+ xloops *= 4;
+ asm("mull %%edx"
+ : "=d" (xloops), "=&a" (d0)
+ : "1" (xloops), "0"
+ (loops_per_jiffy * (HZ/4)));
+
+ __delay(++xloops);
+}
+EXPORT_SYMBOL(__const_udelay);
+
+void __udelay(unsigned long usecs)
+{
+ __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
+}
+EXPORT_SYMBOL(__udelay);
+
+void __ndelay(unsigned long nsecs)
+{
+ __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
+}
+EXPORT_SYMBOL(__ndelay);
diff --git a/arch/x86/um/elfcore.c b/arch/x86/um/elfcore.c
new file mode 100644
index 000000000..48a3eb09d
--- /dev/null
+++ b/arch/x86/um/elfcore.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/elf.h>
+#include <linux/coredump.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+#include <asm/elf.h>
+
+
+Elf32_Half elf_core_extra_phdrs(void)
+{
+ return vsyscall_ehdr ? (((struct elfhdr *)vsyscall_ehdr)->e_phnum) : 0;
+}
+
+int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
+{
+ if ( vsyscall_ehdr ) {
+ const struct elfhdr *const ehdrp =
+ (struct elfhdr *) vsyscall_ehdr;
+ const struct elf_phdr *const phdrp =
+ (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff);
+ int i;
+ Elf32_Off ofs = 0;
+
+ for (i = 0; i < ehdrp->e_phnum; ++i) {
+ struct elf_phdr phdr = phdrp[i];
+
+ if (phdr.p_type == PT_LOAD) {
+ ofs = phdr.p_offset = offset;
+ offset += phdr.p_filesz;
+ } else {
+ phdr.p_offset += ofs;
+ }
+ phdr.p_paddr = 0; /* match other core phdrs */
+ if (!dump_emit(cprm, &phdr, sizeof(phdr)))
+ return 0;
+ }
+ }
+ return 1;
+}
+
+int elf_core_write_extra_data(struct coredump_params *cprm)
+{
+ if ( vsyscall_ehdr ) {
+ const struct elfhdr *const ehdrp =
+ (struct elfhdr *) vsyscall_ehdr;
+ const struct elf_phdr *const phdrp =
+ (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff);
+ int i;
+
+ for (i = 0; i < ehdrp->e_phnum; ++i) {
+ if (phdrp[i].p_type == PT_LOAD) {
+ void *addr = (void *) phdrp[i].p_vaddr;
+ size_t filesz = phdrp[i].p_filesz;
+ if (!dump_emit(cprm, addr, filesz))
+ return 0;
+ }
+ }
+ }
+ return 1;
+}
+
+size_t elf_core_extra_data_size(void)
+{
+ if ( vsyscall_ehdr ) {
+ const struct elfhdr *const ehdrp =
+ (struct elfhdr *)vsyscall_ehdr;
+ const struct elf_phdr *const phdrp =
+ (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff);
+ int i;
+
+ for (i = 0; i < ehdrp->e_phnum; ++i)
+ if (phdrp[i].p_type == PT_LOAD)
+ return (size_t) phdrp[i].p_filesz;
+ }
+ return 0;
+}
diff --git a/arch/x86/um/fault.c b/arch/x86/um/fault.c
new file mode 100644
index 000000000..84ac7f7b0
--- /dev/null
+++ b/arch/x86/um/fault.c
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#include <sysdep/ptrace.h>
+
+/* These two are from asm-um/uaccess.h and linux/module.h, check them. */
+struct exception_table_entry
+{
+ unsigned long insn;
+ unsigned long fixup;
+};
+
+const struct exception_table_entry *search_exception_tables(unsigned long add);
+
+/* Compare this to arch/i386/mm/extable.c:fixup_exception() */
+int arch_fixup(unsigned long address, struct uml_pt_regs *regs)
+{
+ const struct exception_table_entry *fixup;
+
+ fixup = search_exception_tables(address);
+ if (fixup) {
+ UPT_IP(regs) = fixup->fixup;
+ return 1;
+ }
+ return 0;
+}
diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c
new file mode 100644
index 000000000..255a44dd4
--- /dev/null
+++ b/arch/x86/um/ldt.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+#include <asm/unistd.h>
+#include <os.h>
+#include <skas.h>
+#include <sysdep/tls.h>
+
+static inline int modify_ldt (int func, void *ptr, unsigned long bytecount)
+{
+ return syscall(__NR_modify_ldt, func, ptr, bytecount);
+}
+
+static long write_ldt_entry(struct mm_id *mm_idp, int func,
+ struct user_desc *desc, void **addr, int done)
+{
+ long res;
+ void *stub_addr;
+
+ BUILD_BUG_ON(sizeof(*desc) % sizeof(long));
+
+ res = syscall_stub_data(mm_idp, (unsigned long *)desc,
+ sizeof(*desc) / sizeof(long),
+ addr, &stub_addr);
+ if (!res) {
+ unsigned long args[] = { func,
+ (unsigned long)stub_addr,
+ sizeof(*desc),
+ 0, 0, 0 };
+ res = run_syscall_stub(mm_idp, __NR_modify_ldt, args,
+ 0, addr, done);
+ }
+
+ return res;
+}
+
+/*
+ * In skas mode, we hold our own ldt data in UML.
+ * Thus, the code implementing sys_modify_ldt_skas
+ * is very similar to (and mostly stolen from) sys_modify_ldt
+ * for arch/i386/kernel/ldt.c
+ * The routines copied and modified in part are:
+ * - read_ldt
+ * - read_default_ldt
+ * - write_ldt
+ * - sys_modify_ldt_skas
+ */
+
+static int read_ldt(void __user * ptr, unsigned long bytecount)
+{
+ int i, err = 0;
+ unsigned long size;
+ uml_ldt_t *ldt = &current->mm->context.arch.ldt;
+
+ if (!ldt->entry_count)
+ goto out;
+ if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
+ bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
+ err = bytecount;
+
+ mutex_lock(&ldt->lock);
+ if (ldt->entry_count <= LDT_DIRECT_ENTRIES) {
+ size = LDT_ENTRY_SIZE*LDT_DIRECT_ENTRIES;
+ if (size > bytecount)
+ size = bytecount;
+ if (copy_to_user(ptr, ldt->u.entries, size))
+ err = -EFAULT;
+ bytecount -= size;
+ ptr += size;
+ }
+ else {
+ for (i=0; i<ldt->entry_count/LDT_ENTRIES_PER_PAGE && bytecount;
+ i++) {
+ size = PAGE_SIZE;
+ if (size > bytecount)
+ size = bytecount;
+ if (copy_to_user(ptr, ldt->u.pages[i], size)) {
+ err = -EFAULT;
+ break;
+ }
+ bytecount -= size;
+ ptr += size;
+ }
+ }
+ mutex_unlock(&ldt->lock);
+
+ if (bytecount == 0 || err == -EFAULT)
+ goto out;
+
+ if (clear_user(ptr, bytecount))
+ err = -EFAULT;
+
+out:
+ return err;
+}
+
+static int read_default_ldt(void __user * ptr, unsigned long bytecount)
+{
+ int err;
+
+ if (bytecount > 5*LDT_ENTRY_SIZE)
+ bytecount = 5*LDT_ENTRY_SIZE;
+
+ err = bytecount;
+ /*
+ * UML doesn't support lcall7 and lcall27.
+ * So, we don't really have a default ldt, but emulate
+ * an empty ldt of common host default ldt size.
+ */
+ if (clear_user(ptr, bytecount))
+ err = -EFAULT;
+
+ return err;
+}
+
+static int write_ldt(void __user * ptr, unsigned long bytecount, int func)
+{
+ uml_ldt_t *ldt = &current->mm->context.arch.ldt;
+ struct mm_id * mm_idp = &current->mm->context.id;
+ int i, err;
+ struct user_desc ldt_info;
+ struct ldt_entry entry0, *ldt_p;
+ void *addr = NULL;
+
+ err = -EINVAL;
+ if (bytecount != sizeof(ldt_info))
+ goto out;
+ err = -EFAULT;
+ if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
+ goto out;
+
+ err = -EINVAL;
+ if (ldt_info.entry_number >= LDT_ENTRIES)
+ goto out;
+ if (ldt_info.contents == 3) {
+ if (func == 1)
+ goto out;
+ if (ldt_info.seg_not_present == 0)
+ goto out;
+ }
+
+ mutex_lock(&ldt->lock);
+
+ err = write_ldt_entry(mm_idp, func, &ldt_info, &addr, 1);
+ if (err)
+ goto out_unlock;
+
+ if (ldt_info.entry_number >= ldt->entry_count &&
+ ldt_info.entry_number >= LDT_DIRECT_ENTRIES) {
+ for (i=ldt->entry_count/LDT_ENTRIES_PER_PAGE;
+ i*LDT_ENTRIES_PER_PAGE <= ldt_info.entry_number;
+ i++) {
+ if (i == 0)
+ memcpy(&entry0, ldt->u.entries,
+ sizeof(entry0));
+ ldt->u.pages[i] = (struct ldt_entry *)
+ __get_free_page(GFP_KERNEL|__GFP_ZERO);
+ if (!ldt->u.pages[i]) {
+ err = -ENOMEM;
+ /* Undo the change in host */
+ memset(&ldt_info, 0, sizeof(ldt_info));
+ write_ldt_entry(mm_idp, 1, &ldt_info, &addr, 1);
+ goto out_unlock;
+ }
+ if (i == 0) {
+ memcpy(ldt->u.pages[0], &entry0,
+ sizeof(entry0));
+ memcpy(ldt->u.pages[0]+1, ldt->u.entries+1,
+ sizeof(entry0)*(LDT_DIRECT_ENTRIES-1));
+ }
+ ldt->entry_count = (i + 1) * LDT_ENTRIES_PER_PAGE;
+ }
+ }
+ if (ldt->entry_count <= ldt_info.entry_number)
+ ldt->entry_count = ldt_info.entry_number + 1;
+
+ if (ldt->entry_count <= LDT_DIRECT_ENTRIES)
+ ldt_p = ldt->u.entries + ldt_info.entry_number;
+ else
+ ldt_p = ldt->u.pages[ldt_info.entry_number/LDT_ENTRIES_PER_PAGE] +
+ ldt_info.entry_number%LDT_ENTRIES_PER_PAGE;
+
+ if (ldt_info.base_addr == 0 && ldt_info.limit == 0 &&
+ (func == 1 || LDT_empty(&ldt_info))) {
+ ldt_p->a = 0;
+ ldt_p->b = 0;
+ }
+ else{
+ if (func == 1)
+ ldt_info.useable = 0;
+ ldt_p->a = LDT_entry_a(&ldt_info);
+ ldt_p->b = LDT_entry_b(&ldt_info);
+ }
+ err = 0;
+
+out_unlock:
+ mutex_unlock(&ldt->lock);
+out:
+ return err;
+}
+
+static long do_modify_ldt_skas(int func, void __user *ptr,
+ unsigned long bytecount)
+{
+ int ret = -ENOSYS;
+
+ switch (func) {
+ case 0:
+ ret = read_ldt(ptr, bytecount);
+ break;
+ case 1:
+ case 0x11:
+ ret = write_ldt(ptr, bytecount, func);
+ break;
+ case 2:
+ ret = read_default_ldt(ptr, bytecount);
+ break;
+ }
+ return ret;
+}
+
+static DEFINE_SPINLOCK(host_ldt_lock);
+static short dummy_list[9] = {0, -1};
+static short * host_ldt_entries = NULL;
+
+static void ldt_get_host_info(void)
+{
+ long ret;
+ struct ldt_entry * ldt;
+ short *tmp;
+ int i, size, k, order;
+
+ spin_lock(&host_ldt_lock);
+
+ if (host_ldt_entries != NULL) {
+ spin_unlock(&host_ldt_lock);
+ return;
+ }
+ host_ldt_entries = dummy_list+1;
+
+ spin_unlock(&host_ldt_lock);
+
+ for (i = LDT_PAGES_MAX-1, order=0; i; i>>=1, order++)
+ ;
+
+ ldt = (struct ldt_entry *)
+ __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
+ if (ldt == NULL) {
+ printk(KERN_ERR "ldt_get_host_info: couldn't allocate buffer "
+ "for host ldt\n");
+ return;
+ }
+
+ ret = modify_ldt(0, ldt, (1<<order)*PAGE_SIZE);
+ if (ret < 0) {
+ printk(KERN_ERR "ldt_get_host_info: couldn't read host ldt\n");
+ goto out_free;
+ }
+ if (ret == 0) {
+ /* default_ldt is active, simply write an empty entry 0 */
+ host_ldt_entries = dummy_list;
+ goto out_free;
+ }
+
+ for (i=0, size=0; i<ret/LDT_ENTRY_SIZE; i++) {
+ if (ldt[i].a != 0 || ldt[i].b != 0)
+ size++;
+ }
+
+ if (size < ARRAY_SIZE(dummy_list))
+ host_ldt_entries = dummy_list;
+ else {
+ size = (size + 1) * sizeof(dummy_list[0]);
+ tmp = kmalloc(size, GFP_KERNEL);
+ if (tmp == NULL) {
+ printk(KERN_ERR "ldt_get_host_info: couldn't allocate "
+ "host ldt list\n");
+ goto out_free;
+ }
+ host_ldt_entries = tmp;
+ }
+
+ for (i=0, k=0; i<ret/LDT_ENTRY_SIZE; i++) {
+ if (ldt[i].a != 0 || ldt[i].b != 0)
+ host_ldt_entries[k++] = i;
+ }
+ host_ldt_entries[k] = -1;
+
+out_free:
+ free_pages((unsigned long)ldt, order);
+}
+
+long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm)
+{
+ struct user_desc desc;
+ short * num_p;
+ int i;
+ long page, err=0;
+ void *addr = NULL;
+
+
+ mutex_init(&new_mm->arch.ldt.lock);
+
+ if (!from_mm) {
+ memset(&desc, 0, sizeof(desc));
+ /*
+ * Now we try to retrieve info about the ldt, we
+ * inherited from the host. All ldt-entries found
+ * will be reset in the following loop
+ */
+ ldt_get_host_info();
+ for (num_p=host_ldt_entries; *num_p != -1; num_p++) {
+ desc.entry_number = *num_p;
+ err = write_ldt_entry(&new_mm->id, 1, &desc,
+ &addr, *(num_p + 1) == -1);
+ if (err)
+ break;
+ }
+ new_mm->arch.ldt.entry_count = 0;
+
+ goto out;
+ }
+
+ /*
+ * Our local LDT is used to supply the data for
+ * modify_ldt(READLDT), if PTRACE_LDT isn't available,
+ * i.e., we have to use the stub for modify_ldt, which
+ * can't handle the big read buffer of up to 64kB.
+ */
+ mutex_lock(&from_mm->arch.ldt.lock);
+ if (from_mm->arch.ldt.entry_count <= LDT_DIRECT_ENTRIES)
+ memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries,
+ sizeof(new_mm->arch.ldt.u.entries));
+ else {
+ i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
+ while (i-->0) {
+ page = __get_free_page(GFP_KERNEL|__GFP_ZERO);
+ if (!page) {
+ err = -ENOMEM;
+ break;
+ }
+ new_mm->arch.ldt.u.pages[i] =
+ (struct ldt_entry *) page;
+ memcpy(new_mm->arch.ldt.u.pages[i],
+ from_mm->arch.ldt.u.pages[i], PAGE_SIZE);
+ }
+ }
+ new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count;
+ mutex_unlock(&from_mm->arch.ldt.lock);
+
+ out:
+ return err;
+}
+
+
+void free_ldt(struct mm_context *mm)
+{
+ int i;
+
+ if (mm->arch.ldt.entry_count > LDT_DIRECT_ENTRIES) {
+ i = mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
+ while (i-- > 0)
+ free_page((long) mm->arch.ldt.u.pages[i]);
+ }
+ mm->arch.ldt.entry_count = 0;
+}
+
+SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
+ unsigned long , bytecount)
+{
+ /* See non-um modify_ldt() for why we do this cast */
+ return (unsigned int)do_modify_ldt_skas(func, ptr, bytecount);
+}
diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c
new file mode 100644
index 000000000..56c44d865
--- /dev/null
+++ b/arch/x86/um/mem_32.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2011 Richard Weinberger <richrd@nod.at>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/mm.h>
+#include <asm/elf.h>
+
+static struct vm_area_struct gate_vma;
+
+static int __init gate_vma_init(void)
+{
+ if (!FIXADDR_USER_START)
+ return 0;
+
+ vma_init(&gate_vma, NULL);
+ gate_vma.vm_start = FIXADDR_USER_START;
+ gate_vma.vm_end = FIXADDR_USER_END;
+ gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
+ gate_vma.vm_page_prot = __P101;
+
+ return 0;
+}
+__initcall(gate_vma_init);
+
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
+{
+ return FIXADDR_USER_START ? &gate_vma : NULL;
+}
+
+int in_gate_area_no_mm(unsigned long addr)
+{
+ if (!FIXADDR_USER_START)
+ return 0;
+
+ if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
+ return 1;
+
+ return 0;
+}
+
+int in_gate_area(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma = get_gate_vma(mm);
+
+ if (!vma)
+ return 0;
+
+ return (addr >= vma->vm_start) && (addr < vma->vm_end);
+}
diff --git a/arch/x86/um/mem_64.c b/arch/x86/um/mem_64.c
new file mode 100644
index 000000000..c027e93d1
--- /dev/null
+++ b/arch/x86/um/mem_64.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <asm/elf.h>
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+ if (vma->vm_mm && vma->vm_start == um_vdso_addr)
+ return "[vdso]";
+
+ return NULL;
+}
diff --git a/arch/x86/um/os-Linux/Makefile b/arch/x86/um/os-Linux/Makefile
new file mode 100644
index 000000000..253bfb8cb
--- /dev/null
+++ b/arch/x86/um/os-Linux/Makefile
@@ -0,0 +1,13 @@
+#
+# Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+# Licensed under the GPL
+#
+
+obj-y = registers.o task_size.o mcontext.o
+
+obj-$(CONFIG_X86_32) += tls.o
+obj-$(CONFIG_64BIT) += prctl.o
+
+USER_OBJS := $(obj-y)
+
+include arch/um/scripts/Makefile.rules
diff --git a/arch/x86/um/os-Linux/mcontext.c b/arch/x86/um/os-Linux/mcontext.c
new file mode 100644
index 000000000..49c3744ca
--- /dev/null
+++ b/arch/x86/um/os-Linux/mcontext.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <sys/ucontext.h>
+#define __FRAME_OFFSETS
+#include <asm/ptrace.h>
+#include <sysdep/ptrace.h>
+
+void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc)
+{
+#ifdef __i386__
+#define COPY2(X,Y) regs->gp[X] = mc->gregs[REG_##Y]
+#define COPY(X) regs->gp[X] = mc->gregs[REG_##X]
+#define COPY_SEG(X) regs->gp[X] = mc->gregs[REG_##X] & 0xffff;
+#define COPY_SEG_CPL3(X) regs->gp[X] = (mc->gregs[REG_##X] & 0xffff) | 3;
+ COPY_SEG(GS); COPY_SEG(FS); COPY_SEG(ES); COPY_SEG(DS);
+ COPY(EDI); COPY(ESI); COPY(EBP);
+ COPY2(UESP, ESP); /* sic */
+ COPY(EBX); COPY(EDX); COPY(ECX); COPY(EAX);
+ COPY(EIP); COPY_SEG_CPL3(CS); COPY(EFL); COPY_SEG_CPL3(SS);
+#else
+#define COPY2(X,Y) regs->gp[X/sizeof(unsigned long)] = mc->gregs[REG_##Y]
+#define COPY(X) regs->gp[X/sizeof(unsigned long)] = mc->gregs[REG_##X]
+ COPY(R8); COPY(R9); COPY(R10); COPY(R11);
+ COPY(R12); COPY(R13); COPY(R14); COPY(R15);
+ COPY(RDI); COPY(RSI); COPY(RBP); COPY(RBX);
+ COPY(RDX); COPY(RAX); COPY(RCX); COPY(RSP);
+ COPY(RIP);
+ COPY2(EFLAGS, EFL);
+ COPY2(CS, CSGSFS);
+ regs->gp[CS / sizeof(unsigned long)] &= 0xffff;
+ regs->gp[CS / sizeof(unsigned long)] |= 3;
+#endif
+}
diff --git a/arch/x86/um/os-Linux/prctl.c b/arch/x86/um/os-Linux/prctl.c
new file mode 100644
index 000000000..8431e87ac
--- /dev/null
+++ b/arch/x86/um/os-Linux/prctl.c
@@ -0,0 +1,12 @@
+/*
+ * Copyright (C) 2007 Jeff Dike (jdike@{addtoit.com,linux.intel.com})
+ * Licensed under the GPL
+ */
+
+#include <sys/ptrace.h>
+#include <asm/ptrace.h>
+
+int os_arch_prctl(int pid, int option, unsigned long *arg2)
+{
+ return ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long) arg2, option);
+}
diff --git a/arch/x86/um/os-Linux/registers.c b/arch/x86/um/os-Linux/registers.c
new file mode 100644
index 000000000..3c423dfcd
--- /dev/null
+++ b/arch/x86/um/os-Linux/registers.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright (C) 2004 PathScale, Inc
+ * Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/ptrace.h>
+#ifdef __i386__
+#include <sys/user.h>
+#endif
+#include <longjmp.h>
+#include <sysdep/ptrace_user.h>
+#include <sys/uio.h>
+#include <asm/sigcontext.h>
+#include <linux/elf.h>
+
+int have_xstate_support;
+
+int save_i387_registers(int pid, unsigned long *fp_regs)
+{
+ if (ptrace(PTRACE_GETFPREGS, pid, 0, fp_regs) < 0)
+ return -errno;
+ return 0;
+}
+
+int save_fp_registers(int pid, unsigned long *fp_regs)
+{
+#ifdef PTRACE_GETREGSET
+ struct iovec iov;
+
+ if (have_xstate_support) {
+ iov.iov_base = fp_regs;
+ iov.iov_len = FP_SIZE * sizeof(unsigned long);
+ if (ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) < 0)
+ return -errno;
+ return 0;
+ } else
+#endif
+ return save_i387_registers(pid, fp_regs);
+}
+
+int restore_i387_registers(int pid, unsigned long *fp_regs)
+{
+ if (ptrace(PTRACE_SETFPREGS, pid, 0, fp_regs) < 0)
+ return -errno;
+ return 0;
+}
+
+int restore_fp_registers(int pid, unsigned long *fp_regs)
+{
+#ifdef PTRACE_SETREGSET
+ struct iovec iov;
+ if (have_xstate_support) {
+ iov.iov_base = fp_regs;
+ iov.iov_len = FP_SIZE * sizeof(unsigned long);
+ if (ptrace(PTRACE_SETREGSET, pid, NT_X86_XSTATE, &iov) < 0)
+ return -errno;
+ return 0;
+ } else
+#endif
+ return restore_i387_registers(pid, fp_regs);
+}
+
+#ifdef __i386__
+int have_fpx_regs = 1;
+int save_fpx_registers(int pid, unsigned long *fp_regs)
+{
+ if (ptrace(PTRACE_GETFPXREGS, pid, 0, fp_regs) < 0)
+ return -errno;
+ return 0;
+}
+
+int restore_fpx_registers(int pid, unsigned long *fp_regs)
+{
+ if (ptrace(PTRACE_SETFPXREGS, pid, 0, fp_regs) < 0)
+ return -errno;
+ return 0;
+}
+
+int get_fp_registers(int pid, unsigned long *regs)
+{
+ if (have_fpx_regs)
+ return save_fpx_registers(pid, regs);
+ else
+ return save_fp_registers(pid, regs);
+}
+
+int put_fp_registers(int pid, unsigned long *regs)
+{
+ if (have_fpx_regs)
+ return restore_fpx_registers(pid, regs);
+ else
+ return restore_fp_registers(pid, regs);
+}
+
+void arch_init_registers(int pid)
+{
+ struct user_fpxregs_struct fpx_regs;
+ int err;
+
+ err = ptrace(PTRACE_GETFPXREGS, pid, 0, &fpx_regs);
+ if (!err)
+ return;
+
+ if (errno != EIO)
+ panic("check_ptrace : PTRACE_GETFPXREGS failed, errno = %d",
+ errno);
+
+ have_fpx_regs = 0;
+}
+#else
+
+int get_fp_registers(int pid, unsigned long *regs)
+{
+ return save_fp_registers(pid, regs);
+}
+
+int put_fp_registers(int pid, unsigned long *regs)
+{
+ return restore_fp_registers(pid, regs);
+}
+
+void arch_init_registers(int pid)
+{
+#ifdef PTRACE_GETREGSET
+ void * fp_regs;
+ struct iovec iov;
+
+ fp_regs = malloc(FP_SIZE * sizeof(unsigned long));
+ if(fp_regs == NULL)
+ return;
+
+ iov.iov_base = fp_regs;
+ iov.iov_len = FP_SIZE * sizeof(unsigned long);
+ if (ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) == 0)
+ have_xstate_support = 1;
+
+ free(fp_regs);
+#endif
+}
+#endif
+
+unsigned long get_thread_reg(int reg, jmp_buf *buf)
+{
+ switch (reg) {
+#ifdef __i386__
+ case HOST_IP:
+ return buf[0]->__eip;
+ case HOST_SP:
+ return buf[0]->__esp;
+ case HOST_BP:
+ return buf[0]->__ebp;
+#else
+ case HOST_IP:
+ return buf[0]->__rip;
+ case HOST_SP:
+ return buf[0]->__rsp;
+ case HOST_BP:
+ return buf[0]->__rbp;
+#endif
+ default:
+ printk(UM_KERN_ERR "get_thread_regs - unknown register %d\n",
+ reg);
+ return 0;
+ }
+}
diff --git a/arch/x86/um/os-Linux/task_size.c b/arch/x86/um/os-Linux/task_size.c
new file mode 100644
index 000000000..e62174638
--- /dev/null
+++ b/arch/x86/um/os-Linux/task_size.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/mman.h>
+#include <longjmp.h>
+
+#ifdef __i386__
+
+static jmp_buf buf;
+
+static void segfault(int sig)
+{
+ longjmp(buf, 1);
+}
+
+static int page_ok(unsigned long page)
+{
+ unsigned long *address = (unsigned long *) (page << UM_KERN_PAGE_SHIFT);
+ unsigned long n = ~0UL;
+ void *mapped = NULL;
+ int ok = 0;
+
+ /*
+ * First see if the page is readable. If it is, it may still
+ * be a VDSO, so we go on to see if it's writable. If not
+ * then try mapping memory there. If that fails, then we're
+ * still in the kernel area. As a sanity check, we'll fail if
+ * the mmap succeeds, but gives us an address different from
+ * what we wanted.
+ */
+ if (setjmp(buf) == 0)
+ n = *address;
+ else {
+ mapped = mmap(address, UM_KERN_PAGE_SIZE,
+ PROT_READ | PROT_WRITE,
+ MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (mapped == MAP_FAILED)
+ return 0;
+ if (mapped != address)
+ goto out;
+ }
+
+ /*
+ * Now, is it writeable? If so, then we're in user address
+ * space. If not, then try mprotecting it and try the write
+ * again.
+ */
+ if (setjmp(buf) == 0) {
+ *address = n;
+ ok = 1;
+ goto out;
+ } else if (mprotect(address, UM_KERN_PAGE_SIZE,
+ PROT_READ | PROT_WRITE) != 0)
+ goto out;
+
+ if (setjmp(buf) == 0) {
+ *address = n;
+ ok = 1;
+ }
+
+ out:
+ if (mapped != NULL)
+ munmap(mapped, UM_KERN_PAGE_SIZE);
+ return ok;
+}
+
+unsigned long os_get_top_address(void)
+{
+ struct sigaction sa, old;
+ unsigned long bottom = 0;
+ /*
+ * A 32-bit UML on a 64-bit host gets confused about the VDSO at
+ * 0xffffe000. It is mapped, is readable, can be reprotected writeable
+ * and written. However, exec discovers later that it can't be
+ * unmapped. So, just set the highest address to be checked to just
+ * below it. This might waste some address space on 4G/4G 32-bit
+ * hosts, but shouldn't hurt otherwise.
+ */
+ unsigned long top = 0xffffd000 >> UM_KERN_PAGE_SHIFT;
+ unsigned long test, original;
+
+ printf("Locating the bottom of the address space ... ");
+ fflush(stdout);
+
+ /*
+ * We're going to be longjmping out of the signal handler, so
+ * SA_DEFER needs to be set.
+ */
+ sa.sa_handler = segfault;
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = SA_NODEFER;
+ if (sigaction(SIGSEGV, &sa, &old)) {
+ perror("os_get_top_address");
+ exit(1);
+ }
+
+ /* Manually scan the address space, bottom-up, until we find
+ * the first valid page (or run out of them).
+ */
+ for (bottom = 0; bottom < top; bottom++) {
+ if (page_ok(bottom))
+ break;
+ }
+
+ /* If we've got this far, we ran out of pages. */
+ if (bottom == top) {
+ fprintf(stderr, "Unable to determine bottom of address "
+ "space.\n");
+ exit(1);
+ }
+
+ printf("0x%lx\n", bottom << UM_KERN_PAGE_SHIFT);
+ printf("Locating the top of the address space ... ");
+ fflush(stdout);
+
+ original = bottom;
+
+ /* This could happen with a 4G/4G split */
+ if (page_ok(top))
+ goto out;
+
+ do {
+ test = bottom + (top - bottom) / 2;
+ if (page_ok(test))
+ bottom = test;
+ else
+ top = test;
+ } while (top - bottom > 1);
+
+out:
+ /* Restore the old SIGSEGV handling */
+ if (sigaction(SIGSEGV, &old, NULL)) {
+ perror("os_get_top_address");
+ exit(1);
+ }
+ top <<= UM_KERN_PAGE_SHIFT;
+ printf("0x%lx\n", top);
+
+ return top;
+}
+
+#else
+
+unsigned long os_get_top_address(void)
+{
+ /* The old value of CONFIG_TOP_ADDR */
+ return 0x7fc0000000;
+}
+
+#endif
diff --git a/arch/x86/um/os-Linux/tls.c b/arch/x86/um/os-Linux/tls.c
new file mode 100644
index 000000000..3e1b1bf6a
--- /dev/null
+++ b/arch/x86/um/os-Linux/tls.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <linux/unistd.h>
+
+#include <sys/ptrace.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include <sysdep/tls.h>
+
+#ifndef PTRACE_GET_THREAD_AREA
+#define PTRACE_GET_THREAD_AREA 25
+#endif
+
+#ifndef PTRACE_SET_THREAD_AREA
+#define PTRACE_SET_THREAD_AREA 26
+#endif
+
+/* Checks whether host supports TLS, and sets *tls_min according to the value
+ * valid on the host.
+ * i386 host have it == 6; x86_64 host have it == 12, for i386 emulation. */
+void check_host_supports_tls(int *supports_tls, int *tls_min)
+{
+ /* Values for x86 and x86_64.*/
+ int val[] = {GDT_ENTRY_TLS_MIN_I386, GDT_ENTRY_TLS_MIN_X86_64};
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(val); i++) {
+ user_desc_t info;
+ info.entry_number = val[i];
+
+ if (syscall(__NR_get_thread_area, &info) == 0) {
+ *tls_min = val[i];
+ *supports_tls = 1;
+ return;
+ } else {
+ if (errno == EINVAL)
+ continue;
+ else if (errno == ENOSYS)
+ *supports_tls = 0;
+ return;
+ }
+ }
+
+ *supports_tls = 0;
+}
+
+int os_set_thread_area(user_desc_t *info, int pid)
+{
+ int ret;
+
+ ret = ptrace(PTRACE_SET_THREAD_AREA, pid, info->entry_number,
+ (unsigned long) info);
+ if (ret < 0)
+ ret = -errno;
+ return ret;
+}
+
+int os_get_thread_area(user_desc_t *info, int pid)
+{
+ int ret;
+
+ ret = ptrace(PTRACE_GET_THREAD_AREA, pid, info->entry_number,
+ (unsigned long) info);
+ if (ret < 0)
+ ret = -errno;
+ return ret;
+}
diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c
new file mode 100644
index 000000000..2497bac56
--- /dev/null
+++ b/arch/x86/um/ptrace_32.c
@@ -0,0 +1,277 @@
+/*
+ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <asm/ptrace-abi.h>
+#include <skas.h>
+
+extern int arch_switch_tls(struct task_struct *to);
+
+void arch_switch_to(struct task_struct *to)
+{
+ int err = arch_switch_tls(to);
+ if (!err)
+ return;
+
+ if (err != -EINVAL)
+ printk(KERN_WARNING "arch_switch_tls failed, errno %d, "
+ "not EINVAL\n", -err);
+ else
+ printk(KERN_WARNING "arch_switch_tls failed, errno = EINVAL\n");
+}
+
+int is_syscall(unsigned long addr)
+{
+ unsigned short instr;
+ int n;
+
+ n = copy_from_user(&instr, (void __user *) addr, sizeof(instr));
+ if (n) {
+ /* access_process_vm() grants access to vsyscall and stub,
+ * while copy_from_user doesn't. Maybe access_process_vm is
+ * slow, but that doesn't matter, since it will be called only
+ * in case of singlestepping, if copy_from_user failed.
+ */
+ n = access_process_vm(current, addr, &instr, sizeof(instr),
+ FOLL_FORCE);
+ if (n != sizeof(instr)) {
+ printk(KERN_ERR "is_syscall : failed to read "
+ "instruction from 0x%lx\n", addr);
+ return 1;
+ }
+ }
+ /* int 0x80 or sysenter */
+ return (instr == 0x80cd) || (instr == 0x340f);
+}
+
+/* determines which flags the user has access to. */
+/* 1 = access 0 = no access */
+#define FLAG_MASK 0x00044dd5
+
+static const int reg_offsets[] = {
+ [EBX] = HOST_BX,
+ [ECX] = HOST_CX,
+ [EDX] = HOST_DX,
+ [ESI] = HOST_SI,
+ [EDI] = HOST_DI,
+ [EBP] = HOST_BP,
+ [EAX] = HOST_AX,
+ [DS] = HOST_DS,
+ [ES] = HOST_ES,
+ [FS] = HOST_FS,
+ [GS] = HOST_GS,
+ [EIP] = HOST_IP,
+ [CS] = HOST_CS,
+ [EFL] = HOST_EFLAGS,
+ [UESP] = HOST_SP,
+ [SS] = HOST_SS,
+ [ORIG_EAX] = HOST_ORIG_AX,
+};
+
+int putreg(struct task_struct *child, int regno, unsigned long value)
+{
+ regno >>= 2;
+ switch (regno) {
+ case EBX:
+ case ECX:
+ case EDX:
+ case ESI:
+ case EDI:
+ case EBP:
+ case EAX:
+ case EIP:
+ case UESP:
+ break;
+ case ORIG_EAX:
+ /* Update the syscall number. */
+ UPT_SYSCALL_NR(&child->thread.regs.regs) = value;
+ break;
+ case FS:
+ if (value && (value & 3) != 3)
+ return -EIO;
+ break;
+ case GS:
+ if (value && (value & 3) != 3)
+ return -EIO;
+ break;
+ case DS:
+ case ES:
+ if (value && (value & 3) != 3)
+ return -EIO;
+ value &= 0xffff;
+ break;
+ case SS:
+ case CS:
+ if ((value & 3) != 3)
+ return -EIO;
+ value &= 0xffff;
+ break;
+ case EFL:
+ value &= FLAG_MASK;
+ child->thread.regs.regs.gp[HOST_EFLAGS] |= value;
+ return 0;
+ default :
+ panic("Bad register in putreg() : %d\n", regno);
+ }
+ child->thread.regs.regs.gp[reg_offsets[regno]] = value;
+ return 0;
+}
+
+int poke_user(struct task_struct *child, long addr, long data)
+{
+ if ((addr & 3) || addr < 0)
+ return -EIO;
+
+ if (addr < MAX_REG_OFFSET)
+ return putreg(child, addr, data);
+ else if ((addr >= offsetof(struct user, u_debugreg[0])) &&
+ (addr <= offsetof(struct user, u_debugreg[7]))) {
+ addr -= offsetof(struct user, u_debugreg[0]);
+ addr = addr >> 2;
+ if ((addr == 4) || (addr == 5))
+ return -EIO;
+ child->thread.arch.debugregs[addr] = data;
+ return 0;
+ }
+ return -EIO;
+}
+
+unsigned long getreg(struct task_struct *child, int regno)
+{
+ unsigned long mask = ~0UL;
+
+ regno >>= 2;
+ switch (regno) {
+ case FS:
+ case GS:
+ case DS:
+ case ES:
+ case SS:
+ case CS:
+ mask = 0xffff;
+ break;
+ case EIP:
+ case UESP:
+ case EAX:
+ case EBX:
+ case ECX:
+ case EDX:
+ case ESI:
+ case EDI:
+ case EBP:
+ case EFL:
+ case ORIG_EAX:
+ break;
+ default:
+ panic("Bad register in getreg() : %d\n", regno);
+ }
+ return mask & child->thread.regs.regs.gp[reg_offsets[regno]];
+}
+
+/* read the word at location addr in the USER area. */
+int peek_user(struct task_struct *child, long addr, long data)
+{
+ unsigned long tmp;
+
+ if ((addr & 3) || addr < 0)
+ return -EIO;
+
+ tmp = 0; /* Default return condition */
+ if (addr < MAX_REG_OFFSET) {
+ tmp = getreg(child, addr);
+ }
+ else if ((addr >= offsetof(struct user, u_debugreg[0])) &&
+ (addr <= offsetof(struct user, u_debugreg[7]))) {
+ addr -= offsetof(struct user, u_debugreg[0]);
+ addr = addr >> 2;
+ tmp = child->thread.arch.debugregs[addr];
+ }
+ return put_user(tmp, (unsigned long __user *) data);
+}
+
+static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child)
+{
+ int err, n, cpu = task_cpu(child);
+ struct user_i387_struct fpregs;
+
+ err = save_i387_registers(userspace_pid[cpu],
+ (unsigned long *) &fpregs);
+ if (err)
+ return err;
+
+ n = copy_to_user(buf, &fpregs, sizeof(fpregs));
+ if(n > 0)
+ return -EFAULT;
+
+ return n;
+}
+
+static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *child)
+{
+ int n, cpu = task_cpu(child);
+ struct user_i387_struct fpregs;
+
+ n = copy_from_user(&fpregs, buf, sizeof(fpregs));
+ if (n > 0)
+ return -EFAULT;
+
+ return restore_i387_registers(userspace_pid[cpu],
+ (unsigned long *) &fpregs);
+}
+
+static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child)
+{
+ int err, n, cpu = task_cpu(child);
+ struct user_fxsr_struct fpregs;
+
+ err = save_fpx_registers(userspace_pid[cpu], (unsigned long *) &fpregs);
+ if (err)
+ return err;
+
+ n = copy_to_user(buf, &fpregs, sizeof(fpregs));
+ if(n > 0)
+ return -EFAULT;
+
+ return n;
+}
+
+static int set_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child)
+{
+ int n, cpu = task_cpu(child);
+ struct user_fxsr_struct fpregs;
+
+ n = copy_from_user(&fpregs, buf, sizeof(fpregs));
+ if (n > 0)
+ return -EFAULT;
+
+ return restore_fpx_registers(userspace_pid[cpu],
+ (unsigned long *) &fpregs);
+}
+
+long subarch_ptrace(struct task_struct *child, long request,
+ unsigned long addr, unsigned long data)
+{
+ int ret = -EIO;
+ void __user *datap = (void __user *) data;
+ switch (request) {
+ case PTRACE_GETFPREGS: /* Get the child FPU state. */
+ ret = get_fpregs(datap, child);
+ break;
+ case PTRACE_SETFPREGS: /* Set the child FPU state. */
+ ret = set_fpregs(datap, child);
+ break;
+ case PTRACE_GETFPXREGS: /* Get the child FPU state. */
+ ret = get_fpxregs(datap, child);
+ break;
+ case PTRACE_SETFPXREGS: /* Set the child FPU state. */
+ ret = set_fpxregs(datap, child);
+ break;
+ default:
+ ret = -EIO;
+ }
+ return ret;
+}
diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c
new file mode 100644
index 000000000..09a085bde
--- /dev/null
+++ b/arch/x86/um/ptrace_64.c
@@ -0,0 +1,277 @@
+/*
+ * Copyright 2003 PathScale, Inc.
+ * Copyright (C) 2003 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ *
+ * Licensed under the GPL
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#define __FRAME_OFFSETS
+#include <asm/ptrace.h>
+#include <linux/uaccess.h>
+#include <asm/ptrace-abi.h>
+
+/*
+ * determines which flags the user has access to.
+ * 1 = access 0 = no access
+ */
+#define FLAG_MASK 0x44dd5UL
+
+static const int reg_offsets[] =
+{
+ [R8 >> 3] = HOST_R8,
+ [R9 >> 3] = HOST_R9,
+ [R10 >> 3] = HOST_R10,
+ [R11 >> 3] = HOST_R11,
+ [R12 >> 3] = HOST_R12,
+ [R13 >> 3] = HOST_R13,
+ [R14 >> 3] = HOST_R14,
+ [R15 >> 3] = HOST_R15,
+ [RIP >> 3] = HOST_IP,
+ [RSP >> 3] = HOST_SP,
+ [RAX >> 3] = HOST_AX,
+ [RBX >> 3] = HOST_BX,
+ [RCX >> 3] = HOST_CX,
+ [RDX >> 3] = HOST_DX,
+ [RSI >> 3] = HOST_SI,
+ [RDI >> 3] = HOST_DI,
+ [RBP >> 3] = HOST_BP,
+ [CS >> 3] = HOST_CS,
+ [SS >> 3] = HOST_SS,
+ [FS_BASE >> 3] = HOST_FS_BASE,
+ [GS_BASE >> 3] = HOST_GS_BASE,
+ [DS >> 3] = HOST_DS,
+ [ES >> 3] = HOST_ES,
+ [FS >> 3] = HOST_FS,
+ [GS >> 3] = HOST_GS,
+ [EFLAGS >> 3] = HOST_EFLAGS,
+ [ORIG_RAX >> 3] = HOST_ORIG_AX,
+};
+
+int putreg(struct task_struct *child, int regno, unsigned long value)
+{
+#ifdef TIF_IA32
+ /*
+ * Some code in the 64bit emulation may not be 64bit clean.
+ * Don't take any chances.
+ */
+ if (test_tsk_thread_flag(child, TIF_IA32))
+ value &= 0xffffffff;
+#endif
+ switch (regno) {
+ case R8:
+ case R9:
+ case R10:
+ case R11:
+ case R12:
+ case R13:
+ case R14:
+ case R15:
+ case RIP:
+ case RSP:
+ case RAX:
+ case RBX:
+ case RCX:
+ case RDX:
+ case RSI:
+ case RDI:
+ case RBP:
+ break;
+
+ case ORIG_RAX:
+ /* Update the syscall number. */
+ UPT_SYSCALL_NR(&child->thread.regs.regs) = value;
+ break;
+
+ case FS:
+ case GS:
+ case DS:
+ case ES:
+ case SS:
+ case CS:
+ if (value && (value & 3) != 3)
+ return -EIO;
+ value &= 0xffff;
+ break;
+
+ case FS_BASE:
+ case GS_BASE:
+ if (!((value >> 48) == 0 || (value >> 48) == 0xffff))
+ return -EIO;
+ break;
+
+ case EFLAGS:
+ value &= FLAG_MASK;
+ child->thread.regs.regs.gp[HOST_EFLAGS] |= value;
+ return 0;
+
+ default:
+ panic("Bad register in putreg(): %d\n", regno);
+ }
+
+ child->thread.regs.regs.gp[reg_offsets[regno >> 3]] = value;
+ return 0;
+}
+
+int poke_user(struct task_struct *child, long addr, long data)
+{
+ if ((addr & 3) || addr < 0)
+ return -EIO;
+
+ if (addr < MAX_REG_OFFSET)
+ return putreg(child, addr, data);
+ else if ((addr >= offsetof(struct user, u_debugreg[0])) &&
+ (addr <= offsetof(struct user, u_debugreg[7]))) {
+ addr -= offsetof(struct user, u_debugreg[0]);
+ addr = addr >> 3;
+ if ((addr == 4) || (addr == 5))
+ return -EIO;
+ child->thread.arch.debugregs[addr] = data;
+ return 0;
+ }
+ return -EIO;
+}
+
+unsigned long getreg(struct task_struct *child, int regno)
+{
+ unsigned long mask = ~0UL;
+#ifdef TIF_IA32
+ if (test_tsk_thread_flag(child, TIF_IA32))
+ mask = 0xffffffff;
+#endif
+ switch (regno) {
+ case R8:
+ case R9:
+ case R10:
+ case R11:
+ case R12:
+ case R13:
+ case R14:
+ case R15:
+ case RIP:
+ case RSP:
+ case RAX:
+ case RBX:
+ case RCX:
+ case RDX:
+ case RSI:
+ case RDI:
+ case RBP:
+ case ORIG_RAX:
+ case EFLAGS:
+ case FS_BASE:
+ case GS_BASE:
+ break;
+ case FS:
+ case GS:
+ case DS:
+ case ES:
+ case SS:
+ case CS:
+ mask = 0xffff;
+ break;
+ default:
+ panic("Bad register in getreg: %d\n", regno);
+ }
+ return mask & child->thread.regs.regs.gp[reg_offsets[regno >> 3]];
+}
+
+int peek_user(struct task_struct *child, long addr, long data)
+{
+ /* read the word at location addr in the USER area. */
+ unsigned long tmp;
+
+ if ((addr & 3) || addr < 0)
+ return -EIO;
+
+ tmp = 0; /* Default return condition */
+ if (addr < MAX_REG_OFFSET)
+ tmp = getreg(child, addr);
+ else if ((addr >= offsetof(struct user, u_debugreg[0])) &&
+ (addr <= offsetof(struct user, u_debugreg[7]))) {
+ addr -= offsetof(struct user, u_debugreg[0]);
+ addr = addr >> 2;
+ tmp = child->thread.arch.debugregs[addr];
+ }
+ return put_user(tmp, (unsigned long *) data);
+}
+
+/* XXX Mostly copied from sys-i386 */
+int is_syscall(unsigned long addr)
+{
+ unsigned short instr;
+ int n;
+
+ n = copy_from_user(&instr, (void __user *) addr, sizeof(instr));
+ if (n) {
+ /*
+ * access_process_vm() grants access to vsyscall and stub,
+ * while copy_from_user doesn't. Maybe access_process_vm is
+ * slow, but that doesn't matter, since it will be called only
+ * in case of singlestepping, if copy_from_user failed.
+ */
+ n = access_process_vm(current, addr, &instr, sizeof(instr),
+ FOLL_FORCE);
+ if (n != sizeof(instr)) {
+ printk("is_syscall : failed to read instruction from "
+ "0x%lx\n", addr);
+ return 1;
+ }
+ }
+ /* sysenter */
+ return instr == 0x050f;
+}
+
+static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child)
+{
+ int err, n, cpu = ((struct thread_info *) child->stack)->cpu;
+ struct user_i387_struct fpregs;
+
+ err = save_i387_registers(userspace_pid[cpu],
+ (unsigned long *) &fpregs);
+ if (err)
+ return err;
+
+ n = copy_to_user(buf, &fpregs, sizeof(fpregs));
+ if (n > 0)
+ return -EFAULT;
+
+ return n;
+}
+
+static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *child)
+{
+ int n, cpu = ((struct thread_info *) child->stack)->cpu;
+ struct user_i387_struct fpregs;
+
+ n = copy_from_user(&fpregs, buf, sizeof(fpregs));
+ if (n > 0)
+ return -EFAULT;
+
+ return restore_i387_registers(userspace_pid[cpu],
+ (unsigned long *) &fpregs);
+}
+
+long subarch_ptrace(struct task_struct *child, long request,
+ unsigned long addr, unsigned long data)
+{
+ int ret = -EIO;
+ void __user *datap = (void __user *) data;
+
+ switch (request) {
+ case PTRACE_GETFPREGS: /* Get the child FPU state. */
+ ret = get_fpregs(datap, child);
+ break;
+ case PTRACE_SETFPREGS: /* Set the child FPU state. */
+ ret = set_fpregs(datap, child);
+ break;
+ case PTRACE_ARCH_PRCTL:
+ /* XXX Calls ptrace on the host - needs some SMP thinking */
+ ret = arch_prctl(child, data, (void __user *) addr);
+ break;
+ }
+
+ return ret;
+}
diff --git a/arch/x86/um/ptrace_user.c b/arch/x86/um/ptrace_user.c
new file mode 100644
index 000000000..617885b18
--- /dev/null
+++ b/arch/x86/um/ptrace_user.c
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#include <errno.h>
+#include <ptrace_user.h>
+
+int ptrace_getregs(long pid, unsigned long *regs_out)
+{
+ if (ptrace(PTRACE_GETREGS, pid, 0, regs_out) < 0)
+ return -errno;
+ return 0;
+}
+
+int ptrace_setregs(long pid, unsigned long *regs)
+{
+ if (ptrace(PTRACE_SETREGS, pid, 0, regs) < 0)
+ return -errno;
+ return 0;
+}
diff --git a/arch/x86/um/setjmp_32.S b/arch/x86/um/setjmp_32.S
new file mode 100644
index 000000000..62eaf8c80
--- /dev/null
+++ b/arch/x86/um/setjmp_32.S
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#
+# arch/i386/setjmp.S
+#
+# setjmp/longjmp for the i386 architecture
+#
+
+#
+# The jmp_buf is assumed to contain the following, in order:
+# %ebx
+# %esp
+# %ebp
+# %esi
+# %edi
+# <return address>
+#
+
+ .text
+ .align 4
+ .globl kernel_setjmp
+ .type kernel_setjmp, @function
+kernel_setjmp:
+#ifdef _REGPARM
+ movl %eax,%edx
+#else
+ movl 4(%esp),%edx
+#endif
+ popl %ecx # Return address, and adjust the stack
+ xorl %eax,%eax # Return value
+ movl %ebx,(%edx)
+ movl %esp,4(%edx) # Post-return %esp!
+ pushl %ecx # Make the call/return stack happy
+ movl %ebp,8(%edx)
+ movl %esi,12(%edx)
+ movl %edi,16(%edx)
+ movl %ecx,20(%edx) # Return address
+ ret
+
+ .size kernel_setjmp,.-kernel_setjmp
+
+ .text
+ .align 4
+ .globl kernel_longjmp
+ .type kernel_longjmp, @function
+kernel_longjmp:
+#ifdef _REGPARM
+ xchgl %eax,%edx
+#else
+ movl 4(%esp),%edx # jmp_ptr address
+ movl 8(%esp),%eax # Return value
+#endif
+ movl (%edx),%ebx
+ movl 4(%edx),%esp
+ movl 8(%edx),%ebp
+ movl 12(%edx),%esi
+ movl 16(%edx),%edi
+ jmp *20(%edx)
+
+ .size kernel_longjmp,.-kernel_longjmp
diff --git a/arch/x86/um/setjmp_64.S b/arch/x86/um/setjmp_64.S
new file mode 100644
index 000000000..1b5d40d4f
--- /dev/null
+++ b/arch/x86/um/setjmp_64.S
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#
+# arch/x86_64/setjmp.S
+#
+# setjmp/longjmp for the x86-64 architecture
+#
+
+#
+# The jmp_buf is assumed to contain the following, in order:
+# %rbx
+# %rsp (post-return)
+# %rbp
+# %r12
+# %r13
+# %r14
+# %r15
+# <return address>
+#
+
+ .text
+ .align 4
+ .globl kernel_setjmp
+ .type kernel_setjmp, @function
+kernel_setjmp:
+ pop %rsi # Return address, and adjust the stack
+ xorl %eax,%eax # Return value
+ movq %rbx,(%rdi)
+ movq %rsp,8(%rdi) # Post-return %rsp!
+ push %rsi # Make the call/return stack happy
+ movq %rbp,16(%rdi)
+ movq %r12,24(%rdi)
+ movq %r13,32(%rdi)
+ movq %r14,40(%rdi)
+ movq %r15,48(%rdi)
+ movq %rsi,56(%rdi) # Return address
+ ret
+
+ .size kernel_setjmp,.-kernel_setjmp
+
+ .text
+ .align 4
+ .globl kernel_longjmp
+ .type kernel_longjmp, @function
+kernel_longjmp:
+ movl %esi,%eax # Return value (int)
+ movq (%rdi),%rbx
+ movq 8(%rdi),%rsp
+ movq 16(%rdi),%rbp
+ movq 24(%rdi),%r12
+ movq 32(%rdi),%r13
+ movq 40(%rdi),%r14
+ movq 48(%rdi),%r15
+ jmp *56(%rdi)
+
+ .size kernel_longjmp,.-kernel_longjmp
diff --git a/arch/x86/um/shared/sysdep/archsetjmp.h b/arch/x86/um/shared/sysdep/archsetjmp.h
new file mode 100644
index 000000000..166cedbab
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/archsetjmp.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifdef __i386__
+#include "archsetjmp_32.h"
+#else
+#include "archsetjmp_64.h"
+#endif
diff --git a/arch/x86/um/shared/sysdep/archsetjmp_32.h b/arch/x86/um/shared/sysdep/archsetjmp_32.h
new file mode 100644
index 000000000..fb08f2576
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/archsetjmp_32.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * arch/um/include/sysdep-i386/archsetjmp.h
+ */
+
+#ifndef _KLIBC_ARCHSETJMP_H
+#define _KLIBC_ARCHSETJMP_H
+
+struct __jmp_buf {
+ unsigned int __ebx;
+ unsigned int __esp;
+ unsigned int __ebp;
+ unsigned int __esi;
+ unsigned int __edi;
+ unsigned int __eip;
+};
+
+typedef struct __jmp_buf jmp_buf[1];
+
+#define JB_IP __eip
+#define JB_SP __esp
+
+#endif /* _SETJMP_H */
diff --git a/arch/x86/um/shared/sysdep/archsetjmp_64.h b/arch/x86/um/shared/sysdep/archsetjmp_64.h
new file mode 100644
index 000000000..9b499e457
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/archsetjmp_64.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * arch/um/include/sysdep-x86_64/archsetjmp.h
+ */
+
+#ifndef _KLIBC_ARCHSETJMP_H
+#define _KLIBC_ARCHSETJMP_H
+
+struct __jmp_buf {
+ unsigned long __rbx;
+ unsigned long __rsp;
+ unsigned long __rbp;
+ unsigned long __r12;
+ unsigned long __r13;
+ unsigned long __r14;
+ unsigned long __r15;
+ unsigned long __rip;
+};
+
+typedef struct __jmp_buf jmp_buf[1];
+
+#define JB_IP __rip
+#define JB_SP __rsp
+
+#endif /* _SETJMP_H */
diff --git a/arch/x86/um/shared/sysdep/faultinfo.h b/arch/x86/um/shared/sysdep/faultinfo.h
new file mode 100644
index 000000000..4390803e0
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/faultinfo.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifdef __i386__
+#include "faultinfo_32.h"
+#else
+#include "faultinfo_64.h"
+#endif
diff --git a/arch/x86/um/shared/sysdep/faultinfo_32.h b/arch/x86/um/shared/sysdep/faultinfo_32.h
new file mode 100644
index 000000000..b6f2437ec
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/faultinfo_32.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2004 Fujitsu Siemens Computers GmbH
+ * Author: Bodo Stroesser <bstroesser@fujitsu-siemens.com>
+ * Licensed under the GPL
+ */
+
+#ifndef __FAULTINFO_I386_H
+#define __FAULTINFO_I386_H
+
+/* this structure contains the full arch-specific faultinfo
+ * from the traps.
+ * On i386, ptrace_faultinfo unfortunately doesn't provide
+ * all the info, since trap_no is missing.
+ * All common elements are defined at the same position in
+ * both structures, thus making it easy to copy the
+ * contents without knowledge about the structure elements.
+ */
+struct faultinfo {
+ int error_code; /* in ptrace_faultinfo misleadingly called is_write */
+ unsigned long cr2; /* in ptrace_faultinfo called addr */
+ int trap_no; /* missing in ptrace_faultinfo */
+};
+
+#define FAULT_WRITE(fi) ((fi).error_code & 2)
+#define FAULT_ADDRESS(fi) ((fi).cr2)
+
+/* This is Page Fault */
+#define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14)
+
+#define PTRACE_FULL_FAULTINFO 0
+
+#endif
diff --git a/arch/x86/um/shared/sysdep/faultinfo_64.h b/arch/x86/um/shared/sysdep/faultinfo_64.h
new file mode 100644
index 000000000..ee88f8897
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/faultinfo_64.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2004 Fujitsu Siemens Computers GmbH
+ * Author: Bodo Stroesser <bstroesser@fujitsu-siemens.com>
+ * Licensed under the GPL
+ */
+
+#ifndef __FAULTINFO_X86_64_H
+#define __FAULTINFO_X86_64_H
+
+/* this structure contains the full arch-specific faultinfo
+ * from the traps.
+ * On i386, ptrace_faultinfo unfortunately doesn't provide
+ * all the info, since trap_no is missing.
+ * All common elements are defined at the same position in
+ * both structures, thus making it easy to copy the
+ * contents without knowledge about the structure elements.
+ */
+struct faultinfo {
+ int error_code; /* in ptrace_faultinfo misleadingly called is_write */
+ unsigned long cr2; /* in ptrace_faultinfo called addr */
+ int trap_no; /* missing in ptrace_faultinfo */
+};
+
+#define FAULT_WRITE(fi) ((fi).error_code & 2)
+#define FAULT_ADDRESS(fi) ((fi).cr2)
+
+/* This is Page Fault */
+#define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14)
+
+#define PTRACE_FULL_FAULTINFO 1
+
+#endif
diff --git a/arch/x86/um/shared/sysdep/kernel-offsets.h b/arch/x86/um/shared/sysdep/kernel-offsets.h
new file mode 100644
index 000000000..a004bffb7
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/kernel-offsets.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/stddef.h>
+#include <linux/sched.h>
+#include <linux/elf.h>
+#include <linux/crypto.h>
+#include <linux/kbuild.h>
+#include <asm/mman.h>
+
+void foo(void)
+{
+#include <common-offsets.h>
+}
diff --git a/arch/x86/um/shared/sysdep/mcontext.h b/arch/x86/um/shared/sysdep/mcontext.h
new file mode 100644
index 000000000..b724c54da
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/mcontext.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#ifndef __SYS_SIGCONTEXT_X86_H
+#define __SYS_SIGCONTEXT_X86_H
+
+extern void get_regs_from_mc(struct uml_pt_regs *, mcontext_t *);
+
+#ifdef __i386__
+
+#define GET_FAULTINFO_FROM_MC(fi, mc) \
+ { \
+ (fi).cr2 = (mc)->cr2; \
+ (fi).error_code = (mc)->gregs[REG_ERR]; \
+ (fi).trap_no = (mc)->gregs[REG_TRAPNO]; \
+ }
+
+#else
+
+#define GET_FAULTINFO_FROM_MC(fi, mc) \
+ { \
+ (fi).cr2 = (mc)->gregs[REG_CR2]; \
+ (fi).error_code = (mc)->gregs[REG_ERR]; \
+ (fi).trap_no = (mc)->gregs[REG_TRAPNO]; \
+ }
+
+#endif
+
+#endif
diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h
new file mode 100644
index 000000000..6ca4ecabc
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/ptrace.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __SYSDEP_X86_PTRACE_H
+#define __SYSDEP_X86_PTRACE_H
+
+#include <generated/user_constants.h>
+#include <sysdep/faultinfo.h>
+
+#define MAX_REG_OFFSET (UM_FRAME_SIZE)
+#define MAX_REG_NR ((MAX_REG_OFFSET) / sizeof(unsigned long))
+
+#define REGS_IP(r) ((r)[HOST_IP])
+#define REGS_SP(r) ((r)[HOST_SP])
+#define REGS_EFLAGS(r) ((r)[HOST_EFLAGS])
+#define REGS_AX(r) ((r)[HOST_AX])
+#define REGS_BX(r) ((r)[HOST_BX])
+#define REGS_CX(r) ((r)[HOST_CX])
+#define REGS_DX(r) ((r)[HOST_DX])
+#define REGS_SI(r) ((r)[HOST_SI])
+#define REGS_DI(r) ((r)[HOST_DI])
+#define REGS_BP(r) ((r)[HOST_BP])
+#define REGS_CS(r) ((r)[HOST_CS])
+#define REGS_SS(r) ((r)[HOST_SS])
+#define REGS_DS(r) ((r)[HOST_DS])
+#define REGS_ES(r) ((r)[HOST_ES])
+
+#define UPT_IP(r) REGS_IP((r)->gp)
+#define UPT_SP(r) REGS_SP((r)->gp)
+#define UPT_EFLAGS(r) REGS_EFLAGS((r)->gp)
+#define UPT_AX(r) REGS_AX((r)->gp)
+#define UPT_BX(r) REGS_BX((r)->gp)
+#define UPT_CX(r) REGS_CX((r)->gp)
+#define UPT_DX(r) REGS_DX((r)->gp)
+#define UPT_SI(r) REGS_SI((r)->gp)
+#define UPT_DI(r) REGS_DI((r)->gp)
+#define UPT_BP(r) REGS_BP((r)->gp)
+#define UPT_CS(r) REGS_CS((r)->gp)
+#define UPT_SS(r) REGS_SS((r)->gp)
+#define UPT_DS(r) REGS_DS((r)->gp)
+#define UPT_ES(r) REGS_ES((r)->gp)
+
+#ifdef __i386__
+#include "ptrace_32.h"
+#else
+#include "ptrace_64.h"
+#endif
+
+struct syscall_args {
+ unsigned long args[6];
+};
+
+#define SYSCALL_ARGS(r) ((struct syscall_args) \
+ { .args = { UPT_SYSCALL_ARG1(r), \
+ UPT_SYSCALL_ARG2(r), \
+ UPT_SYSCALL_ARG3(r), \
+ UPT_SYSCALL_ARG4(r), \
+ UPT_SYSCALL_ARG5(r), \
+ UPT_SYSCALL_ARG6(r) } } )
+
+struct uml_pt_regs {
+ unsigned long gp[MAX_REG_NR];
+ unsigned long fp[MAX_FP_NR];
+ struct faultinfo faultinfo;
+ long syscall;
+ int is_user;
+};
+
+#define EMPTY_UML_PT_REGS { }
+
+#define UPT_SYSCALL_NR(r) ((r)->syscall)
+#define UPT_FAULTINFO(r) (&(r)->faultinfo)
+#define UPT_IS_USER(r) ((r)->is_user)
+
+extern int user_context(unsigned long sp);
+
+#endif /* __SYSDEP_X86_PTRACE_H */
diff --git a/arch/x86/um/shared/sysdep/ptrace_32.h b/arch/x86/um/shared/sysdep/ptrace_32.h
new file mode 100644
index 000000000..ae00d22bc
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/ptrace_32.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#ifndef __SYSDEP_I386_PTRACE_H
+#define __SYSDEP_I386_PTRACE_H
+
+#define MAX_FP_NR HOST_FPX_SIZE
+
+static inline void update_debugregs(int seq) {}
+
+void set_using_sysemu(int value);
+int get_using_sysemu(void);
+extern int sysemu_supported;
+
+#define UPT_SYSCALL_ARG1(r) UPT_BX(r)
+#define UPT_SYSCALL_ARG2(r) UPT_CX(r)
+#define UPT_SYSCALL_ARG3(r) UPT_DX(r)
+#define UPT_SYSCALL_ARG4(r) UPT_SI(r)
+#define UPT_SYSCALL_ARG5(r) UPT_DI(r)
+#define UPT_SYSCALL_ARG6(r) UPT_BP(r)
+
+extern void arch_init_registers(int pid);
+
+#endif
diff --git a/arch/x86/um/shared/sysdep/ptrace_64.h b/arch/x86/um/shared/sysdep/ptrace_64.h
new file mode 100644
index 000000000..0dc223aa1
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/ptrace_64.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2003 PathScale, Inc.
+ * Copyright (C) 2003 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ *
+ * Licensed under the GPL
+ */
+
+#ifndef __SYSDEP_X86_64_PTRACE_H
+#define __SYSDEP_X86_64_PTRACE_H
+
+#define MAX_FP_NR HOST_FP_SIZE
+
+#define REGS_R8(r) ((r)[HOST_R8])
+#define REGS_R9(r) ((r)[HOST_R9])
+#define REGS_R10(r) ((r)[HOST_R10])
+#define REGS_R11(r) ((r)[HOST_R11])
+#define REGS_R12(r) ((r)[HOST_R12])
+#define REGS_R13(r) ((r)[HOST_R13])
+#define REGS_R14(r) ((r)[HOST_R14])
+#define REGS_R15(r) ((r)[HOST_R15])
+
+#define HOST_FS_BASE 21
+#define HOST_GS_BASE 22
+#define HOST_DS 23
+#define HOST_ES 24
+#define HOST_FS 25
+#define HOST_GS 26
+
+/* Also defined in asm/ptrace-x86_64.h, but not in libc headers. So, these
+ * are already defined for kernel code, but not for userspace code.
+ */
+#ifndef FS_BASE
+/* These aren't defined in ptrace.h, but exist in struct user_regs_struct,
+ * which is what x86_64 ptrace actually uses.
+ */
+#define FS_BASE (HOST_FS_BASE * sizeof(long))
+#define GS_BASE (HOST_GS_BASE * sizeof(long))
+#define DS (HOST_DS * sizeof(long))
+#define ES (HOST_ES * sizeof(long))
+#define FS (HOST_FS * sizeof(long))
+#define GS (HOST_GS * sizeof(long))
+#endif
+
+#define UPT_R8(r) REGS_R8((r)->gp)
+#define UPT_R9(r) REGS_R9((r)->gp)
+#define UPT_R10(r) REGS_R10((r)->gp)
+#define UPT_R11(r) REGS_R11((r)->gp)
+#define UPT_R12(r) REGS_R12((r)->gp)
+#define UPT_R13(r) REGS_R13((r)->gp)
+#define UPT_R14(r) REGS_R14((r)->gp)
+#define UPT_R15(r) REGS_R15((r)->gp)
+
+#define UPT_SYSCALL_ARG1(r) UPT_DI(r)
+#define UPT_SYSCALL_ARG2(r) UPT_SI(r)
+#define UPT_SYSCALL_ARG3(r) UPT_DX(r)
+#define UPT_SYSCALL_ARG4(r) UPT_R10(r)
+#define UPT_SYSCALL_ARG5(r) UPT_R8(r)
+#define UPT_SYSCALL_ARG6(r) UPT_R9(r)
+
+extern void arch_init_registers(int pid);
+
+#endif
diff --git a/arch/x86/um/shared/sysdep/ptrace_user.h b/arch/x86/um/shared/sysdep/ptrace_user.h
new file mode 100644
index 000000000..44782bbad
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/ptrace_user.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <generated/user_constants.h>
+
+#define PT_OFFSET(r) ((r) * sizeof(long))
+
+#define PT_SYSCALL_NR(regs) ((regs)[HOST_ORIG_AX])
+#define PT_SYSCALL_NR_OFFSET PT_OFFSET(HOST_ORIG_AX)
+
+#define PT_SYSCALL_RET_OFFSET PT_OFFSET(HOST_AX)
+
+#define REGS_IP_INDEX HOST_IP
+#define REGS_SP_INDEX HOST_SP
+
+#ifdef __i386__
+#define FP_SIZE ((HOST_FPX_SIZE > HOST_FP_SIZE) ? HOST_FPX_SIZE : HOST_FP_SIZE)
+#else
+#define FP_SIZE HOST_FP_SIZE
+
+/*
+ * x86_64 FC3 doesn't define this in /usr/include/linux/ptrace.h even though
+ * it's defined in the kernel's include/linux/ptrace.h. Additionally, use the
+ * 2.4 name and value for 2.4 host compatibility.
+ */
+#ifndef PTRACE_OLDSETOPTIONS
+#define PTRACE_OLDSETOPTIONS 21
+#endif
+
+#endif
diff --git a/arch/x86/um/shared/sysdep/stub.h b/arch/x86/um/shared/sysdep/stub.h
new file mode 100644
index 000000000..ce0ca46ad
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/stub.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <asm/unistd.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <as-layout.h>
+#include <stub-data.h>
+
+#ifdef __i386__
+#include "stub_32.h"
+#else
+#include "stub_64.h"
+#endif
+
+extern void stub_segv_handler(int, siginfo_t *, void *);
+extern void stub_clone_handler(void);
diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h
new file mode 100644
index 000000000..51fd256c7
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/stub_32.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (C) 2004 Jeff Dike (jdike@addtoit.com)
+ * Licensed under the GPL
+ */
+
+#ifndef __SYSDEP_STUB_H
+#define __SYSDEP_STUB_H
+
+#include <asm/ptrace.h>
+
+#define STUB_SYSCALL_RET EAX
+#define STUB_MMAP_NR __NR_mmap2
+#define MMAP_OFFSET(o) ((o) >> UM_KERN_PAGE_SHIFT)
+
+static inline long stub_syscall0(long syscall)
+{
+ long ret;
+
+ __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall));
+
+ return ret;
+}
+
+static inline long stub_syscall1(long syscall, long arg1)
+{
+ long ret;
+
+ __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1));
+
+ return ret;
+}
+
+static inline long stub_syscall2(long syscall, long arg1, long arg2)
+{
+ long ret;
+
+ __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1),
+ "c" (arg2));
+
+ return ret;
+}
+
+static inline long stub_syscall3(long syscall, long arg1, long arg2, long arg3)
+{
+ long ret;
+
+ __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1),
+ "c" (arg2), "d" (arg3));
+
+ return ret;
+}
+
+static inline long stub_syscall4(long syscall, long arg1, long arg2, long arg3,
+ long arg4)
+{
+ long ret;
+
+ __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1),
+ "c" (arg2), "d" (arg3), "S" (arg4));
+
+ return ret;
+}
+
+static inline long stub_syscall5(long syscall, long arg1, long arg2, long arg3,
+ long arg4, long arg5)
+{
+ long ret;
+
+ __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1),
+ "c" (arg2), "d" (arg3), "S" (arg4), "D" (arg5));
+
+ return ret;
+}
+
+static inline void trap_myself(void)
+{
+ __asm("int3");
+}
+
+static inline void remap_stack(int fd, unsigned long offset)
+{
+ __asm__ volatile ("movl %%eax,%%ebp ; movl %0,%%eax ; int $0x80 ;"
+ "movl %7, %%ebx ; movl %%eax, (%%ebx)"
+ : : "g" (STUB_MMAP_NR), "b" (STUB_DATA),
+ "c" (UM_KERN_PAGE_SIZE),
+ "d" (PROT_READ | PROT_WRITE),
+ "S" (MAP_FIXED | MAP_SHARED), "D" (fd),
+ "a" (offset),
+ "i" (&((struct stub_data *) STUB_DATA)->err)
+ : "memory");
+}
+
+#endif
diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h
new file mode 100644
index 000000000..994df93c5
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/stub_64.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) 2004 Jeff Dike (jdike@addtoit.com)
+ * Licensed under the GPL
+ */
+
+#ifndef __SYSDEP_STUB_H
+#define __SYSDEP_STUB_H
+
+#include <sysdep/ptrace_user.h>
+
+#define STUB_SYSCALL_RET PT_INDEX(RAX)
+#define STUB_MMAP_NR __NR_mmap
+#define MMAP_OFFSET(o) (o)
+
+#define __syscall_clobber "r11","rcx","memory"
+#define __syscall "syscall"
+
+static inline long stub_syscall0(long syscall)
+{
+ long ret;
+
+ __asm__ volatile (__syscall
+ : "=a" (ret)
+ : "0" (syscall) : __syscall_clobber );
+
+ return ret;
+}
+
+static inline long stub_syscall2(long syscall, long arg1, long arg2)
+{
+ long ret;
+
+ __asm__ volatile (__syscall
+ : "=a" (ret)
+ : "0" (syscall), "D" (arg1), "S" (arg2) : __syscall_clobber );
+
+ return ret;
+}
+
+static inline long stub_syscall3(long syscall, long arg1, long arg2, long arg3)
+{
+ long ret;
+
+ __asm__ volatile (__syscall
+ : "=a" (ret)
+ : "0" (syscall), "D" (arg1), "S" (arg2), "d" (arg3)
+ : __syscall_clobber );
+
+ return ret;
+}
+
+static inline long stub_syscall4(long syscall, long arg1, long arg2, long arg3,
+ long arg4)
+{
+ long ret;
+
+ __asm__ volatile ("movq %5,%%r10 ; " __syscall
+ : "=a" (ret)
+ : "0" (syscall), "D" (arg1), "S" (arg2), "d" (arg3),
+ "g" (arg4)
+ : __syscall_clobber, "r10" );
+
+ return ret;
+}
+
+static inline long stub_syscall5(long syscall, long arg1, long arg2, long arg3,
+ long arg4, long arg5)
+{
+ long ret;
+
+ __asm__ volatile ("movq %5,%%r10 ; movq %6,%%r8 ; " __syscall
+ : "=a" (ret)
+ : "0" (syscall), "D" (arg1), "S" (arg2), "d" (arg3),
+ "g" (arg4), "g" (arg5)
+ : __syscall_clobber, "r10", "r8" );
+
+ return ret;
+}
+
+static inline void trap_myself(void)
+{
+ __asm("int3");
+}
+
+static inline void remap_stack(long fd, unsigned long offset)
+{
+ __asm__ volatile ("movq %4,%%r10 ; movq %5,%%r8 ; "
+ "movq %6, %%r9; " __syscall "; movq %7, %%rbx ; "
+ "movq %%rax, (%%rbx)":
+ : "a" (STUB_MMAP_NR), "D" (STUB_DATA),
+ "S" (UM_KERN_PAGE_SIZE),
+ "d" (PROT_READ | PROT_WRITE),
+ "g" (MAP_FIXED | MAP_SHARED), "g" (fd),
+ "g" (offset),
+ "i" (&((struct stub_data *) STUB_DATA)->err)
+ : __syscall_clobber, "r10", "r8", "r9" );
+}
+
+#endif
diff --git a/arch/x86/um/shared/sysdep/syscalls.h b/arch/x86/um/shared/sysdep/syscalls.h
new file mode 100644
index 000000000..b2060ac70
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/syscalls.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifdef __i386__
+#include "syscalls_32.h"
+#else
+#include "syscalls_64.h"
+#endif
diff --git a/arch/x86/um/shared/sysdep/syscalls_32.h b/arch/x86/um/shared/sysdep/syscalls_32.h
new file mode 100644
index 000000000..68fd2cf52
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/syscalls_32.h
@@ -0,0 +1,15 @@
+/*
+ * Copyright (C) 2000 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#include <asm/unistd.h>
+#include <sysdep/ptrace.h>
+
+typedef long syscall_handler_t(struct pt_regs);
+
+extern syscall_handler_t *sys_call_table[];
+
+#define EXECUTE_SYSCALL(syscall, regs) \
+ ((long (*)(struct syscall_args)) \
+ (*sys_call_table[syscall]))(SYSCALL_ARGS(&regs->regs))
diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h b/arch/x86/um/shared/sysdep/syscalls_64.h
new file mode 100644
index 000000000..1e6875b4f
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/syscalls_64.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2003 PathScale, Inc.
+ *
+ * Licensed under the GPL
+ */
+
+#ifndef __SYSDEP_X86_64_SYSCALLS_H__
+#define __SYSDEP_X86_64_SYSCALLS_H__
+
+#include <linux/msg.h>
+#include <linux/shm.h>
+
+typedef long syscall_handler_t(long, long, long, long, long, long);
+
+extern syscall_handler_t *sys_call_table[];
+
+#define EXECUTE_SYSCALL(syscall, regs) \
+ (((*sys_call_table[syscall]))(UPT_SYSCALL_ARG1(&regs->regs), \
+ UPT_SYSCALL_ARG2(&regs->regs), \
+ UPT_SYSCALL_ARG3(&regs->regs), \
+ UPT_SYSCALL_ARG4(&regs->regs), \
+ UPT_SYSCALL_ARG5(&regs->regs), \
+ UPT_SYSCALL_ARG6(&regs->regs)))
+
+extern long old_mmap(unsigned long addr, unsigned long len,
+ unsigned long prot, unsigned long flags,
+ unsigned long fd, unsigned long pgoff);
+extern syscall_handler_t sys_modify_ldt;
+extern syscall_handler_t sys_arch_prctl;
+
+#endif
diff --git a/arch/x86/um/shared/sysdep/tls.h b/arch/x86/um/shared/sysdep/tls.h
new file mode 100644
index 000000000..b968016aa
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/tls.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _SYSDEP_TLS_H
+#define _SYSDEP_TLS_H
+
+#ifdef __UM_HOST__
+
+/* Change name to avoid conflicts with the original one from <asm/ldt.h>, which
+ * may be named user_desc (but in 2.4 and in header matching its API was named
+ * modify_ldt_ldt_s). */
+
+typedef struct um_dup_user_desc {
+ unsigned int entry_number;
+ unsigned int base_addr;
+ unsigned int limit;
+ unsigned int seg_32bit:1;
+ unsigned int contents:2;
+ unsigned int read_exec_only:1;
+ unsigned int limit_in_pages:1;
+ unsigned int seg_not_present:1;
+ unsigned int useable:1;
+#ifdef __x86_64__
+ unsigned int lm:1;
+#endif
+} user_desc_t;
+
+#else /* __UM_HOST__ */
+
+typedef struct user_desc user_desc_t;
+
+#endif /* __UM_HOST__ */
+
+extern int os_set_thread_area(user_desc_t *info, int pid);
+extern int os_get_thread_area(user_desc_t *info, int pid);
+
+#ifdef __i386__
+#define GDT_ENTRY_TLS_MIN_I386 6
+#define GDT_ENTRY_TLS_MIN_X86_64 12
+#endif
+
+#endif /* _SYSDEP_TLS_H */
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
new file mode 100644
index 000000000..727ed442e
--- /dev/null
+++ b/arch/x86/um/signal.c
@@ -0,0 +1,582 @@
+/*
+ * Copyright (C) 2003 PathScale, Inc.
+ * Copyright (C) 2003 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+
+#include <linux/personality.h>
+#include <linux/ptrace.h>
+#include <linux/kernel.h>
+#include <asm/unistd.h>
+#include <linux/uaccess.h>
+#include <asm/ucontext.h>
+#include <frame_kern.h>
+#include <skas.h>
+
+#ifdef CONFIG_X86_32
+
+/*
+ * FPU tag word conversions.
+ */
+
+static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
+{
+ unsigned int tmp; /* to avoid 16 bit prefixes in the code */
+
+ /* Transform each pair of bits into 01 (valid) or 00 (empty) */
+ tmp = ~twd;
+ tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
+ /* and move the valid bits to the lower byte. */
+ tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
+ tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
+ tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
+ return tmp;
+}
+
+static inline unsigned long twd_fxsr_to_i387(struct user_fxsr_struct *fxsave)
+{
+ struct _fpxreg *st = NULL;
+ unsigned long twd = (unsigned long) fxsave->twd;
+ unsigned long tag;
+ unsigned long ret = 0xffff0000;
+ int i;
+
+#define FPREG_ADDR(f, n) ((char *)&(f)->st_space + (n) * 16)
+
+ for (i = 0; i < 8; i++) {
+ if (twd & 0x1) {
+ st = (struct _fpxreg *) FPREG_ADDR(fxsave, i);
+
+ switch (st->exponent & 0x7fff) {
+ case 0x7fff:
+ tag = 2; /* Special */
+ break;
+ case 0x0000:
+ if ( !st->significand[0] &&
+ !st->significand[1] &&
+ !st->significand[2] &&
+ !st->significand[3] ) {
+ tag = 1; /* Zero */
+ } else {
+ tag = 2; /* Special */
+ }
+ break;
+ default:
+ if (st->significand[3] & 0x8000) {
+ tag = 0; /* Valid */
+ } else {
+ tag = 2; /* Special */
+ }
+ break;
+ }
+ } else {
+ tag = 3; /* Empty */
+ }
+ ret |= (tag << (2 * i));
+ twd = twd >> 1;
+ }
+ return ret;
+}
+
+static int convert_fxsr_to_user(struct _fpstate __user *buf,
+ struct user_fxsr_struct *fxsave)
+{
+ unsigned long env[7];
+ struct _fpreg __user *to;
+ struct _fpxreg *from;
+ int i;
+
+ env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul;
+ env[1] = (unsigned long)fxsave->swd | 0xffff0000ul;
+ env[2] = twd_fxsr_to_i387(fxsave);
+ env[3] = fxsave->fip;
+ env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16);
+ env[5] = fxsave->foo;
+ env[6] = fxsave->fos;
+
+ if (__copy_to_user(buf, env, 7 * sizeof(unsigned long)))
+ return 1;
+
+ to = &buf->_st[0];
+ from = (struct _fpxreg *) &fxsave->st_space[0];
+ for (i = 0; i < 8; i++, to++, from++) {
+ unsigned long __user *t = (unsigned long __user *)to;
+ unsigned long *f = (unsigned long *)from;
+
+ if (__put_user(*f, t) ||
+ __put_user(*(f + 1), t + 1) ||
+ __put_user(from->exponent, &to->exponent))
+ return 1;
+ }
+ return 0;
+}
+
+static int convert_fxsr_from_user(struct user_fxsr_struct *fxsave,
+ struct _fpstate __user *buf)
+{
+ unsigned long env[7];
+ struct _fpxreg *to;
+ struct _fpreg __user *from;
+ int i;
+
+ if (copy_from_user( env, buf, 7 * sizeof(long)))
+ return 1;
+
+ fxsave->cwd = (unsigned short)(env[0] & 0xffff);
+ fxsave->swd = (unsigned short)(env[1] & 0xffff);
+ fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff));
+ fxsave->fip = env[3];
+ fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16);
+ fxsave->fcs = (env[4] & 0xffff);
+ fxsave->foo = env[5];
+ fxsave->fos = env[6];
+
+ to = (struct _fpxreg *) &fxsave->st_space[0];
+ from = &buf->_st[0];
+ for (i = 0; i < 8; i++, to++, from++) {
+ unsigned long *t = (unsigned long *)to;
+ unsigned long __user *f = (unsigned long __user *)from;
+
+ if (__get_user(*t, f) ||
+ __get_user(*(t + 1), f + 1) ||
+ __get_user(to->exponent, &from->exponent))
+ return 1;
+ }
+ return 0;
+}
+
+extern int have_fpx_regs;
+
+#endif
+
+static int copy_sc_from_user(struct pt_regs *regs,
+ struct sigcontext __user *from)
+{
+ struct sigcontext sc;
+ int err, pid;
+
+ /* Always make any pending restarted system calls return -EINTR */
+ current->restart_block.fn = do_no_restart_syscall;
+
+ err = copy_from_user(&sc, from, sizeof(sc));
+ if (err)
+ return err;
+
+#define GETREG(regno, regname) regs->regs.gp[HOST_##regno] = sc.regname
+
+#ifdef CONFIG_X86_32
+ GETREG(GS, gs);
+ GETREG(FS, fs);
+ GETREG(ES, es);
+ GETREG(DS, ds);
+#endif
+ GETREG(DI, di);
+ GETREG(SI, si);
+ GETREG(BP, bp);
+ GETREG(SP, sp);
+ GETREG(BX, bx);
+ GETREG(DX, dx);
+ GETREG(CX, cx);
+ GETREG(AX, ax);
+ GETREG(IP, ip);
+
+#ifdef CONFIG_X86_64
+ GETREG(R8, r8);
+ GETREG(R9, r9);
+ GETREG(R10, r10);
+ GETREG(R11, r11);
+ GETREG(R12, r12);
+ GETREG(R13, r13);
+ GETREG(R14, r14);
+ GETREG(R15, r15);
+#endif
+
+ GETREG(CS, cs);
+ GETREG(EFLAGS, flags);
+#ifdef CONFIG_X86_32
+ GETREG(SS, ss);
+#endif
+
+#undef GETREG
+
+ pid = userspace_pid[current_thread_info()->cpu];
+#ifdef CONFIG_X86_32
+ if (have_fpx_regs) {
+ struct user_fxsr_struct fpx;
+
+ err = copy_from_user(&fpx,
+ &((struct _fpstate __user *)sc.fpstate)->_fxsr_env[0],
+ sizeof(struct user_fxsr_struct));
+ if (err)
+ return 1;
+
+ err = convert_fxsr_from_user(&fpx, (void *)sc.fpstate);
+ if (err)
+ return 1;
+
+ err = restore_fpx_registers(pid, (unsigned long *) &fpx);
+ if (err < 0) {
+ printk(KERN_ERR "copy_sc_from_user - "
+ "restore_fpx_registers failed, errno = %d\n",
+ -err);
+ return 1;
+ }
+ } else
+#endif
+ {
+ err = copy_from_user(regs->regs.fp, (void *)sc.fpstate,
+ sizeof(struct _xstate));
+ if (err)
+ return 1;
+ }
+ return 0;
+}
+
+static int copy_sc_to_user(struct sigcontext __user *to,
+ struct _xstate __user *to_fp, struct pt_regs *regs,
+ unsigned long mask)
+{
+ struct sigcontext sc;
+ struct faultinfo * fi = &current->thread.arch.faultinfo;
+ int err, pid;
+ memset(&sc, 0, sizeof(struct sigcontext));
+
+#define PUTREG(regno, regname) sc.regname = regs->regs.gp[HOST_##regno]
+
+#ifdef CONFIG_X86_32
+ PUTREG(GS, gs);
+ PUTREG(FS, fs);
+ PUTREG(ES, es);
+ PUTREG(DS, ds);
+#endif
+ PUTREG(DI, di);
+ PUTREG(SI, si);
+ PUTREG(BP, bp);
+ PUTREG(SP, sp);
+ PUTREG(BX, bx);
+ PUTREG(DX, dx);
+ PUTREG(CX, cx);
+ PUTREG(AX, ax);
+#ifdef CONFIG_X86_64
+ PUTREG(R8, r8);
+ PUTREG(R9, r9);
+ PUTREG(R10, r10);
+ PUTREG(R11, r11);
+ PUTREG(R12, r12);
+ PUTREG(R13, r13);
+ PUTREG(R14, r14);
+ PUTREG(R15, r15);
+#endif
+
+ sc.cr2 = fi->cr2;
+ sc.err = fi->error_code;
+ sc.trapno = fi->trap_no;
+ PUTREG(IP, ip);
+ PUTREG(CS, cs);
+ PUTREG(EFLAGS, flags);
+#ifdef CONFIG_X86_32
+ PUTREG(SP, sp_at_signal);
+ PUTREG(SS, ss);
+#endif
+#undef PUTREG
+ sc.oldmask = mask;
+ sc.fpstate = (unsigned long)to_fp;
+
+ err = copy_to_user(to, &sc, sizeof(struct sigcontext));
+ if (err)
+ return 1;
+
+ pid = userspace_pid[current_thread_info()->cpu];
+
+#ifdef CONFIG_X86_32
+ if (have_fpx_regs) {
+ struct user_fxsr_struct fpx;
+
+ err = save_fpx_registers(pid, (unsigned long *) &fpx);
+ if (err < 0){
+ printk(KERN_ERR "copy_sc_to_user - save_fpx_registers "
+ "failed, errno = %d\n", err);
+ return 1;
+ }
+
+ err = convert_fxsr_to_user(&to_fp->fpstate, &fpx);
+ if (err)
+ return 1;
+
+ err |= __put_user(fpx.swd, &to_fp->fpstate.status);
+ err |= __put_user(X86_FXSR_MAGIC, &to_fp->fpstate.magic);
+ if (err)
+ return 1;
+
+ if (copy_to_user(&to_fp->fpstate._fxsr_env[0], &fpx,
+ sizeof(struct user_fxsr_struct)))
+ return 1;
+ } else
+#endif
+ {
+ if (copy_to_user(to_fp, regs->regs.fp, sizeof(struct _xstate)))
+ return 1;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_X86_32
+static int copy_ucontext_to_user(struct ucontext __user *uc,
+ struct _xstate __user *fp, sigset_t *set,
+ unsigned long sp)
+{
+ int err = 0;
+
+ err |= __save_altstack(&uc->uc_stack, sp);
+ err |= copy_sc_to_user(&uc->uc_mcontext, fp, &current->thread.regs, 0);
+ err |= copy_to_user(&uc->uc_sigmask, set, sizeof(*set));
+ return err;
+}
+
+struct sigframe
+{
+ char __user *pretcode;
+ int sig;
+ struct sigcontext sc;
+ struct _xstate fpstate;
+ unsigned long extramask[_NSIG_WORDS-1];
+ char retcode[8];
+};
+
+struct rt_sigframe
+{
+ char __user *pretcode;
+ int sig;
+ struct siginfo __user *pinfo;
+ void __user *puc;
+ struct siginfo info;
+ struct ucontext uc;
+ struct _xstate fpstate;
+ char retcode[8];
+};
+
+int setup_signal_stack_sc(unsigned long stack_top, struct ksignal *ksig,
+ struct pt_regs *regs, sigset_t *mask)
+{
+ struct sigframe __user *frame;
+ void __user *restorer;
+ int err = 0, sig = ksig->sig;
+
+ /* This is the same calculation as i386 - ((sp + 4) & 15) == 0 */
+ stack_top = ((stack_top + 4) & -16UL) - 4;
+ frame = (struct sigframe __user *) stack_top - 1;
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ return 1;
+
+ restorer = frame->retcode;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ restorer = ksig->ka.sa.sa_restorer;
+
+ err |= __put_user(restorer, &frame->pretcode);
+ err |= __put_user(sig, &frame->sig);
+ err |= copy_sc_to_user(&frame->sc, &frame->fpstate, regs, mask->sig[0]);
+ if (_NSIG_WORDS > 1)
+ err |= __copy_to_user(&frame->extramask, &mask->sig[1],
+ sizeof(frame->extramask));
+
+ /*
+ * This is popl %eax ; movl $,%eax ; int $0x80
+ *
+ * WE DO NOT USE IT ANY MORE! It's only left here for historical
+ * reasons and because gdb uses it as a signature to notice
+ * signal handler stack frames.
+ */
+ err |= __put_user(0xb858, (short __user *)(frame->retcode+0));
+ err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2));
+ err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
+
+ if (err)
+ return err;
+
+ PT_REGS_SP(regs) = (unsigned long) frame;
+ PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler;
+ PT_REGS_AX(regs) = (unsigned long) sig;
+ PT_REGS_DX(regs) = (unsigned long) 0;
+ PT_REGS_CX(regs) = (unsigned long) 0;
+ return 0;
+}
+
+int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
+ struct pt_regs *regs, sigset_t *mask)
+{
+ struct rt_sigframe __user *frame;
+ void __user *restorer;
+ int err = 0, sig = ksig->sig;
+
+ stack_top &= -8UL;
+ frame = (struct rt_sigframe __user *) stack_top - 1;
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ return 1;
+
+ restorer = frame->retcode;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ restorer = ksig->ka.sa.sa_restorer;
+
+ err |= __put_user(restorer, &frame->pretcode);
+ err |= __put_user(sig, &frame->sig);
+ err |= __put_user(&frame->info, &frame->pinfo);
+ err |= __put_user(&frame->uc, &frame->puc);
+ err |= copy_siginfo_to_user(&frame->info, &ksig->info);
+ err |= copy_ucontext_to_user(&frame->uc, &frame->fpstate, mask,
+ PT_REGS_SP(regs));
+
+ /*
+ * This is movl $,%eax ; int $0x80
+ *
+ * WE DO NOT USE IT ANY MORE! It's only left here for historical
+ * reasons and because gdb uses it as a signature to notice
+ * signal handler stack frames.
+ */
+ err |= __put_user(0xb8, (char __user *)(frame->retcode+0));
+ err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1));
+ err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
+
+ if (err)
+ return err;
+
+ PT_REGS_SP(regs) = (unsigned long) frame;
+ PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler;
+ PT_REGS_AX(regs) = (unsigned long) sig;
+ PT_REGS_DX(regs) = (unsigned long) &frame->info;
+ PT_REGS_CX(regs) = (unsigned long) &frame->uc;
+ return 0;
+}
+
+long sys_sigreturn(void)
+{
+ unsigned long sp = PT_REGS_SP(&current->thread.regs);
+ struct sigframe __user *frame = (struct sigframe __user *)(sp - 8);
+ sigset_t set;
+ struct sigcontext __user *sc = &frame->sc;
+ int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long);
+
+ if (copy_from_user(&set.sig[0], &sc->oldmask, sizeof(set.sig[0])) ||
+ copy_from_user(&set.sig[1], frame->extramask, sig_size))
+ goto segfault;
+
+ set_current_blocked(&set);
+
+ if (copy_sc_from_user(&current->thread.regs, sc))
+ goto segfault;
+
+ /* Avoid ERESTART handling */
+ PT_REGS_SYSCALL_NR(&current->thread.regs) = -1;
+ return PT_REGS_SYSCALL_RET(&current->thread.regs);
+
+ segfault:
+ force_sig(SIGSEGV, current);
+ return 0;
+}
+
+#else
+
+struct rt_sigframe
+{
+ char __user *pretcode;
+ struct ucontext uc;
+ struct siginfo info;
+ struct _xstate fpstate;
+};
+
+int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
+ struct pt_regs *regs, sigset_t *set)
+{
+ struct rt_sigframe __user *frame;
+ int err = 0, sig = ksig->sig;
+ unsigned long fp_to;
+
+ frame = (struct rt_sigframe __user *)
+ round_down(stack_top - sizeof(struct rt_sigframe), 16);
+ /* Subtract 128 for a red zone and 8 for proper alignment */
+ frame = (struct rt_sigframe __user *) ((unsigned long) frame - 128 - 8);
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ goto out;
+
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+ err |= copy_siginfo_to_user(&frame->info, &ksig->info);
+ if (err)
+ goto out;
+ }
+
+ /* Create the ucontext. */
+ err |= __put_user(0, &frame->uc.uc_flags);
+ err |= __put_user(0, &frame->uc.uc_link);
+ err |= __save_altstack(&frame->uc.uc_stack, PT_REGS_SP(regs));
+ err |= copy_sc_to_user(&frame->uc.uc_mcontext, &frame->fpstate, regs,
+ set->sig[0]);
+
+ fp_to = (unsigned long)&frame->fpstate;
+
+ err |= __put_user(fp_to, &frame->uc.uc_mcontext.fpstate);
+ if (sizeof(*set) == 16) {
+ err |= __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
+ err |= __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
+ }
+ else
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set,
+ sizeof(*set));
+
+ /*
+ * Set up to return from userspace. If provided, use a stub
+ * already in userspace.
+ */
+ /* x86-64 should always use SA_RESTORER. */
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ err |= __put_user((void *)ksig->ka.sa.sa_restorer,
+ &frame->pretcode);
+ else
+ /* could use a vstub here */
+ return err;
+
+ if (err)
+ return err;
+
+ PT_REGS_SP(regs) = (unsigned long) frame;
+ PT_REGS_DI(regs) = sig;
+ /* In case the signal handler was declared without prototypes */
+ PT_REGS_AX(regs) = 0;
+
+ /*
+ * This also works for non SA_SIGINFO handlers because they expect the
+ * next argument after the signal number on the stack.
+ */
+ PT_REGS_SI(regs) = (unsigned long) &frame->info;
+ PT_REGS_DX(regs) = (unsigned long) &frame->uc;
+ PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler;
+ out:
+ return err;
+}
+#endif
+
+long sys_rt_sigreturn(void)
+{
+ unsigned long sp = PT_REGS_SP(&current->thread.regs);
+ struct rt_sigframe __user *frame =
+ (struct rt_sigframe __user *)(sp - sizeof(long));
+ struct ucontext __user *uc = &frame->uc;
+ sigset_t set;
+
+ if (copy_from_user(&set, &uc->uc_sigmask, sizeof(set)))
+ goto segfault;
+
+ set_current_blocked(&set);
+
+ if (copy_sc_from_user(&current->thread.regs, &uc->uc_mcontext))
+ goto segfault;
+
+ /* Avoid ERESTART handling */
+ PT_REGS_SYSCALL_NR(&current->thread.regs) = -1;
+ return PT_REGS_SYSCALL_RET(&current->thread.regs);
+
+ segfault:
+ force_sig(SIGSEGV, current);
+ return 0;
+}
diff --git a/arch/x86/um/stub_32.S b/arch/x86/um/stub_32.S
new file mode 100644
index 000000000..a193e8853
--- /dev/null
+++ b/arch/x86/um/stub_32.S
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <as-layout.h>
+
+.section .__syscall_stub, "ax"
+
+ .globl batch_syscall_stub
+batch_syscall_stub:
+ /* load pointer to first operation */
+ mov $(STUB_DATA+8), %esp
+
+again:
+ /* load length of additional data */
+ mov 0x0(%esp), %eax
+
+ /* if(length == 0) : end of list */
+ /* write possible 0 to header */
+ mov %eax, STUB_DATA+4
+ cmpl $0, %eax
+ jz done
+
+ /* save current pointer */
+ mov %esp, STUB_DATA+4
+
+ /* skip additional data */
+ add %eax, %esp
+
+ /* load syscall-# */
+ pop %eax
+
+ /* load syscall params */
+ pop %ebx
+ pop %ecx
+ pop %edx
+ pop %esi
+ pop %edi
+ pop %ebp
+
+ /* execute syscall */
+ int $0x80
+
+ /* check return value */
+ pop %ebx
+ cmp %ebx, %eax
+ je again
+
+done:
+ /* save return value */
+ mov %eax, STUB_DATA
+
+ /* stop */
+ int3
diff --git a/arch/x86/um/stub_64.S b/arch/x86/um/stub_64.S
new file mode 100644
index 000000000..8a95c5b2e
--- /dev/null
+++ b/arch/x86/um/stub_64.S
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <as-layout.h>
+
+.section .__syscall_stub, "ax"
+ .globl batch_syscall_stub
+batch_syscall_stub:
+ mov $(STUB_DATA), %rbx
+ /* load pointer to first operation */
+ mov %rbx, %rsp
+ add $0x10, %rsp
+again:
+ /* load length of additional data */
+ mov 0x0(%rsp), %rax
+
+ /* if(length == 0) : end of list */
+ /* write possible 0 to header */
+ mov %rax, 8(%rbx)
+ cmp $0, %rax
+ jz done
+
+ /* save current pointer */
+ mov %rsp, 8(%rbx)
+
+ /* skip additional data */
+ add %rax, %rsp
+
+ /* load syscall-# */
+ pop %rax
+
+ /* load syscall params */
+ pop %rdi
+ pop %rsi
+ pop %rdx
+ pop %r10
+ pop %r8
+ pop %r9
+
+ /* execute syscall */
+ syscall
+
+ /* check return value */
+ pop %rcx
+ cmp %rcx, %rax
+ je again
+
+done:
+ /* save return value */
+ mov %rax, (%rbx)
+
+ /* stop */
+ int3
diff --git a/arch/x86/um/stub_segv.c b/arch/x86/um/stub_segv.c
new file mode 100644
index 000000000..27361cbb7
--- /dev/null
+++ b/arch/x86/um/stub_segv.c
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#include <sysdep/stub.h>
+#include <sysdep/faultinfo.h>
+#include <sysdep/mcontext.h>
+#include <sys/ucontext.h>
+
+void __attribute__ ((__section__ (".__syscall_stub")))
+stub_segv_handler(int sig, siginfo_t *info, void *p)
+{
+ ucontext_t *uc = p;
+
+ GET_FAULTINFO_FROM_MC(*((struct faultinfo *) STUB_DATA),
+ &uc->uc_mcontext);
+ trap_myself();
+}
+
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
new file mode 100644
index 000000000..9649b5ad2
--- /dev/null
+++ b/arch/x86/um/sys_call_table_32.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * System call table for UML/i386, copied from arch/x86/kernel/syscall_*.c
+ * with some changes for UML.
+ */
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <generated/user_constants.h>
+#include <asm/syscall.h>
+
+#define __NO_STUBS
+
+/*
+ * Below you can see, in terms of #define's, the differences between the x86-64
+ * and the UML syscall table.
+ */
+
+/* Not going to be implemented by UML, since we have no hardware. */
+#define sys_iopl sys_ni_syscall
+#define sys_ioperm sys_ni_syscall
+
+#define sys_vm86old sys_ni_syscall
+#define sys_vm86 sys_ni_syscall
+
+#define old_mmap sys_old_mmap
+
+#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#include <asm/syscalls_32.h>
+
+#undef __SYSCALL_I386
+#define __SYSCALL_I386(nr, sym, qual) [ nr ] = sym,
+
+extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
+
+const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = {
+ /*
+ * Smells like a compiler bug -- it doesn't work
+ * when the & below is removed.
+ */
+ [0 ... __NR_syscall_max] = &sys_ni_syscall,
+#include <asm/syscalls_32.h>
+};
+
+int syscall_table_size = sizeof(sys_call_table);
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
new file mode 100644
index 000000000..c8bc7fb8c
--- /dev/null
+++ b/arch/x86/um/sys_call_table_64.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * System call table for UML/x86-64, copied from arch/x86/kernel/syscall_*.c
+ * with some changes for UML.
+ */
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <generated/user_constants.h>
+#include <asm/syscall.h>
+
+#define __NO_STUBS
+
+/*
+ * Below you can see, in terms of #define's, the differences between the x86-64
+ * and the UML syscall table.
+ */
+
+/* Not going to be implemented by UML, since we have no hardware. */
+#define sys_iopl sys_ni_syscall
+#define sys_ioperm sys_ni_syscall
+
+/*
+ * The UML TLS problem. Note that x86_64 does not implement this, so the below
+ * is needed only for the ia32 compatibility.
+ */
+
+/* On UML we call it this way ("old" means it's not mmap2) */
+#define sys_mmap old_mmap
+
+#define stub_clone sys_clone
+#define stub_fork sys_fork
+#define stub_vfork sys_vfork
+#define stub_execve sys_execve
+#define stub_execveat sys_execveat
+#define stub_rt_sigreturn sys_rt_sigreturn
+
+#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#include <asm/syscalls_64.h>
+
+#undef __SYSCALL_64
+#define __SYSCALL_64(nr, sym, qual) [ nr ] = sym,
+
+extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
+
+const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = {
+ /*
+ * Smells like a compiler bug -- it doesn't work
+ * when the & below is removed.
+ */
+ [0 ... __NR_syscall_max] = &sys_ni_syscall,
+#include <asm/syscalls_64.h>
+};
+
+int syscall_table_size = sizeof(sys_call_table);
diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c
new file mode 100644
index 000000000..5c65254c8
--- /dev/null
+++ b/arch/x86/um/syscalls_32.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/syscalls.h>
+#include <os.h>
+
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+ return -EINVAL;
+}
diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c
new file mode 100644
index 000000000..8249685b4
--- /dev/null
+++ b/arch/x86/um/syscalls_64.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2003 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Copyright 2003 PathScale, Inc.
+ *
+ * Licensed under the GPL
+ */
+
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+#include <asm/prctl.h> /* XXX This should get the constants from libc */
+#include <os.h>
+#include <registers.h>
+
+long arch_prctl(struct task_struct *task, int option,
+ unsigned long __user *arg2)
+{
+ unsigned long *ptr = arg2, tmp;
+ long ret;
+ int pid = task->mm->context.id.u.pid;
+
+ /*
+ * With ARCH_SET_FS (and ARCH_SET_GS is treated similarly to
+ * be safe), we need to call arch_prctl on the host because
+ * setting %fs may result in something else happening (like a
+ * GDT or thread.fs being set instead). So, we let the host
+ * fiddle the registers and thread struct and restore the
+ * registers afterwards.
+ *
+ * So, the saved registers are stored to the process (this
+ * needed because a stub may have been the last thing to run),
+ * arch_prctl is run on the host, then the registers are read
+ * back.
+ */
+ switch (option) {
+ case ARCH_SET_FS:
+ case ARCH_SET_GS:
+ ret = restore_pid_registers(pid, &current->thread.regs.regs);
+ if (ret)
+ return ret;
+ break;
+ case ARCH_GET_FS:
+ case ARCH_GET_GS:
+ /*
+ * With these two, we read to a local pointer and
+ * put_user it to the userspace pointer that we were
+ * given. If addr isn't valid (because it hasn't been
+ * faulted in or is just bogus), we want put_user to
+ * fault it in (or return -EFAULT) instead of having
+ * the host return -EFAULT.
+ */
+ ptr = &tmp;
+ }
+
+ ret = os_arch_prctl(pid, option, ptr);
+ if (ret)
+ return ret;
+
+ switch (option) {
+ case ARCH_SET_FS:
+ current->thread.arch.fs = (unsigned long) ptr;
+ ret = save_registers(pid, &current->thread.regs.regs);
+ break;
+ case ARCH_SET_GS:
+ ret = save_registers(pid, &current->thread.regs.regs);
+ break;
+ case ARCH_GET_FS:
+ ret = put_user(tmp, arg2);
+ break;
+ case ARCH_GET_GS:
+ ret = put_user(tmp, arg2);
+ break;
+ }
+
+ return ret;
+}
+
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+ return arch_prctl(current, option, (unsigned long __user *) arg2);
+}
+
+void arch_switch_to(struct task_struct *to)
+{
+ if ((to->thread.arch.fs == 0) || (to->mm == NULL))
+ return;
+
+ arch_prctl(to, ARCH_SET_FS, (void __user *) to->thread.arch.fs);
+}
diff --git a/arch/x86/um/sysrq_32.c b/arch/x86/um/sysrq_32.c
new file mode 100644
index 000000000..f23834848
--- /dev/null
+++ b/arch/x86/um/sysrq_32.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2001 - 2003 Jeff Dike (jdike@addtoit.com)
+ * Licensed under the GPL
+ */
+
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/kallsyms.h>
+#include <asm/ptrace.h>
+#include <asm/sysrq.h>
+
+/* This is declared by <linux/sched.h> */
+void show_regs(struct pt_regs *regs)
+{
+ printk("\n");
+ printk("EIP: %04lx:[<%08lx>] CPU: %d %s",
+ 0xffff & PT_REGS_CS(regs), PT_REGS_IP(regs),
+ smp_processor_id(), print_tainted());
+ if (PT_REGS_CS(regs) & 3)
+ printk(" ESP: %04lx:%08lx", 0xffff & PT_REGS_SS(regs),
+ PT_REGS_SP(regs));
+ printk(" EFLAGS: %08lx\n %s\n", PT_REGS_EFLAGS(regs),
+ print_tainted());
+ printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
+ PT_REGS_AX(regs), PT_REGS_BX(regs),
+ PT_REGS_CX(regs), PT_REGS_DX(regs));
+ printk("ESI: %08lx EDI: %08lx EBP: %08lx",
+ PT_REGS_SI(regs), PT_REGS_DI(regs), PT_REGS_BP(regs));
+ printk(" DS: %04lx ES: %04lx\n",
+ 0xffff & PT_REGS_DS(regs),
+ 0xffff & PT_REGS_ES(regs));
+}
diff --git a/arch/x86/um/sysrq_64.c b/arch/x86/um/sysrq_64.c
new file mode 100644
index 000000000..903ad91b6
--- /dev/null
+++ b/arch/x86/um/sysrq_64.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2003 PathScale, Inc.
+ *
+ * Licensed under the GPL
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/utsname.h>
+#include <asm/current.h>
+#include <asm/ptrace.h>
+#include <asm/sysrq.h>
+
+void show_regs(struct pt_regs *regs)
+{
+ printk("\n");
+ print_modules();
+ printk(KERN_INFO "Pid: %d, comm: %.20s %s %s\n", task_pid_nr(current),
+ current->comm, print_tainted(), init_utsname()->release);
+ printk(KERN_INFO "RIP: %04lx:[<%016lx>]\n", PT_REGS_CS(regs) & 0xffff,
+ PT_REGS_IP(regs));
+ printk(KERN_INFO "RSP: %016lx EFLAGS: %08lx\n", PT_REGS_SP(regs),
+ PT_REGS_EFLAGS(regs));
+ printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
+ PT_REGS_AX(regs), PT_REGS_BX(regs), PT_REGS_CX(regs));
+ printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
+ PT_REGS_DX(regs), PT_REGS_SI(regs), PT_REGS_DI(regs));
+ printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
+ PT_REGS_BP(regs), PT_REGS_R8(regs), PT_REGS_R9(regs));
+ printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
+ PT_REGS_R10(regs), PT_REGS_R11(regs), PT_REGS_R12(regs));
+ printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
+ PT_REGS_R13(regs), PT_REGS_R14(regs), PT_REGS_R15(regs));
+}
diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c
new file mode 100644
index 000000000..5bd949da7
--- /dev/null
+++ b/arch/x86/um/tls_32.c
@@ -0,0 +1,398 @@
+/*
+ * Copyright (C) 2005 Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
+ * Licensed under the GPL
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+#include <asm/ptrace-abi.h>
+#include <os.h>
+#include <skas.h>
+#include <sysdep/tls.h>
+
+/*
+ * If needed we can detect when it's uninitialized.
+ *
+ * These are initialized in an initcall and unchanged thereafter.
+ */
+static int host_supports_tls = -1;
+int host_gdt_entry_tls_min;
+
+int do_set_thread_area(struct user_desc *info)
+{
+ int ret;
+ u32 cpu;
+
+ cpu = get_cpu();
+ ret = os_set_thread_area(info, userspace_pid[cpu]);
+ put_cpu();
+
+ if (ret)
+ printk(KERN_ERR "PTRACE_SET_THREAD_AREA failed, err = %d, "
+ "index = %d\n", ret, info->entry_number);
+
+ return ret;
+}
+
+int do_get_thread_area(struct user_desc *info)
+{
+ int ret;
+ u32 cpu;
+
+ cpu = get_cpu();
+ ret = os_get_thread_area(info, userspace_pid[cpu]);
+ put_cpu();
+
+ if (ret)
+ printk(KERN_ERR "PTRACE_GET_THREAD_AREA failed, err = %d, "
+ "index = %d\n", ret, info->entry_number);
+
+ return ret;
+}
+
+/*
+ * sys_get_thread_area: get a yet unused TLS descriptor index.
+ * XXX: Consider leaving one free slot for glibc usage at first place. This must
+ * be done here (and by changing GDT_ENTRY_TLS_* macros) and nowhere else.
+ *
+ * Also, this must be tested when compiling in SKAS mode with dynamic linking
+ * and running against NPTL.
+ */
+static int get_free_idx(struct task_struct* task)
+{
+ struct thread_struct *t = &task->thread;
+ int idx;
+
+ if (!t->arch.tls_array)
+ return GDT_ENTRY_TLS_MIN;
+
+ for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
+ if (!t->arch.tls_array[idx].present)
+ return idx + GDT_ENTRY_TLS_MIN;
+ return -ESRCH;
+}
+
+static inline void clear_user_desc(struct user_desc* info)
+{
+ /* Postcondition: LDT_empty(info) returns true. */
+ memset(info, 0, sizeof(*info));
+
+ /*
+ * Check the LDT_empty or the i386 sys_get_thread_area code - we obtain
+ * indeed an empty user_desc.
+ */
+ info->read_exec_only = 1;
+ info->seg_not_present = 1;
+}
+
+#define O_FORCE 1
+
+static int load_TLS(int flags, struct task_struct *to)
+{
+ int ret = 0;
+ int idx;
+
+ for (idx = GDT_ENTRY_TLS_MIN; idx < GDT_ENTRY_TLS_MAX; idx++) {
+ struct uml_tls_struct* curr =
+ &to->thread.arch.tls_array[idx - GDT_ENTRY_TLS_MIN];
+
+ /*
+ * Actually, now if it wasn't flushed it gets cleared and
+ * flushed to the host, which will clear it.
+ */
+ if (!curr->present) {
+ if (!curr->flushed) {
+ clear_user_desc(&curr->tls);
+ curr->tls.entry_number = idx;
+ } else {
+ WARN_ON(!LDT_empty(&curr->tls));
+ continue;
+ }
+ }
+
+ if (!(flags & O_FORCE) && curr->flushed)
+ continue;
+
+ ret = do_set_thread_area(&curr->tls);
+ if (ret)
+ goto out;
+
+ curr->flushed = 1;
+ }
+out:
+ return ret;
+}
+
+/*
+ * Verify if we need to do a flush for the new process, i.e. if there are any
+ * present desc's, only if they haven't been flushed.
+ */
+static inline int needs_TLS_update(struct task_struct *task)
+{
+ int i;
+ int ret = 0;
+
+ for (i = GDT_ENTRY_TLS_MIN; i < GDT_ENTRY_TLS_MAX; i++) {
+ struct uml_tls_struct* curr =
+ &task->thread.arch.tls_array[i - GDT_ENTRY_TLS_MIN];
+
+ /*
+ * Can't test curr->present, we may need to clear a descriptor
+ * which had a value.
+ */
+ if (curr->flushed)
+ continue;
+ ret = 1;
+ break;
+ }
+ return ret;
+}
+
+/*
+ * On a newly forked process, the TLS descriptors haven't yet been flushed. So
+ * we mark them as such and the first switch_to will do the job.
+ */
+void clear_flushed_tls(struct task_struct *task)
+{
+ int i;
+
+ for (i = GDT_ENTRY_TLS_MIN; i < GDT_ENTRY_TLS_MAX; i++) {
+ struct uml_tls_struct* curr =
+ &task->thread.arch.tls_array[i - GDT_ENTRY_TLS_MIN];
+
+ /*
+ * Still correct to do this, if it wasn't present on the host it
+ * will remain as flushed as it was.
+ */
+ if (!curr->present)
+ continue;
+
+ curr->flushed = 0;
+ }
+}
+
+/*
+ * In SKAS0 mode, currently, multiple guest threads sharing the same ->mm have a
+ * common host process. So this is needed in SKAS0 too.
+ *
+ * However, if each thread had a different host process (and this was discussed
+ * for SMP support) this won't be needed.
+ *
+ * And this will not need be used when (and if) we'll add support to the host
+ * SKAS patch.
+ */
+
+int arch_switch_tls(struct task_struct *to)
+{
+ if (!host_supports_tls)
+ return 0;
+
+ /*
+ * We have no need whatsoever to switch TLS for kernel threads; beyond
+ * that, that would also result in us calling os_set_thread_area with
+ * userspace_pid[cpu] == 0, which gives an error.
+ */
+ if (likely(to->mm))
+ return load_TLS(O_FORCE, to);
+
+ return 0;
+}
+
+static int set_tls_entry(struct task_struct* task, struct user_desc *info,
+ int idx, int flushed)
+{
+ struct thread_struct *t = &task->thread;
+
+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+ return -EINVAL;
+
+ t->arch.tls_array[idx - GDT_ENTRY_TLS_MIN].tls = *info;
+ t->arch.tls_array[idx - GDT_ENTRY_TLS_MIN].present = 1;
+ t->arch.tls_array[idx - GDT_ENTRY_TLS_MIN].flushed = flushed;
+
+ return 0;
+}
+
+int arch_copy_tls(struct task_struct *new)
+{
+ struct user_desc info;
+ int idx, ret = -EFAULT;
+
+ if (copy_from_user(&info,
+ (void __user *) UPT_SI(&new->thread.regs.regs),
+ sizeof(info)))
+ goto out;
+
+ ret = -EINVAL;
+ if (LDT_empty(&info))
+ goto out;
+
+ idx = info.entry_number;
+
+ ret = set_tls_entry(new, &info, idx, 0);
+out:
+ return ret;
+}
+
+/* XXX: use do_get_thread_area to read the host value? I'm not at all sure! */
+static int get_tls_entry(struct task_struct *task, struct user_desc *info,
+ int idx)
+{
+ struct thread_struct *t = &task->thread;
+
+ if (!t->arch.tls_array)
+ goto clear;
+
+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+ return -EINVAL;
+
+ if (!t->arch.tls_array[idx - GDT_ENTRY_TLS_MIN].present)
+ goto clear;
+
+ *info = t->arch.tls_array[idx - GDT_ENTRY_TLS_MIN].tls;
+
+out:
+ /*
+ * Temporary debugging check, to make sure that things have been
+ * flushed. This could be triggered if load_TLS() failed.
+ */
+ if (unlikely(task == current &&
+ !t->arch.tls_array[idx - GDT_ENTRY_TLS_MIN].flushed)) {
+ printk(KERN_ERR "get_tls_entry: task with pid %d got here "
+ "without flushed TLS.", current->pid);
+ }
+
+ return 0;
+clear:
+ /*
+ * When the TLS entry has not been set, the values read to user in the
+ * tls_array are 0 (because it's cleared at boot, see
+ * arch/i386/kernel/head.S:cpu_gdt_table). Emulate that.
+ */
+ clear_user_desc(info);
+ info->entry_number = idx;
+ goto out;
+}
+
+SYSCALL_DEFINE1(set_thread_area, struct user_desc __user *, user_desc)
+{
+ struct user_desc info;
+ int idx, ret;
+
+ if (!host_supports_tls)
+ return -ENOSYS;
+
+ if (copy_from_user(&info, user_desc, sizeof(info)))
+ return -EFAULT;
+
+ idx = info.entry_number;
+
+ if (idx == -1) {
+ idx = get_free_idx(current);
+ if (idx < 0)
+ return idx;
+ info.entry_number = idx;
+ /* Tell the user which slot we chose for him.*/
+ if (put_user(idx, &user_desc->entry_number))
+ return -EFAULT;
+ }
+
+ ret = do_set_thread_area(&info);
+ if (ret)
+ return ret;
+ return set_tls_entry(current, &info, idx, 1);
+}
+
+/*
+ * Perform set_thread_area on behalf of the traced child.
+ * Note: error handling is not done on the deferred load, and this differ from
+ * i386. However the only possible error are caused by bugs.
+ */
+int ptrace_set_thread_area(struct task_struct *child, int idx,
+ struct user_desc __user *user_desc)
+{
+ struct user_desc info;
+
+ if (!host_supports_tls)
+ return -EIO;
+
+ if (copy_from_user(&info, user_desc, sizeof(info)))
+ return -EFAULT;
+
+ return set_tls_entry(child, &info, idx, 0);
+}
+
+SYSCALL_DEFINE1(get_thread_area, struct user_desc __user *, user_desc)
+{
+ struct user_desc info;
+ int idx, ret;
+
+ if (!host_supports_tls)
+ return -ENOSYS;
+
+ if (get_user(idx, &user_desc->entry_number))
+ return -EFAULT;
+
+ ret = get_tls_entry(current, &info, idx);
+ if (ret < 0)
+ goto out;
+
+ if (copy_to_user(user_desc, &info, sizeof(info)))
+ ret = -EFAULT;
+
+out:
+ return ret;
+}
+
+/*
+ * Perform get_thread_area on behalf of the traced child.
+ */
+int ptrace_get_thread_area(struct task_struct *child, int idx,
+ struct user_desc __user *user_desc)
+{
+ struct user_desc info;
+ int ret;
+
+ if (!host_supports_tls)
+ return -EIO;
+
+ ret = get_tls_entry(child, &info, idx);
+ if (ret < 0)
+ goto out;
+
+ if (copy_to_user(user_desc, &info, sizeof(info)))
+ ret = -EFAULT;
+out:
+ return ret;
+}
+
+/*
+ * This code is really i386-only, but it detects and logs x86_64 GDT indexes
+ * if a 32-bit UML is running on a 64-bit host.
+ */
+static int __init __setup_host_supports_tls(void)
+{
+ check_host_supports_tls(&host_supports_tls, &host_gdt_entry_tls_min);
+ if (host_supports_tls) {
+ printk(KERN_INFO "Host TLS support detected\n");
+ printk(KERN_INFO "Detected host type: ");
+ switch (host_gdt_entry_tls_min) {
+ case GDT_ENTRY_TLS_MIN_I386:
+ printk(KERN_CONT "i386");
+ break;
+ case GDT_ENTRY_TLS_MIN_X86_64:
+ printk(KERN_CONT "x86_64");
+ break;
+ }
+ printk(KERN_CONT " (GDT indexes %d to %d)\n",
+ host_gdt_entry_tls_min,
+ host_gdt_entry_tls_min + GDT_ENTRY_TLS_ENTRIES);
+ } else
+ printk(KERN_ERR " Host TLS support NOT detected! "
+ "TLS support inside UML will not work\n");
+ return 0;
+}
+
+__initcall(__setup_host_supports_tls);
diff --git a/arch/x86/um/tls_64.c b/arch/x86/um/tls_64.c
new file mode 100644
index 000000000..3a621e0d3
--- /dev/null
+++ b/arch/x86/um/tls_64.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/sched.h>
+#include <asm/ptrace-abi.h>
+
+void clear_flushed_tls(struct task_struct *task)
+{
+}
+
+int arch_copy_tls(struct task_struct *t)
+{
+ /*
+ * If CLONE_SETTLS is set, we need to save the thread id
+ * (which is argument 5, child_tid, of clone) so it can be set
+ * during context switches.
+ */
+ t->thread.arch.fs = t->thread.regs.regs.gp[R8 / sizeof(long)];
+
+ return 0;
+}
diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c
new file mode 100644
index 000000000..5b37b7f95
--- /dev/null
+++ b/arch/x86/um/user-offsets.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stddef.h>
+#include <signal.h>
+#include <sys/poll.h>
+#include <sys/mman.h>
+#include <sys/user.h>
+#define __FRAME_OFFSETS
+#include <linux/ptrace.h>
+#include <asm/types.h>
+
+#ifdef __i386__
+#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
+static char syscalls[] = {
+#include <asm/syscalls_32.h>
+};
+#else
+#define __SYSCALL_64(nr, sym, qual) [nr] = 1,
+static char syscalls[] = {
+#include <asm/syscalls_64.h>
+};
+#endif
+
+#define DEFINE(sym, val) \
+ asm volatile("\n->" #sym " %0 " #val : : "i" (val))
+
+#define DEFINE_LONGS(sym, val) \
+ asm volatile("\n->" #sym " %0 " #val : : "i" (val/sizeof(unsigned long)))
+
+void foo(void)
+{
+#ifdef __i386__
+ DEFINE_LONGS(HOST_FP_SIZE, sizeof(struct user_fpregs_struct));
+ DEFINE_LONGS(HOST_FPX_SIZE, sizeof(struct user_fpxregs_struct));
+
+ DEFINE(HOST_IP, EIP);
+ DEFINE(HOST_SP, UESP);
+ DEFINE(HOST_EFLAGS, EFL);
+ DEFINE(HOST_AX, EAX);
+ DEFINE(HOST_BX, EBX);
+ DEFINE(HOST_CX, ECX);
+ DEFINE(HOST_DX, EDX);
+ DEFINE(HOST_SI, ESI);
+ DEFINE(HOST_DI, EDI);
+ DEFINE(HOST_BP, EBP);
+ DEFINE(HOST_CS, CS);
+ DEFINE(HOST_SS, SS);
+ DEFINE(HOST_DS, DS);
+ DEFINE(HOST_FS, FS);
+ DEFINE(HOST_ES, ES);
+ DEFINE(HOST_GS, GS);
+ DEFINE(HOST_ORIG_AX, ORIG_EAX);
+#else
+#ifdef FP_XSTATE_MAGIC1
+ DEFINE_LONGS(HOST_FP_SIZE, 2696);
+#else
+ DEFINE(HOST_FP_SIZE, sizeof(struct _fpstate) / sizeof(unsigned long));
+#endif
+ DEFINE_LONGS(HOST_BX, RBX);
+ DEFINE_LONGS(HOST_CX, RCX);
+ DEFINE_LONGS(HOST_DI, RDI);
+ DEFINE_LONGS(HOST_SI, RSI);
+ DEFINE_LONGS(HOST_DX, RDX);
+ DEFINE_LONGS(HOST_BP, RBP);
+ DEFINE_LONGS(HOST_AX, RAX);
+ DEFINE_LONGS(HOST_R8, R8);
+ DEFINE_LONGS(HOST_R9, R9);
+ DEFINE_LONGS(HOST_R10, R10);
+ DEFINE_LONGS(HOST_R11, R11);
+ DEFINE_LONGS(HOST_R12, R12);
+ DEFINE_LONGS(HOST_R13, R13);
+ DEFINE_LONGS(HOST_R14, R14);
+ DEFINE_LONGS(HOST_R15, R15);
+ DEFINE_LONGS(HOST_ORIG_AX, ORIG_RAX);
+ DEFINE_LONGS(HOST_CS, CS);
+ DEFINE_LONGS(HOST_SS, SS);
+ DEFINE_LONGS(HOST_EFLAGS, EFLAGS);
+#if 0
+ DEFINE_LONGS(HOST_FS, FS);
+ DEFINE_LONGS(HOST_GS, GS);
+ DEFINE_LONGS(HOST_DS, DS);
+ DEFINE_LONGS(HOST_ES, ES);
+#endif
+
+ DEFINE_LONGS(HOST_IP, RIP);
+ DEFINE_LONGS(HOST_SP, RSP);
+#endif
+
+ DEFINE(UM_FRAME_SIZE, sizeof(struct user_regs_struct));
+ DEFINE(UM_POLLIN, POLLIN);
+ DEFINE(UM_POLLPRI, POLLPRI);
+ DEFINE(UM_POLLOUT, POLLOUT);
+
+ DEFINE(UM_PROT_READ, PROT_READ);
+ DEFINE(UM_PROT_WRITE, PROT_WRITE);
+ DEFINE(UM_PROT_EXEC, PROT_EXEC);
+
+ DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
+ DEFINE(NR_syscalls, sizeof(syscalls));
+}
diff --git a/arch/x86/um/vdso/.gitignore b/arch/x86/um/vdso/.gitignore
new file mode 100644
index 000000000..f8b69d842
--- /dev/null
+++ b/arch/x86/um/vdso/.gitignore
@@ -0,0 +1 @@
+vdso.lds
diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile
new file mode 100644
index 000000000..822ccdba9
--- /dev/null
+++ b/arch/x86/um/vdso/Makefile
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Building vDSO images for x86.
+#
+
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT := n
+
+VDSO64-y := y
+
+vdso-install-$(VDSO64-y) += vdso.so
+
+
+# files to link into the vdso
+vobjs-y := vdso-note.o um_vdso.o
+
+# files to link into kernel
+obj-$(VDSO64-y) += vdso.o vma.o
+
+vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
+
+$(obj)/vdso.o: $(obj)/vdso.so
+
+targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
+
+CPPFLAGS_vdso.lds += -P -C
+
+VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
+ -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
+
+$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
+
+$(obj)/vdso.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE
+ $(call if_changed,vdso)
+
+$(obj)/%.so: OBJCOPYFLAGS := -S
+$(obj)/%.so: $(obj)/%.so.dbg FORCE
+ $(call if_changed,objcopy)
+
+#
+# Don't omit frame pointers for ease of userspace debugging, but do
+# optimize sibling calls.
+#
+CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
+ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
+ -fno-omit-frame-pointer -foptimize-sibling-calls
+
+$(vobjs): KBUILD_CFLAGS += $(CFL)
+
+#
+# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
+#
+CFLAGS_REMOVE_vdso-note.o = -pg -fprofile-arcs -ftest-coverage
+CFLAGS_REMOVE_um_vdso.o = -pg -fprofile-arcs -ftest-coverage
+
+#
+# The DSO images are built using a special linker script.
+#
+quiet_cmd_vdso = VDSO $@
+ cmd_vdso = $(CC) -nostdlib -o $@ \
+ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
+ -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \
+ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
+
+VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
+GCOV_PROFILE := n
+
+#
+# Install the unstripped copy of vdso*.so listed in $(vdso-install-y).
+#
+quiet_cmd_vdso_install = INSTALL $@
+ cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@
+$(vdso-install-y): %.so: $(obj)/%.so.dbg FORCE
+ @mkdir -p $(MODLIB)/vdso
+ $(call cmd,vdso_install)
+
+PHONY += vdso_install $(vdso-install-y)
+vdso_install: $(vdso-install-y)
diff --git a/arch/x86/um/vdso/checkundef.sh b/arch/x86/um/vdso/checkundef.sh
new file mode 100644
index 000000000..8e3ea6bb9
--- /dev/null
+++ b/arch/x86/um/vdso/checkundef.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+nm="$1"
+file="$2"
+$nm "$file" | grep '^ *U' > /dev/null 2>&1
+if [ $? -eq 1 ]; then
+ exit 0
+else
+ echo "$file: undefined symbols found" >&2
+ exit 1
+fi
diff --git a/arch/x86/um/vdso/um_vdso.c b/arch/x86/um/vdso/um_vdso.c
new file mode 100644
index 000000000..7c441b59d
--- /dev/null
+++ b/arch/x86/um/vdso/um_vdso.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2011 Richard Weinberger <richrd@nod.at>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This vDSO turns all calls into a syscall so that UML can trap them.
+ */
+
+
+/* Disable profiling for userspace code */
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/time.h>
+#include <linux/getcpu.h>
+#include <asm/unistd.h>
+
+int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
+{
+ long ret;
+
+ asm("syscall" : "=a" (ret) :
+ "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory");
+
+ return ret;
+}
+int clock_gettime(clockid_t, struct timespec *)
+ __attribute__((weak, alias("__vdso_clock_gettime")));
+
+int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
+{
+ long ret;
+
+ asm("syscall" : "=a" (ret) :
+ "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
+
+ return ret;
+}
+int gettimeofday(struct timeval *, struct timezone *)
+ __attribute__((weak, alias("__vdso_gettimeofday")));
+
+time_t __vdso_time(time_t *t)
+{
+ long secs;
+
+ asm volatile("syscall"
+ : "=a" (secs)
+ : "0" (__NR_time), "D" (t) : "cc", "r11", "cx", "memory");
+
+ return secs;
+}
+int time(time_t *t) __attribute__((weak, alias("__vdso_time")));
+
+long
+__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+{
+ /*
+ * UML does not support SMP, we can cheat here. :)
+ */
+
+ if (cpu)
+ *cpu = 0;
+ if (node)
+ *node = 0;
+
+ return 0;
+}
+
+long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
+ __attribute__((weak, alias("__vdso_getcpu")));
diff --git a/arch/x86/um/vdso/vdso-layout.lds.S b/arch/x86/um/vdso/vdso-layout.lds.S
new file mode 100644
index 000000000..439b790df
--- /dev/null
+++ b/arch/x86/um/vdso/vdso-layout.lds.S
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linker script for vDSO. This is an ELF shared object prelinked to
+ * its virtual address, and with only one read-only segment.
+ * This script controls its layout.
+ */
+
+SECTIONS
+{
+ . = VDSO_PRELINK + SIZEOF_HEADERS;
+
+ .hash : { *(.hash) } :text
+ .gnu.hash : { *(.gnu.hash) }
+ .dynsym : { *(.dynsym) }
+ .dynstr : { *(.dynstr) }
+ .gnu.version : { *(.gnu.version) }
+ .gnu.version_d : { *(.gnu.version_d) }
+ .gnu.version_r : { *(.gnu.version_r) }
+
+ .note : { *(.note.*) } :text :note
+
+ .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
+ .eh_frame : { KEEP (*(.eh_frame)) } :text
+
+ .dynamic : { *(.dynamic) } :text :dynamic
+
+ .rodata : { *(.rodata*) } :text
+ .data : {
+ *(.data*)
+ *(.sdata*)
+ *(.got.plt) *(.got)
+ *(.gnu.linkonce.d.*)
+ *(.bss*)
+ *(.dynbss*)
+ *(.gnu.linkonce.b.*)
+ }
+
+ .altinstructions : { *(.altinstructions) }
+ .altinstr_replacement : { *(.altinstr_replacement) }
+
+ /*
+ * Align the actual code well away from the non-instruction data.
+ * This is the best thing for the I-cache.
+ */
+ . = ALIGN(0x100);
+
+ .text : { *(.text*) } :text =0x90909090
+}
+
+/*
+ * Very old versions of ld do not recognize this name token; use the constant.
+ */
+#define PT_GNU_EH_FRAME 0x6474e550
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+ text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */
+ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+ note PT_NOTE FLAGS(4); /* PF_R */
+ eh_frame_hdr PT_GNU_EH_FRAME;
+}
diff --git a/arch/x86/um/vdso/vdso-note.S b/arch/x86/um/vdso/vdso-note.S
new file mode 100644
index 000000000..79a071e43
--- /dev/null
+++ b/arch/x86/um/vdso/vdso-note.S
@@ -0,0 +1,12 @@
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/uts.h>
+#include <linux/version.h>
+#include <linux/elfnote.h>
+
+ELFNOTE_START(Linux, 0, "a")
+ .long LINUX_VERSION_CODE
+ELFNOTE_END
diff --git a/arch/x86/um/vdso/vdso.S b/arch/x86/um/vdso/vdso.S
new file mode 100644
index 000000000..a4a3870dc
--- /dev/null
+++ b/arch/x86/um/vdso/vdso.S
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/init.h>
+
+__INITDATA
+
+ .globl vdso_start, vdso_end
+vdso_start:
+ .incbin "arch/x86/um/vdso/vdso.so"
+vdso_end:
+
+__FINIT
diff --git a/arch/x86/um/vdso/vdso.lds.S b/arch/x86/um/vdso/vdso.lds.S
new file mode 100644
index 000000000..73c508587
--- /dev/null
+++ b/arch/x86/um/vdso/vdso.lds.S
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linker script for 64-bit vDSO.
+ * We #include the file to define the layout details.
+ * Here we only choose the prelinked virtual address.
+ *
+ * This file defines the version script giving the user-exported symbols in
+ * the DSO. We can define local symbols here called VDSO* to make their
+ * values visible using the asm-x86/vdso.h macros from the kernel proper.
+ */
+
+#define VDSO_PRELINK 0xffffffffff700000
+#include "vdso-layout.lds.S"
+
+/*
+ * This controls what userland symbols we export from the vDSO.
+ */
+VERSION {
+ LINUX_2.6 {
+ global:
+ clock_gettime;
+ __vdso_clock_gettime;
+ gettimeofday;
+ __vdso_gettimeofday;
+ getcpu;
+ __vdso_getcpu;
+ time;
+ __vdso_time;
+ local: *;
+ };
+}
+
+VDSO64_PRELINK = VDSO_PRELINK;
diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c
new file mode 100644
index 000000000..6be22f991
--- /dev/null
+++ b/arch/x86/um/vdso/vma.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) 2011 Richard Weinberger <richrd@nod.at>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <asm/page.h>
+#include <asm/elf.h>
+#include <linux/init.h>
+
+static unsigned int __read_mostly vdso_enabled = 1;
+unsigned long um_vdso_addr;
+
+extern unsigned long task_size;
+extern char vdso_start[], vdso_end[];
+
+static struct page **vdsop;
+
+static int __init init_vdso(void)
+{
+ struct page *um_vdso;
+
+ BUG_ON(vdso_end - vdso_start > PAGE_SIZE);
+
+ um_vdso_addr = task_size - PAGE_SIZE;
+
+ vdsop = kmalloc(sizeof(struct page *), GFP_KERNEL);
+ if (!vdsop)
+ goto oom;
+
+ um_vdso = alloc_page(GFP_KERNEL);
+ if (!um_vdso) {
+ kfree(vdsop);
+
+ goto oom;
+ }
+
+ copy_page(page_address(um_vdso), vdso_start);
+ *vdsop = um_vdso;
+
+ return 0;
+
+oom:
+ printk(KERN_ERR "Cannot allocate vdso\n");
+ vdso_enabled = 0;
+
+ return -ENOMEM;
+}
+subsys_initcall(init_vdso);
+
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+{
+ int err;
+ struct mm_struct *mm = current->mm;
+
+ if (!vdso_enabled)
+ return 0;
+
+ if (down_write_killable(&mm->mmap_sem))
+ return -EINTR;
+
+ err = install_special_mapping(mm, um_vdso_addr, PAGE_SIZE,
+ VM_READ|VM_EXEC|
+ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
+ vdsop);
+
+ up_write(&mm->mmap_sem);
+
+ return err;
+}
diff --git a/arch/x86/video/Makefile b/arch/x86/video/Makefile
new file mode 100644
index 000000000..2c447c94a
--- /dev/null
+++ b/arch/x86/video/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_FB) += fbdev.o
diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c
new file mode 100644
index 000000000..9fd24846d
--- /dev/null
+++ b/arch/x86/video/fbdev.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2007 Antonino Daplas <adaplas@gmail.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of this archive
+ * for more details.
+ *
+ */
+#include <linux/fb.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <linux/vgaarb.h>
+
+int fb_is_primary_device(struct fb_info *info)
+{
+ struct device *device = info->device;
+ struct pci_dev *default_device = vga_default_device();
+ struct pci_dev *pci_dev;
+ struct resource *res;
+
+ if (!device || !dev_is_pci(device))
+ return 0;
+
+ pci_dev = to_pci_dev(device);
+
+ if (default_device) {
+ if (pci_dev == default_device)
+ return 1;
+ return 0;
+ }
+
+ res = pci_dev->resource + PCI_ROM_RESOURCE;
+
+ if (res->flags & IORESOURCE_ROM_SHADOW)
+ return 1;
+
+ return 0;
+}
+EXPORT_SYMBOL(fb_is_primary_device);
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
new file mode 100644
index 000000000..c1f98f32c
--- /dev/null
+++ b/arch/x86/xen/Kconfig
@@ -0,0 +1,79 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# This Kconfig describes xen options
+#
+
+config XEN
+ bool "Xen guest support"
+ depends on PARAVIRT
+ select PARAVIRT_CLOCK
+ depends on X86_64 || (X86_32 && X86_PAE)
+ depends on X86_LOCAL_APIC && X86_TSC
+ help
+ This is the Linux Xen port. Enabling this will allow the
+ kernel to boot in a paravirtualized environment under the
+ Xen hypervisor.
+
+config XEN_PV
+ bool "Xen PV guest support"
+ default y
+ depends on XEN
+ select XEN_HAVE_PVMMU
+ select XEN_HAVE_VPMU
+ help
+ Support running as a Xen PV guest.
+
+config XEN_PV_SMP
+ def_bool y
+ depends on XEN_PV && SMP
+
+config XEN_DOM0
+ bool "Xen PV Dom0 support"
+ default y
+ depends on XEN_PV && PCI_XEN && SWIOTLB_XEN
+ depends on X86_IO_APIC && ACPI && PCI
+ help
+ Support running as a Xen PV Dom0 guest.
+
+config XEN_PVHVM
+ bool "Xen PVHVM guest support"
+ default y
+ depends on XEN && PCI && X86_LOCAL_APIC
+ help
+ Support running as a Xen PVHVM guest.
+
+config XEN_PVHVM_SMP
+ def_bool y
+ depends on XEN_PVHVM && SMP
+
+config XEN_512GB
+ bool "Limit Xen pv-domain memory to 512GB"
+ depends on XEN_PV && X86_64
+ default y
+ help
+ Limit paravirtualized user domains to 512GB of RAM.
+
+ The Xen tools and crash dump analysis tools might not support
+ pv-domains with more than 512 GB of RAM. This option controls the
+ default setting of the kernel to use only up to 512 GB or more.
+ It is always possible to change the default via specifying the
+ boot parameter "xen_512gb_limit".
+
+config XEN_SAVE_RESTORE
+ bool
+ depends on XEN
+ select HIBERNATE_CALLBACKS
+ default y
+
+config XEN_DEBUG_FS
+ bool "Enable Xen debug and tuning parameters in debugfs"
+ depends on XEN && DEBUG_FS
+ default n
+ help
+ Enable statistics output and various tuning options in debugfs.
+ Enabling this option may incur a significant performance overhead.
+
+config XEN_PVH
+ bool "Support for running as a PVH guest"
+ depends on XEN && XEN_PVHVM && ACPI
+ def_bool n
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
new file mode 100644
index 000000000..d83cb5478
--- /dev/null
+++ b/arch/x86/xen/Makefile
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: GPL-2.0
+OBJECT_FILES_NON_STANDARD_xen-asm_$(BITS).o := y
+OBJECT_FILES_NON_STANDARD_xen-pvh.o := y
+
+ifdef CONFIG_FUNCTION_TRACER
+# Do not profile debug and lowlevel utilities
+CFLAGS_REMOVE_spinlock.o = -pg
+CFLAGS_REMOVE_time.o = -pg
+CFLAGS_REMOVE_irq.o = -pg
+endif
+
+# Make sure early boot has no stackprotector
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_enlighten_pv.o := $(nostackp)
+CFLAGS_mmu_pv.o := $(nostackp)
+
+obj-y := enlighten.o multicalls.o mmu.o irq.o \
+ time.o xen-asm.o xen-asm_$(BITS).o \
+ grant-table.o suspend.o platform-pci-unplug.o
+
+obj-$(CONFIG_XEN_PVHVM) += enlighten_hvm.o mmu_hvm.o suspend_hvm.o
+obj-$(CONFIG_XEN_PV) += setup.o apic.o pmu.o suspend_pv.o \
+ p2m.o enlighten_pv.o mmu_pv.o
+obj-$(CONFIG_XEN_PVH) += enlighten_pvh.o
+
+obj-$(CONFIG_EVENT_TRACING) += trace.o
+
+obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_XEN_PV_SMP) += smp_pv.o
+obj-$(CONFIG_XEN_PVHVM_SMP) += smp_hvm.o
+obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
+obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
+obj-$(CONFIG_XEN_DOM0) += vga.o
+obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
+obj-$(CONFIG_XEN_EFI) += efi.o
+obj-$(CONFIG_XEN_PVH) += xen-pvh.o
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
new file mode 100644
index 000000000..5e53bfbe5
--- /dev/null
+++ b/arch/x86/xen/apic.c
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+
+#include <asm/x86_init.h>
+#include <asm/apic.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/interface/physdev.h>
+#include "xen-ops.h"
+#include "pmu.h"
+#include "smp.h"
+
+static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
+{
+ struct physdev_apic apic_op;
+ int ret;
+
+ apic_op.apic_physbase = mpc_ioapic_addr(apic);
+ apic_op.reg = reg;
+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
+ if (!ret)
+ return apic_op.value;
+
+ /* fallback to return an emulated IO_APIC values */
+ if (reg == 0x1)
+ return 0x00170020;
+ else if (reg == 0x0)
+ return apic << 24;
+
+ return 0xfd;
+}
+
+static u32 xen_set_apic_id(unsigned int x)
+{
+ WARN_ON(1);
+ return x;
+}
+
+static unsigned int xen_get_apic_id(unsigned long x)
+{
+ return ((x)>>24) & 0xFFu;
+}
+
+static u32 xen_apic_read(u32 reg)
+{
+ struct xen_platform_op op = {
+ .cmd = XENPF_get_cpuinfo,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.pcpu_info.xen_cpuid = 0,
+ };
+ int ret = 0;
+
+ /* Shouldn't need this as APIC is turned off for PV, and we only
+ * get called on the bootup processor. But just in case. */
+ if (!xen_initial_domain() || smp_processor_id())
+ return 0;
+
+ if (reg == APIC_LVR)
+ return 0x14;
+#ifdef CONFIG_X86_32
+ if (reg == APIC_LDR)
+ return SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
+#endif
+ if (reg != APIC_ID)
+ return 0;
+
+ ret = HYPERVISOR_platform_op(&op);
+ if (ret)
+ op.u.pcpu_info.apic_id = BAD_APICID;
+
+ return op.u.pcpu_info.apic_id << 24;
+}
+
+static void xen_apic_write(u32 reg, u32 val)
+{
+ if (reg == APIC_LVTPC) {
+ (void)pmu_apic_update(reg);
+ return;
+ }
+
+ /* Warn to see if there's any stray references */
+ WARN(1,"register: %x, value: %x\n", reg, val);
+}
+
+static u64 xen_apic_icr_read(void)
+{
+ return 0;
+}
+
+static void xen_apic_icr_write(u32 low, u32 id)
+{
+ /* Warn to see if there's any stray references */
+ WARN_ON(1);
+}
+
+static u32 xen_safe_apic_wait_icr_idle(void)
+{
+ return 0;
+}
+
+static int xen_apic_probe_pv(void)
+{
+ if (xen_pv_domain())
+ return 1;
+
+ return 0;
+}
+
+static int xen_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return xen_pv_domain();
+}
+
+static int xen_id_always_valid(u32 apicid)
+{
+ return 1;
+}
+
+static int xen_id_always_registered(void)
+{
+ return 1;
+}
+
+static int xen_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+ return initial_apic_id >> index_msb;
+}
+
+#ifdef CONFIG_X86_32
+static int xen_x86_32_early_logical_apicid(int cpu)
+{
+ /* Match with APIC_LDR read. Otherwise setup_local_APIC complains. */
+ return 1 << cpu;
+}
+#endif
+
+static void xen_noop(void)
+{
+}
+
+static void xen_silent_inquire(int apicid)
+{
+}
+
+static int xen_cpu_present_to_apicid(int cpu)
+{
+ if (cpu_present(cpu))
+ return cpu_data(cpu).apicid;
+ else
+ return BAD_APICID;
+}
+
+static struct apic xen_pv_apic = {
+ .name = "Xen PV",
+ .probe = xen_apic_probe_pv,
+ .acpi_madt_oem_check = xen_madt_oem_check,
+ .apic_id_valid = xen_id_always_valid,
+ .apic_id_registered = xen_id_always_registered,
+
+ /* .irq_delivery_mode - used in native_compose_msi_msg only */
+ /* .irq_dest_mode - used in native_compose_msi_msg only */
+
+ .disable_esr = 0,
+ /* .dest_logical - default_send_IPI_ use it but we use our own. */
+ .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */
+
+ .init_apic_ldr = xen_noop, /* setup_local_APIC calls it */
+
+ .ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */
+ .setup_apic_routing = NULL,
+ .cpu_present_to_apicid = xen_cpu_present_to_apicid,
+ .apicid_to_cpu_present = physid_set_mask_of_physid, /* Used on 32-bit */
+ .check_phys_apicid_present = default_check_phys_apicid_present, /* smp_sanity_check needs it */
+ .phys_pkg_id = xen_phys_pkg_id, /* detect_ht */
+
+ .get_apic_id = xen_get_apic_id,
+ .set_apic_id = xen_set_apic_id, /* Can be NULL on 32-bit. */
+
+ .calc_dest_apicid = apic_flat_calc_apicid,
+
+#ifdef CONFIG_SMP
+ .send_IPI_mask = xen_send_IPI_mask,
+ .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = xen_send_IPI_allbutself,
+ .send_IPI_all = xen_send_IPI_all,
+ .send_IPI_self = xen_send_IPI_self,
+#endif
+ /* .wait_for_init_deassert- used by AP bootup - smp_callin which we don't use */
+ .inquire_remote_apic = xen_silent_inquire,
+
+ .read = xen_apic_read,
+ .write = xen_apic_write,
+ .eoi_write = xen_apic_write,
+
+ .icr_read = xen_apic_icr_read,
+ .icr_write = xen_apic_icr_write,
+ .wait_icr_idle = xen_noop,
+ .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
+
+#ifdef CONFIG_X86_32
+ /* generic_processor_info and setup_local_APIC. */
+ .x86_32_early_logical_apicid = xen_x86_32_early_logical_apicid,
+#endif
+};
+
+static void __init xen_apic_check(void)
+{
+ if (apic == &xen_pv_apic)
+ return;
+
+ pr_info("Switched APIC routing from %s to %s.\n", apic->name,
+ xen_pv_apic.name);
+ apic = &xen_pv_apic;
+}
+void __init xen_init_apic(void)
+{
+ x86_apic_ops.io_apic_read = xen_io_apic_read;
+ /* On PV guests the APIC CPUID bit is disabled so none of the
+ * routines end up executing. */
+ if (!xen_initial_domain())
+ apic = &xen_pv_apic;
+
+ x86_platform.apic_post_init = xen_apic_check;
+}
+apic_driver(xen_pv_apic);
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
new file mode 100644
index 000000000..13da87918
--- /dev/null
+++ b/arch/x86/xen/debugfs.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+
+#include "debugfs.h"
+
+static struct dentry *d_xen_debug;
+
+struct dentry * __init xen_init_debugfs(void)
+{
+ if (!d_xen_debug) {
+ d_xen_debug = debugfs_create_dir("xen", NULL);
+
+ if (!d_xen_debug)
+ pr_warning("Could not create 'xen' debugfs directory\n");
+ }
+
+ return d_xen_debug;
+}
+
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
new file mode 100644
index 000000000..6b813ad10
--- /dev/null
+++ b/arch/x86/xen/debugfs.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _XEN_DEBUGFS_H
+#define _XEN_DEBUGFS_H
+
+struct dentry * __init xen_init_debugfs(void);
+
+#endif /* _XEN_DEBUGFS_H */
diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c
new file mode 100644
index 000000000..90d2e4ce7
--- /dev/null
+++ b/arch/x86/xen/efi.c
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2014 Oracle Co., Daniel Kiper
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/bitops.h>
+#include <linux/efi.h>
+#include <linux/init.h>
+#include <linux/string.h>
+
+#include <xen/xen.h>
+#include <xen/xen-ops.h>
+#include <xen/interface/platform.h>
+
+#include <asm/page.h>
+#include <asm/setup.h>
+#include <asm/xen/hypercall.h>
+
+static efi_char16_t vendor[100] __initdata;
+
+static efi_system_table_t efi_systab_xen __initdata = {
+ .hdr = {
+ .signature = EFI_SYSTEM_TABLE_SIGNATURE,
+ .revision = 0, /* Initialized later. */
+ .headersize = 0, /* Ignored by Linux Kernel. */
+ .crc32 = 0, /* Ignored by Linux Kernel. */
+ .reserved = 0
+ },
+ .fw_vendor = EFI_INVALID_TABLE_ADDR, /* Initialized later. */
+ .fw_revision = 0, /* Initialized later. */
+ .con_in_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */
+ .con_in = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */
+ .con_out_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */
+ .con_out = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */
+ .stderr_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */
+ .stderr = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */
+ .runtime = (efi_runtime_services_t *)EFI_INVALID_TABLE_ADDR,
+ /* Not used under Xen. */
+ .boottime = (efi_boot_services_t *)EFI_INVALID_TABLE_ADDR,
+ /* Not used under Xen. */
+ .nr_tables = 0, /* Initialized later. */
+ .tables = EFI_INVALID_TABLE_ADDR /* Initialized later. */
+};
+
+static efi_system_table_t __init *xen_efi_probe(void)
+{
+ struct xen_platform_op op = {
+ .cmd = XENPF_firmware_info,
+ .u.firmware_info = {
+ .type = XEN_FW_EFI_INFO,
+ .index = XEN_FW_EFI_CONFIG_TABLE
+ }
+ };
+ union xenpf_efi_info *info = &op.u.firmware_info.u.efi_info;
+
+ if (!xen_initial_domain() || HYPERVISOR_platform_op(&op) < 0)
+ return NULL;
+
+ /* Here we know that Xen runs on EFI platform. */
+
+ efi.get_time = xen_efi_get_time;
+ efi.set_time = xen_efi_set_time;
+ efi.get_wakeup_time = xen_efi_get_wakeup_time;
+ efi.set_wakeup_time = xen_efi_set_wakeup_time;
+ efi.get_variable = xen_efi_get_variable;
+ efi.get_next_variable = xen_efi_get_next_variable;
+ efi.set_variable = xen_efi_set_variable;
+ efi.set_variable_nonblocking = xen_efi_set_variable;
+ efi.query_variable_info = xen_efi_query_variable_info;
+ efi.query_variable_info_nonblocking = xen_efi_query_variable_info;
+ efi.update_capsule = xen_efi_update_capsule;
+ efi.query_capsule_caps = xen_efi_query_capsule_caps;
+ efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count;
+ efi.reset_system = xen_efi_reset_system;
+
+ efi_systab_xen.tables = info->cfg.addr;
+ efi_systab_xen.nr_tables = info->cfg.nent;
+
+ op.cmd = XENPF_firmware_info;
+ op.u.firmware_info.type = XEN_FW_EFI_INFO;
+ op.u.firmware_info.index = XEN_FW_EFI_VENDOR;
+ info->vendor.bufsz = sizeof(vendor);
+ set_xen_guest_handle(info->vendor.name, vendor);
+
+ if (HYPERVISOR_platform_op(&op) == 0) {
+ efi_systab_xen.fw_vendor = __pa_symbol(vendor);
+ efi_systab_xen.fw_revision = info->vendor.revision;
+ } else
+ efi_systab_xen.fw_vendor = __pa_symbol(L"UNKNOWN");
+
+ op.cmd = XENPF_firmware_info;
+ op.u.firmware_info.type = XEN_FW_EFI_INFO;
+ op.u.firmware_info.index = XEN_FW_EFI_VERSION;
+
+ if (HYPERVISOR_platform_op(&op) == 0)
+ efi_systab_xen.hdr.revision = info->version;
+
+ op.cmd = XENPF_firmware_info;
+ op.u.firmware_info.type = XEN_FW_EFI_INFO;
+ op.u.firmware_info.index = XEN_FW_EFI_RT_VERSION;
+
+ if (HYPERVISOR_platform_op(&op) == 0)
+ efi.runtime_version = info->version;
+
+ return &efi_systab_xen;
+}
+
+/*
+ * Determine whether we're in secure boot mode.
+ *
+ * Please keep the logic in sync with
+ * drivers/firmware/efi/libstub/secureboot.c:efi_get_secureboot().
+ */
+static enum efi_secureboot_mode xen_efi_get_secureboot(void)
+{
+ static efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID;
+ static efi_guid_t shim_guid = EFI_SHIM_LOCK_GUID;
+ efi_status_t status;
+ u8 moksbstate, secboot, setupmode;
+ unsigned long size;
+
+ size = sizeof(secboot);
+ status = efi.get_variable(L"SecureBoot", &efi_variable_guid,
+ NULL, &size, &secboot);
+
+ if (status == EFI_NOT_FOUND)
+ return efi_secureboot_mode_disabled;
+
+ if (status != EFI_SUCCESS)
+ goto out_efi_err;
+
+ size = sizeof(setupmode);
+ status = efi.get_variable(L"SetupMode", &efi_variable_guid,
+ NULL, &size, &setupmode);
+
+ if (status != EFI_SUCCESS)
+ goto out_efi_err;
+
+ if (secboot == 0 || setupmode == 1)
+ return efi_secureboot_mode_disabled;
+
+ /* See if a user has put the shim into insecure mode. */
+ size = sizeof(moksbstate);
+ status = efi.get_variable(L"MokSBStateRT", &shim_guid,
+ NULL, &size, &moksbstate);
+
+ /* If it fails, we don't care why. Default to secure. */
+ if (status != EFI_SUCCESS)
+ goto secure_boot_enabled;
+
+ if (moksbstate == 1)
+ return efi_secureboot_mode_disabled;
+
+ secure_boot_enabled:
+ pr_info("UEFI Secure Boot is enabled.\n");
+ return efi_secureboot_mode_enabled;
+
+ out_efi_err:
+ pr_err("Could not determine UEFI Secure Boot status.\n");
+ return efi_secureboot_mode_unknown;
+}
+
+void __init xen_efi_init(struct boot_params *boot_params)
+{
+ efi_system_table_t *efi_systab_xen;
+
+ efi_systab_xen = xen_efi_probe();
+
+ if (efi_systab_xen == NULL)
+ return;
+
+ strncpy((char *)&boot_params->efi_info.efi_loader_signature, "Xen",
+ sizeof(boot_params->efi_info.efi_loader_signature));
+ boot_params->efi_info.efi_systab = (__u32)__pa(efi_systab_xen);
+ boot_params->efi_info.efi_systab_hi = (__u32)(__pa(efi_systab_xen) >> 32);
+
+ boot_params->secure_boot = xen_efi_get_secureboot();
+
+ set_bit(EFI_BOOT, &efi.flags);
+ set_bit(EFI_PARAVIRT, &efi.flags);
+ set_bit(EFI_64BIT, &efi.flags);
+}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
new file mode 100644
index 000000000..2483ff345
--- /dev/null
+++ b/arch/x86/xen/enlighten.c
@@ -0,0 +1,366 @@
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+#include <linux/bootmem.h>
+#endif
+#include <linux/cpu.h>
+#include <linux/kexec.h>
+#include <linux/slab.h>
+
+#include <xen/features.h>
+#include <xen/page.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/cpu.h>
+#include <asm/e820/api.h>
+
+#include "xen-ops.h"
+#include "smp.h"
+#include "pmu.h"
+
+EXPORT_SYMBOL_GPL(hypercall_page);
+
+/*
+ * Pointer to the xen_vcpu_info structure or
+ * &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info
+ * and xen_vcpu_setup for details. By default it points to share_info->vcpu_info
+ * but if the hypervisor supports VCPUOP_register_vcpu_info then it can point
+ * to xen_vcpu_info. The pointer is used in __xen_evtchn_do_upcall to
+ * acknowledge pending events.
+ * Also more subtly it is used by the patched version of irq enable/disable
+ * e.g. xen_irq_enable_direct and xen_iret in PV mode.
+ *
+ * The desire to be able to do those mask/unmask operations as a single
+ * instruction by using the per-cpu offset held in %gs is the real reason
+ * vcpu info is in a per-cpu pointer and the original reason for this
+ * hypercall.
+ *
+ */
+DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
+
+/*
+ * Per CPU pages used if hypervisor supports VCPUOP_register_vcpu_info
+ * hypercall. This can be used both in PV and PVHVM mode. The structure
+ * overrides the default per_cpu(xen_vcpu, cpu) value.
+ */
+DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+
+/* Linux <-> Xen vCPU id mapping */
+DEFINE_PER_CPU(uint32_t, xen_vcpu_id);
+EXPORT_PER_CPU_SYMBOL(xen_vcpu_id);
+
+enum xen_domain_type xen_domain_type = XEN_NATIVE;
+EXPORT_SYMBOL_GPL(xen_domain_type);
+
+unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
+EXPORT_SYMBOL(machine_to_phys_mapping);
+unsigned long machine_to_phys_nr;
+EXPORT_SYMBOL(machine_to_phys_nr);
+
+struct start_info *xen_start_info;
+EXPORT_SYMBOL_GPL(xen_start_info);
+
+struct shared_info xen_dummy_shared_info;
+
+__read_mostly int xen_have_vector_callback;
+EXPORT_SYMBOL_GPL(xen_have_vector_callback);
+
+/*
+ * NB: needs to live in .data because it's used by xen_prepare_pvh which runs
+ * before clearing the bss.
+ */
+uint32_t xen_start_flags __attribute__((section(".data"))) = 0;
+EXPORT_SYMBOL(xen_start_flags);
+
+/*
+ * Point at some empty memory to start with. We map the real shared_info
+ * page as soon as fixmap is up and running.
+ */
+struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
+
+/*
+ * Flag to determine whether vcpu info placement is available on all
+ * VCPUs. We assume it is to start with, and then set it to zero on
+ * the first failure. This is because it can succeed on some VCPUs
+ * and not others, since it can involve hypervisor memory allocation,
+ * or because the guest failed to guarantee all the appropriate
+ * constraints on all VCPUs (ie buffer can't cross a page boundary).
+ *
+ * Note that any particular CPU may be using a placed vcpu structure,
+ * but we can only optimise if the all are.
+ *
+ * 0: not available, 1: available
+ */
+int xen_have_vcpu_info_placement = 1;
+
+static int xen_cpu_up_online(unsigned int cpu)
+{
+ xen_init_lock_cpu(cpu);
+ return 0;
+}
+
+int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int),
+ int (*cpu_dead_cb)(unsigned int))
+{
+ int rc;
+
+ rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE,
+ "x86/xen/guest:prepare",
+ cpu_up_prepare_cb, cpu_dead_cb);
+ if (rc >= 0) {
+ rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "x86/xen/guest:online",
+ xen_cpu_up_online, NULL);
+ if (rc < 0)
+ cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE);
+ }
+
+ return rc >= 0 ? 0 : rc;
+}
+
+static int xen_vcpu_setup_restore(int cpu)
+{
+ int rc = 0;
+
+ /* Any per_cpu(xen_vcpu) is stale, so reset it */
+ xen_vcpu_info_reset(cpu);
+
+ /*
+ * For PVH and PVHVM, setup online VCPUs only. The rest will
+ * be handled by hotplug.
+ */
+ if (xen_pv_domain() ||
+ (xen_hvm_domain() && cpu_online(cpu))) {
+ rc = xen_vcpu_setup(cpu);
+ }
+
+ return rc;
+}
+
+/*
+ * On restore, set the vcpu placement up again.
+ * If it fails, then we're in a bad state, since
+ * we can't back out from using it...
+ */
+void xen_vcpu_restore(void)
+{
+ int cpu, rc;
+
+ for_each_possible_cpu(cpu) {
+ bool other_cpu = (cpu != smp_processor_id());
+ bool is_up;
+
+ if (xen_vcpu_nr(cpu) == XEN_VCPU_ID_INVALID)
+ continue;
+
+ /* Only Xen 4.5 and higher support this. */
+ is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up,
+ xen_vcpu_nr(cpu), NULL) > 0;
+
+ if (other_cpu && is_up &&
+ HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL))
+ BUG();
+
+ if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock))
+ xen_setup_runstate_info(cpu);
+
+ rc = xen_vcpu_setup_restore(cpu);
+ if (rc)
+ pr_emerg_once("vcpu restore failed for cpu=%d err=%d. "
+ "System will hang.\n", cpu, rc);
+ /*
+ * In case xen_vcpu_setup_restore() fails, do not bring up the
+ * VCPU. This helps us avoid the resulting OOPS when the VCPU
+ * accesses pvclock_vcpu_time via xen_vcpu (which is NULL.)
+ * Note that this does not improve the situation much -- now the
+ * VM hangs instead of OOPSing -- with the VCPUs that did not
+ * fail, spinning in stop_machine(), waiting for the failed
+ * VCPUs to come up.
+ */
+ if (other_cpu && is_up && (rc == 0) &&
+ HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL))
+ BUG();
+ }
+}
+
+void xen_vcpu_info_reset(int cpu)
+{
+ if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) {
+ per_cpu(xen_vcpu, cpu) =
+ &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)];
+ } else {
+ /* Set to NULL so that if somebody accesses it we get an OOPS */
+ per_cpu(xen_vcpu, cpu) = NULL;
+ }
+}
+
+int xen_vcpu_setup(int cpu)
+{
+ struct vcpu_register_vcpu_info info;
+ int err;
+ struct vcpu_info *vcpup;
+
+ BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
+
+ /*
+ * This path is called on PVHVM at bootup (xen_hvm_smp_prepare_boot_cpu)
+ * and at restore (xen_vcpu_restore). Also called for hotplugged
+ * VCPUs (cpu_init -> xen_hvm_cpu_prepare_hvm).
+ * However, the hypercall can only be done once (see below) so if a VCPU
+ * is offlined and comes back online then let's not redo the hypercall.
+ *
+ * For PV it is called during restore (xen_vcpu_restore) and bootup
+ * (xen_setup_vcpu_info_placement). The hotplug mechanism does not
+ * use this function.
+ */
+ if (xen_hvm_domain()) {
+ if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu))
+ return 0;
+ }
+
+ if (xen_have_vcpu_info_placement) {
+ vcpup = &per_cpu(xen_vcpu_info, cpu);
+ info.mfn = arbitrary_virt_to_mfn(vcpup);
+ info.offset = offset_in_page(vcpup);
+
+ /*
+ * Check to see if the hypervisor will put the vcpu_info
+ * structure where we want it, which allows direct access via
+ * a percpu-variable.
+ * N.B. This hypercall can _only_ be called once per CPU.
+ * Subsequent calls will error out with -EINVAL. This is due to
+ * the fact that hypervisor has no unregister variant and this
+ * hypercall does not allow to over-write info.mfn and
+ * info.offset.
+ */
+ err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info,
+ xen_vcpu_nr(cpu), &info);
+
+ if (err) {
+ pr_warn_once("register_vcpu_info failed: cpu=%d err=%d\n",
+ cpu, err);
+ xen_have_vcpu_info_placement = 0;
+ } else {
+ /*
+ * This cpu is using the registered vcpu info, even if
+ * later ones fail to.
+ */
+ per_cpu(xen_vcpu, cpu) = vcpup;
+ }
+ }
+
+ if (!xen_have_vcpu_info_placement)
+ xen_vcpu_info_reset(cpu);
+
+ return ((per_cpu(xen_vcpu, cpu) == NULL) ? -ENODEV : 0);
+}
+
+void xen_reboot(int reason)
+{
+ struct sched_shutdown r = { .reason = reason };
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ xen_pmu_finish(cpu);
+
+ if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
+ BUG();
+}
+
+static int reboot_reason = SHUTDOWN_reboot;
+static bool xen_legacy_crash;
+void xen_emergency_restart(void)
+{
+ xen_reboot(reboot_reason);
+}
+
+static int
+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ if (!kexec_crash_loaded()) {
+ if (xen_legacy_crash)
+ xen_reboot(SHUTDOWN_crash);
+
+ reboot_reason = SHUTDOWN_crash;
+
+ /*
+ * If panic_timeout==0 then we are supposed to wait forever.
+ * However, to preserve original dom0 behavior we have to drop
+ * into hypervisor. (domU behavior is controlled by its
+ * config file)
+ */
+ if (panic_timeout == 0)
+ panic_timeout = -1;
+ }
+ return NOTIFY_DONE;
+}
+
+static int __init parse_xen_legacy_crash(char *arg)
+{
+ xen_legacy_crash = true;
+ return 0;
+}
+early_param("xen_legacy_crash", parse_xen_legacy_crash);
+
+static struct notifier_block xen_panic_block = {
+ .notifier_call = xen_panic_event,
+ .priority = INT_MIN
+};
+
+int xen_panic_handler_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
+ return 0;
+}
+
+void xen_pin_vcpu(int cpu)
+{
+ static bool disable_pinning;
+ struct sched_pin_override pin_override;
+ int ret;
+
+ if (disable_pinning)
+ return;
+
+ pin_override.pcpu = cpu;
+ ret = HYPERVISOR_sched_op(SCHEDOP_pin_override, &pin_override);
+
+ /* Ignore errors when removing override. */
+ if (cpu < 0)
+ return;
+
+ switch (ret) {
+ case -ENOSYS:
+ pr_warn("Unable to pin on physical cpu %d. In case of problems consider vcpu pinning.\n",
+ cpu);
+ disable_pinning = true;
+ break;
+ case -EPERM:
+ WARN(1, "Trying to pin vcpu without having privilege to do so\n");
+ disable_pinning = true;
+ break;
+ case -EINVAL:
+ case -EBUSY:
+ pr_warn("Physical cpu %d not available for pinning. Check Xen cpu configuration.\n",
+ cpu);
+ break;
+ case 0:
+ break;
+ default:
+ WARN(1, "rc %d while trying to pin vcpu\n", ret);
+ disable_pinning = true;
+ }
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+void xen_arch_register_cpu(int num)
+{
+ arch_register_cpu(num);
+}
+EXPORT_SYMBOL(xen_arch_register_cpu);
+
+void xen_arch_unregister_cpu(int num)
+{
+ arch_unregister_cpu(num);
+}
+EXPORT_SYMBOL(xen_arch_unregister_cpu);
+#endif
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
new file mode 100644
index 000000000..19c1ff542
--- /dev/null
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -0,0 +1,270 @@
+#include <linux/acpi.h>
+#include <linux/cpu.h>
+#include <linux/kexec.h>
+#include <linux/memblock.h>
+
+#include <xen/features.h>
+#include <xen/events.h>
+#include <xen/interface/memory.h>
+
+#include <asm/cpu.h>
+#include <asm/smp.h>
+#include <asm/reboot.h>
+#include <asm/setup.h>
+#include <asm/hypervisor.h>
+#include <asm/e820/api.h>
+#include <asm/early_ioremap.h>
+
+#include <asm/xen/cpuid.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/page.h>
+
+#include "xen-ops.h"
+#include "mmu.h"
+#include "smp.h"
+
+static unsigned long shared_info_pfn;
+
+void xen_hvm_init_shared_info(void)
+{
+ struct xen_add_to_physmap xatp;
+
+ xatp.domid = DOMID_SELF;
+ xatp.idx = 0;
+ xatp.space = XENMAPSPACE_shared_info;
+ xatp.gpfn = shared_info_pfn;
+ if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
+ BUG();
+}
+
+static void __init reserve_shared_info(void)
+{
+ u64 pa;
+
+ /*
+ * Search for a free page starting at 4kB physical address.
+ * Low memory is preferred to avoid an EPT large page split up
+ * by the mapping.
+ * Starting below X86_RESERVE_LOW (usually 64kB) is fine as
+ * the BIOS used for HVM guests is well behaved and won't
+ * clobber memory other than the first 4kB.
+ */
+ for (pa = PAGE_SIZE;
+ !e820__mapped_all(pa, pa + PAGE_SIZE, E820_TYPE_RAM) ||
+ memblock_is_reserved(pa);
+ pa += PAGE_SIZE)
+ ;
+
+ shared_info_pfn = PHYS_PFN(pa);
+
+ memblock_reserve(pa, PAGE_SIZE);
+ HYPERVISOR_shared_info = early_memremap(pa, PAGE_SIZE);
+}
+
+static void __init xen_hvm_init_mem_mapping(void)
+{
+ early_memunmap(HYPERVISOR_shared_info, PAGE_SIZE);
+ HYPERVISOR_shared_info = __va(PFN_PHYS(shared_info_pfn));
+
+ /*
+ * The virtual address of the shared_info page has changed, so
+ * the vcpu_info pointer for VCPU 0 is now stale.
+ *
+ * The prepare_boot_cpu callback will re-initialize it via
+ * xen_vcpu_setup, but we can't rely on that to be called for
+ * old Xen versions (xen_have_vector_callback == 0).
+ *
+ * It is, in any case, bad to have a stale vcpu_info pointer
+ * so reset it now.
+ */
+ xen_vcpu_info_reset(0);
+}
+
+static void __init init_hvm_pv_info(void)
+{
+ int major, minor;
+ uint32_t eax, ebx, ecx, edx, base;
+
+ base = xen_cpuid_base();
+ eax = cpuid_eax(base + 1);
+
+ major = eax >> 16;
+ minor = eax & 0xffff;
+ printk(KERN_INFO "Xen version %d.%d.\n", major, minor);
+
+ xen_domain_type = XEN_HVM_DOMAIN;
+
+ /* PVH set up hypercall page in xen_prepare_pvh(). */
+ if (xen_pvh_domain())
+ pv_info.name = "Xen PVH";
+ else {
+ u64 pfn;
+ uint32_t msr;
+
+ pv_info.name = "Xen HVM";
+ msr = cpuid_ebx(base + 2);
+ pfn = __pa(hypercall_page);
+ wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+ }
+
+ xen_setup_features();
+
+ cpuid(base + 4, &eax, &ebx, &ecx, &edx);
+ if (eax & XEN_HVM_CPUID_VCPU_ID_PRESENT)
+ this_cpu_write(xen_vcpu_id, ebx);
+ else
+ this_cpu_write(xen_vcpu_id, smp_processor_id());
+}
+
+#ifdef CONFIG_KEXEC_CORE
+static void xen_hvm_shutdown(void)
+{
+ native_machine_shutdown();
+ if (kexec_in_progress)
+ xen_reboot(SHUTDOWN_soft_reset);
+}
+
+static void xen_hvm_crash_shutdown(struct pt_regs *regs)
+{
+ native_machine_crash_shutdown(regs);
+ xen_reboot(SHUTDOWN_soft_reset);
+}
+#endif
+
+static int xen_cpu_up_prepare_hvm(unsigned int cpu)
+{
+ int rc = 0;
+
+ /*
+ * This can happen if CPU was offlined earlier and
+ * offlining timed out in common_cpu_die().
+ */
+ if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
+ xen_smp_intr_free(cpu);
+ xen_uninit_lock_cpu(cpu);
+ }
+
+ if (cpu_acpi_id(cpu) != U32_MAX)
+ per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu);
+ else
+ per_cpu(xen_vcpu_id, cpu) = cpu;
+ rc = xen_vcpu_setup(cpu);
+ if (rc)
+ return rc;
+
+ if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock))
+ xen_setup_timer(cpu);
+
+ rc = xen_smp_intr_init(cpu);
+ if (rc) {
+ WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n",
+ cpu, rc);
+ }
+ return rc;
+}
+
+static int xen_cpu_dead_hvm(unsigned int cpu)
+{
+ xen_smp_intr_free(cpu);
+
+ if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock))
+ xen_teardown_timer(cpu);
+
+ return 0;
+}
+
+static void __init xen_hvm_guest_init(void)
+{
+ if (xen_pv_domain())
+ return;
+
+ init_hvm_pv_info();
+
+ reserve_shared_info();
+ xen_hvm_init_shared_info();
+
+ /*
+ * xen_vcpu is a pointer to the vcpu_info struct in the shared_info
+ * page, we use it in the event channel upcall and in some pvclock
+ * related functions.
+ */
+ xen_vcpu_info_reset(0);
+
+ xen_panic_handler_init();
+
+ if (xen_feature(XENFEAT_hvm_callback_vector))
+ xen_have_vector_callback = 1;
+
+ xen_hvm_smp_init();
+ WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_hvm, xen_cpu_dead_hvm));
+ xen_unplug_emulated_devices();
+ x86_init.irqs.intr_init = xen_init_IRQ;
+ xen_hvm_init_time_ops();
+ xen_hvm_init_mmu_ops();
+
+#ifdef CONFIG_KEXEC_CORE
+ machine_ops.shutdown = xen_hvm_shutdown;
+ machine_ops.crash_shutdown = xen_hvm_crash_shutdown;
+#endif
+}
+
+static bool xen_nopv;
+static __init int xen_parse_nopv(char *arg)
+{
+ xen_nopv = true;
+ return 0;
+}
+early_param("xen_nopv", xen_parse_nopv);
+
+bool xen_hvm_need_lapic(void)
+{
+ if (xen_nopv)
+ return false;
+ if (xen_pv_domain())
+ return false;
+ if (!xen_hvm_domain())
+ return false;
+ if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback)
+ return false;
+ return true;
+}
+EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
+
+static uint32_t __init xen_platform_hvm(void)
+{
+ if (xen_pv_domain() || xen_nopv)
+ return 0;
+
+ return xen_cpuid_base();
+}
+
+static __init void xen_hvm_guest_late_init(void)
+{
+#ifdef CONFIG_XEN_PVH
+ /* Test for PVH domain (PVH boot path taken overrides ACPI flags). */
+ if (!xen_pvh &&
+ (x86_platform.legacy.rtc || !x86_platform.legacy.no_vga))
+ return;
+
+ /* PVH detected. */
+ xen_pvh = true;
+
+ /* Make sure we don't fall back to (default) ACPI_IRQ_MODEL_PIC. */
+ if (!nr_ioapics && acpi_irq_model == ACPI_IRQ_MODEL_PIC)
+ acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM;
+
+ machine_ops.emergency_restart = xen_emergency_restart;
+ pv_info.name = "Xen PVH";
+#endif
+}
+
+const __initconst struct hypervisor_x86 x86_hyper_xen_hvm = {
+ .name = "Xen HVM",
+ .detect = xen_platform_hvm,
+ .type = X86_HYPER_XEN_HVM,
+ .init.init_platform = xen_hvm_guest_init,
+ .init.x2apic_available = xen_x2apic_para_available,
+ .init.init_mem_mapping = xen_hvm_init_mem_mapping,
+ .init.guest_late_init = xen_hvm_guest_late_init,
+ .runtime.pin_vcpu = xen_pin_vcpu,
+};
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
new file mode 100644
index 000000000..8f1ff8dad
--- /dev/null
+++ b/arch/x86/xen/enlighten_pv.c
@@ -0,0 +1,1482 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Core of Xen paravirt_ops implementation.
+ *
+ * This file contains the xen_paravirt_ops structure itself, and the
+ * implementations for:
+ * - privileged instructions
+ * - interrupt flags
+ * - segment operations
+ * - booting and setup
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#include <linux/cpu.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/preempt.h>
+#include <linux/hardirq.h>
+#include <linux/percpu.h>
+#include <linux/delay.h>
+#include <linux/start_kernel.h>
+#include <linux/sched.h>
+#include <linux/kprobes.h>
+#include <linux/bootmem.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/highmem.h>
+#include <linux/console.h>
+#include <linux/pci.h>
+#include <linux/gfp.h>
+#include <linux/memblock.h>
+#include <linux/edd.h>
+#include <linux/frame.h>
+
+#include <xen/xen.h>
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/version.h>
+#include <xen/interface/physdev.h>
+#include <xen/interface/vcpu.h>
+#include <xen/interface/memory.h>
+#include <xen/interface/nmi.h>
+#include <xen/interface/xen-mca.h>
+#include <xen/features.h>
+#include <xen/page.h>
+#include <xen/hvc-console.h>
+#include <xen/acpi.h>
+
+#include <asm/paravirt.h>
+#include <asm/apic.h>
+#include <asm/page.h>
+#include <asm/xen/pci.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/cpuid.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
+#include <asm/proto.h>
+#include <asm/msr-index.h>
+#include <asm/traps.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/reboot.h>
+#include <asm/stackprotector.h>
+#include <asm/hypervisor.h>
+#include <asm/mach_traps.h>
+#include <asm/mwait.h>
+#include <asm/pci_x86.h>
+#include <asm/cpu.h>
+
+#ifdef CONFIG_ACPI
+#include <linux/acpi.h>
+#include <asm/acpi.h>
+#include <acpi/pdc_intel.h>
+#include <acpi/processor.h>
+#include <xen/interface/platform.h>
+#endif
+
+#include "xen-ops.h"
+#include "mmu.h"
+#include "smp.h"
+#include "multicalls.h"
+#include "pmu.h"
+
+#include "../kernel/cpu/cpu.h" /* get_cpu_cap() */
+
+void *xen_initial_gdt;
+
+static int xen_cpu_up_prepare_pv(unsigned int cpu);
+static int xen_cpu_dead_pv(unsigned int cpu);
+
+struct tls_descs {
+ struct desc_struct desc[3];
+};
+
+/*
+ * Updating the 3 TLS descriptors in the GDT on every task switch is
+ * surprisingly expensive so we avoid updating them if they haven't
+ * changed. Since Xen writes different descriptors than the one
+ * passed in the update_descriptor hypercall we keep shadow copies to
+ * compare against.
+ */
+static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
+
+static void __init xen_banner(void)
+{
+ unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
+ struct xen_extraversion extra;
+ HYPERVISOR_xen_version(XENVER_extraversion, &extra);
+
+ pr_info("Booting paravirtualized kernel on %s\n", pv_info.name);
+ printk(KERN_INFO "Xen version: %d.%d%s%s\n",
+ version >> 16, version & 0xffff, extra.extraversion,
+ xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
+}
+
+static void __init xen_pv_init_platform(void)
+{
+ populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP));
+
+ set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info);
+ HYPERVISOR_shared_info = (void *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
+
+ /* xen clock uses per-cpu vcpu_info, need to init it for boot cpu */
+ xen_vcpu_info_reset(0);
+
+ /* pvclock is in shared info area */
+ xen_init_time_ops();
+}
+
+static void __init xen_pv_guest_late_init(void)
+{
+#ifndef CONFIG_SMP
+ /* Setup shared vcpu info for non-smp configurations */
+ xen_setup_vcpu_info_placement();
+#endif
+}
+
+/* Check if running on Xen version (major, minor) or later */
+bool
+xen_running_on_version_or_later(unsigned int major, unsigned int minor)
+{
+ unsigned int version;
+
+ if (!xen_domain())
+ return false;
+
+ version = HYPERVISOR_xen_version(XENVER_version, NULL);
+ if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) ||
+ ((version >> 16) > major))
+ return true;
+ return false;
+}
+
+static __read_mostly unsigned int cpuid_leaf5_ecx_val;
+static __read_mostly unsigned int cpuid_leaf5_edx_val;
+
+static void xen_cpuid(unsigned int *ax, unsigned int *bx,
+ unsigned int *cx, unsigned int *dx)
+{
+ unsigned maskebx = ~0;
+
+ /*
+ * Mask out inconvenient features, to try and disable as many
+ * unsupported kernel subsystems as possible.
+ */
+ switch (*ax) {
+ case CPUID_MWAIT_LEAF:
+ /* Synthesize the values.. */
+ *ax = 0;
+ *bx = 0;
+ *cx = cpuid_leaf5_ecx_val;
+ *dx = cpuid_leaf5_edx_val;
+ return;
+
+ case 0xb:
+ /* Suppress extended topology stuff */
+ maskebx = 0;
+ break;
+ }
+
+ asm(XEN_EMULATE_PREFIX "cpuid"
+ : "=a" (*ax),
+ "=b" (*bx),
+ "=c" (*cx),
+ "=d" (*dx)
+ : "0" (*ax), "2" (*cx));
+
+ *bx &= maskebx;
+}
+STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */
+
+static bool __init xen_check_mwait(void)
+{
+#ifdef CONFIG_ACPI
+ struct xen_platform_op op = {
+ .cmd = XENPF_set_processor_pminfo,
+ .u.set_pminfo.id = -1,
+ .u.set_pminfo.type = XEN_PM_PDC,
+ };
+ uint32_t buf[3];
+ unsigned int ax, bx, cx, dx;
+ unsigned int mwait_mask;
+
+ /* We need to determine whether it is OK to expose the MWAIT
+ * capability to the kernel to harvest deeper than C3 states from ACPI
+ * _CST using the processor_harvest_xen.c module. For this to work, we
+ * need to gather the MWAIT_LEAF values (which the cstate.c code
+ * checks against). The hypervisor won't expose the MWAIT flag because
+ * it would break backwards compatibility; so we will find out directly
+ * from the hardware and hypercall.
+ */
+ if (!xen_initial_domain())
+ return false;
+
+ /*
+ * When running under platform earlier than Xen4.2, do not expose
+ * mwait, to avoid the risk of loading native acpi pad driver
+ */
+ if (!xen_running_on_version_or_later(4, 2))
+ return false;
+
+ ax = 1;
+ cx = 0;
+
+ native_cpuid(&ax, &bx, &cx, &dx);
+
+ mwait_mask = (1 << (X86_FEATURE_EST % 32)) |
+ (1 << (X86_FEATURE_MWAIT % 32));
+
+ if ((cx & mwait_mask) != mwait_mask)
+ return false;
+
+ /* We need to emulate the MWAIT_LEAF and for that we need both
+ * ecx and edx. The hypercall provides only partial information.
+ */
+
+ ax = CPUID_MWAIT_LEAF;
+ bx = 0;
+ cx = 0;
+ dx = 0;
+
+ native_cpuid(&ax, &bx, &cx, &dx);
+
+ /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
+ * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
+ */
+ buf[0] = ACPI_PDC_REVISION_ID;
+ buf[1] = 1;
+ buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP);
+
+ set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
+
+ if ((HYPERVISOR_platform_op(&op) == 0) &&
+ (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
+ cpuid_leaf5_ecx_val = cx;
+ cpuid_leaf5_edx_val = dx;
+ }
+ return true;
+#else
+ return false;
+#endif
+}
+
+static bool __init xen_check_xsave(void)
+{
+ unsigned int cx, xsave_mask;
+
+ cx = cpuid_ecx(1);
+
+ xsave_mask = (1 << (X86_FEATURE_XSAVE % 32)) |
+ (1 << (X86_FEATURE_OSXSAVE % 32));
+
+ /* Xen will set CR4.OSXSAVE if supported and not disabled by force */
+ return (cx & xsave_mask) == xsave_mask;
+}
+
+static void __init xen_init_capabilities(void)
+{
+ setup_force_cpu_cap(X86_FEATURE_XENPV);
+ setup_clear_cpu_cap(X86_FEATURE_DCA);
+ setup_clear_cpu_cap(X86_FEATURE_APERFMPERF);
+ setup_clear_cpu_cap(X86_FEATURE_MTRR);
+ setup_clear_cpu_cap(X86_FEATURE_ACC);
+ setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+ setup_clear_cpu_cap(X86_FEATURE_SME);
+
+ /*
+ * Xen PV would need some work to support PCID: CR3 handling as well
+ * as xen_flush_tlb_others() would need updating.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+
+ if (!xen_initial_domain())
+ setup_clear_cpu_cap(X86_FEATURE_ACPI);
+
+ if (xen_check_mwait())
+ setup_force_cpu_cap(X86_FEATURE_MWAIT);
+ else
+ setup_clear_cpu_cap(X86_FEATURE_MWAIT);
+
+ if (!xen_check_xsave()) {
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+ setup_clear_cpu_cap(X86_FEATURE_OSXSAVE);
+ }
+}
+
+static void xen_set_debugreg(int reg, unsigned long val)
+{
+ HYPERVISOR_set_debugreg(reg, val);
+}
+
+static unsigned long xen_get_debugreg(int reg)
+{
+ return HYPERVISOR_get_debugreg(reg);
+}
+
+static void xen_end_context_switch(struct task_struct *next)
+{
+ xen_mc_flush();
+ paravirt_end_context_switch(next);
+}
+
+static unsigned long xen_store_tr(void)
+{
+ return 0;
+}
+
+/*
+ * Set the page permissions for a particular virtual address. If the
+ * address is a vmalloc mapping (or other non-linear mapping), then
+ * find the linear mapping of the page and also set its protections to
+ * match.
+ */
+static void set_aliased_prot(void *v, pgprot_t prot)
+{
+ int level;
+ pte_t *ptep;
+ pte_t pte;
+ unsigned long pfn;
+ struct page *page;
+ unsigned char dummy;
+
+ ptep = lookup_address((unsigned long)v, &level);
+ BUG_ON(ptep == NULL);
+
+ pfn = pte_pfn(*ptep);
+ page = pfn_to_page(pfn);
+
+ pte = pfn_pte(pfn, prot);
+
+ /*
+ * Careful: update_va_mapping() will fail if the virtual address
+ * we're poking isn't populated in the page tables. We don't
+ * need to worry about the direct map (that's always in the page
+ * tables), but we need to be careful about vmap space. In
+ * particular, the top level page table can lazily propagate
+ * entries between processes, so if we've switched mms since we
+ * vmapped the target in the first place, we might not have the
+ * top-level page table entry populated.
+ *
+ * We disable preemption because we want the same mm active when
+ * we probe the target and when we issue the hypercall. We'll
+ * have the same nominal mm, but if we're a kernel thread, lazy
+ * mm dropping could change our pgd.
+ *
+ * Out of an abundance of caution, this uses __get_user() to fault
+ * in the target address just in case there's some obscure case
+ * in which the target address isn't readable.
+ */
+
+ preempt_disable();
+
+ probe_kernel_read(&dummy, v, 1);
+
+ if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
+ BUG();
+
+ if (!PageHighMem(page)) {
+ void *av = __va(PFN_PHYS(pfn));
+
+ if (av != v)
+ if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
+ BUG();
+ } else
+ kmap_flush_unused();
+
+ preempt_enable();
+}
+
+static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
+{
+ const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
+ int i;
+
+ /*
+ * We need to mark the all aliases of the LDT pages RO. We
+ * don't need to call vm_flush_aliases(), though, since that's
+ * only responsible for flushing aliases out the TLBs, not the
+ * page tables, and Xen will flush the TLB for us if needed.
+ *
+ * To avoid confusing future readers: none of this is necessary
+ * to load the LDT. The hypervisor only checks this when the
+ * LDT is faulted in due to subsequent descriptor access.
+ */
+
+ for (i = 0; i < entries; i += entries_per_page)
+ set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
+}
+
+static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
+{
+ const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
+ int i;
+
+ for (i = 0; i < entries; i += entries_per_page)
+ set_aliased_prot(ldt + i, PAGE_KERNEL);
+}
+
+static void xen_set_ldt(const void *addr, unsigned entries)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+ trace_xen_cpu_set_ldt(addr, entries);
+
+ op = mcs.args;
+ op->cmd = MMUEXT_SET_LDT;
+ op->arg1.linear_addr = (unsigned long)addr;
+ op->arg2.nr_ents = entries;
+
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void xen_load_gdt(const struct desc_ptr *dtr)
+{
+ unsigned long va = dtr->address;
+ unsigned int size = dtr->size + 1;
+ unsigned long pfn, mfn;
+ int level;
+ pte_t *ptep;
+ void *virt;
+
+ /* @size should be at most GDT_SIZE which is smaller than PAGE_SIZE. */
+ BUG_ON(size > PAGE_SIZE);
+ BUG_ON(va & ~PAGE_MASK);
+
+ /*
+ * The GDT is per-cpu and is in the percpu data area.
+ * That can be virtually mapped, so we need to do a
+ * page-walk to get the underlying MFN for the
+ * hypercall. The page can also be in the kernel's
+ * linear range, so we need to RO that mapping too.
+ */
+ ptep = lookup_address(va, &level);
+ BUG_ON(ptep == NULL);
+
+ pfn = pte_pfn(*ptep);
+ mfn = pfn_to_mfn(pfn);
+ virt = __va(PFN_PHYS(pfn));
+
+ make_lowmem_page_readonly((void *)va);
+ make_lowmem_page_readonly(virt);
+
+ if (HYPERVISOR_set_gdt(&mfn, size / sizeof(struct desc_struct)))
+ BUG();
+}
+
+/*
+ * load_gdt for early boot, when the gdt is only mapped once
+ */
+static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
+{
+ unsigned long va = dtr->address;
+ unsigned int size = dtr->size + 1;
+ unsigned long pfn, mfn;
+ pte_t pte;
+
+ /* @size should be at most GDT_SIZE which is smaller than PAGE_SIZE. */
+ BUG_ON(size > PAGE_SIZE);
+ BUG_ON(va & ~PAGE_MASK);
+
+ pfn = virt_to_pfn(va);
+ mfn = pfn_to_mfn(pfn);
+
+ pte = pfn_pte(pfn, PAGE_KERNEL_RO);
+
+ if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
+ BUG();
+
+ if (HYPERVISOR_set_gdt(&mfn, size / sizeof(struct desc_struct)))
+ BUG();
+}
+
+static inline bool desc_equal(const struct desc_struct *d1,
+ const struct desc_struct *d2)
+{
+ return !memcmp(d1, d2, sizeof(*d1));
+}
+
+static void load_TLS_descriptor(struct thread_struct *t,
+ unsigned int cpu, unsigned int i)
+{
+ struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
+ struct desc_struct *gdt;
+ xmaddr_t maddr;
+ struct multicall_space mc;
+
+ if (desc_equal(shadow, &t->tls_array[i]))
+ return;
+
+ *shadow = t->tls_array[i];
+
+ gdt = get_cpu_gdt_rw(cpu);
+ maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+ mc = __xen_mc_entry(0);
+
+ MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
+}
+
+static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ /*
+ * XXX sleazy hack: If we're being called in a lazy-cpu zone
+ * and lazy gs handling is enabled, it means we're in a
+ * context switch, and %gs has just been saved. This means we
+ * can zero it out to prevent faults on exit from the
+ * hypervisor if the next process has no %gs. Either way, it
+ * has been saved, and the new value will get loaded properly.
+ * This will go away as soon as Xen has been modified to not
+ * save/restore %gs for normal hypercalls.
+ *
+ * On x86_64, this hack is not used for %gs, because gs points
+ * to KERNEL_GS_BASE (and uses it for PDA references), so we
+ * must not zero %gs on x86_64
+ *
+ * For x86_64, we need to zero %fs, otherwise we may get an
+ * exception between the new %fs descriptor being loaded and
+ * %fs being effectively cleared at __switch_to().
+ */
+ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
+#ifdef CONFIG_X86_32
+ lazy_load_gs(0);
+#else
+ loadsegment(fs, 0);
+#endif
+ }
+
+ xen_mc_batch();
+
+ load_TLS_descriptor(t, cpu, 0);
+ load_TLS_descriptor(t, cpu, 1);
+ load_TLS_descriptor(t, cpu, 2);
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+#ifdef CONFIG_X86_64
+static void xen_load_gs_index(unsigned int idx)
+{
+ if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
+ BUG();
+}
+#endif
+
+static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
+ const void *ptr)
+{
+ xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
+ u64 entry = *(u64 *)ptr;
+
+ trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);
+
+ preempt_disable();
+
+ xen_mc_flush();
+ if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
+ BUG();
+
+ preempt_enable();
+}
+
+#ifdef CONFIG_X86_64
+struct trap_array_entry {
+ void (*orig)(void);
+ void (*xen)(void);
+ bool ist_okay;
+};
+
+static struct trap_array_entry trap_array[] = {
+ { debug, xen_xendebug, true },
+ { double_fault, xen_double_fault, true },
+#ifdef CONFIG_X86_MCE
+ { machine_check, xen_machine_check, true },
+#endif
+ { nmi, xen_xennmi, true },
+ { int3, xen_int3, false },
+ { overflow, xen_overflow, false },
+#ifdef CONFIG_IA32_EMULATION
+ { entry_INT80_compat, xen_entry_INT80_compat, false },
+#endif
+ { page_fault, xen_page_fault, false },
+ { divide_error, xen_divide_error, false },
+ { bounds, xen_bounds, false },
+ { invalid_op, xen_invalid_op, false },
+ { device_not_available, xen_device_not_available, false },
+ { coprocessor_segment_overrun, xen_coprocessor_segment_overrun, false },
+ { invalid_TSS, xen_invalid_TSS, false },
+ { segment_not_present, xen_segment_not_present, false },
+ { stack_segment, xen_stack_segment, false },
+ { general_protection, xen_general_protection, false },
+ { spurious_interrupt_bug, xen_spurious_interrupt_bug, false },
+ { coprocessor_error, xen_coprocessor_error, false },
+ { alignment_check, xen_alignment_check, false },
+ { simd_coprocessor_error, xen_simd_coprocessor_error, false },
+};
+
+static bool __ref get_trap_addr(void **addr, unsigned int ist)
+{
+ unsigned int nr;
+ bool ist_okay = false;
+
+ /*
+ * Replace trap handler addresses by Xen specific ones.
+ * Check for known traps using IST and whitelist them.
+ * The debugger ones are the only ones we care about.
+ * Xen will handle faults like double_fault, * so we should never see
+ * them. Warn if there's an unexpected IST-using fault handler.
+ */
+ for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) {
+ struct trap_array_entry *entry = trap_array + nr;
+
+ if (*addr == entry->orig) {
+ *addr = entry->xen;
+ ist_okay = entry->ist_okay;
+ break;
+ }
+ }
+
+ if (nr == ARRAY_SIZE(trap_array) &&
+ *addr >= (void *)early_idt_handler_array[0] &&
+ *addr < (void *)early_idt_handler_array[NUM_EXCEPTION_VECTORS]) {
+ nr = (*addr - (void *)early_idt_handler_array[0]) /
+ EARLY_IDT_HANDLER_SIZE;
+ *addr = (void *)xen_early_idt_handler_array[nr];
+ }
+
+ if (WARN_ON(ist != 0 && !ist_okay))
+ return false;
+
+ return true;
+}
+#endif
+
+static int cvt_gate_to_trap(int vector, const gate_desc *val,
+ struct trap_info *info)
+{
+ unsigned long addr;
+
+ if (val->bits.type != GATE_TRAP && val->bits.type != GATE_INTERRUPT)
+ return 0;
+
+ info->vector = vector;
+
+ addr = gate_offset(val);
+#ifdef CONFIG_X86_64
+ if (!get_trap_addr((void **)&addr, val->bits.ist))
+ return 0;
+#endif /* CONFIG_X86_64 */
+ info->address = addr;
+
+ info->cs = gate_segment(val);
+ info->flags = val->bits.dpl;
+ /* interrupt gates clear IF */
+ if (val->bits.type == GATE_INTERRUPT)
+ info->flags |= 1 << 2;
+
+ return 1;
+}
+
+/* Locations of each CPU's IDT */
+static DEFINE_PER_CPU(struct desc_ptr, idt_desc);
+
+/* Set an IDT entry. If the entry is part of the current IDT, then
+ also update Xen. */
+static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
+{
+ unsigned long p = (unsigned long)&dt[entrynum];
+ unsigned long start, end;
+
+ trace_xen_cpu_write_idt_entry(dt, entrynum, g);
+
+ preempt_disable();
+
+ start = __this_cpu_read(idt_desc.address);
+ end = start + __this_cpu_read(idt_desc.size) + 1;
+
+ xen_mc_flush();
+
+ native_write_idt_entry(dt, entrynum, g);
+
+ if (p >= start && (p + 8) <= end) {
+ struct trap_info info[2];
+
+ info[1].address = 0;
+
+ if (cvt_gate_to_trap(entrynum, g, &info[0]))
+ if (HYPERVISOR_set_trap_table(info))
+ BUG();
+ }
+
+ preempt_enable();
+}
+
+static unsigned xen_convert_trap_info(const struct desc_ptr *desc,
+ struct trap_info *traps, bool full)
+{
+ unsigned in, out, count;
+
+ count = (desc->size+1) / sizeof(gate_desc);
+ BUG_ON(count > 256);
+
+ for (in = out = 0; in < count; in++) {
+ gate_desc *entry = (gate_desc *)(desc->address) + in;
+
+ if (cvt_gate_to_trap(in, entry, &traps[out]) || full)
+ out++;
+ }
+
+ return out;
+}
+
+void xen_copy_trap_info(struct trap_info *traps)
+{
+ const struct desc_ptr *desc = this_cpu_ptr(&idt_desc);
+
+ xen_convert_trap_info(desc, traps, true);
+}
+
+/* Load a new IDT into Xen. In principle this can be per-CPU, so we
+ hold a spinlock to protect the static traps[] array (static because
+ it avoids allocation, and saves stack space). */
+static void xen_load_idt(const struct desc_ptr *desc)
+{
+ static DEFINE_SPINLOCK(lock);
+ static struct trap_info traps[257];
+ unsigned out;
+
+ trace_xen_cpu_load_idt(desc);
+
+ spin_lock(&lock);
+
+ memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc));
+
+ out = xen_convert_trap_info(desc, traps, false);
+ memset(&traps[out], 0, sizeof(traps[0]));
+
+ xen_mc_flush();
+ if (HYPERVISOR_set_trap_table(traps))
+ BUG();
+
+ spin_unlock(&lock);
+}
+
+/* Write a GDT descriptor entry. Ignore LDT descriptors, since
+ they're handled differently. */
+static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
+ const void *desc, int type)
+{
+ trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
+
+ preempt_disable();
+
+ switch (type) {
+ case DESC_LDT:
+ case DESC_TSS:
+ /* ignore */
+ break;
+
+ default: {
+ xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
+
+ xen_mc_flush();
+ if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
+ BUG();
+ }
+
+ }
+
+ preempt_enable();
+}
+
+/*
+ * Version of write_gdt_entry for use at early boot-time needed to
+ * update an entry as simply as possible.
+ */
+static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
+ const void *desc, int type)
+{
+ trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
+
+ switch (type) {
+ case DESC_LDT:
+ case DESC_TSS:
+ /* ignore */
+ break;
+
+ default: {
+ xmaddr_t maddr = virt_to_machine(&dt[entry]);
+
+ if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
+ dt[entry] = *(struct desc_struct *)desc;
+ }
+
+ }
+}
+
+static void xen_load_sp0(unsigned long sp0)
+{
+ struct multicall_space mcs;
+
+ mcs = xen_mc_entry(0);
+ MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+ this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
+}
+
+void xen_set_iopl_mask(unsigned mask)
+{
+ struct physdev_set_iopl set_iopl;
+
+ /* Force the change at ring 0. */
+ set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
+ HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+}
+
+static void xen_io_delay(void)
+{
+}
+
+static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
+
+static unsigned long xen_read_cr0(void)
+{
+ unsigned long cr0 = this_cpu_read(xen_cr0_value);
+
+ if (unlikely(cr0 == 0)) {
+ cr0 = native_read_cr0();
+ this_cpu_write(xen_cr0_value, cr0);
+ }
+
+ return cr0;
+}
+
+static void xen_write_cr0(unsigned long cr0)
+{
+ struct multicall_space mcs;
+
+ this_cpu_write(xen_cr0_value, cr0);
+
+ /* Only pay attention to cr0.TS; everything else is
+ ignored. */
+ mcs = xen_mc_entry(0);
+
+ MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void xen_write_cr4(unsigned long cr4)
+{
+ cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE);
+
+ native_write_cr4(cr4);
+}
+#ifdef CONFIG_X86_64
+static inline unsigned long xen_read_cr8(void)
+{
+ return 0;
+}
+static inline void xen_write_cr8(unsigned long val)
+{
+ BUG_ON(val);
+}
+#endif
+
+static u64 xen_read_msr_safe(unsigned int msr, int *err)
+{
+ u64 val;
+
+ if (pmu_msr_read(msr, &val, err))
+ return val;
+
+ val = native_read_msr_safe(msr, err);
+ switch (msr) {
+ case MSR_IA32_APICBASE:
+ val &= ~X2APIC_ENABLE;
+ break;
+ }
+ return val;
+}
+
+static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
+{
+ int ret;
+#ifdef CONFIG_X86_64
+ unsigned int which;
+ u64 base;
+#endif
+
+ ret = 0;
+
+ switch (msr) {
+#ifdef CONFIG_X86_64
+ case MSR_FS_BASE: which = SEGBASE_FS; goto set;
+ case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
+ case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
+
+ set:
+ base = ((u64)high << 32) | low;
+ if (HYPERVISOR_set_segment_base(which, base) != 0)
+ ret = -EIO;
+ break;
+#endif
+
+ case MSR_STAR:
+ case MSR_CSTAR:
+ case MSR_LSTAR:
+ case MSR_SYSCALL_MASK:
+ case MSR_IA32_SYSENTER_CS:
+ case MSR_IA32_SYSENTER_ESP:
+ case MSR_IA32_SYSENTER_EIP:
+ /* Fast syscall setup is all done in hypercalls, so
+ these are all ignored. Stub them out here to stop
+ Xen console noise. */
+ break;
+
+ default:
+ if (!pmu_msr_write(msr, low, high, &ret))
+ ret = native_write_msr_safe(msr, low, high);
+ }
+
+ return ret;
+}
+
+static u64 xen_read_msr(unsigned int msr)
+{
+ /*
+ * This will silently swallow a #GP from RDMSR. It may be worth
+ * changing that.
+ */
+ int err;
+
+ return xen_read_msr_safe(msr, &err);
+}
+
+static void xen_write_msr(unsigned int msr, unsigned low, unsigned high)
+{
+ /*
+ * This will silently swallow a #GP from WRMSR. It may be worth
+ * changing that.
+ */
+ xen_write_msr_safe(msr, low, high);
+}
+
+/* This is called once we have the cpu_possible_mask */
+void __init xen_setup_vcpu_info_placement(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ /* Set up direct vCPU id mapping for PV guests. */
+ per_cpu(xen_vcpu_id, cpu) = cpu;
+
+ /*
+ * xen_vcpu_setup(cpu) can fail -- in which case it
+ * falls back to the shared_info version for cpus
+ * where xen_vcpu_nr(cpu) < MAX_VIRT_CPUS.
+ *
+ * xen_cpu_up_prepare_pv() handles the rest by failing
+ * them in hotplug.
+ */
+ (void) xen_vcpu_setup(cpu);
+ }
+
+ /*
+ * xen_vcpu_setup managed to place the vcpu_info within the
+ * percpu area for all cpus, so make use of it.
+ */
+ if (xen_have_vcpu_info_placement) {
+ pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
+ pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
+ pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
+ pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
+ pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
+ }
+}
+
+static const struct pv_info xen_info __initconst = {
+ .shared_kernel_pmd = 0,
+
+#ifdef CONFIG_X86_64
+ .extra_user_64bit_cs = FLAT_USER_CS64,
+#endif
+ .name = "Xen",
+};
+
+static const struct pv_cpu_ops xen_cpu_ops __initconst = {
+ .cpuid = xen_cpuid,
+
+ .set_debugreg = xen_set_debugreg,
+ .get_debugreg = xen_get_debugreg,
+
+ .read_cr0 = xen_read_cr0,
+ .write_cr0 = xen_write_cr0,
+
+ .write_cr4 = xen_write_cr4,
+
+#ifdef CONFIG_X86_64
+ .read_cr8 = xen_read_cr8,
+ .write_cr8 = xen_write_cr8,
+#endif
+
+ .wbinvd = native_wbinvd,
+
+ .read_msr = xen_read_msr,
+ .write_msr = xen_write_msr,
+
+ .read_msr_safe = xen_read_msr_safe,
+ .write_msr_safe = xen_write_msr_safe,
+
+ .read_pmc = xen_read_pmc,
+
+ .iret = xen_iret,
+#ifdef CONFIG_X86_64
+ .usergs_sysret64 = xen_sysret64,
+#endif
+
+ .load_tr_desc = paravirt_nop,
+ .set_ldt = xen_set_ldt,
+ .load_gdt = xen_load_gdt,
+ .load_idt = xen_load_idt,
+ .load_tls = xen_load_tls,
+#ifdef CONFIG_X86_64
+ .load_gs_index = xen_load_gs_index,
+#endif
+
+ .alloc_ldt = xen_alloc_ldt,
+ .free_ldt = xen_free_ldt,
+
+ .store_tr = xen_store_tr,
+
+ .write_ldt_entry = xen_write_ldt_entry,
+ .write_gdt_entry = xen_write_gdt_entry,
+ .write_idt_entry = xen_write_idt_entry,
+ .load_sp0 = xen_load_sp0,
+
+ .set_iopl_mask = xen_set_iopl_mask,
+ .io_delay = xen_io_delay,
+
+ /* Xen takes care of %gs when switching to usermode for us */
+ .swapgs = paravirt_nop,
+
+ .start_context_switch = paravirt_start_context_switch,
+ .end_context_switch = xen_end_context_switch,
+};
+
+static void xen_restart(char *msg)
+{
+ xen_reboot(SHUTDOWN_reboot);
+}
+
+static void xen_machine_halt(void)
+{
+ xen_reboot(SHUTDOWN_poweroff);
+}
+
+static void xen_machine_power_off(void)
+{
+ if (pm_power_off)
+ pm_power_off();
+ xen_reboot(SHUTDOWN_poweroff);
+}
+
+static void xen_crash_shutdown(struct pt_regs *regs)
+{
+ xen_reboot(SHUTDOWN_crash);
+}
+
+static const struct machine_ops xen_machine_ops __initconst = {
+ .restart = xen_restart,
+ .halt = xen_machine_halt,
+ .power_off = xen_machine_power_off,
+ .shutdown = xen_machine_halt,
+ .crash_shutdown = xen_crash_shutdown,
+ .emergency_restart = xen_emergency_restart,
+};
+
+static unsigned char xen_get_nmi_reason(void)
+{
+ unsigned char reason = 0;
+
+ /* Construct a value which looks like it came from port 0x61. */
+ if (test_bit(_XEN_NMIREASON_io_error,
+ &HYPERVISOR_shared_info->arch.nmi_reason))
+ reason |= NMI_REASON_IOCHK;
+ if (test_bit(_XEN_NMIREASON_pci_serr,
+ &HYPERVISOR_shared_info->arch.nmi_reason))
+ reason |= NMI_REASON_SERR;
+
+ return reason;
+}
+
+static void __init xen_boot_params_init_edd(void)
+{
+#if IS_ENABLED(CONFIG_EDD)
+ struct xen_platform_op op;
+ struct edd_info *edd_info;
+ u32 *mbr_signature;
+ unsigned nr;
+ int ret;
+
+ edd_info = boot_params.eddbuf;
+ mbr_signature = boot_params.edd_mbr_sig_buffer;
+
+ op.cmd = XENPF_firmware_info;
+
+ op.u.firmware_info.type = XEN_FW_DISK_INFO;
+ for (nr = 0; nr < EDDMAXNR; nr++) {
+ struct edd_info *info = edd_info + nr;
+
+ op.u.firmware_info.index = nr;
+ info->params.length = sizeof(info->params);
+ set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params,
+ &info->params);
+ ret = HYPERVISOR_platform_op(&op);
+ if (ret)
+ break;
+
+#define C(x) info->x = op.u.firmware_info.u.disk_info.x
+ C(device);
+ C(version);
+ C(interface_support);
+ C(legacy_max_cylinder);
+ C(legacy_max_head);
+ C(legacy_sectors_per_track);
+#undef C
+ }
+ boot_params.eddbuf_entries = nr;
+
+ op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE;
+ for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) {
+ op.u.firmware_info.index = nr;
+ ret = HYPERVISOR_platform_op(&op);
+ if (ret)
+ break;
+ mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
+ }
+ boot_params.edd_mbr_sig_buf_entries = nr;
+#endif
+}
+
+/*
+ * Set up the GDT and segment registers for -fstack-protector. Until
+ * we do this, we have to be careful not to call any stack-protected
+ * function, which is most of the kernel.
+ */
+static void __init xen_setup_gdt(int cpu)
+{
+ pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
+ pv_cpu_ops.load_gdt = xen_load_gdt_boot;
+
+ setup_stack_canary_segment(cpu);
+ switch_to_new_gdt(cpu);
+
+ pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry;
+ pv_cpu_ops.load_gdt = xen_load_gdt;
+}
+
+static void __init xen_dom0_set_legacy_features(void)
+{
+ x86_platform.legacy.rtc = 1;
+}
+
+static void __init xen_domu_set_legacy_features(void)
+{
+ x86_platform.legacy.rtc = 0;
+}
+
+/* First C function to be called on Xen boot */
+asmlinkage __visible void __init xen_start_kernel(void)
+{
+ struct physdev_set_iopl set_iopl;
+ unsigned long initrd_start = 0;
+ int rc;
+
+ if (!xen_start_info)
+ return;
+
+ xen_domain_type = XEN_PV_DOMAIN;
+ xen_start_flags = xen_start_info->flags;
+
+ xen_setup_features();
+
+ /* Install Xen paravirt ops */
+ pv_info = xen_info;
+ pv_init_ops.patch = paravirt_patch_default;
+ pv_cpu_ops = xen_cpu_ops;
+ xen_init_irq_ops();
+
+ /*
+ * Setup xen_vcpu early because it is needed for
+ * local_irq_disable(), irqs_disabled(), e.g. in printk().
+ *
+ * Don't do the full vcpu_info placement stuff until we have
+ * the cpu_possible_mask and a non-dummy shared_info.
+ */
+ xen_vcpu_info_reset(0);
+
+ x86_platform.get_nmi_reason = xen_get_nmi_reason;
+
+ x86_init.resources.memory_setup = xen_memory_setup;
+ x86_init.irqs.intr_mode_init = x86_init_noop;
+ x86_init.oem.arch_setup = xen_arch_setup;
+ x86_init.oem.banner = xen_banner;
+ x86_init.hyper.init_platform = xen_pv_init_platform;
+ x86_init.hyper.guest_late_init = xen_pv_guest_late_init;
+
+ /*
+ * Set up some pagetable state before starting to set any ptes.
+ */
+
+ xen_setup_machphys_mapping();
+ xen_init_mmu_ops();
+
+ /* Prevent unwanted bits from being set in PTEs. */
+ __supported_pte_mask &= ~_PAGE_GLOBAL;
+ __default_kernel_pte_mask &= ~_PAGE_GLOBAL;
+
+ /*
+ * Prevent page tables from being allocated in highmem, even
+ * if CONFIG_HIGHPTE is enabled.
+ */
+ __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
+
+ /* Get mfn list */
+ xen_build_dynamic_phys_to_machine();
+
+ /*
+ * Set up kernel GDT and segment registers, mainly so that
+ * -fstack-protector code can be executed.
+ */
+ xen_setup_gdt(0);
+
+ /* Work out if we support NX */
+ get_cpu_cap(&boot_cpu_data);
+ x86_configure_nx();
+
+ /* Determine virtual and physical address sizes */
+ get_cpu_address_sizes(&boot_cpu_data);
+
+ /* Let's presume PV guests always boot on vCPU with id 0. */
+ per_cpu(xen_vcpu_id, 0) = 0;
+
+ idt_setup_early_handler();
+
+ xen_init_capabilities();
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ /*
+ * set up the basic apic ops.
+ */
+ xen_init_apic();
+#endif
+
+ if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
+ pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
+ pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
+ }
+
+ machine_ops = xen_machine_ops;
+
+ /*
+ * The only reliable way to retain the initial address of the
+ * percpu gdt_page is to remember it here, so we can go and
+ * mark it RW later, when the initial percpu area is freed.
+ */
+ xen_initial_gdt = &per_cpu(gdt_page, 0);
+
+ xen_smp_init();
+
+#ifdef CONFIG_ACPI_NUMA
+ /*
+ * The pages we from Xen are not related to machine pages, so
+ * any NUMA information the kernel tries to get from ACPI will
+ * be meaningless. Prevent it from trying.
+ */
+ acpi_numa = -1;
+#endif
+ WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv));
+
+ local_irq_disable();
+ early_boot_irqs_disabled = true;
+
+ xen_raw_console_write("mapping kernel into physical memory\n");
+ xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
+ xen_start_info->nr_pages);
+ xen_reserve_special_pages();
+
+ /* keep using Xen gdt for now; no urgent need to change it */
+
+#ifdef CONFIG_X86_32
+ pv_info.kernel_rpl = 1;
+ if (xen_feature(XENFEAT_supervisor_mode_kernel))
+ pv_info.kernel_rpl = 0;
+#else
+ pv_info.kernel_rpl = 0;
+#endif
+ /* set the limit of our address space */
+ xen_reserve_top();
+
+ /*
+ * We used to do this in xen_arch_setup, but that is too late
+ * on AMD were early_cpu_init (run before ->arch_setup()) calls
+ * early_amd_init which pokes 0xcf8 port.
+ */
+ set_iopl.iopl = 1;
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+ if (rc != 0)
+ xen_raw_printk("physdev_op failed %d\n", rc);
+
+#ifdef CONFIG_X86_32
+ /* set up basic CPUID stuff */
+ cpu_detect(&new_cpu_data);
+ set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
+ new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
+#endif
+
+ if (xen_start_info->mod_start) {
+ if (xen_start_info->flags & SIF_MOD_START_PFN)
+ initrd_start = PFN_PHYS(xen_start_info->mod_start);
+ else
+ initrd_start = __pa(xen_start_info->mod_start);
+ }
+
+ /* Poke various useful things into boot_params */
+ boot_params.hdr.type_of_loader = (9 << 4) | 0;
+ boot_params.hdr.ramdisk_image = initrd_start;
+ boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
+ boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
+ boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN;
+
+ if (!xen_initial_domain()) {
+ add_preferred_console("xenboot", 0, NULL);
+ if (pci_xen)
+ x86_init.pci.arch_init = pci_xen_init;
+ x86_platform.set_legacy_features =
+ xen_domu_set_legacy_features;
+ } else {
+ const struct dom0_vga_console_info *info =
+ (void *)((char *)xen_start_info +
+ xen_start_info->console.dom0.info_off);
+ struct xen_platform_op op = {
+ .cmd = XENPF_firmware_info,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS,
+ };
+
+ x86_platform.set_legacy_features =
+ xen_dom0_set_legacy_features;
+ xen_init_vga(info, xen_start_info->console.dom0.info_size);
+ xen_start_info->console.domU.mfn = 0;
+ xen_start_info->console.domU.evtchn = 0;
+
+ if (HYPERVISOR_platform_op(&op) == 0)
+ boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;
+
+ /* Make sure ACS will be enabled */
+ pci_request_acs();
+
+ xen_acpi_sleep_register();
+
+ /* Avoid searching for BIOS MP tables */
+ x86_init.mpparse.find_smp_config = x86_init_noop;
+ x86_init.mpparse.get_smp_config = x86_init_uint_noop;
+
+ xen_boot_params_init_edd();
+
+#ifdef CONFIG_ACPI
+ /*
+ * Disable selecting "Firmware First mode" for correctable
+ * memory errors, as this is the duty of the hypervisor to
+ * decide.
+ */
+ acpi_disable_cmcff = 1;
+#endif
+ }
+
+ if (!boot_params.screen_info.orig_video_isVGA)
+ add_preferred_console("tty", 0, NULL);
+ add_preferred_console("hvc", 0, NULL);
+ if (boot_params.screen_info.orig_video_isVGA)
+ add_preferred_console("tty", 0, NULL);
+
+#ifdef CONFIG_PCI
+ /* PCI BIOS service won't work from a PV guest. */
+ pci_probe &= ~PCI_PROBE_BIOS;
+#endif
+ xen_raw_console_write("about to get started...\n");
+
+ /* We need this for printk timestamps */
+ xen_setup_runstate_info(0);
+
+ xen_efi_init(&boot_params);
+
+ /* Start the world */
+#ifdef CONFIG_X86_32
+ i386_start_kernel();
+#else
+ cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */
+ x86_64_start_reservations((char *)__pa_symbol(&boot_params));
+#endif
+}
+
+static int xen_cpu_up_prepare_pv(unsigned int cpu)
+{
+ int rc;
+
+ if (per_cpu(xen_vcpu, cpu) == NULL)
+ return -ENODEV;
+
+ xen_setup_timer(cpu);
+
+ rc = xen_smp_intr_init(cpu);
+ if (rc) {
+ WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n",
+ cpu, rc);
+ return rc;
+ }
+
+ rc = xen_smp_intr_init_pv(cpu);
+ if (rc) {
+ WARN(1, "xen_smp_intr_init_pv() for CPU %d failed: %d\n",
+ cpu, rc);
+ return rc;
+ }
+
+ return 0;
+}
+
+static int xen_cpu_dead_pv(unsigned int cpu)
+{
+ xen_smp_intr_free(cpu);
+ xen_smp_intr_free_pv(cpu);
+
+ xen_teardown_timer(cpu);
+
+ return 0;
+}
+
+static uint32_t __init xen_platform_pv(void)
+{
+ if (xen_pv_domain())
+ return xen_cpuid_base();
+
+ return 0;
+}
+
+const __initconst struct hypervisor_x86 x86_hyper_xen_pv = {
+ .name = "Xen PV",
+ .detect = xen_platform_pv,
+ .type = X86_HYPER_XEN_PV,
+ .runtime.pin_vcpu = xen_pin_vcpu,
+};
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
new file mode 100644
index 000000000..f04d22bcf
--- /dev/null
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/acpi.h>
+
+#include <xen/hvc-console.h>
+
+#include <asm/io_apic.h>
+#include <asm/hypervisor.h>
+#include <asm/e820/api.h>
+#include <asm/x86_init.h>
+
+#include <asm/xen/interface.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/interface/memory.h>
+#include <xen/interface/hvm/start_info.h>
+
+#include "xen-ops.h"
+
+/*
+ * PVH variables.
+ *
+ * xen_pvh pvh_bootparams and pvh_start_info need to live in data segment
+ * since they are used after startup_{32|64}, which clear .bss, are invoked.
+ */
+bool xen_pvh __attribute__((section(".data"))) = 0;
+struct boot_params pvh_bootparams __attribute__((section(".data")));
+struct hvm_start_info pvh_start_info __attribute__((section(".data")));
+
+unsigned int pvh_start_info_sz = sizeof(pvh_start_info);
+
+static u64 pvh_get_root_pointer(void)
+{
+ return pvh_start_info.rsdp_paddr;
+}
+
+static void __init init_pvh_bootparams(void)
+{
+ struct xen_memory_map memmap;
+ int rc;
+
+ memset(&pvh_bootparams, 0, sizeof(pvh_bootparams));
+
+ memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table);
+ set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table);
+ rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+ if (rc) {
+ xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
+ BUG();
+ }
+ pvh_bootparams.e820_entries = memmap.nr_entries;
+
+ if (pvh_bootparams.e820_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) {
+ pvh_bootparams.e820_table[pvh_bootparams.e820_entries].addr =
+ ISA_START_ADDRESS;
+ pvh_bootparams.e820_table[pvh_bootparams.e820_entries].size =
+ ISA_END_ADDRESS - ISA_START_ADDRESS;
+ pvh_bootparams.e820_table[pvh_bootparams.e820_entries].type =
+ E820_TYPE_RESERVED;
+ pvh_bootparams.e820_entries++;
+ } else
+ xen_raw_printk("Warning: Can fit ISA range into e820\n");
+
+ pvh_bootparams.hdr.cmd_line_ptr =
+ pvh_start_info.cmdline_paddr;
+
+ /* The first module is always ramdisk. */
+ if (pvh_start_info.nr_modules) {
+ struct hvm_modlist_entry *modaddr =
+ __va(pvh_start_info.modlist_paddr);
+ pvh_bootparams.hdr.ramdisk_image = modaddr->paddr;
+ pvh_bootparams.hdr.ramdisk_size = modaddr->size;
+ }
+
+ /*
+ * See Documentation/x86/boot.txt.
+ *
+ * Version 2.12 supports Xen entry point but we will use default x86/PC
+ * environment (i.e. hardware_subarch 0).
+ */
+ pvh_bootparams.hdr.version = (2 << 8) | 12;
+ pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */
+
+ x86_init.acpi.get_root_pointer = pvh_get_root_pointer;
+
+ xen_efi_init(&pvh_bootparams);
+}
+
+/*
+ * This routine (and those that it might call) should not use
+ * anything that lives in .bss since that segment will be cleared later.
+ */
+void __init xen_prepare_pvh(void)
+{
+ u32 msr;
+ u64 pfn;
+
+ if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) {
+ xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
+ pvh_start_info.magic);
+ BUG();
+ }
+
+ xen_pvh = 1;
+ xen_domain_type = XEN_HVM_DOMAIN;
+ xen_start_flags = pvh_start_info.flags;
+
+ msr = cpuid_ebx(xen_cpuid_base() + 2);
+ pfn = __pa(hypercall_page);
+ wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+ init_pvh_bootparams();
+}
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
new file mode 100644
index 000000000..92ccc7181
--- /dev/null
+++ b/arch/x86/xen/grant-table.c
@@ -0,0 +1,180 @@
+/******************************************************************************
+ * grant_table.c
+ * x86 specific part
+ *
+ * Granting foreign access to our memory reservation.
+ *
+ * Copyright (c) 2005-2006, Christopher Clark
+ * Copyright (c) 2004-2005, K A Fraser
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ * VA Linux Systems Japan. Split out x86 specific part.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <xen/interface/xen.h>
+#include <xen/page.h>
+#include <xen/grant_table.h>
+#include <xen/xen.h>
+
+#include <asm/pgtable.h>
+
+static struct gnttab_vm_area {
+ struct vm_struct *area;
+ pte_t **ptes;
+} gnttab_shared_vm_area, gnttab_status_vm_area;
+
+int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
+ unsigned long max_nr_gframes,
+ void **__shared)
+{
+ void *shared = *__shared;
+ unsigned long addr;
+ unsigned long i;
+
+ if (shared == NULL)
+ *__shared = shared = gnttab_shared_vm_area.area->addr;
+
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i],
+ mfn_pte(frames[i], PAGE_KERNEL));
+ addr += PAGE_SIZE;
+ }
+
+ return 0;
+}
+
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+ unsigned long max_nr_gframes,
+ grant_status_t **__shared)
+{
+ grant_status_t *shared = *__shared;
+ unsigned long addr;
+ unsigned long i;
+
+ if (shared == NULL)
+ *__shared = shared = gnttab_status_vm_area.area->addr;
+
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, gnttab_status_vm_area.ptes[i],
+ mfn_pte(frames[i], PAGE_KERNEL));
+ addr += PAGE_SIZE;
+ }
+
+ return 0;
+}
+
+void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
+{
+ pte_t **ptes;
+ unsigned long addr;
+ unsigned long i;
+
+ if (shared == gnttab_status_vm_area.area->addr)
+ ptes = gnttab_status_vm_area.ptes;
+ else
+ ptes = gnttab_shared_vm_area.ptes;
+
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, ptes[i], __pte(0));
+ addr += PAGE_SIZE;
+ }
+}
+
+static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames)
+{
+ area->ptes = kmalloc_array(nr_frames, sizeof(*area->ptes), GFP_KERNEL);
+ if (area->ptes == NULL)
+ return -ENOMEM;
+
+ area->area = alloc_vm_area(PAGE_SIZE * nr_frames, area->ptes);
+ if (area->area == NULL) {
+ kfree(area->ptes);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void arch_gnttab_vfree(struct gnttab_vm_area *area)
+{
+ free_vm_area(area->area);
+ kfree(area->ptes);
+}
+
+int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status)
+{
+ int ret;
+
+ if (!xen_pv_domain())
+ return 0;
+
+ ret = arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Always allocate the space for the status frames in case
+ * we're migrated to a host with V2 support.
+ */
+ ret = arch_gnttab_valloc(&gnttab_status_vm_area, nr_status);
+ if (ret < 0)
+ goto err;
+
+ return 0;
+err:
+ arch_gnttab_vfree(&gnttab_shared_vm_area);
+ return -ENOMEM;
+}
+
+#ifdef CONFIG_XEN_PVH
+#include <xen/events.h>
+#include <xen/xen-ops.h>
+static int __init xen_pvh_gnttab_setup(void)
+{
+ if (!xen_pvh_domain())
+ return -ENODEV;
+
+ xen_auto_xlat_grant_frames.count = gnttab_max_grant_frames();
+
+ return xen_xlate_map_ballooned_pages(&xen_auto_xlat_grant_frames.pfn,
+ &xen_auto_xlat_grant_frames.vaddr,
+ xen_auto_xlat_grant_frames.count);
+}
+/* Call it _before_ __gnttab_init as we need to initialize the
+ * xen_auto_xlat_grant_frames first. */
+core_initcall(xen_pvh_gnttab_setup);
+#endif
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
new file mode 100644
index 000000000..7515a19fd
--- /dev/null
+++ b/arch/x86/xen/irq.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/hardirq.h>
+
+#include <asm/x86_init.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/sched.h>
+#include <xen/interface/vcpu.h>
+#include <xen/features.h>
+#include <xen/events.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include "xen-ops.h"
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ */
+void xen_force_evtchn_callback(void)
+{
+ (void)HYPERVISOR_xen_version(0, NULL);
+}
+
+asmlinkage __visible unsigned long xen_save_fl(void)
+{
+ struct vcpu_info *vcpu;
+ unsigned long flags;
+
+ vcpu = this_cpu_read(xen_vcpu);
+
+ /* flag has opposite sense of mask */
+ flags = !vcpu->evtchn_upcall_mask;
+
+ /* convert to IF type flag
+ -0 -> 0x00000000
+ -1 -> 0xffffffff
+ */
+ return (-flags) & X86_EFLAGS_IF;
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
+
+__visible void xen_restore_fl(unsigned long flags)
+{
+ struct vcpu_info *vcpu;
+
+ /* convert from IF type flag */
+ flags = !(flags & X86_EFLAGS_IF);
+
+ /* See xen_irq_enable() for why preemption must be disabled. */
+ preempt_disable();
+ vcpu = this_cpu_read(xen_vcpu);
+ vcpu->evtchn_upcall_mask = flags;
+
+ if (flags == 0) {
+ barrier(); /* unmask then check (avoid races) */
+ if (unlikely(vcpu->evtchn_upcall_pending))
+ xen_force_evtchn_callback();
+ preempt_enable();
+ } else
+ preempt_enable_no_resched();
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
+
+asmlinkage __visible void xen_irq_disable(void)
+{
+ /* There's a one instruction preempt window here. We need to
+ make sure we're don't switch CPUs between getting the vcpu
+ pointer and updating the mask. */
+ preempt_disable();
+ this_cpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
+ preempt_enable_no_resched();
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
+
+asmlinkage __visible void xen_irq_enable(void)
+{
+ struct vcpu_info *vcpu;
+
+ /*
+ * We may be preempted as soon as vcpu->evtchn_upcall_mask is
+ * cleared, so disable preemption to ensure we check for
+ * events on the VCPU we are still running on.
+ */
+ preempt_disable();
+
+ vcpu = this_cpu_read(xen_vcpu);
+ vcpu->evtchn_upcall_mask = 0;
+
+ /* Doesn't matter if we get preempted here, because any
+ pending event will get dealt with anyway. */
+
+ barrier(); /* unmask then check (avoid races) */
+ if (unlikely(vcpu->evtchn_upcall_pending))
+ xen_force_evtchn_callback();
+
+ preempt_enable();
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
+
+static void xen_safe_halt(void)
+{
+ /* Blocking includes an implicit local_irq_enable(). */
+ if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
+ BUG();
+}
+
+static void xen_halt(void)
+{
+ if (irqs_disabled())
+ HYPERVISOR_vcpu_op(VCPUOP_down,
+ xen_vcpu_nr(smp_processor_id()), NULL);
+ else
+ xen_safe_halt();
+}
+
+static const struct pv_irq_ops xen_irq_ops __initconst = {
+ .save_fl = PV_CALLEE_SAVE(xen_save_fl),
+ .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
+ .irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
+ .irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
+
+ .safe_halt = xen_safe_halt,
+ .halt = xen_halt,
+};
+
+void __init xen_init_irq_ops(void)
+{
+ pv_irq_ops = xen_irq_ops;
+ x86_init.irqs.intr_init = xen_init_IRQ;
+}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
new file mode 100644
index 000000000..96fc2f0fd
--- /dev/null
+++ b/arch/x86/xen/mmu.c
@@ -0,0 +1,237 @@
+#include <linux/pfn.h>
+#include <asm/xen/page.h>
+#include <asm/xen/hypercall.h>
+#include <xen/interface/memory.h>
+
+#include "multicalls.h"
+#include "mmu.h"
+
+/*
+ * Protects atomic reservation decrease/increase against concurrent increases.
+ * Also protects non-atomic updates of current_pages and balloon lists.
+ */
+DEFINE_SPINLOCK(xen_reservation_lock);
+
+unsigned long arbitrary_virt_to_mfn(void *vaddr)
+{
+ xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
+
+ return PFN_DOWN(maddr.maddr);
+}
+
+xmaddr_t arbitrary_virt_to_machine(void *vaddr)
+{
+ unsigned long address = (unsigned long)vaddr;
+ unsigned int level;
+ pte_t *pte;
+ unsigned offset;
+
+ /*
+ * if the PFN is in the linear mapped vaddr range, we can just use
+ * the (quick) virt_to_machine() p2m lookup
+ */
+ if (virt_addr_valid(vaddr))
+ return virt_to_machine(vaddr);
+
+ /* otherwise we have to do a (slower) full page-table walk */
+
+ pte = lookup_address(address, &level);
+ BUG_ON(pte == NULL);
+ offset = address & ~PAGE_MASK;
+ return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
+}
+EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
+
+static noinline void xen_flush_tlb_all(void)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+
+ preempt_disable();
+
+ mcs = xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_TLB_FLUSH_ALL;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+#define REMAP_BATCH_SIZE 16
+
+struct remap_data {
+ xen_pfn_t *pfn;
+ bool contiguous;
+ bool no_translate;
+ pgprot_t prot;
+ struct mmu_update *mmu_update;
+};
+
+static int remap_area_pfn_pte_fn(pte_t *ptep, pgtable_t token,
+ unsigned long addr, void *data)
+{
+ struct remap_data *rmd = data;
+ pte_t pte = pte_mkspecial(mfn_pte(*rmd->pfn, rmd->prot));
+
+ /*
+ * If we have a contiguous range, just update the pfn itself,
+ * else update pointer to be "next pfn".
+ */
+ if (rmd->contiguous)
+ (*rmd->pfn)++;
+ else
+ rmd->pfn++;
+
+ rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
+ rmd->mmu_update->ptr |= rmd->no_translate ?
+ MMU_PT_UPDATE_NO_TRANSLATE :
+ MMU_NORMAL_PT_UPDATE;
+ rmd->mmu_update->val = pte_val_ma(pte);
+ rmd->mmu_update++;
+
+ return 0;
+}
+
+static int do_remap_pfn(struct vm_area_struct *vma,
+ unsigned long addr,
+ xen_pfn_t *pfn, int nr,
+ int *err_ptr, pgprot_t prot,
+ unsigned int domid,
+ bool no_translate,
+ struct page **pages)
+{
+ int err = 0;
+ struct remap_data rmd;
+ struct mmu_update mmu_update[REMAP_BATCH_SIZE];
+ unsigned long range;
+ int mapped = 0;
+
+ BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
+
+ rmd.pfn = pfn;
+ rmd.prot = prot;
+ /*
+ * We use the err_ptr to indicate if there we are doing a contiguous
+ * mapping or a discontigious mapping.
+ */
+ rmd.contiguous = !err_ptr;
+ rmd.no_translate = no_translate;
+
+ while (nr) {
+ int index = 0;
+ int done = 0;
+ int batch = min(REMAP_BATCH_SIZE, nr);
+ int batch_left = batch;
+ range = (unsigned long)batch << PAGE_SHIFT;
+
+ rmd.mmu_update = mmu_update;
+ err = apply_to_page_range(vma->vm_mm, addr, range,
+ remap_area_pfn_pte_fn, &rmd);
+ if (err)
+ goto out;
+
+ /* We record the error for each page that gives an error, but
+ * continue mapping until the whole set is done */
+ do {
+ int i;
+
+ err = HYPERVISOR_mmu_update(&mmu_update[index],
+ batch_left, &done, domid);
+
+ /*
+ * @err_ptr may be the same buffer as @gfn, so
+ * only clear it after each chunk of @gfn is
+ * used.
+ */
+ if (err_ptr) {
+ for (i = index; i < index + done; i++)
+ err_ptr[i] = 0;
+ }
+ if (err < 0) {
+ if (!err_ptr)
+ goto out;
+ err_ptr[i] = err;
+ done++; /* Skip failed frame. */
+ } else
+ mapped += done;
+ batch_left -= done;
+ index += done;
+ } while (batch_left);
+
+ nr -= batch;
+ addr += range;
+ if (err_ptr)
+ err_ptr += batch;
+ cond_resched();
+ }
+out:
+
+ xen_flush_tlb_all();
+
+ return err < 0 ? err : mapped;
+}
+
+int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
+ unsigned long addr,
+ xen_pfn_t gfn, int nr,
+ pgprot_t prot, unsigned domid,
+ struct page **pages)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return -EOPNOTSUPP;
+
+ return do_remap_pfn(vma, addr, &gfn, nr, NULL, prot, domid, false,
+ pages);
+}
+EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range);
+
+int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
+ unsigned long addr,
+ xen_pfn_t *gfn, int nr,
+ int *err_ptr, pgprot_t prot,
+ unsigned domid, struct page **pages)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr,
+ prot, domid, pages);
+
+ /* We BUG_ON because it's a programmer error to pass a NULL err_ptr,
+ * and the consequences later is quite hard to detect what the actual
+ * cause of "wrong memory was mapped in".
+ */
+ BUG_ON(err_ptr == NULL);
+ return do_remap_pfn(vma, addr, gfn, nr, err_ptr, prot, domid,
+ false, pages);
+}
+EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array);
+
+int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
+ unsigned long addr,
+ xen_pfn_t *mfn, int nr,
+ int *err_ptr, pgprot_t prot,
+ unsigned int domid, struct page **pages)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return -EOPNOTSUPP;
+
+ return do_remap_pfn(vma, addr, mfn, nr, err_ptr, prot, domid,
+ true, pages);
+}
+EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
+
+/* Returns: 0 success */
+int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
+ int nr, struct page **pages)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return xen_xlate_unmap_gfn_range(vma, nr, pages);
+
+ if (!pages)
+ return 0;
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range);
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
new file mode 100644
index 000000000..a7e47cf7e
--- /dev/null
+++ b/arch/x86/xen/mmu.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _XEN_MMU_H
+
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+enum pt_level {
+ PT_PGD,
+ PT_P4D,
+ PT_PUD,
+ PT_PMD,
+ PT_PTE
+};
+
+
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+
+void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
+
+pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte);
+
+unsigned long xen_read_cr2_direct(void);
+
+extern void xen_init_mmu_ops(void);
+extern void xen_hvm_init_mmu_ops(void);
+#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c
new file mode 100644
index 000000000..dd2ad82ee
--- /dev/null
+++ b/arch/x86/xen/mmu_hvm.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <linux/crash_dump.h>
+
+#include <xen/interface/xen.h>
+#include <xen/hvm.h>
+
+#include "mmu.h"
+
+#ifdef CONFIG_PROC_VMCORE
+/*
+ * This function is used in two contexts:
+ * - the kdump kernel has to check whether a pfn of the crashed kernel
+ * was a ballooned page. vmcore is using this function to decide
+ * whether to access a pfn of the crashed kernel.
+ * - the kexec kernel has to check whether a pfn was ballooned by the
+ * previous kernel. If the pfn is ballooned, handle it properly.
+ * Returns 0 if the pfn is not backed by a RAM page, the caller may
+ * handle the pfn special in this case.
+ */
+static int xen_oldmem_pfn_is_ram(unsigned long pfn)
+{
+ struct xen_hvm_get_mem_type a = {
+ .domid = DOMID_SELF,
+ .pfn = pfn,
+ };
+ int ram;
+
+ if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a))
+ return -ENXIO;
+
+ switch (a.mem_type) {
+ case HVMMEM_mmio_dm:
+ ram = 0;
+ break;
+ case HVMMEM_ram_rw:
+ case HVMMEM_ram_ro:
+ default:
+ ram = 1;
+ break;
+ }
+
+ return ram;
+}
+#endif
+
+static void xen_hvm_exit_mmap(struct mm_struct *mm)
+{
+ struct xen_hvm_pagetable_dying a;
+ int rc;
+
+ a.domid = DOMID_SELF;
+ a.gpa = __pa(mm->pgd);
+ rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
+ WARN_ON_ONCE(rc < 0);
+}
+
+static int is_pagetable_dying_supported(void)
+{
+ struct xen_hvm_pagetable_dying a;
+ int rc = 0;
+
+ a.domid = DOMID_SELF;
+ a.gpa = 0x00;
+ rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
+ if (rc < 0) {
+ printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
+ return 0;
+ }
+ return 1;
+}
+
+void __init xen_hvm_init_mmu_ops(void)
+{
+ if (is_pagetable_dying_supported())
+ pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
+#ifdef CONFIG_PROC_VMCORE
+ WARN_ON(register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram));
+#endif
+}
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
new file mode 100644
index 000000000..73aa0b89a
--- /dev/null
+++ b/arch/x86/xen/mmu_pv.c
@@ -0,0 +1,2677 @@
+/*
+ * Xen mmu operations
+ *
+ * This file contains the various mmu fetch and update operations.
+ * The most important job they must perform is the mapping between the
+ * domain's pfn and the overall machine mfns.
+ *
+ * Xen allows guests to directly update the pagetable, in a controlled
+ * fashion. In other words, the guest modifies the same pagetable
+ * that the CPU actually uses, which eliminates the overhead of having
+ * a separate shadow pagetable.
+ *
+ * In order to allow this, it falls on the guest domain to map its
+ * notion of a "physical" pfn - which is just a domain-local linear
+ * address - into a real "machine address" which the CPU's MMU can
+ * use.
+ *
+ * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
+ * inserted directly into the pagetable. When creating a new
+ * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
+ * when reading the content back with __(pgd|pmd|pte)_val, it converts
+ * the mfn back into a pfn.
+ *
+ * The other constraint is that all pages which make up a pagetable
+ * must be mapped read-only in the guest. This prevents uncontrolled
+ * guest updates to the pagetable. Xen strictly enforces this, and
+ * will disallow any pagetable update which will end up mapping a
+ * pagetable page RW, and will disallow using any writable page as a
+ * pagetable.
+ *
+ * Naively, when loading %cr3 with the base of a new pagetable, Xen
+ * would need to validate the whole pagetable before going on.
+ * Naturally, this is quite slow. The solution is to "pin" a
+ * pagetable, which enforces all the constraints on the pagetable even
+ * when it is not actively in use. This menas that Xen can be assured
+ * that it is still valid when you do load it into %cr3, and doesn't
+ * need to revalidate it.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+#include <linux/sched/mm.h>
+#include <linux/highmem.h>
+#include <linux/debugfs.h>
+#include <linux/bug.h>
+#include <linux/vmalloc.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <linux/memblock.h>
+#include <linux/seq_file.h>
+#include <linux/crash_dump.h>
+#ifdef CONFIG_KEXEC_CORE
+#include <linux/kexec.h>
+#endif
+
+#include <trace/events/xen.h>
+
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/fixmap.h>
+#include <asm/mmu_context.h>
+#include <asm/setup.h>
+#include <asm/paravirt.h>
+#include <asm/e820/api.h>
+#include <asm/linkage.h>
+#include <asm/page.h>
+#include <asm/init.h>
+#include <asm/pat.h>
+#include <asm/smp.h>
+#include <asm/tlb.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
+#include <xen/page.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/hvm/hvm_op.h>
+#include <xen/interface/version.h>
+#include <xen/interface/memory.h>
+#include <xen/hvc-console.h>
+
+#include "multicalls.h"
+#include "mmu.h"
+#include "debugfs.h"
+
+#ifdef CONFIG_X86_32
+/*
+ * Identity map, in addition to plain kernel map. This needs to be
+ * large enough to allocate page table pages to allocate the rest.
+ * Each page can map 2MB.
+ */
+#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4)
+static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
+#endif
+#ifdef CONFIG_X86_64
+/* l3 pud for userspace vsyscall mapping */
+static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Note about cr3 (pagetable base) values:
+ *
+ * xen_cr3 contains the current logical cr3 value; it contains the
+ * last set cr3. This may not be the current effective cr3, because
+ * its update may be being lazily deferred. However, a vcpu looking
+ * at its own cr3 can use this value knowing that it everything will
+ * be self-consistent.
+ *
+ * xen_current_cr3 contains the actual vcpu cr3; it is set once the
+ * hypercall to set the vcpu cr3 is complete (so it may be a little
+ * out of date, but it will never be set early). If one vcpu is
+ * looking at another vcpu's cr3 value, it should use this variable.
+ */
+DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
+DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
+
+static phys_addr_t xen_pt_base, xen_pt_size __initdata;
+
+static DEFINE_STATIC_KEY_FALSE(xen_struct_pages_ready);
+
+/*
+ * Just beyond the highest usermode address. STACK_TOP_MAX has a
+ * redzone above it, so round it up to a PGD boundary.
+ */
+#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
+
+void make_lowmem_page_readonly(void *vaddr)
+{
+ pte_t *pte, ptev;
+ unsigned long address = (unsigned long)vaddr;
+ unsigned int level;
+
+ pte = lookup_address(address, &level);
+ if (pte == NULL)
+ return; /* vaddr missing */
+
+ ptev = pte_wrprotect(*pte);
+
+ if (HYPERVISOR_update_va_mapping(address, ptev, 0))
+ BUG();
+}
+
+void make_lowmem_page_readwrite(void *vaddr)
+{
+ pte_t *pte, ptev;
+ unsigned long address = (unsigned long)vaddr;
+ unsigned int level;
+
+ pte = lookup_address(address, &level);
+ if (pte == NULL)
+ return; /* vaddr missing */
+
+ ptev = pte_mkwrite(*pte);
+
+ if (HYPERVISOR_update_va_mapping(address, ptev, 0))
+ BUG();
+}
+
+
+/*
+ * During early boot all page table pages are pinned, but we do not have struct
+ * pages, so return true until struct pages are ready.
+ */
+static bool xen_page_pinned(void *ptr)
+{
+ if (static_branch_likely(&xen_struct_pages_ready)) {
+ struct page *page = virt_to_page(ptr);
+
+ return PagePinned(page);
+ }
+ return true;
+}
+
+static void xen_extend_mmu_update(const struct mmu_update *update)
+{
+ struct multicall_space mcs;
+ struct mmu_update *u;
+
+ mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
+
+ if (mcs.mc != NULL) {
+ mcs.mc->args[1]++;
+ } else {
+ mcs = __xen_mc_entry(sizeof(*u));
+ MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
+ }
+
+ u = mcs.args;
+ *u = *update;
+}
+
+static void xen_extend_mmuext_op(const struct mmuext_op *op)
+{
+ struct multicall_space mcs;
+ struct mmuext_op *u;
+
+ mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
+
+ if (mcs.mc != NULL) {
+ mcs.mc->args[1]++;
+ } else {
+ mcs = __xen_mc_entry(sizeof(*u));
+ MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
+ }
+
+ u = mcs.args;
+ *u = *op;
+}
+
+static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
+{
+ struct mmu_update u;
+
+ preempt_disable();
+
+ xen_mc_batch();
+
+ /* ptr may be ioremapped for 64-bit pagetable setup */
+ u.ptr = arbitrary_virt_to_machine(ptr).maddr;
+ u.val = pmd_val_ma(val);
+ xen_extend_mmu_update(&u);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+static void xen_set_pmd(pmd_t *ptr, pmd_t val)
+{
+ trace_xen_mmu_set_pmd(ptr, val);
+
+ /* If page is not pinned, we can just update the entry
+ directly */
+ if (!xen_page_pinned(ptr)) {
+ *ptr = val;
+ return;
+ }
+
+ xen_set_pmd_hyper(ptr, val);
+}
+
+/*
+ * Associate a virtual page frame with a given physical page frame
+ * and protection flags for that frame.
+ */
+void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
+{
+ set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
+}
+
+static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
+{
+ struct mmu_update u;
+
+ if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
+ return false;
+
+ xen_mc_batch();
+
+ u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
+ u.val = pte_val_ma(pteval);
+ xen_extend_mmu_update(&u);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ return true;
+}
+
+static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
+{
+ if (!xen_batched_set_pte(ptep, pteval)) {
+ /*
+ * Could call native_set_pte() here and trap and
+ * emulate the PTE write but with 32-bit guests this
+ * needs two traps (one for each of the two 32-bit
+ * words in the PTE) so do one hypercall directly
+ * instead.
+ */
+ struct mmu_update u;
+
+ u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
+ u.val = pte_val_ma(pteval);
+ HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
+ }
+}
+
+static void xen_set_pte(pte_t *ptep, pte_t pteval)
+{
+ trace_xen_mmu_set_pte(ptep, pteval);
+ __xen_set_pte(ptep, pteval);
+}
+
+static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval)
+{
+ trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
+ __xen_set_pte(ptep, pteval);
+}
+
+pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ /* Just return the pte as-is. We preserve the bits on commit */
+ trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
+ return *ptep;
+}
+
+void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ struct mmu_update u;
+
+ trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
+ xen_mc_batch();
+
+ u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
+ u.val = pte_val_ma(pte);
+ xen_extend_mmu_update(&u);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+/* Assume pteval_t is equivalent to all the other *val_t types. */
+static pteval_t pte_mfn_to_pfn(pteval_t val)
+{
+ if (val & _PAGE_PRESENT) {
+ unsigned long mfn = (val & XEN_PTE_MFN_MASK) >> PAGE_SHIFT;
+ unsigned long pfn = mfn_to_pfn(mfn);
+
+ pteval_t flags = val & PTE_FLAGS_MASK;
+ if (unlikely(pfn == ~0))
+ val = flags & ~_PAGE_PRESENT;
+ else
+ val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
+ }
+
+ return val;
+}
+
+static pteval_t pte_pfn_to_mfn(pteval_t val)
+{
+ if (val & _PAGE_PRESENT) {
+ unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
+ pteval_t flags = val & PTE_FLAGS_MASK;
+ unsigned long mfn;
+
+ mfn = __pfn_to_mfn(pfn);
+
+ /*
+ * If there's no mfn for the pfn, then just create an
+ * empty non-present pte. Unfortunately this loses
+ * information about the original pfn, so
+ * pte_mfn_to_pfn is asymmetric.
+ */
+ if (unlikely(mfn == INVALID_P2M_ENTRY)) {
+ mfn = 0;
+ flags = 0;
+ } else
+ mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
+ val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
+ }
+
+ return val;
+}
+
+__visible pteval_t xen_pte_val(pte_t pte)
+{
+ pteval_t pteval = pte.pte;
+
+ return pte_mfn_to_pfn(pteval);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
+
+__visible pgdval_t xen_pgd_val(pgd_t pgd)
+{
+ return pte_mfn_to_pfn(pgd.pgd);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
+
+__visible pte_t xen_make_pte(pteval_t pte)
+{
+ pte = pte_pfn_to_mfn(pte);
+
+ return native_make_pte(pte);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
+
+__visible pgd_t xen_make_pgd(pgdval_t pgd)
+{
+ pgd = pte_pfn_to_mfn(pgd);
+ return native_make_pgd(pgd);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
+
+__visible pmdval_t xen_pmd_val(pmd_t pmd)
+{
+ return pte_mfn_to_pfn(pmd.pmd);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
+
+static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
+{
+ struct mmu_update u;
+
+ preempt_disable();
+
+ xen_mc_batch();
+
+ /* ptr may be ioremapped for 64-bit pagetable setup */
+ u.ptr = arbitrary_virt_to_machine(ptr).maddr;
+ u.val = pud_val_ma(val);
+ xen_extend_mmu_update(&u);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+static void xen_set_pud(pud_t *ptr, pud_t val)
+{
+ trace_xen_mmu_set_pud(ptr, val);
+
+ /* If page is not pinned, we can just update the entry
+ directly */
+ if (!xen_page_pinned(ptr)) {
+ *ptr = val;
+ return;
+ }
+
+ xen_set_pud_hyper(ptr, val);
+}
+
+#ifdef CONFIG_X86_PAE
+static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ trace_xen_mmu_set_pte_atomic(ptep, pte);
+ __xen_set_pte(ptep, pte);
+}
+
+static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ trace_xen_mmu_pte_clear(mm, addr, ptep);
+ __xen_set_pte(ptep, native_make_pte(0));
+}
+
+static void xen_pmd_clear(pmd_t *pmdp)
+{
+ trace_xen_mmu_pmd_clear(pmdp);
+ set_pmd(pmdp, __pmd(0));
+}
+#endif /* CONFIG_X86_PAE */
+
+__visible pmd_t xen_make_pmd(pmdval_t pmd)
+{
+ pmd = pte_pfn_to_mfn(pmd);
+ return native_make_pmd(pmd);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
+
+#ifdef CONFIG_X86_64
+__visible pudval_t xen_pud_val(pud_t pud)
+{
+ return pte_mfn_to_pfn(pud.pud);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
+
+__visible pud_t xen_make_pud(pudval_t pud)
+{
+ pud = pte_pfn_to_mfn(pud);
+
+ return native_make_pud(pud);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
+
+static pgd_t *xen_get_user_pgd(pgd_t *pgd)
+{
+ pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
+ unsigned offset = pgd - pgd_page;
+ pgd_t *user_ptr = NULL;
+
+ if (offset < pgd_index(USER_LIMIT)) {
+ struct page *page = virt_to_page(pgd_page);
+ user_ptr = (pgd_t *)page->private;
+ if (user_ptr)
+ user_ptr += offset;
+ }
+
+ return user_ptr;
+}
+
+static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
+{
+ struct mmu_update u;
+
+ u.ptr = virt_to_machine(ptr).maddr;
+ u.val = p4d_val_ma(val);
+ xen_extend_mmu_update(&u);
+}
+
+/*
+ * Raw hypercall-based set_p4d, intended for in early boot before
+ * there's a page structure. This implies:
+ * 1. The only existing pagetable is the kernel's
+ * 2. It is always pinned
+ * 3. It has no user pagetable attached to it
+ */
+static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
+{
+ preempt_disable();
+
+ xen_mc_batch();
+
+ __xen_set_p4d_hyper(ptr, val);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+static void xen_set_p4d(p4d_t *ptr, p4d_t val)
+{
+ pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr);
+ pgd_t pgd_val;
+
+ trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val);
+
+ /* If page is not pinned, we can just update the entry
+ directly */
+ if (!xen_page_pinned(ptr)) {
+ *ptr = val;
+ if (user_ptr) {
+ WARN_ON(xen_page_pinned(user_ptr));
+ pgd_val.pgd = p4d_val_ma(val);
+ *user_ptr = pgd_val;
+ }
+ return;
+ }
+
+ /* If it's pinned, then we can at least batch the kernel and
+ user updates together. */
+ xen_mc_batch();
+
+ __xen_set_p4d_hyper(ptr, val);
+ if (user_ptr)
+ __xen_set_p4d_hyper((p4d_t *)user_ptr, val);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+__visible p4dval_t xen_p4d_val(p4d_t p4d)
+{
+ return pte_mfn_to_pfn(p4d.p4d);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_p4d_val);
+
+__visible p4d_t xen_make_p4d(p4dval_t p4d)
+{
+ p4d = pte_pfn_to_mfn(p4d);
+
+ return native_make_p4d(p4d);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_p4d);
+#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
+#endif /* CONFIG_X86_64 */
+
+static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
+ int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
+ bool last, unsigned long limit)
+{
+ int i, nr, flush = 0;
+
+ nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD;
+ for (i = 0; i < nr; i++) {
+ if (!pmd_none(pmd[i]))
+ flush |= (*func)(mm, pmd_page(pmd[i]), PT_PTE);
+ }
+ return flush;
+}
+
+static int xen_pud_walk(struct mm_struct *mm, pud_t *pud,
+ int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
+ bool last, unsigned long limit)
+{
+ int i, nr, flush = 0;
+
+ nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD;
+ for (i = 0; i < nr; i++) {
+ pmd_t *pmd;
+
+ if (pud_none(pud[i]))
+ continue;
+
+ pmd = pmd_offset(&pud[i], 0);
+ if (PTRS_PER_PMD > 1)
+ flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
+ flush |= xen_pmd_walk(mm, pmd, func,
+ last && i == nr - 1, limit);
+ }
+ return flush;
+}
+
+static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
+ int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
+ bool last, unsigned long limit)
+{
+ int flush = 0;
+ pud_t *pud;
+
+
+ if (p4d_none(*p4d))
+ return flush;
+
+ pud = pud_offset(p4d, 0);
+ if (PTRS_PER_PUD > 1)
+ flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
+ flush |= xen_pud_walk(mm, pud, func, last, limit);
+ return flush;
+}
+
+/*
+ * (Yet another) pagetable walker. This one is intended for pinning a
+ * pagetable. This means that it walks a pagetable and calls the
+ * callback function on each page it finds making up the page table,
+ * at every level. It walks the entire pagetable, but it only bothers
+ * pinning pte pages which are below limit. In the normal case this
+ * will be STACK_TOP_MAX, but at boot we need to pin up to
+ * FIXADDR_TOP.
+ *
+ * For 32-bit the important bit is that we don't pin beyond there,
+ * because then we start getting into Xen's ptes.
+ *
+ * For 64-bit, we must skip the Xen hole in the middle of the address
+ * space, just after the big x86-64 virtual hole.
+ */
+static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
+ int (*func)(struct mm_struct *mm, struct page *,
+ enum pt_level),
+ unsigned long limit)
+{
+ int i, nr, flush = 0;
+ unsigned hole_low = 0, hole_high = 0;
+
+ /* The limit is the last byte to be touched */
+ limit--;
+ BUG_ON(limit >= FIXADDR_TOP);
+
+#ifdef CONFIG_X86_64
+ /*
+ * 64-bit has a great big hole in the middle of the address
+ * space, which contains the Xen mappings.
+ */
+ hole_low = pgd_index(GUARD_HOLE_BASE_ADDR);
+ hole_high = pgd_index(GUARD_HOLE_END_ADDR);
+#endif
+
+ nr = pgd_index(limit) + 1;
+ for (i = 0; i < nr; i++) {
+ p4d_t *p4d;
+
+ if (i >= hole_low && i < hole_high)
+ continue;
+
+ if (pgd_none(pgd[i]))
+ continue;
+
+ p4d = p4d_offset(&pgd[i], 0);
+ flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
+ }
+
+ /* Do the top level last, so that the callbacks can use it as
+ a cue to do final things like tlb flushes. */
+ flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
+
+ return flush;
+}
+
+static int xen_pgd_walk(struct mm_struct *mm,
+ int (*func)(struct mm_struct *mm, struct page *,
+ enum pt_level),
+ unsigned long limit)
+{
+ return __xen_pgd_walk(mm, mm->pgd, func, limit);
+}
+
+/* If we're using split pte locks, then take the page's lock and
+ return a pointer to it. Otherwise return NULL. */
+static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
+{
+ spinlock_t *ptl = NULL;
+
+#if USE_SPLIT_PTE_PTLOCKS
+ ptl = ptlock_ptr(page);
+ spin_lock_nest_lock(ptl, &mm->page_table_lock);
+#endif
+
+ return ptl;
+}
+
+static void xen_pte_unlock(void *v)
+{
+ spinlock_t *ptl = v;
+ spin_unlock(ptl);
+}
+
+static void xen_do_pin(unsigned level, unsigned long pfn)
+{
+ struct mmuext_op op;
+
+ op.cmd = level;
+ op.arg1.mfn = pfn_to_mfn(pfn);
+
+ xen_extend_mmuext_op(&op);
+}
+
+static int xen_pin_page(struct mm_struct *mm, struct page *page,
+ enum pt_level level)
+{
+ unsigned pgfl = TestSetPagePinned(page);
+ int flush;
+
+ if (pgfl)
+ flush = 0; /* already pinned */
+ else if (PageHighMem(page))
+ /* kmaps need flushing if we found an unpinned
+ highpage */
+ flush = 1;
+ else {
+ void *pt = lowmem_page_address(page);
+ unsigned long pfn = page_to_pfn(page);
+ struct multicall_space mcs = __xen_mc_entry(0);
+ spinlock_t *ptl;
+
+ flush = 0;
+
+ /*
+ * We need to hold the pagetable lock between the time
+ * we make the pagetable RO and when we actually pin
+ * it. If we don't, then other users may come in and
+ * attempt to update the pagetable by writing it,
+ * which will fail because the memory is RO but not
+ * pinned, so Xen won't do the trap'n'emulate.
+ *
+ * If we're using split pte locks, we can't hold the
+ * entire pagetable's worth of locks during the
+ * traverse, because we may wrap the preempt count (8
+ * bits). The solution is to mark RO and pin each PTE
+ * page while holding the lock. This means the number
+ * of locks we end up holding is never more than a
+ * batch size (~32 entries, at present).
+ *
+ * If we're not using split pte locks, we needn't pin
+ * the PTE pages independently, because we're
+ * protected by the overall pagetable lock.
+ */
+ ptl = NULL;
+ if (level == PT_PTE)
+ ptl = xen_pte_lock(page, mm);
+
+ MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
+ pfn_pte(pfn, PAGE_KERNEL_RO),
+ level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+ if (ptl) {
+ xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
+
+ /* Queue a deferred unlock for when this batch
+ is completed. */
+ xen_mc_callback(xen_pte_unlock, ptl);
+ }
+ }
+
+ return flush;
+}
+
+/* This is called just after a mm has been created, but it has not
+ been used yet. We need to make sure that its pagetable is all
+ read-only, and can be pinned. */
+static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
+{
+ trace_xen_mmu_pgd_pin(mm, pgd);
+
+ xen_mc_batch();
+
+ if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
+ /* re-enable interrupts for flushing */
+ xen_mc_issue(0);
+
+ kmap_flush_unused();
+
+ xen_mc_batch();
+ }
+
+#ifdef CONFIG_X86_64
+ {
+ pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+ xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
+
+ if (user_pgd) {
+ xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
+ xen_do_pin(MMUEXT_PIN_L4_TABLE,
+ PFN_DOWN(__pa(user_pgd)));
+ }
+ }
+#else /* CONFIG_X86_32 */
+#ifdef CONFIG_X86_PAE
+ /* Need to make sure unshared kernel PMD is pinnable */
+ xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
+ PT_PMD);
+#endif
+ xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
+#endif /* CONFIG_X86_64 */
+ xen_mc_issue(0);
+}
+
+static void xen_pgd_pin(struct mm_struct *mm)
+{
+ __xen_pgd_pin(mm, mm->pgd);
+}
+
+/*
+ * On save, we need to pin all pagetables to make sure they get their
+ * mfns turned into pfns. Search the list for any unpinned pgds and pin
+ * them (unpinned pgds are not currently in use, probably because the
+ * process is under construction or destruction).
+ *
+ * Expected to be called in stop_machine() ("equivalent to taking
+ * every spinlock in the system"), so the locking doesn't really
+ * matter all that much.
+ */
+void xen_mm_pin_all(void)
+{
+ struct page *page;
+
+ spin_lock(&pgd_lock);
+
+ list_for_each_entry(page, &pgd_list, lru) {
+ if (!PagePinned(page)) {
+ __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
+ SetPageSavePinned(page);
+ }
+ }
+
+ spin_unlock(&pgd_lock);
+}
+
+static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
+ enum pt_level level)
+{
+ SetPagePinned(page);
+ return 0;
+}
+
+/*
+ * The init_mm pagetable is really pinned as soon as its created, but
+ * that's before we have page structures to store the bits. So do all
+ * the book-keeping now once struct pages for allocated pages are
+ * initialized. This happens only after free_all_bootmem() is called.
+ */
+static void __init xen_after_bootmem(void)
+{
+ static_branch_enable(&xen_struct_pages_ready);
+#ifdef CONFIG_X86_64
+ SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
+ xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
+}
+
+static int xen_unpin_page(struct mm_struct *mm, struct page *page,
+ enum pt_level level)
+{
+ unsigned pgfl = TestClearPagePinned(page);
+
+ if (pgfl && !PageHighMem(page)) {
+ void *pt = lowmem_page_address(page);
+ unsigned long pfn = page_to_pfn(page);
+ spinlock_t *ptl = NULL;
+ struct multicall_space mcs;
+
+ /*
+ * Do the converse to pin_page. If we're using split
+ * pte locks, we must be holding the lock for while
+ * the pte page is unpinned but still RO to prevent
+ * concurrent updates from seeing it in this
+ * partially-pinned state.
+ */
+ if (level == PT_PTE) {
+ ptl = xen_pte_lock(page, mm);
+
+ if (ptl)
+ xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
+ }
+
+ mcs = __xen_mc_entry(0);
+
+ MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
+ pfn_pte(pfn, PAGE_KERNEL),
+ level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+ if (ptl) {
+ /* unlock when batch completed */
+ xen_mc_callback(xen_pte_unlock, ptl);
+ }
+ }
+
+ return 0; /* never need to flush on unpin */
+}
+
+/* Release a pagetables pages back as normal RW */
+static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
+{
+ trace_xen_mmu_pgd_unpin(mm, pgd);
+
+ xen_mc_batch();
+
+ xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+#ifdef CONFIG_X86_64
+ {
+ pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+ if (user_pgd) {
+ xen_do_pin(MMUEXT_UNPIN_TABLE,
+ PFN_DOWN(__pa(user_pgd)));
+ xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
+ }
+ }
+#endif
+
+#ifdef CONFIG_X86_PAE
+ /* Need to make sure unshared kernel PMD is unpinned */
+ xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
+ PT_PMD);
+#endif
+
+ __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
+
+ xen_mc_issue(0);
+}
+
+static void xen_pgd_unpin(struct mm_struct *mm)
+{
+ __xen_pgd_unpin(mm, mm->pgd);
+}
+
+/*
+ * On resume, undo any pinning done at save, so that the rest of the
+ * kernel doesn't see any unexpected pinned pagetables.
+ */
+void xen_mm_unpin_all(void)
+{
+ struct page *page;
+
+ spin_lock(&pgd_lock);
+
+ list_for_each_entry(page, &pgd_list, lru) {
+ if (PageSavePinned(page)) {
+ BUG_ON(!PagePinned(page));
+ __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
+ ClearPageSavePinned(page);
+ }
+ }
+
+ spin_unlock(&pgd_lock);
+}
+
+static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
+{
+ spin_lock(&next->page_table_lock);
+ xen_pgd_pin(next);
+ spin_unlock(&next->page_table_lock);
+}
+
+static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+ spin_lock(&mm->page_table_lock);
+ xen_pgd_pin(mm);
+ spin_unlock(&mm->page_table_lock);
+}
+
+static void drop_mm_ref_this_cpu(void *info)
+{
+ struct mm_struct *mm = info;
+
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm)
+ leave_mm(smp_processor_id());
+
+ /*
+ * If this cpu still has a stale cr3 reference, then make sure
+ * it has been flushed.
+ */
+ if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
+ xen_mc_flush();
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Another cpu may still have their %cr3 pointing at the pagetable, so
+ * we need to repoint it somewhere else before we can unpin it.
+ */
+static void xen_drop_mm_ref(struct mm_struct *mm)
+{
+ cpumask_var_t mask;
+ unsigned cpu;
+
+ drop_mm_ref_this_cpu(mm);
+
+ /* Get the "official" set of cpus referring to our pagetable. */
+ if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
+ for_each_online_cpu(cpu) {
+ if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
+ continue;
+ smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
+ }
+ return;
+ }
+
+ /*
+ * It's possible that a vcpu may have a stale reference to our
+ * cr3, because its in lazy mode, and it hasn't yet flushed
+ * its set of pending hypercalls yet. In this case, we can
+ * look at its actual current cr3 value, and force it to flush
+ * if needed.
+ */
+ cpumask_clear(mask);
+ for_each_online_cpu(cpu) {
+ if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
+ cpumask_set_cpu(cpu, mask);
+ }
+
+ smp_call_function_many(mask, drop_mm_ref_this_cpu, mm, 1);
+ free_cpumask_var(mask);
+}
+#else
+static void xen_drop_mm_ref(struct mm_struct *mm)
+{
+ drop_mm_ref_this_cpu(mm);
+}
+#endif
+
+/*
+ * While a process runs, Xen pins its pagetables, which means that the
+ * hypervisor forces it to be read-only, and it controls all updates
+ * to it. This means that all pagetable updates have to go via the
+ * hypervisor, which is moderately expensive.
+ *
+ * Since we're pulling the pagetable down, we switch to use init_mm,
+ * unpin old process pagetable and mark it all read-write, which
+ * allows further operations on it to be simple memory accesses.
+ *
+ * The only subtle point is that another CPU may be still using the
+ * pagetable because of lazy tlb flushing. This means we need need to
+ * switch all CPUs off this pagetable before we can unpin it.
+ */
+static void xen_exit_mmap(struct mm_struct *mm)
+{
+ get_cpu(); /* make sure we don't move around */
+ xen_drop_mm_ref(mm);
+ put_cpu();
+
+ spin_lock(&mm->page_table_lock);
+
+ /* pgd may not be pinned in the error exit path of execve */
+ if (xen_page_pinned(mm->pgd))
+ xen_pgd_unpin(mm);
+
+ spin_unlock(&mm->page_table_lock);
+}
+
+static void xen_post_allocator_init(void);
+
+static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+ struct mmuext_op op;
+
+ op.cmd = cmd;
+ op.arg1.mfn = pfn_to_mfn(pfn);
+ if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+ BUG();
+}
+
+#ifdef CONFIG_X86_64
+static void __init xen_cleanhighmap(unsigned long vaddr,
+ unsigned long vaddr_end)
+{
+ unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
+ pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr);
+
+ /* NOTE: The loop is more greedy than the cleanup_highmap variant.
+ * We include the PMD passed in on _both_ boundaries. */
+ for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PTRS_PER_PMD));
+ pmd++, vaddr += PMD_SIZE) {
+ if (pmd_none(*pmd))
+ continue;
+ if (vaddr < (unsigned long) _text || vaddr > kernel_end)
+ set_pmd(pmd, __pmd(0));
+ }
+ /* In case we did something silly, we should crash in this function
+ * instead of somewhere later and be confusing. */
+ xen_mc_flush();
+}
+
+/*
+ * Make a page range writeable and free it.
+ */
+static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
+{
+ void *vaddr = __va(paddr);
+ void *vaddr_end = vaddr + size;
+
+ for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
+ make_lowmem_page_readwrite(vaddr);
+
+ memblock_free(paddr, size);
+}
+
+static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
+{
+ unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
+
+ if (unpin)
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa));
+ ClearPagePinned(virt_to_page(__va(pa)));
+ xen_free_ro_pages(pa, PAGE_SIZE);
+}
+
+static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
+{
+ unsigned long pa;
+ pte_t *pte_tbl;
+ int i;
+
+ if (pmd_large(*pmd)) {
+ pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
+ xen_free_ro_pages(pa, PMD_SIZE);
+ return;
+ }
+
+ pte_tbl = pte_offset_kernel(pmd, 0);
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ if (pte_none(pte_tbl[i]))
+ continue;
+ pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT;
+ xen_free_ro_pages(pa, PAGE_SIZE);
+ }
+ set_pmd(pmd, __pmd(0));
+ xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin);
+}
+
+static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
+{
+ unsigned long pa;
+ pmd_t *pmd_tbl;
+ int i;
+
+ if (pud_large(*pud)) {
+ pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
+ xen_free_ro_pages(pa, PUD_SIZE);
+ return;
+ }
+
+ pmd_tbl = pmd_offset(pud, 0);
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (pmd_none(pmd_tbl[i]))
+ continue;
+ xen_cleanmfnmap_pmd(pmd_tbl + i, unpin);
+ }
+ set_pud(pud, __pud(0));
+ xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin);
+}
+
+static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
+{
+ unsigned long pa;
+ pud_t *pud_tbl;
+ int i;
+
+ if (p4d_large(*p4d)) {
+ pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
+ xen_free_ro_pages(pa, P4D_SIZE);
+ return;
+ }
+
+ pud_tbl = pud_offset(p4d, 0);
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ if (pud_none(pud_tbl[i]))
+ continue;
+ xen_cleanmfnmap_pud(pud_tbl + i, unpin);
+ }
+ set_p4d(p4d, __p4d(0));
+ xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin);
+}
+
+/*
+ * Since it is well isolated we can (and since it is perhaps large we should)
+ * also free the page tables mapping the initial P->M table.
+ */
+static void __init xen_cleanmfnmap(unsigned long vaddr)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ bool unpin;
+
+ unpin = (vaddr == 2 * PGDIR_SIZE);
+ vaddr &= PMD_MASK;
+ pgd = pgd_offset_k(vaddr);
+ p4d = p4d_offset(pgd, 0);
+ if (!p4d_none(*p4d))
+ xen_cleanmfnmap_p4d(p4d, unpin);
+}
+
+static void __init xen_pagetable_p2m_free(void)
+{
+ unsigned long size;
+ unsigned long addr;
+
+ size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+
+ /* No memory or already called. */
+ if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list)
+ return;
+
+ /* using __ka address and sticking INVALID_P2M_ENTRY! */
+ memset((void *)xen_start_info->mfn_list, 0xff, size);
+
+ addr = xen_start_info->mfn_list;
+ /*
+ * We could be in __ka space.
+ * We roundup to the PMD, which means that if anybody at this stage is
+ * using the __ka address of xen_start_info or
+ * xen_start_info->shared_info they are in going to crash. Fortunatly
+ * we have already revectored in xen_setup_kernel_pagetable.
+ */
+ size = roundup(size, PMD_SIZE);
+
+ if (addr >= __START_KERNEL_map) {
+ xen_cleanhighmap(addr, addr + size);
+ size = PAGE_ALIGN(xen_start_info->nr_pages *
+ sizeof(unsigned long));
+ memblock_free(__pa(addr), size);
+ } else {
+ xen_cleanmfnmap(addr);
+ }
+}
+
+static void __init xen_pagetable_cleanhighmap(void)
+{
+ unsigned long size;
+ unsigned long addr;
+
+ /* At this stage, cleanup_highmap has already cleaned __ka space
+ * from _brk_limit way up to the max_pfn_mapped (which is the end of
+ * the ramdisk). We continue on, erasing PMD entries that point to page
+ * tables - do note that they are accessible at this stage via __va.
+ * As Xen is aligning the memory end to a 4MB boundary, for good
+ * measure we also round up to PMD_SIZE * 2 - which means that if
+ * anybody is using __ka address to the initial boot-stack - and try
+ * to use it - they are going to crash. The xen_start_info has been
+ * taken care of already in xen_setup_kernel_pagetable. */
+ addr = xen_start_info->pt_base;
+ size = xen_start_info->nr_pt_frames * PAGE_SIZE;
+
+ xen_cleanhighmap(addr, roundup(addr + size, PMD_SIZE * 2));
+ xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
+}
+#endif
+
+static void __init xen_pagetable_p2m_setup(void)
+{
+ xen_vmalloc_p2m_tree();
+
+#ifdef CONFIG_X86_64
+ xen_pagetable_p2m_free();
+
+ xen_pagetable_cleanhighmap();
+#endif
+ /* And revector! Bye bye old array */
+ xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
+}
+
+static void __init xen_pagetable_init(void)
+{
+ paging_init();
+ xen_post_allocator_init();
+
+ xen_pagetable_p2m_setup();
+
+ /* Allocate and initialize top and mid mfn levels for p2m structure */
+ xen_build_mfn_list_list();
+
+ /* Remap memory freed due to conflicts with E820 map */
+ xen_remap_memory();
+ xen_setup_mfn_list_list();
+}
+static void xen_write_cr2(unsigned long cr2)
+{
+ this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
+}
+
+static unsigned long xen_read_cr2(void)
+{
+ return this_cpu_read(xen_vcpu)->arch.cr2;
+}
+
+unsigned long xen_read_cr2_direct(void)
+{
+ return this_cpu_read(xen_vcpu_info.arch.cr2);
+}
+
+static noinline void xen_flush_tlb(void)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+
+ preempt_disable();
+
+ mcs = xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+static void xen_flush_tlb_one_user(unsigned long addr)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+
+ trace_xen_mmu_flush_tlb_one_user(addr);
+
+ preempt_disable();
+
+ mcs = xen_mc_entry(sizeof(*op));
+ op = mcs.args;
+ op->cmd = MMUEXT_INVLPG_LOCAL;
+ op->arg1.linear_addr = addr & PAGE_MASK;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+static void xen_flush_tlb_others(const struct cpumask *cpus,
+ const struct flush_tlb_info *info)
+{
+ struct {
+ struct mmuext_op op;
+ DECLARE_BITMAP(mask, NR_CPUS);
+ } *args;
+ struct multicall_space mcs;
+ const size_t mc_entry_size = sizeof(args->op) +
+ sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus());
+
+ trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end);
+
+ if (cpumask_empty(cpus))
+ return; /* nothing to do */
+
+ mcs = xen_mc_entry(mc_entry_size);
+ args = mcs.args;
+ args->op.arg2.vcpumask = to_cpumask(args->mask);
+
+ /* Remove us, and any offline CPUS. */
+ cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
+
+ args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+ if (info->end != TLB_FLUSH_ALL &&
+ (info->end - info->start) <= PAGE_SIZE) {
+ args->op.cmd = MMUEXT_INVLPG_MULTI;
+ args->op.arg1.linear_addr = info->start;
+ }
+
+ MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static unsigned long xen_read_cr3(void)
+{
+ return this_cpu_read(xen_cr3);
+}
+
+static void set_current_cr3(void *v)
+{
+ this_cpu_write(xen_current_cr3, (unsigned long)v);
+}
+
+static void __xen_write_cr3(bool kernel, unsigned long cr3)
+{
+ struct mmuext_op op;
+ unsigned long mfn;
+
+ trace_xen_mmu_write_cr3(kernel, cr3);
+
+ if (cr3)
+ mfn = pfn_to_mfn(PFN_DOWN(cr3));
+ else
+ mfn = 0;
+
+ WARN_ON(mfn == 0 && kernel);
+
+ op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
+ op.arg1.mfn = mfn;
+
+ xen_extend_mmuext_op(&op);
+
+ if (kernel) {
+ this_cpu_write(xen_cr3, cr3);
+
+ /* Update xen_current_cr3 once the batch has actually
+ been submitted. */
+ xen_mc_callback(set_current_cr3, (void *)cr3);
+ }
+}
+static void xen_write_cr3(unsigned long cr3)
+{
+ BUG_ON(preemptible());
+
+ xen_mc_batch(); /* disables interrupts */
+
+ /* Update while interrupts are disabled, so its atomic with
+ respect to ipis */
+ this_cpu_write(xen_cr3, cr3);
+
+ __xen_write_cr3(true, cr3);
+
+#ifdef CONFIG_X86_64
+ {
+ pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
+ if (user_pgd)
+ __xen_write_cr3(false, __pa(user_pgd));
+ else
+ __xen_write_cr3(false, 0);
+ }
+#endif
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * At the start of the day - when Xen launches a guest, it has already
+ * built pagetables for the guest. We diligently look over them
+ * in xen_setup_kernel_pagetable and graft as appropriate them in the
+ * init_top_pgt and its friends. Then when we are happy we load
+ * the new init_top_pgt - and continue on.
+ *
+ * The generic code starts (start_kernel) and 'init_mem_mapping' sets
+ * up the rest of the pagetables. When it has completed it loads the cr3.
+ * N.B. that baremetal would start at 'start_kernel' (and the early
+ * #PF handler would create bootstrap pagetables) - so we are running
+ * with the same assumptions as what to do when write_cr3 is executed
+ * at this point.
+ *
+ * Since there are no user-page tables at all, we have two variants
+ * of xen_write_cr3 - the early bootup (this one), and the late one
+ * (xen_write_cr3). The reason we have to do that is that in 64-bit
+ * the Linux kernel and user-space are both in ring 3 while the
+ * hypervisor is in ring 0.
+ */
+static void __init xen_write_cr3_init(unsigned long cr3)
+{
+ BUG_ON(preemptible());
+
+ xen_mc_batch(); /* disables interrupts */
+
+ /* Update while interrupts are disabled, so its atomic with
+ respect to ipis */
+ this_cpu_write(xen_cr3, cr3);
+
+ __xen_write_cr3(true, cr3);
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
+}
+#endif
+
+static int xen_pgd_alloc(struct mm_struct *mm)
+{
+ pgd_t *pgd = mm->pgd;
+ int ret = 0;
+
+ BUG_ON(PagePinned(virt_to_page(pgd)));
+
+#ifdef CONFIG_X86_64
+ {
+ struct page *page = virt_to_page(pgd);
+ pgd_t *user_pgd;
+
+ BUG_ON(page->private != 0);
+
+ ret = -ENOMEM;
+
+ user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ page->private = (unsigned long)user_pgd;
+
+ if (user_pgd != NULL) {
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+ user_pgd[pgd_index(VSYSCALL_ADDR)] =
+ __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
+#endif
+ ret = 0;
+ }
+
+ BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
+ }
+#endif
+ return ret;
+}
+
+static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+ pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+ if (user_pgd)
+ free_page((unsigned long)user_pgd);
+#endif
+}
+
+/*
+ * Init-time set_pte while constructing initial pagetables, which
+ * doesn't allow RO page table pages to be remapped RW.
+ *
+ * If there is no MFN for this PFN then this page is initially
+ * ballooned out so clear the PTE (as in decrease_reservation() in
+ * drivers/xen/balloon.c).
+ *
+ * Many of these PTE updates are done on unpinned and writable pages
+ * and doing a hypercall for these is unnecessary and expensive. At
+ * this point it is not possible to tell if a page is pinned or not,
+ * so always write the PTE directly and rely on Xen trapping and
+ * emulating any updates as necessary.
+ */
+__visible pte_t xen_make_pte_init(pteval_t pte)
+{
+#ifdef CONFIG_X86_64
+ unsigned long pfn;
+
+ /*
+ * Pages belonging to the initial p2m list mapped outside the default
+ * address range must be mapped read-only. This region contains the
+ * page tables for mapping the p2m list, too, and page tables MUST be
+ * mapped read-only.
+ */
+ pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT;
+ if (xen_start_info->mfn_list < __START_KERNEL_map &&
+ pfn >= xen_start_info->first_p2m_pfn &&
+ pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
+ pte &= ~_PAGE_RW;
+#endif
+ pte = pte_pfn_to_mfn(pte);
+ return native_make_pte(pte);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init);
+
+static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
+{
+#ifdef CONFIG_X86_32
+ /* If there's an existing pte, then don't allow _PAGE_RW to be set */
+ if (pte_mfn(pte) != INVALID_P2M_ENTRY
+ && pte_val_ma(*ptep) & _PAGE_PRESENT)
+ pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
+ pte_val_ma(pte));
+#endif
+ __xen_set_pte(ptep, pte);
+}
+
+/* Early in boot, while setting up the initial pagetable, assume
+ everything is pinned. */
+static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
+{
+#ifdef CONFIG_FLATMEM
+ BUG_ON(mem_map); /* should only be used early */
+#endif
+ make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+}
+
+/* Used for pmd and pud */
+static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
+{
+#ifdef CONFIG_FLATMEM
+ BUG_ON(mem_map); /* should only be used early */
+#endif
+ make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+}
+
+/* Early release_pte assumes that all pts are pinned, since there's
+ only init_mm and anything attached to that is pinned. */
+static void __init xen_release_pte_init(unsigned long pfn)
+{
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
+ make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+}
+
+static void __init xen_release_pmd_init(unsigned long pfn)
+{
+ make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+}
+
+static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+ struct multicall_space mcs;
+ struct mmuext_op *op;
+
+ mcs = __xen_mc_entry(sizeof(*op));
+ op = mcs.args;
+ op->cmd = cmd;
+ op->arg1.mfn = pfn_to_mfn(pfn);
+
+ MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
+}
+
+static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
+{
+ struct multicall_space mcs;
+ unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
+
+ mcs = __xen_mc_entry(0);
+ MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
+ pfn_pte(pfn, prot), 0);
+}
+
+/* This needs to make sure the new pte page is pinned iff its being
+ attached to a pinned pagetable. */
+static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
+ unsigned level)
+{
+ bool pinned = xen_page_pinned(mm->pgd);
+
+ trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
+
+ if (pinned) {
+ struct page *page = pfn_to_page(pfn);
+
+ if (static_branch_likely(&xen_struct_pages_ready))
+ SetPagePinned(page);
+
+ if (!PageHighMem(page)) {
+ xen_mc_batch();
+
+ __set_pfn_prot(pfn, PAGE_KERNEL_RO);
+
+ if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
+ __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+ } else {
+ /* make sure there are no stray mappings of
+ this page */
+ kmap_flush_unused();
+ }
+ }
+}
+
+static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
+{
+ xen_alloc_ptpage(mm, pfn, PT_PTE);
+}
+
+static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
+{
+ xen_alloc_ptpage(mm, pfn, PT_PMD);
+}
+
+/* This should never happen until we're OK to use struct page */
+static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
+{
+ struct page *page = pfn_to_page(pfn);
+ bool pinned = PagePinned(page);
+
+ trace_xen_mmu_release_ptpage(pfn, level, pinned);
+
+ if (pinned) {
+ if (!PageHighMem(page)) {
+ xen_mc_batch();
+
+ if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
+ __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
+
+ __set_pfn_prot(pfn, PAGE_KERNEL);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+ }
+ ClearPagePinned(page);
+ }
+}
+
+static void xen_release_pte(unsigned long pfn)
+{
+ xen_release_ptpage(pfn, PT_PTE);
+}
+
+static void xen_release_pmd(unsigned long pfn)
+{
+ xen_release_ptpage(pfn, PT_PMD);
+}
+
+#ifdef CONFIG_X86_64
+static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
+{
+ xen_alloc_ptpage(mm, pfn, PT_PUD);
+}
+
+static void xen_release_pud(unsigned long pfn)
+{
+ xen_release_ptpage(pfn, PT_PUD);
+}
+#endif
+
+void __init xen_reserve_top(void)
+{
+#ifdef CONFIG_X86_32
+ unsigned long top = HYPERVISOR_VIRT_START;
+ struct xen_platform_parameters pp;
+
+ if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
+ top = pp.virt_start;
+
+ reserve_top_address(-top);
+#endif /* CONFIG_X86_32 */
+}
+
+/*
+ * Like __va(), but returns address in the kernel mapping (which is
+ * all we have until the physical memory mapping has been set up.
+ */
+static void * __init __ka(phys_addr_t paddr)
+{
+#ifdef CONFIG_X86_64
+ return (void *)(paddr + __START_KERNEL_map);
+#else
+ return __va(paddr);
+#endif
+}
+
+/* Convert a machine address to physical address */
+static unsigned long __init m2p(phys_addr_t maddr)
+{
+ phys_addr_t paddr;
+
+ maddr &= XEN_PTE_MFN_MASK;
+ paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
+
+ return paddr;
+}
+
+/* Convert a machine address to kernel virtual */
+static void * __init m2v(phys_addr_t maddr)
+{
+ return __ka(m2p(maddr));
+}
+
+/* Set the page permissions on an identity-mapped pages */
+static void __init set_page_prot_flags(void *addr, pgprot_t prot,
+ unsigned long flags)
+{
+ unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
+ pte_t pte = pfn_pte(pfn, prot);
+
+ if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
+ BUG();
+}
+static void __init set_page_prot(void *addr, pgprot_t prot)
+{
+ return set_page_prot_flags(addr, prot, UVMF_NONE);
+}
+#ifdef CONFIG_X86_32
+static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+{
+ unsigned pmdidx, pteidx;
+ unsigned ident_pte;
+ unsigned long pfn;
+
+ level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
+ PAGE_SIZE);
+
+ ident_pte = 0;
+ pfn = 0;
+ for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
+ pte_t *pte_page;
+
+ /* Reuse or allocate a page of ptes */
+ if (pmd_present(pmd[pmdidx]))
+ pte_page = m2v(pmd[pmdidx].pmd);
+ else {
+ /* Check for free pte pages */
+ if (ident_pte == LEVEL1_IDENT_ENTRIES)
+ break;
+
+ pte_page = &level1_ident_pgt[ident_pte];
+ ident_pte += PTRS_PER_PTE;
+
+ pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
+ }
+
+ /* Install mappings */
+ for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
+ pte_t pte;
+
+ if (pfn > max_pfn_mapped)
+ max_pfn_mapped = pfn;
+
+ if (!pte_none(pte_page[pteidx]))
+ continue;
+
+ pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
+ pte_page[pteidx] = pte;
+ }
+ }
+
+ for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
+ set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
+
+ set_page_prot(pmd, PAGE_KERNEL_RO);
+}
+#endif
+void __init xen_setup_machphys_mapping(void)
+{
+ struct xen_machphys_mapping mapping;
+
+ if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
+ machine_to_phys_mapping = (unsigned long *)mapping.v_start;
+ machine_to_phys_nr = mapping.max_mfn + 1;
+ } else {
+ machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
+ }
+#ifdef CONFIG_X86_32
+ WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1))
+ < machine_to_phys_mapping);
+#endif
+}
+
+#ifdef CONFIG_X86_64
+static void __init convert_pfn_mfn(void *v)
+{
+ pte_t *pte = v;
+ int i;
+
+ /* All levels are converted the same way, so just treat them
+ as ptes. */
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ pte[i] = xen_make_pte(pte[i].pte);
+}
+static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
+ unsigned long addr)
+{
+ if (*pt_base == PFN_DOWN(__pa(addr))) {
+ set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
+ clear_page((void *)addr);
+ (*pt_base)++;
+ }
+ if (*pt_end == PFN_DOWN(__pa(addr))) {
+ set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
+ clear_page((void *)addr);
+ (*pt_end)--;
+ }
+}
+/*
+ * Set up the initial kernel pagetable.
+ *
+ * We can construct this by grafting the Xen provided pagetable into
+ * head_64.S's preconstructed pagetables. We copy the Xen L2's into
+ * level2_ident_pgt, and level2_kernel_pgt. This means that only the
+ * kernel has a physical mapping to start with - but that's enough to
+ * get __va working. We need to fill in the rest of the physical
+ * mapping once some sort of allocator has been set up.
+ */
+void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
+{
+ pud_t *l3;
+ pmd_t *l2;
+ unsigned long addr[3];
+ unsigned long pt_base, pt_end;
+ unsigned i;
+
+ /* max_pfn_mapped is the last pfn mapped in the initial memory
+ * mappings. Considering that on Xen after the kernel mappings we
+ * have the mappings of some pages that don't exist in pfn space, we
+ * set max_pfn_mapped to the last real pfn mapped. */
+ if (xen_start_info->mfn_list < __START_KERNEL_map)
+ max_pfn_mapped = xen_start_info->first_p2m_pfn;
+ else
+ max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
+
+ pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
+ pt_end = pt_base + xen_start_info->nr_pt_frames;
+
+ /* Zap identity mapping */
+ init_top_pgt[0] = __pgd(0);
+
+ /* Pre-constructed entries are in pfn, so convert to mfn */
+ /* L4[273] -> level3_ident_pgt */
+ /* L4[511] -> level3_kernel_pgt */
+ convert_pfn_mfn(init_top_pgt);
+
+ /* L3_i[0] -> level2_ident_pgt */
+ convert_pfn_mfn(level3_ident_pgt);
+ /* L3_k[510] -> level2_kernel_pgt */
+ /* L3_k[511] -> level2_fixmap_pgt */
+ convert_pfn_mfn(level3_kernel_pgt);
+
+ /* L3_k[511][508-FIXMAP_PMD_NUM ... 507] -> level1_fixmap_pgt */
+ convert_pfn_mfn(level2_fixmap_pgt);
+
+ /* We get [511][511] and have Xen's version of level2_kernel_pgt */
+ l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
+ l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
+
+ addr[0] = (unsigned long)pgd;
+ addr[1] = (unsigned long)l3;
+ addr[2] = (unsigned long)l2;
+ /* Graft it onto L4[273][0]. Note that we creating an aliasing problem:
+ * Both L4[273][0] and L4[511][510] have entries that point to the same
+ * L2 (PMD) tables. Meaning that if you modify it in __va space
+ * it will be also modified in the __ka space! (But if you just
+ * modify the PMD table to point to other PTE's or none, then you
+ * are OK - which is what cleanup_highmap does) */
+ copy_page(level2_ident_pgt, l2);
+ /* Graft it onto L4[511][510] */
+ copy_page(level2_kernel_pgt, l2);
+
+ /*
+ * Zap execute permission from the ident map. Due to the sharing of
+ * L1 entries we need to do this in the L2.
+ */
+ if (__supported_pte_mask & _PAGE_NX) {
+ for (i = 0; i < PTRS_PER_PMD; ++i) {
+ if (pmd_none(level2_ident_pgt[i]))
+ continue;
+ level2_ident_pgt[i] = pmd_set_flags(level2_ident_pgt[i], _PAGE_NX);
+ }
+ }
+
+ /* Copy the initial P->M table mappings if necessary. */
+ i = pgd_index(xen_start_info->mfn_list);
+ if (i && i < pgd_index(__START_KERNEL_map))
+ init_top_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
+
+ /* Make pagetable pieces RO */
+ set_page_prot(init_top_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+ set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+
+ for (i = 0; i < FIXMAP_PMD_NUM; i++) {
+ set_page_prot(level1_fixmap_pgt + i * PTRS_PER_PTE,
+ PAGE_KERNEL_RO);
+ }
+
+ /* Pin down new L4 */
+ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
+ PFN_DOWN(__pa_symbol(init_top_pgt)));
+
+ /* Unpin Xen-provided one */
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+ /*
+ * At this stage there can be no user pgd, and no page structure to
+ * attach it to, so make sure we just set kernel pgd.
+ */
+ xen_mc_batch();
+ __xen_write_cr3(true, __pa(init_top_pgt));
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+
+ /* We can't that easily rip out L3 and L2, as the Xen pagetables are
+ * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for
+ * the initial domain. For guests using the toolstack, they are in:
+ * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only
+ * rip out the [L4] (pgd), but for guests we shave off three pages.
+ */
+ for (i = 0; i < ARRAY_SIZE(addr); i++)
+ check_pt_base(&pt_base, &pt_end, addr[i]);
+
+ /* Our (by three pages) smaller Xen pagetable that we are using */
+ xen_pt_base = PFN_PHYS(pt_base);
+ xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
+ memblock_reserve(xen_pt_base, xen_pt_size);
+
+ /* Revector the xen_start_info */
+ xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
+}
+
+/*
+ * Read a value from a physical address.
+ */
+static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
+{
+ unsigned long *vaddr;
+ unsigned long val;
+
+ vaddr = early_memremap_ro(addr, sizeof(val));
+ val = *vaddr;
+ early_memunmap(vaddr, sizeof(val));
+ return val;
+}
+
+/*
+ * Translate a virtual address to a physical one without relying on mapped
+ * page tables. Don't rely on big pages being aligned in (guest) physical
+ * space!
+ */
+static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
+{
+ phys_addr_t pa;
+ pgd_t pgd;
+ pud_t pud;
+ pmd_t pmd;
+ pte_t pte;
+
+ pa = read_cr3_pa();
+ pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
+ sizeof(pgd)));
+ if (!pgd_present(pgd))
+ return 0;
+
+ pa = pgd_val(pgd) & PTE_PFN_MASK;
+ pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
+ sizeof(pud)));
+ if (!pud_present(pud))
+ return 0;
+ pa = pud_val(pud) & PTE_PFN_MASK;
+ if (pud_large(pud))
+ return pa + (vaddr & ~PUD_MASK);
+
+ pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
+ sizeof(pmd)));
+ if (!pmd_present(pmd))
+ return 0;
+ pa = pmd_val(pmd) & PTE_PFN_MASK;
+ if (pmd_large(pmd))
+ return pa + (vaddr & ~PMD_MASK);
+
+ pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
+ sizeof(pte)));
+ if (!pte_present(pte))
+ return 0;
+ pa = pte_pfn(pte) << PAGE_SHIFT;
+
+ return pa | (vaddr & ~PAGE_MASK);
+}
+
+/*
+ * Find a new area for the hypervisor supplied p2m list and relocate the p2m to
+ * this area.
+ */
+void __init xen_relocate_p2m(void)
+{
+ phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
+ unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
+ int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
+ pte_t *pt;
+ pmd_t *pmd;
+ pud_t *pud;
+ pgd_t *pgd;
+ unsigned long *new_p2m;
+
+ size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+ n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
+ n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
+ n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
+ n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT;
+ n_frames = n_pte + n_pt + n_pmd + n_pud;
+
+ new_area = xen_find_free_area(PFN_PHYS(n_frames));
+ if (!new_area) {
+ xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
+ BUG();
+ }
+
+ /*
+ * Setup the page tables for addressing the new p2m list.
+ * We have asked the hypervisor to map the p2m list at the user address
+ * PUD_SIZE. It may have done so, or it may have used a kernel space
+ * address depending on the Xen version.
+ * To avoid any possible virtual address collision, just use
+ * 2 * PUD_SIZE for the new area.
+ */
+ pud_phys = new_area;
+ pmd_phys = pud_phys + PFN_PHYS(n_pud);
+ pt_phys = pmd_phys + PFN_PHYS(n_pmd);
+ p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
+
+ pgd = __va(read_cr3_pa());
+ new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
+ for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
+ pud = early_memremap(pud_phys, PAGE_SIZE);
+ clear_page(pud);
+ for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
+ idx_pmd++) {
+ pmd = early_memremap(pmd_phys, PAGE_SIZE);
+ clear_page(pmd);
+ for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
+ idx_pt++) {
+ pt = early_memremap(pt_phys, PAGE_SIZE);
+ clear_page(pt);
+ for (idx_pte = 0;
+ idx_pte < min(n_pte, PTRS_PER_PTE);
+ idx_pte++) {
+ pt[idx_pte] = pfn_pte(p2m_pfn,
+ PAGE_KERNEL);
+ p2m_pfn++;
+ }
+ n_pte -= PTRS_PER_PTE;
+ early_memunmap(pt, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pt_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
+ PFN_DOWN(pt_phys));
+ pmd[idx_pt] = __pmd(_PAGE_TABLE | pt_phys);
+ pt_phys += PAGE_SIZE;
+ }
+ n_pt -= PTRS_PER_PMD;
+ early_memunmap(pmd, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pmd_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
+ PFN_DOWN(pmd_phys));
+ pud[idx_pmd] = __pud(_PAGE_TABLE | pmd_phys);
+ pmd_phys += PAGE_SIZE;
+ }
+ n_pmd -= PTRS_PER_PUD;
+ early_memunmap(pud, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pud_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
+ set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
+ pud_phys += PAGE_SIZE;
+ }
+
+ /* Now copy the old p2m info to the new area. */
+ memcpy(new_p2m, xen_p2m_addr, size);
+ xen_p2m_addr = new_p2m;
+
+ /* Release the old p2m list and set new list info. */
+ p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
+ BUG_ON(!p2m_pfn);
+ p2m_pfn_end = p2m_pfn + PFN_DOWN(size);
+
+ if (xen_start_info->mfn_list < __START_KERNEL_map) {
+ pfn = xen_start_info->first_p2m_pfn;
+ pfn_end = xen_start_info->first_p2m_pfn +
+ xen_start_info->nr_p2m_frames;
+ set_pgd(pgd + 1, __pgd(0));
+ } else {
+ pfn = p2m_pfn;
+ pfn_end = p2m_pfn_end;
+ }
+
+ memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
+ while (pfn < pfn_end) {
+ if (pfn == p2m_pfn) {
+ pfn = p2m_pfn_end;
+ continue;
+ }
+ make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+ pfn++;
+ }
+
+ xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
+ xen_start_info->first_p2m_pfn = PFN_DOWN(new_area);
+ xen_start_info->nr_p2m_frames = n_frames;
+}
+
+#else /* !CONFIG_X86_64 */
+static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
+static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
+RESERVE_BRK(fixup_kernel_pmd, PAGE_SIZE);
+RESERVE_BRK(fixup_kernel_pte, PAGE_SIZE);
+
+static void __init xen_write_cr3_init(unsigned long cr3)
+{
+ unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
+
+ BUG_ON(read_cr3_pa() != __pa(initial_page_table));
+ BUG_ON(cr3 != __pa(swapper_pg_dir));
+
+ /*
+ * We are switching to swapper_pg_dir for the first time (from
+ * initial_page_table) and therefore need to mark that page
+ * read-only and then pin it.
+ *
+ * Xen disallows sharing of kernel PMDs for PAE
+ * guests. Therefore we must copy the kernel PMD from
+ * initial_page_table into a new kernel PMD to be used in
+ * swapper_pg_dir.
+ */
+ swapper_kernel_pmd =
+ extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
+ copy_page(swapper_kernel_pmd, initial_kernel_pmd);
+ swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
+ __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
+ set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
+
+ set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+ xen_write_cr3(cr3);
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
+
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
+ PFN_DOWN(__pa(initial_page_table)));
+ set_page_prot(initial_page_table, PAGE_KERNEL);
+ set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
+
+ pv_mmu_ops.write_cr3 = &xen_write_cr3;
+}
+
+/*
+ * For 32 bit domains xen_start_info->pt_base is the pgd address which might be
+ * not the first page table in the page table pool.
+ * Iterate through the initial page tables to find the real page table base.
+ */
+static phys_addr_t __init xen_find_pt_base(pmd_t *pmd)
+{
+ phys_addr_t pt_base, paddr;
+ unsigned pmdidx;
+
+ pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd));
+
+ for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++)
+ if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) {
+ paddr = m2p(pmd[pmdidx].pmd);
+ pt_base = min(pt_base, paddr);
+ }
+
+ return pt_base;
+}
+
+void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
+{
+ pmd_t *kernel_pmd;
+
+ kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+
+ xen_pt_base = xen_find_pt_base(kernel_pmd);
+ xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE;
+
+ initial_kernel_pmd =
+ extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
+
+ max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024);
+
+ copy_page(initial_kernel_pmd, kernel_pmd);
+
+ xen_map_identity_early(initial_kernel_pmd, max_pfn);
+
+ copy_page(initial_page_table, pgd);
+ initial_page_table[KERNEL_PGD_BOUNDARY] =
+ __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
+
+ set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
+ set_page_prot(initial_page_table, PAGE_KERNEL_RO);
+ set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
+
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
+ PFN_DOWN(__pa(initial_page_table)));
+ xen_write_cr3(__pa(initial_page_table));
+
+ memblock_reserve(xen_pt_base, xen_pt_size);
+}
+#endif /* CONFIG_X86_64 */
+
+void __init xen_reserve_special_pages(void)
+{
+ phys_addr_t paddr;
+
+ memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
+ if (xen_start_info->store_mfn) {
+ paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
+ memblock_reserve(paddr, PAGE_SIZE);
+ }
+ if (!xen_initial_domain()) {
+ paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
+ memblock_reserve(paddr, PAGE_SIZE);
+ }
+}
+
+void __init xen_pt_check_e820(void)
+{
+ if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
+ xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
+ BUG();
+ }
+}
+
+static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
+
+static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
+{
+ pte_t pte;
+
+ phys >>= PAGE_SHIFT;
+
+ switch (idx) {
+ case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
+#ifdef CONFIG_X86_32
+ case FIX_WP_TEST:
+# ifdef CONFIG_HIGHMEM
+ case FIX_KMAP_BEGIN ... FIX_KMAP_END:
+# endif
+#elif defined(CONFIG_X86_VSYSCALL_EMULATION)
+ case VSYSCALL_PAGE:
+#endif
+ case FIX_TEXT_POKE0:
+ case FIX_TEXT_POKE1:
+ /* All local page mappings */
+ pte = pfn_pte(phys, prot);
+ break;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ case FIX_APIC_BASE: /* maps dummy local APIC */
+ pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
+ break;
+#endif
+
+#ifdef CONFIG_X86_IO_APIC
+ case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
+ /*
+ * We just don't map the IO APIC - all access is via
+ * hypercalls. Keep the address in the pte for reference.
+ */
+ pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
+ break;
+#endif
+
+ case FIX_PARAVIRT_BOOTMAP:
+ /* This is an MFN, but it isn't an IO mapping from the
+ IO domain */
+ pte = mfn_pte(phys, prot);
+ break;
+
+ default:
+ /* By default, set_fixmap is used for hardware mappings */
+ pte = mfn_pte(phys, prot);
+ break;
+ }
+
+ __native_set_fixmap(idx, pte);
+
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+ /* Replicate changes to map the vsyscall page into the user
+ pagetable vsyscall mapping. */
+ if (idx == VSYSCALL_PAGE) {
+ unsigned long vaddr = __fix_to_virt(idx);
+ set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
+ }
+#endif
+}
+
+static void __init xen_post_allocator_init(void)
+{
+ pv_mmu_ops.set_pte = xen_set_pte;
+ pv_mmu_ops.set_pmd = xen_set_pmd;
+ pv_mmu_ops.set_pud = xen_set_pud;
+#ifdef CONFIG_X86_64
+ pv_mmu_ops.set_p4d = xen_set_p4d;
+#endif
+
+ /* This will work as long as patching hasn't happened yet
+ (which it hasn't) */
+ pv_mmu_ops.alloc_pte = xen_alloc_pte;
+ pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
+ pv_mmu_ops.release_pte = xen_release_pte;
+ pv_mmu_ops.release_pmd = xen_release_pmd;
+#ifdef CONFIG_X86_64
+ pv_mmu_ops.alloc_pud = xen_alloc_pud;
+ pv_mmu_ops.release_pud = xen_release_pud;
+#endif
+ pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte);
+
+#ifdef CONFIG_X86_64
+ pv_mmu_ops.write_cr3 = &xen_write_cr3;
+#endif
+}
+
+static void xen_leave_lazy_mmu(void)
+{
+ preempt_disable();
+ xen_mc_flush();
+ paravirt_leave_lazy_mmu();
+ preempt_enable();
+}
+
+static const struct pv_mmu_ops xen_mmu_ops __initconst = {
+ .read_cr2 = xen_read_cr2,
+ .write_cr2 = xen_write_cr2,
+
+ .read_cr3 = xen_read_cr3,
+ .write_cr3 = xen_write_cr3_init,
+
+ .flush_tlb_user = xen_flush_tlb,
+ .flush_tlb_kernel = xen_flush_tlb,
+ .flush_tlb_one_user = xen_flush_tlb_one_user,
+ .flush_tlb_others = xen_flush_tlb_others,
+ .tlb_remove_table = tlb_remove_table,
+
+ .pgd_alloc = xen_pgd_alloc,
+ .pgd_free = xen_pgd_free,
+
+ .alloc_pte = xen_alloc_pte_init,
+ .release_pte = xen_release_pte_init,
+ .alloc_pmd = xen_alloc_pmd_init,
+ .release_pmd = xen_release_pmd_init,
+
+ .set_pte = xen_set_pte_init,
+ .set_pte_at = xen_set_pte_at,
+ .set_pmd = xen_set_pmd_hyper,
+
+ .ptep_modify_prot_start = __ptep_modify_prot_start,
+ .ptep_modify_prot_commit = __ptep_modify_prot_commit,
+
+ .pte_val = PV_CALLEE_SAVE(xen_pte_val),
+ .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
+
+ .make_pte = PV_CALLEE_SAVE(xen_make_pte_init),
+ .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
+
+#ifdef CONFIG_X86_PAE
+ .set_pte_atomic = xen_set_pte_atomic,
+ .pte_clear = xen_pte_clear,
+ .pmd_clear = xen_pmd_clear,
+#endif /* CONFIG_X86_PAE */
+ .set_pud = xen_set_pud_hyper,
+
+ .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
+ .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
+
+#ifdef CONFIG_X86_64
+ .pud_val = PV_CALLEE_SAVE(xen_pud_val),
+ .make_pud = PV_CALLEE_SAVE(xen_make_pud),
+ .set_p4d = xen_set_p4d_hyper,
+
+ .alloc_pud = xen_alloc_pmd_init,
+ .release_pud = xen_release_pmd_init,
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+ .p4d_val = PV_CALLEE_SAVE(xen_p4d_val),
+ .make_p4d = PV_CALLEE_SAVE(xen_make_p4d),
+#endif
+#endif /* CONFIG_X86_64 */
+
+ .activate_mm = xen_activate_mm,
+ .dup_mmap = xen_dup_mmap,
+ .exit_mmap = xen_exit_mmap,
+
+ .lazy_mode = {
+ .enter = paravirt_enter_lazy_mmu,
+ .leave = xen_leave_lazy_mmu,
+ .flush = paravirt_flush_lazy_mmu,
+ },
+
+ .set_fixmap = xen_set_fixmap,
+};
+
+void __init xen_init_mmu_ops(void)
+{
+ x86_init.paging.pagetable_init = xen_pagetable_init;
+ x86_init.hyper.init_after_bootmem = xen_after_bootmem;
+
+ pv_mmu_ops = xen_mmu_ops;
+
+ memset(dummy_mapping, 0xff, PAGE_SIZE);
+}
+
+/* Protected by xen_reservation_lock. */
+#define MAX_CONTIG_ORDER 9 /* 2MB */
+static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
+
+#define VOID_PTE (mfn_pte(0, __pgprot(0)))
+static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
+ unsigned long *in_frames,
+ unsigned long *out_frames)
+{
+ int i;
+ struct multicall_space mcs;
+
+ xen_mc_batch();
+ for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
+ mcs = __xen_mc_entry(0);
+
+ if (in_frames)
+ in_frames[i] = virt_to_mfn(vaddr);
+
+ MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
+ __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
+
+ if (out_frames)
+ out_frames[i] = virt_to_pfn(vaddr);
+ }
+ xen_mc_issue(0);
+}
+
+/*
+ * Update the pfn-to-mfn mappings for a virtual address range, either to
+ * point to an array of mfns, or contiguously from a single starting
+ * mfn.
+ */
+static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
+ unsigned long *mfns,
+ unsigned long first_mfn)
+{
+ unsigned i, limit;
+ unsigned long mfn;
+
+ xen_mc_batch();
+
+ limit = 1u << order;
+ for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
+ struct multicall_space mcs;
+ unsigned flags;
+
+ mcs = __xen_mc_entry(0);
+ if (mfns)
+ mfn = mfns[i];
+ else
+ mfn = first_mfn + i;
+
+ if (i < (limit - 1))
+ flags = 0;
+ else {
+ if (order == 0)
+ flags = UVMF_INVLPG | UVMF_ALL;
+ else
+ flags = UVMF_TLB_FLUSH | UVMF_ALL;
+ }
+
+ MULTI_update_va_mapping(mcs.mc, vaddr,
+ mfn_pte(mfn, PAGE_KERNEL), flags);
+
+ set_phys_to_machine(virt_to_pfn(vaddr), mfn);
+ }
+
+ xen_mc_issue(0);
+}
+
+/*
+ * Perform the hypercall to exchange a region of our pfns to point to
+ * memory with the required contiguous alignment. Takes the pfns as
+ * input, and populates mfns as output.
+ *
+ * Returns a success code indicating whether the hypervisor was able to
+ * satisfy the request or not.
+ */
+static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
+ unsigned long *pfns_in,
+ unsigned long extents_out,
+ unsigned int order_out,
+ unsigned long *mfns_out,
+ unsigned int address_bits)
+{
+ long rc;
+ int success;
+
+ struct xen_memory_exchange exchange = {
+ .in = {
+ .nr_extents = extents_in,
+ .extent_order = order_in,
+ .extent_start = pfns_in,
+ .domid = DOMID_SELF
+ },
+ .out = {
+ .nr_extents = extents_out,
+ .extent_order = order_out,
+ .extent_start = mfns_out,
+ .address_bits = address_bits,
+ .domid = DOMID_SELF
+ }
+ };
+
+ BUG_ON(extents_in << order_in != extents_out << order_out);
+
+ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
+ success = (exchange.nr_exchanged == extents_in);
+
+ BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
+ BUG_ON(success && (rc != 0));
+
+ return success;
+}
+
+int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
+ unsigned int address_bits,
+ dma_addr_t *dma_handle)
+{
+ unsigned long *in_frames = discontig_frames, out_frame;
+ unsigned long flags;
+ int success;
+ unsigned long vstart = (unsigned long)phys_to_virt(pstart);
+
+ /*
+ * Currently an auto-translated guest will not perform I/O, nor will
+ * it require PAE page directories below 4GB. Therefore any calls to
+ * this function are redundant and can be ignored.
+ */
+
+ if (unlikely(order > MAX_CONTIG_ORDER))
+ return -ENOMEM;
+
+ memset((void *) vstart, 0, PAGE_SIZE << order);
+
+ spin_lock_irqsave(&xen_reservation_lock, flags);
+
+ /* 1. Zap current PTEs, remembering MFNs. */
+ xen_zap_pfn_range(vstart, order, in_frames, NULL);
+
+ /* 2. Get a new contiguous memory extent. */
+ out_frame = virt_to_pfn(vstart);
+ success = xen_exchange_memory(1UL << order, 0, in_frames,
+ 1, order, &out_frame,
+ address_bits);
+
+ /* 3. Map the new extent in place of old pages. */
+ if (success)
+ xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
+ else
+ xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
+
+ spin_unlock_irqrestore(&xen_reservation_lock, flags);
+
+ *dma_handle = virt_to_machine(vstart).maddr;
+ return success ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
+
+void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
+{
+ unsigned long *out_frames = discontig_frames, in_frame;
+ unsigned long flags;
+ int success;
+ unsigned long vstart;
+
+ if (unlikely(order > MAX_CONTIG_ORDER))
+ return;
+
+ vstart = (unsigned long)phys_to_virt(pstart);
+ memset((void *) vstart, 0, PAGE_SIZE << order);
+
+ spin_lock_irqsave(&xen_reservation_lock, flags);
+
+ /* 1. Find start MFN of contiguous extent. */
+ in_frame = virt_to_mfn(vstart);
+
+ /* 2. Zap current PTEs. */
+ xen_zap_pfn_range(vstart, order, NULL, out_frames);
+
+ /* 3. Do the exchange for non-contiguous MFNs. */
+ success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
+ 0, out_frames, 0);
+
+ /* 4. Map new pages in place of old pages. */
+ if (success)
+ xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
+ else
+ xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
+
+ spin_unlock_irqrestore(&xen_reservation_lock, flags);
+}
+EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
+
+#ifdef CONFIG_KEXEC_CORE
+phys_addr_t paddr_vmcoreinfo_note(void)
+{
+ if (xen_pv_domain())
+ return virt_to_machine(vmcoreinfo_note).maddr;
+ else
+ return __pa(vmcoreinfo_note);
+}
+#endif /* CONFIG_KEXEC_CORE */
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
new file mode 100644
index 000000000..2bce7958c
--- /dev/null
+++ b/arch/x86/xen/multicalls.c
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Xen hypercall batching.
+ *
+ * Xen allows multiple hypercalls to be issued at once, using the
+ * multicall interface. This allows the cost of trapping into the
+ * hypervisor to be amortized over several calls.
+ *
+ * This file implements a simple interface for multicalls. There's a
+ * per-cpu buffer of outstanding multicalls. When you want to queue a
+ * multicall for issuing, you can allocate a multicall slot for the
+ * call and its arguments, along with storage for space which is
+ * pointed to by the arguments (for passing pointers to structures,
+ * etc). When the multicall is actually issued, all the space for the
+ * commands and allocated memory is freed for reuse.
+ *
+ * Multicalls are flushed whenever any of the buffers get full, or
+ * when explicitly requested. There's no way to get per-multicall
+ * return results back. It will BUG if any of the multicalls fail.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/debugfs.h>
+
+#include <asm/xen/hypercall.h>
+
+#include "multicalls.h"
+#include "debugfs.h"
+
+#define MC_BATCH 32
+
+#define MC_DEBUG 0
+
+#define MC_ARGS (MC_BATCH * 16)
+
+
+struct mc_buffer {
+ unsigned mcidx, argidx, cbidx;
+ struct multicall_entry entries[MC_BATCH];
+#if MC_DEBUG
+ struct multicall_entry debug[MC_BATCH];
+ void *caller[MC_BATCH];
+#endif
+ unsigned char args[MC_ARGS];
+ struct callback {
+ void (*fn)(void *);
+ void *data;
+ } callbacks[MC_BATCH];
+};
+
+static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
+DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
+
+void xen_mc_flush(void)
+{
+ struct mc_buffer *b = this_cpu_ptr(&mc_buffer);
+ struct multicall_entry *mc;
+ int ret = 0;
+ unsigned long flags;
+ int i;
+
+ BUG_ON(preemptible());
+
+ /* Disable interrupts in case someone comes in and queues
+ something in the middle */
+ local_irq_save(flags);
+
+ trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx);
+
+ switch (b->mcidx) {
+ case 0:
+ /* no-op */
+ BUG_ON(b->argidx != 0);
+ break;
+
+ case 1:
+ /* Singleton multicall - bypass multicall machinery
+ and just do the call directly. */
+ mc = &b->entries[0];
+
+ mc->result = xen_single_call(mc->op, mc->args[0], mc->args[1],
+ mc->args[2], mc->args[3],
+ mc->args[4]);
+ ret = mc->result < 0;
+ break;
+
+ default:
+#if MC_DEBUG
+ memcpy(b->debug, b->entries,
+ b->mcidx * sizeof(struct multicall_entry));
+#endif
+
+ if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
+ BUG();
+ for (i = 0; i < b->mcidx; i++)
+ if (b->entries[i].result < 0)
+ ret++;
+
+#if MC_DEBUG
+ if (ret) {
+ printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
+ ret, smp_processor_id());
+ dump_stack();
+ for (i = 0; i < b->mcidx; i++) {
+ printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\t%pF\n",
+ i+1, b->mcidx,
+ b->debug[i].op,
+ b->debug[i].args[0],
+ b->entries[i].result,
+ b->caller[i]);
+ }
+ }
+#endif
+ }
+
+ b->mcidx = 0;
+ b->argidx = 0;
+
+ for (i = 0; i < b->cbidx; i++) {
+ struct callback *cb = &b->callbacks[i];
+
+ (*cb->fn)(cb->data);
+ }
+ b->cbidx = 0;
+
+ local_irq_restore(flags);
+
+ WARN_ON(ret);
+}
+
+struct multicall_space __xen_mc_entry(size_t args)
+{
+ struct mc_buffer *b = this_cpu_ptr(&mc_buffer);
+ struct multicall_space ret;
+ unsigned argidx = roundup(b->argidx, sizeof(u64));
+
+ trace_xen_mc_entry_alloc(args);
+
+ BUG_ON(preemptible());
+ BUG_ON(b->argidx >= MC_ARGS);
+
+ if (unlikely(b->mcidx == MC_BATCH ||
+ (argidx + args) >= MC_ARGS)) {
+ trace_xen_mc_flush_reason((b->mcidx == MC_BATCH) ?
+ XEN_MC_FL_BATCH : XEN_MC_FL_ARGS);
+ xen_mc_flush();
+ argidx = roundup(b->argidx, sizeof(u64));
+ }
+
+ ret.mc = &b->entries[b->mcidx];
+#if MC_DEBUG
+ b->caller[b->mcidx] = __builtin_return_address(0);
+#endif
+ b->mcidx++;
+ ret.args = &b->args[argidx];
+ b->argidx = argidx + args;
+
+ BUG_ON(b->argidx >= MC_ARGS);
+ return ret;
+}
+
+struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
+{
+ struct mc_buffer *b = this_cpu_ptr(&mc_buffer);
+ struct multicall_space ret = { NULL, NULL };
+
+ BUG_ON(preemptible());
+ BUG_ON(b->argidx >= MC_ARGS);
+
+ if (unlikely(b->mcidx == 0 ||
+ b->entries[b->mcidx - 1].op != op)) {
+ trace_xen_mc_extend_args(op, size, XEN_MC_XE_BAD_OP);
+ goto out;
+ }
+
+ if (unlikely((b->argidx + size) >= MC_ARGS)) {
+ trace_xen_mc_extend_args(op, size, XEN_MC_XE_NO_SPACE);
+ goto out;
+ }
+
+ ret.mc = &b->entries[b->mcidx - 1];
+ ret.args = &b->args[b->argidx];
+ b->argidx += size;
+
+ BUG_ON(b->argidx >= MC_ARGS);
+
+ trace_xen_mc_extend_args(op, size, XEN_MC_XE_OK);
+out:
+ return ret;
+}
+
+void xen_mc_callback(void (*fn)(void *), void *data)
+{
+ struct mc_buffer *b = this_cpu_ptr(&mc_buffer);
+ struct callback *cb;
+
+ if (b->cbidx == MC_BATCH) {
+ trace_xen_mc_flush_reason(XEN_MC_FL_CALLBACK);
+ xen_mc_flush();
+ }
+
+ trace_xen_mc_callback(fn, data);
+
+ cb = &b->callbacks[b->cbidx++];
+ cb->fn = fn;
+ cb->data = data;
+}
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
new file mode 100644
index 000000000..1c51b2c87
--- /dev/null
+++ b/arch/x86/xen/multicalls.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _XEN_MULTICALLS_H
+#define _XEN_MULTICALLS_H
+
+#include <trace/events/xen.h>
+
+#include "xen-ops.h"
+
+/* Multicalls */
+struct multicall_space
+{
+ struct multicall_entry *mc;
+ void *args;
+};
+
+/* Allocate room for a multicall and its args */
+struct multicall_space __xen_mc_entry(size_t args);
+
+DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
+
+/* Call to start a batch of multiple __xen_mc_entry()s. Must be
+ paired with xen_mc_issue() */
+static inline void xen_mc_batch(void)
+{
+ unsigned long flags;
+
+ /* need to disable interrupts until this entry is complete */
+ local_irq_save(flags);
+ trace_xen_mc_batch(paravirt_get_lazy_mode());
+ __this_cpu_write(xen_mc_irq_flags, flags);
+}
+
+static inline struct multicall_space xen_mc_entry(size_t args)
+{
+ xen_mc_batch();
+ return __xen_mc_entry(args);
+}
+
+/* Flush all pending multicalls */
+void xen_mc_flush(void);
+
+/* Issue a multicall if we're not in a lazy mode */
+static inline void xen_mc_issue(unsigned mode)
+{
+ trace_xen_mc_issue(mode);
+
+ if ((paravirt_get_lazy_mode() & mode) == 0)
+ xen_mc_flush();
+
+ /* restore flags saved in xen_mc_batch */
+ local_irq_restore(this_cpu_read(xen_mc_irq_flags));
+}
+
+/* Set up a callback to be called when the current batch is flushed */
+void xen_mc_callback(void (*fn)(void *), void *data);
+
+/*
+ * Try to extend the arguments of the previous multicall command. The
+ * previous command's op must match. If it does, then it attempts to
+ * extend the argument space allocated to the multicall entry by
+ * arg_size bytes.
+ *
+ * The returned multicall_space will return with mc pointing to the
+ * command on success, or NULL on failure, and args pointing to the
+ * newly allocated space.
+ */
+struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size);
+
+#endif /* _XEN_MULTICALLS_H */
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
new file mode 100644
index 000000000..f9b31eb68
--- /dev/null
+++ b/arch/x86/xen/p2m.c
@@ -0,0 +1,858 @@
+/*
+ * Xen leaves the responsibility for maintaining p2m mappings to the
+ * guests themselves, but it must also access and update the p2m array
+ * during suspend/resume when all the pages are reallocated.
+ *
+ * The logical flat p2m table is mapped to a linear kernel memory area.
+ * For accesses by Xen a three-level tree linked via mfns only is set up to
+ * allow the address space to be sparse.
+ *
+ * Xen
+ * |
+ * p2m_top_mfn
+ * / \
+ * p2m_mid_mfn p2m_mid_mfn
+ * / /
+ * p2m p2m p2m ...
+ *
+ * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
+ *
+ * The p2m_top_mfn level is limited to 1 page, so the maximum representable
+ * pseudo-physical address space is:
+ * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
+ *
+ * P2M_PER_PAGE depends on the architecture, as a mfn is always
+ * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
+ * 512 and 1024 entries respectively.
+ *
+ * In short, these structures contain the Machine Frame Number (MFN) of the PFN.
+ *
+ * However not all entries are filled with MFNs. Specifically for all other
+ * leaf entries, or for the top root, or middle one, for which there is a void
+ * entry, we assume it is "missing". So (for example)
+ * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY.
+ * We have a dedicated page p2m_missing with all entries being
+ * INVALID_P2M_ENTRY. This page may be referenced multiple times in the p2m
+ * list/tree in case there are multiple areas with P2M_PER_PAGE invalid pfns.
+ *
+ * We also have the possibility of setting 1-1 mappings on certain regions, so
+ * that:
+ * pfn_to_mfn(0xc0000)=0xc0000
+ *
+ * The benefit of this is, that we can assume for non-RAM regions (think
+ * PCI BARs, or ACPI spaces), we can create mappings easily because we
+ * get the PFN value to match the MFN.
+ *
+ * For this to work efficiently we have one new page p2m_identity. All entries
+ * in p2m_identity are set to INVALID_P2M_ENTRY type (Xen toolstack only
+ * recognizes that and MFNs, no other fancy value).
+ *
+ * On lookup we spot that the entry points to p2m_identity and return the
+ * identity value instead of dereferencing and returning INVALID_P2M_ENTRY.
+ * If the entry points to an allocated page, we just proceed as before and
+ * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
+ * appropriate functions (pfn_to_mfn).
+ *
+ * The reason for having the IDENTITY_FRAME_BIT instead of just returning the
+ * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a
+ * non-identity pfn. To protect ourselves against we elect to set (and get) the
+ * IDENTITY_FRAME_BIT on all identity mapped PFNs.
+ */
+
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/bootmem.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <asm/cache.h>
+#include <asm/setup.h>
+#include <linux/uaccess.h>
+
+#include <asm/xen/page.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+#include <xen/balloon.h>
+#include <xen/grant_table.h>
+
+#include "multicalls.h"
+#include "xen-ops.h"
+
+#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
+
+#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+
+#define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE)
+
+unsigned long *xen_p2m_addr __read_mostly;
+EXPORT_SYMBOL_GPL(xen_p2m_addr);
+unsigned long xen_p2m_size __read_mostly;
+EXPORT_SYMBOL_GPL(xen_p2m_size);
+unsigned long xen_max_p2m_pfn __read_mostly;
+EXPORT_SYMBOL_GPL(xen_max_p2m_pfn);
+
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
+#define P2M_LIMIT CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
+#else
+#define P2M_LIMIT 0
+#endif
+
+static DEFINE_SPINLOCK(p2m_update_lock);
+
+static unsigned long *p2m_mid_missing_mfn;
+static unsigned long *p2m_top_mfn;
+static unsigned long **p2m_top_mfn_p;
+static unsigned long *p2m_missing;
+static unsigned long *p2m_identity;
+static pte_t *p2m_missing_pte;
+static pte_t *p2m_identity_pte;
+
+/*
+ * Hint at last populated PFN.
+ *
+ * Used to set HYPERVISOR_shared_info->arch.max_pfn so the toolstack
+ * can avoid scanning the whole P2M (which may be sized to account for
+ * hotplugged memory).
+ */
+static unsigned long xen_p2m_last_pfn;
+
+static inline unsigned p2m_top_index(unsigned long pfn)
+{
+ BUG_ON(pfn >= MAX_P2M_PFN);
+ return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
+}
+
+static inline unsigned p2m_mid_index(unsigned long pfn)
+{
+ return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
+}
+
+static inline unsigned p2m_index(unsigned long pfn)
+{
+ return pfn % P2M_PER_PAGE;
+}
+
+static void p2m_top_mfn_init(unsigned long *top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = virt_to_mfn(p2m_mid_missing_mfn);
+}
+
+static void p2m_top_mfn_p_init(unsigned long **top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = p2m_mid_missing_mfn;
+}
+
+static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ mid[i] = virt_to_mfn(leaf);
+}
+
+static void p2m_init(unsigned long *p2m)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_PER_PAGE; i++)
+ p2m[i] = INVALID_P2M_ENTRY;
+}
+
+static void p2m_init_identity(unsigned long *p2m, unsigned long pfn)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_PER_PAGE; i++)
+ p2m[i] = IDENTITY_FRAME(pfn + i);
+}
+
+static void * __ref alloc_p2m_page(void)
+{
+ if (unlikely(!slab_is_available()))
+ return alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE);
+
+ return (void *)__get_free_page(GFP_KERNEL);
+}
+
+static void __ref free_p2m_page(void *p)
+{
+ if (unlikely(!slab_is_available())) {
+ free_bootmem((unsigned long)p, PAGE_SIZE);
+ return;
+ }
+
+ free_page((unsigned long)p);
+}
+
+/*
+ * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
+ *
+ * This is called both at boot time, and after resuming from suspend:
+ * - At boot time we're called rather early, and must use alloc_bootmem*()
+ * to allocate memory.
+ *
+ * - After resume we're called from within stop_machine, but the mfn
+ * tree should already be completely allocated.
+ */
+void __ref xen_build_mfn_list_list(void)
+{
+ unsigned long pfn, mfn;
+ pte_t *ptep;
+ unsigned int level, topidx, mididx;
+ unsigned long *mid_mfn_p;
+
+ if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
+ return;
+
+ /* Pre-initialize p2m_top_mfn to be completely missing */
+ if (p2m_top_mfn == NULL) {
+ p2m_mid_missing_mfn = alloc_p2m_page();
+ p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
+
+ p2m_top_mfn_p = alloc_p2m_page();
+ p2m_top_mfn_p_init(p2m_top_mfn_p);
+
+ p2m_top_mfn = alloc_p2m_page();
+ p2m_top_mfn_init(p2m_top_mfn);
+ } else {
+ /* Reinitialise, mfn's all change after migration */
+ p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
+ }
+
+ for (pfn = 0; pfn < xen_max_p2m_pfn && pfn < MAX_P2M_PFN;
+ pfn += P2M_PER_PAGE) {
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+ ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn),
+ &level);
+ BUG_ON(!ptep || level != PG_LEVEL_4K);
+ mfn = pte_mfn(*ptep);
+ ptep = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
+
+ /* Don't bother allocating any mfn mid levels if
+ * they're just missing, just update the stored mfn,
+ * since all could have changed over a migrate.
+ */
+ if (ptep == p2m_missing_pte || ptep == p2m_identity_pte) {
+ BUG_ON(mididx);
+ BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
+ p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
+ pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
+ continue;
+ }
+
+ if (mid_mfn_p == p2m_mid_missing_mfn) {
+ mid_mfn_p = alloc_p2m_page();
+ p2m_mid_mfn_init(mid_mfn_p, p2m_missing);
+
+ p2m_top_mfn_p[topidx] = mid_mfn_p;
+ }
+
+ p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
+ mid_mfn_p[mididx] = mfn;
+ }
+}
+
+void xen_setup_mfn_list_list(void)
+{
+ BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
+
+ if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = ~0UL;
+ else
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+ virt_to_mfn(p2m_top_mfn);
+ HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn;
+ HYPERVISOR_shared_info->arch.p2m_generation = 0;
+ HYPERVISOR_shared_info->arch.p2m_vaddr = (unsigned long)xen_p2m_addr;
+ HYPERVISOR_shared_info->arch.p2m_cr3 =
+ xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
+}
+
+/* Set up p2m_top to point to the domain-builder provided p2m pages */
+void __init xen_build_dynamic_phys_to_machine(void)
+{
+ unsigned long pfn;
+
+ xen_p2m_addr = (unsigned long *)xen_start_info->mfn_list;
+ xen_p2m_size = ALIGN(xen_start_info->nr_pages, P2M_PER_PAGE);
+
+ for (pfn = xen_start_info->nr_pages; pfn < xen_p2m_size; pfn++)
+ xen_p2m_addr[pfn] = INVALID_P2M_ENTRY;
+
+ xen_max_p2m_pfn = xen_p2m_size;
+}
+
+#define P2M_TYPE_IDENTITY 0
+#define P2M_TYPE_MISSING 1
+#define P2M_TYPE_PFN 2
+#define P2M_TYPE_UNKNOWN 3
+
+static int xen_p2m_elem_type(unsigned long pfn)
+{
+ unsigned long mfn;
+
+ if (pfn >= xen_p2m_size)
+ return P2M_TYPE_IDENTITY;
+
+ mfn = xen_p2m_addr[pfn];
+
+ if (mfn == INVALID_P2M_ENTRY)
+ return P2M_TYPE_MISSING;
+
+ if (mfn & IDENTITY_FRAME_BIT)
+ return P2M_TYPE_IDENTITY;
+
+ return P2M_TYPE_PFN;
+}
+
+static void __init xen_rebuild_p2m_list(unsigned long *p2m)
+{
+ unsigned int i, chunk;
+ unsigned long pfn;
+ unsigned long *mfns;
+ pte_t *ptep;
+ pmd_t *pmdp;
+ int type;
+
+ p2m_missing = alloc_p2m_page();
+ p2m_init(p2m_missing);
+ p2m_identity = alloc_p2m_page();
+ p2m_init(p2m_identity);
+
+ p2m_missing_pte = alloc_p2m_page();
+ paravirt_alloc_pte(&init_mm, __pa(p2m_missing_pte) >> PAGE_SHIFT);
+ p2m_identity_pte = alloc_p2m_page();
+ paravirt_alloc_pte(&init_mm, __pa(p2m_identity_pte) >> PAGE_SHIFT);
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ set_pte(p2m_missing_pte + i,
+ pfn_pte(PFN_DOWN(__pa(p2m_missing)), PAGE_KERNEL_RO));
+ set_pte(p2m_identity_pte + i,
+ pfn_pte(PFN_DOWN(__pa(p2m_identity)), PAGE_KERNEL_RO));
+ }
+
+ for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += chunk) {
+ /*
+ * Try to map missing/identity PMDs or p2m-pages if possible.
+ * We have to respect the structure of the mfn_list_list
+ * which will be built just afterwards.
+ * Chunk size to test is one p2m page if we are in the middle
+ * of a mfn_list_list mid page and the complete mid page area
+ * if we are at index 0 of the mid page. Please note that a
+ * mid page might cover more than one PMD, e.g. on 32 bit PAE
+ * kernels.
+ */
+ chunk = (pfn & (P2M_PER_PAGE * P2M_MID_PER_PAGE - 1)) ?
+ P2M_PER_PAGE : P2M_PER_PAGE * P2M_MID_PER_PAGE;
+
+ type = xen_p2m_elem_type(pfn);
+ i = 0;
+ if (type != P2M_TYPE_PFN)
+ for (i = 1; i < chunk; i++)
+ if (xen_p2m_elem_type(pfn + i) != type)
+ break;
+ if (i < chunk)
+ /* Reset to minimal chunk size. */
+ chunk = P2M_PER_PAGE;
+
+ if (type == P2M_TYPE_PFN || i < chunk) {
+ /* Use initial p2m page contents. */
+#ifdef CONFIG_X86_64
+ mfns = alloc_p2m_page();
+ copy_page(mfns, xen_p2m_addr + pfn);
+#else
+ mfns = xen_p2m_addr + pfn;
+#endif
+ ptep = populate_extra_pte((unsigned long)(p2m + pfn));
+ set_pte(ptep,
+ pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL));
+ continue;
+ }
+
+ if (chunk == P2M_PER_PAGE) {
+ /* Map complete missing or identity p2m-page. */
+ mfns = (type == P2M_TYPE_MISSING) ?
+ p2m_missing : p2m_identity;
+ ptep = populate_extra_pte((unsigned long)(p2m + pfn));
+ set_pte(ptep,
+ pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL_RO));
+ continue;
+ }
+
+ /* Complete missing or identity PMD(s) can be mapped. */
+ ptep = (type == P2M_TYPE_MISSING) ?
+ p2m_missing_pte : p2m_identity_pte;
+ for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
+ pmdp = populate_extra_pmd(
+ (unsigned long)(p2m + pfn) + i * PMD_SIZE);
+ set_pmd(pmdp, __pmd(__pa(ptep) | _KERNPG_TABLE));
+ }
+ }
+}
+
+void __init xen_vmalloc_p2m_tree(void)
+{
+ static struct vm_struct vm;
+ unsigned long p2m_limit;
+
+ xen_p2m_last_pfn = xen_max_p2m_pfn;
+
+ p2m_limit = (phys_addr_t)P2M_LIMIT * 1024 * 1024 * 1024 / PAGE_SIZE;
+ vm.flags = VM_ALLOC;
+ vm.size = ALIGN(sizeof(unsigned long) * max(xen_max_p2m_pfn, p2m_limit),
+ PMD_SIZE * PMDS_PER_MID_PAGE);
+ vm_area_register_early(&vm, PMD_SIZE * PMDS_PER_MID_PAGE);
+ pr_notice("p2m virtual area at %p, size is %lx\n", vm.addr, vm.size);
+
+ xen_max_p2m_pfn = vm.size / sizeof(unsigned long);
+
+ xen_rebuild_p2m_list(vm.addr);
+
+ xen_p2m_addr = vm.addr;
+ xen_p2m_size = xen_max_p2m_pfn;
+
+ xen_inv_extra_mem();
+}
+
+unsigned long get_phys_to_machine(unsigned long pfn)
+{
+ pte_t *ptep;
+ unsigned int level;
+
+ if (unlikely(pfn >= xen_p2m_size)) {
+ if (pfn < xen_max_p2m_pfn)
+ return xen_chk_extra_mem(pfn);
+
+ return IDENTITY_FRAME(pfn);
+ }
+
+ ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), &level);
+ BUG_ON(!ptep || level != PG_LEVEL_4K);
+
+ /*
+ * The INVALID_P2M_ENTRY is filled in both p2m_*identity
+ * and in p2m_*missing, so returning the INVALID_P2M_ENTRY
+ * would be wrong.
+ */
+ if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_identity)))
+ return IDENTITY_FRAME(pfn);
+
+ return xen_p2m_addr[pfn];
+}
+EXPORT_SYMBOL_GPL(get_phys_to_machine);
+
+/*
+ * Allocate new pmd(s). It is checked whether the old pmd is still in place.
+ * If not, nothing is changed. This is okay as the only reason for allocating
+ * a new pmd is to replace p2m_missing_pte or p2m_identity_pte by a individual
+ * pmd. In case of PAE/x86-32 there are multiple pmds to allocate!
+ */
+static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
+{
+ pte_t *ptechk;
+ pte_t *pte_newpg[PMDS_PER_MID_PAGE];
+ pmd_t *pmdp;
+ unsigned int level;
+ unsigned long flags;
+ unsigned long vaddr;
+ int i;
+
+ /* Do all allocations first to bail out in error case. */
+ for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
+ pte_newpg[i] = alloc_p2m_page();
+ if (!pte_newpg[i]) {
+ for (i--; i >= 0; i--)
+ free_p2m_page(pte_newpg[i]);
+
+ return NULL;
+ }
+ }
+
+ vaddr = addr & ~(PMD_SIZE * PMDS_PER_MID_PAGE - 1);
+
+ for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
+ copy_page(pte_newpg[i], pte_pg);
+ paravirt_alloc_pte(&init_mm, __pa(pte_newpg[i]) >> PAGE_SHIFT);
+
+ pmdp = lookup_pmd_address(vaddr);
+ BUG_ON(!pmdp);
+
+ spin_lock_irqsave(&p2m_update_lock, flags);
+
+ ptechk = lookup_address(vaddr, &level);
+ if (ptechk == pte_pg) {
+ HYPERVISOR_shared_info->arch.p2m_generation++;
+ wmb(); /* Tools are synchronizing via p2m_generation. */
+ set_pmd(pmdp,
+ __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE));
+ wmb(); /* Tools are synchronizing via p2m_generation. */
+ HYPERVISOR_shared_info->arch.p2m_generation++;
+ pte_newpg[i] = NULL;
+ }
+
+ spin_unlock_irqrestore(&p2m_update_lock, flags);
+
+ if (pte_newpg[i]) {
+ paravirt_release_pte(__pa(pte_newpg[i]) >> PAGE_SHIFT);
+ free_p2m_page(pte_newpg[i]);
+ }
+
+ vaddr += PMD_SIZE;
+ }
+
+ return lookup_address(addr, &level);
+}
+
+/*
+ * Fully allocate the p2m structure for a given pfn. We need to check
+ * that both the top and mid levels are allocated, and make sure the
+ * parallel mfn tree is kept in sync. We may race with other cpus, so
+ * the new pages are installed with cmpxchg; if we lose the race then
+ * simply free the page we allocated and use the one that's there.
+ */
+int xen_alloc_p2m_entry(unsigned long pfn)
+{
+ unsigned topidx;
+ unsigned long *top_mfn_p, *mid_mfn;
+ pte_t *ptep, *pte_pg;
+ unsigned int level;
+ unsigned long flags;
+ unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
+ unsigned long p2m_pfn;
+
+ ptep = lookup_address(addr, &level);
+ BUG_ON(!ptep || level != PG_LEVEL_4K);
+ pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
+
+ if (pte_pg == p2m_missing_pte || pte_pg == p2m_identity_pte) {
+ /* PMD level is missing, allocate a new one */
+ ptep = alloc_p2m_pmd(addr, pte_pg);
+ if (!ptep)
+ return -ENOMEM;
+ }
+
+ if (p2m_top_mfn && pfn < MAX_P2M_PFN) {
+ topidx = p2m_top_index(pfn);
+ top_mfn_p = &p2m_top_mfn[topidx];
+ mid_mfn = READ_ONCE(p2m_top_mfn_p[topidx]);
+
+ BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
+
+ if (mid_mfn == p2m_mid_missing_mfn) {
+ /* Separately check the mid mfn level */
+ unsigned long missing_mfn;
+ unsigned long mid_mfn_mfn;
+ unsigned long old_mfn;
+
+ mid_mfn = alloc_p2m_page();
+ if (!mid_mfn)
+ return -ENOMEM;
+
+ p2m_mid_mfn_init(mid_mfn, p2m_missing);
+
+ missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
+ mid_mfn_mfn = virt_to_mfn(mid_mfn);
+ old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn);
+ if (old_mfn != missing_mfn) {
+ free_p2m_page(mid_mfn);
+ mid_mfn = mfn_to_virt(old_mfn);
+ } else {
+ p2m_top_mfn_p[topidx] = mid_mfn;
+ }
+ }
+ } else {
+ mid_mfn = NULL;
+ }
+
+ p2m_pfn = pte_pfn(READ_ONCE(*ptep));
+ if (p2m_pfn == PFN_DOWN(__pa(p2m_identity)) ||
+ p2m_pfn == PFN_DOWN(__pa(p2m_missing))) {
+ /* p2m leaf page is missing */
+ unsigned long *p2m;
+
+ p2m = alloc_p2m_page();
+ if (!p2m)
+ return -ENOMEM;
+
+ if (p2m_pfn == PFN_DOWN(__pa(p2m_missing)))
+ p2m_init(p2m);
+ else
+ p2m_init_identity(p2m, pfn & ~(P2M_PER_PAGE - 1));
+
+ spin_lock_irqsave(&p2m_update_lock, flags);
+
+ if (pte_pfn(*ptep) == p2m_pfn) {
+ HYPERVISOR_shared_info->arch.p2m_generation++;
+ wmb(); /* Tools are synchronizing via p2m_generation. */
+ set_pte(ptep,
+ pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL));
+ wmb(); /* Tools are synchronizing via p2m_generation. */
+ HYPERVISOR_shared_info->arch.p2m_generation++;
+ if (mid_mfn)
+ mid_mfn[p2m_mid_index(pfn)] = virt_to_mfn(p2m);
+ p2m = NULL;
+ }
+
+ spin_unlock_irqrestore(&p2m_update_lock, flags);
+
+ if (p2m)
+ free_p2m_page(p2m);
+ }
+
+ /* Expanded the p2m? */
+ if (pfn >= xen_p2m_last_pfn) {
+ xen_p2m_last_pfn = ALIGN(pfn + 1, P2M_PER_PAGE);
+ HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(xen_alloc_p2m_entry);
+
+unsigned long __init set_phys_range_identity(unsigned long pfn_s,
+ unsigned long pfn_e)
+{
+ unsigned long pfn;
+
+ if (unlikely(pfn_s >= xen_p2m_size))
+ return 0;
+
+ if (pfn_s > pfn_e)
+ return 0;
+
+ if (pfn_e > xen_p2m_size)
+ pfn_e = xen_p2m_size;
+
+ for (pfn = pfn_s; pfn < pfn_e; pfn++)
+ xen_p2m_addr[pfn] = IDENTITY_FRAME(pfn);
+
+ return pfn - pfn_s;
+}
+
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ pte_t *ptep;
+ unsigned int level;
+
+ if (unlikely(pfn >= xen_p2m_size)) {
+ BUG_ON(mfn != INVALID_P2M_ENTRY);
+ return true;
+ }
+
+ /*
+ * The interface requires atomic updates on p2m elements.
+ * xen_safe_write_ulong() is using __put_user which does an atomic
+ * store via asm().
+ */
+ if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn)))
+ return true;
+
+ ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), &level);
+ BUG_ON(!ptep || level != PG_LEVEL_4K);
+
+ if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_missing)))
+ return mfn == INVALID_P2M_ENTRY;
+
+ if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_identity)))
+ return mfn == IDENTITY_FRAME(pfn);
+
+ return false;
+}
+
+bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
+ int ret;
+
+ ret = xen_alloc_p2m_entry(pfn);
+ if (ret < 0)
+ return false;
+
+ return __set_phys_to_machine(pfn, mfn);
+ }
+
+ return true;
+}
+
+int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
+ struct gnttab_map_grant_ref *kmap_ops,
+ struct page **pages, unsigned int count)
+{
+ int i, ret = 0;
+ pte_t *pte;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+ if (kmap_ops) {
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+ kmap_ops, count);
+ if (ret)
+ goto out;
+ }
+
+ for (i = 0; i < count; i++) {
+ unsigned long mfn, pfn;
+ struct gnttab_unmap_grant_ref unmap[2];
+ int rc;
+
+ /* Do not add to override if the map failed. */
+ if (map_ops[i].status != GNTST_okay ||
+ (kmap_ops && kmap_ops[i].status != GNTST_okay))
+ continue;
+
+ if (map_ops[i].flags & GNTMAP_contains_pte) {
+ pte = (pte_t *)(mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
+ (map_ops[i].host_addr & ~PAGE_MASK));
+ mfn = pte_mfn(*pte);
+ } else {
+ mfn = PFN_DOWN(map_ops[i].dev_bus_addr);
+ }
+ pfn = page_to_pfn(pages[i]);
+
+ WARN(pfn_to_mfn(pfn) != INVALID_P2M_ENTRY, "page must be ballooned");
+
+ if (likely(set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
+ continue;
+
+ /*
+ * Signal an error for this slot. This in turn requires
+ * immediate unmapping.
+ */
+ map_ops[i].status = GNTST_general_error;
+ unmap[0].host_addr = map_ops[i].host_addr,
+ unmap[0].handle = map_ops[i].handle;
+ map_ops[i].handle = ~0;
+ if (map_ops[i].flags & GNTMAP_device_map)
+ unmap[0].dev_bus_addr = map_ops[i].dev_bus_addr;
+ else
+ unmap[0].dev_bus_addr = 0;
+
+ if (kmap_ops) {
+ kmap_ops[i].status = GNTST_general_error;
+ unmap[1].host_addr = kmap_ops[i].host_addr,
+ unmap[1].handle = kmap_ops[i].handle;
+ kmap_ops[i].handle = ~0;
+ if (kmap_ops[i].flags & GNTMAP_device_map)
+ unmap[1].dev_bus_addr = kmap_ops[i].dev_bus_addr;
+ else
+ unmap[1].dev_bus_addr = 0;
+ }
+
+ /*
+ * Pre-populate both status fields, to be recognizable in
+ * the log message below.
+ */
+ unmap[0].status = 1;
+ unmap[1].status = 1;
+
+ rc = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+ unmap, 1 + !!kmap_ops);
+ if (rc || unmap[0].status != GNTST_okay ||
+ unmap[1].status != GNTST_okay)
+ pr_err_once("gnttab unmap failed: rc=%d st0=%d st1=%d\n",
+ rc, unmap[0].status, unmap[1].status);
+ }
+
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping);
+
+int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
+ struct gnttab_unmap_grant_ref *kunmap_ops,
+ struct page **pages, unsigned int count)
+{
+ int i, ret = 0;
+
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
+
+ for (i = 0; i < count; i++) {
+ unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i]));
+ unsigned long pfn = page_to_pfn(pages[i]);
+
+ if (mfn != INVALID_P2M_ENTRY && (mfn & FOREIGN_FRAME_BIT))
+ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ else
+ ret = -EINVAL;
+ }
+ if (kunmap_ops)
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+ kunmap_ops, count) ?: ret;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping);
+
+#ifdef CONFIG_XEN_DEBUG_FS
+#include <linux/debugfs.h>
+#include "debugfs.h"
+static int p2m_dump_show(struct seq_file *m, void *v)
+{
+ static const char * const type_name[] = {
+ [P2M_TYPE_IDENTITY] = "identity",
+ [P2M_TYPE_MISSING] = "missing",
+ [P2M_TYPE_PFN] = "pfn",
+ [P2M_TYPE_UNKNOWN] = "abnormal"};
+ unsigned long pfn, first_pfn;
+ int type, prev_type;
+
+ prev_type = xen_p2m_elem_type(0);
+ first_pfn = 0;
+
+ for (pfn = 0; pfn < xen_p2m_size; pfn++) {
+ type = xen_p2m_elem_type(pfn);
+ if (type != prev_type) {
+ seq_printf(m, " [0x%lx->0x%lx] %s\n", first_pfn, pfn,
+ type_name[prev_type]);
+ prev_type = type;
+ first_pfn = pfn;
+ }
+ }
+ seq_printf(m, " [0x%lx->0x%lx] %s\n", first_pfn, pfn,
+ type_name[prev_type]);
+ return 0;
+}
+
+static int p2m_dump_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, p2m_dump_show, NULL);
+}
+
+static const struct file_operations p2m_dump_fops = {
+ .open = p2m_dump_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *d_mmu_debug;
+
+static int __init xen_p2m_debugfs(void)
+{
+ struct dentry *d_xen = xen_init_debugfs();
+
+ if (d_xen == NULL)
+ return -ENOMEM;
+
+ d_mmu_debug = debugfs_create_dir("mmu", d_xen);
+
+ debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
+ return 0;
+}
+fs_initcall(xen_p2m_debugfs);
+#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
new file mode 100644
index 000000000..37c6056a7
--- /dev/null
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -0,0 +1,94 @@
+/* Glue code to lib/swiotlb-xen.c */
+
+#include <linux/dma-mapping.h>
+#include <linux/pci.h>
+#include <xen/swiotlb-xen.h>
+
+#include <asm/xen/hypervisor.h>
+#include <xen/xen.h>
+#include <asm/iommu_table.h>
+
+
+#include <asm/xen/swiotlb-xen.h>
+#ifdef CONFIG_X86_64
+#include <asm/iommu.h>
+#include <asm/dma.h>
+#endif
+#include <linux/export.h>
+
+int xen_swiotlb __read_mostly;
+
+/*
+ * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary
+ *
+ * This returns non-zero if we are forced to use xen_swiotlb (by the boot
+ * option).
+ */
+int __init pci_xen_swiotlb_detect(void)
+{
+
+ if (!xen_pv_domain())
+ return 0;
+
+ /* If running as PV guest, either iommu=soft, or swiotlb=force will
+ * activate this IOMMU. If running as PV privileged, activate it
+ * irregardless.
+ */
+ if (xen_initial_domain() || swiotlb || swiotlb_force == SWIOTLB_FORCE)
+ xen_swiotlb = 1;
+
+ /* If we are running under Xen, we MUST disable the native SWIOTLB.
+ * Don't worry about swiotlb_force flag activating the native, as
+ * the 'swiotlb' flag is the only one turning it on. */
+ swiotlb = 0;
+
+#ifdef CONFIG_X86_64
+ /* pci_swiotlb_detect_4gb turns on native SWIOTLB if no_iommu == 0
+ * (so no iommu=X command line over-writes).
+ * Considering that PV guests do not want the *native SWIOTLB* but
+ * only Xen SWIOTLB it is not useful to us so set no_iommu=1 here.
+ */
+ if (max_pfn > MAX_DMA32_PFN)
+ no_iommu = 1;
+#endif
+ return xen_swiotlb;
+}
+
+void __init pci_xen_swiotlb_init(void)
+{
+ if (xen_swiotlb) {
+ xen_swiotlb_init(1, true /* early */);
+ dma_ops = &xen_swiotlb_dma_ops;
+
+#ifdef CONFIG_PCI
+ /* Make sure ACS will be enabled */
+ pci_request_acs();
+#endif
+ }
+}
+
+int pci_xen_swiotlb_init_late(void)
+{
+ int rc;
+
+ if (xen_swiotlb)
+ return 0;
+
+ rc = xen_swiotlb_init(1, false /* late */);
+ if (rc)
+ return rc;
+
+ dma_ops = &xen_swiotlb_dma_ops;
+#ifdef CONFIG_PCI
+ /* Make sure ACS will be enabled */
+ pci_request_acs();
+#endif
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late);
+
+IOMMU_INIT_FINISH(pci_xen_swiotlb_detect,
+ NULL,
+ pci_xen_swiotlb_init,
+ NULL);
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
new file mode 100644
index 000000000..184b36922
--- /dev/null
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -0,0 +1,221 @@
+/******************************************************************************
+ * platform-pci-unplug.c
+ *
+ * Xen platform PCI device driver
+ * Copyright (c) 2010, Citrix
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/export.h>
+
+#include <xen/platform_pci.h>
+#include "xen-ops.h"
+
+#define XEN_PLATFORM_ERR_MAGIC -1
+#define XEN_PLATFORM_ERR_PROTOCOL -2
+#define XEN_PLATFORM_ERR_BLACKLIST -3
+
+#ifdef CONFIG_XEN_PVHVM
+/* store the value of xen_emul_unplug after the unplug is done */
+static int xen_platform_pci_unplug;
+static int xen_emul_unplug;
+
+static int check_platform_magic(void)
+{
+ short magic;
+ char protocol;
+
+ magic = inw(XEN_IOPORT_MAGIC);
+ if (magic != XEN_IOPORT_MAGIC_VAL) {
+ printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n");
+ return XEN_PLATFORM_ERR_MAGIC;
+ }
+
+ protocol = inb(XEN_IOPORT_PROTOVER);
+
+ printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n",
+ protocol);
+
+ switch (protocol) {
+ case 1:
+ outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM);
+ outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER);
+ if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) {
+ printk(KERN_ERR "Xen Platform: blacklisted by host\n");
+ return XEN_PLATFORM_ERR_BLACKLIST;
+ }
+ break;
+ default:
+ printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version\n");
+ return XEN_PLATFORM_ERR_PROTOCOL;
+ }
+
+ return 0;
+}
+
+bool xen_has_pv_devices(void)
+{
+ if (!xen_domain())
+ return false;
+
+ /* PV and PVH domains always have them. */
+ if (xen_pv_domain() || xen_pvh_domain())
+ return true;
+
+ /* And user has xen_platform_pci=0 set in guest config as
+ * driver did not modify the value. */
+ if (xen_platform_pci_unplug == 0)
+ return false;
+
+ if (xen_platform_pci_unplug & XEN_UNPLUG_NEVER)
+ return false;
+
+ if (xen_platform_pci_unplug & XEN_UNPLUG_ALL)
+ return true;
+
+ /* This is an odd one - we are going to run legacy
+ * and PV drivers at the same time. */
+ if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY)
+ return true;
+
+ /* And the caller has to follow with xen_pv_{disk,nic}_devices
+ * to be certain which driver can load. */
+ return false;
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_devices);
+
+static bool __xen_has_pv_device(int state)
+{
+ /* HVM domains might or might not */
+ if (xen_hvm_domain() && (xen_platform_pci_unplug & state))
+ return true;
+
+ return xen_has_pv_devices();
+}
+
+bool xen_has_pv_nic_devices(void)
+{
+ return __xen_has_pv_device(XEN_UNPLUG_ALL_NICS | XEN_UNPLUG_ALL);
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_nic_devices);
+
+bool xen_has_pv_disk_devices(void)
+{
+ return __xen_has_pv_device(XEN_UNPLUG_ALL_IDE_DISKS |
+ XEN_UNPLUG_AUX_IDE_DISKS | XEN_UNPLUG_ALL);
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_disk_devices);
+
+/*
+ * This one is odd - it determines whether you want to run PV _and_
+ * legacy (IDE) drivers together. This combination is only possible
+ * under HVM.
+ */
+bool xen_has_pv_and_legacy_disk_devices(void)
+{
+ if (!xen_domain())
+ return false;
+
+ /* N.B. This is only ever used in HVM mode */
+ if (xen_pv_domain())
+ return false;
+
+ if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(xen_has_pv_and_legacy_disk_devices);
+
+void xen_unplug_emulated_devices(void)
+{
+ int r;
+
+ /* PVH guests don't have emulated devices. */
+ if (xen_pvh_domain())
+ return;
+
+ /* user explicitly requested no unplug */
+ if (xen_emul_unplug & XEN_UNPLUG_NEVER)
+ return;
+ /* check the version of the xen platform PCI device */
+ r = check_platform_magic();
+ /* If the version matches enable the Xen platform PCI driver.
+ * Also enable the Xen platform PCI driver if the host does
+ * not support the unplug protocol (XEN_PLATFORM_ERR_MAGIC)
+ * but the user told us that unplugging is unnecessary. */
+ if (r && !(r == XEN_PLATFORM_ERR_MAGIC &&
+ (xen_emul_unplug & XEN_UNPLUG_UNNECESSARY)))
+ return;
+ /* Set the default value of xen_emul_unplug depending on whether or
+ * not the Xen PV frontends and the Xen platform PCI driver have
+ * been compiled for this kernel (modules or built-in are both OK). */
+ if (!xen_emul_unplug) {
+ if (xen_must_unplug_nics()) {
+ printk(KERN_INFO "Netfront and the Xen platform PCI driver have "
+ "been compiled for this kernel: unplug emulated NICs.\n");
+ xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
+ }
+ if (xen_must_unplug_disks()) {
+ printk(KERN_INFO "Blkfront and the Xen platform PCI driver have "
+ "been compiled for this kernel: unplug emulated disks.\n"
+ "You might have to change the root device\n"
+ "from /dev/hd[a-d] to /dev/xvd[a-d]\n"
+ "in your root= kernel command line option\n");
+ xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
+ }
+ }
+ /* Now unplug the emulated devices */
+ if (!(xen_emul_unplug & XEN_UNPLUG_UNNECESSARY))
+ outw(xen_emul_unplug, XEN_IOPORT_UNPLUG);
+ xen_platform_pci_unplug = xen_emul_unplug;
+}
+
+static int __init parse_xen_emul_unplug(char *arg)
+{
+ char *p, *q;
+ int l;
+
+ for (p = arg; p; p = q) {
+ q = strchr(p, ',');
+ if (q) {
+ l = q - p;
+ q++;
+ } else {
+ l = strlen(p);
+ }
+ if (!strncmp(p, "all", l))
+ xen_emul_unplug |= XEN_UNPLUG_ALL;
+ else if (!strncmp(p, "ide-disks", l))
+ xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
+ else if (!strncmp(p, "aux-ide-disks", l))
+ xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS;
+ else if (!strncmp(p, "nics", l))
+ xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
+ else if (!strncmp(p, "unnecessary", l))
+ xen_emul_unplug |= XEN_UNPLUG_UNNECESSARY;
+ else if (!strncmp(p, "never", l))
+ xen_emul_unplug |= XEN_UNPLUG_NEVER;
+ else
+ printk(KERN_WARNING "unrecognised option '%s' "
+ "in parameter 'xen_emul_unplug'\n", p);
+ }
+ return 0;
+}
+early_param("xen_emul_unplug", parse_xen_emul_unplug);
+#endif
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
new file mode 100644
index 000000000..9813298ba
--- /dev/null
+++ b/arch/x86/xen/pmu.c
@@ -0,0 +1,572 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <linux/interrupt.h>
+
+#include <asm/xen/hypercall.h>
+#include <xen/page.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+#include <xen/interface/xenpmu.h>
+
+#include "xen-ops.h"
+#include "pmu.h"
+
+/* x86_pmu.handle_irq definition */
+#include "../events/perf_event.h"
+
+#define XENPMU_IRQ_PROCESSING 1
+struct xenpmu {
+ /* Shared page between hypervisor and domain */
+ struct xen_pmu_data *xenpmu_data;
+
+ uint8_t flags;
+};
+static DEFINE_PER_CPU(struct xenpmu, xenpmu_shared);
+#define get_xenpmu_data() (this_cpu_ptr(&xenpmu_shared)->xenpmu_data)
+#define get_xenpmu_flags() (this_cpu_ptr(&xenpmu_shared)->flags)
+
+/* Macro for computing address of a PMU MSR bank */
+#define field_offset(ctxt, field) ((void *)((uintptr_t)ctxt + \
+ (uintptr_t)ctxt->field))
+
+/* AMD PMU */
+#define F15H_NUM_COUNTERS 6
+#define F10H_NUM_COUNTERS 4
+
+static __read_mostly uint32_t amd_counters_base;
+static __read_mostly uint32_t amd_ctrls_base;
+static __read_mostly int amd_msr_step;
+static __read_mostly int k7_counters_mirrored;
+static __read_mostly int amd_num_counters;
+
+/* Intel PMU */
+#define MSR_TYPE_COUNTER 0
+#define MSR_TYPE_CTRL 1
+#define MSR_TYPE_GLOBAL 2
+#define MSR_TYPE_ARCH_COUNTER 3
+#define MSR_TYPE_ARCH_CTRL 4
+
+/* Number of general pmu registers (CPUID.EAX[0xa].EAX[8..15]) */
+#define PMU_GENERAL_NR_SHIFT 8
+#define PMU_GENERAL_NR_BITS 8
+#define PMU_GENERAL_NR_MASK (((1 << PMU_GENERAL_NR_BITS) - 1) \
+ << PMU_GENERAL_NR_SHIFT)
+
+/* Number of fixed pmu registers (CPUID.EDX[0xa].EDX[0..4]) */
+#define PMU_FIXED_NR_SHIFT 0
+#define PMU_FIXED_NR_BITS 5
+#define PMU_FIXED_NR_MASK (((1 << PMU_FIXED_NR_BITS) - 1) \
+ << PMU_FIXED_NR_SHIFT)
+
+/* Alias registers (0x4c1) for full-width writes to PMCs */
+#define MSR_PMC_ALIAS_MASK (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_PMC0))
+
+#define INTEL_PMC_TYPE_SHIFT 30
+
+static __read_mostly int intel_num_arch_counters, intel_num_fixed_counters;
+
+
+static void xen_pmu_arch_init(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+
+ switch (boot_cpu_data.x86) {
+ case 0x15:
+ amd_num_counters = F15H_NUM_COUNTERS;
+ amd_counters_base = MSR_F15H_PERF_CTR;
+ amd_ctrls_base = MSR_F15H_PERF_CTL;
+ amd_msr_step = 2;
+ k7_counters_mirrored = 1;
+ break;
+ case 0x10:
+ case 0x12:
+ case 0x14:
+ case 0x16:
+ default:
+ amd_num_counters = F10H_NUM_COUNTERS;
+ amd_counters_base = MSR_K7_PERFCTR0;
+ amd_ctrls_base = MSR_K7_EVNTSEL0;
+ amd_msr_step = 1;
+ k7_counters_mirrored = 0;
+ break;
+ }
+ } else {
+ uint32_t eax, ebx, ecx, edx;
+
+ cpuid(0xa, &eax, &ebx, &ecx, &edx);
+
+ intel_num_arch_counters = (eax & PMU_GENERAL_NR_MASK) >>
+ PMU_GENERAL_NR_SHIFT;
+ intel_num_fixed_counters = (edx & PMU_FIXED_NR_MASK) >>
+ PMU_FIXED_NR_SHIFT;
+ }
+}
+
+static inline uint32_t get_fam15h_addr(u32 addr)
+{
+ switch (addr) {
+ case MSR_K7_PERFCTR0:
+ case MSR_K7_PERFCTR1:
+ case MSR_K7_PERFCTR2:
+ case MSR_K7_PERFCTR3:
+ return MSR_F15H_PERF_CTR + (addr - MSR_K7_PERFCTR0);
+ case MSR_K7_EVNTSEL0:
+ case MSR_K7_EVNTSEL1:
+ case MSR_K7_EVNTSEL2:
+ case MSR_K7_EVNTSEL3:
+ return MSR_F15H_PERF_CTL + (addr - MSR_K7_EVNTSEL0);
+ default:
+ break;
+ }
+
+ return addr;
+}
+
+static inline bool is_amd_pmu_msr(unsigned int msr)
+{
+ if ((msr >= MSR_F15H_PERF_CTL &&
+ msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) ||
+ (msr >= MSR_K7_EVNTSEL0 &&
+ msr < MSR_K7_PERFCTR0 + amd_num_counters))
+ return true;
+
+ return false;
+}
+
+static int is_intel_pmu_msr(u32 msr_index, int *type, int *index)
+{
+ u32 msr_index_pmc;
+
+ switch (msr_index) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ case MSR_IA32_DS_AREA:
+ case MSR_IA32_PEBS_ENABLE:
+ *type = MSR_TYPE_CTRL;
+ return true;
+
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ *type = MSR_TYPE_GLOBAL;
+ return true;
+
+ default:
+
+ if ((msr_index >= MSR_CORE_PERF_FIXED_CTR0) &&
+ (msr_index < MSR_CORE_PERF_FIXED_CTR0 +
+ intel_num_fixed_counters)) {
+ *index = msr_index - MSR_CORE_PERF_FIXED_CTR0;
+ *type = MSR_TYPE_COUNTER;
+ return true;
+ }
+
+ if ((msr_index >= MSR_P6_EVNTSEL0) &&
+ (msr_index < MSR_P6_EVNTSEL0 + intel_num_arch_counters)) {
+ *index = msr_index - MSR_P6_EVNTSEL0;
+ *type = MSR_TYPE_ARCH_CTRL;
+ return true;
+ }
+
+ msr_index_pmc = msr_index & MSR_PMC_ALIAS_MASK;
+ if ((msr_index_pmc >= MSR_IA32_PERFCTR0) &&
+ (msr_index_pmc < MSR_IA32_PERFCTR0 +
+ intel_num_arch_counters)) {
+ *type = MSR_TYPE_ARCH_COUNTER;
+ *index = msr_index_pmc - MSR_IA32_PERFCTR0;
+ return true;
+ }
+ return false;
+ }
+}
+
+static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type,
+ int index, bool is_read)
+{
+ uint64_t *reg = NULL;
+ struct xen_pmu_intel_ctxt *ctxt;
+ uint64_t *fix_counters;
+ struct xen_pmu_cntr_pair *arch_cntr_pair;
+ struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+ uint8_t xenpmu_flags = get_xenpmu_flags();
+
+
+ if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
+ return false;
+
+ ctxt = &xenpmu_data->pmu.c.intel;
+
+ switch (msr) {
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ reg = &ctxt->global_ovf_ctrl;
+ break;
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ reg = &ctxt->global_status;
+ break;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ reg = &ctxt->global_ctrl;
+ break;
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ reg = &ctxt->fixed_ctrl;
+ break;
+ default:
+ switch (type) {
+ case MSR_TYPE_COUNTER:
+ fix_counters = field_offset(ctxt, fixed_counters);
+ reg = &fix_counters[index];
+ break;
+ case MSR_TYPE_ARCH_COUNTER:
+ arch_cntr_pair = field_offset(ctxt, arch_counters);
+ reg = &arch_cntr_pair[index].counter;
+ break;
+ case MSR_TYPE_ARCH_CTRL:
+ arch_cntr_pair = field_offset(ctxt, arch_counters);
+ reg = &arch_cntr_pair[index].control;
+ break;
+ default:
+ return false;
+ }
+ }
+
+ if (reg) {
+ if (is_read)
+ *val = *reg;
+ else {
+ *reg = *val;
+
+ if (msr == MSR_CORE_PERF_GLOBAL_OVF_CTRL)
+ ctxt->global_status &= (~(*val));
+ }
+ return true;
+ }
+
+ return false;
+}
+
+static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read)
+{
+ uint64_t *reg = NULL;
+ int i, off = 0;
+ struct xen_pmu_amd_ctxt *ctxt;
+ uint64_t *counter_regs, *ctrl_regs;
+ struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+ uint8_t xenpmu_flags = get_xenpmu_flags();
+
+ if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
+ return false;
+
+ if (k7_counters_mirrored &&
+ ((msr >= MSR_K7_EVNTSEL0) && (msr <= MSR_K7_PERFCTR3)))
+ msr = get_fam15h_addr(msr);
+
+ ctxt = &xenpmu_data->pmu.c.amd;
+ for (i = 0; i < amd_num_counters; i++) {
+ if (msr == amd_ctrls_base + off) {
+ ctrl_regs = field_offset(ctxt, ctrls);
+ reg = &ctrl_regs[i];
+ break;
+ } else if (msr == amd_counters_base + off) {
+ counter_regs = field_offset(ctxt, counters);
+ reg = &counter_regs[i];
+ break;
+ }
+ off += amd_msr_step;
+ }
+
+ if (reg) {
+ if (is_read)
+ *val = *reg;
+ else
+ *reg = *val;
+
+ return true;
+ }
+ return false;
+}
+
+bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ if (is_amd_pmu_msr(msr)) {
+ if (!xen_amd_pmu_emulate(msr, val, 1))
+ *val = native_read_msr_safe(msr, err);
+ return true;
+ }
+ } else {
+ int type, index;
+
+ if (is_intel_pmu_msr(msr, &type, &index)) {
+ if (!xen_intel_pmu_emulate(msr, val, type, index, 1))
+ *val = native_read_msr_safe(msr, err);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err)
+{
+ uint64_t val = ((uint64_t)high << 32) | low;
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ if (is_amd_pmu_msr(msr)) {
+ if (!xen_amd_pmu_emulate(msr, &val, 0))
+ *err = native_write_msr_safe(msr, low, high);
+ return true;
+ }
+ } else {
+ int type, index;
+
+ if (is_intel_pmu_msr(msr, &type, &index)) {
+ if (!xen_intel_pmu_emulate(msr, &val, type, index, 0))
+ *err = native_write_msr_safe(msr, low, high);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static unsigned long long xen_amd_read_pmc(int counter)
+{
+ struct xen_pmu_amd_ctxt *ctxt;
+ uint64_t *counter_regs;
+ struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+ uint8_t xenpmu_flags = get_xenpmu_flags();
+
+ if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
+ uint32_t msr;
+ int err;
+
+ msr = amd_counters_base + (counter * amd_msr_step);
+ return native_read_msr_safe(msr, &err);
+ }
+
+ ctxt = &xenpmu_data->pmu.c.amd;
+ counter_regs = field_offset(ctxt, counters);
+ return counter_regs[counter];
+}
+
+static unsigned long long xen_intel_read_pmc(int counter)
+{
+ struct xen_pmu_intel_ctxt *ctxt;
+ uint64_t *fixed_counters;
+ struct xen_pmu_cntr_pair *arch_cntr_pair;
+ struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+ uint8_t xenpmu_flags = get_xenpmu_flags();
+
+ if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
+ uint32_t msr;
+ int err;
+
+ if (counter & (1 << INTEL_PMC_TYPE_SHIFT))
+ msr = MSR_CORE_PERF_FIXED_CTR0 + (counter & 0xffff);
+ else
+ msr = MSR_IA32_PERFCTR0 + counter;
+
+ return native_read_msr_safe(msr, &err);
+ }
+
+ ctxt = &xenpmu_data->pmu.c.intel;
+ if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) {
+ fixed_counters = field_offset(ctxt, fixed_counters);
+ return fixed_counters[counter & 0xffff];
+ }
+
+ arch_cntr_pair = field_offset(ctxt, arch_counters);
+ return arch_cntr_pair[counter].counter;
+}
+
+unsigned long long xen_read_pmc(int counter)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return xen_amd_read_pmc(counter);
+ else
+ return xen_intel_read_pmc(counter);
+}
+
+int pmu_apic_update(uint32_t val)
+{
+ int ret;
+ struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+
+ if (!xenpmu_data) {
+ pr_warn_once("%s: pmudata not initialized\n", __func__);
+ return -EINVAL;
+ }
+
+ xenpmu_data->pmu.l.lapic_lvtpc = val;
+
+ if (get_xenpmu_flags() & XENPMU_IRQ_PROCESSING)
+ return 0;
+
+ ret = HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set, NULL);
+
+ return ret;
+}
+
+/* perf callbacks */
+static int xen_is_in_guest(void)
+{
+ const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+
+ if (!xenpmu_data) {
+ pr_warn_once("%s: pmudata not initialized\n", __func__);
+ return 0;
+ }
+
+ if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF))
+ return 0;
+
+ return 1;
+}
+
+static int xen_is_user_mode(void)
+{
+ const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+
+ if (!xenpmu_data) {
+ pr_warn_once("%s: pmudata not initialized\n", __func__);
+ return 0;
+ }
+
+ if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV)
+ return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER);
+ else
+ return !!(xenpmu_data->pmu.r.regs.cpl & 3);
+}
+
+static unsigned long xen_get_guest_ip(void)
+{
+ const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+
+ if (!xenpmu_data) {
+ pr_warn_once("%s: pmudata not initialized\n", __func__);
+ return 0;
+ }
+
+ return xenpmu_data->pmu.r.regs.ip;
+}
+
+static struct perf_guest_info_callbacks xen_guest_cbs = {
+ .is_in_guest = xen_is_in_guest,
+ .is_user_mode = xen_is_user_mode,
+ .get_guest_ip = xen_get_guest_ip,
+};
+
+/* Convert registers from Xen's format to Linux' */
+static void xen_convert_regs(const struct xen_pmu_regs *xen_regs,
+ struct pt_regs *regs, uint64_t pmu_flags)
+{
+ regs->ip = xen_regs->ip;
+ regs->cs = xen_regs->cs;
+ regs->sp = xen_regs->sp;
+
+ if (pmu_flags & PMU_SAMPLE_PV) {
+ if (pmu_flags & PMU_SAMPLE_USER)
+ regs->cs |= 3;
+ else
+ regs->cs &= ~3;
+ } else {
+ if (xen_regs->cpl)
+ regs->cs |= 3;
+ else
+ regs->cs &= ~3;
+ }
+}
+
+irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id)
+{
+ int err, ret = IRQ_NONE;
+ struct pt_regs regs = {0};
+ const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+ uint8_t xenpmu_flags = get_xenpmu_flags();
+
+ if (!xenpmu_data) {
+ pr_warn_once("%s: pmudata not initialized\n", __func__);
+ return ret;
+ }
+
+ this_cpu_ptr(&xenpmu_shared)->flags =
+ xenpmu_flags | XENPMU_IRQ_PROCESSING;
+ xen_convert_regs(&xenpmu_data->pmu.r.regs, &regs,
+ xenpmu_data->pmu.pmu_flags);
+ if (x86_pmu.handle_irq(&regs))
+ ret = IRQ_HANDLED;
+
+ /* Write out cached context to HW */
+ err = HYPERVISOR_xenpmu_op(XENPMU_flush, NULL);
+ this_cpu_ptr(&xenpmu_shared)->flags = xenpmu_flags;
+ if (err) {
+ pr_warn_once("%s: failed hypercall, err: %d\n", __func__, err);
+ return IRQ_NONE;
+ }
+
+ return ret;
+}
+
+bool is_xen_pmu;
+
+void xen_pmu_init(int cpu)
+{
+ int err;
+ struct xen_pmu_params xp;
+ unsigned long pfn;
+ struct xen_pmu_data *xenpmu_data;
+
+ BUILD_BUG_ON(sizeof(struct xen_pmu_data) > PAGE_SIZE);
+
+ if (xen_hvm_domain() || (cpu != 0 && !is_xen_pmu))
+ return;
+
+ xenpmu_data = (struct xen_pmu_data *)get_zeroed_page(GFP_KERNEL);
+ if (!xenpmu_data) {
+ pr_err("VPMU init: No memory\n");
+ return;
+ }
+ pfn = virt_to_pfn(xenpmu_data);
+
+ xp.val = pfn_to_mfn(pfn);
+ xp.vcpu = cpu;
+ xp.version.maj = XENPMU_VER_MAJ;
+ xp.version.min = XENPMU_VER_MIN;
+ err = HYPERVISOR_xenpmu_op(XENPMU_init, &xp);
+ if (err)
+ goto fail;
+
+ per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data;
+ per_cpu(xenpmu_shared, cpu).flags = 0;
+
+ if (!is_xen_pmu) {
+ is_xen_pmu = true;
+ perf_register_guest_info_callbacks(&xen_guest_cbs);
+ xen_pmu_arch_init();
+ }
+
+ return;
+
+fail:
+ if (err == -EOPNOTSUPP || err == -ENOSYS)
+ pr_info_once("VPMU disabled by hypervisor.\n");
+ else
+ pr_info_once("Could not initialize VPMU for cpu %d, error %d\n",
+ cpu, err);
+ free_pages((unsigned long)xenpmu_data, 0);
+}
+
+void xen_pmu_finish(int cpu)
+{
+ struct xen_pmu_params xp;
+
+ if (xen_hvm_domain())
+ return;
+
+ xp.vcpu = cpu;
+ xp.version.maj = XENPMU_VER_MAJ;
+ xp.version.min = XENPMU_VER_MIN;
+
+ (void)HYPERVISOR_xenpmu_op(XENPMU_finish, &xp);
+
+ free_pages((unsigned long)per_cpu(xenpmu_shared, cpu).xenpmu_data, 0);
+ per_cpu(xenpmu_shared, cpu).xenpmu_data = NULL;
+}
diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h
new file mode 100644
index 000000000..65c58894f
--- /dev/null
+++ b/arch/x86/xen/pmu.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __XEN_PMU_H
+#define __XEN_PMU_H
+
+#include <xen/interface/xenpmu.h>
+
+extern bool is_xen_pmu;
+
+irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id);
+#ifdef CONFIG_XEN_HAVE_VPMU
+void xen_pmu_init(int cpu);
+void xen_pmu_finish(int cpu);
+#else
+static inline void xen_pmu_init(int cpu) {}
+static inline void xen_pmu_finish(int cpu) {}
+#endif
+bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err);
+bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);
+int pmu_apic_update(uint32_t reg);
+unsigned long long xen_read_pmc(int counter);
+
+#endif /* __XEN_PMU_H */
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
new file mode 100644
index 000000000..075ed4799
--- /dev/null
+++ b/arch/x86/xen/setup.c
@@ -0,0 +1,1016 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Machine specific setup for xen
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/pm.h>
+#include <linux/memblock.h>
+#include <linux/cpuidle.h>
+#include <linux/cpufreq.h>
+
+#include <asm/elf.h>
+#include <asm/vdso.h>
+#include <asm/e820/api.h>
+#include <asm/setup.h>
+#include <asm/acpi.h>
+#include <asm/numa.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/page.h>
+#include <xen/interface/callback.h>
+#include <xen/interface/memory.h>
+#include <xen/interface/physdev.h>
+#include <xen/features.h>
+#include <xen/hvc-console.h>
+#include "xen-ops.h"
+#include "vdso.h"
+#include "mmu.h"
+
+#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
+
+/* Amount of extra memory space we add to the e820 ranges */
+struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
+
+/* Number of pages released from the initial allocation. */
+unsigned long xen_released_pages;
+
+/* E820 map used during setting up memory. */
+static struct e820_table xen_e820_table __initdata;
+
+/*
+ * Buffer used to remap identity mapped pages. We only need the virtual space.
+ * The physical page behind this address is remapped as needed to different
+ * buffer pages.
+ */
+#define REMAP_SIZE (P2M_PER_PAGE - 3)
+static struct {
+ unsigned long next_area_mfn;
+ unsigned long target_pfn;
+ unsigned long size;
+ unsigned long mfns[REMAP_SIZE];
+} xen_remap_buf __initdata __aligned(PAGE_SIZE);
+static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
+
+/*
+ * The maximum amount of extra memory compared to the base size. The
+ * main scaling factor is the size of struct page. At extreme ratios
+ * of base:extra, all the base memory can be filled with page
+ * structures for the extra memory, leaving no space for anything
+ * else.
+ *
+ * 10x seems like a reasonable balance between scaling flexibility and
+ * leaving a practically usable system.
+ */
+#define EXTRA_MEM_RATIO (10)
+
+static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
+
+static void __init xen_parse_512gb(void)
+{
+ bool val = false;
+ char *arg;
+
+ arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit");
+ if (!arg)
+ return;
+
+ arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit=");
+ if (!arg)
+ val = true;
+ else if (strtobool(arg + strlen("xen_512gb_limit="), &val))
+ return;
+
+ xen_512gb_limit = val;
+}
+
+static void __init xen_add_extra_mem(unsigned long start_pfn,
+ unsigned long n_pfns)
+{
+ int i;
+
+ /*
+ * No need to check for zero size, should happen rarely and will only
+ * write a new entry regarded to be unused due to zero size.
+ */
+ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+ /* Add new region. */
+ if (xen_extra_mem[i].n_pfns == 0) {
+ xen_extra_mem[i].start_pfn = start_pfn;
+ xen_extra_mem[i].n_pfns = n_pfns;
+ break;
+ }
+ /* Append to existing region. */
+ if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
+ start_pfn) {
+ xen_extra_mem[i].n_pfns += n_pfns;
+ break;
+ }
+ }
+ if (i == XEN_EXTRA_MEM_MAX_REGIONS)
+ printk(KERN_WARNING "Warning: not enough extra memory regions\n");
+
+ memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
+}
+
+static void __init xen_del_extra_mem(unsigned long start_pfn,
+ unsigned long n_pfns)
+{
+ int i;
+ unsigned long start_r, size_r;
+
+ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+ start_r = xen_extra_mem[i].start_pfn;
+ size_r = xen_extra_mem[i].n_pfns;
+
+ /* Start of region. */
+ if (start_r == start_pfn) {
+ BUG_ON(n_pfns > size_r);
+ xen_extra_mem[i].start_pfn += n_pfns;
+ xen_extra_mem[i].n_pfns -= n_pfns;
+ break;
+ }
+ /* End of region. */
+ if (start_r + size_r == start_pfn + n_pfns) {
+ BUG_ON(n_pfns > size_r);
+ xen_extra_mem[i].n_pfns -= n_pfns;
+ break;
+ }
+ /* Mid of region. */
+ if (start_pfn > start_r && start_pfn < start_r + size_r) {
+ BUG_ON(start_pfn + n_pfns > start_r + size_r);
+ xen_extra_mem[i].n_pfns = start_pfn - start_r;
+ /* Calling memblock_reserve() again is okay. */
+ xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r -
+ (start_pfn + n_pfns));
+ break;
+ }
+ }
+ memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
+}
+
+/*
+ * Called during boot before the p2m list can take entries beyond the
+ * hypervisor supplied p2m list. Entries in extra mem are to be regarded as
+ * invalid.
+ */
+unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
+{
+ int i;
+
+ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+ if (pfn >= xen_extra_mem[i].start_pfn &&
+ pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns)
+ return INVALID_P2M_ENTRY;
+ }
+
+ return IDENTITY_FRAME(pfn);
+}
+
+/*
+ * Mark all pfns of extra mem as invalid in p2m list.
+ */
+void __init xen_inv_extra_mem(void)
+{
+ unsigned long pfn, pfn_s, pfn_e;
+ int i;
+
+ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+ if (!xen_extra_mem[i].n_pfns)
+ continue;
+ pfn_s = xen_extra_mem[i].start_pfn;
+ pfn_e = pfn_s + xen_extra_mem[i].n_pfns;
+ for (pfn = pfn_s; pfn < pfn_e; pfn++)
+ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ }
+}
+
+/*
+ * Finds the next RAM pfn available in the E820 map after min_pfn.
+ * This function updates min_pfn with the pfn found and returns
+ * the size of that range or zero if not found.
+ */
+static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
+{
+ const struct e820_entry *entry = xen_e820_table.entries;
+ unsigned int i;
+ unsigned long done = 0;
+
+ for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
+ unsigned long s_pfn;
+ unsigned long e_pfn;
+
+ if (entry->type != E820_TYPE_RAM)
+ continue;
+
+ e_pfn = PFN_DOWN(entry->addr + entry->size);
+
+ /* We only care about E820 after this */
+ if (e_pfn <= *min_pfn)
+ continue;
+
+ s_pfn = PFN_UP(entry->addr);
+
+ /* If min_pfn falls within the E820 entry, we want to start
+ * at the min_pfn PFN.
+ */
+ if (s_pfn <= *min_pfn) {
+ done = e_pfn - *min_pfn;
+ } else {
+ done = e_pfn - s_pfn;
+ *min_pfn = s_pfn;
+ }
+ break;
+ }
+
+ return done;
+}
+
+static int __init xen_free_mfn(unsigned long mfn)
+{
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+
+ set_xen_guest_handle(reservation.extent_start, &mfn);
+ reservation.nr_extents = 1;
+
+ return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
+}
+
+/*
+ * This releases a chunk of memory and then does the identity map. It's used
+ * as a fallback if the remapping fails.
+ */
+static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
+ unsigned long end_pfn, unsigned long nr_pages)
+{
+ unsigned long pfn, end;
+ int ret;
+
+ WARN_ON(start_pfn > end_pfn);
+
+ /* Release pages first. */
+ end = min(end_pfn, nr_pages);
+ for (pfn = start_pfn; pfn < end; pfn++) {
+ unsigned long mfn = pfn_to_mfn(pfn);
+
+ /* Make sure pfn exists to start with */
+ if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
+ continue;
+
+ ret = xen_free_mfn(mfn);
+ WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
+
+ if (ret == 1) {
+ xen_released_pages++;
+ if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
+ break;
+ } else
+ break;
+ }
+
+ set_phys_range_identity(start_pfn, end_pfn);
+}
+
+/*
+ * Helper function to update the p2m and m2p tables and kernel mapping.
+ */
+static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
+{
+ struct mmu_update update = {
+ .ptr = ((uint64_t)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
+ .val = pfn
+ };
+
+ /* Update p2m */
+ if (!set_phys_to_machine(pfn, mfn)) {
+ WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
+ pfn, mfn);
+ BUG();
+ }
+
+ /* Update m2p */
+ if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
+ WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
+ mfn, pfn);
+ BUG();
+ }
+
+ /* Update kernel mapping, but not for highmem. */
+ if (pfn >= PFN_UP(__pa(high_memory - 1)))
+ return;
+
+ if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
+ mfn_pte(mfn, PAGE_KERNEL), 0)) {
+ WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n",
+ mfn, pfn);
+ BUG();
+ }
+}
+
+/*
+ * This function updates the p2m and m2p tables with an identity map from
+ * start_pfn to start_pfn+size and prepares remapping the underlying RAM of the
+ * original allocation at remap_pfn. The information needed for remapping is
+ * saved in the memory itself to avoid the need for allocating buffers. The
+ * complete remap information is contained in a list of MFNs each containing
+ * up to REMAP_SIZE MFNs and the start target PFN for doing the remap.
+ * This enables us to preserve the original mfn sequence while doing the
+ * remapping at a time when the memory management is capable of allocating
+ * virtual and physical memory in arbitrary amounts, see 'xen_remap_memory' and
+ * its callers.
+ */
+static void __init xen_do_set_identity_and_remap_chunk(
+ unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
+{
+ unsigned long buf = (unsigned long)&xen_remap_buf;
+ unsigned long mfn_save, mfn;
+ unsigned long ident_pfn_iter, remap_pfn_iter;
+ unsigned long ident_end_pfn = start_pfn + size;
+ unsigned long left = size;
+ unsigned int i, chunk;
+
+ WARN_ON(size == 0);
+
+ mfn_save = virt_to_mfn(buf);
+
+ for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn;
+ ident_pfn_iter < ident_end_pfn;
+ ident_pfn_iter += REMAP_SIZE, remap_pfn_iter += REMAP_SIZE) {
+ chunk = (left < REMAP_SIZE) ? left : REMAP_SIZE;
+
+ /* Map first pfn to xen_remap_buf */
+ mfn = pfn_to_mfn(ident_pfn_iter);
+ set_pte_mfn(buf, mfn, PAGE_KERNEL);
+
+ /* Save mapping information in page */
+ xen_remap_buf.next_area_mfn = xen_remap_mfn;
+ xen_remap_buf.target_pfn = remap_pfn_iter;
+ xen_remap_buf.size = chunk;
+ for (i = 0; i < chunk; i++)
+ xen_remap_buf.mfns[i] = pfn_to_mfn(ident_pfn_iter + i);
+
+ /* Put remap buf into list. */
+ xen_remap_mfn = mfn;
+
+ /* Set identity map */
+ set_phys_range_identity(ident_pfn_iter, ident_pfn_iter + chunk);
+
+ left -= chunk;
+ }
+
+ /* Restore old xen_remap_buf mapping */
+ set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
+}
+
+/*
+ * This function takes a contiguous pfn range that needs to be identity mapped
+ * and:
+ *
+ * 1) Finds a new range of pfns to use to remap based on E820 and remap_pfn.
+ * 2) Calls the do_ function to actually do the mapping/remapping work.
+ *
+ * The goal is to not allocate additional memory but to remap the existing
+ * pages. In the case of an error the underlying memory is simply released back
+ * to Xen and not remapped.
+ */
+static unsigned long __init xen_set_identity_and_remap_chunk(
+ unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
+ unsigned long remap_pfn)
+{
+ unsigned long pfn;
+ unsigned long i = 0;
+ unsigned long n = end_pfn - start_pfn;
+
+ if (remap_pfn == 0)
+ remap_pfn = nr_pages;
+
+ while (i < n) {
+ unsigned long cur_pfn = start_pfn + i;
+ unsigned long left = n - i;
+ unsigned long size = left;
+ unsigned long remap_range_size;
+
+ /* Do not remap pages beyond the current allocation */
+ if (cur_pfn >= nr_pages) {
+ /* Identity map remaining pages */
+ set_phys_range_identity(cur_pfn, cur_pfn + size);
+ break;
+ }
+ if (cur_pfn + size > nr_pages)
+ size = nr_pages - cur_pfn;
+
+ remap_range_size = xen_find_pfn_range(&remap_pfn);
+ if (!remap_range_size) {
+ pr_warning("Unable to find available pfn range, not remapping identity pages\n");
+ xen_set_identity_and_release_chunk(cur_pfn,
+ cur_pfn + left, nr_pages);
+ break;
+ }
+ /* Adjust size to fit in current e820 RAM region */
+ if (size > remap_range_size)
+ size = remap_range_size;
+
+ xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn);
+
+ /* Update variables to reflect new mappings. */
+ i += size;
+ remap_pfn += size;
+ }
+
+ /*
+ * If the PFNs are currently mapped, the VA mapping also needs
+ * to be updated to be 1:1.
+ */
+ for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
+ (void)HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ mfn_pte(pfn, PAGE_KERNEL_IO), 0);
+
+ return remap_pfn;
+}
+
+static unsigned long __init xen_count_remap_pages(
+ unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
+ unsigned long remap_pages)
+{
+ if (start_pfn >= nr_pages)
+ return remap_pages;
+
+ return remap_pages + min(end_pfn, nr_pages) - start_pfn;
+}
+
+static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages,
+ unsigned long (*func)(unsigned long start_pfn, unsigned long end_pfn,
+ unsigned long nr_pages, unsigned long last_val))
+{
+ phys_addr_t start = 0;
+ unsigned long ret_val = 0;
+ const struct e820_entry *entry = xen_e820_table.entries;
+ int i;
+
+ /*
+ * Combine non-RAM regions and gaps until a RAM region (or the
+ * end of the map) is reached, then call the provided function
+ * to perform its duty on the non-RAM region.
+ *
+ * The combined non-RAM regions are rounded to a whole number
+ * of pages so any partial pages are accessible via the 1:1
+ * mapping. This is needed for some BIOSes that put (for
+ * example) the DMI tables in a reserved region that begins on
+ * a non-page boundary.
+ */
+ for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
+ phys_addr_t end = entry->addr + entry->size;
+ if (entry->type == E820_TYPE_RAM || i == xen_e820_table.nr_entries - 1) {
+ unsigned long start_pfn = PFN_DOWN(start);
+ unsigned long end_pfn = PFN_UP(end);
+
+ if (entry->type == E820_TYPE_RAM)
+ end_pfn = PFN_UP(entry->addr);
+
+ if (start_pfn < end_pfn)
+ ret_val = func(start_pfn, end_pfn, nr_pages,
+ ret_val);
+ start = end;
+ }
+ }
+
+ return ret_val;
+}
+
+/*
+ * Remap the memory prepared in xen_do_set_identity_and_remap_chunk().
+ * The remap information (which mfn remap to which pfn) is contained in the
+ * to be remapped memory itself in a linked list anchored at xen_remap_mfn.
+ * This scheme allows to remap the different chunks in arbitrary order while
+ * the resulting mapping will be independant from the order.
+ */
+void __init xen_remap_memory(void)
+{
+ unsigned long buf = (unsigned long)&xen_remap_buf;
+ unsigned long mfn_save, pfn;
+ unsigned long remapped = 0;
+ unsigned int i;
+ unsigned long pfn_s = ~0UL;
+ unsigned long len = 0;
+
+ mfn_save = virt_to_mfn(buf);
+
+ while (xen_remap_mfn != INVALID_P2M_ENTRY) {
+ /* Map the remap information */
+ set_pte_mfn(buf, xen_remap_mfn, PAGE_KERNEL);
+
+ BUG_ON(xen_remap_mfn != xen_remap_buf.mfns[0]);
+
+ pfn = xen_remap_buf.target_pfn;
+ for (i = 0; i < xen_remap_buf.size; i++) {
+ xen_update_mem_tables(pfn, xen_remap_buf.mfns[i]);
+ remapped++;
+ pfn++;
+ }
+ if (pfn_s == ~0UL || pfn == pfn_s) {
+ pfn_s = xen_remap_buf.target_pfn;
+ len += xen_remap_buf.size;
+ } else if (pfn_s + len == xen_remap_buf.target_pfn) {
+ len += xen_remap_buf.size;
+ } else {
+ xen_del_extra_mem(pfn_s, len);
+ pfn_s = xen_remap_buf.target_pfn;
+ len = xen_remap_buf.size;
+ }
+ xen_remap_mfn = xen_remap_buf.next_area_mfn;
+ }
+
+ if (pfn_s != ~0UL && len)
+ xen_del_extra_mem(pfn_s, len);
+
+ set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
+
+ pr_info("Remapped %ld page(s)\n", remapped);
+}
+
+static unsigned long __init xen_get_pages_limit(void)
+{
+ unsigned long limit;
+
+#ifdef CONFIG_X86_32
+ limit = GB(64) / PAGE_SIZE;
+#else
+ limit = MAXMEM / PAGE_SIZE;
+ if (!xen_initial_domain() && xen_512gb_limit)
+ limit = GB(512) / PAGE_SIZE;
+#endif
+ return limit;
+}
+
+static unsigned long __init xen_get_max_pages(void)
+{
+ unsigned long max_pages, limit;
+ domid_t domid = DOMID_SELF;
+ long ret;
+
+ limit = xen_get_pages_limit();
+ max_pages = limit;
+
+ /*
+ * For the initial domain we use the maximum reservation as
+ * the maximum page.
+ *
+ * For guest domains the current maximum reservation reflects
+ * the current maximum rather than the static maximum. In this
+ * case the e820 map provided to us will cover the static
+ * maximum region.
+ */
+ if (xen_initial_domain()) {
+ ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
+ if (ret > 0)
+ max_pages = ret;
+ }
+
+ return min(max_pages, limit);
+}
+
+static void __init xen_align_and_add_e820_region(phys_addr_t start,
+ phys_addr_t size, int type)
+{
+ phys_addr_t end = start + size;
+
+ /* Align RAM regions to page boundaries. */
+ if (type == E820_TYPE_RAM) {
+ start = PAGE_ALIGN(start);
+ end &= ~((phys_addr_t)PAGE_SIZE - 1);
+ }
+
+ e820__range_add(start, end - start, type);
+}
+
+static void __init xen_ignore_unusable(void)
+{
+ struct e820_entry *entry = xen_e820_table.entries;
+ unsigned int i;
+
+ for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
+ if (entry->type == E820_TYPE_UNUSABLE)
+ entry->type = E820_TYPE_RAM;
+ }
+}
+
+bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
+{
+ struct e820_entry *entry;
+ unsigned mapcnt;
+ phys_addr_t end;
+
+ if (!size)
+ return false;
+
+ end = start + size;
+ entry = xen_e820_table.entries;
+
+ for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) {
+ if (entry->type == E820_TYPE_RAM && entry->addr <= start &&
+ (entry->addr + entry->size) >= end)
+ return false;
+
+ entry++;
+ }
+
+ return true;
+}
+
+/*
+ * Find a free area in physical memory not yet reserved and compliant with
+ * E820 map.
+ * Used to relocate pre-allocated areas like initrd or p2m list which are in
+ * conflict with the to be used E820 map.
+ * In case no area is found, return 0. Otherwise return the physical address
+ * of the area which is already reserved for convenience.
+ */
+phys_addr_t __init xen_find_free_area(phys_addr_t size)
+{
+ unsigned mapcnt;
+ phys_addr_t addr, start;
+ struct e820_entry *entry = xen_e820_table.entries;
+
+ for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++, entry++) {
+ if (entry->type != E820_TYPE_RAM || entry->size < size)
+ continue;
+ start = entry->addr;
+ for (addr = start; addr < start + size; addr += PAGE_SIZE) {
+ if (!memblock_is_reserved(addr))
+ continue;
+ start = addr + PAGE_SIZE;
+ if (start + size > entry->addr + entry->size)
+ break;
+ }
+ if (addr >= start + size) {
+ memblock_reserve(start, size);
+ return start;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Like memcpy, but with physical addresses for dest and src.
+ */
+static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src,
+ phys_addr_t n)
+{
+ phys_addr_t dest_off, src_off, dest_len, src_len, len;
+ void *from, *to;
+
+ while (n) {
+ dest_off = dest & ~PAGE_MASK;
+ src_off = src & ~PAGE_MASK;
+ dest_len = n;
+ if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off)
+ dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off;
+ src_len = n;
+ if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off)
+ src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off;
+ len = min(dest_len, src_len);
+ to = early_memremap(dest - dest_off, dest_len + dest_off);
+ from = early_memremap(src - src_off, src_len + src_off);
+ memcpy(to, from, len);
+ early_memunmap(to, dest_len + dest_off);
+ early_memunmap(from, src_len + src_off);
+ n -= len;
+ dest += len;
+ src += len;
+ }
+}
+
+/*
+ * Reserve Xen mfn_list.
+ */
+static void __init xen_reserve_xen_mfnlist(void)
+{
+ phys_addr_t start, size;
+
+ if (xen_start_info->mfn_list >= __START_KERNEL_map) {
+ start = __pa(xen_start_info->mfn_list);
+ size = PFN_ALIGN(xen_start_info->nr_pages *
+ sizeof(unsigned long));
+ } else {
+ start = PFN_PHYS(xen_start_info->first_p2m_pfn);
+ size = PFN_PHYS(xen_start_info->nr_p2m_frames);
+ }
+
+ memblock_reserve(start, size);
+ if (!xen_is_e820_reserved(start, size))
+ return;
+
+#ifdef CONFIG_X86_32
+ /*
+ * Relocating the p2m on 32 bit system to an arbitrary virtual address
+ * is not supported, so just give up.
+ */
+ xen_raw_console_write("Xen hypervisor allocated p2m list conflicts with E820 map\n");
+ BUG();
+#else
+ xen_relocate_p2m();
+ memblock_free(start, size);
+#endif
+}
+
+/**
+ * machine_specific_memory_setup - Hook for machine specific memory setup.
+ **/
+char * __init xen_memory_setup(void)
+{
+ unsigned long max_pfn, pfn_s, n_pfns;
+ phys_addr_t mem_end, addr, size, chunk_size;
+ u32 type;
+ int rc;
+ struct xen_memory_map memmap;
+ unsigned long max_pages;
+ unsigned long extra_pages = 0;
+ int i;
+ int op;
+
+ xen_parse_512gb();
+ max_pfn = xen_get_pages_limit();
+ max_pfn = min(max_pfn, xen_start_info->nr_pages);
+ mem_end = PFN_PHYS(max_pfn);
+
+ memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries);
+ set_xen_guest_handle(memmap.buffer, xen_e820_table.entries);
+
+ op = xen_initial_domain() ?
+ XENMEM_machine_memory_map :
+ XENMEM_memory_map;
+ rc = HYPERVISOR_memory_op(op, &memmap);
+ if (rc == -ENOSYS) {
+ BUG_ON(xen_initial_domain());
+ memmap.nr_entries = 1;
+ xen_e820_table.entries[0].addr = 0ULL;
+ xen_e820_table.entries[0].size = mem_end;
+ /* 8MB slack (to balance backend allocations). */
+ xen_e820_table.entries[0].size += 8ULL << 20;
+ xen_e820_table.entries[0].type = E820_TYPE_RAM;
+ rc = 0;
+ }
+ BUG_ON(rc);
+ BUG_ON(memmap.nr_entries == 0);
+ xen_e820_table.nr_entries = memmap.nr_entries;
+
+ /*
+ * Xen won't allow a 1:1 mapping to be created to UNUSABLE
+ * regions, so if we're using the machine memory map leave the
+ * region as RAM as it is in the pseudo-physical map.
+ *
+ * UNUSABLE regions in domUs are not handled and will need
+ * a patch in the future.
+ */
+ if (xen_initial_domain())
+ xen_ignore_unusable();
+
+ /* Make sure the Xen-supplied memory map is well-ordered. */
+ e820__update_table(&xen_e820_table);
+
+ max_pages = xen_get_max_pages();
+
+ /* How many extra pages do we need due to remapping? */
+ max_pages += xen_foreach_remap_area(max_pfn, xen_count_remap_pages);
+
+ if (max_pages > max_pfn)
+ extra_pages += max_pages - max_pfn;
+
+ /*
+ * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
+ * factor the base size. On non-highmem systems, the base
+ * size is the full initial memory allocation; on highmem it
+ * is limited to the max size of lowmem, so that it doesn't
+ * get completely filled.
+ *
+ * Make sure we have no memory above max_pages, as this area
+ * isn't handled by the p2m management.
+ *
+ * In principle there could be a problem in lowmem systems if
+ * the initial memory is also very large with respect to
+ * lowmem, but we won't try to deal with that here.
+ */
+ extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
+ extra_pages, max_pages - max_pfn);
+ i = 0;
+ addr = xen_e820_table.entries[0].addr;
+ size = xen_e820_table.entries[0].size;
+ while (i < xen_e820_table.nr_entries) {
+ bool discard = false;
+
+ chunk_size = size;
+ type = xen_e820_table.entries[i].type;
+
+ if (type == E820_TYPE_RAM) {
+ if (addr < mem_end) {
+ chunk_size = min(size, mem_end - addr);
+ } else if (extra_pages) {
+ chunk_size = min(size, PFN_PHYS(extra_pages));
+ pfn_s = PFN_UP(addr);
+ n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s;
+ extra_pages -= n_pfns;
+ xen_add_extra_mem(pfn_s, n_pfns);
+ xen_max_p2m_pfn = pfn_s + n_pfns;
+ } else
+ discard = true;
+ }
+
+ if (!discard)
+ xen_align_and_add_e820_region(addr, chunk_size, type);
+
+ addr += chunk_size;
+ size -= chunk_size;
+ if (size == 0) {
+ i++;
+ if (i < xen_e820_table.nr_entries) {
+ addr = xen_e820_table.entries[i].addr;
+ size = xen_e820_table.entries[i].size;
+ }
+ }
+ }
+
+ /*
+ * Set the rest as identity mapped, in case PCI BARs are
+ * located here.
+ */
+ set_phys_range_identity(addr / PAGE_SIZE, ~0ul);
+
+ /*
+ * In domU, the ISA region is normal, usable memory, but we
+ * reserve ISA memory anyway because too many things poke
+ * about in there.
+ */
+ e820__range_add(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_TYPE_RESERVED);
+
+ e820__update_table(e820_table);
+
+ /*
+ * Check whether the kernel itself conflicts with the target E820 map.
+ * Failing now is better than running into weird problems later due
+ * to relocating (and even reusing) pages with kernel text or data.
+ */
+ if (xen_is_e820_reserved(__pa_symbol(_text),
+ __pa_symbol(__bss_stop) - __pa_symbol(_text))) {
+ xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n");
+ BUG();
+ }
+
+ /*
+ * Check for a conflict of the hypervisor supplied page tables with
+ * the target E820 map.
+ */
+ xen_pt_check_e820();
+
+ xen_reserve_xen_mfnlist();
+
+ /* Check for a conflict of the initrd with the target E820 map. */
+ if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image,
+ boot_params.hdr.ramdisk_size)) {
+ phys_addr_t new_area, start, size;
+
+ new_area = xen_find_free_area(boot_params.hdr.ramdisk_size);
+ if (!new_area) {
+ xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n");
+ BUG();
+ }
+
+ start = boot_params.hdr.ramdisk_image;
+ size = boot_params.hdr.ramdisk_size;
+ xen_phys_memcpy(new_area, start, size);
+ pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
+ start, start + size, new_area, new_area + size);
+ memblock_free(start, size);
+ boot_params.hdr.ramdisk_image = new_area;
+ boot_params.ext_ramdisk_image = new_area >> 32;
+ }
+
+ /*
+ * Set identity map on non-RAM pages and prepare remapping the
+ * underlying RAM.
+ */
+ xen_foreach_remap_area(max_pfn, xen_set_identity_and_remap_chunk);
+
+ pr_info("Released %ld page(s)\n", xen_released_pages);
+
+ return "Xen";
+}
+
+/*
+ * Set the bit indicating "nosegneg" library variants should be used.
+ * We only need to bother in pure 32-bit mode; compat 32-bit processes
+ * can have un-truncated segments, so wrapping around is allowed.
+ */
+static void __init fiddle_vdso(void)
+{
+#ifdef CONFIG_X86_32
+ u32 *mask = vdso_image_32.data +
+ vdso_image_32.sym_VDSO32_NOTE_MASK;
+ *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
+#endif
+}
+
+static int register_callback(unsigned type, const void *func)
+{
+ struct callback_register callback = {
+ .type = type,
+ .address = XEN_CALLBACK(__KERNEL_CS, func),
+ .flags = CALLBACKF_mask_events,
+ };
+
+ return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
+}
+
+void xen_enable_sysenter(void)
+{
+ int ret;
+ unsigned sysenter_feature;
+
+#ifdef CONFIG_X86_32
+ sysenter_feature = X86_FEATURE_SEP;
+#else
+ sysenter_feature = X86_FEATURE_SYSENTER32;
+#endif
+
+ if (!boot_cpu_has(sysenter_feature))
+ return;
+
+ ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
+ if(ret != 0)
+ setup_clear_cpu_cap(sysenter_feature);
+}
+
+void xen_enable_syscall(void)
+{
+#ifdef CONFIG_X86_64
+ int ret;
+
+ ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
+ if (ret != 0) {
+ printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
+ /* Pretty fatal; 64-bit userspace has no other
+ mechanism for syscalls. */
+ }
+
+ if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
+ ret = register_callback(CALLBACKTYPE_syscall32,
+ xen_syscall32_target);
+ if (ret != 0)
+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
+ }
+#endif /* CONFIG_X86_64 */
+}
+
+void __init xen_pvmmu_arch_setup(void)
+{
+ HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
+ HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
+
+ HYPERVISOR_vm_assist(VMASST_CMD_enable,
+ VMASST_TYPE_pae_extended_cr3);
+
+ if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
+ register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
+ BUG();
+
+ xen_enable_sysenter();
+ xen_enable_syscall();
+}
+
+/* This function is not called for HVM domains */
+void __init xen_arch_setup(void)
+{
+ xen_panic_handler_init();
+ xen_pvmmu_arch_setup();
+
+#ifdef CONFIG_ACPI
+ if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
+ printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
+ disable_acpi();
+ }
+#endif
+
+ memcpy(boot_command_line, xen_start_info->cmd_line,
+ MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
+ COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
+
+ /* Set up idle, making sure it calls safe_halt() pvop */
+ disable_cpuidle();
+ disable_cpufreq();
+ WARN_ON(xen_set_default_idle());
+ fiddle_vdso();
+#ifdef CONFIG_NUMA
+ numa_off = 1;
+#endif
+}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
new file mode 100644
index 000000000..7a43b2ae1
--- /dev/null
+++ b/arch/x86/xen/smp.c
@@ -0,0 +1,284 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/smp.h>
+#include <linux/cpu.h>
+#include <linux/slab.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+
+#include <xen/events.h>
+
+#include <xen/hvc-console.h>
+#include "xen-ops.h"
+#include "smp.h"
+
+static DEFINE_PER_CPU(struct xen_common_irq, xen_resched_irq) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 };
+
+static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
+static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
+
+/*
+ * Reschedule call back.
+ */
+static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
+{
+ inc_irq_stat(irq_resched_count);
+ scheduler_ipi();
+
+ return IRQ_HANDLED;
+}
+
+void xen_smp_intr_free(unsigned int cpu)
+{
+ if (per_cpu(xen_resched_irq, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu).irq, NULL);
+ per_cpu(xen_resched_irq, cpu).irq = -1;
+ kfree(per_cpu(xen_resched_irq, cpu).name);
+ per_cpu(xen_resched_irq, cpu).name = NULL;
+ }
+ if (per_cpu(xen_callfunc_irq, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu).irq, NULL);
+ per_cpu(xen_callfunc_irq, cpu).irq = -1;
+ kfree(per_cpu(xen_callfunc_irq, cpu).name);
+ per_cpu(xen_callfunc_irq, cpu).name = NULL;
+ }
+ if (per_cpu(xen_debug_irq, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu).irq, NULL);
+ per_cpu(xen_debug_irq, cpu).irq = -1;
+ kfree(per_cpu(xen_debug_irq, cpu).name);
+ per_cpu(xen_debug_irq, cpu).name = NULL;
+ }
+ if (per_cpu(xen_callfuncsingle_irq, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu).irq,
+ NULL);
+ per_cpu(xen_callfuncsingle_irq, cpu).irq = -1;
+ kfree(per_cpu(xen_callfuncsingle_irq, cpu).name);
+ per_cpu(xen_callfuncsingle_irq, cpu).name = NULL;
+ }
+}
+
+int xen_smp_intr_init(unsigned int cpu)
+{
+ int rc;
+ char *resched_name, *callfunc_name, *debug_name;
+
+ resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
+ rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
+ cpu,
+ xen_reschedule_interrupt,
+ IRQF_PERCPU|IRQF_NOBALANCING,
+ resched_name,
+ NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(xen_resched_irq, cpu).irq = rc;
+ per_cpu(xen_resched_irq, cpu).name = resched_name;
+
+ callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
+ rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
+ cpu,
+ xen_call_function_interrupt,
+ IRQF_PERCPU|IRQF_NOBALANCING,
+ callfunc_name,
+ NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(xen_callfunc_irq, cpu).irq = rc;
+ per_cpu(xen_callfunc_irq, cpu).name = callfunc_name;
+
+ debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
+ rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt,
+ IRQF_PERCPU | IRQF_NOBALANCING,
+ debug_name, NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(xen_debug_irq, cpu).irq = rc;
+ per_cpu(xen_debug_irq, cpu).name = debug_name;
+
+ callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
+ rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
+ cpu,
+ xen_call_function_single_interrupt,
+ IRQF_PERCPU|IRQF_NOBALANCING,
+ callfunc_name,
+ NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(xen_callfuncsingle_irq, cpu).irq = rc;
+ per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name;
+
+ return 0;
+
+ fail:
+ xen_smp_intr_free(cpu);
+ return rc;
+}
+
+void __init xen_smp_cpus_done(unsigned int max_cpus)
+{
+ int cpu, rc, count = 0;
+
+ if (xen_hvm_domain())
+ native_smp_cpus_done(max_cpus);
+ else
+ calculate_max_logical_packages();
+
+ if (xen_have_vcpu_info_placement)
+ return;
+
+ for_each_online_cpu(cpu) {
+ if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS)
+ continue;
+
+ rc = cpu_down(cpu);
+
+ if (rc == 0) {
+ /*
+ * Reset vcpu_info so this cpu cannot be onlined again.
+ */
+ xen_vcpu_info_reset(cpu);
+ count++;
+ } else {
+ pr_warn("%s: failed to bring CPU %d down, error %d\n",
+ __func__, cpu, rc);
+ }
+ }
+ WARN(count, "%s: brought %d CPUs offline\n", __func__, count);
+}
+
+void xen_smp_send_reschedule(int cpu)
+{
+ xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
+}
+
+static void __xen_send_IPI_mask(const struct cpumask *mask,
+ int vector)
+{
+ unsigned cpu;
+
+ for_each_cpu_and(cpu, mask, cpu_online_mask)
+ xen_send_IPI_one(cpu, vector);
+}
+
+void xen_smp_send_call_function_ipi(const struct cpumask *mask)
+{
+ int cpu;
+
+ __xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
+
+ /* Make sure other vcpus get a chance to run if they need to. */
+ for_each_cpu(cpu, mask) {
+ if (xen_vcpu_stolen(cpu)) {
+ HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
+ break;
+ }
+ }
+}
+
+void xen_smp_send_call_function_single_ipi(int cpu)
+{
+ __xen_send_IPI_mask(cpumask_of(cpu),
+ XEN_CALL_FUNCTION_SINGLE_VECTOR);
+}
+
+static inline int xen_map_vector(int vector)
+{
+ int xen_vector;
+
+ switch (vector) {
+ case RESCHEDULE_VECTOR:
+ xen_vector = XEN_RESCHEDULE_VECTOR;
+ break;
+ case CALL_FUNCTION_VECTOR:
+ xen_vector = XEN_CALL_FUNCTION_VECTOR;
+ break;
+ case CALL_FUNCTION_SINGLE_VECTOR:
+ xen_vector = XEN_CALL_FUNCTION_SINGLE_VECTOR;
+ break;
+ case IRQ_WORK_VECTOR:
+ xen_vector = XEN_IRQ_WORK_VECTOR;
+ break;
+#ifdef CONFIG_X86_64
+ case NMI_VECTOR:
+ case APIC_DM_NMI: /* Some use that instead of NMI_VECTOR */
+ xen_vector = XEN_NMI_VECTOR;
+ break;
+#endif
+ default:
+ xen_vector = -1;
+ printk(KERN_ERR "xen: vector 0x%x is not implemented\n",
+ vector);
+ }
+
+ return xen_vector;
+}
+
+void xen_send_IPI_mask(const struct cpumask *mask,
+ int vector)
+{
+ int xen_vector = xen_map_vector(vector);
+
+ if (xen_vector >= 0)
+ __xen_send_IPI_mask(mask, xen_vector);
+}
+
+void xen_send_IPI_all(int vector)
+{
+ int xen_vector = xen_map_vector(vector);
+
+ if (xen_vector >= 0)
+ __xen_send_IPI_mask(cpu_online_mask, xen_vector);
+}
+
+void xen_send_IPI_self(int vector)
+{
+ int xen_vector = xen_map_vector(vector);
+
+ if (xen_vector >= 0)
+ xen_send_IPI_one(smp_processor_id(), xen_vector);
+}
+
+void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
+ int vector)
+{
+ unsigned cpu;
+ unsigned int this_cpu = smp_processor_id();
+ int xen_vector = xen_map_vector(vector);
+
+ if (!(num_online_cpus() > 1) || (xen_vector < 0))
+ return;
+
+ for_each_cpu_and(cpu, mask, cpu_online_mask) {
+ if (this_cpu == cpu)
+ continue;
+
+ xen_send_IPI_one(cpu, xen_vector);
+ }
+}
+
+void xen_send_IPI_allbutself(int vector)
+{
+ xen_send_IPI_mask_allbutself(cpu_online_mask, vector);
+}
+
+static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
+{
+ irq_enter();
+ generic_smp_call_function_interrupt();
+ inc_irq_stat(irq_call_count);
+ irq_exit();
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
+{
+ irq_enter();
+ generic_smp_call_function_single_interrupt();
+ inc_irq_stat(irq_call_count);
+ irq_exit();
+
+ return IRQ_HANDLED;
+}
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
new file mode 100644
index 000000000..bd02f9d50
--- /dev/null
+++ b/arch/x86/xen/smp.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _XEN_SMP_H
+
+#ifdef CONFIG_SMP
+extern void xen_send_IPI_mask(const struct cpumask *mask,
+ int vector);
+extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
+ int vector);
+extern void xen_send_IPI_allbutself(int vector);
+extern void xen_send_IPI_all(int vector);
+extern void xen_send_IPI_self(int vector);
+
+extern int xen_smp_intr_init(unsigned int cpu);
+extern void xen_smp_intr_free(unsigned int cpu);
+int xen_smp_intr_init_pv(unsigned int cpu);
+void xen_smp_intr_free_pv(unsigned int cpu);
+
+void xen_smp_cpus_done(unsigned int max_cpus);
+
+void xen_smp_send_reschedule(int cpu);
+void xen_smp_send_call_function_ipi(const struct cpumask *mask);
+void xen_smp_send_call_function_single_ipi(int cpu);
+
+struct xen_common_irq {
+ int irq;
+ char *name;
+};
+#else /* CONFIG_SMP */
+
+static inline int xen_smp_intr_init(unsigned int cpu)
+{
+ return 0;
+}
+static inline void xen_smp_intr_free(unsigned int cpu) {}
+
+static inline int xen_smp_intr_init_pv(unsigned int cpu)
+{
+ return 0;
+}
+static inline void xen_smp_intr_free_pv(unsigned int cpu) {}
+#endif /* CONFIG_SMP */
+
+#endif
diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c
new file mode 100644
index 000000000..e5bd9eb42
--- /dev/null
+++ b/arch/x86/xen/smp_hvm.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/smp.h>
+
+#include <xen/events.h>
+
+#include "xen-ops.h"
+#include "smp.h"
+
+
+static void __init xen_hvm_smp_prepare_boot_cpu(void)
+{
+ BUG_ON(smp_processor_id() != 0);
+ native_smp_prepare_boot_cpu();
+
+ /*
+ * Setup vcpu_info for boot CPU. Secondary CPUs get their vcpu_info
+ * in xen_cpu_up_prepare_hvm().
+ */
+ xen_vcpu_setup(0);
+
+ /*
+ * Called again in case the kernel boots on vcpu >= MAX_VIRT_CPUS.
+ * Refer to comments in xen_hvm_init_time_ops().
+ */
+ xen_hvm_init_time_ops();
+
+ /*
+ * The alternative logic (which patches the unlock/lock) runs before
+ * the smp bootup up code is activated. Hence we need to set this up
+ * the core kernel is being patched. Otherwise we will have only
+ * modules patched but not core code.
+ */
+ xen_init_spinlocks();
+}
+
+static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
+{
+ int cpu;
+
+ native_smp_prepare_cpus(max_cpus);
+ WARN_ON(xen_smp_intr_init(0));
+
+ xen_init_lock_cpu(0);
+
+ for_each_possible_cpu(cpu) {
+ if (cpu == 0)
+ continue;
+
+ /* Set default vcpu_id to make sure that we don't use cpu-0's */
+ per_cpu(xen_vcpu_id, cpu) = XEN_VCPU_ID_INVALID;
+ }
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void xen_hvm_cpu_die(unsigned int cpu)
+{
+ if (common_cpu_die(cpu) == 0) {
+ xen_smp_intr_free(cpu);
+ xen_uninit_lock_cpu(cpu);
+ xen_teardown_timer(cpu);
+ }
+}
+#else
+static void xen_hvm_cpu_die(unsigned int cpu)
+{
+ BUG();
+}
+#endif
+
+void __init xen_hvm_smp_init(void)
+{
+ if (!xen_have_vector_callback)
+ return;
+
+ smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
+ smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
+ smp_ops.cpu_die = xen_hvm_cpu_die;
+ smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
+ smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
+ smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu;
+ smp_ops.smp_cpus_done = xen_smp_cpus_done;
+}
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
new file mode 100644
index 000000000..e8248e8a0
--- /dev/null
+++ b/arch/x86/xen/smp_pv.c
@@ -0,0 +1,506 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Xen SMP support
+ *
+ * This file implements the Xen versions of smp_ops. SMP under Xen is
+ * very straightforward. Bringing a CPU up is simply a matter of
+ * loading its initial context and setting it running.
+ *
+ * IPIs are handled through the Xen event mechanism.
+ *
+ * Because virtual CPUs can be scheduled onto any real CPU, there's no
+ * useful topology information for the kernel to make use of. As a
+ * result, all CPUs are treated as if they're single-core and
+ * single-threaded.
+ */
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/irq_work.h>
+#include <linux/tick.h>
+#include <linux/nmi.h>
+#include <linux/cpuhotplug.h>
+
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/cpu.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+#include <xen/interface/xenpmu.h>
+
+#include <asm/spec-ctrl.h>
+#include <asm/xen/interface.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/page.h>
+#include <xen/events.h>
+
+#include <xen/hvc-console.h>
+#include "xen-ops.h"
+#include "mmu.h"
+#include "smp.h"
+#include "pmu.h"
+
+cpumask_var_t xen_cpu_initialized_map;
+
+static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 };
+
+static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id);
+
+static void cpu_bringup(void)
+{
+ int cpu;
+
+ cpu_init();
+ touch_softlockup_watchdog();
+ preempt_disable();
+
+ /* PVH runs in ring 0 and allows us to do native syscalls. Yay! */
+ if (!xen_feature(XENFEAT_supervisor_mode_kernel)) {
+ xen_enable_sysenter();
+ xen_enable_syscall();
+ }
+ cpu = smp_processor_id();
+ smp_store_cpu_info(cpu);
+ cpu_data(cpu).x86_max_cores = 1;
+ set_cpu_sibling_map(cpu);
+
+ speculative_store_bypass_ht_init();
+
+ xen_setup_cpu_clockevents();
+
+ notify_cpu_starting(cpu);
+
+ set_cpu_online(cpu, true);
+
+ cpu_set_state_online(cpu); /* Implies full memory barrier. */
+
+ /* We can take interrupts now: we're officially "up". */
+ local_irq_enable();
+}
+
+asmlinkage __visible void cpu_bringup_and_idle(void)
+{
+ cpu_bringup();
+ cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
+ prevent_tail_call_optimization();
+}
+
+void xen_smp_intr_free_pv(unsigned int cpu)
+{
+ if (per_cpu(xen_irq_work, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL);
+ per_cpu(xen_irq_work, cpu).irq = -1;
+ kfree(per_cpu(xen_irq_work, cpu).name);
+ per_cpu(xen_irq_work, cpu).name = NULL;
+ }
+
+ if (per_cpu(xen_pmu_irq, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL);
+ per_cpu(xen_pmu_irq, cpu).irq = -1;
+ kfree(per_cpu(xen_pmu_irq, cpu).name);
+ per_cpu(xen_pmu_irq, cpu).name = NULL;
+ }
+}
+
+int xen_smp_intr_init_pv(unsigned int cpu)
+{
+ int rc;
+ char *callfunc_name, *pmu_name;
+
+ callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu);
+ rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR,
+ cpu,
+ xen_irq_work_interrupt,
+ IRQF_PERCPU|IRQF_NOBALANCING,
+ callfunc_name,
+ NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(xen_irq_work, cpu).irq = rc;
+ per_cpu(xen_irq_work, cpu).name = callfunc_name;
+
+ if (is_xen_pmu) {
+ pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
+ rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
+ xen_pmu_irq_handler,
+ IRQF_PERCPU|IRQF_NOBALANCING,
+ pmu_name, NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(xen_pmu_irq, cpu).irq = rc;
+ per_cpu(xen_pmu_irq, cpu).name = pmu_name;
+ }
+
+ return 0;
+
+ fail:
+ xen_smp_intr_free_pv(cpu);
+ return rc;
+}
+
+static void __init xen_fill_possible_map(void)
+{
+ int i, rc;
+
+ if (xen_initial_domain())
+ return;
+
+ for (i = 0; i < nr_cpu_ids; i++) {
+ rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
+ if (rc >= 0) {
+ num_processors++;
+ set_cpu_possible(i, true);
+ }
+ }
+}
+
+static void __init xen_filter_cpu_maps(void)
+{
+ int i, rc;
+ unsigned int subtract = 0;
+
+ if (!xen_initial_domain())
+ return;
+
+ num_processors = 0;
+ disabled_cpus = 0;
+ for (i = 0; i < nr_cpu_ids; i++) {
+ rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
+ if (rc >= 0) {
+ num_processors++;
+ set_cpu_possible(i, true);
+ } else {
+ set_cpu_possible(i, false);
+ set_cpu_present(i, false);
+ subtract++;
+ }
+ }
+#ifdef CONFIG_HOTPLUG_CPU
+ /* This is akin to using 'nr_cpus' on the Linux command line.
+ * Which is OK as when we use 'dom0_max_vcpus=X' we can only
+ * have up to X, while nr_cpu_ids is greater than X. This
+ * normally is not a problem, except when CPU hotplugging
+ * is involved and then there might be more than X CPUs
+ * in the guest - which will not work as there is no
+ * hypercall to expand the max number of VCPUs an already
+ * running guest has. So cap it up to X. */
+ if (subtract)
+ nr_cpu_ids = nr_cpu_ids - subtract;
+#endif
+
+}
+
+static void __init xen_pv_smp_prepare_boot_cpu(void)
+{
+ BUG_ON(smp_processor_id() != 0);
+ native_smp_prepare_boot_cpu();
+
+ if (!xen_feature(XENFEAT_writable_page_tables))
+ /* We've switched to the "real" per-cpu gdt, so make
+ * sure the old memory can be recycled. */
+ make_lowmem_page_readwrite(xen_initial_gdt);
+
+#ifdef CONFIG_X86_32
+ /*
+ * Xen starts us with XEN_FLAT_RING1_DS, but linux code
+ * expects __USER_DS
+ */
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+#endif
+
+ xen_filter_cpu_maps();
+ xen_setup_vcpu_info_placement();
+
+ /*
+ * The alternative logic (which patches the unlock/lock) runs before
+ * the smp bootup up code is activated. Hence we need to set this up
+ * the core kernel is being patched. Otherwise we will have only
+ * modules patched but not core code.
+ */
+ xen_init_spinlocks();
+}
+
+static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
+{
+ unsigned cpu;
+ unsigned int i;
+
+ if (skip_ioapic_setup) {
+ char *m = (max_cpus == 0) ?
+ "The nosmp parameter is incompatible with Xen; " \
+ "use Xen dom0_max_vcpus=1 parameter" :
+ "The noapic parameter is incompatible with Xen";
+
+ xen_raw_printk(m);
+ panic(m);
+ }
+ xen_init_lock_cpu(0);
+
+ smp_store_boot_cpu_info();
+ cpu_data(0).x86_max_cores = 1;
+
+ for_each_possible_cpu(i) {
+ zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
+ }
+ set_cpu_sibling_map(0);
+
+ speculative_store_bypass_ht_init();
+
+ xen_pmu_init(0);
+
+ if (xen_smp_intr_init(0) || xen_smp_intr_init_pv(0))
+ BUG();
+
+ if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL))
+ panic("could not allocate xen_cpu_initialized_map\n");
+
+ cpumask_copy(xen_cpu_initialized_map, cpumask_of(0));
+
+ /* Restrict the possible_map according to max_cpus. */
+ while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
+ for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
+ continue;
+ set_cpu_possible(cpu, false);
+ }
+
+ for_each_possible_cpu(cpu)
+ set_cpu_present(cpu, true);
+}
+
+static int
+cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
+{
+ struct vcpu_guest_context *ctxt;
+ struct desc_struct *gdt;
+ unsigned long gdt_mfn;
+
+ /* used to tell cpu_init() that it can proceed with initialization */
+ cpumask_set_cpu(cpu, cpu_callout_mask);
+ if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
+ return 0;
+
+ ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
+ if (ctxt == NULL)
+ return -ENOMEM;
+
+ gdt = get_cpu_gdt_rw(cpu);
+
+#ifdef CONFIG_X86_32
+ ctxt->user_regs.fs = __KERNEL_PERCPU;
+ ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
+#endif
+ memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
+
+ /*
+ * Bring up the CPU in cpu_bringup_and_idle() with the stack
+ * pointing just below where pt_regs would be if it were a normal
+ * kernel entry.
+ */
+ ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
+ ctxt->flags = VGCF_IN_KERNEL;
+ ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
+ ctxt->user_regs.ds = __USER_DS;
+ ctxt->user_regs.es = __USER_DS;
+ ctxt->user_regs.ss = __KERNEL_DS;
+ ctxt->user_regs.cs = __KERNEL_CS;
+ ctxt->user_regs.esp = (unsigned long)task_pt_regs(idle);
+
+ xen_copy_trap_info(ctxt->trap_ctxt);
+
+ ctxt->ldt_ents = 0;
+
+ BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+
+ gdt_mfn = arbitrary_virt_to_mfn(gdt);
+ make_lowmem_page_readonly(gdt);
+ make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
+
+ ctxt->gdt_frames[0] = gdt_mfn;
+ ctxt->gdt_ents = GDT_ENTRIES;
+
+ /*
+ * Set SS:SP that Xen will use when entering guest kernel mode
+ * from guest user mode. Subsequent calls to load_sp0() can
+ * change this value.
+ */
+ ctxt->kernel_ss = __KERNEL_DS;
+ ctxt->kernel_sp = task_top_of_stack(idle);
+
+#ifdef CONFIG_X86_32
+ ctxt->event_callback_cs = __KERNEL_CS;
+ ctxt->failsafe_callback_cs = __KERNEL_CS;
+#else
+ ctxt->gs_base_kernel = per_cpu_offset(cpu);
+#endif
+ ctxt->event_callback_eip =
+ (unsigned long)xen_hypervisor_callback;
+ ctxt->failsafe_callback_eip =
+ (unsigned long)xen_failsafe_callback;
+ per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
+
+ ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
+ if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt))
+ BUG();
+
+ kfree(ctxt);
+ return 0;
+}
+
+static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle)
+{
+ int rc;
+
+ common_cpu_up(cpu, idle);
+
+ xen_setup_runstate_info(cpu);
+
+ /*
+ * PV VCPUs are always successfully taken down (see 'while' loop
+ * in xen_cpu_die()), so -EBUSY is an error.
+ */
+ rc = cpu_check_up_prepare(cpu);
+ if (rc)
+ return rc;
+
+ /* make sure interrupts start blocked */
+ per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
+
+ rc = cpu_initialize_context(cpu, idle);
+ if (rc)
+ return rc;
+
+ xen_pmu_init(cpu);
+
+ rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL);
+ BUG_ON(rc);
+
+ while (cpu_report_state(cpu) != CPU_ONLINE)
+ HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
+
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int xen_pv_cpu_disable(void)
+{
+ unsigned int cpu = smp_processor_id();
+ if (cpu == 0)
+ return -EBUSY;
+
+ cpu_disable_common();
+
+ load_cr3(swapper_pg_dir);
+ return 0;
+}
+
+static void xen_pv_cpu_die(unsigned int cpu)
+{
+ while (HYPERVISOR_vcpu_op(VCPUOP_is_up,
+ xen_vcpu_nr(cpu), NULL)) {
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(HZ/10);
+ }
+
+ if (common_cpu_die(cpu) == 0) {
+ xen_smp_intr_free(cpu);
+ xen_uninit_lock_cpu(cpu);
+ xen_teardown_timer(cpu);
+ xen_pmu_finish(cpu);
+ }
+}
+
+static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */
+{
+ play_dead_common();
+ HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL);
+ cpu_bringup();
+ /*
+ * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down)
+ * clears certain data that the cpu_idle loop (which called us
+ * and that we return from) expects. The only way to get that
+ * data back is to call:
+ */
+ tick_nohz_idle_enter();
+ tick_nohz_idle_stop_tick_protected();
+
+ cpuhp_online_idle(CPUHP_AP_ONLINE_IDLE);
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+static int xen_pv_cpu_disable(void)
+{
+ return -ENOSYS;
+}
+
+static void xen_pv_cpu_die(unsigned int cpu)
+{
+ BUG();
+}
+
+static void xen_pv_play_dead(void)
+{
+ BUG();
+}
+
+#endif
+static void stop_self(void *v)
+{
+ int cpu = smp_processor_id();
+
+ /* make sure we're not pinning something down */
+ load_cr3(swapper_pg_dir);
+ /* should set up a minimal gdt */
+
+ set_cpu_online(cpu, false);
+
+ HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL);
+ BUG();
+}
+
+static void xen_pv_stop_other_cpus(int wait)
+{
+ smp_call_function(stop_self, NULL, wait);
+}
+
+static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
+{
+ irq_enter();
+ irq_work_run();
+ inc_irq_stat(apic_irq_work_irqs);
+ irq_exit();
+
+ return IRQ_HANDLED;
+}
+
+static const struct smp_ops xen_smp_ops __initconst = {
+ .smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu,
+ .smp_prepare_cpus = xen_pv_smp_prepare_cpus,
+ .smp_cpus_done = xen_smp_cpus_done,
+
+ .cpu_up = xen_pv_cpu_up,
+ .cpu_die = xen_pv_cpu_die,
+ .cpu_disable = xen_pv_cpu_disable,
+ .play_dead = xen_pv_play_dead,
+
+ .stop_other_cpus = xen_pv_stop_other_cpus,
+ .smp_send_reschedule = xen_smp_send_reschedule,
+
+ .send_call_func_ipi = xen_smp_send_call_function_ipi,
+ .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
+};
+
+void __init xen_smp_init(void)
+{
+ smp_ops = xen_smp_ops;
+ xen_fill_possible_map();
+}
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
new file mode 100644
index 000000000..6fffb86a3
--- /dev/null
+++ b/arch/x86/xen/spinlock.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Split spinlock implementation out into its own file, so it can be
+ * compiled in a FTRACE-compatible way.
+ */
+#include <linux/kernel_stat.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/log2.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/atomic.h>
+
+#include <asm/paravirt.h>
+#include <asm/qspinlock.h>
+
+#include <xen/interface/xen.h>
+#include <xen/events.h>
+
+#include "xen-ops.h"
+#include "debugfs.h"
+
+static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
+static DEFINE_PER_CPU(char *, irq_name);
+static DEFINE_PER_CPU(atomic_t, xen_qlock_wait_nest);
+static bool xen_pvspin = true;
+
+static void xen_qlock_kick(int cpu)
+{
+ int irq = per_cpu(lock_kicker_irq, cpu);
+
+ /* Don't kick if the target's kicker interrupt is not initialized. */
+ if (irq == -1)
+ return;
+
+ xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
+}
+
+/*
+ * Halt the current CPU & release it back to the host
+ */
+static void xen_qlock_wait(u8 *byte, u8 val)
+{
+ int irq = __this_cpu_read(lock_kicker_irq);
+ atomic_t *nest_cnt = this_cpu_ptr(&xen_qlock_wait_nest);
+
+ /* If kicker interrupts not initialized yet, just spin */
+ if (irq == -1 || in_nmi())
+ return;
+
+ /* Detect reentry. */
+ atomic_inc(nest_cnt);
+
+ /* If irq pending already and no nested call clear it. */
+ if (atomic_read(nest_cnt) == 1 && xen_test_irq_pending(irq)) {
+ xen_clear_irq_pending(irq);
+ } else if (READ_ONCE(*byte) == val) {
+ /* Block until irq becomes pending (or a spurious wakeup) */
+ xen_poll_irq(irq);
+ }
+
+ atomic_dec(nest_cnt);
+}
+
+static irqreturn_t dummy_handler(int irq, void *dev_id)
+{
+ BUG();
+ return IRQ_HANDLED;
+}
+
+void xen_init_lock_cpu(int cpu)
+{
+ int irq;
+ char *name;
+
+ if (!xen_pvspin) {
+ if (cpu == 0)
+ static_branch_disable(&virt_spin_lock_key);
+ return;
+ }
+
+ WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n",
+ cpu, per_cpu(lock_kicker_irq, cpu));
+
+ name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
+ irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
+ cpu,
+ dummy_handler,
+ IRQF_PERCPU|IRQF_NOBALANCING,
+ name,
+ NULL);
+
+ if (irq >= 0) {
+ disable_irq(irq); /* make sure it's never delivered */
+ per_cpu(lock_kicker_irq, cpu) = irq;
+ per_cpu(irq_name, cpu) = name;
+ }
+
+ printk("cpu %d spinlock event irq %d\n", cpu, irq);
+}
+
+void xen_uninit_lock_cpu(int cpu)
+{
+ int irq;
+
+ if (!xen_pvspin)
+ return;
+
+ /*
+ * When booting the kernel with 'mitigations=auto,nosmt', the secondary
+ * CPUs are not activated, and lock_kicker_irq is not initialized.
+ */
+ irq = per_cpu(lock_kicker_irq, cpu);
+ if (irq == -1)
+ return;
+
+ unbind_from_irqhandler(irq, NULL);
+ per_cpu(lock_kicker_irq, cpu) = -1;
+ kfree(per_cpu(irq_name, cpu));
+ per_cpu(irq_name, cpu) = NULL;
+}
+
+PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen);
+
+/*
+ * Our init of PV spinlocks is split in two init functions due to us
+ * using paravirt patching and jump labels patching and having to do
+ * all of this before SMP code is invoked.
+ *
+ * The paravirt patching needs to be done _before_ the alternative asm code
+ * is started, otherwise we would not patch the core kernel code.
+ */
+void __init xen_init_spinlocks(void)
+{
+
+ /* Don't need to use pvqspinlock code if there is only 1 vCPU. */
+ if (num_possible_cpus() == 1)
+ xen_pvspin = false;
+
+ if (!xen_pvspin) {
+ printk(KERN_DEBUG "xen: PV spinlocks disabled\n");
+ return;
+ }
+ printk(KERN_DEBUG "xen: PV spinlocks enabled\n");
+
+ __pv_init_lock_hash();
+ pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+ pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+ pv_lock_ops.wait = xen_qlock_wait;
+ pv_lock_ops.kick = xen_qlock_kick;
+ pv_lock_ops.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen);
+}
+
+static __init int xen_parse_nopvspin(char *arg)
+{
+ xen_pvspin = false;
+ return 0;
+}
+early_param("xen_nopvspin", xen_parse_nopvspin);
+
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
new file mode 100644
index 000000000..1d83152c7
--- /dev/null
+++ b/arch/x86/xen/suspend.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <linux/tick.h>
+#include <linux/percpu-defs.h>
+
+#include <xen/xen.h>
+#include <xen/interface/xen.h>
+#include <xen/grant_table.h>
+#include <xen/events.h>
+
+#include <asm/cpufeatures.h>
+#include <asm/msr-index.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+#include <asm/fixmap.h>
+
+#include "xen-ops.h"
+#include "mmu.h"
+#include "pmu.h"
+
+static DEFINE_PER_CPU(u64, spec_ctrl);
+
+void xen_arch_pre_suspend(void)
+{
+ xen_save_time_memory_area();
+
+ if (xen_pv_domain())
+ xen_pv_pre_suspend();
+}
+
+void xen_arch_post_suspend(int cancelled)
+{
+ if (xen_pv_domain())
+ xen_pv_post_suspend(cancelled);
+ else
+ xen_hvm_post_suspend(cancelled);
+
+ xen_restore_time_memory_area();
+}
+
+static void xen_vcpu_notify_restore(void *data)
+{
+ if (xen_pv_domain() && boot_cpu_has(X86_FEATURE_SPEC_CTRL))
+ wrmsrl(MSR_IA32_SPEC_CTRL, this_cpu_read(spec_ctrl));
+
+ /* Boot processor notified via generic timekeeping_resume() */
+ if (smp_processor_id() == 0)
+ return;
+
+ tick_resume_local();
+}
+
+static void xen_vcpu_notify_suspend(void *data)
+{
+ u64 tmp;
+
+ tick_suspend_local();
+
+ if (xen_pv_domain() && boot_cpu_has(X86_FEATURE_SPEC_CTRL)) {
+ rdmsrl(MSR_IA32_SPEC_CTRL, tmp);
+ this_cpu_write(spec_ctrl, tmp);
+ wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+ }
+}
+
+void xen_arch_resume(void)
+{
+ int cpu;
+
+ on_each_cpu(xen_vcpu_notify_restore, NULL, 1);
+
+ for_each_online_cpu(cpu)
+ xen_pmu_init(cpu);
+}
+
+void xen_arch_suspend(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ xen_pmu_finish(cpu);
+
+ on_each_cpu(xen_vcpu_notify_suspend, NULL, 1);
+}
diff --git a/arch/x86/xen/suspend_hvm.c b/arch/x86/xen/suspend_hvm.c
new file mode 100644
index 000000000..e666b614c
--- /dev/null
+++ b/arch/x86/xen/suspend_hvm.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+
+#include <xen/xen.h>
+#include <xen/features.h>
+#include <xen/interface/features.h>
+
+#include "xen-ops.h"
+
+void xen_hvm_post_suspend(int suspend_cancelled)
+{
+ if (!suspend_cancelled) {
+ xen_hvm_init_shared_info();
+ xen_vcpu_restore();
+ }
+ xen_callback_vector();
+ xen_unplug_emulated_devices();
+}
diff --git a/arch/x86/xen/suspend_pv.c b/arch/x86/xen/suspend_pv.c
new file mode 100644
index 000000000..8303b58c7
--- /dev/null
+++ b/arch/x86/xen/suspend_pv.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+
+#include <asm/fixmap.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+
+#include "xen-ops.h"
+
+void xen_pv_pre_suspend(void)
+{
+ xen_mm_pin_all();
+
+ xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
+ xen_start_info->console.domU.mfn =
+ mfn_to_pfn(xen_start_info->console.domU.mfn);
+
+ BUG_ON(!irqs_disabled());
+
+ HYPERVISOR_shared_info = &xen_dummy_shared_info;
+ if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
+ __pte_ma(0), 0))
+ BUG();
+}
+
+void xen_pv_post_suspend(int suspend_cancelled)
+{
+ xen_build_mfn_list_list();
+ set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info);
+ HYPERVISOR_shared_info = (void *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
+ xen_setup_mfn_list_list();
+
+ if (suspend_cancelled) {
+ xen_start_info->store_mfn =
+ pfn_to_mfn(xen_start_info->store_mfn);
+ xen_start_info->console.domU.mfn =
+ pfn_to_mfn(xen_start_info->console.domU.mfn);
+ } else {
+#ifdef CONFIG_SMP
+ BUG_ON(xen_cpu_initialized_map == NULL);
+ cpumask_copy(xen_cpu_initialized_map, cpu_online_mask);
+#endif
+ xen_vcpu_restore();
+ }
+
+ xen_mm_unpin_all();
+}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
new file mode 100644
index 000000000..9809de9f2
--- /dev/null
+++ b/arch/x86/xen/time.c
@@ -0,0 +1,594 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Xen time implementation.
+ *
+ * This is implemented in terms of a clocksource driver which uses
+ * the hypervisor clock as a nanosecond timebase, and a clockevent
+ * driver which uses the hypervisor's timer mechanism.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/timekeeper_internal.h>
+
+#include <asm/pvclock.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/events.h>
+#include <xen/features.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+
+#include "xen-ops.h"
+
+/* Xen may fire a timer up to this many ns early */
+#define TIMER_SLOP 100000
+
+static u64 xen_sched_clock_offset __read_mostly;
+
+/* Get the TSC speed from Xen */
+static unsigned long xen_tsc_khz(void)
+{
+ struct pvclock_vcpu_time_info *info =
+ &HYPERVISOR_shared_info->vcpu_info[0].time;
+
+ return pvclock_tsc_khz(info);
+}
+
+static u64 xen_clocksource_read(void)
+{
+ struct pvclock_vcpu_time_info *src;
+ u64 ret;
+
+ preempt_disable_notrace();
+ src = &__this_cpu_read(xen_vcpu)->time;
+ ret = pvclock_clocksource_read(src);
+ preempt_enable_notrace();
+ return ret;
+}
+
+static u64 xen_clocksource_get_cycles(struct clocksource *cs)
+{
+ return xen_clocksource_read();
+}
+
+static u64 xen_sched_clock(void)
+{
+ return xen_clocksource_read() - xen_sched_clock_offset;
+}
+
+static void xen_read_wallclock(struct timespec64 *ts)
+{
+ struct shared_info *s = HYPERVISOR_shared_info;
+ struct pvclock_wall_clock *wall_clock = &(s->wc);
+ struct pvclock_vcpu_time_info *vcpu_time;
+
+ vcpu_time = &get_cpu_var(xen_vcpu)->time;
+ pvclock_read_wallclock(wall_clock, vcpu_time, ts);
+ put_cpu_var(xen_vcpu);
+}
+
+static void xen_get_wallclock(struct timespec64 *now)
+{
+ xen_read_wallclock(now);
+}
+
+static int xen_set_wallclock(const struct timespec64 *now)
+{
+ return -ENODEV;
+}
+
+static int xen_pvclock_gtod_notify(struct notifier_block *nb,
+ unsigned long was_set, void *priv)
+{
+ /* Protected by the calling core code serialization */
+ static struct timespec64 next_sync;
+
+ struct xen_platform_op op;
+ struct timespec64 now;
+ struct timekeeper *tk = priv;
+ static bool settime64_supported = true;
+ int ret;
+
+ now.tv_sec = tk->xtime_sec;
+ now.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
+
+ /*
+ * We only take the expensive HV call when the clock was set
+ * or when the 11 minutes RTC synchronization time elapsed.
+ */
+ if (!was_set && timespec64_compare(&now, &next_sync) < 0)
+ return NOTIFY_OK;
+
+again:
+ if (settime64_supported) {
+ op.cmd = XENPF_settime64;
+ op.u.settime64.mbz = 0;
+ op.u.settime64.secs = now.tv_sec;
+ op.u.settime64.nsecs = now.tv_nsec;
+ op.u.settime64.system_time = xen_clocksource_read();
+ } else {
+ op.cmd = XENPF_settime32;
+ op.u.settime32.secs = now.tv_sec;
+ op.u.settime32.nsecs = now.tv_nsec;
+ op.u.settime32.system_time = xen_clocksource_read();
+ }
+
+ ret = HYPERVISOR_platform_op(&op);
+
+ if (ret == -ENOSYS && settime64_supported) {
+ settime64_supported = false;
+ goto again;
+ }
+ if (ret < 0)
+ return NOTIFY_BAD;
+
+ /*
+ * Move the next drift compensation time 11 minutes
+ * ahead. That's emulating the sync_cmos_clock() update for
+ * the hardware RTC.
+ */
+ next_sync = now;
+ next_sync.tv_sec += 11 * 60;
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block xen_pvclock_gtod_notifier = {
+ .notifier_call = xen_pvclock_gtod_notify,
+};
+
+static struct clocksource xen_clocksource __read_mostly = {
+ .name = "xen",
+ .rating = 400,
+ .read = xen_clocksource_get_cycles,
+ .mask = ~0,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+/*
+ Xen clockevent implementation
+
+ Xen has two clockevent implementations:
+
+ The old timer_op one works with all released versions of Xen prior
+ to version 3.0.4. This version of the hypervisor provides a
+ single-shot timer with nanosecond resolution. However, sharing the
+ same event channel is a 100Hz tick which is delivered while the
+ vcpu is running. We don't care about or use this tick, but it will
+ cause the core time code to think the timer fired too soon, and
+ will end up resetting it each time. It could be filtered, but
+ doing so has complications when the ktime clocksource is not yet
+ the xen clocksource (ie, at boot time).
+
+ The new vcpu_op-based timer interface allows the tick timer period
+ to be changed or turned off. The tick timer is not useful as a
+ periodic timer because events are only delivered to running vcpus.
+ The one-shot timer can report when a timeout is in the past, so
+ set_next_event is capable of returning -ETIME when appropriate.
+ This interface is used when available.
+*/
+
+
+/*
+ Get a hypervisor absolute time. In theory we could maintain an
+ offset between the kernel's time and the hypervisor's time, and
+ apply that to a kernel's absolute timeout. Unfortunately the
+ hypervisor and kernel times can drift even if the kernel is using
+ the Xen clocksource, because ntp can warp the kernel's clocksource.
+*/
+static s64 get_abs_timeout(unsigned long delta)
+{
+ return xen_clocksource_read() + delta;
+}
+
+static int xen_timerop_shutdown(struct clock_event_device *evt)
+{
+ /* cancel timeout */
+ HYPERVISOR_set_timer_op(0);
+
+ return 0;
+}
+
+static int xen_timerop_set_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ WARN_ON(!clockevent_state_oneshot(evt));
+
+ if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
+ BUG();
+
+ /* We may have missed the deadline, but there's no real way of
+ knowing for sure. If the event was in the past, then we'll
+ get an immediate interrupt. */
+
+ return 0;
+}
+
+static const struct clock_event_device xen_timerop_clockevent = {
+ .name = "xen",
+ .features = CLOCK_EVT_FEAT_ONESHOT,
+
+ .max_delta_ns = 0xffffffff,
+ .max_delta_ticks = 0xffffffff,
+ .min_delta_ns = TIMER_SLOP,
+ .min_delta_ticks = TIMER_SLOP,
+
+ .mult = 1,
+ .shift = 0,
+ .rating = 500,
+
+ .set_state_shutdown = xen_timerop_shutdown,
+ .set_next_event = xen_timerop_set_next_event,
+};
+
+static int xen_vcpuop_shutdown(struct clock_event_device *evt)
+{
+ int cpu = smp_processor_id();
+
+ if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, xen_vcpu_nr(cpu),
+ NULL) ||
+ HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
+ NULL))
+ BUG();
+
+ return 0;
+}
+
+static int xen_vcpuop_set_oneshot(struct clock_event_device *evt)
+{
+ int cpu = smp_processor_id();
+
+ if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
+ NULL))
+ BUG();
+
+ return 0;
+}
+
+static int xen_vcpuop_set_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ int cpu = smp_processor_id();
+ struct vcpu_set_singleshot_timer single;
+ int ret;
+
+ WARN_ON(!clockevent_state_oneshot(evt));
+
+ single.timeout_abs_ns = get_abs_timeout(delta);
+ /* Get an event anyway, even if the timeout is already expired */
+ single.flags = 0;
+
+ ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, xen_vcpu_nr(cpu),
+ &single);
+ BUG_ON(ret != 0);
+
+ return ret;
+}
+
+static const struct clock_event_device xen_vcpuop_clockevent = {
+ .name = "xen",
+ .features = CLOCK_EVT_FEAT_ONESHOT,
+
+ .max_delta_ns = 0xffffffff,
+ .max_delta_ticks = 0xffffffff,
+ .min_delta_ns = TIMER_SLOP,
+ .min_delta_ticks = TIMER_SLOP,
+
+ .mult = 1,
+ .shift = 0,
+ .rating = 500,
+
+ .set_state_shutdown = xen_vcpuop_shutdown,
+ .set_state_oneshot = xen_vcpuop_set_oneshot,
+ .set_next_event = xen_vcpuop_set_next_event,
+};
+
+static const struct clock_event_device *xen_clockevent =
+ &xen_timerop_clockevent;
+
+struct xen_clock_event_device {
+ struct clock_event_device evt;
+ char name[16];
+};
+static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 };
+
+static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
+{
+ struct clock_event_device *evt = this_cpu_ptr(&xen_clock_events.evt);
+ irqreturn_t ret;
+
+ ret = IRQ_NONE;
+ if (evt->event_handler) {
+ evt->event_handler(evt);
+ ret = IRQ_HANDLED;
+ }
+
+ return ret;
+}
+
+void xen_teardown_timer(int cpu)
+{
+ struct clock_event_device *evt;
+ evt = &per_cpu(xen_clock_events, cpu).evt;
+
+ if (evt->irq >= 0) {
+ unbind_from_irqhandler(evt->irq, NULL);
+ evt->irq = -1;
+ }
+}
+
+void xen_setup_timer(int cpu)
+{
+ struct xen_clock_event_device *xevt = &per_cpu(xen_clock_events, cpu);
+ struct clock_event_device *evt = &xevt->evt;
+ int irq;
+
+ WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu);
+ if (evt->irq >= 0)
+ xen_teardown_timer(cpu);
+
+ printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
+
+ snprintf(xevt->name, sizeof(xevt->name), "timer%d", cpu);
+
+ irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
+ IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
+ IRQF_FORCE_RESUME|IRQF_EARLY_RESUME,
+ xevt->name, NULL);
+ (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);
+
+ memcpy(evt, xen_clockevent, sizeof(*evt));
+
+ evt->cpumask = cpumask_of(cpu);
+ evt->irq = irq;
+}
+
+
+void xen_setup_cpu_clockevents(void)
+{
+ clockevents_register_device(this_cpu_ptr(&xen_clock_events.evt));
+}
+
+void xen_timer_resume(void)
+{
+ int cpu;
+
+ if (xen_clockevent != &xen_vcpuop_clockevent)
+ return;
+
+ for_each_online_cpu(cpu) {
+ if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
+ xen_vcpu_nr(cpu), NULL))
+ BUG();
+ }
+}
+
+static const struct pv_time_ops xen_time_ops __initconst = {
+ .sched_clock = xen_sched_clock,
+ .steal_clock = xen_steal_clock,
+};
+
+static struct pvclock_vsyscall_time_info *xen_clock __read_mostly;
+static u64 xen_clock_value_saved;
+
+void xen_save_time_memory_area(void)
+{
+ struct vcpu_register_time_memory_area t;
+ int ret;
+
+ xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset;
+
+ if (!xen_clock)
+ return;
+
+ t.addr.v = NULL;
+
+ ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
+ if (ret != 0)
+ pr_notice("Cannot save secondary vcpu_time_info (err %d)",
+ ret);
+ else
+ clear_page(xen_clock);
+}
+
+void xen_restore_time_memory_area(void)
+{
+ struct vcpu_register_time_memory_area t;
+ int ret;
+
+ if (!xen_clock)
+ goto out;
+
+ t.addr.v = &xen_clock->pvti;
+
+ ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
+
+ /*
+ * We don't disable VCLOCK_PVCLOCK entirely if it fails to register the
+ * secondary time info with Xen or if we migrated to a host without the
+ * necessary flags. On both of these cases what happens is either
+ * process seeing a zeroed out pvti or seeing no PVCLOCK_TSC_STABLE_BIT
+ * bit set. Userspace checks the latter and if 0, it discards the data
+ * in pvti and fallbacks to a system call for a reliable timestamp.
+ */
+ if (ret != 0)
+ pr_notice("Cannot restore secondary vcpu_time_info (err %d)",
+ ret);
+
+out:
+ /* Need pvclock_resume() before using xen_clocksource_read(). */
+ pvclock_resume();
+ xen_sched_clock_offset = xen_clocksource_read() - xen_clock_value_saved;
+}
+
+static void xen_setup_vsyscall_time_info(void)
+{
+ struct vcpu_register_time_memory_area t;
+ struct pvclock_vsyscall_time_info *ti;
+ int ret;
+
+ ti = (struct pvclock_vsyscall_time_info *)get_zeroed_page(GFP_KERNEL);
+ if (!ti)
+ return;
+
+ t.addr.v = &ti->pvti;
+
+ ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
+ if (ret) {
+ pr_notice("xen: VCLOCK_PVCLOCK not supported (err %d)\n", ret);
+ free_page((unsigned long)ti);
+ return;
+ }
+
+ /*
+ * If primary time info had this bit set, secondary should too since
+ * it's the same data on both just different memory regions. But we
+ * still check it in case hypervisor is buggy.
+ */
+ if (!(ti->pvti.flags & PVCLOCK_TSC_STABLE_BIT)) {
+ t.addr.v = NULL;
+ ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area,
+ 0, &t);
+ if (!ret)
+ free_page((unsigned long)ti);
+
+ pr_notice("xen: VCLOCK_PVCLOCK not supported (tsc unstable)\n");
+ return;
+ }
+
+ xen_clock = ti;
+ pvclock_set_pvti_cpu0_va(xen_clock);
+
+ xen_clocksource.archdata.vclock_mode = VCLOCK_PVCLOCK;
+}
+
+static void __init xen_time_init(void)
+{
+ struct pvclock_vcpu_time_info *pvti;
+ int cpu = smp_processor_id();
+ struct timespec64 tp;
+
+ /* As Dom0 is never moved, no penalty on using TSC there */
+ if (xen_initial_domain())
+ xen_clocksource.rating = 275;
+
+ clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
+
+ if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
+ NULL) == 0) {
+ /* Successfully turned off 100Hz tick, so we have the
+ vcpuop-based timer interface */
+ printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
+ xen_clockevent = &xen_vcpuop_clockevent;
+ }
+
+ /* Set initial system time with full resolution */
+ xen_read_wallclock(&tp);
+ do_settimeofday64(&tp);
+
+ setup_force_cpu_cap(X86_FEATURE_TSC);
+
+ /*
+ * We check ahead on the primary time info if this
+ * bit is supported hence speeding up Xen clocksource.
+ */
+ pvti = &__this_cpu_read(xen_vcpu)->time;
+ if (pvti->flags & PVCLOCK_TSC_STABLE_BIT) {
+ pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+ xen_setup_vsyscall_time_info();
+ }
+
+ xen_setup_runstate_info(cpu);
+ xen_setup_timer(cpu);
+ xen_setup_cpu_clockevents();
+
+ xen_time_setup_guest();
+
+ if (xen_initial_domain())
+ pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
+}
+
+void __init xen_init_time_ops(void)
+{
+ xen_sched_clock_offset = xen_clocksource_read();
+ pv_time_ops = xen_time_ops;
+
+ x86_init.timers.timer_init = xen_time_init;
+ x86_init.timers.setup_percpu_clockev = x86_init_noop;
+ x86_cpuinit.setup_percpu_clockev = x86_init_noop;
+
+ x86_platform.calibrate_tsc = xen_tsc_khz;
+ x86_platform.get_wallclock = xen_get_wallclock;
+ /* Dom0 uses the native method to set the hardware RTC. */
+ if (!xen_initial_domain())
+ x86_platform.set_wallclock = xen_set_wallclock;
+}
+
+#ifdef CONFIG_XEN_PVHVM
+static void xen_hvm_setup_cpu_clockevents(void)
+{
+ int cpu = smp_processor_id();
+ xen_setup_runstate_info(cpu);
+ /*
+ * xen_setup_timer(cpu) - snprintf is bad in atomic context. Hence
+ * doing it xen_hvm_cpu_notify (which gets called by smp_init during
+ * early bootup and also during CPU hotplug events).
+ */
+ xen_setup_cpu_clockevents();
+}
+
+void __init xen_hvm_init_time_ops(void)
+{
+ static bool hvm_time_initialized;
+
+ if (hvm_time_initialized)
+ return;
+
+ /*
+ * vector callback is needed otherwise we cannot receive interrupts
+ * on cpu > 0 and at this point we don't know how many cpus are
+ * available.
+ */
+ if (!xen_have_vector_callback)
+ return;
+
+ if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
+ pr_info_once("Xen doesn't support pvclock on HVM, disable pv timer");
+ return;
+ }
+
+ /*
+ * Only MAX_VIRT_CPUS 'vcpu_info' are embedded inside 'shared_info'.
+ * The __this_cpu_read(xen_vcpu) is still NULL when Xen HVM guest
+ * boots on vcpu >= MAX_VIRT_CPUS (e.g., kexec), To access
+ * __this_cpu_read(xen_vcpu) via xen_clocksource_read() will panic.
+ *
+ * The xen_hvm_init_time_ops() should be called again later after
+ * __this_cpu_read(xen_vcpu) is available.
+ */
+ if (!__this_cpu_read(xen_vcpu)) {
+ pr_info("Delay xen_init_time_common() as kernel is running on vcpu=%d\n",
+ xen_vcpu_nr(0));
+ return;
+ }
+
+ xen_sched_clock_offset = xen_clocksource_read();
+ pv_time_ops = xen_time_ops;
+ x86_init.timers.setup_percpu_clockev = xen_time_init;
+ x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
+
+ x86_platform.calibrate_tsc = xen_tsc_khz;
+ x86_platform.get_wallclock = xen_get_wallclock;
+ x86_platform.set_wallclock = xen_set_wallclock;
+
+ hvm_time_initialized = true;
+}
+#endif
diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c
new file mode 100644
index 000000000..329f60eb9
--- /dev/null
+++ b/arch/x86/xen/trace.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/xen-mca.h>
+
+#define HYPERCALL(x) [__HYPERVISOR_##x] = "("#x")",
+static const char *xen_hypercall_names[] = {
+#include <asm/xen-hypercalls.h>
+};
+#undef HYPERCALL
+
+static const char *xen_hypercall_name(unsigned op)
+{
+ if (op < ARRAY_SIZE(xen_hypercall_names) && xen_hypercall_names[op] != NULL)
+ return xen_hypercall_names[op];
+
+ return "";
+}
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/xen.h>
diff --git a/arch/x86/xen/vdso.h b/arch/x86/xen/vdso.h
new file mode 100644
index 000000000..861fedfe5
--- /dev/null
+++ b/arch/x86/xen/vdso.h
@@ -0,0 +1,4 @@
+/* Bit used for the pseudo-hwcap for non-negative segments. We use
+ bit 1 to avoid bugs in some versions of glibc when bit 0 is
+ used; the choice is otherwise arbitrary. */
+#define VDSO_NOTE_NONEGSEG_BIT 1
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
new file mode 100644
index 000000000..e336f223f
--- /dev/null
+++ b/arch/x86/xen/vga.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/screen_info.h>
+#include <linux/init.h>
+
+#include <asm/bootparam.h>
+#include <asm/setup.h>
+
+#include <xen/interface/xen.h>
+
+#include "xen-ops.h"
+
+void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
+{
+ struct screen_info *screen_info = &boot_params.screen_info;
+
+ /* This is drawn from a dump from vgacon:startup in
+ * standard Linux. */
+ screen_info->orig_video_mode = 3;
+ screen_info->orig_video_isVGA = 1;
+ screen_info->orig_video_lines = 25;
+ screen_info->orig_video_cols = 80;
+ screen_info->orig_video_ega_bx = 3;
+ screen_info->orig_video_points = 16;
+ screen_info->orig_y = screen_info->orig_video_lines - 1;
+
+ switch (info->video_type) {
+ case XEN_VGATYPE_TEXT_MODE_3:
+ if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3)
+ + sizeof(info->u.text_mode_3))
+ break;
+ screen_info->orig_video_lines = info->u.text_mode_3.rows;
+ screen_info->orig_video_cols = info->u.text_mode_3.columns;
+ screen_info->orig_x = info->u.text_mode_3.cursor_x;
+ screen_info->orig_y = info->u.text_mode_3.cursor_y;
+ screen_info->orig_video_points =
+ info->u.text_mode_3.font_height;
+ break;
+
+ case XEN_VGATYPE_EFI_LFB:
+ case XEN_VGATYPE_VESA_LFB:
+ if (size < offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.gbl_caps))
+ break;
+ screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB;
+ screen_info->lfb_width = info->u.vesa_lfb.width;
+ screen_info->lfb_height = info->u.vesa_lfb.height;
+ screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel;
+ screen_info->lfb_base = info->u.vesa_lfb.lfb_base;
+ screen_info->lfb_size = info->u.vesa_lfb.lfb_size;
+ screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line;
+ screen_info->red_size = info->u.vesa_lfb.red_size;
+ screen_info->red_pos = info->u.vesa_lfb.red_pos;
+ screen_info->green_size = info->u.vesa_lfb.green_size;
+ screen_info->green_pos = info->u.vesa_lfb.green_pos;
+ screen_info->blue_size = info->u.vesa_lfb.blue_size;
+ screen_info->blue_pos = info->u.vesa_lfb.blue_pos;
+ screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size;
+ screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos;
+
+ if (info->video_type == XEN_VGATYPE_EFI_LFB) {
+ screen_info->orig_video_isVGA = VIDEO_TYPE_EFI;
+ break;
+ }
+
+ if (size >= offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.gbl_caps)
+ + sizeof(info->u.vesa_lfb.gbl_caps))
+ screen_info->capabilities = info->u.vesa_lfb.gbl_caps;
+ if (size >= offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.mode_attrs)
+ + sizeof(info->u.vesa_lfb.mode_attrs))
+ screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs;
+ break;
+ }
+}
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
new file mode 100644
index 000000000..8019edd01
--- /dev/null
+++ b/arch/x86/xen/xen-asm.S
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Asm versions of Xen pv-ops, suitable for direct use.
+ *
+ * We only bother with direct forms (ie, vcpu in percpu data) of the
+ * operations here; the indirect forms are better handled in C.
+ */
+
+#include <asm/asm-offsets.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+#include <asm/frame.h>
+
+#include <linux/linkage.h>
+
+/*
+ * Enable events. This clears the event mask and tests the pending
+ * event status with one and operation. If there are pending events,
+ * then enter the hypervisor to get them handled.
+ */
+ENTRY(xen_irq_enable_direct)
+ FRAME_BEGIN
+ /* Unmask events */
+ movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+
+ /*
+ * Preempt here doesn't matter because that will deal with any
+ * pending interrupts. The pending check may end up being run
+ * on the wrong CPU, but that doesn't hurt.
+ */
+
+ /* Test for pending */
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
+ jz 1f
+
+ call check_events
+1:
+ FRAME_END
+ ret
+ ENDPROC(xen_irq_enable_direct)
+
+
+/*
+ * Disabling events is simply a matter of making the event mask
+ * non-zero.
+ */
+ENTRY(xen_irq_disable_direct)
+ movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ ret
+ENDPROC(xen_irq_disable_direct)
+
+/*
+ * (xen_)save_fl is used to get the current interrupt enable status.
+ * Callers expect the status to be in X86_EFLAGS_IF, and other bits
+ * may be set in the return value. We take advantage of this by
+ * making sure that X86_EFLAGS_IF has the right value (and other bits
+ * in that byte are 0), but other bits in the return value are
+ * undefined. We need to toggle the state of the bit, because Xen and
+ * x86 use opposite senses (mask vs enable).
+ */
+ENTRY(xen_save_fl_direct)
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ setz %ah
+ addb %ah, %ah
+ ret
+ ENDPROC(xen_save_fl_direct)
+
+
+/*
+ * In principle the caller should be passing us a value return from
+ * xen_save_fl_direct, but for robustness sake we test only the
+ * X86_EFLAGS_IF flag rather than the whole byte. After setting the
+ * interrupt mask state, it checks for unmasked pending events and
+ * enters the hypervisor to get them delivered if so.
+ */
+ENTRY(xen_restore_fl_direct)
+ FRAME_BEGIN
+#ifdef CONFIG_X86_64
+ testw $X86_EFLAGS_IF, %di
+#else
+ testb $X86_EFLAGS_IF>>8, %ah
+#endif
+ setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ /*
+ * Preempt here doesn't matter because that will deal with any
+ * pending interrupts. The pending check may end up being run
+ * on the wrong CPU, but that doesn't hurt.
+ */
+
+ /* check for unmasked and pending */
+ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
+ jnz 1f
+ call check_events
+1:
+ FRAME_END
+ ret
+ ENDPROC(xen_restore_fl_direct)
+
+
+/*
+ * Force an event check by making a hypercall, but preserve regs
+ * before making the call.
+ */
+ENTRY(check_events)
+ FRAME_BEGIN
+#ifdef CONFIG_X86_32
+ push %eax
+ push %ecx
+ push %edx
+ call xen_force_evtchn_callback
+ pop %edx
+ pop %ecx
+ pop %eax
+#else
+ push %rax
+ push %rcx
+ push %rdx
+ push %rsi
+ push %rdi
+ push %r8
+ push %r9
+ push %r10
+ push %r11
+ call xen_force_evtchn_callback
+ pop %r11
+ pop %r10
+ pop %r9
+ pop %r8
+ pop %rdi
+ pop %rsi
+ pop %rdx
+ pop %rcx
+ pop %rax
+#endif
+ FRAME_END
+ ret
+ENDPROC(check_events)
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
new file mode 100644
index 000000000..c15db060a
--- /dev/null
+++ b/arch/x86/xen/xen-asm_32.S
@@ -0,0 +1,207 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Asm versions of Xen pv-ops, suitable for direct use.
+ *
+ * We only bother with direct forms (ie, vcpu in pda) of the
+ * operations here; the indirect forms are better handled in C.
+ */
+
+#include <asm/thread_info.h>
+#include <asm/processor-flags.h>
+#include <asm/segment.h>
+#include <asm/asm.h>
+
+#include <xen/interface/xen.h>
+
+#include <linux/linkage.h>
+
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI 0x80000000
+
+/*
+ * This is run where a normal iret would be run, with the same stack setup:
+ * 8: eflags
+ * 4: cs
+ * esp-> 0: eip
+ *
+ * This attempts to make sure that any pending events are dealt with
+ * on return to usermode, but there is a small window in which an
+ * event can happen just before entering usermode. If the nested
+ * interrupt ends up setting one of the TIF_WORK_MASK pending work
+ * flags, they will not be tested again before returning to
+ * usermode. This means that a process can end up with pending work,
+ * which will be unprocessed until the process enters and leaves the
+ * kernel again, which could be an unbounded amount of time. This
+ * means that a pending signal or reschedule event could be
+ * indefinitely delayed.
+ *
+ * The fix is to notice a nested interrupt in the critical window, and
+ * if one occurs, then fold the nested interrupt into the current
+ * interrupt stack frame, and re-process it iteratively rather than
+ * recursively. This means that it will exit via the normal path, and
+ * all pending work will be dealt with appropriately.
+ *
+ * Because the nested interrupt handler needs to deal with the current
+ * stack state in whatever form its in, we keep things simple by only
+ * using a single register which is pushed/popped on the stack.
+ */
+
+.macro POP_FS
+1:
+ popw %fs
+.pushsection .fixup, "ax"
+2: movw $0, (%esp)
+ jmp 1b
+.popsection
+ _ASM_EXTABLE(1b,2b)
+.endm
+
+ENTRY(xen_iret)
+ /* test eflags for special cases */
+ testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
+ jnz hyper_iret
+
+ push %eax
+ ESP_OFFSET=4 # bytes pushed onto stack
+
+ /* Store vcpu_info pointer for easy access */
+#ifdef CONFIG_SMP
+ pushw %fs
+ movl $(__KERNEL_PERCPU), %eax
+ movl %eax, %fs
+ movl %fs:xen_vcpu, %eax
+ POP_FS
+#else
+ movl %ss:xen_vcpu, %eax
+#endif
+
+ /* check IF state we're restoring */
+ testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
+
+ /*
+ * Maybe enable events. Once this happens we could get a
+ * recursive event, so the critical region starts immediately
+ * afterwards. However, if that happens we don't end up
+ * resuming the code, so we don't have to be worried about
+ * being preempted to another CPU.
+ */
+ setz %ss:XEN_vcpu_info_mask(%eax)
+xen_iret_start_crit:
+
+ /* check for unmasked and pending */
+ cmpw $0x0001, %ss:XEN_vcpu_info_pending(%eax)
+
+ /*
+ * If there's something pending, mask events again so we can
+ * jump back into xen_hypervisor_callback. Otherwise do not
+ * touch XEN_vcpu_info_mask.
+ */
+ jne 1f
+ movb $1, %ss:XEN_vcpu_info_mask(%eax)
+
+1: popl %eax
+
+ /*
+ * From this point on the registers are restored and the stack
+ * updated, so we don't need to worry about it if we're
+ * preempted
+ */
+iret_restore_end:
+
+ /*
+ * Jump to hypervisor_callback after fixing up the stack.
+ * Events are masked, so jumping out of the critical region is
+ * OK.
+ */
+ je xen_hypervisor_callback
+
+1: iret
+xen_iret_end_crit:
+ _ASM_EXTABLE(1b, iret_exc)
+
+hyper_iret:
+ /* put this out of line since its very rarely used */
+ jmp hypercall_page + __HYPERVISOR_iret * 32
+
+ .globl xen_iret_start_crit, xen_iret_end_crit
+
+/*
+ * This is called by xen_hypervisor_callback in entry.S when it sees
+ * that the EIP at the time of interrupt was between
+ * xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in
+ * %eax so we can do a more refined determination of what to do.
+ *
+ * The stack format at this point is:
+ * ----------------
+ * ss : (ss/esp may be present if we came from usermode)
+ * esp :
+ * eflags } outer exception info
+ * cs }
+ * eip }
+ * ---------------- <- edi (copy dest)
+ * eax : outer eax if it hasn't been restored
+ * ----------------
+ * eflags } nested exception info
+ * cs } (no ss/esp because we're nested
+ * eip } from the same ring)
+ * orig_eax }<- esi (copy src)
+ * - - - - - - - -
+ * fs }
+ * es }
+ * ds } SAVE_ALL state
+ * eax }
+ * : :
+ * ebx }<- esp
+ * ----------------
+ *
+ * In order to deliver the nested exception properly, we need to shift
+ * everything from the return addr up to the error code so it sits
+ * just under the outer exception info. This means that when we
+ * handle the exception, we do it in the context of the outer
+ * exception rather than starting a new one.
+ *
+ * The only caveat is that if the outer eax hasn't been restored yet
+ * (ie, it's still on stack), we need to insert its value into the
+ * SAVE_ALL state before going on, since it's usermode state which we
+ * eventually need to restore.
+ */
+ENTRY(xen_iret_crit_fixup)
+ /*
+ * Paranoia: Make sure we're really coming from kernel space.
+ * One could imagine a case where userspace jumps into the
+ * critical range address, but just before the CPU delivers a
+ * GP, it decides to deliver an interrupt instead. Unlikely?
+ * Definitely. Easy to avoid? Yes. The Intel documents
+ * explicitly say that the reported EIP for a bad jump is the
+ * jump instruction itself, not the destination, but some
+ * virtual environments get this wrong.
+ */
+ movl PT_CS(%esp), %ecx
+ andl $SEGMENT_RPL_MASK, %ecx
+ cmpl $USER_RPL, %ecx
+ je 2f
+
+ lea PT_ORIG_EAX(%esp), %esi
+ lea PT_EFLAGS(%esp), %edi
+
+ /*
+ * If eip is before iret_restore_end then stack
+ * hasn't been restored yet.
+ */
+ cmp $iret_restore_end, %eax
+ jae 1f
+
+ movl 0+4(%edi), %eax /* copy EAX (just above top of frame) */
+ movl %eax, PT_EAX(%esp)
+
+ lea ESP_OFFSET(%edi), %edi /* move dest up over saved regs */
+
+ /* set up the copy */
+1: std
+ mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */
+ rep movsl
+ cld
+
+ lea 4(%edi), %esp /* point esp to new frame */
+2: jmp xen_do_upcall
+
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
new file mode 100644
index 000000000..a93d8a7ce
--- /dev/null
+++ b/arch/x86/xen/xen-asm_64.S
@@ -0,0 +1,177 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Asm versions of Xen pv-ops, suitable for direct use.
+ *
+ * We only bother with direct forms (ie, vcpu in pda) of the
+ * operations here; the indirect forms are better handled in C.
+ */
+
+#include <asm/errno.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+#include <asm/segment.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/asm.h>
+
+#include <xen/interface/xen.h>
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+
+.macro xen_pv_trap name
+ENTRY(xen_\name)
+ pop %rcx
+ pop %r11
+ jmp \name
+END(xen_\name)
+_ASM_NOKPROBE(xen_\name)
+.endm
+
+xen_pv_trap divide_error
+xen_pv_trap debug
+xen_pv_trap xendebug
+xen_pv_trap int3
+xen_pv_trap xennmi
+xen_pv_trap overflow
+xen_pv_trap bounds
+xen_pv_trap invalid_op
+xen_pv_trap device_not_available
+xen_pv_trap double_fault
+xen_pv_trap coprocessor_segment_overrun
+xen_pv_trap invalid_TSS
+xen_pv_trap segment_not_present
+xen_pv_trap stack_segment
+xen_pv_trap general_protection
+xen_pv_trap page_fault
+xen_pv_trap spurious_interrupt_bug
+xen_pv_trap coprocessor_error
+xen_pv_trap alignment_check
+#ifdef CONFIG_X86_MCE
+xen_pv_trap machine_check
+#endif /* CONFIG_X86_MCE */
+xen_pv_trap simd_coprocessor_error
+#ifdef CONFIG_IA32_EMULATION
+xen_pv_trap entry_INT80_compat
+#endif
+xen_pv_trap hypervisor_callback
+
+ __INIT
+ENTRY(xen_early_idt_handler_array)
+ i = 0
+ .rept NUM_EXCEPTION_VECTORS
+ pop %rcx
+ pop %r11
+ jmp early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE
+ i = i + 1
+ .fill xen_early_idt_handler_array + i*XEN_EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
+ .endr
+END(xen_early_idt_handler_array)
+ __FINIT
+
+hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
+/*
+ * Xen64 iret frame:
+ *
+ * ss
+ * rsp
+ * rflags
+ * cs
+ * rip <-- standard iret frame
+ *
+ * flags
+ *
+ * rcx }
+ * r11 }<-- pushed by hypercall page
+ * rsp->rax }
+ */
+ENTRY(xen_iret)
+ pushq $0
+ jmp hypercall_iret
+
+ENTRY(xen_sysret64)
+ /*
+ * We're already on the usermode stack at this point, but
+ * still with the kernel gs, so we can easily switch back
+ */
+ movq %rsp, PER_CPU_VAR(rsp_scratch)
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+
+ pushq $__USER_DS
+ pushq PER_CPU_VAR(rsp_scratch)
+ pushq %r11
+ pushq $__USER_CS
+ pushq %rcx
+
+ pushq $VGCF_in_syscall
+ jmp hypercall_iret
+
+/*
+ * Xen handles syscall callbacks much like ordinary exceptions, which
+ * means we have:
+ * - kernel gs
+ * - kernel rsp
+ * - an iret-like stack frame on the stack (including rcx and r11):
+ * ss
+ * rsp
+ * rflags
+ * cs
+ * rip
+ * r11
+ * rsp->rcx
+ */
+
+/* Normal 64-bit system call target */
+ENTRY(xen_syscall_target)
+ popq %rcx
+ popq %r11
+
+ /*
+ * Neither Xen nor the kernel really knows what the old SS and
+ * CS were. The kernel expects __USER_DS and __USER_CS, so
+ * report those values even though Xen will guess its own values.
+ */
+ movq $__USER_DS, 4*8(%rsp)
+ movq $__USER_CS, 1*8(%rsp)
+
+ jmp entry_SYSCALL_64_after_hwframe
+ENDPROC(xen_syscall_target)
+
+#ifdef CONFIG_IA32_EMULATION
+
+/* 32-bit compat syscall target */
+ENTRY(xen_syscall32_target)
+ popq %rcx
+ popq %r11
+
+ /*
+ * Neither Xen nor the kernel really knows what the old SS and
+ * CS were. The kernel expects __USER32_DS and __USER32_CS, so
+ * report those values even though Xen will guess its own values.
+ */
+ movq $__USER32_DS, 4*8(%rsp)
+ movq $__USER32_CS, 1*8(%rsp)
+
+ jmp entry_SYSCALL_compat_after_hwframe
+ENDPROC(xen_syscall32_target)
+
+/* 32-bit compat sysenter target */
+ENTRY(xen_sysenter_target)
+ mov 0*8(%rsp), %rcx
+ mov 1*8(%rsp), %r11
+ mov 5*8(%rsp), %rsp
+ jmp entry_SYSENTER_compat
+ENDPROC(xen_sysenter_target)
+
+#else /* !CONFIG_IA32_EMULATION */
+
+ENTRY(xen_syscall32_target)
+ENTRY(xen_sysenter_target)
+ lea 16(%rsp), %rsp /* strip %rcx, %r11 */
+ mov $-ENOSYS, %rax
+ pushq $0
+ jmp hypercall_iret
+ENDPROC(xen_syscall32_target)
+ENDPROC(xen_sysenter_target)
+
+#endif /* CONFIG_IA32_EMULATION */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
new file mode 100644
index 000000000..5077ead5e
--- /dev/null
+++ b/arch/x86/xen/xen-head.S
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Xen-specific pieces of head.S, intended to be included in the right
+ place in head.S */
+
+#ifdef CONFIG_XEN
+
+#include <linux/elfnote.h>
+#include <linux/init.h>
+
+#include <asm/boot.h>
+#include <asm/asm.h>
+#include <asm/msr.h>
+#include <asm/page_types.h>
+#include <asm/percpu.h>
+#include <asm/unwind_hints.h>
+
+#include <xen/interface/elfnote.h>
+#include <xen/interface/features.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/xen-mca.h>
+#include <asm/xen/interface.h>
+
+#ifdef CONFIG_XEN_PV
+ __INIT
+ENTRY(startup_xen)
+ UNWIND_HINT_EMPTY
+ cld
+
+ /* Clear .bss */
+ xor %eax,%eax
+ mov $__bss_start, %_ASM_DI
+ mov $__bss_stop, %_ASM_CX
+ sub %_ASM_DI, %_ASM_CX
+ shr $__ASM_SEL(2, 3), %_ASM_CX
+ rep __ASM_SIZE(stos)
+
+ mov %_ASM_SI, xen_start_info
+ mov $init_thread_union+THREAD_SIZE, %_ASM_SP
+
+#ifdef CONFIG_X86_64
+ /* Set up %gs.
+ *
+ * The base of %gs always points to the bottom of the irqstack
+ * union. If the stack protector canary is enabled, it is
+ * located at %gs:40. Note that, on SMP, the boot cpu uses
+ * init data section till per cpu areas are set up.
+ */
+ movl $MSR_GS_BASE,%ecx
+ movq $INIT_PER_CPU_VAR(irq_stack_union),%rax
+ cdq
+ wrmsr
+#endif
+
+ jmp xen_start_kernel
+END(startup_xen)
+ __FINIT
+#endif
+
+.pushsection .text
+ .balign PAGE_SIZE
+ENTRY(hypercall_page)
+ .rept (PAGE_SIZE / 32)
+ UNWIND_HINT_EMPTY
+ .skip 32
+ .endr
+
+#define HYPERCALL(n) \
+ .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \
+ .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32
+#include <asm/xen-hypercalls.h>
+#undef HYPERCALL
+END(hypercall_page)
+.popsection
+
+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
+ ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
+#ifdef CONFIG_X86_32
+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
+#else
+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
+ /* Map the p2m table to a 512GB-aligned user address. */
+ ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad (PUD_SIZE * PTRS_PER_PUD))
+#endif
+#ifdef CONFIG_XEN_PV
+ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
+#endif
+ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
+ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,
+ .ascii "!writable_page_tables|pae_pgdir_above_4gb")
+ ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES,
+ .long (1 << XENFEAT_writable_page_tables) | \
+ (1 << XENFEAT_dom0) | \
+ (1 << XENFEAT_linux_rsdp_unrestricted))
+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
+ ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
+ ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
+ .quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
+ ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
+ ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN, .long 1)
+ ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
+
+#endif /*CONFIG_XEN */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
new file mode 100644
index 000000000..2f111f47b
--- /dev/null
+++ b/arch/x86/xen/xen-ops.h
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef XEN_OPS_H
+#define XEN_OPS_H
+
+#include <linux/init.h>
+#include <linux/clocksource.h>
+#include <linux/irqreturn.h>
+#include <xen/xen-ops.h>
+
+/* These are code, but not functions. Defined in entry.S */
+extern const char xen_hypervisor_callback[];
+extern const char xen_failsafe_callback[];
+
+void xen_sysenter_target(void);
+#ifdef CONFIG_X86_64
+void xen_syscall_target(void);
+void xen_syscall32_target(void);
+#endif
+
+extern void *xen_initial_gdt;
+
+struct trap_info;
+void xen_copy_trap_info(struct trap_info *traps);
+
+DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+DECLARE_PER_CPU(unsigned long, xen_cr3);
+DECLARE_PER_CPU(unsigned long, xen_current_cr3);
+
+extern struct start_info *xen_start_info;
+extern struct shared_info xen_dummy_shared_info;
+extern struct shared_info *HYPERVISOR_shared_info;
+
+void xen_setup_mfn_list_list(void);
+void xen_build_mfn_list_list(void);
+void xen_setup_machphys_mapping(void);
+void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
+void xen_reserve_top(void);
+void __init xen_reserve_special_pages(void);
+void __init xen_pt_check_e820(void);
+
+void xen_mm_pin_all(void);
+void xen_mm_unpin_all(void);
+#ifdef CONFIG_X86_64
+void __init xen_relocate_p2m(void);
+#endif
+
+bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size);
+unsigned long __ref xen_chk_extra_mem(unsigned long pfn);
+void __init xen_inv_extra_mem(void);
+void __init xen_remap_memory(void);
+phys_addr_t __init xen_find_free_area(phys_addr_t size);
+char * __init xen_memory_setup(void);
+void __init xen_arch_setup(void);
+void xen_enable_sysenter(void);
+void xen_enable_syscall(void);
+void xen_vcpu_restore(void);
+
+void xen_callback_vector(void);
+void xen_hvm_init_shared_info(void);
+void xen_unplug_emulated_devices(void);
+
+void __init xen_build_dynamic_phys_to_machine(void);
+void __init xen_vmalloc_p2m_tree(void);
+
+void xen_init_irq_ops(void);
+void xen_setup_timer(int cpu);
+void xen_setup_runstate_info(int cpu);
+void xen_teardown_timer(int cpu);
+void xen_setup_cpu_clockevents(void);
+void xen_save_time_memory_area(void);
+void xen_restore_time_memory_area(void);
+void xen_init_time_ops(void);
+void xen_hvm_init_time_ops(void);
+
+irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
+
+bool xen_vcpu_stolen(int vcpu);
+
+extern int xen_have_vcpu_info_placement;
+
+int xen_vcpu_setup(int cpu);
+void xen_vcpu_info_reset(int cpu);
+void xen_setup_vcpu_info_placement(void);
+
+#ifdef CONFIG_SMP
+void xen_smp_init(void);
+void __init xen_hvm_smp_init(void);
+
+extern cpumask_var_t xen_cpu_initialized_map;
+#else
+static inline void xen_smp_init(void) {}
+static inline void xen_hvm_smp_init(void) {}
+#endif
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+void __init xen_init_spinlocks(void);
+void xen_init_lock_cpu(int cpu);
+void xen_uninit_lock_cpu(int cpu);
+#else
+static inline void xen_init_spinlocks(void)
+{
+}
+static inline void xen_init_lock_cpu(int cpu)
+{
+}
+static inline void xen_uninit_lock_cpu(int cpu)
+{
+}
+#endif
+
+struct dom0_vga_console_info;
+
+#ifdef CONFIG_XEN_DOM0
+void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
+#else
+static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
+ size_t size)
+{
+}
+#endif
+
+void __init xen_init_apic(void);
+
+#ifdef CONFIG_XEN_EFI
+extern void xen_efi_init(struct boot_params *boot_params);
+#else
+static inline void __init xen_efi_init(struct boot_params *boot_params)
+{
+}
+#endif
+
+__visible void xen_irq_enable_direct(void);
+__visible void xen_irq_disable_direct(void);
+__visible unsigned long xen_save_fl_direct(void);
+__visible void xen_restore_fl_direct(unsigned long);
+
+/* These are not functions, and cannot be called normally */
+__visible void xen_iret(void);
+__visible void xen_sysret32(void);
+__visible void xen_sysret64(void);
+
+extern int xen_panic_handler_init(void);
+
+int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int),
+ int (*cpu_dead_cb)(unsigned int));
+
+void xen_pin_vcpu(int cpu);
+
+void xen_emergency_restart(void);
+#ifdef CONFIG_XEN_PV
+void xen_pv_pre_suspend(void);
+void xen_pv_post_suspend(int suspend_cancelled);
+#else
+static inline void xen_pv_pre_suspend(void) {}
+static inline void xen_pv_post_suspend(int suspend_cancelled) {}
+#endif
+
+#ifdef CONFIG_XEN_PVHVM
+void xen_hvm_post_suspend(int suspend_cancelled);
+#else
+static inline void xen_hvm_post_suspend(int suspend_cancelled) {}
+#endif
+
+#endif /* XEN_OPS_H */
diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S
new file mode 100644
index 000000000..58722a052
--- /dev/null
+++ b/arch/x86/xen/xen-pvh.S
@@ -0,0 +1,188 @@
+/*
+ * Copyright C 2016, Oracle and/or its affiliates. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+ .code32
+ .text
+#define _pa(x) ((x) - __START_KERNEL_map)
+
+#include <linux/elfnote.h>
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/asm.h>
+#include <asm/boot.h>
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <xen/interface/elfnote.h>
+
+ __HEAD
+
+/*
+ * Entry point for PVH guests.
+ *
+ * Xen ABI specifies the following register state when we come here:
+ *
+ * - `ebx`: contains the physical memory address where the loader has placed
+ * the boot start info structure.
+ * - `cr0`: bit 0 (PE) must be set. All the other writeable bits are cleared.
+ * - `cr4`: all bits are cleared.
+ * - `cs `: must be a 32-bit read/execute code segment with a base of ‘0’
+ * and a limit of ‘0xFFFFFFFF’. The selector value is unspecified.
+ * - `ds`, `es`: must be a 32-bit read/write data segment with a base of
+ * ‘0’ and a limit of ‘0xFFFFFFFF’. The selector values are all
+ * unspecified.
+ * - `tr`: must be a 32-bit TSS (active) with a base of '0' and a limit
+ * of '0x67'.
+ * - `eflags`: bit 17 (VM) must be cleared. Bit 9 (IF) must be cleared.
+ * Bit 8 (TF) must be cleared. Other bits are all unspecified.
+ *
+ * All other processor registers and flag bits are unspecified. The OS is in
+ * charge of setting up it's own stack, GDT and IDT.
+ */
+
+#define PVH_GDT_ENTRY_CS 1
+#define PVH_GDT_ENTRY_DS 2
+#define PVH_GDT_ENTRY_CANARY 3
+#define PVH_CS_SEL (PVH_GDT_ENTRY_CS * 8)
+#define PVH_DS_SEL (PVH_GDT_ENTRY_DS * 8)
+#define PVH_CANARY_SEL (PVH_GDT_ENTRY_CANARY * 8)
+
+ENTRY(pvh_start_xen)
+ cld
+
+ lgdt (_pa(gdt))
+
+ mov $PVH_DS_SEL,%eax
+ mov %eax,%ds
+ mov %eax,%es
+ mov %eax,%ss
+
+ /* Stash hvm_start_info. */
+ mov $_pa(pvh_start_info), %edi
+ mov %ebx, %esi
+ mov _pa(pvh_start_info_sz), %ecx
+ shr $2,%ecx
+ rep
+ movsl
+
+ mov $_pa(early_stack_end), %esp
+
+ /* Enable PAE mode. */
+ mov %cr4, %eax
+ orl $X86_CR4_PAE, %eax
+ mov %eax, %cr4
+
+#ifdef CONFIG_X86_64
+ /* Enable Long mode. */
+ mov $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ wrmsr
+
+ /* Enable pre-constructed page tables. */
+ mov $_pa(init_top_pgt), %eax
+ mov %eax, %cr3
+ mov $(X86_CR0_PG | X86_CR0_PE), %eax
+ mov %eax, %cr0
+
+ /* Jump to 64-bit mode. */
+ ljmp $PVH_CS_SEL, $_pa(1f)
+
+ /* 64-bit entry point. */
+ .code64
+1:
+ /* Set base address in stack canary descriptor. */
+ mov $MSR_GS_BASE,%ecx
+ mov $_pa(canary), %eax
+ xor %edx, %edx
+ wrmsr
+
+ call xen_prepare_pvh
+
+ /* startup_64 expects boot_params in %rsi. */
+ mov $_pa(pvh_bootparams), %rsi
+ mov $_pa(startup_64), %rax
+ jmp *%rax
+
+#else /* CONFIG_X86_64 */
+
+ /* Set base address in stack canary descriptor. */
+ movl $_pa(gdt_start),%eax
+ movl $_pa(canary),%ecx
+ movw %cx, (PVH_GDT_ENTRY_CANARY * 8) + 2(%eax)
+ shrl $16, %ecx
+ movb %cl, (PVH_GDT_ENTRY_CANARY * 8) + 4(%eax)
+ movb %ch, (PVH_GDT_ENTRY_CANARY * 8) + 7(%eax)
+
+ mov $PVH_CANARY_SEL,%eax
+ mov %eax,%gs
+
+ call mk_early_pgtbl_32
+
+ mov $_pa(initial_page_table), %eax
+ mov %eax, %cr3
+
+ mov %cr0, %eax
+ or $(X86_CR0_PG | X86_CR0_PE), %eax
+ mov %eax, %cr0
+
+ ljmp $PVH_CS_SEL, $1f
+1:
+ call xen_prepare_pvh
+ mov $_pa(pvh_bootparams), %esi
+
+ /* startup_32 doesn't expect paging and PAE to be on. */
+ ljmp $PVH_CS_SEL, $_pa(2f)
+2:
+ mov %cr0, %eax
+ and $~X86_CR0_PG, %eax
+ mov %eax, %cr0
+ mov %cr4, %eax
+ and $~X86_CR4_PAE, %eax
+ mov %eax, %cr4
+
+ ljmp $PVH_CS_SEL, $_pa(startup_32)
+#endif
+END(pvh_start_xen)
+
+ .section ".init.data","aw"
+ .balign 8
+gdt:
+ .word gdt_end - gdt_start
+ .long _pa(gdt_start)
+ .word 0
+gdt_start:
+ .quad 0x0000000000000000 /* NULL descriptor */
+#ifdef CONFIG_X86_64
+ .quad GDT_ENTRY(0xa09a, 0, 0xfffff) /* PVH_CS_SEL */
+#else
+ .quad GDT_ENTRY(0xc09a, 0, 0xfffff) /* PVH_CS_SEL */
+#endif
+ .quad GDT_ENTRY(0xc092, 0, 0xfffff) /* PVH_DS_SEL */
+ .quad GDT_ENTRY(0x4090, 0, 0x18) /* PVH_CANARY_SEL */
+gdt_end:
+
+ .balign 16
+canary:
+ .fill 48, 1, 0
+
+early_stack:
+ .fill BOOT_STACK_SIZE, 1, 0
+early_stack_end:
+
+ ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY,
+ _ASM_PTR (pvh_start_xen - __START_KERNEL_map))